==================================================Ascend ============================= test session starts ============================== platform linux -- Python 3.9.19, pytest-6.2.5, py-1.11.0, pluggy-1.5.0 rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/graph_kernel, configfile: ../../../../../../sault/virtual_test/virtualenv_007/sault/config/pytest.ini plugins: mock-3.14.0, hydra-core-1.3.2, forked-1.6.0, anyio-4.9.0, xdist-1.32.0 collected 1 item test_mix_precision.py [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:37:31.204.44 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:37:31.209.28 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.108939, [21] [bootstrap]: 0.00064634 [type_inference]: 0.0389518 [event_method]: 2.351e-05 [auto_monad]: 0.00011186 [graph_reusing]: 6.67002e-06 [inline]: 3.43e-06 [add_attr]: 0.0484843, [1] [add_attr_with_inline]: 0.0484678, [1] [Cycle 1]: 0.00013613, [2] [tag_attr]: 2.757e-05 [meta_addattr_fg_expand]: 6.22001e-06 [parallel-infer-symbol]: 4.37e-06 [pre_auto_parallel]: 5.798e-05 [insert-virtual-dataset]: 2.99999e-06 [parallel-infer-symbol-second]: 1.03001e-06 [dataset_repeat_opt]: 2.14e-06 [pipeline_split]: 1.86e-06 [optimize]: 0.0190183, [53] [py_interpret_to_execute]: 4.014e-05 [rewriter_before_opt_a]: 0.00011445 [opt_a]: 0.00364515, [2] [Cycle 1]: 0.0026718, [45] [expand_dump_flag]: 3.75e-06 [switch_simplify]: 4.397e-05 [loop_unroll]: 3.071e-05 [a_1]: 0.00072782 [with_stream_mark]: 2.678e-05 [recompute_prepare]: 1.375e-05 [updatestate_depend_eliminate]: 5.15999e-06 [updatestate_assign_eliminate]: 3.67998e-06 [updatestate_loads_eliminate]: 3.00998e-06 [parameter_eliminate]: 2.71e-06 [a_2]: 0.0001347 [accelerated_algorithm]: 2.388e-05 [shard]: 2.37001e-06 [meta_shard_fg_expand]: 2.50002e-06 [shard_inline]: 7.09001e-06 [merge_send_recv]: 9.54e-06 [auto_parallel]: 9.11998e-06 [parallel]: 4.206e-05 [flash_sp]: 1.261e-05 [merge_comm]: 5.87999e-06 [allreduce_fusion]: 3.45998e-06 [matmul_add_comm_reduction]: 1.206e-05 [allreduce_slice_to_reducescatter]: 1.17999e-06 [virtual_shard_identity]: 1.098e-05 [virtual_dataset]: 6.89001e-06 [get_grad_eliminate_]: 6.60997e-06 [virtual_output]: 6.70002e-06 [merge_forward]: 5.26998e-06 [cell_reuse_recompute_pass]: 1.71002e-06 [offload_activation]: 1.14e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.791e-05 [merge_recompute_call_nodes]: 2.24001e-06 [before_grad]: 1.528e-05 [set_forward_comm_id_for_comm_node_pass]: 4.38999e-06 [meta_fg_expand]: 3.83999e-06 [flash_sp_send_recv_attached]: 5.00001e-06 [receive_attached]: 1.278e-05 [after_resolve]: 1.625e-05 [a_after_grad]: 1.145e-05 [renormalize]: 0.00081065 [add_forward_monad_depend]: 7.55998e-06 [auto_monad_grad]: 2.61999e-06 [auto_monad_eliminator]: 1.99e-05 [cse]: 4.251e-05 [a_3]: 6.745e-05 [Cycle 2]: 0.00095589, [45] [expand_dump_flag]: 2.61999e-06 [switch_simplify]: 8.08999e-06 [loop_unroll]: 6.46e-06 [a_1]: 0.0001484 [with_stream_mark]: 1.612e-05 [recompute_prepare]: 7.65e-06 [updatestate_depend_eliminate]: 3.47002e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 2.79001e-06 [parameter_eliminate]: 2.20002e-06 [a_2]: 0.00010688 [accelerated_algorithm]: 1.084e-05 [shard]: 1.91998e-06 [meta_shard_fg_expand]: 2.17001e-06 [shard_inline]: 6.63998e-06 [merge_send_recv]: 7.18998e-06 [auto_parallel]: 9.30001e-06 [parallel]: 7.83999e-06 [flash_sp]: 3.5e-06 [merge_comm]: 4.15e-06 [allreduce_fusion]: 3.38e-06 [matmul_add_comm_reduction]: 1.21e-05 [allreduce_slice_to_reducescatter]: 8.50006e-07 [virtual_shard_identity]: 8.94e-06 [virtual_dataset]: 6.89001e-06 [get_grad_eliminate_]: 5.94e-06 [virtual_output]: 6.31998e-06 [merge_forward]: 4.47e-06 [cell_reuse_recompute_pass]: 2.18002e-06 [offload_activation]: 1.101e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.583e-05 [merge_recompute_call_nodes]: 1.63002e-06 [before_grad]: 1.06e-05 [set_forward_comm_id_for_comm_node_pass]: 5.02e-06 [meta_fg_expand]: 2.29999e-06 [flash_sp_send_recv_attached]: 1.86998e-06 [receive_attached]: 1.99e-06 [after_resolve]: 1.257e-05 [a_after_grad]: 9.60001e-06 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 3.28e-06 [auto_monad_grad]: 1.64e-06 [auto_monad_eliminator]: 1.107e-05 [cse]: 2.229e-05 [a_3]: 5.218e-05 [py_interpret_to_execute_after_opt_a]: 2.064e-05 [slice_cell_reuse_recomputed_activation]: 4.95001e-06 [rewriter_after_opt_a]: 7.596e-05 [convert_after_rewriter]: 1.216e-05 [order_py_execute_after_rewriter]: 8.13999e-06 [mutable_eliminate]: 0.0128715 [opt_b]: 0.00036138, [1] [Cycle 1]: 0.00034773, [7] [b_1]: 0.00018482 [b_2]: 9.89999e-06 [updatestate_depend_eliminate]: 1.314e-05 [updatestate_assign_eliminate]: 3.57997e-06 [updatestate_loads_eliminate]: 3.81001e-06 [renormalize]: 1.15001e-06 [cse]: 5.371e-05 [optimize_parallel_all_gather_comm]: 2.939e-05 [overlap_param_gather]: 5.22e-06 [cconv]: 4.516e-05 [loop_unroll]: 0.00076526 [opt_after_cconv]: 0.0001569, [1] [Cycle 1]: 0.00014633, [7] [c_1]: 3.463e-05 [parameter_eliminate]: 6.94999e-06 [updatestate_depend_eliminate]: 9.24e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 3.24001e-06 [cse]: 2.999e-05 [renormalize]: 7.00005e-07 [remove_dup_value]: 1.882e-05 [tuple_transform]: 0.00010243, [1] [Cycle 1]: 9.36e-05, [4] [d_1]: 5.285e-05 [none_parameter_eliminate]: 1.82999e-06 [renormalize]: 1.79978e-07 [switch_simplify]: 7.61999e-06 [partial_unused_args_eliminate]: 4.95001e-06 [add_recomputation]: 6.56e-05 [cse_after_recomputation]: 3.227e-05, [1] [Cycle 1]: 2.479e-05, [1] [cse]: 1.393e-05 [environ_conv]: 2.486e-05 [swap_dp_allreduce_reducescatter]: 8.74e-06 [bias_add_comm_swap]: 5.69999e-06 [label_micro_interleaved_index]: 1.056e-05 [label_fine_grained_interleaved_index]: 5.52001e-06 [merge_cast_opt]: 4.53999e-06 [slice_recompute_activation]: 5.30001e-06 [micro_interleaved_order_control]: 5.13002e-06 [assign_add_opt]: 4.18999e-06 [ForceFp32Comm]: 3.61999e-06 [remove_cast_before_assign_add]: 3.53999e-06 [full_micro_interleaved_order_control]: 4.70001e-06 [reorder_send_recv_between_fp_bp]: 5.17e-06 [comm_op_add_attrs]: 3.52002e-06 [add_comm_op_reuse_tag]: 3.75e-06 [interleave_split_concat_branches]: 3.83001e-06 [interleave_parallel_branches]: 3.57997e-06 [overlap_opt_shard_in_pipeline]: 2.844e-05 [overlap_opt_shard_grad_in_pipeline]: 4.42e-06 [control_data_broadcast_order]: 1.947e-05 [grouped_pairwise_exchange_alltoall]: 4.55001e-06 [offloading_packed_experts]: 7.19001e-06 [overlap_recompute_and_grad_model_parallel]: 8.3e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.98999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.68e-06 [overlap_recompute_comm]: 5.10999e-06 [overlap_grad_ring_attention]: 6.59001e-06 [overlap_grad_flash_sp]: 5.364e-05 [begin_end_overlap_inline]: 3.51001e-06 [split_matmul_comm_elemetwise]: 5.02999e-06 [split_layernorm_comm]: 4.37998e-06 [handle_group_info]: 4.16001e-06 [symbol_engine_optimizer]: 0.00011647, [1] [Cycle 1]: 0.00010756, [6] [build]: 5.57001e-06 [elim_shapecalc]: 1.49e-05 [elim_not_effective]: 1.688e-05 [opt_reshape]: 8.41002e-06 [fold_const_symbol]: 1.085e-05 [renormalize]: 2.69996e-07 [detach_backward]: 6.07001e-06 [pipeline_parallel_scheduler]: 2.14e-06 [auto_monad_reorder]: 3.28e-05 [get_jit_bprop_graph]: 2.79001e-06 [rewriter_after_jit_bprop_graph]: 6.51999e-06 [opt_after_jit_grad]: 0.00075193 [validate]: 6.853e-05 Sums bootstrap : 0.000646s : 1.11% type_inference : 0.038952s : 66.75% event_method : 0.000024s : 0.04% auto_monad : 0.000112s : 0.19% graph_reusing : 0.000007s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000058s : 0.10% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000040s : 0.07% optimize.rewriter_before_opt_a : 0.000114s : 0.20% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000052s : 0.09% optimize.opt_a.loop_unroll : 0.000037s : 0.06% optimize.opt_a.a_1 : 0.000876s : 1.50% optimize.opt_a.with_stream_mark : 0.000043s : 0.07% optimize.opt_a.recompute_prepare : 0.000021s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000242s : 0.41% optimize.opt_a.accelerated_algorithm : 0.000035s : 0.06% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.02% optimize.opt_a.merge_send_recv : 0.000017s : 0.03% optimize.opt_a.auto_parallel : 0.000018s : 0.03% optimize.opt_a.parallel : 0.000050s : 0.09% optimize.opt_a.flash_sp : 0.000016s : 0.03% optimize.opt_a.merge_comm : 0.000010s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.03% optimize.opt_a.virtual_dataset : 0.000014s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.02% optimize.opt_a.virtual_output : 0.000013s : 0.02% optimize.opt_a.merge_forward : 0.000010s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000022s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.01% optimize.opt_a.receive_attached : 0.000015s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.05% optimize.opt_a.a_after_grad : 0.000021s : 0.04% optimize.opt_a.renormalize : 0.000811s : 1.39% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.05% optimize.opt_a.cse : 0.000065s : 0.11% optimize.opt_a.a_3 : 0.000120s : 0.21% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000076s : 0.13% optimize.convert_after_rewriter : 0.000012s : 0.02% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.012871s : 22.06% optimize.opt_b.b_1 : 0.000185s : 0.32% optimize.opt_b.b_2 : 0.000010s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000013s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000054s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000029s : 0.05% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000045s : 0.08% optimize.loop_unroll : 0.000765s : 1.31% optimize.opt_after_cconv.c_1 : 0.000035s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000030s : 0.05% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.03% optimize.tuple_transform.d_1 : 0.000053s : 0.09% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000066s : 0.11% optimize.cse_after_recomputation.cse : 0.000014s : 0.02% optimize.environ_conv : 0.000025s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000011s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000005s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000028s : 0.05% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000019s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000054s : 0.09% optimize.begin_end_overlap_inline : 0.000004s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000006s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000033s : 0.06% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000752s : 1.29% validate : 0.000069s : 0.12% Time group info: ------[substitution.] 0.000300 36 13.30% : 0.000040s : 6: substitution.arithmetic_simplify 0.88% : 0.000003s : 2: substitution.elim_not_effective 0.50% : 0.000001s : 2: substitution.fold_const_symbol 2.20% : 0.000007s : 4: substitution.graph_param_transform 67.48% : 0.000202s : 4: substitution.inline 1.91% : 0.000006s : 4: substitution.j_node_and_user_rematch 4.48% : 0.000013s : 2: substitution.less_batch_normalization 1.92% : 0.000006s : 4: substitution.remove_not_recompute_node 2.20% : 0.000007s : 4: substitution.replace_old_param 5.12% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.038874 2 97.55% : 0.037921s : 1: type_inference.infer 2.45% : 0.000952s : 1: type_inference.specialize ------[replace.] 0.000071 8 63.61% : 0.000045s : 4: replace.inline 36.39% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000211 8 93.70% : 0.000198s : 4: match.inline 6.30% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000236 1278 0.81% : 0.000002s : 13: predicate.accumulaten_eliminater 1.33% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 0.95% : 0.000002s : 13: predicate.addn_zero_filter 0.70% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.44% : 0.000006s : 21: predicate.arithmetic_simplify 0.98% : 0.000002s : 13: predicate.cast_eliminate 0.53% : 0.000001s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.54% : 0.000001s : 8: predicate.depend_value_elim 0.87% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.93% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.81% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.37% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 4: predicate.elim_not_effective 0.57% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000003s : 17: predicate.environ_add_const_eliminate 0.98% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_depend_swap 1.80% : 0.000004s : 25: predicate.environ_get_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.29% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.35% : 0.000006s : 21: predicate.float_depend_g_call 0.47% : 0.000001s : 8: predicate.float_environ_get_switch 0.73% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 4: predicate.fold_const_symbol 0.63% : 0.000001s : 8: predicate.get_grad_eliminate 0.25% : 0.000001s : 4: predicate.graph_param_transform 0.54% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.03% : 0.000014s : 58: predicate.inline 0.94% : 0.000002s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.75% : 0.000002s : 8: predicate.less_batch_normalization 1.96% : 0.000005s : 25: predicate.list_to_tuple_eliminator_ 2.20% : 0.000005s : 38: predicate.load_eliminater 1.56% : 0.000004s : 4: predicate.loop_unroll_after_grad 2.19% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.66% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.45% : 0.000001s : 8: predicate.merge_addn 0.52% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.62% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.72% : 0.000002s : 13: predicate.minmaximum_grad 2.66% : 0.000006s : 4: predicate.mutable_eliminate 0.31% : 0.000001s : 4: predicate.opt_reshape 0.37% : 0.000001s : 4: predicate.parallel_virtual_node 1.63% : 0.000004s : 21: predicate.partial_defer_inline 1.42% : 0.000003s : 21: predicate.partial_eliminate 0.76% : 0.000002s : 13: predicate.print_const_string_wrapper 0.56% : 0.000001s : 8: predicate.reduce_all_const_elim 1.13% : 0.000003s : 13: predicate.reduce_eliminate 2.36% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.64% : 0.000002s : 8: predicate.remove_not_recompute_node 1.32% : 0.000003s : 25: predicate.replace_applicator 0.64% : 0.000002s : 8: predicate.replace_old_param 0.50% : 0.000001s : 4: predicate.reset_defer_inline 0.81% : 0.000002s : 13: predicate.reshape_eliminate 0.58% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.86% : 0.000002s : 8: predicate.same_eliminate 0.57% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.82% : 0.000002s : 8: predicate.shard_identity_eliminate 0.73% : 0.000002s : 8: predicate.special_op_eliminate 0.57% : 0.000001s : 8: predicate.specialize_transform 1.08% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.93% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.44% : 0.000003s : 21: predicate.switch_defer_inline 1.75% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.70% : 0.000011s : 67: predicate.switch_simplify 0.77% : 0.000002s : 13: predicate.tile_eliminate 0.84% : 0.000002s : 13: predicate.transpose_eliminate 1.50% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.74% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.43% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.41% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.53% : 0.000006s : 29: predicate.tuple_list_set_item_eliminator 1.80% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.24% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.16% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 4: predicate.value_based_eliminate 0.61% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.57% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000824 11 56.90% : 0.000469s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.10% : 0.000355s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.178784 192 0.00% : 0.000006s : 1: ForceFp32Comm 27.13% : 0.048497s : 1: add_attr 27.11% : 0.048473s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.04% : 0.000070s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.07% : 0.000124s : 1: auto_monad 0.02% : 0.000041s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.40% : 0.000724s : 1: bootstrap 0.03% : 0.000048s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000023s : 1: control_data_broadcast_order 0.01% : 0.000016s : 1: convert_after_rewriter 0.02% : 0.000035s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000032s : 1: detach_backward 0.02% : 0.000028s : 1: environ_conv 0.02% : 0.000034s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000022s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000010s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000013s : 1: label_micro_interleaved_index 0.43% : 0.000774s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 7.21% : 0.012886s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.01% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000039s : 1: opt.transform.mutable_eliminate 0.77% : 0.001373s : 78: opt.transform.opt_a 0.02% : 0.000033s : 1: opt.transform.opt_after_cconv 0.02% : 0.000037s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000117s : 28: opt.transform.opt_b 0.03% : 0.000058s : 2: opt.transform.opt_trans_graph 0.03% : 0.000045s : 4: opt.transform.symbol_engine_opt 2.04% : 0.003649s : 1: opt_a 0.09% : 0.000161s : 1: opt_after_cconv 0.43% : 0.000765s : 1: opt_after_jit_grad 0.20% : 0.000366s : 1: opt_b 10.87% : 0.019434s : 1: optimize 0.02% : 0.000033s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000011s : 1: order_py_execute_after_rewriter 0.03% : 0.000058s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.00% : 0.000009s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000032s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000012s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.04% : 0.000068s : 1: pre_auto_parallel 0.03% : 0.000045s : 1: py_interpret_to_execute 0.01% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000008s : 1: remove_cast_before_assign_add 0.01% : 0.000022s : 1: remove_dup_value 0.25% : 0.000445s : 1: renormalize.infer 0.20% : 0.000353s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000082s : 1: rewriter_after_opt_a 0.07% : 0.000119s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000120s : 1: symbol_engine_optimizer 0.06% : 0.000106s : 1: tuple_transform 21.82% : 0.039007s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:37:56.508.384 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0416341, [21] [bootstrap]: 0.00053473 [type_inference]: 0.0313871 [event_method]: 1.856e-05 [auto_monad]: 6.576e-05 [graph_reusing]: 6.28e-06 [inline]: 2.46998e-06 [add_attr]: 0.00371726, [1] [add_attr_with_inline]: 0.00370448, [1] [Cycle 1]: 6.774e-05, [2] [tag_attr]: 2.033e-05 [meta_addattr_fg_expand]: 6.26998e-06 [parallel-infer-symbol]: 3.76999e-06 [pre_auto_parallel]: 3.41e-05 [insert-virtual-dataset]: 2.41e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 2.58003e-06 [pipeline_split]: 1.84e-06 [optimize]: 0.00509679, [53] [py_interpret_to_execute]: 2.865e-05 [rewriter_before_opt_a]: 8.132e-05 [opt_a]: 0.00282672, [2] [Cycle 1]: 0.00210028, [45] [expand_dump_flag]: 2.98003e-06 [switch_simplify]: 4.289e-05 [loop_unroll]: 3.035e-05 [a_1]: 0.00065583 [with_stream_mark]: 1.997e-05 [recompute_prepare]: 1.089e-05 [updatestate_depend_eliminate]: 4.68999e-06 [updatestate_assign_eliminate]: 3.09001e-06 [updatestate_loads_eliminate]: 3.01001e-06 [parameter_eliminate]: 2.02001e-06 [a_2]: 8.555e-05 [accelerated_algorithm]: 1.906e-05 [shard]: 2.12001e-06 [meta_shard_fg_expand]: 2.34999e-06 [shard_inline]: 7.01999e-06 [merge_send_recv]: 8.57e-06 [auto_parallel]: 7.63999e-06 [parallel]: 2.875e-05 [flash_sp]: 7.97003e-06 [merge_comm]: 3.91999e-06 [allreduce_fusion]: 5.97999e-06 [matmul_add_comm_reduction]: 1.209e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 1.054e-05 [virtual_dataset]: 7.4e-06 [get_grad_eliminate_]: 6.31e-06 [virtual_output]: 6.51999e-06 [merge_forward]: 4.62e-06 [cell_reuse_recompute_pass]: 1.19e-06 [offload_activation]: 1.004e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.194e-05 [merge_recompute_call_nodes]: 1.74e-06 [before_grad]: 9.99001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.73001e-06 [meta_fg_expand]: 2.97002e-06 [flash_sp_send_recv_attached]: 4.77998e-06 [receive_attached]: 2.43e-06 [after_resolve]: 1.484e-05 [a_after_grad]: 1.003e-05 [renormalize]: 0.00065297 [add_forward_monad_depend]: 5.86998e-06 [auto_monad_grad]: 2.50002e-06 [auto_monad_eliminator]: 1.69e-05 [cse]: 2.995e-05 [a_3]: 4.92e-05 [Cycle 2]: 0.00071631, [45] [expand_dump_flag]: 1.81003e-06 [switch_simplify]: 8.84e-06 [loop_unroll]: 6.19999e-06 [a_1]: 0.00014306 [with_stream_mark]: 1.197e-05 [recompute_prepare]: 7.98001e-06 [updatestate_depend_eliminate]: 3.41999e-06 [updatestate_assign_eliminate]: 2.61e-06 [updatestate_loads_eliminate]: 2.77002e-06 [parameter_eliminate]: 1.52001e-06 [a_2]: 7.699e-05 [accelerated_algorithm]: 9.72999e-06 [shard]: 1.10999e-06 [meta_shard_fg_expand]: 1.49998e-06 [shard_inline]: 6.09001e-06 [merge_send_recv]: 6.48e-06 [auto_parallel]: 7.04001e-06 [parallel]: 6.77002e-06 [flash_sp]: 4.10998e-06 [merge_comm]: 3.41001e-06 [allreduce_fusion]: 3.28e-06 [matmul_add_comm_reduction]: 8.25999e-06 [allreduce_slice_to_reducescatter]: 3.60014e-07 [virtual_shard_identity]: 7.88001e-06 [virtual_dataset]: 6.06998e-06 [get_grad_eliminate_]: 5.69999e-06 [virtual_output]: 5.72001e-06 [merge_forward]: 3.3e-06 [cell_reuse_recompute_pass]: 2.06e-06 [offload_activation]: 9.24998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.129e-05 [merge_recompute_call_nodes]: 1.07e-06 [before_grad]: 9.19e-06 [set_forward_comm_id_for_comm_node_pass]: 4.44002e-06 [meta_fg_expand]: 2.07999e-06 [flash_sp_send_recv_attached]: 1.02e-06 [receive_attached]: 1.68002e-06 [after_resolve]: 1.175e-05 [a_after_grad]: 9.44e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.34999e-06 [auto_monad_grad]: 1.27999e-06 [auto_monad_eliminator]: 9.32001e-06 [cse]: 1.57e-05 [a_3]: 3.968e-05 [py_interpret_to_execute_after_opt_a]: 1.242e-05 [slice_cell_reuse_recomputed_activation]: 2.21e-06 [rewriter_after_opt_a]: 4.067e-05 [convert_after_rewriter]: 7.18e-06 [order_py_execute_after_rewriter]: 5.28002e-06 [mutable_eliminate]: 0.00060369 [opt_b]: 0.0002308, [1] [Cycle 1]: 0.0002229, [7] [b_1]: 0.00013019 [b_2]: 9.37999e-06 [updatestate_depend_eliminate]: 7.88999e-06 [updatestate_assign_eliminate]: 2.86e-06 [updatestate_loads_eliminate]: 2.91e-06 [renormalize]: 8.90024e-07 [cse]: 2.79e-05 [optimize_parallel_all_gather_comm]: 2.018e-05 [overlap_param_gather]: 2.32001e-06 [cconv]: 3.23e-05 [loop_unroll]: 0.00049445 [opt_after_cconv]: 0.00011418, [1] [Cycle 1]: 0.00010702, [7] [c_1]: 3.1e-05 [parameter_eliminate]: 4.84e-06 [updatestate_depend_eliminate]: 7.95e-06 [updatestate_assign_eliminate]: 2.69001e-06 [updatestate_loads_eliminate]: 2.58998e-06 [cse]: 2.175e-05 [renormalize]: 6.69999e-07 [remove_dup_value]: 1.611e-05 [tuple_transform]: 8.08e-05, [1] [Cycle 1]: 7.554e-05, [4] [d_1]: 4.839e-05 [none_parameter_eliminate]: 1.37e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.90998e-06 [partial_unused_args_eliminate]: 2.48998e-06 [add_recomputation]: 5.296e-05 [cse_after_recomputation]: 2.164e-05, [1] [Cycle 1]: 1.721e-05, [1] [cse]: 1.137e-05 [environ_conv]: 6.33998e-06 [swap_dp_allreduce_reducescatter]: 5.06997e-06 [bias_add_comm_swap]: 3.42997e-06 [label_micro_interleaved_index]: 4.57998e-06 [label_fine_grained_interleaved_index]: 2.66999e-06 [merge_cast_opt]: 1.28002e-06 [slice_recompute_activation]: 2.42001e-06 [micro_interleaved_order_control]: 2.14999e-06 [assign_add_opt]: 1.22999e-06 [ForceFp32Comm]: 1.20001e-06 [remove_cast_before_assign_add]: 9.99979e-07 [full_micro_interleaved_order_control]: 2.12999e-06 [reorder_send_recv_between_fp_bp]: 2.72001e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 1.16002e-06 [interleave_split_concat_branches]: 1.22e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.39e-06 [overlap_opt_shard_grad_in_pipeline]: 2.00002e-06 [control_data_broadcast_order]: 1.4e-05 [grouped_pairwise_exchange_alltoall]: 1.47999e-06 [offloading_packed_experts]: 4.97e-06 [overlap_recompute_and_grad_model_parallel]: 5.39e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.13001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.26998e-06 [overlap_grad_ring_attention]: 4.45e-06 [overlap_grad_flash_sp]: 2.223e-05 [begin_end_overlap_inline]: 7.39994e-07 [split_matmul_comm_elemetwise]: 2.15002e-06 [split_layernorm_comm]: 2.05002e-06 [handle_group_info]: 1.03001e-06 [symbol_engine_optimizer]: 8.542e-05, [1] [Cycle 1]: 7.908e-05, [6] [build]: 3.69002e-06 [elim_shapecalc]: 1.24e-05 [elim_not_effective]: 1.385e-05 [opt_reshape]: 7.2e-06 [fold_const_symbol]: 1.036e-05 [renormalize]: 2.30008e-07 [detach_backward]: 2.66e-06 [pipeline_parallel_scheduler]: 1.49e-06 [auto_monad_reorder]: 1.753e-05 [get_jit_bprop_graph]: 2.03002e-06 [rewriter_after_jit_bprop_graph]: 5.82999e-06 [opt_after_jit_grad]: 0.00052074 [validate]: 4.452e-05 Sums bootstrap : 0.000535s : 1.45% type_inference : 0.031387s : 85.11% event_method : 0.000019s : 0.05% auto_monad : 0.000066s : 0.18% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000034s : 0.09% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000029s : 0.08% optimize.rewriter_before_opt_a : 0.000081s : 0.22% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000052s : 0.14% optimize.opt_a.loop_unroll : 0.000037s : 0.10% optimize.opt_a.a_1 : 0.000799s : 2.17% optimize.opt_a.with_stream_mark : 0.000032s : 0.09% optimize.opt_a.recompute_prepare : 0.000019s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000163s : 0.44% optimize.opt_a.accelerated_algorithm : 0.000029s : 0.08% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.04% optimize.opt_a.merge_send_recv : 0.000015s : 0.04% optimize.opt_a.auto_parallel : 0.000015s : 0.04% optimize.opt_a.parallel : 0.000036s : 0.10% optimize.opt_a.flash_sp : 0.000012s : 0.03% optimize.opt_a.merge_comm : 0.000007s : 0.02% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.05% optimize.opt_a.virtual_dataset : 0.000013s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.03% optimize.opt_a.virtual_output : 0.000012s : 0.03% optimize.opt_a.merge_forward : 0.000008s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000019s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000027s : 0.07% optimize.opt_a.a_after_grad : 0.000019s : 0.05% optimize.opt_a.renormalize : 0.000653s : 1.77% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.07% optimize.opt_a.cse : 0.000046s : 0.12% optimize.opt_a.a_3 : 0.000089s : 0.24% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000041s : 0.11% optimize.convert_after_rewriter : 0.000007s : 0.02% optimize.order_py_execute_after_rewriter : 0.000005s : 0.01% optimize.mutable_eliminate : 0.000604s : 1.64% optimize.opt_b.b_1 : 0.000130s : 0.35% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000028s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.05% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000032s : 0.09% optimize.loop_unroll : 0.000494s : 1.34% optimize.opt_after_cconv.c_1 : 0.000031s : 0.08% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000022s : 0.06% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.04% optimize.tuple_transform.d_1 : 0.000048s : 0.13% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000053s : 0.14% optimize.cse_after_recomputation.cse : 0.000011s : 0.03% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000022s : 0.06% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000018s : 0.05% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000521s : 1.41% validate : 0.000045s : 0.12% Time group info: ------[substitution.] 0.000230 36 14.29% : 0.000033s : 6: substitution.arithmetic_simplify 0.86% : 0.000002s : 2: substitution.elim_not_effective 0.56% : 0.000001s : 2: substitution.fold_const_symbol 2.75% : 0.000006s : 4: substitution.graph_param_transform 63.12% : 0.000145s : 4: substitution.inline 1.46% : 0.000003s : 4: substitution.j_node_and_user_rematch 4.95% : 0.000011s : 2: substitution.less_batch_normalization 2.04% : 0.000005s : 4: substitution.remove_not_recompute_node 2.29% : 0.000005s : 4: substitution.replace_old_param 7.67% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.031315 2 97.51% : 0.030537s : 1: type_inference.infer 2.49% : 0.000779s : 1: type_inference.specialize ------[replace.] 0.000061 8 62.15% : 0.000038s : 4: replace.inline 37.85% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000158 8 89.98% : 0.000142s : 4: match.inline 10.02% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000207 1278 0.89% : 0.000002s : 13: predicate.accumulaten_eliminater 1.00% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 8: predicate.addn_check_dump 0.94% : 0.000002s : 13: predicate.addn_zero_filter 0.84% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.30% : 0.000005s : 21: predicate.arithmetic_simplify 0.84% : 0.000002s : 13: predicate.cast_eliminate 0.65% : 0.000001s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.58% : 0.000001s : 8: predicate.depend_value_elim 0.89% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.00% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.38% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_depend_swap 2.03% : 0.000004s : 25: predicate.environ_get_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.48% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.22% : 0.000005s : 21: predicate.float_depend_g_call 0.49% : 0.000001s : 8: predicate.float_environ_get_switch 0.83% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.65% : 0.000001s : 8: predicate.get_grad_eliminate 0.27% : 0.000001s : 4: predicate.graph_param_transform 0.56% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 6.46% : 0.000013s : 58: predicate.inline 0.72% : 0.000001s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.94% : 0.000002s : 8: predicate.less_batch_normalization 1.82% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.58% : 0.000005s : 38: predicate.load_eliminater 1.23% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.32% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.63% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.72% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 13: predicate.minmaximum_grad 1.44% : 0.000003s : 4: predicate.mutable_eliminate 0.33% : 0.000001s : 4: predicate.opt_reshape 0.37% : 0.000001s : 4: predicate.parallel_virtual_node 1.85% : 0.000004s : 21: predicate.partial_defer_inline 1.61% : 0.000003s : 21: predicate.partial_eliminate 0.88% : 0.000002s : 13: predicate.print_const_string_wrapper 0.55% : 0.000001s : 8: predicate.reduce_all_const_elim 1.20% : 0.000002s : 13: predicate.reduce_eliminate 2.48% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 8: predicate.remove_not_recompute_node 1.51% : 0.000003s : 25: predicate.replace_applicator 0.59% : 0.000001s : 8: predicate.replace_old_param 0.30% : 0.000001s : 4: predicate.reset_defer_inline 0.91% : 0.000002s : 13: predicate.reshape_eliminate 0.60% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.56% : 0.000001s : 4: predicate.row_tensor_eliminate 0.72% : 0.000001s : 8: predicate.same_eliminate 0.47% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 8: predicate.shard_identity_eliminate 0.71% : 0.000001s : 8: predicate.special_op_eliminate 0.65% : 0.000001s : 8: predicate.specialize_transform 0.96% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.55% : 0.000003s : 21: predicate.switch_defer_inline 2.10% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.15% : 0.000011s : 67: predicate.switch_simplify 0.88% : 0.000002s : 13: predicate.tile_eliminate 0.87% : 0.000002s : 13: predicate.transpose_eliminate 1.52% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.35% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.29% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.41% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.76% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.44% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.94% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 4: predicate.value_based_eliminate 0.70% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000645 11 60.53% : 0.000390s : 5: func_graph_cloner_run.FuncGraphClonerGraph 39.47% : 0.000254s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.052509 192 0.01% : 0.000004s : 1: ForceFp32Comm 7.09% : 0.003723s : 1: add_attr 7.06% : 0.003709s : 1: add_attr_with_inline 0.01% : 0.000005s : 1: add_comm_op_reuse_tag 0.11% : 0.000057s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.14% : 0.000071s : 1: auto_monad 0.04% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.08% : 0.000566s : 1: bootstrap 0.07% : 0.000036s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000017s : 1: control_data_broadcast_order 0.02% : 0.000010s : 1: convert_after_rewriter 0.05% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000006s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000010s : 1: environ_conv 0.05% : 0.000025s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.96% : 0.000504s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.17% : 0.000615s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.03% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000021s : 1: opt.transform.mutable_eliminate 2.38% : 0.001251s : 78: opt.transform.opt_a 0.06% : 0.000030s : 1: opt.transform.opt_after_cconv 0.06% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000107s : 28: opt.transform.opt_b 0.10% : 0.000053s : 2: opt.transform.opt_trans_graph 0.07% : 0.000039s : 4: opt.transform.symbol_engine_opt 5.39% : 0.002830s : 1: opt_a 0.22% : 0.000118s : 1: opt_after_cconv 1.01% : 0.000532s : 1: opt_after_jit_grad 0.45% : 0.000235s : 1: opt_b 9.72% : 0.005102s : 1: optimize 0.05% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.05% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000006s : 1: partial_unused_args_eliminate 0.01% : 0.000004s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000038s : 1: pre_auto_parallel 0.06% : 0.000033s : 1: py_interpret_to_execute 0.03% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000020s : 1: remove_dup_value 0.65% : 0.000343s : 1: renormalize.infer 0.57% : 0.000302s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000045s : 1: rewriter_after_opt_a 0.16% : 0.000086s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.17% : 0.000088s : 1: symbol_engine_optimizer 0.16% : 0.000084s : 1: tuple_transform 59.81% : 0.031408s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:06.822.53 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:06.828.66 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.105641, [21] [bootstrap]: 0.00063635 [type_inference]: 0.0716762 [event_method]: 2.512e-05 [auto_monad]: 0.00010885 [graph_reusing]: 6.50002e-06 [inline]: 3.09001e-06 [add_attr]: 0.00524303, [1] [add_attr_with_inline]: 0.00522759, [1] [Cycle 1]: 9.762e-05, [2] [tag_attr]: 2.568e-05 [meta_addattr_fg_expand]: 6.57002e-06 [parallel-infer-symbol]: 3.93001e-06 [pre_auto_parallel]: 4.516e-05 [insert-virtual-dataset]: 2.49001e-06 [parallel-infer-symbol-second]: 8.10018e-07 [dataset_repeat_opt]: 1.97001e-06 [pipeline_split]: 1.50001e-06 [optimize]: 0.026079, [53] [py_interpret_to_execute]: 4.481e-05 [rewriter_before_opt_a]: 0.00010909 [opt_a]: 0.0227583, [2] [Cycle 1]: 0.0215324, [45] [expand_dump_flag]: 3.71001e-06 [switch_simplify]: 4.5e-05 [loop_unroll]: 3.195e-05 [a_1]: 0.00113292 [with_stream_mark]: 2.286e-05 [recompute_prepare]: 1.623e-05 [updatestate_depend_eliminate]: 5.23002e-06 [updatestate_assign_eliminate]: 4.37e-06 [updatestate_loads_eliminate]: 3.72002e-06 [parameter_eliminate]: 2.32001e-06 [a_2]: 0.00019034 [accelerated_algorithm]: 2.644e-05 [shard]: 2.44001e-06 [meta_shard_fg_expand]: 2.78e-06 [shard_inline]: 9.05999e-06 [merge_send_recv]: 1.015e-05 [auto_parallel]: 9.14998e-06 [parallel]: 3.566e-05 [flash_sp]: 9.92001e-06 [merge_comm]: 5.10001e-06 [allreduce_fusion]: 4.37998e-06 [matmul_add_comm_reduction]: 1.245e-05 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 1.052e-05 [virtual_dataset]: 8.91002e-06 [get_grad_eliminate_]: 9.00999e-06 [virtual_output]: 8.87e-06 [merge_forward]: 4.75001e-06 [cell_reuse_recompute_pass]: 1.50999e-06 [offload_activation]: 1.263e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.294e-05 [merge_recompute_call_nodes]: 1.57999e-06 [before_grad]: 1.445e-05 [set_forward_comm_id_for_comm_node_pass]: 5.05001e-06 [meta_fg_expand]: 3.68999e-06 [flash_sp_send_recv_attached]: 5.75001e-06 [receive_attached]: 2.68e-06 [after_resolve]: 1.536e-05 [a_after_grad]: 1.339e-05 [renormalize]: 0.0189772 [add_forward_monad_depend]: 1.479e-05 [auto_monad_grad]: 2.99999e-06 [auto_monad_eliminator]: 3.309e-05 [cse]: 4.518e-05 [a_3]: 0.00010108 [Cycle 2]: 0.00120607, [45] [expand_dump_flag]: 3.08998e-06 [switch_simplify]: 1.238e-05 [loop_unroll]: 9.27001e-06 [a_1]: 0.00024187 [with_stream_mark]: 2.137e-05 [recompute_prepare]: 1.023e-05 [updatestate_depend_eliminate]: 5.29e-06 [updatestate_assign_eliminate]: 3.93999e-06 [updatestate_loads_eliminate]: 3.97e-06 [parameter_eliminate]: 2.29001e-06 [a_2]: 0.0001445 [accelerated_algorithm]: 1.689e-05 [shard]: 2.58e-06 [meta_shard_fg_expand]: 2.61e-06 [shard_inline]: 9.94999e-06 [merge_send_recv]: 1.218e-05 [auto_parallel]: 1.289e-05 [parallel]: 1.013e-05 [flash_sp]: 5.19e-06 [merge_comm]: 4.72998e-06 [allreduce_fusion]: 4.26001e-06 [matmul_add_comm_reduction]: 1.44e-05 [allreduce_slice_to_reducescatter]: 8.79983e-07 [virtual_shard_identity]: 1.346e-05 [virtual_dataset]: 8.62998e-06 [get_grad_eliminate_]: 8.12e-06 [virtual_output]: 8.72e-06 [merge_forward]: 6.76e-06 [cell_reuse_recompute_pass]: 3.70998e-06 [offload_activation]: 1.449e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.154e-05 [merge_recompute_call_nodes]: 1.71998e-06 [before_grad]: 1.514e-05 [set_forward_comm_id_for_comm_node_pass]: 5.90002e-06 [meta_fg_expand]: 3.97998e-06 [flash_sp_send_recv_attached]: 2.11e-06 [receive_attached]: 2.74999e-06 [after_resolve]: 1.627e-05 [a_after_grad]: 1.201e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 3.47002e-06 [auto_monad_grad]: 2.66e-06 [auto_monad_eliminator]: 1.433e-05 [cse]: 2.956e-05 [a_3]: 6.646e-05 [py_interpret_to_execute_after_opt_a]: 4.081e-05 [slice_cell_reuse_recomputed_activation]: 4.92e-06 [rewriter_after_opt_a]: 6.157e-05 [convert_after_rewriter]: 1.272e-05 [order_py_execute_after_rewriter]: 8.99e-06 [mutable_eliminate]: 0.0008001 [opt_b]: 0.00036888, [1] [Cycle 1]: 0.00035666, [7] [b_1]: 0.00021933 [b_2]: 9.47999e-06 [updatestate_depend_eliminate]: 1.248e-05 [updatestate_assign_eliminate]: 4.05e-06 [updatestate_loads_eliminate]: 3.78999e-06 [renormalize]: 8.2e-07 [cse]: 3.668e-05 [optimize_parallel_all_gather_comm]: 2.796e-05 [overlap_param_gather]: 6.19001e-06 [cconv]: 4.376e-05 [loop_unroll]: 0.00070575 [opt_after_cconv]: 0.00017411, [1] [Cycle 1]: 0.00016318, [7] [c_1]: 4.091e-05 [parameter_eliminate]: 6.02999e-06 [updatestate_depend_eliminate]: 8.98002e-06 [updatestate_assign_eliminate]: 3.35998e-06 [updatestate_loads_eliminate]: 3.2e-06 [cse]: 3.458e-05 [renormalize]: 5.3001e-07 [remove_dup_value]: 2.234e-05 [tuple_transform]: 0.00012042, [1] [Cycle 1]: 0.00011132, [4] [d_1]: 6.324e-05 [none_parameter_eliminate]: 1.72999e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 9.00001e-06 [partial_unused_args_eliminate]: 4.86997e-06 [add_recomputation]: 7.733e-05 [cse_after_recomputation]: 3.628e-05, [1] [Cycle 1]: 2.819e-05, [1] [cse]: 1.706e-05 [environ_conv]: 1.198e-05 [swap_dp_allreduce_reducescatter]: 1.136e-05 [bias_add_comm_swap]: 6.64001e-06 [label_micro_interleaved_index]: 9.76e-06 [label_fine_grained_interleaved_index]: 6.17999e-06 [merge_cast_opt]: 4.58999e-06 [slice_recompute_activation]: 6.23002e-06 [micro_interleaved_order_control]: 5.62999e-06 [assign_add_opt]: 4.12e-06 [ForceFp32Comm]: 4.07e-06 [remove_cast_before_assign_add]: 4.06001e-06 [full_micro_interleaved_order_control]: 5.35999e-06 [reorder_send_recv_between_fp_bp]: 6.61e-06 [comm_op_add_attrs]: 3.92998e-06 [add_comm_op_reuse_tag]: 3.86999e-06 [interleave_split_concat_branches]: 4.22998e-06 [interleave_parallel_branches]: 5.96998e-06 [overlap_opt_shard_in_pipeline]: 4.45e-06 [overlap_opt_shard_grad_in_pipeline]: 8.83001e-06 [control_data_broadcast_order]: 2.397e-05 [grouped_pairwise_exchange_alltoall]: 4.94e-06 [offloading_packed_experts]: 1.128e-05 [overlap_recompute_and_grad_model_parallel]: 9.34e-06 [overlap_grad_matmul_and_grad_allreduce]: 7.51001e-06 [overlap_recompute_allgather_and_fa_grad]: 4.17e-06 [overlap_recompute_comm]: 7.03998e-06 [overlap_grad_ring_attention]: 7.63001e-06 [overlap_grad_flash_sp]: 2.956e-05 [begin_end_overlap_inline]: 5.90002e-06 [split_matmul_comm_elemetwise]: 5.47001e-06 [split_layernorm_comm]: 4.58001e-06 [handle_group_info]: 4.53001e-06 [symbol_engine_optimizer]: 0.00014175, [1] [Cycle 1]: 0.00013281, [6] [build]: 4.90999e-06 [elim_shapecalc]: 1.372e-05 [elim_not_effective]: 1.995e-05 [opt_reshape]: 9.27001e-06 [fold_const_symbol]: 1.372e-05 [renormalize]: 1.80007e-07 [detach_backward]: 5.17999e-06 [pipeline_parallel_scheduler]: 1.91e-06 [auto_monad_reorder]: 2.801e-05 [get_jit_bprop_graph]: 2.29001e-06 [rewriter_after_jit_bprop_graph]: 8.05e-06 [opt_after_jit_grad]: 0.00082788 [validate]: 6.163e-05 Sums bootstrap : 0.000636s : 0.65% type_inference : 0.071676s : 73.18% event_method : 0.000025s : 0.03% auto_monad : 0.000109s : 0.11% graph_reusing : 0.000007s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000045s : 0.05% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000045s : 0.05% optimize.rewriter_before_opt_a : 0.000109s : 0.11% optimize.opt_a.expand_dump_flag : 0.000007s : 0.01% optimize.opt_a.switch_simplify : 0.000057s : 0.06% optimize.opt_a.loop_unroll : 0.000041s : 0.04% optimize.opt_a.a_1 : 0.001375s : 1.40% optimize.opt_a.with_stream_mark : 0.000044s : 0.05% optimize.opt_a.recompute_prepare : 0.000026s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000335s : 0.34% optimize.opt_a.accelerated_algorithm : 0.000043s : 0.04% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000019s : 0.02% optimize.opt_a.merge_send_recv : 0.000022s : 0.02% optimize.opt_a.auto_parallel : 0.000022s : 0.02% optimize.opt_a.parallel : 0.000046s : 0.05% optimize.opt_a.flash_sp : 0.000015s : 0.02% optimize.opt_a.merge_comm : 0.000010s : 0.01% optimize.opt_a.allreduce_fusion : 0.000009s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000027s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.02% optimize.opt_a.virtual_dataset : 0.000018s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.02% optimize.opt_a.virtual_output : 0.000018s : 0.02% optimize.opt_a.merge_forward : 0.000012s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000027s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000044s : 0.05% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000030s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.01% optimize.opt_a.meta_fg_expand : 0.000008s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000008s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000032s : 0.03% optimize.opt_a.a_after_grad : 0.000025s : 0.03% optimize.opt_a.renormalize : 0.018977s : 19.37% optimize.opt_a.add_forward_monad_depend : 0.000018s : 0.02% optimize.opt_a.auto_monad_grad : 0.000006s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000047s : 0.05% optimize.opt_a.cse : 0.000075s : 0.08% optimize.opt_a.a_3 : 0.000168s : 0.17% optimize.py_interpret_to_execute_after_opt_a : 0.000041s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000062s : 0.06% optimize.convert_after_rewriter : 0.000013s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.01% optimize.mutable_eliminate : 0.000800s : 0.82% optimize.opt_b.b_1 : 0.000219s : 0.22% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000037s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.03% optimize.overlap_param_gather : 0.000006s : 0.01% optimize.cconv : 0.000044s : 0.04% optimize.loop_unroll : 0.000706s : 0.72% optimize.opt_after_cconv.c_1 : 0.000041s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000035s : 0.04% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000022s : 0.02% optimize.tuple_transform.d_1 : 0.000063s : 0.06% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000077s : 0.08% optimize.cse_after_recomputation.cse : 0.000017s : 0.02% optimize.environ_conv : 0.000012s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000011s : 0.01% optimize.bias_add_comm_swap : 0.000007s : 0.01% optimize.label_micro_interleaved_index : 0.000010s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000005s : 0.00% optimize.slice_recompute_activation : 0.000006s : 0.01% optimize.micro_interleaved_order_control : 0.000006s : 0.01% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000007s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000006s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000009s : 0.01% optimize.control_data_broadcast_order : 0.000024s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.01% optimize.offloading_packed_experts : 0.000011s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000008s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000007s : 0.01% optimize.overlap_grad_ring_attention : 0.000008s : 0.01% optimize.overlap_grad_flash_sp : 0.000030s : 0.03% optimize.begin_end_overlap_inline : 0.000006s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000005s : 0.00% optimize.handle_group_info : 0.000005s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000028s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000008s : 0.01% opt_after_jit_grad : 0.000828s : 0.85% validate : 0.000062s : 0.06% Time group info: ------[substitution.] 0.000435 45 26.45% : 0.000115s : 5: substitution.arithmetic_simplify 9.73% : 0.000042s : 3: substitution.cast_eliminate 0.52% : 0.000002s : 3: substitution.elim_not_effective 0.47% : 0.000002s : 3: substitution.fold_const_symbol 1.85% : 0.000008s : 5: substitution.graph_param_transform 47.85% : 0.000208s : 4: substitution.inline 1.32% : 0.000006s : 6: substitution.j_node_and_user_rematch 3.83% : 0.000017s : 2: substitution.less_batch_normalization 1.45% : 0.000006s : 6: substitution.remove_not_recompute_node 1.71% : 0.000007s : 4: substitution.replace_old_param 4.80% : 0.000021s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.071595 2 97.87% : 0.070067s : 1: type_inference.infer 2.13% : 0.001528s : 1: type_inference.specialize ------[replace.] 0.000098 8 63.25% : 0.000062s : 4: replace.inline 36.75% : 0.000036s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000224 8 91.74% : 0.000205s : 4: match.inline 8.26% : 0.000018s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000298 1504 0.96% : 0.000003s : 15: predicate.accumulaten_eliminater 1.00% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.62% : 0.000002s : 10: predicate.addn_check_dump 0.95% : 0.000003s : 15: predicate.addn_zero_filter 0.80% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 4.39% : 0.000013s : 25: predicate.arithmetic_simplify 0.85% : 0.000003s : 15: predicate.cast_eliminate 0.78% : 0.000002s : 10: predicate.check_bprop_eliminate 0.56% : 0.000002s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.56% : 0.000002s : 10: predicate.depend_value_elim 0.90% : 0.000003s : 15: predicate.dict_get_item_const_eliminator 1.30% : 0.000004s : 15: predicate.dict_get_item_eliminator 0.80% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.92% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.30% : 0.000001s : 5: predicate.elim_not_effective 0.34% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.03% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.03% : 0.000003s : 20: predicate.environ_get_add_eliminate 0.99% : 0.000003s : 20: predicate.environ_get_depend_swap 1.60% : 0.000005s : 30: predicate.environ_get_eliminate 1.14% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.17% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.04% : 0.000006s : 23: predicate.float_depend_g_call 0.57% : 0.000002s : 10: predicate.float_environ_get_switch 0.75% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.13% : 0.000000s : 5: predicate.fold_const_symbol 0.68% : 0.000002s : 10: predicate.get_grad_eliminate 0.18% : 0.000001s : 5: predicate.graph_param_transform 0.65% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 7.53% : 0.000022s : 68: predicate.inline 0.68% : 0.000002s : 10: predicate.inline_without_move 0.27% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.94% : 0.000003s : 10: predicate.less_batch_normalization 1.72% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.26% : 0.000007s : 44: predicate.load_eliminater 1.20% : 0.000004s : 5: predicate.loop_unroll_after_grad 1.81% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.67% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 0.51% : 0.000002s : 10: predicate.merge_addn 0.55% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.53% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.98% : 0.000003s : 15: predicate.minmaximum_grad 1.54% : 0.000005s : 5: predicate.mutable_eliminate 0.49% : 0.000001s : 5: predicate.opt_reshape 0.43% : 0.000001s : 5: predicate.parallel_virtual_node 1.49% : 0.000004s : 23: predicate.partial_defer_inline 1.31% : 0.000004s : 24: predicate.partial_eliminate 0.85% : 0.000003s : 15: predicate.print_const_string_wrapper 0.61% : 0.000002s : 10: predicate.reduce_all_const_elim 1.14% : 0.000003s : 15: predicate.reduce_eliminate 2.24% : 0.000007s : 44: predicate.redundant_stop_gradient_eliminater 0.54% : 0.000002s : 10: predicate.remove_not_recompute_node 1.51% : 0.000005s : 29: predicate.replace_applicator 0.50% : 0.000001s : 10: predicate.replace_old_param 0.33% : 0.000001s : 5: predicate.reset_defer_inline 0.91% : 0.000003s : 15: predicate.reshape_eliminate 0.86% : 0.000003s : 10: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 5: predicate.row_tensor_eliminate 0.85% : 0.000003s : 10: predicate.same_eliminate 0.47% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.94% : 0.000003s : 10: predicate.shard_identity_eliminate 0.68% : 0.000002s : 10: predicate.special_op_eliminate 0.67% : 0.000002s : 10: predicate.specialize_transform 1.21% : 0.000004s : 10: predicate.split_environ_get_set_with_tuple_value 0.93% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.30% : 0.000004s : 23: predicate.switch_defer_inline 1.76% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.13% : 0.000012s : 74: predicate.switch_simplify 0.87% : 0.000003s : 15: predicate.tile_eliminate 0.93% : 0.000003s : 15: predicate.transpose_eliminate 1.50% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000005s : 25: predicate.tuple_list_get_item_const_eliminator 1.68% : 0.000005s : 25: predicate.tuple_list_get_item_depend_reorder 3.06% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.50% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000007s : 35: predicate.tuple_list_set_item_eliminator 1.69% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.19% : 0.000007s : 44: predicate.updatestate_pure_node_eliminater 2.82% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 5: predicate.value_based_eliminate 0.60% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.59% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.005053 11 8.18% : 0.000413s : 5: func_graph_cloner_run.FuncGraphClonerGraph 91.82% : 0.004640s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.158157 192 0.00% : 0.000008s : 1: ForceFp32Comm 3.32% : 0.005255s : 1: add_attr 3.31% : 0.005232s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.05% : 0.000082s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.08% : 0.000120s : 1: auto_monad 0.02% : 0.000036s : 1: auto_monad_reorder 0.01% : 0.000009s : 1: begin_end_overlap_inline 0.01% : 0.000010s : 1: bias_add_comm_swap 0.45% : 0.000707s : 1: bootstrap 0.03% : 0.000048s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.02% : 0.000027s : 1: control_data_broadcast_order 0.01% : 0.000016s : 1: convert_after_rewriter 0.03% : 0.000040s : 1: cse_after_recomputation 0.00% : 0.000007s : 1: dataset_repeat_opt 0.02% : 0.000028s : 1: detach_backward 0.01% : 0.000015s : 1: environ_conv 0.02% : 0.000037s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.01% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000010s : 1: handle_group_info 0.01% : 0.000010s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000009s : 1: interleave_parallel_branches 0.01% : 0.000010s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000013s : 1: label_micro_interleaved_index 0.45% : 0.000714s : 1: loop_unroll 0.00% : 0.000008s : 1: merge_cast_opt 0.01% : 0.000009s : 1: micro_interleaved_order_control 0.51% : 0.000809s : 1: mutable_eliminate 0.01% : 0.000015s : 1: offloading_packed_experts 0.02% : 0.000024s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000028s : 1: opt.transform.mutable_eliminate 1.29% : 0.002042s : 78: opt.transform.opt_a 0.02% : 0.000039s : 1: opt.transform.opt_after_cconv 0.03% : 0.000040s : 1: opt.transform.opt_after_jit_grad 0.09% : 0.000143s : 28: opt.transform.opt_b 0.04% : 0.000070s : 2: opt.transform.opt_trans_graph 0.03% : 0.000053s : 4: opt.transform.symbol_engine_opt 14.39% : 0.022763s : 1: opt_a 0.11% : 0.000178s : 1: opt_after_cconv 0.53% : 0.000843s : 1: opt_after_jit_grad 0.24% : 0.000374s : 1: opt_b 16.84% : 0.026632s : 1: optimize 0.02% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.02% : 0.000033s : 1: overlap_grad_flash_sp 0.01% : 0.000010s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000012s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000008s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000014s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.03% : 0.000053s : 1: pre_auto_parallel 0.03% : 0.000049s : 1: py_interpret_to_execute 0.03% : 0.000045s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.02% : 0.000026s : 1: remove_dup_value 1.10% : 0.001741s : 1: renormalize.infer 10.87% : 0.017188s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000066s : 1: rewriter_after_opt_a 0.07% : 0.000113s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000009s : 1: slice_recompute_activation 0.01% : 0.000011s : 1: split_layernorm_comm 0.01% : 0.000009s : 1: split_matmul_comm_elemetwise 0.01% : 0.000015s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000145s : 1: symbol_engine_optimizer 0.08% : 0.000123s : 1: tuple_transform 45.36% : 0.071736s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:07.750.926 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.23208, [21] [bootstrap]: 0.00047565 [type_inference]: 0.12669 [event_method]: 2.547e-05 [auto_monad]: 8.316e-05 [graph_reusing]: 6.83998e-06 [inline]: 3.08e-06 [add_attr]: 0.00402264, [1] [add_attr_with_inline]: 0.00400868, [1] [Cycle 1]: 8.114e-05, [2] [tag_attr]: 2.717e-05 [meta_addattr_fg_expand]: 7.03e-06 [parallel-infer-symbol]: 3.68999e-06 [pre_auto_parallel]: 4.803e-05 [insert-virtual-dataset]: 2.73e-06 [parallel-infer-symbol-second]: 8.39995e-07 [dataset_repeat_opt]: 2.81999e-06 [pipeline_split]: 2.43e-06 [optimize]: 0.099555, [53] [py_interpret_to_execute]: 3.798e-05 [rewriter_before_opt_a]: 0.00010995 [opt_a]: 0.00445187, [2] [Cycle 1]: 0.00342387, [45] [expand_dump_flag]: 4.52e-06 [switch_simplify]: 4.842e-05 [loop_unroll]: 3.195e-05 [a_1]: 0.00093092 [with_stream_mark]: 2.717e-05 [recompute_prepare]: 1.688e-05 [updatestate_depend_eliminate]: 5.76e-06 [updatestate_assign_eliminate]: 4.42e-06 [updatestate_loads_eliminate]: 4.02002e-06 [parameter_eliminate]: 3.23e-06 [a_2]: 0.0001315 [accelerated_algorithm]: 2.966e-05 [shard]: 2.48e-06 [meta_shard_fg_expand]: 3.03998e-06 [shard_inline]: 9.14998e-06 [merge_send_recv]: 1.065e-05 [auto_parallel]: 1.236e-05 [parallel]: 2.761e-05 [flash_sp]: 1.164e-05 [merge_comm]: 6.18998e-06 [allreduce_fusion]: 4.79998e-06 [matmul_add_comm_reduction]: 1.569e-05 [allreduce_slice_to_reducescatter]: 1.42e-06 [virtual_shard_identity]: 1.43e-05 [virtual_dataset]: 1.024e-05 [get_grad_eliminate_]: 9.07001e-06 [virtual_output]: 9.09003e-06 [merge_forward]: 6.79999e-06 [cell_reuse_recompute_pass]: 1.99999e-06 [offload_activation]: 1.481e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.257e-05 [merge_recompute_call_nodes]: 2.25002e-06 [before_grad]: 1.613e-05 [set_forward_comm_id_for_comm_node_pass]: 5.64e-06 [meta_fg_expand]: 3.55e-06 [flash_sp_send_recv_attached]: 8.08001e-06 [receive_attached]: 2.81e-06 [after_resolve]: 1.783e-05 [a_after_grad]: 1.431e-05 [renormalize]: 0.00138508 [add_forward_monad_depend]: 1.101e-05 [auto_monad_grad]: 2.92002e-06 [auto_monad_eliminator]: 2.816e-05 [cse]: 4.515e-05 [a_3]: 7.437e-05 [Cycle 2]: 0.00101046, [45] [expand_dump_flag]: 2.86e-06 [switch_simplify]: 1.208e-05 [loop_unroll]: 8.1e-06 [a_1]: 0.00021251 [with_stream_mark]: 2.165e-05 [recompute_prepare]: 9.91e-06 [updatestate_depend_eliminate]: 4.65999e-06 [updatestate_assign_eliminate]: 5.07e-06 [updatestate_loads_eliminate]: 3.63999e-06 [parameter_eliminate]: 2.29999e-06 [a_2]: 0.00010684 [accelerated_algorithm]: 1.467e-05 [shard]: 2.18002e-06 [meta_shard_fg_expand]: 2.60002e-06 [shard_inline]: 9.89001e-06 [merge_send_recv]: 1.196e-05 [auto_parallel]: 1.291e-05 [parallel]: 1.202e-05 [flash_sp]: 6.41e-06 [merge_comm]: 5.64e-06 [allreduce_fusion]: 4.53001e-06 [matmul_add_comm_reduction]: 1.532e-05 [allreduce_slice_to_reducescatter]: 9.00007e-07 [virtual_shard_identity]: 1.416e-05 [virtual_dataset]: 8.70001e-06 [get_grad_eliminate_]: 7.72002e-06 [virtual_output]: 7.85e-06 [merge_forward]: 6.89999e-06 [cell_reuse_recompute_pass]: 2.91999e-06 [offload_activation]: 1.631e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.91e-05 [merge_recompute_call_nodes]: 1.65001e-06 [before_grad]: 1.75e-05 [set_forward_comm_id_for_comm_node_pass]: 6.32001e-06 [meta_fg_expand]: 3.91999e-06 [flash_sp_send_recv_attached]: 2.02001e-06 [receive_attached]: 2.96999e-06 [after_resolve]: 1.757e-05 [a_after_grad]: 1.206e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 3.09001e-06 [auto_monad_grad]: 2.08002e-06 [auto_monad_eliminator]: 1.556e-05 [cse]: 3.337e-05 [a_3]: 5.065e-05 [py_interpret_to_execute_after_opt_a]: 1.803e-05 [slice_cell_reuse_recomputed_activation]: 2.61999e-06 [rewriter_after_opt_a]: 5.554e-05 [convert_after_rewriter]: 9.27999e-06 [order_py_execute_after_rewriter]: 6.14999e-06 [mutable_eliminate]: 0.00082428 [opt_b]: 0.00028994, [1] [Cycle 1]: 0.000281, [7] [b_1]: 0.0001674 [b_2]: 1.079e-05 [updatestate_depend_eliminate]: 9.36002e-06 [updatestate_assign_eliminate]: 3.93999e-06 [updatestate_loads_eliminate]: 3.68999e-06 [renormalize]: 1.00001e-06 [cse]: 3.759e-05 [optimize_parallel_all_gather_comm]: 2.49e-05 [overlap_param_gather]: 3.13e-06 [cconv]: 3.958e-05 [loop_unroll]: 0.0927095 [opt_after_cconv]: 0.00021611, [1] [Cycle 1]: 0.00020142, [7] [c_1]: 5.806e-05 [parameter_eliminate]: 8.27e-06 [updatestate_depend_eliminate]: 1.577e-05 [updatestate_assign_eliminate]: 4.63999e-06 [updatestate_loads_eliminate]: 4.31002e-06 [cse]: 5.773e-05 [renormalize]: 1.44e-06 [remove_dup_value]: 2.333e-05 [tuple_transform]: 0.0001166, [1] [Cycle 1]: 0.00010516, [4] [d_1]: 7.229e-05 [none_parameter_eliminate]: 1.76e-06 [renormalize]: 2.80008e-07 [switch_simplify]: 8.87e-06 [partial_unused_args_eliminate]: 3.14001e-06 [add_recomputation]: 7.72e-05 [cse_after_recomputation]: 2.841e-05, [1] [Cycle 1]: 2.314e-05, [1] [cse]: 1.653e-05 [environ_conv]: 8.57e-06 [swap_dp_allreduce_reducescatter]: 7.08e-06 [bias_add_comm_swap]: 3.85e-06 [label_micro_interleaved_index]: 8.02998e-06 [label_fine_grained_interleaved_index]: 3.29001e-06 [merge_cast_opt]: 1.39998e-06 [slice_recompute_activation]: 2.59001e-06 [micro_interleaved_order_control]: 3.13998e-06 [assign_add_opt]: 1.62001e-06 [ForceFp32Comm]: 9.30013e-07 [remove_cast_before_assign_add]: 9.00007e-07 [full_micro_interleaved_order_control]: 2.52001e-06 [reorder_send_recv_between_fp_bp]: 3.01001e-06 [comm_op_add_attrs]: 1.09e-06 [add_comm_op_reuse_tag]: 1.39e-06 [interleave_split_concat_branches]: 1.39e-06 [interleave_parallel_branches]: 1.17e-06 [overlap_opt_shard_in_pipeline]: 1.74e-06 [overlap_opt_shard_grad_in_pipeline]: 2.04999e-06 [control_data_broadcast_order]: 2.105e-05 [grouped_pairwise_exchange_alltoall]: 1.64e-06 [offloading_packed_experts]: 5.54e-06 [overlap_recompute_and_grad_model_parallel]: 6.04001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.40001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.89999e-06 [overlap_recompute_comm]: 2.73998e-06 [overlap_grad_ring_attention]: 5.17999e-06 [overlap_grad_flash_sp]: 2.871e-05 [begin_end_overlap_inline]: 6.49976e-07 [split_matmul_comm_elemetwise]: 2.21e-06 [split_layernorm_comm]: 1.75001e-06 [handle_group_info]: 1.24e-06 [symbol_engine_optimizer]: 0.00010307, [1] [Cycle 1]: 9.749e-05, [6] [build]: 4.72e-06 [elim_shapecalc]: 1.484e-05 [elim_not_effective]: 1.702e-05 [opt_reshape]: 1.075e-05 [fold_const_symbol]: 1.368e-05 [renormalize]: 2.10013e-07 [detach_backward]: 2.31998e-06 [pipeline_parallel_scheduler]: 2.21998e-06 [auto_monad_reorder]: 2.292e-05 [get_jit_bprop_graph]: 2.26e-06 [rewriter_after_jit_bprop_graph]: 6.02001e-06 [opt_after_jit_grad]: 0.00087716 [validate]: 5.661e-05 Sums bootstrap : 0.000476s : 0.21% type_inference : 0.126690s : 55.87% event_method : 0.000025s : 0.01% auto_monad : 0.000083s : 0.04% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000048s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000038s : 0.02% optimize.rewriter_before_opt_a : 0.000110s : 0.05% optimize.opt_a.expand_dump_flag : 0.000007s : 0.00% optimize.opt_a.switch_simplify : 0.000060s : 0.03% optimize.opt_a.loop_unroll : 0.000040s : 0.02% optimize.opt_a.a_1 : 0.001143s : 0.50% optimize.opt_a.with_stream_mark : 0.000049s : 0.02% optimize.opt_a.recompute_prepare : 0.000027s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_a.parameter_eliminate : 0.000006s : 0.00% optimize.opt_a.a_2 : 0.000238s : 0.11% optimize.opt_a.accelerated_algorithm : 0.000044s : 0.02% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.00% optimize.opt_a.shard_inline : 0.000019s : 0.01% optimize.opt_a.merge_send_recv : 0.000023s : 0.01% optimize.opt_a.auto_parallel : 0.000025s : 0.01% optimize.opt_a.parallel : 0.000040s : 0.02% optimize.opt_a.flash_sp : 0.000018s : 0.01% optimize.opt_a.merge_comm : 0.000012s : 0.01% optimize.opt_a.allreduce_fusion : 0.000009s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000031s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000028s : 0.01% optimize.opt_a.virtual_dataset : 0.000019s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.01% optimize.opt_a.virtual_output : 0.000017s : 0.01% optimize.opt_a.merge_forward : 0.000014s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000031s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000042s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000034s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000010s : 0.00% optimize.opt_a.receive_attached : 0.000006s : 0.00% optimize.opt_a.after_resolve : 0.000035s : 0.02% optimize.opt_a.a_after_grad : 0.000026s : 0.01% optimize.opt_a.renormalize : 0.001385s : 0.61% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000044s : 0.02% optimize.opt_a.cse : 0.000079s : 0.03% optimize.opt_a.a_3 : 0.000125s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000056s : 0.02% optimize.convert_after_rewriter : 0.000009s : 0.00% optimize.order_py_execute_after_rewriter : 0.000006s : 0.00% optimize.mutable_eliminate : 0.000824s : 0.36% optimize.opt_b.b_1 : 0.000167s : 0.07% optimize.opt_b.b_2 : 0.000011s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000038s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.01% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000040s : 0.02% optimize.loop_unroll : 0.092710s : 40.88% optimize.opt_after_cconv.c_1 : 0.000058s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000016s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000058s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000023s : 0.01% optimize.tuple_transform.d_1 : 0.000072s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.00% optimize.partial_unused_args_eliminate : 0.000003s : 0.00% optimize.add_recomputation : 0.000077s : 0.03% optimize.cse_after_recomputation.cse : 0.000017s : 0.01% optimize.environ_conv : 0.000009s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000021s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000029s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000023s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000877s : 0.39% validate : 0.000057s : 0.02% Time group info: ------[substitution.] 0.000360 45 12.34% : 0.000044s : 5: substitution.arithmetic_simplify 10.83% : 0.000039s : 3: substitution.cast_eliminate 0.68% : 0.000002s : 3: substitution.elim_not_effective 0.50% : 0.000002s : 3: substitution.fold_const_symbol 2.36% : 0.000008s : 5: substitution.graph_param_transform 57.24% : 0.000206s : 4: substitution.inline 1.87% : 0.000007s : 6: substitution.j_node_and_user_rematch 4.84% : 0.000017s : 2: substitution.less_batch_normalization 1.94% : 0.000007s : 6: substitution.remove_not_recompute_node 1.99% : 0.000007s : 4: substitution.replace_old_param 5.41% : 0.000019s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.126602 2 99.03% : 0.125374s : 1: type_inference.infer 0.97% : 0.001228s : 1: type_inference.specialize ------[replace.] 0.000081 8 61.16% : 0.000050s : 4: replace.inline 38.84% : 0.000032s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000221 8 92.18% : 0.000203s : 4: match.inline 7.82% : 0.000017s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.092354 1504 0.00% : 0.000002s : 15: predicate.accumulaten_eliminater 0.00% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.00% : 0.000002s : 10: predicate.addn_check_dump 0.00% : 0.000003s : 15: predicate.addn_zero_filter 0.00% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 0.01% : 0.000007s : 25: predicate.arithmetic_simplify 0.00% : 0.000003s : 15: predicate.cast_eliminate 0.00% : 0.000002s : 10: predicate.check_bprop_eliminate 0.00% : 0.000001s : 10: predicate.compare_switch_simplify 0.00% : 0.000001s : 5: predicate.const_output_eliminate 0.00% : 0.000002s : 10: predicate.depend_value_elim 0.00% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.00% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.00% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.00% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.00% : 0.000001s : 5: predicate.elim_not_effective 0.00% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 0.00% : 0.000003s : 20: predicate.environ_add_const_eliminate 0.00% : 0.000003s : 20: predicate.environ_get_add_eliminate 0.00% : 0.000003s : 20: predicate.environ_get_depend_swap 0.00% : 0.000004s : 30: predicate.environ_get_eliminate 0.00% : 0.000003s : 20: predicate.environ_get_set_eliminate 0.00% : 0.000003s : 23: predicate.exchange_switch_depend_value 0.01% : 0.000006s : 23: predicate.float_depend_g_call 0.00% : 0.000001s : 10: predicate.float_environ_get_switch 0.00% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.00% : 0.000000s : 5: predicate.fold_const_symbol 0.00% : 0.000002s : 10: predicate.get_grad_eliminate 0.00% : 0.000001s : 5: predicate.graph_param_transform 0.00% : 0.000002s : 10: predicate.incorporate_call 0.00% : 0.000001s : 10: predicate.incorporate_call_switch 0.02% : 0.000020s : 68: predicate.inline 0.00% : 0.000002s : 10: predicate.inline_without_move 0.00% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.00% : 0.000003s : 10: predicate.less_batch_normalization 0.01% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 0.01% : 0.000006s : 44: predicate.load_eliminater 99.71% : 0.092084s : 5: predicate.loop_unroll_after_grad 0.01% : 0.000005s : 36: predicate.loop_unroll_before_grad 0.01% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 0.00% : 0.000002s : 10: predicate.merge_addn 0.00% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.00% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.00% : 0.000002s : 15: predicate.minmaximum_grad 0.00% : 0.000004s : 5: predicate.mutable_eliminate 0.00% : 0.000001s : 5: predicate.opt_reshape 0.00% : 0.000001s : 5: predicate.parallel_virtual_node 0.01% : 0.000006s : 23: predicate.partial_defer_inline 0.00% : 0.000004s : 24: predicate.partial_eliminate 0.00% : 0.000002s : 15: predicate.print_const_string_wrapper 0.00% : 0.000002s : 10: predicate.reduce_all_const_elim 0.00% : 0.000003s : 15: predicate.reduce_eliminate 0.01% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.00% : 0.000001s : 10: predicate.remove_not_recompute_node 0.00% : 0.000003s : 29: predicate.replace_applicator 0.00% : 0.000001s : 10: predicate.replace_old_param 0.00% : 0.000001s : 5: predicate.reset_defer_inline 0.00% : 0.000002s : 15: predicate.reshape_eliminate 0.00% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.00% : 0.000001s : 5: predicate.row_tensor_eliminate 0.00% : 0.000002s : 10: predicate.same_eliminate 0.00% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.00% : 0.000003s : 10: predicate.shard_identity_eliminate 0.00% : 0.000002s : 10: predicate.special_op_eliminate 0.00% : 0.000002s : 10: predicate.specialize_transform 0.00% : 0.000004s : 10: predicate.split_environ_get_set_with_tuple_value 0.00% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.00% : 0.000001s : 5: predicate.switch_call_monad_eliminater 0.00% : 0.000004s : 23: predicate.switch_defer_inline 0.01% : 0.000005s : 33: predicate.switch_layer_defer_inline 0.01% : 0.000013s : 74: predicate.switch_simplify 0.00% : 0.000002s : 15: predicate.tile_eliminate 0.00% : 0.000002s : 15: predicate.transpose_eliminate 0.00% : 0.000005s : 25: predicate.tuple_list_convert_item_index_to_positive 0.00% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 0.00% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 0.01% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 0.00% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 0.01% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 0.01% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 0.01% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 0.01% : 0.000009s : 54: predicate.updatestate_useless_node_eliminater 0.00% : 0.000001s : 5: predicate.value_based_eliminate 0.00% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.00% : 0.000002s : 10: predicate.virtual_output_eliminate 0.00% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.00% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001102 11 36.02% : 0.000397s : 5: func_graph_cloner_run.FuncGraphClonerGraph 63.98% : 0.000705s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.431188 192 0.00% : 0.000005s : 1: ForceFp32Comm 0.93% : 0.004030s : 1: add_attr 0.93% : 0.004014s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.02% : 0.000081s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.02% : 0.000091s : 1: auto_monad 0.01% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.12% : 0.000513s : 1: bootstrap 0.01% : 0.000044s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000024s : 1: control_data_broadcast_order 0.00% : 0.000013s : 1: convert_after_rewriter 0.01% : 0.000032s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000012s : 1: environ_conv 0.01% : 0.000033s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 21.51% : 0.092734s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.19% : 0.000837s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 21.37% : 0.092137s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000028s : 1: opt.transform.mutable_eliminate 0.42% : 0.001790s : 78: opt.transform.opt_a 0.01% : 0.000052s : 1: opt.transform.opt_after_cconv 0.01% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000139s : 28: opt.transform.opt_b 0.02% : 0.000078s : 2: opt.transform.opt_trans_graph 0.01% : 0.000053s : 4: opt.transform.symbol_engine_opt 1.03% : 0.004456s : 1: opt_a 0.05% : 0.000222s : 1: opt_after_cconv 0.21% : 0.000889s : 1: opt_after_jit_grad 0.07% : 0.000294s : 1: opt_b 23.09% : 0.099562s : 1: optimize 0.01% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000009s : 1: order_py_execute_after_rewriter 0.01% : 0.000036s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000007s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000053s : 1: pre_auto_parallel 0.01% : 0.000042s : 1: py_interpret_to_execute 0.01% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000027s : 1: remove_dup_value 0.15% : 0.000654s : 1: renormalize.infer 0.17% : 0.000716s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000062s : 1: rewriter_after_opt_a 0.03% : 0.000115s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000106s : 1: symbol_engine_optimizer 0.03% : 0.000120s : 1: tuple_transform 29.39% : 0.126719s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:09.530.366 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:09.530.649 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.187808, [21] [bootstrap]: 0.00050281 [type_inference]: 0.172606 [event_method]: 2.644e-05 [auto_monad]: 8.839e-05 [graph_reusing]: 7.4e-06 [inline]: 3.23e-06 [add_attr]: 0.00421359, [1] [add_attr_with_inline]: 0.00419832, [1] [Cycle 1]: 0.00010699, [2] [tag_attr]: 2.811e-05 [meta_addattr_fg_expand]: 6.56999e-06 [parallel-infer-symbol]: 3.71001e-06 [pre_auto_parallel]: 4.901e-05 [insert-virtual-dataset]: 2.49999e-06 [parallel-infer-symbol-second]: 9.60019e-07 [dataset_repeat_opt]: 2.01e-06 [pipeline_split]: 1.77999e-06 [optimize]: 0.00834907, [53] [py_interpret_to_execute]: 4.617e-05 [rewriter_before_opt_a]: 0.00011131 [opt_a]: 0.00474712, [2] [Cycle 1]: 0.00350153, [45] [expand_dump_flag]: 3.64002e-06 [switch_simplify]: 4.715e-05 [loop_unroll]: 3.345e-05 [a_1]: 0.00092294 [with_stream_mark]: 3.087e-05 [recompute_prepare]: 1.714e-05 [updatestate_depend_eliminate]: 7.85e-06 [updatestate_assign_eliminate]: 4.42998e-06 [updatestate_loads_eliminate]: 4.57e-06 [parameter_eliminate]: 2.23998e-06 [a_2]: 0.0001875 [accelerated_algorithm]: 2.848e-05 [shard]: 2.13998e-06 [meta_shard_fg_expand]: 3.08998e-06 [shard_inline]: 1.145e-05 [merge_send_recv]: 1.193e-05 [auto_parallel]: 1.077e-05 [parallel]: 2.371e-05 [flash_sp]: 1.076e-05 [merge_comm]: 5.70001e-06 [allreduce_fusion]: 5.57999e-06 [matmul_add_comm_reduction]: 1.581e-05 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 1.429e-05 [virtual_dataset]: 1.047e-05 [get_grad_eliminate_]: 1.109e-05 [virtual_output]: 1.02e-05 [merge_forward]: 6.10002e-06 [cell_reuse_recompute_pass]: 2.25002e-06 [offload_activation]: 1.462e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.491e-05 [merge_recompute_call_nodes]: 2.58003e-06 [before_grad]: 1.67e-05 [set_forward_comm_id_for_comm_node_pass]: 6.38e-06 [meta_fg_expand]: 4.90999e-06 [flash_sp_send_recv_attached]: 6.02999e-06 [receive_attached]: 4.02e-06 [after_resolve]: 1.751e-05 [a_after_grad]: 1.937e-05 [renormalize]: 0.00126888 [add_forward_monad_depend]: 9.15999e-06 [auto_monad_grad]: 2.77002e-06 [auto_monad_eliminator]: 2.583e-05 [cse]: 6.264e-05 [a_3]: 9.339e-05 [Cycle 2]: 0.00122783, [45] [expand_dump_flag]: 2.96001e-06 [switch_simplify]: 1.174e-05 [loop_unroll]: 9.54e-06 [a_1]: 0.0002535 [with_stream_mark]: 1.791e-05 [recompute_prepare]: 1.074e-05 [updatestate_depend_eliminate]: 6.72002e-06 [updatestate_assign_eliminate]: 4.27e-06 [updatestate_loads_eliminate]: 5.24998e-06 [parameter_eliminate]: 1.86998e-06 [a_2]: 0.00015764 [accelerated_algorithm]: 1.392e-05 [shard]: 1.93997e-06 [meta_shard_fg_expand]: 2.12001e-06 [shard_inline]: 9.94999e-06 [merge_send_recv]: 1.214e-05 [auto_parallel]: 1.127e-05 [parallel]: 9.61e-06 [flash_sp]: 4.94e-06 [merge_comm]: 5.69999e-06 [allreduce_fusion]: 5.10999e-06 [matmul_add_comm_reduction]: 1.299e-05 [allreduce_slice_to_reducescatter]: 8.30012e-07 [virtual_shard_identity]: 1.14e-05 [virtual_dataset]: 9.88998e-06 [get_grad_eliminate_]: 8.89998e-06 [virtual_output]: 8.57e-06 [merge_forward]: 5.91e-06 [cell_reuse_recompute_pass]: 2.87002e-06 [offload_activation]: 1.124e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.247e-05 [merge_recompute_call_nodes]: 1.30999e-06 [before_grad]: 1.739e-05 [set_forward_comm_id_for_comm_node_pass]: 6.00002e-06 [meta_fg_expand]: 3.53e-06 [flash_sp_send_recv_attached]: 1.72999e-06 [receive_attached]: 2.01e-06 [after_resolve]: 1.775e-05 [a_after_grad]: 1.637e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 2.48e-06 [auto_monad_grad]: 2.22001e-06 [auto_monad_eliminator]: 1.544e-05 [cse]: 3.15e-05 [a_3]: 7.708e-05 [py_interpret_to_execute_after_opt_a]: 2.495e-05 [slice_cell_reuse_recomputed_activation]: 4.81002e-06 [rewriter_after_opt_a]: 6.482e-05 [convert_after_rewriter]: 1.28e-05 [order_py_execute_after_rewriter]: 1.04e-05 [mutable_eliminate]: 0.00084111 [opt_b]: 0.00046747, [1] [Cycle 1]: 0.00045345, [7] [b_1]: 0.00028816 [b_2]: 1.372e-05 [updatestate_depend_eliminate]: 1.483e-05 [updatestate_assign_eliminate]: 4.97e-06 [updatestate_loads_eliminate]: 5.24e-06 [renormalize]: 1.15999e-06 [cse]: 4.888e-05 [optimize_parallel_all_gather_comm]: 3.31e-05 [overlap_param_gather]: 6.15002e-06 [cconv]: 4.616e-05 [loop_unroll]: 0.00071576 [opt_after_cconv]: 0.00020709, [1] [Cycle 1]: 0.00019327, [7] [c_1]: 5.499e-05 [parameter_eliminate]: 5.86e-06 [updatestate_depend_eliminate]: 1.061e-05 [updatestate_assign_eliminate]: 3.92998e-06 [updatestate_loads_eliminate]: 4.00998e-06 [cse]: 4.016e-05 [renormalize]: 8.89995e-07 [remove_dup_value]: 0.00011543 [tuple_transform]: 0.00015399, [1] [Cycle 1]: 0.00014406, [4] [d_1]: 8.679e-05 [none_parameter_eliminate]: 3.50998e-06 [renormalize]: 3.39991e-07 [switch_simplify]: 1.102e-05 [partial_unused_args_eliminate]: 5.50001e-06 [add_recomputation]: 8.299e-05 [cse_after_recomputation]: 4.584e-05, [1] [Cycle 1]: 3.745e-05, [1] [cse]: 2.528e-05 [environ_conv]: 1.136e-05 [swap_dp_allreduce_reducescatter]: 1.068e-05 [bias_add_comm_swap]: 5.96e-06 [label_micro_interleaved_index]: 9.99001e-06 [label_fine_grained_interleaved_index]: 5.62999e-06 [merge_cast_opt]: 4.45999e-06 [slice_recompute_activation]: 5.32001e-06 [micro_interleaved_order_control]: 4.87998e-06 [assign_add_opt]: 3.88001e-06 [ForceFp32Comm]: 3.71001e-06 [remove_cast_before_assign_add]: 4.08001e-06 [full_micro_interleaved_order_control]: 4.72998e-06 [reorder_send_recv_between_fp_bp]: 5.70001e-06 [comm_op_add_attrs]: 3.81001e-06 [add_comm_op_reuse_tag]: 3.95e-06 [interleave_split_concat_branches]: 3.72998e-06 [interleave_parallel_branches]: 4.06001e-06 [overlap_opt_shard_in_pipeline]: 3.89002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.87998e-06 [control_data_broadcast_order]: 2.202e-05 [grouped_pairwise_exchange_alltoall]: 4.60001e-06 [offloading_packed_experts]: 8.80001e-06 [overlap_recompute_and_grad_model_parallel]: 9.10999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.57997e-06 [overlap_recompute_allgather_and_fa_grad]: 4.23999e-06 [overlap_recompute_comm]: 5.22e-06 [overlap_grad_ring_attention]: 8.70001e-06 [overlap_grad_flash_sp]: 3.126e-05 [begin_end_overlap_inline]: 3.23e-06 [split_matmul_comm_elemetwise]: 5.27001e-06 [split_layernorm_comm]: 4.34997e-06 [handle_group_info]: 3.6e-06 [symbol_engine_optimizer]: 0.00014207, [1] [Cycle 1]: 0.00013406, [6] [build]: 4.70001e-06 [elim_shapecalc]: 1.817e-05 [elim_not_effective]: 2.153e-05 [opt_reshape]: 1.246e-05 [fold_const_symbol]: 1.677e-05 [renormalize]: 2.69996e-07 [detach_backward]: 8.62e-06 [pipeline_parallel_scheduler]: 2.20002e-06 [auto_monad_reorder]: 3.247e-05 [get_jit_bprop_graph]: 2.63e-06 [rewriter_after_jit_bprop_graph]: 9.52001e-06 [opt_after_jit_grad]: 0.0008756 [validate]: 5.926e-05 Sums bootstrap : 0.000503s : 0.28% type_inference : 0.172606s : 95.24% event_method : 0.000026s : 0.01% auto_monad : 0.000088s : 0.05% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000049s : 0.03% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000046s : 0.03% optimize.rewriter_before_opt_a : 0.000111s : 0.06% optimize.opt_a.expand_dump_flag : 0.000007s : 0.00% optimize.opt_a.switch_simplify : 0.000059s : 0.03% optimize.opt_a.loop_unroll : 0.000043s : 0.02% optimize.opt_a.a_1 : 0.001176s : 0.65% optimize.opt_a.with_stream_mark : 0.000049s : 0.03% optimize.opt_a.recompute_prepare : 0.000028s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000015s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000010s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000345s : 0.19% optimize.opt_a.accelerated_algorithm : 0.000042s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000021s : 0.01% optimize.opt_a.merge_send_recv : 0.000024s : 0.01% optimize.opt_a.auto_parallel : 0.000022s : 0.01% optimize.opt_a.parallel : 0.000033s : 0.02% optimize.opt_a.flash_sp : 0.000016s : 0.01% optimize.opt_a.merge_comm : 0.000011s : 0.01% optimize.opt_a.allreduce_fusion : 0.000011s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000029s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000026s : 0.01% optimize.opt_a.virtual_dataset : 0.000020s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000020s : 0.01% optimize.opt_a.virtual_output : 0.000019s : 0.01% optimize.opt_a.merge_forward : 0.000012s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000026s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000047s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000034s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.01% optimize.opt_a.meta_fg_expand : 0.000008s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000008s : 0.00% optimize.opt_a.receive_attached : 0.000006s : 0.00% optimize.opt_a.after_resolve : 0.000035s : 0.02% optimize.opt_a.a_after_grad : 0.000036s : 0.02% optimize.opt_a.renormalize : 0.001269s : 0.70% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000041s : 0.02% optimize.opt_a.cse : 0.000094s : 0.05% optimize.opt_a.a_3 : 0.000170s : 0.09% optimize.py_interpret_to_execute_after_opt_a : 0.000025s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000065s : 0.04% optimize.convert_after_rewriter : 0.000013s : 0.01% optimize.order_py_execute_after_rewriter : 0.000010s : 0.01% optimize.mutable_eliminate : 0.000841s : 0.46% optimize.opt_b.b_1 : 0.000288s : 0.16% optimize.opt_b.b_2 : 0.000014s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000015s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000049s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000033s : 0.02% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.000046s : 0.03% optimize.loop_unroll : 0.000716s : 0.39% optimize.opt_after_cconv.c_1 : 0.000055s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000040s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000115s : 0.06% optimize.tuple_transform.d_1 : 0.000087s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000004s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.01% optimize.partial_unused_args_eliminate : 0.000006s : 0.00% optimize.add_recomputation : 0.000083s : 0.05% optimize.cse_after_recomputation.cse : 0.000025s : 0.01% optimize.environ_conv : 0.000011s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000011s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000010s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000022s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.00% optimize.offloading_packed_experts : 0.000009s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000009s : 0.00% optimize.overlap_grad_flash_sp : 0.000031s : 0.02% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000018s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000022s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000012s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000017s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000009s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000032s : 0.02% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000010s : 0.01% opt_after_jit_grad : 0.000876s : 0.48% validate : 0.000059s : 0.03% Time group info: ------[substitution.] 0.000332 54 11.78% : 0.000039s : 4: substitution.arithmetic_simplify 13.12% : 0.000044s : 6: substitution.cast_eliminate 0.76% : 0.000003s : 4: substitution.elim_not_effective 0.60% : 0.000002s : 4: substitution.fold_const_symbol 2.53% : 0.000008s : 6: substitution.graph_param_transform 55.19% : 0.000183s : 4: substitution.inline 1.81% : 0.000006s : 8: substitution.j_node_and_user_rematch 4.66% : 0.000015s : 2: substitution.less_batch_normalization 2.69% : 0.000009s : 8: substitution.remove_not_recompute_node 2.03% : 0.000007s : 4: substitution.replace_old_param 4.85% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.172529 2 99.36% : 0.171418s : 1: type_inference.infer 0.64% : 0.001111s : 1: type_inference.specialize ------[replace.] 0.000076 8 58.40% : 0.000044s : 4: replace.inline 41.60% : 0.000032s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000193 8 92.89% : 0.000180s : 4: match.inline 7.11% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000316 1730 0.81% : 0.000003s : 17: predicate.accumulaten_eliminater 1.47% : 0.000005s : 6: predicate.ad_related_special_op_eliminate 0.55% : 0.000002s : 12: predicate.addn_check_dump 0.86% : 0.000003s : 17: predicate.addn_zero_filter 0.78% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.40% : 0.000008s : 29: predicate.arithmetic_simplify 0.88% : 0.000003s : 17: predicate.cast_eliminate 0.71% : 0.000002s : 12: predicate.check_bprop_eliminate 0.59% : 0.000002s : 12: predicate.compare_switch_simplify 0.18% : 0.000001s : 6: predicate.const_output_eliminate 0.66% : 0.000002s : 12: predicate.depend_value_elim 0.88% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.00% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.91% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.15% : 0.000004s : 12: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 6: predicate.elim_not_effective 0.46% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.07% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.17% : 0.000004s : 23: predicate.environ_get_depend_swap 1.79% : 0.000006s : 35: predicate.environ_get_eliminate 1.05% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.15% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.26% : 0.000007s : 25: predicate.float_depend_g_call 0.61% : 0.000002s : 12: predicate.float_environ_get_switch 0.97% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 6: predicate.fold_const_symbol 0.71% : 0.000002s : 12: predicate.get_grad_eliminate 0.20% : 0.000001s : 6: predicate.graph_param_transform 0.63% : 0.000002s : 12: predicate.incorporate_call 0.53% : 0.000002s : 12: predicate.incorporate_call_switch 6.26% : 0.000020s : 78: predicate.inline 1.32% : 0.000004s : 12: predicate.inline_without_move 0.30% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.89% : 0.000003s : 12: predicate.less_batch_normalization 1.75% : 0.000006s : 33: predicate.list_to_tuple_eliminator_ 2.32% : 0.000007s : 50: predicate.load_eliminater 1.17% : 0.000004s : 6: predicate.loop_unroll_after_grad 1.90% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.69% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.66% : 0.000002s : 12: predicate.merge_addn 0.60% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.61% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.75% : 0.000002s : 17: predicate.minmaximum_grad 1.94% : 0.000006s : 6: predicate.mutable_eliminate 0.37% : 0.000001s : 6: predicate.opt_reshape 0.38% : 0.000001s : 6: predicate.parallel_virtual_node 1.67% : 0.000005s : 25: predicate.partial_defer_inline 1.46% : 0.000005s : 27: predicate.partial_eliminate 0.87% : 0.000003s : 17: predicate.print_const_string_wrapper 0.67% : 0.000002s : 12: predicate.reduce_all_const_elim 1.17% : 0.000004s : 17: predicate.reduce_eliminate 2.31% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 12: predicate.remove_not_recompute_node 1.23% : 0.000004s : 33: predicate.replace_applicator 0.51% : 0.000002s : 12: predicate.replace_old_param 0.29% : 0.000001s : 6: predicate.reset_defer_inline 0.97% : 0.000003s : 17: predicate.reshape_eliminate 0.63% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 6: predicate.row_tensor_eliminate 0.84% : 0.000003s : 12: predicate.same_eliminate 0.48% : 0.000002s : 12: predicate.set_cell_output_no_recompute 0.89% : 0.000003s : 12: predicate.shard_identity_eliminate 0.89% : 0.000003s : 12: predicate.special_op_eliminate 0.74% : 0.000002s : 12: predicate.specialize_transform 1.04% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 1.00% : 0.000003s : 12: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.21% : 0.000004s : 25: predicate.switch_defer_inline 1.80% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.48% : 0.000014s : 81: predicate.switch_simplify 0.88% : 0.000003s : 17: predicate.tile_eliminate 0.89% : 0.000003s : 17: predicate.transpose_eliminate 1.65% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000005s : 29: predicate.tuple_list_get_item_depend_reorder 3.12% : 0.000010s : 45: predicate.tuple_list_get_item_eliminator 1.73% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.62% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.17% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 2.98% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 6: predicate.value_based_eliminate 0.66% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.58% : 0.000002s : 12: predicate.virtual_output_eliminate 0.31% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000971 11 48.47% : 0.000471s : 5: func_graph_cloner_run.FuncGraphClonerGraph 51.53% : 0.000501s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.203890 192 0.00% : 0.000006s : 1: ForceFp32Comm 2.07% : 0.004227s : 1: add_attr 2.06% : 0.004203s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.04% : 0.000087s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.05% : 0.000100s : 1: auto_monad 0.02% : 0.000041s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.27% : 0.000556s : 1: bootstrap 0.02% : 0.000050s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000025s : 1: control_data_broadcast_order 0.01% : 0.000016s : 1: convert_after_rewriter 0.02% : 0.000049s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000039s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.02% : 0.000039s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000016s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000010s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000013s : 1: label_micro_interleaved_index 0.36% : 0.000725s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.42% : 0.000852s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.01% : 0.000025s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000031s : 1: opt.transform.mutable_eliminate 0.94% : 0.001912s : 78: opt.transform.opt_a 0.03% : 0.000053s : 1: opt.transform.opt_after_cconv 0.03% : 0.000051s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000208s : 28: opt.transform.opt_b 0.05% : 0.000095s : 2: opt.transform.opt_trans_graph 0.03% : 0.000064s : 4: opt.transform.symbol_engine_opt 2.33% : 0.004751s : 1: opt_a 0.10% : 0.000212s : 1: opt_after_cconv 0.44% : 0.000893s : 1: opt_after_jit_grad 0.23% : 0.000473s : 1: opt_b 4.41% : 0.008983s : 1: optimize 0.02% : 0.000037s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000014s : 1: order_py_execute_after_rewriter 0.02% : 0.000035s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.00% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000013s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000009s : 1: partial_unused_args_eliminate 0.00% : 0.000009s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.03% : 0.000058s : 1: pre_auto_parallel 0.02% : 0.000050s : 1: py_interpret_to_execute 0.01% : 0.000029s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.06% : 0.000122s : 1: remove_dup_value 0.33% : 0.000682s : 1: renormalize.infer 0.28% : 0.000572s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000016s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000069s : 1: rewriter_after_opt_a 0.06% : 0.000115s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000014s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000145s : 1: symbol_engine_optimizer 0.08% : 0.000158s : 1: tuple_transform 84.69% : 0.172666s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:11.659.10 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.153077, [21] [bootstrap]: 0.00045042 [type_inference]: 0.0074021 [event_method]: 2.417e-05 [auto_monad]: 8.269e-05 [graph_reusing]: 6.96001e-06 [inline]: 3.03e-06 [add_attr]: 0.00368744, [1] [add_attr_with_inline]: 0.00367448, [1] [Cycle 1]: 7.699e-05, [2] [tag_attr]: 2.658e-05 [meta_addattr_fg_expand]: 6.74999e-06 [parallel-infer-symbol]: 3.86001e-06 [pre_auto_parallel]: 4.474e-05 [insert-virtual-dataset]: 3.73001e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 2.24001e-06 [pipeline_split]: 2.36e-06 [optimize]: 0.140484, [53] [py_interpret_to_execute]: 3.672e-05 [rewriter_before_opt_a]: 0.00010769 [opt_a]: 0.137559, [2] [Cycle 1]: 0.136544, [45] [expand_dump_flag]: 3.66999e-06 [switch_simplify]: 4.767e-05 [loop_unroll]: 3.267e-05 [a_1]: 0.00081521 [with_stream_mark]: 2.714e-05 [recompute_prepare]: 1.672e-05 [updatestate_depend_eliminate]: 6.69999e-06 [updatestate_assign_eliminate]: 4.30999e-06 [updatestate_loads_eliminate]: 4.27e-06 [parameter_eliminate]: 2.19001e-06 [a_2]: 0.00013461 [accelerated_algorithm]: 2.56e-05 [shard]: 2.26e-06 [meta_shard_fg_expand]: 2.48e-06 [shard_inline]: 9.63002e-06 [merge_send_recv]: 1.162e-05 [auto_parallel]: 9.56998e-06 [parallel]: 2.526e-05 [flash_sp]: 1.008e-05 [merge_comm]: 5.64998e-06 [allreduce_fusion]: 4.82998e-06 [matmul_add_comm_reduction]: 1.506e-05 [allreduce_slice_to_reducescatter]: 1.02e-06 [virtual_shard_identity]: 1.379e-05 [virtual_dataset]: 9.99999e-06 [get_grad_eliminate_]: 9.14e-06 [virtual_output]: 9.72999e-06 [merge_forward]: 6.44001e-06 [cell_reuse_recompute_pass]: 1.52001e-06 [offload_activation]: 1.316e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.013e-05 [merge_recompute_call_nodes]: 2.39999e-06 [before_grad]: 1.772e-05 [set_forward_comm_id_for_comm_node_pass]: 5.63002e-06 [meta_fg_expand]: 4.09002e-06 [flash_sp_send_recv_attached]: 6.50002e-06 [receive_attached]: 2.43002e-06 [after_resolve]: 1.522e-05 [a_after_grad]: 1.575e-05 [renormalize]: 0.13468 [add_forward_monad_depend]: 1.26e-05 [auto_monad_grad]: 3.29001e-06 [auto_monad_eliminator]: 2.471e-05 [cse]: 5.134e-05 [a_3]: 9.14e-05 [Cycle 2]: 0.0009974, [45] [expand_dump_flag]: 2.27999e-06 [switch_simplify]: 1.352e-05 [loop_unroll]: 9.44e-06 [a_1]: 0.00026287 [with_stream_mark]: 1.779e-05 [recompute_prepare]: 9.72001e-06 [updatestate_depend_eliminate]: 5.44e-06 [updatestate_assign_eliminate]: 4.55001e-06 [updatestate_loads_eliminate]: 4.49002e-06 [parameter_eliminate]: 2.14999e-06 [a_2]: 0.00012059 [accelerated_algorithm]: 1.413e-05 [shard]: 2.22001e-06 [meta_shard_fg_expand]: 2.69001e-06 [shard_inline]: 9.36002e-06 [merge_send_recv]: 1.125e-05 [auto_parallel]: 1.165e-05 [parallel]: 1.057e-05 [flash_sp]: 5.33002e-06 [merge_comm]: 5.13002e-06 [allreduce_fusion]: 5.20999e-06 [matmul_add_comm_reduction]: 1.392e-05 [allreduce_slice_to_reducescatter]: 9.79984e-07 [virtual_shard_identity]: 1.097e-05 [virtual_dataset]: 9.32001e-06 [get_grad_eliminate_]: 8.64e-06 [virtual_output]: 8.54e-06 [merge_forward]: 5.70001e-06 [cell_reuse_recompute_pass]: 4.28999e-06 [offload_activation]: 1.337e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.776e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.56e-05 [set_forward_comm_id_for_comm_node_pass]: 5.89999e-06 [meta_fg_expand]: 4.05e-06 [flash_sp_send_recv_attached]: 2.04e-06 [receive_attached]: 2.34001e-06 [after_resolve]: 1.589e-05 [a_after_grad]: 1.417e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.74e-06 [auto_monad_grad]: 1.22e-06 [auto_monad_eliminator]: 1.024e-05 [cse]: 2.65e-05 [a_3]: 5.555e-05 [py_interpret_to_execute_after_opt_a]: 1.923e-05 [slice_cell_reuse_recomputed_activation]: 2.24001e-06 [rewriter_after_opt_a]: 5.059e-05 [convert_after_rewriter]: 8.75001e-06 [order_py_execute_after_rewriter]: 7.5e-06 [mutable_eliminate]: 0.00079913 [opt_b]: 0.00033093, [1] [Cycle 1]: 0.00032177, [7] [b_1]: 0.00021434 [b_2]: 1.204e-05 [updatestate_depend_eliminate]: 7.83001e-06 [updatestate_assign_eliminate]: 4.08001e-06 [updatestate_loads_eliminate]: 3.88999e-06 [renormalize]: 9.80013e-07 [cse]: 3.525e-05 [optimize_parallel_all_gather_comm]: 2.14e-05 [overlap_param_gather]: 1.99999e-06 [cconv]: 3.601e-05 [loop_unroll]: 0.00050609 [opt_after_cconv]: 0.00022103, [1] [Cycle 1]: 0.00021189, [7] [c_1]: 5.188e-05 [parameter_eliminate]: 4.15999e-06 [updatestate_depend_eliminate]: 6.78e-06 [updatestate_assign_eliminate]: 3.63999e-06 [updatestate_loads_eliminate]: 3.81001e-06 [cse]: 2.952e-05 [renormalize]: 3.7998e-07 [remove_dup_value]: 5.686e-05 [tuple_transform]: 0.00012317, [1] [Cycle 1]: 0.00011669, [4] [d_1]: 8.064e-05 [none_parameter_eliminate]: 2.29001e-06 [renormalize]: 3.50003e-07 [switch_simplify]: 1.131e-05 [partial_unused_args_eliminate]: 2.12001e-06 [add_recomputation]: 7.88e-05 [cse_after_recomputation]: 3.479e-05, [1] [Cycle 1]: 2.949e-05, [1] [cse]: 2.27e-05 [environ_conv]: 7.54002e-06 [swap_dp_allreduce_reducescatter]: 8.07e-06 [bias_add_comm_swap]: 3.45e-06 [label_micro_interleaved_index]: 5.53002e-06 [label_fine_grained_interleaved_index]: 2.78e-06 [merge_cast_opt]: 1.67001e-06 [slice_recompute_activation]: 2.61e-06 [micro_interleaved_order_control]: 2.53e-06 [assign_add_opt]: 1.38002e-06 [ForceFp32Comm]: 9.5999e-07 [remove_cast_before_assign_add]: 1.47999e-06 [full_micro_interleaved_order_control]: 2.17999e-06 [reorder_send_recv_between_fp_bp]: 2.86999e-06 [comm_op_add_attrs]: 1.34e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.24e-06 [interleave_parallel_branches]: 1.17999e-06 [overlap_opt_shard_in_pipeline]: 1.29e-06 [overlap_opt_shard_grad_in_pipeline]: 2.04e-06 [control_data_broadcast_order]: 1.815e-05 [grouped_pairwise_exchange_alltoall]: 2.00002e-06 [offloading_packed_experts]: 6.63e-06 [overlap_recompute_and_grad_model_parallel]: 6.53e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.23002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.70001e-06 [overlap_recompute_comm]: 2.68e-06 [overlap_grad_ring_attention]: 5.12e-06 [overlap_grad_flash_sp]: 2.784e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 2.34999e-06 [split_layernorm_comm]: 2.06e-06 [handle_group_info]: 1.09e-06 [symbol_engine_optimizer]: 0.00011067, [1] [Cycle 1]: 0.0001054, [6] [build]: 3.93001e-06 [elim_shapecalc]: 1.774e-05 [elim_not_effective]: 2.069e-05 [opt_reshape]: 1.07e-05 [fold_const_symbol]: 1.594e-05 [renormalize]: 2.80008e-07 [detach_backward]: 2.39001e-06 [pipeline_parallel_scheduler]: 1.81e-06 [auto_monad_reorder]: 2.461e-05 [get_jit_bprop_graph]: 3.09999e-06 [rewriter_after_jit_bprop_graph]: 6.24001e-06 [opt_after_jit_grad]: 0.00060942 [validate]: 5.615e-05 Sums bootstrap : 0.000450s : 0.30% type_inference : 0.007402s : 4.99% event_method : 0.000024s : 0.02% auto_monad : 0.000083s : 0.06% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000045s : 0.03% insert-virtual-dataset : 0.000004s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000037s : 0.02% optimize.rewriter_before_opt_a : 0.000108s : 0.07% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000061s : 0.04% optimize.opt_a.loop_unroll : 0.000042s : 0.03% optimize.opt_a.a_1 : 0.001078s : 0.73% optimize.opt_a.with_stream_mark : 0.000045s : 0.03% optimize.opt_a.recompute_prepare : 0.000026s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000012s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000255s : 0.17% optimize.opt_a.accelerated_algorithm : 0.000040s : 0.03% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000019s : 0.01% optimize.opt_a.merge_send_recv : 0.000023s : 0.02% optimize.opt_a.auto_parallel : 0.000021s : 0.01% optimize.opt_a.parallel : 0.000036s : 0.02% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000011s : 0.01% optimize.opt_a.allreduce_fusion : 0.000010s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000029s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000025s : 0.02% optimize.opt_a.virtual_dataset : 0.000019s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.01% optimize.opt_a.virtual_output : 0.000018s : 0.01% optimize.opt_a.merge_forward : 0.000012s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% optimize.opt_a.offload_activation : 0.000027s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000033s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.01% optimize.opt_a.meta_fg_expand : 0.000008s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000009s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000031s : 0.02% optimize.opt_a.a_after_grad : 0.000030s : 0.02% optimize.opt_a.renormalize : 0.134680s : 90.87% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.02% optimize.opt_a.cse : 0.000078s : 0.05% optimize.opt_a.a_3 : 0.000147s : 0.10% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000051s : 0.03% optimize.convert_after_rewriter : 0.000009s : 0.01% optimize.order_py_execute_after_rewriter : 0.000007s : 0.01% optimize.mutable_eliminate : 0.000799s : 0.54% optimize.opt_b.b_1 : 0.000214s : 0.14% optimize.opt_b.b_2 : 0.000012s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000035s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000036s : 0.02% optimize.loop_unroll : 0.000506s : 0.34% optimize.opt_after_cconv.c_1 : 0.000052s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000030s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000057s : 0.04% optimize.tuple_transform.d_1 : 0.000081s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000079s : 0.05% optimize.cse_after_recomputation.cse : 0.000023s : 0.02% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000028s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000018s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000021s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000025s : 0.02% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000609s : 0.41% validate : 0.000056s : 0.04% Time group info: ------[substitution.] 0.000337 54 12.25% : 0.000041s : 4: substitution.arithmetic_simplify 13.76% : 0.000046s : 6: substitution.cast_eliminate 0.86% : 0.000003s : 4: substitution.elim_not_effective 0.58% : 0.000002s : 4: substitution.fold_const_symbol 2.36% : 0.000008s : 6: substitution.graph_param_transform 54.31% : 0.000183s : 4: substitution.inline 2.58% : 0.000009s : 8: substitution.j_node_and_user_rematch 4.69% : 0.000016s : 2: substitution.less_batch_normalization 2.40% : 0.000008s : 8: substitution.remove_not_recompute_node 1.93% : 0.000006s : 4: substitution.replace_old_param 4.30% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007325 2 87.02% : 0.006374s : 1: type_inference.infer 12.98% : 0.000951s : 1: type_inference.specialize ------[replace.] 0.000075 8 59.14% : 0.000044s : 4: replace.inline 40.86% : 0.000031s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000193 8 93.41% : 0.000180s : 4: match.inline 6.59% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000293 1730 0.89% : 0.000003s : 17: predicate.accumulaten_eliminater 0.99% : 0.000003s : 6: predicate.ad_related_special_op_eliminate 0.61% : 0.000002s : 12: predicate.addn_check_dump 0.91% : 0.000003s : 17: predicate.addn_zero_filter 0.74% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.88% : 0.000008s : 29: predicate.arithmetic_simplify 0.89% : 0.000003s : 17: predicate.cast_eliminate 0.66% : 0.000002s : 12: predicate.check_bprop_eliminate 0.57% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.78% : 0.000002s : 12: predicate.depend_value_elim 0.85% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.01% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.00% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 6: predicate.elim_not_effective 0.43% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.34% : 0.000004s : 23: predicate.environ_add_const_eliminate 1.04% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.03% : 0.000003s : 23: predicate.environ_get_depend_swap 1.76% : 0.000005s : 35: predicate.environ_get_eliminate 1.11% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.25% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.20% : 0.000006s : 25: predicate.float_depend_g_call 0.59% : 0.000002s : 12: predicate.float_environ_get_switch 0.88% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 6: predicate.fold_const_symbol 0.70% : 0.000002s : 12: predicate.get_grad_eliminate 0.24% : 0.000001s : 6: predicate.graph_param_transform 0.73% : 0.000002s : 12: predicate.incorporate_call 0.56% : 0.000002s : 12: predicate.incorporate_call_switch 6.20% : 0.000018s : 78: predicate.inline 0.80% : 0.000002s : 12: predicate.inline_without_move 0.37% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.95% : 0.000003s : 12: predicate.less_batch_normalization 1.93% : 0.000006s : 33: predicate.list_to_tuple_eliminator_ 2.32% : 0.000007s : 50: predicate.load_eliminater 0.83% : 0.000002s : 6: predicate.loop_unroll_after_grad 1.88% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.56% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.68% : 0.000002s : 12: predicate.merge_addn 0.59% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.60% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 17: predicate.minmaximum_grad 1.01% : 0.000003s : 6: predicate.mutable_eliminate 0.39% : 0.000001s : 6: predicate.opt_reshape 0.42% : 0.000001s : 6: predicate.parallel_virtual_node 1.64% : 0.000005s : 25: predicate.partial_defer_inline 1.50% : 0.000004s : 27: predicate.partial_eliminate 0.96% : 0.000003s : 17: predicate.print_const_string_wrapper 0.79% : 0.000002s : 12: predicate.reduce_all_const_elim 1.31% : 0.000004s : 17: predicate.reduce_eliminate 2.37% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 12: predicate.remove_not_recompute_node 1.28% : 0.000004s : 33: predicate.replace_applicator 0.58% : 0.000002s : 12: predicate.replace_old_param 0.25% : 0.000001s : 6: predicate.reset_defer_inline 0.95% : 0.000003s : 17: predicate.reshape_eliminate 0.61% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 6: predicate.row_tensor_eliminate 0.86% : 0.000003s : 12: predicate.same_eliminate 0.48% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.86% : 0.000003s : 12: predicate.shard_identity_eliminate 0.84% : 0.000002s : 12: predicate.special_op_eliminate 0.75% : 0.000002s : 12: predicate.specialize_transform 0.96% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.27% : 0.000004s : 25: predicate.switch_defer_inline 2.01% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.68% : 0.000014s : 81: predicate.switch_simplify 0.91% : 0.000003s : 17: predicate.tile_eliminate 0.87% : 0.000003s : 17: predicate.transpose_eliminate 1.69% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.72% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.42% : 0.000010s : 45: predicate.tuple_list_get_item_eliminator 1.57% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 2.39% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.93% : 0.000006s : 33: predicate.tuple_to_list_eliminator_ 2.26% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 2.94% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 6: predicate.value_based_eliminate 0.71% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 12: predicate.virtual_output_eliminate 0.37% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000781 11 46.16% : 0.000361s : 5: func_graph_cloner_run.FuncGraphClonerGraph 53.84% : 0.000421s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.434000 192 0.00% : 0.000004s : 1: ForceFp32Comm 0.85% : 0.003694s : 1: add_attr 0.85% : 0.003679s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.02% : 0.000083s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.02% : 0.000089s : 1: auto_monad 0.01% : 0.000029s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.11% : 0.000483s : 1: bootstrap 0.01% : 0.000040s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000022s : 1: control_data_broadcast_order 0.00% : 0.000012s : 1: convert_after_rewriter 0.01% : 0.000038s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000011s : 1: environ_conv 0.01% : 0.000031s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.12% : 0.000514s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.19% : 0.000809s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.00% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000022s : 1: opt.transform.mutable_eliminate 0.41% : 0.001768s : 78: opt.transform.opt_a 0.01% : 0.000050s : 1: opt.transform.opt_after_cconv 0.01% : 0.000044s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000191s : 28: opt.transform.opt_b 0.02% : 0.000089s : 2: opt.transform.opt_trans_graph 0.01% : 0.000061s : 4: opt.transform.symbol_engine_opt 31.70% : 0.137563s : 1: opt_a 0.05% : 0.000225s : 1: opt_after_cconv 0.14% : 0.000621s : 1: opt_after_jit_grad 0.08% : 0.000335s : 1: opt_b 32.37% : 0.140491s : 1: optimize 0.01% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000011s : 1: order_py_execute_after_rewriter 0.01% : 0.000031s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000050s : 1: pre_auto_parallel 0.01% : 0.000041s : 1: py_interpret_to_execute 0.01% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.01% : 0.000062s : 1: remove_dup_value 30.88% : 0.134008s : 1: renormalize.infer 0.15% : 0.000652s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000055s : 1: rewriter_after_opt_a 0.03% : 0.000112s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000006s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000114s : 1: symbol_engine_optimizer 0.03% : 0.000126s : 1: tuple_transform 1.71% : 0.007423s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:12.778.492 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:12.778.766 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.198087, [21] [bootstrap]: 0.00042098 [type_inference]: 0.182731 [event_method]: 2.184e-05 [auto_monad]: 7.76e-05 [graph_reusing]: 6.78e-06 [inline]: 2.97002e-06 [add_attr]: 0.00593574, [1] [add_attr_with_inline]: 0.00592123, [1] [Cycle 1]: 0.00010347, [2] [tag_attr]: 2.64e-05 [meta_addattr_fg_expand]: 7.06999e-06 [parallel-infer-symbol]: 3.78001e-06 [pre_auto_parallel]: 4.402e-05 [insert-virtual-dataset]: 2.53e-06 [parallel-infer-symbol-second]: 9.70002e-07 [dataset_repeat_opt]: 2.12999e-06 [pipeline_split]: 1.97001e-06 [optimize]: 0.00705962, [53] [py_interpret_to_execute]: 3.909e-05 [rewriter_before_opt_a]: 0.00010692 [opt_a]: 0.00404147, [2] [Cycle 1]: 0.00299959, [45] [expand_dump_flag]: 3.38e-06 [switch_simplify]: 4.583e-05 [loop_unroll]: 3.181e-05 [a_1]: 0.00085081 [with_stream_mark]: 2.354e-05 [recompute_prepare]: 1.356e-05 [updatestate_depend_eliminate]: 5.49998e-06 [updatestate_assign_eliminate]: 4.26001e-06 [updatestate_loads_eliminate]: 3.65998e-06 [parameter_eliminate]: 2.01e-06 [a_2]: 0.00014319 [accelerated_algorithm]: 2.323e-05 [shard]: 1.91e-06 [meta_shard_fg_expand]: 2.64999e-06 [shard_inline]: 8.16002e-06 [merge_send_recv]: 9.71e-06 [auto_parallel]: 9.05999e-06 [parallel]: 2.271e-05 [flash_sp]: 9.00001e-06 [merge_comm]: 4.70999e-06 [allreduce_fusion]: 4.15999e-06 [matmul_add_comm_reduction]: 1.319e-05 [allreduce_slice_to_reducescatter]: 8.30012e-07 [virtual_shard_identity]: 1.178e-05 [virtual_dataset]: 9.22999e-06 [get_grad_eliminate_]: 1.036e-05 [virtual_output]: 8.73001e-06 [merge_forward]: 4.53999e-06 [cell_reuse_recompute_pass]: 1.42999e-06 [offload_activation]: 1.156e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.028e-05 [merge_recompute_call_nodes]: 1.43002e-06 [before_grad]: 1.656e-05 [set_forward_comm_id_for_comm_node_pass]: 4.88001e-06 [meta_fg_expand]: 4.45e-06 [flash_sp_send_recv_attached]: 6.17001e-06 [receive_attached]: 2.31e-06 [after_resolve]: 1.546e-05 [a_after_grad]: 1.478e-05 [renormalize]: 0.00103786 [add_forward_monad_depend]: 6.59999e-06 [auto_monad_grad]: 2.79999e-06 [auto_monad_eliminator]: 2.107e-05 [cse]: 3.845e-05 [a_3]: 8.353e-05 [Cycle 2]: 0.00102501, [45] [expand_dump_flag]: 2.23998e-06 [switch_simplify]: 9.91998e-06 [loop_unroll]: 7.88999e-06 [a_1]: 0.0002072 [with_stream_mark]: 1.289e-05 [recompute_prepare]: 8.85999e-06 [updatestate_depend_eliminate]: 4.75001e-06 [updatestate_assign_eliminate]: 3.45998e-06 [updatestate_loads_eliminate]: 3.80998e-06 [parameter_eliminate]: 1.19e-06 [a_2]: 0.00013015 [accelerated_algorithm]: 1.19e-05 [shard]: 1.29e-06 [meta_shard_fg_expand]: 2.13002e-06 [shard_inline]: 7.63001e-06 [merge_send_recv]: 7.44002e-06 [auto_parallel]: 9.19e-06 [parallel]: 8.23999e-06 [flash_sp]: 3.91001e-06 [merge_comm]: 5.35999e-06 [allreduce_fusion]: 4.08001e-06 [matmul_add_comm_reduction]: 9.19e-06 [allreduce_slice_to_reducescatter]: 8.50006e-07 [virtual_shard_identity]: 8.69e-06 [virtual_dataset]: 8.89e-06 [get_grad_eliminate_]: 7.56999e-06 [virtual_output]: 7.33e-06 [merge_forward]: 4.3e-06 [cell_reuse_recompute_pass]: 2.70002e-06 [offload_activation]: 9.02999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.704e-05 [merge_recompute_call_nodes]: 1.05001e-06 [before_grad]: 1.261e-05 [set_forward_comm_id_for_comm_node_pass]: 5.46002e-06 [meta_fg_expand]: 3.46001e-06 [flash_sp_send_recv_attached]: 1.36002e-06 [receive_attached]: 1.84e-06 [after_resolve]: 1.434e-05 [a_after_grad]: 1.226e-05 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 1.83997e-06 [auto_monad_grad]: 1.66e-06 [auto_monad_eliminator]: 1.029e-05 [cse]: 1.951e-05 [a_3]: 6.08e-05 [py_interpret_to_execute_after_opt_a]: 3.374e-05 [slice_cell_reuse_recomputed_activation]: 5.74e-06 [rewriter_after_opt_a]: 4.959e-05 [convert_after_rewriter]: 1.051e-05 [order_py_execute_after_rewriter]: 9.41e-06 [mutable_eliminate]: 0.0007958 [opt_b]: 0.00037249, [1] [Cycle 1]: 0.00036121, [7] [b_1]: 0.00023937 [b_2]: 1.066e-05 [updatestate_depend_eliminate]: 7.2e-06 [updatestate_assign_eliminate]: 3.45e-06 [updatestate_loads_eliminate]: 3.41001e-06 [renormalize]: 3.89991e-07 [cse]: 2.517e-05 [optimize_parallel_all_gather_comm]: 2.236e-05 [overlap_param_gather]: 6.16e-06 [cconv]: 3.148e-05 [loop_unroll]: 0.00049399 [opt_after_cconv]: 0.00016544, [1] [Cycle 1]: 0.00015452, [7] [c_1]: 4.45e-05 [parameter_eliminate]: 2.61e-06 [updatestate_depend_eliminate]: 6.35002e-06 [updatestate_assign_eliminate]: 3.35e-06 [updatestate_loads_eliminate]: 3.26001e-06 [cse]: 2.363e-05 [renormalize]: 6.39993e-07 [remove_dup_value]: 2.134e-05 [tuple_transform]: 0.00011984, [1] [Cycle 1]: 0.00011068, [4] [d_1]: 6.294e-05 [none_parameter_eliminate]: 1.66e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 8.41002e-06 [partial_unused_args_eliminate]: 5.78002e-06 [add_recomputation]: 6.703e-05 [cse_after_recomputation]: 3.779e-05, [1] [Cycle 1]: 2.799e-05, [1] [cse]: 1.659e-05 [environ_conv]: 1.047e-05 [swap_dp_allreduce_reducescatter]: 1.006e-05 [bias_add_comm_swap]: 6.69999e-06 [label_micro_interleaved_index]: 3.589e-05 [label_fine_grained_interleaved_index]: 7.01001e-06 [merge_cast_opt]: 4.91002e-06 [slice_recompute_activation]: 5.90002e-06 [micro_interleaved_order_control]: 6.20002e-06 [assign_add_opt]: 4.18999e-06 [ForceFp32Comm]: 4.57998e-06 [remove_cast_before_assign_add]: 4.37998e-06 [full_micro_interleaved_order_control]: 5.07e-06 [reorder_send_recv_between_fp_bp]: 6.04001e-06 [comm_op_add_attrs]: 4.47e-06 [add_comm_op_reuse_tag]: 4.52e-06 [interleave_split_concat_branches]: 4.51002e-06 [interleave_parallel_branches]: 4.39002e-06 [overlap_opt_shard_in_pipeline]: 4.55001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.73001e-06 [control_data_broadcast_order]: 1.91e-05 [grouped_pairwise_exchange_alltoall]: 5.09e-06 [offloading_packed_experts]: 9.38997e-06 [overlap_recompute_and_grad_model_parallel]: 9.35001e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.71002e-06 [overlap_recompute_allgather_and_fa_grad]: 4.75001e-06 [overlap_recompute_comm]: 5.74e-06 [overlap_grad_ring_attention]: 9.40001e-06 [overlap_grad_flash_sp]: 2.929e-05 [begin_end_overlap_inline]: 4.62998e-06 [split_matmul_comm_elemetwise]: 5.61e-06 [split_layernorm_comm]: 5.00999e-06 [handle_group_info]: 4.26001e-06 [symbol_engine_optimizer]: 0.00013731, [1] [Cycle 1]: 0.00012825, [6] [build]: 4.90001e-06 [elim_shapecalc]: 1.662e-05 [elim_not_effective]: 1.898e-05 [opt_reshape]: 1.147e-05 [fold_const_symbol]: 1.358e-05 [renormalize]: 2.00002e-07 [detach_backward]: 4.90001e-06 [pipeline_parallel_scheduler]: 2.16998e-06 [auto_monad_reorder]: 2.801e-05 [get_jit_bprop_graph]: 2.04e-06 [rewriter_after_jit_bprop_graph]: 5.89e-06 [opt_after_jit_grad]: 0.00058657 [validate]: 4.496e-05 Sums bootstrap : 0.000421s : 0.22% type_inference : 0.182731s : 96.30% event_method : 0.000022s : 0.01% auto_monad : 0.000078s : 0.04% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000044s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000039s : 0.02% optimize.rewriter_before_opt_a : 0.000107s : 0.06% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000056s : 0.03% optimize.opt_a.loop_unroll : 0.000040s : 0.02% optimize.opt_a.a_1 : 0.001058s : 0.56% optimize.opt_a.with_stream_mark : 0.000036s : 0.02% optimize.opt_a.recompute_prepare : 0.000022s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000273s : 0.14% optimize.opt_a.accelerated_algorithm : 0.000035s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000016s : 0.01% optimize.opt_a.merge_send_recv : 0.000017s : 0.01% optimize.opt_a.auto_parallel : 0.000018s : 0.01% optimize.opt_a.parallel : 0.000031s : 0.02% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000010s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.01% optimize.opt_a.virtual_dataset : 0.000018s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.01% optimize.opt_a.virtual_output : 0.000016s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000021s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000029s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.01% optimize.opt_a.meta_fg_expand : 0.000008s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000008s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000030s : 0.02% optimize.opt_a.a_after_grad : 0.000027s : 0.01% optimize.opt_a.renormalize : 0.001038s : 0.55% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.02% optimize.opt_a.cse : 0.000058s : 0.03% optimize.opt_a.a_3 : 0.000144s : 0.08% optimize.py_interpret_to_execute_after_opt_a : 0.000034s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.00% optimize.rewriter_after_opt_a : 0.000050s : 0.03% optimize.convert_after_rewriter : 0.000011s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.00% optimize.mutable_eliminate : 0.000796s : 0.42% optimize.opt_b.b_1 : 0.000239s : 0.13% optimize.opt_b.b_2 : 0.000011s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000025s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.01% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.000031s : 0.02% optimize.loop_unroll : 0.000494s : 0.26% optimize.opt_after_cconv.c_1 : 0.000045s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000021s : 0.01% optimize.tuple_transform.d_1 : 0.000063s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000006s : 0.00% optimize.add_recomputation : 0.000067s : 0.04% optimize.cse_after_recomputation.cse : 0.000017s : 0.01% optimize.environ_conv : 0.000010s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.01% optimize.bias_add_comm_swap : 0.000007s : 0.00% optimize.label_micro_interleaved_index : 0.000036s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000007s : 0.00% optimize.merge_cast_opt : 0.000005s : 0.00% optimize.slice_recompute_activation : 0.000006s : 0.00% optimize.micro_interleaved_order_control : 0.000006s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000005s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000005s : 0.00% optimize.interleave_split_concat_branches : 0.000005s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000005s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000019s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.00% optimize.offloading_packed_experts : 0.000009s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000005s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000005s : 0.00% optimize.overlap_recompute_comm : 0.000006s : 0.00% optimize.overlap_grad_ring_attention : 0.000009s : 0.00% optimize.overlap_grad_flash_sp : 0.000029s : 0.02% optimize.begin_end_overlap_inline : 0.000005s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000006s : 0.00% optimize.split_layernorm_comm : 0.000005s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000028s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000587s : 0.31% validate : 0.000045s : 0.02% Time group info: ------[substitution.] 0.000294 45 12.81% : 0.000038s : 5: substitution.arithmetic_simplify 9.49% : 0.000028s : 3: substitution.cast_eliminate 0.81% : 0.000002s : 3: substitution.elim_not_effective 0.59% : 0.000002s : 3: substitution.fold_const_symbol 2.36% : 0.000007s : 5: substitution.graph_param_transform 56.59% : 0.000167s : 4: substitution.inline 1.97% : 0.000006s : 6: substitution.j_node_and_user_rematch 4.39% : 0.000013s : 2: substitution.less_batch_normalization 2.52% : 0.000007s : 6: substitution.remove_not_recompute_node 2.47% : 0.000007s : 4: substitution.replace_old_param 6.00% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.182669 2 99.47% : 0.181696s : 1: type_inference.infer 0.53% : 0.000973s : 1: type_inference.specialize ------[replace.] 0.000068 8 59.69% : 0.000041s : 4: replace.inline 40.31% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000179 8 91.23% : 0.000164s : 4: match.inline 8.77% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000273 1596 1.02% : 0.000003s : 17: predicate.accumulaten_eliminater 0.83% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.55% : 0.000002s : 10: predicate.addn_check_dump 0.91% : 0.000002s : 17: predicate.addn_zero_filter 0.92% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.46% : 0.000007s : 27: predicate.arithmetic_simplify 0.95% : 0.000003s : 17: predicate.cast_eliminate 0.57% : 0.000002s : 10: predicate.check_bprop_eliminate 0.72% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.60% : 0.000002s : 10: predicate.depend_value_elim 1.02% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.08% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.92% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.94% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.28% : 0.000001s : 5: predicate.elim_not_effective 0.44% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.06% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_depend_swap 1.74% : 0.000005s : 32: predicate.environ_get_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.39% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.17% : 0.000006s : 25: predicate.float_depend_g_call 0.57% : 0.000002s : 10: predicate.float_environ_get_switch 0.81% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 5: predicate.fold_const_symbol 0.71% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.59% : 0.000002s : 10: predicate.incorporate_call 0.47% : 0.000001s : 10: predicate.incorporate_call_switch 6.14% : 0.000017s : 72: predicate.inline 0.69% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.97% : 0.000003s : 10: predicate.less_batch_normalization 1.81% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.59% : 0.000007s : 48: predicate.load_eliminater 0.83% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.90% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.57% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.58% : 0.000002s : 10: predicate.merge_addn 0.74% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 17: predicate.minmaximum_grad 1.02% : 0.000003s : 5: predicate.mutable_eliminate 0.40% : 0.000001s : 5: predicate.opt_reshape 0.34% : 0.000001s : 5: predicate.parallel_virtual_node 1.78% : 0.000005s : 25: predicate.partial_defer_inline 1.62% : 0.000004s : 26: predicate.partial_eliminate 0.95% : 0.000003s : 17: predicate.print_const_string_wrapper 0.56% : 0.000002s : 10: predicate.reduce_all_const_elim 1.34% : 0.000004s : 17: predicate.reduce_eliminate 2.53% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.59% : 0.000002s : 10: predicate.remove_not_recompute_node 1.38% : 0.000004s : 31: predicate.replace_applicator 0.46% : 0.000001s : 10: predicate.replace_old_param 0.26% : 0.000001s : 5: predicate.reset_defer_inline 0.98% : 0.000003s : 17: predicate.reshape_eliminate 0.62% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 5: predicate.row_tensor_eliminate 0.79% : 0.000002s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.73% : 0.000002s : 10: predicate.shard_identity_eliminate 0.73% : 0.000002s : 10: predicate.special_op_eliminate 0.70% : 0.000002s : 10: predicate.specialize_transform 0.84% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.59% : 0.000004s : 25: predicate.switch_defer_inline 1.97% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.65% : 0.000013s : 76: predicate.switch_simplify 0.89% : 0.000002s : 17: predicate.tile_eliminate 1.05% : 0.000003s : 17: predicate.transpose_eliminate 1.57% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.21% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.75% : 0.000005s : 27: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.83% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.52% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.23% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.61% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.63% : 0.000002s : 10: predicate.virtual_output_eliminate 0.31% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000679 11 46.71% : 0.000317s : 5: func_graph_cloner_run.FuncGraphClonerGraph 53.29% : 0.000362s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.214021 192 0.00% : 0.000008s : 1: ForceFp32Comm 2.78% : 0.005948s : 1: add_attr 2.77% : 0.005926s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.03% : 0.000071s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.04% : 0.000088s : 1: auto_monad 0.02% : 0.000037s : 1: auto_monad_reorder 0.00% : 0.000008s : 1: begin_end_overlap_inline 0.00% : 0.000010s : 1: bias_add_comm_swap 0.22% : 0.000468s : 1: bootstrap 0.02% : 0.000035s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000022s : 1: control_data_broadcast_order 0.01% : 0.000013s : 1: convert_after_rewriter 0.02% : 0.000041s : 1: cse_after_recomputation 0.00% : 0.000007s : 1: dataset_repeat_opt 0.01% : 0.000026s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.02% : 0.000032s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.00% : 0.000010s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000010s : 1: label_fine_grained_interleaved_index 0.02% : 0.000040s : 1: label_micro_interleaved_index 0.23% : 0.000501s : 1: loop_unroll 0.00% : 0.000008s : 1: merge_cast_opt 0.00% : 0.000009s : 1: micro_interleaved_order_control 0.38% : 0.000803s : 1: mutable_eliminate 0.01% : 0.000013s : 1: offloading_packed_experts 0.01% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000019s : 1: opt.transform.mutable_eliminate 0.77% : 0.001651s : 78: opt.transform.opt_a 0.02% : 0.000043s : 1: opt.transform.opt_after_cconv 0.02% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000159s : 28: opt.transform.opt_b 0.03% : 0.000069s : 2: opt.transform.opt_trans_graph 0.03% : 0.000056s : 4: opt.transform.symbol_engine_opt 1.89% : 0.004046s : 1: opt_a 0.08% : 0.000169s : 1: opt_after_cconv 0.28% : 0.000597s : 1: opt_after_jit_grad 0.18% : 0.000377s : 1: opt_b 3.68% : 0.007876s : 1: optimize 0.01% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.02% : 0.000033s : 1: overlap_grad_flash_sp 0.00% : 0.000008s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000013s : 1: overlap_grad_ring_attention 0.00% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000008s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000010s : 1: overlap_param_gather 0.00% : 0.000008s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000013s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000009s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000010s : 1: partial_unused_args_eliminate 0.01% : 0.000011s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.02% : 0.000051s : 1: pre_auto_parallel 0.02% : 0.000042s : 1: py_interpret_to_execute 0.02% : 0.000038s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000008s : 1: remove_cast_before_assign_add 0.01% : 0.000025s : 1: remove_dup_value 0.26% : 0.000564s : 1: renormalize.infer 0.22% : 0.000463s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000053s : 1: rewriter_after_opt_a 0.05% : 0.000111s : 1: rewriter_before_opt_a 0.00% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000009s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000009s : 1: split_matmul_comm_elemetwise 0.01% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000140s : 1: symbol_engine_optimizer 0.06% : 0.000123s : 1: tuple_transform 85.40% : 0.182782s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:15.346.346 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.113175, [21] [bootstrap]: 0.00078274 [type_inference]: 0.100879 [event_method]: 1.772e-05 [auto_monad]: 5.937e-05 [graph_reusing]: 4.72e-06 [inline]: 2.12999e-06 [add_attr]: 0.0037305, [1] [add_attr_with_inline]: 0.00371993, [1] [Cycle 1]: 6.102e-05, [2] [tag_attr]: 2.088e-05 [meta_addattr_fg_expand]: 5.84e-06 [parallel-infer-symbol]: 4.57e-06 [pre_auto_parallel]: 3.596e-05 [insert-virtual-dataset]: 3.15998e-06 [parallel-infer-symbol-second]: 8.70001e-07 [dataset_repeat_opt]: 2.31e-06 [pipeline_split]: 1.84e-06 [optimize]: 0.00677574, [53] [py_interpret_to_execute]: 3.193e-05 [rewriter_before_opt_a]: 9.28e-05 [opt_a]: 0.00385319, [2] [Cycle 1]: 0.00285717, [45] [expand_dump_flag]: 3.65998e-06 [switch_simplify]: 4.598e-05 [loop_unroll]: 3.369e-05 [a_1]: 0.00085236 [with_stream_mark]: 1.981e-05 [recompute_prepare]: 1.141e-05 [updatestate_depend_eliminate]: 5.88998e-06 [updatestate_assign_eliminate]: 4.2e-06 [updatestate_loads_eliminate]: 3.56001e-06 [parameter_eliminate]: 2.24001e-06 [a_2]: 0.00011986 [accelerated_algorithm]: 2.34e-05 [shard]: 2.32001e-06 [meta_shard_fg_expand]: 2.82002e-06 [shard_inline]: 9.52001e-06 [merge_send_recv]: 9.56e-06 [auto_parallel]: 8.92999e-06 [parallel]: 2.224e-05 [flash_sp]: 9.07999e-06 [merge_comm]: 5.16002e-06 [allreduce_fusion]: 6.04001e-06 [matmul_add_comm_reduction]: 1.341e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 1.254e-05 [virtual_dataset]: 9.67001e-06 [get_grad_eliminate_]: 8.87e-06 [virtual_output]: 8.47e-06 [merge_forward]: 6.39001e-06 [cell_reuse_recompute_pass]: 1.87999e-06 [offload_activation]: 1.237e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.743e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.45e-05 [set_forward_comm_id_for_comm_node_pass]: 5.78997e-06 [meta_fg_expand]: 3.98001e-06 [flash_sp_send_recv_attached]: 5.89999e-06 [receive_attached]: 2.89001e-06 [after_resolve]: 1.726e-05 [a_after_grad]: 1.448e-05 [renormalize]: 0.00106499 [add_forward_monad_depend]: 6.71e-06 [auto_monad_grad]: 2.97002e-06 [auto_monad_eliminator]: 1.921e-05 [cse]: 4.345e-05 [a_3]: 6.825e-05 [Cycle 2]: 0.00098307, [45] [expand_dump_flag]: 2.29999e-06 [switch_simplify]: 1.078e-05 [loop_unroll]: 8.77e-06 [a_1]: 0.00021139 [with_stream_mark]: 1.272e-05 [recompute_prepare]: 9.41e-06 [updatestate_depend_eliminate]: 4.48001e-06 [updatestate_assign_eliminate]: 3.37002e-06 [updatestate_loads_eliminate]: 3.7e-06 [parameter_eliminate]: 1.50001e-06 [a_2]: 0.00017107 [accelerated_algorithm]: 1.374e-05 [shard]: 1.62999e-06 [meta_shard_fg_expand]: 2.07999e-06 [shard_inline]: 8.50001e-06 [merge_send_recv]: 1.236e-05 [auto_parallel]: 8.92e-06 [parallel]: 6.69999e-06 [flash_sp]: 4.94e-06 [merge_comm]: 7.31001e-06 [allreduce_fusion]: 4.08001e-06 [matmul_add_comm_reduction]: 8.90999e-06 [allreduce_slice_to_reducescatter]: 5.3001e-07 [virtual_shard_identity]: 1.13e-05 [virtual_dataset]: 1.109e-05 [get_grad_eliminate_]: 8.22998e-06 [virtual_output]: 7.77e-06 [merge_forward]: 4.47e-06 [cell_reuse_recompute_pass]: 2.07001e-06 [offload_activation]: 9.72001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.609e-05 [merge_recompute_call_nodes]: 1.17e-06 [before_grad]: 1.3e-05 [set_forward_comm_id_for_comm_node_pass]: 5.79999e-06 [meta_fg_expand]: 3.65e-06 [flash_sp_send_recv_attached]: 1.38002e-06 [receive_attached]: 1.92001e-06 [after_resolve]: 1.599e-05 [a_after_grad]: 1.282e-05 [renormalize]: 1.19995e-07 [add_forward_monad_depend]: 1.77999e-06 [auto_monad_grad]: 1.82001e-06 [auto_monad_eliminator]: 1.188e-05 [cse]: 2.579e-05 [a_3]: 5.167e-05 [py_interpret_to_execute_after_opt_a]: 3.298e-05 [slice_cell_reuse_recomputed_activation]: 2.31e-06 [rewriter_after_opt_a]: 5.191e-05 [convert_after_rewriter]: 8.52e-06 [order_py_execute_after_rewriter]: 7.44002e-06 [mutable_eliminate]: 0.00082585 [opt_b]: 0.00033352, [1] [Cycle 1]: 0.00032305, [7] [b_1]: 0.00020037 [b_2]: 1.351e-05 [updatestate_depend_eliminate]: 1.065e-05 [updatestate_assign_eliminate]: 3.91999e-06 [updatestate_loads_eliminate]: 3.80998e-06 [renormalize]: 1.14e-06 [cse]: 3.062e-05 [optimize_parallel_all_gather_comm]: 2.335e-05 [overlap_param_gather]: 2.41e-06 [cconv]: 3.281e-05 [loop_unroll]: 0.00058155 [opt_after_cconv]: 0.00015007, [1] [Cycle 1]: 0.00014196, [7] [c_1]: 4.748e-05 [parameter_eliminate]: 4.30999e-06 [updatestate_depend_eliminate]: 7.87e-06 [updatestate_assign_eliminate]: 3.78999e-06 [updatestate_loads_eliminate]: 3.46999e-06 [cse]: 2.804e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.982e-05 [tuple_transform]: 0.00011326, [1] [Cycle 1]: 0.00010724, [4] [d_1]: 7.049e-05 [none_parameter_eliminate]: 2.31e-06 [renormalize]: 3.19997e-07 [switch_simplify]: 9.99999e-06 [partial_unused_args_eliminate]: 2.54999e-06 [add_recomputation]: 7.218e-05 [cse_after_recomputation]: 3.288e-05, [1] [Cycle 1]: 2.684e-05, [1] [cse]: 1.936e-05 [environ_conv]: 7.26999e-06 [swap_dp_allreduce_reducescatter]: 7.68999e-06 [bias_add_comm_swap]: 2.93998e-06 [label_micro_interleaved_index]: 5.94e-06 [label_fine_grained_interleaved_index]: 3.65998e-06 [merge_cast_opt]: 1.69998e-06 [slice_recompute_activation]: 2.59001e-06 [micro_interleaved_order_control]: 2.53e-06 [assign_add_opt]: 1.42e-06 [ForceFp32Comm]: 8.90024e-07 [remove_cast_before_assign_add]: 1.17999e-06 [full_micro_interleaved_order_control]: 2.36998e-06 [reorder_send_recv_between_fp_bp]: 3.11999e-06 [comm_op_add_attrs]: 1.17e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.77999e-06 [interleave_parallel_branches]: 1.28002e-06 [overlap_opt_shard_in_pipeline]: 1.27e-06 [overlap_opt_shard_grad_in_pipeline]: 2.53e-06 [control_data_broadcast_order]: 1.808e-05 [grouped_pairwise_exchange_alltoall]: 1.67999e-06 [offloading_packed_experts]: 5.04e-06 [overlap_recompute_and_grad_model_parallel]: 6.99001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.50999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.62999e-06 [overlap_recompute_comm]: 2.63003e-06 [overlap_grad_ring_attention]: 5.04e-06 [overlap_grad_flash_sp]: 2.574e-05 [begin_end_overlap_inline]: 6.39993e-07 [split_matmul_comm_elemetwise]: 2.36e-06 [split_layernorm_comm]: 2.26e-06 [handle_group_info]: 1.02e-06 [symbol_engine_optimizer]: 0.00010908, [1] [Cycle 1]: 0.00010398, [6] [build]: 4.72e-06 [elim_shapecalc]: 1.48e-05 [elim_not_effective]: 1.747e-05 [opt_reshape]: 9.59e-06 [fold_const_symbol]: 1.641e-05 [renormalize]: 2.09984e-07 [detach_backward]: 2.19001e-06 [pipeline_parallel_scheduler]: 1.99e-06 [auto_monad_reorder]: 2.237e-05 [get_jit_bprop_graph]: 1.87001e-06 [rewriter_after_jit_bprop_graph]: 5.57001e-06 [opt_after_jit_grad]: 0.00061245 [validate]: 5.08e-05 Sums bootstrap : 0.000783s : 0.72% type_inference : 0.100879s : 93.18% event_method : 0.000018s : 0.02% auto_monad : 0.000059s : 0.05% graph_reusing : 0.000005s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000005s : 0.00% pre_auto_parallel : 0.000036s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000032s : 0.03% optimize.rewriter_before_opt_a : 0.000093s : 0.09% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000057s : 0.05% optimize.opt_a.loop_unroll : 0.000042s : 0.04% optimize.opt_a.a_1 : 0.001064s : 0.98% optimize.opt_a.with_stream_mark : 0.000033s : 0.03% optimize.opt_a.recompute_prepare : 0.000021s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000291s : 0.27% optimize.opt_a.accelerated_algorithm : 0.000037s : 0.03% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000018s : 0.02% optimize.opt_a.merge_send_recv : 0.000022s : 0.02% optimize.opt_a.auto_parallel : 0.000018s : 0.02% optimize.opt_a.parallel : 0.000029s : 0.03% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000012s : 0.01% optimize.opt_a.allreduce_fusion : 0.000010s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.02% optimize.opt_a.virtual_dataset : 0.000021s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.02% optimize.opt_a.virtual_output : 0.000016s : 0.02% optimize.opt_a.merge_forward : 0.000011s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000022s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000028s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.01% optimize.opt_a.meta_fg_expand : 0.000008s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000033s : 0.03% optimize.opt_a.a_after_grad : 0.000027s : 0.03% optimize.opt_a.renormalize : 0.001065s : 0.98% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.03% optimize.opt_a.cse : 0.000069s : 0.06% optimize.opt_a.a_3 : 0.000120s : 0.11% optimize.py_interpret_to_execute_after_opt_a : 0.000033s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000052s : 0.05% optimize.convert_after_rewriter : 0.000009s : 0.01% optimize.order_py_execute_after_rewriter : 0.000007s : 0.01% optimize.mutable_eliminate : 0.000826s : 0.76% optimize.opt_b.b_1 : 0.000200s : 0.19% optimize.opt_b.b_2 : 0.000014s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000031s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000033s : 0.03% optimize.loop_unroll : 0.000582s : 0.54% optimize.opt_after_cconv.c_1 : 0.000047s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000028s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.02% optimize.tuple_transform.d_1 : 0.000070s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.01% optimize.partial_unused_args_eliminate : 0.000003s : 0.00% optimize.add_recomputation : 0.000072s : 0.07% optimize.cse_after_recomputation.cse : 0.000019s : 0.02% optimize.environ_conv : 0.000007s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000004s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000026s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000612s : 0.57% validate : 0.000051s : 0.05% Time group info: ------[substitution.] 0.000282 45 12.07% : 0.000034s : 5: substitution.arithmetic_simplify 9.33% : 0.000026s : 3: substitution.cast_eliminate 0.90% : 0.000003s : 3: substitution.elim_not_effective 0.65% : 0.000002s : 3: substitution.fold_const_symbol 2.85% : 0.000008s : 5: substitution.graph_param_transform 57.19% : 0.000161s : 4: substitution.inline 1.56% : 0.000004s : 6: substitution.j_node_and_user_rematch 4.64% : 0.000013s : 2: substitution.less_batch_normalization 2.66% : 0.000008s : 6: substitution.remove_not_recompute_node 2.31% : 0.000007s : 4: substitution.replace_old_param 5.83% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.100828 2 99.05% : 0.099871s : 1: type_inference.infer 0.95% : 0.000957s : 1: type_inference.specialize ------[replace.] 0.000070 8 58.84% : 0.000041s : 4: replace.inline 41.16% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000173 8 91.68% : 0.000158s : 4: match.inline 8.32% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000287 1596 0.92% : 0.000003s : 17: predicate.accumulaten_eliminater 0.94% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 10: predicate.addn_check_dump 1.02% : 0.000003s : 17: predicate.addn_zero_filter 0.89% : 0.000003s : 17: predicate.adjust_all_reduce_mul_add 2.54% : 0.000007s : 27: predicate.arithmetic_simplify 0.95% : 0.000003s : 17: predicate.cast_eliminate 0.61% : 0.000002s : 10: predicate.check_bprop_eliminate 0.79% : 0.000002s : 10: predicate.compare_switch_simplify 0.20% : 0.000001s : 5: predicate.const_output_eliminate 0.55% : 0.000002s : 10: predicate.depend_value_elim 0.96% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.19% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.95% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.93% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 0.44% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.28% : 0.000004s : 22: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 22: predicate.environ_get_depend_swap 1.74% : 0.000005s : 32: predicate.environ_get_eliminate 1.23% : 0.000004s : 22: predicate.environ_get_set_eliminate 1.44% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.36% : 0.000007s : 25: predicate.float_depend_g_call 0.48% : 0.000001s : 10: predicate.float_environ_get_switch 0.80% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 5: predicate.fold_const_symbol 0.67% : 0.000002s : 10: predicate.get_grad_eliminate 0.28% : 0.000001s : 5: predicate.graph_param_transform 0.60% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.32% : 0.000018s : 72: predicate.inline 1.00% : 0.000003s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.08% : 0.000003s : 10: predicate.less_batch_normalization 1.71% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.54% : 0.000007s : 48: predicate.load_eliminater 1.22% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.92% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.69% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.53% : 0.000002s : 10: predicate.merge_addn 0.62% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.54% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 17: predicate.minmaximum_grad 1.15% : 0.000003s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.35% : 0.000001s : 5: predicate.parallel_virtual_node 1.66% : 0.000005s : 25: predicate.partial_defer_inline 1.66% : 0.000005s : 26: predicate.partial_eliminate 0.93% : 0.000003s : 17: predicate.print_const_string_wrapper 0.56% : 0.000002s : 10: predicate.reduce_all_const_elim 1.17% : 0.000003s : 17: predicate.reduce_eliminate 2.53% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 10: predicate.remove_not_recompute_node 1.31% : 0.000004s : 31: predicate.replace_applicator 0.51% : 0.000001s : 10: predicate.replace_old_param 0.23% : 0.000001s : 5: predicate.reset_defer_inline 0.96% : 0.000003s : 17: predicate.reshape_eliminate 0.64% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 5: predicate.row_tensor_eliminate 0.74% : 0.000002s : 10: predicate.same_eliminate 0.40% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 10: predicate.shard_identity_eliminate 0.72% : 0.000002s : 10: predicate.special_op_eliminate 0.68% : 0.000002s : 10: predicate.specialize_transform 0.79% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.73% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.42% : 0.000004s : 25: predicate.switch_defer_inline 1.94% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.50% : 0.000013s : 76: predicate.switch_simplify 0.90% : 0.000003s : 17: predicate.tile_eliminate 0.88% : 0.000003s : 17: predicate.transpose_eliminate 1.58% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.39% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.22% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.45% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.39% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.69% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.53% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.26% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 5: predicate.value_based_eliminate 0.66% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.59% : 0.000002s : 10: predicate.virtual_output_eliminate 0.38% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001135 11 64.54% : 0.000732s : 5: func_graph_cloner_run.FuncGraphClonerGraph 35.46% : 0.000402s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.126768 192 0.00% : 0.000004s : 1: ForceFp32Comm 2.95% : 0.003736s : 1: add_attr 2.94% : 0.003725s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.06% : 0.000077s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.05% : 0.000066s : 1: auto_monad 0.02% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.65% : 0.000829s : 1: bootstrap 0.03% : 0.000037s : 1: cconv 0.00% : 0.000005s : 1: comm_op_add_attrs 0.02% : 0.000022s : 1: control_data_broadcast_order 0.01% : 0.000013s : 1: convert_after_rewriter 0.03% : 0.000036s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.02% : 0.000024s : 1: event_method 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000008s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.47% : 0.000593s : 1: loop_unroll 0.00% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 0.66% : 0.000836s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.02% : 0.000022s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000023s : 1: opt.transform.mutable_eliminate 1.37% : 0.001734s : 78: opt.transform.opt_a 0.04% : 0.000045s : 1: opt.transform.opt_after_cconv 0.03% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.14% : 0.000175s : 28: opt.transform.opt_b 0.06% : 0.000078s : 2: opt.transform.opt_trans_graph 0.04% : 0.000054s : 4: opt.transform.symbol_engine_opt 3.04% : 0.003857s : 1: opt_a 0.12% : 0.000154s : 1: opt_after_cconv 0.49% : 0.000623s : 1: opt_after_jit_grad 0.27% : 0.000339s : 1: opt_b 5.35% : 0.006782s : 1: optimize 0.02% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000011s : 1: order_py_execute_after_rewriter 0.02% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000040s : 1: pre_auto_parallel 0.03% : 0.000036s : 1: py_interpret_to_execute 0.03% : 0.000038s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.02% : 0.000025s : 1: remove_dup_value 0.47% : 0.000591s : 1: renormalize.infer 0.37% : 0.000463s : 1: renormalize.specialize 0.01% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000058s : 1: rewriter_after_opt_a 0.08% : 0.000097s : 1: rewriter_before_opt_a 0.01% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000112s : 1: symbol_engine_optimizer 0.09% : 0.000117s : 1: tuple_transform 79.59% : 0.100895s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:17.803.312 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:17.803.578 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.174275, [21] [bootstrap]: 0.00045752 [type_inference]: 0.0241774 [event_method]: 2.733e-05 [auto_monad]: 9.167e-05 [graph_reusing]: 8.05e-06 [inline]: 3.95e-06 [add_attr]: 0.00431776, [1] [add_attr_with_inline]: 0.00430354, [1] [Cycle 1]: 0.00010938, [2] [tag_attr]: 2.893e-05 [meta_addattr_fg_expand]: 7.8e-06 [parallel-infer-symbol]: 4.82998e-06 [pre_auto_parallel]: 4.888e-05 [insert-virtual-dataset]: 2.93e-06 [parallel-infer-symbol-second]: 9.5999e-07 [dataset_repeat_opt]: 2.63e-06 [pipeline_split]: 2.17999e-06 [optimize]: 0.143324, [53] [py_interpret_to_execute]: 4.417e-05 [rewriter_before_opt_a]: 0.00011586 [opt_a]: 0.140035, [2] [Cycle 1]: 0.138891, [45] [expand_dump_flag]: 4.21001e-06 [switch_simplify]: 4.909e-05 [loop_unroll]: 3.727e-05 [a_1]: 0.00090587 [with_stream_mark]: 2.361e-05 [recompute_prepare]: 1.356e-05 [updatestate_depend_eliminate]: 6.33e-06 [updatestate_assign_eliminate]: 4.60999e-06 [updatestate_loads_eliminate]: 4.06001e-06 [parameter_eliminate]: 2.27999e-06 [a_2]: 0.00016064 [accelerated_algorithm]: 2.589e-05 [shard]: 2.21e-06 [meta_shard_fg_expand]: 3.01999e-06 [shard_inline]: 9.46e-06 [merge_send_recv]: 1.038e-05 [auto_parallel]: 9.81e-06 [parallel]: 2.134e-05 [flash_sp]: 1.044e-05 [merge_comm]: 4.92e-06 [allreduce_fusion]: 4.40999e-06 [matmul_add_comm_reduction]: 1.349e-05 [allreduce_slice_to_reducescatter]: 1.00999e-06 [virtual_shard_identity]: 1.059e-05 [virtual_dataset]: 1.076e-05 [get_grad_eliminate_]: 8.18999e-06 [virtual_output]: 9.39e-06 [merge_forward]: 5.71998e-06 [cell_reuse_recompute_pass]: 1.92999e-06 [offload_activation]: 1.317e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.086e-05 [merge_recompute_call_nodes]: 2.19999e-06 [before_grad]: 1.568e-05 [set_forward_comm_id_for_comm_node_pass]: 5.27001e-06 [meta_fg_expand]: 4.54002e-06 [flash_sp_send_recv_attached]: 5.44998e-06 [receive_attached]: 2.41e-06 [after_resolve]: 1.645e-05 [a_after_grad]: 1.439e-05 [renormalize]: 0.136744 [add_forward_monad_depend]: 1.286e-05 [auto_monad_grad]: 2.55002e-06 [auto_monad_eliminator]: 2.921e-05 [cse]: 4.507e-05 [a_3]: 9.92e-05 [Cycle 2]: 0.0011246, [45] [expand_dump_flag]: 2.95998e-06 [switch_simplify]: 1.203e-05 [loop_unroll]: 8.75001e-06 [a_1]: 0.00022957 [with_stream_mark]: 1.987e-05 [recompute_prepare]: 9.11002e-06 [updatestate_depend_eliminate]: 5.70001e-06 [updatestate_assign_eliminate]: 4.11001e-06 [updatestate_loads_eliminate]: 3.8e-06 [parameter_eliminate]: 2.12001e-06 [a_2]: 0.00013468 [accelerated_algorithm]: 1.4e-05 [shard]: 2.14e-06 [meta_shard_fg_expand]: 2.46e-06 [shard_inline]: 9.84001e-06 [merge_send_recv]: 1.037e-05 [auto_parallel]: 1.127e-05 [parallel]: 1.05e-05 [flash_sp]: 4.95999e-06 [merge_comm]: 5.01002e-06 [allreduce_fusion]: 4.17e-06 [matmul_add_comm_reduction]: 1.374e-05 [allreduce_slice_to_reducescatter]: 1.38002e-06 [virtual_shard_identity]: 1.049e-05 [virtual_dataset]: 8.55001e-06 [get_grad_eliminate_]: 8.23999e-06 [virtual_output]: 8.15e-06 [merge_forward]: 5.37001e-06 [cell_reuse_recompute_pass]: 3.76001e-06 [offload_activation]: 1.446e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.833e-05 [merge_recompute_call_nodes]: 1.87001e-06 [before_grad]: 1.397e-05 [set_forward_comm_id_for_comm_node_pass]: 5.09e-06 [meta_fg_expand]: 3.06001e-06 [flash_sp_send_recv_attached]: 2.04e-06 [receive_attached]: 3.13e-06 [after_resolve]: 1.625e-05 [a_after_grad]: 1.323e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 3.03e-06 [auto_monad_grad]: 1.55999e-06 [auto_monad_eliminator]: 1.21e-05 [cse]: 2.439e-05 [a_3]: 6.309e-05 [py_interpret_to_execute_after_opt_a]: 2.63e-05 [slice_cell_reuse_recomputed_activation]: 4.92e-06 [rewriter_after_opt_a]: 5.896e-05 [convert_after_rewriter]: 1.343e-05 [order_py_execute_after_rewriter]: 1.011e-05 [mutable_eliminate]: 0.0008512 [opt_b]: 0.00040423, [1] [Cycle 1]: 0.00038974, [7] [b_1]: 0.00024554 [b_2]: 1.18e-05 [updatestate_depend_eliminate]: 1.224e-05 [updatestate_assign_eliminate]: 4.18999e-06 [updatestate_loads_eliminate]: 3.51001e-06 [renormalize]: 1.07e-06 [cse]: 3.475e-05 [optimize_parallel_all_gather_comm]: 2.891e-05 [overlap_param_gather]: 6.26e-06 [cconv]: 4.371e-05 [loop_unroll]: 0.00059279 [opt_after_cconv]: 0.00017819, [1] [Cycle 1]: 0.00016723, [7] [c_1]: 4.649e-05 [parameter_eliminate]: 5.24998e-06 [updatestate_depend_eliminate]: 8.30999e-06 [updatestate_assign_eliminate]: 3.61001e-06 [updatestate_loads_eliminate]: 3.41001e-06 [cse]: 2.683e-05 [renormalize]: 9.39996e-07 [remove_dup_value]: 2.505e-05 [tuple_transform]: 0.00012792, [1] [Cycle 1]: 0.00011908, [4] [d_1]: 7.033e-05 [none_parameter_eliminate]: 2.16e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.61002e-06 [partial_unused_args_eliminate]: 5.42001e-06 [add_recomputation]: 7.855e-05 [cse_after_recomputation]: 3.701e-05, [1] [Cycle 1]: 2.876e-05, [1] [cse]: 1.74e-05 [environ_conv]: 1.188e-05 [swap_dp_allreduce_reducescatter]: 1.069e-05 [bias_add_comm_swap]: 7.48e-06 [label_micro_interleaved_index]: 9.10999e-06 [label_fine_grained_interleaved_index]: 6.63e-06 [merge_cast_opt]: 4.48999e-06 [slice_recompute_activation]: 6.04999e-06 [micro_interleaved_order_control]: 5.97001e-06 [assign_add_opt]: 4.52998e-06 [ForceFp32Comm]: 3.77998e-06 [remove_cast_before_assign_add]: 4.38001e-06 [full_micro_interleaved_order_control]: 5.50001e-06 [reorder_send_recv_between_fp_bp]: 6.55002e-06 [comm_op_add_attrs]: 5.12999e-06 [add_comm_op_reuse_tag]: 5.16002e-06 [interleave_split_concat_branches]: 4.33999e-06 [interleave_parallel_branches]: 4.55001e-06 [overlap_opt_shard_in_pipeline]: 4.88001e-06 [overlap_opt_shard_grad_in_pipeline]: 5.61998e-06 [control_data_broadcast_order]: 2.235e-05 [grouped_pairwise_exchange_alltoall]: 5.47001e-06 [offloading_packed_experts]: 9.18002e-06 [overlap_recompute_and_grad_model_parallel]: 9.43002e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.95999e-06 [overlap_recompute_allgather_and_fa_grad]: 5.69999e-06 [overlap_recompute_comm]: 6.16e-06 [overlap_grad_ring_attention]: 9.19e-06 [overlap_grad_flash_sp]: 3.357e-05 [begin_end_overlap_inline]: 4.08999e-06 [split_matmul_comm_elemetwise]: 5.54e-06 [split_layernorm_comm]: 5.21998e-06 [handle_group_info]: 4.49998e-06 [symbol_engine_optimizer]: 0.00013492, [1] [Cycle 1]: 0.00012411, [6] [build]: 4.97999e-06 [elim_shapecalc]: 1.488e-05 [elim_not_effective]: 1.933e-05 [opt_reshape]: 1.015e-05 [fold_const_symbol]: 1.401e-05 [renormalize]: 8.00006e-07 [detach_backward]: 7.98001e-06 [pipeline_parallel_scheduler]: 2.37999e-06 [auto_monad_reorder]: 2.918e-05 [get_jit_bprop_graph]: 2.71e-06 [rewriter_after_jit_bprop_graph]: 7.94997e-06 [opt_after_jit_grad]: 0.00077281 [validate]: 6.324e-05 Sums bootstrap : 0.000458s : 0.27% type_inference : 0.024177s : 14.42% event_method : 0.000027s : 0.02% auto_monad : 0.000092s : 0.05% graph_reusing : 0.000008s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000029s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.00% parallel-infer-symbol : 0.000005s : 0.00% pre_auto_parallel : 0.000049s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000044s : 0.03% optimize.rewriter_before_opt_a : 0.000116s : 0.07% optimize.opt_a.expand_dump_flag : 0.000007s : 0.00% optimize.opt_a.switch_simplify : 0.000061s : 0.04% optimize.opt_a.loop_unroll : 0.000046s : 0.03% optimize.opt_a.a_1 : 0.001135s : 0.68% optimize.opt_a.with_stream_mark : 0.000043s : 0.03% optimize.opt_a.recompute_prepare : 0.000023s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000012s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000295s : 0.18% optimize.opt_a.accelerated_algorithm : 0.000040s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000019s : 0.01% optimize.opt_a.merge_send_recv : 0.000021s : 0.01% optimize.opt_a.auto_parallel : 0.000021s : 0.01% optimize.opt_a.parallel : 0.000032s : 0.02% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000010s : 0.01% optimize.opt_a.allreduce_fusion : 0.000009s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000027s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.01% optimize.opt_a.virtual_dataset : 0.000019s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.01% optimize.opt_a.virtual_output : 0.000018s : 0.01% optimize.opt_a.merge_forward : 0.000011s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% optimize.opt_a.offload_activation : 0.000028s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000030s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.01% optimize.opt_a.meta_fg_expand : 0.000008s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000006s : 0.00% optimize.opt_a.after_resolve : 0.000033s : 0.02% optimize.opt_a.a_after_grad : 0.000028s : 0.02% optimize.opt_a.renormalize : 0.136744s : 81.57% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000041s : 0.02% optimize.opt_a.cse : 0.000069s : 0.04% optimize.opt_a.a_3 : 0.000162s : 0.10% optimize.py_interpret_to_execute_after_opt_a : 0.000026s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000059s : 0.04% optimize.convert_after_rewriter : 0.000013s : 0.01% optimize.order_py_execute_after_rewriter : 0.000010s : 0.01% optimize.mutable_eliminate : 0.000851s : 0.51% optimize.opt_b.b_1 : 0.000246s : 0.15% optimize.opt_b.b_2 : 0.000012s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000035s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000029s : 0.02% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.000044s : 0.03% optimize.loop_unroll : 0.000593s : 0.35% optimize.opt_after_cconv.c_1 : 0.000046s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000027s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000025s : 0.01% optimize.tuple_transform.d_1 : 0.000070s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000079s : 0.05% optimize.cse_after_recomputation.cse : 0.000017s : 0.01% optimize.environ_conv : 0.000012s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000011s : 0.01% optimize.bias_add_comm_swap : 0.000007s : 0.00% optimize.label_micro_interleaved_index : 0.000009s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000007s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000006s : 0.00% optimize.micro_interleaved_order_control : 0.000006s : 0.00% optimize.assign_add_opt : 0.000005s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000006s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000007s : 0.00% optimize.comm_op_add_attrs : 0.000005s : 0.00% optimize.add_comm_op_reuse_tag : 0.000005s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000005s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000005s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000006s : 0.00% optimize.control_data_broadcast_order : 0.000022s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.00% optimize.offloading_packed_experts : 0.000009s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000005s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000006s : 0.00% optimize.overlap_recompute_comm : 0.000006s : 0.00% optimize.overlap_grad_ring_attention : 0.000009s : 0.01% optimize.overlap_grad_flash_sp : 0.000034s : 0.02% optimize.begin_end_overlap_inline : 0.000004s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000006s : 0.00% optimize.split_layernorm_comm : 0.000005s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000008s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000029s : 0.02% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000008s : 0.00% opt_after_jit_grad : 0.000773s : 0.46% validate : 0.000063s : 0.04% Time group info: ------[substitution.] 0.000332 45 12.28% : 0.000041s : 5: substitution.arithmetic_simplify 9.83% : 0.000033s : 3: substitution.cast_eliminate 0.77% : 0.000003s : 3: substitution.elim_not_effective 0.55% : 0.000002s : 3: substitution.fold_const_symbol 2.61% : 0.000009s : 5: substitution.graph_param_transform 57.47% : 0.000191s : 4: substitution.inline 2.01% : 0.000007s : 6: substitution.j_node_and_user_rematch 4.58% : 0.000015s : 2: substitution.less_batch_normalization 2.45% : 0.000008s : 6: substitution.remove_not_recompute_node 2.07% : 0.000007s : 4: substitution.replace_old_param 5.40% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.024094 2 94.93% : 0.022872s : 1: type_inference.infer 5.07% : 0.001222s : 1: type_inference.specialize ------[replace.] 0.000072 8 59.95% : 0.000043s : 4: replace.inline 40.05% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000204 8 92.27% : 0.000188s : 4: match.inline 7.73% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000297 1596 0.94% : 0.000003s : 17: predicate.accumulaten_eliminater 0.91% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.51% : 0.000002s : 10: predicate.addn_check_dump 1.01% : 0.000003s : 17: predicate.addn_zero_filter 0.85% : 0.000003s : 17: predicate.adjust_all_reduce_mul_add 2.54% : 0.000008s : 27: predicate.arithmetic_simplify 0.99% : 0.000003s : 17: predicate.cast_eliminate 0.54% : 0.000002s : 10: predicate.check_bprop_eliminate 0.59% : 0.000002s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.57% : 0.000002s : 10: predicate.depend_value_elim 0.94% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.03% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.99% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.21% : 0.000004s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.50% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 22: predicate.environ_get_depend_swap 1.66% : 0.000005s : 32: predicate.environ_get_eliminate 1.10% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.32% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.47% : 0.000007s : 25: predicate.float_depend_g_call 0.52% : 0.000002s : 10: predicate.float_environ_get_switch 0.81% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.66% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.53% : 0.000002s : 10: predicate.incorporate_call 0.47% : 0.000001s : 10: predicate.incorporate_call_switch 6.61% : 0.000020s : 72: predicate.inline 0.74% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.97% : 0.000003s : 10: predicate.less_batch_normalization 1.74% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.39% : 0.000007s : 48: predicate.load_eliminater 0.87% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.08% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.64% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.64% : 0.000002s : 10: predicate.merge_addn 0.54% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.53% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 17: predicate.minmaximum_grad 1.36% : 0.000004s : 5: predicate.mutable_eliminate 0.44% : 0.000001s : 5: predicate.opt_reshape 0.36% : 0.000001s : 5: predicate.parallel_virtual_node 1.88% : 0.000006s : 25: predicate.partial_defer_inline 1.55% : 0.000005s : 26: predicate.partial_eliminate 0.85% : 0.000003s : 17: predicate.print_const_string_wrapper 0.56% : 0.000002s : 10: predicate.reduce_all_const_elim 1.21% : 0.000004s : 17: predicate.reduce_eliminate 2.46% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.32% : 0.000001s : 10: predicate.remove_not_recompute_node 1.33% : 0.000004s : 31: predicate.replace_applicator 0.61% : 0.000002s : 10: predicate.replace_old_param 0.35% : 0.000001s : 5: predicate.reset_defer_inline 1.03% : 0.000003s : 17: predicate.reshape_eliminate 0.57% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 5: predicate.row_tensor_eliminate 0.87% : 0.000003s : 10: predicate.same_eliminate 0.40% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.89% : 0.000003s : 10: predicate.shard_identity_eliminate 0.66% : 0.000002s : 10: predicate.special_op_eliminate 0.67% : 0.000002s : 10: predicate.specialize_transform 1.03% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.47% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.44% : 0.000004s : 25: predicate.switch_defer_inline 1.89% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.75% : 0.000014s : 76: predicate.switch_simplify 0.89% : 0.000003s : 17: predicate.tile_eliminate 1.07% : 0.000003s : 17: predicate.transpose_eliminate 1.50% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.21% : 0.000010s : 41: predicate.tuple_list_get_item_eliminator 1.33% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.69% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.38% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.34% : 0.000010s : 58: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 5: predicate.value_based_eliminate 0.65% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.66% : 0.000002s : 10: predicate.virtual_output_eliminate 0.24% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.57% : 0.000002s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001006 11 43.92% : 0.000442s : 5: func_graph_cloner_run.FuncGraphClonerGraph 56.08% : 0.000564s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.460674 192 0.00% : 0.000007s : 1: ForceFp32Comm 0.94% : 0.004333s : 1: add_attr 0.94% : 0.004308s : 1: add_attr_with_inline 0.00% : 0.000008s : 1: add_comm_op_reuse_tag 0.02% : 0.000082s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.02% : 0.000104s : 1: auto_monad 0.01% : 0.000037s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: begin_end_overlap_inline 0.00% : 0.000010s : 1: bias_add_comm_swap 0.11% : 0.000508s : 1: bootstrap 0.01% : 0.000048s : 1: cconv 0.00% : 0.000008s : 1: comm_op_add_attrs 0.01% : 0.000026s : 1: control_data_broadcast_order 0.00% : 0.000017s : 1: convert_after_rewriter 0.01% : 0.000040s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000037s : 1: detach_backward 0.00% : 0.000015s : 1: environ_conv 0.01% : 0.000042s : 1: event_method 0.00% : 0.000010s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.00% : 0.000016s : 1: graph_reusing 0.00% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000010s : 1: inline 0.00% : 0.000010s : 1: insert-virtual-dataset 0.00% : 0.000008s : 1: interleave_parallel_branches 0.00% : 0.000008s : 1: interleave_split_concat_branches 0.00% : 0.000010s : 1: label_fine_grained_interleaved_index 0.00% : 0.000012s : 1: label_micro_interleaved_index 0.13% : 0.000601s : 1: loop_unroll 0.00% : 0.000008s : 1: merge_cast_opt 0.00% : 0.000009s : 1: micro_interleaved_order_control 0.19% : 0.000860s : 1: mutable_eliminate 0.00% : 0.000012s : 1: offloading_packed_experts 0.00% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000024s : 1: opt.transform.mutable_eliminate 0.39% : 0.001775s : 78: opt.transform.opt_a 0.01% : 0.000045s : 1: opt.transform.opt_after_cconv 0.01% : 0.000042s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000162s : 28: opt.transform.opt_b 0.02% : 0.000077s : 2: opt.transform.opt_trans_graph 0.01% : 0.000055s : 4: opt.transform.symbol_engine_opt 30.40% : 0.140039s : 1: opt_a 0.04% : 0.000183s : 1: opt_after_cconv 0.17% : 0.000786s : 1: opt_after_jit_grad 0.09% : 0.000409s : 1: opt_b 31.24% : 0.143915s : 1: optimize 0.01% : 0.000033s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000014s : 1: order_py_execute_after_rewriter 0.01% : 0.000037s : 1: overlap_grad_flash_sp 0.00% : 0.000008s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000013s : 1: overlap_grad_ring_attention 0.00% : 0.000009s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000009s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000010s : 1: overlap_param_gather 0.00% : 0.000009s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000013s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000010s : 1: overlap_recompute_comm 0.00% : 0.000012s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000009s : 1: partial_unused_args_eliminate 0.00% : 0.000012s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.01% : 0.000058s : 1: pre_auto_parallel 0.01% : 0.000048s : 1: py_interpret_to_execute 0.01% : 0.000031s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.01% : 0.000029s : 1: remove_dup_value 29.54% : 0.136085s : 1: renormalize.infer 0.14% : 0.000638s : 1: renormalize.specialize 0.00% : 0.000010s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000015s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000074s : 1: rewriter_after_opt_a 0.03% : 0.000120s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000009s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000009s : 1: split_matmul_comm_elemetwise 0.00% : 0.000014s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000138s : 1: symbol_engine_optimizer 0.03% : 0.000131s : 1: tuple_transform 5.26% : 0.024243s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:20.278.521 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.178792, [21] [bootstrap]: 0.00048576 [type_inference]: 0.166669 [event_method]: 2.219e-05 [auto_monad]: 7.781e-05 [graph_reusing]: 6.48998e-06 [inline]: 3.31001e-06 [add_attr]: 0.00387429, [1] [add_attr_with_inline]: 0.00386215, [1] [Cycle 1]: 7.966e-05, [2] [tag_attr]: 2.449e-05 [meta_addattr_fg_expand]: 6.11e-06 [parallel-infer-symbol]: 3.73001e-06 [pre_auto_parallel]: 4.588e-05 [insert-virtual-dataset]: 2.58e-06 [parallel-infer-symbol-second]: 7.90023e-07 [dataset_repeat_opt]: 2.84999e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.00676566, [53] [py_interpret_to_execute]: 3.28e-05 [rewriter_before_opt_a]: 9.916e-05 [opt_a]: 0.00410375, [2] [Cycle 1]: 0.00322597, [45] [expand_dump_flag]: 3.03e-06 [switch_simplify]: 4.576e-05 [loop_unroll]: 3.116e-05 [a_1]: 0.00082924 [with_stream_mark]: 2.318e-05 [recompute_prepare]: 1.296e-05 [updatestate_depend_eliminate]: 5.09e-06 [updatestate_assign_eliminate]: 3.83001e-06 [updatestate_loads_eliminate]: 3.54002e-06 [parameter_eliminate]: 1.97999e-06 [a_2]: 0.0001133 [accelerated_algorithm]: 2.598e-05 [shard]: 2.03997e-06 [meta_shard_fg_expand]: 2.39001e-06 [shard_inline]: 8.64e-06 [merge_send_recv]: 9.67001e-06 [auto_parallel]: 7.9e-06 [parallel]: 2.258e-05 [flash_sp]: 9.76998e-06 [merge_comm]: 5.54e-06 [allreduce_fusion]: 4.18001e-06 [matmul_add_comm_reduction]: 1.374e-05 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 1.308e-05 [virtual_dataset]: 8.2e-06 [get_grad_eliminate_]: 7.73999e-06 [virtual_output]: 9.36e-06 [merge_forward]: 5.57001e-06 [cell_reuse_recompute_pass]: 1.79998e-06 [offload_activation]: 1.283e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.572e-05 [merge_recompute_call_nodes]: 1.81003e-06 [before_grad]: 1.568e-05 [set_forward_comm_id_for_comm_node_pass]: 4.59998e-06 [meta_fg_expand]: 3.58999e-06 [flash_sp_send_recv_attached]: 5.87999e-06 [receive_attached]: 2.71e-06 [after_resolve]: 1.483e-05 [a_after_grad]: 1.355e-05 [renormalize]: 0.00146053 [add_forward_monad_depend]: 9.79e-06 [auto_monad_grad]: 2.85002e-06 [auto_monad_eliminator]: 2.484e-05 [cse]: 4.066e-05 [a_3]: 7.249e-05 [Cycle 2]: 0.00086469, [45] [expand_dump_flag]: 2.29001e-06 [switch_simplify]: 1.09e-05 [loop_unroll]: 7.78001e-06 [a_1]: 0.00020871 [with_stream_mark]: 1.611e-05 [recompute_prepare]: 8.25999e-06 [updatestate_depend_eliminate]: 5.01002e-06 [updatestate_assign_eliminate]: 3.52997e-06 [updatestate_loads_eliminate]: 3.23e-06 [parameter_eliminate]: 2.36e-06 [a_2]: 0.00010131 [accelerated_algorithm]: 1.306e-05 [shard]: 1.46998e-06 [meta_shard_fg_expand]: 2.39001e-06 [shard_inline]: 7.68001e-06 [merge_send_recv]: 8.58001e-06 [auto_parallel]: 9.51998e-06 [parallel]: 1.006e-05 [flash_sp]: 5.19e-06 [merge_comm]: 4.31002e-06 [allreduce_fusion]: 4.23999e-06 [matmul_add_comm_reduction]: 1.188e-05 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 9.49999e-06 [virtual_dataset]: 7.56001e-06 [get_grad_eliminate_]: 7.01001e-06 [virtual_output]: 6.94999e-06 [merge_forward]: 4.57e-06 [cell_reuse_recompute_pass]: 2.48e-06 [offload_activation]: 1.118e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.607e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.226e-05 [set_forward_comm_id_for_comm_node_pass]: 5.80002e-06 [meta_fg_expand]: 3.2e-06 [flash_sp_send_recv_attached]: 1.61998e-06 [receive_attached]: 2.36e-06 [after_resolve]: 1.394e-05 [a_after_grad]: 1.109e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.69e-06 [auto_monad_grad]: 2.09e-06 [auto_monad_eliminator]: 1.096e-05 [cse]: 2.268e-05 [a_3]: 4.735e-05 [py_interpret_to_execute_after_opt_a]: 1.686e-05 [slice_cell_reuse_recomputed_activation]: 2.11e-06 [rewriter_after_opt_a]: 4.644e-05 [convert_after_rewriter]: 8.68001e-06 [order_py_execute_after_rewriter]: 6.44001e-06 [mutable_eliminate]: 0.00076658 [opt_b]: 0.00028313, [1] [Cycle 1]: 0.00027461, [7] [b_1]: 0.00018114 [b_2]: 1.051e-05 [updatestate_depend_eliminate]: 8.07998e-06 [updatestate_assign_eliminate]: 3.58e-06 [updatestate_loads_eliminate]: 3.51999e-06 [renormalize]: 8.60018e-07 [cse]: 2.99e-05 [optimize_parallel_all_gather_comm]: 2.051e-05 [overlap_param_gather]: 2.51e-06 [cconv]: 3.265e-05 [loop_unroll]: 0.00049274 [opt_after_cconv]: 0.00013079, [1] [Cycle 1]: 0.00012396, [7] [c_1]: 4.355e-05 [parameter_eliminate]: 4.37e-06 [updatestate_depend_eliminate]: 7.22002e-06 [updatestate_assign_eliminate]: 3.55e-06 [updatestate_loads_eliminate]: 3.21001e-06 [cse]: 2.606e-05 [renormalize]: 5.39992e-07 [remove_dup_value]: 1.831e-05 [tuple_transform]: 0.00014852, [1] [Cycle 1]: 0.000144, [4] [d_1]: 0.00011354 [none_parameter_eliminate]: 2.09e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.64998e-06 [partial_unused_args_eliminate]: 2.28998e-06 [add_recomputation]: 6.339e-05 [cse_after_recomputation]: 3.048e-05, [1] [Cycle 1]: 2.45e-05, [1] [cse]: 1.788e-05 [environ_conv]: 8.45999e-06 [swap_dp_allreduce_reducescatter]: 6.21e-06 [bias_add_comm_swap]: 3.17002e-06 [label_micro_interleaved_index]: 5.69e-06 [label_fine_grained_interleaved_index]: 3.05002e-06 [merge_cast_opt]: 1.45999e-06 [slice_recompute_activation]: 2.08998e-06 [micro_interleaved_order_control]: 2.69999e-06 [assign_add_opt]: 1.60001e-06 [ForceFp32Comm]: 9.49978e-07 [remove_cast_before_assign_add]: 1.50001e-06 [full_micro_interleaved_order_control]: 2.46998e-06 [reorder_send_recv_between_fp_bp]: 2.87002e-06 [comm_op_add_attrs]: 9.99979e-07 [add_comm_op_reuse_tag]: 1.54e-06 [interleave_split_concat_branches]: 1.27e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.42e-06 [overlap_opt_shard_grad_in_pipeline]: 1.92999e-06 [control_data_broadcast_order]: 1.678e-05 [grouped_pairwise_exchange_alltoall]: 1.68002e-06 [offloading_packed_experts]: 5.12e-06 [overlap_recompute_and_grad_model_parallel]: 5.68002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.26002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.45999e-06 [overlap_recompute_comm]: 2.74001e-06 [overlap_grad_ring_attention]: 5.28002e-06 [overlap_grad_flash_sp]: 2.46e-05 [begin_end_overlap_inline]: 6.19999e-07 [split_matmul_comm_elemetwise]: 2.14e-06 [split_layernorm_comm]: 1.72999e-06 [handle_group_info]: 9.70002e-07 [symbol_engine_optimizer]: 9.95e-05, [1] [Cycle 1]: 9.411e-05, [6] [build]: 4.42e-06 [elim_shapecalc]: 1.577e-05 [elim_not_effective]: 1.723e-05 [opt_reshape]: 9.42999e-06 [fold_const_symbol]: 1.422e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.26e-06 [pipeline_parallel_scheduler]: 1.55001e-06 [auto_monad_reorder]: 2.312e-05 [get_jit_bprop_graph]: 2.37001e-06 [rewriter_after_jit_bprop_graph]: 6.81001e-06 [opt_after_jit_grad]: 0.00056048 [validate]: 5.264e-05 Sums bootstrap : 0.000486s : 0.28% type_inference : 0.166669s : 95.87% event_method : 0.000022s : 0.01% auto_monad : 0.000078s : 0.04% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000046s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.02% optimize.rewriter_before_opt_a : 0.000099s : 0.06% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000057s : 0.03% optimize.opt_a.loop_unroll : 0.000039s : 0.02% optimize.opt_a.a_1 : 0.001038s : 0.60% optimize.opt_a.with_stream_mark : 0.000039s : 0.02% optimize.opt_a.recompute_prepare : 0.000021s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000215s : 0.12% optimize.opt_a.accelerated_algorithm : 0.000039s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000016s : 0.01% optimize.opt_a.merge_send_recv : 0.000018s : 0.01% optimize.opt_a.auto_parallel : 0.000017s : 0.01% optimize.opt_a.parallel : 0.000033s : 0.02% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000010s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.01% optimize.opt_a.virtual_dataset : 0.000016s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.01% optimize.opt_a.virtual_output : 0.000016s : 0.01% optimize.opt_a.merge_forward : 0.000010s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000024s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000028s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000029s : 0.02% optimize.opt_a.a_after_grad : 0.000025s : 0.01% optimize.opt_a.renormalize : 0.001461s : 0.84% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000036s : 0.02% optimize.opt_a.cse : 0.000063s : 0.04% optimize.opt_a.a_3 : 0.000120s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000046s : 0.03% optimize.convert_after_rewriter : 0.000009s : 0.00% optimize.order_py_execute_after_rewriter : 0.000006s : 0.00% optimize.mutable_eliminate : 0.000767s : 0.44% optimize.opt_b.b_1 : 0.000181s : 0.10% optimize.opt_b.b_2 : 0.000011s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.01% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000033s : 0.02% optimize.loop_unroll : 0.000493s : 0.28% optimize.opt_after_cconv.c_1 : 0.000044s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000026s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.01% optimize.tuple_transform.d_1 : 0.000114s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000063s : 0.04% optimize.cse_after_recomputation.cse : 0.000018s : 0.01% optimize.environ_conv : 0.000008s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000002s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000002s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000025s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000023s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000560s : 0.32% validate : 0.000053s : 0.03% Time group info: ------[substitution.] 0.000308 45 12.56% : 0.000039s : 5: substitution.arithmetic_simplify 9.50% : 0.000029s : 3: substitution.cast_eliminate 0.78% : 0.000002s : 3: substitution.elim_not_effective 0.88% : 0.000003s : 3: substitution.fold_const_symbol 2.67% : 0.000008s : 5: substitution.graph_param_transform 56.93% : 0.000175s : 4: substitution.inline 2.23% : 0.000007s : 6: substitution.j_node_and_user_rematch 5.05% : 0.000016s : 2: substitution.less_batch_normalization 2.13% : 0.000007s : 6: substitution.remove_not_recompute_node 2.15% : 0.000007s : 4: substitution.replace_old_param 5.13% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.166592 2 99.45% : 0.165673s : 1: type_inference.infer 0.55% : 0.000919s : 1: type_inference.specialize ------[replace.] 0.000071 8 59.18% : 0.000042s : 4: replace.inline 40.82% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000186 8 92.52% : 0.000172s : 4: match.inline 7.48% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000270 1596 0.89% : 0.000002s : 17: predicate.accumulaten_eliminater 0.89% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 10: predicate.addn_check_dump 0.92% : 0.000002s : 17: predicate.addn_zero_filter 0.83% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.65% : 0.000007s : 27: predicate.arithmetic_simplify 0.90% : 0.000002s : 17: predicate.cast_eliminate 0.59% : 0.000002s : 10: predicate.check_bprop_eliminate 0.75% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.69% : 0.000002s : 10: predicate.depend_value_elim 0.91% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.92% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.10% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 5: predicate.elim_not_effective 0.43% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.07% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_depend_swap 1.63% : 0.000004s : 32: predicate.environ_get_eliminate 1.26% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.49% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.82% : 0.000008s : 25: predicate.float_depend_g_call 0.51% : 0.000001s : 10: predicate.float_environ_get_switch 0.73% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.78% : 0.000002s : 10: predicate.get_grad_eliminate 0.25% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000002s : 10: predicate.incorporate_call 0.49% : 0.000001s : 10: predicate.incorporate_call_switch 6.15% : 0.000017s : 72: predicate.inline 0.76% : 0.000002s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.90% : 0.000002s : 10: predicate.less_batch_normalization 1.92% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.48% : 0.000007s : 48: predicate.load_eliminater 0.90% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.92% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.72% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 10: predicate.merge_addn 0.53% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 17: predicate.minmaximum_grad 1.03% : 0.000003s : 5: predicate.mutable_eliminate 0.40% : 0.000001s : 5: predicate.opt_reshape 0.38% : 0.000001s : 5: predicate.parallel_virtual_node 1.70% : 0.000005s : 25: predicate.partial_defer_inline 1.60% : 0.000004s : 26: predicate.partial_eliminate 0.92% : 0.000002s : 17: predicate.print_const_string_wrapper 0.61% : 0.000002s : 10: predicate.reduce_all_const_elim 1.31% : 0.000004s : 17: predicate.reduce_eliminate 2.46% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 10: predicate.remove_not_recompute_node 1.43% : 0.000004s : 31: predicate.replace_applicator 0.55% : 0.000001s : 10: predicate.replace_old_param 0.34% : 0.000001s : 5: predicate.reset_defer_inline 1.02% : 0.000003s : 17: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 5: predicate.row_tensor_eliminate 0.97% : 0.000003s : 10: predicate.same_eliminate 0.49% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.64% : 0.000002s : 10: predicate.shard_identity_eliminate 0.72% : 0.000002s : 10: predicate.special_op_eliminate 0.63% : 0.000002s : 10: predicate.specialize_transform 1.33% : 0.000004s : 10: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.50% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.46% : 0.000004s : 25: predicate.switch_defer_inline 2.00% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.58% : 0.000012s : 76: predicate.switch_simplify 0.93% : 0.000003s : 17: predicate.tile_eliminate 0.96% : 0.000003s : 17: predicate.transpose_eliminate 1.63% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.38% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.10% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.52% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.69% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.48% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.00% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.65% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.66% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001173 11 38.77% : 0.000455s : 5: func_graph_cloner_run.FuncGraphClonerGraph 61.23% : 0.000718s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.192801 192 0.00% : 0.000005s : 1: ForceFp32Comm 2.01% : 0.003880s : 1: add_attr 2.01% : 0.003866s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.04% : 0.000068s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000084s : 1: auto_monad 0.01% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.27% : 0.000521s : 1: bootstrap 0.02% : 0.000036s : 1: cconv 0.00% : 0.000005s : 1: comm_op_add_attrs 0.01% : 0.000020s : 1: control_data_broadcast_order 0.01% : 0.000012s : 1: convert_after_rewriter 0.02% : 0.000033s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.02% : 0.000030s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.26% : 0.000502s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.40% : 0.000777s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000023s : 1: opt.transform.mutable_eliminate 0.84% : 0.001622s : 78: opt.transform.opt_a 0.02% : 0.000042s : 1: opt.transform.opt_after_cconv 0.02% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.08% : 0.000157s : 28: opt.transform.opt_b 0.06% : 0.000120s : 2: opt.transform.opt_trans_graph 0.03% : 0.000053s : 4: opt.transform.symbol_engine_opt 2.13% : 0.004107s : 1: opt_a 0.07% : 0.000134s : 1: opt_after_cconv 0.30% : 0.000571s : 1: opt_after_jit_grad 0.15% : 0.000287s : 1: opt_b 3.51% : 0.006773s : 1: optimize 0.01% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000010s : 1: order_py_execute_after_rewriter 0.01% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000007s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.03% : 0.000050s : 1: pre_auto_parallel 0.02% : 0.000037s : 1: py_interpret_to_execute 0.01% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000022s : 1: remove_dup_value 0.32% : 0.000613s : 1: renormalize.infer 0.43% : 0.000835s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000051s : 1: rewriter_after_opt_a 0.05% : 0.000103s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000102s : 1: symbol_engine_optimizer 0.08% : 0.000151s : 1: tuple_transform 86.46% : 0.166694s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:22.771.502 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:22.771.784 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.23372, [21] [bootstrap]: 0.190531 [type_inference]: 0.03053 [event_method]: 2.369e-05 [auto_monad]: 8.132e-05 [graph_reusing]: 6.40002e-06 [inline]: 2.71e-06 [add_attr]: 0.00405839, [1] [add_attr_with_inline]: 0.00404669, [1] [Cycle 1]: 9.962e-05, [2] [tag_attr]: 2.654e-05 [meta_addattr_fg_expand]: 6.49999e-06 [parallel-infer-symbol]: 3.76999e-06 [pre_auto_parallel]: 3.962e-05 [insert-virtual-dataset]: 2.41e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 2.22001e-06 [pipeline_split]: 1.75001e-06 [optimize]: 0.00671962, [53] [py_interpret_to_execute]: 3.713e-05 [rewriter_before_opt_a]: 9.824e-05 [opt_a]: 0.00357273, [2] [Cycle 1]: 0.00261662, [45] [expand_dump_flag]: 3.23998e-06 [switch_simplify]: 4.498e-05 [loop_unroll]: 3.116e-05 [a_1]: 0.00070101 [with_stream_mark]: 2.011e-05 [recompute_prepare]: 1.071e-05 [updatestate_depend_eliminate]: 4.53999e-06 [updatestate_assign_eliminate]: 3.16001e-06 [updatestate_loads_eliminate]: 3.23998e-06 [parameter_eliminate]: 2.20002e-06 [a_2]: 0.00011954 [accelerated_algorithm]: 2.156e-05 [shard]: 2.24999e-06 [meta_shard_fg_expand]: 2.14e-06 [shard_inline]: 7.18e-06 [merge_send_recv]: 9.12999e-06 [auto_parallel]: 7.46001e-06 [parallel]: 2.244e-05 [flash_sp]: 7.7e-06 [merge_comm]: 4.27e-06 [allreduce_fusion]: 3.44001e-06 [matmul_add_comm_reduction]: 1.214e-05 [allreduce_slice_to_reducescatter]: 1.00999e-06 [virtual_shard_identity]: 9.09998e-06 [virtual_dataset]: 7.32002e-06 [get_grad_eliminate_]: 6.49001e-06 [virtual_output]: 7.08e-06 [merge_forward]: 5.24998e-06 [cell_reuse_recompute_pass]: 1.20999e-06 [offload_activation]: 1.069e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.695e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 1.142e-05 [set_forward_comm_id_for_comm_node_pass]: 4.03999e-06 [meta_fg_expand]: 3.26999e-06 [flash_sp_send_recv_attached]: 4.89e-06 [receive_attached]: 2.56e-06 [after_resolve]: 1.339e-05 [a_after_grad]: 1.035e-05 [renormalize]: 0.00092104 [add_forward_monad_depend]: 7.08e-06 [auto_monad_grad]: 2.64999e-06 [auto_monad_eliminator]: 1.785e-05 [cse]: 3.064e-05 [a_3]: 6.848e-05 [Cycle 2]: 0.00093834, [45] [expand_dump_flag]: 2.09999e-06 [switch_simplify]: 9.19e-06 [loop_unroll]: 6.79999e-06 [a_1]: 0.00015858 [with_stream_mark]: 1.427e-05 [recompute_prepare]: 7.03e-06 [updatestate_depend_eliminate]: 3.73999e-06 [updatestate_assign_eliminate]: 3.30998e-06 [updatestate_loads_eliminate]: 3.08998e-06 [parameter_eliminate]: 1.47001e-06 [a_2]: 0.00011302 [accelerated_algorithm]: 1.324e-05 [shard]: 1.79e-06 [meta_shard_fg_expand]: 2.35002e-06 [shard_inline]: 6.72002e-06 [merge_send_recv]: 6.17999e-06 [auto_parallel]: 8.28999e-06 [parallel]: 7.77e-06 [flash_sp]: 3.85998e-06 [merge_comm]: 3.45e-06 [allreduce_fusion]: 3.21999e-06 [matmul_add_comm_reduction]: 8.86002e-06 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 7.13e-06 [virtual_dataset]: 6.97002e-06 [get_grad_eliminate_]: 6.89999e-06 [virtual_output]: 6.23e-06 [merge_forward]: 3.41999e-06 [cell_reuse_recompute_pass]: 2.94999e-06 [offload_activation]: 9.77001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.386e-05 [merge_recompute_call_nodes]: 9.39996e-07 [before_grad]: 1.021e-05 [set_forward_comm_id_for_comm_node_pass]: 4.62998e-06 [meta_fg_expand]: 2.61e-06 [flash_sp_send_recv_attached]: 1.16002e-06 [receive_attached]: 1.86003e-06 [after_resolve]: 1.238e-05 [a_after_grad]: 1.052e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.25001e-06 [auto_monad_grad]: 1.49e-06 [auto_monad_eliminator]: 1.136e-05 [cse]: 1.984e-05 [a_3]: 5.383e-05 [py_interpret_to_execute_after_opt_a]: 1.863e-05 [slice_cell_reuse_recomputed_activation]: 4.77998e-06 [rewriter_after_opt_a]: 4.601e-05 [convert_after_rewriter]: 9.84999e-06 [order_py_execute_after_rewriter]: 9.10999e-06 [mutable_eliminate]: 0.00083617 [opt_b]: 0.00036751, [1] [Cycle 1]: 0.00035386, [7] [b_1]: 0.00021838 [b_2]: 1.07e-05 [updatestate_depend_eliminate]: 1.073e-05 [updatestate_assign_eliminate]: 3.36001e-06 [updatestate_loads_eliminate]: 3.18e-06 [renormalize]: 3.89991e-07 [cse]: 2.701e-05 [optimize_parallel_all_gather_comm]: 2.538e-05 [overlap_param_gather]: 5.54e-06 [cconv]: 3.784e-05 [loop_unroll]: 0.00068873 [opt_after_cconv]: 0.00015552, [1] [Cycle 1]: 0.00014462, [7] [c_1]: 3.945e-05 [parameter_eliminate]: 5.40001e-06 [updatestate_depend_eliminate]: 7.13e-06 [updatestate_assign_eliminate]: 2.90002e-06 [updatestate_loads_eliminate]: 3.96001e-06 [cse]: 2.5e-05 [renormalize]: 9.49978e-07 [remove_dup_value]: 2.006e-05 [tuple_transform]: 0.00011402, [1] [Cycle 1]: 0.00010547, [4] [d_1]: 6.264e-05 [none_parameter_eliminate]: 1.86e-06 [renormalize]: 5.59987e-07 [switch_simplify]: 7.93001e-06 [partial_unused_args_eliminate]: 5.42001e-06 [add_recomputation]: 6.143e-05 [cse_after_recomputation]: 3.116e-05, [1] [Cycle 1]: 2.384e-05, [1] [cse]: 1.41e-05 [environ_conv]: 1.123e-05 [swap_dp_allreduce_reducescatter]: 8.79003e-06 [bias_add_comm_swap]: 6.09001e-06 [label_micro_interleaved_index]: 9.17999e-06 [label_fine_grained_interleaved_index]: 5.14e-06 [merge_cast_opt]: 4.73001e-06 [slice_recompute_activation]: 6.46999e-06 [micro_interleaved_order_control]: 5.05001e-06 [assign_add_opt]: 4.03999e-06 [ForceFp32Comm]: 3.81999e-06 [remove_cast_before_assign_add]: 3.98001e-06 [full_micro_interleaved_order_control]: 5.52001e-06 [reorder_send_recv_between_fp_bp]: 6.69001e-06 [comm_op_add_attrs]: 1.071e-05 [add_comm_op_reuse_tag]: 4.42003e-06 [interleave_split_concat_branches]: 3.8e-06 [interleave_parallel_branches]: 3.71001e-06 [overlap_opt_shard_in_pipeline]: 4.75001e-06 [overlap_opt_shard_grad_in_pipeline]: 5.14e-06 [control_data_broadcast_order]: 1.847e-05 [grouped_pairwise_exchange_alltoall]: 4.68001e-06 [offloading_packed_experts]: 8.37e-06 [overlap_recompute_and_grad_model_parallel]: 8.50999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.73001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.96001e-06 [overlap_recompute_comm]: 5.60001e-06 [overlap_grad_ring_attention]: 8.17998e-06 [overlap_grad_flash_sp]: 2.713e-05 [begin_end_overlap_inline]: 4.98001e-06 [split_matmul_comm_elemetwise]: 4.89e-06 [split_layernorm_comm]: 4.58999e-06 [handle_group_info]: 4.22998e-06 [symbol_engine_optimizer]: 0.00011499, [1] [Cycle 1]: 0.00010736, [6] [build]: 3.93001e-06 [elim_shapecalc]: 1.528e-05 [elim_not_effective]: 1.695e-05 [opt_reshape]: 8.3e-06 [fold_const_symbol]: 1.226e-05 [renormalize]: 2.00002e-07 [detach_backward]: 5.24998e-06 [pipeline_parallel_scheduler]: 2.09e-06 [auto_monad_reorder]: 2.577e-05 [get_jit_bprop_graph]: 2.09999e-06 [rewriter_after_jit_bprop_graph]: 8.94998e-06 [opt_after_jit_grad]: 0.00077807 [validate]: 5.224e-05 Sums bootstrap : 0.190531s : 83.73% type_inference : 0.030530s : 13.42% event_method : 0.000024s : 0.01% auto_monad : 0.000081s : 0.04% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000040s : 0.02% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000037s : 0.02% optimize.rewriter_before_opt_a : 0.000098s : 0.04% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000054s : 0.02% optimize.opt_a.loop_unroll : 0.000038s : 0.02% optimize.opt_a.a_1 : 0.000860s : 0.38% optimize.opt_a.with_stream_mark : 0.000034s : 0.02% optimize.opt_a.recompute_prepare : 0.000018s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000233s : 0.10% optimize.opt_a.accelerated_algorithm : 0.000035s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.01% optimize.opt_a.merge_send_recv : 0.000015s : 0.01% optimize.opt_a.auto_parallel : 0.000016s : 0.01% optimize.opt_a.parallel : 0.000030s : 0.01% optimize.opt_a.flash_sp : 0.000012s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.01% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000020s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000026s : 0.01% optimize.opt_a.a_after_grad : 0.000021s : 0.01% optimize.opt_a.renormalize : 0.000921s : 0.40% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.01% optimize.opt_a.cse : 0.000050s : 0.02% optimize.opt_a.a_3 : 0.000122s : 0.05% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000046s : 0.02% optimize.convert_after_rewriter : 0.000010s : 0.00% optimize.order_py_execute_after_rewriter : 0.000009s : 0.00% optimize.mutable_eliminate : 0.000836s : 0.37% optimize.opt_b.b_1 : 0.000218s : 0.10% optimize.opt_b.b_2 : 0.000011s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000027s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.01% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.000038s : 0.02% optimize.loop_unroll : 0.000689s : 0.30% optimize.opt_after_cconv.c_1 : 0.000039s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000025s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.01% optimize.tuple_transform.d_1 : 0.000063s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000001s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000061s : 0.03% optimize.cse_after_recomputation.cse : 0.000014s : 0.01% optimize.environ_conv : 0.000011s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.00% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000009s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000005s : 0.00% optimize.slice_recompute_activation : 0.000006s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000006s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000007s : 0.00% optimize.comm_op_add_attrs : 0.000011s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000005s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000006s : 0.00% optimize.overlap_grad_ring_attention : 0.000008s : 0.00% optimize.overlap_grad_flash_sp : 0.000027s : 0.01% optimize.begin_end_overlap_inline : 0.000005s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000005s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000026s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000009s : 0.00% opt_after_jit_grad : 0.000778s : 0.34% validate : 0.000052s : 0.02% Time group info: ------[substitution.] 0.000263 36 13.59% : 0.000036s : 6: substitution.arithmetic_simplify 0.80% : 0.000002s : 2: substitution.elim_not_effective 0.98% : 0.000003s : 2: substitution.fold_const_symbol 2.99% : 0.000008s : 4: substitution.graph_param_transform 64.35% : 0.000169s : 4: substitution.inline 1.68% : 0.000004s : 4: substitution.j_node_and_user_rematch 4.82% : 0.000013s : 2: substitution.less_batch_normalization 2.33% : 0.000006s : 4: substitution.remove_not_recompute_node 2.16% : 0.000006s : 4: substitution.replace_old_param 6.30% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.030451 2 18.93% : 0.005763s : 1: type_inference.infer 81.07% : 0.024688s : 1: type_inference.specialize ------[replace.] 0.000067 8 62.28% : 0.000042s : 4: replace.inline 37.72% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000181 8 91.93% : 0.000166s : 4: match.inline 8.07% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000230 1278 0.92% : 0.000002s : 13: predicate.accumulaten_eliminater 0.98% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 0.95% : 0.000002s : 13: predicate.addn_zero_filter 0.73% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.76% : 0.000006s : 21: predicate.arithmetic_simplify 1.12% : 0.000003s : 13: predicate.cast_eliminate 0.72% : 0.000002s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.77% : 0.000002s : 8: predicate.depend_value_elim 0.85% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 13: predicate.dict_get_item_eliminator 1.10% : 0.000003s : 13: predicate.dict_set_item_eliminator 1.16% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 4: predicate.elim_not_effective 0.46% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_depend_swap 1.62% : 0.000004s : 25: predicate.environ_get_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.29% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.52% : 0.000006s : 21: predicate.float_depend_g_call 0.49% : 0.000001s : 8: predicate.float_environ_get_switch 0.73% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.69% : 0.000002s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.50% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.23% : 0.000014s : 58: predicate.inline 0.63% : 0.000001s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.29% : 0.000003s : 8: predicate.less_batch_normalization 1.78% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.38% : 0.000005s : 38: predicate.load_eliminater 1.42% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.21% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.61% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 8: predicate.merge_addn 0.65% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.69% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.70% : 0.000002s : 13: predicate.minmaximum_grad 1.59% : 0.000004s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 1.63% : 0.000004s : 21: predicate.partial_defer_inline 1.50% : 0.000003s : 21: predicate.partial_eliminate 0.84% : 0.000002s : 13: predicate.print_const_string_wrapper 0.57% : 0.000001s : 8: predicate.reduce_all_const_elim 1.27% : 0.000003s : 13: predicate.reduce_eliminate 2.36% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 8: predicate.remove_not_recompute_node 1.34% : 0.000003s : 25: predicate.replace_applicator 0.40% : 0.000001s : 8: predicate.replace_old_param 0.34% : 0.000001s : 4: predicate.reset_defer_inline 0.82% : 0.000002s : 13: predicate.reshape_eliminate 0.69% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.53% : 0.000001s : 4: predicate.row_tensor_eliminate 0.92% : 0.000002s : 8: predicate.same_eliminate 0.48% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.70% : 0.000002s : 8: predicate.shard_identity_eliminate 0.84% : 0.000002s : 8: predicate.special_op_eliminate 0.66% : 0.000002s : 8: predicate.specialize_transform 0.99% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.37% : 0.000003s : 21: predicate.switch_defer_inline 1.98% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.77% : 0.000011s : 67: predicate.switch_simplify 0.86% : 0.000002s : 13: predicate.tile_eliminate 0.91% : 0.000002s : 13: predicate.transpose_eliminate 1.47% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.80% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.22% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.51% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.25% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.78% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.16% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.88% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 0.76% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 8: predicate.virtual_output_eliminate 0.23% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000679 11 49.33% : 0.000335s : 5: func_graph_cloner_run.FuncGraphClonerGraph 50.67% : 0.000344s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.246961 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.65% : 0.004070s : 1: add_attr 1.64% : 0.004051s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.03% : 0.000065s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.04% : 0.000091s : 1: auto_monad 0.01% : 0.000033s : 1: auto_monad_reorder 0.00% : 0.000008s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 77.17% : 0.190588s : 1: bootstrap 0.02% : 0.000041s : 1: cconv 0.01% : 0.000014s : 1: comm_op_add_attrs 0.01% : 0.000022s : 1: control_data_broadcast_order 0.01% : 0.000013s : 1: convert_after_rewriter 0.01% : 0.000034s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000029s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.01% : 0.000036s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000014s : 1: graph_reusing 0.00% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.00% : 0.000010s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000012s : 1: label_micro_interleaved_index 0.28% : 0.000698s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.34% : 0.000846s : 1: mutable_eliminate 0.00% : 0.000011s : 1: offloading_packed_experts 0.01% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000024s : 1: opt.transform.mutable_eliminate 0.54% : 0.001344s : 78: opt.transform.opt_a 0.02% : 0.000038s : 1: opt.transform.opt_after_cconv 0.01% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.05% : 0.000132s : 28: opt.transform.opt_b 0.03% : 0.000067s : 2: opt.transform.opt_trans_graph 0.02% : 0.000048s : 4: opt.transform.symbol_engine_opt 1.45% : 0.003577s : 1: opt_a 0.06% : 0.000159s : 1: opt_after_cconv 0.32% : 0.000793s : 1: opt_after_jit_grad 0.15% : 0.000372s : 1: opt_b 2.94% : 0.007250s : 1: optimize 0.01% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000012s : 1: order_py_execute_after_rewriter 0.01% : 0.000031s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000009s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000008s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000009s : 1: overlap_recompute_comm 0.00% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.02% : 0.000047s : 1: pre_auto_parallel 0.02% : 0.000041s : 1: py_interpret_to_execute 0.01% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000008s : 1: remove_cast_before_assign_add 0.01% : 0.000024s : 1: remove_dup_value 0.21% : 0.000507s : 1: renormalize.infer 0.16% : 0.000405s : 1: renormalize.specialize 0.00% : 0.000010s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000015s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000049s : 1: rewriter_after_opt_a 0.04% : 0.000102s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000009s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.00% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000118s : 1: symbol_engine_optimizer 0.05% : 0.000117s : 1: tuple_transform 12.39% : 0.030591s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:25.538.595 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.473213, [21] [bootstrap]: 0.00043538 [type_inference]: 0.292687 [event_method]: 2.065e-05 [auto_monad]: 7.556e-05 [graph_reusing]: 6.30002e-06 [inline]: 2.98e-06 [add_attr]: 0.00367081, [1] [add_attr_with_inline]: 0.00365953, [1] [Cycle 1]: 6.871e-05, [2] [tag_attr]: 2.353e-05 [meta_addattr_fg_expand]: 5.87999e-06 [parallel-infer-symbol]: 4.31002e-06 [pre_auto_parallel]: 3.967e-05 [insert-virtual-dataset]: 2.56998e-06 [parallel-infer-symbol-second]: 7.40023e-07 [dataset_repeat_opt]: 2.34001e-06 [pipeline_split]: 1.96e-06 [optimize]: 0.00532091, [53] [py_interpret_to_execute]: 2.866e-05 [rewriter_before_opt_a]: 8.878e-05 [opt_a]: 0.00305381, [2] [Cycle 1]: 0.00235254, [45] [expand_dump_flag]: 3.34001e-06 [switch_simplify]: 4.387e-05 [loop_unroll]: 3.053e-05 [a_1]: 0.00068538 [with_stream_mark]: 1.887e-05 [recompute_prepare]: 1.069e-05 [updatestate_depend_eliminate]: 4.25999e-06 [updatestate_assign_eliminate]: 3.04001e-06 [updatestate_loads_eliminate]: 3.13e-06 [parameter_eliminate]: 1.85001e-06 [a_2]: 8.933e-05 [accelerated_algorithm]: 2.003e-05 [shard]: 2.15002e-06 [meta_shard_fg_expand]: 2.19001e-06 [shard_inline]: 6.78998e-06 [merge_send_recv]: 9.44998e-06 [auto_parallel]: 6.90002e-06 [parallel]: 2.33e-05 [flash_sp]: 9.04e-06 [merge_comm]: 4.05998e-06 [allreduce_fusion]: 3.40003e-06 [matmul_add_comm_reduction]: 1.338e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 1.055e-05 [virtual_dataset]: 6.97002e-06 [get_grad_eliminate_]: 6.28e-06 [virtual_output]: 6.71e-06 [merge_forward]: 4.74002e-06 [cell_reuse_recompute_pass]: 1.56998e-06 [offload_activation]: 1.071e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.415e-05 [merge_recompute_call_nodes]: 3.048e-05 [before_grad]: 1.198e-05 [set_forward_comm_id_for_comm_node_pass]: 4.10998e-06 [meta_fg_expand]: 3.14999e-06 [flash_sp_send_recv_attached]: 5.41998e-06 [receive_attached]: 2.24001e-06 [after_resolve]: 1.407e-05 [a_after_grad]: 9.96e-06 [renormalize]: 0.0008383 [add_forward_monad_depend]: 6.46e-06 [auto_monad_grad]: 2.74001e-06 [auto_monad_eliminator]: 1.685e-05 [cse]: 3.241e-05 [a_3]: 5.223e-05 [Cycle 2]: 0.0006877, [45] [expand_dump_flag]: 2.24999e-06 [switch_simplify]: 8.03001e-06 [loop_unroll]: 6.53e-06 [a_1]: 0.00015167 [with_stream_mark]: 1.013e-05 [recompute_prepare]: 7.06999e-06 [updatestate_depend_eliminate]: 3.53e-06 [updatestate_assign_eliminate]: 2.22001e-06 [updatestate_loads_eliminate]: 2.93998e-06 [parameter_eliminate]: 1.10001e-06 [a_2]: 8.025e-05 [accelerated_algorithm]: 1.021e-05 [shard]: 1.44e-06 [meta_shard_fg_expand]: 1.55999e-06 [shard_inline]: 6.53998e-06 [merge_send_recv]: 5.27001e-06 [auto_parallel]: 5.73002e-06 [parallel]: 6.89999e-06 [flash_sp]: 3.67998e-06 [merge_comm]: 3.38999e-06 [allreduce_fusion]: 3.14999e-06 [matmul_add_comm_reduction]: 7.06999e-06 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 6.94001e-06 [virtual_dataset]: 6.14999e-06 [get_grad_eliminate_]: 5.92999e-06 [virtual_output]: 6.26998e-06 [merge_forward]: 2.99999e-06 [cell_reuse_recompute_pass]: 1.64e-06 [offload_activation]: 8.08001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.154e-05 [merge_recompute_call_nodes]: 7.2e-07 [before_grad]: 1.011e-05 [set_forward_comm_id_for_comm_node_pass]: 4.10998e-06 [meta_fg_expand]: 2.26e-06 [flash_sp_send_recv_attached]: 1.08001e-06 [receive_attached]: 1.39998e-06 [after_resolve]: 1.09e-05 [a_after_grad]: 9.17001e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.12999e-06 [auto_monad_grad]: 1.07e-06 [auto_monad_eliminator]: 6.88998e-06 [cse]: 1.537e-05 [a_3]: 3.71e-05 [py_interpret_to_execute_after_opt_a]: 1.231e-05 [slice_cell_reuse_recomputed_activation]: 2.17001e-06 [rewriter_after_opt_a]: 3.881e-05 [convert_after_rewriter]: 6.29999e-06 [order_py_execute_after_rewriter]: 5.47999e-06 [mutable_eliminate]: 0.00064799 [opt_b]: 0.00022651, [1] [Cycle 1]: 0.00021977, [7] [b_1]: 0.00014442 [b_2]: 8.87e-06 [updatestate_depend_eliminate]: 6.51e-06 [updatestate_assign_eliminate]: 2.33998e-06 [updatestate_loads_eliminate]: 2.61e-06 [renormalize]: 6.59988e-07 [cse]: 1.92e-05 [optimize_parallel_all_gather_comm]: 1.791e-05 [overlap_param_gather]: 2.01e-06 [cconv]: 2.754e-05 [loop_unroll]: 0.00045783 [opt_after_cconv]: 0.0001097, [1] [Cycle 1]: 0.00010388, [7] [c_1]: 3.64e-05 [parameter_eliminate]: 3.45998e-06 [updatestate_depend_eliminate]: 5.44e-06 [updatestate_assign_eliminate]: 2.54999e-06 [updatestate_loads_eliminate]: 2.68e-06 [cse]: 1.824e-05 [renormalize]: 4.50003e-07 [remove_dup_value]: 1.426e-05 [tuple_transform]: 8.649e-05, [1] [Cycle 1]: 8.096e-05, [4] [d_1]: 5.247e-05 [none_parameter_eliminate]: 1.86e-06 [renormalize]: 3.19997e-07 [switch_simplify]: 7.73999e-06 [partial_unused_args_eliminate]: 1.82999e-06 [add_recomputation]: 5.045e-05 [cse_after_recomputation]: 2.449e-05, [1] [Cycle 1]: 1.948e-05, [1] [cse]: 1.278e-05 [environ_conv]: 5.48997e-06 [swap_dp_allreduce_reducescatter]: 5.51998e-06 [bias_add_comm_swap]: 3.16999e-06 [label_micro_interleaved_index]: 4.90999e-06 [label_fine_grained_interleaved_index]: 3.04001e-06 [merge_cast_opt]: 1.91e-06 [slice_recompute_activation]: 2.37999e-06 [micro_interleaved_order_control]: 2.31e-06 [assign_add_opt]: 1.45999e-06 [ForceFp32Comm]: 9.70002e-07 [remove_cast_before_assign_add]: 1.50999e-06 [full_micro_interleaved_order_control]: 2.41998e-06 [reorder_send_recv_between_fp_bp]: 3.06001e-06 [comm_op_add_attrs]: 1.05999e-06 [add_comm_op_reuse_tag]: 1.01002e-06 [interleave_split_concat_branches]: 1.21997e-06 [interleave_parallel_branches]: 1.12e-06 [overlap_opt_shard_in_pipeline]: 1.20001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.97999e-06 [control_data_broadcast_order]: 1.416e-05 [grouped_pairwise_exchange_alltoall]: 2.07001e-06 [offloading_packed_experts]: 3.97e-06 [overlap_recompute_and_grad_model_parallel]: 4.86002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.21997e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.52001e-06 [overlap_grad_ring_attention]: 4.55999e-06 [overlap_grad_flash_sp]: 2.272e-05 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 2.19001e-06 [split_layernorm_comm]: 1.75001e-06 [handle_group_info]: 9.20001e-07 [symbol_engine_optimizer]: 8.225e-05, [1] [Cycle 1]: 7.831e-05, [6] [build]: 3.32002e-06 [elim_shapecalc]: 1.042e-05 [elim_not_effective]: 1.384e-05 [opt_reshape]: 8.55001e-06 [fold_const_symbol]: 1.11e-05 [renormalize]: 3.00002e-07 [detach_backward]: 2.12999e-06 [pipeline_parallel_scheduler]: 1.87999e-06 [auto_monad_reorder]: 1.779e-05 [get_jit_bprop_graph]: 0.169766 [rewriter_after_jit_bprop_graph]: 1.45e-05 [opt_after_jit_grad]: 0.00085874 [validate]: 5.554e-05 Sums bootstrap : 0.000435s : 0.09% type_inference : 0.292687s : 62.48% event_method : 0.000021s : 0.00% auto_monad : 0.000076s : 0.02% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000040s : 0.01% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000029s : 0.01% optimize.rewriter_before_opt_a : 0.000089s : 0.02% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000052s : 0.01% optimize.opt_a.loop_unroll : 0.000037s : 0.01% optimize.opt_a.a_1 : 0.000837s : 0.18% optimize.opt_a.with_stream_mark : 0.000029s : 0.01% optimize.opt_a.recompute_prepare : 0.000018s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000170s : 0.04% optimize.opt_a.accelerated_algorithm : 0.000030s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.00% optimize.opt_a.merge_send_recv : 0.000015s : 0.00% optimize.opt_a.auto_parallel : 0.000013s : 0.00% optimize.opt_a.parallel : 0.000030s : 0.01% optimize.opt_a.flash_sp : 0.000013s : 0.00% optimize.opt_a.merge_comm : 0.000007s : 0.00% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.00% optimize.opt_a.virtual_dataset : 0.000013s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.00% optimize.opt_a.virtual_output : 0.000013s : 0.00% optimize.opt_a.merge_forward : 0.000008s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000019s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000031s : 0.01% optimize.opt_a.before_grad : 0.000022s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000025s : 0.01% optimize.opt_a.a_after_grad : 0.000019s : 0.00% optimize.opt_a.renormalize : 0.000838s : 0.18% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.01% optimize.opt_a.cse : 0.000048s : 0.01% optimize.opt_a.a_3 : 0.000089s : 0.02% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000039s : 0.01% optimize.convert_after_rewriter : 0.000006s : 0.00% optimize.order_py_execute_after_rewriter : 0.000005s : 0.00% optimize.mutable_eliminate : 0.000648s : 0.14% optimize.opt_b.b_1 : 0.000144s : 0.03% optimize.opt_b.b_2 : 0.000009s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000019s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.00% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000028s : 0.01% optimize.loop_unroll : 0.000458s : 0.10% optimize.opt_after_cconv.c_1 : 0.000036s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000018s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.00% optimize.tuple_transform.d_1 : 0.000052s : 0.01% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000050s : 0.01% optimize.cse_after_recomputation.cse : 0.000013s : 0.00% optimize.environ_conv : 0.000005s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000002s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000023s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.00% get_jit_bprop_graph : 0.169766s : 36.24% rewriter_after_jit_bprop_graph : 0.000015s : 0.00% opt_after_jit_grad : 0.000859s : 0.18% validate : 0.000056s : 0.01% Time group info: ------[substitution.] 0.000248 36 13.86% : 0.000034s : 6: substitution.arithmetic_simplify 0.70% : 0.000002s : 2: substitution.elim_not_effective 0.73% : 0.000002s : 2: substitution.fold_const_symbol 2.86% : 0.000007s : 4: substitution.graph_param_transform 64.66% : 0.000160s : 4: substitution.inline 1.63% : 0.000004s : 4: substitution.j_node_and_user_rematch 4.87% : 0.000012s : 2: substitution.less_batch_normalization 2.24% : 0.000006s : 4: substitution.remove_not_recompute_node 1.92% : 0.000005s : 4: substitution.replace_old_param 6.52% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.292607 2 99.70% : 0.291736s : 1: type_inference.infer 0.30% : 0.000872s : 1: type_inference.specialize ------[replace.] 0.000064 8 61.75% : 0.000040s : 4: replace.inline 38.25% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000172 8 91.63% : 0.000157s : 4: match.inline 8.37% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000212 1278 0.85% : 0.000002s : 13: predicate.accumulaten_eliminater 1.89% : 0.000004s : 4: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 8: predicate.addn_check_dump 0.86% : 0.000002s : 13: predicate.addn_zero_filter 0.74% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.48% : 0.000005s : 21: predicate.arithmetic_simplify 0.86% : 0.000002s : 13: predicate.cast_eliminate 0.61% : 0.000001s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.57% : 0.000001s : 8: predicate.depend_value_elim 0.86% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.99% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.82% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.86% : 0.000004s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 4: predicate.elim_not_effective 0.35% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.00% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.00% : 0.000002s : 17: predicate.environ_get_depend_swap 1.87% : 0.000004s : 25: predicate.environ_get_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.33% : 0.000005s : 21: predicate.float_depend_g_call 0.49% : 0.000001s : 8: predicate.float_environ_get_switch 0.73% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.68% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 6.59% : 0.000014s : 58: predicate.inline 0.71% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.99% : 0.000002s : 8: predicate.less_batch_normalization 1.73% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.38% : 0.000005s : 38: predicate.load_eliminater 1.01% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.34% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.64% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.58% : 0.000001s : 8: predicate.merge_addn 0.57% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.75% : 0.000002s : 13: predicate.minmaximum_grad 1.41% : 0.000003s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.40% : 0.000001s : 4: predicate.parallel_virtual_node 1.76% : 0.000004s : 21: predicate.partial_defer_inline 1.56% : 0.000003s : 21: predicate.partial_eliminate 0.90% : 0.000002s : 13: predicate.print_const_string_wrapper 0.59% : 0.000001s : 8: predicate.reduce_all_const_elim 1.15% : 0.000002s : 13: predicate.reduce_eliminate 2.30% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 8: predicate.remove_not_recompute_node 1.31% : 0.000003s : 25: predicate.replace_applicator 0.51% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 0.89% : 0.000002s : 13: predicate.reshape_eliminate 0.68% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.97% : 0.000002s : 8: predicate.same_eliminate 0.52% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 8: predicate.shard_identity_eliminate 0.77% : 0.000002s : 8: predicate.special_op_eliminate 0.68% : 0.000001s : 8: predicate.specialize_transform 0.99% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.50% : 0.000003s : 21: predicate.switch_defer_inline 2.02% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.82% : 0.000010s : 67: predicate.switch_simplify 0.85% : 0.000002s : 13: predicate.tile_eliminate 0.87% : 0.000002s : 13: predicate.transpose_eliminate 1.46% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.37% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.31% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.42% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.70% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.38% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.99% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.56% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.84% : 0.000002s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000790 11 58.68% : 0.000463s : 5: func_graph_cloner_run.FuncGraphClonerGraph 41.32% : 0.000326s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.484506 192 0.00% : 0.000004s : 1: ForceFp32Comm 0.76% : 0.003678s : 1: add_attr 0.76% : 0.003664s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000055s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.02% : 0.000081s : 1: auto_monad 0.00% : 0.000023s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.10% : 0.000465s : 1: bootstrap 0.01% : 0.000031s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000009s : 1: convert_after_rewriter 0.01% : 0.000027s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000009s : 1: environ_conv 0.01% : 0.000028s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 35.05% : 0.169812s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.10% : 0.000466s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.14% : 0.000658s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000017s : 1: opt.transform.mutable_eliminate 0.27% : 0.001299s : 78: opt.transform.opt_a 0.01% : 0.000034s : 1: opt.transform.opt_after_cconv 0.01% : 0.000047s : 1: opt.transform.opt_after_jit_grad 0.02% : 0.000121s : 28: opt.transform.opt_b 0.01% : 0.000058s : 2: opt.transform.opt_trans_graph 0.01% : 0.000040s : 4: opt.transform.symbol_engine_opt 0.63% : 0.003057s : 1: opt_a 0.02% : 0.000113s : 1: opt_after_cconv 0.18% : 0.000870s : 1: opt_after_jit_grad 0.05% : 0.000230s : 1: opt_b 1.10% : 0.005326s : 1: optimize 0.00% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000009s : 1: order_py_execute_after_rewriter 0.01% : 0.000026s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000044s : 1: pre_auto_parallel 0.01% : 0.000032s : 1: py_interpret_to_execute 0.00% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.00% : 0.000018s : 1: remove_dup_value 0.09% : 0.000450s : 1: renormalize.infer 0.08% : 0.000377s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000024s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000043s : 1: rewriter_after_opt_a 0.02% : 0.000093s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000085s : 1: symbol_engine_optimizer 0.02% : 0.000089s : 1: tuple_transform 60.41% : 0.292709s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:28.403.367 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:28.403.679 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.156294, [21] [bootstrap]: 0.00050416 [type_inference]: 0.00740908 [event_method]: 2.294e-05 [auto_monad]: 8.435e-05 [graph_reusing]: 7.16999e-06 [inline]: 2.79001e-06 [add_attr]: 0.00393341, [1] [add_attr_with_inline]: 0.00392199, [1] [Cycle 1]: 9.82e-05, [2] [tag_attr]: 2.646e-05 [meta_addattr_fg_expand]: 5.88002e-06 [parallel-infer-symbol]: 3.66999e-06 [pre_auto_parallel]: 4.339e-05 [insert-virtual-dataset]: 2.45002e-06 [parallel-infer-symbol-second]: 6.29982e-07 [dataset_repeat_opt]: 1.82999e-06 [pipeline_split]: 1.76e-06 [optimize]: 0.142602, [53] [py_interpret_to_execute]: 3.872e-05 [rewriter_before_opt_a]: 0.00010083 [opt_a]: 0.139595, [2] [Cycle 1]: 0.00318766, [45] [expand_dump_flag]: 8.79e-06 [switch_simplify]: 8.339e-05 [loop_unroll]: 3.297e-05 [a_1]: 0.00090366 [with_stream_mark]: 2.973e-05 [recompute_prepare]: 1.267e-05 [updatestate_depend_eliminate]: 7e-06 [updatestate_assign_eliminate]: 4.02e-06 [updatestate_loads_eliminate]: 3.97002e-06 [parameter_eliminate]: 3.15002e-06 [a_2]: 0.00014531 [accelerated_algorithm]: 2.459e-05 [shard]: 1.88002e-06 [meta_shard_fg_expand]: 3.26001e-06 [shard_inline]: 8.92999e-06 [merge_send_recv]: 1.145e-05 [auto_parallel]: 1.229e-05 [parallel]: 2.177e-05 [flash_sp]: 1.095e-05 [merge_comm]: 4.79e-06 [allreduce_fusion]: 4.70001e-06 [matmul_add_comm_reduction]: 1.432e-05 [allreduce_slice_to_reducescatter]: 8.39995e-07 [virtual_shard_identity]: 1.219e-05 [virtual_dataset]: 8.69e-06 [get_grad_eliminate_]: 8.47998e-06 [virtual_output]: 8.69998e-06 [merge_forward]: 5.51998e-06 [cell_reuse_recompute_pass]: 1.26002e-06 [offload_activation]: 1.265e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.925e-05 [merge_recompute_call_nodes]: 1.52999e-06 [before_grad]: 1.539e-05 [set_forward_comm_id_for_comm_node_pass]: 4.44002e-06 [meta_fg_expand]: 4.25999e-06 [flash_sp_send_recv_attached]: 5.57001e-06 [receive_attached]: 2.60002e-06 [after_resolve]: 1.388e-05 [a_after_grad]: 1.457e-05 [renormalize]: 0.00105696 [add_forward_monad_depend]: 7.35e-06 [auto_monad_grad]: 2.71e-06 [auto_monad_eliminator]: 2.249e-05 [cse]: 3.932e-05 [a_3]: 8.038e-05 [Cycle 2]: 0.00105034, [45] [expand_dump_flag]: 2.53998e-06 [switch_simplify]: 1.029e-05 [loop_unroll]: 7.82e-06 [a_1]: 0.0002008 [with_stream_mark]: 1.478e-05 [recompute_prepare]: 8.78001e-06 [updatestate_depend_eliminate]: 4.73001e-06 [updatestate_assign_eliminate]: 3.26999e-06 [updatestate_loads_eliminate]: 2.86e-06 [parameter_eliminate]: 1.72999e-06 [a_2]: 0.00012776 [accelerated_algorithm]: 1.308e-05 [shard]: 1.76e-06 [meta_shard_fg_expand]: 2.08002e-06 [shard_inline]: 8.48001e-06 [merge_send_recv]: 8.17e-06 [auto_parallel]: 9.55001e-06 [parallel]: 7.62002e-06 [flash_sp]: 4.32998e-06 [merge_comm]: 4.37998e-06 [allreduce_fusion]: 3.85e-06 [matmul_add_comm_reduction]: 1.061e-05 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 8.84998e-06 [virtual_dataset]: 8.58001e-06 [get_grad_eliminate_]: 7.51001e-06 [virtual_output]: 7.93999e-06 [merge_forward]: 4.70001e-06 [cell_reuse_recompute_pass]: 2.71999e-06 [offload_activation]: 1.137e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.817e-05 [merge_recompute_call_nodes]: 1.30001e-06 [before_grad]: 1.389e-05 [set_forward_comm_id_for_comm_node_pass]: 7.31999e-06 [meta_fg_expand]: 3.92998e-06 [flash_sp_send_recv_attached]: 1.45001e-06 [receive_attached]: 2.51e-06 [after_resolve]: 1.351e-05 [a_after_grad]: 1.193e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.86998e-06 [auto_monad_grad]: 1.91e-06 [auto_monad_eliminator]: 1.199e-05 [cse]: 2.375e-05 [a_3]: 6.091e-05 [py_interpret_to_execute_after_opt_a]: 2.206e-05 [slice_cell_reuse_recomputed_activation]: 5.65001e-06 [rewriter_after_opt_a]: 5.003e-05 [convert_after_rewriter]: 1.127e-05 [order_py_execute_after_rewriter]: 9.24e-06 [mutable_eliminate]: 0.00082637 [opt_b]: 0.00039013, [1] [Cycle 1]: 0.00037714, [7] [b_1]: 0.00024426 [b_2]: 1.226e-05 [updatestate_depend_eliminate]: 8.90001e-06 [updatestate_assign_eliminate]: 3.83999e-06 [updatestate_loads_eliminate]: 3.46999e-06 [renormalize]: 8.39995e-07 [cse]: 2.748e-05 [optimize_parallel_all_gather_comm]: 2.471e-05 [overlap_param_gather]: 6.17001e-06 [cconv]: 3.372e-05 [loop_unroll]: 0.00053113 [opt_after_cconv]: 0.00015296, [1] [Cycle 1]: 0.00014315, [7] [c_1]: 4.306e-05 [parameter_eliminate]: 3.09999e-06 [updatestate_depend_eliminate]: 6.26e-06 [updatestate_assign_eliminate]: 3.32997e-06 [updatestate_loads_eliminate]: 3.73999e-06 [cse]: 2.65e-05 [renormalize]: 5.69999e-07 [remove_dup_value]: 2.095e-05 [tuple_transform]: 0.00011963, [1] [Cycle 1]: 0.00011109, [4] [d_1]: 6.397e-05 [none_parameter_eliminate]: 2.02001e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 9.17001e-06 [partial_unused_args_eliminate]: 4.75999e-06 [add_recomputation]: 6.645e-05 [cse_after_recomputation]: 3.557e-05, [1] [Cycle 1]: 2.719e-05, [1] [cse]: 1.642e-05 [environ_conv]: 1.012e-05 [swap_dp_allreduce_reducescatter]: 9.16998e-06 [bias_add_comm_swap]: 5.65001e-06 [label_micro_interleaved_index]: 7.98999e-06 [label_fine_grained_interleaved_index]: 5.51e-06 [merge_cast_opt]: 3.8e-06 [slice_recompute_activation]: 5.91e-06 [micro_interleaved_order_control]: 5.30999e-06 [assign_add_opt]: 3.74002e-06 [ForceFp32Comm]: 3.7e-06 [remove_cast_before_assign_add]: 3.95e-06 [full_micro_interleaved_order_control]: 4.92999e-06 [reorder_send_recv_between_fp_bp]: 5.16002e-06 [comm_op_add_attrs]: 3.92002e-06 [add_comm_op_reuse_tag]: 3.85e-06 [interleave_split_concat_branches]: 3.85998e-06 [interleave_parallel_branches]: 3.71999e-06 [overlap_opt_shard_in_pipeline]: 4e-06 [overlap_opt_shard_grad_in_pipeline]: 4.39998e-06 [control_data_broadcast_order]: 2.007e-05 [grouped_pairwise_exchange_alltoall]: 4.38999e-06 [offloading_packed_experts]: 8.65999e-06 [overlap_recompute_and_grad_model_parallel]: 8.37e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.02998e-06 [overlap_recompute_allgather_and_fa_grad]: 4.4e-06 [overlap_recompute_comm]: 5.64e-06 [overlap_grad_ring_attention]: 8.88002e-06 [overlap_grad_flash_sp]: 2.931e-05 [begin_end_overlap_inline]: 3.48999e-06 [split_matmul_comm_elemetwise]: 4.94998e-06 [split_layernorm_comm]: 4.60001e-06 [handle_group_info]: 3.6e-06 [symbol_engine_optimizer]: 0.00012265, [1] [Cycle 1]: 0.00011427, [6] [build]: 4.50999e-06 [elim_shapecalc]: 1.468e-05 [elim_not_effective]: 1.676e-05 [opt_reshape]: 9.47001e-06 [fold_const_symbol]: 1.29e-05 [renormalize]: 6.99976e-07 [detach_backward]: 4.77e-06 [pipeline_parallel_scheduler]: 2.24001e-06 [auto_monad_reorder]: 2.597e-05 [get_jit_bprop_graph]: 1.96e-06 [rewriter_after_jit_bprop_graph]: 5.26002e-06 [opt_after_jit_grad]: 0.00081029 [validate]: 4.899e-05 Sums bootstrap : 0.000504s : 3.38% type_inference : 0.007409s : 49.71% event_method : 0.000023s : 0.15% auto_monad : 0.000084s : 0.57% graph_reusing : 0.000007s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000043s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000039s : 0.26% optimize.rewriter_before_opt_a : 0.000101s : 0.68% optimize.opt_a.expand_dump_flag : 0.000011s : 0.08% optimize.opt_a.switch_simplify : 0.000094s : 0.63% optimize.opt_a.loop_unroll : 0.000041s : 0.27% optimize.opt_a.a_1 : 0.001104s : 7.41% optimize.opt_a.with_stream_mark : 0.000045s : 0.30% optimize.opt_a.recompute_prepare : 0.000021s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000012s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000005s : 0.03% optimize.opt_a.a_2 : 0.000273s : 1.83% optimize.opt_a.accelerated_algorithm : 0.000038s : 0.25% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000017s : 0.12% optimize.opt_a.merge_send_recv : 0.000020s : 0.13% optimize.opt_a.auto_parallel : 0.000022s : 0.15% optimize.opt_a.parallel : 0.000029s : 0.20% optimize.opt_a.flash_sp : 0.000015s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.06% optimize.opt_a.allreduce_fusion : 0.000009s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.14% optimize.opt_a.virtual_dataset : 0.000017s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.11% optimize.opt_a.virtual_output : 0.000017s : 0.11% optimize.opt_a.merge_forward : 0.000010s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000024s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000029s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.08% optimize.opt_a.meta_fg_expand : 0.000008s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.05% optimize.opt_a.receive_attached : 0.000005s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.18% optimize.opt_a.a_after_grad : 0.000026s : 0.18% optimize.opt_a.renormalize : 0.001057s : 7.09% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.06% optimize.opt_a.auto_monad_grad : 0.000005s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.23% optimize.opt_a.cse : 0.000063s : 0.42% optimize.opt_a.a_3 : 0.000141s : 0.95% optimize.py_interpret_to_execute_after_opt_a : 0.000022s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.04% optimize.rewriter_after_opt_a : 0.000050s : 0.34% optimize.convert_after_rewriter : 0.000011s : 0.08% optimize.order_py_execute_after_rewriter : 0.000009s : 0.06% optimize.mutable_eliminate : 0.000826s : 5.54% optimize.opt_b.b_1 : 0.000244s : 1.64% optimize.opt_b.b_2 : 0.000012s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000027s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.17% optimize.overlap_param_gather : 0.000006s : 0.04% optimize.cconv : 0.000034s : 0.23% optimize.loop_unroll : 0.000531s : 3.56% optimize.opt_after_cconv.c_1 : 0.000043s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000026s : 0.18% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000021s : 0.14% optimize.tuple_transform.d_1 : 0.000064s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.03% optimize.add_recomputation : 0.000066s : 0.45% optimize.cse_after_recomputation.cse : 0.000016s : 0.11% optimize.environ_conv : 0.000010s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.06% optimize.bias_add_comm_swap : 0.000006s : 0.04% optimize.label_micro_interleaved_index : 0.000008s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000006s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.02% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.03% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000020s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000009s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000006s : 0.04% optimize.overlap_grad_ring_attention : 0.000009s : 0.06% optimize.overlap_grad_flash_sp : 0.000029s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.03% optimize.split_layernorm_comm : 0.000005s : 0.03% optimize.handle_group_info : 0.000004s : 0.02% optimize.symbol_engine_optimizer.build : 0.000005s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000005s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000026s : 0.17% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000810s : 5.44% validate : 0.000049s : 0.33% Time group info: ------[substitution.] 0.000331 45 11.33% : 0.000038s : 5: substitution.arithmetic_simplify 8.87% : 0.000029s : 3: substitution.cast_eliminate 0.81% : 0.000003s : 3: substitution.elim_not_effective 0.50% : 0.000002s : 3: substitution.fold_const_symbol 2.11% : 0.000007s : 5: substitution.graph_param_transform 60.74% : 0.000201s : 4: substitution.inline 1.93% : 0.000006s : 6: substitution.j_node_and_user_rematch 4.16% : 0.000014s : 2: substitution.less_batch_normalization 2.16% : 0.000007s : 6: substitution.remove_not_recompute_node 1.83% : 0.000006s : 4: substitution.replace_old_param 5.57% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007338 2 85.67% : 0.006287s : 1: type_inference.infer 14.33% : 0.001052s : 1: type_inference.specialize ------[replace.] 0.000075 8 59.97% : 0.000045s : 4: replace.inline 40.03% : 0.000030s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000215 8 92.30% : 0.000198s : 4: match.inline 7.70% : 0.000017s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000290 1596 0.88% : 0.000003s : 17: predicate.accumulaten_eliminater 0.92% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 10: predicate.addn_check_dump 0.90% : 0.000003s : 17: predicate.addn_zero_filter 0.82% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.51% : 0.000007s : 27: predicate.arithmetic_simplify 0.88% : 0.000003s : 17: predicate.cast_eliminate 0.55% : 0.000002s : 10: predicate.check_bprop_eliminate 0.54% : 0.000002s : 10: predicate.compare_switch_simplify 0.19% : 0.000001s : 5: predicate.const_output_eliminate 0.58% : 0.000002s : 10: predicate.depend_value_elim 0.90% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.01% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.91% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.15% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.33% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.02% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.07% : 0.000003s : 22: predicate.environ_get_depend_swap 1.63% : 0.000005s : 32: predicate.environ_get_eliminate 1.01% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.25% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.67% : 0.000008s : 25: predicate.float_depend_g_call 0.51% : 0.000001s : 10: predicate.float_environ_get_switch 0.83% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.58% : 0.000002s : 10: predicate.get_grad_eliminate 0.23% : 0.000001s : 5: predicate.graph_param_transform 0.58% : 0.000002s : 10: predicate.incorporate_call 0.44% : 0.000001s : 10: predicate.incorporate_call_switch 6.02% : 0.000017s : 72: predicate.inline 0.89% : 0.000003s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.95% : 0.000003s : 10: predicate.less_batch_normalization 1.78% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.29% : 0.000007s : 48: predicate.load_eliminater 0.87% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.86% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.51% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.54% : 0.000002s : 10: predicate.merge_addn 0.72% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.53% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 17: predicate.minmaximum_grad 1.26% : 0.000004s : 5: predicate.mutable_eliminate 0.29% : 0.000001s : 5: predicate.opt_reshape 0.38% : 0.000001s : 5: predicate.parallel_virtual_node 1.56% : 0.000005s : 25: predicate.partial_defer_inline 1.46% : 0.000004s : 26: predicate.partial_eliminate 0.91% : 0.000003s : 17: predicate.print_const_string_wrapper 0.53% : 0.000002s : 10: predicate.reduce_all_const_elim 1.15% : 0.000003s : 17: predicate.reduce_eliminate 2.34% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 10: predicate.remove_not_recompute_node 1.21% : 0.000003s : 31: predicate.replace_applicator 0.45% : 0.000001s : 10: predicate.replace_old_param 0.23% : 0.000001s : 5: predicate.reset_defer_inline 0.99% : 0.000003s : 17: predicate.reshape_eliminate 0.66% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 5: predicate.row_tensor_eliminate 0.76% : 0.000002s : 10: predicate.same_eliminate 0.41% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.92% : 0.000003s : 10: predicate.shard_identity_eliminate 0.76% : 0.000002s : 10: predicate.special_op_eliminate 0.63% : 0.000002s : 10: predicate.specialize_transform 0.96% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.88% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.31% : 0.000004s : 25: predicate.switch_defer_inline 1.84% : 0.000005s : 35: predicate.switch_layer_defer_inline 8.21% : 0.000024s : 76: predicate.switch_simplify 0.90% : 0.000003s : 17: predicate.tile_eliminate 0.86% : 0.000002s : 17: predicate.transpose_eliminate 1.61% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.55% : 0.000005s : 27: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.76% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.25% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 2.91% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 5: predicate.value_based_eliminate 0.73% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.57% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000744 11 49.06% : 0.000365s : 5: func_graph_cloner_run.FuncGraphClonerGraph 50.94% : 0.000379s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.305866 192 0.00% : 0.000007s : 1: ForceFp32Comm 1.29% : 0.003944s : 1: add_attr 1.28% : 0.003926s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.02% : 0.000071s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.03% : 0.000095s : 1: auto_monad 0.01% : 0.000034s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.18% : 0.000555s : 1: bootstrap 0.01% : 0.000038s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000023s : 1: control_data_broadcast_order 0.00% : 0.000015s : 1: convert_after_rewriter 0.01% : 0.000039s : 1: cse_after_recomputation 0.00% : 0.000009s : 1: dataset_repeat_opt 0.01% : 0.000028s : 1: detach_backward 0.00% : 0.000013s : 1: environ_conv 0.01% : 0.000035s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.00% : 0.000008s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.18% : 0.000538s : 1: loop_unroll 0.00% : 0.000006s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.27% : 0.000834s : 1: mutable_eliminate 0.00% : 0.000012s : 1: offloading_packed_experts 0.01% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000022s : 1: opt.transform.mutable_eliminate 0.57% : 0.001729s : 78: opt.transform.opt_a 0.01% : 0.000041s : 1: opt.transform.opt_after_cconv 0.01% : 0.000041s : 1: opt.transform.opt_after_jit_grad 0.05% : 0.000159s : 28: opt.transform.opt_b 0.02% : 0.000071s : 2: opt.transform.opt_trans_graph 0.02% : 0.000050s : 4: opt.transform.symbol_engine_opt 45.64% : 0.139599s : 1: opt_a 0.05% : 0.000157s : 1: opt_after_cconv 0.27% : 0.000823s : 1: opt_after_jit_grad 0.13% : 0.000395s : 1: opt_b 46.78% : 0.143080s : 1: optimize 0.01% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000012s : 1: order_py_execute_after_rewriter 0.01% : 0.000033s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000008s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000010s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000009s : 1: overlap_recompute_comm 0.00% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000053s : 1: pre_auto_parallel 0.01% : 0.000043s : 1: py_interpret_to_execute 0.01% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.01% : 0.000025s : 1: remove_dup_value 0.19% : 0.000589s : 1: renormalize.infer 0.15% : 0.000457s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000053s : 1: rewriter_after_opt_a 0.03% : 0.000105s : 1: rewriter_before_opt_a 0.00% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.00% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000125s : 1: symbol_engine_optimizer 0.04% : 0.000123s : 1: tuple_transform 2.44% : 0.007465s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:31.278.354 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.186886, [21] [bootstrap]: 0.00050013 [type_inference]: 0.00712339 [event_method]: 2.226e-05 [auto_monad]: 8.158e-05 [graph_reusing]: 6.93e-06 [inline]: 2.53e-06 [add_attr]: 0.0039549, [1] [add_attr_with_inline]: 0.00394282, [1] [Cycle 1]: 8.333e-05, [2] [tag_attr]: 2.634e-05 [meta_addattr_fg_expand]: 6.19001e-06 [parallel-infer-symbol]: 4.13999e-06 [pre_auto_parallel]: 4.257e-05 [insert-virtual-dataset]: 2.43998e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 2.31998e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.174082, [53] [py_interpret_to_execute]: 3.343e-05 [rewriter_before_opt_a]: 9.944e-05 [opt_a]: 0.171009, [2] [Cycle 1]: 0.170061, [45] [expand_dump_flag]: 3.18998e-06 [switch_simplify]: 4.463e-05 [loop_unroll]: 3.119e-05 [a_1]: 0.00085356 [with_stream_mark]: 4.033e-05 [recompute_prepare]: 1.329e-05 [updatestate_depend_eliminate]: 5.34e-06 [updatestate_assign_eliminate]: 4.18001e-06 [updatestate_loads_eliminate]: 3.32002e-06 [parameter_eliminate]: 2.24001e-06 [a_2]: 0.00011884 [accelerated_algorithm]: 2.268e-05 [shard]: 2.02001e-06 [meta_shard_fg_expand]: 2.26e-06 [shard_inline]: 8.82999e-06 [merge_send_recv]: 1.057e-05 [auto_parallel]: 9.04e-06 [parallel]: 2.507e-05 [flash_sp]: 9.28002e-06 [merge_comm]: 5.52001e-06 [allreduce_fusion]: 4.32998e-06 [matmul_add_comm_reduction]: 1.476e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 1.202e-05 [virtual_dataset]: 8.47998e-06 [get_grad_eliminate_]: 8.05e-06 [virtual_output]: 7.93001e-06 [merge_forward]: 5.99e-06 [cell_reuse_recompute_pass]: 1.57001e-06 [offload_activation]: 1.304e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.842e-05 [merge_recompute_call_nodes]: 1.50001e-06 [before_grad]: 1.342e-05 [set_forward_comm_id_for_comm_node_pass]: 4.92e-06 [meta_fg_expand]: 3.83001e-06 [flash_sp_send_recv_attached]: 4.90001e-06 [receive_attached]: 2.67001e-06 [after_resolve]: 1.566e-05 [a_after_grad]: 1.282e-05 [renormalize]: 0.168205 [add_forward_monad_depend]: 1.424e-05 [auto_monad_grad]: 3.17002e-06 [auto_monad_eliminator]: 2.777e-05 [cse]: 4.449e-05 [a_3]: 8.454e-05 [Cycle 2]: 0.00093326, [45] [expand_dump_flag]: 2.60997e-06 [switch_simplify]: 1.189e-05 [loop_unroll]: 9.13002e-06 [a_1]: 0.00023136 [with_stream_mark]: 2.097e-05 [recompute_prepare]: 9.24e-06 [updatestate_depend_eliminate]: 5.29e-06 [updatestate_assign_eliminate]: 3.8e-06 [updatestate_loads_eliminate]: 3.5e-06 [parameter_eliminate]: 2.29001e-06 [a_2]: 0.00010374 [accelerated_algorithm]: 1.437e-05 [shard]: 2.44999e-06 [meta_shard_fg_expand]: 2.64001e-06 [shard_inline]: 8.47e-06 [merge_send_recv]: 1.07e-05 [auto_parallel]: 1.129e-05 [parallel]: 1.184e-05 [flash_sp]: 5.18002e-06 [merge_comm]: 4.72e-06 [allreduce_fusion]: 4.18999e-06 [matmul_add_comm_reduction]: 1.428e-05 [allreduce_slice_to_reducescatter]: 8.80013e-07 [virtual_shard_identity]: 1.02e-05 [virtual_dataset]: 7.85e-06 [get_grad_eliminate_]: 8.10999e-06 [virtual_output]: 7.98001e-06 [merge_forward]: 5.00001e-06 [cell_reuse_recompute_pass]: 3.37997e-06 [offload_activation]: 1.392e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.556e-05 [merge_recompute_call_nodes]: 1.93997e-06 [before_grad]: 1.375e-05 [set_forward_comm_id_for_comm_node_pass]: 5.17e-06 [meta_fg_expand]: 3.62002e-06 [flash_sp_send_recv_attached]: 1.77001e-06 [receive_attached]: 2.67001e-06 [after_resolve]: 1.461e-05 [a_after_grad]: 1.219e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.86e-06 [auto_monad_grad]: 1.13001e-06 [auto_monad_eliminator]: 1.246e-05 [cse]: 2.448e-05 [a_3]: 5.047e-05 [py_interpret_to_execute_after_opt_a]: 2.115e-05 [slice_cell_reuse_recomputed_activation]: 2.26e-06 [rewriter_after_opt_a]: 5.048e-05 [convert_after_rewriter]: 8.72e-06 [order_py_execute_after_rewriter]: 6.79001e-06 [mutable_eliminate]: 0.00089538 [opt_b]: 0.00031644, [1] [Cycle 1]: 0.00030616, [7] [b_1]: 0.00019702 [b_2]: 1.122e-05 [updatestate_depend_eliminate]: 1.274e-05 [updatestate_assign_eliminate]: 3.75e-06 [updatestate_loads_eliminate]: 3.72002e-06 [renormalize]: 7.10017e-07 [cse]: 3.965e-05 [optimize_parallel_all_gather_comm]: 2.435e-05 [overlap_param_gather]: 2.05002e-06 [cconv]: 3.88e-05 [loop_unroll]: 0.00068817 [opt_after_cconv]: 0.00014975, [1] [Cycle 1]: 0.00014156, [7] [c_1]: 4.9e-05 [parameter_eliminate]: 6.07999e-06 [updatestate_depend_eliminate]: 1.089e-05 [updatestate_assign_eliminate]: 3.32002e-06 [updatestate_loads_eliminate]: 3.85e-06 [cse]: 3.161e-05 [renormalize]: 6.69999e-07 [remove_dup_value]: 1.996e-05 [tuple_transform]: 0.00011034, [1] [Cycle 1]: 0.00010551, [4] [d_1]: 7.481e-05 [none_parameter_eliminate]: 1.60001e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 1.045e-05 [partial_unused_args_eliminate]: 1.89e-06 [add_recomputation]: 7.089e-05 [cse_after_recomputation]: 2.933e-05, [1] [Cycle 1]: 2.293e-05, [1] [cse]: 1.689e-05 [environ_conv]: 8.33999e-06 [swap_dp_allreduce_reducescatter]: 7.56999e-06 [bias_add_comm_swap]: 3.88999e-06 [label_micro_interleaved_index]: 6.53e-06 [label_fine_grained_interleaved_index]: 2.93e-06 [merge_cast_opt]: 1.32e-06 [slice_recompute_activation]: 2.66999e-06 [micro_interleaved_order_control]: 2.41e-06 [assign_add_opt]: 1.38002e-06 [ForceFp32Comm]: 1.25999e-06 [remove_cast_before_assign_add]: 1.33002e-06 [full_micro_interleaved_order_control]: 2.17999e-06 [reorder_send_recv_between_fp_bp]: 2.83e-06 [comm_op_add_attrs]: 9.79984e-07 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.13001e-06 [interleave_parallel_branches]: 1.11002e-06 [overlap_opt_shard_in_pipeline]: 1.42e-06 [overlap_opt_shard_grad_in_pipeline]: 2.32999e-06 [control_data_broadcast_order]: 1.703e-05 [grouped_pairwise_exchange_alltoall]: 1.74e-06 [offloading_packed_experts]: 6.32001e-06 [overlap_recompute_and_grad_model_parallel]: 6.73e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.48002e-06 [overlap_recompute_comm]: 2.34001e-06 [overlap_grad_ring_attention]: 5.06997e-06 [overlap_grad_flash_sp]: 2.755e-05 [begin_end_overlap_inline]: 7.29982e-07 [split_matmul_comm_elemetwise]: 2.26e-06 [split_layernorm_comm]: 1.72999e-06 [handle_group_info]: 1.22999e-06 [symbol_engine_optimizer]: 0.00010884, [1] [Cycle 1]: 0.0001035, [6] [build]: 5.05001e-06 [elim_shapecalc]: 1.699e-05 [elim_not_effective]: 2.002e-05 [opt_reshape]: 8.98002e-06 [fold_const_symbol]: 1.343e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.46e-06 [pipeline_parallel_scheduler]: 1.50001e-06 [auto_monad_reorder]: 2.352e-05 [get_jit_bprop_graph]: 3.06001e-06 [rewriter_after_jit_bprop_graph]: 6.84001e-06 [opt_after_jit_grad]: 0.00076183 [validate]: 5.95e-05 Sums bootstrap : 0.000500s : 0.28% type_inference : 0.007123s : 3.92% event_method : 0.000022s : 0.01% auto_monad : 0.000082s : 0.04% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000043s : 0.02% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.02% optimize.rewriter_before_opt_a : 0.000099s : 0.05% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000057s : 0.03% optimize.opt_a.loop_unroll : 0.000040s : 0.02% optimize.opt_a.a_1 : 0.001085s : 0.60% optimize.opt_a.with_stream_mark : 0.000061s : 0.03% optimize.opt_a.recompute_prepare : 0.000023s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000223s : 0.12% optimize.opt_a.accelerated_algorithm : 0.000037s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000017s : 0.01% optimize.opt_a.merge_send_recv : 0.000021s : 0.01% optimize.opt_a.auto_parallel : 0.000020s : 0.01% optimize.opt_a.parallel : 0.000037s : 0.02% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000010s : 0.01% optimize.opt_a.allreduce_fusion : 0.000009s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000029s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.01% optimize.opt_a.virtual_dataset : 0.000016s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.01% optimize.opt_a.virtual_output : 0.000016s : 0.01% optimize.opt_a.merge_forward : 0.000011s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000027s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000027s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000030s : 0.02% optimize.opt_a.a_after_grad : 0.000025s : 0.01% optimize.opt_a.renormalize : 0.168205s : 92.53% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000040s : 0.02% optimize.opt_a.cse : 0.000069s : 0.04% optimize.opt_a.a_3 : 0.000135s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000050s : 0.03% optimize.convert_after_rewriter : 0.000009s : 0.00% optimize.order_py_execute_after_rewriter : 0.000007s : 0.00% optimize.mutable_eliminate : 0.000895s : 0.49% optimize.opt_b.b_1 : 0.000197s : 0.11% optimize.opt_b.b_2 : 0.000011s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000013s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000040s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000039s : 0.02% optimize.loop_unroll : 0.000688s : 0.38% optimize.opt_after_cconv.c_1 : 0.000049s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000032s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.01% optimize.tuple_transform.d_1 : 0.000075s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000071s : 0.04% optimize.cse_after_recomputation.cse : 0.000017s : 0.01% optimize.environ_conv : 0.000008s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000028s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000024s : 0.01% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000762s : 0.42% validate : 0.000060s : 0.03% Time group info: ------[substitution.] 0.000315 45 13.39% : 0.000042s : 5: substitution.arithmetic_simplify 10.50% : 0.000033s : 3: substitution.cast_eliminate 0.93% : 0.000003s : 3: substitution.elim_not_effective 0.49% : 0.000002s : 3: substitution.fold_const_symbol 2.72% : 0.000009s : 5: substitution.graph_param_transform 55.93% : 0.000176s : 4: substitution.inline 1.74% : 0.000005s : 6: substitution.j_node_and_user_rematch 4.34% : 0.000014s : 2: substitution.less_batch_normalization 2.46% : 0.000008s : 6: substitution.remove_not_recompute_node 2.09% : 0.000007s : 4: substitution.replace_old_param 5.41% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007044 2 86.30% : 0.006079s : 1: type_inference.infer 13.70% : 0.000965s : 1: type_inference.specialize ------[replace.] 0.000073 8 59.47% : 0.000043s : 4: replace.inline 40.53% : 0.000030s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000188 8 92.01% : 0.000173s : 4: match.inline 7.99% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000290 1596 0.94% : 0.000003s : 17: predicate.accumulaten_eliminater 1.07% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 10: predicate.addn_check_dump 0.89% : 0.000003s : 17: predicate.addn_zero_filter 0.86% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.95% : 0.000009s : 27: predicate.arithmetic_simplify 0.84% : 0.000002s : 17: predicate.cast_eliminate 0.60% : 0.000002s : 10: predicate.check_bprop_eliminate 0.51% : 0.000001s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.71% : 0.000002s : 10: predicate.depend_value_elim 0.86% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.12% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.88% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.02% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 5: predicate.elim_not_effective 0.51% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.03% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.07% : 0.000003s : 22: predicate.environ_get_depend_swap 1.76% : 0.000005s : 32: predicate.environ_get_eliminate 1.05% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.30% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.15% : 0.000006s : 25: predicate.float_depend_g_call 0.51% : 0.000001s : 10: predicate.float_environ_get_switch 0.89% : 0.000003s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.70% : 0.000002s : 10: predicate.get_grad_eliminate 0.24% : 0.000001s : 5: predicate.graph_param_transform 0.55% : 0.000002s : 10: predicate.incorporate_call 0.44% : 0.000001s : 10: predicate.incorporate_call_switch 6.06% : 0.000018s : 72: predicate.inline 0.75% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.98% : 0.000003s : 10: predicate.less_batch_normalization 1.84% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.50% : 0.000007s : 48: predicate.load_eliminater 1.51% : 0.000004s : 5: predicate.loop_unroll_after_grad 1.85% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.73% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.55% : 0.000002s : 10: predicate.merge_addn 0.76% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.65% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 17: predicate.minmaximum_grad 1.63% : 0.000005s : 5: predicate.mutable_eliminate 0.32% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.69% : 0.000005s : 25: predicate.partial_defer_inline 1.49% : 0.000004s : 26: predicate.partial_eliminate 0.86% : 0.000002s : 17: predicate.print_const_string_wrapper 0.62% : 0.000002s : 10: predicate.reduce_all_const_elim 1.15% : 0.000003s : 17: predicate.reduce_eliminate 2.35% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 10: predicate.remove_not_recompute_node 1.42% : 0.000004s : 31: predicate.replace_applicator 0.50% : 0.000001s : 10: predicate.replace_old_param 0.30% : 0.000001s : 5: predicate.reset_defer_inline 1.03% : 0.000003s : 17: predicate.reshape_eliminate 0.64% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 5: predicate.row_tensor_eliminate 0.83% : 0.000002s : 10: predicate.same_eliminate 0.36% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.67% : 0.000002s : 10: predicate.shard_identity_eliminate 0.85% : 0.000002s : 10: predicate.special_op_eliminate 0.67% : 0.000002s : 10: predicate.specialize_transform 0.94% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.78% : 0.000005s : 25: predicate.switch_defer_inline 1.83% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.24% : 0.000012s : 76: predicate.switch_simplify 0.90% : 0.000003s : 17: predicate.tile_eliminate 1.04% : 0.000003s : 17: predicate.transpose_eliminate 1.66% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.59% : 0.000005s : 27: predicate.tuple_list_get_item_depend_reorder 3.13% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.48% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.57% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.82% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.41% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 2.90% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.50% : 0.000001s : 5: predicate.value_based_eliminate 0.70% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.60% : 0.000002s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.34% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000797 11 44.53% : 0.000355s : 5: func_graph_cloner_run.FuncGraphClonerGraph 55.47% : 0.000442s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.535080 192 0.00% : 0.000005s : 1: ForceFp32Comm 0.74% : 0.003961s : 1: add_attr 0.74% : 0.003947s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000076s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.02% : 0.000089s : 1: auto_monad 0.01% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.10% : 0.000533s : 1: bootstrap 0.01% : 0.000042s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000020s : 1: control_data_broadcast_order 0.00% : 0.000012s : 1: convert_after_rewriter 0.01% : 0.000032s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000011s : 1: environ_conv 0.01% : 0.000030s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.13% : 0.000702s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.17% : 0.000912s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 0.00% : 0.000025s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000030s : 1: opt.transform.mutable_eliminate 0.31% : 0.001685s : 78: opt.transform.opt_a 0.01% : 0.000047s : 1: opt.transform.opt_after_cconv 0.01% : 0.000043s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000168s : 28: opt.transform.opt_b 0.02% : 0.000083s : 2: opt.transform.opt_trans_graph 0.01% : 0.000055s : 4: opt.transform.symbol_engine_opt 31.96% : 0.171013s : 1: opt_a 0.03% : 0.000155s : 1: opt_after_cconv 0.15% : 0.000777s : 1: opt_after_jit_grad 0.06% : 0.000320s : 1: opt_b 32.54% : 0.174089s : 1: optimize 0.01% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000010s : 1: order_py_execute_after_rewriter 0.01% : 0.000031s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000014s : 1: overlap_recompute_comm 0.00% : 0.000009s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000020s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.01% : 0.000047s : 1: pre_auto_parallel 0.01% : 0.000037s : 1: py_interpret_to_execute 0.00% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.00% : 0.000023s : 1: remove_dup_value 31.32% : 0.167577s : 1: renormalize.infer 0.11% : 0.000605s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000056s : 1: rewriter_after_opt_a 0.02% : 0.000104s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000112s : 1: symbol_engine_optimizer 0.02% : 0.000113s : 1: tuple_transform 1.34% : 0.007149s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:33.881.067 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:33.881.372 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.210535, [21] [bootstrap]: 0.190582 [type_inference]: 0.00676941 [event_method]: 2.155e-05 [auto_monad]: 7.579e-05 [graph_reusing]: 6.89999e-06 [inline]: 2.33998e-06 [add_attr]: 0.00361863, [1] [add_attr_with_inline]: 0.00360801, [1] [Cycle 1]: 9.126e-05, [2] [tag_attr]: 2.431e-05 [meta_addattr_fg_expand]: 6.96999e-06 [parallel-infer-symbol]: 4.06001e-06 [pre_auto_parallel]: 4.019e-05 [insert-virtual-dataset]: 2.55002e-06 [parallel-infer-symbol-second]: 8.10018e-07 [dataset_repeat_opt]: 2.40002e-06 [pipeline_split]: 1.86e-06 [optimize]: 0.00755547, [53] [py_interpret_to_execute]: 3.473e-05 [rewriter_before_opt_a]: 9.965e-05 [opt_a]: 0.00414409, [2] [Cycle 1]: 0.00302123, [45] [expand_dump_flag]: 3.45e-06 [switch_simplify]: 4.36e-05 [loop_unroll]: 3.244e-05 [a_1]: 0.00084491 [with_stream_mark]: 2.186e-05 [recompute_prepare]: 1.139e-05 [updatestate_depend_eliminate]: 4.85001e-06 [updatestate_assign_eliminate]: 3.93999e-06 [updatestate_loads_eliminate]: 3.93999e-06 [parameter_eliminate]: 2.35002e-06 [a_2]: 0.00015099 [accelerated_algorithm]: 2.31e-05 [shard]: 2.27001e-06 [meta_shard_fg_expand]: 2.44999e-06 [shard_inline]: 8.85999e-06 [merge_send_recv]: 1.011e-05 [auto_parallel]: 8.04997e-06 [parallel]: 2.022e-05 [flash_sp]: 8.72e-06 [merge_comm]: 4.53999e-06 [allreduce_fusion]: 4.60001e-06 [matmul_add_comm_reduction]: 1.316e-05 [allreduce_slice_to_reducescatter]: 1.24998e-06 [virtual_shard_identity]: 1.049e-05 [virtual_dataset]: 9.52001e-06 [get_grad_eliminate_]: 8.04002e-06 [virtual_output]: 8.3e-06 [merge_forward]: 4.25999e-06 [cell_reuse_recompute_pass]: 1.94e-06 [offload_activation]: 1.129e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.929e-05 [merge_recompute_call_nodes]: 1.48002e-06 [before_grad]: 1.357e-05 [set_forward_comm_id_for_comm_node_pass]: 4.35e-06 [meta_fg_expand]: 4.31002e-06 [flash_sp_send_recv_attached]: 4.79998e-06 [receive_attached]: 2.59001e-06 [after_resolve]: 1.59e-05 [a_after_grad]: 1.34e-05 [renormalize]: 0.00107017 [add_forward_monad_depend]: 9.60001e-06 [auto_monad_grad]: 2.78e-06 [auto_monad_eliminator]: 2.349e-05 [cse]: 4.098e-05 [a_3]: 8.548e-05 [Cycle 2]: 0.00110428, [45] [expand_dump_flag]: 2.44001e-06 [switch_simplify]: 1.095e-05 [loop_unroll]: 8.55001e-06 [a_1]: 0.00021814 [with_stream_mark]: 1.753e-05 [recompute_prepare]: 9.44998e-06 [updatestate_depend_eliminate]: 4.68999e-06 [updatestate_assign_eliminate]: 3.70998e-06 [updatestate_loads_eliminate]: 3.55e-06 [parameter_eliminate]: 1.74998e-06 [a_2]: 0.00013411 [accelerated_algorithm]: 1.472e-05 [shard]: 1.96998e-06 [meta_shard_fg_expand]: 2.27999e-06 [shard_inline]: 8.25999e-06 [merge_send_recv]: 1.147e-05 [auto_parallel]: 1.045e-05 [parallel]: 8.21002e-06 [flash_sp]: 3.93001e-06 [merge_comm]: 6.02001e-06 [allreduce_fusion]: 4.32e-06 [matmul_add_comm_reduction]: 1.035e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 1.066e-05 [virtual_dataset]: 9.75002e-06 [get_grad_eliminate_]: 8.48999e-06 [virtual_output]: 7.98001e-06 [merge_forward]: 5.03002e-06 [cell_reuse_recompute_pass]: 3.51001e-06 [offload_activation]: 1.126e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.895e-05 [merge_recompute_call_nodes]: 1.05001e-06 [before_grad]: 1.411e-05 [set_forward_comm_id_for_comm_node_pass]: 6.26998e-06 [meta_fg_expand]: 3.33e-06 [flash_sp_send_recv_attached]: 2.09e-06 [receive_attached]: 2.12001e-06 [after_resolve]: 1.627e-05 [a_after_grad]: 1.287e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.07999e-06 [auto_monad_grad]: 2.03002e-06 [auto_monad_eliminator]: 1.381e-05 [cse]: 2.66e-05 [a_3]: 6.326e-05 [py_interpret_to_execute_after_opt_a]: 2.42e-05 [slice_cell_reuse_recomputed_activation]: 6.55002e-06 [rewriter_after_opt_a]: 5.269e-05 [convert_after_rewriter]: 1.186e-05 [order_py_execute_after_rewriter]: 9.94001e-06 [mutable_eliminate]: 0.00091508 [opt_b]: 0.00042173, [1] [Cycle 1]: 0.00040726, [7] [b_1]: 0.00025909 [b_2]: 1.26e-05 [updatestate_depend_eliminate]: 1.104e-05 [updatestate_assign_eliminate]: 4.07e-06 [updatestate_loads_eliminate]: 3.75998e-06 [renormalize]: 6.30011e-07 [cse]: 3.668e-05 [optimize_parallel_all_gather_comm]: 2.911e-05 [overlap_param_gather]: 5.85002e-06 [cconv]: 4.175e-05 [loop_unroll]: 0.00066751 [opt_after_cconv]: 0.0001845, [1] [Cycle 1]: 0.00017221, [7] [c_1]: 4.785e-05 [parameter_eliminate]: 5.29e-06 [updatestate_depend_eliminate]: 8.29002e-06 [updatestate_assign_eliminate]: 3.53e-06 [updatestate_loads_eliminate]: 3.63999e-06 [cse]: 3.192e-05 [renormalize]: 8.90024e-07 [remove_dup_value]: 2.394e-05 [tuple_transform]: 0.00013548, [1] [Cycle 1]: 0.00012583, [4] [d_1]: 7.337e-05 [none_parameter_eliminate]: 2.63998e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 1.017e-05 [partial_unused_args_eliminate]: 6.56999e-06 [add_recomputation]: 7.575e-05 [cse_after_recomputation]: 4.047e-05, [1] [Cycle 1]: 3.082e-05, [1] [cse]: 1.868e-05 [environ_conv]: 1.268e-05 [swap_dp_allreduce_reducescatter]: 1.053e-05 [bias_add_comm_swap]: 7.19001e-06 [label_micro_interleaved_index]: 1.044e-05 [label_fine_grained_interleaved_index]: 6.45002e-06 [merge_cast_opt]: 4.77e-06 [slice_recompute_activation]: 5.52999e-06 [micro_interleaved_order_control]: 6.74001e-06 [assign_add_opt]: 4.90001e-06 [ForceFp32Comm]: 4.50999e-06 [remove_cast_before_assign_add]: 4.53999e-06 [full_micro_interleaved_order_control]: 5.14e-06 [reorder_send_recv_between_fp_bp]: 6.11e-06 [comm_op_add_attrs]: 4.50001e-06 [add_comm_op_reuse_tag]: 4.28999e-06 [interleave_split_concat_branches]: 4.87e-06 [interleave_parallel_branches]: 4.66002e-06 [overlap_opt_shard_in_pipeline]: 4.79e-06 [overlap_opt_shard_grad_in_pipeline]: 5.10001e-06 [control_data_broadcast_order]: 2.267e-05 [grouped_pairwise_exchange_alltoall]: 5.02e-06 [offloading_packed_experts]: 9.23002e-06 [overlap_recompute_and_grad_model_parallel]: 9.43002e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.32e-06 [overlap_recompute_allgather_and_fa_grad]: 4.55999e-06 [overlap_recompute_comm]: 5.98002e-06 [overlap_grad_ring_attention]: 9.47999e-06 [overlap_grad_flash_sp]: 3.185e-05 [begin_end_overlap_inline]: 4.24997e-06 [split_matmul_comm_elemetwise]: 5.37001e-06 [split_layernorm_comm]: 5.72001e-06 [handle_group_info]: 4.26001e-06 [symbol_engine_optimizer]: 0.00013658, [1] [Cycle 1]: 0.00012672, [6] [build]: 4.74998e-06 [elim_shapecalc]: 1.533e-05 [elim_not_effective]: 1.956e-05 [opt_reshape]: 9.84999e-06 [fold_const_symbol]: 1.358e-05 [renormalize]: 2.20025e-07 [detach_backward]: 5.92999e-06 [pipeline_parallel_scheduler]: 2.36e-06 [auto_monad_reorder]: 3.071e-05 [get_jit_bprop_graph]: 2.04e-06 [rewriter_after_jit_bprop_graph]: 6.93e-06 [opt_after_jit_grad]: 0.00087573 [validate]: 6.294e-05 Sums bootstrap : 0.190582s : 93.10% type_inference : 0.006769s : 3.31% event_method : 0.000022s : 0.01% auto_monad : 0.000076s : 0.04% graph_reusing : 0.000007s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000040s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000035s : 0.02% optimize.rewriter_before_opt_a : 0.000100s : 0.05% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000055s : 0.03% optimize.opt_a.loop_unroll : 0.000041s : 0.02% optimize.opt_a.a_1 : 0.001063s : 0.52% optimize.opt_a.with_stream_mark : 0.000039s : 0.02% optimize.opt_a.recompute_prepare : 0.000021s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000285s : 0.14% optimize.opt_a.accelerated_algorithm : 0.000038s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000017s : 0.01% optimize.opt_a.merge_send_recv : 0.000022s : 0.01% optimize.opt_a.auto_parallel : 0.000018s : 0.01% optimize.opt_a.parallel : 0.000028s : 0.01% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000011s : 0.01% optimize.opt_a.allreduce_fusion : 0.000009s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.01% optimize.opt_a.virtual_dataset : 0.000019s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.01% optimize.opt_a.virtual_output : 0.000016s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000028s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.01% optimize.opt_a.meta_fg_expand : 0.000008s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000032s : 0.02% optimize.opt_a.a_after_grad : 0.000026s : 0.01% optimize.opt_a.renormalize : 0.001070s : 0.52% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000037s : 0.02% optimize.opt_a.cse : 0.000068s : 0.03% optimize.opt_a.a_3 : 0.000149s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000024s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000007s : 0.00% optimize.rewriter_after_opt_a : 0.000053s : 0.03% optimize.convert_after_rewriter : 0.000012s : 0.01% optimize.order_py_execute_after_rewriter : 0.000010s : 0.00% optimize.mutable_eliminate : 0.000915s : 0.45% optimize.opt_b.b_1 : 0.000259s : 0.13% optimize.opt_b.b_2 : 0.000013s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000037s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000029s : 0.01% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.000042s : 0.02% optimize.loop_unroll : 0.000668s : 0.33% optimize.opt_after_cconv.c_1 : 0.000048s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000032s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000024s : 0.01% optimize.tuple_transform.d_1 : 0.000073s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.00% optimize.partial_unused_args_eliminate : 0.000007s : 0.00% optimize.add_recomputation : 0.000076s : 0.04% optimize.cse_after_recomputation.cse : 0.000019s : 0.01% optimize.environ_conv : 0.000013s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000011s : 0.01% optimize.bias_add_comm_swap : 0.000007s : 0.00% optimize.label_micro_interleaved_index : 0.000010s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.00% optimize.merge_cast_opt : 0.000005s : 0.00% optimize.slice_recompute_activation : 0.000006s : 0.00% optimize.micro_interleaved_order_control : 0.000007s : 0.00% optimize.assign_add_opt : 0.000005s : 0.00% optimize.ForceFp32Comm : 0.000005s : 0.00% optimize.remove_cast_before_assign_add : 0.000005s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000005s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000005s : 0.00% optimize.interleave_parallel_branches : 0.000005s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000005s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000023s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.00% optimize.offloading_packed_experts : 0.000009s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000005s : 0.00% optimize.overlap_recompute_comm : 0.000006s : 0.00% optimize.overlap_grad_ring_attention : 0.000009s : 0.00% optimize.overlap_grad_flash_sp : 0.000032s : 0.02% optimize.begin_end_overlap_inline : 0.000004s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000006s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000031s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000876s : 0.43% validate : 0.000063s : 0.03% Time group info: ------[substitution.] 0.000300 45 13.00% : 0.000039s : 5: substitution.arithmetic_simplify 9.71% : 0.000029s : 3: substitution.cast_eliminate 0.92% : 0.000003s : 3: substitution.elim_not_effective 0.57% : 0.000002s : 3: substitution.fold_const_symbol 2.57% : 0.000008s : 5: substitution.graph_param_transform 55.56% : 0.000167s : 4: substitution.inline 2.05% : 0.000006s : 6: substitution.j_node_and_user_rematch 4.67% : 0.000014s : 2: substitution.less_batch_normalization 2.55% : 0.000008s : 6: substitution.remove_not_recompute_node 2.80% : 0.000008s : 4: substitution.replace_old_param 5.60% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006707 2 87.04% : 0.005838s : 1: type_inference.infer 12.96% : 0.000869s : 1: type_inference.specialize ------[replace.] 0.000070 8 59.69% : 0.000042s : 4: replace.inline 40.31% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000179 8 91.63% : 0.000164s : 4: match.inline 8.37% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000307 1596 0.89% : 0.000003s : 17: predicate.accumulaten_eliminater 0.90% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 10: predicate.addn_check_dump 1.00% : 0.000003s : 17: predicate.addn_zero_filter 0.71% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.36% : 0.000007s : 27: predicate.arithmetic_simplify 8.82% : 0.000027s : 17: predicate.cast_eliminate 0.55% : 0.000002s : 10: predicate.check_bprop_eliminate 0.54% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000001s : 5: predicate.const_output_eliminate 0.52% : 0.000002s : 10: predicate.depend_value_elim 0.81% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.00% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.99% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.99% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.50% : 0.000002s : 5: predicate.elim_shapecalc_of_broadcastargs 0.99% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.03% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.03% : 0.000003s : 22: predicate.environ_get_depend_swap 1.59% : 0.000005s : 32: predicate.environ_get_eliminate 1.11% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.18% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.07% : 0.000006s : 25: predicate.float_depend_g_call 0.52% : 0.000002s : 10: predicate.float_environ_get_switch 0.77% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000001s : 5: predicate.fold_const_symbol 0.67% : 0.000002s : 10: predicate.get_grad_eliminate 0.18% : 0.000001s : 5: predicate.graph_param_transform 0.54% : 0.000002s : 10: predicate.incorporate_call 0.42% : 0.000001s : 10: predicate.incorporate_call_switch 5.51% : 0.000017s : 72: predicate.inline 0.86% : 0.000003s : 10: predicate.inline_without_move 0.27% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.85% : 0.000003s : 10: predicate.less_batch_normalization 1.64% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.34% : 0.000007s : 48: predicate.load_eliminater 0.98% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.78% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.51% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.57% : 0.000002s : 10: predicate.merge_addn 0.52% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.51% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 17: predicate.minmaximum_grad 1.72% : 0.000005s : 5: predicate.mutable_eliminate 0.35% : 0.000001s : 5: predicate.opt_reshape 0.50% : 0.000002s : 5: predicate.parallel_virtual_node 1.50% : 0.000005s : 25: predicate.partial_defer_inline 1.42% : 0.000004s : 26: predicate.partial_eliminate 0.84% : 0.000003s : 17: predicate.print_const_string_wrapper 0.62% : 0.000002s : 10: predicate.reduce_all_const_elim 1.06% : 0.000003s : 17: predicate.reduce_eliminate 2.30% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000001s : 10: predicate.remove_not_recompute_node 1.14% : 0.000004s : 31: predicate.replace_applicator 0.67% : 0.000002s : 10: predicate.replace_old_param 0.31% : 0.000001s : 5: predicate.reset_defer_inline 0.95% : 0.000003s : 17: predicate.reshape_eliminate 0.58% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 5: predicate.row_tensor_eliminate 0.85% : 0.000003s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.65% : 0.000002s : 10: predicate.shard_identity_eliminate 0.76% : 0.000002s : 10: predicate.special_op_eliminate 0.64% : 0.000002s : 10: predicate.specialize_transform 0.90% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.30% : 0.000004s : 25: predicate.switch_defer_inline 1.84% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.00% : 0.000012s : 76: predicate.switch_simplify 0.81% : 0.000002s : 17: predicate.tile_eliminate 0.79% : 0.000002s : 17: predicate.transpose_eliminate 1.45% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.20% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 2.95% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.38% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.07% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.64% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.25% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 2.80% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 5: predicate.value_based_eliminate 0.56% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.57% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000762 11 55.93% : 0.000426s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.07% : 0.000336s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.224717 192 0.00% : 0.000007s : 1: ForceFp32Comm 1.62% : 0.003629s : 1: add_attr 1.61% : 0.003612s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.04% : 0.000080s : 1: add_recomputation 0.00% : 0.000008s : 1: assign_add_opt 0.04% : 0.000087s : 1: auto_monad 0.02% : 0.000038s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: begin_end_overlap_inline 0.00% : 0.000010s : 1: bias_add_comm_swap 84.84% : 0.190645s : 1: bootstrap 0.02% : 0.000046s : 1: cconv 0.00% : 0.000008s : 1: comm_op_add_attrs 0.01% : 0.000026s : 1: control_data_broadcast_order 0.01% : 0.000015s : 1: convert_after_rewriter 0.02% : 0.000044s : 1: cse_after_recomputation 0.00% : 0.000009s : 1: dataset_repeat_opt 0.01% : 0.000031s : 1: detach_backward 0.01% : 0.000016s : 1: environ_conv 0.01% : 0.000033s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000014s : 1: graph_reusing 0.00% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.00% : 0.000008s : 1: insert-virtual-dataset 0.00% : 0.000008s : 1: interleave_parallel_branches 0.00% : 0.000008s : 1: interleave_split_concat_branches 0.00% : 0.000010s : 1: label_fine_grained_interleaved_index 0.01% : 0.000014s : 1: label_micro_interleaved_index 0.30% : 0.000676s : 1: loop_unroll 0.00% : 0.000008s : 1: merge_cast_opt 0.00% : 0.000010s : 1: micro_interleaved_order_control 0.41% : 0.000926s : 1: mutable_eliminate 0.01% : 0.000012s : 1: offloading_packed_experts 0.01% : 0.000022s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000029s : 1: opt.transform.mutable_eliminate 0.74% : 0.001671s : 78: opt.transform.opt_a 0.02% : 0.000046s : 1: opt.transform.opt_after_cconv 0.02% : 0.000044s : 1: opt.transform.opt_after_jit_grad 0.08% : 0.000170s : 28: opt.transform.opt_b 0.04% : 0.000081s : 2: opt.transform.opt_trans_graph 0.02% : 0.000054s : 4: opt.transform.symbol_engine_opt 1.85% : 0.004148s : 1: opt_a 0.08% : 0.000189s : 1: opt_after_cconv 0.40% : 0.000889s : 1: opt_after_jit_grad 0.19% : 0.000426s : 1: opt_b 3.61% : 0.008114s : 1: optimize 0.01% : 0.000033s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000013s : 1: order_py_execute_after_rewriter 0.02% : 0.000036s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000014s : 1: overlap_grad_ring_attention 0.00% : 0.000009s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000008s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000010s : 1: overlap_param_gather 0.00% : 0.000008s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000013s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000009s : 1: overlap_recompute_comm 0.00% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000011s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000049s : 1: pre_auto_parallel 0.02% : 0.000038s : 1: py_interpret_to_execute 0.01% : 0.000028s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000008s : 1: remove_cast_before_assign_add 0.01% : 0.000028s : 1: remove_dup_value 0.26% : 0.000578s : 1: renormalize.infer 0.21% : 0.000480s : 1: renormalize.specialize 0.00% : 0.000010s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000057s : 1: rewriter_after_opt_a 0.05% : 0.000103s : 1: rewriter_before_opt_a 0.00% : 0.000010s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000009s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000014s : 1: swap_dp_allreduce_reducescatter 0.06% : 0.000140s : 1: symbol_engine_optimizer 0.06% : 0.000139s : 1: tuple_transform 3.03% : 0.006815s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:36.594.499 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.179099, [21] [bootstrap]: 0.00049271 [type_inference]: 0.168662 [event_method]: 2.007e-05 [auto_monad]: 7.593e-05 [graph_reusing]: 6.58e-06 [inline]: 2.84999e-06 [add_attr]: 0.00342437, [1] [add_attr_with_inline]: 0.00341448, [1] [Cycle 1]: 5.995e-05, [2] [tag_attr]: 2.094e-05 [meta_addattr_fg_expand]: 6.27001e-06 [parallel-infer-symbol]: 4.32998e-06 [pre_auto_parallel]: 3.448e-05 [insert-virtual-dataset]: 2.51998e-06 [parallel-infer-symbol-second]: 9.20001e-07 [dataset_repeat_opt]: 2.05002e-06 [pipeline_split]: 2.12001e-06 [optimize]: 0.00562117, [53] [py_interpret_to_execute]: 2.815e-05 [rewriter_before_opt_a]: 8.607e-05 [opt_a]: 0.00327487, [2] [Cycle 1]: 0.00244354, [45] [expand_dump_flag]: 3.48e-06 [switch_simplify]: 4.455e-05 [loop_unroll]: 3.204e-05 [a_1]: 0.00077825 [with_stream_mark]: 1.63e-05 [recompute_prepare]: 1.015e-05 [updatestate_depend_eliminate]: 5.59998e-06 [updatestate_assign_eliminate]: 3.93999e-06 [updatestate_loads_eliminate]: 3.45003e-06 [parameter_eliminate]: 2.40997e-06 [a_2]: 0.00011248 [accelerated_algorithm]: 2.105e-05 [shard]: 1.86e-06 [meta_shard_fg_expand]: 1.99999e-06 [shard_inline]: 7.97998e-06 [merge_send_recv]: 8.94998e-06 [auto_parallel]: 8.26002e-06 [parallel]: 2.024e-05 [flash_sp]: 8e-06 [merge_comm]: 4.51002e-06 [allreduce_fusion]: 4.42998e-06 [matmul_add_comm_reduction]: 1.152e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 1.055e-05 [virtual_dataset]: 7.78999e-06 [get_grad_eliminate_]: 8.59998e-06 [virtual_output]: 7.51001e-06 [merge_forward]: 4.75999e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 1.091e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.622e-05 [merge_recompute_call_nodes]: 1.41002e-06 [before_grad]: 1.222e-05 [set_forward_comm_id_for_comm_node_pass]: 4.45e-06 [meta_fg_expand]: 3.2e-06 [flash_sp_send_recv_attached]: 5.30001e-06 [receive_attached]: 2.41e-06 [after_resolve]: 1.387e-05 [a_after_grad]: 1.178e-05 [renormalize]: 0.00082838 [add_forward_monad_depend]: 6.61e-06 [auto_monad_grad]: 3.08e-06 [auto_monad_eliminator]: 1.811e-05 [cse]: 3.78e-05 [a_3]: 6.074e-05 [Cycle 2]: 0.00082039, [45] [expand_dump_flag]: 1.72001e-06 [switch_simplify]: 9.21002e-06 [loop_unroll]: 7.83001e-06 [a_1]: 0.00019441 [with_stream_mark]: 1.033e-05 [recompute_prepare]: 8.57e-06 [updatestate_depend_eliminate]: 3.96001e-06 [updatestate_assign_eliminate]: 3.07002e-06 [updatestate_loads_eliminate]: 3.25998e-06 [parameter_eliminate]: 1.35001e-06 [a_2]: 9.913e-05 [accelerated_algorithm]: 1.111e-05 [shard]: 1.20001e-06 [meta_shard_fg_expand]: 1.72001e-06 [shard_inline]: 7.76001e-06 [merge_send_recv]: 6.44999e-06 [auto_parallel]: 6.63e-06 [parallel]: 5.38002e-06 [flash_sp]: 4.08999e-06 [merge_comm]: 4.38999e-06 [allreduce_fusion]: 4.15999e-06 [matmul_add_comm_reduction]: 9.74e-06 [allreduce_slice_to_reducescatter]: 5.69999e-07 [virtual_shard_identity]: 8.57e-06 [virtual_dataset]: 7.21001e-06 [get_grad_eliminate_]: 7.36999e-06 [virtual_output]: 7.21999e-06 [merge_forward]: 3.97e-06 [cell_reuse_recompute_pass]: 2.12999e-06 [offload_activation]: 2.376e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.574e-05 [merge_recompute_call_nodes]: 9.10019e-07 [before_grad]: 1.241e-05 [set_forward_comm_id_for_comm_node_pass]: 4.94e-06 [meta_fg_expand]: 3.15998e-06 [flash_sp_send_recv_attached]: 1.08001e-06 [receive_attached]: 1.69998e-06 [after_resolve]: 1.359e-05 [a_after_grad]: 1.115e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.87001e-06 [auto_monad_grad]: 1.60999e-06 [auto_monad_eliminator]: 8.97999e-06 [cse]: 2.107e-05 [a_3]: 4.622e-05 [py_interpret_to_execute_after_opt_a]: 1.137e-05 [slice_cell_reuse_recomputed_activation]: 2.31e-06 [rewriter_after_opt_a]: 4.223e-05 [convert_after_rewriter]: 7.94002e-06 [order_py_execute_after_rewriter]: 6.43e-06 [mutable_eliminate]: 0.00059268 [opt_b]: 0.00026742, [1] [Cycle 1]: 0.0002601, [7] [b_1]: 0.00017467 [b_2]: 1.015e-05 [updatestate_depend_eliminate]: 6.68003e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 3.65e-06 [renormalize]: 5.59987e-07 [cse]: 2.529e-05 [optimize_parallel_all_gather_comm]: 1.79e-05 [overlap_param_gather]: 2.26e-06 [cconv]: 2.554e-05 [loop_unroll]: 0.00045254 [opt_after_cconv]: 0.00012815, [1] [Cycle 1]: 0.00012206, [7] [c_1]: 4.479e-05 [parameter_eliminate]: 3.33998e-06 [updatestate_depend_eliminate]: 5.91e-06 [updatestate_assign_eliminate]: 3.32997e-06 [updatestate_loads_eliminate]: 3.44001e-06 [cse]: 2.5e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.618e-05 [tuple_transform]: 0.00012793, [1] [Cycle 1]: 0.00012337, [4] [d_1]: 6.214e-05 [none_parameter_eliminate]: 2.12001e-06 [renormalize]: 2.89991e-07 [switch_simplify]: 1.132e-05 [partial_unused_args_eliminate]: 2.03002e-06 [add_recomputation]: 5.665e-05 [cse_after_recomputation]: 3.051e-05, [1] [Cycle 1]: 2.51e-05, [1] [cse]: 1.935e-05 [environ_conv]: 6.14001e-06 [swap_dp_allreduce_reducescatter]: 6.22001e-06 [bias_add_comm_swap]: 2.63998e-06 [label_micro_interleaved_index]: 4.79e-06 [label_fine_grained_interleaved_index]: 2.57001e-06 [merge_cast_opt]: 1.31998e-06 [slice_recompute_activation]: 1.94999e-06 [micro_interleaved_order_control]: 2.36e-06 [assign_add_opt]: 1.26002e-06 [ForceFp32Comm]: 1.19e-06 [remove_cast_before_assign_add]: 1.19e-06 [full_micro_interleaved_order_control]: 2.14999e-06 [reorder_send_recv_between_fp_bp]: 2.96999e-06 [comm_op_add_attrs]: 1.36998e-06 [add_comm_op_reuse_tag]: 1.10001e-06 [interleave_split_concat_branches]: 1.45999e-06 [interleave_parallel_branches]: 1.07998e-06 [overlap_opt_shard_in_pipeline]: 1.20999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.89e-06 [control_data_broadcast_order]: 1.827e-05 [grouped_pairwise_exchange_alltoall]: 1.55999e-06 [offloading_packed_experts]: 5.63002e-06 [overlap_recompute_and_grad_model_parallel]: 5.37001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.40999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.37999e-06 [overlap_grad_ring_attention]: 4.55001e-06 [overlap_grad_flash_sp]: 2.579e-05 [begin_end_overlap_inline]: 5.90022e-07 [split_matmul_comm_elemetwise]: 2.19001e-06 [split_layernorm_comm]: 1.91003e-06 [handle_group_info]: 1.02e-06 [symbol_engine_optimizer]: 9.411e-05, [1] [Cycle 1]: 8.933e-05, [6] [build]: 4.00998e-06 [elim_shapecalc]: 1.313e-05 [elim_not_effective]: 1.808e-05 [opt_reshape]: 9.59e-06 [fold_const_symbol]: 1.337e-05 [renormalize]: 2.30008e-07 [detach_backward]: 2.04e-06 [pipeline_parallel_scheduler]: 1.43002e-06 [auto_monad_reorder]: 2.183e-05 [get_jit_bprop_graph]: 1.92999e-06 [rewriter_after_jit_bprop_graph]: 3.93999e-06 [opt_after_jit_grad]: 0.00050751 [validate]: 4.455e-05 Sums bootstrap : 0.000493s : 0.28% type_inference : 0.168662s : 96.57% event_method : 0.000020s : 0.01% auto_monad : 0.000076s : 0.04% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000034s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000028s : 0.02% optimize.rewriter_before_opt_a : 0.000086s : 0.05% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000054s : 0.03% optimize.opt_a.loop_unroll : 0.000040s : 0.02% optimize.opt_a.a_1 : 0.000973s : 0.56% optimize.opt_a.with_stream_mark : 0.000027s : 0.02% optimize.opt_a.recompute_prepare : 0.000019s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000212s : 0.12% optimize.opt_a.accelerated_algorithm : 0.000032s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000016s : 0.01% optimize.opt_a.merge_send_recv : 0.000015s : 0.01% optimize.opt_a.auto_parallel : 0.000015s : 0.01% optimize.opt_a.parallel : 0.000026s : 0.01% optimize.opt_a.flash_sp : 0.000012s : 0.01% optimize.opt_a.merge_comm : 0.000009s : 0.01% optimize.opt_a.allreduce_fusion : 0.000009s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.01% optimize.opt_a.virtual_dataset : 0.000015s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.01% optimize.opt_a.virtual_output : 0.000015s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000035s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000025s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000027s : 0.02% optimize.opt_a.a_after_grad : 0.000023s : 0.01% optimize.opt_a.renormalize : 0.000828s : 0.47% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.00% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.02% optimize.opt_a.cse : 0.000059s : 0.03% optimize.opt_a.a_3 : 0.000107s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000042s : 0.02% optimize.convert_after_rewriter : 0.000008s : 0.00% optimize.order_py_execute_after_rewriter : 0.000006s : 0.00% optimize.mutable_eliminate : 0.000593s : 0.34% optimize.opt_b.b_1 : 0.000175s : 0.10% optimize.opt_b.b_2 : 0.000010s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000025s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000026s : 0.01% optimize.loop_unroll : 0.000453s : 0.26% optimize.opt_after_cconv.c_1 : 0.000045s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000025s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.01% optimize.tuple_transform.d_1 : 0.000062s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000057s : 0.03% optimize.cse_after_recomputation.cse : 0.000019s : 0.01% optimize.environ_conv : 0.000006s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000026s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000022s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000508s : 0.29% validate : 0.000045s : 0.03% Time group info: ------[substitution.] 0.000254 45 12.04% : 0.000031s : 5: substitution.arithmetic_simplify 8.66% : 0.000022s : 3: substitution.cast_eliminate 0.88% : 0.000002s : 3: substitution.elim_not_effective 0.94% : 0.000002s : 3: substitution.fold_const_symbol 3.08% : 0.000008s : 5: substitution.graph_param_transform 56.50% : 0.000144s : 4: substitution.inline 1.71% : 0.000004s : 6: substitution.j_node_and_user_rematch 4.67% : 0.000012s : 2: substitution.less_batch_normalization 3.19% : 0.000008s : 6: substitution.remove_not_recompute_node 2.32% : 0.000006s : 4: substitution.replace_old_param 6.01% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.168591 2 99.48% : 0.167718s : 1: type_inference.infer 0.52% : 0.000873s : 1: type_inference.specialize ------[replace.] 0.000064 8 58.83% : 0.000038s : 4: replace.inline 41.17% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000154 8 91.31% : 0.000141s : 4: match.inline 8.69% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000254 1596 1.00% : 0.000003s : 17: predicate.accumulaten_eliminater 0.76% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 10: predicate.addn_check_dump 1.00% : 0.000003s : 17: predicate.addn_zero_filter 0.89% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.38% : 0.000006s : 27: predicate.arithmetic_simplify 0.93% : 0.000002s : 17: predicate.cast_eliminate 0.61% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.58% : 0.000001s : 10: predicate.depend_value_elim 1.00% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.94% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 5: predicate.elim_not_effective 0.44% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.17% : 0.000003s : 22: predicate.environ_get_depend_swap 1.75% : 0.000004s : 32: predicate.environ_get_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.37% : 0.000003s : 25: predicate.exchange_switch_depend_value 2.26% : 0.000006s : 25: predicate.float_depend_g_call 0.54% : 0.000001s : 10: predicate.float_environ_get_switch 0.79% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.66% : 0.000002s : 10: predicate.get_grad_eliminate 0.23% : 0.000001s : 5: predicate.graph_param_transform 0.59% : 0.000001s : 10: predicate.incorporate_call 0.52% : 0.000001s : 10: predicate.incorporate_call_switch 6.33% : 0.000016s : 72: predicate.inline 0.76% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.83% : 0.000002s : 10: predicate.less_batch_normalization 1.81% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.58% : 0.000007s : 48: predicate.load_eliminater 0.94% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.07% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.70% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.61% : 0.000002s : 10: predicate.merge_addn 0.56% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.67% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 17: predicate.minmaximum_grad 1.05% : 0.000003s : 5: predicate.mutable_eliminate 0.38% : 0.000001s : 5: predicate.opt_reshape 0.54% : 0.000001s : 5: predicate.parallel_virtual_node 1.70% : 0.000004s : 25: predicate.partial_defer_inline 1.67% : 0.000004s : 26: predicate.partial_eliminate 0.89% : 0.000002s : 17: predicate.print_const_string_wrapper 0.56% : 0.000001s : 10: predicate.reduce_all_const_elim 1.24% : 0.000003s : 17: predicate.reduce_eliminate 2.59% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 10: predicate.remove_not_recompute_node 1.37% : 0.000003s : 31: predicate.replace_applicator 0.48% : 0.000001s : 10: predicate.replace_old_param 0.25% : 0.000001s : 5: predicate.reset_defer_inline 1.02% : 0.000003s : 17: predicate.reshape_eliminate 0.62% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 5: predicate.row_tensor_eliminate 0.86% : 0.000002s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 10: predicate.shard_identity_eliminate 0.81% : 0.000002s : 10: predicate.special_op_eliminate 0.70% : 0.000002s : 10: predicate.specialize_transform 0.84% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.51% : 0.000004s : 25: predicate.switch_defer_inline 2.02% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.88% : 0.000012s : 76: predicate.switch_simplify 0.96% : 0.000002s : 17: predicate.tile_eliminate 0.92% : 0.000002s : 17: predicate.transpose_eliminate 1.68% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.10% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.70% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.38% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.71% : 0.000004s : 31: predicate.tuple_to_list_eliminator_ 2.49% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.22% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 5: predicate.value_based_eliminate 0.66% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.59% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000669 11 52.98% : 0.000355s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.02% : 0.000315s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.190744 192 0.00% : 0.000005s : 1: ForceFp32Comm 1.80% : 0.003430s : 1: add_attr 1.79% : 0.003419s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000061s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000083s : 1: auto_monad 0.01% : 0.000026s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.27% : 0.000522s : 1: bootstrap 0.02% : 0.000029s : 1: cconv 0.00% : 0.000005s : 1: comm_op_add_attrs 0.01% : 0.000022s : 1: control_data_broadcast_order 0.01% : 0.000012s : 1: convert_after_rewriter 0.02% : 0.000033s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.01% : 0.000027s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.24% : 0.000461s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.32% : 0.000602s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 0.01% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000018s : 1: opt.transform.mutable_eliminate 0.80% : 0.001530s : 78: opt.transform.opt_a 0.02% : 0.000043s : 1: opt.transform.opt_after_cconv 0.02% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.08% : 0.000152s : 28: opt.transform.opt_b 0.04% : 0.000071s : 2: opt.transform.opt_trans_graph 0.03% : 0.000051s : 4: opt.transform.symbol_engine_opt 1.72% : 0.003278s : 1: opt_a 0.07% : 0.000132s : 1: opt_after_cconv 0.27% : 0.000517s : 1: opt_after_jit_grad 0.14% : 0.000271s : 1: opt_b 2.95% : 0.005627s : 1: optimize 0.01% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000009s : 1: order_py_execute_after_rewriter 0.02% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000006s : 1: pipeline_split 0.02% : 0.000039s : 1: pre_auto_parallel 0.02% : 0.000032s : 1: py_interpret_to_execute 0.01% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 0.24% : 0.000465s : 1: renormalize.infer 0.19% : 0.000354s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000046s : 1: rewriter_after_opt_a 0.05% : 0.000090s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000097s : 1: symbol_engine_optimizer 0.07% : 0.000131s : 1: tuple_transform 88.43% : 0.168683s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:39.504.489 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:39.505.054 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.170117, [21] [bootstrap]: 0.00046035 [type_inference]: 0.156412 [event_method]: 2.628e-05 [auto_monad]: 8.511e-05 [graph_reusing]: 7.58001e-06 [inline]: 3.14999e-06 [add_attr]: 0.00395257, [1] [add_attr_with_inline]: 0.00394029, [1] [Cycle 1]: 9.78e-05, [2] [tag_attr]: 2.71e-05 [meta_addattr_fg_expand]: 6.02001e-06 [parallel-infer-symbol]: 4.3e-06 [pre_auto_parallel]: 4.55e-05 [insert-virtual-dataset]: 2.68e-06 [parallel-infer-symbol-second]: 8.09989e-07 [dataset_repeat_opt]: 2.21998e-06 [pipeline_split]: 2.19999e-06 [optimize]: 0.00751562, [53] [py_interpret_to_execute]: 4.144e-05 [rewriter_before_opt_a]: 0.00010842 [opt_a]: 0.00428205, [2] [Cycle 1]: 0.00316436, [45] [expand_dump_flag]: 3.08e-06 [switch_simplify]: 4.459e-05 [loop_unroll]: 3.298e-05 [a_1]: 0.00089212 [with_stream_mark]: 2.115e-05 [recompute_prepare]: 1.266e-05 [updatestate_depend_eliminate]: 5.98998e-06 [updatestate_assign_eliminate]: 4.94e-06 [updatestate_loads_eliminate]: 4.4e-06 [parameter_eliminate]: 2.03002e-06 [a_2]: 0.00016253 [accelerated_algorithm]: 2.703e-05 [shard]: 1.89999e-06 [meta_shard_fg_expand]: 2.32999e-06 [shard_inline]: 9.40001e-06 [merge_send_recv]: 1.088e-05 [auto_parallel]: 8.62998e-06 [parallel]: 2.194e-05 [flash_sp]: 8.75001e-06 [merge_comm]: 5.17e-06 [allreduce_fusion]: 4.97e-06 [matmul_add_comm_reduction]: 1.424e-05 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 1.091e-05 [virtual_dataset]: 9.44e-06 [get_grad_eliminate_]: 1.024e-05 [virtual_output]: 9.12999e-06 [merge_forward]: 5.20999e-06 [cell_reuse_recompute_pass]: 1.27e-06 [offload_activation]: 1.305e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.267e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.536e-05 [set_forward_comm_id_for_comm_node_pass]: 5.47001e-06 [meta_fg_expand]: 4.95001e-06 [flash_sp_send_recv_attached]: 5.34e-06 [receive_attached]: 2.53e-06 [after_resolve]: 1.609e-05 [a_after_grad]: 1.455e-05 [renormalize]: 0.0011332 [add_forward_monad_depend]: 7.13e-06 [auto_monad_grad]: 2.71e-06 [auto_monad_eliminator]: 2.173e-05 [cse]: 4.525e-05 [a_3]: 8.747e-05 [Cycle 2]: 0.00110154, [45] [expand_dump_flag]: 2.36998e-06 [switch_simplify]: 1.105e-05 [loop_unroll]: 8.70999e-06 [a_1]: 0.00023705 [with_stream_mark]: 1.318e-05 [recompute_prepare]: 9.14e-06 [updatestate_depend_eliminate]: 4.87e-06 [updatestate_assign_eliminate]: 3.95e-06 [updatestate_loads_eliminate]: 3.99002e-06 [parameter_eliminate]: 1.42999e-06 [a_2]: 0.00015022 [accelerated_algorithm]: 1.364e-05 [shard]: 1.31002e-06 [meta_shard_fg_expand]: 1.91003e-06 [shard_inline]: 9.38002e-06 [merge_send_recv]: 8.71002e-06 [auto_parallel]: 7.95e-06 [parallel]: 6.66999e-06 [flash_sp]: 3.92998e-06 [merge_comm]: 5.42001e-06 [allreduce_fusion]: 4.92999e-06 [matmul_add_comm_reduction]: 9.92999e-06 [allreduce_slice_to_reducescatter]: 5.00004e-07 [virtual_shard_identity]: 1.047e-05 [virtual_dataset]: 9.31e-06 [get_grad_eliminate_]: 9.05001e-06 [virtual_output]: 8.41002e-06 [merge_forward]: 4.75999e-06 [cell_reuse_recompute_pass]: 2.30002e-06 [offload_activation]: 9.99001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.911e-05 [merge_recompute_call_nodes]: 8.00006e-07 [before_grad]: 1.485e-05 [set_forward_comm_id_for_comm_node_pass]: 6.07999e-06 [meta_fg_expand]: 3.61999e-06 [flash_sp_send_recv_attached]: 1.52001e-06 [receive_attached]: 1.59e-06 [after_resolve]: 1.448e-05 [a_after_grad]: 1.374e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.34e-06 [auto_monad_grad]: 1.31998e-06 [auto_monad_eliminator]: 1.102e-05 [cse]: 2.539e-05 [a_3]: 6.872e-05 [py_interpret_to_execute_after_opt_a]: 1.787e-05 [slice_cell_reuse_recomputed_activation]: 4.74e-06 [rewriter_after_opt_a]: 9.056e-05 [convert_after_rewriter]: 1.345e-05 [order_py_execute_after_rewriter]: 1.119e-05 [mutable_eliminate]: 0.0007629 [opt_b]: 0.00043763, [1] [Cycle 1]: 0.00042473, [7] [b_1]: 0.00027977 [b_2]: 1.254e-05 [updatestate_depend_eliminate]: 8.90001e-06 [updatestate_assign_eliminate]: 4.70001e-06 [updatestate_loads_eliminate]: 3.88999e-06 [renormalize]: 7.60017e-07 [cse]: 3.842e-05 [optimize_parallel_all_gather_comm]: 2.623e-05 [overlap_param_gather]: 5.84999e-06 [cconv]: 3.531e-05 [loop_unroll]: 0.00060611 [opt_after_cconv]: 0.0001785, [1] [Cycle 1]: 0.00016829, [7] [c_1]: 5.384e-05 [parameter_eliminate]: 4.28001e-06 [updatestate_depend_eliminate]: 8.42e-06 [updatestate_assign_eliminate]: 4.21001e-06 [updatestate_loads_eliminate]: 4.27e-06 [cse]: 3.214e-05 [renormalize]: 5.70028e-07 [remove_dup_value]: 5.72e-05 [tuple_transform]: 0.0001346, [1] [Cycle 1]: 0.00012553, [4] [d_1]: 8.093e-05 [none_parameter_eliminate]: 1.68002e-06 [renormalize]: 3.50003e-07 [switch_simplify]: 1.049e-05 [partial_unused_args_eliminate]: 4.99998e-06 [add_recomputation]: 7.765e-05 [cse_after_recomputation]: 3.757e-05, [1] [Cycle 1]: 3.01e-05, [1] [cse]: 1.958e-05 [environ_conv]: 1.12e-05 [swap_dp_allreduce_reducescatter]: 1.019e-05 [bias_add_comm_swap]: 6.39999e-06 [label_micro_interleaved_index]: 9.03002e-06 [label_fine_grained_interleaved_index]: 5.95002e-06 [merge_cast_opt]: 4.17e-06 [slice_recompute_activation]: 5.59e-06 [micro_interleaved_order_control]: 5.79e-06 [assign_add_opt]: 4.08001e-06 [ForceFp32Comm]: 3.52002e-06 [remove_cast_before_assign_add]: 3.73001e-06 [full_micro_interleaved_order_control]: 4.57998e-06 [reorder_send_recv_between_fp_bp]: 6.69999e-06 [comm_op_add_attrs]: 4.20999e-06 [add_comm_op_reuse_tag]: 3.96001e-06 [interleave_split_concat_branches]: 3.95e-06 [interleave_parallel_branches]: 3.83999e-06 [overlap_opt_shard_in_pipeline]: 4.2e-06 [overlap_opt_shard_grad_in_pipeline]: 4.36002e-06 [control_data_broadcast_order]: 2.099e-05 [grouped_pairwise_exchange_alltoall]: 4.17003e-06 [offloading_packed_experts]: 8.84998e-06 [overlap_recompute_and_grad_model_parallel]: 9.86e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.78001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.78999e-06 [overlap_recompute_comm]: 5.54e-06 [overlap_grad_ring_attention]: 8.69998e-06 [overlap_grad_flash_sp]: 2.976e-05 [begin_end_overlap_inline]: 5.16002e-06 [split_matmul_comm_elemetwise]: 5.30001e-06 [split_layernorm_comm]: 4.87e-06 [handle_group_info]: 3.68e-06 [symbol_engine_optimizer]: 0.00013187, [1] [Cycle 1]: 0.00012434, [6] [build]: 4.76002e-06 [elim_shapecalc]: 1.642e-05 [elim_not_effective]: 2.08e-05 [opt_reshape]: 1.071e-05 [fold_const_symbol]: 1.615e-05 [renormalize]: 2.69996e-07 [detach_backward]: 4.89998e-06 [pipeline_parallel_scheduler]: 2.49999e-06 [auto_monad_reorder]: 2.817e-05 [get_jit_bprop_graph]: 1.97999e-06 [rewriter_after_jit_bprop_graph]: 6.74999e-06 [opt_after_jit_grad]: 0.00068733 [validate]: 5.295e-05 Sums bootstrap : 0.000460s : 0.28% type_inference : 0.156412s : 95.32% event_method : 0.000026s : 0.02% auto_monad : 0.000085s : 0.05% graph_reusing : 0.000008s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000046s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000041s : 0.03% optimize.rewriter_before_opt_a : 0.000108s : 0.07% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000056s : 0.03% optimize.opt_a.loop_unroll : 0.000042s : 0.03% optimize.opt_a.a_1 : 0.001129s : 0.69% optimize.opt_a.with_stream_mark : 0.000034s : 0.02% optimize.opt_a.recompute_prepare : 0.000022s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000313s : 0.19% optimize.opt_a.accelerated_algorithm : 0.000041s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000019s : 0.01% optimize.opt_a.merge_send_recv : 0.000020s : 0.01% optimize.opt_a.auto_parallel : 0.000017s : 0.01% optimize.opt_a.parallel : 0.000029s : 0.02% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000011s : 0.01% optimize.opt_a.allreduce_fusion : 0.000010s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.01% optimize.opt_a.virtual_dataset : 0.000019s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.01% optimize.opt_a.virtual_output : 0.000018s : 0.01% optimize.opt_a.merge_forward : 0.000010s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000042s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000030s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.01% optimize.opt_a.meta_fg_expand : 0.000009s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000031s : 0.02% optimize.opt_a.a_after_grad : 0.000028s : 0.02% optimize.opt_a.renormalize : 0.001133s : 0.69% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.02% optimize.opt_a.cse : 0.000071s : 0.04% optimize.opt_a.a_3 : 0.000156s : 0.10% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000091s : 0.06% optimize.convert_after_rewriter : 0.000013s : 0.01% optimize.order_py_execute_after_rewriter : 0.000011s : 0.01% optimize.mutable_eliminate : 0.000763s : 0.46% optimize.opt_b.b_1 : 0.000280s : 0.17% optimize.opt_b.b_2 : 0.000013s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000038s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.02% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.000035s : 0.02% optimize.loop_unroll : 0.000606s : 0.37% optimize.opt_after_cconv.c_1 : 0.000054s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000032s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000057s : 0.03% optimize.tuple_transform.d_1 : 0.000081s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000078s : 0.05% optimize.cse_after_recomputation.cse : 0.000020s : 0.01% optimize.environ_conv : 0.000011s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000009s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000006s : 0.00% optimize.micro_interleaved_order_control : 0.000006s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000007s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000021s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000009s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000010s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000006s : 0.00% optimize.overlap_grad_ring_attention : 0.000009s : 0.01% optimize.overlap_grad_flash_sp : 0.000030s : 0.02% optimize.begin_end_overlap_inline : 0.000005s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000005s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000021s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000028s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000687s : 0.42% validate : 0.000053s : 0.03% Time group info: ------[substitution.] 0.000396 54 7.71% : 0.000031s : 4: substitution.arithmetic_simplify 9.19% : 0.000036s : 6: substitution.cast_eliminate 0.68% : 0.000003s : 4: substitution.elim_not_effective 0.52% : 0.000002s : 4: substitution.fold_const_symbol 2.17% : 0.000009s : 6: substitution.graph_param_transform 66.85% : 0.000265s : 4: substitution.inline 1.48% : 0.000006s : 8: substitution.j_node_and_user_rematch 3.96% : 0.000016s : 2: substitution.less_batch_normalization 2.40% : 0.000010s : 8: substitution.remove_not_recompute_node 1.62% : 0.000006s : 4: substitution.replace_old_param 3.41% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.156333 2 99.27% : 0.155192s : 1: type_inference.infer 0.73% : 0.001141s : 1: type_inference.specialize ------[replace.] 0.000067 8 63.79% : 0.000043s : 4: replace.inline 36.21% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000273 8 95.74% : 0.000261s : 4: match.inline 4.26% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000292 1730 0.89% : 0.000003s : 17: predicate.accumulaten_eliminater 0.73% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.59% : 0.000002s : 12: predicate.addn_check_dump 0.86% : 0.000002s : 17: predicate.addn_zero_filter 0.83% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.37% : 0.000007s : 29: predicate.arithmetic_simplify 0.99% : 0.000003s : 17: predicate.cast_eliminate 0.67% : 0.000002s : 12: predicate.check_bprop_eliminate 0.57% : 0.000002s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.64% : 0.000002s : 12: predicate.depend_value_elim 0.91% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.94% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.26% : 0.000004s : 12: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 6: predicate.elim_not_effective 0.45% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.07% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 23: predicate.environ_get_depend_swap 1.80% : 0.000005s : 35: predicate.environ_get_eliminate 1.21% : 0.000004s : 23: predicate.environ_get_set_eliminate 1.24% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.16% : 0.000006s : 25: predicate.float_depend_g_call 0.58% : 0.000002s : 12: predicate.float_environ_get_switch 0.90% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.74% : 0.000002s : 12: predicate.get_grad_eliminate 0.21% : 0.000001s : 6: predicate.graph_param_transform 0.66% : 0.000002s : 12: predicate.incorporate_call 0.58% : 0.000002s : 12: predicate.incorporate_call_switch 6.47% : 0.000019s : 78: predicate.inline 0.82% : 0.000002s : 12: predicate.inline_without_move 0.34% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.94% : 0.000003s : 12: predicate.less_batch_normalization 1.96% : 0.000006s : 33: predicate.list_to_tuple_eliminator_ 2.38% : 0.000007s : 50: predicate.load_eliminater 1.14% : 0.000003s : 6: predicate.loop_unroll_after_grad 1.91% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.81% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.67% : 0.000002s : 12: predicate.merge_addn 0.64% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.62% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 17: predicate.minmaximum_grad 1.23% : 0.000004s : 6: predicate.mutable_eliminate 0.38% : 0.000001s : 6: predicate.opt_reshape 0.41% : 0.000001s : 6: predicate.parallel_virtual_node 1.58% : 0.000005s : 25: predicate.partial_defer_inline 1.54% : 0.000004s : 27: predicate.partial_eliminate 0.84% : 0.000002s : 17: predicate.print_const_string_wrapper 0.63% : 0.000002s : 12: predicate.reduce_all_const_elim 1.22% : 0.000004s : 17: predicate.reduce_eliminate 2.45% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 12: predicate.remove_not_recompute_node 1.27% : 0.000004s : 33: predicate.replace_applicator 0.43% : 0.000001s : 12: predicate.replace_old_param 0.31% : 0.000001s : 6: predicate.reset_defer_inline 0.95% : 0.000003s : 17: predicate.reshape_eliminate 0.70% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 6: predicate.row_tensor_eliminate 0.76% : 0.000002s : 12: predicate.same_eliminate 0.44% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 12: predicate.shard_identity_eliminate 0.83% : 0.000002s : 12: predicate.special_op_eliminate 0.88% : 0.000003s : 12: predicate.specialize_transform 0.88% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 0.83% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.53% : 0.000002s : 6: predicate.switch_call_monad_eliminater 1.33% : 0.000004s : 25: predicate.switch_defer_inline 1.86% : 0.000005s : 37: predicate.switch_layer_defer_inline 4.57% : 0.000013s : 81: predicate.switch_simplify 0.89% : 0.000003s : 17: predicate.tile_eliminate 1.00% : 0.000003s : 17: predicate.transpose_eliminate 1.63% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.11% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.66% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 2.37% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.70% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.33% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.20% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 6: predicate.value_based_eliminate 0.71% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 12: predicate.virtual_output_eliminate 0.29% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000770 11 41.19% : 0.000317s : 5: func_graph_cloner_run.FuncGraphClonerGraph 58.81% : 0.000453s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.184837 192 0.00% : 0.000006s : 1: ForceFp32Comm 2.14% : 0.003963s : 1: add_attr 2.13% : 0.003944s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.04% : 0.000081s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.05% : 0.000096s : 1: auto_monad 0.02% : 0.000037s : 1: auto_monad_reorder 0.00% : 0.000008s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.27% : 0.000507s : 1: bootstrap 0.02% : 0.000039s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000024s : 1: control_data_broadcast_order 0.01% : 0.000017s : 1: convert_after_rewriter 0.02% : 0.000041s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000028s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.02% : 0.000038s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000015s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000012s : 1: label_micro_interleaved_index 0.33% : 0.000614s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000009s : 1: micro_interleaved_order_control 0.42% : 0.000770s : 1: mutable_eliminate 0.01% : 0.000012s : 1: offloading_packed_experts 0.01% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000022s : 1: opt.transform.mutable_eliminate 0.97% : 0.001800s : 78: opt.transform.opt_a 0.03% : 0.000052s : 1: opt.transform.opt_after_cconv 0.02% : 0.000041s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000194s : 28: opt.transform.opt_b 0.05% : 0.000089s : 2: opt.transform.opt_trans_graph 0.03% : 0.000060s : 4: opt.transform.symbol_engine_opt 2.32% : 0.004286s : 1: opt_a 0.10% : 0.000182s : 1: opt_after_cconv 0.38% : 0.000700s : 1: opt_after_jit_grad 0.24% : 0.000442s : 1: opt_b 4.35% : 0.008037s : 1: optimize 0.02% : 0.000030s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000014s : 1: order_py_execute_after_rewriter 0.02% : 0.000033s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000010s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000014s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000009s : 1: overlap_recompute_comm 0.01% : 0.000012s : 1: parallel-infer-symbol 0.00% : 0.000008s : 1: parallel-infer-symbol-second 0.00% : 0.000009s : 1: partial_unused_args_eliminate 0.01% : 0.000011s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.03% : 0.000053s : 1: pre_auto_parallel 0.02% : 0.000045s : 1: py_interpret_to_execute 0.01% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.03% : 0.000061s : 1: remove_dup_value 0.35% : 0.000643s : 1: renormalize.infer 0.26% : 0.000479s : 1: renormalize.specialize 0.01% : 0.000010s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000096s : 1: rewriter_after_opt_a 0.06% : 0.000112s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000135s : 1: symbol_engine_optimizer 0.07% : 0.000138s : 1: tuple_transform 84.65% : 0.156469s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:44.727.89 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.224473, [21] [bootstrap]: 0.00042585 [type_inference]: 0.211273 [event_method]: 2.493e-05 [auto_monad]: 0.00017555 [graph_reusing]: 8.69e-06 [inline]: 3.21001e-06 [add_attr]: 0.0041966, [1] [add_attr_with_inline]: 0.00418284, [1] [Cycle 1]: 8.075e-05, [2] [tag_attr]: 2.814e-05 [meta_addattr_fg_expand]: 6.24001e-06 [parallel-infer-symbol]: 3.65998e-06 [pre_auto_parallel]: 4.677e-05 [insert-virtual-dataset]: 2.28998e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 2.29001e-06 [pipeline_split]: 2.10002e-06 [optimize]: 0.00733519, [53] [py_interpret_to_execute]: 3.829e-05 [rewriter_before_opt_a]: 0.0001065 [opt_a]: 0.0042333, [2] [Cycle 1]: 0.00315384, [45] [expand_dump_flag]: 3.29001e-06 [switch_simplify]: 4.716e-05 [loop_unroll]: 3.244e-05 [a_1]: 0.00083431 [with_stream_mark]: 2.652e-05 [recompute_prepare]: 1.438e-05 [updatestate_depend_eliminate]: 6.25002e-06 [updatestate_assign_eliminate]: 5.68002e-06 [updatestate_loads_eliminate]: 4.09002e-06 [parameter_eliminate]: 2.31998e-06 [a_2]: 0.00013454 [accelerated_algorithm]: 2.72e-05 [shard]: 2.12001e-06 [meta_shard_fg_expand]: 2.81999e-06 [shard_inline]: 1.027e-05 [merge_send_recv]: 1.082e-05 [auto_parallel]: 1.157e-05 [parallel]: 2.144e-05 [flash_sp]: 9.81998e-06 [merge_comm]: 6.68998e-06 [allreduce_fusion]: 5.29998e-06 [matmul_add_comm_reduction]: 1.582e-05 [allreduce_slice_to_reducescatter]: 9.50007e-07 [virtual_shard_identity]: 1.471e-05 [virtual_dataset]: 9.76e-06 [get_grad_eliminate_]: 9.24e-06 [virtual_output]: 1.126e-05 [merge_forward]: 5.99999e-06 [cell_reuse_recompute_pass]: 2.21e-06 [offload_activation]: 1.389e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.183e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.75e-05 [set_forward_comm_id_for_comm_node_pass]: 5.76e-06 [meta_fg_expand]: 4.12e-06 [flash_sp_send_recv_attached]: 5.72001e-06 [receive_attached]: 2.79999e-06 [after_resolve]: 1.791e-05 [a_after_grad]: 1.827e-05 [renormalize]: 0.00126978 [add_forward_monad_depend]: 9.02e-06 [auto_monad_grad]: 3.03e-06 [auto_monad_eliminator]: 2.762e-05 [cse]: 5.123e-05 [a_3]: 8.766e-05 [Cycle 2]: 0.0010634, [45] [expand_dump_flag]: 2.78e-06 [switch_simplify]: 1.155e-05 [loop_unroll]: 9.14998e-06 [a_1]: 0.00025817 [with_stream_mark]: 1.87e-05 [recompute_prepare]: 1.046e-05 [updatestate_depend_eliminate]: 5.72001e-06 [updatestate_assign_eliminate]: 4.4e-06 [updatestate_loads_eliminate]: 4.60999e-06 [parameter_eliminate]: 2.00002e-06 [a_2]: 0.00012089 [accelerated_algorithm]: 1.354e-05 [shard]: 1.89e-06 [meta_shard_fg_expand]: 3.04001e-06 [shard_inline]: 9.36998e-06 [merge_send_recv]: 9.76998e-06 [auto_parallel]: 1.14e-05 [parallel]: 9.64e-06 [flash_sp]: 4.61002e-06 [merge_comm]: 5.86e-06 [allreduce_fusion]: 5.08002e-06 [matmul_add_comm_reduction]: 1.298e-05 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 1.201e-05 [virtual_dataset]: 9.91e-06 [get_grad_eliminate_]: 8.56002e-06 [virtual_output]: 8.68001e-06 [merge_forward]: 5.64e-06 [cell_reuse_recompute_pass]: 3.46999e-06 [offload_activation]: 1.281e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.918e-05 [merge_recompute_call_nodes]: 1.32e-06 [before_grad]: 1.789e-05 [set_forward_comm_id_for_comm_node_pass]: 5.78002e-06 [meta_fg_expand]: 4.14002e-06 [flash_sp_send_recv_attached]: 2.12999e-06 [receive_attached]: 2.51998e-06 [after_resolve]: 4.334e-05 [a_after_grad]: 1.509e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.77002e-06 [auto_monad_grad]: 2.05002e-06 [auto_monad_eliminator]: 1.52e-05 [cse]: 3.354e-05 [a_3]: 6.042e-05 [py_interpret_to_execute_after_opt_a]: 1.863e-05 [slice_cell_reuse_recomputed_activation]: 1.90001e-06 [rewriter_after_opt_a]: 5.549e-05 [convert_after_rewriter]: 8.93002e-06 [order_py_execute_after_rewriter]: 6.93e-06 [mutable_eliminate]: 0.00084003 [opt_b]: 0.00036162, [1] [Cycle 1]: 0.00035088, [7] [b_1]: 0.00022237 [b_2]: 1.268e-05 [updatestate_depend_eliminate]: 1.22e-05 [updatestate_assign_eliminate]: 4.77e-06 [updatestate_loads_eliminate]: 4.17e-06 [renormalize]: 9.20001e-07 [cse]: 4.378e-05 [optimize_parallel_all_gather_comm]: 2.432e-05 [overlap_param_gather]: 2.37999e-06 [cconv]: 3.539e-05 [loop_unroll]: 0.00060985 [opt_after_cconv]: 0.0001961, [1] [Cycle 1]: 0.00018752, [7] [c_1]: 5.13e-05 [parameter_eliminate]: 5.52001e-06 [updatestate_depend_eliminate]: 8.86997e-06 [updatestate_assign_eliminate]: 4.4e-06 [updatestate_loads_eliminate]: 3.78999e-06 [cse]: 3.745e-05 [renormalize]: 3.7998e-07 [remove_dup_value]: 5.73e-05 [tuple_transform]: 0.00012447, [1] [Cycle 1]: 0.00011829, [4] [d_1]: 8.161e-05 [none_parameter_eliminate]: 2.17999e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 1.038e-05 [partial_unused_args_eliminate]: 1.87001e-06 [add_recomputation]: 8.242e-05 [cse_after_recomputation]: 3.27e-05, [1] [Cycle 1]: 2.683e-05, [1] [cse]: 1.976e-05 [environ_conv]: 8.54998e-06 [swap_dp_allreduce_reducescatter]: 7.06001e-06 [bias_add_comm_swap]: 3.13e-06 [label_micro_interleaved_index]: 6.07001e-06 [label_fine_grained_interleaved_index]: 3.01001e-06 [merge_cast_opt]: 1.37e-06 [slice_recompute_activation]: 2.82002e-06 [micro_interleaved_order_control]: 2.62001e-06 [assign_add_opt]: 1.74e-06 [ForceFp32Comm]: 9.80013e-07 [remove_cast_before_assign_add]: 1.84998e-06 [full_micro_interleaved_order_control]: 2.04999e-06 [reorder_send_recv_between_fp_bp]: 3.31999e-06 [comm_op_add_attrs]: 1.24e-06 [add_comm_op_reuse_tag]: 9.5999e-07 [interleave_split_concat_branches]: 1.48002e-06 [interleave_parallel_branches]: 1.09e-06 [overlap_opt_shard_in_pipeline]: 1.71002e-06 [overlap_opt_shard_grad_in_pipeline]: 2.98e-06 [control_data_broadcast_order]: 1.833e-05 [grouped_pairwise_exchange_alltoall]: 1.64e-06 [offloading_packed_experts]: 5.79999e-06 [overlap_recompute_and_grad_model_parallel]: 5.95002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.45999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.38002e-06 [overlap_recompute_comm]: 2.41e-06 [overlap_grad_ring_attention]: 5.37001e-06 [overlap_grad_flash_sp]: 2.926e-05 [begin_end_overlap_inline]: 9.20001e-07 [split_matmul_comm_elemetwise]: 2.65002e-06 [split_layernorm_comm]: 1.74e-06 [handle_group_info]: 1.30001e-06 [symbol_engine_optimizer]: 0.0001145, [1] [Cycle 1]: 0.00010927, [6] [build]: 5.91998e-06 [elim_shapecalc]: 1.603e-05 [elim_not_effective]: 2.099e-05 [opt_reshape]: 1.071e-05 [fold_const_symbol]: 1.627e-05 [renormalize]: 1.8999e-07 [detach_backward]: 2.26e-06 [pipeline_parallel_scheduler]: 1.42999e-06 [auto_monad_reorder]: 2.444e-05 [get_jit_bprop_graph]: 3.03e-06 [rewriter_after_jit_bprop_graph]: 6.68e-06 [opt_after_jit_grad]: 0.00068258 [validate]: 5.629e-05 Sums bootstrap : 0.000426s : 0.19% type_inference : 0.211273s : 96.44% event_method : 0.000025s : 0.01% auto_monad : 0.000176s : 0.08% graph_reusing : 0.000009s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000047s : 0.02% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000038s : 0.02% optimize.rewriter_before_opt_a : 0.000106s : 0.05% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000059s : 0.03% optimize.opt_a.loop_unroll : 0.000042s : 0.02% optimize.opt_a.a_1 : 0.001092s : 0.50% optimize.opt_a.with_stream_mark : 0.000045s : 0.02% optimize.opt_a.recompute_prepare : 0.000025s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000012s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000010s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000255s : 0.12% optimize.opt_a.accelerated_algorithm : 0.000041s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.00% optimize.opt_a.shard_inline : 0.000020s : 0.01% optimize.opt_a.merge_send_recv : 0.000021s : 0.01% optimize.opt_a.auto_parallel : 0.000023s : 0.01% optimize.opt_a.parallel : 0.000031s : 0.01% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000013s : 0.01% optimize.opt_a.allreduce_fusion : 0.000010s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000029s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000027s : 0.01% optimize.opt_a.virtual_dataset : 0.000020s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.01% optimize.opt_a.virtual_output : 0.000020s : 0.01% optimize.opt_a.merge_forward : 0.000012s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% optimize.opt_a.offload_activation : 0.000027s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000041s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000035s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.01% optimize.opt_a.meta_fg_expand : 0.000008s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000008s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000061s : 0.03% optimize.opt_a.a_after_grad : 0.000033s : 0.02% optimize.opt_a.renormalize : 0.001270s : 0.58% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000043s : 0.02% optimize.opt_a.cse : 0.000085s : 0.04% optimize.opt_a.a_3 : 0.000148s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000055s : 0.03% optimize.convert_after_rewriter : 0.000009s : 0.00% optimize.order_py_execute_after_rewriter : 0.000007s : 0.00% optimize.mutable_eliminate : 0.000840s : 0.38% optimize.opt_b.b_1 : 0.000222s : 0.10% optimize.opt_b.b_2 : 0.000013s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000044s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000035s : 0.02% optimize.loop_unroll : 0.000610s : 0.28% optimize.opt_after_cconv.c_1 : 0.000051s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000037s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000057s : 0.03% optimize.tuple_transform.d_1 : 0.000082s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000082s : 0.04% optimize.cse_after_recomputation.cse : 0.000020s : 0.01% optimize.environ_conv : 0.000009s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000002s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000029s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000006s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000021s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000024s : 0.01% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000683s : 0.31% validate : 0.000056s : 0.03% Time group info: ------[substitution.] 0.000340 54 11.17% : 0.000038s : 4: substitution.arithmetic_simplify 13.06% : 0.000044s : 6: substitution.cast_eliminate 0.80% : 0.000003s : 4: substitution.elim_not_effective 0.59% : 0.000002s : 4: substitution.fold_const_symbol 2.61% : 0.000009s : 6: substitution.graph_param_transform 56.01% : 0.000191s : 4: substitution.inline 2.36% : 0.000008s : 8: substitution.j_node_and_user_rematch 4.81% : 0.000016s : 2: substitution.less_batch_normalization 2.50% : 0.000009s : 8: substitution.remove_not_recompute_node 1.99% : 0.000007s : 4: substitution.replace_old_param 4.12% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.211186 2 99.49% : 0.210107s : 1: type_inference.infer 0.51% : 0.001079s : 1: type_inference.specialize ------[replace.] 0.000071 8 61.93% : 0.000044s : 4: replace.inline 38.07% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000200 8 93.86% : 0.000187s : 4: match.inline 6.14% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000305 1730 0.86% : 0.000003s : 17: predicate.accumulaten_eliminater 1.02% : 0.000003s : 6: predicate.ad_related_special_op_eliminate 0.57% : 0.000002s : 12: predicate.addn_check_dump 1.05% : 0.000003s : 17: predicate.addn_zero_filter 0.78% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.54% : 0.000008s : 29: predicate.arithmetic_simplify 0.86% : 0.000003s : 17: predicate.cast_eliminate 0.59% : 0.000002s : 12: predicate.check_bprop_eliminate 0.57% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.65% : 0.000002s : 12: predicate.depend_value_elim 0.85% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.19% : 0.000004s : 17: predicate.dict_get_item_eliminator 1.01% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.44% : 0.000004s : 12: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 6: predicate.elim_not_effective 0.36% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.07% : 0.000003s : 23: predicate.environ_get_depend_swap 1.61% : 0.000005s : 35: predicate.environ_get_eliminate 1.02% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.25% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.45% : 0.000007s : 25: predicate.float_depend_g_call 0.57% : 0.000002s : 12: predicate.float_environ_get_switch 0.81% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 6: predicate.fold_const_symbol 0.69% : 0.000002s : 12: predicate.get_grad_eliminate 0.30% : 0.000001s : 6: predicate.graph_param_transform 0.64% : 0.000002s : 12: predicate.incorporate_call 0.54% : 0.000002s : 12: predicate.incorporate_call_switch 6.12% : 0.000019s : 78: predicate.inline 0.83% : 0.000003s : 12: predicate.inline_without_move 0.31% : 0.000001s : 12: predicate.j_node_and_user_rematch 1.00% : 0.000003s : 12: predicate.less_batch_normalization 1.78% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.26% : 0.000007s : 50: predicate.load_eliminater 1.14% : 0.000003s : 6: predicate.loop_unroll_after_grad 1.82% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.64% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.59% : 0.000002s : 12: predicate.merge_addn 0.64% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.63% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.74% : 0.000002s : 17: predicate.minmaximum_grad 1.61% : 0.000005s : 6: predicate.mutable_eliminate 0.37% : 0.000001s : 6: predicate.opt_reshape 0.42% : 0.000001s : 6: predicate.parallel_virtual_node 1.82% : 0.000006s : 25: predicate.partial_defer_inline 1.47% : 0.000005s : 27: predicate.partial_eliminate 0.89% : 0.000003s : 17: predicate.print_const_string_wrapper 0.60% : 0.000002s : 12: predicate.reduce_all_const_elim 1.16% : 0.000004s : 17: predicate.reduce_eliminate 2.33% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 12: predicate.remove_not_recompute_node 1.22% : 0.000004s : 33: predicate.replace_applicator 0.53% : 0.000002s : 12: predicate.replace_old_param 0.32% : 0.000001s : 6: predicate.reset_defer_inline 1.00% : 0.000003s : 17: predicate.reshape_eliminate 0.62% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 6: predicate.row_tensor_eliminate 1.16% : 0.000004s : 12: predicate.same_eliminate 0.52% : 0.000002s : 12: predicate.set_cell_output_no_recompute 0.97% : 0.000003s : 12: predicate.shard_identity_eliminate 0.71% : 0.000002s : 12: predicate.special_op_eliminate 0.75% : 0.000002s : 12: predicate.specialize_transform 1.36% : 0.000004s : 12: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000003s : 12: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.28% : 0.000004s : 25: predicate.switch_defer_inline 1.84% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.30% : 0.000013s : 81: predicate.switch_simplify 0.80% : 0.000002s : 17: predicate.tile_eliminate 0.83% : 0.000003s : 17: predicate.transpose_eliminate 1.65% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000010s : 45: predicate.tuple_list_get_item_eliminator 1.45% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.84% : 0.000006s : 33: predicate.tuple_to_list_eliminator_ 2.23% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 2.95% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 6: predicate.value_based_eliminate 0.66% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.79% : 0.000002s : 12: predicate.virtual_output_eliminate 0.30% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.59% : 0.000002s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000803 11 47.34% : 0.000380s : 5: func_graph_cloner_run.FuncGraphClonerGraph 52.66% : 0.000423s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.239412 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.76% : 0.004204s : 1: add_attr 1.75% : 0.004188s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.04% : 0.000087s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.08% : 0.000186s : 1: auto_monad 0.01% : 0.000029s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.19% : 0.000461s : 1: bootstrap 0.02% : 0.000040s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000022s : 1: control_data_broadcast_order 0.01% : 0.000012s : 1: convert_after_rewriter 0.01% : 0.000036s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000012s : 1: environ_conv 0.01% : 0.000033s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000014s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.26% : 0.000621s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.36% : 0.000854s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 0.01% : 0.000025s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000029s : 1: opt.transform.mutable_eliminate 0.76% : 0.001819s : 78: opt.transform.opt_a 0.02% : 0.000050s : 1: opt.transform.opt_after_cconv 0.02% : 0.000044s : 1: opt.transform.opt_after_jit_grad 0.08% : 0.000194s : 28: opt.transform.opt_b 0.04% : 0.000089s : 2: opt.transform.opt_trans_graph 0.03% : 0.000060s : 4: opt.transform.symbol_engine_opt 1.77% : 0.004238s : 1: opt_a 0.08% : 0.000201s : 1: opt_after_cconv 0.29% : 0.000695s : 1: opt_after_jit_grad 0.15% : 0.000366s : 1: opt_b 3.07% : 0.007342s : 1: optimize 0.01% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000010s : 1: order_py_execute_after_rewriter 0.01% : 0.000033s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000051s : 1: pre_auto_parallel 0.02% : 0.000042s : 1: py_interpret_to_execute 0.01% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.03% : 0.000062s : 1: remove_dup_value 0.29% : 0.000692s : 1: renormalize.infer 0.23% : 0.000562s : 1: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000060s : 1: rewriter_after_opt_a 0.05% : 0.000112s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000117s : 1: symbol_engine_optimizer 0.05% : 0.000128s : 1: tuple_transform 88.26% : 0.211304s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:47.191.590 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:47.191.862 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.210755, [21] [bootstrap]: 0.0004657 [type_inference]: 0.0208082 [event_method]: 2.02e-05 [auto_monad]: 7.291e-05 [graph_reusing]: 7.02002e-06 [inline]: 2.86999e-06 [add_attr]: 0.00388024, [1] [add_attr_with_inline]: 0.00386483, [1] [Cycle 1]: 0.00010401, [2] [tag_attr]: 2.531e-05 [meta_addattr_fg_expand]: 7.17002e-06 [parallel-infer-symbol]: 3.46999e-06 [pre_auto_parallel]: 4.234e-05 [insert-virtual-dataset]: 2.75002e-06 [parallel-infer-symbol-second]: 8.50006e-07 [dataset_repeat_opt]: 2.63e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.183883, [53] [py_interpret_to_execute]: 4.093e-05 [rewriter_before_opt_a]: 0.00010397 [opt_a]: 0.180548, [2] [Cycle 1]: 0.179483, [45] [expand_dump_flag]: 3.37002e-06 [switch_simplify]: 4.514e-05 [loop_unroll]: 3.19e-05 [a_1]: 0.177149 [with_stream_mark]: 3.951e-05 [recompute_prepare]: 1.807e-05 [updatestate_depend_eliminate]: 5.55001e-06 [updatestate_assign_eliminate]: 4.05e-06 [updatestate_loads_eliminate]: 3.41999e-06 [parameter_eliminate]: 2.64001e-06 [a_2]: 0.00014941 [accelerated_algorithm]: 2.785e-05 [shard]: 4.21001e-06 [meta_shard_fg_expand]: 4.47e-06 [shard_inline]: 9.04e-06 [merge_send_recv]: 1.193e-05 [auto_parallel]: 1.147e-05 [parallel]: 2.211e-05 [flash_sp]: 1.084e-05 [merge_comm]: 4.89e-06 [allreduce_fusion]: 4.32998e-06 [matmul_add_comm_reduction]: 1.313e-05 [allreduce_slice_to_reducescatter]: 8.09989e-07 [virtual_shard_identity]: 1.348e-05 [virtual_dataset]: 1.041e-05 [get_grad_eliminate_]: 9.49e-06 [virtual_output]: 9.00001e-06 [merge_forward]: 6.46e-06 [cell_reuse_recompute_pass]: 4.18001e-06 [offload_activation]: 1.464e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.376e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.58e-05 [set_forward_comm_id_for_comm_node_pass]: 6.12999e-06 [meta_fg_expand]: 3.82002e-06 [flash_sp_send_recv_attached]: 7.07997e-06 [receive_attached]: 2.02999e-06 [after_resolve]: 1.705e-05 [a_after_grad]: 1.562e-05 [renormalize]: 0.00108547 [add_forward_monad_depend]: 8.46002e-06 [auto_monad_grad]: 3.55e-06 [auto_monad_eliminator]: 2.313e-05 [cse]: 4.048e-05 [a_3]: 8.442e-05 [Cycle 2]: 0.00104504, [45] [expand_dump_flag]: 2.43e-06 [switch_simplify]: 1.016e-05 [loop_unroll]: 8.67e-06 [a_1]: 0.00020678 [with_stream_mark]: 1.492e-05 [recompute_prepare]: 9.34e-06 [updatestate_depend_eliminate]: 4.17998e-06 [updatestate_assign_eliminate]: 3.7e-06 [updatestate_loads_eliminate]: 3.57002e-06 [parameter_eliminate]: 1.83002e-06 [a_2]: 0.00013093 [accelerated_algorithm]: 1.221e-05 [shard]: 1.39e-06 [meta_shard_fg_expand]: 2.19999e-06 [shard_inline]: 7.61001e-06 [merge_send_recv]: 1.055e-05 [auto_parallel]: 8.06001e-06 [parallel]: 8.13999e-06 [flash_sp]: 3.81001e-06 [merge_comm]: 5.05999e-06 [allreduce_fusion]: 3.80998e-06 [matmul_add_comm_reduction]: 1.071e-05 [allreduce_slice_to_reducescatter]: 5.60016e-07 [virtual_shard_identity]: 9.99999e-06 [virtual_dataset]: 9.35001e-06 [get_grad_eliminate_]: 7.86001e-06 [virtual_output]: 7.57998e-06 [merge_forward]: 3.95e-06 [cell_reuse_recompute_pass]: 2.59999e-06 [offload_activation]: 1.029e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.631e-05 [merge_recompute_call_nodes]: 1.30001e-06 [before_grad]: 1.265e-05 [set_forward_comm_id_for_comm_node_pass]: 6.32001e-06 [meta_fg_expand]: 3.01001e-06 [flash_sp_send_recv_attached]: 1.44e-06 [receive_attached]: 2.07001e-06 [after_resolve]: 1.47e-05 [a_after_grad]: 1.206e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.01e-06 [auto_monad_grad]: 1.58002e-06 [auto_monad_eliminator]: 1.221e-05 [cse]: 2.527e-05 [a_3]: 6.086e-05 [py_interpret_to_execute_after_opt_a]: 2.074e-05 [slice_cell_reuse_recomputed_activation]: 5.86e-06 [rewriter_after_opt_a]: 5.211e-05 [convert_after_rewriter]: 1.21e-05 [order_py_execute_after_rewriter]: 9.51998e-06 [mutable_eliminate]: 0.0008068 [opt_b]: 0.00040234, [1] [Cycle 1]: 0.00038932, [7] [b_1]: 0.00024865 [b_2]: 1.136e-05 [updatestate_depend_eliminate]: 1e-05 [updatestate_assign_eliminate]: 3.73001e-06 [updatestate_loads_eliminate]: 3.83999e-06 [renormalize]: 1.05999e-06 [cse]: 3.349e-05 [optimize_parallel_all_gather_comm]: 2.8e-05 [overlap_param_gather]: 6.06e-06 [cconv]: 4.007e-05 [loop_unroll]: 0.00060944 [opt_after_cconv]: 0.00017594, [1] [Cycle 1]: 0.00016451, [7] [c_1]: 4.527e-05 [parameter_eliminate]: 4.80999e-06 [updatestate_depend_eliminate]: 7.35e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 3.10998e-06 [cse]: 2.892e-05 [renormalize]: 4.80009e-07 [remove_dup_value]: 2.057e-05 [tuple_transform]: 0.00012729, [1] [Cycle 1]: 0.00011853, [4] [d_1]: 6.88e-05 [none_parameter_eliminate]: 1.84e-06 [renormalize]: 2.29978e-07 [switch_simplify]: 9.66e-06 [partial_unused_args_eliminate]: 5.39e-06 [add_recomputation]: 7.266e-05 [cse_after_recomputation]: 3.849e-05, [1] [Cycle 1]: 2.92e-05, [1] [cse]: 1.742e-05 [environ_conv]: 1.11e-05 [swap_dp_allreduce_reducescatter]: 1.172e-05 [bias_add_comm_swap]: 7.03998e-06 [label_micro_interleaved_index]: 8.99e-06 [label_fine_grained_interleaved_index]: 6.05002e-06 [merge_cast_opt]: 4.99998e-06 [slice_recompute_activation]: 5.59998e-06 [micro_interleaved_order_control]: 5.99999e-06 [assign_add_opt]: 4.38001e-06 [ForceFp32Comm]: 3.68999e-06 [remove_cast_before_assign_add]: 4.64998e-06 [full_micro_interleaved_order_control]: 5.40999e-06 [reorder_send_recv_between_fp_bp]: 6.22001e-06 [comm_op_add_attrs]: 4.45e-06 [add_comm_op_reuse_tag]: 4.54002e-06 [interleave_split_concat_branches]: 4.32998e-06 [interleave_parallel_branches]: 4.48999e-06 [overlap_opt_shard_in_pipeline]: 4.81002e-06 [overlap_opt_shard_grad_in_pipeline]: 5.42999e-06 [control_data_broadcast_order]: 2.243e-05 [grouped_pairwise_exchange_alltoall]: 5.04e-06 [offloading_packed_experts]: 8.67998e-06 [overlap_recompute_and_grad_model_parallel]: 8.41002e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.55999e-06 [overlap_recompute_allgather_and_fa_grad]: 4.95999e-06 [overlap_recompute_comm]: 6.06998e-06 [overlap_grad_ring_attention]: 8.10999e-06 [overlap_grad_flash_sp]: 0.00016865 [begin_end_overlap_inline]: 5.05001e-06 [split_matmul_comm_elemetwise]: 6.36998e-06 [split_layernorm_comm]: 5.89e-06 [handle_group_info]: 5.15001e-06 [symbol_engine_optimizer]: 0.00013913, [1] [Cycle 1]: 0.00012875, [6] [build]: 5.10001e-06 [elim_shapecalc]: 1.904e-05 [elim_not_effective]: 1.906e-05 [opt_reshape]: 1.015e-05 [fold_const_symbol]: 1.353e-05 [renormalize]: 2.3999e-07 [detach_backward]: 4.53999e-06 [pipeline_parallel_scheduler]: 2.01e-06 [auto_monad_reorder]: 2.773e-05 [get_jit_bprop_graph]: 1.87999e-06 [rewriter_after_jit_bprop_graph]: 6.59999e-06 [opt_after_jit_grad]: 0.0006746 [validate]: 5.407e-05 Sums bootstrap : 0.000466s : 0.23% type_inference : 0.020808s : 10.17% event_method : 0.000020s : 0.01% auto_monad : 0.000073s : 0.04% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000042s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000041s : 0.02% optimize.rewriter_before_opt_a : 0.000104s : 0.05% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000055s : 0.03% optimize.opt_a.loop_unroll : 0.000041s : 0.02% optimize.opt_a.a_1 : 0.177356s : 86.65% optimize.opt_a.with_stream_mark : 0.000054s : 0.03% optimize.opt_a.recompute_prepare : 0.000027s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000280s : 0.14% optimize.opt_a.accelerated_algorithm : 0.000040s : 0.02% optimize.opt_a.shard : 0.000006s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000007s : 0.00% optimize.opt_a.shard_inline : 0.000017s : 0.01% optimize.opt_a.merge_send_recv : 0.000022s : 0.01% optimize.opt_a.auto_parallel : 0.000020s : 0.01% optimize.opt_a.parallel : 0.000030s : 0.01% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000010s : 0.00% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.01% optimize.opt_a.virtual_dataset : 0.000020s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.01% optimize.opt_a.virtual_output : 0.000017s : 0.01% optimize.opt_a.merge_forward : 0.000010s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000007s : 0.00% optimize.opt_a.offload_activation : 0.000025s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000040s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000028s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000009s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000032s : 0.02% optimize.opt_a.a_after_grad : 0.000028s : 0.01% optimize.opt_a.renormalize : 0.001086s : 0.53% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.02% optimize.opt_a.cse : 0.000066s : 0.03% optimize.opt_a.a_3 : 0.000145s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.00% optimize.rewriter_after_opt_a : 0.000052s : 0.03% optimize.convert_after_rewriter : 0.000012s : 0.01% optimize.order_py_execute_after_rewriter : 0.000010s : 0.00% optimize.mutable_eliminate : 0.000807s : 0.39% optimize.opt_b.b_1 : 0.000249s : 0.12% optimize.opt_b.b_2 : 0.000011s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000033s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.01% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.000040s : 0.02% optimize.loop_unroll : 0.000609s : 0.30% optimize.opt_after_cconv.c_1 : 0.000045s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000029s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000021s : 0.01% optimize.tuple_transform.d_1 : 0.000069s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000073s : 0.04% optimize.cse_after_recomputation.cse : 0.000017s : 0.01% optimize.environ_conv : 0.000011s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000012s : 0.01% optimize.bias_add_comm_swap : 0.000007s : 0.00% optimize.label_micro_interleaved_index : 0.000009s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.00% optimize.merge_cast_opt : 0.000005s : 0.00% optimize.slice_recompute_activation : 0.000006s : 0.00% optimize.micro_interleaved_order_control : 0.000006s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000005s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000005s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000005s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000022s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.00% optimize.offloading_packed_experts : 0.000009s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000005s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000005s : 0.00% optimize.overlap_recompute_comm : 0.000006s : 0.00% optimize.overlap_grad_ring_attention : 0.000008s : 0.00% optimize.overlap_grad_flash_sp : 0.000169s : 0.08% optimize.begin_end_overlap_inline : 0.000005s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000006s : 0.00% optimize.split_layernorm_comm : 0.000006s : 0.00% optimize.handle_group_info : 0.000005s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000019s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000028s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000675s : 0.33% validate : 0.000054s : 0.03% Time group info: ------[substitution.] 0.000357 45 13.89% : 0.000050s : 5: substitution.arithmetic_simplify 12.43% : 0.000044s : 3: substitution.cast_eliminate 0.69% : 0.000002s : 3: substitution.elim_not_effective 0.47% : 0.000002s : 3: substitution.fold_const_symbol 1.99% : 0.000007s : 5: substitution.graph_param_transform 51.97% : 0.000185s : 4: substitution.inline 1.72% : 0.000006s : 6: substitution.j_node_and_user_rematch 4.37% : 0.000016s : 2: substitution.less_batch_normalization 2.40% : 0.000009s : 6: substitution.remove_not_recompute_node 2.13% : 0.000008s : 4: substitution.replace_old_param 7.95% : 0.000028s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.020749 2 95.78% : 0.019874s : 1: type_inference.infer 4.22% : 0.000875s : 1: type_inference.specialize ------[replace.] 0.000121 8 36.43% : 0.000044s : 4: replace.inline 63.57% : 0.000077s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000206 8 87.92% : 0.000182s : 4: match.inline 12.08% : 0.000025s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.176463 1504 0.00% : 0.000003s : 15: predicate.accumulaten_eliminater 0.00% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.00% : 0.000001s : 10: predicate.addn_check_dump 0.00% : 0.000003s : 15: predicate.addn_zero_filter 0.00% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 0.00% : 0.000009s : 25: predicate.arithmetic_simplify 0.00% : 0.000003s : 15: predicate.cast_eliminate 0.00% : 0.000002s : 10: predicate.check_bprop_eliminate 0.00% : 0.000001s : 10: predicate.compare_switch_simplify 0.00% : 0.000000s : 5: predicate.const_output_eliminate 0.00% : 0.000002s : 10: predicate.depend_value_elim 0.00% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.00% : 0.000004s : 15: predicate.dict_get_item_eliminator 0.00% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.00% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.00% : 0.000001s : 5: predicate.elim_not_effective 0.00% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 0.00% : 0.000004s : 20: predicate.environ_add_const_eliminate 0.00% : 0.000003s : 20: predicate.environ_get_add_eliminate 99.84% : 0.176189s : 20: predicate.environ_get_depend_swap 0.00% : 0.000005s : 30: predicate.environ_get_eliminate 0.00% : 0.000003s : 20: predicate.environ_get_set_eliminate 0.00% : 0.000003s : 23: predicate.exchange_switch_depend_value 0.00% : 0.000006s : 23: predicate.float_depend_g_call 0.00% : 0.000001s : 10: predicate.float_environ_get_switch 0.00% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.00% : 0.000000s : 5: predicate.fold_const_symbol 0.00% : 0.000003s : 10: predicate.get_grad_eliminate 0.00% : 0.000001s : 5: predicate.graph_param_transform 0.00% : 0.000002s : 10: predicate.incorporate_call 0.00% : 0.000001s : 10: predicate.incorporate_call_switch 0.01% : 0.000019s : 68: predicate.inline 0.00% : 0.000003s : 10: predicate.inline_without_move 0.00% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.00% : 0.000003s : 10: predicate.less_batch_normalization 0.00% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 0.00% : 0.000006s : 44: predicate.load_eliminater 0.00% : 0.000003s : 5: predicate.loop_unroll_after_grad 0.00% : 0.000005s : 36: predicate.loop_unroll_before_grad 0.00% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 0.00% : 0.000002s : 10: predicate.merge_addn 0.00% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.00% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.00% : 0.000002s : 15: predicate.minmaximum_grad 0.00% : 0.000003s : 5: predicate.mutable_eliminate 0.00% : 0.000001s : 5: predicate.opt_reshape 0.00% : 0.000001s : 5: predicate.parallel_virtual_node 0.00% : 0.000006s : 23: predicate.partial_defer_inline 0.00% : 0.000004s : 24: predicate.partial_eliminate 0.00% : 0.000002s : 15: predicate.print_const_string_wrapper 0.00% : 0.000002s : 10: predicate.reduce_all_const_elim 0.00% : 0.000005s : 15: predicate.reduce_eliminate 0.00% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.00% : 0.000001s : 10: predicate.remove_not_recompute_node 0.00% : 0.000004s : 29: predicate.replace_applicator 0.00% : 0.000001s : 10: predicate.replace_old_param 0.00% : 0.000001s : 5: predicate.reset_defer_inline 0.00% : 0.000003s : 15: predicate.reshape_eliminate 0.00% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.00% : 0.000001s : 5: predicate.row_tensor_eliminate 0.00% : 0.000002s : 10: predicate.same_eliminate 0.00% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.00% : 0.000003s : 10: predicate.shard_identity_eliminate 0.00% : 0.000002s : 10: predicate.special_op_eliminate 0.00% : 0.000002s : 10: predicate.specialize_transform 0.00% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.00% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.00% : 0.000001s : 5: predicate.switch_call_monad_eliminater 0.00% : 0.000004s : 23: predicate.switch_defer_inline 0.00% : 0.000005s : 33: predicate.switch_layer_defer_inline 0.01% : 0.000012s : 74: predicate.switch_simplify 0.00% : 0.000003s : 15: predicate.tile_eliminate 0.00% : 0.000002s : 15: predicate.transpose_eliminate 0.00% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 0.00% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 0.00% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 0.00% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 0.00% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 0.00% : 0.000007s : 35: predicate.tuple_list_set_item_eliminator 0.00% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 0.00% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 0.00% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.00% : 0.000001s : 5: predicate.value_based_eliminate 0.00% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.00% : 0.000002s : 10: predicate.virtual_output_eliminate 0.00% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.00% : 0.000002s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000733 11 53.46% : 0.000392s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.54% : 0.000341s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.577834 192 0.00% : 0.000007s : 1: ForceFp32Comm 0.67% : 0.003892s : 1: add_attr 0.67% : 0.003870s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.01% : 0.000077s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.01% : 0.000084s : 1: auto_monad 0.01% : 0.000036s : 1: auto_monad_reorder 0.00% : 0.000009s : 1: begin_end_overlap_inline 0.00% : 0.000010s : 1: bias_add_comm_swap 0.09% : 0.000509s : 1: bootstrap 0.01% : 0.000044s : 1: cconv 0.00% : 0.000008s : 1: comm_op_add_attrs 0.00% : 0.000026s : 1: control_data_broadcast_order 0.00% : 0.000015s : 1: convert_after_rewriter 0.01% : 0.000042s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.00% : 0.000027s : 1: detach_backward 0.00% : 0.000014s : 1: environ_conv 0.01% : 0.000031s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000008s : 1: handle_group_info 0.00% : 0.000010s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000009s : 1: label_fine_grained_interleaved_index 0.00% : 0.000012s : 1: label_micro_interleaved_index 0.11% : 0.000616s : 1: loop_unroll 0.00% : 0.000008s : 1: merge_cast_opt 0.00% : 0.000009s : 1: micro_interleaved_order_control 0.14% : 0.000815s : 1: mutable_eliminate 0.00% : 0.000012s : 1: offloading_packed_experts 0.00% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000024s : 1: opt.transform.mutable_eliminate 30.80% : 0.177973s : 78: opt.transform.opt_a 0.01% : 0.000044s : 1: opt.transform.opt_after_cconv 0.01% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000163s : 28: opt.transform.opt_b 0.01% : 0.000076s : 2: opt.transform.opt_trans_graph 0.01% : 0.000057s : 4: opt.transform.symbol_engine_opt 31.25% : 0.180552s : 1: opt_a 0.03% : 0.000180s : 1: opt_after_cconv 0.12% : 0.000687s : 1: opt_after_jit_grad 0.07% : 0.000407s : 1: opt_b 31.91% : 0.184393s : 1: optimize 0.01% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000013s : 1: order_py_execute_after_rewriter 0.03% : 0.000177s : 1: overlap_grad_flash_sp 0.00% : 0.000008s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000009s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000008s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000008s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000009s : 1: overlap_recompute_comm 0.00% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000009s : 1: partial_unused_args_eliminate 0.00% : 0.000011s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.01% : 0.000050s : 1: pre_auto_parallel 0.01% : 0.000045s : 1: py_interpret_to_execute 0.00% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000009s : 1: remove_cast_before_assign_add 0.00% : 0.000024s : 1: remove_dup_value 0.10% : 0.000605s : 1: renormalize.infer 0.08% : 0.000469s : 1: renormalize.specialize 0.00% : 0.000010s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000056s : 1: rewriter_after_opt_a 0.02% : 0.000109s : 1: rewriter_before_opt_a 0.00% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000009s : 1: split_layernorm_comm 0.00% : 0.000010s : 1: split_matmul_comm_elemetwise 0.00% : 0.000015s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000143s : 1: symbol_engine_optimizer 0.02% : 0.000131s : 1: tuple_transform 3.61% : 0.020855s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:50.107.708 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.201847, [21] [bootstrap]: 0.00073868 [type_inference]: 0.00954542 [event_method]: 2.003e-05 [auto_monad]: 7.907e-05 [graph_reusing]: 6.02001e-06 [inline]: 3.6e-06 [add_attr]: 0.00478539, [1] [add_attr_with_inline]: 0.00477039, [1] [Cycle 1]: 7.817e-05, [2] [tag_attr]: 2.418e-05 [meta_addattr_fg_expand]: 7.22002e-06 [parallel-infer-symbol]: 6.73e-06 [pre_auto_parallel]: 5.345e-05 [insert-virtual-dataset]: 2.87002e-06 [parallel-infer-symbol-second]: 1.77999e-06 [dataset_repeat_opt]: 2.04e-06 [pipeline_split]: 1.72001e-06 [optimize]: 0.185542, [53] [py_interpret_to_execute]: 5.774e-05 [rewriter_before_opt_a]: 0.00013053 [opt_a]: 0.181725, [2] [Cycle 1]: 0.180654, [45] [expand_dump_flag]: 4.91997e-06 [switch_simplify]: 4.517e-05 [loop_unroll]: 3.393e-05 [a_1]: 0.00083484 [with_stream_mark]: 1.849e-05 [recompute_prepare]: 1.087e-05 [updatestate_depend_eliminate]: 5.25999e-06 [updatestate_assign_eliminate]: 3.76001e-06 [updatestate_loads_eliminate]: 3.4e-06 [parameter_eliminate]: 2.37001e-06 [a_2]: 0.00013208 [accelerated_algorithm]: 2.209e-05 [shard]: 2.24001e-06 [meta_shard_fg_expand]: 2.24001e-06 [shard_inline]: 8.08001e-06 [merge_send_recv]: 1.08e-05 [auto_parallel]: 8.19998e-06 [parallel]: 2.555e-05 [flash_sp]: 9.10001e-06 [merge_comm]: 5.48002e-06 [allreduce_fusion]: 4.62998e-06 [matmul_add_comm_reduction]: 1.254e-05 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 1.359e-05 [virtual_dataset]: 1.005e-05 [get_grad_eliminate_]: 9.91e-06 [virtual_output]: 9.95002e-06 [merge_forward]: 4.59002e-06 [cell_reuse_recompute_pass]: 2.56998e-06 [offload_activation]: 1.219e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.915e-05 [merge_recompute_call_nodes]: 1.57001e-06 [before_grad]: 1.589e-05 [set_forward_comm_id_for_comm_node_pass]: 4.88001e-06 [meta_fg_expand]: 4.20999e-06 [flash_sp_send_recv_attached]: 5.20001e-06 [receive_attached]: 2.58e-06 [after_resolve]: 1.768e-05 [a_after_grad]: 1.51e-05 [renormalize]: 0.17874 [add_forward_monad_depend]: 1.36e-05 [auto_monad_grad]: 2.71e-06 [auto_monad_eliminator]: 3.401e-05 [cse]: 5.751e-05 [a_3]: 0.00010446 [Cycle 2]: 0.00105457, [45] [expand_dump_flag]: 3.76001e-06 [switch_simplify]: 1.158e-05 [loop_unroll]: 1.261e-05 [a_1]: 0.00026292 [with_stream_mark]: 2.133e-05 [recompute_prepare]: 8.05e-06 [updatestate_depend_eliminate]: 5.15999e-06 [updatestate_assign_eliminate]: 4e-06 [updatestate_loads_eliminate]: 3.77998e-06 [parameter_eliminate]: 2.60002e-06 [a_2]: 0.00011234 [accelerated_algorithm]: 1.468e-05 [shard]: 2.12001e-06 [meta_shard_fg_expand]: 2.98998e-06 [shard_inline]: 8.74e-06 [merge_send_recv]: 1.57e-05 [auto_parallel]: 1.322e-05 [parallel]: 1.221e-05 [flash_sp]: 6.18002e-06 [merge_comm]: 6.58e-06 [allreduce_fusion]: 4.71002e-06 [matmul_add_comm_reduction]: 1.42e-05 [allreduce_slice_to_reducescatter]: 9.20001e-07 [virtual_shard_identity]: 1.169e-05 [virtual_dataset]: 1.022e-05 [get_grad_eliminate_]: 9.65002e-06 [virtual_output]: 9.67999e-06 [merge_forward]: 5.42001e-06 [cell_reuse_recompute_pass]: 5.52001e-06 [offload_activation]: 1.394e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.655e-05 [merge_recompute_call_nodes]: 1.79e-06 [before_grad]: 1.688e-05 [set_forward_comm_id_for_comm_node_pass]: 7.03998e-06 [meta_fg_expand]: 7.05e-06 [flash_sp_send_recv_attached]: 2.12999e-06 [receive_attached]: 2.94999e-06 [after_resolve]: 1.838e-05 [a_after_grad]: 1.506e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.79998e-06 [auto_monad_grad]: 1.71998e-06 [auto_monad_eliminator]: 1.536e-05 [cse]: 3.396e-05 [a_3]: 4.835e-05 [py_interpret_to_execute_after_opt_a]: 3e-05 [slice_cell_reuse_recomputed_activation]: 2.17999e-06 [rewriter_after_opt_a]: 5.159e-05 [convert_after_rewriter]: 8.28001e-06 [order_py_execute_after_rewriter]: 1.522e-05 [mutable_eliminate]: 0.00134716 [opt_b]: 0.00038796, [1] [Cycle 1]: 0.00037141, [7] [b_1]: 0.00024104 [b_2]: 1.404e-05 [updatestate_depend_eliminate]: 1.094e-05 [updatestate_assign_eliminate]: 4.90999e-06 [updatestate_loads_eliminate]: 3.97e-06 [renormalize]: 6.79982e-07 [cse]: 3.87e-05 [optimize_parallel_all_gather_comm]: 2.631e-05 [overlap_param_gather]: 3.56999e-06 [cconv]: 3.627e-05 [loop_unroll]: 0.00065602 [opt_after_cconv]: 0.00016107, [1] [Cycle 1]: 0.00015179, [7] [c_1]: 5.095e-05 [parameter_eliminate]: 3.68999e-06 [updatestate_depend_eliminate]: 7.63999e-06 [updatestate_assign_eliminate]: 3.91999e-06 [updatestate_loads_eliminate]: 3.2e-06 [cse]: 3.32e-05 [renormalize]: 8.30012e-07 [remove_dup_value]: 2.022e-05 [tuple_transform]: 0.00013605, [1] [Cycle 1]: 0.00012749, [4] [d_1]: 8.802e-05 [none_parameter_eliminate]: 1.87001e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 1.026e-05 [partial_unused_args_eliminate]: 2.23998e-06 [add_recomputation]: 8.291e-05 [cse_after_recomputation]: 3.829e-05, [1] [Cycle 1]: 3.18e-05, [1] [cse]: 2.137e-05 [environ_conv]: 9.59e-06 [swap_dp_allreduce_reducescatter]: 7.55e-06 [bias_add_comm_swap]: 5.64e-06 [label_micro_interleaved_index]: 9.20999e-06 [label_fine_grained_interleaved_index]: 2.91999e-06 [merge_cast_opt]: 1.71e-06 [slice_recompute_activation]: 2.02001e-06 [micro_interleaved_order_control]: 3.78001e-06 [assign_add_opt]: 1.32e-06 [ForceFp32Comm]: 8.29983e-07 [remove_cast_before_assign_add]: 1.19e-06 [full_micro_interleaved_order_control]: 2.44999e-06 [reorder_send_recv_between_fp_bp]: 3.07002e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.49978e-07 [interleave_split_concat_branches]: 1.22e-06 [interleave_parallel_branches]: 1.45999e-06 [overlap_opt_shard_in_pipeline]: 1.84e-06 [overlap_opt_shard_grad_in_pipeline]: 2.00002e-06 [control_data_broadcast_order]: 1.962e-05 [grouped_pairwise_exchange_alltoall]: 1.60001e-06 [offloading_packed_experts]: 4.85001e-06 [overlap_recompute_and_grad_model_parallel]: 9.27999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.52999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.96998e-06 [overlap_recompute_comm]: 2.72001e-06 [overlap_grad_ring_attention]: 5.74e-06 [overlap_grad_flash_sp]: 3.043e-05 [begin_end_overlap_inline]: 6.00005e-07 [split_matmul_comm_elemetwise]: 2.32999e-06 [split_layernorm_comm]: 1.53002e-06 [handle_group_info]: 1.22e-06 [symbol_engine_optimizer]: 0.00012738, [1] [Cycle 1]: 0.00012153, [6] [build]: 7.92e-06 [elim_shapecalc]: 1.991e-05 [elim_not_effective]: 2.137e-05 [opt_reshape]: 1.208e-05 [fold_const_symbol]: 1.519e-05 [renormalize]: 2.30008e-07 [detach_backward]: 2.30002e-06 [pipeline_parallel_scheduler]: 1.37999e-06 [auto_monad_reorder]: 2.2e-05 [get_jit_bprop_graph]: 1.86e-06 [rewriter_after_jit_bprop_graph]: 7.68999e-06 [opt_after_jit_grad]: 0.00075959 [validate]: 6.745e-05 Sums bootstrap : 0.000739s : 0.38% type_inference : 0.009545s : 4.88% event_method : 0.000020s : 0.01% auto_monad : 0.000079s : 0.04% graph_reusing : 0.000006s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000007s : 0.00% pre_auto_parallel : 0.000053s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000002s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000058s : 0.03% optimize.rewriter_before_opt_a : 0.000131s : 0.07% optimize.opt_a.expand_dump_flag : 0.000009s : 0.00% optimize.opt_a.switch_simplify : 0.000057s : 0.03% optimize.opt_a.loop_unroll : 0.000047s : 0.02% optimize.opt_a.a_1 : 0.001098s : 0.56% optimize.opt_a.with_stream_mark : 0.000040s : 0.02% optimize.opt_a.recompute_prepare : 0.000019s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000244s : 0.12% optimize.opt_a.accelerated_algorithm : 0.000037s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000017s : 0.01% optimize.opt_a.merge_send_recv : 0.000026s : 0.01% optimize.opt_a.auto_parallel : 0.000021s : 0.01% optimize.opt_a.parallel : 0.000038s : 0.02% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000012s : 0.01% optimize.opt_a.allreduce_fusion : 0.000009s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000027s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000025s : 0.01% optimize.opt_a.virtual_dataset : 0.000020s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000020s : 0.01% optimize.opt_a.virtual_output : 0.000020s : 0.01% optimize.opt_a.merge_forward : 0.000010s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000008s : 0.00% optimize.opt_a.offload_activation : 0.000026s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000033s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.01% optimize.opt_a.meta_fg_expand : 0.000011s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000006s : 0.00% optimize.opt_a.after_resolve : 0.000036s : 0.02% optimize.opt_a.a_after_grad : 0.000030s : 0.02% optimize.opt_a.renormalize : 0.178740s : 91.33% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000049s : 0.03% optimize.opt_a.cse : 0.000091s : 0.05% optimize.opt_a.a_3 : 0.000153s : 0.08% optimize.py_interpret_to_execute_after_opt_a : 0.000030s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000052s : 0.03% optimize.convert_after_rewriter : 0.000008s : 0.00% optimize.order_py_execute_after_rewriter : 0.000015s : 0.01% optimize.mutable_eliminate : 0.001347s : 0.69% optimize.opt_b.b_1 : 0.000241s : 0.12% optimize.opt_b.b_2 : 0.000014s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000039s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.01% optimize.overlap_param_gather : 0.000004s : 0.00% optimize.cconv : 0.000036s : 0.02% optimize.loop_unroll : 0.000656s : 0.34% optimize.opt_after_cconv.c_1 : 0.000051s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000033s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.01% optimize.tuple_transform.d_1 : 0.000088s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000083s : 0.04% optimize.cse_after_recomputation.cse : 0.000021s : 0.01% optimize.environ_conv : 0.000010s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.00% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000009s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000004s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000020s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000006s : 0.00% optimize.overlap_grad_flash_sp : 0.000030s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000008s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000020s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000021s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000012s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000022s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000008s : 0.00% opt_after_jit_grad : 0.000760s : 0.39% validate : 0.000067s : 0.03% Time group info: ------[substitution.] 0.000347 45 13.29% : 0.000046s : 5: substitution.arithmetic_simplify 10.84% : 0.000038s : 3: substitution.cast_eliminate 0.63% : 0.000002s : 3: substitution.elim_not_effective 0.65% : 0.000002s : 3: substitution.fold_const_symbol 2.41% : 0.000008s : 5: substitution.graph_param_transform 55.88% : 0.000194s : 4: substitution.inline 2.01% : 0.000007s : 6: substitution.j_node_and_user_rematch 4.31% : 0.000015s : 2: substitution.less_batch_normalization 2.71% : 0.000009s : 6: substitution.remove_not_recompute_node 3.36% : 0.000012s : 4: substitution.replace_old_param 3.92% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.009466 2 87.67% : 0.008299s : 1: type_inference.infer 12.33% : 0.001167s : 1: type_inference.specialize ------[replace.] 0.000071 8 60.85% : 0.000043s : 4: replace.inline 39.15% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000203 8 94.21% : 0.000191s : 4: match.inline 5.79% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000306 1504 0.86% : 0.000003s : 15: predicate.accumulaten_eliminater 0.73% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000002s : 10: predicate.addn_check_dump 1.07% : 0.000003s : 15: predicate.addn_zero_filter 0.69% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 3.12% : 0.000010s : 25: predicate.arithmetic_simplify 0.97% : 0.000003s : 15: predicate.cast_eliminate 0.93% : 0.000003s : 10: predicate.check_bprop_eliminate 0.71% : 0.000002s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.95% : 0.000003s : 10: predicate.depend_value_elim 0.79% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.21% : 0.000004s : 15: predicate.dict_get_item_eliminator 0.95% : 0.000003s : 15: predicate.dict_set_item_eliminator 1.30% : 0.000004s : 10: predicate.dumpgradient_eliminate 0.17% : 0.000001s : 5: predicate.elim_not_effective 0.39% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000004s : 20: predicate.environ_add_const_eliminate 0.95% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.35% : 0.000004s : 20: predicate.environ_get_depend_swap 1.88% : 0.000006s : 30: predicate.environ_get_eliminate 0.98% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.13% : 0.000003s : 23: predicate.exchange_switch_depend_value 1.95% : 0.000006s : 23: predicate.float_depend_g_call 0.49% : 0.000002s : 10: predicate.float_environ_get_switch 0.69% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.59% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000002s : 10: predicate.incorporate_call 0.47% : 0.000001s : 10: predicate.incorporate_call_switch 5.45% : 0.000017s : 68: predicate.inline 0.78% : 0.000002s : 10: predicate.inline_without_move 0.28% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.85% : 0.000003s : 10: predicate.less_batch_normalization 1.64% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.14% : 0.000007s : 44: predicate.load_eliminater 0.83% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.97% : 0.000006s : 36: predicate.loop_unroll_before_grad 2.11% : 0.000006s : 25: predicate.make_slice_get_slice_eliminator 0.56% : 0.000002s : 10: predicate.merge_addn 0.57% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.61% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 15: predicate.minmaximum_grad 1.17% : 0.000004s : 5: predicate.mutable_eliminate 0.92% : 0.000003s : 5: predicate.opt_reshape 0.35% : 0.000001s : 5: predicate.parallel_virtual_node 1.47% : 0.000004s : 23: predicate.partial_defer_inline 1.31% : 0.000004s : 24: predicate.partial_eliminate 0.80% : 0.000002s : 15: predicate.print_const_string_wrapper 0.68% : 0.000002s : 10: predicate.reduce_all_const_elim 1.50% : 0.000005s : 15: predicate.reduce_eliminate 2.07% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 10: predicate.remove_not_recompute_node 1.19% : 0.000004s : 29: predicate.replace_applicator 0.44% : 0.000001s : 10: predicate.replace_old_param 0.23% : 0.000001s : 5: predicate.reset_defer_inline 1.29% : 0.000004s : 15: predicate.reshape_eliminate 1.20% : 0.000004s : 10: predicate.row_tensor_add_zeros_like 0.31% : 0.000001s : 5: predicate.row_tensor_eliminate 1.12% : 0.000003s : 10: predicate.same_eliminate 0.34% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.65% : 0.000002s : 10: predicate.shard_identity_eliminate 0.76% : 0.000002s : 10: predicate.special_op_eliminate 0.63% : 0.000002s : 10: predicate.specialize_transform 1.23% : 0.000004s : 10: predicate.split_environ_get_set_with_tuple_value 1.04% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.31% : 0.000004s : 23: predicate.switch_defer_inline 1.64% : 0.000005s : 33: predicate.switch_layer_defer_inline 3.96% : 0.000012s : 74: predicate.switch_simplify 0.81% : 0.000002s : 15: predicate.tile_eliminate 0.83% : 0.000003s : 15: predicate.transpose_eliminate 2.63% : 0.000008s : 25: predicate.tuple_list_convert_item_index_to_positive 1.41% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 2.01% : 0.000006s : 25: predicate.tuple_list_get_item_depend_reorder 4.03% : 0.000012s : 39: predicate.tuple_list_get_item_eliminator 1.22% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.14% : 0.000007s : 35: predicate.tuple_list_set_item_eliminator 2.46% : 0.000008s : 29: predicate.tuple_to_list_eliminator_ 2.06% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 2.76% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 5: predicate.value_based_eliminate 0.59% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.99% : 0.000003s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001005 11 38.37% : 0.000386s : 5: func_graph_cloner_run.FuncGraphClonerGraph 61.63% : 0.000620s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.572985 192 0.00% : 0.000004s : 1: ForceFp32Comm 0.84% : 0.004793s : 1: add_attr 0.83% : 0.004776s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.02% : 0.000088s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.01% : 0.000086s : 1: auto_monad 0.00% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.14% : 0.000776s : 1: bootstrap 0.01% : 0.000044s : 1: cconv 0.00% : 0.000005s : 1: comm_op_add_attrs 0.00% : 0.000024s : 1: control_data_broadcast_order 0.01% : 0.000071s : 1: convert_after_rewriter 0.01% : 0.000042s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000015s : 1: environ_conv 0.00% : 0.000027s : 1: event_method 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000013s : 1: label_micro_interleaved_index 0.12% : 0.000669s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000007s : 1: micro_interleaved_order_control 0.24% : 0.001366s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 0.00% : 0.000022s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000029s : 1: opt.transform.mutable_eliminate 0.31% : 0.001754s : 78: opt.transform.opt_a 0.01% : 0.000050s : 1: opt.transform.opt_after_cconv 0.01% : 0.000047s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000197s : 28: opt.transform.opt_b 0.02% : 0.000096s : 2: opt.transform.opt_trans_graph 0.01% : 0.000065s : 4: opt.transform.symbol_engine_opt 31.72% : 0.181732s : 1: opt_a 0.03% : 0.000165s : 1: opt_after_cconv 0.13% : 0.000773s : 1: opt_after_jit_grad 0.07% : 0.000394s : 1: opt_b 32.38% : 0.185550s : 1: optimize 0.01% : 0.000031s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000019s : 1: order_py_execute_after_rewriter 0.01% : 0.000035s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000007s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000015s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000007s : 1: overlap_recompute_comm 0.00% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000005s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.01% : 0.000058s : 1: pre_auto_parallel 0.01% : 0.000062s : 1: py_interpret_to_execute 0.01% : 0.000033s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000008s : 1: remove_cast_before_assign_add 0.00% : 0.000024s : 1: remove_dup_value 31.04% : 0.177876s : 1: renormalize.infer 0.15% : 0.000843s : 1: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000056s : 1: rewriter_after_opt_a 0.02% : 0.000135s : 1: rewriter_before_opt_a 0.00% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000006s : 1: split_matmul_comm_elemetwise 0.00% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000130s : 1: symbol_engine_optimizer 0.02% : 0.000140s : 1: tuple_transform 1.67% : 0.009573s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:53.878.356 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:53.878.798 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.146715, [21] [bootstrap]: 0.0004829 [type_inference]: 0.00737299 [event_method]: 2.217e-05 [auto_monad]: 7.471e-05 [graph_reusing]: 7.01999e-06 [inline]: 2.87002e-06 [add_attr]: 0.00396148, [1] [add_attr_with_inline]: 0.00394934, [1] [Cycle 1]: 9.525e-05, [2] [tag_attr]: 2.597e-05 [meta_addattr_fg_expand]: 5.86e-06 [parallel-infer-symbol]: 3.28e-06 [pre_auto_parallel]: 4.624e-05 [insert-virtual-dataset]: 2.51998e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.02999e-06 [pipeline_split]: 1.52001e-06 [optimize]: 0.132738, [53] [py_interpret_to_execute]: 3.726e-05 [rewriter_before_opt_a]: 9.712e-05 [opt_a]: 0.00360855, [2] [Cycle 1]: 0.00267545, [45] [expand_dump_flag]: 3.23e-06 [switch_simplify]: 4.256e-05 [loop_unroll]: 2.998e-05 [a_1]: 0.0006938 [with_stream_mark]: 1.954e-05 [recompute_prepare]: 1.006e-05 [updatestate_depend_eliminate]: 4.25e-06 [updatestate_assign_eliminate]: 3.73001e-06 [updatestate_loads_eliminate]: 2.73e-06 [parameter_eliminate]: 2.19999e-06 [a_2]: 0.0001212 [accelerated_algorithm]: 2.305e-05 [shard]: 2.05002e-06 [meta_shard_fg_expand]: 2.35002e-06 [shard_inline]: 7.15e-06 [merge_send_recv]: 8.92e-06 [auto_parallel]: 9.78002e-06 [parallel]: 2.251e-05 [flash_sp]: 8.28999e-06 [merge_comm]: 3.95e-06 [allreduce_fusion]: 4.74e-06 [matmul_add_comm_reduction]: 1.202e-05 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 9.09e-06 [virtual_dataset]: 8.57e-06 [get_grad_eliminate_]: 6.25002e-06 [virtual_output]: 8.02e-06 [merge_forward]: 4.73001e-06 [cell_reuse_recompute_pass]: 1.73002e-06 [offload_activation]: 1.189e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.738e-05 [merge_recompute_call_nodes]: 1.48002e-06 [before_grad]: 1.094e-05 [set_forward_comm_id_for_comm_node_pass]: 5.68997e-06 [meta_fg_expand]: 3.59002e-06 [flash_sp_send_recv_attached]: 5.92999e-06 [receive_attached]: 2.21e-06 [after_resolve]: 1.352e-05 [a_after_grad]: 1.196e-05 [renormalize]: 0.00096943 [add_forward_monad_depend]: 6.34999e-06 [auto_monad_grad]: 3.54002e-06 [auto_monad_eliminator]: 1.858e-05 [cse]: 3.495e-05 [a_3]: 7.204e-05 [Cycle 2]: 0.00091651, [45] [expand_dump_flag]: 2.36998e-06 [switch_simplify]: 8.21002e-06 [loop_unroll]: 6.54001e-06 [a_1]: 0.00015344 [with_stream_mark]: 1.338e-05 [recompute_prepare]: 7.45e-06 [updatestate_depend_eliminate]: 3.09999e-06 [updatestate_assign_eliminate]: 2.55002e-06 [updatestate_loads_eliminate]: 2.38002e-06 [parameter_eliminate]: 1.76998e-06 [a_2]: 0.00011299 [accelerated_algorithm]: 1.244e-05 [shard]: 1.52999e-06 [meta_shard_fg_expand]: 2.13998e-06 [shard_inline]: 6.84999e-06 [merge_send_recv]: 7.36001e-06 [auto_parallel]: 6.51e-06 [parallel]: 7.01001e-06 [flash_sp]: 4.02002e-06 [merge_comm]: 3.61999e-06 [allreduce_fusion]: 4.62e-06 [matmul_add_comm_reduction]: 8.28001e-06 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 9.37001e-06 [virtual_dataset]: 6.39001e-06 [get_grad_eliminate_]: 6.83e-06 [virtual_output]: 6.10002e-06 [merge_forward]: 3.8e-06 [cell_reuse_recompute_pass]: 2.47001e-06 [offload_activation]: 8e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.334e-05 [merge_recompute_call_nodes]: 9.70002e-07 [before_grad]: 1.005e-05 [set_forward_comm_id_for_comm_node_pass]: 3.58e-06 [meta_fg_expand]: 1.99999e-06 [flash_sp_send_recv_attached]: 1.48002e-06 [receive_attached]: 1.72999e-06 [after_resolve]: 1.345e-05 [a_after_grad]: 1.075e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.63002e-06 [auto_monad_grad]: 1.45999e-06 [auto_monad_eliminator]: 8.85999e-06 [cse]: 1.746e-05 [a_3]: 5.43e-05 [py_interpret_to_execute_after_opt_a]: 1.586e-05 [slice_cell_reuse_recomputed_activation]: 5.98998e-06 [rewriter_after_opt_a]: 4.562e-05 [convert_after_rewriter]: 1.028e-05 [order_py_execute_after_rewriter]: 1.012e-05 [mutable_eliminate]: 0.00080637 [opt_b]: 0.00034641, [1] [Cycle 1]: 0.00033502, [7] [b_1]: 0.00021539 [b_2]: 9.83998e-06 [updatestate_depend_eliminate]: 7.65e-06 [updatestate_assign_eliminate]: 2.87002e-06 [updatestate_loads_eliminate]: 2.72001e-06 [renormalize]: 7.50006e-07 [cse]: 2.377e-05 [optimize_parallel_all_gather_comm]: 2.365e-05 [overlap_param_gather]: 6.45002e-06 [cconv]: 3.603e-05 [loop_unroll]: 0.0005652 [opt_after_cconv]: 0.00015655, [1] [Cycle 1]: 0.00014558, [7] [c_1]: 3.661e-05 [parameter_eliminate]: 3.49001e-06 [updatestate_depend_eliminate]: 6.16e-06 [updatestate_assign_eliminate]: 2.74001e-06 [updatestate_loads_eliminate]: 2.54001e-06 [cse]: 2.168e-05 [renormalize]: 4.70027e-07 [remove_dup_value]: 1.946e-05 [tuple_transform]: 0.00011004, [1] [Cycle 1]: 0.00010138, [4] [d_1]: 5.293e-05 [none_parameter_eliminate]: 1.67999e-06 [renormalize]: 4.19997e-07 [switch_simplify]: 8.40999e-06 [partial_unused_args_eliminate]: 5.49e-06 [add_recomputation]: 6.088e-05 [cse_after_recomputation]: 3.289e-05, [1] [Cycle 1]: 2.361e-05, [1] [cse]: 1.271e-05 [environ_conv]: 1.073e-05 [swap_dp_allreduce_reducescatter]: 9.82999e-06 [bias_add_comm_swap]: 6.14001e-06 [label_micro_interleaved_index]: 9.27001e-06 [label_fine_grained_interleaved_index]: 4.915e-05 [merge_cast_opt]: 7.3e-06 [slice_recompute_activation]: 7.08e-06 [micro_interleaved_order_control]: 7.7e-06 [assign_add_opt]: 5.07999e-06 [ForceFp32Comm]: 5.16002e-06 [remove_cast_before_assign_add]: 4.58001e-06 [full_micro_interleaved_order_control]: 5.74e-06 [reorder_send_recv_between_fp_bp]: 6.40002e-06 [comm_op_add_attrs]: 4.38999e-06 [add_comm_op_reuse_tag]: 4.29997e-06 [interleave_split_concat_branches]: 4.63999e-06 [interleave_parallel_branches]: 4.35999e-06 [overlap_opt_shard_in_pipeline]: 4.35e-06 [overlap_opt_shard_grad_in_pipeline]: 5.32001e-06 [control_data_broadcast_order]: 3.358e-05 [grouped_pairwise_exchange_alltoall]: 4.99e-06 [offloading_packed_experts]: 9.08002e-06 [overlap_recompute_and_grad_model_parallel]: 8.76002e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.37e-06 [overlap_recompute_allgather_and_fa_grad]: 4.37e-06 [overlap_recompute_comm]: 6.48998e-06 [overlap_grad_ring_attention]: 8.33999e-06 [overlap_grad_flash_sp]: 3.413e-05 [begin_end_overlap_inline]: 3.75e-06 [split_matmul_comm_elemetwise]: 6.11e-06 [split_layernorm_comm]: 5.61003e-06 [handle_group_info]: 4.55001e-06 [symbol_engine_optimizer]: 0.00016971, [1] [Cycle 1]: 0.00015559, [6] [build]: 7.51001e-06 [elim_shapecalc]: 3.029e-05 [elim_not_effective]: 2.279e-05 [opt_reshape]: 1.029e-05 [fold_const_symbol]: 1.148e-05 [renormalize]: 6.50005e-07 [detach_backward]: 4.43999e-06 [pipeline_parallel_scheduler]: 1.72001e-06 [auto_monad_reorder]: 2.648e-05 [get_jit_bprop_graph]: 2.26e-06 [rewriter_after_jit_bprop_graph]: 6.21e-06 [opt_after_jit_grad]: 0.0010171 [validate]: 6.053e-05 Sums bootstrap : 0.000483s : 3.32% type_inference : 0.007373s : 50.72% event_method : 0.000022s : 0.15% auto_monad : 0.000075s : 0.51% graph_reusing : 0.000007s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000046s : 0.32% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000037s : 0.26% optimize.rewriter_before_opt_a : 0.000097s : 0.67% optimize.opt_a.expand_dump_flag : 0.000006s : 0.04% optimize.opt_a.switch_simplify : 0.000051s : 0.35% optimize.opt_a.loop_unroll : 0.000037s : 0.25% optimize.opt_a.a_1 : 0.000847s : 5.83% optimize.opt_a.with_stream_mark : 0.000033s : 0.23% optimize.opt_a.recompute_prepare : 0.000018s : 0.12% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000234s : 1.61% optimize.opt_a.accelerated_algorithm : 0.000035s : 0.24% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.10% optimize.opt_a.merge_send_recv : 0.000016s : 0.11% optimize.opt_a.auto_parallel : 0.000016s : 0.11% optimize.opt_a.parallel : 0.000030s : 0.20% optimize.opt_a.flash_sp : 0.000012s : 0.08% optimize.opt_a.merge_comm : 0.000008s : 0.05% optimize.opt_a.allreduce_fusion : 0.000009s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.13% optimize.opt_a.virtual_dataset : 0.000015s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.09% optimize.opt_a.virtual_output : 0.000014s : 0.10% optimize.opt_a.merge_forward : 0.000009s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.21% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.14% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.05% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.19% optimize.opt_a.a_after_grad : 0.000023s : 0.16% optimize.opt_a.renormalize : 0.000970s : 6.67% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.05% optimize.opt_a.auto_monad_grad : 0.000005s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.19% optimize.opt_a.cse : 0.000052s : 0.36% optimize.opt_a.a_3 : 0.000126s : 0.87% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.04% optimize.rewriter_after_opt_a : 0.000046s : 0.31% optimize.convert_after_rewriter : 0.000010s : 0.07% optimize.order_py_execute_after_rewriter : 0.000010s : 0.07% optimize.mutable_eliminate : 0.000806s : 5.55% optimize.opt_b.b_1 : 0.000215s : 1.48% optimize.opt_b.b_2 : 0.000010s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000024s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.16% optimize.overlap_param_gather : 0.000006s : 0.04% optimize.cconv : 0.000036s : 0.25% optimize.loop_unroll : 0.000565s : 3.89% optimize.opt_after_cconv.c_1 : 0.000037s : 0.25% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000022s : 0.15% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.13% optimize.tuple_transform.d_1 : 0.000053s : 0.36% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000061s : 0.42% optimize.cse_after_recomputation.cse : 0.000013s : 0.09% optimize.environ_conv : 0.000011s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.04% optimize.label_micro_interleaved_index : 0.000009s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000049s : 0.34% optimize.merge_cast_opt : 0.000007s : 0.05% optimize.slice_recompute_activation : 0.000007s : 0.05% optimize.micro_interleaved_order_control : 0.000008s : 0.05% optimize.assign_add_opt : 0.000005s : 0.03% optimize.ForceFp32Comm : 0.000005s : 0.04% optimize.remove_cast_before_assign_add : 0.000005s : 0.03% optimize.full_micro_interleaved_order_control : 0.000006s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000005s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000034s : 0.23% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.03% optimize.offloading_packed_experts : 0.000009s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000006s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.06% optimize.overlap_grad_flash_sp : 0.000034s : 0.23% optimize.begin_end_overlap_inline : 0.000004s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000006s : 0.04% optimize.split_layernorm_comm : 0.000006s : 0.04% optimize.handle_group_info : 0.000005s : 0.03% optimize.symbol_engine_optimizer.build : 0.000008s : 0.05% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000030s : 0.21% optimize.symbol_engine_optimizer.elim_not_effective : 0.000023s : 0.16% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000026s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.04% opt_after_jit_grad : 0.001017s : 7.00% validate : 0.000061s : 0.42% Time group info: ------[substitution.] 0.000256 36 14.18% : 0.000036s : 6: substitution.arithmetic_simplify 1.10% : 0.000003s : 2: substitution.elim_not_effective 0.56% : 0.000001s : 2: substitution.fold_const_symbol 2.41% : 0.000006s : 4: substitution.graph_param_transform 63.75% : 0.000163s : 4: substitution.inline 1.49% : 0.000004s : 4: substitution.j_node_and_user_rematch 5.41% : 0.000014s : 2: substitution.less_batch_normalization 1.97% : 0.000005s : 4: substitution.remove_not_recompute_node 2.25% : 0.000006s : 4: substitution.replace_old_param 6.90% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007306 2 87.36% : 0.006382s : 1: type_inference.infer 12.64% : 0.000923s : 1: type_inference.specialize ------[replace.] 0.000065 8 63.95% : 0.000042s : 4: replace.inline 36.05% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000176 8 91.32% : 0.000161s : 4: match.inline 8.68% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000224 1278 0.87% : 0.000002s : 13: predicate.accumulaten_eliminater 1.09% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.45% : 0.000001s : 8: predicate.addn_check_dump 0.98% : 0.000002s : 13: predicate.addn_zero_filter 0.74% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.43% : 0.000005s : 21: predicate.arithmetic_simplify 0.83% : 0.000002s : 13: predicate.cast_eliminate 0.72% : 0.000002s : 8: predicate.check_bprop_eliminate 0.47% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.54% : 0.000001s : 8: predicate.depend_value_elim 0.87% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.95% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.94% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.04% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.54% : 0.000001s : 4: predicate.elim_not_effective 0.74% : 0.000002s : 4: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000002s : 17: predicate.environ_add_const_eliminate 0.97% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.97% : 0.000002s : 17: predicate.environ_get_depend_swap 1.86% : 0.000004s : 25: predicate.environ_get_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.33% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.31% : 0.000005s : 21: predicate.float_depend_g_call 0.47% : 0.000001s : 8: predicate.float_environ_get_switch 0.73% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.63% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.56% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 6.35% : 0.000014s : 58: predicate.inline 0.88% : 0.000002s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.50% : 0.000003s : 8: predicate.less_batch_normalization 1.84% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.36% : 0.000005s : 38: predicate.load_eliminater 1.09% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.24% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.55% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 8: predicate.merge_addn 0.57% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.72% : 0.000002s : 13: predicate.minmaximum_grad 1.56% : 0.000003s : 4: predicate.mutable_eliminate 0.51% : 0.000001s : 4: predicate.opt_reshape 0.34% : 0.000001s : 4: predicate.parallel_virtual_node 1.65% : 0.000004s : 21: predicate.partial_defer_inline 1.53% : 0.000003s : 21: predicate.partial_eliminate 0.77% : 0.000002s : 13: predicate.print_const_string_wrapper 0.68% : 0.000002s : 8: predicate.reduce_all_const_elim 1.11% : 0.000002s : 13: predicate.reduce_eliminate 2.24% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 8: predicate.remove_not_recompute_node 1.31% : 0.000003s : 25: predicate.replace_applicator 0.55% : 0.000001s : 8: predicate.replace_old_param 0.29% : 0.000001s : 4: predicate.reset_defer_inline 0.82% : 0.000002s : 13: predicate.reshape_eliminate 0.62% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 4: predicate.row_tensor_eliminate 0.93% : 0.000002s : 8: predicate.same_eliminate 0.40% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.04% : 0.000002s : 8: predicate.shard_identity_eliminate 1.11% : 0.000002s : 8: predicate.special_op_eliminate 0.60% : 0.000001s : 8: predicate.specialize_transform 1.02% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.39% : 0.000003s : 21: predicate.switch_defer_inline 2.16% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.75% : 0.000011s : 67: predicate.switch_simplify 0.87% : 0.000002s : 13: predicate.tile_eliminate 0.84% : 0.000002s : 13: predicate.transpose_eliminate 1.40% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.73% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.34% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.73% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.34% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.69% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.31% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.38% : 0.000008s : 46: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 4: predicate.value_based_eliminate 0.59% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000749 11 49.20% : 0.000369s : 5: func_graph_cloner_run.FuncGraphClonerGraph 50.80% : 0.000381s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.285902 192 0.00% : 0.000008s : 1: ForceFp32Comm 1.39% : 0.003973s : 1: add_attr 1.38% : 0.003954s : 1: add_attr_with_inline 0.00% : 0.000008s : 1: add_comm_op_reuse_tag 0.02% : 0.000065s : 1: add_recomputation 0.00% : 0.000008s : 1: assign_add_opt 0.03% : 0.000085s : 1: auto_monad 0.01% : 0.000034s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.19% : 0.000535s : 1: bootstrap 0.01% : 0.000040s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000038s : 1: control_data_broadcast_order 0.00% : 0.000013s : 1: convert_after_rewriter 0.01% : 0.000037s : 1: cse_after_recomputation 0.00% : 0.000007s : 1: dataset_repeat_opt 0.01% : 0.000025s : 1: detach_backward 0.00% : 0.000014s : 1: environ_conv 0.01% : 0.000034s : 1: event_method 0.00% : 0.000009s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000008s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000008s : 1: interleave_split_concat_branches 0.02% : 0.000069s : 1: label_fine_grained_interleaved_index 44.07% : 0.126009s : 1: label_micro_interleaved_index 0.20% : 0.000572s : 1: loop_unroll 0.00% : 0.000012s : 1: merge_cast_opt 0.00% : 0.000012s : 1: micro_interleaved_order_control 0.29% : 0.000815s : 1: mutable_eliminate 0.00% : 0.000012s : 1: offloading_packed_experts 0.01% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000020s : 1: opt.transform.mutable_eliminate 0.47% : 0.001338s : 78: opt.transform.opt_a 0.01% : 0.000035s : 1: opt.transform.opt_after_cconv 0.01% : 0.000042s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000127s : 28: opt.transform.opt_b 0.02% : 0.000059s : 2: opt.transform.opt_trans_graph 0.02% : 0.000067s : 4: opt.transform.symbol_engine_opt 1.26% : 0.003612s : 1: opt_a 0.06% : 0.000161s : 1: opt_after_cconv 0.36% : 0.001030s : 1: opt_after_jit_grad 0.12% : 0.000351s : 1: opt_b 46.63% : 0.133323s : 1: optimize 0.01% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000013s : 1: order_py_execute_after_rewriter 0.01% : 0.000037s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000008s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000010s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000013s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000009s : 1: overlap_recompute_comm 0.00% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000009s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000055s : 1: pre_auto_parallel 0.01% : 0.000041s : 1: py_interpret_to_execute 0.01% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000008s : 1: remove_cast_before_assign_add 0.01% : 0.000023s : 1: remove_dup_value 0.18% : 0.000502s : 1: renormalize.infer 0.16% : 0.000456s : 1: renormalize.specialize 0.00% : 0.000010s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000049s : 1: rewriter_after_opt_a 0.04% : 0.000102s : 1: rewriter_before_opt_a 0.00% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000011s : 1: slice_recompute_activation 0.00% : 0.000009s : 1: split_layernorm_comm 0.00% : 0.000009s : 1: split_matmul_comm_elemetwise 0.00% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.06% : 0.000173s : 1: symbol_engine_optimizer 0.04% : 0.000114s : 1: tuple_transform 2.60% : 0.007426s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:38:57.731.784 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.182609, [21] [bootstrap]: 0.00046252 [type_inference]: 0.171345 [event_method]: 2.096e-05 [auto_monad]: 7.144e-05 [graph_reusing]: 6.59001e-06 [inline]: 2.83e-06 [add_attr]: 0.00388408, [1] [add_attr_with_inline]: 0.00387173, [1] [Cycle 1]: 7.259e-05, [2] [tag_attr]: 2.476e-05 [meta_addattr_fg_expand]: 6.26e-06 [parallel-infer-symbol]: 3.31999e-06 [pre_auto_parallel]: 3.996e-05 [insert-virtual-dataset]: 2.58e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 2.29001e-06 [pipeline_split]: 1.55999e-06 [optimize]: 0.0059141, [53] [py_interpret_to_execute]: 3.291e-05 [rewriter_before_opt_a]: 9.614e-05 [opt_a]: 0.00326242, [2] [Cycle 1]: 0.0025289, [45] [expand_dump_flag]: 3.13e-06 [switch_simplify]: 4.322e-05 [loop_unroll]: 3.013e-05 [a_1]: 0.00080511 [with_stream_mark]: 2.293e-05 [recompute_prepare]: 1.056e-05 [updatestate_depend_eliminate]: 4.05e-06 [updatestate_assign_eliminate]: 3.42997e-06 [updatestate_loads_eliminate]: 2.61999e-06 [parameter_eliminate]: 2.43998e-06 [a_2]: 9.218e-05 [accelerated_algorithm]: 2.07e-05 [shard]: 1.94e-06 [meta_shard_fg_expand]: 2.74999e-06 [shard_inline]: 8.14002e-06 [merge_send_recv]: 8.92e-06 [auto_parallel]: 7.12002e-06 [parallel]: 2.138e-05 [flash_sp]: 8.11002e-06 [merge_comm]: 3.97998e-06 [allreduce_fusion]: 3.38999e-06 [matmul_add_comm_reduction]: 1.155e-05 [allreduce_slice_to_reducescatter]: 7.59988e-07 [virtual_shard_identity]: 1.14e-05 [virtual_dataset]: 7.03e-06 [get_grad_eliminate_]: 7.55e-06 [virtual_output]: 6.88e-06 [merge_forward]: 4.17e-06 [cell_reuse_recompute_pass]: 1.60001e-06 [offload_activation]: 1.072e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.428e-05 [merge_recompute_call_nodes]: 1.65001e-06 [before_grad]: 1.306e-05 [set_forward_comm_id_for_comm_node_pass]: 3.86001e-06 [meta_fg_expand]: 3.16001e-06 [flash_sp_send_recv_attached]: 5.44e-06 [receive_attached]: 2.37001e-06 [after_resolve]: 1.451e-05 [a_after_grad]: 1.024e-05 [renormalize]: 0.00090124 [add_forward_monad_depend]: 6.69999e-06 [auto_monad_grad]: 3.01999e-06 [auto_monad_eliminator]: 1.803e-05 [cse]: 3.271e-05 [a_3]: 5.524e-05 [Cycle 2]: 0.00072224, [45] [expand_dump_flag]: 1.91e-06 [switch_simplify]: 8.28001e-06 [loop_unroll]: 6.71e-06 [a_1]: 0.00015052 [with_stream_mark]: 1.325e-05 [recompute_prepare]: 6.96001e-06 [updatestate_depend_eliminate]: 3.69002e-06 [updatestate_assign_eliminate]: 2.61e-06 [updatestate_loads_eliminate]: 2.65002e-06 [parameter_eliminate]: 1.76e-06 [a_2]: 8.27e-05 [accelerated_algorithm]: 1.087e-05 [shard]: 1.66e-06 [meta_shard_fg_expand]: 1.72001e-06 [shard_inline]: 6.24001e-06 [merge_send_recv]: 6.24001e-06 [auto_parallel]: 6.40002e-06 [parallel]: 6.56e-06 [flash_sp]: 3.73001e-06 [merge_comm]: 3.56001e-06 [allreduce_fusion]: 4.14002e-06 [matmul_add_comm_reduction]: 6.89001e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 8.08999e-06 [virtual_dataset]: 6.56e-06 [get_grad_eliminate_]: 6.18998e-06 [virtual_output]: 5.89999e-06 [merge_forward]: 3.85e-06 [cell_reuse_recompute_pass]: 1.84998e-06 [offload_activation]: 8.69e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.153e-05 [merge_recompute_call_nodes]: 1.31002e-06 [before_grad]: 1.101e-05 [set_forward_comm_id_for_comm_node_pass]: 3.53999e-06 [meta_fg_expand]: 2.56e-06 [flash_sp_send_recv_attached]: 8.90024e-07 [receive_attached]: 1.87999e-06 [after_resolve]: 1.15e-05 [a_after_grad]: 1.107e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.30001e-06 [auto_monad_grad]: 9.80013e-07 [auto_monad_eliminator]: 8.85999e-06 [cse]: 1.605e-05 [a_3]: 3.697e-05 [py_interpret_to_execute_after_opt_a]: 1.257e-05 [slice_cell_reuse_recomputed_activation]: 2.16998e-06 [rewriter_after_opt_a]: 3.869e-05 [convert_after_rewriter]: 6.81999e-06 [order_py_execute_after_rewriter]: 6.06e-06 [mutable_eliminate]: 0.0007473 [opt_b]: 0.00026313, [1] [Cycle 1]: 0.00025335, [7] [b_1]: 0.00015678 [b_2]: 9.40001e-06 [updatestate_depend_eliminate]: 8.37998e-06 [updatestate_assign_eliminate]: 3.28998e-06 [updatestate_loads_eliminate]: 2.71e-06 [renormalize]: 5.50004e-07 [cse]: 2.567e-05 [optimize_parallel_all_gather_comm]: 5.123e-05 [overlap_param_gather]: 3.68999e-06 [cconv]: 3.779e-05 [loop_unroll]: 0.00055956 [opt_after_cconv]: 0.0001322, [1] [Cycle 1]: 0.00012402, [7] [c_1]: 3.81e-05 [parameter_eliminate]: 4.67998e-06 [updatestate_depend_eliminate]: 8.52e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 2.77002e-06 [cse]: 2.367e-05 [renormalize]: 5.89993e-07 [remove_dup_value]: 1.691e-05 [tuple_transform]: 9.445e-05, [1] [Cycle 1]: 8.871e-05, [4] [d_1]: 5.485e-05 [none_parameter_eliminate]: 1.95001e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 8.22e-06 [partial_unused_args_eliminate]: 2.54001e-06 [add_recomputation]: 6.007e-05 [cse_after_recomputation]: 2.612e-05, [1] [Cycle 1]: 2.06e-05, [1] [cse]: 1.336e-05 [environ_conv]: 6.47001e-06 [swap_dp_allreduce_reducescatter]: 5.63002e-06 [bias_add_comm_swap]: 3.11999e-06 [label_micro_interleaved_index]: 5.40999e-06 [label_fine_grained_interleaved_index]: 3.04001e-06 [merge_cast_opt]: 1.45999e-06 [slice_recompute_activation]: 2.13998e-06 [micro_interleaved_order_control]: 2.76e-06 [assign_add_opt]: 1.20001e-06 [ForceFp32Comm]: 1.05999e-06 [remove_cast_before_assign_add]: 1.12e-06 [full_micro_interleaved_order_control]: 2.19001e-06 [reorder_send_recv_between_fp_bp]: 2.91999e-06 [comm_op_add_attrs]: 1.30001e-06 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.29e-06 [interleave_parallel_branches]: 1.41998e-06 [overlap_opt_shard_in_pipeline]: 1.24e-06 [overlap_opt_shard_grad_in_pipeline]: 1.91e-06 [control_data_broadcast_order]: 1.56e-05 [grouped_pairwise_exchange_alltoall]: 1.79e-06 [offloading_packed_experts]: 4.27998e-06 [overlap_recompute_and_grad_model_parallel]: 4.94003e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.89e-06 [overlap_recompute_allgather_and_fa_grad]: 1.50999e-06 [overlap_recompute_comm]: 3.06999e-06 [overlap_grad_ring_attention]: 3.91999e-06 [overlap_grad_flash_sp]: 2.424e-05 [begin_end_overlap_inline]: 9.39996e-07 [split_matmul_comm_elemetwise]: 2.19001e-06 [split_layernorm_comm]: 1.76e-06 [handle_group_info]: 1.14003e-06 [symbol_engine_optimizer]: 9.689e-05, [1] [Cycle 1]: 9.135e-05, [6] [build]: 4.15e-06 [elim_shapecalc]: 1.342e-05 [elim_not_effective]: 1.52e-05 [opt_reshape]: 8.55999e-06 [fold_const_symbol]: 1.086e-05 [renormalize]: 2.69996e-07 [detach_backward]: 2.34999e-06 [pipeline_parallel_scheduler]: 1.37e-06 [auto_monad_reorder]: 1.876e-05 [get_jit_bprop_graph]: 1.87001e-06 [rewriter_after_jit_bprop_graph]: 6.23e-06 [opt_after_jit_grad]: 0.00059546 [validate]: 4.97e-05 Sums bootstrap : 0.000463s : 0.26% type_inference : 0.171345s : 96.46% event_method : 0.000021s : 0.01% auto_monad : 0.000071s : 0.04% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000040s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.02% optimize.rewriter_before_opt_a : 0.000096s : 0.05% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000052s : 0.03% optimize.opt_a.loop_unroll : 0.000037s : 0.02% optimize.opt_a.a_1 : 0.000956s : 0.54% optimize.opt_a.with_stream_mark : 0.000036s : 0.02% optimize.opt_a.recompute_prepare : 0.000018s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000175s : 0.10% optimize.opt_a.accelerated_algorithm : 0.000032s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.01% optimize.opt_a.merge_send_recv : 0.000015s : 0.01% optimize.opt_a.auto_parallel : 0.000014s : 0.01% optimize.opt_a.parallel : 0.000028s : 0.02% optimize.opt_a.flash_sp : 0.000012s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.01% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000019s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000024s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000026s : 0.01% optimize.opt_a.a_after_grad : 0.000021s : 0.01% optimize.opt_a.renormalize : 0.000901s : 0.51% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.02% optimize.opt_a.cse : 0.000049s : 0.03% optimize.opt_a.a_3 : 0.000092s : 0.05% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000039s : 0.02% optimize.convert_after_rewriter : 0.000007s : 0.00% optimize.order_py_execute_after_rewriter : 0.000006s : 0.00% optimize.mutable_eliminate : 0.000747s : 0.42% optimize.opt_b.b_1 : 0.000157s : 0.09% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000026s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000051s : 0.03% optimize.overlap_param_gather : 0.000004s : 0.00% optimize.cconv : 0.000038s : 0.02% optimize.loop_unroll : 0.000560s : 0.32% optimize.opt_after_cconv.c_1 : 0.000038s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.01% optimize.tuple_transform.d_1 : 0.000055s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000003s : 0.00% optimize.add_recomputation : 0.000060s : 0.03% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000006s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000016s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000024s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000019s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000595s : 0.34% validate : 0.000050s : 0.03% Time group info: ------[substitution.] 0.000347 36 10.23% : 0.000035s : 6: substitution.arithmetic_simplify 0.63% : 0.000002s : 2: substitution.elim_not_effective 0.37% : 0.000001s : 2: substitution.fold_const_symbol 1.92% : 0.000007s : 4: substitution.graph_param_transform 54.25% : 0.000188s : 4: substitution.inline 1.32% : 0.000005s : 4: substitution.j_node_and_user_rematch 3.47% : 0.000012s : 2: substitution.less_batch_normalization 1.36% : 0.000005s : 4: substitution.remove_not_recompute_node 1.92% : 0.000007s : 4: substitution.replace_old_param 24.53% : 0.000085s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.171254 2 99.49% : 0.170388s : 1: type_inference.infer 0.51% : 0.000866s : 1: type_inference.specialize ------[replace.] 0.000072 8 59.13% : 0.000043s : 4: replace.inline 40.87% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000268 8 69.16% : 0.000185s : 4: match.inline 30.84% : 0.000083s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000221 1278 0.88% : 0.000002s : 13: predicate.accumulaten_eliminater 0.93% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 8: predicate.addn_check_dump 1.04% : 0.000002s : 13: predicate.addn_zero_filter 0.75% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.52% : 0.000006s : 21: predicate.arithmetic_simplify 0.87% : 0.000002s : 13: predicate.cast_eliminate 0.57% : 0.000001s : 8: predicate.check_bprop_eliminate 0.44% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.64% : 0.000001s : 8: predicate.depend_value_elim 0.91% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.93% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.96% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.63% : 0.000004s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.59% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_depend_swap 1.82% : 0.000004s : 25: predicate.environ_get_eliminate 0.98% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.36% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.30% : 0.000005s : 21: predicate.float_depend_g_call 0.44% : 0.000001s : 8: predicate.float_environ_get_switch 0.76% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.66% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.44% : 0.000001s : 8: predicate.incorporate_call_switch 6.50% : 0.000014s : 58: predicate.inline 0.77% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.97% : 0.000002s : 8: predicate.less_batch_normalization 1.87% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.41% : 0.000005s : 38: predicate.load_eliminater 1.36% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.28% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.69% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 13: predicate.minmaximum_grad 1.87% : 0.000004s : 4: predicate.mutable_eliminate 0.43% : 0.000001s : 4: predicate.opt_reshape 0.39% : 0.000001s : 4: predicate.parallel_virtual_node 1.66% : 0.000004s : 21: predicate.partial_defer_inline 1.57% : 0.000003s : 21: predicate.partial_eliminate 0.82% : 0.000002s : 13: predicate.print_const_string_wrapper 0.52% : 0.000001s : 8: predicate.reduce_all_const_elim 1.38% : 0.000003s : 13: predicate.reduce_eliminate 2.39% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 8: predicate.remove_not_recompute_node 1.35% : 0.000003s : 25: predicate.replace_applicator 0.48% : 0.000001s : 8: predicate.replace_old_param 0.36% : 0.000001s : 4: predicate.reset_defer_inline 0.82% : 0.000002s : 13: predicate.reshape_eliminate 0.59% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 4: predicate.row_tensor_eliminate 0.77% : 0.000002s : 8: predicate.same_eliminate 0.40% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.93% : 0.000002s : 8: predicate.shard_identity_eliminate 0.71% : 0.000002s : 8: predicate.special_op_eliminate 0.64% : 0.000001s : 8: predicate.specialize_transform 0.97% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.43% : 0.000003s : 21: predicate.switch_defer_inline 1.95% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.91% : 0.000011s : 67: predicate.switch_simplify 0.88% : 0.000002s : 13: predicate.tile_eliminate 0.83% : 0.000002s : 13: predicate.transpose_eliminate 1.48% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.28% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.20% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.61% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.38% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 2.06% : 0.000005s : 25: predicate.tuple_to_list_eliminator_ 2.22% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.87% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 4: predicate.value_based_eliminate 0.64% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000704 11 52.70% : 0.000371s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.30% : 0.000333s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.194931 192 0.00% : 0.000004s : 1: ForceFp32Comm 2.00% : 0.003891s : 1: add_attr 1.99% : 0.003877s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.03% : 0.000065s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000077s : 1: auto_monad 0.01% : 0.000023s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.25% : 0.000496s : 1: bootstrap 0.02% : 0.000042s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000020s : 1: control_data_broadcast_order 0.01% : 0.000010s : 1: convert_after_rewriter 0.02% : 0.000029s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.01% : 0.000028s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.29% : 0.000569s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.39% : 0.000760s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000023s : 1: opt.transform.mutable_eliminate 0.73% : 0.001430s : 78: opt.transform.opt_a 0.02% : 0.000037s : 1: opt.transform.opt_after_cconv 0.02% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000126s : 28: opt.transform.opt_b 0.03% : 0.000061s : 2: opt.transform.opt_trans_graph 0.02% : 0.000044s : 4: opt.transform.symbol_engine_opt 1.68% : 0.003266s : 1: opt_a 0.07% : 0.000136s : 1: opt_after_cconv 0.31% : 0.000607s : 1: opt_after_jit_grad 0.14% : 0.000268s : 1: opt_b 3.04% : 0.005920s : 1: optimize 0.03% : 0.000058s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000009s : 1: order_py_execute_after_rewriter 0.01% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000007s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.02% : 0.000044s : 1: pre_auto_parallel 0.02% : 0.000037s : 1: py_interpret_to_execute 0.01% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000021s : 1: remove_dup_value 0.24% : 0.000476s : 1: renormalize.infer 0.21% : 0.000415s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000043s : 1: rewriter_after_opt_a 0.05% : 0.000100s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000100s : 1: symbol_engine_optimizer 0.05% : 0.000098s : 1: tuple_transform 87.91% : 0.171367s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:00.313.028 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:00.313.302 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.146522, [21] [bootstrap]: 0.00043377 [type_inference]: 0.132748 [event_method]: 2.334e-05 [auto_monad]: 8.262e-05 [graph_reusing]: 7.63999e-06 [inline]: 3.13e-06 [add_attr]: 0.00386958, [1] [add_attr_with_inline]: 0.00385767, [1] [Cycle 1]: 9.522e-05, [2] [tag_attr]: 2.438e-05 [meta_addattr_fg_expand]: 6.37001e-06 [parallel-infer-symbol]: 4.02e-06 [pre_auto_parallel]: 4.379e-05 [insert-virtual-dataset]: 2.69001e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 2.00002e-06 [pipeline_split]: 1.62999e-06 [optimize]: 0.00782919, [53] [py_interpret_to_execute]: 4.025e-05 [rewriter_before_opt_a]: 0.000104 [opt_a]: 0.00472553, [2] [Cycle 1]: 0.00364872, [45] [expand_dump_flag]: 3.16001e-06 [switch_simplify]: 4.511e-05 [loop_unroll]: 3.13e-05 [a_1]: 0.00082806 [with_stream_mark]: 2.092e-05 [recompute_prepare]: 1.168e-05 [updatestate_depend_eliminate]: 5.32999e-06 [updatestate_assign_eliminate]: 4.06001e-06 [updatestate_loads_eliminate]: 3.58e-06 [parameter_eliminate]: 2.04999e-06 [a_2]: 0.00013974 [accelerated_algorithm]: 2.309e-05 [shard]: 2.09e-06 [meta_shard_fg_expand]: 2.72001e-06 [shard_inline]: 4.115e-05 [merge_send_recv]: 1.509e-05 [auto_parallel]: 1.329e-05 [parallel]: 2.296e-05 [flash_sp]: 1.182e-05 [merge_comm]: 5.37001e-06 [allreduce_fusion]: 5.00999e-06 [matmul_add_comm_reduction]: 1.572e-05 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 3.745e-05 [virtual_dataset]: 9.47001e-06 [get_grad_eliminate_]: 9.24998e-06 [virtual_output]: 8.35999e-06 [merge_forward]: 5.34e-06 [cell_reuse_recompute_pass]: 3.11001e-06 [offload_activation]: 1.272e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.445e-05 [merge_recompute_call_nodes]: 1.76e-06 [before_grad]: 1.48e-05 [set_forward_comm_id_for_comm_node_pass]: 4.67998e-06 [meta_fg_expand]: 4.47e-06 [flash_sp_send_recv_attached]: 5.76e-06 [receive_attached]: 2.41e-06 [after_resolve]: 1.611e-05 [a_after_grad]: 1.326e-05 [renormalize]: 0.00104882 [add_forward_monad_depend]: 8.13999e-06 [auto_monad_grad]: 2.87002e-06 [auto_monad_eliminator]: 2.049e-05 [cse]: 3.949e-05 [a_3]: 8.293e-05 [Cycle 2]: 0.00105901, [45] [expand_dump_flag]: 2.29999e-06 [switch_simplify]: 1.042e-05 [loop_unroll]: 7.87003e-06 [a_1]: 0.00020792 [with_stream_mark]: 1.437e-05 [recompute_prepare]: 8.90001e-06 [updatestate_depend_eliminate]: 4.66002e-06 [updatestate_assign_eliminate]: 3.6e-06 [updatestate_loads_eliminate]: 3.67002e-06 [parameter_eliminate]: 1.35999e-06 [a_2]: 0.00013132 [accelerated_algorithm]: 1.219e-05 [shard]: 1.94e-06 [meta_shard_fg_expand]: 2.49001e-06 [shard_inline]: 7.6e-06 [merge_send_recv]: 9.29e-06 [auto_parallel]: 9.76e-06 [parallel]: 7.4e-06 [flash_sp]: 4.13001e-06 [merge_comm]: 6.44999e-06 [allreduce_fusion]: 4.17e-06 [matmul_add_comm_reduction]: 1.123e-05 [allreduce_slice_to_reducescatter]: 1.15001e-06 [virtual_shard_identity]: 9.47001e-06 [virtual_dataset]: 9.24e-06 [get_grad_eliminate_]: 7.43999e-06 [virtual_output]: 7.35e-06 [merge_forward]: 4.77998e-06 [cell_reuse_recompute_pass]: 2.61999e-06 [offload_activation]: 9.36e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.747e-05 [merge_recompute_call_nodes]: 1.07e-06 [before_grad]: 1.262e-05 [set_forward_comm_id_for_comm_node_pass]: 6.05002e-06 [meta_fg_expand]: 3.34001e-06 [flash_sp_send_recv_attached]: 1.22e-06 [receive_attached]: 2.39001e-06 [after_resolve]: 1.575e-05 [a_after_grad]: 1.253e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 2.10002e-06 [auto_monad_grad]: 2.34001e-06 [auto_monad_eliminator]: 1.352e-05 [cse]: 2.519e-05 [a_3]: 6.198e-05 [py_interpret_to_execute_after_opt_a]: 1.998e-05 [slice_cell_reuse_recomputed_activation]: 6.03002e-06 [rewriter_after_opt_a]: 5.126e-05 [convert_after_rewriter]: 1.105e-05 [order_py_execute_after_rewriter]: 1.041e-05 [mutable_eliminate]: 0.00079204 [opt_b]: 0.00039506, [1] [Cycle 1]: 0.00038212, [7] [b_1]: 0.00024554 [b_2]: 1.099e-05 [updatestate_depend_eliminate]: 9.78002e-06 [updatestate_assign_eliminate]: 3.79002e-06 [updatestate_loads_eliminate]: 3.19001e-06 [renormalize]: 7.79983e-07 [cse]: 3.109e-05 [optimize_parallel_all_gather_comm]: 4.102e-05 [overlap_param_gather]: 6.61e-06 [cconv]: 3.741e-05 [loop_unroll]: 0.00056031 [opt_after_cconv]: 0.00017702, [1] [Cycle 1]: 0.00016573, [7] [c_1]: 4.594e-05 [parameter_eliminate]: 4.40999e-06 [updatestate_depend_eliminate]: 7.92003e-06 [updatestate_assign_eliminate]: 4.36002e-06 [updatestate_loads_eliminate]: 3.5e-06 [cse]: 2.662e-05 [renormalize]: 8.40024e-07 [remove_dup_value]: 2.166e-05 [tuple_transform]: 0.00012424, [1] [Cycle 1]: 0.00011513, [4] [d_1]: 6.402e-05 [none_parameter_eliminate]: 1.79e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 9.12999e-06 [partial_unused_args_eliminate]: 5.26998e-06 [add_recomputation]: 6.718e-05 [cse_after_recomputation]: 3.729e-05, [1] [Cycle 1]: 2.888e-05, [1] [cse]: 1.761e-05 [environ_conv]: 1.204e-05 [swap_dp_allreduce_reducescatter]: 1.043e-05 [bias_add_comm_swap]: 6.39999e-06 [label_micro_interleaved_index]: 8.69e-06 [label_fine_grained_interleaved_index]: 5.87999e-06 [merge_cast_opt]: 4.55001e-06 [slice_recompute_activation]: 6.01e-06 [micro_interleaved_order_control]: 5.77001e-06 [assign_add_opt]: 4.52e-06 [ForceFp32Comm]: 3.89002e-06 [remove_cast_before_assign_add]: 4.41002e-06 [full_micro_interleaved_order_control]: 5.70001e-06 [reorder_send_recv_between_fp_bp]: 6.17999e-06 [comm_op_add_attrs]: 5.24e-06 [add_comm_op_reuse_tag]: 4.03999e-06 [interleave_split_concat_branches]: 4.28001e-06 [interleave_parallel_branches]: 4.74002e-06 [overlap_opt_shard_in_pipeline]: 4.94003e-06 [overlap_opt_shard_grad_in_pipeline]: 5.14e-06 [control_data_broadcast_order]: 2.05e-05 [grouped_pairwise_exchange_alltoall]: 5.67999e-06 [offloading_packed_experts]: 9.41998e-06 [overlap_recompute_and_grad_model_parallel]: 9.32999e-06 [overlap_grad_matmul_and_grad_allreduce]: 5.07999e-06 [overlap_recompute_allgather_and_fa_grad]: 4.09002e-06 [overlap_recompute_comm]: 6.25002e-06 [overlap_grad_ring_attention]: 8.2e-06 [overlap_grad_flash_sp]: 2.862e-05 [begin_end_overlap_inline]: 3.97e-06 [split_matmul_comm_elemetwise]: 5.54998e-06 [split_layernorm_comm]: 5.39e-06 [handle_group_info]: 4.37e-06 [symbol_engine_optimizer]: 0.00012644, [1] [Cycle 1]: 0.00011733, [6] [build]: 4.69998e-06 [elim_shapecalc]: 1.447e-05 [elim_not_effective]: 1.764e-05 [opt_reshape]: 9.31998e-06 [fold_const_symbol]: 1.311e-05 [renormalize]: 2.50002e-07 [detach_backward]: 4.23999e-06 [pipeline_parallel_scheduler]: 1.91e-06 [auto_monad_reorder]: 2.728e-05 [get_jit_bprop_graph]: 1.89999e-06 [rewriter_after_jit_bprop_graph]: 6.86001e-06 [opt_after_jit_grad]: 0.00060147 [validate]: 5.146e-05 Sums bootstrap : 0.000434s : 0.31% type_inference : 0.132748s : 94.84% event_method : 0.000023s : 0.02% auto_monad : 0.000083s : 0.06% graph_reusing : 0.000008s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000044s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000040s : 0.03% optimize.rewriter_before_opt_a : 0.000104s : 0.07% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000056s : 0.04% optimize.opt_a.loop_unroll : 0.000039s : 0.03% optimize.opt_a.a_1 : 0.001036s : 0.74% optimize.opt_a.with_stream_mark : 0.000035s : 0.03% optimize.opt_a.recompute_prepare : 0.000021s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000271s : 0.19% optimize.opt_a.accelerated_algorithm : 0.000035s : 0.03% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000049s : 0.03% optimize.opt_a.merge_send_recv : 0.000024s : 0.02% optimize.opt_a.auto_parallel : 0.000023s : 0.02% optimize.opt_a.parallel : 0.000030s : 0.02% optimize.opt_a.flash_sp : 0.000016s : 0.01% optimize.opt_a.merge_comm : 0.000012s : 0.01% optimize.opt_a.allreduce_fusion : 0.000009s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000027s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000047s : 0.03% optimize.opt_a.virtual_dataset : 0.000019s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.01% optimize.opt_a.virtual_output : 0.000016s : 0.01% optimize.opt_a.merge_forward : 0.000010s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% optimize.opt_a.offload_activation : 0.000022s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000042s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000027s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.01% optimize.opt_a.meta_fg_expand : 0.000008s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000032s : 0.02% optimize.opt_a.a_after_grad : 0.000026s : 0.02% optimize.opt_a.renormalize : 0.001049s : 0.75% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.02% optimize.opt_a.cse : 0.000065s : 0.05% optimize.opt_a.a_3 : 0.000145s : 0.10% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.00% optimize.rewriter_after_opt_a : 0.000051s : 0.04% optimize.convert_after_rewriter : 0.000011s : 0.01% optimize.order_py_execute_after_rewriter : 0.000010s : 0.01% optimize.mutable_eliminate : 0.000792s : 0.57% optimize.opt_b.b_1 : 0.000246s : 0.18% optimize.opt_b.b_2 : 0.000011s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000031s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000041s : 0.03% optimize.overlap_param_gather : 0.000007s : 0.00% optimize.cconv : 0.000037s : 0.03% optimize.loop_unroll : 0.000560s : 0.40% optimize.opt_after_cconv.c_1 : 0.000046s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000027s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000022s : 0.02% optimize.tuple_transform.d_1 : 0.000064s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000067s : 0.05% optimize.cse_after_recomputation.cse : 0.000018s : 0.01% optimize.environ_conv : 0.000012s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000009s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.00% optimize.merge_cast_opt : 0.000005s : 0.00% optimize.slice_recompute_activation : 0.000006s : 0.00% optimize.micro_interleaved_order_control : 0.000006s : 0.00% optimize.assign_add_opt : 0.000005s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000006s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000005s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000005s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000005s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000021s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000006s : 0.00% optimize.offloading_packed_experts : 0.000009s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000005s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000006s : 0.00% optimize.overlap_grad_ring_attention : 0.000008s : 0.01% optimize.overlap_grad_flash_sp : 0.000029s : 0.02% optimize.begin_end_overlap_inline : 0.000004s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000006s : 0.00% optimize.split_layernorm_comm : 0.000005s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000027s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000601s : 0.43% validate : 0.000051s : 0.04% Time group info: ------[substitution.] 0.000303 45 12.45% : 0.000038s : 5: substitution.arithmetic_simplify 9.80% : 0.000030s : 3: substitution.cast_eliminate 0.79% : 0.000002s : 3: substitution.elim_not_effective 0.52% : 0.000002s : 3: substitution.fold_const_symbol 2.23% : 0.000007s : 5: substitution.graph_param_transform 57.74% : 0.000175s : 4: substitution.inline 1.71% : 0.000005s : 6: substitution.j_node_and_user_rematch 4.27% : 0.000013s : 2: substitution.less_batch_normalization 2.22% : 0.000007s : 6: substitution.remove_not_recompute_node 2.87% : 0.000009s : 4: substitution.replace_old_param 5.39% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.132680 2 99.23% : 0.131662s : 1: type_inference.infer 0.77% : 0.001018s : 1: type_inference.specialize ------[replace.] 0.000072 8 60.63% : 0.000043s : 4: replace.inline 39.37% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000187 8 92.27% : 0.000173s : 4: match.inline 7.73% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000275 1596 0.89% : 0.000002s : 17: predicate.accumulaten_eliminater 0.67% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 10: predicate.addn_check_dump 1.07% : 0.000003s : 17: predicate.addn_zero_filter 0.84% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.25% : 0.000006s : 27: predicate.arithmetic_simplify 0.95% : 0.000003s : 17: predicate.cast_eliminate 0.63% : 0.000002s : 10: predicate.check_bprop_eliminate 0.56% : 0.000002s : 10: predicate.compare_switch_simplify 0.19% : 0.000001s : 5: predicate.const_output_eliminate 0.61% : 0.000002s : 10: predicate.depend_value_elim 0.95% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.09% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.93% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.92% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 5: predicate.elim_not_effective 0.48% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.07% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.09% : 0.000003s : 22: predicate.environ_get_depend_swap 1.73% : 0.000005s : 32: predicate.environ_get_eliminate 1.05% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.33% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.21% : 0.000006s : 25: predicate.float_depend_g_call 0.54% : 0.000001s : 10: predicate.float_environ_get_switch 0.86% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.62% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.58% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 7.40% : 0.000020s : 72: predicate.inline 0.90% : 0.000002s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.00% : 0.000003s : 10: predicate.less_batch_normalization 1.81% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.53% : 0.000007s : 48: predicate.load_eliminater 1.31% : 0.000004s : 5: predicate.loop_unroll_after_grad 1.92% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.62% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.55% : 0.000002s : 10: predicate.merge_addn 0.55% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.57% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.90% : 0.000002s : 17: predicate.minmaximum_grad 1.31% : 0.000004s : 5: predicate.mutable_eliminate 0.42% : 0.000001s : 5: predicate.opt_reshape 0.43% : 0.000001s : 5: predicate.parallel_virtual_node 1.65% : 0.000005s : 25: predicate.partial_defer_inline 1.55% : 0.000004s : 26: predicate.partial_eliminate 0.89% : 0.000002s : 17: predicate.print_const_string_wrapper 0.56% : 0.000002s : 10: predicate.reduce_all_const_elim 1.21% : 0.000003s : 17: predicate.reduce_eliminate 2.50% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 10: predicate.remove_not_recompute_node 1.27% : 0.000003s : 31: predicate.replace_applicator 0.46% : 0.000001s : 10: predicate.replace_old_param 0.40% : 0.000001s : 5: predicate.reset_defer_inline 1.03% : 0.000003s : 17: predicate.reshape_eliminate 0.66% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 5: predicate.row_tensor_eliminate 0.88% : 0.000002s : 10: predicate.same_eliminate 0.42% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.95% : 0.000003s : 10: predicate.shard_identity_eliminate 0.68% : 0.000002s : 10: predicate.special_op_eliminate 0.66% : 0.000002s : 10: predicate.specialize_transform 0.97% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.42% : 0.000004s : 25: predicate.switch_defer_inline 1.91% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.65% : 0.000013s : 76: predicate.switch_simplify 0.92% : 0.000003s : 17: predicate.tile_eliminate 1.00% : 0.000003s : 17: predicate.transpose_eliminate 1.69% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.04% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.54% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.76% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.44% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.06% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.64% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.63% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000764 11 47.75% : 0.000365s : 5: func_graph_cloner_run.FuncGraphClonerGraph 52.25% : 0.000399s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.161199 192 0.00% : 0.000007s : 1: ForceFp32Comm 2.41% : 0.003881s : 1: add_attr 2.40% : 0.003861s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.04% : 0.000071s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.06% : 0.000095s : 1: auto_monad 0.02% : 0.000036s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.30% : 0.000482s : 1: bootstrap 0.03% : 0.000041s : 1: cconv 0.01% : 0.000008s : 1: comm_op_add_attrs 0.01% : 0.000024s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.03% : 0.000041s : 1: cse_after_recomputation 0.00% : 0.000007s : 1: dataset_repeat_opt 0.02% : 0.000026s : 1: detach_backward 0.01% : 0.000015s : 1: environ_conv 0.02% : 0.000035s : 1: event_method 0.01% : 0.000009s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000015s : 1: graph_reusing 0.01% : 0.000009s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000010s : 1: insert-virtual-dataset 0.00% : 0.000008s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000012s : 1: label_micro_interleaved_index 0.35% : 0.000569s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000009s : 1: micro_interleaved_order_control 0.50% : 0.000800s : 1: mutable_eliminate 0.01% : 0.000013s : 1: offloading_packed_experts 0.01% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000022s : 1: opt.transform.mutable_eliminate 1.04% : 0.001683s : 78: opt.transform.opt_a 0.03% : 0.000044s : 1: opt.transform.opt_after_cconv 0.02% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000159s : 28: opt.transform.opt_b 0.04% : 0.000071s : 2: opt.transform.opt_trans_graph 0.03% : 0.000051s : 4: opt.transform.symbol_engine_opt 2.93% : 0.004729s : 1: opt_a 0.11% : 0.000181s : 1: opt_after_cconv 0.38% : 0.000613s : 1: opt_after_jit_grad 0.25% : 0.000400s : 1: opt_b 5.16% : 0.008314s : 1: optimize 0.03% : 0.000046s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000014s : 1: order_py_execute_after_rewriter 0.02% : 0.000032s : 1: overlap_grad_flash_sp 0.01% : 0.000008s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000012s : 1: overlap_grad_ring_attention 0.01% : 0.000009s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000008s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000010s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000013s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000009s : 1: overlap_recompute_comm 0.01% : 0.000012s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000009s : 1: partial_unused_args_eliminate 0.01% : 0.000011s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.03% : 0.000052s : 1: pre_auto_parallel 0.03% : 0.000044s : 1: py_interpret_to_execute 0.01% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000009s : 1: remove_cast_before_assign_add 0.02% : 0.000025s : 1: remove_dup_value 0.36% : 0.000579s : 1: renormalize.infer 0.28% : 0.000458s : 1: renormalize.specialize 0.01% : 0.000010s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000055s : 1: rewriter_after_opt_a 0.07% : 0.000108s : 1: rewriter_before_opt_a 0.01% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000009s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.01% : 0.000009s : 1: split_matmul_comm_elemetwise 0.01% : 0.000014s : 1: swap_dp_allreduce_reducescatter 0.08% : 0.000130s : 1: symbol_engine_optimizer 0.08% : 0.000128s : 1: tuple_transform 82.38% : 0.132803s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:02.826.776 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.239646, [21] [bootstrap]: 0.00038727 [type_inference]: 0.227077 [event_method]: 2.28e-05 [auto_monad]: 8.043e-05 [graph_reusing]: 6.62002e-06 [inline]: 3.19001e-06 [add_attr]: 0.00406987, [1] [add_attr_with_inline]: 0.00405558, [1] [Cycle 1]: 8.742e-05, [2] [tag_attr]: 3.027e-05 [meta_addattr_fg_expand]: 5.97001e-06 [parallel-infer-symbol]: 3.73001e-06 [pre_auto_parallel]: 4.471e-05 [insert-virtual-dataset]: 2.36998e-06 [parallel-infer-symbol-second]: 1.00001e-06 [dataset_repeat_opt]: 2.36e-06 [pipeline_split]: 1.89e-06 [optimize]: 0.00702396, [53] [py_interpret_to_execute]: 3.714e-05 [rewriter_before_opt_a]: 0.00010381 [opt_a]: 0.003966, [2] [Cycle 1]: 0.00300203, [45] [expand_dump_flag]: 3.38999e-06 [switch_simplify]: 4.704e-05 [loop_unroll]: 3.296e-05 [a_1]: 0.00088845 [with_stream_mark]: 2.553e-05 [recompute_prepare]: 1.588e-05 [updatestate_depend_eliminate]: 5.56e-06 [updatestate_assign_eliminate]: 3.86999e-06 [updatestate_loads_eliminate]: 4.13999e-06 [parameter_eliminate]: 2.59999e-06 [a_2]: 0.00012409 [accelerated_algorithm]: 2.886e-05 [shard]: 2.78e-06 [meta_shard_fg_expand]: 3.23e-06 [shard_inline]: 1.032e-05 [merge_send_recv]: 1.1e-05 [auto_parallel]: 9.69e-06 [parallel]: 2.269e-05 [flash_sp]: 1.158e-05 [merge_comm]: 5.44e-06 [allreduce_fusion]: 4.2e-06 [matmul_add_comm_reduction]: 1.308e-05 [allreduce_slice_to_reducescatter]: 1.23002e-06 [virtual_shard_identity]: 1.496e-05 [virtual_dataset]: 9.57999e-06 [get_grad_eliminate_]: 9.58002e-06 [virtual_output]: 8.59998e-06 [merge_forward]: 5.21002e-06 [cell_reuse_recompute_pass]: 1.74998e-06 [offload_activation]: 1.234e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.89e-05 [merge_recompute_call_nodes]: 1.65001e-06 [before_grad]: 1.579e-05 [set_forward_comm_id_for_comm_node_pass]: 5.34e-06 [meta_fg_expand]: 3.9e-06 [flash_sp_send_recv_attached]: 5.40999e-06 [receive_attached]: 2.26e-06 [after_resolve]: 1.645e-05 [a_after_grad]: 1.312e-05 [renormalize]: 0.00112049 [add_forward_monad_depend]: 8.18001e-06 [auto_monad_grad]: 2.64001e-06 [auto_monad_eliminator]: 2.261e-05 [cse]: 4.319e-05 [a_3]: 7.609e-05 [Cycle 2]: 0.00094887, [45] [expand_dump_flag]: 2.73e-06 [switch_simplify]: 1.186e-05 [loop_unroll]: 9.22999e-06 [a_1]: 0.00022901 [with_stream_mark]: 1.678e-05 [recompute_prepare]: 9.18002e-06 [updatestate_depend_eliminate]: 4.77e-06 [updatestate_assign_eliminate]: 3.72998e-06 [updatestate_loads_eliminate]: 3.56999e-06 [parameter_eliminate]: 2.05002e-06 [a_2]: 0.00011027 [accelerated_algorithm]: 1.439e-05 [shard]: 1.96e-06 [meta_shard_fg_expand]: 2.86999e-06 [shard_inline]: 8.60999e-06 [merge_send_recv]: 1.047e-05 [auto_parallel]: 1.033e-05 [parallel]: 8.83001e-06 [flash_sp]: 4.36002e-06 [merge_comm]: 4.46002e-06 [allreduce_fusion]: 4.20999e-06 [matmul_add_comm_reduction]: 1.34e-05 [allreduce_slice_to_reducescatter]: 8.99978e-07 [virtual_shard_identity]: 1.155e-05 [virtual_dataset]: 8.15e-06 [get_grad_eliminate_]: 8.3e-06 [virtual_output]: 7.8e-06 [merge_forward]: 5.59998e-06 [cell_reuse_recompute_pass]: 2.95002e-06 [offload_activation]: 1.165e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.741e-05 [merge_recompute_call_nodes]: 1.40999e-06 [before_grad]: 1.448e-05 [set_forward_comm_id_for_comm_node_pass]: 5.78997e-06 [meta_fg_expand]: 3.3e-06 [flash_sp_send_recv_attached]: 1.81e-06 [receive_attached]: 2.87002e-06 [after_resolve]: 1.658e-05 [a_after_grad]: 1.263e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.13002e-06 [auto_monad_grad]: 2.34001e-06 [auto_monad_eliminator]: 1.139e-05 [cse]: 2.754e-05 [a_3]: 5.377e-05 [py_interpret_to_execute_after_opt_a]: 1.993e-05 [slice_cell_reuse_recomputed_activation]: 2.89001e-06 [rewriter_after_opt_a]: 5.014e-05 [convert_after_rewriter]: 8.33001e-06 [order_py_execute_after_rewriter]: 7.54002e-06 [mutable_eliminate]: 0.00088023 [opt_b]: 0.00033424, [1] [Cycle 1]: 0.00032485, [7] [b_1]: 0.00019751 [b_2]: 1.326e-05 [updatestate_depend_eliminate]: 1.276e-05 [updatestate_assign_eliminate]: 4.50001e-06 [updatestate_loads_eliminate]: 4.17e-06 [renormalize]: 9.80013e-07 [cse]: 4.073e-05 [optimize_parallel_all_gather_comm]: 2.49e-05 [overlap_param_gather]: 2.40002e-06 [cconv]: 3.751e-05 [loop_unroll]: 0.00060603 [opt_after_cconv]: 0.00016927, [1] [Cycle 1]: 0.00016018, [7] [c_1]: 5.156e-05 [parameter_eliminate]: 5.73997e-06 [updatestate_depend_eliminate]: 1.032e-05 [updatestate_assign_eliminate]: 4.25999e-06 [updatestate_loads_eliminate]: 3.59002e-06 [cse]: 3.635e-05 [renormalize]: 1.00001e-06 [remove_dup_value]: 2.113e-05 [tuple_transform]: 0.00012069, [1] [Cycle 1]: 0.0001149, [4] [d_1]: 7.564e-05 [none_parameter_eliminate]: 1.72999e-06 [renormalize]: 5.19998e-07 [switch_simplify]: 1.078e-05 [partial_unused_args_eliminate]: 2.38002e-06 [add_recomputation]: 7.612e-05 [cse_after_recomputation]: 3.328e-05, [1] [Cycle 1]: 2.738e-05, [1] [cse]: 1.905e-05 [environ_conv]: 8.69e-06 [swap_dp_allreduce_reducescatter]: 7.89002e-06 [bias_add_comm_swap]: 4.45999e-06 [label_micro_interleaved_index]: 6.02001e-06 [label_fine_grained_interleaved_index]: 3.90998e-06 [merge_cast_opt]: 2.04999e-06 [slice_recompute_activation]: 2.34001e-06 [micro_interleaved_order_control]: 3.28e-06 [assign_add_opt]: 1.52001e-06 [ForceFp32Comm]: 1.14e-06 [remove_cast_before_assign_add]: 1.17999e-06 [full_micro_interleaved_order_control]: 2.76999e-06 [reorder_send_recv_between_fp_bp]: 3.25998e-06 [comm_op_add_attrs]: 1.02998e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.65001e-06 [interleave_parallel_branches]: 1.65001e-06 [overlap_opt_shard_in_pipeline]: 1.37999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.70002e-06 [control_data_broadcast_order]: 1.979e-05 [grouped_pairwise_exchange_alltoall]: 2.10002e-06 [offloading_packed_experts]: 5.39998e-06 [overlap_recompute_and_grad_model_parallel]: 6.44001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.45999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.59e-06 [overlap_recompute_comm]: 2.91999e-06 [overlap_grad_ring_attention]: 5.14e-06 [overlap_grad_flash_sp]: 2.908e-05 [begin_end_overlap_inline]: 6.29982e-07 [split_matmul_comm_elemetwise]: 2.34001e-06 [split_layernorm_comm]: 1.99e-06 [handle_group_info]: 1.30001e-06 [symbol_engine_optimizer]: 0.00011668, [1] [Cycle 1]: 0.00011063, [6] [build]: 5.02e-06 [elim_shapecalc]: 1.89e-05 [elim_not_effective]: 1.833e-05 [opt_reshape]: 1.025e-05 [fold_const_symbol]: 1.493e-05 [renormalize]: 2.10013e-07 [detach_backward]: 2.37001e-06 [pipeline_parallel_scheduler]: 1.49e-06 [auto_monad_reorder]: 2.439e-05 [get_jit_bprop_graph]: 1.99e-06 [rewriter_after_jit_bprop_graph]: 4.68999e-06 [opt_after_jit_grad]: 0.00064776 [validate]: 5.415e-05 Sums bootstrap : 0.000387s : 0.17% type_inference : 0.227077s : 96.88% event_method : 0.000023s : 0.01% auto_monad : 0.000080s : 0.03% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000030s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000045s : 0.02% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000037s : 0.02% optimize.rewriter_before_opt_a : 0.000104s : 0.04% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000059s : 0.03% optimize.opt_a.loop_unroll : 0.000042s : 0.02% optimize.opt_a.a_1 : 0.001117s : 0.48% optimize.opt_a.with_stream_mark : 0.000042s : 0.02% optimize.opt_a.recompute_prepare : 0.000025s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000234s : 0.10% optimize.opt_a.accelerated_algorithm : 0.000043s : 0.02% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.00% optimize.opt_a.shard_inline : 0.000019s : 0.01% optimize.opt_a.merge_send_recv : 0.000021s : 0.01% optimize.opt_a.auto_parallel : 0.000020s : 0.01% optimize.opt_a.parallel : 0.000032s : 0.01% optimize.opt_a.flash_sp : 0.000016s : 0.01% optimize.opt_a.merge_comm : 0.000010s : 0.00% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000027s : 0.01% optimize.opt_a.virtual_dataset : 0.000018s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.01% optimize.opt_a.virtual_output : 0.000016s : 0.01% optimize.opt_a.merge_forward : 0.000011s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000024s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000030s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000033s : 0.01% optimize.opt_a.a_after_grad : 0.000026s : 0.01% optimize.opt_a.renormalize : 0.001121s : 0.48% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.00% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.01% optimize.opt_a.cse : 0.000071s : 0.03% optimize.opt_a.a_3 : 0.000130s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000050s : 0.02% optimize.convert_after_rewriter : 0.000008s : 0.00% optimize.order_py_execute_after_rewriter : 0.000008s : 0.00% optimize.mutable_eliminate : 0.000880s : 0.38% optimize.opt_b.b_1 : 0.000198s : 0.08% optimize.opt_b.b_2 : 0.000013s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000013s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000041s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000038s : 0.02% optimize.loop_unroll : 0.000606s : 0.26% optimize.opt_after_cconv.c_1 : 0.000052s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000036s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000021s : 0.01% optimize.tuple_transform.d_1 : 0.000076s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000001s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000076s : 0.03% optimize.cse_after_recomputation.cse : 0.000019s : 0.01% optimize.environ_conv : 0.000009s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000004s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.00% optimize.interleave_parallel_branches : 0.000002s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.00% optimize.control_data_broadcast_order : 0.000020s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000029s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000019s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000024s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000648s : 0.28% validate : 0.000054s : 0.02% Time group info: ------[substitution.] 0.000354 45 11.59% : 0.000041s : 5: substitution.arithmetic_simplify 9.64% : 0.000034s : 3: substitution.cast_eliminate 0.72% : 0.000003s : 3: substitution.elim_not_effective 0.58% : 0.000002s : 3: substitution.fold_const_symbol 2.57% : 0.000009s : 5: substitution.graph_param_transform 59.60% : 0.000211s : 4: substitution.inline 1.72% : 0.000006s : 6: substitution.j_node_and_user_rematch 4.55% : 0.000016s : 2: substitution.less_batch_normalization 2.01% : 0.000007s : 6: substitution.remove_not_recompute_node 2.10% : 0.000007s : 4: substitution.replace_old_param 4.93% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.226999 2 99.57% : 0.226018s : 1: type_inference.infer 0.43% : 0.000980s : 1: type_inference.specialize ------[replace.] 0.000074 8 61.06% : 0.000045s : 4: replace.inline 38.94% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000224 8 93.00% : 0.000208s : 4: match.inline 7.00% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000290 1596 0.98% : 0.000003s : 17: predicate.accumulaten_eliminater 0.80% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 10: predicate.addn_check_dump 0.95% : 0.000003s : 17: predicate.addn_zero_filter 0.80% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.57% : 0.000007s : 27: predicate.arithmetic_simplify 1.10% : 0.000003s : 17: predicate.cast_eliminate 0.58% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000002s : 10: predicate.compare_switch_simplify 0.19% : 0.000001s : 5: predicate.const_output_eliminate 0.67% : 0.000002s : 10: predicate.depend_value_elim 1.00% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 17: predicate.dict_get_item_eliminator 1.06% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.82% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 5: predicate.elim_not_effective 0.56% : 0.000002s : 5: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.03% : 0.000003s : 22: predicate.environ_get_depend_swap 1.64% : 0.000005s : 32: predicate.environ_get_eliminate 1.19% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.27% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.16% : 0.000006s : 25: predicate.float_depend_g_call 0.51% : 0.000001s : 10: predicate.float_environ_get_switch 0.72% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 5: predicate.fold_const_symbol 0.68% : 0.000002s : 10: predicate.get_grad_eliminate 0.18% : 0.000001s : 5: predicate.graph_param_transform 0.61% : 0.000002s : 10: predicate.incorporate_call 0.46% : 0.000001s : 10: predicate.incorporate_call_switch 6.28% : 0.000018s : 72: predicate.inline 0.70% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.12% : 0.000003s : 10: predicate.less_batch_normalization 1.62% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.36% : 0.000007s : 48: predicate.load_eliminater 1.47% : 0.000004s : 5: predicate.loop_unroll_after_grad 2.00% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.97% : 0.000006s : 27: predicate.make_slice_get_slice_eliminator 0.58% : 0.000002s : 10: predicate.merge_addn 0.53% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.56% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 17: predicate.minmaximum_grad 1.31% : 0.000004s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.39% : 0.000001s : 5: predicate.parallel_virtual_node 1.84% : 0.000005s : 25: predicate.partial_defer_inline 1.52% : 0.000004s : 26: predicate.partial_eliminate 0.94% : 0.000003s : 17: predicate.print_const_string_wrapper 0.59% : 0.000002s : 10: predicate.reduce_all_const_elim 1.13% : 0.000003s : 17: predicate.reduce_eliminate 2.50% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000001s : 10: predicate.remove_not_recompute_node 1.41% : 0.000004s : 31: predicate.replace_applicator 0.55% : 0.000002s : 10: predicate.replace_old_param 0.44% : 0.000001s : 5: predicate.reset_defer_inline 0.96% : 0.000003s : 17: predicate.reshape_eliminate 0.63% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 5: predicate.row_tensor_eliminate 0.97% : 0.000003s : 10: predicate.same_eliminate 0.55% : 0.000002s : 10: predicate.set_cell_output_no_recompute 0.96% : 0.000003s : 10: predicate.shard_identity_eliminate 0.76% : 0.000002s : 10: predicate.special_op_eliminate 0.73% : 0.000002s : 10: predicate.specialize_transform 1.14% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.37% : 0.000004s : 25: predicate.switch_defer_inline 1.82% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.48% : 0.000013s : 76: predicate.switch_simplify 0.90% : 0.000003s : 17: predicate.tile_eliminate 0.81% : 0.000002s : 17: predicate.transpose_eliminate 1.58% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.29% : 0.000010s : 41: predicate.tuple_list_get_item_eliminator 1.42% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.61% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.31% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.06% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 5: predicate.value_based_eliminate 0.67% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.56% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.70% : 0.000002s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.220765 11 99.83% : 0.220387s : 5: func_graph_cloner_run.FuncGraphClonerGraph 0.17% : 0.000378s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.253909 192 0.00% : 0.000005s : 1: ForceFp32Comm 1.61% : 0.004078s : 1: add_attr 1.60% : 0.004061s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000082s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.03% : 0.000088s : 1: auto_monad 0.01% : 0.000030s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.16% : 0.000416s : 1: bootstrap 0.02% : 0.000042s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000024s : 1: control_data_broadcast_order 0.00% : 0.000012s : 1: convert_after_rewriter 0.01% : 0.000037s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000013s : 1: environ_conv 0.01% : 0.000030s : 1: event_method 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000005s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.24% : 0.000620s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.35% : 0.000892s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 0.01% : 0.000026s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000028s : 1: opt.transform.mutable_eliminate 0.69% : 0.001758s : 78: opt.transform.opt_a 0.02% : 0.000050s : 1: opt.transform.opt_after_cconv 0.01% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000169s : 28: opt.transform.opt_b 0.03% : 0.000083s : 2: opt.transform.opt_trans_graph 0.02% : 0.000058s : 4: opt.transform.symbol_engine_opt 1.56% : 0.003970s : 1: opt_a 0.07% : 0.000174s : 1: opt_after_cconv 0.26% : 0.000661s : 1: opt_after_jit_grad 0.13% : 0.000339s : 1: opt_b 2.77% : 0.007031s : 1: optimize 0.01% : 0.000030s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000011s : 1: order_py_execute_after_rewriter 0.01% : 0.000033s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000007s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000049s : 1: pre_auto_parallel 0.02% : 0.000041s : 1: py_interpret_to_execute 0.01% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000025s : 1: remove_dup_value 0.24% : 0.000617s : 1: renormalize.infer 0.19% : 0.000491s : 1: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000055s : 1: rewriter_after_opt_a 0.04% : 0.000109s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000006s : 1: split_matmul_comm_elemetwise 0.00% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000120s : 1: symbol_engine_optimizer 0.05% : 0.000124s : 1: tuple_transform 89.44% : 0.227103s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:05.184.045 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:05.184.326 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.223924, [21] [bootstrap]: 0.00046177 [type_inference]: 0.210878 [event_method]: 2.247e-05 [auto_monad]: 7.626e-05 [graph_reusing]: 6.63e-06 [inline]: 3.18e-06 [add_attr]: 0.00389996, [1] [add_attr_with_inline]: 0.00388684, [1] [Cycle 1]: 8.408e-05, [2] [tag_attr]: 2.23e-05 [meta_addattr_fg_expand]: 6.71e-06 [parallel-infer-symbol]: 3.66001e-06 [pre_auto_parallel]: 3.791e-05 [insert-virtual-dataset]: 2.46998e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 2.32999e-06 [pipeline_split]: 1.80001e-06 [optimize]: 0.00705058, [53] [py_interpret_to_execute]: 3.566e-05 [rewriter_before_opt_a]: 9.594e-05 [opt_a]: 0.00394317, [2] [Cycle 1]: 0.00285465, [45] [expand_dump_flag]: 3.03e-06 [switch_simplify]: 4.403e-05 [loop_unroll]: 3.224e-05 [a_1]: 0.00073147 [with_stream_mark]: 1.759e-05 [recompute_prepare]: 1.223e-05 [updatestate_depend_eliminate]: 5.761e-05 [updatestate_assign_eliminate]: 4.07998e-06 [updatestate_loads_eliminate]: 3.67002e-06 [parameter_eliminate]: 2.34999e-06 [a_2]: 0.00015607 [accelerated_algorithm]: 2.532e-05 [shard]: 2.29999e-06 [meta_shard_fg_expand]: 2.76e-06 [shard_inline]: 1.033e-05 [merge_send_recv]: 9.15999e-06 [auto_parallel]: 8.33999e-06 [parallel]: 2.211e-05 [flash_sp]: 1.032e-05 [merge_comm]: 4.80999e-06 [allreduce_fusion]: 4.70001e-06 [matmul_add_comm_reduction]: 1.253e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 1.099e-05 [virtual_dataset]: 8.94998e-06 [get_grad_eliminate_]: 9.34e-06 [virtual_output]: 8.52e-06 [merge_forward]: 5.59998e-06 [cell_reuse_recompute_pass]: 1.85001e-06 [offload_activation]: 1.221e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.849e-05 [merge_recompute_call_nodes]: 1.86e-06 [before_grad]: 1.369e-05 [set_forward_comm_id_for_comm_node_pass]: 4.64998e-06 [meta_fg_expand]: 3.53999e-06 [flash_sp_send_recv_attached]: 5.74e-06 [receive_attached]: 2.46998e-06 [after_resolve]: 1.524e-05 [a_after_grad]: 1.336e-05 [renormalize]: 0.0009633 [add_forward_monad_depend]: 6.89001e-06 [auto_monad_grad]: 3.01999e-06 [auto_monad_eliminator]: 2.11e-05 [cse]: 3.934e-05 [a_3]: 8.389e-05 [Cycle 2]: 0.00106966, [45] [expand_dump_flag]: 2.22001e-06 [switch_simplify]: 9.93002e-06 [loop_unroll]: 8.60001e-06 [a_1]: 0.00020581 [with_stream_mark]: 1.458e-05 [recompute_prepare]: 8.64e-06 [updatestate_depend_eliminate]: 4.84e-06 [updatestate_assign_eliminate]: 4.17998e-06 [updatestate_loads_eliminate]: 3.37002e-06 [parameter_eliminate]: 1.84998e-06 [a_2]: 0.00013149 [accelerated_algorithm]: 1.304e-05 [shard]: 1.64998e-06 [meta_shard_fg_expand]: 2.29999e-06 [shard_inline]: 8.89998e-06 [merge_send_recv]: 9.77999e-06 [auto_parallel]: 8.78001e-06 [parallel]: 8.62e-06 [flash_sp]: 4.07e-06 [merge_comm]: 4.74e-06 [allreduce_fusion]: 3.97e-06 [matmul_add_comm_reduction]: 1.305e-05 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 1.042e-05 [virtual_dataset]: 8.22e-06 [get_grad_eliminate_]: 7.63001e-06 [virtual_output]: 8.55999e-06 [merge_forward]: 5.49998e-06 [cell_reuse_recompute_pass]: 2.02999e-06 [offload_activation]: 1.263e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.698e-05 [merge_recompute_call_nodes]: 1.34e-06 [before_grad]: 1.256e-05 [set_forward_comm_id_for_comm_node_pass]: 5.99999e-06 [meta_fg_expand]: 4.05e-06 [flash_sp_send_recv_attached]: 1.89999e-06 [receive_attached]: 2.16e-06 [after_resolve]: 1.327e-05 [a_after_grad]: 1.28e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 3.31001e-06 [auto_monad_grad]: 1.94e-06 [auto_monad_eliminator]: 1.319e-05 [cse]: 2.615e-05 [a_3]: 6.384e-05 [py_interpret_to_execute_after_opt_a]: 2.097e-05 [slice_cell_reuse_recomputed_activation]: 4.89998e-06 [rewriter_after_opt_a]: 5.714e-05 [convert_after_rewriter]: 1.169e-05 [order_py_execute_after_rewriter]: 1.023e-05 [mutable_eliminate]: 0.00081161 [opt_b]: 0.00041306, [1] [Cycle 1]: 0.00040144, [7] [b_1]: 0.00026249 [b_2]: 1.262e-05 [updatestate_depend_eliminate]: 1.216e-05 [updatestate_assign_eliminate]: 4.17e-06 [updatestate_loads_eliminate]: 3.42002e-06 [renormalize]: 8.00006e-07 [cse]: 3.718e-05 [optimize_parallel_all_gather_comm]: 2.71e-05 [overlap_param_gather]: 5.84e-06 [cconv]: 4.255e-05 [loop_unroll]: 0.00056277 [opt_after_cconv]: 0.00017018, [1] [Cycle 1]: 0.00015987, [7] [c_1]: 4.589e-05 [parameter_eliminate]: 7.05e-06 [updatestate_depend_eliminate]: 8.51002e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 3.45e-06 [cse]: 3.125e-05 [renormalize]: 5.69999e-07 [remove_dup_value]: 2.249e-05 [tuple_transform]: 0.00012199, [1] [Cycle 1]: 0.00011323, [4] [d_1]: 6.823e-05 [none_parameter_eliminate]: 1.84e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 9.29e-06 [partial_unused_args_eliminate]: 5.29998e-06 [add_recomputation]: 7.135e-05 [cse_after_recomputation]: 3.58e-05, [1] [Cycle 1]: 2.692e-05, [1] [cse]: 1.668e-05 [environ_conv]: 1.122e-05 [swap_dp_allreduce_reducescatter]: 9.95002e-06 [bias_add_comm_swap]: 6.02999e-06 [label_micro_interleaved_index]: 9.00001e-06 [label_fine_grained_interleaved_index]: 5.29e-06 [merge_cast_opt]: 3.97e-06 [slice_recompute_activation]: 4.90999e-06 [micro_interleaved_order_control]: 5.52001e-06 [assign_add_opt]: 4.1e-06 [ForceFp32Comm]: 3.51001e-06 [remove_cast_before_assign_add]: 4.05998e-06 [full_micro_interleaved_order_control]: 4.62e-06 [reorder_send_recv_between_fp_bp]: 6.54999e-06 [comm_op_add_attrs]: 4.08999e-06 [add_comm_op_reuse_tag]: 4.11001e-06 [interleave_split_concat_branches]: 4.42998e-06 [interleave_parallel_branches]: 4.12e-06 [overlap_opt_shard_in_pipeline]: 4.37998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.87e-06 [control_data_broadcast_order]: 2.211e-05 [grouped_pairwise_exchange_alltoall]: 4.89e-06 [offloading_packed_experts]: 8.45999e-06 [overlap_recompute_and_grad_model_parallel]: 1.014e-05 [overlap_grad_matmul_and_grad_allreduce]: 3.52002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.58999e-06 [overlap_recompute_comm]: 5.39e-06 [overlap_grad_ring_attention]: 8.64e-06 [overlap_grad_flash_sp]: 2.85e-05 [begin_end_overlap_inline]: 4.95001e-06 [split_matmul_comm_elemetwise]: 5.14e-06 [split_layernorm_comm]: 4.80999e-06 [handle_group_info]: 4.06001e-06 [symbol_engine_optimizer]: 0.00012527, [1] [Cycle 1]: 0.00011686, [6] [build]: 4.34997e-06 [elim_shapecalc]: 1.464e-05 [elim_not_effective]: 1.749e-05 [opt_reshape]: 9.41e-06 [fold_const_symbol]: 1.347e-05 [renormalize]: 2.59985e-07 [detach_backward]: 4.48001e-06 [pipeline_parallel_scheduler]: 1.89e-06 [auto_monad_reorder]: 2.69e-05 [get_jit_bprop_graph]: 1.91003e-06 [rewriter_after_jit_bprop_graph]: 6.31998e-06 [opt_after_jit_grad]: 0.00062865 [validate]: 5.177e-05 Sums bootstrap : 0.000462s : 0.21% type_inference : 0.210878s : 96.74% event_method : 0.000022s : 0.01% auto_monad : 0.000076s : 0.03% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000038s : 0.02% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000036s : 0.02% optimize.rewriter_before_opt_a : 0.000096s : 0.04% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000054s : 0.02% optimize.opt_a.loop_unroll : 0.000041s : 0.02% optimize.opt_a.a_1 : 0.000937s : 0.43% optimize.opt_a.with_stream_mark : 0.000032s : 0.01% optimize.opt_a.recompute_prepare : 0.000021s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000062s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000288s : 0.13% optimize.opt_a.accelerated_algorithm : 0.000038s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000019s : 0.01% optimize.opt_a.merge_send_recv : 0.000019s : 0.01% optimize.opt_a.auto_parallel : 0.000017s : 0.01% optimize.opt_a.parallel : 0.000031s : 0.01% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000010s : 0.00% optimize.opt_a.allreduce_fusion : 0.000009s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.01% optimize.opt_a.virtual_dataset : 0.000017s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.01% optimize.opt_a.virtual_output : 0.000017s : 0.01% optimize.opt_a.merge_forward : 0.000011s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000025s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000026s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.00% optimize.opt_a.meta_fg_expand : 0.000008s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000008s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000029s : 0.01% optimize.opt_a.a_after_grad : 0.000026s : 0.01% optimize.opt_a.renormalize : 0.000963s : 0.44% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.00% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.02% optimize.opt_a.cse : 0.000065s : 0.03% optimize.opt_a.a_3 : 0.000148s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000057s : 0.03% optimize.convert_after_rewriter : 0.000012s : 0.01% optimize.order_py_execute_after_rewriter : 0.000010s : 0.00% optimize.mutable_eliminate : 0.000812s : 0.37% optimize.opt_b.b_1 : 0.000262s : 0.12% optimize.opt_b.b_2 : 0.000013s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000037s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000027s : 0.01% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.000043s : 0.02% optimize.loop_unroll : 0.000563s : 0.26% optimize.opt_after_cconv.c_1 : 0.000046s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000031s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000022s : 0.01% optimize.tuple_transform.d_1 : 0.000068s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000071s : 0.03% optimize.cse_after_recomputation.cse : 0.000017s : 0.01% optimize.environ_conv : 0.000011s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.00% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000009s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000006s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000007s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000022s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000010s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000009s : 0.00% optimize.overlap_grad_flash_sp : 0.000029s : 0.01% optimize.begin_end_overlap_inline : 0.000005s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000005s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000027s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000629s : 0.29% validate : 0.000052s : 0.02% Time group info: ------[substitution.] 0.000280 45 13.78% : 0.000039s : 5: substitution.arithmetic_simplify 10.76% : 0.000030s : 3: substitution.cast_eliminate 0.84% : 0.000002s : 3: substitution.elim_not_effective 0.57% : 0.000002s : 3: substitution.fold_const_symbol 2.63% : 0.000007s : 5: substitution.graph_param_transform 55.43% : 0.000155s : 4: substitution.inline 1.93% : 0.000005s : 6: substitution.j_node_and_user_rematch 4.97% : 0.000014s : 2: substitution.less_batch_normalization 2.35% : 0.000007s : 6: substitution.remove_not_recompute_node 2.07% : 0.000006s : 4: substitution.replace_old_param 4.68% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.210815 2 99.56% : 0.209884s : 1: type_inference.infer 0.44% : 0.000931s : 1: type_inference.specialize ------[replace.] 0.000064 8 60.30% : 0.000038s : 4: replace.inline 39.70% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000163 8 93.10% : 0.000152s : 4: match.inline 6.90% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000267 1504 0.78% : 0.000002s : 15: predicate.accumulaten_eliminater 1.01% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 10: predicate.addn_check_dump 0.93% : 0.000002s : 15: predicate.addn_zero_filter 0.74% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.40% : 0.000006s : 25: predicate.arithmetic_simplify 0.92% : 0.000002s : 15: predicate.cast_eliminate 0.67% : 0.000002s : 10: predicate.check_bprop_eliminate 0.60% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.81% : 0.000002s : 10: predicate.depend_value_elim 0.81% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.95% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.05% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 5: predicate.elim_not_effective 0.47% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_depend_swap 1.67% : 0.000004s : 30: predicate.environ_get_eliminate 1.01% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.33% : 0.000004s : 23: predicate.exchange_switch_depend_value 2.07% : 0.000006s : 23: predicate.float_depend_g_call 0.55% : 0.000001s : 10: predicate.float_environ_get_switch 0.97% : 0.000003s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.74% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000000s : 5: predicate.graph_param_transform 0.68% : 0.000002s : 10: predicate.incorporate_call 0.51% : 0.000001s : 10: predicate.incorporate_call_switch 5.92% : 0.000016s : 68: predicate.inline 0.82% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.19% : 0.000003s : 10: predicate.less_batch_normalization 1.84% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.21% : 0.000006s : 44: predicate.load_eliminater 1.23% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.26% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.78% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 0.64% : 0.000002s : 10: predicate.merge_addn 0.60% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.65% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 15: predicate.minmaximum_grad 1.53% : 0.000004s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.69% : 0.000005s : 23: predicate.partial_defer_inline 1.46% : 0.000004s : 24: predicate.partial_eliminate 0.88% : 0.000002s : 15: predicate.print_const_string_wrapper 0.66% : 0.000002s : 10: predicate.reduce_all_const_elim 1.20% : 0.000003s : 15: predicate.reduce_eliminate 2.24% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000001s : 10: predicate.remove_not_recompute_node 1.45% : 0.000004s : 29: predicate.replace_applicator 0.45% : 0.000001s : 10: predicate.replace_old_param 0.29% : 0.000001s : 5: predicate.reset_defer_inline 0.85% : 0.000002s : 15: predicate.reshape_eliminate 0.66% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.50% : 0.000001s : 5: predicate.row_tensor_eliminate 0.98% : 0.000003s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 1.02% : 0.000003s : 10: predicate.shard_identity_eliminate 0.83% : 0.000002s : 10: predicate.special_op_eliminate 0.76% : 0.000002s : 10: predicate.specialize_transform 1.39% : 0.000004s : 10: predicate.split_environ_get_set_with_tuple_value 0.95% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.28% : 0.000003s : 23: predicate.switch_defer_inline 1.83% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.38% : 0.000012s : 74: predicate.switch_simplify 0.79% : 0.000002s : 15: predicate.tile_eliminate 0.88% : 0.000002s : 15: predicate.transpose_eliminate 1.71% : 0.000005s : 25: predicate.tuple_list_convert_item_index_to_positive 1.78% : 0.000005s : 25: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.33% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.30% : 0.000003s : 25: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.66% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.15% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.26% : 0.000009s : 54: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 5: predicate.value_based_eliminate 0.85% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.66% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.59% : 0.000002s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000743 11 54.28% : 0.000403s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.72% : 0.000340s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.237653 192 0.00% : 0.000007s : 1: ForceFp32Comm 1.65% : 0.003911s : 1: add_attr 1.64% : 0.003892s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.03% : 0.000075s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.04% : 0.000087s : 1: auto_monad 0.01% : 0.000035s : 1: auto_monad_reorder 0.00% : 0.000008s : 1: begin_end_overlap_inline 0.00% : 0.000010s : 1: bias_add_comm_swap 0.21% : 0.000510s : 1: bootstrap 0.02% : 0.000047s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000025s : 1: control_data_broadcast_order 0.01% : 0.000015s : 1: convert_after_rewriter 0.02% : 0.000039s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000026s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.01% : 0.000034s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000014s : 1: graph_reusing 0.00% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000010s : 1: inline 0.00% : 0.000008s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000012s : 1: label_micro_interleaved_index 0.24% : 0.000571s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.35% : 0.000820s : 1: mutable_eliminate 0.00% : 0.000011s : 1: offloading_packed_experts 0.01% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000028s : 1: opt.transform.mutable_eliminate 0.65% : 0.001540s : 78: opt.transform.opt_a 0.02% : 0.000044s : 1: opt.transform.opt_after_cconv 0.02% : 0.000037s : 1: opt.transform.opt_after_jit_grad 0.08% : 0.000181s : 28: opt.transform.opt_b 0.03% : 0.000075s : 2: opt.transform.opt_trans_graph 0.02% : 0.000051s : 4: opt.transform.symbol_engine_opt 1.66% : 0.003947s : 1: opt_a 0.07% : 0.000174s : 1: opt_after_cconv 0.27% : 0.000640s : 1: opt_after_jit_grad 0.18% : 0.000417s : 1: opt_b 3.17% : 0.007522s : 1: optimize 0.01% : 0.000031s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000013s : 1: order_py_execute_after_rewriter 0.01% : 0.000032s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000009s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000013s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000010s : 1: overlap_recompute_comm 0.00% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000008s : 1: parallel-infer-symbol-second 0.00% : 0.000009s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000045s : 1: pre_auto_parallel 0.02% : 0.000039s : 1: py_interpret_to_execute 0.01% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.01% : 0.000026s : 1: remove_dup_value 0.22% : 0.000531s : 1: renormalize.infer 0.18% : 0.000420s : 1: renormalize.specialize 0.00% : 0.000010s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000062s : 1: rewriter_after_opt_a 0.04% : 0.000099s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000007s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000128s : 1: symbol_engine_optimizer 0.05% : 0.000125s : 1: tuple_transform 88.75% : 0.210928s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:07.798.283 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.186452, [21] [bootstrap]: 0.00044933 [type_inference]: 0.175076 [event_method]: 2.36e-05 [auto_monad]: 8.061e-05 [graph_reusing]: 6.19001e-06 [inline]: 3.06999e-06 [add_attr]: 0.0040184, [1] [add_attr_with_inline]: 0.00400645, [1] [Cycle 1]: 7.929e-05, [2] [tag_attr]: 2.546e-05 [meta_addattr_fg_expand]: 6.54001e-06 [parallel-infer-symbol]: 5.22e-06 [pre_auto_parallel]: 4.302e-05 [insert-virtual-dataset]: 2.39999e-06 [parallel-infer-symbol-second]: 9.20001e-07 [dataset_repeat_opt]: 2.12001e-06 [pipeline_split]: 1.63002e-06 [optimize]: 0.00594542, [53] [py_interpret_to_execute]: 3.199e-05 [rewriter_before_opt_a]: 9.657e-05 [opt_a]: 0.00346114, [2] [Cycle 1]: 0.00262107, [45] [expand_dump_flag]: 2.91e-06 [switch_simplify]: 4.476e-05 [loop_unroll]: 3.162e-05 [a_1]: 0.00074351 [with_stream_mark]: 1.994e-05 [recompute_prepare]: 1.185e-05 [updatestate_depend_eliminate]: 5.05001e-06 [updatestate_assign_eliminate]: 4.2e-06 [updatestate_loads_eliminate]: 3.80998e-06 [parameter_eliminate]: 2.04999e-06 [a_2]: 0.00011442 [accelerated_algorithm]: 2.377e-05 [shard]: 1.89e-06 [meta_shard_fg_expand]: 2.59999e-06 [shard_inline]: 8.50001e-06 [merge_send_recv]: 9.56e-06 [auto_parallel]: 7.65e-06 [parallel]: 2.304e-05 [flash_sp]: 9.37999e-06 [merge_comm]: 5.30999e-06 [allreduce_fusion]: 4.12e-06 [matmul_add_comm_reduction]: 1.528e-05 [allreduce_slice_to_reducescatter]: 9.5999e-07 [virtual_shard_identity]: 1.17e-05 [virtual_dataset]: 8.09002e-06 [get_grad_eliminate_]: 7.73001e-06 [virtual_output]: 7.77998e-06 [merge_forward]: 5.86998e-06 [cell_reuse_recompute_pass]: 1.61002e-06 [offload_activation]: 1.193e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.702e-05 [merge_recompute_call_nodes]: 1.57001e-06 [before_grad]: 1.351e-05 [set_forward_comm_id_for_comm_node_pass]: 4.91997e-06 [meta_fg_expand]: 3.38e-06 [flash_sp_send_recv_attached]: 4.89e-06 [receive_attached]: 2.22999e-06 [after_resolve]: 1.573e-05 [a_after_grad]: 1.262e-05 [renormalize]: 0.00098634 [add_forward_monad_depend]: 6.94999e-06 [auto_monad_grad]: 2.64999e-06 [auto_monad_eliminator]: 2.049e-05 [cse]: 3.835e-05 [a_3]: 6.508e-05 [Cycle 2]: 0.00082854, [45] [expand_dump_flag]: 2.01e-06 [switch_simplify]: 9.71003e-06 [loop_unroll]: 7.80998e-06 [a_1]: 0.00019376 [with_stream_mark]: 1.352e-05 [recompute_prepare]: 8.77e-06 [updatestate_depend_eliminate]: 4.72e-06 [updatestate_assign_eliminate]: 3.66999e-06 [updatestate_loads_eliminate]: 3.13998e-06 [parameter_eliminate]: 1.47999e-06 [a_2]: 0.00010027 [accelerated_algorithm]: 1.123e-05 [shard]: 1.37e-06 [meta_shard_fg_expand]: 2.04999e-06 [shard_inline]: 8.70001e-06 [merge_send_recv]: 7.94002e-06 [auto_parallel]: 7.38e-06 [parallel]: 7.24001e-06 [flash_sp]: 3.85e-06 [merge_comm]: 4.13999e-06 [allreduce_fusion]: 4.13001e-06 [matmul_add_comm_reduction]: 1.049e-05 [allreduce_slice_to_reducescatter]: 5.00004e-07 [virtual_shard_identity]: 9.15999e-06 [virtual_dataset]: 7.25e-06 [get_grad_eliminate_]: 7.44002e-06 [virtual_output]: 6.98e-06 [merge_forward]: 3.98999e-06 [cell_reuse_recompute_pass]: 1.71e-06 [offload_activation]: 1.192e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.497e-05 [merge_recompute_call_nodes]: 1.24998e-06 [before_grad]: 1.238e-05 [set_forward_comm_id_for_comm_node_pass]: 5.09998e-06 [meta_fg_expand]: 2.83e-06 [flash_sp_send_recv_attached]: 1.53002e-06 [receive_attached]: 1.42999e-06 [after_resolve]: 1.326e-05 [a_after_grad]: 1.124e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.54001e-06 [auto_monad_grad]: 1.53002e-06 [auto_monad_eliminator]: 9.56e-06 [cse]: 2.315e-05 [a_3]: 4.83e-05 [py_interpret_to_execute_after_opt_a]: 1.484e-05 [slice_cell_reuse_recomputed_activation]: 2.31e-06 [rewriter_after_opt_a]: 4.776e-05 [convert_after_rewriter]: 7.83999e-06 [order_py_execute_after_rewriter]: 6.50002e-06 [mutable_eliminate]: 0.00068339 [opt_b]: 0.00029036, [1] [Cycle 1]: 0.00028254, [7] [b_1]: 0.00018514 [b_2]: 1.097e-05 [updatestate_depend_eliminate]: 9.27999e-06 [updatestate_assign_eliminate]: 3.55e-06 [updatestate_loads_eliminate]: 3.91999e-06 [renormalize]: 7.00005e-07 [cse]: 3.075e-05 [optimize_parallel_all_gather_comm]: 1.959e-05 [overlap_param_gather]: 1.82999e-06 [cconv]: 3.275e-05 [loop_unroll]: 0.00046879 [opt_after_cconv]: 0.00013187, [1] [Cycle 1]: 0.00012569, [7] [c_1]: 4.311e-05 [parameter_eliminate]: 4.36002e-06 [updatestate_depend_eliminate]: 7.92e-06 [updatestate_assign_eliminate]: 3.41001e-06 [updatestate_loads_eliminate]: 3.43999e-06 [cse]: 2.654e-05 [renormalize]: 7.30011e-07 [remove_dup_value]: 1.661e-05 [tuple_transform]: 0.00010012, [1] [Cycle 1]: 9.585e-05, [4] [d_1]: 6.607e-05 [none_parameter_eliminate]: 1.79e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 8.95001e-06 [partial_unused_args_eliminate]: 1.92001e-06 [add_recomputation]: 6.191e-05 [cse_after_recomputation]: 2.783e-05, [1] [Cycle 1]: 2.241e-05, [1] [cse]: 1.65e-05 [environ_conv]: 8.05e-06 [swap_dp_allreduce_reducescatter]: 6.53e-06 [bias_add_comm_swap]: 3.92998e-06 [label_micro_interleaved_index]: 4.47e-06 [label_fine_grained_interleaved_index]: 2.57001e-06 [merge_cast_opt]: 1.21997e-06 [slice_recompute_activation]: 2.09e-06 [micro_interleaved_order_control]: 2.18998e-06 [assign_add_opt]: 1.41998e-06 [ForceFp32Comm]: 8.10018e-07 [remove_cast_before_assign_add]: 1.00999e-06 [full_micro_interleaved_order_control]: 2.63998e-06 [reorder_send_recv_between_fp_bp]: 2.98e-06 [comm_op_add_attrs]: 1.09998e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.19e-06 [interleave_parallel_branches]: 1.10001e-06 [overlap_opt_shard_in_pipeline]: 1.33002e-06 [overlap_opt_shard_grad_in_pipeline]: 1.86e-06 [control_data_broadcast_order]: 1.623e-05 [grouped_pairwise_exchange_alltoall]: 1.52999e-06 [offloading_packed_experts]: 5.15999e-06 [overlap_recompute_and_grad_model_parallel]: 5.30999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.53002e-06 [overlap_recompute_comm]: 2.66e-06 [overlap_grad_ring_attention]: 4.40999e-06 [overlap_grad_flash_sp]: 2.526e-05 [begin_end_overlap_inline]: 4.80009e-07 [split_matmul_comm_elemetwise]: 2.12999e-06 [split_layernorm_comm]: 1.59e-06 [handle_group_info]: 9.10019e-07 [symbol_engine_optimizer]: 9.303e-05, [1] [Cycle 1]: 8.879e-05, [6] [build]: 3.62002e-06 [elim_shapecalc]: 1.404e-05 [elim_not_effective]: 1.571e-05 [opt_reshape]: 9.57001e-06 [fold_const_symbol]: 1.327e-05 [renormalize]: 3.00002e-07 [detach_backward]: 2.19999e-06 [pipeline_parallel_scheduler]: 1.74e-06 [auto_monad_reorder]: 2.08e-05 [get_jit_bprop_graph]: 2.04e-06 [rewriter_after_jit_bprop_graph]: 4.59002e-06 [opt_after_jit_grad]: 0.00053179 [validate]: 4.922e-05 Sums bootstrap : 0.000449s : 0.25% type_inference : 0.175076s : 96.52% event_method : 0.000024s : 0.01% auto_monad : 0.000081s : 0.04% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000005s : 0.00% pre_auto_parallel : 0.000043s : 0.02% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000032s : 0.02% optimize.rewriter_before_opt_a : 0.000097s : 0.05% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000054s : 0.03% optimize.opt_a.loop_unroll : 0.000039s : 0.02% optimize.opt_a.a_1 : 0.000937s : 0.52% optimize.opt_a.with_stream_mark : 0.000033s : 0.02% optimize.opt_a.recompute_prepare : 0.000021s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000215s : 0.12% optimize.opt_a.accelerated_algorithm : 0.000035s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000017s : 0.01% optimize.opt_a.merge_send_recv : 0.000018s : 0.01% optimize.opt_a.auto_parallel : 0.000015s : 0.01% optimize.opt_a.parallel : 0.000030s : 0.02% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000009s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.01% optimize.opt_a.virtual_dataset : 0.000015s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.01% optimize.opt_a.virtual_output : 0.000015s : 0.01% optimize.opt_a.merge_forward : 0.000010s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000024s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000026s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000029s : 0.02% optimize.opt_a.a_after_grad : 0.000024s : 0.01% optimize.opt_a.renormalize : 0.000986s : 0.54% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.02% optimize.opt_a.cse : 0.000062s : 0.03% optimize.opt_a.a_3 : 0.000113s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000048s : 0.03% optimize.convert_after_rewriter : 0.000008s : 0.00% optimize.order_py_execute_after_rewriter : 0.000007s : 0.00% optimize.mutable_eliminate : 0.000683s : 0.38% optimize.opt_b.b_1 : 0.000185s : 0.10% optimize.opt_b.b_2 : 0.000011s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000031s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000033s : 0.02% optimize.loop_unroll : 0.000469s : 0.26% optimize.opt_after_cconv.c_1 : 0.000043s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000027s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.01% optimize.tuple_transform.d_1 : 0.000066s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000062s : 0.03% optimize.cse_after_recomputation.cse : 0.000016s : 0.01% optimize.environ_conv : 0.000008s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000016s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000025s : 0.01% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000532s : 0.29% validate : 0.000049s : 0.03% Time group info: ------[substitution.] 0.000287 45 11.15% : 0.000032s : 5: substitution.arithmetic_simplify 9.89% : 0.000028s : 3: substitution.cast_eliminate 0.73% : 0.000002s : 3: substitution.elim_not_effective 0.80% : 0.000002s : 3: substitution.fold_const_symbol 2.53% : 0.000007s : 5: substitution.graph_param_transform 59.20% : 0.000170s : 4: substitution.inline 1.75% : 0.000005s : 6: substitution.j_node_and_user_rematch 4.68% : 0.000013s : 2: substitution.less_batch_normalization 2.72% : 0.000008s : 6: substitution.remove_not_recompute_node 1.93% : 0.000006s : 4: substitution.replace_old_param 4.61% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.174984 2 99.35% : 0.173847s : 1: type_inference.infer 0.65% : 0.001136s : 1: type_inference.specialize ------[replace.] 0.000066 8 62.15% : 0.000041s : 4: replace.inline 37.85% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000179 8 93.59% : 0.000167s : 4: match.inline 6.41% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000241 1504 0.90% : 0.000002s : 15: predicate.accumulaten_eliminater 0.90% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.61% : 0.000001s : 10: predicate.addn_check_dump 0.90% : 0.000002s : 15: predicate.addn_zero_filter 0.78% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.41% : 0.000006s : 25: predicate.arithmetic_simplify 0.91% : 0.000002s : 15: predicate.cast_eliminate 0.64% : 0.000002s : 10: predicate.check_bprop_eliminate 0.58% : 0.000001s : 10: predicate.compare_switch_simplify 0.21% : 0.000001s : 5: predicate.const_output_eliminate 0.66% : 0.000002s : 10: predicate.depend_value_elim 0.88% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.96% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.10% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 5: predicate.elim_not_effective 0.41% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.05% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.04% : 0.000003s : 20: predicate.environ_get_depend_swap 1.72% : 0.000004s : 30: predicate.environ_get_eliminate 1.07% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.33% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.32% : 0.000006s : 23: predicate.float_depend_g_call 0.52% : 0.000001s : 10: predicate.float_environ_get_switch 0.83% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.68% : 0.000002s : 10: predicate.get_grad_eliminate 0.29% : 0.000001s : 5: predicate.graph_param_transform 0.67% : 0.000002s : 10: predicate.incorporate_call 0.55% : 0.000001s : 10: predicate.incorporate_call_switch 6.31% : 0.000015s : 68: predicate.inline 0.85% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.89% : 0.000002s : 10: predicate.less_batch_normalization 1.82% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.55% : 0.000006s : 44: predicate.load_eliminater 0.94% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.11% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.57% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.60% : 0.000001s : 10: predicate.merge_addn 0.63% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 15: predicate.minmaximum_grad 1.24% : 0.000003s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.41% : 0.000001s : 5: predicate.parallel_virtual_node 1.76% : 0.000004s : 23: predicate.partial_defer_inline 1.66% : 0.000004s : 24: predicate.partial_eliminate 0.94% : 0.000002s : 15: predicate.print_const_string_wrapper 0.58% : 0.000001s : 10: predicate.reduce_all_const_elim 1.15% : 0.000003s : 15: predicate.reduce_eliminate 2.39% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000001s : 10: predicate.remove_not_recompute_node 1.43% : 0.000003s : 29: predicate.replace_applicator 0.53% : 0.000001s : 10: predicate.replace_old_param 0.52% : 0.000001s : 5: predicate.reset_defer_inline 0.88% : 0.000002s : 15: predicate.reshape_eliminate 0.66% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 5: predicate.row_tensor_eliminate 0.75% : 0.000002s : 10: predicate.same_eliminate 0.44% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.87% : 0.000002s : 10: predicate.shard_identity_eliminate 0.89% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 0.94% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.47% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.46% : 0.000004s : 23: predicate.switch_defer_inline 2.01% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.93% : 0.000012s : 74: predicate.switch_simplify 0.86% : 0.000002s : 15: predicate.tile_eliminate 0.94% : 0.000002s : 15: predicate.transpose_eliminate 1.49% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.38% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.19% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.51% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.12% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.80% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.44% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.21% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.49% : 0.000001s : 5: predicate.value_based_eliminate 0.69% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 10: predicate.virtual_output_eliminate 0.31% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000759 11 44.68% : 0.000339s : 5: func_graph_cloner_run.FuncGraphClonerGraph 55.32% : 0.000420s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.199157 192 0.00% : 0.000004s : 1: ForceFp32Comm 2.02% : 0.004025s : 1: add_attr 2.01% : 0.004011s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000066s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000088s : 1: auto_monad 0.01% : 0.000025s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.24% : 0.000482s : 1: bootstrap 0.02% : 0.000036s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000019s : 1: control_data_broadcast_order 0.01% : 0.000011s : 1: convert_after_rewriter 0.02% : 0.000031s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.02% : 0.000031s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000003s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.24% : 0.000477s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.35% : 0.000695s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000023s : 1: opt.transform.mutable_eliminate 0.76% : 0.001507s : 78: opt.transform.opt_a 0.02% : 0.000042s : 1: opt.transform.opt_after_cconv 0.02% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.08% : 0.000162s : 28: opt.transform.opt_b 0.04% : 0.000072s : 2: opt.transform.opt_trans_graph 0.02% : 0.000049s : 4: opt.transform.symbol_engine_opt 1.74% : 0.003465s : 1: opt_a 0.07% : 0.000136s : 1: opt_after_cconv 0.27% : 0.000542s : 1: opt_after_jit_grad 0.15% : 0.000294s : 1: opt_b 2.99% : 0.005951s : 1: optimize 0.01% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000010s : 1: order_py_execute_after_rewriter 0.01% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000009s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000047s : 1: pre_auto_parallel 0.02% : 0.000036s : 1: py_interpret_to_execute 0.01% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 0.28% : 0.000551s : 1: renormalize.infer 0.21% : 0.000424s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000053s : 1: rewriter_after_opt_a 0.05% : 0.000100s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000096s : 1: symbol_engine_optimizer 0.05% : 0.000103s : 1: tuple_transform 87.92% : 0.175106s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:10.415.578 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:10.415.883 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0182947, [21] [bootstrap]: 0.0004344 [type_inference]: 0.00612453 [event_method]: 2.088e-05 [auto_monad]: 7.402e-05 [graph_reusing]: 6.24001e-06 [inline]: 1.90001e-06 [add_attr]: 0.00350542, [1] [add_attr_with_inline]: 0.00349329, [1] [Cycle 1]: 8.946e-05, [2] [tag_attr]: 2.389e-05 [meta_addattr_fg_expand]: 5.99e-06 [parallel-infer-symbol]: 3.58e-06 [pre_auto_parallel]: 4.186e-05 [insert-virtual-dataset]: 2.36998e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 1.89999e-06 [pipeline_split]: 1.80001e-06 [optimize]: 0.00667205, [53] [py_interpret_to_execute]: 3.655e-05 [rewriter_before_opt_a]: 9.783e-05 [opt_a]: 0.00381349, [2] [Cycle 1]: 0.00280388, [45] [expand_dump_flag]: 3.46001e-06 [switch_simplify]: 4.614e-05 [loop_unroll]: 3.229e-05 [a_1]: 0.00073195 [with_stream_mark]: 1.966e-05 [recompute_prepare]: 1.121e-05 [updatestate_depend_eliminate]: 4.75001e-06 [updatestate_assign_eliminate]: 4.24002e-06 [updatestate_loads_eliminate]: 4.55999e-06 [parameter_eliminate]: 2.46e-06 [a_2]: 0.00014223 [accelerated_algorithm]: 2.353e-05 [shard]: 2.16e-06 [meta_shard_fg_expand]: 2.38998e-06 [shard_inline]: 9.05999e-06 [merge_send_recv]: 1.035e-05 [auto_parallel]: 9.47001e-06 [parallel]: 2.039e-05 [flash_sp]: 8.01001e-06 [merge_comm]: 4.83001e-06 [allreduce_fusion]: 5.94e-06 [matmul_add_comm_reduction]: 1.277e-05 [allreduce_slice_to_reducescatter]: 1.07e-06 [virtual_shard_identity]: 1.062e-05 [virtual_dataset]: 9.51003e-06 [get_grad_eliminate_]: 1.862e-05 [virtual_output]: 1.016e-05 [merge_forward]: 5.00999e-06 [cell_reuse_recompute_pass]: 2.11003e-06 [offload_activation]: 1.329e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.154e-05 [merge_recompute_call_nodes]: 1.40999e-06 [before_grad]: 1.427e-05 [set_forward_comm_id_for_comm_node_pass]: 6.11e-06 [meta_fg_expand]: 3.6e-06 [flash_sp_send_recv_attached]: 5.29e-06 [receive_attached]: 2.11e-06 [after_resolve]: 1.595e-05 [a_after_grad]: 1.412e-05 [renormalize]: 0.00095085 [add_forward_monad_depend]: 7.32002e-06 [auto_monad_grad]: 3.01001e-06 [auto_monad_eliminator]: 2.046e-05 [cse]: 3.938e-05 [a_3]: 7.686e-05 [Cycle 2]: 0.00099331, [45] [expand_dump_flag]: 2.22001e-06 [switch_simplify]: 1.001e-05 [loop_unroll]: 7.83001e-06 [a_1]: 0.00019936 [with_stream_mark]: 1.26e-05 [recompute_prepare]: 8.06001e-06 [updatestate_depend_eliminate]: 4.67e-06 [updatestate_assign_eliminate]: 3.53e-06 [updatestate_loads_eliminate]: 3.14001e-06 [parameter_eliminate]: 1.40999e-06 [a_2]: 0.00012846 [accelerated_algorithm]: 1.163e-05 [shard]: 1.11002e-06 [meta_shard_fg_expand]: 1.73002e-06 [shard_inline]: 8.07e-06 [merge_send_recv]: 6.49999e-06 [auto_parallel]: 7.93001e-06 [parallel]: 6.79001e-06 [flash_sp]: 4.28999e-06 [merge_comm]: 4.23999e-06 [allreduce_fusion]: 3.85e-06 [matmul_add_comm_reduction]: 1.085e-05 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 9.12001e-06 [virtual_dataset]: 7.48e-06 [get_grad_eliminate_]: 7.6e-06 [virtual_output]: 7.20003e-06 [merge_forward]: 4.13001e-06 [cell_reuse_recompute_pass]: 3.11001e-06 [offload_activation]: 9.05001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.609e-05 [merge_recompute_call_nodes]: 8.00006e-07 [before_grad]: 1.248e-05 [set_forward_comm_id_for_comm_node_pass]: 4.84003e-06 [meta_fg_expand]: 2.75002e-06 [flash_sp_send_recv_attached]: 1.37e-06 [receive_attached]: 1.52999e-06 [after_resolve]: 1.327e-05 [a_after_grad]: 1.17e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.37e-06 [auto_monad_grad]: 1.67999e-06 [auto_monad_eliminator]: 9.98998e-06 [cse]: 2.025e-05 [a_3]: 6.068e-05 [py_interpret_to_execute_after_opt_a]: 1.706e-05 [slice_cell_reuse_recomputed_activation]: 5.09e-06 [rewriter_after_opt_a]: 4.783e-05 [convert_after_rewriter]: 1.034e-05 [order_py_execute_after_rewriter]: 9.29e-06 [mutable_eliminate]: 0.00070243 [opt_b]: 0.00038714, [1] [Cycle 1]: 0.0003755, [7] [b_1]: 0.0002452 [b_2]: 1.159e-05 [updatestate_depend_eliminate]: 8.05999e-06 [updatestate_assign_eliminate]: 3.86999e-06 [updatestate_loads_eliminate]: 3.09999e-06 [renormalize]: 6.09987e-07 [cse]: 2.656e-05 [optimize_parallel_all_gather_comm]: 2.446e-05 [overlap_param_gather]: 6.68e-06 [cconv]: 3.434e-05 [loop_unroll]: 0.00053171 [opt_after_cconv]: 0.00015942, [1] [Cycle 1]: 0.00014958, [7] [c_1]: 4.576e-05 [parameter_eliminate]: 5.13002e-06 [updatestate_depend_eliminate]: 7.48999e-06 [updatestate_assign_eliminate]: 3.26001e-06 [updatestate_loads_eliminate]: 3.66001e-06 [cse]: 2.545e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 2.037e-05 [tuple_transform]: 0.0001156, [1] [Cycle 1]: 0.00010826, [4] [d_1]: 6.492e-05 [none_parameter_eliminate]: 1.60999e-06 [renormalize]: 4.30009e-07 [switch_simplify]: 9.09e-06 [partial_unused_args_eliminate]: 5.20999e-06 [add_recomputation]: 6.533e-05 [cse_after_recomputation]: 3.433e-05, [1] [Cycle 1]: 2.661e-05, [1] [cse]: 1.686e-05 [environ_conv]: 9.82999e-06 [swap_dp_allreduce_reducescatter]: 9.24e-06 [bias_add_comm_swap]: 5.84999e-06 [label_micro_interleaved_index]: 8.40999e-06 [label_fine_grained_interleaved_index]: 5.19e-06 [merge_cast_opt]: 3.93001e-06 [slice_recompute_activation]: 5.46e-06 [micro_interleaved_order_control]: 5.27001e-06 [assign_add_opt]: 4.00998e-06 [ForceFp32Comm]: 3.65e-06 [remove_cast_before_assign_add]: 3.55e-06 [full_micro_interleaved_order_control]: 4.49002e-06 [reorder_send_recv_between_fp_bp]: 6.16e-06 [comm_op_add_attrs]: 3.85e-06 [add_comm_op_reuse_tag]: 4.12998e-06 [interleave_split_concat_branches]: 3.75998e-06 [interleave_parallel_branches]: 3.95998e-06 [overlap_opt_shard_in_pipeline]: 3.73999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.70999e-06 [control_data_broadcast_order]: 1.878e-05 [grouped_pairwise_exchange_alltoall]: 4.2e-06 [offloading_packed_experts]: 7.83999e-06 [overlap_recompute_and_grad_model_parallel]: 8.70001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.58999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.77998e-06 [overlap_recompute_comm]: 5.64998e-06 [overlap_grad_ring_attention]: 7.60998e-06 [overlap_grad_flash_sp]: 2.657e-05 [begin_end_overlap_inline]: 4.08001e-06 [split_matmul_comm_elemetwise]: 5.38002e-06 [split_layernorm_comm]: 4.94998e-06 [handle_group_info]: 3.7e-06 [symbol_engine_optimizer]: 0.00012251, [1] [Cycle 1]: 0.00011401, [6] [build]: 3.97e-06 [elim_shapecalc]: 1.485e-05 [elim_not_effective]: 1.785e-05 [opt_reshape]: 9.86e-06 [fold_const_symbol]: 1.291e-05 [renormalize]: 2.19996e-07 [detach_backward]: 4.64002e-06 [pipeline_parallel_scheduler]: 2.27999e-06 [auto_monad_reorder]: 2.539e-05 [get_jit_bprop_graph]: 1.59e-06 [rewriter_after_jit_bprop_graph]: 4.43001e-06 [opt_after_jit_grad]: 0.00057204 [validate]: 4.345e-05 Sums bootstrap : 0.000434s : 3.40% type_inference : 0.006125s : 47.93% event_method : 0.000021s : 0.16% auto_monad : 0.000074s : 0.58% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000042s : 0.33% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000037s : 0.29% optimize.rewriter_before_opt_a : 0.000098s : 0.77% optimize.opt_a.expand_dump_flag : 0.000006s : 0.04% optimize.opt_a.switch_simplify : 0.000056s : 0.44% optimize.opt_a.loop_unroll : 0.000040s : 0.31% optimize.opt_a.a_1 : 0.000931s : 7.29% optimize.opt_a.with_stream_mark : 0.000032s : 0.25% optimize.opt_a.recompute_prepare : 0.000019s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000271s : 2.12% optimize.opt_a.accelerated_algorithm : 0.000035s : 0.28% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000017s : 0.13% optimize.opt_a.merge_send_recv : 0.000017s : 0.13% optimize.opt_a.auto_parallel : 0.000017s : 0.14% optimize.opt_a.parallel : 0.000027s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.07% optimize.opt_a.allreduce_fusion : 0.000010s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.15% optimize.opt_a.virtual_dataset : 0.000017s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000026s : 0.21% optimize.opt_a.virtual_output : 0.000017s : 0.14% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.04% optimize.opt_a.offload_activation : 0.000022s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.05% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.23% optimize.opt_a.a_after_grad : 0.000026s : 0.20% optimize.opt_a.renormalize : 0.000951s : 7.44% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.24% optimize.opt_a.cse : 0.000060s : 0.47% optimize.opt_a.a_3 : 0.000138s : 1.08% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000048s : 0.37% optimize.convert_after_rewriter : 0.000010s : 0.08% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000702s : 5.50% optimize.opt_b.b_1 : 0.000245s : 1.92% optimize.opt_b.b_2 : 0.000012s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.19% optimize.overlap_param_gather : 0.000007s : 0.05% optimize.cconv : 0.000034s : 0.27% optimize.loop_unroll : 0.000532s : 4.16% optimize.opt_after_cconv.c_1 : 0.000046s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000025s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.16% optimize.tuple_transform.d_1 : 0.000065s : 0.51% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000065s : 0.51% optimize.cse_after_recomputation.cse : 0.000017s : 0.13% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000019s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000006s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.06% optimize.overlap_grad_flash_sp : 0.000027s : 0.21% optimize.begin_end_overlap_inline : 0.000004s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000005s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000025s : 0.20% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000572s : 4.48% validate : 0.000043s : 0.34% Time group info: ------[substitution.] 0.000277 45 11.97% : 0.000033s : 5: substitution.arithmetic_simplify 10.75% : 0.000030s : 3: substitution.cast_eliminate 0.86% : 0.000002s : 3: substitution.elim_not_effective 0.57% : 0.000002s : 3: substitution.fold_const_symbol 2.57% : 0.000007s : 5: substitution.graph_param_transform 57.04% : 0.000158s : 4: substitution.inline 1.68% : 0.000005s : 6: substitution.j_node_and_user_rematch 5.09% : 0.000014s : 2: substitution.less_batch_normalization 2.46% : 0.000007s : 6: substitution.remove_not_recompute_node 2.18% : 0.000006s : 4: substitution.replace_old_param 4.84% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006063 2 86.01% : 0.005215s : 1: type_inference.infer 13.99% : 0.000848s : 1: type_inference.specialize ------[replace.] 0.000067 8 62.02% : 0.000041s : 4: replace.inline 37.98% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000167 8 93.06% : 0.000155s : 4: match.inline 6.94% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000249 1504 0.87% : 0.000002s : 15: predicate.accumulaten_eliminater 0.81% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 10: predicate.addn_check_dump 0.88% : 0.000002s : 15: predicate.addn_zero_filter 0.82% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.41% : 0.000006s : 25: predicate.arithmetic_simplify 0.91% : 0.000002s : 15: predicate.cast_eliminate 0.66% : 0.000002s : 10: predicate.check_bprop_eliminate 0.66% : 0.000002s : 10: predicate.compare_switch_simplify 0.21% : 0.000001s : 5: predicate.const_output_eliminate 0.60% : 0.000001s : 10: predicate.depend_value_elim 0.95% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.01% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.02% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.45% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.03% : 0.000003s : 20: predicate.environ_get_depend_swap 1.83% : 0.000005s : 30: predicate.environ_get_eliminate 1.07% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.30% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.41% : 0.000006s : 23: predicate.float_depend_g_call 0.54% : 0.000001s : 10: predicate.float_environ_get_switch 0.82% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.71% : 0.000002s : 10: predicate.get_grad_eliminate 0.29% : 0.000001s : 5: predicate.graph_param_transform 0.63% : 0.000002s : 10: predicate.incorporate_call 0.53% : 0.000001s : 10: predicate.incorporate_call_switch 6.82% : 0.000017s : 68: predicate.inline 0.93% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.99% : 0.000002s : 10: predicate.less_batch_normalization 1.74% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.36% : 0.000006s : 44: predicate.load_eliminater 0.97% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.25% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.80% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.60% : 0.000001s : 10: predicate.merge_addn 0.72% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 15: predicate.minmaximum_grad 1.30% : 0.000003s : 5: predicate.mutable_eliminate 0.41% : 0.000001s : 5: predicate.opt_reshape 0.43% : 0.000001s : 5: predicate.parallel_virtual_node 1.61% : 0.000004s : 23: predicate.partial_defer_inline 1.62% : 0.000004s : 24: predicate.partial_eliminate 0.80% : 0.000002s : 15: predicate.print_const_string_wrapper 0.56% : 0.000001s : 10: predicate.reduce_all_const_elim 1.05% : 0.000003s : 15: predicate.reduce_eliminate 2.37% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 10: predicate.remove_not_recompute_node 1.34% : 0.000003s : 29: predicate.replace_applicator 0.58% : 0.000001s : 10: predicate.replace_old_param 0.35% : 0.000001s : 5: predicate.reset_defer_inline 0.95% : 0.000002s : 15: predicate.reshape_eliminate 0.64% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 5: predicate.row_tensor_eliminate 0.94% : 0.000002s : 10: predicate.same_eliminate 0.41% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 10: predicate.shard_identity_eliminate 0.87% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 1.08% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.47% : 0.000004s : 23: predicate.switch_defer_inline 2.06% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.87% : 0.000012s : 74: predicate.switch_simplify 0.83% : 0.000002s : 15: predicate.tile_eliminate 0.82% : 0.000002s : 15: predicate.transpose_eliminate 1.43% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.41% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.38% : 0.000003s : 25: predicate.tuple_list_get_set_item_eliminator 2.14% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.66% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.29% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.09% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 5: predicate.value_based_eliminate 0.62% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.62% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000629 11 48.32% : 0.000304s : 5: func_graph_cloner_run.FuncGraphClonerGraph 51.68% : 0.000325s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.031205 192 0.02% : 0.000007s : 1: ForceFp32Comm 11.27% : 0.003515s : 1: add_attr 11.21% : 0.003497s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.22% : 0.000069s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.27% : 0.000085s : 1: auto_monad 0.11% : 0.000033s : 1: auto_monad_reorder 0.02% : 0.000007s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.55% : 0.000484s : 1: bootstrap 0.12% : 0.000038s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000022s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.12% : 0.000038s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000024s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.10% : 0.000031s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000007s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.73% : 0.000538s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.28% : 0.000710s : 1: mutable_eliminate 0.03% : 0.000011s : 1: offloading_packed_experts 0.07% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000021s : 1: opt.transform.mutable_eliminate 4.89% : 0.001525s : 78: opt.transform.opt_a 0.14% : 0.000044s : 1: opt.transform.opt_after_cconv 0.11% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.52% : 0.000161s : 28: opt.transform.opt_b 0.23% : 0.000071s : 2: opt.transform.opt_trans_graph 0.17% : 0.000052s : 4: opt.transform.symbol_engine_opt 12.23% : 0.003817s : 1: opt_a 0.52% : 0.000163s : 1: opt_after_cconv 1.87% : 0.000582s : 1: opt_after_jit_grad 1.25% : 0.000391s : 1: opt_b 22.92% : 0.007152s : 1: optimize 0.09% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.09% : 0.000030s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000010s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000009s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000011s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.16% : 0.000051s : 1: pre_auto_parallel 0.13% : 0.000040s : 1: py_interpret_to_execute 0.07% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000024s : 1: remove_dup_value 1.70% : 0.000529s : 1: renormalize.infer 1.32% : 0.000412s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000051s : 1: rewriter_after_opt_a 0.33% : 0.000102s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000008s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000126s : 1: symbol_engine_optimizer 0.38% : 0.000118s : 1: tuple_transform 19.76% : 0.006165s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:13.266.680 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.266485, [21] [bootstrap]: 0.00054218 [type_inference]: 0.0193701 [event_method]: 2.332e-05 [auto_monad]: 7.918e-05 [graph_reusing]: 7.54002e-06 [inline]: 3.36999e-06 [add_attr]: 0.00370173, [1] [add_attr_with_inline]: 0.00369018, [1] [Cycle 1]: 7.451e-05, [2] [tag_attr]: 2.373e-05 [meta_addattr_fg_expand]: 6.07001e-06 [parallel-infer-symbol]: 3.58e-06 [pre_auto_parallel]: 4.339e-05 [insert-virtual-dataset]: 2.31e-06 [parallel-infer-symbol-second]: 1.87999e-06 [dataset_repeat_opt]: 2.07999e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.241761, [53] [py_interpret_to_execute]: 3.474e-05 [rewriter_before_opt_a]: 9.712e-05 [opt_a]: 0.238914, [2] [Cycle 1]: 0.237929, [45] [expand_dump_flag]: 2.82002e-06 [switch_simplify]: 4.51e-05 [loop_unroll]: 3.167e-05 [a_1]: 0.00074207 [with_stream_mark]: 2.229e-05 [recompute_prepare]: 1.096e-05 [updatestate_depend_eliminate]: 5.10001e-06 [updatestate_assign_eliminate]: 3.78001e-06 [updatestate_loads_eliminate]: 3.33e-06 [parameter_eliminate]: 2.07001e-06 [a_2]: 0.00011457 [accelerated_algorithm]: 2.35e-05 [shard]: 2.53e-06 [meta_shard_fg_expand]: 2.29001e-06 [shard_inline]: 8.02e-06 [merge_send_recv]: 9.24998e-06 [auto_parallel]: 8.25e-06 [parallel]: 2.185e-05 [flash_sp]: 8.47e-06 [merge_comm]: 5.46002e-06 [allreduce_fusion]: 4.85001e-06 [matmul_add_comm_reduction]: 1.209e-05 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 1.208e-05 [virtual_dataset]: 9.41e-06 [get_grad_eliminate_]: 7.9e-06 [virtual_output]: 7.84997e-06 [merge_forward]: 4.70001e-06 [cell_reuse_recompute_pass]: 1.57999e-06 [offload_activation]: 1.14e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.693e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.352e-05 [set_forward_comm_id_for_comm_node_pass]: 4.52e-06 [meta_fg_expand]: 3.4e-06 [flash_sp_send_recv_attached]: 4.75001e-06 [receive_attached]: 2.31e-06 [after_resolve]: 1.515e-05 [a_after_grad]: 1.414e-05 [renormalize]: 0.236245 [add_forward_monad_depend]: 1.17e-05 [auto_monad_grad]: 2.68998e-06 [auto_monad_eliminator]: 2.7e-05 [cse]: 4.253e-05 [a_3]: 7.865e-05 [Cycle 2]: 0.00097073, [45] [expand_dump_flag]: 2.64001e-06 [switch_simplify]: 1.149e-05 [loop_unroll]: 8.50001e-06 [a_1]: 0.00022262 [with_stream_mark]: 1.932e-05 [recompute_prepare]: 8.94e-06 [updatestate_depend_eliminate]: 5.04e-06 [updatestate_assign_eliminate]: 4.08001e-06 [updatestate_loads_eliminate]: 3.99002e-06 [parameter_eliminate]: 2.00002e-06 [a_2]: 0.00010288 [accelerated_algorithm]: 1.428e-05 [shard]: 2.37001e-06 [meta_shard_fg_expand]: 2.63003e-06 [shard_inline]: 7.95998e-06 [merge_send_recv]: 1.079e-05 [auto_parallel]: 1.07e-05 [parallel]: 9.41998e-06 [flash_sp]: 4.53001e-06 [merge_comm]: 4.29997e-06 [allreduce_fusion]: 4.43001e-06 [matmul_add_comm_reduction]: 1.337e-05 [allreduce_slice_to_reducescatter]: 8.30012e-07 [virtual_shard_identity]: 6.672e-05 [virtual_dataset]: 8.35001e-06 [get_grad_eliminate_]: 7.67002e-06 [virtual_output]: 7.64002e-06 [merge_forward]: 5.90002e-06 [cell_reuse_recompute_pass]: 3.45e-06 [offload_activation]: 1.35e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.612e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.349e-05 [set_forward_comm_id_for_comm_node_pass]: 5.63997e-06 [meta_fg_expand]: 3.63e-06 [flash_sp_send_recv_attached]: 2.07999e-06 [receive_attached]: 2.14e-06 [after_resolve]: 1.528e-05 [a_after_grad]: 1.269e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.92001e-06 [auto_monad_grad]: 2.12999e-06 [auto_monad_eliminator]: 1.113e-05 [cse]: 2.221e-05 [a_3]: 4.751e-05 [py_interpret_to_execute_after_opt_a]: 2.048e-05 [slice_cell_reuse_recomputed_activation]: 2.17001e-06 [rewriter_after_opt_a]: 4.72e-05 [convert_after_rewriter]: 7.68001e-06 [order_py_execute_after_rewriter]: 6.36e-06 [mutable_eliminate]: 0.00081642 [opt_b]: 0.00029824, [1] [Cycle 1]: 0.00028963, [7] [b_1]: 0.00018997 [b_2]: 1.174e-05 [updatestate_depend_eliminate]: 1.033e-05 [updatestate_assign_eliminate]: 3.43e-06 [updatestate_loads_eliminate]: 3.35e-06 [renormalize]: 1.04998e-06 [cse]: 3.052e-05 [optimize_parallel_all_gather_comm]: 2.11e-05 [overlap_param_gather]: 1.94e-06 [cconv]: 3.268e-05 [loop_unroll]: 0.00052089 [opt_after_cconv]: 0.00013637, [1] [Cycle 1]: 0.00013017, [7] [c_1]: 4.546e-05 [parameter_eliminate]: 5.72001e-06 [updatestate_depend_eliminate]: 7.11001e-06 [updatestate_assign_eliminate]: 3.21999e-06 [updatestate_loads_eliminate]: 3.88999e-06 [cse]: 2.704e-05 [renormalize]: 6.09987e-07 [remove_dup_value]: 1.782e-05 [tuple_transform]: 0.00010422, [1] [Cycle 1]: 9.912e-05, [4] [d_1]: 6.707e-05 [none_parameter_eliminate]: 2.32001e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 9.96e-06 [partial_unused_args_eliminate]: 2.04999e-06 [add_recomputation]: 6.528e-05 [cse_after_recomputation]: 0.00012663, [1] [Cycle 1]: 0.00012058, [1] [cse]: 2.333e-05 [environ_conv]: 7.58001e-06 [swap_dp_allreduce_reducescatter]: 8.55999e-06 [bias_add_comm_swap]: 3.56001e-06 [label_micro_interleaved_index]: 5.25999e-06 [label_fine_grained_interleaved_index]: 2.74001e-06 [merge_cast_opt]: 1.51002e-06 [slice_recompute_activation]: 2.34999e-06 [micro_interleaved_order_control]: 2.74999e-06 [assign_add_opt]: 1.98997e-06 [ForceFp32Comm]: 9.00007e-07 [remove_cast_before_assign_add]: 1.07e-06 [full_micro_interleaved_order_control]: 2.07001e-06 [reorder_send_recv_between_fp_bp]: 2.96001e-06 [comm_op_add_attrs]: 1.04998e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.16002e-06 [interleave_parallel_branches]: 1.14e-06 [overlap_opt_shard_in_pipeline]: 1.39e-06 [overlap_opt_shard_grad_in_pipeline]: 1.81998e-06 [control_data_broadcast_order]: 1.664e-05 [grouped_pairwise_exchange_alltoall]: 1.91e-06 [offloading_packed_experts]: 5.66e-06 [overlap_recompute_and_grad_model_parallel]: 5.71e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.53002e-06 [overlap_recompute_comm]: 2.84001e-06 [overlap_grad_ring_attention]: 4.93001e-06 [overlap_grad_flash_sp]: 2.816e-05 [begin_end_overlap_inline]: 8.89995e-07 [split_matmul_comm_elemetwise]: 2.52001e-06 [split_layernorm_comm]: 2.51e-06 [handle_group_info]: 1.16002e-06 [symbol_engine_optimizer]: 0.00010967, [1] [Cycle 1]: 0.0001037, [6] [build]: 5.04e-06 [elim_shapecalc]: 1.644e-05 [elim_not_effective]: 1.884e-05 [opt_reshape]: 1.016e-05 [fold_const_symbol]: 1.394e-05 [renormalize]: 2.10013e-07 [detach_backward]: 2.26998e-06 [pipeline_parallel_scheduler]: 2.00002e-06 [auto_monad_reorder]: 2.335e-05 [get_jit_bprop_graph]: 3.43e-06 [rewriter_after_jit_bprop_graph]: 6.12999e-06 [opt_after_jit_grad]: 0.00066083 [validate]: 5.425e-05 Sums bootstrap : 0.000542s : 0.21% type_inference : 0.019370s : 7.41% event_method : 0.000023s : 0.01% auto_monad : 0.000079s : 0.03% graph_reusing : 0.000008s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000043s : 0.02% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000002s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000035s : 0.01% optimize.rewriter_before_opt_a : 0.000097s : 0.04% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000057s : 0.02% optimize.opt_a.loop_unroll : 0.000040s : 0.02% optimize.opt_a.a_1 : 0.000965s : 0.37% optimize.opt_a.with_stream_mark : 0.000042s : 0.02% optimize.opt_a.recompute_prepare : 0.000020s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000217s : 0.08% optimize.opt_a.accelerated_algorithm : 0.000038s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000016s : 0.01% optimize.opt_a.merge_send_recv : 0.000020s : 0.01% optimize.opt_a.auto_parallel : 0.000019s : 0.01% optimize.opt_a.parallel : 0.000031s : 0.01% optimize.opt_a.flash_sp : 0.000013s : 0.00% optimize.opt_a.merge_comm : 0.000010s : 0.00% optimize.opt_a.allreduce_fusion : 0.000009s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000079s : 0.03% optimize.opt_a.virtual_dataset : 0.000018s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.01% optimize.opt_a.virtual_output : 0.000015s : 0.01% optimize.opt_a.merge_forward : 0.000011s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000025s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000027s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000030s : 0.01% optimize.opt_a.a_after_grad : 0.000027s : 0.01% optimize.opt_a.renormalize : 0.236246s : 90.32% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000038s : 0.01% optimize.opt_a.cse : 0.000065s : 0.02% optimize.opt_a.a_3 : 0.000126s : 0.05% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000047s : 0.02% optimize.convert_after_rewriter : 0.000008s : 0.00% optimize.order_py_execute_after_rewriter : 0.000006s : 0.00% optimize.mutable_eliminate : 0.000816s : 0.31% optimize.opt_b.b_1 : 0.000190s : 0.07% optimize.opt_b.b_2 : 0.000012s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000031s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000033s : 0.01% optimize.loop_unroll : 0.000521s : 0.20% optimize.opt_after_cconv.c_1 : 0.000045s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000027s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.01% optimize.tuple_transform.d_1 : 0.000067s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000065s : 0.02% optimize.cse_after_recomputation.cse : 0.000023s : 0.01% optimize.environ_conv : 0.000008s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000028s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000003s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000023s : 0.01% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000661s : 0.25% validate : 0.000054s : 0.02% Time group info: ------[substitution.] 0.000304 45 12.30% : 0.000037s : 5: substitution.arithmetic_simplify 11.46% : 0.000035s : 3: substitution.cast_eliminate 0.72% : 0.000002s : 3: substitution.elim_not_effective 0.58% : 0.000002s : 3: substitution.fold_const_symbol 2.49% : 0.000008s : 5: substitution.graph_param_transform 56.75% : 0.000173s : 4: substitution.inline 1.78% : 0.000005s : 6: substitution.j_node_and_user_rematch 4.91% : 0.000015s : 2: substitution.less_batch_normalization 2.71% : 0.000008s : 6: substitution.remove_not_recompute_node 2.05% : 0.000006s : 4: substitution.replace_old_param 4.25% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.019282 2 93.74% : 0.018074s : 1: type_inference.infer 6.26% : 0.001208s : 1: type_inference.specialize ------[replace.] 0.000068 8 62.01% : 0.000042s : 4: replace.inline 37.99% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000181 8 93.87% : 0.000170s : 4: match.inline 6.13% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000257 1504 0.89% : 0.000002s : 15: predicate.accumulaten_eliminater 0.81% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.70% : 0.000002s : 10: predicate.addn_check_dump 0.87% : 0.000002s : 15: predicate.addn_zero_filter 0.78% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.31% : 0.000006s : 25: predicate.arithmetic_simplify 0.87% : 0.000002s : 15: predicate.cast_eliminate 0.58% : 0.000001s : 10: predicate.check_bprop_eliminate 0.60% : 0.000002s : 10: predicate.compare_switch_simplify 0.20% : 0.000001s : 5: predicate.const_output_eliminate 0.63% : 0.000002s : 10: predicate.depend_value_elim 0.82% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.01% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.08% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 5: predicate.elim_not_effective 0.61% : 0.000002s : 5: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.18% : 0.000003s : 20: predicate.environ_get_depend_swap 1.70% : 0.000004s : 30: predicate.environ_get_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.28% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.44% : 0.000006s : 23: predicate.float_depend_g_call 0.53% : 0.000001s : 10: predicate.float_environ_get_switch 0.88% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.72% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.62% : 0.000002s : 10: predicate.incorporate_call 0.53% : 0.000001s : 10: predicate.incorporate_call_switch 6.09% : 0.000016s : 68: predicate.inline 0.71% : 0.000002s : 10: predicate.inline_without_move 0.34% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.90% : 0.000002s : 10: predicate.less_batch_normalization 1.97% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.34% : 0.000006s : 44: predicate.load_eliminater 0.95% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.16% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 10: predicate.merge_addn 0.60% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 15: predicate.minmaximum_grad 1.40% : 0.000004s : 5: predicate.mutable_eliminate 0.41% : 0.000001s : 5: predicate.opt_reshape 0.46% : 0.000001s : 5: predicate.parallel_virtual_node 1.51% : 0.000004s : 23: predicate.partial_defer_inline 1.49% : 0.000004s : 24: predicate.partial_eliminate 0.85% : 0.000002s : 15: predicate.print_const_string_wrapper 0.68% : 0.000002s : 10: predicate.reduce_all_const_elim 1.19% : 0.000003s : 15: predicate.reduce_eliminate 2.36% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 10: predicate.remove_not_recompute_node 1.50% : 0.000004s : 29: predicate.replace_applicator 0.55% : 0.000001s : 10: predicate.replace_old_param 0.39% : 0.000001s : 5: predicate.reset_defer_inline 0.95% : 0.000002s : 15: predicate.reshape_eliminate 0.58% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.58% : 0.000001s : 5: predicate.row_tensor_eliminate 0.83% : 0.000002s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.79% : 0.000002s : 10: predicate.shard_identity_eliminate 0.81% : 0.000002s : 10: predicate.special_op_eliminate 0.72% : 0.000002s : 10: predicate.specialize_transform 1.05% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 1.08% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.34% : 0.000003s : 23: predicate.switch_defer_inline 1.86% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.70% : 0.000012s : 74: predicate.switch_simplify 1.01% : 0.000003s : 15: predicate.tile_eliminate 1.02% : 0.000003s : 15: predicate.transpose_eliminate 1.57% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.49% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.28% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.64% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.46% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.65% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.26% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 2.93% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 5: predicate.value_based_eliminate 0.74% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 10: predicate.virtual_output_eliminate 0.32% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.60% : 0.000002s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000973 11 39.29% : 0.000382s : 5: func_graph_cloner_run.FuncGraphClonerGraph 60.71% : 0.000591s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.750051 192 0.00% : 0.000004s : 1: ForceFp32Comm 0.49% : 0.003708s : 1: add_attr 0.49% : 0.003695s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000070s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.01% : 0.000086s : 1: auto_monad 0.00% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.08% : 0.000584s : 1: bootstrap 0.00% : 0.000037s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000021s : 1: control_data_broadcast_order 0.00% : 0.000011s : 1: convert_after_rewriter 0.02% : 0.000130s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000011s : 1: environ_conv 0.00% : 0.000031s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.07% : 0.000530s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.11% : 0.000827s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000023s : 1: opt.transform.mutable_eliminate 0.22% : 0.001614s : 78: opt.transform.opt_a 0.01% : 0.000044s : 1: opt.transform.opt_after_cconv 0.00% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.02% : 0.000164s : 28: opt.transform.opt_b 0.01% : 0.000075s : 2: opt.transform.opt_trans_graph 0.01% : 0.000055s : 4: opt.transform.symbol_engine_opt 31.85% : 0.238918s : 1: opt_a 0.02% : 0.000142s : 1: opt_after_cconv 0.09% : 0.000672s : 1: opt_after_jit_grad 0.04% : 0.000302s : 1: opt_b 32.23% : 0.241768s : 1: optimize 0.00% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000009s : 1: order_py_execute_after_rewriter 0.00% : 0.000032s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000007s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000005s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.01% : 0.000048s : 1: pre_auto_parallel 0.01% : 0.000039s : 1: py_interpret_to_execute 0.00% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.00% : 0.000022s : 1: remove_dup_value 31.42% : 0.235633s : 1: renormalize.infer 0.08% : 0.000593s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000052s : 1: rewriter_after_opt_a 0.01% : 0.000101s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.01% : 0.000112s : 1: symbol_engine_optimizer 0.01% : 0.000107s : 1: tuple_transform 2.59% : 0.019399s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:15.887.665 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:15.887.921 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.209589, [21] [bootstrap]: 0.00043713 [type_inference]: 0.196379 [event_method]: 2.229e-05 [auto_monad]: 7.728e-05 [graph_reusing]: 6.63e-06 [inline]: 2.86e-06 [add_attr]: 0.00377784, [1] [add_attr_with_inline]: 0.0037665, [1] [Cycle 1]: 0.00012709, [2] [tag_attr]: 5.383e-05 [meta_addattr_fg_expand]: 6.41998e-06 [parallel-infer-symbol]: 3.71999e-06 [pre_auto_parallel]: 4.341e-05 [insert-virtual-dataset]: 2.37999e-06 [parallel-infer-symbol-second]: 9.79984e-07 [dataset_repeat_opt]: 2.04e-06 [pipeline_split]: 1.76998e-06 [optimize]: 0.00723379, [53] [py_interpret_to_execute]: 4.012e-05 [rewriter_before_opt_a]: 0.00010947 [opt_a]: 0.00402513, [2] [Cycle 1]: 0.00289897, [45] [expand_dump_flag]: 3.05002e-06 [switch_simplify]: 4.351e-05 [loop_unroll]: 3.207e-05 [a_1]: 0.00076197 [with_stream_mark]: 2.325e-05 [recompute_prepare]: 1.438e-05 [updatestate_depend_eliminate]: 5.51e-06 [updatestate_assign_eliminate]: 4.62e-06 [updatestate_loads_eliminate]: 3.63999e-06 [parameter_eliminate]: 1.89999e-06 [a_2]: 0.00014896 [accelerated_algorithm]: 2.543e-05 [shard]: 1.96e-06 [meta_shard_fg_expand]: 2.22999e-06 [shard_inline]: 9.72999e-06 [merge_send_recv]: 1.04e-05 [auto_parallel]: 1.061e-05 [parallel]: 2.138e-05 [flash_sp]: 1.176e-05 [merge_comm]: 4.89998e-06 [allreduce_fusion]: 4.15e-06 [matmul_add_comm_reduction]: 1.257e-05 [allreduce_slice_to_reducescatter]: 8.90024e-07 [virtual_shard_identity]: 1.162e-05 [virtual_dataset]: 8.27998e-06 [get_grad_eliminate_]: 9.03002e-06 [virtual_output]: 8.10999e-06 [merge_forward]: 4.75999e-06 [cell_reuse_recompute_pass]: 1.50001e-06 [offload_activation]: 1.247e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.877e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.342e-05 [set_forward_comm_id_for_comm_node_pass]: 4.58001e-06 [meta_fg_expand]: 3.81999e-06 [flash_sp_send_recv_attached]: 5.35999e-06 [receive_attached]: 2.48e-06 [after_resolve]: 1.507e-05 [a_after_grad]: 1.283e-05 [renormalize]: 0.00102068 [add_forward_monad_depend]: 7.91001e-06 [auto_monad_grad]: 3.03998e-06 [auto_monad_eliminator]: 2.196e-05 [cse]: 3.946e-05 [a_3]: 8.165e-05 [Cycle 2]: 0.00110737, [45] [expand_dump_flag]: 2.79999e-06 [switch_simplify]: 1.083e-05 [loop_unroll]: 8.95999e-06 [a_1]: 0.00020826 [with_stream_mark]: 1.817e-05 [recompute_prepare]: 8.58001e-06 [updatestate_depend_eliminate]: 5.67001e-06 [updatestate_assign_eliminate]: 4.20999e-06 [updatestate_loads_eliminate]: 4.2e-06 [parameter_eliminate]: 1.81e-06 [a_2]: 0.00013203 [accelerated_algorithm]: 1.235e-05 [shard]: 1.52999e-06 [meta_shard_fg_expand]: 2.14999e-06 [shard_inline]: 7.82998e-06 [merge_send_recv]: 8.54e-06 [auto_parallel]: 9.04e-06 [parallel]: 8.54e-06 [flash_sp]: 4.62998e-06 [merge_comm]: 4.80999e-06 [allreduce_fusion]: 4.02998e-06 [matmul_add_comm_reduction]: 1.241e-05 [allreduce_slice_to_reducescatter]: 8.80013e-07 [virtual_shard_identity]: 9.77999e-06 [virtual_dataset]: 3.942e-05 [get_grad_eliminate_]: 8.05e-06 [virtual_output]: 7.38e-06 [merge_forward]: 6.02999e-06 [cell_reuse_recompute_pass]: 2.53003e-06 [offload_activation]: 1.328e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.939e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.36e-05 [set_forward_comm_id_for_comm_node_pass]: 5.14e-06 [meta_fg_expand]: 3.18e-06 [flash_sp_send_recv_attached]: 1.52001e-06 [receive_attached]: 1.99999e-06 [after_resolve]: 1.407e-05 [a_after_grad]: 1.187e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 2.25002e-06 [auto_monad_grad]: 1.56002e-06 [auto_monad_eliminator]: 1.428e-05 [cse]: 2.74e-05 [a_3]: 6.056e-05 [py_interpret_to_execute_after_opt_a]: 2.135e-05 [slice_cell_reuse_recomputed_activation]: 4.75999e-06 [rewriter_after_opt_a]: 5.451e-05 [convert_after_rewriter]: 1.199e-05 [order_py_execute_after_rewriter]: 9.87999e-06 [mutable_eliminate]: 0.00083541 [opt_b]: 0.00040076, [1] [Cycle 1]: 0.00038887, [7] [b_1]: 0.00024658 [b_2]: 1.231e-05 [updatestate_depend_eliminate]: 1.282e-05 [updatestate_assign_eliminate]: 4.28001e-06 [updatestate_loads_eliminate]: 3.85998e-06 [renormalize]: 1.18001e-06 [cse]: 3.577e-05 [optimize_parallel_all_gather_comm]: 2.696e-05 [overlap_param_gather]: 5.87999e-06 [cconv]: 4.318e-05 [loop_unroll]: 0.00058209 [opt_after_cconv]: 0.00017703, [1] [Cycle 1]: 0.00016556, [7] [c_1]: 4.605e-05 [parameter_eliminate]: 7.22002e-06 [updatestate_depend_eliminate]: 8.24002e-06 [updatestate_assign_eliminate]: 3.71999e-06 [updatestate_loads_eliminate]: 4.18001e-06 [cse]: 3.221e-05 [renormalize]: 9.60019e-07 [remove_dup_value]: 2.326e-05 [tuple_transform]: 0.00012825, [1] [Cycle 1]: 0.00011959, [4] [d_1]: 7.157e-05 [none_parameter_eliminate]: 1.77999e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 1.037e-05 [partial_unused_args_eliminate]: 5.98002e-06 [add_recomputation]: 7.182e-05 [cse_after_recomputation]: 3.905e-05, [1] [Cycle 1]: 2.972e-05, [1] [cse]: 1.862e-05 [environ_conv]: 1.169e-05 [swap_dp_allreduce_reducescatter]: 1.062e-05 [bias_add_comm_swap]: 6.98e-06 [label_micro_interleaved_index]: 8.93002e-06 [label_fine_grained_interleaved_index]: 5.30999e-06 [merge_cast_opt]: 3.88999e-06 [slice_recompute_activation]: 6.11e-06 [micro_interleaved_order_control]: 5.77001e-06 [assign_add_opt]: 4.68001e-06 [ForceFp32Comm]: 3.5e-06 [remove_cast_before_assign_add]: 3.99997e-06 [full_micro_interleaved_order_control]: 4.67e-06 [reorder_send_recv_between_fp_bp]: 6.33e-06 [comm_op_add_attrs]: 3.88001e-06 [add_comm_op_reuse_tag]: 3.78999e-06 [interleave_split_concat_branches]: 4.25e-06 [interleave_parallel_branches]: 4.22e-06 [overlap_opt_shard_in_pipeline]: 5.29e-06 [overlap_opt_shard_grad_in_pipeline]: 4.77e-06 [control_data_broadcast_order]: 2.402e-05 [grouped_pairwise_exchange_alltoall]: 4.17003e-06 [offloading_packed_experts]: 8.63001e-06 [overlap_recompute_and_grad_model_parallel]: 9.97999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.93999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.83001e-06 [overlap_recompute_comm]: 5.54998e-06 [overlap_grad_ring_attention]: 8.65999e-06 [overlap_grad_flash_sp]: 3.099e-05 [begin_end_overlap_inline]: 5.64998e-06 [split_matmul_comm_elemetwise]: 5.20999e-06 [split_layernorm_comm]: 5.08002e-06 [handle_group_info]: 4.08001e-06 [symbol_engine_optimizer]: 0.00013943, [1] [Cycle 1]: 0.00013068, [6] [build]: 5.69e-06 [elim_shapecalc]: 1.824e-05 [elim_not_effective]: 1.849e-05 [opt_reshape]: 1.023e-05 [fold_const_symbol]: 1.452e-05 [renormalize]: 5.09986e-07 [detach_backward]: 4.40999e-06 [pipeline_parallel_scheduler]: 1.86e-06 [auto_monad_reorder]: 4.698e-05 [get_jit_bprop_graph]: 2.17001e-06 [rewriter_after_jit_bprop_graph]: 7.53e-06 [opt_after_jit_grad]: 0.00071596 [validate]: 5.784e-05 Sums bootstrap : 0.000437s : 0.21% type_inference : 0.196379s : 96.38% event_method : 0.000022s : 0.01% auto_monad : 0.000077s : 0.04% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000054s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000043s : 0.02% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000040s : 0.02% optimize.rewriter_before_opt_a : 0.000109s : 0.05% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000054s : 0.03% optimize.opt_a.loop_unroll : 0.000041s : 0.02% optimize.opt_a.a_1 : 0.000970s : 0.48% optimize.opt_a.with_stream_mark : 0.000041s : 0.02% optimize.opt_a.recompute_prepare : 0.000023s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000281s : 0.14% optimize.opt_a.accelerated_algorithm : 0.000038s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000018s : 0.01% optimize.opt_a.merge_send_recv : 0.000019s : 0.01% optimize.opt_a.auto_parallel : 0.000020s : 0.01% optimize.opt_a.parallel : 0.000030s : 0.01% optimize.opt_a.flash_sp : 0.000016s : 0.01% optimize.opt_a.merge_comm : 0.000010s : 0.00% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.01% optimize.opt_a.virtual_dataset : 0.000048s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.01% optimize.opt_a.virtual_output : 0.000015s : 0.01% optimize.opt_a.merge_forward : 0.000011s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000026s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000027s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000029s : 0.01% optimize.opt_a.a_after_grad : 0.000025s : 0.01% optimize.opt_a.renormalize : 0.001021s : 0.50% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.00% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000036s : 0.02% optimize.opt_a.cse : 0.000067s : 0.03% optimize.opt_a.a_3 : 0.000142s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000055s : 0.03% optimize.convert_after_rewriter : 0.000012s : 0.01% optimize.order_py_execute_after_rewriter : 0.000010s : 0.00% optimize.mutable_eliminate : 0.000835s : 0.41% optimize.opt_b.b_1 : 0.000247s : 0.12% optimize.opt_b.b_2 : 0.000012s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000013s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000036s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000027s : 0.01% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.000043s : 0.02% optimize.loop_unroll : 0.000582s : 0.29% optimize.opt_after_cconv.c_1 : 0.000046s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000032s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000023s : 0.01% optimize.tuple_transform.d_1 : 0.000072s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.01% optimize.partial_unused_args_eliminate : 0.000006s : 0.00% optimize.add_recomputation : 0.000072s : 0.04% optimize.cse_after_recomputation.cse : 0.000019s : 0.01% optimize.environ_conv : 0.000012s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000011s : 0.01% optimize.bias_add_comm_swap : 0.000007s : 0.00% optimize.label_micro_interleaved_index : 0.000009s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000006s : 0.00% optimize.micro_interleaved_order_control : 0.000006s : 0.00% optimize.assign_add_opt : 0.000005s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000005s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000024s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000009s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000010s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000006s : 0.00% optimize.overlap_grad_ring_attention : 0.000009s : 0.00% optimize.overlap_grad_flash_sp : 0.000031s : 0.02% optimize.begin_end_overlap_inline : 0.000006s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000005s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000006s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000018s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000047s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000008s : 0.00% opt_after_jit_grad : 0.000716s : 0.35% validate : 0.000058s : 0.03% Time group info: ------[substitution.] 0.000306 45 12.44% : 0.000038s : 5: substitution.arithmetic_simplify 10.63% : 0.000032s : 3: substitution.cast_eliminate 0.74% : 0.000002s : 3: substitution.elim_not_effective 0.62% : 0.000002s : 3: substitution.fold_const_symbol 2.59% : 0.000008s : 5: substitution.graph_param_transform 58.40% : 0.000179s : 4: substitution.inline 1.60% : 0.000005s : 6: substitution.j_node_and_user_rematch 4.73% : 0.000014s : 2: substitution.less_batch_normalization 2.14% : 0.000007s : 6: substitution.remove_not_recompute_node 1.79% : 0.000005s : 4: substitution.replace_old_param 4.32% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.196315 2 99.54% : 0.195409s : 1: type_inference.infer 0.46% : 0.000906s : 1: type_inference.specialize ------[replace.] 0.000068 8 62.08% : 0.000042s : 4: replace.inline 37.92% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000187 8 93.91% : 0.000176s : 4: match.inline 6.09% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000263 1504 0.85% : 0.000002s : 15: predicate.accumulaten_eliminater 1.39% : 0.000004s : 5: predicate.ad_related_special_op_eliminate 0.62% : 0.000002s : 10: predicate.addn_check_dump 0.76% : 0.000002s : 15: predicate.addn_zero_filter 0.74% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.32% : 0.000006s : 25: predicate.arithmetic_simplify 0.83% : 0.000002s : 15: predicate.cast_eliminate 0.63% : 0.000002s : 10: predicate.check_bprop_eliminate 0.56% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000001s : 5: predicate.const_output_eliminate 0.64% : 0.000002s : 10: predicate.depend_value_elim 0.87% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.91% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.56% : 0.000004s : 10: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 5: predicate.elim_not_effective 0.59% : 0.000002s : 5: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_add_eliminate 0.99% : 0.000003s : 20: predicate.environ_get_depend_swap 1.69% : 0.000004s : 30: predicate.environ_get_eliminate 0.99% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.24% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.27% : 0.000006s : 23: predicate.float_depend_g_call 0.52% : 0.000001s : 10: predicate.float_environ_get_switch 0.77% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.24% : 0.000001s : 5: predicate.fold_const_symbol 0.86% : 0.000002s : 10: predicate.get_grad_eliminate 0.30% : 0.000001s : 5: predicate.graph_param_transform 0.58% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.49% : 0.000017s : 68: predicate.inline 0.72% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.01% : 0.000003s : 10: predicate.less_batch_normalization 1.69% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.31% : 0.000006s : 44: predicate.load_eliminater 1.42% : 0.000004s : 5: predicate.loop_unroll_after_grad 2.12% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.83% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 0.75% : 0.000002s : 10: predicate.merge_addn 0.63% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.90% : 0.000002s : 15: predicate.minmaximum_grad 1.48% : 0.000004s : 5: predicate.mutable_eliminate 0.40% : 0.000001s : 5: predicate.opt_reshape 0.59% : 0.000002s : 5: predicate.parallel_virtual_node 1.57% : 0.000004s : 23: predicate.partial_defer_inline 1.53% : 0.000004s : 24: predicate.partial_eliminate 0.81% : 0.000002s : 15: predicate.print_const_string_wrapper 0.63% : 0.000002s : 10: predicate.reduce_all_const_elim 1.11% : 0.000003s : 15: predicate.reduce_eliminate 2.26% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 10: predicate.remove_not_recompute_node 1.25% : 0.000003s : 29: predicate.replace_applicator 0.55% : 0.000001s : 10: predicate.replace_old_param 0.40% : 0.000001s : 5: predicate.reset_defer_inline 0.98% : 0.000003s : 15: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 5: predicate.row_tensor_eliminate 1.04% : 0.000003s : 10: predicate.same_eliminate 0.47% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.85% : 0.000002s : 10: predicate.shard_identity_eliminate 0.85% : 0.000002s : 10: predicate.special_op_eliminate 0.73% : 0.000002s : 10: predicate.specialize_transform 0.98% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 1.04% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.27% : 0.000003s : 23: predicate.switch_defer_inline 1.87% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.88% : 0.000013s : 74: predicate.switch_simplify 0.82% : 0.000002s : 15: predicate.tile_eliminate 0.89% : 0.000002s : 15: predicate.transpose_eliminate 1.53% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.47% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.36% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.34% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.14% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.58% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.17% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.05% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 5: predicate.value_based_eliminate 0.74% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.55% : 0.000001s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000704 11 51.60% : 0.000363s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.40% : 0.000341s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.223488 192 0.00% : 0.000007s : 1: ForceFp32Comm 1.70% : 0.003789s : 1: add_attr 1.69% : 0.003771s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.03% : 0.000077s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.04% : 0.000088s : 1: auto_monad 0.03% : 0.000057s : 1: auto_monad_reorder 0.00% : 0.000009s : 1: begin_end_overlap_inline 0.00% : 0.000011s : 1: bias_add_comm_swap 0.22% : 0.000485s : 1: bootstrap 0.02% : 0.000047s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000028s : 1: control_data_broadcast_order 0.01% : 0.000015s : 1: convert_after_rewriter 0.02% : 0.000042s : 1: cse_after_recomputation 0.00% : 0.000009s : 1: dataset_repeat_opt 0.01% : 0.000032s : 1: detach_backward 0.01% : 0.000015s : 1: environ_conv 0.01% : 0.000033s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000014s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.00% : 0.000008s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000012s : 1: label_micro_interleaved_index 0.26% : 0.000592s : 1: loop_unroll 0.00% : 0.000006s : 1: merge_cast_opt 0.00% : 0.000009s : 1: micro_interleaved_order_control 0.38% : 0.000846s : 1: mutable_eliminate 0.01% : 0.000012s : 1: offloading_packed_experts 0.01% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000027s : 1: opt.transform.mutable_eliminate 0.71% : 0.001597s : 78: opt.transform.opt_a 0.02% : 0.000044s : 1: opt.transform.opt_after_cconv 0.02% : 0.000044s : 1: opt.transform.opt_after_jit_grad 0.08% : 0.000168s : 28: opt.transform.opt_b 0.04% : 0.000079s : 2: opt.transform.opt_trans_graph 0.03% : 0.000057s : 4: opt.transform.symbol_engine_opt 1.80% : 0.004029s : 1: opt_a 0.08% : 0.000181s : 1: opt_after_cconv 0.33% : 0.000730s : 1: opt_after_jit_grad 0.18% : 0.000405s : 1: opt_b 3.45% : 0.007705s : 1: optimize 0.01% : 0.000031s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000013s : 1: order_py_execute_after_rewriter 0.02% : 0.000035s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000010s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000013s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000010s : 1: overlap_recompute_comm 0.00% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000009s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000051s : 1: pre_auto_parallel 0.02% : 0.000044s : 1: py_interpret_to_execute 0.01% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.01% : 0.000027s : 1: remove_dup_value 0.26% : 0.000574s : 1: renormalize.infer 0.19% : 0.000436s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000015s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000059s : 1: rewriter_after_opt_a 0.05% : 0.000113s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000009s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000014s : 1: swap_dp_allreduce_reducescatter 0.06% : 0.000142s : 1: symbol_engine_optimizer 0.06% : 0.000131s : 1: tuple_transform 87.89% : 0.196426s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:18.902.895 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.204976, [21] [bootstrap]: 0.00051942 [type_inference]: 0.191383 [event_method]: 2.491e-05 [auto_monad]: 8.756e-05 [graph_reusing]: 7.18e-06 [inline]: 3.25002e-06 [add_attr]: 0.00439607, [1] [add_attr_with_inline]: 0.00438225, [1] [Cycle 1]: 8.552e-05, [2] [tag_attr]: 2.829e-05 [meta_addattr_fg_expand]: 6.40997e-06 [parallel-infer-symbol]: 5.79999e-06 [pre_auto_parallel]: 4.452e-05 [insert-virtual-dataset]: 2.44999e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.01998e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.00732197, [53] [py_interpret_to_execute]: 3.964e-05 [rewriter_before_opt_a]: 0.00010765 [opt_a]: 0.0040613, [2] [Cycle 1]: 0.0030998, [45] [expand_dump_flag]: 3.52002e-06 [switch_simplify]: 4.589e-05 [loop_unroll]: 3.148e-05 [a_1]: 0.00080917 [with_stream_mark]: 2.205e-05 [recompute_prepare]: 1.243e-05 [updatestate_depend_eliminate]: 7.46001e-06 [updatestate_assign_eliminate]: 5.43002e-06 [updatestate_loads_eliminate]: 3.75e-06 [parameter_eliminate]: 2.61999e-06 [a_2]: 0.0001206 [accelerated_algorithm]: 2.691e-05 [shard]: 2.97002e-06 [meta_shard_fg_expand]: 2.92002e-06 [shard_inline]: 8.48001e-06 [merge_send_recv]: 1.031e-05 [auto_parallel]: 1.033e-05 [parallel]: 2.432e-05 [flash_sp]: 9.48002e-06 [merge_comm]: 6.46e-06 [allreduce_fusion]: 4.43001e-06 [matmul_add_comm_reduction]: 1.33e-05 [allreduce_slice_to_reducescatter]: 9.50007e-07 [virtual_shard_identity]: 1.432e-05 [virtual_dataset]: 8.60001e-06 [get_grad_eliminate_]: 8.32e-06 [virtual_output]: 1.109e-05 [merge_forward]: 4.75999e-06 [cell_reuse_recompute_pass]: 1.59e-06 [offload_activation]: 1.222e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.649e-05 [merge_recompute_call_nodes]: 1.57999e-06 [before_grad]: 1.616e-05 [set_forward_comm_id_for_comm_node_pass]: 4.40999e-06 [meta_fg_expand]: 4.01001e-06 [flash_sp_send_recv_attached]: 6.02001e-06 [receive_attached]: 2.73e-06 [after_resolve]: 1.409e-05 [a_after_grad]: 1.483e-05 [renormalize]: 0.00132346 [add_forward_monad_depend]: 9.27999e-06 [auto_monad_grad]: 2.79999e-06 [auto_monad_eliminator]: 2.184e-05 [cse]: 4.216e-05 [a_3]: 7.39e-05 [Cycle 2]: 0.00094739, [45] [expand_dump_flag]: 2.98e-06 [switch_simplify]: 1.08e-05 [loop_unroll]: 8.28999e-06 [a_1]: 0.00022763 [with_stream_mark]: 1.688e-05 [recompute_prepare]: 9.48002e-06 [updatestate_depend_eliminate]: 5.39e-06 [updatestate_assign_eliminate]: 3.47997e-06 [updatestate_loads_eliminate]: 3.04001e-06 [parameter_eliminate]: 2.26e-06 [a_2]: 0.00010528 [accelerated_algorithm]: 1.29e-05 [shard]: 2.04e-06 [meta_shard_fg_expand]: 2.73e-06 [shard_inline]: 8.43999e-06 [merge_send_recv]: 9.97001e-06 [auto_parallel]: 1.144e-05 [parallel]: 1.03e-05 [flash_sp]: 4.42e-06 [merge_comm]: 4.95999e-06 [allreduce_fusion]: 4.78001e-06 [matmul_add_comm_reduction]: 1.251e-05 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 1.081e-05 [virtual_dataset]: 8.2e-06 [get_grad_eliminate_]: 8.60999e-06 [virtual_output]: 8.03001e-06 [merge_forward]: 5.29e-06 [cell_reuse_recompute_pass]: 3.3e-06 [offload_activation]: 1.405e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.624e-05 [merge_recompute_call_nodes]: 1.86e-06 [before_grad]: 1.347e-05 [set_forward_comm_id_for_comm_node_pass]: 5.91998e-06 [meta_fg_expand]: 3.53e-06 [flash_sp_send_recv_attached]: 1.96e-06 [receive_attached]: 2.43998e-06 [after_resolve]: 1.762e-05 [a_after_grad]: 1.368e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.93e-06 [auto_monad_grad]: 2.24001e-06 [auto_monad_eliminator]: 1.517e-05 [cse]: 3.016e-05 [a_3]: 5.005e-05 [py_interpret_to_execute_after_opt_a]: 2.22e-05 [slice_cell_reuse_recomputed_activation]: 2.41e-06 [rewriter_after_opt_a]: 5.495e-05 [convert_after_rewriter]: 8.70999e-06 [order_py_execute_after_rewriter]: 6.46999e-06 [mutable_eliminate]: 0.00097813 [opt_b]: 0.00034026, [1] [Cycle 1]: 0.00032988, [7] [b_1]: 0.00020415 [b_2]: 1.29e-05 [updatestate_depend_eliminate]: 1.386e-05 [updatestate_assign_eliminate]: 4.02e-06 [updatestate_loads_eliminate]: 3.35998e-06 [renormalize]: 3.10014e-07 [cse]: 4.281e-05 [optimize_parallel_all_gather_comm]: 2.494e-05 [overlap_param_gather]: 2.37999e-06 [cconv]: 4.058e-05 [loop_unroll]: 0.00073251 [opt_after_cconv]: 0.00015789, [1] [Cycle 1]: 0.00014907, [7] [c_1]: 5.097e-05 [parameter_eliminate]: 6.67002e-06 [updatestate_depend_eliminate]: 8.48999e-06 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 4.18001e-06 [cse]: 3.753e-05 [renormalize]: 8.60018e-07 [remove_dup_value]: 1.942e-05 [tuple_transform]: 0.00011368, [1] [Cycle 1]: 0.00010888, [4] [d_1]: 7.614e-05 [none_parameter_eliminate]: 2.32001e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 1.028e-05 [partial_unused_args_eliminate]: 2.55002e-06 [add_recomputation]: 7.223e-05 [cse_after_recomputation]: 3.371e-05, [1] [Cycle 1]: 2.754e-05, [1] [cse]: 2e-05 [environ_conv]: 8.20999e-06 [swap_dp_allreduce_reducescatter]: 6.12001e-06 [bias_add_comm_swap]: 4.05998e-06 [label_micro_interleaved_index]: 8.22998e-06 [label_fine_grained_interleaved_index]: 2.71999e-06 [merge_cast_opt]: 1.67001e-06 [slice_recompute_activation]: 2.42001e-06 [micro_interleaved_order_control]: 2.46e-06 [assign_add_opt]: 1.37e-06 [ForceFp32Comm]: 8.39995e-07 [remove_cast_before_assign_add]: 1.72001e-06 [full_micro_interleaved_order_control]: 2.00002e-06 [reorder_send_recv_between_fp_bp]: 2.93e-06 [comm_op_add_attrs]: 1.24003e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.27999e-06 [interleave_parallel_branches]: 1.21002e-06 [overlap_opt_shard_in_pipeline]: 1.33002e-06 [overlap_opt_shard_grad_in_pipeline]: 2.65002e-06 [control_data_broadcast_order]: 1.903e-05 [grouped_pairwise_exchange_alltoall]: 1.95001e-06 [offloading_packed_experts]: 6.43e-06 [overlap_recompute_and_grad_model_parallel]: 5.49e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.24998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.69001e-06 [overlap_grad_ring_attention]: 5.02999e-06 [overlap_grad_flash_sp]: 2.86e-05 [begin_end_overlap_inline]: 6.30011e-07 [split_matmul_comm_elemetwise]: 2.16998e-06 [split_layernorm_comm]: 1.62001e-06 [handle_group_info]: 1.04e-06 [symbol_engine_optimizer]: 0.00010912, [1] [Cycle 1]: 0.00010395, [6] [build]: 4.97999e-06 [elim_shapecalc]: 1.752e-05 [elim_not_effective]: 1.794e-05 [opt_reshape]: 1.048e-05 [fold_const_symbol]: 1.368e-05 [renormalize]: 3.39991e-07 [detach_backward]: 2.71999e-06 [pipeline_parallel_scheduler]: 1.62001e-06 [auto_monad_reorder]: 2.283e-05 [get_jit_bprop_graph]: 3.63999e-06 [rewriter_after_jit_bprop_graph]: 7.26001e-06 [opt_after_jit_grad]: 0.00088356 [validate]: 5.764e-05 Sums bootstrap : 0.000519s : 0.26% type_inference : 0.191383s : 95.97% event_method : 0.000025s : 0.01% auto_monad : 0.000088s : 0.04% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000006s : 0.00% pre_auto_parallel : 0.000045s : 0.02% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000040s : 0.02% optimize.rewriter_before_opt_a : 0.000108s : 0.05% optimize.opt_a.expand_dump_flag : 0.000007s : 0.00% optimize.opt_a.switch_simplify : 0.000057s : 0.03% optimize.opt_a.loop_unroll : 0.000040s : 0.02% optimize.opt_a.a_1 : 0.001037s : 0.52% optimize.opt_a.with_stream_mark : 0.000039s : 0.02% optimize.opt_a.recompute_prepare : 0.000022s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000013s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000226s : 0.11% optimize.opt_a.accelerated_algorithm : 0.000040s : 0.02% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.00% optimize.opt_a.shard_inline : 0.000017s : 0.01% optimize.opt_a.merge_send_recv : 0.000020s : 0.01% optimize.opt_a.auto_parallel : 0.000022s : 0.01% optimize.opt_a.parallel : 0.000035s : 0.02% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000011s : 0.01% optimize.opt_a.allreduce_fusion : 0.000009s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000025s : 0.01% optimize.opt_a.virtual_dataset : 0.000017s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.01% optimize.opt_a.virtual_output : 0.000019s : 0.01% optimize.opt_a.merge_forward : 0.000010s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000026s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000030s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.01% optimize.opt_a.meta_fg_expand : 0.000008s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000008s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000032s : 0.02% optimize.opt_a.a_after_grad : 0.000029s : 0.01% optimize.opt_a.renormalize : 0.001324s : 0.66% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000037s : 0.02% optimize.opt_a.cse : 0.000072s : 0.04% optimize.opt_a.a_3 : 0.000124s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000022s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000055s : 0.03% optimize.convert_after_rewriter : 0.000009s : 0.00% optimize.order_py_execute_after_rewriter : 0.000006s : 0.00% optimize.mutable_eliminate : 0.000978s : 0.49% optimize.opt_b.b_1 : 0.000204s : 0.10% optimize.opt_b.b_2 : 0.000013s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000014s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000043s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000041s : 0.02% optimize.loop_unroll : 0.000733s : 0.37% optimize.opt_after_cconv.c_1 : 0.000051s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000038s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.01% optimize.tuple_transform.d_1 : 0.000076s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.01% optimize.partial_unused_args_eliminate : 0.000003s : 0.00% optimize.add_recomputation : 0.000072s : 0.04% optimize.cse_after_recomputation.cse : 0.000020s : 0.01% optimize.environ_conv : 0.000008s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000002s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.00% optimize.control_data_broadcast_order : 0.000019s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000029s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000018s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000023s : 0.01% get_jit_bprop_graph : 0.000004s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000884s : 0.44% validate : 0.000058s : 0.03% Time group info: ------[substitution.] 0.000345 45 13.02% : 0.000045s : 5: substitution.arithmetic_simplify 10.42% : 0.000036s : 3: substitution.cast_eliminate 0.82% : 0.000003s : 3: substitution.elim_not_effective 0.51% : 0.000002s : 3: substitution.fold_const_symbol 2.34% : 0.000008s : 5: substitution.graph_param_transform 58.22% : 0.000201s : 4: substitution.inline 2.10% : 0.000007s : 6: substitution.j_node_and_user_rematch 4.49% : 0.000015s : 2: substitution.less_batch_normalization 2.05% : 0.000007s : 6: substitution.remove_not_recompute_node 2.05% : 0.000007s : 4: substitution.replace_old_param 3.98% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.191286 2 99.37% : 0.190076s : 1: type_inference.infer 0.63% : 0.001210s : 1: type_inference.specialize ------[replace.] 0.000071 8 62.63% : 0.000044s : 4: replace.inline 37.37% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000210 8 94.34% : 0.000198s : 4: match.inline 5.66% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000274 1504 0.88% : 0.000002s : 15: predicate.accumulaten_eliminater 1.19% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.80% : 0.000002s : 10: predicate.addn_check_dump 0.99% : 0.000003s : 15: predicate.addn_zero_filter 0.77% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.41% : 0.000007s : 25: predicate.arithmetic_simplify 0.89% : 0.000002s : 15: predicate.cast_eliminate 0.55% : 0.000002s : 10: predicate.check_bprop_eliminate 0.58% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.63% : 0.000002s : 10: predicate.depend_value_elim 0.83% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.01% : 0.000003s : 15: predicate.dict_get_item_eliminator 1.05% : 0.000003s : 15: predicate.dict_set_item_eliminator 1.54% : 0.000004s : 10: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 5: predicate.elim_not_effective 0.45% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.02% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.22% : 0.000003s : 20: predicate.environ_get_depend_swap 1.67% : 0.000005s : 30: predicate.environ_get_eliminate 1.05% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.21% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.19% : 0.000006s : 23: predicate.float_depend_g_call 0.56% : 0.000002s : 10: predicate.float_environ_get_switch 0.82% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.65% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.59% : 0.000002s : 10: predicate.incorporate_call 0.49% : 0.000001s : 10: predicate.incorporate_call_switch 6.15% : 0.000017s : 68: predicate.inline 0.73% : 0.000002s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.93% : 0.000003s : 10: predicate.less_batch_normalization 1.87% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.42% : 0.000007s : 44: predicate.load_eliminater 1.30% : 0.000004s : 5: predicate.loop_unroll_after_grad 1.97% : 0.000005s : 36: predicate.loop_unroll_before_grad 2.15% : 0.000006s : 25: predicate.make_slice_get_slice_eliminator 0.65% : 0.000002s : 10: predicate.merge_addn 0.59% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 15: predicate.minmaximum_grad 1.45% : 0.000004s : 5: predicate.mutable_eliminate 0.40% : 0.000001s : 5: predicate.opt_reshape 0.38% : 0.000001s : 5: predicate.parallel_virtual_node 1.74% : 0.000005s : 23: predicate.partial_defer_inline 1.42% : 0.000004s : 24: predicate.partial_eliminate 1.02% : 0.000003s : 15: predicate.print_const_string_wrapper 0.58% : 0.000002s : 10: predicate.reduce_all_const_elim 1.20% : 0.000003s : 15: predicate.reduce_eliminate 2.26% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 10: predicate.remove_not_recompute_node 1.26% : 0.000003s : 29: predicate.replace_applicator 0.52% : 0.000001s : 10: predicate.replace_old_param 0.30% : 0.000001s : 5: predicate.reset_defer_inline 1.15% : 0.000003s : 15: predicate.reshape_eliminate 0.75% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 5: predicate.row_tensor_eliminate 0.78% : 0.000002s : 10: predicate.same_eliminate 0.47% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.90% : 0.000002s : 10: predicate.shard_identity_eliminate 0.86% : 0.000002s : 10: predicate.special_op_eliminate 0.74% : 0.000002s : 10: predicate.specialize_transform 1.11% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.99% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.33% : 0.000004s : 23: predicate.switch_defer_inline 1.87% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.33% : 0.000012s : 74: predicate.switch_simplify 0.89% : 0.000002s : 15: predicate.tile_eliminate 0.83% : 0.000002s : 15: predicate.transpose_eliminate 1.63% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.50% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.64% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 2.98% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.62% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.96% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.12% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 2.76% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 5: predicate.value_based_eliminate 0.77% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.69% : 0.000002s : 10: predicate.virtual_output_eliminate 0.24% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000917 11 46.62% : 0.000427s : 5: func_graph_cloner_run.FuncGraphClonerGraph 53.38% : 0.000489s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.219959 192 0.00% : 0.000004s : 1: ForceFp32Comm 2.00% : 0.004404s : 1: add_attr 1.99% : 0.004387s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.03% : 0.000077s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000095s : 1: auto_monad 0.01% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.25% : 0.000554s : 1: bootstrap 0.02% : 0.000045s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000023s : 1: control_data_broadcast_order 0.01% : 0.000012s : 1: convert_after_rewriter 0.02% : 0.000037s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.01% : 0.000032s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000007s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 0.34% : 0.000746s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.45% : 0.000993s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000026s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000030s : 1: opt.transform.mutable_eliminate 0.75% : 0.001647s : 78: opt.transform.opt_a 0.02% : 0.000049s : 1: opt.transform.opt_after_cconv 0.02% : 0.000047s : 1: opt.transform.opt_after_jit_grad 0.08% : 0.000177s : 28: opt.transform.opt_b 0.04% : 0.000084s : 2: opt.transform.opt_trans_graph 0.03% : 0.000055s : 4: opt.transform.symbol_engine_opt 1.85% : 0.004066s : 1: opt_a 0.07% : 0.000163s : 1: opt_after_cconv 0.41% : 0.000900s : 1: opt_after_jit_grad 0.16% : 0.000346s : 1: opt_b 3.33% : 0.007330s : 1: optimize 0.01% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000009s : 1: order_py_execute_after_rewriter 0.01% : 0.000032s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000007s : 1: overlap_recompute_comm 0.00% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000007s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000006s : 1: pipeline_split 0.02% : 0.000049s : 1: pre_auto_parallel 0.02% : 0.000044s : 1: py_interpret_to_execute 0.01% : 0.000027s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.01% : 0.000023s : 1: remove_dup_value 0.35% : 0.000771s : 1: renormalize.infer 0.24% : 0.000539s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000060s : 1: rewriter_after_opt_a 0.05% : 0.000113s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000112s : 1: symbol_engine_optimizer 0.05% : 0.000117s : 1: tuple_transform 87.02% : 0.191413s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:21.518.824 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:21.519.112 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.245535, [21] [bootstrap]: 0.00050096 [type_inference]: 0.00715523 [event_method]: 2.176e-05 [auto_monad]: 7.144e-05 [graph_reusing]: 5.69e-06 [inline]: 2.89001e-06 [add_attr]: 0.00385962, [1] [add_attr_with_inline]: 0.00384772, [1] [Cycle 1]: 9.268e-05, [2] [tag_attr]: 2.381e-05 [meta_addattr_fg_expand]: 5.87001e-06 [parallel-infer-symbol]: 4.13999e-06 [pre_auto_parallel]: 4.067e-05 [insert-virtual-dataset]: 2.31998e-06 [parallel-infer-symbol-second]: 7.60017e-07 [dataset_repeat_opt]: 1.92999e-06 [pipeline_split]: 1.92999e-06 [optimize]: 0.232388, [53] [py_interpret_to_execute]: 3.838e-05 [rewriter_before_opt_a]: 9.897e-05 [opt_a]: 0.2294, [2] [Cycle 1]: 0.228414, [45] [expand_dump_flag]: 2.99001e-06 [switch_simplify]: 4.413e-05 [loop_unroll]: 3.149e-05 [a_1]: 0.00070935 [with_stream_mark]: 2.07e-05 [recompute_prepare]: 1.048e-05 [updatestate_depend_eliminate]: 5.49e-06 [updatestate_assign_eliminate]: 3.85e-06 [updatestate_loads_eliminate]: 3.08e-06 [parameter_eliminate]: 2.17001e-06 [a_2]: 0.00012463 [accelerated_algorithm]: 2.245e-05 [shard]: 2.63e-06 [meta_shard_fg_expand]: 1.98997e-06 [shard_inline]: 8.3e-06 [merge_send_recv]: 9.94001e-06 [auto_parallel]: 8.1e-06 [parallel]: 2.367e-05 [flash_sp]: 1.015e-05 [merge_comm]: 4.15e-06 [allreduce_fusion]: 3.46001e-06 [matmul_add_comm_reduction]: 1.262e-05 [allreduce_slice_to_reducescatter]: 1.37e-06 [virtual_shard_identity]: 1.139e-05 [virtual_dataset]: 6.82002e-06 [get_grad_eliminate_]: 6.80998e-06 [virtual_output]: 6.61e-06 [merge_forward]: 4.04002e-06 [cell_reuse_recompute_pass]: 1.50999e-06 [offload_activation]: 1.33e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.647e-05 [merge_recompute_call_nodes]: 1.81e-06 [before_grad]: 1.372e-05 [set_forward_comm_id_for_comm_node_pass]: 3.55003e-06 [meta_fg_expand]: 3.58e-06 [flash_sp_send_recv_attached]: 5.34998e-06 [receive_attached]: 2.67001e-06 [after_resolve]: 1.273e-05 [a_after_grad]: 1.132e-05 [renormalize]: 0.226608 [add_forward_monad_depend]: 1.311e-05 [auto_monad_grad]: 3.28e-06 [auto_monad_eliminator]: 2.671e-05 [cse]: 3.277e-05 [a_3]: 8.604e-05 [Cycle 2]: 0.00096608, [45] [expand_dump_flag]: 2.88e-06 [switch_simplify]: 1.044e-05 [loop_unroll]: 7.46001e-06 [a_1]: 0.00018105 [with_stream_mark]: 2.043e-05 [recompute_prepare]: 7.53999e-06 [updatestate_depend_eliminate]: 4.45999e-06 [updatestate_assign_eliminate]: 3.68999e-06 [updatestate_loads_eliminate]: 3.03998e-06 [parameter_eliminate]: 2.14999e-06 [a_2]: 0.0001128 [accelerated_algorithm]: 1.25e-05 [shard]: 2.39999e-06 [meta_shard_fg_expand]: 2.99001e-06 [shard_inline]: 7.08e-06 [merge_send_recv]: 9.51998e-06 [auto_parallel]: 1.069e-05 [parallel]: 1.017e-05 [flash_sp]: 4.57003e-06 [merge_comm]: 3.52002e-06 [allreduce_fusion]: 3.81001e-06 [matmul_add_comm_reduction]: 1.151e-05 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 7.86001e-06 [virtual_dataset]: 7.14001e-06 [get_grad_eliminate_]: 6.48e-06 [virtual_output]: 6.21998e-06 [merge_forward]: 4.33999e-06 [cell_reuse_recompute_pass]: 3.84002e-06 [offload_activation]: 1.162e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.466e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.066e-05 [set_forward_comm_id_for_comm_node_pass]: 4.43999e-06 [meta_fg_expand]: 2.86999e-06 [flash_sp_send_recv_attached]: 1.91e-06 [receive_attached]: 2.32999e-06 [after_resolve]: 1.296e-05 [a_after_grad]: 1.065e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.32999e-06 [auto_monad_grad]: 1.67999e-06 [auto_monad_eliminator]: 8.08999e-06 [cse]: 1.608e-05 [a_3]: 5e-05 [py_interpret_to_execute_after_opt_a]: 2.334e-05 [slice_cell_reuse_recomputed_activation]: 4.85001e-06 [rewriter_after_opt_a]: 4.684e-05 [convert_after_rewriter]: 1.203e-05 [order_py_execute_after_rewriter]: 9.38002e-06 [mutable_eliminate]: 0.00082956 [opt_b]: 0.00032169, [1] [Cycle 1]: 0.00031099, [7] [b_1]: 0.00020451 [b_2]: 9.52001e-06 [updatestate_depend_eliminate]: 6.91001e-06 [updatestate_assign_eliminate]: 2.73998e-06 [updatestate_loads_eliminate]: 2.67001e-06 [renormalize]: 1.14e-06 [cse]: 2.166e-05 [optimize_parallel_all_gather_comm]: 2.032e-05 [overlap_param_gather]: 5.67001e-06 [cconv]: 3.795e-05 [loop_unroll]: 0.00049684 [opt_after_cconv]: 0.00020188, [1] [Cycle 1]: 0.00019117, [7] [c_1]: 3.87e-05 [parameter_eliminate]: 3.62002e-06 [updatestate_depend_eliminate]: 5.83002e-06 [updatestate_assign_eliminate]: 2.72001e-06 [updatestate_loads_eliminate]: 3.25e-06 [cse]: 1.972e-05 [renormalize]: 4.7998e-07 [remove_dup_value]: 1.953e-05 [tuple_transform]: 0.00010847, [1] [Cycle 1]: 9.933e-05, [4] [d_1]: 5.587e-05 [none_parameter_eliminate]: 2.23002e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 8.2e-06 [partial_unused_args_eliminate]: 5.22e-06 [add_recomputation]: 5.801e-05 [cse_after_recomputation]: 3.036e-05, [1] [Cycle 1]: 2.191e-05, [1] [cse]: 1.166e-05 [environ_conv]: 1.001e-05 [swap_dp_allreduce_reducescatter]: 9.24998e-06 [bias_add_comm_swap]: 6.91001e-06 [label_micro_interleaved_index]: 8.30999e-06 [label_fine_grained_interleaved_index]: 5.31998e-06 [merge_cast_opt]: 4.50999e-06 [slice_recompute_activation]: 7.03998e-06 [micro_interleaved_order_control]: 5.01002e-06 [assign_add_opt]: 3.86999e-06 [ForceFp32Comm]: 4.08001e-06 [remove_cast_before_assign_add]: 3.80998e-06 [full_micro_interleaved_order_control]: 4.77e-06 [reorder_send_recv_between_fp_bp]: 6.26e-06 [comm_op_add_attrs]: 3.73001e-06 [add_comm_op_reuse_tag]: 4.12998e-06 [interleave_split_concat_branches]: 3.97e-06 [interleave_parallel_branches]: 4.02998e-06 [overlap_opt_shard_in_pipeline]: 3.89002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.40999e-06 [control_data_broadcast_order]: 1.721e-05 [grouped_pairwise_exchange_alltoall]: 4.74e-06 [offloading_packed_experts]: 7.73001e-06 [overlap_recompute_and_grad_model_parallel]: 1.047e-05 [overlap_grad_matmul_and_grad_allreduce]: 3.56001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.86999e-06 [overlap_recompute_comm]: 5.24e-06 [overlap_grad_ring_attention]: 8.62e-06 [overlap_grad_flash_sp]: 2.463e-05 [begin_end_overlap_inline]: 4.35e-06 [split_matmul_comm_elemetwise]: 5.31998e-06 [split_layernorm_comm]: 5.59e-06 [handle_group_info]: 4.17998e-06 [symbol_engine_optimizer]: 0.00018122, [1] [Cycle 1]: 0.00017272, [6] [build]: 4.67e-06 [elim_shapecalc]: 1.395e-05 [elim_not_effective]: 1.446e-05 [opt_reshape]: 1.207e-05 [fold_const_symbol]: 1.431e-05 [renormalize]: 2.09984e-07 [detach_backward]: 5.83002e-06 [pipeline_parallel_scheduler]: 2.06e-06 [auto_monad_reorder]: 3.396e-05 [get_jit_bprop_graph]: 2.45002e-06 [rewriter_after_jit_bprop_graph]: 6.83e-06 [opt_after_jit_grad]: 0.00067775 [validate]: 4.913e-05 Sums bootstrap : 0.000501s : 0.21% type_inference : 0.007155s : 2.99% event_method : 0.000022s : 0.01% auto_monad : 0.000071s : 0.03% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000041s : 0.02% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000038s : 0.02% optimize.rewriter_before_opt_a : 0.000099s : 0.04% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000055s : 0.02% optimize.opt_a.loop_unroll : 0.000039s : 0.02% optimize.opt_a.a_1 : 0.000890s : 0.37% optimize.opt_a.with_stream_mark : 0.000041s : 0.02% optimize.opt_a.recompute_prepare : 0.000018s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000237s : 0.10% optimize.opt_a.accelerated_algorithm : 0.000035s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000015s : 0.01% optimize.opt_a.merge_send_recv : 0.000019s : 0.01% optimize.opt_a.auto_parallel : 0.000019s : 0.01% optimize.opt_a.parallel : 0.000034s : 0.01% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.01% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000025s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000024s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000026s : 0.01% optimize.opt_a.a_after_grad : 0.000022s : 0.01% optimize.opt_a.renormalize : 0.226608s : 94.59% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.01% optimize.opt_a.cse : 0.000049s : 0.02% optimize.opt_a.a_3 : 0.000136s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000023s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000047s : 0.02% optimize.convert_after_rewriter : 0.000012s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.00% optimize.mutable_eliminate : 0.000830s : 0.35% optimize.opt_b.b_1 : 0.000205s : 0.09% optimize.opt_b.b_2 : 0.000010s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.01% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.000038s : 0.02% optimize.loop_unroll : 0.000497s : 0.21% optimize.opt_after_cconv.c_1 : 0.000039s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.01% optimize.tuple_transform.d_1 : 0.000056s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000058s : 0.02% optimize.cse_after_recomputation.cse : 0.000012s : 0.00% optimize.environ_conv : 0.000010s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.00% optimize.bias_add_comm_swap : 0.000007s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000005s : 0.00% optimize.slice_recompute_activation : 0.000007s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000010s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000009s : 0.00% optimize.overlap_grad_flash_sp : 0.000025s : 0.01% optimize.begin_end_overlap_inline : 0.000004s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000006s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000012s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000034s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000678s : 0.28% validate : 0.000049s : 0.02% Time group info: ------[substitution.] 0.000275 36 15.63% : 0.000043s : 6: substitution.arithmetic_simplify 1.04% : 0.000003s : 2: substitution.elim_not_effective 0.55% : 0.000002s : 2: substitution.fold_const_symbol 2.35% : 0.000006s : 4: substitution.graph_param_transform 62.95% : 0.000173s : 4: substitution.inline 2.17% : 0.000006s : 4: substitution.j_node_and_user_rematch 5.31% : 0.000015s : 2: substitution.less_batch_normalization 1.80% : 0.000005s : 4: substitution.remove_not_recompute_node 2.23% : 0.000006s : 4: substitution.replace_old_param 5.96% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007090 2 86.17% : 0.006109s : 1: type_inference.infer 13.83% : 0.000980s : 1: type_inference.specialize ------[replace.] 0.000068 8 63.23% : 0.000043s : 4: replace.inline 36.77% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000184 8 92.10% : 0.000170s : 4: match.inline 7.90% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000235 1278 0.99% : 0.000002s : 13: predicate.accumulaten_eliminater 0.81% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 0.85% : 0.000002s : 13: predicate.addn_zero_filter 0.74% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.76% : 0.000006s : 21: predicate.arithmetic_simplify 0.96% : 0.000002s : 13: predicate.cast_eliminate 0.71% : 0.000002s : 8: predicate.check_bprop_eliminate 0.57% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.69% : 0.000002s : 8: predicate.depend_value_elim 0.84% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.25% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.86% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.57% : 0.000004s : 17: predicate.environ_add_const_eliminate 1.00% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 17: predicate.environ_get_depend_swap 1.68% : 0.000004s : 25: predicate.environ_get_eliminate 1.26% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.29% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.25% : 0.000005s : 21: predicate.float_depend_g_call 0.54% : 0.000001s : 8: predicate.float_environ_get_switch 0.83% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.64% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.51% : 0.000001s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 6.17% : 0.000015s : 58: predicate.inline 0.78% : 0.000002s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.01% : 0.000002s : 8: predicate.less_batch_normalization 1.72% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.39% : 0.000006s : 38: predicate.load_eliminater 1.02% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.27% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.53% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 8: predicate.merge_addn 0.54% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.74% : 0.000002s : 13: predicate.minmaximum_grad 1.39% : 0.000003s : 4: predicate.mutable_eliminate 0.43% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 1.86% : 0.000004s : 21: predicate.partial_defer_inline 1.44% : 0.000003s : 21: predicate.partial_eliminate 1.05% : 0.000002s : 13: predicate.print_const_string_wrapper 0.59% : 0.000001s : 8: predicate.reduce_all_const_elim 1.26% : 0.000003s : 13: predicate.reduce_eliminate 2.52% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.76% : 0.000002s : 8: predicate.remove_not_recompute_node 1.34% : 0.000003s : 25: predicate.replace_applicator 0.54% : 0.000001s : 8: predicate.replace_old_param 0.40% : 0.000001s : 4: predicate.reset_defer_inline 0.94% : 0.000002s : 13: predicate.reshape_eliminate 0.72% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 4: predicate.row_tensor_eliminate 0.96% : 0.000002s : 8: predicate.same_eliminate 0.48% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.97% : 0.000002s : 8: predicate.shard_identity_eliminate 0.74% : 0.000002s : 8: predicate.special_op_eliminate 0.63% : 0.000001s : 8: predicate.specialize_transform 1.34% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.37% : 0.000003s : 21: predicate.switch_defer_inline 1.93% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.72% : 0.000011s : 67: predicate.switch_simplify 0.94% : 0.000002s : 13: predicate.tile_eliminate 0.98% : 0.000002s : 13: predicate.transpose_eliminate 1.56% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.44% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.26% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.75% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.73% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.20% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.01% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.55% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000839 11 46.74% : 0.000392s : 5: func_graph_cloner_run.FuncGraphClonerGraph 53.26% : 0.000447s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.709959 192 0.00% : 0.000007s : 1: ForceFp32Comm 0.55% : 0.003871s : 1: add_attr 0.54% : 0.003852s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.01% : 0.000061s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.01% : 0.000081s : 1: auto_monad 0.01% : 0.000043s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: begin_end_overlap_inline 0.00% : 0.000010s : 1: bias_add_comm_swap 0.08% : 0.000551s : 1: bootstrap 0.01% : 0.000042s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.00% : 0.000021s : 1: control_data_broadcast_order 0.00% : 0.000015s : 1: convert_after_rewriter 0.00% : 0.000033s : 1: cse_after_recomputation 0.00% : 0.000007s : 1: dataset_repeat_opt 0.00% : 0.000024s : 1: detach_backward 0.00% : 0.000013s : 1: environ_conv 0.00% : 0.000033s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.00% : 0.000012s : 1: graph_reusing 0.00% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.00% : 0.000010s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.07% : 0.000503s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.12% : 0.000836s : 1: mutable_eliminate 0.00% : 0.000011s : 1: offloading_packed_experts 0.00% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000018s : 1: opt.transform.mutable_eliminate 0.20% : 0.001394s : 78: opt.transform.opt_a 0.01% : 0.000037s : 1: opt.transform.opt_after_cconv 0.00% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.02% : 0.000129s : 28: opt.transform.opt_b 0.01% : 0.000061s : 2: opt.transform.opt_trans_graph 0.01% : 0.000050s : 4: opt.transform.symbol_engine_opt 32.31% : 0.229404s : 1: opt_a 0.03% : 0.000206s : 1: opt_after_cconv 0.10% : 0.000690s : 1: opt_after_jit_grad 0.05% : 0.000325s : 1: opt_b 32.79% : 0.232802s : 1: optimize 0.00% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000013s : 1: order_py_execute_after_rewriter 0.00% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000013s : 1: overlap_grad_ring_attention 0.00% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000014s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000009s : 1: overlap_recompute_comm 0.00% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.01% : 0.000049s : 1: pre_auto_parallel 0.01% : 0.000042s : 1: py_interpret_to_execute 0.00% : 0.000027s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000008s : 1: remove_cast_before_assign_add 0.00% : 0.000023s : 1: remove_dup_value 31.84% : 0.226024s : 1: renormalize.infer 0.08% : 0.000564s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000051s : 1: rewriter_after_opt_a 0.01% : 0.000103s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000010s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.00% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000184s : 1: symbol_engine_optimizer 0.02% : 0.000111s : 1: tuple_transform 1.02% : 0.007209s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:23.534.631 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.182792, [21] [bootstrap]: 0.164464 [type_inference]: 0.00707344 [event_method]: 2.252e-05 [auto_monad]: 7.195e-05 [graph_reusing]: 5.81998e-06 [inline]: 2.67001e-06 [add_attr]: 0.00411667, [1] [add_attr_with_inline]: 0.00410207, [1] [Cycle 1]: 0.00010073, [2] [tag_attr]: 2.535e-05 [meta_addattr_fg_expand]: 6.00002e-06 [parallel-infer-symbol]: 4.13999e-06 [pre_auto_parallel]: 4.69e-05 [insert-virtual-dataset]: 2.86e-06 [parallel-infer-symbol-second]: 7.80012e-07 [dataset_repeat_opt]: 1.97001e-06 [pipeline_split]: 1.85001e-06 [optimize]: 0.00603859, [53] [py_interpret_to_execute]: 0.00017892 [rewriter_before_opt_a]: 0.00010488 [opt_a]: 0.00323072, [2] [Cycle 1]: 0.00246894, [45] [expand_dump_flag]: 4.05e-06 [switch_simplify]: 4.784e-05 [loop_unroll]: 3.169e-05 [a_1]: 0.00072161 [with_stream_mark]: 2.039e-05 [recompute_prepare]: 1.148e-05 [updatestate_depend_eliminate]: 4.28999e-06 [updatestate_assign_eliminate]: 3.16999e-06 [updatestate_loads_eliminate]: 2.73e-06 [parameter_eliminate]: 2.09999e-06 [a_2]: 9.898e-05 [accelerated_algorithm]: 2.142e-05 [shard]: 2.34001e-06 [meta_shard_fg_expand]: 2.14e-06 [shard_inline]: 8.50001e-06 [merge_send_recv]: 8.86997e-06 [auto_parallel]: 7.55998e-06 [parallel]: 2.099e-05 [flash_sp]: 9.22999e-06 [merge_comm]: 3.96001e-06 [allreduce_fusion]: 3.33998e-06 [matmul_add_comm_reduction]: 1.223e-05 [allreduce_slice_to_reducescatter]: 8.80013e-07 [virtual_shard_identity]: 1.095e-05 [virtual_dataset]: 7.65998e-06 [get_grad_eliminate_]: 8.95999e-06 [virtual_output]: 6.85002e-06 [merge_forward]: 4.09997e-06 [cell_reuse_recompute_pass]: 1.47001e-06 [offload_activation]: 1.191e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.259e-05 [merge_recompute_call_nodes]: 1.71e-06 [before_grad]: 1.203e-05 [set_forward_comm_id_for_comm_node_pass]: 4.79002e-06 [meta_fg_expand]: 2.96999e-06 [flash_sp_send_recv_attached]: 5.05001e-06 [receive_attached]: 2.14e-06 [after_resolve]: 1.454e-05 [a_after_grad]: 1.011e-05 [renormalize]: 0.00091367 [add_forward_monad_depend]: 6.74001e-06 [auto_monad_grad]: 2.58e-06 [auto_monad_eliminator]: 1.838e-05 [cse]: 3.245e-05 [a_3]: 5.806e-05 [Cycle 2]: 0.00074795, [45] [expand_dump_flag]: 2.47001e-06 [switch_simplify]: 8.43001e-06 [loop_unroll]: 7.26001e-06 [a_1]: 0.00015695 [with_stream_mark]: 1.337e-05 [recompute_prepare]: 7.18998e-06 [updatestate_depend_eliminate]: 3.6e-06 [updatestate_assign_eliminate]: 2.66e-06 [updatestate_loads_eliminate]: 3.3e-06 [parameter_eliminate]: 1.60001e-06 [a_2]: 8.323e-05 [accelerated_algorithm]: 1.176e-05 [shard]: 1.66e-06 [meta_shard_fg_expand]: 1.58002e-06 [shard_inline]: 6.76999e-06 [merge_send_recv]: 6.36e-06 [auto_parallel]: 8.22998e-06 [parallel]: 6.71e-06 [flash_sp]: 4e-06 [merge_comm]: 3.85998e-06 [allreduce_fusion]: 3.6e-06 [matmul_add_comm_reduction]: 9.17001e-06 [allreduce_slice_to_reducescatter]: 8.09989e-07 [virtual_shard_identity]: 8.40001e-06 [virtual_dataset]: 7.01001e-06 [get_grad_eliminate_]: 6.60002e-06 [virtual_output]: 6.36e-06 [merge_forward]: 3.41001e-06 [cell_reuse_recompute_pass]: 2.63998e-06 [offload_activation]: 1.09e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.417e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.054e-05 [set_forward_comm_id_for_comm_node_pass]: 4.62998e-06 [meta_fg_expand]: 2.40002e-06 [flash_sp_send_recv_attached]: 1.62001e-06 [receive_attached]: 1.95001e-06 [after_resolve]: 1.325e-05 [a_after_grad]: 9.94001e-06 [renormalize]: 1.30007e-07 [add_forward_monad_depend]: 1.54e-06 [auto_monad_grad]: 1.57999e-06 [auto_monad_eliminator]: 9.25001e-06 [cse]: 1.829e-05 [a_3]: 3.864e-05 [py_interpret_to_execute_after_opt_a]: 1.483e-05 [slice_cell_reuse_recomputed_activation]: 2.21e-06 [rewriter_after_opt_a]: 4.1e-05 [convert_after_rewriter]: 6.71e-06 [order_py_execute_after_rewriter]: 6.07999e-06 [mutable_eliminate]: 0.00085576 [opt_b]: 0.00026215, [1] [Cycle 1]: 0.00025335, [7] [b_1]: 0.00015843 [b_2]: 1.034e-05 [updatestate_depend_eliminate]: 9.57001e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 2.91999e-06 [renormalize]: 1.13001e-06 [cse]: 2.721e-05 [optimize_parallel_all_gather_comm]: 1.974e-05 [overlap_param_gather]: 2.73e-06 [cconv]: 3.36e-05 [loop_unroll]: 0.00050478 [opt_after_cconv]: 0.00011968, [1] [Cycle 1]: 0.0001133, [7] [c_1]: 3.797e-05 [parameter_eliminate]: 5.02e-06 [updatestate_depend_eliminate]: 6.94001e-06 [updatestate_assign_eliminate]: 2.58e-06 [updatestate_loads_eliminate]: 3.13e-06 [cse]: 2.169e-05 [renormalize]: 3.99974e-07 [remove_dup_value]: 1.566e-05 [tuple_transform]: 9.689e-05, [1] [Cycle 1]: 9.159e-05, [4] [d_1]: 6.026e-05 [none_parameter_eliminate]: 1.94e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 9.67001e-06 [partial_unused_args_eliminate]: 1.85001e-06 [add_recomputation]: 5.443e-05 [cse_after_recomputation]: 2.454e-05, [1] [Cycle 1]: 2.023e-05, [1] [cse]: 1.333e-05 [environ_conv]: 6.43e-06 [swap_dp_allreduce_reducescatter]: 5.72001e-06 [bias_add_comm_swap]: 3.03998e-06 [label_micro_interleaved_index]: 5.02e-06 [label_fine_grained_interleaved_index]: 3.08e-06 [merge_cast_opt]: 1.27999e-06 [slice_recompute_activation]: 2.16e-06 [micro_interleaved_order_control]: 2.69001e-06 [assign_add_opt]: 1.22e-06 [ForceFp32Comm]: 9.20001e-07 [remove_cast_before_assign_add]: 1.17e-06 [full_micro_interleaved_order_control]: 2.51998e-06 [reorder_send_recv_between_fp_bp]: 2.74999e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 1.06997e-06 [interleave_split_concat_branches]: 1.25001e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.50999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.86e-06 [control_data_broadcast_order]: 1.392e-05 [grouped_pairwise_exchange_alltoall]: 1.60001e-06 [offloading_packed_experts]: 5.27001e-06 [overlap_recompute_and_grad_model_parallel]: 6.19999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.29003e-06 [overlap_recompute_allgather_and_fa_grad]: 1.50001e-06 [overlap_recompute_comm]: 2.49001e-06 [overlap_grad_ring_attention]: 4.97999e-06 [overlap_grad_flash_sp]: 2.357e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.31e-06 [split_layernorm_comm]: 2.23998e-06 [handle_group_info]: 1.04e-06 [symbol_engine_optimizer]: 9.272e-05, [1] [Cycle 1]: 8.76e-05, [6] [build]: 4.29002e-06 [elim_shapecalc]: 1.22e-05 [elim_not_effective]: 1.555e-05 [opt_reshape]: 8.95001e-06 [fold_const_symbol]: 1.254e-05 [renormalize]: 2.00002e-07 [detach_backward]: 2.91999e-06 [pipeline_parallel_scheduler]: 1.62999e-06 [auto_monad_reorder]: 1.926e-05 [get_jit_bprop_graph]: 3.34001e-06 [rewriter_after_jit_bprop_graph]: 6.83e-06 [opt_after_jit_grad]: 0.00065012 [validate]: 5.519e-05 Sums bootstrap : 0.164464s : 92.61% type_inference : 0.007073s : 3.98% event_method : 0.000023s : 0.01% auto_monad : 0.000072s : 0.04% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000047s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000179s : 0.10% optimize.rewriter_before_opt_a : 0.000105s : 0.06% optimize.opt_a.expand_dump_flag : 0.000007s : 0.00% optimize.opt_a.switch_simplify : 0.000056s : 0.03% optimize.opt_a.loop_unroll : 0.000039s : 0.02% optimize.opt_a.a_1 : 0.000879s : 0.49% optimize.opt_a.with_stream_mark : 0.000034s : 0.02% optimize.opt_a.recompute_prepare : 0.000019s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000182s : 0.10% optimize.opt_a.accelerated_algorithm : 0.000033s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000015s : 0.01% optimize.opt_a.merge_send_recv : 0.000015s : 0.01% optimize.opt_a.auto_parallel : 0.000016s : 0.01% optimize.opt_a.parallel : 0.000028s : 0.02% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.01% optimize.opt_a.virtual_dataset : 0.000015s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000028s : 0.02% optimize.opt_a.a_after_grad : 0.000020s : 0.01% optimize.opt_a.renormalize : 0.000914s : 0.51% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.02% optimize.opt_a.cse : 0.000051s : 0.03% optimize.opt_a.a_3 : 0.000097s : 0.05% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000041s : 0.02% optimize.convert_after_rewriter : 0.000007s : 0.00% optimize.order_py_execute_after_rewriter : 0.000006s : 0.00% optimize.mutable_eliminate : 0.000856s : 0.48% optimize.opt_b.b_1 : 0.000158s : 0.09% optimize.opt_b.b_2 : 0.000010s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.01% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000034s : 0.02% optimize.loop_unroll : 0.000505s : 0.28% optimize.opt_after_cconv.c_1 : 0.000038s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000022s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.01% optimize.tuple_transform.d_1 : 0.000060s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000054s : 0.03% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000006s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000024s : 0.01% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000019s : 0.01% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000650s : 0.37% validate : 0.000055s : 0.03% Time group info: ------[substitution.] 0.000268 36 13.39% : 0.000036s : 6: substitution.arithmetic_simplify 0.68% : 0.000002s : 2: substitution.elim_not_effective 0.90% : 0.000002s : 2: substitution.fold_const_symbol 2.56% : 0.000007s : 4: substitution.graph_param_transform 65.39% : 0.000175s : 4: substitution.inline 1.89% : 0.000005s : 4: substitution.j_node_and_user_rematch 4.72% : 0.000013s : 2: substitution.less_batch_normalization 1.91% : 0.000005s : 4: substitution.remove_not_recompute_node 2.09% : 0.000006s : 4: substitution.replace_old_param 6.47% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006995 2 86.98% : 0.006085s : 1: type_inference.infer 13.02% : 0.000910s : 1: type_inference.specialize ------[replace.] 0.000065 8 64.43% : 0.000042s : 4: replace.inline 35.57% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000189 8 91.72% : 0.000173s : 4: match.inline 8.28% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000232 1278 0.94% : 0.000002s : 13: predicate.accumulaten_eliminater 1.02% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 8: predicate.addn_check_dump 0.90% : 0.000002s : 13: predicate.addn_zero_filter 0.80% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.39% : 0.000006s : 21: predicate.arithmetic_simplify 0.96% : 0.000002s : 13: predicate.cast_eliminate 0.62% : 0.000001s : 8: predicate.check_bprop_eliminate 0.65% : 0.000002s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.53% : 0.000001s : 8: predicate.depend_value_elim 0.89% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.05% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.97% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.20% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 4: predicate.elim_not_effective 0.47% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 17: predicate.environ_add_const_eliminate 0.93% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_depend_swap 1.61% : 0.000004s : 25: predicate.environ_get_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.39% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.38% : 0.000006s : 21: predicate.float_depend_g_call 0.72% : 0.000002s : 8: predicate.float_environ_get_switch 0.99% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.63% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.53% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.23% : 0.000014s : 58: predicate.inline 0.70% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.92% : 0.000002s : 8: predicate.less_batch_normalization 1.72% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.49% : 0.000006s : 38: predicate.load_eliminater 0.85% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.19% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.73% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 8: predicate.merge_addn 0.59% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 13: predicate.minmaximum_grad 1.72% : 0.000004s : 4: predicate.mutable_eliminate 0.49% : 0.000001s : 4: predicate.opt_reshape 0.53% : 0.000001s : 4: predicate.parallel_virtual_node 1.70% : 0.000004s : 21: predicate.partial_defer_inline 1.44% : 0.000003s : 21: predicate.partial_eliminate 0.86% : 0.000002s : 13: predicate.print_const_string_wrapper 0.57% : 0.000001s : 8: predicate.reduce_all_const_elim 1.17% : 0.000003s : 13: predicate.reduce_eliminate 2.35% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 8: predicate.remove_not_recompute_node 1.24% : 0.000003s : 25: predicate.replace_applicator 0.69% : 0.000002s : 8: predicate.replace_old_param 0.26% : 0.000001s : 4: predicate.reset_defer_inline 0.92% : 0.000002s : 13: predicate.reshape_eliminate 0.62% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.57% : 0.000001s : 4: predicate.row_tensor_eliminate 1.17% : 0.000003s : 8: predicate.same_eliminate 0.39% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.02% : 0.000002s : 8: predicate.shard_identity_eliminate 0.77% : 0.000002s : 8: predicate.special_op_eliminate 0.59% : 0.000001s : 8: predicate.specialize_transform 1.05% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.42% : 0.000003s : 21: predicate.switch_defer_inline 1.93% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.88% : 0.000011s : 67: predicate.switch_simplify 0.91% : 0.000002s : 13: predicate.tile_eliminate 0.99% : 0.000002s : 13: predicate.transpose_eliminate 1.57% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.34% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.38% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.07% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.86% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.28% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.86% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 4: predicate.value_based_eliminate 0.77% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.55% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000693 11 50.86% : 0.000353s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.14% : 0.000341s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.195417 192 0.00% : 0.000004s : 1: ForceFp32Comm 2.11% : 0.004123s : 1: add_attr 2.10% : 0.004107s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000059s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000077s : 1: auto_monad 0.01% : 0.000025s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 84.18% : 0.164503s : 1: bootstrap 0.02% : 0.000037s : 1: cconv 0.00% : 0.000005s : 1: comm_op_add_attrs 0.01% : 0.000018s : 1: control_data_broadcast_order 0.01% : 0.000010s : 1: convert_after_rewriter 0.01% : 0.000028s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000008s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.02% : 0.000030s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000007s : 1: get_jit_bprop_graph 0.00% : 0.000009s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.26% : 0.000514s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.44% : 0.000869s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 0.01% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000023s : 1: opt.transform.mutable_eliminate 0.70% : 0.001374s : 78: opt.transform.opt_a 0.02% : 0.000037s : 1: opt.transform.opt_after_cconv 0.02% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000132s : 28: opt.transform.opt_b 0.03% : 0.000068s : 2: opt.transform.opt_trans_graph 0.02% : 0.000045s : 4: opt.transform.symbol_engine_opt 1.66% : 0.003235s : 1: opt_a 0.06% : 0.000124s : 1: opt_after_cconv 0.34% : 0.000664s : 1: opt_after_jit_grad 0.14% : 0.000266s : 1: opt_b 3.09% : 0.006045s : 1: optimize 0.01% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000009s : 1: order_py_execute_after_rewriter 0.01% : 0.000027s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000052s : 1: pre_auto_parallel 0.10% : 0.000189s : 1: py_interpret_to_execute 0.01% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 0.26% : 0.000500s : 1: renormalize.infer 0.21% : 0.000401s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000045s : 1: rewriter_after_opt_a 0.06% : 0.000111s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000096s : 1: symbol_engine_optimizer 0.05% : 0.000100s : 1: tuple_transform 3.63% : 0.007102s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:25.661.965 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:25.665.184 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.211316, [21] [bootstrap]: 0.00043058 [type_inference]: 0.198802 [event_method]: 2.228e-05 [auto_monad]: 7.291e-05 [graph_reusing]: 6.63e-06 [inline]: 2.99999e-06 [add_attr]: 0.00380557, [1] [add_attr_with_inline]: 0.0037938, [1] [Cycle 1]: 9.105e-05, [2] [tag_attr]: 2.232e-05 [meta_addattr_fg_expand]: 5.64e-06 [parallel-infer-symbol]: 3.78999e-06 [pre_auto_parallel]: 4.229e-05 [insert-virtual-dataset]: 2.91e-06 [parallel-infer-symbol-second]: 8.30012e-07 [dataset_repeat_opt]: 2.19001e-06 [pipeline_split]: 1.64998e-06 [optimize]: 0.0065405, [53] [py_interpret_to_execute]: 3.516e-05 [rewriter_before_opt_a]: 9.592e-05 [opt_a]: 0.00372035, [2] [Cycle 1]: 0.00277648, [45] [expand_dump_flag]: 3.33998e-06 [switch_simplify]: 4.405e-05 [loop_unroll]: 3.177e-05 [a_1]: 0.00068259 [with_stream_mark]: 2.122e-05 [recompute_prepare]: 9.65002e-06 [updatestate_depend_eliminate]: 4.58001e-06 [updatestate_assign_eliminate]: 3.55998e-06 [updatestate_loads_eliminate]: 2.93e-06 [parameter_eliminate]: 2.30002e-06 [a_2]: 0.00021543 [accelerated_algorithm]: 1.003e-05 [shard]: 2.39999e-06 [meta_shard_fg_expand]: 2.65002e-06 [shard_inline]: 7.9e-06 [merge_send_recv]: 2.81e-05 [auto_parallel]: 9.46e-06 [parallel]: 2.122e-05 [flash_sp]: 1.138e-05 [merge_comm]: 4.3e-06 [allreduce_fusion]: 3.50998e-06 [matmul_add_comm_reduction]: 1.062e-05 [allreduce_slice_to_reducescatter]: 9.20001e-07 [virtual_shard_identity]: 1.191e-05 [virtual_dataset]: 7.73999e-06 [get_grad_eliminate_]: 6.68e-06 [virtual_output]: 7e-06 [merge_forward]: 4.4e-06 [cell_reuse_recompute_pass]: 1.79e-06 [offload_activation]: 1.261e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.556e-05 [merge_recompute_call_nodes]: 2.27999e-06 [before_grad]: 1.387e-05 [set_forward_comm_id_for_comm_node_pass]: 5.15001e-06 [meta_fg_expand]: 3.68e-06 [flash_sp_send_recv_attached]: 3.31001e-06 [receive_attached]: 2.16998e-06 [after_resolve]: 1.419e-05 [a_after_grad]: 1.244e-05 [renormalize]: 0.00092904 [add_forward_monad_depend]: 7.45e-06 [auto_monad_grad]: 2.69001e-06 [auto_monad_eliminator]: 1.925e-05 [cse]: 3.086e-05 [a_3]: 7.104e-05 [Cycle 2]: 0.00092624, [45] [expand_dump_flag]: 2.46e-06 [switch_simplify]: 9.29e-06 [loop_unroll]: 7.02997e-06 [a_1]: 0.00013795 [with_stream_mark]: 1.549e-05 [recompute_prepare]: 7.56001e-06 [updatestate_depend_eliminate]: 3.66001e-06 [updatestate_assign_eliminate]: 2.81e-06 [updatestate_loads_eliminate]: 2.98e-06 [parameter_eliminate]: 1.59998e-06 [a_2]: 0.0001033 [accelerated_algorithm]: 6.76999e-06 [shard]: 1.89e-06 [meta_shard_fg_expand]: 1.77999e-06 [shard_inline]: 6.90002e-06 [merge_send_recv]: 7.01999e-06 [auto_parallel]: 8.13001e-06 [parallel]: 8.12e-06 [flash_sp]: 3.88999e-06 [merge_comm]: 3.41999e-06 [allreduce_fusion]: 3.26001e-06 [matmul_add_comm_reduction]: 8.45999e-06 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 7.97998e-06 [virtual_dataset]: 8.27e-06 [get_grad_eliminate_]: 6.59999e-06 [virtual_output]: 5.94e-06 [merge_forward]: 4.79e-06 [cell_reuse_recompute_pass]: 2.02001e-06 [offload_activation]: 9.82999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.966e-05 [merge_recompute_call_nodes]: 1.15001e-06 [before_grad]: 1.126e-05 [set_forward_comm_id_for_comm_node_pass]: 5.30999e-06 [meta_fg_expand]: 2.54001e-06 [flash_sp_send_recv_attached]: 1.68002e-06 [receive_attached]: 2.19001e-06 [after_resolve]: 1.181e-05 [a_after_grad]: 9.66003e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.89e-06 [auto_monad_grad]: 2.15002e-06 [auto_monad_eliminator]: 1.032e-05 [cse]: 1.905e-05 [a_3]: 5.165e-05 [py_interpret_to_execute_after_opt_a]: 1.738e-05 [slice_cell_reuse_recomputed_activation]: 4.64998e-06 [rewriter_after_opt_a]: 4.67e-05 [convert_after_rewriter]: 1.051e-05 [order_py_execute_after_rewriter]: 9.11998e-06 [mutable_eliminate]: 0.00078067 [opt_b]: 0.00033441, [1] [Cycle 1]: 0.00032349, [7] [b_1]: 0.00021156 [b_2]: 9.34e-06 [updatestate_depend_eliminate]: 8e-06 [updatestate_assign_eliminate]: 3.21001e-06 [updatestate_loads_eliminate]: 3.03e-06 [renormalize]: 5.59987e-07 [cse]: 2.21e-05 [optimize_parallel_all_gather_comm]: 2.076e-05 [overlap_param_gather]: 6.16998e-06 [cconv]: 3.115e-05 [loop_unroll]: 0.00051272 [opt_after_cconv]: 0.00013827, [1] [Cycle 1]: 0.00012859, [7] [c_1]: 3.76e-05 [parameter_eliminate]: 1.92999e-06 [updatestate_depend_eliminate]: 5.02e-06 [updatestate_assign_eliminate]: 2.76e-06 [updatestate_loads_eliminate]: 2.66e-06 [cse]: 1.896e-05 [renormalize]: 8.89995e-07 [remove_dup_value]: 1.954e-05 [tuple_transform]: 0.00010811, [1] [Cycle 1]: 9.959e-05, [4] [d_1]: 5.585e-05 [none_parameter_eliminate]: 1.93002e-06 [renormalize]: 2.40019e-07 [switch_simplify]: 7.48e-06 [partial_unused_args_eliminate]: 5.42999e-06 [add_recomputation]: 5.802e-05 [cse_after_recomputation]: 3.1e-05, [1] [Cycle 1]: 2.311e-05, [1] [cse]: 1.274e-05 [environ_conv]: 9.97999e-06 [swap_dp_allreduce_reducescatter]: 8.80999e-06 [bias_add_comm_swap]: 5.67999e-06 [label_micro_interleaved_index]: 7.36001e-06 [label_fine_grained_interleaved_index]: 5.32999e-06 [merge_cast_opt]: 4.05998e-06 [slice_recompute_activation]: 6.36e-06 [micro_interleaved_order_control]: 4.99e-06 [assign_add_opt]: 4.05998e-06 [ForceFp32Comm]: 3.73001e-06 [remove_cast_before_assign_add]: 4.29002e-06 [full_micro_interleaved_order_control]: 5.10999e-06 [reorder_send_recv_between_fp_bp]: 5.99e-06 [comm_op_add_attrs]: 4.02998e-06 [add_comm_op_reuse_tag]: 4.13999e-06 [interleave_split_concat_branches]: 3.73001e-06 [interleave_parallel_branches]: 4.64998e-06 [overlap_opt_shard_in_pipeline]: 4.21001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.37e-06 [control_data_broadcast_order]: 1.97e-05 [grouped_pairwise_exchange_alltoall]: 5.02999e-06 [offloading_packed_experts]: 8.53001e-06 [overlap_recompute_and_grad_model_parallel]: 9.02999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.95e-06 [overlap_recompute_allgather_and_fa_grad]: 3.97998e-06 [overlap_recompute_comm]: 5.39998e-06 [overlap_grad_ring_attention]: 8.77e-06 [overlap_grad_flash_sp]: 2.486e-05 [begin_end_overlap_inline]: 4.26001e-06 [split_matmul_comm_elemetwise]: 5.40999e-06 [split_layernorm_comm]: 4.74e-06 [handle_group_info]: 3.86001e-06 [symbol_engine_optimizer]: 0.00011504, [1] [Cycle 1]: 0.00010648, [6] [build]: 4.07e-06 [elim_shapecalc]: 1.707e-05 [elim_not_effective]: 1.492e-05 [opt_reshape]: 7.99002e-06 [fold_const_symbol]: 1.208e-05 [renormalize]: 3.50003e-07 [detach_backward]: 4.04997e-06 [pipeline_parallel_scheduler]: 1.89e-06 [auto_monad_reorder]: 2.134e-05 [get_jit_bprop_graph]: 1.66998e-06 [rewriter_after_jit_bprop_graph]: 6.16998e-06 [opt_after_jit_grad]: 0.00076041 [validate]: 4.035e-05 Sums bootstrap : 0.000431s : 0.21% type_inference : 0.198802s : 96.75% event_method : 0.000022s : 0.01% auto_monad : 0.000073s : 0.04% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000042s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000035s : 0.02% optimize.rewriter_before_opt_a : 0.000096s : 0.05% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000053s : 0.03% optimize.opt_a.loop_unroll : 0.000039s : 0.02% optimize.opt_a.a_1 : 0.000821s : 0.40% optimize.opt_a.with_stream_mark : 0.000037s : 0.02% optimize.opt_a.recompute_prepare : 0.000017s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000319s : 0.16% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000015s : 0.01% optimize.opt_a.merge_send_recv : 0.000035s : 0.02% optimize.opt_a.auto_parallel : 0.000018s : 0.01% optimize.opt_a.parallel : 0.000029s : 0.01% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.01% optimize.opt_a.virtual_dataset : 0.000016s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000022s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000065s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000025s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000026s : 0.01% optimize.opt_a.a_after_grad : 0.000022s : 0.01% optimize.opt_a.renormalize : 0.000929s : 0.45% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.00% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.01% optimize.opt_a.cse : 0.000050s : 0.02% optimize.opt_a.a_3 : 0.000123s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000047s : 0.02% optimize.convert_after_rewriter : 0.000011s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.00% optimize.mutable_eliminate : 0.000781s : 0.38% optimize.opt_b.b_1 : 0.000212s : 0.10% optimize.opt_b.b_2 : 0.000009s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.01% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.000031s : 0.02% optimize.loop_unroll : 0.000513s : 0.25% optimize.opt_after_cconv.c_1 : 0.000038s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.01% optimize.tuple_transform.d_1 : 0.000056s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000058s : 0.03% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000010s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.00% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000006s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000005s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000020s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.00% optimize.offloading_packed_experts : 0.000009s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000009s : 0.00% optimize.overlap_grad_flash_sp : 0.000025s : 0.01% optimize.begin_end_overlap_inline : 0.000004s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000005s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000760s : 0.37% validate : 0.000040s : 0.02% Time group info: ------[substitution.] 0.000218 28 1.43% : 0.000003s : 2: substitution.elim_not_effective 0.66% : 0.000001s : 2: substitution.fold_const_symbol 3.12% : 0.000007s : 4: substitution.graph_param_transform 79.02% : 0.000172s : 4: substitution.inline 2.56% : 0.000006s : 4: substitution.j_node_and_user_rematch 3.12% : 0.000007s : 4: substitution.remove_not_recompute_node 2.93% : 0.000006s : 4: substitution.replace_old_param 7.17% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.198735 2 99.50% : 0.197742s : 1: type_inference.infer 0.50% : 0.000994s : 1: type_inference.specialize ------[replace.] 0.000068 8 64.70% : 0.000044s : 4: replace.inline 35.30% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000184 8 92.58% : 0.000170s : 4: match.inline 7.42% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000220 1278 1.01% : 0.000002s : 13: predicate.accumulaten_eliminater 0.78% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 8: predicate.addn_check_dump 0.87% : 0.000002s : 13: predicate.addn_zero_filter 0.77% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.03% : 0.000004s : 21: predicate.arithmetic_simplify 0.89% : 0.000002s : 13: predicate.cast_eliminate 0.58% : 0.000001s : 8: predicate.check_bprop_eliminate 0.66% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.62% : 0.000001s : 8: predicate.depend_value_elim 0.88% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.97% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.95% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.97% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 4: predicate.elim_not_effective 0.52% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.12% : 0.000002s : 17: predicate.environ_get_depend_swap 1.63% : 0.000004s : 25: predicate.environ_get_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.41% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.38% : 0.000005s : 21: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.86% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.69% : 0.000002s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.56% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.19% : 0.000014s : 58: predicate.inline 0.73% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.12% : 0.000002s : 8: predicate.less_batch_normalization 1.88% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.45% : 0.000005s : 38: predicate.load_eliminater 0.75% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.36% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.54% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.49% : 0.000001s : 8: predicate.merge_addn 0.59% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 13: predicate.minmaximum_grad 1.44% : 0.000003s : 4: predicate.mutable_eliminate 0.43% : 0.000001s : 4: predicate.opt_reshape 0.40% : 0.000001s : 4: predicate.parallel_virtual_node 1.91% : 0.000004s : 21: predicate.partial_defer_inline 1.57% : 0.000003s : 21: predicate.partial_eliminate 0.89% : 0.000002s : 13: predicate.print_const_string_wrapper 0.89% : 0.000002s : 8: predicate.reduce_all_const_elim 1.30% : 0.000003s : 13: predicate.reduce_eliminate 2.57% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 8: predicate.remove_not_recompute_node 1.48% : 0.000003s : 25: predicate.replace_applicator 0.56% : 0.000001s : 8: predicate.replace_old_param 0.24% : 0.000001s : 4: predicate.reset_defer_inline 1.07% : 0.000002s : 13: predicate.reshape_eliminate 0.62% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.61% : 0.000001s : 4: predicate.row_tensor_eliminate 0.90% : 0.000002s : 8: predicate.same_eliminate 0.52% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 8: predicate.shard_identity_eliminate 0.69% : 0.000002s : 8: predicate.special_op_eliminate 0.64% : 0.000001s : 8: predicate.specialize_transform 0.99% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 1.04% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.51% : 0.000003s : 21: predicate.switch_defer_inline 2.13% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.87% : 0.000011s : 67: predicate.switch_simplify 0.84% : 0.000002s : 13: predicate.tile_eliminate 0.87% : 0.000002s : 13: predicate.transpose_eliminate 1.48% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.38% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.37% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.19% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.95% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.34% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.15% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 4: predicate.value_based_eliminate 0.88% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000708 11 48.55% : 0.000344s : 5: func_graph_cloner_run.FuncGraphClonerGraph 51.45% : 0.000364s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.224142 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.70% : 0.003817s : 1: add_attr 1.69% : 0.003798s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.03% : 0.000062s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.04% : 0.000082s : 1: auto_monad 0.01% : 0.000029s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.21% : 0.000478s : 1: bootstrap 0.02% : 0.000034s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000024s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.02% : 0.000034s : 1: cse_after_recomputation 0.00% : 0.000009s : 1: dataset_repeat_opt 0.01% : 0.000022s : 1: detach_backward 0.01% : 0.000013s : 1: environ_conv 0.01% : 0.000033s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.23% : 0.000520s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.35% : 0.000789s : 1: mutable_eliminate 0.01% : 0.000012s : 1: offloading_packed_experts 0.01% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000017s : 1: opt.transform.mutable_eliminate 0.61% : 0.001367s : 78: opt.transform.opt_a 0.02% : 0.000036s : 1: opt.transform.opt_after_cconv 0.01% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000128s : 28: opt.transform.opt_b 0.03% : 0.000060s : 2: opt.transform.opt_trans_graph 0.02% : 0.000048s : 4: opt.transform.symbol_engine_opt 1.66% : 0.003724s : 1: opt_a 0.06% : 0.000142s : 1: opt_after_cconv 0.35% : 0.000774s : 1: opt_after_jit_grad 0.15% : 0.000338s : 1: opt_b 3.13% : 0.007005s : 1: optimize 0.01% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.01% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000010s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000009s : 1: overlap_recompute_comm 0.00% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000009s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000051s : 1: pre_auto_parallel 0.02% : 0.000039s : 1: py_interpret_to_execute 0.01% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000008s : 1: remove_cast_before_assign_add 0.01% : 0.000023s : 1: remove_dup_value 0.23% : 0.000516s : 1: renormalize.infer 0.18% : 0.000402s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000050s : 1: rewriter_after_opt_a 0.04% : 0.000100s : 1: rewriter_before_opt_a 0.00% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000009s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000118s : 1: symbol_engine_optimizer 0.05% : 0.000111s : 1: tuple_transform 88.72% : 0.198856s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:28.717.105 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.126156, [21] [bootstrap]: 0.00046217 [type_inference]: 0.0067165 [event_method]: 2.219e-05 [auto_monad]: 6.821e-05 [graph_reusing]: 6.27001e-06 [inline]: 2.27999e-06 [add_attr]: 0.00383593, [1] [add_attr_with_inline]: 0.00382297, [1] [Cycle 1]: 8.331e-05, [2] [tag_attr]: 2.669e-05 [meta_addattr_fg_expand]: 6.31e-06 [parallel-infer-symbol]: 4.42e-06 [pre_auto_parallel]: 4.439e-05 [insert-virtual-dataset]: 2.41998e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.97001e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00611361, [53] [py_interpret_to_execute]: 3.296e-05 [rewriter_before_opt_a]: 9.527e-05 [opt_a]: 0.00326116, [2] [Cycle 1]: 0.00245772, [45] [expand_dump_flag]: 3.33e-06 [switch_simplify]: 4.52e-05 [loop_unroll]: 3.097e-05 [a_1]: 0.00068864 [with_stream_mark]: 2.345e-05 [recompute_prepare]: 1.016e-05 [updatestate_depend_eliminate]: 4.67e-06 [updatestate_assign_eliminate]: 3.31001e-06 [updatestate_loads_eliminate]: 3.19001e-06 [parameter_eliminate]: 1.92001e-06 [a_2]: 8.734e-05 [accelerated_algorithm]: 8.57e-06 [shard]: 2.22999e-06 [meta_shard_fg_expand]: 1.89e-06 [shard_inline]: 6.51999e-06 [merge_send_recv]: 8.96002e-06 [auto_parallel]: 7.87e-06 [parallel]: 2.082e-05 [flash_sp]: 1.038e-05 [merge_comm]: 4.74e-06 [allreduce_fusion]: 4.12e-06 [matmul_add_comm_reduction]: 1.012e-05 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 9.61e-06 [virtual_dataset]: 7.04001e-06 [get_grad_eliminate_]: 7.07997e-06 [virtual_output]: 7.23e-06 [merge_forward]: 5.07e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 1.032e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.497e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.256e-05 [set_forward_comm_id_for_comm_node_pass]: 3.94002e-06 [meta_fg_expand]: 3.21999e-06 [flash_sp_send_recv_attached]: 3.24001e-06 [receive_attached]: 2.07999e-06 [after_resolve]: 1.649e-05 [a_after_grad]: 1.371e-05 [renormalize]: 0.0009457 [add_forward_monad_depend]: 8.19998e-06 [auto_monad_grad]: 2.81e-06 [auto_monad_eliminator]: 2.033e-05 [cse]: 3.26e-05 [a_3]: 5.941e-05 [Cycle 2]: 0.00079002, [45] [expand_dump_flag]: 2.48998e-06 [switch_simplify]: 8.58001e-06 [loop_unroll]: 7.09001e-06 [a_1]: 0.00014432 [with_stream_mark]: 4.952e-05 [recompute_prepare]: 8.66002e-06 [updatestate_depend_eliminate]: 3.39001e-06 [updatestate_assign_eliminate]: 2.94001e-06 [updatestate_loads_eliminate]: 3.73999e-06 [parameter_eliminate]: 1.74998e-06 [a_2]: 7.894e-05 [accelerated_algorithm]: 6.89999e-06 [shard]: 2.16998e-06 [meta_shard_fg_expand]: 1.71998e-06 [shard_inline]: 6.66e-06 [merge_send_recv]: 7.16001e-06 [auto_parallel]: 8.28001e-06 [parallel]: 8.03001e-06 [flash_sp]: 4.00998e-06 [merge_comm]: 3.66001e-06 [allreduce_fusion]: 4.13999e-06 [matmul_add_comm_reduction]: 9.12001e-06 [allreduce_slice_to_reducescatter]: 7.10017e-07 [virtual_shard_identity]: 8.64e-06 [virtual_dataset]: 6.78998e-06 [get_grad_eliminate_]: 6.83e-06 [virtual_output]: 6.69001e-06 [merge_forward]: 3.75998e-06 [cell_reuse_recompute_pass]: 3.28e-06 [offload_activation]: 1.046e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.774e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.144e-05 [set_forward_comm_id_for_comm_node_pass]: 3.91001e-06 [meta_fg_expand]: 2.66e-06 [flash_sp_send_recv_attached]: 1.47999e-06 [receive_attached]: 2.69999e-06 [after_resolve]: 1.24e-05 [a_after_grad]: 1.239e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.07001e-06 [auto_monad_grad]: 1.99e-06 [auto_monad_eliminator]: 9.12999e-06 [cse]: 1.679e-05 [a_3]: 4.225e-05 [py_interpret_to_execute_after_opt_a]: 1.598e-05 [slice_cell_reuse_recomputed_activation]: 2.14e-06 [rewriter_after_opt_a]: 4.677e-05 [convert_after_rewriter]: 7.74002e-06 [order_py_execute_after_rewriter]: 5.52999e-06 [mutable_eliminate]: 0.00085894 [opt_b]: 0.00027141, [1] [Cycle 1]: 0.00026137, [7] [b_1]: 0.00015911 [b_2]: 9.87001e-06 [updatestate_depend_eliminate]: 1.023e-05 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 2.79001e-06 [renormalize]: 7.89994e-07 [cse]: 2.907e-05 [optimize_parallel_all_gather_comm]: 2.05e-05 [overlap_param_gather]: 2.86e-06 [cconv]: 3.614e-05 [loop_unroll]: 0.00065555 [opt_after_cconv]: 0.00013596, [1] [Cycle 1]: 0.0001269, [7] [c_1]: 4.167e-05 [parameter_eliminate]: 5.40001e-06 [updatestate_depend_eliminate]: 7.89997e-06 [updatestate_assign_eliminate]: 2.89999e-06 [updatestate_loads_eliminate]: 2.56e-06 [cse]: 2.356e-05 [renormalize]: 3.4002e-07 [remove_dup_value]: 1.648e-05 [tuple_transform]: 0.00010118, [1] [Cycle 1]: 9.586e-05, [4] [d_1]: 6.262e-05 [none_parameter_eliminate]: 1.72001e-06 [renormalize]: 1.79978e-07 [switch_simplify]: 8.77e-06 [partial_unused_args_eliminate]: 2.20002e-06 [add_recomputation]: 6.076e-05 [cse_after_recomputation]: 2.41e-05, [1] [Cycle 1]: 1.924e-05, [1] [cse]: 1.299e-05 [environ_conv]: 5.84e-06 [swap_dp_allreduce_reducescatter]: 5.30999e-06 [bias_add_comm_swap]: 3.64002e-06 [label_micro_interleaved_index]: 6.10002e-06 [label_fine_grained_interleaved_index]: 3.03e-06 [merge_cast_opt]: 1.47001e-06 [slice_recompute_activation]: 2.09999e-06 [micro_interleaved_order_control]: 2.74001e-06 [assign_add_opt]: 1.54e-06 [ForceFp32Comm]: 9.00007e-07 [remove_cast_before_assign_add]: 1.40001e-06 [full_micro_interleaved_order_control]: 2.05002e-06 [reorder_send_recv_between_fp_bp]: 2.87002e-06 [comm_op_add_attrs]: 9.79984e-07 [add_comm_op_reuse_tag]: 9.5999e-07 [interleave_split_concat_branches]: 1.20999e-06 [interleave_parallel_branches]: 1.06997e-06 [overlap_opt_shard_in_pipeline]: 1.47999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.19001e-06 [control_data_broadcast_order]: 1.663e-05 [grouped_pairwise_exchange_alltoall]: 2.50002e-06 [offloading_packed_experts]: 4.18001e-06 [overlap_recompute_and_grad_model_parallel]: 5.37001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.49e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.27999e-06 [overlap_grad_ring_attention]: 4.46002e-06 [overlap_grad_flash_sp]: 2.326e-05 [begin_end_overlap_inline]: 9.00007e-07 [split_matmul_comm_elemetwise]: 2.66e-06 [split_layernorm_comm]: 1.65001e-06 [handle_group_info]: 1.68002e-06 [symbol_engine_optimizer]: 9.342e-05, [1] [Cycle 1]: 8.852e-05, [6] [build]: 4.27e-06 [elim_shapecalc]: 1.303e-05 [elim_not_effective]: 1.564e-05 [opt_reshape]: 8.75001e-06 [fold_const_symbol]: 1.136e-05 [renormalize]: 1.60013e-07 [detach_backward]: 2.44999e-06 [pipeline_parallel_scheduler]: 1.74e-06 [auto_monad_reorder]: 1.923e-05 [get_jit_bprop_graph]: 2.04e-06 [rewriter_after_jit_bprop_graph]: 6.81999e-06 [opt_after_jit_grad]: 0.108579 [validate]: 7.205e-05 Sums bootstrap : 0.000462s : 0.38% type_inference : 0.006717s : 5.54% event_method : 0.000022s : 0.02% auto_monad : 0.000068s : 0.06% graph_reusing : 0.000006s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000044s : 0.04% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.03% optimize.rewriter_before_opt_a : 0.000095s : 0.08% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000054s : 0.04% optimize.opt_a.loop_unroll : 0.000038s : 0.03% optimize.opt_a.a_1 : 0.000833s : 0.69% optimize.opt_a.with_stream_mark : 0.000073s : 0.06% optimize.opt_a.recompute_prepare : 0.000019s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000166s : 0.14% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000016s : 0.01% optimize.opt_a.auto_parallel : 0.000016s : 0.01% optimize.opt_a.parallel : 0.000029s : 0.02% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.02% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.01% optimize.opt_a.virtual_output : 0.000014s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000021s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000024s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000029s : 0.02% optimize.opt_a.a_after_grad : 0.000026s : 0.02% optimize.opt_a.renormalize : 0.000946s : 0.78% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.02% optimize.opt_a.cse : 0.000049s : 0.04% optimize.opt_a.a_3 : 0.000102s : 0.08% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000047s : 0.04% optimize.convert_after_rewriter : 0.000008s : 0.01% optimize.order_py_execute_after_rewriter : 0.000006s : 0.00% optimize.mutable_eliminate : 0.000859s : 0.71% optimize.opt_b.b_1 : 0.000159s : 0.13% optimize.opt_b.b_2 : 0.000010s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000029s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.02% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000036s : 0.03% optimize.loop_unroll : 0.000656s : 0.54% optimize.opt_after_cconv.c_1 : 0.000042s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.01% optimize.tuple_transform.d_1 : 0.000063s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000061s : 0.05% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000006s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000003s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000023s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000002s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000019s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.108579s : 89.59% validate : 0.000072s : 0.06% Time group info: ------[substitution.] 0.000225 28 0.89% : 0.000002s : 2: substitution.elim_not_effective 1.04% : 0.000002s : 2: substitution.fold_const_symbol 3.36% : 0.000008s : 4: substitution.graph_param_transform 79.91% : 0.000180s : 4: substitution.inline 2.04% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.72% : 0.000006s : 4: substitution.remove_not_recompute_node 2.70% : 0.000006s : 4: substitution.replace_old_param 7.32% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006643 2 87.02% : 0.005781s : 1: type_inference.infer 12.98% : 0.000862s : 1: type_inference.specialize ------[replace.] 0.000067 8 64.68% : 0.000044s : 4: replace.inline 35.32% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000192 8 92.41% : 0.000177s : 4: match.inline 7.59% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000239 1278 0.82% : 0.000002s : 13: predicate.accumulaten_eliminater 3.24% : 0.000008s : 4: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 8: predicate.addn_check_dump 0.91% : 0.000002s : 13: predicate.addn_zero_filter 0.75% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.95% : 0.000005s : 21: predicate.arithmetic_simplify 1.01% : 0.000002s : 13: predicate.cast_eliminate 0.58% : 0.000001s : 8: predicate.check_bprop_eliminate 0.46% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.63% : 0.000001s : 8: predicate.depend_value_elim 0.79% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.11% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.79% : 0.000002s : 13: predicate.dict_set_item_eliminator 2.54% : 0.000006s : 8: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 4: predicate.elim_not_effective 0.49% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 0.99% : 0.000002s : 17: predicate.environ_add_const_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 17: predicate.environ_get_depend_swap 1.63% : 0.000004s : 25: predicate.environ_get_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.27% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.31% : 0.000006s : 21: predicate.float_depend_g_call 0.52% : 0.000001s : 8: predicate.float_environ_get_switch 0.76% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.65% : 0.000002s : 8: predicate.get_grad_eliminate 0.29% : 0.000001s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.05% : 0.000014s : 58: predicate.inline 0.66% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 8: predicate.less_batch_normalization 1.81% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.38% : 0.000006s : 38: predicate.load_eliminater 1.38% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.26% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.58% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.54% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.73% : 0.000002s : 13: predicate.minmaximum_grad 1.75% : 0.000004s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.38% : 0.000001s : 4: predicate.parallel_virtual_node 1.72% : 0.000004s : 21: predicate.partial_defer_inline 1.38% : 0.000003s : 21: predicate.partial_eliminate 0.85% : 0.000002s : 13: predicate.print_const_string_wrapper 0.69% : 0.000002s : 8: predicate.reduce_all_const_elim 1.18% : 0.000003s : 13: predicate.reduce_eliminate 2.26% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 8: predicate.remove_not_recompute_node 1.18% : 0.000003s : 25: predicate.replace_applicator 0.46% : 0.000001s : 8: predicate.replace_old_param 0.32% : 0.000001s : 4: predicate.reset_defer_inline 0.90% : 0.000002s : 13: predicate.reshape_eliminate 0.63% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.81% : 0.000002s : 8: predicate.same_eliminate 0.38% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 8: predicate.shard_identity_eliminate 1.14% : 0.000003s : 8: predicate.special_op_eliminate 0.56% : 0.000001s : 8: predicate.specialize_transform 1.05% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.50% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.30% : 0.000003s : 21: predicate.switch_defer_inline 1.75% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.54% : 0.000011s : 67: predicate.switch_simplify 0.87% : 0.000002s : 13: predicate.tile_eliminate 0.90% : 0.000002s : 13: predicate.transpose_eliminate 1.51% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.37% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.37% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.56% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.62% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.16% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.76% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.69% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.63% : 0.000002s : 8: predicate.virtual_output_eliminate 0.31% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000693 11 49.82% : 0.000345s : 5: func_graph_cloner_run.FuncGraphClonerGraph 50.18% : 0.000348s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.246384 192 0.00% : 0.000003s : 1: ForceFp32Comm 1.56% : 0.003842s : 1: add_attr 1.55% : 0.003827s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000065s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.03% : 0.000073s : 1: auto_monad 0.01% : 0.000024s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.20% : 0.000493s : 1: bootstrap 0.02% : 0.000040s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000020s : 1: control_data_broadcast_order 0.00% : 0.000011s : 1: convert_after_rewriter 0.01% : 0.000027s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000009s : 1: environ_conv 0.01% : 0.000029s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.27% : 0.000667s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.36% : 0.000875s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000027s : 1: opt.transform.mutable_eliminate 0.53% : 0.001305s : 78: opt.transform.opt_a 0.02% : 0.000040s : 1: opt.transform.opt_after_cconv 43.79% : 0.107886s : 1: opt.transform.opt_after_jit_grad 0.05% : 0.000131s : 28: opt.transform.opt_b 0.03% : 0.000069s : 2: opt.transform.opt_trans_graph 0.02% : 0.000045s : 4: opt.transform.symbol_engine_opt 1.33% : 0.003265s : 1: opt_a 0.06% : 0.000140s : 1: opt_after_cconv 44.08% : 0.108605s : 1: opt_after_jit_grad 0.11% : 0.000276s : 1: opt_b 2.48% : 0.006121s : 1: optimize 0.01% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000009s : 1: order_py_execute_after_rewriter 0.01% : 0.000027s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000009s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000048s : 1: pre_auto_parallel 0.02% : 0.000037s : 1: py_interpret_to_execute 0.01% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 0.19% : 0.000479s : 1: renormalize.infer 0.18% : 0.000456s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000051s : 1: rewriter_after_opt_a 0.04% : 0.000099s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000096s : 1: symbol_engine_optimizer 0.04% : 0.000104s : 1: tuple_transform 2.73% : 0.006738s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:30.515.594 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:30.515.876 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.132155, [21] [bootstrap]: 0.00044404 [type_inference]: 0.0592878 [event_method]: 2.129e-05 [auto_monad]: 6.893e-05 [graph_reusing]: 6.09001e-06 [inline]: 2.41998e-06 [add_attr]: 0.00365739, [1] [add_attr_with_inline]: 0.00364441, [1] [Cycle 1]: 9.409e-05, [2] [tag_attr]: 2.373e-05 [meta_addattr_fg_expand]: 6.09001e-06 [parallel-infer-symbol]: 4.03999e-06 [pre_auto_parallel]: 4.391e-05 [insert-virtual-dataset]: 2.47001e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 2.09999e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.0522994, [53] [py_interpret_to_execute]: 3.718e-05 [rewriter_before_opt_a]: 9.81e-05 [opt_a]: 0.0489463, [2] [Cycle 1]: 0.0478319, [45] [expand_dump_flag]: 2.84999e-06 [switch_simplify]: 4.433e-05 [loop_unroll]: 3.249e-05 [a_1]: 0.000756 [with_stream_mark]: 2.085e-05 [recompute_prepare]: 1.299e-05 [updatestate_depend_eliminate]: 5.29e-06 [updatestate_assign_eliminate]: 4.10998e-06 [updatestate_loads_eliminate]: 4.38001e-06 [parameter_eliminate]: 2.07999e-06 [a_2]: 0.0001501 [accelerated_algorithm]: 1e-05 [shard]: 2.26998e-06 [meta_shard_fg_expand]: 2.22999e-06 [shard_inline]: 8.35999e-06 [merge_send_recv]: 1.206e-05 [auto_parallel]: 8.92e-06 [parallel]: 2.097e-05 [flash_sp]: 1.206e-05 [merge_comm]: 5.33002e-06 [allreduce_fusion]: 5.09e-06 [matmul_add_comm_reduction]: 1.299e-05 [allreduce_slice_to_reducescatter]: 1.35999e-06 [virtual_shard_identity]: 1.133e-05 [virtual_dataset]: 9.87001e-06 [get_grad_eliminate_]: 9.28002e-06 [virtual_output]: 9.22999e-06 [merge_forward]: 5.39e-06 [cell_reuse_recompute_pass]: 1.62999e-06 [offload_activation]: 1.242e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.269e-05 [merge_recompute_call_nodes]: 1.50001e-06 [before_grad]: 1.569e-05 [set_forward_comm_id_for_comm_node_pass]: 5.02999e-06 [meta_fg_expand]: 3.38999e-06 [flash_sp_send_recv_attached]: 3.23e-06 [receive_attached]: 2.49999e-06 [after_resolve]: 1.547e-05 [a_after_grad]: 1.524e-05 [renormalize]: 0.0458897 [add_forward_monad_depend]: 1.361e-05 [auto_monad_grad]: 3.21999e-06 [auto_monad_eliminator]: 2.933e-05 [cse]: 4.299e-05 [a_3]: 9.561e-05 [Cycle 2]: 0.00109586, [45] [expand_dump_flag]: 2.68e-06 [switch_simplify]: 1.167e-05 [loop_unroll]: 8.33001e-06 [a_1]: 0.00020575 [with_stream_mark]: 2.333e-05 [recompute_prepare]: 9.50001e-06 [updatestate_depend_eliminate]: 5.58002e-06 [updatestate_assign_eliminate]: 4.21001e-06 [updatestate_loads_eliminate]: 3.75e-06 [parameter_eliminate]: 2.24999e-06 [a_2]: 0.00012593 [accelerated_algorithm]: 8.30999e-06 [shard]: 2.63e-06 [meta_shard_fg_expand]: 2.53e-06 [shard_inline]: 7.77e-06 [merge_send_recv]: 1.099e-05 [auto_parallel]: 1.262e-05 [parallel]: 9.50001e-06 [flash_sp]: 4.75001e-06 [merge_comm]: 5.99999e-06 [allreduce_fusion]: 4.28999e-06 [matmul_add_comm_reduction]: 1.198e-05 [allreduce_slice_to_reducescatter]: 1.22e-06 [virtual_shard_identity]: 1.072e-05 [virtual_dataset]: 9.31998e-06 [get_grad_eliminate_]: 7.9e-06 [virtual_output]: 7.95e-06 [merge_forward]: 5.07999e-06 [cell_reuse_recompute_pass]: 3.80998e-06 [offload_activation]: 1.177e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.967e-05 [merge_recompute_call_nodes]: 1.57999e-06 [before_grad]: 1.465e-05 [set_forward_comm_id_for_comm_node_pass]: 6.39001e-06 [meta_fg_expand]: 3.81001e-06 [flash_sp_send_recv_attached]: 2.06e-06 [receive_attached]: 2.53003e-06 [after_resolve]: 1.571e-05 [a_after_grad]: 1.312e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.97999e-06 [auto_monad_grad]: 2.51e-06 [auto_monad_eliminator]: 1.541e-05 [cse]: 2.677e-05 [a_3]: 6.803e-05 [py_interpret_to_execute_after_opt_a]: 2.683e-05 [slice_cell_reuse_recomputed_activation]: 5.40999e-06 [rewriter_after_opt_a]: 5.723e-05 [convert_after_rewriter]: 1.368e-05 [order_py_execute_after_rewriter]: 9.39e-06 [mutable_eliminate]: 0.00088414 [opt_b]: 0.00045031, [1] [Cycle 1]: 0.00043533, [7] [b_1]: 0.00024745 [b_2]: 1.308e-05 [updatestate_depend_eliminate]: 1.455e-05 [updatestate_assign_eliminate]: 4.43999e-06 [updatestate_loads_eliminate]: 4.03001e-06 [renormalize]: 9.90025e-07 [cse]: 3.655e-05 [optimize_parallel_all_gather_comm]: 2.784e-05 [overlap_param_gather]: 5.81998e-06 [cconv]: 4.508e-05 [loop_unroll]: 0.00060285 [opt_after_cconv]: 0.00016767, [1] [Cycle 1]: 0.00015688, [7] [c_1]: 4.599e-05 [parameter_eliminate]: 5.16002e-06 [updatestate_depend_eliminate]: 8.79e-06 [updatestate_assign_eliminate]: 3.5e-06 [updatestate_loads_eliminate]: 3.3e-06 [cse]: 3.012e-05 [renormalize]: 3.49974e-07 [remove_dup_value]: 2.293e-05 [tuple_transform]: 0.00021096, [1] [Cycle 1]: 0.0002027, [4] [d_1]: 0.00014634 [none_parameter_eliminate]: 2.89001e-06 [renormalize]: 2.80008e-07 [switch_simplify]: 1.084e-05 [partial_unused_args_eliminate]: 6.09999e-06 [add_recomputation]: 7.124e-05 [cse_after_recomputation]: 4.06e-05, [1] [Cycle 1]: 3.134e-05, [1] [cse]: 2.055e-05 [environ_conv]: 1.092e-05 [swap_dp_allreduce_reducescatter]: 1.061e-05 [bias_add_comm_swap]: 6.06e-06 [label_micro_interleaved_index]: 7.9e-06 [label_fine_grained_interleaved_index]: 5.81003e-06 [merge_cast_opt]: 4.16001e-06 [slice_recompute_activation]: 5.49e-06 [micro_interleaved_order_control]: 5.59e-06 [assign_add_opt]: 3.98001e-06 [ForceFp32Comm]: 3.61001e-06 [remove_cast_before_assign_add]: 3.8e-06 [full_micro_interleaved_order_control]: 4.67e-06 [reorder_send_recv_between_fp_bp]: 5.63002e-06 [comm_op_add_attrs]: 3.81999e-06 [add_comm_op_reuse_tag]: 3.9e-06 [interleave_split_concat_branches]: 4.28999e-06 [interleave_parallel_branches]: 3.61001e-06 [overlap_opt_shard_in_pipeline]: 4.50999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.32998e-06 [control_data_broadcast_order]: 2.106e-05 [grouped_pairwise_exchange_alltoall]: 4.17e-06 [offloading_packed_experts]: 8.63001e-06 [overlap_recompute_and_grad_model_parallel]: 8.29998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.99002e-06 [overlap_recompute_allgather_and_fa_grad]: 4.2e-06 [overlap_recompute_comm]: 5.97999e-06 [overlap_grad_ring_attention]: 8.28001e-06 [overlap_grad_flash_sp]: 2.915e-05 [begin_end_overlap_inline]: 3.93001e-06 [split_matmul_comm_elemetwise]: 5.39998e-06 [split_layernorm_comm]: 4.72e-06 [handle_group_info]: 3.57002e-06 [symbol_engine_optimizer]: 0.00012098, [1] [Cycle 1]: 0.00011243, [6] [build]: 4.55001e-06 [elim_shapecalc]: 1.532e-05 [elim_not_effective]: 1.757e-05 [opt_reshape]: 9.44e-06 [fold_const_symbol]: 1.423e-05 [renormalize]: 1.99972e-07 [detach_backward]: 1.23e-05 [pipeline_parallel_scheduler]: 3.04001e-06 [auto_monad_reorder]: 3.912e-05 [get_jit_bprop_graph]: 3.16999e-06 [rewriter_after_jit_bprop_graph]: 1.133e-05 [opt_after_jit_grad]: 0.00090976 [validate]: 6.172e-05 Sums bootstrap : 0.000444s : 0.40% type_inference : 0.059288s : 53.01% event_method : 0.000021s : 0.02% auto_monad : 0.000069s : 0.06% graph_reusing : 0.000006s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000044s : 0.04% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000037s : 0.03% optimize.rewriter_before_opt_a : 0.000098s : 0.09% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000056s : 0.05% optimize.opt_a.loop_unroll : 0.000041s : 0.04% optimize.opt_a.a_1 : 0.000962s : 0.86% optimize.opt_a.with_stream_mark : 0.000044s : 0.04% optimize.opt_a.recompute_prepare : 0.000022s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000276s : 0.25% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.02% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000016s : 0.01% optimize.opt_a.merge_send_recv : 0.000023s : 0.02% optimize.opt_a.auto_parallel : 0.000022s : 0.02% optimize.opt_a.parallel : 0.000030s : 0.03% optimize.opt_a.flash_sp : 0.000017s : 0.02% optimize.opt_a.merge_comm : 0.000011s : 0.01% optimize.opt_a.allreduce_fusion : 0.000009s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.02% optimize.opt_a.virtual_dataset : 0.000019s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.02% optimize.opt_a.virtual_output : 0.000017s : 0.02% optimize.opt_a.merge_forward : 0.000010s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000024s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000042s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000030s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000031s : 0.03% optimize.opt_a.a_after_grad : 0.000028s : 0.03% optimize.opt_a.renormalize : 0.045890s : 41.03% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.01% optimize.opt_a.auto_monad_grad : 0.000006s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000045s : 0.04% optimize.opt_a.cse : 0.000070s : 0.06% optimize.opt_a.a_3 : 0.000164s : 0.15% optimize.py_interpret_to_execute_after_opt_a : 0.000027s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000057s : 0.05% optimize.convert_after_rewriter : 0.000014s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.01% optimize.mutable_eliminate : 0.000884s : 0.79% optimize.opt_b.b_1 : 0.000247s : 0.22% optimize.opt_b.b_2 : 0.000013s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000015s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000037s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.02% optimize.overlap_param_gather : 0.000006s : 0.01% optimize.cconv : 0.000045s : 0.04% optimize.loop_unroll : 0.000603s : 0.54% optimize.opt_after_cconv.c_1 : 0.000046s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000030s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000023s : 0.02% optimize.tuple_transform.d_1 : 0.000146s : 0.13% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.01% optimize.partial_unused_args_eliminate : 0.000006s : 0.01% optimize.add_recomputation : 0.000071s : 0.06% optimize.cse_after_recomputation.cse : 0.000021s : 0.02% optimize.environ_conv : 0.000011s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000011s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000006s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000005s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000021s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000009s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000006s : 0.01% optimize.overlap_grad_ring_attention : 0.000008s : 0.01% optimize.overlap_grad_flash_sp : 0.000029s : 0.03% optimize.begin_end_overlap_inline : 0.000004s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000005s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000012s : 0.01% pipeline_parallel_scheduler : 0.000003s : 0.00% auto_monad_reorder : 0.000039s : 0.03% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000011s : 0.01% opt_after_jit_grad : 0.000910s : 0.81% validate : 0.000062s : 0.06% Time group info: ------[substitution.] 0.000263 38 14.40% : 0.000038s : 3: substitution.cast_eliminate 1.37% : 0.000004s : 3: substitution.elim_not_effective 0.68% : 0.000002s : 3: substitution.fold_const_symbol 3.30% : 0.000009s : 5: substitution.graph_param_transform 65.67% : 0.000172s : 4: substitution.inline 2.12% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.97% : 0.000008s : 6: substitution.remove_not_recompute_node 3.06% : 0.000008s : 4: substitution.replace_old_param 6.43% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.059223 2 98.53% : 0.058353s : 1: type_inference.infer 1.47% : 0.000869s : 1: type_inference.specialize ------[replace.] 0.000070 8 59.55% : 0.000042s : 4: replace.inline 40.45% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000184 8 92.11% : 0.000169s : 4: match.inline 7.89% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000270 1504 0.95% : 0.000003s : 15: predicate.accumulaten_eliminater 1.40% : 0.000004s : 5: predicate.ad_related_special_op_eliminate 0.57% : 0.000002s : 10: predicate.addn_check_dump 0.86% : 0.000002s : 15: predicate.addn_zero_filter 0.75% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.18% : 0.000006s : 25: predicate.arithmetic_simplify 1.12% : 0.000003s : 15: predicate.cast_eliminate 0.63% : 0.000002s : 10: predicate.check_bprop_eliminate 0.53% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000001s : 5: predicate.const_output_eliminate 1.12% : 0.000003s : 10: predicate.depend_value_elim 0.97% : 0.000003s : 15: predicate.dict_get_item_const_eliminator 0.95% : 0.000003s : 15: predicate.dict_get_item_eliminator 1.06% : 0.000003s : 15: predicate.dict_set_item_eliminator 1.25% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 0.56% : 0.000002s : 5: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.05% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_depend_swap 1.76% : 0.000005s : 30: predicate.environ_get_eliminate 1.02% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.24% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.20% : 0.000006s : 23: predicate.float_depend_g_call 0.57% : 0.000002s : 10: predicate.float_environ_get_switch 0.81% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.62% : 0.000002s : 10: predicate.get_grad_eliminate 0.31% : 0.000001s : 5: predicate.graph_param_transform 0.58% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 5.95% : 0.000016s : 68: predicate.inline 0.73% : 0.000002s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 10: predicate.less_batch_normalization 1.89% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.38% : 0.000006s : 44: predicate.load_eliminater 0.98% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.95% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.62% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.64% : 0.000002s : 10: predicate.merge_addn 0.56% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.57% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 15: predicate.minmaximum_grad 1.56% : 0.000004s : 5: predicate.mutable_eliminate 0.39% : 0.000001s : 5: predicate.opt_reshape 0.39% : 0.000001s : 5: predicate.parallel_virtual_node 1.70% : 0.000005s : 23: predicate.partial_defer_inline 1.50% : 0.000004s : 24: predicate.partial_eliminate 0.85% : 0.000002s : 15: predicate.print_const_string_wrapper 0.66% : 0.000002s : 10: predicate.reduce_all_const_elim 1.07% : 0.000003s : 15: predicate.reduce_eliminate 2.44% : 0.000007s : 44: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 10: predicate.remove_not_recompute_node 1.33% : 0.000004s : 29: predicate.replace_applicator 0.52% : 0.000001s : 10: predicate.replace_old_param 0.41% : 0.000001s : 5: predicate.reset_defer_inline 0.93% : 0.000002s : 15: predicate.reshape_eliminate 0.63% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.59% : 0.000002s : 5: predicate.row_tensor_eliminate 0.78% : 0.000002s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 1.06% : 0.000003s : 10: predicate.shard_identity_eliminate 0.80% : 0.000002s : 10: predicate.special_op_eliminate 0.81% : 0.000002s : 10: predicate.specialize_transform 1.12% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.93% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.44% : 0.000004s : 23: predicate.switch_defer_inline 1.85% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.70% : 0.000013s : 74: predicate.switch_simplify 0.94% : 0.000003s : 15: predicate.tile_eliminate 0.83% : 0.000002s : 15: predicate.transpose_eliminate 1.43% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.12% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.45% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.10% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.79% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.37% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.11% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 5: predicate.value_based_eliminate 0.70% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.63% : 0.000002s : 10: predicate.virtual_output_eliminate 0.31% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.65% : 0.000002s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.045486 11 0.82% : 0.000374s : 5: func_graph_cloner_run.FuncGraphClonerGraph 99.18% : 0.045112s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.235891 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.55% : 0.003668s : 1: add_attr 1.55% : 0.003649s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.03% : 0.000076s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.03% : 0.000079s : 1: auto_monad 0.02% : 0.000048s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.21% : 0.000493s : 1: bootstrap 0.02% : 0.000049s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000024s : 1: control_data_broadcast_order 0.01% : 0.000017s : 1: convert_after_rewriter 0.02% : 0.000044s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000065s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.01% : 0.000032s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000010s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000009s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.26% : 0.000610s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000009s : 1: micro_interleaved_order_control 0.38% : 0.000895s : 1: mutable_eliminate 0.00% : 0.000012s : 1: offloading_packed_experts 0.01% : 0.000022s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000027s : 1: opt.transform.mutable_eliminate 0.66% : 0.001562s : 78: opt.transform.opt_a 0.02% : 0.000044s : 1: opt.transform.opt_after_cconv 0.02% : 0.000053s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000166s : 28: opt.transform.opt_b 0.06% : 0.000153s : 2: opt.transform.opt_trans_graph 0.02% : 0.000053s : 4: opt.transform.symbol_engine_opt 20.75% : 0.048950s : 1: opt_a 0.07% : 0.000171s : 1: opt_after_cconv 0.39% : 0.000925s : 1: opt_after_jit_grad 0.19% : 0.000455s : 1: opt_b 28.51% : 0.067242s : 1: optimize 0.01% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000013s : 1: order_py_execute_after_rewriter 0.01% : 0.000033s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000009s : 1: overlap_recompute_comm 0.00% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000009s : 1: partial_unused_args_eliminate 0.01% : 0.000013s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000052s : 1: pre_auto_parallel 0.02% : 0.000041s : 1: py_interpret_to_execute 0.01% : 0.000030s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000008s : 1: remove_cast_before_assign_add 0.01% : 0.000026s : 1: remove_dup_value 0.23% : 0.000539s : 1: renormalize.infer 19.22% : 0.045334s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000019s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000061s : 1: rewriter_after_opt_a 0.04% : 0.000102s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000014s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000124s : 1: symbol_engine_optimizer 0.09% : 0.000214s : 1: tuple_transform 25.15% : 0.059335s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:32.538.407 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.19009, [21] [bootstrap]: 0.00047367 [type_inference]: 0.0962191 [event_method]: 2.447e-05 [auto_monad]: 7.724e-05 [graph_reusing]: 6.30002e-06 [inline]: 3.01999e-06 [add_attr]: 0.00389955, [1] [add_attr_with_inline]: 0.00388617, [1] [Cycle 1]: 7.982e-05, [2] [tag_attr]: 2.564e-05 [meta_addattr_fg_expand]: 6.49001e-06 [parallel-infer-symbol]: 4.06001e-06 [pre_auto_parallel]: 4.453e-05 [insert-virtual-dataset]: 2.51e-06 [parallel-infer-symbol-second]: 1.14998e-06 [dataset_repeat_opt]: 2.17999e-06 [pipeline_split]: 1.96e-06 [optimize]: 0.0885158, [53] [py_interpret_to_execute]: 3.316e-05 [rewriter_before_opt_a]: 9.87e-05 [opt_a]: 0.0858636, [2] [Cycle 1]: 0.0850015, [45] [expand_dump_flag]: 3.31001e-06 [switch_simplify]: 4.624e-05 [loop_unroll]: 3.189e-05 [a_1]: 0.0007299 [with_stream_mark]: 2.106e-05 [recompute_prepare]: 1.116e-05 [updatestate_depend_eliminate]: 4.59998e-06 [updatestate_assign_eliminate]: 4.07e-06 [updatestate_loads_eliminate]: 3.76999e-06 [parameter_eliminate]: 2.19999e-06 [a_2]: 0.00011262 [accelerated_algorithm]: 1.067e-05 [shard]: 2.34001e-06 [meta_shard_fg_expand]: 2.26e-06 [shard_inline]: 8.67e-06 [merge_send_recv]: 9.76998e-06 [auto_parallel]: 8.75001e-06 [parallel]: 2.278e-05 [flash_sp]: 9.49e-06 [merge_comm]: 5.09998e-06 [allreduce_fusion]: 4.48999e-06 [matmul_add_comm_reduction]: 1.188e-05 [allreduce_slice_to_reducescatter]: 8.29983e-07 [virtual_shard_identity]: 1.109e-05 [virtual_dataset]: 1.021e-05 [get_grad_eliminate_]: 1.04e-05 [virtual_output]: 9.27999e-06 [merge_forward]: 5.17999e-06 [cell_reuse_recompute_pass]: 1.86e-06 [offload_activation]: 1.194e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.955e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 1.571e-05 [set_forward_comm_id_for_comm_node_pass]: 4.98001e-06 [meta_fg_expand]: 3.7e-06 [flash_sp_send_recv_attached]: 3.48e-06 [receive_attached]: 2.79001e-06 [after_resolve]: 1.718e-05 [a_after_grad]: 1.588e-05 [renormalize]: 0.0833326 [add_forward_monad_depend]: 9.40001e-06 [auto_monad_grad]: 2.42001e-06 [auto_monad_eliminator]: 2.351e-05 [cse]: 4.097e-05 [a_3]: 7.704e-05 [Cycle 2]: 0.00084856, [45] [expand_dump_flag]: 2.29001e-06 [switch_simplify]: 1.083e-05 [loop_unroll]: 8.15999e-06 [a_1]: 0.00020296 [with_stream_mark]: 1.967e-05 [recompute_prepare]: 8.43999e-06 [updatestate_depend_eliminate]: 4.62e-06 [updatestate_assign_eliminate]: 3.86001e-06 [updatestate_loads_eliminate]: 3.66999e-06 [parameter_eliminate]: 1.82001e-06 [a_2]: 9.68e-05 [accelerated_algorithm]: 8.20999e-06 [shard]: 2.64999e-06 [meta_shard_fg_expand]: 2.94999e-06 [shard_inline]: 7.83001e-06 [merge_send_recv]: 1.012e-05 [auto_parallel]: 1.056e-05 [parallel]: 9.79e-06 [flash_sp]: 4.17998e-06 [merge_comm]: 4.60999e-06 [allreduce_fusion]: 4.28999e-06 [matmul_add_comm_reduction]: 1.019e-05 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 8.65001e-06 [virtual_dataset]: 8.24002e-06 [get_grad_eliminate_]: 7.55e-06 [virtual_output]: 7.59002e-06 [merge_forward]: 4.75999e-06 [cell_reuse_recompute_pass]: 3.91999e-06 [offload_activation]: 1.083e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.552e-05 [merge_recompute_call_nodes]: 1.74e-06 [before_grad]: 1.264e-05 [set_forward_comm_id_for_comm_node_pass]: 4.97999e-06 [meta_fg_expand]: 3.47002e-06 [flash_sp_send_recv_attached]: 1.81003e-06 [receive_attached]: 2.38998e-06 [after_resolve]: 1.46e-05 [a_after_grad]: 1.173e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.70001e-06 [auto_monad_grad]: 1.09998e-06 [auto_monad_eliminator]: 9.25999e-06 [cse]: 2.041e-05 [a_3]: 4.496e-05 [py_interpret_to_execute_after_opt_a]: 1.856e-05 [slice_cell_reuse_recomputed_activation]: 1.97001e-06 [rewriter_after_opt_a]: 4.354e-05 [convert_after_rewriter]: 7.85998e-06 [order_py_execute_after_rewriter]: 6.81001e-06 [mutable_eliminate]: 0.00081728 [opt_b]: 0.000298, [1] [Cycle 1]: 0.00028925, [7] [b_1]: 0.00019083 [b_2]: 1.133e-05 [updatestate_depend_eliminate]: 7.45e-06 [updatestate_assign_eliminate]: 3.50998e-06 [updatestate_loads_eliminate]: 3.26999e-06 [renormalize]: 7.00005e-07 [cse]: 2.821e-05 [optimize_parallel_all_gather_comm]: 1.952e-05 [overlap_param_gather]: 1.97001e-06 [cconv]: 3.001e-05 [loop_unroll]: 0.00049241 [opt_after_cconv]: 0.00012846, [1] [Cycle 1]: 0.00012172, [7] [c_1]: 4.326e-05 [parameter_eliminate]: 3.26001e-06 [updatestate_depend_eliminate]: 6.03002e-06 [updatestate_assign_eliminate]: 3.16001e-06 [updatestate_loads_eliminate]: 3.15002e-06 [cse]: 2.531e-05 [renormalize]: 6.00005e-07 [remove_dup_value]: 1.711e-05 [tuple_transform]: 9.855e-05, [1] [Cycle 1]: 9.407e-05, [4] [d_1]: 6.326e-05 [none_parameter_eliminate]: 1.94e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 8.86002e-06 [partial_unused_args_eliminate]: 2.00002e-06 [add_recomputation]: 6.522e-05 [cse_after_recomputation]: 2.756e-05, [1] [Cycle 1]: 2.282e-05, [1] [cse]: 1.659e-05 [environ_conv]: 7.28999e-06 [swap_dp_allreduce_reducescatter]: 6.74001e-06 [bias_add_comm_swap]: 3.7e-06 [label_micro_interleaved_index]: 4.79e-06 [label_fine_grained_interleaved_index]: 3.25e-06 [merge_cast_opt]: 1.30999e-06 [slice_recompute_activation]: 2.12999e-06 [micro_interleaved_order_control]: 2.69999e-06 [assign_add_opt]: 1.39e-06 [ForceFp32Comm]: 1.14e-06 [remove_cast_before_assign_add]: 1.21002e-06 [full_micro_interleaved_order_control]: 2.12999e-06 [reorder_send_recv_between_fp_bp]: 2.98e-06 [comm_op_add_attrs]: 9.5999e-07 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.10999e-06 [interleave_parallel_branches]: 1.27999e-06 [overlap_opt_shard_in_pipeline]: 1.32999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.02001e-06 [control_data_broadcast_order]: 1.514e-05 [grouped_pairwise_exchange_alltoall]: 1.52999e-06 [offloading_packed_experts]: 5.72001e-06 [overlap_recompute_and_grad_model_parallel]: 5.81e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.27001e-06 [overlap_grad_ring_attention]: 4.43999e-06 [overlap_grad_flash_sp]: 2.552e-05 [begin_end_overlap_inline]: 4.80009e-07 [split_matmul_comm_elemetwise]: 2.17999e-06 [split_layernorm_comm]: 1.87001e-06 [handle_group_info]: 1.24e-06 [symbol_engine_optimizer]: 9.482e-05, [1] [Cycle 1]: 9.027e-05, [6] [build]: 4.38001e-06 [elim_shapecalc]: 1.366e-05 [elim_not_effective]: 1.624e-05 [opt_reshape]: 8.98002e-06 [fold_const_symbol]: 1.389e-05 [renormalize]: 2.00002e-07 [detach_backward]: 2.48e-06 [pipeline_parallel_scheduler]: 1.53997e-06 [auto_monad_reorder]: 2.237e-05 [get_jit_bprop_graph]: 1.89999e-06 [rewriter_after_jit_bprop_graph]: 5.19e-06 [opt_after_jit_grad]: 0.00055114 [validate]: 5.064e-05 Sums bootstrap : 0.000474s : 0.26% type_inference : 0.096219s : 51.98% event_method : 0.000024s : 0.01% auto_monad : 0.000077s : 0.04% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000045s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.02% optimize.rewriter_before_opt_a : 0.000099s : 0.05% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000057s : 0.03% optimize.opt_a.loop_unroll : 0.000040s : 0.02% optimize.opt_a.a_1 : 0.000933s : 0.50% optimize.opt_a.with_stream_mark : 0.000041s : 0.02% optimize.opt_a.recompute_prepare : 0.000020s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000209s : 0.11% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000017s : 0.01% optimize.opt_a.merge_send_recv : 0.000020s : 0.01% optimize.opt_a.auto_parallel : 0.000019s : 0.01% optimize.opt_a.parallel : 0.000033s : 0.02% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000010s : 0.01% optimize.opt_a.allreduce_fusion : 0.000009s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.01% optimize.opt_a.virtual_dataset : 0.000018s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.01% optimize.opt_a.virtual_output : 0.000017s : 0.01% optimize.opt_a.merge_forward : 0.000010s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000028s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000032s : 0.02% optimize.opt_a.a_after_grad : 0.000028s : 0.01% optimize.opt_a.renormalize : 0.083333s : 45.02% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.02% optimize.opt_a.cse : 0.000061s : 0.03% optimize.opt_a.a_3 : 0.000122s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000044s : 0.02% optimize.convert_after_rewriter : 0.000008s : 0.00% optimize.order_py_execute_after_rewriter : 0.000007s : 0.00% optimize.mutable_eliminate : 0.000817s : 0.44% optimize.opt_b.b_1 : 0.000191s : 0.10% optimize.opt_b.b_2 : 0.000011s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000028s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000030s : 0.02% optimize.loop_unroll : 0.000492s : 0.27% optimize.opt_after_cconv.c_1 : 0.000043s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000025s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.01% optimize.tuple_transform.d_1 : 0.000063s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000065s : 0.04% optimize.cse_after_recomputation.cse : 0.000017s : 0.01% optimize.environ_conv : 0.000007s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000026s : 0.01% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000551s : 0.30% validate : 0.000051s : 0.03% Time group info: ------[substitution.] 0.000264 38 13.42% : 0.000035s : 3: substitution.cast_eliminate 0.85% : 0.000002s : 3: substitution.elim_not_effective 0.93% : 0.000002s : 3: substitution.fold_const_symbol 2.67% : 0.000007s : 5: substitution.graph_param_transform 68.98% : 0.000182s : 4: substitution.inline 2.11% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.73% : 0.000007s : 6: substitution.remove_not_recompute_node 3.06% : 0.000008s : 4: substitution.replace_old_param 5.25% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.096130 2 98.87% : 0.095049s : 1: type_inference.infer 1.13% : 0.001082s : 1: type_inference.specialize ------[replace.] 0.000068 8 63.01% : 0.000043s : 4: replace.inline 36.99% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000191 8 93.74% : 0.000179s : 4: match.inline 6.26% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000259 1504 0.92% : 0.000002s : 15: predicate.accumulaten_eliminater 0.79% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 10: predicate.addn_check_dump 0.92% : 0.000002s : 15: predicate.addn_zero_filter 0.77% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.30% : 0.000006s : 25: predicate.arithmetic_simplify 0.96% : 0.000002s : 15: predicate.cast_eliminate 0.65% : 0.000002s : 10: predicate.check_bprop_eliminate 0.59% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.67% : 0.000002s : 10: predicate.depend_value_elim 0.87% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.03% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.03% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 5: predicate.elim_not_effective 0.49% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.38% : 0.000004s : 20: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.05% : 0.000003s : 20: predicate.environ_get_depend_swap 1.80% : 0.000005s : 30: predicate.environ_get_eliminate 1.13% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.27% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.16% : 0.000006s : 23: predicate.float_depend_g_call 0.55% : 0.000001s : 10: predicate.float_environ_get_switch 0.95% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 5: predicate.fold_const_symbol 0.75% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.62% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.09% : 0.000016s : 68: predicate.inline 0.97% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.02% : 0.000003s : 10: predicate.less_batch_normalization 1.87% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.55% : 0.000007s : 44: predicate.load_eliminater 0.89% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.21% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.52% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.59% : 0.000002s : 10: predicate.merge_addn 0.57% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 15: predicate.minmaximum_grad 1.27% : 0.000003s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.55% : 0.000001s : 5: predicate.parallel_virtual_node 1.65% : 0.000004s : 23: predicate.partial_defer_inline 1.50% : 0.000004s : 24: predicate.partial_eliminate 0.89% : 0.000002s : 15: predicate.print_const_string_wrapper 0.63% : 0.000002s : 10: predicate.reduce_all_const_elim 1.43% : 0.000004s : 15: predicate.reduce_eliminate 2.36% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 10: predicate.remove_not_recompute_node 1.31% : 0.000003s : 29: predicate.replace_applicator 0.49% : 0.000001s : 10: predicate.replace_old_param 0.44% : 0.000001s : 5: predicate.reset_defer_inline 1.09% : 0.000003s : 15: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 5: predicate.row_tensor_eliminate 0.99% : 0.000003s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 10: predicate.shard_identity_eliminate 0.75% : 0.000002s : 10: predicate.special_op_eliminate 0.68% : 0.000002s : 10: predicate.specialize_transform 0.94% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.43% : 0.000004s : 23: predicate.switch_defer_inline 1.85% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.67% : 0.000012s : 74: predicate.switch_simplify 0.95% : 0.000002s : 15: predicate.tile_eliminate 0.93% : 0.000002s : 15: predicate.transpose_eliminate 1.59% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.41% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.39% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.79% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.31% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.03% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.46% : 0.000001s : 5: predicate.value_based_eliminate 0.65% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.81% : 0.000002s : 10: predicate.virtual_output_eliminate 0.35% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000832 11 44.32% : 0.000369s : 5: func_graph_cloner_run.FuncGraphClonerGraph 55.68% : 0.000463s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.367577 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.06% : 0.003906s : 1: add_attr 1.06% : 0.003891s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.02% : 0.000070s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.02% : 0.000085s : 1: auto_monad 0.01% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.14% : 0.000503s : 1: bootstrap 0.01% : 0.000034s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000019s : 1: control_data_broadcast_order 0.00% : 0.000011s : 1: convert_after_rewriter 0.01% : 0.000030s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000010s : 1: environ_conv 0.01% : 0.000032s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.14% : 0.000501s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.23% : 0.000827s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 0.00% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000023s : 1: opt.transform.mutable_eliminate 0.41% : 0.001504s : 78: opt.transform.opt_a 0.01% : 0.000042s : 1: opt.transform.opt_after_cconv 0.01% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000161s : 28: opt.transform.opt_b 0.02% : 0.000069s : 2: opt.transform.opt_trans_graph 0.01% : 0.000049s : 4: opt.transform.symbol_engine_opt 23.36% : 0.085867s : 1: opt_a 0.04% : 0.000132s : 1: opt_after_cconv 0.15% : 0.000561s : 1: opt_after_jit_grad 0.08% : 0.000302s : 1: opt_b 24.08% : 0.088522s : 1: optimize 0.01% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000010s : 1: order_py_execute_after_rewriter 0.01% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000049s : 1: pre_auto_parallel 0.01% : 0.000038s : 1: py_interpret_to_execute 0.01% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 22.52% : 0.082775s : 1: renormalize.infer 0.15% : 0.000540s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000047s : 1: rewriter_after_opt_a 0.03% : 0.000103s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000097s : 1: symbol_engine_optimizer 0.03% : 0.000101s : 1: tuple_transform 26.18% : 0.096250s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:35.353.862 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:35.354.187 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.165782, [21] [bootstrap]: 0.00049591 [type_inference]: 0.00712208 [event_method]: 2.482e-05 [auto_monad]: 7.69e-05 [graph_reusing]: 6.37001e-06 [inline]: 3.13e-06 [add_attr]: 0.00402105, [1] [add_attr_with_inline]: 0.00400561, [1] [Cycle 1]: 0.00010467, [2] [tag_attr]: 2.639e-05 [meta_addattr_fg_expand]: 6.33998e-06 [parallel-infer-symbol]: 3.96001e-06 [pre_auto_parallel]: 4.893e-05 [insert-virtual-dataset]: 2.176e-05 [parallel-infer-symbol-second]: 1.23002e-06 [dataset_repeat_opt]: 2.31998e-06 [pipeline_split]: 1.70001e-06 [optimize]: 0.152461, [53] [py_interpret_to_execute]: 4.598e-05 [rewriter_before_opt_a]: 0.00011138 [opt_a]: 0.149251, [2] [Cycle 1]: 0.00655012, [45] [expand_dump_flag]: 3.63e-06 [switch_simplify]: 4.44e-05 [loop_unroll]: 3.323e-05 [a_1]: 0.00078723 [with_stream_mark]: 2.904e-05 [recompute_prepare]: 1.526e-05 [updatestate_depend_eliminate]: 6.10002e-06 [updatestate_assign_eliminate]: 4.73001e-06 [updatestate_loads_eliminate]: 4.80001e-06 [parameter_eliminate]: 2.34001e-06 [a_2]: 0.00015235 [accelerated_algorithm]: 1.136e-05 [shard]: 2.64001e-06 [meta_shard_fg_expand]: 3.13e-06 [shard_inline]: 1.055e-05 [merge_send_recv]: 1.279e-05 [auto_parallel]: 1.133e-05 [parallel]: 2.228e-05 [flash_sp]: 1.12e-05 [merge_comm]: 5.98998e-06 [allreduce_fusion]: 5.46998e-06 [matmul_add_comm_reduction]: 1.431e-05 [allreduce_slice_to_reducescatter]: 8.69972e-07 [virtual_shard_identity]: 1.301e-05 [virtual_dataset]: 1.215e-05 [get_grad_eliminate_]: 1.15e-05 [virtual_output]: 1.105e-05 [merge_forward]: 6.00002e-06 [cell_reuse_recompute_pass]: 1.79e-06 [offload_activation]: 1.303e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.736e-05 [merge_recompute_call_nodes]: 1.79e-06 [before_grad]: 1.939e-05 [set_forward_comm_id_for_comm_node_pass]: 7.62002e-06 [meta_fg_expand]: 4.80001e-06 [flash_sp_send_recv_attached]: 3.76999e-06 [receive_attached]: 2.64001e-06 [after_resolve]: 2.026e-05 [a_after_grad]: 1.928e-05 [renormalize]: 0.00340208 [add_forward_monad_depend]: 5.252e-05 [auto_monad_grad]: 9.00001e-06 [auto_monad_eliminator]: 0.00013996 [cse]: 0.00020962 [a_3]: 0.00041441 [Cycle 2]: 0.142658, [45] [expand_dump_flag]: 0.141233 [switch_simplify]: 5.11e-05 [loop_unroll]: 1.161e-05 [a_1]: 0.00027295 [with_stream_mark]: 3.993e-05 [recompute_prepare]: 9.84001e-06 [updatestate_depend_eliminate]: 6.31e-06 [updatestate_assign_eliminate]: 4.79002e-06 [updatestate_loads_eliminate]: 4.58999e-06 [parameter_eliminate]: 2.78e-06 [a_2]: 0.00014946 [accelerated_algorithm]: 9.49999e-06 [shard]: 2.80002e-06 [meta_shard_fg_expand]: 4.62e-06 [shard_inline]: 1.038e-05 [merge_send_recv]: 1.332e-05 [auto_parallel]: 1.248e-05 [parallel]: 1.035e-05 [flash_sp]: 4.54002e-06 [merge_comm]: 5.75001e-06 [allreduce_fusion]: 5.24998e-06 [matmul_add_comm_reduction]: 1.53e-05 [allreduce_slice_to_reducescatter]: 9.20001e-07 [virtual_shard_identity]: 1.096e-05 [virtual_dataset]: 9.22001e-06 [get_grad_eliminate_]: 8.75999e-06 [virtual_output]: 9.02999e-06 [merge_forward]: 6.60002e-06 [cell_reuse_recompute_pass]: 3.42002e-06 [offload_activation]: 1.428e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.152e-05 [merge_recompute_call_nodes]: 1.78997e-06 [before_grad]: 1.718e-05 [set_forward_comm_id_for_comm_node_pass]: 5.74e-06 [meta_fg_expand]: 5.03002e-06 [flash_sp_send_recv_attached]: 1.81e-06 [receive_attached]: 2.52001e-06 [after_resolve]: 1.743e-05 [a_after_grad]: 1.406e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 3.41001e-06 [auto_monad_grad]: 2.87002e-06 [auto_monad_eliminator]: 2.095e-05 [cse]: 4.977e-05 [a_3]: 7.129e-05 [py_interpret_to_execute_after_opt_a]: 3.087e-05 [slice_cell_reuse_recomputed_activation]: 5.01002e-06 [rewriter_after_opt_a]: 8.378e-05 [convert_after_rewriter]: 1.208e-05 [order_py_execute_after_rewriter]: 9.74999e-06 [mutable_eliminate]: 0.00082773 [opt_b]: 0.00042739, [1] [Cycle 1]: 0.0004157, [7] [b_1]: 0.00027618 [b_2]: 1.334e-05 [updatestate_depend_eliminate]: 1.043e-05 [updatestate_assign_eliminate]: 4.60999e-06 [updatestate_loads_eliminate]: 3.93999e-06 [renormalize]: 7.59988e-07 [cse]: 3.628e-05 [optimize_parallel_all_gather_comm]: 2.686e-05 [overlap_param_gather]: 5.30001e-06 [cconv]: 4.194e-05 [loop_unroll]: 0.00051462 [opt_after_cconv]: 0.00016681, [1] [Cycle 1]: 0.0001572, [7] [c_1]: 4.981e-05 [parameter_eliminate]: 3.97e-06 [updatestate_depend_eliminate]: 6.84999e-06 [updatestate_assign_eliminate]: 3.85998e-06 [updatestate_loads_eliminate]: 4.06001e-06 [cse]: 2.823e-05 [renormalize]: 4.80009e-07 [remove_dup_value]: 5.641e-05 [tuple_transform]: 0.00012737, [1] [Cycle 1]: 0.00011844, [4] [d_1]: 7.395e-05 [none_parameter_eliminate]: 2.14e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 9.86003e-06 [partial_unused_args_eliminate]: 5.39e-06 [add_recomputation]: 7.383e-05 [cse_after_recomputation]: 5.782e-05, [1] [Cycle 1]: 4.973e-05, [1] [cse]: 3.839e-05 [environ_conv]: 1.15e-05 [swap_dp_allreduce_reducescatter]: 1.133e-05 [bias_add_comm_swap]: 6.75002e-06 [label_micro_interleaved_index]: 8.17998e-06 [label_fine_grained_interleaved_index]: 5.92001e-06 [merge_cast_opt]: 3.93999e-06 [slice_recompute_activation]: 5.07e-06 [micro_interleaved_order_control]: 4.99998e-06 [assign_add_opt]: 3.78001e-06 [ForceFp32Comm]: 3.35e-06 [remove_cast_before_assign_add]: 3.81999e-06 [full_micro_interleaved_order_control]: 4.95999e-06 [reorder_send_recv_between_fp_bp]: 5.69e-06 [comm_op_add_attrs]: 3.7e-06 [add_comm_op_reuse_tag]: 3.97e-06 [interleave_split_concat_branches]: 3.75e-06 [interleave_parallel_branches]: 3.76999e-06 [overlap_opt_shard_in_pipeline]: 4e-06 [overlap_opt_shard_grad_in_pipeline]: 5.10999e-06 [control_data_broadcast_order]: 2.077e-05 [grouped_pairwise_exchange_alltoall]: 4.26001e-06 [offloading_packed_experts]: 8.69e-06 [overlap_recompute_and_grad_model_parallel]: 8.77e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.83001e-06 [overlap_recompute_allgather_and_fa_grad]: 4.48001e-06 [overlap_recompute_comm]: 5.49e-06 [overlap_grad_ring_attention]: 9.34e-06 [overlap_grad_flash_sp]: 3.159e-05 [begin_end_overlap_inline]: 3.52002e-06 [split_matmul_comm_elemetwise]: 5.39e-06 [split_layernorm_comm]: 5.10001e-06 [handle_group_info]: 4.05e-06 [symbol_engine_optimizer]: 0.00012823, [1] [Cycle 1]: 0.00011955, [6] [build]: 5.21002e-06 [elim_shapecalc]: 1.656e-05 [elim_not_effective]: 1.969e-05 [opt_reshape]: 1.086e-05 [fold_const_symbol]: 1.593e-05 [renormalize]: 2.09984e-07 [detach_backward]: 4.90001e-06 [pipeline_parallel_scheduler]: 2.16e-06 [auto_monad_reorder]: 2.725e-05 [get_jit_bprop_graph]: 2.60002e-06 [rewriter_after_jit_bprop_graph]: 6.46e-06 [opt_after_jit_grad]: 0.00067065 [validate]: 5.531e-05 Sums bootstrap : 0.000496s : 0.31% type_inference : 0.007122s : 4.48% event_method : 0.000025s : 0.02% auto_monad : 0.000077s : 0.05% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000049s : 0.03% insert-virtual-dataset : 0.000022s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000046s : 0.03% optimize.rewriter_before_opt_a : 0.000111s : 0.07% optimize.opt_a.expand_dump_flag : 0.141236s : 88.78% optimize.opt_a.switch_simplify : 0.000096s : 0.06% optimize.opt_a.loop_unroll : 0.000045s : 0.03% optimize.opt_a.a_1 : 0.001060s : 0.67% optimize.opt_a.with_stream_mark : 0.000069s : 0.04% optimize.opt_a.recompute_prepare : 0.000025s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000012s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000302s : 0.19% optimize.opt_a.accelerated_algorithm : 0.000021s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000008s : 0.00% optimize.opt_a.shard_inline : 0.000021s : 0.01% optimize.opt_a.merge_send_recv : 0.000026s : 0.02% optimize.opt_a.auto_parallel : 0.000024s : 0.01% optimize.opt_a.parallel : 0.000033s : 0.02% optimize.opt_a.flash_sp : 0.000016s : 0.01% optimize.opt_a.merge_comm : 0.000012s : 0.01% optimize.opt_a.allreduce_fusion : 0.000011s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000030s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.02% optimize.opt_a.virtual_dataset : 0.000021s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000020s : 0.01% optimize.opt_a.virtual_output : 0.000020s : 0.01% optimize.opt_a.merge_forward : 0.000013s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000027s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000049s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000037s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000013s : 0.01% optimize.opt_a.meta_fg_expand : 0.000010s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000038s : 0.02% optimize.opt_a.a_after_grad : 0.000033s : 0.02% optimize.opt_a.renormalize : 0.003402s : 2.14% optimize.opt_a.add_forward_monad_depend : 0.000056s : 0.04% optimize.opt_a.auto_monad_grad : 0.000012s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000161s : 0.10% optimize.opt_a.cse : 0.000259s : 0.16% optimize.opt_a.a_3 : 0.000486s : 0.31% optimize.py_interpret_to_execute_after_opt_a : 0.000031s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000084s : 0.05% optimize.convert_after_rewriter : 0.000012s : 0.01% optimize.order_py_execute_after_rewriter : 0.000010s : 0.01% optimize.mutable_eliminate : 0.000828s : 0.52% optimize.opt_b.b_1 : 0.000276s : 0.17% optimize.opt_b.b_2 : 0.000013s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000036s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000027s : 0.02% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000042s : 0.03% optimize.loop_unroll : 0.000515s : 0.32% optimize.opt_after_cconv.c_1 : 0.000050s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000028s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000056s : 0.04% optimize.tuple_transform.d_1 : 0.000074s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000074s : 0.05% optimize.cse_after_recomputation.cse : 0.000038s : 0.02% optimize.environ_conv : 0.000012s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000011s : 0.01% optimize.bias_add_comm_swap : 0.000007s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000021s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000009s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000009s : 0.01% optimize.overlap_grad_flash_sp : 0.000032s : 0.02% optimize.begin_end_overlap_inline : 0.000004s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000005s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000027s : 0.02% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000671s : 0.42% validate : 0.000055s : 0.03% Time group info: ------[substitution.] 0.000285 48 17.15% : 0.000049s : 6: substitution.cast_eliminate 0.96% : 0.000003s : 4: substitution.elim_not_effective 0.89% : 0.000003s : 4: substitution.fold_const_symbol 2.99% : 0.000009s : 6: substitution.graph_param_transform 64.77% : 0.000185s : 4: substitution.inline 2.34% : 0.000007s : 8: substitution.j_node_and_user_rematch 3.23% : 0.000009s : 8: substitution.remove_not_recompute_node 2.78% : 0.000008s : 4: substitution.replace_old_param 4.89% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007054 2 85.70% : 0.006045s : 1: type_inference.infer 14.30% : 0.001009s : 1: type_inference.specialize ------[replace.] 0.000074 8 61.76% : 0.000045s : 4: replace.inline 38.24% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000193 8 93.91% : 0.000181s : 4: match.inline 6.09% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000343 1730 0.78% : 0.000003s : 17: predicate.accumulaten_eliminater 0.76% : 0.000003s : 6: predicate.ad_related_special_op_eliminate 0.55% : 0.000002s : 12: predicate.addn_check_dump 0.80% : 0.000003s : 17: predicate.addn_zero_filter 0.66% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.89% : 0.000006s : 29: predicate.arithmetic_simplify 1.08% : 0.000004s : 17: predicate.cast_eliminate 1.60% : 0.000005s : 12: predicate.check_bprop_eliminate 0.70% : 0.000002s : 12: predicate.compare_switch_simplify 0.22% : 0.000001s : 6: predicate.const_output_eliminate 0.71% : 0.000002s : 12: predicate.depend_value_elim 0.77% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.87% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.79% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.73% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 6: predicate.elim_not_effective 0.38% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000004s : 23: predicate.environ_add_const_eliminate 1.13% : 0.000004s : 23: predicate.environ_get_add_eliminate 0.95% : 0.000003s : 23: predicate.environ_get_depend_swap 1.62% : 0.000006s : 35: predicate.environ_get_eliminate 0.93% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.14% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.17% : 0.000007s : 25: predicate.float_depend_g_call 0.54% : 0.000002s : 12: predicate.float_environ_get_switch 0.83% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 6: predicate.fold_const_symbol 0.68% : 0.000002s : 12: predicate.get_grad_eliminate 0.18% : 0.000001s : 6: predicate.graph_param_transform 0.55% : 0.000002s : 12: predicate.incorporate_call 0.47% : 0.000002s : 12: predicate.incorporate_call_switch 5.46% : 0.000019s : 78: predicate.inline 0.79% : 0.000003s : 12: predicate.inline_without_move 0.27% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.86% : 0.000003s : 12: predicate.less_batch_normalization 1.63% : 0.000006s : 33: predicate.list_to_tuple_eliminator_ 2.10% : 0.000007s : 50: predicate.load_eliminater 0.93% : 0.000003s : 6: predicate.loop_unroll_after_grad 1.75% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.63% : 0.000006s : 29: predicate.make_slice_get_slice_eliminator 0.67% : 0.000002s : 12: predicate.merge_addn 1.56% : 0.000005s : 12: predicate.micro_step_allgather_replace 1.50% : 0.000005s : 12: predicate.mini_step_allgather_replace 0.70% : 0.000002s : 17: predicate.minmaximum_grad 0.87% : 0.000003s : 6: predicate.mutable_eliminate 0.33% : 0.000001s : 6: predicate.opt_reshape 0.34% : 0.000001s : 6: predicate.parallel_virtual_node 1.35% : 0.000005s : 25: predicate.partial_defer_inline 1.29% : 0.000004s : 27: predicate.partial_eliminate 0.79% : 0.000003s : 17: predicate.print_const_string_wrapper 0.63% : 0.000002s : 12: predicate.reduce_all_const_elim 1.17% : 0.000004s : 17: predicate.reduce_eliminate 2.10% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 12: predicate.remove_not_recompute_node 2.88% : 0.000010s : 33: predicate.replace_applicator 0.48% : 0.000002s : 12: predicate.replace_old_param 0.27% : 0.000001s : 6: predicate.reset_defer_inline 0.90% : 0.000003s : 17: predicate.reshape_eliminate 1.55% : 0.000005s : 12: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 6: predicate.row_tensor_eliminate 2.84% : 0.000010s : 12: predicate.same_eliminate 0.39% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.76% : 0.000003s : 12: predicate.shard_identity_eliminate 0.73% : 0.000002s : 12: predicate.special_op_eliminate 0.69% : 0.000002s : 12: predicate.specialize_transform 1.86% : 0.000006s : 12: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000003s : 12: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.21% : 0.000004s : 25: predicate.switch_defer_inline 1.72% : 0.000006s : 37: predicate.switch_layer_defer_inline 6.01% : 0.000021s : 81: predicate.switch_simplify 0.78% : 0.000003s : 17: predicate.tile_eliminate 0.85% : 0.000003s : 17: predicate.transpose_eliminate 1.61% : 0.000006s : 29: predicate.tuple_list_convert_item_index_to_positive 1.44% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.33% : 0.000005s : 29: predicate.tuple_list_get_item_depend_reorder 2.84% : 0.000010s : 45: predicate.tuple_list_get_item_eliminator 1.42% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 2.12% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.49% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.09% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 2.72% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.48% : 0.000002s : 6: predicate.value_based_eliminate 0.73% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 12: predicate.virtual_output_eliminate 0.27% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.33% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001204 11 30.89% : 0.000372s : 5: func_graph_cloner_run.FuncGraphClonerGraph 69.11% : 0.000832s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.327878 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.23% : 0.004034s : 1: add_attr 1.22% : 0.004011s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.02% : 0.000077s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.03% : 0.000086s : 1: auto_monad 0.01% : 0.000036s : 1: auto_monad_reorder 0.00% : 0.000008s : 1: begin_end_overlap_inline 0.00% : 0.000010s : 1: bias_add_comm_swap 0.16% : 0.000540s : 1: bootstrap 0.01% : 0.000045s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000024s : 1: control_data_broadcast_order 0.00% : 0.000015s : 1: convert_after_rewriter 0.02% : 0.000061s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000024s : 1: detach_backward 0.00% : 0.000014s : 1: environ_conv 0.01% : 0.000036s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.00% : 0.000013s : 1: graph_reusing 0.00% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.01% : 0.000030s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000009s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.16% : 0.000521s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.25% : 0.000836s : 1: mutable_eliminate 0.00% : 0.000011s : 1: offloading_packed_experts 0.01% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000022s : 1: opt.transform.mutable_eliminate 0.60% : 0.001961s : 78: opt.transform.opt_a 0.01% : 0.000048s : 1: opt.transform.opt_after_cconv 0.01% : 0.000041s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000193s : 28: opt.transform.opt_b 0.02% : 0.000082s : 2: opt.transform.opt_trans_graph 0.02% : 0.000059s : 4: opt.transform.symbol_engine_opt 45.52% : 0.149255s : 1: opt_a 0.05% : 0.000171s : 1: opt_after_cconv 0.21% : 0.000682s : 1: opt_after_jit_grad 0.13% : 0.000431s : 1: opt_b 46.63% : 0.152899s : 1: optimize 0.01% : 0.000030s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000013s : 1: order_py_execute_after_rewriter 0.01% : 0.000035s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000008s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000012s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000009s : 1: partial_unused_args_eliminate 0.00% : 0.000009s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.02% : 0.000057s : 1: pre_auto_parallel 0.02% : 0.000050s : 1: py_interpret_to_execute 0.01% : 0.000034s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.02% : 0.000060s : 1: remove_dup_value 0.25% : 0.000834s : 1: renormalize.infer 0.77% : 0.002514s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000088s : 1: rewriter_after_opt_a 0.04% : 0.000116s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.00% : 0.000014s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000132s : 1: symbol_engine_optimizer 0.04% : 0.000130s : 1: tuple_transform 2.19% : 0.007176s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:37.654.295 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.173067, [21] [bootstrap]: 0.00050769 [type_inference]: 0.0956709 [event_method]: 2.608e-05 [auto_monad]: 8.216e-05 [graph_reusing]: 6.89001e-06 [inline]: 3.03e-06 [add_attr]: 0.00413182, [1] [add_attr_with_inline]: 0.00411771, [1] [Cycle 1]: 9.29e-05, [2] [tag_attr]: 2.71e-05 [meta_addattr_fg_expand]: 6.09999e-06 [parallel-infer-symbol]: 4.21001e-06 [pre_auto_parallel]: 4.479e-05 [insert-virtual-dataset]: 2.73e-06 [parallel-infer-symbol-second]: 9.09989e-07 [dataset_repeat_opt]: 2.63e-06 [pipeline_split]: 1.77001e-06 [optimize]: 0.0715815, [53] [py_interpret_to_execute]: 3.815e-05 [rewriter_before_opt_a]: 0.00010831 [opt_a]: 0.0685343, [2] [Cycle 1]: 0.0675229, [45] [expand_dump_flag]: 3.04001e-06 [switch_simplify]: 4.843e-05 [loop_unroll]: 3.254e-05 [a_1]: 0.0553405 [with_stream_mark]: 3.742e-05 [recompute_prepare]: 1.874e-05 [updatestate_depend_eliminate]: 6.31e-06 [updatestate_assign_eliminate]: 4.55999e-06 [updatestate_loads_eliminate]: 4.60999e-06 [parameter_eliminate]: 2.46998e-06 [a_2]: 0.00013383 [accelerated_algorithm]: 1.188e-05 [shard]: 2.99999e-06 [meta_shard_fg_expand]: 4.50999e-06 [shard_inline]: 9.87001e-06 [merge_send_recv]: 1.336e-05 [auto_parallel]: 1.398e-05 [parallel]: 3.276e-05 [flash_sp]: 1.252e-05 [merge_comm]: 6.61e-06 [allreduce_fusion]: 5.77001e-06 [matmul_add_comm_reduction]: 1.381e-05 [allreduce_slice_to_reducescatter]: 9.39996e-07 [virtual_shard_identity]: 1.337e-05 [virtual_dataset]: 1.144e-05 [get_grad_eliminate_]: 1.091e-05 [virtual_output]: 1.12e-05 [merge_forward]: 6.31998e-06 [cell_reuse_recompute_pass]: 3.8e-06 [offload_activation]: 1.357e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.234e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.824e-05 [set_forward_comm_id_for_comm_node_pass]: 5.81998e-06 [meta_fg_expand]: 4.58001e-06 [flash_sp_send_recv_attached]: 3.18e-06 [receive_attached]: 2.24001e-06 [after_resolve]: 1.932e-05 [a_after_grad]: 1.848e-05 [renormalize]: 0.0110393 [add_forward_monad_depend]: 1.275e-05 [auto_monad_grad]: 3.00002e-06 [auto_monad_eliminator]: 3.091e-05 [cse]: 5.562e-05 [a_3]: 8.859e-05 [Cycle 2]: 0.00099553, [45] [expand_dump_flag]: 2.89999e-06 [switch_simplify]: 1.218e-05 [loop_unroll]: 9.37001e-06 [a_1]: 0.00024099 [with_stream_mark]: 2.449e-05 [recompute_prepare]: 9.51e-06 [updatestate_depend_eliminate]: 5.57999e-06 [updatestate_assign_eliminate]: 4.25999e-06 [updatestate_loads_eliminate]: 4.3e-06 [parameter_eliminate]: 2.23998e-06 [a_2]: 0.00011521 [accelerated_algorithm]: 1.053e-05 [shard]: 2.45002e-06 [meta_shard_fg_expand]: 3.02002e-06 [shard_inline]: 9.21002e-06 [merge_send_recv]: 1.081e-05 [auto_parallel]: 1.144e-05 [parallel]: 9.56998e-06 [flash_sp]: 4.31002e-06 [merge_comm]: 5.97999e-06 [allreduce_fusion]: 5.78002e-06 [matmul_add_comm_reduction]: 1.36e-05 [allreduce_slice_to_reducescatter]: 1.35999e-06 [virtual_shard_identity]: 1.072e-05 [virtual_dataset]: 9.54e-06 [get_grad_eliminate_]: 8.57998e-06 [virtual_output]: 8.78001e-06 [merge_forward]: 5.96003e-06 [cell_reuse_recompute_pass]: 3.34001e-06 [offload_activation]: 1.411e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.902e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.574e-05 [set_forward_comm_id_for_comm_node_pass]: 5.66e-06 [meta_fg_expand]: 4.18001e-06 [flash_sp_send_recv_attached]: 1.83997e-06 [receive_attached]: 2.35002e-06 [after_resolve]: 1.677e-05 [a_after_grad]: 1.501e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.84e-06 [auto_monad_grad]: 2.00002e-06 [auto_monad_eliminator]: 1.325e-05 [cse]: 3.182e-05 [a_3]: 5.507e-05 [py_interpret_to_execute_after_opt_a]: 2.227e-05 [slice_cell_reuse_recomputed_activation]: 2.68e-06 [rewriter_after_opt_a]: 5.487e-05 [convert_after_rewriter]: 9.46e-06 [order_py_execute_after_rewriter]: 6.73e-06 [mutable_eliminate]: 0.00086467 [opt_b]: 0.00035759, [1] [Cycle 1]: 0.00034873, [7] [b_1]: 0.00022761 [b_2]: 1.331e-05 [updatestate_depend_eliminate]: 1.13e-05 [updatestate_assign_eliminate]: 4.33999e-06 [updatestate_loads_eliminate]: 4.2e-06 [renormalize]: 1.00001e-06 [cse]: 4.531e-05 [optimize_parallel_all_gather_comm]: 2.478e-05 [overlap_param_gather]: 7.59002e-06 [cconv]: 3.808e-05 [loop_unroll]: 0.00053202 [opt_after_cconv]: 0.00016372, [1] [Cycle 1]: 0.0001565, [7] [c_1]: 5.405e-05 [parameter_eliminate]: 6.26e-06 [updatestate_depend_eliminate]: 9.42999e-06 [updatestate_assign_eliminate]: 4.43999e-06 [updatestate_loads_eliminate]: 4.23001e-06 [cse]: 3.757e-05 [renormalize]: 9.70002e-07 [remove_dup_value]: 5.816e-05 [tuple_transform]: 0.00011746, [1] [Cycle 1]: 0.00011097, [4] [d_1]: 7.852e-05 [none_parameter_eliminate]: 1.91e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 1.039e-05 [partial_unused_args_eliminate]: 2.63e-06 [add_recomputation]: 7.65e-05 [cse_after_recomputation]: 3.438e-05, [1] [Cycle 1]: 2.903e-05, [1] [cse]: 2.187e-05 [environ_conv]: 7.81001e-06 [swap_dp_allreduce_reducescatter]: 7.69002e-06 [bias_add_comm_swap]: 3.80998e-06 [label_micro_interleaved_index]: 6.48e-06 [label_fine_grained_interleaved_index]: 3.04999e-06 [merge_cast_opt]: 1.71998e-06 [slice_recompute_activation]: 3.09999e-06 [micro_interleaved_order_control]: 2.58e-06 [assign_add_opt]: 1.59998e-06 [ForceFp32Comm]: 8.79983e-07 [remove_cast_before_assign_add]: 1.25999e-06 [full_micro_interleaved_order_control]: 2.27999e-06 [reorder_send_recv_between_fp_bp]: 2.64001e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.22e-06 [interleave_parallel_branches]: 1.11997e-06 [overlap_opt_shard_in_pipeline]: 6.06e-06 [overlap_opt_shard_grad_in_pipeline]: 2.01e-06 [control_data_broadcast_order]: 2.256e-05 [grouped_pairwise_exchange_alltoall]: 1.67999e-06 [offloading_packed_experts]: 6.48e-06 [overlap_recompute_and_grad_model_parallel]: 6.96999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.50999e-06 [overlap_recompute_comm]: 3.15998e-06 [overlap_grad_ring_attention]: 6.26e-06 [overlap_grad_flash_sp]: 2.971e-05 [begin_end_overlap_inline]: 6.09987e-07 [split_matmul_comm_elemetwise]: 2.28002e-06 [split_layernorm_comm]: 1.72001e-06 [handle_group_info]: 1.00001e-06 [symbol_engine_optimizer]: 0.00012072, [1] [Cycle 1]: 0.00011452, [6] [build]: 5.21002e-06 [elim_shapecalc]: 2.063e-05 [elim_not_effective]: 2.348e-05 [opt_reshape]: 1.074e-05 [fold_const_symbol]: 1.751e-05 [renormalize]: 4.19997e-07 [detach_backward]: 2.37999e-06 [pipeline_parallel_scheduler]: 1.82999e-06 [auto_monad_reorder]: 2.795e-05 [get_jit_bprop_graph]: 2.75997e-06 [rewriter_after_jit_bprop_graph]: 7.39002e-06 [opt_after_jit_grad]: 0.00070456 [validate]: 6.027e-05 Sums bootstrap : 0.000508s : 0.30% type_inference : 0.095671s : 57.04% event_method : 0.000026s : 0.02% auto_monad : 0.000082s : 0.05% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000045s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000038s : 0.02% optimize.rewriter_before_opt_a : 0.000108s : 0.06% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000061s : 0.04% optimize.opt_a.loop_unroll : 0.000042s : 0.02% optimize.opt_a.a_1 : 0.055581s : 33.14% optimize.opt_a.with_stream_mark : 0.000062s : 0.04% optimize.opt_a.recompute_prepare : 0.000028s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000012s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000249s : 0.15% optimize.opt_a.accelerated_algorithm : 0.000022s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000008s : 0.00% optimize.opt_a.shard_inline : 0.000019s : 0.01% optimize.opt_a.merge_send_recv : 0.000024s : 0.01% optimize.opt_a.auto_parallel : 0.000025s : 0.02% optimize.opt_a.parallel : 0.000042s : 0.03% optimize.opt_a.flash_sp : 0.000017s : 0.01% optimize.opt_a.merge_comm : 0.000013s : 0.01% optimize.opt_a.allreduce_fusion : 0.000012s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000027s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.01% optimize.opt_a.virtual_dataset : 0.000021s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.01% optimize.opt_a.virtual_output : 0.000020s : 0.01% optimize.opt_a.merge_forward : 0.000012s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000007s : 0.00% optimize.opt_a.offload_activation : 0.000028s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000041s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000034s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.01% optimize.opt_a.meta_fg_expand : 0.000009s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000036s : 0.02% optimize.opt_a.a_after_grad : 0.000033s : 0.02% optimize.opt_a.renormalize : 0.011039s : 6.58% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000044s : 0.03% optimize.opt_a.cse : 0.000087s : 0.05% optimize.opt_a.a_3 : 0.000144s : 0.09% optimize.py_interpret_to_execute_after_opt_a : 0.000022s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000055s : 0.03% optimize.convert_after_rewriter : 0.000009s : 0.01% optimize.order_py_execute_after_rewriter : 0.000007s : 0.00% optimize.mutable_eliminate : 0.000865s : 0.52% optimize.opt_b.b_1 : 0.000228s : 0.14% optimize.opt_b.b_2 : 0.000013s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000045s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.01% optimize.overlap_param_gather : 0.000008s : 0.00% optimize.cconv : 0.000038s : 0.02% optimize.loop_unroll : 0.000532s : 0.32% optimize.opt_after_cconv.c_1 : 0.000054s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000038s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000058s : 0.03% optimize.tuple_transform.d_1 : 0.000079s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.01% optimize.partial_unused_args_eliminate : 0.000003s : 0.00% optimize.add_recomputation : 0.000077s : 0.05% optimize.cse_after_recomputation.cse : 0.000022s : 0.01% optimize.environ_conv : 0.000008s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000006s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000023s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000006s : 0.00% optimize.overlap_grad_flash_sp : 0.000030s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000021s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000023s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000018s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000028s : 0.02% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000705s : 0.42% validate : 0.000060s : 0.04% Time group info: ------[substitution.] 0.054700 48 99.55% : 0.054453s : 6: substitution.cast_eliminate 0.01% : 0.000003s : 4: substitution.elim_not_effective 0.00% : 0.000002s : 4: substitution.fold_const_symbol 0.02% : 0.000008s : 6: substitution.graph_param_transform 0.34% : 0.000186s : 4: substitution.inline 0.01% : 0.000007s : 8: substitution.j_node_and_user_rematch 0.02% : 0.000009s : 8: substitution.remove_not_recompute_node 0.02% : 0.000009s : 4: substitution.replace_old_param 0.04% : 0.000023s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.095564 2 84.99% : 0.081218s : 1: type_inference.infer 15.01% : 0.014345s : 1: type_inference.specialize ------[replace.] 0.000121 8 36.07% : 0.000044s : 4: replace.inline 63.93% : 0.000077s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000201 8 91.05% : 0.000183s : 4: match.inline 8.95% : 0.000018s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000319 1730 0.93% : 0.000003s : 17: predicate.accumulaten_eliminater 1.07% : 0.000003s : 6: predicate.ad_related_special_op_eliminate 0.55% : 0.000002s : 12: predicate.addn_check_dump 0.86% : 0.000003s : 17: predicate.addn_zero_filter 0.74% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.52% : 0.000008s : 29: predicate.arithmetic_simplify 1.09% : 0.000003s : 17: predicate.cast_eliminate 0.60% : 0.000002s : 12: predicate.check_bprop_eliminate 0.55% : 0.000002s : 12: predicate.compare_switch_simplify 0.17% : 0.000001s : 6: predicate.const_output_eliminate 0.60% : 0.000002s : 12: predicate.depend_value_elim 0.82% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.15% : 0.000004s : 17: predicate.dict_get_item_eliminator 0.95% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.19% : 0.000004s : 12: predicate.dumpgradient_eliminate 0.28% : 0.000001s : 6: predicate.elim_not_effective 0.36% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000004s : 23: predicate.environ_add_const_eliminate 0.99% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.02% : 0.000003s : 23: predicate.environ_get_depend_swap 1.83% : 0.000006s : 35: predicate.environ_get_eliminate 1.05% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.25% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.13% : 0.000007s : 25: predicate.float_depend_g_call 0.52% : 0.000002s : 12: predicate.float_environ_get_switch 0.79% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 6: predicate.fold_const_symbol 0.72% : 0.000002s : 12: predicate.get_grad_eliminate 0.26% : 0.000001s : 6: predicate.graph_param_transform 0.60% : 0.000002s : 12: predicate.incorporate_call 0.52% : 0.000002s : 12: predicate.incorporate_call_switch 6.48% : 0.000021s : 78: predicate.inline 1.01% : 0.000003s : 12: predicate.inline_without_move 0.30% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.99% : 0.000003s : 12: predicate.less_batch_normalization 1.72% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.27% : 0.000007s : 50: predicate.load_eliminater 1.05% : 0.000003s : 6: predicate.loop_unroll_after_grad 1.78% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.65% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.59% : 0.000002s : 12: predicate.merge_addn 0.56% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.54% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.89% : 0.000003s : 17: predicate.minmaximum_grad 1.50% : 0.000005s : 6: predicate.mutable_eliminate 0.33% : 0.000001s : 6: predicate.opt_reshape 0.48% : 0.000002s : 6: predicate.parallel_virtual_node 1.91% : 0.000006s : 25: predicate.partial_defer_inline 1.40% : 0.000004s : 27: predicate.partial_eliminate 0.81% : 0.000003s : 17: predicate.print_const_string_wrapper 0.59% : 0.000002s : 12: predicate.reduce_all_const_elim 1.96% : 0.000006s : 17: predicate.reduce_eliminate 2.27% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.32% : 0.000001s : 12: predicate.remove_not_recompute_node 1.24% : 0.000004s : 33: predicate.replace_applicator 0.44% : 0.000001s : 12: predicate.replace_old_param 0.33% : 0.000001s : 6: predicate.reset_defer_inline 1.49% : 0.000005s : 17: predicate.reshape_eliminate 0.68% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 6: predicate.row_tensor_eliminate 0.85% : 0.000003s : 12: predicate.same_eliminate 0.46% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.79% : 0.000003s : 12: predicate.shard_identity_eliminate 0.76% : 0.000002s : 12: predicate.special_op_eliminate 0.70% : 0.000002s : 12: predicate.specialize_transform 0.92% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 1.12% : 0.000004s : 12: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.25% : 0.000004s : 25: predicate.switch_defer_inline 1.74% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.32% : 0.000014s : 81: predicate.switch_simplify 1.10% : 0.000003s : 17: predicate.tile_eliminate 0.87% : 0.000003s : 17: predicate.transpose_eliminate 1.66% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.48% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000005s : 29: predicate.tuple_list_get_item_depend_reorder 3.31% : 0.000011s : 45: predicate.tuple_list_get_item_eliminator 1.76% : 0.000006s : 29: predicate.tuple_list_get_set_item_eliminator 2.37% : 0.000008s : 41: predicate.tuple_list_set_item_eliminator 1.73% : 0.000006s : 33: predicate.tuple_to_list_eliminator_ 2.15% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 2.86% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 6: predicate.value_based_eliminate 0.71% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.68% : 0.000002s : 12: predicate.virtual_output_eliminate 0.29% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000994 11 34.01% : 0.000338s : 5: func_graph_cloner_run.FuncGraphClonerGraph 65.99% : 0.000656s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.316391 192 0.00% : 0.000005s : 1: ForceFp32Comm 1.31% : 0.004138s : 1: add_attr 1.30% : 0.004123s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000082s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.03% : 0.000088s : 1: auto_monad 0.01% : 0.000032s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.17% : 0.000542s : 1: bootstrap 0.01% : 0.000043s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000027s : 1: control_data_broadcast_order 0.00% : 0.000013s : 1: convert_after_rewriter 0.01% : 0.000038s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000012s : 1: environ_conv 0.01% : 0.000034s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 0.00% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.17% : 0.000542s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.28% : 0.000878s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 0.01% : 0.000025s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000031s : 1: opt.transform.mutable_eliminate 17.78% : 0.056256s : 78: opt.transform.opt_a 0.02% : 0.000052s : 1: opt.transform.opt_after_cconv 0.02% : 0.000049s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000197s : 28: opt.transform.opt_b 0.03% : 0.000086s : 2: opt.transform.opt_trans_graph 0.02% : 0.000067s : 4: opt.transform.symbol_engine_opt 21.66% : 0.068538s : 1: opt_a 0.05% : 0.000167s : 1: opt_after_cconv 0.23% : 0.000719s : 1: opt_after_jit_grad 0.11% : 0.000362s : 1: opt_b 22.63% : 0.071588s : 1: optimize 0.01% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000010s : 1: order_py_execute_after_rewriter 0.01% : 0.000033s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000010s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000011s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000007s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000049s : 1: pre_auto_parallel 0.01% : 0.000043s : 1: py_interpret_to_execute 0.01% : 0.000026s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000063s : 1: remove_dup_value 3.27% : 0.010358s : 1: renormalize.infer 0.21% : 0.000657s : 1: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000059s : 1: rewriter_after_opt_a 0.04% : 0.000114s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000124s : 1: symbol_engine_optimizer 0.04% : 0.000120s : 1: tuple_transform 30.25% : 0.095702s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:40.363.620 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:40.363.925 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.150799, [21] [bootstrap]: 0.00040247 [type_inference]: 0.00588529 [event_method]: 1.83e-05 [auto_monad]: 6.721e-05 [graph_reusing]: 6.94001e-06 [inline]: 2.51e-06 [add_attr]: 0.00338738, [1] [add_attr_with_inline]: 0.00337727, [1] [Cycle 1]: 6.867e-05, [2] [tag_attr]: 2.114e-05 [meta_addattr_fg_expand]: 5.69e-06 [parallel-infer-symbol]: 3.7e-06 [pre_auto_parallel]: 3.511e-05 [insert-virtual-dataset]: 2.69001e-06 [parallel-infer-symbol-second]: 7.80012e-07 [dataset_repeat_opt]: 2.14999e-06 [pipeline_split]: 2.25002e-06 [optimize]: 0.139526, [53] [py_interpret_to_execute]: 2.878e-05 [rewriter_before_opt_a]: 9.131e-05 [opt_a]: 0.136484, [2] [Cycle 1]: 0.135396, [45] [expand_dump_flag]: 2.76999e-06 [switch_simplify]: 3.643e-05 [loop_unroll]: 3.145e-05 [a_1]: 0.00087644 [with_stream_mark]: 1.756e-05 [recompute_prepare]: 1.262e-05 [updatestate_depend_eliminate]: 4.54002e-06 [updatestate_assign_eliminate]: 3.61999e-06 [updatestate_loads_eliminate]: 3.55e-06 [parameter_eliminate]: 1.54998e-06 [a_2]: 0.0001349 [accelerated_algorithm]: 9.36002e-06 [shard]: 2.14e-06 [meta_shard_fg_expand]: 0.132041 [shard_inline]: 4.967e-05 [merge_send_recv]: 2.561e-05 [auto_parallel]: 2.149e-05 [parallel]: 2.493e-05 [flash_sp]: 1.631e-05 [merge_comm]: 5.20999e-06 [allreduce_fusion]: 4.4e-06 [matmul_add_comm_reduction]: 1.413e-05 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 1.525e-05 [virtual_dataset]: 1.085e-05 [get_grad_eliminate_]: 8.19002e-06 [virtual_output]: 9.02999e-06 [merge_forward]: 5.65001e-06 [cell_reuse_recompute_pass]: 3.48e-06 [offload_activation]: 1.472e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.431e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.575e-05 [set_forward_comm_id_for_comm_node_pass]: 5.77001e-06 [meta_fg_expand]: 6.54001e-06 [flash_sp_send_recv_attached]: 2.94001e-06 [receive_attached]: 2.54001e-06 [after_resolve]: 1.587e-05 [a_after_grad]: 1.492e-05 [renormalize]: 0.0011817 [add_forward_monad_depend]: 1.07e-05 [auto_monad_grad]: 3.06001e-06 [auto_monad_eliminator]: 2.37e-05 [cse]: 3.903e-05 [a_3]: 8.401e-05 [Cycle 2]: 0.0010691, [45] [expand_dump_flag]: 3.08998e-06 [switch_simplify]: 1.101e-05 [loop_unroll]: 8.32003e-06 [a_1]: 0.00020372 [with_stream_mark]: 2.142e-05 [recompute_prepare]: 9.24e-06 [updatestate_depend_eliminate]: 5.24998e-06 [updatestate_assign_eliminate]: 4.4e-06 [updatestate_loads_eliminate]: 4.4e-06 [parameter_eliminate]: 2.00002e-06 [a_2]: 0.00012201 [accelerated_algorithm]: 8.52998e-06 [shard]: 2.61e-06 [meta_shard_fg_expand]: 2.56e-06 [shard_inline]: 8.14997e-06 [merge_send_recv]: 1.065e-05 [auto_parallel]: 1.074e-05 [parallel]: 9.47999e-06 [flash_sp]: 4.85001e-06 [merge_comm]: 5.61e-06 [allreduce_fusion]: 4.64002e-06 [matmul_add_comm_reduction]: 1.228e-05 [allreduce_slice_to_reducescatter]: 9.00007e-07 [virtual_shard_identity]: 1.029e-05 [virtual_dataset]: 7.87e-06 [get_grad_eliminate_]: 7.51999e-06 [virtual_output]: 7.45e-06 [merge_forward]: 4.72998e-06 [cell_reuse_recompute_pass]: 2.86e-06 [offload_activation]: 1.081e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.948e-05 [merge_recompute_call_nodes]: 1.26002e-06 [before_grad]: 1.342e-05 [set_forward_comm_id_for_comm_node_pass]: 6.76999e-06 [meta_fg_expand]: 3.3e-06 [flash_sp_send_recv_attached]: 1.56002e-06 [receive_attached]: 2.52001e-06 [after_resolve]: 1.42e-05 [a_after_grad]: 1.238e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 3.58e-06 [auto_monad_grad]: 2.19001e-06 [auto_monad_eliminator]: 1.354e-05 [cse]: 2.535e-05 [a_3]: 7.002e-05 [py_interpret_to_execute_after_opt_a]: 2.227e-05 [slice_cell_reuse_recomputed_activation]: 4.84e-06 [rewriter_after_opt_a]: 5.584e-05 [convert_after_rewriter]: 1.145e-05 [order_py_execute_after_rewriter]: 8.90999e-06 [mutable_eliminate]: 0.00085934 [opt_b]: 0.00037922, [1] [Cycle 1]: 0.00036562, [7] [b_1]: 0.00022556 [b_2]: 1.074e-05 [updatestate_depend_eliminate]: 1.227e-05 [updatestate_assign_eliminate]: 4.53999e-06 [updatestate_loads_eliminate]: 3.76001e-06 [renormalize]: 4.89992e-07 [cse]: 3.974e-05 [optimize_parallel_all_gather_comm]: 2.733e-05 [overlap_param_gather]: 5.37999e-06 [cconv]: 4.161e-05 [loop_unroll]: 0.00052341 [opt_after_cconv]: 0.00016027, [1] [Cycle 1]: 0.00015022, [7] [c_1]: 4.436e-05 [parameter_eliminate]: 5.77001e-06 [updatestate_depend_eliminate]: 7.53e-06 [updatestate_assign_eliminate]: 3.93999e-06 [updatestate_loads_eliminate]: 2.94999e-06 [cse]: 2.673e-05 [renormalize]: 6.19999e-07 [remove_dup_value]: 2.255e-05 [tuple_transform]: 0.0001183, [1] [Cycle 1]: 0.00011032, [4] [d_1]: 6.602e-05 [none_parameter_eliminate]: 2.02999e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 8.84003e-06 [partial_unused_args_eliminate]: 5.24e-06 [add_recomputation]: 6.665e-05 [cse_after_recomputation]: 3.591e-05, [1] [Cycle 1]: 2.774e-05, [1] [cse]: 1.754e-05 [environ_conv]: 1.086e-05 [swap_dp_allreduce_reducescatter]: 1.044e-05 [bias_add_comm_swap]: 5.67001e-06 [label_micro_interleaved_index]: 8.17e-06 [label_fine_grained_interleaved_index]: 5.50001e-06 [merge_cast_opt]: 4.17998e-06 [slice_recompute_activation]: 5.15999e-06 [micro_interleaved_order_control]: 5.15001e-06 [assign_add_opt]: 4.02998e-06 [ForceFp32Comm]: 3.83999e-06 [remove_cast_before_assign_add]: 3.95998e-06 [full_micro_interleaved_order_control]: 4.98001e-06 [reorder_send_recv_between_fp_bp]: 5.30001e-06 [comm_op_add_attrs]: 4.1e-06 [add_comm_op_reuse_tag]: 3.67998e-06 [interleave_split_concat_branches]: 3.96001e-06 [interleave_parallel_branches]: 4.10998e-06 [overlap_opt_shard_in_pipeline]: 4.33001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.32998e-06 [control_data_broadcast_order]: 2.053e-05 [grouped_pairwise_exchange_alltoall]: 4.41002e-06 [offloading_packed_experts]: 8.81002e-06 [overlap_recompute_and_grad_model_parallel]: 8.95999e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.13001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.85e-06 [overlap_recompute_comm]: 5.82999e-06 [overlap_grad_ring_attention]: 9.03002e-06 [overlap_grad_flash_sp]: 3.044e-05 [begin_end_overlap_inline]: 3.67002e-06 [split_matmul_comm_elemetwise]: 4.99e-06 [split_layernorm_comm]: 4.82e-06 [handle_group_info]: 3.89002e-06 [symbol_engine_optimizer]: 0.0001243, [1] [Cycle 1]: 0.00011565, [6] [build]: 4.28999e-06 [elim_shapecalc]: 1.546e-05 [elim_not_effective]: 1.862e-05 [opt_reshape]: 9.24998e-06 [fold_const_symbol]: 1.413e-05 [renormalize]: 2.30008e-07 [detach_backward]: 5.90002e-06 [pipeline_parallel_scheduler]: 2.27001e-06 [auto_monad_reorder]: 2.817e-05 [get_jit_bprop_graph]: 2.61999e-06 [rewriter_after_jit_bprop_graph]: 6.14999e-06 [opt_after_jit_grad]: 0.0006517 [validate]: 5.47e-05 Sums bootstrap : 0.000402s : 0.28% type_inference : 0.005885s : 4.05% event_method : 0.000018s : 0.01% auto_monad : 0.000067s : 0.05% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000035s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000029s : 0.02% optimize.rewriter_before_opt_a : 0.000091s : 0.06% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000047s : 0.03% optimize.opt_a.loop_unroll : 0.000040s : 0.03% optimize.opt_a.a_1 : 0.001080s : 0.74% optimize.opt_a.with_stream_mark : 0.000039s : 0.03% optimize.opt_a.recompute_prepare : 0.000022s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000257s : 0.18% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.132044s : 90.87% optimize.opt_a.shard_inline : 0.000058s : 0.04% optimize.opt_a.merge_send_recv : 0.000036s : 0.02% optimize.opt_a.auto_parallel : 0.000032s : 0.02% optimize.opt_a.parallel : 0.000034s : 0.02% optimize.opt_a.flash_sp : 0.000021s : 0.01% optimize.opt_a.merge_comm : 0.000011s : 0.01% optimize.opt_a.allreduce_fusion : 0.000009s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000026s : 0.02% optimize.opt_a.virtual_dataset : 0.000019s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.01% optimize.opt_a.virtual_output : 0.000016s : 0.01% optimize.opt_a.merge_forward : 0.000010s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% optimize.opt_a.offload_activation : 0.000026s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000054s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000029s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000013s : 0.01% optimize.opt_a.meta_fg_expand : 0.000010s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000030s : 0.02% optimize.opt_a.a_after_grad : 0.000027s : 0.02% optimize.opt_a.renormalize : 0.001182s : 0.81% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000037s : 0.03% optimize.opt_a.cse : 0.000064s : 0.04% optimize.opt_a.a_3 : 0.000154s : 0.11% optimize.py_interpret_to_execute_after_opt_a : 0.000022s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000056s : 0.04% optimize.convert_after_rewriter : 0.000011s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.01% optimize.mutable_eliminate : 0.000859s : 0.59% optimize.opt_b.b_1 : 0.000226s : 0.16% optimize.opt_b.b_2 : 0.000011s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000040s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000027s : 0.02% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000042s : 0.03% optimize.loop_unroll : 0.000523s : 0.36% optimize.opt_after_cconv.c_1 : 0.000044s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000027s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000023s : 0.02% optimize.tuple_transform.d_1 : 0.000066s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000067s : 0.05% optimize.cse_after_recomputation.cse : 0.000018s : 0.01% optimize.environ_conv : 0.000011s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000021s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000009s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000006s : 0.00% optimize.overlap_grad_ring_attention : 0.000009s : 0.01% optimize.overlap_grad_flash_sp : 0.000030s : 0.02% optimize.begin_end_overlap_inline : 0.000004s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000005s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000028s : 0.02% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000652s : 0.45% validate : 0.000055s : 0.04% Time group info: ------[substitution.] 0.000246 38 12.93% : 0.000032s : 3: substitution.cast_eliminate 1.06% : 0.000003s : 3: substitution.elim_not_effective 0.99% : 0.000002s : 3: substitution.fold_const_symbol 3.30% : 0.000008s : 5: substitution.graph_param_transform 65.10% : 0.000160s : 4: substitution.inline 2.58% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.99% : 0.000007s : 6: substitution.remove_not_recompute_node 3.17% : 0.000008s : 4: substitution.replace_old_param 7.87% : 0.000019s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005835 2 86.58% : 0.005052s : 1: type_inference.infer 13.42% : 0.000783s : 1: type_inference.specialize ------[replace.] 0.000073 8 55.53% : 0.000040s : 4: replace.inline 44.47% : 0.000032s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000174 8 90.36% : 0.000158s : 4: match.inline 9.64% : 0.000017s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000287 1596 0.93% : 0.000003s : 17: predicate.accumulaten_eliminater 1.07% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 10: predicate.addn_check_dump 0.90% : 0.000003s : 17: predicate.addn_zero_filter 0.84% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.23% : 0.000006s : 27: predicate.arithmetic_simplify 0.93% : 0.000003s : 17: predicate.cast_eliminate 0.56% : 0.000002s : 10: predicate.check_bprop_eliminate 0.55% : 0.000002s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.61% : 0.000002s : 10: predicate.depend_value_elim 0.92% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.05% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.95% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.28% : 0.000004s : 10: predicate.dumpgradient_eliminate 0.15% : 0.000000s : 5: predicate.elim_not_effective 0.34% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.07% : 0.000003s : 22: predicate.environ_get_depend_swap 1.63% : 0.000005s : 32: predicate.environ_get_eliminate 1.03% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.30% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.07% : 0.000006s : 25: predicate.float_depend_g_call 0.57% : 0.000002s : 10: predicate.float_environ_get_switch 0.77% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 5: predicate.fold_const_symbol 0.61% : 0.000002s : 10: predicate.get_grad_eliminate 0.25% : 0.000001s : 5: predicate.graph_param_transform 0.54% : 0.000002s : 10: predicate.incorporate_call 0.45% : 0.000001s : 10: predicate.incorporate_call_switch 8.66% : 0.000025s : 72: predicate.inline 0.72% : 0.000002s : 10: predicate.inline_without_move 0.28% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.94% : 0.000003s : 10: predicate.less_batch_normalization 1.69% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.40% : 0.000007s : 48: predicate.load_eliminater 1.15% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.84% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.73% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.55% : 0.000002s : 10: predicate.merge_addn 0.53% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 17: predicate.minmaximum_grad 1.57% : 0.000005s : 5: predicate.mutable_eliminate 0.31% : 0.000001s : 5: predicate.opt_reshape 0.34% : 0.000001s : 5: predicate.parallel_virtual_node 1.58% : 0.000005s : 25: predicate.partial_defer_inline 1.50% : 0.000004s : 26: predicate.partial_eliminate 1.06% : 0.000003s : 17: predicate.print_const_string_wrapper 0.55% : 0.000002s : 10: predicate.reduce_all_const_elim 1.36% : 0.000004s : 17: predicate.reduce_eliminate 2.36% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 10: predicate.remove_not_recompute_node 1.42% : 0.000004s : 31: predicate.replace_applicator 0.49% : 0.000001s : 10: predicate.replace_old_param 0.39% : 0.000001s : 5: predicate.reset_defer_inline 1.03% : 0.000003s : 17: predicate.reshape_eliminate 0.72% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 5: predicate.row_tensor_eliminate 0.86% : 0.000002s : 10: predicate.same_eliminate 0.36% : 0.000001s : 10: predicate.set_cell_output_no_recompute 1.24% : 0.000004s : 10: predicate.shard_identity_eliminate 0.70% : 0.000002s : 10: predicate.special_op_eliminate 0.65% : 0.000002s : 10: predicate.specialize_transform 0.93% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.96% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.39% : 0.000004s : 25: predicate.switch_defer_inline 1.85% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.30% : 0.000012s : 76: predicate.switch_simplify 0.90% : 0.000003s : 17: predicate.tile_eliminate 1.00% : 0.000003s : 17: predicate.transpose_eliminate 1.46% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.57% : 0.000005s : 27: predicate.tuple_list_get_item_depend_reorder 2.95% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.56% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.17% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.60% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.32% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 2.93% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.65% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.55% : 0.000002s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000685 11 40.49% : 0.000278s : 5: func_graph_cloner_run.FuncGraphClonerGraph 59.51% : 0.000408s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.296833 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.14% : 0.003398s : 1: add_attr 1.14% : 0.003381s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.02% : 0.000071s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.03% : 0.000076s : 1: auto_monad 0.01% : 0.000036s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.15% : 0.000442s : 1: bootstrap 0.02% : 0.000045s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000024s : 1: control_data_broadcast_order 0.00% : 0.000015s : 1: convert_after_rewriter 0.01% : 0.000039s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000032s : 1: detach_backward 0.00% : 0.000014s : 1: environ_conv 0.01% : 0.000029s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000010s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000009s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.18% : 0.000531s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.29% : 0.000870s : 1: mutable_eliminate 0.00% : 0.000012s : 1: offloading_packed_experts 0.01% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000027s : 1: opt.transform.mutable_eliminate 0.57% : 0.001692s : 78: opt.transform.opt_a 0.01% : 0.000043s : 1: opt.transform.opt_after_cconv 0.01% : 0.000039s : 1: opt.transform.opt_after_jit_grad 0.05% : 0.000153s : 28: opt.transform.opt_b 0.02% : 0.000073s : 2: opt.transform.opt_trans_graph 0.02% : 0.000053s : 4: opt.transform.symbol_engine_opt 45.98% : 0.136489s : 1: opt_a 0.06% : 0.000164s : 1: opt_after_cconv 0.22% : 0.000664s : 1: opt_after_jit_grad 0.13% : 0.000383s : 1: opt_b 47.15% : 0.139971s : 1: optimize 0.01% : 0.000031s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000012s : 1: order_py_execute_after_rewriter 0.01% : 0.000034s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000013s : 1: overlap_grad_ring_attention 0.00% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000009s : 1: overlap_recompute_comm 0.00% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.01% : 0.000043s : 1: pre_auto_parallel 0.01% : 0.000033s : 1: py_interpret_to_execute 0.01% : 0.000026s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000008s : 1: remove_cast_before_assign_add 0.01% : 0.000026s : 1: remove_dup_value 0.21% : 0.000610s : 1: renormalize.infer 0.19% : 0.000559s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000060s : 1: rewriter_after_opt_a 0.03% : 0.000095s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.00% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000127s : 1: symbol_engine_optimizer 0.04% : 0.000122s : 1: tuple_transform 2.00% : 0.005923s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:42.582.491 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.212174, [21] [bootstrap]: 0.193817 [type_inference]: 0.00733077 [event_method]: 2.239e-05 [auto_monad]: 7.251e-05 [graph_reusing]: 6.32001e-06 [inline]: 2.49001e-06 [add_attr]: 0.00376568, [1] [add_attr_with_inline]: 0.0037523, [1] [Cycle 1]: 8.033e-05, [2] [tag_attr]: 2.64e-05 [meta_addattr_fg_expand]: 6.10002e-06 [parallel-infer-symbol]: 4.32e-06 [pre_auto_parallel]: 4.102e-05 [insert-virtual-dataset]: 3.09001e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 2.21e-06 [pipeline_split]: 1.72999e-06 [optimize]: 0.0062713, [53] [py_interpret_to_execute]: 3.288e-05 [rewriter_before_opt_a]: 0.0001025 [opt_a]: 0.00362754, [2] [Cycle 1]: 0.00275614, [45] [expand_dump_flag]: 3.43e-06 [switch_simplify]: 4.512e-05 [loop_unroll]: 3.145e-05 [a_1]: 0.00080474 [with_stream_mark]: 2.457e-05 [recompute_prepare]: 1.267e-05 [updatestate_depend_eliminate]: 5.77999e-06 [updatestate_assign_eliminate]: 4.13001e-06 [updatestate_loads_eliminate]: 3.7e-06 [parameter_eliminate]: 2.03002e-06 [a_2]: 0.00011201 [accelerated_algorithm]: 1.14e-05 [shard]: 2.31e-06 [meta_shard_fg_expand]: 2.11998e-06 [shard_inline]: 9.22999e-06 [merge_send_recv]: 1.118e-05 [auto_parallel]: 9.37001e-06 [parallel]: 2.226e-05 [flash_sp]: 1.013e-05 [merge_comm]: 5.17e-06 [allreduce_fusion]: 4.55001e-06 [matmul_add_comm_reduction]: 1.277e-05 [allreduce_slice_to_reducescatter]: 8.09989e-07 [virtual_shard_identity]: 1.191e-05 [virtual_dataset]: 9.42001e-06 [get_grad_eliminate_]: 9.61998e-06 [virtual_output]: 9.29e-06 [merge_forward]: 5.22e-06 [cell_reuse_recompute_pass]: 1.59998e-06 [offload_activation]: 1.286e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.171e-05 [merge_recompute_call_nodes]: 1.54998e-06 [before_grad]: 1.477e-05 [set_forward_comm_id_for_comm_node_pass]: 5.13002e-06 [meta_fg_expand]: 3.51999e-06 [flash_sp_send_recv_attached]: 3.25e-06 [receive_attached]: 3.17002e-06 [after_resolve]: 1.71e-05 [a_after_grad]: 1.518e-05 [renormalize]: 0.00103421 [add_forward_monad_depend]: 7.66999e-06 [auto_monad_grad]: 2.73e-06 [auto_monad_eliminator]: 2.133e-05 [cse]: 4.133e-05 [a_3]: 6.303e-05 [Cycle 2]: 0.00085801, [45] [expand_dump_flag]: 2.36e-06 [switch_simplify]: 9.81998e-06 [loop_unroll]: 7.83001e-06 [a_1]: 0.00019045 [with_stream_mark]: 1.708e-05 [recompute_prepare]: 7.78999e-06 [updatestate_depend_eliminate]: 4.55001e-06 [updatestate_assign_eliminate]: 2.96001e-06 [updatestate_loads_eliminate]: 3.03e-06 [parameter_eliminate]: 1.49e-06 [a_2]: 0.00011742 [accelerated_algorithm]: 8.52e-06 [shard]: 1.71998e-06 [meta_shard_fg_expand]: 1.79e-06 [shard_inline]: 7.78999e-06 [merge_send_recv]: 8.05999e-06 [auto_parallel]: 8.94998e-06 [parallel]: 7.12002e-06 [flash_sp]: 4.75001e-06 [merge_comm]: 4.90001e-06 [allreduce_fusion]: 4.18001e-06 [matmul_add_comm_reduction]: 9.50001e-06 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 9.76e-06 [virtual_dataset]: 8.32e-06 [get_grad_eliminate_]: 7.18e-06 [virtual_output]: 6.96001e-06 [merge_forward]: 4.20999e-06 [cell_reuse_recompute_pass]: 2.12999e-06 [offload_activation]: 9.20999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.528e-05 [merge_recompute_call_nodes]: 9.49978e-07 [before_grad]: 1.183e-05 [set_forward_comm_id_for_comm_node_pass]: 4.70999e-06 [meta_fg_expand]: 3.01001e-06 [flash_sp_send_recv_attached]: 1.59998e-06 [receive_attached]: 1.87001e-06 [after_resolve]: 1.338e-05 [a_after_grad]: 1.167e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.40001e-06 [auto_monad_grad]: 1.59e-06 [auto_monad_eliminator]: 1.112e-05 [cse]: 2.536e-05 [a_3]: 4.782e-05 [py_interpret_to_execute_after_opt_a]: 1.539e-05 [slice_cell_reuse_recomputed_activation]: 2.54999e-06 [rewriter_after_opt_a]: 4.791e-05 [convert_after_rewriter]: 8.15e-06 [order_py_execute_after_rewriter]: 5.67001e-06 [mutable_eliminate]: 0.00078554 [opt_b]: 0.00028818, [1] [Cycle 1]: 0.00028019, [7] [b_1]: 0.00017887 [b_2]: 1.153e-05 [updatestate_depend_eliminate]: 1.009e-05 [updatestate_assign_eliminate]: 3.50998e-06 [updatestate_loads_eliminate]: 3.66001e-06 [renormalize]: 5.39992e-07 [cse]: 3.186e-05 [optimize_parallel_all_gather_comm]: 2.103e-05 [overlap_param_gather]: 1.90001e-06 [cconv]: 3.725e-05 [loop_unroll]: 0.00049212 [opt_after_cconv]: 0.00013016, [1] [Cycle 1]: 0.00012255, [7] [c_1]: 3.939e-05 [parameter_eliminate]: 5.32999e-06 [updatestate_depend_eliminate]: 6.76999e-06 [updatestate_assign_eliminate]: 3.45e-06 [updatestate_loads_eliminate]: 3.16999e-06 [cse]: 2.742e-05 [renormalize]: 6.00005e-07 [remove_dup_value]: 1.746e-05 [tuple_transform]: 9.837e-05, [1] [Cycle 1]: 9.336e-05, [4] [d_1]: 6.272e-05 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 3.39991e-07 [switch_simplify]: 8.79003e-06 [partial_unused_args_eliminate]: 2.04e-06 [add_recomputation]: 6.629e-05 [cse_after_recomputation]: 2.973e-05, [1] [Cycle 1]: 2.442e-05, [1] [cse]: 1.756e-05 [environ_conv]: 7.31001e-06 [swap_dp_allreduce_reducescatter]: 7e-06 [bias_add_comm_swap]: 3.14001e-06 [label_micro_interleaved_index]: 4.82998e-06 [label_fine_grained_interleaved_index]: 2.82002e-06 [merge_cast_opt]: 1.62001e-06 [slice_recompute_activation]: 2.11998e-06 [micro_interleaved_order_control]: 2.78e-06 [assign_add_opt]: 1.40999e-06 [ForceFp32Comm]: 8.89995e-07 [remove_cast_before_assign_add]: 1.32e-06 [full_micro_interleaved_order_control]: 2.02001e-06 [reorder_send_recv_between_fp_bp]: 2.78998e-06 [comm_op_add_attrs]: 1.10001e-06 [add_comm_op_reuse_tag]: 1.14e-06 [interleave_split_concat_branches]: 1.19e-06 [interleave_parallel_branches]: 1.29e-06 [overlap_opt_shard_in_pipeline]: 1.24e-06 [overlap_opt_shard_grad_in_pipeline]: 2.04999e-06 [control_data_broadcast_order]: 1.674e-05 [grouped_pairwise_exchange_alltoall]: 1.55001e-06 [offloading_packed_experts]: 5.38002e-06 [overlap_recompute_and_grad_model_parallel]: 6.29001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.35999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.29001e-06 [overlap_grad_ring_attention]: 5.10001e-06 [overlap_grad_flash_sp]: 2.706e-05 [begin_end_overlap_inline]: 6.00005e-07 [split_matmul_comm_elemetwise]: 2.30002e-06 [split_layernorm_comm]: 2.01998e-06 [handle_group_info]: 1.24e-06 [symbol_engine_optimizer]: 9.857e-05, [1] [Cycle 1]: 9.346e-05, [6] [build]: 4.98001e-06 [elim_shapecalc]: 1.578e-05 [elim_not_effective]: 1.713e-05 [opt_reshape]: 8.80999e-06 [fold_const_symbol]: 1.342e-05 [renormalize]: 3.00002e-07 [detach_backward]: 2.34001e-06 [pipeline_parallel_scheduler]: 1.59e-06 [auto_monad_reorder]: 2.173e-05 [get_jit_bprop_graph]: 2.34999e-06 [rewriter_after_jit_bprop_graph]: 7.21001e-06 [opt_after_jit_grad]: 0.00055115 [validate]: 5.243e-05 Sums bootstrap : 0.193817s : 93.49% type_inference : 0.007331s : 3.54% event_method : 0.000022s : 0.01% auto_monad : 0.000073s : 0.03% graph_reusing : 0.000006s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000041s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.02% optimize.rewriter_before_opt_a : 0.000102s : 0.05% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000055s : 0.03% optimize.opt_a.loop_unroll : 0.000039s : 0.02% optimize.opt_a.a_1 : 0.000995s : 0.48% optimize.opt_a.with_stream_mark : 0.000042s : 0.02% optimize.opt_a.recompute_prepare : 0.000020s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000229s : 0.11% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000017s : 0.01% optimize.opt_a.merge_send_recv : 0.000019s : 0.01% optimize.opt_a.auto_parallel : 0.000018s : 0.01% optimize.opt_a.parallel : 0.000029s : 0.01% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000010s : 0.00% optimize.opt_a.allreduce_fusion : 0.000009s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.01% optimize.opt_a.virtual_dataset : 0.000018s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.01% optimize.opt_a.virtual_output : 0.000016s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000022s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000027s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000030s : 0.01% optimize.opt_a.a_after_grad : 0.000027s : 0.01% optimize.opt_a.renormalize : 0.001034s : 0.50% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.02% optimize.opt_a.cse : 0.000067s : 0.03% optimize.opt_a.a_3 : 0.000111s : 0.05% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000048s : 0.02% optimize.convert_after_rewriter : 0.000008s : 0.00% optimize.order_py_execute_after_rewriter : 0.000006s : 0.00% optimize.mutable_eliminate : 0.000786s : 0.38% optimize.opt_b.b_1 : 0.000179s : 0.09% optimize.opt_b.b_2 : 0.000012s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000032s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000037s : 0.02% optimize.loop_unroll : 0.000492s : 0.24% optimize.opt_after_cconv.c_1 : 0.000039s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000027s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.01% optimize.tuple_transform.d_1 : 0.000063s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000066s : 0.03% optimize.cse_after_recomputation.cse : 0.000018s : 0.01% optimize.environ_conv : 0.000007s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000027s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000551s : 0.27% validate : 0.000052s : 0.03% Time group info: ------[substitution.] 0.000262 38 11.39% : 0.000030s : 3: substitution.cast_eliminate 0.94% : 0.000002s : 3: substitution.elim_not_effective 0.76% : 0.000002s : 3: substitution.fold_const_symbol 2.97% : 0.000008s : 5: substitution.graph_param_transform 70.56% : 0.000185s : 4: substitution.inline 1.87% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.71% : 0.000007s : 6: substitution.remove_not_recompute_node 2.40% : 0.000006s : 4: substitution.replace_old_param 6.38% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007253 2 88.03% : 0.006385s : 1: type_inference.infer 11.97% : 0.000868s : 1: type_inference.specialize ------[replace.] 0.000071 8 62.01% : 0.000044s : 4: replace.inline 37.99% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000197 8 92.49% : 0.000182s : 4: match.inline 7.51% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000293 1596 0.87% : 0.000003s : 17: predicate.accumulaten_eliminater 0.75% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 7.73% : 0.000023s : 10: predicate.addn_check_dump 0.92% : 0.000003s : 17: predicate.addn_zero_filter 0.81% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.78% : 0.000005s : 27: predicate.arithmetic_simplify 1.00% : 0.000003s : 17: predicate.cast_eliminate 0.56% : 0.000002s : 10: predicate.check_bprop_eliminate 0.53% : 0.000002s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.61% : 0.000002s : 10: predicate.depend_value_elim 0.88% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.97% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.05% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 5: predicate.elim_not_effective 0.50% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.00% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.18% : 0.000003s : 22: predicate.environ_get_depend_swap 1.50% : 0.000004s : 32: predicate.environ_get_eliminate 0.97% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.21% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.21% : 0.000006s : 25: predicate.float_depend_g_call 0.48% : 0.000001s : 10: predicate.float_environ_get_switch 0.78% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 5: predicate.fold_const_symbol 0.77% : 0.000002s : 10: predicate.get_grad_eliminate 0.17% : 0.000001s : 5: predicate.graph_param_transform 0.58% : 0.000002s : 10: predicate.incorporate_call 0.44% : 0.000001s : 10: predicate.incorporate_call_switch 5.78% : 0.000017s : 72: predicate.inline 0.76% : 0.000002s : 10: predicate.inline_without_move 0.27% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.86% : 0.000003s : 10: predicate.less_batch_normalization 1.64% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.46% : 0.000007s : 48: predicate.load_eliminater 1.01% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.80% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.57% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.57% : 0.000002s : 10: predicate.merge_addn 0.49% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 17: predicate.minmaximum_grad 1.36% : 0.000004s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.34% : 0.000001s : 5: predicate.parallel_virtual_node 1.97% : 0.000006s : 25: predicate.partial_defer_inline 1.46% : 0.000004s : 26: predicate.partial_eliminate 0.99% : 0.000003s : 17: predicate.print_const_string_wrapper 0.50% : 0.000001s : 10: predicate.reduce_all_const_elim 1.21% : 0.000004s : 17: predicate.reduce_eliminate 2.33% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.32% : 0.000001s : 10: predicate.remove_not_recompute_node 1.34% : 0.000004s : 31: predicate.replace_applicator 0.58% : 0.000002s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 0.86% : 0.000003s : 17: predicate.reshape_eliminate 0.56% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 5: predicate.row_tensor_eliminate 0.78% : 0.000002s : 10: predicate.same_eliminate 0.47% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 10: predicate.shard_identity_eliminate 0.62% : 0.000002s : 10: predicate.special_op_eliminate 0.64% : 0.000002s : 10: predicate.specialize_transform 0.81% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.33% : 0.000004s : 25: predicate.switch_defer_inline 1.79% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.34% : 0.000013s : 76: predicate.switch_simplify 0.82% : 0.000002s : 17: predicate.tile_eliminate 0.89% : 0.000003s : 17: predicate.transpose_eliminate 1.39% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 2.75% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.47% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.56% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.29% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 2.77% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 5: predicate.value_based_eliminate 0.72% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.57% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000744 11 55.27% : 0.000411s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.73% : 0.000333s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.225056 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.68% : 0.003773s : 1: add_attr 1.67% : 0.003757s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000071s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.03% : 0.000079s : 1: auto_monad 0.01% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 86.14% : 0.193864s : 1: bootstrap 0.02% : 0.000041s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000021s : 1: control_data_broadcast_order 0.01% : 0.000011s : 1: convert_after_rewriter 0.01% : 0.000033s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000011s : 1: environ_conv 0.01% : 0.000031s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.22% : 0.000501s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.35% : 0.000796s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000023s : 1: opt.transform.mutable_eliminate 0.70% : 0.001578s : 78: opt.transform.opt_a 0.02% : 0.000038s : 1: opt.transform.opt_after_cconv 0.02% : 0.000037s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000156s : 28: opt.transform.opt_b 0.03% : 0.000069s : 2: opt.transform.opt_trans_graph 0.02% : 0.000051s : 4: opt.transform.symbol_engine_opt 1.61% : 0.003631s : 1: opt_a 0.06% : 0.000134s : 1: opt_after_cconv 0.25% : 0.000561s : 1: opt_after_jit_grad 0.13% : 0.000293s : 1: opt_b 2.79% : 0.006277s : 1: optimize 0.01% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000009s : 1: order_py_execute_after_rewriter 0.01% : 0.000030s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000009s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.02% : 0.000045s : 1: pre_auto_parallel 0.02% : 0.000037s : 1: py_interpret_to_execute 0.01% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.01% : 0.000021s : 1: remove_dup_value 0.26% : 0.000582s : 1: renormalize.infer 0.20% : 0.000441s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000052s : 1: rewriter_after_opt_a 0.05% : 0.000107s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000101s : 1: symbol_engine_optimizer 0.05% : 0.000101s : 1: tuple_transform 3.27% : 0.007357s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:45.159.425 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:45.159.695 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.200705, [21] [bootstrap]: 0.00047746 [type_inference]: 0.187506 [event_method]: 2.55e-05 [auto_monad]: 8.311e-05 [graph_reusing]: 7.77e-06 [inline]: 2.94999e-06 [add_attr]: 0.00417279, [1] [add_attr_with_inline]: 0.0041599, [1] [Cycle 1]: 0.00010441, [2] [tag_attr]: 2.663e-05 [meta_addattr_fg_expand]: 6.12001e-06 [parallel-infer-symbol]: 3.91999e-06 [pre_auto_parallel]: 4.782e-05 [insert-virtual-dataset]: 2.68998e-06 [parallel-infer-symbol-second]: 8.79983e-07 [dataset_repeat_opt]: 2.32001e-06 [pipeline_split]: 2.05002e-06 [optimize]: 0.00704712, [53] [py_interpret_to_execute]: 4.183e-05 [rewriter_before_opt_a]: 0.00010862 [opt_a]: 0.00394737, [2] [Cycle 1]: 0.00293278, [45] [expand_dump_flag]: 4.28999e-06 [switch_simplify]: 4.655e-05 [loop_unroll]: 3.555e-05 [a_1]: 0.00080808 [with_stream_mark]: 2.175e-05 [recompute_prepare]: 1.315e-05 [updatestate_depend_eliminate]: 5.64998e-06 [updatestate_assign_eliminate]: 4.01001e-06 [updatestate_loads_eliminate]: 3.83999e-06 [parameter_eliminate]: 2.32001e-06 [a_2]: 0.00013499 [accelerated_algorithm]: 9.32999e-06 [shard]: 1.73002e-06 [meta_shard_fg_expand]: 2.44001e-06 [shard_inline]: 8.62e-06 [merge_send_recv]: 1.083e-05 [auto_parallel]: 9.64e-06 [parallel]: 1.963e-05 [flash_sp]: 1.058e-05 [merge_comm]: 5.62001e-06 [allreduce_fusion]: 5.14998e-06 [matmul_add_comm_reduction]: 1.199e-05 [allreduce_slice_to_reducescatter]: 8.39995e-07 [virtual_shard_identity]: 1.413e-05 [virtual_dataset]: 1.001e-05 [get_grad_eliminate_]: 1.01e-05 [virtual_output]: 9.59999e-06 [merge_forward]: 5.57999e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 1.241e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.218e-05 [merge_recompute_call_nodes]: 1.62999e-06 [before_grad]: 1.656e-05 [set_forward_comm_id_for_comm_node_pass]: 5.75001e-06 [meta_fg_expand]: 3.91001e-06 [flash_sp_send_recv_attached]: 3.11999e-06 [receive_attached]: 2.83e-06 [after_resolve]: 1.596e-05 [a_after_grad]: 1.492e-05 [renormalize]: 0.00099442 [add_forward_monad_depend]: 6.54001e-06 [auto_monad_grad]: 2.83e-06 [auto_monad_eliminator]: 1.97e-05 [cse]: 4.058e-05 [a_3]: 7.612e-05 [Cycle 2]: 0.00099643, [45] [expand_dump_flag]: 2.06e-06 [switch_simplify]: 9.91003e-06 [loop_unroll]: 7.88999e-06 [a_1]: 0.00019075 [with_stream_mark]: 1.638e-05 [recompute_prepare]: 8.65001e-06 [updatestate_depend_eliminate]: 4.58001e-06 [updatestate_assign_eliminate]: 3.31001e-06 [updatestate_loads_eliminate]: 3.20998e-06 [parameter_eliminate]: 1.12e-06 [a_2]: 0.00012165 [accelerated_algorithm]: 8.24998e-06 [shard]: 1.55999e-06 [meta_shard_fg_expand]: 2.17999e-06 [shard_inline]: 7.63999e-06 [merge_send_recv]: 8.42e-06 [auto_parallel]: 8.13001e-06 [parallel]: 5.79e-06 [flash_sp]: 3.6e-06 [merge_comm]: 4.48001e-06 [allreduce_fusion]: 1.092e-05 [matmul_add_comm_reduction]: 7.19001e-06 [allreduce_slice_to_reducescatter]: 5.79981e-07 [virtual_shard_identity]: 8.85999e-06 [virtual_dataset]: 7.63001e-06 [get_grad_eliminate_]: 7.35e-06 [virtual_output]: 7.76001e-06 [merge_forward]: 3.95998e-06 [cell_reuse_recompute_pass]: 2.34001e-06 [offload_activation]: 9.11998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.732e-05 [merge_recompute_call_nodes]: 1.43002e-06 [before_grad]: 1.325e-05 [set_forward_comm_id_for_comm_node_pass]: 4.82998e-06 [meta_fg_expand]: 3.31999e-06 [flash_sp_send_recv_attached]: 9.50007e-07 [receive_attached]: 2.16e-06 [after_resolve]: 1.292e-05 [a_after_grad]: 1.269e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.64998e-06 [auto_monad_grad]: 1.32e-06 [auto_monad_eliminator]: 1.044e-05 [cse]: 2.022e-05 [a_3]: 5.847e-05 [py_interpret_to_execute_after_opt_a]: 1.811e-05 [slice_cell_reuse_recomputed_activation]: 4.84e-06 [rewriter_after_opt_a]: 5.15e-05 [convert_after_rewriter]: 1.131e-05 [order_py_execute_after_rewriter]: 9.34e-06 [mutable_eliminate]: 0.00087194 [opt_b]: 0.00035337, [1] [Cycle 1]: 0.00034302, [7] [b_1]: 0.00022624 [b_2]: 1.071e-05 [updatestate_depend_eliminate]: 8.15e-06 [updatestate_assign_eliminate]: 3.3e-06 [updatestate_loads_eliminate]: 3.2e-06 [renormalize]: 6.39993e-07 [cse]: 2.711e-05 [optimize_parallel_all_gather_comm]: 2.387e-05 [overlap_param_gather]: 5.27999e-06 [cconv]: 3.452e-05 [loop_unroll]: 0.00062404 [opt_after_cconv]: 0.00016551, [1] [Cycle 1]: 0.00015537, [7] [c_1]: 4.846e-05 [parameter_eliminate]: 4.87998e-06 [updatestate_depend_eliminate]: 8.07e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 3.15002e-06 [cse]: 2.707e-05 [renormalize]: 2.50002e-07 [remove_dup_value]: 2.03e-05 [tuple_transform]: 0.00011014, [1] [Cycle 1]: 0.00010211, [4] [d_1]: 5.915e-05 [none_parameter_eliminate]: 1.54998e-06 [renormalize]: 2.20025e-07 [switch_simplify]: 8.65001e-06 [partial_unused_args_eliminate]: 5.14e-06 [add_recomputation]: 6.407e-05 [cse_after_recomputation]: 3.323e-05, [1] [Cycle 1]: 2.597e-05, [1] [cse]: 1.68e-05 [environ_conv]: 1.073e-05 [swap_dp_allreduce_reducescatter]: 8.62e-06 [bias_add_comm_swap]: 5.72999e-06 [label_micro_interleaved_index]: 8.69e-06 [label_fine_grained_interleaved_index]: 5.16002e-06 [merge_cast_opt]: 3.93001e-06 [slice_recompute_activation]: 5.14e-06 [micro_interleaved_order_control]: 5.20999e-06 [assign_add_opt]: 3.58e-06 [ForceFp32Comm]: 3.65e-06 [remove_cast_before_assign_add]: 3.53e-06 [full_micro_interleaved_order_control]: 4.45e-06 [reorder_send_recv_between_fp_bp]: 5.70001e-06 [comm_op_add_attrs]: 4.1e-06 [add_comm_op_reuse_tag]: 4.02e-06 [interleave_split_concat_branches]: 3.83001e-06 [interleave_parallel_branches]: 3.73001e-06 [overlap_opt_shard_in_pipeline]: 3.68999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.46002e-06 [control_data_broadcast_order]: 1.932e-05 [grouped_pairwise_exchange_alltoall]: 4.32e-06 [offloading_packed_experts]: 7.64002e-06 [overlap_recompute_and_grad_model_parallel]: 8.70999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.73001e-06 [overlap_recompute_allgather_and_fa_grad]: 4.17e-06 [overlap_recompute_comm]: 5.46998e-06 [overlap_grad_ring_attention]: 7.73999e-06 [overlap_grad_flash_sp]: 2.775e-05 [begin_end_overlap_inline]: 2.98e-06 [split_matmul_comm_elemetwise]: 4.80001e-06 [split_layernorm_comm]: 4.15e-06 [handle_group_info]: 3.29001e-06 [symbol_engine_optimizer]: 0.0001139, [1] [Cycle 1]: 0.00010659, [6] [build]: 4.38999e-06 [elim_shapecalc]: 1.346e-05 [elim_not_effective]: 1.665e-05 [opt_reshape]: 9.70002e-06 [fold_const_symbol]: 1.395e-05 [renormalize]: 2.30008e-07 [detach_backward]: 4.46002e-06 [pipeline_parallel_scheduler]: 1.89e-06 [auto_monad_reorder]: 2.366e-05 [get_jit_bprop_graph]: 1.88002e-06 [rewriter_after_jit_bprop_graph]: 4.95001e-06 [opt_after_jit_grad]: 0.0005653 [validate]: 4.884e-05 Sums bootstrap : 0.000477s : 0.25% type_inference : 0.187506s : 96.37% event_method : 0.000025s : 0.01% auto_monad : 0.000083s : 0.04% graph_reusing : 0.000008s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000048s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000042s : 0.02% optimize.rewriter_before_opt_a : 0.000109s : 0.06% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000056s : 0.03% optimize.opt_a.loop_unroll : 0.000043s : 0.02% optimize.opt_a.a_1 : 0.000999s : 0.51% optimize.opt_a.with_stream_mark : 0.000038s : 0.02% optimize.opt_a.recompute_prepare : 0.000022s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000257s : 0.13% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.01% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000016s : 0.01% optimize.opt_a.merge_send_recv : 0.000019s : 0.01% optimize.opt_a.auto_parallel : 0.000018s : 0.01% optimize.opt_a.parallel : 0.000025s : 0.01% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000010s : 0.01% optimize.opt_a.allreduce_fusion : 0.000016s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.01% optimize.opt_a.virtual_dataset : 0.000018s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.01% optimize.opt_a.virtual_output : 0.000017s : 0.01% optimize.opt_a.merge_forward : 0.000010s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000022s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000030s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000029s : 0.01% optimize.opt_a.a_after_grad : 0.000028s : 0.01% optimize.opt_a.renormalize : 0.000994s : 0.51% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.02% optimize.opt_a.cse : 0.000061s : 0.03% optimize.opt_a.a_3 : 0.000135s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000052s : 0.03% optimize.convert_after_rewriter : 0.000011s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.00% optimize.mutable_eliminate : 0.000872s : 0.45% optimize.opt_b.b_1 : 0.000226s : 0.12% optimize.opt_b.b_2 : 0.000011s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.01% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000035s : 0.02% optimize.loop_unroll : 0.000624s : 0.32% optimize.opt_after_cconv.c_1 : 0.000048s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000027s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.01% optimize.tuple_transform.d_1 : 0.000059s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000064s : 0.03% optimize.cse_after_recomputation.cse : 0.000017s : 0.01% optimize.environ_conv : 0.000011s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.00% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000009s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000004s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000019s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000008s : 0.00% optimize.overlap_grad_flash_sp : 0.000028s : 0.01% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000024s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000565s : 0.29% validate : 0.000049s : 0.03% Time group info: ------[substitution.] 0.000265 38 11.52% : 0.000031s : 3: substitution.cast_eliminate 0.97% : 0.000003s : 3: substitution.elim_not_effective 0.70% : 0.000002s : 3: substitution.fold_const_symbol 2.84% : 0.000008s : 5: substitution.graph_param_transform 70.18% : 0.000186s : 4: substitution.inline 2.17% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.74% : 0.000007s : 6: substitution.remove_not_recompute_node 2.29% : 0.000006s : 4: substitution.replace_old_param 6.58% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.187429 2 99.46% : 0.186426s : 1: type_inference.infer 0.54% : 0.001003s : 1: type_inference.specialize ------[replace.] 0.000069 8 61.63% : 0.000042s : 4: replace.inline 38.37% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000199 8 92.30% : 0.000183s : 4: match.inline 7.70% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000271 1596 0.96% : 0.000003s : 17: predicate.accumulaten_eliminater 0.87% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.61% : 0.000002s : 10: predicate.addn_check_dump 0.99% : 0.000003s : 17: predicate.addn_zero_filter 0.86% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.11% : 0.000006s : 27: predicate.arithmetic_simplify 0.94% : 0.000003s : 17: predicate.cast_eliminate 0.59% : 0.000002s : 10: predicate.check_bprop_eliminate 0.60% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.65% : 0.000002s : 10: predicate.depend_value_elim 0.99% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.12% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.99% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.97% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.41% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 22: predicate.environ_get_depend_swap 1.70% : 0.000005s : 32: predicate.environ_get_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.46% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.14% : 0.000006s : 25: predicate.float_depend_g_call 0.55% : 0.000001s : 10: predicate.float_environ_get_switch 0.82% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.68% : 0.000002s : 10: predicate.get_grad_eliminate 0.17% : 0.000000s : 5: predicate.graph_param_transform 0.59% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.40% : 0.000017s : 72: predicate.inline 0.93% : 0.000003s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.81% : 0.000002s : 10: predicate.less_batch_normalization 1.80% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.45% : 0.000007s : 48: predicate.load_eliminater 1.05% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.18% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.67% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.65% : 0.000002s : 10: predicate.merge_addn 0.64% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.91% : 0.000002s : 17: predicate.minmaximum_grad 1.04% : 0.000003s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.33% : 0.000001s : 5: predicate.parallel_virtual_node 1.79% : 0.000005s : 25: predicate.partial_defer_inline 1.58% : 0.000004s : 26: predicate.partial_eliminate 0.91% : 0.000002s : 17: predicate.print_const_string_wrapper 0.65% : 0.000002s : 10: predicate.reduce_all_const_elim 1.24% : 0.000003s : 17: predicate.reduce_eliminate 2.44% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 10: predicate.remove_not_recompute_node 1.35% : 0.000004s : 31: predicate.replace_applicator 0.45% : 0.000001s : 10: predicate.replace_old_param 0.24% : 0.000001s : 5: predicate.reset_defer_inline 0.96% : 0.000003s : 17: predicate.reshape_eliminate 0.68% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 5: predicate.row_tensor_eliminate 0.73% : 0.000002s : 10: predicate.same_eliminate 0.43% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.89% : 0.000002s : 10: predicate.shard_identity_eliminate 0.72% : 0.000002s : 10: predicate.special_op_eliminate 0.67% : 0.000002s : 10: predicate.specialize_transform 1.02% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.47% : 0.000004s : 25: predicate.switch_defer_inline 1.99% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.79% : 0.000013s : 76: predicate.switch_simplify 1.00% : 0.000003s : 17: predicate.tile_eliminate 0.97% : 0.000003s : 17: predicate.transpose_eliminate 1.55% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.61% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.32% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.57% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.33% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.72% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.44% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.22% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 5: predicate.value_based_eliminate 0.65% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000739 11 49.47% : 0.000366s : 5: func_graph_cloner_run.FuncGraphClonerGraph 50.53% : 0.000374s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.214712 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.95% : 0.004188s : 1: add_attr 1.94% : 0.004164s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.03% : 0.000068s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.04% : 0.000093s : 1: auto_monad 0.01% : 0.000032s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.24% : 0.000525s : 1: bootstrap 0.02% : 0.000038s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000023s : 1: control_data_broadcast_order 0.01% : 0.000015s : 1: convert_after_rewriter 0.02% : 0.000036s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000020s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.02% : 0.000037s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000014s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 0.29% : 0.000633s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.41% : 0.000880s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000020s : 1: opt.transform.mutable_eliminate 0.73% : 0.001565s : 78: opt.transform.opt_a 0.02% : 0.000047s : 1: opt.transform.opt_after_cconv 0.02% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000150s : 28: opt.transform.opt_b 0.03% : 0.000066s : 2: opt.transform.opt_trans_graph 0.02% : 0.000050s : 4: opt.transform.symbol_engine_opt 1.84% : 0.003951s : 1: opt_a 0.08% : 0.000170s : 1: opt_after_cconv 0.27% : 0.000575s : 1: opt_after_jit_grad 0.17% : 0.000357s : 1: opt_b 3.46% : 0.007429s : 1: optimize 0.01% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.01% : 0.000031s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000012s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.03% : 0.000056s : 1: pre_auto_parallel 0.02% : 0.000047s : 1: py_interpret_to_execute 0.01% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.01% : 0.000024s : 1: remove_dup_value 0.25% : 0.000544s : 1: renormalize.infer 0.20% : 0.000440s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000055s : 1: rewriter_after_opt_a 0.05% : 0.000113s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000117s : 1: symbol_engine_optimizer 0.05% : 0.000113s : 1: tuple_transform 87.36% : 0.187571s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:47.632.333 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0294856, [21] [bootstrap]: 0.00037773 [type_inference]: 0.0179341 [event_method]: 1.955e-05 [auto_monad]: 7.174e-05 [graph_reusing]: 6.66999e-06 [inline]: 2.99999e-06 [add_attr]: 0.00391397, [1] [add_attr_with_inline]: 0.00390019, [1] [Cycle 1]: 8.449e-05, [2] [tag_attr]: 2.608e-05 [meta_addattr_fg_expand]: 6.26e-06 [parallel-infer-symbol]: 3.55e-06 [pre_auto_parallel]: 4.178e-05 [insert-virtual-dataset]: 2.65997e-06 [parallel-infer-symbol-second]: 7.80012e-07 [dataset_repeat_opt]: 2.44001e-06 [pipeline_split]: 1.72001e-06 [optimize]: 0.00628812, [53] [py_interpret_to_execute]: 3.11e-05 [rewriter_before_opt_a]: 9.725e-05 [opt_a]: 0.00362097, [2] [Cycle 1]: 0.00274579, [45] [expand_dump_flag]: 3.25998e-06 [switch_simplify]: 4.619e-05 [loop_unroll]: 3.164e-05 [a_1]: 0.00077875 [with_stream_mark]: 2.259e-05 [recompute_prepare]: 1.144e-05 [updatestate_depend_eliminate]: 5.62999e-06 [updatestate_assign_eliminate]: 4.03999e-06 [updatestate_loads_eliminate]: 3.43e-06 [parameter_eliminate]: 2.07001e-06 [a_2]: 0.0001054 [accelerated_algorithm]: 9.21998e-06 [shard]: 2.29999e-06 [meta_shard_fg_expand]: 2.69001e-06 [shard_inline]: 7.8e-06 [merge_send_recv]: 9.74e-06 [auto_parallel]: 8.04997e-06 [parallel]: 2.079e-05 [flash_sp]: 1.047e-05 [merge_comm]: 5.32999e-06 [allreduce_fusion]: 4.55001e-06 [matmul_add_comm_reduction]: 1.201e-05 [allreduce_slice_to_reducescatter]: 1.02998e-06 [virtual_shard_identity]: 1.138e-05 [virtual_dataset]: 9.50001e-06 [get_grad_eliminate_]: 8.18001e-06 [virtual_output]: 8.13999e-06 [merge_forward]: 5.05001e-06 [cell_reuse_recompute_pass]: 1.91e-06 [offload_activation]: 1.218e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.94e-05 [merge_recompute_call_nodes]: 2.17999e-06 [before_grad]: 1.662e-05 [set_forward_comm_id_for_comm_node_pass]: 5.25001e-06 [meta_fg_expand]: 4.18001e-06 [flash_sp_send_recv_attached]: 3.4e-06 [receive_attached]: 2.84001e-06 [after_resolve]: 1.511e-05 [a_after_grad]: 1.443e-05 [renormalize]: 0.0010663 [add_forward_monad_depend]: 8.02e-06 [auto_monad_grad]: 2.87002e-06 [auto_monad_eliminator]: 2.364e-05 [cse]: 4.258e-05 [a_3]: 6.762e-05 [Cycle 2]: 0.00086211, [45] [expand_dump_flag]: 2.80997e-06 [switch_simplify]: 1.049e-05 [loop_unroll]: 7.82e-06 [a_1]: 0.00018779 [with_stream_mark]: 1.927e-05 [recompute_prepare]: 8.62e-06 [updatestate_depend_eliminate]: 5.19e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 3.18e-06 [parameter_eliminate]: 1.65001e-06 [a_2]: 9.362e-05 [accelerated_algorithm]: 8.39002e-06 [shard]: 2.19001e-06 [meta_shard_fg_expand]: 2.12001e-06 [shard_inline]: 7.58001e-06 [merge_send_recv]: 9.94001e-06 [auto_parallel]: 9.16998e-06 [parallel]: 8.92999e-06 [flash_sp]: 4.57998e-06 [merge_comm]: 4.35e-06 [allreduce_fusion]: 1.054e-05 [matmul_add_comm_reduction]: 1.029e-05 [allreduce_slice_to_reducescatter]: 7.80012e-07 [virtual_shard_identity]: 8.97e-06 [virtual_dataset]: 7.61001e-06 [get_grad_eliminate_]: 7.13998e-06 [virtual_output]: 7.92e-06 [merge_forward]: 4.65001e-06 [cell_reuse_recompute_pass]: 2.86e-06 [offload_activation]: 1.047e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.802e-05 [merge_recompute_call_nodes]: 1.10999e-06 [before_grad]: 1.339e-05 [set_forward_comm_id_for_comm_node_pass]: 5.38002e-06 [meta_fg_expand]: 3.09999e-06 [flash_sp_send_recv_attached]: 1.54e-06 [receive_attached]: 2.22999e-06 [after_resolve]: 1.65e-05 [a_after_grad]: 1.239e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.44999e-06 [auto_monad_grad]: 2.00002e-06 [auto_monad_eliminator]: 1.238e-05 [cse]: 2.377e-05 [a_3]: 4.609e-05 [py_interpret_to_execute_after_opt_a]: 1.782e-05 [slice_cell_reuse_recomputed_activation]: 2.84001e-06 [rewriter_after_opt_a]: 5.038e-05 [convert_after_rewriter]: 8.62e-06 [order_py_execute_after_rewriter]: 6.16998e-06 [mutable_eliminate]: 0.00078239 [opt_b]: 0.00028139, [1] [Cycle 1]: 0.0002737, [7] [b_1]: 0.00017484 [b_2]: 1.226e-05 [updatestate_depend_eliminate]: 6.78e-06 [updatestate_assign_eliminate]: 3.39001e-06 [updatestate_loads_eliminate]: 3.23998e-06 [renormalize]: 7.2e-07 [cse]: 3.099e-05 [optimize_parallel_all_gather_comm]: 2.028e-05 [overlap_param_gather]: 2.34001e-06 [cconv]: 3.531e-05 [loop_unroll]: 0.00051201 [opt_after_cconv]: 0.00013149, [1] [Cycle 1]: 0.00012397, [7] [c_1]: 4.373e-05 [parameter_eliminate]: 4.25999e-06 [updatestate_depend_eliminate]: 6.44001e-06 [updatestate_assign_eliminate]: 3.48999e-06 [updatestate_loads_eliminate]: 3.41999e-06 [cse]: 2.566e-05 [renormalize]: 7.2e-07 [remove_dup_value]: 1.772e-05 [tuple_transform]: 9.362e-05, [1] [Cycle 1]: 8.853e-05, [4] [d_1]: 5.813e-05 [none_parameter_eliminate]: 1.77999e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 8.13001e-06 [partial_unused_args_eliminate]: 2.06e-06 [add_recomputation]: 6.583e-05 [cse_after_recomputation]: 2.884e-05, [1] [Cycle 1]: 2.358e-05, [1] [cse]: 1.762e-05 [environ_conv]: 6.12999e-06 [swap_dp_allreduce_reducescatter]: 6.00002e-06 [bias_add_comm_swap]: 4.07998e-06 [label_micro_interleaved_index]: 5.26002e-06 [label_fine_grained_interleaved_index]: 3.61001e-06 [merge_cast_opt]: 1.69998e-06 [slice_recompute_activation]: 2.45002e-06 [micro_interleaved_order_control]: 3.03e-06 [assign_add_opt]: 1.49998e-06 [ForceFp32Comm]: 1.19e-06 [remove_cast_before_assign_add]: 1.15001e-06 [full_micro_interleaved_order_control]: 2.56e-06 [reorder_send_recv_between_fp_bp]: 3.01999e-06 [comm_op_add_attrs]: 1.08001e-06 [add_comm_op_reuse_tag]: 1.07998e-06 [interleave_split_concat_branches]: 1.60001e-06 [interleave_parallel_branches]: 1.14e-06 [overlap_opt_shard_in_pipeline]: 1.67001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.94e-06 [control_data_broadcast_order]: 1.586e-05 [grouped_pairwise_exchange_alltoall]: 1.64998e-06 [offloading_packed_experts]: 5.37999e-06 [overlap_recompute_and_grad_model_parallel]: 5.64e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.55001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.36998e-06 [overlap_recompute_comm]: 2.44001e-06 [overlap_grad_ring_attention]: 5.66e-06 [overlap_grad_flash_sp]: 2.653e-05 [begin_end_overlap_inline]: 6.00005e-07 [split_matmul_comm_elemetwise]: 3.27002e-06 [split_layernorm_comm]: 1.86e-06 [handle_group_info]: 1.17999e-06 [symbol_engine_optimizer]: 0.00011522, [1] [Cycle 1]: 0.00011009, [6] [build]: 4.00998e-06 [elim_shapecalc]: 1.498e-05 [elim_not_effective]: 1.693e-05 [opt_reshape]: 8.96998e-06 [fold_const_symbol]: 3.151e-05 [renormalize]: 2.00002e-07 [detach_backward]: 2.41e-06 [pipeline_parallel_scheduler]: 1.65001e-06 [auto_monad_reorder]: 2.244e-05 [get_jit_bprop_graph]: 2.05002e-06 [rewriter_after_jit_bprop_graph]: 4.80999e-06 [opt_after_jit_grad]: 0.00056565 [validate]: 5.078e-05 Sums bootstrap : 0.000378s : 1.54% type_inference : 0.017934s : 73.21% event_method : 0.000020s : 0.08% auto_monad : 0.000072s : 0.29% graph_reusing : 0.000007s : 0.03% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.11% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000042s : 0.17% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.13% optimize.rewriter_before_opt_a : 0.000097s : 0.40% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000057s : 0.23% optimize.opt_a.loop_unroll : 0.000039s : 0.16% optimize.opt_a.a_1 : 0.000967s : 3.95% optimize.opt_a.with_stream_mark : 0.000042s : 0.17% optimize.opt_a.recompute_prepare : 0.000020s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000199s : 0.81% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.07% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000015s : 0.06% optimize.opt_a.merge_send_recv : 0.000020s : 0.08% optimize.opt_a.auto_parallel : 0.000017s : 0.07% optimize.opt_a.parallel : 0.000030s : 0.12% optimize.opt_a.flash_sp : 0.000015s : 0.06% optimize.opt_a.merge_comm : 0.000010s : 0.04% optimize.opt_a.allreduce_fusion : 0.000015s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.08% optimize.opt_a.virtual_dataset : 0.000017s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.06% optimize.opt_a.virtual_output : 0.000016s : 0.07% optimize.opt_a.merge_forward : 0.000010s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000023s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.15% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000030s : 0.12% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.04% optimize.opt_a.meta_fg_expand : 0.000007s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000032s : 0.13% optimize.opt_a.a_after_grad : 0.000027s : 0.11% optimize.opt_a.renormalize : 0.001066s : 4.35% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.04% optimize.opt_a.auto_monad_grad : 0.000005s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000036s : 0.15% optimize.opt_a.cse : 0.000066s : 0.27% optimize.opt_a.a_3 : 0.000114s : 0.46% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000050s : 0.21% optimize.convert_after_rewriter : 0.000009s : 0.04% optimize.order_py_execute_after_rewriter : 0.000006s : 0.03% optimize.mutable_eliminate : 0.000782s : 3.19% optimize.opt_b.b_1 : 0.000175s : 0.71% optimize.opt_b.b_2 : 0.000012s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000031s : 0.13% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.08% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000035s : 0.14% optimize.loop_unroll : 0.000512s : 2.09% optimize.opt_after_cconv.c_1 : 0.000044s : 0.18% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000026s : 0.10% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.07% optimize.tuple_transform.d_1 : 0.000058s : 0.24% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000066s : 0.27% optimize.cse_after_recomputation.cse : 0.000018s : 0.07% optimize.environ_conv : 0.000006s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000004s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000004s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000006s : 0.02% optimize.overlap_grad_flash_sp : 0.000027s : 0.11% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000032s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.09% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000566s : 2.31% validate : 0.000051s : 0.21% Time group info: ------[substitution.] 0.000241 38 12.16% : 0.000029s : 3: substitution.cast_eliminate 0.99% : 0.000002s : 3: substitution.elim_not_effective 1.22% : 0.000003s : 3: substitution.fold_const_symbol 3.57% : 0.000009s : 5: substitution.graph_param_transform 67.17% : 0.000162s : 4: substitution.inline 2.44% : 0.000006s : 6: substitution.j_node_and_user_rematch 3.07% : 0.000007s : 6: substitution.remove_not_recompute_node 3.13% : 0.000008s : 4: substitution.replace_old_param 6.24% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.017865 2 95.51% : 0.017063s : 1: type_inference.infer 4.49% : 0.000802s : 1: type_inference.specialize ------[replace.] 0.000067 8 58.33% : 0.000039s : 4: replace.inline 41.67% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000172 8 92.44% : 0.000159s : 4: match.inline 7.56% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000268 1596 0.91% : 0.000002s : 17: predicate.accumulaten_eliminater 0.70% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.57% : 0.000002s : 10: predicate.addn_check_dump 0.91% : 0.000002s : 17: predicate.addn_zero_filter 0.83% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.88% : 0.000005s : 27: predicate.arithmetic_simplify 0.99% : 0.000003s : 17: predicate.cast_eliminate 0.57% : 0.000002s : 10: predicate.check_bprop_eliminate 0.53% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.55% : 0.000001s : 10: predicate.depend_value_elim 0.96% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.12% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.89% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.37% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.06% : 0.000003s : 22: predicate.environ_get_depend_swap 1.74% : 0.000005s : 32: predicate.environ_get_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.37% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.33% : 0.000006s : 25: predicate.float_depend_g_call 0.65% : 0.000002s : 10: predicate.float_environ_get_switch 0.78% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.68% : 0.000002s : 10: predicate.get_grad_eliminate 0.25% : 0.000001s : 5: predicate.graph_param_transform 0.66% : 0.000002s : 10: predicate.incorporate_call 0.49% : 0.000001s : 10: predicate.incorporate_call_switch 6.34% : 0.000017s : 72: predicate.inline 0.74% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 10: predicate.less_batch_normalization 1.80% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.57% : 0.000007s : 48: predicate.load_eliminater 1.13% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.03% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.75% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.63% : 0.000002s : 10: predicate.merge_addn 0.54% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.62% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 17: predicate.minmaximum_grad 0.80% : 0.000002s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.75% : 0.000005s : 25: predicate.partial_defer_inline 1.61% : 0.000004s : 26: predicate.partial_eliminate 0.99% : 0.000003s : 17: predicate.print_const_string_wrapper 0.56% : 0.000002s : 10: predicate.reduce_all_const_elim 1.24% : 0.000003s : 17: predicate.reduce_eliminate 2.57% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 10: predicate.remove_not_recompute_node 1.27% : 0.000003s : 31: predicate.replace_applicator 0.58% : 0.000002s : 10: predicate.replace_old_param 0.41% : 0.000001s : 5: predicate.reset_defer_inline 1.07% : 0.000003s : 17: predicate.reshape_eliminate 0.63% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 5: predicate.row_tensor_eliminate 0.87% : 0.000002s : 10: predicate.same_eliminate 0.54% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.86% : 0.000002s : 10: predicate.shard_identity_eliminate 0.89% : 0.000002s : 10: predicate.special_op_eliminate 0.69% : 0.000002s : 10: predicate.specialize_transform 1.07% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.91% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.48% : 0.000004s : 25: predicate.switch_defer_inline 1.98% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.81% : 0.000013s : 76: predicate.switch_simplify 0.99% : 0.000003s : 17: predicate.tile_eliminate 0.99% : 0.000003s : 17: predicate.transpose_eliminate 1.70% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.32% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.52% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.17% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.83% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.44% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.15% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 5: predicate.value_based_eliminate 0.73% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000595 11 46.70% : 0.000278s : 5: func_graph_cloner_run.FuncGraphClonerGraph 53.30% : 0.000317s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.042513 192 0.01% : 0.000004s : 1: ForceFp32Comm 9.22% : 0.003922s : 1: add_attr 9.18% : 0.003905s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.17% : 0.000070s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.18% : 0.000077s : 1: auto_monad 0.06% : 0.000028s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000007s : 1: bias_add_comm_swap 0.95% : 0.000404s : 1: bootstrap 0.09% : 0.000039s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.05% : 0.000020s : 1: control_data_broadcast_order 0.03% : 0.000012s : 1: convert_after_rewriter 0.07% : 0.000032s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000010s : 1: environ_conv 0.06% : 0.000027s : 1: event_method 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000007s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 1.23% : 0.000522s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 1.87% : 0.000793s : 1: mutable_eliminate 0.02% : 0.000008s : 1: offloading_packed_experts 0.05% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000021s : 1: opt.transform.mutable_eliminate 3.57% : 0.001518s : 78: opt.transform.opt_a 0.10% : 0.000042s : 1: opt.transform.opt_after_cconv 0.08% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.35% : 0.000150s : 28: opt.transform.opt_b 0.15% : 0.000064s : 2: opt.transform.opt_trans_graph 0.16% : 0.000068s : 4: opt.transform.symbol_engine_opt 8.53% : 0.003625s : 1: opt_a 0.32% : 0.000135s : 1: opt_after_cconv 1.36% : 0.000576s : 1: opt_after_jit_grad 0.67% : 0.000285s : 1: opt_b 14.81% : 0.006295s : 1: optimize 0.06% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.07% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.11% : 0.000046s : 1: pre_auto_parallel 0.08% : 0.000035s : 1: py_interpret_to_execute 0.05% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000005s : 1: remove_cast_before_assign_add 0.05% : 0.000022s : 1: remove_dup_value 1.42% : 0.000605s : 1: renormalize.infer 1.06% : 0.000450s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.13% : 0.000056s : 1: rewriter_after_opt_a 0.24% : 0.000101s : 1: rewriter_before_opt_a 0.01% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.28% : 0.000118s : 1: symbol_engine_optimizer 0.23% : 0.000097s : 1: tuple_transform 42.24% : 0.017959s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:49.831.680 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:49.831.969 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.154668, [21] [bootstrap]: 0.0004383 [type_inference]: 0.142931 [event_method]: 2.365e-05 [auto_monad]: 7.221e-05 [graph_reusing]: 6.49999e-06 [inline]: 3.84002e-06 [add_attr]: 0.00367718, [1] [add_attr_with_inline]: 0.00366404, [1] [Cycle 1]: 9.501e-05, [2] [tag_attr]: 2.295e-05 [meta_addattr_fg_expand]: 6.38003e-06 [parallel-infer-symbol]: 3.38e-06 [pre_auto_parallel]: 4.153e-05 [insert-virtual-dataset]: 3.10002e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 2.09e-06 [pipeline_split]: 1.54e-06 [optimize]: 0.00611129, [53] [py_interpret_to_execute]: 3.537e-05 [rewriter_before_opt_a]: 9.767e-05 [opt_a]: 0.0034981, [2] [Cycle 1]: 0.00247285, [45] [expand_dump_flag]: 3.25e-06 [switch_simplify]: 4.332e-05 [loop_unroll]: 3.139e-05 [a_1]: 0.00066651 [with_stream_mark]: 2.443e-05 [recompute_prepare]: 1.107e-05 [updatestate_depend_eliminate]: 5.25999e-06 [updatestate_assign_eliminate]: 3.36999e-06 [updatestate_loads_eliminate]: 3.13e-06 [parameter_eliminate]: 2.32999e-06 [a_2]: 0.00011205 [accelerated_algorithm]: 8.14997e-06 [shard]: 1.80001e-06 [meta_shard_fg_expand]: 2.14e-06 [shard_inline]: 7.05e-06 [merge_send_recv]: 9.84999e-06 [auto_parallel]: 7.77e-06 [parallel]: 2.085e-05 [flash_sp]: 9.80002e-06 [merge_comm]: 4.08999e-06 [allreduce_fusion]: 3.51999e-06 [matmul_add_comm_reduction]: 1.114e-05 [allreduce_slice_to_reducescatter]: 1.00999e-06 [virtual_shard_identity]: 9.39e-06 [virtual_dataset]: 7.38e-06 [get_grad_eliminate_]: 6.83e-06 [virtual_output]: 7.41999e-06 [merge_forward]: 4.47e-06 [cell_reuse_recompute_pass]: 1.17999e-06 [offload_activation]: 1.158e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.877e-05 [merge_recompute_call_nodes]: 2.21e-06 [before_grad]: 1.321e-05 [set_forward_comm_id_for_comm_node_pass]: 4.22e-06 [meta_fg_expand]: 2.86999e-06 [flash_sp_send_recv_attached]: 3.05002e-06 [receive_attached]: 2.38998e-06 [after_resolve]: 1.376e-05 [a_after_grad]: 1.174e-05 [renormalize]: 0.00082477 [add_forward_monad_depend]: 6.96999e-06 [auto_monad_grad]: 2.79999e-06 [auto_monad_eliminator]: 1.732e-05 [cse]: 2.858e-05 [a_3]: 6.516e-05 [Cycle 2]: 0.00100838, [45] [expand_dump_flag]: 1.73002e-06 [switch_simplify]: 8.70999e-06 [loop_unroll]: 6.34001e-06 [a_1]: 0.00013337 [with_stream_mark]: 1.564e-05 [recompute_prepare]: 6.95998e-06 [updatestate_depend_eliminate]: 3.08998e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.16e-06 [parameter_eliminate]: 1.33002e-06 [a_2]: 0.00012479 [accelerated_algorithm]: 7.29001e-06 [shard]: 1.72001e-06 [meta_shard_fg_expand]: 1.81e-06 [shard_inline]: 6.71e-06 [merge_send_recv]: 6.49001e-06 [auto_parallel]: 6.39999e-06 [parallel]: 6.74999e-06 [flash_sp]: 4.31002e-06 [merge_comm]: 3.55e-06 [allreduce_fusion]: 3.3e-06 [matmul_add_comm_reduction]: 7.41999e-06 [allreduce_slice_to_reducescatter]: 4.70027e-07 [virtual_shard_identity]: 7.57998e-06 [virtual_dataset]: 6.31998e-06 [get_grad_eliminate_]: 6.17001e-06 [virtual_output]: 5.96e-06 [merge_forward]: 4e-06 [cell_reuse_recompute_pass]: 1.77999e-06 [offload_activation]: 1.29e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.573e-05 [merge_recompute_call_nodes]: 1.22e-06 [before_grad]: 1.139e-05 [set_forward_comm_id_for_comm_node_pass]: 5.59e-06 [meta_fg_expand]: 2.27001e-06 [flash_sp_send_recv_attached]: 1.34998e-06 [receive_attached]: 1.82001e-06 [after_resolve]: 1.442e-05 [a_after_grad]: 1.101e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 3.68999e-06 [auto_monad_grad]: 2.94001e-06 [auto_monad_eliminator]: 1.316e-05 [cse]: 2.112e-05 [a_3]: 5.237e-05 [py_interpret_to_execute_after_opt_a]: 1.732e-05 [slice_cell_reuse_recomputed_activation]: 5.66e-06 [rewriter_after_opt_a]: 4.848e-05 [convert_after_rewriter]: 1.049e-05 [order_py_execute_after_rewriter]: 8.25999e-06 [mutable_eliminate]: 0.00065382 [opt_b]: 0.00028538, [1] [Cycle 1]: 0.00027527, [7] [b_1]: 0.00017174 [b_2]: 8.91997e-06 [updatestate_depend_eliminate]: 8.55001e-06 [updatestate_assign_eliminate]: 2.96001e-06 [updatestate_loads_eliminate]: 2.76e-06 [renormalize]: 6.69999e-07 [cse]: 2.189e-05 [optimize_parallel_all_gather_comm]: 2.699e-05 [overlap_param_gather]: 4.68999e-06 [cconv]: 3.722e-05 [loop_unroll]: 0.00048529 [opt_after_cconv]: 0.0001378, [1] [Cycle 1]: 0.00012806, [7] [c_1]: 3.264e-05 [parameter_eliminate]: 4.30999e-06 [updatestate_depend_eliminate]: 7.37002e-06 [updatestate_assign_eliminate]: 2.54001e-06 [updatestate_loads_eliminate]: 2.29001e-06 [cse]: 2.124e-05 [renormalize]: 2.69996e-07 [remove_dup_value]: 1.693e-05 [tuple_transform]: 9.86e-05, [1] [Cycle 1]: 9.096e-05, [4] [d_1]: 5.011e-05 [none_parameter_eliminate]: 1.53002e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 7.82e-06 [partial_unused_args_eliminate]: 4.73001e-06 [add_recomputation]: 6.114e-05 [cse_after_recomputation]: 3.072e-05, [1] [Cycle 1]: 2.284e-05, [1] [cse]: 1.306e-05 [environ_conv]: 9.34e-06 [swap_dp_allreduce_reducescatter]: 9.60001e-06 [bias_add_comm_swap]: 5.44e-06 [label_micro_interleaved_index]: 8.18999e-06 [label_fine_grained_interleaved_index]: 5.81e-06 [merge_cast_opt]: 4.22003e-06 [slice_recompute_activation]: 4.88001e-06 [micro_interleaved_order_control]: 5.17e-06 [assign_add_opt]: 3.98001e-06 [ForceFp32Comm]: 3.66001e-06 [remove_cast_before_assign_add]: 3.66001e-06 [full_micro_interleaved_order_control]: 4.62998e-06 [reorder_send_recv_between_fp_bp]: 5.72999e-06 [comm_op_add_attrs]: 4.29997e-06 [add_comm_op_reuse_tag]: 3.83001e-06 [interleave_split_concat_branches]: 3.86999e-06 [interleave_parallel_branches]: 4.23001e-06 [overlap_opt_shard_in_pipeline]: 4.12e-06 [overlap_opt_shard_grad_in_pipeline]: 4.79e-06 [control_data_broadcast_order]: 1.87e-05 [grouped_pairwise_exchange_alltoall]: 4.2e-06 [offloading_packed_experts]: 7.31999e-06 [overlap_recompute_and_grad_model_parallel]: 8.53001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.85e-06 [overlap_recompute_allgather_and_fa_grad]: 3.7e-06 [overlap_recompute_comm]: 5.67001e-06 [overlap_grad_ring_attention]: 6.89001e-06 [overlap_grad_flash_sp]: 3.043e-05 [begin_end_overlap_inline]: 2.96001e-06 [split_matmul_comm_elemetwise]: 4.73001e-06 [split_layernorm_comm]: 4.2e-06 [handle_group_info]: 3.66001e-06 [symbol_engine_optimizer]: 0.00010862, [1] [Cycle 1]: 0.00010012, [6] [build]: 3.95e-06 [elim_shapecalc]: 1.29e-05 [elim_not_effective]: 1.333e-05 [opt_reshape]: 7.83999e-06 [fold_const_symbol]: 1.092e-05 [renormalize]: 2.50002e-07 [detach_backward]: 3.86999e-06 [pipeline_parallel_scheduler]: 1.81e-06 [auto_monad_reorder]: 2.126e-05 [get_jit_bprop_graph]: 1.99999e-06 [rewriter_after_jit_bprop_graph]: 6.26e-06 [opt_after_jit_grad]: 0.00063903 [validate]: 4.482e-05 Sums bootstrap : 0.000438s : 0.29% type_inference : 0.142931s : 95.90% event_method : 0.000024s : 0.02% auto_monad : 0.000072s : 0.05% graph_reusing : 0.000006s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000042s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000035s : 0.02% optimize.rewriter_before_opt_a : 0.000098s : 0.07% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000052s : 0.03% optimize.opt_a.loop_unroll : 0.000038s : 0.03% optimize.opt_a.a_1 : 0.000800s : 0.54% optimize.opt_a.with_stream_mark : 0.000040s : 0.03% optimize.opt_a.recompute_prepare : 0.000018s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000237s : 0.16% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.01% optimize.opt_a.merge_send_recv : 0.000016s : 0.01% optimize.opt_a.auto_parallel : 0.000014s : 0.01% optimize.opt_a.parallel : 0.000028s : 0.02% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.01% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000024s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000044s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000025s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000028s : 0.02% optimize.opt_a.a_after_grad : 0.000023s : 0.02% optimize.opt_a.renormalize : 0.000825s : 0.55% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.01% optimize.opt_a.auto_monad_grad : 0.000006s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.02% optimize.opt_a.cse : 0.000050s : 0.03% optimize.opt_a.a_3 : 0.000118s : 0.08% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.00% optimize.rewriter_after_opt_a : 0.000048s : 0.03% optimize.convert_after_rewriter : 0.000010s : 0.01% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000654s : 0.44% optimize.opt_b.b_1 : 0.000172s : 0.12% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000027s : 0.02% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000037s : 0.02% optimize.loop_unroll : 0.000485s : 0.33% optimize.opt_after_cconv.c_1 : 0.000033s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000021s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.01% optimize.tuple_transform.d_1 : 0.000050s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000061s : 0.04% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.01% optimize.bias_add_comm_swap : 0.000005s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000019s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000006s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000030s : 0.02% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000639s : 0.43% validate : 0.000045s : 0.03% Time group info: ------[substitution.] 0.000214 28 0.89% : 0.000002s : 2: substitution.elim_not_effective 0.62% : 0.000001s : 2: substitution.fold_const_symbol 2.95% : 0.000006s : 4: substitution.graph_param_transform 79.05% : 0.000169s : 4: substitution.inline 2.27% : 0.000005s : 4: substitution.j_node_and_user_rematch 4.28% : 0.000009s : 4: substitution.remove_not_recompute_node 2.86% : 0.000006s : 4: substitution.replace_old_param 7.09% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.142863 2 99.38% : 0.141972s : 1: type_inference.infer 0.62% : 0.000891s : 1: type_inference.specialize ------[replace.] 0.000065 8 64.58% : 0.000042s : 4: replace.inline 35.42% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000180 8 92.58% : 0.000167s : 4: match.inline 7.42% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000216 1278 0.98% : 0.000002s : 13: predicate.accumulaten_eliminater 0.97% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.68% : 0.000001s : 8: predicate.addn_check_dump 0.92% : 0.000002s : 13: predicate.addn_zero_filter 0.82% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.27% : 0.000005s : 21: predicate.arithmetic_simplify 0.85% : 0.000002s : 13: predicate.cast_eliminate 0.56% : 0.000001s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.74% : 0.000002s : 8: predicate.depend_value_elim 0.87% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.00% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.15% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 4: predicate.elim_not_effective 0.38% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.06% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_depend_swap 1.67% : 0.000004s : 25: predicate.environ_get_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.36% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.48% : 0.000005s : 21: predicate.float_depend_g_call 0.56% : 0.000001s : 8: predicate.float_environ_get_switch 0.72% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.69% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.59% : 0.000001s : 8: predicate.incorporate_call 0.49% : 0.000001s : 8: predicate.incorporate_call_switch 6.54% : 0.000014s : 58: predicate.inline 1.12% : 0.000002s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.95% : 0.000002s : 8: predicate.less_batch_normalization 1.76% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.55% : 0.000006s : 38: predicate.load_eliminater 1.22% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.34% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.77% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 8: predicate.merge_addn 0.56% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 13: predicate.minmaximum_grad 1.24% : 0.000003s : 4: predicate.mutable_eliminate 0.30% : 0.000001s : 4: predicate.opt_reshape 0.42% : 0.000001s : 4: predicate.parallel_virtual_node 1.89% : 0.000004s : 21: predicate.partial_defer_inline 1.57% : 0.000003s : 21: predicate.partial_eliminate 0.85% : 0.000002s : 13: predicate.print_const_string_wrapper 0.61% : 0.000001s : 8: predicate.reduce_all_const_elim 1.10% : 0.000002s : 13: predicate.reduce_eliminate 2.36% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.54% : 0.000001s : 8: predicate.remove_not_recompute_node 1.44% : 0.000003s : 25: predicate.replace_applicator 0.59% : 0.000001s : 8: predicate.replace_old_param 0.48% : 0.000001s : 4: predicate.reset_defer_inline 0.87% : 0.000002s : 13: predicate.reshape_eliminate 0.74% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 4: predicate.row_tensor_eliminate 0.71% : 0.000002s : 8: predicate.same_eliminate 0.56% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.71% : 0.000002s : 8: predicate.shard_identity_eliminate 0.72% : 0.000002s : 8: predicate.special_op_eliminate 0.68% : 0.000001s : 8: predicate.specialize_transform 0.99% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.44% : 0.000003s : 21: predicate.switch_defer_inline 2.01% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.02% : 0.000011s : 67: predicate.switch_simplify 0.86% : 0.000002s : 13: predicate.tile_eliminate 0.84% : 0.000002s : 13: predicate.transpose_eliminate 1.44% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.48% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.35% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.40% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.67% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.31% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.21% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 4: predicate.value_based_eliminate 0.64% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 8: predicate.virtual_output_eliminate 0.36% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000671 11 51.35% : 0.000345s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.65% : 0.000326s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.166712 192 0.00% : 0.000006s : 1: ForceFp32Comm 2.21% : 0.003688s : 1: add_attr 2.20% : 0.003668s : 1: add_attr_with_inline 0.00% : 0.000008s : 1: add_comm_op_reuse_tag 0.04% : 0.000066s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.05% : 0.000081s : 1: auto_monad 0.02% : 0.000029s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.29% : 0.000482s : 1: bootstrap 0.02% : 0.000041s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000023s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.02% : 0.000034s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000022s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.02% : 0.000035s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.01% : 0.000010s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 0.29% : 0.000492s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.40% : 0.000661s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000018s : 1: opt.transform.mutable_eliminate 0.77% : 0.001286s : 78: opt.transform.opt_a 0.02% : 0.000031s : 1: opt.transform.opt_after_cconv 0.02% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000107s : 28: opt.transform.opt_b 0.03% : 0.000056s : 2: opt.transform.opt_trans_graph 0.02% : 0.000041s : 4: opt.transform.symbol_engine_opt 2.10% : 0.003501s : 1: opt_a 0.08% : 0.000141s : 1: opt_after_cconv 0.39% : 0.000651s : 1: opt_after_jit_grad 0.17% : 0.000289s : 1: opt_b 3.88% : 0.006474s : 1: optimize 0.02% : 0.000030s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.02% : 0.000034s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000009s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.03% : 0.000050s : 1: pre_auto_parallel 0.02% : 0.000039s : 1: py_interpret_to_execute 0.01% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 0.26% : 0.000440s : 1: renormalize.infer 0.22% : 0.000374s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000053s : 1: rewriter_after_opt_a 0.06% : 0.000101s : 1: rewriter_before_opt_a 0.01% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000009s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000112s : 1: symbol_engine_optimizer 0.06% : 0.000101s : 1: tuple_transform 85.77% : 0.142982s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:51.746.861 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0168413, [21] [bootstrap]: 0.00044517 [type_inference]: 0.00591151 [event_method]: 1.91e-05 [auto_monad]: 6.642e-05 [graph_reusing]: 5.74e-06 [inline]: 2.49001e-06 [add_attr]: 0.00372612, [1] [add_attr_with_inline]: 0.00371344, [1] [Cycle 1]: 7.854e-05, [2] [tag_attr]: 2.418e-05 [meta_addattr_fg_expand]: 6.14999e-06 [parallel-infer-symbol]: 3.78001e-06 [pre_auto_parallel]: 4.008e-05 [insert-virtual-dataset]: 2.99999e-06 [parallel-infer-symbol-second]: 9.10019e-07 [dataset_repeat_opt]: 2.43e-06 [pipeline_split]: 1.71e-06 [optimize]: 0.00585115, [53] [py_interpret_to_execute]: 3.011e-05 [rewriter_before_opt_a]: 9.348e-05 [opt_a]: 0.0032501, [2] [Cycle 1]: 0.00247846, [45] [expand_dump_flag]: 3.55e-06 [switch_simplify]: 4.613e-05 [loop_unroll]: 3.067e-05 [a_1]: 0.00072584 [with_stream_mark]: 2.167e-05 [recompute_prepare]: 1.148e-05 [updatestate_depend_eliminate]: 4.30999e-06 [updatestate_assign_eliminate]: 3.45003e-06 [updatestate_loads_eliminate]: 3.01999e-06 [parameter_eliminate]: 1.95001e-06 [a_2]: 8.311e-05 [accelerated_algorithm]: 8e-06 [shard]: 2.01e-06 [meta_shard_fg_expand]: 2.46e-06 [shard_inline]: 6.88e-06 [merge_send_recv]: 8.72e-06 [auto_parallel]: 8.51002e-06 [parallel]: 2.141e-05 [flash_sp]: 1.011e-05 [merge_comm]: 4.23999e-06 [allreduce_fusion]: 3.72998e-06 [matmul_add_comm_reduction]: 9.87999e-06 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 1.055e-05 [virtual_dataset]: 7.61999e-06 [get_grad_eliminate_]: 7.13e-06 [virtual_output]: 6.83e-06 [merge_forward]: 5.09e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 1.06e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.494e-05 [merge_recompute_call_nodes]: 1.46998e-06 [before_grad]: 1.3e-05 [set_forward_comm_id_for_comm_node_pass]: 3.83999e-06 [meta_fg_expand]: 3.16999e-06 [flash_sp_send_recv_attached]: 3.28e-06 [receive_attached]: 2.49999e-06 [after_resolve]: 1.615e-05 [a_after_grad]: 1.218e-05 [renormalize]: 0.00093669 [add_forward_monad_depend]: 6.97002e-06 [auto_monad_grad]: 2.47001e-06 [auto_monad_eliminator]: 1.804e-05 [cse]: 3.329e-05 [a_3]: 5.321e-05 [Cycle 2]: 0.00075855, [45] [expand_dump_flag]: 2.44999e-06 [switch_simplify]: 8.82e-06 [loop_unroll]: 6.28e-06 [a_1]: 0.00016957 [with_stream_mark]: 1.569e-05 [recompute_prepare]: 7.7e-06 [updatestate_depend_eliminate]: 3.25e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.73e-06 [parameter_eliminate]: 1.42e-06 [a_2]: 7.417e-05 [accelerated_algorithm]: 6.52001e-06 [shard]: 1.69e-06 [meta_shard_fg_expand]: 1.67001e-06 [shard_inline]: 6.46999e-06 [merge_send_recv]: 6.71e-06 [auto_parallel]: 6.75002e-06 [parallel]: 7.23e-06 [flash_sp]: 3.61999e-06 [merge_comm]: 3.50003e-06 [allreduce_fusion]: 3.43e-06 [matmul_add_comm_reduction]: 7.93001e-06 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 7.96001e-06 [virtual_dataset]: 6.54999e-06 [get_grad_eliminate_]: 6.83e-06 [virtual_output]: 5.62001e-06 [merge_forward]: 3.80998e-06 [cell_reuse_recompute_pass]: 2.39999e-06 [offload_activation]: 1.031e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.456e-05 [merge_recompute_call_nodes]: 1.07e-06 [before_grad]: 1.027e-05 [set_forward_comm_id_for_comm_node_pass]: 3.77002e-06 [meta_fg_expand]: 2.16e-06 [flash_sp_send_recv_attached]: 1.25999e-06 [receive_attached]: 1.77001e-06 [after_resolve]: 1.249e-05 [a_after_grad]: 1.065e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.19001e-06 [auto_monad_grad]: 1.62999e-06 [auto_monad_eliminator]: 8.59e-06 [cse]: 1.664e-05 [a_3]: 3.621e-05 [py_interpret_to_execute_after_opt_a]: 1.215e-05 [slice_cell_reuse_recomputed_activation]: 2.39001e-06 [rewriter_after_opt_a]: 3.885e-05 [convert_after_rewriter]: 6.68998e-06 [order_py_execute_after_rewriter]: 4.94003e-06 [mutable_eliminate]: 0.00078273 [opt_b]: 0.00021708, [1] [Cycle 1]: 0.00020915, [7] [b_1]: 0.0001274 [b_2]: 9.37999e-06 [updatestate_depend_eliminate]: 8.32e-06 [updatestate_assign_eliminate]: 2.83e-06 [updatestate_loads_eliminate]: 2.38002e-06 [renormalize]: 7.7e-07 [cse]: 2.238e-05 [optimize_parallel_all_gather_comm]: 1.734e-05 [overlap_param_gather]: 2.04e-06 [cconv]: 3.181e-05 [loop_unroll]: 0.00063527 [opt_after_cconv]: 0.000119, [1] [Cycle 1]: 0.00011252, [7] [c_1]: 3.584e-05 [parameter_eliminate]: 4.95999e-06 [updatestate_depend_eliminate]: 6.84999e-06 [updatestate_assign_eliminate]: 2.59999e-06 [updatestate_loads_eliminate]: 2.43e-06 [cse]: 2.157e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 1.632e-05 [tuple_transform]: 8.531e-05, [1] [Cycle 1]: 8.009e-05, [4] [d_1]: 4.995e-05 [none_parameter_eliminate]: 1.78002e-06 [renormalize]: 2.80008e-07 [switch_simplify]: 7.98999e-06 [partial_unused_args_eliminate]: 2.51998e-06 [add_recomputation]: 5.408e-05 [cse_after_recomputation]: 2.348e-05, [1] [Cycle 1]: 1.81e-05, [1] [cse]: 1.214e-05 [environ_conv]: 6.02999e-06 [swap_dp_allreduce_reducescatter]: 5.31998e-06 [bias_add_comm_swap]: 2.84999e-06 [label_micro_interleaved_index]: 4.84e-06 [label_fine_grained_interleaved_index]: 2.86e-06 [merge_cast_opt]: 1.25999e-06 [slice_recompute_activation]: 2.93e-06 [micro_interleaved_order_control]: 2.19001e-06 [assign_add_opt]: 1.61998e-06 [ForceFp32Comm]: 1.17e-06 [remove_cast_before_assign_add]: 1.39e-06 [full_micro_interleaved_order_control]: 2.40002e-06 [reorder_send_recv_between_fp_bp]: 2.79999e-06 [comm_op_add_attrs]: 9.99979e-07 [add_comm_op_reuse_tag]: 1.07998e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.40001e-06 [overlap_opt_shard_in_pipeline]: 1.23002e-06 [overlap_opt_shard_grad_in_pipeline]: 1.84998e-06 [control_data_broadcast_order]: 1.401e-05 [grouped_pairwise_exchange_alltoall]: 1.57001e-06 [offloading_packed_experts]: 3.95e-06 [overlap_recompute_and_grad_model_parallel]: 5.04e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.19999e-06 [overlap_grad_ring_attention]: 4.26001e-06 [overlap_grad_flash_sp]: 2.294e-05 [begin_end_overlap_inline]: 6.59988e-07 [split_matmul_comm_elemetwise]: 2.75997e-06 [split_layernorm_comm]: 1.72001e-06 [handle_group_info]: 1.05999e-06 [symbol_engine_optimizer]: 8.076e-05, [1] [Cycle 1]: 7.609e-05, [6] [build]: 3.28998e-06 [elim_shapecalc]: 1.082e-05 [elim_not_effective]: 1.387e-05 [opt_reshape]: 7.8e-06 [fold_const_symbol]: 1.085e-05 [renormalize]: 2.89991e-07 [detach_backward]: 2.19999e-06 [pipeline_parallel_scheduler]: 1.67001e-06 [auto_monad_reorder]: 1.753e-05 [get_jit_bprop_graph]: 3.439e-05 [rewriter_after_jit_bprop_graph]: 5.87999e-06 [opt_after_jit_grad]: 0.00049484 [validate]: 4.355e-05 Sums bootstrap : 0.000445s : 3.69% type_inference : 0.005912s : 49.06% event_method : 0.000019s : 0.16% auto_monad : 0.000066s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000040s : 0.33% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.25% optimize.rewriter_before_opt_a : 0.000093s : 0.78% optimize.opt_a.expand_dump_flag : 0.000006s : 0.05% optimize.opt_a.switch_simplify : 0.000055s : 0.46% optimize.opt_a.loop_unroll : 0.000037s : 0.31% optimize.opt_a.a_1 : 0.000895s : 7.43% optimize.opt_a.with_stream_mark : 0.000037s : 0.31% optimize.opt_a.recompute_prepare : 0.000019s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000157s : 1.31% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.12% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.11% optimize.opt_a.merge_send_recv : 0.000015s : 0.13% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000029s : 0.24% optimize.opt_a.flash_sp : 0.000014s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.15% optimize.opt_a.virtual_dataset : 0.000014s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.10% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.24% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000023s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000029s : 0.24% optimize.opt_a.a_after_grad : 0.000023s : 0.19% optimize.opt_a.renormalize : 0.000937s : 7.77% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.22% optimize.opt_a.cse : 0.000050s : 0.41% optimize.opt_a.a_3 : 0.000089s : 0.74% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000039s : 0.32% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.04% optimize.mutable_eliminate : 0.000783s : 6.50% optimize.opt_b.b_1 : 0.000127s : 1.06% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000022s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.14% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000032s : 0.26% optimize.loop_unroll : 0.000635s : 5.27% optimize.opt_after_cconv.c_1 : 0.000036s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000022s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.14% optimize.tuple_transform.d_1 : 0.000050s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000003s : 0.02% optimize.add_recomputation : 0.000054s : 0.45% optimize.cse_after_recomputation.cse : 0.000012s : 0.10% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.04% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000003s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000023s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.15% get_jit_bprop_graph : 0.000034s : 0.29% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000495s : 4.11% validate : 0.000044s : 0.36% Time group info: ------[substitution.] 0.000224 28 0.96% : 0.000002s : 2: substitution.elim_not_effective 0.66% : 0.000001s : 2: substitution.fold_const_symbol 3.07% : 0.000007s : 4: substitution.graph_param_transform 80.02% : 0.000179s : 4: substitution.inline 2.22% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.60% : 0.000006s : 4: substitution.remove_not_recompute_node 2.93% : 0.000007s : 4: substitution.replace_old_param 7.53% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005844 2 87.41% : 0.005108s : 1: type_inference.infer 12.59% : 0.000736s : 1: type_inference.specialize ------[replace.] 0.000111 8 77.67% : 0.000086s : 4: replace.inline 22.33% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000190 8 92.38% : 0.000176s : 4: match.inline 7.62% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000217 1278 0.92% : 0.000002s : 13: predicate.accumulaten_eliminater 0.76% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.45% : 0.000001s : 8: predicate.addn_check_dump 0.94% : 0.000002s : 13: predicate.addn_zero_filter 0.83% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.10% : 0.000005s : 21: predicate.arithmetic_simplify 0.95% : 0.000002s : 13: predicate.cast_eliminate 0.59% : 0.000001s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.65% : 0.000001s : 8: predicate.depend_value_elim 0.95% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.21% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.98% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 4: predicate.elim_not_effective 0.41% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.03% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.25% : 0.000003s : 17: predicate.environ_get_depend_swap 1.67% : 0.000004s : 25: predicate.environ_get_eliminate 0.98% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.44% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.61% : 0.000006s : 21: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.73% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.71% : 0.000002s : 8: predicate.get_grad_eliminate 0.22% : 0.000000s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.31% : 0.000014s : 58: predicate.inline 0.71% : 0.000002s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.91% : 0.000002s : 8: predicate.less_batch_normalization 1.78% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.32% : 0.000005s : 38: predicate.load_eliminater 1.13% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.28% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.68% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 8: predicate.merge_addn 0.59% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 13: predicate.minmaximum_grad 1.37% : 0.000003s : 4: predicate.mutable_eliminate 0.39% : 0.000001s : 4: predicate.opt_reshape 0.39% : 0.000001s : 4: predicate.parallel_virtual_node 1.78% : 0.000004s : 21: predicate.partial_defer_inline 1.60% : 0.000003s : 21: predicate.partial_eliminate 0.77% : 0.000002s : 13: predicate.print_const_string_wrapper 0.56% : 0.000001s : 8: predicate.reduce_all_const_elim 1.12% : 0.000002s : 13: predicate.reduce_eliminate 2.54% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 8: predicate.remove_not_recompute_node 1.43% : 0.000003s : 25: predicate.replace_applicator 0.67% : 0.000001s : 8: predicate.replace_old_param 0.37% : 0.000001s : 4: predicate.reset_defer_inline 1.21% : 0.000003s : 13: predicate.reshape_eliminate 0.74% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.53% : 0.000001s : 4: predicate.row_tensor_eliminate 0.93% : 0.000002s : 8: predicate.same_eliminate 0.39% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.86% : 0.000002s : 8: predicate.shard_identity_eliminate 0.75% : 0.000002s : 8: predicate.special_op_eliminate 0.66% : 0.000001s : 8: predicate.specialize_transform 0.88% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.46% : 0.000003s : 21: predicate.switch_defer_inline 1.94% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.04% : 0.000011s : 67: predicate.switch_simplify 0.89% : 0.000002s : 13: predicate.tile_eliminate 0.99% : 0.000002s : 13: predicate.transpose_eliminate 1.67% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.43% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.38% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.41% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.37% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.73% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.37% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.04% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.80% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000589 11 51.35% : 0.000303s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.65% : 0.000287s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028852 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.94% : 0.003733s : 1: add_attr 12.89% : 0.003718s : 1: add_attr_with_inline 0.02% : 0.000005s : 1: add_comm_op_reuse_tag 0.20% : 0.000058s : 1: add_recomputation 0.02% : 0.000005s : 1: assign_add_opt 0.25% : 0.000072s : 1: auto_monad 0.07% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.63% : 0.000470s : 1: bootstrap 0.12% : 0.000036s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000017s : 1: control_data_broadcast_order 0.03% : 0.000010s : 1: convert_after_rewriter 0.09% : 0.000027s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.09% : 0.000026s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.15% : 0.000042s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 2.24% : 0.000646s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.75% : 0.000793s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000020s : 1: opt.transform.mutable_eliminate 4.66% : 0.001344s : 78: opt.transform.opt_a 0.12% : 0.000035s : 1: opt.transform.opt_after_cconv 0.09% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.36% : 0.000103s : 28: opt.transform.opt_b 0.19% : 0.000056s : 2: opt.transform.opt_trans_graph 0.14% : 0.000040s : 4: opt.transform.symbol_engine_opt 11.28% : 0.003253s : 1: opt_a 0.43% : 0.000123s : 1: opt_after_cconv 1.75% : 0.000504s : 1: opt_after_jit_grad 0.76% : 0.000221s : 1: opt_b 20.30% : 0.005856s : 1: optimize 0.07% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000006s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000045s : 1: pre_auto_parallel 0.12% : 0.000035s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.81% : 0.000522s : 1: renormalize.infer 1.40% : 0.000404s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000043s : 1: rewriter_after_opt_a 0.34% : 0.000099s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.29% : 0.000084s : 1: symbol_engine_optimizer 0.31% : 0.000089s : 1: tuple_transform 20.56% : 0.005931s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:53.562.136 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:53.562.400 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.156837, [21] [bootstrap]: 0.00046311 [type_inference]: 0.145094 [event_method]: 1.953e-05 [auto_monad]: 6.879e-05 [graph_reusing]: 6.43e-06 [inline]: 2.84999e-06 [add_attr]: 0.00334955, [1] [add_attr_with_inline]: 0.00334037, [1] [Cycle 1]: 8.449e-05, [2] [tag_attr]: 2.139e-05 [meta_addattr_fg_expand]: 7.45e-06 [parallel-infer-symbol]: 4.06001e-06 [pre_auto_parallel]: 3.53e-05 [insert-virtual-dataset]: 2.61e-06 [parallel-infer-symbol-second]: 9.50007e-07 [dataset_repeat_opt]: 2.59001e-06 [pipeline_split]: 1.94e-06 [optimize]: 0.00646321, [53] [py_interpret_to_execute]: 2.809e-05 [rewriter_before_opt_a]: 8.853e-05 [opt_a]: 0.00370101, [2] [Cycle 1]: 0.00266354, [45] [expand_dump_flag]: 3.04999e-06 [switch_simplify]: 4.346e-05 [loop_unroll]: 3.257e-05 [a_1]: 0.0007936 [with_stream_mark]: 2.024e-05 [recompute_prepare]: 1.219e-05 [updatestate_depend_eliminate]: 4.99e-06 [updatestate_assign_eliminate]: 3.9e-06 [updatestate_loads_eliminate]: 3.84002e-06 [parameter_eliminate]: 2.66e-06 [a_2]: 0.00013013 [accelerated_algorithm]: 8.17003e-06 [shard]: 1.97999e-06 [meta_shard_fg_expand]: 2.12999e-06 [shard_inline]: 8.37998e-06 [merge_send_recv]: 1.003e-05 [auto_parallel]: 9.30001e-06 [parallel]: 1.914e-05 [flash_sp]: 1.036e-05 [merge_comm]: 4.74e-06 [allreduce_fusion]: 4.63999e-06 [matmul_add_comm_reduction]: 1.095e-05 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 1.2e-05 [virtual_dataset]: 8.34998e-06 [get_grad_eliminate_]: 7.91001e-06 [virtual_output]: 8.14002e-06 [merge_forward]: 4.42998e-06 [cell_reuse_recompute_pass]: 1.77999e-06 [offload_activation]: 1.17e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.883e-05 [merge_recompute_call_nodes]: 1.94e-06 [before_grad]: 1.367e-05 [set_forward_comm_id_for_comm_node_pass]: 4.68001e-06 [meta_fg_expand]: 3.97e-06 [flash_sp_send_recv_attached]: 3.37002e-06 [receive_attached]: 2.71e-06 [after_resolve]: 1.354e-05 [a_after_grad]: 1.273e-05 [renormalize]: 0.00083294 [add_forward_monad_depend]: 7.04001e-06 [auto_monad_grad]: 2.96001e-06 [auto_monad_eliminator]: 2.079e-05 [cse]: 3.801e-05 [a_3]: 7.612e-05 [Cycle 2]: 0.00102105, [45] [expand_dump_flag]: 2.30002e-06 [switch_simplify]: 9.77001e-06 [loop_unroll]: 7.53e-06 [a_1]: 0.00018317 [with_stream_mark]: 1.715e-05 [recompute_prepare]: 8.25999e-06 [updatestate_depend_eliminate]: 4.33999e-06 [updatestate_assign_eliminate]: 3.43999e-06 [updatestate_loads_eliminate]: 3.11999e-06 [parameter_eliminate]: 1.79e-06 [a_2]: 0.00012186 [accelerated_algorithm]: 8.28999e-06 [shard]: 2.07999e-06 [meta_shard_fg_expand]: 1.93002e-06 [shard_inline]: 9.26998e-06 [merge_send_recv]: 7.62998e-06 [auto_parallel]: 8.56002e-06 [parallel]: 8.05e-06 [flash_sp]: 4.33999e-06 [merge_comm]: 4.23999e-06 [allreduce_fusion]: 4.3e-06 [matmul_add_comm_reduction]: 9.09e-06 [allreduce_slice_to_reducescatter]: 3.99974e-07 [virtual_shard_identity]: 8.82e-06 [virtual_dataset]: 7.94002e-06 [get_grad_eliminate_]: 8.45999e-06 [virtual_output]: 7.35e-06 [merge_forward]: 5.40999e-06 [cell_reuse_recompute_pass]: 2.49001e-06 [offload_activation]: 1.002e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.879e-05 [merge_recompute_call_nodes]: 1.76003e-06 [before_grad]: 1.336e-05 [set_forward_comm_id_for_comm_node_pass]: 5.40999e-06 [meta_fg_expand]: 3.37997e-06 [flash_sp_send_recv_attached]: 1.44e-06 [receive_attached]: 2.00002e-06 [after_resolve]: 1.324e-05 [a_after_grad]: 1.242e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.84001e-06 [auto_monad_grad]: 1.76998e-06 [auto_monad_eliminator]: 1.207e-05 [cse]: 2.377e-05 [a_3]: 6.03e-05 [py_interpret_to_execute_after_opt_a]: 1.687e-05 [slice_cell_reuse_recomputed_activation]: 4.68999e-06 [rewriter_after_opt_a]: 4.994e-05 [convert_after_rewriter]: 1.168e-05 [order_py_execute_after_rewriter]: 9.15999e-06 [mutable_eliminate]: 0.0005745 [opt_b]: 0.0004405, [1] [Cycle 1]: 0.00043001, [7] [b_1]: 0.00031256 [b_2]: 1.11e-05 [updatestate_depend_eliminate]: 7.11001e-06 [updatestate_assign_eliminate]: 3.14001e-06 [updatestate_loads_eliminate]: 3.03e-06 [renormalize]: 4.30009e-07 [cse]: 3.016e-05 [optimize_parallel_all_gather_comm]: 2.274e-05 [overlap_param_gather]: 5.14998e-06 [cconv]: 3.13e-05 [loop_unroll]: 0.00050763 [opt_after_cconv]: 0.00015027, [1] [Cycle 1]: 0.00014059, [7] [c_1]: 4.009e-05 [parameter_eliminate]: 3.09001e-06 [updatestate_depend_eliminate]: 6.88e-06 [updatestate_assign_eliminate]: 3.30003e-06 [updatestate_loads_eliminate]: 3.65e-06 [cse]: 2.589e-05 [renormalize]: 3.9002e-07 [remove_dup_value]: 4.485e-05 [tuple_transform]: 0.00011294, [1] [Cycle 1]: 0.00010355, [4] [d_1]: 5.735e-05 [none_parameter_eliminate]: 2.39999e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 9.15999e-06 [partial_unused_args_eliminate]: 4.67998e-06 [add_recomputation]: 6.429e-05 [cse_after_recomputation]: 3.62e-05, [1] [Cycle 1]: 2.899e-05, [1] [cse]: 1.963e-05 [environ_conv]: 1.057e-05 [swap_dp_allreduce_reducescatter]: 9.10001e-06 [bias_add_comm_swap]: 5.47999e-06 [label_micro_interleaved_index]: 7.32002e-06 [label_fine_grained_interleaved_index]: 5.74999e-06 [merge_cast_opt]: 4e-06 [slice_recompute_activation]: 4.47e-06 [micro_interleaved_order_control]: 5.22e-06 [assign_add_opt]: 4.79e-06 [ForceFp32Comm]: 3.68999e-06 [remove_cast_before_assign_add]: 3.36999e-06 [full_micro_interleaved_order_control]: 4.94e-06 [reorder_send_recv_between_fp_bp]: 5.72001e-06 [comm_op_add_attrs]: 4.04997e-06 [add_comm_op_reuse_tag]: 3.38e-06 [interleave_split_concat_branches]: 3.59002e-06 [interleave_parallel_branches]: 3.46999e-06 [overlap_opt_shard_in_pipeline]: 3.98999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.38999e-06 [control_data_broadcast_order]: 2.026e-05 [grouped_pairwise_exchange_alltoall]: 4.42e-06 [offloading_packed_experts]: 7.13e-06 [overlap_recompute_and_grad_model_parallel]: 8.04002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.53e-06 [overlap_recompute_allgather_and_fa_grad]: 3.76001e-06 [overlap_recompute_comm]: 6.02999e-06 [overlap_grad_ring_attention]: 7.35e-06 [overlap_grad_flash_sp]: 2.843e-05 [begin_end_overlap_inline]: 2.98e-06 [split_matmul_comm_elemetwise]: 4.53001e-06 [split_layernorm_comm]: 4.01001e-06 [handle_group_info]: 3.25998e-06 [symbol_engine_optimizer]: 0.00013201, [1] [Cycle 1]: 0.00012424, [6] [build]: 4.02e-06 [elim_shapecalc]: 1.238e-05 [elim_not_effective]: 3.115e-05 [opt_reshape]: 9.99001e-06 [fold_const_symbol]: 1.466e-05 [renormalize]: 2.10013e-07 [detach_backward]: 3.78999e-06 [pipeline_parallel_scheduler]: 1.82999e-06 [auto_monad_reorder]: 2.464e-05 [get_jit_bprop_graph]: 1.79e-06 [rewriter_after_jit_bprop_graph]: 5.10001e-06 [opt_after_jit_grad]: 0.00059043 [validate]: 4.726e-05 Sums bootstrap : 0.000463s : 0.31% type_inference : 0.145094s : 95.72% event_method : 0.000020s : 0.01% auto_monad : 0.000069s : 0.05% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000035s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000028s : 0.02% optimize.rewriter_before_opt_a : 0.000089s : 0.06% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000053s : 0.04% optimize.opt_a.loop_unroll : 0.000040s : 0.03% optimize.opt_a.a_1 : 0.000977s : 0.64% optimize.opt_a.with_stream_mark : 0.000037s : 0.02% optimize.opt_a.recompute_prepare : 0.000020s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000252s : 0.17% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000018s : 0.01% optimize.opt_a.merge_send_recv : 0.000018s : 0.01% optimize.opt_a.auto_parallel : 0.000018s : 0.01% optimize.opt_a.parallel : 0.000027s : 0.02% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000009s : 0.01% optimize.opt_a.allreduce_fusion : 0.000009s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.01% optimize.opt_a.virtual_dataset : 0.000016s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.01% optimize.opt_a.virtual_output : 0.000015s : 0.01% optimize.opt_a.merge_forward : 0.000010s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000022s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000027s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000027s : 0.02% optimize.opt_a.a_after_grad : 0.000025s : 0.02% optimize.opt_a.renormalize : 0.000833s : 0.55% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.02% optimize.opt_a.cse : 0.000062s : 0.04% optimize.opt_a.a_3 : 0.000136s : 0.09% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000050s : 0.03% optimize.convert_after_rewriter : 0.000012s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.01% optimize.mutable_eliminate : 0.000575s : 0.38% optimize.opt_b.b_1 : 0.000313s : 0.21% optimize.opt_b.b_2 : 0.000011s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000030s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.02% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000031s : 0.02% optimize.loop_unroll : 0.000508s : 0.33% optimize.opt_after_cconv.c_1 : 0.000040s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000026s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000045s : 0.03% optimize.tuple_transform.d_1 : 0.000057s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000064s : 0.04% optimize.cse_after_recomputation.cse : 0.000020s : 0.01% optimize.environ_conv : 0.000011s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.01% optimize.bias_add_comm_swap : 0.000005s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000004s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000005s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000003s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000003s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000020s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000006s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000028s : 0.02% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000031s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000025s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000590s : 0.39% validate : 0.000047s : 0.03% Time group info: ------[substitution.] 0.000234 38 11.28% : 0.000026s : 3: substitution.cast_eliminate 6.65% : 0.000016s : 3: substitution.elim_not_effective 0.89% : 0.000002s : 3: substitution.fold_const_symbol 3.47% : 0.000008s : 5: substitution.graph_param_transform 63.84% : 0.000149s : 4: substitution.inline 1.97% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.69% : 0.000006s : 6: substitution.remove_not_recompute_node 2.24% : 0.000005s : 4: substitution.replace_old_param 6.96% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.145037 2 99.46% : 0.144255s : 1: type_inference.infer 0.54% : 0.000782s : 1: type_inference.specialize ------[replace.] 0.000071 8 55.61% : 0.000039s : 4: replace.inline 44.39% : 0.000031s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000161 8 91.28% : 0.000147s : 4: match.inline 8.72% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000272 1596 0.89% : 0.000002s : 17: predicate.accumulaten_eliminater 0.76% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 10: predicate.addn_check_dump 0.90% : 0.000002s : 17: predicate.addn_zero_filter 0.86% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.14% : 0.000006s : 27: predicate.arithmetic_simplify 1.00% : 0.000003s : 17: predicate.cast_eliminate 0.61% : 0.000002s : 10: predicate.check_bprop_eliminate 0.55% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.59% : 0.000002s : 10: predicate.depend_value_elim 0.96% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.98% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.92% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 5: predicate.elim_not_effective 0.49% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.28% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 22: predicate.environ_get_depend_swap 1.72% : 0.000005s : 32: predicate.environ_get_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.41% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.36% : 0.000006s : 25: predicate.float_depend_g_call 0.51% : 0.000001s : 10: predicate.float_environ_get_switch 0.78% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 5: predicate.fold_const_symbol 0.59% : 0.000002s : 10: predicate.get_grad_eliminate 0.30% : 0.000001s : 5: predicate.graph_param_transform 0.82% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.51% : 0.000018s : 72: predicate.inline 0.85% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.71% : 0.000002s : 10: predicate.less_batch_normalization 1.80% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.51% : 0.000007s : 48: predicate.load_eliminater 0.78% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.86% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.75% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 10: predicate.merge_addn 0.61% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.93% : 0.000003s : 17: predicate.minmaximum_grad 0.84% : 0.000002s : 5: predicate.mutable_eliminate 0.44% : 0.000001s : 5: predicate.opt_reshape 0.36% : 0.000001s : 5: predicate.parallel_virtual_node 2.28% : 0.000006s : 25: predicate.partial_defer_inline 1.61% : 0.000004s : 26: predicate.partial_eliminate 1.05% : 0.000003s : 17: predicate.print_const_string_wrapper 0.57% : 0.000002s : 10: predicate.reduce_all_const_elim 1.15% : 0.000003s : 17: predicate.reduce_eliminate 2.54% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 10: predicate.remove_not_recompute_node 1.36% : 0.000004s : 31: predicate.replace_applicator 0.54% : 0.000001s : 10: predicate.replace_old_param 0.32% : 0.000001s : 5: predicate.reset_defer_inline 0.98% : 0.000003s : 17: predicate.reshape_eliminate 0.61% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 5: predicate.row_tensor_eliminate 0.74% : 0.000002s : 10: predicate.same_eliminate 0.42% : 0.000001s : 10: predicate.set_cell_output_no_recompute 1.04% : 0.000003s : 10: predicate.shard_identity_eliminate 0.72% : 0.000002s : 10: predicate.special_op_eliminate 0.71% : 0.000002s : 10: predicate.specialize_transform 1.00% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.49% : 0.000004s : 25: predicate.switch_defer_inline 2.00% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.67% : 0.000013s : 76: predicate.switch_simplify 0.86% : 0.000002s : 17: predicate.tile_eliminate 0.91% : 0.000002s : 17: predicate.transpose_eliminate 1.66% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.29% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.52% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.67% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.64% : 0.000004s : 31: predicate.tuple_to_list_eliminator_ 2.49% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.13% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 5: predicate.value_based_eliminate 0.57% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.57% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000633 11 54.91% : 0.000348s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.09% : 0.000285s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.169319 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.98% : 0.003360s : 1: add_attr 1.97% : 0.003344s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.04% : 0.000068s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.05% : 0.000079s : 1: auto_monad 0.02% : 0.000033s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.30% : 0.000509s : 1: bootstrap 0.02% : 0.000034s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000023s : 1: control_data_broadcast_order 0.01% : 0.000016s : 1: convert_after_rewriter 0.02% : 0.000040s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000021s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.02% : 0.000031s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.30% : 0.000514s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.34% : 0.000581s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000017s : 1: opt.transform.mutable_eliminate 0.90% : 0.001518s : 78: opt.transform.opt_a 0.02% : 0.000038s : 1: opt.transform.opt_after_cconv 0.02% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.14% : 0.000242s : 28: opt.transform.opt_b 0.04% : 0.000064s : 2: opt.transform.opt_trans_graph 0.04% : 0.000064s : 4: opt.transform.symbol_engine_opt 2.19% : 0.003705s : 1: opt_a 0.09% : 0.000154s : 1: opt_after_cconv 0.35% : 0.000601s : 1: opt_after_jit_grad 0.26% : 0.000444s : 1: opt_b 4.04% : 0.006847s : 1: optimize 0.02% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.02% : 0.000031s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.00% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000009s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.03% : 0.000042s : 1: pre_auto_parallel 0.02% : 0.000032s : 1: py_interpret_to_execute 0.01% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.03% : 0.000050s : 1: remove_dup_value 0.26% : 0.000433s : 1: renormalize.infer 0.23% : 0.000391s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000054s : 1: rewriter_after_opt_a 0.05% : 0.000092s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000007s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.08% : 0.000135s : 1: symbol_engine_optimizer 0.07% : 0.000116s : 1: tuple_transform 85.72% : 0.145137s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:55.530.164 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.301588, [21] [bootstrap]: 0.00041311 [type_inference]: 0.141597 [event_method]: 2.398e-05 [auto_monad]: 7.9e-05 [graph_reusing]: 6.50997e-06 [inline]: 3.08e-06 [add_attr]: 0.00390709, [1] [add_attr_with_inline]: 0.00389299, [1] [Cycle 1]: 8.014e-05, [2] [tag_attr]: 2.714e-05 [meta_addattr_fg_expand]: 6.58e-06 [parallel-infer-symbol]: 4.2e-06 [pre_auto_parallel]: 4.695e-05 [insert-virtual-dataset]: 2.58e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 2.54001e-06 [pipeline_split]: 1.82001e-06 [optimize]: 0.00625802, [53] [py_interpret_to_execute]: 3.241e-05 [rewriter_before_opt_a]: 9.931e-05 [opt_a]: 0.00360974, [2] [Cycle 1]: 0.00273996, [45] [expand_dump_flag]: 3.00002e-06 [switch_simplify]: 4.753e-05 [loop_unroll]: 3.373e-05 [a_1]: 0.00083088 [with_stream_mark]: 2.126e-05 [recompute_prepare]: 1.193e-05 [updatestate_depend_eliminate]: 5.76e-06 [updatestate_assign_eliminate]: 4.43001e-06 [updatestate_loads_eliminate]: 3.68e-06 [parameter_eliminate]: 2.69999e-06 [a_2]: 0.000103 [accelerated_algorithm]: 9.00001e-06 [shard]: 2.79001e-06 [meta_shard_fg_expand]: 2.37001e-06 [shard_inline]: 8.50001e-06 [merge_send_recv]: 1.106e-05 [auto_parallel]: 8.43999e-06 [parallel]: 2.009e-05 [flash_sp]: 9.66e-06 [merge_comm]: 6.01e-06 [allreduce_fusion]: 4.41002e-06 [matmul_add_comm_reduction]: 1.283e-05 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 1.05e-05 [virtual_dataset]: 8.86002e-06 [get_grad_eliminate_]: 8.56002e-06 [virtual_output]: 8.92999e-06 [merge_forward]: 4.81002e-06 [cell_reuse_recompute_pass]: 1.75001e-06 [offload_activation]: 1.301e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.651e-05 [merge_recompute_call_nodes]: 1.92001e-06 [before_grad]: 1.421e-05 [set_forward_comm_id_for_comm_node_pass]: 4.66002e-06 [meta_fg_expand]: 3.56999e-06 [flash_sp_send_recv_attached]: 2.64001e-06 [receive_attached]: 2.83e-06 [after_resolve]: 1.345e-05 [a_after_grad]: 1.404e-05 [renormalize]: 0.00102153 [add_forward_monad_depend]: 8.43999e-06 [auto_monad_grad]: 2.71e-06 [auto_monad_eliminator]: 2.265e-05 [cse]: 4.175e-05 [a_3]: 6.748e-05 [Cycle 2]: 0.00085658, [45] [expand_dump_flag]: 1.77001e-06 [switch_simplify]: 9.73002e-06 [loop_unroll]: 7.92e-06 [a_1]: 0.00019053 [with_stream_mark]: 1.845e-05 [recompute_prepare]: 8.06001e-06 [updatestate_depend_eliminate]: 4.36002e-06 [updatestate_assign_eliminate]: 3.37002e-06 [updatestate_loads_eliminate]: 3.3e-06 [parameter_eliminate]: 1.52999e-06 [a_2]: 0.00010759 [accelerated_algorithm]: 8.99e-06 [shard]: 2.88998e-06 [meta_shard_fg_expand]: 2.73e-06 [shard_inline]: 8.46002e-06 [merge_send_recv]: 7.71001e-06 [auto_parallel]: 9.65002e-06 [parallel]: 7.77e-06 [flash_sp]: 4.16001e-06 [merge_comm]: 4.58999e-06 [allreduce_fusion]: 4.43999e-06 [matmul_add_comm_reduction]: 1.174e-05 [allreduce_slice_to_reducescatter]: 7.60017e-07 [virtual_shard_identity]: 1.023e-05 [virtual_dataset]: 7.93999e-06 [get_grad_eliminate_]: 8.3e-06 [virtual_output]: 7.88001e-06 [merge_forward]: 5.00001e-06 [cell_reuse_recompute_pass]: 2.65002e-06 [offload_activation]: 9.97001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.6e-05 [merge_recompute_call_nodes]: 1.19e-06 [before_grad]: 1.305e-05 [set_forward_comm_id_for_comm_node_pass]: 4.67998e-06 [meta_fg_expand]: 3.14999e-06 [flash_sp_send_recv_attached]: 1.25999e-06 [receive_attached]: 1.65001e-06 [after_resolve]: 1.343e-05 [a_after_grad]: 1.203e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.86998e-06 [auto_monad_grad]: 1.60999e-06 [auto_monad_eliminator]: 1.037e-05 [cse]: 2.248e-05 [a_3]: 4.78e-05 [py_interpret_to_execute_after_opt_a]: 1.596e-05 [slice_cell_reuse_recomputed_activation]: 1.95001e-06 [rewriter_after_opt_a]: 4.889e-05 [convert_after_rewriter]: 8.50999e-06 [order_py_execute_after_rewriter]: 6.14999e-06 [mutable_eliminate]: 0.00074517 [opt_b]: 0.00027543, [1] [Cycle 1]: 0.00026768, [7] [b_1]: 0.00016739 [b_2]: 1.135e-05 [updatestate_depend_eliminate]: 9.33002e-06 [updatestate_assign_eliminate]: 2.99001e-06 [updatestate_loads_eliminate]: 3.88999e-06 [renormalize]: 5.39992e-07 [cse]: 3.227e-05 [optimize_parallel_all_gather_comm]: 2.095e-05 [overlap_param_gather]: 1.99e-06 [cconv]: 3.382e-05 [loop_unroll]: 0.0005433 [opt_after_cconv]: 0.00013215, [1] [Cycle 1]: 0.00012519, [7] [c_1]: 4.111e-05 [parameter_eliminate]: 4.64998e-06 [updatestate_depend_eliminate]: 7.08998e-06 [updatestate_assign_eliminate]: 3.38999e-06 [updatestate_loads_eliminate]: 3.04999e-06 [cse]: 2.918e-05 [renormalize]: 3.7998e-07 [remove_dup_value]: 1.828e-05 [tuple_transform]: 9.816e-05, [1] [Cycle 1]: 9.303e-05, [4] [d_1]: 6.086e-05 [none_parameter_eliminate]: 2.13002e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 9.17001e-06 [partial_unused_args_eliminate]: 2.11998e-06 [add_recomputation]: 6.762e-05 [cse_after_recomputation]: 2.818e-05, [1] [Cycle 1]: 2.322e-05, [1] [cse]: 1.751e-05 [environ_conv]: 7.16001e-06 [swap_dp_allreduce_reducescatter]: 6.49999e-06 [bias_add_comm_swap]: 3.26999e-06 [label_micro_interleaved_index]: 5.51e-06 [label_fine_grained_interleaved_index]: 2.99001e-06 [merge_cast_opt]: 1.39e-06 [slice_recompute_activation]: 2.44001e-06 [micro_interleaved_order_control]: 2.81999e-06 [assign_add_opt]: 1.35999e-06 [ForceFp32Comm]: 8.79983e-07 [remove_cast_before_assign_add]: 1.24e-06 [full_micro_interleaved_order_control]: 2.73998e-06 [reorder_send_recv_between_fp_bp]: 3.6e-06 [comm_op_add_attrs]: 1.10001e-06 [add_comm_op_reuse_tag]: 1.14e-06 [interleave_split_concat_branches]: 1.29998e-06 [interleave_parallel_branches]: 1.25001e-06 [overlap_opt_shard_in_pipeline]: 1.40999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.84e-06 [control_data_broadcast_order]: 1.737e-05 [grouped_pairwise_exchange_alltoall]: 1.94999e-06 [offloading_packed_experts]: 4.85999e-06 [overlap_recompute_and_grad_model_parallel]: 5.72001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.54998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.54e-06 [overlap_recompute_comm]: 2.32999e-06 [overlap_grad_ring_attention]: 4.97e-06 [overlap_grad_flash_sp]: 2.422e-05 [begin_end_overlap_inline]: 5.59987e-07 [split_matmul_comm_elemetwise]: 2.56998e-06 [split_layernorm_comm]: 1.97001e-06 [handle_group_info]: 1.07e-06 [symbol_engine_optimizer]: 9.528e-05, [1] [Cycle 1]: 9.025e-05, [6] [build]: 4.72e-06 [elim_shapecalc]: 1.513e-05 [elim_not_effective]: 1.745e-05 [opt_reshape]: 8.90001e-06 [fold_const_symbol]: 1.308e-05 [renormalize]: 1.69995e-07 [detach_backward]: 2.39999e-06 [pipeline_parallel_scheduler]: 1.74e-06 [auto_monad_reorder]: 2.209e-05 [get_jit_bprop_graph]: 1.96998e-06 [rewriter_after_jit_bprop_graph]: 6.34001e-06 [opt_after_jit_grad]: 0.148946 [validate]: 6.405e-05 Sums bootstrap : 0.000413s : 0.14% type_inference : 0.141597s : 47.74% event_method : 0.000024s : 0.01% auto_monad : 0.000079s : 0.03% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000047s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000032s : 0.01% optimize.rewriter_before_opt_a : 0.000099s : 0.03% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000057s : 0.02% optimize.opt_a.loop_unroll : 0.000042s : 0.01% optimize.opt_a.a_1 : 0.001021s : 0.34% optimize.opt_a.with_stream_mark : 0.000040s : 0.01% optimize.opt_a.recompute_prepare : 0.000020s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000211s : 0.07% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.01% optimize.opt_a.shard : 0.000006s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000017s : 0.01% optimize.opt_a.merge_send_recv : 0.000019s : 0.01% optimize.opt_a.auto_parallel : 0.000018s : 0.01% optimize.opt_a.parallel : 0.000028s : 0.01% optimize.opt_a.flash_sp : 0.000014s : 0.00% optimize.opt_a.merge_comm : 0.000011s : 0.00% optimize.opt_a.allreduce_fusion : 0.000009s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.01% optimize.opt_a.virtual_dataset : 0.000017s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.01% optimize.opt_a.virtual_output : 0.000017s : 0.01% optimize.opt_a.merge_forward : 0.000010s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000027s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000027s : 0.01% optimize.opt_a.a_after_grad : 0.000026s : 0.01% optimize.opt_a.renormalize : 0.001022s : 0.34% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.01% optimize.opt_a.cse : 0.000064s : 0.02% optimize.opt_a.a_3 : 0.000115s : 0.04% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000049s : 0.02% optimize.convert_after_rewriter : 0.000009s : 0.00% optimize.order_py_execute_after_rewriter : 0.000006s : 0.00% optimize.mutable_eliminate : 0.000745s : 0.25% optimize.opt_b.b_1 : 0.000167s : 0.06% optimize.opt_b.b_2 : 0.000011s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000032s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000034s : 0.01% optimize.loop_unroll : 0.000543s : 0.18% optimize.opt_after_cconv.c_1 : 0.000041s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000029s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.01% optimize.tuple_transform.d_1 : 0.000061s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000068s : 0.02% optimize.cse_after_recomputation.cse : 0.000018s : 0.01% optimize.environ_conv : 0.000007s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000004s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000024s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.148946s : 50.22% validate : 0.000064s : 0.02% Time group info: ------[substitution.] 0.000266 38 11.32% : 0.000030s : 3: substitution.cast_eliminate 1.07% : 0.000003s : 3: substitution.elim_not_effective 0.76% : 0.000002s : 3: substitution.fold_const_symbol 3.03% : 0.000008s : 5: substitution.graph_param_transform 70.74% : 0.000188s : 4: substitution.inline 2.18% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.67% : 0.000007s : 6: substitution.remove_not_recompute_node 2.25% : 0.000006s : 4: substitution.replace_old_param 6.00% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.141510 2 99.27% : 0.140476s : 1: type_inference.infer 0.73% : 0.001034s : 1: type_inference.specialize ------[replace.] 0.000070 8 63.94% : 0.000045s : 4: replace.inline 36.06% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000199 8 92.98% : 0.000185s : 4: match.inline 7.02% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000287 1596 0.84% : 0.000002s : 17: predicate.accumulaten_eliminater 1.83% : 0.000005s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000002s : 10: predicate.addn_check_dump 0.99% : 0.000003s : 17: predicate.addn_zero_filter 0.81% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.14% : 0.000006s : 27: predicate.arithmetic_simplify 1.15% : 0.000003s : 17: predicate.cast_eliminate 0.57% : 0.000002s : 10: predicate.check_bprop_eliminate 0.55% : 0.000002s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.58% : 0.000002s : 10: predicate.depend_value_elim 0.94% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.10% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.88% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.76% : 0.000005s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 5: predicate.elim_not_effective 0.42% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_depend_swap 1.67% : 0.000005s : 32: predicate.environ_get_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.33% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.21% : 0.000006s : 25: predicate.float_depend_g_call 0.56% : 0.000002s : 10: predicate.float_environ_get_switch 0.71% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.60% : 0.000002s : 10: predicate.get_grad_eliminate 0.16% : 0.000000s : 5: predicate.graph_param_transform 0.58% : 0.000002s : 10: predicate.incorporate_call 0.49% : 0.000001s : 10: predicate.incorporate_call_switch 6.27% : 0.000018s : 72: predicate.inline 0.92% : 0.000003s : 10: predicate.inline_without_move 0.38% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.89% : 0.000003s : 10: predicate.less_batch_normalization 1.77% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.51% : 0.000007s : 48: predicate.load_eliminater 0.93% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.96% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.67% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 10: predicate.merge_addn 0.54% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 17: predicate.minmaximum_grad 1.49% : 0.000004s : 5: predicate.mutable_eliminate 0.32% : 0.000001s : 5: predicate.opt_reshape 0.40% : 0.000001s : 5: predicate.parallel_virtual_node 1.74% : 0.000005s : 25: predicate.partial_defer_inline 1.54% : 0.000004s : 26: predicate.partial_eliminate 0.94% : 0.000003s : 17: predicate.print_const_string_wrapper 0.58% : 0.000002s : 10: predicate.reduce_all_const_elim 1.19% : 0.000003s : 17: predicate.reduce_eliminate 2.59% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 10: predicate.remove_not_recompute_node 1.32% : 0.000004s : 31: predicate.replace_applicator 0.44% : 0.000001s : 10: predicate.replace_old_param 0.22% : 0.000001s : 5: predicate.reset_defer_inline 1.11% : 0.000003s : 17: predicate.reshape_eliminate 0.56% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 5: predicate.row_tensor_eliminate 0.93% : 0.000003s : 10: predicate.same_eliminate 0.37% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 10: predicate.shard_identity_eliminate 0.81% : 0.000002s : 10: predicate.special_op_eliminate 0.63% : 0.000002s : 10: predicate.specialize_transform 1.01% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.44% : 0.000004s : 25: predicate.switch_defer_inline 2.06% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.49% : 0.000013s : 76: predicate.switch_simplify 0.87% : 0.000002s : 17: predicate.tile_eliminate 0.96% : 0.000003s : 17: predicate.transpose_eliminate 1.40% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.08% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.46% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.43% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.66% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.35% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.13% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 5: predicate.value_based_eliminate 0.67% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.51% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000699 11 43.56% : 0.000305s : 5: func_graph_cloner_run.FuncGraphClonerGraph 56.44% : 0.000395s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.314574 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.24% : 0.003914s : 1: add_attr 1.24% : 0.003898s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.02% : 0.000072s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.03% : 0.000085s : 1: auto_monad 0.01% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.14% : 0.000442s : 1: bootstrap 0.01% : 0.000038s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000021s : 1: control_data_broadcast_order 0.00% : 0.000012s : 1: convert_after_rewriter 0.01% : 0.000031s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000011s : 1: environ_conv 0.01% : 0.000031s : 1: event_method 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.18% : 0.000552s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.24% : 0.000756s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000025s : 1: opt.transform.mutable_eliminate 0.50% : 0.001579s : 78: opt.transform.opt_a 0.01% : 0.000040s : 1: opt.transform.opt_after_cconv 0.02% : 0.000056s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000141s : 28: opt.transform.opt_b 0.02% : 0.000067s : 2: opt.transform.opt_trans_graph 0.02% : 0.000049s : 4: opt.transform.symbol_engine_opt 1.15% : 0.003614s : 1: opt_a 0.04% : 0.000136s : 1: opt_after_cconv 47.36% : 0.148974s : 1: opt_after_jit_grad 0.09% : 0.000280s : 1: opt_b 1.99% : 0.006264s : 1: optimize 0.01% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000009s : 1: order_py_execute_after_rewriter 0.01% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000051s : 1: pre_auto_parallel 0.01% : 0.000037s : 1: py_interpret_to_execute 0.01% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.01% : 0.000022s : 1: remove_dup_value 0.17% : 0.000522s : 1: renormalize.infer 0.16% : 0.000488s : 1: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000053s : 1: rewriter_after_opt_a 0.03% : 0.000104s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000098s : 1: symbol_engine_optimizer 0.03% : 0.000101s : 1: tuple_transform 45.02% : 0.141628s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:57.535.495 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:57.535.775 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.205056, [21] [bootstrap]: 0.00039776 [type_inference]: 0.00572999 [event_method]: 1.966e-05 [auto_monad]: 6.418e-05 [graph_reusing]: 6.41e-06 [inline]: 2.28998e-06 [add_attr]: 0.00320819, [1] [add_attr_with_inline]: 0.00319889, [1] [Cycle 1]: 7.163e-05, [2] [tag_attr]: 2.075e-05 [meta_addattr_fg_expand]: 6.29999e-06 [parallel-infer-symbol]: 3.76999e-06 [pre_auto_parallel]: 3.275e-05 [insert-virtual-dataset]: 2.66999e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 2.10002e-06 [pipeline_split]: 2.10002e-06 [optimize]: 0.0300754, [53] [py_interpret_to_execute]: 2.93e-05 [rewriter_before_opt_a]: 8.944e-05 [opt_a]: 0.0272401, [2] [Cycle 1]: 0.00266364, [45] [expand_dump_flag]: 2.91999e-06 [switch_simplify]: 3.76e-05 [loop_unroll]: 3.086e-05 [a_1]: 0.00074534 [with_stream_mark]: 1.45e-05 [recompute_prepare]: 9.94999e-06 [updatestate_depend_eliminate]: 4.25e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 2.94999e-06 [parameter_eliminate]: 1.35001e-06 [a_2]: 0.00012761 [accelerated_algorithm]: 8.15e-06 [shard]: 2.01e-06 [meta_shard_fg_expand]: 2.46e-06 [shard_inline]: 8.04002e-06 [merge_send_recv]: 1.024e-05 [auto_parallel]: 8.71997e-06 [parallel]: 2.045e-05 [flash_sp]: 1.013e-05 [merge_comm]: 4.77998e-06 [allreduce_fusion]: 4.33999e-06 [matmul_add_comm_reduction]: 8.80999e-06 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 9.86e-06 [virtual_dataset]: 8.60999e-06 [get_grad_eliminate_]: 8.03999e-06 [virtual_output]: 8.25e-06 [merge_forward]: 4.03001e-06 [cell_reuse_recompute_pass]: 1.15999e-06 [offload_activation]: 1.2e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.681e-05 [merge_recompute_call_nodes]: 1.49998e-06 [before_grad]: 1.445e-05 [set_forward_comm_id_for_comm_node_pass]: 5.07e-06 [meta_fg_expand]: 3.55e-06 [flash_sp_send_recv_attached]: 2.80997e-06 [receive_attached]: 2.08002e-06 [after_resolve]: 1.369e-05 [a_after_grad]: 1.315e-05 [renormalize]: 0.00094227 [add_forward_monad_depend]: 6.14999e-06 [auto_monad_grad]: 2.78e-06 [auto_monad_eliminator]: 1.741e-05 [cse]: 3.095e-05 [a_3]: 7.581e-05 [Cycle 2]: 0.0245589, [45] [expand_dump_flag]: 1.67999e-06 [switch_simplify]: 9.83998e-06 [loop_unroll]: 7.79997e-06 [a_1]: 0.00018152 [with_stream_mark]: 1.386e-05 [recompute_prepare]: 8.25999e-06 [updatestate_depend_eliminate]: 3.93999e-06 [updatestate_assign_eliminate]: 3.23e-06 [updatestate_loads_eliminate]: 0.0232519 [parameter_eliminate]: 1.024e-05 [a_2]: 0.0001726 [accelerated_algorithm]: 1.212e-05 [shard]: 5.27999e-06 [meta_shard_fg_expand]: 4.3e-06 [shard_inline]: 8.54e-06 [merge_send_recv]: 1.567e-05 [auto_parallel]: 1.541e-05 [parallel]: 1.178e-05 [flash_sp]: 6.66999e-06 [merge_comm]: 4.82e-06 [allreduce_fusion]: 4.4e-06 [matmul_add_comm_reduction]: 1.475e-05 [allreduce_slice_to_reducescatter]: 1.07e-06 [virtual_shard_identity]: 1.034e-05 [virtual_dataset]: 9.24e-06 [get_grad_eliminate_]: 8.82999e-06 [virtual_output]: 7.77998e-06 [merge_forward]: 5.32001e-06 [cell_reuse_recompute_pass]: 3.54002e-06 [offload_activation]: 1.359e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.916e-05 [merge_recompute_call_nodes]: 1.87999e-06 [before_grad]: 1.526e-05 [set_forward_comm_id_for_comm_node_pass]: 5.72001e-06 [meta_fg_expand]: 4.35e-06 [flash_sp_send_recv_attached]: 2.60002e-06 [receive_attached]: 2.27001e-06 [after_resolve]: 1.647e-05 [a_after_grad]: 1.305e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 4.07e-06 [auto_monad_grad]: 2.96001e-06 [auto_monad_eliminator]: 2.187e-05 [cse]: 4.55e-05 [a_3]: 6.477e-05 [py_interpret_to_execute_after_opt_a]: 2.519e-05 [slice_cell_reuse_recomputed_activation]: 4.75001e-06 [rewriter_after_opt_a]: 5.618e-05 [convert_after_rewriter]: 1.181e-05 [order_py_execute_after_rewriter]: 8.79e-06 [mutable_eliminate]: 0.00077026 [opt_b]: 0.00034644, [1] [Cycle 1]: 0.00033254, [7] [b_1]: 0.00021704 [b_2]: 1.052e-05 [updatestate_depend_eliminate]: 7.41999e-06 [updatestate_assign_eliminate]: 3.95e-06 [updatestate_loads_eliminate]: 3.28998e-06 [renormalize]: 6.89994e-07 [cse]: 2.97e-05 [optimize_parallel_all_gather_comm]: 2.495e-05 [overlap_param_gather]: 5.42001e-06 [cconv]: 4.14e-05 [loop_unroll]: 0.00049569 [opt_after_cconv]: 0.00015328, [1] [Cycle 1]: 0.0001433, [7] [c_1]: 4.062e-05 [parameter_eliminate]: 3.73001e-06 [updatestate_depend_eliminate]: 8.37e-06 [updatestate_assign_eliminate]: 3.30998e-06 [updatestate_loads_eliminate]: 3.3e-06 [cse]: 2.636e-05 [renormalize]: 4.39992e-07 [remove_dup_value]: 2.175e-05 [tuple_transform]: 0.0001058, [1] [Cycle 1]: 9.842e-05, [4] [d_1]: 5.62e-05 [none_parameter_eliminate]: 1.68002e-06 [renormalize]: 3.50003e-07 [switch_simplify]: 8.78001e-06 [partial_unused_args_eliminate]: 4.77998e-06 [add_recomputation]: 6.394e-05 [cse_after_recomputation]: 3.498e-05, [1] [Cycle 1]: 2.708e-05, [1] [cse]: 1.753e-05 [environ_conv]: 1.052e-05 [swap_dp_allreduce_reducescatter]: 8.83001e-06 [bias_add_comm_swap]: 6.52001e-06 [label_micro_interleaved_index]: 7.95998e-06 [label_fine_grained_interleaved_index]: 5.24e-06 [merge_cast_opt]: 3.76001e-06 [slice_recompute_activation]: 4.77e-06 [micro_interleaved_order_control]: 4.51002e-06 [assign_add_opt]: 4.02998e-06 [ForceFp32Comm]: 3.34001e-06 [remove_cast_before_assign_add]: 3.23e-06 [full_micro_interleaved_order_control]: 4.68999e-06 [reorder_send_recv_between_fp_bp]: 5.89999e-06 [comm_op_add_attrs]: 3.68e-06 [add_comm_op_reuse_tag]: 3.36001e-06 [interleave_split_concat_branches]: 3.70003e-06 [interleave_parallel_branches]: 3.65998e-06 [overlap_opt_shard_in_pipeline]: 3.95998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.26001e-06 [control_data_broadcast_order]: 1.922e-05 [grouped_pairwise_exchange_alltoall]: 4.05e-06 [offloading_packed_experts]: 7.39002e-06 [overlap_recompute_and_grad_model_parallel]: 8.45999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.62002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.83001e-06 [overlap_recompute_comm]: 5.42001e-06 [overlap_grad_ring_attention]: 7.16001e-06 [overlap_grad_flash_sp]: 2.923e-05 [begin_end_overlap_inline]: 2.88e-06 [split_matmul_comm_elemetwise]: 5.32001e-06 [split_layernorm_comm]: 3.92998e-06 [handle_group_info]: 3.27002e-06 [symbol_engine_optimizer]: 0.00012098, [1] [Cycle 1]: 0.00011339, [6] [build]: 4.60999e-06 [elim_shapecalc]: 1.515e-05 [elim_not_effective]: 1.789e-05 [opt_reshape]: 9.42001e-06 [fold_const_symbol]: 1.367e-05 [renormalize]: 2.00002e-07 [detach_backward]: 4.25e-06 [pipeline_parallel_scheduler]: 2.35002e-06 [auto_monad_reorder]: 0.163797 [get_jit_bprop_graph]: 3.18998e-06 [rewriter_after_jit_bprop_graph]: 1.314e-05 [opt_after_jit_grad]: 0.00088349 [validate]: 6.585e-05 Sums bootstrap : 0.000398s : 0.20% type_inference : 0.005730s : 2.87% event_method : 0.000020s : 0.01% auto_monad : 0.000064s : 0.03% graph_reusing : 0.000006s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000033s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000029s : 0.01% optimize.rewriter_before_opt_a : 0.000089s : 0.04% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000047s : 0.02% optimize.opt_a.loop_unroll : 0.000039s : 0.02% optimize.opt_a.a_1 : 0.000927s : 0.46% optimize.opt_a.with_stream_mark : 0.000028s : 0.01% optimize.opt_a.recompute_prepare : 0.000018s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.023255s : 11.64% optimize.opt_a.parameter_eliminate : 0.000012s : 0.01% optimize.opt_a.a_2 : 0.000300s : 0.15% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.01% optimize.opt_a.shard : 0.000007s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000007s : 0.00% optimize.opt_a.shard_inline : 0.000017s : 0.01% optimize.opt_a.merge_send_recv : 0.000026s : 0.01% optimize.opt_a.auto_parallel : 0.000024s : 0.01% optimize.opt_a.parallel : 0.000032s : 0.02% optimize.opt_a.flash_sp : 0.000017s : 0.01% optimize.opt_a.merge_comm : 0.000010s : 0.00% optimize.opt_a.allreduce_fusion : 0.000009s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.01% optimize.opt_a.virtual_dataset : 0.000018s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.01% optimize.opt_a.virtual_output : 0.000016s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000026s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000046s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000030s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.01% optimize.opt_a.meta_fg_expand : 0.000008s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000030s : 0.02% optimize.opt_a.a_after_grad : 0.000026s : 0.01% optimize.opt_a.renormalize : 0.000942s : 0.47% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000006s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000039s : 0.02% optimize.opt_a.cse : 0.000076s : 0.04% optimize.opt_a.a_3 : 0.000141s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000025s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000056s : 0.03% optimize.convert_after_rewriter : 0.000012s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.00% optimize.mutable_eliminate : 0.000770s : 0.39% optimize.opt_b.b_1 : 0.000217s : 0.11% optimize.opt_b.b_2 : 0.000011s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.01% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000041s : 0.02% optimize.loop_unroll : 0.000496s : 0.25% optimize.opt_after_cconv.c_1 : 0.000041s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000026s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000022s : 0.01% optimize.tuple_transform.d_1 : 0.000056s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000064s : 0.03% optimize.cse_after_recomputation.cse : 0.000018s : 0.01% optimize.environ_conv : 0.000011s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.00% optimize.bias_add_comm_swap : 0.000007s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000003s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000019s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000029s : 0.01% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.163797s : 82.00% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000013s : 0.01% opt_after_jit_grad : 0.000883s : 0.44% validate : 0.000066s : 0.03% Time group info: ------[substitution.] 0.000221 38 11.16% : 0.000025s : 3: substitution.cast_eliminate 1.21% : 0.000003s : 3: substitution.elim_not_effective 0.79% : 0.000002s : 3: substitution.fold_const_symbol 3.20% : 0.000007s : 5: substitution.graph_param_transform 67.39% : 0.000149s : 4: substitution.inline 2.75% : 0.000006s : 6: substitution.j_node_and_user_rematch 3.41% : 0.000008s : 6: substitution.remove_not_recompute_node 3.33% : 0.000007s : 4: substitution.replace_old_param 6.76% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005677 2 86.85% : 0.004931s : 1: type_inference.infer 13.15% : 0.000747s : 1: type_inference.specialize ------[replace.] 0.000064 8 60.30% : 0.000038s : 4: replace.inline 39.70% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000159 8 91.82% : 0.000146s : 4: match.inline 8.18% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000279 1596 0.87% : 0.000002s : 17: predicate.accumulaten_eliminater 1.09% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 10: predicate.addn_check_dump 0.90% : 0.000003s : 17: predicate.addn_zero_filter 0.82% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.46% : 0.000007s : 27: predicate.arithmetic_simplify 0.99% : 0.000003s : 17: predicate.cast_eliminate 0.65% : 0.000002s : 10: predicate.check_bprop_eliminate 0.69% : 0.000002s : 10: predicate.compare_switch_simplify 0.24% : 0.000001s : 5: predicate.const_output_eliminate 0.60% : 0.000002s : 10: predicate.depend_value_elim 0.88% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 0.97% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.90% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.44% : 0.000004s : 10: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 5: predicate.elim_not_effective 0.40% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.39% : 0.000004s : 22: predicate.environ_add_const_eliminate 1.05% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.08% : 0.000003s : 22: predicate.environ_get_depend_swap 1.68% : 0.000005s : 32: predicate.environ_get_eliminate 1.09% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.32% : 0.000004s : 25: predicate.exchange_switch_depend_value 1.94% : 0.000005s : 25: predicate.float_depend_g_call 0.65% : 0.000002s : 10: predicate.float_environ_get_switch 0.81% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 5: predicate.fold_const_symbol 0.62% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.64% : 0.000002s : 10: predicate.incorporate_call 0.47% : 0.000001s : 10: predicate.incorporate_call_switch 6.12% : 0.000017s : 72: predicate.inline 0.89% : 0.000002s : 10: predicate.inline_without_move 0.28% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.88% : 0.000002s : 10: predicate.less_batch_normalization 1.78% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.40% : 0.000007s : 48: predicate.load_eliminater 1.07% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.90% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.88% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.75% : 0.000002s : 10: predicate.merge_addn 0.57% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.71% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 17: predicate.minmaximum_grad 0.94% : 0.000003s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.38% : 0.000001s : 5: predicate.parallel_virtual_node 1.52% : 0.000004s : 25: predicate.partial_defer_inline 1.64% : 0.000005s : 26: predicate.partial_eliminate 0.84% : 0.000002s : 17: predicate.print_const_string_wrapper 0.73% : 0.000002s : 10: predicate.reduce_all_const_elim 1.09% : 0.000003s : 17: predicate.reduce_eliminate 2.42% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 10: predicate.remove_not_recompute_node 1.34% : 0.000004s : 31: predicate.replace_applicator 0.52% : 0.000001s : 10: predicate.replace_old_param 0.21% : 0.000001s : 5: predicate.reset_defer_inline 0.89% : 0.000002s : 17: predicate.reshape_eliminate 0.80% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 5: predicate.row_tensor_eliminate 0.75% : 0.000002s : 10: predicate.same_eliminate 0.40% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.86% : 0.000002s : 10: predicate.shard_identity_eliminate 0.75% : 0.000002s : 10: predicate.special_op_eliminate 1.37% : 0.000004s : 10: predicate.specialize_transform 1.10% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.36% : 0.000004s : 25: predicate.switch_defer_inline 1.98% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.84% : 0.000013s : 76: predicate.switch_simplify 0.85% : 0.000002s : 17: predicate.tile_eliminate 0.89% : 0.000002s : 17: predicate.transpose_eliminate 1.61% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 2.93% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.50% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.46% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.69% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.48% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.17% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 5: predicate.value_based_eliminate 0.65% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.62% : 0.000002s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.59% : 0.000002s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000559 11 47.70% : 0.000267s : 5: func_graph_cloner_run.FuncGraphClonerGraph 52.30% : 0.000293s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.240996 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.34% : 0.003219s : 1: add_attr 1.33% : 0.003203s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.03% : 0.000068s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.03% : 0.000073s : 1: auto_monad 67.99% : 0.163863s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.18% : 0.000442s : 1: bootstrap 0.02% : 0.000045s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000023s : 1: control_data_broadcast_order 0.01% : 0.000015s : 1: convert_after_rewriter 0.02% : 0.000038s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000023s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.01% : 0.000029s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000016s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.21% : 0.000502s : 1: loop_unroll 0.00% : 0.000006s : 1: merge_cast_opt 0.00% : 0.000007s : 1: micro_interleaved_order_control 0.32% : 0.000777s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000019s : 1: opt.transform.mutable_eliminate 0.63% : 0.001507s : 78: opt.transform.opt_a 0.02% : 0.000039s : 1: opt.transform.opt_after_cconv 0.02% : 0.000045s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000146s : 28: opt.transform.opt_b 0.03% : 0.000063s : 2: opt.transform.opt_trans_graph 0.02% : 0.000051s : 4: opt.transform.symbol_engine_opt 11.30% : 0.027244s : 1: opt_a 0.07% : 0.000157s : 1: opt_after_cconv 0.37% : 0.000896s : 1: opt_after_jit_grad 0.15% : 0.000350s : 1: opt_b 12.65% : 0.030486s : 1: optimize 0.01% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000012s : 1: order_py_execute_after_rewriter 0.01% : 0.000032s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000011s : 1: overlap_grad_ring_attention 0.00% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000009s : 1: pipeline_split 0.02% : 0.000040s : 1: pre_auto_parallel 0.01% : 0.000033s : 1: py_interpret_to_execute 0.01% : 0.000028s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.01% : 0.000025s : 1: remove_dup_value 0.22% : 0.000529s : 1: renormalize.infer 0.17% : 0.000404s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000020s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000060s : 1: rewriter_after_opt_a 0.04% : 0.000093s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.00% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000124s : 1: symbol_engine_optimizer 0.05% : 0.000109s : 1: tuple_transform 2.39% : 0.005767s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:39:59.242.900 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0421126, [21] [bootstrap]: 0.00046739 [type_inference]: 0.0063305 [event_method]: 2.193e-05 [auto_monad]: 7.04e-05 [graph_reusing]: 6.04999e-06 [inline]: 2.51e-06 [add_attr]: 0.00393638, [1] [add_attr_with_inline]: 0.00392176, [1] [Cycle 1]: 8.03e-05, [2] [tag_attr]: 2.835e-05 [meta_addattr_fg_expand]: 5.87001e-06 [parallel-infer-symbol]: 4.07003e-06 [pre_auto_parallel]: 4.46e-05 [insert-virtual-dataset]: 3.28e-06 [parallel-infer-symbol-second]: 9.39996e-07 [dataset_repeat_opt]: 2.21e-06 [pipeline_split]: 2.36e-06 [optimize]: 0.0302994, [53] [py_interpret_to_execute]: 3.484e-05 [rewriter_before_opt_a]: 0.00010009 [opt_a]: 0.00362441, [2] [Cycle 1]: 0.00278118, [45] [expand_dump_flag]: 3.29001e-06 [switch_simplify]: 4.586e-05 [loop_unroll]: 3.156e-05 [a_1]: 0.00083971 [with_stream_mark]: 2.418e-05 [recompute_prepare]: 1.158e-05 [updatestate_depend_eliminate]: 4.90999e-06 [updatestate_assign_eliminate]: 4.47998e-06 [updatestate_loads_eliminate]: 3.71001e-06 [parameter_eliminate]: 2.04999e-06 [a_2]: 0.00010739 [accelerated_algorithm]: 9.96e-06 [shard]: 2.83e-06 [meta_shard_fg_expand]: 3.08e-06 [shard_inline]: 8.42998e-06 [merge_send_recv]: 1.196e-05 [auto_parallel]: 9.09e-06 [parallel]: 2.066e-05 [flash_sp]: 1.093e-05 [merge_comm]: 5.49e-06 [allreduce_fusion]: 4.78001e-06 [matmul_add_comm_reduction]: 1.174e-05 [allreduce_slice_to_reducescatter]: 1.12e-06 [virtual_shard_identity]: 1.032e-05 [virtual_dataset]: 8.90999e-06 [get_grad_eliminate_]: 8.33999e-06 [virtual_output]: 9.07999e-06 [merge_forward]: 5.52001e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 1.321e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.815e-05 [merge_recompute_call_nodes]: 1.93002e-06 [before_grad]: 1.49e-05 [set_forward_comm_id_for_comm_node_pass]: 4.85001e-06 [meta_fg_expand]: 3.86999e-06 [flash_sp_send_recv_attached]: 3.65e-06 [receive_attached]: 2.53e-06 [after_resolve]: 1.456e-05 [a_after_grad]: 1.367e-05 [renormalize]: 0.00105284 [add_forward_monad_depend]: 8.10999e-06 [auto_monad_grad]: 2.72001e-06 [auto_monad_eliminator]: 2.141e-05 [cse]: 4.029e-05 [a_3]: 6.377e-05 [Cycle 2]: 0.00083073, [45] [expand_dump_flag]: 1.94e-06 [switch_simplify]: 9.77999e-06 [loop_unroll]: 7.70998e-06 [a_1]: 0.00019101 [with_stream_mark]: 1.78e-05 [recompute_prepare]: 8.60999e-06 [updatestate_depend_eliminate]: 4.42e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 2.91999e-06 [parameter_eliminate]: 1.23002e-06 [a_2]: 9.407e-05 [accelerated_algorithm]: 8.15e-06 [shard]: 1.38002e-06 [meta_shard_fg_expand]: 1.90001e-06 [shard_inline]: 7.77998e-06 [merge_send_recv]: 7.40998e-06 [auto_parallel]: 7.57998e-06 [parallel]: 7.56001e-06 [flash_sp]: 3.66001e-06 [merge_comm]: 4.41002e-06 [allreduce_fusion]: 3.99002e-06 [matmul_add_comm_reduction]: 8.65001e-06 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 9.36e-06 [virtual_dataset]: 7.61001e-06 [get_grad_eliminate_]: 7.06001e-06 [virtual_output]: 8.79e-06 [merge_forward]: 4.11001e-06 [cell_reuse_recompute_pass]: 2.32001e-06 [offload_activation]: 9.89001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.764e-05 [merge_recompute_call_nodes]: 1.07e-06 [before_grad]: 1.296e-05 [set_forward_comm_id_for_comm_node_pass]: 4.79e-06 [meta_fg_expand]: 3.45e-06 [flash_sp_send_recv_attached]: 1.02998e-06 [receive_attached]: 1.40999e-06 [after_resolve]: 1.273e-05 [a_after_grad]: 1.184e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.58002e-06 [auto_monad_grad]: 1.57001e-06 [auto_monad_eliminator]: 1.121e-05 [cse]: 2.188e-05 [a_3]: 4.658e-05 [py_interpret_to_execute_after_opt_a]: 1.684e-05 [slice_cell_reuse_recomputed_activation]: 2.36e-06 [rewriter_after_opt_a]: 4.925e-05 [convert_after_rewriter]: 9.15999e-06 [order_py_execute_after_rewriter]: 5.97999e-06 [mutable_eliminate]: 0.00072904 [opt_b]: 0.00029155, [1] [Cycle 1]: 0.00028383, [7] [b_1]: 0.00018771 [b_2]: 1.055e-05 [updatestate_depend_eliminate]: 9.30001e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 3.08e-06 [renormalize]: 1.00999e-06 [cse]: 2.955e-05 [optimize_parallel_all_gather_comm]: 2.186e-05 [overlap_param_gather]: 2.64999e-06 [cconv]: 3.344e-05 [loop_unroll]: 0.0244829 [opt_after_cconv]: 0.00018219, [1] [Cycle 1]: 0.00017147, [7] [c_1]: 4.789e-05 [parameter_eliminate]: 7.48e-06 [updatestate_depend_eliminate]: 1.523e-05 [updatestate_assign_eliminate]: 4.93001e-06 [updatestate_loads_eliminate]: 3.92998e-06 [cse]: 5.207e-05 [renormalize]: 8.30012e-07 [remove_dup_value]: 1.947e-05 [tuple_transform]: 0.00010573, [1] [Cycle 1]: 9.993e-05, [4] [d_1]: 6.846e-05 [none_parameter_eliminate]: 1.96e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 9.04998e-06 [partial_unused_args_eliminate]: 2.12001e-06 [add_recomputation]: 7.468e-05 [cse_after_recomputation]: 2.875e-05, [1] [Cycle 1]: 2.348e-05, [1] [cse]: 1.751e-05 [environ_conv]: 9.20999e-06 [swap_dp_allreduce_reducescatter]: 6.31e-06 [bias_add_comm_swap]: 3.21001e-06 [label_micro_interleaved_index]: 8.28001e-06 [label_fine_grained_interleaved_index]: 3.12002e-06 [merge_cast_opt]: 1.59998e-06 [slice_recompute_activation]: 2.42001e-06 [micro_interleaved_order_control]: 2.62001e-06 [assign_add_opt]: 1.43002e-06 [ForceFp32Comm]: 1.46002e-06 [remove_cast_before_assign_add]: 1.19998e-06 [full_micro_interleaved_order_control]: 2.31998e-06 [reorder_send_recv_between_fp_bp]: 2.81e-06 [comm_op_add_attrs]: 1.10001e-06 [add_comm_op_reuse_tag]: 9.60019e-07 [interleave_split_concat_branches]: 1.17999e-06 [interleave_parallel_branches]: 1.09e-06 [overlap_opt_shard_in_pipeline]: 1.27e-06 [overlap_opt_shard_grad_in_pipeline]: 2.53998e-06 [control_data_broadcast_order]: 1.851e-05 [grouped_pairwise_exchange_alltoall]: 1.79e-06 [offloading_packed_experts]: 5.14e-06 [overlap_recompute_and_grad_model_parallel]: 6.02001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.64e-06 [overlap_recompute_comm]: 2.68e-06 [overlap_grad_ring_attention]: 4.92e-06 [overlap_grad_flash_sp]: 2.648e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.41e-06 [split_layernorm_comm]: 2.02999e-06 [handle_group_info]: 1.01002e-06 [symbol_engine_optimizer]: 9.717e-05, [1] [Cycle 1]: 9.168e-05, [6] [build]: 4.04002e-06 [elim_shapecalc]: 1.605e-05 [elim_not_effective]: 1.84e-05 [opt_reshape]: 9.65002e-06 [fold_const_symbol]: 1.319e-05 [renormalize]: 2.69996e-07 [detach_backward]: 2.22001e-06 [pipeline_parallel_scheduler]: 1.97001e-06 [auto_monad_reorder]: 2.231e-05 [get_jit_bprop_graph]: 2.49001e-06 [rewriter_after_jit_bprop_graph]: 6.43e-06 [opt_after_jit_grad]: 0.00065654 [validate]: 5.635e-05 Sums bootstrap : 0.000467s : 1.26% type_inference : 0.006331s : 17.07% event_method : 0.000022s : 0.06% auto_monad : 0.000070s : 0.19% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000045s : 0.12% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000035s : 0.09% optimize.rewriter_before_opt_a : 0.000100s : 0.27% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000056s : 0.15% optimize.opt_a.loop_unroll : 0.000039s : 0.11% optimize.opt_a.a_1 : 0.001031s : 2.78% optimize.opt_a.with_stream_mark : 0.000042s : 0.11% optimize.opt_a.recompute_prepare : 0.000020s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000201s : 0.54% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.05% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.04% optimize.opt_a.merge_send_recv : 0.000019s : 0.05% optimize.opt_a.auto_parallel : 0.000017s : 0.04% optimize.opt_a.parallel : 0.000028s : 0.08% optimize.opt_a.flash_sp : 0.000015s : 0.04% optimize.opt_a.merge_comm : 0.000010s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.05% optimize.opt_a.virtual_dataset : 0.000017s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000018s : 0.05% optimize.opt_a.merge_forward : 0.000010s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000028s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000027s : 0.07% optimize.opt_a.a_after_grad : 0.000026s : 0.07% optimize.opt_a.renormalize : 0.001053s : 2.84% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.09% optimize.opt_a.cse : 0.000062s : 0.17% optimize.opt_a.a_3 : 0.000110s : 0.30% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000049s : 0.13% optimize.convert_after_rewriter : 0.000009s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000729s : 1.97% optimize.opt_b.b_1 : 0.000188s : 0.51% optimize.opt_b.b_2 : 0.000011s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.06% optimize.overlap_param_gather : 0.000003s : 0.01% optimize.cconv : 0.000033s : 0.09% optimize.loop_unroll : 0.024483s : 66.00% optimize.opt_after_cconv.c_1 : 0.000048s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000015s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000052s : 0.14% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.05% optimize.tuple_transform.d_1 : 0.000068s : 0.18% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000075s : 0.20% optimize.cse_after_recomputation.cse : 0.000018s : 0.05% optimize.environ_conv : 0.000009s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.01% optimize.control_data_broadcast_order : 0.000019s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000026s : 0.07% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.06% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000657s : 1.77% validate : 0.000056s : 0.15% Time group info: ------[substitution.] 0.000262 38 11.88% : 0.000031s : 3: substitution.cast_eliminate 0.88% : 0.000002s : 3: substitution.elim_not_effective 0.73% : 0.000002s : 3: substitution.fold_const_symbol 3.27% : 0.000009s : 5: substitution.graph_param_transform 70.06% : 0.000184s : 4: substitution.inline 2.17% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.76% : 0.000007s : 6: substitution.remove_not_recompute_node 1.88% : 0.000005s : 4: substitution.replace_old_param 6.37% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006255 2 86.94% : 0.005439s : 1: type_inference.infer 13.06% : 0.000817s : 1: type_inference.specialize ------[replace.] 0.000075 8 59.54% : 0.000044s : 4: replace.inline 40.46% : 0.000030s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000195 8 92.53% : 0.000180s : 4: match.inline 7.47% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000283 1596 0.90% : 0.000003s : 17: predicate.accumulaten_eliminater 0.87% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.60% : 0.000002s : 10: predicate.addn_check_dump 0.99% : 0.000003s : 17: predicate.addn_zero_filter 0.87% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.09% : 0.000006s : 27: predicate.arithmetic_simplify 1.06% : 0.000003s : 17: predicate.cast_eliminate 0.50% : 0.000001s : 10: predicate.check_bprop_eliminate 0.55% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.55% : 0.000002s : 10: predicate.depend_value_elim 1.00% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.00% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.12% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 5: predicate.elim_not_effective 0.39% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.10% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.05% : 0.000003s : 22: predicate.environ_get_depend_swap 1.62% : 0.000005s : 32: predicate.environ_get_eliminate 1.08% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.32% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.34% : 0.000007s : 25: predicate.float_depend_g_call 0.54% : 0.000002s : 10: predicate.float_environ_get_switch 0.77% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.66% : 0.000002s : 10: predicate.get_grad_eliminate 0.23% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000002s : 10: predicate.incorporate_call 0.44% : 0.000001s : 10: predicate.incorporate_call_switch 6.48% : 0.000018s : 72: predicate.inline 0.79% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.92% : 0.000003s : 10: predicate.less_batch_normalization 1.74% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.46% : 0.000007s : 48: predicate.load_eliminater 2.94% : 0.000008s : 5: predicate.loop_unroll_after_grad 1.85% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.94% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.53% : 0.000002s : 10: predicate.merge_addn 0.53% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.49% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.88% : 0.000002s : 17: predicate.minmaximum_grad 1.04% : 0.000003s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.64% : 0.000002s : 5: predicate.parallel_virtual_node 1.66% : 0.000005s : 25: predicate.partial_defer_inline 1.53% : 0.000004s : 26: predicate.partial_eliminate 0.94% : 0.000003s : 17: predicate.print_const_string_wrapper 0.61% : 0.000002s : 10: predicate.reduce_all_const_elim 1.23% : 0.000003s : 17: predicate.reduce_eliminate 2.76% : 0.000008s : 48: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 10: predicate.remove_not_recompute_node 1.41% : 0.000004s : 31: predicate.replace_applicator 0.52% : 0.000001s : 10: predicate.replace_old_param 0.23% : 0.000001s : 5: predicate.reset_defer_inline 0.95% : 0.000003s : 17: predicate.reshape_eliminate 0.53% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 5: predicate.row_tensor_eliminate 0.64% : 0.000002s : 10: predicate.same_eliminate 0.40% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 10: predicate.shard_identity_eliminate 0.74% : 0.000002s : 10: predicate.special_op_eliminate 0.72% : 0.000002s : 10: predicate.specialize_transform 0.75% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.36% : 0.000004s : 25: predicate.switch_defer_inline 2.01% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.45% : 0.000013s : 76: predicate.switch_simplify 0.86% : 0.000002s : 17: predicate.tile_eliminate 0.90% : 0.000003s : 17: predicate.transpose_eliminate 1.59% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.78% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.54% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.11% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.71% : 0.000005s : 27: predicate.tuple_list_get_set_item_eliminator 2.55% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.63% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.37% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 2.95% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 5: predicate.value_based_eliminate 0.58% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.55% : 0.000002s : 10: predicate.virtual_output_eliminate 0.32% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000643 11 51.43% : 0.000331s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.57% : 0.000312s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.079251 192 0.01% : 0.000004s : 1: ForceFp32Comm 4.98% : 0.003943s : 1: add_attr 4.95% : 0.003927s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.10% : 0.000079s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.10% : 0.000075s : 1: auto_monad 0.03% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.63% : 0.000500s : 1: bootstrap 0.05% : 0.000037s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000022s : 1: control_data_broadcast_order 0.02% : 0.000012s : 1: convert_after_rewriter 0.04% : 0.000032s : 1: cse_after_recomputation 0.01% : 0.000006s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.04% : 0.000029s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 30.92% : 0.024507s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.93% : 0.000738s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.06% : 0.000044s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000021s : 1: opt.transform.mutable_eliminate 1.99% : 0.001577s : 78: opt.transform.opt_a 0.06% : 0.000046s : 1: opt.transform.opt_after_cconv 0.05% : 0.000039s : 1: opt.transform.opt_after_jit_grad 0.21% : 0.000163s : 28: opt.transform.opt_b 0.09% : 0.000075s : 2: opt.transform.opt_trans_graph 0.07% : 0.000053s : 4: opt.transform.symbol_engine_opt 4.58% : 0.003628s : 1: opt_a 0.24% : 0.000187s : 1: opt_after_cconv 0.84% : 0.000668s : 1: opt_after_jit_grad 0.37% : 0.000296s : 1: opt_b 38.24% : 0.030306s : 1: optimize 0.03% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.06% : 0.000049s : 1: pre_auto_parallel 0.05% : 0.000039s : 1: py_interpret_to_execute 0.03% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000005s : 1: remove_cast_before_assign_add 0.03% : 0.000023s : 1: remove_dup_value 0.76% : 0.000603s : 1: renormalize.infer 0.55% : 0.000440s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000054s : 1: rewriter_after_opt_a 0.13% : 0.000105s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.000100s : 1: symbol_engine_optimizer 0.14% : 0.000109s : 1: tuple_transform 8.02% : 0.006354s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:00.902.998 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:00.903.274 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.191877, [21] [bootstrap]: 0.00045582 [type_inference]: 0.0060363 [event_method]: 2.132e-05 [auto_monad]: 6.831e-05 [graph_reusing]: 6.31998e-06 [inline]: 2.04e-06 [add_attr]: 0.176357, [1] [add_attr_with_inline]: 0.176343, [1] [Cycle 1]: 0.00010855, [2] [tag_attr]: 2.757e-05 [meta_addattr_fg_expand]: 6.95002e-06 [parallel-infer-symbol]: 4.13001e-06 [pre_auto_parallel]: 4.928e-05 [insert-virtual-dataset]: 3.04999e-06 [parallel-infer-symbol-second]: 8.50006e-07 [dataset_repeat_opt]: 2.22999e-06 [pipeline_split]: 2.32999e-06 [optimize]: 0.00753786, [53] [py_interpret_to_execute]: 4.377e-05 [rewriter_before_opt_a]: 0.0001135 [opt_a]: 0.00435781, [2] [Cycle 1]: 0.00316381, [45] [expand_dump_flag]: 3.10002e-06 [switch_simplify]: 4.761e-05 [loop_unroll]: 3.325e-05 [a_1]: 0.00080331 [with_stream_mark]: 2.904e-05 [recompute_prepare]: 1.604e-05 [updatestate_depend_eliminate]: 7.23999e-06 [updatestate_assign_eliminate]: 5.14e-06 [updatestate_loads_eliminate]: 4.32e-06 [parameter_eliminate]: 2.29999e-06 [a_2]: 0.00015899 [accelerated_algorithm]: 1.288e-05 [shard]: 2.39001e-06 [meta_shard_fg_expand]: 2.64001e-06 [shard_inline]: 1.006e-05 [merge_send_recv]: 1.11e-05 [auto_parallel]: 1.205e-05 [parallel]: 2.303e-05 [flash_sp]: 1.361e-05 [merge_comm]: 7.51999e-06 [allreduce_fusion]: 4.94e-06 [matmul_add_comm_reduction]: 1.465e-05 [allreduce_slice_to_reducescatter]: 9.60019e-07 [virtual_shard_identity]: 1.415e-05 [virtual_dataset]: 1.046e-05 [get_grad_eliminate_]: 8.95999e-06 [virtual_output]: 1.036e-05 [merge_forward]: 6.07999e-06 [cell_reuse_recompute_pass]: 1.60001e-06 [offload_activation]: 1.434e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.589e-05 [merge_recompute_call_nodes]: 2.15002e-06 [before_grad]: 1.89e-05 [set_forward_comm_id_for_comm_node_pass]: 6.59001e-06 [meta_fg_expand]: 4.71002e-06 [flash_sp_send_recv_attached]: 3.91001e-06 [receive_attached]: 2.39999e-06 [after_resolve]: 1.628e-05 [a_after_grad]: 1.609e-05 [renormalize]: 0.00111426 [add_forward_monad_depend]: 1.03e-05 [auto_monad_grad]: 3.78999e-06 [auto_monad_eliminator]: 2.531e-05 [cse]: 4.997e-05 [a_3]: 9.014e-05 [Cycle 2]: 0.00117468, [45] [expand_dump_flag]: 2.56998e-06 [switch_simplify]: 1.121e-05 [loop_unroll]: 9.33002e-06 [a_1]: 0.00023721 [with_stream_mark]: 2.208e-05 [recompute_prepare]: 9.91e-06 [updatestate_depend_eliminate]: 5.79999e-06 [updatestate_assign_eliminate]: 4.54002e-06 [updatestate_loads_eliminate]: 4.48001e-06 [parameter_eliminate]: 1.63002e-06 [a_2]: 0.00014226 [accelerated_algorithm]: 1.057e-05 [shard]: 2.41e-06 [meta_shard_fg_expand]: 2.53e-06 [shard_inline]: 1.04e-05 [merge_send_recv]: 1.008e-05 [auto_parallel]: 1.019e-05 [parallel]: 7.92e-06 [flash_sp]: 4.77e-06 [merge_comm]: 5.68002e-06 [allreduce_fusion]: 7.03e-06 [matmul_add_comm_reduction]: 1.248e-05 [allreduce_slice_to_reducescatter]: 7.30011e-07 [virtual_shard_identity]: 1.196e-05 [virtual_dataset]: 9.25999e-06 [get_grad_eliminate_]: 9.24e-06 [virtual_output]: 9.41e-06 [merge_forward]: 6.17001e-06 [cell_reuse_recompute_pass]: 2.43e-06 [offload_activation]: 1.351e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.256e-05 [merge_recompute_call_nodes]: 1.30999e-06 [before_grad]: 1.536e-05 [set_forward_comm_id_for_comm_node_pass]: 5.62001e-06 [meta_fg_expand]: 3.82998e-06 [flash_sp_send_recv_attached]: 1.70001e-06 [receive_attached]: 2.05002e-06 [after_resolve]: 1.658e-05 [a_after_grad]: 1.504e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 3.04001e-06 [auto_monad_grad]: 1.72001e-06 [auto_monad_eliminator]: 1.434e-05 [cse]: 3.375e-05 [a_3]: 7.028e-05 [py_interpret_to_execute_after_opt_a]: 2.05e-05 [slice_cell_reuse_recomputed_activation]: 4.88001e-06 [rewriter_after_opt_a]: 6.019e-05 [convert_after_rewriter]: 1.38e-05 [order_py_execute_after_rewriter]: 1.04e-05 [mutable_eliminate]: 0.00073481 [opt_b]: 0.00053769, [1] [Cycle 1]: 0.00052552, [7] [b_1]: 0.00029192 [b_2]: 1.26e-05 [updatestate_depend_eliminate]: 1.227e-05 [updatestate_assign_eliminate]: 4.18999e-06 [updatestate_loads_eliminate]: 4.25999e-06 [renormalize]: 8.2e-07 [cse]: 4.274e-05 [optimize_parallel_all_gather_comm]: 2.873e-05 [overlap_param_gather]: 6.15002e-06 [cconv]: 4.187e-05 [loop_unroll]: 0.00052738 [opt_after_cconv]: 0.00016924, [1] [Cycle 1]: 0.00015845, [7] [c_1]: 4.592e-05 [parameter_eliminate]: 4.70001e-06 [updatestate_depend_eliminate]: 8.62998e-06 [updatestate_assign_eliminate]: 4.17e-06 [updatestate_loads_eliminate]: 3.86001e-06 [cse]: 3.219e-05 [renormalize]: 3.30008e-07 [remove_dup_value]: 5.874e-05 [tuple_transform]: 0.00012062, [1] [Cycle 1]: 0.00011291, [4] [d_1]: 6.85e-05 [none_parameter_eliminate]: 1.67999e-06 [renormalize]: 2.40019e-07 [switch_simplify]: 1.003e-05 [partial_unused_args_eliminate]: 4.56002e-06 [add_recomputation]: 7.257e-05 [cse_after_recomputation]: 3.62e-05, [1] [Cycle 1]: 2.833e-05, [1] [cse]: 1.904e-05 [environ_conv]: 1.088e-05 [swap_dp_allreduce_reducescatter]: 9.95002e-06 [bias_add_comm_swap]: 6.29001e-06 [label_micro_interleaved_index]: 9.02999e-06 [label_fine_grained_interleaved_index]: 5.52999e-06 [merge_cast_opt]: 4.14002e-06 [slice_recompute_activation]: 5.11002e-06 [micro_interleaved_order_control]: 4.73001e-06 [assign_add_opt]: 4.09002e-06 [ForceFp32Comm]: 3.40003e-06 [remove_cast_before_assign_add]: 3.75998e-06 [full_micro_interleaved_order_control]: 5.35999e-06 [reorder_send_recv_between_fp_bp]: 6.41e-06 [comm_op_add_attrs]: 3.94002e-06 [add_comm_op_reuse_tag]: 3.58e-06 [interleave_split_concat_branches]: 3.97e-06 [interleave_parallel_branches]: 3.66001e-06 [overlap_opt_shard_in_pipeline]: 3.71999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.44002e-06 [control_data_broadcast_order]: 2.126e-05 [grouped_pairwise_exchange_alltoall]: 4.40999e-06 [offloading_packed_experts]: 8.48999e-06 [overlap_recompute_and_grad_model_parallel]: 8.94e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.80998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.91001e-06 [overlap_recompute_comm]: 4.92999e-06 [overlap_grad_ring_attention]: 7.79002e-06 [overlap_grad_flash_sp]: 3.233e-05 [begin_end_overlap_inline]: 3.11999e-06 [split_matmul_comm_elemetwise]: 5.14e-06 [split_layernorm_comm]: 4.00998e-06 [handle_group_info]: 3.41999e-06 [symbol_engine_optimizer]: 0.00012828, [1] [Cycle 1]: 0.00012, [6] [build]: 4.26001e-06 [elim_shapecalc]: 1.617e-05 [elim_not_effective]: 1.998e-05 [opt_reshape]: 1.177e-05 [fold_const_symbol]: 1.611e-05 [renormalize]: 2.60014e-07 [detach_backward]: 5.15999e-06 [pipeline_parallel_scheduler]: 2.24001e-06 [auto_monad_reorder]: 2.929e-05 [get_jit_bprop_graph]: 2.34999e-06 [rewriter_after_jit_bprop_graph]: 6.41e-06 [opt_after_jit_grad]: 0.00056405 [validate]: 4.985e-05 Sums bootstrap : 0.000456s : 3.40% type_inference : 0.006036s : 44.96% event_method : 0.000021s : 0.16% auto_monad : 0.000068s : 0.51% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.21% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000049s : 0.37% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000044s : 0.33% optimize.rewriter_before_opt_a : 0.000114s : 0.85% optimize.opt_a.expand_dump_flag : 0.000006s : 0.04% optimize.opt_a.switch_simplify : 0.000059s : 0.44% optimize.opt_a.loop_unroll : 0.000043s : 0.32% optimize.opt_a.a_1 : 0.001041s : 7.75% optimize.opt_a.with_stream_mark : 0.000051s : 0.38% optimize.opt_a.recompute_prepare : 0.000026s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000013s : 0.10% optimize.opt_a.updatestate_assign_eliminate : 0.000010s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.07% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000301s : 2.24% optimize.opt_a.accelerated_algorithm : 0.000023s : 0.17% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000020s : 0.15% optimize.opt_a.merge_send_recv : 0.000021s : 0.16% optimize.opt_a.auto_parallel : 0.000022s : 0.17% optimize.opt_a.parallel : 0.000031s : 0.23% optimize.opt_a.flash_sp : 0.000018s : 0.14% optimize.opt_a.merge_comm : 0.000013s : 0.10% optimize.opt_a.allreduce_fusion : 0.000012s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000027s : 0.20% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000026s : 0.19% optimize.opt_a.virtual_dataset : 0.000020s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.14% optimize.opt_a.virtual_output : 0.000020s : 0.15% optimize.opt_a.merge_forward : 0.000012s : 0.09% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000028s : 0.21% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000048s : 0.36% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000034s : 0.26% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.09% optimize.opt_a.meta_fg_expand : 0.000009s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000033s : 0.24% optimize.opt_a.a_after_grad : 0.000031s : 0.23% optimize.opt_a.renormalize : 0.001114s : 8.30% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.10% optimize.opt_a.auto_monad_grad : 0.000006s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000040s : 0.30% optimize.opt_a.cse : 0.000084s : 0.62% optimize.opt_a.a_3 : 0.000160s : 1.19% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000060s : 0.45% optimize.convert_after_rewriter : 0.000014s : 0.10% optimize.order_py_execute_after_rewriter : 0.000010s : 0.08% optimize.mutable_eliminate : 0.000735s : 5.47% optimize.opt_b.b_1 : 0.000292s : 2.17% optimize.opt_b.b_2 : 0.000013s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.09% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000043s : 0.32% optimize.optimize_parallel_all_gather_comm : 0.000029s : 0.21% optimize.overlap_param_gather : 0.000006s : 0.05% optimize.cconv : 0.000042s : 0.31% optimize.loop_unroll : 0.000527s : 3.93% optimize.opt_after_cconv.c_1 : 0.000046s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000032s : 0.24% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000059s : 0.44% optimize.tuple_transform.d_1 : 0.000068s : 0.51% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.03% optimize.add_recomputation : 0.000073s : 0.54% optimize.cse_after_recomputation.cse : 0.000019s : 0.14% optimize.environ_conv : 0.000011s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000009s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000021s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.06% optimize.overlap_grad_flash_sp : 0.000032s : 0.24% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000012s : 0.09% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000029s : 0.22% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000564s : 4.20% validate : 0.000050s : 0.37% Time group info: ------[substitution.] 0.000296 48 17.76% : 0.000053s : 6: substitution.cast_eliminate 0.88% : 0.000003s : 4: substitution.elim_not_effective 0.81% : 0.000002s : 4: substitution.fold_const_symbol 3.03% : 0.000009s : 6: substitution.graph_param_transform 65.11% : 0.000193s : 4: substitution.inline 2.29% : 0.000007s : 8: substitution.j_node_and_user_rematch 2.96% : 0.000009s : 8: substitution.remove_not_recompute_node 2.18% : 0.000006s : 4: substitution.replace_old_param 4.97% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005980 2 86.78% : 0.005190s : 1: type_inference.infer 13.22% : 0.000791s : 1: type_inference.specialize ------[replace.] 0.000073 8 61.47% : 0.000045s : 4: replace.inline 38.53% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000203 8 93.61% : 0.000190s : 4: match.inline 6.39% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000302 1730 0.91% : 0.000003s : 17: predicate.accumulaten_eliminater 0.73% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.63% : 0.000002s : 12: predicate.addn_check_dump 0.88% : 0.000003s : 17: predicate.addn_zero_filter 0.82% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.13% : 0.000006s : 29: predicate.arithmetic_simplify 1.14% : 0.000003s : 17: predicate.cast_eliminate 0.73% : 0.000002s : 12: predicate.check_bprop_eliminate 0.70% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.77% : 0.000002s : 12: predicate.depend_value_elim 0.86% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.89% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.82% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.12% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 6: predicate.elim_not_effective 0.63% : 0.000002s : 6: predicate.elim_shapecalc_of_broadcastargs 1.02% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.02% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.03% : 0.000003s : 23: predicate.environ_get_depend_swap 1.84% : 0.000006s : 35: predicate.environ_get_eliminate 1.03% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.28% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.26% : 0.000007s : 25: predicate.float_depend_g_call 0.60% : 0.000002s : 12: predicate.float_environ_get_switch 0.81% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 6: predicate.fold_const_symbol 0.67% : 0.000002s : 12: predicate.get_grad_eliminate 0.31% : 0.000001s : 6: predicate.graph_param_transform 0.69% : 0.000002s : 12: predicate.incorporate_call 0.58% : 0.000002s : 12: predicate.incorporate_call_switch 6.44% : 0.000019s : 78: predicate.inline 0.95% : 0.000003s : 12: predicate.inline_without_move 0.39% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.93% : 0.000003s : 12: predicate.less_batch_normalization 1.67% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.35% : 0.000007s : 50: predicate.load_eliminater 0.95% : 0.000003s : 6: predicate.loop_unroll_after_grad 1.90% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.64% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.60% : 0.000002s : 12: predicate.merge_addn 0.59% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.70% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 17: predicate.minmaximum_grad 1.47% : 0.000004s : 6: predicate.mutable_eliminate 0.47% : 0.000001s : 6: predicate.opt_reshape 0.36% : 0.000001s : 6: predicate.parallel_virtual_node 1.70% : 0.000005s : 25: predicate.partial_defer_inline 1.48% : 0.000004s : 27: predicate.partial_eliminate 0.88% : 0.000003s : 17: predicate.print_const_string_wrapper 0.69% : 0.000002s : 12: predicate.reduce_all_const_elim 1.01% : 0.000003s : 17: predicate.reduce_eliminate 2.37% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 12: predicate.remove_not_recompute_node 1.49% : 0.000005s : 33: predicate.replace_applicator 0.52% : 0.000002s : 12: predicate.replace_old_param 0.38% : 0.000001s : 6: predicate.reset_defer_inline 0.88% : 0.000003s : 17: predicate.reshape_eliminate 0.60% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 6: predicate.row_tensor_eliminate 0.78% : 0.000002s : 12: predicate.same_eliminate 0.45% : 0.000001s : 12: predicate.set_cell_output_no_recompute 1.00% : 0.000003s : 12: predicate.shard_identity_eliminate 0.91% : 0.000003s : 12: predicate.special_op_eliminate 0.73% : 0.000002s : 12: predicate.specialize_transform 1.21% : 0.000004s : 12: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.31% : 0.000004s : 25: predicate.switch_defer_inline 1.91% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.76% : 0.000014s : 81: predicate.switch_simplify 0.91% : 0.000003s : 17: predicate.tile_eliminate 0.83% : 0.000002s : 17: predicate.transpose_eliminate 1.56% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.50% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.28% : 0.000010s : 45: predicate.tuple_list_get_item_eliminator 1.60% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 2.17% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.63% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.27% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.20% : 0.000010s : 62: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 6: predicate.value_based_eliminate 0.72% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.75% : 0.000002s : 12: predicate.virtual_output_eliminate 0.30% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.51% : 0.000002s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000633 11 51.27% : 0.000324s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.73% : 0.000308s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.378924 192 0.00% : 0.000006s : 1: ForceFp32Comm 46.54% : 0.176370s : 1: add_attr 46.54% : 0.176347s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.02% : 0.000076s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.02% : 0.000077s : 1: auto_monad 0.01% : 0.000038s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.13% : 0.000501s : 1: bootstrap 0.01% : 0.000045s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000024s : 1: control_data_broadcast_order 0.00% : 0.000017s : 1: convert_after_rewriter 0.01% : 0.000040s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000025s : 1: detach_backward 0.00% : 0.000014s : 1: environ_conv 0.01% : 0.000031s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000010s : 1: get_jit_bprop_graph 0.00% : 0.000013s : 1: graph_reusing 0.00% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000010s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000009s : 1: label_fine_grained_interleaved_index 0.00% : 0.000012s : 1: label_micro_interleaved_index 0.14% : 0.000534s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000007s : 1: micro_interleaved_order_control 0.20% : 0.000744s : 1: mutable_eliminate 0.00% : 0.000011s : 1: offloading_packed_experts 0.01% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000027s : 1: opt.transform.mutable_eliminate 0.45% : 0.001713s : 78: opt.transform.opt_a 0.01% : 0.000044s : 1: opt.transform.opt_after_cconv 0.01% : 0.000037s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000223s : 28: opt.transform.opt_b 0.02% : 0.000076s : 2: opt.transform.opt_trans_graph 0.02% : 0.000061s : 4: opt.transform.symbol_engine_opt 1.15% : 0.004362s : 1: opt_a 0.05% : 0.000173s : 1: opt_after_cconv 0.15% : 0.000576s : 1: opt_after_jit_grad 0.14% : 0.000542s : 1: opt_b 2.09% : 0.007937s : 1: optimize 0.01% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000013s : 1: order_py_execute_after_rewriter 0.01% : 0.000035s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000011s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.02% : 0.000058s : 1: pre_auto_parallel 0.01% : 0.000048s : 1: py_interpret_to_execute 0.01% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.02% : 0.000062s : 1: remove_dup_value 0.17% : 0.000635s : 1: renormalize.infer 0.12% : 0.000468s : 1: renormalize.specialize 0.00% : 0.000010s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000064s : 1: rewriter_after_opt_a 0.03% : 0.000119s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.00% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000131s : 1: symbol_engine_optimizer 0.03% : 0.000124s : 1: tuple_transform 1.60% : 0.006080s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:02.862.497 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.156703, [21] [bootstrap]: 0.0004769 [type_inference]: 0.144544 [event_method]: 2.228e-05 [auto_monad]: 7.22e-05 [graph_reusing]: 6.71e-06 [inline]: 3.36999e-06 [add_attr]: 0.00387991, [1] [add_attr_with_inline]: 0.00386505, [1] [Cycle 1]: 8.593e-05, [2] [tag_attr]: 3.004e-05 [meta_addattr_fg_expand]: 6.62002e-06 [parallel-infer-symbol]: 4.10998e-06 [pre_auto_parallel]: 4.701e-05 [insert-virtual-dataset]: 3.38e-06 [parallel-infer-symbol-second]: 1.03001e-06 [dataset_repeat_opt]: 2.09e-06 [pipeline_split]: 1.92999e-06 [optimize]: 0.00675372, [53] [py_interpret_to_execute]: 3.954e-05 [rewriter_before_opt_a]: 0.00011224 [opt_a]: 0.00397282, [2] [Cycle 1]: 0.00298006, [45] [expand_dump_flag]: 3.75e-06 [switch_simplify]: 5.041e-05 [loop_unroll]: 3.362e-05 [a_1]: 0.00083245 [with_stream_mark]: 2.34e-05 [recompute_prepare]: 1.434e-05 [updatestate_depend_eliminate]: 7.26999e-06 [updatestate_assign_eliminate]: 4.55999e-06 [updatestate_loads_eliminate]: 4.72e-06 [parameter_eliminate]: 2.17001e-06 [a_2]: 0.00012614 [accelerated_algorithm]: 1.199e-05 [shard]: 2.89999e-06 [meta_shard_fg_expand]: 3.43e-06 [shard_inline]: 9.80002e-06 [merge_send_recv]: 1.258e-05 [auto_parallel]: 1.155e-05 [parallel]: 2.136e-05 [flash_sp]: 1.216e-05 [merge_comm]: 6.02001e-06 [allreduce_fusion]: 5.26002e-06 [matmul_add_comm_reduction]: 1.333e-05 [allreduce_slice_to_reducescatter]: 9.70002e-07 [virtual_shard_identity]: 1.388e-05 [virtual_dataset]: 1.04e-05 [get_grad_eliminate_]: 9.64999e-06 [virtual_output]: 1.023e-05 [merge_forward]: 6.12999e-06 [cell_reuse_recompute_pass]: 2.25002e-06 [offload_activation]: 1.323e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.327e-05 [merge_recompute_call_nodes]: 1.84998e-06 [before_grad]: 1.844e-05 [set_forward_comm_id_for_comm_node_pass]: 6.21e-06 [meta_fg_expand]: 5.39998e-06 [flash_sp_send_recv_attached]: 3.06999e-06 [receive_attached]: 2.97002e-06 [after_resolve]: 1.726e-05 [a_after_grad]: 1.524e-05 [renormalize]: 0.00114273 [add_forward_monad_depend]: 9.89001e-06 [auto_monad_grad]: 2.81e-06 [auto_monad_eliminator]: 2.752e-05 [cse]: 5.061e-05 [a_3]: 8.099e-05 [Cycle 2]: 0.00097925, [45] [expand_dump_flag]: 2.98e-06 [switch_simplify]: 1.255e-05 [loop_unroll]: 8.90999e-06 [a_1]: 0.00022777 [with_stream_mark]: 2.21e-05 [recompute_prepare]: 9.87999e-06 [updatestate_depend_eliminate]: 6.26998e-06 [updatestate_assign_eliminate]: 4.35999e-06 [updatestate_loads_eliminate]: 4.35e-06 [parameter_eliminate]: 1.54e-06 [a_2]: 0.00011555 [accelerated_algorithm]: 1.024e-05 [shard]: 2.13002e-06 [meta_shard_fg_expand]: 2.81e-06 [shard_inline]: 9.81998e-06 [merge_send_recv]: 9.89001e-06 [auto_parallel]: 1.121e-05 [parallel]: 9.57999e-06 [flash_sp]: 4.33001e-06 [merge_comm]: 7.04001e-06 [allreduce_fusion]: 5.84e-06 [matmul_add_comm_reduction]: 1.311e-05 [allreduce_slice_to_reducescatter]: 7.99977e-07 [virtual_shard_identity]: 1.034e-05 [virtual_dataset]: 9.07001e-06 [get_grad_eliminate_]: 8.67e-06 [virtual_output]: 8.85001e-06 [merge_forward]: 5.42999e-06 [cell_reuse_recompute_pass]: 2.81999e-06 [offload_activation]: 1.312e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.835e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.501e-05 [set_forward_comm_id_for_comm_node_pass]: 6.74001e-06 [meta_fg_expand]: 3.98999e-06 [flash_sp_send_recv_attached]: 1.88997e-06 [receive_attached]: 2.48e-06 [after_resolve]: 1.518e-05 [a_after_grad]: 1.323e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.44999e-06 [auto_monad_grad]: 1.77001e-06 [auto_monad_eliminator]: 1.365e-05 [cse]: 2.978e-05 [a_3]: 5.556e-05 [py_interpret_to_execute_after_opt_a]: 1.78e-05 [slice_cell_reuse_recomputed_activation]: 2.12999e-06 [rewriter_after_opt_a]: 5.596e-05 [convert_after_rewriter]: 1.005e-05 [order_py_execute_after_rewriter]: 6.39001e-06 [mutable_eliminate]: 0.00079176 [opt_b]: 0.00030843, [1] [Cycle 1]: 0.00029967, [7] [b_1]: 0.00018749 [b_2]: 1.211e-05 [updatestate_depend_eliminate]: 1.135e-05 [updatestate_assign_eliminate]: 3.86999e-06 [updatestate_loads_eliminate]: 4.38001e-06 [renormalize]: 5.19998e-07 [cse]: 4.01e-05 [optimize_parallel_all_gather_comm]: 2.395e-05 [overlap_param_gather]: 1.92001e-06 [cconv]: 3.637e-05 [loop_unroll]: 0.00049439 [opt_after_cconv]: 0.00014307, [1] [Cycle 1]: 0.00013678, [7] [c_1]: 4.375e-05 [parameter_eliminate]: 5.09998e-06 [updatestate_depend_eliminate]: 8.79998e-06 [updatestate_assign_eliminate]: 4.12e-06 [updatestate_loads_eliminate]: 3.93001e-06 [cse]: 3.212e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 5.683e-05 [tuple_transform]: 0.00010332, [1] [Cycle 1]: 9.795e-05, [4] [d_1]: 6.517e-05 [none_parameter_eliminate]: 2.04e-06 [renormalize]: 2.9002e-07 [switch_simplify]: 1.005e-05 [partial_unused_args_eliminate]: 2.02999e-06 [add_recomputation]: 7.285e-05 [cse_after_recomputation]: 3.045e-05, [1] [Cycle 1]: 2.516e-05, [1] [cse]: 1.934e-05 [environ_conv]: 8.28999e-06 [swap_dp_allreduce_reducescatter]: 6.89999e-06 [bias_add_comm_swap]: 2.64999e-06 [label_micro_interleaved_index]: 5.27001e-06 [label_fine_grained_interleaved_index]: 2.94999e-06 [merge_cast_opt]: 1.89999e-06 [slice_recompute_activation]: 2.38002e-06 [micro_interleaved_order_control]: 2.62001e-06 [assign_add_opt]: 1.49e-06 [ForceFp32Comm]: 1.04e-06 [remove_cast_before_assign_add]: 1.29998e-06 [full_micro_interleaved_order_control]: 2.09e-06 [reorder_send_recv_between_fp_bp]: 3.07002e-06 [comm_op_add_attrs]: 1.15001e-06 [add_comm_op_reuse_tag]: 1.25001e-06 [interleave_split_concat_branches]: 1.19998e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.47001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.84e-06 [control_data_broadcast_order]: 1.846e-05 [grouped_pairwise_exchange_alltoall]: 1.81998e-06 [offloading_packed_experts]: 4.98001e-06 [overlap_recompute_and_grad_model_parallel]: 5.68002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.54e-06 [overlap_recompute_comm]: 2.28002e-06 [overlap_grad_ring_attention]: 5.24e-06 [overlap_grad_flash_sp]: 2.925e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.27999e-06 [split_layernorm_comm]: 1.92999e-06 [handle_group_info]: 9.70002e-07 [symbol_engine_optimizer]: 0.00010353, [1] [Cycle 1]: 9.79e-05, [6] [build]: 4.48999e-06 [elim_shapecalc]: 1.583e-05 [elim_not_effective]: 1.906e-05 [opt_reshape]: 1.006e-05 [fold_const_symbol]: 1.499e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.30002e-06 [pipeline_parallel_scheduler]: 1.59e-06 [auto_monad_reorder]: 2.37e-05 [get_jit_bprop_graph]: 1.87999e-06 [rewriter_after_jit_bprop_graph]: 7.54002e-06 [opt_after_jit_grad]: 0.00061458 [validate]: 5.832e-05 Sums bootstrap : 0.000477s : 0.31% type_inference : 0.144544s : 95.26% event_method : 0.000022s : 0.01% auto_monad : 0.000072s : 0.05% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000030s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000047s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000040s : 0.03% optimize.rewriter_before_opt_a : 0.000112s : 0.07% optimize.opt_a.expand_dump_flag : 0.000007s : 0.00% optimize.opt_a.switch_simplify : 0.000063s : 0.04% optimize.opt_a.loop_unroll : 0.000043s : 0.03% optimize.opt_a.a_1 : 0.001060s : 0.70% optimize.opt_a.with_stream_mark : 0.000045s : 0.03% optimize.opt_a.recompute_prepare : 0.000024s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000014s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000242s : 0.16% optimize.opt_a.accelerated_algorithm : 0.000022s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.00% optimize.opt_a.shard_inline : 0.000020s : 0.01% optimize.opt_a.merge_send_recv : 0.000022s : 0.01% optimize.opt_a.auto_parallel : 0.000023s : 0.02% optimize.opt_a.parallel : 0.000031s : 0.02% optimize.opt_a.flash_sp : 0.000016s : 0.01% optimize.opt_a.merge_comm : 0.000013s : 0.01% optimize.opt_a.allreduce_fusion : 0.000011s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.02% optimize.opt_a.virtual_dataset : 0.000019s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.01% optimize.opt_a.virtual_output : 0.000019s : 0.01% optimize.opt_a.merge_forward : 0.000012s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000026s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000042s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000033s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000013s : 0.01% optimize.opt_a.meta_fg_expand : 0.000009s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000032s : 0.02% optimize.opt_a.a_after_grad : 0.000028s : 0.02% optimize.opt_a.renormalize : 0.001143s : 0.75% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000041s : 0.03% optimize.opt_a.cse : 0.000080s : 0.05% optimize.opt_a.a_3 : 0.000137s : 0.09% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000056s : 0.04% optimize.convert_after_rewriter : 0.000010s : 0.01% optimize.order_py_execute_after_rewriter : 0.000006s : 0.00% optimize.mutable_eliminate : 0.000792s : 0.52% optimize.opt_b.b_1 : 0.000187s : 0.12% optimize.opt_b.b_2 : 0.000012s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000040s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000036s : 0.02% optimize.loop_unroll : 0.000494s : 0.33% optimize.opt_after_cconv.c_1 : 0.000044s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000032s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000057s : 0.04% optimize.tuple_transform.d_1 : 0.000065s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000073s : 0.05% optimize.cse_after_recomputation.cse : 0.000019s : 0.01% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000029s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000024s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000008s : 0.00% opt_after_jit_grad : 0.000615s : 0.41% validate : 0.000058s : 0.04% Time group info: ------[substitution.] 0.000298 48 14.33% : 0.000043s : 6: substitution.cast_eliminate 0.93% : 0.000003s : 4: substitution.elim_not_effective 0.71% : 0.000002s : 4: substitution.fold_const_symbol 2.79% : 0.000008s : 6: substitution.graph_param_transform 68.51% : 0.000204s : 4: substitution.inline 2.38% : 0.000007s : 8: substitution.j_node_and_user_rematch 2.98% : 0.000009s : 8: substitution.remove_not_recompute_node 2.38% : 0.000007s : 4: substitution.replace_old_param 4.99% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.144472 2 99.40% : 0.143605s : 1: type_inference.infer 0.60% : 0.000867s : 1: type_inference.specialize ------[replace.] 0.000076 8 62.03% : 0.000047s : 4: replace.inline 37.97% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000213 8 93.96% : 0.000200s : 4: match.inline 6.04% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000299 1730 0.81% : 0.000002s : 17: predicate.accumulaten_eliminater 1.03% : 0.000003s : 6: predicate.ad_related_special_op_eliminate 0.66% : 0.000002s : 12: predicate.addn_check_dump 0.86% : 0.000003s : 17: predicate.addn_zero_filter 0.81% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.15% : 0.000006s : 29: predicate.arithmetic_simplify 1.24% : 0.000004s : 17: predicate.cast_eliminate 0.61% : 0.000002s : 12: predicate.check_bprop_eliminate 0.62% : 0.000002s : 12: predicate.compare_switch_simplify 0.17% : 0.000001s : 6: predicate.const_output_eliminate 0.67% : 0.000002s : 12: predicate.depend_value_elim 0.88% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.06% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.91% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.18% : 0.000004s : 12: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 6: predicate.elim_not_effective 0.52% : 0.000002s : 6: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.18% : 0.000004s : 23: predicate.environ_get_add_eliminate 1.07% : 0.000003s : 23: predicate.environ_get_depend_swap 1.68% : 0.000005s : 35: predicate.environ_get_eliminate 1.19% : 0.000004s : 23: predicate.environ_get_set_eliminate 1.27% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.19% : 0.000007s : 25: predicate.float_depend_g_call 0.58% : 0.000002s : 12: predicate.float_environ_get_switch 0.96% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 6: predicate.fold_const_symbol 0.78% : 0.000002s : 12: predicate.get_grad_eliminate 0.26% : 0.000001s : 6: predicate.graph_param_transform 0.67% : 0.000002s : 12: predicate.incorporate_call 0.54% : 0.000002s : 12: predicate.incorporate_call_switch 6.22% : 0.000019s : 78: predicate.inline 0.89% : 0.000003s : 12: predicate.inline_without_move 0.31% : 0.000001s : 12: predicate.j_node_and_user_rematch 1.29% : 0.000004s : 12: predicate.less_batch_normalization 1.91% : 0.000006s : 33: predicate.list_to_tuple_eliminator_ 2.44% : 0.000007s : 50: predicate.load_eliminater 1.07% : 0.000003s : 6: predicate.loop_unroll_after_grad 1.94% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.54% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.67% : 0.000002s : 12: predicate.merge_addn 0.64% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.60% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.85% : 0.000003s : 17: predicate.minmaximum_grad 1.34% : 0.000004s : 6: predicate.mutable_eliminate 0.45% : 0.000001s : 6: predicate.opt_reshape 0.38% : 0.000001s : 6: predicate.parallel_virtual_node 1.43% : 0.000004s : 25: predicate.partial_defer_inline 1.46% : 0.000004s : 27: predicate.partial_eliminate 0.94% : 0.000003s : 17: predicate.print_const_string_wrapper 0.63% : 0.000002s : 12: predicate.reduce_all_const_elim 1.23% : 0.000004s : 17: predicate.reduce_eliminate 2.37% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 12: predicate.remove_not_recompute_node 1.22% : 0.000004s : 33: predicate.replace_applicator 0.50% : 0.000001s : 12: predicate.replace_old_param 0.29% : 0.000001s : 6: predicate.reset_defer_inline 0.93% : 0.000003s : 17: predicate.reshape_eliminate 0.64% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.50% : 0.000001s : 6: predicate.row_tensor_eliminate 0.93% : 0.000003s : 12: predicate.same_eliminate 0.40% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.87% : 0.000003s : 12: predicate.shard_identity_eliminate 0.77% : 0.000002s : 12: predicate.special_op_eliminate 0.75% : 0.000002s : 12: predicate.specialize_transform 0.99% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 0.88% : 0.000003s : 12: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.40% : 0.000004s : 25: predicate.switch_defer_inline 1.96% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.54% : 0.000014s : 81: predicate.switch_simplify 0.89% : 0.000003s : 17: predicate.tile_eliminate 0.88% : 0.000003s : 17: predicate.transpose_eliminate 1.59% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.54% : 0.000005s : 29: predicate.tuple_list_get_item_depend_reorder 3.08% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.46% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.42% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.68% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.44% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 2.96% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 6: predicate.value_based_eliminate 0.70% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 12: predicate.virtual_output_eliminate 0.28% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.33% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000687 11 52.71% : 0.000362s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.29% : 0.000325s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.170450 192 0.00% : 0.000004s : 1: ForceFp32Comm 2.28% : 0.003886s : 1: add_attr 2.27% : 0.003870s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000078s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.05% : 0.000078s : 1: auto_monad 0.02% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.30% : 0.000511s : 1: bootstrap 0.02% : 0.000041s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000022s : 1: control_data_broadcast_order 0.01% : 0.000013s : 1: convert_after_rewriter 0.02% : 0.000033s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.02% : 0.000031s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.30% : 0.000503s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.47% : 0.000803s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000026s : 1: opt.transform.mutable_eliminate 1.01% : 0.001716s : 78: opt.transform.opt_a 0.02% : 0.000042s : 1: opt.transform.opt_after_cconv 0.02% : 0.000041s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000165s : 28: opt.transform.opt_b 0.04% : 0.000073s : 2: opt.transform.opt_trans_graph 0.03% : 0.000056s : 4: opt.transform.symbol_engine_opt 2.33% : 0.003976s : 1: opt_a 0.09% : 0.000148s : 1: opt_after_cconv 0.37% : 0.000626s : 1: opt_after_jit_grad 0.18% : 0.000312s : 1: opt_b 3.97% : 0.006760s : 1: optimize 0.02% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000010s : 1: order_py_execute_after_rewriter 0.02% : 0.000033s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000051s : 1: pre_auto_parallel 0.03% : 0.000045s : 1: py_interpret_to_execute 0.01% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000062s : 1: remove_dup_value 0.38% : 0.000646s : 1: renormalize.infer 0.28% : 0.000482s : 1: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000060s : 1: rewriter_after_opt_a 0.07% : 0.000117s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.06% : 0.000107s : 1: symbol_engine_optimizer 0.06% : 0.000106s : 1: tuple_transform 84.81% : 0.144565s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:04.953.203 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:04.953.536 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.184508, [21] [bootstrap]: 0.00070623 [type_inference]: 0.00839312 [event_method]: 1.96e-05 [auto_monad]: 6.773e-05 [graph_reusing]: 5.74e-06 [inline]: 3.86999e-06 [add_attr]: 0.00489787, [1] [add_attr_with_inline]: 0.00488465, [1] [Cycle 1]: 9.524e-05, [2] [tag_attr]: 2.548e-05 [meta_addattr_fg_expand]: 6.29999e-06 [parallel-infer-symbol]: 4.12e-06 [pre_auto_parallel]: 4.21e-05 [insert-virtual-dataset]: 2.31998e-06 [parallel-infer-symbol-second]: 9.70002e-07 [dataset_repeat_opt]: 2.16e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.167402, [53] [py_interpret_to_execute]: 3.864e-05 [rewriter_before_opt_a]: 0.00010638 [opt_a]: 0.164179, [2] [Cycle 1]: 0.162978, [45] [expand_dump_flag]: 3.05002e-06 [switch_simplify]: 4.457e-05 [loop_unroll]: 3.145e-05 [a_1]: 0.0007502 [with_stream_mark]: 1.937e-05 [recompute_prepare]: 1.189e-05 [updatestate_depend_eliminate]: 5.24e-06 [updatestate_assign_eliminate]: 4.15e-06 [updatestate_loads_eliminate]: 4.27e-06 [parameter_eliminate]: 2.93e-06 [a_2]: 0.00014339 [accelerated_algorithm]: 9.84999e-06 [shard]: 2.81e-06 [meta_shard_fg_expand]: 2.68998e-06 [shard_inline]: 7.97e-06 [merge_send_recv]: 1.147e-05 [auto_parallel]: 9.71e-06 [parallel]: 2.297e-05 [flash_sp]: 1.124e-05 [merge_comm]: 5.61998e-06 [allreduce_fusion]: 4.75001e-06 [matmul_add_comm_reduction]: 1.227e-05 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 1.139e-05 [virtual_dataset]: 1.014e-05 [get_grad_eliminate_]: 8.82999e-06 [virtual_output]: 9.46e-06 [merge_forward]: 4.93001e-06 [cell_reuse_recompute_pass]: 2.03002e-06 [offload_activation]: 1.237e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.894e-05 [merge_recompute_call_nodes]: 1.89e-06 [before_grad]: 1.656e-05 [set_forward_comm_id_for_comm_node_pass]: 4.79e-06 [meta_fg_expand]: 4.45e-06 [flash_sp_send_recv_attached]: 2.66999e-06 [receive_attached]: 2.90002e-06 [after_resolve]: 1.78e-05 [a_after_grad]: 1.356e-05 [renormalize]: 0.161026 [add_forward_monad_depend]: 1.317e-05 [auto_monad_grad]: 2.94001e-06 [auto_monad_eliminator]: 3.094e-05 [cse]: 5.064e-05 [a_3]: 0.00010142 [Cycle 2]: 0.00118009, [45] [expand_dump_flag]: 2.56e-06 [switch_simplify]: 1.132e-05 [loop_unroll]: 1.091e-05 [a_1]: 0.00024246 [with_stream_mark]: 2.421e-05 [recompute_prepare]: 8.64e-06 [updatestate_depend_eliminate]: 5.20001e-06 [updatestate_assign_eliminate]: 3.99002e-06 [updatestate_loads_eliminate]: 3.81001e-06 [parameter_eliminate]: 2.80002e-06 [a_2]: 0.00016244 [accelerated_algorithm]: 1.035e-05 [shard]: 3.73999e-06 [meta_shard_fg_expand]: 3.49001e-06 [shard_inline]: 7.99002e-06 [merge_send_recv]: 1.085e-05 [auto_parallel]: 1.183e-05 [parallel]: 1.263e-05 [flash_sp]: 5.74e-06 [merge_comm]: 5.49e-06 [allreduce_fusion]: 5.91e-06 [matmul_add_comm_reduction]: 1.221e-05 [allreduce_slice_to_reducescatter]: 7.80012e-07 [virtual_shard_identity]: 1.101e-05 [virtual_dataset]: 1.016e-05 [get_grad_eliminate_]: 1.015e-05 [virtual_output]: 8.32998e-06 [merge_forward]: 5.12999e-06 [cell_reuse_recompute_pass]: 5.49e-06 [offload_activation]: 1.254e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.821e-05 [merge_recompute_call_nodes]: 1.80001e-06 [before_grad]: 1.447e-05 [set_forward_comm_id_for_comm_node_pass]: 4.80001e-06 [meta_fg_expand]: 5.00001e-06 [flash_sp_send_recv_attached]: 1.76e-06 [receive_attached]: 2.91999e-06 [after_resolve]: 1.681e-05 [a_after_grad]: 1.357e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.67999e-06 [auto_monad_grad]: 1.77001e-06 [auto_monad_eliminator]: 1.133e-05 [cse]: 2.232e-05 [a_3]: 6.188e-05 [py_interpret_to_execute_after_opt_a]: 2.863e-05 [slice_cell_reuse_recomputed_activation]: 5.06002e-06 [rewriter_after_opt_a]: 5.501e-05 [convert_after_rewriter]: 1.081e-05 [order_py_execute_after_rewriter]: 8.68001e-06 [mutable_eliminate]: 0.00100137 [opt_b]: 0.00034583, [1] [Cycle 1]: 0.00033234, [7] [b_1]: 0.0002183 [b_2]: 1.194e-05 [updatestate_depend_eliminate]: 8.92999e-06 [updatestate_assign_eliminate]: 3.14001e-06 [updatestate_loads_eliminate]: 3.08998e-06 [renormalize]: 6.30011e-07 [cse]: 2.676e-05 [optimize_parallel_all_gather_comm]: 2.348e-05 [overlap_param_gather]: 6.07999e-06 [cconv]: 3.809e-05 [loop_unroll]: 0.00057929 [opt_after_cconv]: 0.00015399, [1] [Cycle 1]: 0.00014373, [7] [c_1]: 3.947e-05 [parameter_eliminate]: 4.68001e-06 [updatestate_depend_eliminate]: 7.24001e-06 [updatestate_assign_eliminate]: 3.27002e-06 [updatestate_loads_eliminate]: 3.11001e-06 [cse]: 2.725e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 2.237e-05 [tuple_transform]: 0.00010833, [1] [Cycle 1]: 0.00010025, [4] [d_1]: 5.687e-05 [none_parameter_eliminate]: 1.96998e-06 [renormalize]: 3.19997e-07 [switch_simplify]: 9.20999e-06 [partial_unused_args_eliminate]: 4.76002e-06 [add_recomputation]: 7.205e-05 [cse_after_recomputation]: 3.537e-05, [1] [Cycle 1]: 2.767e-05, [1] [cse]: 1.818e-05 [environ_conv]: 9.81998e-06 [swap_dp_allreduce_reducescatter]: 8.92e-06 [bias_add_comm_swap]: 5.91e-06 [label_micro_interleaved_index]: 8.35999e-06 [label_fine_grained_interleaved_index]: 5.10001e-06 [merge_cast_opt]: 4.33001e-06 [slice_recompute_activation]: 5.05001e-06 [micro_interleaved_order_control]: 5.15999e-06 [assign_add_opt]: 3.78001e-06 [ForceFp32Comm]: 3.32002e-06 [remove_cast_before_assign_add]: 3.53e-06 [full_micro_interleaved_order_control]: 4.42e-06 [reorder_send_recv_between_fp_bp]: 6.06e-06 [comm_op_add_attrs]: 3.31001e-06 [add_comm_op_reuse_tag]: 3.50998e-06 [interleave_split_concat_branches]: 3.78001e-06 [interleave_parallel_branches]: 3.66999e-06 [overlap_opt_shard_in_pipeline]: 3.83999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.10998e-06 [control_data_broadcast_order]: 1.899e-05 [grouped_pairwise_exchange_alltoall]: 4.4e-06 [offloading_packed_experts]: 7.87003e-06 [overlap_recompute_and_grad_model_parallel]: 1.043e-05 [overlap_grad_matmul_and_grad_allreduce]: 3.85998e-06 [overlap_recompute_allgather_and_fa_grad]: 4.48999e-06 [overlap_recompute_comm]: 8.30999e-06 [overlap_grad_ring_attention]: 8.06001e-06 [overlap_grad_flash_sp]: 3.17e-05 [begin_end_overlap_inline]: 2.98003e-06 [split_matmul_comm_elemetwise]: 4.67e-06 [split_layernorm_comm]: 4.24002e-06 [handle_group_info]: 3.53e-06 [symbol_engine_optimizer]: 0.00013244, [1] [Cycle 1]: 0.00012486, [6] [build]: 7.8e-06 [elim_shapecalc]: 1.525e-05 [elim_not_effective]: 2.028e-05 [opt_reshape]: 1.232e-05 [fold_const_symbol]: 1.49e-05 [renormalize]: 1.8999e-07 [detach_backward]: 6.68998e-06 [pipeline_parallel_scheduler]: 2.66999e-06 [auto_monad_reorder]: 3.092e-05 [get_jit_bprop_graph]: 2.51e-06 [rewriter_after_jit_bprop_graph]: 7.51001e-06 [opt_after_jit_grad]: 0.00078248 [validate]: 5.482e-05 Sums bootstrap : 0.000706s : 0.40% type_inference : 0.008393s : 4.76% event_method : 0.000020s : 0.01% auto_monad : 0.000068s : 0.04% graph_reusing : 0.000006s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000042s : 0.02% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000039s : 0.02% optimize.rewriter_before_opt_a : 0.000106s : 0.06% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000056s : 0.03% optimize.opt_a.loop_unroll : 0.000042s : 0.02% optimize.opt_a.a_1 : 0.000993s : 0.56% optimize.opt_a.with_stream_mark : 0.000044s : 0.02% optimize.opt_a.recompute_prepare : 0.000021s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_a.parameter_eliminate : 0.000006s : 0.00% optimize.opt_a.a_2 : 0.000306s : 0.17% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.01% optimize.opt_a.shard : 0.000007s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.00% optimize.opt_a.shard_inline : 0.000016s : 0.01% optimize.opt_a.merge_send_recv : 0.000022s : 0.01% optimize.opt_a.auto_parallel : 0.000022s : 0.01% optimize.opt_a.parallel : 0.000036s : 0.02% optimize.opt_a.flash_sp : 0.000017s : 0.01% optimize.opt_a.merge_comm : 0.000011s : 0.01% optimize.opt_a.allreduce_fusion : 0.000011s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.01% optimize.opt_a.virtual_dataset : 0.000020s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.01% optimize.opt_a.virtual_output : 0.000018s : 0.01% optimize.opt_a.merge_forward : 0.000010s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000008s : 0.00% optimize.opt_a.offload_activation : 0.000025s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000031s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.01% optimize.opt_a.meta_fg_expand : 0.000009s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000006s : 0.00% optimize.opt_a.after_resolve : 0.000035s : 0.02% optimize.opt_a.a_after_grad : 0.000027s : 0.02% optimize.opt_a.renormalize : 0.161026s : 91.39% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000042s : 0.02% optimize.opt_a.cse : 0.000073s : 0.04% optimize.opt_a.a_3 : 0.000163s : 0.09% optimize.py_interpret_to_execute_after_opt_a : 0.000029s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000055s : 0.03% optimize.convert_after_rewriter : 0.000011s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.00% optimize.mutable_eliminate : 0.001001s : 0.57% optimize.opt_b.b_1 : 0.000218s : 0.12% optimize.opt_b.b_2 : 0.000012s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.01% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.000038s : 0.02% optimize.loop_unroll : 0.000579s : 0.33% optimize.opt_after_cconv.c_1 : 0.000039s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000027s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000022s : 0.01% optimize.tuple_transform.d_1 : 0.000057s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000072s : 0.04% optimize.cse_after_recomputation.cse : 0.000018s : 0.01% optimize.environ_conv : 0.000010s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000004s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000003s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000019s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000010s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000008s : 0.00% optimize.overlap_grad_ring_attention : 0.000008s : 0.00% optimize.overlap_grad_flash_sp : 0.000032s : 0.02% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000008s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000012s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000007s : 0.00% pipeline_parallel_scheduler : 0.000003s : 0.00% auto_monad_reorder : 0.000031s : 0.02% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000008s : 0.00% opt_after_jit_grad : 0.000782s : 0.44% validate : 0.000055s : 0.03% Time group info: ------[substitution.] 0.000278 38 17.48% : 0.000049s : 3: substitution.cast_eliminate 1.33% : 0.000004s : 3: substitution.elim_not_effective 0.60% : 0.000002s : 3: substitution.fold_const_symbol 2.65% : 0.000007s : 5: substitution.graph_param_transform 64.63% : 0.000179s : 4: substitution.inline 2.33% : 0.000006s : 6: substitution.j_node_and_user_rematch 3.02% : 0.000008s : 6: substitution.remove_not_recompute_node 2.55% : 0.000007s : 4: substitution.replace_old_param 5.39% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.008327 2 89.89% : 0.007486s : 1: type_inference.infer 10.11% : 0.000842s : 1: type_inference.specialize ------[replace.] 0.000073 8 64.05% : 0.000047s : 4: replace.inline 35.95% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000190 8 93.10% : 0.000177s : 4: match.inline 6.90% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000303 1504 0.79% : 0.000002s : 15: predicate.accumulaten_eliminater 0.58% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 10: predicate.addn_check_dump 1.20% : 0.000004s : 15: predicate.addn_zero_filter 0.92% : 0.000003s : 15: predicate.adjust_all_reduce_mul_add 2.31% : 0.000007s : 25: predicate.arithmetic_simplify 0.96% : 0.000003s : 15: predicate.cast_eliminate 0.77% : 0.000002s : 10: predicate.check_bprop_eliminate 0.48% : 0.000001s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.67% : 0.000002s : 10: predicate.depend_value_elim 0.99% : 0.000003s : 15: predicate.dict_get_item_const_eliminator 1.80% : 0.000005s : 15: predicate.dict_get_item_eliminator 0.86% : 0.000003s : 15: predicate.dict_set_item_eliminator 1.06% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 5: predicate.elim_not_effective 0.74% : 0.000002s : 5: predicate.elim_shapecalc_of_broadcastargs 1.04% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.38% : 0.000004s : 20: predicate.environ_get_add_eliminate 0.99% : 0.000003s : 20: predicate.environ_get_depend_swap 1.84% : 0.000006s : 30: predicate.environ_get_eliminate 0.88% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.11% : 0.000003s : 23: predicate.exchange_switch_depend_value 1.90% : 0.000006s : 23: predicate.float_depend_g_call 0.56% : 0.000002s : 10: predicate.float_environ_get_switch 1.29% : 0.000004s : 15: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 5: predicate.fold_const_symbol 0.78% : 0.000002s : 10: predicate.get_grad_eliminate 0.18% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000002s : 10: predicate.incorporate_call 0.47% : 0.000001s : 10: predicate.incorporate_call_switch 5.24% : 0.000016s : 68: predicate.inline 0.66% : 0.000002s : 10: predicate.inline_without_move 0.26% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.05% : 0.000003s : 10: predicate.less_batch_normalization 1.58% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.12% : 0.000006s : 44: predicate.load_eliminater 0.75% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.76% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.98% : 0.000006s : 25: predicate.make_slice_get_slice_eliminator 0.54% : 0.000002s : 10: predicate.merge_addn 0.49% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.76% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.65% : 0.000002s : 15: predicate.minmaximum_grad 1.05% : 0.000003s : 5: predicate.mutable_eliminate 0.55% : 0.000002s : 5: predicate.opt_reshape 0.33% : 0.000001s : 5: predicate.parallel_virtual_node 1.47% : 0.000004s : 23: predicate.partial_defer_inline 1.28% : 0.000004s : 24: predicate.partial_eliminate 0.81% : 0.000002s : 15: predicate.print_const_string_wrapper 0.76% : 0.000002s : 10: predicate.reduce_all_const_elim 1.53% : 0.000005s : 15: predicate.reduce_eliminate 2.05% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.29% : 0.000001s : 10: predicate.remove_not_recompute_node 1.19% : 0.000004s : 29: predicate.replace_applicator 0.42% : 0.000001s : 10: predicate.replace_old_param 0.28% : 0.000001s : 5: predicate.reset_defer_inline 1.94% : 0.000006s : 15: predicate.reshape_eliminate 1.27% : 0.000004s : 10: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 5: predicate.row_tensor_eliminate 1.10% : 0.000003s : 10: predicate.same_eliminate 0.37% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.83% : 0.000003s : 10: predicate.shard_identity_eliminate 0.90% : 0.000003s : 10: predicate.special_op_eliminate 0.72% : 0.000002s : 10: predicate.specialize_transform 1.15% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 1.06% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.27% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.23% : 0.000004s : 23: predicate.switch_defer_inline 2.24% : 0.000007s : 33: predicate.switch_layer_defer_inline 3.90% : 0.000012s : 74: predicate.switch_simplify 0.96% : 0.000003s : 15: predicate.tile_eliminate 1.85% : 0.000006s : 15: predicate.transpose_eliminate 2.76% : 0.000008s : 25: predicate.tuple_list_convert_item_index_to_positive 1.44% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 2.86% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.42% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.70% : 0.000008s : 35: predicate.tuple_list_set_item_eliminator 1.72% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 1.92% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 2.62% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.49% : 0.000001s : 5: predicate.value_based_eliminate 0.55% : 0.000002s : 10: predicate.virtual_dataset_eliminate 1.10% : 0.000003s : 10: predicate.virtual_output_eliminate 0.23% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.57% : 0.000002s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000769 11 45.34% : 0.000349s : 5: func_graph_cloner_run.FuncGraphClonerGraph 54.66% : 0.000421s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.519536 192 0.00% : 0.000006s : 1: ForceFp32Comm 0.95% : 0.004911s : 1: add_attr 0.94% : 0.004889s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.01% : 0.000076s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.02% : 0.000079s : 1: auto_monad 0.01% : 0.000039s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.15% : 0.000765s : 1: bootstrap 0.01% : 0.000041s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.00% : 0.000022s : 1: control_data_broadcast_order 0.00% : 0.000014s : 1: convert_after_rewriter 0.01% : 0.000039s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000087s : 1: detach_backward 0.00% : 0.000013s : 1: environ_conv 0.01% : 0.000030s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000010s : 1: get_jit_bprop_graph 0.00% : 0.000012s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000010s : 1: inline 0.00% : 0.000008s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.11% : 0.000587s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.19% : 0.001008s : 1: mutable_eliminate 0.00% : 0.000011s : 1: offloading_packed_experts 0.00% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000020s : 1: opt.transform.mutable_eliminate 0.31% : 0.001619s : 78: opt.transform.opt_a 0.01% : 0.000038s : 1: opt.transform.opt_after_cconv 0.01% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000152s : 28: opt.transform.opt_b 0.01% : 0.000063s : 2: opt.transform.opt_trans_graph 0.01% : 0.000059s : 4: opt.transform.symbol_engine_opt 31.60% : 0.164184s : 1: opt_a 0.03% : 0.000158s : 1: opt_after_cconv 0.15% : 0.000795s : 1: opt_after_jit_grad 0.07% : 0.000350s : 1: opt_b 32.53% : 0.169002s : 1: optimize 0.01% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000012s : 1: order_py_execute_after_rewriter 0.01% : 0.000035s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000014s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000013s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000011s : 1: overlap_recompute_comm 0.00% : 0.000012s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000012s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.01% : 0.000050s : 1: pre_auto_parallel 0.01% : 0.000042s : 1: py_interpret_to_execute 0.01% : 0.000032s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.00% : 0.000026s : 1: remove_dup_value 30.86% : 0.160323s : 1: renormalize.infer 0.13% : 0.000680s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000059s : 1: rewriter_after_opt_a 0.02% : 0.000112s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.00% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000136s : 1: symbol_engine_optimizer 0.02% : 0.000111s : 1: tuple_transform 1.63% : 0.008450s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:08.726.872 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.229468, [21] [bootstrap]: 0.00042882 [type_inference]: 0.21847 [event_method]: 1.961e-05 [auto_monad]: 6.772e-05 [graph_reusing]: 5.94e-06 [inline]: 2.66e-06 [add_attr]: 0.00345014, [1] [add_attr_with_inline]: 0.00343877, [1] [Cycle 1]: 6.898e-05, [2] [tag_attr]: 2.309e-05 [meta_addattr_fg_expand]: 6.09999e-06 [parallel-infer-symbol]: 3.73999e-06 [pre_auto_parallel]: 4.083e-05 [insert-virtual-dataset]: 2.48e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 2.37999e-06 [pipeline_split]: 1.76998e-06 [optimize]: 0.00624801, [53] [py_interpret_to_execute]: 2.995e-05 [rewriter_before_opt_a]: 0.00010994 [opt_a]: 0.0036348, [2] [Cycle 1]: 0.00267069, [45] [expand_dump_flag]: 3.32002e-06 [switch_simplify]: 4.683e-05 [loop_unroll]: 3.115e-05 [a_1]: 0.00083692 [with_stream_mark]: 2.192e-05 [recompute_prepare]: 1.21e-05 [updatestate_depend_eliminate]: 5.51002e-06 [updatestate_assign_eliminate]: 4.08999e-06 [updatestate_loads_eliminate]: 3.83999e-06 [parameter_eliminate]: 2.15002e-06 [a_2]: 0.00010566 [accelerated_algorithm]: 9.12001e-06 [shard]: 1.86e-06 [meta_shard_fg_expand]: 3.22997e-06 [shard_inline]: 7.9e-06 [merge_send_recv]: 1.016e-05 [auto_parallel]: 8.25999e-06 [parallel]: 2.143e-05 [flash_sp]: 1.17e-05 [merge_comm]: 4.77998e-06 [allreduce_fusion]: 4.43999e-06 [matmul_add_comm_reduction]: 1.165e-05 [allreduce_slice_to_reducescatter]: 1.19e-06 [virtual_shard_identity]: 1.151e-05 [virtual_dataset]: 8.57e-06 [get_grad_eliminate_]: 8.43999e-06 [virtual_output]: 8.95999e-06 [merge_forward]: 4.55999e-06 [cell_reuse_recompute_pass]: 1.67999e-06 [offload_activation]: 1.216e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.756e-05 [merge_recompute_call_nodes]: 1.67999e-06 [before_grad]: 1.464e-05 [set_forward_comm_id_for_comm_node_pass]: 5.22e-06 [meta_fg_expand]: 3.65e-06 [flash_sp_send_recv_attached]: 2.53e-06 [receive_attached]: 2.91999e-06 [after_resolve]: 1.467e-05 [a_after_grad]: 1.26e-05 [renormalize]: 0.00096414 [add_forward_monad_depend]: 6.73998e-06 [auto_monad_grad]: 3.29001e-06 [auto_monad_eliminator]: 2.058e-05 [cse]: 4.211e-05 [a_3]: 6.446e-05 [Cycle 2]: 0.00095263, [45] [expand_dump_flag]: 2.31e-06 [switch_simplify]: 9.35001e-06 [loop_unroll]: 7.5e-06 [a_1]: 0.00017888 [with_stream_mark]: 1.582e-05 [recompute_prepare]: 7.81001e-06 [updatestate_depend_eliminate]: 4.1e-06 [updatestate_assign_eliminate]: 3.16999e-06 [updatestate_loads_eliminate]: 2.94999e-06 [parameter_eliminate]: 1.26002e-06 [a_2]: 0.00022992 [accelerated_algorithm]: 9.80002e-06 [shard]: 2.02001e-06 [meta_shard_fg_expand]: 2.01e-06 [shard_inline]: 8.45999e-06 [merge_send_recv]: 8.08999e-06 [auto_parallel]: 8.56002e-06 [parallel]: 6.11e-06 [flash_sp]: 4.09002e-06 [merge_comm]: 4.49998e-06 [allreduce_fusion]: 4.73001e-06 [matmul_add_comm_reduction]: 1.061e-05 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 8.99998e-06 [virtual_dataset]: 8.28999e-06 [get_grad_eliminate_]: 7.93999e-06 [virtual_output]: 7.58001e-06 [merge_forward]: 5.07e-06 [cell_reuse_recompute_pass]: 2.55997e-06 [offload_activation]: 8.90001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.704e-05 [merge_recompute_call_nodes]: 8.59989e-07 [before_grad]: 1.276e-05 [set_forward_comm_id_for_comm_node_pass]: 5.29e-06 [meta_fg_expand]: 2.96001e-06 [flash_sp_send_recv_attached]: 8.79983e-07 [receive_attached]: 1.99e-06 [after_resolve]: 1.304e-05 [a_after_grad]: 1.138e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.09e-06 [auto_monad_grad]: 1.66e-06 [auto_monad_eliminator]: 1.057e-05 [cse]: 2.229e-05 [a_3]: 4.704e-05 [py_interpret_to_execute_after_opt_a]: 1.422e-05 [slice_cell_reuse_recomputed_activation]: 2.09999e-06 [rewriter_after_opt_a]: 4.447e-05 [convert_after_rewriter]: 7.79002e-06 [order_py_execute_after_rewriter]: 6.01998e-06 [mutable_eliminate]: 0.00086943 [opt_b]: 0.00025033, [1] [Cycle 1]: 0.00024367, [7] [b_1]: 0.00015794 [b_2]: 9.68997e-06 [updatestate_depend_eliminate]: 7.55e-06 [updatestate_assign_eliminate]: 3.5e-06 [updatestate_loads_eliminate]: 3.07002e-06 [renormalize]: 6.69999e-07 [cse]: 2.52e-05 [optimize_parallel_all_gather_comm]: 1.926e-05 [overlap_param_gather]: 1.97001e-06 [cconv]: 3.306e-05 [loop_unroll]: 0.00047591 [opt_after_cconv]: 0.00012108, [1] [Cycle 1]: 0.0001149, [7] [c_1]: 3.956e-05 [parameter_eliminate]: 3.77002e-06 [updatestate_depend_eliminate]: 6.03002e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 3.08e-06 [cse]: 2.371e-05 [renormalize]: 3.99974e-07 [remove_dup_value]: 1.666e-05 [tuple_transform]: 8.725e-05, [1] [Cycle 1]: 8.254e-05, [4] [d_1]: 5.258e-05 [none_parameter_eliminate]: 1.89999e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 8.68001e-06 [partial_unused_args_eliminate]: 2.26998e-06 [add_recomputation]: 6.347e-05 [cse_after_recomputation]: 2.613e-05, [1] [Cycle 1]: 2.115e-05, [1] [cse]: 1.557e-05 [environ_conv]: 6.94001e-06 [swap_dp_allreduce_reducescatter]: 6.56999e-06 [bias_add_comm_swap]: 2.83e-06 [label_micro_interleaved_index]: 4.53001e-06 [label_fine_grained_interleaved_index]: 2.97002e-06 [merge_cast_opt]: 1.60999e-06 [slice_recompute_activation]: 2.12001e-06 [micro_interleaved_order_control]: 2.63e-06 [assign_add_opt]: 1.62001e-06 [ForceFp32Comm]: 9.89996e-07 [remove_cast_before_assign_add]: 1.03001e-06 [full_micro_interleaved_order_control]: 2.26e-06 [reorder_send_recv_between_fp_bp]: 3.4e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.23002e-06 [interleave_parallel_branches]: 1.14e-06 [overlap_opt_shard_in_pipeline]: 1.59998e-06 [overlap_opt_shard_grad_in_pipeline]: 2.04999e-06 [control_data_broadcast_order]: 1.521e-05 [grouped_pairwise_exchange_alltoall]: 1.96e-06 [offloading_packed_experts]: 4.17003e-06 [overlap_recompute_and_grad_model_parallel]: 5.29998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.29998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.79e-06 [overlap_recompute_comm]: 2.44999e-06 [overlap_grad_ring_attention]: 4.69998e-06 [overlap_grad_flash_sp]: 2.632e-05 [begin_end_overlap_inline]: 5.99975e-07 [split_matmul_comm_elemetwise]: 2.63e-06 [split_layernorm_comm]: 1.91e-06 [handle_group_info]: 9.29984e-07 [symbol_engine_optimizer]: 8.618e-05, [1] [Cycle 1]: 8.172e-05, [6] [build]: 3.69002e-06 [elim_shapecalc]: 1.191e-05 [elim_not_effective]: 1.571e-05 [opt_reshape]: 8.50999e-06 [fold_const_symbol]: 1.265e-05 [renormalize]: 2.09984e-07 [detach_backward]: 2.36998e-06 [pipeline_parallel_scheduler]: 1.84e-06 [auto_monad_reorder]: 2.178e-05 [get_jit_bprop_graph]: 1.91998e-06 [rewriter_after_jit_bprop_graph]: 4.08999e-06 [opt_after_jit_grad]: 0.00048203 [validate]: 4.933e-05 Sums bootstrap : 0.000429s : 0.19% type_inference : 0.218470s : 97.10% event_method : 0.000020s : 0.01% auto_monad : 0.000068s : 0.03% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000041s : 0.02% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000030s : 0.01% optimize.rewriter_before_opt_a : 0.000110s : 0.05% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000056s : 0.02% optimize.opt_a.loop_unroll : 0.000039s : 0.02% optimize.opt_a.a_1 : 0.001016s : 0.45% optimize.opt_a.with_stream_mark : 0.000038s : 0.02% optimize.opt_a.recompute_prepare : 0.000020s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000336s : 0.15% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000016s : 0.01% optimize.opt_a.merge_send_recv : 0.000018s : 0.01% optimize.opt_a.auto_parallel : 0.000017s : 0.01% optimize.opt_a.parallel : 0.000028s : 0.01% optimize.opt_a.flash_sp : 0.000016s : 0.01% optimize.opt_a.merge_comm : 0.000009s : 0.00% optimize.opt_a.allreduce_fusion : 0.000009s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.01% optimize.opt_a.virtual_dataset : 0.000017s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.01% optimize.opt_a.virtual_output : 0.000017s : 0.01% optimize.opt_a.merge_forward : 0.000010s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000021s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000027s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000028s : 0.01% optimize.opt_a.a_after_grad : 0.000024s : 0.01% optimize.opt_a.renormalize : 0.000964s : 0.43% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.00% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.01% optimize.opt_a.cse : 0.000064s : 0.03% optimize.opt_a.a_3 : 0.000111s : 0.05% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000044s : 0.02% optimize.convert_after_rewriter : 0.000008s : 0.00% optimize.order_py_execute_after_rewriter : 0.000006s : 0.00% optimize.mutable_eliminate : 0.000869s : 0.39% optimize.opt_b.b_1 : 0.000158s : 0.07% optimize.opt_b.b_2 : 0.000010s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000025s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000033s : 0.01% optimize.loop_unroll : 0.000476s : 0.21% optimize.opt_after_cconv.c_1 : 0.000040s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.01% optimize.tuple_transform.d_1 : 0.000053s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000063s : 0.03% optimize.cse_after_recomputation.cse : 0.000016s : 0.01% optimize.environ_conv : 0.000007s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000026s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000482s : 0.21% validate : 0.000049s : 0.02% Time group info: ------[substitution.] 0.000308 38 28.65% : 0.000088s : 3: substitution.cast_eliminate 0.77% : 0.000002s : 3: substitution.elim_not_effective 0.59% : 0.000002s : 3: substitution.fold_const_symbol 2.32% : 0.000007s : 5: substitution.graph_param_transform 56.41% : 0.000174s : 4: substitution.inline 1.60% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.48% : 0.000008s : 6: substitution.remove_not_recompute_node 1.96% : 0.000006s : 4: substitution.replace_old_param 5.23% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.218401 2 99.64% : 0.217607s : 1: type_inference.infer 0.36% : 0.000793s : 1: type_inference.specialize ------[replace.] 0.000074 8 60.17% : 0.000044s : 4: replace.inline 39.83% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000185 8 92.47% : 0.000171s : 4: match.inline 7.53% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000251 1504 1.01% : 0.000003s : 15: predicate.accumulaten_eliminater 0.74% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.57% : 0.000001s : 10: predicate.addn_check_dump 0.87% : 0.000002s : 15: predicate.addn_zero_filter 0.77% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.12% : 0.000005s : 25: predicate.arithmetic_simplify 1.06% : 0.000003s : 15: predicate.cast_eliminate 0.61% : 0.000002s : 10: predicate.check_bprop_eliminate 0.55% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.88% : 0.000002s : 10: predicate.depend_value_elim 0.92% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.02% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.94% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 5: predicate.elim_not_effective 0.35% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.03% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.03% : 0.000003s : 20: predicate.environ_get_depend_swap 1.91% : 0.000005s : 30: predicate.environ_get_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.30% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.30% : 0.000006s : 23: predicate.float_depend_g_call 0.52% : 0.000001s : 10: predicate.float_environ_get_switch 0.83% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.79% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.65% : 0.000002s : 10: predicate.incorporate_call 0.67% : 0.000002s : 10: predicate.incorporate_call_switch 7.23% : 0.000018s : 68: predicate.inline 0.74% : 0.000002s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.91% : 0.000002s : 10: predicate.less_batch_normalization 1.73% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.45% : 0.000006s : 44: predicate.load_eliminater 0.89% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.00% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.71% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 10: predicate.merge_addn 0.67% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.61% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 15: predicate.minmaximum_grad 1.44% : 0.000004s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.40% : 0.000001s : 5: predicate.parallel_virtual_node 1.64% : 0.000004s : 23: predicate.partial_defer_inline 1.60% : 0.000004s : 24: predicate.partial_eliminate 0.86% : 0.000002s : 15: predicate.print_const_string_wrapper 0.60% : 0.000002s : 10: predicate.reduce_all_const_elim 1.16% : 0.000003s : 15: predicate.reduce_eliminate 2.38% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 10: predicate.remove_not_recompute_node 1.30% : 0.000003s : 29: predicate.replace_applicator 0.40% : 0.000001s : 10: predicate.replace_old_param 0.33% : 0.000001s : 5: predicate.reset_defer_inline 0.93% : 0.000002s : 15: predicate.reshape_eliminate 0.60% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 5: predicate.row_tensor_eliminate 0.85% : 0.000002s : 10: predicate.same_eliminate 0.51% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.85% : 0.000002s : 10: predicate.shard_identity_eliminate 0.73% : 0.000002s : 10: predicate.special_op_eliminate 0.79% : 0.000002s : 10: predicate.specialize_transform 1.02% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.37% : 0.000003s : 23: predicate.switch_defer_inline 1.90% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.59% : 0.000012s : 74: predicate.switch_simplify 0.83% : 0.000002s : 15: predicate.tile_eliminate 0.97% : 0.000002s : 15: predicate.transpose_eliminate 1.47% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.69% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.47% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.48% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.79% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.35% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.26% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.60% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.73% : 0.000002s : 10: predicate.virtual_output_eliminate 0.45% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000625 11 53.77% : 0.000336s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.23% : 0.000289s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.242016 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.43% : 0.003456s : 1: add_attr 1.42% : 0.003443s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000067s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.03% : 0.000074s : 1: auto_monad 0.01% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.19% : 0.000457s : 1: bootstrap 0.02% : 0.000037s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000018s : 1: control_data_broadcast_order 0.00% : 0.000011s : 1: convert_after_rewriter 0.01% : 0.000029s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000010s : 1: environ_conv 0.01% : 0.000027s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000009s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.20% : 0.000485s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.36% : 0.000880s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000021s : 1: opt.transform.mutable_eliminate 0.70% : 0.001691s : 78: opt.transform.opt_a 0.02% : 0.000038s : 1: opt.transform.opt_after_cconv 0.01% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000135s : 28: opt.transform.opt_b 0.02% : 0.000059s : 2: opt.transform.opt_trans_graph 0.02% : 0.000044s : 4: opt.transform.symbol_engine_opt 1.50% : 0.003638s : 1: opt_a 0.05% : 0.000125s : 1: opt_after_cconv 0.20% : 0.000491s : 1: opt_after_jit_grad 0.10% : 0.000254s : 1: opt_b 2.58% : 0.006254s : 1: optimize 0.01% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000009s : 1: order_py_execute_after_rewriter 0.01% : 0.000030s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000045s : 1: pre_auto_parallel 0.01% : 0.000034s : 1: py_interpret_to_execute 0.01% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 0.23% : 0.000556s : 1: renormalize.infer 0.16% : 0.000398s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000048s : 1: rewriter_after_opt_a 0.05% : 0.000115s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000089s : 1: symbol_engine_optimizer 0.04% : 0.000090s : 1: tuple_transform 90.28% : 0.218490s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:10.947.605 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:10.947.885 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.176245, [21] [bootstrap]: 0.00045134 [type_inference]: 0.164415 [event_method]: 2.398e-05 [auto_monad]: 7.17e-05 [graph_reusing]: 6.94001e-06 [inline]: 2.74001e-06 [add_attr]: 0.00377844, [1] [add_attr_with_inline]: 0.00376595, [1] [Cycle 1]: 9.37e-05, [2] [tag_attr]: 2.591e-05 [meta_addattr_fg_expand]: 6.60997e-06 [parallel-infer-symbol]: 3.6e-06 [pre_auto_parallel]: 4.017e-05 [insert-virtual-dataset]: 2.79999e-06 [parallel-infer-symbol-second]: 6.69999e-07 [dataset_repeat_opt]: 2.12001e-06 [pipeline_split]: 1.60001e-06 [optimize]: 0.00612079, [53] [py_interpret_to_execute]: 3.432e-05 [rewriter_before_opt_a]: 9.763e-05 [opt_a]: 0.00349222, [2] [Cycle 1]: 0.00258835, [45] [expand_dump_flag]: 3.08e-06 [switch_simplify]: 4.548e-05 [loop_unroll]: 3.185e-05 [a_1]: 0.00067969 [with_stream_mark]: 1.958e-05 [recompute_prepare]: 9.63997e-06 [updatestate_depend_eliminate]: 4.27e-06 [updatestate_assign_eliminate]: 3.58e-06 [updatestate_loads_eliminate]: 3.34001e-06 [parameter_eliminate]: 2.14e-06 [a_2]: 0.00011547 [accelerated_algorithm]: 8.13999e-06 [shard]: 2.02999e-06 [meta_shard_fg_expand]: 1.89999e-06 [shard_inline]: 7.48e-06 [merge_send_recv]: 9.20001e-06 [auto_parallel]: 7.06999e-06 [parallel]: 2.019e-05 [flash_sp]: 1.096e-05 [merge_comm]: 4.37998e-06 [allreduce_fusion]: 3.51001e-06 [matmul_add_comm_reduction]: 1.004e-05 [allreduce_slice_to_reducescatter]: 9.30013e-07 [virtual_shard_identity]: 9.51e-06 [virtual_dataset]: 8.23001e-06 [get_grad_eliminate_]: 7.31001e-06 [virtual_output]: 8.11002e-06 [merge_forward]: 5.03002e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 1.153e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.826e-05 [merge_recompute_call_nodes]: 1.75001e-06 [before_grad]: 1.195e-05 [set_forward_comm_id_for_comm_node_pass]: 4.13999e-06 [meta_fg_expand]: 2.99999e-06 [flash_sp_send_recv_attached]: 2.68e-06 [receive_attached]: 2.31e-06 [after_resolve]: 1.301e-05 [a_after_grad]: 1.271e-05 [renormalize]: 0.00091088 [add_forward_monad_depend]: 7.63999e-06 [auto_monad_grad]: 2.83e-06 [auto_monad_eliminator]: 1.818e-05 [cse]: 3.322e-05 [a_3]: 7.228e-05 [Cycle 2]: 0.00088759, [45] [expand_dump_flag]: 2.07999e-06 [switch_simplify]: 7.72002e-06 [loop_unroll]: 6.53998e-06 [a_1]: 0.00014559 [with_stream_mark]: 1.502e-05 [recompute_prepare]: 6.46e-06 [updatestate_depend_eliminate]: 3.16001e-06 [updatestate_assign_eliminate]: 2.81e-06 [updatestate_loads_eliminate]: 2.44999e-06 [parameter_eliminate]: 1.62999e-06 [a_2]: 0.00010148 [accelerated_algorithm]: 6.59999e-06 [shard]: 2.32001e-06 [meta_shard_fg_expand]: 1.74e-06 [shard_inline]: 1.438e-05 [merge_send_recv]: 5.78002e-06 [auto_parallel]: 6.49999e-06 [parallel]: 7.46999e-06 [flash_sp]: 3.78999e-06 [merge_comm]: 3.64002e-06 [allreduce_fusion]: 3.27002e-06 [matmul_add_comm_reduction]: 8.13001e-06 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 1.033e-05 [virtual_dataset]: 6.16e-06 [get_grad_eliminate_]: 5.89999e-06 [virtual_output]: 6.16e-06 [merge_forward]: 3.8e-06 [cell_reuse_recompute_pass]: 2.16e-06 [offload_activation]: 8.46002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.848e-05 [merge_recompute_call_nodes]: 8.70001e-07 [before_grad]: 1.048e-05 [set_forward_comm_id_for_comm_node_pass]: 4.12e-06 [meta_fg_expand]: 2.61999e-06 [flash_sp_send_recv_attached]: 1.19003e-06 [receive_attached]: 1.35001e-06 [after_resolve]: 1.161e-05 [a_after_grad]: 9.66e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.60999e-06 [auto_monad_grad]: 1.37e-06 [auto_monad_eliminator]: 9.56e-06 [cse]: 1.799e-05 [a_3]: 5.13e-05 [py_interpret_to_execute_after_opt_a]: 1.646e-05 [slice_cell_reuse_recomputed_activation]: 4.81002e-06 [rewriter_after_opt_a]: 4.441e-05 [convert_after_rewriter]: 1.073e-05 [order_py_execute_after_rewriter]: 8.56002e-06 [mutable_eliminate]: 0.00071966 [opt_b]: 0.00029095, [1] [Cycle 1]: 0.00028068, [7] [b_1]: 0.00017388 [b_2]: 8.42e-06 [updatestate_depend_eliminate]: 7.93999e-06 [updatestate_assign_eliminate]: 2.91999e-06 [updatestate_loads_eliminate]: 2.69999e-06 [renormalize]: 5.89993e-07 [cse]: 2.459e-05 [optimize_parallel_all_gather_comm]: 2.105e-05 [overlap_param_gather]: 4.94e-06 [cconv]: 3.669e-05 [loop_unroll]: 0.00048331 [opt_after_cconv]: 0.00013383, [1] [Cycle 1]: 0.00012384, [7] [c_1]: 3.202e-05 [parameter_eliminate]: 4e-06 [updatestate_depend_eliminate]: 5.92001e-06 [updatestate_assign_eliminate]: 2.56998e-06 [updatestate_loads_eliminate]: 2.27999e-06 [cse]: 1.922e-05 [renormalize]: 3.39991e-07 [remove_dup_value]: 1.847e-05 [tuple_transform]: 9.469e-05, [1] [Cycle 1]: 8.733e-05, [4] [d_1]: 4.641e-05 [none_parameter_eliminate]: 1.89e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 7e-06 [partial_unused_args_eliminate]: 4.63999e-06 [add_recomputation]: 5.433e-05 [cse_after_recomputation]: 2.991e-05, [1] [Cycle 1]: 2.237e-05, [1] [cse]: 1.256e-05 [environ_conv]: 8.25999e-06 [swap_dp_allreduce_reducescatter]: 8.1e-06 [bias_add_comm_swap]: 5.52999e-06 [label_micro_interleaved_index]: 7.58001e-06 [label_fine_grained_interleaved_index]: 5.38002e-06 [merge_cast_opt]: 3.9e-06 [slice_recompute_activation]: 4.79e-06 [micro_interleaved_order_control]: 5.28002e-06 [assign_add_opt]: 3.85998e-06 [ForceFp32Comm]: 3.6e-06 [remove_cast_before_assign_add]: 4.15999e-06 [full_micro_interleaved_order_control]: 4.97e-06 [reorder_send_recv_between_fp_bp]: 5.82001e-06 [comm_op_add_attrs]: 3.41999e-06 [add_comm_op_reuse_tag]: 3.44001e-06 [interleave_split_concat_branches]: 3.67002e-06 [interleave_parallel_branches]: 3.58e-06 [overlap_opt_shard_in_pipeline]: 3.59002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.47998e-06 [control_data_broadcast_order]: 1.655e-05 [grouped_pairwise_exchange_alltoall]: 4.06001e-06 [offloading_packed_experts]: 7.09001e-06 [overlap_recompute_and_grad_model_parallel]: 7.90998e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.17998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.76001e-06 [overlap_recompute_comm]: 5.10999e-06 [overlap_grad_ring_attention]: 7.21001e-06 [overlap_grad_flash_sp]: 2.651e-05 [begin_end_overlap_inline]: 3.11001e-06 [split_matmul_comm_elemetwise]: 4.70001e-06 [split_layernorm_comm]: 4.47e-06 [handle_group_info]: 3.75e-06 [symbol_engine_optimizer]: 0.00010358, [1] [Cycle 1]: 9.646e-05, [6] [build]: 3.69002e-06 [elim_shapecalc]: 1.045e-05 [elim_not_effective]: 1.379e-05 [opt_reshape]: 7.23e-06 [fold_const_symbol]: 1.136e-05 [renormalize]: 2.3999e-07 [detach_backward]: 4.43001e-06 [pipeline_parallel_scheduler]: 1.86e-06 [auto_monad_reorder]: 2.508e-05 [get_jit_bprop_graph]: 2.14999e-06 [rewriter_after_jit_bprop_graph]: 6.48e-06 [opt_after_jit_grad]: 0.00062087 [validate]: 4.479e-05 Sums bootstrap : 0.000451s : 0.26% type_inference : 0.164415s : 96.36% event_method : 0.000024s : 0.01% auto_monad : 0.000072s : 0.04% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000040s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000034s : 0.02% optimize.rewriter_before_opt_a : 0.000098s : 0.06% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000053s : 0.03% optimize.opt_a.loop_unroll : 0.000038s : 0.02% optimize.opt_a.a_1 : 0.000825s : 0.48% optimize.opt_a.with_stream_mark : 0.000035s : 0.02% optimize.opt_a.recompute_prepare : 0.000016s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000217s : 0.13% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000022s : 0.01% optimize.opt_a.merge_send_recv : 0.000015s : 0.01% optimize.opt_a.auto_parallel : 0.000014s : 0.01% optimize.opt_a.parallel : 0.000028s : 0.02% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.01% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000014s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000020s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000025s : 0.01% optimize.opt_a.a_after_grad : 0.000022s : 0.01% optimize.opt_a.renormalize : 0.000911s : 0.53% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.02% optimize.opt_a.cse : 0.000051s : 0.03% optimize.opt_a.a_3 : 0.000124s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000044s : 0.03% optimize.convert_after_rewriter : 0.000011s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.01% optimize.mutable_eliminate : 0.000720s : 0.42% optimize.opt_b.b_1 : 0.000174s : 0.10% optimize.opt_b.b_2 : 0.000008s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000025s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.01% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000037s : 0.02% optimize.loop_unroll : 0.000483s : 0.28% optimize.opt_after_cconv.c_1 : 0.000032s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.01% optimize.tuple_transform.d_1 : 0.000046s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000054s : 0.03% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000008s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.00% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000003s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000027s : 0.02% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000025s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000621s : 0.36% validate : 0.000045s : 0.03% Time group info: ------[substitution.] 0.000212 28 1.11% : 0.000002s : 2: substitution.elim_not_effective 0.89% : 0.000002s : 2: substitution.fold_const_symbol 3.08% : 0.000007s : 4: substitution.graph_param_transform 79.12% : 0.000168s : 4: substitution.inline 2.04% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.97% : 0.000006s : 4: substitution.remove_not_recompute_node 2.52% : 0.000005s : 4: substitution.replace_old_param 8.28% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.164342 2 99.39% : 0.163335s : 1: type_inference.infer 0.61% : 0.001007s : 1: type_inference.specialize ------[replace.] 0.000068 8 63.88% : 0.000043s : 4: replace.inline 36.12% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000181 8 91.45% : 0.000165s : 4: match.inline 8.55% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000222 1278 1.02% : 0.000002s : 13: predicate.accumulaten_eliminater 1.22% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 8: predicate.addn_check_dump 0.88% : 0.000002s : 13: predicate.addn_zero_filter 0.80% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.25% : 0.000005s : 21: predicate.arithmetic_simplify 1.02% : 0.000002s : 13: predicate.cast_eliminate 0.77% : 0.000002s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.28% : 0.000001s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 0.85% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.03% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.68% : 0.000004s : 8: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 4: predicate.elim_not_effective 0.38% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.04% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_depend_swap 1.65% : 0.000004s : 25: predicate.environ_get_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.38% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.62% : 0.000006s : 21: predicate.float_depend_g_call 0.60% : 0.000001s : 8: predicate.float_environ_get_switch 0.91% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.60% : 0.000001s : 8: predicate.get_grad_eliminate 0.22% : 0.000000s : 4: predicate.graph_param_transform 0.52% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.56% : 0.000015s : 58: predicate.inline 0.92% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.88% : 0.000002s : 8: predicate.less_batch_normalization 1.75% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.39% : 0.000005s : 38: predicate.load_eliminater 0.94% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.23% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.56% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.49% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.75% : 0.000002s : 13: predicate.minmaximum_grad 1.35% : 0.000003s : 4: predicate.mutable_eliminate 0.33% : 0.000001s : 4: predicate.opt_reshape 0.54% : 0.000001s : 4: predicate.parallel_virtual_node 1.73% : 0.000004s : 21: predicate.partial_defer_inline 1.61% : 0.000004s : 21: predicate.partial_eliminate 0.84% : 0.000002s : 13: predicate.print_const_string_wrapper 0.53% : 0.000001s : 8: predicate.reduce_all_const_elim 1.13% : 0.000003s : 13: predicate.reduce_eliminate 2.38% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.54% : 0.000001s : 8: predicate.remove_not_recompute_node 1.28% : 0.000003s : 25: predicate.replace_applicator 0.59% : 0.000001s : 8: predicate.replace_old_param 0.35% : 0.000001s : 4: predicate.reset_defer_inline 0.88% : 0.000002s : 13: predicate.reshape_eliminate 0.61% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 4: predicate.row_tensor_eliminate 1.04% : 0.000002s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 8: predicate.shard_identity_eliminate 0.69% : 0.000002s : 8: predicate.special_op_eliminate 0.74% : 0.000002s : 8: predicate.specialize_transform 1.02% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.46% : 0.000003s : 21: predicate.switch_defer_inline 1.95% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.13% : 0.000011s : 67: predicate.switch_simplify 0.86% : 0.000002s : 13: predicate.tile_eliminate 0.93% : 0.000002s : 13: predicate.transpose_eliminate 1.53% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.44% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.30% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 1.99% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.65% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.34% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.93% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 4: predicate.value_based_eliminate 0.74% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.80% : 0.000002s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000755 11 49.64% : 0.000375s : 5: func_graph_cloner_run.FuncGraphClonerGraph 50.36% : 0.000380s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.188491 192 0.00% : 0.000006s : 1: ForceFp32Comm 2.01% : 0.003790s : 1: add_attr 2.00% : 0.003771s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.03% : 0.000058s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.04% : 0.000080s : 1: auto_monad 0.02% : 0.000033s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.26% : 0.000499s : 1: bootstrap 0.02% : 0.000040s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000019s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.02% : 0.000033s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000023s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.02% : 0.000035s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000010s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.26% : 0.000490s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.39% : 0.000727s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000020s : 1: opt.transform.mutable_eliminate 0.69% : 0.001295s : 78: opt.transform.opt_a 0.02% : 0.000030s : 1: opt.transform.opt_after_cconv 0.02% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000108s : 28: opt.transform.opt_b 0.03% : 0.000051s : 2: opt.transform.opt_trans_graph 0.02% : 0.000039s : 4: opt.transform.symbol_engine_opt 1.85% : 0.003496s : 1: opt_a 0.07% : 0.000138s : 1: opt_after_cconv 0.34% : 0.000633s : 1: opt_after_jit_grad 0.16% : 0.000295s : 1: opt_b 3.43% : 0.006458s : 1: optimize 0.01% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.02% : 0.000030s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000009s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.03% : 0.000048s : 1: pre_auto_parallel 0.02% : 0.000039s : 1: py_interpret_to_execute 0.01% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.01% : 0.000022s : 1: remove_dup_value 0.27% : 0.000506s : 1: renormalize.infer 0.21% : 0.000393s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000048s : 1: rewriter_after_opt_a 0.05% : 0.000101s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000009s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.06% : 0.000106s : 1: symbol_engine_optimizer 0.05% : 0.000098s : 1: tuple_transform 87.26% : 0.164469s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:12.958.028 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.294337, [21] [bootstrap]: 0.00044498 [type_inference]: 0.165846 [event_method]: 2.007e-05 [auto_monad]: 6.455e-05 [graph_reusing]: 5.81e-06 [inline]: 2.85998e-06 [add_attr]: 0.00366181, [1] [add_attr_with_inline]: 0.00364855, [1] [Cycle 1]: 7.947e-05, [2] [tag_attr]: 2.697e-05 [meta_addattr_fg_expand]: 5.92999e-06 [parallel-infer-symbol]: 4.26001e-06 [pre_auto_parallel]: 4.13e-05 [insert-virtual-dataset]: 2.35002e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 2.23002e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.123162, [53] [py_interpret_to_execute]: 3.233e-05 [rewriter_before_opt_a]: 9.908e-05 [opt_a]: 0.00322424, [2] [Cycle 1]: 0.0024781, [45] [expand_dump_flag]: 3.9e-06 [switch_simplify]: 4.461e-05 [loop_unroll]: 3.041e-05 [a_1]: 0.00067946 [with_stream_mark]: 2.421e-05 [recompute_prepare]: 1.215e-05 [updatestate_depend_eliminate]: 4.52998e-06 [updatestate_assign_eliminate]: 3.31001e-06 [updatestate_loads_eliminate]: 3.61999e-06 [parameter_eliminate]: 2.22001e-06 [a_2]: 8.699e-05 [accelerated_algorithm]: 8.91002e-06 [shard]: 2.59001e-06 [meta_shard_fg_expand]: 2.11e-06 [shard_inline]: 7.77e-06 [merge_send_recv]: 1.006e-05 [auto_parallel]: 8.37e-06 [parallel]: 2.161e-05 [flash_sp]: 1.129e-05 [merge_comm]: 4.17e-06 [allreduce_fusion]: 3.69002e-06 [matmul_add_comm_reduction]: 1.092e-05 [allreduce_slice_to_reducescatter]: 1.06997e-06 [virtual_shard_identity]: 1.277e-05 [virtual_dataset]: 7.47998e-06 [get_grad_eliminate_]: 6.76e-06 [virtual_output]: 7.25998e-06 [merge_forward]: 5.17e-06 [cell_reuse_recompute_pass]: 1.64998e-06 [offload_activation]: 1.091e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.613e-05 [merge_recompute_call_nodes]: 1.72001e-06 [before_grad]: 1.282e-05 [set_forward_comm_id_for_comm_node_pass]: 4.92e-06 [meta_fg_expand]: 2.98e-06 [flash_sp_send_recv_attached]: 3.00002e-06 [receive_attached]: 2.06e-06 [after_resolve]: 1.401e-05 [a_after_grad]: 1.106e-05 [renormalize]: 0.00095977 [add_forward_monad_depend]: 7.77e-06 [auto_monad_grad]: 2.78e-06 [auto_monad_eliminator]: 2.073e-05 [cse]: 3.473e-05 [a_3]: 5.616e-05 [Cycle 2]: 0.00073237, [45] [expand_dump_flag]: 1.97999e-06 [switch_simplify]: 8.24998e-06 [loop_unroll]: 6.53e-06 [a_1]: 0.00013595 [with_stream_mark]: 1.732e-05 [recompute_prepare]: 6.66999e-06 [updatestate_depend_eliminate]: 3.18998e-06 [updatestate_assign_eliminate]: 2.79001e-06 [updatestate_loads_eliminate]: 2.56998e-06 [parameter_eliminate]: 1.42e-06 [a_2]: 7.413e-05 [accelerated_algorithm]: 6.82002e-06 [shard]: 2.29999e-06 [meta_shard_fg_expand]: 1.81e-06 [shard_inline]: 1.473e-05 [merge_send_recv]: 7.06001e-06 [auto_parallel]: 7.51999e-06 [parallel]: 6.63e-06 [flash_sp]: 3.98999e-06 [merge_comm]: 3.85e-06 [allreduce_fusion]: 3.3e-06 [matmul_add_comm_reduction]: 8.93002e-06 [allreduce_slice_to_reducescatter]: 9.79984e-07 [virtual_shard_identity]: 7.65998e-06 [virtual_dataset]: 6.08998e-06 [get_grad_eliminate_]: 5.84e-06 [virtual_output]: 6.18002e-06 [merge_forward]: 3.48999e-06 [cell_reuse_recompute_pass]: 3.13e-06 [offload_activation]: 9.27001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.627e-05 [merge_recompute_call_nodes]: 1.22999e-06 [before_grad]: 1.133e-05 [set_forward_comm_id_for_comm_node_pass]: 3.86001e-06 [meta_fg_expand]: 2.69001e-06 [flash_sp_send_recv_attached]: 1.29998e-06 [receive_attached]: 1.97001e-06 [after_resolve]: 1.2e-05 [a_after_grad]: 1.039e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.35999e-06 [auto_monad_grad]: 1.45999e-06 [auto_monad_eliminator]: 9.32001e-06 [cse]: 1.776e-05 [a_3]: 3.665e-05 [py_interpret_to_execute_after_opt_a]: 1.318e-05 [slice_cell_reuse_recomputed_activation]: 2.64999e-06 [rewriter_after_opt_a]: 4.018e-05 [convert_after_rewriter]: 7.53999e-06 [order_py_execute_after_rewriter]: 5.25001e-06 [mutable_eliminate]: 0.00073988 [opt_b]: 0.00021844, [1] [Cycle 1]: 0.00021139, [7] [b_1]: 0.00012837 [b_2]: 8.48001e-06 [updatestate_depend_eliminate]: 8.47e-06 [updatestate_assign_eliminate]: 2.34999e-06 [updatestate_loads_eliminate]: 2.39001e-06 [renormalize]: 6.69999e-07 [cse]: 2.321e-05 [optimize_parallel_all_gather_comm]: 1.703e-05 [overlap_param_gather]: 2.21998e-06 [cconv]: 3.349e-05 [loop_unroll]: 0.00046968 [opt_after_cconv]: 0.00010903, [1] [Cycle 1]: 0.00010282, [7] [c_1]: 3.28e-05 [parameter_eliminate]: 4.32e-06 [updatestate_depend_eliminate]: 5.59998e-06 [updatestate_assign_eliminate]: 2.30002e-06 [updatestate_loads_eliminate]: 2.44999e-06 [cse]: 1.923e-05 [renormalize]: 2.80008e-07 [remove_dup_value]: 1.529e-05 [tuple_transform]: 7.991e-05, [1] [Cycle 1]: 7.525e-05, [4] [d_1]: 4.601e-05 [none_parameter_eliminate]: 1.87999e-06 [renormalize]: 3.30008e-07 [switch_simplify]: 7.36001e-06 [partial_unused_args_eliminate]: 1.79998e-06 [add_recomputation]: 5.115e-05 [cse_after_recomputation]: 2.281e-05, [1] [Cycle 1]: 1.789e-05, [1] [cse]: 1.202e-05 [environ_conv]: 5.44998e-06 [swap_dp_allreduce_reducescatter]: 0.117402 [bias_add_comm_swap]: 5.02999e-06 [label_micro_interleaved_index]: 8.89998e-06 [label_fine_grained_interleaved_index]: 2.79999e-06 [merge_cast_opt]: 1.42e-06 [slice_recompute_activation]: 2.56e-06 [micro_interleaved_order_control]: 2.43002e-06 [assign_add_opt]: 1.46002e-06 [ForceFp32Comm]: 9.00007e-07 [remove_cast_before_assign_add]: 1.25999e-06 [full_micro_interleaved_order_control]: 2.06e-06 [reorder_send_recv_between_fp_bp]: 2.95998e-06 [comm_op_add_attrs]: 1.05001e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.11002e-06 [overlap_opt_shard_in_pipeline]: 1.37999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.22999e-06 [control_data_broadcast_order]: 3.226e-05 [grouped_pairwise_exchange_alltoall]: 1.64998e-06 [offloading_packed_experts]: 4.70001e-06 [overlap_recompute_and_grad_model_parallel]: 5.20999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.59998e-06 [overlap_recompute_comm]: 2.58e-06 [overlap_grad_ring_attention]: 4.39998e-06 [overlap_grad_flash_sp]: 3.039e-05 [begin_end_overlap_inline]: 5.89993e-07 [split_matmul_comm_elemetwise]: 2.40002e-06 [split_layernorm_comm]: 1.81e-06 [handle_group_info]: 1.29e-06 [symbol_engine_optimizer]: 0.00014574, [1] [Cycle 1]: 0.00013356, [6] [build]: 8.46002e-06 [elim_shapecalc]: 3.235e-05 [elim_not_effective]: 2.53e-05 [opt_reshape]: 9.19e-06 [fold_const_symbol]: 1.218e-05 [renormalize]: 7.7e-07 [detach_backward]: 3.04001e-06 [pipeline_parallel_scheduler]: 1.86e-06 [auto_monad_reorder]: 2.668e-05 [get_jit_bprop_graph]: 2.38998e-06 [rewriter_after_jit_bprop_graph]: 8.99e-06 [opt_after_jit_grad]: 0.00080996 [validate]: 5.083e-05 Sums bootstrap : 0.000445s : 0.15% type_inference : 0.165846s : 57.28% event_method : 0.000020s : 0.01% auto_monad : 0.000065s : 0.02% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000041s : 0.01% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000032s : 0.01% optimize.rewriter_before_opt_a : 0.000099s : 0.03% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000053s : 0.02% optimize.opt_a.loop_unroll : 0.000037s : 0.01% optimize.opt_a.a_1 : 0.000815s : 0.28% optimize.opt_a.with_stream_mark : 0.000042s : 0.01% optimize.opt_a.recompute_prepare : 0.000019s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000161s : 0.06% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000022s : 0.01% optimize.opt_a.merge_send_recv : 0.000017s : 0.01% optimize.opt_a.auto_parallel : 0.000016s : 0.01% optimize.opt_a.parallel : 0.000028s : 0.01% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.01% optimize.opt_a.virtual_dataset : 0.000014s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.00% optimize.opt_a.virtual_output : 0.000013s : 0.00% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000020s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000024s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000026s : 0.01% optimize.opt_a.a_after_grad : 0.000021s : 0.01% optimize.opt_a.renormalize : 0.000960s : 0.33% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.01% optimize.opt_a.cse : 0.000052s : 0.02% optimize.opt_a.a_3 : 0.000093s : 0.03% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000040s : 0.01% optimize.convert_after_rewriter : 0.000008s : 0.00% optimize.order_py_execute_after_rewriter : 0.000005s : 0.00% optimize.mutable_eliminate : 0.000740s : 0.26% optimize.opt_b.b_1 : 0.000128s : 0.04% optimize.opt_b.b_2 : 0.000008s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000023s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000033s : 0.01% optimize.loop_unroll : 0.000470s : 0.16% optimize.opt_after_cconv.c_1 : 0.000033s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.01% optimize.tuple_transform.d_1 : 0.000046s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000051s : 0.02% optimize.cse_after_recomputation.cse : 0.000012s : 0.00% optimize.environ_conv : 0.000005s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.117402s : 40.55% optimize.bias_add_comm_swap : 0.000005s : 0.00% optimize.label_micro_interleaved_index : 0.000009s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000032s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000030s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000008s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000032s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000025s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000027s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000009s : 0.00% opt_after_jit_grad : 0.000810s : 0.28% validate : 0.000051s : 0.02% Time group info: ------[substitution.] 0.000216 28 1.14% : 0.000002s : 2: substitution.elim_not_effective 0.78% : 0.000002s : 2: substitution.fold_const_symbol 2.78% : 0.000006s : 4: substitution.graph_param_transform 80.85% : 0.000174s : 4: substitution.inline 2.06% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.65% : 0.000006s : 4: substitution.remove_not_recompute_node 2.24% : 0.000005s : 4: substitution.replace_old_param 7.50% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.165777 2 99.54% : 0.165008s : 1: type_inference.infer 0.46% : 0.000769s : 1: type_inference.specialize ------[replace.] 0.000072 8 60.55% : 0.000044s : 4: replace.inline 39.45% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000186 8 92.30% : 0.000171s : 4: match.inline 7.70% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000221 1278 0.84% : 0.000002s : 13: predicate.accumulaten_eliminater 0.88% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 8: predicate.addn_check_dump 1.13% : 0.000003s : 13: predicate.addn_zero_filter 0.73% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.89% : 0.000004s : 21: predicate.arithmetic_simplify 0.84% : 0.000002s : 13: predicate.cast_eliminate 0.70% : 0.000002s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.62% : 0.000001s : 8: predicate.depend_value_elim 1.07% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.91% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.98% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.31% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.65% : 0.000001s : 4: predicate.elim_not_effective 0.58% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000002s : 17: predicate.environ_add_const_eliminate 0.97% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_depend_swap 1.45% : 0.000003s : 25: predicate.environ_get_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.35% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.53% : 0.000006s : 21: predicate.float_depend_g_call 0.54% : 0.000001s : 8: predicate.float_environ_get_switch 0.76% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.62% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.79% : 0.000002s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.67% : 0.000015s : 58: predicate.inline 1.02% : 0.000002s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.93% : 0.000002s : 8: predicate.less_batch_normalization 1.79% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.32% : 0.000005s : 38: predicate.load_eliminater 1.01% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.17% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.64% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.51% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.74% : 0.000002s : 13: predicate.minmaximum_grad 1.39% : 0.000003s : 4: predicate.mutable_eliminate 0.53% : 0.000001s : 4: predicate.opt_reshape 0.42% : 0.000001s : 4: predicate.parallel_virtual_node 2.33% : 0.000005s : 21: predicate.partial_defer_inline 1.52% : 0.000003s : 21: predicate.partial_eliminate 0.84% : 0.000002s : 13: predicate.print_const_string_wrapper 0.58% : 0.000001s : 8: predicate.reduce_all_const_elim 1.17% : 0.000003s : 13: predicate.reduce_eliminate 2.32% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.65% : 0.000001s : 8: predicate.remove_not_recompute_node 1.32% : 0.000003s : 25: predicate.replace_applicator 0.46% : 0.000001s : 8: predicate.replace_old_param 0.26% : 0.000001s : 4: predicate.reset_defer_inline 0.84% : 0.000002s : 13: predicate.reshape_eliminate 0.66% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 4: predicate.row_tensor_eliminate 0.98% : 0.000002s : 8: predicate.same_eliminate 0.58% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.98% : 0.000002s : 8: predicate.shard_identity_eliminate 0.87% : 0.000002s : 8: predicate.special_op_eliminate 0.91% : 0.000002s : 8: predicate.specialize_transform 0.83% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.48% : 0.000003s : 21: predicate.switch_defer_inline 1.88% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.96% : 0.000011s : 67: predicate.switch_simplify 0.91% : 0.000002s : 13: predicate.tile_eliminate 0.94% : 0.000002s : 13: predicate.transpose_eliminate 1.55% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.26% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.41% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.15% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.83% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.24% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.83% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.51% : 0.000001s : 4: predicate.value_based_eliminate 0.60% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000672 11 54.73% : 0.000368s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.27% : 0.000304s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.423550 192 0.00% : 0.000004s : 1: ForceFp32Comm 0.87% : 0.003669s : 1: add_attr 0.86% : 0.003653s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.01% : 0.000055s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.02% : 0.000070s : 1: auto_monad 0.01% : 0.000030s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000015s : 1: bias_add_comm_swap 0.11% : 0.000479s : 1: bootstrap 0.01% : 0.000037s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000036s : 1: control_data_broadcast_order 0.00% : 0.000011s : 1: convert_after_rewriter 0.01% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000007s : 1: detach_backward 0.00% : 0.000009s : 1: environ_conv 0.01% : 0.000027s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000009s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000013s : 1: label_micro_interleaved_index 0.11% : 0.000478s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.18% : 0.000750s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000019s : 1: opt.transform.mutable_eliminate 0.30% : 0.001278s : 78: opt.transform.opt_a 0.01% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.02% : 0.000104s : 28: opt.transform.opt_b 0.01% : 0.000051s : 2: opt.transform.opt_trans_graph 0.02% : 0.000072s : 4: opt.transform.symbol_engine_opt 0.76% : 0.003228s : 1: opt_a 0.03% : 0.000113s : 1: opt_after_cconv 0.19% : 0.000822s : 1: opt_after_jit_grad 0.05% : 0.000223s : 1: opt_b 29.08% : 0.123170s : 1: optimize 0.00% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.01% : 0.000034s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.01% : 0.000046s : 1: pre_auto_parallel 0.01% : 0.000037s : 1: py_interpret_to_execute 0.00% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.00% : 0.000019s : 1: remove_dup_value 0.12% : 0.000515s : 1: renormalize.infer 0.10% : 0.000432s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000044s : 1: rewriter_after_opt_a 0.02% : 0.000103s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000007s : 1: slice_recompute_activation 0.00% : 0.000006s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 27.73% : 0.117439s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000148s : 1: symbol_engine_optimizer 0.02% : 0.000083s : 1: tuple_transform 39.16% : 0.165867s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:15.143.052 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:15.143.319 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.186952, [21] [bootstrap]: 0.0004558 [type_inference]: 0.00621673 [event_method]: 2.062e-05 [auto_monad]: 7.423e-05 [graph_reusing]: 6.61e-06 [inline]: 2.51e-06 [add_attr]: 0.171702, [1] [add_attr_with_inline]: 0.171689, [1] [Cycle 1]: 0.00010233, [2] [tag_attr]: 2.692e-05 [meta_addattr_fg_expand]: 6.44999e-06 [parallel-infer-symbol]: 4.30999e-06 [pre_auto_parallel]: 4.631e-05 [insert-virtual-dataset]: 2.66e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 2.30002e-06 [pipeline_split]: 2.16e-06 [optimize]: 0.00705065, [53] [py_interpret_to_execute]: 3.938e-05 [rewriter_before_opt_a]: 0.00011019 [opt_a]: 0.00407451, [2] [Cycle 1]: 0.00299097, [45] [expand_dump_flag]: 3.81999e-06 [switch_simplify]: 4.595e-05 [loop_unroll]: 3.195e-05 [a_1]: 0.00081799 [with_stream_mark]: 2.246e-05 [recompute_prepare]: 1.189e-05 [updatestate_depend_eliminate]: 5.02999e-06 [updatestate_assign_eliminate]: 4.39002e-06 [updatestate_loads_eliminate]: 3.5e-06 [parameter_eliminate]: 2.12001e-06 [a_2]: 0.0001329 [accelerated_algorithm]: 8.85001e-06 [shard]: 2.04e-06 [meta_shard_fg_expand]: 2.66e-06 [shard_inline]: 8.54002e-06 [merge_send_recv]: 1.073e-05 [auto_parallel]: 9.47001e-06 [parallel]: 2.468e-05 [flash_sp]: 1.073e-05 [merge_comm]: 5.54e-06 [allreduce_fusion]: 4.74998e-06 [matmul_add_comm_reduction]: 1.206e-05 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 1.101e-05 [virtual_dataset]: 9.05999e-06 [get_grad_eliminate_]: 8.16002e-06 [virtual_output]: 8.84e-06 [merge_forward]: 5.29e-06 [cell_reuse_recompute_pass]: 1.85001e-06 [offload_activation]: 1.233e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.16e-05 [merge_recompute_call_nodes]: 1.57001e-06 [before_grad]: 1.54e-05 [set_forward_comm_id_for_comm_node_pass]: 5.07999e-06 [meta_fg_expand]: 4.05998e-06 [flash_sp_send_recv_attached]: 2.79001e-06 [receive_attached]: 2.68e-06 [after_resolve]: 1.426e-05 [a_after_grad]: 1.244e-05 [renormalize]: 0.00108001 [add_forward_monad_depend]: 8.3e-06 [auto_monad_grad]: 2.99001e-06 [auto_monad_eliminator]: 2.266e-05 [cse]: 4.037e-05 [a_3]: 7.841e-05 [Cycle 2]: 0.00106788, [45] [expand_dump_flag]: 2.24001e-06 [switch_simplify]: 9.41e-06 [loop_unroll]: 7.65e-06 [a_1]: 0.00018236 [with_stream_mark]: 1.877e-05 [recompute_prepare]: 8.90999e-06 [updatestate_depend_eliminate]: 4.75001e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 3.58e-06 [parameter_eliminate]: 1.59e-06 [a_2]: 0.00015061 [accelerated_algorithm]: 1.022e-05 [shard]: 2.71999e-06 [meta_shard_fg_expand]: 2.33998e-06 [shard_inline]: 7.7e-06 [merge_send_recv]: 1.067e-05 [auto_parallel]: 9.56e-06 [parallel]: 8.08999e-06 [flash_sp]: 4.82e-06 [merge_comm]: 4.85999e-06 [allreduce_fusion]: 4.55001e-06 [matmul_add_comm_reduction]: 1.015e-05 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 9.75002e-06 [virtual_dataset]: 9.98002e-06 [get_grad_eliminate_]: 8.27e-06 [virtual_output]: 7.51001e-06 [merge_forward]: 4.48001e-06 [cell_reuse_recompute_pass]: 3.42002e-06 [offload_activation]: 1.149e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.081e-05 [merge_recompute_call_nodes]: 1.58002e-06 [before_grad]: 1.392e-05 [set_forward_comm_id_for_comm_node_pass]: 5.34e-06 [meta_fg_expand]: 3.42002e-06 [flash_sp_send_recv_attached]: 1.68002e-06 [receive_attached]: 2.76999e-06 [after_resolve]: 1.546e-05 [a_after_grad]: 1.193e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.17001e-06 [auto_monad_grad]: 1.32999e-06 [auto_monad_eliminator]: 1.274e-05 [cse]: 2.747e-05 [a_3]: 6.129e-05 [py_interpret_to_execute_after_opt_a]: 2.114e-05 [slice_cell_reuse_recomputed_activation]: 5.99e-06 [rewriter_after_opt_a]: 5.816e-05 [convert_after_rewriter]: 1.163e-05 [order_py_execute_after_rewriter]: 9.09e-06 [mutable_eliminate]: 0.00075595 [opt_b]: 0.00033178, [1] [Cycle 1]: 0.00032002, [7] [b_1]: 0.00020239 [b_2]: 9.74e-06 [updatestate_depend_eliminate]: 9.02e-06 [updatestate_assign_eliminate]: 3.5e-06 [updatestate_loads_eliminate]: 3.76999e-06 [renormalize]: 8.10018e-07 [cse]: 3.025e-05 [optimize_parallel_all_gather_comm]: 2.675e-05 [overlap_param_gather]: 5.46e-06 [cconv]: 4.102e-05 [loop_unroll]: 0.0005352 [opt_after_cconv]: 0.0001658, [1] [Cycle 1]: 0.00015499, [7] [c_1]: 4.212e-05 [parameter_eliminate]: 5.12999e-06 [updatestate_depend_eliminate]: 7.29001e-06 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 3.7e-06 [cse]: 2.933e-05 [renormalize]: 9.49978e-07 [remove_dup_value]: 2.308e-05 [tuple_transform]: 0.00015664, [1] [Cycle 1]: 0.00014751, [4] [d_1]: 6.083e-05 [none_parameter_eliminate]: 3.278e-05 [renormalize]: 4.19997e-07 [switch_simplify]: 1.138e-05 [partial_unused_args_eliminate]: 6.03002e-06 [add_recomputation]: 7.334e-05 [cse_after_recomputation]: 3.959e-05, [1] [Cycle 1]: 3.151e-05, [1] [cse]: 2.072e-05 [environ_conv]: 1.112e-05 [swap_dp_allreduce_reducescatter]: 9.67999e-06 [bias_add_comm_swap]: 5.91e-06 [label_micro_interleaved_index]: 8.68001e-06 [label_fine_grained_interleaved_index]: 6.06e-06 [merge_cast_opt]: 4.50001e-06 [slice_recompute_activation]: 5.36002e-06 [micro_interleaved_order_control]: 5.00999e-06 [assign_add_opt]: 4.43999e-06 [ForceFp32Comm]: 3.85e-06 [remove_cast_before_assign_add]: 3.71001e-06 [full_micro_interleaved_order_control]: 5.35999e-06 [reorder_send_recv_between_fp_bp]: 6.17999e-06 [comm_op_add_attrs]: 3.77002e-06 [add_comm_op_reuse_tag]: 3.51001e-06 [interleave_split_concat_branches]: 3.78999e-06 [interleave_parallel_branches]: 3.46999e-06 [overlap_opt_shard_in_pipeline]: 3.76999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.77e-06 [control_data_broadcast_order]: 2.08e-05 [grouped_pairwise_exchange_alltoall]: 4.66002e-06 [offloading_packed_experts]: 6.81999e-06 [overlap_recompute_and_grad_model_parallel]: 7.97e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.86999e-06 [overlap_recompute_allgather_and_fa_grad]: 4.04002e-06 [overlap_recompute_comm]: 5.21002e-06 [overlap_grad_ring_attention]: 7.06999e-06 [overlap_grad_flash_sp]: 2.828e-05 [begin_end_overlap_inline]: 3.07002e-06 [split_matmul_comm_elemetwise]: 4.92e-06 [split_layernorm_comm]: 4.67e-06 [handle_group_info]: 3.45e-06 [symbol_engine_optimizer]: 0.0001193, [1] [Cycle 1]: 0.0001118, [6] [build]: 5.02e-06 [elim_shapecalc]: 1.489e-05 [elim_not_effective]: 1.809e-05 [opt_reshape]: 8.93002e-06 [fold_const_symbol]: 1.391e-05 [renormalize]: 2.10013e-07 [detach_backward]: 3.6e-06 [pipeline_parallel_scheduler]: 1.96e-06 [auto_monad_reorder]: 2.6e-05 [get_jit_bprop_graph]: 2.01e-06 [rewriter_after_jit_bprop_graph]: 5.39e-06 [opt_after_jit_grad]: 0.00062189 [validate]: 5.041e-05 Sums bootstrap : 0.000456s : 3.43% type_inference : 0.006217s : 46.72% event_method : 0.000021s : 0.15% auto_monad : 0.000074s : 0.56% graph_reusing : 0.000007s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000046s : 0.35% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000039s : 0.30% optimize.rewriter_before_opt_a : 0.000110s : 0.83% optimize.opt_a.expand_dump_flag : 0.000006s : 0.05% optimize.opt_a.switch_simplify : 0.000055s : 0.42% optimize.opt_a.loop_unroll : 0.000040s : 0.30% optimize.opt_a.a_1 : 0.001000s : 7.52% optimize.opt_a.with_stream_mark : 0.000041s : 0.31% optimize.opt_a.recompute_prepare : 0.000021s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000284s : 2.13% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.14% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.12% optimize.opt_a.merge_send_recv : 0.000021s : 0.16% optimize.opt_a.auto_parallel : 0.000019s : 0.14% optimize.opt_a.parallel : 0.000033s : 0.25% optimize.opt_a.flash_sp : 0.000016s : 0.12% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.16% optimize.opt_a.virtual_dataset : 0.000019s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.12% optimize.opt_a.virtual_output : 0.000016s : 0.12% optimize.opt_a.merge_forward : 0.000010s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.04% optimize.opt_a.offload_activation : 0.000024s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000042s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000029s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000030s : 0.22% optimize.opt_a.a_after_grad : 0.000024s : 0.18% optimize.opt_a.renormalize : 0.001080s : 8.12% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.27% optimize.opt_a.cse : 0.000068s : 0.51% optimize.opt_a.a_3 : 0.000140s : 1.05% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.16% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.05% optimize.rewriter_after_opt_a : 0.000058s : 0.44% optimize.convert_after_rewriter : 0.000012s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000756s : 5.68% optimize.opt_b.b_1 : 0.000202s : 1.52% optimize.opt_b.b_2 : 0.000010s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000030s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000027s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000041s : 0.31% optimize.loop_unroll : 0.000535s : 4.02% optimize.opt_after_cconv.c_1 : 0.000042s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000029s : 0.22% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000023s : 0.17% optimize.tuple_transform.d_1 : 0.000061s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000033s : 0.25% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.09% optimize.partial_unused_args_eliminate : 0.000006s : 0.05% optimize.add_recomputation : 0.000073s : 0.55% optimize.cse_after_recomputation.cse : 0.000021s : 0.16% optimize.environ_conv : 0.000011s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.04% optimize.label_micro_interleaved_index : 0.000009s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000005s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000021s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000028s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000005s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000005s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000026s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000622s : 4.67% validate : 0.000050s : 0.38% Time group info: ------[substitution.] 0.000264 38 10.77% : 0.000028s : 3: substitution.cast_eliminate 1.10% : 0.000003s : 3: substitution.elim_not_effective 0.67% : 0.000002s : 3: substitution.fold_const_symbol 3.08% : 0.000008s : 5: substitution.graph_param_transform 70.34% : 0.000186s : 4: substitution.inline 2.50% : 0.000007s : 6: substitution.j_node_and_user_rematch 2.77% : 0.000007s : 6: substitution.remove_not_recompute_node 2.52% : 0.000007s : 4: substitution.replace_old_param 6.24% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006157 2 86.36% : 0.005317s : 1: type_inference.infer 13.64% : 0.000840s : 1: type_inference.specialize ------[replace.] 0.000080 8 65.57% : 0.000053s : 4: replace.inline 34.43% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000197 8 92.90% : 0.000183s : 4: match.inline 7.10% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000275 1596 0.94% : 0.000003s : 17: predicate.accumulaten_eliminater 0.92% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.57% : 0.000002s : 10: predicate.addn_check_dump 0.99% : 0.000003s : 17: predicate.addn_zero_filter 0.92% : 0.000003s : 17: predicate.adjust_all_reduce_mul_add 2.02% : 0.000006s : 27: predicate.arithmetic_simplify 0.95% : 0.000003s : 17: predicate.cast_eliminate 0.58% : 0.000002s : 10: predicate.check_bprop_eliminate 0.55% : 0.000002s : 10: predicate.compare_switch_simplify 0.19% : 0.000001s : 5: predicate.const_output_eliminate 0.58% : 0.000002s : 10: predicate.depend_value_elim 0.93% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.96% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.97% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.16% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.28% : 0.000001s : 5: predicate.elim_not_effective 0.45% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.26% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.06% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 22: predicate.environ_get_depend_swap 1.68% : 0.000005s : 32: predicate.environ_get_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.34% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.30% : 0.000006s : 25: predicate.float_depend_g_call 0.64% : 0.000002s : 10: predicate.float_environ_get_switch 0.79% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.60% : 0.000002s : 10: predicate.get_grad_eliminate 0.18% : 0.000000s : 5: predicate.graph_param_transform 0.64% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.41% : 0.000018s : 72: predicate.inline 0.74% : 0.000002s : 10: predicate.inline_without_move 0.28% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.15% : 0.000003s : 10: predicate.less_batch_normalization 1.89% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.43% : 0.000007s : 48: predicate.load_eliminater 1.00% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.89% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.78% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 10: predicate.merge_addn 0.56% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.61% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 17: predicate.minmaximum_grad 1.34% : 0.000004s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.34% : 0.000001s : 5: predicate.parallel_virtual_node 1.66% : 0.000005s : 25: predicate.partial_defer_inline 1.58% : 0.000004s : 26: predicate.partial_eliminate 0.89% : 0.000002s : 17: predicate.print_const_string_wrapper 0.64% : 0.000002s : 10: predicate.reduce_all_const_elim 1.28% : 0.000004s : 17: predicate.reduce_eliminate 2.46% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 10: predicate.remove_not_recompute_node 1.33% : 0.000004s : 31: predicate.replace_applicator 0.61% : 0.000002s : 10: predicate.replace_old_param 0.40% : 0.000001s : 5: predicate.reset_defer_inline 0.91% : 0.000002s : 17: predicate.reshape_eliminate 0.60% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 5: predicate.row_tensor_eliminate 0.88% : 0.000002s : 10: predicate.same_eliminate 0.38% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.91% : 0.000003s : 10: predicate.shard_identity_eliminate 0.76% : 0.000002s : 10: predicate.special_op_eliminate 0.77% : 0.000002s : 10: predicate.specialize_transform 1.00% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.45% : 0.000004s : 25: predicate.switch_defer_inline 1.98% : 0.000005s : 35: predicate.switch_layer_defer_inline 5.05% : 0.000014s : 76: predicate.switch_simplify 0.96% : 0.000003s : 17: predicate.tile_eliminate 1.00% : 0.000003s : 17: predicate.transpose_eliminate 1.52% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.25% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.51% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.73% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.35% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.12% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 5: predicate.value_based_eliminate 0.73% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.73% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000656 11 52.52% : 0.000345s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.48% : 0.000311s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.368577 192 0.00% : 0.000007s : 1: ForceFp32Comm 46.59% : 0.171715s : 1: add_attr 46.58% : 0.171693s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.02% : 0.000078s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.02% : 0.000083s : 1: auto_monad 0.01% : 0.000034s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.14% : 0.000500s : 1: bootstrap 0.01% : 0.000044s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000024s : 1: control_data_broadcast_order 0.00% : 0.000015s : 1: convert_after_rewriter 0.01% : 0.000043s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000023s : 1: detach_backward 0.00% : 0.000014s : 1: environ_conv 0.01% : 0.000032s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000009s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.15% : 0.000543s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.21% : 0.000764s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000022s : 1: opt.transform.mutable_eliminate 0.43% : 0.001578s : 78: opt.transform.opt_a 0.01% : 0.000040s : 1: opt.transform.opt_after_cconv 0.01% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000138s : 28: opt.transform.opt_b 0.02% : 0.000069s : 2: opt.transform.opt_trans_graph 0.01% : 0.000052s : 4: opt.transform.symbol_engine_opt 1.11% : 0.004078s : 1: opt_a 0.05% : 0.000170s : 1: opt_after_cconv 0.17% : 0.000634s : 1: opt_after_jit_grad 0.09% : 0.000336s : 1: opt_b 2.02% : 0.007430s : 1: optimize 0.01% : 0.000031s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000012s : 1: order_py_execute_after_rewriter 0.01% : 0.000032s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000011s : 1: overlap_grad_ring_attention 0.00% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000009s : 1: partial_unused_args_eliminate 0.00% : 0.000009s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.01% : 0.000054s : 1: pre_auto_parallel 0.01% : 0.000043s : 1: py_interpret_to_execute 0.01% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000008s : 1: remove_cast_before_assign_add 0.01% : 0.000026s : 1: remove_dup_value 0.17% : 0.000629s : 1: renormalize.infer 0.12% : 0.000439s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000062s : 1: rewriter_after_opt_a 0.03% : 0.000114s : 1: rewriter_before_opt_a 0.00% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.00% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000122s : 1: symbol_engine_optimizer 0.04% : 0.000160s : 1: tuple_transform 1.70% : 0.006267s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:17.653.094 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.224169, [21] [bootstrap]: 0.00042629 [type_inference]: 0.21202 [event_method]: 2.422e-05 [auto_monad]: 7.746e-05 [graph_reusing]: 5.91998e-06 [inline]: 2.93e-06 [add_attr]: 0.0040778, [1] [add_attr_with_inline]: 0.00406421, [1] [Cycle 1]: 8.31e-05, [2] [tag_attr]: 2.884e-05 [meta_addattr_fg_expand]: 6.48e-06 [parallel-infer-symbol]: 3.69002e-06 [pre_auto_parallel]: 4.469e-05 [insert-virtual-dataset]: 2.61e-06 [parallel-infer-symbol-second]: 1.07e-06 [dataset_repeat_opt]: 1.91e-06 [pipeline_split]: 2.06e-06 [optimize]: 0.0066284, [53] [py_interpret_to_execute]: 3.694e-05 [rewriter_before_opt_a]: 0.00010359 [opt_a]: 0.00380246, [2] [Cycle 1]: 0.00293902, [45] [expand_dump_flag]: 3.12002e-06 [switch_simplify]: 4.576e-05 [loop_unroll]: 3.216e-05 [a_1]: 0.00093138 [with_stream_mark]: 2.595e-05 [recompute_prepare]: 1.241e-05 [updatestate_depend_eliminate]: 5.99e-06 [updatestate_assign_eliminate]: 3.95e-06 [updatestate_loads_eliminate]: 4.23999e-06 [parameter_eliminate]: 2.06e-06 [a_2]: 0.00010607 [accelerated_algorithm]: 8.92999e-06 [shard]: 2.47001e-06 [meta_shard_fg_expand]: 2.54001e-06 [shard_inline]: 8.2e-06 [merge_send_recv]: 1.06e-05 [auto_parallel]: 9.29e-06 [parallel]: 2.222e-05 [flash_sp]: 1.104e-05 [merge_comm]: 5.31002e-06 [allreduce_fusion]: 4.42e-06 [matmul_add_comm_reduction]: 1.101e-05 [allreduce_slice_to_reducescatter]: 9.00007e-07 [virtual_shard_identity]: 1.081e-05 [virtual_dataset]: 8.40999e-06 [get_grad_eliminate_]: 8.95999e-06 [virtual_output]: 8.23999e-06 [merge_forward]: 4.76002e-06 [cell_reuse_recompute_pass]: 1.67001e-06 [offload_activation]: 1.184e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.77e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.506e-05 [set_forward_comm_id_for_comm_node_pass]: 5.54e-06 [meta_fg_expand]: 3.85e-06 [flash_sp_send_recv_attached]: 2.76999e-06 [receive_attached]: 2.43002e-06 [after_resolve]: 1.527e-05 [a_after_grad]: 1.434e-05 [renormalize]: 0.00110733 [add_forward_monad_depend]: 7.56001e-06 [auto_monad_grad]: 3.24001e-06 [auto_monad_eliminator]: 2.163e-05 [cse]: 4.383e-05 [a_3]: 6.703e-05 [Cycle 2]: 0.00085053, [45] [expand_dump_flag]: 2.32999e-06 [switch_simplify]: 1.045e-05 [loop_unroll]: 8.23001e-06 [a_1]: 0.00019338 [with_stream_mark]: 1.957e-05 [recompute_prepare]: 8.18999e-06 [updatestate_depend_eliminate]: 4.53001e-06 [updatestate_assign_eliminate]: 3.26999e-06 [updatestate_loads_eliminate]: 3.04999e-06 [parameter_eliminate]: 2.30002e-06 [a_2]: 9.498e-05 [accelerated_algorithm]: 8.12e-06 [shard]: 2.32999e-06 [meta_shard_fg_expand]: 2.22001e-06 [shard_inline]: 7.3e-06 [merge_send_recv]: 1.048e-05 [auto_parallel]: 9.10999e-06 [parallel]: 7.95998e-06 [flash_sp]: 3.31999e-06 [merge_comm]: 5.05001e-06 [allreduce_fusion]: 4.79e-06 [matmul_add_comm_reduction]: 9.54e-06 [allreduce_slice_to_reducescatter]: 1.00999e-06 [virtual_shard_identity]: 9.65002e-06 [virtual_dataset]: 8.14002e-06 [get_grad_eliminate_]: 7.91001e-06 [virtual_output]: 7.31999e-06 [merge_forward]: 4.72e-06 [cell_reuse_recompute_pass]: 2.93e-06 [offload_activation]: 1.113e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.662e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.307e-05 [set_forward_comm_id_for_comm_node_pass]: 4.58001e-06 [meta_fg_expand]: 3.38e-06 [flash_sp_send_recv_attached]: 1.36998e-06 [receive_attached]: 2.53e-06 [after_resolve]: 1.538e-05 [a_after_grad]: 1.173e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.92001e-06 [auto_monad_grad]: 1.62001e-06 [auto_monad_eliminator]: 1.218e-05 [cse]: 2.42e-05 [a_3]: 4.676e-05 [py_interpret_to_execute_after_opt_a]: 1.831e-05 [slice_cell_reuse_recomputed_activation]: 2.37999e-06 [rewriter_after_opt_a]: 4.928e-05 [convert_after_rewriter]: 8.78001e-06 [order_py_execute_after_rewriter]: 6.55002e-06 [mutable_eliminate]: 0.00083818 [opt_b]: 0.00027708, [1] [Cycle 1]: 0.00026723, [7] [b_1]: 0.00016463 [b_2]: 1.06e-05 [updatestate_depend_eliminate]: 1.103e-05 [updatestate_assign_eliminate]: 3.04001e-06 [updatestate_loads_eliminate]: 3.81999e-06 [renormalize]: 6.40022e-07 [cse]: 3.296e-05 [optimize_parallel_all_gather_comm]: 2.063e-05 [overlap_param_gather]: 2.08002e-06 [cconv]: 3.784e-05 [loop_unroll]: 0.00058148 [opt_after_cconv]: 0.000132, [1] [Cycle 1]: 0.00012521, [7] [c_1]: 3.957e-05 [parameter_eliminate]: 5.19998e-06 [updatestate_depend_eliminate]: 7.68001e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 2.93e-06 [cse]: 3.002e-05 [renormalize]: 6.39993e-07 [remove_dup_value]: 2.002e-05 [tuple_transform]: 9.57e-05, [1] [Cycle 1]: 9.046e-05, [4] [d_1]: 5.923e-05 [none_parameter_eliminate]: 1.86e-06 [renormalize]: 3.29979e-07 [switch_simplify]: 9.11998e-06 [partial_unused_args_eliminate]: 2.13002e-06 [add_recomputation]: 6.661e-05 [cse_after_recomputation]: 2.796e-05, [1] [Cycle 1]: 2.241e-05, [1] [cse]: 1.64e-05 [environ_conv]: 8.72e-06 [swap_dp_allreduce_reducescatter]: 6.30002e-06 [bias_add_comm_swap]: 3.75e-06 [label_micro_interleaved_index]: 5.51e-06 [label_fine_grained_interleaved_index]: 3.23e-06 [merge_cast_opt]: 1.43002e-06 [slice_recompute_activation]: 2.38002e-06 [micro_interleaved_order_control]: 2.61e-06 [assign_add_opt]: 1.45999e-06 [ForceFp32Comm]: 9.49978e-07 [remove_cast_before_assign_add]: 1.27e-06 [full_micro_interleaved_order_control]: 2.56e-06 [reorder_send_recv_between_fp_bp]: 2.80002e-06 [comm_op_add_attrs]: 1.05001e-06 [add_comm_op_reuse_tag]: 1.17999e-06 [interleave_split_concat_branches]: 1.20999e-06 [interleave_parallel_branches]: 1.05999e-06 [overlap_opt_shard_in_pipeline]: 1.42e-06 [overlap_opt_shard_grad_in_pipeline]: 1.89999e-06 [control_data_broadcast_order]: 1.65e-05 [grouped_pairwise_exchange_alltoall]: 1.69e-06 [offloading_packed_experts]: 4.37e-06 [overlap_recompute_and_grad_model_parallel]: 5.94e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.24e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32999e-06 [overlap_recompute_comm]: 2.07999e-06 [overlap_grad_ring_attention]: 4.45999e-06 [overlap_grad_flash_sp]: 2.536e-05 [begin_end_overlap_inline]: 6.39993e-07 [split_matmul_comm_elemetwise]: 2.11998e-06 [split_layernorm_comm]: 1.83002e-06 [handle_group_info]: 9.80013e-07 [symbol_engine_optimizer]: 9.213e-05, [1] [Cycle 1]: 8.743e-05, [6] [build]: 4.55999e-06 [elim_shapecalc]: 1.365e-05 [elim_not_effective]: 1.651e-05 [opt_reshape]: 8.82999e-06 [fold_const_symbol]: 1.274e-05 [renormalize]: 2.00002e-07 [detach_backward]: 2.68e-06 [pipeline_parallel_scheduler]: 1.62001e-06 [auto_monad_reorder]: 2.151e-05 [get_jit_bprop_graph]: 2.01e-06 [rewriter_after_jit_bprop_graph]: 6.52001e-06 [opt_after_jit_grad]: 0.00058696 [validate]: 5.139e-05 Sums bootstrap : 0.000426s : 0.19% type_inference : 0.212020s : 96.82% event_method : 0.000024s : 0.01% auto_monad : 0.000077s : 0.04% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000029s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000045s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000037s : 0.02% optimize.rewriter_before_opt_a : 0.000104s : 0.05% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000056s : 0.03% optimize.opt_a.loop_unroll : 0.000040s : 0.02% optimize.opt_a.a_1 : 0.001125s : 0.51% optimize.opt_a.with_stream_mark : 0.000046s : 0.02% optimize.opt_a.recompute_prepare : 0.000021s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000201s : 0.09% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000016s : 0.01% optimize.opt_a.merge_send_recv : 0.000021s : 0.01% optimize.opt_a.auto_parallel : 0.000018s : 0.01% optimize.opt_a.parallel : 0.000030s : 0.01% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000010s : 0.00% optimize.opt_a.allreduce_fusion : 0.000009s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.01% optimize.opt_a.virtual_dataset : 0.000017s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.01% optimize.opt_a.virtual_output : 0.000016s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000028s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000031s : 0.01% optimize.opt_a.a_after_grad : 0.000026s : 0.01% optimize.opt_a.renormalize : 0.001107s : 0.51% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.00% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.02% optimize.opt_a.cse : 0.000068s : 0.03% optimize.opt_a.a_3 : 0.000114s : 0.05% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000049s : 0.02% optimize.convert_after_rewriter : 0.000009s : 0.00% optimize.order_py_execute_after_rewriter : 0.000007s : 0.00% optimize.mutable_eliminate : 0.000838s : 0.38% optimize.opt_b.b_1 : 0.000165s : 0.08% optimize.opt_b.b_2 : 0.000011s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000033s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000038s : 0.02% optimize.loop_unroll : 0.000581s : 0.27% optimize.opt_after_cconv.c_1 : 0.000040s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000030s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.01% optimize.tuple_transform.d_1 : 0.000059s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000067s : 0.03% optimize.cse_after_recomputation.cse : 0.000016s : 0.01% optimize.environ_conv : 0.000009s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000016s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000025s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000587s : 0.27% validate : 0.000051s : 0.02% Time group info: ------[substitution.] 0.000265 38 11.95% : 0.000032s : 3: substitution.cast_eliminate 0.92% : 0.000002s : 3: substitution.elim_not_effective 0.66% : 0.000002s : 3: substitution.fold_const_symbol 2.72% : 0.000007s : 5: substitution.graph_param_transform 68.81% : 0.000182s : 4: substitution.inline 2.13% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.84% : 0.000008s : 6: substitution.remove_not_recompute_node 2.89% : 0.000008s : 4: substitution.replace_old_param 7.08% : 0.000019s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.211926 2 99.50% : 0.210875s : 1: type_inference.infer 0.50% : 0.001052s : 1: type_inference.specialize ------[replace.] 0.000077 8 54.54% : 0.000042s : 4: replace.inline 45.46% : 0.000035s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000196 8 91.58% : 0.000180s : 4: match.inline 8.42% : 0.000017s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000279 1596 0.95% : 0.000003s : 17: predicate.accumulaten_eliminater 0.79% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.57% : 0.000002s : 10: predicate.addn_check_dump 0.99% : 0.000003s : 17: predicate.addn_zero_filter 0.81% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.14% : 0.000006s : 27: predicate.arithmetic_simplify 1.16% : 0.000003s : 17: predicate.cast_eliminate 0.61% : 0.000002s : 10: predicate.check_bprop_eliminate 0.62% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.67% : 0.000002s : 10: predicate.depend_value_elim 0.96% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.03% : 0.000003s : 17: predicate.dict_get_item_eliminator 1.02% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.87% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 5: predicate.elim_not_effective 0.36% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 22: predicate.environ_get_depend_swap 1.62% : 0.000005s : 32: predicate.environ_get_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.37% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.20% : 0.000006s : 25: predicate.float_depend_g_call 0.57% : 0.000002s : 10: predicate.float_environ_get_switch 0.77% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.76% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000002s : 10: predicate.incorporate_call 0.45% : 0.000001s : 10: predicate.incorporate_call_switch 5.96% : 0.000017s : 72: predicate.inline 0.69% : 0.000002s : 10: predicate.inline_without_move 0.27% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.91% : 0.000003s : 10: predicate.less_batch_normalization 1.74% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 3.06% : 0.000009s : 48: predicate.load_eliminater 0.82% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.17% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.69% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.59% : 0.000002s : 10: predicate.merge_addn 0.52% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 17: predicate.minmaximum_grad 1.21% : 0.000003s : 5: predicate.mutable_eliminate 0.32% : 0.000001s : 5: predicate.opt_reshape 0.50% : 0.000001s : 5: predicate.parallel_virtual_node 1.62% : 0.000005s : 25: predicate.partial_defer_inline 1.68% : 0.000005s : 26: predicate.partial_eliminate 1.02% : 0.000003s : 17: predicate.print_const_string_wrapper 0.56% : 0.000002s : 10: predicate.reduce_all_const_elim 1.50% : 0.000004s : 17: predicate.reduce_eliminate 2.60% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 10: predicate.remove_not_recompute_node 1.31% : 0.000004s : 31: predicate.replace_applicator 0.65% : 0.000002s : 10: predicate.replace_old_param 0.22% : 0.000001s : 5: predicate.reset_defer_inline 1.06% : 0.000003s : 17: predicate.reshape_eliminate 0.66% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 5: predicate.row_tensor_eliminate 0.75% : 0.000002s : 10: predicate.same_eliminate 0.56% : 0.000002s : 10: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 10: predicate.shard_identity_eliminate 0.70% : 0.000002s : 10: predicate.special_op_eliminate 0.64% : 0.000002s : 10: predicate.specialize_transform 1.04% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.36% : 0.000004s : 25: predicate.switch_defer_inline 1.88% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.49% : 0.000013s : 76: predicate.switch_simplify 0.89% : 0.000002s : 17: predicate.tile_eliminate 1.00% : 0.000003s : 17: predicate.transpose_eliminate 1.62% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.62% : 0.000005s : 27: predicate.tuple_list_get_item_depend_reorder 3.48% : 0.000010s : 41: predicate.tuple_list_get_item_eliminator 1.60% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.33% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.70% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.43% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.08% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 5: predicate.value_based_eliminate 0.79% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000763 11 46.45% : 0.000354s : 5: func_graph_cloner_run.FuncGraphClonerGraph 53.55% : 0.000408s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.237855 192 0.02% : 0.000040s : 1: ForceFp32Comm 1.72% : 0.004085s : 1: add_attr 1.71% : 0.004069s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000071s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.03% : 0.000083s : 1: auto_monad 0.01% : 0.000026s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.19% : 0.000457s : 1: bootstrap 0.02% : 0.000042s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000020s : 1: control_data_broadcast_order 0.01% : 0.000012s : 1: convert_after_rewriter 0.01% : 0.000031s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.01% : 0.000032s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000009s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.25% : 0.000593s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.36% : 0.000851s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000023s : 1: opt.transform.mutable_eliminate 0.70% : 0.001672s : 78: opt.transform.opt_a 0.02% : 0.000038s : 1: opt.transform.opt_after_cconv 0.01% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000139s : 28: opt.transform.opt_b 0.03% : 0.000065s : 2: opt.transform.opt_trans_graph 0.02% : 0.000048s : 4: opt.transform.symbol_engine_opt 1.60% : 0.003807s : 1: opt_a 0.06% : 0.000136s : 1: opt_after_cconv 0.25% : 0.000598s : 1: opt_after_jit_grad 0.12% : 0.000281s : 1: opt_b 2.79% : 0.006635s : 1: optimize 0.01% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000010s : 1: order_py_execute_after_rewriter 0.01% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000049s : 1: pre_auto_parallel 0.02% : 0.000041s : 1: py_interpret_to_execute 0.01% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000009s : 1: remove_cast_before_assign_add 0.01% : 0.000024s : 1: remove_dup_value 0.27% : 0.000637s : 1: renormalize.infer 0.19% : 0.000458s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000054s : 1: rewriter_after_opt_a 0.05% : 0.000108s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000095s : 1: symbol_engine_optimizer 0.04% : 0.000099s : 1: tuple_transform 89.15% : 0.212047s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:19.558.995 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:19.559.261 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.253634, [21] [bootstrap]: 0.00043677 [type_inference]: 0.240107 [event_method]: 2.735e-05 [auto_monad]: 8.201e-05 [graph_reusing]: 6.88998e-06 [inline]: 2.37999e-06 [add_attr]: 0.00398521, [1] [add_attr_with_inline]: 0.00397145, [1] [Cycle 1]: 0.00010996, [2] [tag_attr]: 3.134e-05 [meta_addattr_fg_expand]: 6.43e-06 [parallel-infer-symbol]: 3.81001e-06 [pre_auto_parallel]: 4.529e-05 [insert-virtual-dataset]: 2.56e-06 [parallel-infer-symbol-second]: 8.80013e-07 [dataset_repeat_opt]: 2.39999e-06 [pipeline_split]: 1.97999e-06 [optimize]: 0.00752316, [53] [py_interpret_to_execute]: 4.269e-05 [rewriter_before_opt_a]: 0.00011155 [opt_a]: 0.00430109, [2] [Cycle 1]: 0.00317184, [45] [expand_dump_flag]: 3.37002e-06 [switch_simplify]: 4.729e-05 [loop_unroll]: 3.303e-05 [a_1]: 0.00077474 [with_stream_mark]: 2.85e-05 [recompute_prepare]: 1.592e-05 [updatestate_depend_eliminate]: 6.21998e-06 [updatestate_assign_eliminate]: 4.60001e-06 [updatestate_loads_eliminate]: 3.78001e-06 [parameter_eliminate]: 2.87002e-06 [a_2]: 0.00014575 [accelerated_algorithm]: 1.22e-05 [shard]: 2.83e-06 [meta_shard_fg_expand]: 3.73001e-06 [shard_inline]: 9.25999e-06 [merge_send_recv]: 1.175e-05 [auto_parallel]: 1.197e-05 [parallel]: 2.345e-05 [flash_sp]: 1.327e-05 [merge_comm]: 5.76e-06 [allreduce_fusion]: 4.2e-06 [matmul_add_comm_reduction]: 1.35e-05 [allreduce_slice_to_reducescatter]: 1.17e-06 [virtual_shard_identity]: 1.539e-05 [virtual_dataset]: 9.56e-06 [get_grad_eliminate_]: 8.94e-06 [virtual_output]: 8.65999e-06 [merge_forward]: 5.19998e-06 [cell_reuse_recompute_pass]: 2.64999e-06 [offload_activation]: 1.284e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.649e-05 [merge_recompute_call_nodes]: 1.73002e-06 [before_grad]: 1.672e-05 [set_forward_comm_id_for_comm_node_pass]: 5.80002e-06 [meta_fg_expand]: 3.83999e-06 [flash_sp_send_recv_attached]: 3.83001e-06 [receive_attached]: 2.43e-06 [after_resolve]: 1.715e-05 [a_after_grad]: 1.425e-05 [renormalize]: 0.0011782 [add_forward_monad_depend]: 8.98002e-06 [auto_monad_grad]: 2.64001e-06 [auto_monad_eliminator]: 2.429e-05 [cse]: 4.134e-05 [a_3]: 8.939e-05 [Cycle 2]: 0.00110982, [45] [expand_dump_flag]: 3.37997e-06 [switch_simplify]: 1.013e-05 [loop_unroll]: 8.95999e-06 [a_1]: 0.00019567 [with_stream_mark]: 2.278e-05 [recompute_prepare]: 8.85999e-06 [updatestate_depend_eliminate]: 5.00001e-06 [updatestate_assign_eliminate]: 4.03001e-06 [updatestate_loads_eliminate]: 3.55e-06 [parameter_eliminate]: 2.22001e-06 [a_2]: 0.0001273 [accelerated_algorithm]: 8.60001e-06 [shard]: 3.03e-06 [meta_shard_fg_expand]: 2.81e-06 [shard_inline]: 9.10999e-06 [merge_send_recv]: 9.89001e-06 [auto_parallel]: 1.177e-05 [parallel]: 1.103e-05 [flash_sp]: 4.95001e-06 [merge_comm]: 5.37001e-06 [allreduce_fusion]: 4.63999e-06 [matmul_add_comm_reduction]: 1.295e-05 [allreduce_slice_to_reducescatter]: 1.02e-06 [virtual_shard_identity]: 1.248e-05 [virtual_dataset]: 7.81001e-06 [get_grad_eliminate_]: 8.56002e-06 [virtual_output]: 7.26999e-06 [merge_forward]: 5.47001e-06 [cell_reuse_recompute_pass]: 2.72001e-06 [offload_activation]: 1.14e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.127e-05 [merge_recompute_call_nodes]: 1.35999e-06 [before_grad]: 1.427e-05 [set_forward_comm_id_for_comm_node_pass]: 4.81002e-06 [meta_fg_expand]: 4.08001e-06 [flash_sp_send_recv_attached]: 1.53002e-06 [receive_attached]: 2.65002e-06 [after_resolve]: 1.541e-05 [a_after_grad]: 1.253e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 3.11999e-06 [auto_monad_grad]: 1.99999e-06 [auto_monad_eliminator]: 1.445e-05 [cse]: 2.885e-05 [a_3]: 6.639e-05 [py_interpret_to_execute_after_opt_a]: 2.341e-05 [slice_cell_reuse_recomputed_activation]: 5.81e-06 [rewriter_after_opt_a]: 5.872e-05 [convert_after_rewriter]: 1.389e-05 [order_py_execute_after_rewriter]: 1.036e-05 [mutable_eliminate]: 0.00092298 [opt_b]: 0.00037743, [1] [Cycle 1]: 0.00036272, [7] [b_1]: 0.00022648 [b_2]: 1.18e-05 [updatestate_depend_eliminate]: 1.247e-05 [updatestate_assign_eliminate]: 3.93999e-06 [updatestate_loads_eliminate]: 3.65e-06 [renormalize]: 9.10019e-07 [cse]: 3.99e-05 [optimize_parallel_all_gather_comm]: 2.973e-05 [overlap_param_gather]: 5.91998e-06 [cconv]: 4.646e-05 [loop_unroll]: 0.00059193 [opt_after_cconv]: 0.00015999, [1] [Cycle 1]: 0.00015008, [7] [c_1]: 4.014e-05 [parameter_eliminate]: 5.56e-06 [updatestate_depend_eliminate]: 8.37e-06 [updatestate_assign_eliminate]: 3.59002e-06 [updatestate_loads_eliminate]: 3.01001e-06 [cse]: 2.81e-05 [renormalize]: 9.60019e-07 [remove_dup_value]: 2.301e-05 [tuple_transform]: 0.00011649, [1] [Cycle 1]: 0.00010855, [4] [d_1]: 6.247e-05 [none_parameter_eliminate]: 1.85001e-06 [renormalize]: 3.70026e-07 [switch_simplify]: 9.45001e-06 [partial_unused_args_eliminate]: 4.4e-06 [add_recomputation]: 6.825e-05 [cse_after_recomputation]: 3.331e-05, [1] [Cycle 1]: 2.54e-05, [1] [cse]: 1.587e-05 [environ_conv]: 1.095e-05 [swap_dp_allreduce_reducescatter]: 9.22001e-06 [bias_add_comm_swap]: 5.89e-06 [label_micro_interleaved_index]: 8.18001e-06 [label_fine_grained_interleaved_index]: 5.46e-06 [merge_cast_opt]: 3.92002e-06 [slice_recompute_activation]: 4.97e-06 [micro_interleaved_order_control]: 5.23002e-06 [assign_add_opt]: 3.71001e-06 [ForceFp32Comm]: 3.6e-06 [remove_cast_before_assign_add]: 4.19002e-06 [full_micro_interleaved_order_control]: 4.80001e-06 [reorder_send_recv_between_fp_bp]: 6.83e-06 [comm_op_add_attrs]: 4.67e-06 [add_comm_op_reuse_tag]: 3.60998e-06 [interleave_split_concat_branches]: 3.81001e-06 [interleave_parallel_branches]: 3.76001e-06 [overlap_opt_shard_in_pipeline]: 4.02e-06 [overlap_opt_shard_grad_in_pipeline]: 4.39002e-06 [control_data_broadcast_order]: 2.167e-05 [grouped_pairwise_exchange_alltoall]: 4.08001e-06 [offloading_packed_experts]: 7.65e-06 [overlap_recompute_and_grad_model_parallel]: 9.12001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.95e-06 [overlap_recompute_allgather_and_fa_grad]: 3.68999e-06 [overlap_recompute_comm]: 5.25001e-06 [overlap_grad_ring_attention]: 8.28999e-06 [overlap_grad_flash_sp]: 2.896e-05 [begin_end_overlap_inline]: 3.12002e-06 [split_matmul_comm_elemetwise]: 4.70001e-06 [split_layernorm_comm]: 4.75999e-06 [handle_group_info]: 3.97e-06 [symbol_engine_optimizer]: 0.00012776, [1] [Cycle 1]: 0.0001201, [6] [build]: 4.85001e-06 [elim_shapecalc]: 1.556e-05 [elim_not_effective]: 1.957e-05 [opt_reshape]: 1.112e-05 [fold_const_symbol]: 1.414e-05 [renormalize]: 2.00002e-07 [detach_backward]: 4.58001e-06 [pipeline_parallel_scheduler]: 1.96e-06 [auto_monad_reorder]: 2.831e-05 [get_jit_bprop_graph]: 2.23002e-06 [rewriter_after_jit_bprop_graph]: 7.42002e-06 [opt_after_jit_grad]: 0.00060693 [validate]: 5.182e-05 Sums bootstrap : 0.000437s : 0.18% type_inference : 0.240107s : 96.98% event_method : 0.000027s : 0.01% auto_monad : 0.000082s : 0.03% graph_reusing : 0.000007s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000031s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000045s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000043s : 0.02% optimize.rewriter_before_opt_a : 0.000112s : 0.05% optimize.opt_a.expand_dump_flag : 0.000007s : 0.00% optimize.opt_a.switch_simplify : 0.000057s : 0.02% optimize.opt_a.loop_unroll : 0.000042s : 0.02% optimize.opt_a.a_1 : 0.000970s : 0.39% optimize.opt_a.with_stream_mark : 0.000051s : 0.02% optimize.opt_a.recompute_prepare : 0.000025s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000273s : 0.11% optimize.opt_a.accelerated_algorithm : 0.000021s : 0.01% optimize.opt_a.shard : 0.000006s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000007s : 0.00% optimize.opt_a.shard_inline : 0.000018s : 0.01% optimize.opt_a.merge_send_recv : 0.000022s : 0.01% optimize.opt_a.auto_parallel : 0.000024s : 0.01% optimize.opt_a.parallel : 0.000034s : 0.01% optimize.opt_a.flash_sp : 0.000018s : 0.01% optimize.opt_a.merge_comm : 0.000011s : 0.00% optimize.opt_a.allreduce_fusion : 0.000009s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000028s : 0.01% optimize.opt_a.virtual_dataset : 0.000017s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.01% optimize.opt_a.virtual_output : 0.000016s : 0.01% optimize.opt_a.merge_forward : 0.000011s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000024s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000048s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000031s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.00% optimize.opt_a.meta_fg_expand : 0.000008s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000033s : 0.01% optimize.opt_a.a_after_grad : 0.000027s : 0.01% optimize.opt_a.renormalize : 0.001178s : 0.48% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.00% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000039s : 0.02% optimize.opt_a.cse : 0.000070s : 0.03% optimize.opt_a.a_3 : 0.000156s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000023s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.00% optimize.rewriter_after_opt_a : 0.000059s : 0.02% optimize.convert_after_rewriter : 0.000014s : 0.01% optimize.order_py_execute_after_rewriter : 0.000010s : 0.00% optimize.mutable_eliminate : 0.000923s : 0.37% optimize.opt_b.b_1 : 0.000226s : 0.09% optimize.opt_b.b_2 : 0.000012s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000040s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000030s : 0.01% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.000046s : 0.02% optimize.loop_unroll : 0.000592s : 0.24% optimize.opt_after_cconv.c_1 : 0.000040s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000028s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000023s : 0.01% optimize.tuple_transform.d_1 : 0.000062s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.00% optimize.partial_unused_args_eliminate : 0.000004s : 0.00% optimize.add_recomputation : 0.000068s : 0.03% optimize.cse_after_recomputation.cse : 0.000016s : 0.01% optimize.environ_conv : 0.000011s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.00% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000007s : 0.00% optimize.comm_op_add_attrs : 0.000005s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000022s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000008s : 0.00% optimize.overlap_grad_flash_sp : 0.000029s : 0.01% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000005s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000028s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000607s : 0.25% validate : 0.000052s : 0.02% Time group info: ------[substitution.] 0.000285 38 12.24% : 0.000035s : 3: substitution.cast_eliminate 0.83% : 0.000002s : 3: substitution.elim_not_effective 0.89% : 0.000003s : 3: substitution.fold_const_symbol 2.72% : 0.000008s : 5: substitution.graph_param_transform 70.26% : 0.000201s : 4: substitution.inline 2.47% : 0.000007s : 6: substitution.j_node_and_user_rematch 2.76% : 0.000008s : 6: substitution.remove_not_recompute_node 2.65% : 0.000008s : 4: substitution.replace_old_param 5.18% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.240029 2 99.64% : 0.239156s : 1: type_inference.infer 0.36% : 0.000873s : 1: type_inference.specialize ------[replace.] 0.000073 8 63.98% : 0.000047s : 4: replace.inline 36.02% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000210 8 93.85% : 0.000197s : 4: match.inline 6.15% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000277 1504 0.84% : 0.000002s : 15: predicate.accumulaten_eliminater 0.69% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.61% : 0.000002s : 10: predicate.addn_check_dump 0.78% : 0.000002s : 15: predicate.addn_zero_filter 0.71% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.20% : 0.000006s : 25: predicate.arithmetic_simplify 1.17% : 0.000003s : 15: predicate.cast_eliminate 0.67% : 0.000002s : 10: predicate.check_bprop_eliminate 0.62% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.78% : 0.000002s : 10: predicate.depend_value_elim 0.95% : 0.000003s : 15: predicate.dict_get_item_const_eliminator 0.86% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.96% : 0.000003s : 15: predicate.dict_set_item_eliminator 0.91% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.39% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.07% : 0.000003s : 20: predicate.environ_get_add_eliminate 0.99% : 0.000003s : 20: predicate.environ_get_depend_swap 1.78% : 0.000005s : 30: predicate.environ_get_eliminate 1.18% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.16% : 0.000003s : 23: predicate.exchange_switch_depend_value 1.99% : 0.000006s : 23: predicate.float_depend_g_call 0.68% : 0.000002s : 10: predicate.float_environ_get_switch 0.88% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 5: predicate.fold_const_symbol 0.76% : 0.000002s : 10: predicate.get_grad_eliminate 0.25% : 0.000001s : 5: predicate.graph_param_transform 0.64% : 0.000002s : 10: predicate.incorporate_call 0.47% : 0.000001s : 10: predicate.incorporate_call_switch 6.17% : 0.000017s : 68: predicate.inline 1.11% : 0.000003s : 10: predicate.inline_without_move 0.28% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.21% : 0.000003s : 10: predicate.less_batch_normalization 1.88% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.28% : 0.000006s : 44: predicate.load_eliminater 1.11% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.15% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.92% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 0.67% : 0.000002s : 10: predicate.merge_addn 0.58% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 15: predicate.minmaximum_grad 1.68% : 0.000005s : 5: predicate.mutable_eliminate 0.45% : 0.000001s : 5: predicate.opt_reshape 0.47% : 0.000001s : 5: predicate.parallel_virtual_node 1.99% : 0.000006s : 23: predicate.partial_defer_inline 1.40% : 0.000004s : 24: predicate.partial_eliminate 0.82% : 0.000002s : 15: predicate.print_const_string_wrapper 0.67% : 0.000002s : 10: predicate.reduce_all_const_elim 1.13% : 0.000003s : 15: predicate.reduce_eliminate 2.35% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000001s : 10: predicate.remove_not_recompute_node 1.18% : 0.000003s : 29: predicate.replace_applicator 0.53% : 0.000001s : 10: predicate.replace_old_param 0.34% : 0.000001s : 5: predicate.reset_defer_inline 0.82% : 0.000002s : 15: predicate.reshape_eliminate 0.61% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 5: predicate.row_tensor_eliminate 0.95% : 0.000003s : 10: predicate.same_eliminate 0.38% : 0.000001s : 10: predicate.set_cell_output_no_recompute 1.28% : 0.000004s : 10: predicate.shard_identity_eliminate 0.76% : 0.000002s : 10: predicate.special_op_eliminate 0.76% : 0.000002s : 10: predicate.specialize_transform 1.14% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.28% : 0.000004s : 23: predicate.switch_defer_inline 1.84% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.29% : 0.000015s : 74: predicate.switch_simplify 0.99% : 0.000003s : 15: predicate.tile_eliminate 0.78% : 0.000002s : 15: predicate.transpose_eliminate 1.62% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.43% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.33% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.53% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.44% : 0.000007s : 35: predicate.tuple_list_set_item_eliminator 1.52% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.14% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 2.83% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 5: predicate.value_based_eliminate 0.91% : 0.000003s : 10: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000732 11 51.08% : 0.000374s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.92% : 0.000358s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.268133 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.49% : 0.003998s : 1: add_attr 1.48% : 0.003976s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.03% : 0.000072s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.03% : 0.000094s : 1: auto_monad 0.01% : 0.000038s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.18% : 0.000481s : 1: bootstrap 0.02% : 0.000050s : 1: cconv 0.00% : 0.000008s : 1: comm_op_add_attrs 0.01% : 0.000025s : 1: control_data_broadcast_order 0.01% : 0.000017s : 1: convert_after_rewriter 0.01% : 0.000037s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000024s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.01% : 0.000039s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000010s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.22% : 0.000599s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.35% : 0.000933s : 1: mutable_eliminate 0.00% : 0.000011s : 1: offloading_packed_experts 0.01% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000032s : 1: opt.transform.mutable_eliminate 0.59% : 0.001579s : 78: opt.transform.opt_a 0.01% : 0.000039s : 1: opt.transform.opt_after_cconv 0.01% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000152s : 28: opt.transform.opt_b 0.03% : 0.000068s : 2: opt.transform.opt_trans_graph 0.02% : 0.000056s : 4: opt.transform.symbol_engine_opt 1.61% : 0.004305s : 1: opt_a 0.06% : 0.000164s : 1: opt_after_cconv 0.23% : 0.000618s : 1: opt_after_jit_grad 0.14% : 0.000382s : 1: opt_b 2.96% : 0.007935s : 1: optimize 0.01% : 0.000033s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000014s : 1: order_py_execute_after_rewriter 0.01% : 0.000032s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000007s : 1: partial_unused_args_eliminate 0.00% : 0.000009s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.02% : 0.000053s : 1: pre_auto_parallel 0.02% : 0.000047s : 1: py_interpret_to_execute 0.01% : 0.000027s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000008s : 1: remove_cast_before_assign_add 0.01% : 0.000026s : 1: remove_dup_value 0.24% : 0.000648s : 1: renormalize.infer 0.19% : 0.000516s : 1: renormalize.specialize 0.00% : 0.000010s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000064s : 1: rewriter_after_opt_a 0.04% : 0.000116s : 1: rewriter_before_opt_a 0.00% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.00% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000131s : 1: symbol_engine_optimizer 0.04% : 0.000119s : 1: tuple_transform 89.57% : 0.240167s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:21.958.249 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.110576, [21] [bootstrap]: 0.00046329 [type_inference]: 0.0990043 [event_method]: 2.132e-05 [auto_monad]: 6.96e-05 [graph_reusing]: 7.28e-06 [inline]: 3.08e-06 [add_attr]: 0.00392581, [1] [add_attr_with_inline]: 0.00391282, [1] [Cycle 1]: 8.286e-05, [2] [tag_attr]: 3.057e-05 [meta_addattr_fg_expand]: 6.70002e-06 [parallel-infer-symbol]: 4.18001e-06 [pre_auto_parallel]: 4.374e-05 [insert-virtual-dataset]: 3.27002e-06 [parallel-infer-symbol-second]: 9.39996e-07 [dataset_repeat_opt]: 2.96001e-06 [pipeline_split]: 1.95001e-06 [optimize]: 0.00620685, [53] [py_interpret_to_execute]: 3.385e-05 [rewriter_before_opt_a]: 0.00010466 [opt_a]: 0.00365475, [2] [Cycle 1]: 0.00278045, [45] [expand_dump_flag]: 3.59002e-06 [switch_simplify]: 4.826e-05 [loop_unroll]: 3.221e-05 [a_1]: 0.00075522 [with_stream_mark]: 2.184e-05 [recompute_prepare]: 1.196e-05 [updatestate_depend_eliminate]: 4.95999e-06 [updatestate_assign_eliminate]: 4.03001e-06 [updatestate_loads_eliminate]: 3.88999e-06 [parameter_eliminate]: 1.95001e-06 [a_2]: 0.00010522 [accelerated_algorithm]: 9.33002e-06 [shard]: 1.98002e-06 [meta_shard_fg_expand]: 2.33998e-06 [shard_inline]: 8.50001e-06 [merge_send_recv]: 1.064e-05 [auto_parallel]: 8.39998e-06 [parallel]: 2.033e-05 [flash_sp]: 1.066e-05 [merge_comm]: 4.99e-06 [allreduce_fusion]: 4.47e-06 [matmul_add_comm_reduction]: 1.065e-05 [allreduce_slice_to_reducescatter]: 8.00006e-07 [virtual_shard_identity]: 1.071e-05 [virtual_dataset]: 9.13002e-06 [get_grad_eliminate_]: 8.45001e-06 [virtual_output]: 8.67e-06 [merge_forward]: 5.26002e-06 [cell_reuse_recompute_pass]: 1.13001e-06 [offload_activation]: 1.205e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.845e-05 [merge_recompute_call_nodes]: 1.57001e-06 [before_grad]: 1.498e-05 [set_forward_comm_id_for_comm_node_pass]: 4.99998e-06 [meta_fg_expand]: 3.75e-06 [flash_sp_send_recv_attached]: 3.17002e-06 [receive_attached]: 3.35e-06 [after_resolve]: 1.597e-05 [a_after_grad]: 1.328e-05 [renormalize]: 0.00114886 [add_forward_monad_depend]: 8.52998e-06 [auto_monad_grad]: 2.61999e-06 [auto_monad_eliminator]: 2.11e-05 [cse]: 4.159e-05 [a_3]: 6.851e-05 [Cycle 2]: 0.00086175, [45] [expand_dump_flag]: 2.22999e-06 [switch_simplify]: 1.022e-05 [loop_unroll]: 7.69002e-06 [a_1]: 0.00018755 [with_stream_mark]: 1.986e-05 [recompute_prepare]: 9.00999e-06 [updatestate_depend_eliminate]: 4.44002e-06 [updatestate_assign_eliminate]: 4.28999e-06 [updatestate_loads_eliminate]: 3.5e-06 [parameter_eliminate]: 1.49998e-06 [a_2]: 9.545e-05 [accelerated_algorithm]: 8.79e-06 [shard]: 1.49998e-06 [meta_shard_fg_expand]: 2.78e-06 [shard_inline]: 7.89002e-06 [merge_send_recv]: 2.006e-05 [auto_parallel]: 8.3e-06 [parallel]: 8.45001e-06 [flash_sp]: 3.9e-06 [merge_comm]: 5.83002e-06 [allreduce_fusion]: 4.13999e-06 [matmul_add_comm_reduction]: 1.006e-05 [allreduce_slice_to_reducescatter]: 5.8001e-07 [virtual_shard_identity]: 1.028e-05 [virtual_dataset]: 8.60001e-06 [get_grad_eliminate_]: 7.11999e-06 [virtual_output]: 7.56999e-06 [merge_forward]: 4.31002e-06 [cell_reuse_recompute_pass]: 2.89999e-06 [offload_activation]: 1.066e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.668e-05 [merge_recompute_call_nodes]: 1.36998e-06 [before_grad]: 1.384e-05 [set_forward_comm_id_for_comm_node_pass]: 4.92999e-06 [meta_fg_expand]: 3.66001e-06 [flash_sp_send_recv_attached]: 1.30999e-06 [receive_attached]: 2.11e-06 [after_resolve]: 1.48e-05 [a_after_grad]: 1.335e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.07999e-06 [auto_monad_grad]: 1.57001e-06 [auto_monad_eliminator]: 1.098e-05 [cse]: 2.316e-05 [a_3]: 4.778e-05 [py_interpret_to_execute_after_opt_a]: 1.708e-05 [slice_cell_reuse_recomputed_activation]: 2.37999e-06 [rewriter_after_opt_a]: 4.613e-05 [convert_after_rewriter]: 7.67998e-06 [order_py_execute_after_rewriter]: 5.67001e-06 [mutable_eliminate]: 0.00073821 [opt_b]: 0.00026766, [1] [Cycle 1]: 0.00025995, [7] [b_1]: 0.00016375 [b_2]: 1.066e-05 [updatestate_depend_eliminate]: 8.92999e-06 [updatestate_assign_eliminate]: 2.99999e-06 [updatestate_loads_eliminate]: 2.80002e-06 [renormalize]: 7.30011e-07 [cse]: 3.019e-05 [optimize_parallel_all_gather_comm]: 2.05e-05 [overlap_param_gather]: 2.09e-06 [cconv]: 3.445e-05 [loop_unroll]: 0.00049283 [opt_after_cconv]: 0.00012684, [1] [Cycle 1]: 0.00012032, [7] [c_1]: 3.96e-05 [parameter_eliminate]: 4.28999e-06 [updatestate_depend_eliminate]: 6.82002e-06 [updatestate_assign_eliminate]: 3.46001e-06 [updatestate_loads_eliminate]: 3.01001e-06 [cse]: 2.587e-05 [renormalize]: 5.50004e-07 [remove_dup_value]: 1.811e-05 [tuple_transform]: 9.222e-05, [1] [Cycle 1]: 8.725e-05, [4] [d_1]: 5.677e-05 [none_parameter_eliminate]: 1.92999e-06 [renormalize]: 4.09986e-07 [switch_simplify]: 8.56002e-06 [partial_unused_args_eliminate]: 2.00002e-06 [add_recomputation]: 6.544e-05 [cse_after_recomputation]: 2.832e-05, [1] [Cycle 1]: 2.28e-05, [1] [cse]: 1.652e-05 [environ_conv]: 7.05e-06 [swap_dp_allreduce_reducescatter]: 6.36998e-06 [bias_add_comm_swap]: 3.4e-06 [label_micro_interleaved_index]: 4.80001e-06 [label_fine_grained_interleaved_index]: 3.31001e-06 [merge_cast_opt]: 1.47999e-06 [slice_recompute_activation]: 2.12001e-06 [micro_interleaved_order_control]: 2.98e-06 [assign_add_opt]: 1.42e-06 [ForceFp32Comm]: 8.80013e-07 [remove_cast_before_assign_add]: 1.39003e-06 [full_micro_interleaved_order_control]: 2.31e-06 [reorder_send_recv_between_fp_bp]: 2.81e-06 [comm_op_add_attrs]: 1.04998e-06 [add_comm_op_reuse_tag]: 1.45999e-06 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.07998e-06 [overlap_opt_shard_in_pipeline]: 1.20999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.79e-06 [control_data_broadcast_order]: 1.567e-05 [grouped_pairwise_exchange_alltoall]: 2.02001e-06 [offloading_packed_experts]: 4.73001e-06 [overlap_recompute_and_grad_model_parallel]: 5.25001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.38002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39998e-06 [overlap_recompute_comm]: 2.46e-06 [overlap_grad_ring_attention]: 4.84003e-06 [overlap_grad_flash_sp]: 2.548e-05 [begin_end_overlap_inline]: 6.30011e-07 [split_matmul_comm_elemetwise]: 2.77002e-06 [split_layernorm_comm]: 1.86998e-06 [handle_group_info]: 1.59e-06 [symbol_engine_optimizer]: 9.231e-05, [1] [Cycle 1]: 8.729e-05, [6] [build]: 4.28001e-06 [elim_shapecalc]: 1.357e-05 [elim_not_effective]: 1.639e-05 [opt_reshape]: 8.80999e-06 [fold_const_symbol]: 1.284e-05 [renormalize]: 3.59985e-07 [detach_backward]: 2.31e-06 [pipeline_parallel_scheduler]: 1.54998e-06 [auto_monad_reorder]: 2.227e-05 [get_jit_bprop_graph]: 2.30002e-06 [rewriter_after_jit_bprop_graph]: 5.04e-06 [opt_after_jit_grad]: 0.00055819 [validate]: 5.032e-05 Sums bootstrap : 0.000463s : 0.44% type_inference : 0.099004s : 93.75% event_method : 0.000021s : 0.02% auto_monad : 0.000070s : 0.07% graph_reusing : 0.000007s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000031s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000044s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000034s : 0.03% optimize.rewriter_before_opt_a : 0.000105s : 0.10% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000058s : 0.06% optimize.opt_a.loop_unroll : 0.000040s : 0.04% optimize.opt_a.a_1 : 0.000943s : 0.89% optimize.opt_a.with_stream_mark : 0.000042s : 0.04% optimize.opt_a.recompute_prepare : 0.000021s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000201s : 0.19% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000016s : 0.02% optimize.opt_a.merge_send_recv : 0.000031s : 0.03% optimize.opt_a.auto_parallel : 0.000017s : 0.02% optimize.opt_a.parallel : 0.000029s : 0.03% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000011s : 0.01% optimize.opt_a.allreduce_fusion : 0.000009s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.02% optimize.opt_a.virtual_dataset : 0.000018s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.01% optimize.opt_a.virtual_output : 0.000016s : 0.02% optimize.opt_a.merge_forward : 0.000010s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000029s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000031s : 0.03% optimize.opt_a.a_after_grad : 0.000027s : 0.03% optimize.opt_a.renormalize : 0.001149s : 1.09% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.03% optimize.opt_a.cse : 0.000065s : 0.06% optimize.opt_a.a_3 : 0.000116s : 0.11% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000046s : 0.04% optimize.convert_after_rewriter : 0.000008s : 0.01% optimize.order_py_execute_after_rewriter : 0.000006s : 0.01% optimize.mutable_eliminate : 0.000738s : 0.70% optimize.opt_b.b_1 : 0.000164s : 0.16% optimize.opt_b.b_2 : 0.000011s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000034s : 0.03% optimize.loop_unroll : 0.000493s : 0.47% optimize.opt_after_cconv.c_1 : 0.000040s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000026s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.02% optimize.tuple_transform.d_1 : 0.000057s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000065s : 0.06% optimize.cse_after_recomputation.cse : 0.000017s : 0.02% optimize.environ_conv : 0.000007s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000016s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000025s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000002s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000558s : 0.53% validate : 0.000050s : 0.05% Time group info: ------[substitution.] 0.000272 38 10.95% : 0.000030s : 3: substitution.cast_eliminate 0.88% : 0.000002s : 3: substitution.elim_not_effective 0.75% : 0.000002s : 3: substitution.fold_const_symbol 2.93% : 0.000008s : 5: substitution.graph_param_transform 72.06% : 0.000196s : 4: substitution.inline 2.33% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.69% : 0.000007s : 6: substitution.remove_not_recompute_node 2.24% : 0.000006s : 4: substitution.replace_old_param 5.17% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.098931 2 99.12% : 0.098064s : 1: type_inference.infer 0.88% : 0.000867s : 1: type_inference.specialize ------[replace.] 0.000071 8 63.77% : 0.000045s : 4: replace.inline 36.23% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000205 8 94.06% : 0.000193s : 4: match.inline 5.94% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000257 1504 0.97% : 0.000002s : 15: predicate.accumulaten_eliminater 0.97% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 10: predicate.addn_check_dump 1.10% : 0.000003s : 15: predicate.addn_zero_filter 0.81% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.15% : 0.000006s : 25: predicate.arithmetic_simplify 1.05% : 0.000003s : 15: predicate.cast_eliminate 0.60% : 0.000002s : 10: predicate.check_bprop_eliminate 0.58% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.64% : 0.000002s : 10: predicate.depend_value_elim 0.83% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.09% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.20% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 0.40% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.04% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.04% : 0.000003s : 20: predicate.environ_get_depend_swap 1.72% : 0.000004s : 30: predicate.environ_get_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.30% : 0.000006s : 23: predicate.float_depend_g_call 0.65% : 0.000002s : 10: predicate.float_environ_get_switch 0.83% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.61% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.70% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.06% : 0.000016s : 68: predicate.inline 0.89% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.05% : 0.000003s : 10: predicate.less_batch_normalization 1.64% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.45% : 0.000006s : 44: predicate.load_eliminater 0.98% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.01% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.58% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.64% : 0.000002s : 10: predicate.merge_addn 0.59% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.63% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 15: predicate.minmaximum_grad 1.44% : 0.000004s : 5: predicate.mutable_eliminate 0.35% : 0.000001s : 5: predicate.opt_reshape 0.39% : 0.000001s : 5: predicate.parallel_virtual_node 1.66% : 0.000004s : 23: predicate.partial_defer_inline 1.60% : 0.000004s : 24: predicate.partial_eliminate 0.87% : 0.000002s : 15: predicate.print_const_string_wrapper 0.54% : 0.000001s : 10: predicate.reduce_all_const_elim 1.24% : 0.000003s : 15: predicate.reduce_eliminate 2.37% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 10: predicate.remove_not_recompute_node 1.44% : 0.000004s : 29: predicate.replace_applicator 0.58% : 0.000001s : 10: predicate.replace_old_param 0.46% : 0.000001s : 5: predicate.reset_defer_inline 0.85% : 0.000002s : 15: predicate.reshape_eliminate 0.62% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 5: predicate.row_tensor_eliminate 0.98% : 0.000003s : 10: predicate.same_eliminate 0.42% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.99% : 0.000003s : 10: predicate.shard_identity_eliminate 0.82% : 0.000002s : 10: predicate.special_op_eliminate 0.74% : 0.000002s : 10: predicate.specialize_transform 0.99% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.95% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.42% : 0.000004s : 23: predicate.switch_defer_inline 1.98% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.78% : 0.000012s : 74: predicate.switch_simplify 0.92% : 0.000002s : 15: predicate.tile_eliminate 0.83% : 0.000002s : 15: predicate.transpose_eliminate 1.52% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.14% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.41% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.42% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.70% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.31% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 2.99% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.48% : 0.000001s : 5: predicate.value_based_eliminate 0.99% : 0.000003s : 10: predicate.virtual_dataset_eliminate 0.72% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000713 11 52.44% : 0.000374s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.56% : 0.000339s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.123561 192 0.00% : 0.000004s : 1: ForceFp32Comm 3.18% : 0.003932s : 1: add_attr 3.17% : 0.003918s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.06% : 0.000069s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.06% : 0.000076s : 1: auto_monad 0.02% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.40% : 0.000499s : 1: bootstrap 0.03% : 0.000038s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000019s : 1: control_data_broadcast_order 0.01% : 0.000011s : 1: convert_after_rewriter 0.03% : 0.000031s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.02% : 0.000029s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.41% : 0.000501s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.61% : 0.000748s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000022s : 1: opt.transform.mutable_eliminate 1.21% : 0.001501s : 78: opt.transform.opt_a 0.03% : 0.000038s : 1: opt.transform.opt_after_cconv 0.03% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.11% : 0.000139s : 28: opt.transform.opt_b 0.05% : 0.000063s : 2: opt.transform.opt_trans_graph 0.04% : 0.000047s : 4: opt.transform.symbol_engine_opt 2.96% : 0.003658s : 1: opt_a 0.11% : 0.000131s : 1: opt_after_cconv 0.46% : 0.000568s : 1: opt_after_jit_grad 0.22% : 0.000272s : 1: opt_b 5.03% : 0.006213s : 1: optimize 0.02% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.02% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.04% : 0.000048s : 1: pre_auto_parallel 0.03% : 0.000038s : 1: py_interpret_to_execute 0.02% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.02% : 0.000022s : 1: remove_dup_value 0.49% : 0.000605s : 1: renormalize.infer 0.43% : 0.000533s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000051s : 1: rewriter_after_opt_a 0.09% : 0.000109s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.08% : 0.000095s : 1: symbol_engine_optimizer 0.08% : 0.000096s : 1: tuple_transform 80.14% : 0.099027s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:24.240.715 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:24.240.990 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.199464, [21] [bootstrap]: 0.00047915 [type_inference]: 0.186675 [event_method]: 1.974e-05 [auto_monad]: 6.864e-05 [graph_reusing]: 6.43e-06 [inline]: 2.71e-06 [add_attr]: 0.00373821, [1] [add_attr_with_inline]: 0.00372585, [1] [Cycle 1]: 9.156e-05, [2] [tag_attr]: 2.685e-05 [meta_addattr_fg_expand]: 6.59001e-06 [parallel-infer-symbol]: 4.23001e-06 [pre_auto_parallel]: 3.762e-05 [insert-virtual-dataset]: 2.46e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 1.97999e-06 [pipeline_split]: 1.90001e-06 [optimize]: 0.00697067, [53] [py_interpret_to_execute]: 3.287e-05 [rewriter_before_opt_a]: 0.00010259 [opt_a]: 0.00401518, [2] [Cycle 1]: 0.00296284, [45] [expand_dump_flag]: 3.02002e-06 [switch_simplify]: 4.438e-05 [loop_unroll]: 3.1e-05 [a_1]: 0.00079857 [with_stream_mark]: 2.152e-05 [recompute_prepare]: 1.301e-05 [updatestate_depend_eliminate]: 5.87001e-06 [updatestate_assign_eliminate]: 4.01001e-06 [updatestate_loads_eliminate]: 3.94002e-06 [parameter_eliminate]: 2.22999e-06 [a_2]: 0.00014125 [accelerated_algorithm]: 9.66998e-06 [shard]: 2.24001e-06 [meta_shard_fg_expand]: 2.78e-06 [shard_inline]: 9.15001e-06 [merge_send_recv]: 1.175e-05 [auto_parallel]: 9.55001e-06 [parallel]: 2.063e-05 [flash_sp]: 1.062e-05 [merge_comm]: 4.94e-06 [allreduce_fusion]: 4.63001e-06 [matmul_add_comm_reduction]: 1.231e-05 [allreduce_slice_to_reducescatter]: 1.28002e-06 [virtual_shard_identity]: 1.172e-05 [virtual_dataset]: 8.90999e-06 [get_grad_eliminate_]: 8.31002e-06 [virtual_output]: 8.56002e-06 [merge_forward]: 5.56998e-06 [cell_reuse_recompute_pass]: 1.80001e-06 [offload_activation]: 1.263e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.056e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.487e-05 [set_forward_comm_id_for_comm_node_pass]: 5.71e-06 [meta_fg_expand]: 3.78999e-06 [flash_sp_send_recv_attached]: 3.04999e-06 [receive_attached]: 2.58e-06 [after_resolve]: 1.671e-05 [a_after_grad]: 1.358e-05 [renormalize]: 0.0010665 [add_forward_monad_depend]: 7.63999e-06 [auto_monad_grad]: 2.66e-06 [auto_monad_eliminator]: 2.176e-05 [cse]: 4.361e-05 [a_3]: 8.278e-05 [Cycle 2]: 0.00103423, [45] [expand_dump_flag]: 2.34001e-06 [switch_simplify]: 1.021e-05 [loop_unroll]: 7.94002e-06 [a_1]: 0.00018831 [with_stream_mark]: 1.898e-05 [recompute_prepare]: 9.09998e-06 [updatestate_depend_eliminate]: 4.27e-06 [updatestate_assign_eliminate]: 3.63e-06 [updatestate_loads_eliminate]: 3.07002e-06 [parameter_eliminate]: 1.80001e-06 [a_2]: 0.00013351 [accelerated_algorithm]: 8.24998e-06 [shard]: 2.04999e-06 [meta_shard_fg_expand]: 1.92001e-06 [shard_inline]: 7.76001e-06 [merge_send_recv]: 8.15e-06 [auto_parallel]: 1.069e-05 [parallel]: 7.55e-06 [flash_sp]: 4.87e-06 [merge_comm]: 5.62001e-06 [allreduce_fusion]: 4.06001e-06 [matmul_add_comm_reduction]: 9.07001e-06 [allreduce_slice_to_reducescatter]: 5.3001e-07 [virtual_shard_identity]: 9.25001e-06 [virtual_dataset]: 9.22001e-06 [get_grad_eliminate_]: 8.52e-06 [virtual_output]: 7.65e-06 [merge_forward]: 5.10001e-06 [cell_reuse_recompute_pass]: 2.99001e-06 [offload_activation]: 1.196e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.842e-05 [merge_recompute_call_nodes]: 9.5999e-07 [before_grad]: 1.381e-05 [set_forward_comm_id_for_comm_node_pass]: 4.63001e-06 [meta_fg_expand]: 3.38e-06 [flash_sp_send_recv_attached]: 1.81e-06 [receive_attached]: 2.27999e-06 [after_resolve]: 1.447e-05 [a_after_grad]: 1.248e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.75001e-06 [auto_monad_grad]: 1.83002e-06 [auto_monad_eliminator]: 1.125e-05 [cse]: 2.278e-05 [a_3]: 6.138e-05 [py_interpret_to_execute_after_opt_a]: 1.919e-05 [slice_cell_reuse_recomputed_activation]: 5.99e-06 [rewriter_after_opt_a]: 5.312e-05 [convert_after_rewriter]: 1.161e-05 [order_py_execute_after_rewriter]: 9.08002e-06 [mutable_eliminate]: 0.00077528 [opt_b]: 0.00033527, [1] [Cycle 1]: 0.00032451, [7] [b_1]: 0.0002086 [b_2]: 1.036e-05 [updatestate_depend_eliminate]: 8.05e-06 [updatestate_assign_eliminate]: 3.76001e-06 [updatestate_loads_eliminate]: 3.51999e-06 [renormalize]: 9.5999e-07 [cse]: 2.96e-05 [optimize_parallel_all_gather_comm]: 2.449e-05 [overlap_param_gather]: 4.87998e-06 [cconv]: 3.729e-05 [loop_unroll]: 0.00056337 [opt_after_cconv]: 0.00015664, [1] [Cycle 1]: 0.00014634, [7] [c_1]: 4.014e-05 [parameter_eliminate]: 5.30001e-06 [updatestate_depend_eliminate]: 7.56001e-06 [updatestate_assign_eliminate]: 3.33e-06 [updatestate_loads_eliminate]: 3.17997e-06 [cse]: 2.748e-05 [renormalize]: 6.00005e-07 [remove_dup_value]: 2.191e-05 [tuple_transform]: 0.00011336, [1] [Cycle 1]: 0.0001054, [4] [d_1]: 6.021e-05 [none_parameter_eliminate]: 1.71002e-06 [renormalize]: 3.69997e-07 [switch_simplify]: 9.77999e-06 [partial_unused_args_eliminate]: 5.04998e-06 [add_recomputation]: 7.76e-05 [cse_after_recomputation]: 3.879e-05, [1] [Cycle 1]: 2.995e-05, [1] [cse]: 1.915e-05 [environ_conv]: 1.151e-05 [swap_dp_allreduce_reducescatter]: 9.87001e-06 [bias_add_comm_swap]: 6.04001e-06 [label_micro_interleaved_index]: 8.57998e-06 [label_fine_grained_interleaved_index]: 5.77001e-06 [merge_cast_opt]: 4.63001e-06 [slice_recompute_activation]: 4.95999e-06 [micro_interleaved_order_control]: 5.67999e-06 [assign_add_opt]: 4.02e-06 [ForceFp32Comm]: 3.4e-06 [remove_cast_before_assign_add]: 3.91999e-06 [full_micro_interleaved_order_control]: 5.10001e-06 [reorder_send_recv_between_fp_bp]: 6.41e-06 [comm_op_add_attrs]: 4.01001e-06 [add_comm_op_reuse_tag]: 3.63e-06 [interleave_split_concat_branches]: 3.61001e-06 [interleave_parallel_branches]: 3.46001e-06 [overlap_opt_shard_in_pipeline]: 3.95e-06 [overlap_opt_shard_grad_in_pipeline]: 4.53999e-06 [control_data_broadcast_order]: 2.247e-05 [grouped_pairwise_exchange_alltoall]: 4.16001e-06 [offloading_packed_experts]: 7.82002e-06 [overlap_recompute_and_grad_model_parallel]: 8.64e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.03001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.86999e-06 [overlap_recompute_comm]: 5.64998e-06 [overlap_grad_ring_attention]: 7.55998e-06 [overlap_grad_flash_sp]: 3.047e-05 [begin_end_overlap_inline]: 3.03998e-06 [split_matmul_comm_elemetwise]: 4.57e-06 [split_layernorm_comm]: 4.21001e-06 [handle_group_info]: 3.3e-06 [symbol_engine_optimizer]: 0.00012567, [1] [Cycle 1]: 0.00011734, [6] [build]: 4.20999e-06 [elim_shapecalc]: 1.435e-05 [elim_not_effective]: 1.878e-05 [opt_reshape]: 1.037e-05 [fold_const_symbol]: 1.388e-05 [renormalize]: 4.09986e-07 [detach_backward]: 4.31002e-06 [pipeline_parallel_scheduler]: 2.48e-06 [auto_monad_reorder]: 2.982e-05 [get_jit_bprop_graph]: 1.94999e-06 [rewriter_after_jit_bprop_graph]: 6.69001e-06 [opt_after_jit_grad]: 0.0006924 [validate]: 5.643e-05 Sums bootstrap : 0.000479s : 0.25% type_inference : 0.186675s : 96.32% event_method : 0.000020s : 0.01% auto_monad : 0.000069s : 0.04% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000038s : 0.02% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.02% optimize.rewriter_before_opt_a : 0.000103s : 0.05% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000055s : 0.03% optimize.opt_a.loop_unroll : 0.000039s : 0.02% optimize.opt_a.a_1 : 0.000987s : 0.51% optimize.opt_a.with_stream_mark : 0.000041s : 0.02% optimize.opt_a.recompute_prepare : 0.000022s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000275s : 0.14% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000017s : 0.01% optimize.opt_a.merge_send_recv : 0.000020s : 0.01% optimize.opt_a.auto_parallel : 0.000020s : 0.01% optimize.opt_a.parallel : 0.000028s : 0.01% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000011s : 0.01% optimize.opt_a.allreduce_fusion : 0.000009s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.01% optimize.opt_a.virtual_dataset : 0.000018s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.01% optimize.opt_a.virtual_output : 0.000016s : 0.01% optimize.opt_a.merge_forward : 0.000011s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000025s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000029s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000031s : 0.02% optimize.opt_a.a_after_grad : 0.000026s : 0.01% optimize.opt_a.renormalize : 0.001067s : 0.55% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.02% optimize.opt_a.cse : 0.000066s : 0.03% optimize.opt_a.a_3 : 0.000144s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.00% optimize.rewriter_after_opt_a : 0.000053s : 0.03% optimize.convert_after_rewriter : 0.000012s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.00% optimize.mutable_eliminate : 0.000775s : 0.40% optimize.opt_b.b_1 : 0.000209s : 0.11% optimize.opt_b.b_2 : 0.000010s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.01% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000037s : 0.02% optimize.loop_unroll : 0.000563s : 0.29% optimize.opt_after_cconv.c_1 : 0.000040s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000027s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000022s : 0.01% optimize.tuple_transform.d_1 : 0.000060s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000078s : 0.04% optimize.cse_after_recomputation.cse : 0.000019s : 0.01% optimize.environ_conv : 0.000012s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000009s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.00% optimize.merge_cast_opt : 0.000005s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000006s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000003s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000022s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000006s : 0.00% optimize.overlap_grad_ring_attention : 0.000008s : 0.00% optimize.overlap_grad_flash_sp : 0.000030s : 0.02% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000030s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000692s : 0.36% validate : 0.000056s : 0.03% Time group info: ------[substitution.] 0.000308 38 9.08% : 0.000028s : 3: substitution.cast_eliminate 0.80% : 0.000002s : 3: substitution.elim_not_effective 0.61% : 0.000002s : 3: substitution.fold_const_symbol 2.69% : 0.000008s : 5: substitution.graph_param_transform 75.88% : 0.000234s : 4: substitution.inline 1.85% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.28% : 0.000007s : 6: substitution.remove_not_recompute_node 2.35% : 0.000007s : 4: substitution.replace_old_param 4.48% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.186616 2 99.57% : 0.185813s : 1: type_inference.infer 0.43% : 0.000803s : 1: type_inference.specialize ------[replace.] 0.000073 8 62.81% : 0.000046s : 4: replace.inline 37.19% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000243 8 95.07% : 0.000231s : 4: match.inline 4.93% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000265 1504 0.88% : 0.000002s : 15: predicate.accumulaten_eliminater 0.90% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.58% : 0.000002s : 10: predicate.addn_check_dump 0.87% : 0.000002s : 15: predicate.addn_zero_filter 0.86% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.14% : 0.000006s : 25: predicate.arithmetic_simplify 1.08% : 0.000003s : 15: predicate.cast_eliminate 0.60% : 0.000002s : 10: predicate.check_bprop_eliminate 0.74% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.68% : 0.000002s : 10: predicate.depend_value_elim 0.86% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.91% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.28% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.47% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.18% : 0.000003s : 20: predicate.environ_get_add_eliminate 0.97% : 0.000003s : 20: predicate.environ_get_depend_swap 1.89% : 0.000005s : 30: predicate.environ_get_eliminate 1.03% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.26% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.22% : 0.000006s : 23: predicate.float_depend_g_call 0.51% : 0.000001s : 10: predicate.float_environ_get_switch 0.91% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 5: predicate.fold_const_symbol 0.71% : 0.000002s : 10: predicate.get_grad_eliminate 0.26% : 0.000001s : 5: predicate.graph_param_transform 0.60% : 0.000002s : 10: predicate.incorporate_call 0.51% : 0.000001s : 10: predicate.incorporate_call_switch 6.33% : 0.000017s : 68: predicate.inline 0.86% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.98% : 0.000003s : 10: predicate.less_batch_normalization 1.90% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.23% : 0.000006s : 44: predicate.load_eliminater 1.29% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.99% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.75% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 0.57% : 0.000002s : 10: predicate.merge_addn 0.59% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.75% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 15: predicate.minmaximum_grad 1.35% : 0.000004s : 5: predicate.mutable_eliminate 0.75% : 0.000002s : 5: predicate.opt_reshape 0.41% : 0.000001s : 5: predicate.parallel_virtual_node 1.63% : 0.000004s : 23: predicate.partial_defer_inline 1.45% : 0.000004s : 24: predicate.partial_eliminate 0.90% : 0.000002s : 15: predicate.print_const_string_wrapper 0.68% : 0.000002s : 10: predicate.reduce_all_const_elim 1.28% : 0.000003s : 15: predicate.reduce_eliminate 2.30% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 10: predicate.remove_not_recompute_node 1.25% : 0.000003s : 29: predicate.replace_applicator 0.55% : 0.000001s : 10: predicate.replace_old_param 0.31% : 0.000001s : 5: predicate.reset_defer_inline 0.84% : 0.000002s : 15: predicate.reshape_eliminate 0.64% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 5: predicate.row_tensor_eliminate 0.98% : 0.000003s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.79% : 0.000002s : 10: predicate.shard_identity_eliminate 0.78% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 1.02% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 1.00% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.30% : 0.000003s : 23: predicate.switch_defer_inline 1.84% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.69% : 0.000012s : 74: predicate.switch_simplify 0.85% : 0.000002s : 15: predicate.tile_eliminate 0.80% : 0.000002s : 15: predicate.transpose_eliminate 1.52% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.13% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.58% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.29% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.71% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.43% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.00% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.53% : 0.000001s : 5: predicate.value_based_eliminate 0.65% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.73% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000664 11 55.09% : 0.000366s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.91% : 0.000298s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.213011 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.76% : 0.003750s : 1: add_attr 1.75% : 0.003730s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.04% : 0.000082s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.04% : 0.000079s : 1: auto_monad 0.02% : 0.000038s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.25% : 0.000527s : 1: bootstrap 0.02% : 0.000041s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000025s : 1: control_data_broadcast_order 0.01% : 0.000015s : 1: convert_after_rewriter 0.02% : 0.000042s : 1: cse_after_recomputation 0.00% : 0.000007s : 1: dataset_repeat_opt 0.01% : 0.000025s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.01% : 0.000030s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000010s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 0.27% : 0.000571s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000009s : 1: micro_interleaved_order_control 0.37% : 0.000783s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.01% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000022s : 1: opt.transform.mutable_eliminate 0.73% : 0.001561s : 78: opt.transform.opt_a 0.02% : 0.000039s : 1: opt.transform.opt_after_cconv 0.02% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000142s : 28: opt.transform.opt_b 0.03% : 0.000067s : 2: opt.transform.opt_trans_graph 0.02% : 0.000053s : 4: opt.transform.symbol_engine_opt 1.89% : 0.004019s : 1: opt_a 0.08% : 0.000160s : 1: opt_after_cconv 0.33% : 0.000704s : 1: opt_after_jit_grad 0.16% : 0.000339s : 1: opt_b 3.45% : 0.007355s : 1: optimize 0.01% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.02% : 0.000033s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.00% : 0.000009s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000046s : 1: pre_auto_parallel 0.02% : 0.000037s : 1: py_interpret_to_execute 0.01% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000008s : 1: remove_cast_before_assign_add 0.01% : 0.000025s : 1: remove_dup_value 0.28% : 0.000597s : 1: renormalize.infer 0.21% : 0.000457s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000057s : 1: rewriter_after_opt_a 0.05% : 0.000106s : 1: rewriter_before_opt_a 0.00% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.06% : 0.000129s : 1: symbol_engine_optimizer 0.05% : 0.000116s : 1: tuple_transform 87.66% : 0.186720s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:27.788.20 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0307176, [21] [bootstrap]: 0.00045604 [type_inference]: 0.00666308 [event_method]: 2.269e-05 [auto_monad]: 7.259e-05 [graph_reusing]: 6.96999e-06 [inline]: 3.77002e-06 [add_attr]: 0.00377649, [1] [add_attr_with_inline]: 0.0037606, [1] [Cycle 1]: 8.654e-05, [2] [tag_attr]: 3.096e-05 [meta_addattr_fg_expand]: 6.86999e-06 [parallel-infer-symbol]: 3.68e-06 [pre_auto_parallel]: 4.569e-05 [insert-virtual-dataset]: 3.28e-06 [parallel-infer-symbol-second]: 8.39995e-07 [dataset_repeat_opt]: 2.55997e-06 [pipeline_split]: 1.89e-06 [optimize]: 0.00670828, [53] [py_interpret_to_execute]: 3.792e-05 [rewriter_before_opt_a]: 0.00010851 [opt_a]: 0.00391423, [2] [Cycle 1]: 0.00295111, [45] [expand_dump_flag]: 3.63e-06 [switch_simplify]: 4.684e-05 [loop_unroll]: 3.215e-05 [a_1]: 0.0007907 [with_stream_mark]: 2.193e-05 [recompute_prepare]: 1.297e-05 [updatestate_depend_eliminate]: 5.52999e-06 [updatestate_assign_eliminate]: 4.12e-06 [updatestate_loads_eliminate]: 3.65998e-06 [parameter_eliminate]: 2.19999e-06 [a_2]: 0.00010992 [accelerated_algorithm]: 9.96e-06 [shard]: 2.16998e-06 [meta_shard_fg_expand]: 3.13e-06 [shard_inline]: 9.22999e-06 [merge_send_recv]: 1.09e-05 [auto_parallel]: 8.30999e-06 [parallel]: 2.41e-05 [flash_sp]: 1.141e-05 [merge_comm]: 5.51e-06 [allreduce_fusion]: 4.64002e-06 [matmul_add_comm_reduction]: 1.302e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 1.124e-05 [virtual_dataset]: 8.84e-06 [get_grad_eliminate_]: 8.03999e-06 [virtual_output]: 8.06001e-06 [merge_forward]: 5.05999e-06 [cell_reuse_recompute_pass]: 1.57001e-06 [offload_activation]: 1.396e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.714e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.501e-05 [set_forward_comm_id_for_comm_node_pass]: 5.42999e-06 [meta_fg_expand]: 4.07e-06 [flash_sp_send_recv_attached]: 3.04999e-06 [receive_attached]: 2.31998e-06 [after_resolve]: 1.897e-05 [a_after_grad]: 1.413e-05 [renormalize]: 0.00122063 [add_forward_monad_depend]: 9.09e-06 [auto_monad_grad]: 2.70002e-06 [auto_monad_eliminator]: 2.666e-05 [cse]: 4.534e-05 [a_3]: 7.125e-05 [Cycle 2]: 0.00094834, [45] [expand_dump_flag]: 2.96999e-06 [switch_simplify]: 1.176e-05 [loop_unroll]: 8.15e-06 [a_1]: 0.00020205 [with_stream_mark]: 2.361e-05 [recompute_prepare]: 9.96e-06 [updatestate_depend_eliminate]: 4.99e-06 [updatestate_assign_eliminate]: 3.97998e-06 [updatestate_loads_eliminate]: 3.63999e-06 [parameter_eliminate]: 1.72999e-06 [a_2]: 0.00011247 [accelerated_algorithm]: 1.001e-05 [shard]: 3.09001e-06 [meta_shard_fg_expand]: 3.09999e-06 [shard_inline]: 8.64e-06 [merge_send_recv]: 9.92001e-06 [auto_parallel]: 1.259e-05 [parallel]: 7.87e-06 [flash_sp]: 4.38001e-06 [merge_comm]: 4.53001e-06 [allreduce_fusion]: 4.43999e-06 [matmul_add_comm_reduction]: 1.006e-05 [allreduce_slice_to_reducescatter]: 9.00007e-07 [virtual_shard_identity]: 9.86e-06 [virtual_dataset]: 7.92998e-06 [get_grad_eliminate_]: 8.89e-06 [virtual_output]: 7.63001e-06 [merge_forward]: 6.49999e-06 [cell_reuse_recompute_pass]: 2.93998e-06 [offload_activation]: 1.397e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.945e-05 [merge_recompute_call_nodes]: 1.66002e-06 [before_grad]: 1.604e-05 [set_forward_comm_id_for_comm_node_pass]: 5.43002e-06 [meta_fg_expand]: 3.70998e-06 [flash_sp_send_recv_attached]: 2.06e-06 [receive_attached]: 2.76e-06 [after_resolve]: 1.703e-05 [a_after_grad]: 1.399e-05 [renormalize]: 1.19995e-07 [add_forward_monad_depend]: 3.10998e-06 [auto_monad_grad]: 1.92999e-06 [auto_monad_eliminator]: 1.51e-05 [cse]: 2.812e-05 [a_3]: 5.136e-05 [py_interpret_to_execute_after_opt_a]: 2.322e-05 [slice_cell_reuse_recomputed_activation]: 2.56e-06 [rewriter_after_opt_a]: 5.229e-05 [convert_after_rewriter]: 9.07001e-06 [order_py_execute_after_rewriter]: 6.48e-06 [mutable_eliminate]: 0.00080308 [opt_b]: 0.00028699, [1] [Cycle 1]: 0.00027757, [7] [b_1]: 0.00016709 [b_2]: 1.075e-05 [updatestate_depend_eliminate]: 1.309e-05 [updatestate_assign_eliminate]: 3.91999e-06 [updatestate_loads_eliminate]: 3.08998e-06 [renormalize]: 7.2e-07 [cse]: 3.947e-05 [optimize_parallel_all_gather_comm]: 2.586e-05 [overlap_param_gather]: 2.31e-06 [cconv]: 4.202e-05 [loop_unroll]: 0.00053302 [opt_after_cconv]: 0.00013365, [1] [Cycle 1]: 0.00012743, [7] [c_1]: 4.072e-05 [parameter_eliminate]: 5.49e-06 [updatestate_depend_eliminate]: 8.12e-06 [updatestate_assign_eliminate]: 3.6e-06 [updatestate_loads_eliminate]: 2.99999e-06 [cse]: 2.995e-05 [renormalize]: 5.59987e-07 [remove_dup_value]: 1.95e-05 [tuple_transform]: 9.805e-05, [1] [Cycle 1]: 9.29e-05, [4] [d_1]: 6.146e-05 [none_parameter_eliminate]: 1.71e-06 [renormalize]: 3.00002e-07 [switch_simplify]: 9.15999e-06 [partial_unused_args_eliminate]: 1.83997e-06 [add_recomputation]: 7.087e-05 [cse_after_recomputation]: 2.868e-05, [1] [Cycle 1]: 2.32e-05, [1] [cse]: 1.705e-05 [environ_conv]: 7.87998e-06 [swap_dp_allreduce_reducescatter]: 6.28e-06 [bias_add_comm_swap]: 3.66999e-06 [label_micro_interleaved_index]: 5.82999e-06 [label_fine_grained_interleaved_index]: 2.94001e-06 [merge_cast_opt]: 1.40999e-06 [slice_recompute_activation]: 2.16e-06 [micro_interleaved_order_control]: 2.48e-06 [assign_add_opt]: 1.24e-06 [ForceFp32Comm]: 1.10001e-06 [remove_cast_before_assign_add]: 1.10001e-06 [full_micro_interleaved_order_control]: 2.46e-06 [reorder_send_recv_between_fp_bp]: 3.18998e-06 [comm_op_add_attrs]: 1.65001e-06 [add_comm_op_reuse_tag]: 1.07998e-06 [interleave_split_concat_branches]: 1.24003e-06 [interleave_parallel_branches]: 1.10001e-06 [overlap_opt_shard_in_pipeline]: 1.77001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.03002e-06 [control_data_broadcast_order]: 1.734e-05 [grouped_pairwise_exchange_alltoall]: 1.69e-06 [offloading_packed_experts]: 4.99e-06 [overlap_recompute_and_grad_model_parallel]: 5.60001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.38002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.49e-06 [overlap_recompute_comm]: 2.46998e-06 [overlap_grad_ring_attention]: 4.50001e-06 [overlap_grad_flash_sp]: 2.742e-05 [begin_end_overlap_inline]: 1.07e-06 [split_matmul_comm_elemetwise]: 2.64001e-06 [split_layernorm_comm]: 2.11998e-06 [handle_group_info]: 9.89996e-07 [symbol_engine_optimizer]: 0.00013608, [1] [Cycle 1]: 0.00013126, [6] [build]: 5.00999e-06 [elim_shapecalc]: 1.42e-05 [elim_not_effective]: 5.505e-05 [opt_reshape]: 9.12999e-06 [fold_const_symbol]: 1.358e-05 [renormalize]: 3.50003e-07 [detach_backward]: 2.29999e-06 [pipeline_parallel_scheduler]: 1.65001e-06 [auto_monad_reorder]: 2.399e-05 [get_jit_bprop_graph]: 2.32999e-06 [rewriter_after_jit_bprop_graph]: 6.42001e-06 [opt_after_jit_grad]: 0.0126542 [validate]: 7.147e-05 Sums bootstrap : 0.000456s : 1.77% type_inference : 0.006663s : 25.81% event_method : 0.000023s : 0.09% auto_monad : 0.000073s : 0.28% graph_reusing : 0.000007s : 0.03% inline : 0.000004s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000031s : 0.12% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.03% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000046s : 0.18% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000038s : 0.15% optimize.rewriter_before_opt_a : 0.000109s : 0.42% optimize.opt_a.expand_dump_flag : 0.000007s : 0.03% optimize.opt_a.switch_simplify : 0.000059s : 0.23% optimize.opt_a.loop_unroll : 0.000040s : 0.16% optimize.opt_a.a_1 : 0.000993s : 3.85% optimize.opt_a.with_stream_mark : 0.000046s : 0.18% optimize.opt_a.recompute_prepare : 0.000023s : 0.09% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000222s : 0.86% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.08% optimize.opt_a.shard : 0.000005s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.02% optimize.opt_a.shard_inline : 0.000018s : 0.07% optimize.opt_a.merge_send_recv : 0.000021s : 0.08% optimize.opt_a.auto_parallel : 0.000021s : 0.08% optimize.opt_a.parallel : 0.000032s : 0.12% optimize.opt_a.flash_sp : 0.000016s : 0.06% optimize.opt_a.merge_comm : 0.000010s : 0.04% optimize.opt_a.allreduce_fusion : 0.000009s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.08% optimize.opt_a.virtual_dataset : 0.000017s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.07% optimize.opt_a.virtual_output : 0.000016s : 0.06% optimize.opt_a.merge_forward : 0.000012s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000028s : 0.11% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.14% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000031s : 0.12% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.04% optimize.opt_a.meta_fg_expand : 0.000008s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000036s : 0.14% optimize.opt_a.a_after_grad : 0.000028s : 0.11% optimize.opt_a.renormalize : 0.001221s : 4.73% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.05% optimize.opt_a.auto_monad_grad : 0.000005s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000042s : 0.16% optimize.opt_a.cse : 0.000073s : 0.28% optimize.opt_a.a_3 : 0.000123s : 0.47% optimize.py_interpret_to_execute_after_opt_a : 0.000023s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000052s : 0.20% optimize.convert_after_rewriter : 0.000009s : 0.04% optimize.order_py_execute_after_rewriter : 0.000006s : 0.03% optimize.mutable_eliminate : 0.000803s : 3.11% optimize.opt_b.b_1 : 0.000167s : 0.65% optimize.opt_b.b_2 : 0.000011s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000013s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000039s : 0.15% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.10% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000042s : 0.16% optimize.loop_unroll : 0.000533s : 2.06% optimize.opt_after_cconv.c_1 : 0.000041s : 0.16% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000030s : 0.12% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.08% optimize.tuple_transform.d_1 : 0.000061s : 0.24% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.04% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000071s : 0.27% optimize.cse_after_recomputation.cse : 0.000017s : 0.07% optimize.environ_conv : 0.000008s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000004s : 0.01% optimize.label_micro_interleaved_index : 0.000006s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000002s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000027s : 0.11% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000055s : 0.21% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.09% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.012654s : 49.01% validate : 0.000071s : 0.28% Time group info: ------[substitution.] 0.000319 38 11.60% : 0.000037s : 3: substitution.cast_eliminate 0.83% : 0.000003s : 3: substitution.elim_not_effective 0.69% : 0.000002s : 3: substitution.fold_const_symbol 2.39% : 0.000008s : 5: substitution.graph_param_transform 72.81% : 0.000232s : 4: substitution.inline 2.26% : 0.000007s : 6: substitution.j_node_and_user_rematch 2.46% : 0.000008s : 6: substitution.remove_not_recompute_node 2.44% : 0.000008s : 4: substitution.replace_old_param 4.51% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006590 2 86.52% : 0.005702s : 1: type_inference.infer 13.48% : 0.000888s : 1: type_inference.specialize ------[replace.] 0.000073 8 62.83% : 0.000046s : 4: replace.inline 37.17% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000241 8 94.82% : 0.000228s : 4: match.inline 5.18% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000272 1504 0.84% : 0.000002s : 15: predicate.accumulaten_eliminater 2.63% : 0.000007s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 10: predicate.addn_check_dump 0.83% : 0.000002s : 15: predicate.addn_zero_filter 0.77% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.25% : 0.000006s : 25: predicate.arithmetic_simplify 1.05% : 0.000003s : 15: predicate.cast_eliminate 0.58% : 0.000002s : 10: predicate.check_bprop_eliminate 0.50% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.61% : 0.000002s : 10: predicate.depend_value_elim 0.81% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.86% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 15: predicate.dict_set_item_eliminator 2.03% : 0.000006s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 0.46% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_depend_swap 1.74% : 0.000005s : 30: predicate.environ_get_eliminate 1.04% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.19% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.36% : 0.000006s : 23: predicate.float_depend_g_call 0.55% : 0.000002s : 10: predicate.float_environ_get_switch 0.80% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 5: predicate.fold_const_symbol 0.66% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.63% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.14% : 0.000017s : 68: predicate.inline 0.98% : 0.000003s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.03% : 0.000003s : 10: predicate.less_batch_normalization 1.83% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.27% : 0.000006s : 44: predicate.load_eliminater 0.89% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.96% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.70% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 10: predicate.merge_addn 0.56% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.56% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.75% : 0.000002s : 15: predicate.minmaximum_grad 1.38% : 0.000004s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.35% : 0.000001s : 5: predicate.parallel_virtual_node 1.58% : 0.000004s : 23: predicate.partial_defer_inline 1.45% : 0.000004s : 24: predicate.partial_eliminate 0.90% : 0.000002s : 15: predicate.print_const_string_wrapper 0.65% : 0.000002s : 10: predicate.reduce_all_const_elim 1.15% : 0.000003s : 15: predicate.reduce_eliminate 2.22% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 10: predicate.remove_not_recompute_node 1.30% : 0.000004s : 29: predicate.replace_applicator 0.49% : 0.000001s : 10: predicate.replace_old_param 0.32% : 0.000001s : 5: predicate.reset_defer_inline 0.93% : 0.000003s : 15: predicate.reshape_eliminate 0.72% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 5: predicate.row_tensor_eliminate 0.95% : 0.000003s : 10: predicate.same_eliminate 0.50% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 10: predicate.shard_identity_eliminate 0.76% : 0.000002s : 10: predicate.special_op_eliminate 0.71% : 0.000002s : 10: predicate.specialize_transform 1.35% : 0.000004s : 10: predicate.split_environ_get_set_with_tuple_value 1.04% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.24% : 0.000003s : 23: predicate.switch_defer_inline 1.73% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.77% : 0.000013s : 74: predicate.switch_simplify 0.82% : 0.000002s : 15: predicate.tile_eliminate 0.84% : 0.000002s : 15: predicate.transpose_eliminate 1.68% : 0.000005s : 25: predicate.tuple_list_convert_item_index_to_positive 1.48% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.32% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.12% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.46% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.29% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.74% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.22% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 2.86% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 5: predicate.value_based_eliminate 0.68% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.61% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.58% : 0.000002s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000717 11 51.27% : 0.000368s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.73% : 0.000350s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.044275 192 0.01% : 0.000004s : 1: ForceFp32Comm 8.54% : 0.003782s : 1: add_attr 8.50% : 0.003765s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.17% : 0.000075s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.18% : 0.000079s : 1: auto_monad 0.06% : 0.000029s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000007s : 1: bias_add_comm_swap 1.10% : 0.000487s : 1: bootstrap 0.10% : 0.000046s : 1: cconv 0.01% : 0.000005s : 1: comm_op_add_attrs 0.05% : 0.000020s : 1: control_data_broadcast_order 0.03% : 0.000012s : 1: convert_after_rewriter 0.07% : 0.000032s : 1: cse_after_recomputation 0.01% : 0.000006s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.07% : 0.000030s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000007s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000007s : 1: label_fine_grained_interleaved_index 0.02% : 0.000009s : 1: label_micro_interleaved_index 1.23% : 0.000542s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.84% : 0.000816s : 1: mutable_eliminate 0.02% : 0.000008s : 1: offloading_packed_experts 0.04% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000027s : 1: opt.transform.mutable_eliminate 3.58% : 0.001587s : 78: opt.transform.opt_a 0.09% : 0.000039s : 1: opt.transform.opt_after_cconv 0.16% : 0.000072s : 1: opt.transform.opt_after_jit_grad 0.32% : 0.000141s : 28: opt.transform.opt_b 0.15% : 0.000068s : 2: opt.transform.opt_trans_graph 0.20% : 0.000088s : 4: opt.transform.symbol_engine_opt 8.85% : 0.003918s : 1: opt_a 0.31% : 0.000137s : 1: opt_after_cconv 28.64% : 0.012679s : 1: opt_after_jit_grad 0.66% : 0.000292s : 1: opt_b 15.17% : 0.006714s : 1: optimize 0.07% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000010s : 1: order_py_execute_after_rewriter 0.07% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.11% : 0.000050s : 1: pre_auto_parallel 0.10% : 0.000043s : 1: py_interpret_to_execute 0.06% : 0.000027s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000005s : 1: remove_cast_before_assign_add 0.05% : 0.000023s : 1: remove_dup_value 1.48% : 0.000656s : 1: renormalize.infer 1.25% : 0.000552s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.13% : 0.000059s : 1: rewriter_after_opt_a 0.26% : 0.000114s : 1: rewriter_before_opt_a 0.01% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.31% : 0.000139s : 1: symbol_engine_optimizer 0.23% : 0.000101s : 1: tuple_transform 15.10% : 0.006686s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:29.279.449 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:29.279.736 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.288751, [21] [bootstrap]: 0.00045141 [type_inference]: 0.271199 [event_method]: 2.418e-05 [auto_monad]: 7.502e-05 [graph_reusing]: 6.16e-06 [inline]: 4.21001e-06 [add_attr]: 0.0065006, [1] [add_attr_with_inline]: 0.00648186, [1] [Cycle 1]: 0.00026337, [2] [tag_attr]: 3.418e-05 [meta_addattr_fg_expand]: 7.33e-06 [parallel-infer-symbol]: 4.22003e-06 [pre_auto_parallel]: 5.329e-05 [insert-virtual-dataset]: 2.43998e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 2.04e-06 [pipeline_split]: 1.87001e-06 [optimize]: 0.00782105, [53] [py_interpret_to_execute]: 5.275e-05 [rewriter_before_opt_a]: 0.00013028 [opt_a]: 0.00440879, [2] [Cycle 1]: 0.00329963, [45] [expand_dump_flag]: 3.51001e-06 [switch_simplify]: 4.59e-05 [loop_unroll]: 3.074e-05 [a_1]: 0.00081005 [with_stream_mark]: 2.282e-05 [recompute_prepare]: 1.232e-05 [updatestate_depend_eliminate]: 5.83002e-06 [updatestate_assign_eliminate]: 4.05e-06 [updatestate_loads_eliminate]: 4.08999e-06 [parameter_eliminate]: 2.29999e-06 [a_2]: 0.00014035 [accelerated_algorithm]: 1.191e-05 [shard]: 2.54001e-06 [meta_shard_fg_expand]: 2.21e-06 [shard_inline]: 8.57e-06 [merge_send_recv]: 1.227e-05 [auto_parallel]: 7.66999e-06 [parallel]: 2.151e-05 [flash_sp]: 9.75002e-06 [merge_comm]: 6.29999e-06 [allreduce_fusion]: 5.22e-06 [matmul_add_comm_reduction]: 1.256e-05 [allreduce_slice_to_reducescatter]: 9.50007e-07 [virtual_shard_identity]: 1.109e-05 [virtual_dataset]: 9.42999e-06 [get_grad_eliminate_]: 8.92e-06 [virtual_output]: 8.94e-06 [merge_forward]: 4.47e-06 [cell_reuse_recompute_pass]: 1.43002e-06 [offload_activation]: 1.241e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.026e-05 [merge_recompute_call_nodes]: 2.04999e-06 [before_grad]: 1.526e-05 [set_forward_comm_id_for_comm_node_pass]: 4.79e-06 [meta_fg_expand]: 4.87e-06 [flash_sp_send_recv_attached]: 2.68e-06 [receive_attached]: 3.14999e-06 [after_resolve]: 1.516e-05 [a_after_grad]: 1.497e-05 [renormalize]: 0.00133227 [add_forward_monad_depend]: 8.1e-06 [auto_monad_grad]: 2.41e-06 [auto_monad_eliminator]: 6.983e-05 [cse]: 4.716e-05 [a_3]: 8.289e-05 [Cycle 2]: 0.00108968, [45] [expand_dump_flag]: 3.41001e-06 [switch_simplify]: 1.102e-05 [loop_unroll]: 8.02998e-06 [a_1]: 0.0001986 [with_stream_mark]: 2.187e-05 [recompute_prepare]: 8.21002e-06 [updatestate_depend_eliminate]: 4.52e-06 [updatestate_assign_eliminate]: 3.27002e-06 [updatestate_loads_eliminate]: 2.88e-06 [parameter_eliminate]: 1.42999e-06 [a_2]: 0.00012567 [accelerated_algorithm]: 1.059e-05 [shard]: 2.58e-06 [meta_shard_fg_expand]: 2.23002e-06 [shard_inline]: 9.34e-06 [merge_send_recv]: 9.56e-06 [auto_parallel]: 1.026e-05 [parallel]: 8.84e-06 [flash_sp]: 4.86002e-06 [merge_comm]: 4.37e-06 [allreduce_fusion]: 5.22999e-06 [matmul_add_comm_reduction]: 1.02e-05 [allreduce_slice_to_reducescatter]: 4.69998e-07 [virtual_shard_identity]: 9.70002e-06 [virtual_dataset]: 8.77e-06 [get_grad_eliminate_]: 7.77e-06 [virtual_output]: 8.18001e-06 [merge_forward]: 4.78001e-06 [cell_reuse_recompute_pass]: 5.46e-06 [offload_activation]: 1.134e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.028e-05 [merge_recompute_call_nodes]: 1.35999e-06 [before_grad]: 1.434e-05 [set_forward_comm_id_for_comm_node_pass]: 4.65999e-06 [meta_fg_expand]: 4.4e-06 [flash_sp_send_recv_attached]: 1.82001e-06 [receive_attached]: 2.85002e-06 [after_resolve]: 1.465e-05 [a_after_grad]: 1.36e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.77999e-06 [auto_monad_grad]: 2.17001e-06 [auto_monad_eliminator]: 1.381e-05 [cse]: 2.81e-05 [a_3]: 6.418e-05 [py_interpret_to_execute_after_opt_a]: 2.775e-05 [slice_cell_reuse_recomputed_activation]: 5.15001e-06 [rewriter_after_opt_a]: 6.001e-05 [convert_after_rewriter]: 1.219e-05 [order_py_execute_after_rewriter]: 8.95001e-06 [mutable_eliminate]: 0.00100919 [opt_b]: 0.00035433, [1] [Cycle 1]: 0.00034031, [7] [b_1]: 0.00021953 [b_2]: 1.094e-05 [updatestate_depend_eliminate]: 8.87999e-06 [updatestate_assign_eliminate]: 3.09001e-06 [updatestate_loads_eliminate]: 3.06001e-06 [renormalize]: 6.10016e-07 [cse]: 3.238e-05 [optimize_parallel_all_gather_comm]: 2.834e-05 [overlap_param_gather]: 8.12e-06 [cconv]: 4.05e-05 [loop_unroll]: 0.00060008 [opt_after_cconv]: 0.00015911, [1] [Cycle 1]: 0.00014996, [7] [c_1]: 4.088e-05 [parameter_eliminate]: 4.76002e-06 [updatestate_depend_eliminate]: 7.39002e-06 [updatestate_assign_eliminate]: 3.78999e-06 [updatestate_loads_eliminate]: 2.91e-06 [cse]: 3.023e-05 [renormalize]: 7.09988e-07 [remove_dup_value]: 2.36e-05 [tuple_transform]: 0.00011339, [1] [Cycle 1]: 0.00010617, [4] [d_1]: 6.192e-05 [none_parameter_eliminate]: 2.06998e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 9.59999e-06 [partial_unused_args_eliminate]: 4.89e-06 [add_recomputation]: 7.446e-05 [cse_after_recomputation]: 3.885e-05, [1] [Cycle 1]: 2.991e-05, [1] [cse]: 1.938e-05 [environ_conv]: 1.257e-05 [swap_dp_allreduce_reducescatter]: 5.444e-05 [bias_add_comm_swap]: 6.23002e-06 [label_micro_interleaved_index]: 1.414e-05 [label_fine_grained_interleaved_index]: 6.34001e-06 [merge_cast_opt]: 4.51002e-06 [slice_recompute_activation]: 4.66002e-06 [micro_interleaved_order_control]: 6.56999e-06 [assign_add_opt]: 4.42e-06 [ForceFp32Comm]: 3.62002e-06 [remove_cast_before_assign_add]: 3.48e-06 [full_micro_interleaved_order_control]: 5.71998e-06 [reorder_send_recv_between_fp_bp]: 6.74999e-06 [comm_op_add_attrs]: 3.69002e-06 [add_comm_op_reuse_tag]: 3.27002e-06 [interleave_split_concat_branches]: 3.68999e-06 [interleave_parallel_branches]: 4.28001e-06 [overlap_opt_shard_in_pipeline]: 5.34998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.25e-06 [control_data_broadcast_order]: 2.473e-05 [grouped_pairwise_exchange_alltoall]: 4.43001e-06 [offloading_packed_experts]: 1.045e-05 [overlap_recompute_and_grad_model_parallel]: 1.248e-05 [overlap_grad_matmul_and_grad_allreduce]: 3.64002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.75e-06 [overlap_recompute_comm]: 5.65001e-06 [overlap_grad_ring_attention]: 7.83001e-06 [overlap_grad_flash_sp]: 3.374e-05 [begin_end_overlap_inline]: 3.01001e-06 [split_matmul_comm_elemetwise]: 4.59998e-06 [split_layernorm_comm]: 4.21001e-06 [handle_group_info]: 4.07e-06 [symbol_engine_optimizer]: 0.00013658, [1] [Cycle 1]: 0.00012704, [6] [build]: 6.48e-06 [elim_shapecalc]: 1.602e-05 [elim_not_effective]: 2.196e-05 [opt_reshape]: 1.166e-05 [fold_const_symbol]: 1.403e-05 [renormalize]: 2.60014e-07 [detach_backward]: 5.89999e-06 [pipeline_parallel_scheduler]: 2.43e-06 [auto_monad_reorder]: 3.226e-05 [get_jit_bprop_graph]: 2.29999e-06 [rewriter_after_jit_bprop_graph]: 1.015e-05 [opt_after_jit_grad]: 0.00089125 [validate]: 6.694e-05 Sums bootstrap : 0.000451s : 0.16% type_inference : 0.271199s : 97.08% event_method : 0.000024s : 0.01% auto_monad : 0.000075s : 0.03% graph_reusing : 0.000006s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000034s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000053s : 0.02% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000053s : 0.02% optimize.rewriter_before_opt_a : 0.000130s : 0.05% optimize.opt_a.expand_dump_flag : 0.000007s : 0.00% optimize.opt_a.switch_simplify : 0.000057s : 0.02% optimize.opt_a.loop_unroll : 0.000039s : 0.01% optimize.opt_a.a_1 : 0.001009s : 0.36% optimize.opt_a.with_stream_mark : 0.000045s : 0.02% optimize.opt_a.recompute_prepare : 0.000021s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000266s : 0.10% optimize.opt_a.accelerated_algorithm : 0.000023s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000018s : 0.01% optimize.opt_a.merge_send_recv : 0.000022s : 0.01% optimize.opt_a.auto_parallel : 0.000018s : 0.01% optimize.opt_a.parallel : 0.000030s : 0.01% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000011s : 0.00% optimize.opt_a.allreduce_fusion : 0.000010s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.01% optimize.opt_a.virtual_dataset : 0.000018s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.01% optimize.opt_a.virtual_output : 0.000017s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000007s : 0.00% optimize.opt_a.offload_activation : 0.000024s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000041s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000030s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.00% optimize.opt_a.meta_fg_expand : 0.000009s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000006s : 0.00% optimize.opt_a.after_resolve : 0.000030s : 0.01% optimize.opt_a.a_after_grad : 0.000029s : 0.01% optimize.opt_a.renormalize : 0.001332s : 0.48% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.00% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000084s : 0.03% optimize.opt_a.cse : 0.000075s : 0.03% optimize.opt_a.a_3 : 0.000147s : 0.05% optimize.py_interpret_to_execute_after_opt_a : 0.000028s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000060s : 0.02% optimize.convert_after_rewriter : 0.000012s : 0.00% optimize.order_py_execute_after_rewriter : 0.000009s : 0.00% optimize.mutable_eliminate : 0.001009s : 0.36% optimize.opt_b.b_1 : 0.000220s : 0.08% optimize.opt_b.b_2 : 0.000011s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000032s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.01% optimize.overlap_param_gather : 0.000008s : 0.00% optimize.cconv : 0.000040s : 0.01% optimize.loop_unroll : 0.000600s : 0.21% optimize.opt_after_cconv.c_1 : 0.000041s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000030s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000024s : 0.01% optimize.tuple_transform.d_1 : 0.000062s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000074s : 0.03% optimize.cse_after_recomputation.cse : 0.000019s : 0.01% optimize.environ_conv : 0.000013s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000054s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000014s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.00% optimize.merge_cast_opt : 0.000005s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000007s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000003s : 0.00% optimize.full_micro_interleaved_order_control : 0.000006s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000007s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000005s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000025s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000010s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000012s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000006s : 0.00% optimize.overlap_grad_ring_attention : 0.000008s : 0.00% optimize.overlap_grad_flash_sp : 0.000034s : 0.01% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000006s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000022s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000012s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000032s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000010s : 0.00% opt_after_jit_grad : 0.000891s : 0.32% validate : 0.000067s : 0.02% Time group info: ------[substitution.] 0.000310 38 10.58% : 0.000033s : 3: substitution.cast_eliminate 0.87% : 0.000003s : 3: substitution.elim_not_effective 0.72% : 0.000002s : 3: substitution.fold_const_symbol 2.78% : 0.000009s : 5: substitution.graph_param_transform 73.23% : 0.000227s : 4: substitution.inline 1.85% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.42% : 0.000008s : 6: substitution.remove_not_recompute_node 2.19% : 0.000007s : 4: substitution.replace_old_param 5.36% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.271117 2 99.55% : 0.269889s : 1: type_inference.infer 0.45% : 0.001228s : 1: type_inference.specialize ------[replace.] 0.000076 8 63.92% : 0.000049s : 4: replace.inline 36.08% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000239 8 93.82% : 0.000224s : 4: match.inline 6.18% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000290 1504 0.76% : 0.000002s : 15: predicate.accumulaten_eliminater 0.92% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000002s : 10: predicate.addn_check_dump 0.73% : 0.000002s : 15: predicate.addn_zero_filter 0.72% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.22% : 0.000006s : 25: predicate.arithmetic_simplify 0.90% : 0.000003s : 15: predicate.cast_eliminate 0.78% : 0.000002s : 10: predicate.check_bprop_eliminate 0.49% : 0.000001s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 1.29% : 0.000004s : 10: predicate.depend_value_elim 0.78% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 2.29% : 0.000007s : 15: predicate.dict_get_item_eliminator 0.74% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.56% : 0.000005s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.88% : 0.000003s : 5: predicate.elim_shapecalc_of_broadcastargs 1.01% : 0.000003s : 20: predicate.environ_add_const_eliminate 0.94% : 0.000003s : 20: predicate.environ_get_add_eliminate 0.93% : 0.000003s : 20: predicate.environ_get_depend_swap 2.29% : 0.000007s : 30: predicate.environ_get_eliminate 1.29% : 0.000004s : 20: predicate.environ_get_set_eliminate 1.10% : 0.000003s : 23: predicate.exchange_switch_depend_value 1.97% : 0.000006s : 23: predicate.float_depend_g_call 0.53% : 0.000002s : 10: predicate.float_environ_get_switch 0.78% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 5: predicate.fold_const_symbol 0.64% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.54% : 0.000002s : 10: predicate.incorporate_call 0.46% : 0.000001s : 10: predicate.incorporate_call_switch 5.60% : 0.000016s : 68: predicate.inline 0.84% : 0.000002s : 10: predicate.inline_without_move 0.40% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.88% : 0.000005s : 10: predicate.less_batch_normalization 1.58% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.15% : 0.000006s : 44: predicate.load_eliminater 0.99% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.83% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.85% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 0.57% : 0.000002s : 10: predicate.merge_addn 0.90% : 0.000003s : 10: predicate.micro_step_allgather_replace 1.17% : 0.000003s : 10: predicate.mini_step_allgather_replace 0.68% : 0.000002s : 15: predicate.minmaximum_grad 1.24% : 0.000004s : 5: predicate.mutable_eliminate 0.43% : 0.000001s : 5: predicate.opt_reshape 0.40% : 0.000001s : 5: predicate.parallel_virtual_node 1.52% : 0.000004s : 23: predicate.partial_defer_inline 1.35% : 0.000004s : 24: predicate.partial_eliminate 0.70% : 0.000002s : 15: predicate.print_const_string_wrapper 0.79% : 0.000002s : 10: predicate.reduce_all_const_elim 1.40% : 0.000004s : 15: predicate.reduce_eliminate 2.12% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 10: predicate.remove_not_recompute_node 1.20% : 0.000003s : 29: predicate.replace_applicator 0.44% : 0.000001s : 10: predicate.replace_old_param 0.39% : 0.000001s : 5: predicate.reset_defer_inline 1.20% : 0.000003s : 15: predicate.reshape_eliminate 0.78% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 5: predicate.row_tensor_eliminate 0.77% : 0.000002s : 10: predicate.same_eliminate 0.38% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 10: predicate.shard_identity_eliminate 1.22% : 0.000004s : 10: predicate.special_op_eliminate 0.67% : 0.000002s : 10: predicate.specialize_transform 1.39% : 0.000004s : 10: predicate.split_environ_get_set_with_tuple_value 0.73% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.18% : 0.000003s : 23: predicate.switch_defer_inline 1.64% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.36% : 0.000013s : 74: predicate.switch_simplify 0.75% : 0.000002s : 15: predicate.tile_eliminate 0.78% : 0.000002s : 15: predicate.transpose_eliminate 1.56% : 0.000005s : 25: predicate.tuple_list_convert_item_index_to_positive 1.47% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 2.51% : 0.000007s : 25: predicate.tuple_list_get_item_depend_reorder 2.84% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.31% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 1.92% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.67% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.30% : 0.000007s : 44: predicate.updatestate_pure_node_eliminater 2.68% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.76% : 0.000002s : 5: predicate.value_based_eliminate 0.71% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.57% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.61% : 0.000002s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000868 11 39.77% : 0.000345s : 5: func_graph_cloner_run.FuncGraphClonerGraph 60.23% : 0.000523s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.306199 192 0.00% : 0.000006s : 1: ForceFp32Comm 2.13% : 0.006517s : 1: add_attr 2.12% : 0.006486s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.03% : 0.000078s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.03% : 0.000087s : 1: auto_monad 0.01% : 0.000041s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.16% : 0.000498s : 1: bootstrap 0.01% : 0.000044s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000028s : 1: control_data_broadcast_order 0.01% : 0.000015s : 1: convert_after_rewriter 0.01% : 0.000043s : 1: cse_after_recomputation 0.00% : 0.000007s : 1: dataset_repeat_opt 0.01% : 0.000032s : 1: detach_backward 0.01% : 0.000015s : 1: environ_conv 0.01% : 0.000035s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.00% : 0.000012s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000011s : 1: inline 0.00% : 0.000010s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000017s : 1: label_micro_interleaved_index 0.20% : 0.000608s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000009s : 1: micro_interleaved_order_control 0.33% : 0.001019s : 1: mutable_eliminate 0.00% : 0.000013s : 1: offloading_packed_experts 0.01% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000022s : 1: opt.transform.mutable_eliminate 0.52% : 0.001591s : 78: opt.transform.opt_a 0.01% : 0.000039s : 1: opt.transform.opt_after_cconv 0.01% : 0.000042s : 1: opt.transform.opt_after_jit_grad 0.05% : 0.000151s : 28: opt.transform.opt_b 0.02% : 0.000069s : 2: opt.transform.opt_trans_graph 0.02% : 0.000060s : 4: opt.transform.symbol_engine_opt 1.44% : 0.004412s : 1: opt_a 0.05% : 0.000163s : 1: opt_after_cconv 0.30% : 0.000905s : 1: opt_after_jit_grad 0.12% : 0.000359s : 1: opt_b 2.97% : 0.009092s : 1: optimize 0.01% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000012s : 1: order_py_execute_after_rewriter 0.01% : 0.000037s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000013s : 1: overlap_grad_ring_attention 0.00% : 0.000010s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000008s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000012s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000015s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000009s : 1: overlap_recompute_comm 0.00% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000009s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.02% : 0.000061s : 1: pre_auto_parallel 0.02% : 0.000058s : 1: py_interpret_to_execute 0.01% : 0.000031s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000008s : 1: remove_cast_before_assign_add 0.01% : 0.000027s : 1: remove_dup_value 0.26% : 0.000782s : 1: renormalize.infer 0.18% : 0.000540s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000016s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000064s : 1: rewriter_after_opt_a 0.04% : 0.000134s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000007s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000058s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000139s : 1: symbol_engine_optimizer 0.04% : 0.000116s : 1: tuple_transform 88.59% : 0.271271s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:32.355.26 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.240399, [21] [bootstrap]: 0.00048869 [type_inference]: 0.229101 [event_method]: 2.401e-05 [auto_monad]: 7.549e-05 [graph_reusing]: 6.25997e-06 [inline]: 3.43e-06 [add_attr]: 0.00378429, [1] [add_attr_with_inline]: 0.00377138, [1] [Cycle 1]: 7.852e-05, [2] [tag_attr]: 2.726e-05 [meta_addattr_fg_expand]: 6.19999e-06 [parallel-infer-symbol]: 3.85e-06 [pre_auto_parallel]: 4.386e-05 [insert-virtual-dataset]: 2.51e-06 [parallel-infer-symbol-second]: 9.29984e-07 [dataset_repeat_opt]: 1.99999e-06 [pipeline_split]: 1.79998e-06 [optimize]: 0.00597097, [53] [py_interpret_to_execute]: 3.283e-05 [rewriter_before_opt_a]: 0.00010046 [opt_a]: 0.00348173, [2] [Cycle 1]: 0.00263394, [45] [expand_dump_flag]: 3.01001e-06 [switch_simplify]: 4.932e-05 [loop_unroll]: 3.165e-05 [a_1]: 0.00073706 [with_stream_mark]: 2.05e-05 [recompute_prepare]: 1.203e-05 [updatestate_depend_eliminate]: 5.27001e-06 [updatestate_assign_eliminate]: 4.1e-06 [updatestate_loads_eliminate]: 3.82998e-06 [parameter_eliminate]: 1.96e-06 [a_2]: 0.00010941 [accelerated_algorithm]: 9.27001e-06 [shard]: 2.53003e-06 [meta_shard_fg_expand]: 2.71999e-06 [shard_inline]: 8.75999e-06 [merge_send_recv]: 1.015e-05 [auto_parallel]: 8.70001e-06 [parallel]: 2.262e-05 [flash_sp]: 1.057e-05 [merge_comm]: 5.32001e-06 [allreduce_fusion]: 4.50001e-06 [matmul_add_comm_reduction]: 1.089e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 1.105e-05 [virtual_dataset]: 8.12998e-06 [get_grad_eliminate_]: 8.15e-06 [virtual_output]: 8.18999e-06 [merge_forward]: 4.93001e-06 [cell_reuse_recompute_pass]: 1.28002e-06 [offload_activation]: 1.187e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.659e-05 [merge_recompute_call_nodes]: 1.71e-06 [before_grad]: 1.463e-05 [set_forward_comm_id_for_comm_node_pass]: 4.56002e-06 [meta_fg_expand]: 3.74002e-06 [flash_sp_send_recv_attached]: 2.73998e-06 [receive_attached]: 2.29001e-06 [after_resolve]: 1.464e-05 [a_after_grad]: 1.292e-05 [renormalize]: 0.00100623 [add_forward_monad_depend]: 2.852e-05 [auto_monad_grad]: 3.66001e-06 [auto_monad_eliminator]: 2.019e-05 [cse]: 3.922e-05 [a_3]: 6.436e-05 [Cycle 2]: 0.00083556, [45] [expand_dump_flag]: 1.95001e-06 [switch_simplify]: 9.46e-06 [loop_unroll]: 8.1e-06 [a_1]: 0.00018568 [with_stream_mark]: 1.708e-05 [recompute_prepare]: 8.62998e-06 [updatestate_depend_eliminate]: 4.18001e-06 [updatestate_assign_eliminate]: 3.48999e-06 [updatestate_loads_eliminate]: 3.16001e-06 [parameter_eliminate]: 1.66e-06 [a_2]: 9.434e-05 [accelerated_algorithm]: 8.23001e-06 [shard]: 1.66e-06 [meta_shard_fg_expand]: 2.44001e-06 [shard_inline]: 7.92e-06 [merge_send_recv]: 8.06001e-06 [auto_parallel]: 7.11001e-06 [parallel]: 6.79999e-06 [flash_sp]: 4.13001e-06 [merge_comm]: 4.79998e-06 [allreduce_fusion]: 4.87e-06 [matmul_add_comm_reduction]: 8.05e-06 [allreduce_slice_to_reducescatter]: 5.49975e-07 [virtual_shard_identity]: 9.49e-06 [virtual_dataset]: 7.71001e-06 [get_grad_eliminate_]: 7.66999e-06 [virtual_output]: 7.26999e-06 [merge_forward]: 4.09997e-06 [cell_reuse_recompute_pass]: 1.81e-06 [offload_activation]: 9.86998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.626e-05 [merge_recompute_call_nodes]: 1.32999e-06 [before_grad]: 1.329e-05 [set_forward_comm_id_for_comm_node_pass]: 5.54e-06 [meta_fg_expand]: 3.14999e-06 [flash_sp_send_recv_attached]: 1.37999e-06 [receive_attached]: 1.55999e-06 [after_resolve]: 1.41e-05 [a_after_grad]: 1.207e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.87001e-06 [auto_monad_grad]: 1.80001e-06 [auto_monad_eliminator]: 1.076e-05 [cse]: 2.252e-05 [a_3]: 4.71e-05 [py_interpret_to_execute_after_opt_a]: 1.275e-05 [slice_cell_reuse_recomputed_activation]: 2.36e-06 [rewriter_after_opt_a]: 4.86e-05 [convert_after_rewriter]: 8.90999e-06 [order_py_execute_after_rewriter]: 6.45002e-06 [mutable_eliminate]: 0.0006754 [opt_b]: 0.00025963, [1] [Cycle 1]: 0.00025278, [7] [b_1]: 0.0001598 [b_2]: 1.082e-05 [updatestate_depend_eliminate]: 8.2e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 3.25e-06 [renormalize]: 7.7e-07 [cse]: 2.741e-05 [optimize_parallel_all_gather_comm]: 1.912e-05 [overlap_param_gather]: 2.16998e-06 [cconv]: 3.293e-05 [loop_unroll]: 0.00047094 [opt_after_cconv]: 0.00012139, [1] [Cycle 1]: 0.00011497, [7] [c_1]: 3.816e-05 [parameter_eliminate]: 3.38999e-06 [updatestate_depend_eliminate]: 6.48998e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 2.90998e-06 [cse]: 2.496e-05 [renormalize]: 3.59985e-07 [remove_dup_value]: 1.644e-05 [tuple_transform]: 0.00013579, [1] [Cycle 1]: 0.00013045, [4] [d_1]: 5.428e-05 [none_parameter_eliminate]: 2.91999e-06 [renormalize]: 2.89991e-07 [switch_simplify]: 1.068e-05 [partial_unused_args_eliminate]: 2.04e-06 [add_recomputation]: 6.544e-05 [cse_after_recomputation]: 3.114e-05, [1] [Cycle 1]: 2.588e-05, [1] [cse]: 1.998e-05 [environ_conv]: 6.69001e-06 [swap_dp_allreduce_reducescatter]: 6.73e-06 [bias_add_comm_swap]: 3.58e-06 [label_micro_interleaved_index]: 5.51e-06 [label_fine_grained_interleaved_index]: 2.76999e-06 [merge_cast_opt]: 1.42999e-06 [slice_recompute_activation]: 2.07999e-06 [micro_interleaved_order_control]: 2.34999e-06 [assign_add_opt]: 1.25999e-06 [ForceFp32Comm]: 1.12e-06 [remove_cast_before_assign_add]: 1.19e-06 [full_micro_interleaved_order_control]: 2.21003e-06 [reorder_send_recv_between_fp_bp]: 3.11001e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.37e-06 [overlap_opt_shard_in_pipeline]: 1.24e-06 [overlap_opt_shard_grad_in_pipeline]: 1.92999e-06 [control_data_broadcast_order]: 1.557e-05 [grouped_pairwise_exchange_alltoall]: 1.83997e-06 [offloading_packed_experts]: 5.02e-06 [overlap_recompute_and_grad_model_parallel]: 5.87999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.30999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.61998e-06 [overlap_recompute_comm]: 2.58e-06 [overlap_grad_ring_attention]: 4.57e-06 [overlap_grad_flash_sp]: 2.497e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.35002e-06 [split_layernorm_comm]: 1.80001e-06 [handle_group_info]: 1.14e-06 [symbol_engine_optimizer]: 9.462e-05, [1] [Cycle 1]: 8.925e-05, [6] [build]: 4.68001e-06 [elim_shapecalc]: 1.323e-05 [elim_not_effective]: 1.749e-05 [opt_reshape]: 9.29e-06 [fold_const_symbol]: 1.369e-05 [renormalize]: 3.00002e-07 [detach_backward]: 2.80997e-06 [pipeline_parallel_scheduler]: 1.58002e-06 [auto_monad_reorder]: 2.141e-05 [get_jit_bprop_graph]: 2.19001e-06 [rewriter_after_jit_bprop_graph]: 3.93001e-06 [opt_after_jit_grad]: 0.00052871 [validate]: 0.00015548 Sums bootstrap : 0.000489s : 0.21% type_inference : 0.229101s : 97.27% event_method : 0.000024s : 0.01% auto_monad : 0.000075s : 0.03% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000044s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.01% optimize.rewriter_before_opt_a : 0.000100s : 0.04% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000059s : 0.02% optimize.opt_a.loop_unroll : 0.000040s : 0.02% optimize.opt_a.a_1 : 0.000923s : 0.39% optimize.opt_a.with_stream_mark : 0.000038s : 0.02% optimize.opt_a.recompute_prepare : 0.000021s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000204s : 0.09% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000017s : 0.01% optimize.opt_a.merge_send_recv : 0.000018s : 0.01% optimize.opt_a.auto_parallel : 0.000016s : 0.01% optimize.opt_a.parallel : 0.000029s : 0.01% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000010s : 0.00% optimize.opt_a.allreduce_fusion : 0.000009s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.01% optimize.opt_a.virtual_dataset : 0.000016s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.01% optimize.opt_a.virtual_output : 0.000015s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000022s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000028s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000029s : 0.01% optimize.opt_a.a_after_grad : 0.000025s : 0.01% optimize.opt_a.renormalize : 0.001006s : 0.43% optimize.opt_a.add_forward_monad_depend : 0.000030s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.01% optimize.opt_a.cse : 0.000062s : 0.03% optimize.opt_a.a_3 : 0.000111s : 0.05% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000049s : 0.02% optimize.convert_after_rewriter : 0.000009s : 0.00% optimize.order_py_execute_after_rewriter : 0.000006s : 0.00% optimize.mutable_eliminate : 0.000675s : 0.29% optimize.opt_b.b_1 : 0.000160s : 0.07% optimize.opt_b.b_2 : 0.000011s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000033s : 0.01% optimize.loop_unroll : 0.000471s : 0.20% optimize.opt_after_cconv.c_1 : 0.000038s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000025s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.01% optimize.tuple_transform.d_1 : 0.000054s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000065s : 0.03% optimize.cse_after_recomputation.cse : 0.000020s : 0.01% optimize.environ_conv : 0.000007s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000016s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000025s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000529s : 0.22% validate : 0.000155s : 0.07% Time group info: ------[substitution.] 0.000254 38 11.88% : 0.000030s : 3: substitution.cast_eliminate 1.09% : 0.000003s : 3: substitution.elim_not_effective 0.66% : 0.000002s : 3: substitution.fold_const_symbol 2.80% : 0.000007s : 5: substitution.graph_param_transform 70.73% : 0.000180s : 4: substitution.inline 2.43% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.76% : 0.000007s : 6: substitution.remove_not_recompute_node 2.35% : 0.000006s : 4: substitution.replace_old_param 5.30% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.229014 2 99.53% : 0.227946s : 1: type_inference.infer 0.47% : 0.001068s : 1: type_inference.specialize ------[replace.] 0.000069 8 64.76% : 0.000045s : 4: replace.inline 35.24% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000189 8 93.82% : 0.000177s : 4: match.inline 6.18% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000247 1504 0.86% : 0.000002s : 15: predicate.accumulaten_eliminater 0.89% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.57% : 0.000001s : 10: predicate.addn_check_dump 0.91% : 0.000002s : 15: predicate.addn_zero_filter 0.81% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.97% : 0.000005s : 25: predicate.arithmetic_simplify 1.13% : 0.000003s : 15: predicate.cast_eliminate 0.67% : 0.000002s : 10: predicate.check_bprop_eliminate 0.60% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.66% : 0.000002s : 10: predicate.depend_value_elim 0.90% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.03% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 5: predicate.elim_not_effective 0.40% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.04% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.06% : 0.000003s : 20: predicate.environ_get_depend_swap 1.90% : 0.000005s : 30: predicate.environ_get_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.33% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.30% : 0.000006s : 23: predicate.float_depend_g_call 0.74% : 0.000002s : 10: predicate.float_environ_get_switch 0.87% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.71% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.62% : 0.000002s : 10: predicate.incorporate_call 0.55% : 0.000001s : 10: predicate.incorporate_call_switch 6.50% : 0.000016s : 68: predicate.inline 0.81% : 0.000002s : 10: predicate.inline_without_move 0.34% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 10: predicate.less_batch_normalization 1.72% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.49% : 0.000006s : 44: predicate.load_eliminater 1.19% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.15% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.58% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 10: predicate.merge_addn 0.59% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 15: predicate.minmaximum_grad 1.30% : 0.000003s : 5: predicate.mutable_eliminate 0.47% : 0.000001s : 5: predicate.opt_reshape 0.44% : 0.000001s : 5: predicate.parallel_virtual_node 1.74% : 0.000004s : 23: predicate.partial_defer_inline 1.60% : 0.000004s : 24: predicate.partial_eliminate 0.87% : 0.000002s : 15: predicate.print_const_string_wrapper 0.57% : 0.000001s : 10: predicate.reduce_all_const_elim 1.22% : 0.000003s : 15: predicate.reduce_eliminate 2.42% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 10: predicate.remove_not_recompute_node 1.40% : 0.000003s : 29: predicate.replace_applicator 0.50% : 0.000001s : 10: predicate.replace_old_param 0.35% : 0.000001s : 5: predicate.reset_defer_inline 0.99% : 0.000002s : 15: predicate.reshape_eliminate 0.63% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 5: predicate.row_tensor_eliminate 1.05% : 0.000003s : 10: predicate.same_eliminate 0.44% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 10: predicate.shard_identity_eliminate 0.67% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 0.93% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.86% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.47% : 0.000004s : 23: predicate.switch_defer_inline 1.97% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.16% : 0.000013s : 74: predicate.switch_simplify 0.86% : 0.000002s : 15: predicate.tile_eliminate 0.85% : 0.000002s : 15: predicate.transpose_eliminate 1.53% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.14% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.47% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.17% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.68% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.41% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.18% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 5: predicate.value_based_eliminate 0.65% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000761 11 45.47% : 0.000346s : 5: func_graph_cloner_run.FuncGraphClonerGraph 54.53% : 0.000415s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.252729 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.50% : 0.003792s : 1: add_attr 1.49% : 0.003777s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000070s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.03% : 0.000082s : 1: auto_monad 0.01% : 0.000026s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.21% : 0.000519s : 1: bootstrap 0.01% : 0.000036s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000019s : 1: control_data_broadcast_order 0.00% : 0.000012s : 1: convert_after_rewriter 0.01% : 0.000034s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000010s : 1: environ_conv 0.01% : 0.000032s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.19% : 0.000479s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.27% : 0.000686s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000020s : 1: opt.transform.mutable_eliminate 0.58% : 0.001473s : 78: opt.transform.opt_a 0.01% : 0.000037s : 1: opt.transform.opt_after_cconv 0.01% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.05% : 0.000137s : 28: opt.transform.opt_b 0.02% : 0.000062s : 2: opt.transform.opt_trans_graph 0.02% : 0.000049s : 4: opt.transform.symbol_engine_opt 1.38% : 0.003485s : 1: opt_a 0.05% : 0.000125s : 1: opt_after_cconv 0.21% : 0.000539s : 1: opt_after_jit_grad 0.10% : 0.000263s : 1: opt_b 2.37% : 0.005977s : 1: optimize 0.01% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000010s : 1: order_py_execute_after_rewriter 0.01% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000048s : 1: pre_auto_parallel 0.01% : 0.000037s : 1: py_interpret_to_execute 0.01% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 0.22% : 0.000553s : 1: renormalize.infer 0.17% : 0.000442s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000053s : 1: rewriter_after_opt_a 0.04% : 0.000104s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000098s : 1: symbol_engine_optimizer 0.05% : 0.000139s : 1: tuple_transform 90.66% : 0.229128s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:34.559.093 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:34.559.358 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0425869, [21] [bootstrap]: 0.00050113 [type_inference]: 0.00711326 [event_method]: 2.219e-05 [auto_monad]: 6.562e-05 [graph_reusing]: 7.05e-06 [inline]: 3.18e-06 [add_attr]: 0.00397653, [1] [add_attr_with_inline]: 0.00396317, [1] [Cycle 1]: 0.00010231, [2] [tag_attr]: 2.831e-05 [meta_addattr_fg_expand]: 6.06e-06 [parallel-infer-symbol]: 3.83999e-06 [pre_auto_parallel]: 4.498e-05 [insert-virtual-dataset]: 2.54999e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 2.65002e-06 [pipeline_split]: 1.71998e-06 [optimize]: 0.0294427, [53] [py_interpret_to_execute]: 4.09e-05 [rewriter_before_opt_a]: 0.00011041 [opt_a]: 0.0266737, [2] [Cycle 1]: 0.0256983, [45] [expand_dump_flag]: 3.48e-06 [switch_simplify]: 4.506e-05 [loop_unroll]: 3.089e-05 [a_1]: 0.00082554 [with_stream_mark]: 3.022e-05 [recompute_prepare]: 1.446e-05 [updatestate_depend_eliminate]: 5.44e-06 [updatestate_assign_eliminate]: 3.61001e-06 [updatestate_loads_eliminate]: 3.21001e-06 [parameter_eliminate]: 2.89999e-06 [a_2]: 0.00012271 [accelerated_algorithm]: 1.051e-05 [shard]: 3.98999e-06 [meta_shard_fg_expand]: 2.78e-06 [shard_inline]: 8.99e-06 [merge_send_recv]: 1.064e-05 [auto_parallel]: 1.113e-05 [parallel]: 2.228e-05 [flash_sp]: 1.209e-05 [merge_comm]: 4.82e-06 [allreduce_fusion]: 4.24002e-06 [matmul_add_comm_reduction]: 1.075e-05 [allreduce_slice_to_reducescatter]: 8.90024e-07 [virtual_shard_identity]: 1.299e-05 [virtual_dataset]: 7.97e-06 [get_grad_eliminate_]: 7.87e-06 [virtual_output]: 9.05001e-06 [merge_forward]: 5.86e-06 [cell_reuse_recompute_pass]: 1.92999e-06 [offload_activation]: 1.311e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.276e-05 [merge_recompute_call_nodes]: 2.07001e-06 [before_grad]: 1.373e-05 [set_forward_comm_id_for_comm_node_pass]: 6.02001e-06 [meta_fg_expand]: 4.03999e-06 [flash_sp_send_recv_attached]: 3.61999e-06 [receive_attached]: 2.27999e-06 [after_resolve]: 1.828e-05 [a_after_grad]: 1.271e-05 [renormalize]: 0.0236866 [add_forward_monad_depend]: 1.237e-05 [auto_monad_grad]: 2.99999e-06 [auto_monad_eliminator]: 2.69e-05 [cse]: 3.228e-05 [a_3]: 8.249e-05 [Cycle 2]: 0.00095549, [45] [expand_dump_flag]: 2.91e-06 [switch_simplify]: 9.77001e-06 [loop_unroll]: 7.26001e-06 [a_1]: 0.00015188 [with_stream_mark]: 2.3e-05 [recompute_prepare]: 7.40998e-06 [updatestate_depend_eliminate]: 4.71002e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 2.89999e-06 [parameter_eliminate]: 2.22001e-06 [a_2]: 0.00010336 [accelerated_algorithm]: 7.83001e-06 [shard]: 2.83998e-06 [meta_shard_fg_expand]: 2.26998e-06 [shard_inline]: 1.342e-05 [merge_send_recv]: 9.19998e-06 [auto_parallel]: 1.085e-05 [parallel]: 9.95997e-06 [flash_sp]: 4.33001e-06 [merge_comm]: 3.68999e-06 [allreduce_fusion]: 3.53e-06 [matmul_add_comm_reduction]: 9.51e-06 [allreduce_slice_to_reducescatter]: 7.79983e-07 [virtual_shard_identity]: 7.98001e-06 [virtual_dataset]: 6.88998e-06 [get_grad_eliminate_]: 6.93e-06 [virtual_output]: 6.84999e-06 [merge_forward]: 4.45999e-06 [cell_reuse_recompute_pass]: 3.55e-06 [offload_activation]: 1.136e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.003e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.143e-05 [set_forward_comm_id_for_comm_node_pass]: 3.85e-06 [meta_fg_expand]: 2.94001e-06 [flash_sp_send_recv_attached]: 1.84998e-06 [receive_attached]: 3.04999e-06 [after_resolve]: 1.418e-05 [a_after_grad]: 1.123e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.04e-06 [auto_monad_grad]: 2.02999e-06 [auto_monad_eliminator]: 8.54e-06 [cse]: 1.733e-05 [a_3]: 4.992e-05 [py_interpret_to_execute_after_opt_a]: 2.279e-05 [slice_cell_reuse_recomputed_activation]: 5.92999e-06 [rewriter_after_opt_a]: 4.631e-05 [convert_after_rewriter]: 9.86998e-06 [order_py_execute_after_rewriter]: 8.23999e-06 [mutable_eliminate]: 0.00078336 [opt_b]: 0.00028957, [1] [Cycle 1]: 0.00027746, [7] [b_1]: 0.00017681 [b_2]: 8.36002e-06 [updatestate_depend_eliminate]: 7.38999e-06 [updatestate_assign_eliminate]: 2.36e-06 [updatestate_loads_eliminate]: 2.54999e-06 [renormalize]: 7.2e-07 [cse]: 2.136e-05 [optimize_parallel_all_gather_comm]: 2.164e-05 [overlap_param_gather]: 5.02e-06 [cconv]: 3.665e-05 [loop_unroll]: 0.00049319 [opt_after_cconv]: 0.00013608, [1] [Cycle 1]: 0.00012697, [7] [c_1]: 3.453e-05 [parameter_eliminate]: 4.81002e-06 [updatestate_depend_eliminate]: 5.78002e-06 [updatestate_assign_eliminate]: 2.56e-06 [updatestate_loads_eliminate]: 2.63003e-06 [cse]: 1.855e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.842e-05 [tuple_transform]: 0.00013195, [1] [Cycle 1]: 0.00012416, [4] [d_1]: 8.013e-05 [none_parameter_eliminate]: 1.59e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.50001e-06 [partial_unused_args_eliminate]: 4.75999e-06 [add_recomputation]: 5.573e-05 [cse_after_recomputation]: 3.013e-05, [1] [Cycle 1]: 2.294e-05, [1] [cse]: 1.38e-05 [environ_conv]: 9.00001e-06 [swap_dp_allreduce_reducescatter]: 7.78999e-06 [bias_add_comm_swap]: 5.56e-06 [label_micro_interleaved_index]: 7.1e-06 [label_fine_grained_interleaved_index]: 5.98002e-06 [merge_cast_opt]: 4.21001e-06 [slice_recompute_activation]: 4.70001e-06 [micro_interleaved_order_control]: 5.02e-06 [assign_add_opt]: 3.70998e-06 [ForceFp32Comm]: 3.36999e-06 [remove_cast_before_assign_add]: 3.61999e-06 [full_micro_interleaved_order_control]: 4.55001e-06 [reorder_send_recv_between_fp_bp]: 6.29999e-06 [comm_op_add_attrs]: 3.75e-06 [add_comm_op_reuse_tag]: 3.67002e-06 [interleave_split_concat_branches]: 3.79002e-06 [interleave_parallel_branches]: 3.63999e-06 [overlap_opt_shard_in_pipeline]: 3.7e-06 [overlap_opt_shard_grad_in_pipeline]: 4.45999e-06 [control_data_broadcast_order]: 1.606e-05 [grouped_pairwise_exchange_alltoall]: 4.01001e-06 [offloading_packed_experts]: 7.05998e-06 [overlap_recompute_and_grad_model_parallel]: 8.14997e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.10998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.7e-06 [overlap_recompute_comm]: 5.10001e-06 [overlap_grad_ring_attention]: 6.48e-06 [overlap_grad_flash_sp]: 2.454e-05 [begin_end_overlap_inline]: 2.94999e-06 [split_matmul_comm_elemetwise]: 4.73001e-06 [split_layernorm_comm]: 3.90998e-06 [handle_group_info]: 3.25e-06 [symbol_engine_optimizer]: 0.00010401, [1] [Cycle 1]: 9.679e-05, [6] [build]: 3.5e-06 [elim_shapecalc]: 1.067e-05 [elim_not_effective]: 1.382e-05 [opt_reshape]: 7.97e-06 [fold_const_symbol]: 1.127e-05 [renormalize]: 2.00002e-07 [detach_backward]: 4.78001e-06 [pipeline_parallel_scheduler]: 2.12999e-06 [auto_monad_reorder]: 2.271e-05 [get_jit_bprop_graph]: 1.99e-06 [rewriter_after_jit_bprop_graph]: 6.41e-06 [opt_after_jit_grad]: 0.00066463 [validate]: 4.844e-05 Sums bootstrap : 0.000501s : 1.37% type_inference : 0.007113s : 19.42% event_method : 0.000022s : 0.06% auto_monad : 0.000066s : 0.18% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000045s : 0.12% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000041s : 0.11% optimize.rewriter_before_opt_a : 0.000110s : 0.30% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000055s : 0.15% optimize.opt_a.loop_unroll : 0.000038s : 0.10% optimize.opt_a.a_1 : 0.000977s : 2.67% optimize.opt_a.with_stream_mark : 0.000053s : 0.15% optimize.opt_a.recompute_prepare : 0.000022s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000226s : 0.62% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.05% optimize.opt_a.shard : 0.000007s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000022s : 0.06% optimize.opt_a.merge_send_recv : 0.000020s : 0.05% optimize.opt_a.auto_parallel : 0.000022s : 0.06% optimize.opt_a.parallel : 0.000032s : 0.09% optimize.opt_a.flash_sp : 0.000016s : 0.04% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000008s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.06% optimize.opt_a.virtual_dataset : 0.000015s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000016s : 0.04% optimize.opt_a.merge_forward : 0.000010s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000024s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000043s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000032s : 0.09% optimize.opt_a.a_after_grad : 0.000024s : 0.07% optimize.opt_a.renormalize : 0.023687s : 64.68% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.04% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.10% optimize.opt_a.cse : 0.000050s : 0.14% optimize.opt_a.a_3 : 0.000132s : 0.36% optimize.py_interpret_to_execute_after_opt_a : 0.000023s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.02% optimize.rewriter_after_opt_a : 0.000046s : 0.13% optimize.convert_after_rewriter : 0.000010s : 0.03% optimize.order_py_execute_after_rewriter : 0.000008s : 0.02% optimize.mutable_eliminate : 0.000783s : 2.14% optimize.opt_b.b_1 : 0.000177s : 0.48% optimize.opt_b.b_2 : 0.000008s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.06% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000037s : 0.10% optimize.loop_unroll : 0.000493s : 1.35% optimize.opt_after_cconv.c_1 : 0.000035s : 0.09% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000019s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.05% optimize.tuple_transform.d_1 : 0.000080s : 0.22% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.02% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000056s : 0.15% optimize.cse_after_recomputation.cse : 0.000014s : 0.04% optimize.environ_conv : 0.000009s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000006s : 0.02% optimize.overlap_grad_flash_sp : 0.000025s : 0.07% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000023s : 0.06% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000665s : 1.81% validate : 0.000048s : 0.13% Time group info: ------[substitution.] 0.000253 28 0.83% : 0.000002s : 2: substitution.elim_not_effective 0.64% : 0.000002s : 2: substitution.fold_const_symbol 2.53% : 0.000006s : 4: substitution.graph_param_transform 80.24% : 0.000203s : 4: substitution.inline 2.26% : 0.000006s : 4: substitution.j_node_and_user_rematch 2.77% : 0.000007s : 4: substitution.remove_not_recompute_node 2.93% : 0.000007s : 4: substitution.replace_old_param 7.79% : 0.000020s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007051 2 88.33% : 0.006229s : 1: type_inference.infer 11.67% : 0.000823s : 1: type_inference.specialize ------[replace.] 0.000085 8 56.59% : 0.000048s : 4: replace.inline 43.41% : 0.000037s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000216 8 92.26% : 0.000199s : 4: match.inline 7.74% : 0.000017s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000230 1278 1.11% : 0.000003s : 13: predicate.accumulaten_eliminater 0.90% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 8: predicate.addn_check_dump 0.80% : 0.000002s : 13: predicate.addn_zero_filter 0.84% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.53% : 0.000006s : 21: predicate.arithmetic_simplify 0.94% : 0.000002s : 13: predicate.cast_eliminate 0.57% : 0.000001s : 8: predicate.check_bprop_eliminate 0.47% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.64% : 0.000001s : 8: predicate.depend_value_elim 0.86% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.22% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.80% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.89% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 4: predicate.elim_not_effective 0.33% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000003s : 17: predicate.environ_add_const_eliminate 0.96% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.98% : 0.000002s : 17: predicate.environ_get_depend_swap 1.54% : 0.000004s : 25: predicate.environ_get_eliminate 1.14% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.42% : 0.000006s : 21: predicate.float_depend_g_call 0.55% : 0.000001s : 8: predicate.float_environ_get_switch 0.69% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.73% : 0.000002s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.39% : 0.000015s : 58: predicate.inline 0.81% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.96% : 0.000002s : 8: predicate.less_batch_normalization 2.13% : 0.000005s : 25: predicate.list_to_tuple_eliminator_ 2.35% : 0.000005s : 38: predicate.load_eliminater 0.97% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.19% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.73% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 8: predicate.merge_addn 0.53% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.68% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.97% : 0.000002s : 13: predicate.minmaximum_grad 1.14% : 0.000003s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.66% : 0.000002s : 4: predicate.parallel_virtual_node 1.90% : 0.000004s : 21: predicate.partial_defer_inline 1.49% : 0.000003s : 21: predicate.partial_eliminate 0.87% : 0.000002s : 13: predicate.print_const_string_wrapper 0.82% : 0.000002s : 8: predicate.reduce_all_const_elim 1.21% : 0.000003s : 13: predicate.reduce_eliminate 2.36% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.54% : 0.000001s : 8: predicate.remove_not_recompute_node 1.42% : 0.000003s : 25: predicate.replace_applicator 0.63% : 0.000001s : 8: predicate.replace_old_param 0.33% : 0.000001s : 4: predicate.reset_defer_inline 0.88% : 0.000002s : 13: predicate.reshape_eliminate 0.61% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 4: predicate.row_tensor_eliminate 0.79% : 0.000002s : 8: predicate.same_eliminate 0.40% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.87% : 0.000002s : 8: predicate.shard_identity_eliminate 0.63% : 0.000001s : 8: predicate.special_op_eliminate 0.79% : 0.000002s : 8: predicate.specialize_transform 1.11% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.44% : 0.000003s : 21: predicate.switch_defer_inline 1.92% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.27% : 0.000012s : 67: predicate.switch_simplify 0.89% : 0.000002s : 13: predicate.tile_eliminate 0.99% : 0.000002s : 13: predicate.transpose_eliminate 1.35% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.49% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.39% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.30% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.49% : 0.000006s : 29: predicate.tuple_list_set_item_eliminator 1.65% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.27% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.02% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 4: predicate.value_based_eliminate 0.72% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000764 11 48.71% : 0.000372s : 5: func_graph_cloner_run.FuncGraphClonerGraph 51.29% : 0.000392s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.101337 192 0.01% : 0.000006s : 1: ForceFp32Comm 3.94% : 0.003989s : 1: add_attr 3.92% : 0.003968s : 1: add_attr_with_inline 0.01% : 0.000008s : 1: add_comm_op_reuse_tag 0.06% : 0.000059s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.07% : 0.000075s : 1: auto_monad 0.03% : 0.000031s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.54% : 0.000552s : 1: bootstrap 0.04% : 0.000040s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.02% : 0.000019s : 1: control_data_broadcast_order 0.01% : 0.000013s : 1: convert_after_rewriter 0.03% : 0.000034s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000023s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.03% : 0.000034s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000014s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.49% : 0.000500s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.78% : 0.000791s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000017s : 1: opt.transform.mutable_eliminate 1.46% : 0.001484s : 78: opt.transform.opt_a 0.03% : 0.000033s : 1: opt.transform.opt_after_cconv 0.03% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.11% : 0.000110s : 28: opt.transform.opt_b 0.08% : 0.000086s : 2: opt.transform.opt_trans_graph 0.04% : 0.000040s : 4: opt.transform.symbol_engine_opt 26.33% : 0.026677s : 1: opt_a 0.14% : 0.000140s : 1: opt_after_cconv 0.67% : 0.000677s : 1: opt_after_jit_grad 0.29% : 0.000294s : 1: opt_b 29.41% : 0.029807s : 1: optimize 0.02% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000011s : 1: order_py_execute_after_rewriter 0.03% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000012s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000008s : 1: pipeline_split 0.05% : 0.000053s : 1: pre_auto_parallel 0.05% : 0.000046s : 1: py_interpret_to_execute 0.03% : 0.000026s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.02% : 0.000022s : 1: remove_dup_value 22.78% : 0.023089s : 1: renormalize.infer 0.57% : 0.000575s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000050s : 1: rewriter_after_opt_a 0.11% : 0.000115s : 1: rewriter_before_opt_a 0.01% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000009s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.11% : 0.000107s : 1: symbol_engine_optimizer 0.13% : 0.000135s : 1: tuple_transform 7.07% : 0.007162s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:37.885.83 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.257734, [21] [bootstrap]: 0.0004423 [type_inference]: 0.24682 [event_method]: 2.411e-05 [auto_monad]: 7.476e-05 [graph_reusing]: 7.21001e-06 [inline]: 3.38e-06 [add_attr]: 0.00385197, [1] [add_attr_with_inline]: 0.00383809, [1] [Cycle 1]: 7.8e-05, [2] [tag_attr]: 2.816e-05 [meta_addattr_fg_expand]: 6.02999e-06 [parallel-infer-symbol]: 3.81999e-06 [pre_auto_parallel]: 4.275e-05 [insert-virtual-dataset]: 2.91e-06 [parallel-infer-symbol-second]: 7.29982e-07 [dataset_repeat_opt]: 2.38998e-06 [pipeline_split]: 1.76e-06 [optimize]: 0.00568629, [53] [py_interpret_to_execute]: 3.653e-05 [rewriter_before_opt_a]: 9.763e-05 [opt_a]: 0.00324974, [2] [Cycle 1]: 0.00226305, [45] [expand_dump_flag]: 3.25e-06 [switch_simplify]: 4.742e-05 [loop_unroll]: 3.191e-05 [a_1]: 0.00069667 [with_stream_mark]: 2.296e-05 [recompute_prepare]: 1.095e-05 [updatestate_depend_eliminate]: 4.46002e-06 [updatestate_assign_eliminate]: 3.23e-06 [updatestate_loads_eliminate]: 2.98e-06 [parameter_eliminate]: 2.43998e-06 [a_2]: 8.53e-05 [accelerated_algorithm]: 7.96001e-06 [shard]: 2.21998e-06 [meta_shard_fg_expand]: 2.41e-06 [shard_inline]: 7.04001e-06 [merge_send_recv]: 9.62001e-06 [auto_parallel]: 7.61001e-06 [parallel]: 2.047e-05 [flash_sp]: 9.92001e-06 [merge_comm]: 3.93001e-06 [allreduce_fusion]: 3.68e-06 [matmul_add_comm_reduction]: 1.01e-05 [allreduce_slice_to_reducescatter]: 7.79983e-07 [virtual_shard_identity]: 8.62e-06 [virtual_dataset]: 7.45e-06 [get_grad_eliminate_]: 7.31999e-06 [virtual_output]: 7.23e-06 [merge_forward]: 4.41002e-06 [cell_reuse_recompute_pass]: 1.60999e-06 [offload_activation]: 1.072e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.402e-05 [merge_recompute_call_nodes]: 1.62999e-06 [before_grad]: 1.232e-05 [set_forward_comm_id_for_comm_node_pass]: 3.95998e-06 [meta_fg_expand]: 3.66001e-06 [flash_sp_send_recv_attached]: 3.51999e-06 [receive_attached]: 2.20002e-06 [after_resolve]: 1.264e-05 [a_after_grad]: 1.119e-05 [renormalize]: 0.00077462 [add_forward_monad_depend]: 6.36e-06 [auto_monad_grad]: 2.58e-06 [auto_monad_eliminator]: 1.848e-05 [cse]: 3.307e-05 [a_3]: 5.528e-05 [Cycle 2]: 0.00097447, [45] [expand_dump_flag]: 2.33002e-06 [switch_simplify]: 8.75001e-06 [loop_unroll]: 6.48e-06 [a_1]: 0.00013912 [with_stream_mark]: 1.645e-05 [recompute_prepare]: 6.91999e-06 [updatestate_depend_eliminate]: 3.42002e-06 [updatestate_assign_eliminate]: 2.76999e-06 [updatestate_loads_eliminate]: 2.99999e-06 [parameter_eliminate]: 7.099e-05 [a_2]: 7.679e-05 [accelerated_algorithm]: 7.21999e-06 [shard]: 1.83002e-06 [meta_shard_fg_expand]: 2.24999e-06 [shard_inline]: 6.74999e-06 [merge_send_recv]: 0.00013627 [auto_parallel]: 1.319e-05 [parallel]: 7.98001e-06 [flash_sp]: 1.422e-05 [merge_comm]: 5.60001e-06 [allreduce_fusion]: 3.48e-06 [matmul_add_comm_reduction]: 9.02999e-06 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 1.28e-05 [virtual_dataset]: 7.51001e-06 [get_grad_eliminate_]: 6.57002e-06 [virtual_output]: 6.74001e-06 [merge_forward]: 3.42002e-06 [cell_reuse_recompute_pass]: 3.25e-06 [offload_activation]: 9.32001e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.055e-05 [merge_recompute_call_nodes]: 1.34e-06 [before_grad]: 1.217e-05 [set_forward_comm_id_for_comm_node_pass]: 4.52998e-06 [meta_fg_expand]: 2.94999e-06 [flash_sp_send_recv_attached]: 1.28002e-06 [receive_attached]: 2.21e-06 [after_resolve]: 1.475e-05 [a_after_grad]: 1.083e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.87002e-06 [auto_monad_grad]: 1.39e-06 [auto_monad_eliminator]: 9.53997e-06 [cse]: 2.053e-05 [a_3]: 3.978e-05 [py_interpret_to_execute_after_opt_a]: 1.373e-05 [slice_cell_reuse_recomputed_activation]: 1.98997e-06 [rewriter_after_opt_a]: 4.192e-05 [convert_after_rewriter]: 7.21999e-06 [order_py_execute_after_rewriter]: 5.67001e-06 [mutable_eliminate]: 0.0007651 [opt_b]: 0.00022617, [1] [Cycle 1]: 0.00021808, [7] [b_1]: 0.00013354 [b_2]: 9.72999e-06 [updatestate_depend_eliminate]: 7.41001e-06 [updatestate_assign_eliminate]: 2.59999e-06 [updatestate_loads_eliminate]: 2.78e-06 [renormalize]: 6.90023e-07 [cse]: 2.391e-05 [optimize_parallel_all_gather_comm]: 1.869e-05 [overlap_param_gather]: 2.16e-06 [cconv]: 3.273e-05 [loop_unroll]: 0.00048119 [opt_after_cconv]: 0.00010922, [1] [Cycle 1]: 0.00010276, [7] [c_1]: 3.234e-05 [parameter_eliminate]: 3.82002e-06 [updatestate_depend_eliminate]: 5.77999e-06 [updatestate_assign_eliminate]: 2.59001e-06 [updatestate_loads_eliminate]: 2.48002e-06 [cse]: 1.843e-05 [renormalize]: 1.17999e-06 [remove_dup_value]: 1.547e-05 [tuple_transform]: 8.339e-05, [1] [Cycle 1]: 7.876e-05, [4] [d_1]: 4.8e-05 [none_parameter_eliminate]: 1.94e-06 [renormalize]: 3.29979e-07 [switch_simplify]: 7.58001e-06 [partial_unused_args_eliminate]: 1.84e-06 [add_recomputation]: 5.251e-05 [cse_after_recomputation]: 2.182e-05, [1] [Cycle 1]: 1.704e-05, [1] [cse]: 1.128e-05 [environ_conv]: 5.51e-06 [swap_dp_allreduce_reducescatter]: 4.89e-06 [bias_add_comm_swap]: 2.89001e-06 [label_micro_interleaved_index]: 4.4e-06 [label_fine_grained_interleaved_index]: 2.78998e-06 [merge_cast_opt]: 1.27999e-06 [slice_recompute_activation]: 2.18002e-06 [micro_interleaved_order_control]: 2.29999e-06 [assign_add_opt]: 1.35999e-06 [ForceFp32Comm]: 1.18001e-06 [remove_cast_before_assign_add]: 1.03001e-06 [full_micro_interleaved_order_control]: 2.17999e-06 [reorder_send_recv_between_fp_bp]: 2.51998e-06 [comm_op_add_attrs]: 1.25999e-06 [add_comm_op_reuse_tag]: 1.06002e-06 [interleave_split_concat_branches]: 1.20001e-06 [interleave_parallel_branches]: 1.30999e-06 [overlap_opt_shard_in_pipeline]: 1.72999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.72001e-06 [control_data_broadcast_order]: 1.264e-05 [grouped_pairwise_exchange_alltoall]: 1.72999e-06 [offloading_packed_experts]: 4.35e-06 [overlap_recompute_and_grad_model_parallel]: 4.79e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.35001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.30002e-06 [overlap_grad_ring_attention]: 4.18999e-06 [overlap_grad_flash_sp]: 2.181e-05 [begin_end_overlap_inline]: 8.80013e-07 [split_matmul_comm_elemetwise]: 2.22001e-06 [split_layernorm_comm]: 2.53e-06 [handle_group_info]: 1.02e-06 [symbol_engine_optimizer]: 8.7e-05, [1] [Cycle 1]: 8.181e-05, [6] [build]: 4.14002e-06 [elim_shapecalc]: 1.196e-05 [elim_not_effective]: 1.463e-05 [opt_reshape]: 8.41002e-06 [fold_const_symbol]: 1.067e-05 [renormalize]: 2.69996e-07 [detach_backward]: 2.27001e-06 [pipeline_parallel_scheduler]: 1.94999e-06 [auto_monad_reorder]: 1.835e-05 [get_jit_bprop_graph]: 1.94e-06 [rewriter_after_jit_bprop_graph]: 5.10001e-06 [opt_after_jit_grad]: 0.00052334 [validate]: 4.576e-05 Sums bootstrap : 0.000442s : 0.17% type_inference : 0.246820s : 97.62% event_method : 0.000024s : 0.01% auto_monad : 0.000075s : 0.03% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000043s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000037s : 0.01% optimize.rewriter_before_opt_a : 0.000098s : 0.04% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000056s : 0.02% optimize.opt_a.loop_unroll : 0.000038s : 0.02% optimize.opt_a.a_1 : 0.000836s : 0.33% optimize.opt_a.with_stream_mark : 0.000039s : 0.02% optimize.opt_a.recompute_prepare : 0.000018s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000073s : 0.03% optimize.opt_a.a_2 : 0.000162s : 0.06% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.01% optimize.opt_a.merge_send_recv : 0.000146s : 0.06% optimize.opt_a.auto_parallel : 0.000021s : 0.01% optimize.opt_a.parallel : 0.000028s : 0.01% optimize.opt_a.flash_sp : 0.000024s : 0.01% optimize.opt_a.merge_comm : 0.000010s : 0.00% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.01% optimize.opt_a.virtual_dataset : 0.000015s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.01% optimize.opt_a.virtual_output : 0.000014s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000020s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000024s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000027s : 0.01% optimize.opt_a.a_after_grad : 0.000022s : 0.01% optimize.opt_a.renormalize : 0.000775s : 0.31% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.01% optimize.opt_a.cse : 0.000054s : 0.02% optimize.opt_a.a_3 : 0.000095s : 0.04% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000042s : 0.02% optimize.convert_after_rewriter : 0.000007s : 0.00% optimize.order_py_execute_after_rewriter : 0.000006s : 0.00% optimize.mutable_eliminate : 0.000765s : 0.30% optimize.opt_b.b_1 : 0.000134s : 0.05% optimize.opt_b.b_2 : 0.000010s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000033s : 0.01% optimize.loop_unroll : 0.000481s : 0.19% optimize.opt_after_cconv.c_1 : 0.000032s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000018s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.01% optimize.tuple_transform.d_1 : 0.000048s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000053s : 0.02% optimize.cse_after_recomputation.cse : 0.000011s : 0.00% optimize.environ_conv : 0.000006s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000022s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000003s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000523s : 0.21% validate : 0.000046s : 0.02% Time group info: ------[substitution.] 0.000230 28 0.97% : 0.000002s : 2: substitution.elim_not_effective 0.75% : 0.000002s : 2: substitution.fold_const_symbol 2.92% : 0.000007s : 4: substitution.graph_param_transform 79.35% : 0.000182s : 4: substitution.inline 2.06% : 0.000005s : 4: substitution.j_node_and_user_rematch 3.85% : 0.000009s : 4: substitution.remove_not_recompute_node 2.90% : 0.000007s : 4: substitution.replace_old_param 7.22% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.246729 2 99.57% : 0.245672s : 1: type_inference.infer 0.43% : 0.001057s : 1: type_inference.specialize ------[replace.] 0.000071 8 65.13% : 0.000046s : 4: replace.inline 34.87% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000193 8 92.50% : 0.000179s : 4: match.inline 7.50% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000222 1278 0.90% : 0.000002s : 13: predicate.accumulaten_eliminater 0.94% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 8: predicate.addn_check_dump 1.01% : 0.000002s : 13: predicate.addn_zero_filter 0.74% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.97% : 0.000004s : 21: predicate.arithmetic_simplify 1.03% : 0.000002s : 13: predicate.cast_eliminate 0.67% : 0.000001s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.65% : 0.000001s : 8: predicate.depend_value_elim 0.81% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.97% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.82% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.25% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.28% : 0.000001s : 4: predicate.elim_not_effective 0.43% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.01% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.00% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_depend_swap 1.47% : 0.000003s : 25: predicate.environ_get_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.44% : 0.000005s : 21: predicate.float_depend_g_call 0.48% : 0.000001s : 8: predicate.float_environ_get_switch 0.84% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.96% : 0.000002s : 8: predicate.get_grad_eliminate 0.27% : 0.000001s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.41% : 0.000001s : 8: predicate.incorporate_call_switch 6.07% : 0.000013s : 58: predicate.inline 0.76% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.99% : 0.000002s : 8: predicate.less_batch_normalization 1.75% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.45% : 0.000005s : 38: predicate.load_eliminater 0.93% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.34% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.48% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.49% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.76% : 0.000002s : 13: predicate.minmaximum_grad 1.80% : 0.000004s : 4: predicate.mutable_eliminate 0.43% : 0.000001s : 4: predicate.opt_reshape 0.60% : 0.000001s : 4: predicate.parallel_virtual_node 1.73% : 0.000004s : 21: predicate.partial_defer_inline 1.47% : 0.000003s : 21: predicate.partial_eliminate 0.85% : 0.000002s : 13: predicate.print_const_string_wrapper 0.75% : 0.000002s : 8: predicate.reduce_all_const_elim 1.15% : 0.000003s : 13: predicate.reduce_eliminate 2.33% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.57% : 0.000001s : 8: predicate.remove_not_recompute_node 1.45% : 0.000003s : 25: predicate.replace_applicator 0.55% : 0.000001s : 8: predicate.replace_old_param 0.32% : 0.000001s : 4: predicate.reset_defer_inline 0.87% : 0.000002s : 13: predicate.reshape_eliminate 0.87% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.54% : 0.000001s : 4: predicate.row_tensor_eliminate 0.87% : 0.000002s : 8: predicate.same_eliminate 0.52% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.91% : 0.000002s : 8: predicate.shard_identity_eliminate 0.77% : 0.000002s : 8: predicate.special_op_eliminate 0.62% : 0.000001s : 8: predicate.specialize_transform 1.19% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.56% : 0.000003s : 21: predicate.switch_defer_inline 1.89% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.37% : 0.000012s : 67: predicate.switch_simplify 0.85% : 0.000002s : 13: predicate.tile_eliminate 0.88% : 0.000002s : 13: predicate.transpose_eliminate 1.41% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.37% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.37% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.04% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.54% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.35% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.64% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.39% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.98% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 4: predicate.value_based_eliminate 0.94% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.68% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000724 11 47.15% : 0.000341s : 5: func_graph_cloner_run.FuncGraphClonerGraph 52.85% : 0.000383s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.269493 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.43% : 0.003858s : 1: add_attr 1.43% : 0.003843s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.02% : 0.000056s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.03% : 0.000080s : 1: auto_monad 0.01% : 0.000022s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.18% : 0.000473s : 1: bootstrap 0.01% : 0.000037s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000010s : 1: convert_after_rewriter 0.01% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000009s : 1: environ_conv 0.01% : 0.000032s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.18% : 0.000490s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.29% : 0.000775s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000020s : 1: opt.transform.mutable_eliminate 0.48% : 0.001301s : 78: opt.transform.opt_a 0.01% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000108s : 28: opt.transform.opt_b 0.02% : 0.000053s : 2: opt.transform.opt_trans_graph 0.02% : 0.000041s : 4: opt.transform.symbol_engine_opt 1.21% : 0.003254s : 1: opt_a 0.04% : 0.000113s : 1: opt_after_cconv 0.20% : 0.000533s : 1: opt_after_jit_grad 0.09% : 0.000230s : 1: opt_b 2.11% : 0.005693s : 1: optimize 0.01% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000009s : 1: order_py_execute_after_rewriter 0.01% : 0.000025s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000047s : 1: pre_auto_parallel 0.02% : 0.000041s : 1: py_interpret_to_execute 0.01% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 0.14% : 0.000385s : 1: renormalize.infer 0.14% : 0.000380s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000046s : 1: rewriter_after_opt_a 0.04% : 0.000102s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000006s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000090s : 1: symbol_engine_optimizer 0.03% : 0.000086s : 1: tuple_transform 91.60% : 0.246845s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:40.783.99 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:40.786.74 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.372864, [21] [bootstrap]: 0.00084853 [type_inference]: 0.173147 [event_method]: 2.176e-05 [auto_monad]: 6.976e-05 [graph_reusing]: 6.19001e-06 [inline]: 5.02999e-06 [add_attr]: 0.00521986, [1] [add_attr_with_inline]: 0.00520716, [1] [Cycle 1]: 0.00010436, [2] [tag_attr]: 2.645e-05 [meta_addattr_fg_expand]: 6.26e-06 [parallel-infer-symbol]: 4e-06 [pre_auto_parallel]: 4.3e-05 [insert-virtual-dataset]: 2.61999e-06 [parallel-infer-symbol-second]: 8.80013e-07 [dataset_repeat_opt]: 1.96e-06 [pipeline_split]: 1.54e-06 [optimize]: 0.190491, [53] [py_interpret_to_execute]: 3.721e-05 [rewriter_before_opt_a]: 0.00010888 [opt_a]: 0.00428589, [2] [Cycle 1]: 0.00334324, [45] [expand_dump_flag]: 2.95002e-06 [switch_simplify]: 4.38e-05 [loop_unroll]: 3.088e-05 [a_1]: 0.00077161 [with_stream_mark]: 1.923e-05 [recompute_prepare]: 9.59e-06 [updatestate_depend_eliminate]: 4.35e-06 [updatestate_assign_eliminate]: 4.26001e-06 [updatestate_loads_eliminate]: 3.03e-06 [parameter_eliminate]: 2.21e-06 [a_2]: 0.0001188 [accelerated_algorithm]: 9.73002e-06 [shard]: 1.83002e-06 [meta_shard_fg_expand]: 1.98002e-06 [shard_inline]: 6.48e-06 [merge_send_recv]: 9.42999e-06 [auto_parallel]: 7.34002e-06 [parallel]: 2.459e-05 [flash_sp]: 1.118e-05 [merge_comm]: 4.95999e-06 [allreduce_fusion]: 4.87e-06 [matmul_add_comm_reduction]: 1.219e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 1.023e-05 [virtual_dataset]: 8.75999e-06 [get_grad_eliminate_]: 7.11999e-06 [virtual_output]: 1.015e-05 [merge_forward]: 4.1e-06 [cell_reuse_recompute_pass]: 1.96003e-06 [offload_activation]: 1.397e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.521e-05 [merge_recompute_call_nodes]: 1.72999e-06 [before_grad]: 1.185e-05 [set_forward_comm_id_for_comm_node_pass]: 3.68e-06 [meta_fg_expand]: 3.55998e-06 [flash_sp_send_recv_attached]: 3.3e-06 [receive_attached]: 3.16001e-06 [after_resolve]: 1.541e-05 [a_after_grad]: 1.242e-05 [renormalize]: 0.00144547 [add_forward_monad_depend]: 6.07001e-06 [auto_monad_grad]: 2.67001e-06 [auto_monad_eliminator]: 2.017e-05 [cse]: 3.445e-05 [a_3]: 7.058e-05 [Cycle 2]: 0.00092437, [45] [expand_dump_flag]: 2.83998e-06 [switch_simplify]: 7.92998e-06 [loop_unroll]: 7.95998e-06 [a_1]: 0.00015635 [with_stream_mark]: 1.429e-05 [recompute_prepare]: 6.70998e-06 [updatestate_depend_eliminate]: 3.39001e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.54001e-06 [parameter_eliminate]: 1.43002e-06 [a_2]: 0.00011017 [accelerated_algorithm]: 7.77e-06 [shard]: 2.69001e-06 [meta_shard_fg_expand]: 2.79999e-06 [shard_inline]: 7e-06 [merge_send_recv]: 6.75002e-06 [auto_parallel]: 7.93001e-06 [parallel]: 6.12999e-06 [flash_sp]: 5.35999e-06 [merge_comm]: 3.41999e-06 [allreduce_fusion]: 4.65999e-06 [matmul_add_comm_reduction]: 7.70998e-06 [allreduce_slice_to_reducescatter]: 3.09985e-07 [virtual_shard_identity]: 8.21002e-06 [virtual_dataset]: 6.84001e-06 [get_grad_eliminate_]: 6.01998e-06 [virtual_output]: 6.87002e-06 [merge_forward]: 3.32002e-06 [cell_reuse_recompute_pass]: 3.56001e-06 [offload_activation]: 9.64e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.38e-05 [merge_recompute_call_nodes]: 9.10019e-07 [before_grad]: 1.045e-05 [set_forward_comm_id_for_comm_node_pass]: 3.16999e-06 [meta_fg_expand]: 3.41999e-06 [flash_sp_send_recv_attached]: 1.02e-06 [receive_attached]: 1.61002e-06 [after_resolve]: 1.293e-05 [a_after_grad]: 9.51e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.29e-06 [auto_monad_grad]: 1.75001e-06 [auto_monad_eliminator]: 1.028e-05 [cse]: 1.829e-05 [a_3]: 5.605e-05 [py_interpret_to_execute_after_opt_a]: 2.18e-05 [slice_cell_reuse_recomputed_activation]: 5.07e-06 [rewriter_after_opt_a]: 4.55e-05 [convert_after_rewriter]: 1.029e-05 [order_py_execute_after_rewriter]: 9.19e-06 [mutable_eliminate]: 0.00110466 [opt_b]: 0.00036923, [1] [Cycle 1]: 0.00035586, [7] [b_1]: 0.0001858 [b_2]: 9.84001e-06 [updatestate_depend_eliminate]: 8.95001e-06 [updatestate_assign_eliminate]: 2.76999e-06 [updatestate_loads_eliminate]: 2.39001e-06 [renormalize]: 1.47999e-06 [cse]: 2.958e-05 [optimize_parallel_all_gather_comm]: 3.005e-05 [overlap_param_gather]: 8.02e-06 [cconv]: 4.229e-05 [loop_unroll]: 0.183199 [opt_after_cconv]: 0.00022248, [1] [Cycle 1]: 0.00020162, [7] [c_1]: 5.007e-05 [parameter_eliminate]: 7.51999e-06 [updatestate_depend_eliminate]: 1.762e-05 [updatestate_assign_eliminate]: 4.17e-06 [updatestate_loads_eliminate]: 3.31001e-06 [cse]: 5.29e-05 [renormalize]: 2.19996e-07 [remove_dup_value]: 2.161e-05 [tuple_transform]: 0.00013261, [1] [Cycle 1]: 0.00011896, [4] [d_1]: 7.136e-05 [none_parameter_eliminate]: 1.82001e-06 [renormalize]: 1.47999e-06 [switch_simplify]: 1.026e-05 [partial_unused_args_eliminate]: 5.09e-06 [add_recomputation]: 8.602e-05 [cse_after_recomputation]: 3.679e-05, [1] [Cycle 1]: 2.807e-05, [1] [cse]: 1.51e-05 [environ_conv]: 1.55e-05 [swap_dp_allreduce_reducescatter]: 5.192e-05 [bias_add_comm_swap]: 9.84001e-06 [label_micro_interleaved_index]: 2.456e-05 [label_fine_grained_interleaved_index]: 6.19001e-06 [merge_cast_opt]: 6.11998e-06 [slice_recompute_activation]: 4.90001e-06 [micro_interleaved_order_control]: 5.04e-06 [assign_add_opt]: 6.14001e-06 [ForceFp32Comm]: 3.45e-06 [remove_cast_before_assign_add]: 4.27003e-06 [full_micro_interleaved_order_control]: 5.92001e-06 [reorder_send_recv_between_fp_bp]: 7.7e-06 [comm_op_add_attrs]: 5.05999e-06 [add_comm_op_reuse_tag]: 3.23e-06 [interleave_split_concat_branches]: 4.32998e-06 [interleave_parallel_branches]: 3.52002e-06 [overlap_opt_shard_in_pipeline]: 6.19001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.16001e-06 [control_data_broadcast_order]: 2.338e-05 [grouped_pairwise_exchange_alltoall]: 3.82002e-06 [offloading_packed_experts]: 6.39999e-06 [overlap_recompute_and_grad_model_parallel]: 8.87e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.03001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.98999e-06 [overlap_recompute_comm]: 7.20998e-06 [overlap_grad_ring_attention]: 8.33001e-06 [overlap_grad_flash_sp]: 3.529e-05 [begin_end_overlap_inline]: 3.70003e-06 [split_matmul_comm_elemetwise]: 6.14999e-06 [split_layernorm_comm]: 3.98999e-06 [handle_group_info]: 3.32002e-06 [symbol_engine_optimizer]: 0.00012642, [1] [Cycle 1]: 0.00011492, [6] [build]: 6.91001e-06 [elim_shapecalc]: 1.491e-05 [elim_not_effective]: 1.93e-05 [opt_reshape]: 1.169e-05 [fold_const_symbol]: 1.107e-05 [renormalize]: 2.50002e-07 [detach_backward]: 5.74e-06 [pipeline_parallel_scheduler]: 1.99999e-06 [auto_monad_reorder]: 2.817e-05 [get_jit_bprop_graph]: 2.66999e-06 [rewriter_after_jit_bprop_graph]: 1.267e-05 [opt_after_jit_grad]: 0.00118283 [validate]: 8.083e-05 Sums bootstrap : 0.000849s : 0.23% type_inference : 0.173147s : 47.51% event_method : 0.000022s : 0.01% auto_monad : 0.000070s : 0.02% graph_reusing : 0.000006s : 0.00% inline : 0.000005s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000043s : 0.01% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000037s : 0.01% optimize.rewriter_before_opt_a : 0.000109s : 0.03% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000052s : 0.01% optimize.opt_a.loop_unroll : 0.000039s : 0.01% optimize.opt_a.a_1 : 0.000928s : 0.25% optimize.opt_a.with_stream_mark : 0.000034s : 0.01% optimize.opt_a.recompute_prepare : 0.000016s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000229s : 0.06% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.00% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.00% optimize.opt_a.merge_send_recv : 0.000016s : 0.00% optimize.opt_a.auto_parallel : 0.000015s : 0.00% optimize.opt_a.parallel : 0.000031s : 0.01% optimize.opt_a.flash_sp : 0.000017s : 0.00% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000010s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.01% optimize.opt_a.virtual_dataset : 0.000016s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.00% optimize.opt_a.virtual_output : 0.000017s : 0.00% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% optimize.opt_a.offload_activation : 0.000024s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000028s : 0.01% optimize.opt_a.a_after_grad : 0.000022s : 0.01% optimize.opt_a.renormalize : 0.001446s : 0.40% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.01% optimize.opt_a.cse : 0.000053s : 0.01% optimize.opt_a.a_3 : 0.000127s : 0.03% optimize.py_interpret_to_execute_after_opt_a : 0.000022s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000045s : 0.01% optimize.convert_after_rewriter : 0.000010s : 0.00% optimize.order_py_execute_after_rewriter : 0.000009s : 0.00% optimize.mutable_eliminate : 0.001105s : 0.30% optimize.opt_b.b_1 : 0.000186s : 0.05% optimize.opt_b.b_2 : 0.000010s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000030s : 0.01% optimize.overlap_param_gather : 0.000008s : 0.00% optimize.cconv : 0.000042s : 0.01% optimize.loop_unroll : 0.183199s : 50.27% optimize.opt_after_cconv.c_1 : 0.000050s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000018s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000053s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000022s : 0.01% optimize.tuple_transform.d_1 : 0.000071s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000001s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000086s : 0.02% optimize.cse_after_recomputation.cse : 0.000015s : 0.00% optimize.environ_conv : 0.000016s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000052s : 0.01% optimize.bias_add_comm_swap : 0.000010s : 0.00% optimize.label_micro_interleaved_index : 0.000025s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.00% optimize.merge_cast_opt : 0.000006s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000006s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000006s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000008s : 0.00% optimize.comm_op_add_attrs : 0.000005s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000006s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000023s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000007s : 0.00% optimize.overlap_grad_ring_attention : 0.000008s : 0.00% optimize.overlap_grad_flash_sp : 0.000035s : 0.01% optimize.begin_end_overlap_inline : 0.000004s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000006s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000007s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000012s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000028s : 0.01% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000013s : 0.00% opt_after_jit_grad : 0.001183s : 0.32% validate : 0.000081s : 0.02% Time group info: ------[substitution.] 0.000282 34 12.79% : 0.000036s : 6: substitution.arithmetic_simplify 0.99% : 0.000003s : 2: substitution.elim_not_effective 0.63% : 0.000002s : 2: substitution.fold_const_symbol 3.82% : 0.000011s : 4: substitution.graph_param_transform 69.87% : 0.000197s : 4: substitution.inline 1.97% : 0.000006s : 4: substitution.j_node_and_user_rematch 1.91% : 0.000005s : 4: substitution.remove_not_recompute_node 2.02% : 0.000006s : 4: substitution.replace_old_param 6.00% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.173070 2 99.39% : 0.172015s : 1: type_inference.infer 0.61% : 0.001055s : 1: type_inference.specialize ------[replace.] 0.000103 8 73.61% : 0.000076s : 4: replace.inline 26.39% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000209 8 92.80% : 0.000194s : 4: match.inline 7.20% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000252 1278 0.79% : 0.000002s : 13: predicate.accumulaten_eliminater 0.79% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.60% : 0.000002s : 8: predicate.addn_check_dump 1.07% : 0.000003s : 13: predicate.addn_zero_filter 0.73% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.32% : 0.000006s : 21: predicate.arithmetic_simplify 0.78% : 0.000002s : 13: predicate.cast_eliminate 0.54% : 0.000001s : 8: predicate.check_bprop_eliminate 0.62% : 0.000002s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 1.02% : 0.000003s : 8: predicate.depend_value_elim 0.77% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.29% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.72% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 4: predicate.elim_not_effective 0.72% : 0.000002s : 4: predicate.elim_shapecalc_of_broadcastargs 0.90% : 0.000002s : 17: predicate.environ_add_const_eliminate 0.84% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.86% : 0.000002s : 17: predicate.environ_get_depend_swap 1.95% : 0.000005s : 25: predicate.environ_get_eliminate 1.18% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.19% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.08% : 0.000005s : 21: predicate.float_depend_g_call 0.43% : 0.000001s : 8: predicate.float_environ_get_switch 1.06% : 0.000003s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.65% : 0.000002s : 8: predicate.get_grad_eliminate 0.23% : 0.000001s : 4: predicate.graph_param_transform 0.49% : 0.000001s : 8: predicate.incorporate_call 0.40% : 0.000001s : 8: predicate.incorporate_call_switch 6.22% : 0.000016s : 58: predicate.inline 0.94% : 0.000002s : 8: predicate.inline_without_move 0.27% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.84% : 0.000005s : 8: predicate.less_batch_normalization 1.70% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.14% : 0.000005s : 38: predicate.load_eliminater 3.62% : 0.000009s : 4: predicate.loop_unroll_after_grad 1.91% : 0.000005s : 34: predicate.loop_unroll_before_grad 2.13% : 0.000005s : 21: predicate.make_slice_get_slice_eliminator 0.43% : 0.000001s : 8: predicate.merge_addn 0.52% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.72% : 0.000002s : 8: predicate.mini_step_allgather_replace 1.18% : 0.000003s : 13: predicate.minmaximum_grad 1.12% : 0.000003s : 4: predicate.mutable_eliminate 0.63% : 0.000002s : 4: predicate.opt_reshape 0.57% : 0.000001s : 4: predicate.parallel_virtual_node 1.95% : 0.000005s : 21: predicate.partial_defer_inline 1.32% : 0.000003s : 21: predicate.partial_eliminate 0.71% : 0.000002s : 13: predicate.print_const_string_wrapper 0.75% : 0.000002s : 8: predicate.reduce_all_const_elim 0.94% : 0.000002s : 13: predicate.reduce_eliminate 2.25% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 8: predicate.remove_not_recompute_node 1.13% : 0.000003s : 25: predicate.replace_applicator 0.38% : 0.000001s : 8: predicate.replace_old_param 0.20% : 0.000001s : 4: predicate.reset_defer_inline 1.37% : 0.000003s : 13: predicate.reshape_eliminate 1.28% : 0.000003s : 8: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 4: predicate.row_tensor_eliminate 0.87% : 0.000002s : 8: predicate.same_eliminate 0.36% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.68% : 0.000002s : 8: predicate.shard_identity_eliminate 0.77% : 0.000002s : 8: predicate.special_op_eliminate 0.54% : 0.000001s : 8: predicate.specialize_transform 0.71% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.64% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.27% : 0.000003s : 21: predicate.switch_defer_inline 1.69% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.18% : 0.000011s : 67: predicate.switch_simplify 0.70% : 0.000002s : 13: predicate.tile_eliminate 0.82% : 0.000002s : 13: predicate.transpose_eliminate 1.29% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.38% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.60% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.50% : 0.000009s : 33: predicate.tuple_list_get_item_eliminator 1.40% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 1.94% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.58% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.39% : 0.000006s : 38: predicate.updatestate_pure_node_eliminater 2.50% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.71% : 0.000002s : 4: predicate.value_based_eliminate 0.52% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.94% : 0.000002s : 8: predicate.virtual_output_eliminate 0.21% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.89% : 0.000002s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000914 11 48.92% : 0.000447s : 5: func_graph_cloner_run.FuncGraphClonerGraph 51.08% : 0.000467s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.571598 192 0.00% : 0.000006s : 1: ForceFp32Comm 0.92% : 0.005232s : 1: add_attr 0.91% : 0.005211s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.02% : 0.000090s : 1: add_recomputation 0.00% : 0.000009s : 1: assign_add_opt 0.01% : 0.000079s : 1: auto_monad 0.01% : 0.000035s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000014s : 1: bias_add_comm_swap 0.16% : 0.000917s : 1: bootstrap 0.01% : 0.000046s : 1: cconv 0.00% : 0.000008s : 1: comm_op_add_attrs 0.00% : 0.000027s : 1: control_data_broadcast_order 0.00% : 0.000013s : 1: convert_after_rewriter 0.01% : 0.000041s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000037s : 1: detach_backward 0.00% : 0.000018s : 1: environ_conv 0.01% : 0.000033s : 1: event_method 0.00% : 0.000009s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.00% : 0.000013s : 1: graph_reusing 0.00% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000011s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000009s : 1: label_fine_grained_interleaved_index 0.00% : 0.000027s : 1: label_micro_interleaved_index 32.05% : 0.183225s : 1: loop_unroll 0.00% : 0.000009s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.19% : 0.001114s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 0.01% : 0.000051s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000021s : 1: opt.transform.mutable_eliminate 0.25% : 0.001401s : 78: opt.transform.opt_a 0.01% : 0.000046s : 1: opt.transform.opt_after_cconv 0.01% : 0.000037s : 1: opt.transform.opt_after_jit_grad 0.02% : 0.000115s : 28: opt.transform.opt_b 0.01% : 0.000079s : 2: opt.transform.opt_trans_graph 0.01% : 0.000053s : 4: opt.transform.symbol_engine_opt 0.75% : 0.004289s : 1: opt_a 0.04% : 0.000227s : 1: opt_after_cconv 0.21% : 0.001196s : 1: opt_after_jit_grad 0.07% : 0.000373s : 1: opt_b 33.56% : 0.191837s : 1: optimize 0.01% : 0.000034s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000012s : 1: order_py_execute_after_rewriter 0.01% : 0.000038s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000013s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000009s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000011s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000010s : 1: overlap_recompute_comm 0.00% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.01% : 0.000050s : 1: pre_auto_parallel 0.01% : 0.000041s : 1: py_interpret_to_execute 0.00% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000014s : 1: remove_cast_before_assign_add 0.00% : 0.000025s : 1: remove_dup_value 0.16% : 0.000916s : 1: renormalize.infer 0.09% : 0.000517s : 1: renormalize.specialize 0.00% : 0.000010s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000019s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000049s : 1: rewriter_after_opt_a 0.02% : 0.000114s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000009s : 1: split_matmul_comm_elemetwise 0.01% : 0.000058s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000129s : 1: symbol_engine_optimizer 0.02% : 0.000136s : 1: tuple_transform 30.30% : 0.173218s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:43.279.511 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.135102, [21] [bootstrap]: 0.00047606 [type_inference]: 0.122572 [event_method]: 2.65e-05 [auto_monad]: 7.998e-05 [graph_reusing]: 5.94e-06 [inline]: 2.81e-06 [add_attr]: 0.0042705, [1] [add_attr_with_inline]: 0.00425483, [1] [Cycle 1]: 8.971e-05, [2] [tag_attr]: 3.014e-05 [meta_addattr_fg_expand]: 6.44999e-06 [parallel-infer-symbol]: 5.56e-06 [pre_auto_parallel]: 4.692e-05 [insert-virtual-dataset]: 2.81e-06 [parallel-infer-symbol-second]: 8.10018e-07 [dataset_repeat_opt]: 2.38002e-06 [pipeline_split]: 1.82999e-06 [optimize]: 0.00656158, [53] [py_interpret_to_execute]: 3.776e-05 [rewriter_before_opt_a]: 0.00010969 [opt_a]: 0.00346722, [2] [Cycle 1]: 0.00270778, [45] [expand_dump_flag]: 3.70998e-06 [switch_simplify]: 4.958e-05 [loop_unroll]: 3.397e-05 [a_1]: 0.00077937 [with_stream_mark]: 2.726e-05 [recompute_prepare]: 1.616e-05 [updatestate_depend_eliminate]: 5.74e-06 [updatestate_assign_eliminate]: 3.71999e-06 [updatestate_loads_eliminate]: 3.91001e-06 [parameter_eliminate]: 2.34999e-06 [a_2]: 0.00010069 [accelerated_algorithm]: 8.64e-06 [shard]: 3.04001e-06 [meta_shard_fg_expand]: 2.69999e-06 [shard_inline]: 7.27002e-06 [merge_send_recv]: 1.006e-05 [auto_parallel]: 1.053e-05 [parallel]: 2.246e-05 [flash_sp]: 1.181e-05 [merge_comm]: 4.22e-06 [allreduce_fusion]: 4.04002e-06 [matmul_add_comm_reduction]: 1.4e-05 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 1.101e-05 [virtual_dataset]: 8.23001e-06 [get_grad_eliminate_]: 7.30998e-06 [virtual_output]: 7.17002e-06 [merge_forward]: 5.00001e-06 [cell_reuse_recompute_pass]: 2.07001e-06 [offload_activation]: 1.245e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.575e-05 [merge_recompute_call_nodes]: 1.89e-06 [before_grad]: 1.248e-05 [set_forward_comm_id_for_comm_node_pass]: 4.69002e-06 [meta_fg_expand]: 4.38999e-06 [flash_sp_send_recv_attached]: 3.01001e-06 [receive_attached]: 2.71999e-06 [after_resolve]: 1.364e-05 [a_after_grad]: 1.057e-05 [renormalize]: 0.00101651 [add_forward_monad_depend]: 8.38001e-06 [auto_monad_grad]: 3.81001e-06 [auto_monad_eliminator]: 2.099e-05 [cse]: 3.38e-05 [a_3]: 5.611e-05 [Cycle 2]: 0.0007462, [45] [expand_dump_flag]: 2.48998e-06 [switch_simplify]: 8.66002e-06 [loop_unroll]: 6.64001e-06 [a_1]: 0.00015249 [with_stream_mark]: 1.644e-05 [recompute_prepare]: 7.50003e-06 [updatestate_depend_eliminate]: 4.03001e-06 [updatestate_assign_eliminate]: 2.96001e-06 [updatestate_loads_eliminate]: 2.78e-06 [parameter_eliminate]: 1.62999e-06 [a_2]: 7.953e-05 [accelerated_algorithm]: 7.16999e-06 [shard]: 1.77001e-06 [meta_shard_fg_expand]: 2.40002e-06 [shard_inline]: 6.38998e-06 [merge_send_recv]: 7.95e-06 [auto_parallel]: 8.64998e-06 [parallel]: 8.71002e-06 [flash_sp]: 6.26998e-06 [merge_comm]: 3.58e-06 [allreduce_fusion]: 3.81999e-06 [matmul_add_comm_reduction]: 8.32e-06 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 9.34e-06 [virtual_dataset]: 6.19001e-06 [get_grad_eliminate_]: 6.88e-06 [virtual_output]: 6.04001e-06 [merge_forward]: 3.95e-06 [cell_reuse_recompute_pass]: 2.71999e-06 [offload_activation]: 9.45001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.368e-05 [merge_recompute_call_nodes]: 1.22e-06 [before_grad]: 1.106e-05 [set_forward_comm_id_for_comm_node_pass]: 4.84e-06 [meta_fg_expand]: 2.29999e-06 [flash_sp_send_recv_attached]: 1.14e-06 [receive_attached]: 1.71e-06 [after_resolve]: 1.179e-05 [a_after_grad]: 9.74e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.57999e-06 [auto_monad_grad]: 1.82999e-06 [auto_monad_eliminator]: 9.29e-06 [cse]: 1.812e-05 [a_3]: 3.769e-05 [py_interpret_to_execute_after_opt_a]: 1.805e-05 [slice_cell_reuse_recomputed_activation]: 2.52001e-06 [rewriter_after_opt_a]: 4.25e-05 [convert_after_rewriter]: 7.45e-06 [order_py_execute_after_rewriter]: 5.56e-06 [mutable_eliminate]: 0.00096171 [opt_b]: 0.00024805, [1] [Cycle 1]: 0.00023875, [7] [b_1]: 0.0001393 [b_2]: 9.81998e-06 [updatestate_depend_eliminate]: 1.107e-05 [updatestate_assign_eliminate]: 2.72001e-06 [updatestate_loads_eliminate]: 3.18e-06 [renormalize]: 1.46998e-06 [cse]: 3.337e-05 [optimize_parallel_all_gather_comm]: 2.305e-05 [overlap_param_gather]: 2.63e-06 [cconv]: 4.313e-05 [loop_unroll]: 0.00061623 [opt_after_cconv]: 0.00013027, [1] [Cycle 1]: 0.00012255, [7] [c_1]: 3.563e-05 [parameter_eliminate]: 7.32997e-06 [updatestate_depend_eliminate]: 9.07001e-06 [updatestate_assign_eliminate]: 2.69001e-06 [updatestate_loads_eliminate]: 2.49001e-06 [cse]: 2.713e-05 [renormalize]: 6.90023e-07 [remove_dup_value]: 0.0001263 [tuple_transform]: 9.985e-05, [1] [Cycle 1]: 9.342e-05, [4] [d_1]: 5.961e-05 [none_parameter_eliminate]: 2.67001e-06 [renormalize]: 4.80009e-07 [switch_simplify]: 8.2e-06 [partial_unused_args_eliminate]: 2.07001e-06 [add_recomputation]: 6.886e-05 [cse_after_recomputation]: 2.789e-05, [1] [Cycle 1]: 2.169e-05, [1] [cse]: 1.517e-05 [environ_conv]: 6.31e-06 [swap_dp_allreduce_reducescatter]: 8.69e-06 [bias_add_comm_swap]: 4.82e-06 [label_micro_interleaved_index]: 7.05998e-06 [label_fine_grained_interleaved_index]: 3.18998e-06 [merge_cast_opt]: 1.52999e-06 [slice_recompute_activation]: 2.66e-06 [micro_interleaved_order_control]: 3.11999e-06 [assign_add_opt]: 1.78997e-06 [ForceFp32Comm]: 1.29e-06 [remove_cast_before_assign_add]: 8.70001e-07 [full_micro_interleaved_order_control]: 2.44999e-06 [reorder_send_recv_between_fp_bp]: 2.88e-06 [comm_op_add_attrs]: 1.70001e-06 [add_comm_op_reuse_tag]: 1.52999e-06 [interleave_split_concat_branches]: 1.31998e-06 [interleave_parallel_branches]: 1.16002e-06 [overlap_opt_shard_in_pipeline]: 2.39999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.76e-06 [control_data_broadcast_order]: 1.825e-05 [grouped_pairwise_exchange_alltoall]: 1.99e-06 [offloading_packed_experts]: 4.1e-06 [overlap_recompute_and_grad_model_parallel]: 5.05999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.37e-06 [overlap_recompute_allgather_and_fa_grad]: 1.42e-06 [overlap_recompute_comm]: 3.08e-06 [overlap_grad_ring_attention]: 4.37998e-06 [overlap_grad_flash_sp]: 2.649e-05 [begin_end_overlap_inline]: 7.80012e-07 [split_matmul_comm_elemetwise]: 2.28002e-06 [split_layernorm_comm]: 2.03002e-06 [handle_group_info]: 1.05001e-06 [symbol_engine_optimizer]: 9.444e-05, [1] [Cycle 1]: 8.904e-05, [6] [build]: 5.24998e-06 [elim_shapecalc]: 1.527e-05 [elim_not_effective]: 1.702e-05 [opt_reshape]: 7.73001e-06 [fold_const_symbol]: 1.185e-05 [renormalize]: 2.20025e-07 [detach_backward]: 3.13e-06 [pipeline_parallel_scheduler]: 2.02999e-06 [auto_monad_reorder]: 2.133e-05 [get_jit_bprop_graph]: 1.99e-06 [rewriter_after_jit_bprop_graph]: 7.66999e-06 [opt_after_jit_grad]: 0.00076141 [validate]: 5.456e-05 Sums bootstrap : 0.000476s : 0.37% type_inference : 0.122572s : 94.54% event_method : 0.000026s : 0.02% auto_monad : 0.000080s : 0.06% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000030s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000006s : 0.00% pre_auto_parallel : 0.000047s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000038s : 0.03% optimize.rewriter_before_opt_a : 0.000110s : 0.08% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000058s : 0.04% optimize.opt_a.loop_unroll : 0.000041s : 0.03% optimize.opt_a.a_1 : 0.000932s : 0.72% optimize.opt_a.with_stream_mark : 0.000044s : 0.03% optimize.opt_a.recompute_prepare : 0.000024s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000180s : 0.14% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.01% optimize.opt_a.merge_send_recv : 0.000018s : 0.01% optimize.opt_a.auto_parallel : 0.000019s : 0.01% optimize.opt_a.parallel : 0.000031s : 0.02% optimize.opt_a.flash_sp : 0.000018s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.02% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000022s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000024s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000025s : 0.02% optimize.opt_a.a_after_grad : 0.000020s : 0.02% optimize.opt_a.renormalize : 0.001017s : 0.78% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000006s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.02% optimize.opt_a.cse : 0.000052s : 0.04% optimize.opt_a.a_3 : 0.000094s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000043s : 0.03% optimize.convert_after_rewriter : 0.000007s : 0.01% optimize.order_py_execute_after_rewriter : 0.000006s : 0.00% optimize.mutable_eliminate : 0.000962s : 0.74% optimize.opt_b.b_1 : 0.000139s : 0.11% optimize.opt_b.b_2 : 0.000010s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000033s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.02% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000043s : 0.03% optimize.loop_unroll : 0.000616s : 0.48% optimize.opt_after_cconv.c_1 : 0.000036s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000027s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000126s : 0.10% optimize.tuple_transform.d_1 : 0.000060s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000069s : 0.05% optimize.cse_after_recomputation.cse : 0.000015s : 0.01% optimize.environ_conv : 0.000006s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.01% optimize.bias_add_comm_swap : 0.000005s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000002s : 0.00% optimize.add_comm_op_reuse_tag : 0.000002s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000026s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000008s : 0.01% opt_after_jit_grad : 0.000761s : 0.59% validate : 0.000055s : 0.04% Time group info: ------[substitution.] 0.000299 34 14.08% : 0.000042s : 6: substitution.arithmetic_simplify 0.90% : 0.000003s : 2: substitution.elim_not_effective 0.53% : 0.000002s : 2: substitution.fold_const_symbol 2.60% : 0.000008s : 4: substitution.graph_param_transform 69.16% : 0.000207s : 4: substitution.inline 1.79% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.10% : 0.000006s : 4: substitution.remove_not_recompute_node 1.94% : 0.000006s : 4: substitution.replace_old_param 6.91% : 0.000021s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.122477 2 99.23% : 0.121535s : 1: type_inference.infer 0.77% : 0.000942s : 1: type_inference.specialize ------[replace.] 0.000077 8 64.34% : 0.000050s : 4: replace.inline 35.66% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000222 8 91.65% : 0.000204s : 4: match.inline 8.35% : 0.000019s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000237 1278 0.83% : 0.000002s : 13: predicate.accumulaten_eliminater 1.21% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 1.02% : 0.000002s : 13: predicate.addn_zero_filter 0.72% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.28% : 0.000005s : 21: predicate.arithmetic_simplify 1.00% : 0.000002s : 13: predicate.cast_eliminate 0.55% : 0.000001s : 8: predicate.check_bprop_eliminate 0.51% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.58% : 0.000001s : 8: predicate.depend_value_elim 1.08% : 0.000003s : 13: predicate.dict_get_item_const_eliminator 1.00% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.10% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.53% : 0.000001s : 4: predicate.elim_not_effective 0.44% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.28% : 0.000003s : 17: predicate.environ_add_const_eliminate 0.95% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_depend_swap 1.58% : 0.000004s : 25: predicate.environ_get_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.32% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.49% : 0.000006s : 21: predicate.float_depend_g_call 0.57% : 0.000001s : 8: predicate.float_environ_get_switch 0.77% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.58% : 0.000001s : 8: predicate.get_grad_eliminate 0.26% : 0.000001s : 4: predicate.graph_param_transform 0.49% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.32% : 0.000015s : 58: predicate.inline 0.76% : 0.000002s : 8: predicate.inline_without_move 0.27% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.89% : 0.000002s : 8: predicate.less_batch_normalization 1.89% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.24% : 0.000005s : 38: predicate.load_eliminater 1.22% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.51% : 0.000006s : 34: predicate.loop_unroll_before_grad 1.61% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 8: predicate.merge_addn 0.53% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.73% : 0.000002s : 13: predicate.minmaximum_grad 1.47% : 0.000003s : 4: predicate.mutable_eliminate 0.34% : 0.000001s : 4: predicate.opt_reshape 0.37% : 0.000001s : 4: predicate.parallel_virtual_node 2.22% : 0.000005s : 21: predicate.partial_defer_inline 1.54% : 0.000004s : 21: predicate.partial_eliminate 0.93% : 0.000002s : 13: predicate.print_const_string_wrapper 0.51% : 0.000001s : 8: predicate.reduce_all_const_elim 1.15% : 0.000003s : 13: predicate.reduce_eliminate 2.54% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000001s : 8: predicate.remove_not_recompute_node 1.33% : 0.000003s : 25: predicate.replace_applicator 0.46% : 0.000001s : 8: predicate.replace_old_param 0.41% : 0.000001s : 4: predicate.reset_defer_inline 0.83% : 0.000002s : 13: predicate.reshape_eliminate 0.56% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 4: predicate.row_tensor_eliminate 0.75% : 0.000002s : 8: predicate.same_eliminate 0.52% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 8: predicate.shard_identity_eliminate 0.78% : 0.000002s : 8: predicate.special_op_eliminate 0.62% : 0.000001s : 8: predicate.specialize_transform 0.92% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.40% : 0.000003s : 21: predicate.switch_defer_inline 2.02% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.83% : 0.000011s : 67: predicate.switch_simplify 0.87% : 0.000002s : 13: predicate.tile_eliminate 0.91% : 0.000002s : 13: predicate.transpose_eliminate 1.38% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.70% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.80% : 0.000009s : 33: predicate.tuple_list_get_item_eliminator 1.78% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.55% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.23% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.19% : 0.000008s : 46: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.57% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.48% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000769 11 52.99% : 0.000408s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.01% : 0.000362s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.148526 192 0.00% : 0.000004s : 1: ForceFp32Comm 2.88% : 0.004278s : 1: add_attr 2.87% : 0.004261s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.05% : 0.000074s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.06% : 0.000087s : 1: auto_monad 0.02% : 0.000026s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.34% : 0.000510s : 1: bootstrap 0.03% : 0.000047s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000022s : 1: control_data_broadcast_order 0.01% : 0.000011s : 1: convert_after_rewriter 0.02% : 0.000031s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.03% : 0.000040s : 1: environ_conv 0.02% : 0.000035s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.42% : 0.000627s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.66% : 0.000977s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000025s : 1: opt.transform.mutable_eliminate 0.95% : 0.001415s : 78: opt.transform.opt_a 0.02% : 0.000034s : 1: opt.transform.opt_after_cconv 0.02% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000110s : 28: opt.transform.opt_b 0.04% : 0.000065s : 2: opt.transform.opt_trans_graph 0.03% : 0.000047s : 4: opt.transform.symbol_engine_opt 2.34% : 0.003471s : 1: opt_a 0.09% : 0.000134s : 1: opt_after_cconv 0.52% : 0.000776s : 1: opt_after_jit_grad 0.17% : 0.000253s : 1: opt_b 4.42% : 0.006568s : 1: optimize 0.02% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.02% : 0.000030s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000009s : 1: parallel-infer-symbol 0.00% : 0.000005s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000051s : 1: pre_auto_parallel 0.03% : 0.000043s : 1: py_interpret_to_execute 0.01% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000003s : 1: remove_cast_before_assign_add 0.09% : 0.000135s : 1: remove_dup_value 0.37% : 0.000544s : 1: renormalize.infer 0.31% : 0.000462s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000047s : 1: rewriter_after_opt_a 0.08% : 0.000117s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000007s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000015s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000097s : 1: symbol_engine_optimizer 0.07% : 0.000103s : 1: tuple_transform 82.55% : 0.122602s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:48.295.108 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:48.295.396 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.351795, [21] [bootstrap]: 0.00047742 [type_inference]: 0.17488 [event_method]: 2.063e-05 [auto_monad]: 6.38e-05 [graph_reusing]: 6.29999e-06 [inline]: 2.29999e-06 [add_attr]: 0.00344469, [1] [add_attr_with_inline]: 0.00343437, [1] [Cycle 1]: 8.244e-05, [2] [tag_attr]: 2.155e-05 [meta_addattr_fg_expand]: 6.43e-06 [parallel-infer-symbol]: 3.33998e-06 [pre_auto_parallel]: 3.649e-05 [insert-virtual-dataset]: 2.73e-06 [parallel-infer-symbol-second]: 8.59989e-07 [dataset_repeat_opt]: 2.28998e-06 [pipeline_split]: 1.57999e-06 [optimize]: 0.00597673, [53] [py_interpret_to_execute]: 3.136e-05 [rewriter_before_opt_a]: 9.43e-05 [opt_a]: 0.00338586, [2] [Cycle 1]: 0.00247727, [45] [expand_dump_flag]: 3.18e-06 [switch_simplify]: 4.37e-05 [loop_unroll]: 3.081e-05 [a_1]: 0.00068255 [with_stream_mark]: 1.811e-05 [recompute_prepare]: 9.82999e-06 [updatestate_depend_eliminate]: 4.07003e-06 [updatestate_assign_eliminate]: 3.31999e-06 [updatestate_loads_eliminate]: 2.89999e-06 [parameter_eliminate]: 2.04e-06 [a_2]: 0.00012142 [accelerated_algorithm]: 7.82e-06 [shard]: 2.54001e-06 [meta_shard_fg_expand]: 2.12001e-06 [shard_inline]: 6.63e-06 [merge_send_recv]: 9.16998e-06 [auto_parallel]: 6.62002e-06 [parallel]: 1.937e-05 [flash_sp]: 9.32999e-06 [merge_comm]: 4.43001e-06 [allreduce_fusion]: 3.32002e-06 [matmul_add_comm_reduction]: 1.063e-05 [allreduce_slice_to_reducescatter]: 6.90023e-07 [virtual_shard_identity]: 7.86001e-06 [virtual_dataset]: 6.91999e-06 [get_grad_eliminate_]: 6.28e-06 [virtual_output]: 6.58e-06 [merge_forward]: 4.75999e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 1.111e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.479e-05 [merge_recompute_call_nodes]: 1.63002e-06 [before_grad]: 1.323e-05 [set_forward_comm_id_for_comm_node_pass]: 4.42998e-06 [meta_fg_expand]: 3.23e-06 [flash_sp_send_recv_attached]: 2.56e-06 [receive_attached]: 2.73e-06 [after_resolve]: 1.272e-05 [a_after_grad]: 1.078e-05 [renormalize]: 0.00084598 [add_forward_monad_depend]: 6.91999e-06 [auto_monad_grad]: 2.79001e-06 [auto_monad_eliminator]: 1.696e-05 [cse]: 3.15e-05 [a_3]: 6.645e-05 [Cycle 2]: 0.00089293, [45] [expand_dump_flag]: 1.81e-06 [switch_simplify]: 8.19002e-06 [loop_unroll]: 7.15003e-06 [a_1]: 0.00015538 [with_stream_mark]: 1.42e-05 [recompute_prepare]: 7.53999e-06 [updatestate_depend_eliminate]: 3.26001e-06 [updatestate_assign_eliminate]: 2.64001e-06 [updatestate_loads_eliminate]: 2.40002e-06 [parameter_eliminate]: 1.65001e-06 [a_2]: 0.00010664 [accelerated_algorithm]: 6.73e-06 [shard]: 1.52001e-06 [meta_shard_fg_expand]: 1.55001e-06 [shard_inline]: 6.11e-06 [merge_send_recv]: 5.49e-06 [auto_parallel]: 6.56e-06 [parallel]: 6.76999e-06 [flash_sp]: 3.98999e-06 [merge_comm]: 3.88999e-06 [allreduce_fusion]: 4e-06 [matmul_add_comm_reduction]: 6.71999e-06 [allreduce_slice_to_reducescatter]: 5.09986e-07 [virtual_shard_identity]: 7.56999e-06 [virtual_dataset]: 6.17001e-06 [get_grad_eliminate_]: 5.97999e-06 [virtual_output]: 5.90002e-06 [merge_forward]: 4.17998e-06 [cell_reuse_recompute_pass]: 2.11e-06 [offload_activation]: 9.10999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.474e-05 [merge_recompute_call_nodes]: 1.35001e-06 [before_grad]: 1.118e-05 [set_forward_comm_id_for_comm_node_pass]: 4.32e-06 [meta_fg_expand]: 2.59999e-06 [flash_sp_send_recv_attached]: 1.00999e-06 [receive_attached]: 1.69e-06 [after_resolve]: 1.206e-05 [a_after_grad]: 9.69e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.96998e-06 [auto_monad_grad]: 1.42999e-06 [auto_monad_eliminator]: 8.40999e-06 [cse]: 1.549e-05 [a_3]: 4.985e-05 [py_interpret_to_execute_after_opt_a]: 1.511e-05 [slice_cell_reuse_recomputed_activation]: 5.56e-06 [rewriter_after_opt_a]: 4.271e-05 [convert_after_rewriter]: 1.017e-05 [order_py_execute_after_rewriter]: 8.03001e-06 [mutable_eliminate]: 0.00070989 [opt_b]: 0.00028135, [1] [Cycle 1]: 0.00027118, [7] [b_1]: 0.00017203 [b_2]: 8.22998e-06 [updatestate_depend_eliminate]: 6.74001e-06 [updatestate_assign_eliminate]: 2.57001e-06 [updatestate_loads_eliminate]: 2.44001e-06 [renormalize]: 7.60017e-07 [cse]: 2.088e-05 [optimize_parallel_all_gather_comm]: 2.038e-05 [overlap_param_gather]: 5.64e-06 [cconv]: 3.463e-05 [loop_unroll]: 0.00048223 [opt_after_cconv]: 0.00013397, [1] [Cycle 1]: 0.00012503, [7] [c_1]: 3.257e-05 [parameter_eliminate]: 5.12e-06 [updatestate_depend_eliminate]: 6.49999e-06 [updatestate_assign_eliminate]: 2.67001e-06 [updatestate_loads_eliminate]: 2.34001e-06 [cse]: 1.819e-05 [renormalize]: 6.30011e-07 [remove_dup_value]: 1.716e-05 [tuple_transform]: 9.561e-05, [1] [Cycle 1]: 8.773e-05, [4] [d_1]: 4.786e-05 [none_parameter_eliminate]: 2.36e-06 [renormalize]: 2.40019e-07 [switch_simplify]: 7.56999e-06 [partial_unused_args_eliminate]: 4.71002e-06 [add_recomputation]: 5.264e-05 [cse_after_recomputation]: 2.911e-05, [1] [Cycle 1]: 2.216e-05, [1] [cse]: 1.268e-05 [environ_conv]: 8.43999e-06 [swap_dp_allreduce_reducescatter]: 7.55e-06 [bias_add_comm_swap]: 5.14e-06 [label_micro_interleaved_index]: 8.18999e-06 [label_fine_grained_interleaved_index]: 6.16998e-06 [merge_cast_opt]: 3.83001e-06 [slice_recompute_activation]: 4.62e-06 [micro_interleaved_order_control]: 4.57e-06 [assign_add_opt]: 3.59002e-06 [ForceFp32Comm]: 3.23998e-06 [remove_cast_before_assign_add]: 3.62002e-06 [full_micro_interleaved_order_control]: 4.65001e-06 [reorder_send_recv_between_fp_bp]: 5.60001e-06 [comm_op_add_attrs]: 3.52002e-06 [add_comm_op_reuse_tag]: 3.91001e-06 [interleave_split_concat_branches]: 3.53e-06 [interleave_parallel_branches]: 4.02002e-06 [overlap_opt_shard_in_pipeline]: 3.76999e-06 [overlap_opt_shard_grad_in_pipeline]: 5.03002e-06 [control_data_broadcast_order]: 1.61e-05 [grouped_pairwise_exchange_alltoall]: 4.26001e-06 [offloading_packed_experts]: 7.03e-06 [overlap_recompute_and_grad_model_parallel]: 7.98001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.8e-06 [overlap_recompute_allgather_and_fa_grad]: 4e-06 [overlap_recompute_comm]: 5.27999e-06 [overlap_grad_ring_attention]: 6.79001e-06 [overlap_grad_flash_sp]: 2.496e-05 [begin_end_overlap_inline]: 3.04001e-06 [split_matmul_comm_elemetwise]: 4.48999e-06 [split_layernorm_comm]: 3.9e-06 [handle_group_info]: 3.33e-06 [symbol_engine_optimizer]: 0.00010553, [1] [Cycle 1]: 9.813e-05, [6] [build]: 3.78999e-06 [elim_shapecalc]: 1.133e-05 [elim_not_effective]: 1.33e-05 [opt_reshape]: 8.17e-06 [fold_const_symbol]: 1.128e-05 [renormalize]: 2.80008e-07 [detach_backward]: 1.195e-05 [pipeline_parallel_scheduler]: 2.71e-06 [auto_monad_reorder]: 4.829e-05 [get_jit_bprop_graph]: 2.84001e-06 [rewriter_after_jit_bprop_graph]: 1.353e-05 [opt_after_jit_grad]: 0.00087045 [validate]: 5.749e-05 Sums bootstrap : 0.000477s : 0.26% type_inference : 0.174880s : 96.47% event_method : 0.000021s : 0.01% auto_monad : 0.000064s : 0.04% graph_reusing : 0.000006s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000036s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000031s : 0.02% optimize.rewriter_before_opt_a : 0.000094s : 0.05% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000052s : 0.03% optimize.opt_a.loop_unroll : 0.000038s : 0.02% optimize.opt_a.a_1 : 0.000838s : 0.46% optimize.opt_a.with_stream_mark : 0.000032s : 0.02% optimize.opt_a.recompute_prepare : 0.000017s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000228s : 0.13% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000015s : 0.01% optimize.opt_a.auto_parallel : 0.000013s : 0.01% optimize.opt_a.parallel : 0.000026s : 0.01% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000012s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000020s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000024s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000025s : 0.01% optimize.opt_a.a_after_grad : 0.000020s : 0.01% optimize.opt_a.renormalize : 0.000846s : 0.47% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.01% optimize.opt_a.cse : 0.000047s : 0.03% optimize.opt_a.a_3 : 0.000116s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.00% optimize.rewriter_after_opt_a : 0.000043s : 0.02% optimize.convert_after_rewriter : 0.000010s : 0.01% optimize.order_py_execute_after_rewriter : 0.000008s : 0.00% optimize.mutable_eliminate : 0.000710s : 0.39% optimize.opt_b.b_1 : 0.000172s : 0.09% optimize.opt_b.b_2 : 0.000008s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.01% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.000035s : 0.02% optimize.loop_unroll : 0.000482s : 0.27% optimize.opt_after_cconv.c_1 : 0.000033s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000018s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.01% optimize.tuple_transform.d_1 : 0.000048s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000053s : 0.03% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000008s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.00% optimize.bias_add_comm_swap : 0.000005s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000016s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000025s : 0.01% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000012s : 0.01% pipeline_parallel_scheduler : 0.000003s : 0.00% auto_monad_reorder : 0.000048s : 0.03% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000014s : 0.01% opt_after_jit_grad : 0.000870s : 0.48% validate : 0.000057s : 0.03% Time group info: ------[substitution.] 0.000242 34 14.48% : 0.000035s : 6: substitution.arithmetic_simplify 0.75% : 0.000002s : 2: substitution.elim_not_effective 0.62% : 0.000002s : 2: substitution.fold_const_symbol 2.81% : 0.000007s : 4: substitution.graph_param_transform 68.16% : 0.000165s : 4: substitution.inline 1.84% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.42% : 0.000006s : 4: substitution.remove_not_recompute_node 2.30% : 0.000006s : 4: substitution.replace_old_param 6.62% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.174822 2 99.55% : 0.174032s : 1: type_inference.infer 0.45% : 0.000790s : 1: type_inference.specialize ------[replace.] 0.000065 8 62.93% : 0.000041s : 4: replace.inline 37.07% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000176 8 91.93% : 0.000162s : 4: match.inline 8.07% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000221 1278 1.01% : 0.000002s : 13: predicate.accumulaten_eliminater 1.28% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.59% : 0.000001s : 8: predicate.addn_check_dump 0.91% : 0.000002s : 13: predicate.addn_zero_filter 0.76% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.43% : 0.000005s : 21: predicate.arithmetic_simplify 0.89% : 0.000002s : 13: predicate.cast_eliminate 0.53% : 0.000001s : 8: predicate.check_bprop_eliminate 0.54% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.63% : 0.000001s : 8: predicate.depend_value_elim 0.90% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.01% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.81% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.60% : 0.000004s : 8: predicate.dumpgradient_eliminate 0.28% : 0.000001s : 4: predicate.elim_not_effective 0.51% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.13% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_depend_swap 1.60% : 0.000004s : 25: predicate.environ_get_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.57% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.38% : 0.000005s : 21: predicate.float_depend_g_call 0.55% : 0.000001s : 8: predicate.float_environ_get_switch 0.75% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.59% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.56% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 5.95% : 0.000013s : 58: predicate.inline 0.71% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.94% : 0.000002s : 8: predicate.less_batch_normalization 2.16% : 0.000005s : 25: predicate.list_to_tuple_eliminator_ 2.60% : 0.000006s : 38: predicate.load_eliminater 1.17% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.39% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.59% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.49% : 0.000001s : 8: predicate.merge_addn 0.66% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.65% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 13: predicate.minmaximum_grad 1.32% : 0.000003s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 1.79% : 0.000004s : 21: predicate.partial_defer_inline 1.52% : 0.000003s : 21: predicate.partial_eliminate 0.89% : 0.000002s : 13: predicate.print_const_string_wrapper 0.57% : 0.000001s : 8: predicate.reduce_all_const_elim 1.15% : 0.000003s : 13: predicate.reduce_eliminate 2.34% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 8: predicate.remove_not_recompute_node 1.41% : 0.000003s : 25: predicate.replace_applicator 0.51% : 0.000001s : 8: predicate.replace_old_param 0.25% : 0.000001s : 4: predicate.reset_defer_inline 1.07% : 0.000002s : 13: predicate.reshape_eliminate 0.63% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.79% : 0.000002s : 8: predicate.same_eliminate 0.51% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 8: predicate.shard_identity_eliminate 0.77% : 0.000002s : 8: predicate.special_op_eliminate 0.63% : 0.000001s : 8: predicate.specialize_transform 0.91% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.46% : 0.000003s : 21: predicate.switch_defer_inline 1.95% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.83% : 0.000011s : 67: predicate.switch_simplify 0.95% : 0.000002s : 13: predicate.tile_eliminate 0.85% : 0.000002s : 13: predicate.transpose_eliminate 1.55% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.41% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.81% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.68% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.10% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.65% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.24% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.02% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 4: predicate.value_based_eliminate 0.66% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.56% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.55% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000675 11 56.54% : 0.000382s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.46% : 0.000293s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.363491 192 0.00% : 0.000006s : 1: ForceFp32Comm 0.95% : 0.003455s : 1: add_attr 0.95% : 0.003438s : 1: add_attr_with_inline 0.00% : 0.000008s : 1: add_comm_op_reuse_tag 0.02% : 0.000057s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.02% : 0.000072s : 1: auto_monad 0.02% : 0.000058s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.14% : 0.000525s : 1: bootstrap 0.01% : 0.000038s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000019s : 1: control_data_broadcast_order 0.00% : 0.000013s : 1: convert_after_rewriter 0.01% : 0.000032s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000089s : 1: detach_backward 0.00% : 0.000011s : 1: environ_conv 0.01% : 0.000032s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000011s : 1: get_jit_bprop_graph 0.00% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000009s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.13% : 0.000488s : 1: loop_unroll 0.00% : 0.000006s : 1: merge_cast_opt 0.00% : 0.000007s : 1: micro_interleaved_order_control 0.20% : 0.000717s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.00% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000018s : 1: opt.transform.mutable_eliminate 0.36% : 0.001292s : 78: opt.transform.opt_a 0.01% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000054s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000106s : 28: opt.transform.opt_b 0.01% : 0.000053s : 2: opt.transform.opt_trans_graph 0.01% : 0.000040s : 4: opt.transform.symbol_engine_opt 0.93% : 0.003389s : 1: opt_a 0.04% : 0.000138s : 1: opt_after_cconv 0.24% : 0.000884s : 1: opt_after_jit_grad 0.08% : 0.000285s : 1: opt_b 47.18% : 0.171487s : 1: optimize 0.01% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000011s : 1: order_py_execute_after_rewriter 0.01% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000012s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.01% : 0.000044s : 1: pre_auto_parallel 0.01% : 0.000035s : 1: py_interpret_to_execute 0.01% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 0.12% : 0.000439s : 1: renormalize.infer 0.11% : 0.000398s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000021s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000047s : 1: rewriter_after_opt_a 0.03% : 0.000098s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000009s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.00% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000109s : 1: symbol_engine_optimizer 0.03% : 0.000099s : 1: tuple_transform 48.12% : 0.174927s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:50.894.913 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.222325, [21] [bootstrap]: 0.00048145 [type_inference]: 0.210958 [event_method]: 2.248e-05 [auto_monad]: 6.925e-05 [graph_reusing]: 6.09999e-06 [inline]: 2.73998e-06 [add_attr]: 0.00381533, [1] [add_attr_with_inline]: 0.0038019, [1] [Cycle 1]: 8.104e-05, [2] [tag_attr]: 2.523e-05 [meta_addattr_fg_expand]: 6.21e-06 [parallel-infer-symbol]: 3.89002e-06 [pre_auto_parallel]: 4.316e-05 [insert-virtual-dataset]: 2.40002e-06 [parallel-infer-symbol-second]: 8.60018e-07 [dataset_repeat_opt]: 2.01998e-06 [pipeline_split]: 1.67999e-06 [optimize]: 0.00600795, [53] [py_interpret_to_execute]: 3.136e-05 [rewriter_before_opt_a]: 9.943e-05 [opt_a]: 0.00349885, [2] [Cycle 1]: 0.0026913, [45] [expand_dump_flag]: 3.6e-06 [switch_simplify]: 4.532e-05 [loop_unroll]: 3.104e-05 [a_1]: 0.00072068 [with_stream_mark]: 2.492e-05 [recompute_prepare]: 1.188e-05 [updatestate_depend_eliminate]: 4.53999e-06 [updatestate_assign_eliminate]: 3.11999e-06 [updatestate_loads_eliminate]: 3.22002e-06 [parameter_eliminate]: 1.97001e-06 [a_2]: 9.048e-05 [accelerated_algorithm]: 9.22001e-06 [shard]: 2.10002e-06 [meta_shard_fg_expand]: 2.11e-06 [shard_inline]: 6.90002e-06 [merge_send_recv]: 9.27001e-06 [auto_parallel]: 8.23001e-06 [parallel]: 2.192e-05 [flash_sp]: 1.134e-05 [merge_comm]: 4.47e-06 [allreduce_fusion]: 3.66001e-06 [matmul_add_comm_reduction]: 1.254e-05 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 1.082e-05 [virtual_dataset]: 7.16999e-06 [get_grad_eliminate_]: 7.32002e-06 [virtual_output]: 6.98e-06 [merge_forward]: 3.99002e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 1.094e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.482e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.153e-05 [set_forward_comm_id_for_comm_node_pass]: 4.18001e-06 [meta_fg_expand]: 3.16001e-06 [flash_sp_send_recv_attached]: 2.66e-06 [receive_attached]: 2.88003e-06 [after_resolve]: 1.34e-05 [a_after_grad]: 1.058e-05 [renormalize]: 0.00112633 [add_forward_monad_depend]: 8.00999e-06 [auto_monad_grad]: 3.56999e-06 [auto_monad_eliminator]: 2.191e-05 [cse]: 3.451e-05 [a_3]: 5.981e-05 [Cycle 2]: 0.00079373, [45] [expand_dump_flag]: 2.20002e-06 [switch_simplify]: 8.41002e-06 [loop_unroll]: 6.59001e-06 [a_1]: 0.00016073 [with_stream_mark]: 1.76e-05 [recompute_prepare]: 7.43999e-06 [updatestate_depend_eliminate]: 3.29001e-06 [updatestate_assign_eliminate]: 2.69999e-06 [updatestate_loads_eliminate]: 4.02e-06 [parameter_eliminate]: 1.59e-06 [a_2]: 7.994e-05 [accelerated_algorithm]: 7.71001e-06 [shard]: 2.14e-06 [meta_shard_fg_expand]: 2.35002e-06 [shard_inline]: 6.21998e-06 [merge_send_recv]: 7.65e-06 [auto_parallel]: 7.04001e-06 [parallel]: 7.54002e-06 [flash_sp]: 4.23999e-06 [merge_comm]: 3.73999e-06 [allreduce_fusion]: 3.51001e-06 [matmul_add_comm_reduction]: 8.68001e-06 [allreduce_slice_to_reducescatter]: 7.39994e-07 [virtual_shard_identity]: 8.08001e-06 [virtual_dataset]: 6.54999e-06 [get_grad_eliminate_]: 6.19999e-06 [virtual_output]: 5.76e-06 [merge_forward]: 4.57998e-06 [cell_reuse_recompute_pass]: 2.43e-06 [offload_activation]: 1.083e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.522e-05 [merge_recompute_call_nodes]: 1.69e-06 [before_grad]: 1.434e-05 [set_forward_comm_id_for_comm_node_pass]: 5.34e-06 [meta_fg_expand]: 3.19001e-06 [flash_sp_send_recv_attached]: 1.76e-06 [receive_attached]: 2.21998e-06 [after_resolve]: 1.332e-05 [a_after_grad]: 1.075e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 2.01998e-06 [auto_monad_grad]: 2.07999e-06 [auto_monad_eliminator]: 1.067e-05 [cse]: 1.759e-05 [a_3]: 3.806e-05 [py_interpret_to_execute_after_opt_a]: 1.56e-05 [slice_cell_reuse_recomputed_activation]: 2.28998e-06 [rewriter_after_opt_a]: 4.208e-05 [convert_after_rewriter]: 7.8e-06 [order_py_execute_after_rewriter]: 5.46e-06 [mutable_eliminate]: 0.00073159 [opt_b]: 0.00022502, [1] [Cycle 1]: 0.00021565, [7] [b_1]: 0.0001329 [b_2]: 9.04e-06 [updatestate_depend_eliminate]: 7e-06 [updatestate_assign_eliminate]: 2.58003e-06 [updatestate_loads_eliminate]: 2.34001e-06 [renormalize]: 8.39995e-07 [cse]: 2.083e-05 [optimize_parallel_all_gather_comm]: 1.762e-05 [overlap_param_gather]: 2.35002e-06 [cconv]: 3.268e-05 [loop_unroll]: 0.00053025 [opt_after_cconv]: 0.00010983, [1] [Cycle 1]: 0.00010318, [7] [c_1]: 3.29e-05 [parameter_eliminate]: 3.81001e-06 [updatestate_depend_eliminate]: 6.21e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.53003e-06 [cse]: 1.913e-05 [renormalize]: 5.3001e-07 [remove_dup_value]: 1.411e-05 [tuple_transform]: 9.911e-05, [1] [Cycle 1]: 9.322e-05, [4] [d_1]: 4.9e-05 [none_parameter_eliminate]: 1.84e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 2.16e-05 [partial_unused_args_eliminate]: 2.31998e-06 [add_recomputation]: 5.621e-05 [cse_after_recomputation]: 2.518e-05, [1] [Cycle 1]: 1.986e-05, [1] [cse]: 1.411e-05 [environ_conv]: 6.10002e-06 [swap_dp_allreduce_reducescatter]: 5.25999e-06 [bias_add_comm_swap]: 2.58998e-06 [label_micro_interleaved_index]: 5.35001e-06 [label_fine_grained_interleaved_index]: 3.15998e-06 [merge_cast_opt]: 1.39e-06 [slice_recompute_activation]: 2.07001e-06 [micro_interleaved_order_control]: 2.49999e-06 [assign_add_opt]: 1.27e-06 [ForceFp32Comm]: 1.22999e-06 [remove_cast_before_assign_add]: 1.12e-06 [full_micro_interleaved_order_control]: 2.61e-06 [reorder_send_recv_between_fp_bp]: 2.78e-06 [comm_op_add_attrs]: 1.90001e-06 [add_comm_op_reuse_tag]: 1.35001e-06 [interleave_split_concat_branches]: 1.25001e-06 [interleave_parallel_branches]: 1.44e-06 [overlap_opt_shard_in_pipeline]: 1.37999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.73e-06 [control_data_broadcast_order]: 1.524e-05 [grouped_pairwise_exchange_alltoall]: 2.14e-06 [offloading_packed_experts]: 4.12e-06 [overlap_recompute_and_grad_model_parallel]: 5.12e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.28002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.55999e-06 [overlap_recompute_comm]: 2.61e-06 [overlap_grad_ring_attention]: 4.52e-06 [overlap_grad_flash_sp]: 2.396e-05 [begin_end_overlap_inline]: 5.29981e-07 [split_matmul_comm_elemetwise]: 2.40997e-06 [split_layernorm_comm]: 1.99999e-06 [handle_group_info]: 1.42e-06 [symbol_engine_optimizer]: 8.803e-05, [1] [Cycle 1]: 8.356e-05, [6] [build]: 4.88001e-06 [elim_shapecalc]: 1.332e-05 [elim_not_effective]: 1.519e-05 [opt_reshape]: 7.51001e-06 [fold_const_symbol]: 1.133e-05 [renormalize]: 3.30008e-07 [detach_backward]: 2.56998e-06 [pipeline_parallel_scheduler]: 1.81e-06 [auto_monad_reorder]: 1.877e-05 [get_jit_bprop_graph]: 2.78e-06 [rewriter_after_jit_bprop_graph]: 5.16998e-06 [opt_after_jit_grad]: 0.00064788 [validate]: 4.945e-05 Sums bootstrap : 0.000481s : 0.22% type_inference : 0.210958s : 97.04% event_method : 0.000022s : 0.01% auto_monad : 0.000069s : 0.03% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000043s : 0.02% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000031s : 0.01% optimize.rewriter_before_opt_a : 0.000099s : 0.05% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000054s : 0.02% optimize.opt_a.loop_unroll : 0.000038s : 0.02% optimize.opt_a.a_1 : 0.000881s : 0.41% optimize.opt_a.with_stream_mark : 0.000043s : 0.02% optimize.opt_a.recompute_prepare : 0.000019s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000170s : 0.08% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000017s : 0.01% optimize.opt_a.auto_parallel : 0.000015s : 0.01% optimize.opt_a.parallel : 0.000029s : 0.01% optimize.opt_a.flash_sp : 0.000016s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.01% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000022s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000050s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000026s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000027s : 0.01% optimize.opt_a.a_after_grad : 0.000021s : 0.01% optimize.opt_a.renormalize : 0.001126s : 0.52% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.00% optimize.opt_a.auto_monad_grad : 0.000006s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.01% optimize.opt_a.cse : 0.000052s : 0.02% optimize.opt_a.a_3 : 0.000098s : 0.05% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000042s : 0.02% optimize.convert_after_rewriter : 0.000008s : 0.00% optimize.order_py_execute_after_rewriter : 0.000005s : 0.00% optimize.mutable_eliminate : 0.000732s : 0.34% optimize.opt_b.b_1 : 0.000133s : 0.06% optimize.opt_b.b_2 : 0.000009s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000033s : 0.02% optimize.loop_unroll : 0.000530s : 0.24% optimize.opt_after_cconv.c_1 : 0.000033s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000014s : 0.01% optimize.tuple_transform.d_1 : 0.000049s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000022s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000056s : 0.03% optimize.cse_after_recomputation.cse : 0.000014s : 0.01% optimize.environ_conv : 0.000006s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000002s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000024s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000019s : 0.01% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000648s : 0.30% validate : 0.000049s : 0.02% Time group info: ------[substitution.] 0.000277 34 14.15% : 0.000039s : 6: substitution.arithmetic_simplify 0.74% : 0.000002s : 2: substitution.elim_not_effective 0.66% : 0.000002s : 2: substitution.fold_const_symbol 2.51% : 0.000007s : 4: substitution.graph_param_transform 68.62% : 0.000190s : 4: substitution.inline 2.09% : 0.000006s : 4: substitution.j_node_and_user_rematch 2.42% : 0.000007s : 4: substitution.remove_not_recompute_node 2.57% : 0.000007s : 4: substitution.replace_old_param 6.23% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.210877 2 99.60% : 0.210033s : 1: type_inference.infer 0.40% : 0.000845s : 1: type_inference.specialize ------[replace.] 0.000072 8 63.47% : 0.000046s : 4: replace.inline 36.53% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000203 8 92.39% : 0.000187s : 4: match.inline 7.61% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000236 1278 0.79% : 0.000002s : 13: predicate.accumulaten_eliminater 0.93% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 0.87% : 0.000002s : 13: predicate.addn_zero_filter 0.72% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.51% : 0.000006s : 21: predicate.arithmetic_simplify 0.89% : 0.000002s : 13: predicate.cast_eliminate 0.51% : 0.000001s : 8: predicate.check_bprop_eliminate 0.45% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.60% : 0.000001s : 8: predicate.depend_value_elim 0.84% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.88% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.94% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.08% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 4: predicate.elim_not_effective 0.48% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.00% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.94% : 0.000002s : 17: predicate.environ_get_depend_swap 1.64% : 0.000004s : 25: predicate.environ_get_eliminate 0.93% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.33% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.47% : 0.000006s : 21: predicate.float_depend_g_call 0.48% : 0.000001s : 8: predicate.float_environ_get_switch 0.67% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.69% : 0.000002s : 8: predicate.get_grad_eliminate 0.29% : 0.000001s : 4: predicate.graph_param_transform 0.52% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 5.84% : 0.000014s : 58: predicate.inline 0.80% : 0.000002s : 8: predicate.inline_without_move 0.39% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.18% : 0.000003s : 8: predicate.less_batch_normalization 1.70% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.32% : 0.000005s : 38: predicate.load_eliminater 1.00% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.16% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.44% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.49% : 0.000001s : 8: predicate.merge_addn 0.58% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.76% : 0.000002s : 13: predicate.minmaximum_grad 1.06% : 0.000003s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.48% : 0.000001s : 4: predicate.parallel_virtual_node 1.82% : 0.000004s : 21: predicate.partial_defer_inline 1.44% : 0.000003s : 21: predicate.partial_eliminate 0.84% : 0.000002s : 13: predicate.print_const_string_wrapper 0.62% : 0.000001s : 8: predicate.reduce_all_const_elim 1.12% : 0.000003s : 13: predicate.reduce_eliminate 2.26% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 8: predicate.remove_not_recompute_node 1.33% : 0.000003s : 25: predicate.replace_applicator 0.47% : 0.000001s : 8: predicate.replace_old_param 0.39% : 0.000001s : 4: predicate.reset_defer_inline 0.84% : 0.000002s : 13: predicate.reshape_eliminate 0.56% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 4: predicate.row_tensor_eliminate 0.90% : 0.000002s : 8: predicate.same_eliminate 0.40% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 8: predicate.shard_identity_eliminate 0.67% : 0.000002s : 8: predicate.special_op_eliminate 0.64% : 0.000002s : 8: predicate.specialize_transform 0.92% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.34% : 0.000003s : 21: predicate.switch_defer_inline 1.79% : 0.000004s : 29: predicate.switch_layer_defer_inline 10.12% : 0.000024s : 67: predicate.switch_simplify 0.78% : 0.000002s : 13: predicate.tile_eliminate 1.05% : 0.000002s : 13: predicate.transpose_eliminate 1.36% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.38% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.30% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.02% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.38% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.05% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.63% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.21% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.87% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.48% : 0.000001s : 4: predicate.value_based_eliminate 0.71% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000705 11 55.28% : 0.000390s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.72% : 0.000315s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.234779 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.63% : 0.003823s : 1: add_attr 1.62% : 0.003806s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.03% : 0.000060s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.03% : 0.000076s : 1: auto_monad 0.01% : 0.000022s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.22% : 0.000514s : 1: bootstrap 0.02% : 0.000036s : 1: cconv 0.00% : 0.000005s : 1: comm_op_add_attrs 0.01% : 0.000019s : 1: control_data_broadcast_order 0.00% : 0.000011s : 1: convert_after_rewriter 0.01% : 0.000028s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000009s : 1: environ_conv 0.01% : 0.000030s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.23% : 0.000540s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.32% : 0.000761s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000019s : 1: opt.transform.mutable_eliminate 0.57% : 0.001349s : 78: opt.transform.opt_a 0.01% : 0.000032s : 1: opt.transform.opt_after_cconv 0.01% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.05% : 0.000107s : 28: opt.transform.opt_b 0.03% : 0.000068s : 2: opt.transform.opt_trans_graph 0.02% : 0.000043s : 4: opt.transform.symbol_engine_opt 1.49% : 0.003502s : 1: opt_a 0.05% : 0.000113s : 1: opt_after_cconv 0.28% : 0.000659s : 1: opt_after_jit_grad 0.10% : 0.000229s : 1: opt_b 2.56% : 0.006014s : 1: optimize 0.01% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000009s : 1: order_py_execute_after_rewriter 0.01% : 0.000027s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000048s : 1: pre_auto_parallel 0.02% : 0.000035s : 1: py_interpret_to_execute 0.01% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000017s : 1: remove_dup_value 0.21% : 0.000495s : 1: renormalize.infer 0.26% : 0.000621s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000046s : 1: rewriter_after_opt_a 0.04% : 0.000104s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000006s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000091s : 1: symbol_engine_optimizer 0.04% : 0.000102s : 1: tuple_transform 89.86% : 0.210982s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:53.506.210 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:53.506.485 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.292336, [21] [bootstrap]: 0.102457 [type_inference]: 0.00639222 [event_method]: 2.261e-05 [auto_monad]: 6.923e-05 [graph_reusing]: 7.18e-06 [inline]: 3.08e-06 [add_attr]: 0.00397608, [1] [add_attr_with_inline]: 0.00396148, [1] [Cycle 1]: 9.729e-05, [2] [tag_attr]: 2.507e-05 [meta_addattr_fg_expand]: 5.94e-06 [parallel-infer-symbol]: 3.70998e-06 [pre_auto_parallel]: 4.362e-05 [insert-virtual-dataset]: 2.53003e-06 [parallel-infer-symbol-second]: 9.99979e-07 [dataset_repeat_opt]: 2.07001e-06 [pipeline_split]: 1.74e-06 [optimize]: 0.177581, [53] [py_interpret_to_execute]: 6.449e-05 [rewriter_before_opt_a]: 0.0001054 [opt_a]: 0.00374675, [2] [Cycle 1]: 0.00277932, [45] [expand_dump_flag]: 3.36999e-06 [switch_simplify]: 4.55e-05 [loop_unroll]: 3.095e-05 [a_1]: 0.00071896 [with_stream_mark]: 4.229e-05 [recompute_prepare]: 1.284e-05 [updatestate_depend_eliminate]: 5.25999e-06 [updatestate_assign_eliminate]: 4.18001e-06 [updatestate_loads_eliminate]: 3.13e-06 [parameter_eliminate]: 2.94001e-06 [a_2]: 0.00012564 [accelerated_algorithm]: 9.69e-06 [shard]: 2.79001e-06 [meta_shard_fg_expand]: 2.78e-06 [shard_inline]: 6.84001e-06 [merge_send_recv]: 9.47999e-06 [auto_parallel]: 8.38001e-06 [parallel]: 2.185e-05 [flash_sp]: 1.138e-05 [merge_comm]: 4.22e-06 [allreduce_fusion]: 3.47002e-06 [matmul_add_comm_reduction]: 1.114e-05 [allreduce_slice_to_reducescatter]: 1.35001e-06 [virtual_shard_identity]: 1.023e-05 [virtual_dataset]: 7.90998e-06 [get_grad_eliminate_]: 6.73e-06 [virtual_output]: 7.15e-06 [merge_forward]: 3.8e-06 [cell_reuse_recompute_pass]: 1.79998e-06 [offload_activation]: 1.226e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.608e-05 [merge_recompute_call_nodes]: 2.07001e-06 [before_grad]: 1.335e-05 [set_forward_comm_id_for_comm_node_pass]: 4.42e-06 [meta_fg_expand]: 2.78e-06 [flash_sp_send_recv_attached]: 2.74001e-06 [receive_attached]: 2.61e-06 [after_resolve]: 1.615e-05 [a_after_grad]: 1.125e-05 [renormalize]: 0.00096705 [add_forward_monad_depend]: 7.61001e-06 [auto_monad_grad]: 2.83998e-06 [auto_monad_eliminator]: 2.114e-05 [cse]: 3.125e-05 [a_3]: 0.0001034 [Cycle 2]: 0.00094919, [45] [expand_dump_flag]: 2.71e-06 [switch_simplify]: 8.72998e-06 [loop_unroll]: 6.51999e-06 [a_1]: 0.00015729 [with_stream_mark]: 3.427e-05 [recompute_prepare]: 8.05e-06 [updatestate_depend_eliminate]: 4.45e-06 [updatestate_assign_eliminate]: 3.63e-06 [updatestate_loads_eliminate]: 3.20002e-06 [parameter_eliminate]: 2.08998e-06 [a_2]: 0.00011246 [accelerated_algorithm]: 7.36001e-06 [shard]: 2.04999e-06 [meta_shard_fg_expand]: 2.26998e-06 [shard_inline]: 6.96999e-06 [merge_send_recv]: 8.30999e-06 [auto_parallel]: 9.17001e-06 [parallel]: 8.77999e-06 [flash_sp]: 4.68001e-06 [merge_comm]: 3.83001e-06 [allreduce_fusion]: 4.05e-06 [matmul_add_comm_reduction]: 9.94999e-06 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 7.16999e-06 [virtual_dataset]: 7.18998e-06 [get_grad_eliminate_]: 6.26998e-06 [virtual_output]: 6.34999e-06 [merge_forward]: 3.6e-06 [cell_reuse_recompute_pass]: 3.93001e-06 [offload_activation]: 1.067e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.447e-05 [merge_recompute_call_nodes]: 1.42999e-06 [before_grad]: 1.079e-05 [set_forward_comm_id_for_comm_node_pass]: 3.76001e-06 [meta_fg_expand]: 3.11999e-06 [flash_sp_send_recv_attached]: 1.16997e-06 [receive_attached]: 2.22001e-06 [after_resolve]: 1.251e-05 [a_after_grad]: 1.07e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.49001e-06 [auto_monad_grad]: 1.87001e-06 [auto_monad_eliminator]: 1.091e-05 [cse]: 1.887e-05 [a_3]: 5.238e-05 [py_interpret_to_execute_after_opt_a]: 1.896e-05 [slice_cell_reuse_recomputed_activation]: 5.74999e-06 [rewriter_after_opt_a]: 5.264e-05 [convert_after_rewriter]: 1.165e-05 [order_py_execute_after_rewriter]: 9.64e-06 [mutable_eliminate]: 0.00081387 [opt_b]: 0.00029916, [1] [Cycle 1]: 0.00028611, [7] [b_1]: 0.00017525 [b_2]: 8.99998e-06 [updatestate_depend_eliminate]: 9.04e-06 [updatestate_assign_eliminate]: 3.16999e-06 [updatestate_loads_eliminate]: 2.32999e-06 [renormalize]: 9.60019e-07 [cse]: 2.759e-05 [optimize_parallel_all_gather_comm]: 2.507e-05 [overlap_param_gather]: 4.97999e-06 [cconv]: 4.25e-05 [loop_unroll]: 0.0005484 [opt_after_cconv]: 0.0001409, [1] [Cycle 1]: 0.00013066, [7] [c_1]: 3.433e-05 [parameter_eliminate]: 5.35999e-06 [updatestate_depend_eliminate]: 6.46999e-06 [updatestate_assign_eliminate]: 2.58998e-06 [updatestate_loads_eliminate]: 2.43998e-06 [cse]: 2.183e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 1.917e-05 [tuple_transform]: 0.00010302, [1] [Cycle 1]: 9.28e-05, [4] [d_1]: 5.175e-05 [none_parameter_eliminate]: 1.97001e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 7.19001e-06 [partial_unused_args_eliminate]: 5.10999e-06 [add_recomputation]: 6.013e-05 [cse_after_recomputation]: 3.014e-05, [1] [Cycle 1]: 2.215e-05, [1] [cse]: 1.21e-05 [environ_conv]: 1.046e-05 [swap_dp_allreduce_reducescatter]: 8.37e-06 [bias_add_comm_swap]: 5.92001e-06 [label_micro_interleaved_index]: 7.39002e-06 [label_fine_grained_interleaved_index]: 5.30999e-06 [merge_cast_opt]: 4.03001e-06 [slice_recompute_activation]: 4.94e-06 [micro_interleaved_order_control]: 5.54998e-06 [assign_add_opt]: 3.94002e-06 [ForceFp32Comm]: 3.39001e-06 [remove_cast_before_assign_add]: 4.02e-06 [full_micro_interleaved_order_control]: 4.65001e-06 [reorder_send_recv_between_fp_bp]: 5.44e-06 [comm_op_add_attrs]: 3.61001e-06 [add_comm_op_reuse_tag]: 3.87998e-06 [interleave_split_concat_branches]: 3.71999e-06 [interleave_parallel_branches]: 3.83999e-06 [overlap_opt_shard_in_pipeline]: 4.66002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.38999e-06 [control_data_broadcast_order]: 0.170811 [grouped_pairwise_exchange_alltoall]: 8.33999e-06 [offloading_packed_experts]: 1.706e-05 [overlap_recompute_and_grad_model_parallel]: 1.148e-05 [overlap_grad_matmul_and_grad_allreduce]: 4.12e-06 [overlap_recompute_allgather_and_fa_grad]: 4.18001e-06 [overlap_recompute_comm]: 6.71999e-06 [overlap_grad_ring_attention]: 7.09001e-06 [overlap_grad_flash_sp]: 3.68e-05 [begin_end_overlap_inline]: 3.18e-06 [split_matmul_comm_elemetwise]: 5.05999e-06 [split_layernorm_comm]: 4.84e-06 [handle_group_info]: 3.32002e-06 [symbol_engine_optimizer]: 0.00016971, [1] [Cycle 1]: 0.00015475, [6] [build]: 9.13002e-06 [elim_shapecalc]: 3.369e-05 [elim_not_effective]: 2.6e-05 [opt_reshape]: 1.056e-05 [fold_const_symbol]: 1.234e-05 [renormalize]: 8.10018e-07 [detach_backward]: 5.60001e-06 [pipeline_parallel_scheduler]: 2.14e-06 [auto_monad_reorder]: 2.719e-05 [get_jit_bprop_graph]: 2.62001e-06 [rewriter_after_jit_bprop_graph]: 9.87001e-06 [opt_after_jit_grad]: 0.00093519 [validate]: 5.542e-05 Sums bootstrap : 0.102457s : 35.79% type_inference : 0.006392s : 2.23% event_method : 0.000023s : 0.01% auto_monad : 0.000069s : 0.02% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000044s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000064s : 0.02% optimize.rewriter_before_opt_a : 0.000105s : 0.04% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000054s : 0.02% optimize.opt_a.loop_unroll : 0.000037s : 0.01% optimize.opt_a.a_1 : 0.000876s : 0.31% optimize.opt_a.with_stream_mark : 0.000077s : 0.03% optimize.opt_a.recompute_prepare : 0.000021s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000238s : 0.08% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.00% optimize.opt_a.merge_send_recv : 0.000018s : 0.01% optimize.opt_a.auto_parallel : 0.000018s : 0.01% optimize.opt_a.parallel : 0.000031s : 0.01% optimize.opt_a.flash_sp : 0.000016s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.01% optimize.opt_a.virtual_dataset : 0.000015s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.00% optimize.opt_a.virtual_output : 0.000013s : 0.00% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000024s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000029s : 0.01% optimize.opt_a.a_after_grad : 0.000022s : 0.01% optimize.opt_a.renormalize : 0.000967s : 0.34% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.00% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.01% optimize.opt_a.cse : 0.000050s : 0.02% optimize.opt_a.a_3 : 0.000156s : 0.05% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.00% optimize.rewriter_after_opt_a : 0.000053s : 0.02% optimize.convert_after_rewriter : 0.000012s : 0.00% optimize.order_py_execute_after_rewriter : 0.000010s : 0.00% optimize.mutable_eliminate : 0.000814s : 0.28% optimize.opt_b.b_1 : 0.000175s : 0.06% optimize.opt_b.b_2 : 0.000009s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000028s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.01% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000043s : 0.01% optimize.loop_unroll : 0.000548s : 0.19% optimize.opt_after_cconv.c_1 : 0.000034s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000022s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.01% optimize.tuple_transform.d_1 : 0.000052s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000060s : 0.02% optimize.cse_after_recomputation.cse : 0.000012s : 0.00% optimize.environ_conv : 0.000010s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.00% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000006s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000005s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.170811s : 59.66% optimize.grouped_pairwise_exchange_alltoall : 0.000008s : 0.00% optimize.offloading_packed_experts : 0.000017s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000011s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000007s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000037s : 0.01% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000005s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000009s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000034s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000026s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000006s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000027s : 0.01% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000010s : 0.00% opt_after_jit_grad : 0.000935s : 0.33% validate : 0.000055s : 0.02% Time group info: ------[substitution.] 0.000278 34 16.08% : 0.000045s : 6: substitution.arithmetic_simplify 1.03% : 0.000003s : 2: substitution.elim_not_effective 0.69% : 0.000002s : 2: substitution.fold_const_symbol 2.52% : 0.000007s : 4: substitution.graph_param_transform 67.73% : 0.000188s : 4: substitution.inline 1.66% : 0.000005s : 4: substitution.j_node_and_user_rematch 1.92% : 0.000005s : 4: substitution.remove_not_recompute_node 2.39% : 0.000007s : 4: substitution.replace_old_param 5.98% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006326 2 85.58% : 0.005413s : 1: type_inference.infer 14.42% : 0.000912s : 1: type_inference.specialize ------[replace.] 0.000072 8 62.34% : 0.000045s : 4: replace.inline 37.66% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000200 8 92.61% : 0.000185s : 4: match.inline 7.39% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000239 1278 0.91% : 0.000002s : 13: predicate.accumulaten_eliminater 1.39% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 8: predicate.addn_check_dump 0.93% : 0.000002s : 13: predicate.addn_zero_filter 0.76% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.64% : 0.000006s : 21: predicate.arithmetic_simplify 1.05% : 0.000003s : 13: predicate.cast_eliminate 0.72% : 0.000002s : 8: predicate.check_bprop_eliminate 0.55% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.70% : 0.000002s : 8: predicate.depend_value_elim 0.80% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.96% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.41% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.43% : 0.000001s : 4: predicate.elim_not_effective 0.98% : 0.000002s : 4: predicate.elim_shapecalc_of_broadcastargs 1.02% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_depend_swap 1.60% : 0.000004s : 25: predicate.environ_get_eliminate 1.10% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.32% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.32% : 0.000006s : 21: predicate.float_depend_g_call 0.65% : 0.000002s : 8: predicate.float_environ_get_switch 0.86% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.63% : 0.000002s : 8: predicate.get_grad_eliminate 0.17% : 0.000000s : 4: predicate.graph_param_transform 0.53% : 0.000001s : 8: predicate.incorporate_call 0.41% : 0.000001s : 8: predicate.incorporate_call_switch 5.75% : 0.000014s : 58: predicate.inline 0.84% : 0.000002s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.06% : 0.000003s : 8: predicate.less_batch_normalization 1.76% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.12% : 0.000005s : 38: predicate.load_eliminater 1.13% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.28% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.75% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.59% : 0.000001s : 8: predicate.merge_addn 0.78% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.77% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.70% : 0.000002s : 13: predicate.minmaximum_grad 1.46% : 0.000003s : 4: predicate.mutable_eliminate 0.61% : 0.000001s : 4: predicate.opt_reshape 0.34% : 0.000001s : 4: predicate.parallel_virtual_node 1.83% : 0.000004s : 21: predicate.partial_defer_inline 1.37% : 0.000003s : 21: predicate.partial_eliminate 0.89% : 0.000002s : 13: predicate.print_const_string_wrapper 0.63% : 0.000002s : 8: predicate.reduce_all_const_elim 1.20% : 0.000003s : 13: predicate.reduce_eliminate 2.35% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 8: predicate.remove_not_recompute_node 1.31% : 0.000003s : 25: predicate.replace_applicator 0.85% : 0.000002s : 8: predicate.replace_old_param 0.40% : 0.000001s : 4: predicate.reset_defer_inline 0.78% : 0.000002s : 13: predicate.reshape_eliminate 0.71% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 4: predicate.row_tensor_eliminate 1.04% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 8: predicate.shard_identity_eliminate 0.62% : 0.000001s : 8: predicate.special_op_eliminate 0.64% : 0.000002s : 8: predicate.specialize_transform 1.26% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 1.08% : 0.000003s : 8: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.35% : 0.000003s : 21: predicate.switch_defer_inline 1.92% : 0.000005s : 29: predicate.switch_layer_defer_inline 5.03% : 0.000012s : 67: predicate.switch_simplify 0.88% : 0.000002s : 13: predicate.tile_eliminate 0.77% : 0.000002s : 13: predicate.transpose_eliminate 1.52% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.44% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.29% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.10% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.45% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 1.96% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.83% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.08% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.76% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.81% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.69% : 0.000002s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000653 11 51.60% : 0.000337s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.40% : 0.000316s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.476404 192 0.00% : 0.000006s : 1: ForceFp32Comm 0.84% : 0.003990s : 1: add_attr 0.83% : 0.003967s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.01% : 0.000064s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.02% : 0.000078s : 1: auto_monad 0.01% : 0.000035s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 21.52% : 0.102519s : 1: bootstrap 0.01% : 0.000046s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 35.86% : 0.170840s : 1: control_data_broadcast_order 0.00% : 0.000015s : 1: convert_after_rewriter 0.01% : 0.000033s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.00% : 0.000024s : 1: detach_backward 0.00% : 0.000013s : 1: environ_conv 0.01% : 0.000035s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.00% : 0.000013s : 1: graph_reusing 0.00% : 0.000015s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.12% : 0.000555s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.17% : 0.000824s : 1: mutable_eliminate 0.00% : 0.000021s : 1: offloading_packed_experts 0.00% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000022s : 1: opt.transform.mutable_eliminate 0.29% : 0.001391s : 78: opt.transform.opt_a 0.01% : 0.000032s : 1: opt.transform.opt_after_cconv 0.01% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.02% : 0.000108s : 28: opt.transform.opt_b 0.01% : 0.000057s : 2: opt.transform.opt_trans_graph 0.02% : 0.000072s : 4: opt.transform.symbol_engine_opt 0.79% : 0.003751s : 1: opt_a 0.03% : 0.000145s : 1: opt_after_cconv 0.20% : 0.000951s : 1: opt_after_jit_grad 0.06% : 0.000303s : 1: opt_b 37.36% : 0.177996s : 1: optimize 0.01% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000013s : 1: order_py_execute_after_rewriter 0.01% : 0.000040s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000015s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000011s : 1: overlap_recompute_comm 0.00% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.01% : 0.000051s : 1: pre_auto_parallel 0.01% : 0.000070s : 1: py_interpret_to_execute 0.00% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000008s : 1: remove_cast_before_assign_add 0.00% : 0.000022s : 1: remove_dup_value 0.12% : 0.000551s : 1: renormalize.infer 0.08% : 0.000403s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000016s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000057s : 1: rewriter_after_opt_a 0.02% : 0.000109s : 1: rewriter_before_opt_a 0.00% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.00% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000173s : 1: symbol_engine_optimizer 0.02% : 0.000106s : 1: tuple_transform 1.36% : 0.006461s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:40:56.453.992 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.251749, [21] [bootstrap]: 0.00064558 [type_inference]: 0.222867 [event_method]: 1.947e-05 [auto_monad]: 6.811e-05 [graph_reusing]: 6.12001e-06 [inline]: 3.40998e-06 [add_attr]: 0.0057263, [1] [add_attr_with_inline]: 0.00571046, [1] [Cycle 1]: 9.59e-05, [2] [tag_attr]: 3.134e-05 [meta_addattr_fg_expand]: 6.16998e-06 [parallel-infer-symbol]: 4.31002e-06 [pre_auto_parallel]: 4.683e-05 [insert-virtual-dataset]: 2.42001e-06 [parallel-infer-symbol-second]: 9.60019e-07 [dataset_repeat_opt]: 2.41e-06 [pipeline_split]: 1.99999e-06 [optimize]: 0.00764373, [53] [py_interpret_to_execute]: 0.00021581 [rewriter_before_opt_a]: 0.00011497 [opt_a]: 0.00425605, [2] [Cycle 1]: 0.00330703, [45] [expand_dump_flag]: 3.09001e-06 [switch_simplify]: 4.494e-05 [loop_unroll]: 3.022e-05 [a_1]: 0.00098694 [with_stream_mark]: 2.346e-05 [recompute_prepare]: 1.013e-05 [updatestate_depend_eliminate]: 4.53001e-06 [updatestate_assign_eliminate]: 3.27002e-06 [updatestate_loads_eliminate]: 3.17002e-06 [parameter_eliminate]: 2.19999e-06 [a_2]: 9.569e-05 [accelerated_algorithm]: 8.53001e-06 [shard]: 2.02999e-06 [meta_shard_fg_expand]: 2.65002e-06 [shard_inline]: 7.16001e-06 [merge_send_recv]: 9.27999e-06 [auto_parallel]: 8.28001e-06 [parallel]: 2.377e-05 [flash_sp]: 9.25001e-06 [merge_comm]: 4.2e-06 [allreduce_fusion]: 4.53001e-06 [matmul_add_comm_reduction]: 1.178e-05 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 9.42999e-06 [virtual_dataset]: 6.78e-06 [get_grad_eliminate_]: 7.92998e-06 [virtual_output]: 8.05e-06 [merge_forward]: 4.43001e-06 [cell_reuse_recompute_pass]: 2.07001e-06 [offload_activation]: 1.24e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.463e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.109e-05 [set_forward_comm_id_for_comm_node_pass]: 4.07998e-06 [meta_fg_expand]: 3.60998e-06 [flash_sp_send_recv_attached]: 2.66e-06 [receive_attached]: 3.33e-06 [after_resolve]: 1.253e-05 [a_after_grad]: 1.091e-05 [renormalize]: 0.00140307 [add_forward_monad_depend]: 6.69999e-06 [auto_monad_grad]: 2.84001e-06 [auto_monad_eliminator]: 1.935e-05 [cse]: 3.7e-05 [a_3]: 5.668e-05 [Cycle 2]: 0.00093127, [45] [expand_dump_flag]: 1.91998e-06 [switch_simplify]: 8.75001e-06 [loop_unroll]: 6.36e-06 [a_1]: 0.0001603 [with_stream_mark]: 1.563e-05 [recompute_prepare]: 7.38e-06 [updatestate_depend_eliminate]: 3.58e-06 [updatestate_assign_eliminate]: 2.72001e-06 [updatestate_loads_eliminate]: 2.56998e-06 [parameter_eliminate]: 1.45001e-06 [a_2]: 8.278e-05 [accelerated_algorithm]: 7.21999e-06 [shard]: 3.93001e-06 [meta_shard_fg_expand]: 2.72001e-06 [shard_inline]: 6.21998e-06 [merge_send_recv]: 8.08999e-06 [auto_parallel]: 7.40003e-06 [parallel]: 8.10999e-06 [flash_sp]: 5.32001e-06 [merge_comm]: 3.48e-06 [allreduce_fusion]: 4.74998e-06 [matmul_add_comm_reduction]: 8.27e-06 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 1.146e-05 [virtual_dataset]: 6.16e-06 [get_grad_eliminate_]: 8.08999e-06 [virtual_output]: 5.86998e-06 [merge_forward]: 4.18001e-06 [cell_reuse_recompute_pass]: 4.76002e-06 [offload_activation]: 9.81e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.41e-05 [merge_recompute_call_nodes]: 1.04e-06 [before_grad]: 1.463e-05 [set_forward_comm_id_for_comm_node_pass]: 3.27997e-06 [meta_fg_expand]: 3.7e-06 [flash_sp_send_recv_attached]: 1.37e-06 [receive_attached]: 1.44e-06 [after_resolve]: 1.934e-05 [a_after_grad]: 9.59e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.77999e-06 [auto_monad_grad]: 1.09e-06 [auto_monad_eliminator]: 8.18001e-06 [cse]: 2.329e-05 [a_3]: 0.00012899 [py_interpret_to_execute_after_opt_a]: 1.515e-05 [slice_cell_reuse_recomputed_activation]: 2.79999e-06 [rewriter_after_opt_a]: 4.567e-05 [convert_after_rewriter]: 7.10002e-06 [order_py_execute_after_rewriter]: 5.14e-06 [mutable_eliminate]: 0.00112626 [opt_b]: 0.00022932, [1] [Cycle 1]: 0.00021938, [7] [b_1]: 0.00013407 [b_2]: 1.01e-05 [updatestate_depend_eliminate]: 6.94001e-06 [updatestate_assign_eliminate]: 2.59001e-06 [updatestate_loads_eliminate]: 2.28998e-06 [renormalize]: 6.39993e-07 [cse]: 2.521e-05 [optimize_parallel_all_gather_comm]: 2.15e-05 [overlap_param_gather]: 2.86e-06 [cconv]: 3.644e-05 [loop_unroll]: 0.00075583 [opt_after_cconv]: 0.00013102, [1] [Cycle 1]: 0.00011954, [7] [c_1]: 3.711e-05 [parameter_eliminate]: 3.58e-06 [updatestate_depend_eliminate]: 7.8e-06 [updatestate_assign_eliminate]: 2.54001e-06 [updatestate_loads_eliminate]: 2.32999e-06 [cse]: 2.585e-05 [renormalize]: 2.12001e-06 [remove_dup_value]: 1.836e-05 [tuple_transform]: 9.691e-05, [1] [Cycle 1]: 9.224e-05, [4] [d_1]: 6.138e-05 [none_parameter_eliminate]: 2.02001e-06 [renormalize]: 3.09985e-07 [switch_simplify]: 7.11001e-06 [partial_unused_args_eliminate]: 1.91e-06 [add_recomputation]: 6.187e-05 [cse_after_recomputation]: 2.702e-05, [1] [Cycle 1]: 2.129e-05, [1] [cse]: 1.552e-05 [environ_conv]: 6.52001e-06 [swap_dp_allreduce_reducescatter]: 5.00999e-06 [bias_add_comm_swap]: 4.22998e-06 [label_micro_interleaved_index]: 7.25e-06 [label_fine_grained_interleaved_index]: 2.73e-06 [merge_cast_opt]: 1.93002e-06 [slice_recompute_activation]: 2.04999e-06 [micro_interleaved_order_control]: 2.52001e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 7.99977e-07 [remove_cast_before_assign_add]: 1.07e-06 [full_micro_interleaved_order_control]: 2.27999e-06 [reorder_send_recv_between_fp_bp]: 3.11999e-06 [comm_op_add_attrs]: 1.15999e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.39e-06 [interleave_parallel_branches]: 1.21997e-06 [overlap_opt_shard_in_pipeline]: 2.81e-06 [overlap_opt_shard_grad_in_pipeline]: 2.94999e-06 [control_data_broadcast_order]: 1.467e-05 [grouped_pairwise_exchange_alltoall]: 1.55001e-06 [offloading_packed_experts]: 4.26001e-06 [overlap_recompute_and_grad_model_parallel]: 5.64e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.51e-06 [overlap_grad_ring_attention]: 4.43999e-06 [overlap_grad_flash_sp]: 2.387e-05 [begin_end_overlap_inline]: 9.00007e-07 [split_matmul_comm_elemetwise]: 2.29999e-06 [split_layernorm_comm]: 1.98997e-06 [handle_group_info]: 1.07e-06 [symbol_engine_optimizer]: 8.787e-05, [1] [Cycle 1]: 8.322e-05, [6] [build]: 4.43001e-06 [elim_shapecalc]: 1.172e-05 [elim_not_effective]: 1.428e-05 [opt_reshape]: 8.50001e-06 [fold_const_symbol]: 1.134e-05 [renormalize]: 2.80008e-07 [detach_backward]: 2.17001e-06 [pipeline_parallel_scheduler]: 1.80001e-06 [auto_monad_reorder]: 1.802e-05 [get_jit_bprop_graph]: 2.76999e-06 [rewriter_after_jit_bprop_graph]: 5.87999e-06 [opt_after_jit_grad]: 0.0143796 [validate]: 7.221e-05 Sums bootstrap : 0.000646s : 0.26% type_inference : 0.222867s : 91.07% event_method : 0.000019s : 0.01% auto_monad : 0.000068s : 0.03% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000031s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000047s : 0.02% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000216s : 0.09% optimize.rewriter_before_opt_a : 0.000115s : 0.05% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000054s : 0.02% optimize.opt_a.loop_unroll : 0.000037s : 0.01% optimize.opt_a.a_1 : 0.001147s : 0.47% optimize.opt_a.with_stream_mark : 0.000039s : 0.02% optimize.opt_a.recompute_prepare : 0.000018s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000178s : 0.07% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.01% optimize.opt_a.shard : 0.000006s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000017s : 0.01% optimize.opt_a.auto_parallel : 0.000016s : 0.01% optimize.opt_a.parallel : 0.000032s : 0.01% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000009s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.01% optimize.opt_a.virtual_output : 0.000014s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000007s : 0.00% optimize.opt_a.offload_activation : 0.000022s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000026s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000032s : 0.01% optimize.opt_a.a_after_grad : 0.000021s : 0.01% optimize.opt_a.renormalize : 0.001403s : 0.57% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.01% optimize.opt_a.cse : 0.000060s : 0.02% optimize.opt_a.a_3 : 0.000186s : 0.08% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000046s : 0.02% optimize.convert_after_rewriter : 0.000007s : 0.00% optimize.order_py_execute_after_rewriter : 0.000005s : 0.00% optimize.mutable_eliminate : 0.001126s : 0.46% optimize.opt_b.b_1 : 0.000134s : 0.05% optimize.opt_b.b_2 : 0.000010s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000025s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.01% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000036s : 0.01% optimize.loop_unroll : 0.000756s : 0.31% optimize.opt_after_cconv.c_1 : 0.000037s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000026s : 0.01% optimize.opt_after_cconv.renormalize : 0.000002s : 0.00% optimize.remove_dup_value : 0.000018s : 0.01% optimize.tuple_transform.d_1 : 0.000061s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000062s : 0.03% optimize.cse_after_recomputation.cse : 0.000016s : 0.01% optimize.environ_conv : 0.000007s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000024s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.01% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.014380s : 5.88% validate : 0.000072s : 0.03% Time group info: ------[substitution.] 0.000428 34 9.44% : 0.000040s : 6: substitution.arithmetic_simplify 0.55% : 0.000002s : 2: substitution.elim_not_effective 0.49% : 0.000002s : 2: substitution.fold_const_symbol 1.50% : 0.000006s : 4: substitution.graph_param_transform 79.88% : 0.000342s : 4: substitution.inline 1.39% : 0.000006s : 4: substitution.j_node_and_user_rematch 1.23% : 0.000005s : 4: substitution.remove_not_recompute_node 1.43% : 0.000006s : 4: substitution.replace_old_param 4.08% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.222784 2 99.61% : 0.221911s : 1: type_inference.infer 0.39% : 0.000873s : 1: type_inference.specialize ------[replace.] 0.000159 8 29.96% : 0.000048s : 4: replace.inline 70.04% : 0.000112s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000354 8 95.61% : 0.000339s : 4: match.inline 4.39% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000287 1278 0.69% : 0.000002s : 13: predicate.accumulaten_eliminater 2.33% : 0.000007s : 4: predicate.ad_related_special_op_eliminate 0.80% : 0.000002s : 8: predicate.addn_check_dump 1.69% : 0.000005s : 13: predicate.addn_zero_filter 1.21% : 0.000003s : 13: predicate.adjust_all_reduce_mul_add 2.96% : 0.000008s : 21: predicate.arithmetic_simplify 0.81% : 0.000002s : 13: predicate.cast_eliminate 0.63% : 0.000002s : 8: predicate.check_bprop_eliminate 0.45% : 0.000001s : 8: predicate.compare_switch_simplify 0.14% : 0.000000s : 4: predicate.const_output_eliminate 0.89% : 0.000003s : 8: predicate.depend_value_elim 1.22% : 0.000003s : 13: predicate.dict_get_item_const_eliminator 1.08% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.68% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.51% : 0.000004s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000001s : 4: predicate.elim_not_effective 0.63% : 0.000002s : 4: predicate.elim_shapecalc_of_broadcastargs 0.95% : 0.000003s : 17: predicate.environ_add_const_eliminate 0.78% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.80% : 0.000002s : 17: predicate.environ_get_depend_swap 1.68% : 0.000005s : 25: predicate.environ_get_eliminate 0.83% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.05% : 0.000003s : 21: predicate.exchange_switch_depend_value 1.87% : 0.000005s : 21: predicate.float_depend_g_call 0.41% : 0.000001s : 8: predicate.float_environ_get_switch 0.53% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.12% : 0.000000s : 4: predicate.fold_const_symbol 1.67% : 0.000005s : 8: predicate.get_grad_eliminate 0.15% : 0.000000s : 4: predicate.graph_param_transform 0.41% : 0.000001s : 8: predicate.incorporate_call 0.34% : 0.000001s : 8: predicate.incorporate_call_switch 4.73% : 0.000014s : 58: predicate.inline 0.53% : 0.000002s : 8: predicate.inline_without_move 0.24% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.26% : 0.000004s : 8: predicate.less_batch_normalization 1.50% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.01% : 0.000006s : 38: predicate.load_eliminater 1.34% : 0.000004s : 4: predicate.loop_unroll_after_grad 1.72% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.71% : 0.000005s : 21: predicate.make_slice_get_slice_eliminator 0.87% : 0.000002s : 8: predicate.merge_addn 0.56% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.71% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.60% : 0.000002s : 13: predicate.minmaximum_grad 1.48% : 0.000004s : 4: predicate.mutable_eliminate 0.41% : 0.000001s : 4: predicate.opt_reshape 0.49% : 0.000001s : 4: predicate.parallel_virtual_node 1.91% : 0.000005s : 21: predicate.partial_defer_inline 1.19% : 0.000003s : 21: predicate.partial_eliminate 0.71% : 0.000002s : 13: predicate.print_const_string_wrapper 0.99% : 0.000003s : 8: predicate.reduce_all_const_elim 1.38% : 0.000004s : 13: predicate.reduce_eliminate 2.02% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.29% : 0.000001s : 8: predicate.remove_not_recompute_node 0.95% : 0.000003s : 25: predicate.replace_applicator 0.41% : 0.000001s : 8: predicate.replace_old_param 0.18% : 0.000001s : 4: predicate.reset_defer_inline 1.05% : 0.000003s : 13: predicate.reshape_eliminate 1.06% : 0.000003s : 8: predicate.row_tensor_add_zeros_like 0.60% : 0.000002s : 4: predicate.row_tensor_eliminate 0.76% : 0.000002s : 8: predicate.same_eliminate 0.34% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.77% : 0.000005s : 8: predicate.shard_identity_eliminate 0.68% : 0.000002s : 8: predicate.special_op_eliminate 0.50% : 0.000001s : 8: predicate.specialize_transform 1.26% : 0.000004s : 8: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.24% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.20% : 0.000003s : 21: predicate.switch_defer_inline 2.30% : 0.000007s : 29: predicate.switch_layer_defer_inline 3.55% : 0.000010s : 67: predicate.switch_simplify 0.63% : 0.000002s : 13: predicate.tile_eliminate 0.68% : 0.000002s : 13: predicate.transpose_eliminate 2.60% : 0.000007s : 21: predicate.tuple_list_convert_item_index_to_positive 2.31% : 0.000007s : 21: predicate.tuple_list_get_item_const_eliminator 2.87% : 0.000008s : 21: predicate.tuple_list_get_item_depend_reorder 2.65% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.19% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.89% : 0.000008s : 29: predicate.tuple_list_set_item_eliminator 1.30% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.05% : 0.000006s : 38: predicate.updatestate_pure_node_eliminater 2.38% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.78% : 0.000002s : 4: predicate.value_based_eliminate 0.57% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.88% : 0.000003s : 8: predicate.virtual_output_eliminate 0.18% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.30% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000930 11 60.78% : 0.000565s : 5: func_graph_cloner_run.FuncGraphClonerGraph 39.22% : 0.000365s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.268392 192 0.00% : 0.000004s : 1: ForceFp32Comm 2.14% : 0.005733s : 1: add_attr 2.13% : 0.005716s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.02% : 0.000066s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.03% : 0.000074s : 1: auto_monad 0.01% : 0.000022s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.26% : 0.000692s : 1: bootstrap 0.01% : 0.000040s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000018s : 1: control_data_broadcast_order 0.00% : 0.000011s : 1: convert_after_rewriter 0.01% : 0.000030s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000010s : 1: environ_conv 0.01% : 0.000027s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.29% : 0.000767s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.42% : 0.001139s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000024s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000023s : 1: opt.transform.mutable_eliminate 0.64% : 0.001713s : 78: opt.transform.opt_a 0.01% : 0.000034s : 1: opt.transform.opt_after_cconv 0.02% : 0.000057s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000110s : 28: opt.transform.opt_b 0.02% : 0.000066s : 2: opt.transform.opt_trans_graph 0.02% : 0.000042s : 4: opt.transform.symbol_engine_opt 1.59% : 0.004262s : 1: opt_a 0.05% : 0.000138s : 1: opt_after_cconv 5.37% : 0.014410s : 1: opt_after_jit_grad 0.09% : 0.000233s : 1: opt_b 2.85% : 0.007650s : 1: optimize 0.01% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.01% : 0.000027s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000052s : 1: pre_auto_parallel 0.08% : 0.000225s : 1: py_interpret_to_execute 0.01% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000022s : 1: remove_dup_value 0.33% : 0.000883s : 1: renormalize.infer 0.19% : 0.000509s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000053s : 1: rewriter_after_opt_a 0.04% : 0.000120s : 1: rewriter_before_opt_a 0.00% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000007s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000091s : 1: symbol_engine_optimizer 0.04% : 0.000100s : 1: tuple_transform 83.05% : 0.222900s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:00.865.282 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:00.866.668 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.403466, [21] [bootstrap]: 0.00108444 [type_inference]: 0.19796 [event_method]: 2.296e-05 [auto_monad]: 7.577e-05 [graph_reusing]: 6.63e-06 [inline]: 3.73999e-06 [add_attr]: 0.00674444, [1] [add_attr_with_inline]: 0.00673025, [1] [Cycle 1]: 0.00011938, [2] [tag_attr]: 2.746e-05 [meta_addattr_fg_expand]: 6.16e-06 [parallel-infer-symbol]: 3.8e-06 [pre_auto_parallel]: 4.527e-05 [insert-virtual-dataset]: 2.36e-06 [parallel-infer-symbol-second]: 1.14e-06 [dataset_repeat_opt]: 2.13002e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.194685, [53] [py_interpret_to_execute]: 4.982e-05 [rewriter_before_opt_a]: 0.00010754 [opt_a]: 0.191266, [2] [Cycle 1]: 0.190295, [45] [expand_dump_flag]: 3.13e-06 [switch_simplify]: 4.407e-05 [loop_unroll]: 3.121e-05 [a_1]: 0.00093325 [with_stream_mark]: 2.65e-05 [recompute_prepare]: 9.82001e-06 [updatestate_depend_eliminate]: 5.17e-06 [updatestate_assign_eliminate]: 3.63e-06 [updatestate_loads_eliminate]: 3.08e-06 [parameter_eliminate]: 2.21998e-06 [a_2]: 0.186843 [accelerated_algorithm]: 1.793e-05 [shard]: 7.95998e-06 [meta_shard_fg_expand]: 4.96997e-06 [shard_inline]: 1.046e-05 [merge_send_recv]: 2.688e-05 [auto_parallel]: 2.066e-05 [parallel]: 4.486e-05 [flash_sp]: 1.611e-05 [merge_comm]: 4.35999e-06 [allreduce_fusion]: 4.02002e-06 [matmul_add_comm_reduction]: 1.434e-05 [allreduce_slice_to_reducescatter]: 8.50006e-07 [virtual_shard_identity]: 9.77001e-06 [virtual_dataset]: 7.82e-06 [get_grad_eliminate_]: 8.06001e-06 [virtual_output]: 4.97e-05 [merge_forward]: 6.79001e-06 [cell_reuse_recompute_pass]: 5.28002e-06 [offload_activation]: 1.286e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.622e-05 [merge_recompute_call_nodes]: 1.85001e-06 [before_grad]: 1.376e-05 [set_forward_comm_id_for_comm_node_pass]: 4.37998e-06 [meta_fg_expand]: 5.59e-06 [flash_sp_send_recv_attached]: 2.79001e-06 [receive_attached]: 3.03e-06 [after_resolve]: 1.67e-05 [a_after_grad]: 1.279e-05 [renormalize]: 0.00137571 [add_forward_monad_depend]: 9.59e-06 [auto_monad_grad]: 2.64999e-06 [auto_monad_eliminator]: 2.357e-05 [cse]: 3.478e-05 [a_3]: 7.314e-05 [Cycle 2]: 0.00095052, [45] [expand_dump_flag]: 2.59001e-06 [switch_simplify]: 9.00999e-06 [loop_unroll]: 8.01001e-06 [a_1]: 0.00018893 [with_stream_mark]: 1.815e-05 [recompute_prepare]: 6.66e-06 [updatestate_depend_eliminate]: 3.48999e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 3.72002e-06 [parameter_eliminate]: 2.17999e-06 [a_2]: 0.00010796 [accelerated_algorithm]: 9.02e-06 [shard]: 1.72001e-06 [meta_shard_fg_expand]: 1.57999e-06 [shard_inline]: 7.78999e-06 [merge_send_recv]: 9.10999e-06 [auto_parallel]: 9.18002e-06 [parallel]: 1.046e-05 [flash_sp]: 5.59e-06 [merge_comm]: 3.4e-06 [allreduce_fusion]: 3.93001e-06 [matmul_add_comm_reduction]: 9.79e-06 [allreduce_slice_to_reducescatter]: 5.59987e-07 [virtual_shard_identity]: 6.53998e-06 [virtual_dataset]: 6.24001e-06 [get_grad_eliminate_]: 6.25002e-06 [virtual_output]: 7.77e-06 [merge_forward]: 4.97999e-06 [cell_reuse_recompute_pass]: 5.44e-06 [offload_activation]: 1.031e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.509e-05 [merge_recompute_call_nodes]: 1.02998e-06 [before_grad]: 1.073e-05 [set_forward_comm_id_for_comm_node_pass]: 3.6e-06 [meta_fg_expand]: 3.51001e-06 [flash_sp_send_recv_attached]: 1.20001e-06 [receive_attached]: 1.50001e-06 [after_resolve]: 1.317e-05 [a_after_grad]: 1.035e-05 [renormalize]: 1.19995e-07 [add_forward_monad_depend]: 1.50001e-06 [auto_monad_grad]: 1.29e-06 [auto_monad_eliminator]: 7.50998e-06 [cse]: 1.674e-05 [a_3]: 5.027e-05 [py_interpret_to_execute_after_opt_a]: 2.468e-05 [slice_cell_reuse_recomputed_activation]: 4.82e-06 [rewriter_after_opt_a]: 5.044e-05 [convert_after_rewriter]: 1.001e-05 [order_py_execute_after_rewriter]: 1.005e-05 [mutable_eliminate]: 0.00105841 [opt_b]: 0.00031305, [1] [Cycle 1]: 0.00029879, [7] [b_1]: 0.00018654 [b_2]: 1.072e-05 [updatestate_depend_eliminate]: 6.24999e-06 [updatestate_assign_eliminate]: 2.49999e-06 [updatestate_loads_eliminate]: 2.31e-06 [renormalize]: 5.59987e-07 [cse]: 2.982e-05 [optimize_parallel_all_gather_comm]: 2.578e-05 [overlap_param_gather]: 6.28e-06 [cconv]: 4.159e-05 [loop_unroll]: 0.00075063 [opt_after_cconv]: 0.0001456, [1] [Cycle 1]: 0.00013315, [7] [c_1]: 3.505e-05 [parameter_eliminate]: 4.98001e-06 [updatestate_depend_eliminate]: 5.57001e-06 [updatestate_assign_eliminate]: 2.56e-06 [updatestate_loads_eliminate]: 2.46e-06 [cse]: 2.095e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 1.834e-05 [tuple_transform]: 0.00010559, [1] [Cycle 1]: 9.52e-05, [4] [d_1]: 5.452e-05 [none_parameter_eliminate]: 1.54e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 8.10999e-06 [partial_unused_args_eliminate]: 4.78001e-06 [add_recomputation]: 7.775e-05 [cse_after_recomputation]: 3.611e-05, [1] [Cycle 1]: 2.812e-05, [1] [cse]: 1.795e-05 [environ_conv]: 9.79e-06 [swap_dp_allreduce_reducescatter]: 7.81001e-06 [bias_add_comm_swap]: 7.20998e-06 [label_micro_interleaved_index]: 9.88002e-06 [label_fine_grained_interleaved_index]: 5.19998e-06 [merge_cast_opt]: 4.85001e-06 [slice_recompute_activation]: 4.53999e-06 [micro_interleaved_order_control]: 5.34998e-06 [assign_add_opt]: 3.77002e-06 [ForceFp32Comm]: 3.58999e-06 [remove_cast_before_assign_add]: 3.83999e-06 [full_micro_interleaved_order_control]: 4.58001e-06 [reorder_send_recv_between_fp_bp]: 6.18998e-06 [comm_op_add_attrs]: 3.36999e-06 [add_comm_op_reuse_tag]: 3.81999e-06 [interleave_split_concat_branches]: 4.03001e-06 [interleave_parallel_branches]: 3.89002e-06 [overlap_opt_shard_in_pipeline]: 6.41998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.77e-06 [control_data_broadcast_order]: 1.646e-05 [grouped_pairwise_exchange_alltoall]: 4.61002e-06 [offloading_packed_experts]: 8.05e-06 [overlap_recompute_and_grad_model_parallel]: 1.077e-05 [overlap_grad_matmul_and_grad_allreduce]: 3.58e-06 [overlap_recompute_allgather_and_fa_grad]: 3.78001e-06 [overlap_recompute_comm]: 6.98e-06 [overlap_grad_ring_attention]: 7.06999e-06 [overlap_grad_flash_sp]: 2.907e-05 [begin_end_overlap_inline]: 2.89999e-06 [split_matmul_comm_elemetwise]: 5.96e-06 [split_layernorm_comm]: 4.23999e-06 [handle_group_info]: 3.28e-06 [symbol_engine_optimizer]: 0.00012433, [1] [Cycle 1]: 0.00011491, [6] [build]: 8e-06 [elim_shapecalc]: 1.263e-05 [elim_not_effective]: 1.834e-05 [opt_reshape]: 7.98001e-06 [fold_const_symbol]: 1.19e-05 [renormalize]: 3.00002e-07 [detach_backward]: 1.163e-05 [pipeline_parallel_scheduler]: 2.21e-06 [auto_monad_reorder]: 2.831e-05 [get_jit_bprop_graph]: 2.19999e-06 [rewriter_after_jit_bprop_graph]: 8.81002e-06 [opt_after_jit_grad]: 0.00096534 [validate]: 5.895e-05 Sums bootstrap : 0.001084s : 0.28% type_inference : 0.197960s : 50.29% event_method : 0.000023s : 0.01% auto_monad : 0.000076s : 0.02% graph_reusing : 0.000007s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000045s : 0.01% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000050s : 0.01% optimize.rewriter_before_opt_a : 0.000108s : 0.03% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000053s : 0.01% optimize.opt_a.loop_unroll : 0.000039s : 0.01% optimize.opt_a.a_1 : 0.001122s : 0.29% optimize.opt_a.with_stream_mark : 0.000045s : 0.01% optimize.opt_a.recompute_prepare : 0.000016s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.186951s : 47.50% optimize.opt_a.accelerated_algorithm : 0.000027s : 0.01% optimize.opt_a.shard : 0.000010s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000007s : 0.00% optimize.opt_a.shard_inline : 0.000018s : 0.00% optimize.opt_a.merge_send_recv : 0.000036s : 0.01% optimize.opt_a.auto_parallel : 0.000030s : 0.01% optimize.opt_a.parallel : 0.000055s : 0.01% optimize.opt_a.flash_sp : 0.000022s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.00% optimize.opt_a.virtual_dataset : 0.000014s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.00% optimize.opt_a.virtual_output : 0.000057s : 0.01% optimize.opt_a.merge_forward : 0.000012s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000011s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000041s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000024s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.00% optimize.opt_a.meta_fg_expand : 0.000009s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000030s : 0.01% optimize.opt_a.a_after_grad : 0.000023s : 0.01% optimize.opt_a.renormalize : 0.001376s : 0.35% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.01% optimize.opt_a.cse : 0.000052s : 0.01% optimize.opt_a.a_3 : 0.000123s : 0.03% optimize.py_interpret_to_execute_after_opt_a : 0.000025s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000050s : 0.01% optimize.convert_after_rewriter : 0.000010s : 0.00% optimize.order_py_execute_after_rewriter : 0.000010s : 0.00% optimize.mutable_eliminate : 0.001058s : 0.27% optimize.opt_b.b_1 : 0.000187s : 0.05% optimize.opt_b.b_2 : 0.000011s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.01% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.000042s : 0.01% optimize.loop_unroll : 0.000751s : 0.19% optimize.opt_after_cconv.c_1 : 0.000035s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000021s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.00% optimize.tuple_transform.d_1 : 0.000055s : 0.01% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000078s : 0.02% optimize.cse_after_recomputation.cse : 0.000018s : 0.00% optimize.environ_conv : 0.000010s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.00% optimize.bias_add_comm_swap : 0.000007s : 0.00% optimize.label_micro_interleaved_index : 0.000010s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000005s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000003s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000006s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000016s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000011s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000007s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000029s : 0.01% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000006s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000008s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000012s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000028s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000009s : 0.00% opt_after_jit_grad : 0.000965s : 0.25% validate : 0.000059s : 0.01% Time group info: ------[substitution.] 0.000382 34 12.13% : 0.000046s : 6: substitution.arithmetic_simplify 0.97% : 0.000004s : 2: substitution.elim_not_effective 0.35% : 0.000001s : 2: substitution.fold_const_symbol 1.74% : 0.000007s : 4: substitution.graph_param_transform 74.32% : 0.000284s : 4: substitution.inline 1.77% : 0.000007s : 4: substitution.j_node_and_user_rematch 1.71% : 0.000007s : 4: substitution.remove_not_recompute_node 2.48% : 0.000009s : 4: substitution.replace_old_param 4.54% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.197881 2 98.86% : 0.195627s : 1: type_inference.infer 1.14% : 0.002255s : 1: type_inference.specialize ------[replace.] 0.000074 8 65.27% : 0.000049s : 4: replace.inline 34.73% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000296 8 94.89% : 0.000281s : 4: match.inline 5.11% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000280 1278 0.78% : 0.000002s : 13: predicate.accumulaten_eliminater 0.70% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.42% : 0.000001s : 8: predicate.addn_check_dump 2.41% : 0.000007s : 13: predicate.addn_zero_filter 0.69% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.47% : 0.000007s : 21: predicate.arithmetic_simplify 1.12% : 0.000003s : 13: predicate.cast_eliminate 0.66% : 0.000002s : 8: predicate.check_bprop_eliminate 0.42% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.62% : 0.000002s : 8: predicate.depend_value_elim 0.72% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.37% : 0.000004s : 13: predicate.dict_get_item_eliminator 0.74% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.86% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 4: predicate.elim_not_effective 0.73% : 0.000002s : 4: predicate.elim_shapecalc_of_broadcastargs 1.30% : 0.000004s : 17: predicate.environ_add_const_eliminate 1.42% : 0.000004s : 17: predicate.environ_get_add_eliminate 0.90% : 0.000003s : 17: predicate.environ_get_depend_swap 3.61% : 0.000010s : 25: predicate.environ_get_eliminate 1.02% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.42% : 0.000004s : 21: predicate.exchange_switch_depend_value 1.98% : 0.000006s : 21: predicate.float_depend_g_call 0.68% : 0.000002s : 8: predicate.float_environ_get_switch 0.65% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.56% : 0.000002s : 8: predicate.get_grad_eliminate 0.16% : 0.000000s : 4: predicate.graph_param_transform 1.02% : 0.000003s : 8: predicate.incorporate_call 0.35% : 0.000001s : 8: predicate.incorporate_call_switch 5.57% : 0.000016s : 58: predicate.inline 0.63% : 0.000002s : 8: predicate.inline_without_move 0.24% : 0.000001s : 8: predicate.j_node_and_user_rematch 2.08% : 0.000006s : 8: predicate.less_batch_normalization 1.42% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.48% : 0.000007s : 38: predicate.load_eliminater 0.87% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.88% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.71% : 0.000005s : 21: predicate.make_slice_get_slice_eliminator 0.42% : 0.000001s : 8: predicate.merge_addn 0.47% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.58% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 13: predicate.minmaximum_grad 1.44% : 0.000004s : 4: predicate.mutable_eliminate 0.40% : 0.000001s : 4: predicate.opt_reshape 0.56% : 0.000002s : 4: predicate.parallel_virtual_node 1.56% : 0.000004s : 21: predicate.partial_defer_inline 1.21% : 0.000003s : 21: predicate.partial_eliminate 0.83% : 0.000002s : 13: predicate.print_const_string_wrapper 1.06% : 0.000003s : 8: predicate.reduce_all_const_elim 1.31% : 0.000004s : 13: predicate.reduce_eliminate 1.83% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 8: predicate.remove_not_recompute_node 1.00% : 0.000003s : 25: predicate.replace_applicator 0.45% : 0.000001s : 8: predicate.replace_old_param 0.20% : 0.000001s : 4: predicate.reset_defer_inline 1.27% : 0.000004s : 13: predicate.reshape_eliminate 0.97% : 0.000003s : 8: predicate.row_tensor_add_zeros_like 0.66% : 0.000002s : 4: predicate.row_tensor_eliminate 0.52% : 0.000001s : 8: predicate.same_eliminate 0.32% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.57% : 0.000002s : 8: predicate.shard_identity_eliminate 1.12% : 0.000003s : 8: predicate.special_op_eliminate 0.54% : 0.000002s : 8: predicate.specialize_transform 1.37% : 0.000004s : 8: predicate.split_environ_get_set_with_tuple_value 0.92% : 0.000003s : 8: predicate.stack_unstack_eliminate 0.24% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.15% : 0.000003s : 21: predicate.switch_defer_inline 1.55% : 0.000004s : 29: predicate.switch_layer_defer_inline 3.73% : 0.000010s : 67: predicate.switch_simplify 0.66% : 0.000002s : 13: predicate.tile_eliminate 0.71% : 0.000002s : 13: predicate.transpose_eliminate 1.31% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.35% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.17% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 5.04% : 0.000014s : 33: predicate.tuple_list_get_item_eliminator 2.08% : 0.000006s : 21: predicate.tuple_list_get_set_item_eliminator 2.98% : 0.000008s : 29: predicate.tuple_list_set_item_eliminator 1.33% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 1.93% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.34% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.55% : 0.000002s : 4: predicate.value_based_eliminate 0.52% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.82% : 0.000002s : 8: predicate.virtual_output_eliminate 0.18% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001279 11 59.18% : 0.000757s : 5: func_graph_cloner_run.FuncGraphClonerGraph 40.82% : 0.000522s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.608106 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.11% : 0.006758s : 1: add_attr 1.11% : 0.006735s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.01% : 0.000081s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.01% : 0.000085s : 1: auto_monad 0.01% : 0.000036s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000010s : 1: bias_add_comm_swap 0.19% : 0.001151s : 1: bootstrap 0.01% : 0.000045s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.00% : 0.000020s : 1: control_data_broadcast_order 0.00% : 0.000013s : 1: convert_after_rewriter 0.01% : 0.000040s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000043s : 1: detach_backward 0.00% : 0.000013s : 1: environ_conv 0.01% : 0.000035s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000010s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000013s : 1: label_micro_interleaved_index 0.12% : 0.000759s : 1: loop_unroll 0.00% : 0.000008s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.18% : 0.001067s : 1: mutable_eliminate 0.00% : 0.000011s : 1: offloading_packed_experts 0.00% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000023s : 1: opt.transform.mutable_eliminate 0.28% : 0.001697s : 78: opt.transform.opt_a 0.01% : 0.000033s : 1: opt.transform.opt_after_cconv 0.01% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.02% : 0.000116s : 28: opt.transform.opt_b 0.01% : 0.000059s : 2: opt.transform.opt_trans_graph 0.01% : 0.000046s : 4: opt.transform.symbol_engine_opt 31.45% : 0.191271s : 1: opt_a 0.02% : 0.000149s : 1: opt_after_cconv 0.16% : 0.000980s : 1: opt_after_jit_grad 0.05% : 0.000316s : 1: opt_b 32.25% : 0.196098s : 1: optimize 0.00% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000014s : 1: order_py_execute_after_rewriter 0.01% : 0.000032s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000009s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000009s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000014s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000010s : 1: overlap_recompute_comm 0.00% : 0.000012s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000009s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.01% : 0.000053s : 1: pre_auto_parallel 0.01% : 0.000054s : 1: py_interpret_to_execute 0.00% : 0.000029s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.00% : 0.000022s : 1: remove_dup_value 0.13% : 0.000765s : 1: renormalize.infer 0.10% : 0.000595s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000017s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000054s : 1: rewriter_after_opt_a 0.02% : 0.000112s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000007s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000009s : 1: split_matmul_comm_elemetwise 0.00% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000129s : 1: symbol_engine_optimizer 0.02% : 0.000108s : 1: tuple_transform 32.56% : 0.198023s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:06.183.357 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.199202, [21] [bootstrap]: 0.00055625 [type_inference]: 0.00974617 [event_method]: 2.096e-05 [auto_monad]: 6.137e-05 [graph_reusing]: 4.92e-06 [inline]: 3.66999e-06 [add_attr]: 0.181094, [1] [add_attr_with_inline]: 0.181078, [1] [Cycle 1]: 8.751e-05, [2] [tag_attr]: 3.043e-05 [meta_addattr_fg_expand]: 5.56e-06 [parallel-infer-symbol]: 3.9e-06 [pre_auto_parallel]: 4.823e-05 [insert-virtual-dataset]: 2.40997e-06 [parallel-infer-symbol-second]: 1.25001e-06 [dataset_repeat_opt]: 1.89e-06 [pipeline_split]: 1.65001e-06 [optimize]: 0.00660048, [53] [py_interpret_to_execute]: 3.49e-05 [rewriter_before_opt_a]: 0.00010595 [opt_a]: 0.00348883, [2] [Cycle 1]: 0.00270702, [45] [expand_dump_flag]: 2.81e-06 [switch_simplify]: 4.294e-05 [loop_unroll]: 2.983e-05 [a_1]: 0.00072212 [with_stream_mark]: 2.004e-05 [recompute_prepare]: 9.29998e-06 [updatestate_depend_eliminate]: 3.92998e-06 [updatestate_assign_eliminate]: 3.40998e-06 [updatestate_loads_eliminate]: 3.14999e-06 [parameter_eliminate]: 1.91e-06 [a_2]: 9.086e-05 [accelerated_algorithm]: 8.08001e-06 [shard]: 1.99999e-06 [meta_shard_fg_expand]: 1.87999e-06 [shard_inline]: 7.97e-06 [merge_send_recv]: 9.72001e-06 [auto_parallel]: 6.74001e-06 [parallel]: 2.248e-05 [flash_sp]: 8.49998e-06 [merge_comm]: 4.87e-06 [allreduce_fusion]: 4.03001e-06 [matmul_add_comm_reduction]: 1.076e-05 [allreduce_slice_to_reducescatter]: 7.59988e-07 [virtual_shard_identity]: 8.48001e-06 [virtual_dataset]: 7.18e-06 [get_grad_eliminate_]: 6.27001e-06 [virtual_output]: 6.80998e-06 [merge_forward]: 3.68e-06 [cell_reuse_recompute_pass]: 1.62001e-06 [offload_activation]: 1.058e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.16e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 1.075e-05 [set_forward_comm_id_for_comm_node_pass]: 3.50998e-06 [meta_fg_expand]: 3.43e-06 [flash_sp_send_recv_attached]: 2.66e-06 [receive_attached]: 2.64001e-06 [after_resolve]: 1.155e-05 [a_after_grad]: 1.051e-05 [renormalize]: 0.00119908 [add_forward_monad_depend]: 5.42999e-06 [auto_monad_grad]: 2.73e-06 [auto_monad_eliminator]: 1.851e-05 [cse]: 3.607e-05 [a_3]: 6.247e-05 [Cycle 2]: 0.0007697, [45] [expand_dump_flag]: 1.76003e-06 [switch_simplify]: 8.58001e-06 [loop_unroll]: 6.73e-06 [a_1]: 0.00016136 [with_stream_mark]: 1.533e-05 [recompute_prepare]: 6.75002e-06 [updatestate_depend_eliminate]: 4.28999e-06 [updatestate_assign_eliminate]: 2.33002e-06 [updatestate_loads_eliminate]: 2.52001e-06 [parameter_eliminate]: 1.27e-06 [a_2]: 0.00010226 [accelerated_algorithm]: 7.83999e-06 [shard]: 1.71e-06 [meta_shard_fg_expand]: 1.87999e-06 [shard_inline]: 7.40998e-06 [merge_send_recv]: 8.06001e-06 [auto_parallel]: 9.34e-06 [parallel]: 7.89997e-06 [flash_sp]: 3.98001e-06 [merge_comm]: 3.18e-06 [allreduce_fusion]: 5.14e-06 [matmul_add_comm_reduction]: 8.05999e-06 [allreduce_slice_to_reducescatter]: 8.10018e-07 [virtual_shard_identity]: 6.89999e-06 [virtual_dataset]: 6.76e-06 [get_grad_eliminate_]: 9.05001e-06 [virtual_output]: 6.07999e-06 [merge_forward]: 2.79999e-06 [cell_reuse_recompute_pass]: 2.17001e-06 [offload_activation]: 8.94e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.428e-05 [merge_recompute_call_nodes]: 8.89995e-07 [before_grad]: 9.84001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.4e-06 [meta_fg_expand]: 2.07001e-06 [flash_sp_send_recv_attached]: 1.42999e-06 [receive_attached]: 3.48999e-06 [after_resolve]: 1.185e-05 [a_after_grad]: 9.96e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.37999e-06 [auto_monad_grad]: 1.32e-06 [auto_monad_eliminator]: 8.74e-06 [cse]: 1.595e-05 [a_3]: 4.074e-05 [py_interpret_to_execute_after_opt_a]: 1.282e-05 [slice_cell_reuse_recomputed_activation]: 2.01e-06 [rewriter_after_opt_a]: 3.908e-05 [convert_after_rewriter]: 7.15e-06 [order_py_execute_after_rewriter]: 4.92999e-06 [mutable_eliminate]: 0.00113486 [opt_b]: 0.0002178, [1] [Cycle 1]: 0.00020921, [7] [b_1]: 0.00013241 [b_2]: 8.60001e-06 [updatestate_depend_eliminate]: 5.57001e-06 [updatestate_assign_eliminate]: 2.44001e-06 [updatestate_loads_eliminate]: 2.31998e-06 [renormalize]: 9.79984e-07 [cse]: 2.031e-05 [optimize_parallel_all_gather_comm]: 1.774e-05 [overlap_param_gather]: 2.88e-06 [cconv]: 3.104e-05 [loop_unroll]: 0.00072558 [opt_after_cconv]: 0.000149, [1] [Cycle 1]: 0.00014017, [7] [c_1]: 3.253e-05 [parameter_eliminate]: 3.13e-06 [updatestate_depend_eliminate]: 5.09e-06 [updatestate_assign_eliminate]: 2.40002e-06 [updatestate_loads_eliminate]: 2.49999e-06 [cse]: 2.091e-05 [renormalize]: 1.10001e-06 [remove_dup_value]: 1.618e-05 [tuple_transform]: 8.872e-05, [1] [Cycle 1]: 8.136e-05, [4] [d_1]: 5.171e-05 [none_parameter_eliminate]: 1.77999e-06 [renormalize]: 2.60014e-07 [switch_simplify]: 7.56999e-06 [partial_unused_args_eliminate]: 1.70001e-06 [add_recomputation]: 6.022e-05 [cse_after_recomputation]: 2.411e-05, [1] [Cycle 1]: 1.792e-05, [1] [cse]: 1.205e-05 [environ_conv]: 5.41002e-06 [swap_dp_allreduce_reducescatter]: 4.89e-06 [bias_add_comm_swap]: 3.12002e-06 [label_micro_interleaved_index]: 5.73002e-06 [label_fine_grained_interleaved_index]: 2.97002e-06 [merge_cast_opt]: 2.01e-06 [slice_recompute_activation]: 2.24001e-06 [micro_interleaved_order_control]: 2.16e-06 [assign_add_opt]: 1.43002e-06 [ForceFp32Comm]: 8.39995e-07 [remove_cast_before_assign_add]: 9.79984e-07 [full_micro_interleaved_order_control]: 1.97999e-06 [reorder_send_recv_between_fp_bp]: 2.56e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 1.00999e-06 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.12e-06 [overlap_opt_shard_in_pipeline]: 1.59998e-06 [overlap_opt_shard_grad_in_pipeline]: 1.94e-06 [control_data_broadcast_order]: 1.425e-05 [grouped_pairwise_exchange_alltoall]: 1.94e-06 [offloading_packed_experts]: 3.58e-06 [overlap_recompute_and_grad_model_parallel]: 5.70001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32999e-06 [overlap_recompute_comm]: 2.29999e-06 [overlap_grad_ring_attention]: 4.34002e-06 [overlap_grad_flash_sp]: 2.414e-05 [begin_end_overlap_inline]: 5.60016e-07 [split_matmul_comm_elemetwise]: 2.09e-06 [split_layernorm_comm]: 1.87001e-06 [handle_group_info]: 9.5999e-07 [symbol_engine_optimizer]: 8.696e-05, [1] [Cycle 1]: 8.236e-05, [6] [build]: 6.11e-06 [elim_shapecalc]: 1.272e-05 [elim_not_effective]: 1.367e-05 [opt_reshape]: 8.83001e-06 [fold_const_symbol]: 1.059e-05 [renormalize]: 2.50002e-07 [detach_backward]: 2.21e-06 [pipeline_parallel_scheduler]: 1.67999e-06 [auto_monad_reorder]: 1.822e-05 [get_jit_bprop_graph]: 2.21998e-06 [rewriter_after_jit_bprop_graph]: 5.09998e-06 [opt_after_jit_grad]: 0.00077529 [validate]: 6.29e-05 Sums bootstrap : 0.000556s : 3.27% type_inference : 0.009746s : 57.27% event_method : 0.000021s : 0.12% auto_monad : 0.000061s : 0.36% graph_reusing : 0.000005s : 0.03% inline : 0.000004s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000030s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000048s : 0.28% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000035s : 0.21% optimize.rewriter_before_opt_a : 0.000106s : 0.62% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000052s : 0.30% optimize.opt_a.loop_unroll : 0.000037s : 0.21% optimize.opt_a.a_1 : 0.000883s : 5.19% optimize.opt_a.with_stream_mark : 0.000035s : 0.21% optimize.opt_a.recompute_prepare : 0.000016s : 0.09% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000193s : 1.13% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.09% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000015s : 0.09% optimize.opt_a.merge_send_recv : 0.000018s : 0.10% optimize.opt_a.auto_parallel : 0.000016s : 0.09% optimize.opt_a.parallel : 0.000030s : 0.18% optimize.opt_a.flash_sp : 0.000012s : 0.07% optimize.opt_a.merge_comm : 0.000008s : 0.05% optimize.opt_a.allreduce_fusion : 0.000009s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.11% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.09% optimize.opt_a.virtual_dataset : 0.000014s : 0.08% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.09% optimize.opt_a.virtual_output : 0.000013s : 0.08% optimize.opt_a.merge_forward : 0.000006s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.11% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.15% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000021s : 0.12% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.04% optimize.opt_a.meta_fg_expand : 0.000006s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000006s : 0.04% optimize.opt_a.after_resolve : 0.000023s : 0.14% optimize.opt_a.a_after_grad : 0.000020s : 0.12% optimize.opt_a.renormalize : 0.001199s : 7.05% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.16% optimize.opt_a.cse : 0.000052s : 0.31% optimize.opt_a.a_3 : 0.000103s : 0.61% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000039s : 0.23% optimize.convert_after_rewriter : 0.000007s : 0.04% optimize.order_py_execute_after_rewriter : 0.000005s : 0.03% optimize.mutable_eliminate : 0.001135s : 6.67% optimize.opt_b.b_1 : 0.000132s : 0.78% optimize.opt_b.b_2 : 0.000009s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000020s : 0.12% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.10% optimize.overlap_param_gather : 0.000003s : 0.02% optimize.cconv : 0.000031s : 0.18% optimize.loop_unroll : 0.000726s : 4.26% optimize.opt_after_cconv.c_1 : 0.000033s : 0.19% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000021s : 0.12% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000016s : 0.10% optimize.tuple_transform.d_1 : 0.000052s : 0.30% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.04% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000060s : 0.35% optimize.cse_after_recomputation.cse : 0.000012s : 0.07% optimize.environ_conv : 0.000005s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.03% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000006s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.03% optimize.overlap_grad_flash_sp : 0.000024s : 0.14% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000006s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.07% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.08% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.05% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.06% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.11% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.03% opt_after_jit_grad : 0.000775s : 4.56% validate : 0.000063s : 0.37% Time group info: ------[substitution.] 0.000273 34 14.33% : 0.000039s : 6: substitution.arithmetic_simplify 0.69% : 0.000002s : 2: substitution.elim_not_effective 0.61% : 0.000002s : 2: substitution.fold_const_symbol 2.80% : 0.000008s : 4: substitution.graph_param_transform 70.08% : 0.000191s : 4: substitution.inline 1.61% : 0.000004s : 4: substitution.j_node_and_user_rematch 1.77% : 0.000005s : 4: substitution.remove_not_recompute_node 2.19% : 0.000006s : 4: substitution.replace_old_param 5.93% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.009670 2 89.20% : 0.008626s : 1: type_inference.infer 10.80% : 0.001044s : 1: type_inference.specialize ------[replace.] 0.000069 8 64.70% : 0.000045s : 4: replace.inline 35.30% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000203 8 92.94% : 0.000188s : 4: match.inline 7.06% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000262 1278 0.71% : 0.000002s : 13: predicate.accumulaten_eliminater 0.59% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.44% : 0.000001s : 8: predicate.addn_check_dump 0.99% : 0.000003s : 13: predicate.addn_zero_filter 0.66% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.46% : 0.000006s : 21: predicate.arithmetic_simplify 0.81% : 0.000002s : 13: predicate.cast_eliminate 0.57% : 0.000001s : 8: predicate.check_bprop_eliminate 0.43% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.71% : 0.000002s : 8: predicate.depend_value_elim 0.88% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.13% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.66% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.00% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 4: predicate.elim_not_effective 0.57% : 0.000002s : 4: predicate.elim_shapecalc_of_broadcastargs 1.94% : 0.000005s : 17: predicate.environ_add_const_eliminate 0.86% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.90% : 0.000002s : 17: predicate.environ_get_depend_swap 2.40% : 0.000006s : 25: predicate.environ_get_eliminate 0.89% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.18% : 0.000003s : 21: predicate.exchange_switch_depend_value 1.96% : 0.000005s : 21: predicate.float_depend_g_call 0.38% : 0.000001s : 8: predicate.float_environ_get_switch 0.62% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.71% : 0.000002s : 8: predicate.get_grad_eliminate 0.23% : 0.000001s : 4: predicate.graph_param_transform 0.44% : 0.000001s : 8: predicate.incorporate_call 0.38% : 0.000001s : 8: predicate.incorporate_call_switch 5.30% : 0.000014s : 58: predicate.inline 0.59% : 0.000002s : 8: predicate.inline_without_move 0.26% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.08% : 0.000003s : 8: predicate.less_batch_normalization 1.57% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 1.97% : 0.000005s : 38: predicate.load_eliminater 0.75% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.83% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.36% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.41% : 0.000001s : 8: predicate.merge_addn 0.46% : 0.000001s : 8: predicate.micro_step_allgather_replace 1.33% : 0.000003s : 8: predicate.mini_step_allgather_replace 0.63% : 0.000002s : 13: predicate.minmaximum_grad 0.81% : 0.000002s : 4: predicate.mutable_eliminate 0.28% : 0.000001s : 4: predicate.opt_reshape 0.64% : 0.000002s : 4: predicate.parallel_virtual_node 1.39% : 0.000004s : 21: predicate.partial_defer_inline 1.26% : 0.000003s : 21: predicate.partial_eliminate 1.35% : 0.000004s : 13: predicate.print_const_string_wrapper 8.51% : 0.000022s : 8: predicate.reduce_all_const_elim 1.28% : 0.000003s : 13: predicate.reduce_eliminate 1.95% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 8: predicate.remove_not_recompute_node 1.09% : 0.000003s : 25: predicate.replace_applicator 0.42% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 1.72% : 0.000004s : 13: predicate.reshape_eliminate 2.48% : 0.000006s : 8: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 4: predicate.row_tensor_eliminate 0.65% : 0.000002s : 8: predicate.same_eliminate 0.34% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.60% : 0.000002s : 8: predicate.shard_identity_eliminate 1.22% : 0.000003s : 8: predicate.special_op_eliminate 0.50% : 0.000001s : 8: predicate.specialize_transform 1.62% : 0.000004s : 8: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.20% : 0.000003s : 21: predicate.switch_defer_inline 1.60% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.15% : 0.000011s : 67: predicate.switch_simplify 0.71% : 0.000002s : 13: predicate.tile_eliminate 0.71% : 0.000002s : 13: predicate.transpose_eliminate 1.28% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.33% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.34% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 2.64% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.22% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 1.86% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.55% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 1.96% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.46% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 4: predicate.value_based_eliminate 0.47% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 8: predicate.virtual_output_eliminate 0.23% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.32% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000667 11 45.69% : 0.000305s : 5: func_graph_cloner_run.FuncGraphClonerGraph 54.31% : 0.000362s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.389587 192 0.00% : 0.000003s : 1: ForceFp32Comm 46.49% : 0.181102s : 1: add_attr 46.48% : 0.181083s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.02% : 0.000065s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.02% : 0.000066s : 1: auto_monad 0.01% : 0.000022s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.15% : 0.000594s : 1: bootstrap 0.01% : 0.000034s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000018s : 1: control_data_broadcast_order 0.00% : 0.000010s : 1: convert_after_rewriter 0.01% : 0.000027s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000009s : 1: environ_conv 0.01% : 0.000029s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000008s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.19% : 0.000738s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.29% : 0.001147s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000017s : 1: opt.transform.mutable_eliminate 0.35% : 0.001364s : 78: opt.transform.opt_a 0.01% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000107s : 28: opt.transform.opt_b 0.01% : 0.000056s : 2: opt.transform.opt_trans_graph 0.01% : 0.000041s : 4: opt.transform.symbol_engine_opt 0.90% : 0.003492s : 1: opt_a 0.04% : 0.000152s : 1: opt_after_cconv 0.20% : 0.000788s : 1: opt_after_jit_grad 0.06% : 0.000222s : 1: opt_b 1.70% : 0.006608s : 1: optimize 0.01% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.01% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.01% : 0.000053s : 1: pre_auto_parallel 0.01% : 0.000039s : 1: py_interpret_to_execute 0.00% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 0.18% : 0.000717s : 1: renormalize.infer 0.12% : 0.000473s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000043s : 1: rewriter_after_opt_a 0.03% : 0.000110s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000006s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000090s : 1: symbol_engine_optimizer 0.02% : 0.000092s : 1: tuple_transform 2.51% : 0.009776s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:11.428.835 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:11.429.276 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.20817, [21] [bootstrap]: 0.00046517 [type_inference]: 0.0066134 [event_method]: 0.18876 [auto_monad]: 0.00010365 [graph_reusing]: 6.36e-06 [inline]: 3.11001e-06 [add_attr]: 0.00400097, [1] [add_attr_with_inline]: 0.00398823, [1] [Cycle 1]: 9.961e-05, [2] [tag_attr]: 2.655e-05 [meta_addattr_fg_expand]: 6.24001e-06 [parallel-infer-symbol]: 3.78001e-06 [pre_auto_parallel]: 4.543e-05 [insert-virtual-dataset]: 2.64001e-06 [parallel-infer-symbol-second]: 8.50006e-07 [dataset_repeat_opt]: 2.25002e-06 [pipeline_split]: 2.29001e-06 [optimize]: 0.00670854, [53] [py_interpret_to_execute]: 3.746e-05 [rewriter_before_opt_a]: 0.00010439 [opt_a]: 0.00374945, [2] [Cycle 1]: 0.00275098, [45] [expand_dump_flag]: 3.46001e-06 [switch_simplify]: 4.667e-05 [loop_unroll]: 3.125e-05 [a_1]: 0.0007231 [with_stream_mark]: 2.673e-05 [recompute_prepare]: 1.188e-05 [updatestate_depend_eliminate]: 5.20999e-06 [updatestate_assign_eliminate]: 3.47002e-06 [updatestate_loads_eliminate]: 3.18e-06 [parameter_eliminate]: 2.37999e-06 [a_2]: 0.00012554 [accelerated_algorithm]: 8.22e-06 [shard]: 2.69999e-06 [meta_shard_fg_expand]: 2.73e-06 [shard_inline]: 7.46999e-06 [merge_send_recv]: 9.46e-06 [auto_parallel]: 8.67998e-06 [parallel]: 2.21e-05 [flash_sp]: 1.097e-05 [merge_comm]: 4.43999e-06 [allreduce_fusion]: 3.55998e-06 [matmul_add_comm_reduction]: 1.122e-05 [allreduce_slice_to_reducescatter]: 1.19e-06 [virtual_shard_identity]: 9.47001e-06 [virtual_dataset]: 7.94002e-06 [get_grad_eliminate_]: 6.69999e-06 [virtual_output]: 7.82e-06 [merge_forward]: 4.23001e-06 [cell_reuse_recompute_pass]: 1.56002e-06 [offload_activation]: 1.202e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.855e-05 [merge_recompute_call_nodes]: 1.69998e-06 [before_grad]: 1.35e-05 [set_forward_comm_id_for_comm_node_pass]: 4.3e-06 [meta_fg_expand]: 3.41999e-06 [flash_sp_send_recv_attached]: 3.55e-06 [receive_attached]: 2.49999e-06 [after_resolve]: 1.306e-05 [a_after_grad]: 1.084e-05 [renormalize]: 0.00096585 [add_forward_monad_depend]: 9.79e-06 [auto_monad_grad]: 2.60002e-06 [auto_monad_eliminator]: 2.172e-05 [cse]: 3.225e-05 [a_3]: 7.256e-05 [Cycle 2]: 0.00097887, [45] [expand_dump_flag]: 2.16998e-06 [switch_simplify]: 1.038e-05 [loop_unroll]: 6.54001e-06 [a_1]: 0.00015482 [with_stream_mark]: 2.057e-05 [recompute_prepare]: 6.61999e-06 [updatestate_depend_eliminate]: 4.23001e-06 [updatestate_assign_eliminate]: 3.55e-06 [updatestate_loads_eliminate]: 2.83e-06 [parameter_eliminate]: 1.44998e-06 [a_2]: 0.00013351 [accelerated_algorithm]: 7.76001e-06 [shard]: 2.66e-06 [meta_shard_fg_expand]: 2.10002e-06 [shard_inline]: 6.84999e-06 [merge_send_recv]: 8.22998e-06 [auto_parallel]: 9.24e-06 [parallel]: 8.66002e-06 [flash_sp]: 4.26001e-06 [merge_comm]: 4.52e-06 [allreduce_fusion]: 4.15e-06 [matmul_add_comm_reduction]: 8.92e-06 [allreduce_slice_to_reducescatter]: 9.5999e-07 [virtual_shard_identity]: 7.95e-06 [virtual_dataset]: 6.66999e-06 [get_grad_eliminate_]: 6.11998e-06 [virtual_output]: 6.20002e-06 [merge_forward]: 3.76001e-06 [cell_reuse_recompute_pass]: 2.62001e-06 [offload_activation]: 1.098e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.678e-05 [merge_recompute_call_nodes]: 1.27999e-06 [before_grad]: 9.54e-06 [set_forward_comm_id_for_comm_node_pass]: 4.47e-06 [meta_fg_expand]: 3.16999e-06 [flash_sp_send_recv_attached]: 1.45999e-06 [receive_attached]: 1.82999e-06 [after_resolve]: 1.397e-05 [a_after_grad]: 9.63002e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 2.78998e-06 [auto_monad_grad]: 2.37001e-06 [auto_monad_eliminator]: 1.205e-05 [cse]: 1.892e-05 [a_3]: 5.409e-05 [py_interpret_to_execute_after_opt_a]: 2.097e-05 [slice_cell_reuse_recomputed_activation]: 5.56e-06 [rewriter_after_opt_a]: 4.857e-05 [convert_after_rewriter]: 1.139e-05 [order_py_execute_after_rewriter]: 9.10001e-06 [mutable_eliminate]: 0.00086437 [opt_b]: 0.0003034, [1] [Cycle 1]: 0.00029111, [7] [b_1]: 0.00017481 [b_2]: 9.75002e-06 [updatestate_depend_eliminate]: 1.101e-05 [updatestate_assign_eliminate]: 2.91e-06 [updatestate_loads_eliminate]: 3.09999e-06 [renormalize]: 1.00001e-06 [cse]: 2.786e-05 [optimize_parallel_all_gather_comm]: 2.438e-05 [overlap_param_gather]: 5.70001e-06 [cconv]: 4.12e-05 [loop_unroll]: 0.00055025 [opt_after_cconv]: 0.00014236, [1] [Cycle 1]: 0.00013314, [7] [c_1]: 3.338e-05 [parameter_eliminate]: 6.12001e-06 [updatestate_depend_eliminate]: 6.96001e-06 [updatestate_assign_eliminate]: 2.78003e-06 [updatestate_loads_eliminate]: 2.98e-06 [cse]: 2.217e-05 [renormalize]: 5.69999e-07 [remove_dup_value]: 1.888e-05 [tuple_transform]: 9.929e-05, [1] [Cycle 1]: 9.186e-05, [4] [d_1]: 5.032e-05 [none_parameter_eliminate]: 1.92999e-06 [renormalize]: 4.09986e-07 [switch_simplify]: 8.03001e-06 [partial_unused_args_eliminate]: 4.80001e-06 [add_recomputation]: 5.978e-05 [cse_after_recomputation]: 2.979e-05, [1] [Cycle 1]: 2.264e-05, [1] [cse]: 1.29e-05 [environ_conv]: 9.10001e-06 [swap_dp_allreduce_reducescatter]: 8.25e-06 [bias_add_comm_swap]: 6.15002e-06 [label_micro_interleaved_index]: 7.98001e-06 [label_fine_grained_interleaved_index]: 5.31998e-06 [merge_cast_opt]: 4.4e-06 [slice_recompute_activation]: 4.79e-06 [micro_interleaved_order_control]: 4.68999e-06 [assign_add_opt]: 4.22003e-06 [ForceFp32Comm]: 3.43e-06 [remove_cast_before_assign_add]: 3.36001e-06 [full_micro_interleaved_order_control]: 4.62e-06 [reorder_send_recv_between_fp_bp]: 5.89999e-06 [comm_op_add_attrs]: 3.6e-06 [add_comm_op_reuse_tag]: 3.85998e-06 [interleave_split_concat_branches]: 3.45e-06 [interleave_parallel_branches]: 3.63e-06 [overlap_opt_shard_in_pipeline]: 3.56999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.55001e-06 [control_data_broadcast_order]: 1.77e-05 [grouped_pairwise_exchange_alltoall]: 3.97e-06 [offloading_packed_experts]: 7.10998e-06 [overlap_recompute_and_grad_model_parallel]: 7.92e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.07e-06 [overlap_recompute_allgather_and_fa_grad]: 2.278e-05 [overlap_recompute_comm]: 6.26e-06 [overlap_grad_ring_attention]: 8.92999e-06 [overlap_grad_flash_sp]: 2.78e-05 [begin_end_overlap_inline]: 3.51999e-06 [split_matmul_comm_elemetwise]: 5.52001e-06 [split_layernorm_comm]: 4.82998e-06 [handle_group_info]: 3.8e-06 [symbol_engine_optimizer]: 0.0001157, [1] [Cycle 1]: 0.00010746, [6] [build]: 4.74998e-06 [elim_shapecalc]: 1.45e-05 [elim_not_effective]: 1.519e-05 [opt_reshape]: 8.94998e-06 [fold_const_symbol]: 1.107e-05 [renormalize]: 2.60014e-07 [detach_backward]: 4.48001e-06 [pipeline_parallel_scheduler]: 2.00002e-06 [auto_monad_reorder]: 2.314e-05 [get_jit_bprop_graph]: 2.27001e-06 [rewriter_after_jit_bprop_graph]: 6.98e-06 [opt_after_jit_grad]: 0.00063262 [validate]: 5.107e-05 Sums bootstrap : 0.000465s : 0.23% type_inference : 0.006613s : 3.27% event_method : 0.188760s : 93.38% auto_monad : 0.000104s : 0.05% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000045s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000037s : 0.02% optimize.rewriter_before_opt_a : 0.000104s : 0.05% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000057s : 0.03% optimize.opt_a.loop_unroll : 0.000038s : 0.02% optimize.opt_a.a_1 : 0.000878s : 0.43% optimize.opt_a.with_stream_mark : 0.000047s : 0.02% optimize.opt_a.recompute_prepare : 0.000018s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000259s : 0.13% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.01% optimize.opt_a.merge_send_recv : 0.000018s : 0.01% optimize.opt_a.auto_parallel : 0.000018s : 0.01% optimize.opt_a.parallel : 0.000031s : 0.02% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000009s : 0.00% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.01% optimize.opt_a.virtual_dataset : 0.000015s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000014s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000027s : 0.01% optimize.opt_a.a_after_grad : 0.000020s : 0.01% optimize.opt_a.renormalize : 0.000966s : 0.48% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.02% optimize.opt_a.cse : 0.000051s : 0.03% optimize.opt_a.a_3 : 0.000127s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.00% optimize.rewriter_after_opt_a : 0.000049s : 0.02% optimize.convert_after_rewriter : 0.000011s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.00% optimize.mutable_eliminate : 0.000864s : 0.43% optimize.opt_b.b_1 : 0.000175s : 0.09% optimize.opt_b.b_2 : 0.000010s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000028s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.01% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.000041s : 0.02% optimize.loop_unroll : 0.000550s : 0.27% optimize.opt_after_cconv.c_1 : 0.000033s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000022s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.01% optimize.tuple_transform.d_1 : 0.000050s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000060s : 0.03% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000009s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.00% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000003s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000003s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000023s : 0.01% optimize.overlap_recompute_comm : 0.000006s : 0.00% optimize.overlap_grad_ring_attention : 0.000009s : 0.00% optimize.overlap_grad_flash_sp : 0.000028s : 0.01% optimize.begin_end_overlap_inline : 0.000004s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000006s : 0.00% optimize.split_layernorm_comm : 0.000005s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000023s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000633s : 0.31% validate : 0.000051s : 0.03% Time group info: ------[substitution.] 0.000277 34 15.48% : 0.000043s : 6: substitution.arithmetic_simplify 0.81% : 0.000002s : 2: substitution.elim_not_effective 0.64% : 0.000002s : 2: substitution.fold_const_symbol 2.47% : 0.000007s : 4: substitution.graph_param_transform 67.55% : 0.000187s : 4: substitution.inline 1.83% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.41% : 0.000007s : 4: substitution.remove_not_recompute_node 2.74% : 0.000008s : 4: substitution.replace_old_param 6.07% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006546 2 87.09% : 0.005700s : 1: type_inference.infer 12.91% : 0.000845s : 1: type_inference.specialize ------[replace.] 0.000072 8 62.76% : 0.000045s : 4: replace.inline 37.24% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000199 8 92.48% : 0.000184s : 4: match.inline 7.52% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000229 1278 0.86% : 0.000002s : 13: predicate.accumulaten_eliminater 1.01% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 8: predicate.addn_check_dump 0.92% : 0.000002s : 13: predicate.addn_zero_filter 0.75% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.65% : 0.000006s : 21: predicate.arithmetic_simplify 1.00% : 0.000002s : 13: predicate.cast_eliminate 0.79% : 0.000002s : 8: predicate.check_bprop_eliminate 0.47% : 0.000001s : 8: predicate.compare_switch_simplify 0.20% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 1.03% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.80% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.96% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 4: predicate.elim_not_effective 0.48% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.10% : 0.000003s : 17: predicate.environ_get_depend_swap 1.50% : 0.000003s : 25: predicate.environ_get_eliminate 0.96% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.37% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.45% : 0.000006s : 21: predicate.float_depend_g_call 0.57% : 0.000001s : 8: predicate.float_environ_get_switch 0.94% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.59% : 0.000001s : 8: predicate.get_grad_eliminate 0.27% : 0.000001s : 4: predicate.graph_param_transform 0.51% : 0.000001s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 6.72% : 0.000015s : 58: predicate.inline 0.81% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.83% : 0.000002s : 8: predicate.less_batch_normalization 1.58% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.32% : 0.000005s : 38: predicate.load_eliminater 1.20% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.06% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.64% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 13: predicate.minmaximum_grad 1.47% : 0.000003s : 4: predicate.mutable_eliminate 0.48% : 0.000001s : 4: predicate.opt_reshape 0.39% : 0.000001s : 4: predicate.parallel_virtual_node 1.82% : 0.000004s : 21: predicate.partial_defer_inline 1.54% : 0.000004s : 21: predicate.partial_eliminate 1.01% : 0.000002s : 13: predicate.print_const_string_wrapper 0.56% : 0.000001s : 8: predicate.reduce_all_const_elim 1.16% : 0.000003s : 13: predicate.reduce_eliminate 2.41% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.55% : 0.000001s : 8: predicate.remove_not_recompute_node 1.33% : 0.000003s : 25: predicate.replace_applicator 0.53% : 0.000001s : 8: predicate.replace_old_param 0.47% : 0.000001s : 4: predicate.reset_defer_inline 0.87% : 0.000002s : 13: predicate.reshape_eliminate 0.95% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 4: predicate.row_tensor_eliminate 0.74% : 0.000002s : 8: predicate.same_eliminate 0.59% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.86% : 0.000002s : 8: predicate.shard_identity_eliminate 0.62% : 0.000001s : 8: predicate.special_op_eliminate 0.76% : 0.000002s : 8: predicate.specialize_transform 1.01% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.48% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.41% : 0.000003s : 21: predicate.switch_defer_inline 1.85% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.91% : 0.000011s : 67: predicate.switch_simplify 0.83% : 0.000002s : 13: predicate.tile_eliminate 0.82% : 0.000002s : 13: predicate.transpose_eliminate 1.61% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.82% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.30% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.21% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.48% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.29% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.53% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.19% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.88% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 4: predicate.value_based_eliminate 0.77% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.83% : 0.000002s : 8: predicate.virtual_output_eliminate 0.22% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000716 11 57.85% : 0.000414s : 5: func_graph_cloner_run.FuncGraphClonerGraph 42.15% : 0.000302s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.221339 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.81% : 0.004014s : 1: add_attr 1.80% : 0.003992s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.03% : 0.000063s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.05% : 0.000120s : 1: auto_monad 0.01% : 0.000032s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.23% : 0.000512s : 1: bootstrap 0.02% : 0.000045s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000021s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.02% : 0.000033s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000021s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 85.31% : 0.188828s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.25% : 0.000559s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.40% : 0.000874s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000023s : 1: opt.transform.mutable_eliminate 0.61% : 0.001359s : 78: opt.transform.opt_a 0.01% : 0.000032s : 1: opt.transform.opt_after_cconv 0.01% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.05% : 0.000108s : 28: opt.transform.opt_b 0.03% : 0.000056s : 2: opt.transform.opt_trans_graph 0.02% : 0.000044s : 4: opt.transform.symbol_engine_opt 1.70% : 0.003753s : 1: opt_a 0.07% : 0.000146s : 1: opt_after_cconv 0.29% : 0.000646s : 1: opt_after_jit_grad 0.14% : 0.000307s : 1: opt_b 3.20% : 0.007087s : 1: optimize 0.01% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.01% : 0.000031s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000034s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000010s : 1: overlap_recompute_comm 0.00% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.02% : 0.000054s : 1: pre_auto_parallel 0.02% : 0.000042s : 1: py_interpret_to_execute 0.01% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000022s : 1: remove_dup_value 0.24% : 0.000523s : 1: renormalize.infer 0.19% : 0.000429s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000052s : 1: rewriter_after_opt_a 0.05% : 0.000108s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000009s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000119s : 1: symbol_engine_optimizer 0.05% : 0.000102s : 1: tuple_transform 3.01% : 0.006663s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:17.190.87 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.159887, [21] [bootstrap]: 0.00045765 [type_inference]: 0.148829 [event_method]: 2.297e-05 [auto_monad]: 9.32e-05 [graph_reusing]: 6.69999e-06 [inline]: 2.72001e-06 [add_attr]: 0.00382285, [1] [add_attr_with_inline]: 0.00380938, [1] [Cycle 1]: 7.611e-05, [2] [tag_attr]: 2.514e-05 [meta_addattr_fg_expand]: 6.24999e-06 [parallel-infer-symbol]: 4.12e-06 [pre_auto_parallel]: 4.228e-05 [insert-virtual-dataset]: 2.56e-06 [parallel-infer-symbol-second]: 8.60018e-07 [dataset_repeat_opt]: 2.12001e-06 [pipeline_split]: 1.77001e-06 [optimize]: 0.00581727, [53] [py_interpret_to_execute]: 3.244e-05 [rewriter_before_opt_a]: 0.00010015 [opt_a]: 0.00315585, [2] [Cycle 1]: 0.0024563, [45] [expand_dump_flag]: 3.40003e-06 [switch_simplify]: 4.927e-05 [loop_unroll]: 5.462e-05 [a_1]: 0.00073077 [with_stream_mark]: 2.089e-05 [recompute_prepare]: 1.149e-05 [updatestate_depend_eliminate]: 4.50001e-06 [updatestate_assign_eliminate]: 4.33999e-06 [updatestate_loads_eliminate]: 3.51001e-06 [parameter_eliminate]: 1.91998e-06 [a_2]: 0.0001039 [accelerated_algorithm]: 9.32999e-06 [shard]: 2.16e-06 [meta_shard_fg_expand]: 2.19999e-06 [shard_inline]: 7.10998e-06 [merge_send_recv]: 9.42001e-06 [auto_parallel]: 8.55001e-06 [parallel]: 3.618e-05 [flash_sp]: 1.039e-05 [merge_comm]: 6.20002e-06 [allreduce_fusion]: 4.29002e-06 [matmul_add_comm_reduction]: 1.022e-05 [allreduce_slice_to_reducescatter]: 6.99976e-07 [virtual_shard_identity]: 9.86998e-06 [virtual_dataset]: 7.84002e-06 [get_grad_eliminate_]: 6.81001e-06 [virtual_output]: 7.50003e-06 [merge_forward]: 4.10998e-06 [cell_reuse_recompute_pass]: 1.47001e-06 [offload_activation]: 1.253e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.428e-05 [merge_recompute_call_nodes]: 1.59998e-06 [before_grad]: 1.284e-05 [set_forward_comm_id_for_comm_node_pass]: 4.03999e-06 [meta_fg_expand]: 3.03998e-06 [flash_sp_send_recv_attached]: 2.69999e-06 [receive_attached]: 2.78e-06 [after_resolve]: 1.384e-05 [a_after_grad]: 1.137e-05 [renormalize]: 0.00085506 [add_forward_monad_depend]: 7.03998e-06 [auto_monad_grad]: 2.63998e-06 [auto_monad_eliminator]: 1.687e-05 [cse]: 3.068e-05 [a_3]: 5.112e-05 [Cycle 2]: 0.00068745, [45] [expand_dump_flag]: 1.25999e-06 [switch_simplify]: 8.37e-06 [loop_unroll]: 6.86001e-06 [a_1]: 0.00014557 [with_stream_mark]: 1.396e-05 [recompute_prepare]: 6.96999e-06 [updatestate_depend_eliminate]: 3.32002e-06 [updatestate_assign_eliminate]: 2.56998e-06 [updatestate_loads_eliminate]: 2.78e-06 [parameter_eliminate]: 1.39e-06 [a_2]: 7.909e-05 [accelerated_algorithm]: 6.37001e-06 [shard]: 1.35999e-06 [meta_shard_fg_expand]: 1.82999e-06 [shard_inline]: 6.16998e-06 [merge_send_recv]: 5.69e-06 [auto_parallel]: 5.59e-06 [parallel]: 5.84999e-06 [flash_sp]: 3.66001e-06 [merge_comm]: 3.75e-06 [allreduce_fusion]: 3.31999e-06 [matmul_add_comm_reduction]: 6.24001e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 9.46003e-06 [virtual_dataset]: 6.01e-06 [get_grad_eliminate_]: 5.68002e-06 [virtual_output]: 5.86998e-06 [merge_forward]: 3.68999e-06 [cell_reuse_recompute_pass]: 1.59e-06 [offload_activation]: 8.13999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.12e-05 [merge_recompute_call_nodes]: 1.35999e-06 [before_grad]: 9.35001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.36999e-06 [meta_fg_expand]: 2.53998e-06 [flash_sp_send_recv_attached]: 1.11002e-06 [receive_attached]: 1.39e-06 [after_resolve]: 9.86e-06 [a_after_grad]: 9.14e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.40999e-06 [auto_monad_grad]: 1.32999e-06 [auto_monad_eliminator]: 8.17998e-06 [cse]: 1.483e-05 [a_3]: 3.834e-05 [py_interpret_to_execute_after_opt_a]: 1.022e-05 [slice_cell_reuse_recomputed_activation]: 2.02001e-06 [rewriter_after_opt_a]: 3.709e-05 [convert_after_rewriter]: 7.00998e-06 [order_py_execute_after_rewriter]: 5.49e-06 [mutable_eliminate]: 0.00061933 [opt_b]: 0.00029255, [1] [Cycle 1]: 0.00028455, [7] [b_1]: 0.00012536 [b_2]: 8.51002e-06 [updatestate_depend_eliminate]: 7.58999e-06 [updatestate_assign_eliminate]: 2.84999e-06 [updatestate_loads_eliminate]: 2.58e-06 [renormalize]: 6.29982e-07 [cse]: 3.815e-05 [optimize_parallel_all_gather_comm]: 2.429e-05 [overlap_param_gather]: 2.11998e-06 [cconv]: 3.103e-05 [loop_unroll]: 0.00048293 [opt_after_cconv]: 0.00013412, [1] [Cycle 1]: 0.00012679, [7] [c_1]: 4.922e-05 [parameter_eliminate]: 4.71002e-06 [updatestate_depend_eliminate]: 7.03e-06 [updatestate_assign_eliminate]: 2.58e-06 [updatestate_loads_eliminate]: 2.16998e-06 [cse]: 2.178e-05 [renormalize]: 7.09988e-07 [remove_dup_value]: 1.371e-05 [tuple_transform]: 0.00011499, [1] [Cycle 1]: 0.00010939, [4] [d_1]: 7.435e-05 [none_parameter_eliminate]: 2.24001e-06 [renormalize]: 2.60014e-07 [switch_simplify]: 8.18001e-06 [partial_unused_args_eliminate]: 2.27001e-06 [add_recomputation]: 5.348e-05 [cse_after_recomputation]: 2.419e-05, [1] [Cycle 1]: 1.907e-05, [1] [cse]: 1.309e-05 [environ_conv]: 5.31002e-06 [swap_dp_allreduce_reducescatter]: 5.67001e-06 [bias_add_comm_swap]: 3.07002e-06 [label_micro_interleaved_index]: 5.03002e-06 [label_fine_grained_interleaved_index]: 3.14001e-06 [merge_cast_opt]: 1.60999e-06 [slice_recompute_activation]: 2.19999e-06 [micro_interleaved_order_control]: 2.39999e-06 [assign_add_opt]: 1.14e-06 [ForceFp32Comm]: 8.89995e-07 [remove_cast_before_assign_add]: 1.34e-06 [full_micro_interleaved_order_control]: 2.39001e-06 [reorder_send_recv_between_fp_bp]: 2.86e-06 [comm_op_add_attrs]: 1.05999e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.536e-05 [overlap_opt_shard_in_pipeline]: 1.71998e-06 [overlap_opt_shard_grad_in_pipeline]: 1.99e-06 [control_data_broadcast_order]: 1.437e-05 [grouped_pairwise_exchange_alltoall]: 1.60999e-06 [offloading_packed_experts]: 3.78001e-06 [overlap_recompute_and_grad_model_parallel]: 4.87e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.35001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.47999e-06 [overlap_recompute_comm]: 2.61e-06 [overlap_grad_ring_attention]: 4.1e-06 [overlap_grad_flash_sp]: 2.202e-05 [begin_end_overlap_inline]: 5.8001e-07 [split_matmul_comm_elemetwise]: 2.17999e-06 [split_layernorm_comm]: 1.67999e-06 [handle_group_info]: 9.89996e-07 [symbol_engine_optimizer]: 8.278e-05, [1] [Cycle 1]: 7.751e-05, [6] [build]: 4.10998e-06 [elim_shapecalc]: 1.198e-05 [elim_not_effective]: 1.345e-05 [opt_reshape]: 7.31001e-06 [fold_const_symbol]: 1.036e-05 [renormalize]: 2.09984e-07 [detach_backward]: 2.43e-06 [pipeline_parallel_scheduler]: 1.57999e-06 [auto_monad_reorder]: 1.754e-05 [get_jit_bprop_graph]: 1.76e-06 [rewriter_after_jit_bprop_graph]: 5.71003e-06 [opt_after_jit_grad]: 0.00052986 [validate]: 4.432e-05 Sums bootstrap : 0.000458s : 0.30% type_inference : 0.148829s : 96.19% event_method : 0.000023s : 0.01% auto_monad : 0.000093s : 0.06% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000042s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000032s : 0.02% optimize.rewriter_before_opt_a : 0.000100s : 0.06% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000058s : 0.04% optimize.opt_a.loop_unroll : 0.000061s : 0.04% optimize.opt_a.a_1 : 0.000876s : 0.57% optimize.opt_a.with_stream_mark : 0.000035s : 0.02% optimize.opt_a.recompute_prepare : 0.000018s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000183s : 0.12% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000015s : 0.01% optimize.opt_a.auto_parallel : 0.000014s : 0.01% optimize.opt_a.parallel : 0.000042s : 0.03% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000010s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.01% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000021s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000024s : 0.02% optimize.opt_a.a_after_grad : 0.000021s : 0.01% optimize.opt_a.renormalize : 0.000855s : 0.55% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.02% optimize.opt_a.cse : 0.000046s : 0.03% optimize.opt_a.a_3 : 0.000089s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000037s : 0.02% optimize.convert_after_rewriter : 0.000007s : 0.00% optimize.order_py_execute_after_rewriter : 0.000005s : 0.00% optimize.mutable_eliminate : 0.000619s : 0.40% optimize.opt_b.b_1 : 0.000125s : 0.08% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000038s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000031s : 0.02% optimize.loop_unroll : 0.000483s : 0.31% optimize.opt_after_cconv.c_1 : 0.000049s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000022s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000014s : 0.01% optimize.tuple_transform.d_1 : 0.000074s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000053s : 0.03% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000005s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000015s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000022s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000530s : 0.34% validate : 0.000044s : 0.03% Time group info: ------[substitution.] 0.000249 34 13.56% : 0.000034s : 6: substitution.arithmetic_simplify 0.83% : 0.000002s : 2: substitution.elim_not_effective 0.56% : 0.000001s : 2: substitution.fold_const_symbol 2.54% : 0.000006s : 4: substitution.graph_param_transform 69.75% : 0.000174s : 4: substitution.inline 1.97% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.09% : 0.000005s : 4: substitution.remove_not_recompute_node 1.81% : 0.000005s : 4: substitution.replace_old_param 6.90% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.148749 2 99.41% : 0.147868s : 1: type_inference.infer 0.59% : 0.000881s : 1: type_inference.specialize ------[replace.] 0.000070 8 61.70% : 0.000043s : 4: replace.inline 38.30% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000186 8 91.76% : 0.000171s : 4: match.inline 8.24% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000226 1278 1.00% : 0.000002s : 13: predicate.accumulaten_eliminater 0.84% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.44% : 0.000001s : 8: predicate.addn_check_dump 0.89% : 0.000002s : 13: predicate.addn_zero_filter 0.81% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.58% : 0.000006s : 21: predicate.arithmetic_simplify 0.94% : 0.000002s : 13: predicate.cast_eliminate 0.59% : 0.000001s : 8: predicate.check_bprop_eliminate 0.53% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.60% : 0.000001s : 8: predicate.depend_value_elim 0.88% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.05% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.92% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 4: predicate.elim_not_effective 0.38% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000002s : 17: predicate.environ_add_const_eliminate 0.98% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_depend_swap 1.55% : 0.000003s : 25: predicate.environ_get_eliminate 1.18% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.47% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.38% : 0.000005s : 21: predicate.float_depend_g_call 0.54% : 0.000001s : 8: predicate.float_environ_get_switch 0.86% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.55% : 0.000001s : 8: predicate.get_grad_eliminate 0.26% : 0.000001s : 4: predicate.graph_param_transform 0.56% : 0.000001s : 8: predicate.incorporate_call 0.47% : 0.000001s : 8: predicate.incorporate_call_switch 6.15% : 0.000014s : 58: predicate.inline 0.84% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.05% : 0.000002s : 8: predicate.less_batch_normalization 1.70% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.60% : 0.000006s : 38: predicate.load_eliminater 0.97% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.62% : 0.000006s : 34: predicate.loop_unroll_before_grad 1.57% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 8: predicate.merge_addn 0.63% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.65% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 13: predicate.minmaximum_grad 1.32% : 0.000003s : 4: predicate.mutable_eliminate 0.31% : 0.000001s : 4: predicate.opt_reshape 0.37% : 0.000001s : 4: predicate.parallel_virtual_node 2.02% : 0.000005s : 21: predicate.partial_defer_inline 1.60% : 0.000004s : 21: predicate.partial_eliminate 1.07% : 0.000002s : 13: predicate.print_const_string_wrapper 0.56% : 0.000001s : 8: predicate.reduce_all_const_elim 1.22% : 0.000003s : 13: predicate.reduce_eliminate 2.41% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 8: predicate.remove_not_recompute_node 1.42% : 0.000003s : 25: predicate.replace_applicator 0.50% : 0.000001s : 8: predicate.replace_old_param 0.32% : 0.000001s : 4: predicate.reset_defer_inline 0.88% : 0.000002s : 13: predicate.reshape_eliminate 0.58% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 4: predicate.row_tensor_eliminate 0.68% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 8: predicate.shard_identity_eliminate 0.77% : 0.000002s : 8: predicate.special_op_eliminate 0.66% : 0.000001s : 8: predicate.specialize_transform 0.76% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.54% : 0.000003s : 21: predicate.switch_defer_inline 2.01% : 0.000005s : 29: predicate.switch_layer_defer_inline 5.15% : 0.000012s : 67: predicate.switch_simplify 0.88% : 0.000002s : 13: predicate.tile_eliminate 0.92% : 0.000002s : 13: predicate.transpose_eliminate 1.88% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.61% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.06% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.74% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.13% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.70% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.39% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.32% : 0.000008s : 46: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 4: predicate.value_based_eliminate 0.60% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 8: predicate.virtual_output_eliminate 0.23% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000680 11 55.57% : 0.000378s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.43% : 0.000302s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.171695 192 0.00% : 0.000004s : 1: ForceFp32Comm 2.23% : 0.003830s : 1: add_attr 2.22% : 0.003814s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.03% : 0.000058s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.06% : 0.000101s : 1: auto_monad 0.01% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.28% : 0.000489s : 1: bootstrap 0.02% : 0.000034s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000018s : 1: control_data_broadcast_order 0.01% : 0.000010s : 1: convert_after_rewriter 0.02% : 0.000027s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000008s : 1: environ_conv 0.02% : 0.000032s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000019s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.29% : 0.000492s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.37% : 0.000628s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000017s : 1: opt.transform.mutable_eliminate 0.79% : 0.001361s : 78: opt.transform.opt_a 0.03% : 0.000048s : 1: opt.transform.opt_after_cconv 0.02% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000102s : 28: opt.transform.opt_b 0.05% : 0.000080s : 2: opt.transform.opt_trans_graph 0.02% : 0.000039s : 4: opt.transform.symbol_engine_opt 1.84% : 0.003160s : 1: opt_a 0.08% : 0.000138s : 1: opt_after_cconv 0.32% : 0.000541s : 1: opt_after_jit_grad 0.17% : 0.000296s : 1: opt_b 3.39% : 0.005823s : 1: optimize 0.02% : 0.000034s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.01% : 0.000026s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000005s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000046s : 1: pre_auto_parallel 0.02% : 0.000037s : 1: py_interpret_to_execute 0.01% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000017s : 1: remove_dup_value 0.27% : 0.000464s : 1: renormalize.infer 0.22% : 0.000381s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000041s : 1: rewriter_after_opt_a 0.06% : 0.000105s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000086s : 1: symbol_engine_optimizer 0.07% : 0.000118s : 1: tuple_transform 86.70% : 0.148851s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:19.439.410 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:19.439.686 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.168525, [21] [bootstrap]: 0.00044934 [type_inference]: 0.00713938 [event_method]: 2.092e-05 [auto_monad]: 7.065e-05 [graph_reusing]: 7.35003e-06 [inline]: 2.73998e-06 [add_attr]: 0.00398201, [1] [add_attr_with_inline]: 0.00397003, [1] [Cycle 1]: 9.17e-05, [2] [tag_attr]: 2.313e-05 [meta_addattr_fg_expand]: 5.88002e-06 [parallel-infer-symbol]: 3.77002e-06 [pre_auto_parallel]: 4.184e-05 [insert-virtual-dataset]: 2.66999e-06 [parallel-infer-symbol-second]: 7.29982e-07 [dataset_repeat_opt]: 2.50002e-06 [pipeline_split]: 1.67999e-06 [optimize]: 0.15542, [53] [py_interpret_to_execute]: 3.463e-05 [rewriter_before_opt_a]: 9.67e-05 [opt_a]: 0.152683, [2] [Cycle 1]: 0.151656, [45] [expand_dump_flag]: 3.06001e-06 [switch_simplify]: 4.27e-05 [loop_unroll]: 3.05e-05 [a_1]: 0.00120118 [with_stream_mark]: 2.619e-05 [recompute_prepare]: 1.305e-05 [updatestate_depend_eliminate]: 4.89998e-06 [updatestate_assign_eliminate]: 3.63999e-06 [updatestate_loads_eliminate]: 4.70999e-06 [parameter_eliminate]: 2.71e-06 [a_2]: 0.00017608 [accelerated_algorithm]: 9.02e-06 [shard]: 2.88003e-06 [meta_shard_fg_expand]: 2.61e-06 [shard_inline]: 9.41003e-06 [merge_send_recv]: 1.047e-05 [auto_parallel]: 9.99001e-06 [parallel]: 2.207e-05 [flash_sp]: 1.05e-05 [merge_comm]: 4.02002e-06 [allreduce_fusion]: 3.98001e-06 [matmul_add_comm_reduction]: 1.049e-05 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 9.02e-06 [virtual_dataset]: 7.06999e-06 [get_grad_eliminate_]: 6.26e-06 [virtual_output]: 6.56e-06 [merge_forward]: 4.15e-06 [cell_reuse_recompute_pass]: 1.72001e-06 [offload_activation]: 1.048e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.648e-05 [merge_recompute_call_nodes]: 1.57001e-06 [before_grad]: 1.171e-05 [set_forward_comm_id_for_comm_node_pass]: 3.78999e-06 [meta_fg_expand]: 3.01999e-06 [flash_sp_send_recv_attached]: 2.72001e-06 [receive_attached]: 2.71999e-06 [after_resolve]: 1.243e-05 [a_after_grad]: 1.059e-05 [renormalize]: 0.149164 [add_forward_monad_depend]: 1.367e-05 [auto_monad_grad]: 2.48998e-06 [auto_monad_eliminator]: 2.656e-05 [cse]: 3.361e-05 [a_3]: 8.483e-05 [Cycle 2]: 0.00100816, [45] [expand_dump_flag]: 3.01001e-06 [switch_simplify]: 9.51e-06 [loop_unroll]: 7.65e-06 [a_1]: 0.0001803 [with_stream_mark]: 2.122e-05 [recompute_prepare]: 8.18001e-06 [updatestate_depend_eliminate]: 4.12998e-06 [updatestate_assign_eliminate]: 4.02e-06 [updatestate_loads_eliminate]: 2.90002e-06 [parameter_eliminate]: 2.53998e-06 [a_2]: 0.00011234 [accelerated_algorithm]: 7.68001e-06 [shard]: 2.96001e-06 [meta_shard_fg_expand]: 2.39001e-06 [shard_inline]: 7.28e-06 [merge_send_recv]: 9.59999e-06 [auto_parallel]: 9.94001e-06 [parallel]: 9.92999e-06 [flash_sp]: 4.74e-06 [merge_comm]: 4.36002e-06 [allreduce_fusion]: 3.66999e-06 [matmul_add_comm_reduction]: 1.468e-05 [allreduce_slice_to_reducescatter]: 1.20999e-06 [virtual_shard_identity]: 8.26002e-06 [virtual_dataset]: 6.73998e-06 [get_grad_eliminate_]: 7.87e-06 [virtual_output]: 6.36998e-06 [merge_forward]: 4.67e-06 [cell_reuse_recompute_pass]: 3.76999e-06 [offload_activation]: 1.137e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.677e-05 [merge_recompute_call_nodes]: 1.67001e-06 [before_grad]: 1.132e-05 [set_forward_comm_id_for_comm_node_pass]: 4.55001e-06 [meta_fg_expand]: 3.13998e-06 [flash_sp_send_recv_attached]: 1.85001e-06 [receive_attached]: 2.46998e-06 [after_resolve]: 1.336e-05 [a_after_grad]: 1.024e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.74999e-06 [auto_monad_grad]: 2.26998e-06 [auto_monad_eliminator]: 9.41e-06 [cse]: 1.868e-05 [a_3]: 5.707e-05 [py_interpret_to_execute_after_opt_a]: 2.149e-05 [slice_cell_reuse_recomputed_activation]: 5.17999e-06 [rewriter_after_opt_a]: 4.65e-05 [convert_after_rewriter]: 1.091e-05 [order_py_execute_after_rewriter]: 8.48999e-06 [mutable_eliminate]: 0.00077301 [opt_b]: 0.00029688, [1] [Cycle 1]: 0.00028448, [7] [b_1]: 0.0001784 [b_2]: 8.2e-06 [updatestate_depend_eliminate]: 7.88999e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 3.25998e-06 [renormalize]: 8.70001e-07 [cse]: 2.377e-05 [optimize_parallel_all_gather_comm]: 2.371e-05 [overlap_param_gather]: 5.43002e-06 [cconv]: 3.908e-05 [loop_unroll]: 0.00050053 [opt_after_cconv]: 0.0001409, [1] [Cycle 1]: 0.00013036, [7] [c_1]: 3.423e-05 [parameter_eliminate]: 5.68997e-06 [updatestate_depend_eliminate]: 6.94001e-06 [updatestate_assign_eliminate]: 2.74001e-06 [updatestate_loads_eliminate]: 2.49001e-06 [cse]: 2.104e-05 [renormalize]: 2.60014e-07 [remove_dup_value]: 1.95e-05 [tuple_transform]: 9.758e-05, [1] [Cycle 1]: 8.987e-05, [4] [d_1]: 4.876e-05 [none_parameter_eliminate]: 1.74e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 8.09002e-06 [partial_unused_args_eliminate]: 5.00999e-06 [add_recomputation]: 5.57e-05 [cse_after_recomputation]: 2.919e-05, [1] [Cycle 1]: 2.169e-05, [1] [cse]: 1.236e-05 [environ_conv]: 8.90999e-06 [swap_dp_allreduce_reducescatter]: 7.97e-06 [bias_add_comm_swap]: 5.87001e-06 [label_micro_interleaved_index]: 7.08e-06 [label_fine_grained_interleaved_index]: 5.86e-06 [merge_cast_opt]: 3.81999e-06 [slice_recompute_activation]: 5.09998e-06 [micro_interleaved_order_control]: 5.30999e-06 [assign_add_opt]: 4.17998e-06 [ForceFp32Comm]: 3.31999e-06 [remove_cast_before_assign_add]: 3.80998e-06 [full_micro_interleaved_order_control]: 5.07999e-06 [reorder_send_recv_between_fp_bp]: 5.92999e-06 [comm_op_add_attrs]: 3.61001e-06 [add_comm_op_reuse_tag]: 3.18e-06 [interleave_split_concat_branches]: 3.95e-06 [interleave_parallel_branches]: 3.68e-06 [overlap_opt_shard_in_pipeline]: 3.69002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.57e-06 [control_data_broadcast_order]: 1.828e-05 [grouped_pairwise_exchange_alltoall]: 4.03999e-06 [offloading_packed_experts]: 7.11001e-06 [overlap_recompute_and_grad_model_parallel]: 7.2e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.95e-06 [overlap_recompute_allgather_and_fa_grad]: 3.82002e-06 [overlap_recompute_comm]: 5.04e-06 [overlap_grad_ring_attention]: 7.51001e-06 [overlap_grad_flash_sp]: 2.529e-05 [begin_end_overlap_inline]: 3.31999e-06 [split_matmul_comm_elemetwise]: 4.39002e-06 [split_layernorm_comm]: 4.37e-06 [handle_group_info]: 3.65998e-06 [symbol_engine_optimizer]: 0.0001048, [1] [Cycle 1]: 9.739e-05, [6] [build]: 4.52e-06 [elim_shapecalc]: 1.191e-05 [elim_not_effective]: 1.443e-05 [opt_reshape]: 8.11002e-06 [fold_const_symbol]: 1.059e-05 [renormalize]: 2.00002e-07 [detach_backward]: 5.30999e-06 [pipeline_parallel_scheduler]: 1.96998e-06 [auto_monad_reorder]: 2.057e-05 [get_jit_bprop_graph]: 2.15002e-06 [rewriter_after_jit_bprop_graph]: 6.33e-06 [opt_after_jit_grad]: 0.00061999 [validate]: 4.8e-05 Sums bootstrap : 0.000449s : 0.28% type_inference : 0.007139s : 4.40% event_method : 0.000021s : 0.01% auto_monad : 0.000071s : 0.04% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000042s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000035s : 0.02% optimize.rewriter_before_opt_a : 0.000097s : 0.06% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000052s : 0.03% optimize.opt_a.loop_unroll : 0.000038s : 0.02% optimize.opt_a.a_1 : 0.001381s : 0.85% optimize.opt_a.with_stream_mark : 0.000047s : 0.03% optimize.opt_a.recompute_prepare : 0.000021s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000288s : 0.18% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.01% optimize.opt_a.shard : 0.000006s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000017s : 0.01% optimize.opt_a.merge_send_recv : 0.000020s : 0.01% optimize.opt_a.auto_parallel : 0.000020s : 0.01% optimize.opt_a.parallel : 0.000032s : 0.02% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.01% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000022s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000026s : 0.02% optimize.opt_a.a_after_grad : 0.000021s : 0.01% optimize.opt_a.renormalize : 0.149164s : 91.84% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000036s : 0.02% optimize.opt_a.cse : 0.000052s : 0.03% optimize.opt_a.a_3 : 0.000142s : 0.09% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000046s : 0.03% optimize.convert_after_rewriter : 0.000011s : 0.01% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000773s : 0.48% optimize.opt_b.b_1 : 0.000178s : 0.11% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.01% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000039s : 0.02% optimize.loop_unroll : 0.000501s : 0.31% optimize.opt_after_cconv.c_1 : 0.000034s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000021s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.01% optimize.tuple_transform.d_1 : 0.000049s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000056s : 0.03% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.00% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000008s : 0.00% optimize.overlap_grad_flash_sp : 0.000025s : 0.02% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000620s : 0.38% validate : 0.000048s : 0.03% Time group info: ------[substitution.] 0.000737 34 6.27% : 0.000046s : 6: substitution.arithmetic_simplify 0.28% : 0.000002s : 2: substitution.elim_not_effective 0.22% : 0.000002s : 2: substitution.fold_const_symbol 0.93% : 0.000007s : 4: substitution.graph_param_transform 87.90% : 0.000648s : 4: substitution.inline 0.61% : 0.000005s : 4: substitution.j_node_and_user_rematch 0.78% : 0.000006s : 4: substitution.remove_not_recompute_node 0.85% : 0.000006s : 4: substitution.replace_old_param 2.15% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007000 2 87.11% : 0.006098s : 1: type_inference.infer 12.89% : 0.000902s : 1: type_inference.specialize ------[replace.] 0.000082 8 66.45% : 0.000054s : 4: replace.inline 33.55% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000658 8 97.86% : 0.000644s : 4: match.inline 2.14% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000236 1278 0.99% : 0.000002s : 13: predicate.accumulaten_eliminater 0.79% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 0.91% : 0.000002s : 13: predicate.addn_zero_filter 0.73% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.88% : 0.000007s : 21: predicate.arithmetic_simplify 0.87% : 0.000002s : 13: predicate.cast_eliminate 0.56% : 0.000001s : 8: predicate.check_bprop_eliminate 0.55% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.61% : 0.000001s : 8: predicate.depend_value_elim 0.85% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.97% : 0.000002s : 13: predicate.dict_get_item_eliminator 1.00% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.98% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 4: predicate.elim_not_effective 0.45% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.03% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.52% : 0.000004s : 17: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_depend_swap 1.59% : 0.000004s : 25: predicate.environ_get_eliminate 0.96% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.47% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.48% : 0.000006s : 21: predicate.float_depend_g_call 0.82% : 0.000002s : 8: predicate.float_environ_get_switch 0.72% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.68% : 0.000002s : 8: predicate.get_grad_eliminate 0.17% : 0.000000s : 4: predicate.graph_param_transform 0.53% : 0.000001s : 8: predicate.incorporate_call 0.41% : 0.000001s : 8: predicate.incorporate_call_switch 6.34% : 0.000015s : 58: predicate.inline 0.84% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.04% : 0.000002s : 8: predicate.less_batch_normalization 1.82% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.46% : 0.000006s : 38: predicate.load_eliminater 1.12% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.14% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.66% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 8: predicate.merge_addn 0.61% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.72% : 0.000002s : 13: predicate.minmaximum_grad 1.05% : 0.000002s : 4: predicate.mutable_eliminate 0.39% : 0.000001s : 4: predicate.opt_reshape 0.32% : 0.000001s : 4: predicate.parallel_virtual_node 1.95% : 0.000005s : 21: predicate.partial_defer_inline 1.41% : 0.000003s : 21: predicate.partial_eliminate 1.00% : 0.000002s : 13: predicate.print_const_string_wrapper 0.69% : 0.000002s : 8: predicate.reduce_all_const_elim 1.14% : 0.000003s : 13: predicate.reduce_eliminate 2.51% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 8: predicate.remove_not_recompute_node 1.48% : 0.000003s : 25: predicate.replace_applicator 0.54% : 0.000001s : 8: predicate.replace_old_param 0.32% : 0.000001s : 4: predicate.reset_defer_inline 0.95% : 0.000002s : 13: predicate.reshape_eliminate 0.66% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 4: predicate.row_tensor_eliminate 0.86% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 8: predicate.shard_identity_eliminate 0.66% : 0.000002s : 8: predicate.special_op_eliminate 0.65% : 0.000002s : 8: predicate.specialize_transform 1.11% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.41% : 0.000003s : 21: predicate.switch_defer_inline 2.10% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.86% : 0.000011s : 67: predicate.switch_simplify 0.90% : 0.000002s : 13: predicate.tile_eliminate 1.06% : 0.000002s : 13: predicate.transpose_eliminate 1.51% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.49% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.48% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.39% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.15% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.91% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.46% : 0.000006s : 38: predicate.updatestate_pure_node_eliminater 2.79% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.56% : 0.000001s : 4: predicate.value_based_eliminate 0.66% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.58% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000724 11 48.89% : 0.000354s : 5: func_graph_cloner_run.FuncGraphClonerGraph 51.11% : 0.000370s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.479097 192 0.00% : 0.000006s : 1: ForceFp32Comm 0.83% : 0.003993s : 1: add_attr 0.83% : 0.003974s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.01% : 0.000060s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.02% : 0.000080s : 1: auto_monad 0.01% : 0.000029s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.10% : 0.000495s : 1: bootstrap 0.01% : 0.000043s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.00% : 0.000021s : 1: control_data_broadcast_order 0.00% : 0.000014s : 1: convert_after_rewriter 0.01% : 0.000032s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.00% : 0.000023s : 1: detach_backward 0.00% : 0.000012s : 1: environ_conv 0.01% : 0.000033s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000009s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.11% : 0.000507s : 1: loop_unroll 0.00% : 0.000006s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.16% : 0.000781s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.00% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000020s : 1: opt.transform.mutable_eliminate 0.39% : 0.001868s : 78: opt.transform.opt_a 0.01% : 0.000033s : 1: opt.transform.opt_after_cconv 0.01% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.02% : 0.000109s : 28: opt.transform.opt_b 0.01% : 0.000055s : 2: opt.transform.opt_trans_graph 0.01% : 0.000041s : 4: opt.transform.symbol_engine_opt 31.87% : 0.152688s : 1: opt_a 0.03% : 0.000145s : 1: opt_after_cconv 0.13% : 0.000633s : 1: opt_after_jit_grad 0.06% : 0.000301s : 1: opt_b 32.52% : 0.155821s : 1: optimize 0.01% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000012s : 1: order_py_execute_after_rewriter 0.01% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000009s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.01% : 0.000050s : 1: pre_auto_parallel 0.01% : 0.000039s : 1: py_interpret_to_execute 0.01% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.00% : 0.000023s : 1: remove_dup_value 31.01% : 0.148549s : 1: renormalize.infer 0.12% : 0.000591s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000050s : 1: rewriter_after_opt_a 0.02% : 0.000100s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000009s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.00% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000108s : 1: symbol_engine_optimizer 0.02% : 0.000101s : 1: tuple_transform 1.50% : 0.007191s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:21.959.062 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.173244, [21] [bootstrap]: 0.0005065 [type_inference]: 0.00745901 [event_method]: 2.381e-05 [auto_monad]: 7.389e-05 [graph_reusing]: 6.19999e-06 [inline]: 2.90998e-06 [add_attr]: 0.159115, [1] [add_attr_with_inline]: 0.159105, [1] [Cycle 1]: 5.927e-05, [2] [tag_attr]: 2.17e-05 [meta_addattr_fg_expand]: 5.28002e-06 [parallel-infer-symbol]: 3.36001e-06 [pre_auto_parallel]: 3.577e-05 [insert-virtual-dataset]: 2.53e-06 [parallel-infer-symbol-second]: 1.00001e-06 [dataset_repeat_opt]: 2.44999e-06 [pipeline_split]: 1.70001e-06 [optimize]: 0.00521952, [53] [py_interpret_to_execute]: 2.476e-05 [rewriter_before_opt_a]: 8.958e-05 [opt_a]: 0.0028097, [2] [Cycle 1]: 0.00212574, [45] [expand_dump_flag]: 3.26999e-06 [switch_simplify]: 3.766e-05 [loop_unroll]: 3.076e-05 [a_1]: 0.00068907 [with_stream_mark]: 1.468e-05 [recompute_prepare]: 9.08002e-06 [updatestate_depend_eliminate]: 3.23e-06 [updatestate_assign_eliminate]: 2.52001e-06 [updatestate_loads_eliminate]: 2.19999e-06 [parameter_eliminate]: 1.45001e-06 [a_2]: 8.58e-05 [accelerated_algorithm]: 7.21001e-06 [shard]: 2.21e-06 [meta_shard_fg_expand]: 1.57001e-06 [shard_inline]: 6.70002e-06 [merge_send_recv]: 8.84e-06 [auto_parallel]: 7.9e-06 [parallel]: 1.994e-05 [flash_sp]: 9.10001e-06 [merge_comm]: 4.21001e-06 [allreduce_fusion]: 3.50003e-06 [matmul_add_comm_reduction]: 7.50998e-06 [allreduce_slice_to_reducescatter]: 1.34998e-06 [virtual_shard_identity]: 8.18001e-06 [virtual_dataset]: 6.81999e-06 [get_grad_eliminate_]: 6.66999e-06 [virtual_output]: 6.82002e-06 [merge_forward]: 3.48e-06 [cell_reuse_recompute_pass]: 1.09e-06 [offload_activation]: 1.056e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.182e-05 [merge_recompute_call_nodes]: 1.55001e-06 [before_grad]: 1.099e-05 [set_forward_comm_id_for_comm_node_pass]: 3.64002e-06 [meta_fg_expand]: 2.71e-06 [flash_sp_send_recv_attached]: 2.63e-06 [receive_attached]: 2.65002e-06 [after_resolve]: 1.212e-05 [a_after_grad]: 1.064e-05 [renormalize]: 0.00072718 [add_forward_monad_depend]: 5.97999e-06 [auto_monad_grad]: 3.54002e-06 [auto_monad_eliminator]: 1.422e-05 [cse]: 1.914e-05 [a_3]: 4.84e-05 [Cycle 2]: 0.00067351, [45] [expand_dump_flag]: 1.43002e-06 [switch_simplify]: 7.83001e-06 [loop_unroll]: 6.48e-06 [a_1]: 0.00014464 [with_stream_mark]: 1.302e-05 [recompute_prepare]: 6.33e-06 [updatestate_depend_eliminate]: 2.73998e-06 [updatestate_assign_eliminate]: 2.71e-06 [updatestate_loads_eliminate]: 2.48e-06 [parameter_eliminate]: 1.49e-06 [a_2]: 7.752e-05 [accelerated_algorithm]: 6.08998e-06 [shard]: 1.29e-06 [meta_shard_fg_expand]: 1.32e-06 [shard_inline]: 6.12001e-06 [merge_send_recv]: 5.46e-06 [auto_parallel]: 6.96001e-06 [parallel]: 5.46e-06 [flash_sp]: 3.45e-06 [merge_comm]: 3.43e-06 [allreduce_fusion]: 3.89002e-06 [matmul_add_comm_reduction]: 5.19e-06 [allreduce_slice_to_reducescatter]: 4.2998e-07 [virtual_shard_identity]: 6.61999e-06 [virtual_dataset]: 5.84e-06 [get_grad_eliminate_]: 5.84999e-06 [virtual_output]: 5.79e-06 [merge_forward]: 3.08998e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 7.35e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.114e-05 [merge_recompute_call_nodes]: 9.29984e-07 [before_grad]: 9.79e-06 [set_forward_comm_id_for_comm_node_pass]: 3.06001e-06 [meta_fg_expand]: 2.21e-06 [flash_sp_send_recv_attached]: 1.11002e-06 [receive_attached]: 1.98002e-06 [after_resolve]: 1.049e-05 [a_after_grad]: 9.13002e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.54e-06 [auto_monad_grad]: 8.70001e-07 [auto_monad_eliminator]: 7.99002e-06 [cse]: 1.519e-05 [a_3]: 3.701e-05 [py_interpret_to_execute_after_opt_a]: 1.026e-05 [slice_cell_reuse_recomputed_activation]: 2.18002e-06 [rewriter_after_opt_a]: 3.492e-05 [convert_after_rewriter]: 7.01001e-06 [order_py_execute_after_rewriter]: 5.35001e-06 [mutable_eliminate]: 0.00059005 [opt_b]: 0.00020853, [1] [Cycle 1]: 0.0002011, [7] [b_1]: 0.00012608 [b_2]: 8.78001e-06 [updatestate_depend_eliminate]: 5.81e-06 [updatestate_assign_eliminate]: 2.59001e-06 [updatestate_loads_eliminate]: 2.34001e-06 [renormalize]: 8.30012e-07 [cse]: 1.801e-05 [optimize_parallel_all_gather_comm]: 1.717e-05 [overlap_param_gather]: 1.91e-06 [cconv]: 2.694e-05 [loop_unroll]: 0.00046686 [opt_after_cconv]: 0.00010373, [1] [Cycle 1]: 9.738e-05, [7] [c_1]: 3.127e-05 [parameter_eliminate]: 3.16001e-06 [updatestate_depend_eliminate]: 5.19998e-06 [updatestate_assign_eliminate]: 2.71999e-06 [updatestate_loads_eliminate]: 2.32001e-06 [cse]: 1.722e-05 [renormalize]: 7.40023e-07 [remove_dup_value]: 1.355e-05 [tuple_transform]: 7.73e-05, [1] [Cycle 1]: 7.302e-05, [4] [d_1]: 4.541e-05 [none_parameter_eliminate]: 1.67001e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 7.08e-06 [partial_unused_args_eliminate]: 1.86e-06 [add_recomputation]: 4.493e-05 [cse_after_recomputation]: 2.197e-05, [1] [Cycle 1]: 1.709e-05, [1] [cse]: 1.093e-05 [environ_conv]: 4.88001e-06 [swap_dp_allreduce_reducescatter]: 4.97e-06 [bias_add_comm_swap]: 3.06001e-06 [label_micro_interleaved_index]: 4.17e-06 [label_fine_grained_interleaved_index]: 2.93998e-06 [merge_cast_opt]: 1.30001e-06 [slice_recompute_activation]: 2.02999e-06 [micro_interleaved_order_control]: 2.63e-06 [assign_add_opt]: 1.15001e-06 [ForceFp32Comm]: 8.50006e-07 [remove_cast_before_assign_add]: 1.02998e-06 [full_micro_interleaved_order_control]: 2.04e-06 [reorder_send_recv_between_fp_bp]: 2.80002e-06 [comm_op_add_attrs]: 1.15999e-06 [add_comm_op_reuse_tag]: 1.35001e-06 [interleave_split_concat_branches]: 2.97002e-06 [interleave_parallel_branches]: 1.15999e-06 [overlap_opt_shard_in_pipeline]: 1.60999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.08002e-06 [control_data_broadcast_order]: 1.828e-05 [grouped_pairwise_exchange_alltoall]: 2.22001e-06 [offloading_packed_experts]: 4.52e-06 [overlap_recompute_and_grad_model_parallel]: 5.01002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.61998e-06 [overlap_recompute_comm]: 2.51e-06 [overlap_grad_ring_attention]: 4.50001e-06 [overlap_grad_flash_sp]: 2.115e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.66e-06 [split_layernorm_comm]: 1.74998e-06 [handle_group_info]: 1.30001e-06 [symbol_engine_optimizer]: 9.461e-05, [1] [Cycle 1]: 8.834e-05, [6] [build]: 3.75e-06 [elim_shapecalc]: 1.467e-05 [elim_not_effective]: 1.647e-05 [opt_reshape]: 8.65001e-06 [fold_const_symbol]: 1.064e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.69001e-06 [pipeline_parallel_scheduler]: 1.52999e-06 [auto_monad_reorder]: 1.831e-05 [get_jit_bprop_graph]: 1.72001e-06 [rewriter_after_jit_bprop_graph]: 5.40001e-06 [opt_after_jit_grad]: 0.00051245 [validate]: 6.418e-05 Sums bootstrap : 0.000506s : 3.93% type_inference : 0.007459s : 57.85% event_method : 0.000024s : 0.18% auto_monad : 0.000074s : 0.57% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000005s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000036s : 0.28% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000025s : 0.19% optimize.rewriter_before_opt_a : 0.000090s : 0.69% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000045s : 0.35% optimize.opt_a.loop_unroll : 0.000037s : 0.29% optimize.opt_a.a_1 : 0.000834s : 6.47% optimize.opt_a.with_stream_mark : 0.000028s : 0.21% optimize.opt_a.recompute_prepare : 0.000015s : 0.12% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000163s : 1.27% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.10% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.02% optimize.opt_a.shard_inline : 0.000013s : 0.10% optimize.opt_a.merge_send_recv : 0.000014s : 0.11% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.20% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000013s : 0.10% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.11% optimize.opt_a.virtual_dataset : 0.000013s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.10% optimize.opt_a.virtual_output : 0.000013s : 0.10% optimize.opt_a.merge_forward : 0.000007s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.18% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.16% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.05% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000023s : 0.18% optimize.opt_a.a_after_grad : 0.000020s : 0.15% optimize.opt_a.renormalize : 0.000727s : 5.64% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.17% optimize.opt_a.cse : 0.000034s : 0.27% optimize.opt_a.a_3 : 0.000085s : 0.66% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000035s : 0.27% optimize.convert_after_rewriter : 0.000007s : 0.05% optimize.order_py_execute_after_rewriter : 0.000005s : 0.04% optimize.mutable_eliminate : 0.000590s : 4.58% optimize.opt_b.b_1 : 0.000126s : 0.98% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000018s : 0.14% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.13% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000027s : 0.21% optimize.loop_unroll : 0.000467s : 3.62% optimize.opt_after_cconv.c_1 : 0.000031s : 0.24% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.13% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000014s : 0.11% optimize.tuple_transform.d_1 : 0.000045s : 0.35% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.05% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000045s : 0.35% optimize.cse_after_recomputation.cse : 0.000011s : 0.08% optimize.environ_conv : 0.000005s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.04% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000003s : 0.02% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000018s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.03% optimize.overlap_grad_flash_sp : 0.000021s : 0.16% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.14% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000512s : 3.97% validate : 0.000064s : 0.50% Time group info: ------[substitution.] 0.000234 34 14.08% : 0.000033s : 6: substitution.arithmetic_simplify 0.87% : 0.000002s : 2: substitution.elim_not_effective 0.64% : 0.000002s : 2: substitution.fold_const_symbol 2.46% : 0.000006s : 4: substitution.graph_param_transform 70.10% : 0.000164s : 4: substitution.inline 1.74% : 0.000004s : 4: substitution.j_node_and_user_rematch 1.93% : 0.000005s : 4: substitution.remove_not_recompute_node 1.78% : 0.000004s : 4: substitution.replace_old_param 6.39% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007374 2 79.70% : 0.005877s : 1: type_inference.infer 20.30% : 0.001497s : 1: type_inference.specialize ------[replace.] 0.000064 8 62.93% : 0.000040s : 4: replace.inline 37.07% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000175 8 92.46% : 0.000162s : 4: match.inline 7.54% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000221 1278 0.82% : 0.000002s : 13: predicate.accumulaten_eliminater 1.01% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 8: predicate.addn_check_dump 1.19% : 0.000003s : 13: predicate.addn_zero_filter 0.86% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.51% : 0.000006s : 21: predicate.arithmetic_simplify 0.94% : 0.000002s : 13: predicate.cast_eliminate 0.49% : 0.000001s : 8: predicate.check_bprop_eliminate 0.53% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.54% : 0.000001s : 8: predicate.depend_value_elim 0.95% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.97% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.81% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.83% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 4: predicate.elim_not_effective 0.42% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.00% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_depend_swap 6.91% : 0.000015s : 25: predicate.environ_get_eliminate 0.94% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.41% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.03% : 0.000004s : 21: predicate.float_depend_g_call 0.55% : 0.000001s : 8: predicate.float_environ_get_switch 0.67% : 0.000001s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.57% : 0.000001s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.54% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 5.62% : 0.000012s : 58: predicate.inline 0.72% : 0.000002s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.67% : 0.000001s : 8: predicate.less_batch_normalization 1.81% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.33% : 0.000005s : 38: predicate.load_eliminater 1.04% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.39% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.56% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 13: predicate.minmaximum_grad 1.00% : 0.000002s : 4: predicate.mutable_eliminate 0.40% : 0.000001s : 4: predicate.opt_reshape 0.37% : 0.000001s : 4: predicate.parallel_virtual_node 1.75% : 0.000004s : 21: predicate.partial_defer_inline 1.51% : 0.000003s : 21: predicate.partial_eliminate 0.83% : 0.000002s : 13: predicate.print_const_string_wrapper 0.50% : 0.000001s : 8: predicate.reduce_all_const_elim 1.15% : 0.000003s : 13: predicate.reduce_eliminate 2.29% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 8: predicate.remove_not_recompute_node 1.22% : 0.000003s : 25: predicate.replace_applicator 0.48% : 0.000001s : 8: predicate.replace_old_param 0.23% : 0.000001s : 4: predicate.reset_defer_inline 0.94% : 0.000002s : 13: predicate.reshape_eliminate 0.65% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.77% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.65% : 0.000001s : 8: predicate.shard_identity_eliminate 0.84% : 0.000002s : 8: predicate.special_op_eliminate 0.62% : 0.000001s : 8: predicate.specialize_transform 0.82% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.49% : 0.000003s : 21: predicate.switch_defer_inline 1.98% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.64% : 0.000010s : 67: predicate.switch_simplify 0.89% : 0.000002s : 13: predicate.tile_eliminate 0.88% : 0.000002s : 13: predicate.transpose_eliminate 1.79% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.35% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.92% : 0.000006s : 33: predicate.tuple_list_get_item_eliminator 1.52% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.34% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.66% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.27% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.94% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 4: predicate.value_based_eliminate 0.58% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.58% : 0.000001s : 8: predicate.virtual_output_eliminate 0.29% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001157 11 33.18% : 0.000384s : 5: func_graph_cloner_run.FuncGraphClonerGraph 66.82% : 0.000773s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.339683 192 0.00% : 0.000004s : 1: ForceFp32Comm 46.84% : 0.159122s : 1: add_attr 46.84% : 0.159109s : 1: add_attr_with_inline 0.05% : 0.000174s : 1: add_comm_op_reuse_tag 0.01% : 0.000049s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.02% : 0.000081s : 1: auto_monad 0.01% : 0.000022s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.16% : 0.000542s : 1: bootstrap 0.01% : 0.000030s : 1: cconv 0.02% : 0.000060s : 1: comm_op_add_attrs 0.01% : 0.000022s : 1: control_data_broadcast_order 0.00% : 0.000010s : 1: convert_after_rewriter 0.01% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000008s : 1: environ_conv 0.01% : 0.000034s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.14% : 0.000476s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.18% : 0.000599s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000016s : 1: opt.transform.mutable_eliminate 0.37% : 0.001258s : 78: opt.transform.opt_a 0.01% : 0.000030s : 1: opt.transform.opt_after_cconv 0.01% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000102s : 28: opt.transform.opt_b 0.01% : 0.000050s : 2: opt.transform.opt_trans_graph 0.01% : 0.000046s : 4: opt.transform.symbol_engine_opt 0.83% : 0.002813s : 1: opt_a 0.03% : 0.000107s : 1: opt_after_cconv 0.15% : 0.000522s : 1: opt_after_jit_grad 0.06% : 0.000212s : 1: opt_b 1.54% : 0.005225s : 1: optimize 0.01% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.01% : 0.000025s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000040s : 1: pre_auto_parallel 0.01% : 0.000029s : 1: py_interpret_to_execute 0.00% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000017s : 1: remove_dup_value 0.12% : 0.000408s : 1: renormalize.infer 0.09% : 0.000312s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000039s : 1: rewriter_after_opt_a 0.03% : 0.000094s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000006s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000098s : 1: symbol_engine_optimizer 0.02% : 0.000080s : 1: tuple_transform 2.20% : 0.007486s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:24.320.175 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:24.320.457 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.162455, [21] [bootstrap]: 0.00047707 [type_inference]: 0.00623644 [event_method]: 1.993e-05 [auto_monad]: 0.00022916 [graph_reusing]: 6.38e-06 [inline]: 2.39001e-06 [add_attr]: 0.147361, [1] [add_attr_with_inline]: 0.147346, [1] [Cycle 1]: 0.00010327, [2] [tag_attr]: 2.609e-05 [meta_addattr_fg_expand]: 6.20002e-06 [parallel-infer-symbol]: 4.35999e-06 [pre_auto_parallel]: 4.64e-05 [insert-virtual-dataset]: 2.68003e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 1.97001e-06 [pipeline_split]: 1.79e-06 [optimize]: 0.00671037, [53] [py_interpret_to_execute]: 3.775e-05 [rewriter_before_opt_a]: 0.00012218 [opt_a]: 0.00386777, [2] [Cycle 1]: 0.00273352, [45] [expand_dump_flag]: 3.98999e-06 [switch_simplify]: 4.721e-05 [loop_unroll]: 3.188e-05 [a_1]: 0.00073815 [with_stream_mark]: 2.396e-05 [recompute_prepare]: 1.156e-05 [updatestate_depend_eliminate]: 5.09e-06 [updatestate_assign_eliminate]: 3.85e-06 [updatestate_loads_eliminate]: 3.36999e-06 [parameter_eliminate]: 2.15002e-06 [a_2]: 0.00012364 [accelerated_algorithm]: 8.43999e-06 [shard]: 1.84998e-06 [meta_shard_fg_expand]: 2.03002e-06 [shard_inline]: 6.94001e-06 [merge_send_recv]: 1.006e-05 [auto_parallel]: 8.14997e-06 [parallel]: 2.158e-05 [flash_sp]: 1.03e-05 [merge_comm]: 4.3e-06 [allreduce_fusion]: 3.76999e-06 [matmul_add_comm_reduction]: 1.056e-05 [allreduce_slice_to_reducescatter]: 9.00007e-07 [virtual_shard_identity]: 1.05e-05 [virtual_dataset]: 7.30003e-06 [get_grad_eliminate_]: 6.71999e-06 [virtual_output]: 6.98998e-06 [merge_forward]: 4.99998e-06 [cell_reuse_recompute_pass]: 1.45999e-06 [offload_activation]: 1.077e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.526e-05 [merge_recompute_call_nodes]: 1.58002e-06 [before_grad]: 1.115e-05 [set_forward_comm_id_for_comm_node_pass]: 4.19997e-06 [meta_fg_expand]: 3.28e-06 [flash_sp_send_recv_attached]: 3.11001e-06 [receive_attached]: 2.87002e-06 [after_resolve]: 1.25e-05 [a_after_grad]: 1.086e-05 [renormalize]: 0.00098283 [add_forward_monad_depend]: 7.5e-06 [auto_monad_grad]: 2.90002e-06 [auto_monad_eliminator]: 1.808e-05 [cse]: 3.197e-05 [a_3]: 8.113e-05 [Cycle 2]: 0.00111532, [45] [expand_dump_flag]: 2.16e-06 [switch_simplify]: 8.60999e-06 [loop_unroll]: 1.168e-05 [a_1]: 0.00016154 [with_stream_mark]: 2.319e-05 [recompute_prepare]: 7.41999e-06 [updatestate_depend_eliminate]: 3.98999e-06 [updatestate_assign_eliminate]: 2.76e-06 [updatestate_loads_eliminate]: 2.78e-06 [parameter_eliminate]: 2.36e-06 [a_2]: 0.00024046 [accelerated_algorithm]: 8.12e-06 [shard]: 2.30002e-06 [meta_shard_fg_expand]: 2.32001e-06 [shard_inline]: 7.68001e-06 [merge_send_recv]: 8.59e-06 [auto_parallel]: 1.034e-05 [parallel]: 7.75e-06 [flash_sp]: 4.34002e-06 [merge_comm]: 1.441e-05 [allreduce_fusion]: 4.25e-06 [matmul_add_comm_reduction]: 8.58001e-06 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 8.27e-06 [virtual_dataset]: 6.97002e-06 [get_grad_eliminate_]: 5.93002e-06 [virtual_output]: 6.12999e-06 [merge_forward]: 4.64002e-06 [cell_reuse_recompute_pass]: 2.31e-06 [offload_activation]: 9.49e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.584e-05 [merge_recompute_call_nodes]: 1.15001e-06 [before_grad]: 1.033e-05 [set_forward_comm_id_for_comm_node_pass]: 6.46999e-06 [meta_fg_expand]: 2.49001e-06 [flash_sp_send_recv_attached]: 1.64e-06 [receive_attached]: 2.58e-06 [after_resolve]: 1.267e-05 [a_after_grad]: 1.073e-05 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 2.29001e-06 [auto_monad_grad]: 2.17999e-06 [auto_monad_eliminator]: 9.84999e-06 [cse]: 1.812e-05 [a_3]: 5.176e-05 [py_interpret_to_execute_after_opt_a]: 1.948e-05 [slice_cell_reuse_recomputed_activation]: 5.00999e-06 [rewriter_after_opt_a]: 4.619e-05 [convert_after_rewriter]: 1.067e-05 [order_py_execute_after_rewriter]: 8.20999e-06 [mutable_eliminate]: 0.00080385 [opt_b]: 0.00029794, [1] [Cycle 1]: 0.00028604, [7] [b_1]: 0.000179 [b_2]: 9.64999e-06 [updatestate_depend_eliminate]: 9.68002e-06 [updatestate_assign_eliminate]: 3.09001e-06 [updatestate_loads_eliminate]: 3.03e-06 [renormalize]: 1.06002e-06 [cse]: 2.299e-05 [optimize_parallel_all_gather_comm]: 2.329e-05 [overlap_param_gather]: 5.61e-06 [cconv]: 3.998e-05 [loop_unroll]: 0.00050675 [opt_after_cconv]: 0.00017986, [1] [Cycle 1]: 0.00017001, [7] [c_1]: 3.235e-05 [parameter_eliminate]: 5.01997e-06 [updatestate_depend_eliminate]: 5.46998e-06 [updatestate_assign_eliminate]: 2.61999e-06 [updatestate_loads_eliminate]: 4.452e-05 [cse]: 2.112e-05 [renormalize]: 9.20001e-07 [remove_dup_value]: 1.838e-05 [tuple_transform]: 0.00010076, [1] [Cycle 1]: 9.308e-05, [4] [d_1]: 5.115e-05 [none_parameter_eliminate]: 2.07001e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 7.78999e-06 [partial_unused_args_eliminate]: 4.76002e-06 [add_recomputation]: 5.615e-05 [cse_after_recomputation]: 2.838e-05, [1] [Cycle 1]: 2.129e-05, [1] [cse]: 1.236e-05 [environ_conv]: 9.21002e-06 [swap_dp_allreduce_reducescatter]: 8.19002e-06 [bias_add_comm_swap]: 6.09001e-06 [label_micro_interleaved_index]: 7.75e-06 [label_fine_grained_interleaved_index]: 5.49998e-06 [merge_cast_opt]: 4.35e-06 [slice_recompute_activation]: 4.70999e-06 [micro_interleaved_order_control]: 5.05999e-06 [assign_add_opt]: 3.55e-06 [ForceFp32Comm]: 3.56999e-06 [remove_cast_before_assign_add]: 3.41999e-06 [full_micro_interleaved_order_control]: 4.65001e-06 [reorder_send_recv_between_fp_bp]: 6.09999e-06 [comm_op_add_attrs]: 3.98999e-06 [add_comm_op_reuse_tag]: 3.63e-06 [interleave_split_concat_branches]: 3.72002e-06 [interleave_parallel_branches]: 3.68999e-06 [overlap_opt_shard_in_pipeline]: 4.02e-06 [overlap_opt_shard_grad_in_pipeline]: 4.23001e-06 [control_data_broadcast_order]: 1.794e-05 [grouped_pairwise_exchange_alltoall]: 4.07e-06 [offloading_packed_experts]: 7.01999e-06 [overlap_recompute_and_grad_model_parallel]: 7.73001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.81999e-06 [overlap_recompute_allgather_and_fa_grad]: 4.4e-06 [overlap_recompute_comm]: 4.85999e-06 [overlap_grad_ring_attention]: 6.78998e-06 [overlap_grad_flash_sp]: 2.561e-05 [begin_end_overlap_inline]: 3.35e-06 [split_matmul_comm_elemetwise]: 4.92e-06 [split_layernorm_comm]: 4.65001e-06 [handle_group_info]: 3.46999e-06 [symbol_engine_optimizer]: 0.00010586, [1] [Cycle 1]: 9.869e-05, [6] [build]: 4.43001e-06 [elim_shapecalc]: 1.065e-05 [elim_not_effective]: 1.553e-05 [opt_reshape]: 8.55001e-06 [fold_const_symbol]: 1.127e-05 [renormalize]: 3.09985e-07 [detach_backward]: 3.85e-06 [pipeline_parallel_scheduler]: 1.72001e-06 [auto_monad_reorder]: 2.128e-05 [get_jit_bprop_graph]: 1.81e-06 [rewriter_after_jit_bprop_graph]: 5.29e-06 [opt_after_jit_grad]: 0.0005626 [validate]: 4.746e-05 Sums bootstrap : 0.000477s : 3.63% type_inference : 0.006236s : 47.49% event_method : 0.000020s : 0.15% auto_monad : 0.000229s : 1.74% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000046s : 0.35% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000038s : 0.29% optimize.rewriter_before_opt_a : 0.000122s : 0.93% optimize.opt_a.expand_dump_flag : 0.000006s : 0.05% optimize.opt_a.switch_simplify : 0.000056s : 0.43% optimize.opt_a.loop_unroll : 0.000044s : 0.33% optimize.opt_a.a_1 : 0.000900s : 6.85% optimize.opt_a.with_stream_mark : 0.000047s : 0.36% optimize.opt_a.recompute_prepare : 0.000019s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000005s : 0.03% optimize.opt_a.a_2 : 0.000364s : 2.77% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.11% optimize.opt_a.merge_send_recv : 0.000019s : 0.14% optimize.opt_a.auto_parallel : 0.000018s : 0.14% optimize.opt_a.parallel : 0.000029s : 0.22% optimize.opt_a.flash_sp : 0.000015s : 0.11% optimize.opt_a.merge_comm : 0.000019s : 0.14% optimize.opt_a.allreduce_fusion : 0.000008s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.14% optimize.opt_a.virtual_dataset : 0.000014s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.10% optimize.opt_a.virtual_output : 0.000013s : 0.10% optimize.opt_a.merge_forward : 0.000010s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.24% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.16% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000025s : 0.19% optimize.opt_a.a_after_grad : 0.000022s : 0.16% optimize.opt_a.renormalize : 0.000983s : 7.48% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.21% optimize.opt_a.cse : 0.000050s : 0.38% optimize.opt_a.a_3 : 0.000133s : 1.01% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000046s : 0.35% optimize.convert_after_rewriter : 0.000011s : 0.08% optimize.order_py_execute_after_rewriter : 0.000008s : 0.06% optimize.mutable_eliminate : 0.000804s : 6.12% optimize.opt_b.b_1 : 0.000179s : 1.36% optimize.opt_b.b_2 : 0.000010s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.18% optimize.overlap_param_gather : 0.000006s : 0.04% optimize.cconv : 0.000040s : 0.30% optimize.loop_unroll : 0.000507s : 3.86% optimize.opt_after_cconv.c_1 : 0.000032s : 0.25% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000045s : 0.34% optimize.opt_after_cconv.cse : 0.000021s : 0.16% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.14% optimize.tuple_transform.d_1 : 0.000051s : 0.39% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000056s : 0.43% optimize.cse_after_recomputation.cse : 0.000012s : 0.09% optimize.environ_conv : 0.000009s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.06% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000018s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000026s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000005s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.16% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000563s : 4.28% validate : 0.000047s : 0.36% Time group info: ------[substitution.] 0.000284 34 15.19% : 0.000043s : 6: substitution.arithmetic_simplify 0.97% : 0.000003s : 2: substitution.elim_not_effective 0.52% : 0.000001s : 2: substitution.fold_const_symbol 2.39% : 0.000007s : 4: substitution.graph_param_transform 68.94% : 0.000196s : 4: substitution.inline 1.67% : 0.000005s : 4: substitution.j_node_and_user_rematch 1.90% : 0.000005s : 4: substitution.remove_not_recompute_node 2.19% : 0.000006s : 4: substitution.replace_old_param 6.23% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006177 2 88.03% : 0.005438s : 1: type_inference.infer 11.97% : 0.000739s : 1: type_inference.specialize ------[replace.] 0.000069 8 64.27% : 0.000044s : 4: replace.inline 35.73% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000209 8 92.47% : 0.000193s : 4: match.inline 7.53% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000237 1278 0.89% : 0.000002s : 13: predicate.accumulaten_eliminater 0.73% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 8: predicate.addn_check_dump 0.93% : 0.000002s : 13: predicate.addn_zero_filter 0.69% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.45% : 0.000006s : 21: predicate.arithmetic_simplify 0.95% : 0.000002s : 13: predicate.cast_eliminate 0.57% : 0.000001s : 8: predicate.check_bprop_eliminate 0.90% : 0.000002s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.60% : 0.000001s : 8: predicate.depend_value_elim 0.90% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.30% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.95% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 4: predicate.elim_not_effective 0.50% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.00% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.07% : 0.000003s : 17: predicate.environ_get_depend_swap 1.56% : 0.000004s : 25: predicate.environ_get_eliminate 0.97% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.45% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.36% : 0.000006s : 21: predicate.float_depend_g_call 0.79% : 0.000002s : 8: predicate.float_environ_get_switch 0.88% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.50% : 0.000001s : 8: predicate.get_grad_eliminate 0.28% : 0.000001s : 4: predicate.graph_param_transform 0.60% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.42% : 0.000015s : 58: predicate.inline 0.75% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.73% : 0.000002s : 8: predicate.less_batch_normalization 1.66% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.26% : 0.000005s : 38: predicate.load_eliminater 1.07% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.21% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.60% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.67% : 0.000002s : 8: predicate.merge_addn 0.68% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 8: predicate.mini_step_allgather_replace 1.11% : 0.000003s : 13: predicate.minmaximum_grad 1.52% : 0.000004s : 4: predicate.mutable_eliminate 0.46% : 0.000001s : 4: predicate.opt_reshape 0.38% : 0.000001s : 4: predicate.parallel_virtual_node 1.89% : 0.000004s : 21: predicate.partial_defer_inline 1.50% : 0.000004s : 21: predicate.partial_eliminate 1.04% : 0.000002s : 13: predicate.print_const_string_wrapper 0.60% : 0.000001s : 8: predicate.reduce_all_const_elim 1.27% : 0.000003s : 13: predicate.reduce_eliminate 2.30% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.59% : 0.000001s : 8: predicate.remove_not_recompute_node 1.18% : 0.000003s : 25: predicate.replace_applicator 0.49% : 0.000001s : 8: predicate.replace_old_param 0.24% : 0.000001s : 4: predicate.reset_defer_inline 0.91% : 0.000002s : 13: predicate.reshape_eliminate 0.61% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 4: predicate.row_tensor_eliminate 0.85% : 0.000002s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.01% : 0.000002s : 8: predicate.shard_identity_eliminate 0.57% : 0.000001s : 8: predicate.special_op_eliminate 0.57% : 0.000001s : 8: predicate.specialize_transform 1.23% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.51% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.38% : 0.000003s : 21: predicate.switch_defer_inline 1.86% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.10% : 0.000012s : 67: predicate.switch_simplify 0.83% : 0.000002s : 13: predicate.tile_eliminate 1.01% : 0.000002s : 13: predicate.transpose_eliminate 1.77% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.08% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.73% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.36% : 0.000006s : 29: predicate.tuple_list_set_item_eliminator 1.66% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.45% : 0.000006s : 38: predicate.updatestate_pure_node_eliminater 2.85% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 4: predicate.value_based_eliminate 0.61% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 8: predicate.virtual_output_eliminate 0.22% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000635 11 52.43% : 0.000333s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.57% : 0.000302s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.319051 192 0.00% : 0.000006s : 1: ForceFp32Comm 46.19% : 0.147374s : 1: add_attr 46.18% : 0.147350s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.02% : 0.000060s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.08% : 0.000244s : 1: auto_monad 0.01% : 0.000029s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.16% : 0.000518s : 1: bootstrap 0.01% : 0.000044s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000021s : 1: control_data_broadcast_order 0.00% : 0.000014s : 1: convert_after_rewriter 0.01% : 0.000032s : 1: cse_after_recomputation 0.00% : 0.000007s : 1: dataset_repeat_opt 0.01% : 0.000019s : 1: detach_backward 0.00% : 0.000012s : 1: environ_conv 0.01% : 0.000031s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.16% : 0.000514s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.25% : 0.000813s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.00% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000022s : 1: opt.transform.mutable_eliminate 0.44% : 0.001398s : 78: opt.transform.opt_a 0.01% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000110s : 28: opt.transform.opt_b 0.02% : 0.000057s : 2: opt.transform.opt_trans_graph 0.01% : 0.000042s : 4: opt.transform.symbol_engine_opt 1.21% : 0.003871s : 1: opt_a 0.06% : 0.000184s : 1: opt_after_cconv 0.18% : 0.000575s : 1: opt_after_jit_grad 0.09% : 0.000302s : 1: opt_b 2.24% : 0.007149s : 1: optimize 0.01% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000011s : 1: order_py_execute_after_rewriter 0.01% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000012s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000054s : 1: pre_auto_parallel 0.01% : 0.000042s : 1: py_interpret_to_execute 0.01% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000022s : 1: remove_dup_value 0.17% : 0.000532s : 1: renormalize.infer 0.14% : 0.000440s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000050s : 1: rewriter_after_opt_a 0.04% : 0.000126s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.00% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000109s : 1: symbol_engine_optimizer 0.03% : 0.000104s : 1: tuple_transform 1.97% : 0.006282s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:26.497.890 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.15832, [21] [bootstrap]: 0.00048994 [type_inference]: 0.00625635 [event_method]: 2.158e-05 [auto_monad]: 6.877e-05 [graph_reusing]: 5.82001e-06 [inline]: 2.58e-06 [add_attr]: 0.144842, [1] [add_attr_with_inline]: 0.144827, [1] [Cycle 1]: 8.679e-05, [2] [tag_attr]: 2.766e-05 [meta_addattr_fg_expand]: 6.33e-06 [parallel-infer-symbol]: 4.1e-06 [pre_auto_parallel]: 4.935e-05 [insert-virtual-dataset]: 2.53998e-06 [parallel-infer-symbol-second]: 1.39e-06 [dataset_repeat_opt]: 2.63e-06 [pipeline_split]: 2.07999e-06 [optimize]: 0.00573478, [53] [py_interpret_to_execute]: 5.142e-05 [rewriter_before_opt_a]: 0.00011332 [opt_a]: 0.00319434, [2] [Cycle 1]: 0.00243579, [45] [expand_dump_flag]: 3.86999e-06 [switch_simplify]: 4.632e-05 [loop_unroll]: 3.179e-05 [a_1]: 0.00072233 [with_stream_mark]: 2.674e-05 [recompute_prepare]: 1.296e-05 [updatestate_depend_eliminate]: 4.77998e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 2.92002e-06 [parameter_eliminate]: 1.97001e-06 [a_2]: 9.363e-05 [accelerated_algorithm]: 8.80999e-06 [shard]: 2.36e-06 [meta_shard_fg_expand]: 2.40002e-06 [shard_inline]: 7.73001e-06 [merge_send_recv]: 1.028e-05 [auto_parallel]: 8.2e-06 [parallel]: 2.119e-05 [flash_sp]: 1.035e-05 [merge_comm]: 4.37e-06 [allreduce_fusion]: 3.3e-06 [matmul_add_comm_reduction]: 1.068e-05 [allreduce_slice_to_reducescatter]: 9.69972e-07 [virtual_shard_identity]: 9.22001e-06 [virtual_dataset]: 7.56001e-06 [get_grad_eliminate_]: 6.19001e-06 [virtual_output]: 6.74999e-06 [merge_forward]: 4.03001e-06 [cell_reuse_recompute_pass]: 2.09e-06 [offload_activation]: 1.134e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.36e-05 [merge_recompute_call_nodes]: 1.62001e-06 [before_grad]: 1.226e-05 [set_forward_comm_id_for_comm_node_pass]: 3.93999e-06 [meta_fg_expand]: 2.89999e-06 [flash_sp_send_recv_attached]: 2.51e-06 [receive_attached]: 2.53003e-06 [after_resolve]: 1.275e-05 [a_after_grad]: 1.074e-05 [renormalize]: 0.00088191 [add_forward_monad_depend]: 7.72998e-06 [auto_monad_grad]: 3.18e-06 [auto_monad_eliminator]: 2.024e-05 [cse]: 3.413e-05 [a_3]: 5.869e-05 [Cycle 2]: 0.00074595, [45] [expand_dump_flag]: 2.14e-06 [switch_simplify]: 8.69e-06 [loop_unroll]: 6.73e-06 [a_1]: 0.00015106 [with_stream_mark]: 1.834e-05 [recompute_prepare]: 7.85e-06 [updatestate_depend_eliminate]: 4.02e-06 [updatestate_assign_eliminate]: 2.69999e-06 [updatestate_loads_eliminate]: 3.4e-06 [parameter_eliminate]: 1.10999e-06 [a_2]: 8.054e-05 [accelerated_algorithm]: 6.75998e-06 [shard]: 2.37001e-06 [meta_shard_fg_expand]: 1.94999e-06 [shard_inline]: 6.75998e-06 [merge_send_recv]: 7.53e-06 [auto_parallel]: 8.25e-06 [parallel]: 6.58e-06 [flash_sp]: 4.53999e-06 [merge_comm]: 3.97998e-06 [allreduce_fusion]: 3.9e-06 [matmul_add_comm_reduction]: 8.83001e-06 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 8.11002e-06 [virtual_dataset]: 6.11998e-06 [get_grad_eliminate_]: 5.50001e-06 [virtual_output]: 5.79e-06 [merge_forward]: 4.18001e-06 [cell_reuse_recompute_pass]: 2.59999e-06 [offload_activation]: 9.94001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.336e-05 [merge_recompute_call_nodes]: 1.44998e-06 [before_grad]: 1.114e-05 [set_forward_comm_id_for_comm_node_pass]: 3.45e-06 [meta_fg_expand]: 2.27001e-06 [flash_sp_send_recv_attached]: 1.20999e-06 [receive_attached]: 2.55002e-06 [after_resolve]: 1.26e-05 [a_after_grad]: 9.71e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 2.48e-06 [auto_monad_grad]: 1.64998e-06 [auto_monad_eliminator]: 1.142e-05 [cse]: 2.025e-05 [a_3]: 3.788e-05 [py_interpret_to_execute_after_opt_a]: 1.58e-05 [slice_cell_reuse_recomputed_activation]: 2.22999e-06 [rewriter_after_opt_a]: 4.856e-05 [convert_after_rewriter]: 9.09e-06 [order_py_execute_after_rewriter]: 5.44998e-06 [mutable_eliminate]: 0.00079145 [opt_b]: 0.00023001, [1] [Cycle 1]: 0.00022124, [7] [b_1]: 0.00013198 [b_2]: 8.90001e-06 [updatestate_depend_eliminate]: 1.003e-05 [updatestate_assign_eliminate]: 2.81e-06 [updatestate_loads_eliminate]: 2.86e-06 [renormalize]: 1.05999e-06 [cse]: 2.61e-05 [optimize_parallel_all_gather_comm]: 2.129e-05 [overlap_param_gather]: 2.26e-06 [cconv]: 3.638e-05 [loop_unroll]: 0.00047771 [opt_after_cconv]: 0.00011841, [1] [Cycle 1]: 0.00011177, [7] [c_1]: 3.593e-05 [parameter_eliminate]: 6.40997e-06 [updatestate_depend_eliminate]: 6.33e-06 [updatestate_assign_eliminate]: 3.07002e-06 [updatestate_loads_eliminate]: 2.37001e-06 [cse]: 2.078e-05 [renormalize]: 5.59987e-07 [remove_dup_value]: 1.612e-05 [tuple_transform]: 8.35e-05, [1] [Cycle 1]: 7.837e-05, [4] [d_1]: 4.893e-05 [none_parameter_eliminate]: 1.87999e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 6.84001e-06 [partial_unused_args_eliminate]: 1.91e-06 [add_recomputation]: 5.47e-05 [cse_after_recomputation]: 2.341e-05, [1] [Cycle 1]: 1.839e-05, [1] [cse]: 1.196e-05 [environ_conv]: 5.27999e-06 [swap_dp_allreduce_reducescatter]: 5.65001e-06 [bias_add_comm_swap]: 3.24001e-06 [label_micro_interleaved_index]: 4.3e-06 [label_fine_grained_interleaved_index]: 2.61e-06 [merge_cast_opt]: 1.24e-06 [slice_recompute_activation]: 2.49001e-06 [micro_interleaved_order_control]: 2.17999e-06 [assign_add_opt]: 1.30999e-06 [ForceFp32Comm]: 8.49977e-07 [remove_cast_before_assign_add]: 1.04003e-06 [full_micro_interleaved_order_control]: 2.56e-06 [reorder_send_recv_between_fp_bp]: 2.81e-06 [comm_op_add_attrs]: 1.12e-06 [add_comm_op_reuse_tag]: 1.35999e-06 [interleave_split_concat_branches]: 1.38002e-06 [interleave_parallel_branches]: 1.07998e-06 [overlap_opt_shard_in_pipeline]: 1.74e-06 [overlap_opt_shard_grad_in_pipeline]: 1.84e-06 [control_data_broadcast_order]: 1.491e-05 [grouped_pairwise_exchange_alltoall]: 2.16e-06 [offloading_packed_experts]: 3.88001e-06 [overlap_recompute_and_grad_model_parallel]: 5.22e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.24e-06 [overlap_recompute_allgather_and_fa_grad]: 1.44e-06 [overlap_recompute_comm]: 2.16e-06 [overlap_grad_ring_attention]: 4.29002e-06 [overlap_grad_flash_sp]: 2.263e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.63998e-06 [split_layernorm_comm]: 1.72001e-06 [handle_group_info]: 1.08001e-06 [symbol_engine_optimizer]: 8.488e-05, [1] [Cycle 1]: 7.972e-05, [6] [build]: 4.87e-06 [elim_shapecalc]: 1.084e-05 [elim_not_effective]: 1.434e-05 [opt_reshape]: 8.28999e-06 [fold_const_symbol]: 1.081e-05 [renormalize]: 1.79978e-07 [detach_backward]: 2.21e-06 [pipeline_parallel_scheduler]: 1.86e-06 [auto_monad_reorder]: 1.823e-05 [get_jit_bprop_graph]: 1.87999e-06 [rewriter_after_jit_bprop_graph]: 5.79e-06 [opt_after_jit_grad]: 0.00051277 [validate]: 4.489e-05 Sums bootstrap : 0.000490s : 3.97% type_inference : 0.006256s : 50.69% event_method : 0.000022s : 0.17% auto_monad : 0.000069s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.22% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000049s : 0.40% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000003s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000051s : 0.42% optimize.rewriter_before_opt_a : 0.000113s : 0.92% optimize.opt_a.expand_dump_flag : 0.000006s : 0.05% optimize.opt_a.switch_simplify : 0.000055s : 0.45% optimize.opt_a.loop_unroll : 0.000039s : 0.31% optimize.opt_a.a_1 : 0.000873s : 7.08% optimize.opt_a.with_stream_mark : 0.000045s : 0.37% optimize.opt_a.recompute_prepare : 0.000021s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000174s : 1.41% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.13% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000014s : 0.12% optimize.opt_a.merge_send_recv : 0.000018s : 0.14% optimize.opt_a.auto_parallel : 0.000016s : 0.13% optimize.opt_a.parallel : 0.000028s : 0.23% optimize.opt_a.flash_sp : 0.000015s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.14% optimize.opt_a.virtual_dataset : 0.000014s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.09% optimize.opt_a.virtual_output : 0.000013s : 0.10% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.04% optimize.opt_a.offload_activation : 0.000021s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.22% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000023s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000025s : 0.21% optimize.opt_a.a_after_grad : 0.000020s : 0.17% optimize.opt_a.renormalize : 0.000882s : 7.15% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.08% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.26% optimize.opt_a.cse : 0.000054s : 0.44% optimize.opt_a.a_3 : 0.000097s : 0.78% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000049s : 0.39% optimize.convert_after_rewriter : 0.000009s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.04% optimize.mutable_eliminate : 0.000791s : 6.41% optimize.opt_b.b_1 : 0.000132s : 1.07% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000026s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000036s : 0.29% optimize.loop_unroll : 0.000478s : 3.87% optimize.opt_after_cconv.c_1 : 0.000036s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000021s : 0.17% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.13% optimize.tuple_transform.d_1 : 0.000049s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000055s : 0.44% optimize.cse_after_recomputation.cse : 0.000012s : 0.10% optimize.environ_conv : 0.000005s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000015s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.03% optimize.overlap_grad_flash_sp : 0.000023s : 0.18% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.15% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000513s : 4.15% validate : 0.000045s : 0.36% Time group info: ------[substitution.] 0.000269 34 15.18% : 0.000041s : 6: substitution.arithmetic_simplify 0.71% : 0.000002s : 2: substitution.elim_not_effective 0.71% : 0.000002s : 2: substitution.fold_const_symbol 2.41% : 0.000006s : 4: substitution.graph_param_transform 68.45% : 0.000184s : 4: substitution.inline 1.71% : 0.000005s : 4: substitution.j_node_and_user_rematch 1.94% : 0.000005s : 4: substitution.remove_not_recompute_node 2.30% : 0.000006s : 4: substitution.replace_old_param 6.59% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006186 2 88.14% : 0.005452s : 1: type_inference.infer 11.86% : 0.000734s : 1: type_inference.specialize ------[replace.] 0.000070 8 64.56% : 0.000045s : 4: replace.inline 35.44% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000197 8 91.99% : 0.000181s : 4: match.inline 8.01% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000226 1278 1.11% : 0.000003s : 13: predicate.accumulaten_eliminater 0.66% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 8: predicate.addn_check_dump 0.99% : 0.000002s : 13: predicate.addn_zero_filter 0.74% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.52% : 0.000006s : 21: predicate.arithmetic_simplify 0.96% : 0.000002s : 13: predicate.cast_eliminate 0.91% : 0.000002s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.60% : 0.000001s : 8: predicate.depend_value_elim 0.89% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.14% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.82% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.89% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 4: predicate.elim_not_effective 0.41% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_depend_swap 1.66% : 0.000004s : 25: predicate.environ_get_eliminate 1.14% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.30% : 0.000005s : 21: predicate.float_depend_g_call 0.47% : 0.000001s : 8: predicate.float_environ_get_switch 0.92% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.59% : 0.000001s : 8: predicate.get_grad_eliminate 0.28% : 0.000001s : 4: predicate.graph_param_transform 0.53% : 0.000001s : 8: predicate.incorporate_call 0.44% : 0.000001s : 8: predicate.incorporate_call_switch 6.58% : 0.000015s : 58: predicate.inline 0.60% : 0.000001s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.88% : 0.000002s : 8: predicate.less_batch_normalization 1.68% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.50% : 0.000006s : 38: predicate.load_eliminater 0.99% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.47% : 0.000006s : 34: predicate.loop_unroll_before_grad 1.63% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.65% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 13: predicate.minmaximum_grad 1.39% : 0.000003s : 4: predicate.mutable_eliminate 0.37% : 0.000001s : 4: predicate.opt_reshape 0.39% : 0.000001s : 4: predicate.parallel_virtual_node 1.71% : 0.000004s : 21: predicate.partial_defer_inline 1.60% : 0.000004s : 21: predicate.partial_eliminate 0.79% : 0.000002s : 13: predicate.print_const_string_wrapper 0.65% : 0.000001s : 8: predicate.reduce_all_const_elim 1.31% : 0.000003s : 13: predicate.reduce_eliminate 2.42% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.55% : 0.000001s : 8: predicate.remove_not_recompute_node 1.61% : 0.000004s : 25: predicate.replace_applicator 0.55% : 0.000001s : 8: predicate.replace_old_param 0.42% : 0.000001s : 4: predicate.reset_defer_inline 0.85% : 0.000002s : 13: predicate.reshape_eliminate 0.71% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 4: predicate.row_tensor_eliminate 0.95% : 0.000002s : 8: predicate.same_eliminate 0.49% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.71% : 0.000002s : 8: predicate.shard_identity_eliminate 0.73% : 0.000002s : 8: predicate.special_op_eliminate 0.61% : 0.000001s : 8: predicate.specialize_transform 1.08% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.41% : 0.000003s : 21: predicate.switch_defer_inline 1.89% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.98% : 0.000011s : 67: predicate.switch_simplify 0.99% : 0.000002s : 13: predicate.tile_eliminate 1.12% : 0.000003s : 13: predicate.transpose_eliminate 1.47% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.34% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.13% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.57% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.35% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.64% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.29% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.79% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 0.82% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.59% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000627 11 57.63% : 0.000361s : 5: func_graph_cloner_run.FuncGraphClonerGraph 42.37% : 0.000266s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.311226 192 0.00% : 0.000004s : 1: ForceFp32Comm 46.54% : 0.144849s : 1: add_attr 46.54% : 0.144833s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.02% : 0.000059s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.02% : 0.000074s : 1: auto_monad 0.01% : 0.000022s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.17% : 0.000523s : 1: bootstrap 0.01% : 0.000040s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000018s : 1: control_data_broadcast_order 0.00% : 0.000012s : 1: convert_after_rewriter 0.01% : 0.000026s : 1: cse_after_recomputation 0.01% : 0.000024s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000009s : 1: environ_conv 0.01% : 0.000030s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.16% : 0.000488s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.26% : 0.000803s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000024s : 1: opt.transform.mutable_eliminate 0.43% : 0.001336s : 78: opt.transform.opt_a 0.01% : 0.000034s : 1: opt.transform.opt_after_cconv 0.01% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000105s : 28: opt.transform.opt_b 0.02% : 0.000054s : 2: opt.transform.opt_trans_graph 0.01% : 0.000040s : 4: opt.transform.symbol_engine_opt 1.03% : 0.003198s : 1: opt_a 0.04% : 0.000122s : 1: opt_after_cconv 0.17% : 0.000524s : 1: opt_after_jit_grad 0.08% : 0.000234s : 1: opt_b 1.84% : 0.005741s : 1: optimize 0.01% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000009s : 1: order_py_execute_after_rewriter 0.01% : 0.000026s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000006s : 1: pipeline_split 0.02% : 0.000055s : 1: pre_auto_parallel 0.02% : 0.000056s : 1: py_interpret_to_execute 0.01% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 0.15% : 0.000474s : 1: renormalize.infer 0.13% : 0.000397s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000054s : 1: rewriter_after_opt_a 0.04% : 0.000119s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000088s : 1: symbol_engine_optimizer 0.03% : 0.000086s : 1: tuple_transform 2.02% : 0.006298s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:28.673.069 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:28.673.359 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.187923, [21] [bootstrap]: 0.00043307 [type_inference]: 0.00699205 [event_method]: 2.087e-05 [auto_monad]: 9.046e-05 [graph_reusing]: 6.48998e-06 [inline]: 2.78e-06 [add_attr]: 0.00368994, [1] [add_attr_with_inline]: 0.00367703, [1] [Cycle 1]: 9.148e-05, [2] [tag_attr]: 2.424e-05 [meta_addattr_fg_expand]: 6.81999e-06 [parallel-infer-symbol]: 3.86999e-06 [pre_auto_parallel]: 4.289e-05 [insert-virtual-dataset]: 3.28e-06 [parallel-infer-symbol-second]: 9.50007e-07 [dataset_repeat_opt]: 1.99999e-06 [pipeline_split]: 2.05002e-06 [optimize]: 0.175104, [53] [py_interpret_to_execute]: 3.775e-05 [rewriter_before_opt_a]: 0.0001041 [opt_a]: 0.0230299, [2] [Cycle 1]: 0.00905235, [45] [expand_dump_flag]: 3.47002e-06 [switch_simplify]: 4.134e-05 [loop_unroll]: 3.08e-05 [a_1]: 0.00073164 [with_stream_mark]: 2.394e-05 [recompute_prepare]: 9.62999e-06 [updatestate_depend_eliminate]: 4.17e-06 [updatestate_assign_eliminate]: 3.54002e-06 [updatestate_loads_eliminate]: 3.16999e-06 [parameter_eliminate]: 2.04e-06 [a_2]: 0.00012469 [accelerated_algorithm]: 8.87e-06 [shard]: 2.28002e-06 [meta_shard_fg_expand]: 2.54999e-06 [shard_inline]: 7.92e-06 [merge_send_recv]: 9.65002e-06 [auto_parallel]: 8.39998e-06 [parallel]: 2.295e-05 [flash_sp]: 1.036e-05 [merge_comm]: 4.33999e-06 [allreduce_fusion]: 4.23001e-06 [matmul_add_comm_reduction]: 1.125e-05 [allreduce_slice_to_reducescatter]: 9.80013e-07 [virtual_shard_identity]: 1.13e-05 [virtual_dataset]: 8.33999e-06 [get_grad_eliminate_]: 7.53e-06 [virtual_output]: 6.86001e-06 [merge_forward]: 6.19999e-06 [cell_reuse_recompute_pass]: 1.84e-06 [offload_activation]: 1.396e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.72e-05 [merge_recompute_call_nodes]: 1.71e-06 [before_grad]: 1.196e-05 [set_forward_comm_id_for_comm_node_pass]: 4.23999e-06 [meta_fg_expand]: 3.11001e-06 [flash_sp_send_recv_attached]: 3.25e-06 [receive_attached]: 2.36998e-06 [after_resolve]: 1.325e-05 [a_after_grad]: 1.153e-05 [renormalize]: 0.00722672 [add_forward_monad_depend]: 1.074e-05 [auto_monad_grad]: 2.96999e-06 [auto_monad_eliminator]: 2.332e-05 [cse]: 3.181e-05 [a_3]: 8.051e-05 [Cycle 2]: 0.0139568, [45] [expand_dump_flag]: 2.81e-06 [switch_simplify]: 9.15999e-06 [loop_unroll]: 7.23e-06 [a_1]: 0.00016611 [with_stream_mark]: 5.258e-05 [recompute_prepare]: 3.628e-05 [updatestate_depend_eliminate]: 6.51e-06 [updatestate_assign_eliminate]: 3.71999e-06 [updatestate_loads_eliminate]: 3.04999e-06 [parameter_eliminate]: 2.79001e-06 [a_2]: 0.00015306 [accelerated_algorithm]: 9.49e-06 [shard]: 4.43001e-06 [meta_shard_fg_expand]: 2.96001e-06 [shard_inline]: 6.98e-06 [merge_send_recv]: 1.043e-05 [auto_parallel]: 9.81998e-06 [parallel]: 1.049e-05 [flash_sp]: 4.41002e-06 [merge_comm]: 3.7e-06 [allreduce_fusion]: 3.87998e-06 [matmul_add_comm_reduction]: 1.19e-05 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 8.46002e-06 [virtual_dataset]: 6.91999e-06 [get_grad_eliminate_]: 6.54001e-06 [virtual_output]: 6.54999e-06 [merge_forward]: 4.52e-06 [cell_reuse_recompute_pass]: 3.26999e-06 [offload_activation]: 1.097e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.756e-05 [merge_recompute_call_nodes]: 1.77999e-06 [before_grad]: 7.044e-05 [set_forward_comm_id_for_comm_node_pass]: 6.14001e-06 [meta_fg_expand]: 3.32997e-06 [flash_sp_send_recv_attached]: 1.93002e-06 [receive_attached]: 2.39999e-06 [after_resolve]: 3.751e-05 [a_after_grad]: 1.405e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 3.58e-06 [auto_monad_grad]: 3.46001e-06 [auto_monad_eliminator]: 2.169e-05 [cse]: 3.468e-05 [a_3]: 5.634e-05 [py_interpret_to_execute_after_opt_a]: 2.419e-05 [slice_cell_reuse_recomputed_activation]: 5.49998e-06 [rewriter_after_opt_a]: 4.943e-05 [convert_after_rewriter]: 1.045e-05 [order_py_execute_after_rewriter]: 8.58001e-06 [mutable_eliminate]: 0.0008052 [opt_b]: 0.00030064, [1] [Cycle 1]: 0.0002883, [7] [b_1]: 0.00018137 [b_2]: 9.51e-06 [updatestate_depend_eliminate]: 8.90001e-06 [updatestate_assign_eliminate]: 3.02002e-06 [updatestate_loads_eliminate]: 2.62001e-06 [renormalize]: 9.49978e-07 [cse]: 2.395e-05 [optimize_parallel_all_gather_comm]: 2.349e-05 [overlap_param_gather]: 5.24e-06 [cconv]: 3.896e-05 [loop_unroll]: 0.0004791 [opt_after_cconv]: 0.149334, [1] [Cycle 1]: 0.149316, [7] [c_1]: 3.311e-05 [parameter_eliminate]: 4.72998e-06 [updatestate_depend_eliminate]: 0.149082 [updatestate_assign_eliminate]: 8.03999e-06 [updatestate_loads_eliminate]: 4.27e-06 [cse]: 4.552e-05 [renormalize]: 9.39996e-07 [remove_dup_value]: 2.247e-05 [tuple_transform]: 0.00014803, [1] [Cycle 1]: 0.00013498, [4] [d_1]: 8.489e-05 [none_parameter_eliminate]: 6.26e-06 [renormalize]: 2.9002e-07 [switch_simplify]: 8.99e-06 [partial_unused_args_eliminate]: 5.30001e-06 [add_recomputation]: 7.606e-05 [cse_after_recomputation]: 3.405e-05, [1] [Cycle 1]: 2.55e-05, [1] [cse]: 1.512e-05 [environ_conv]: 1.179e-05 [swap_dp_allreduce_reducescatter]: 9.32001e-06 [bias_add_comm_swap]: 6.45002e-06 [label_micro_interleaved_index]: 1.227e-05 [label_fine_grained_interleaved_index]: 6.17999e-06 [merge_cast_opt]: 4.60001e-06 [slice_recompute_activation]: 5.64998e-06 [micro_interleaved_order_control]: 5.77999e-06 [assign_add_opt]: 4.28999e-06 [ForceFp32Comm]: 3.77002e-06 [remove_cast_before_assign_add]: 4.07e-06 [full_micro_interleaved_order_control]: 5.23002e-06 [reorder_send_recv_between_fp_bp]: 7.43e-06 [comm_op_add_attrs]: 4.42998e-06 [add_comm_op_reuse_tag]: 3.24001e-06 [interleave_split_concat_branches]: 3.73001e-06 [interleave_parallel_branches]: 3.56999e-06 [overlap_opt_shard_in_pipeline]: 3.70998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.63999e-06 [control_data_broadcast_order]: 2.168e-05 [grouped_pairwise_exchange_alltoall]: 4.15e-06 [offloading_packed_experts]: 7.15e-06 [overlap_recompute_and_grad_model_parallel]: 7.55e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.75e-06 [overlap_recompute_allgather_and_fa_grad]: 4.12e-06 [overlap_recompute_comm]: 4.94998e-06 [overlap_grad_ring_attention]: 7.09001e-06 [overlap_grad_flash_sp]: 2.702e-05 [begin_end_overlap_inline]: 3.01001e-06 [split_matmul_comm_elemetwise]: 4.90001e-06 [split_layernorm_comm]: 4.47e-06 [handle_group_info]: 3.31001e-06 [symbol_engine_optimizer]: 0.00010908, [1] [Cycle 1]: 0.0001019, [6] [build]: 4.50001e-06 [elim_shapecalc]: 1.291e-05 [elim_not_effective]: 1.511e-05 [opt_reshape]: 9.05999e-06 [fold_const_symbol]: 1.079e-05 [renormalize]: 3.20026e-07 [detach_backward]: 4.08999e-06 [pipeline_parallel_scheduler]: 1.99e-06 [auto_monad_reorder]: 2.203e-05 [get_jit_bprop_graph]: 1.91e-06 [rewriter_after_jit_bprop_graph]: 7.78001e-06 [opt_after_jit_grad]: 0.00080823 [validate]: 4.568e-05 Sums bootstrap : 0.000433s : 0.26% type_inference : 0.006992s : 4.13% event_method : 0.000021s : 0.01% auto_monad : 0.000090s : 0.05% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000043s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000038s : 0.02% optimize.rewriter_before_opt_a : 0.000104s : 0.06% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000050s : 0.03% optimize.opt_a.loop_unroll : 0.000038s : 0.02% optimize.opt_a.a_1 : 0.000898s : 0.53% optimize.opt_a.with_stream_mark : 0.000077s : 0.05% optimize.opt_a.recompute_prepare : 0.000046s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000278s : 0.16% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.01% optimize.opt_a.shard : 0.000007s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.00% optimize.opt_a.shard_inline : 0.000015s : 0.01% optimize.opt_a.merge_send_recv : 0.000020s : 0.01% optimize.opt_a.auto_parallel : 0.000018s : 0.01% optimize.opt_a.parallel : 0.000033s : 0.02% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.01% optimize.opt_a.virtual_dataset : 0.000015s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000011s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000025s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000082s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000051s : 0.03% optimize.opt_a.a_after_grad : 0.000026s : 0.02% optimize.opt_a.renormalize : 0.007227s : 4.27% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.01% optimize.opt_a.auto_monad_grad : 0.000006s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000045s : 0.03% optimize.opt_a.cse : 0.000066s : 0.04% optimize.opt_a.a_3 : 0.000137s : 0.08% optimize.py_interpret_to_execute_after_opt_a : 0.000024s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000049s : 0.03% optimize.convert_after_rewriter : 0.000010s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.01% optimize.mutable_eliminate : 0.000805s : 0.48% optimize.opt_b.b_1 : 0.000181s : 0.11% optimize.opt_b.b_2 : 0.000010s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.01% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000039s : 0.02% optimize.loop_unroll : 0.000479s : 0.28% optimize.opt_after_cconv.c_1 : 0.000033s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.149082s : 87.99% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000046s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000022s : 0.01% optimize.tuple_transform.d_1 : 0.000085s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000006s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000076s : 0.04% optimize.cse_after_recomputation.cse : 0.000015s : 0.01% optimize.environ_conv : 0.000012s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000012s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.00% optimize.merge_cast_opt : 0.000005s : 0.00% optimize.slice_recompute_activation : 0.000006s : 0.00% optimize.micro_interleaved_order_control : 0.000006s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000007s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000022s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000027s : 0.02% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000008s : 0.00% opt_after_jit_grad : 0.000808s : 0.48% validate : 0.000046s : 0.03% Time group info: ------[substitution.] 0.000309 34 17.64% : 0.000055s : 6: substitution.arithmetic_simplify 0.92% : 0.000003s : 2: substitution.elim_not_effective 0.47% : 0.000001s : 2: substitution.fold_const_symbol 3.25% : 0.000010s : 4: substitution.graph_param_transform 65.58% : 0.000203s : 4: substitution.inline 1.83% : 0.000006s : 4: substitution.j_node_and_user_rematch 2.22% : 0.000007s : 4: substitution.remove_not_recompute_node 2.64% : 0.000008s : 4: substitution.replace_old_param 5.44% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006928 2 88.31% : 0.006118s : 1: type_inference.infer 11.69% : 0.000810s : 1: type_inference.specialize ------[replace.] 0.000068 8 63.74% : 0.000044s : 4: replace.inline 36.26% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000215 8 93.05% : 0.000200s : 4: match.inline 6.95% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000250 1278 0.86% : 0.000002s : 13: predicate.accumulaten_eliminater 0.69% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 8: predicate.addn_check_dump 0.91% : 0.000002s : 13: predicate.addn_zero_filter 0.66% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 3.36% : 0.000008s : 21: predicate.arithmetic_simplify 0.89% : 0.000002s : 13: predicate.cast_eliminate 0.54% : 0.000001s : 8: predicate.check_bprop_eliminate 0.53% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.66% : 0.000002s : 8: predicate.depend_value_elim 0.82% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.93% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.76% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.10% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.28% : 0.000001s : 4: predicate.elim_not_effective 0.56% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.28% : 0.000003s : 17: predicate.environ_add_const_eliminate 0.95% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.98% : 0.000002s : 17: predicate.environ_get_depend_swap 1.64% : 0.000004s : 25: predicate.environ_get_eliminate 0.98% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.29% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.23% : 0.000006s : 21: predicate.float_depend_g_call 0.60% : 0.000001s : 8: predicate.float_environ_get_switch 0.97% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.85% : 0.000002s : 8: predicate.get_grad_eliminate 0.40% : 0.000001s : 4: predicate.graph_param_transform 0.69% : 0.000002s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 5.58% : 0.000014s : 58: predicate.inline 0.90% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.19% : 0.000003s : 8: predicate.less_batch_normalization 1.80% : 0.000005s : 25: predicate.list_to_tuple_eliminator_ 2.21% : 0.000006s : 38: predicate.load_eliminater 0.98% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.17% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.42% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.62% : 0.000002s : 8: predicate.merge_addn 0.63% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.69% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 13: predicate.minmaximum_grad 1.35% : 0.000003s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.38% : 0.000001s : 4: predicate.parallel_virtual_node 1.41% : 0.000004s : 21: predicate.partial_defer_inline 1.46% : 0.000004s : 21: predicate.partial_eliminate 0.77% : 0.000002s : 13: predicate.print_const_string_wrapper 0.68% : 0.000002s : 8: predicate.reduce_all_const_elim 1.05% : 0.000003s : 13: predicate.reduce_eliminate 2.36% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 8: predicate.remove_not_recompute_node 1.27% : 0.000003s : 25: predicate.replace_applicator 0.52% : 0.000001s : 8: predicate.replace_old_param 0.50% : 0.000001s : 4: predicate.reset_defer_inline 1.16% : 0.000003s : 13: predicate.reshape_eliminate 0.79% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.87% : 0.000002s : 8: predicate.same_eliminate 0.86% : 0.000002s : 8: predicate.set_cell_output_no_recompute 1.12% : 0.000003s : 8: predicate.shard_identity_eliminate 0.64% : 0.000002s : 8: predicate.special_op_eliminate 0.96% : 0.000002s : 8: predicate.specialize_transform 1.04% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 1.04% : 0.000003s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.28% : 0.000003s : 21: predicate.switch_defer_inline 1.86% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.46% : 0.000011s : 67: predicate.switch_simplify 0.86% : 0.000002s : 13: predicate.tile_eliminate 0.90% : 0.000002s : 13: predicate.transpose_eliminate 1.76% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.38% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.66% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.81% : 0.000010s : 33: predicate.tuple_list_get_item_eliminator 1.40% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.69% : 0.000007s : 29: predicate.tuple_list_set_item_eliminator 1.85% : 0.000005s : 25: predicate.tuple_to_list_eliminator_ 2.15% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.91% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.51% : 0.000001s : 4: predicate.value_based_eliminate 0.71% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.52% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000696 11 50.90% : 0.000354s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.10% : 0.000342s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.375637 192 0.00% : 0.000007s : 1: ForceFp32Comm 0.99% : 0.003701s : 1: add_attr 0.98% : 0.003681s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.02% : 0.000080s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.03% : 0.000101s : 1: auto_monad 0.01% : 0.000029s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000010s : 1: bias_add_comm_swap 0.13% : 0.000479s : 1: bootstrap 0.01% : 0.000042s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000025s : 1: control_data_broadcast_order 0.00% : 0.000013s : 1: convert_after_rewriter 0.01% : 0.000038s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000021s : 1: detach_backward 0.00% : 0.000015s : 1: environ_conv 0.01% : 0.000032s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000010s : 1: get_jit_bprop_graph 0.00% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.00% : 0.000010s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000009s : 1: label_fine_grained_interleaved_index 0.00% : 0.000015s : 1: label_micro_interleaved_index 0.13% : 0.000485s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.22% : 0.000814s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.00% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000020s : 1: opt.transform.mutable_eliminate 0.41% : 0.001526s : 78: opt.transform.opt_a 0.01% : 0.000032s : 1: opt.transform.opt_after_cconv 0.01% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000113s : 28: opt.transform.opt_b 0.02% : 0.000087s : 2: opt.transform.opt_trans_graph 0.01% : 0.000044s : 4: opt.transform.symbol_engine_opt 6.13% : 0.023034s : 1: opt_a 39.76% : 0.149340s : 1: opt_after_cconv 0.22% : 0.000820s : 1: opt_after_jit_grad 0.08% : 0.000305s : 1: opt_b 46.71% : 0.175465s : 1: optimize 0.01% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000012s : 1: order_py_execute_after_rewriter 0.01% : 0.000030s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.01% : 0.000051s : 1: pre_auto_parallel 0.01% : 0.000042s : 1: py_interpret_to_execute 0.01% : 0.000028s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.01% : 0.000026s : 1: remove_dup_value 1.79% : 0.006706s : 1: renormalize.infer 0.13% : 0.000500s : 1: renormalize.specialize 0.00% : 0.000010s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000052s : 1: rewriter_after_opt_a 0.03% : 0.000108s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000010s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.00% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000112s : 1: symbol_engine_optimizer 0.04% : 0.000151s : 1: tuple_transform 1.87% : 0.007041s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:31.153.09 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.155129, [21] [bootstrap]: 0.00049826 [type_inference]: 0.144272 [event_method]: 1.919e-05 [auto_monad]: 6.269e-05 [graph_reusing]: 6.90002e-06 [inline]: 2.06e-06 [add_attr]: 0.00357906, [1] [add_attr_with_inline]: 0.00356683, [1] [Cycle 1]: 7.311e-05, [2] [tag_attr]: 2.304e-05 [meta_addattr_fg_expand]: 6.64999e-06 [parallel-infer-symbol]: 3.8e-06 [pre_auto_parallel]: 4.068e-05 [insert-virtual-dataset]: 2.67001e-06 [parallel-infer-symbol-second]: 7.80012e-07 [dataset_repeat_opt]: 1.77999e-06 [pipeline_split]: 2.04999e-06 [optimize]: 0.00585425, [53] [py_interpret_to_execute]: 2.904e-05 [rewriter_before_opt_a]: 8.925e-05 [opt_a]: 0.00332712, [2] [Cycle 1]: 0.00242383, [45] [expand_dump_flag]: 2.96999e-06 [switch_simplify]: 4.393e-05 [loop_unroll]: 3.051e-05 [a_1]: 0.00075012 [with_stream_mark]: 2.097e-05 [recompute_prepare]: 1.012e-05 [updatestate_depend_eliminate]: 5.17e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 3.06999e-06 [parameter_eliminate]: 2.07001e-06 [a_2]: 9.491e-05 [accelerated_algorithm]: 7.62002e-06 [shard]: 2.48e-06 [meta_shard_fg_expand]: 2.09999e-06 [shard_inline]: 7.13e-06 [merge_send_recv]: 9.54e-06 [auto_parallel]: 8.55001e-06 [parallel]: 2.064e-05 [flash_sp]: 9.49999e-06 [merge_comm]: 4.55001e-06 [allreduce_fusion]: 3.78001e-06 [matmul_add_comm_reduction]: 1.023e-05 [allreduce_slice_to_reducescatter]: 8.60018e-07 [virtual_shard_identity]: 9.17001e-06 [virtual_dataset]: 6.91001e-06 [get_grad_eliminate_]: 6.48003e-06 [virtual_output]: 6.71999e-06 [merge_forward]: 4.63001e-06 [cell_reuse_recompute_pass]: 1.20999e-06 [offload_activation]: 1.128e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.318e-05 [merge_recompute_call_nodes]: 1.80001e-06 [before_grad]: 1.129e-05 [set_forward_comm_id_for_comm_node_pass]: 4.32e-06 [meta_fg_expand]: 3.55e-06 [flash_sp_send_recv_attached]: 2.94999e-06 [receive_attached]: 2.07001e-06 [after_resolve]: 1.255e-05 [a_after_grad]: 1.067e-05 [renormalize]: 0.0008793 [add_forward_monad_depend]: 6.86999e-06 [auto_monad_grad]: 2.88998e-06 [auto_monad_eliminator]: 1.785e-05 [cse]: 3.202e-05 [a_3]: 5.239e-05 [Cycle 2]: 0.0008901, [45] [expand_dump_flag]: 2.63e-06 [switch_simplify]: 8.79003e-06 [loop_unroll]: 6.53998e-06 [a_1]: 0.00015207 [with_stream_mark]: 1.53e-05 [recompute_prepare]: 7.43999e-06 [updatestate_depend_eliminate]: 3.48e-06 [updatestate_assign_eliminate]: 2.63998e-06 [updatestate_loads_eliminate]: 3.15002e-06 [parameter_eliminate]: 1.84998e-06 [a_2]: 7.989e-05 [accelerated_algorithm]: 6.73e-06 [shard]: 1.37e-06 [meta_shard_fg_expand]: 2.01998e-06 [shard_inline]: 6.53e-06 [merge_send_recv]: 6.17999e-06 [auto_parallel]: 7.55e-06 [parallel]: 7.28999e-06 [flash_sp]: 3.56999e-06 [merge_comm]: 3.56999e-06 [allreduce_fusion]: 3.26001e-06 [matmul_add_comm_reduction]: 7.82e-06 [allreduce_slice_to_reducescatter]: 5.50004e-07 [virtual_shard_identity]: 1.137e-05 [virtual_dataset]: 1.277e-05 [get_grad_eliminate_]: 7.03e-06 [virtual_output]: 6.46e-06 [merge_forward]: 5.88002e-06 [cell_reuse_recompute_pass]: 3.11001e-06 [offload_activation]: 1.19e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.428e-05 [merge_recompute_call_nodes]: 1.42999e-06 [before_grad]: 1.055e-05 [set_forward_comm_id_for_comm_node_pass]: 4.3e-06 [meta_fg_expand]: 2.58e-06 [flash_sp_send_recv_attached]: 1.33002e-06 [receive_attached]: 1.89e-06 [after_resolve]: 1.253e-05 [a_after_grad]: 9.99001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.39001e-06 [auto_monad_grad]: 1.65001e-06 [auto_monad_eliminator]: 1.067e-05 [cse]: 1.949e-05 [a_3]: 4.067e-05 [py_interpret_to_execute_after_opt_a]: 1.405e-05 [slice_cell_reuse_recomputed_activation]: 2.51e-06 [rewriter_after_opt_a]: 4.012e-05 [convert_after_rewriter]: 7.03e-06 [order_py_execute_after_rewriter]: 5.89e-06 [mutable_eliminate]: 0.00077176 [opt_b]: 0.0002682, [1] [Cycle 1]: 0.00025939, [7] [b_1]: 0.00012981 [b_2]: 8.77999e-06 [updatestate_depend_eliminate]: 9.84999e-06 [updatestate_assign_eliminate]: 2.59001e-06 [updatestate_loads_eliminate]: 2.53e-06 [renormalize]: 5.89993e-07 [cse]: 2.684e-05 [optimize_parallel_all_gather_comm]: 1.976e-05 [overlap_param_gather]: 1.92001e-06 [cconv]: 3.399e-05 [loop_unroll]: 0.00049563 [opt_after_cconv]: 0.00014455, [1] [Cycle 1]: 0.0001379, [7] [c_1]: 3.524e-05 [parameter_eliminate]: 5.29e-06 [updatestate_depend_eliminate]: 7.66001e-06 [updatestate_assign_eliminate]: 2.89999e-06 [updatestate_loads_eliminate]: 2.23998e-06 [cse]: 2.122e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 1.579e-05 [tuple_transform]: 8.412e-05, [1] [Cycle 1]: 7.81e-05, [4] [d_1]: 4.934e-05 [none_parameter_eliminate]: 1.68997e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 7.06999e-06 [partial_unused_args_eliminate]: 1.96998e-06 [add_recomputation]: 5.574e-05 [cse_after_recomputation]: 2.202e-05, [1] [Cycle 1]: 1.681e-05, [1] [cse]: 1.122e-05 [environ_conv]: 6.02999e-06 [swap_dp_allreduce_reducescatter]: 5.69999e-06 [bias_add_comm_swap]: 2.62001e-06 [label_micro_interleaved_index]: 5.00001e-06 [label_fine_grained_interleaved_index]: 3.04001e-06 [merge_cast_opt]: 1.53002e-06 [slice_recompute_activation]: 2.34001e-06 [micro_interleaved_order_control]: 2.48e-06 [assign_add_opt]: 1.47999e-06 [ForceFp32Comm]: 1.05001e-06 [remove_cast_before_assign_add]: 1.09e-06 [full_micro_interleaved_order_control]: 2.36998e-06 [reorder_send_recv_between_fp_bp]: 2.80002e-06 [comm_op_add_attrs]: 1.03001e-06 [add_comm_op_reuse_tag]: 1.19e-06 [interleave_split_concat_branches]: 1.19998e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.62001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.34999e-06 [control_data_broadcast_order]: 1.334e-05 [grouped_pairwise_exchange_alltoall]: 2.07999e-06 [offloading_packed_experts]: 3.91999e-06 [overlap_recompute_and_grad_model_parallel]: 4.60999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39003e-06 [overlap_recompute_comm]: 3.04001e-06 [overlap_grad_ring_attention]: 3.91999e-06 [overlap_grad_flash_sp]: 2.25e-05 [begin_end_overlap_inline]: 6.69999e-07 [split_matmul_comm_elemetwise]: 2.37999e-06 [split_layernorm_comm]: 1.86998e-06 [handle_group_info]: 1.11002e-06 [symbol_engine_optimizer]: 8.187e-05, [1] [Cycle 1]: 7.635e-05, [6] [build]: 4.02e-06 [elim_shapecalc]: 1.017e-05 [elim_not_effective]: 1.318e-05 [opt_reshape]: 7.30998e-06 [fold_const_symbol]: 1.12e-05 [renormalize]: 1.69995e-07 [detach_backward]: 2.14999e-06 [pipeline_parallel_scheduler]: 1.89999e-06 [auto_monad_reorder]: 1.761e-05 [get_jit_bprop_graph]: 2.29001e-06 [rewriter_after_jit_bprop_graph]: 6.29999e-06 [opt_after_jit_grad]: 0.00053871 [validate]: 4.92e-05 Sums bootstrap : 0.000498s : 0.33% type_inference : 0.144272s : 95.98% event_method : 0.000019s : 0.01% auto_monad : 0.000063s : 0.04% graph_reusing : 0.000007s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000041s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000029s : 0.02% optimize.rewriter_before_opt_a : 0.000089s : 0.06% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000053s : 0.04% optimize.opt_a.loop_unroll : 0.000037s : 0.02% optimize.opt_a.a_1 : 0.000902s : 0.60% optimize.opt_a.with_stream_mark : 0.000036s : 0.02% optimize.opt_a.recompute_prepare : 0.000018s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000175s : 0.12% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.01% optimize.opt_a.merge_send_recv : 0.000016s : 0.01% optimize.opt_a.auto_parallel : 0.000016s : 0.01% optimize.opt_a.parallel : 0.000028s : 0.02% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.01% optimize.opt_a.virtual_dataset : 0.000020s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000011s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000025s : 0.02% optimize.opt_a.a_after_grad : 0.000021s : 0.01% optimize.opt_a.renormalize : 0.000879s : 0.59% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.02% optimize.opt_a.cse : 0.000052s : 0.03% optimize.opt_a.a_3 : 0.000093s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000040s : 0.03% optimize.convert_after_rewriter : 0.000007s : 0.00% optimize.order_py_execute_after_rewriter : 0.000006s : 0.00% optimize.mutable_eliminate : 0.000772s : 0.51% optimize.opt_b.b_1 : 0.000130s : 0.09% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000034s : 0.02% optimize.loop_unroll : 0.000496s : 0.33% optimize.opt_after_cconv.c_1 : 0.000035s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000021s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.01% optimize.tuple_transform.d_1 : 0.000049s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000056s : 0.04% optimize.cse_after_recomputation.cse : 0.000011s : 0.01% optimize.environ_conv : 0.000006s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000022s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000539s : 0.36% validate : 0.000049s : 0.03% Time group info: ------[substitution.] 0.000250 34 15.92% : 0.000040s : 6: substitution.arithmetic_simplify 0.72% : 0.000002s : 2: substitution.elim_not_effective 0.87% : 0.000002s : 2: substitution.fold_const_symbol 2.71% : 0.000007s : 4: substitution.graph_param_transform 67.28% : 0.000168s : 4: substitution.inline 1.61% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.28% : 0.000006s : 4: substitution.remove_not_recompute_node 2.08% : 0.000005s : 4: substitution.replace_old_param 6.53% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.144204 2 99.48% : 0.143454s : 1: type_inference.infer 0.52% : 0.000751s : 1: type_inference.specialize ------[replace.] 0.000069 8 62.80% : 0.000043s : 4: replace.inline 37.20% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000180 8 92.03% : 0.000165s : 4: match.inline 7.97% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000225 1278 0.82% : 0.000002s : 13: predicate.accumulaten_eliminater 0.94% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 8: predicate.addn_check_dump 0.99% : 0.000002s : 13: predicate.addn_zero_filter 0.75% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.49% : 0.000006s : 21: predicate.arithmetic_simplify 0.99% : 0.000002s : 13: predicate.cast_eliminate 0.54% : 0.000001s : 8: predicate.check_bprop_eliminate 0.54% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.58% : 0.000001s : 8: predicate.depend_value_elim 0.86% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.90% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.15% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.32% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_depend_swap 1.75% : 0.000004s : 25: predicate.environ_get_eliminate 1.16% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.45% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.39% : 0.000005s : 21: predicate.float_depend_g_call 0.61% : 0.000001s : 8: predicate.float_environ_get_switch 0.87% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.71% : 0.000002s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 6.51% : 0.000015s : 58: predicate.inline 0.90% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.99% : 0.000002s : 8: predicate.less_batch_normalization 1.96% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.42% : 0.000005s : 38: predicate.load_eliminater 0.88% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.30% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.91% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 8: predicate.merge_addn 0.54% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.75% : 0.000002s : 13: predicate.minmaximum_grad 1.45% : 0.000003s : 4: predicate.mutable_eliminate 0.31% : 0.000001s : 4: predicate.opt_reshape 0.32% : 0.000001s : 4: predicate.parallel_virtual_node 1.98% : 0.000004s : 21: predicate.partial_defer_inline 1.49% : 0.000003s : 21: predicate.partial_eliminate 1.00% : 0.000002s : 13: predicate.print_const_string_wrapper 0.61% : 0.000001s : 8: predicate.reduce_all_const_elim 1.22% : 0.000003s : 13: predicate.reduce_eliminate 2.43% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 8: predicate.remove_not_recompute_node 1.29% : 0.000003s : 25: predicate.replace_applicator 0.58% : 0.000001s : 8: predicate.replace_old_param 0.36% : 0.000001s : 4: predicate.reset_defer_inline 0.87% : 0.000002s : 13: predicate.reshape_eliminate 0.66% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 4: predicate.row_tensor_eliminate 0.76% : 0.000002s : 8: predicate.same_eliminate 0.41% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 8: predicate.shard_identity_eliminate 0.69% : 0.000002s : 8: predicate.special_op_eliminate 0.78% : 0.000002s : 8: predicate.specialize_transform 1.20% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.43% : 0.000003s : 21: predicate.switch_defer_inline 1.85% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.94% : 0.000011s : 67: predicate.switch_simplify 0.84% : 0.000002s : 13: predicate.tile_eliminate 0.95% : 0.000002s : 13: predicate.transpose_eliminate 1.56% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.47% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.29% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.28% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.25% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.65% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.29% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.96% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.57% : 0.000001s : 4: predicate.value_based_eliminate 1.20% : 0.000003s : 8: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000666 11 56.46% : 0.000376s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.54% : 0.000290s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.166950 192 0.00% : 0.000004s : 1: ForceFp32Comm 2.15% : 0.003585s : 1: add_attr 2.14% : 0.003572s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.04% : 0.000059s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000068s : 1: auto_monad 0.01% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.32% : 0.000529s : 1: bootstrap 0.02% : 0.000037s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000016s : 1: control_data_broadcast_order 0.01% : 0.000010s : 1: convert_after_rewriter 0.01% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.02% : 0.000026s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.30% : 0.000504s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.47% : 0.000784s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000022s : 1: opt.transform.mutable_eliminate 0.82% : 0.001367s : 78: opt.transform.opt_a 0.02% : 0.000033s : 1: opt.transform.opt_after_cconv 0.02% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000104s : 28: opt.transform.opt_b 0.03% : 0.000054s : 2: opt.transform.opt_trans_graph 0.02% : 0.000037s : 4: opt.transform.symbol_engine_opt 2.00% : 0.003331s : 1: opt_a 0.09% : 0.000148s : 1: opt_after_cconv 0.33% : 0.000550s : 1: opt_after_jit_grad 0.16% : 0.000272s : 1: opt_b 3.51% : 0.005860s : 1: optimize 0.01% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.02% : 0.000026s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000045s : 1: pre_auto_parallel 0.02% : 0.000033s : 1: py_interpret_to_execute 0.01% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 0.28% : 0.000473s : 1: renormalize.infer 0.24% : 0.000397s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000044s : 1: rewriter_after_opt_a 0.06% : 0.000094s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000085s : 1: symbol_engine_optimizer 0.05% : 0.000087s : 1: tuple_transform 86.43% : 0.144292s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:33.483.209 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:33.483.516 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.197739, [21] [bootstrap]: 0.00046609 [type_inference]: 0.18573 [event_method]: 2.088e-05 [auto_monad]: 6.707e-05 [graph_reusing]: 6.39001e-06 [inline]: 2.41e-06 [add_attr]: 0.00377183, [1] [add_attr_with_inline]: 0.00375842, [1] [Cycle 1]: 9.423e-05, [2] [tag_attr]: 2.534e-05 [meta_addattr_fg_expand]: 5.92001e-06 [parallel-infer-symbol]: 4.05998e-06 [pre_auto_parallel]: 4.157e-05 [insert-virtual-dataset]: 2.68e-06 [parallel-infer-symbol-second]: 9.10019e-07 [dataset_repeat_opt]: 2.02999e-06 [pipeline_split]: 1.63002e-06 [optimize]: 0.00633055, [53] [py_interpret_to_execute]: 3.418e-05 [rewriter_before_opt_a]: 9.793e-05 [opt_a]: 0.00361513, [2] [Cycle 1]: 0.00263491, [45] [expand_dump_flag]: 3.06001e-06 [switch_simplify]: 4.601e-05 [loop_unroll]: 3.053e-05 [a_1]: 0.00067934 [with_stream_mark]: 2.138e-05 [recompute_prepare]: 1.077e-05 [updatestate_depend_eliminate]: 5.65001e-06 [updatestate_assign_eliminate]: 4.23001e-06 [updatestate_loads_eliminate]: 3.21001e-06 [parameter_eliminate]: 2.48e-06 [a_2]: 0.00012087 [accelerated_algorithm]: 8.24002e-06 [shard]: 2.36998e-06 [meta_shard_fg_expand]: 2.00002e-06 [shard_inline]: 6.79001e-06 [merge_send_recv]: 1.009e-05 [auto_parallel]: 8.55001e-06 [parallel]: 2.035e-05 [flash_sp]: 9.89001e-06 [merge_comm]: 4.45999e-06 [allreduce_fusion]: 4.17e-06 [matmul_add_comm_reduction]: 1.049e-05 [allreduce_slice_to_reducescatter]: 8.29983e-07 [virtual_shard_identity]: 9.29998e-06 [virtual_dataset]: 7.62998e-06 [get_grad_eliminate_]: 6.49001e-06 [virtual_output]: 7.38e-06 [merge_forward]: 4.82998e-06 [cell_reuse_recompute_pass]: 1.35999e-06 [offload_activation]: 1.074e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.591e-05 [merge_recompute_call_nodes]: 1.82999e-06 [before_grad]: 1.156e-05 [set_forward_comm_id_for_comm_node_pass]: 3.87998e-06 [meta_fg_expand]: 3.13e-06 [flash_sp_send_recv_attached]: 3.28998e-06 [receive_attached]: 3.11001e-06 [after_resolve]: 1.398e-05 [a_after_grad]: 1.075e-05 [renormalize]: 0.0009443 [add_forward_monad_depend]: 7.05e-06 [auto_monad_grad]: 3.23e-06 [auto_monad_eliminator]: 1.888e-05 [cse]: 2.993e-05 [a_3]: 6.659e-05 [Cycle 2]: 0.00096188, [45] [expand_dump_flag]: 2.43e-06 [switch_simplify]: 7.83001e-06 [loop_unroll]: 6.78e-06 [a_1]: 0.00015052 [with_stream_mark]: 1.638e-05 [recompute_prepare]: 7.1e-06 [updatestate_depend_eliminate]: 3.86999e-06 [updatestate_assign_eliminate]: 3.09001e-06 [updatestate_loads_eliminate]: 2.36e-06 [parameter_eliminate]: 1.25001e-06 [a_2]: 0.00014159 [accelerated_algorithm]: 7.8e-06 [shard]: 1.89999e-06 [meta_shard_fg_expand]: 2.03997e-06 [shard_inline]: 7.56001e-06 [merge_send_recv]: 6.94001e-06 [auto_parallel]: 8.48001e-06 [parallel]: 7.86001e-06 [flash_sp]: 4.13001e-06 [merge_comm]: 3.85e-06 [allreduce_fusion]: 4.22e-06 [matmul_add_comm_reduction]: 8.59e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 8.05e-06 [virtual_dataset]: 6.74999e-06 [get_grad_eliminate_]: 8.87e-06 [virtual_output]: 5.86e-06 [merge_forward]: 3.27002e-06 [cell_reuse_recompute_pass]: 2.51998e-06 [offload_activation]: 8.85999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.623e-05 [merge_recompute_call_nodes]: 1.08001e-06 [before_grad]: 1.051e-05 [set_forward_comm_id_for_comm_node_pass]: 4.11001e-06 [meta_fg_expand]: 2.32999e-06 [flash_sp_send_recv_attached]: 1.24e-06 [receive_attached]: 1.84e-06 [after_resolve]: 1.244e-05 [a_after_grad]: 1.037e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.69999e-06 [auto_monad_grad]: 1.67999e-06 [auto_monad_eliminator]: 1.149e-05 [cse]: 1.825e-05 [a_3]: 5.369e-05 [py_interpret_to_execute_after_opt_a]: 1.8e-05 [slice_cell_reuse_recomputed_activation]: 5.67999e-06 [rewriter_after_opt_a]: 4.726e-05 [convert_after_rewriter]: 1.077e-05 [order_py_execute_after_rewriter]: 8.79e-06 [mutable_eliminate]: 0.00074808 [opt_b]: 0.0002836, [1] [Cycle 1]: 0.00027274, [7] [b_1]: 0.00017137 [b_2]: 9.07999e-06 [updatestate_depend_eliminate]: 7.08e-06 [updatestate_assign_eliminate]: 2.97002e-06 [updatestate_loads_eliminate]: 2.48998e-06 [renormalize]: 7.7e-07 [cse]: 2.077e-05 [optimize_parallel_all_gather_comm]: 2.105e-05 [overlap_param_gather]: 5.22999e-06 [cconv]: 3.492e-05 [loop_unroll]: 0.00054736 [opt_after_cconv]: 0.00013534, [1] [Cycle 1]: 0.00012598, [7] [c_1]: 3.233e-05 [parameter_eliminate]: 5.99e-06 [updatestate_depend_eliminate]: 6.66999e-06 [updatestate_assign_eliminate]: 3.06001e-06 [updatestate_loads_eliminate]: 2.27999e-06 [cse]: 1.953e-05 [renormalize]: 4.39992e-07 [remove_dup_value]: 1.874e-05 [tuple_transform]: 9.704e-05, [1] [Cycle 1]: 9.026e-05, [4] [d_1]: 4.92e-05 [none_parameter_eliminate]: 1.67001e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 7.32997e-06 [partial_unused_args_eliminate]: 4.52e-06 [add_recomputation]: 5.758e-05 [cse_after_recomputation]: 2.729e-05, [1] [Cycle 1]: 2.014e-05, [1] [cse]: 1.084e-05 [environ_conv]: 8.97e-06 [swap_dp_allreduce_reducescatter]: 8.02e-06 [bias_add_comm_swap]: 5.37999e-06 [label_micro_interleaved_index]: 7.1e-06 [label_fine_grained_interleaved_index]: 5.44e-06 [merge_cast_opt]: 3.73999e-06 [slice_recompute_activation]: 4.48999e-06 [micro_interleaved_order_control]: 5.24e-06 [assign_add_opt]: 3.83999e-06 [ForceFp32Comm]: 3.21001e-06 [remove_cast_before_assign_add]: 3.38e-06 [full_micro_interleaved_order_control]: 4.88001e-06 [reorder_send_recv_between_fp_bp]: 5.59e-06 [comm_op_add_attrs]: 3.83001e-06 [add_comm_op_reuse_tag]: 3.52997e-06 [interleave_split_concat_branches]: 4.18001e-06 [interleave_parallel_branches]: 3.33e-06 [overlap_opt_shard_in_pipeline]: 3.46999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.05e-06 [control_data_broadcast_order]: 1.739e-05 [grouped_pairwise_exchange_alltoall]: 4.37e-06 [offloading_packed_experts]: 6.71e-06 [overlap_recompute_and_grad_model_parallel]: 8.07e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.81001e-06 [overlap_recompute_allgather_and_fa_grad]: 4.13999e-06 [overlap_recompute_comm]: 4.95001e-06 [overlap_grad_ring_attention]: 6.59001e-06 [overlap_grad_flash_sp]: 2.598e-05 [begin_end_overlap_inline]: 2.93998e-06 [split_matmul_comm_elemetwise]: 4.44002e-06 [split_layernorm_comm]: 4.72e-06 [handle_group_info]: 3.35003e-06 [symbol_engine_optimizer]: 0.00010385, [1] [Cycle 1]: 9.614e-05, [6] [build]: 4.12e-06 [elim_shapecalc]: 1.095e-05 [elim_not_effective]: 1.332e-05 [opt_reshape]: 7.25e-06 [fold_const_symbol]: 1.092e-05 [renormalize]: 1.99972e-07 [detach_backward]: 4.43999e-06 [pipeline_parallel_scheduler]: 2.04e-06 [auto_monad_reorder]: 2.036e-05 [get_jit_bprop_graph]: 2.59999e-06 [rewriter_after_jit_bprop_graph]: 5.51e-06 [opt_after_jit_grad]: 0.00055067 [validate]: 4.502e-05 Sums bootstrap : 0.000466s : 0.24% type_inference : 0.185730s : 96.71% event_method : 0.000021s : 0.01% auto_monad : 0.000067s : 0.03% graph_reusing : 0.000006s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000042s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000034s : 0.02% optimize.rewriter_before_opt_a : 0.000098s : 0.05% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000054s : 0.03% optimize.opt_a.loop_unroll : 0.000037s : 0.02% optimize.opt_a.a_1 : 0.000830s : 0.43% optimize.opt_a.with_stream_mark : 0.000038s : 0.02% optimize.opt_a.recompute_prepare : 0.000018s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000262s : 0.14% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.01% optimize.opt_a.merge_send_recv : 0.000017s : 0.01% optimize.opt_a.auto_parallel : 0.000017s : 0.01% optimize.opt_a.parallel : 0.000028s : 0.01% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.01% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000020s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000026s : 0.01% optimize.opt_a.a_after_grad : 0.000021s : 0.01% optimize.opt_a.renormalize : 0.000944s : 0.49% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.02% optimize.opt_a.cse : 0.000048s : 0.03% optimize.opt_a.a_3 : 0.000120s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.00% optimize.rewriter_after_opt_a : 0.000047s : 0.02% optimize.convert_after_rewriter : 0.000011s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.00% optimize.mutable_eliminate : 0.000748s : 0.39% optimize.opt_b.b_1 : 0.000171s : 0.09% optimize.opt_b.b_2 : 0.000009s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.01% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000035s : 0.02% optimize.loop_unroll : 0.000547s : 0.28% optimize.opt_after_cconv.c_1 : 0.000032s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.01% optimize.tuple_transform.d_1 : 0.000049s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000058s : 0.03% optimize.cse_after_recomputation.cse : 0.000011s : 0.01% optimize.environ_conv : 0.000009s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.00% optimize.bias_add_comm_swap : 0.000005s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000004s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000003s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000003s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000026s : 0.01% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.00% optimize.split_layernorm_comm : 0.000005s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.01% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000551s : 0.29% validate : 0.000045s : 0.02% Time group info: ------[substitution.] 0.000244 34 15.38% : 0.000038s : 6: substitution.arithmetic_simplify 0.75% : 0.000002s : 2: substitution.elim_not_effective 0.60% : 0.000001s : 2: substitution.fold_const_symbol 2.91% : 0.000007s : 4: substitution.graph_param_transform 67.42% : 0.000165s : 4: substitution.inline 1.88% : 0.000005s : 4: substitution.j_node_and_user_rematch 1.93% : 0.000005s : 4: substitution.remove_not_recompute_node 2.62% : 0.000006s : 4: substitution.replace_old_param 6.52% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.185667 2 99.56% : 0.184843s : 1: type_inference.infer 0.44% : 0.000824s : 1: type_inference.specialize ------[replace.] 0.000068 8 62.30% : 0.000042s : 4: replace.inline 37.70% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000176 8 92.01% : 0.000162s : 4: match.inline 7.99% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000247 1278 0.83% : 0.000002s : 13: predicate.accumulaten_eliminater 0.81% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 0.79% : 0.000002s : 13: predicate.addn_zero_filter 0.69% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.36% : 0.000006s : 21: predicate.arithmetic_simplify 0.85% : 0.000002s : 13: predicate.cast_eliminate 0.54% : 0.000001s : 8: predicate.check_bprop_eliminate 0.41% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 11.07% : 0.000027s : 8: predicate.depend_value_elim 0.77% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.94% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.71% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 4: predicate.elim_not_effective 0.41% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 0.98% : 0.000002s : 17: predicate.environ_add_const_eliminate 0.90% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.92% : 0.000002s : 17: predicate.environ_get_depend_swap 1.45% : 0.000004s : 25: predicate.environ_get_eliminate 0.92% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.26% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.28% : 0.000006s : 21: predicate.float_depend_g_call 0.44% : 0.000001s : 8: predicate.float_environ_get_switch 0.68% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.58% : 0.000001s : 8: predicate.get_grad_eliminate 0.22% : 0.000001s : 4: predicate.graph_param_transform 0.50% : 0.000001s : 8: predicate.incorporate_call 0.40% : 0.000001s : 8: predicate.incorporate_call_switch 5.58% : 0.000014s : 58: predicate.inline 0.73% : 0.000002s : 8: predicate.inline_without_move 0.26% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.01% : 0.000002s : 8: predicate.less_batch_normalization 1.67% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.09% : 0.000005s : 38: predicate.load_eliminater 0.92% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.03% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.44% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.44% : 0.000001s : 8: predicate.merge_addn 0.51% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.67% : 0.000002s : 13: predicate.minmaximum_grad 1.03% : 0.000003s : 4: predicate.mutable_eliminate 0.34% : 0.000001s : 4: predicate.opt_reshape 0.41% : 0.000001s : 4: predicate.parallel_virtual_node 1.55% : 0.000004s : 21: predicate.partial_defer_inline 1.52% : 0.000004s : 21: predicate.partial_eliminate 0.80% : 0.000002s : 13: predicate.print_const_string_wrapper 0.68% : 0.000002s : 8: predicate.reduce_all_const_elim 1.04% : 0.000003s : 13: predicate.reduce_eliminate 2.13% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 8: predicate.remove_not_recompute_node 1.17% : 0.000003s : 25: predicate.replace_applicator 0.58% : 0.000001s : 8: predicate.replace_old_param 0.26% : 0.000001s : 4: predicate.reset_defer_inline 0.77% : 0.000002s : 13: predicate.reshape_eliminate 0.71% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 4: predicate.row_tensor_eliminate 0.71% : 0.000002s : 8: predicate.same_eliminate 0.34% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 8: predicate.shard_identity_eliminate 0.66% : 0.000002s : 8: predicate.special_op_eliminate 0.68% : 0.000002s : 8: predicate.specialize_transform 0.92% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.88% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.26% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.26% : 0.000003s : 21: predicate.switch_defer_inline 1.86% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.61% : 0.000011s : 67: predicate.switch_simplify 0.94% : 0.000002s : 13: predicate.tile_eliminate 0.81% : 0.000002s : 13: predicate.transpose_eliminate 1.50% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.27% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.30% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.93% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.35% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 1.86% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.78% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 1.99% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.53% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 4: predicate.value_based_eliminate 0.75% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.62% : 0.000002s : 8: predicate.virtual_output_eliminate 0.21% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.167286 11 99.81% : 0.166970s : 5: func_graph_cloner_run.FuncGraphClonerGraph 0.19% : 0.000316s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.210251 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.80% : 0.003784s : 1: add_attr 1.79% : 0.003763s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.03% : 0.000061s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.04% : 0.000076s : 1: auto_monad 0.01% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.24% : 0.000514s : 1: bootstrap 0.02% : 0.000038s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000020s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.01% : 0.000030s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000022s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.02% : 0.000032s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.26% : 0.000554s : 1: loop_unroll 0.00% : 0.000006s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.36% : 0.000756s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000018s : 1: opt.transform.mutable_eliminate 0.63% : 0.001327s : 78: opt.transform.opt_a 0.01% : 0.000030s : 1: opt.transform.opt_after_cconv 0.01% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.05% : 0.000107s : 28: opt.transform.opt_b 0.03% : 0.000054s : 2: opt.transform.opt_trans_graph 0.02% : 0.000038s : 4: opt.transform.symbol_engine_opt 1.72% : 0.003619s : 1: opt_a 0.07% : 0.000139s : 1: opt_after_cconv 0.27% : 0.000561s : 1: opt_after_jit_grad 0.14% : 0.000287s : 1: opt_b 3.19% : 0.006713s : 1: optimize 0.01% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.01% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000007s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000049s : 1: pre_auto_parallel 0.02% : 0.000038s : 1: py_interpret_to_execute 0.01% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000022s : 1: remove_dup_value 0.26% : 0.000543s : 1: renormalize.infer 0.19% : 0.000389s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000051s : 1: rewriter_after_opt_a 0.05% : 0.000102s : 1: rewriter_before_opt_a 0.00% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000107s : 1: symbol_engine_optimizer 0.05% : 0.000100s : 1: tuple_transform 88.36% : 0.185784s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:35.824.534 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.164395, [21] [bootstrap]: 0.00045317 [type_inference]: 0.00639487 [event_method]: 2.006e-05 [auto_monad]: 6.493e-05 [graph_reusing]: 6.37001e-06 [inline]: 3.26001e-06 [add_attr]: 0.00364171, [1] [add_attr_with_inline]: 0.00362982, [1] [Cycle 1]: 7.307e-05, [2] [tag_attr]: 2.399e-05 [meta_addattr_fg_expand]: 6.39001e-06 [parallel-infer-symbol]: 3.33e-06 [pre_auto_parallel]: 4.206e-05 [insert-virtual-dataset]: 3.13998e-06 [parallel-infer-symbol-second]: 8.59989e-07 [dataset_repeat_opt]: 2.16998e-06 [pipeline_split]: 2.37001e-06 [optimize]: 0.152884, [53] [py_interpret_to_execute]: 3.078e-05 [rewriter_before_opt_a]: 9.52e-05 [opt_a]: 0.00339641, [2] [Cycle 1]: 0.00266028, [45] [expand_dump_flag]: 2.89999e-06 [switch_simplify]: 4.474e-05 [loop_unroll]: 3.261e-05 [a_1]: 0.00089697 [with_stream_mark]: 2.229e-05 [recompute_prepare]: 1.239e-05 [updatestate_depend_eliminate]: 4.16001e-06 [updatestate_assign_eliminate]: 3.32997e-06 [updatestate_loads_eliminate]: 3.23e-06 [parameter_eliminate]: 2.28998e-06 [a_2]: 8.829e-05 [accelerated_algorithm]: 7.13e-06 [shard]: 2.02001e-06 [meta_shard_fg_expand]: 2.19001e-06 [shard_inline]: 7.03e-06 [merge_send_recv]: 8.74e-06 [auto_parallel]: 8.71002e-06 [parallel]: 2.064e-05 [flash_sp]: 9.49e-06 [merge_comm]: 4.33001e-06 [allreduce_fusion]: 3.66999e-06 [matmul_add_comm_reduction]: 1.071e-05 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 8.95999e-06 [virtual_dataset]: 7.41001e-06 [get_grad_eliminate_]: 6.52001e-06 [virtual_output]: 7.18e-06 [merge_forward]: 4.53999e-06 [cell_reuse_recompute_pass]: 1.45999e-06 [offload_activation]: 1.154e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.397e-05 [merge_recompute_call_nodes]: 1.89e-06 [before_grad]: 1.141e-05 [set_forward_comm_id_for_comm_node_pass]: 4.05e-06 [meta_fg_expand]: 3.25998e-06 [flash_sp_send_recv_attached]: 2.88e-06 [receive_attached]: 2.02001e-06 [after_resolve]: 4.116e-05 [a_after_grad]: 1.183e-05 [renormalize]: 0.00089323 [add_forward_monad_depend]: 7.85998e-06 [auto_monad_grad]: 2.78e-06 [auto_monad_eliminator]: 1.872e-05 [cse]: 3.235e-05 [a_3]: 5.629e-05 [Cycle 2]: 0.00072398, [45] [expand_dump_flag]: 1.81e-06 [switch_simplify]: 8.69e-06 [loop_unroll]: 7e-06 [a_1]: 0.00015241 [with_stream_mark]: 1.656e-05 [recompute_prepare]: 6.75002e-06 [updatestate_depend_eliminate]: 3.14999e-06 [updatestate_assign_eliminate]: 3.06999e-06 [updatestate_loads_eliminate]: 3.35e-06 [parameter_eliminate]: 1.92999e-06 [a_2]: 7.846e-05 [accelerated_algorithm]: 6.43998e-06 [shard]: 1.72999e-06 [meta_shard_fg_expand]: 1.70001e-06 [shard_inline]: 6.11e-06 [merge_send_recv]: 6.33998e-06 [auto_parallel]: 6.88998e-06 [parallel]: 7.27002e-06 [flash_sp]: 4.25e-06 [merge_comm]: 3.55e-06 [allreduce_fusion]: 4.01001e-06 [matmul_add_comm_reduction]: 1.081e-05 [allreduce_slice_to_reducescatter]: 1.12e-06 [virtual_shard_identity]: 7.82e-06 [virtual_dataset]: 6.19999e-06 [get_grad_eliminate_]: 5.81e-06 [virtual_output]: 6.28e-06 [merge_forward]: 3.48e-06 [cell_reuse_recompute_pass]: 1.47001e-06 [offload_activation]: 8.22998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.221e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 9.34e-06 [set_forward_comm_id_for_comm_node_pass]: 3.51999e-06 [meta_fg_expand]: 2.25002e-06 [flash_sp_send_recv_attached]: 1.10001e-06 [receive_attached]: 1.73997e-06 [after_resolve]: 1.114e-05 [a_after_grad]: 9.34e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.79998e-06 [auto_monad_grad]: 1.64e-06 [auto_monad_eliminator]: 9.02e-06 [cse]: 1.649e-05 [a_3]: 3.852e-05 [py_interpret_to_execute_after_opt_a]: 1.27e-05 [slice_cell_reuse_recomputed_activation]: 2.54001e-06 [rewriter_after_opt_a]: 3.825e-05 [convert_after_rewriter]: 6.91001e-06 [order_py_execute_after_rewriter]: 5.19e-06 [mutable_eliminate]: 0.0128402 [opt_b]: 0.134725, [1] [Cycle 1]: 0.134712, [7] [b_1]: 0.00014624 [b_2]: 1.097e-05 [updatestate_depend_eliminate]: 1.38e-05 [updatestate_assign_eliminate]: 3.30003e-06 [updatestate_loads_eliminate]: 3.55e-06 [renormalize]: 2.21e-06 [cse]: 6.293e-05 [optimize_parallel_all_gather_comm]: 3.229e-05 [overlap_param_gather]: 2.00002e-06 [cconv]: 4.394e-05 [loop_unroll]: 0.00083786 [opt_after_cconv]: 0.00013727, [1] [Cycle 1]: 0.00012651, [7] [c_1]: 3.936e-05 [parameter_eliminate]: 6.97997e-06 [updatestate_depend_eliminate]: 8.94998e-06 [updatestate_assign_eliminate]: 3.5e-06 [updatestate_loads_eliminate]: 3.13e-06 [cse]: 2.651e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 1.494e-05 [tuple_transform]: 9.112e-05, [1] [Cycle 1]: 8.645e-05, [4] [d_1]: 5.705e-05 [none_parameter_eliminate]: 1.77001e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 8.2e-06 [partial_unused_args_eliminate]: 2.16e-06 [add_recomputation]: 6.346e-05 [cse_after_recomputation]: 2.395e-05, [1] [Cycle 1]: 1.878e-05, [1] [cse]: 1.313e-05 [environ_conv]: 7.38e-06 [swap_dp_allreduce_reducescatter]: 6.04001e-06 [bias_add_comm_swap]: 3.65e-06 [label_micro_interleaved_index]: 7.58001e-06 [label_fine_grained_interleaved_index]: 2.64001e-06 [merge_cast_opt]: 1.40999e-06 [slice_recompute_activation]: 2.19999e-06 [micro_interleaved_order_control]: 2.59001e-06 [assign_add_opt]: 1.39e-06 [ForceFp32Comm]: 9.00007e-07 [remove_cast_before_assign_add]: 1.20999e-06 [full_micro_interleaved_order_control]: 2.58e-06 [reorder_send_recv_between_fp_bp]: 2.68998e-06 [comm_op_add_attrs]: 1.07998e-06 [add_comm_op_reuse_tag]: 1.19e-06 [interleave_split_concat_branches]: 1.20999e-06 [interleave_parallel_branches]: 1.25001e-06 [overlap_opt_shard_in_pipeline]: 1.66e-06 [overlap_opt_shard_grad_in_pipeline]: 2.29001e-06 [control_data_broadcast_order]: 1.499e-05 [grouped_pairwise_exchange_alltoall]: 1.54e-06 [offloading_packed_experts]: 5.12e-06 [overlap_recompute_and_grad_model_parallel]: 4.82e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.23002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.45001e-06 [overlap_recompute_comm]: 2.59999e-06 [overlap_grad_ring_attention]: 4.79e-06 [overlap_grad_flash_sp]: 2.294e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.70997e-06 [split_layernorm_comm]: 1.67999e-06 [handle_group_info]: 1.35999e-06 [symbol_engine_optimizer]: 0.00010153, [1] [Cycle 1]: 9.631e-05, [6] [build]: 4.32e-06 [elim_shapecalc]: 1.365e-05 [elim_not_effective]: 1.595e-05 [opt_reshape]: 1.719e-05 [fold_const_symbol]: 1.134e-05 [renormalize]: 4.19997e-07 [detach_backward]: 2.33998e-06 [pipeline_parallel_scheduler]: 1.69e-06 [auto_monad_reorder]: 1.942e-05 [get_jit_bprop_graph]: 2.93998e-06 [rewriter_after_jit_bprop_graph]: 7.83001e-06 [opt_after_jit_grad]: 0.00061204 [validate]: 5.131e-05 Sums bootstrap : 0.000453s : 1.80% type_inference : 0.006395s : 25.38% event_method : 0.000020s : 0.08% auto_monad : 0.000065s : 0.26% graph_reusing : 0.000006s : 0.03% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.10% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000042s : 0.17% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.12% optimize.rewriter_before_opt_a : 0.000095s : 0.38% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000053s : 0.21% optimize.opt_a.loop_unroll : 0.000040s : 0.16% optimize.opt_a.a_1 : 0.001049s : 4.16% optimize.opt_a.with_stream_mark : 0.000039s : 0.15% optimize.opt_a.recompute_prepare : 0.000019s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000167s : 0.66% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.05% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000013s : 0.05% optimize.opt_a.merge_send_recv : 0.000015s : 0.06% optimize.opt_a.auto_parallel : 0.000016s : 0.06% optimize.opt_a.parallel : 0.000028s : 0.11% optimize.opt_a.flash_sp : 0.000014s : 0.05% optimize.opt_a.merge_comm : 0.000008s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.07% optimize.opt_a.virtual_dataset : 0.000014s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.05% optimize.opt_a.virtual_output : 0.000013s : 0.05% optimize.opt_a.merge_forward : 0.000008s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000020s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000021s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.03% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000052s : 0.21% optimize.opt_a.a_after_grad : 0.000021s : 0.08% optimize.opt_a.renormalize : 0.000893s : 3.55% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.11% optimize.opt_a.cse : 0.000049s : 0.19% optimize.opt_a.a_3 : 0.000095s : 0.38% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000038s : 0.15% optimize.convert_after_rewriter : 0.000007s : 0.03% optimize.order_py_execute_after_rewriter : 0.000005s : 0.02% optimize.mutable_eliminate : 0.012840s : 50.96% optimize.opt_b.b_1 : 0.000146s : 0.58% optimize.opt_b.b_2 : 0.000011s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000014s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000002s : 0.01% optimize.opt_b.cse : 0.000063s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000032s : 0.13% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000044s : 0.17% optimize.loop_unroll : 0.000838s : 3.33% optimize.opt_after_cconv.c_1 : 0.000039s : 0.16% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000027s : 0.11% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.06% optimize.tuple_transform.d_1 : 0.000057s : 0.23% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000063s : 0.25% optimize.cse_after_recomputation.cse : 0.000013s : 0.05% optimize.environ_conv : 0.000007s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000004s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000015s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000023s : 0.09% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000017s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.08% get_jit_bprop_graph : 0.000003s : 0.01% rewriter_after_jit_bprop_graph : 0.000008s : 0.03% opt_after_jit_grad : 0.000612s : 2.43% validate : 0.000051s : 0.20% Time group info: ------[substitution.] 0.000391 34 10.11% : 0.000039s : 6: substitution.arithmetic_simplify 0.53% : 0.000002s : 2: substitution.elim_not_effective 0.37% : 0.000001s : 2: substitution.fold_const_symbol 1.77% : 0.000007s : 4: substitution.graph_param_transform 79.37% : 0.000310s : 4: substitution.inline 1.18% : 0.000005s : 4: substitution.j_node_and_user_rematch 1.23% : 0.000005s : 4: substitution.remove_not_recompute_node 1.45% : 0.000006s : 4: substitution.replace_old_param 3.99% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006322 2 87.58% : 0.005537s : 1: type_inference.infer 12.42% : 0.000785s : 1: type_inference.specialize ------[replace.] 0.000091 8 73.33% : 0.000066s : 4: replace.inline 26.67% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000321 8 95.74% : 0.000307s : 4: match.inline 4.26% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000244 1278 0.81% : 0.000002s : 13: predicate.accumulaten_eliminater 0.69% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.42% : 0.000001s : 8: predicate.addn_check_dump 0.90% : 0.000002s : 13: predicate.addn_zero_filter 0.69% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.42% : 0.000006s : 21: predicate.arithmetic_simplify 0.89% : 0.000002s : 13: predicate.cast_eliminate 0.58% : 0.000001s : 8: predicate.check_bprop_eliminate 0.43% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.50% : 0.000001s : 8: predicate.depend_value_elim 0.81% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.84% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.75% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.06% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 4: predicate.elim_not_effective 0.55% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 17: predicate.environ_add_const_eliminate 0.91% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.93% : 0.000002s : 17: predicate.environ_get_depend_swap 1.56% : 0.000004s : 25: predicate.environ_get_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.27% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.47% : 0.000006s : 21: predicate.float_depend_g_call 0.42% : 0.000001s : 8: predicate.float_environ_get_switch 0.77% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.54% : 0.000001s : 8: predicate.get_grad_eliminate 0.17% : 0.000000s : 4: predicate.graph_param_transform 0.47% : 0.000001s : 8: predicate.incorporate_call 0.38% : 0.000001s : 8: predicate.incorporate_call_switch 6.46% : 0.000016s : 58: predicate.inline 0.81% : 0.000002s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.73% : 0.000002s : 8: predicate.less_batch_normalization 1.71% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.46% : 0.000006s : 38: predicate.load_eliminater 1.50% : 0.000004s : 4: predicate.loop_unroll_after_grad 2.21% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.45% : 0.000001s : 8: predicate.merge_addn 0.49% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.49% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.66% : 0.000002s : 13: predicate.minmaximum_grad 3.10% : 0.000008s : 4: predicate.mutable_eliminate 3.70% : 0.000009s : 4: predicate.opt_reshape 0.38% : 0.000001s : 4: predicate.parallel_virtual_node 1.62% : 0.000004s : 21: predicate.partial_defer_inline 1.38% : 0.000003s : 21: predicate.partial_eliminate 0.79% : 0.000002s : 13: predicate.print_const_string_wrapper 0.51% : 0.000001s : 8: predicate.reduce_all_const_elim 1.24% : 0.000003s : 13: predicate.reduce_eliminate 2.26% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 8: predicate.remove_not_recompute_node 1.25% : 0.000003s : 25: predicate.replace_applicator 0.36% : 0.000001s : 8: predicate.replace_old_param 0.50% : 0.000001s : 4: predicate.reset_defer_inline 0.86% : 0.000002s : 13: predicate.reshape_eliminate 0.75% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.55% : 0.000001s : 4: predicate.row_tensor_eliminate 0.73% : 0.000002s : 8: predicate.same_eliminate 0.38% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 8: predicate.shard_identity_eliminate 0.87% : 0.000002s : 8: predicate.special_op_eliminate 0.60% : 0.000001s : 8: predicate.specialize_transform 0.74% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.26% : 0.000003s : 21: predicate.switch_defer_inline 1.86% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.56% : 0.000011s : 67: predicate.switch_simplify 0.82% : 0.000002s : 13: predicate.tile_eliminate 0.77% : 0.000002s : 13: predicate.transpose_eliminate 1.86% : 0.000005s : 21: predicate.tuple_list_convert_item_index_to_positive 1.86% : 0.000005s : 21: predicate.tuple_list_get_item_const_eliminator 1.35% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.43% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.51% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.07% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.61% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.16% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.89% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.63% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.57% : 0.000001s : 8: predicate.virtual_output_eliminate 0.21% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000639 11 54.32% : 0.000347s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.68% : 0.000292s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.323511 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.13% : 0.003648s : 1: add_attr 1.12% : 0.003634s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.02% : 0.000068s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.02% : 0.000071s : 1: auto_monad 0.01% : 0.000024s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.15% : 0.000488s : 1: bootstrap 0.01% : 0.000048s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000019s : 1: control_data_broadcast_order 0.00% : 0.000010s : 1: convert_after_rewriter 0.01% : 0.000027s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000011s : 1: environ_conv 0.01% : 0.000027s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.26% : 0.000849s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 3.97% : 0.012859s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000026s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000038s : 1: opt.transform.mutable_eliminate 0.46% : 0.001500s : 78: opt.transform.opt_a 0.01% : 0.000037s : 1: opt.transform.opt_after_cconv 0.01% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000117s : 28: opt.transform.opt_b 0.02% : 0.000062s : 2: opt.transform.opt_trans_graph 0.02% : 0.000053s : 4: opt.transform.symbol_engine_opt 1.05% : 0.003400s : 1: opt_a 0.04% : 0.000142s : 1: opt_after_cconv 0.19% : 0.000624s : 1: opt_after_jit_grad 41.65% : 0.134735s : 1: opt_b 47.26% : 0.152891s : 1: optimize 0.01% : 0.000037s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.01% : 0.000027s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000047s : 1: pre_auto_parallel 0.01% : 0.000035s : 1: py_interpret_to_execute 0.00% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 0.15% : 0.000488s : 1: renormalize.infer 0.12% : 0.000394s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000043s : 1: rewriter_after_opt_a 0.03% : 0.000099s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000007s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000104s : 1: symbol_engine_optimizer 0.03% : 0.000094s : 1: tuple_transform 1.98% : 0.006416s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:37.883.536 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:37.883.823 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.167646, [21] [bootstrap]: 0.0004446 [type_inference]: 0.00708455 [event_method]: 2.145e-05 [auto_monad]: 7.042e-05 [graph_reusing]: 6.71e-06 [inline]: 3.09999e-06 [add_attr]: 0.00417485, [1] [add_attr_with_inline]: 0.00416132, [1] [Cycle 1]: 0.00012999, [2] [tag_attr]: 5.535e-05 [meta_addattr_fg_expand]: 6.00002e-06 [parallel-infer-symbol]: 3.73001e-06 [pre_auto_parallel]: 4.225e-05 [insert-virtual-dataset]: 2.40002e-06 [parallel-infer-symbol-second]: 8.79983e-07 [dataset_repeat_opt]: 1.87001e-06 [pipeline_split]: 1.79e-06 [optimize]: 0.0184477, [53] [py_interpret_to_execute]: 3.667e-05 [rewriter_before_opt_a]: 0.00010057 [opt_a]: 0.015273, [2] [Cycle 1]: 0.0139732, [45] [expand_dump_flag]: 3.41001e-06 [switch_simplify]: 4.261e-05 [loop_unroll]: 3.06e-05 [a_1]: 0.00075547 [with_stream_mark]: 2.362e-05 [recompute_prepare]: 1.147e-05 [updatestate_depend_eliminate]: 4.03999e-06 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 2.93998e-06 [parameter_eliminate]: 1.95001e-06 [a_2]: 0.00011971 [accelerated_algorithm]: 8.30999e-06 [shard]: 2.70002e-06 [meta_shard_fg_expand]: 2.21e-06 [shard_inline]: 7e-06 [merge_send_recv]: 9.07999e-06 [auto_parallel]: 8.33001e-06 [parallel]: 2.127e-05 [flash_sp]: 1.08e-05 [merge_comm]: 4.55001e-06 [allreduce_fusion]: 3.51999e-06 [matmul_add_comm_reduction]: 1.095e-05 [allreduce_slice_to_reducescatter]: 1.06002e-06 [virtual_shard_identity]: 8.32e-06 [virtual_dataset]: 6.84001e-06 [get_grad_eliminate_]: 6.89001e-06 [virtual_output]: 7.14001e-06 [merge_forward]: 4.15999e-06 [cell_reuse_recompute_pass]: 1.79e-06 [offload_activation]: 9.99001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.562e-05 [merge_recompute_call_nodes]: 1.92999e-06 [before_grad]: 1.232e-05 [set_forward_comm_id_for_comm_node_pass]: 4.21001e-06 [meta_fg_expand]: 3.21999e-06 [flash_sp_send_recv_attached]: 2.91999e-06 [receive_attached]: 2.32999e-06 [after_resolve]: 1.273e-05 [a_after_grad]: 9.96e-06 [renormalize]: 0.0121414 [add_forward_monad_depend]: 1.101e-05 [auto_monad_grad]: 2.79001e-06 [auto_monad_eliminator]: 5.151e-05 [cse]: 3.503e-05 [a_3]: 8.231e-05 [Cycle 2]: 0.00127888, [45] [expand_dump_flag]: 2.66e-06 [switch_simplify]: 1.089e-05 [loop_unroll]: 7.75e-06 [a_1]: 0.00016933 [with_stream_mark]: 2.245e-05 [recompute_prepare]: 8.76002e-06 [updatestate_depend_eliminate]: 4.47e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 2.79001e-06 [parameter_eliminate]: 2.11998e-06 [a_2]: 0.00028951 [accelerated_algorithm]: 1.019e-05 [shard]: 3.59002e-06 [meta_shard_fg_expand]: 2.50002e-06 [shard_inline]: 8.40999e-06 [merge_send_recv]: 1.305e-05 [auto_parallel]: 1.138e-05 [parallel]: 1.013e-05 [flash_sp]: 5.01002e-06 [merge_comm]: 4.2e-06 [allreduce_fusion]: 4.13001e-06 [matmul_add_comm_reduction]: 1.157e-05 [allreduce_slice_to_reducescatter]: 1.79e-06 [virtual_shard_identity]: 1.501e-05 [virtual_dataset]: 6.62002e-06 [get_grad_eliminate_]: 6.31e-06 [virtual_output]: 9.71e-06 [merge_forward]: 6.86001e-06 [cell_reuse_recompute_pass]: 3.8e-06 [offload_activation]: 1.291e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.878e-05 [merge_recompute_call_nodes]: 1.64998e-06 [before_grad]: 1.173e-05 [set_forward_comm_id_for_comm_node_pass]: 5.96998e-06 [meta_fg_expand]: 3.19001e-06 [flash_sp_send_recv_attached]: 2.25002e-06 [receive_attached]: 2.59001e-06 [after_resolve]: 1.564e-05 [a_after_grad]: 1.089e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 3.90998e-06 [auto_monad_grad]: 2.53e-06 [auto_monad_eliminator]: 1.466e-05 [cse]: 2.719e-05 [a_3]: 6.023e-05 [py_interpret_to_execute_after_opt_a]: 2.361e-05 [slice_cell_reuse_recomputed_activation]: 5.34e-06 [rewriter_after_opt_a]: 4.884e-05 [convert_after_rewriter]: 1.05e-05 [order_py_execute_after_rewriter]: 3.134e-05 [mutable_eliminate]: 0.0008438 [opt_b]: 0.00031516, [1] [Cycle 1]: 0.00030262, [7] [b_1]: 0.00017983 [b_2]: 9.28002e-06 [updatestate_depend_eliminate]: 1.078e-05 [updatestate_assign_eliminate]: 3.39001e-06 [updatestate_loads_eliminate]: 3.51001e-06 [renormalize]: 6.59988e-07 [cse]: 2.751e-05 [optimize_parallel_all_gather_comm]: 2.504e-05 [overlap_param_gather]: 6.58e-06 [cconv]: 4.246e-05 [loop_unroll]: 0.00063749 [opt_after_cconv]: 0.00014713, [1] [Cycle 1]: 0.00013583, [7] [c_1]: 3.306e-05 [parameter_eliminate]: 6.16e-06 [updatestate_depend_eliminate]: 7.97998e-06 [updatestate_assign_eliminate]: 2.93e-06 [updatestate_loads_eliminate]: 2.94001e-06 [cse]: 2.502e-05 [renormalize]: 6.50005e-07 [remove_dup_value]: 1.857e-05 [tuple_transform]: 9.895e-05, [1] [Cycle 1]: 9.128e-05, [4] [d_1]: 5.043e-05 [none_parameter_eliminate]: 1.70001e-06 [renormalize]: 4.19997e-07 [switch_simplify]: 7.61999e-06 [partial_unused_args_eliminate]: 4.70999e-06 [add_recomputation]: 5.76e-05 [cse_after_recomputation]: 2.755e-05, [1] [Cycle 1]: 2.07e-05, [1] [cse]: 1.163e-05 [environ_conv]: 9.35001e-06 [swap_dp_allreduce_reducescatter]: 7.94997e-06 [bias_add_comm_swap]: 5.50001e-06 [label_micro_interleaved_index]: 8.1e-06 [label_fine_grained_interleaved_index]: 5.87999e-06 [merge_cast_opt]: 4.08999e-06 [slice_recompute_activation]: 4.62998e-06 [micro_interleaved_order_control]: 5.17999e-06 [assign_add_opt]: 3.74002e-06 [ForceFp32Comm]: 3.31001e-06 [remove_cast_before_assign_add]: 3.73001e-06 [full_micro_interleaved_order_control]: 4.72998e-06 [reorder_send_recv_between_fp_bp]: 6.09001e-06 [comm_op_add_attrs]: 3.64002e-06 [add_comm_op_reuse_tag]: 3.20998e-06 [interleave_split_concat_branches]: 3.86999e-06 [interleave_parallel_branches]: 3.62002e-06 [overlap_opt_shard_in_pipeline]: 3.98001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.37998e-06 [control_data_broadcast_order]: 1.827e-05 [grouped_pairwise_exchange_alltoall]: 4.28001e-06 [offloading_packed_experts]: 7.25e-06 [overlap_recompute_and_grad_model_parallel]: 7.36001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.58e-06 [overlap_recompute_allgather_and_fa_grad]: 4.03999e-06 [overlap_recompute_comm]: 5.67001e-06 [overlap_grad_ring_attention]: 7.63999e-06 [overlap_grad_flash_sp]: 2.693e-05 [begin_end_overlap_inline]: 3.41999e-06 [split_matmul_comm_elemetwise]: 5.11002e-06 [split_layernorm_comm]: 4.48001e-06 [handle_group_info]: 3.63999e-06 [symbol_engine_optimizer]: 0.00025547, [1] [Cycle 1]: 0.00024668, [6] [build]: 4.77e-06 [elim_shapecalc]: 1.322e-05 [elim_not_effective]: 1.466e-05 [opt_reshape]: 9.03002e-06 [fold_const_symbol]: 1.139e-05 [renormalize]: 2.40019e-07 [detach_backward]: 4.95999e-06 [pipeline_parallel_scheduler]: 1.88002e-06 [auto_monad_reorder]: 2.396e-05 [get_jit_bprop_graph]: 2.14999e-06 [rewriter_after_jit_bprop_graph]: 6.06998e-06 [opt_after_jit_grad]: 0.136599 [validate]: 5.244e-05 Sums bootstrap : 0.000445s : 0.28% type_inference : 0.007085s : 4.39% event_method : 0.000021s : 0.01% auto_monad : 0.000070s : 0.04% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000055s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000042s : 0.03% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000037s : 0.02% optimize.rewriter_before_opt_a : 0.000101s : 0.06% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000053s : 0.03% optimize.opt_a.loop_unroll : 0.000038s : 0.02% optimize.opt_a.a_1 : 0.000925s : 0.57% optimize.opt_a.with_stream_mark : 0.000046s : 0.03% optimize.opt_a.recompute_prepare : 0.000020s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000409s : 0.25% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.01% optimize.opt_a.shard : 0.000006s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000015s : 0.01% optimize.opt_a.merge_send_recv : 0.000022s : 0.01% optimize.opt_a.auto_parallel : 0.000020s : 0.01% optimize.opt_a.parallel : 0.000031s : 0.02% optimize.opt_a.flash_sp : 0.000016s : 0.01% optimize.opt_a.merge_comm : 0.000009s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000017s : 0.01% optimize.opt_a.merge_forward : 0.000011s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000024s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000028s : 0.02% optimize.opt_a.a_after_grad : 0.000021s : 0.01% optimize.opt_a.renormalize : 0.012142s : 7.53% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000066s : 0.04% optimize.opt_a.cse : 0.000062s : 0.04% optimize.opt_a.a_3 : 0.000143s : 0.09% optimize.py_interpret_to_execute_after_opt_a : 0.000024s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000049s : 0.03% optimize.convert_after_rewriter : 0.000010s : 0.01% optimize.order_py_execute_after_rewriter : 0.000031s : 0.02% optimize.mutable_eliminate : 0.000844s : 0.52% optimize.opt_b.b_1 : 0.000180s : 0.11% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000028s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.02% optimize.overlap_param_gather : 0.000007s : 0.00% optimize.cconv : 0.000042s : 0.03% optimize.loop_unroll : 0.000637s : 0.40% optimize.opt_after_cconv.c_1 : 0.000033s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000025s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.01% optimize.tuple_transform.d_1 : 0.000050s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000058s : 0.04% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.00% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000006s : 0.00% optimize.overlap_grad_ring_attention : 0.000008s : 0.00% optimize.overlap_grad_flash_sp : 0.000027s : 0.02% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000024s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.136599s : 84.67% validate : 0.000052s : 0.03% Time group info: ------[substitution.] 0.000272 34 17.93% : 0.000049s : 6: substitution.arithmetic_simplify 0.74% : 0.000002s : 2: substitution.elim_not_effective 0.81% : 0.000002s : 2: substitution.fold_const_symbol 2.59% : 0.000007s : 4: substitution.graph_param_transform 64.79% : 0.000176s : 4: substitution.inline 1.89% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.19% : 0.000006s : 4: substitution.remove_not_recompute_node 2.71% : 0.000007s : 4: substitution.replace_old_param 6.35% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006988 2 88.07% : 0.006154s : 1: type_inference.infer 11.93% : 0.000833s : 1: type_inference.specialize ------[replace.] 0.000069 8 58.84% : 0.000041s : 4: replace.inline 41.16% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000188 8 91.80% : 0.000173s : 4: match.inline 8.20% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000248 1278 1.00% : 0.000002s : 13: predicate.accumulaten_eliminater 1.30% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 0.86% : 0.000002s : 13: predicate.addn_zero_filter 0.74% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.74% : 0.000007s : 21: predicate.arithmetic_simplify 0.91% : 0.000002s : 13: predicate.cast_eliminate 0.66% : 0.000002s : 8: predicate.check_bprop_eliminate 0.55% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.75% : 0.000002s : 8: predicate.depend_value_elim 0.91% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.01% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.79% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.57% : 0.000004s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 4: predicate.elim_not_effective 0.53% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.05% : 0.000003s : 17: predicate.environ_get_add_eliminate 0.95% : 0.000002s : 17: predicate.environ_get_depend_swap 1.56% : 0.000004s : 25: predicate.environ_get_eliminate 1.15% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.30% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.22% : 0.000006s : 21: predicate.float_depend_g_call 0.98% : 0.000002s : 8: predicate.float_environ_get_switch 0.80% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.78% : 0.000002s : 8: predicate.get_grad_eliminate 0.26% : 0.000001s : 4: predicate.graph_param_transform 0.59% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.64% : 0.000016s : 58: predicate.inline 0.66% : 0.000002s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.10% : 0.000003s : 8: predicate.less_batch_normalization 1.80% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.31% : 0.000006s : 38: predicate.load_eliminater 1.21% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.17% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.68% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.62% : 0.000002s : 8: predicate.merge_addn 0.60% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 13: predicate.minmaximum_grad 1.67% : 0.000004s : 4: predicate.mutable_eliminate 0.29% : 0.000001s : 4: predicate.opt_reshape 0.49% : 0.000001s : 4: predicate.parallel_virtual_node 1.58% : 0.000004s : 21: predicate.partial_defer_inline 1.40% : 0.000003s : 21: predicate.partial_eliminate 0.77% : 0.000002s : 13: predicate.print_const_string_wrapper 0.55% : 0.000001s : 8: predicate.reduce_all_const_elim 1.37% : 0.000003s : 13: predicate.reduce_eliminate 2.58% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 8: predicate.remove_not_recompute_node 1.43% : 0.000004s : 25: predicate.replace_applicator 0.43% : 0.000001s : 8: predicate.replace_old_param 0.32% : 0.000001s : 4: predicate.reset_defer_inline 0.87% : 0.000002s : 13: predicate.reshape_eliminate 0.57% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 4: predicate.row_tensor_eliminate 1.01% : 0.000003s : 8: predicate.same_eliminate 0.40% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.10% : 0.000003s : 8: predicate.shard_identity_eliminate 0.72% : 0.000002s : 8: predicate.special_op_eliminate 0.57% : 0.000001s : 8: predicate.specialize_transform 1.49% : 0.000004s : 8: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.29% : 0.000003s : 21: predicate.switch_defer_inline 1.79% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.49% : 0.000011s : 67: predicate.switch_simplify 0.79% : 0.000002s : 13: predicate.tile_eliminate 0.92% : 0.000002s : 13: predicate.transpose_eliminate 1.48% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.48% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.19% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.26% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.07% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.85% : 0.000005s : 25: predicate.tuple_to_list_eliminator_ 2.09% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.82% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.30% : 0.000001s : 4: predicate.value_based_eliminate 0.64% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.79% : 0.000002s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.48% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000726 11 51.00% : 0.000370s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.00% : 0.000356s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.204003 192 0.00% : 0.000006s : 1: ForceFp32Comm 2.05% : 0.004187s : 1: add_attr 2.04% : 0.004166s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.03% : 0.000062s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.04% : 0.000080s : 1: auto_monad 0.02% : 0.000032s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.24% : 0.000492s : 1: bootstrap 0.02% : 0.000047s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000021s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.02% : 0.000031s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000023s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.02% : 0.000033s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 0.32% : 0.000645s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.42% : 0.000854s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000023s : 1: opt.transform.mutable_eliminate 0.70% : 0.001436s : 78: opt.transform.opt_a 0.02% : 0.000031s : 1: opt.transform.opt_after_cconv 0.02% : 0.000045s : 1: opt.transform.opt_after_jit_grad 0.05% : 0.000112s : 28: opt.transform.opt_b 0.03% : 0.000055s : 2: opt.transform.opt_trans_graph 0.02% : 0.000044s : 4: opt.transform.symbol_engine_opt 7.49% : 0.015277s : 1: opt_a 0.07% : 0.000151s : 1: opt_after_cconv 66.97% : 0.136621s : 1: opt_after_jit_grad 0.16% : 0.000319s : 1: opt_b 9.22% : 0.018804s : 1: optimize 0.01% : 0.000030s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000037s : 1: order_py_execute_after_rewriter 0.01% : 0.000031s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000010s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000009s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000051s : 1: pre_auto_parallel 0.02% : 0.000040s : 1: py_interpret_to_execute 0.01% : 0.000029s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.01% : 0.000022s : 1: remove_dup_value 5.70% : 0.011627s : 1: renormalize.infer 0.24% : 0.000497s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000053s : 1: rewriter_after_opt_a 0.05% : 0.000105s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.000259s : 1: symbol_engine_optimizer 0.05% : 0.000102s : 1: tuple_transform 3.50% : 0.007138s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:40.215.114 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.161238, [21] [bootstrap]: 0.00046688 [type_inference]: 0.00608262 [event_method]: 1.933e-05 [auto_monad]: 6.581e-05 [graph_reusing]: 5.69e-06 [inline]: 2.39999e-06 [add_attr]: 0.148357, [1] [add_attr_with_inline]: 0.148345, [1] [Cycle 1]: 7.792e-05, [2] [tag_attr]: 2.617e-05 [meta_addattr_fg_expand]: 5.69e-06 [parallel-infer-symbol]: 3.7e-06 [pre_auto_parallel]: 4.438e-05 [insert-virtual-dataset]: 2.55002e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 1.96e-06 [pipeline_split]: 1.97999e-06 [optimize]: 0.00543533, [53] [py_interpret_to_execute]: 3.541e-05 [rewriter_before_opt_a]: 0.00010006 [opt_a]: 0.0031108, [2] [Cycle 1]: 0.00231775, [45] [expand_dump_flag]: 3.32997e-06 [switch_simplify]: 4.541e-05 [loop_unroll]: 3.075e-05 [a_1]: 0.00070011 [with_stream_mark]: 2.143e-05 [recompute_prepare]: 1.038e-05 [updatestate_depend_eliminate]: 4.13001e-06 [updatestate_assign_eliminate]: 3.65998e-06 [updatestate_loads_eliminate]: 3.13e-06 [parameter_eliminate]: 1.94e-06 [a_2]: 9.063e-05 [accelerated_algorithm]: 7.50998e-06 [shard]: 1.72001e-06 [meta_shard_fg_expand]: 2.04e-06 [shard_inline]: 6.70002e-06 [merge_send_recv]: 9.07001e-06 [auto_parallel]: 8.55999e-06 [parallel]: 2.371e-05 [flash_sp]: 9.81998e-06 [merge_comm]: 4.15999e-06 [allreduce_fusion]: 3.66001e-06 [matmul_add_comm_reduction]: 1.104e-05 [allreduce_slice_to_reducescatter]: 8.39995e-07 [virtual_shard_identity]: 8.29998e-06 [virtual_dataset]: 7.40998e-06 [get_grad_eliminate_]: 6.34001e-06 [virtual_output]: 6.45997e-06 [merge_forward]: 3.89997e-06 [cell_reuse_recompute_pass]: 1.60999e-06 [offload_activation]: 1.148e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.263e-05 [merge_recompute_call_nodes]: 1.69e-06 [before_grad]: 1.07e-05 [set_forward_comm_id_for_comm_node_pass]: 3.52002e-06 [meta_fg_expand]: 3.26001e-06 [flash_sp_send_recv_attached]: 2.72001e-06 [receive_attached]: 2.37999e-06 [after_resolve]: 1.29e-05 [a_after_grad]: 9.97001e-06 [renormalize]: 0.00084295 [add_forward_monad_depend]: 6.63e-06 [auto_monad_grad]: 2.72001e-06 [auto_monad_eliminator]: 1.717e-05 [cse]: 3.147e-05 [a_3]: 4.937e-05 [Cycle 2]: 0.00078153, [45] [expand_dump_flag]: 1.52001e-06 [switch_simplify]: 8.18001e-06 [loop_unroll]: 6.19999e-06 [a_1]: 0.0001463 [with_stream_mark]: 1.524e-05 [recompute_prepare]: 7.92e-06 [updatestate_depend_eliminate]: 3.57997e-06 [updatestate_assign_eliminate]: 2.31e-06 [updatestate_loads_eliminate]: 2.51e-06 [parameter_eliminate]: 1.25999e-06 [a_2]: 0.00012522 [accelerated_algorithm]: 7.67998e-06 [shard]: 1.52001e-06 [meta_shard_fg_expand]: 1.91998e-06 [shard_inline]: 6.81999e-06 [merge_send_recv]: 7.06001e-06 [auto_parallel]: 6.56e-06 [parallel]: 5.86998e-06 [flash_sp]: 3.75e-06 [merge_comm]: 3.68999e-06 [allreduce_fusion]: 3.73001e-06 [matmul_add_comm_reduction]: 6.93e-06 [allreduce_slice_to_reducescatter]: 5.39992e-07 [virtual_shard_identity]: 7.71999e-06 [virtual_dataset]: 6.39001e-06 [get_grad_eliminate_]: 5.74999e-06 [virtual_output]: 6.08998e-06 [merge_forward]: 3.03e-06 [cell_reuse_recompute_pass]: 2.06e-06 [offload_activation]: 7.31999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.198e-05 [merge_recompute_call_nodes]: 9.89996e-07 [before_grad]: 1.019e-05 [set_forward_comm_id_for_comm_node_pass]: 3.51999e-06 [meta_fg_expand]: 2.58e-06 [flash_sp_send_recv_attached]: 1.69e-06 [receive_attached]: 1.99999e-06 [after_resolve]: 1.057e-05 [a_after_grad]: 9.05999e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.72001e-06 [auto_monad_grad]: 1.97999e-06 [auto_monad_eliminator]: 8.69e-06 [cse]: 1.81e-05 [a_3]: 3.965e-05 [py_interpret_to_execute_after_opt_a]: 1.22e-05 [slice_cell_reuse_recomputed_activation]: 2.24001e-06 [rewriter_after_opt_a]: 3.72e-05 [convert_after_rewriter]: 6.88e-06 [order_py_execute_after_rewriter]: 5.42001e-06 [mutable_eliminate]: 0.00069281 [opt_b]: 0.00020836, [1] [Cycle 1]: 0.0002012, [7] [b_1]: 0.00012674 [b_2]: 8.13999e-06 [updatestate_depend_eliminate]: 6.81999e-06 [updatestate_assign_eliminate]: 2.34999e-06 [updatestate_loads_eliminate]: 2.19999e-06 [renormalize]: 5.59987e-07 [cse]: 1.843e-05 [optimize_parallel_all_gather_comm]: 1.719e-05 [overlap_param_gather]: 2.22001e-06 [cconv]: 3.057e-05 [loop_unroll]: 0.00045895 [opt_after_cconv]: 0.00010787, [1] [Cycle 1]: 0.00010128, [7] [c_1]: 3.235e-05 [parameter_eliminate]: 3.93999e-06 [updatestate_depend_eliminate]: 5.94999e-06 [updatestate_assign_eliminate]: 2.51998e-06 [updatestate_loads_eliminate]: 2.76e-06 [cse]: 1.827e-05 [renormalize]: 3.39991e-07 [remove_dup_value]: 1.337e-05 [tuple_transform]: 0.00010874, [1] [Cycle 1]: 0.00010408, [4] [d_1]: 7.134e-05 [none_parameter_eliminate]: 2.03002e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 7.95e-06 [partial_unused_args_eliminate]: 2.53e-06 [add_recomputation]: 5.524e-05 [cse_after_recomputation]: 2.293e-05, [1] [Cycle 1]: 1.811e-05, [1] [cse]: 1.236e-05 [environ_conv]: 5.11002e-06 [swap_dp_allreduce_reducescatter]: 5.00001e-06 [bias_add_comm_swap]: 2.79999e-06 [label_micro_interleaved_index]: 4.80999e-06 [label_fine_grained_interleaved_index]: 3.13e-06 [merge_cast_opt]: 1.37e-06 [slice_recompute_activation]: 2.09e-06 [micro_interleaved_order_control]: 2.50002e-06 [assign_add_opt]: 1.29e-06 [ForceFp32Comm]: 8.00006e-07 [remove_cast_before_assign_add]: 1.05001e-06 [full_micro_interleaved_order_control]: 1.95001e-06 [reorder_send_recv_between_fp_bp]: 2.91999e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 9.60019e-07 [interleave_split_concat_branches]: 1.27e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.37e-06 [overlap_opt_shard_grad_in_pipeline]: 1.81e-06 [control_data_broadcast_order]: 1.288e-05 [grouped_pairwise_exchange_alltoall]: 1.90001e-06 [offloading_packed_experts]: 4.1e-06 [overlap_recompute_and_grad_model_parallel]: 4.73001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40001e-06 [overlap_recompute_comm]: 2.29001e-06 [overlap_grad_ring_attention]: 4.47e-06 [overlap_grad_flash_sp]: 2.149e-05 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 2.02999e-06 [split_layernorm_comm]: 1.61002e-06 [handle_group_info]: 1.29e-06 [symbol_engine_optimizer]: 7.81e-05, [1] [Cycle 1]: 7.317e-05, [6] [build]: 3.43999e-06 [elim_shapecalc]: 1.046e-05 [elim_not_effective]: 1.263e-05 [opt_reshape]: 6.96001e-06 [fold_const_symbol]: 1.068e-05 [renormalize]: 2.10013e-07 [detach_backward]: 2.29001e-06 [pipeline_parallel_scheduler]: 1.55999e-06 [auto_monad_reorder]: 1.699e-05 [get_jit_bprop_graph]: 2.02999e-06 [rewriter_after_jit_bprop_graph]: 5.22999e-06 [opt_after_jit_grad]: 0.00048969 [validate]: 6.62e-05 Sums bootstrap : 0.000467s : 3.94% type_inference : 0.006083s : 51.31% event_method : 0.000019s : 0.16% auto_monad : 0.000066s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.22% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000044s : 0.37% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000035s : 0.30% optimize.rewriter_before_opt_a : 0.000100s : 0.84% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000054s : 0.45% optimize.opt_a.loop_unroll : 0.000037s : 0.31% optimize.opt_a.a_1 : 0.000846s : 7.14% optimize.opt_a.with_stream_mark : 0.000037s : 0.31% optimize.opt_a.recompute_prepare : 0.000018s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000216s : 1.82% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.11% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000030s : 0.25% optimize.opt_a.flash_sp : 0.000014s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.14% optimize.opt_a.virtual_dataset : 0.000014s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.10% optimize.opt_a.virtual_output : 0.000013s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.21% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000023s : 0.20% optimize.opt_a.a_after_grad : 0.000019s : 0.16% optimize.opt_a.renormalize : 0.000843s : 7.11% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.22% optimize.opt_a.cse : 0.000050s : 0.42% optimize.opt_a.a_3 : 0.000089s : 0.75% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000037s : 0.31% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000693s : 5.84% optimize.opt_b.b_1 : 0.000127s : 1.07% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000018s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000031s : 0.26% optimize.loop_unroll : 0.000459s : 3.87% optimize.opt_after_cconv.c_1 : 0.000032s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.15% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.11% optimize.tuple_transform.d_1 : 0.000071s : 0.60% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000003s : 0.02% optimize.add_recomputation : 0.000055s : 0.47% optimize.cse_after_recomputation.cse : 0.000012s : 0.10% optimize.environ_conv : 0.000005s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.04% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.18% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000017s : 0.14% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000490s : 4.13% validate : 0.000066s : 0.56% Time group info: ------[substitution.] 0.000257 34 14.34% : 0.000037s : 6: substitution.arithmetic_simplify 0.72% : 0.000002s : 2: substitution.elim_not_effective 0.55% : 0.000001s : 2: substitution.fold_const_symbol 2.71% : 0.000007s : 4: substitution.graph_param_transform 70.04% : 0.000180s : 4: substitution.inline 1.78% : 0.000005s : 4: substitution.j_node_and_user_rematch 1.91% : 0.000005s : 4: substitution.remove_not_recompute_node 1.86% : 0.000005s : 4: substitution.replace_old_param 6.09% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006017 2 87.51% : 0.005265s : 1: type_inference.infer 12.49% : 0.000752s : 1: type_inference.specialize ------[replace.] 0.000069 8 65.54% : 0.000045s : 4: replace.inline 34.46% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000191 8 92.80% : 0.000177s : 4: match.inline 7.20% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000236 1278 0.82% : 0.000002s : 13: predicate.accumulaten_eliminater 0.76% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.45% : 0.000001s : 8: predicate.addn_check_dump 0.81% : 0.000002s : 13: predicate.addn_zero_filter 0.71% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.22% : 0.000005s : 21: predicate.arithmetic_simplify 0.81% : 0.000002s : 13: predicate.cast_eliminate 0.55% : 0.000001s : 8: predicate.check_bprop_eliminate 0.44% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.56% : 0.000001s : 8: predicate.depend_value_elim 0.83% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.90% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.77% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.78% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.40% : 0.000001s : 4: predicate.elim_not_effective 0.34% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 0.99% : 0.000002s : 17: predicate.environ_add_const_eliminate 0.98% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_depend_swap 1.70% : 0.000004s : 25: predicate.environ_get_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.25% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.24% : 0.000005s : 21: predicate.float_depend_g_call 0.48% : 0.000001s : 8: predicate.float_environ_get_switch 0.66% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.69% : 0.000002s : 8: predicate.get_grad_eliminate 0.30% : 0.000001s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.53% : 0.000001s : 8: predicate.incorporate_call_switch 5.81% : 0.000014s : 58: predicate.inline 0.64% : 0.000002s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.77% : 0.000002s : 8: predicate.less_batch_normalization 1.58% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.22% : 0.000005s : 38: predicate.load_eliminater 0.83% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.26% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.64% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 8: predicate.merge_addn 0.49% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.72% : 0.000002s : 13: predicate.minmaximum_grad 1.15% : 0.000003s : 4: predicate.mutable_eliminate 0.31% : 0.000001s : 4: predicate.opt_reshape 0.31% : 0.000001s : 4: predicate.parallel_virtual_node 1.65% : 0.000004s : 21: predicate.partial_defer_inline 1.47% : 0.000003s : 21: predicate.partial_eliminate 0.92% : 0.000002s : 13: predicate.print_const_string_wrapper 0.68% : 0.000002s : 8: predicate.reduce_all_const_elim 0.99% : 0.000002s : 13: predicate.reduce_eliminate 2.22% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.53% : 0.000001s : 8: predicate.remove_not_recompute_node 1.19% : 0.000003s : 25: predicate.replace_applicator 0.44% : 0.000001s : 8: predicate.replace_old_param 0.33% : 0.000001s : 4: predicate.reset_defer_inline 0.79% : 0.000002s : 13: predicate.reshape_eliminate 0.61% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 4: predicate.row_tensor_eliminate 0.74% : 0.000002s : 8: predicate.same_eliminate 0.41% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 8: predicate.shard_identity_eliminate 0.62% : 0.000001s : 8: predicate.special_op_eliminate 0.62% : 0.000001s : 8: predicate.specialize_transform 0.79% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.65% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.27% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.33% : 0.000003s : 21: predicate.switch_defer_inline 1.94% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.92% : 0.000012s : 67: predicate.switch_simplify 0.85% : 0.000002s : 13: predicate.tile_eliminate 0.81% : 0.000002s : 13: predicate.transpose_eliminate 1.51% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.43% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 10.06% : 0.000024s : 21: predicate.tuple_list_get_item_depend_reorder 2.85% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.49% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.05% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.53% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.12% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.61% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 4: predicate.value_based_eliminate 0.66% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 8: predicate.virtual_output_eliminate 0.36% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000592 11 54.64% : 0.000323s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.36% : 0.000268s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.317342 192 0.00% : 0.000004s : 1: ForceFp32Comm 46.75% : 0.148364s : 1: add_attr 46.75% : 0.148349s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.02% : 0.000059s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.02% : 0.000071s : 1: auto_monad 0.01% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.16% : 0.000498s : 1: bootstrap 0.01% : 0.000034s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000010s : 1: convert_after_rewriter 0.01% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000008s : 1: environ_conv 0.01% : 0.000027s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000009s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.15% : 0.000467s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.22% : 0.000703s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000018s : 1: opt.transform.mutable_eliminate 0.42% : 0.001332s : 78: opt.transform.opt_a 0.01% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000103s : 28: opt.transform.opt_b 0.02% : 0.000076s : 2: opt.transform.opt_trans_graph 0.01% : 0.000037s : 4: opt.transform.symbol_engine_opt 0.98% : 0.003114s : 1: opt_a 0.04% : 0.000112s : 1: opt_after_cconv 0.16% : 0.000499s : 1: opt_after_jit_grad 0.07% : 0.000212s : 1: opt_b 1.71% : 0.005441s : 1: optimize 0.01% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.01% : 0.000025s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000049s : 1: pre_auto_parallel 0.01% : 0.000040s : 1: py_interpret_to_execute 0.00% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000017s : 1: remove_dup_value 0.14% : 0.000452s : 1: renormalize.infer 0.12% : 0.000382s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000042s : 1: rewriter_after_opt_a 0.03% : 0.000105s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000081s : 1: symbol_engine_optimizer 0.04% : 0.000112s : 1: tuple_transform 1.92% : 0.006102s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:42.209.523 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:42.209.804 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0181586, [21] [bootstrap]: 0.00042262 [type_inference]: 0.00626839 [event_method]: 1.942e-05 [auto_monad]: 6.825e-05 [graph_reusing]: 7.1e-06 [inline]: 2.87002e-06 [add_attr]: 0.00383036, [1] [add_attr_with_inline]: 0.00381911, [1] [Cycle 1]: 8.805e-05, [2] [tag_attr]: 2.408e-05 [meta_addattr_fg_expand]: 6.28998e-06 [parallel-infer-symbol]: 3.45003e-06 [pre_auto_parallel]: 4.008e-05 [insert-virtual-dataset]: 2.49001e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.39999e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.0062299, [53] [py_interpret_to_execute]: 2.883e-05 [rewriter_before_opt_a]: 9.668e-05 [opt_a]: 0.00366364, [2] [Cycle 1]: 0.00273537, [45] [expand_dump_flag]: 3.03998e-06 [switch_simplify]: 4.218e-05 [loop_unroll]: 3.046e-05 [a_1]: 0.00089163 [with_stream_mark]: 2.527e-05 [recompute_prepare]: 9.20001e-06 [updatestate_depend_eliminate]: 5.94e-06 [updatestate_assign_eliminate]: 3.55e-06 [updatestate_loads_eliminate]: 2.99001e-06 [parameter_eliminate]: 2.90998e-06 [a_2]: 0.0001244 [accelerated_algorithm]: 7.89002e-06 [shard]: 2.61e-06 [meta_shard_fg_expand]: 2.84001e-06 [shard_inline]: 6.65998e-06 [merge_send_recv]: 9.47999e-06 [auto_parallel]: 9.02999e-06 [parallel]: 2.063e-05 [flash_sp]: 1.028e-05 [merge_comm]: 3.83999e-06 [allreduce_fusion]: 3.63999e-06 [matmul_add_comm_reduction]: 1.08e-05 [allreduce_slice_to_reducescatter]: 7.59988e-07 [virtual_shard_identity]: 8.40001e-06 [virtual_dataset]: 7.21001e-06 [get_grad_eliminate_]: 6.79999e-06 [virtual_output]: 7.36999e-06 [merge_forward]: 4.13999e-06 [cell_reuse_recompute_pass]: 2.14999e-06 [offload_activation]: 1.08e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.437e-05 [merge_recompute_call_nodes]: 1.73002e-06 [before_grad]: 1.165e-05 [set_forward_comm_id_for_comm_node_pass]: 3.68e-06 [meta_fg_expand]: 3.18e-06 [flash_sp_send_recv_attached]: 3.00002e-06 [receive_attached]: 2.62001e-06 [after_resolve]: 1.271e-05 [a_after_grad]: 1.093e-05 [renormalize]: 0.00087391 [add_forward_monad_depend]: 6.79001e-06 [auto_monad_grad]: 2.55002e-06 [auto_monad_eliminator]: 1.734e-05 [cse]: 2.938e-05 [a_3]: 6.257e-05 [Cycle 2]: 0.00091192, [45] [expand_dump_flag]: 2.61e-06 [switch_simplify]: 7.72998e-06 [loop_unroll]: 6.41998e-06 [a_1]: 0.00015052 [with_stream_mark]: 1.825e-05 [recompute_prepare]: 7.08e-06 [updatestate_depend_eliminate]: 3.25e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 2.16e-06 [parameter_eliminate]: 1.47001e-06 [a_2]: 0.00011063 [accelerated_algorithm]: 6.58e-06 [shard]: 1.90001e-06 [meta_shard_fg_expand]: 1.94e-06 [shard_inline]: 6.89001e-06 [merge_send_recv]: 6.15002e-06 [auto_parallel]: 7.18e-06 [parallel]: 7.19001e-06 [flash_sp]: 3.73999e-06 [merge_comm]: 3.75e-06 [allreduce_fusion]: 3.41999e-06 [matmul_add_comm_reduction]: 1.165e-05 [allreduce_slice_to_reducescatter]: 4.69998e-07 [virtual_shard_identity]: 7.26001e-06 [virtual_dataset]: 6.07999e-06 [get_grad_eliminate_]: 6.05002e-06 [virtual_output]: 5.94e-06 [merge_forward]: 3.8e-06 [cell_reuse_recompute_pass]: 2.58e-06 [offload_activation]: 8.58001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.416e-05 [merge_recompute_call_nodes]: 1.11002e-06 [before_grad]: 9.82999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.35e-06 [meta_fg_expand]: 2.47001e-06 [flash_sp_send_recv_attached]: 1.71002e-06 [receive_attached]: 2.74999e-06 [after_resolve]: 1.208e-05 [a_after_grad]: 9.79999e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.49e-06 [auto_monad_grad]: 2.09e-06 [auto_monad_eliminator]: 9.61e-06 [cse]: 1.96e-05 [a_3]: 5.566e-05 [py_interpret_to_execute_after_opt_a]: 1.746e-05 [slice_cell_reuse_recomputed_activation]: 5.39e-06 [rewriter_after_opt_a]: 4.439e-05 [convert_after_rewriter]: 1.009e-05 [order_py_execute_after_rewriter]: 8.34002e-06 [mutable_eliminate]: 0.00070861 [opt_b]: 0.00027604, [1] [Cycle 1]: 0.00026453, [7] [b_1]: 0.00017017 [b_2]: 9.72999e-06 [updatestate_depend_eliminate]: 5.64e-06 [updatestate_assign_eliminate]: 2.49999e-06 [updatestate_loads_eliminate]: 2.58998e-06 [renormalize]: 9.50007e-07 [cse]: 1.732e-05 [optimize_parallel_all_gather_comm]: 2.169e-05 [overlap_param_gather]: 5.27001e-06 [cconv]: 3.513e-05 [loop_unroll]: 0.00047224 [opt_after_cconv]: 0.00013081, [1] [Cycle 1]: 0.00012106, [7] [c_1]: 3.197e-05 [parameter_eliminate]: 3.04999e-06 [updatestate_depend_eliminate]: 5.34e-06 [updatestate_assign_eliminate]: 2.63e-06 [updatestate_loads_eliminate]: 2.73e-06 [cse]: 1.772e-05 [renormalize]: 3.00002e-07 [remove_dup_value]: 1.632e-05 [tuple_transform]: 9.259e-05, [1] [Cycle 1]: 8.461e-05, [4] [d_1]: 4.532e-05 [none_parameter_eliminate]: 1.07998e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 7.28e-06 [partial_unused_args_eliminate]: 5.07e-06 [add_recomputation]: 5.493e-05 [cse_after_recomputation]: 2.841e-05, [1] [Cycle 1]: 2.108e-05, [1] [cse]: 1.15e-05 [environ_conv]: 8.37e-06 [swap_dp_allreduce_reducescatter]: 8.22e-06 [bias_add_comm_swap]: 5.49e-06 [label_micro_interleaved_index]: 7.97998e-06 [label_fine_grained_interleaved_index]: 5.56e-06 [merge_cast_opt]: 3.81001e-06 [slice_recompute_activation]: 4.62998e-06 [micro_interleaved_order_control]: 5.09e-06 [assign_add_opt]: 3.97e-06 [ForceFp32Comm]: 3.51999e-06 [remove_cast_before_assign_add]: 4.13001e-06 [full_micro_interleaved_order_control]: 5.22e-06 [reorder_send_recv_between_fp_bp]: 5.91e-06 [comm_op_add_attrs]: 3.64002e-06 [add_comm_op_reuse_tag]: 3.6e-06 [interleave_split_concat_branches]: 3.81999e-06 [interleave_parallel_branches]: 3.77998e-06 [overlap_opt_shard_in_pipeline]: 3.97002e-06 [overlap_opt_shard_grad_in_pipeline]: 5.13002e-06 [control_data_broadcast_order]: 1.724e-05 [grouped_pairwise_exchange_alltoall]: 3.90998e-06 [offloading_packed_experts]: 6.71e-06 [overlap_recompute_and_grad_model_parallel]: 7.87e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.68999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.73001e-06 [overlap_recompute_comm]: 4.90999e-06 [overlap_grad_ring_attention]: 6.91999e-06 [overlap_grad_flash_sp]: 2.477e-05 [begin_end_overlap_inline]: 3.08998e-06 [split_matmul_comm_elemetwise]: 4.48999e-06 [split_layernorm_comm]: 4.50999e-06 [handle_group_info]: 3.55003e-06 [symbol_engine_optimizer]: 9.884e-05, [1] [Cycle 1]: 9.18e-05, [6] [build]: 3.7e-06 [elim_shapecalc]: 9.53002e-06 [elim_not_effective]: 1.299e-05 [opt_reshape]: 7.18e-06 [fold_const_symbol]: 1.164e-05 [renormalize]: 1.69995e-07 [detach_backward]: 3.86999e-06 [pipeline_parallel_scheduler]: 2.06e-06 [auto_monad_reorder]: 1.969e-05 [get_jit_bprop_graph]: 2.08002e-06 [rewriter_after_jit_bprop_graph]: 5.94e-06 [opt_after_jit_grad]: 0.00057525 [validate]: 4.383e-05 Sums bootstrap : 0.000423s : 3.38% type_inference : 0.006268s : 50.06% event_method : 0.000019s : 0.16% auto_monad : 0.000068s : 0.55% graph_reusing : 0.000007s : 0.06% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000040s : 0.32% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.23% optimize.rewriter_before_opt_a : 0.000097s : 0.77% optimize.opt_a.expand_dump_flag : 0.000006s : 0.05% optimize.opt_a.switch_simplify : 0.000050s : 0.40% optimize.opt_a.loop_unroll : 0.000037s : 0.29% optimize.opt_a.a_1 : 0.001042s : 8.32% optimize.opt_a.with_stream_mark : 0.000044s : 0.35% optimize.opt_a.recompute_prepare : 0.000016s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000235s : 1.88% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.12% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000014s : 0.11% optimize.opt_a.merge_send_recv : 0.000016s : 0.12% optimize.opt_a.auto_parallel : 0.000016s : 0.13% optimize.opt_a.parallel : 0.000028s : 0.22% optimize.opt_a.flash_sp : 0.000014s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.13% optimize.opt_a.virtual_dataset : 0.000013s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.10% optimize.opt_a.virtual_output : 0.000013s : 0.11% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.04% optimize.opt_a.offload_activation : 0.000019s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.23% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.17% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000025s : 0.20% optimize.opt_a.a_after_grad : 0.000021s : 0.17% optimize.opt_a.renormalize : 0.000874s : 6.98% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.22% optimize.opt_a.cse : 0.000049s : 0.39% optimize.opt_a.a_3 : 0.000118s : 0.94% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000044s : 0.35% optimize.convert_after_rewriter : 0.000010s : 0.08% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000709s : 5.66% optimize.opt_b.b_1 : 0.000170s : 1.36% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000017s : 0.14% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000035s : 0.28% optimize.loop_unroll : 0.000472s : 3.77% optimize.opt_after_cconv.c_1 : 0.000032s : 0.26% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.14% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.13% optimize.tuple_transform.d_1 : 0.000045s : 0.36% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000055s : 0.44% optimize.cse_after_recomputation.cse : 0.000012s : 0.09% optimize.environ_conv : 0.000008s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000008s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000025s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000005s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.10% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.16% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000575s : 4.59% validate : 0.000044s : 0.35% Time group info: ------[substitution.] 0.000414 34 48.71% : 0.000201s : 6: substitution.arithmetic_simplify 0.46% : 0.000002s : 2: substitution.elim_not_effective 0.45% : 0.000002s : 2: substitution.fold_const_symbol 1.18% : 0.000005s : 4: substitution.graph_param_transform 41.14% : 0.000170s : 4: substitution.inline 1.06% : 0.000004s : 4: substitution.j_node_and_user_rematch 1.21% : 0.000005s : 4: substitution.remove_not_recompute_node 1.38% : 0.000006s : 4: substitution.replace_old_param 4.41% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006209 2 86.94% : 0.005399s : 1: type_inference.infer 13.06% : 0.000811s : 1: type_inference.specialize ------[replace.] 0.000093 8 64.08% : 0.000059s : 4: replace.inline 35.92% : 0.000033s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000184 8 91.29% : 0.000168s : 4: match.inline 8.71% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000218 1278 1.07% : 0.000002s : 13: predicate.accumulaten_eliminater 0.94% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 0.91% : 0.000002s : 13: predicate.addn_zero_filter 0.79% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.59% : 0.000006s : 21: predicate.arithmetic_simplify 0.96% : 0.000002s : 13: predicate.cast_eliminate 0.61% : 0.000001s : 8: predicate.check_bprop_eliminate 0.51% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.63% : 0.000001s : 8: predicate.depend_value_elim 0.87% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.03% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.96% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.12% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.35% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.36% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_depend_swap 1.76% : 0.000004s : 25: predicate.environ_get_eliminate 1.00% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.41% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.57% : 0.000006s : 21: predicate.float_depend_g_call 0.80% : 0.000002s : 8: predicate.float_environ_get_switch 0.94% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.61% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.56% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.20% : 0.000014s : 58: predicate.inline 0.77% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.77% : 0.000002s : 8: predicate.less_batch_normalization 1.68% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.33% : 0.000005s : 38: predicate.load_eliminater 0.85% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.24% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.52% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.62% : 0.000001s : 8: predicate.merge_addn 0.56% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.76% : 0.000002s : 13: predicate.minmaximum_grad 0.83% : 0.000002s : 4: predicate.mutable_eliminate 0.34% : 0.000001s : 4: predicate.opt_reshape 0.40% : 0.000001s : 4: predicate.parallel_virtual_node 1.70% : 0.000004s : 21: predicate.partial_defer_inline 1.61% : 0.000004s : 21: predicate.partial_eliminate 0.83% : 0.000002s : 13: predicate.print_const_string_wrapper 0.68% : 0.000001s : 8: predicate.reduce_all_const_elim 1.36% : 0.000003s : 13: predicate.reduce_eliminate 2.35% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 8: predicate.remove_not_recompute_node 1.35% : 0.000003s : 25: predicate.replace_applicator 0.65% : 0.000001s : 8: predicate.replace_old_param 0.27% : 0.000001s : 4: predicate.reset_defer_inline 1.03% : 0.000002s : 13: predicate.reshape_eliminate 0.66% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.90% : 0.000002s : 4: predicate.row_tensor_eliminate 0.90% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.79% : 0.000002s : 8: predicate.shard_identity_eliminate 0.70% : 0.000002s : 8: predicate.special_op_eliminate 0.71% : 0.000002s : 8: predicate.specialize_transform 0.82% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.47% : 0.000003s : 21: predicate.switch_defer_inline 1.98% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.90% : 0.000011s : 67: predicate.switch_simplify 0.83% : 0.000002s : 13: predicate.tile_eliminate 0.96% : 0.000002s : 13: predicate.transpose_eliminate 1.63% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.46% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.50% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.41% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.45% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.72% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.38% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.31% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 4: predicate.value_based_eliminate 0.58% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000626 11 51.14% : 0.000320s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.86% : 0.000306s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030718 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.50% : 0.003841s : 1: add_attr 12.45% : 0.003823s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.19% : 0.000058s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.25% : 0.000078s : 1: auto_monad 0.09% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.51% : 0.000465s : 1: bootstrap 0.13% : 0.000039s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.10% : 0.000032s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000021s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.10% : 0.000030s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000009s : 1: get_jit_bprop_graph 0.05% : 0.000014s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000011s : 1: label_micro_interleaved_index 1.56% : 0.000478s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.33% : 0.000715s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.05% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000015s : 1: opt.transform.mutable_eliminate 4.86% : 0.001493s : 78: opt.transform.opt_a 0.10% : 0.000030s : 1: opt.transform.opt_after_cconv 0.09% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.35% : 0.000106s : 28: opt.transform.opt_b 0.16% : 0.000050s : 2: opt.transform.opt_trans_graph 0.12% : 0.000037s : 4: opt.transform.symbol_engine_opt 11.94% : 0.003667s : 1: opt_a 0.44% : 0.000134s : 1: opt_after_cconv 1.91% : 0.000586s : 1: opt_after_jit_grad 0.91% : 0.000280s : 1: opt_b 21.41% : 0.006577s : 1: optimize 0.08% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000048s : 1: pre_auto_parallel 0.11% : 0.000033s : 1: py_interpret_to_execute 0.07% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.06% : 0.000019s : 1: remove_dup_value 1.56% : 0.000479s : 1: renormalize.infer 1.25% : 0.000385s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000048s : 1: rewriter_after_opt_a 0.33% : 0.000101s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000009s : 1: slice_recompute_activation 0.02% : 0.000008s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000102s : 1: symbol_engine_optimizer 0.31% : 0.000095s : 1: tuple_transform 20.55% : 0.006313s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:44.278.066 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.187352, [21] [bootstrap]: 0.00056389 [type_inference]: 0.133219 [event_method]: 2.385e-05 [auto_monad]: 7.24e-05 [graph_reusing]: 6.64001e-06 [inline]: 3.4e-06 [add_attr]: 0.00383837, [1] [add_attr_with_inline]: 0.0038245, [1] [Cycle 1]: 7.651e-05, [2] [tag_attr]: 2.525e-05 [meta_addattr_fg_expand]: 5.93002e-06 [parallel-infer-symbol]: 4.12998e-06 [pre_auto_parallel]: 4.581e-05 [insert-virtual-dataset]: 2.70002e-06 [parallel-infer-symbol-second]: 9.39996e-07 [dataset_repeat_opt]: 2.17999e-06 [pipeline_split]: 1.97999e-06 [optimize]: 0.0488346, [53] [py_interpret_to_execute]: 3.291e-05 [rewriter_before_opt_a]: 9.535e-05 [opt_a]: 0.0464428, [2] [Cycle 1]: 0.0456178, [45] [expand_dump_flag]: 3.01999e-06 [switch_simplify]: 4.524e-05 [loop_unroll]: 3.056e-05 [a_1]: 0.0007043 [with_stream_mark]: 2.196e-05 [recompute_prepare]: 9.66e-06 [updatestate_depend_eliminate]: 4.79002e-06 [updatestate_assign_eliminate]: 3.31999e-06 [updatestate_loads_eliminate]: 2.96999e-06 [parameter_eliminate]: 2.54001e-06 [a_2]: 9.144e-05 [accelerated_algorithm]: 7.85998e-06 [shard]: 2.25002e-06 [meta_shard_fg_expand]: 2.43e-06 [shard_inline]: 7.45e-06 [merge_send_recv]: 9.02999e-06 [auto_parallel]: 7.22002e-06 [parallel]: 2.228e-05 [flash_sp]: 9.18002e-06 [merge_comm]: 4.45e-06 [allreduce_fusion]: 3.54002e-06 [matmul_add_comm_reduction]: 1.051e-05 [allreduce_slice_to_reducescatter]: 7.60017e-07 [virtual_shard_identity]: 8.80999e-06 [virtual_dataset]: 7e-06 [get_grad_eliminate_]: 6.52001e-06 [virtual_output]: 6.88e-06 [merge_forward]: 4.75001e-06 [cell_reuse_recompute_pass]: 1.55001e-06 [offload_activation]: 1.128e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.483e-05 [merge_recompute_call_nodes]: 2.04e-06 [before_grad]: 1.137e-05 [set_forward_comm_id_for_comm_node_pass]: 4.05e-06 [meta_fg_expand]: 2.81e-06 [flash_sp_send_recv_attached]: 3.16001e-06 [receive_attached]: 2.53998e-06 [after_resolve]: 1.248e-05 [a_after_grad]: 1.058e-05 [renormalize]: 0.0440829 [add_forward_monad_depend]: 1.157e-05 [auto_monad_grad]: 3.04001e-06 [auto_monad_eliminator]: 2.542e-05 [cse]: 2.997e-05 [a_3]: 6.731e-05 [Cycle 2]: 0.00081202, [45] [expand_dump_flag]: 2.37001e-06 [switch_simplify]: 9.57999e-06 [loop_unroll]: 7.4e-06 [a_1]: 0.00016731 [with_stream_mark]: 2.159e-05 [recompute_prepare]: 7.36001e-06 [updatestate_depend_eliminate]: 4.23001e-06 [updatestate_assign_eliminate]: 3.23998e-06 [updatestate_loads_eliminate]: 3.33e-06 [parameter_eliminate]: 2.66e-06 [a_2]: 8.102e-05 [accelerated_algorithm]: 6.98e-06 [shard]: 2.99001e-06 [meta_shard_fg_expand]: 2.51998e-06 [shard_inline]: 6.54001e-06 [merge_send_recv]: 8.72e-06 [auto_parallel]: 9.28997e-06 [parallel]: 1.035e-05 [flash_sp]: 4.38001e-06 [merge_comm]: 3.56001e-06 [allreduce_fusion]: 3.53999e-06 [matmul_add_comm_reduction]: 1.014e-05 [allreduce_slice_to_reducescatter]: 9.30013e-07 [virtual_shard_identity]: 7.65998e-06 [virtual_dataset]: 6.74999e-06 [get_grad_eliminate_]: 6.08002e-06 [virtual_output]: 6.16e-06 [merge_forward]: 4.58001e-06 [cell_reuse_recompute_pass]: 3.18e-06 [offload_activation]: 4.71e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.34e-05 [merge_recompute_call_nodes]: 1.44998e-06 [before_grad]: 1.15e-05 [set_forward_comm_id_for_comm_node_pass]: 4.02e-06 [meta_fg_expand]: 2.59999e-06 [flash_sp_send_recv_attached]: 2.07001e-06 [receive_attached]: 2.15002e-06 [after_resolve]: 1.278e-05 [a_after_grad]: 1.02e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.62999e-06 [auto_monad_grad]: 1.67001e-06 [auto_monad_eliminator]: 8.87999e-06 [cse]: 1.623e-05 [a_3]: 3.893e-05 [py_interpret_to_execute_after_opt_a]: 1.843e-05 [slice_cell_reuse_recomputed_activation]: 2.24001e-06 [rewriter_after_opt_a]: 3.926e-05 [convert_after_rewriter]: 6.69001e-06 [order_py_execute_after_rewriter]: 4.94e-06 [mutable_eliminate]: 0.00073336 [opt_b]: 0.00022352, [1] [Cycle 1]: 0.00021512, [7] [b_1]: 0.00013191 [b_2]: 1.012e-05 [updatestate_depend_eliminate]: 6.31e-06 [updatestate_assign_eliminate]: 2.82002e-06 [updatestate_loads_eliminate]: 2.53003e-06 [renormalize]: 8.89995e-07 [cse]: 2.096e-05 [optimize_parallel_all_gather_comm]: 1.833e-05 [overlap_param_gather]: 1.97999e-06 [cconv]: 3.139e-05 [loop_unroll]: 0.00048232 [opt_after_cconv]: 0.00010495, [1] [Cycle 1]: 9.888e-05, [7] [c_1]: 3.268e-05 [parameter_eliminate]: 3.90998e-06 [updatestate_depend_eliminate]: 5.61e-06 [updatestate_assign_eliminate]: 2.58e-06 [updatestate_loads_eliminate]: 2.26998e-06 [cse]: 1.736e-05 [renormalize]: 6.09987e-07 [remove_dup_value]: 1.496e-05 [tuple_transform]: 7.863e-05, [1] [Cycle 1]: 7.427e-05, [4] [d_1]: 4.638e-05 [none_parameter_eliminate]: 1.52001e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 6.89001e-06 [partial_unused_args_eliminate]: 2.07999e-06 [add_recomputation]: 5.393e-05 [cse_after_recomputation]: 2.351e-05, [1] [Cycle 1]: 1.834e-05, [1] [cse]: 1.243e-05 [environ_conv]: 5.42001e-06 [swap_dp_allreduce_reducescatter]: 5.31002e-06 [bias_add_comm_swap]: 4.20999e-06 [label_micro_interleaved_index]: 4.52e-06 [label_fine_grained_interleaved_index]: 2.81e-06 [merge_cast_opt]: 1.71e-06 [slice_recompute_activation]: 2.29999e-06 [micro_interleaved_order_control]: 2.83e-06 [assign_add_opt]: 1.24998e-06 [ForceFp32Comm]: 1.27999e-06 [remove_cast_before_assign_add]: 1.32e-06 [full_micro_interleaved_order_control]: 2.31998e-06 [reorder_send_recv_between_fp_bp]: 2.94001e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 1.09e-06 [interleave_split_concat_branches]: 1.20001e-06 [interleave_parallel_branches]: 1.29e-06 [overlap_opt_shard_in_pipeline]: 1.40999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.56e-06 [control_data_broadcast_order]: 1.315e-05 [grouped_pairwise_exchange_alltoall]: 1.69998e-06 [offloading_packed_experts]: 4.35e-06 [overlap_recompute_and_grad_model_parallel]: 5.04998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.29e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40999e-06 [overlap_recompute_comm]: 2.17001e-06 [overlap_grad_ring_attention]: 4.08999e-06 [overlap_grad_flash_sp]: 2.295e-05 [begin_end_overlap_inline]: 5.49975e-07 [split_matmul_comm_elemetwise]: 2.45997e-06 [split_layernorm_comm]: 2.17999e-06 [handle_group_info]: 1.37999e-06 [symbol_engine_optimizer]: 8.417e-05, [1] [Cycle 1]: 7.936e-05, [6] [build]: 4.33999e-06 [elim_shapecalc]: 1.135e-05 [elim_not_effective]: 1.483e-05 [opt_reshape]: 8.1e-06 [fold_const_symbol]: 1.075e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.43e-06 [pipeline_parallel_scheduler]: 2.16e-06 [auto_monad_reorder]: 1.762e-05 [get_jit_bprop_graph]: 2.58998e-06 [rewriter_after_jit_bprop_graph]: 4.70999e-06 [opt_after_jit_grad]: 0.00048835 [validate]: 4.807e-05 Sums bootstrap : 0.000564s : 0.31% type_inference : 0.133219s : 73.01% event_method : 0.000024s : 0.01% auto_monad : 0.000072s : 0.04% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000046s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.02% optimize.rewriter_before_opt_a : 0.000095s : 0.05% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000055s : 0.03% optimize.opt_a.loop_unroll : 0.000038s : 0.02% optimize.opt_a.a_1 : 0.000872s : 0.48% optimize.opt_a.with_stream_mark : 0.000044s : 0.02% optimize.opt_a.recompute_prepare : 0.000017s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000172s : 0.09% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.01% optimize.opt_a.merge_send_recv : 0.000018s : 0.01% optimize.opt_a.auto_parallel : 0.000017s : 0.01% optimize.opt_a.parallel : 0.000033s : 0.02% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.01% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000058s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000025s : 0.01% optimize.opt_a.a_after_grad : 0.000021s : 0.01% optimize.opt_a.renormalize : 0.044083s : 24.16% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.02% optimize.opt_a.cse : 0.000046s : 0.03% optimize.opt_a.a_3 : 0.000106s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000039s : 0.02% optimize.convert_after_rewriter : 0.000007s : 0.00% optimize.order_py_execute_after_rewriter : 0.000005s : 0.00% optimize.mutable_eliminate : 0.000733s : 0.40% optimize.opt_b.b_1 : 0.000132s : 0.07% optimize.opt_b.b_2 : 0.000010s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000031s : 0.02% optimize.loop_unroll : 0.000482s : 0.26% optimize.opt_after_cconv.c_1 : 0.000033s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000017s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.01% optimize.tuple_transform.d_1 : 0.000046s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000054s : 0.03% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000005s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000023s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.01% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000488s : 0.27% validate : 0.000048s : 0.03% Time group info: ------[substitution.] 0.000267 34 16.30% : 0.000044s : 6: substitution.arithmetic_simplify 0.92% : 0.000002s : 2: substitution.elim_not_effective 0.57% : 0.000002s : 2: substitution.fold_const_symbol 2.51% : 0.000007s : 4: substitution.graph_param_transform 66.72% : 0.000178s : 4: substitution.inline 1.72% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.18% : 0.000006s : 4: substitution.remove_not_recompute_node 2.43% : 0.000006s : 4: substitution.replace_old_param 6.65% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.133135 2 99.24% : 0.132121s : 1: type_inference.infer 0.76% : 0.001014s : 1: type_inference.specialize ------[replace.] 0.000068 8 64.35% : 0.000044s : 4: replace.inline 35.65% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000191 8 91.68% : 0.000175s : 4: match.inline 8.32% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000228 1278 1.04% : 0.000002s : 13: predicate.accumulaten_eliminater 0.67% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 1.21% : 0.000003s : 13: predicate.addn_zero_filter 0.74% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.99% : 0.000007s : 21: predicate.arithmetic_simplify 0.94% : 0.000002s : 13: predicate.cast_eliminate 0.54% : 0.000001s : 8: predicate.check_bprop_eliminate 0.51% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.56% : 0.000001s : 8: predicate.depend_value_elim 1.13% : 0.000003s : 13: predicate.dict_get_item_const_eliminator 1.20% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.79% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.89% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000003s : 17: predicate.environ_add_const_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 17: predicate.environ_get_depend_swap 1.83% : 0.000004s : 25: predicate.environ_get_eliminate 1.12% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.40% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.40% : 0.000005s : 21: predicate.float_depend_g_call 0.53% : 0.000001s : 8: predicate.float_environ_get_switch 0.84% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.81% : 0.000002s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.59% : 0.000015s : 58: predicate.inline 0.76% : 0.000002s : 8: predicate.inline_without_move 0.44% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.02% : 0.000002s : 8: predicate.less_batch_normalization 1.84% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.48% : 0.000006s : 38: predicate.load_eliminater 0.75% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.16% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.70% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 8: predicate.merge_addn 0.52% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 13: predicate.minmaximum_grad 1.21% : 0.000003s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.43% : 0.000001s : 4: predicate.parallel_virtual_node 1.70% : 0.000004s : 21: predicate.partial_defer_inline 1.52% : 0.000003s : 21: predicate.partial_eliminate 0.86% : 0.000002s : 13: predicate.print_const_string_wrapper 0.67% : 0.000002s : 8: predicate.reduce_all_const_elim 1.16% : 0.000003s : 13: predicate.reduce_eliminate 2.37% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.52% : 0.000001s : 8: predicate.remove_not_recompute_node 1.42% : 0.000003s : 25: predicate.replace_applicator 0.57% : 0.000001s : 8: predicate.replace_old_param 0.25% : 0.000001s : 4: predicate.reset_defer_inline 0.99% : 0.000002s : 13: predicate.reshape_eliminate 0.62% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.50% : 0.000001s : 4: predicate.row_tensor_eliminate 0.86% : 0.000002s : 8: predicate.same_eliminate 0.38% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.03% : 0.000002s : 8: predicate.shard_identity_eliminate 0.60% : 0.000001s : 8: predicate.special_op_eliminate 0.62% : 0.000001s : 8: predicate.specialize_transform 1.06% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.42% : 0.000003s : 21: predicate.switch_defer_inline 1.88% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.86% : 0.000011s : 67: predicate.switch_simplify 0.92% : 0.000002s : 13: predicate.tile_eliminate 0.92% : 0.000002s : 13: predicate.transpose_eliminate 1.49% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.48% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.83% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 2.99% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.53% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.10% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.85% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.30% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.75% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 4: predicate.value_based_eliminate 0.78% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.65% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000753 11 41.67% : 0.000314s : 5: func_graph_cloner_run.FuncGraphClonerGraph 58.33% : 0.000439s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.285567 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.35% : 0.003844s : 1: add_attr 1.34% : 0.003829s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.02% : 0.000058s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.03% : 0.000077s : 1: auto_monad 0.01% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.21% : 0.000596s : 1: bootstrap 0.01% : 0.000035s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000010s : 1: convert_after_rewriter 0.01% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000009s : 1: environ_conv 0.01% : 0.000032s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.17% : 0.000490s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.26% : 0.000743s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000016s : 1: opt.transform.mutable_eliminate 0.47% : 0.001332s : 78: opt.transform.opt_a 0.01% : 0.000032s : 1: opt.transform.opt_after_cconv 0.01% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000106s : 28: opt.transform.opt_b 0.02% : 0.000051s : 2: opt.transform.opt_trans_graph 0.01% : 0.000041s : 4: opt.transform.symbol_engine_opt 16.26% : 0.046447s : 1: opt_a 0.04% : 0.000109s : 1: opt_after_cconv 0.17% : 0.000498s : 1: opt_after_jit_grad 0.08% : 0.000228s : 1: opt_b 17.10% : 0.048841s : 1: optimize 0.01% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.01% : 0.000027s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000050s : 1: pre_auto_parallel 0.01% : 0.000037s : 1: py_interpret_to_execute 0.01% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 15.25% : 0.043549s : 1: renormalize.infer 0.18% : 0.000515s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000043s : 1: rewriter_after_opt_a 0.04% : 0.000100s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000087s : 1: symbol_engine_optimizer 0.03% : 0.000082s : 1: tuple_transform 46.66% : 0.133242s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:46.787.991 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:46.788.267 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0191018, [21] [bootstrap]: 0.000444 [type_inference]: 0.00673687 [event_method]: 2.377e-05 [auto_monad]: 7.038e-05 [graph_reusing]: 6.43998e-06 [inline]: 2.81e-06 [add_attr]: 0.00398345, [1] [add_attr_with_inline]: 0.00396956, [1] [Cycle 1]: 0.00010363, [2] [tag_attr]: 2.583e-05 [meta_addattr_fg_expand]: 6.09001e-06 [parallel-infer-symbol]: 3.55003e-06 [pre_auto_parallel]: 4.655e-05 [insert-virtual-dataset]: 2.56e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 1.84e-06 [pipeline_split]: 1.55001e-06 [optimize]: 0.00652035, [53] [py_interpret_to_execute]: 4.04e-05 [rewriter_before_opt_a]: 0.00010855 [opt_a]: 0.00368147, [2] [Cycle 1]: 0.00272827, [45] [expand_dump_flag]: 3.43e-06 [switch_simplify]: 4.366e-05 [loop_unroll]: 3.094e-05 [a_1]: 0.00071793 [with_stream_mark]: 5.889e-05 [recompute_prepare]: 1.239e-05 [updatestate_depend_eliminate]: 4.70999e-06 [updatestate_assign_eliminate]: 3.93001e-06 [updatestate_loads_eliminate]: 3.46001e-06 [parameter_eliminate]: 2.46e-06 [a_2]: 0.00012444 [accelerated_algorithm]: 7.58999e-06 [shard]: 2.21e-06 [meta_shard_fg_expand]: 2.09e-06 [shard_inline]: 7.14001e-06 [merge_send_recv]: 9.34e-06 [auto_parallel]: 9.19e-06 [parallel]: 2.254e-05 [flash_sp]: 1.177e-05 [merge_comm]: 4.72e-06 [allreduce_fusion]: 3.57002e-06 [matmul_add_comm_reduction]: 1.111e-05 [allreduce_slice_to_reducescatter]: 1.04003e-06 [virtual_shard_identity]: 3.804e-05 [virtual_dataset]: 9.02e-06 [get_grad_eliminate_]: 7.69002e-06 [virtual_output]: 7.59002e-06 [merge_forward]: 5.79999e-06 [cell_reuse_recompute_pass]: 2.43998e-06 [offload_activation]: 1.197e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.69e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.259e-05 [set_forward_comm_id_for_comm_node_pass]: 4.34002e-06 [meta_fg_expand]: 3.53999e-06 [flash_sp_send_recv_attached]: 3.31999e-06 [receive_attached]: 2.56e-06 [after_resolve]: 1.4e-05 [a_after_grad]: 1.217e-05 [renormalize]: 0.00090447 [add_forward_monad_depend]: 9.24e-06 [auto_monad_grad]: 2.43e-06 [auto_monad_eliminator]: 1.943e-05 [cse]: 3.102e-05 [a_3]: 7.104e-05 [Cycle 2]: 0.00093562, [45] [expand_dump_flag]: 2.61e-06 [switch_simplify]: 8.47e-06 [loop_unroll]: 7.24001e-06 [a_1]: 0.00015418 [with_stream_mark]: 2.216e-05 [recompute_prepare]: 6.63e-06 [updatestate_depend_eliminate]: 3.92998e-06 [updatestate_assign_eliminate]: 3.19001e-06 [updatestate_loads_eliminate]: 2.90002e-06 [parameter_eliminate]: 2.05002e-06 [a_2]: 0.0001096 [accelerated_algorithm]: 6.79001e-06 [shard]: 1.50001e-06 [meta_shard_fg_expand]: 2.14999e-06 [shard_inline]: 6.93998e-06 [merge_send_recv]: 6.49001e-06 [auto_parallel]: 8.90999e-06 [parallel]: 8e-06 [flash_sp]: 3.66999e-06 [merge_comm]: 3.82002e-06 [allreduce_fusion]: 3.84002e-06 [matmul_add_comm_reduction]: 8.47998e-06 [allreduce_slice_to_reducescatter]: 5.40022e-07 [virtual_shard_identity]: 7.76001e-06 [virtual_dataset]: 9.04e-06 [get_grad_eliminate_]: 5.97999e-06 [virtual_output]: 5.99999e-06 [merge_forward]: 4.52e-06 [cell_reuse_recompute_pass]: 2.81999e-06 [offload_activation]: 1.103e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.626e-05 [merge_recompute_call_nodes]: 1.44998e-06 [before_grad]: 1.096e-05 [set_forward_comm_id_for_comm_node_pass]: 5.43002e-06 [meta_fg_expand]: 2.32001e-06 [flash_sp_send_recv_attached]: 1.77001e-06 [receive_attached]: 2.36998e-06 [after_resolve]: 1.323e-05 [a_after_grad]: 1.05e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.77999e-06 [auto_monad_grad]: 1.97999e-06 [auto_monad_eliminator]: 1.288e-05 [cse]: 1.728e-05 [a_3]: 5.485e-05 [py_interpret_to_execute_after_opt_a]: 1.954e-05 [slice_cell_reuse_recomputed_activation]: 4.81002e-06 [rewriter_after_opt_a]: 4.983e-05 [convert_after_rewriter]: 1.257e-05 [order_py_execute_after_rewriter]: 8.27e-06 [mutable_eliminate]: 0.0007953 [opt_b]: 0.0003401, [1] [Cycle 1]: 0.00032846, [7] [b_1]: 0.00017647 [b_2]: 9.52001e-06 [updatestate_depend_eliminate]: 9.39998e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 2.79999e-06 [renormalize]: 1.20999e-06 [cse]: 2.517e-05 [optimize_parallel_all_gather_comm]: 2.4e-05 [overlap_param_gather]: 5.75001e-06 [cconv]: 3.89e-05 [loop_unroll]: 0.00053371 [opt_after_cconv]: 0.00013267, [1] [Cycle 1]: 0.00012358, [7] [c_1]: 3.245e-05 [parameter_eliminate]: 4.70999e-06 [updatestate_depend_eliminate]: 6.52001e-06 [updatestate_assign_eliminate]: 2.82002e-06 [updatestate_loads_eliminate]: 2.32999e-06 [cse]: 1.912e-05 [renormalize]: 5.59987e-07 [remove_dup_value]: 1.877e-05 [tuple_transform]: 9.628e-05, [1] [Cycle 1]: 8.901e-05, [4] [d_1]: 4.913e-05 [none_parameter_eliminate]: 1.69e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 7.31999e-06 [partial_unused_args_eliminate]: 4.32003e-06 [add_recomputation]: 5.769e-05 [cse_after_recomputation]: 2.768e-05, [1] [Cycle 1]: 2.075e-05, [1] [cse]: 1.161e-05 [environ_conv]: 8.92e-06 [swap_dp_allreduce_reducescatter]: 7.71999e-06 [bias_add_comm_swap]: 5.97999e-06 [label_micro_interleaved_index]: 7.08e-06 [label_fine_grained_interleaved_index]: 5.15001e-06 [merge_cast_opt]: 3.78001e-06 [slice_recompute_activation]: 4.64002e-06 [micro_interleaved_order_control]: 4.67e-06 [assign_add_opt]: 3.92998e-06 [ForceFp32Comm]: 3.95e-06 [remove_cast_before_assign_add]: 3.73999e-06 [full_micro_interleaved_order_control]: 4.93001e-06 [reorder_send_recv_between_fp_bp]: 5.84999e-06 [comm_op_add_attrs]: 3.68999e-06 [add_comm_op_reuse_tag]: 3.56001e-06 [interleave_split_concat_branches]: 3.85e-06 [interleave_parallel_branches]: 3.5e-06 [overlap_opt_shard_in_pipeline]: 3.73001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.58999e-06 [control_data_broadcast_order]: 1.692e-05 [grouped_pairwise_exchange_alltoall]: 4e-06 [offloading_packed_experts]: 7.31999e-06 [overlap_recompute_and_grad_model_parallel]: 7.31001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.68e-06 [overlap_recompute_allgather_and_fa_grad]: 3.71999e-06 [overlap_recompute_comm]: 5.15999e-06 [overlap_grad_ring_attention]: 6.88e-06 [overlap_grad_flash_sp]: 2.59e-05 [begin_end_overlap_inline]: 3.34001e-06 [split_matmul_comm_elemetwise]: 4.95001e-06 [split_layernorm_comm]: 4.42998e-06 [handle_group_info]: 3.63999e-06 [symbol_engine_optimizer]: 0.00010086, [1] [Cycle 1]: 9.372e-05, [6] [build]: 3.73001e-06 [elim_shapecalc]: 1.101e-05 [elim_not_effective]: 1.401e-05 [opt_reshape]: 7.36999e-06 [fold_const_symbol]: 1.014e-05 [renormalize]: 2.50002e-07 [detach_backward]: 4.07998e-06 [pipeline_parallel_scheduler]: 1.79e-06 [auto_monad_reorder]: 2.122e-05 [get_jit_bprop_graph]: 2.27999e-06 [rewriter_after_jit_bprop_graph]: 5.20001e-06 [opt_after_jit_grad]: 0.00055578 [validate]: 4.465e-05 Sums bootstrap : 0.000444s : 3.36% type_inference : 0.006737s : 51.01% event_method : 0.000024s : 0.18% auto_monad : 0.000070s : 0.53% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000047s : 0.35% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000040s : 0.31% optimize.rewriter_before_opt_a : 0.000109s : 0.82% optimize.opt_a.expand_dump_flag : 0.000006s : 0.05% optimize.opt_a.switch_simplify : 0.000052s : 0.39% optimize.opt_a.loop_unroll : 0.000038s : 0.29% optimize.opt_a.a_1 : 0.000872s : 6.60% optimize.opt_a.with_stream_mark : 0.000081s : 0.61% optimize.opt_a.recompute_prepare : 0.000019s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000005s : 0.03% optimize.opt_a.a_2 : 0.000234s : 1.77% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.11% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.11% optimize.opt_a.merge_send_recv : 0.000016s : 0.12% optimize.opt_a.auto_parallel : 0.000018s : 0.14% optimize.opt_a.parallel : 0.000031s : 0.23% optimize.opt_a.flash_sp : 0.000015s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000046s : 0.35% optimize.opt_a.virtual_dataset : 0.000018s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.10% optimize.opt_a.virtual_output : 0.000014s : 0.10% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.04% optimize.opt_a.offload_activation : 0.000023s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.07% optimize.opt_a.meta_fg_expand : 0.000006s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000027s : 0.21% optimize.opt_a.a_after_grad : 0.000023s : 0.17% optimize.opt_a.renormalize : 0.000905s : 6.85% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.24% optimize.opt_a.cse : 0.000048s : 0.37% optimize.opt_a.a_3 : 0.000126s : 0.95% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000050s : 0.38% optimize.convert_after_rewriter : 0.000013s : 0.10% optimize.order_py_execute_after_rewriter : 0.000008s : 0.06% optimize.mutable_eliminate : 0.000795s : 6.02% optimize.opt_b.b_1 : 0.000176s : 1.34% optimize.opt_b.b_2 : 0.000010s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000025s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.18% optimize.overlap_param_gather : 0.000006s : 0.04% optimize.cconv : 0.000039s : 0.29% optimize.loop_unroll : 0.000534s : 4.04% optimize.opt_after_cconv.c_1 : 0.000032s : 0.25% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.14% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.14% optimize.tuple_transform.d_1 : 0.000049s : 0.37% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000004s : 0.03% optimize.add_recomputation : 0.000058s : 0.44% optimize.cse_after_recomputation.cse : 0.000012s : 0.09% optimize.environ_conv : 0.000009s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.06% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.03% optimize.control_data_broadcast_order : 0.000017s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000026s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.16% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000556s : 4.21% validate : 0.000045s : 0.34% Time group info: ------[substitution.] 0.000268 34 15.34% : 0.000041s : 6: substitution.arithmetic_simplify 0.69% : 0.000002s : 2: substitution.elim_not_effective 0.58% : 0.000002s : 2: substitution.fold_const_symbol 2.35% : 0.000006s : 4: substitution.graph_param_transform 67.79% : 0.000182s : 4: substitution.inline 2.00% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.07% : 0.000006s : 4: substitution.remove_not_recompute_node 2.74% : 0.000007s : 4: substitution.replace_old_param 6.45% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006673 2 87.81% : 0.005859s : 1: type_inference.infer 12.19% : 0.000813s : 1: type_inference.specialize ------[replace.] 0.000069 8 66.96% : 0.000047s : 4: replace.inline 33.04% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000194 8 92.06% : 0.000179s : 4: match.inline 7.94% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000235 1278 0.84% : 0.000002s : 13: predicate.accumulaten_eliminater 0.89% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 8: predicate.addn_check_dump 0.80% : 0.000002s : 13: predicate.addn_zero_filter 0.70% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.28% : 0.000005s : 21: predicate.arithmetic_simplify 0.96% : 0.000002s : 13: predicate.cast_eliminate 0.72% : 0.000002s : 8: predicate.check_bprop_eliminate 0.54% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.61% : 0.000001s : 8: predicate.depend_value_elim 0.81% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.19% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.81% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.08% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 4: predicate.elim_not_effective 0.47% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 0.98% : 0.000002s : 17: predicate.environ_add_const_eliminate 0.95% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.95% : 0.000002s : 17: predicate.environ_get_depend_swap 1.66% : 0.000004s : 25: predicate.environ_get_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.31% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.47% : 0.000006s : 21: predicate.float_depend_g_call 0.66% : 0.000002s : 8: predicate.float_environ_get_switch 0.72% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 4: predicate.fold_const_symbol 0.66% : 0.000002s : 8: predicate.get_grad_eliminate 0.35% : 0.000001s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.44% : 0.000001s : 8: predicate.incorporate_call_switch 6.52% : 0.000015s : 58: predicate.inline 1.02% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.91% : 0.000002s : 8: predicate.less_batch_normalization 1.92% : 0.000005s : 25: predicate.list_to_tuple_eliminator_ 2.19% : 0.000005s : 38: predicate.load_eliminater 1.40% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.24% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.51% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.54% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.67% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 13: predicate.minmaximum_grad 1.88% : 0.000004s : 4: predicate.mutable_eliminate 0.30% : 0.000001s : 4: predicate.opt_reshape 0.53% : 0.000001s : 4: predicate.parallel_virtual_node 1.79% : 0.000004s : 21: predicate.partial_defer_inline 1.44% : 0.000003s : 21: predicate.partial_eliminate 0.79% : 0.000002s : 13: predicate.print_const_string_wrapper 0.56% : 0.000001s : 8: predicate.reduce_all_const_elim 1.24% : 0.000003s : 13: predicate.reduce_eliminate 2.27% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.63% : 0.000001s : 8: predicate.remove_not_recompute_node 1.44% : 0.000003s : 25: predicate.replace_applicator 0.54% : 0.000001s : 8: predicate.replace_old_param 0.43% : 0.000001s : 4: predicate.reset_defer_inline 0.78% : 0.000002s : 13: predicate.reshape_eliminate 0.93% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 4: predicate.row_tensor_eliminate 1.09% : 0.000003s : 8: predicate.same_eliminate 0.52% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.97% : 0.000002s : 8: predicate.shard_identity_eliminate 0.66% : 0.000002s : 8: predicate.special_op_eliminate 0.60% : 0.000001s : 8: predicate.specialize_transform 1.25% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 1.12% : 0.000003s : 8: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.39% : 0.000003s : 21: predicate.switch_defer_inline 1.92% : 0.000005s : 29: predicate.switch_layer_defer_inline 5.00% : 0.000012s : 67: predicate.switch_simplify 0.83% : 0.000002s : 13: predicate.tile_eliminate 0.87% : 0.000002s : 13: predicate.transpose_eliminate 1.62% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.97% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.36% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.63% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.13% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.94% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 4: predicate.value_based_eliminate 0.71% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 8: predicate.virtual_output_eliminate 0.23% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000649 11 54.70% : 0.000355s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.30% : 0.000294s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.032034 192 0.02% : 0.000007s : 1: ForceFp32Comm 12.47% : 0.003996s : 1: add_attr 12.40% : 0.003974s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.19% : 0.000061s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.25% : 0.000080s : 1: auto_monad 0.09% : 0.000028s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.52% : 0.000488s : 1: bootstrap 0.13% : 0.000042s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.06% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000016s : 1: convert_after_rewriter 0.10% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000020s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000035s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.69% : 0.000541s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 2.51% : 0.000805s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000023s : 1: opt.transform.mutable_eliminate 4.31% : 0.001381s : 78: opt.transform.opt_a 0.10% : 0.000031s : 1: opt.transform.opt_after_cconv 0.09% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.34% : 0.000110s : 28: opt.transform.opt_b 0.17% : 0.000054s : 2: opt.transform.opt_trans_graph 0.12% : 0.000038s : 4: opt.transform.symbol_engine_opt 11.51% : 0.003686s : 1: opt_a 0.43% : 0.000137s : 1: opt_after_cconv 1.77% : 0.000567s : 1: opt_after_jit_grad 1.07% : 0.000344s : 1: opt_b 21.43% : 0.006866s : 1: optimize 0.09% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000012s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.17% : 0.000055s : 1: pre_auto_parallel 0.14% : 0.000044s : 1: py_interpret_to_execute 0.07% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000022s : 1: remove_dup_value 1.54% : 0.000494s : 1: renormalize.infer 1.25% : 0.000399s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000054s : 1: rewriter_after_opt_a 0.35% : 0.000112s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000008s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.03% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000104s : 1: symbol_engine_optimizer 0.31% : 0.000099s : 1: tuple_transform 21.20% : 0.006790s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:49.558.591 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.166905, [21] [bootstrap]: 0.00050566 [type_inference]: 0.156105 [event_method]: 2.388e-05 [auto_monad]: 6.73e-05 [graph_reusing]: 6.39999e-06 [inline]: 3.21999e-06 [add_attr]: 0.00376188, [1] [add_attr_with_inline]: 0.00374968, [1] [Cycle 1]: 7.415e-05, [2] [tag_attr]: 2.342e-05 [meta_addattr_fg_expand]: 5.98002e-06 [parallel-infer-symbol]: 3.68999e-06 [pre_auto_parallel]: 4.207e-05 [insert-virtual-dataset]: 2.38002e-06 [parallel-infer-symbol-second]: 6.80011e-07 [dataset_repeat_opt]: 1.84e-06 [pipeline_split]: 1.99999e-06 [optimize]: 0.00559557, [53] [py_interpret_to_execute]: 2.943e-05 [rewriter_before_opt_a]: 9.274e-05 [opt_a]: 0.00318472, [2] [Cycle 1]: 0.00226371, [45] [expand_dump_flag]: 3.12002e-06 [switch_simplify]: 4.519e-05 [loop_unroll]: 3.057e-05 [a_1]: 0.0006791 [with_stream_mark]: 1.993e-05 [recompute_prepare]: 9.60001e-06 [updatestate_depend_eliminate]: 3.90998e-06 [updatestate_assign_eliminate]: 3.31999e-06 [updatestate_loads_eliminate]: 2.94999e-06 [parameter_eliminate]: 1.97001e-06 [a_2]: 9.004e-05 [accelerated_algorithm]: 7.66001e-06 [shard]: 1.82001e-06 [meta_shard_fg_expand]: 1.96e-06 [shard_inline]: 7.15003e-06 [merge_send_recv]: 8.75999e-06 [auto_parallel]: 7.13998e-06 [parallel]: 1.989e-05 [flash_sp]: 9.79e-06 [merge_comm]: 4.37e-06 [allreduce_fusion]: 3.36001e-06 [matmul_add_comm_reduction]: 9.92001e-06 [allreduce_slice_to_reducescatter]: 8.10018e-07 [virtual_shard_identity]: 9.00001e-06 [virtual_dataset]: 7.42998e-06 [get_grad_eliminate_]: 6.16998e-06 [virtual_output]: 6.59999e-06 [merge_forward]: 4.11001e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 1.071e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.26e-05 [merge_recompute_call_nodes]: 1.87001e-06 [before_grad]: 1.062e-05 [set_forward_comm_id_for_comm_node_pass]: 3.86001e-06 [meta_fg_expand]: 2.86e-06 [flash_sp_send_recv_attached]: 3.12002e-06 [receive_attached]: 3.04999e-06 [after_resolve]: 1.229e-05 [a_after_grad]: 1.092e-05 [renormalize]: 0.00081865 [add_forward_monad_depend]: 7.31999e-06 [auto_monad_grad]: 2.56998e-06 [auto_monad_eliminator]: 1.793e-05 [cse]: 3.052e-05 [a_3]: 5.256e-05 [Cycle 2]: 0.0009083, [45] [expand_dump_flag]: 1.53002e-06 [switch_simplify]: 8.99e-06 [loop_unroll]: 6.55002e-06 [a_1]: 0.00014924 [with_stream_mark]: 1.343e-05 [recompute_prepare]: 6.84999e-06 [updatestate_depend_eliminate]: 3.19001e-06 [updatestate_assign_eliminate]: 2.39999e-06 [updatestate_loads_eliminate]: 2.43e-06 [parameter_eliminate]: 1.23002e-06 [a_2]: 7.731e-05 [accelerated_algorithm]: 6.56e-06 [shard]: 1.33002e-06 [meta_shard_fg_expand]: 1.43002e-06 [shard_inline]: 6.26e-06 [merge_send_recv]: 5.67999e-06 [auto_parallel]: 6.12999e-06 [parallel]: 5.74999e-06 [flash_sp]: 3.83001e-06 [merge_comm]: 3.35e-06 [allreduce_fusion]: 3.28e-06 [matmul_add_comm_reduction]: 6.42001e-06 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 2.522e-05 [virtual_dataset]: 6.38e-06 [get_grad_eliminate_]: 5.82001e-06 [virtual_output]: 6.66e-06 [merge_forward]: 4.15e-06 [cell_reuse_recompute_pass]: 2.27001e-06 [offload_activation]: 7.77998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.335e-05 [merge_recompute_call_nodes]: 1.27999e-06 [before_grad]: 1.047e-05 [set_forward_comm_id_for_comm_node_pass]: 4.38999e-06 [meta_fg_expand]: 2.31998e-06 [flash_sp_send_recv_attached]: 1.09e-06 [receive_attached]: 0.0001515 [after_resolve]: 1.894e-05 [a_after_grad]: 1.186e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 2.96001e-06 [auto_monad_grad]: 1.82001e-06 [auto_monad_eliminator]: 1.183e-05 [cse]: 1.997e-05 [a_3]: 4.023e-05 [py_interpret_to_execute_after_opt_a]: 1.345e-05 [slice_cell_reuse_recomputed_activation]: 2.43e-06 [rewriter_after_opt_a]: 4.261e-05 [convert_after_rewriter]: 7.82e-06 [order_py_execute_after_rewriter]: 6.11e-06 [mutable_eliminate]: 0.0007128 [opt_b]: 0.00021994, [1] [Cycle 1]: 0.00021268, [7] [b_1]: 0.00013173 [b_2]: 9.14e-06 [updatestate_depend_eliminate]: 6.43e-06 [updatestate_assign_eliminate]: 2.84999e-06 [updatestate_loads_eliminate]: 2.46e-06 [renormalize]: 7.80012e-07 [cse]: 2.084e-05 [optimize_parallel_all_gather_comm]: 1.864e-05 [overlap_param_gather]: 2.10002e-06 [cconv]: 3.248e-05 [loop_unroll]: 0.00051441 [opt_after_cconv]: 0.00011246, [1] [Cycle 1]: 0.00010486, [7] [c_1]: 3.249e-05 [parameter_eliminate]: 4.12e-06 [updatestate_depend_eliminate]: 6.55002e-06 [updatestate_assign_eliminate]: 2.58e-06 [updatestate_loads_eliminate]: 2.84999e-06 [cse]: 2.057e-05 [renormalize]: 6.79982e-07 [remove_dup_value]: 1.451e-05 [tuple_transform]: 8.498e-05, [1] [Cycle 1]: 8.014e-05, [4] [d_1]: 5.049e-05 [none_parameter_eliminate]: 2.16e-06 [renormalize]: 2.89991e-07 [switch_simplify]: 7.56001e-06 [partial_unused_args_eliminate]: 2.05002e-06 [add_recomputation]: 5.399e-05 [cse_after_recomputation]: 2.35e-05, [1] [Cycle 1]: 1.867e-05, [1] [cse]: 1.202e-05 [environ_conv]: 5.98002e-06 [swap_dp_allreduce_reducescatter]: 5.22e-06 [bias_add_comm_swap]: 3.04999e-06 [label_micro_interleaved_index]: 4.70999e-06 [label_fine_grained_interleaved_index]: 2.89001e-06 [merge_cast_opt]: 1.57999e-06 [slice_recompute_activation]: 2.33998e-06 [micro_interleaved_order_control]: 2.81e-06 [assign_add_opt]: 1.16997e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 1.21002e-06 [full_micro_interleaved_order_control]: 2.14e-06 [reorder_send_recv_between_fp_bp]: 3.01001e-06 [comm_op_add_attrs]: 9.5999e-07 [add_comm_op_reuse_tag]: 1.01002e-06 [interleave_split_concat_branches]: 1.25999e-06 [interleave_parallel_branches]: 1.10999e-06 [overlap_opt_shard_in_pipeline]: 1.28002e-06 [overlap_opt_shard_grad_in_pipeline]: 1.82001e-06 [control_data_broadcast_order]: 1.355e-05 [grouped_pairwise_exchange_alltoall]: 1.59998e-06 [offloading_packed_experts]: 4.55001e-06 [overlap_recompute_and_grad_model_parallel]: 4.85001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34998e-06 [overlap_recompute_comm]: 2.39001e-06 [overlap_grad_ring_attention]: 4.84e-06 [overlap_grad_flash_sp]: 2.156e-05 [begin_end_overlap_inline]: 4.60015e-07 [split_matmul_comm_elemetwise]: 2.48e-06 [split_layernorm_comm]: 1.64e-06 [handle_group_info]: 1.82001e-06 [symbol_engine_optimizer]: 8.403e-05, [1] [Cycle 1]: 7.956e-05, [6] [build]: 3.83999e-06 [elim_shapecalc]: 1.249e-05 [elim_not_effective]: 1.433e-05 [opt_reshape]: 7.23e-06 [fold_const_symbol]: 1.059e-05 [renormalize]: 3.60014e-07 [detach_backward]: 2.26e-06 [pipeline_parallel_scheduler]: 1.62999e-06 [auto_monad_reorder]: 1.718e-05 [get_jit_bprop_graph]: 2.04999e-06 [rewriter_after_jit_bprop_graph]: 5.20999e-06 [opt_after_jit_grad]: 0.00053839 [validate]: 4.537e-05 Sums bootstrap : 0.000506s : 0.31% type_inference : 0.156105s : 96.30% event_method : 0.000024s : 0.01% auto_monad : 0.000067s : 0.04% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000042s : 0.03% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000029s : 0.02% optimize.rewriter_before_opt_a : 0.000093s : 0.06% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000054s : 0.03% optimize.opt_a.loop_unroll : 0.000037s : 0.02% optimize.opt_a.a_1 : 0.000828s : 0.51% optimize.opt_a.with_stream_mark : 0.000033s : 0.02% optimize.opt_a.recompute_prepare : 0.000016s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000167s : 0.10% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.01% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000014s : 0.01% optimize.opt_a.auto_parallel : 0.000013s : 0.01% optimize.opt_a.parallel : 0.000026s : 0.02% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000034s : 0.02% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000018s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000155s : 0.10% optimize.opt_a.after_resolve : 0.000031s : 0.02% optimize.opt_a.a_after_grad : 0.000023s : 0.01% optimize.opt_a.renormalize : 0.000819s : 0.51% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.02% optimize.opt_a.cse : 0.000050s : 0.03% optimize.opt_a.a_3 : 0.000093s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000043s : 0.03% optimize.convert_after_rewriter : 0.000008s : 0.00% optimize.order_py_execute_after_rewriter : 0.000006s : 0.00% optimize.mutable_eliminate : 0.000713s : 0.44% optimize.opt_b.b_1 : 0.000132s : 0.08% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000032s : 0.02% optimize.loop_unroll : 0.000514s : 0.32% optimize.opt_after_cconv.c_1 : 0.000032s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000021s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.01% optimize.tuple_transform.d_1 : 0.000050s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000054s : 0.03% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000006s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000022s : 0.01% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000002s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000017s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000538s : 0.33% validate : 0.000045s : 0.03% Time group info: ------[substitution.] 0.000242 34 15.21% : 0.000037s : 6: substitution.arithmetic_simplify 0.79% : 0.000002s : 2: substitution.elim_not_effective 0.59% : 0.000001s : 2: substitution.fold_const_symbol 2.65% : 0.000006s : 4: substitution.graph_param_transform 67.37% : 0.000163s : 4: substitution.inline 1.75% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.27% : 0.000005s : 4: substitution.remove_not_recompute_node 2.55% : 0.000006s : 4: substitution.replace_old_param 6.82% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.156019 2 99.36% : 0.155018s : 1: type_inference.infer 0.64% : 0.001002s : 1: type_inference.specialize ------[replace.] 0.000065 8 64.00% : 0.000042s : 4: replace.inline 36.00% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000175 8 91.67% : 0.000161s : 4: match.inline 8.33% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000216 1278 1.01% : 0.000002s : 13: predicate.accumulaten_eliminater 0.91% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 8: predicate.addn_check_dump 0.89% : 0.000002s : 13: predicate.addn_zero_filter 0.78% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.46% : 0.000005s : 21: predicate.arithmetic_simplify 0.84% : 0.000002s : 13: predicate.cast_eliminate 0.78% : 0.000002s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.88% : 0.000002s : 8: predicate.depend_value_elim 0.91% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.03% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.32% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 4: predicate.elim_not_effective 0.39% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.04% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.13% : 0.000002s : 17: predicate.environ_get_depend_swap 1.66% : 0.000004s : 25: predicate.environ_get_eliminate 1.13% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.44% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.47% : 0.000005s : 21: predicate.float_depend_g_call 0.49% : 0.000001s : 8: predicate.float_environ_get_switch 0.74% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.62% : 0.000001s : 8: predicate.get_grad_eliminate 0.30% : 0.000001s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 5.95% : 0.000013s : 58: predicate.inline 1.03% : 0.000002s : 8: predicate.inline_without_move 0.33% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.89% : 0.000002s : 8: predicate.less_batch_normalization 1.67% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.51% : 0.000005s : 38: predicate.load_eliminater 1.13% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.28% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.52% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.49% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 13: predicate.minmaximum_grad 1.03% : 0.000002s : 4: predicate.mutable_eliminate 0.34% : 0.000001s : 4: predicate.opt_reshape 0.47% : 0.000001s : 4: predicate.parallel_virtual_node 1.82% : 0.000004s : 21: predicate.partial_defer_inline 1.59% : 0.000003s : 21: predicate.partial_eliminate 0.81% : 0.000002s : 13: predicate.print_const_string_wrapper 0.56% : 0.000001s : 8: predicate.reduce_all_const_elim 1.16% : 0.000003s : 13: predicate.reduce_eliminate 2.51% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.61% : 0.000001s : 8: predicate.remove_not_recompute_node 1.35% : 0.000003s : 25: predicate.replace_applicator 0.63% : 0.000001s : 8: predicate.replace_old_param 0.26% : 0.000001s : 4: predicate.reset_defer_inline 1.05% : 0.000002s : 13: predicate.reshape_eliminate 0.60% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 4: predicate.row_tensor_eliminate 0.91% : 0.000002s : 8: predicate.same_eliminate 0.40% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 8: predicate.shard_identity_eliminate 0.66% : 0.000001s : 8: predicate.special_op_eliminate 0.65% : 0.000001s : 8: predicate.specialize_transform 1.11% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.97% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.43% : 0.000003s : 21: predicate.switch_defer_inline 1.98% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.38% : 0.000012s : 67: predicate.switch_simplify 0.89% : 0.000002s : 13: predicate.tile_eliminate 0.86% : 0.000002s : 13: predicate.transpose_eliminate 1.47% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.46% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.35% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.38% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.60% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.61% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.23% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.98% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.74% : 0.000002s : 8: predicate.virtual_output_eliminate 0.30% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000719 11 48.34% : 0.000347s : 5: func_graph_cloner_run.FuncGraphClonerGraph 51.66% : 0.000371s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.178531 192 0.00% : 0.000004s : 1: ForceFp32Comm 2.11% : 0.003767s : 1: add_attr 2.10% : 0.003755s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000058s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000072s : 1: auto_monad 0.01% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.30% : 0.000538s : 1: bootstrap 0.02% : 0.000036s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000017s : 1: control_data_broadcast_order 0.01% : 0.000012s : 1: convert_after_rewriter 0.01% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.02% : 0.000031s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.29% : 0.000524s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.40% : 0.000722s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000017s : 1: opt.transform.mutable_eliminate 0.73% : 0.001299s : 78: opt.transform.opt_a 0.02% : 0.000031s : 1: opt.transform.opt_after_cconv 0.02% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000107s : 28: opt.transform.opt_b 0.03% : 0.000056s : 2: opt.transform.opt_trans_graph 0.02% : 0.000041s : 4: opt.transform.symbol_engine_opt 1.79% : 0.003189s : 1: opt_a 0.07% : 0.000116s : 1: opt_after_cconv 0.31% : 0.000549s : 1: opt_after_jit_grad 0.13% : 0.000224s : 1: opt_b 3.14% : 0.005601s : 1: optimize 0.01% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.01% : 0.000025s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000046s : 1: pre_auto_parallel 0.02% : 0.000034s : 1: py_interpret_to_execute 0.01% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 0.24% : 0.000434s : 1: renormalize.infer 0.21% : 0.000376s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000047s : 1: rewriter_after_opt_a 0.05% : 0.000097s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000006s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000087s : 1: symbol_engine_optimizer 0.05% : 0.000088s : 1: tuple_transform 87.45% : 0.156134s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:51.747.194 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:51.747.460 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.20686, [21] [bootstrap]: 0.00046761 [type_inference]: 0.194798 [event_method]: 2.181e-05 [auto_monad]: 6.752e-05 [graph_reusing]: 6.05002e-06 [inline]: 2.96001e-06 [add_attr]: 0.0036319, [1] [add_attr_with_inline]: 0.00361984, [1] [Cycle 1]: 9.201e-05, [2] [tag_attr]: 2.484e-05 [meta_addattr_fg_expand]: 5.97999e-06 [parallel-infer-symbol]: 3.5e-06 [pre_auto_parallel]: 4.154e-05 [insert-virtual-dataset]: 2.74999e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 2.00002e-06 [pipeline_split]: 2.11e-06 [optimize]: 0.00639027, [53] [py_interpret_to_execute]: 3.508e-05 [rewriter_before_opt_a]: 9.795e-05 [opt_a]: 0.00364347, [2] [Cycle 1]: 0.00248744, [45] [expand_dump_flag]: 2.99001e-06 [switch_simplify]: 4.429e-05 [loop_unroll]: 3.078e-05 [a_1]: 0.00069046 [with_stream_mark]: 2.048e-05 [recompute_prepare]: 9.76998e-06 [updatestate_depend_eliminate]: 5.19e-06 [updatestate_assign_eliminate]: 3.47997e-06 [updatestate_loads_eliminate]: 2.89999e-06 [parameter_eliminate]: 1.99e-06 [a_2]: 0.00011818 [accelerated_algorithm]: 7.65e-06 [shard]: 2.36998e-06 [meta_shard_fg_expand]: 2.70997e-06 [shard_inline]: 6.59001e-06 [merge_send_recv]: 1.019e-05 [auto_parallel]: 8.27e-06 [parallel]: 2.003e-05 [flash_sp]: 9.73002e-06 [merge_comm]: 4.55999e-06 [allreduce_fusion]: 4.2e-06 [matmul_add_comm_reduction]: 9.92999e-06 [allreduce_slice_to_reducescatter]: 1.19e-06 [virtual_shard_identity]: 1.068e-05 [virtual_dataset]: 6.83e-06 [get_grad_eliminate_]: 6.78e-06 [virtual_output]: 6.58e-06 [merge_forward]: 4.70999e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 1.088e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.528e-05 [merge_recompute_call_nodes]: 1.56002e-06 [before_grad]: 1.144e-05 [set_forward_comm_id_for_comm_node_pass]: 4.08001e-06 [meta_fg_expand]: 3.06001e-06 [flash_sp_send_recv_attached]: 2.69001e-06 [receive_attached]: 2.54001e-06 [after_resolve]: 1.25e-05 [a_after_grad]: 1.02e-05 [renormalize]: 0.00083526 [add_forward_monad_depend]: 6.69001e-06 [auto_monad_grad]: 3.31001e-06 [auto_monad_eliminator]: 1.73e-05 [cse]: 2.998e-05 [a_3]: 6.456e-05 [Cycle 2]: 0.00113863, [45] [expand_dump_flag]: 2.29001e-06 [switch_simplify]: 8.05e-06 [loop_unroll]: 6.44999e-06 [a_1]: 0.00014594 [with_stream_mark]: 6.729e-05 [recompute_prepare]: 9.34e-06 [updatestate_depend_eliminate]: 6.04001e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 2.64999e-06 [parameter_eliminate]: 2.24001e-06 [a_2]: 0.00017169 [accelerated_algorithm]: 8.25e-06 [shard]: 2.09e-06 [meta_shard_fg_expand]: 1.71002e-06 [shard_inline]: 7.18e-06 [merge_send_recv]: 8.12998e-06 [auto_parallel]: 9.45001e-06 [parallel]: 6.96999e-06 [flash_sp]: 4.13001e-06 [merge_comm]: 3.96001e-06 [allreduce_fusion]: 3.88999e-06 [matmul_add_comm_reduction]: 1.255e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 8.38999e-06 [virtual_dataset]: 6.31e-06 [get_grad_eliminate_]: 6.15002e-06 [virtual_output]: 5.94999e-06 [merge_forward]: 4.40999e-06 [cell_reuse_recompute_pass]: 2.48e-06 [offload_activation]: 9.92999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.608e-05 [merge_recompute_call_nodes]: 1.20001e-06 [before_grad]: 1.029e-05 [set_forward_comm_id_for_comm_node_pass]: 4.50999e-06 [meta_fg_expand]: 2.34999e-06 [flash_sp_send_recv_attached]: 1.43002e-06 [receive_attached]: 1.66e-06 [after_resolve]: 1.305e-05 [a_after_grad]: 1.082e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 2.31998e-06 [auto_monad_grad]: 2.78003e-06 [auto_monad_eliminator]: 9.98002e-06 [cse]: 1.991e-05 [a_3]: 5.294e-05 [py_interpret_to_execute_after_opt_a]: 1.82e-05 [slice_cell_reuse_recomputed_activation]: 4.79e-06 [rewriter_after_opt_a]: 4.387e-05 [convert_after_rewriter]: 1.08e-05 [order_py_execute_after_rewriter]: 8.3e-06 [mutable_eliminate]: 0.00077055 [opt_b]: 0.00029376, [1] [Cycle 1]: 0.00028293, [7] [b_1]: 0.0001743 [b_2]: 9.54e-06 [updatestate_depend_eliminate]: 9.56998e-06 [updatestate_assign_eliminate]: 3.16001e-06 [updatestate_loads_eliminate]: 3.2e-06 [renormalize]: 9.5999e-07 [cse]: 2.247e-05 [optimize_parallel_all_gather_comm]: 2.362e-05 [overlap_param_gather]: 5.00001e-06 [cconv]: 3.959e-05 [loop_unroll]: 0.00049744 [opt_after_cconv]: 0.00013371, [1] [Cycle 1]: 0.00012422, [7] [c_1]: 3.305e-05 [parameter_eliminate]: 4.83001e-06 [updatestate_depend_eliminate]: 5.77999e-06 [updatestate_assign_eliminate]: 2.74001e-06 [updatestate_loads_eliminate]: 2.39001e-06 [cse]: 1.891e-05 [renormalize]: 4.50003e-07 [remove_dup_value]: 1.875e-05 [tuple_transform]: 0.00010151, [1] [Cycle 1]: 9.307e-05, [4] [d_1]: 5.18e-05 [none_parameter_eliminate]: 1.97001e-06 [renormalize]: 1.79978e-07 [switch_simplify]: 7.23e-06 [partial_unused_args_eliminate]: 4.65001e-06 [add_recomputation]: 5.524e-05 [cse_after_recomputation]: 3e-05, [1] [Cycle 1]: 2.227e-05, [1] [cse]: 1.179e-05 [environ_conv]: 8.70999e-06 [swap_dp_allreduce_reducescatter]: 7.75998e-06 [bias_add_comm_swap]: 5.56998e-06 [label_micro_interleaved_index]: 7.94002e-06 [label_fine_grained_interleaved_index]: 5.71e-06 [merge_cast_opt]: 4.18999e-06 [slice_recompute_activation]: 4.62e-06 [micro_interleaved_order_control]: 5.22999e-06 [assign_add_opt]: 3.75998e-06 [ForceFp32Comm]: 3.95998e-06 [remove_cast_before_assign_add]: 3.45e-06 [full_micro_interleaved_order_control]: 4.62e-06 [reorder_send_recv_between_fp_bp]: 5.09998e-06 [comm_op_add_attrs]: 3.36001e-06 [add_comm_op_reuse_tag]: 3.85e-06 [interleave_split_concat_branches]: 3.53e-06 [interleave_parallel_branches]: 3.66001e-06 [overlap_opt_shard_in_pipeline]: 3.49001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.31002e-06 [control_data_broadcast_order]: 1.755e-05 [grouped_pairwise_exchange_alltoall]: 3.89002e-06 [offloading_packed_experts]: 7.46001e-06 [overlap_recompute_and_grad_model_parallel]: 7.38e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.91001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.83999e-06 [overlap_recompute_comm]: 5.12999e-06 [overlap_grad_ring_attention]: 6.43e-06 [overlap_grad_flash_sp]: 2.478e-05 [begin_end_overlap_inline]: 2.86999e-06 [split_matmul_comm_elemetwise]: 4.30999e-06 [split_layernorm_comm]: 4.27e-06 [handle_group_info]: 3.46999e-06 [symbol_engine_optimizer]: 0.00014224, [1] [Cycle 1]: 0.00013467, [6] [build]: 4.31002e-06 [elim_shapecalc]: 1.163e-05 [elim_not_effective]: 1.463e-05 [opt_reshape]: 7.92e-06 [fold_const_symbol]: 1.014e-05 [renormalize]: 1.30007e-07 [detach_backward]: 4.63999e-06 [pipeline_parallel_scheduler]: 1.99999e-06 [auto_monad_reorder]: 2.338e-05 [get_jit_bprop_graph]: 1.99e-06 [rewriter_after_jit_bprop_graph]: 7.35998e-06 [opt_after_jit_grad]: 0.00070296 [validate]: 4.922e-05 Sums bootstrap : 0.000468s : 0.23% type_inference : 0.194798s : 96.80% event_method : 0.000022s : 0.01% auto_monad : 0.000068s : 0.03% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000042s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000035s : 0.02% optimize.rewriter_before_opt_a : 0.000098s : 0.05% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000052s : 0.03% optimize.opt_a.loop_unroll : 0.000037s : 0.02% optimize.opt_a.a_1 : 0.000836s : 0.42% optimize.opt_a.with_stream_mark : 0.000088s : 0.04% optimize.opt_a.recompute_prepare : 0.000019s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000290s : 0.14% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.01% optimize.opt_a.merge_send_recv : 0.000018s : 0.01% optimize.opt_a.auto_parallel : 0.000018s : 0.01% optimize.opt_a.parallel : 0.000027s : 0.01% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000009s : 0.00% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000021s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000026s : 0.01% optimize.opt_a.a_after_grad : 0.000021s : 0.01% optimize.opt_a.renormalize : 0.000835s : 0.42% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.00% optimize.opt_a.auto_monad_grad : 0.000006s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.01% optimize.opt_a.cse : 0.000050s : 0.02% optimize.opt_a.a_3 : 0.000117s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000044s : 0.02% optimize.convert_after_rewriter : 0.000011s : 0.01% optimize.order_py_execute_after_rewriter : 0.000008s : 0.00% optimize.mutable_eliminate : 0.000771s : 0.38% optimize.opt_b.b_1 : 0.000174s : 0.09% optimize.opt_b.b_2 : 0.000010s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.01% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000040s : 0.02% optimize.loop_unroll : 0.000497s : 0.25% optimize.opt_after_cconv.c_1 : 0.000033s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.01% optimize.tuple_transform.d_1 : 0.000052s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000055s : 0.03% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000009s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.00% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000003s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.00% optimize.comm_op_add_attrs : 0.000003s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000006s : 0.00% optimize.overlap_grad_flash_sp : 0.000025s : 0.01% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000023s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000703s : 0.35% validate : 0.000049s : 0.02% Time group info: ------[substitution.] 0.000254 34 15.33% : 0.000039s : 6: substitution.arithmetic_simplify 0.70% : 0.000002s : 2: substitution.elim_not_effective 0.70% : 0.000002s : 2: substitution.fold_const_symbol 2.67% : 0.000007s : 4: substitution.graph_param_transform 68.04% : 0.000173s : 4: substitution.inline 1.75% : 0.000004s : 4: substitution.j_node_and_user_rematch 1.86% : 0.000005s : 4: substitution.remove_not_recompute_node 2.59% : 0.000007s : 4: substitution.replace_old_param 6.36% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.194731 2 99.57% : 0.193884s : 1: type_inference.infer 0.43% : 0.000847s : 1: type_inference.specialize ------[replace.] 0.000069 8 64.06% : 0.000044s : 4: replace.inline 35.94% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000184 8 92.26% : 0.000170s : 4: match.inline 7.74% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000224 1278 0.83% : 0.000002s : 13: predicate.accumulaten_eliminater 1.10% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 8: predicate.addn_check_dump 0.94% : 0.000002s : 13: predicate.addn_zero_filter 0.73% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.57% : 0.000006s : 21: predicate.arithmetic_simplify 1.06% : 0.000002s : 13: predicate.cast_eliminate 0.63% : 0.000001s : 8: predicate.check_bprop_eliminate 0.63% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.57% : 0.000001s : 8: predicate.depend_value_elim 0.86% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.97% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.41% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 4: predicate.elim_not_effective 0.47% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.02% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_depend_swap 1.63% : 0.000004s : 25: predicate.environ_get_eliminate 0.97% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.38% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.33% : 0.000005s : 21: predicate.float_depend_g_call 0.53% : 0.000001s : 8: predicate.float_environ_get_switch 0.81% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.72% : 0.000002s : 8: predicate.get_grad_eliminate 0.28% : 0.000001s : 4: predicate.graph_param_transform 0.63% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.17% : 0.000014s : 58: predicate.inline 0.72% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.93% : 0.000002s : 8: predicate.less_batch_normalization 1.69% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.38% : 0.000005s : 38: predicate.load_eliminater 0.92% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.31% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.56% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.73% : 0.000002s : 8: predicate.merge_addn 0.68% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.75% : 0.000002s : 13: predicate.minmaximum_grad 1.44% : 0.000003s : 4: predicate.mutable_eliminate 0.47% : 0.000001s : 4: predicate.opt_reshape 0.41% : 0.000001s : 4: predicate.parallel_virtual_node 1.69% : 0.000004s : 21: predicate.partial_defer_inline 1.53% : 0.000003s : 21: predicate.partial_eliminate 0.81% : 0.000002s : 13: predicate.print_const_string_wrapper 0.64% : 0.000001s : 8: predicate.reduce_all_const_elim 1.35% : 0.000003s : 13: predicate.reduce_eliminate 2.26% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 8: predicate.remove_not_recompute_node 1.47% : 0.000003s : 25: predicate.replace_applicator 0.60% : 0.000001s : 8: predicate.replace_old_param 0.45% : 0.000001s : 4: predicate.reset_defer_inline 0.97% : 0.000002s : 13: predicate.reshape_eliminate 0.59% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.62% : 0.000001s : 4: predicate.row_tensor_eliminate 0.93% : 0.000002s : 8: predicate.same_eliminate 0.50% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.98% : 0.000002s : 8: predicate.shard_identity_eliminate 1.08% : 0.000002s : 8: predicate.special_op_eliminate 0.95% : 0.000002s : 8: predicate.specialize_transform 1.17% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.64% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.36% : 0.000003s : 21: predicate.switch_defer_inline 1.94% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.77% : 0.000011s : 67: predicate.switch_simplify 0.84% : 0.000002s : 13: predicate.tile_eliminate 0.95% : 0.000002s : 13: predicate.transpose_eliminate 1.37% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.42% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.34% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.36% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.35% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.37% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.67% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.27% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.85% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.63% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.58% : 0.000001s : 8: predicate.virtual_output_eliminate 0.23% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.69% : 0.000002s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000662 11 53.54% : 0.000355s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.46% : 0.000308s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.219172 192 0.00% : 0.000007s : 1: ForceFp32Comm 1.66% : 0.003644s : 1: add_attr 1.65% : 0.003624s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.03% : 0.000059s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.03% : 0.000076s : 1: auto_monad 0.01% : 0.000031s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.24% : 0.000516s : 1: bootstrap 0.02% : 0.000043s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000021s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.02% : 0.000033s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000022s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.01% : 0.000032s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.23% : 0.000505s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.36% : 0.000779s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000020s : 1: opt.transform.mutable_eliminate 0.60% : 0.001306s : 78: opt.transform.opt_a 0.01% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.05% : 0.000110s : 28: opt.transform.opt_b 0.03% : 0.000057s : 2: opt.transform.opt_trans_graph 0.02% : 0.000040s : 4: opt.transform.symbol_engine_opt 1.66% : 0.003647s : 1: opt_a 0.06% : 0.000137s : 1: opt_after_cconv 0.33% : 0.000717s : 1: opt_after_jit_grad 0.14% : 0.000298s : 1: opt_b 3.08% : 0.006741s : 1: optimize 0.01% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.01% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.02% : 0.000050s : 1: pre_auto_parallel 0.02% : 0.000039s : 1: py_interpret_to_execute 0.01% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000022s : 1: remove_dup_value 0.20% : 0.000440s : 1: renormalize.infer 0.18% : 0.000386s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000047s : 1: rewriter_after_opt_a 0.05% : 0.000102s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.00% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000145s : 1: symbol_engine_optimizer 0.05% : 0.000104s : 1: tuple_transform 88.90% : 0.194850s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:54.823.66 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.168144, [21] [bootstrap]: 0.00046927 [type_inference]: 0.144574 [event_method]: 2.138e-05 [auto_monad]: 6.816e-05 [graph_reusing]: 6.28e-06 [inline]: 2.93e-06 [add_attr]: 0.00398501, [1] [add_attr_with_inline]: 0.00397324, [1] [Cycle 1]: 7.836e-05, [2] [tag_attr]: 2.491e-05 [meta_addattr_fg_expand]: 5.85002e-06 [parallel-infer-symbol]: 3.71999e-06 [pre_auto_parallel]: 4.193e-05 [insert-virtual-dataset]: 2.69999e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.21998e-06 [pipeline_split]: 1.83002e-06 [optimize]: 0.017731, [53] [py_interpret_to_execute]: 3.133e-05 [rewriter_before_opt_a]: 9.682e-05 [opt_a]: 0.00316121, [2] [Cycle 1]: 0.0024388, [45] [expand_dump_flag]: 4.07003e-06 [switch_simplify]: 4.459e-05 [loop_unroll]: 3.044e-05 [a_1]: 0.00069181 [with_stream_mark]: 1.945e-05 [recompute_prepare]: 1.018e-05 [updatestate_depend_eliminate]: 4.3e-06 [updatestate_assign_eliminate]: 3.88001e-06 [updatestate_loads_eliminate]: 3.02002e-06 [parameter_eliminate]: 2.46998e-06 [a_2]: 9.094e-05 [accelerated_algorithm]: 7.85e-06 [shard]: 2.17001e-06 [meta_shard_fg_expand]: 2.04999e-06 [shard_inline]: 6.69001e-06 [merge_send_recv]: 8.76002e-06 [auto_parallel]: 7.44002e-06 [parallel]: 2.17e-05 [flash_sp]: 9.51003e-06 [merge_comm]: 4.65999e-06 [allreduce_fusion]: 3.54002e-06 [matmul_add_comm_reduction]: 1.023e-05 [allreduce_slice_to_reducescatter]: 1.44e-06 [virtual_shard_identity]: 9.22001e-06 [virtual_dataset]: 7.43e-06 [get_grad_eliminate_]: 6.33e-06 [virtual_output]: 7.07002e-06 [merge_forward]: 4.43001e-06 [cell_reuse_recompute_pass]: 1.15001e-06 [offload_activation]: 1.092e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.365e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 4.738e-05 [set_forward_comm_id_for_comm_node_pass]: 4.25999e-06 [meta_fg_expand]: 3.13e-06 [flash_sp_send_recv_attached]: 2.65002e-06 [receive_attached]: 2.49001e-06 [after_resolve]: 1.495e-05 [a_after_grad]: 1.003e-05 [renormalize]: 0.00092816 [add_forward_monad_depend]: 7.06001e-06 [auto_monad_grad]: 2.53e-06 [auto_monad_eliminator]: 1.789e-05 [cse]: 3.191e-05 [a_3]: 5.277e-05 [Cycle 2]: 0.00070995, [45] [expand_dump_flag]: 1.79e-06 [switch_simplify]: 8.95001e-06 [loop_unroll]: 6.72002e-06 [a_1]: 0.00014773 [with_stream_mark]: 1.454e-05 [recompute_prepare]: 7e-06 [updatestate_depend_eliminate]: 3.02002e-06 [updatestate_assign_eliminate]: 2.71e-06 [updatestate_loads_eliminate]: 3.57002e-06 [parameter_eliminate]: 1.87001e-06 [a_2]: 7.942e-05 [accelerated_algorithm]: 7.35003e-06 [shard]: 1.47999e-06 [meta_shard_fg_expand]: 1.97999e-06 [shard_inline]: 6.58998e-06 [merge_send_recv]: 6.26e-06 [auto_parallel]: 6.86001e-06 [parallel]: 7.61999e-06 [flash_sp]: 4.42e-06 [merge_comm]: 4.17e-06 [allreduce_fusion]: 3.28998e-06 [matmul_add_comm_reduction]: 6.19001e-06 [allreduce_slice_to_reducescatter]: 1.05001e-06 [virtual_shard_identity]: 7.73999e-06 [virtual_dataset]: 5.99e-06 [get_grad_eliminate_]: 5.69e-06 [virtual_output]: 6.04999e-06 [merge_forward]: 3.63e-06 [cell_reuse_recompute_pass]: 2.51e-06 [offload_activation]: 7.38e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.16e-05 [merge_recompute_call_nodes]: 1.32e-06 [before_grad]: 1.15e-05 [set_forward_comm_id_for_comm_node_pass]: 3.31999e-06 [meta_fg_expand]: 2.43e-06 [flash_sp_send_recv_attached]: 9.50007e-07 [receive_attached]: 1.70001e-06 [after_resolve]: 1.252e-05 [a_after_grad]: 1.011e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.63002e-06 [auto_monad_grad]: 1.45999e-06 [auto_monad_eliminator]: 8.57e-06 [cse]: 1.513e-05 [a_3]: 3.694e-05 [py_interpret_to_execute_after_opt_a]: 1.394e-05 [slice_cell_reuse_recomputed_activation]: 2.31e-06 [rewriter_after_opt_a]: 3.959e-05 [convert_after_rewriter]: 7.03e-06 [order_py_execute_after_rewriter]: 5.50001e-06 [mutable_eliminate]: 0.00077758 [opt_b]: 0.00022512, [1] [Cycle 1]: 0.00021707, [7] [b_1]: 0.00013269 [b_2]: 8.83001e-06 [updatestate_depend_eliminate]: 7.88001e-06 [updatestate_assign_eliminate]: 2.55002e-06 [updatestate_loads_eliminate]: 2.81e-06 [renormalize]: 6.40022e-07 [cse]: 2.452e-05 [optimize_parallel_all_gather_comm]: 1.824e-05 [overlap_param_gather]: 2.39999e-06 [cconv]: 3.335e-05 [loop_unroll]: 0.00048 [opt_after_cconv]: 0.00010864, [1] [Cycle 1]: 0.00010223, [7] [c_1]: 3.142e-05 [parameter_eliminate]: 4.38999e-06 [updatestate_depend_eliminate]: 5.60001e-06 [updatestate_assign_eliminate]: 2.56e-06 [updatestate_loads_eliminate]: 2.85002e-06 [cse]: 1.98e-05 [renormalize]: 5.59987e-07 [remove_dup_value]: 1.473e-05 [tuple_transform]: 8.32e-05, [1] [Cycle 1]: 7.856e-05, [4] [d_1]: 4.892e-05 [none_parameter_eliminate]: 1.87001e-06 [renormalize]: 2.40019e-07 [switch_simplify]: 8.17e-06 [partial_unused_args_eliminate]: 1.89e-06 [add_recomputation]: 5.658e-05 [cse_after_recomputation]: 2.304e-05, [1] [Cycle 1]: 1.835e-05, [1] [cse]: 1.251e-05 [environ_conv]: 5.36998e-06 [swap_dp_allreduce_reducescatter]: 5.62999e-06 [bias_add_comm_swap]: 3.45998e-06 [label_micro_interleaved_index]: 4.18999e-06 [label_fine_grained_interleaved_index]: 3.35998e-06 [merge_cast_opt]: 1.57001e-06 [slice_recompute_activation]: 2.18998e-06 [micro_interleaved_order_control]: 2.41e-06 [assign_add_opt]: 1.15001e-06 [ForceFp32Comm]: 1.16002e-06 [remove_cast_before_assign_add]: 1.05001e-06 [full_micro_interleaved_order_control]: 2.20002e-06 [reorder_send_recv_between_fp_bp]: 2.89001e-06 [comm_op_add_attrs]: 1.48002e-06 [add_comm_op_reuse_tag]: 1.14e-06 [interleave_split_concat_branches]: 1.41998e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.55999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.90001e-06 [control_data_broadcast_order]: 1.389e-05 [grouped_pairwise_exchange_alltoall]: 6.86001e-06 [offloading_packed_experts]: 2.629e-05 [overlap_recompute_and_grad_model_parallel]: 8.2e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.29e-06 [overlap_recompute_allgather_and_fa_grad]: 1.45001e-06 [overlap_recompute_comm]: 2.54999e-06 [overlap_grad_ring_attention]: 4.70001e-06 [overlap_grad_flash_sp]: 3.268e-05 [begin_end_overlap_inline]: 1.07e-06 [split_matmul_comm_elemetwise]: 2.36e-06 [split_layernorm_comm]: 1.87001e-06 [handle_group_info]: 8.89995e-07 [symbol_engine_optimizer]: 0.00013871, [1] [Cycle 1]: 0.00012823, [6] [build]: 7.98999e-06 [elim_shapecalc]: 2.987e-05 [elim_not_effective]: 2.336e-05 [opt_reshape]: 9.66e-06 [fold_const_symbol]: 1.199e-05 [renormalize]: 6.19999e-07 [detach_backward]: 3.22002e-06 [pipeline_parallel_scheduler]: 2.46e-06 [auto_monad_reorder]: 3.088e-05 [get_jit_bprop_graph]: 2.11e-06 [rewriter_after_jit_bprop_graph]: 9.38002e-06 [opt_after_jit_grad]: 0.0008653 [validate]: 5.231e-05 Sums bootstrap : 0.000469s : 0.31% type_inference : 0.144574s : 95.75% event_method : 0.000021s : 0.01% auto_monad : 0.000068s : 0.05% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000042s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000031s : 0.02% optimize.rewriter_before_opt_a : 0.000097s : 0.06% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000054s : 0.04% optimize.opt_a.loop_unroll : 0.000037s : 0.02% optimize.opt_a.a_1 : 0.000840s : 0.56% optimize.opt_a.with_stream_mark : 0.000034s : 0.02% optimize.opt_a.recompute_prepare : 0.000017s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000170s : 0.11% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000015s : 0.01% optimize.opt_a.auto_parallel : 0.000014s : 0.01% optimize.opt_a.parallel : 0.000029s : 0.02% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000009s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000018s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000059s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000027s : 0.02% optimize.opt_a.a_after_grad : 0.000020s : 0.01% optimize.opt_a.renormalize : 0.000928s : 0.61% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.02% optimize.opt_a.cse : 0.000047s : 0.03% optimize.opt_a.a_3 : 0.000090s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000040s : 0.03% optimize.convert_after_rewriter : 0.000007s : 0.00% optimize.order_py_execute_after_rewriter : 0.000006s : 0.00% optimize.mutable_eliminate : 0.000778s : 0.51% optimize.opt_b.b_1 : 0.000133s : 0.09% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000025s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000033s : 0.02% optimize.loop_unroll : 0.000480s : 0.32% optimize.opt_after_cconv.c_1 : 0.000031s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.01% optimize.tuple_transform.d_1 : 0.000049s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000057s : 0.04% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000005s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000007s : 0.00% optimize.offloading_packed_experts : 0.000026s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000033s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000008s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000030s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000023s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000031s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000009s : 0.01% opt_after_jit_grad : 0.000865s : 0.57% validate : 0.000052s : 0.03% Time group info: ------[substitution.] 0.000281 34 12.28% : 0.000035s : 6: substitution.arithmetic_simplify 0.88% : 0.000002s : 2: substitution.elim_not_effective 0.64% : 0.000002s : 2: substitution.fold_const_symbol 2.68% : 0.000008s : 4: substitution.graph_param_transform 59.66% : 0.000168s : 4: substitution.inline 14.08% : 0.000040s : 4: substitution.j_node_and_user_rematch 1.66% : 0.000005s : 4: substitution.remove_not_recompute_node 2.34% : 0.000007s : 4: substitution.replace_old_param 5.77% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.144499 2 99.45% : 0.143704s : 1: type_inference.infer 0.55% : 0.000795s : 1: type_inference.specialize ------[replace.] 0.000070 8 62.68% : 0.000044s : 4: replace.inline 37.32% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000179 8 92.07% : 0.000165s : 4: match.inline 7.93% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000220 1278 0.84% : 0.000002s : 13: predicate.accumulaten_eliminater 1.03% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 8: predicate.addn_check_dump 1.00% : 0.000002s : 13: predicate.addn_zero_filter 0.73% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.82% : 0.000006s : 21: predicate.arithmetic_simplify 0.98% : 0.000002s : 13: predicate.cast_eliminate 0.64% : 0.000001s : 8: predicate.check_bprop_eliminate 0.61% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.62% : 0.000001s : 8: predicate.depend_value_elim 0.81% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.90% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.18% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.52% : 0.000001s : 4: predicate.elim_not_effective 0.61% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_depend_swap 1.71% : 0.000004s : 25: predicate.environ_get_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.48% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.41% : 0.000005s : 21: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.75% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.64% : 0.000001s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.56% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.58% : 0.000014s : 58: predicate.inline 0.74% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.76% : 0.000002s : 8: predicate.less_batch_normalization 1.84% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.42% : 0.000005s : 38: predicate.load_eliminater 1.10% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.29% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.50% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.59% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 13: predicate.minmaximum_grad 1.29% : 0.000003s : 4: predicate.mutable_eliminate 0.65% : 0.000001s : 4: predicate.opt_reshape 0.39% : 0.000001s : 4: predicate.parallel_virtual_node 1.75% : 0.000004s : 21: predicate.partial_defer_inline 1.59% : 0.000003s : 21: predicate.partial_eliminate 0.80% : 0.000002s : 13: predicate.print_const_string_wrapper 0.63% : 0.000001s : 8: predicate.reduce_all_const_elim 1.26% : 0.000003s : 13: predicate.reduce_eliminate 2.48% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 8: predicate.remove_not_recompute_node 1.28% : 0.000003s : 25: predicate.replace_applicator 0.49% : 0.000001s : 8: predicate.replace_old_param 0.45% : 0.000001s : 4: predicate.reset_defer_inline 0.85% : 0.000002s : 13: predicate.reshape_eliminate 0.65% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 4: predicate.row_tensor_eliminate 0.75% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 8: predicate.shard_identity_eliminate 0.92% : 0.000002s : 8: predicate.special_op_eliminate 0.67% : 0.000001s : 8: predicate.specialize_transform 1.05% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.45% : 0.000003s : 21: predicate.switch_defer_inline 1.95% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.17% : 0.000011s : 67: predicate.switch_simplify 1.06% : 0.000002s : 13: predicate.tile_eliminate 0.90% : 0.000002s : 13: predicate.transpose_eliminate 1.46% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.45% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.04% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.52% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.15% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.69% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.32% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.82% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.51% : 0.000001s : 4: predicate.value_based_eliminate 0.65% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.64% : 0.000001s : 8: predicate.virtual_output_eliminate 0.30% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000741 11 60.12% : 0.000446s : 5: func_graph_cloner_run.FuncGraphClonerGraph 39.88% : 0.000296s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.192262 192 0.00% : 0.000004s : 1: ForceFp32Comm 2.08% : 0.003992s : 1: add_attr 2.07% : 0.003978s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.03% : 0.000061s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000074s : 1: auto_monad 0.02% : 0.000035s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.26% : 0.000502s : 1: bootstrap 0.02% : 0.000037s : 1: cconv 0.00% : 0.000005s : 1: comm_op_add_attrs 6.23% : 0.011987s : 1: control_data_broadcast_order 0.01% : 0.000010s : 1: convert_after_rewriter 0.01% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000009s : 1: detach_backward 0.00% : 0.000009s : 1: environ_conv 0.01% : 0.000029s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.02% : 0.000031s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.25% : 0.000490s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.41% : 0.000788s : 1: mutable_eliminate 0.02% : 0.000030s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000018s : 1: opt.transform.mutable_eliminate 0.69% : 0.001326s : 78: opt.transform.opt_a 0.02% : 0.000030s : 1: opt.transform.opt_after_cconv 0.02% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000106s : 28: opt.transform.opt_b 0.03% : 0.000054s : 2: opt.transform.opt_trans_graph 0.03% : 0.000067s : 4: opt.transform.symbol_engine_opt 1.65% : 0.003165s : 1: opt_a 0.06% : 0.000112s : 1: opt_after_cconv 0.46% : 0.000877s : 1: opt_after_jit_grad 0.12% : 0.000229s : 1: opt_b 9.27% : 0.017822s : 1: optimize 0.01% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.02% : 0.000036s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000013s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000046s : 1: pre_auto_parallel 0.02% : 0.000035s : 1: py_interpret_to_execute 0.01% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 0.26% : 0.000492s : 1: renormalize.infer 0.22% : 0.000425s : 1: renormalize.specialize 0.01% : 0.000022s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000043s : 1: rewriter_after_opt_a 0.05% : 0.000102s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000006s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000142s : 1: symbol_engine_optimizer 0.04% : 0.000086s : 1: tuple_transform 75.21% : 0.144599s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:56.203.495 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:56.203.778 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.180346, [21] [bootstrap]: 0.00043889 [type_inference]: 0.00634516 [event_method]: 1.981e-05 [auto_monad]: 6.71e-05 [graph_reusing]: 6.19001e-06 [inline]: 2.67001e-06 [add_attr]: 0.00364351, [1] [add_attr_with_inline]: 0.00362946, [1] [Cycle 1]: 9.665e-05, [2] [tag_attr]: 2.422e-05 [meta_addattr_fg_expand]: 6.51e-06 [parallel-infer-symbol]: 4.17e-06 [pre_auto_parallel]: 4.228e-05 [insert-virtual-dataset]: 2.71e-06 [parallel-infer-symbol-second]: 8.10018e-07 [dataset_repeat_opt]: 2.87002e-06 [pipeline_split]: 1.61998e-06 [optimize]: 0.168488, [53] [py_interpret_to_execute]: 3.554e-05 [rewriter_before_opt_a]: 9.944e-05 [opt_a]: 0.165746, [2] [Cycle 1]: 0.164802, [45] [expand_dump_flag]: 3.11001e-06 [switch_simplify]: 0.162089 [loop_unroll]: 4.389e-05 [a_1]: 0.00075893 [with_stream_mark]: 3.198e-05 [recompute_prepare]: 1.096e-05 [updatestate_depend_eliminate]: 5.07999e-06 [updatestate_assign_eliminate]: 3.76001e-06 [updatestate_loads_eliminate]: 3.66999e-06 [parameter_eliminate]: 3.60003e-06 [a_2]: 0.00012578 [accelerated_algorithm]: 9.38002e-06 [shard]: 2.43e-06 [meta_shard_fg_expand]: 2.88998e-06 [shard_inline]: 6.74001e-06 [merge_send_recv]: 1.025e-05 [auto_parallel]: 1.122e-05 [parallel]: 2.212e-05 [flash_sp]: 1.106e-05 [merge_comm]: 4.69998e-06 [allreduce_fusion]: 3.91001e-06 [matmul_add_comm_reduction]: 1.193e-05 [allreduce_slice_to_reducescatter]: 9.40025e-07 [virtual_shard_identity]: 1.127e-05 [virtual_dataset]: 7.3e-06 [get_grad_eliminate_]: 6.89999e-06 [virtual_output]: 7.4e-06 [merge_forward]: 5.21002e-06 [cell_reuse_recompute_pass]: 2.29001e-06 [offload_activation]: 1.244e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.709e-05 [merge_recompute_call_nodes]: 1.69e-06 [before_grad]: 1.342e-05 [set_forward_comm_id_for_comm_node_pass]: 3.76999e-06 [meta_fg_expand]: 3.35003e-06 [flash_sp_send_recv_attached]: 2.86e-06 [receive_attached]: 2.46998e-06 [after_resolve]: 1.532e-05 [a_after_grad]: 1.203e-05 [renormalize]: 0.00087035 [add_forward_monad_depend]: 6.69001e-06 [auto_monad_grad]: 2.85002e-06 [auto_monad_eliminator]: 1.736e-05 [cse]: 3.166e-05 [a_3]: 6.794e-05 [Cycle 2]: 0.00092616, [45] [expand_dump_flag]: 2.62001e-06 [switch_simplify]: 9.04e-06 [loop_unroll]: 6.96999e-06 [a_1]: 0.00015873 [with_stream_mark]: 1.741e-05 [recompute_prepare]: 7.03998e-06 [updatestate_depend_eliminate]: 3.3e-06 [updatestate_assign_eliminate]: 3.15002e-06 [updatestate_loads_eliminate]: 2.56e-06 [parameter_eliminate]: 2.27001e-06 [a_2]: 0.00010933 [accelerated_algorithm]: 6.72002e-06 [shard]: 1.93002e-06 [meta_shard_fg_expand]: 1.87001e-06 [shard_inline]: 7.18998e-06 [merge_send_recv]: 7.68001e-06 [auto_parallel]: 7.18998e-06 [parallel]: 7.08998e-06 [flash_sp]: 3.85e-06 [merge_comm]: 3.66999e-06 [allreduce_fusion]: 3.6e-06 [matmul_add_comm_reduction]: 7.64002e-06 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 9.20001e-06 [virtual_dataset]: 6.17001e-06 [get_grad_eliminate_]: 5.77001e-06 [virtual_output]: 6.48998e-06 [merge_forward]: 3.44001e-06 [cell_reuse_recompute_pass]: 2.29999e-06 [offload_activation]: 1.022e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.424e-05 [merge_recompute_call_nodes]: 1.19e-06 [before_grad]: 9.73002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.90998e-06 [meta_fg_expand]: 2.26e-06 [flash_sp_send_recv_attached]: 1.59998e-06 [receive_attached]: 2.25002e-06 [after_resolve]: 1.238e-05 [a_after_grad]: 1.066e-05 [renormalize]: 1.39989e-07 [add_forward_monad_depend]: 2.29001e-06 [auto_monad_grad]: 1.72999e-06 [auto_monad_eliminator]: 8.72e-06 [cse]: 1.835e-05 [a_3]: 5.566e-05 [py_interpret_to_execute_after_opt_a]: 1.783e-05 [slice_cell_reuse_recomputed_activation]: 5.72001e-06 [rewriter_after_opt_a]: 4.45e-05 [convert_after_rewriter]: 1.062e-05 [order_py_execute_after_rewriter]: 8.38999e-06 [mutable_eliminate]: 0.00080495 [opt_b]: 0.00029751, [1] [Cycle 1]: 0.00028534, [7] [b_1]: 0.00017478 [b_2]: 8.99e-06 [updatestate_depend_eliminate]: 9.94999e-06 [updatestate_assign_eliminate]: 2.68998e-06 [updatestate_loads_eliminate]: 3.18e-06 [renormalize]: 1.10999e-06 [cse]: 2.516e-05 [optimize_parallel_all_gather_comm]: 2.227e-05 [overlap_param_gather]: 5.92999e-06 [cconv]: 3.866e-05 [loop_unroll]: 0.00048122 [opt_after_cconv]: 0.00013609, [1] [Cycle 1]: 0.0001267, [7] [c_1]: 3.179e-05 [parameter_eliminate]: 5.28002e-06 [updatestate_depend_eliminate]: 6.81001e-06 [updatestate_assign_eliminate]: 2.70997e-06 [updatestate_loads_eliminate]: 2.46e-06 [cse]: 2.087e-05 [renormalize]: 6.89994e-07 [remove_dup_value]: 2.012e-05 [tuple_transform]: 9.81e-05, [1] [Cycle 1]: 9.029e-05, [4] [d_1]: 4.832e-05 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 4.50003e-07 [switch_simplify]: 7.16999e-06 [partial_unused_args_eliminate]: 4.61002e-06 [add_recomputation]: 6.066e-05 [cse_after_recomputation]: 3.106e-05, [1] [Cycle 1]: 2.351e-05, [1] [cse]: 1.285e-05 [environ_conv]: 8.69998e-06 [swap_dp_allreduce_reducescatter]: 8.57e-06 [bias_add_comm_swap]: 5.29e-06 [label_micro_interleaved_index]: 8e-06 [label_fine_grained_interleaved_index]: 5.44e-06 [merge_cast_opt]: 3.83999e-06 [slice_recompute_activation]: 4.72998e-06 [micro_interleaved_order_control]: 4.97e-06 [assign_add_opt]: 3.89002e-06 [ForceFp32Comm]: 3.41999e-06 [remove_cast_before_assign_add]: 3.43e-06 [full_micro_interleaved_order_control]: 4.87e-06 [reorder_send_recv_between_fp_bp]: 5.72001e-06 [comm_op_add_attrs]: 4.10998e-06 [add_comm_op_reuse_tag]: 3.53e-06 [interleave_split_concat_branches]: 3.53e-06 [interleave_parallel_branches]: 3.40998e-06 [overlap_opt_shard_in_pipeline]: 3.75998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.72998e-06 [control_data_broadcast_order]: 1.709e-05 [grouped_pairwise_exchange_alltoall]: 3.91001e-06 [offloading_packed_experts]: 7.58999e-06 [overlap_recompute_and_grad_model_parallel]: 7.30998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.61999e-06 [overlap_recompute_allgather_and_fa_grad]: 4.18001e-06 [overlap_recompute_comm]: 4.94003e-06 [overlap_grad_ring_attention]: 7.5e-06 [overlap_grad_flash_sp]: 2.501e-05 [begin_end_overlap_inline]: 2.98e-06 [split_matmul_comm_elemetwise]: 4.89e-06 [split_layernorm_comm]: 4.07998e-06 [handle_group_info]: 3.46999e-06 [symbol_engine_optimizer]: 0.0001075, [1] [Cycle 1]: 0.00010011, [6] [build]: 4.33999e-06 [elim_shapecalc]: 1.175e-05 [elim_not_effective]: 1.521e-05 [opt_reshape]: 7.97e-06 [fold_const_symbol]: 1.103e-05 [renormalize]: 2.89991e-07 [detach_backward]: 3.75998e-06 [pipeline_parallel_scheduler]: 1.76998e-06 [auto_monad_reorder]: 2.046e-05 [get_jit_bprop_graph]: 2.05002e-06 [rewriter_after_jit_bprop_graph]: 6.54999e-06 [opt_after_jit_grad]: 0.00058017 [validate]: 4.443e-05 Sums bootstrap : 0.000439s : 0.25% type_inference : 0.006345s : 3.63% event_method : 0.000020s : 0.01% auto_monad : 0.000067s : 0.04% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000042s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000036s : 0.02% optimize.rewriter_before_opt_a : 0.000099s : 0.06% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.162098s : 92.75% optimize.opt_a.loop_unroll : 0.000051s : 0.03% optimize.opt_a.a_1 : 0.000918s : 0.53% optimize.opt_a.with_stream_mark : 0.000049s : 0.03% optimize.opt_a.recompute_prepare : 0.000018s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000006s : 0.00% optimize.opt_a.a_2 : 0.000235s : 0.13% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.01% optimize.opt_a.merge_send_recv : 0.000018s : 0.01% optimize.opt_a.auto_parallel : 0.000018s : 0.01% optimize.opt_a.parallel : 0.000029s : 0.02% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000014s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000028s : 0.02% optimize.opt_a.a_after_grad : 0.000023s : 0.01% optimize.opt_a.renormalize : 0.000870s : 0.50% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.01% optimize.opt_a.cse : 0.000050s : 0.03% optimize.opt_a.a_3 : 0.000124s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.00% optimize.rewriter_after_opt_a : 0.000044s : 0.03% optimize.convert_after_rewriter : 0.000011s : 0.01% optimize.order_py_execute_after_rewriter : 0.000008s : 0.00% optimize.mutable_eliminate : 0.000805s : 0.46% optimize.opt_b.b_1 : 0.000175s : 0.10% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000025s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.01% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.000039s : 0.02% optimize.loop_unroll : 0.000481s : 0.28% optimize.opt_after_cconv.c_1 : 0.000032s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000021s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.01% optimize.tuple_transform.d_1 : 0.000048s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000061s : 0.03% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000009s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.00% optimize.bias_add_comm_swap : 0.000005s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000003s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000003s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000025s : 0.01% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000580s : 0.33% validate : 0.000044s : 0.03% Time group info: ------[substitution.] 0.000288 34 15.52% : 0.000045s : 6: substitution.arithmetic_simplify 0.72% : 0.000002s : 2: substitution.elim_not_effective 0.46% : 0.000001s : 2: substitution.fold_const_symbol 2.47% : 0.000007s : 4: substitution.graph_param_transform 68.89% : 0.000198s : 4: substitution.inline 1.52% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.17% : 0.000006s : 4: substitution.remove_not_recompute_node 2.10% : 0.000006s : 4: substitution.replace_old_param 6.16% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006283 2 87.77% : 0.005514s : 1: type_inference.infer 12.23% : 0.000768s : 1: type_inference.specialize ------[replace.] 0.000074 8 65.28% : 0.000048s : 4: replace.inline 34.72% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000212 8 92.47% : 0.000196s : 4: match.inline 7.53% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000244 1278 0.84% : 0.000002s : 13: predicate.accumulaten_eliminater 0.89% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 8: predicate.addn_check_dump 0.86% : 0.000002s : 13: predicate.addn_zero_filter 0.74% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.33% : 0.000006s : 21: predicate.arithmetic_simplify 0.84% : 0.000002s : 13: predicate.cast_eliminate 0.52% : 0.000001s : 8: predicate.check_bprop_eliminate 0.63% : 0.000002s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.61% : 0.000001s : 8: predicate.depend_value_elim 0.85% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.09% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.78% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.22% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 4: predicate.elim_not_effective 0.50% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 0.98% : 0.000002s : 17: predicate.environ_add_const_eliminate 0.98% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.98% : 0.000002s : 17: predicate.environ_get_depend_swap 1.58% : 0.000004s : 25: predicate.environ_get_eliminate 1.16% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.30% : 0.000003s : 21: predicate.exchange_switch_depend_value 3.00% : 0.000007s : 21: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.79% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 4: predicate.fold_const_symbol 0.57% : 0.000001s : 8: predicate.get_grad_eliminate 0.17% : 0.000000s : 4: predicate.graph_param_transform 0.52% : 0.000001s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 5.83% : 0.000014s : 58: predicate.inline 0.89% : 0.000002s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.89% : 0.000002s : 8: predicate.less_batch_normalization 1.68% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.26% : 0.000006s : 38: predicate.load_eliminater 0.82% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.59% : 0.000006s : 34: predicate.loop_unroll_before_grad 1.60% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 8: predicate.merge_addn 0.58% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.61% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.73% : 0.000002s : 13: predicate.minmaximum_grad 1.36% : 0.000003s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.48% : 0.000001s : 4: predicate.parallel_virtual_node 1.82% : 0.000004s : 21: predicate.partial_defer_inline 1.39% : 0.000003s : 21: predicate.partial_eliminate 0.78% : 0.000002s : 13: predicate.print_const_string_wrapper 0.66% : 0.000002s : 8: predicate.reduce_all_const_elim 1.23% : 0.000003s : 13: predicate.reduce_eliminate 2.44% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 8: predicate.remove_not_recompute_node 1.28% : 0.000003s : 25: predicate.replace_applicator 0.52% : 0.000001s : 8: predicate.replace_old_param 0.32% : 0.000001s : 4: predicate.reset_defer_inline 1.01% : 0.000002s : 13: predicate.reshape_eliminate 0.57% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 4: predicate.row_tensor_eliminate 1.02% : 0.000002s : 8: predicate.same_eliminate 0.38% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.73% : 0.000002s : 8: predicate.shard_identity_eliminate 0.69% : 0.000002s : 8: predicate.special_op_eliminate 0.63% : 0.000002s : 8: predicate.specialize_transform 1.05% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.30% : 0.000003s : 21: predicate.switch_defer_inline 1.80% : 0.000004s : 29: predicate.switch_layer_defer_inline 9.17% : 0.000022s : 67: predicate.switch_simplify 0.87% : 0.000002s : 13: predicate.tile_eliminate 0.77% : 0.000002s : 13: predicate.transpose_eliminate 1.37% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 2.94% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.33% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.15% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.60% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.09% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.75% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 4: predicate.value_based_eliminate 0.54% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.62% : 0.000002s : 8: predicate.virtual_output_eliminate 0.23% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000613 11 52.68% : 0.000323s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.32% : 0.000290s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.516940 192 0.00% : 0.000006s : 1: ForceFp32Comm 0.71% : 0.003655s : 1: add_attr 0.70% : 0.003634s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.01% : 0.000065s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.01% : 0.000077s : 1: auto_monad 0.01% : 0.000029s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.09% : 0.000485s : 1: bootstrap 0.01% : 0.000042s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.00% : 0.000020s : 1: control_data_broadcast_order 0.00% : 0.000014s : 1: convert_after_rewriter 0.01% : 0.000034s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.00% : 0.000021s : 1: detach_backward 0.00% : 0.000012s : 1: environ_conv 0.01% : 0.000031s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.00% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.09% : 0.000488s : 1: loop_unroll 0.00% : 0.000006s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.16% : 0.000813s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.00% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000021s : 1: opt.transform.mutable_eliminate 31.62% : 0.163443s : 78: opt.transform.opt_a 0.01% : 0.000030s : 1: opt.transform.opt_after_cconv 0.01% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.02% : 0.000107s : 28: opt.transform.opt_b 0.01% : 0.000053s : 2: opt.transform.opt_trans_graph 0.01% : 0.000042s : 4: opt.transform.symbol_engine_opt 32.06% : 0.165750s : 1: opt_a 0.03% : 0.000140s : 1: opt_after_cconv 0.11% : 0.000591s : 1: opt_after_jit_grad 0.06% : 0.000301s : 1: opt_b 32.66% : 0.168842s : 1: optimize 0.00% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000012s : 1: order_py_execute_after_rewriter 0.01% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000009s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.01% : 0.000051s : 1: pre_auto_parallel 0.01% : 0.000040s : 1: py_interpret_to_execute 0.00% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.00% : 0.000023s : 1: remove_dup_value 0.09% : 0.000470s : 1: renormalize.infer 0.08% : 0.000391s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000048s : 1: rewriter_after_opt_a 0.02% : 0.000104s : 1: rewriter_before_opt_a 0.00% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.00% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000111s : 1: symbol_engine_optimizer 0.02% : 0.000101s : 1: tuple_transform 1.24% : 0.006392s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:41:58.118.466 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.15425, [21] [bootstrap]: 0.00044374 [type_inference]: 0.143014 [event_method]: 2.106e-05 [auto_monad]: 6.627e-05 [graph_reusing]: 6.33e-06 [inline]: 2.41e-06 [add_attr]: 0.0037127, [1] [add_attr_with_inline]: 0.00370106, [1] [Cycle 1]: 7.784e-05, [2] [tag_attr]: 2.563e-05 [meta_addattr_fg_expand]: 5.87999e-06 [parallel-infer-symbol]: 3.5e-06 [pre_auto_parallel]: 4.178e-05 [insert-virtual-dataset]: 3.08e-06 [parallel-infer-symbol-second]: 8.50006e-07 [dataset_repeat_opt]: 1.92999e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00610514, [53] [py_interpret_to_execute]: 3.183e-05 [rewriter_before_opt_a]: 9.447e-05 [opt_a]: 0.00345927, [2] [Cycle 1]: 0.00266923, [45] [expand_dump_flag]: 3.41001e-06 [switch_simplify]: 4.531e-05 [loop_unroll]: 3.083e-05 [a_1]: 0.00089155 [with_stream_mark]: 2.548e-05 [recompute_prepare]: 1.31e-05 [updatestate_depend_eliminate]: 4.15e-06 [updatestate_assign_eliminate]: 3.15998e-06 [updatestate_loads_eliminate]: 2.96001e-06 [parameter_eliminate]: 2.17999e-06 [a_2]: 9.665e-05 [accelerated_algorithm]: 8.95001e-06 [shard]: 3.21999e-06 [meta_shard_fg_expand]: 2.89999e-06 [shard_inline]: 7.05002e-06 [merge_send_recv]: 1.04e-05 [auto_parallel]: 9.45001e-06 [parallel]: 2.247e-05 [flash_sp]: 1.263e-05 [merge_comm]: 4.38001e-06 [allreduce_fusion]: 4.08001e-06 [matmul_add_comm_reduction]: 1.04e-05 [allreduce_slice_to_reducescatter]: 8.00006e-07 [virtual_shard_identity]: 1.155e-05 [virtual_dataset]: 7.41999e-06 [get_grad_eliminate_]: 6.91999e-06 [virtual_output]: 6.94001e-06 [merge_forward]: 4.89e-06 [cell_reuse_recompute_pass]: 2.26e-06 [offload_activation]: 1.016e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.496e-05 [merge_recompute_call_nodes]: 1.72999e-06 [before_grad]: 1.217e-05 [set_forward_comm_id_for_comm_node_pass]: 4.03001e-06 [meta_fg_expand]: 3.14999e-06 [flash_sp_send_recv_attached]: 3.05998e-06 [receive_attached]: 2.26998e-06 [after_resolve]: 1.369e-05 [a_after_grad]: 1.059e-05 [renormalize]: 0.00091047 [add_forward_monad_depend]: 8.38999e-06 [auto_monad_grad]: 2.94999e-06 [auto_monad_eliminator]: 2.031e-05 [cse]: 3.353e-05 [a_3]: 5.972e-05 [Cycle 2]: 0.00077659, [45] [expand_dump_flag]: 2.46e-06 [switch_simplify]: 1.001e-05 [loop_unroll]: 6.53998e-06 [a_1]: 0.00015507 [with_stream_mark]: 2.064e-05 [recompute_prepare]: 7.43e-06 [updatestate_depend_eliminate]: 4.4e-06 [updatestate_assign_eliminate]: 2.93e-06 [updatestate_loads_eliminate]: 3.79002e-06 [parameter_eliminate]: 2.12001e-06 [a_2]: 8.001e-05 [accelerated_algorithm]: 6.63998e-06 [shard]: 2.86e-06 [meta_shard_fg_expand]: 2.09e-06 [shard_inline]: 6.01e-06 [merge_send_recv]: 1.124e-05 [auto_parallel]: 9.32001e-06 [parallel]: 8.33001e-06 [flash_sp]: 4.75001e-06 [merge_comm]: 3.82998e-06 [allreduce_fusion]: 3.55e-06 [matmul_add_comm_reduction]: 9.04e-06 [allreduce_slice_to_reducescatter]: 9.20001e-07 [virtual_shard_identity]: 8.37e-06 [virtual_dataset]: 6.26e-06 [get_grad_eliminate_]: 7.26999e-06 [virtual_output]: 5.74e-06 [merge_forward]: 4.11001e-06 [cell_reuse_recompute_pass]: 2.93e-06 [offload_activation]: 1.269e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.398e-05 [merge_recompute_call_nodes]: 1.42999e-06 [before_grad]: 1.035e-05 [set_forward_comm_id_for_comm_node_pass]: 3.95e-06 [meta_fg_expand]: 2.83e-06 [flash_sp_send_recv_attached]: 1.79998e-06 [receive_attached]: 2.59001e-06 [after_resolve]: 1.525e-05 [a_after_grad]: 1.017e-05 [renormalize]: 2.20025e-07 [add_forward_monad_depend]: 1.75001e-06 [auto_monad_grad]: 1.86e-06 [auto_monad_eliminator]: 1.168e-05 [cse]: 1.823e-05 [a_3]: 3.716e-05 [py_interpret_to_execute_after_opt_a]: 1.833e-05 [slice_cell_reuse_recomputed_activation]: 2.63e-06 [rewriter_after_opt_a]: 4.39e-05 [convert_after_rewriter]: 7.2e-06 [order_py_execute_after_rewriter]: 5.66e-06 [mutable_eliminate]: 0.00080414 [opt_b]: 0.00025334, [1] [Cycle 1]: 0.00024321, [7] [b_1]: 0.00013913 [b_2]: 9.25999e-06 [updatestate_depend_eliminate]: 1.151e-05 [updatestate_assign_eliminate]: 2.86e-06 [updatestate_loads_eliminate]: 3.51001e-06 [renormalize]: 1.05999e-06 [cse]: 3.392e-05 [optimize_parallel_all_gather_comm]: 2.19e-05 [overlap_param_gather]: 2.17999e-06 [cconv]: 3.735e-05 [loop_unroll]: 0.00054538 [opt_after_cconv]: 0.00012392, [1] [Cycle 1]: 0.00011558, [7] [c_1]: 3.372e-05 [parameter_eliminate]: 5.94e-06 [updatestate_depend_eliminate]: 8.41002e-06 [updatestate_assign_eliminate]: 2.63998e-06 [updatestate_loads_eliminate]: 2.74999e-06 [cse]: 2.431e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.69e-05 [tuple_transform]: 8.905e-05, [1] [Cycle 1]: 8.44e-05, [4] [d_1]: 5.353e-05 [none_parameter_eliminate]: 1.63002e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.04002e-06 [partial_unused_args_eliminate]: 2.50002e-06 [add_recomputation]: 6.251e-05 [cse_after_recomputation]: 2.455e-05, [1] [Cycle 1]: 1.936e-05, [1] [cse]: 1.347e-05 [environ_conv]: 6.79999e-06 [swap_dp_allreduce_reducescatter]: 4.92e-06 [bias_add_comm_swap]: 3.96001e-06 [label_micro_interleaved_index]: 5.56e-06 [label_fine_grained_interleaved_index]: 2.68e-06 [merge_cast_opt]: 1.40001e-06 [slice_recompute_activation]: 2.31e-06 [micro_interleaved_order_control]: 2.68e-06 [assign_add_opt]: 1.40999e-06 [ForceFp32Comm]: 9.80013e-07 [remove_cast_before_assign_add]: 1.02998e-06 [full_micro_interleaved_order_control]: 2.43e-06 [reorder_send_recv_between_fp_bp]: 3.16999e-06 [comm_op_add_attrs]: 1.37e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.25999e-06 [interleave_parallel_branches]: 1.07998e-06 [overlap_opt_shard_in_pipeline]: 1.27e-06 [overlap_opt_shard_grad_in_pipeline]: 2.22999e-06 [control_data_broadcast_order]: 1.65e-05 [grouped_pairwise_exchange_alltoall]: 1.57999e-06 [offloading_packed_experts]: 4.35999e-06 [overlap_recompute_and_grad_model_parallel]: 5.37001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.34998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.44e-06 [overlap_recompute_comm]: 2.96999e-06 [overlap_grad_ring_attention]: 4.67e-06 [overlap_grad_flash_sp]: 2.435e-05 [begin_end_overlap_inline]: 7.39994e-07 [split_matmul_comm_elemetwise]: 2.54999e-06 [split_layernorm_comm]: 2.11e-06 [handle_group_info]: 1.54e-06 [symbol_engine_optimizer]: 8.97e-05, [1] [Cycle 1]: 8.362e-05, [6] [build]: 5.29e-06 [elim_shapecalc]: 1.314e-05 [elim_not_effective]: 1.406e-05 [opt_reshape]: 8.03001e-06 [fold_const_symbol]: 1.038e-05 [renormalize]: 3.19997e-07 [detach_backward]: 2.49001e-06 [pipeline_parallel_scheduler]: 2.41e-06 [auto_monad_reorder]: 1.952e-05 [get_jit_bprop_graph]: 2.58e-06 [rewriter_after_jit_bprop_graph]: 6.55997e-06 [opt_after_jit_grad]: 0.0005765 [validate]: 5.001e-05 Sums bootstrap : 0.000444s : 0.30% type_inference : 0.143014s : 95.71% event_method : 0.000021s : 0.01% auto_monad : 0.000066s : 0.04% graph_reusing : 0.000006s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000042s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000032s : 0.02% optimize.rewriter_before_opt_a : 0.000094s : 0.06% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000055s : 0.04% optimize.opt_a.loop_unroll : 0.000037s : 0.03% optimize.opt_a.a_1 : 0.001047s : 0.70% optimize.opt_a.with_stream_mark : 0.000046s : 0.03% optimize.opt_a.recompute_prepare : 0.000021s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000177s : 0.12% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.01% optimize.opt_a.shard : 0.000006s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000022s : 0.01% optimize.opt_a.auto_parallel : 0.000019s : 0.01% optimize.opt_a.parallel : 0.000031s : 0.02% optimize.opt_a.flash_sp : 0.000017s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.01% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000029s : 0.02% optimize.opt_a.a_after_grad : 0.000021s : 0.01% optimize.opt_a.renormalize : 0.000911s : 0.61% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.02% optimize.opt_a.cse : 0.000052s : 0.03% optimize.opt_a.a_3 : 0.000097s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000044s : 0.03% optimize.convert_after_rewriter : 0.000007s : 0.00% optimize.order_py_execute_after_rewriter : 0.000006s : 0.00% optimize.mutable_eliminate : 0.000804s : 0.54% optimize.opt_b.b_1 : 0.000139s : 0.09% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000034s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000037s : 0.02% optimize.loop_unroll : 0.000545s : 0.36% optimize.opt_after_cconv.c_1 : 0.000034s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.01% optimize.tuple_transform.d_1 : 0.000054s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000003s : 0.00% optimize.add_recomputation : 0.000063s : 0.04% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000007s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000024s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000002s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.01% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000576s : 0.39% validate : 0.000050s : 0.03% Time group info: ------[substitution.] 0.000269 34 15.81% : 0.000043s : 6: substitution.arithmetic_simplify 0.68% : 0.000002s : 2: substitution.elim_not_effective 0.55% : 0.000001s : 2: substitution.fold_const_symbol 2.49% : 0.000007s : 4: substitution.graph_param_transform 67.53% : 0.000182s : 4: substitution.inline 1.71% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.09% : 0.000006s : 4: substitution.remove_not_recompute_node 2.33% : 0.000006s : 4: substitution.replace_old_param 6.80% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.142938 2 99.44% : 0.142134s : 1: type_inference.infer 0.56% : 0.000803s : 1: type_inference.specialize ------[replace.] 0.000075 8 57.31% : 0.000043s : 4: replace.inline 42.69% : 0.000032s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000195 8 91.70% : 0.000179s : 4: match.inline 8.30% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000250 1278 0.74% : 0.000002s : 13: predicate.accumulaten_eliminater 1.10% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.44% : 0.000001s : 8: predicate.addn_check_dump 1.14% : 0.000003s : 13: predicate.addn_zero_filter 0.79% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.68% : 0.000007s : 21: predicate.arithmetic_simplify 0.89% : 0.000002s : 13: predicate.cast_eliminate 0.50% : 0.000001s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.62% : 0.000002s : 8: predicate.depend_value_elim 0.80% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.85% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.77% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.89% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 4: predicate.elim_not_effective 0.61% : 0.000002s : 4: predicate.elim_shapecalc_of_broadcastargs 1.04% : 0.000003s : 17: predicate.environ_add_const_eliminate 0.98% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 17: predicate.environ_get_depend_swap 1.53% : 0.000004s : 25: predicate.environ_get_eliminate 0.92% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.23% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.20% : 0.000006s : 21: predicate.float_depend_g_call 0.45% : 0.000001s : 8: predicate.float_environ_get_switch 0.66% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.64% : 0.000002s : 8: predicate.get_grad_eliminate 0.28% : 0.000001s : 4: predicate.graph_param_transform 0.51% : 0.000001s : 8: predicate.incorporate_call 0.39% : 0.000001s : 8: predicate.incorporate_call_switch 6.16% : 0.000015s : 58: predicate.inline 0.69% : 0.000002s : 8: predicate.inline_without_move 0.35% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.72% : 0.000002s : 8: predicate.less_batch_normalization 1.41% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.18% : 0.000005s : 38: predicate.load_eliminater 1.24% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.01% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.29% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 8: predicate.merge_addn 0.48% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.45% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.76% : 0.000002s : 13: predicate.minmaximum_grad 1.71% : 0.000004s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.30% : 0.000001s : 4: predicate.parallel_virtual_node 2.00% : 0.000005s : 21: predicate.partial_defer_inline 1.40% : 0.000003s : 21: predicate.partial_eliminate 0.88% : 0.000002s : 13: predicate.print_const_string_wrapper 0.58% : 0.000001s : 8: predicate.reduce_all_const_elim 1.12% : 0.000003s : 13: predicate.reduce_eliminate 2.09% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 8: predicate.remove_not_recompute_node 1.19% : 0.000003s : 25: predicate.replace_applicator 0.50% : 0.000001s : 8: predicate.replace_old_param 0.46% : 0.000001s : 4: predicate.reset_defer_inline 0.90% : 0.000002s : 13: predicate.reshape_eliminate 0.61% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 4: predicate.row_tensor_eliminate 0.89% : 0.000002s : 8: predicate.same_eliminate 0.47% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.01% : 0.000003s : 8: predicate.shard_identity_eliminate 0.58% : 0.000001s : 8: predicate.special_op_eliminate 0.54% : 0.000001s : 8: predicate.specialize_transform 1.14% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.25% : 0.000003s : 21: predicate.switch_defer_inline 7.57% : 0.000019s : 29: predicate.switch_layer_defer_inline 4.84% : 0.000012s : 67: predicate.switch_simplify 0.90% : 0.000002s : 13: predicate.tile_eliminate 0.86% : 0.000002s : 13: predicate.transpose_eliminate 1.53% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.43% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.12% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.22% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.03% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.48% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.12% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.82% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 4: predicate.value_based_eliminate 0.72% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.52% : 0.000001s : 8: predicate.virtual_output_eliminate 0.29% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.57% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000656 11 53.79% : 0.000353s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.21% : 0.000303s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.166655 192 0.00% : 0.000004s : 1: ForceFp32Comm 2.23% : 0.003719s : 1: add_attr 2.22% : 0.003705s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.04% : 0.000067s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000071s : 1: auto_monad 0.01% : 0.000023s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.28% : 0.000472s : 1: bootstrap 0.02% : 0.000041s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000020s : 1: control_data_broadcast_order 0.01% : 0.000011s : 1: convert_after_rewriter 0.02% : 0.000028s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.02% : 0.000028s : 1: event_method 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000009s : 1: label_micro_interleaved_index 0.33% : 0.000556s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.49% : 0.000820s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000024s : 1: opt.transform.mutable_eliminate 0.91% : 0.001517s : 78: opt.transform.opt_a 0.02% : 0.000032s : 1: opt.transform.opt_after_cconv 0.02% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000110s : 28: opt.transform.opt_b 0.04% : 0.000059s : 2: opt.transform.opt_trans_graph 0.02% : 0.000041s : 4: opt.transform.symbol_engine_opt 2.08% : 0.003463s : 1: opt_a 0.08% : 0.000128s : 1: opt_after_cconv 0.35% : 0.000588s : 1: opt_after_jit_grad 0.15% : 0.000258s : 1: opt_b 3.67% : 0.006112s : 1: optimize 0.02% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.02% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.03% : 0.000046s : 1: pre_auto_parallel 0.02% : 0.000036s : 1: py_interpret_to_execute 0.01% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000021s : 1: remove_dup_value 0.30% : 0.000492s : 1: renormalize.infer 0.24% : 0.000407s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000049s : 1: rewriter_after_opt_a 0.06% : 0.000100s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000006s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.06% : 0.000092s : 1: symbol_engine_optimizer 0.06% : 0.000092s : 1: tuple_transform 85.83% : 0.143036s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:00.203.646 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:00.203.958 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.171995, [21] [bootstrap]: 0.00048882 [type_inference]: 0.160336 [event_method]: 2.12e-05 [auto_monad]: 6.827e-05 [graph_reusing]: 7.11999e-06 [inline]: 2.76999e-06 [add_attr]: 0.00356559, [1] [add_attr_with_inline]: 0.00355292, [1] [Cycle 1]: 9.414e-05, [2] [tag_attr]: 2.32e-05 [meta_addattr_fg_expand]: 6.44001e-06 [parallel-infer-symbol]: 3.86999e-06 [pre_auto_parallel]: 4.161e-05 [insert-virtual-dataset]: 2.59999e-06 [parallel-infer-symbol-second]: 6.80011e-07 [dataset_repeat_opt]: 1.78002e-06 [pipeline_split]: 1.79998e-06 [optimize]: 0.00623246, [53] [py_interpret_to_execute]: 3.465e-05 [rewriter_before_opt_a]: 9.822e-05 [opt_a]: 0.00349691, [2] [Cycle 1]: 0.00252128, [45] [expand_dump_flag]: 3.06001e-06 [switch_simplify]: 4.317e-05 [loop_unroll]: 3.064e-05 [a_1]: 0.00069511 [with_stream_mark]: 2.027e-05 [recompute_prepare]: 1.015e-05 [updatestate_depend_eliminate]: 4.46002e-06 [updatestate_assign_eliminate]: 3.61999e-06 [updatestate_loads_eliminate]: 3.26999e-06 [parameter_eliminate]: 2.28998e-06 [a_2]: 0.00011928 [accelerated_algorithm]: 8.18001e-06 [shard]: 2.51e-06 [meta_shard_fg_expand]: 2.36998e-06 [shard_inline]: 7.56999e-06 [merge_send_recv]: 9.97001e-06 [auto_parallel]: 7.38e-06 [parallel]: 1.978e-05 [flash_sp]: 8.85001e-06 [merge_comm]: 4.02e-06 [allreduce_fusion]: 3.57002e-06 [matmul_add_comm_reduction]: 1.174e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 8.36002e-06 [virtual_dataset]: 7.03998e-06 [get_grad_eliminate_]: 6.27001e-06 [virtual_output]: 6.90002e-06 [merge_forward]: 3.86999e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 1.108e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.545e-05 [merge_recompute_call_nodes]: 1.70001e-06 [before_grad]: 1.058e-05 [set_forward_comm_id_for_comm_node_pass]: 3.92998e-06 [meta_fg_expand]: 3.01999e-06 [flash_sp_send_recv_attached]: 2.86e-06 [receive_attached]: 2.39001e-06 [after_resolve]: 1.231e-05 [a_after_grad]: 9.76e-06 [renormalize]: 0.00085783 [add_forward_monad_depend]: 7.28e-06 [auto_monad_grad]: 2.53e-06 [auto_monad_eliminator]: 1.812e-05 [cse]: 3.168e-05 [a_3]: 7.025e-05 [Cycle 2]: 0.00095814, [45] [expand_dump_flag]: 1.91e-06 [switch_simplify]: 9.41003e-06 [loop_unroll]: 6.58e-06 [a_1]: 0.00015174 [with_stream_mark]: 1.796e-05 [recompute_prepare]: 6.81999e-06 [updatestate_depend_eliminate]: 3.68999e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 2.98e-06 [parameter_eliminate]: 1.79e-06 [a_2]: 0.00010933 [accelerated_algorithm]: 7.61001e-06 [shard]: 1.40999e-06 [meta_shard_fg_expand]: 2.24999e-06 [shard_inline]: 6.79999e-06 [merge_send_recv]: 6.81001e-06 [auto_parallel]: 8.06001e-06 [parallel]: 7.4e-06 [flash_sp]: 4.59002e-06 [merge_comm]: 4.13001e-06 [allreduce_fusion]: 3.26999e-06 [matmul_add_comm_reduction]: 8.32e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 7.58001e-06 [virtual_dataset]: 6.59001e-06 [get_grad_eliminate_]: 5.80002e-06 [virtual_output]: 6.32001e-06 [merge_forward]: 3.90998e-06 [cell_reuse_recompute_pass]: 2.44001e-06 [offload_activation]: 8.08001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.465e-05 [merge_recompute_call_nodes]: 1.28002e-06 [before_grad]: 1.046e-05 [set_forward_comm_id_for_comm_node_pass]: 4.03999e-06 [meta_fg_expand]: 2.68e-06 [flash_sp_send_recv_attached]: 1.41002e-06 [receive_attached]: 9.80013e-07 [after_resolve]: 1.167e-05 [a_after_grad]: 1.013e-05 [renormalize]: 5.00004e-08 [add_forward_monad_depend]: 3.4e-06 [auto_monad_grad]: 1.54e-06 [auto_monad_eliminator]: 1.177e-05 [cse]: 2.099e-05 [a_3]: 5.2e-05 [py_interpret_to_execute_after_opt_a]: 1.76e-05 [slice_cell_reuse_recomputed_activation]: 5.04003e-06 [rewriter_after_opt_a]: 4.649e-05 [convert_after_rewriter]: 1.077e-05 [order_py_execute_after_rewriter]: 8.80001e-06 [mutable_eliminate]: 0.00077732 [opt_b]: 0.00030479, [1] [Cycle 1]: 0.00029363, [7] [b_1]: 0.00018491 [b_2]: 8.84e-06 [updatestate_depend_eliminate]: 8.60999e-06 [updatestate_assign_eliminate]: 3.36001e-06 [updatestate_loads_eliminate]: 3.63e-06 [renormalize]: 8.39995e-07 [cse]: 2.314e-05 [optimize_parallel_all_gather_comm]: 2.394e-05 [overlap_param_gather]: 5.49e-06 [cconv]: 4.056e-05 [loop_unroll]: 0.00050243 [opt_after_cconv]: 0.00013733, [1] [Cycle 1]: 0.00012723, [7] [c_1]: 3.357e-05 [parameter_eliminate]: 3.83999e-06 [updatestate_depend_eliminate]: 6.31e-06 [updatestate_assign_eliminate]: 2.83e-06 [updatestate_loads_eliminate]: 2.66e-06 [cse]: 1.992e-05 [renormalize]: 5.10016e-07 [remove_dup_value]: 1.719e-05 [tuple_transform]: 9.628e-05, [1] [Cycle 1]: 8.87e-05, [4] [d_1]: 4.808e-05 [none_parameter_eliminate]: 1.70001e-06 [renormalize]: 1.70025e-07 [switch_simplify]: 7.43999e-06 [partial_unused_args_eliminate]: 4.85999e-06 [add_recomputation]: 5.617e-05 [cse_after_recomputation]: 2.946e-05, [1] [Cycle 1]: 2.219e-05, [1] [cse]: 1.208e-05 [environ_conv]: 8.97e-06 [swap_dp_allreduce_reducescatter]: 8.06001e-06 [bias_add_comm_swap]: 5.36998e-06 [label_micro_interleaved_index]: 6.90998e-06 [label_fine_grained_interleaved_index]: 5.74999e-06 [merge_cast_opt]: 4.06001e-06 [slice_recompute_activation]: 4.65001e-06 [micro_interleaved_order_control]: 5.00999e-06 [assign_add_opt]: 3.79002e-06 [ForceFp32Comm]: 3.20998e-06 [remove_cast_before_assign_add]: 3.68999e-06 [full_micro_interleaved_order_control]: 4.43999e-06 [reorder_send_recv_between_fp_bp]: 5.56e-06 [comm_op_add_attrs]: 3.31001e-06 [add_comm_op_reuse_tag]: 3.28e-06 [interleave_split_concat_branches]: 4.2e-06 [interleave_parallel_branches]: 3.7e-06 [overlap_opt_shard_in_pipeline]: 4.22998e-06 [overlap_opt_shard_grad_in_pipeline]: 5.20001e-06 [control_data_broadcast_order]: 1.688e-05 [grouped_pairwise_exchange_alltoall]: 4.12998e-06 [offloading_packed_experts]: 7.4e-06 [overlap_recompute_and_grad_model_parallel]: 7.36001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.55998e-06 [overlap_recompute_allgather_and_fa_grad]: 4.05e-06 [overlap_recompute_comm]: 5.96e-06 [overlap_grad_ring_attention]: 7.28e-06 [overlap_grad_flash_sp]: 2.541e-05 [begin_end_overlap_inline]: 2.98998e-06 [split_matmul_comm_elemetwise]: 4.72e-06 [split_layernorm_comm]: 4.26001e-06 [handle_group_info]: 3.30998e-06 [symbol_engine_optimizer]: 0.00010288, [1] [Cycle 1]: 9.559e-05, [6] [build]: 4.1e-06 [elim_shapecalc]: 1.153e-05 [elim_not_effective]: 1.445e-05 [opt_reshape]: 7.50998e-06 [fold_const_symbol]: 1.04e-05 [renormalize]: 1.80007e-07 [detach_backward]: 3.48e-06 [pipeline_parallel_scheduler]: 2.14e-06 [auto_monad_reorder]: 2.11e-05 [get_jit_bprop_graph]: 2.01e-06 [rewriter_after_jit_bprop_graph]: 5.41998e-06 [opt_after_jit_grad]: 0.00052101 [validate]: 4.344e-05 Sums bootstrap : 0.000489s : 0.29% type_inference : 0.160336s : 96.27% event_method : 0.000021s : 0.01% auto_monad : 0.000068s : 0.04% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000042s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000035s : 0.02% optimize.rewriter_before_opt_a : 0.000098s : 0.06% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000053s : 0.03% optimize.opt_a.loop_unroll : 0.000037s : 0.02% optimize.opt_a.a_1 : 0.000847s : 0.51% optimize.opt_a.with_stream_mark : 0.000038s : 0.02% optimize.opt_a.recompute_prepare : 0.000017s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000229s : 0.14% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.01% optimize.opt_a.merge_send_recv : 0.000017s : 0.01% optimize.opt_a.auto_parallel : 0.000015s : 0.01% optimize.opt_a.parallel : 0.000027s : 0.02% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.01% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000019s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000003s : 0.00% optimize.opt_a.after_resolve : 0.000024s : 0.01% optimize.opt_a.a_after_grad : 0.000020s : 0.01% optimize.opt_a.renormalize : 0.000858s : 0.52% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.02% optimize.opt_a.cse : 0.000053s : 0.03% optimize.opt_a.a_3 : 0.000122s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000046s : 0.03% optimize.convert_after_rewriter : 0.000011s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.01% optimize.mutable_eliminate : 0.000777s : 0.47% optimize.opt_b.b_1 : 0.000185s : 0.11% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000023s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.01% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000041s : 0.02% optimize.loop_unroll : 0.000502s : 0.30% optimize.opt_after_cconv.c_1 : 0.000034s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.01% optimize.tuple_transform.d_1 : 0.000048s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000056s : 0.03% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.00% optimize.bias_add_comm_swap : 0.000005s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000004s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000003s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000006s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000025s : 0.02% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000521s : 0.31% validate : 0.000043s : 0.03% Time group info: ------[substitution.] 0.000250 34 15.37% : 0.000038s : 6: substitution.arithmetic_simplify 0.88% : 0.000002s : 2: substitution.elim_not_effective 0.58% : 0.000001s : 2: substitution.fold_const_symbol 2.82% : 0.000007s : 4: substitution.graph_param_transform 67.48% : 0.000169s : 4: substitution.inline 1.68% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.15% : 0.000005s : 4: substitution.remove_not_recompute_node 2.33% : 0.000006s : 4: substitution.replace_old_param 6.73% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.160271 2 99.49% : 0.159455s : 1: type_inference.infer 0.51% : 0.000816s : 1: type_inference.specialize ------[replace.] 0.000067 8 62.75% : 0.000042s : 4: replace.inline 37.25% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000181 8 91.70% : 0.000166s : 4: match.inline 8.30% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000223 1278 0.83% : 0.000002s : 13: predicate.accumulaten_eliminater 0.83% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 0.90% : 0.000002s : 13: predicate.addn_zero_filter 0.77% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.79% : 0.000006s : 21: predicate.arithmetic_simplify 0.83% : 0.000002s : 13: predicate.cast_eliminate 0.76% : 0.000002s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 0.92% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.08% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.07% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 4: predicate.elim_not_effective 0.43% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.98% : 0.000002s : 17: predicate.environ_get_depend_swap 1.84% : 0.000004s : 25: predicate.environ_get_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.37% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.57% : 0.000006s : 21: predicate.float_depend_g_call 0.48% : 0.000001s : 8: predicate.float_environ_get_switch 0.97% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.60% : 0.000001s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.54% : 0.000001s : 8: predicate.incorporate_call 0.49% : 0.000001s : 8: predicate.incorporate_call_switch 6.38% : 0.000014s : 58: predicate.inline 0.72% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.13% : 0.000003s : 8: predicate.less_batch_normalization 1.82% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.43% : 0.000005s : 38: predicate.load_eliminater 1.07% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.31% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.75% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 8: predicate.merge_addn 0.61% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.61% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 13: predicate.minmaximum_grad 1.39% : 0.000003s : 4: predicate.mutable_eliminate 0.40% : 0.000001s : 4: predicate.opt_reshape 0.33% : 0.000001s : 4: predicate.parallel_virtual_node 1.78% : 0.000004s : 21: predicate.partial_defer_inline 1.53% : 0.000003s : 21: predicate.partial_eliminate 0.78% : 0.000002s : 13: predicate.print_const_string_wrapper 0.58% : 0.000001s : 8: predicate.reduce_all_const_elim 1.15% : 0.000003s : 13: predicate.reduce_eliminate 2.31% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 8: predicate.remove_not_recompute_node 1.46% : 0.000003s : 25: predicate.replace_applicator 0.68% : 0.000002s : 8: predicate.replace_old_param 0.36% : 0.000001s : 4: predicate.reset_defer_inline 0.91% : 0.000002s : 13: predicate.reshape_eliminate 0.57% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 4: predicate.row_tensor_eliminate 0.94% : 0.000002s : 8: predicate.same_eliminate 0.43% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 8: predicate.shard_identity_eliminate 0.69% : 0.000002s : 8: predicate.special_op_eliminate 0.68% : 0.000002s : 8: predicate.specialize_transform 0.88% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.99% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.45% : 0.000003s : 21: predicate.switch_defer_inline 1.91% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.82% : 0.000011s : 67: predicate.switch_simplify 0.99% : 0.000002s : 13: predicate.tile_eliminate 0.84% : 0.000002s : 13: predicate.transpose_eliminate 1.54% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.36% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.03% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.75% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.47% : 0.000006s : 38: predicate.updatestate_pure_node_eliminater 3.05% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 4: predicate.value_based_eliminate 0.77% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.64% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.60% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000691 11 56.32% : 0.000389s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.68% : 0.000302s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.184107 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.94% : 0.003577s : 1: add_attr 1.93% : 0.003557s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.03% : 0.000060s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.04% : 0.000078s : 1: auto_monad 0.02% : 0.000029s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.29% : 0.000537s : 1: bootstrap 0.02% : 0.000044s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000020s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.02% : 0.000033s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000018s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.02% : 0.000032s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.28% : 0.000509s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.43% : 0.000786s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000021s : 1: opt.transform.mutable_eliminate 0.71% : 0.001304s : 78: opt.transform.opt_a 0.02% : 0.000032s : 1: opt.transform.opt_after_cconv 0.02% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000114s : 28: opt.transform.opt_b 0.03% : 0.000053s : 2: opt.transform.opt_trans_graph 0.02% : 0.000040s : 4: opt.transform.symbol_engine_opt 1.90% : 0.003500s : 1: opt_a 0.08% : 0.000141s : 1: opt_after_cconv 0.29% : 0.000531s : 1: opt_after_jit_grad 0.17% : 0.000309s : 1: opt_b 3.58% : 0.006585s : 1: optimize 0.02% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.02% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000009s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.03% : 0.000049s : 1: pre_auto_parallel 0.02% : 0.000039s : 1: py_interpret_to_execute 0.01% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000021s : 1: remove_dup_value 0.25% : 0.000468s : 1: renormalize.infer 0.21% : 0.000379s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000050s : 1: rewriter_after_opt_a 0.06% : 0.000102s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.06% : 0.000106s : 1: symbol_engine_optimizer 0.05% : 0.000099s : 1: tuple_transform 87.11% : 0.160384s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:01.812.828 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.164945, [21] [bootstrap]: 0.00050252 [type_inference]: 0.00634131 [event_method]: 1.91e-05 [auto_monad]: 6.592e-05 [graph_reusing]: 6.84001e-06 [inline]: 2.60002e-06 [add_attr]: 0.00341891, [1] [add_attr_with_inline]: 0.00340661, [1] [Cycle 1]: 7.246e-05, [2] [tag_attr]: 2.377e-05 [meta_addattr_fg_expand]: 6.06e-06 [parallel-infer-symbol]: 3.45e-06 [pre_auto_parallel]: 4.016e-05 [insert-virtual-dataset]: 2.39999e-06 [parallel-infer-symbol-second]: 8.39995e-07 [dataset_repeat_opt]: 2.39001e-06 [pipeline_split]: 1.70001e-06 [optimize]: 0.0053685, [53] [py_interpret_to_execute]: 2.918e-05 [rewriter_before_opt_a]: 8.839e-05 [opt_a]: 0.00304176, [2] [Cycle 1]: 0.00231951, [45] [expand_dump_flag]: 3.09999e-06 [switch_simplify]: 4.472e-05 [loop_unroll]: 3.008e-05 [a_1]: 0.00067273 [with_stream_mark]: 2.089e-05 [recompute_prepare]: 9.82999e-06 [updatestate_depend_eliminate]: 3.9e-06 [updatestate_assign_eliminate]: 3.8e-06 [updatestate_loads_eliminate]: 3.04999e-06 [parameter_eliminate]: 2.02999e-06 [a_2]: 8.93e-05 [accelerated_algorithm]: 7.43999e-06 [shard]: 2.32001e-06 [meta_shard_fg_expand]: 2.26e-06 [shard_inline]: 6.79999e-06 [merge_send_recv]: 8.85999e-06 [auto_parallel]: 7.41999e-06 [parallel]: 2.058e-05 [flash_sp]: 8.97e-06 [merge_comm]: 4.38999e-06 [allreduce_fusion]: 3.65e-06 [matmul_add_comm_reduction]: 1.028e-05 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 8.50001e-06 [virtual_dataset]: 6.78e-06 [get_grad_eliminate_]: 6.29001e-06 [virtual_output]: 6.24001e-06 [merge_forward]: 4.89e-06 [cell_reuse_recompute_pass]: 1.30001e-06 [offload_activation]: 1.103e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.182e-05 [merge_recompute_call_nodes]: 1.55001e-06 [before_grad]: 1.177e-05 [set_forward_comm_id_for_comm_node_pass]: 4.18001e-06 [meta_fg_expand]: 3.2e-06 [flash_sp_send_recv_attached]: 2.86999e-06 [receive_attached]: 2.16998e-06 [after_resolve]: 1.313e-05 [a_after_grad]: 1.055e-05 [renormalize]: 0.00086798 [add_forward_monad_depend]: 7.01001e-06 [auto_monad_grad]: 2.71999e-06 [auto_monad_eliminator]: 1.853e-05 [cse]: 3.284e-05 [a_3]: 5.504e-05 [Cycle 2]: 0.00070968, [45] [expand_dump_flag]: 1.88997e-06 [switch_simplify]: 8.79998e-06 [loop_unroll]: 6.83e-06 [a_1]: 0.00014701 [with_stream_mark]: 1.542e-05 [recompute_prepare]: 6.42001e-06 [updatestate_depend_eliminate]: 3.48e-06 [updatestate_assign_eliminate]: 3.03e-06 [updatestate_loads_eliminate]: 2.99999e-06 [parameter_eliminate]: 1.54e-06 [a_2]: 7.843e-05 [accelerated_algorithm]: 6.86999e-06 [shard]: 1.28002e-06 [meta_shard_fg_expand]: 1.34e-06 [shard_inline]: 6.14999e-06 [merge_send_recv]: 6.51e-06 [auto_parallel]: 6.69001e-06 [parallel]: 6.20002e-06 [flash_sp]: 4.03001e-06 [merge_comm]: 3.61999e-06 [allreduce_fusion]: 3.55e-06 [matmul_add_comm_reduction]: 7.31999e-06 [allreduce_slice_to_reducescatter]: 5.00004e-07 [virtual_shard_identity]: 1.118e-05 [virtual_dataset]: 5.99999e-06 [get_grad_eliminate_]: 5.92001e-06 [virtual_output]: 5.82001e-06 [merge_forward]: 4.63001e-06 [cell_reuse_recompute_pass]: 2.15002e-06 [offload_activation]: 8.54e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.134e-05 [merge_recompute_call_nodes]: 1.15001e-06 [before_grad]: 9.92999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.55998e-06 [meta_fg_expand]: 2.23998e-06 [flash_sp_send_recv_attached]: 1.44e-06 [receive_attached]: 1.68002e-06 [after_resolve]: 1.325e-05 [a_after_grad]: 9.04e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.51998e-06 [auto_monad_grad]: 1.44998e-06 [auto_monad_eliminator]: 9.17999e-06 [cse]: 1.672e-05 [a_3]: 3.721e-05 [py_interpret_to_execute_after_opt_a]: 1.354e-05 [slice_cell_reuse_recomputed_activation]: 1.99e-06 [rewriter_after_opt_a]: 3.97e-05 [convert_after_rewriter]: 7.48e-06 [order_py_execute_after_rewriter]: 5.21002e-06 [mutable_eliminate]: 0.00066531 [opt_b]: 0.00022668, [1] [Cycle 1]: 0.00021822, [7] [b_1]: 0.00013056 [b_2]: 7.7e-06 [updatestate_depend_eliminate]: 9.76003e-06 [updatestate_assign_eliminate]: 2.71999e-06 [updatestate_loads_eliminate]: 2.97002e-06 [renormalize]: 5.50004e-07 [cse]: 2.601e-05 [optimize_parallel_all_gather_comm]: 2.002e-05 [overlap_param_gather]: 2.06998e-06 [cconv]: 3.268e-05 [loop_unroll]: 0.00047604 [opt_after_cconv]: 0.00011271, [1] [Cycle 1]: 0.00010529, [7] [c_1]: 3.185e-05 [parameter_eliminate]: 5.61998e-06 [updatestate_depend_eliminate]: 6.86001e-06 [updatestate_assign_eliminate]: 2.64001e-06 [updatestate_loads_eliminate]: 2.51e-06 [cse]: 2.01e-05 [renormalize]: 5.09986e-07 [remove_dup_value]: 1.542e-05 [tuple_transform]: 8.282e-05, [1] [Cycle 1]: 7.747e-05, [4] [d_1]: 4.795e-05 [none_parameter_eliminate]: 1.69e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 7.98999e-06 [partial_unused_args_eliminate]: 2.75002e-06 [add_recomputation]: 5.743e-05 [cse_after_recomputation]: 2.214e-05, [1] [Cycle 1]: 1.727e-05, [1] [cse]: 1.172e-05 [environ_conv]: 5.82001e-06 [swap_dp_allreduce_reducescatter]: 5.51e-06 [bias_add_comm_swap]: 3.16999e-06 [label_micro_interleaved_index]: 4.37e-06 [label_fine_grained_interleaved_index]: 2.73998e-06 [merge_cast_opt]: 1.40999e-06 [slice_recompute_activation]: 1.98997e-06 [micro_interleaved_order_control]: 2.90998e-06 [assign_add_opt]: 1.89999e-06 [ForceFp32Comm]: 9.89996e-07 [remove_cast_before_assign_add]: 1.02e-06 [full_micro_interleaved_order_control]: 1.94999e-06 [reorder_send_recv_between_fp_bp]: 2.88e-06 [comm_op_add_attrs]: 1.14e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.20999e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.54e-06 [overlap_opt_shard_grad_in_pipeline]: 2.01e-06 [control_data_broadcast_order]: 1.515e-05 [grouped_pairwise_exchange_alltoall]: 1.63002e-06 [offloading_packed_experts]: 4e-06 [overlap_recompute_and_grad_model_parallel]: 5.22e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.26997e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.36e-06 [overlap_grad_ring_attention]: 4.4e-06 [overlap_grad_flash_sp]: 2.283e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.37999e-06 [split_layernorm_comm]: 1.89e-06 [handle_group_info]: 1.04e-06 [symbol_engine_optimizer]: 8.531e-05, [1] [Cycle 1]: 7.968e-05, [6] [build]: 3.91001e-06 [elim_shapecalc]: 1.213e-05 [elim_not_effective]: 1.385e-05 [opt_reshape]: 7.34002e-06 [fold_const_symbol]: 1.142e-05 [renormalize]: 2.00002e-07 [detach_backward]: 2.32999e-06 [pipeline_parallel_scheduler]: 1.49998e-06 [auto_monad_reorder]: 1.868e-05 [get_jit_bprop_graph]: 1.86e-06 [rewriter_after_jit_bprop_graph]: 5.53002e-06 [opt_after_jit_grad]: 0.148895 [validate]: 5.939e-05 Sums bootstrap : 0.000503s : 0.31% type_inference : 0.006341s : 3.95% event_method : 0.000019s : 0.01% auto_monad : 0.000066s : 0.04% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000040s : 0.03% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000029s : 0.02% optimize.rewriter_before_opt_a : 0.000088s : 0.06% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000054s : 0.03% optimize.opt_a.loop_unroll : 0.000037s : 0.02% optimize.opt_a.a_1 : 0.000820s : 0.51% optimize.opt_a.with_stream_mark : 0.000036s : 0.02% optimize.opt_a.recompute_prepare : 0.000016s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000168s : 0.10% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000015s : 0.01% optimize.opt_a.auto_parallel : 0.000014s : 0.01% optimize.opt_a.parallel : 0.000027s : 0.02% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000012s : 0.01% optimize.opt_a.merge_forward : 0.000010s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000020s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000026s : 0.02% optimize.opt_a.a_after_grad : 0.000020s : 0.01% optimize.opt_a.renormalize : 0.000868s : 0.54% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.02% optimize.opt_a.cse : 0.000050s : 0.03% optimize.opt_a.a_3 : 0.000092s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000040s : 0.02% optimize.convert_after_rewriter : 0.000007s : 0.00% optimize.order_py_execute_after_rewriter : 0.000005s : 0.00% optimize.mutable_eliminate : 0.000665s : 0.41% optimize.opt_b.b_1 : 0.000131s : 0.08% optimize.opt_b.b_2 : 0.000008s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000026s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000033s : 0.02% optimize.loop_unroll : 0.000476s : 0.30% optimize.opt_after_cconv.c_1 : 0.000032s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.01% optimize.tuple_transform.d_1 : 0.000048s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000003s : 0.00% optimize.add_recomputation : 0.000057s : 0.04% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000006s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000023s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000019s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.148895s : 92.78% validate : 0.000059s : 0.04% Time group info: ------[substitution.] 0.000240 34 15.75% : 0.000038s : 6: substitution.arithmetic_simplify 0.75% : 0.000002s : 2: substitution.elim_not_effective 0.81% : 0.000002s : 2: substitution.fold_const_symbol 2.89% : 0.000007s : 4: substitution.graph_param_transform 66.96% : 0.000161s : 4: substitution.inline 2.00% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.00% : 0.000005s : 4: substitution.remove_not_recompute_node 2.48% : 0.000006s : 4: substitution.replace_old_param 6.36% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006271 2 88.20% : 0.005531s : 1: type_inference.infer 11.80% : 0.000740s : 1: type_inference.specialize ------[replace.] 0.000065 8 64.20% : 0.000042s : 4: replace.inline 35.80% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000171 8 92.12% : 0.000158s : 4: match.inline 7.88% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000220 1278 1.06% : 0.000002s : 13: predicate.accumulaten_eliminater 2.29% : 0.000005s : 4: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 8: predicate.addn_check_dump 1.00% : 0.000002s : 13: predicate.addn_zero_filter 0.76% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.70% : 0.000006s : 21: predicate.arithmetic_simplify 0.92% : 0.000002s : 13: predicate.cast_eliminate 0.67% : 0.000001s : 8: predicate.check_bprop_eliminate 0.55% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 0.83% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.92% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.94% : 0.000004s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.98% : 0.000002s : 17: predicate.environ_get_depend_swap 1.70% : 0.000004s : 25: predicate.environ_get_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.41% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.49% : 0.000005s : 21: predicate.float_depend_g_call 0.49% : 0.000001s : 8: predicate.float_environ_get_switch 0.66% : 0.000001s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.67% : 0.000001s : 8: predicate.get_grad_eliminate 0.26% : 0.000001s : 4: predicate.graph_param_transform 0.53% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.32% : 0.000014s : 58: predicate.inline 0.71% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.80% : 0.000002s : 8: predicate.less_batch_normalization 1.84% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.28% : 0.000005s : 38: predicate.load_eliminater 1.28% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.43% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.85% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.49% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.74% : 0.000002s : 13: predicate.minmaximum_grad 1.39% : 0.000003s : 4: predicate.mutable_eliminate 0.30% : 0.000001s : 4: predicate.opt_reshape 0.33% : 0.000001s : 4: predicate.parallel_virtual_node 1.92% : 0.000004s : 21: predicate.partial_defer_inline 1.52% : 0.000003s : 21: predicate.partial_eliminate 0.85% : 0.000002s : 13: predicate.print_const_string_wrapper 0.56% : 0.000001s : 8: predicate.reduce_all_const_elim 1.06% : 0.000002s : 13: predicate.reduce_eliminate 2.29% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 8: predicate.remove_not_recompute_node 1.36% : 0.000003s : 25: predicate.replace_applicator 0.56% : 0.000001s : 8: predicate.replace_old_param 0.34% : 0.000001s : 4: predicate.reset_defer_inline 0.80% : 0.000002s : 13: predicate.reshape_eliminate 0.55% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 4: predicate.row_tensor_eliminate 0.80% : 0.000002s : 8: predicate.same_eliminate 0.39% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 8: predicate.shard_identity_eliminate 0.98% : 0.000002s : 8: predicate.special_op_eliminate 0.64% : 0.000001s : 8: predicate.specialize_transform 0.93% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.43% : 0.000003s : 21: predicate.switch_defer_inline 1.94% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.97% : 0.000011s : 67: predicate.switch_simplify 0.90% : 0.000002s : 13: predicate.tile_eliminate 0.89% : 0.000002s : 13: predicate.transpose_eliminate 1.60% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.78% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.35% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.99% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.51% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.08% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.61% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.21% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.89% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 4: predicate.value_based_eliminate 0.65% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.55% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000620 11 55.15% : 0.000342s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.85% : 0.000278s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.176027 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.95% : 0.003425s : 1: add_attr 1.94% : 0.003411s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000061s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.04% : 0.000071s : 1: auto_monad 0.01% : 0.000023s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.30% : 0.000535s : 1: bootstrap 0.02% : 0.000036s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000019s : 1: control_data_broadcast_order 0.01% : 0.000011s : 1: convert_after_rewriter 0.01% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.01% : 0.000026s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.28% : 0.000486s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.39% : 0.000678s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000020s : 1: opt.transform.mutable_eliminate 0.72% : 0.001265s : 78: opt.transform.opt_a 0.02% : 0.000030s : 1: opt.transform.opt_after_cconv 0.03% : 0.000056s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000106s : 28: opt.transform.opt_b 0.03% : 0.000053s : 2: opt.transform.opt_trans_graph 0.02% : 0.000040s : 4: opt.transform.symbol_engine_opt 1.73% : 0.003045s : 1: opt_a 0.07% : 0.000117s : 1: opt_after_cconv 84.60% : 0.148921s : 1: opt_after_jit_grad 0.13% : 0.000230s : 1: opt_b 3.05% : 0.005375s : 1: optimize 0.01% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.01% : 0.000026s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000044s : 1: pre_auto_parallel 0.02% : 0.000034s : 1: py_interpret_to_execute 0.01% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 0.27% : 0.000480s : 1: renormalize.infer 0.21% : 0.000378s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000044s : 1: rewriter_after_opt_a 0.05% : 0.000092s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000088s : 1: symbol_engine_optimizer 0.05% : 0.000086s : 1: tuple_transform 3.61% : 0.006362s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:03.621.062 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:03.621.328 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.202465, [21] [bootstrap]: 0.00045518 [type_inference]: 0.190585 [event_method]: 2.129e-05 [auto_monad]: 6.722e-05 [graph_reusing]: 6.23e-06 [inline]: 2.63998e-06 [add_attr]: 0.00393754, [1] [add_attr_with_inline]: 0.00392462, [1] [Cycle 1]: 0.00015277, [2] [tag_attr]: 2.38e-05 [meta_addattr_fg_expand]: 7.01999e-06 [parallel-infer-symbol]: 3.61999e-06 [pre_auto_parallel]: 4.238e-05 [insert-virtual-dataset]: 2.39999e-06 [parallel-infer-symbol-second]: 8.09989e-07 [dataset_repeat_opt]: 1.69e-06 [pipeline_split]: 1.61002e-06 [optimize]: 0.00611828, [53] [py_interpret_to_execute]: 3.714e-05 [rewriter_before_opt_a]: 9.932e-05 [opt_a]: 0.00348431, [2] [Cycle 1]: 0.00254791, [45] [expand_dump_flag]: 3.22002e-06 [switch_simplify]: 4.556e-05 [loop_unroll]: 3.093e-05 [a_1]: 0.00069404 [with_stream_mark]: 2.355e-05 [recompute_prepare]: 1.224e-05 [updatestate_depend_eliminate]: 4.68999e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 2.99999e-06 [parameter_eliminate]: 2.08002e-06 [a_2]: 0.00012291 [accelerated_algorithm]: 7.66001e-06 [shard]: 1.99999e-06 [meta_shard_fg_expand]: 1.84998e-06 [shard_inline]: 6.70002e-06 [merge_send_recv]: 1.009e-05 [auto_parallel]: 8.03001e-06 [parallel]: 2.015e-05 [flash_sp]: 9.46003e-06 [merge_comm]: 3.97998e-06 [allreduce_fusion]: 3.61999e-06 [matmul_add_comm_reduction]: 1.073e-05 [allreduce_slice_to_reducescatter]: 1.02e-06 [virtual_shard_identity]: 8.99e-06 [virtual_dataset]: 6.68e-06 [get_grad_eliminate_]: 6.76e-06 [virtual_output]: 7.03e-06 [merge_forward]: 4.23999e-06 [cell_reuse_recompute_pass]: 2.06e-06 [offload_activation]: 1.139e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.639e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.178e-05 [set_forward_comm_id_for_comm_node_pass]: 4.14002e-06 [meta_fg_expand]: 3.16999e-06 [flash_sp_send_recv_attached]: 3.75e-06 [receive_attached]: 2.93e-06 [after_resolve]: 1.354e-05 [a_after_grad]: 1.122e-05 [renormalize]: 0.0008669 [add_forward_monad_depend]: 7.07002e-06 [auto_monad_grad]: 2.94001e-06 [auto_monad_eliminator]: 1.732e-05 [cse]: 3.079e-05 [a_3]: 6.63e-05 [Cycle 2]: 0.0009196, [45] [expand_dump_flag]: 2.42001e-06 [switch_simplify]: 8.62998e-06 [loop_unroll]: 6.63e-06 [a_1]: 0.00014975 [with_stream_mark]: 1.481e-05 [recompute_prepare]: 6.65998e-06 [updatestate_depend_eliminate]: 3.51999e-06 [updatestate_assign_eliminate]: 3.09999e-06 [updatestate_loads_eliminate]: 2.46998e-06 [parameter_eliminate]: 1.54e-06 [a_2]: 0.00010993 [accelerated_algorithm]: 6.83e-06 [shard]: 1.92999e-06 [meta_shard_fg_expand]: 1.51998e-06 [shard_inline]: 6.61e-06 [merge_send_recv]: 6.96001e-06 [auto_parallel]: 7.65e-06 [parallel]: 1.034e-05 [flash_sp]: 4.50001e-06 [merge_comm]: 3.68999e-06 [allreduce_fusion]: 3.44001e-06 [matmul_add_comm_reduction]: 7.13e-06 [allreduce_slice_to_reducescatter]: 8.50006e-07 [virtual_shard_identity]: 7.32002e-06 [virtual_dataset]: 6.61e-06 [get_grad_eliminate_]: 6.00002e-06 [virtual_output]: 8.02e-06 [merge_forward]: 3.89002e-06 [cell_reuse_recompute_pass]: 2.94999e-06 [offload_activation]: 9.42001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.423e-05 [merge_recompute_call_nodes]: 1.34e-06 [before_grad]: 1.124e-05 [set_forward_comm_id_for_comm_node_pass]: 3.73999e-06 [meta_fg_expand]: 2.16998e-06 [flash_sp_send_recv_attached]: 1.43002e-06 [receive_attached]: 1.69e-06 [after_resolve]: 1.287e-05 [a_after_grad]: 1e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.21e-06 [auto_monad_grad]: 1.48002e-06 [auto_monad_eliminator]: 8.40001e-06 [cse]: 1.781e-05 [a_3]: 5.04e-05 [py_interpret_to_execute_after_opt_a]: 1.737e-05 [slice_cell_reuse_recomputed_activation]: 4.92999e-06 [rewriter_after_opt_a]: 4.404e-05 [convert_after_rewriter]: 9.92001e-06 [order_py_execute_after_rewriter]: 8.69e-06 [mutable_eliminate]: 0.00074971 [opt_b]: 0.00029126, [1] [Cycle 1]: 0.00028058, [7] [b_1]: 0.00017198 [b_2]: 9.29998e-06 [updatestate_depend_eliminate]: 8.90999e-06 [updatestate_assign_eliminate]: 2.96001e-06 [updatestate_loads_eliminate]: 3.06999e-06 [renormalize]: 6.69999e-07 [cse]: 2.407e-05 [optimize_parallel_all_gather_comm]: 2.284e-05 [overlap_param_gather]: 5.46002e-06 [cconv]: 3.751e-05 [loop_unroll]: 0.00046018 [opt_after_cconv]: 0.00013071, [1] [Cycle 1]: 0.00012197, [7] [c_1]: 3.249e-05 [parameter_eliminate]: 4.22998e-06 [updatestate_depend_eliminate]: 5.91998e-06 [updatestate_assign_eliminate]: 2.69001e-06 [updatestate_loads_eliminate]: 2.41998e-06 [cse]: 1.792e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 1.688e-05 [tuple_transform]: 9.631e-05, [1] [Cycle 1]: 8.904e-05, [4] [d_1]: 4.83e-05 [none_parameter_eliminate]: 2.04e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 7.64002e-06 [partial_unused_args_eliminate]: 4.84998e-06 [add_recomputation]: 5.602e-05 [cse_after_recomputation]: 2.809e-05, [1] [Cycle 1]: 2.093e-05, [1] [cse]: 1.191e-05 [environ_conv]: 8.59002e-06 [swap_dp_allreduce_reducescatter]: 7.8e-06 [bias_add_comm_swap]: 5.34e-06 [label_micro_interleaved_index]: 7.36999e-06 [label_fine_grained_interleaved_index]: 5.41998e-06 [merge_cast_opt]: 3.77002e-06 [slice_recompute_activation]: 4.41002e-06 [micro_interleaved_order_control]: 4.99003e-06 [assign_add_opt]: 3.91999e-06 [ForceFp32Comm]: 3.24001e-06 [remove_cast_before_assign_add]: 3.48999e-06 [full_micro_interleaved_order_control]: 4.63001e-06 [reorder_send_recv_between_fp_bp]: 5.46e-06 [comm_op_add_attrs]: 3.71999e-06 [add_comm_op_reuse_tag]: 3.24001e-06 [interleave_split_concat_branches]: 3.55e-06 [interleave_parallel_branches]: 3.41999e-06 [overlap_opt_shard_in_pipeline]: 3.52002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.51002e-06 [control_data_broadcast_order]: 1.675e-05 [grouped_pairwise_exchange_alltoall]: 4.45999e-06 [offloading_packed_experts]: 7.31999e-06 [overlap_recompute_and_grad_model_parallel]: 7.36001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.63e-06 [overlap_recompute_allgather_and_fa_grad]: 3.74002e-06 [overlap_recompute_comm]: 5.24e-06 [overlap_grad_ring_attention]: 7.01001e-06 [overlap_grad_flash_sp]: 2.514e-05 [begin_end_overlap_inline]: 2.91999e-06 [split_matmul_comm_elemetwise]: 4.89003e-06 [split_layernorm_comm]: 4.38999e-06 [handle_group_info]: 3.92998e-06 [symbol_engine_optimizer]: 0.00010356, [1] [Cycle 1]: 9.617e-05, [6] [build]: 4.17e-06 [elim_shapecalc]: 1.137e-05 [elim_not_effective]: 1.407e-05 [opt_reshape]: 7.6e-06 [fold_const_symbol]: 1.109e-05 [renormalize]: 5.19998e-07 [detach_backward]: 4.2e-06 [pipeline_parallel_scheduler]: 1.76e-06 [auto_monad_reorder]: 1.996e-05 [get_jit_bprop_graph]: 2.10002e-06 [rewriter_after_jit_bprop_graph]: 6.07001e-06 [opt_after_jit_grad]: 0.00053605 [validate]: 4.29e-05 Sums bootstrap : 0.000455s : 0.23% type_inference : 0.190585s : 96.89% event_method : 0.000021s : 0.01% auto_monad : 0.000067s : 0.03% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000042s : 0.02% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000037s : 0.02% optimize.rewriter_before_opt_a : 0.000099s : 0.05% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000054s : 0.03% optimize.opt_a.loop_unroll : 0.000038s : 0.02% optimize.opt_a.a_1 : 0.000844s : 0.43% optimize.opt_a.with_stream_mark : 0.000038s : 0.02% optimize.opt_a.recompute_prepare : 0.000019s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000233s : 0.12% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000017s : 0.01% optimize.opt_a.auto_parallel : 0.000016s : 0.01% optimize.opt_a.parallel : 0.000030s : 0.02% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000015s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000021s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000026s : 0.01% optimize.opt_a.a_after_grad : 0.000021s : 0.01% optimize.opt_a.renormalize : 0.000867s : 0.44% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.01% optimize.opt_a.cse : 0.000049s : 0.02% optimize.opt_a.a_3 : 0.000117s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000044s : 0.02% optimize.convert_after_rewriter : 0.000010s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.00% optimize.mutable_eliminate : 0.000750s : 0.38% optimize.opt_b.b_1 : 0.000172s : 0.09% optimize.opt_b.b_2 : 0.000009s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.01% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000038s : 0.02% optimize.loop_unroll : 0.000460s : 0.23% optimize.opt_after_cconv.c_1 : 0.000032s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000018s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.01% optimize.tuple_transform.d_1 : 0.000048s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000056s : 0.03% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000009s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.00% optimize.bias_add_comm_swap : 0.000005s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000004s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000003s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000003s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000025s : 0.01% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000536s : 0.27% validate : 0.000043s : 0.02% Time group info: ------[substitution.] 0.000256 34 15.73% : 0.000040s : 6: substitution.arithmetic_simplify 0.85% : 0.000002s : 2: substitution.elim_not_effective 0.55% : 0.000001s : 2: substitution.fold_const_symbol 2.67% : 0.000007s : 4: substitution.graph_param_transform 67.27% : 0.000172s : 4: substitution.inline 2.17% : 0.000006s : 4: substitution.j_node_and_user_rematch 2.07% : 0.000005s : 4: substitution.remove_not_recompute_node 2.52% : 0.000006s : 4: substitution.replace_old_param 6.16% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.190525 2 99.60% : 0.189765s : 1: type_inference.infer 0.40% : 0.000760s : 1: type_inference.specialize ------[replace.] 0.000067 8 63.81% : 0.000043s : 4: replace.inline 36.19% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000183 8 92.40% : 0.000169s : 4: match.inline 7.60% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000222 1278 0.86% : 0.000002s : 13: predicate.accumulaten_eliminater 1.03% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 8: predicate.addn_check_dump 0.79% : 0.000002s : 13: predicate.addn_zero_filter 0.79% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.66% : 0.000006s : 21: predicate.arithmetic_simplify 0.88% : 0.000002s : 13: predicate.cast_eliminate 0.68% : 0.000002s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.71% : 0.000002s : 8: predicate.depend_value_elim 0.96% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.02% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.92% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.26% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.57% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.00% : 0.000002s : 17: predicate.environ_get_depend_swap 1.60% : 0.000004s : 25: predicate.environ_get_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.33% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.44% : 0.000005s : 21: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.73% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 4: predicate.fold_const_symbol 0.82% : 0.000002s : 8: predicate.get_grad_eliminate 0.30% : 0.000001s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.32% : 0.000014s : 58: predicate.inline 0.88% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.80% : 0.000002s : 8: predicate.less_batch_normalization 1.64% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.62% : 0.000006s : 38: predicate.load_eliminater 1.06% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.28% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.48% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.59% : 0.000001s : 8: predicate.merge_addn 0.62% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.61% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 13: predicate.minmaximum_grad 1.25% : 0.000003s : 4: predicate.mutable_eliminate 0.37% : 0.000001s : 4: predicate.opt_reshape 0.41% : 0.000001s : 4: predicate.parallel_virtual_node 1.82% : 0.000004s : 21: predicate.partial_defer_inline 1.53% : 0.000003s : 21: predicate.partial_eliminate 0.86% : 0.000002s : 13: predicate.print_const_string_wrapper 0.68% : 0.000002s : 8: predicate.reduce_all_const_elim 1.21% : 0.000003s : 13: predicate.reduce_eliminate 2.30% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.55% : 0.000001s : 8: predicate.remove_not_recompute_node 1.30% : 0.000003s : 25: predicate.replace_applicator 0.54% : 0.000001s : 8: predicate.replace_old_param 0.26% : 0.000001s : 4: predicate.reset_defer_inline 0.89% : 0.000002s : 13: predicate.reshape_eliminate 0.61% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.63% : 0.000001s : 4: predicate.row_tensor_eliminate 0.87% : 0.000002s : 8: predicate.same_eliminate 0.41% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 8: predicate.shard_identity_eliminate 0.71% : 0.000002s : 8: predicate.special_op_eliminate 0.72% : 0.000002s : 8: predicate.specialize_transform 1.03% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.88% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.46% : 0.000003s : 21: predicate.switch_defer_inline 2.02% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.97% : 0.000011s : 67: predicate.switch_simplify 0.87% : 0.000002s : 13: predicate.tile_eliminate 1.02% : 0.000002s : 13: predicate.transpose_eliminate 1.42% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.39% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.10% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.55% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.69% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.38% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.19% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 4: predicate.value_based_eliminate 0.67% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.79% : 0.000002s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000644 11 55.85% : 0.000360s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.15% : 0.000284s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.214842 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.84% : 0.003950s : 1: add_attr 1.83% : 0.003929s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.03% : 0.000060s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.04% : 0.000076s : 1: auto_monad 0.01% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.23% : 0.000504s : 1: bootstrap 0.02% : 0.000041s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000020s : 1: control_data_broadcast_order 0.01% : 0.000013s : 1: convert_after_rewriter 0.01% : 0.000031s : 1: cse_after_recomputation 0.00% : 0.000007s : 1: dataset_repeat_opt 0.01% : 0.000019s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.01% : 0.000031s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000012s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.22% : 0.000467s : 1: loop_unroll 0.00% : 0.000006s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.35% : 0.000758s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000020s : 1: opt.transform.mutable_eliminate 0.61% : 0.001308s : 78: opt.transform.opt_a 0.01% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.05% : 0.000107s : 28: opt.transform.opt_b 0.02% : 0.000054s : 2: opt.transform.opt_trans_graph 0.02% : 0.000040s : 4: opt.transform.symbol_engine_opt 1.62% : 0.003488s : 1: opt_a 0.06% : 0.000135s : 1: opt_after_cconv 0.25% : 0.000547s : 1: opt_after_jit_grad 0.14% : 0.000295s : 1: opt_b 3.01% : 0.006463s : 1: optimize 0.01% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.01% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000009s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000050s : 1: pre_auto_parallel 0.02% : 0.000041s : 1: py_interpret_to_execute 0.01% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 0.22% : 0.000476s : 1: renormalize.infer 0.18% : 0.000380s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000048s : 1: rewriter_after_opt_a 0.05% : 0.000103s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.00% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000107s : 1: symbol_engine_optimizer 0.05% : 0.000099s : 1: tuple_transform 88.73% : 0.190629s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:05.851.516 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.198863, [21] [bootstrap]: 0.00046431 [type_inference]: 0.165803 [event_method]: 2.041e-05 [auto_monad]: 6.686e-05 [graph_reusing]: 5.61e-06 [inline]: 2.78e-06 [add_attr]: 0.00356232, [1] [add_attr_with_inline]: 0.0035493, [1] [Cycle 1]: 7.025e-05, [2] [tag_attr]: 2.248e-05 [meta_addattr_fg_expand]: 5.95002e-06 [parallel-infer-symbol]: 3.36999e-06 [pre_auto_parallel]: 4.069e-05 [insert-virtual-dataset]: 2.67001e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 1.75001e-06 [pipeline_split]: 2.12001e-06 [optimize]: 0.028048, [53] [py_interpret_to_execute]: 2.801e-05 [rewriter_before_opt_a]: 9.269e-05 [opt_a]: 0.00298623, [2] [Cycle 1]: 0.00225318, [45] [expand_dump_flag]: 3.13e-06 [switch_simplify]: 4.666e-05 [loop_unroll]: 3.171e-05 [a_1]: 0.0006936 [with_stream_mark]: 1.905e-05 [recompute_prepare]: 9.44998e-06 [updatestate_depend_eliminate]: 3.83001e-06 [updatestate_assign_eliminate]: 3.41999e-06 [updatestate_loads_eliminate]: 2.97002e-06 [parameter_eliminate]: 1.91e-06 [a_2]: 9.077e-05 [accelerated_algorithm]: 7.57002e-06 [shard]: 2.49999e-06 [meta_shard_fg_expand]: 1.81998e-06 [shard_inline]: 6.56e-06 [merge_send_recv]: 9.34e-06 [auto_parallel]: 7.41999e-06 [parallel]: 1.988e-05 [flash_sp]: 8.53001e-06 [merge_comm]: 4.28999e-06 [allreduce_fusion]: 3.6e-06 [matmul_add_comm_reduction]: 1.043e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 8.60001e-06 [virtual_dataset]: 6.59999e-06 [get_grad_eliminate_]: 6.04999e-06 [virtual_output]: 6.41998e-06 [merge_forward]: 4.15999e-06 [cell_reuse_recompute_pass]: 1.27999e-06 [offload_activation]: 1.057e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.204e-05 [merge_recompute_call_nodes]: 1.55999e-06 [before_grad]: 1.039e-05 [set_forward_comm_id_for_comm_node_pass]: 3.72002e-06 [meta_fg_expand]: 3.16999e-06 [flash_sp_send_recv_attached]: 2.51998e-06 [receive_attached]: 2.74001e-06 [after_resolve]: 1.282e-05 [a_after_grad]: 1.084e-05 [renormalize]: 0.00080063 [add_forward_monad_depend]: 6.52001e-06 [auto_monad_grad]: 2.56e-06 [auto_monad_eliminator]: 1.588e-05 [cse]: 3.259e-05 [a_3]: 4.937e-05 [Cycle 2]: 0.00072248, [45] [expand_dump_flag]: 1.48002e-06 [switch_simplify]: 8.33999e-06 [loop_unroll]: 6.95998e-06 [a_1]: 0.00014635 [with_stream_mark]: 1.538e-05 [recompute_prepare]: 7.16001e-06 [updatestate_depend_eliminate]: 3.25e-06 [updatestate_assign_eliminate]: 2.22999e-06 [updatestate_loads_eliminate]: 2.46e-06 [parameter_eliminate]: 1.32e-06 [a_2]: 8.149e-05 [accelerated_algorithm]: 6.91001e-06 [shard]: 1.54e-06 [meta_shard_fg_expand]: 1.87999e-06 [shard_inline]: 6.41e-06 [merge_send_recv]: 6.09999e-06 [auto_parallel]: 6.93e-06 [parallel]: 5.72001e-06 [flash_sp]: 3.91001e-06 [merge_comm]: 3.61001e-06 [allreduce_fusion]: 3.62998e-06 [matmul_add_comm_reduction]: 6.62002e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 6.64999e-06 [virtual_dataset]: 6.12001e-06 [get_grad_eliminate_]: 6.12001e-06 [virtual_output]: 5.73002e-06 [merge_forward]: 2.83998e-06 [cell_reuse_recompute_pass]: 2.15002e-06 [offload_activation]: 7.21999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.118e-05 [merge_recompute_call_nodes]: 8.70001e-07 [before_grad]: 9.44998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.68e-06 [meta_fg_expand]: 2.27001e-06 [flash_sp_send_recv_attached]: 1.30999e-06 [receive_attached]: 1.76e-06 [after_resolve]: 1.009e-05 [a_after_grad]: 1.162e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.64998e-06 [auto_monad_grad]: 1.92001e-06 [auto_monad_eliminator]: 8.33999e-06 [cse]: 1.654e-05 [a_3]: 3.729e-05 [py_interpret_to_execute_after_opt_a]: 1.133e-05 [slice_cell_reuse_recomputed_activation]: 1.90001e-06 [rewriter_after_opt_a]: 3.811e-05 [convert_after_rewriter]: 6.99001e-06 [order_py_execute_after_rewriter]: 5.51e-06 [mutable_eliminate]: 0.00067435 [opt_b]: 0.00020977, [1] [Cycle 1]: 0.00020266, [7] [b_1]: 0.0001256 [b_2]: 8.59002e-06 [updatestate_depend_eliminate]: 6.56e-06 [updatestate_assign_eliminate]: 2.39001e-06 [updatestate_loads_eliminate]: 2.45997e-06 [renormalize]: 1.07e-06 [cse]: 2.048e-05 [optimize_parallel_all_gather_comm]: 1.695e-05 [overlap_param_gather]: 2.09999e-06 [cconv]: 3.021e-05 [loop_unroll]: 0.0231347 [opt_after_cconv]: 0.00015752, [1] [Cycle 1]: 0.00014707, [7] [c_1]: 3.807e-05 [parameter_eliminate]: 8.02998e-06 [updatestate_depend_eliminate]: 1.335e-05 [updatestate_assign_eliminate]: 3.35998e-06 [updatestate_loads_eliminate]: 3.53e-06 [cse]: 4.176e-05 [renormalize]: 7.40023e-07 [remove_dup_value]: 1.739e-05 [tuple_transform]: 9.258e-05, [1] [Cycle 1]: 8.708e-05, [4] [d_1]: 5.81e-05 [none_parameter_eliminate]: 1.94e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 7.46999e-06 [partial_unused_args_eliminate]: 2.29999e-06 [add_recomputation]: 6.413e-05 [cse_after_recomputation]: 2.469e-05, [1] [Cycle 1]: 1.982e-05, [1] [cse]: 1.419e-05 [environ_conv]: 6.89999e-06 [swap_dp_allreduce_reducescatter]: 5.93002e-06 [bias_add_comm_swap]: 3.48999e-06 [label_micro_interleaved_index]: 6.63e-06 [label_fine_grained_interleaved_index]: 2.63e-06 [merge_cast_opt]: 1.39e-06 [slice_recompute_activation]: 2.07001e-06 [micro_interleaved_order_control]: 2.50002e-06 [assign_add_opt]: 1.40001e-06 [ForceFp32Comm]: 1.00001e-06 [remove_cast_before_assign_add]: 1.22e-06 [full_micro_interleaved_order_control]: 2.02999e-06 [reorder_send_recv_between_fp_bp]: 2.94999e-06 [comm_op_add_attrs]: 1.24e-06 [add_comm_op_reuse_tag]: 1.20999e-06 [interleave_split_concat_branches]: 1.49e-06 [interleave_parallel_branches]: 1.30999e-06 [overlap_opt_shard_in_pipeline]: 1.47001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.80001e-06 [control_data_broadcast_order]: 1.59e-05 [grouped_pairwise_exchange_alltoall]: 1.57001e-06 [offloading_packed_experts]: 4.53999e-06 [overlap_recompute_and_grad_model_parallel]: 4.94998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.36002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.69999e-06 [overlap_grad_ring_attention]: 4.17e-06 [overlap_grad_flash_sp]: 2.173e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 2.62001e-06 [split_layernorm_comm]: 2.01e-06 [handle_group_info]: 1.27e-06 [symbol_engine_optimizer]: 8.46e-05, [1] [Cycle 1]: 7.981e-05, [6] [build]: 4.25999e-06 [elim_shapecalc]: 1.18e-05 [elim_not_effective]: 1.442e-05 [opt_reshape]: 7.92e-06 [fold_const_symbol]: 1.092e-05 [renormalize]: 2.00002e-07 [detach_backward]: 2.51998e-06 [pipeline_parallel_scheduler]: 1.77999e-06 [auto_monad_reorder]: 1.84e-05 [get_jit_bprop_graph]: 2.39001e-06 [rewriter_after_jit_bprop_graph]: 6.68e-06 [opt_after_jit_grad]: 0.00059454 [validate]: 4.729e-05 Sums bootstrap : 0.000464s : 0.24% type_inference : 0.165803s : 85.36% event_method : 0.000020s : 0.01% auto_monad : 0.000067s : 0.03% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000041s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000028s : 0.01% optimize.rewriter_before_opt_a : 0.000093s : 0.05% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000055s : 0.03% optimize.opt_a.loop_unroll : 0.000039s : 0.02% optimize.opt_a.a_1 : 0.000840s : 0.43% optimize.opt_a.with_stream_mark : 0.000034s : 0.02% optimize.opt_a.recompute_prepare : 0.000017s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000172s : 0.09% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000015s : 0.01% optimize.opt_a.auto_parallel : 0.000014s : 0.01% optimize.opt_a.parallel : 0.000026s : 0.01% optimize.opt_a.flash_sp : 0.000012s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000012s : 0.01% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000018s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000023s : 0.01% optimize.opt_a.a_after_grad : 0.000022s : 0.01% optimize.opt_a.renormalize : 0.000801s : 0.41% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.01% optimize.opt_a.cse : 0.000049s : 0.03% optimize.opt_a.a_3 : 0.000087s : 0.04% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000038s : 0.02% optimize.convert_after_rewriter : 0.000007s : 0.00% optimize.order_py_execute_after_rewriter : 0.000006s : 0.00% optimize.mutable_eliminate : 0.000674s : 0.35% optimize.opt_b.b_1 : 0.000126s : 0.06% optimize.opt_b.b_2 : 0.000009s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000020s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000030s : 0.02% optimize.loop_unroll : 0.023135s : 11.91% optimize.opt_after_cconv.c_1 : 0.000038s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000013s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000042s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.01% optimize.tuple_transform.d_1 : 0.000058s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000064s : 0.03% optimize.cse_after_recomputation.cse : 0.000014s : 0.01% optimize.environ_conv : 0.000007s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000016s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000022s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000595s : 0.31% validate : 0.000047s : 0.02% Time group info: ------[substitution.] 0.000240 34 14.51% : 0.000035s : 6: substitution.arithmetic_simplify 0.94% : 0.000002s : 2: substitution.elim_not_effective 0.80% : 0.000002s : 2: substitution.fold_const_symbol 2.86% : 0.000007s : 4: substitution.graph_param_transform 68.70% : 0.000165s : 4: substitution.inline 1.53% : 0.000004s : 4: substitution.j_node_and_user_rematch 1.94% : 0.000005s : 4: substitution.remove_not_recompute_node 1.88% : 0.000005s : 4: substitution.replace_old_param 6.84% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.165728 2 99.53% : 0.164953s : 1: type_inference.infer 0.47% : 0.000775s : 1: type_inference.specialize ------[replace.] 0.000065 8 63.10% : 0.000041s : 4: replace.inline 36.90% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000177 8 91.75% : 0.000162s : 4: match.inline 8.25% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000227 1278 0.87% : 0.000002s : 13: predicate.accumulaten_eliminater 0.78% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 0.87% : 0.000002s : 13: predicate.addn_zero_filter 0.83% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.81% : 0.000006s : 21: predicate.arithmetic_simplify 1.01% : 0.000002s : 13: predicate.cast_eliminate 0.58% : 0.000001s : 8: predicate.check_bprop_eliminate 0.44% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.72% : 0.000002s : 8: predicate.depend_value_elim 0.91% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.95% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.96% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.14% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.36% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_depend_swap 1.63% : 0.000004s : 25: predicate.environ_get_eliminate 1.00% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.41% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.32% : 0.000005s : 21: predicate.float_depend_g_call 0.45% : 0.000001s : 8: predicate.float_environ_get_switch 0.65% : 0.000001s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.62% : 0.000001s : 8: predicate.get_grad_eliminate 0.27% : 0.000001s : 4: predicate.graph_param_transform 0.56% : 0.000001s : 8: predicate.incorporate_call 0.44% : 0.000001s : 8: predicate.incorporate_call_switch 6.75% : 0.000015s : 58: predicate.inline 0.76% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.92% : 0.000002s : 8: predicate.less_batch_normalization 1.71% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.39% : 0.000005s : 38: predicate.load_eliminater 3.01% : 0.000007s : 4: predicate.loop_unroll_after_grad 2.28% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.90% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 8: predicate.merge_addn 0.53% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.75% : 0.000002s : 13: predicate.minmaximum_grad 1.05% : 0.000002s : 4: predicate.mutable_eliminate 0.47% : 0.000001s : 4: predicate.opt_reshape 0.32% : 0.000001s : 4: predicate.parallel_virtual_node 1.69% : 0.000004s : 21: predicate.partial_defer_inline 1.50% : 0.000003s : 21: predicate.partial_eliminate 0.83% : 0.000002s : 13: predicate.print_const_string_wrapper 0.52% : 0.000001s : 8: predicate.reduce_all_const_elim 1.30% : 0.000003s : 13: predicate.reduce_eliminate 2.47% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 8: predicate.remove_not_recompute_node 1.22% : 0.000003s : 25: predicate.replace_applicator 0.47% : 0.000001s : 8: predicate.replace_old_param 0.24% : 0.000001s : 4: predicate.reset_defer_inline 1.22% : 0.000003s : 13: predicate.reshape_eliminate 0.59% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 4: predicate.row_tensor_eliminate 0.84% : 0.000002s : 8: predicate.same_eliminate 0.41% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 8: predicate.shard_identity_eliminate 0.62% : 0.000001s : 8: predicate.special_op_eliminate 0.62% : 0.000001s : 8: predicate.specialize_transform 0.75% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.49% : 0.000003s : 21: predicate.switch_defer_inline 1.91% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.02% : 0.000011s : 67: predicate.switch_simplify 0.82% : 0.000002s : 13: predicate.tile_eliminate 0.88% : 0.000002s : 13: predicate.transpose_eliminate 1.43% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.45% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.58% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.28% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.55% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.42% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.65% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.28% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.97% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.55% : 0.000001s : 4: predicate.value_based_eliminate 0.64% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.53% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000592 11 55.40% : 0.000328s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.60% : 0.000264s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.232732 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.53% : 0.003568s : 1: add_attr 1.53% : 0.003554s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.03% : 0.000069s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.03% : 0.000072s : 1: auto_monad 0.01% : 0.000022s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.21% : 0.000495s : 1: bootstrap 0.01% : 0.000034s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000019s : 1: control_data_broadcast_order 0.00% : 0.000010s : 1: convert_after_rewriter 0.01% : 0.000028s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000010s : 1: environ_conv 0.01% : 0.000028s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000009s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 9.95% : 0.023157s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.29% : 0.000684s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000041s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000016s : 1: opt.transform.mutable_eliminate 0.55% : 0.001281s : 78: opt.transform.opt_a 0.02% : 0.000036s : 1: opt.transform.opt_after_cconv 0.01% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000102s : 28: opt.transform.opt_b 0.03% : 0.000062s : 2: opt.transform.opt_trans_graph 0.02% : 0.000041s : 4: opt.transform.symbol_engine_opt 1.28% : 0.002989s : 1: opt_a 0.07% : 0.000162s : 1: opt_after_cconv 0.26% : 0.000605s : 1: opt_after_jit_grad 0.09% : 0.000214s : 1: opt_b 12.05% : 0.028054s : 1: optimize 0.01% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000009s : 1: order_py_execute_after_rewriter 0.01% : 0.000025s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000045s : 1: pre_auto_parallel 0.01% : 0.000032s : 1: py_interpret_to_execute 0.01% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000021s : 1: remove_dup_value 0.19% : 0.000437s : 1: renormalize.infer 0.15% : 0.000355s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000042s : 1: rewriter_after_opt_a 0.04% : 0.000097s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000006s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000088s : 1: symbol_engine_optimizer 0.04% : 0.000096s : 1: tuple_transform 71.25% : 0.165824s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:07.975.051 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:07.975.330 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0401363, [21] [bootstrap]: 0.00049377 [type_inference]: 0.00601268 [event_method]: 1.889e-05 [auto_monad]: 6.424e-05 [graph_reusing]: 6.19999e-06 [inline]: 2.31e-06 [add_attr]: 0.00338109, [1] [add_attr_with_inline]: 0.00337005, [1] [Cycle 1]: 8.12e-05, [2] [tag_attr]: 2.061e-05 [meta_addattr_fg_expand]: 5.81e-06 [parallel-infer-symbol]: 3.66001e-06 [pre_auto_parallel]: 3.77e-05 [insert-virtual-dataset]: 2.64999e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 2.14e-06 [pipeline_split]: 1.73002e-06 [optimize]: 0.0288243, [53] [py_interpret_to_execute]: 3.158e-05 [rewriter_before_opt_a]: 8.896e-05 [opt_a]: 0.0260824, [2] [Cycle 1]: 0.00239052, [45] [expand_dump_flag]: 3.26001e-06 [switch_simplify]: 4.34e-05 [loop_unroll]: 3.139e-05 [a_1]: 0.00067213 [with_stream_mark]: 1.922e-05 [recompute_prepare]: 1.032e-05 [updatestate_depend_eliminate]: 4.16001e-06 [updatestate_assign_eliminate]: 3.45e-06 [updatestate_loads_eliminate]: 3.56001e-06 [parameter_eliminate]: 1.99e-06 [a_2]: 0.00011692 [accelerated_algorithm]: 7.83999e-06 [shard]: 2.12001e-06 [meta_shard_fg_expand]: 1.97999e-06 [shard_inline]: 6.90002e-06 [merge_send_recv]: 9.99001e-06 [auto_parallel]: 7.07002e-06 [parallel]: 2.102e-05 [flash_sp]: 9.58002e-06 [merge_comm]: 3.8e-06 [allreduce_fusion]: 3.6e-06 [matmul_add_comm_reduction]: 9.72999e-06 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 7.95e-06 [virtual_dataset]: 7.3e-06 [get_grad_eliminate_]: 6.95998e-06 [virtual_output]: 6.61e-06 [merge_forward]: 4.22e-06 [cell_reuse_recompute_pass]: 1.55001e-06 [offload_activation]: 1.066e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.466e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.048e-05 [set_forward_comm_id_for_comm_node_pass]: 3.98001e-06 [meta_fg_expand]: 1.799e-05 [flash_sp_send_recv_attached]: 3.35998e-06 [receive_attached]: 2.51e-06 [after_resolve]: 1.347e-05 [a_after_grad]: 1.124e-05 [renormalize]: 0.00075808 [add_forward_monad_depend]: 6.11e-06 [auto_monad_grad]: 2.83e-06 [auto_monad_eliminator]: 1.748e-05 [cse]: 2.957e-05 [a_3]: 6.523e-05 [Cycle 2]: 0.0236727, [45] [expand_dump_flag]: 1.69e-06 [switch_simplify]: 7.81001e-06 [loop_unroll]: 6.26e-06 [a_1]: 0.00014544 [with_stream_mark]: 1.323e-05 [recompute_prepare]: 6.64001e-06 [updatestate_depend_eliminate]: 3.3e-06 [updatestate_assign_eliminate]: 2.39001e-06 [updatestate_loads_eliminate]: 2.66999e-06 [parameter_eliminate]: 1.56002e-06 [a_2]: 0.00010702 [accelerated_algorithm]: 6.94999e-06 [shard]: 1.27999e-06 [meta_shard_fg_expand]: 1.47999e-06 [shard_inline]: 6.61e-06 [merge_send_recv]: 5.70001e-06 [auto_parallel]: 7.44002e-06 [parallel]: 6.84999e-06 [flash_sp]: 3.62998e-06 [merge_comm]: 3.94002e-06 [allreduce_fusion]: 3.23e-06 [matmul_add_comm_reduction]: 7.18998e-06 [allreduce_slice_to_reducescatter]: 7.40023e-07 [virtual_shard_identity]: 7.11999e-06 [virtual_dataset]: 6.06e-06 [get_grad_eliminate_]: 6.01998e-06 [virtual_output]: 7e-06 [merge_forward]: 3.63e-06 [cell_reuse_recompute_pass]: 1.162e-05 [offload_activation]: 4.055e-05 [cell_reuse_handle_not_recompute_node_pass]: 5.409e-05 [merge_recompute_call_nodes]: 3.4e-06 [before_grad]: 1.621e-05 [set_forward_comm_id_for_comm_node_pass]: 1.134e-05 [meta_fg_expand]: 4.94e-06 [flash_sp_send_recv_attached]: 3.21999e-06 [receive_attached]: 2.58003e-06 [after_resolve]: 1.889e-05 [a_after_grad]: 1.231e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 8.17e-06 [auto_monad_grad]: 3.46999e-06 [auto_monad_eliminator]: 2.113e-05 [cse]: 3.501e-05 [a_3]: 5.953e-05 [py_interpret_to_execute_after_opt_a]: 2.285e-05 [slice_cell_reuse_recomputed_activation]: 4.53001e-06 [rewriter_after_opt_a]: 7.099e-05 [convert_after_rewriter]: 1.101e-05 [order_py_execute_after_rewriter]: 8.37e-06 [mutable_eliminate]: 0.0007886 [opt_b]: 0.00030277, [1] [Cycle 1]: 0.00028931, [7] [b_1]: 0.00018206 [b_2]: 8.64998e-06 [updatestate_depend_eliminate]: 7.65998e-06 [updatestate_assign_eliminate]: 3.21999e-06 [updatestate_loads_eliminate]: 3.65e-06 [renormalize]: 8.79983e-07 [cse]: 2.449e-05 [optimize_parallel_all_gather_comm]: 2.319e-05 [overlap_param_gather]: 5.51e-06 [cconv]: 3.968e-05 [loop_unroll]: 0.00048077 [opt_after_cconv]: 0.00013826, [1] [Cycle 1]: 0.00012857, [7] [c_1]: 3.289e-05 [parameter_eliminate]: 4.42e-06 [updatestate_depend_eliminate]: 6.07999e-06 [updatestate_assign_eliminate]: 2.56e-06 [updatestate_loads_eliminate]: 2.93e-06 [cse]: 2.1e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 2.118e-05 [tuple_transform]: 9.932e-05, [1] [Cycle 1]: 8.995e-05, [4] [d_1]: 4.97e-05 [none_parameter_eliminate]: 1.97001e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 7.28e-06 [partial_unused_args_eliminate]: 4.4e-06 [add_recomputation]: 5.698e-05 [cse_after_recomputation]: 2.979e-05, [1] [Cycle 1]: 2.209e-05, [1] [cse]: 1.295e-05 [environ_conv]: 8.82999e-06 [swap_dp_allreduce_reducescatter]: 8.04002e-06 [bias_add_comm_swap]: 5.54e-06 [label_micro_interleaved_index]: 6.69001e-06 [label_fine_grained_interleaved_index]: 5.12e-06 [merge_cast_opt]: 3.81001e-06 [slice_recompute_activation]: 4.47e-06 [micro_interleaved_order_control]: 5.12999e-06 [assign_add_opt]: 3.51001e-06 [ForceFp32Comm]: 3.35e-06 [remove_cast_before_assign_add]: 3.63999e-06 [full_micro_interleaved_order_control]: 4.73001e-06 [reorder_send_recv_between_fp_bp]: 5.32001e-06 [comm_op_add_attrs]: 3.59002e-06 [add_comm_op_reuse_tag]: 3.43e-06 [interleave_split_concat_branches]: 3.53e-06 [interleave_parallel_branches]: 3.49001e-06 [overlap_opt_shard_in_pipeline]: 3.56001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.45999e-06 [control_data_broadcast_order]: 1.671e-05 [grouped_pairwise_exchange_alltoall]: 4.77e-06 [offloading_packed_experts]: 7.93001e-06 [overlap_recompute_and_grad_model_parallel]: 7.78999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.53999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.73999e-06 [overlap_recompute_comm]: 4.75999e-06 [overlap_grad_ring_attention]: 7.15998e-06 [overlap_grad_flash_sp]: 2.486e-05 [begin_end_overlap_inline]: 3.56999e-06 [split_matmul_comm_elemetwise]: 4.58999e-06 [split_layernorm_comm]: 4.21001e-06 [handle_group_info]: 3.43999e-06 [symbol_engine_optimizer]: 0.00010199, [1] [Cycle 1]: 9.374e-05, [6] [build]: 3.63e-06 [elim_shapecalc]: 1.067e-05 [elim_not_effective]: 1.366e-05 [opt_reshape]: 8.07e-06 [fold_const_symbol]: 1.114e-05 [renormalize]: 1.90019e-07 [detach_backward]: 3.75e-06 [pipeline_parallel_scheduler]: 1.68002e-06 [auto_monad_reorder]: 2.109e-05 [get_jit_bprop_graph]: 2.21e-06 [rewriter_after_jit_bprop_graph]: 6.19999e-06 [opt_after_jit_grad]: 0.00056146 [validate]: 4.466e-05 Sums bootstrap : 0.000494s : 4.02% type_inference : 0.006013s : 49.01% event_method : 0.000019s : 0.15% auto_monad : 0.000064s : 0.52% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000038s : 0.31% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000032s : 0.26% optimize.rewriter_before_opt_a : 0.000089s : 0.73% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000051s : 0.42% optimize.opt_a.loop_unroll : 0.000038s : 0.31% optimize.opt_a.a_1 : 0.000818s : 6.66% optimize.opt_a.with_stream_mark : 0.000032s : 0.26% optimize.opt_a.recompute_prepare : 0.000017s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000224s : 1.83% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.12% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.11% optimize.opt_a.merge_send_recv : 0.000016s : 0.13% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000028s : 0.23% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.12% optimize.opt_a.virtual_dataset : 0.000013s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.11% optimize.opt_a.virtual_output : 0.000014s : 0.11% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000013s : 0.11% optimize.opt_a.offload_activation : 0.000051s : 0.42% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000069s : 0.56% optimize.opt_a.merge_recompute_call_nodes : 0.000005s : 0.04% optimize.opt_a.before_grad : 0.000027s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000015s : 0.12% optimize.opt_a.meta_fg_expand : 0.000023s : 0.19% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.05% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000032s : 0.26% optimize.opt_a.a_after_grad : 0.000024s : 0.19% optimize.opt_a.renormalize : 0.000758s : 6.18% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.12% optimize.opt_a.auto_monad_grad : 0.000006s : 0.05% optimize.opt_a.auto_monad_eliminator : 0.000039s : 0.31% optimize.opt_a.cse : 0.000065s : 0.53% optimize.opt_a.a_3 : 0.000125s : 1.02% optimize.py_interpret_to_execute_after_opt_a : 0.000023s : 0.19% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000071s : 0.58% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000789s : 6.43% optimize.opt_b.b_1 : 0.000182s : 1.48% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000024s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.19% optimize.overlap_param_gather : 0.000006s : 0.04% optimize.cconv : 0.000040s : 0.32% optimize.loop_unroll : 0.000481s : 3.92% optimize.opt_after_cconv.c_1 : 0.000033s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000021s : 0.17% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000021s : 0.17% optimize.tuple_transform.d_1 : 0.000050s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000057s : 0.46% optimize.cse_after_recomputation.cse : 0.000013s : 0.11% optimize.environ_conv : 0.000009s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.04% optimize.offloading_packed_experts : 0.000008s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000025s : 0.20% optimize.begin_end_overlap_inline : 0.000004s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000561s : 4.58% validate : 0.000045s : 0.36% Time group info: ------[substitution.] 0.000238 34 14.56% : 0.000035s : 6: substitution.arithmetic_simplify 0.74% : 0.000002s : 2: substitution.elim_not_effective 0.59% : 0.000001s : 2: substitution.fold_const_symbol 2.69% : 0.000006s : 4: substitution.graph_param_transform 65.38% : 0.000155s : 4: substitution.inline 2.26% : 0.000005s : 4: substitution.j_node_and_user_rematch 3.18% : 0.000008s : 4: substitution.remove_not_recompute_node 4.05% : 0.000010s : 4: substitution.replace_old_param 6.55% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005956 2 87.99% : 0.005241s : 1: type_inference.infer 12.01% : 0.000715s : 1: type_inference.specialize ------[replace.] 0.000064 8 61.75% : 0.000040s : 4: replace.inline 38.25% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000166 8 91.76% : 0.000152s : 4: match.inline 8.24% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000224 1278 0.89% : 0.000002s : 13: predicate.accumulaten_eliminater 0.89% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.63% : 0.000001s : 8: predicate.addn_check_dump 0.94% : 0.000002s : 13: predicate.addn_zero_filter 0.74% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.21% : 0.000005s : 21: predicate.arithmetic_simplify 1.07% : 0.000002s : 13: predicate.cast_eliminate 0.81% : 0.000002s : 8: predicate.check_bprop_eliminate 0.56% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.70% : 0.000002s : 8: predicate.depend_value_elim 0.91% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.91% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.05% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 4: predicate.elim_not_effective 0.52% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_depend_swap 1.55% : 0.000003s : 25: predicate.environ_get_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.33% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.43% : 0.000005s : 21: predicate.float_depend_g_call 0.59% : 0.000001s : 8: predicate.float_environ_get_switch 0.72% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.80% : 0.000002s : 8: predicate.get_grad_eliminate 0.29% : 0.000001s : 4: predicate.graph_param_transform 0.53% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.38% : 0.000014s : 58: predicate.inline 1.20% : 0.000003s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.73% : 0.000002s : 8: predicate.less_batch_normalization 1.65% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.33% : 0.000005s : 38: predicate.load_eliminater 1.10% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.31% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.80% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 8: predicate.merge_addn 0.69% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.77% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 13: predicate.minmaximum_grad 1.13% : 0.000003s : 4: predicate.mutable_eliminate 0.38% : 0.000001s : 4: predicate.opt_reshape 0.43% : 0.000001s : 4: predicate.parallel_virtual_node 1.64% : 0.000004s : 21: predicate.partial_defer_inline 1.52% : 0.000003s : 21: predicate.partial_eliminate 0.85% : 0.000002s : 13: predicate.print_const_string_wrapper 0.63% : 0.000001s : 8: predicate.reduce_all_const_elim 1.12% : 0.000003s : 13: predicate.reduce_eliminate 2.40% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.73% : 0.000002s : 8: predicate.remove_not_recompute_node 1.32% : 0.000003s : 25: predicate.replace_applicator 0.56% : 0.000001s : 8: predicate.replace_old_param 0.35% : 0.000001s : 4: predicate.reset_defer_inline 0.81% : 0.000002s : 13: predicate.reshape_eliminate 0.73% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 4: predicate.row_tensor_eliminate 1.10% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 8: predicate.shard_identity_eliminate 0.78% : 0.000002s : 8: predicate.special_op_eliminate 0.76% : 0.000002s : 8: predicate.specialize_transform 1.38% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.39% : 0.000003s : 21: predicate.switch_defer_inline 1.93% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.68% : 0.000010s : 67: predicate.switch_simplify 0.80% : 0.000002s : 13: predicate.tile_eliminate 0.89% : 0.000002s : 13: predicate.transpose_eliminate 1.81% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.47% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.60% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.40% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.08% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.90% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.26% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.02% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 4: predicate.value_based_eliminate 0.68% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.64% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000585 11 54.29% : 0.000318s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.71% : 0.000267s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.074566 192 0.01% : 0.000006s : 1: ForceFp32Comm 4.55% : 0.003391s : 1: add_attr 4.52% : 0.003374s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.08% : 0.000061s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.10% : 0.000073s : 1: auto_monad 0.04% : 0.000028s : 1: auto_monad_reorder 0.01% : 0.000007s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.73% : 0.000542s : 1: bootstrap 0.06% : 0.000043s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.03% : 0.000020s : 1: control_data_broadcast_order 0.02% : 0.000014s : 1: convert_after_rewriter 0.04% : 0.000033s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000021s : 1: detach_backward 0.02% : 0.000012s : 1: environ_conv 0.04% : 0.000029s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000009s : 1: label_micro_interleaved_index 0.65% : 0.000487s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.07% : 0.000796s : 1: mutable_eliminate 0.02% : 0.000011s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000018s : 1: opt.transform.mutable_eliminate 1.76% : 0.001314s : 78: opt.transform.opt_a 0.04% : 0.000031s : 1: opt.transform.opt_after_cconv 0.04% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.15% : 0.000112s : 28: opt.transform.opt_b 0.07% : 0.000055s : 2: opt.transform.opt_trans_graph 0.05% : 0.000039s : 4: opt.transform.symbol_engine_opt 34.98% : 0.026087s : 1: opt_a 0.19% : 0.000142s : 1: opt_after_cconv 0.77% : 0.000572s : 1: opt_after_jit_grad 0.41% : 0.000306s : 1: opt_b 39.18% : 0.029215s : 1: optimize 0.04% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000011s : 1: order_py_execute_after_rewriter 0.04% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000008s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.06% : 0.000045s : 1: pre_auto_parallel 0.05% : 0.000036s : 1: py_interpret_to_execute 0.04% : 0.000026s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000008s : 1: remove_cast_before_assign_add 0.03% : 0.000024s : 1: remove_dup_value 0.57% : 0.000422s : 1: renormalize.infer 0.44% : 0.000327s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000075s : 1: rewriter_after_opt_a 0.13% : 0.000093s : 1: rewriter_before_opt_a 0.01% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.14% : 0.000105s : 1: symbol_engine_optimizer 0.14% : 0.000102s : 1: tuple_transform 8.12% : 0.006052s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:10.266.362 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0698817, [21] [bootstrap]: 0.00045562 [type_inference]: 0.0589231 [event_method]: 2.361e-05 [auto_monad]: 7.025e-05 [graph_reusing]: 6.46999e-06 [inline]: 3.09001e-06 [add_attr]: 0.00399773, [1] [add_attr_with_inline]: 0.00398399, [1] [Cycle 1]: 8.266e-05, [2] [tag_attr]: 2.574e-05 [meta_addattr_fg_expand]: 5.92999e-06 [parallel-infer-symbol]: 3.86001e-06 [pre_auto_parallel]: 4.342e-05 [insert-virtual-dataset]: 2.54999e-06 [parallel-infer-symbol-second]: 7.80012e-07 [dataset_repeat_opt]: 2.04e-06 [pipeline_split]: 1.84e-06 [optimize]: 0.00557889, [53] [py_interpret_to_execute]: 3.258e-05 [rewriter_before_opt_a]: 9.736e-05 [opt_a]: 0.00309722, [2] [Cycle 1]: 0.00236036, [45] [expand_dump_flag]: 2.95002e-06 [switch_simplify]: 4.511e-05 [loop_unroll]: 3.074e-05 [a_1]: 0.00069709 [with_stream_mark]: 2.384e-05 [recompute_prepare]: 1.134e-05 [updatestate_depend_eliminate]: 3.83999e-06 [updatestate_assign_eliminate]: 3.43e-06 [updatestate_loads_eliminate]: 3.04001e-06 [parameter_eliminate]: 1.97999e-06 [a_2]: 9.165e-05 [accelerated_algorithm]: 7.93001e-06 [shard]: 1.74e-06 [meta_shard_fg_expand]: 2.24001e-06 [shard_inline]: 6.72002e-06 [merge_send_recv]: 8.99e-06 [auto_parallel]: 8.08001e-06 [parallel]: 2.626e-05 [flash_sp]: 9.74e-06 [merge_comm]: 4.18999e-06 [allreduce_fusion]: 3.65e-06 [matmul_add_comm_reduction]: 1.102e-05 [allreduce_slice_to_reducescatter]: 7.40023e-07 [virtual_shard_identity]: 1.005e-05 [virtual_dataset]: 7.3e-06 [get_grad_eliminate_]: 7.1e-06 [virtual_output]: 7.86001e-06 [merge_forward]: 4.37e-06 [cell_reuse_recompute_pass]: 1.50001e-06 [offload_activation]: 1.097e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.257e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.196e-05 [set_forward_comm_id_for_comm_node_pass]: 3.6e-06 [meta_fg_expand]: 2.98e-06 [flash_sp_send_recv_attached]: 2.58998e-06 [receive_attached]: 2.92002e-06 [after_resolve]: 1.307e-05 [a_after_grad]: 1.206e-05 [renormalize]: 0.00085601 [add_forward_monad_depend]: 7.48e-06 [auto_monad_grad]: 2.51e-06 [auto_monad_eliminator]: 1.848e-05 [cse]: 3.24e-05 [a_3]: 5.427e-05 [Cycle 2]: 0.00072399, [45] [expand_dump_flag]: 2.07999e-06 [switch_simplify]: 9.11998e-06 [loop_unroll]: 6.68e-06 [a_1]: 0.00014946 [with_stream_mark]: 1.611e-05 [recompute_prepare]: 6.59001e-06 [updatestate_depend_eliminate]: 3.77998e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 3.04001e-06 [parameter_eliminate]: 1.66002e-06 [a_2]: 8.016e-05 [accelerated_algorithm]: 6.61999e-06 [shard]: 1.78002e-06 [meta_shard_fg_expand]: 1.96998e-06 [shard_inline]: 5.94e-06 [merge_send_recv]: 8.27998e-06 [auto_parallel]: 6.04999e-06 [parallel]: 7.11999e-06 [flash_sp]: 3.26001e-06 [merge_comm]: 3.52002e-06 [allreduce_fusion]: 3.44001e-06 [matmul_add_comm_reduction]: 7.65e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 7.3e-06 [virtual_dataset]: 6.06e-06 [get_grad_eliminate_]: 6.44999e-06 [virtual_output]: 5.92999e-06 [merge_forward]: 3.97e-06 [cell_reuse_recompute_pass]: 3.03e-06 [offload_activation]: 8.77999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.243e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.005e-05 [set_forward_comm_id_for_comm_node_pass]: 4.18999e-06 [meta_fg_expand]: 2.48e-06 [flash_sp_send_recv_attached]: 1.27e-06 [receive_attached]: 1.94999e-06 [after_resolve]: 1.383e-05 [a_after_grad]: 1.046e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.82999e-06 [auto_monad_grad]: 1.59e-06 [auto_monad_eliminator]: 9.34998e-06 [cse]: 1.556e-05 [a_3]: 3.655e-05 [py_interpret_to_execute_after_opt_a]: 1.432e-05 [slice_cell_reuse_recomputed_activation]: 2.24001e-06 [rewriter_after_opt_a]: 6.673e-05 [convert_after_rewriter]: 8.94e-06 [order_py_execute_after_rewriter]: 5.46e-06 [mutable_eliminate]: 0.00075287 [opt_b]: 0.00022069, [1] [Cycle 1]: 0.00021268, [7] [b_1]: 0.00013068 [b_2]: 9.05001e-06 [updatestate_depend_eliminate]: 8.53001e-06 [updatestate_assign_eliminate]: 2.85998e-06 [updatestate_loads_eliminate]: 2.37001e-06 [renormalize]: 5.69999e-07 [cse]: 2.163e-05 [optimize_parallel_all_gather_comm]: 1.968e-05 [overlap_param_gather]: 5.78002e-06 [cconv]: 3.422e-05 [loop_unroll]: 0.00047844 [opt_after_cconv]: 0.00011393, [1] [Cycle 1]: 0.00010795, [7] [c_1]: 3.44e-05 [parameter_eliminate]: 5.35999e-06 [updatestate_depend_eliminate]: 6.69999e-06 [updatestate_assign_eliminate]: 2.53998e-06 [updatestate_loads_eliminate]: 2.34001e-06 [cse]: 1.855e-05 [renormalize]: 2.70025e-07 [remove_dup_value]: 1.398e-05 [tuple_transform]: 8.67e-05, [1] [Cycle 1]: 8.178e-05, [4] [d_1]: 5.195e-05 [none_parameter_eliminate]: 2.38002e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 7.76001e-06 [partial_unused_args_eliminate]: 1.99999e-06 [add_recomputation]: 5.556e-05 [cse_after_recomputation]: 2.378e-05, [1] [Cycle 1]: 1.828e-05, [1] [cse]: 1.222e-05 [environ_conv]: 6.58003e-06 [swap_dp_allreduce_reducescatter]: 5.71e-06 [bias_add_comm_swap]: 3.24001e-06 [label_micro_interleaved_index]: 4.76002e-06 [label_fine_grained_interleaved_index]: 3.21001e-06 [merge_cast_opt]: 1.54e-06 [slice_recompute_activation]: 2.51e-06 [micro_interleaved_order_control]: 2.51e-06 [assign_add_opt]: 1.59e-06 [ForceFp32Comm]: 8.89995e-07 [remove_cast_before_assign_add]: 1.19e-06 [full_micro_interleaved_order_control]: 2.19001e-06 [reorder_send_recv_between_fp_bp]: 3.3e-06 [comm_op_add_attrs]: 1.12e-06 [add_comm_op_reuse_tag]: 1.01997e-06 [interleave_split_concat_branches]: 1.20001e-06 [interleave_parallel_branches]: 1.48002e-06 [overlap_opt_shard_in_pipeline]: 3.88999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.82001e-06 [control_data_broadcast_order]: 1.401e-05 [grouped_pairwise_exchange_alltoall]: 1.59e-06 [offloading_packed_experts]: 3.98999e-06 [overlap_recompute_and_grad_model_parallel]: 5.24e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.50001e-06 [overlap_recompute_comm]: 2.37999e-06 [overlap_grad_ring_attention]: 4.54002e-06 [overlap_grad_flash_sp]: 2.464e-05 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 2.30002e-06 [split_layernorm_comm]: 1.72001e-06 [handle_group_info]: 9.70002e-07 [symbol_engine_optimizer]: 8.526e-05, [1] [Cycle 1]: 8.074e-05, [6] [build]: 4.11001e-06 [elim_shapecalc]: 1.155e-05 [elim_not_effective]: 1.517e-05 [opt_reshape]: 8.09002e-06 [fold_const_symbol]: 1.114e-05 [renormalize]: 2.50002e-07 [detach_backward]: 2.19999e-06 [pipeline_parallel_scheduler]: 1.90001e-06 [auto_monad_reorder]: 1.69e-05 [get_jit_bprop_graph]: 2.29001e-06 [rewriter_after_jit_bprop_graph]: 5.40001e-06 [opt_after_jit_grad]: 0.000522 [validate]: 4.644e-05 Sums bootstrap : 0.000456s : 0.70% type_inference : 0.058923s : 90.88% event_method : 0.000024s : 0.04% auto_monad : 0.000070s : 0.11% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000043s : 0.07% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.05% optimize.rewriter_before_opt_a : 0.000097s : 0.15% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000054s : 0.08% optimize.opt_a.loop_unroll : 0.000037s : 0.06% optimize.opt_a.a_1 : 0.000847s : 1.31% optimize.opt_a.with_stream_mark : 0.000040s : 0.06% optimize.opt_a.recompute_prepare : 0.000018s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000172s : 0.27% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.02% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.02% optimize.opt_a.merge_send_recv : 0.000017s : 0.03% optimize.opt_a.auto_parallel : 0.000014s : 0.02% optimize.opt_a.parallel : 0.000033s : 0.05% optimize.opt_a.flash_sp : 0.000013s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.03% optimize.opt_a.virtual_dataset : 0.000013s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.02% optimize.opt_a.virtual_output : 0.000014s : 0.02% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000020s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000027s : 0.04% optimize.opt_a.a_after_grad : 0.000023s : 0.03% optimize.opt_a.renormalize : 0.000856s : 1.32% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.04% optimize.opt_a.cse : 0.000048s : 0.07% optimize.opt_a.a_3 : 0.000091s : 0.14% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000067s : 0.10% optimize.convert_after_rewriter : 0.000009s : 0.01% optimize.order_py_execute_after_rewriter : 0.000005s : 0.01% optimize.mutable_eliminate : 0.000753s : 1.16% optimize.opt_b.b_1 : 0.000131s : 0.20% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.03% optimize.overlap_param_gather : 0.000006s : 0.01% optimize.cconv : 0.000034s : 0.05% optimize.loop_unroll : 0.000478s : 0.74% optimize.opt_after_cconv.c_1 : 0.000034s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.02% optimize.tuple_transform.d_1 : 0.000052s : 0.08% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000056s : 0.09% optimize.cse_after_recomputation.cse : 0.000012s : 0.02% optimize.environ_conv : 0.000007s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000025s : 0.04% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000017s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000522s : 0.81% validate : 0.000046s : 0.07% Time group info: ------[substitution.] 0.000258 34 14.87% : 0.000038s : 6: substitution.arithmetic_simplify 0.78% : 0.000002s : 2: substitution.elim_not_effective 0.72% : 0.000002s : 2: substitution.fold_const_symbol 2.58% : 0.000007s : 4: substitution.graph_param_transform 68.56% : 0.000177s : 4: substitution.inline 1.98% : 0.000005s : 4: substitution.j_node_and_user_rematch 1.85% : 0.000005s : 4: substitution.remove_not_recompute_node 2.46% : 0.000006s : 4: substitution.replace_old_param 6.20% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.058849 2 98.61% : 0.058033s : 1: type_inference.infer 1.39% : 0.000816s : 1: type_inference.specialize ------[replace.] 0.000065 8 62.17% : 0.000041s : 4: replace.inline 37.83% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000188 8 92.50% : 0.000174s : 4: match.inline 7.50% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000221 1278 0.99% : 0.000002s : 13: predicate.accumulaten_eliminater 0.79% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 8: predicate.addn_check_dump 0.86% : 0.000002s : 13: predicate.addn_zero_filter 0.77% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.40% : 0.000005s : 21: predicate.arithmetic_simplify 1.06% : 0.000002s : 13: predicate.cast_eliminate 0.61% : 0.000001s : 8: predicate.check_bprop_eliminate 0.56% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.57% : 0.000001s : 8: predicate.depend_value_elim 0.82% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.91% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.82% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.05% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 4: predicate.elim_not_effective 0.42% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_depend_swap 1.72% : 0.000004s : 25: predicate.environ_get_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.44% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.67% : 0.000006s : 21: predicate.float_depend_g_call 0.59% : 0.000001s : 8: predicate.float_environ_get_switch 0.76% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.74% : 0.000002s : 8: predicate.get_grad_eliminate 0.28% : 0.000001s : 4: predicate.graph_param_transform 0.53% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.43% : 0.000014s : 58: predicate.inline 0.97% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.93% : 0.000002s : 8: predicate.less_batch_normalization 1.84% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.43% : 0.000005s : 38: predicate.load_eliminater 0.91% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.36% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.62% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.61% : 0.000001s : 8: predicate.merge_addn 0.61% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.75% : 0.000002s : 13: predicate.minmaximum_grad 1.29% : 0.000003s : 4: predicate.mutable_eliminate 0.42% : 0.000001s : 4: predicate.opt_reshape 0.34% : 0.000001s : 4: predicate.parallel_virtual_node 1.67% : 0.000004s : 21: predicate.partial_defer_inline 1.54% : 0.000003s : 21: predicate.partial_eliminate 0.81% : 0.000002s : 13: predicate.print_const_string_wrapper 0.48% : 0.000001s : 8: predicate.reduce_all_const_elim 1.12% : 0.000002s : 13: predicate.reduce_eliminate 2.45% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 8: predicate.remove_not_recompute_node 1.27% : 0.000003s : 25: predicate.replace_applicator 0.50% : 0.000001s : 8: predicate.replace_old_param 0.46% : 0.000001s : 4: predicate.reset_defer_inline 0.84% : 0.000002s : 13: predicate.reshape_eliminate 0.56% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.67% : 0.000001s : 4: predicate.row_tensor_eliminate 0.90% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.92% : 0.000002s : 8: predicate.shard_identity_eliminate 0.80% : 0.000002s : 8: predicate.special_op_eliminate 0.64% : 0.000001s : 8: predicate.specialize_transform 0.93% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 1.17% : 0.000003s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.44% : 0.000003s : 21: predicate.switch_defer_inline 1.92% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.16% : 0.000011s : 67: predicate.switch_simplify 0.85% : 0.000002s : 13: predicate.tile_eliminate 0.81% : 0.000002s : 13: predicate.transpose_eliminate 1.49% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.29% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.56% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.13% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.82% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.30% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.03% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.69% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.84% : 0.000002s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000650 11 52.39% : 0.000340s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.61% : 0.000309s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.081791 192 0.00% : 0.000004s : 1: ForceFp32Comm 4.90% : 0.004004s : 1: add_attr 4.88% : 0.003989s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.07% : 0.000060s : 1: add_recomputation 0.01% : 0.000005s : 1: assign_add_opt 0.09% : 0.000077s : 1: auto_monad 0.03% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.60% : 0.000488s : 1: bootstrap 0.05% : 0.000038s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000018s : 1: control_data_broadcast_order 0.02% : 0.000013s : 1: convert_after_rewriter 0.03% : 0.000027s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.04% : 0.000031s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.60% : 0.000487s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.93% : 0.000764s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000045s : 1: opt.transform.mutable_eliminate 1.59% : 0.001303s : 78: opt.transform.opt_a 0.04% : 0.000033s : 1: opt.transform.opt_after_cconv 0.03% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.13% : 0.000106s : 28: opt.transform.opt_b 0.07% : 0.000058s : 2: opt.transform.opt_trans_graph 0.05% : 0.000042s : 4: opt.transform.symbol_engine_opt 3.79% : 0.003101s : 1: opt_a 0.14% : 0.000119s : 1: opt_after_cconv 0.65% : 0.000531s : 1: opt_after_jit_grad 0.28% : 0.000225s : 1: opt_b 6.83% : 0.005584s : 1: optimize 0.03% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.06% : 0.000047s : 1: pre_auto_parallel 0.04% : 0.000037s : 1: py_interpret_to_execute 0.02% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000018s : 1: remove_dup_value 0.57% : 0.000470s : 1: renormalize.infer 0.46% : 0.000375s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000072s : 1: rewriter_after_opt_a 0.13% : 0.000103s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000006s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.11% : 0.000088s : 1: symbol_engine_optimizer 0.11% : 0.000090s : 1: tuple_transform 72.07% : 0.058949s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:12.443.708 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:12.443.991 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0985233, [21] [bootstrap]: 0.00045761 [type_inference]: 0.0866699 [event_method]: 2.052e-05 [auto_monad]: 6.576e-05 [graph_reusing]: 5.97999e-06 [inline]: 2.51998e-06 [add_attr]: 0.0035189, [1] [add_attr_with_inline]: 0.00350762, [1] [Cycle 1]: 9.409e-05, [2] [tag_attr]: 2.332e-05 [meta_addattr_fg_expand]: 5.97999e-06 [parallel-infer-symbol]: 3.61001e-06 [pre_auto_parallel]: 4.145e-05 [insert-virtual-dataset]: 2.43998e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 1.74e-06 [pipeline_split]: 1.55999e-06 [optimize]: 0.00638184, [53] [py_interpret_to_execute]: 3.376e-05 [rewriter_before_opt_a]: 0.00011867 [opt_a]: 0.00358632, [2] [Cycle 1]: 0.002688, [45] [expand_dump_flag]: 3.53999e-06 [switch_simplify]: 4.389e-05 [loop_unroll]: 3.14e-05 [a_1]: 0.00088539 [with_stream_mark]: 2.328e-05 [recompute_prepare]: 1.017e-05 [updatestate_depend_eliminate]: 4.72998e-06 [updatestate_assign_eliminate]: 3.61999e-06 [updatestate_loads_eliminate]: 3.4e-06 [parameter_eliminate]: 2.04e-06 [a_2]: 0.00011896 [accelerated_algorithm]: 7.15003e-06 [shard]: 2.17999e-06 [meta_shard_fg_expand]: 2.31998e-06 [shard_inline]: 6.51e-06 [merge_send_recv]: 9.87999e-06 [auto_parallel]: 7.90998e-06 [parallel]: 2.048e-05 [flash_sp]: 9.25999e-06 [merge_comm]: 4.43001e-06 [allreduce_fusion]: 3.78001e-06 [matmul_add_comm_reduction]: 1.016e-05 [allreduce_slice_to_reducescatter]: 9.89996e-07 [virtual_shard_identity]: 8.94e-06 [virtual_dataset]: 7.19001e-06 [get_grad_eliminate_]: 6.58998e-06 [virtual_output]: 7.11999e-06 [merge_forward]: 3.8e-06 [cell_reuse_recompute_pass]: 1.62001e-06 [offload_activation]: 1.098e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.516e-05 [merge_recompute_call_nodes]: 1.43002e-06 [before_grad]: 1.095e-05 [set_forward_comm_id_for_comm_node_pass]: 4.53001e-06 [meta_fg_expand]: 3.6e-06 [flash_sp_send_recv_attached]: 3.01001e-06 [receive_attached]: 2.74001e-06 [after_resolve]: 1.224e-05 [a_after_grad]: 1.068e-05 [renormalize]: 0.000826 [add_forward_monad_depend]: 6.29999e-06 [auto_monad_grad]: 2.69999e-06 [auto_monad_eliminator]: 1.798e-05 [cse]: 2.858e-05 [a_3]: 6.38e-05 [Cycle 2]: 0.00088012, [45] [expand_dump_flag]: 2.12001e-06 [switch_simplify]: 8.22998e-06 [loop_unroll]: 6.56e-06 [a_1]: 0.00014784 [with_stream_mark]: 1.562e-05 [recompute_prepare]: 6.14999e-06 [updatestate_depend_eliminate]: 3.49001e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 2.39001e-06 [parameter_eliminate]: 1.37e-06 [a_2]: 0.00010495 [accelerated_algorithm]: 7.01001e-06 [shard]: 1.59998e-06 [meta_shard_fg_expand]: 1.50999e-06 [shard_inline]: 6.60997e-06 [merge_send_recv]: 5.51e-06 [auto_parallel]: 7.78001e-06 [parallel]: 6.30002e-06 [flash_sp]: 3.61001e-06 [merge_comm]: 3.41999e-06 [allreduce_fusion]: 3.29001e-06 [matmul_add_comm_reduction]: 1.113e-05 [allreduce_slice_to_reducescatter]: 8.39995e-07 [virtual_shard_identity]: 7.6e-06 [virtual_dataset]: 6.12001e-06 [get_grad_eliminate_]: 6.05002e-06 [virtual_output]: 5.77999e-06 [merge_forward]: 3.33e-06 [cell_reuse_recompute_pass]: 2.46e-06 [offload_activation]: 7.9e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.421e-05 [merge_recompute_call_nodes]: 1.10001e-06 [before_grad]: 9.91e-06 [set_forward_comm_id_for_comm_node_pass]: 4e-06 [meta_fg_expand]: 2.83998e-06 [flash_sp_send_recv_attached]: 1.27999e-06 [receive_attached]: 1.66002e-06 [after_resolve]: 1.18e-05 [a_after_grad]: 1.056e-05 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 2.26e-06 [auto_monad_grad]: 2.11998e-06 [auto_monad_eliminator]: 9.14e-06 [cse]: 1.586e-05 [a_3]: 4.953e-05 [py_interpret_to_execute_after_opt_a]: 1.64e-05 [slice_cell_reuse_recomputed_activation]: 4.90999e-06 [rewriter_after_opt_a]: 4.51e-05 [convert_after_rewriter]: 1.04e-05 [order_py_execute_after_rewriter]: 9.05999e-06 [mutable_eliminate]: 0.00080506 [opt_b]: 0.00029297, [1] [Cycle 1]: 0.00028034, [7] [b_1]: 0.00017214 [b_2]: 8.67998e-06 [updatestate_depend_eliminate]: 9.79e-06 [updatestate_assign_eliminate]: 3.32002e-06 [updatestate_loads_eliminate]: 2.84999e-06 [renormalize]: 6.80011e-07 [cse]: 2.425e-05 [optimize_parallel_all_gather_comm]: 2.425e-05 [overlap_param_gather]: 5.72999e-06 [cconv]: 3.762e-05 [loop_unroll]: 0.00051468 [opt_after_cconv]: 0.00014194, [1] [Cycle 1]: 0.00013138, [7] [c_1]: 3.368e-05 [parameter_eliminate]: 4.58999e-06 [updatestate_depend_eliminate]: 6.24999e-06 [updatestate_assign_eliminate]: 2.79001e-06 [updatestate_loads_eliminate]: 2.86999e-06 [cse]: 2.19e-05 [renormalize]: 3.9002e-07 [remove_dup_value]: 1.902e-05 [tuple_transform]: 9.931e-05, [1] [Cycle 1]: 9.187e-05, [4] [d_1]: 5.281e-05 [none_parameter_eliminate]: 1.60001e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 7.18e-06 [partial_unused_args_eliminate]: 4.62998e-06 [add_recomputation]: 5.614e-05 [cse_after_recomputation]: 2.808e-05, [1] [Cycle 1]: 2.113e-05, [1] [cse]: 1.177e-05 [environ_conv]: 9.20999e-06 [swap_dp_allreduce_reducescatter]: 8.48001e-06 [bias_add_comm_swap]: 5.82001e-06 [label_micro_interleaved_index]: 7.45e-06 [label_fine_grained_interleaved_index]: 6.16998e-06 [merge_cast_opt]: 4.30999e-06 [slice_recompute_activation]: 5.00001e-06 [micro_interleaved_order_control]: 5.38002e-06 [assign_add_opt]: 4.38001e-06 [ForceFp32Comm]: 4.07e-06 [remove_cast_before_assign_add]: 4.05e-06 [full_micro_interleaved_order_control]: 5.07e-06 [reorder_send_recv_between_fp_bp]: 5.80002e-06 [comm_op_add_attrs]: 3.66999e-06 [add_comm_op_reuse_tag]: 3.49001e-06 [interleave_split_concat_branches]: 3.78001e-06 [interleave_parallel_branches]: 3.48e-06 [overlap_opt_shard_in_pipeline]: 3.91999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.89998e-06 [control_data_broadcast_order]: 1.746e-05 [grouped_pairwise_exchange_alltoall]: 3.91999e-06 [offloading_packed_experts]: 7.1e-06 [overlap_recompute_and_grad_model_parallel]: 7.90998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.73001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.73001e-06 [overlap_recompute_comm]: 5.12999e-06 [overlap_grad_ring_attention]: 6.49999e-06 [overlap_grad_flash_sp]: 2.502e-05 [begin_end_overlap_inline]: 3.46999e-06 [split_matmul_comm_elemetwise]: 5.04e-06 [split_layernorm_comm]: 4.25e-06 [handle_group_info]: 3.41001e-06 [symbol_engine_optimizer]: 0.00010383, [1] [Cycle 1]: 9.643e-05, [6] [build]: 3.73001e-06 [elim_shapecalc]: 1.106e-05 [elim_not_effective]: 1.527e-05 [opt_reshape]: 7.62998e-06 [fold_const_symbol]: 1.107e-05 [renormalize]: 2.30008e-07 [detach_backward]: 4.36002e-06 [pipeline_parallel_scheduler]: 2.09e-06 [auto_monad_reorder]: 2.08e-05 [get_jit_bprop_graph]: 1.96e-06 [rewriter_after_jit_bprop_graph]: 5.68002e-06 [opt_after_jit_grad]: 0.00063918 [validate]: 4.212e-05 Sums bootstrap : 0.000458s : 0.49% type_inference : 0.086670s : 93.05% event_method : 0.000021s : 0.02% auto_monad : 0.000066s : 0.07% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000041s : 0.04% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000034s : 0.04% optimize.rewriter_before_opt_a : 0.000119s : 0.13% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000052s : 0.06% optimize.opt_a.loop_unroll : 0.000038s : 0.04% optimize.opt_a.a_1 : 0.001033s : 1.11% optimize.opt_a.with_stream_mark : 0.000039s : 0.04% optimize.opt_a.recompute_prepare : 0.000016s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000224s : 0.24% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000015s : 0.02% optimize.opt_a.auto_parallel : 0.000016s : 0.02% optimize.opt_a.parallel : 0.000027s : 0.03% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.02% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000019s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000024s : 0.03% optimize.opt_a.a_after_grad : 0.000021s : 0.02% optimize.opt_a.renormalize : 0.000826s : 0.89% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.03% optimize.opt_a.cse : 0.000044s : 0.05% optimize.opt_a.a_3 : 0.000113s : 0.12% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000045s : 0.05% optimize.convert_after_rewriter : 0.000010s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.01% optimize.mutable_eliminate : 0.000805s : 0.86% optimize.opt_b.b_1 : 0.000172s : 0.18% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.03% optimize.overlap_param_gather : 0.000006s : 0.01% optimize.cconv : 0.000038s : 0.04% optimize.loop_unroll : 0.000515s : 0.55% optimize.opt_after_cconv.c_1 : 0.000034s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000022s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.02% optimize.tuple_transform.d_1 : 0.000053s : 0.06% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000056s : 0.06% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000003s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000006s : 0.01% optimize.overlap_grad_flash_sp : 0.000025s : 0.03% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000639s : 0.69% validate : 0.000042s : 0.05% Time group info: ------[substitution.] 0.000428 34 9.33% : 0.000040s : 6: substitution.arithmetic_simplify 0.55% : 0.000002s : 2: substitution.elim_not_effective 0.32% : 0.000001s : 2: substitution.fold_const_symbol 1.75% : 0.000007s : 4: substitution.graph_param_transform 80.63% : 0.000345s : 4: substitution.inline 0.98% : 0.000004s : 4: substitution.j_node_and_user_rematch 1.29% : 0.000006s : 4: substitution.remove_not_recompute_node 1.29% : 0.000006s : 4: substitution.replace_old_param 3.86% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.086605 2 99.12% : 0.085846s : 1: type_inference.infer 0.88% : 0.000758s : 1: type_inference.specialize ------[replace.] 0.000075 8 66.51% : 0.000050s : 4: replace.inline 33.49% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000357 8 95.91% : 0.000342s : 4: match.inline 4.09% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000226 1278 0.81% : 0.000002s : 13: predicate.accumulaten_eliminater 0.88% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 8: predicate.addn_check_dump 1.03% : 0.000002s : 13: predicate.addn_zero_filter 0.77% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.56% : 0.000006s : 21: predicate.arithmetic_simplify 1.00% : 0.000002s : 13: predicate.cast_eliminate 0.57% : 0.000001s : 8: predicate.check_bprop_eliminate 0.53% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.61% : 0.000001s : 8: predicate.depend_value_elim 0.80% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.14% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.17% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.35% : 0.000001s : 4: predicate.elim_not_effective 0.42% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_depend_swap 1.56% : 0.000004s : 25: predicate.environ_get_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.40% : 0.000005s : 21: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.74% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.60% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.33% : 0.000014s : 58: predicate.inline 0.92% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.07% : 0.000002s : 8: predicate.less_batch_normalization 1.73% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.32% : 0.000005s : 38: predicate.load_eliminater 1.11% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.46% : 0.000006s : 34: predicate.loop_unroll_before_grad 1.63% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 8: predicate.merge_addn 0.54% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 13: predicate.minmaximum_grad 1.74% : 0.000004s : 4: predicate.mutable_eliminate 0.42% : 0.000001s : 4: predicate.opt_reshape 0.57% : 0.000001s : 4: predicate.parallel_virtual_node 1.96% : 0.000004s : 21: predicate.partial_defer_inline 1.49% : 0.000003s : 21: predicate.partial_eliminate 0.96% : 0.000002s : 13: predicate.print_const_string_wrapper 0.64% : 0.000001s : 8: predicate.reduce_all_const_elim 1.13% : 0.000003s : 13: predicate.reduce_eliminate 2.37% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 8: predicate.remove_not_recompute_node 1.24% : 0.000003s : 25: predicate.replace_applicator 0.49% : 0.000001s : 8: predicate.replace_old_param 0.35% : 0.000001s : 4: predicate.reset_defer_inline 0.93% : 0.000002s : 13: predicate.reshape_eliminate 0.73% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.73% : 0.000002s : 8: predicate.same_eliminate 0.38% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.85% : 0.000002s : 8: predicate.shard_identity_eliminate 0.68% : 0.000002s : 8: predicate.special_op_eliminate 0.78% : 0.000002s : 8: predicate.specialize_transform 0.86% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.47% : 0.000003s : 21: predicate.switch_defer_inline 1.95% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.91% : 0.000011s : 67: predicate.switch_simplify 0.97% : 0.000002s : 13: predicate.tile_eliminate 0.92% : 0.000002s : 13: predicate.transpose_eliminate 1.57% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.71% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.34% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.34% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.48% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.35% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.68% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.23% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.06% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 4: predicate.value_based_eliminate 0.63% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000622 11 56.44% : 0.000351s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.56% : 0.000271s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.110885 192 0.01% : 0.000007s : 1: ForceFp32Comm 3.18% : 0.003530s : 1: add_attr 3.17% : 0.003512s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.05% : 0.000060s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.07% : 0.000075s : 1: auto_monad 0.03% : 0.000029s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.45% : 0.000503s : 1: bootstrap 0.04% : 0.000041s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.02% : 0.000020s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.03% : 0.000031s : 1: cse_after_recomputation 0.01% : 0.000007s : 1: dataset_repeat_opt 0.02% : 0.000020s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.03% : 0.000032s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000012s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.47% : 0.000521s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.73% : 0.000813s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000021s : 1: opt.transform.mutable_eliminate 1.33% : 0.001479s : 78: opt.transform.opt_a 0.03% : 0.000032s : 1: opt.transform.opt_after_cconv 0.03% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000106s : 28: opt.transform.opt_b 0.05% : 0.000058s : 2: opt.transform.opt_trans_graph 0.04% : 0.000041s : 4: opt.transform.symbol_engine_opt 3.24% : 0.003590s : 1: opt_a 0.13% : 0.000146s : 1: opt_after_cconv 0.59% : 0.000650s : 1: opt_after_jit_grad 0.27% : 0.000297s : 1: opt_b 6.09% : 0.006751s : 1: optimize 0.02% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.03% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.04% : 0.000049s : 1: pre_auto_parallel 0.03% : 0.000038s : 1: py_interpret_to_execute 0.02% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.02% : 0.000022s : 1: remove_dup_value 0.41% : 0.000451s : 1: renormalize.infer 0.33% : 0.000366s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000049s : 1: rewriter_after_opt_a 0.11% : 0.000123s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000009s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.10% : 0.000107s : 1: symbol_engine_optimizer 0.09% : 0.000102s : 1: tuple_transform 78.21% : 0.086719s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:14.442.442 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.146184, [21] [bootstrap]: 0.00043738 [type_inference]: 0.00641319 [event_method]: 2.065e-05 [auto_monad]: 6.445e-05 [graph_reusing]: 5.92001e-06 [inline]: 2.26e-06 [add_attr]: 0.00356415, [1] [add_attr_with_inline]: 0.00355224, [1] [Cycle 1]: 7.442e-05, [2] [tag_attr]: 2.372e-05 [meta_addattr_fg_expand]: 6.22001e-06 [parallel-infer-symbol]: 4.42e-06 [pre_auto_parallel]: 4.215e-05 [insert-virtual-dataset]: 2.52001e-06 [parallel-infer-symbol-second]: 9.30013e-07 [dataset_repeat_opt]: 2.07999e-06 [pipeline_split]: 1.82001e-06 [optimize]: 0.134905, [53] [py_interpret_to_execute]: 3.12e-05 [rewriter_before_opt_a]: 9.525e-05 [opt_a]: 0.132473, [2] [Cycle 1]: 0.131687, [45] [expand_dump_flag]: 3.93999e-06 [switch_simplify]: 4.436e-05 [loop_unroll]: 3.103e-05 [a_1]: 0.00070282 [with_stream_mark]: 2.348e-05 [recompute_prepare]: 1.119e-05 [updatestate_depend_eliminate]: 4.65999e-06 [updatestate_assign_eliminate]: 3.54002e-06 [updatestate_loads_eliminate]: 3.01999e-06 [parameter_eliminate]: 2.22999e-06 [a_2]: 9.047e-05 [accelerated_algorithm]: 8.31002e-06 [shard]: 2.41e-06 [meta_shard_fg_expand]: 2.06998e-06 [shard_inline]: 7.65e-06 [merge_send_recv]: 9.53002e-06 [auto_parallel]: 8.92999e-06 [parallel]: 2.076e-05 [flash_sp]: 1.028e-05 [merge_comm]: 4.35e-06 [allreduce_fusion]: 3.62002e-06 [matmul_add_comm_reduction]: 1.099e-05 [allreduce_slice_to_reducescatter]: 9.79984e-07 [virtual_shard_identity]: 9.44e-06 [virtual_dataset]: 7.18e-06 [get_grad_eliminate_]: 6.33998e-06 [virtual_output]: 6.49999e-06 [merge_forward]: 3.97998e-06 [cell_reuse_recompute_pass]: 1.93002e-06 [offload_activation]: 1.072e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.382e-05 [merge_recompute_call_nodes]: 1.46002e-06 [before_grad]: 1.115e-05 [set_forward_comm_id_for_comm_node_pass]: 3.85998e-06 [meta_fg_expand]: 3.13e-06 [flash_sp_send_recv_attached]: 3.00002e-06 [receive_attached]: 2.09999e-06 [after_resolve]: 1.333e-05 [a_after_grad]: 1.022e-05 [renormalize]: 0.130122 [add_forward_monad_depend]: 1.21e-05 [auto_monad_grad]: 2.49001e-06 [auto_monad_eliminator]: 2.479e-05 [cse]: 3.49e-05 [a_3]: 6.738e-05 [Cycle 2]: 0.00077302, [45] [expand_dump_flag]: 1.97999e-06 [switch_simplify]: 1.048e-05 [loop_unroll]: 7.14001e-06 [a_1]: 0.00016804 [with_stream_mark]: 2.085e-05 [recompute_prepare]: 7.03e-06 [updatestate_depend_eliminate]: 4.11001e-06 [updatestate_assign_eliminate]: 3.26001e-06 [updatestate_loads_eliminate]: 3.14001e-06 [parameter_eliminate]: 1.75001e-06 [a_2]: 8.104e-05 [accelerated_algorithm]: 6.79001e-06 [shard]: 2.62001e-06 [meta_shard_fg_expand]: 2.16e-06 [shard_inline]: 6.03998e-06 [merge_send_recv]: 8.96002e-06 [auto_parallel]: 9.77999e-06 [parallel]: 9.92001e-06 [flash_sp]: 8.40001e-06 [merge_comm]: 3.73999e-06 [allreduce_fusion]: 3.42002e-06 [matmul_add_comm_reduction]: 1.013e-05 [allreduce_slice_to_reducescatter]: 8.50006e-07 [virtual_shard_identity]: 7.71999e-06 [virtual_dataset]: 6.06e-06 [get_grad_eliminate_]: 5.86998e-06 [virtual_output]: 6.31998e-06 [merge_forward]: 3.88999e-06 [cell_reuse_recompute_pass]: 3.65998e-06 [offload_activation]: 1.077e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.258e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 1.028e-05 [set_forward_comm_id_for_comm_node_pass]: 3.76001e-06 [meta_fg_expand]: 2.80997e-06 [flash_sp_send_recv_attached]: 1.72999e-06 [receive_attached]: 2.66999e-06 [after_resolve]: 1.312e-05 [a_after_grad]: 1.108e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.52999e-06 [auto_monad_grad]: 1.19e-06 [auto_monad_eliminator]: 9.03002e-06 [cse]: 1.707e-05 [a_3]: 3.71e-05 [py_interpret_to_execute_after_opt_a]: 1.835e-05 [slice_cell_reuse_recomputed_activation]: 1.96e-06 [rewriter_after_opt_a]: 4.209e-05 [convert_after_rewriter]: 6.59999e-06 [order_py_execute_after_rewriter]: 5.38002e-06 [mutable_eliminate]: 0.00074783 [opt_b]: 0.00022875, [1] [Cycle 1]: 0.00022046, [7] [b_1]: 0.00013467 [b_2]: 9.35001e-06 [updatestate_depend_eliminate]: 7.6e-06 [updatestate_assign_eliminate]: 2.93998e-06 [updatestate_loads_eliminate]: 2.99999e-06 [renormalize]: 7.09988e-07 [cse]: 2.327e-05 [optimize_parallel_all_gather_comm]: 2.093e-05 [overlap_param_gather]: 1.91e-06 [cconv]: 3.31e-05 [loop_unroll]: 0.00046404 [opt_after_cconv]: 0.00010965, [1] [Cycle 1]: 0.00010326, [7] [c_1]: 3.208e-05 [parameter_eliminate]: 5.79999e-06 [updatestate_depend_eliminate]: 6.32001e-06 [updatestate_assign_eliminate]: 2.41998e-06 [updatestate_loads_eliminate]: 2.34999e-06 [cse]: 1.872e-05 [renormalize]: 3.7998e-07 [remove_dup_value]: 1.494e-05 [tuple_transform]: 0.00010667, [1] [Cycle 1]: 7.834e-05, [4] [d_1]: 4.923e-05 [none_parameter_eliminate]: 1.90001e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 7.4e-06 [partial_unused_args_eliminate]: 1.96998e-06 [add_recomputation]: 5.576e-05 [cse_after_recomputation]: 2.341e-05, [1] [Cycle 1]: 1.822e-05, [1] [cse]: 1.253e-05 [environ_conv]: 5.47999e-06 [swap_dp_allreduce_reducescatter]: 5.51e-06 [bias_add_comm_swap]: 3.08e-06 [label_micro_interleaved_index]: 4.53999e-06 [label_fine_grained_interleaved_index]: 3.36001e-06 [merge_cast_opt]: 1.60001e-06 [slice_recompute_activation]: 2.07001e-06 [micro_interleaved_order_control]: 2.45997e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 1.02e-06 [remove_cast_before_assign_add]: 1.10999e-06 [full_micro_interleaved_order_control]: 2.11998e-06 [reorder_send_recv_between_fp_bp]: 3.07002e-06 [comm_op_add_attrs]: 1.40001e-06 [add_comm_op_reuse_tag]: 9.60019e-07 [interleave_split_concat_branches]: 1.59998e-06 [interleave_parallel_branches]: 1.20001e-06 [overlap_opt_shard_in_pipeline]: 1.27e-06 [overlap_opt_shard_grad_in_pipeline]: 2.28002e-06 [control_data_broadcast_order]: 1.278e-05 [grouped_pairwise_exchange_alltoall]: 1.60999e-06 [offloading_packed_experts]: 4.1e-06 [overlap_recompute_and_grad_model_parallel]: 5.25001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40999e-06 [overlap_recompute_comm]: 2.17001e-06 [overlap_grad_ring_attention]: 4.56002e-06 [overlap_grad_flash_sp]: 2.22e-05 [begin_end_overlap_inline]: 4.80009e-07 [split_matmul_comm_elemetwise]: 2.16e-06 [split_layernorm_comm]: 1.71e-06 [handle_group_info]: 1.19e-06 [symbol_engine_optimizer]: 7.873e-05, [1] [Cycle 1]: 7.419e-05, [6] [build]: 3.93999e-06 [elim_shapecalc]: 1.018e-05 [elim_not_effective]: 1.271e-05 [opt_reshape]: 7.43e-06 [fold_const_symbol]: 1.071e-05 [renormalize]: 1.69995e-07 [detach_backward]: 2.32999e-06 [pipeline_parallel_scheduler]: 1.41998e-06 [auto_monad_reorder]: 1.647e-05 [get_jit_bprop_graph]: 1.98002e-06 [rewriter_after_jit_bprop_graph]: 5.87001e-06 [opt_after_jit_grad]: 0.00048294 [validate]: 4.292e-05 Sums bootstrap : 0.000437s : 0.31% type_inference : 0.006413s : 4.53% event_method : 0.000021s : 0.01% auto_monad : 0.000064s : 0.05% graph_reusing : 0.000006s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000042s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000031s : 0.02% optimize.rewriter_before_opt_a : 0.000095s : 0.07% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000055s : 0.04% optimize.opt_a.loop_unroll : 0.000038s : 0.03% optimize.opt_a.a_1 : 0.000871s : 0.62% optimize.opt_a.with_stream_mark : 0.000044s : 0.03% optimize.opt_a.recompute_prepare : 0.000018s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000172s : 0.12% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.01% optimize.opt_a.merge_send_recv : 0.000018s : 0.01% optimize.opt_a.auto_parallel : 0.000019s : 0.01% optimize.opt_a.parallel : 0.000031s : 0.02% optimize.opt_a.flash_sp : 0.000019s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% optimize.opt_a.offload_activation : 0.000021s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000026s : 0.02% optimize.opt_a.a_after_grad : 0.000021s : 0.02% optimize.opt_a.renormalize : 0.130122s : 91.94% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.02% optimize.opt_a.cse : 0.000052s : 0.04% optimize.opt_a.a_3 : 0.000104s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000042s : 0.03% optimize.convert_after_rewriter : 0.000007s : 0.00% optimize.order_py_execute_after_rewriter : 0.000005s : 0.00% optimize.mutable_eliminate : 0.000748s : 0.53% optimize.opt_b.b_1 : 0.000135s : 0.10% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000023s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000033s : 0.02% optimize.loop_unroll : 0.000464s : 0.33% optimize.opt_after_cconv.c_1 : 0.000032s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.01% optimize.tuple_transform.d_1 : 0.000049s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000056s : 0.04% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000005s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000022s : 0.02% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000016s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000483s : 0.34% validate : 0.000043s : 0.03% Time group info: ------[substitution.] 0.000261 34 16.46% : 0.000043s : 6: substitution.arithmetic_simplify 0.66% : 0.000002s : 2: substitution.elim_not_effective 0.64% : 0.000002s : 2: substitution.fold_const_symbol 2.67% : 0.000007s : 4: substitution.graph_param_transform 66.56% : 0.000174s : 4: substitution.inline 1.79% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.17% : 0.000006s : 4: substitution.remove_not_recompute_node 2.24% : 0.000006s : 4: substitution.replace_old_param 6.81% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006343 2 88.42% : 0.005609s : 1: type_inference.infer 11.58% : 0.000734s : 1: type_inference.specialize ------[replace.] 0.000067 8 63.92% : 0.000043s : 4: replace.inline 36.08% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000187 8 91.50% : 0.000171s : 4: match.inline 8.50% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000228 1278 1.22% : 0.000003s : 13: predicate.accumulaten_eliminater 0.60% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 1.01% : 0.000002s : 13: predicate.addn_zero_filter 0.87% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.49% : 0.000006s : 21: predicate.arithmetic_simplify 1.03% : 0.000002s : 13: predicate.cast_eliminate 0.56% : 0.000001s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.60% : 0.000001s : 8: predicate.depend_value_elim 0.90% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.19% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.99% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.84% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.41% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 17: predicate.environ_get_depend_swap 1.60% : 0.000004s : 25: predicate.environ_get_eliminate 0.93% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.44% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.33% : 0.000005s : 21: predicate.float_depend_g_call 0.60% : 0.000001s : 8: predicate.float_environ_get_switch 0.78% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.65% : 0.000001s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.53% : 0.000001s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 6.16% : 0.000014s : 58: predicate.inline 0.76% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.10% : 0.000003s : 8: predicate.less_batch_normalization 1.87% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.38% : 0.000005s : 38: predicate.load_eliminater 0.97% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.41% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.51% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.48% : 0.000001s : 8: predicate.merge_addn 0.59% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.63% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 13: predicate.minmaximum_grad 1.44% : 0.000003s : 4: predicate.mutable_eliminate 0.38% : 0.000001s : 4: predicate.opt_reshape 0.39% : 0.000001s : 4: predicate.parallel_virtual_node 1.61% : 0.000004s : 21: predicate.partial_defer_inline 1.51% : 0.000003s : 21: predicate.partial_eliminate 0.83% : 0.000002s : 13: predicate.print_const_string_wrapper 0.55% : 0.000001s : 8: predicate.reduce_all_const_elim 1.36% : 0.000003s : 13: predicate.reduce_eliminate 2.39% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000001s : 8: predicate.remove_not_recompute_node 1.37% : 0.000003s : 25: predicate.replace_applicator 0.55% : 0.000001s : 8: predicate.replace_old_param 0.43% : 0.000001s : 4: predicate.reset_defer_inline 0.94% : 0.000002s : 13: predicate.reshape_eliminate 0.64% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 4: predicate.row_tensor_eliminate 0.91% : 0.000002s : 8: predicate.same_eliminate 0.40% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.85% : 0.000002s : 8: predicate.shard_identity_eliminate 0.97% : 0.000002s : 8: predicate.special_op_eliminate 0.63% : 0.000001s : 8: predicate.specialize_transform 1.28% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.97% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.48% : 0.000003s : 21: predicate.switch_defer_inline 1.93% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.79% : 0.000011s : 67: predicate.switch_simplify 0.87% : 0.000002s : 13: predicate.tile_eliminate 0.87% : 0.000002s : 13: predicate.transpose_eliminate 1.44% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.44% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.78% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.35% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.84% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.36% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.19% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.64% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.61% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000665 11 49.42% : 0.000329s : 5: func_graph_cloner_run.FuncGraphClonerGraph 50.58% : 0.000336s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.416240 192 0.00% : 0.000004s : 1: ForceFp32Comm 0.86% : 0.003570s : 1: add_attr 0.85% : 0.003556s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.01% : 0.000060s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.02% : 0.000069s : 1: auto_monad 0.00% : 0.000020s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.11% : 0.000467s : 1: bootstrap 0.01% : 0.000037s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000011s : 1: convert_after_rewriter 0.01% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000009s : 1: environ_conv 0.01% : 0.000027s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.11% : 0.000473s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.18% : 0.000759s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000021s : 1: opt.transform.mutable_eliminate 0.32% : 0.001330s : 78: opt.transform.opt_a 0.01% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000107s : 28: opt.transform.opt_b 0.01% : 0.000054s : 2: opt.transform.opt_trans_graph 0.01% : 0.000037s : 4: opt.transform.symbol_engine_opt 31.83% : 0.132477s : 1: opt_a 0.03% : 0.000113s : 1: opt_after_cconv 0.12% : 0.000492s : 1: opt_after_jit_grad 0.06% : 0.000232s : 1: opt_b 32.41% : 0.134911s : 1: optimize 0.01% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.01% : 0.000026s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000046s : 1: pre_auto_parallel 0.01% : 0.000036s : 1: py_interpret_to_execute 0.01% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.00% : 0.000018s : 1: remove_dup_value 31.13% : 0.129574s : 1: renormalize.infer 0.13% : 0.000528s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000047s : 1: rewriter_after_opt_a 0.02% : 0.000100s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000082s : 1: symbol_engine_optimizer 0.03% : 0.000110s : 1: tuple_transform 1.55% : 0.006435s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:16.219.569 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:16.219.860 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0303559, [21] [bootstrap]: 0.0004254 [type_inference]: 0.00625406 [event_method]: 2.156e-05 [auto_monad]: 6.643e-05 [graph_reusing]: 5.77001e-06 [inline]: 2.09e-06 [add_attr]: 0.0162896, [1] [add_attr_with_inline]: 0.0162733, [1] [Cycle 1]: 0.00010546, [2] [tag_attr]: 2.711e-05 [meta_addattr_fg_expand]: 5.70001e-06 [parallel-infer-symbol]: 3.45e-06 [pre_auto_parallel]: 4.445e-05 [insert-virtual-dataset]: 2.41e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 2.04e-06 [pipeline_split]: 1.70001e-06 [optimize]: 0.00597997, [53] [py_interpret_to_execute]: 3.98e-05 [rewriter_before_opt_a]: 0.00010231 [opt_a]: 0.00340956, [2] [Cycle 1]: 0.00252319, [45] [expand_dump_flag]: 3.18998e-06 [switch_simplify]: 4.371e-05 [loop_unroll]: 3.056e-05 [a_1]: 0.00070166 [with_stream_mark]: 2.056e-05 [recompute_prepare]: 1.094e-05 [updatestate_depend_eliminate]: 4.78001e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 3.14999e-06 [parameter_eliminate]: 2.09999e-06 [a_2]: 0.00011982 [accelerated_algorithm]: 7.83001e-06 [shard]: 2.63e-06 [meta_shard_fg_expand]: 2.32001e-06 [shard_inline]: 7.80998e-06 [merge_send_recv]: 9.69999e-06 [auto_parallel]: 8.46002e-06 [parallel]: 2.095e-05 [flash_sp]: 9.40001e-06 [merge_comm]: 3.81001e-06 [allreduce_fusion]: 3.45003e-06 [matmul_add_comm_reduction]: 1.01e-05 [allreduce_slice_to_reducescatter]: 1.00001e-06 [virtual_shard_identity]: 8.93002e-06 [virtual_dataset]: 7.19001e-06 [get_grad_eliminate_]: 6.57002e-06 [virtual_output]: 6.73998e-06 [merge_forward]: 4.20999e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 1.025e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.608e-05 [merge_recompute_call_nodes]: 1.40001e-06 [before_grad]: 1.252e-05 [set_forward_comm_id_for_comm_node_pass]: 4.03999e-06 [meta_fg_expand]: 2.90998e-06 [flash_sp_send_recv_attached]: 2.71e-06 [receive_attached]: 2.21e-06 [after_resolve]: 1.274e-05 [a_after_grad]: 1.137e-05 [renormalize]: 0.00082423 [add_forward_monad_depend]: 6.71e-06 [auto_monad_grad]: 2.74001e-06 [auto_monad_eliminator]: 1.776e-05 [cse]: 2.996e-05 [a_3]: 6.688e-05 [Cycle 2]: 0.00087014, [45] [expand_dump_flag]: 2.10002e-06 [switch_simplify]: 8.80999e-06 [loop_unroll]: 6.60002e-06 [a_1]: 0.00014752 [with_stream_mark]: 1.674e-05 [recompute_prepare]: 7.18e-06 [updatestate_depend_eliminate]: 3.68e-06 [updatestate_assign_eliminate]: 2.63e-06 [updatestate_loads_eliminate]: 3.05998e-06 [parameter_eliminate]: 1.69998e-06 [a_2]: 0.00010568 [accelerated_algorithm]: 6.66e-06 [shard]: 2.16e-06 [meta_shard_fg_expand]: 1.60001e-06 [shard_inline]: 6.37001e-06 [merge_send_recv]: 6.84999e-06 [auto_parallel]: 7.09001e-06 [parallel]: 6.77002e-06 [flash_sp]: 3.57002e-06 [merge_comm]: 3.67998e-06 [allreduce_fusion]: 3.30998e-06 [matmul_add_comm_reduction]: 6.37001e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 7.56001e-06 [virtual_dataset]: 6.89001e-06 [get_grad_eliminate_]: 6.09001e-06 [virtual_output]: 5.99e-06 [merge_forward]: 3.32002e-06 [cell_reuse_recompute_pass]: 1.91e-06 [offload_activation]: 7.7e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.389e-05 [merge_recompute_call_nodes]: 1.14003e-06 [before_grad]: 1.034e-05 [set_forward_comm_id_for_comm_node_pass]: 4.60001e-06 [meta_fg_expand]: 2.12999e-06 [flash_sp_send_recv_attached]: 1.30999e-06 [receive_attached]: 1.69e-06 [after_resolve]: 1.289e-05 [a_after_grad]: 9.44e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.65001e-06 [auto_monad_grad]: 1.50001e-06 [auto_monad_eliminator]: 8.94998e-06 [cse]: 1.599e-05 [a_3]: 5.085e-05 [py_interpret_to_execute_after_opt_a]: 1.653e-05 [slice_cell_reuse_recomputed_activation]: 5.22e-06 [rewriter_after_opt_a]: 4.444e-05 [convert_after_rewriter]: 1.053e-05 [order_py_execute_after_rewriter]: 9.02e-06 [mutable_eliminate]: 0.00069911 [opt_b]: 0.00028431, [1] [Cycle 1]: 0.00027381, [7] [b_1]: 0.00017153 [b_2]: 7.83001e-06 [updatestate_depend_eliminate]: 7.25003e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 2.61e-06 [renormalize]: 1.12e-06 [cse]: 2.242e-05 [optimize_parallel_all_gather_comm]: 2.193e-05 [overlap_param_gather]: 4.61002e-06 [cconv]: 3.54e-05 [loop_unroll]: 0.00046105 [opt_after_cconv]: 0.00013144, [1] [Cycle 1]: 0.00012188, [7] [c_1]: 3.196e-05 [parameter_eliminate]: 3.7e-06 [updatestate_depend_eliminate]: 5.78997e-06 [updatestate_assign_eliminate]: 2.84001e-06 [updatestate_loads_eliminate]: 2.44999e-06 [cse]: 1.844e-05 [renormalize]: 6.80011e-07 [remove_dup_value]: 1.725e-05 [tuple_transform]: 9.139e-05, [1] [Cycle 1]: 8.396e-05, [4] [d_1]: 4.533e-05 [none_parameter_eliminate]: 1.75001e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 6.93998e-06 [partial_unused_args_eliminate]: 5.31998e-06 [add_recomputation]: 5.217e-05 [cse_after_recomputation]: 2.856e-05, [1] [Cycle 1]: 2.162e-05, [1] [cse]: 1.235e-05 [environ_conv]: 8.55001e-06 [swap_dp_allreduce_reducescatter]: 8.34002e-06 [bias_add_comm_swap]: 5.05999e-06 [label_micro_interleaved_index]: 7.56999e-06 [label_fine_grained_interleaved_index]: 5.38002e-06 [merge_cast_opt]: 3.64002e-06 [slice_recompute_activation]: 4.23001e-06 [micro_interleaved_order_control]: 4.79e-06 [assign_add_opt]: 3.61001e-06 [ForceFp32Comm]: 3.28e-06 [remove_cast_before_assign_add]: 4.51002e-06 [full_micro_interleaved_order_control]: 4.54002e-06 [reorder_send_recv_between_fp_bp]: 4.99e-06 [comm_op_add_attrs]: 3.35998e-06 [add_comm_op_reuse_tag]: 3.36999e-06 [interleave_split_concat_branches]: 3.85e-06 [interleave_parallel_branches]: 3.43999e-06 [overlap_opt_shard_in_pipeline]: 4.18001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.43999e-06 [control_data_broadcast_order]: 1.748e-05 [grouped_pairwise_exchange_alltoall]: 4.19002e-06 [offloading_packed_experts]: 7.28999e-06 [overlap_recompute_and_grad_model_parallel]: 7.71001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.55e-06 [overlap_recompute_allgather_and_fa_grad]: 3.70998e-06 [overlap_recompute_comm]: 4.77e-06 [overlap_grad_ring_attention]: 7.06001e-06 [overlap_grad_flash_sp]: 2.69e-05 [begin_end_overlap_inline]: 2.94999e-06 [split_matmul_comm_elemetwise]: 4.95999e-06 [split_layernorm_comm]: 4.27e-06 [handle_group_info]: 3.32002e-06 [symbol_engine_optimizer]: 0.00010283, [1] [Cycle 1]: 9.524e-05, [6] [build]: 3.52997e-06 [elim_shapecalc]: 1.099e-05 [elim_not_effective]: 1.375e-05 [opt_reshape]: 7.38999e-06 [fold_const_symbol]: 1.077e-05 [renormalize]: 2.80008e-07 [detach_backward]: 4.4e-06 [pipeline_parallel_scheduler]: 1.70001e-06 [auto_monad_reorder]: 2.067e-05 [get_jit_bprop_graph]: 1.75001e-06 [rewriter_after_jit_bprop_graph]: 5.42001e-06 [opt_after_jit_grad]: 0.00055312 [validate]: 4.214e-05 Sums bootstrap : 0.000425s : 3.48% type_inference : 0.006254s : 51.14% event_method : 0.000022s : 0.18% auto_monad : 0.000066s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.22% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000044s : 0.36% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000040s : 0.33% optimize.rewriter_before_opt_a : 0.000102s : 0.84% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.43% optimize.opt_a.loop_unroll : 0.000037s : 0.30% optimize.opt_a.a_1 : 0.000849s : 6.94% optimize.opt_a.with_stream_mark : 0.000037s : 0.31% optimize.opt_a.recompute_prepare : 0.000018s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000225s : 1.84% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.12% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.12% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000016s : 0.13% optimize.opt_a.parallel : 0.000028s : 0.23% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000007s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.13% optimize.opt_a.virtual_dataset : 0.000014s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.10% optimize.opt_a.virtual_output : 0.000013s : 0.10% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000023s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.21% optimize.opt_a.a_after_grad : 0.000021s : 0.17% optimize.opt_a.renormalize : 0.000824s : 6.74% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.22% optimize.opt_a.cse : 0.000046s : 0.38% optimize.opt_a.a_3 : 0.000118s : 0.96% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000044s : 0.36% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000699s : 5.72% optimize.opt_b.b_1 : 0.000172s : 1.40% optimize.opt_b.b_2 : 0.000008s : 0.06% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000022s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000035s : 0.29% optimize.loop_unroll : 0.000461s : 3.77% optimize.opt_after_cconv.c_1 : 0.000032s : 0.26% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.15% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000017s : 0.14% optimize.tuple_transform.d_1 : 0.000045s : 0.37% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000052s : 0.43% optimize.cse_after_recomputation.cse : 0.000012s : 0.10% optimize.environ_conv : 0.000009s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000008s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000005s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000027s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.17% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000553s : 4.52% validate : 0.000042s : 0.34% Time group info: ------[substitution.] 0.000258 34 14.84% : 0.000038s : 6: substitution.arithmetic_simplify 0.90% : 0.000002s : 2: substitution.elim_not_effective 0.54% : 0.000001s : 2: substitution.fold_const_symbol 2.36% : 0.000006s : 4: substitution.graph_param_transform 68.79% : 0.000177s : 4: substitution.inline 2.10% : 0.000005s : 4: substitution.j_node_and_user_rematch 1.80% : 0.000005s : 4: substitution.remove_not_recompute_node 2.31% : 0.000006s : 4: substitution.replace_old_param 6.36% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006192 2 88.06% : 0.005452s : 1: type_inference.infer 11.94% : 0.000739s : 1: type_inference.specialize ------[replace.] 0.000066 8 64.74% : 0.000043s : 4: replace.inline 35.26% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000189 8 92.30% : 0.000175s : 4: match.inline 7.70% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000216 1278 0.87% : 0.000002s : 13: predicate.accumulaten_eliminater 0.95% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.58% : 0.000001s : 8: predicate.addn_check_dump 0.91% : 0.000002s : 13: predicate.addn_zero_filter 0.81% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.38% : 0.000005s : 21: predicate.arithmetic_simplify 1.11% : 0.000002s : 13: predicate.cast_eliminate 0.57% : 0.000001s : 8: predicate.check_bprop_eliminate 0.56% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.67% : 0.000001s : 8: predicate.depend_value_elim 0.96% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.00% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.09% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.38% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_depend_swap 1.62% : 0.000004s : 25: predicate.environ_get_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.44% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.49% : 0.000005s : 21: predicate.float_depend_g_call 0.56% : 0.000001s : 8: predicate.float_environ_get_switch 0.79% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.60% : 0.000001s : 8: predicate.get_grad_eliminate 0.30% : 0.000001s : 4: predicate.graph_param_transform 0.61% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 6.44% : 0.000014s : 58: predicate.inline 0.99% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 8: predicate.less_batch_normalization 1.79% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.61% : 0.000006s : 38: predicate.load_eliminater 0.80% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.28% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.69% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 8: predicate.merge_addn 0.62% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.65% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 13: predicate.minmaximum_grad 1.02% : 0.000002s : 4: predicate.mutable_eliminate 0.33% : 0.000001s : 4: predicate.opt_reshape 0.42% : 0.000001s : 4: predicate.parallel_virtual_node 1.69% : 0.000004s : 21: predicate.partial_defer_inline 1.58% : 0.000003s : 21: predicate.partial_eliminate 0.85% : 0.000002s : 13: predicate.print_const_string_wrapper 0.57% : 0.000001s : 8: predicate.reduce_all_const_elim 1.16% : 0.000003s : 13: predicate.reduce_eliminate 2.46% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.56% : 0.000001s : 8: predicate.remove_not_recompute_node 1.34% : 0.000003s : 25: predicate.replace_applicator 0.48% : 0.000001s : 8: predicate.replace_old_param 0.25% : 0.000001s : 4: predicate.reset_defer_inline 0.93% : 0.000002s : 13: predicate.reshape_eliminate 0.57% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 4: predicate.row_tensor_eliminate 0.91% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 8: predicate.shard_identity_eliminate 0.70% : 0.000002s : 8: predicate.special_op_eliminate 0.74% : 0.000002s : 8: predicate.specialize_transform 1.02% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.94% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.52% : 0.000003s : 21: predicate.switch_defer_inline 2.01% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.93% : 0.000011s : 67: predicate.switch_simplify 0.87% : 0.000002s : 13: predicate.tile_eliminate 0.87% : 0.000002s : 13: predicate.transpose_eliminate 1.67% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.31% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.48% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.17% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.74% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.38% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.11% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.78% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000583 11 53.32% : 0.000311s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.68% : 0.000272s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.054889 192 0.01% : 0.000006s : 1: ForceFp32Comm 29.70% : 0.016302s : 1: add_attr 29.66% : 0.016278s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.10% : 0.000056s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.14% : 0.000075s : 1: auto_monad 0.05% : 0.000029s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.86% : 0.000471s : 1: bootstrap 0.07% : 0.000039s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.04% : 0.000021s : 1: control_data_broadcast_order 0.03% : 0.000014s : 1: convert_after_rewriter 0.06% : 0.000032s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000023s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.06% : 0.000033s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000012s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 0.85% : 0.000467s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 1.29% : 0.000706s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.03% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000017s : 1: opt.transform.mutable_eliminate 2.38% : 0.001306s : 78: opt.transform.opt_a 0.06% : 0.000030s : 1: opt.transform.opt_after_cconv 0.05% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.19% : 0.000105s : 28: opt.transform.opt_b 0.09% : 0.000050s : 2: opt.transform.opt_trans_graph 0.07% : 0.000039s : 4: opt.transform.symbol_engine_opt 6.22% : 0.003413s : 1: opt_a 0.25% : 0.000135s : 1: opt_after_cconv 1.03% : 0.000565s : 1: opt_after_jit_grad 0.52% : 0.000288s : 1: opt_b 11.55% : 0.006338s : 1: optimize 0.05% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.06% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.09% : 0.000052s : 1: pre_auto_parallel 0.08% : 0.000044s : 1: py_interpret_to_execute 0.04% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000008s : 1: remove_cast_before_assign_add 0.04% : 0.000020s : 1: remove_dup_value 0.84% : 0.000462s : 1: renormalize.infer 0.64% : 0.000352s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000048s : 1: rewriter_after_opt_a 0.19% : 0.000106s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.02% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.19% : 0.000106s : 1: symbol_engine_optimizer 0.17% : 0.000094s : 1: tuple_transform 11.48% : 0.006303s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:18.119.131 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0807803, [21] [bootstrap]: 0.00043678 [type_inference]: 0.0705548 [event_method]: 1.946e-05 [auto_monad]: 9.039e-05 [graph_reusing]: 5.59e-06 [inline]: 2.39999e-06 [add_attr]: 0.00355678, [1] [add_attr_with_inline]: 0.00354477, [1] [Cycle 1]: 6.95e-05, [2] [tag_attr]: 2.181e-05 [meta_addattr_fg_expand]: 5.64998e-06 [parallel-infer-symbol]: 3.5e-06 [pre_auto_parallel]: 3.905e-05 [insert-virtual-dataset]: 2.81e-06 [parallel-infer-symbol-second]: 8.70001e-07 [dataset_repeat_opt]: 2.54001e-06 [pipeline_split]: 1.60001e-06 [optimize]: 0.00534241, [53] [py_interpret_to_execute]: 3.017e-05 [rewriter_before_opt_a]: 9.555e-05 [opt_a]: 0.0030489, [2] [Cycle 1]: 0.00230447, [45] [expand_dump_flag]: 3.09001e-06 [switch_simplify]: 4.535e-05 [loop_unroll]: 3.094e-05 [a_1]: 0.00069759 [with_stream_mark]: 1.883e-05 [recompute_prepare]: 9.15999e-06 [updatestate_depend_eliminate]: 4.37e-06 [updatestate_assign_eliminate]: 3.06001e-06 [updatestate_loads_eliminate]: 3.12002e-06 [parameter_eliminate]: 1.92001e-06 [a_2]: 8.794e-05 [accelerated_algorithm]: 8.05e-06 [shard]: 2.46e-06 [meta_shard_fg_expand]: 2.37999e-06 [shard_inline]: 6.66999e-06 [merge_send_recv]: 8.72e-06 [auto_parallel]: 7.87e-06 [parallel]: 2.056e-05 [flash_sp]: 9.96998e-06 [merge_comm]: 4.08999e-06 [allreduce_fusion]: 3.56999e-06 [matmul_add_comm_reduction]: 1.05e-05 [allreduce_slice_to_reducescatter]: 6.40022e-07 [virtual_shard_identity]: 7.89002e-06 [virtual_dataset]: 6.76e-06 [get_grad_eliminate_]: 6.61999e-06 [virtual_output]: 6.33e-06 [merge_forward]: 3.98999e-06 [cell_reuse_recompute_pass]: 1.55001e-06 [offload_activation]: 1.019e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.219e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.129e-05 [set_forward_comm_id_for_comm_node_pass]: 4.23999e-06 [meta_fg_expand]: 3.49001e-06 [flash_sp_send_recv_attached]: 3.5e-06 [receive_attached]: 2.43998e-06 [after_resolve]: 1.251e-05 [a_after_grad]: 1.021e-05 [renormalize]: 0.00083827 [add_forward_monad_depend]: 7.21999e-06 [auto_monad_grad]: 3.28e-06 [auto_monad_eliminator]: 1.792e-05 [cse]: 3.065e-05 [a_3]: 5.159e-05 [Cycle 2]: 0.00073325, [45] [expand_dump_flag]: 1.70001e-06 [switch_simplify]: 8.49002e-06 [loop_unroll]: 6.88e-06 [a_1]: 0.00015915 [with_stream_mark]: 1.728e-05 [recompute_prepare]: 7.53999e-06 [updatestate_depend_eliminate]: 3.4e-06 [updatestate_assign_eliminate]: 2.68e-06 [updatestate_loads_eliminate]: 2.65997e-06 [parameter_eliminate]: 1.66998e-06 [a_2]: 8.08e-05 [accelerated_algorithm]: 6.96001e-06 [shard]: 1.74e-06 [meta_shard_fg_expand]: 2.36e-06 [shard_inline]: 6.28e-06 [merge_send_recv]: 6.39999e-06 [auto_parallel]: 6.49001e-06 [parallel]: 6.94001e-06 [flash_sp]: 3.46999e-06 [merge_comm]: 4.07e-06 [allreduce_fusion]: 3.65e-06 [matmul_add_comm_reduction]: 7.66999e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 6.56999e-06 [virtual_dataset]: 5.92001e-06 [get_grad_eliminate_]: 6.21e-06 [virtual_output]: 6.09001e-06 [merge_forward]: 3.61001e-06 [cell_reuse_recompute_pass]: 2.17999e-06 [offload_activation]: 8.54998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.243e-05 [merge_recompute_call_nodes]: 8.80013e-07 [before_grad]: 1.028e-05 [set_forward_comm_id_for_comm_node_pass]: 3.48e-06 [meta_fg_expand]: 3.09001e-06 [flash_sp_send_recv_attached]: 1.32e-06 [receive_attached]: 2.21e-06 [after_resolve]: 1.238e-05 [a_after_grad]: 1.015e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.49e-06 [auto_monad_grad]: 1.40999e-06 [auto_monad_eliminator]: 7.43e-06 [cse]: 1.529e-05 [a_3]: 3.864e-05 [py_interpret_to_execute_after_opt_a]: 1.184e-05 [slice_cell_reuse_recomputed_activation]: 2.31e-06 [rewriter_after_opt_a]: 4.143e-05 [convert_after_rewriter]: 7.7e-06 [order_py_execute_after_rewriter]: 5.81e-06 [mutable_eliminate]: 0.00066942 [opt_b]: 0.00020949, [1] [Cycle 1]: 0.00020232, [7] [b_1]: 0.00012614 [b_2]: 8.33001e-06 [updatestate_depend_eliminate]: 6.73998e-06 [updatestate_assign_eliminate]: 2.53e-06 [updatestate_loads_eliminate]: 2.22001e-06 [renormalize]: 5.69999e-07 [cse]: 1.926e-05 [optimize_parallel_all_gather_comm]: 1.761e-05 [overlap_param_gather]: 2.19001e-06 [cconv]: 3.13e-05 [loop_unroll]: 0.00048587 [opt_after_cconv]: 0.00010636, [1] [Cycle 1]: 9.983e-05, [7] [c_1]: 3.241e-05 [parameter_eliminate]: 3.61001e-06 [updatestate_depend_eliminate]: 6.78e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.37001e-06 [cse]: 1.775e-05 [renormalize]: 4.2998e-07 [remove_dup_value]: 1.415e-05 [tuple_transform]: 7.937e-05, [1] [Cycle 1]: 7.489e-05, [4] [d_1]: 4.72e-05 [none_parameter_eliminate]: 1.59e-06 [renormalize]: 2.40019e-07 [switch_simplify]: 6.89001e-06 [partial_unused_args_eliminate]: 1.74e-06 [add_recomputation]: 5.276e-05 [cse_after_recomputation]: 2.156e-05, [1] [Cycle 1]: 1.734e-05, [1] [cse]: 1.166e-05 [environ_conv]: 5.02e-06 [swap_dp_allreduce_reducescatter]: 5.46e-06 [bias_add_comm_swap]: 3.26001e-06 [label_micro_interleaved_index]: 4.52998e-06 [label_fine_grained_interleaved_index]: 2.69001e-06 [merge_cast_opt]: 1.34e-06 [slice_recompute_activation]: 2.10002e-06 [micro_interleaved_order_control]: 2.53e-06 [assign_add_opt]: 1.27e-06 [ForceFp32Comm]: 1.10001e-06 [remove_cast_before_assign_add]: 1.02e-06 [full_micro_interleaved_order_control]: 2.29999e-06 [reorder_send_recv_between_fp_bp]: 2.99999e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 1.00999e-06 [interleave_split_concat_branches]: 1.20999e-06 [interleave_parallel_branches]: 1.10001e-06 [overlap_opt_shard_in_pipeline]: 1.19e-06 [overlap_opt_shard_grad_in_pipeline]: 1.95001e-06 [control_data_broadcast_order]: 1.267e-05 [grouped_pairwise_exchange_alltoall]: 1.77999e-06 [offloading_packed_experts]: 4.40999e-06 [overlap_recompute_and_grad_model_parallel]: 4.69002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.22999e-06 [overlap_grad_ring_attention]: 4.13001e-06 [overlap_grad_flash_sp]: 1.983e-05 [begin_end_overlap_inline]: 9.00007e-07 [split_matmul_comm_elemetwise]: 2.11e-06 [split_layernorm_comm]: 1.55001e-06 [handle_group_info]: 8.89995e-07 [symbol_engine_optimizer]: 7.97e-05, [1] [Cycle 1]: 7.508e-05, [6] [build]: 3.42002e-06 [elim_shapecalc]: 1.04e-05 [elim_not_effective]: 1.432e-05 [opt_reshape]: 7.73999e-06 [fold_const_symbol]: 1.038e-05 [renormalize]: 4.09986e-07 [detach_backward]: 2.04e-06 [pipeline_parallel_scheduler]: 2.06e-06 [auto_monad_reorder]: 1.71e-05 [get_jit_bprop_graph]: 2.26e-06 [rewriter_after_jit_bprop_graph]: 5.05001e-06 [opt_after_jit_grad]: 0.00048976 [validate]: 4.093e-05 Sums bootstrap : 0.000437s : 0.57% type_inference : 0.070555s : 92.57% event_method : 0.000019s : 0.03% auto_monad : 0.000090s : 0.12% graph_reusing : 0.000006s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000039s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000030s : 0.04% optimize.rewriter_before_opt_a : 0.000096s : 0.13% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000054s : 0.07% optimize.opt_a.loop_unroll : 0.000038s : 0.05% optimize.opt_a.a_1 : 0.000857s : 1.12% optimize.opt_a.with_stream_mark : 0.000036s : 0.05% optimize.opt_a.recompute_prepare : 0.000017s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000169s : 0.22% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.02% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.02% optimize.opt_a.merge_send_recv : 0.000015s : 0.02% optimize.opt_a.auto_parallel : 0.000014s : 0.02% optimize.opt_a.parallel : 0.000028s : 0.04% optimize.opt_a.flash_sp : 0.000013s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.02% optimize.opt_a.virtual_dataset : 0.000013s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.02% optimize.opt_a.virtual_output : 0.000012s : 0.02% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000019s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.03% optimize.opt_a.a_after_grad : 0.000020s : 0.03% optimize.opt_a.renormalize : 0.000838s : 1.10% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.03% optimize.opt_a.cse : 0.000046s : 0.06% optimize.opt_a.a_3 : 0.000090s : 0.12% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000041s : 0.05% optimize.convert_after_rewriter : 0.000008s : 0.01% optimize.order_py_execute_after_rewriter : 0.000006s : 0.01% optimize.mutable_eliminate : 0.000669s : 0.88% optimize.opt_b.b_1 : 0.000126s : 0.17% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000019s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000031s : 0.04% optimize.loop_unroll : 0.000486s : 0.64% optimize.opt_after_cconv.c_1 : 0.000032s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000018s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.02% optimize.tuple_transform.d_1 : 0.000047s : 0.06% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000053s : 0.07% optimize.cse_after_recomputation.cse : 0.000012s : 0.02% optimize.environ_conv : 0.000005s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000020s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000017s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000490s : 0.64% validate : 0.000041s : 0.05% Time group info: ------[substitution.] 0.000236 34 15.06% : 0.000036s : 6: substitution.arithmetic_simplify 0.88% : 0.000002s : 2: substitution.elim_not_effective 0.61% : 0.000001s : 2: substitution.fold_const_symbol 2.55% : 0.000006s : 4: substitution.graph_param_transform 68.16% : 0.000161s : 4: substitution.inline 1.65% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.09% : 0.000005s : 4: substitution.remove_not_recompute_node 2.37% : 0.000006s : 4: substitution.replace_old_param 6.63% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.070487 2 98.95% : 0.069749s : 1: type_inference.infer 1.05% : 0.000738s : 1: type_inference.specialize ------[replace.] 0.000066 8 63.54% : 0.000042s : 4: replace.inline 36.46% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000172 8 91.97% : 0.000158s : 4: match.inline 8.03% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000221 1278 0.99% : 0.000002s : 13: predicate.accumulaten_eliminater 0.72% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 8: predicate.addn_check_dump 1.31% : 0.000003s : 13: predicate.addn_zero_filter 0.86% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.51% : 0.000006s : 21: predicate.arithmetic_simplify 1.20% : 0.000003s : 13: predicate.cast_eliminate 0.69% : 0.000002s : 8: predicate.check_bprop_eliminate 0.44% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.76% : 0.000002s : 8: predicate.depend_value_elim 0.94% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.99% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.02% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 4: predicate.elim_not_effective 0.40% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_depend_swap 1.60% : 0.000004s : 25: predicate.environ_get_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.68% : 0.000004s : 21: predicate.exchange_switch_depend_value 2.56% : 0.000006s : 21: predicate.float_depend_g_call 0.62% : 0.000001s : 8: predicate.float_environ_get_switch 0.78% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.75% : 0.000002s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 6.65% : 0.000015s : 58: predicate.inline 0.67% : 0.000001s : 8: predicate.inline_without_move 0.39% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.89% : 0.000002s : 8: predicate.less_batch_normalization 1.73% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.45% : 0.000005s : 38: predicate.load_eliminater 0.99% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.24% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.56% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.59% : 0.000001s : 8: predicate.merge_addn 0.62% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 13: predicate.minmaximum_grad 1.07% : 0.000002s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.37% : 0.000001s : 4: predicate.parallel_virtual_node 1.80% : 0.000004s : 21: predicate.partial_defer_inline 1.53% : 0.000003s : 21: predicate.partial_eliminate 0.91% : 0.000002s : 13: predicate.print_const_string_wrapper 0.72% : 0.000002s : 8: predicate.reduce_all_const_elim 1.20% : 0.000003s : 13: predicate.reduce_eliminate 2.30% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 8: predicate.remove_not_recompute_node 1.31% : 0.000003s : 25: predicate.replace_applicator 0.38% : 0.000001s : 8: predicate.replace_old_param 0.33% : 0.000001s : 4: predicate.reset_defer_inline 1.06% : 0.000002s : 13: predicate.reshape_eliminate 0.54% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 4: predicate.row_tensor_eliminate 0.74% : 0.000002s : 8: predicate.same_eliminate 0.45% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.65% : 0.000001s : 8: predicate.shard_identity_eliminate 0.63% : 0.000001s : 8: predicate.special_op_eliminate 0.62% : 0.000001s : 8: predicate.specialize_transform 0.94% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.49% : 0.000003s : 21: predicate.switch_defer_inline 1.97% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.36% : 0.000012s : 67: predicate.switch_simplify 0.99% : 0.000002s : 13: predicate.tile_eliminate 0.92% : 0.000002s : 13: predicate.transpose_eliminate 1.88% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.07% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.33% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.25% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.72% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.27% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.08% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.30% : 0.000001s : 4: predicate.value_based_eliminate 0.66% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.67% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.50% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000608 11 54.64% : 0.000332s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.36% : 0.000276s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.091964 192 0.00% : 0.000004s : 1: ForceFp32Comm 3.87% : 0.003563s : 1: add_attr 3.86% : 0.003549s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.06% : 0.000057s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.10% : 0.000096s : 1: auto_monad 0.02% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.51% : 0.000467s : 1: bootstrap 0.04% : 0.000035s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000016s : 1: control_data_broadcast_order 0.01% : 0.000011s : 1: convert_after_rewriter 0.03% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000006s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000008s : 1: environ_conv 0.03% : 0.000027s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.54% : 0.000494s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.74% : 0.000679s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000017s : 1: opt.transform.mutable_eliminate 1.42% : 0.001303s : 78: opt.transform.opt_a 0.03% : 0.000031s : 1: opt.transform.opt_after_cconv 0.03% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.11% : 0.000103s : 28: opt.transform.opt_b 0.06% : 0.000052s : 2: opt.transform.opt_trans_graph 0.04% : 0.000039s : 4: opt.transform.symbol_engine_opt 3.32% : 0.003053s : 1: opt_a 0.12% : 0.000110s : 1: opt_after_cconv 0.54% : 0.000499s : 1: opt_after_jit_grad 0.23% : 0.000213s : 1: opt_b 5.82% : 0.005348s : 1: optimize 0.02% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.02% : 0.000023s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.05% : 0.000043s : 1: pre_auto_parallel 0.04% : 0.000034s : 1: py_interpret_to_execute 0.02% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000018s : 1: remove_dup_value 0.50% : 0.000458s : 1: renormalize.infer 0.40% : 0.000370s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000046s : 1: rewriter_after_opt_a 0.11% : 0.000100s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000006s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000083s : 1: symbol_engine_optimizer 0.09% : 0.000082s : 1: tuple_transform 76.74% : 0.070575s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:19.875.490 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:19.875.770 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0179387, [21] [bootstrap]: 0.00044383 [type_inference]: 0.00626371 [event_method]: 2.151e-05 [auto_monad]: 6.71e-05 [graph_reusing]: 5.80002e-06 [inline]: 2.88e-06 [add_attr]: 0.00350789, [1] [add_attr_with_inline]: 0.00349819, [1] [Cycle 1]: 8.736e-05, [2] [tag_attr]: 2.17e-05 [meta_addattr_fg_expand]: 6.39001e-06 [parallel-infer-symbol]: 3.75e-06 [pre_auto_parallel]: 3.974e-05 [insert-virtual-dataset]: 2.36e-06 [parallel-infer-symbol-second]: 8.89995e-07 [dataset_repeat_opt]: 2.51998e-06 [pipeline_split]: 1.64998e-06 [optimize]: 0.00626286, [53] [py_interpret_to_execute]: 3.526e-05 [rewriter_before_opt_a]: 9.602e-05 [opt_a]: 0.00360071, [2] [Cycle 1]: 0.00266222, [45] [expand_dump_flag]: 3.41999e-06 [switch_simplify]: 4.494e-05 [loop_unroll]: 3.165e-05 [a_1]: 0.00086226 [with_stream_mark]: 2.111e-05 [recompute_prepare]: 1.071e-05 [updatestate_depend_eliminate]: 4.19002e-06 [updatestate_assign_eliminate]: 3.66999e-06 [updatestate_loads_eliminate]: 3.16001e-06 [parameter_eliminate]: 2.26e-06 [a_2]: 0.00011923 [accelerated_algorithm]: 7.5e-06 [shard]: 1.82999e-06 [meta_shard_fg_expand]: 2.22999e-06 [shard_inline]: 7.23e-06 [merge_send_recv]: 9.15999e-06 [auto_parallel]: 6.85002e-06 [parallel]: 1.985e-05 [flash_sp]: 9.96e-06 [merge_comm]: 4.13999e-06 [allreduce_fusion]: 3.9e-06 [matmul_add_comm_reduction]: 1.024e-05 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 8.80999e-06 [virtual_dataset]: 7.18998e-06 [get_grad_eliminate_]: 6.54001e-06 [virtual_output]: 6.91999e-06 [merge_forward]: 4.13999e-06 [cell_reuse_recompute_pass]: 1.42999e-06 [offload_activation]: 1.133e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.412e-05 [merge_recompute_call_nodes]: 1.82999e-06 [before_grad]: 1.192e-05 [set_forward_comm_id_for_comm_node_pass]: 3.68e-06 [meta_fg_expand]: 3.43e-06 [flash_sp_send_recv_attached]: 3.31001e-06 [receive_attached]: 2.07999e-06 [after_resolve]: 1.27e-05 [a_after_grad]: 1.067e-05 [renormalize]: 0.00082522 [add_forward_monad_depend]: 6.38e-06 [auto_monad_grad]: 2.46e-06 [auto_monad_eliminator]: 1.87e-05 [cse]: 2.921e-05 [a_3]: 6.514e-05 [Cycle 2]: 0.00092178, [45] [expand_dump_flag]: 1.78002e-06 [switch_simplify]: 8.54e-06 [loop_unroll]: 6.35002e-06 [a_1]: 0.0001471 [with_stream_mark]: 1.496e-05 [recompute_prepare]: 7.54002e-06 [updatestate_depend_eliminate]: 3.25002e-06 [updatestate_assign_eliminate]: 2.73998e-06 [updatestate_loads_eliminate]: 2.47001e-06 [parameter_eliminate]: 1.59e-06 [a_2]: 0.00011091 [accelerated_algorithm]: 7.06999e-06 [shard]: 1.77001e-06 [meta_shard_fg_expand]: 2.12999e-06 [shard_inline]: 7.15e-06 [merge_send_recv]: 6.29001e-06 [auto_parallel]: 6.98998e-06 [parallel]: 6.94999e-06 [flash_sp]: 4.1e-06 [merge_comm]: 3.89002e-06 [allreduce_fusion]: 3.81001e-06 [matmul_add_comm_reduction]: 7.78999e-06 [allreduce_slice_to_reducescatter]: 5.50004e-07 [virtual_shard_identity]: 9.19998e-06 [virtual_dataset]: 6.98e-06 [get_grad_eliminate_]: 7.3e-06 [virtual_output]: 6.56999e-06 [merge_forward]: 3.58e-06 [cell_reuse_recompute_pass]: 1.82999e-06 [offload_activation]: 9.30001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.576e-05 [merge_recompute_call_nodes]: 1.29003e-06 [before_grad]: 1.101e-05 [set_forward_comm_id_for_comm_node_pass]: 4.05e-06 [meta_fg_expand]: 2.07999e-06 [flash_sp_send_recv_attached]: 1.47999e-06 [receive_attached]: 1.72001e-06 [after_resolve]: 1.287e-05 [a_after_grad]: 1.02e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.65001e-06 [auto_monad_grad]: 2.25002e-06 [auto_monad_eliminator]: 8.82e-06 [cse]: 1.702e-05 [a_3]: 5.84e-05 [py_interpret_to_execute_after_opt_a]: 1.915e-05 [slice_cell_reuse_recomputed_activation]: 5.35999e-06 [rewriter_after_opt_a]: 4.575e-05 [convert_after_rewriter]: 1.001e-05 [order_py_execute_after_rewriter]: 7.92e-06 [mutable_eliminate]: 0.00075325 [opt_b]: 0.00028753, [1] [Cycle 1]: 0.00027754, [7] [b_1]: 0.00017279 [b_2]: 8.54e-06 [updatestate_depend_eliminate]: 9.47001e-06 [updatestate_assign_eliminate]: 2.99001e-06 [updatestate_loads_eliminate]: 2.26e-06 [renormalize]: 5.59987e-07 [cse]: 2.362e-05 [optimize_parallel_all_gather_comm]: 2.101e-05 [overlap_param_gather]: 4.91002e-06 [cconv]: 4.093e-05 [loop_unroll]: 0.0004805 [opt_after_cconv]: 0.0001337, [1] [Cycle 1]: 0.00012472, [7] [c_1]: 3.207e-05 [parameter_eliminate]: 4.27e-06 [updatestate_depend_eliminate]: 5.51e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 2.91999e-06 [cse]: 1.81e-05 [renormalize]: 5.59987e-07 [remove_dup_value]: 1.833e-05 [tuple_transform]: 9.861e-05, [1] [Cycle 1]: 9.111e-05, [4] [d_1]: 4.911e-05 [none_parameter_eliminate]: 1.81e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 8.32e-06 [partial_unused_args_eliminate]: 5.00999e-06 [add_recomputation]: 5.717e-05 [cse_after_recomputation]: 2.847e-05, [1] [Cycle 1]: 2.083e-05, [1] [cse]: 1.179e-05 [environ_conv]: 8.98002e-06 [swap_dp_allreduce_reducescatter]: 8.50999e-06 [bias_add_comm_swap]: 5.69e-06 [label_micro_interleaved_index]: 7.15e-06 [label_fine_grained_interleaved_index]: 5.37001e-06 [merge_cast_opt]: 3.93001e-06 [slice_recompute_activation]: 4.72e-06 [micro_interleaved_order_control]: 5.40999e-06 [assign_add_opt]: 3.78001e-06 [ForceFp32Comm]: 3.61999e-06 [remove_cast_before_assign_add]: 3.95e-06 [full_micro_interleaved_order_control]: 4.65001e-06 [reorder_send_recv_between_fp_bp]: 6.03998e-06 [comm_op_add_attrs]: 3.46001e-06 [add_comm_op_reuse_tag]: 3.33998e-06 [interleave_split_concat_branches]: 3.70003e-06 [interleave_parallel_branches]: 3.47002e-06 [overlap_opt_shard_in_pipeline]: 3.68999e-06 [overlap_opt_shard_grad_in_pipeline]: 5.09e-06 [control_data_broadcast_order]: 1.661e-05 [grouped_pairwise_exchange_alltoall]: 3.95e-06 [offloading_packed_experts]: 6.67002e-06 [overlap_recompute_and_grad_model_parallel]: 7.83999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.75e-06 [overlap_recompute_allgather_and_fa_grad]: 3.78001e-06 [overlap_recompute_comm]: 4.72998e-06 [overlap_grad_ring_attention]: 6.79999e-06 [overlap_grad_flash_sp]: 2.44e-05 [begin_end_overlap_inline]: 2.96001e-06 [split_matmul_comm_elemetwise]: 4.92e-06 [split_layernorm_comm]: 4.05e-06 [handle_group_info]: 3.33e-06 [symbol_engine_optimizer]: 0.00010243, [1] [Cycle 1]: 9.514e-05, [6] [build]: 4.68999e-06 [elim_shapecalc]: 1.014e-05 [elim_not_effective]: 1.416e-05 [opt_reshape]: 7.88001e-06 [fold_const_symbol]: 1.104e-05 [renormalize]: 1.79978e-07 [detach_backward]: 4.25e-06 [pipeline_parallel_scheduler]: 1.87001e-06 [auto_monad_reorder]: 2.083e-05 [get_jit_bprop_graph]: 1.96e-06 [rewriter_after_jit_bprop_graph]: 5.74999e-06 [opt_after_jit_grad]: 0.00056316 [validate]: 4.514e-05 Sums bootstrap : 0.000444s : 3.54% type_inference : 0.006264s : 50.00% event_method : 0.000022s : 0.17% auto_monad : 0.000067s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000040s : 0.32% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000003s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000035s : 0.28% optimize.rewriter_before_opt_a : 0.000096s : 0.77% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.43% optimize.opt_a.loop_unroll : 0.000038s : 0.30% optimize.opt_a.a_1 : 0.001009s : 8.06% optimize.opt_a.with_stream_mark : 0.000036s : 0.29% optimize.opt_a.recompute_prepare : 0.000018s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.04% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000230s : 1.84% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.12% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.11% optimize.opt_a.merge_send_recv : 0.000015s : 0.12% optimize.opt_a.auto_parallel : 0.000014s : 0.11% optimize.opt_a.parallel : 0.000027s : 0.21% optimize.opt_a.flash_sp : 0.000014s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000008s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.14% optimize.opt_a.virtual_dataset : 0.000014s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.11% optimize.opt_a.virtual_output : 0.000013s : 0.11% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.24% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000023s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.20% optimize.opt_a.a_after_grad : 0.000021s : 0.17% optimize.opt_a.renormalize : 0.000825s : 6.59% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.06% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.22% optimize.opt_a.cse : 0.000046s : 0.37% optimize.opt_a.a_3 : 0.000124s : 0.99% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000046s : 0.37% optimize.convert_after_rewriter : 0.000010s : 0.08% optimize.order_py_execute_after_rewriter : 0.000008s : 0.06% optimize.mutable_eliminate : 0.000753s : 6.01% optimize.opt_b.b_1 : 0.000173s : 1.38% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000041s : 0.33% optimize.loop_unroll : 0.000480s : 3.84% optimize.opt_after_cconv.c_1 : 0.000032s : 0.26% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.14% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.15% optimize.tuple_transform.d_1 : 0.000049s : 0.39% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000057s : 0.46% optimize.cse_after_recomputation.cse : 0.000012s : 0.09% optimize.environ_conv : 0.000009s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000024s : 0.19% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000005s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000563s : 4.50% validate : 0.000045s : 0.36% Time group info: ------[substitution.] 0.000241 34 16.16% : 0.000039s : 6: substitution.arithmetic_simplify 0.85% : 0.000002s : 2: substitution.elim_not_effective 0.87% : 0.000002s : 2: substitution.fold_const_symbol 2.69% : 0.000006s : 4: substitution.graph_param_transform 65.50% : 0.000158s : 4: substitution.inline 1.82% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.07% : 0.000005s : 4: substitution.remove_not_recompute_node 2.54% : 0.000006s : 4: substitution.replace_old_param 7.50% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006201 2 87.60% : 0.005432s : 1: type_inference.infer 12.40% : 0.000769s : 1: type_inference.specialize ------[replace.] 0.000074 8 56.91% : 0.000042s : 4: replace.inline 43.09% : 0.000032s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000171 8 90.65% : 0.000155s : 4: match.inline 9.35% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000226 1278 1.00% : 0.000002s : 13: predicate.accumulaten_eliminater 0.70% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 8: predicate.addn_check_dump 0.90% : 0.000002s : 13: predicate.addn_zero_filter 0.76% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.65% : 0.000006s : 21: predicate.arithmetic_simplify 0.88% : 0.000002s : 13: predicate.cast_eliminate 0.59% : 0.000001s : 8: predicate.check_bprop_eliminate 0.55% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.55% : 0.000001s : 8: predicate.depend_value_elim 0.84% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.96% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.82% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.93% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_depend_swap 1.60% : 0.000004s : 25: predicate.environ_get_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.41% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.39% : 0.000005s : 21: predicate.float_depend_g_call 0.53% : 0.000001s : 8: predicate.float_environ_get_switch 0.79% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.78% : 0.000002s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.14% : 0.000014s : 58: predicate.inline 0.77% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.92% : 0.000002s : 8: predicate.less_batch_normalization 1.93% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 3.18% : 0.000007s : 38: predicate.load_eliminater 0.95% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.25% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.66% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 8: predicate.merge_addn 0.54% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 13: predicate.minmaximum_grad 1.20% : 0.000003s : 4: predicate.mutable_eliminate 0.41% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 1.83% : 0.000004s : 21: predicate.partial_defer_inline 1.75% : 0.000004s : 21: predicate.partial_eliminate 0.88% : 0.000002s : 13: predicate.print_const_string_wrapper 0.57% : 0.000001s : 8: predicate.reduce_all_const_elim 1.19% : 0.000003s : 13: predicate.reduce_eliminate 2.41% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 8: predicate.remove_not_recompute_node 1.47% : 0.000003s : 25: predicate.replace_applicator 0.60% : 0.000001s : 8: predicate.replace_old_param 0.27% : 0.000001s : 4: predicate.reset_defer_inline 0.87% : 0.000002s : 13: predicate.reshape_eliminate 0.71% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 4: predicate.row_tensor_eliminate 0.88% : 0.000002s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.86% : 0.000002s : 8: predicate.shard_identity_eliminate 0.79% : 0.000002s : 8: predicate.special_op_eliminate 0.67% : 0.000002s : 8: predicate.specialize_transform 1.07% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.99% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.46% : 0.000003s : 21: predicate.switch_defer_inline 2.06% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.88% : 0.000011s : 67: predicate.switch_simplify 0.86% : 0.000002s : 13: predicate.tile_eliminate 0.82% : 0.000002s : 13: predicate.transpose_eliminate 1.54% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.40% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.39% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.81% : 0.000009s : 33: predicate.tuple_list_get_item_eliminator 1.69% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.82% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.22% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.90% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 4: predicate.value_based_eliminate 0.76% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 8: predicate.virtual_output_eliminate 0.23% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000601 11 54.74% : 0.000329s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.26% : 0.000272s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030156 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.76% : 0.003546s : 1: add_attr 11.61% : 0.003502s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.20% : 0.000061s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000076s : 1: auto_monad 0.09% : 0.000028s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.63% : 0.000490s : 1: bootstrap 0.15% : 0.000044s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.06% : 0.000020s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.11% : 0.000032s : 1: cse_after_recomputation 0.03% : 0.000009s : 1: dataset_repeat_opt 0.07% : 0.000020s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000032s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.61% : 0.000487s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.52% : 0.000761s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.05% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000020s : 1: opt.transform.mutable_eliminate 4.89% : 0.001476s : 78: opt.transform.opt_a 0.10% : 0.000031s : 1: opt.transform.opt_after_cconv 0.09% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.35% : 0.000107s : 28: opt.transform.opt_b 0.18% : 0.000055s : 2: opt.transform.opt_trans_graph 0.13% : 0.000040s : 4: opt.transform.symbol_engine_opt 11.95% : 0.003604s : 1: opt_a 0.46% : 0.000137s : 1: opt_after_cconv 1.90% : 0.000574s : 1: opt_after_jit_grad 0.97% : 0.000292s : 1: opt_b 22.00% : 0.006635s : 1: optimize 0.08% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000012s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.16% : 0.000048s : 1: pre_auto_parallel 0.13% : 0.000039s : 1: py_interpret_to_execute 0.08% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.07% : 0.000022s : 1: remove_dup_value 1.47% : 0.000443s : 1: renormalize.infer 1.23% : 0.000372s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000050s : 1: rewriter_after_opt_a 0.33% : 0.000100s : 1: rewriter_before_opt_a 0.03% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000106s : 1: symbol_engine_optimizer 0.34% : 0.000102s : 1: tuple_transform 20.94% : 0.006314s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:22.624.52 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0182901, [21] [bootstrap]: 0.00047426 [type_inference]: 0.00778317 [event_method]: 2.219e-05 [auto_monad]: 6.454e-05 [graph_reusing]: 5.82999e-06 [inline]: 2.72001e-06 [add_attr]: 0.00365882, [1] [add_attr_with_inline]: 0.00364562, [1] [Cycle 1]: 7.732e-05, [2] [tag_attr]: 2.551e-05 [meta_addattr_fg_expand]: 5.79e-06 [parallel-infer-symbol]: 3.63999e-06 [pre_auto_parallel]: 4.127e-05 [insert-virtual-dataset]: 2.89001e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 1.85001e-06 [pipeline_split]: 1.72001e-06 [optimize]: 0.00549714, [53] [py_interpret_to_execute]: 3.128e-05 [rewriter_before_opt_a]: 9.625e-05 [opt_a]: 0.00312846, [2] [Cycle 1]: 0.0023493, [45] [expand_dump_flag]: 3.14999e-06 [switch_simplify]: 4.406e-05 [loop_unroll]: 3.012e-05 [a_1]: 0.0006853 [with_stream_mark]: 2.185e-05 [recompute_prepare]: 1.204e-05 [updatestate_depend_eliminate]: 5.00001e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 2.64001e-06 [parameter_eliminate]: 1.94999e-06 [a_2]: 9.086e-05 [accelerated_algorithm]: 8.23999e-06 [shard]: 2.09e-06 [meta_shard_fg_expand]: 1.98002e-06 [shard_inline]: 7.13e-06 [merge_send_recv]: 9.62001e-06 [auto_parallel]: 8.23001e-06 [parallel]: 2.014e-05 [flash_sp]: 1.113e-05 [merge_comm]: 4.26001e-06 [allreduce_fusion]: 3.63e-06 [matmul_add_comm_reduction]: 1.049e-05 [allreduce_slice_to_reducescatter]: 8.00006e-07 [virtual_shard_identity]: 9.99001e-06 [virtual_dataset]: 7.92998e-06 [get_grad_eliminate_]: 7.58999e-06 [virtual_output]: 7.21001e-06 [merge_forward]: 3.88999e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 1.076e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.519e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.192e-05 [set_forward_comm_id_for_comm_node_pass]: 3.79002e-06 [meta_fg_expand]: 2.93998e-06 [flash_sp_send_recv_attached]: 2.86999e-06 [receive_attached]: 2.39999e-06 [after_resolve]: 1.275e-05 [a_after_grad]: 1.047e-05 [renormalize]: 0.00086409 [add_forward_monad_depend]: 6.68e-06 [auto_monad_grad]: 3.08e-06 [auto_monad_eliminator]: 1.86e-05 [cse]: 3.264e-05 [a_3]: 5.397e-05 [Cycle 2]: 0.0007654, [45] [expand_dump_flag]: 1.98002e-06 [switch_simplify]: 8.77999e-06 [loop_unroll]: 7.04001e-06 [a_1]: 0.00018238 [with_stream_mark]: 1.716e-05 [recompute_prepare]: 7.68999e-06 [updatestate_depend_eliminate]: 3.73001e-06 [updatestate_assign_eliminate]: 2.94001e-06 [updatestate_loads_eliminate]: 2.86e-06 [parameter_eliminate]: 1.35999e-06 [a_2]: 8.335e-05 [accelerated_algorithm]: 6.91999e-06 [shard]: 2.31e-06 [meta_shard_fg_expand]: 1.89999e-06 [shard_inline]: 6.04001e-06 [merge_send_recv]: 6.13002e-06 [auto_parallel]: 7.61001e-06 [parallel]: 7.31001e-06 [flash_sp]: 4.3e-06 [merge_comm]: 3.8e-06 [allreduce_fusion]: 3.79002e-06 [matmul_add_comm_reduction]: 7.92e-06 [allreduce_slice_to_reducescatter]: 3.9002e-07 [virtual_shard_identity]: 6.91999e-06 [virtual_dataset]: 8.72e-06 [get_grad_eliminate_]: 5.81e-06 [virtual_output]: 5.82001e-06 [merge_forward]: 3.95e-06 [cell_reuse_recompute_pass]: 1.81998e-06 [offload_activation]: 8.70999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.149e-05 [merge_recompute_call_nodes]: 1.14e-06 [before_grad]: 1.025e-05 [set_forward_comm_id_for_comm_node_pass]: 4.82e-06 [meta_fg_expand]: 2.33998e-06 [flash_sp_send_recv_attached]: 1.38002e-06 [receive_attached]: 1.67001e-06 [after_resolve]: 1.212e-05 [a_after_grad]: 9.42001e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.27999e-06 [auto_monad_grad]: 1.52999e-06 [auto_monad_eliminator]: 1.069e-05 [cse]: 1.899e-05 [a_3]: 3.94e-05 [py_interpret_to_execute_after_opt_a]: 1.471e-05 [slice_cell_reuse_recomputed_activation]: 1.95001e-06 [rewriter_after_opt_a]: 4.016e-05 [convert_after_rewriter]: 6.96999e-06 [order_py_execute_after_rewriter]: 5.02e-06 [mutable_eliminate]: 0.00073734 [opt_b]: 0.00022237, [1] [Cycle 1]: 0.00021483, [7] [b_1]: 0.00013037 [b_2]: 8.32e-06 [updatestate_depend_eliminate]: 8.52e-06 [updatestate_assign_eliminate]: 2.93e-06 [updatestate_loads_eliminate]: 2.64999e-06 [renormalize]: 9.00007e-07 [cse]: 2.411e-05 [optimize_parallel_all_gather_comm]: 1.964e-05 [overlap_param_gather]: 2.04e-06 [cconv]: 3.408e-05 [loop_unroll]: 0.00046679 [opt_after_cconv]: 0.00010771, [1] [Cycle 1]: 0.00010139, [7] [c_1]: 3.215e-05 [parameter_eliminate]: 4.94e-06 [updatestate_depend_eliminate]: 5.76e-06 [updatestate_assign_eliminate]: 2.58e-06 [updatestate_loads_eliminate]: 2.71e-06 [cse]: 1.85e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.463e-05 [tuple_transform]: 7.954e-05, [1] [Cycle 1]: 7.467e-05, [4] [d_1]: 4.653e-05 [none_parameter_eliminate]: 1.76e-06 [renormalize]: 1.79978e-07 [switch_simplify]: 7.51001e-06 [partial_unused_args_eliminate]: 1.94e-06 [add_recomputation]: 5.243e-05 [cse_after_recomputation]: 2.236e-05, [1] [Cycle 1]: 1.735e-05, [1] [cse]: 1.16e-05 [environ_conv]: 5.44e-06 [swap_dp_allreduce_reducescatter]: 5.17999e-06 [bias_add_comm_swap]: 2.56e-06 [label_micro_interleaved_index]: 4.52998e-06 [label_fine_grained_interleaved_index]: 2.86e-06 [merge_cast_opt]: 1.27999e-06 [slice_recompute_activation]: 1.97001e-06 [micro_interleaved_order_control]: 2.06e-06 [assign_add_opt]: 1.20001e-06 [ForceFp32Comm]: 9.70002e-07 [remove_cast_before_assign_add]: 1.20001e-06 [full_micro_interleaved_order_control]: 2.34999e-06 [reorder_send_recv_between_fp_bp]: 2.96999e-06 [comm_op_add_attrs]: 9.60019e-07 [add_comm_op_reuse_tag]: 1.04e-06 [interleave_split_concat_branches]: 1.09003e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.23002e-06 [overlap_opt_shard_grad_in_pipeline]: 1.72001e-06 [control_data_broadcast_order]: 1.298e-05 [grouped_pairwise_exchange_alltoall]: 1.57001e-06 [offloading_packed_experts]: 3.71001e-06 [overlap_recompute_and_grad_model_parallel]: 4.83001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.29999e-06 [overlap_grad_ring_attention]: 3.83001e-06 [overlap_grad_flash_sp]: 2.226e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.39999e-06 [split_layernorm_comm]: 1.74e-06 [handle_group_info]: 9.80013e-07 [symbol_engine_optimizer]: 7.94e-05, [1] [Cycle 1]: 7.484e-05, [6] [build]: 4.21001e-06 [elim_shapecalc]: 1.04e-05 [elim_not_effective]: 1.304e-05 [opt_reshape]: 7.85e-06 [fold_const_symbol]: 1.049e-05 [renormalize]: 2.10013e-07 [detach_backward]: 2.54999e-06 [pipeline_parallel_scheduler]: 1.55001e-06 [auto_monad_reorder]: 1.757e-05 [get_jit_bprop_graph]: 2.27001e-06 [rewriter_after_jit_bprop_graph]: 6.38e-06 [opt_after_jit_grad]: 0.00049149 [validate]: 4.341e-05 Sums bootstrap : 0.000474s : 3.48% type_inference : 0.007783s : 57.17% event_method : 0.000022s : 0.16% auto_monad : 0.000065s : 0.47% graph_reusing : 0.000006s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000041s : 0.30% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.23% optimize.rewriter_before_opt_a : 0.000096s : 0.71% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.39% optimize.opt_a.loop_unroll : 0.000037s : 0.27% optimize.opt_a.a_1 : 0.000868s : 6.37% optimize.opt_a.with_stream_mark : 0.000039s : 0.29% optimize.opt_a.recompute_prepare : 0.000020s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000174s : 1.28% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.11% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.10% optimize.opt_a.merge_send_recv : 0.000016s : 0.12% optimize.opt_a.auto_parallel : 0.000016s : 0.12% optimize.opt_a.parallel : 0.000027s : 0.20% optimize.opt_a.flash_sp : 0.000015s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.12% optimize.opt_a.virtual_dataset : 0.000017s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.10% optimize.opt_a.virtual_output : 0.000013s : 0.10% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.20% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.16% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.18% optimize.opt_a.a_after_grad : 0.000020s : 0.15% optimize.opt_a.renormalize : 0.000864s : 6.35% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.22% optimize.opt_a.cse : 0.000052s : 0.38% optimize.opt_a.a_3 : 0.000093s : 0.69% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000040s : 0.30% optimize.convert_after_rewriter : 0.000007s : 0.05% optimize.order_py_execute_after_rewriter : 0.000005s : 0.04% optimize.mutable_eliminate : 0.000737s : 5.42% optimize.opt_b.b_1 : 0.000130s : 0.96% optimize.opt_b.b_2 : 0.000008s : 0.06% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000024s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.14% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000034s : 0.25% optimize.loop_unroll : 0.000467s : 3.43% optimize.opt_after_cconv.c_1 : 0.000032s : 0.24% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.14% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.11% optimize.tuple_transform.d_1 : 0.000047s : 0.34% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000052s : 0.39% optimize.cse_after_recomputation.cse : 0.000012s : 0.09% optimize.environ_conv : 0.000005s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.04% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000013s : 0.10% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.03% optimize.overlap_grad_flash_sp : 0.000022s : 0.16% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.10% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.13% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000491s : 3.61% validate : 0.000043s : 0.32% Time group info: ------[substitution.] 0.000248 34 15.08% : 0.000037s : 6: substitution.arithmetic_simplify 0.72% : 0.000002s : 2: substitution.elim_not_effective 0.56% : 0.000001s : 2: substitution.fold_const_symbol 2.40% : 0.000006s : 4: substitution.graph_param_transform 68.72% : 0.000171s : 4: substitution.inline 1.82% : 0.000005s : 4: substitution.j_node_and_user_rematch 1.98% : 0.000005s : 4: substitution.remove_not_recompute_node 2.41% : 0.000006s : 4: substitution.replace_old_param 6.32% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007707 2 89.97% : 0.006934s : 1: type_inference.infer 10.03% : 0.000773s : 1: type_inference.specialize ------[replace.] 0.000067 8 63.66% : 0.000042s : 4: replace.inline 36.34% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000182 8 92.39% : 0.000168s : 4: match.inline 7.61% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000247 1278 0.81% : 0.000002s : 13: predicate.accumulaten_eliminater 0.79% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.41% : 0.000001s : 8: predicate.addn_check_dump 0.89% : 0.000002s : 13: predicate.addn_zero_filter 0.84% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.14% : 0.000005s : 21: predicate.arithmetic_simplify 0.78% : 0.000002s : 13: predicate.cast_eliminate 0.49% : 0.000001s : 8: predicate.check_bprop_eliminate 0.40% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.66% : 0.000002s : 8: predicate.depend_value_elim 0.76% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.91% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.74% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.92% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.16% : 0.000000s : 4: predicate.elim_not_effective 0.33% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 0.92% : 0.000002s : 17: predicate.environ_add_const_eliminate 0.88% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.88% : 0.000002s : 17: predicate.environ_get_depend_swap 15.01% : 0.000037s : 25: predicate.environ_get_eliminate 0.89% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.20% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.17% : 0.000005s : 21: predicate.float_depend_g_call 0.46% : 0.000001s : 8: predicate.float_environ_get_switch 0.70% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.54% : 0.000001s : 8: predicate.get_grad_eliminate 0.17% : 0.000000s : 4: predicate.graph_param_transform 0.49% : 0.000001s : 8: predicate.incorporate_call 0.40% : 0.000001s : 8: predicate.incorporate_call_switch 5.59% : 0.000014s : 58: predicate.inline 0.61% : 0.000002s : 8: predicate.inline_without_move 0.26% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.80% : 0.000002s : 8: predicate.less_batch_normalization 1.58% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.11% : 0.000005s : 38: predicate.load_eliminater 0.93% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.06% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.57% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.47% : 0.000001s : 8: predicate.merge_addn 0.51% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.71% : 0.000002s : 13: predicate.minmaximum_grad 1.12% : 0.000003s : 4: predicate.mutable_eliminate 0.26% : 0.000001s : 4: predicate.opt_reshape 0.54% : 0.000001s : 4: predicate.parallel_virtual_node 1.42% : 0.000004s : 21: predicate.partial_defer_inline 1.37% : 0.000003s : 21: predicate.partial_eliminate 0.75% : 0.000002s : 13: predicate.print_const_string_wrapper 0.50% : 0.000001s : 8: predicate.reduce_all_const_elim 1.07% : 0.000003s : 13: predicate.reduce_eliminate 2.09% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 8: predicate.remove_not_recompute_node 1.13% : 0.000003s : 25: predicate.replace_applicator 0.43% : 0.000001s : 8: predicate.replace_old_param 0.34% : 0.000001s : 4: predicate.reset_defer_inline 0.73% : 0.000002s : 13: predicate.reshape_eliminate 0.51% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.66% : 0.000002s : 8: predicate.same_eliminate 0.45% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 8: predicate.shard_identity_eliminate 0.55% : 0.000001s : 8: predicate.special_op_eliminate 0.59% : 0.000001s : 8: predicate.specialize_transform 0.86% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.67% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.25% : 0.000003s : 21: predicate.switch_defer_inline 1.73% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.36% : 0.000011s : 67: predicate.switch_simplify 0.75% : 0.000002s : 13: predicate.tile_eliminate 0.77% : 0.000002s : 13: predicate.transpose_eliminate 1.49% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.33% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.12% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.87% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.29% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 1.97% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.43% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.13% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.56% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 4: predicate.value_based_eliminate 0.61% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001608 11 82.07% : 0.001319s : 5: func_graph_cloner_run.FuncGraphClonerGraph 17.93% : 0.000288s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.029774 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.31% : 0.003665s : 1: add_attr 12.26% : 0.003650s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.19% : 0.000056s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.23% : 0.000069s : 1: auto_monad 0.07% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.70% : 0.000505s : 1: bootstrap 0.13% : 0.000038s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.05% : 0.000016s : 1: control_data_broadcast_order 0.03% : 0.000010s : 1: convert_after_rewriter 0.08% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.10% : 0.000028s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000006s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.60% : 0.000476s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.51% : 0.000747s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 4.45% : 0.001326s : 78: opt.transform.opt_a 0.10% : 0.000031s : 1: opt.transform.opt_after_cconv 0.09% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.35% : 0.000104s : 28: opt.transform.opt_b 0.17% : 0.000052s : 2: opt.transform.opt_trans_graph 0.13% : 0.000038s : 4: opt.transform.symbol_engine_opt 10.52% : 0.003132s : 1: opt_a 0.37% : 0.000112s : 1: opt_after_cconv 1.68% : 0.000501s : 1: opt_after_jit_grad 0.76% : 0.000226s : 1: opt_b 18.48% : 0.005502s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000046s : 1: pre_auto_parallel 0.12% : 0.000035s : 1: py_interpret_to_execute 0.06% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000018s : 1: remove_dup_value 1.58% : 0.000471s : 1: renormalize.infer 1.28% : 0.000382s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000044s : 1: rewriter_after_opt_a 0.34% : 0.000101s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000006s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.28% : 0.000082s : 1: symbol_engine_optimizer 0.28% : 0.000083s : 1: tuple_transform 26.22% : 0.007806s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:23.803.008 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:23.803.284 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.142463, [21] [bootstrap]: 0.00047291 [type_inference]: 0.00673378 [event_method]: 2.214e-05 [auto_monad]: 6.779e-05 [graph_reusing]: 7.00002e-06 [inline]: 2.78e-06 [add_attr]: 0.126194, [1] [add_attr_with_inline]: 0.126178, [1] [Cycle 1]: 0.00011236, [2] [tag_attr]: 2.701e-05 [meta_addattr_fg_expand]: 6.49999e-06 [parallel-infer-symbol]: 4.03001e-06 [pre_auto_parallel]: 4.72e-05 [insert-virtual-dataset]: 2.34001e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 2.44999e-06 [pipeline_split]: 1.71e-06 [optimize]: 0.00756499, [53] [py_interpret_to_execute]: 4.065e-05 [rewriter_before_opt_a]: 0.00011104 [opt_a]: 0.00387276, [2] [Cycle 1]: 0.00285687, [45] [expand_dump_flag]: 3.35e-06 [switch_simplify]: 4.555e-05 [loop_unroll]: 3.183e-05 [a_1]: 0.00074526 [with_stream_mark]: 2.462e-05 [recompute_prepare]: 1.324e-05 [updatestate_depend_eliminate]: 4.54002e-06 [updatestate_assign_eliminate]: 3.55e-06 [updatestate_loads_eliminate]: 3.08e-06 [parameter_eliminate]: 3.21001e-06 [a_2]: 0.00014427 [accelerated_algorithm]: 1.041e-05 [shard]: 2.54999e-06 [meta_shard_fg_expand]: 2.54001e-06 [shard_inline]: 7.22002e-06 [merge_send_recv]: 1.06e-05 [auto_parallel]: 1.008e-05 [parallel]: 2.19e-05 [flash_sp]: 1.198e-05 [merge_comm]: 4.87e-06 [allreduce_fusion]: 3.86999e-06 [matmul_add_comm_reduction]: 1.14e-05 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 1.222e-05 [virtual_dataset]: 8.22e-06 [get_grad_eliminate_]: 7.56999e-06 [virtual_output]: 7.26001e-06 [merge_forward]: 4.1e-06 [cell_reuse_recompute_pass]: 2.53003e-06 [offload_activation]: 1.13e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.127e-05 [merge_recompute_call_nodes]: 1.74e-06 [before_grad]: 1.518e-05 [set_forward_comm_id_for_comm_node_pass]: 5.21998e-06 [meta_fg_expand]: 4.25999e-06 [flash_sp_send_recv_attached]: 4.04997e-06 [receive_attached]: 2.57001e-06 [after_resolve]: 1.516e-05 [a_after_grad]: 1.271e-05 [renormalize]: 0.00097917 [add_forward_monad_depend]: 1.053e-05 [auto_monad_grad]: 2.89999e-06 [auto_monad_eliminator]: 2.62e-05 [cse]: 3.179e-05 [a_3]: 7.671e-05 [Cycle 2]: 0.00099494, [45] [expand_dump_flag]: 2.66e-06 [switch_simplify]: 9.20001e-06 [loop_unroll]: 7.21999e-06 [a_1]: 0.00016615 [with_stream_mark]: 2.113e-05 [recompute_prepare]: 8.82e-06 [updatestate_depend_eliminate]: 4.13001e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 2.78e-06 [parameter_eliminate]: 2.53003e-06 [a_2]: 0.00011244 [accelerated_algorithm]: 8.42e-06 [shard]: 3.35e-06 [meta_shard_fg_expand]: 2.48002e-06 [shard_inline]: 6.22001e-06 [merge_send_recv]: 9.82999e-06 [auto_parallel]: 1.013e-05 [parallel]: 8.77e-06 [flash_sp]: 3.84002e-06 [merge_comm]: 4e-06 [allreduce_fusion]: 3.64002e-06 [matmul_add_comm_reduction]: 1.25e-05 [allreduce_slice_to_reducescatter]: 9.89996e-07 [virtual_shard_identity]: 8.35001e-06 [virtual_dataset]: 6.63e-06 [get_grad_eliminate_]: 6.33e-06 [virtual_output]: 6.07999e-06 [merge_forward]: 4.26001e-06 [cell_reuse_recompute_pass]: 3.60998e-06 [offload_activation]: 1.003e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.534e-05 [merge_recompute_call_nodes]: 1.75001e-06 [before_grad]: 1.085e-05 [set_forward_comm_id_for_comm_node_pass]: 4.07e-06 [meta_fg_expand]: 2.49999e-06 [flash_sp_send_recv_attached]: 2.21e-06 [receive_attached]: 2.39999e-06 [after_resolve]: 1.38e-05 [a_after_grad]: 1.022e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.54001e-06 [auto_monad_grad]: 2.28002e-06 [auto_monad_eliminator]: 1.232e-05 [cse]: 2.121e-05 [a_3]: 5.455e-05 [py_interpret_to_execute_after_opt_a]: 2.191e-05 [slice_cell_reuse_recomputed_activation]: 5.40999e-06 [rewriter_after_opt_a]: 4.94e-05 [convert_after_rewriter]: 1.05e-05 [order_py_execute_after_rewriter]: 8.27e-06 [mutable_eliminate]: 0.00077744 [opt_b]: 0.00030222, [1] [Cycle 1]: 0.00028975, [7] [b_1]: 0.00017815 [b_2]: 9.49e-06 [updatestate_depend_eliminate]: 1.032e-05 [updatestate_assign_eliminate]: 2.99999e-06 [updatestate_loads_eliminate]: 3.13e-06 [renormalize]: 8.00006e-07 [cse]: 2.691e-05 [optimize_parallel_all_gather_comm]: 4.353e-05 [overlap_param_gather]: 6.19001e-06 [cconv]: 4.065e-05 [loop_unroll]: 0.00053509 [opt_after_cconv]: 0.00013854, [1] [Cycle 1]: 0.00012883, [7] [c_1]: 3.267e-05 [parameter_eliminate]: 6.32001e-06 [updatestate_depend_eliminate]: 6.74001e-06 [updatestate_assign_eliminate]: 2.78e-06 [updatestate_loads_eliminate]: 2.32999e-06 [cse]: 2.064e-05 [renormalize]: 7.2e-07 [remove_dup_value]: 1.794e-05 [tuple_transform]: 9.619e-05, [1] [Cycle 1]: 8.87e-05, [4] [d_1]: 4.778e-05 [none_parameter_eliminate]: 2.16998e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 7.03998e-06 [partial_unused_args_eliminate]: 4.47e-06 [add_recomputation]: 5.637e-05 [cse_after_recomputation]: 2.895e-05, [1] [Cycle 1]: 2.088e-05, [1] [cse]: 1.193e-05 [environ_conv]: 1.049e-05 [swap_dp_allreduce_reducescatter]: 8.17003e-06 [bias_add_comm_swap]: 6.17001e-06 [label_micro_interleaved_index]: 7.83999e-06 [label_fine_grained_interleaved_index]: 5.42001e-06 [merge_cast_opt]: 4.41002e-06 [slice_recompute_activation]: 4.56002e-06 [micro_interleaved_order_control]: 4.86002e-06 [assign_add_opt]: 3.53999e-06 [ForceFp32Comm]: 3.5e-06 [remove_cast_before_assign_add]: 3.73001e-06 [full_micro_interleaved_order_control]: 4.52e-06 [reorder_send_recv_between_fp_bp]: 5.74e-06 [comm_op_add_attrs]: 3.63999e-06 [add_comm_op_reuse_tag]: 3.26001e-06 [interleave_split_concat_branches]: 3.58e-06 [interleave_parallel_branches]: 3.70998e-06 [overlap_opt_shard_in_pipeline]: 3.36999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.30999e-06 [control_data_broadcast_order]: 1.668e-05 [grouped_pairwise_exchange_alltoall]: 4.07e-06 [offloading_packed_experts]: 6.68e-06 [overlap_recompute_and_grad_model_parallel]: 7.35998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.80998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.88001e-06 [overlap_recompute_comm]: 5.04003e-06 [overlap_grad_ring_attention]: 6.34001e-06 [overlap_grad_flash_sp]: 0.00085911 [begin_end_overlap_inline]: 4.50001e-06 [split_matmul_comm_elemetwise]: 5.76e-06 [split_layernorm_comm]: 4.45999e-06 [handle_group_info]: 3.68e-06 [symbol_engine_optimizer]: 0.00012481, [1] [Cycle 1]: 0.00011427, [6] [build]: 5.91e-06 [elim_shapecalc]: 1.782e-05 [elim_not_effective]: 1.831e-05 [opt_reshape]: 7.91001e-06 [fold_const_symbol]: 1.057e-05 [renormalize]: 5.59987e-07 [detach_backward]: 4.2e-06 [pipeline_parallel_scheduler]: 1.91998e-06 [auto_monad_reorder]: 2.375e-05 [get_jit_bprop_graph]: 2.09999e-06 [rewriter_after_jit_bprop_graph]: 6.39001e-06 [opt_after_jit_grad]: 0.00067013 [validate]: 4.535e-05 Sums bootstrap : 0.000473s : 3.30% type_inference : 0.006734s : 46.96% event_method : 0.000022s : 0.15% auto_monad : 0.000068s : 0.47% graph_reusing : 0.000007s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000047s : 0.33% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000041s : 0.28% optimize.rewriter_before_opt_a : 0.000111s : 0.77% optimize.opt_a.expand_dump_flag : 0.000006s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.38% optimize.opt_a.loop_unroll : 0.000039s : 0.27% optimize.opt_a.a_1 : 0.000911s : 6.36% optimize.opt_a.with_stream_mark : 0.000046s : 0.32% optimize.opt_a.recompute_prepare : 0.000022s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.04% optimize.opt_a.parameter_eliminate : 0.000006s : 0.04% optimize.opt_a.a_2 : 0.000257s : 1.79% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.13% optimize.opt_a.shard : 0.000006s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.09% optimize.opt_a.merge_send_recv : 0.000020s : 0.14% optimize.opt_a.auto_parallel : 0.000020s : 0.14% optimize.opt_a.parallel : 0.000031s : 0.21% optimize.opt_a.flash_sp : 0.000016s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.06% optimize.opt_a.allreduce_fusion : 0.000008s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.14% optimize.opt_a.virtual_dataset : 0.000015s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.10% optimize.opt_a.virtual_output : 0.000013s : 0.09% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.04% optimize.opt_a.offload_activation : 0.000021s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.06% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.04% optimize.opt_a.receive_attached : 0.000005s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.20% optimize.opt_a.a_after_grad : 0.000023s : 0.16% optimize.opt_a.renormalize : 0.000979s : 6.83% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.09% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000039s : 0.27% optimize.opt_a.cse : 0.000053s : 0.37% optimize.opt_a.a_3 : 0.000131s : 0.92% optimize.py_interpret_to_execute_after_opt_a : 0.000022s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000049s : 0.34% optimize.convert_after_rewriter : 0.000011s : 0.07% optimize.order_py_execute_after_rewriter : 0.000008s : 0.06% optimize.mutable_eliminate : 0.000777s : 5.42% optimize.opt_b.b_1 : 0.000178s : 1.24% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000027s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000044s : 0.30% optimize.overlap_param_gather : 0.000006s : 0.04% optimize.cconv : 0.000041s : 0.28% optimize.loop_unroll : 0.000535s : 3.73% optimize.opt_after_cconv.c_1 : 0.000033s : 0.23% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000021s : 0.14% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.13% optimize.tuple_transform.d_1 : 0.000048s : 0.33% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.05% optimize.partial_unused_args_eliminate : 0.000004s : 0.03% optimize.add_recomputation : 0.000056s : 0.39% optimize.cse_after_recomputation.cse : 0.000012s : 0.08% optimize.environ_conv : 0.000010s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.06% optimize.bias_add_comm_swap : 0.000006s : 0.04% optimize.label_micro_interleaved_index : 0.000008s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.03% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.02% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000017s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000006s : 0.04% optimize.overlap_grad_flash_sp : 0.000859s : 5.99% optimize.begin_end_overlap_inline : 0.000005s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000006s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000006s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000018s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.07% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.17% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.04% opt_after_jit_grad : 0.000670s : 4.67% validate : 0.000045s : 0.32% Time group info: ------[substitution.] 0.000285 34 15.90% : 0.000045s : 6: substitution.arithmetic_simplify 0.73% : 0.000002s : 2: substitution.elim_not_effective 0.54% : 0.000002s : 2: substitution.fold_const_symbol 2.23% : 0.000006s : 4: substitution.graph_param_transform 67.72% : 0.000193s : 4: substitution.inline 2.44% : 0.000007s : 4: substitution.j_node_and_user_rematch 2.07% : 0.000006s : 4: substitution.remove_not_recompute_node 2.78% : 0.000008s : 4: substitution.replace_old_param 5.58% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006667 2 87.70% : 0.005847s : 1: type_inference.infer 12.30% : 0.000820s : 1: type_inference.specialize ------[replace.] 0.000073 8 59.80% : 0.000044s : 4: replace.inline 40.20% : 0.000030s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000204 8 93.08% : 0.000190s : 4: match.inline 6.92% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000244 1278 1.03% : 0.000003s : 13: predicate.accumulaten_eliminater 0.73% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.66% : 0.000002s : 8: predicate.addn_check_dump 0.89% : 0.000002s : 13: predicate.addn_zero_filter 0.71% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 3.12% : 0.000008s : 21: predicate.arithmetic_simplify 0.97% : 0.000002s : 13: predicate.cast_eliminate 0.54% : 0.000001s : 8: predicate.check_bprop_eliminate 0.62% : 0.000002s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.58% : 0.000001s : 8: predicate.depend_value_elim 0.86% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.17% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.79% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.87% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.29% : 0.000001s : 4: predicate.elim_not_effective 0.67% : 0.000002s : 4: predicate.elim_shapecalc_of_broadcastargs 1.26% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.20% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.20% : 0.000003s : 17: predicate.environ_get_depend_swap 1.57% : 0.000004s : 25: predicate.environ_get_eliminate 1.21% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.28% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.18% : 0.000005s : 21: predicate.float_depend_g_call 0.62% : 0.000002s : 8: predicate.float_environ_get_switch 0.74% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 4: predicate.fold_const_symbol 0.81% : 0.000002s : 8: predicate.get_grad_eliminate 0.28% : 0.000001s : 4: predicate.graph_param_transform 0.48% : 0.000001s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 5.95% : 0.000015s : 58: predicate.inline 0.80% : 0.000002s : 8: predicate.inline_without_move 0.27% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.92% : 0.000002s : 8: predicate.less_batch_normalization 1.73% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.27% : 0.000006s : 38: predicate.load_eliminater 0.93% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.27% : 0.000006s : 34: predicate.loop_unroll_before_grad 1.68% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 8: predicate.merge_addn 0.60% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.72% : 0.000002s : 13: predicate.minmaximum_grad 1.67% : 0.000004s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.34% : 0.000001s : 4: predicate.parallel_virtual_node 1.79% : 0.000004s : 21: predicate.partial_defer_inline 1.38% : 0.000003s : 21: predicate.partial_eliminate 1.03% : 0.000003s : 13: predicate.print_const_string_wrapper 0.70% : 0.000002s : 8: predicate.reduce_all_const_elim 1.57% : 0.000004s : 13: predicate.reduce_eliminate 2.59% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 8: predicate.remove_not_recompute_node 1.56% : 0.000004s : 25: predicate.replace_applicator 0.51% : 0.000001s : 8: predicate.replace_old_param 0.39% : 0.000001s : 4: predicate.reset_defer_inline 1.05% : 0.000003s : 13: predicate.reshape_eliminate 0.58% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 4: predicate.row_tensor_eliminate 0.80% : 0.000002s : 8: predicate.same_eliminate 0.65% : 0.000002s : 8: predicate.set_cell_output_no_recompute 0.87% : 0.000002s : 8: predicate.shard_identity_eliminate 0.63% : 0.000002s : 8: predicate.special_op_eliminate 0.66% : 0.000002s : 8: predicate.specialize_transform 0.97% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 1.04% : 0.000003s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.35% : 0.000003s : 21: predicate.switch_defer_inline 1.80% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.53% : 0.000011s : 67: predicate.switch_simplify 1.00% : 0.000002s : 13: predicate.tile_eliminate 0.94% : 0.000002s : 13: predicate.transpose_eliminate 1.60% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.33% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.39% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.61% : 0.000009s : 33: predicate.tuple_list_get_item_eliminator 1.42% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.17% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.78% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.14% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.73% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.55% : 0.000001s : 4: predicate.value_based_eliminate 0.66% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.66% : 0.000002s : 8: predicate.virtual_output_eliminate 0.23% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000698 11 53.38% : 0.000373s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.62% : 0.000325s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.278769 192 0.00% : 0.000006s : 1: ForceFp32Comm 45.27% : 0.126208s : 1: add_attr 45.26% : 0.126182s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.02% : 0.000060s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.03% : 0.000077s : 1: auto_monad 0.01% : 0.000031s : 1: auto_monad_reorder 0.00% : 0.000009s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.19% : 0.000519s : 1: bootstrap 0.02% : 0.000044s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000020s : 1: control_data_broadcast_order 0.00% : 0.000014s : 1: convert_after_rewriter 0.01% : 0.000032s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000019s : 1: detach_backward 0.00% : 0.000014s : 1: environ_conv 0.01% : 0.000033s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.19% : 0.000542s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.28% : 0.000787s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 0.01% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000023s : 1: opt.transform.mutable_eliminate 0.51% : 0.001421s : 78: opt.transform.opt_a 0.01% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000111s : 28: opt.transform.opt_b 0.02% : 0.000053s : 2: opt.transform.opt_trans_graph 0.02% : 0.000049s : 4: opt.transform.symbol_engine_opt 1.39% : 0.003877s : 1: opt_a 0.05% : 0.000142s : 1: opt_after_cconv 0.24% : 0.000681s : 1: opt_after_jit_grad 0.11% : 0.000306s : 1: opt_b 2.83% : 0.007878s : 1: optimize 0.02% : 0.000047s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000011s : 1: order_py_execute_after_rewriter 0.31% : 0.000872s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000010s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000012s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000055s : 1: pre_auto_parallel 0.02% : 0.000045s : 1: py_interpret_to_execute 0.01% : 0.000026s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000021s : 1: remove_dup_value 0.18% : 0.000509s : 1: renormalize.infer 0.16% : 0.000456s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000054s : 1: rewriter_after_opt_a 0.04% : 0.000116s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000009s : 1: split_matmul_comm_elemetwise 0.00% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000128s : 1: symbol_engine_optimizer 0.04% : 0.000099s : 1: tuple_transform 2.43% : 0.006786s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:25.414.390 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.140973, [21] [bootstrap]: 0.124576 [type_inference]: 0.00599355 [event_method]: 1.95e-05 [auto_monad]: 6.403e-05 [graph_reusing]: 5.81e-06 [inline]: 2.96999e-06 [add_attr]: 0.00381006, [1] [add_attr_with_inline]: 0.00379796, [1] [Cycle 1]: 7.662e-05, [2] [tag_attr]: 2.549e-05 [meta_addattr_fg_expand]: 6.11e-06 [parallel-infer-symbol]: 3.65998e-06 [pre_auto_parallel]: 4.135e-05 [insert-virtual-dataset]: 2.43e-06 [parallel-infer-symbol-second]: 8.79983e-07 [dataset_repeat_opt]: 2.84999e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.00562757, [53] [py_interpret_to_execute]: 2.855e-05 [rewriter_before_opt_a]: 9.175e-05 [opt_a]: 0.00314831, [2] [Cycle 1]: 0.00235595, [45] [expand_dump_flag]: 2.89001e-06 [switch_simplify]: 4.44e-05 [loop_unroll]: 3.028e-05 [a_1]: 0.00067972 [with_stream_mark]: 2.02e-05 [recompute_prepare]: 9.22001e-06 [updatestate_depend_eliminate]: 4.38001e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 3.18998e-06 [parameter_eliminate]: 2.17001e-06 [a_2]: 8.877e-05 [accelerated_algorithm]: 7.47002e-06 [shard]: 2.43e-06 [meta_shard_fg_expand]: 2.01998e-06 [shard_inline]: 7.13e-06 [merge_send_recv]: 9.54e-06 [auto_parallel]: 6.89001e-06 [parallel]: 1.989e-05 [flash_sp]: 8.81002e-06 [merge_comm]: 4.02998e-06 [allreduce_fusion]: 3.51999e-06 [matmul_add_comm_reduction]: 1.109e-05 [allreduce_slice_to_reducescatter]: 8.59989e-07 [virtual_shard_identity]: 9.46e-06 [virtual_dataset]: 6.81001e-06 [get_grad_eliminate_]: 6.59001e-06 [virtual_output]: 7.00998e-06 [merge_forward]: 4.21001e-06 [cell_reuse_recompute_pass]: 1.22e-06 [offload_activation]: 1.003e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.375e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 1.173e-05 [set_forward_comm_id_for_comm_node_pass]: 3.95e-06 [meta_fg_expand]: 3.05998e-06 [flash_sp_send_recv_attached]: 2.59001e-06 [receive_attached]: 2.73e-06 [after_resolve]: 1.482e-05 [a_after_grad]: 1.095e-05 [renormalize]: 0.00085734 [add_forward_monad_depend]: 8.18001e-06 [auto_monad_grad]: 2.49999e-06 [auto_monad_eliminator]: 1.937e-05 [cse]: 3.069e-05 [a_3]: 5.56e-05 [Cycle 2]: 0.00078048, [45] [expand_dump_flag]: 2.41e-06 [switch_simplify]: 9.81e-06 [loop_unroll]: 6.23e-06 [a_1]: 0.00016362 [with_stream_mark]: 1.986e-05 [recompute_prepare]: 7.63999e-06 [updatestate_depend_eliminate]: 3.88999e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 3.21001e-06 [parameter_eliminate]: 2.10002e-06 [a_2]: 8.918e-05 [accelerated_algorithm]: 7.79002e-06 [shard]: 2.16998e-06 [meta_shard_fg_expand]: 2.31998e-06 [shard_inline]: 7.33e-06 [merge_send_recv]: 8.34998e-06 [auto_parallel]: 8.97e-06 [parallel]: 8.38999e-06 [flash_sp]: 3.95e-06 [merge_comm]: 3.92998e-06 [allreduce_fusion]: 3.62002e-06 [matmul_add_comm_reduction]: 1.183e-05 [allreduce_slice_to_reducescatter]: 7.30011e-07 [virtual_shard_identity]: 8.25999e-06 [virtual_dataset]: 6.69001e-06 [get_grad_eliminate_]: 6.53e-06 [virtual_output]: 6.76e-06 [merge_forward]: 4.23001e-06 [cell_reuse_recompute_pass]: 3.04001e-06 [offload_activation]: 8.99998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.31e-05 [merge_recompute_call_nodes]: 1.92001e-06 [before_grad]: 9.74e-06 [set_forward_comm_id_for_comm_node_pass]: 3.6e-06 [meta_fg_expand]: 2.70002e-06 [flash_sp_send_recv_attached]: 1.63002e-06 [receive_attached]: 1.99e-06 [after_resolve]: 1.235e-05 [a_after_grad]: 9.47999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.66e-06 [auto_monad_grad]: 1.30001e-06 [auto_monad_eliminator]: 9.48002e-06 [cse]: 1.742e-05 [a_3]: 3.694e-05 [py_interpret_to_execute_after_opt_a]: 1.429e-05 [slice_cell_reuse_recomputed_activation]: 2.12999e-06 [rewriter_after_opt_a]: 3.993e-05 [convert_after_rewriter]: 7.37002e-06 [order_py_execute_after_rewriter]: 4.87998e-06 [mutable_eliminate]: 0.00072503 [opt_b]: 0.00022712, [1] [Cycle 1]: 0.0002186, [7] [b_1]: 0.00013006 [b_2]: 9.02e-06 [updatestate_depend_eliminate]: 9.24998e-06 [updatestate_assign_eliminate]: 2.81e-06 [updatestate_loads_eliminate]: 2.84999e-06 [renormalize]: 6.30011e-07 [cse]: 2.493e-05 [optimize_parallel_all_gather_comm]: 1.962e-05 [overlap_param_gather]: 1.92999e-06 [cconv]: 3.572e-05 [loop_unroll]: 0.00050323 [opt_after_cconv]: 0.0001137, [1] [Cycle 1]: 0.00010724, [7] [c_1]: 3.136e-05 [parameter_eliminate]: 4.92999e-06 [updatestate_depend_eliminate]: 8e-06 [updatestate_assign_eliminate]: 2.61999e-06 [updatestate_loads_eliminate]: 2.79999e-06 [cse]: 2.222e-05 [renormalize]: 4.99975e-07 [remove_dup_value]: 1.513e-05 [tuple_transform]: 8.425e-05, [1] [Cycle 1]: 7.952e-05, [4] [d_1]: 4.969e-05 [none_parameter_eliminate]: 1.67001e-06 [renormalize]: 4.39992e-07 [switch_simplify]: 7.78001e-06 [partial_unused_args_eliminate]: 2.51e-06 [add_recomputation]: 5.669e-05 [cse_after_recomputation]: 2.289e-05, [1] [Cycle 1]: 1.747e-05, [1] [cse]: 1.154e-05 [environ_conv]: 5.91e-06 [swap_dp_allreduce_reducescatter]: 5.59998e-06 [bias_add_comm_swap]: 3.16999e-06 [label_micro_interleaved_index]: 5.99e-06 [label_fine_grained_interleaved_index]: 2.70997e-06 [merge_cast_opt]: 1.22e-06 [slice_recompute_activation]: 2.24001e-06 [micro_interleaved_order_control]: 2.56e-06 [assign_add_opt]: 1.26997e-06 [ForceFp32Comm]: 1.18001e-06 [remove_cast_before_assign_add]: 1.74e-06 [full_micro_interleaved_order_control]: 2.37999e-06 [reorder_send_recv_between_fp_bp]: 2.90002e-06 [comm_op_add_attrs]: 1.09003e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.45999e-06 [interleave_parallel_branches]: 1.29e-06 [overlap_opt_shard_in_pipeline]: 1.59e-06 [overlap_opt_shard_grad_in_pipeline]: 2.15002e-06 [control_data_broadcast_order]: 5.66e-05 [grouped_pairwise_exchange_alltoall]: 1.73997e-06 [offloading_packed_experts]: 4.40999e-06 [overlap_recompute_and_grad_model_parallel]: 5.49e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.23002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39998e-06 [overlap_recompute_comm]: 2.69001e-06 [overlap_grad_ring_attention]: 4.57e-06 [overlap_grad_flash_sp]: 2.305e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.08998e-06 [split_layernorm_comm]: 1.89999e-06 [handle_group_info]: 9.39996e-07 [symbol_engine_optimizer]: 8.514e-05, [1] [Cycle 1]: 8.008e-05, [6] [build]: 4.35e-06 [elim_shapecalc]: 1.178e-05 [elim_not_effective]: 1.417e-05 [opt_reshape]: 8.03999e-06 [fold_const_symbol]: 1.085e-05 [renormalize]: 2.79979e-07 [detach_backward]: 2.06998e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 1.971e-05 [get_jit_bprop_graph]: 2.21998e-06 [rewriter_after_jit_bprop_graph]: 6.68998e-06 [opt_after_jit_grad]: 0.0005696 [validate]: 4.905e-05 Sums bootstrap : 0.124576s : 91.55% type_inference : 0.005994s : 4.40% event_method : 0.000020s : 0.01% auto_monad : 0.000064s : 0.05% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000041s : 0.03% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000029s : 0.02% optimize.rewriter_before_opt_a : 0.000092s : 0.07% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000054s : 0.04% optimize.opt_a.loop_unroll : 0.000037s : 0.03% optimize.opt_a.a_1 : 0.000843s : 0.62% optimize.opt_a.with_stream_mark : 0.000040s : 0.03% optimize.opt_a.recompute_prepare : 0.000017s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000178s : 0.13% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.01% optimize.opt_a.merge_send_recv : 0.000018s : 0.01% optimize.opt_a.auto_parallel : 0.000016s : 0.01% optimize.opt_a.parallel : 0.000028s : 0.02% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.01% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000014s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000019s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000027s : 0.02% optimize.opt_a.a_after_grad : 0.000020s : 0.02% optimize.opt_a.renormalize : 0.000857s : 0.63% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.02% optimize.opt_a.cse : 0.000048s : 0.04% optimize.opt_a.a_3 : 0.000093s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000040s : 0.03% optimize.convert_after_rewriter : 0.000007s : 0.01% optimize.order_py_execute_after_rewriter : 0.000005s : 0.00% optimize.mutable_eliminate : 0.000725s : 0.53% optimize.opt_b.b_1 : 0.000130s : 0.10% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000025s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000036s : 0.03% optimize.loop_unroll : 0.000503s : 0.37% optimize.opt_after_cconv.c_1 : 0.000031s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000022s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.01% optimize.tuple_transform.d_1 : 0.000050s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000003s : 0.00% optimize.add_recomputation : 0.000057s : 0.04% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000006s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000002s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000057s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000023s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000570s : 0.42% validate : 0.000049s : 0.04% Time group info: ------[substitution.] 0.000248 34 15.52% : 0.000039s : 6: substitution.arithmetic_simplify 0.78% : 0.000002s : 2: substitution.elim_not_effective 0.55% : 0.000001s : 2: substitution.fold_const_symbol 2.64% : 0.000007s : 4: substitution.graph_param_transform 67.19% : 0.000167s : 4: substitution.inline 1.59% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.46% : 0.000006s : 4: substitution.remove_not_recompute_node 2.72% : 0.000007s : 4: substitution.replace_old_param 6.55% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005925 2 87.69% : 0.005196s : 1: type_inference.infer 12.31% : 0.000729s : 1: type_inference.specialize ------[replace.] 0.000065 8 62.93% : 0.000041s : 4: replace.inline 37.07% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000179 8 91.89% : 0.000164s : 4: match.inline 8.11% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000224 1278 1.00% : 0.000002s : 13: predicate.accumulaten_eliminater 1.00% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 0.86% : 0.000002s : 13: predicate.addn_zero_filter 0.75% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.59% : 0.000006s : 21: predicate.arithmetic_simplify 0.87% : 0.000002s : 13: predicate.cast_eliminate 0.61% : 0.000001s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.61% : 0.000001s : 8: predicate.depend_value_elim 0.97% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.09% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.95% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.45% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 4: predicate.elim_not_effective 0.43% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.29% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_depend_swap 1.78% : 0.000004s : 25: predicate.environ_get_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.42% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.49% : 0.000006s : 21: predicate.float_depend_g_call 0.57% : 0.000001s : 8: predicate.float_environ_get_switch 0.71% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.68% : 0.000002s : 8: predicate.get_grad_eliminate 0.31% : 0.000001s : 4: predicate.graph_param_transform 0.56% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.27% : 0.000014s : 58: predicate.inline 0.98% : 0.000002s : 8: predicate.inline_without_move 0.54% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.91% : 0.000002s : 8: predicate.less_batch_normalization 1.77% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.45% : 0.000005s : 38: predicate.load_eliminater 0.92% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.18% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.42% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 8: predicate.merge_addn 0.54% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 13: predicate.minmaximum_grad 1.37% : 0.000003s : 4: predicate.mutable_eliminate 0.47% : 0.000001s : 4: predicate.opt_reshape 0.43% : 0.000001s : 4: predicate.parallel_virtual_node 1.59% : 0.000004s : 21: predicate.partial_defer_inline 1.53% : 0.000003s : 21: predicate.partial_eliminate 0.84% : 0.000002s : 13: predicate.print_const_string_wrapper 0.54% : 0.000001s : 8: predicate.reduce_all_const_elim 1.10% : 0.000002s : 13: predicate.reduce_eliminate 2.55% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.63% : 0.000001s : 8: predicate.remove_not_recompute_node 1.25% : 0.000003s : 25: predicate.replace_applicator 0.56% : 0.000001s : 8: predicate.replace_old_param 0.43% : 0.000001s : 4: predicate.reset_defer_inline 0.94% : 0.000002s : 13: predicate.reshape_eliminate 0.63% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 4: predicate.row_tensor_eliminate 0.79% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.91% : 0.000002s : 8: predicate.shard_identity_eliminate 0.77% : 0.000002s : 8: predicate.special_op_eliminate 0.75% : 0.000002s : 8: predicate.specialize_transform 0.89% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.45% : 0.000003s : 21: predicate.switch_defer_inline 1.93% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.07% : 0.000011s : 67: predicate.switch_simplify 0.82% : 0.000002s : 13: predicate.tile_eliminate 1.16% : 0.000003s : 13: predicate.transpose_eliminate 1.57% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.37% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.20% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.31% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.19% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.70% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.41% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.89% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 4: predicate.value_based_eliminate 0.64% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.66% : 0.000001s : 8: predicate.virtual_output_eliminate 0.29% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000579 11 53.37% : 0.000309s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.63% : 0.000270s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.152721 192 0.00% : 0.000004s : 1: ForceFp32Comm 2.50% : 0.003816s : 1: add_attr 2.49% : 0.003803s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.04% : 0.000061s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000069s : 1: auto_monad 0.02% : 0.000024s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 81.59% : 0.124612s : 1: bootstrap 0.03% : 0.000039s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000063s : 1: control_data_broadcast_order 0.01% : 0.000011s : 1: convert_after_rewriter 0.02% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.02% : 0.000026s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000009s : 1: label_micro_interleaved_index 0.34% : 0.000512s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.48% : 0.000738s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000020s : 1: opt.transform.mutable_eliminate 0.85% : 0.001305s : 78: opt.transform.opt_a 0.02% : 0.000030s : 1: opt.transform.opt_after_cconv 0.02% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000106s : 28: opt.transform.opt_b 0.04% : 0.000055s : 2: opt.transform.opt_trans_graph 0.03% : 0.000041s : 4: opt.transform.symbol_engine_opt 2.06% : 0.003152s : 1: opt_a 0.08% : 0.000118s : 1: opt_after_cconv 0.38% : 0.000581s : 1: opt_after_jit_grad 0.15% : 0.000231s : 1: opt_b 3.69% : 0.005634s : 1: optimize 0.02% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.02% : 0.000026s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.03% : 0.000046s : 1: pre_auto_parallel 0.02% : 0.000033s : 1: py_interpret_to_execute 0.01% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 0.31% : 0.000474s : 1: renormalize.infer 0.24% : 0.000374s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000045s : 1: rewriter_after_opt_a 0.06% : 0.000097s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.06% : 0.000088s : 1: symbol_engine_optimizer 0.06% : 0.000087s : 1: tuple_transform 3.94% : 0.006014s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:27.203.487 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:27.203.770 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.14353, [21] [bootstrap]: 0.00043248 [type_inference]: 0.131289 [event_method]: 2.586e-05 [auto_monad]: 7.028e-05 [graph_reusing]: 5.97001e-06 [inline]: 3.04999e-06 [add_attr]: 0.00378781, [1] [add_attr_with_inline]: 0.00377368, [1] [Cycle 1]: 9.587e-05, [2] [tag_attr]: 2.34e-05 [meta_addattr_fg_expand]: 5.71003e-06 [parallel-infer-symbol]: 3.79002e-06 [pre_auto_parallel]: 4.526e-05 [insert-virtual-dataset]: 2.38998e-06 [parallel-infer-symbol-second]: 7.00005e-07 [dataset_repeat_opt]: 2.39999e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.00655783, [53] [py_interpret_to_execute]: 3.604e-05 [rewriter_before_opt_a]: 0.00010179 [opt_a]: 0.00367855, [2] [Cycle 1]: 0.00268456, [45] [expand_dump_flag]: 3.25e-06 [switch_simplify]: 4.541e-05 [loop_unroll]: 3.073e-05 [a_1]: 0.0007334 [with_stream_mark]: 2.743e-05 [recompute_prepare]: 1.185e-05 [updatestate_depend_eliminate]: 4.38001e-06 [updatestate_assign_eliminate]: 3.48e-06 [updatestate_loads_eliminate]: 2.94001e-06 [parameter_eliminate]: 2.44001e-06 [a_2]: 0.00012195 [accelerated_algorithm]: 9.43002e-06 [shard]: 2.11e-06 [meta_shard_fg_expand]: 2.71999e-06 [shard_inline]: 8.30999e-06 [merge_send_recv]: 1.003e-05 [auto_parallel]: 1.077e-05 [parallel]: 2.04e-05 [flash_sp]: 1.178e-05 [merge_comm]: 4.02002e-06 [allreduce_fusion]: 4.23001e-06 [matmul_add_comm_reduction]: 1.085e-05 [allreduce_slice_to_reducescatter]: 9.79984e-07 [virtual_shard_identity]: 1.322e-05 [virtual_dataset]: 7.88001e-06 [get_grad_eliminate_]: 6.91999e-06 [virtual_output]: 7.3e-06 [merge_forward]: 4.94e-06 [cell_reuse_recompute_pass]: 2.01e-06 [offload_activation]: 1.293e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.936e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.256e-05 [set_forward_comm_id_for_comm_node_pass]: 4.18999e-06 [meta_fg_expand]: 2.83e-06 [flash_sp_send_recv_attached]: 3.51999e-06 [receive_attached]: 2.43e-06 [after_resolve]: 1.409e-05 [a_after_grad]: 1.139e-05 [renormalize]: 0.00089813 [add_forward_monad_depend]: 8.3e-06 [auto_monad_grad]: 2.83e-06 [auto_monad_eliminator]: 2.093e-05 [cse]: 3.109e-05 [a_3]: 7.095e-05 [Cycle 2]: 0.00097372, [45] [expand_dump_flag]: 2.53998e-06 [switch_simplify]: 1.071e-05 [loop_unroll]: 6.84999e-06 [a_1]: 0.00015332 [with_stream_mark]: 1.933e-05 [recompute_prepare]: 8.07e-06 [updatestate_depend_eliminate]: 3.58e-06 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 2.79001e-06 [parameter_eliminate]: 1.69998e-06 [a_2]: 0.0001125 [accelerated_algorithm]: 8.12e-06 [shard]: 2.34999e-06 [meta_shard_fg_expand]: 2.62001e-06 [shard_inline]: 6.54001e-06 [merge_send_recv]: 9.46003e-06 [auto_parallel]: 9.32001e-06 [parallel]: 8.15e-06 [flash_sp]: 5.32001e-06 [merge_comm]: 3.88999e-06 [allreduce_fusion]: 3.67002e-06 [matmul_add_comm_reduction]: 9.83998e-06 [allreduce_slice_to_reducescatter]: 7.10017e-07 [virtual_shard_identity]: 9.36e-06 [virtual_dataset]: 6.04999e-06 [get_grad_eliminate_]: 5.92001e-06 [virtual_output]: 7.88001e-06 [merge_forward]: 6.49999e-06 [cell_reuse_recompute_pass]: 2.59001e-06 [offload_activation]: 9.77001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.644e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.074e-05 [set_forward_comm_id_for_comm_node_pass]: 4.75999e-06 [meta_fg_expand]: 2.36e-06 [flash_sp_send_recv_attached]: 1.50999e-06 [receive_attached]: 2.29999e-06 [after_resolve]: 1.552e-05 [a_after_grad]: 1.077e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 3.39001e-06 [auto_monad_grad]: 2.48e-06 [auto_monad_eliminator]: 1.28e-05 [cse]: 2.143e-05 [a_3]: 5.513e-05 [py_interpret_to_execute_after_opt_a]: 2.261e-05 [slice_cell_reuse_recomputed_activation]: 5.42001e-06 [rewriter_after_opt_a]: 5.044e-05 [convert_after_rewriter]: 1.187e-05 [order_py_execute_after_rewriter]: 8.42998e-06 [mutable_eliminate]: 0.00079447 [opt_b]: 0.00030335, [1] [Cycle 1]: 0.00029073, [7] [b_1]: 0.00017454 [b_2]: 9.31002e-06 [updatestate_depend_eliminate]: 1.074e-05 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 2.88003e-06 [renormalize]: 9.10019e-07 [cse]: 2.998e-05 [optimize_parallel_all_gather_comm]: 2.687e-05 [overlap_param_gather]: 4.84998e-06 [cconv]: 4.389e-05 [loop_unroll]: 0.00052458 [opt_after_cconv]: 0.00013965, [1] [Cycle 1]: 0.00012964, [7] [c_1]: 3.266e-05 [parameter_eliminate]: 5.24998e-06 [updatestate_depend_eliminate]: 7.39002e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 2.75002e-06 [cse]: 2.203e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 1.774e-05 [tuple_transform]: 9.863e-05, [1] [Cycle 1]: 9.113e-05, [4] [d_1]: 5.072e-05 [none_parameter_eliminate]: 1.57999e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 7.16001e-06 [partial_unused_args_eliminate]: 5.04e-06 [add_recomputation]: 6.003e-05 [cse_after_recomputation]: 2.893e-05, [1] [Cycle 1]: 2.161e-05, [1] [cse]: 1.199e-05 [environ_conv]: 9.77001e-06 [swap_dp_allreduce_reducescatter]: 7.95998e-06 [bias_add_comm_swap]: 6.06e-06 [label_micro_interleaved_index]: 7.71999e-06 [label_fine_grained_interleaved_index]: 5.74999e-06 [merge_cast_opt]: 3.75998e-06 [slice_recompute_activation]: 4.58999e-06 [micro_interleaved_order_control]: 5.69999e-06 [assign_add_opt]: 3.73001e-06 [ForceFp32Comm]: 3.37002e-06 [remove_cast_before_assign_add]: 3.37002e-06 [full_micro_interleaved_order_control]: 5.18002e-06 [reorder_send_recv_between_fp_bp]: 5.77001e-06 [comm_op_add_attrs]: 3.7e-06 [add_comm_op_reuse_tag]: 3.55e-06 [interleave_split_concat_branches]: 3.78001e-06 [interleave_parallel_branches]: 3.58e-06 [overlap_opt_shard_in_pipeline]: 3.63999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.45999e-06 [control_data_broadcast_order]: 1.598e-05 [grouped_pairwise_exchange_alltoall]: 4.28001e-06 [offloading_packed_experts]: 7.70998e-06 [overlap_recompute_and_grad_model_parallel]: 8.07e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.74002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.66999e-06 [overlap_recompute_comm]: 5.406e-05 [overlap_grad_ring_attention]: 8.94e-06 [overlap_grad_flash_sp]: 2.689e-05 [begin_end_overlap_inline]: 3.08998e-06 [split_matmul_comm_elemetwise]: 5.10001e-06 [split_layernorm_comm]: 4.23001e-06 [handle_group_info]: 3.54002e-06 [symbol_engine_optimizer]: 0.00010637, [1] [Cycle 1]: 9.723e-05, [6] [build]: 4.57e-06 [elim_shapecalc]: 1.11e-05 [elim_not_effective]: 1.465e-05 [opt_reshape]: 7.38999e-06 [fold_const_symbol]: 1.046e-05 [renormalize]: 2.40019e-07 [detach_backward]: 4.15e-06 [pipeline_parallel_scheduler]: 1.76e-06 [auto_monad_reorder]: 2.039e-05 [get_jit_bprop_graph]: 2.14999e-06 [rewriter_after_jit_bprop_graph]: 5.62001e-06 [opt_after_jit_grad]: 0.00059782 [validate]: 4.537e-05 Sums bootstrap : 0.000432s : 0.31% type_inference : 0.131289s : 95.26% event_method : 0.000026s : 0.02% auto_monad : 0.000070s : 0.05% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000045s : 0.03% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000036s : 0.03% optimize.rewriter_before_opt_a : 0.000102s : 0.07% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000056s : 0.04% optimize.opt_a.loop_unroll : 0.000038s : 0.03% optimize.opt_a.a_1 : 0.000887s : 0.64% optimize.opt_a.with_stream_mark : 0.000047s : 0.03% optimize.opt_a.recompute_prepare : 0.000020s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000234s : 0.17% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000015s : 0.01% optimize.opt_a.merge_send_recv : 0.000019s : 0.01% optimize.opt_a.auto_parallel : 0.000020s : 0.01% optimize.opt_a.parallel : 0.000029s : 0.02% optimize.opt_a.flash_sp : 0.000017s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.02% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000015s : 0.01% optimize.opt_a.merge_forward : 0.000011s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000030s : 0.02% optimize.opt_a.a_after_grad : 0.000022s : 0.02% optimize.opt_a.renormalize : 0.000898s : 0.65% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.02% optimize.opt_a.cse : 0.000053s : 0.04% optimize.opt_a.a_3 : 0.000126s : 0.09% optimize.py_interpret_to_execute_after_opt_a : 0.000023s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000050s : 0.04% optimize.convert_after_rewriter : 0.000012s : 0.01% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000794s : 0.58% optimize.opt_b.b_1 : 0.000175s : 0.13% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000027s : 0.02% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000044s : 0.03% optimize.loop_unroll : 0.000525s : 0.38% optimize.opt_after_cconv.c_1 : 0.000033s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000022s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.01% optimize.tuple_transform.d_1 : 0.000051s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000060s : 0.04% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000010s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000006s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000003s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000016s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000054s : 0.04% optimize.overlap_grad_ring_attention : 0.000009s : 0.01% optimize.overlap_grad_flash_sp : 0.000027s : 0.02% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000598s : 0.43% validate : 0.000045s : 0.03% Time group info: ------[substitution.] 0.000265 34 16.36% : 0.000043s : 6: substitution.arithmetic_simplify 0.88% : 0.000002s : 2: substitution.elim_not_effective 0.74% : 0.000002s : 2: substitution.fold_const_symbol 2.51% : 0.000007s : 4: substitution.graph_param_transform 66.50% : 0.000176s : 4: substitution.inline 1.85% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.36% : 0.000006s : 4: substitution.remove_not_recompute_node 2.78% : 0.000007s : 4: substitution.replace_old_param 6.03% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.131217 2 99.35% : 0.130369s : 1: type_inference.infer 0.65% : 0.000849s : 1: type_inference.specialize ------[replace.] 0.000070 8 61.23% : 0.000043s : 4: replace.inline 38.77% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000186 8 92.40% : 0.000172s : 4: match.inline 7.60% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000232 1278 0.78% : 0.000002s : 13: predicate.accumulaten_eliminater 0.84% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 8: predicate.addn_check_dump 1.12% : 0.000003s : 13: predicate.addn_zero_filter 0.76% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.63% : 0.000006s : 21: predicate.arithmetic_simplify 0.91% : 0.000002s : 13: predicate.cast_eliminate 0.83% : 0.000002s : 8: predicate.check_bprop_eliminate 0.58% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.64% : 0.000001s : 8: predicate.depend_value_elim 0.80% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.14% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 4: predicate.elim_not_effective 0.38% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 17: predicate.environ_add_const_eliminate 0.95% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.96% : 0.000002s : 17: predicate.environ_get_depend_swap 1.58% : 0.000004s : 25: predicate.environ_get_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.45% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.59% : 0.000006s : 21: predicate.float_depend_g_call 0.52% : 0.000001s : 8: predicate.float_environ_get_switch 0.73% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.63% : 0.000001s : 8: predicate.get_grad_eliminate 0.28% : 0.000001s : 4: predicate.graph_param_transform 0.53% : 0.000001s : 8: predicate.incorporate_call 0.41% : 0.000001s : 8: predicate.incorporate_call_switch 7.01% : 0.000016s : 58: predicate.inline 0.92% : 0.000002s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.22% : 0.000003s : 8: predicate.less_batch_normalization 1.76% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.21% : 0.000005s : 38: predicate.load_eliminater 1.25% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.25% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.49% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 8: predicate.merge_addn 0.59% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.75% : 0.000002s : 13: predicate.minmaximum_grad 1.52% : 0.000004s : 4: predicate.mutable_eliminate 0.41% : 0.000001s : 4: predicate.opt_reshape 0.52% : 0.000001s : 4: predicate.parallel_virtual_node 2.01% : 0.000005s : 21: predicate.partial_defer_inline 1.41% : 0.000003s : 21: predicate.partial_eliminate 0.82% : 0.000002s : 13: predicate.print_const_string_wrapper 0.61% : 0.000001s : 8: predicate.reduce_all_const_elim 1.14% : 0.000003s : 13: predicate.reduce_eliminate 2.37% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.54% : 0.000001s : 8: predicate.remove_not_recompute_node 1.17% : 0.000003s : 25: predicate.replace_applicator 0.61% : 0.000001s : 8: predicate.replace_old_param 0.54% : 0.000001s : 4: predicate.reset_defer_inline 0.86% : 0.000002s : 13: predicate.reshape_eliminate 0.63% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 4: predicate.row_tensor_eliminate 0.98% : 0.000002s : 8: predicate.same_eliminate 0.41% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.85% : 0.000002s : 8: predicate.shard_identity_eliminate 0.76% : 0.000002s : 8: predicate.special_op_eliminate 0.64% : 0.000001s : 8: predicate.specialize_transform 1.14% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 1.10% : 0.000003s : 8: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.34% : 0.000003s : 21: predicate.switch_defer_inline 1.82% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.88% : 0.000011s : 67: predicate.switch_simplify 0.89% : 0.000002s : 13: predicate.tile_eliminate 0.77% : 0.000002s : 13: predicate.transpose_eliminate 1.36% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.32% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.23% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.36% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.30% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.73% : 0.000006s : 29: predicate.tuple_list_set_item_eliminator 1.68% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.09% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.89% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 4: predicate.value_based_eliminate 0.67% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.58% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.66% : 0.000002s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.124782 11 99.74% : 0.124452s : 5: func_graph_cloner_run.FuncGraphClonerGraph 0.26% : 0.000329s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.156323 192 0.00% : 0.000006s : 1: ForceFp32Comm 2.43% : 0.003800s : 1: add_attr 2.42% : 0.003778s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.04% : 0.000064s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.05% : 0.000081s : 1: auto_monad 0.02% : 0.000029s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.31% : 0.000479s : 1: bootstrap 0.03% : 0.000048s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000019s : 1: control_data_broadcast_order 0.01% : 0.000015s : 1: convert_after_rewriter 0.02% : 0.000033s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000022s : 1: detach_backward 0.01% : 0.000013s : 1: environ_conv 0.02% : 0.000038s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.34% : 0.000533s : 1: loop_unroll 0.00% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000009s : 1: micro_interleaved_order_control 0.52% : 0.000806s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.01% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000046s : 1: opt.transform.mutable_eliminate 0.88% : 0.001380s : 78: opt.transform.opt_a 0.02% : 0.000031s : 1: opt.transform.opt_after_cconv 0.02% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000108s : 28: opt.transform.opt_b 0.04% : 0.000056s : 2: opt.transform.opt_trans_graph 0.03% : 0.000040s : 4: opt.transform.symbol_engine_opt 2.36% : 0.003683s : 1: opt_a 0.09% : 0.000143s : 1: opt_after_cconv 0.39% : 0.000609s : 1: opt_after_jit_grad 0.20% : 0.000307s : 1: opt_b 4.41% : 0.006901s : 1: optimize 0.02% : 0.000031s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.02% : 0.000030s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.04% : 0.000059s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.03% : 0.000053s : 1: pre_auto_parallel 0.03% : 0.000040s : 1: py_interpret_to_execute 0.02% : 0.000028s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000021s : 1: remove_dup_value 0.30% : 0.000474s : 1: renormalize.infer 0.26% : 0.000413s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000056s : 1: rewriter_after_opt_a 0.07% : 0.000107s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000109s : 1: symbol_engine_optimizer 0.06% : 0.000102s : 1: tuple_transform 84.02% : 0.131344s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:28.942.495 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.163592, [21] [bootstrap]: 0.00048905 [type_inference]: 0.152475 [event_method]: 2.325e-05 [auto_monad]: 7.279e-05 [graph_reusing]: 6.49001e-06 [inline]: 2.94001e-06 [add_attr]: 0.00387677, [1] [add_attr_with_inline]: 0.00386386, [1] [Cycle 1]: 7.632e-05, [2] [tag_attr]: 2.376e-05 [meta_addattr_fg_expand]: 5.76e-06 [parallel-infer-symbol]: 3.67002e-06 [pre_auto_parallel]: 4.225e-05 [insert-virtual-dataset]: 2.81e-06 [parallel-infer-symbol-second]: 8.30012e-07 [dataset_repeat_opt]: 2.33998e-06 [pipeline_split]: 1.97001e-06 [optimize]: 0.00578419, [53] [py_interpret_to_execute]: 3.166e-05 [rewriter_before_opt_a]: 9.28e-05 [opt_a]: 0.00340474, [2] [Cycle 1]: 0.00226108, [45] [expand_dump_flag]: 3.14999e-06 [switch_simplify]: 4.403e-05 [loop_unroll]: 3.026e-05 [a_1]: 0.00067963 [with_stream_mark]: 1.923e-05 [recompute_prepare]: 9.96e-06 [updatestate_depend_eliminate]: 4.62998e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 3.07002e-06 [parameter_eliminate]: 2.64999e-06 [a_2]: 8.722e-05 [accelerated_algorithm]: 7.84002e-06 [shard]: 2.17001e-06 [meta_shard_fg_expand]: 1.94e-06 [shard_inline]: 6.44999e-06 [merge_send_recv]: 8.76997e-06 [auto_parallel]: 6.69999e-06 [parallel]: 2.014e-05 [flash_sp]: 8.67e-06 [merge_comm]: 4.13999e-06 [allreduce_fusion]: 3.65e-06 [matmul_add_comm_reduction]: 9.34e-06 [allreduce_slice_to_reducescatter]: 8.49977e-07 [virtual_shard_identity]: 8.07e-06 [virtual_dataset]: 6.70998e-06 [get_grad_eliminate_]: 6.12001e-06 [virtual_output]: 6.70998e-06 [merge_forward]: 4.23001e-06 [cell_reuse_recompute_pass]: 1.57999e-06 [offload_activation]: 1.083e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.166e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.055e-05 [set_forward_comm_id_for_comm_node_pass]: 4.08999e-06 [meta_fg_expand]: 2.93e-06 [flash_sp_send_recv_attached]: 2.36998e-06 [receive_attached]: 2.70002e-06 [after_resolve]: 1.191e-05 [a_after_grad]: 1.007e-05 [renormalize]: 0.00081066 [add_forward_monad_depend]: 7.48999e-06 [auto_monad_grad]: 2.64999e-06 [auto_monad_eliminator]: 2.008e-05 [cse]: 3.335e-05 [a_3]: 5.461e-05 [Cycle 2]: 0.00113134, [45] [expand_dump_flag]: 1.47001e-06 [switch_simplify]: 9.34e-06 [loop_unroll]: 6.24999e-06 [a_1]: 0.00014895 [with_stream_mark]: 1.652e-05 [recompute_prepare]: 8.15e-06 [updatestate_depend_eliminate]: 4.18001e-06 [updatestate_assign_eliminate]: 3.03e-06 [updatestate_loads_eliminate]: 2.79999e-06 [parameter_eliminate]: 1.59e-06 [a_2]: 8.081e-05 [accelerated_algorithm]: 0.0002032 [shard]: 1.63002e-06 [meta_shard_fg_expand]: 3.6e-06 [shard_inline]: 1.268e-05 [merge_send_recv]: 1.006e-05 [auto_parallel]: 9.49e-06 [parallel]: 9.02e-06 [flash_sp]: 4.65999e-06 [merge_comm]: 4.42998e-06 [allreduce_fusion]: 3.28e-06 [matmul_add_comm_reduction]: 8.09002e-06 [allreduce_slice_to_reducescatter]: 1.15001e-06 [virtual_shard_identity]: 9.32001e-06 [virtual_dataset]: 6.44999e-06 [get_grad_eliminate_]: 6.29999e-06 [virtual_output]: 5.78002e-06 [merge_forward]: 4.35e-06 [cell_reuse_recompute_pass]: 2.37999e-06 [offload_activation]: 9.64e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.54e-05 [merge_recompute_call_nodes]: 1.32e-06 [before_grad]: 1.269e-05 [set_forward_comm_id_for_comm_node_pass]: 4e-06 [meta_fg_expand]: 2.34999e-06 [flash_sp_send_recv_attached]: 1.44998e-06 [receive_attached]: 2.22001e-06 [after_resolve]: 1.33e-05 [a_after_grad]: 1.003e-05 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 4e-06 [auto_monad_grad]: 1.80001e-06 [auto_monad_eliminator]: 1.139e-05 [cse]: 1.918e-05 [a_3]: 3.962e-05 [py_interpret_to_execute_after_opt_a]: 1.686e-05 [slice_cell_reuse_recomputed_activation]: 2.22999e-06 [rewriter_after_opt_a]: 3.974e-05 [convert_after_rewriter]: 7.55e-06 [order_py_execute_after_rewriter]: 5.19e-06 [mutable_eliminate]: 0.00070875 [opt_b]: 0.00022096, [1] [Cycle 1]: 0.00021297, [7] [b_1]: 0.00012565 [b_2]: 8.79e-06 [updatestate_depend_eliminate]: 9.89001e-06 [updatestate_assign_eliminate]: 2.78998e-06 [updatestate_loads_eliminate]: 2.44001e-06 [renormalize]: 8.09989e-07 [cse]: 2.516e-05 [optimize_parallel_all_gather_comm]: 2.023e-05 [overlap_param_gather]: 2.02999e-06 [cconv]: 3.915e-05 [loop_unroll]: 0.00048209 [opt_after_cconv]: 0.00011373, [1] [Cycle 1]: 0.0001065, [7] [c_1]: 3.321e-05 [parameter_eliminate]: 4.87e-06 [updatestate_depend_eliminate]: 6.16e-06 [updatestate_assign_eliminate]: 2.50997e-06 [updatestate_loads_eliminate]: 3.14999e-06 [cse]: 2.148e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.544e-05 [tuple_transform]: 7.903e-05, [1] [Cycle 1]: 7.421e-05, [4] [d_1]: 4.713e-05 [none_parameter_eliminate]: 1.84e-06 [renormalize]: 4.19997e-07 [switch_simplify]: 7.01999e-06 [partial_unused_args_eliminate]: 1.81e-06 [add_recomputation]: 5.481e-05 [cse_after_recomputation]: 2.238e-05, [1] [Cycle 1]: 1.787e-05, [1] [cse]: 1.177e-05 [environ_conv]: 6.02001e-06 [swap_dp_allreduce_reducescatter]: 5.43002e-06 [bias_add_comm_swap]: 3.31999e-06 [label_micro_interleaved_index]: 4.90001e-06 [label_fine_grained_interleaved_index]: 2.83e-06 [merge_cast_opt]: 1.31002e-06 [slice_recompute_activation]: 2.17001e-06 [micro_interleaved_order_control]: 2.73998e-06 [assign_add_opt]: 1.15999e-06 [ForceFp32Comm]: 1.08001e-06 [remove_cast_before_assign_add]: 1.10999e-06 [full_micro_interleaved_order_control]: 2.41998e-06 [reorder_send_recv_between_fp_bp]: 3.03e-06 [comm_op_add_attrs]: 1.05001e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.20001e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.31998e-06 [overlap_opt_shard_grad_in_pipeline]: 2.02001e-06 [control_data_broadcast_order]: 1.54e-05 [grouped_pairwise_exchange_alltoall]: 1.52001e-06 [offloading_packed_experts]: 4.25e-06 [overlap_recompute_and_grad_model_parallel]: 4.93001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.27e-06 [overlap_recompute_allgather_and_fa_grad]: 1.74e-06 [overlap_recompute_comm]: 2.49001e-06 [overlap_grad_ring_attention]: 4.34997e-06 [overlap_grad_flash_sp]: 2.259e-05 [begin_end_overlap_inline]: 6.19999e-07 [split_matmul_comm_elemetwise]: 2.07001e-06 [split_layernorm_comm]: 1.67999e-06 [handle_group_info]: 1.08001e-06 [symbol_engine_optimizer]: 8.124e-05, [1] [Cycle 1]: 7.65e-05, [6] [build]: 3.89997e-06 [elim_shapecalc]: 1.091e-05 [elim_not_effective]: 1.421e-05 [opt_reshape]: 8.01001e-06 [fold_const_symbol]: 1.012e-05 [renormalize]: 3.30008e-07 [detach_backward]: 2.33998e-06 [pipeline_parallel_scheduler]: 1.58002e-06 [auto_monad_reorder]: 1.739e-05 [get_jit_bprop_graph]: 1.91e-06 [rewriter_after_jit_bprop_graph]: 6.53e-06 [opt_after_jit_grad]: 0.00055937 [validate]: 4.427e-05 Sums bootstrap : 0.000489s : 0.31% type_inference : 0.152475s : 96.19% event_method : 0.000023s : 0.01% auto_monad : 0.000073s : 0.05% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000042s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000032s : 0.02% optimize.rewriter_before_opt_a : 0.000093s : 0.06% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000053s : 0.03% optimize.opt_a.loop_unroll : 0.000037s : 0.02% optimize.opt_a.a_1 : 0.000829s : 0.52% optimize.opt_a.with_stream_mark : 0.000036s : 0.02% optimize.opt_a.recompute_prepare : 0.000018s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000168s : 0.11% optimize.opt_a.accelerated_algorithm : 0.000211s : 0.13% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.00% optimize.opt_a.shard_inline : 0.000019s : 0.01% optimize.opt_a.merge_send_recv : 0.000019s : 0.01% optimize.opt_a.auto_parallel : 0.000016s : 0.01% optimize.opt_a.parallel : 0.000029s : 0.02% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000009s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000012s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000020s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000025s : 0.02% optimize.opt_a.a_after_grad : 0.000020s : 0.01% optimize.opt_a.renormalize : 0.000811s : 0.51% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.02% optimize.opt_a.cse : 0.000053s : 0.03% optimize.opt_a.a_3 : 0.000094s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000040s : 0.03% optimize.convert_after_rewriter : 0.000008s : 0.00% optimize.order_py_execute_after_rewriter : 0.000005s : 0.00% optimize.mutable_eliminate : 0.000709s : 0.45% optimize.opt_b.b_1 : 0.000126s : 0.08% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000025s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000039s : 0.02% optimize.loop_unroll : 0.000482s : 0.30% optimize.opt_after_cconv.c_1 : 0.000033s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000021s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.01% optimize.tuple_transform.d_1 : 0.000047s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000055s : 0.03% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000006s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000023s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000017s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000559s : 0.35% validate : 0.000044s : 0.03% Time group info: ------[substitution.] 0.000244 34 15.15% : 0.000037s : 6: substitution.arithmetic_simplify 0.73% : 0.000002s : 2: substitution.elim_not_effective 0.60% : 0.000001s : 2: substitution.fold_const_symbol 2.76% : 0.000007s : 4: substitution.graph_param_transform 68.85% : 0.000168s : 4: substitution.inline 1.67% : 0.000004s : 4: substitution.j_node_and_user_rematch 1.80% : 0.000004s : 4: substitution.remove_not_recompute_node 2.49% : 0.000006s : 4: substitution.replace_old_param 5.95% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.152383 2 99.36% : 0.151409s : 1: type_inference.infer 0.64% : 0.000975s : 1: type_inference.specialize ------[replace.] 0.000066 8 64.39% : 0.000043s : 4: replace.inline 35.61% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000178 8 92.87% : 0.000166s : 4: match.inline 7.13% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000221 1278 0.96% : 0.000002s : 13: predicate.accumulaten_eliminater 0.93% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.45% : 0.000001s : 8: predicate.addn_check_dump 1.25% : 0.000003s : 13: predicate.addn_zero_filter 0.78% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.47% : 0.000005s : 21: predicate.arithmetic_simplify 0.98% : 0.000002s : 13: predicate.cast_eliminate 0.94% : 0.000002s : 8: predicate.check_bprop_eliminate 0.46% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.55% : 0.000001s : 8: predicate.depend_value_elim 0.99% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.92% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.77% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.43% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 4: predicate.elim_not_effective 0.38% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.06% : 0.000002s : 17: predicate.environ_add_const_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_depend_swap 1.57% : 0.000003s : 25: predicate.environ_get_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.33% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.35% : 0.000005s : 21: predicate.float_depend_g_call 0.56% : 0.000001s : 8: predicate.float_environ_get_switch 0.70% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.63% : 0.000001s : 8: predicate.get_grad_eliminate 0.34% : 0.000001s : 4: predicate.graph_param_transform 0.58% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 7.34% : 0.000016s : 58: predicate.inline 0.79% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.45% : 0.000003s : 8: predicate.less_batch_normalization 1.65% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.34% : 0.000005s : 38: predicate.load_eliminater 1.12% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.23% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.66% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.46% : 0.000001s : 8: predicate.merge_addn 0.64% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.72% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 13: predicate.minmaximum_grad 1.76% : 0.000004s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.39% : 0.000001s : 4: predicate.parallel_virtual_node 1.83% : 0.000004s : 21: predicate.partial_defer_inline 1.55% : 0.000003s : 21: predicate.partial_eliminate 0.93% : 0.000002s : 13: predicate.print_const_string_wrapper 0.57% : 0.000001s : 8: predicate.reduce_all_const_elim 1.03% : 0.000002s : 13: predicate.reduce_eliminate 2.33% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.52% : 0.000001s : 8: predicate.remove_not_recompute_node 1.35% : 0.000003s : 25: predicate.replace_applicator 0.41% : 0.000001s : 8: predicate.replace_old_param 0.43% : 0.000001s : 4: predicate.reset_defer_inline 0.82% : 0.000002s : 13: predicate.reshape_eliminate 0.71% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 4: predicate.row_tensor_eliminate 1.04% : 0.000002s : 8: predicate.same_eliminate 0.63% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.87% : 0.000002s : 8: predicate.shard_identity_eliminate 0.68% : 0.000002s : 8: predicate.special_op_eliminate 0.64% : 0.000001s : 8: predicate.specialize_transform 1.00% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.73% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.43% : 0.000003s : 21: predicate.switch_defer_inline 1.92% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.15% : 0.000011s : 67: predicate.switch_simplify 0.87% : 0.000002s : 13: predicate.tile_eliminate 0.80% : 0.000002s : 13: predicate.transpose_eliminate 1.39% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.43% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.98% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.30% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.09% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.61% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.28% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.84% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000698 11 48.22% : 0.000337s : 5: func_graph_cloner_run.FuncGraphClonerGraph 51.78% : 0.000362s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.175693 192 0.00% : 0.000004s : 1: ForceFp32Comm 2.21% : 0.003884s : 1: add_attr 2.20% : 0.003868s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000059s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000079s : 1: auto_monad 0.01% : 0.000022s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.30% : 0.000523s : 1: bootstrap 0.02% : 0.000043s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000019s : 1: control_data_broadcast_order 0.01% : 0.000011s : 1: convert_after_rewriter 0.01% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.02% : 0.000030s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.28% : 0.000491s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.41% : 0.000721s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000021s : 1: opt.transform.mutable_eliminate 0.84% : 0.001481s : 78: opt.transform.opt_a 0.02% : 0.000032s : 1: opt.transform.opt_after_cconv 0.02% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000101s : 28: opt.transform.opt_b 0.03% : 0.000052s : 2: opt.transform.opt_trans_graph 0.02% : 0.000039s : 4: opt.transform.symbol_engine_opt 1.94% : 0.003408s : 1: opt_a 0.07% : 0.000118s : 1: opt_after_cconv 0.32% : 0.000571s : 1: opt_after_jit_grad 0.13% : 0.000225s : 1: opt_b 3.30% : 0.005789s : 1: optimize 0.01% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.01% : 0.000026s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000047s : 1: pre_auto_parallel 0.02% : 0.000036s : 1: py_interpret_to_execute 0.01% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 0.23% : 0.000403s : 1: renormalize.infer 0.23% : 0.000399s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000045s : 1: rewriter_after_opt_a 0.06% : 0.000098s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000084s : 1: symbol_engine_optimizer 0.05% : 0.000082s : 1: tuple_transform 86.80% : 0.152501s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:30.511.605 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:30.511.893 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.167551, [21] [bootstrap]: 0.00046638 [type_inference]: 0.0953185 [event_method]: 2.026e-05 [auto_monad]: 6.442e-05 [graph_reusing]: 6.17999e-06 [inline]: 2.56998e-06 [add_attr]: 0.00373264, [1] [add_attr_with_inline]: 0.00372033, [1] [Cycle 1]: 9.495e-05, [2] [tag_attr]: 2.338e-05 [meta_addattr_fg_expand]: 6.04001e-06 [parallel-infer-symbol]: 4.30999e-06 [pre_auto_parallel]: 4.095e-05 [insert-virtual-dataset]: 2.51998e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 2.09999e-06 [pipeline_split]: 1.59998e-06 [optimize]: 0.0665364, [53] [py_interpret_to_execute]: 3.432e-05 [rewriter_before_opt_a]: 9.927e-05 [opt_a]: 0.0637532, [2] [Cycle 1]: 0.00249414, [45] [expand_dump_flag]: 3.18e-06 [switch_simplify]: 4.245e-05 [loop_unroll]: 3.065e-05 [a_1]: 0.00068348 [with_stream_mark]: 2.352e-05 [recompute_prepare]: 1.194e-05 [updatestate_depend_eliminate]: 4.01001e-06 [updatestate_assign_eliminate]: 3.50003e-06 [updatestate_loads_eliminate]: 2.93e-06 [parameter_eliminate]: 2.41e-06 [a_2]: 0.00011854 [accelerated_algorithm]: 7.88001e-06 [shard]: 2.14999e-06 [meta_shard_fg_expand]: 2.69001e-06 [shard_inline]: 6.99001e-06 [merge_send_recv]: 9.34998e-06 [auto_parallel]: 8.23999e-06 [parallel]: 2.008e-05 [flash_sp]: 9.78998e-06 [merge_comm]: 4.42e-06 [allreduce_fusion]: 3.61001e-06 [matmul_add_comm_reduction]: 1.072e-05 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 9.61e-06 [virtual_dataset]: 6.90998e-06 [get_grad_eliminate_]: 6.94001e-06 [virtual_output]: 7.45e-06 [merge_forward]: 4.75001e-06 [cell_reuse_recompute_pass]: 1.57999e-06 [offload_activation]: 1.079e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.565e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.179e-05 [set_forward_comm_id_for_comm_node_pass]: 4.01001e-06 [meta_fg_expand]: 3.18e-06 [flash_sp_send_recv_attached]: 2.51e-06 [receive_attached]: 2.02001e-06 [after_resolve]: 1.313e-05 [a_after_grad]: 1.106e-05 [renormalize]: 0.00082615 [add_forward_monad_depend]: 6.94999e-06 [auto_monad_grad]: 2.52001e-06 [auto_monad_eliminator]: 1.803e-05 [cse]: 3.118e-05 [a_3]: 6.859e-05 [Cycle 2]: 0.0612384, [45] [expand_dump_flag]: 2.79001e-06 [switch_simplify]: 9.02e-06 [loop_unroll]: 7.13998e-06 [a_1]: 0.0602299 [with_stream_mark]: 4.378e-05 [recompute_prepare]: 1.881e-05 [updatestate_depend_eliminate]: 5.85002e-06 [updatestate_assign_eliminate]: 3.43e-06 [updatestate_loads_eliminate]: 2.95998e-06 [parameter_eliminate]: 2.62001e-06 [a_2]: 0.00014657 [accelerated_algorithm]: 8.20999e-06 [shard]: 3.93001e-06 [meta_shard_fg_expand]: 3.07002e-06 [shard_inline]: 7.51999e-06 [merge_send_recv]: 1.102e-05 [auto_parallel]: 1.062e-05 [parallel]: 1.025e-05 [flash_sp]: 4.43001e-06 [merge_comm]: 3.65998e-06 [allreduce_fusion]: 1.588e-05 [matmul_add_comm_reduction]: 1.304e-05 [allreduce_slice_to_reducescatter]: 1.07998e-06 [virtual_shard_identity]: 7.63001e-06 [virtual_dataset]: 6.64999e-06 [get_grad_eliminate_]: 6.65998e-06 [virtual_output]: 6.12999e-06 [merge_forward]: 4.91997e-06 [cell_reuse_recompute_pass]: 3.53e-06 [offload_activation]: 1.073e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.516e-05 [merge_recompute_call_nodes]: 1.77001e-06 [before_grad]: 1.129e-05 [set_forward_comm_id_for_comm_node_pass]: 4.04002e-06 [meta_fg_expand]: 3.08e-06 [flash_sp_send_recv_attached]: 2.34001e-06 [receive_attached]: 2.37001e-06 [after_resolve]: 1.482e-05 [a_after_grad]: 1.092e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 3.41001e-06 [auto_monad_grad]: 2.86999e-06 [auto_monad_eliminator]: 2.012e-05 [cse]: 3.157e-05 [a_3]: 5.288e-05 [py_interpret_to_execute_after_opt_a]: 2.284e-05 [slice_cell_reuse_recomputed_activation]: 4.67e-06 [rewriter_after_opt_a]: 4.988e-05 [convert_after_rewriter]: 1.031e-05 [order_py_execute_after_rewriter]: 8.27e-06 [mutable_eliminate]: 0.00079563 [opt_b]: 0.0003032, [1] [Cycle 1]: 0.00029107, [7] [b_1]: 0.00017912 [b_2]: 9.39e-06 [updatestate_depend_eliminate]: 9.17001e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 3.72998e-06 [renormalize]: 7.10017e-07 [cse]: 2.707e-05 [optimize_parallel_all_gather_comm]: 2.332e-05 [overlap_param_gather]: 5.68997e-06 [cconv]: 4.354e-05 [loop_unroll]: 0.0005192 [opt_after_cconv]: 0.00013556, [1] [Cycle 1]: 0.00012584, [7] [c_1]: 3.289e-05 [parameter_eliminate]: 4.67e-06 [updatestate_depend_eliminate]: 6.04001e-06 [updatestate_assign_eliminate]: 2.64999e-06 [updatestate_loads_eliminate]: 2.21e-06 [cse]: 2.038e-05 [renormalize]: 9.40025e-07 [remove_dup_value]: 1.931e-05 [tuple_transform]: 9.9e-05, [1] [Cycle 1]: 9.088e-05, [4] [d_1]: 5.014e-05 [none_parameter_eliminate]: 1.89e-06 [renormalize]: 4.19997e-07 [switch_simplify]: 7.35998e-06 [partial_unused_args_eliminate]: 4.50001e-06 [add_recomputation]: 5.883e-05 [cse_after_recomputation]: 2.87e-05, [1] [Cycle 1]: 2.136e-05, [1] [cse]: 1.199e-05 [environ_conv]: 8.81002e-06 [swap_dp_allreduce_reducescatter]: 8.15e-06 [bias_add_comm_swap]: 6.29001e-06 [label_micro_interleaved_index]: 8.55001e-06 [label_fine_grained_interleaved_index]: 5.00001e-06 [merge_cast_opt]: 4.33001e-06 [slice_recompute_activation]: 4.68001e-06 [micro_interleaved_order_control]: 4.84e-06 [assign_add_opt]: 3.58999e-06 [ForceFp32Comm]: 3.14999e-06 [remove_cast_before_assign_add]: 3.71999e-06 [full_micro_interleaved_order_control]: 4.74e-06 [reorder_send_recv_between_fp_bp]: 5.42999e-06 [comm_op_add_attrs]: 3.31999e-06 [add_comm_op_reuse_tag]: 3.33998e-06 [interleave_split_concat_branches]: 3.51001e-06 [interleave_parallel_branches]: 3.31001e-06 [overlap_opt_shard_in_pipeline]: 3.66999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.25e-06 [control_data_broadcast_order]: 1.7e-05 [grouped_pairwise_exchange_alltoall]: 4.34002e-06 [offloading_packed_experts]: 6.39999e-06 [overlap_recompute_and_grad_model_parallel]: 7.73999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.46001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.73001e-06 [overlap_recompute_comm]: 4.89998e-06 [overlap_grad_ring_attention]: 7.08e-06 [overlap_grad_flash_sp]: 2.59e-05 [begin_end_overlap_inline]: 3.04999e-06 [split_matmul_comm_elemetwise]: 4.13999e-06 [split_layernorm_comm]: 4.17e-06 [handle_group_info]: 3.39001e-06 [symbol_engine_optimizer]: 0.00010012, [1] [Cycle 1]: 9.237e-05, [6] [build]: 4.19997e-06 [elim_shapecalc]: 1.071e-05 [elim_not_effective]: 1.35e-05 [opt_reshape]: 7.3e-06 [fold_const_symbol]: 1.047e-05 [renormalize]: 1.69995e-07 [detach_backward]: 5.46e-06 [pipeline_parallel_scheduler]: 2.24999e-06 [auto_monad_reorder]: 2.121e-05 [get_jit_bprop_graph]: 1.95001e-06 [rewriter_after_jit_bprop_graph]: 7.05002e-06 [opt_after_jit_grad]: 0.00060844 [validate]: 4.44e-05 Sums bootstrap : 0.000466s : 0.29% type_inference : 0.095319s : 58.90% event_method : 0.000020s : 0.01% auto_monad : 0.000064s : 0.04% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000041s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000034s : 0.02% optimize.rewriter_before_opt_a : 0.000099s : 0.06% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000051s : 0.03% optimize.opt_a.loop_unroll : 0.000038s : 0.02% optimize.opt_a.a_1 : 0.060913s : 37.64% optimize.opt_a.with_stream_mark : 0.000067s : 0.04% optimize.opt_a.recompute_prepare : 0.000031s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000265s : 0.16% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.01% optimize.opt_a.shard : 0.000006s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.00% optimize.opt_a.shard_inline : 0.000015s : 0.01% optimize.opt_a.merge_send_recv : 0.000020s : 0.01% optimize.opt_a.auto_parallel : 0.000019s : 0.01% optimize.opt_a.parallel : 0.000030s : 0.02% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000019s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.01% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.01% optimize.opt_a.virtual_output : 0.000014s : 0.01% optimize.opt_a.merge_forward : 0.000010s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000022s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000028s : 0.02% optimize.opt_a.a_after_grad : 0.000022s : 0.01% optimize.opt_a.renormalize : 0.000826s : 0.51% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000038s : 0.02% optimize.opt_a.cse : 0.000063s : 0.04% optimize.opt_a.a_3 : 0.000121s : 0.08% optimize.py_interpret_to_execute_after_opt_a : 0.000023s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000050s : 0.03% optimize.convert_after_rewriter : 0.000010s : 0.01% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000796s : 0.49% optimize.opt_b.b_1 : 0.000179s : 0.11% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.01% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.000044s : 0.03% optimize.loop_unroll : 0.000519s : 0.32% optimize.opt_after_cconv.c_1 : 0.000033s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.01% optimize.tuple_transform.d_1 : 0.000050s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000059s : 0.04% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000009s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.00% optimize.comm_op_add_attrs : 0.000003s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000003s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000026s : 0.02% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000608s : 0.38% validate : 0.000044s : 0.03% Time group info: ------[substitution.] 0.000261 34 18.77% : 0.000049s : 6: substitution.arithmetic_simplify 0.73% : 0.000002s : 2: substitution.elim_not_effective 0.51% : 0.000001s : 2: substitution.fold_const_symbol 2.42% : 0.000006s : 4: substitution.graph_param_transform 64.47% : 0.000168s : 4: substitution.inline 1.97% : 0.000005s : 4: substitution.j_node_and_user_rematch 1.87% : 0.000005s : 4: substitution.remove_not_recompute_node 3.15% : 0.000008s : 4: substitution.replace_old_param 6.11% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.095257 2 99.22% : 0.094515s : 1: type_inference.infer 0.78% : 0.000742s : 1: type_inference.specialize ------[replace.] 0.000066 8 65.07% : 0.000043s : 4: replace.inline 34.93% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000180 8 92.12% : 0.000166s : 4: match.inline 7.88% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000240 1278 0.82% : 0.000002s : 13: predicate.accumulaten_eliminater 0.79% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.66% : 0.000002s : 8: predicate.addn_check_dump 0.94% : 0.000002s : 13: predicate.addn_zero_filter 0.69% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 3.48% : 0.000008s : 21: predicate.arithmetic_simplify 1.00% : 0.000002s : 13: predicate.cast_eliminate 0.79% : 0.000002s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.20% : 0.000000s : 4: predicate.const_output_eliminate 0.80% : 0.000002s : 8: predicate.depend_value_elim 0.79% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.89% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.06% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 4: predicate.elim_not_effective 0.47% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.96% : 0.000002s : 17: predicate.environ_get_depend_swap 1.73% : 0.000004s : 25: predicate.environ_get_eliminate 0.92% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.29% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.55% : 0.000006s : 21: predicate.float_depend_g_call 0.75% : 0.000002s : 8: predicate.float_environ_get_switch 0.97% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.68% : 0.000002s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.54% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 6.29% : 0.000015s : 58: predicate.inline 0.80% : 0.000002s : 8: predicate.inline_without_move 0.27% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.90% : 0.000002s : 8: predicate.less_batch_normalization 1.70% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.16% : 0.000005s : 38: predicate.load_eliminater 0.89% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.02% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.44% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.64% : 0.000002s : 8: predicate.merge_addn 0.84% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.76% : 0.000002s : 13: predicate.minmaximum_grad 1.64% : 0.000004s : 4: predicate.mutable_eliminate 0.32% : 0.000001s : 4: predicate.opt_reshape 0.43% : 0.000001s : 4: predicate.parallel_virtual_node 1.51% : 0.000004s : 21: predicate.partial_defer_inline 1.38% : 0.000003s : 21: predicate.partial_eliminate 0.89% : 0.000002s : 13: predicate.print_const_string_wrapper 0.66% : 0.000002s : 8: predicate.reduce_all_const_elim 1.06% : 0.000003s : 13: predicate.reduce_eliminate 2.41% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 8: predicate.remove_not_recompute_node 1.38% : 0.000003s : 25: predicate.replace_applicator 0.51% : 0.000001s : 8: predicate.replace_old_param 0.49% : 0.000001s : 4: predicate.reset_defer_inline 0.85% : 0.000002s : 13: predicate.reshape_eliminate 0.68% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 4: predicate.row_tensor_eliminate 0.95% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.82% : 0.000002s : 8: predicate.shard_identity_eliminate 0.62% : 0.000001s : 8: predicate.special_op_eliminate 0.87% : 0.000002s : 8: predicate.specialize_transform 0.98% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.94% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.29% : 0.000003s : 21: predicate.switch_defer_inline 1.86% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.68% : 0.000011s : 67: predicate.switch_simplify 0.79% : 0.000002s : 13: predicate.tile_eliminate 1.98% : 0.000005s : 13: predicate.transpose_eliminate 1.67% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.31% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.60% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.71% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.14% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.90% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 4: predicate.value_based_eliminate 0.69% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.73% : 0.000002s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.63% : 0.000002s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000671 11 58.50% : 0.000393s : 5: func_graph_cloner_run.FuncGraphClonerGraph 41.50% : 0.000279s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.300210 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.25% : 0.003745s : 1: add_attr 1.24% : 0.003725s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.02% : 0.000063s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.02% : 0.000073s : 1: auto_monad 0.01% : 0.000029s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.17% : 0.000514s : 1: bootstrap 0.02% : 0.000047s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000020s : 1: control_data_broadcast_order 0.00% : 0.000014s : 1: convert_after_rewriter 0.01% : 0.000032s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000023s : 1: detach_backward 0.00% : 0.000012s : 1: environ_conv 0.01% : 0.000032s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.00% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.18% : 0.000527s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000007s : 1: micro_interleaved_order_control 0.27% : 0.000804s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000021s : 1: opt.transform.mutable_eliminate 20.46% : 0.061414s : 78: opt.transform.opt_a 0.01% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000113s : 28: opt.transform.opt_b 0.02% : 0.000055s : 2: opt.transform.opt_trans_graph 0.01% : 0.000039s : 4: opt.transform.symbol_engine_opt 21.24% : 0.063757s : 1: opt_a 0.05% : 0.000139s : 1: opt_after_cconv 0.21% : 0.000620s : 1: opt_after_jit_grad 0.10% : 0.000307s : 1: opt_b 22.29% : 0.066926s : 1: optimize 0.01% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000011s : 1: order_py_execute_after_rewriter 0.01% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000049s : 1: pre_auto_parallel 0.01% : 0.000038s : 1: py_interpret_to_execute 0.01% : 0.000026s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000023s : 1: remove_dup_value 0.15% : 0.000440s : 1: renormalize.infer 0.13% : 0.000377s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000054s : 1: rewriter_after_opt_a 0.03% : 0.000104s : 1: rewriter_before_opt_a 0.00% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000009s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.00% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000103s : 1: symbol_engine_optimizer 0.03% : 0.000102s : 1: tuple_transform 31.77% : 0.095366s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:31.986.572 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0283325, [21] [bootstrap]: 0.00047971 [type_inference]: 0.00635882 [event_method]: 2.054e-05 [auto_monad]: 6.608e-05 [graph_reusing]: 6.14001e-06 [inline]: 2.51998e-06 [add_attr]: 0.00355845, [1] [add_attr_with_inline]: 0.00354728, [1] [Cycle 1]: 7.99e-05, [2] [tag_attr]: 2.288e-05 [meta_addattr_fg_expand]: 5.86e-06 [parallel-infer-symbol]: 3.91999e-06 [pre_auto_parallel]: 4.014e-05 [insert-virtual-dataset]: 2.74001e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 1.91998e-06 [pipeline_split]: 1.74998e-06 [optimize]: 0.0170672, [53] [py_interpret_to_execute]: 2.869e-05 [rewriter_before_opt_a]: 9.242e-05 [opt_a]: 0.0147783, [2] [Cycle 1]: 0.014003, [45] [expand_dump_flag]: 2.99999e-06 [switch_simplify]: 4.529e-05 [loop_unroll]: 3.031e-05 [a_1]: 0.00067594 [with_stream_mark]: 1.735e-05 [recompute_prepare]: 9.72001e-06 [updatestate_depend_eliminate]: 4.36002e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 2.86999e-06 [parameter_eliminate]: 2.27999e-06 [a_2]: 8.809e-05 [accelerated_algorithm]: 7.31001e-06 [shard]: 1.76e-06 [meta_shard_fg_expand]: 2.17001e-06 [shard_inline]: 6.52001e-06 [merge_send_recv]: 8.39998e-06 [auto_parallel]: 6.89999e-06 [parallel]: 1.998e-05 [flash_sp]: 8.2e-06 [merge_comm]: 4.05e-06 [allreduce_fusion]: 3.57002e-06 [matmul_add_comm_reduction]: 3.561e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 9.57999e-06 [virtual_dataset]: 6.89001e-06 [get_grad_eliminate_]: 6.11e-06 [virtual_output]: 6.68e-06 [merge_forward]: 4.24997e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 1.038e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.255e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.118e-05 [set_forward_comm_id_for_comm_node_pass]: 4.51002e-06 [meta_fg_expand]: 2.76e-06 [flash_sp_send_recv_attached]: 2.91e-06 [receive_attached]: 2.27001e-06 [after_resolve]: 1.226e-05 [a_after_grad]: 1.055e-05 [renormalize]: 0.0124835 [add_forward_monad_depend]: 1.073e-05 [auto_monad_grad]: 3.58999e-06 [auto_monad_eliminator]: 2.409e-05 [cse]: 3.505e-05 [a_3]: 6.581e-05 [Cycle 2]: 0.00076185, [45] [expand_dump_flag]: 2.08002e-06 [switch_simplify]: 9.70002e-06 [loop_unroll]: 7.08998e-06 [a_1]: 0.00016391 [with_stream_mark]: 2.008e-05 [recompute_prepare]: 7.36999e-06 [updatestate_depend_eliminate]: 4.4e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 3.36999e-06 [parameter_eliminate]: 2.76e-06 [a_2]: 7.916e-05 [accelerated_algorithm]: 6.56e-06 [shard]: 2.16998e-06 [meta_shard_fg_expand]: 2.31e-06 [shard_inline]: 5.92999e-06 [merge_send_recv]: 9.42999e-06 [auto_parallel]: 9.49e-06 [parallel]: 1.018e-05 [flash_sp]: 4.16001e-06 [merge_comm]: 3.76999e-06 [allreduce_fusion]: 4.12003e-06 [matmul_add_comm_reduction]: 9.77999e-06 [allreduce_slice_to_reducescatter]: 7.39994e-07 [virtual_shard_identity]: 7.49002e-06 [virtual_dataset]: 6.43998e-06 [get_grad_eliminate_]: 6.25002e-06 [virtual_output]: 6.46999e-06 [merge_forward]: 4.11001e-06 [cell_reuse_recompute_pass]: 3.50003e-06 [offload_activation]: 1.043e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.221e-05 [merge_recompute_call_nodes]: 1.46998e-06 [before_grad]: 9.89999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.8e-06 [meta_fg_expand]: 2.95998e-06 [flash_sp_send_recv_attached]: 2.12999e-06 [receive_attached]: 3.05002e-06 [after_resolve]: 1.212e-05 [a_after_grad]: 9.92001e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.42e-06 [auto_monad_grad]: 1.34e-06 [auto_monad_eliminator]: 9.05999e-06 [cse]: 1.537e-05 [a_3]: 3.775e-05 [py_interpret_to_execute_after_opt_a]: 1.726e-05 [slice_cell_reuse_recomputed_activation]: 2.56e-06 [rewriter_after_opt_a]: 4.186e-05 [convert_after_rewriter]: 7.43e-06 [order_py_execute_after_rewriter]: 5.81998e-06 [mutable_eliminate]: 0.00073817 [opt_b]: 0.00021043, [1] [Cycle 1]: 0.00020288, [7] [b_1]: 0.00012703 [b_2]: 8.38999e-06 [updatestate_depend_eliminate]: 6.63e-06 [updatestate_assign_eliminate]: 2.47001e-06 [updatestate_loads_eliminate]: 2.73998e-06 [renormalize]: 6.09987e-07 [cse]: 1.974e-05 [optimize_parallel_all_gather_comm]: 1.731e-05 [overlap_param_gather]: 2.02001e-06 [cconv]: 2.93e-05 [loop_unroll]: 0.00042911 [opt_after_cconv]: 0.00010294, [1] [Cycle 1]: 9.719e-05, [7] [c_1]: 3.195e-05 [parameter_eliminate]: 3.25e-06 [updatestate_depend_eliminate]: 5.49e-06 [updatestate_assign_eliminate]: 2.51998e-06 [updatestate_loads_eliminate]: 2.32001e-06 [cse]: 1.632e-05 [renormalize]: 3.9002e-07 [remove_dup_value]: 1.359e-05 [tuple_transform]: 7.704e-05, [1] [Cycle 1]: 7.246e-05, [4] [d_1]: 4.467e-05 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 3.60014e-07 [switch_simplify]: 6.97997e-06 [partial_unused_args_eliminate]: 2.05002e-06 [add_recomputation]: 4.951e-05 [cse_after_recomputation]: 2.171e-05, [1] [Cycle 1]: 1.697e-05, [1] [cse]: 1.108e-05 [environ_conv]: 5.62001e-06 [swap_dp_allreduce_reducescatter]: 5.25001e-06 [bias_add_comm_swap]: 2.79999e-06 [label_micro_interleaved_index]: 4.68999e-06 [label_fine_grained_interleaved_index]: 3.17002e-06 [merge_cast_opt]: 1.30001e-06 [slice_recompute_activation]: 1.97999e-06 [micro_interleaved_order_control]: 2.69001e-06 [assign_add_opt]: 1.17e-06 [ForceFp32Comm]: 9.00007e-07 [remove_cast_before_assign_add]: 1.24e-06 [full_micro_interleaved_order_control]: 2.06e-06 [reorder_send_recv_between_fp_bp]: 2.58e-06 [comm_op_add_attrs]: 1.04998e-06 [add_comm_op_reuse_tag]: 9.79984e-07 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.09003e-06 [overlap_opt_shard_in_pipeline]: 1.22e-06 [overlap_opt_shard_grad_in_pipeline]: 1.91e-06 [control_data_broadcast_order]: 1.281e-05 [grouped_pairwise_exchange_alltoall]: 1.81998e-06 [offloading_packed_experts]: 3.85e-06 [overlap_recompute_and_grad_model_parallel]: 4.74e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.21997e-06 [overlap_recompute_allgather_and_fa_grad]: 1.36998e-06 [overlap_recompute_comm]: 2.26998e-06 [overlap_grad_ring_attention]: 4.07003e-06 [overlap_grad_flash_sp]: 2.131e-05 [begin_end_overlap_inline]: 6.19999e-07 [split_matmul_comm_elemetwise]: 2.17001e-06 [split_layernorm_comm]: 1.87999e-06 [handle_group_info]: 1.27e-06 [symbol_engine_optimizer]: 7.474e-05, [1] [Cycle 1]: 7.06e-05, [6] [build]: 3.45e-06 [elim_shapecalc]: 9.89999e-06 [elim_not_effective]: 1.267e-05 [opt_reshape]: 6.81001e-06 [fold_const_symbol]: 9.98002e-06 [renormalize]: 1.60013e-07 [detach_backward]: 2.51e-06 [pipeline_parallel_scheduler]: 1.39998e-06 [auto_monad_reorder]: 1.575e-05 [get_jit_bprop_graph]: 1.57999e-06 [rewriter_after_jit_bprop_graph]: 4.23001e-06 [opt_after_jit_grad]: 0.00048664 [validate]: 4.361e-05 Sums bootstrap : 0.000480s : 2.02% type_inference : 0.006359s : 26.78% event_method : 0.000021s : 0.09% auto_monad : 0.000066s : 0.28% graph_reusing : 0.000006s : 0.03% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.10% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000040s : 0.17% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.12% optimize.rewriter_before_opt_a : 0.000092s : 0.39% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000055s : 0.23% optimize.opt_a.loop_unroll : 0.000037s : 0.16% optimize.opt_a.a_1 : 0.000840s : 3.54% optimize.opt_a.with_stream_mark : 0.000037s : 0.16% optimize.opt_a.recompute_prepare : 0.000017s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000005s : 0.02% optimize.opt_a.a_2 : 0.000167s : 0.70% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.06% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000012s : 0.05% optimize.opt_a.merge_send_recv : 0.000018s : 0.08% optimize.opt_a.auto_parallel : 0.000016s : 0.07% optimize.opt_a.parallel : 0.000030s : 0.13% optimize.opt_a.flash_sp : 0.000012s : 0.05% optimize.opt_a.merge_comm : 0.000008s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000045s : 0.19% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.07% optimize.opt_a.virtual_dataset : 0.000013s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.05% optimize.opt_a.virtual_output : 0.000013s : 0.06% optimize.opt_a.merge_forward : 0.000008s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000021s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000021s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.04% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000024s : 0.10% optimize.opt_a.a_after_grad : 0.000020s : 0.09% optimize.opt_a.renormalize : 0.012484s : 52.58% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.05% optimize.opt_a.auto_monad_grad : 0.000005s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.14% optimize.opt_a.cse : 0.000050s : 0.21% optimize.opt_a.a_3 : 0.000104s : 0.44% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000042s : 0.18% optimize.convert_after_rewriter : 0.000007s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000738s : 3.11% optimize.opt_b.b_1 : 0.000127s : 0.54% optimize.opt_b.b_2 : 0.000008s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000020s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000029s : 0.12% optimize.loop_unroll : 0.000429s : 1.81% optimize.opt_after_cconv.c_1 : 0.000032s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000016s : 0.07% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.06% optimize.tuple_transform.d_1 : 0.000045s : 0.19% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000050s : 0.21% optimize.cse_after_recomputation.cse : 0.000011s : 0.05% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000013s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000021s : 0.09% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000016s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000487s : 2.05% validate : 0.000044s : 0.18% Time group info: ------[substitution.] 0.000247 34 17.27% : 0.000043s : 6: substitution.arithmetic_simplify 0.77% : 0.000002s : 2: substitution.elim_not_effective 0.63% : 0.000002s : 2: substitution.fold_const_symbol 2.42% : 0.000006s : 4: substitution.graph_param_transform 66.39% : 0.000164s : 4: substitution.inline 1.81% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.16% : 0.000005s : 4: substitution.remove_not_recompute_node 2.33% : 0.000006s : 4: substitution.replace_old_param 6.21% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006286 2 88.13% : 0.005540s : 1: type_inference.infer 11.87% : 0.000746s : 1: type_inference.specialize ------[replace.] 0.000065 8 64.00% : 0.000042s : 4: replace.inline 36.00% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000175 8 92.26% : 0.000162s : 4: match.inline 7.74% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000218 1278 1.02% : 0.000002s : 13: predicate.accumulaten_eliminater 0.83% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 1.00% : 0.000002s : 13: predicate.addn_zero_filter 0.80% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.70% : 0.000006s : 21: predicate.arithmetic_simplify 0.97% : 0.000002s : 13: predicate.cast_eliminate 0.59% : 0.000001s : 8: predicate.check_bprop_eliminate 0.55% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.54% : 0.000001s : 8: predicate.depend_value_elim 0.92% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.09% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.80% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.39% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.37% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_depend_swap 1.76% : 0.000004s : 25: predicate.environ_get_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.49% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.54% : 0.000006s : 21: predicate.float_depend_g_call 0.52% : 0.000001s : 8: predicate.float_environ_get_switch 1.04% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.62% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.54% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.21% : 0.000014s : 58: predicate.inline 0.75% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.00% : 0.000002s : 8: predicate.less_batch_normalization 1.84% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.46% : 0.000005s : 38: predicate.load_eliminater 0.73% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.32% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.63% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.69% : 0.000002s : 8: predicate.merge_addn 0.56% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.90% : 0.000002s : 13: predicate.minmaximum_grad 1.12% : 0.000002s : 4: predicate.mutable_eliminate 0.27% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 1.73% : 0.000004s : 21: predicate.partial_defer_inline 1.54% : 0.000003s : 21: predicate.partial_eliminate 0.89% : 0.000002s : 13: predicate.print_const_string_wrapper 0.55% : 0.000001s : 8: predicate.reduce_all_const_elim 1.12% : 0.000002s : 13: predicate.reduce_eliminate 2.51% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.55% : 0.000001s : 8: predicate.remove_not_recompute_node 1.38% : 0.000003s : 25: predicate.replace_applicator 0.60% : 0.000001s : 8: predicate.replace_old_param 0.27% : 0.000001s : 4: predicate.reset_defer_inline 1.07% : 0.000002s : 13: predicate.reshape_eliminate 0.66% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 4: predicate.row_tensor_eliminate 0.93% : 0.000002s : 8: predicate.same_eliminate 0.40% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 8: predicate.shard_identity_eliminate 0.66% : 0.000001s : 8: predicate.special_op_eliminate 0.59% : 0.000001s : 8: predicate.specialize_transform 1.20% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 1.24% : 0.000003s : 8: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.45% : 0.000003s : 21: predicate.switch_defer_inline 1.99% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.83% : 0.000011s : 67: predicate.switch_simplify 0.98% : 0.000002s : 13: predicate.tile_eliminate 0.89% : 0.000002s : 13: predicate.transpose_eliminate 1.61% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.30% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.53% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.68% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.28% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.89% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.61% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.67% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.33% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000677 11 50.16% : 0.000340s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.84% : 0.000337s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.062856 192 0.01% : 0.000004s : 1: ForceFp32Comm 5.67% : 0.003565s : 1: add_attr 5.65% : 0.003552s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.08% : 0.000053s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.11% : 0.000071s : 1: auto_monad 0.03% : 0.000019s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.82% : 0.000514s : 1: bootstrap 0.05% : 0.000033s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000016s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.04% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.04% : 0.000027s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.69% : 0.000437s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.19% : 0.000748s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000018s : 1: opt.transform.mutable_eliminate 2.05% : 0.001291s : 78: opt.transform.opt_a 0.05% : 0.000030s : 1: opt.transform.opt_after_cconv 0.04% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.16% : 0.000102s : 28: opt.transform.opt_b 0.08% : 0.000050s : 2: opt.transform.opt_trans_graph 0.06% : 0.000036s : 4: opt.transform.symbol_engine_opt 23.52% : 0.014782s : 1: opt_a 0.17% : 0.000107s : 1: opt_after_cconv 0.79% : 0.000496s : 1: opt_after_jit_grad 0.34% : 0.000214s : 1: opt_b 27.16% : 0.017072s : 1: optimize 0.03% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000004s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.07% : 0.000045s : 1: pre_auto_parallel 0.05% : 0.000033s : 1: py_interpret_to_execute 0.03% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000017s : 1: remove_dup_value 19.03% : 0.011961s : 1: renormalize.infer 0.80% : 0.000503s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000046s : 1: rewriter_after_opt_a 0.15% : 0.000096s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.12% : 0.000078s : 1: symbol_engine_optimizer 0.13% : 0.000080s : 1: tuple_transform 10.15% : 0.006381s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:33.135.032 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:33.135.302 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0992089, [21] [bootstrap]: 0.00049561 [type_inference]: 0.0871572 [event_method]: 2.093e-05 [auto_monad]: 6.847e-05 [graph_reusing]: 5.82999e-06 [inline]: 2.41e-06 [add_attr]: 0.00374727, [1] [add_attr_with_inline]: 0.00373549, [1] [Cycle 1]: 9.51e-05, [2] [tag_attr]: 2.324e-05 [meta_addattr_fg_expand]: 6.24999e-06 [parallel-infer-symbol]: 3.29001e-06 [pre_auto_parallel]: 4.23e-05 [insert-virtual-dataset]: 2.37001e-06 [parallel-infer-symbol-second]: 8.99978e-07 [dataset_repeat_opt]: 2.12999e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.00643301, [53] [py_interpret_to_execute]: 3.392e-05 [rewriter_before_opt_a]: 9.681e-05 [opt_a]: 0.0036982, [2] [Cycle 1]: 0.00273239, [45] [expand_dump_flag]: 3.35e-06 [switch_simplify]: 4.305e-05 [loop_unroll]: 3.058e-05 [a_1]: 0.000679 [with_stream_mark]: 2.225e-05 [recompute_prepare]: 1.067e-05 [updatestate_depend_eliminate]: 4.53999e-06 [updatestate_assign_eliminate]: 3.38999e-06 [updatestate_loads_eliminate]: 3.36001e-06 [parameter_eliminate]: 2.24001e-06 [a_2]: 0.00012284 [accelerated_algorithm]: 8.90999e-06 [shard]: 1.87999e-06 [meta_shard_fg_expand]: 1.94999e-06 [shard_inline]: 6.78e-06 [merge_send_recv]: 1.016e-05 [auto_parallel]: 7.43999e-06 [parallel]: 2.098e-05 [flash_sp]: 1.082e-05 [merge_comm]: 4.43001e-06 [allreduce_fusion]: 4.33001e-06 [matmul_add_comm_reduction]: 1.05e-05 [allreduce_slice_to_reducescatter]: 8.29983e-07 [virtual_shard_identity]: 1.077e-05 [virtual_dataset]: 7.6e-06 [get_grad_eliminate_]: 6.88e-06 [virtual_output]: 6.61e-06 [merge_forward]: 4.79e-06 [cell_reuse_recompute_pass]: 1.60999e-06 [offload_activation]: 1.262e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.818e-05 [merge_recompute_call_nodes]: 1.42999e-06 [before_grad]: 1.154e-05 [set_forward_comm_id_for_comm_node_pass]: 4.53999e-06 [meta_fg_expand]: 3.14999e-06 [flash_sp_send_recv_attached]: 3.16001e-06 [receive_attached]: 2.89999e-06 [after_resolve]: 1.389e-05 [a_after_grad]: 1.163e-05 [renormalize]: 0.00099175 [add_forward_monad_depend]: 9.34e-06 [auto_monad_grad]: 2.74999e-06 [auto_monad_eliminator]: 2.019e-05 [cse]: 3.123e-05 [a_3]: 7.412e-05 [Cycle 2]: 0.00094689, [45] [expand_dump_flag]: 1.94e-06 [switch_simplify]: 9.46e-06 [loop_unroll]: 6.52001e-06 [a_1]: 0.00015241 [with_stream_mark]: 1.866e-05 [recompute_prepare]: 6.57002e-06 [updatestate_depend_eliminate]: 4.17998e-06 [updatestate_assign_eliminate]: 3.39001e-06 [updatestate_loads_eliminate]: 3.35e-06 [parameter_eliminate]: 1.97999e-06 [a_2]: 0.0001093 [accelerated_algorithm]: 8.28001e-06 [shard]: 1.59e-06 [meta_shard_fg_expand]: 2.39001e-06 [shard_inline]: 7.51999e-06 [merge_send_recv]: 9.00999e-06 [auto_parallel]: 9.89999e-06 [parallel]: 9.15001e-06 [flash_sp]: 4.65999e-06 [merge_comm]: 3.35e-06 [allreduce_fusion]: 3.26001e-06 [matmul_add_comm_reduction]: 9.87999e-06 [allreduce_slice_to_reducescatter]: 5.60016e-07 [virtual_shard_identity]: 9.19e-06 [virtual_dataset]: 7.03998e-06 [get_grad_eliminate_]: 5.94e-06 [virtual_output]: 6.16e-06 [merge_forward]: 3.87002e-06 [cell_reuse_recompute_pass]: 2.48e-06 [offload_activation]: 9.30001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.53e-05 [merge_recompute_call_nodes]: 1.24e-06 [before_grad]: 1.269e-05 [set_forward_comm_id_for_comm_node_pass]: 4.49998e-06 [meta_fg_expand]: 2.96999e-06 [flash_sp_send_recv_attached]: 2.22999e-06 [receive_attached]: 1.78002e-06 [after_resolve]: 1.475e-05 [a_after_grad]: 1.017e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.15002e-06 [auto_monad_grad]: 2.19001e-06 [auto_monad_eliminator]: 1.088e-05 [cse]: 1.891e-05 [a_3]: 5.216e-05 [py_interpret_to_execute_after_opt_a]: 2.105e-05 [slice_cell_reuse_recomputed_activation]: 5.25001e-06 [rewriter_after_opt_a]: 4.752e-05 [convert_after_rewriter]: 1.146e-05 [order_py_execute_after_rewriter]: 9.86e-06 [mutable_eliminate]: 0.00076434 [opt_b]: 0.00031163, [1] [Cycle 1]: 0.00029952, [7] [b_1]: 0.00018245 [b_2]: 9.46e-06 [updatestate_depend_eliminate]: 9.51e-06 [updatestate_assign_eliminate]: 2.89999e-06 [updatestate_loads_eliminate]: 3.18e-06 [renormalize]: 6.50005e-07 [cse]: 2.95e-05 [optimize_parallel_all_gather_comm]: 2.461e-05 [overlap_param_gather]: 5.39e-06 [cconv]: 3.877e-05 [loop_unroll]: 0.00051665 [opt_after_cconv]: 0.00013302, [1] [Cycle 1]: 0.00012365, [7] [c_1]: 3.154e-05 [parameter_eliminate]: 5.59e-06 [updatestate_depend_eliminate]: 6.38e-06 [updatestate_assign_eliminate]: 2.76e-06 [updatestate_loads_eliminate]: 2.41e-06 [cse]: 1.86e-05 [renormalize]: 2.30008e-07 [remove_dup_value]: 1.946e-05 [tuple_transform]: 9.622e-05, [1] [Cycle 1]: 8.892e-05, [4] [d_1]: 4.702e-05 [none_parameter_eliminate]: 1.59998e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 7.11001e-06 [partial_unused_args_eliminate]: 4.94003e-06 [add_recomputation]: 5.62e-05 [cse_after_recomputation]: 2.856e-05, [1] [Cycle 1]: 2.124e-05, [1] [cse]: 1.205e-05 [environ_conv]: 8.87e-06 [swap_dp_allreduce_reducescatter]: 7.97998e-06 [bias_add_comm_swap]: 5.84e-06 [label_micro_interleaved_index]: 6.86999e-06 [label_fine_grained_interleaved_index]: 5.92001e-06 [merge_cast_opt]: 4.10998e-06 [slice_recompute_activation]: 4.48001e-06 [micro_interleaved_order_control]: 5.04e-06 [assign_add_opt]: 4.50001e-06 [ForceFp32Comm]: 3.31999e-06 [remove_cast_before_assign_add]: 3.65e-06 [full_micro_interleaved_order_control]: 4.80001e-06 [reorder_send_recv_between_fp_bp]: 5.69e-06 [comm_op_add_attrs]: 3.43e-06 [add_comm_op_reuse_tag]: 3.56001e-06 [interleave_split_concat_branches]: 3.55e-06 [interleave_parallel_branches]: 3.55998e-06 [overlap_opt_shard_in_pipeline]: 3.6e-06 [overlap_opt_shard_grad_in_pipeline]: 4.15e-06 [control_data_broadcast_order]: 1.692e-05 [grouped_pairwise_exchange_alltoall]: 4.05e-06 [offloading_packed_experts]: 6.49999e-06 [overlap_recompute_and_grad_model_parallel]: 7.16999e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.02998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.86001e-06 [overlap_recompute_comm]: 4.65001e-06 [overlap_grad_ring_attention]: 6.54999e-06 [overlap_grad_flash_sp]: 2.445e-05 [begin_end_overlap_inline]: 3.04999e-06 [split_matmul_comm_elemetwise]: 4.72e-06 [split_layernorm_comm]: 4.57e-06 [handle_group_info]: 3.49001e-06 [symbol_engine_optimizer]: 9.926e-05, [1] [Cycle 1]: 9.118e-05, [6] [build]: 3.93001e-06 [elim_shapecalc]: 9.51e-06 [elim_not_effective]: 1.354e-05 [opt_reshape]: 7.13e-06 [fold_const_symbol]: 1.05e-05 [renormalize]: 2.3999e-07 [detach_backward]: 3.41999e-06 [pipeline_parallel_scheduler]: 1.82999e-06 [auto_monad_reorder]: 2.047e-05 [get_jit_bprop_graph]: 2.21e-06 [rewriter_after_jit_bprop_graph]: 5.72999e-06 [opt_after_jit_grad]: 0.00053074 [validate]: 4.384e-05 Sums bootstrap : 0.000496s : 0.53% type_inference : 0.087157s : 93.17% event_method : 0.000021s : 0.02% auto_monad : 0.000068s : 0.07% graph_reusing : 0.000006s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000042s : 0.05% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000034s : 0.04% optimize.rewriter_before_opt_a : 0.000097s : 0.10% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000053s : 0.06% optimize.opt_a.loop_unroll : 0.000037s : 0.04% optimize.opt_a.a_1 : 0.000831s : 0.89% optimize.opt_a.with_stream_mark : 0.000041s : 0.04% optimize.opt_a.recompute_prepare : 0.000017s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000232s : 0.25% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.02% optimize.opt_a.merge_send_recv : 0.000019s : 0.02% optimize.opt_a.auto_parallel : 0.000017s : 0.02% optimize.opt_a.parallel : 0.000030s : 0.03% optimize.opt_a.flash_sp : 0.000015s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.02% optimize.opt_a.virtual_dataset : 0.000015s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000022s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000024s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000029s : 0.03% optimize.opt_a.a_after_grad : 0.000022s : 0.02% optimize.opt_a.renormalize : 0.000992s : 1.06% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.03% optimize.opt_a.cse : 0.000050s : 0.05% optimize.opt_a.a_3 : 0.000126s : 0.13% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000048s : 0.05% optimize.convert_after_rewriter : 0.000011s : 0.01% optimize.order_py_execute_after_rewriter : 0.000010s : 0.01% optimize.mutable_eliminate : 0.000764s : 0.82% optimize.opt_b.b_1 : 0.000182s : 0.20% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000029s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.03% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000039s : 0.04% optimize.loop_unroll : 0.000517s : 0.55% optimize.opt_after_cconv.c_1 : 0.000032s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.02% optimize.tuple_transform.d_1 : 0.000047s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000056s : 0.06% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000004s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000005s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000003s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000024s : 0.03% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000005s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000531s : 0.57% validate : 0.000044s : 0.05% Time group info: ------[substitution.] 0.000250 34 16.19% : 0.000040s : 6: substitution.arithmetic_simplify 0.91% : 0.000002s : 2: substitution.elim_not_effective 0.55% : 0.000001s : 2: substitution.fold_const_symbol 2.48% : 0.000006s : 4: substitution.graph_param_transform 66.90% : 0.000167s : 4: substitution.inline 1.81% : 0.000005s : 4: substitution.j_node_and_user_rematch 1.95% : 0.000005s : 4: substitution.remove_not_recompute_node 3.06% : 0.000008s : 4: substitution.replace_old_param 6.14% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.087092 2 99.11% : 0.086313s : 1: type_inference.infer 0.89% : 0.000778s : 1: type_inference.specialize ------[replace.] 0.000066 8 64.20% : 0.000043s : 4: replace.inline 35.80% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000178 8 92.36% : 0.000164s : 4: match.inline 7.64% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000218 1278 0.86% : 0.000002s : 13: predicate.accumulaten_eliminater 0.77% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 0.82% : 0.000002s : 13: predicate.addn_zero_filter 0.74% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.58% : 0.000006s : 21: predicate.arithmetic_simplify 1.01% : 0.000002s : 13: predicate.cast_eliminate 0.59% : 0.000001s : 8: predicate.check_bprop_eliminate 0.71% : 0.000002s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.65% : 0.000001s : 8: predicate.depend_value_elim 0.93% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.93% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.79% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.22% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.36% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_depend_swap 1.73% : 0.000004s : 25: predicate.environ_get_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.35% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.51% : 0.000005s : 21: predicate.float_depend_g_call 0.57% : 0.000001s : 8: predicate.float_environ_get_switch 0.81% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.58% : 0.000001s : 8: predicate.get_grad_eliminate 0.31% : 0.000001s : 4: predicate.graph_param_transform 0.60% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 6.29% : 0.000014s : 58: predicate.inline 0.96% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.92% : 0.000002s : 8: predicate.less_batch_normalization 1.77% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.40% : 0.000005s : 38: predicate.load_eliminater 1.05% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.34% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.48% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.49% : 0.000001s : 8: predicate.merge_addn 0.56% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.73% : 0.000002s : 13: predicate.minmaximum_grad 1.63% : 0.000004s : 4: predicate.mutable_eliminate 0.33% : 0.000001s : 4: predicate.opt_reshape 0.59% : 0.000001s : 4: predicate.parallel_virtual_node 1.63% : 0.000004s : 21: predicate.partial_defer_inline 1.58% : 0.000003s : 21: predicate.partial_eliminate 0.88% : 0.000002s : 13: predicate.print_const_string_wrapper 0.66% : 0.000001s : 8: predicate.reduce_all_const_elim 1.11% : 0.000002s : 13: predicate.reduce_eliminate 2.87% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.63% : 0.000001s : 8: predicate.remove_not_recompute_node 1.50% : 0.000003s : 25: predicate.replace_applicator 0.57% : 0.000001s : 8: predicate.replace_old_param 0.44% : 0.000001s : 4: predicate.reset_defer_inline 0.88% : 0.000002s : 13: predicate.reshape_eliminate 0.72% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 4: predicate.row_tensor_eliminate 0.79% : 0.000002s : 8: predicate.same_eliminate 0.38% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.99% : 0.000002s : 8: predicate.shard_identity_eliminate 0.67% : 0.000001s : 8: predicate.special_op_eliminate 0.74% : 0.000002s : 8: predicate.specialize_transform 1.13% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.65% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.41% : 0.000003s : 21: predicate.switch_defer_inline 2.00% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.98% : 0.000011s : 67: predicate.switch_simplify 0.83% : 0.000002s : 13: predicate.tile_eliminate 0.83% : 0.000002s : 13: predicate.transpose_eliminate 1.44% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.44% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.33% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.17% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.52% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.59% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.30% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.99% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 4: predicate.value_based_eliminate 0.91% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.62% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000767 11 48.33% : 0.000371s : 5: func_graph_cloner_run.FuncGraphClonerGraph 51.67% : 0.000396s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.111836 192 0.01% : 0.000006s : 1: ForceFp32Comm 3.36% : 0.003759s : 1: add_attr 3.34% : 0.003739s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.05% : 0.000060s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.07% : 0.000078s : 1: auto_monad 0.03% : 0.000028s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.49% : 0.000550s : 1: bootstrap 0.04% : 0.000042s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.02% : 0.000020s : 1: control_data_broadcast_order 0.01% : 0.000015s : 1: convert_after_rewriter 0.03% : 0.000032s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000020s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.03% : 0.000032s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000012s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.47% : 0.000523s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.69% : 0.000773s : 1: mutable_eliminate 0.01% : 0.000009s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000024s : 1: opt.transform.mutable_eliminate 1.17% : 0.001308s : 78: opt.transform.opt_a 0.03% : 0.000030s : 1: opt.transform.opt_after_cconv 0.02% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000111s : 28: opt.transform.opt_b 0.05% : 0.000052s : 2: opt.transform.opt_trans_graph 0.03% : 0.000037s : 4: opt.transform.symbol_engine_opt 3.31% : 0.003701s : 1: opt_a 0.12% : 0.000137s : 1: opt_after_cconv 0.48% : 0.000542s : 1: opt_after_jit_grad 0.28% : 0.000316s : 1: opt_b 6.05% : 0.006771s : 1: optimize 0.02% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000013s : 1: order_py_execute_after_rewriter 0.02% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000007s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.05% : 0.000050s : 1: pre_auto_parallel 0.03% : 0.000038s : 1: py_interpret_to_execute 0.02% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.02% : 0.000023s : 1: remove_dup_value 0.43% : 0.000476s : 1: renormalize.infer 0.45% : 0.000505s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000052s : 1: rewriter_after_opt_a 0.09% : 0.000101s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000103s : 1: symbol_engine_optimizer 0.09% : 0.000099s : 1: tuple_transform 77.98% : 0.087207s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:34.379.261 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0158063, [21] [bootstrap]: 0.00046947 [type_inference]: 0.00616411 [event_method]: 2.086e-05 [auto_monad]: 6.513e-05 [graph_reusing]: 6.39999e-06 [inline]: 2.40002e-06 [add_attr]: 0.00336, [1] [add_attr_with_inline]: 0.00335005, [1] [Cycle 1]: 6.37e-05, [2] [tag_attr]: 1.985e-05 [meta_addattr_fg_expand]: 6.13998e-06 [parallel-infer-symbol]: 3.19001e-06 [pre_auto_parallel]: 3.765e-05 [insert-virtual-dataset]: 2.41e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.19001e-06 [pipeline_split]: 1.71e-06 [optimize]: 0.00495692, [53] [py_interpret_to_execute]: 2.655e-05 [rewriter_before_opt_a]: 8.294e-05 [opt_a]: 0.00284316, [2] [Cycle 1]: 0.0021588, [45] [expand_dump_flag]: 3.22002e-06 [switch_simplify]: 4.433e-05 [loop_unroll]: 2.963e-05 [a_1]: 0.00066638 [with_stream_mark]: 1.819e-05 [recompute_prepare]: 8.85999e-06 [updatestate_depend_eliminate]: 4.31002e-06 [updatestate_assign_eliminate]: 3.22997e-06 [updatestate_loads_eliminate]: 2.96001e-06 [parameter_eliminate]: 1.89999e-06 [a_2]: 9.196e-05 [accelerated_algorithm]: 7.68001e-06 [shard]: 1.65001e-06 [meta_shard_fg_expand]: 1.97001e-06 [shard_inline]: 7.06001e-06 [merge_send_recv]: 9.10001e-06 [auto_parallel]: 7.33e-06 [parallel]: 1.964e-05 [flash_sp]: 8.83001e-06 [merge_comm]: 4.40999e-06 [allreduce_fusion]: 3.92998e-06 [matmul_add_comm_reduction]: 9.69e-06 [allreduce_slice_to_reducescatter]: 6.49976e-07 [virtual_shard_identity]: 8.42e-06 [virtual_dataset]: 7.18e-06 [get_grad_eliminate_]: 6.78e-06 [virtual_output]: 6.94001e-06 [merge_forward]: 3.70998e-06 [cell_reuse_recompute_pass]: 1.28002e-06 [offload_activation]: 1.075e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.208e-05 [merge_recompute_call_nodes]: 1.55999e-06 [before_grad]: 1.09e-05 [set_forward_comm_id_for_comm_node_pass]: 3.95e-06 [meta_fg_expand]: 2.84999e-06 [flash_sp_send_recv_attached]: 2.50997e-06 [receive_attached]: 2.24001e-06 [after_resolve]: 1.25e-05 [a_after_grad]: 1.036e-05 [renormalize]: 0.0007449 [add_forward_monad_depend]: 5.87999e-06 [auto_monad_grad]: 2.58998e-06 [auto_monad_eliminator]: 1.623e-05 [cse]: 3.111e-05 [a_3]: 4.86e-05 [Cycle 2]: 0.00067352, [45] [expand_dump_flag]: 1.62001e-06 [switch_simplify]: 7.63999e-06 [loop_unroll]: 6.25002e-06 [a_1]: 0.00014145 [with_stream_mark]: 1.241e-05 [recompute_prepare]: 6.91999e-06 [updatestate_depend_eliminate]: 2.72001e-06 [updatestate_assign_eliminate]: 2.36e-06 [updatestate_loads_eliminate]: 2.61999e-06 [parameter_eliminate]: 1.47999e-06 [a_2]: 7.875e-05 [accelerated_algorithm]: 6.16998e-06 [shard]: 1.08001e-06 [meta_shard_fg_expand]: 1.93002e-06 [shard_inline]: 6.12999e-06 [merge_send_recv]: 4.80999e-06 [auto_parallel]: 5.49e-06 [parallel]: 5.00999e-06 [flash_sp]: 3.69002e-06 [merge_comm]: 3.36001e-06 [allreduce_fusion]: 3.43999e-06 [matmul_add_comm_reduction]: 5.75001e-06 [allreduce_slice_to_reducescatter]: 3.49974e-07 [virtual_shard_identity]: 7.09001e-06 [virtual_dataset]: 6.26e-06 [get_grad_eliminate_]: 5.62999e-06 [virtual_output]: 6.33e-06 [merge_forward]: 2.93998e-06 [cell_reuse_recompute_pass]: 1.92001e-06 [offload_activation]: 6.58998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.056e-05 [merge_recompute_call_nodes]: 9.29984e-07 [before_grad]: 9.91e-06 [set_forward_comm_id_for_comm_node_pass]: 3.16001e-06 [meta_fg_expand]: 2.05002e-06 [flash_sp_send_recv_attached]: 8.70001e-07 [receive_attached]: 1.08001e-06 [after_resolve]: 1.185e-05 [a_after_grad]: 8.82999e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.35001e-06 [auto_monad_grad]: 1.24e-06 [auto_monad_eliminator]: 7.8e-06 [cse]: 1.497e-05 [a_3]: 3.792e-05 [py_interpret_to_execute_after_opt_a]: 8.75999e-06 [slice_cell_reuse_recomputed_activation]: 2.23002e-06 [rewriter_after_opt_a]: 3.489e-05 [convert_after_rewriter]: 6.96999e-06 [order_py_execute_after_rewriter]: 5.29998e-06 [mutable_eliminate]: 0.00055489 [opt_b]: 0.00020839, [1] [Cycle 1]: 0.00020155, [7] [b_1]: 0.00012599 [b_2]: 7.93999e-06 [updatestate_depend_eliminate]: 6.69001e-06 [updatestate_assign_eliminate]: 2.69999e-06 [updatestate_loads_eliminate]: 2.34999e-06 [renormalize]: 5.09986e-07 [cse]: 1.856e-05 [optimize_parallel_all_gather_comm]: 1.651e-05 [overlap_param_gather]: 1.91e-06 [cconv]: 2.826e-05 [loop_unroll]: 0.00045678 [opt_after_cconv]: 0.00010585, [1] [Cycle 1]: 9.937e-05, [7] [c_1]: 3.108e-05 [parameter_eliminate]: 3.83001e-06 [updatestate_depend_eliminate]: 5.61e-06 [updatestate_assign_eliminate]: 2.43e-06 [updatestate_loads_eliminate]: 2.34001e-06 [cse]: 1.865e-05 [renormalize]: 6.00005e-07 [remove_dup_value]: 1.274e-05 [tuple_transform]: 8.005e-05, [1] [Cycle 1]: 7.52e-05, [4] [d_1]: 4.599e-05 [none_parameter_eliminate]: 1.86e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 6.76e-06 [partial_unused_args_eliminate]: 2.08998e-06 [add_recomputation]: 4.828e-05 [cse_after_recomputation]: 2.105e-05, [1] [Cycle 1]: 1.611e-05, [1] [cse]: 1.054e-05 [environ_conv]: 5.15999e-06 [swap_dp_allreduce_reducescatter]: 4.80001e-06 [bias_add_comm_swap]: 2.47001e-06 [label_micro_interleaved_index]: 5.30001e-06 [label_fine_grained_interleaved_index]: 2.79001e-06 [merge_cast_opt]: 1.42e-06 [slice_recompute_activation]: 2.09e-06 [micro_interleaved_order_control]: 2.63e-06 [assign_add_opt]: 1.16002e-06 [ForceFp32Comm]: 8.50006e-07 [remove_cast_before_assign_add]: 1.05999e-06 [full_micro_interleaved_order_control]: 2.49999e-06 [reorder_send_recv_between_fp_bp]: 2.73998e-06 [comm_op_add_attrs]: 1.55999e-06 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.27e-06 [interleave_parallel_branches]: 1.19e-06 [overlap_opt_shard_in_pipeline]: 1.37e-06 [overlap_opt_shard_grad_in_pipeline]: 1.94999e-06 [control_data_broadcast_order]: 1.26e-05 [grouped_pairwise_exchange_alltoall]: 1.81e-06 [offloading_packed_experts]: 4.09997e-06 [overlap_recompute_and_grad_model_parallel]: 5.12e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.52001e-06 [overlap_grad_ring_attention]: 4.18999e-06 [overlap_grad_flash_sp]: 2.055e-05 [begin_end_overlap_inline]: 6.39993e-07 [split_matmul_comm_elemetwise]: 2.38002e-06 [split_layernorm_comm]: 1.76e-06 [handle_group_info]: 1.37e-06 [symbol_engine_optimizer]: 8.095e-05, [1] [Cycle 1]: 7.634e-05, [6] [build]: 3.24001e-06 [elim_shapecalc]: 1.044e-05 [elim_not_effective]: 1.403e-05 [opt_reshape]: 7.57998e-06 [fold_const_symbol]: 1.05e-05 [renormalize]: 2.49973e-07 [detach_backward]: 1.97001e-06 [pipeline_parallel_scheduler]: 1.82001e-06 [auto_monad_reorder]: 1.598e-05 [get_jit_bprop_graph]: 1.86e-06 [rewriter_after_jit_bprop_graph]: 4.03999e-06 [opt_after_jit_grad]: 0.00049177 [validate]: 3.962e-05 Sums bootstrap : 0.000469s : 4.10% type_inference : 0.006164s : 53.77% event_method : 0.000021s : 0.18% auto_monad : 0.000065s : 0.57% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000038s : 0.33% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000027s : 0.23% optimize.rewriter_before_opt_a : 0.000083s : 0.72% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000052s : 0.45% optimize.opt_a.loop_unroll : 0.000036s : 0.31% optimize.opt_a.a_1 : 0.000808s : 7.05% optimize.opt_a.with_stream_mark : 0.000031s : 0.27% optimize.opt_a.recompute_prepare : 0.000016s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000171s : 1.49% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.12% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.12% optimize.opt_a.merge_send_recv : 0.000014s : 0.12% optimize.opt_a.auto_parallel : 0.000013s : 0.11% optimize.opt_a.parallel : 0.000025s : 0.22% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.14% optimize.opt_a.virtual_dataset : 0.000013s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000013s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.20% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.21% optimize.opt_a.a_after_grad : 0.000019s : 0.17% optimize.opt_a.renormalize : 0.000745s : 6.50% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.21% optimize.opt_a.cse : 0.000046s : 0.40% optimize.opt_a.a_3 : 0.000087s : 0.75% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000035s : 0.30% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000555s : 4.84% optimize.opt_b.b_1 : 0.000126s : 1.10% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000019s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.14% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000028s : 0.25% optimize.loop_unroll : 0.000457s : 3.98% optimize.opt_after_cconv.c_1 : 0.000031s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.16% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000013s : 0.11% optimize.tuple_transform.d_1 : 0.000046s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000048s : 0.42% optimize.cse_after_recomputation.cse : 0.000011s : 0.09% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.04% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000002s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.18% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000016s : 0.14% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000492s : 4.29% validate : 0.000040s : 0.35% Time group info: ------[substitution.] 0.000220 34 15.08% : 0.000033s : 6: substitution.arithmetic_simplify 1.02% : 0.000002s : 2: substitution.elim_not_effective 0.70% : 0.000002s : 2: substitution.fold_const_symbol 2.94% : 0.000006s : 4: substitution.graph_param_transform 67.29% : 0.000148s : 4: substitution.inline 1.83% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.00% : 0.000004s : 4: substitution.remove_not_recompute_node 2.03% : 0.000004s : 4: substitution.replace_old_param 7.12% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006087 2 87.38% : 0.005318s : 1: type_inference.infer 12.62% : 0.000768s : 1: type_inference.specialize ------[replace.] 0.000061 8 62.60% : 0.000038s : 4: replace.inline 37.40% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000160 8 91.33% : 0.000146s : 4: match.inline 8.67% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000206 1278 1.08% : 0.000002s : 13: predicate.accumulaten_eliminater 0.83% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.61% : 0.000001s : 8: predicate.addn_check_dump 0.92% : 0.000002s : 13: predicate.addn_zero_filter 0.81% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.61% : 0.000005s : 21: predicate.arithmetic_simplify 0.99% : 0.000002s : 13: predicate.cast_eliminate 0.55% : 0.000001s : 8: predicate.check_bprop_eliminate 0.53% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.64% : 0.000001s : 8: predicate.depend_value_elim 0.90% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.97% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.04% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 4: predicate.elim_not_effective 0.39% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.12% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.11% : 0.000002s : 17: predicate.environ_get_depend_swap 1.69% : 0.000003s : 25: predicate.environ_get_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.41% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.43% : 0.000005s : 21: predicate.float_depend_g_call 0.47% : 0.000001s : 8: predicate.float_environ_get_switch 0.84% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.24% : 0.000000s : 4: predicate.fold_const_symbol 0.68% : 0.000001s : 8: predicate.get_grad_eliminate 0.23% : 0.000000s : 4: predicate.graph_param_transform 0.66% : 0.000001s : 8: predicate.incorporate_call 0.50% : 0.000001s : 8: predicate.incorporate_call_switch 6.39% : 0.000013s : 58: predicate.inline 0.74% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.75% : 0.000002s : 8: predicate.less_batch_normalization 1.87% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.48% : 0.000005s : 38: predicate.load_eliminater 1.01% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.27% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.71% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.62% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 13: predicate.minmaximum_grad 1.11% : 0.000002s : 4: predicate.mutable_eliminate 0.33% : 0.000001s : 4: predicate.opt_reshape 0.40% : 0.000001s : 4: predicate.parallel_virtual_node 1.81% : 0.000004s : 21: predicate.partial_defer_inline 1.68% : 0.000003s : 21: predicate.partial_eliminate 0.86% : 0.000002s : 13: predicate.print_const_string_wrapper 0.55% : 0.000001s : 8: predicate.reduce_all_const_elim 1.14% : 0.000002s : 13: predicate.reduce_eliminate 2.77% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 8: predicate.remove_not_recompute_node 1.40% : 0.000003s : 25: predicate.replace_applicator 0.51% : 0.000001s : 8: predicate.replace_old_param 0.29% : 0.000001s : 4: predicate.reset_defer_inline 0.92% : 0.000002s : 13: predicate.reshape_eliminate 0.62% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 4: predicate.row_tensor_eliminate 0.75% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.97% : 0.000002s : 8: predicate.shard_identity_eliminate 0.70% : 0.000001s : 8: predicate.special_op_eliminate 0.65% : 0.000001s : 8: predicate.specialize_transform 1.08% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.54% : 0.000003s : 21: predicate.switch_defer_inline 2.04% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.05% : 0.000010s : 67: predicate.switch_simplify 0.80% : 0.000002s : 13: predicate.tile_eliminate 0.87% : 0.000002s : 13: predicate.transpose_eliminate 1.61% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.50% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.01% : 0.000006s : 33: predicate.tuple_list_get_item_eliminator 1.41% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.19% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.78% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.36% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.01% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.66% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 8: predicate.virtual_output_eliminate 0.32% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000570 11 54.72% : 0.000312s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.28% : 0.000258s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026263 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.81% : 0.003365s : 1: add_attr 12.77% : 0.003354s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.20% : 0.000052s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.27% : 0.000071s : 1: auto_monad 0.08% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.90% : 0.000499s : 1: bootstrap 0.12% : 0.000032s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.09% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000006s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.11% : 0.000028s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.77% : 0.000466s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.15% : 0.000564s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 4.74% : 0.001246s : 78: opt.transform.opt_a 0.11% : 0.000030s : 1: opt.transform.opt_after_cconv 0.10% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.39% : 0.000102s : 28: opt.transform.opt_b 0.19% : 0.000051s : 2: opt.transform.opt_trans_graph 0.15% : 0.000039s : 4: opt.transform.symbol_engine_opt 10.84% : 0.002847s : 1: opt_a 0.42% : 0.000109s : 1: opt_after_cconv 1.91% : 0.000501s : 1: opt_after_jit_grad 0.81% : 0.000212s : 1: opt_b 18.89% : 0.004962s : 1: optimize 0.08% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000024s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.16% : 0.000042s : 1: pre_auto_parallel 0.12% : 0.000031s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000016s : 1: remove_dup_value 1.57% : 0.000412s : 1: renormalize.infer 1.23% : 0.000324s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000039s : 1: rewriter_after_opt_a 0.33% : 0.000087s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000084s : 1: symbol_engine_optimizer 0.32% : 0.000083s : 1: tuple_transform 23.55% : 0.006185s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:35.411.288 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:35.411.590 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.018073, [21] [bootstrap]: 0.00046425 [type_inference]: 0.0064839 [event_method]: 2.03e-05 [auto_monad]: 6.502e-05 [graph_reusing]: 5.95002e-06 [inline]: 3.23e-06 [add_attr]: 0.003674, [1] [add_attr_with_inline]: 0.0036599, [1] [Cycle 1]: 0.00010201, [2] [tag_attr]: 2.455e-05 [meta_addattr_fg_expand]: 6.29999e-06 [parallel-infer-symbol]: 3.68e-06 [pre_auto_parallel]: 4.362e-05 [insert-virtual-dataset]: 2.46998e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 1.96e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.00602607, [53] [py_interpret_to_execute]: 3.378e-05 [rewriter_before_opt_a]: 9.726e-05 [opt_a]: 0.0034361, [2] [Cycle 1]: 0.00249642, [45] [expand_dump_flag]: 3.38e-06 [switch_simplify]: 4.277e-05 [loop_unroll]: 3.16e-05 [a_1]: 0.0006739 [with_stream_mark]: 2.133e-05 [recompute_prepare]: 9.87999e-06 [updatestate_depend_eliminate]: 4.09002e-06 [updatestate_assign_eliminate]: 3.34001e-06 [updatestate_loads_eliminate]: 3.38e-06 [parameter_eliminate]: 2.12999e-06 [a_2]: 0.00012324 [accelerated_algorithm]: 8.50999e-06 [shard]: 2.11e-06 [meta_shard_fg_expand]: 2.09e-06 [shard_inline]: 6.91999e-06 [merge_send_recv]: 8.90001e-06 [auto_parallel]: 7.3e-06 [parallel]: 2.003e-05 [flash_sp]: 9.51e-06 [merge_comm]: 4.02e-06 [allreduce_fusion]: 4.03001e-06 [matmul_add_comm_reduction]: 1.052e-05 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 8.72e-06 [virtual_dataset]: 7.01001e-06 [get_grad_eliminate_]: 6.34001e-06 [virtual_output]: 6.65998e-06 [merge_forward]: 4.35e-06 [cell_reuse_recompute_pass]: 1.27e-06 [offload_activation]: 1.074e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.498e-05 [merge_recompute_call_nodes]: 1.58997e-06 [before_grad]: 1.223e-05 [set_forward_comm_id_for_comm_node_pass]: 3.63999e-06 [meta_fg_expand]: 3.09999e-06 [flash_sp_send_recv_attached]: 2.59999e-06 [receive_attached]: 2.29999e-06 [after_resolve]: 1.22e-05 [a_after_grad]: 1.135e-05 [renormalize]: 0.00085831 [add_forward_monad_depend]: 7.18e-06 [auto_monad_grad]: 2.46e-06 [auto_monad_eliminator]: 1.578e-05 [cse]: 2.903e-05 [a_3]: 6.614e-05 [Cycle 2]: 0.00092279, [45] [expand_dump_flag]: 1.82999e-06 [switch_simplify]: 9.56e-06 [loop_unroll]: 6.66999e-06 [a_1]: 0.00015707 [with_stream_mark]: 1.378e-05 [recompute_prepare]: 6.99001e-06 [updatestate_depend_eliminate]: 3.35e-06 [updatestate_assign_eliminate]: 2.36e-06 [updatestate_loads_eliminate]: 2.24999e-06 [parameter_eliminate]: 1.33002e-06 [a_2]: 0.00012427 [accelerated_algorithm]: 7.75e-06 [shard]: 1.47001e-06 [meta_shard_fg_expand]: 2.36e-06 [shard_inline]: 6.91999e-06 [merge_send_recv]: 6.01e-06 [auto_parallel]: 6.80002e-06 [parallel]: 7.02002e-06 [flash_sp]: 4.48999e-06 [merge_comm]: 3.85998e-06 [allreduce_fusion]: 3.73999e-06 [matmul_add_comm_reduction]: 6.69999e-06 [allreduce_slice_to_reducescatter]: 8.09989e-07 [virtual_shard_identity]: 7.97003e-06 [virtual_dataset]: 6.30002e-06 [get_grad_eliminate_]: 6.06e-06 [virtual_output]: 5.67999e-06 [merge_forward]: 3.88999e-06 [cell_reuse_recompute_pass]: 2.36e-06 [offload_activation]: 8.79003e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.419e-05 [merge_recompute_call_nodes]: 1.34998e-06 [before_grad]: 1.145e-05 [set_forward_comm_id_for_comm_node_pass]: 4.26001e-06 [meta_fg_expand]: 2.51e-06 [flash_sp_send_recv_attached]: 1.00999e-06 [receive_attached]: 1.52001e-06 [after_resolve]: 1.446e-05 [a_after_grad]: 1.112e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.07001e-06 [auto_monad_grad]: 1.74e-06 [auto_monad_eliminator]: 9.34e-06 [cse]: 1.683e-05 [a_3]: 5.089e-05 [py_interpret_to_execute_after_opt_a]: 1.602e-05 [slice_cell_reuse_recomputed_activation]: 5.05001e-06 [rewriter_after_opt_a]: 4.239e-05 [convert_after_rewriter]: 9.96e-06 [order_py_execute_after_rewriter]: 7.83001e-06 [mutable_eliminate]: 0.00073846 [opt_b]: 0.00027929, [1] [Cycle 1]: 0.00026855, [7] [b_1]: 0.00017031 [b_2]: 8.42e-06 [updatestate_depend_eliminate]: 7.26999e-06 [updatestate_assign_eliminate]: 2.44999e-06 [updatestate_loads_eliminate]: 2.43e-06 [renormalize]: 6.39993e-07 [cse]: 1.976e-05 [optimize_parallel_all_gather_comm]: 2.114e-05 [overlap_param_gather]: 5.14003e-06 [cconv]: 3.389e-05 [loop_unroll]: 0.00047239 [opt_after_cconv]: 0.00013021, [1] [Cycle 1]: 0.00012046, [7] [c_1]: 3.152e-05 [parameter_eliminate]: 4.25e-06 [updatestate_depend_eliminate]: 5.48002e-06 [updatestate_assign_eliminate]: 2.61e-06 [updatestate_loads_eliminate]: 2.37999e-06 [cse]: 1.823e-05 [renormalize]: 5.00004e-07 [remove_dup_value]: 1.674e-05 [tuple_transform]: 9.164e-05, [1] [Cycle 1]: 8.448e-05, [4] [d_1]: 4.538e-05 [none_parameter_eliminate]: 1.81e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 7.06999e-06 [partial_unused_args_eliminate]: 4.43001e-06 [add_recomputation]: 5.37e-05 [cse_after_recomputation]: 2.691e-05, [1] [Cycle 1]: 2.016e-05, [1] [cse]: 1.141e-05 [environ_conv]: 8.48999e-06 [swap_dp_allreduce_reducescatter]: 7.84002e-06 [bias_add_comm_swap]: 6.20002e-06 [label_micro_interleaved_index]: 6.83e-06 [label_fine_grained_interleaved_index]: 5.07999e-06 [merge_cast_opt]: 3.81999e-06 [slice_recompute_activation]: 4.62998e-06 [micro_interleaved_order_control]: 5.32001e-06 [assign_add_opt]: 4.07e-06 [ForceFp32Comm]: 3.24001e-06 [remove_cast_before_assign_add]: 3.38e-06 [full_micro_interleaved_order_control]: 5.12e-06 [reorder_send_recv_between_fp_bp]: 5.41998e-06 [comm_op_add_attrs]: 3.5e-06 [add_comm_op_reuse_tag]: 3.42002e-06 [interleave_split_concat_branches]: 3.75e-06 [interleave_parallel_branches]: 3.51001e-06 [overlap_opt_shard_in_pipeline]: 3.8e-06 [overlap_opt_shard_grad_in_pipeline]: 4.44002e-06 [control_data_broadcast_order]: 1.518e-05 [grouped_pairwise_exchange_alltoall]: 4.20999e-06 [offloading_packed_experts]: 6.61e-06 [overlap_recompute_and_grad_model_parallel]: 7.18998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.70998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.90998e-06 [overlap_recompute_comm]: 5.19998e-06 [overlap_grad_ring_attention]: 7.18998e-06 [overlap_grad_flash_sp]: 2.573e-05 [begin_end_overlap_inline]: 3.35e-06 [split_matmul_comm_elemetwise]: 4.72e-06 [split_layernorm_comm]: 4.17e-06 [handle_group_info]: 3.56001e-06 [symbol_engine_optimizer]: 0.00010114, [1] [Cycle 1]: 9.379e-05, [6] [build]: 3.83001e-06 [elim_shapecalc]: 1.038e-05 [elim_not_effective]: 1.407e-05 [opt_reshape]: 7.85e-06 [fold_const_symbol]: 1.093e-05 [renormalize]: 1.8999e-07 [detach_backward]: 4.15999e-06 [pipeline_parallel_scheduler]: 1.87999e-06 [auto_monad_reorder]: 2.089e-05 [get_jit_bprop_graph]: 1.89999e-06 [rewriter_after_jit_bprop_graph]: 5.46998e-06 [opt_after_jit_grad]: 0.00058585 [validate]: 4.527e-05 Sums bootstrap : 0.000464s : 3.69% type_inference : 0.006484s : 51.50% event_method : 0.000020s : 0.16% auto_monad : 0.000065s : 0.52% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000044s : 0.35% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000034s : 0.27% optimize.rewriter_before_opt_a : 0.000097s : 0.77% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000052s : 0.42% optimize.opt_a.loop_unroll : 0.000038s : 0.30% optimize.opt_a.a_1 : 0.000831s : 6.60% optimize.opt_a.with_stream_mark : 0.000035s : 0.28% optimize.opt_a.recompute_prepare : 0.000017s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000248s : 1.97% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000014s : 0.11% optimize.opt_a.merge_send_recv : 0.000015s : 0.12% optimize.opt_a.auto_parallel : 0.000014s : 0.11% optimize.opt_a.parallel : 0.000027s : 0.21% optimize.opt_a.flash_sp : 0.000014s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000008s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.13% optimize.opt_a.virtual_dataset : 0.000013s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.10% optimize.opt_a.virtual_output : 0.000012s : 0.10% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.23% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.21% optimize.opt_a.a_after_grad : 0.000022s : 0.18% optimize.opt_a.renormalize : 0.000858s : 6.82% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.20% optimize.opt_a.cse : 0.000046s : 0.36% optimize.opt_a.a_3 : 0.000117s : 0.93% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000042s : 0.34% optimize.convert_after_rewriter : 0.000010s : 0.08% optimize.order_py_execute_after_rewriter : 0.000008s : 0.06% optimize.mutable_eliminate : 0.000738s : 5.87% optimize.opt_b.b_1 : 0.000170s : 1.35% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000020s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000034s : 0.27% optimize.loop_unroll : 0.000472s : 3.75% optimize.opt_after_cconv.c_1 : 0.000032s : 0.25% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.14% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.13% optimize.tuple_transform.d_1 : 0.000045s : 0.36% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000054s : 0.43% optimize.cse_after_recomputation.cse : 0.000011s : 0.09% optimize.environ_conv : 0.000008s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.06% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000015s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000026s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000586s : 4.65% validate : 0.000045s : 0.36% Time group info: ------[substitution.] 0.000242 34 15.63% : 0.000038s : 6: substitution.arithmetic_simplify 0.77% : 0.000002s : 2: substitution.elim_not_effective 0.73% : 0.000002s : 2: substitution.fold_const_symbol 2.59% : 0.000006s : 4: substitution.graph_param_transform 67.58% : 0.000164s : 4: substitution.inline 1.90% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.07% : 0.000005s : 4: substitution.remove_not_recompute_node 2.57% : 0.000006s : 4: substitution.replace_old_param 6.17% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006422 2 85.76% : 0.005507s : 1: type_inference.infer 14.24% : 0.000915s : 1: type_inference.specialize ------[replace.] 0.000065 8 62.49% : 0.000041s : 4: replace.inline 37.51% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000174 8 92.46% : 0.000161s : 4: match.inline 7.54% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000225 1278 0.89% : 0.000002s : 13: predicate.accumulaten_eliminater 0.84% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 8: predicate.addn_check_dump 1.03% : 0.000002s : 13: predicate.addn_zero_filter 0.77% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.50% : 0.000006s : 21: predicate.arithmetic_simplify 1.10% : 0.000002s : 13: predicate.cast_eliminate 0.75% : 0.000002s : 8: predicate.check_bprop_eliminate 0.46% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.72% : 0.000002s : 8: predicate.depend_value_elim 1.33% : 0.000003s : 13: predicate.dict_get_item_const_eliminator 0.97% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.50% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 17: predicate.environ_add_const_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_depend_swap 1.60% : 0.000004s : 25: predicate.environ_get_eliminate 1.26% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.28% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.36% : 0.000005s : 21: predicate.float_depend_g_call 0.51% : 0.000001s : 8: predicate.float_environ_get_switch 0.71% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.65% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.71% : 0.000002s : 8: predicate.incorporate_call 0.44% : 0.000001s : 8: predicate.incorporate_call_switch 6.55% : 0.000015s : 58: predicate.inline 0.85% : 0.000002s : 8: predicate.inline_without_move 0.38% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.06% : 0.000002s : 8: predicate.less_batch_normalization 1.72% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.31% : 0.000005s : 38: predicate.load_eliminater 0.78% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.26% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.53% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.91% : 0.000002s : 8: predicate.merge_addn 0.60% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 13: predicate.minmaximum_grad 1.06% : 0.000002s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.68% : 0.000002s : 4: predicate.parallel_virtual_node 1.80% : 0.000004s : 21: predicate.partial_defer_inline 1.53% : 0.000003s : 21: predicate.partial_eliminate 0.95% : 0.000002s : 13: predicate.print_const_string_wrapper 0.55% : 0.000001s : 8: predicate.reduce_all_const_elim 1.26% : 0.000003s : 13: predicate.reduce_eliminate 2.30% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 8: predicate.remove_not_recompute_node 1.26% : 0.000003s : 25: predicate.replace_applicator 0.55% : 0.000001s : 8: predicate.replace_old_param 0.24% : 0.000001s : 4: predicate.reset_defer_inline 1.01% : 0.000002s : 13: predicate.reshape_eliminate 0.63% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 4: predicate.row_tensor_eliminate 0.70% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.85% : 0.000002s : 8: predicate.shard_identity_eliminate 0.68% : 0.000002s : 8: predicate.special_op_eliminate 0.76% : 0.000002s : 8: predicate.specialize_transform 1.03% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 1.13% : 0.000003s : 8: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.37% : 0.000003s : 21: predicate.switch_defer_inline 1.86% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.00% : 0.000011s : 67: predicate.switch_simplify 0.99% : 0.000002s : 13: predicate.tile_eliminate 0.94% : 0.000002s : 13: predicate.transpose_eliminate 1.47% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.38% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.44% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.92% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.20% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.38% : 0.000008s : 46: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000609 11 52.41% : 0.000319s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.59% : 0.000290s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030074 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.25% : 0.003686s : 1: add_attr 12.19% : 0.003665s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.19% : 0.000058s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.25% : 0.000074s : 1: auto_monad 0.10% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.70% : 0.000512s : 1: bootstrap 0.12% : 0.000037s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.06% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.10% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000020s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.10% : 0.000031s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.59% : 0.000479s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.48% : 0.000746s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.05% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 4.34% : 0.001306s : 78: opt.transform.opt_a 0.10% : 0.000030s : 1: opt.transform.opt_after_cconv 0.09% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.35% : 0.000105s : 28: opt.transform.opt_b 0.17% : 0.000050s : 2: opt.transform.opt_trans_graph 0.13% : 0.000039s : 4: opt.transform.symbol_engine_opt 11.44% : 0.003439s : 1: opt_a 0.44% : 0.000134s : 1: opt_after_cconv 1.99% : 0.000597s : 1: opt_after_jit_grad 0.94% : 0.000283s : 1: opt_b 21.18% : 0.006370s : 1: optimize 0.08% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.17% : 0.000052s : 1: pre_auto_parallel 0.13% : 0.000038s : 1: py_interpret_to_execute 0.06% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.62% : 0.000486s : 1: renormalize.infer 1.20% : 0.000362s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000046s : 1: rewriter_after_opt_a 0.34% : 0.000101s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000104s : 1: symbol_engine_optimizer 0.31% : 0.000094s : 1: tuple_transform 21.71% : 0.006530s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:36.526.908 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0294992, [21] [bootstrap]: 0.00043745 [type_inference]: 0.00592456 [event_method]: 1.961e-05 [auto_monad]: 6.068e-05 [graph_reusing]: 5.42001e-06 [inline]: 2.04999e-06 [add_attr]: 0.00334926, [1] [add_attr_with_inline]: 0.00333878, [1] [Cycle 1]: 6.637e-05, [2] [tag_attr]: 2.037e-05 [meta_addattr_fg_expand]: 6.07001e-06 [parallel-infer-symbol]: 3.81999e-06 [pre_auto_parallel]: 3.809e-05 [insert-virtual-dataset]: 2.60002e-06 [parallel-infer-symbol-second]: 9.50007e-07 [dataset_repeat_opt]: 2.02999e-06 [pipeline_split]: 1.72999e-06 [optimize]: 0.0189236, [53] [py_interpret_to_execute]: 2.517e-05 [rewriter_before_opt_a]: 8.381e-05 [opt_a]: 0.0163844, [2] [Cycle 1]: 0.0155863, [45] [expand_dump_flag]: 3.63e-06 [switch_simplify]: 4.276e-05 [loop_unroll]: 2.971e-05 [a_1]: 0.00065571 [with_stream_mark]: 1.892e-05 [recompute_prepare]: 1.034e-05 [updatestate_depend_eliminate]: 4.1e-06 [updatestate_assign_eliminate]: 3.48e-06 [updatestate_loads_eliminate]: 3.21001e-06 [parameter_eliminate]: 1.91998e-06 [a_2]: 8.731e-05 [accelerated_algorithm]: 7.63999e-06 [shard]: 1.64e-06 [meta_shard_fg_expand]: 1.84e-06 [shard_inline]: 6.63998e-06 [merge_send_recv]: 8.74003e-06 [auto_parallel]: 8.20999e-06 [parallel]: 1.949e-05 [flash_sp]: 9.32999e-06 [merge_comm]: 3.95e-06 [allreduce_fusion]: 3.72002e-06 [matmul_add_comm_reduction]: 9.42001e-06 [allreduce_slice_to_reducescatter]: 1.09e-06 [virtual_shard_identity]: 8.89e-06 [virtual_dataset]: 6.70002e-06 [get_grad_eliminate_]: 6.38003e-06 [virtual_output]: 6.53e-06 [merge_forward]: 4.24002e-06 [cell_reuse_recompute_pass]: 1.35001e-06 [offload_activation]: 1.019e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.215e-05 [merge_recompute_call_nodes]: 1.40999e-06 [before_grad]: 1.044e-05 [set_forward_comm_id_for_comm_node_pass]: 3.39001e-06 [meta_fg_expand]: 2.70997e-06 [flash_sp_send_recv_attached]: 2.36998e-06 [receive_attached]: 2.34999e-06 [after_resolve]: 1.141e-05 [a_after_grad]: 1.034e-05 [renormalize]: 0.0141203 [add_forward_monad_depend]: 1.138e-05 [auto_monad_grad]: 2.77002e-06 [auto_monad_eliminator]: 2.529e-05 [cse]: 3.299e-05 [a_3]: 6.586e-05 [Cycle 2]: 0.0007842, [45] [expand_dump_flag]: 2.06e-06 [switch_simplify]: 9.49999e-06 [loop_unroll]: 7.26999e-06 [a_1]: 0.00016521 [with_stream_mark]: 2.205e-05 [recompute_prepare]: 7.06001e-06 [updatestate_depend_eliminate]: 4.60999e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 3.17002e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 8.266e-05 [accelerated_algorithm]: 8.41002e-06 [shard]: 3.48999e-06 [meta_shard_fg_expand]: 2.30002e-06 [shard_inline]: 6.68e-06 [merge_send_recv]: 9.12999e-06 [auto_parallel]: 9.66e-06 [parallel]: 9.42001e-06 [flash_sp]: 5.00999e-06 [merge_comm]: 3.82998e-06 [allreduce_fusion]: 3.9e-06 [matmul_add_comm_reduction]: 9.43002e-06 [allreduce_slice_to_reducescatter]: 1.02e-06 [virtual_shard_identity]: 8.79e-06 [virtual_dataset]: 6.13998e-06 [get_grad_eliminate_]: 6.01e-06 [virtual_output]: 9.15001e-06 [merge_forward]: 4.35e-06 [cell_reuse_recompute_pass]: 3.65e-06 [offload_activation]: 1.131e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.331e-05 [merge_recompute_call_nodes]: 1.79e-06 [before_grad]: 1.045e-05 [set_forward_comm_id_for_comm_node_pass]: 4.70001e-06 [meta_fg_expand]: 2.51e-06 [flash_sp_send_recv_attached]: 1.71002e-06 [receive_attached]: 3.36001e-06 [after_resolve]: 1.296e-05 [a_after_grad]: 1.066e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.84e-06 [auto_monad_grad]: 1.60999e-06 [auto_monad_eliminator]: 1.036e-05 [cse]: 1.805e-05 [a_3]: 3.755e-05 [py_interpret_to_execute_after_opt_a]: 1.757e-05 [slice_cell_reuse_recomputed_activation]: 2.39001e-06 [rewriter_after_opt_a]: 3.992e-05 [convert_after_rewriter]: 7.8e-06 [order_py_execute_after_rewriter]: 4.87e-06 [mutable_eliminate]: 0.00086904 [opt_b]: 0.00022972, [1] [Cycle 1]: 0.00022081, [7] [b_1]: 0.00013385 [b_2]: 8.33999e-06 [updatestate_depend_eliminate]: 9.16998e-06 [updatestate_assign_eliminate]: 3.06999e-06 [updatestate_loads_eliminate]: 2.74001e-06 [renormalize]: 5.49975e-07 [cse]: 2.403e-05 [optimize_parallel_all_gather_comm]: 1.834e-05 [overlap_param_gather]: 1.93002e-06 [cconv]: 3.269e-05 [loop_unroll]: 0.00048125 [opt_after_cconv]: 0.00011407, [1] [Cycle 1]: 0.00010678, [7] [c_1]: 3.176e-05 [parameter_eliminate]: 5.51998e-06 [updatestate_depend_eliminate]: 6.46e-06 [updatestate_assign_eliminate]: 3.58999e-06 [updatestate_loads_eliminate]: 2.29001e-06 [cse]: 2.031e-05 [renormalize]: 5.40022e-07 [remove_dup_value]: 1.377e-05 [tuple_transform]: 8.196e-05, [1] [Cycle 1]: 7.767e-05, [4] [d_1]: 4.916e-05 [none_parameter_eliminate]: 2.39001e-06 [renormalize]: 2.60014e-07 [switch_simplify]: 6.89999e-06 [partial_unused_args_eliminate]: 1.81e-06 [add_recomputation]: 5.676e-05 [cse_after_recomputation]: 2.14e-05, [1] [Cycle 1]: 1.667e-05, [1] [cse]: 1.142e-05 [environ_conv]: 5.62001e-06 [swap_dp_allreduce_reducescatter]: 5.43002e-06 [bias_add_comm_swap]: 2.94001e-06 [label_micro_interleaved_index]: 4.95001e-06 [label_fine_grained_interleaved_index]: 2.63998e-06 [merge_cast_opt]: 1.33002e-06 [slice_recompute_activation]: 2.27999e-06 [micro_interleaved_order_control]: 2.69999e-06 [assign_add_opt]: 1.19998e-06 [ForceFp32Comm]: 8.59989e-07 [remove_cast_before_assign_add]: 1.06997e-06 [full_micro_interleaved_order_control]: 2.38002e-06 [reorder_send_recv_between_fp_bp]: 3.03e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.13001e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.16002e-06 [overlap_opt_shard_grad_in_pipeline]: 1.86e-06 [control_data_broadcast_order]: 1.381e-05 [grouped_pairwise_exchange_alltoall]: 1.51998e-06 [offloading_packed_experts]: 3.99002e-06 [overlap_recompute_and_grad_model_parallel]: 4.72e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.21002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.33002e-06 [overlap_recompute_comm]: 2.89001e-06 [overlap_grad_ring_attention]: 4.37e-06 [overlap_grad_flash_sp]: 2.193e-05 [begin_end_overlap_inline]: 5.8001e-07 [split_matmul_comm_elemetwise]: 2.66e-06 [split_layernorm_comm]: 2.62001e-06 [handle_group_info]: 1.07e-06 [symbol_engine_optimizer]: 8.205e-05, [1] [Cycle 1]: 7.666e-05, [6] [build]: 4.22e-06 [elim_shapecalc]: 1.065e-05 [elim_not_effective]: 1.349e-05 [opt_reshape]: 7.92998e-06 [fold_const_symbol]: 1.017e-05 [renormalize]: 2.59985e-07 [detach_backward]: 2.47001e-06 [pipeline_parallel_scheduler]: 1.72001e-06 [auto_monad_reorder]: 1.688e-05 [get_jit_bprop_graph]: 2.63003e-06 [rewriter_after_jit_bprop_graph]: 6.24999e-06 [opt_after_jit_grad]: 0.00049432 [validate]: 4.548e-05 Sums bootstrap : 0.000437s : 1.74% type_inference : 0.005925s : 23.61% event_method : 0.000020s : 0.08% auto_monad : 0.000061s : 0.24% graph_reusing : 0.000005s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000038s : 0.15% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000025s : 0.10% optimize.rewriter_before_opt_a : 0.000084s : 0.33% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000052s : 0.21% optimize.opt_a.loop_unroll : 0.000037s : 0.15% optimize.opt_a.a_1 : 0.000821s : 3.27% optimize.opt_a.with_stream_mark : 0.000041s : 0.16% optimize.opt_a.recompute_prepare : 0.000017s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000170s : 0.68% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.06% optimize.opt_a.shard : 0.000005s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000013s : 0.05% optimize.opt_a.merge_send_recv : 0.000018s : 0.07% optimize.opt_a.auto_parallel : 0.000018s : 0.07% optimize.opt_a.parallel : 0.000029s : 0.12% optimize.opt_a.flash_sp : 0.000014s : 0.06% optimize.opt_a.merge_comm : 0.000008s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.07% optimize.opt_a.virtual_dataset : 0.000013s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.05% optimize.opt_a.virtual_output : 0.000016s : 0.06% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000022s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000021s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.03% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000006s : 0.02% optimize.opt_a.after_resolve : 0.000024s : 0.10% optimize.opt_a.a_after_grad : 0.000021s : 0.08% optimize.opt_a.renormalize : 0.014120s : 56.28% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.05% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000036s : 0.14% optimize.opt_a.cse : 0.000051s : 0.20% optimize.opt_a.a_3 : 0.000103s : 0.41% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000040s : 0.16% optimize.convert_after_rewriter : 0.000008s : 0.03% optimize.order_py_execute_after_rewriter : 0.000005s : 0.02% optimize.mutable_eliminate : 0.000869s : 3.46% optimize.opt_b.b_1 : 0.000134s : 0.53% optimize.opt_b.b_2 : 0.000008s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000033s : 0.13% optimize.loop_unroll : 0.000481s : 1.92% optimize.opt_after_cconv.c_1 : 0.000032s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000020s : 0.08% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000014s : 0.05% optimize.tuple_transform.d_1 : 0.000049s : 0.20% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000057s : 0.23% optimize.cse_after_recomputation.cse : 0.000011s : 0.05% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000022s : 0.09% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000003s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000017s : 0.07% get_jit_bprop_graph : 0.000003s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000494s : 1.97% validate : 0.000045s : 0.18% Time group info: ------[substitution.] 0.000232 34 17.74% : 0.000041s : 6: substitution.arithmetic_simplify 0.77% : 0.000002s : 2: substitution.elim_not_effective 0.62% : 0.000001s : 2: substitution.fold_const_symbol 2.90% : 0.000007s : 4: substitution.graph_param_transform 64.89% : 0.000151s : 4: substitution.inline 1.88% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.39% : 0.000006s : 4: substitution.remove_not_recompute_node 2.35% : 0.000005s : 4: substitution.replace_old_param 6.47% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005859 2 88.07% : 0.005160s : 1: type_inference.infer 11.93% : 0.000699s : 1: type_inference.specialize ------[replace.] 0.000062 8 62.84% : 0.000039s : 4: replace.inline 37.16% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000161 8 91.77% : 0.000148s : 4: match.inline 8.23% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000222 1278 0.89% : 0.000002s : 13: predicate.accumulaten_eliminater 0.65% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 1.04% : 0.000002s : 13: predicate.addn_zero_filter 0.79% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.45% : 0.000005s : 21: predicate.arithmetic_simplify 0.92% : 0.000002s : 13: predicate.cast_eliminate 0.57% : 0.000001s : 8: predicate.check_bprop_eliminate 0.51% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.55% : 0.000001s : 8: predicate.depend_value_elim 0.98% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.07% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.03% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.21% : 0.000003s : 17: predicate.environ_get_depend_swap 1.71% : 0.000004s : 25: predicate.environ_get_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.35% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.30% : 0.000005s : 21: predicate.float_depend_g_call 0.49% : 0.000001s : 8: predicate.float_environ_get_switch 0.78% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.77% : 0.000002s : 8: predicate.get_grad_eliminate 0.28% : 0.000001s : 4: predicate.graph_param_transform 0.52% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.57% : 0.000015s : 58: predicate.inline 0.80% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.25% : 0.000003s : 8: predicate.less_batch_normalization 1.71% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.26% : 0.000005s : 38: predicate.load_eliminater 0.97% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.23% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.43% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 8: predicate.merge_addn 0.59% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 13: predicate.minmaximum_grad 1.31% : 0.000003s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.40% : 0.000001s : 4: predicate.parallel_virtual_node 1.92% : 0.000004s : 21: predicate.partial_defer_inline 1.54% : 0.000003s : 21: predicate.partial_eliminate 0.95% : 0.000002s : 13: predicate.print_const_string_wrapper 0.55% : 0.000001s : 8: predicate.reduce_all_const_elim 1.11% : 0.000002s : 13: predicate.reduce_eliminate 2.32% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 8: predicate.remove_not_recompute_node 1.52% : 0.000003s : 25: predicate.replace_applicator 0.70% : 0.000002s : 8: predicate.replace_old_param 0.34% : 0.000001s : 4: predicate.reset_defer_inline 0.92% : 0.000002s : 13: predicate.reshape_eliminate 0.63% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 4: predicate.row_tensor_eliminate 0.84% : 0.000002s : 8: predicate.same_eliminate 0.60% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.92% : 0.000002s : 8: predicate.shard_identity_eliminate 0.62% : 0.000001s : 8: predicate.special_op_eliminate 0.80% : 0.000002s : 8: predicate.specialize_transform 1.07% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.86% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.45% : 0.000003s : 21: predicate.switch_defer_inline 1.92% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.69% : 0.000010s : 67: predicate.switch_simplify 0.88% : 0.000002s : 13: predicate.tile_eliminate 0.91% : 0.000002s : 13: predicate.transpose_eliminate 1.61% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.58% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.30% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.61% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.66% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.27% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.90% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 4: predicate.value_based_eliminate 0.72% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.78% : 0.000002s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000627 11 46.29% : 0.000290s : 5: func_graph_cloner_run.FuncGraphClonerGraph 53.71% : 0.000337s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.067317 192 0.01% : 0.000004s : 1: ForceFp32Comm 4.98% : 0.003355s : 1: add_attr 4.97% : 0.003343s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000061s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.10% : 0.000066s : 1: auto_monad 0.03% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.69% : 0.000467s : 1: bootstrap 0.05% : 0.000036s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000017s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.04% : 0.000024s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.04% : 0.000026s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.73% : 0.000491s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.31% : 0.000881s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000022s : 1: opt.transform.mutable_eliminate 1.90% : 0.001277s : 78: opt.transform.opt_a 0.04% : 0.000030s : 1: opt.transform.opt_after_cconv 0.04% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.16% : 0.000106s : 28: opt.transform.opt_b 0.08% : 0.000054s : 2: opt.transform.opt_trans_graph 0.06% : 0.000038s : 4: opt.transform.symbol_engine_opt 24.34% : 0.016388s : 1: opt_a 0.18% : 0.000118s : 1: opt_after_cconv 0.75% : 0.000504s : 1: opt_after_jit_grad 0.35% : 0.000234s : 1: opt_b 28.12% : 0.018929s : 1: optimize 0.03% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.04% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.06% : 0.000042s : 1: pre_auto_parallel 0.04% : 0.000030s : 1: py_interpret_to_execute 0.03% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000017s : 1: remove_dup_value 20.18% : 0.013581s : 1: renormalize.infer 0.78% : 0.000523s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000044s : 1: rewriter_after_opt_a 0.13% : 0.000088s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000006s : 1: split_layernorm_comm 0.04% : 0.000025s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.000085s : 1: symbol_engine_optimizer 0.13% : 0.000085s : 1: tuple_transform 8.83% : 0.005944s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:37.587.694 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:37.587.981 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0740469, [21] [bootstrap]: 0.00044599 [type_inference]: 0.0058429 [event_method]: 1.877e-05 [auto_monad]: 6.322e-05 [graph_reusing]: 6.10002e-06 [inline]: 2.19999e-06 [add_attr]: 0.00357322, [1] [add_attr_with_inline]: 0.00355989, [1] [Cycle 1]: 9.659e-05, [2] [tag_attr]: 2.334e-05 [meta_addattr_fg_expand]: 5.81998e-06 [parallel-infer-symbol]: 3.48e-06 [pre_auto_parallel]: 4.413e-05 [insert-virtual-dataset]: 2.51e-06 [parallel-infer-symbol-second]: 8.80013e-07 [dataset_repeat_opt]: 2.09e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.0626756, [53] [py_interpret_to_execute]: 3.754e-05 [rewriter_before_opt_a]: 0.00010053 [opt_a]: 0.0599395, [2] [Cycle 1]: 0.0589, [45] [expand_dump_flag]: 3.35e-06 [switch_simplify]: 4.549e-05 [loop_unroll]: 3.125e-05 [a_1]: 0.0007182 [with_stream_mark]: 2.773e-05 [recompute_prepare]: 1.604e-05 [updatestate_depend_eliminate]: 4.99e-06 [updatestate_assign_eliminate]: 3.61999e-06 [updatestate_loads_eliminate]: 3.38e-06 [parameter_eliminate]: 2.60002e-06 [a_2]: 0.00012623 [accelerated_algorithm]: 9.20999e-06 [shard]: 2.95002e-06 [meta_shard_fg_expand]: 2.20002e-06 [shard_inline]: 6.81999e-06 [merge_send_recv]: 1e-05 [auto_parallel]: 1.019e-05 [parallel]: 2.174e-05 [flash_sp]: 1.307e-05 [merge_comm]: 4.52e-06 [allreduce_fusion]: 3.75e-06 [matmul_add_comm_reduction]: 1.149e-05 [allreduce_slice_to_reducescatter]: 1.27999e-06 [virtual_shard_identity]: 1.302e-05 [virtual_dataset]: 7.92003e-06 [get_grad_eliminate_]: 6.76999e-06 [virtual_output]: 7.25998e-06 [merge_forward]: 4.47e-06 [cell_reuse_recompute_pass]: 2.22001e-06 [offload_activation]: 1.274e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.058e-05 [merge_recompute_call_nodes]: 1.89e-06 [before_grad]: 1.415e-05 [set_forward_comm_id_for_comm_node_pass]: 6.01998e-06 [meta_fg_expand]: 3.03e-06 [flash_sp_send_recv_attached]: 3.25e-06 [receive_attached]: 2.94001e-06 [after_resolve]: 1.7e-05 [a_after_grad]: 1.104e-05 [renormalize]: 0.00094266 [add_forward_monad_depend]: 1.74e-05 [auto_monad_grad]: 3.45998e-06 [auto_monad_eliminator]: 4.161e-05 [cse]: 3.549e-05 [a_3]: 8.751e-05 [Cycle 2]: 0.00101923, [45] [expand_dump_flag]: 3.06001e-06 [switch_simplify]: 9.88998e-06 [loop_unroll]: 7.18e-06 [a_1]: 0.00017531 [with_stream_mark]: 2.121e-05 [recompute_prepare]: 7.85998e-06 [updatestate_depend_eliminate]: 4.37e-06 [updatestate_assign_eliminate]: 3.30003e-06 [updatestate_loads_eliminate]: 2.89999e-06 [parameter_eliminate]: 2.32001e-06 [a_2]: 0.00011054 [accelerated_algorithm]: 6.94001e-06 [shard]: 2.83998e-06 [meta_shard_fg_expand]: 2.94999e-06 [shard_inline]: 6.31e-06 [merge_send_recv]: 9.59999e-06 [auto_parallel]: 1.016e-05 [parallel]: 1.051e-05 [flash_sp]: 3.99997e-06 [merge_comm]: 3.91999e-06 [allreduce_fusion]: 3.58e-06 [matmul_add_comm_reduction]: 3.532e-05 [allreduce_slice_to_reducescatter]: 7.99977e-07 [virtual_shard_identity]: 9.75002e-06 [virtual_dataset]: 6.56e-06 [get_grad_eliminate_]: 6.46e-06 [virtual_output]: 6.58e-06 [merge_forward]: 5.27001e-06 [cell_reuse_recompute_pass]: 3.86999e-06 [offload_activation]: 1.081e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.614e-05 [merge_recompute_call_nodes]: 1.79998e-06 [before_grad]: 1.052e-05 [set_forward_comm_id_for_comm_node_pass]: 4.63999e-06 [meta_fg_expand]: 3.48999e-06 [flash_sp_send_recv_attached]: 1.94e-06 [receive_attached]: 2.04e-06 [after_resolve]: 1.649e-05 [a_after_grad]: 1.038e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.59999e-06 [auto_monad_grad]: 1.93002e-06 [auto_monad_eliminator]: 1.135e-05 [cse]: 1.847e-05 [a_3]: 5.142e-05 [py_interpret_to_execute_after_opt_a]: 2.368e-05 [slice_cell_reuse_recomputed_activation]: 6.16e-06 [rewriter_after_opt_a]: 4.959e-05 [convert_after_rewriter]: 1.041e-05 [order_py_execute_after_rewriter]: 8.89998e-06 [mutable_eliminate]: 0.00079269 [opt_b]: 0.00029932, [1] [Cycle 1]: 0.00028748, [7] [b_1]: 0.00017408 [b_2]: 8.77e-06 [updatestate_depend_eliminate]: 9.93998e-06 [updatestate_assign_eliminate]: 3.26999e-06 [updatestate_loads_eliminate]: 2.89999e-06 [renormalize]: 1.15999e-06 [cse]: 2.881e-05 [optimize_parallel_all_gather_comm]: 2.275e-05 [overlap_param_gather]: 5.57001e-06 [cconv]: 3.793e-05 [loop_unroll]: 0.00048859 [opt_after_cconv]: 0.00013822, [1] [Cycle 1]: 0.0001288, [7] [c_1]: 3.186e-05 [parameter_eliminate]: 5.72999e-06 [updatestate_depend_eliminate]: 6.69999e-06 [updatestate_assign_eliminate]: 2.91999e-06 [updatestate_loads_eliminate]: 2.71e-06 [cse]: 2.088e-05 [renormalize]: 6.69999e-07 [remove_dup_value]: 1.799e-05 [tuple_transform]: 9.486e-05, [1] [Cycle 1]: 8.748e-05, [4] [d_1]: 4.81e-05 [none_parameter_eliminate]: 2.39999e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 6.84001e-06 [partial_unused_args_eliminate]: 4.56002e-06 [add_recomputation]: 5.325e-05 [cse_after_recomputation]: 2.918e-05, [1] [Cycle 1]: 2.134e-05, [1] [cse]: 1.155e-05 [environ_conv]: 9.47001e-06 [swap_dp_allreduce_reducescatter]: 8.42998e-06 [bias_add_comm_swap]: 5.40999e-06 [label_micro_interleaved_index]: 7.53e-06 [label_fine_grained_interleaved_index]: 5.67999e-06 [merge_cast_opt]: 3.98999e-06 [slice_recompute_activation]: 4.82998e-06 [micro_interleaved_order_control]: 4.74e-06 [assign_add_opt]: 3.68e-06 [ForceFp32Comm]: 3.83999e-06 [remove_cast_before_assign_add]: 3.38e-06 [full_micro_interleaved_order_control]: 4.53999e-06 [reorder_send_recv_between_fp_bp]: 5.36002e-06 [comm_op_add_attrs]: 3.57002e-06 [add_comm_op_reuse_tag]: 3.16999e-06 [interleave_split_concat_branches]: 3.58e-06 [interleave_parallel_branches]: 3.38e-06 [overlap_opt_shard_in_pipeline]: 3.63e-06 [overlap_opt_shard_grad_in_pipeline]: 4.43999e-06 [control_data_broadcast_order]: 1.666e-05 [grouped_pairwise_exchange_alltoall]: 3.76999e-06 [offloading_packed_experts]: 6.99001e-06 [overlap_recompute_and_grad_model_parallel]: 7.6e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.96001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.7e-06 [overlap_recompute_comm]: 4.80001e-06 [overlap_grad_ring_attention]: 6.46e-06 [overlap_grad_flash_sp]: 2.573e-05 [begin_end_overlap_inline]: 3.04001e-06 [split_matmul_comm_elemetwise]: 4.4e-06 [split_layernorm_comm]: 4e-06 [handle_group_info]: 3.30998e-06 [symbol_engine_optimizer]: 0.0001026, [1] [Cycle 1]: 9.539e-05, [6] [build]: 4.32998e-06 [elim_shapecalc]: 1.092e-05 [elim_not_effective]: 1.35e-05 [opt_reshape]: 7.26001e-06 [fold_const_symbol]: 1.065e-05 [renormalize]: 2.09984e-07 [detach_backward]: 4.45e-06 [pipeline_parallel_scheduler]: 1.87999e-06 [auto_monad_reorder]: 2.132e-05 [get_jit_bprop_graph]: 1.82001e-06 [rewriter_after_jit_bprop_graph]: 7.24001e-06 [opt_after_jit_grad]: 0.00061304 [validate]: 4.65e-05 Sums bootstrap : 0.000446s : 3.59% type_inference : 0.005843s : 47.07% event_method : 0.000019s : 0.15% auto_monad : 0.000063s : 0.51% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000044s : 0.36% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000038s : 0.30% optimize.rewriter_before_opt_a : 0.000101s : 0.81% optimize.opt_a.expand_dump_flag : 0.000006s : 0.05% optimize.opt_a.switch_simplify : 0.000055s : 0.45% optimize.opt_a.loop_unroll : 0.000038s : 0.31% optimize.opt_a.a_1 : 0.000894s : 7.20% optimize.opt_a.with_stream_mark : 0.000049s : 0.39% optimize.opt_a.recompute_prepare : 0.000024s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000005s : 0.04% optimize.opt_a.a_2 : 0.000237s : 1.91% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.13% optimize.opt_a.shard : 0.000006s : 0.05% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.11% optimize.opt_a.merge_send_recv : 0.000020s : 0.16% optimize.opt_a.auto_parallel : 0.000020s : 0.16% optimize.opt_a.parallel : 0.000032s : 0.26% optimize.opt_a.flash_sp : 0.000017s : 0.14% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000047s : 0.38% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.02% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.18% optimize.opt_a.virtual_dataset : 0.000014s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.11% optimize.opt_a.virtual_output : 0.000014s : 0.11% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.05% optimize.opt_a.offload_activation : 0.000024s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.03% optimize.opt_a.before_grad : 0.000025s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000033s : 0.27% optimize.opt_a.a_after_grad : 0.000021s : 0.17% optimize.opt_a.renormalize : 0.000943s : 7.59% optimize.opt_a.add_forward_monad_depend : 0.000020s : 0.16% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000053s : 0.43% optimize.opt_a.cse : 0.000054s : 0.43% optimize.opt_a.a_3 : 0.000139s : 1.12% optimize.py_interpret_to_execute_after_opt_a : 0.000024s : 0.19% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.05% optimize.rewriter_after_opt_a : 0.000050s : 0.40% optimize.convert_after_rewriter : 0.000010s : 0.08% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000793s : 6.39% optimize.opt_b.b_1 : 0.000174s : 1.40% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000029s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.18% optimize.overlap_param_gather : 0.000006s : 0.04% optimize.cconv : 0.000038s : 0.31% optimize.loop_unroll : 0.000489s : 3.94% optimize.opt_after_cconv.c_1 : 0.000032s : 0.26% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000021s : 0.17% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.14% optimize.tuple_transform.d_1 : 0.000048s : 0.39% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000053s : 0.43% optimize.cse_after_recomputation.cse : 0.000012s : 0.09% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000008s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000006s : 0.05% optimize.overlap_grad_flash_sp : 0.000026s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000021s : 0.17% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.06% opt_after_jit_grad : 0.000613s : 4.94% validate : 0.000047s : 0.37% Time group info: ------[substitution.] 0.000271 34 16.63% : 0.000045s : 6: substitution.arithmetic_simplify 0.72% : 0.000002s : 2: substitution.elim_not_effective 0.55% : 0.000001s : 2: substitution.fold_const_symbol 2.36% : 0.000006s : 4: substitution.graph_param_transform 66.81% : 0.000181s : 4: substitution.inline 1.97% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.28% : 0.000006s : 4: substitution.remove_not_recompute_node 2.86% : 0.000008s : 4: substitution.replace_old_param 5.83% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005786 2 88.23% : 0.005105s : 1: type_inference.infer 11.77% : 0.000681s : 1: type_inference.specialize ------[replace.] 0.000071 8 60.90% : 0.000044s : 4: replace.inline 39.10% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000191 8 92.68% : 0.000177s : 4: match.inline 7.32% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000235 1278 1.11% : 0.000003s : 13: predicate.accumulaten_eliminater 1.23% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.70% : 0.000002s : 8: predicate.addn_check_dump 1.09% : 0.000003s : 13: predicate.addn_zero_filter 0.91% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.78% : 0.000007s : 21: predicate.arithmetic_simplify 0.94% : 0.000002s : 13: predicate.cast_eliminate 0.57% : 0.000001s : 8: predicate.check_bprop_eliminate 0.58% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.57% : 0.000001s : 8: predicate.depend_value_elim 0.82% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.96% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.81% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.87% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 4: predicate.elim_not_effective 0.35% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.02% : 0.000002s : 17: predicate.environ_add_const_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.16% : 0.000003s : 17: predicate.environ_get_depend_swap 2.07% : 0.000005s : 25: predicate.environ_get_eliminate 0.95% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.37% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.58% : 0.000006s : 21: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.79% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 4: predicate.fold_const_symbol 0.60% : 0.000001s : 8: predicate.get_grad_eliminate 0.24% : 0.000001s : 4: predicate.graph_param_transform 0.51% : 0.000001s : 8: predicate.incorporate_call 0.41% : 0.000001s : 8: predicate.incorporate_call_switch 5.99% : 0.000014s : 58: predicate.inline 0.97% : 0.000002s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.92% : 0.000002s : 8: predicate.less_batch_normalization 1.81% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.21% : 0.000005s : 38: predicate.load_eliminater 0.84% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.18% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.59% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 8: predicate.merge_addn 0.56% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.62% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.72% : 0.000002s : 13: predicate.minmaximum_grad 1.62% : 0.000004s : 4: predicate.mutable_eliminate 0.28% : 0.000001s : 4: predicate.opt_reshape 0.42% : 0.000001s : 4: predicate.parallel_virtual_node 2.24% : 0.000005s : 21: predicate.partial_defer_inline 1.44% : 0.000003s : 21: predicate.partial_eliminate 0.88% : 0.000002s : 13: predicate.print_const_string_wrapper 0.54% : 0.000001s : 8: predicate.reduce_all_const_elim 1.24% : 0.000003s : 13: predicate.reduce_eliminate 2.19% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 8: predicate.remove_not_recompute_node 1.48% : 0.000003s : 25: predicate.replace_applicator 0.57% : 0.000001s : 8: predicate.replace_old_param 0.42% : 0.000001s : 4: predicate.reset_defer_inline 1.07% : 0.000003s : 13: predicate.reshape_eliminate 0.56% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 4: predicate.row_tensor_eliminate 0.98% : 0.000002s : 8: predicate.same_eliminate 0.55% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.18% : 0.000003s : 8: predicate.shard_identity_eliminate 0.77% : 0.000002s : 8: predicate.special_op_eliminate 0.76% : 0.000002s : 8: predicate.specialize_transform 1.22% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.41% : 0.000003s : 21: predicate.switch_defer_inline 1.93% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.70% : 0.000011s : 67: predicate.switch_simplify 0.86% : 0.000002s : 13: predicate.tile_eliminate 0.88% : 0.000002s : 13: predicate.transpose_eliminate 1.45% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.45% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.41% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.33% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.78% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.12% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.76% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 4: predicate.value_based_eliminate 0.65% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.78% : 0.000002s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000568 11 52.45% : 0.000298s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.55% : 0.000270s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.142780 192 0.00% : 0.000006s : 1: ForceFp32Comm 2.51% : 0.003585s : 1: add_attr 2.50% : 0.003564s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.04% : 0.000057s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.05% : 0.000072s : 1: auto_monad 0.02% : 0.000029s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.34% : 0.000491s : 1: bootstrap 0.03% : 0.000042s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000020s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.02% : 0.000032s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000023s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.02% : 0.000029s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.35% : 0.000495s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 0.56% : 0.000801s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000022s : 1: opt.transform.mutable_eliminate 0.98% : 0.001399s : 78: opt.transform.opt_a 0.02% : 0.000030s : 1: opt.transform.opt_after_cconv 0.02% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.08% : 0.000108s : 28: opt.transform.opt_b 0.04% : 0.000053s : 2: opt.transform.opt_trans_graph 0.03% : 0.000039s : 4: opt.transform.symbol_engine_opt 41.98% : 0.059943s : 1: opt_a 0.10% : 0.000142s : 1: opt_after_cconv 0.44% : 0.000626s : 1: opt_after_jit_grad 0.21% : 0.000303s : 1: opt_b 44.18% : 0.063079s : 1: optimize 0.02% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.02% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000008s : 1: pipeline_split 0.04% : 0.000052s : 1: pre_auto_parallel 0.03% : 0.000042s : 1: py_interpret_to_execute 0.02% : 0.000028s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000021s : 1: remove_dup_value 0.35% : 0.000507s : 1: renormalize.infer 0.30% : 0.000424s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000054s : 1: rewriter_after_opt_a 0.07% : 0.000106s : 1: rewriter_before_opt_a 0.01% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000105s : 1: symbol_engine_optimizer 0.07% : 0.000098s : 1: tuple_transform 4.12% : 0.005884s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:38.931.041 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.170182, [21] [bootstrap]: 0.00046874 [type_inference]: 0.0431138 [event_method]: 2.058e-05 [auto_monad]: 6.341e-05 [graph_reusing]: 6.06e-06 [inline]: 2.76999e-06 [add_attr]: 0.00376606, [1] [add_attr_with_inline]: 0.00375283, [1] [Cycle 1]: 7.947e-05, [2] [tag_attr]: 2.408e-05 [meta_addattr_fg_expand]: 6.18998e-06 [parallel-infer-symbol]: 3.77002e-06 [pre_auto_parallel]: 4.593e-05 [insert-virtual-dataset]: 3.01001e-06 [parallel-infer-symbol-second]: 8.70001e-07 [dataset_repeat_opt]: 2.01e-06 [pipeline_split]: 1.74998e-06 [optimize]: 0.121713, [53] [py_interpret_to_execute]: 3.052e-05 [rewriter_before_opt_a]: 0.00010179 [opt_a]: 0.119292, [2] [Cycle 1]: 0.118518, [45] [expand_dump_flag]: 3.30998e-06 [switch_simplify]: 4.46e-05 [loop_unroll]: 3.014e-05 [a_1]: 0.00070656 [with_stream_mark]: 2.731e-05 [recompute_prepare]: 1.154e-05 [updatestate_depend_eliminate]: 4.81002e-06 [updatestate_assign_eliminate]: 3.26001e-06 [updatestate_loads_eliminate]: 3.2e-06 [parameter_eliminate]: 2.09999e-06 [a_2]: 9.507e-05 [accelerated_algorithm]: 8.84e-06 [shard]: 2.76999e-06 [meta_shard_fg_expand]: 2.55997e-06 [shard_inline]: 7.03998e-06 [merge_send_recv]: 9.75002e-06 [auto_parallel]: 1.112e-05 [parallel]: 2.216e-05 [flash_sp]: 1.134e-05 [merge_comm]: 3.98999e-06 [allreduce_fusion]: 3.55e-06 [matmul_add_comm_reduction]: 1.178e-05 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 9.80002e-06 [virtual_dataset]: 6.83e-06 [get_grad_eliminate_]: 6.91001e-06 [virtual_output]: 7.20998e-06 [merge_forward]: 4.52998e-06 [cell_reuse_recompute_pass]: 2.01998e-06 [offload_activation]: 1.057e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.544e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.128e-05 [set_forward_comm_id_for_comm_node_pass]: 4.45999e-06 [meta_fg_expand]: 2.93e-06 [flash_sp_send_recv_attached]: 3.13e-06 [receive_attached]: 2.56e-06 [after_resolve]: 1.852e-05 [a_after_grad]: 1.167e-05 [renormalize]: 0.116935 [add_forward_monad_depend]: 8.97e-06 [auto_monad_grad]: 2.63e-06 [auto_monad_eliminator]: 1.927e-05 [cse]: 3.103e-05 [a_3]: 6.034e-05 [Cycle 2]: 0.00076001, [45] [expand_dump_flag]: 2.49999e-06 [switch_simplify]: 9.05999e-06 [loop_unroll]: 7.54002e-06 [a_1]: 0.00016323 [with_stream_mark]: 1.999e-05 [recompute_prepare]: 7.16001e-06 [updatestate_depend_eliminate]: 3.8e-06 [updatestate_assign_eliminate]: 3.36001e-06 [updatestate_loads_eliminate]: 3.08998e-06 [parameter_eliminate]: 2.02999e-06 [a_2]: 8.055e-05 [accelerated_algorithm]: 7.15e-06 [shard]: 2.54999e-06 [meta_shard_fg_expand]: 2.43998e-06 [shard_inline]: 6.16998e-06 [merge_send_recv]: 9.38002e-06 [auto_parallel]: 9.96e-06 [parallel]: 9.89999e-06 [flash_sp]: 3.97e-06 [merge_comm]: 3.79002e-06 [allreduce_fusion]: 3.26001e-06 [matmul_add_comm_reduction]: 9.96998e-06 [allreduce_slice_to_reducescatter]: 8.00006e-07 [virtual_shard_identity]: 9.47999e-06 [virtual_dataset]: 6.30002e-06 [get_grad_eliminate_]: 6.31e-06 [virtual_output]: 6.38e-06 [merge_forward]: 4.3e-06 [cell_reuse_recompute_pass]: 3.13e-06 [offload_activation]: 1.062e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.244e-05 [merge_recompute_call_nodes]: 1.38002e-06 [before_grad]: 1.03e-05 [set_forward_comm_id_for_comm_node_pass]: 3.5e-06 [meta_fg_expand]: 2.64001e-06 [flash_sp_send_recv_attached]: 1.99e-06 [receive_attached]: 2.33002e-06 [after_resolve]: 1.272e-05 [a_after_grad]: 9.37999e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.81e-06 [auto_monad_grad]: 9.49978e-07 [auto_monad_eliminator]: 9.03002e-06 [cse]: 1.581e-05 [a_3]: 3.779e-05 [py_interpret_to_execute_after_opt_a]: 1.659e-05 [slice_cell_reuse_recomputed_activation]: 1.96e-06 [rewriter_after_opt_a]: 3.871e-05 [convert_after_rewriter]: 6.93e-06 [order_py_execute_after_rewriter]: 5.02999e-06 [mutable_eliminate]: 0.00073147 [opt_b]: 0.00021844, [1] [Cycle 1]: 0.00021091, [7] [b_1]: 0.00012868 [b_2]: 8.35001e-06 [updatestate_depend_eliminate]: 9.20001e-06 [updatestate_assign_eliminate]: 2.48998e-06 [updatestate_loads_eliminate]: 2.47001e-06 [renormalize]: 8.70001e-07 [cse]: 2.199e-05 [optimize_parallel_all_gather_comm]: 1.863e-05 [overlap_param_gather]: 2.22001e-06 [cconv]: 3.355e-05 [loop_unroll]: 0.00048133 [opt_after_cconv]: 0.00011054, [1] [Cycle 1]: 0.00010346, [7] [c_1]: 3.293e-05 [parameter_eliminate]: 3.88999e-06 [updatestate_depend_eliminate]: 6.05002e-06 [updatestate_assign_eliminate]: 2.54001e-06 [updatestate_loads_eliminate]: 2.36e-06 [cse]: 1.97e-05 [renormalize]: 7.89994e-07 [remove_dup_value]: 1.483e-05 [tuple_transform]: 8.17e-05, [1] [Cycle 1]: 7.708e-05, [4] [d_1]: 4.954e-05 [none_parameter_eliminate]: 1.54998e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 6.70998e-06 [partial_unused_args_eliminate]: 2.11e-06 [add_recomputation]: 5.378e-05 [cse_after_recomputation]: 2.24e-05, [1] [Cycle 1]: 1.753e-05, [1] [cse]: 1.186e-05 [environ_conv]: 5.52001e-06 [swap_dp_allreduce_reducescatter]: 5.29e-06 [bias_add_comm_swap]: 3.56999e-06 [label_micro_interleaved_index]: 5.37999e-06 [label_fine_grained_interleaved_index]: 3.26001e-06 [merge_cast_opt]: 1.79e-06 [slice_recompute_activation]: 2.55002e-06 [micro_interleaved_order_control]: 2.49001e-06 [assign_add_opt]: 1.36002e-06 [ForceFp32Comm]: 1.01002e-06 [remove_cast_before_assign_add]: 1.12e-06 [full_micro_interleaved_order_control]: 2.23998e-06 [reorder_send_recv_between_fp_bp]: 2.95002e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 1.00999e-06 [interleave_split_concat_branches]: 1.34998e-06 [interleave_parallel_branches]: 1.08001e-06 [overlap_opt_shard_in_pipeline]: 1.32e-06 [overlap_opt_shard_grad_in_pipeline]: 2.12999e-06 [control_data_broadcast_order]: 1.338e-05 [grouped_pairwise_exchange_alltoall]: 1.87001e-06 [offloading_packed_experts]: 4.28999e-06 [overlap_recompute_and_grad_model_parallel]: 5.04998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.30001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.45999e-06 [overlap_recompute_comm]: 2.59001e-06 [overlap_grad_ring_attention]: 4.12998e-06 [overlap_grad_flash_sp]: 2.308e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 2.18002e-06 [split_layernorm_comm]: 1.89e-06 [handle_group_info]: 9.60019e-07 [symbol_engine_optimizer]: 8.263e-05, [1] [Cycle 1]: 7.714e-05, [6] [build]: 3.78001e-06 [elim_shapecalc]: 1.213e-05 [elim_not_effective]: 1.432e-05 [opt_reshape]: 7.2e-06 [fold_const_symbol]: 9.99001e-06 [renormalize]: 2.50002e-07 [detach_backward]: 2.31e-06 [pipeline_parallel_scheduler]: 1.81e-06 [auto_monad_reorder]: 1.802e-05 [get_jit_bprop_graph]: 2.36998e-06 [rewriter_after_jit_bprop_graph]: 6.02999e-06 [opt_after_jit_grad]: 0.00071622 [validate]: 4.732e-05 Sums bootstrap : 0.000469s : 0.28% type_inference : 0.043114s : 26.08% event_method : 0.000021s : 0.01% auto_monad : 0.000063s : 0.04% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000046s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000031s : 0.02% optimize.rewriter_before_opt_a : 0.000102s : 0.06% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000054s : 0.03% optimize.opt_a.loop_unroll : 0.000038s : 0.02% optimize.opt_a.a_1 : 0.000870s : 0.53% optimize.opt_a.with_stream_mark : 0.000047s : 0.03% optimize.opt_a.recompute_prepare : 0.000019s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000176s : 0.11% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000019s : 0.01% optimize.opt_a.auto_parallel : 0.000021s : 0.01% optimize.opt_a.parallel : 0.000032s : 0.02% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000014s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000021s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000031s : 0.02% optimize.opt_a.a_after_grad : 0.000021s : 0.01% optimize.opt_a.renormalize : 0.116935s : 70.74% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.02% optimize.opt_a.cse : 0.000047s : 0.03% optimize.opt_a.a_3 : 0.000098s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000039s : 0.02% optimize.convert_after_rewriter : 0.000007s : 0.00% optimize.order_py_execute_after_rewriter : 0.000005s : 0.00% optimize.mutable_eliminate : 0.000731s : 0.44% optimize.opt_b.b_1 : 0.000129s : 0.08% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000034s : 0.02% optimize.loop_unroll : 0.000481s : 0.29% optimize.opt_after_cconv.c_1 : 0.000033s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.01% optimize.tuple_transform.d_1 : 0.000050s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000054s : 0.03% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000006s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000023s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000716s : 0.43% validate : 0.000047s : 0.03% Time group info: ------[substitution.] 0.000267 34 16.80% : 0.000045s : 6: substitution.arithmetic_simplify 0.86% : 0.000002s : 2: substitution.elim_not_effective 0.47% : 0.000001s : 2: substitution.fold_const_symbol 2.38% : 0.000006s : 4: substitution.graph_param_transform 66.55% : 0.000178s : 4: substitution.inline 1.67% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.66% : 0.000007s : 4: substitution.remove_not_recompute_node 2.47% : 0.000007s : 4: substitution.replace_old_param 6.14% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.043041 2 98.21% : 0.042271s : 1: type_inference.infer 1.79% : 0.000771s : 1: type_inference.specialize ------[replace.] 0.000067 8 62.36% : 0.000042s : 4: replace.inline 37.64% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000190 8 92.28% : 0.000175s : 4: match.inline 7.72% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000347 1278 0.55% : 0.000002s : 13: predicate.accumulaten_eliminater 0.59% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.43% : 0.000001s : 8: predicate.addn_check_dump 0.57% : 0.000002s : 13: predicate.addn_zero_filter 0.59% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.64% : 0.000006s : 21: predicate.arithmetic_simplify 0.70% : 0.000002s : 13: predicate.cast_eliminate 0.39% : 0.000001s : 8: predicate.check_bprop_eliminate 0.41% : 0.000001s : 8: predicate.compare_switch_simplify 0.11% : 0.000000s : 4: predicate.const_output_eliminate 0.45% : 0.000002s : 8: predicate.depend_value_elim 0.64% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.74% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.58% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.62% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.11% : 0.000000s : 4: predicate.elim_not_effective 0.33% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 0.79% : 0.000003s : 17: predicate.environ_add_const_eliminate 0.62% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.74% : 0.000003s : 17: predicate.environ_get_depend_swap 1.22% : 0.000004s : 25: predicate.environ_get_eliminate 0.65% : 0.000002s : 17: predicate.environ_get_set_eliminate 0.92% : 0.000003s : 21: predicate.exchange_switch_depend_value 1.50% : 0.000005s : 21: predicate.float_depend_g_call 0.33% : 0.000001s : 8: predicate.float_environ_get_switch 0.52% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.11% : 0.000000s : 4: predicate.fold_const_symbol 0.57% : 0.000002s : 8: predicate.get_grad_eliminate 0.13% : 0.000000s : 4: predicate.graph_param_transform 0.35% : 0.000001s : 8: predicate.incorporate_call 0.30% : 0.000001s : 8: predicate.incorporate_call_switch 4.01% : 0.000014s : 58: predicate.inline 0.45% : 0.000002s : 8: predicate.inline_without_move 0.20% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.61% : 0.000002s : 8: predicate.less_batch_normalization 1.08% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 1.51% : 0.000005s : 38: predicate.load_eliminater 0.65% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.50% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.12% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.34% : 0.000001s : 8: predicate.merge_addn 0.35% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.36% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.49% : 0.000002s : 13: predicate.minmaximum_grad 1.01% : 0.000004s : 4: predicate.mutable_eliminate 0.24% : 0.000001s : 4: predicate.opt_reshape 0.30% : 0.000001s : 4: predicate.parallel_virtual_node 1.16% : 0.000004s : 21: predicate.partial_defer_inline 0.97% : 0.000003s : 21: predicate.partial_eliminate 0.56% : 0.000002s : 13: predicate.print_const_string_wrapper 0.39% : 0.000001s : 8: predicate.reduce_all_const_elim 0.91% : 0.000003s : 13: predicate.reduce_eliminate 1.53% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 8: predicate.remove_not_recompute_node 0.88% : 0.000003s : 25: predicate.replace_applicator 0.51% : 0.000002s : 8: predicate.replace_old_param 0.29% : 0.000001s : 4: predicate.reset_defer_inline 0.62% : 0.000002s : 13: predicate.reshape_eliminate 0.40% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.28% : 0.000001s : 4: predicate.row_tensor_eliminate 0.53% : 0.000002s : 8: predicate.same_eliminate 0.24% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.52% : 0.000002s : 8: predicate.shard_identity_eliminate 34.84% : 0.000121s : 8: predicate.special_op_eliminate 0.39% : 0.000001s : 8: predicate.specialize_transform 0.64% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.57% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.23% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.93% : 0.000003s : 21: predicate.switch_defer_inline 1.36% : 0.000005s : 29: predicate.switch_layer_defer_inline 3.17% : 0.000011s : 67: predicate.switch_simplify 0.57% : 0.000002s : 13: predicate.tile_eliminate 0.60% : 0.000002s : 13: predicate.transpose_eliminate 0.95% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.22% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 0.94% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.07% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.10% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 1.54% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.12% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 1.59% : 0.000006s : 38: predicate.updatestate_pure_node_eliminater 1.91% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.28% : 0.000001s : 4: predicate.value_based_eliminate 0.42% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.49% : 0.000002s : 8: predicate.virtual_output_eliminate 0.15% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000727 11 58.79% : 0.000427s : 5: func_graph_cloner_run.FuncGraphClonerGraph 41.21% : 0.000300s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.298226 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.26% : 0.003772s : 1: add_attr 1.26% : 0.003757s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.02% : 0.000058s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.02% : 0.000069s : 1: auto_monad 0.01% : 0.000022s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.17% : 0.000504s : 1: bootstrap 0.01% : 0.000038s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000010s : 1: convert_after_rewriter 0.01% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000007s : 1: detach_backward 0.00% : 0.000009s : 1: environ_conv 0.01% : 0.000027s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000026s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.16% : 0.000490s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.25% : 0.000742s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000019s : 1: opt.transform.mutable_eliminate 0.45% : 0.001342s : 78: opt.transform.opt_a 0.01% : 0.000031s : 1: opt.transform.opt_after_cconv 0.05% : 0.000153s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000103s : 28: opt.transform.opt_b 0.02% : 0.000054s : 2: opt.transform.opt_trans_graph 0.01% : 0.000040s : 4: opt.transform.symbol_engine_opt 40.00% : 0.119295s : 1: opt_a 0.04% : 0.000114s : 1: opt_after_cconv 0.24% : 0.000729s : 1: opt_after_jit_grad 0.07% : 0.000222s : 1: opt_b 40.81% : 0.121720s : 1: optimize 0.01% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.01% : 0.000027s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000051s : 1: pre_auto_parallel 0.01% : 0.000035s : 1: py_interpret_to_execute 0.01% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 0.18% : 0.000545s : 1: renormalize.infer 0.14% : 0.000406s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000043s : 1: rewriter_after_opt_a 0.04% : 0.000107s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000085s : 1: symbol_engine_optimizer 0.03% : 0.000085s : 1: tuple_transform 14.46% : 0.043134s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:40.595.701 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:40.595.981 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0180344, [21] [bootstrap]: 0.00046278 [type_inference]: 0.00603813 [event_method]: 2.268e-05 [auto_monad]: 6.838e-05 [graph_reusing]: 7.18e-06 [inline]: 2.73e-06 [add_attr]: 0.00368855, [1] [add_attr_with_inline]: 0.00367603, [1] [Cycle 1]: 9.517e-05, [2] [tag_attr]: 2.502e-05 [meta_addattr_fg_expand]: 5.70001e-06 [parallel-infer-symbol]: 4.15999e-06 [pre_auto_parallel]: 4.412e-05 [insert-virtual-dataset]: 2.37999e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.09999e-06 [pipeline_split]: 1.99999e-06 [optimize]: 0.00631269, [53] [py_interpret_to_execute]: 3.707e-05 [rewriter_before_opt_a]: 0.00010133 [opt_a]: 0.00345564, [2] [Cycle 1]: 0.00254239, [45] [expand_dump_flag]: 3.36001e-06 [switch_simplify]: 4.47e-05 [loop_unroll]: 3.066e-05 [a_1]: 0.00069087 [with_stream_mark]: 2.225e-05 [recompute_prepare]: 1.026e-05 [updatestate_depend_eliminate]: 4.72e-06 [updatestate_assign_eliminate]: 3.38999e-06 [updatestate_loads_eliminate]: 3.23e-06 [parameter_eliminate]: 2.07001e-06 [a_2]: 0.00012098 [accelerated_algorithm]: 7.79002e-06 [shard]: 2.29999e-06 [meta_shard_fg_expand]: 2.32001e-06 [shard_inline]: 6.93998e-06 [merge_send_recv]: 8.90999e-06 [auto_parallel]: 8.69e-06 [parallel]: 2.08e-05 [flash_sp]: 1.017e-05 [merge_comm]: 4.34997e-06 [allreduce_fusion]: 3.61001e-06 [matmul_add_comm_reduction]: 9.49999e-06 [allreduce_slice_to_reducescatter]: 8.69972e-07 [virtual_shard_identity]: 8.72998e-06 [virtual_dataset]: 6.89001e-06 [get_grad_eliminate_]: 6.87002e-06 [virtual_output]: 6.69001e-06 [merge_forward]: 4.09002e-06 [cell_reuse_recompute_pass]: 1.35001e-06 [offload_activation]: 1.109e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.49e-05 [merge_recompute_call_nodes]: 1.60999e-06 [before_grad]: 1.174e-05 [set_forward_comm_id_for_comm_node_pass]: 3.97998e-06 [meta_fg_expand]: 2.96001e-06 [flash_sp_send_recv_attached]: 2.64001e-06 [receive_attached]: 2.43998e-06 [after_resolve]: 1.262e-05 [a_after_grad]: 1.002e-05 [renormalize]: 0.00087204 [add_forward_monad_depend]: 7.06001e-06 [auto_monad_grad]: 3.11999e-06 [auto_monad_eliminator]: 1.789e-05 [cse]: 3.097e-05 [a_3]: 6.74e-05 [Cycle 2]: 0.00089601, [45] [expand_dump_flag]: 2.44999e-06 [switch_simplify]: 8.47e-06 [loop_unroll]: 6.29001e-06 [a_1]: 0.00015057 [with_stream_mark]: 1.539e-05 [recompute_prepare]: 7.24001e-06 [updatestate_depend_eliminate]: 3.4e-06 [updatestate_assign_eliminate]: 3.50003e-06 [updatestate_loads_eliminate]: 2.22001e-06 [parameter_eliminate]: 1.58002e-06 [a_2]: 0.00010593 [accelerated_algorithm]: 6.46e-06 [shard]: 1.96e-06 [meta_shard_fg_expand]: 2.24001e-06 [shard_inline]: 6.89999e-06 [merge_send_recv]: 6.14999e-06 [auto_parallel]: 7.77e-06 [parallel]: 7.27002e-06 [flash_sp]: 3.97e-06 [merge_comm]: 3.94002e-06 [allreduce_fusion]: 3.48e-06 [matmul_add_comm_reduction]: 8.07e-06 [allreduce_slice_to_reducescatter]: 5.79981e-07 [virtual_shard_identity]: 6.81001e-06 [virtual_dataset]: 7.05998e-06 [get_grad_eliminate_]: 6.30997e-06 [virtual_output]: 5.81e-06 [merge_forward]: 3.79002e-06 [cell_reuse_recompute_pass]: 1.74e-06 [offload_activation]: 8.91002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.443e-05 [merge_recompute_call_nodes]: 1.07e-06 [before_grad]: 1.033e-05 [set_forward_comm_id_for_comm_node_pass]: 3.63e-06 [meta_fg_expand]: 2.19999e-06 [flash_sp_send_recv_attached]: 1.09e-06 [receive_attached]: 2.05002e-06 [after_resolve]: 1.165e-05 [a_after_grad]: 1.004e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.73002e-06 [auto_monad_grad]: 1.65001e-06 [auto_monad_eliminator]: 8.60001e-06 [cse]: 1.76e-05 [a_3]: 4.972e-05 [py_interpret_to_execute_after_opt_a]: 4.01e-05 [slice_cell_reuse_recomputed_activation]: 5.51e-06 [rewriter_after_opt_a]: 5.075e-05 [convert_after_rewriter]: 1.119e-05 [order_py_execute_after_rewriter]: 8.64003e-06 [mutable_eliminate]: 0.00082198 [opt_b]: 0.00030469, [1] [Cycle 1]: 0.00029272, [7] [b_1]: 0.0001744 [b_2]: 9.03002e-06 [updatestate_depend_eliminate]: 1.14e-05 [updatestate_assign_eliminate]: 3.61999e-06 [updatestate_loads_eliminate]: 3.16999e-06 [renormalize]: 6.69999e-07 [cse]: 2.911e-05 [optimize_parallel_all_gather_comm]: 2.441e-05 [overlap_param_gather]: 5.34e-06 [cconv]: 4.102e-05 [loop_unroll]: 0.00053615 [opt_after_cconv]: 0.00014125, [1] [Cycle 1]: 0.00013153, [7] [c_1]: 3.363e-05 [parameter_eliminate]: 6.28e-06 [updatestate_depend_eliminate]: 7.04001e-06 [updatestate_assign_eliminate]: 2.56e-06 [updatestate_loads_eliminate]: 2.54001e-06 [cse]: 2.224e-05 [renormalize]: 7.2e-07 [remove_dup_value]: 1.714e-05 [tuple_transform]: 0.0001008, [1] [Cycle 1]: 9.196e-05, [4] [d_1]: 5.056e-05 [none_parameter_eliminate]: 1.63002e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 7.45998e-06 [partial_unused_args_eliminate]: 4.48001e-06 [add_recomputation]: 5.983e-05 [cse_after_recomputation]: 2.922e-05, [1] [Cycle 1]: 2.186e-05, [1] [cse]: 1.224e-05 [environ_conv]: 9.05001e-06 [swap_dp_allreduce_reducescatter]: 7.63999e-06 [bias_add_comm_swap]: 5.54998e-06 [label_micro_interleaved_index]: 8.64e-06 [label_fine_grained_interleaved_index]: 5.35001e-06 [merge_cast_opt]: 3.74002e-06 [slice_recompute_activation]: 4.82e-06 [micro_interleaved_order_control]: 5.06997e-06 [assign_add_opt]: 3.7e-06 [ForceFp32Comm]: 3.38e-06 [remove_cast_before_assign_add]: 3.37002e-06 [full_micro_interleaved_order_control]: 4.92e-06 [reorder_send_recv_between_fp_bp]: 5.84e-06 [comm_op_add_attrs]: 3.97e-06 [add_comm_op_reuse_tag]: 3.39001e-06 [interleave_split_concat_branches]: 3.81001e-06 [interleave_parallel_branches]: 3.61999e-06 [overlap_opt_shard_in_pipeline]: 4.26001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.74e-06 [control_data_broadcast_order]: 1.782e-05 [grouped_pairwise_exchange_alltoall]: 4.29997e-06 [offloading_packed_experts]: 6.43e-06 [overlap_recompute_and_grad_model_parallel]: 7.35003e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.88001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.93999e-06 [overlap_recompute_comm]: 5.37001e-06 [overlap_grad_ring_attention]: 6.69999e-06 [overlap_grad_flash_sp]: 2.533e-05 [begin_end_overlap_inline]: 2.89999e-06 [split_matmul_comm_elemetwise]: 5.12e-06 [split_layernorm_comm]: 4.12e-06 [handle_group_info]: 3.24001e-06 [symbol_engine_optimizer]: 0.0001008, [1] [Cycle 1]: 9.34e-05, [6] [build]: 3.88999e-06 [elim_shapecalc]: 1.174e-05 [elim_not_effective]: 1.36e-05 [opt_reshape]: 7.78999e-06 [fold_const_symbol]: 1.018e-05 [renormalize]: 2.3999e-07 [detach_backward]: 3.58999e-06 [pipeline_parallel_scheduler]: 1.66e-06 [auto_monad_reorder]: 2.284e-05 [get_jit_bprop_graph]: 2.06e-06 [rewriter_after_jit_bprop_graph]: 6.52001e-06 [opt_after_jit_grad]: 0.000626 [validate]: 4.955e-05 Sums bootstrap : 0.000463s : 3.72% type_inference : 0.006038s : 48.48% event_method : 0.000023s : 0.18% auto_monad : 0.000068s : 0.55% graph_reusing : 0.000007s : 0.06% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000044s : 0.35% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000037s : 0.30% optimize.rewriter_before_opt_a : 0.000101s : 0.81% optimize.opt_a.expand_dump_flag : 0.000006s : 0.05% optimize.opt_a.switch_simplify : 0.000053s : 0.43% optimize.opt_a.loop_unroll : 0.000037s : 0.30% optimize.opt_a.a_1 : 0.000841s : 6.76% optimize.opt_a.with_stream_mark : 0.000038s : 0.30% optimize.opt_a.recompute_prepare : 0.000018s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000227s : 1.82% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.11% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000014s : 0.11% optimize.opt_a.merge_send_recv : 0.000015s : 0.12% optimize.opt_a.auto_parallel : 0.000016s : 0.13% optimize.opt_a.parallel : 0.000028s : 0.23% optimize.opt_a.flash_sp : 0.000014s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.12% optimize.opt_a.virtual_dataset : 0.000014s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.11% optimize.opt_a.virtual_output : 0.000013s : 0.10% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.24% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000024s : 0.19% optimize.opt_a.a_after_grad : 0.000020s : 0.16% optimize.opt_a.renormalize : 0.000872s : 7.00% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.21% optimize.opt_a.cse : 0.000049s : 0.39% optimize.opt_a.a_3 : 0.000117s : 0.94% optimize.py_interpret_to_execute_after_opt_a : 0.000040s : 0.32% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.04% optimize.rewriter_after_opt_a : 0.000051s : 0.41% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000822s : 6.60% optimize.opt_b.b_1 : 0.000174s : 1.40% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.09% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000029s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000041s : 0.33% optimize.loop_unroll : 0.000536s : 4.30% optimize.opt_after_cconv.c_1 : 0.000034s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000022s : 0.18% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000017s : 0.14% optimize.tuple_transform.d_1 : 0.000051s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000060s : 0.48% optimize.cse_after_recomputation.cse : 0.000012s : 0.10% optimize.environ_conv : 0.000009s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.06% optimize.bias_add_comm_swap : 0.000006s : 0.04% optimize.label_micro_interleaved_index : 0.000009s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000006s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000025s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000023s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000007s : 0.05% opt_after_jit_grad : 0.000626s : 5.03% validate : 0.000050s : 0.40% Time group info: ------[substitution.] 0.000252 34 15.43% : 0.000039s : 6: substitution.arithmetic_simplify 0.71% : 0.000002s : 2: substitution.elim_not_effective 0.56% : 0.000001s : 2: substitution.fold_const_symbol 2.63% : 0.000007s : 4: substitution.graph_param_transform 68.37% : 0.000172s : 4: substitution.inline 1.90% : 0.000005s : 4: substitution.j_node_and_user_rematch 1.99% : 0.000005s : 4: substitution.remove_not_recompute_node 2.28% : 0.000006s : 4: substitution.replace_old_param 6.13% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005957 2 86.41% : 0.005147s : 1: type_inference.infer 13.59% : 0.000809s : 1: type_inference.specialize ------[replace.] 0.000068 8 63.81% : 0.000044s : 4: replace.inline 36.19% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000183 8 92.61% : 0.000170s : 4: match.inline 7.39% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000220 1278 0.83% : 0.000002s : 13: predicate.accumulaten_eliminater 1.09% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 0.88% : 0.000002s : 13: predicate.addn_zero_filter 0.76% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.28% : 0.000005s : 21: predicate.arithmetic_simplify 0.92% : 0.000002s : 13: predicate.cast_eliminate 0.61% : 0.000001s : 8: predicate.check_bprop_eliminate 0.54% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.74% : 0.000002s : 8: predicate.depend_value_elim 0.84% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.00% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.81% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.04% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.52% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.11% : 0.000002s : 17: predicate.environ_get_depend_swap 1.58% : 0.000003s : 25: predicate.environ_get_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.31% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.56% : 0.000006s : 21: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.90% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.74% : 0.000002s : 8: predicate.get_grad_eliminate 0.30% : 0.000001s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.44% : 0.000001s : 8: predicate.incorporate_call_switch 6.66% : 0.000015s : 58: predicate.inline 0.69% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.82% : 0.000002s : 8: predicate.less_batch_normalization 1.83% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.47% : 0.000005s : 38: predicate.load_eliminater 1.34% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.18% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.57% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 8: predicate.merge_addn 0.67% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 13: predicate.minmaximum_grad 1.56% : 0.000003s : 4: predicate.mutable_eliminate 0.39% : 0.000001s : 4: predicate.opt_reshape 0.55% : 0.000001s : 4: predicate.parallel_virtual_node 1.62% : 0.000004s : 21: predicate.partial_defer_inline 1.55% : 0.000003s : 21: predicate.partial_eliminate 0.97% : 0.000002s : 13: predicate.print_const_string_wrapper 0.64% : 0.000001s : 8: predicate.reduce_all_const_elim 1.10% : 0.000002s : 13: predicate.reduce_eliminate 2.49% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.53% : 0.000001s : 8: predicate.remove_not_recompute_node 1.38% : 0.000003s : 25: predicate.replace_applicator 0.64% : 0.000001s : 8: predicate.replace_old_param 0.53% : 0.000001s : 4: predicate.reset_defer_inline 0.84% : 0.000002s : 13: predicate.reshape_eliminate 0.58% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 4: predicate.row_tensor_eliminate 0.78% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 8: predicate.shard_identity_eliminate 0.73% : 0.000002s : 8: predicate.special_op_eliminate 0.73% : 0.000002s : 8: predicate.specialize_transform 0.77% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.48% : 0.000003s : 21: predicate.switch_defer_inline 1.92% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.09% : 0.000011s : 67: predicate.switch_simplify 0.91% : 0.000002s : 13: predicate.tile_eliminate 0.96% : 0.000002s : 13: predicate.transpose_eliminate 1.48% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.08% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.33% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.02% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.62% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.28% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.07% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 4: predicate.value_based_eliminate 0.85% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.50% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000601 11 51.53% : 0.000310s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.47% : 0.000291s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030350 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.20% : 0.003701s : 1: add_attr 12.13% : 0.003680s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.21% : 0.000063s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000077s : 1: auto_monad 0.10% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.67% : 0.000507s : 1: bootstrap 0.15% : 0.000044s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000015s : 1: convert_after_rewriter 0.11% : 0.000033s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000022s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000035s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000014s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000012s : 1: label_micro_interleaved_index 1.79% : 0.000543s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.74% : 0.000832s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000024s : 1: opt.transform.mutable_eliminate 4.26% : 0.001292s : 78: opt.transform.opt_a 0.11% : 0.000032s : 1: opt.transform.opt_after_cconv 0.10% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.36% : 0.000108s : 28: opt.transform.opt_b 0.18% : 0.000056s : 2: opt.transform.opt_trans_graph 0.13% : 0.000039s : 4: opt.transform.symbol_engine_opt 11.40% : 0.003459s : 1: opt_a 0.48% : 0.000145s : 1: opt_after_cconv 2.11% : 0.000640s : 1: opt_after_jit_grad 1.02% : 0.000309s : 1: opt_b 22.08% : 0.006700s : 1: optimize 0.09% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.09% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000008s : 1: pipeline_split 0.17% : 0.000052s : 1: pre_auto_parallel 0.13% : 0.000041s : 1: py_interpret_to_execute 0.15% : 0.000044s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.56% : 0.000475s : 1: renormalize.infer 1.28% : 0.000387s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000054s : 1: rewriter_after_opt_a 0.35% : 0.000105s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000104s : 1: symbol_engine_optimizer 0.34% : 0.000104s : 1: tuple_transform 20.06% : 0.006087s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:41.798.697 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0174008, [21] [bootstrap]: 0.0004638 [type_inference]: 0.00680671 [event_method]: 2.186e-05 [auto_monad]: 7.047e-05 [graph_reusing]: 5.89e-06 [inline]: 2.96999e-06 [add_attr]: 0.00369894, [1] [add_attr_with_inline]: 0.00368518, [1] [Cycle 1]: 7.877e-05, [2] [tag_attr]: 2.496e-05 [meta_addattr_fg_expand]: 6.34999e-06 [parallel-infer-symbol]: 3.6e-06 [pre_auto_parallel]: 4.233e-05 [insert-virtual-dataset]: 2.64999e-06 [parallel-infer-symbol-second]: 8.39995e-07 [dataset_repeat_opt]: 2.40002e-06 [pipeline_split]: 2.10002e-06 [optimize]: 0.00554386, [53] [py_interpret_to_execute]: 3.293e-05 [rewriter_before_opt_a]: 9.611e-05 [opt_a]: 0.00320596, [2] [Cycle 1]: 0.00242558, [45] [expand_dump_flag]: 2.89999e-06 [switch_simplify]: 4.44e-05 [loop_unroll]: 3.179e-05 [a_1]: 0.00072404 [with_stream_mark]: 2.438e-05 [recompute_prepare]: 1.4e-05 [updatestate_depend_eliminate]: 5.56e-06 [updatestate_assign_eliminate]: 3.33e-06 [updatestate_loads_eliminate]: 3.48e-06 [parameter_eliminate]: 2.10002e-06 [a_2]: 0.00010139 [accelerated_algorithm]: 9.09e-06 [shard]: 2.02999e-06 [meta_shard_fg_expand]: 2.89001e-06 [shard_inline]: 7.1e-06 [merge_send_recv]: 1.013e-05 [auto_parallel]: 9.47001e-06 [parallel]: 2.011e-05 [flash_sp]: 1.04e-05 [merge_comm]: 3.9e-06 [allreduce_fusion]: 3.62002e-06 [matmul_add_comm_reduction]: 1.08e-05 [allreduce_slice_to_reducescatter]: 6.79982e-07 [virtual_shard_identity]: 9.62001e-06 [virtual_dataset]: 7.33e-06 [get_grad_eliminate_]: 6.16e-06 [virtual_output]: 7.65998e-06 [merge_forward]: 3.85998e-06 [cell_reuse_recompute_pass]: 1.67999e-06 [offload_activation]: 1.06e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.426e-05 [merge_recompute_call_nodes]: 1.53002e-06 [before_grad]: 1.293e-05 [set_forward_comm_id_for_comm_node_pass]: 5.01002e-06 [meta_fg_expand]: 3.23e-06 [flash_sp_send_recv_attached]: 2.99001e-06 [receive_attached]: 2.56e-06 [after_resolve]: 1.391e-05 [a_after_grad]: 1.17e-05 [renormalize]: 0.00085363 [add_forward_monad_depend]: 7.56999e-06 [auto_monad_grad]: 2.86999e-06 [auto_monad_eliminator]: 2.013e-05 [cse]: 2.97e-05 [a_3]: 5.621e-05 [Cycle 2]: 0.00076769, [45] [expand_dump_flag]: 2.41e-06 [switch_simplify]: 9.91e-06 [loop_unroll]: 6.45002e-06 [a_1]: 0.00015234 [with_stream_mark]: 1.806e-05 [recompute_prepare]: 7.60998e-06 [updatestate_depend_eliminate]: 3.60998e-06 [updatestate_assign_eliminate]: 2.78e-06 [updatestate_loads_eliminate]: 1.928e-05 [parameter_eliminate]: 1.71e-06 [a_2]: 8.698e-05 [accelerated_algorithm]: 7.82002e-06 [shard]: 1.52999e-06 [meta_shard_fg_expand]: 2.00002e-06 [shard_inline]: 7.25e-06 [merge_send_recv]: 8.32e-06 [auto_parallel]: 8.17e-06 [parallel]: 7.15003e-06 [flash_sp]: 3.98999e-06 [merge_comm]: 3.49001e-06 [allreduce_fusion]: 3.52002e-06 [matmul_add_comm_reduction]: 7.71001e-06 [allreduce_slice_to_reducescatter]: 5.40022e-07 [virtual_shard_identity]: 7.15e-06 [virtual_dataset]: 6.09001e-06 [get_grad_eliminate_]: 6.46999e-06 [virtual_output]: 5.86e-06 [merge_forward]: 3.71001e-06 [cell_reuse_recompute_pass]: 2.29001e-06 [offload_activation]: 9.82999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.225e-05 [merge_recompute_call_nodes]: 1.48002e-06 [before_grad]: 1.011e-05 [set_forward_comm_id_for_comm_node_pass]: 3.73001e-06 [meta_fg_expand]: 2.07001e-06 [flash_sp_send_recv_attached]: 1.28002e-06 [receive_attached]: 1.91e-06 [after_resolve]: 1.164e-05 [a_after_grad]: 1.163e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.17999e-06 [auto_monad_grad]: 1.97001e-06 [auto_monad_eliminator]: 9.58002e-06 [cse]: 1.822e-05 [a_3]: 3.77e-05 [py_interpret_to_execute_after_opt_a]: 1.357e-05 [slice_cell_reuse_recomputed_activation]: 2.14e-06 [rewriter_after_opt_a]: 3.901e-05 [convert_after_rewriter]: 7.28e-06 [order_py_execute_after_rewriter]: 4.95999e-06 [mutable_eliminate]: 0.00069384 [opt_b]: 0.00022357, [1] [Cycle 1]: 0.00021638, [7] [b_1]: 0.00013235 [b_2]: 8.89e-06 [updatestate_depend_eliminate]: 8.16002e-06 [updatestate_assign_eliminate]: 3.12002e-06 [updatestate_loads_eliminate]: 2.49001e-06 [renormalize]: 1.02e-06 [cse]: 2.234e-05 [optimize_parallel_all_gather_comm]: 1.913e-05 [overlap_param_gather]: 2.02999e-06 [cconv]: 3.412e-05 [loop_unroll]: 0.00048306 [opt_after_cconv]: 0.00011229, [1] [Cycle 1]: 0.00010613, [7] [c_1]: 3.482e-05 [parameter_eliminate]: 4.18001e-06 [updatestate_depend_eliminate]: 7.19001e-06 [updatestate_assign_eliminate]: 2.59999e-06 [updatestate_loads_eliminate]: 2.32001e-06 [cse]: 1.873e-05 [renormalize]: 5.3001e-07 [remove_dup_value]: 1.415e-05 [tuple_transform]: 8.171e-05, [1] [Cycle 1]: 7.663e-05, [4] [d_1]: 4.852e-05 [none_parameter_eliminate]: 1.69e-06 [renormalize]: 1.79978e-07 [switch_simplify]: 7.16999e-06 [partial_unused_args_eliminate]: 1.94999e-06 [add_recomputation]: 4.985e-05 [cse_after_recomputation]: 2.133e-05, [1] [Cycle 1]: 1.678e-05, [1] [cse]: 1.137e-05 [environ_conv]: 5.17e-06 [swap_dp_allreduce_reducescatter]: 4.96002e-06 [bias_add_comm_swap]: 2.63998e-06 [label_micro_interleaved_index]: 4.62e-06 [label_fine_grained_interleaved_index]: 3.06999e-06 [merge_cast_opt]: 1.42999e-06 [slice_recompute_activation]: 2.12999e-06 [micro_interleaved_order_control]: 2.39001e-06 [assign_add_opt]: 1.17e-06 [ForceFp32Comm]: 1.05001e-06 [remove_cast_before_assign_add]: 1.17e-06 [full_micro_interleaved_order_control]: 2.11e-06 [reorder_send_recv_between_fp_bp]: 2.61999e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 9.29984e-07 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.27e-06 [overlap_opt_shard_in_pipeline]: 1.40999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.89e-06 [control_data_broadcast_order]: 1.266e-05 [grouped_pairwise_exchange_alltoall]: 1.97999e-06 [offloading_packed_experts]: 3.86999e-06 [overlap_recompute_and_grad_model_parallel]: 5.05001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.13001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32999e-06 [overlap_recompute_comm]: 2.82002e-06 [overlap_grad_ring_attention]: 4.06001e-06 [overlap_grad_flash_sp]: 2.1e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.22999e-06 [split_layernorm_comm]: 1.54e-06 [handle_group_info]: 9.79984e-07 [symbol_engine_optimizer]: 7.938e-05, [1] [Cycle 1]: 7.497e-05, [6] [build]: 3.61001e-06 [elim_shapecalc]: 1.133e-05 [elim_not_effective]: 1.324e-05 [opt_reshape]: 7.16999e-06 [fold_const_symbol]: 1.023e-05 [renormalize]: 2.00002e-07 [detach_backward]: 2.07999e-06 [pipeline_parallel_scheduler]: 1.59e-06 [auto_monad_reorder]: 1.677e-05 [get_jit_bprop_graph]: 2.15002e-06 [rewriter_after_jit_bprop_graph]: 4.70999e-06 [opt_after_jit_grad]: 0.00049489 [validate]: 4.235e-05 Sums bootstrap : 0.000464s : 3.67% type_inference : 0.006807s : 53.80% event_method : 0.000022s : 0.17% auto_monad : 0.000070s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000042s : 0.33% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000033s : 0.26% optimize.rewriter_before_opt_a : 0.000096s : 0.76% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000054s : 0.43% optimize.opt_a.loop_unroll : 0.000038s : 0.30% optimize.opt_a.a_1 : 0.000876s : 6.93% optimize.opt_a.with_stream_mark : 0.000042s : 0.34% optimize.opt_a.recompute_prepare : 0.000022s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000023s : 0.18% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000188s : 1.49% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000014s : 0.11% optimize.opt_a.merge_send_recv : 0.000018s : 0.15% optimize.opt_a.auto_parallel : 0.000018s : 0.14% optimize.opt_a.parallel : 0.000027s : 0.22% optimize.opt_a.flash_sp : 0.000014s : 0.11% optimize.opt_a.merge_comm : 0.000007s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.13% optimize.opt_a.virtual_dataset : 0.000013s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.10% optimize.opt_a.virtual_output : 0.000014s : 0.11% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.21% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000023s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000026s : 0.20% optimize.opt_a.a_after_grad : 0.000023s : 0.18% optimize.opt_a.renormalize : 0.000854s : 6.75% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.08% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.23% optimize.opt_a.cse : 0.000048s : 0.38% optimize.opt_a.a_3 : 0.000094s : 0.74% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000039s : 0.31% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.04% optimize.mutable_eliminate : 0.000694s : 5.48% optimize.opt_b.b_1 : 0.000132s : 1.05% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000022s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000034s : 0.27% optimize.loop_unroll : 0.000483s : 3.82% optimize.opt_after_cconv.c_1 : 0.000035s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.15% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000014s : 0.11% optimize.tuple_transform.d_1 : 0.000049s : 0.38% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000050s : 0.39% optimize.cse_after_recomputation.cse : 0.000011s : 0.09% optimize.environ_conv : 0.000005s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.04% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000013s : 0.10% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.03% optimize.overlap_grad_flash_sp : 0.000021s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.10% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000017s : 0.13% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000495s : 3.91% validate : 0.000042s : 0.33% Time group info: ------[substitution.] 0.000259 34 15.71% : 0.000041s : 6: substitution.arithmetic_simplify 0.73% : 0.000002s : 2: substitution.elim_not_effective 0.57% : 0.000001s : 2: substitution.fold_const_symbol 2.56% : 0.000007s : 4: substitution.graph_param_transform 67.51% : 0.000175s : 4: substitution.inline 2.02% : 0.000005s : 4: substitution.j_node_and_user_rematch 1.96% : 0.000005s : 4: substitution.remove_not_recompute_node 2.51% : 0.000007s : 4: substitution.replace_old_param 6.44% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006729 2 85.45% : 0.005750s : 1: type_inference.infer 14.55% : 0.000979s : 1: type_inference.specialize ------[replace.] 0.000070 8 61.71% : 0.000043s : 4: replace.inline 38.29% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000186 8 92.11% : 0.000172s : 4: match.inline 7.89% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000222 1278 0.90% : 0.000002s : 13: predicate.accumulaten_eliminater 0.68% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.43% : 0.000001s : 8: predicate.addn_check_dump 0.95% : 0.000002s : 13: predicate.addn_zero_filter 0.77% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.82% : 0.000006s : 21: predicate.arithmetic_simplify 0.79% : 0.000002s : 13: predicate.cast_eliminate 0.55% : 0.000001s : 8: predicate.check_bprop_eliminate 0.42% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.56% : 0.000001s : 8: predicate.depend_value_elim 0.88% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.05% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.36% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.15% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_depend_swap 1.56% : 0.000003s : 25: predicate.environ_get_eliminate 1.22% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.46% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.57% : 0.000006s : 21: predicate.float_depend_g_call 0.47% : 0.000001s : 8: predicate.float_environ_get_switch 0.78% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.56% : 0.000001s : 8: predicate.get_grad_eliminate 0.22% : 0.000000s : 4: predicate.graph_param_transform 0.61% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.22% : 0.000014s : 58: predicate.inline 0.81% : 0.000002s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.94% : 0.000002s : 8: predicate.less_batch_normalization 1.89% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.40% : 0.000005s : 38: predicate.load_eliminater 0.92% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.25% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.91% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 8: predicate.merge_addn 0.72% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.63% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 13: predicate.minmaximum_grad 1.29% : 0.000003s : 4: predicate.mutable_eliminate 0.42% : 0.000001s : 4: predicate.opt_reshape 0.48% : 0.000001s : 4: predicate.parallel_virtual_node 2.02% : 0.000004s : 21: predicate.partial_defer_inline 1.63% : 0.000004s : 21: predicate.partial_eliminate 0.98% : 0.000002s : 13: predicate.print_const_string_wrapper 0.65% : 0.000001s : 8: predicate.reduce_all_const_elim 1.11% : 0.000002s : 13: predicate.reduce_eliminate 2.47% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.59% : 0.000001s : 8: predicate.remove_not_recompute_node 1.51% : 0.000003s : 25: predicate.replace_applicator 0.56% : 0.000001s : 8: predicate.replace_old_param 0.36% : 0.000001s : 4: predicate.reset_defer_inline 1.00% : 0.000002s : 13: predicate.reshape_eliminate 0.53% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 4: predicate.row_tensor_eliminate 0.82% : 0.000002s : 8: predicate.same_eliminate 0.41% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.85% : 0.000002s : 8: predicate.shard_identity_eliminate 0.62% : 0.000001s : 8: predicate.special_op_eliminate 0.88% : 0.000002s : 8: predicate.specialize_transform 0.95% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.49% : 0.000003s : 21: predicate.switch_defer_inline 1.90% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.04% : 0.000011s : 67: predicate.switch_simplify 0.96% : 0.000002s : 13: predicate.tile_eliminate 0.85% : 0.000002s : 13: predicate.transpose_eliminate 1.47% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.39% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.43% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.50% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.92% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.26% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.95% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 4: predicate.value_based_eliminate 0.56% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 8: predicate.virtual_output_eliminate 0.30% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000652 11 52.53% : 0.000342s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.47% : 0.000309s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028992 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.78% : 0.003706s : 1: add_attr 12.73% : 0.003690s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.19% : 0.000054s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.26% : 0.000076s : 1: auto_monad 0.07% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.71% : 0.000496s : 1: bootstrap 0.13% : 0.000038s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.05% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.08% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.10% : 0.000029s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.69% : 0.000491s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.43% : 0.000704s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000021s : 1: opt.transform.mutable_eliminate 4.65% : 0.001347s : 78: opt.transform.opt_a 0.11% : 0.000033s : 1: opt.transform.opt_after_cconv 0.09% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.37% : 0.000106s : 28: opt.transform.opt_b 0.18% : 0.000053s : 2: opt.transform.opt_trans_graph 0.13% : 0.000038s : 4: opt.transform.symbol_engine_opt 11.07% : 0.003210s : 1: opt_a 0.40% : 0.000116s : 1: opt_after_cconv 1.74% : 0.000505s : 1: opt_after_jit_grad 0.78% : 0.000228s : 1: opt_b 19.14% : 0.005549s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.08% : 0.000024s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.16% : 0.000047s : 1: pre_auto_parallel 0.13% : 0.000037s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000018s : 1: remove_dup_value 1.65% : 0.000477s : 1: renormalize.infer 1.26% : 0.000365s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000043s : 1: rewriter_after_opt_a 0.35% : 0.000101s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.28% : 0.000082s : 1: symbol_engine_optimizer 0.29% : 0.000085s : 1: tuple_transform 23.56% : 0.006831s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:42.907.101 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:42.907.376 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.12865, [21] [bootstrap]: 0.00046758 [type_inference]: 0.115226 [event_method]: 2.222e-05 [auto_monad]: 7.102e-05 [graph_reusing]: 6.48e-06 [inline]: 2.65002e-06 [add_attr]: 0.00378327, [1] [add_attr_with_inline]: 0.00376957, [1] [Cycle 1]: 0.00010241, [2] [tag_attr]: 2.426e-05 [meta_addattr_fg_expand]: 6.12999e-06 [parallel-infer-symbol]: 3.5e-06 [pre_auto_parallel]: 4.185e-05 [insert-virtual-dataset]: 2.53e-06 [parallel-infer-symbol-second]: 1.25999e-06 [dataset_repeat_opt]: 1.94e-06 [pipeline_split]: 1.77001e-06 [optimize]: 0.00736325, [53] [py_interpret_to_execute]: 3.625e-05 [rewriter_before_opt_a]: 0.00010025 [opt_a]: 0.00399008, [2] [Cycle 1]: 0.00287479, [45] [expand_dump_flag]: 3.05002e-06 [switch_simplify]: 4.322e-05 [loop_unroll]: 3.058e-05 [a_1]: 0.00076742 [with_stream_mark]: 2.595e-05 [recompute_prepare]: 1.275e-05 [updatestate_depend_eliminate]: 4.95001e-06 [updatestate_assign_eliminate]: 4.26001e-06 [updatestate_loads_eliminate]: 3.56999e-06 [parameter_eliminate]: 2.54999e-06 [a_2]: 0.00012252 [accelerated_algorithm]: 8.82e-06 [shard]: 2.93998e-06 [meta_shard_fg_expand]: 2.21e-06 [shard_inline]: 7.82e-06 [merge_send_recv]: 1.108e-05 [auto_parallel]: 1.192e-05 [parallel]: 2.288e-05 [flash_sp]: 1.252e-05 [merge_comm]: 4.92999e-06 [allreduce_fusion]: 3.56001e-06 [matmul_add_comm_reduction]: 1.212e-05 [allreduce_slice_to_reducescatter]: 1.47001e-06 [virtual_shard_identity]: 1.379e-05 [virtual_dataset]: 7.61999e-06 [get_grad_eliminate_]: 7.03e-06 [virtual_output]: 6.98e-06 [merge_forward]: 5.70001e-06 [cell_reuse_recompute_pass]: 2.32001e-06 [offload_activation]: 1.155e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.168e-05 [merge_recompute_call_nodes]: 1.97999e-06 [before_grad]: 1.344e-05 [set_forward_comm_id_for_comm_node_pass]: 4.84e-06 [meta_fg_expand]: 3.11999e-06 [flash_sp_send_recv_attached]: 3.6e-06 [receive_attached]: 2.62001e-06 [after_resolve]: 2.033e-05 [a_after_grad]: 1.291e-05 [renormalize]: 0.00099381 [add_forward_monad_depend]: 1.003e-05 [auto_monad_grad]: 2.37999e-06 [auto_monad_eliminator]: 2.591e-05 [cse]: 3.235e-05 [a_3]: 7.483e-05 [Cycle 2]: 0.00109332, [45] [expand_dump_flag]: 2.90998e-06 [switch_simplify]: 9.00999e-06 [loop_unroll]: 7.16001e-06 [a_1]: 0.0001635 [with_stream_mark]: 2.333e-05 [recompute_prepare]: 8.77999e-06 [updatestate_depend_eliminate]: 4.84e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 3.46999e-06 [parameter_eliminate]: 2.59999e-06 [a_2]: 0.0001187 [accelerated_algorithm]: 8.92e-06 [shard]: 2.98e-06 [meta_shard_fg_expand]: 2.66e-06 [shard_inline]: 8.25e-06 [merge_send_recv]: 1.11e-05 [auto_parallel]: 1.13e-05 [parallel]: 9.97999e-06 [flash_sp]: 4.68001e-06 [merge_comm]: 5.13002e-06 [allreduce_fusion]: 4.33001e-06 [matmul_add_comm_reduction]: 1.133e-05 [allreduce_slice_to_reducescatter]: 9.09989e-07 [virtual_shard_identity]: 1.222e-05 [virtual_dataset]: 7.34002e-06 [get_grad_eliminate_]: 6.76e-06 [virtual_output]: 6.79001e-06 [merge_forward]: 5.46e-06 [cell_reuse_recompute_pass]: 3.6e-06 [offload_activation]: 1.17e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.141e-05 [merge_recompute_call_nodes]: 1.59e-06 [before_grad]: 1.433e-05 [set_forward_comm_id_for_comm_node_pass]: 5.59e-06 [meta_fg_expand]: 3.00002e-06 [flash_sp_send_recv_attached]: 2.10002e-06 [receive_attached]: 2.52001e-06 [after_resolve]: 1.904e-05 [a_after_grad]: 1.26e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 5.77001e-06 [auto_monad_grad]: 3.5e-06 [auto_monad_eliminator]: 1.743e-05 [cse]: 2.857e-05 [a_3]: 5.691e-05 [py_interpret_to_execute_after_opt_a]: 2.556e-05 [slice_cell_reuse_recomputed_activation]: 5.12e-06 [rewriter_after_opt_a]: 5.821e-05 [convert_after_rewriter]: 1.27e-05 [order_py_execute_after_rewriter]: 8.20999e-06 [mutable_eliminate]: 0.00090206 [opt_b]: 0.00034969, [1] [Cycle 1]: 0.00033526, [7] [b_1]: 0.00019244 [b_2]: 1.101e-05 [updatestate_depend_eliminate]: 1.38e-05 [updatestate_assign_eliminate]: 3.48e-06 [updatestate_loads_eliminate]: 3.64002e-06 [renormalize]: 1.05999e-06 [cse]: 3.707e-05 [optimize_parallel_all_gather_comm]: 3.063e-05 [overlap_param_gather]: 6.21998e-06 [cconv]: 4.625e-05 [loop_unroll]: 0.00072799 [opt_after_cconv]: 0.00017679, [1] [Cycle 1]: 0.00016357, [7] [c_1]: 3.816e-05 [parameter_eliminate]: 6.78e-06 [updatestate_depend_eliminate]: 1.295e-05 [updatestate_assign_eliminate]: 3.3e-06 [updatestate_loads_eliminate]: 3.45e-06 [cse]: 3.346e-05 [renormalize]: 8.2e-07 [remove_dup_value]: 2.008e-05 [tuple_transform]: 0.00011904, [1] [Cycle 1]: 0.00010977, [4] [d_1]: 6.134e-05 [none_parameter_eliminate]: 2.53e-06 [renormalize]: 4.19997e-07 [switch_simplify]: 8.50999e-06 [partial_unused_args_eliminate]: 5.54e-06 [add_recomputation]: 6.882e-05 [cse_after_recomputation]: 3.822e-05, [1] [Cycle 1]: 2.976e-05, [1] [cse]: 1.831e-05 [environ_conv]: 1.015e-05 [swap_dp_allreduce_reducescatter]: 9.05999e-06 [bias_add_comm_swap]: 6.22001e-06 [label_micro_interleaved_index]: 1.202e-05 [label_fine_grained_interleaved_index]: 5.27001e-06 [merge_cast_opt]: 4.22998e-06 [slice_recompute_activation]: 4.50001e-06 [micro_interleaved_order_control]: 5.37001e-06 [assign_add_opt]: 4.15e-06 [ForceFp32Comm]: 3.4e-06 [remove_cast_before_assign_add]: 3.85e-06 [full_micro_interleaved_order_control]: 5.05999e-06 [reorder_send_recv_between_fp_bp]: 5.77001e-06 [comm_op_add_attrs]: 4.44998e-06 [add_comm_op_reuse_tag]: 3.27002e-06 [interleave_split_concat_branches]: 3.89002e-06 [interleave_parallel_branches]: 4.22e-06 [overlap_opt_shard_in_pipeline]: 3.86001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.50999e-06 [control_data_broadcast_order]: 2.507e-05 [grouped_pairwise_exchange_alltoall]: 4.74e-06 [offloading_packed_experts]: 8.91002e-06 [overlap_recompute_and_grad_model_parallel]: 8.72e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.56001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.8e-06 [overlap_recompute_comm]: 5.54998e-06 [overlap_grad_ring_attention]: 8.97999e-06 [overlap_grad_flash_sp]: 3.155e-05 [begin_end_overlap_inline]: 3.73001e-06 [split_matmul_comm_elemetwise]: 4.84e-06 [split_layernorm_comm]: 4.94e-06 [handle_group_info]: 3.67998e-06 [symbol_engine_optimizer]: 0.00014468, [1] [Cycle 1]: 0.00013211, [6] [build]: 6.54001e-06 [elim_shapecalc]: 2.191e-05 [elim_not_effective]: 1.899e-05 [opt_reshape]: 8.95999e-06 [fold_const_symbol]: 1.175e-05 [renormalize]: 3.80009e-07 [detach_backward]: 5.91e-06 [pipeline_parallel_scheduler]: 2.32001e-06 [auto_monad_reorder]: 2.861e-05 [get_jit_bprop_graph]: 1.87001e-06 [rewriter_after_jit_bprop_graph]: 8.69e-06 [opt_after_jit_grad]: 0.00083942 [validate]: 5.671e-05 Sums bootstrap : 0.000468s : 0.38% type_inference : 0.115226s : 93.91% event_method : 0.000022s : 0.02% auto_monad : 0.000071s : 0.06% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000042s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000036s : 0.03% optimize.rewriter_before_opt_a : 0.000100s : 0.08% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000052s : 0.04% optimize.opt_a.loop_unroll : 0.000038s : 0.03% optimize.opt_a.a_1 : 0.000931s : 0.76% optimize.opt_a.with_stream_mark : 0.000049s : 0.04% optimize.opt_a.recompute_prepare : 0.000022s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000241s : 0.20% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.01% optimize.opt_a.shard : 0.000006s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000016s : 0.01% optimize.opt_a.merge_send_recv : 0.000022s : 0.02% optimize.opt_a.auto_parallel : 0.000023s : 0.02% optimize.opt_a.parallel : 0.000033s : 0.03% optimize.opt_a.flash_sp : 0.000017s : 0.01% optimize.opt_a.merge_comm : 0.000010s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000026s : 0.02% optimize.opt_a.virtual_dataset : 0.000015s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.01% optimize.opt_a.virtual_output : 0.000014s : 0.01% optimize.opt_a.merge_forward : 0.000011s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000043s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000028s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000039s : 0.03% optimize.opt_a.a_after_grad : 0.000026s : 0.02% optimize.opt_a.renormalize : 0.000994s : 0.81% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.01% optimize.opt_a.auto_monad_grad : 0.000006s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000043s : 0.04% optimize.opt_a.cse : 0.000061s : 0.05% optimize.opt_a.a_3 : 0.000132s : 0.11% optimize.py_interpret_to_execute_after_opt_a : 0.000026s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000058s : 0.05% optimize.convert_after_rewriter : 0.000013s : 0.01% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000902s : 0.74% optimize.opt_b.b_1 : 0.000192s : 0.16% optimize.opt_b.b_2 : 0.000011s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000014s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000037s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000031s : 0.02% optimize.overlap_param_gather : 0.000006s : 0.01% optimize.cconv : 0.000046s : 0.04% optimize.loop_unroll : 0.000728s : 0.59% optimize.opt_after_cconv.c_1 : 0.000038s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000013s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000033s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.02% optimize.tuple_transform.d_1 : 0.000061s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.01% optimize.partial_unused_args_eliminate : 0.000006s : 0.00% optimize.add_recomputation : 0.000069s : 0.06% optimize.cse_after_recomputation.cse : 0.000018s : 0.01% optimize.environ_conv : 0.000010s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000012s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000025s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.00% optimize.offloading_packed_experts : 0.000009s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000006s : 0.00% optimize.overlap_grad_ring_attention : 0.000009s : 0.01% optimize.overlap_grad_flash_sp : 0.000032s : 0.03% optimize.begin_end_overlap_inline : 0.000004s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000005s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000007s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000022s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000029s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000009s : 0.01% opt_after_jit_grad : 0.000839s : 0.68% validate : 0.000057s : 0.05% Time group info: ------[substitution.] 0.000313 34 15.41% : 0.000048s : 6: substitution.arithmetic_simplify 0.78% : 0.000002s : 2: substitution.elim_not_effective 0.41% : 0.000001s : 2: substitution.fold_const_symbol 2.39% : 0.000007s : 4: substitution.graph_param_transform 67.98% : 0.000213s : 4: substitution.inline 2.05% : 0.000006s : 4: substitution.j_node_and_user_rematch 2.43% : 0.000008s : 4: substitution.remove_not_recompute_node 3.09% : 0.000010s : 4: substitution.replace_old_param 5.47% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.115158 2 99.27% : 0.114314s : 1: type_inference.infer 0.73% : 0.000844s : 1: type_inference.specialize ------[replace.] 0.000093 8 48.28% : 0.000045s : 4: replace.inline 51.72% : 0.000048s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000224 8 93.19% : 0.000209s : 4: match.inline 6.81% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000252 1278 0.81% : 0.000002s : 13: predicate.accumulaten_eliminater 1.40% : 0.000004s : 4: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 8: predicate.addn_check_dump 0.97% : 0.000002s : 13: predicate.addn_zero_filter 0.76% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.77% : 0.000007s : 21: predicate.arithmetic_simplify 0.86% : 0.000002s : 13: predicate.cast_eliminate 0.57% : 0.000001s : 8: predicate.check_bprop_eliminate 0.46% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.65% : 0.000002s : 8: predicate.depend_value_elim 0.73% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.91% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.79% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.60% : 0.000004s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 4: predicate.elim_not_effective 0.54% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.36% : 0.000003s : 17: predicate.environ_add_const_eliminate 0.85% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.87% : 0.000002s : 17: predicate.environ_get_depend_swap 1.81% : 0.000005s : 25: predicate.environ_get_eliminate 0.90% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.21% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.13% : 0.000005s : 21: predicate.float_depend_g_call 0.48% : 0.000001s : 8: predicate.float_environ_get_switch 0.66% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.65% : 0.000002s : 8: predicate.get_grad_eliminate 0.29% : 0.000001s : 4: predicate.graph_param_transform 0.50% : 0.000001s : 8: predicate.incorporate_call 0.41% : 0.000001s : 8: predicate.incorporate_call_switch 6.32% : 0.000016s : 58: predicate.inline 1.25% : 0.000003s : 8: predicate.inline_without_move 0.33% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.96% : 0.000002s : 8: predicate.less_batch_normalization 1.70% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.19% : 0.000006s : 38: predicate.load_eliminater 1.90% : 0.000005s : 4: predicate.loop_unroll_after_grad 1.96% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.95% : 0.000005s : 21: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 8: predicate.merge_addn 0.53% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.62% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.64% : 0.000002s : 13: predicate.minmaximum_grad 1.98% : 0.000005s : 4: predicate.mutable_eliminate 0.40% : 0.000001s : 4: predicate.opt_reshape 0.60% : 0.000002s : 4: predicate.parallel_virtual_node 1.74% : 0.000004s : 21: predicate.partial_defer_inline 1.35% : 0.000003s : 21: predicate.partial_eliminate 0.79% : 0.000002s : 13: predicate.print_const_string_wrapper 0.65% : 0.000002s : 8: predicate.reduce_all_const_elim 1.15% : 0.000003s : 13: predicate.reduce_eliminate 2.34% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 8: predicate.remove_not_recompute_node 1.30% : 0.000003s : 25: predicate.replace_applicator 0.62% : 0.000002s : 8: predicate.replace_old_param 0.53% : 0.000001s : 4: predicate.reset_defer_inline 0.81% : 0.000002s : 13: predicate.reshape_eliminate 0.61% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.56% : 0.000001s : 4: predicate.row_tensor_eliminate 1.04% : 0.000003s : 8: predicate.same_eliminate 0.52% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.96% : 0.000002s : 8: predicate.shard_identity_eliminate 0.62% : 0.000002s : 8: predicate.special_op_eliminate 0.72% : 0.000002s : 8: predicate.specialize_transform 1.21% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 1.57% : 0.000004s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.21% : 0.000003s : 21: predicate.switch_defer_inline 1.73% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.47% : 0.000011s : 67: predicate.switch_simplify 0.91% : 0.000002s : 13: predicate.tile_eliminate 0.73% : 0.000002s : 13: predicate.transpose_eliminate 1.33% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.48% : 0.000009s : 33: predicate.tuple_list_get_item_eliminator 1.48% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000006s : 29: predicate.tuple_list_set_item_eliminator 1.49% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.05% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.83% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 0.75% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.57% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.54% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000703 11 54.11% : 0.000381s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.89% : 0.000323s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.142437 192 0.00% : 0.000006s : 1: ForceFp32Comm 2.66% : 0.003796s : 1: add_attr 2.65% : 0.003774s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.05% : 0.000075s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.06% : 0.000081s : 1: auto_monad 0.03% : 0.000038s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.36% : 0.000517s : 1: bootstrap 0.04% : 0.000051s : 1: cconv 0.01% : 0.000008s : 1: comm_op_add_attrs 0.02% : 0.000030s : 1: control_data_broadcast_order 0.01% : 0.000017s : 1: convert_after_rewriter 0.03% : 0.000041s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000035s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.02% : 0.000033s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000015s : 1: label_micro_interleaved_index 0.52% : 0.000739s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.64% : 0.000914s : 1: mutable_eliminate 0.01% : 0.000012s : 1: offloading_packed_experts 0.02% : 0.000028s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000030s : 1: opt.transform.mutable_eliminate 1.02% : 0.001455s : 78: opt.transform.opt_a 0.03% : 0.000036s : 1: opt.transform.opt_after_cconv 0.03% : 0.000042s : 1: opt.transform.opt_after_jit_grad 0.08% : 0.000120s : 28: opt.transform.opt_b 0.05% : 0.000067s : 2: opt.transform.opt_trans_graph 0.04% : 0.000056s : 4: opt.transform.symbol_engine_opt 2.80% : 0.003995s : 1: opt_a 0.13% : 0.000182s : 1: opt_after_cconv 0.60% : 0.000857s : 1: opt_after_jit_grad 0.25% : 0.000355s : 1: opt_b 5.46% : 0.007772s : 1: optimize 0.02% : 0.000035s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000011s : 1: order_py_execute_after_rewriter 0.03% : 0.000036s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000012s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000010s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000009s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000009s : 1: partial_unused_args_eliminate 0.01% : 0.000011s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.03% : 0.000050s : 1: pre_auto_parallel 0.03% : 0.000040s : 1: py_interpret_to_execute 0.02% : 0.000030s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.02% : 0.000023s : 1: remove_dup_value 0.37% : 0.000534s : 1: renormalize.infer 0.31% : 0.000446s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000015s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000064s : 1: rewriter_after_opt_a 0.07% : 0.000105s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000009s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.10% : 0.000148s : 1: symbol_engine_optimizer 0.09% : 0.000123s : 1: tuple_transform 80.93% : 0.115275s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:44.422.677 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.030395, [21] [bootstrap]: 0.00047325 [type_inference]: 0.00696267 [event_method]: 1.882e-05 [auto_monad]: 6.4e-05 [graph_reusing]: 5.56998e-06 [inline]: 3.06001e-06 [add_attr]: 0.00356878, [1] [add_attr_with_inline]: 0.003556, [1] [Cycle 1]: 7.539e-05, [2] [tag_attr]: 2.424e-05 [meta_addattr_fg_expand]: 5.73002e-06 [parallel-infer-symbol]: 3.7e-06 [pre_auto_parallel]: 4.062e-05 [insert-virtual-dataset]: 2.68e-06 [parallel-infer-symbol-second]: 6.50005e-07 [dataset_repeat_opt]: 1.82001e-06 [pipeline_split]: 1.79998e-06 [optimize]: 0.0184562, [53] [py_interpret_to_execute]: 3.088e-05 [rewriter_before_opt_a]: 9.337e-05 [opt_a]: 0.0159664, [2] [Cycle 1]: 0.0151631, [45] [expand_dump_flag]: 3.22002e-06 [switch_simplify]: 4.489e-05 [loop_unroll]: 3.091e-05 [a_1]: 0.00069269 [with_stream_mark]: 1.955e-05 [recompute_prepare]: 1.062e-05 [updatestate_depend_eliminate]: 4.84e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 3.31001e-06 [parameter_eliminate]: 2.11e-06 [a_2]: 9.14e-05 [accelerated_algorithm]: 7.14001e-06 [shard]: 1.86998e-06 [meta_shard_fg_expand]: 2.04e-06 [shard_inline]: 6.64999e-06 [merge_send_recv]: 8.37e-06 [auto_parallel]: 7.75998e-06 [parallel]: 1.973e-05 [flash_sp]: 9.14998e-06 [merge_comm]: 4.08999e-06 [allreduce_fusion]: 3.31001e-06 [matmul_add_comm_reduction]: 1.023e-05 [allreduce_slice_to_reducescatter]: 7.99977e-07 [virtual_shard_identity]: 8.86997e-06 [virtual_dataset]: 6.98998e-06 [get_grad_eliminate_]: 6.75998e-06 [virtual_output]: 6.28e-06 [merge_forward]: 3.90998e-06 [cell_reuse_recompute_pass]: 1.04e-06 [offload_activation]: 1.105e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.287e-05 [merge_recompute_call_nodes]: 1.55001e-06 [before_grad]: 1.139e-05 [set_forward_comm_id_for_comm_node_pass]: 3.6e-06 [meta_fg_expand]: 2.79999e-06 [flash_sp_send_recv_attached]: 2.94001e-06 [receive_attached]: 2.07999e-06 [after_resolve]: 1.211e-05 [a_after_grad]: 1.067e-05 [renormalize]: 0.013631 [add_forward_monad_depend]: 1.247e-05 [auto_monad_grad]: 3.28998e-06 [auto_monad_eliminator]: 2.731e-05 [cse]: 3.316e-05 [a_3]: 6.574e-05 [Cycle 2]: 0.00078879, [45] [expand_dump_flag]: 2.38998e-06 [switch_simplify]: 9.89999e-06 [loop_unroll]: 7.83999e-06 [a_1]: 0.00017146 [with_stream_mark]: 2.092e-05 [recompute_prepare]: 8.54e-06 [updatestate_depend_eliminate]: 3.75e-06 [updatestate_assign_eliminate]: 2.98e-06 [updatestate_loads_eliminate]: 3.54002e-06 [parameter_eliminate]: 2.09e-06 [a_2]: 8.053e-05 [accelerated_algorithm]: 7.47002e-06 [shard]: 2.56998e-06 [meta_shard_fg_expand]: 2.46e-06 [shard_inline]: 6.33e-06 [merge_send_recv]: 8.23001e-06 [auto_parallel]: 1.008e-05 [parallel]: 1.015e-05 [flash_sp]: 4.55999e-06 [merge_comm]: 4.35e-06 [allreduce_fusion]: 3.53e-06 [matmul_add_comm_reduction]: 1.089e-05 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 8.80999e-06 [virtual_dataset]: 6.10002e-06 [get_grad_eliminate_]: 6.53e-06 [virtual_output]: 6.44999e-06 [merge_forward]: 5.15001e-06 [cell_reuse_recompute_pass]: 3.77002e-06 [offload_activation]: 1.135e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.182e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 1.091e-05 [set_forward_comm_id_for_comm_node_pass]: 3.95e-06 [meta_fg_expand]: 2.75002e-06 [flash_sp_send_recv_attached]: 1.77001e-06 [receive_attached]: 2.34999e-06 [after_resolve]: 1.415e-05 [a_after_grad]: 9.82999e-06 [renormalize]: 1.39989e-07 [add_forward_monad_depend]: 1.70001e-06 [auto_monad_grad]: 1.59998e-06 [auto_monad_eliminator]: 9.46003e-06 [cse]: 1.765e-05 [a_3]: 3.724e-05 [py_interpret_to_execute_after_opt_a]: 2.017e-05 [slice_cell_reuse_recomputed_activation]: 2.66e-06 [rewriter_after_opt_a]: 4.223e-05 [convert_after_rewriter]: 7.29001e-06 [order_py_execute_after_rewriter]: 5.49e-06 [mutable_eliminate]: 0.00077002 [opt_b]: 0.0002273, [1] [Cycle 1]: 0.00021884, [7] [b_1]: 0.0001307 [b_2]: 9.61e-06 [updatestate_depend_eliminate]: 9.52999e-06 [updatestate_assign_eliminate]: 2.61e-06 [updatestate_loads_eliminate]: 2.61999e-06 [renormalize]: 7.2e-07 [cse]: 2.489e-05 [optimize_parallel_all_gather_comm]: 1.94e-05 [overlap_param_gather]: 2.26e-06 [cconv]: 3.607e-05 [loop_unroll]: 0.00049458 [opt_after_cconv]: 0.00011284, [1] [Cycle 1]: 0.00010599, [7] [c_1]: 3.249e-05 [parameter_eliminate]: 5.14e-06 [updatestate_depend_eliminate]: 7.08998e-06 [updatestate_assign_eliminate]: 2.67001e-06 [updatestate_loads_eliminate]: 2.41e-06 [cse]: 2.09e-05 [renormalize]: 7.2e-07 [remove_dup_value]: 1.421e-05 [tuple_transform]: 8.212e-05, [1] [Cycle 1]: 7.717e-05, [4] [d_1]: 4.832e-05 [none_parameter_eliminate]: 1.51998e-06 [renormalize]: 2.70025e-07 [switch_simplify]: 7.53999e-06 [partial_unused_args_eliminate]: 2.10002e-06 [add_recomputation]: 5.358e-05 [cse_after_recomputation]: 2.377e-05, [1] [Cycle 1]: 1.913e-05, [1] [cse]: 1.248e-05 [environ_conv]: 6.04001e-06 [swap_dp_allreduce_reducescatter]: 5.35999e-06 [bias_add_comm_swap]: 3.23e-06 [label_micro_interleaved_index]: 5.90002e-06 [label_fine_grained_interleaved_index]: 2.56998e-06 [merge_cast_opt]: 1.40999e-06 [slice_recompute_activation]: 2.53e-06 [micro_interleaved_order_control]: 2.11e-06 [assign_add_opt]: 1.32999e-06 [ForceFp32Comm]: 9.20001e-07 [remove_cast_before_assign_add]: 1.19e-06 [full_micro_interleaved_order_control]: 2.06e-06 [reorder_send_recv_between_fp_bp]: 2.64999e-06 [comm_op_add_attrs]: 1.09e-06 [add_comm_op_reuse_tag]: 9.30013e-07 [interleave_split_concat_branches]: 1.21002e-06 [interleave_parallel_branches]: 1.02998e-06 [overlap_opt_shard_in_pipeline]: 1.57001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.05002e-06 [control_data_broadcast_order]: 1.37e-05 [grouped_pairwise_exchange_alltoall]: 1.89e-06 [offloading_packed_experts]: 4.52998e-06 [overlap_recompute_and_grad_model_parallel]: 4.95999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.34e-06 [overlap_recompute_allgather_and_fa_grad]: 1.60001e-06 [overlap_recompute_comm]: 2.12999e-06 [overlap_grad_ring_attention]: 4.14002e-06 [overlap_grad_flash_sp]: 2.337e-05 [begin_end_overlap_inline]: 5.8001e-07 [split_matmul_comm_elemetwise]: 2.07001e-06 [split_layernorm_comm]: 1.84e-06 [handle_group_info]: 9.00007e-07 [symbol_engine_optimizer]: 0.00010658, [1] [Cycle 1]: 0.0001017, [6] [build]: 4.66002e-06 [elim_shapecalc]: 1.302e-05 [elim_not_effective]: 1.422e-05 [opt_reshape]: 7.01001e-06 [fold_const_symbol]: 9.94001e-06 [renormalize]: 2.3999e-07 [detach_backward]: 2.68e-06 [pipeline_parallel_scheduler]: 1.69e-06 [auto_monad_reorder]: 1.895e-05 [get_jit_bprop_graph]: 2.46e-06 [rewriter_after_jit_bprop_graph]: 5.98002e-06 [opt_after_jit_grad]: 0.00054223 [validate]: 4.823e-05 Sums bootstrap : 0.000473s : 1.84% type_inference : 0.006963s : 27.06% event_method : 0.000019s : 0.07% auto_monad : 0.000064s : 0.25% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.09% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000041s : 0.16% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.12% optimize.rewriter_before_opt_a : 0.000093s : 0.36% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000055s : 0.21% optimize.opt_a.loop_unroll : 0.000039s : 0.15% optimize.opt_a.a_1 : 0.000864s : 3.36% optimize.opt_a.with_stream_mark : 0.000040s : 0.16% optimize.opt_a.recompute_prepare : 0.000019s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000172s : 0.67% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.06% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000013s : 0.05% optimize.opt_a.merge_send_recv : 0.000017s : 0.06% optimize.opt_a.auto_parallel : 0.000018s : 0.07% optimize.opt_a.parallel : 0.000030s : 0.12% optimize.opt_a.flash_sp : 0.000014s : 0.05% optimize.opt_a.merge_comm : 0.000008s : 0.03% optimize.opt_a.allreduce_fusion : 0.000007s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.07% optimize.opt_a.virtual_dataset : 0.000013s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.05% optimize.opt_a.virtual_output : 0.000013s : 0.05% optimize.opt_a.merge_forward : 0.000009s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000022s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000022s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.03% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000026s : 0.10% optimize.opt_a.a_after_grad : 0.000021s : 0.08% optimize.opt_a.renormalize : 0.013631s : 52.98% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.06% optimize.opt_a.auto_monad_grad : 0.000005s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000037s : 0.14% optimize.opt_a.cse : 0.000051s : 0.20% optimize.opt_a.a_3 : 0.000103s : 0.40% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000042s : 0.16% optimize.convert_after_rewriter : 0.000007s : 0.03% optimize.order_py_execute_after_rewriter : 0.000005s : 0.02% optimize.mutable_eliminate : 0.000770s : 2.99% optimize.opt_b.b_1 : 0.000131s : 0.51% optimize.opt_b.b_2 : 0.000010s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000025s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.08% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000036s : 0.14% optimize.loop_unroll : 0.000495s : 1.92% optimize.opt_after_cconv.c_1 : 0.000032s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000021s : 0.08% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000014s : 0.06% optimize.tuple_transform.d_1 : 0.000048s : 0.19% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000054s : 0.21% optimize.cse_after_recomputation.cse : 0.000012s : 0.05% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000006s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000003s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000023s : 0.09% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000542s : 2.11% validate : 0.000048s : 0.19% Time group info: ------[substitution.] 0.000255 34 17.25% : 0.000044s : 6: substitution.arithmetic_simplify 0.92% : 0.000002s : 2: substitution.elim_not_effective 0.48% : 0.000001s : 2: substitution.fold_const_symbol 2.50% : 0.000006s : 4: substitution.graph_param_transform 65.85% : 0.000168s : 4: substitution.inline 2.09% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.13% : 0.000005s : 4: substitution.remove_not_recompute_node 2.52% : 0.000006s : 4: substitution.replace_old_param 6.26% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006890 2 89.17% : 0.006144s : 1: type_inference.infer 10.83% : 0.000746s : 1: type_inference.specialize ------[replace.] 0.000067 8 62.99% : 0.000042s : 4: replace.inline 37.01% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000179 8 92.17% : 0.000165s : 4: match.inline 7.83% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000229 1278 0.89% : 0.000002s : 13: predicate.accumulaten_eliminater 1.09% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 8: predicate.addn_check_dump 0.87% : 0.000002s : 13: predicate.addn_zero_filter 0.88% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.87% : 0.000007s : 21: predicate.arithmetic_simplify 0.95% : 0.000002s : 13: predicate.cast_eliminate 0.75% : 0.000002s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.52% : 0.000001s : 8: predicate.depend_value_elim 0.89% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.06% : 0.000002s : 13: predicate.dict_get_item_eliminator 1.05% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.22% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.46% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.32% : 0.000003s : 17: predicate.environ_get_depend_swap 1.85% : 0.000004s : 25: predicate.environ_get_eliminate 1.35% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.38% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.33% : 0.000005s : 21: predicate.float_depend_g_call 0.49% : 0.000001s : 8: predicate.float_environ_get_switch 0.77% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.69% : 0.000002s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.54% : 0.000001s : 8: predicate.incorporate_call 0.44% : 0.000001s : 8: predicate.incorporate_call_switch 6.44% : 0.000015s : 58: predicate.inline 0.61% : 0.000001s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.74% : 0.000002s : 8: predicate.less_batch_normalization 1.73% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.39% : 0.000005s : 38: predicate.load_eliminater 1.22% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.21% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.80% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.62% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.61% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 13: predicate.minmaximum_grad 1.56% : 0.000004s : 4: predicate.mutable_eliminate 0.34% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 1.63% : 0.000004s : 21: predicate.partial_defer_inline 1.46% : 0.000003s : 21: predicate.partial_eliminate 0.94% : 0.000002s : 13: predicate.print_const_string_wrapper 0.68% : 0.000002s : 8: predicate.reduce_all_const_elim 1.10% : 0.000003s : 13: predicate.reduce_eliminate 2.32% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.53% : 0.000001s : 8: predicate.remove_not_recompute_node 1.35% : 0.000003s : 25: predicate.replace_applicator 0.55% : 0.000001s : 8: predicate.replace_old_param 0.23% : 0.000001s : 4: predicate.reset_defer_inline 1.19% : 0.000003s : 13: predicate.reshape_eliminate 0.57% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 4: predicate.row_tensor_eliminate 0.70% : 0.000002s : 8: predicate.same_eliminate 0.48% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.73% : 0.000002s : 8: predicate.shard_identity_eliminate 0.67% : 0.000002s : 8: predicate.special_op_eliminate 0.60% : 0.000001s : 8: predicate.specialize_transform 0.98% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 1.04% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.42% : 0.000003s : 21: predicate.switch_defer_inline 1.79% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.89% : 0.000011s : 67: predicate.switch_simplify 0.96% : 0.000002s : 13: predicate.tile_eliminate 0.90% : 0.000002s : 13: predicate.transpose_eliminate 1.34% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.79% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.66% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.17% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.56% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.19% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.70% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.16% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.82% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 4: predicate.value_based_eliminate 0.60% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.48% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000721 11 51.73% : 0.000373s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.27% : 0.000348s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.067518 192 0.01% : 0.000004s : 1: ForceFp32Comm 5.29% : 0.003575s : 1: add_attr 5.27% : 0.003560s : 1: add_attr_with_inline 0.01% : 0.000005s : 1: add_comm_op_reuse_tag 0.09% : 0.000058s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.10% : 0.000069s : 1: auto_monad 0.03% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.75% : 0.000507s : 1: bootstrap 0.06% : 0.000040s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000017s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.04% : 0.000027s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.04% : 0.000027s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000005s : 1: label_fine_grained_interleaved_index 0.01% : 0.000009s : 1: label_micro_interleaved_index 0.75% : 0.000505s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.16% : 0.000781s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000022s : 1: opt.transform.mutable_eliminate 1.96% : 0.001324s : 78: opt.transform.opt_a 0.05% : 0.000031s : 1: opt.transform.opt_after_cconv 0.05% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.16% : 0.000106s : 28: opt.transform.opt_b 0.08% : 0.000053s : 2: opt.transform.opt_trans_graph 0.06% : 0.000040s : 4: opt.transform.symbol_engine_opt 23.65% : 0.015970s : 1: opt_a 0.17% : 0.000117s : 1: opt_after_cconv 0.82% : 0.000554s : 1: opt_after_jit_grad 0.34% : 0.000231s : 1: opt_b 27.34% : 0.018461s : 1: optimize 0.03% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000045s : 1: pre_auto_parallel 0.05% : 0.000035s : 1: py_interpret_to_execute 0.03% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000018s : 1: remove_dup_value 19.36% : 0.013071s : 1: renormalize.infer 0.80% : 0.000540s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000047s : 1: rewriter_after_opt_a 0.14% : 0.000098s : 1: rewriter_before_opt_a 0.01% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.16% : 0.000110s : 1: symbol_engine_optimizer 0.13% : 0.000085s : 1: tuple_transform 10.35% : 0.006985s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:45.415.885 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:45.416.152 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0310302, [21] [bootstrap]: 0.00044545 [type_inference]: 0.00650415 [event_method]: 2.22e-05 [auto_monad]: 7.04e-05 [graph_reusing]: 6.73998e-06 [inline]: 3.08e-06 [add_attr]: 0.00396736, [1] [add_attr_with_inline]: 0.00395369, [1] [Cycle 1]: 0.00010178, [2] [tag_attr]: 2.7e-05 [meta_addattr_fg_expand]: 6.20002e-06 [parallel-infer-symbol]: 4.16001e-06 [pre_auto_parallel]: 4.517e-05 [insert-virtual-dataset]: 2.38002e-06 [parallel-infer-symbol-second]: 6.80011e-07 [dataset_repeat_opt]: 2.02001e-06 [pipeline_split]: 1.81998e-06 [optimize]: 0.0184625, [53] [py_interpret_to_execute]: 3.694e-05 [rewriter_before_opt_a]: 0.0001014 [opt_a]: 0.0155339, [2] [Cycle 1]: 0.0144753, [45] [expand_dump_flag]: 3.56001e-06 [switch_simplify]: 4.434e-05 [loop_unroll]: 3.09e-05 [a_1]: 0.00070453 [with_stream_mark]: 2.611e-05 [recompute_prepare]: 1.101e-05 [updatestate_depend_eliminate]: 5.63002e-06 [updatestate_assign_eliminate]: 3.56001e-06 [updatestate_loads_eliminate]: 3.13e-06 [parameter_eliminate]: 2.52001e-06 [a_2]: 0.00012328 [accelerated_algorithm]: 9.49e-06 [shard]: 2.65997e-06 [meta_shard_fg_expand]: 2.88e-06 [shard_inline]: 7.25e-06 [merge_send_recv]: 1.111e-05 [auto_parallel]: 1.014e-05 [parallel]: 2.194e-05 [flash_sp]: 1.154e-05 [merge_comm]: 5.62999e-06 [allreduce_fusion]: 3.83001e-06 [matmul_add_comm_reduction]: 1.16e-05 [allreduce_slice_to_reducescatter]: 1.00999e-06 [virtual_shard_identity]: 1.243e-05 [virtual_dataset]: 7.35998e-06 [get_grad_eliminate_]: 6.73e-06 [virtual_output]: 6.79001e-06 [merge_forward]: 4.99e-06 [cell_reuse_recompute_pass]: 2.55002e-06 [offload_activation]: 1.24e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.032e-05 [merge_recompute_call_nodes]: 1.61998e-06 [before_grad]: 1.462e-05 [set_forward_comm_id_for_comm_node_pass]: 4.79e-06 [meta_fg_expand]: 3.24001e-06 [flash_sp_send_recv_attached]: 3.65e-06 [receive_attached]: 2.69999e-06 [after_resolve]: 1.473e-05 [a_after_grad]: 1.48e-05 [renormalize]: 0.0126484 [add_forward_monad_depend]: 1.219e-05 [auto_monad_grad]: 2.55002e-06 [auto_monad_eliminator]: 2.554e-05 [cse]: 3.488e-05 [a_3]: 7.856e-05 [Cycle 2]: 0.00103831, [45] [expand_dump_flag]: 2.19001e-06 [switch_simplify]: 9.62999e-06 [loop_unroll]: 6.44999e-06 [a_1]: 0.00016599 [with_stream_mark]: 2.332e-05 [recompute_prepare]: 8.50001e-06 [updatestate_depend_eliminate]: 4.17998e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 2.99999e-06 [parameter_eliminate]: 2.49001e-06 [a_2]: 0.00011366 [accelerated_algorithm]: 7.94002e-06 [shard]: 2.98e-06 [meta_shard_fg_expand]: 2.63e-06 [shard_inline]: 7.93999e-06 [merge_send_recv]: 9.99001e-06 [auto_parallel]: 1.02e-05 [parallel]: 9.54e-06 [flash_sp]: 4.15e-06 [merge_comm]: 3.65e-06 [allreduce_fusion]: 3.25e-06 [matmul_add_comm_reduction]: 9.96e-06 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 1.189e-05 [virtual_dataset]: 6.59999e-06 [get_grad_eliminate_]: 7.05e-06 [virtual_output]: 6.25002e-06 [merge_forward]: 6.96999e-06 [cell_reuse_recompute_pass]: 3.91999e-06 [offload_activation]: 1.101e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.708e-05 [merge_recompute_call_nodes]: 1.88997e-06 [before_grad]: 1.302e-05 [set_forward_comm_id_for_comm_node_pass]: 5.09e-06 [meta_fg_expand]: 2.93998e-06 [flash_sp_send_recv_attached]: 2.01998e-06 [receive_attached]: 2.83003e-06 [after_resolve]: 1.482e-05 [a_after_grad]: 1.202e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 3.03e-06 [auto_monad_grad]: 2.33998e-06 [auto_monad_eliminator]: 1.402e-05 [cse]: 2.401e-05 [a_3]: 5.593e-05 [py_interpret_to_execute_after_opt_a]: 2.434e-05 [slice_cell_reuse_recomputed_activation]: 6.17999e-06 [rewriter_after_opt_a]: 5.574e-05 [convert_after_rewriter]: 1.223e-05 [order_py_execute_after_rewriter]: 9.45001e-06 [mutable_eliminate]: 0.00080999 [opt_b]: 0.00030365, [1] [Cycle 1]: 0.00029126, [7] [b_1]: 0.00017623 [b_2]: 9.16002e-06 [updatestate_depend_eliminate]: 9.20001e-06 [updatestate_assign_eliminate]: 2.80997e-06 [updatestate_loads_eliminate]: 2.69001e-06 [renormalize]: 1.25001e-06 [cse]: 2.969e-05 [optimize_parallel_all_gather_comm]: 2.569e-05 [overlap_param_gather]: 5.67999e-06 [cconv]: 4.185e-05 [loop_unroll]: 0.00056854 [opt_after_cconv]: 0.00018559, [1] [Cycle 1]: 0.00017536, [7] [c_1]: 3.387e-05 [parameter_eliminate]: 5.52001e-06 [updatestate_depend_eliminate]: 6.26998e-06 [updatestate_assign_eliminate]: 2.69001e-06 [updatestate_loads_eliminate]: 2.39999e-06 [cse]: 2.217e-05 [renormalize]: 6.39993e-07 [remove_dup_value]: 1.839e-05 [tuple_transform]: 0.00010323, [1] [Cycle 1]: 9.52e-05, [4] [d_1]: 5.309e-05 [none_parameter_eliminate]: 2.01e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 8.04002e-06 [partial_unused_args_eliminate]: 4.81997e-06 [add_recomputation]: 5.76e-05 [cse_after_recomputation]: 3.12e-05, [1] [Cycle 1]: 2.345e-05, [1] [cse]: 1.295e-05 [environ_conv]: 8.88002e-06 [swap_dp_allreduce_reducescatter]: 8.30999e-06 [bias_add_comm_swap]: 5.92999e-06 [label_micro_interleaved_index]: 8.48999e-06 [label_fine_grained_interleaved_index]: 5.27001e-06 [merge_cast_opt]: 3.81999e-06 [slice_recompute_activation]: 4.58001e-06 [micro_interleaved_order_control]: 4.67e-06 [assign_add_opt]: 3.78001e-06 [ForceFp32Comm]: 3.43e-06 [remove_cast_before_assign_add]: 3.34001e-06 [full_micro_interleaved_order_control]: 4.72e-06 [reorder_send_recv_between_fp_bp]: 5.77999e-06 [comm_op_add_attrs]: 3.46999e-06 [add_comm_op_reuse_tag]: 3.73001e-06 [interleave_split_concat_branches]: 3.56001e-06 [interleave_parallel_branches]: 3.61001e-06 [overlap_opt_shard_in_pipeline]: 3.64002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.37e-06 [control_data_broadcast_order]: 1.734e-05 [grouped_pairwise_exchange_alltoall]: 4.06001e-06 [offloading_packed_experts]: 6.96999e-06 [overlap_recompute_and_grad_model_parallel]: 8.10999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.80998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.94002e-06 [overlap_recompute_comm]: 5.28002e-06 [overlap_grad_ring_attention]: 6.44001e-06 [overlap_grad_flash_sp]: 2.554e-05 [begin_end_overlap_inline]: 2.96001e-06 [split_matmul_comm_elemetwise]: 4.75001e-06 [split_layernorm_comm]: 4.09002e-06 [handle_group_info]: 3.53e-06 [symbol_engine_optimizer]: 0.00010125, [1] [Cycle 1]: 9.395e-05, [6] [build]: 4.25999e-06 [elim_shapecalc]: 1.037e-05 [elim_not_effective]: 1.453e-05 [opt_reshape]: 7.63999e-06 [fold_const_symbol]: 1.07e-05 [renormalize]: 3.50003e-07 [detach_backward]: 4.97999e-06 [pipeline_parallel_scheduler]: 1.91e-06 [auto_monad_reorder]: 2.277e-05 [get_jit_bprop_graph]: 1.99999e-06 [rewriter_after_jit_bprop_graph]: 6.72002e-06 [opt_after_jit_grad]: 0.00074514 [validate]: 5.014e-05 Sums bootstrap : 0.000445s : 1.78% type_inference : 0.006504s : 26.01% event_method : 0.000022s : 0.09% auto_monad : 0.000070s : 0.28% graph_reusing : 0.000007s : 0.03% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.11% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000045s : 0.18% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000037s : 0.15% optimize.rewriter_before_opt_a : 0.000101s : 0.41% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000054s : 0.22% optimize.opt_a.loop_unroll : 0.000037s : 0.15% optimize.opt_a.a_1 : 0.000871s : 3.48% optimize.opt_a.with_stream_mark : 0.000049s : 0.20% optimize.opt_a.recompute_prepare : 0.000020s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000005s : 0.02% optimize.opt_a.a_2 : 0.000237s : 0.95% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.07% optimize.opt_a.shard : 0.000006s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.02% optimize.opt_a.shard_inline : 0.000015s : 0.06% optimize.opt_a.merge_send_recv : 0.000021s : 0.08% optimize.opt_a.auto_parallel : 0.000020s : 0.08% optimize.opt_a.parallel : 0.000031s : 0.13% optimize.opt_a.flash_sp : 0.000016s : 0.06% optimize.opt_a.merge_comm : 0.000009s : 0.04% optimize.opt_a.allreduce_fusion : 0.000007s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.10% optimize.opt_a.virtual_dataset : 0.000014s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.06% optimize.opt_a.virtual_output : 0.000013s : 0.05% optimize.opt_a.merge_forward : 0.000012s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.03% optimize.opt_a.offload_activation : 0.000023s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.15% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.01% optimize.opt_a.before_grad : 0.000028s : 0.11% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.04% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.02% optimize.opt_a.receive_attached : 0.000006s : 0.02% optimize.opt_a.after_resolve : 0.000030s : 0.12% optimize.opt_a.a_after_grad : 0.000027s : 0.11% optimize.opt_a.renormalize : 0.012648s : 50.58% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.06% optimize.opt_a.auto_monad_grad : 0.000005s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000040s : 0.16% optimize.opt_a.cse : 0.000059s : 0.24% optimize.opt_a.a_3 : 0.000134s : 0.54% optimize.py_interpret_to_execute_after_opt_a : 0.000024s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.02% optimize.rewriter_after_opt_a : 0.000056s : 0.22% optimize.convert_after_rewriter : 0.000012s : 0.05% optimize.order_py_execute_after_rewriter : 0.000009s : 0.04% optimize.mutable_eliminate : 0.000810s : 3.24% optimize.opt_b.b_1 : 0.000176s : 0.70% optimize.opt_b.b_2 : 0.000009s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.12% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.10% optimize.overlap_param_gather : 0.000006s : 0.02% optimize.cconv : 0.000042s : 0.17% optimize.loop_unroll : 0.000569s : 2.27% optimize.opt_after_cconv.c_1 : 0.000034s : 0.14% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000022s : 0.09% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.07% optimize.tuple_transform.d_1 : 0.000053s : 0.21% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000058s : 0.23% optimize.cse_after_recomputation.cse : 0.000013s : 0.05% optimize.environ_conv : 0.000009s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.03% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000008s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.02% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000017s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000007s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000006s : 0.03% optimize.overlap_grad_flash_sp : 0.000026s : 0.10% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000023s : 0.09% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.03% opt_after_jit_grad : 0.000745s : 2.98% validate : 0.000050s : 0.20% Time group info: ------[substitution.] 0.000269 34 16.57% : 0.000045s : 6: substitution.arithmetic_simplify 0.79% : 0.000002s : 2: substitution.elim_not_effective 0.61% : 0.000002s : 2: substitution.fold_const_symbol 2.42% : 0.000007s : 4: substitution.graph_param_transform 66.69% : 0.000180s : 4: substitution.inline 2.20% : 0.000006s : 4: substitution.j_node_and_user_rematch 2.20% : 0.000006s : 4: substitution.remove_not_recompute_node 2.60% : 0.000007s : 4: substitution.replace_old_param 5.92% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006438 2 87.24% : 0.005617s : 1: type_inference.infer 12.76% : 0.000821s : 1: type_inference.specialize ------[replace.] 0.000066 8 63.58% : 0.000042s : 4: replace.inline 36.42% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000191 8 92.59% : 0.000177s : 4: match.inline 7.41% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000237 1278 1.02% : 0.000002s : 13: predicate.accumulaten_eliminater 1.22% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.70% : 0.000002s : 8: predicate.addn_check_dump 0.87% : 0.000002s : 13: predicate.addn_zero_filter 0.71% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.79% : 0.000007s : 21: predicate.arithmetic_simplify 1.00% : 0.000002s : 13: predicate.cast_eliminate 0.62% : 0.000001s : 8: predicate.check_bprop_eliminate 0.55% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.81% : 0.000002s : 8: predicate.depend_value_elim 0.86% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.75% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.22% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.37% : 0.000001s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.29% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_depend_swap 1.62% : 0.000004s : 25: predicate.environ_get_eliminate 0.97% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.51% : 0.000004s : 21: predicate.exchange_switch_depend_value 2.44% : 0.000006s : 21: predicate.float_depend_g_call 0.58% : 0.000001s : 8: predicate.float_environ_get_switch 0.80% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.80% : 0.000002s : 8: predicate.get_grad_eliminate 0.17% : 0.000000s : 4: predicate.graph_param_transform 0.50% : 0.000001s : 8: predicate.incorporate_call 0.41% : 0.000001s : 8: predicate.incorporate_call_switch 5.77% : 0.000014s : 58: predicate.inline 1.07% : 0.000003s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.99% : 0.000002s : 8: predicate.less_batch_normalization 1.82% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.20% : 0.000005s : 38: predicate.load_eliminater 0.73% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.09% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.63% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.49% : 0.000001s : 8: predicate.merge_addn 0.61% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.64% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.76% : 0.000002s : 13: predicate.minmaximum_grad 1.09% : 0.000003s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.46% : 0.000001s : 4: predicate.parallel_virtual_node 1.72% : 0.000004s : 21: predicate.partial_defer_inline 1.42% : 0.000003s : 21: predicate.partial_eliminate 0.89% : 0.000002s : 13: predicate.print_const_string_wrapper 0.61% : 0.000001s : 8: predicate.reduce_all_const_elim 1.21% : 0.000003s : 13: predicate.reduce_eliminate 2.37% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.67% : 0.000002s : 8: predicate.remove_not_recompute_node 1.31% : 0.000003s : 25: predicate.replace_applicator 0.53% : 0.000001s : 8: predicate.replace_old_param 0.24% : 0.000001s : 4: predicate.reset_defer_inline 0.86% : 0.000002s : 13: predicate.reshape_eliminate 0.70% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 4: predicate.row_tensor_eliminate 0.91% : 0.000002s : 8: predicate.same_eliminate 0.54% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.98% : 0.000002s : 8: predicate.shard_identity_eliminate 0.77% : 0.000002s : 8: predicate.special_op_eliminate 0.72% : 0.000002s : 8: predicate.specialize_transform 1.14% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 1.13% : 0.000003s : 8: predicate.stack_unstack_eliminate 0.27% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.40% : 0.000003s : 21: predicate.switch_defer_inline 1.96% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.69% : 0.000011s : 67: predicate.switch_simplify 1.25% : 0.000003s : 13: predicate.tile_eliminate 1.05% : 0.000003s : 13: predicate.transpose_eliminate 1.69% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.41% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.74% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.10% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.55% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.20% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.66% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.29% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.83% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 4: predicate.value_based_eliminate 0.68% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 8: predicate.virtual_output_eliminate 0.22% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.50% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000684 11 48.78% : 0.000334s : 5: func_graph_cloner_run.FuncGraphClonerGraph 51.22% : 0.000350s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.067622 192 0.01% : 0.000006s : 1: ForceFp32Comm 5.89% : 0.003981s : 1: add_attr 5.85% : 0.003958s : 1: add_attr_with_inline 0.01% : 0.000008s : 1: add_comm_op_reuse_tag 0.09% : 0.000061s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.12% : 0.000080s : 1: auto_monad 0.04% : 0.000030s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.72% : 0.000490s : 1: bootstrap 0.07% : 0.000046s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.03% : 0.000020s : 1: control_data_broadcast_order 0.02% : 0.000016s : 1: convert_after_rewriter 0.05% : 0.000034s : 1: cse_after_recomputation 0.01% : 0.000007s : 1: dataset_repeat_opt 0.04% : 0.000024s : 1: detach_backward 0.02% : 0.000012s : 1: environ_conv 0.05% : 0.000033s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000010s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000011s : 1: label_micro_interleaved_index 0.85% : 0.000575s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 1.21% : 0.000820s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000023s : 1: opt.transform.mutable_eliminate 2.03% : 0.001373s : 78: opt.transform.opt_a 0.05% : 0.000032s : 1: opt.transform.opt_after_cconv 0.05% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.16% : 0.000110s : 28: opt.transform.opt_b 0.09% : 0.000059s : 2: opt.transform.opt_trans_graph 0.06% : 0.000040s : 4: opt.transform.symbol_engine_opt 22.98% : 0.015538s : 1: opt_a 0.28% : 0.000189s : 1: opt_after_cconv 1.12% : 0.000759s : 1: opt_after_jit_grad 0.46% : 0.000308s : 1: opt_b 27.86% : 0.018842s : 1: optimize 0.04% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000013s : 1: order_py_execute_after_rewriter 0.04% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000013s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.08% : 0.000053s : 1: pre_auto_parallel 0.06% : 0.000041s : 1: py_interpret_to_execute 0.04% : 0.000029s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000021s : 1: remove_dup_value 17.90% : 0.012106s : 1: renormalize.infer 0.77% : 0.000522s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000062s : 1: rewriter_after_opt_a 0.16% : 0.000105s : 1: rewriter_before_opt_a 0.01% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.02% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000104s : 1: symbol_engine_optimizer 0.16% : 0.000106s : 1: tuple_transform 9.69% : 0.006555s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:46.662.547 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.104234, [21] [bootstrap]: 0.00043068 [type_inference]: 0.00583602 [event_method]: 2.003e-05 [auto_monad]: 6.396e-05 [graph_reusing]: 6.03998e-06 [inline]: 2.37999e-06 [add_attr]: 0.00339232, [1] [add_attr_with_inline]: 0.00338102, [1] [Cycle 1]: 7.283e-05, [2] [tag_attr]: 2.163e-05 [meta_addattr_fg_expand]: 6.17999e-06 [parallel-infer-symbol]: 4.37e-06 [pre_auto_parallel]: 3.973e-05 [insert-virtual-dataset]: 2.43002e-06 [parallel-infer-symbol-second]: 9.5999e-07 [dataset_repeat_opt]: 2.11003e-06 [pipeline_split]: 1.71e-06 [optimize]: 0.0936847, [53] [py_interpret_to_execute]: 2.791e-05 [rewriter_before_opt_a]: 8.567e-05 [opt_a]: 0.0913848, [2] [Cycle 1]: 0.0906097, [45] [expand_dump_flag]: 2.93e-06 [switch_simplify]: 4.396e-05 [loop_unroll]: 3.066e-05 [a_1]: 0.00067145 [with_stream_mark]: 2.208e-05 [recompute_prepare]: 1.32e-05 [updatestate_depend_eliminate]: 4.48999e-06 [updatestate_assign_eliminate]: 3.08998e-06 [updatestate_loads_eliminate]: 3.08998e-06 [parameter_eliminate]: 2.23002e-06 [a_2]: 8.887e-05 [accelerated_algorithm]: 8.13001e-06 [shard]: 1.62001e-06 [meta_shard_fg_expand]: 2.57001e-06 [shard_inline]: 6.78e-06 [merge_send_recv]: 8.53001e-06 [auto_parallel]: 7.66001e-06 [parallel]: 2.02e-05 [flash_sp]: 9.46003e-06 [merge_comm]: 4.34002e-06 [allreduce_fusion]: 4.22e-06 [matmul_add_comm_reduction]: 1.067e-05 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 9.96e-06 [virtual_dataset]: 7.05998e-06 [get_grad_eliminate_]: 6.06998e-06 [virtual_output]: 6.14999e-06 [merge_forward]: 4.25e-06 [cell_reuse_recompute_pass]: 1.64998e-06 [offload_activation]: 1.156e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.25e-05 [merge_recompute_call_nodes]: 1.84e-06 [before_grad]: 1.095e-05 [set_forward_comm_id_for_comm_node_pass]: 4.58999e-06 [meta_fg_expand]: 3.06001e-06 [flash_sp_send_recv_attached]: 3.4e-06 [receive_attached]: 2.19001e-06 [after_resolve]: 1.408e-05 [a_after_grad]: 1.113e-05 [renormalize]: 0.0890967 [add_forward_monad_depend]: 1.32e-05 [auto_monad_grad]: 2.83e-06 [auto_monad_eliminator]: 2.385e-05 [cse]: 3.305e-05 [a_3]: 6.442e-05 [Cycle 2]: 0.00076125, [45] [expand_dump_flag]: 2.29999e-06 [switch_simplify]: 9.31e-06 [loop_unroll]: 6.52001e-06 [a_1]: 0.00016884 [with_stream_mark]: 1.936e-05 [recompute_prepare]: 7.34002e-06 [updatestate_depend_eliminate]: 4.26001e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 3.06001e-06 [parameter_eliminate]: 2.00002e-06 [a_2]: 7.997e-05 [accelerated_algorithm]: 7.31001e-06 [shard]: 2.43e-06 [meta_shard_fg_expand]: 2.31e-06 [shard_inline]: 6.19001e-06 [merge_send_recv]: 8.72e-06 [auto_parallel]: 9.29e-06 [parallel]: 9.52999e-06 [flash_sp]: 3.81999e-06 [merge_comm]: 4.05998e-06 [allreduce_fusion]: 3.93001e-06 [matmul_add_comm_reduction]: 9.27001e-06 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 1.098e-05 [virtual_dataset]: 6.39999e-06 [get_grad_eliminate_]: 6.06e-06 [virtual_output]: 6.22001e-06 [merge_forward]: 4.12e-06 [cell_reuse_recompute_pass]: 3.4e-06 [offload_activation]: 1.025e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.145e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 9.99999e-06 [set_forward_comm_id_for_comm_node_pass]: 4.20999e-06 [meta_fg_expand]: 2.67001e-06 [flash_sp_send_recv_attached]: 1.90001e-06 [receive_attached]: 2.33002e-06 [after_resolve]: 1.319e-05 [a_after_grad]: 1.047e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.68002e-06 [auto_monad_grad]: 1.09e-06 [auto_monad_eliminator]: 9.02999e-06 [cse]: 1.597e-05 [a_3]: 3.617e-05 [py_interpret_to_execute_after_opt_a]: 1.765e-05 [slice_cell_reuse_recomputed_activation]: 2.26998e-06 [rewriter_after_opt_a]: 3.898e-05 [convert_after_rewriter]: 6.49999e-06 [order_py_execute_after_rewriter]: 4.99e-06 [mutable_eliminate]: 0.00074085 [opt_b]: 0.00021165, [1] [Cycle 1]: 0.00020444, [7] [b_1]: 0.00012591 [b_2]: 8.50001e-06 [updatestate_depend_eliminate]: 7.16001e-06 [updatestate_assign_eliminate]: 2.53e-06 [updatestate_loads_eliminate]: 2.59001e-06 [renormalize]: 9.5999e-07 [cse]: 2.065e-05 [optimize_parallel_all_gather_comm]: 1.688e-05 [overlap_param_gather]: 1.91e-06 [cconv]: 3.177e-05 [loop_unroll]: 0.0004329 [opt_after_cconv]: 0.00010489, [1] [Cycle 1]: 9.92e-05, [7] [c_1]: 3.263e-05 [parameter_eliminate]: 4.08999e-06 [updatestate_depend_eliminate]: 5.34e-06 [updatestate_assign_eliminate]: 2.44999e-06 [updatestate_loads_eliminate]: 2.17999e-06 [cse]: 1.705e-05 [renormalize]: 5.09986e-07 [remove_dup_value]: 1.346e-05 [tuple_transform]: 7.827e-05, [1] [Cycle 1]: 7.357e-05, [4] [d_1]: 4.61e-05 [none_parameter_eliminate]: 1.63002e-06 [renormalize]: 4.10015e-07 [switch_simplify]: 6.84001e-06 [partial_unused_args_eliminate]: 2.07999e-06 [add_recomputation]: 5.251e-05 [cse_after_recomputation]: 2.227e-05, [1] [Cycle 1]: 1.757e-05, [1] [cse]: 1.158e-05 [environ_conv]: 5.22999e-06 [swap_dp_allreduce_reducescatter]: 5.16002e-06 [bias_add_comm_swap]: 2.88e-06 [label_micro_interleaved_index]: 4.04002e-06 [label_fine_grained_interleaved_index]: 2.98998e-06 [merge_cast_opt]: 1.60999e-06 [slice_recompute_activation]: 2.02999e-06 [micro_interleaved_order_control]: 2.19999e-06 [assign_add_opt]: 1.49998e-06 [ForceFp32Comm]: 1.05001e-06 [remove_cast_before_assign_add]: 1.00999e-06 [full_micro_interleaved_order_control]: 2.04999e-06 [reorder_send_recv_between_fp_bp]: 3.26001e-06 [comm_op_add_attrs]: 1.40001e-06 [add_comm_op_reuse_tag]: 9.60019e-07 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.17e-06 [overlap_opt_shard_grad_in_pipeline]: 2.02001e-06 [control_data_broadcast_order]: 1.29e-05 [grouped_pairwise_exchange_alltoall]: 1.50999e-06 [offloading_packed_experts]: 3.81001e-06 [overlap_recompute_and_grad_model_parallel]: 4.37998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.45999e-06 [overlap_recompute_comm]: 2.10002e-06 [overlap_grad_ring_attention]: 4.33999e-06 [overlap_grad_flash_sp]: 2.079e-05 [begin_end_overlap_inline]: 4.80009e-07 [split_matmul_comm_elemetwise]: 2.49999e-06 [split_layernorm_comm]: 1.62001e-06 [handle_group_info]: 9.39996e-07 [symbol_engine_optimizer]: 7.844e-05, [1] [Cycle 1]: 7.325e-05, [6] [build]: 3.36001e-06 [elim_shapecalc]: 9.91e-06 [elim_not_effective]: 1.315e-05 [opt_reshape]: 6.68998e-06 [fold_const_symbol]: 1.05e-05 [renormalize]: 2.50002e-07 [detach_backward]: 2.40002e-06 [pipeline_parallel_scheduler]: 1.59e-06 [auto_monad_reorder]: 1.632e-05 [get_jit_bprop_graph]: 1.98997e-06 [rewriter_after_jit_bprop_graph]: 5.18002e-06 [opt_after_jit_grad]: 0.00051533 [validate]: 4.555e-05 Sums bootstrap : 0.000431s : 0.43% type_inference : 0.005836s : 5.85% event_method : 0.000020s : 0.02% auto_monad : 0.000064s : 0.06% graph_reusing : 0.000006s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000040s : 0.04% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000028s : 0.03% optimize.rewriter_before_opt_a : 0.000086s : 0.09% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000053s : 0.05% optimize.opt_a.loop_unroll : 0.000037s : 0.04% optimize.opt_a.a_1 : 0.000840s : 0.84% optimize.opt_a.with_stream_mark : 0.000041s : 0.04% optimize.opt_a.recompute_prepare : 0.000021s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000169s : 0.17% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000017s : 0.02% optimize.opt_a.auto_parallel : 0.000017s : 0.02% optimize.opt_a.parallel : 0.000030s : 0.03% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.02% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000012s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000022s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000027s : 0.03% optimize.opt_a.a_after_grad : 0.000022s : 0.02% optimize.opt_a.renormalize : 0.089097s : 89.27% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.03% optimize.opt_a.cse : 0.000049s : 0.05% optimize.opt_a.a_3 : 0.000101s : 0.10% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000039s : 0.04% optimize.convert_after_rewriter : 0.000006s : 0.01% optimize.order_py_execute_after_rewriter : 0.000005s : 0.00% optimize.mutable_eliminate : 0.000741s : 0.74% optimize.opt_b.b_1 : 0.000126s : 0.13% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000032s : 0.03% optimize.loop_unroll : 0.000433s : 0.43% optimize.opt_after_cconv.c_1 : 0.000033s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000017s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000013s : 0.01% optimize.tuple_transform.d_1 : 0.000046s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000053s : 0.05% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000005s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000021s : 0.02% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000016s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000515s : 0.52% validate : 0.000046s : 0.05% Time group info: ------[substitution.] 0.000239 34 17.70% : 0.000042s : 6: substitution.arithmetic_simplify 1.11% : 0.000003s : 2: substitution.elim_not_effective 0.53% : 0.000001s : 2: substitution.fold_const_symbol 2.37% : 0.000006s : 4: substitution.graph_param_transform 65.27% : 0.000156s : 4: substitution.inline 1.93% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.06% : 0.000005s : 4: substitution.remove_not_recompute_node 2.60% : 0.000006s : 4: substitution.replace_old_param 6.44% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005766 2 87.65% : 0.005054s : 1: type_inference.infer 12.35% : 0.000712s : 1: type_inference.specialize ------[replace.] 0.000067 8 60.07% : 0.000040s : 4: replace.inline 39.93% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000165 8 91.86% : 0.000152s : 4: match.inline 8.14% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000217 1278 1.00% : 0.000002s : 13: predicate.accumulaten_eliminater 0.95% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 1.03% : 0.000002s : 13: predicate.addn_zero_filter 0.86% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.75% : 0.000006s : 21: predicate.arithmetic_simplify 0.92% : 0.000002s : 13: predicate.cast_eliminate 0.55% : 0.000001s : 8: predicate.check_bprop_eliminate 0.51% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.60% : 0.000001s : 8: predicate.depend_value_elim 0.89% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.03% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.34% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_depend_swap 1.58% : 0.000003s : 25: predicate.environ_get_eliminate 1.14% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.43% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.52% : 0.000005s : 21: predicate.float_depend_g_call 0.57% : 0.000001s : 8: predicate.float_environ_get_switch 0.80% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.79% : 0.000002s : 8: predicate.get_grad_eliminate 0.29% : 0.000001s : 4: predicate.graph_param_transform 0.56% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 6.55% : 0.000014s : 58: predicate.inline 1.08% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.95% : 0.000002s : 8: predicate.less_batch_normalization 1.73% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.30% : 0.000005s : 38: predicate.load_eliminater 0.82% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.32% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.62% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.46% : 0.000001s : 8: predicate.merge_addn 0.58% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 13: predicate.minmaximum_grad 1.15% : 0.000002s : 4: predicate.mutable_eliminate 0.33% : 0.000001s : 4: predicate.opt_reshape 0.42% : 0.000001s : 4: predicate.parallel_virtual_node 1.75% : 0.000004s : 21: predicate.partial_defer_inline 1.56% : 0.000003s : 21: predicate.partial_eliminate 0.88% : 0.000002s : 13: predicate.print_const_string_wrapper 0.62% : 0.000001s : 8: predicate.reduce_all_const_elim 1.28% : 0.000003s : 13: predicate.reduce_eliminate 2.37% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 8: predicate.remove_not_recompute_node 1.47% : 0.000003s : 25: predicate.replace_applicator 0.58% : 0.000001s : 8: predicate.replace_old_param 0.43% : 0.000001s : 4: predicate.reset_defer_inline 0.94% : 0.000002s : 13: predicate.reshape_eliminate 0.80% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.83% : 0.000002s : 8: predicate.same_eliminate 0.56% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.69% : 0.000001s : 8: predicate.shard_identity_eliminate 0.68% : 0.000001s : 8: predicate.special_op_eliminate 0.65% : 0.000001s : 8: predicate.specialize_transform 1.03% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.48% : 0.000003s : 21: predicate.switch_defer_inline 1.97% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.06% : 0.000011s : 67: predicate.switch_simplify 0.92% : 0.000002s : 13: predicate.tile_eliminate 0.87% : 0.000002s : 13: predicate.transpose_eliminate 1.44% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.50% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.33% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.25% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.44% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.66% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.28% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.06% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 4: predicate.value_based_eliminate 0.80% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.60% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000635 11 47.87% : 0.000304s : 5: func_graph_cloner_run.FuncGraphClonerGraph 52.13% : 0.000331s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.291837 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.16% : 0.003397s : 1: add_attr 1.16% : 0.003385s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.02% : 0.000057s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.02% : 0.000069s : 1: auto_monad 0.01% : 0.000020s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.16% : 0.000462s : 1: bootstrap 0.01% : 0.000035s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000010s : 1: convert_after_rewriter 0.01% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000008s : 1: environ_conv 0.01% : 0.000028s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.15% : 0.000440s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.26% : 0.000751s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000018s : 1: opt.transform.mutable_eliminate 0.45% : 0.001300s : 78: opt.transform.opt_a 0.01% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000102s : 28: opt.transform.opt_b 0.02% : 0.000051s : 2: opt.transform.opt_trans_graph 0.01% : 0.000037s : 4: opt.transform.symbol_engine_opt 31.31% : 0.091388s : 1: opt_a 0.04% : 0.000108s : 1: opt_after_cconv 0.18% : 0.000526s : 1: opt_after_jit_grad 0.07% : 0.000216s : 1: opt_b 32.10% : 0.093690s : 1: optimize 0.01% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.01% : 0.000025s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000044s : 1: pre_auto_parallel 0.01% : 0.000032s : 1: py_interpret_to_execute 0.01% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000017s : 1: remove_dup_value 30.35% : 0.088560s : 1: renormalize.infer 0.18% : 0.000518s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000045s : 1: rewriter_after_opt_a 0.03% : 0.000091s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000082s : 1: symbol_engine_optimizer 0.03% : 0.000081s : 1: tuple_transform 2.01% : 0.005855s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:47.763.545 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:47.763.810 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0281599, [21] [bootstrap]: 0.0004389 [type_inference]: 0.00568253 [event_method]: 1.847e-05 [auto_monad]: 6.151e-05 [graph_reusing]: 6.06e-06 [inline]: 1.97001e-06 [add_attr]: 0.00309623, [1] [add_attr_with_inline]: 0.0030873, [1] [Cycle 1]: 7.628e-05, [2] [tag_attr]: 1.903e-05 [meta_addattr_fg_expand]: 6.29001e-06 [parallel-infer-symbol]: 3.16001e-06 [pre_auto_parallel]: 3.47e-05 [insert-virtual-dataset]: 2.67001e-06 [parallel-infer-symbol-second]: 8.50006e-07 [dataset_repeat_opt]: 2.16e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.017595, [53] [py_interpret_to_execute]: 2.982e-05 [rewriter_before_opt_a]: 8.515e-05 [opt_a]: 0.014892, [2] [Cycle 1]: 0.013886, [45] [expand_dump_flag]: 2.96001e-06 [switch_simplify]: 4.041e-05 [loop_unroll]: 3.058e-05 [a_1]: 0.00063788 [with_stream_mark]: 1.736e-05 [recompute_prepare]: 9.69999e-06 [updatestate_depend_eliminate]: 3.63999e-06 [updatestate_assign_eliminate]: 3.34001e-06 [updatestate_loads_eliminate]: 2.94999e-06 [parameter_eliminate]: 2.26e-06 [a_2]: 0.00011666 [accelerated_algorithm]: 7.66001e-06 [shard]: 1.79e-06 [meta_shard_fg_expand]: 2.03997e-06 [shard_inline]: 7.58001e-06 [merge_send_recv]: 9.17001e-06 [auto_parallel]: 7.06999e-06 [parallel]: 2.017e-05 [flash_sp]: 9.15001e-06 [merge_comm]: 4.11001e-06 [allreduce_fusion]: 3.81001e-06 [matmul_add_comm_reduction]: 1.002e-05 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 9.81e-06 [virtual_dataset]: 7.19001e-06 [get_grad_eliminate_]: 6.45002e-06 [virtual_output]: 6.74999e-06 [merge_forward]: 4.62e-06 [cell_reuse_recompute_pass]: 1.92001e-06 [offload_activation]: 1.012e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.559e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.072e-05 [set_forward_comm_id_for_comm_node_pass]: 4.22003e-06 [meta_fg_expand]: 2.81e-06 [flash_sp_send_recv_attached]: 2.42001e-06 [receive_attached]: 2.31e-06 [after_resolve]: 1.227e-05 [a_after_grad]: 1.122e-05 [renormalize]: 0.0122294 [add_forward_monad_depend]: 1.182e-05 [auto_monad_grad]: 2.37001e-06 [auto_monad_eliminator]: 2.605e-05 [cse]: 3.203e-05 [a_3]: 7.773e-05 [Cycle 2]: 0.00098778, [45] [expand_dump_flag]: 2.94001e-06 [switch_simplify]: 9.88002e-06 [loop_unroll]: 6.71e-06 [a_1]: 0.00017077 [with_stream_mark]: 2.056e-05 [recompute_prepare]: 6.98e-06 [updatestate_depend_eliminate]: 4.04002e-06 [updatestate_assign_eliminate]: 4e-06 [updatestate_loads_eliminate]: 3.36999e-06 [parameter_eliminate]: 2.02999e-06 [a_2]: 0.0001141 [accelerated_algorithm]: 7.7e-06 [shard]: 2.68e-06 [meta_shard_fg_expand]: 3.11999e-06 [shard_inline]: 6.96001e-06 [merge_send_recv]: 9.16998e-06 [auto_parallel]: 1.025e-05 [parallel]: 9.87001e-06 [flash_sp]: 4.57e-06 [merge_comm]: 3.5e-06 [allreduce_fusion]: 3.67002e-06 [matmul_add_comm_reduction]: 1.046e-05 [allreduce_slice_to_reducescatter]: 9.09989e-07 [virtual_shard_identity]: 8.43001e-06 [virtual_dataset]: 6.81001e-06 [get_grad_eliminate_]: 6.61999e-06 [virtual_output]: 6.84999e-06 [merge_forward]: 4.82e-06 [cell_reuse_recompute_pass]: 3.71999e-06 [offload_activation]: 1.088e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.603e-05 [merge_recompute_call_nodes]: 1.49998e-06 [before_grad]: 1.383e-05 [set_forward_comm_id_for_comm_node_pass]: 4.95001e-06 [meta_fg_expand]: 3.25998e-06 [flash_sp_send_recv_attached]: 1.74e-06 [receive_attached]: 2.74999e-06 [after_resolve]: 1.47e-05 [a_after_grad]: 1.11e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.32999e-06 [auto_monad_grad]: 1.30001e-06 [auto_monad_eliminator]: 1.168e-05 [cse]: 1.758e-05 [a_3]: 5.381e-05 [py_interpret_to_execute_after_opt_a]: 2.148e-05 [slice_cell_reuse_recomputed_activation]: 5.39e-06 [rewriter_after_opt_a]: 4.654e-05 [convert_after_rewriter]: 1.135e-05 [order_py_execute_after_rewriter]: 8.28999e-06 [mutable_eliminate]: 0.00074246 [opt_b]: 0.00029234, [1] [Cycle 1]: 0.00028143, [7] [b_1]: 0.00017275 [b_2]: 9.20999e-06 [updatestate_depend_eliminate]: 9.93002e-06 [updatestate_assign_eliminate]: 2.86e-06 [updatestate_loads_eliminate]: 2.46e-06 [renormalize]: 6.60017e-07 [cse]: 2.228e-05 [optimize_parallel_all_gather_comm]: 2.226e-05 [overlap_param_gather]: 5.15999e-06 [cconv]: 3.481e-05 [loop_unroll]: 0.00050557 [opt_after_cconv]: 0.00013673, [1] [Cycle 1]: 0.00012676, [7] [c_1]: 3.227e-05 [parameter_eliminate]: 4.52e-06 [updatestate_depend_eliminate]: 8.28999e-06 [updatestate_assign_eliminate]: 2.74999e-06 [updatestate_loads_eliminate]: 2.59999e-06 [cse]: 1.977e-05 [renormalize]: 5.50004e-07 [remove_dup_value]: 1.736e-05 [tuple_transform]: 9.529e-05, [1] [Cycle 1]: 8.814e-05, [4] [d_1]: 4.821e-05 [none_parameter_eliminate]: 1.70001e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 7.4e-06 [partial_unused_args_eliminate]: 4.43999e-06 [add_recomputation]: 5.723e-05 [cse_after_recomputation]: 5.451e-05, [1] [Cycle 1]: 2.644e-05, [1] [cse]: 1.422e-05 [environ_conv]: 9.31e-06 [swap_dp_allreduce_reducescatter]: 8.77999e-06 [bias_add_comm_swap]: 6.36e-06 [label_micro_interleaved_index]: 7.6e-06 [label_fine_grained_interleaved_index]: 5.20001e-06 [merge_cast_opt]: 3.71999e-06 [slice_recompute_activation]: 4.38001e-06 [micro_interleaved_order_control]: 5.15999e-06 [assign_add_opt]: 3.6e-06 [ForceFp32Comm]: 3.27997e-06 [remove_cast_before_assign_add]: 3.63e-06 [full_micro_interleaved_order_control]: 4.62e-06 [reorder_send_recv_between_fp_bp]: 5.67999e-06 [comm_op_add_attrs]: 3.48999e-06 [add_comm_op_reuse_tag]: 3.26999e-06 [interleave_split_concat_branches]: 3.86999e-06 [interleave_parallel_branches]: 3.68e-06 [overlap_opt_shard_in_pipeline]: 3.51001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.81002e-06 [control_data_broadcast_order]: 1.762e-05 [grouped_pairwise_exchange_alltoall]: 4.36002e-06 [offloading_packed_experts]: 7.7e-06 [overlap_recompute_and_grad_model_parallel]: 7.58001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.6e-06 [overlap_recompute_allgather_and_fa_grad]: 3.71999e-06 [overlap_recompute_comm]: 5.28002e-06 [overlap_grad_ring_attention]: 6.94999e-06 [overlap_grad_flash_sp]: 2.635e-05 [begin_end_overlap_inline]: 2.86999e-06 [split_matmul_comm_elemetwise]: 5.08002e-06 [split_layernorm_comm]: 4.19002e-06 [handle_group_info]: 3.41001e-06 [symbol_engine_optimizer]: 0.00010998, [1] [Cycle 1]: 0.00010156, [6] [build]: 4.16001e-06 [elim_shapecalc]: 1.213e-05 [elim_not_effective]: 1.56e-05 [opt_reshape]: 8.18001e-06 [fold_const_symbol]: 1.039e-05 [renormalize]: 2.50002e-07 [detach_backward]: 4.13001e-06 [pipeline_parallel_scheduler]: 1.99999e-06 [auto_monad_reorder]: 2.148e-05 [get_jit_bprop_graph]: 2.33998e-06 [rewriter_after_jit_bprop_graph]: 6.58e-06 [opt_after_jit_grad]: 0.00055641 [validate]: 4.357e-05 Sums bootstrap : 0.000439s : 1.89% type_inference : 0.005683s : 24.51% event_method : 0.000018s : 0.08% auto_monad : 0.000062s : 0.27% graph_reusing : 0.000006s : 0.03% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000035s : 0.15% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.13% optimize.rewriter_before_opt_a : 0.000085s : 0.37% optimize.opt_a.expand_dump_flag : 0.000006s : 0.03% optimize.opt_a.switch_simplify : 0.000050s : 0.22% optimize.opt_a.loop_unroll : 0.000037s : 0.16% optimize.opt_a.a_1 : 0.000809s : 3.49% optimize.opt_a.with_stream_mark : 0.000038s : 0.16% optimize.opt_a.recompute_prepare : 0.000017s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000231s : 1.00% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.07% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000015s : 0.06% optimize.opt_a.merge_send_recv : 0.000018s : 0.08% optimize.opt_a.auto_parallel : 0.000017s : 0.07% optimize.opt_a.parallel : 0.000030s : 0.13% optimize.opt_a.flash_sp : 0.000014s : 0.06% optimize.opt_a.merge_comm : 0.000008s : 0.03% optimize.opt_a.allreduce_fusion : 0.000007s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.08% optimize.opt_a.virtual_dataset : 0.000014s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.06% optimize.opt_a.virtual_output : 0.000014s : 0.06% optimize.opt_a.merge_forward : 0.000009s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.02% optimize.opt_a.offload_activation : 0.000021s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.14% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.11% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.04% optimize.opt_a.meta_fg_expand : 0.000006s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000027s : 0.12% optimize.opt_a.a_after_grad : 0.000022s : 0.10% optimize.opt_a.renormalize : 0.012229s : 52.75% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000038s : 0.16% optimize.opt_a.cse : 0.000050s : 0.21% optimize.opt_a.a_3 : 0.000132s : 0.57% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000047s : 0.20% optimize.convert_after_rewriter : 0.000011s : 0.05% optimize.order_py_execute_after_rewriter : 0.000008s : 0.04% optimize.mutable_eliminate : 0.000742s : 3.20% optimize.opt_b.b_1 : 0.000173s : 0.75% optimize.opt_b.b_2 : 0.000009s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.10% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000035s : 0.15% optimize.loop_unroll : 0.000506s : 2.18% optimize.opt_after_cconv.c_1 : 0.000032s : 0.14% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000020s : 0.09% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.07% optimize.tuple_transform.d_1 : 0.000048s : 0.21% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000004s : 0.02% optimize.add_recomputation : 0.000057s : 0.25% optimize.cse_after_recomputation.cse : 0.000014s : 0.06% optimize.environ_conv : 0.000009s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.04% optimize.bias_add_comm_swap : 0.000006s : 0.03% optimize.label_micro_interleaved_index : 0.000008s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000004s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.02% optimize.comm_op_add_attrs : 0.000003s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.02% optimize.interleave_parallel_branches : 0.000004s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.02% optimize.control_data_broadcast_order : 0.000018s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000008s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.03% optimize.overlap_grad_flash_sp : 0.000026s : 0.11% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.09% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.03% opt_after_jit_grad : 0.000556s : 2.40% validate : 0.000044s : 0.19% Time group info: ------[substitution.] 0.000221 34 18.94% : 0.000042s : 6: substitution.arithmetic_simplify 1.05% : 0.000002s : 2: substitution.elim_not_effective 0.68% : 0.000002s : 2: substitution.fold_const_symbol 2.97% : 0.000007s : 4: substitution.graph_param_transform 61.79% : 0.000137s : 4: substitution.inline 2.16% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.40% : 0.000005s : 4: substitution.remove_not_recompute_node 2.91% : 0.000006s : 4: substitution.replace_old_param 7.10% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005633 2 88.25% : 0.004971s : 1: type_inference.infer 11.75% : 0.000662s : 1: type_inference.specialize ------[replace.] 0.000062 8 62.16% : 0.000038s : 4: replace.inline 37.84% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000148 8 90.60% : 0.000134s : 4: match.inline 9.40% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000225 1278 0.89% : 0.000002s : 13: predicate.accumulaten_eliminater 0.93% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.57% : 0.000001s : 8: predicate.addn_check_dump 1.07% : 0.000002s : 13: predicate.addn_zero_filter 0.79% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.56% : 0.000006s : 21: predicate.arithmetic_simplify 0.90% : 0.000002s : 13: predicate.cast_eliminate 0.66% : 0.000001s : 8: predicate.check_bprop_eliminate 0.56% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.63% : 0.000001s : 8: predicate.depend_value_elim 0.84% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.16% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.82% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.35% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.35% : 0.000001s : 4: predicate.elim_not_effective 0.52% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.00% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_depend_swap 1.57% : 0.000004s : 25: predicate.environ_get_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.37% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.33% : 0.000005s : 21: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.85% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.77% : 0.000002s : 8: predicate.get_grad_eliminate 0.26% : 0.000001s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.47% : 0.000001s : 8: predicate.incorporate_call_switch 5.95% : 0.000013s : 58: predicate.inline 1.23% : 0.000003s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.00% : 0.000002s : 8: predicate.less_batch_normalization 1.74% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.28% : 0.000005s : 38: predicate.load_eliminater 1.50% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.17% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.46% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.61% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.65% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 13: predicate.minmaximum_grad 1.43% : 0.000003s : 4: predicate.mutable_eliminate 0.57% : 0.000001s : 4: predicate.opt_reshape 0.43% : 0.000001s : 4: predicate.parallel_virtual_node 1.56% : 0.000004s : 21: predicate.partial_defer_inline 1.55% : 0.000003s : 21: predicate.partial_eliminate 0.91% : 0.000002s : 13: predicate.print_const_string_wrapper 0.56% : 0.000001s : 8: predicate.reduce_all_const_elim 1.17% : 0.000003s : 13: predicate.reduce_eliminate 2.31% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 8: predicate.remove_not_recompute_node 1.44% : 0.000003s : 25: predicate.replace_applicator 0.53% : 0.000001s : 8: predicate.replace_old_param 0.39% : 0.000001s : 4: predicate.reset_defer_inline 0.95% : 0.000002s : 13: predicate.reshape_eliminate 0.57% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 4: predicate.row_tensor_eliminate 0.99% : 0.000002s : 8: predicate.same_eliminate 0.41% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.87% : 0.000002s : 8: predicate.shard_identity_eliminate 0.86% : 0.000002s : 8: predicate.special_op_eliminate 0.83% : 0.000002s : 8: predicate.specialize_transform 0.99% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.99% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.44% : 0.000003s : 21: predicate.switch_defer_inline 1.96% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.71% : 0.000011s : 67: predicate.switch_simplify 0.86% : 0.000002s : 13: predicate.tile_eliminate 0.88% : 0.000002s : 13: predicate.transpose_eliminate 1.60% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.33% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.01% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.36% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.62% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.22% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.12% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 4: predicate.value_based_eliminate 0.77% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.21% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.48% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000630 11 50.32% : 0.000317s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.68% : 0.000313s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.062512 192 0.01% : 0.000006s : 1: ForceFp32Comm 4.97% : 0.003106s : 1: add_attr 4.94% : 0.003091s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.10% : 0.000061s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.11% : 0.000071s : 1: auto_monad 0.05% : 0.000029s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.77% : 0.000482s : 1: bootstrap 0.06% : 0.000038s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.03% : 0.000021s : 1: control_data_broadcast_order 0.02% : 0.000015s : 1: convert_after_rewriter 0.09% : 0.000058s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000022s : 1: detach_backward 0.02% : 0.000012s : 1: environ_conv 0.05% : 0.000028s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000012s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000011s : 1: label_micro_interleaved_index 0.82% : 0.000513s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.20% : 0.000750s : 1: mutable_eliminate 0.02% : 0.000011s : 1: offloading_packed_experts 0.03% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000021s : 1: opt.transform.mutable_eliminate 2.05% : 0.001281s : 78: opt.transform.opt_a 0.05% : 0.000031s : 1: opt.transform.opt_after_cconv 0.05% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.17% : 0.000107s : 28: opt.transform.opt_b 0.08% : 0.000053s : 2: opt.transform.opt_trans_graph 0.07% : 0.000042s : 4: opt.transform.symbol_engine_opt 23.83% : 0.014896s : 1: opt_a 0.22% : 0.000141s : 1: opt_after_cconv 0.91% : 0.000567s : 1: opt_after_jit_grad 0.48% : 0.000297s : 1: opt_b 28.68% : 0.017927s : 1: optimize 0.04% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000011s : 1: order_py_execute_after_rewriter 0.05% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.07% : 0.000042s : 1: pre_auto_parallel 0.05% : 0.000034s : 1: py_interpret_to_execute 0.04% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000021s : 1: remove_dup_value 18.74% : 0.011715s : 1: renormalize.infer 0.80% : 0.000497s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000051s : 1: rewriter_after_opt_a 0.14% : 0.000089s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.18% : 0.000113s : 1: symbol_engine_optimizer 0.16% : 0.000098s : 1: tuple_transform 9.15% : 0.005719s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:48.954.470 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0971959, [21] [bootstrap]: 0.00049535 [type_inference]: 0.00614565 [event_method]: 1.84e-05 [auto_monad]: 6.221e-05 [graph_reusing]: 6.31998e-06 [inline]: 2.07001e-06 [add_attr]: 0.00323604, [1] [add_attr_with_inline]: 0.00322608, [1] [Cycle 1]: 6.723e-05, [2] [tag_attr]: 2.034e-05 [meta_addattr_fg_expand]: 5.64e-06 [parallel-infer-symbol]: 3.53e-06 [pre_auto_parallel]: 3.625e-05 [insert-virtual-dataset]: 2.37001e-06 [parallel-infer-symbol-second]: 1.05001e-06 [dataset_repeat_opt]: 2.27001e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.0863089, [53] [py_interpret_to_execute]: 2.81e-05 [rewriter_before_opt_a]: 9.425e-05 [opt_a]: 0.0837475, [2] [Cycle 1]: 0.0829188, [45] [expand_dump_flag]: 2.91999e-06 [switch_simplify]: 4.448e-05 [loop_unroll]: 3.28e-05 [a_1]: 0.00068635 [with_stream_mark]: 2.216e-05 [recompute_prepare]: 1.1e-05 [updatestate_depend_eliminate]: 4.29002e-06 [updatestate_assign_eliminate]: 3.41001e-06 [updatestate_loads_eliminate]: 3.06001e-06 [parameter_eliminate]: 2.39999e-06 [a_2]: 8.976e-05 [accelerated_algorithm]: 7.85e-06 [shard]: 2.09e-06 [meta_shard_fg_expand]: 2.31998e-06 [shard_inline]: 6.51e-06 [merge_send_recv]: 8.94998e-06 [auto_parallel]: 9.01998e-06 [parallel]: 1.949e-05 [flash_sp]: 1.054e-05 [merge_comm]: 4.27e-06 [allreduce_fusion]: 3.73999e-06 [matmul_add_comm_reduction]: 1.069e-05 [allreduce_slice_to_reducescatter]: 8.49977e-07 [virtual_shard_identity]: 8.55001e-06 [virtual_dataset]: 7.01999e-06 [get_grad_eliminate_]: 6.85002e-06 [virtual_output]: 6.89999e-06 [merge_forward]: 3.93999e-06 [cell_reuse_recompute_pass]: 1.45999e-06 [offload_activation]: 1.15e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.242e-05 [merge_recompute_call_nodes]: 1.70001e-06 [before_grad]: 1.106e-05 [set_forward_comm_id_for_comm_node_pass]: 3.91001e-06 [meta_fg_expand]: 3.04001e-06 [flash_sp_send_recv_attached]: 3.36001e-06 [receive_attached]: 2.22001e-06 [after_resolve]: 1.376e-05 [a_after_grad]: 1.009e-05 [renormalize]: 0.0813787 [add_forward_monad_depend]: 1.301e-05 [auto_monad_grad]: 2.55997e-06 [auto_monad_eliminator]: 2.662e-05 [cse]: 3.249e-05 [a_3]: 6.725e-05 [Cycle 2]: 0.00081443, [45] [expand_dump_flag]: 2.12999e-06 [switch_simplify]: 1.015e-05 [loop_unroll]: 7.22997e-06 [a_1]: 0.00016568 [with_stream_mark]: 2.278e-05 [recompute_prepare]: 7.46999e-06 [updatestate_depend_eliminate]: 4.32e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 3.26999e-06 [parameter_eliminate]: 2.70997e-06 [a_2]: 8.182e-05 [accelerated_algorithm]: 8.05e-06 [shard]: 2.53998e-06 [meta_shard_fg_expand]: 2.22999e-06 [shard_inline]: 6.89999e-06 [merge_send_recv]: 1.138e-05 [auto_parallel]: 1.11e-05 [parallel]: 9.77999e-06 [flash_sp]: 4.67e-06 [merge_comm]: 4.03001e-06 [allreduce_fusion]: 3.9e-06 [matmul_add_comm_reduction]: 1.102e-05 [allreduce_slice_to_reducescatter]: 8.39995e-07 [virtual_shard_identity]: 1.1e-05 [virtual_dataset]: 6.20002e-06 [get_grad_eliminate_]: 5.81e-06 [virtual_output]: 6.12999e-06 [merge_forward]: 4.12003e-06 [cell_reuse_recompute_pass]: 3.71001e-06 [offload_activation]: 1.148e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.687e-05 [merge_recompute_call_nodes]: 1.74e-06 [before_grad]: 1.15e-05 [set_forward_comm_id_for_comm_node_pass]: 4.08999e-06 [meta_fg_expand]: 3.09001e-06 [flash_sp_send_recv_attached]: 2.21998e-06 [receive_attached]: 2.94001e-06 [after_resolve]: 1.4e-05 [a_after_grad]: 1.199e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 3.37002e-06 [auto_monad_grad]: 2.17999e-06 [auto_monad_eliminator]: 1.112e-05 [cse]: 1.934e-05 [a_3]: 3.862e-05 [py_interpret_to_execute_after_opt_a]: 2.063e-05 [slice_cell_reuse_recomputed_activation]: 2.14e-06 [rewriter_after_opt_a]: 4.65e-05 [convert_after_rewriter]: 7.11001e-06 [order_py_execute_after_rewriter]: 5.47999e-06 [mutable_eliminate]: 0.00078975 [opt_b]: 0.00023287, [1] [Cycle 1]: 0.00022332, [7] [b_1]: 0.00012907 [b_2]: 1.064e-05 [updatestate_depend_eliminate]: 1.015e-05 [updatestate_assign_eliminate]: 3.78001e-06 [updatestate_loads_eliminate]: 2.48998e-06 [renormalize]: 5.79981e-07 [cse]: 2.758e-05 [optimize_parallel_all_gather_comm]: 1.976e-05 [overlap_param_gather]: 2.07001e-06 [cconv]: 3.505e-05 [loop_unroll]: 0.00052239 [opt_after_cconv]: 0.00012033, [1] [Cycle 1]: 0.00011221, [7] [c_1]: 3.211e-05 [parameter_eliminate]: 5.62001e-06 [updatestate_depend_eliminate]: 8.31002e-06 [updatestate_assign_eliminate]: 2.74999e-06 [updatestate_loads_eliminate]: 2.64001e-06 [cse]: 2.418e-05 [renormalize]: 5.39992e-07 [remove_dup_value]: 1.554e-05 [tuple_transform]: 8.383e-05, [1] [Cycle 1]: 7.901e-05, [4] [d_1]: 5.116e-05 [none_parameter_eliminate]: 1.86998e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 7.28999e-06 [partial_unused_args_eliminate]: 2.39999e-06 [add_recomputation]: 5.546e-05 [cse_after_recomputation]: 2.263e-05, [1] [Cycle 1]: 1.783e-05, [1] [cse]: 1.212e-05 [environ_conv]: 5.49998e-06 [swap_dp_allreduce_reducescatter]: 5.33002e-06 [bias_add_comm_swap]: 3.23e-06 [label_micro_interleaved_index]: 6.07999e-06 [label_fine_grained_interleaved_index]: 2.97002e-06 [merge_cast_opt]: 1.51002e-06 [slice_recompute_activation]: 2.14999e-06 [micro_interleaved_order_control]: 2.57001e-06 [assign_add_opt]: 1.29e-06 [ForceFp32Comm]: 9.20001e-07 [remove_cast_before_assign_add]: 1.13001e-06 [full_micro_interleaved_order_control]: 1.99e-06 [reorder_send_recv_between_fp_bp]: 2.74001e-06 [comm_op_add_attrs]: 1.07e-06 [add_comm_op_reuse_tag]: 1.00999e-06 [interleave_split_concat_branches]: 1.09003e-06 [interleave_parallel_branches]: 1.14e-06 [overlap_opt_shard_in_pipeline]: 1.71e-06 [overlap_opt_shard_grad_in_pipeline]: 2.21e-06 [control_data_broadcast_order]: 1.491e-05 [grouped_pairwise_exchange_alltoall]: 2.69001e-06 [offloading_packed_experts]: 3.84002e-06 [overlap_recompute_and_grad_model_parallel]: 4.67e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.67999e-06 [overlap_recompute_comm]: 2.64999e-06 [overlap_grad_ring_attention]: 4.64998e-06 [overlap_grad_flash_sp]: 2.225e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.39001e-06 [split_layernorm_comm]: 1.66002e-06 [handle_group_info]: 9.80013e-07 [symbol_engine_optimizer]: 8.58e-05, [1] [Cycle 1]: 8.034e-05, [6] [build]: 4.27e-06 [elim_shapecalc]: 1.189e-05 [elim_not_effective]: 1.453e-05 [opt_reshape]: 7.2e-06 [fold_const_symbol]: 1.096e-05 [renormalize]: 5.19998e-07 [detach_backward]: 2.30002e-06 [pipeline_parallel_scheduler]: 1.65001e-06 [auto_monad_reorder]: 1.947e-05 [get_jit_bprop_graph]: 2.79001e-06 [rewriter_after_jit_bprop_graph]: 6.36998e-06 [opt_after_jit_grad]: 0.00062458 [validate]: 5.045e-05 Sums bootstrap : 0.000495s : 0.53% type_inference : 0.006146s : 6.62% event_method : 0.000018s : 0.02% auto_monad : 0.000062s : 0.07% graph_reusing : 0.000006s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000036s : 0.04% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000028s : 0.03% optimize.rewriter_before_opt_a : 0.000094s : 0.10% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000055s : 0.06% optimize.opt_a.loop_unroll : 0.000040s : 0.04% optimize.opt_a.a_1 : 0.000852s : 0.92% optimize.opt_a.with_stream_mark : 0.000045s : 0.05% optimize.opt_a.recompute_prepare : 0.000018s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000172s : 0.18% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.02% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000020s : 0.02% optimize.opt_a.auto_parallel : 0.000020s : 0.02% optimize.opt_a.parallel : 0.000029s : 0.03% optimize.opt_a.flash_sp : 0.000015s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.02% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000028s : 0.03% optimize.opt_a.a_after_grad : 0.000022s : 0.02% optimize.opt_a.renormalize : 0.081379s : 87.65% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.02% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000038s : 0.04% optimize.opt_a.cse : 0.000052s : 0.06% optimize.opt_a.a_3 : 0.000106s : 0.11% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000047s : 0.05% optimize.convert_after_rewriter : 0.000007s : 0.01% optimize.order_py_execute_after_rewriter : 0.000005s : 0.01% optimize.mutable_eliminate : 0.000790s : 0.85% optimize.opt_b.b_1 : 0.000129s : 0.14% optimize.opt_b.b_2 : 0.000011s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000028s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000035s : 0.04% optimize.loop_unroll : 0.000522s : 0.56% optimize.opt_after_cconv.c_1 : 0.000032s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.02% optimize.tuple_transform.d_1 : 0.000051s : 0.06% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000055s : 0.06% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000005s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000003s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000022s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000019s : 0.02% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000625s : 0.67% validate : 0.000050s : 0.05% Time group info: ------[substitution.] 0.000249 34 17.47% : 0.000044s : 6: substitution.arithmetic_simplify 0.83% : 0.000002s : 2: substitution.elim_not_effective 0.67% : 0.000002s : 2: substitution.fold_const_symbol 2.96% : 0.000007s : 4: substitution.graph_param_transform 64.65% : 0.000161s : 4: substitution.inline 1.85% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.41% : 0.000006s : 4: substitution.remove_not_recompute_node 2.74% : 0.000007s : 4: substitution.replace_old_param 6.42% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006079 2 88.75% : 0.005396s : 1: type_inference.infer 11.25% : 0.000684s : 1: type_inference.specialize ------[replace.] 0.000065 8 62.45% : 0.000041s : 4: replace.inline 37.55% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000173 8 91.83% : 0.000159s : 4: match.inline 8.17% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000229 1278 0.99% : 0.000002s : 13: predicate.accumulaten_eliminater 1.38% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.61% : 0.000001s : 8: predicate.addn_check_dump 1.02% : 0.000002s : 13: predicate.addn_zero_filter 0.72% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.74% : 0.000006s : 21: predicate.arithmetic_simplify 1.07% : 0.000002s : 13: predicate.cast_eliminate 0.57% : 0.000001s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 0.83% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.02% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.95% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.06% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 4: predicate.elim_not_effective 0.41% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.04% : 0.000002s : 17: predicate.environ_add_const_eliminate 0.98% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.93% : 0.000002s : 17: predicate.environ_get_depend_swap 1.59% : 0.000004s : 25: predicate.environ_get_eliminate 1.25% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.57% : 0.000006s : 21: predicate.float_depend_g_call 0.51% : 0.000001s : 8: predicate.float_environ_get_switch 0.68% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.71% : 0.000002s : 8: predicate.get_grad_eliminate 0.24% : 0.000001s : 4: predicate.graph_param_transform 0.52% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.11% : 0.000014s : 58: predicate.inline 0.78% : 0.000002s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.17% : 0.000003s : 8: predicate.less_batch_normalization 1.76% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.31% : 0.000005s : 38: predicate.load_eliminater 1.33% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.31% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.75% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 8: predicate.merge_addn 0.52% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 13: predicate.minmaximum_grad 1.79% : 0.000004s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.33% : 0.000001s : 4: predicate.parallel_virtual_node 1.72% : 0.000004s : 21: predicate.partial_defer_inline 1.54% : 0.000004s : 21: predicate.partial_eliminate 0.82% : 0.000002s : 13: predicate.print_const_string_wrapper 0.64% : 0.000001s : 8: predicate.reduce_all_const_elim 1.11% : 0.000003s : 13: predicate.reduce_eliminate 2.36% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 8: predicate.remove_not_recompute_node 1.46% : 0.000003s : 25: predicate.replace_applicator 0.58% : 0.000001s : 8: predicate.replace_old_param 0.32% : 0.000001s : 4: predicate.reset_defer_inline 0.92% : 0.000002s : 13: predicate.reshape_eliminate 0.64% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 4: predicate.row_tensor_eliminate 0.74% : 0.000002s : 8: predicate.same_eliminate 0.67% : 0.000002s : 8: predicate.set_cell_output_no_recompute 0.85% : 0.000002s : 8: predicate.shard_identity_eliminate 0.85% : 0.000002s : 8: predicate.special_op_eliminate 0.62% : 0.000001s : 8: predicate.specialize_transform 1.11% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.53% : 0.000004s : 21: predicate.switch_defer_inline 1.94% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.51% : 0.000010s : 67: predicate.switch_simplify 0.82% : 0.000002s : 13: predicate.tile_eliminate 0.92% : 0.000002s : 13: predicate.transpose_eliminate 1.46% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.40% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.40% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.45% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000006s : 29: predicate.tuple_list_set_item_eliminator 1.64% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.25% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.98% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.68% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.29% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000662 11 48.56% : 0.000321s : 5: func_graph_cloner_run.FuncGraphClonerGraph 51.44% : 0.000341s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.269567 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.20% : 0.003242s : 1: add_attr 1.20% : 0.003230s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.02% : 0.000060s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.02% : 0.000067s : 1: auto_monad 0.01% : 0.000023s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.20% : 0.000528s : 1: bootstrap 0.01% : 0.000039s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000019s : 1: control_data_broadcast_order 0.00% : 0.000011s : 1: convert_after_rewriter 0.01% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000009s : 1: environ_conv 0.01% : 0.000025s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.20% : 0.000533s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.30% : 0.000802s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000025s : 1: opt.transform.mutable_eliminate 0.49% : 0.001322s : 78: opt.transform.opt_a 0.01% : 0.000030s : 1: opt.transform.opt_after_cconv 0.01% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000104s : 28: opt.transform.opt_b 0.02% : 0.000055s : 2: opt.transform.opt_trans_graph 0.01% : 0.000040s : 4: opt.transform.symbol_engine_opt 31.07% : 0.083751s : 1: opt_a 0.05% : 0.000124s : 1: opt_after_cconv 0.24% : 0.000637s : 1: opt_after_jit_grad 0.09% : 0.000237s : 1: opt_b 32.02% : 0.086314s : 1: optimize 0.01% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.01% : 0.000026s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000040s : 1: pre_auto_parallel 0.01% : 0.000033s : 1: py_interpret_to_execute 0.01% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 29.97% : 0.080800s : 1: renormalize.infer 0.21% : 0.000559s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000052s : 1: rewriter_after_opt_a 0.04% : 0.000099s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000006s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000089s : 1: symbol_engine_optimizer 0.03% : 0.000087s : 1: tuple_transform 2.29% : 0.006166s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:50.451.579 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:50.451.841 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0903893, [21] [bootstrap]: 0.00046096 [type_inference]: 0.0786337 [event_method]: 2.09e-05 [auto_monad]: 6.297e-05 [graph_reusing]: 5.89e-06 [inline]: 2.74999e-06 [add_attr]: 0.00375686, [1] [add_attr_with_inline]: 0.00374589, [1] [Cycle 1]: 8.682e-05, [2] [tag_attr]: 2.187e-05 [meta_addattr_fg_expand]: 6.58e-06 [parallel-infer-symbol]: 3.46999e-06 [pre_auto_parallel]: 4.934e-05 [insert-virtual-dataset]: 3.21999e-06 [parallel-infer-symbol-second]: 8.99978e-07 [dataset_repeat_opt]: 2.68e-06 [pipeline_split]: 1.74998e-06 [optimize]: 0.00582646, [53] [py_interpret_to_execute]: 3.217e-05 [rewriter_before_opt_a]: 9.007e-05 [opt_a]: 0.00321298, [2] [Cycle 1]: 0.00232257, [45] [expand_dump_flag]: 3.41999e-06 [switch_simplify]: 4.27e-05 [loop_unroll]: 3.013e-05 [a_1]: 0.00066167 [with_stream_mark]: 2.138e-05 [recompute_prepare]: 1.166e-05 [updatestate_depend_eliminate]: 4.28999e-06 [updatestate_assign_eliminate]: 3.61001e-06 [updatestate_loads_eliminate]: 2.80002e-06 [parameter_eliminate]: 1.87999e-06 [a_2]: 0.00011906 [accelerated_algorithm]: 7.88001e-06 [shard]: 2.18002e-06 [meta_shard_fg_expand]: 1.86003e-06 [shard_inline]: 6.52001e-06 [merge_send_recv]: 8.54e-06 [auto_parallel]: 8.85001e-06 [parallel]: 1.963e-05 [flash_sp]: 9.52999e-06 [merge_comm]: 4.08001e-06 [allreduce_fusion]: 3.61999e-06 [matmul_add_comm_reduction]: 1.105e-05 [allreduce_slice_to_reducescatter]: 8.29983e-07 [virtual_shard_identity]: 9.00999e-06 [virtual_dataset]: 6.49001e-06 [get_grad_eliminate_]: 6.62002e-06 [virtual_output]: 6.49999e-06 [merge_forward]: 3.71001e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 1.019e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.599e-05 [merge_recompute_call_nodes]: 1.59e-06 [before_grad]: 1.106e-05 [set_forward_comm_id_for_comm_node_pass]: 3.73999e-06 [meta_fg_expand]: 3.46999e-06 [flash_sp_send_recv_attached]: 2.72001e-06 [receive_attached]: 2.37999e-06 [after_resolve]: 1.201e-05 [a_after_grad]: 1.038e-05 [renormalize]: 0.00070688 [add_forward_monad_depend]: 7.04001e-06 [auto_monad_grad]: 2.27001e-06 [auto_monad_eliminator]: 1.744e-05 [cse]: 2.988e-05 [a_3]: 6.258e-05 [Cycle 2]: 0.00087547, [45] [expand_dump_flag]: 2.12001e-06 [switch_simplify]: 8.2e-06 [loop_unroll]: 6.24001e-06 [a_1]: 0.00014353 [with_stream_mark]: 1.482e-05 [recompute_prepare]: 6.98998e-06 [updatestate_depend_eliminate]: 3.42002e-06 [updatestate_assign_eliminate]: 2.66e-06 [updatestate_loads_eliminate]: 2.29001e-06 [parameter_eliminate]: 1.13001e-06 [a_2]: 0.00011147 [accelerated_algorithm]: 6.10002e-06 [shard]: 1.67999e-06 [meta_shard_fg_expand]: 1.76998e-06 [shard_inline]: 6.06998e-06 [merge_send_recv]: 6.19999e-06 [auto_parallel]: 7.33e-06 [parallel]: 6.19999e-06 [flash_sp]: 3.73999e-06 [merge_comm]: 4.26001e-06 [allreduce_fusion]: 3.38e-06 [matmul_add_comm_reduction]: 7.28999e-06 [allreduce_slice_to_reducescatter]: 9.89996e-07 [virtual_shard_identity]: 7.97e-06 [virtual_dataset]: 6.26998e-06 [get_grad_eliminate_]: 5.84999e-06 [virtual_output]: 6.14999e-06 [merge_forward]: 3.5e-06 [cell_reuse_recompute_pass]: 1.96e-06 [offload_activation]: 7.73001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.35e-05 [merge_recompute_call_nodes]: 1.00999e-06 [before_grad]: 9.72999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.66999e-06 [meta_fg_expand]: 2.32001e-06 [flash_sp_send_recv_attached]: 1.50001e-06 [receive_attached]: 1.78002e-06 [after_resolve]: 1.149e-05 [a_after_grad]: 9.08002e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.82999e-06 [auto_monad_grad]: 1.68002e-06 [auto_monad_eliminator]: 8.42e-06 [cse]: 1.684e-05 [a_3]: 4.961e-05 [py_interpret_to_execute_after_opt_a]: 1.417e-05 [slice_cell_reuse_recomputed_activation]: 4.74002e-06 [rewriter_after_opt_a]: 4.37e-05 [convert_after_rewriter]: 1.047e-05 [order_py_execute_after_rewriter]: 8.01001e-06 [mutable_eliminate]: 0.00060187 [opt_b]: 0.00028189, [1] [Cycle 1]: 0.0002707, [7] [b_1]: 0.00016584 [b_2]: 8.70001e-06 [updatestate_depend_eliminate]: 9.18002e-06 [updatestate_assign_eliminate]: 2.99999e-06 [updatestate_loads_eliminate]: 2.48e-06 [renormalize]: 6.19999e-07 [cse]: 2.246e-05 [optimize_parallel_all_gather_comm]: 2.23e-05 [overlap_param_gather]: 5.67001e-06 [cconv]: 3.585e-05 [loop_unroll]: 0.00057271 [opt_after_cconv]: 0.00014331, [1] [Cycle 1]: 0.00013303, [7] [c_1]: 3.329e-05 [parameter_eliminate]: 5.17999e-06 [updatestate_depend_eliminate]: 7.3e-06 [updatestate_assign_eliminate]: 2.93e-06 [updatestate_loads_eliminate]: 2.46e-06 [cse]: 2.436e-05 [renormalize]: 4.39992e-07 [remove_dup_value]: 1.786e-05 [tuple_transform]: 0.00010295, [1] [Cycle 1]: 9.508e-05, [4] [d_1]: 5.306e-05 [none_parameter_eliminate]: 2.12999e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 8.23001e-06 [partial_unused_args_eliminate]: 4.93001e-06 [add_recomputation]: 5.872e-05 [cse_after_recomputation]: 2.956e-05, [1] [Cycle 1]: 2.213e-05, [1] [cse]: 1.257e-05 [environ_conv]: 8.08999e-06 [swap_dp_allreduce_reducescatter]: 8.75001e-06 [bias_add_comm_swap]: 5.66998e-06 [label_micro_interleaved_index]: 9.02999e-06 [label_fine_grained_interleaved_index]: 5.19e-06 [merge_cast_opt]: 4.08999e-06 [slice_recompute_activation]: 5.14e-06 [micro_interleaved_order_control]: 4.99e-06 [assign_add_opt]: 3.76999e-06 [ForceFp32Comm]: 3.46001e-06 [remove_cast_before_assign_add]: 3.60998e-06 [full_micro_interleaved_order_control]: 4.53999e-06 [reorder_send_recv_between_fp_bp]: 4.95999e-06 [comm_op_add_attrs]: 3.54002e-06 [add_comm_op_reuse_tag]: 3.46001e-06 [interleave_split_concat_branches]: 3.5e-06 [interleave_parallel_branches]: 3.57002e-06 [overlap_opt_shard_in_pipeline]: 3.66999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.87998e-06 [control_data_broadcast_order]: 1.707e-05 [grouped_pairwise_exchange_alltoall]: 4.57e-06 [offloading_packed_experts]: 7.35e-06 [overlap_recompute_and_grad_model_parallel]: 8.07e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.2e-06 [overlap_recompute_allgather_and_fa_grad]: 3.78001e-06 [overlap_recompute_comm]: 5.13002e-06 [overlap_grad_ring_attention]: 7.48e-06 [overlap_grad_flash_sp]: 2.742e-05 [begin_end_overlap_inline]: 3.21001e-06 [split_matmul_comm_elemetwise]: 4.70999e-06 [split_layernorm_comm]: 4.67e-06 [handle_group_info]: 3.73999e-06 [symbol_engine_optimizer]: 0.00010908, [1] [Cycle 1]: 0.00010201, [6] [build]: 4.23999e-06 [elim_shapecalc]: 1.374e-05 [elim_not_effective]: 1.464e-05 [opt_reshape]: 8.11002e-06 [fold_const_symbol]: 1.022e-05 [renormalize]: 3.30008e-07 [detach_backward]: 5.12e-06 [pipeline_parallel_scheduler]: 1.89e-06 [auto_monad_reorder]: 2.329e-05 [get_jit_bprop_graph]: 1.66998e-06 [rewriter_after_jit_bprop_graph]: 7.7e-06 [opt_after_jit_grad]: 0.00075819 [validate]: 5.12e-05 Sums bootstrap : 0.000461s : 0.54% type_inference : 0.078634s : 92.83% event_method : 0.000021s : 0.02% auto_monad : 0.000063s : 0.07% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000049s : 0.06% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000032s : 0.04% optimize.rewriter_before_opt_a : 0.000090s : 0.11% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000051s : 0.06% optimize.opt_a.loop_unroll : 0.000036s : 0.04% optimize.opt_a.a_1 : 0.000805s : 0.95% optimize.opt_a.with_stream_mark : 0.000036s : 0.04% optimize.opt_a.recompute_prepare : 0.000019s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000231s : 0.27% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000015s : 0.02% optimize.opt_a.auto_parallel : 0.000016s : 0.02% optimize.opt_a.parallel : 0.000026s : 0.03% optimize.opt_a.flash_sp : 0.000013s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.02% optimize.opt_a.virtual_dataset : 0.000013s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000018s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000024s : 0.03% optimize.opt_a.a_after_grad : 0.000019s : 0.02% optimize.opt_a.renormalize : 0.000707s : 0.83% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.03% optimize.opt_a.cse : 0.000047s : 0.06% optimize.opt_a.a_3 : 0.000112s : 0.13% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000044s : 0.05% optimize.convert_after_rewriter : 0.000010s : 0.01% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000602s : 0.71% optimize.opt_b.b_1 : 0.000166s : 0.20% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.03% optimize.overlap_param_gather : 0.000006s : 0.01% optimize.cconv : 0.000036s : 0.04% optimize.loop_unroll : 0.000573s : 0.68% optimize.opt_after_cconv.c_1 : 0.000033s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.02% optimize.tuple_transform.d_1 : 0.000053s : 0.06% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000059s : 0.07% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000009s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000003s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000027s : 0.03% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000005s : 0.01% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000023s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000008s : 0.01% opt_after_jit_grad : 0.000758s : 0.90% validate : 0.000051s : 0.06% Time group info: ------[substitution.] 0.000232 34 16.49% : 0.000038s : 6: substitution.arithmetic_simplify 1.07% : 0.000002s : 2: substitution.elim_not_effective 0.62% : 0.000001s : 2: substitution.fold_const_symbol 2.78% : 0.000006s : 4: substitution.graph_param_transform 65.99% : 0.000153s : 4: substitution.inline 1.74% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.12% : 0.000005s : 4: substitution.remove_not_recompute_node 2.38% : 0.000006s : 4: substitution.replace_old_param 6.80% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.078577 2 99.09% : 0.077858s : 1: type_inference.infer 0.91% : 0.000718s : 1: type_inference.specialize ------[replace.] 0.000064 8 62.63% : 0.000040s : 4: replace.inline 37.37% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000165 8 91.48% : 0.000151s : 4: match.inline 8.52% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000216 1278 0.86% : 0.000002s : 13: predicate.accumulaten_eliminater 1.01% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 0.92% : 0.000002s : 13: predicate.addn_zero_filter 0.76% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.38% : 0.000005s : 21: predicate.arithmetic_simplify 0.87% : 0.000002s : 13: predicate.cast_eliminate 0.54% : 0.000001s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.60% : 0.000001s : 8: predicate.depend_value_elim 0.85% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.94% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.81% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.42% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 4: predicate.elim_not_effective 0.47% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_depend_swap 1.73% : 0.000004s : 25: predicate.environ_get_eliminate 1.00% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.42% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.48% : 0.000005s : 21: predicate.float_depend_g_call 0.49% : 0.000001s : 8: predicate.float_environ_get_switch 0.83% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.71% : 0.000002s : 8: predicate.get_grad_eliminate 0.27% : 0.000001s : 4: predicate.graph_param_transform 0.67% : 0.000001s : 8: predicate.incorporate_call 0.47% : 0.000001s : 8: predicate.incorporate_call_switch 6.45% : 0.000014s : 58: predicate.inline 0.81% : 0.000002s : 8: predicate.inline_without_move 0.39% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.75% : 0.000002s : 8: predicate.less_batch_normalization 1.73% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.56% : 0.000006s : 38: predicate.load_eliminater 1.52% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.29% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.50% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.62% : 0.000001s : 8: predicate.merge_addn 0.50% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.75% : 0.000002s : 13: predicate.minmaximum_grad 1.52% : 0.000003s : 4: predicate.mutable_eliminate 0.43% : 0.000001s : 4: predicate.opt_reshape 0.40% : 0.000001s : 4: predicate.parallel_virtual_node 1.75% : 0.000004s : 21: predicate.partial_defer_inline 1.56% : 0.000003s : 21: predicate.partial_eliminate 0.83% : 0.000002s : 13: predicate.print_const_string_wrapper 0.61% : 0.000001s : 8: predicate.reduce_all_const_elim 1.25% : 0.000003s : 13: predicate.reduce_eliminate 2.41% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.52% : 0.000001s : 8: predicate.remove_not_recompute_node 1.28% : 0.000003s : 25: predicate.replace_applicator 0.60% : 0.000001s : 8: predicate.replace_old_param 0.27% : 0.000001s : 4: predicate.reset_defer_inline 0.86% : 0.000002s : 13: predicate.reshape_eliminate 0.58% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.62% : 0.000001s : 4: predicate.row_tensor_eliminate 1.10% : 0.000002s : 8: predicate.same_eliminate 0.54% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.88% : 0.000002s : 8: predicate.shard_identity_eliminate 0.76% : 0.000002s : 8: predicate.special_op_eliminate 0.66% : 0.000001s : 8: predicate.specialize_transform 1.05% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.50% : 0.000003s : 21: predicate.switch_defer_inline 1.92% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.30% : 0.000011s : 67: predicate.switch_simplify 0.80% : 0.000002s : 13: predicate.tile_eliminate 1.06% : 0.000002s : 13: predicate.transpose_eliminate 1.48% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.47% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.29% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.19% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.45% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.77% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.26% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.03% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 4: predicate.value_based_eliminate 0.57% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.54% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000570 11 55.05% : 0.000314s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.95% : 0.000256s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.102080 192 0.01% : 0.000006s : 1: ForceFp32Comm 3.69% : 0.003768s : 1: add_attr 3.67% : 0.003750s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.06% : 0.000063s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.07% : 0.000072s : 1: auto_monad 0.03% : 0.000032s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.50% : 0.000506s : 1: bootstrap 0.04% : 0.000039s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.02% : 0.000020s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.03% : 0.000033s : 1: cse_after_recomputation 0.01% : 0.000009s : 1: dataset_repeat_opt 0.02% : 0.000025s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.03% : 0.000034s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000012s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000010s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000012s : 1: label_micro_interleaved_index 0.57% : 0.000581s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.60% : 0.000609s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000020s : 1: opt.transform.mutable_eliminate 1.23% : 0.001254s : 78: opt.transform.opt_a 0.03% : 0.000032s : 1: opt.transform.opt_after_cconv 0.04% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000103s : 28: opt.transform.opt_b 0.06% : 0.000058s : 2: opt.transform.opt_trans_graph 0.04% : 0.000043s : 4: opt.transform.symbol_engine_opt 3.15% : 0.003216s : 1: opt_a 0.14% : 0.000147s : 1: opt_after_cconv 0.76% : 0.000772s : 1: opt_after_jit_grad 0.28% : 0.000286s : 1: opt_b 6.09% : 0.006218s : 1: optimize 0.03% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000011s : 1: order_py_execute_after_rewriter 0.03% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000011s : 1: pipeline_parallel_scheduler 0.01% : 0.000008s : 1: pipeline_split 0.06% : 0.000058s : 1: pre_auto_parallel 0.04% : 0.000036s : 1: py_interpret_to_execute 0.02% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.02% : 0.000021s : 1: remove_dup_value 0.37% : 0.000378s : 1: renormalize.infer 0.31% : 0.000319s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000048s : 1: rewriter_after_opt_a 0.09% : 0.000094s : 1: rewriter_before_opt_a 0.01% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000009s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.11% : 0.000112s : 1: symbol_engine_optimizer 0.10% : 0.000106s : 1: tuple_transform 77.11% : 0.078717s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:51.408.184 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0582734, [21] [bootstrap]: 0.0005554 [type_inference]: 0.00718735 [event_method]: 2.256e-05 [auto_monad]: 6.95e-05 [graph_reusing]: 6.60002e-06 [inline]: 3.16999e-06 [add_attr]: 0.0445047, [1] [add_attr_with_inline]: 0.0444911, [1] [Cycle 1]: 7.922e-05, [2] [tag_attr]: 2.546e-05 [meta_addattr_fg_expand]: 5.83002e-06 [parallel-infer-symbol]: 3.88999e-06 [pre_auto_parallel]: 4.376e-05 [insert-virtual-dataset]: 2.44001e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.67001e-06 [pipeline_split]: 1.79e-06 [optimize]: 0.00514035, [53] [py_interpret_to_execute]: 3.255e-05 [rewriter_before_opt_a]: 9.534e-05 [opt_a]: 0.00300881, [2] [Cycle 1]: 0.00229998, [45] [expand_dump_flag]: 3.10998e-06 [switch_simplify]: 4.279e-05 [loop_unroll]: 2.995e-05 [a_1]: 0.00068985 [with_stream_mark]: 2.255e-05 [recompute_prepare]: 1.045e-05 [updatestate_depend_eliminate]: 4.80999e-06 [updatestate_assign_eliminate]: 3.46999e-06 [updatestate_loads_eliminate]: 3.23e-06 [parameter_eliminate]: 2.51998e-06 [a_2]: 8.945e-05 [accelerated_algorithm]: 7.51001e-06 [shard]: 1.96e-06 [meta_shard_fg_expand]: 1.62999e-06 [shard_inline]: 6.76999e-06 [merge_send_recv]: 9.25001e-06 [auto_parallel]: 7.93001e-06 [parallel]: 2.096e-05 [flash_sp]: 9.66e-06 [merge_comm]: 4.48999e-06 [allreduce_fusion]: 3.41001e-06 [matmul_add_comm_reduction]: 1.082e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 8.53001e-06 [virtual_dataset]: 6.46999e-06 [get_grad_eliminate_]: 6.59001e-06 [virtual_output]: 6.51e-06 [merge_forward]: 3.93001e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 1.126e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.451e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 1.151e-05 [set_forward_comm_id_for_comm_node_pass]: 4.18999e-06 [meta_fg_expand]: 3.35e-06 [flash_sp_send_recv_attached]: 2.64999e-06 [receive_attached]: 2.97002e-06 [after_resolve]: 1.324e-05 [a_after_grad]: 1.16e-05 [renormalize]: 0.00082829 [add_forward_monad_depend]: 6.08998e-06 [auto_monad_grad]: 2.54999e-06 [auto_monad_eliminator]: 1.675e-05 [cse]: 3.141e-05 [a_3]: 5.061e-05 [Cycle 2]: 0.00069778, [45] [expand_dump_flag]: 1.67001e-06 [switch_simplify]: 8.18001e-06 [loop_unroll]: 6.20002e-06 [a_1]: 0.00014746 [with_stream_mark]: 1.39e-05 [recompute_prepare]: 7.61999e-06 [updatestate_depend_eliminate]: 3.71001e-06 [updatestate_assign_eliminate]: 2.54001e-06 [updatestate_loads_eliminate]: 2.86999e-06 [parameter_eliminate]: 1.21002e-06 [a_2]: 8.008e-05 [accelerated_algorithm]: 6.74001e-06 [shard]: 1.41998e-06 [meta_shard_fg_expand]: 1.76e-06 [shard_inline]: 6.14999e-06 [merge_send_recv]: 5.90002e-06 [auto_parallel]: 6.26998e-06 [parallel]: 5.59e-06 [flash_sp]: 3.76001e-06 [merge_comm]: 4.05998e-06 [allreduce_fusion]: 3.11999e-06 [matmul_add_comm_reduction]: 6.34999e-06 [allreduce_slice_to_reducescatter]: 4.50003e-07 [virtual_shard_identity]: 7.16999e-06 [virtual_dataset]: 6.06998e-06 [get_grad_eliminate_]: 6.76e-06 [virtual_output]: 5.59e-06 [merge_forward]: 3.63e-06 [cell_reuse_recompute_pass]: 1.77999e-06 [offload_activation]: 7.93001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.088e-05 [merge_recompute_call_nodes]: 1.49998e-06 [before_grad]: 1.026e-05 [set_forward_comm_id_for_comm_node_pass]: 4.02e-06 [meta_fg_expand]: 2.02001e-06 [flash_sp_send_recv_attached]: 1.35999e-06 [receive_attached]: 1.19998e-06 [after_resolve]: 1.228e-05 [a_after_grad]: 9.74999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.60999e-06 [auto_monad_grad]: 1.02998e-06 [auto_monad_eliminator]: 8.23999e-06 [cse]: 1.527e-05 [a_3]: 3.623e-05 [py_interpret_to_execute_after_opt_a]: 1.115e-05 [slice_cell_reuse_recomputed_activation]: 1.77001e-06 [rewriter_after_opt_a]: 3.572e-05 [convert_after_rewriter]: 6.60002e-06 [order_py_execute_after_rewriter]: 5.44e-06 [mutable_eliminate]: 0.00058329 [opt_b]: 0.00020889, [1] [Cycle 1]: 0.00020194, [7] [b_1]: 0.00012456 [b_2]: 8.28001e-06 [updatestate_depend_eliminate]: 7.56999e-06 [updatestate_assign_eliminate]: 2.74999e-06 [updatestate_loads_eliminate]: 2.26998e-06 [renormalize]: 7.99977e-07 [cse]: 2.077e-05 [optimize_parallel_all_gather_comm]: 1.69e-05 [overlap_param_gather]: 1.87001e-06 [cconv]: 2.773e-05 [loop_unroll]: 0.00043355 [opt_after_cconv]: 0.00010614, [1] [Cycle 1]: 9.988e-05, [7] [c_1]: 3.024e-05 [parameter_eliminate]: 4.27e-06 [updatestate_depend_eliminate]: 6.43e-06 [updatestate_assign_eliminate]: 2.41998e-06 [updatestate_loads_eliminate]: 2.27999e-06 [cse]: 1.904e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 1.318e-05 [tuple_transform]: 7.718e-05, [1] [Cycle 1]: 7.218e-05, [4] [d_1]: 4.475e-05 [none_parameter_eliminate]: 1.66998e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 6.91001e-06 [partial_unused_args_eliminate]: 1.77999e-06 [add_recomputation]: 5.14e-05 [cse_after_recomputation]: 2.116e-05, [1] [Cycle 1]: 1.634e-05, [1] [cse]: 1.019e-05 [environ_conv]: 4.78001e-06 [swap_dp_allreduce_reducescatter]: 4.85001e-06 [bias_add_comm_swap]: 2.91e-06 [label_micro_interleaved_index]: 4.58999e-06 [label_fine_grained_interleaved_index]: 2.46e-06 [merge_cast_opt]: 1.27e-06 [slice_recompute_activation]: 2.28002e-06 [micro_interleaved_order_control]: 2.31e-06 [assign_add_opt]: 1.14003e-06 [ForceFp32Comm]: 8.50006e-07 [remove_cast_before_assign_add]: 9.90025e-07 [full_micro_interleaved_order_control]: 2.40002e-06 [reorder_send_recv_between_fp_bp]: 2.68998e-06 [comm_op_add_attrs]: 1.02998e-06 [add_comm_op_reuse_tag]: 9.60019e-07 [interleave_split_concat_branches]: 1.18001e-06 [interleave_parallel_branches]: 1.27999e-06 [overlap_opt_shard_in_pipeline]: 1.31002e-06 [overlap_opt_shard_grad_in_pipeline]: 2.01998e-06 [control_data_broadcast_order]: 1.295e-05 [grouped_pairwise_exchange_alltoall]: 1.52999e-06 [offloading_packed_experts]: 4.75999e-06 [overlap_recompute_and_grad_model_parallel]: 4.37998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.74998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.62999e-06 [overlap_recompute_comm]: 2.37001e-06 [overlap_grad_ring_attention]: 4.62998e-06 [overlap_grad_flash_sp]: 2.151e-05 [begin_end_overlap_inline]: 6.00005e-07 [split_matmul_comm_elemetwise]: 2.10002e-06 [split_layernorm_comm]: 1.67001e-06 [handle_group_info]: 8.99978e-07 [symbol_engine_optimizer]: 7.616e-05, [1] [Cycle 1]: 7.18e-05, [6] [build]: 3.35e-06 [elim_shapecalc]: 1.071e-05 [elim_not_effective]: 1.284e-05 [opt_reshape]: 6.94999e-06 [fold_const_symbol]: 9.81003e-06 [renormalize]: 2.50002e-07 [detach_backward]: 2.60002e-06 [pipeline_parallel_scheduler]: 1.53002e-06 [auto_monad_reorder]: 1.605e-05 [get_jit_bprop_graph]: 1.75001e-06 [rewriter_after_jit_bprop_graph]: 5.07e-06 [opt_after_jit_grad]: 0.00048457 [validate]: 4.009e-05 Sums bootstrap : 0.000555s : 4.35% type_inference : 0.007187s : 56.34% event_method : 0.000023s : 0.18% auto_monad : 0.000069s : 0.54% graph_reusing : 0.000007s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000044s : 0.34% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000003s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.26% optimize.rewriter_before_opt_a : 0.000095s : 0.75% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000051s : 0.40% optimize.opt_a.loop_unroll : 0.000036s : 0.28% optimize.opt_a.a_1 : 0.000837s : 6.56% optimize.opt_a.with_stream_mark : 0.000036s : 0.29% optimize.opt_a.recompute_prepare : 0.000018s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000170s : 1.33% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.11% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.10% optimize.opt_a.merge_send_recv : 0.000015s : 0.12% optimize.opt_a.auto_parallel : 0.000014s : 0.11% optimize.opt_a.parallel : 0.000027s : 0.21% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.12% optimize.opt_a.virtual_dataset : 0.000013s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.10% optimize.opt_a.virtual_output : 0.000012s : 0.09% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.20% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.17% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.20% optimize.opt_a.a_after_grad : 0.000021s : 0.17% optimize.opt_a.renormalize : 0.000828s : 6.49% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.20% optimize.opt_a.cse : 0.000047s : 0.37% optimize.opt_a.a_3 : 0.000087s : 0.68% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000036s : 0.28% optimize.convert_after_rewriter : 0.000007s : 0.05% optimize.order_py_execute_after_rewriter : 0.000005s : 0.04% optimize.mutable_eliminate : 0.000583s : 4.57% optimize.opt_b.b_1 : 0.000125s : 0.98% optimize.opt_b.b_2 : 0.000008s : 0.06% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000021s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.13% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000028s : 0.22% optimize.loop_unroll : 0.000434s : 3.40% optimize.opt_after_cconv.c_1 : 0.000030s : 0.24% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.15% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.10% optimize.tuple_transform.d_1 : 0.000045s : 0.35% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.05% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000051s : 0.40% optimize.cse_after_recomputation.cse : 0.000010s : 0.08% optimize.environ_conv : 0.000005s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.04% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.10% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000022s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.10% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.05% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000016s : 0.13% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000485s : 3.80% validate : 0.000040s : 0.31% Time group info: ------[substitution.] 0.000246 34 14.57% : 0.000036s : 6: substitution.arithmetic_simplify 0.72% : 0.000002s : 2: substitution.elim_not_effective 0.55% : 0.000001s : 2: substitution.fold_const_symbol 2.46% : 0.000006s : 4: substitution.graph_param_transform 69.53% : 0.000171s : 4: substitution.inline 1.73% : 0.000004s : 4: substitution.j_node_and_user_rematch 1.91% : 0.000005s : 4: substitution.remove_not_recompute_node 2.27% : 0.000006s : 4: substitution.replace_old_param 6.25% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007105 2 87.99% : 0.006252s : 1: type_inference.infer 12.01% : 0.000853s : 1: type_inference.specialize ------[replace.] 0.000066 8 65.76% : 0.000043s : 4: replace.inline 34.24% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000181 8 92.61% : 0.000168s : 4: match.inline 7.39% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000210 1278 0.88% : 0.000002s : 13: predicate.accumulaten_eliminater 0.93% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 8: predicate.addn_check_dump 0.86% : 0.000002s : 13: predicate.addn_zero_filter 0.83% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.77% : 0.000006s : 21: predicate.arithmetic_simplify 0.92% : 0.000002s : 13: predicate.cast_eliminate 0.53% : 0.000001s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000000s : 4: predicate.const_output_eliminate 0.57% : 0.000001s : 8: predicate.depend_value_elim 0.86% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.00% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.23% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.44% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.18% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_depend_swap 1.64% : 0.000003s : 25: predicate.environ_get_eliminate 1.13% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.44% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.89% : 0.000006s : 21: predicate.float_depend_g_call 0.54% : 0.000001s : 8: predicate.float_environ_get_switch 0.79% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.60% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.61% : 0.000001s : 8: predicate.incorporate_call 0.50% : 0.000001s : 8: predicate.incorporate_call_switch 6.29% : 0.000013s : 58: predicate.inline 0.86% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.78% : 0.000002s : 8: predicate.less_batch_normalization 1.74% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.59% : 0.000005s : 38: predicate.load_eliminater 1.09% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.32% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.61% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.61% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 13: predicate.minmaximum_grad 1.34% : 0.000003s : 4: predicate.mutable_eliminate 0.32% : 0.000001s : 4: predicate.opt_reshape 0.31% : 0.000001s : 4: predicate.parallel_virtual_node 1.94% : 0.000004s : 21: predicate.partial_defer_inline 1.58% : 0.000003s : 21: predicate.partial_eliminate 0.84% : 0.000002s : 13: predicate.print_const_string_wrapper 0.51% : 0.000001s : 8: predicate.reduce_all_const_elim 1.19% : 0.000002s : 13: predicate.reduce_eliminate 2.51% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 8: predicate.remove_not_recompute_node 1.42% : 0.000003s : 25: predicate.replace_applicator 0.62% : 0.000001s : 8: predicate.replace_old_param 0.29% : 0.000001s : 4: predicate.reset_defer_inline 0.89% : 0.000002s : 13: predicate.reshape_eliminate 0.57% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 4: predicate.row_tensor_eliminate 0.85% : 0.000002s : 8: predicate.same_eliminate 0.57% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.87% : 0.000002s : 8: predicate.shard_identity_eliminate 0.72% : 0.000002s : 8: predicate.special_op_eliminate 0.77% : 0.000002s : 8: predicate.specialize_transform 0.98% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.93% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.51% : 0.000003s : 21: predicate.switch_defer_inline 2.02% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.97% : 0.000010s : 67: predicate.switch_simplify 0.88% : 0.000002s : 13: predicate.tile_eliminate 0.91% : 0.000002s : 13: predicate.transpose_eliminate 1.45% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.41% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.39% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.53% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.47% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.12% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.74% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.37% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.97% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 4: predicate.value_based_eliminate 0.61% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.54% : 0.000001s : 8: predicate.virtual_output_eliminate 0.31% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000653 11 54.76% : 0.000358s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.24% : 0.000295s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.110162 192 0.00% : 0.000003s : 1: ForceFp32Comm 40.41% : 0.044512s : 1: add_attr 40.39% : 0.044496s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000055s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.07% : 0.000075s : 1: auto_monad 0.02% : 0.000020s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.54% : 0.000591s : 1: bootstrap 0.03% : 0.000031s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000016s : 1: control_data_broadcast_order 0.01% : 0.000010s : 1: convert_after_rewriter 0.02% : 0.000024s : 1: cse_after_recomputation 0.01% : 0.000006s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000008s : 1: environ_conv 0.03% : 0.000030s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.40% : 0.000442s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.54% : 0.000593s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000017s : 1: opt.transform.mutable_eliminate 1.16% : 0.001279s : 78: opt.transform.opt_a 0.03% : 0.000029s : 1: opt.transform.opt_after_cconv 0.02% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.09% : 0.000103s : 28: opt.transform.opt_b 0.04% : 0.000049s : 2: opt.transform.opt_trans_graph 0.03% : 0.000037s : 4: opt.transform.symbol_engine_opt 2.73% : 0.003012s : 1: opt_a 0.10% : 0.000110s : 1: opt_after_cconv 0.45% : 0.000495s : 1: opt_after_jit_grad 0.19% : 0.000212s : 1: opt_b 4.67% : 0.005145s : 1: optimize 0.02% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.02% : 0.000026s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.04% : 0.000048s : 1: pre_auto_parallel 0.03% : 0.000037s : 1: py_interpret_to_execute 0.01% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000016s : 1: remove_dup_value 0.40% : 0.000438s : 1: renormalize.infer 0.35% : 0.000381s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000040s : 1: rewriter_after_opt_a 0.09% : 0.000099s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000079s : 1: symbol_engine_optimizer 0.07% : 0.000080s : 1: tuple_transform 6.55% : 0.007213s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:52.425.1 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:52.460.5 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0951811, [21] [bootstrap]: 0.00044878 [type_inference]: 0.00577196 [event_method]: 1.817e-05 [auto_monad]: 6.218e-05 [graph_reusing]: 6.28e-06 [inline]: 2.27999e-06 [add_attr]: 0.00319987, [1] [add_attr_with_inline]: 0.00318483, [1] [Cycle 1]: 8.306e-05, [2] [tag_attr]: 2.205e-05 [meta_addattr_fg_expand]: 5.70001e-06 [parallel-infer-symbol]: 3.25e-06 [pre_auto_parallel]: 5.318e-05 [insert-virtual-dataset]: 2.63998e-06 [parallel-infer-symbol-second]: 1.00001e-06 [dataset_repeat_opt]: 2.71999e-06 [pipeline_split]: 2.23998e-06 [optimize]: 0.0842413, [53] [py_interpret_to_execute]: 3.682e-05 [rewriter_before_opt_a]: 0.00010416 [opt_a]: 0.0815952, [2] [Cycle 1]: 0.0806433, [45] [expand_dump_flag]: 3.83001e-06 [switch_simplify]: 4.395e-05 [loop_unroll]: 3.075e-05 [a_1]: 0.00069211 [with_stream_mark]: 2.666e-05 [recompute_prepare]: 1.182e-05 [updatestate_depend_eliminate]: 4.78001e-06 [updatestate_assign_eliminate]: 3.23e-06 [updatestate_loads_eliminate]: 3.08998e-06 [parameter_eliminate]: 2.14999e-06 [a_2]: 0.00012173 [accelerated_algorithm]: 9.27999e-06 [shard]: 2.28002e-06 [meta_shard_fg_expand]: 2.25002e-06 [shard_inline]: 6.92002e-06 [merge_send_recv]: 9.95002e-06 [auto_parallel]: 9.99999e-06 [parallel]: 2.042e-05 [flash_sp]: 1.168e-05 [merge_comm]: 4.40999e-06 [allreduce_fusion]: 3.8e-06 [matmul_add_comm_reduction]: 1.062e-05 [allreduce_slice_to_reducescatter]: 9.50007e-07 [virtual_shard_identity]: 1.147e-05 [virtual_dataset]: 7.87e-06 [get_grad_eliminate_]: 6.89999e-06 [virtual_output]: 7.28e-06 [merge_forward]: 4.25999e-06 [cell_reuse_recompute_pass]: 1.45999e-06 [offload_activation]: 1.109e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.84e-05 [merge_recompute_call_nodes]: 1.75001e-06 [before_grad]: 1.168e-05 [set_forward_comm_id_for_comm_node_pass]: 3.66999e-06 [meta_fg_expand]: 3.81001e-06 [flash_sp_send_recv_attached]: 3.82002e-06 [receive_attached]: 2.29001e-06 [after_resolve]: 1.52e-05 [a_after_grad]: 1.27e-05 [renormalize]: 0.0788593 [add_forward_monad_depend]: 1.293e-05 [auto_monad_grad]: 2.53998e-06 [auto_monad_eliminator]: 2.471e-05 [cse]: 3.138e-05 [a_3]: 7.973e-05 [Cycle 2]: 0.00093364, [45] [expand_dump_flag]: 2.46e-06 [switch_simplify]: 9.41998e-06 [loop_unroll]: 6.78998e-06 [a_1]: 0.00016399 [with_stream_mark]: 1.951e-05 [recompute_prepare]: 6.76e-06 [updatestate_depend_eliminate]: 4.58999e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 2.91999e-06 [parameter_eliminate]: 2.26e-06 [a_2]: 0.0001062 [accelerated_algorithm]: 6.73e-06 [shard]: 2.54001e-06 [meta_shard_fg_expand]: 2.12999e-06 [shard_inline]: 9.54e-06 [merge_send_recv]: 9.20001e-06 [auto_parallel]: 1.098e-05 [parallel]: 9.81998e-06 [flash_sp]: 4.21001e-06 [merge_comm]: 4.33001e-06 [allreduce_fusion]: 3.31999e-06 [matmul_add_comm_reduction]: 1.009e-05 [allreduce_slice_to_reducescatter]: 1.17e-06 [virtual_shard_identity]: 7.71999e-06 [virtual_dataset]: 6.13002e-06 [get_grad_eliminate_]: 6.23e-06 [virtual_output]: 6.36e-06 [merge_forward]: 6.59001e-06 [cell_reuse_recompute_pass]: 3.46001e-06 [offload_activation]: 1.037e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.466e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.135e-05 [set_forward_comm_id_for_comm_node_pass]: 4.13001e-06 [meta_fg_expand]: 2.86e-06 [flash_sp_send_recv_attached]: 1.64e-06 [receive_attached]: 2.73003e-06 [after_resolve]: 1.251e-05 [a_after_grad]: 1.066e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.72001e-06 [auto_monad_grad]: 1.09998e-06 [auto_monad_eliminator]: 8.3e-06 [cse]: 1.596e-05 [a_3]: 5.002e-05 [py_interpret_to_execute_after_opt_a]: 2.036e-05 [slice_cell_reuse_recomputed_activation]: 4.87998e-06 [rewriter_after_opt_a]: 4.783e-05 [convert_after_rewriter]: 1.04e-05 [order_py_execute_after_rewriter]: 8.23999e-06 [mutable_eliminate]: 0.00074166 [opt_b]: 0.00027699, [1] [Cycle 1]: 0.00026652, [7] [b_1]: 0.00017114 [b_2]: 9.02e-06 [updatestate_depend_eliminate]: 5.66e-06 [updatestate_assign_eliminate]: 2.50002e-06 [updatestate_loads_eliminate]: 2.58e-06 [renormalize]: 5.8001e-07 [cse]: 1.926e-05 [optimize_parallel_all_gather_comm]: 2.441e-05 [overlap_param_gather]: 5.12999e-06 [cconv]: 3.271e-05 [loop_unroll]: 0.00045044 [opt_after_cconv]: 0.00013008, [1] [Cycle 1]: 0.00012097, [7] [c_1]: 3.14e-05 [parameter_eliminate]: 3.66001e-06 [updatestate_depend_eliminate]: 5.88002e-06 [updatestate_assign_eliminate]: 2.79001e-06 [updatestate_loads_eliminate]: 2.36e-06 [cse]: 1.817e-05 [renormalize]: 6.19999e-07 [remove_dup_value]: 1.616e-05 [tuple_transform]: 9.445e-05, [1] [Cycle 1]: 8.699e-05, [4] [d_1]: 4.709e-05 [none_parameter_eliminate]: 1.82999e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 7.41999e-06 [partial_unused_args_eliminate]: 4.27998e-06 [add_recomputation]: 5.392e-05 [cse_after_recomputation]: 2.829e-05, [1] [Cycle 1]: 2.066e-05, [1] [cse]: 1.155e-05 [environ_conv]: 8.63001e-06 [swap_dp_allreduce_reducescatter]: 8.47e-06 [bias_add_comm_swap]: 5.04998e-06 [label_micro_interleaved_index]: 7.63999e-06 [label_fine_grained_interleaved_index]: 5.09e-06 [merge_cast_opt]: 3.55e-06 [slice_recompute_activation]: 4.60999e-06 [micro_interleaved_order_control]: 5.07e-06 [assign_add_opt]: 3.5e-06 [ForceFp32Comm]: 3.30998e-06 [remove_cast_before_assign_add]: 3.47002e-06 [full_micro_interleaved_order_control]: 4.37003e-06 [reorder_send_recv_between_fp_bp]: 5.46e-06 [comm_op_add_attrs]: 2.469e-05 [add_comm_op_reuse_tag]: 3.31999e-06 [interleave_split_concat_branches]: 3.65003e-06 [interleave_parallel_branches]: 3.33e-06 [overlap_opt_shard_in_pipeline]: 3.76999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.77998e-06 [control_data_broadcast_order]: 1.614e-05 [grouped_pairwise_exchange_alltoall]: 4.30999e-06 [offloading_packed_experts]: 6.74001e-06 [overlap_recompute_and_grad_model_parallel]: 7.79002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.48999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.85e-06 [overlap_recompute_comm]: 5.04e-06 [overlap_grad_ring_attention]: 7.19001e-06 [overlap_grad_flash_sp]: 2.846e-05 [begin_end_overlap_inline]: 3.04001e-06 [split_matmul_comm_elemetwise]: 5.07e-06 [split_layernorm_comm]: 5.17e-06 [handle_group_info]: 3.65e-06 [symbol_engine_optimizer]: 0.0001146, [1] [Cycle 1]: 0.00010622, [6] [build]: 4.90999e-06 [elim_shapecalc]: 1.309e-05 [elim_not_effective]: 1.476e-05 [opt_reshape]: 7.38e-06 [fold_const_symbol]: 1.164e-05 [renormalize]: 1.99972e-07 [detach_backward]: 4.15999e-06 [pipeline_parallel_scheduler]: 1.79e-06 [auto_monad_reorder]: 2.401e-05 [get_jit_bprop_graph]: 2.14999e-06 [rewriter_after_jit_bprop_graph]: 6.61e-06 [opt_after_jit_grad]: 0.00066118 [validate]: 4.913e-05 Sums bootstrap : 0.000449s : 0.50% type_inference : 0.005772s : 6.41% event_method : 0.000018s : 0.02% auto_monad : 0.000062s : 0.07% graph_reusing : 0.000006s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000053s : 0.06% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000037s : 0.04% optimize.rewriter_before_opt_a : 0.000104s : 0.12% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000053s : 0.06% optimize.opt_a.loop_unroll : 0.000038s : 0.04% optimize.opt_a.a_1 : 0.000856s : 0.95% optimize.opt_a.with_stream_mark : 0.000046s : 0.05% optimize.opt_a.recompute_prepare : 0.000019s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000228s : 0.25% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.02% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000016s : 0.02% optimize.opt_a.merge_send_recv : 0.000019s : 0.02% optimize.opt_a.auto_parallel : 0.000021s : 0.02% optimize.opt_a.parallel : 0.000030s : 0.03% optimize.opt_a.flash_sp : 0.000016s : 0.02% optimize.opt_a.merge_comm : 0.000009s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.02% optimize.opt_a.virtual_dataset : 0.000014s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000014s : 0.02% optimize.opt_a.merge_forward : 0.000011s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000028s : 0.03% optimize.opt_a.a_after_grad : 0.000023s : 0.03% optimize.opt_a.renormalize : 0.078859s : 87.54% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.04% optimize.opt_a.cse : 0.000047s : 0.05% optimize.opt_a.a_3 : 0.000130s : 0.14% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000048s : 0.05% optimize.convert_after_rewriter : 0.000010s : 0.01% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000742s : 0.82% optimize.opt_b.b_1 : 0.000171s : 0.19% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000019s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.03% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000033s : 0.04% optimize.loop_unroll : 0.000450s : 0.50% optimize.opt_after_cconv.c_1 : 0.000031s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000018s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.02% optimize.tuple_transform.d_1 : 0.000047s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000004s : 0.00% optimize.add_recomputation : 0.000054s : 0.06% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000005s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000003s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000003s : 0.00% optimize.full_micro_interleaved_order_control : 0.000004s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000025s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000003s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000028s : 0.03% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000005s : 0.01% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000024s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000661s : 0.73% validate : 0.000049s : 0.05% Time group info: ------[substitution.] 0.000253 34 17.16% : 0.000043s : 6: substitution.arithmetic_simplify 0.69% : 0.000002s : 2: substitution.elim_not_effective 0.68% : 0.000002s : 2: substitution.fold_const_symbol 2.27% : 0.000006s : 4: substitution.graph_param_transform 65.23% : 0.000165s : 4: substitution.inline 2.18% : 0.000006s : 4: substitution.j_node_and_user_rematch 2.29% : 0.000006s : 4: substitution.remove_not_recompute_node 2.81% : 0.000007s : 4: substitution.replace_old_param 6.69% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005717 2 88.02% : 0.005032s : 1: type_inference.infer 11.98% : 0.000685s : 1: type_inference.specialize ------[replace.] 0.000066 8 61.77% : 0.000041s : 4: replace.inline 38.23% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000177 8 91.51% : 0.000162s : 4: match.inline 8.49% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000229 1278 1.25% : 0.000003s : 13: predicate.accumulaten_eliminater 0.87% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 8: predicate.addn_check_dump 1.30% : 0.000003s : 13: predicate.addn_zero_filter 0.77% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.53% : 0.000006s : 21: predicate.arithmetic_simplify 1.10% : 0.000003s : 13: predicate.cast_eliminate 0.60% : 0.000001s : 8: predicate.check_bprop_eliminate 0.54% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.62% : 0.000001s : 8: predicate.depend_value_elim 1.01% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.00% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.19% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 4: predicate.elim_not_effective 0.34% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_depend_swap 1.64% : 0.000004s : 25: predicate.environ_get_eliminate 1.18% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.49% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.49% : 0.000006s : 21: predicate.float_depend_g_call 0.57% : 0.000001s : 8: predicate.float_environ_get_switch 0.88% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.63% : 0.000001s : 8: predicate.get_grad_eliminate 0.17% : 0.000000s : 4: predicate.graph_param_transform 0.51% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 5.83% : 0.000013s : 58: predicate.inline 0.78% : 0.000002s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.92% : 0.000002s : 8: predicate.less_batch_normalization 1.74% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.18% : 0.000005s : 38: predicate.load_eliminater 0.78% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.29% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.53% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 8: predicate.merge_addn 0.60% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.68% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.91% : 0.000002s : 13: predicate.minmaximum_grad 1.01% : 0.000002s : 4: predicate.mutable_eliminate 0.28% : 0.000001s : 4: predicate.opt_reshape 0.47% : 0.000001s : 4: predicate.parallel_virtual_node 1.60% : 0.000004s : 21: predicate.partial_defer_inline 1.52% : 0.000003s : 21: predicate.partial_eliminate 0.87% : 0.000002s : 13: predicate.print_const_string_wrapper 0.65% : 0.000001s : 8: predicate.reduce_all_const_elim 1.18% : 0.000003s : 13: predicate.reduce_eliminate 2.43% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.59% : 0.000001s : 8: predicate.remove_not_recompute_node 1.37% : 0.000003s : 25: predicate.replace_applicator 0.63% : 0.000001s : 8: predicate.replace_old_param 0.23% : 0.000001s : 4: predicate.reset_defer_inline 1.22% : 0.000003s : 13: predicate.reshape_eliminate 0.61% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 4: predicate.row_tensor_eliminate 0.79% : 0.000002s : 8: predicate.same_eliminate 0.41% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.73% : 0.000002s : 8: predicate.shard_identity_eliminate 0.78% : 0.000002s : 8: predicate.special_op_eliminate 0.97% : 0.000002s : 8: predicate.specialize_transform 1.00% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 1.27% : 0.000003s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.47% : 0.000003s : 21: predicate.switch_defer_inline 1.85% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.97% : 0.000011s : 67: predicate.switch_simplify 0.83% : 0.000002s : 13: predicate.tile_eliminate 1.04% : 0.000002s : 13: predicate.transpose_eliminate 1.72% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.29% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.40% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.74% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.21% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.07% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.64% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000650 11 45.78% : 0.000297s : 5: func_graph_cloner_run.FuncGraphClonerGraph 54.22% : 0.000352s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.262935 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.22% : 0.003211s : 1: add_attr 1.21% : 0.003189s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.02% : 0.000058s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.03% : 0.000071s : 1: auto_monad 0.01% : 0.000032s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.19% : 0.000494s : 1: bootstrap 0.01% : 0.000036s : 1: cconv 0.01% : 0.000028s : 1: comm_op_add_attrs 0.01% : 0.000019s : 1: control_data_broadcast_order 0.01% : 0.000013s : 1: convert_after_rewriter 0.01% : 0.000031s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000023s : 1: detach_backward 0.00% : 0.000011s : 1: environ_conv 0.01% : 0.000029s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.00% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.17% : 0.000456s : 1: loop_unroll 0.00% : 0.000006s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.28% : 0.000749s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000017s : 1: opt.transform.mutable_eliminate 0.51% : 0.001334s : 78: opt.transform.opt_a 0.01% : 0.000030s : 1: opt.transform.opt_after_cconv 0.01% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000107s : 28: opt.transform.opt_b 0.02% : 0.000052s : 2: opt.transform.opt_trans_graph 0.02% : 0.000043s : 4: opt.transform.symbol_engine_opt 31.03% : 0.081599s : 1: opt_a 0.05% : 0.000134s : 1: opt_after_cconv 0.26% : 0.000675s : 1: opt_after_jit_grad 0.11% : 0.000280s : 1: opt_b 32.17% : 0.084596s : 1: optimize 0.01% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000011s : 1: order_py_execute_after_rewriter 0.01% : 0.000032s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000007s : 1: partial_unused_args_eliminate 0.00% : 0.000009s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.02% : 0.000062s : 1: pre_auto_parallel 0.02% : 0.000041s : 1: py_interpret_to_execute 0.01% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 29.78% : 0.078306s : 1: renormalize.infer 0.20% : 0.000534s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000051s : 1: rewriter_after_opt_a 0.04% : 0.000108s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000009s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.00% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000118s : 1: symbol_engine_optimizer 0.04% : 0.000097s : 1: tuple_transform 2.21% : 0.005808s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:53.242.3 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0675891, [21] [bootstrap]: 0.00048143 [type_inference]: 0.0504458 [event_method]: 2.284e-05 [auto_monad]: 7.115e-05 [graph_reusing]: 6.28002e-06 [inline]: 3.01001e-06 [add_attr]: 0.0079708, [1] [add_attr_with_inline]: 0.00795948, [1] [Cycle 1]: 7.383e-05, [2] [tag_attr]: 2.2e-05 [meta_addattr_fg_expand]: 6.15002e-06 [parallel-infer-symbol]: 3.43e-06 [pre_auto_parallel]: 4.038e-05 [insert-virtual-dataset]: 2.49001e-06 [parallel-infer-symbol-second]: 7.10017e-07 [dataset_repeat_opt]: 2.16e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00783287, [53] [py_interpret_to_execute]: 2.972e-05 [rewriter_before_opt_a]: 8.625e-05 [opt_a]: 0.00563718, [2] [Cycle 1]: 0.00493699, [45] [expand_dump_flag]: 3.06001e-06 [switch_simplify]: 4.534e-05 [loop_unroll]: 3.016e-05 [a_1]: 0.0006639 [with_stream_mark]: 1.797e-05 [recompute_prepare]: 9.19e-06 [updatestate_depend_eliminate]: 4.27e-06 [updatestate_assign_eliminate]: 2.99001e-06 [updatestate_loads_eliminate]: 3.33e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 9.004e-05 [accelerated_algorithm]: 7.58001e-06 [shard]: 1.68002e-06 [meta_shard_fg_expand]: 1.72001e-06 [shard_inline]: 6.53e-06 [merge_send_recv]: 8.3e-06 [auto_parallel]: 6.53998e-06 [parallel]: 1.962e-05 [flash_sp]: 8.37e-06 [merge_comm]: 4.1e-06 [allreduce_fusion]: 3.73001e-06 [matmul_add_comm_reduction]: 9.67999e-06 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 8.94e-06 [virtual_dataset]: 6.84001e-06 [get_grad_eliminate_]: 6.21e-06 [virtual_output]: 6.30002e-06 [merge_forward]: 3.68999e-06 [cell_reuse_recompute_pass]: 1.18001e-06 [offload_activation]: 1.002e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.278e-05 [merge_recompute_call_nodes]: 1.54998e-06 [before_grad]: 1.15e-05 [set_forward_comm_id_for_comm_node_pass]: 3.71999e-06 [meta_fg_expand]: 2.68998e-06 [flash_sp_send_recv_attached]: 2.63e-06 [receive_attached]: 2.14999e-06 [after_resolve]: 1.138e-05 [a_after_grad]: 9.93998e-06 [renormalize]: 0.00352208 [add_forward_monad_depend]: 6.73998e-06 [auto_monad_grad]: 2.53998e-06 [auto_monad_eliminator]: 1.807e-05 [cse]: 3.046e-05 [a_3]: 5.037e-05 [Cycle 2]: 0.00068917, [45] [expand_dump_flag]: 1.15999e-06 [switch_simplify]: 8.35999e-06 [loop_unroll]: 6.17999e-06 [a_1]: 0.00014702 [with_stream_mark]: 1.374e-05 [recompute_prepare]: 6.54001e-06 [updatestate_depend_eliminate]: 3.56999e-06 [updatestate_assign_eliminate]: 2.46998e-06 [updatestate_loads_eliminate]: 2.83e-06 [parameter_eliminate]: 1.20999e-06 [a_2]: 7.942e-05 [accelerated_algorithm]: 6.49001e-06 [shard]: 1.34e-06 [meta_shard_fg_expand]: 1.54e-06 [shard_inline]: 6.53e-06 [merge_send_recv]: 5.57001e-06 [auto_parallel]: 6.48998e-06 [parallel]: 5.42001e-06 [flash_sp]: 3.58999e-06 [merge_comm]: 3.12002e-06 [allreduce_fusion]: 3.6e-06 [matmul_add_comm_reduction]: 6.54999e-06 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 1.003e-05 [virtual_dataset]: 6.34999e-06 [get_grad_eliminate_]: 5.77001e-06 [virtual_output]: 5.98998e-06 [merge_forward]: 3.58e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 7.92998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.196e-05 [merge_recompute_call_nodes]: 1.12e-06 [before_grad]: 9.52001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.21001e-06 [meta_fg_expand]: 2.49001e-06 [flash_sp_send_recv_attached]: 1.07e-06 [receive_attached]: 1.18001e-06 [after_resolve]: 1.102e-05 [a_after_grad]: 9.22999e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.40001e-06 [auto_monad_grad]: 9.89996e-07 [auto_monad_eliminator]: 7.45e-06 [cse]: 1.319e-05 [a_3]: 3.834e-05 [py_interpret_to_execute_after_opt_a]: 1.059e-05 [slice_cell_reuse_recomputed_activation]: 2.17001e-06 [rewriter_after_opt_a]: 4.132e-05 [convert_after_rewriter]: 7.01001e-06 [order_py_execute_after_rewriter]: 5.19e-06 [mutable_eliminate]: 0.00059775 [opt_b]: 0.00024799, [1] [Cycle 1]: 0.00024142, [7] [b_1]: 0.0001245 [b_2]: 7.93999e-06 [updatestate_depend_eliminate]: 5.27999e-06 [updatestate_assign_eliminate]: 2.41998e-06 [updatestate_loads_eliminate]: 2.81e-06 [renormalize]: 4.50003e-07 [cse]: 1.881e-05 [optimize_parallel_all_gather_comm]: 1.92e-05 [overlap_param_gather]: 2.30002e-06 [cconv]: 2.857e-05 [loop_unroll]: 0.0004499 [opt_after_cconv]: 0.000102, [1] [Cycle 1]: 9.552e-05, [7] [c_1]: 3.16e-05 [parameter_eliminate]: 2.98e-06 [updatestate_depend_eliminate]: 5.24998e-06 [updatestate_assign_eliminate]: 2.59999e-06 [updatestate_loads_eliminate]: 2.37999e-06 [cse]: 1.652e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.395e-05 [tuple_transform]: 7.522e-05, [1] [Cycle 1]: 7.096e-05, [4] [d_1]: 4.332e-05 [none_parameter_eliminate]: 1.77999e-06 [renormalize]: 8.00006e-07 [switch_simplify]: 6.71e-06 [partial_unused_args_eliminate]: 2.09999e-06 [add_recomputation]: 5.161e-05 [cse_after_recomputation]: 2.174e-05, [1] [Cycle 1]: 1.699e-05, [1] [cse]: 1.137e-05 [environ_conv]: 5.35999e-06 [swap_dp_allreduce_reducescatter]: 5.14e-06 [bias_add_comm_swap]: 3.01001e-06 [label_micro_interleaved_index]: 4.47e-06 [label_fine_grained_interleaved_index]: 3.21001e-06 [merge_cast_opt]: 1.46002e-06 [slice_recompute_activation]: 2.34999e-06 [micro_interleaved_order_control]: 2.55002e-06 [assign_add_opt]: 1.24e-06 [ForceFp32Comm]: 8.30012e-07 [remove_cast_before_assign_add]: 1.02e-06 [full_micro_interleaved_order_control]: 2.04999e-06 [reorder_send_recv_between_fp_bp]: 2.66e-06 [comm_op_add_attrs]: 1.23002e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.54e-06 [interleave_parallel_branches]: 1.02998e-06 [overlap_opt_shard_in_pipeline]: 1.24e-06 [overlap_opt_shard_grad_in_pipeline]: 1.74e-06 [control_data_broadcast_order]: 1.256e-05 [grouped_pairwise_exchange_alltoall]: 1.54e-06 [offloading_packed_experts]: 4.06001e-06 [overlap_recompute_and_grad_model_parallel]: 4.87998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.62999e-06 [overlap_recompute_comm]: 2.68998e-06 [overlap_grad_ring_attention]: 4.01001e-06 [overlap_grad_flash_sp]: 1.978e-05 [begin_end_overlap_inline]: 4.80009e-07 [split_matmul_comm_elemetwise]: 2.86e-06 [split_layernorm_comm]: 1.69998e-06 [handle_group_info]: 1.12e-06 [symbol_engine_optimizer]: 8.168e-05, [1] [Cycle 1]: 7.72e-05, [6] [build]: 3.83001e-06 [elim_shapecalc]: 1.056e-05 [elim_not_effective]: 1.408e-05 [opt_reshape]: 7.83999e-06 [fold_const_symbol]: 1.013e-05 [renormalize]: 3.80009e-07 [detach_backward]: 1.84e-06 [pipeline_parallel_scheduler]: 1.60001e-06 [auto_monad_reorder]: 1.6e-05 [get_jit_bprop_graph]: 1.77999e-06 [rewriter_after_jit_bprop_graph]: 3.92002e-06 [opt_after_jit_grad]: 0.00046917 [validate]: 4.526e-05 Sums bootstrap : 0.000481s : 0.82% type_inference : 0.050446s : 86.10% event_method : 0.000023s : 0.04% auto_monad : 0.000071s : 0.12% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000040s : 0.07% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000030s : 0.05% optimize.rewriter_before_opt_a : 0.000086s : 0.15% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000054s : 0.09% optimize.opt_a.loop_unroll : 0.000036s : 0.06% optimize.opt_a.a_1 : 0.000811s : 1.38% optimize.opt_a.with_stream_mark : 0.000032s : 0.05% optimize.opt_a.recompute_prepare : 0.000016s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000169s : 0.29% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.02% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.02% optimize.opt_a.merge_send_recv : 0.000014s : 0.02% optimize.opt_a.auto_parallel : 0.000013s : 0.02% optimize.opt_a.parallel : 0.000025s : 0.04% optimize.opt_a.flash_sp : 0.000012s : 0.02% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.03% optimize.opt_a.virtual_dataset : 0.000013s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.02% optimize.opt_a.virtual_output : 0.000012s : 0.02% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000018s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000022s : 0.04% optimize.opt_a.a_after_grad : 0.000019s : 0.03% optimize.opt_a.renormalize : 0.003522s : 6.01% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.04% optimize.opt_a.cse : 0.000044s : 0.07% optimize.opt_a.a_3 : 0.000089s : 0.15% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000041s : 0.07% optimize.convert_after_rewriter : 0.000007s : 0.01% optimize.order_py_execute_after_rewriter : 0.000005s : 0.01% optimize.mutable_eliminate : 0.000598s : 1.02% optimize.opt_b.b_1 : 0.000124s : 0.21% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000019s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.03% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000029s : 0.05% optimize.loop_unroll : 0.000450s : 0.77% optimize.opt_after_cconv.c_1 : 0.000032s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000017s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.02% optimize.tuple_transform.d_1 : 0.000043s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000001s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000052s : 0.09% optimize.cse_after_recomputation.cse : 0.000011s : 0.02% optimize.environ_conv : 0.000005s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000020s : 0.03% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000016s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000469s : 0.80% validate : 0.000045s : 0.08% Time group info: ------[substitution.] 0.000230 34 14.42% : 0.000033s : 6: substitution.arithmetic_simplify 0.79% : 0.000002s : 2: substitution.elim_not_effective 0.59% : 0.000001s : 2: substitution.fold_const_symbol 2.61% : 0.000006s : 4: substitution.graph_param_transform 68.96% : 0.000159s : 4: substitution.inline 1.84% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.35% : 0.000005s : 4: substitution.remove_not_recompute_node 1.79% : 0.000004s : 4: substitution.replace_old_param 6.65% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.050367 2 98.23% : 0.049473s : 1: type_inference.infer 1.77% : 0.000894s : 1: type_inference.specialize ------[replace.] 0.000062 8 64.06% : 0.000040s : 4: replace.inline 35.94% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000170 8 92.06% : 0.000156s : 4: match.inline 7.94% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000211 1278 0.95% : 0.000002s : 13: predicate.accumulaten_eliminater 0.75% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.63% : 0.000001s : 8: predicate.addn_check_dump 0.92% : 0.000002s : 13: predicate.addn_zero_filter 0.92% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.49% : 0.000005s : 21: predicate.arithmetic_simplify 0.93% : 0.000002s : 13: predicate.cast_eliminate 0.61% : 0.000001s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 1.00% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.05% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.98% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 4: predicate.elim_not_effective 0.50% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_depend_swap 1.71% : 0.000004s : 25: predicate.environ_get_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.40% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.40% : 0.000005s : 21: predicate.float_depend_g_call 0.52% : 0.000001s : 8: predicate.float_environ_get_switch 0.74% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.61% : 0.000001s : 8: predicate.get_grad_eliminate 0.22% : 0.000000s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 6.35% : 0.000013s : 58: predicate.inline 0.79% : 0.000002s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.78% : 0.000002s : 8: predicate.less_batch_normalization 1.75% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.50% : 0.000005s : 38: predicate.load_eliminater 0.88% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.26% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.88% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 8: predicate.merge_addn 0.56% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.61% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 13: predicate.minmaximum_grad 1.25% : 0.000003s : 4: predicate.mutable_eliminate 0.51% : 0.000001s : 4: predicate.opt_reshape 0.40% : 0.000001s : 4: predicate.parallel_virtual_node 1.75% : 0.000004s : 21: predicate.partial_defer_inline 1.60% : 0.000003s : 21: predicate.partial_eliminate 0.86% : 0.000002s : 13: predicate.print_const_string_wrapper 0.64% : 0.000001s : 8: predicate.reduce_all_const_elim 1.12% : 0.000002s : 13: predicate.reduce_eliminate 2.61% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 8: predicate.remove_not_recompute_node 1.34% : 0.000003s : 25: predicate.replace_applicator 0.60% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 0.94% : 0.000002s : 13: predicate.reshape_eliminate 0.63% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 4: predicate.row_tensor_eliminate 0.85% : 0.000002s : 8: predicate.same_eliminate 0.43% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.95% : 0.000002s : 8: predicate.shard_identity_eliminate 0.71% : 0.000001s : 8: predicate.special_op_eliminate 0.70% : 0.000001s : 8: predicate.specialize_transform 0.93% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.56% : 0.000003s : 21: predicate.switch_defer_inline 2.18% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.94% : 0.000010s : 67: predicate.switch_simplify 0.89% : 0.000002s : 13: predicate.tile_eliminate 0.91% : 0.000002s : 13: predicate.transpose_eliminate 1.60% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.29% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.59% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.13% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.63% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.45% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.01% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 4: predicate.value_based_eliminate 0.60% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.68% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000668 11 48.41% : 0.000323s : 5: func_graph_cloner_run.FuncGraphClonerGraph 51.59% : 0.000344s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.088296 192 0.00% : 0.000004s : 1: ForceFp32Comm 9.03% : 0.007977s : 1: add_attr 9.02% : 0.007964s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.06% : 0.000056s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.09% : 0.000076s : 1: auto_monad 0.02% : 0.000020s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.58% : 0.000510s : 1: bootstrap 0.04% : 0.000032s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000016s : 1: control_data_broadcast_order 0.01% : 0.000010s : 1: convert_after_rewriter 0.03% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000008s : 1: environ_conv 0.03% : 0.000030s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.52% : 0.000458s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.69% : 0.000606s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000016s : 1: opt.transform.mutable_eliminate 1.42% : 0.001251s : 78: opt.transform.opt_a 0.03% : 0.000030s : 1: opt.transform.opt_after_cconv 0.03% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.11% : 0.000101s : 28: opt.transform.opt_b 0.05% : 0.000048s : 2: opt.transform.opt_trans_graph 0.04% : 0.000039s : 4: opt.transform.symbol_engine_opt 6.39% : 0.005640s : 1: opt_a 0.12% : 0.000105s : 1: opt_after_cconv 0.54% : 0.000478s : 1: opt_after_jit_grad 0.28% : 0.000251s : 1: opt_b 8.88% : 0.007838s : 1: optimize 0.03% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.03% : 0.000023s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.05% : 0.000045s : 1: pre_auto_parallel 0.04% : 0.000034s : 1: py_interpret_to_execute 0.02% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000017s : 1: remove_dup_value 3.56% : 0.003147s : 1: renormalize.infer 0.41% : 0.000364s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000046s : 1: rewriter_after_opt_a 0.10% : 0.000090s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.10% : 0.000085s : 1: symbol_engine_optimizer 0.09% : 0.000078s : 1: tuple_transform 57.16% : 0.050472s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:54.679.45 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:54.682.05 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0178999, [21] [bootstrap]: 0.00043948 [type_inference]: 0.00604022 [event_method]: 1.89e-05 [auto_monad]: 6.514e-05 [graph_reusing]: 5.96e-06 [inline]: 2.36e-06 [add_attr]: 0.00329961, [1] [add_attr_with_inline]: 0.00328877, [1] [Cycle 1]: 8.523e-05, [2] [tag_attr]: 2.229e-05 [meta_addattr_fg_expand]: 5.81e-06 [parallel-infer-symbol]: 3.26001e-06 [pre_auto_parallel]: 3.817e-05 [insert-virtual-dataset]: 2.41e-06 [parallel-infer-symbol-second]: 7.00005e-07 [dataset_repeat_opt]: 2.36e-06 [pipeline_split]: 1.79998e-06 [optimize]: 0.0064157, [53] [py_interpret_to_execute]: 3.29e-05 [rewriter_before_opt_a]: 9.393e-05 [opt_a]: 0.00350519, [2] [Cycle 1]: 0.00252733, [45] [expand_dump_flag]: 3.11999e-06 [switch_simplify]: 4.366e-05 [loop_unroll]: 3.143e-05 [a_1]: 0.0006771 [with_stream_mark]: 2.027e-05 [recompute_prepare]: 1.038e-05 [updatestate_depend_eliminate]: 4.89e-06 [updatestate_assign_eliminate]: 3.90998e-06 [updatestate_loads_eliminate]: 3.21999e-06 [parameter_eliminate]: 2.09999e-06 [a_2]: 0.00011866 [accelerated_algorithm]: 7.52002e-06 [shard]: 2.32999e-06 [meta_shard_fg_expand]: 1.76998e-06 [shard_inline]: 6.79999e-06 [merge_send_recv]: 1.043e-05 [auto_parallel]: 8.08999e-06 [parallel]: 2.037e-05 [flash_sp]: 9.49e-06 [merge_comm]: 4.45999e-06 [allreduce_fusion]: 3.77998e-06 [matmul_add_comm_reduction]: 1.076e-05 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 8.51002e-06 [virtual_dataset]: 6.64001e-06 [get_grad_eliminate_]: 6.66e-06 [virtual_output]: 6.76999e-06 [merge_forward]: 3.85e-06 [cell_reuse_recompute_pass]: 1.45999e-06 [offload_activation]: 1.11e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.623e-05 [merge_recompute_call_nodes]: 2.03002e-06 [before_grad]: 1.143e-05 [set_forward_comm_id_for_comm_node_pass]: 4.15e-06 [meta_fg_expand]: 2.92002e-06 [flash_sp_send_recv_attached]: 3.05002e-06 [receive_attached]: 2.09e-06 [after_resolve]: 2.214e-05 [a_after_grad]: 1.087e-05 [renormalize]: 0.00085004 [add_forward_monad_depend]: 8.66002e-06 [auto_monad_grad]: 2.64001e-06 [auto_monad_eliminator]: 2.065e-05 [cse]: 3.154e-05 [a_3]: 6.801e-05 [Cycle 2]: 0.00095955, [45] [expand_dump_flag]: 1.92001e-06 [switch_simplify]: 8.57998e-06 [loop_unroll]: 6.79999e-06 [a_1]: 0.00015216 [with_stream_mark]: 1.575e-05 [recompute_prepare]: 7.51999e-06 [updatestate_depend_eliminate]: 3.50003e-06 [updatestate_assign_eliminate]: 2.78e-06 [updatestate_loads_eliminate]: 2.59001e-06 [parameter_eliminate]: 1.74998e-06 [a_2]: 0.00010909 [accelerated_algorithm]: 6.58998e-06 [shard]: 1.70001e-06 [meta_shard_fg_expand]: 1.67001e-06 [shard_inline]: 7.45e-06 [merge_send_recv]: 8.40001e-06 [auto_parallel]: 9.48002e-06 [parallel]: 7.16999e-06 [flash_sp]: 4.27998e-06 [merge_comm]: 3.44001e-06 [allreduce_fusion]: 3.86001e-06 [matmul_add_comm_reduction]: 8.76002e-06 [allreduce_slice_to_reducescatter]: 5.09986e-07 [virtual_shard_identity]: 8.50001e-06 [virtual_dataset]: 7.02002e-06 [get_grad_eliminate_]: 6.83998e-06 [virtual_output]: 6.74001e-06 [merge_forward]: 5.04e-06 [cell_reuse_recompute_pass]: 2.63e-06 [offload_activation]: 1.141e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.549e-05 [merge_recompute_call_nodes]: 1.20001e-06 [before_grad]: 1.131e-05 [set_forward_comm_id_for_comm_node_pass]: 3.65e-06 [meta_fg_expand]: 2.59001e-06 [flash_sp_send_recv_attached]: 1.19998e-06 [receive_attached]: 1.99999e-06 [after_resolve]: 1.276e-05 [a_after_grad]: 1.163e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 2.60002e-06 [auto_monad_grad]: 2.71e-06 [auto_monad_eliminator]: 1.092e-05 [cse]: 1.924e-05 [a_3]: 5.21e-05 [py_interpret_to_execute_after_opt_a]: 1.917e-05 [slice_cell_reuse_recomputed_activation]: 5.50001e-06 [rewriter_after_opt_a]: 4.689e-05 [convert_after_rewriter]: 9.92999e-06 [order_py_execute_after_rewriter]: 9.87001e-06 [mutable_eliminate]: 0.00081443 [opt_b]: 0.00030887, [1] [Cycle 1]: 0.00029737, [7] [b_1]: 0.00018302 [b_2]: 1.053e-05 [updatestate_depend_eliminate]: 9.78002e-06 [updatestate_assign_eliminate]: 3.6e-06 [updatestate_loads_eliminate]: 2.89999e-06 [renormalize]: 7.7e-07 [cse]: 2.82e-05 [optimize_parallel_all_gather_comm]: 2.397e-05 [overlap_param_gather]: 5.42001e-06 [cconv]: 4.15e-05 [loop_unroll]: 0.0005769 [opt_after_cconv]: 0.00014797, [1] [Cycle 1]: 0.00013707, [7] [c_1]: 3.459e-05 [parameter_eliminate]: 5.86e-06 [updatestate_depend_eliminate]: 7.74002e-06 [updatestate_assign_eliminate]: 2.78998e-06 [updatestate_loads_eliminate]: 2.71e-06 [cse]: 2.414e-05 [renormalize]: 6.19999e-07 [remove_dup_value]: 1.987e-05 [tuple_transform]: 9.999e-05, [1] [Cycle 1]: 9.247e-05, [4] [d_1]: 5.14e-05 [none_parameter_eliminate]: 1.75001e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 7.62998e-06 [partial_unused_args_eliminate]: 4.96997e-06 [add_recomputation]: 6.089e-05 [cse_after_recomputation]: 3.005e-05, [1] [Cycle 1]: 2.272e-05, [1] [cse]: 1.312e-05 [environ_conv]: 9.74e-06 [swap_dp_allreduce_reducescatter]: 7.96001e-06 [bias_add_comm_swap]: 6.48e-06 [label_micro_interleaved_index]: 8.48999e-06 [label_fine_grained_interleaved_index]: 6.12999e-06 [merge_cast_opt]: 4.37e-06 [slice_recompute_activation]: 4.60001e-06 [micro_interleaved_order_control]: 4.92999e-06 [assign_add_opt]: 3.52997e-06 [ForceFp32Comm]: 3.35998e-06 [remove_cast_before_assign_add]: 3.53e-06 [full_micro_interleaved_order_control]: 5.00999e-06 [reorder_send_recv_between_fp_bp]: 6.68998e-06 [comm_op_add_attrs]: 3.43e-06 [add_comm_op_reuse_tag]: 3.69002e-06 [interleave_split_concat_branches]: 4.27e-06 [interleave_parallel_branches]: 3.96001e-06 [overlap_opt_shard_in_pipeline]: 4.1e-06 [overlap_opt_shard_grad_in_pipeline]: 4.85999e-06 [control_data_broadcast_order]: 1.9e-05 [grouped_pairwise_exchange_alltoall]: 4.25e-06 [offloading_packed_experts]: 7.65e-06 [overlap_recompute_and_grad_model_parallel]: 8.19002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.92998e-06 [overlap_recompute_allgather_and_fa_grad]: 4.07998e-06 [overlap_recompute_comm]: 5.47999e-06 [overlap_grad_ring_attention]: 7.74002e-06 [overlap_grad_flash_sp]: 2.816e-05 [begin_end_overlap_inline]: 3.37002e-06 [split_matmul_comm_elemetwise]: 4.90001e-06 [split_layernorm_comm]: 4.43001e-06 [handle_group_info]: 3.63e-06 [symbol_engine_optimizer]: 0.00011461, [1] [Cycle 1]: 0.0001067, [6] [build]: 4.80999e-06 [elim_shapecalc]: 1.481e-05 [elim_not_effective]: 1.589e-05 [opt_reshape]: 8.47998e-06 [fold_const_symbol]: 1.035e-05 [renormalize]: 5.3001e-07 [detach_backward]: 5.00001e-06 [pipeline_parallel_scheduler]: 2.01e-06 [auto_monad_reorder]: 2.39e-05 [get_jit_bprop_graph]: 2.53e-06 [rewriter_after_jit_bprop_graph]: 7.7e-06 [opt_after_jit_grad]: 0.00081617 [validate]: 5.425e-05 Sums bootstrap : 0.000439s : 3.47% type_inference : 0.006040s : 47.72% event_method : 0.000019s : 0.15% auto_monad : 0.000065s : 0.51% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000038s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.26% optimize.rewriter_before_opt_a : 0.000094s : 0.74% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000052s : 0.41% optimize.opt_a.loop_unroll : 0.000038s : 0.30% optimize.opt_a.a_1 : 0.000829s : 6.55% optimize.opt_a.with_stream_mark : 0.000036s : 0.28% optimize.opt_a.recompute_prepare : 0.000018s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000228s : 1.80% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.11% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.11% optimize.opt_a.merge_send_recv : 0.000019s : 0.15% optimize.opt_a.auto_parallel : 0.000018s : 0.14% optimize.opt_a.parallel : 0.000028s : 0.22% optimize.opt_a.flash_sp : 0.000014s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000008s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.13% optimize.opt_a.virtual_dataset : 0.000014s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.11% optimize.opt_a.virtual_output : 0.000014s : 0.11% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000023s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000023s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000035s : 0.28% optimize.opt_a.a_after_grad : 0.000022s : 0.18% optimize.opt_a.renormalize : 0.000850s : 6.72% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.09% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.25% optimize.opt_a.cse : 0.000051s : 0.40% optimize.opt_a.a_3 : 0.000120s : 0.95% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.04% optimize.rewriter_after_opt_a : 0.000047s : 0.37% optimize.convert_after_rewriter : 0.000010s : 0.08% optimize.order_py_execute_after_rewriter : 0.000010s : 0.08% optimize.mutable_eliminate : 0.000814s : 6.43% optimize.opt_b.b_1 : 0.000183s : 1.45% optimize.opt_b.b_2 : 0.000011s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000028s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000041s : 0.33% optimize.loop_unroll : 0.000577s : 4.56% optimize.opt_after_cconv.c_1 : 0.000035s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000024s : 0.19% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.16% optimize.tuple_transform.d_1 : 0.000051s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000061s : 0.48% optimize.cse_after_recomputation.cse : 0.000013s : 0.10% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.06% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000007s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000019s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.06% optimize.overlap_grad_flash_sp : 0.000028s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000005s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000005s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000024s : 0.19% get_jit_bprop_graph : 0.000003s : 0.02% rewriter_after_jit_bprop_graph : 0.000008s : 0.06% opt_after_jit_grad : 0.000816s : 6.45% validate : 0.000054s : 0.43% Time group info: ------[substitution.] 0.000254 34 15.32% : 0.000039s : 6: substitution.arithmetic_simplify 0.91% : 0.000002s : 2: substitution.elim_not_effective 0.52% : 0.000001s : 2: substitution.fold_const_symbol 2.62% : 0.000007s : 4: substitution.graph_param_transform 63.81% : 0.000162s : 4: substitution.inline 1.98% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.38% : 0.000006s : 4: substitution.remove_not_recompute_node 6.04% : 0.000015s : 4: substitution.replace_old_param 6.42% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005988 2 88.02% : 0.005270s : 1: type_inference.infer 11.98% : 0.000717s : 1: type_inference.specialize ------[replace.] 0.000067 8 65.68% : 0.000044s : 4: replace.inline 34.32% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000174 8 91.69% : 0.000159s : 4: match.inline 8.31% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000229 1278 0.97% : 0.000002s : 13: predicate.accumulaten_eliminater 1.24% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.58% : 0.000001s : 8: predicate.addn_check_dump 0.97% : 0.000002s : 13: predicate.addn_zero_filter 0.73% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.49% : 0.000006s : 21: predicate.arithmetic_simplify 0.99% : 0.000002s : 13: predicate.cast_eliminate 0.64% : 0.000001s : 8: predicate.check_bprop_eliminate 0.54% : 0.000001s : 8: predicate.compare_switch_simplify 0.20% : 0.000000s : 4: predicate.const_output_eliminate 0.62% : 0.000001s : 8: predicate.depend_value_elim 0.97% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.95% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.81% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.48% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.38% : 0.000001s : 4: predicate.elim_not_effective 0.51% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 17: predicate.environ_get_add_eliminate 0.96% : 0.000002s : 17: predicate.environ_get_depend_swap 1.56% : 0.000004s : 25: predicate.environ_get_eliminate 1.00% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.30% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.35% : 0.000005s : 21: predicate.float_depend_g_call 0.51% : 0.000001s : 8: predicate.float_environ_get_switch 0.84% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.74% : 0.000002s : 8: predicate.get_grad_eliminate 0.17% : 0.000000s : 4: predicate.graph_param_transform 0.54% : 0.000001s : 8: predicate.incorporate_call 0.44% : 0.000001s : 8: predicate.incorporate_call_switch 5.88% : 0.000013s : 58: predicate.inline 1.06% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.82% : 0.000002s : 8: predicate.less_batch_normalization 1.77% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.31% : 0.000005s : 38: predicate.load_eliminater 1.12% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.23% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.71% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.59% : 0.000001s : 8: predicate.merge_addn 0.67% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 13: predicate.minmaximum_grad 1.41% : 0.000003s : 4: predicate.mutable_eliminate 0.61% : 0.000001s : 4: predicate.opt_reshape 0.54% : 0.000001s : 4: predicate.parallel_virtual_node 1.66% : 0.000004s : 21: predicate.partial_defer_inline 1.46% : 0.000003s : 21: predicate.partial_eliminate 0.89% : 0.000002s : 13: predicate.print_const_string_wrapper 0.64% : 0.000001s : 8: predicate.reduce_all_const_elim 1.16% : 0.000003s : 13: predicate.reduce_eliminate 2.38% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.54% : 0.000001s : 8: predicate.remove_not_recompute_node 1.37% : 0.000003s : 25: predicate.replace_applicator 0.38% : 0.000001s : 8: predicate.replace_old_param 0.43% : 0.000001s : 4: predicate.reset_defer_inline 0.83% : 0.000002s : 13: predicate.reshape_eliminate 0.69% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.65% : 0.000001s : 4: predicate.row_tensor_eliminate 0.98% : 0.000002s : 8: predicate.same_eliminate 0.40% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.87% : 0.000002s : 8: predicate.shard_identity_eliminate 0.76% : 0.000002s : 8: predicate.special_op_eliminate 0.72% : 0.000002s : 8: predicate.specialize_transform 1.28% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 1.05% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.32% : 0.000003s : 21: predicate.switch_defer_inline 1.91% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.72% : 0.000011s : 67: predicate.switch_simplify 0.82% : 0.000002s : 13: predicate.tile_eliminate 0.88% : 0.000002s : 13: predicate.transpose_eliminate 1.44% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.42% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.36% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.41% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.86% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.22% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.83% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 0.66% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 8: predicate.virtual_output_eliminate 0.23% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.51% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000563 11 51.47% : 0.000290s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.53% : 0.000273s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.029939 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.05% : 0.003309s : 1: add_attr 11.00% : 0.003293s : 1: add_attr_with_inline 0.03% : 0.000008s : 1: add_comm_op_reuse_tag 0.22% : 0.000065s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000074s : 1: auto_monad 0.10% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.65% : 0.000493s : 1: bootstrap 0.15% : 0.000045s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000023s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.11% : 0.000033s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000025s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.10% : 0.000029s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000007s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.95% : 0.000585s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.75% : 0.000824s : 1: mutable_eliminate 0.04% : 0.000011s : 1: offloading_packed_experts 0.07% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000025s : 1: opt.transform.mutable_eliminate 4.36% : 0.001304s : 78: opt.transform.opt_a 0.11% : 0.000033s : 1: opt.transform.opt_after_cconv 0.13% : 0.000039s : 1: opt.transform.opt_after_jit_grad 0.38% : 0.000114s : 28: opt.transform.opt_b 0.19% : 0.000057s : 2: opt.transform.opt_trans_graph 0.15% : 0.000045s : 4: opt.transform.symbol_engine_opt 11.72% : 0.003509s : 1: opt_a 0.51% : 0.000152s : 1: opt_after_cconv 2.78% : 0.000832s : 1: opt_after_jit_grad 1.05% : 0.000313s : 1: opt_b 22.70% : 0.006795s : 1: optimize 0.09% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.11% : 0.000032s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000011s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000009s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000046s : 1: pre_auto_parallel 0.12% : 0.000037s : 1: py_interpret_to_execute 0.07% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000023s : 1: remove_dup_value 1.50% : 0.000449s : 1: renormalize.infer 1.30% : 0.000391s : 1: renormalize.specialize 0.03% : 0.000010s : 1: reorder_send_recv_between_fp_bp 0.05% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000051s : 1: rewriter_after_opt_a 0.33% : 0.000098s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000008s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000118s : 1: symbol_engine_optimizer 0.34% : 0.000103s : 1: tuple_transform 20.31% : 0.006081s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:54.938.509 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0172404, [21] [bootstrap]: 0.00046542 [type_inference]: 0.00649432 [event_method]: 1.993e-05 [auto_monad]: 6.553e-05 [graph_reusing]: 6.17999e-06 [inline]: 2.73003e-06 [add_attr]: 0.00401091, [1] [add_attr_with_inline]: 0.00399903, [1] [Cycle 1]: 7.242e-05, [2] [tag_attr]: 2.495e-05 [meta_addattr_fg_expand]: 5.69999e-06 [parallel-infer-symbol]: 3.7e-06 [pre_auto_parallel]: 3.998e-05 [insert-virtual-dataset]: 2.74001e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 2.07999e-06 [pipeline_split]: 1.74e-06 [optimize]: 0.00531353, [53] [py_interpret_to_execute]: 3.013e-05 [rewriter_before_opt_a]: 9.107e-05 [opt_a]: 0.00291367, [2] [Cycle 1]: 0.00220625, [45] [expand_dump_flag]: 3.13e-06 [switch_simplify]: 4.341e-05 [loop_unroll]: 3.096e-05 [a_1]: 0.00067218 [with_stream_mark]: 1.677e-05 [recompute_prepare]: 1.034e-05 [updatestate_depend_eliminate]: 3.84002e-06 [updatestate_assign_eliminate]: 3.51001e-06 [updatestate_loads_eliminate]: 2.93e-06 [parameter_eliminate]: 1.92999e-06 [a_2]: 8.861e-05 [accelerated_algorithm]: 7.33999e-06 [shard]: 2.17999e-06 [meta_shard_fg_expand]: 1.80001e-06 [shard_inline]: 6.56e-06 [merge_send_recv]: 9.20001e-06 [auto_parallel]: 7.83001e-06 [parallel]: 2.108e-05 [flash_sp]: 8.45001e-06 [merge_comm]: 3.63e-06 [allreduce_fusion]: 3.58999e-06 [matmul_add_comm_reduction]: 9.96e-06 [allreduce_slice_to_reducescatter]: 8.39995e-07 [virtual_shard_identity]: 8.15e-06 [virtual_dataset]: 6.66999e-06 [get_grad_eliminate_]: 6.63e-06 [virtual_output]: 7e-06 [merge_forward]: 3.80998e-06 [cell_reuse_recompute_pass]: 1.50999e-06 [offload_activation]: 1.137e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.185e-05 [merge_recompute_call_nodes]: 1.50001e-06 [before_grad]: 1.084e-05 [set_forward_comm_id_for_comm_node_pass]: 3.65e-06 [meta_fg_expand]: 3.23998e-06 [flash_sp_send_recv_attached]: 2.94001e-06 [receive_attached]: 2.61999e-06 [after_resolve]: 1.195e-05 [a_after_grad]: 1.092e-05 [renormalize]: 0.00078637 [add_forward_monad_depend]: 5.36002e-06 [auto_monad_grad]: 2.79001e-06 [auto_monad_eliminator]: 1.64e-05 [cse]: 3.138e-05 [a_3]: 4.926e-05 [Cycle 2]: 0.00069597, [45] [expand_dump_flag]: 1.94e-06 [switch_simplify]: 7.63999e-06 [loop_unroll]: 6.39001e-06 [a_1]: 0.00015014 [with_stream_mark]: 1.325e-05 [recompute_prepare]: 6.73e-06 [updatestate_depend_eliminate]: 3.61999e-06 [updatestate_assign_eliminate]: 2.77002e-06 [updatestate_loads_eliminate]: 2.73e-06 [parameter_eliminate]: 1.28002e-06 [a_2]: 7.964e-05 [accelerated_algorithm]: 6.56999e-06 [shard]: 1.22999e-06 [meta_shard_fg_expand]: 1.49e-06 [shard_inline]: 6.15002e-06 [merge_send_recv]: 5.67001e-06 [auto_parallel]: 6.49999e-06 [parallel]: 5.36002e-06 [flash_sp]: 3.43999e-06 [merge_comm]: 3.45e-06 [allreduce_fusion]: 3.45e-06 [matmul_add_comm_reduction]: 6.54001e-06 [allreduce_slice_to_reducescatter]: 7.30011e-07 [virtual_shard_identity]: 6.79001e-06 [virtual_dataset]: 6.07001e-06 [get_grad_eliminate_]: 6.05002e-06 [virtual_output]: 5.91e-06 [merge_forward]: 2.91e-06 [cell_reuse_recompute_pass]: 1.88002e-06 [offload_activation]: 7.75998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.213e-05 [merge_recompute_call_nodes]: 9.39996e-07 [before_grad]: 9.80002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.43e-06 [meta_fg_expand]: 2.29001e-06 [flash_sp_send_recv_attached]: 1.02998e-06 [receive_attached]: 1.72001e-06 [after_resolve]: 1.127e-05 [a_after_grad]: 1.074e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.29e-06 [auto_monad_grad]: 1.66e-06 [auto_monad_eliminator]: 7.57998e-06 [cse]: 1.612e-05 [a_3]: 3.781e-05 [py_interpret_to_execute_after_opt_a]: 1.211e-05 [slice_cell_reuse_recomputed_activation]: 1.84998e-06 [rewriter_after_opt_a]: 7.001e-05 [convert_after_rewriter]: 7.36999e-06 [order_py_execute_after_rewriter]: 5.12999e-06 [mutable_eliminate]: 0.00071682 [opt_b]: 0.00021155, [1] [Cycle 1]: 0.00020417, [7] [b_1]: 0.00012657 [b_2]: 8.43001e-06 [updatestate_depend_eliminate]: 6.87002e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.74001e-06 [renormalize]: 8.39995e-07 [cse]: 2.077e-05 [optimize_parallel_all_gather_comm]: 1.852e-05 [overlap_param_gather]: 1.91003e-06 [cconv]: 3.048e-05 [loop_unroll]: 0.00050926 [opt_after_cconv]: 0.00010975, [1] [Cycle 1]: 0.0001029, [7] [c_1]: 3.296e-05 [parameter_eliminate]: 4.36002e-06 [updatestate_depend_eliminate]: 5.74e-06 [updatestate_assign_eliminate]: 2.61e-06 [updatestate_loads_eliminate]: 2.46e-06 [cse]: 1.946e-05 [renormalize]: 3.30008e-07 [remove_dup_value]: 1.446e-05 [tuple_transform]: 7.914e-05, [1] [Cycle 1]: 7.369e-05, [4] [d_1]: 4.656e-05 [none_parameter_eliminate]: 1.60999e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 7.02002e-06 [partial_unused_args_eliminate]: 2.08998e-06 [add_recomputation]: 5.396e-05 [cse_after_recomputation]: 2.207e-05, [1] [Cycle 1]: 1.714e-05, [1] [cse]: 1.153e-05 [environ_conv]: 5.51002e-06 [swap_dp_allreduce_reducescatter]: 4.90001e-06 [bias_add_comm_swap]: 2.83e-06 [label_micro_interleaved_index]: 5.12e-06 [label_fine_grained_interleaved_index]: 3.28e-06 [merge_cast_opt]: 1.66e-06 [slice_recompute_activation]: 2.24001e-06 [micro_interleaved_order_control]: 2.34999e-06 [assign_add_opt]: 1.47999e-06 [ForceFp32Comm]: 1.07998e-06 [remove_cast_before_assign_add]: 1.12e-06 [full_micro_interleaved_order_control]: 2.09e-06 [reorder_send_recv_between_fp_bp]: 2.69999e-06 [comm_op_add_attrs]: 9.70002e-07 [add_comm_op_reuse_tag]: 9.79984e-07 [interleave_split_concat_branches]: 1.08001e-06 [interleave_parallel_branches]: 1.16002e-06 [overlap_opt_shard_in_pipeline]: 1.17e-06 [overlap_opt_shard_grad_in_pipeline]: 2.14999e-06 [control_data_broadcast_order]: 1.287e-05 [grouped_pairwise_exchange_alltoall]: 1.50999e-06 [offloading_packed_experts]: 3.77998e-06 [overlap_recompute_and_grad_model_parallel]: 5.24e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.49e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.22999e-06 [overlap_grad_ring_attention]: 4.17e-06 [overlap_grad_flash_sp]: 2.158e-05 [begin_end_overlap_inline]: 5.89993e-07 [split_matmul_comm_elemetwise]: 2.33998e-06 [split_layernorm_comm]: 1.60999e-06 [handle_group_info]: 9.99979e-07 [symbol_engine_optimizer]: 8.076e-05, [1] [Cycle 1]: 7.626e-05, [6] [build]: 3.85998e-06 [elim_shapecalc]: 1.065e-05 [elim_not_effective]: 1.425e-05 [opt_reshape]: 7.61001e-06 [fold_const_symbol]: 1.01e-05 [renormalize]: 1.90019e-07 [detach_backward]: 2.14999e-06 [pipeline_parallel_scheduler]: 1.64e-06 [auto_monad_reorder]: 1.691e-05 [get_jit_bprop_graph]: 2.33002e-06 [rewriter_after_jit_bprop_graph]: 4.83001e-06 [opt_after_jit_grad]: 0.00055443 [validate]: 4.4e-05 Sums bootstrap : 0.000465s : 3.81% type_inference : 0.006494s : 53.15% event_method : 0.000020s : 0.16% auto_monad : 0.000066s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000040s : 0.33% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.25% optimize.rewriter_before_opt_a : 0.000091s : 0.75% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000051s : 0.42% optimize.opt_a.loop_unroll : 0.000037s : 0.31% optimize.opt_a.a_1 : 0.000822s : 6.73% optimize.opt_a.with_stream_mark : 0.000030s : 0.25% optimize.opt_a.recompute_prepare : 0.000017s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000168s : 1.38% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.11% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.10% optimize.opt_a.merge_send_recv : 0.000015s : 0.12% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000026s : 0.22% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000007s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.12% optimize.opt_a.virtual_dataset : 0.000013s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.10% optimize.opt_a.virtual_output : 0.000013s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.20% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.17% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000023s : 0.19% optimize.opt_a.a_after_grad : 0.000022s : 0.18% optimize.opt_a.renormalize : 0.000786s : 6.44% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.05% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.20% optimize.opt_a.cse : 0.000048s : 0.39% optimize.opt_a.a_3 : 0.000087s : 0.71% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000070s : 0.57% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.04% optimize.mutable_eliminate : 0.000717s : 5.87% optimize.opt_b.b_1 : 0.000127s : 1.04% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000021s : 0.17% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000030s : 0.25% optimize.loop_unroll : 0.000509s : 4.17% optimize.opt_after_cconv.c_1 : 0.000033s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.16% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.12% optimize.tuple_transform.d_1 : 0.000047s : 0.38% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000054s : 0.44% optimize.cse_after_recomputation.cse : 0.000012s : 0.09% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.04% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.03% optimize.overlap_grad_flash_sp : 0.000022s : 0.18% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000017s : 0.14% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000554s : 4.54% validate : 0.000044s : 0.36% Time group info: ------[substitution.] 0.000231 34 14.84% : 0.000034s : 6: substitution.arithmetic_simplify 1.02% : 0.000002s : 2: substitution.elim_not_effective 0.60% : 0.000001s : 2: substitution.fold_const_symbol 2.72% : 0.000006s : 4: substitution.graph_param_transform 67.89% : 0.000157s : 4: substitution.inline 1.93% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.03% : 0.000005s : 4: substitution.remove_not_recompute_node 2.31% : 0.000005s : 4: substitution.replace_old_param 6.67% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006386 2 87.57% : 0.005592s : 1: type_inference.infer 12.43% : 0.000794s : 1: type_inference.specialize ------[replace.] 0.000066 8 64.59% : 0.000042s : 4: replace.inline 35.41% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000168 8 91.90% : 0.000154s : 4: match.inline 8.10% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000213 1278 1.02% : 0.000002s : 13: predicate.accumulaten_eliminater 0.84% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 8: predicate.addn_check_dump 0.85% : 0.000002s : 13: predicate.addn_zero_filter 0.82% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.54% : 0.000005s : 21: predicate.arithmetic_simplify 0.96% : 0.000002s : 13: predicate.cast_eliminate 0.55% : 0.000001s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.66% : 0.000001s : 8: predicate.depend_value_elim 0.92% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.09% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.98% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 4: predicate.elim_not_effective 0.41% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_depend_swap 1.91% : 0.000004s : 25: predicate.environ_get_eliminate 1.14% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.45% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.53% : 0.000005s : 21: predicate.float_depend_g_call 0.52% : 0.000001s : 8: predicate.float_environ_get_switch 0.78% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.70% : 0.000002s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.58% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 6.05% : 0.000013s : 58: predicate.inline 0.82% : 0.000002s : 8: predicate.inline_without_move 0.33% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.87% : 0.000002s : 8: predicate.less_batch_normalization 1.72% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.57% : 0.000005s : 38: predicate.load_eliminater 0.81% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.33% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.67% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 8: predicate.merge_addn 0.63% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.80% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 13: predicate.minmaximum_grad 1.21% : 0.000003s : 4: predicate.mutable_eliminate 0.38% : 0.000001s : 4: predicate.opt_reshape 0.46% : 0.000001s : 4: predicate.parallel_virtual_node 1.90% : 0.000004s : 21: predicate.partial_defer_inline 1.60% : 0.000003s : 21: predicate.partial_eliminate 0.93% : 0.000002s : 13: predicate.print_const_string_wrapper 0.61% : 0.000001s : 8: predicate.reduce_all_const_elim 1.16% : 0.000002s : 13: predicate.reduce_eliminate 2.41% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 8: predicate.remove_not_recompute_node 1.48% : 0.000003s : 25: predicate.replace_applicator 0.42% : 0.000001s : 8: predicate.replace_old_param 0.25% : 0.000001s : 4: predicate.reset_defer_inline 0.93% : 0.000002s : 13: predicate.reshape_eliminate 0.60% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 4: predicate.row_tensor_eliminate 0.69% : 0.000001s : 8: predicate.same_eliminate 0.45% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 8: predicate.shard_identity_eliminate 0.77% : 0.000002s : 8: predicate.special_op_eliminate 0.66% : 0.000001s : 8: predicate.specialize_transform 0.78% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.50% : 0.000003s : 21: predicate.switch_defer_inline 1.97% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.12% : 0.000011s : 67: predicate.switch_simplify 0.79% : 0.000002s : 13: predicate.tile_eliminate 0.95% : 0.000002s : 13: predicate.transpose_eliminate 1.50% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.44% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.53% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.63% : 0.000006s : 29: predicate.tuple_list_set_item_eliminator 1.89% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.28% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.02% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 4: predicate.value_based_eliminate 0.61% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.70% : 0.000001s : 8: predicate.virtual_output_eliminate 0.30% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000592 11 53.32% : 0.000316s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.68% : 0.000276s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028740 192 0.01% : 0.000004s : 1: ForceFp32Comm 13.98% : 0.004017s : 1: add_attr 13.93% : 0.004003s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.20% : 0.000058s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.25% : 0.000071s : 1: auto_monad 0.07% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.73% : 0.000497s : 1: bootstrap 0.12% : 0.000034s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.09% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.10% : 0.000027s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.80% : 0.000518s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.53% : 0.000726s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 4.39% : 0.001261s : 78: opt.transform.opt_a 0.11% : 0.000032s : 1: opt.transform.opt_after_cconv 0.09% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.35% : 0.000101s : 28: opt.transform.opt_b 0.18% : 0.000051s : 2: opt.transform.opt_trans_graph 0.13% : 0.000038s : 4: opt.transform.symbol_engine_opt 10.15% : 0.002917s : 1: opt_a 0.39% : 0.000113s : 1: opt_after_cconv 1.97% : 0.000566s : 1: opt_after_jit_grad 0.75% : 0.000215s : 1: opt_b 18.51% : 0.005319s : 1: optimize 0.08% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.16% : 0.000045s : 1: pre_auto_parallel 0.12% : 0.000034s : 1: py_interpret_to_execute 0.05% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000018s : 1: remove_dup_value 1.46% : 0.000419s : 1: renormalize.infer 1.25% : 0.000358s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.26% : 0.000075s : 1: rewriter_after_opt_a 0.33% : 0.000095s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000006s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.29% : 0.000083s : 1: symbol_engine_optimizer 0.29% : 0.000082s : 1: tuple_transform 22.70% : 0.006523s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:55.543.415 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:55.543.686 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0840954, [21] [bootstrap]: 0.00047326 [type_inference]: 0.0711594 [event_method]: 2.58e-05 [auto_monad]: 7.505e-05 [graph_reusing]: 6.63e-06 [inline]: 3.26999e-06 [add_attr]: 0.00420822, [1] [add_attr_with_inline]: 0.00419296, [1] [Cycle 1]: 0.00010401, [2] [tag_attr]: 2.433e-05 [meta_addattr_fg_expand]: 6.34001e-06 [parallel-infer-symbol]: 3.95998e-06 [pre_auto_parallel]: 4.509e-05 [insert-virtual-dataset]: 2.76999e-06 [parallel-infer-symbol-second]: 1.02e-06 [dataset_repeat_opt]: 2.06e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.00672239, [53] [py_interpret_to_execute]: 4.11e-05 [rewriter_before_opt_a]: 0.00010465 [opt_a]: 0.00370479, [2] [Cycle 1]: 0.00274578, [45] [expand_dump_flag]: 3.97e-06 [switch_simplify]: 7.002e-05 [loop_unroll]: 4.982e-05 [a_1]: 0.00075528 [with_stream_mark]: 2.651e-05 [recompute_prepare]: 1.166e-05 [updatestate_depend_eliminate]: 4.67998e-06 [updatestate_assign_eliminate]: 3.66999e-06 [updatestate_loads_eliminate]: 2.93e-06 [parameter_eliminate]: 2.66999e-06 [a_2]: 0.00012167 [accelerated_algorithm]: 7.85e-06 [shard]: 2.19999e-06 [meta_shard_fg_expand]: 2.73e-06 [shard_inline]: 7.16001e-06 [merge_send_recv]: 1.008e-05 [auto_parallel]: 9.61e-06 [parallel]: 2.089e-05 [flash_sp]: 1.058e-05 [merge_comm]: 3.96001e-06 [allreduce_fusion]: 3.41999e-06 [matmul_add_comm_reduction]: 1.139e-05 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 1.072e-05 [virtual_dataset]: 7.1e-06 [get_grad_eliminate_]: 6.68e-06 [virtual_output]: 7.23e-06 [merge_forward]: 4.87e-06 [cell_reuse_recompute_pass]: 1.56002e-06 [offload_activation]: 1.145e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.618e-05 [merge_recompute_call_nodes]: 2.16e-06 [before_grad]: 1.21e-05 [set_forward_comm_id_for_comm_node_pass]: 4.13001e-06 [meta_fg_expand]: 3.53e-06 [flash_sp_send_recv_attached]: 3.37997e-06 [receive_attached]: 2.50002e-06 [after_resolve]: 1.281e-05 [a_after_grad]: 1.139e-05 [renormalize]: 0.00090644 [add_forward_monad_depend]: 7.74002e-06 [auto_monad_grad]: 2.56e-06 [auto_monad_eliminator]: 2.088e-05 [cse]: 3.1e-05 [a_3]: 7.147e-05 [Cycle 2]: 0.00093888, [45] [expand_dump_flag]: 2.83998e-06 [switch_simplify]: 8.94003e-06 [loop_unroll]: 7.18998e-06 [a_1]: 0.00016102 [with_stream_mark]: 1.822e-05 [recompute_prepare]: 7.62998e-06 [updatestate_depend_eliminate]: 3.49001e-06 [updatestate_assign_eliminate]: 2.93e-06 [updatestate_loads_eliminate]: 2.58003e-06 [parameter_eliminate]: 1.96e-06 [a_2]: 0.00010996 [accelerated_algorithm]: 7.02002e-06 [shard]: 1.93002e-06 [meta_shard_fg_expand]: 2.98e-06 [shard_inline]: 6.68e-06 [merge_send_recv]: 9.01998e-06 [auto_parallel]: 8.62e-06 [parallel]: 7.31001e-06 [flash_sp]: 3.96001e-06 [merge_comm]: 3.6e-06 [allreduce_fusion]: 3.69002e-06 [matmul_add_comm_reduction]: 1.477e-05 [allreduce_slice_to_reducescatter]: 5.00004e-07 [virtual_shard_identity]: 9.20999e-06 [virtual_dataset]: 6.34001e-06 [get_grad_eliminate_]: 6.46e-06 [virtual_output]: 6.52001e-06 [merge_forward]: 3.75e-06 [cell_reuse_recompute_pass]: 2.38998e-06 [offload_activation]: 9.72999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.434e-05 [merge_recompute_call_nodes]: 1.15999e-06 [before_grad]: 9.98002e-06 [set_forward_comm_id_for_comm_node_pass]: 4.17e-06 [meta_fg_expand]: 2.93998e-06 [flash_sp_send_recv_attached]: 1.81998e-06 [receive_attached]: 1.82001e-06 [after_resolve]: 1.322e-05 [a_after_grad]: 1.079e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.27001e-06 [auto_monad_grad]: 2.11998e-06 [auto_monad_eliminator]: 9.79e-06 [cse]: 1.814e-05 [a_3]: 5.171e-05 [py_interpret_to_execute_after_opt_a]: 2.167e-05 [slice_cell_reuse_recomputed_activation]: 5.83002e-06 [rewriter_after_opt_a]: 4.794e-05 [convert_after_rewriter]: 1.079e-05 [order_py_execute_after_rewriter]: 9.28997e-06 [mutable_eliminate]: 0.00082216 [opt_b]: 0.00030795, [1] [Cycle 1]: 0.00029498, [7] [b_1]: 0.00017755 [b_2]: 9.55001e-06 [updatestate_depend_eliminate]: 1.059e-05 [updatestate_assign_eliminate]: 3.01999e-06 [updatestate_loads_eliminate]: 3.23998e-06 [renormalize]: 8.89995e-07 [cse]: 2.948e-05 [optimize_parallel_all_gather_comm]: 2.614e-05 [overlap_param_gather]: 5.27999e-06 [cconv]: 4.011e-05 [loop_unroll]: 0.00062158 [opt_after_cconv]: 0.00014878, [1] [Cycle 1]: 0.00013663, [7] [c_1]: 3.475e-05 [parameter_eliminate]: 5.59e-06 [updatestate_depend_eliminate]: 7.53999e-06 [updatestate_assign_eliminate]: 2.97002e-06 [updatestate_loads_eliminate]: 2.56e-06 [cse]: 2.455e-05 [renormalize]: 6.60017e-07 [remove_dup_value]: 1.897e-05 [tuple_transform]: 0.00010415, [1] [Cycle 1]: 9.617e-05, [4] [d_1]: 5.324e-05 [none_parameter_eliminate]: 2.01e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 7.98001e-06 [partial_unused_args_eliminate]: 4.52e-06 [add_recomputation]: 5.821e-05 [cse_after_recomputation]: 3.101e-05, [1] [Cycle 1]: 2.327e-05, [1] [cse]: 1.378e-05 [environ_conv]: 9.66e-06 [swap_dp_allreduce_reducescatter]: 8.08001e-06 [bias_add_comm_swap]: 5.69e-06 [label_micro_interleaved_index]: 8.99e-06 [label_fine_grained_interleaved_index]: 5.10001e-06 [merge_cast_opt]: 4.1e-06 [slice_recompute_activation]: 5.12999e-06 [micro_interleaved_order_control]: 6.17999e-06 [assign_add_opt]: 3.83999e-06 [ForceFp32Comm]: 3.36001e-06 [remove_cast_before_assign_add]: 3.4e-06 [full_micro_interleaved_order_control]: 4.53001e-06 [reorder_send_recv_between_fp_bp]: 6.11e-06 [comm_op_add_attrs]: 4e-06 [add_comm_op_reuse_tag]: 3.37002e-06 [interleave_split_concat_branches]: 3.62002e-06 [interleave_parallel_branches]: 3.45998e-06 [overlap_opt_shard_in_pipeline]: 3.55e-06 [overlap_opt_shard_grad_in_pipeline]: 4.20999e-06 [control_data_broadcast_order]: 1.795e-05 [grouped_pairwise_exchange_alltoall]: 4.27e-06 [offloading_packed_experts]: 7.63999e-06 [overlap_recompute_and_grad_model_parallel]: 8.03999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.79002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.92998e-06 [overlap_recompute_comm]: 5.44998e-06 [overlap_grad_ring_attention]: 6.79999e-06 [overlap_grad_flash_sp]: 2.749e-05 [begin_end_overlap_inline]: 3.10998e-06 [split_matmul_comm_elemetwise]: 4.62e-06 [split_layernorm_comm]: 4.13999e-06 [handle_group_info]: 3.53999e-06 [symbol_engine_optimizer]: 0.000117, [1] [Cycle 1]: 0.00010886, [6] [build]: 4.58999e-06 [elim_shapecalc]: 1.386e-05 [elim_not_effective]: 1.594e-05 [opt_reshape]: 8.74e-06 [fold_const_symbol]: 1.164e-05 [renormalize]: 7.50006e-07 [detach_backward]: 4.38999e-06 [pipeline_parallel_scheduler]: 1.82001e-06 [auto_monad_reorder]: 2.178e-05 [get_jit_bprop_graph]: 1.93997e-06 [rewriter_after_jit_bprop_graph]: 5.72001e-06 [opt_after_jit_grad]: 0.00064969 [validate]: 4.211e-05 Sums bootstrap : 0.000473s : 0.61% type_inference : 0.071159s : 91.32% event_method : 0.000026s : 0.03% auto_monad : 0.000075s : 0.10% graph_reusing : 0.000007s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000045s : 0.06% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000041s : 0.05% optimize.rewriter_before_opt_a : 0.000105s : 0.13% optimize.opt_a.expand_dump_flag : 0.000007s : 0.01% optimize.opt_a.switch_simplify : 0.000079s : 0.10% optimize.opt_a.loop_unroll : 0.000057s : 0.07% optimize.opt_a.a_1 : 0.000916s : 1.18% optimize.opt_a.with_stream_mark : 0.000045s : 0.06% optimize.opt_a.recompute_prepare : 0.000019s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000232s : 0.30% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.02% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.02% optimize.opt_a.merge_send_recv : 0.000019s : 0.02% optimize.opt_a.auto_parallel : 0.000018s : 0.02% optimize.opt_a.parallel : 0.000028s : 0.04% optimize.opt_a.flash_sp : 0.000015s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.03% optimize.opt_a.virtual_dataset : 0.000013s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.02% optimize.opt_a.virtual_output : 0.000014s : 0.02% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000026s : 0.03% optimize.opt_a.a_after_grad : 0.000022s : 0.03% optimize.opt_a.renormalize : 0.000907s : 1.16% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.04% optimize.opt_a.cse : 0.000049s : 0.06% optimize.opt_a.a_3 : 0.000123s : 0.16% optimize.py_interpret_to_execute_after_opt_a : 0.000022s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.01% optimize.rewriter_after_opt_a : 0.000048s : 0.06% optimize.convert_after_rewriter : 0.000011s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.01% optimize.mutable_eliminate : 0.000822s : 1.06% optimize.opt_b.b_1 : 0.000178s : 0.23% optimize.opt_b.b_2 : 0.000010s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000029s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.03% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000040s : 0.05% optimize.loop_unroll : 0.000622s : 0.80% optimize.opt_after_cconv.c_1 : 0.000035s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000025s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.02% optimize.tuple_transform.d_1 : 0.000053s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000058s : 0.07% optimize.cse_after_recomputation.cse : 0.000014s : 0.02% optimize.environ_conv : 0.000010s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000009s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000006s : 0.01% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000003s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000003s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000018s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000027s : 0.04% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000650s : 0.83% validate : 0.000042s : 0.05% Time group info: ------[substitution.] 0.000273 34 15.87% : 0.000043s : 6: substitution.arithmetic_simplify 0.79% : 0.000002s : 2: substitution.elim_not_effective 0.61% : 0.000002s : 2: substitution.fold_const_symbol 2.57% : 0.000007s : 4: substitution.graph_param_transform 68.19% : 0.000186s : 4: substitution.inline 1.99% : 0.000005s : 4: substitution.j_node_and_user_rematch 1.91% : 0.000005s : 4: substitution.remove_not_recompute_node 2.27% : 0.000006s : 4: substitution.replace_old_param 5.80% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.071079 2 98.58% : 0.070071s : 1: type_inference.infer 1.42% : 0.001009s : 1: type_inference.specialize ------[replace.] 0.000072 8 60.76% : 0.000044s : 4: replace.inline 39.24% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000197 8 92.92% : 0.000183s : 4: match.inline 7.08% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000260 1278 1.07% : 0.000003s : 13: predicate.accumulaten_eliminater 0.69% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 0.96% : 0.000002s : 13: predicate.addn_zero_filter 0.79% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.53% : 0.000007s : 21: predicate.arithmetic_simplify 0.91% : 0.000002s : 13: predicate.cast_eliminate 0.52% : 0.000001s : 8: predicate.check_bprop_eliminate 0.61% : 0.000002s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.55% : 0.000001s : 8: predicate.depend_value_elim 0.76% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.96% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.82% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 4: predicate.elim_not_effective 0.42% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 17: predicate.environ_get_add_eliminate 0.98% : 0.000003s : 17: predicate.environ_get_depend_swap 1.52% : 0.000004s : 25: predicate.environ_get_eliminate 0.96% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.37% : 0.000004s : 21: predicate.exchange_switch_depend_value 2.07% : 0.000005s : 21: predicate.float_depend_g_call 0.54% : 0.000001s : 8: predicate.float_environ_get_switch 0.72% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.74% : 0.000002s : 8: predicate.get_grad_eliminate 0.17% : 0.000000s : 4: predicate.graph_param_transform 0.51% : 0.000001s : 8: predicate.incorporate_call 0.39% : 0.000001s : 8: predicate.incorporate_call_switch 12.21% : 0.000032s : 58: predicate.inline 0.72% : 0.000002s : 8: predicate.inline_without_move 0.27% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.74% : 0.000002s : 8: predicate.less_batch_normalization 1.82% : 0.000005s : 25: predicate.list_to_tuple_eliminator_ 2.22% : 0.000006s : 38: predicate.load_eliminater 0.89% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.07% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.61% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 8: predicate.merge_addn 0.57% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.74% : 0.000002s : 13: predicate.minmaximum_grad 1.41% : 0.000004s : 4: predicate.mutable_eliminate 0.32% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 1.66% : 0.000004s : 21: predicate.partial_defer_inline 1.39% : 0.000004s : 21: predicate.partial_eliminate 0.81% : 0.000002s : 13: predicate.print_const_string_wrapper 0.60% : 0.000002s : 8: predicate.reduce_all_const_elim 1.16% : 0.000003s : 13: predicate.reduce_eliminate 2.22% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 8: predicate.remove_not_recompute_node 1.24% : 0.000003s : 25: predicate.replace_applicator 0.33% : 0.000001s : 8: predicate.replace_old_param 0.32% : 0.000001s : 4: predicate.reset_defer_inline 0.88% : 0.000002s : 13: predicate.reshape_eliminate 0.59% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 4: predicate.row_tensor_eliminate 0.78% : 0.000002s : 8: predicate.same_eliminate 0.34% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.15% : 0.000003s : 8: predicate.shard_identity_eliminate 0.64% : 0.000002s : 8: predicate.special_op_eliminate 0.54% : 0.000001s : 8: predicate.specialize_transform 1.05% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.99% : 0.000003s : 8: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.23% : 0.000003s : 21: predicate.switch_defer_inline 1.75% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.54% : 0.000012s : 67: predicate.switch_simplify 0.77% : 0.000002s : 13: predicate.tile_eliminate 0.82% : 0.000002s : 13: predicate.transpose_eliminate 1.65% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.81% : 0.000005s : 21: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.00% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.46% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000006s : 29: predicate.tuple_list_set_item_eliminator 1.43% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.18% : 0.000006s : 38: predicate.updatestate_pure_node_eliminater 2.74% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 4: predicate.value_based_eliminate 0.60% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.60% : 0.000002s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000720 11 44.85% : 0.000323s : 5: func_graph_cloner_run.FuncGraphClonerGraph 55.15% : 0.000397s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.097520 192 0.01% : 0.000006s : 1: ForceFp32Comm 4.33% : 0.004222s : 1: add_attr 4.30% : 0.004198s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.06% : 0.000062s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.09% : 0.000084s : 1: auto_monad 0.03% : 0.000030s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.53% : 0.000520s : 1: bootstrap 0.04% : 0.000043s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.02% : 0.000021s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.03% : 0.000034s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000023s : 1: detach_backward 0.01% : 0.000013s : 1: environ_conv 0.04% : 0.000039s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000012s : 1: label_micro_interleaved_index 0.65% : 0.000631s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000009s : 1: micro_interleaved_order_control 0.85% : 0.000834s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000024s : 1: opt.transform.mutable_eliminate 1.46% : 0.001429s : 78: opt.transform.opt_a 0.03% : 0.000033s : 1: opt.transform.opt_after_cconv 0.03% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.11% : 0.000110s : 28: opt.transform.opt_b 0.06% : 0.000059s : 2: opt.transform.opt_trans_graph 0.05% : 0.000045s : 4: opt.transform.symbol_engine_opt 3.80% : 0.003709s : 1: opt_a 0.16% : 0.000153s : 1: opt_after_cconv 0.68% : 0.000660s : 1: opt_after_jit_grad 0.32% : 0.000312s : 1: opt_b 7.25% : 0.007074s : 1: optimize 0.03% : 0.000030s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000013s : 1: order_py_execute_after_rewriter 0.03% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.05% : 0.000054s : 1: pre_auto_parallel 0.05% : 0.000045s : 1: py_interpret_to_execute 0.03% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.02% : 0.000022s : 1: remove_dup_value 0.47% : 0.000461s : 1: renormalize.infer 0.44% : 0.000433s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000052s : 1: rewriter_after_opt_a 0.11% : 0.000109s : 1: rewriter_before_opt_a 0.01% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.05% : 0.000044s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.12% : 0.000120s : 1: symbol_engine_optimizer 0.11% : 0.000107s : 1: tuple_transform 73.03% : 0.071219s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:56.457.921 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0949916, [21] [bootstrap]: 0.00041652 [type_inference]: 0.0849491 [event_method]: 2.269e-05 [auto_monad]: 6.664e-05 [graph_reusing]: 6.13002e-06 [inline]: 3.21999e-06 [add_attr]: 0.00362949, [1] [add_attr_with_inline]: 0.00361826, [1] [Cycle 1]: 6.583e-05, [2] [tag_attr]: 2.088e-05 [meta_addattr_fg_expand]: 5.59e-06 [parallel-infer-symbol]: 3.51999e-06 [pre_auto_parallel]: 3.775e-05 [insert-virtual-dataset]: 2.61e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 2.17001e-06 [pipeline_split]: 1.69e-06 [optimize]: 0.00512137, [53] [py_interpret_to_execute]: 2.813e-05 [rewriter_before_opt_a]: 8.215e-05 [opt_a]: 0.0030717, [2] [Cycle 1]: 0.0023921, [45] [expand_dump_flag]: 2.74001e-06 [switch_simplify]: 4.497e-05 [loop_unroll]: 3.032e-05 [a_1]: 0.00065638 [with_stream_mark]: 1.683e-05 [recompute_prepare]: 8.71002e-06 [updatestate_depend_eliminate]: 4.37e-06 [updatestate_assign_eliminate]: 3.41001e-06 [updatestate_loads_eliminate]: 3.23e-06 [parameter_eliminate]: 2.04e-06 [a_2]: 9.159e-05 [accelerated_algorithm]: 7.45e-06 [shard]: 1.89999e-06 [meta_shard_fg_expand]: 1.79998e-06 [shard_inline]: 6.43e-06 [merge_send_recv]: 8.18999e-06 [auto_parallel]: 6.66e-06 [parallel]: 1.963e-05 [flash_sp]: 9.15999e-06 [merge_comm]: 4.18999e-06 [allreduce_fusion]: 3.58999e-06 [matmul_add_comm_reduction]: 1.004e-05 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 9.04e-06 [virtual_dataset]: 7.76001e-06 [get_grad_eliminate_]: 6.16e-06 [virtual_output]: 6.98998e-06 [merge_forward]: 3.7e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 9.77999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.21e-05 [merge_recompute_call_nodes]: 1.68997e-06 [before_grad]: 1.017e-05 [set_forward_comm_id_for_comm_node_pass]: 3.64002e-06 [meta_fg_expand]: 2.51e-06 [flash_sp_send_recv_attached]: 3.08e-06 [receive_attached]: 1.99e-06 [after_resolve]: 1.235e-05 [a_after_grad]: 1.096e-05 [renormalize]: 0.00095972 [add_forward_monad_depend]: 6.58e-06 [auto_monad_grad]: 2.71e-06 [auto_monad_eliminator]: 1.589e-05 [cse]: 3.363e-05 [a_3]: 4.977e-05 [Cycle 2]: 0.00066883, [45] [expand_dump_flag]: 1.22e-06 [switch_simplify]: 8.15e-06 [loop_unroll]: 6.14001e-06 [a_1]: 0.00014309 [with_stream_mark]: 1.239e-05 [recompute_prepare]: 6.07001e-06 [updatestate_depend_eliminate]: 3.13e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.44001e-06 [parameter_eliminate]: 1.81e-06 [a_2]: 7.705e-05 [accelerated_algorithm]: 6.59001e-06 [shard]: 1.32999e-06 [meta_shard_fg_expand]: 1.37e-06 [shard_inline]: 6.06e-06 [merge_send_recv]: 5.03002e-06 [auto_parallel]: 5.51e-06 [parallel]: 4.90001e-06 [flash_sp]: 3.31001e-06 [merge_comm]: 3.38e-06 [allreduce_fusion]: 3.63999e-06 [matmul_add_comm_reduction]: 5.97999e-06 [allreduce_slice_to_reducescatter]: 4.30009e-07 [virtual_shard_identity]: 6.61999e-06 [virtual_dataset]: 5.91e-06 [get_grad_eliminate_]: 5.56e-06 [virtual_output]: 5.58002e-06 [merge_forward]: 2.98998e-06 [cell_reuse_recompute_pass]: 1.49998e-06 [offload_activation]: 6.83998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.107e-05 [merge_recompute_call_nodes]: 6.89994e-07 [before_grad]: 9.78002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.31001e-06 [meta_fg_expand]: 2.08002e-06 [flash_sp_send_recv_attached]: 1.01002e-06 [receive_attached]: 1.32999e-06 [after_resolve]: 1.186e-05 [a_after_grad]: 1.009e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.24e-06 [auto_monad_grad]: 1.34e-06 [auto_monad_eliminator]: 8e-06 [cse]: 1.321e-05 [a_3]: 3.59e-05 [py_interpret_to_execute_after_opt_a]: 8.64e-06 [slice_cell_reuse_recomputed_activation]: 1.86998e-06 [rewriter_after_opt_a]: 3.643e-05 [convert_after_rewriter]: 7.18998e-06 [order_py_execute_after_rewriter]: 5.33002e-06 [mutable_eliminate]: 0.00054177 [opt_b]: 0.00020467, [1] [Cycle 1]: 0.00019749, [7] [b_1]: 0.00012491 [b_2]: 8.17998e-06 [updatestate_depend_eliminate]: 5.84e-06 [updatestate_assign_eliminate]: 2.27999e-06 [updatestate_loads_eliminate]: 2.48e-06 [renormalize]: 5.59987e-07 [cse]: 1.881e-05 [optimize_parallel_all_gather_comm]: 1.741e-05 [overlap_param_gather]: 1.95001e-06 [cconv]: 2.868e-05 [loop_unroll]: 0.00043046 [opt_after_cconv]: 9.976e-05, [1] [Cycle 1]: 9.387e-05, [7] [c_1]: 3.042e-05 [parameter_eliminate]: 3.41001e-06 [updatestate_depend_eliminate]: 4.92e-06 [updatestate_assign_eliminate]: 2.43e-06 [updatestate_loads_eliminate]: 2.36998e-06 [cse]: 1.632e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 1.379e-05 [tuple_transform]: 7.72e-05, [1] [Cycle 1]: 7.276e-05, [4] [d_1]: 4.549e-05 [none_parameter_eliminate]: 1.62999e-06 [renormalize]: 1.70025e-07 [switch_simplify]: 6.81999e-06 [partial_unused_args_eliminate]: 1.69e-06 [add_recomputation]: 4.909e-05 [cse_after_recomputation]: 2.028e-05, [1] [Cycle 1]: 1.576e-05, [1] [cse]: 1.06e-05 [environ_conv]: 5.08002e-06 [swap_dp_allreduce_reducescatter]: 5.04e-06 [bias_add_comm_swap]: 2.73e-06 [label_micro_interleaved_index]: 4.35e-06 [label_fine_grained_interleaved_index]: 2.71999e-06 [merge_cast_opt]: 1.23002e-06 [slice_recompute_activation]: 2.07001e-06 [micro_interleaved_order_control]: 2.44999e-06 [assign_add_opt]: 1.23002e-06 [ForceFp32Comm]: 1.29e-06 [remove_cast_before_assign_add]: 1.10001e-06 [full_micro_interleaved_order_control]: 1.99999e-06 [reorder_send_recv_between_fp_bp]: 2.79999e-06 [comm_op_add_attrs]: 1.15999e-06 [add_comm_op_reuse_tag]: 9.40025e-07 [interleave_split_concat_branches]: 1.19e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.39e-06 [overlap_opt_shard_grad_in_pipeline]: 1.97999e-06 [control_data_broadcast_order]: 1.274e-05 [grouped_pairwise_exchange_alltoall]: 1.52001e-06 [offloading_packed_experts]: 3.3e-06 [overlap_recompute_and_grad_model_parallel]: 4.69002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.47999e-06 [overlap_recompute_comm]: 2.07001e-06 [overlap_grad_ring_attention]: 3.69002e-06 [overlap_grad_flash_sp]: 2.137e-05 [begin_end_overlap_inline]: 4.90021e-07 [split_matmul_comm_elemetwise]: 2.25002e-06 [split_layernorm_comm]: 1.62999e-06 [handle_group_info]: 8.89995e-07 [symbol_engine_optimizer]: 7.504e-05, [1] [Cycle 1]: 7.097e-05, [6] [build]: 3.48e-06 [elim_shapecalc]: 9.64999e-06 [elim_not_effective]: 1.291e-05 [opt_reshape]: 7.08998e-06 [fold_const_symbol]: 9.70002e-06 [renormalize]: 2.19996e-07 [detach_backward]: 2.29001e-06 [pipeline_parallel_scheduler]: 1.87001e-06 [auto_monad_reorder]: 1.678e-05 [get_jit_bprop_graph]: 1.96e-06 [rewriter_after_jit_bprop_graph]: 4.96002e-06 [opt_after_jit_grad]: 0.00050008 [validate]: 4.284e-05 Sums bootstrap : 0.000417s : 0.46% type_inference : 0.084949s : 94.00% event_method : 0.000023s : 0.03% auto_monad : 0.000067s : 0.07% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000038s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000028s : 0.03% optimize.rewriter_before_opt_a : 0.000082s : 0.09% optimize.opt_a.expand_dump_flag : 0.000004s : 0.00% optimize.opt_a.switch_simplify : 0.000053s : 0.06% optimize.opt_a.loop_unroll : 0.000036s : 0.04% optimize.opt_a.a_1 : 0.000799s : 0.88% optimize.opt_a.with_stream_mark : 0.000029s : 0.03% optimize.opt_a.recompute_prepare : 0.000015s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000169s : 0.19% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000012s : 0.01% optimize.opt_a.merge_send_recv : 0.000013s : 0.01% optimize.opt_a.auto_parallel : 0.000012s : 0.01% optimize.opt_a.parallel : 0.000025s : 0.03% optimize.opt_a.flash_sp : 0.000012s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.02% optimize.opt_a.virtual_dataset : 0.000014s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000017s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000003s : 0.00% optimize.opt_a.after_resolve : 0.000024s : 0.03% optimize.opt_a.a_after_grad : 0.000021s : 0.02% optimize.opt_a.renormalize : 0.000960s : 1.06% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.03% optimize.opt_a.cse : 0.000047s : 0.05% optimize.opt_a.a_3 : 0.000086s : 0.09% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000036s : 0.04% optimize.convert_after_rewriter : 0.000007s : 0.01% optimize.order_py_execute_after_rewriter : 0.000005s : 0.01% optimize.mutable_eliminate : 0.000542s : 0.60% optimize.opt_b.b_1 : 0.000125s : 0.14% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000019s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000029s : 0.03% optimize.loop_unroll : 0.000430s : 0.48% optimize.opt_after_cconv.c_1 : 0.000030s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000016s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.02% optimize.tuple_transform.d_1 : 0.000045s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000049s : 0.05% optimize.cse_after_recomputation.cse : 0.000011s : 0.01% optimize.environ_conv : 0.000005s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000021s : 0.02% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000017s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000500s : 0.55% validate : 0.000043s : 0.05% Time group info: ------[substitution.] 0.000220 34 14.60% : 0.000032s : 6: substitution.arithmetic_simplify 1.03% : 0.000002s : 2: substitution.elim_not_effective 0.61% : 0.000001s : 2: substitution.fold_const_symbol 2.60% : 0.000006s : 4: substitution.graph_param_transform 67.88% : 0.000149s : 4: substitution.inline 1.70% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.19% : 0.000005s : 4: substitution.remove_not_recompute_node 2.42% : 0.000005s : 4: substitution.replace_old_param 6.96% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.084872 2 98.91% : 0.083945s : 1: type_inference.infer 1.09% : 0.000927s : 1: type_inference.specialize ------[replace.] 0.000062 8 63.69% : 0.000039s : 4: replace.inline 36.31% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000160 8 91.63% : 0.000147s : 4: match.inline 8.37% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000203 1278 0.93% : 0.000002s : 13: predicate.accumulaten_eliminater 0.95% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 8: predicate.addn_check_dump 0.92% : 0.000002s : 13: predicate.addn_zero_filter 0.82% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.54% : 0.000005s : 21: predicate.arithmetic_simplify 0.97% : 0.000002s : 13: predicate.cast_eliminate 0.62% : 0.000001s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.65% : 0.000001s : 8: predicate.depend_value_elim 0.96% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.99% : 0.000002s : 13: predicate.dict_get_item_eliminator 1.07% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.89% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_depend_swap 1.63% : 0.000003s : 25: predicate.environ_get_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.47% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.62% : 0.000005s : 21: predicate.float_depend_g_call 0.57% : 0.000001s : 8: predicate.float_environ_get_switch 0.90% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.61% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.58% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 6.55% : 0.000013s : 58: predicate.inline 0.89% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.82% : 0.000002s : 8: predicate.less_batch_normalization 1.75% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.53% : 0.000005s : 38: predicate.load_eliminater 0.83% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.39% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.49% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.59% : 0.000001s : 8: predicate.merge_addn 0.57% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 13: predicate.minmaximum_grad 1.02% : 0.000002s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 1.76% : 0.000004s : 21: predicate.partial_defer_inline 1.68% : 0.000003s : 21: predicate.partial_eliminate 0.88% : 0.000002s : 13: predicate.print_const_string_wrapper 0.63% : 0.000001s : 8: predicate.reduce_all_const_elim 1.16% : 0.000002s : 13: predicate.reduce_eliminate 2.45% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 8: predicate.remove_not_recompute_node 1.48% : 0.000003s : 25: predicate.replace_applicator 0.43% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 0.90% : 0.000002s : 13: predicate.reshape_eliminate 0.59% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.71% : 0.000001s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.82% : 0.000002s : 8: predicate.shard_identity_eliminate 0.63% : 0.000001s : 8: predicate.special_op_eliminate 0.70% : 0.000001s : 8: predicate.specialize_transform 0.94% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.58% : 0.000003s : 21: predicate.switch_defer_inline 2.02% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.28% : 0.000011s : 67: predicate.switch_simplify 0.93% : 0.000002s : 13: predicate.tile_eliminate 0.92% : 0.000002s : 13: predicate.transpose_eliminate 1.44% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.34% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.53% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.17% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.73% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.45% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.01% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 4: predicate.value_based_eliminate 0.80% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.68% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000749 11 55.09% : 0.000413s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.91% : 0.000336s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.106075 192 0.00% : 0.000004s : 1: ForceFp32Comm 3.43% : 0.003635s : 1: add_attr 3.42% : 0.003623s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000053s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.07% : 0.000071s : 1: auto_monad 0.02% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.42% : 0.000446s : 1: bootstrap 0.03% : 0.000032s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000016s : 1: control_data_broadcast_order 0.01% : 0.000010s : 1: convert_after_rewriter 0.02% : 0.000023s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000008s : 1: environ_conv 0.03% : 0.000029s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.41% : 0.000439s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.52% : 0.000551s : 1: mutable_eliminate 0.01% : 0.000006s : 1: offloading_packed_experts 0.01% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000016s : 1: opt.transform.mutable_eliminate 1.16% : 0.001235s : 78: opt.transform.opt_a 0.03% : 0.000029s : 1: opt.transform.opt_after_cconv 0.02% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000102s : 28: opt.transform.opt_b 0.05% : 0.000050s : 2: opt.transform.opt_trans_graph 0.03% : 0.000036s : 4: opt.transform.symbol_engine_opt 2.90% : 0.003075s : 1: opt_a 0.10% : 0.000104s : 1: opt_after_cconv 0.48% : 0.000510s : 1: opt_after_jit_grad 0.20% : 0.000208s : 1: opt_b 4.83% : 0.005126s : 1: optimize 0.02% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.02% : 0.000025s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.04% : 0.000042s : 1: pre_auto_parallel 0.03% : 0.000032s : 1: py_interpret_to_execute 0.01% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000017s : 1: remove_dup_value 0.37% : 0.000397s : 1: renormalize.infer 0.52% : 0.000554s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000041s : 1: rewriter_after_opt_a 0.08% : 0.000086s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000078s : 1: symbol_engine_optimizer 0.08% : 0.000080s : 1: tuple_transform 80.11% : 0.084972s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:57.379.446 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:57.379.718 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0282245, [21] [bootstrap]: 0.00040739 [type_inference]: 0.00563857 [event_method]: 1.838e-05 [auto_monad]: 6.032e-05 [graph_reusing]: 6.38e-06 [inline]: 2.04e-06 [add_attr]: 0.00311794, [1] [add_attr_with_inline]: 0.00310913, [1] [Cycle 1]: 7.237e-05, [2] [tag_attr]: 1.887e-05 [meta_addattr_fg_expand]: 6.06e-06 [parallel-infer-symbol]: 3.45998e-06 [pre_auto_parallel]: 3.069e-05 [insert-virtual-dataset]: 2.27999e-06 [parallel-infer-symbol-second]: 8.50006e-07 [dataset_repeat_opt]: 1.81e-06 [pipeline_split]: 1.99999e-06 [optimize]: 0.0053461, [53] [py_interpret_to_execute]: 2.823e-05 [rewriter_before_opt_a]: 8.372e-05 [opt_a]: 0.00305552, [2] [Cycle 1]: 0.00219532, [45] [expand_dump_flag]: 2.98e-06 [switch_simplify]: 4.207e-05 [loop_unroll]: 3.014e-05 [a_1]: 0.00063427 [with_stream_mark]: 1.556e-05 [recompute_prepare]: 8.72998e-06 [updatestate_depend_eliminate]: 3.75e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 3.32002e-06 [parameter_eliminate]: 1.94e-06 [a_2]: 0.00011406 [accelerated_algorithm]: 7.28e-06 [shard]: 1.72001e-06 [meta_shard_fg_expand]: 1.74e-06 [shard_inline]: 6.39999e-06 [merge_send_recv]: 8.38999e-06 [auto_parallel]: 6.19001e-06 [parallel]: 1.756e-05 [flash_sp]: 7.58999e-06 [merge_comm]: 4.23001e-06 [allreduce_fusion]: 3.39001e-06 [matmul_add_comm_reduction]: 9.81e-06 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 7.79002e-06 [virtual_dataset]: 6.62002e-06 [get_grad_eliminate_]: 6.08002e-06 [virtual_output]: 6.43e-06 [merge_forward]: 4.17003e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 1.067e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.422e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.058e-05 [set_forward_comm_id_for_comm_node_pass]: 3.98001e-06 [meta_fg_expand]: 3.46999e-06 [flash_sp_send_recv_attached]: 2.86e-06 [receive_attached]: 2.16998e-06 [after_resolve]: 1.298e-05 [a_after_grad]: 1.04e-05 [renormalize]: 0.00064104 [add_forward_monad_depend]: 5.14e-06 [auto_monad_grad]: 2.51e-06 [auto_monad_eliminator]: 1.519e-05 [cse]: 2.597e-05 [a_3]: 6.171e-05 [Cycle 2]: 0.000846, [45] [expand_dump_flag]: 1.32999e-06 [switch_simplify]: 7.92e-06 [loop_unroll]: 6.27001e-06 [a_1]: 0.00014189 [with_stream_mark]: 1.151e-05 [recompute_prepare]: 6.38003e-06 [updatestate_depend_eliminate]: 2.86999e-06 [updatestate_assign_eliminate]: 2.65997e-06 [updatestate_loads_eliminate]: 2.59001e-06 [parameter_eliminate]: 1.47001e-06 [a_2]: 0.00010566 [accelerated_algorithm]: 6.22001e-06 [shard]: 1.12999e-06 [meta_shard_fg_expand]: 1.28002e-06 [shard_inline]: 6.07001e-06 [merge_send_recv]: 4.82e-06 [auto_parallel]: 5.72001e-06 [parallel]: 5.50001e-06 [flash_sp]: 3.37002e-06 [merge_comm]: 3.71999e-06 [allreduce_fusion]: 6.22001e-06 [matmul_add_comm_reduction]: 6.09999e-06 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 7.11001e-06 [virtual_dataset]: 6.01998e-06 [get_grad_eliminate_]: 5.75001e-06 [virtual_output]: 5.95002e-06 [merge_forward]: 2.60002e-06 [cell_reuse_recompute_pass]: 1.76e-06 [offload_activation]: 6.31e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.365e-05 [merge_recompute_call_nodes]: 1.07e-06 [before_grad]: 1.006e-05 [set_forward_comm_id_for_comm_node_pass]: 3.53e-06 [meta_fg_expand]: 2.31e-06 [flash_sp_send_recv_attached]: 1.43002e-06 [receive_attached]: 1.28002e-06 [after_resolve]: 1.176e-05 [a_after_grad]: 9.57999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.90001e-06 [auto_monad_grad]: 1.09e-06 [auto_monad_eliminator]: 8.65999e-06 [cse]: 1.436e-05 [a_3]: 4.899e-05 [py_interpret_to_execute_after_opt_a]: 1.447e-05 [slice_cell_reuse_recomputed_activation]: 5.10999e-06 [rewriter_after_opt_a]: 3.781e-05 [convert_after_rewriter]: 9.69999e-06 [order_py_execute_after_rewriter]: 8.12998e-06 [mutable_eliminate]: 0.00054114 [opt_b]: 0.00026691, [1] [Cycle 1]: 0.00025755, [7] [b_1]: 0.00016569 [b_2]: 8.07003e-06 [updatestate_depend_eliminate]: 5.35001e-06 [updatestate_assign_eliminate]: 2.39999e-06 [updatestate_loads_eliminate]: 2.35002e-06 [renormalize]: 4.80009e-07 [cse]: 1.745e-05 [optimize_parallel_all_gather_comm]: 1.973e-05 [overlap_param_gather]: 5.22e-06 [cconv]: 2.984e-05 [loop_unroll]: 0.00043817 [opt_after_cconv]: 0.00012848, [1] [Cycle 1]: 0.00011974, [7] [c_1]: 3.168e-05 [parameter_eliminate]: 3.20002e-06 [updatestate_depend_eliminate]: 5.81e-06 [updatestate_assign_eliminate]: 2.63998e-06 [updatestate_loads_eliminate]: 2.32999e-06 [cse]: 1.688e-05 [renormalize]: 4.89992e-07 [remove_dup_value]: 1.629e-05 [tuple_transform]: 9.098e-05, [1] [Cycle 1]: 8.385e-05, [4] [d_1]: 4.477e-05 [none_parameter_eliminate]: 1.61998e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 7.08e-06 [partial_unused_args_eliminate]: 4.58999e-06 [add_recomputation]: 4.881e-05 [cse_after_recomputation]: 2.755e-05, [1] [Cycle 1]: 2.033e-05, [1] [cse]: 1.123e-05 [environ_conv]: 8.02e-06 [swap_dp_allreduce_reducescatter]: 8.40999e-06 [bias_add_comm_swap]: 5.40999e-06 [label_micro_interleaved_index]: 7.33e-06 [label_fine_grained_interleaved_index]: 5.14998e-06 [merge_cast_opt]: 3.78001e-06 [slice_recompute_activation]: 4.31002e-06 [micro_interleaved_order_control]: 4.72998e-06 [assign_add_opt]: 3.68e-06 [ForceFp32Comm]: 3.25002e-06 [remove_cast_before_assign_add]: 3.42002e-06 [full_micro_interleaved_order_control]: 4.87998e-06 [reorder_send_recv_between_fp_bp]: 5.13002e-06 [comm_op_add_attrs]: 3.38e-06 [add_comm_op_reuse_tag]: 3.27002e-06 [interleave_split_concat_branches]: 3.48e-06 [interleave_parallel_branches]: 3.47997e-06 [overlap_opt_shard_in_pipeline]: 3.60998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.19002e-06 [control_data_broadcast_order]: 1.545e-05 [grouped_pairwise_exchange_alltoall]: 4.11001e-06 [offloading_packed_experts]: 6.78e-06 [overlap_recompute_and_grad_model_parallel]: 7.55e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.7e-06 [overlap_recompute_allgather_and_fa_grad]: 3.73999e-06 [overlap_recompute_comm]: 4.90999e-06 [overlap_grad_ring_attention]: 6.58e-06 [overlap_grad_flash_sp]: 2.244e-05 [begin_end_overlap_inline]: 2.97002e-06 [split_matmul_comm_elemetwise]: 4.4e-06 [split_layernorm_comm]: 4.17998e-06 [handle_group_info]: 3.24001e-06 [symbol_engine_optimizer]: 9.577e-05, [1] [Cycle 1]: 8.895e-05, [6] [build]: 3.35998e-06 [elim_shapecalc]: 9.31e-06 [elim_not_effective]: 1.276e-05 [opt_reshape]: 7.45e-06 [fold_const_symbol]: 9.92999e-06 [renormalize]: 2.3999e-07 [detach_backward]: 9.87999e-06 [pipeline_parallel_scheduler]: 2.16998e-06 [auto_monad_reorder]: 2.895e-05 [get_jit_bprop_graph]: 2.04e-06 [rewriter_after_jit_bprop_graph]: 8.22998e-06 [opt_after_jit_grad]: 0.00076982 [validate]: 4.417e-05 Sums bootstrap : 0.000407s : 3.64% type_inference : 0.005639s : 50.32% event_method : 0.000018s : 0.16% auto_monad : 0.000060s : 0.54% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000031s : 0.27% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000028s : 0.25% optimize.rewriter_before_opt_a : 0.000084s : 0.75% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000050s : 0.45% optimize.opt_a.loop_unroll : 0.000036s : 0.32% optimize.opt_a.a_1 : 0.000776s : 6.93% optimize.opt_a.with_stream_mark : 0.000027s : 0.24% optimize.opt_a.recompute_prepare : 0.000015s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000220s : 1.96% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.12% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.11% optimize.opt_a.merge_send_recv : 0.000013s : 0.12% optimize.opt_a.auto_parallel : 0.000012s : 0.11% optimize.opt_a.parallel : 0.000023s : 0.21% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000010s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.13% optimize.opt_a.virtual_dataset : 0.000013s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.07% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.22% optimize.opt_a.a_after_grad : 0.000020s : 0.18% optimize.opt_a.renormalize : 0.000641s : 5.72% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.21% optimize.opt_a.cse : 0.000040s : 0.36% optimize.opt_a.a_3 : 0.000111s : 0.99% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000038s : 0.34% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000541s : 4.83% optimize.opt_b.b_1 : 0.000166s : 1.48% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000017s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000030s : 0.27% optimize.loop_unroll : 0.000438s : 3.91% optimize.opt_after_cconv.c_1 : 0.000032s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.15% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.15% optimize.tuple_transform.d_1 : 0.000045s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000049s : 0.44% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000008s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000015s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000022s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000010s : 0.09% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000029s : 0.26% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000008s : 0.07% opt_after_jit_grad : 0.000770s : 6.87% validate : 0.000044s : 0.39% Time group info: ------[substitution.] 0.000205 34 15.95% : 0.000033s : 6: substitution.arithmetic_simplify 0.97% : 0.000002s : 2: substitution.elim_not_effective 0.62% : 0.000001s : 2: substitution.fold_const_symbol 2.93% : 0.000006s : 4: substitution.graph_param_transform 65.40% : 0.000134s : 4: substitution.inline 1.75% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.56% : 0.000005s : 4: substitution.remove_not_recompute_node 2.55% : 0.000005s : 4: substitution.replace_old_param 7.27% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005589 2 86.82% : 0.004852s : 1: type_inference.infer 13.18% : 0.000737s : 1: type_inference.specialize ------[replace.] 0.000058 8 63.57% : 0.000037s : 4: replace.inline 36.43% : 0.000021s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000145 8 90.96% : 0.000132s : 4: match.inline 9.04% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000204 1278 0.90% : 0.000002s : 13: predicate.accumulaten_eliminater 0.85% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 0.90% : 0.000002s : 13: predicate.addn_zero_filter 0.81% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.62% : 0.000005s : 21: predicate.arithmetic_simplify 1.00% : 0.000002s : 13: predicate.cast_eliminate 0.59% : 0.000001s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.64% : 0.000001s : 8: predicate.depend_value_elim 0.92% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.07% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.34% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.32% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_depend_swap 1.70% : 0.000003s : 25: predicate.environ_get_eliminate 1.11% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.43% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.48% : 0.000005s : 21: predicate.float_depend_g_call 0.56% : 0.000001s : 8: predicate.float_environ_get_switch 0.78% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.63% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.58% : 0.000001s : 8: predicate.incorporate_call 0.53% : 0.000001s : 8: predicate.incorporate_call_switch 6.59% : 0.000013s : 58: predicate.inline 0.90% : 0.000002s : 8: predicate.inline_without_move 0.37% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.77% : 0.000002s : 8: predicate.less_batch_normalization 1.88% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.46% : 0.000005s : 38: predicate.load_eliminater 0.90% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.33% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.62% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.60% : 0.000001s : 8: predicate.merge_addn 0.60% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 13: predicate.minmaximum_grad 0.99% : 0.000002s : 4: predicate.mutable_eliminate 0.37% : 0.000001s : 4: predicate.opt_reshape 0.31% : 0.000001s : 4: predicate.parallel_virtual_node 1.92% : 0.000004s : 21: predicate.partial_defer_inline 1.63% : 0.000003s : 21: predicate.partial_eliminate 1.08% : 0.000002s : 13: predicate.print_const_string_wrapper 0.60% : 0.000001s : 8: predicate.reduce_all_const_elim 1.13% : 0.000002s : 13: predicate.reduce_eliminate 2.37% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 8: predicate.remove_not_recompute_node 1.34% : 0.000003s : 25: predicate.replace_applicator 0.63% : 0.000001s : 8: predicate.replace_old_param 0.26% : 0.000001s : 4: predicate.reset_defer_inline 0.85% : 0.000002s : 13: predicate.reshape_eliminate 0.77% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.76% : 0.000002s : 8: predicate.same_eliminate 0.48% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 8: predicate.shard_identity_eliminate 0.86% : 0.000002s : 8: predicate.special_op_eliminate 0.70% : 0.000001s : 8: predicate.specialize_transform 0.91% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.52% : 0.000003s : 21: predicate.switch_defer_inline 2.11% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.14% : 0.000011s : 67: predicate.switch_simplify 0.85% : 0.000002s : 13: predicate.tile_eliminate 0.97% : 0.000002s : 13: predicate.transpose_eliminate 1.46% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.30% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.13% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.71% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.36% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.10% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 4: predicate.value_based_eliminate 0.65% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000501 11 54.60% : 0.000274s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.40% : 0.000228s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.038677 192 0.02% : 0.000006s : 1: ForceFp32Comm 8.08% : 0.003126s : 1: add_attr 8.05% : 0.003113s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.14% : 0.000052s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.18% : 0.000069s : 1: auto_monad 0.09% : 0.000036s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.15% : 0.000445s : 1: bootstrap 0.09% : 0.000033s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.05% : 0.000018s : 1: control_data_broadcast_order 0.03% : 0.000013s : 1: convert_after_rewriter 0.08% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.12% : 0.000048s : 1: detach_backward 0.03% : 0.000011s : 1: environ_conv 0.07% : 0.000029s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.03% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.15% : 0.000444s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 1.42% : 0.000547s : 1: mutable_eliminate 0.02% : 0.000009s : 1: offloading_packed_experts 0.04% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000015s : 1: opt.transform.mutable_eliminate 3.13% : 0.001211s : 78: opt.transform.opt_a 0.08% : 0.000030s : 1: opt.transform.opt_after_cconv 0.09% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.26% : 0.000102s : 28: opt.transform.opt_b 0.13% : 0.000050s : 2: opt.transform.opt_trans_graph 0.09% : 0.000036s : 4: opt.transform.symbol_engine_opt 7.91% : 0.003059s : 1: opt_a 0.34% : 0.000132s : 1: opt_after_cconv 2.02% : 0.000781s : 1: opt_after_jit_grad 0.70% : 0.000270s : 1: opt_b 46.02% : 0.017799s : 1: optimize 0.06% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000011s : 1: order_py_execute_after_rewriter 0.07% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000011s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.10% : 0.000038s : 1: pre_auto_parallel 0.08% : 0.000032s : 1: py_interpret_to_execute 0.05% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.05% : 0.000020s : 1: remove_dup_value 0.90% : 0.000347s : 1: renormalize.infer 0.74% : 0.000285s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000015s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000041s : 1: rewriter_after_opt_a 0.23% : 0.000088s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.03% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.26% : 0.000099s : 1: symbol_engine_optimizer 0.24% : 0.000094s : 1: tuple_transform 14.68% : 0.005676s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:58.506.923 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0155333, [21] [bootstrap]: 0.00045615 [type_inference]: 0.00624798 [event_method]: 1.873e-05 [auto_monad]: 6.444e-05 [graph_reusing]: 5.69e-06 [inline]: 1.94e-06 [add_attr]: 0.0031427, [1] [add_attr_with_inline]: 0.00313229, [1] [Cycle 1]: 6.476e-05, [2] [tag_attr]: 1.993e-05 [meta_addattr_fg_expand]: 5.81998e-06 [parallel-infer-symbol]: 3.23e-06 [pre_auto_parallel]: 3.404e-05 [insert-virtual-dataset]: 2.49001e-06 [parallel-infer-symbol-second]: 1.12e-06 [dataset_repeat_opt]: 1.86e-06 [pipeline_split]: 1.71e-06 [optimize]: 0.00484133, [53] [py_interpret_to_execute]: 2.525e-05 [rewriter_before_opt_a]: 8.114e-05 [opt_a]: 0.00271924, [2] [Cycle 1]: 0.00202624, [45] [expand_dump_flag]: 3.25e-06 [switch_simplify]: 4.39e-05 [loop_unroll]: 3.124e-05 [a_1]: 0.0006467 [with_stream_mark]: 1.643e-05 [recompute_prepare]: 8.47e-06 [updatestate_depend_eliminate]: 3.7e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 3.18998e-06 [parameter_eliminate]: 1.87001e-06 [a_2]: 8.669e-05 [accelerated_algorithm]: 7.2e-06 [shard]: 2.14999e-06 [meta_shard_fg_expand]: 1.92999e-06 [shard_inline]: 6.55002e-06 [merge_send_recv]: 8.54e-06 [auto_parallel]: 5.92001e-06 [parallel]: 1.916e-05 [flash_sp]: 7.93999e-06 [merge_comm]: 4.2e-06 [allreduce_fusion]: 3.58e-06 [matmul_add_comm_reduction]: 1.06e-05 [allreduce_slice_to_reducescatter]: 6.79982e-07 [virtual_shard_identity]: 1.466e-05 [virtual_dataset]: 6.88e-06 [get_grad_eliminate_]: 5.91e-06 [virtual_output]: 6.31998e-06 [merge_forward]: 4.4e-06 [cell_reuse_recompute_pass]: 1.15999e-06 [offload_activation]: 1.019e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.297e-05 [merge_recompute_call_nodes]: 1.55001e-06 [before_grad]: 1.041e-05 [set_forward_comm_id_for_comm_node_pass]: 3.86001e-06 [meta_fg_expand]: 2.79999e-06 [flash_sp_send_recv_attached]: 2.64001e-06 [receive_attached]: 2.13002e-06 [after_resolve]: 1.169e-05 [a_after_grad]: 1.001e-05 [renormalize]: 0.0006497 [add_forward_monad_depend]: 5.76e-06 [auto_monad_grad]: 2.41998e-06 [auto_monad_eliminator]: 1.511e-05 [cse]: 2.719e-05 [a_3]: 4.895e-05 [Cycle 2]: 0.00068301, [45] [expand_dump_flag]: 1.74998e-06 [switch_simplify]: 7.46001e-06 [loop_unroll]: 6.68e-06 [a_1]: 0.00014757 [with_stream_mark]: 1.193e-05 [recompute_prepare]: 6.41e-06 [updatestate_depend_eliminate]: 2.76e-06 [updatestate_assign_eliminate]: 2.32001e-06 [updatestate_loads_eliminate]: 2.37999e-06 [parameter_eliminate]: 1.37e-06 [a_2]: 8.037e-05 [accelerated_algorithm]: 6.12999e-06 [shard]: 1.92001e-06 [meta_shard_fg_expand]: 1.81e-06 [shard_inline]: 6.02001e-06 [merge_send_recv]: 6.01e-06 [auto_parallel]: 5.80002e-06 [parallel]: 5.59e-06 [flash_sp]: 3.28998e-06 [merge_comm]: 3.41999e-06 [allreduce_fusion]: 3.42002e-06 [matmul_add_comm_reduction]: 6.46e-06 [allreduce_slice_to_reducescatter]: 3.60014e-07 [virtual_shard_identity]: 9.71e-06 [virtual_dataset]: 5.95002e-06 [get_grad_eliminate_]: 5.99e-06 [virtual_output]: 5.66998e-06 [merge_forward]: 3.82998e-06 [cell_reuse_recompute_pass]: 1.67999e-06 [offload_activation]: 7.57998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.087e-05 [merge_recompute_call_nodes]: 9.19972e-07 [before_grad]: 9.27001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.3e-06 [meta_fg_expand]: 1.98002e-06 [flash_sp_send_recv_attached]: 1.05999e-06 [receive_attached]: 1.35001e-06 [after_resolve]: 1.037e-05 [a_after_grad]: 8.99998e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.52999e-06 [auto_monad_grad]: 1.27e-06 [auto_monad_eliminator]: 7.97e-06 [cse]: 1.442e-05 [a_3]: 3.68e-05 [py_interpret_to_execute_after_opt_a]: 9.91e-06 [slice_cell_reuse_recomputed_activation]: 1.88997e-06 [rewriter_after_opt_a]: 3.453e-05 [convert_after_rewriter]: 6.38998e-06 [order_py_execute_after_rewriter]: 5.15001e-06 [mutable_eliminate]: 0.00055711 [opt_b]: 0.00025814, [1] [Cycle 1]: 0.00025151, [7] [b_1]: 0.00013894 [b_2]: 9.05999e-06 [updatestate_depend_eliminate]: 6.61e-06 [updatestate_assign_eliminate]: 2.32001e-06 [updatestate_loads_eliminate]: 2.51998e-06 [renormalize]: 5.09986e-07 [cse]: 2.146e-05 [optimize_parallel_all_gather_comm]: 1.673e-05 [overlap_param_gather]: 1.84e-06 [cconv]: 2.822e-05 [loop_unroll]: 0.00043725 [opt_after_cconv]: 0.00010456, [1] [Cycle 1]: 9.901e-05, [7] [c_1]: 3.233e-05 [parameter_eliminate]: 3.48999e-06 [updatestate_depend_eliminate]: 5.62001e-06 [updatestate_assign_eliminate]: 2.60002e-06 [updatestate_loads_eliminate]: 2.28998e-06 [cse]: 1.689e-05 [renormalize]: 6.19999e-07 [remove_dup_value]: 1.273e-05 [tuple_transform]: 7.551e-05, [1] [Cycle 1]: 7.133e-05, [4] [d_1]: 4.375e-05 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 6.88998e-06 [partial_unused_args_eliminate]: 1.93997e-06 [add_recomputation]: 4.673e-05 [cse_after_recomputation]: 2.114e-05, [1] [Cycle 1]: 1.64e-05, [1] [cse]: 1.058e-05 [environ_conv]: 5.12999e-06 [swap_dp_allreduce_reducescatter]: 5.14e-06 [bias_add_comm_swap]: 2.56e-06 [label_micro_interleaved_index]: 4.84e-06 [label_fine_grained_interleaved_index]: 3.02002e-06 [merge_cast_opt]: 1.42999e-06 [slice_recompute_activation]: 2.06e-06 [micro_interleaved_order_control]: 2.43e-06 [assign_add_opt]: 1.49e-06 [ForceFp32Comm]: 7.59988e-07 [remove_cast_before_assign_add]: 9.90025e-07 [full_micro_interleaved_order_control]: 2.17001e-06 [reorder_send_recv_between_fp_bp]: 2.71e-06 [comm_op_add_attrs]: 1.07998e-06 [add_comm_op_reuse_tag]: 1.07e-06 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.25999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.94e-06 [control_data_broadcast_order]: 1.258e-05 [grouped_pairwise_exchange_alltoall]: 1.52999e-06 [offloading_packed_experts]: 3.8e-06 [overlap_recompute_and_grad_model_parallel]: 4.77e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.36002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.66002e-06 [overlap_recompute_comm]: 2.48e-06 [overlap_grad_ring_attention]: 4.1e-06 [overlap_grad_flash_sp]: 1.854e-05 [begin_end_overlap_inline]: 8.90024e-07 [split_matmul_comm_elemetwise]: 2.26998e-06 [split_layernorm_comm]: 1.64e-06 [handle_group_info]: 1.18001e-06 [symbol_engine_optimizer]: 7.857e-05, [1] [Cycle 1]: 7.387e-05, [6] [build]: 3.04999e-06 [elim_shapecalc]: 1.072e-05 [elim_not_effective]: 1.325e-05 [opt_reshape]: 7.6e-06 [fold_const_symbol]: 1.035e-05 [renormalize]: 2.69996e-07 [detach_backward]: 2.08998e-06 [pipeline_parallel_scheduler]: 1.62999e-06 [auto_monad_reorder]: 1.625e-05 [get_jit_bprop_graph]: 1.93997e-06 [rewriter_after_jit_bprop_graph]: 4.1e-06 [opt_after_jit_grad]: 0.00049151 [validate]: 4.13e-05 Sums bootstrap : 0.000456s : 4.00% type_inference : 0.006248s : 54.81% event_method : 0.000019s : 0.16% auto_monad : 0.000064s : 0.57% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000025s : 0.22% optimize.rewriter_before_opt_a : 0.000081s : 0.71% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000051s : 0.45% optimize.opt_a.loop_unroll : 0.000038s : 0.33% optimize.opt_a.a_1 : 0.000794s : 6.97% optimize.opt_a.with_stream_mark : 0.000028s : 0.25% optimize.opt_a.recompute_prepare : 0.000015s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000167s : 1.47% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.12% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.11% optimize.opt_a.merge_send_recv : 0.000015s : 0.13% optimize.opt_a.auto_parallel : 0.000012s : 0.10% optimize.opt_a.parallel : 0.000025s : 0.22% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.21% optimize.opt_a.virtual_dataset : 0.000013s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.10% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.21% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.17% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000022s : 0.19% optimize.opt_a.a_after_grad : 0.000019s : 0.17% optimize.opt_a.renormalize : 0.000650s : 5.70% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.20% optimize.opt_a.cse : 0.000042s : 0.37% optimize.opt_a.a_3 : 0.000086s : 0.75% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000035s : 0.30% optimize.convert_after_rewriter : 0.000006s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000557s : 4.89% optimize.opt_b.b_1 : 0.000139s : 1.22% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000028s : 0.25% optimize.loop_unroll : 0.000437s : 3.84% optimize.opt_after_cconv.c_1 : 0.000032s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.15% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000013s : 0.11% optimize.tuple_transform.d_1 : 0.000044s : 0.38% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000047s : 0.41% optimize.cse_after_recomputation.cse : 0.000011s : 0.09% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000019s : 0.16% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000016s : 0.14% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000492s : 4.31% validate : 0.000041s : 0.36% Time group info: ------[substitution.] 0.000213 34 15.05% : 0.000032s : 6: substitution.arithmetic_simplify 0.95% : 0.000002s : 2: substitution.elim_not_effective 0.65% : 0.000001s : 2: substitution.fold_const_symbol 2.84% : 0.000006s : 4: substitution.graph_param_transform 67.16% : 0.000143s : 4: substitution.inline 1.75% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.25% : 0.000005s : 4: substitution.remove_not_recompute_node 1.95% : 0.000004s : 4: substitution.replace_old_param 7.42% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006185 2 88.86% : 0.005496s : 1: type_inference.infer 11.14% : 0.000689s : 1: type_inference.specialize ------[replace.] 0.000062 8 62.23% : 0.000039s : 4: replace.inline 37.77% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000155 8 90.99% : 0.000141s : 4: match.inline 9.01% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000205 1278 0.92% : 0.000002s : 13: predicate.accumulaten_eliminater 0.72% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 0.89% : 0.000002s : 13: predicate.addn_zero_filter 0.79% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.32% : 0.000005s : 21: predicate.arithmetic_simplify 0.88% : 0.000002s : 13: predicate.cast_eliminate 0.61% : 0.000001s : 8: predicate.check_bprop_eliminate 0.51% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.58% : 0.000001s : 8: predicate.depend_value_elim 1.00% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.05% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.95% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.50% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_depend_swap 1.65% : 0.000003s : 25: predicate.environ_get_eliminate 1.23% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.40% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.55% : 0.000005s : 21: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.78% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 4: predicate.fold_const_symbol 0.62% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.62% : 0.000001s : 8: predicate.incorporate_call 0.53% : 0.000001s : 8: predicate.incorporate_call_switch 6.42% : 0.000013s : 58: predicate.inline 0.84% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.00% : 0.000002s : 8: predicate.less_batch_normalization 1.85% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.57% : 0.000005s : 38: predicate.load_eliminater 0.93% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.48% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.63% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.58% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 13: predicate.minmaximum_grad 1.01% : 0.000002s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.33% : 0.000001s : 4: predicate.parallel_virtual_node 1.82% : 0.000004s : 21: predicate.partial_defer_inline 1.66% : 0.000003s : 21: predicate.partial_eliminate 0.87% : 0.000002s : 13: predicate.print_const_string_wrapper 0.60% : 0.000001s : 8: predicate.reduce_all_const_elim 1.34% : 0.000003s : 13: predicate.reduce_eliminate 2.45% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 8: predicate.remove_not_recompute_node 1.51% : 0.000003s : 25: predicate.replace_applicator 0.54% : 0.000001s : 8: predicate.replace_old_param 0.26% : 0.000001s : 4: predicate.reset_defer_inline 0.89% : 0.000002s : 13: predicate.reshape_eliminate 0.56% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.63% : 0.000001s : 4: predicate.row_tensor_eliminate 0.69% : 0.000001s : 8: predicate.same_eliminate 0.43% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.96% : 0.000002s : 8: predicate.shard_identity_eliminate 0.92% : 0.000002s : 8: predicate.special_op_eliminate 0.68% : 0.000001s : 8: predicate.specialize_transform 0.93% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.53% : 0.000003s : 21: predicate.switch_defer_inline 2.04% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.30% : 0.000011s : 67: predicate.switch_simplify 0.91% : 0.000002s : 13: predicate.tile_eliminate 0.91% : 0.000002s : 13: predicate.transpose_eliminate 1.56% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.71% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.41% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.03% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 4: predicate.value_based_eliminate 0.61% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.35% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000541 11 55.41% : 0.000300s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.59% : 0.000241s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025559 192 0.01% : 0.000003s : 1: ForceFp32Comm 12.32% : 0.003148s : 1: add_attr 12.27% : 0.003137s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.20% : 0.000051s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000069s : 1: auto_monad 0.08% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.89% : 0.000484s : 1: bootstrap 0.12% : 0.000032s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.09% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.74% : 0.000446s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.21% : 0.000566s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 4.82% : 0.001231s : 78: opt.transform.opt_a 0.12% : 0.000031s : 1: opt.transform.opt_after_cconv 0.10% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.46% : 0.000117s : 28: opt.transform.opt_b 0.19% : 0.000049s : 2: opt.transform.opt_trans_graph 0.15% : 0.000038s : 4: opt.transform.symbol_engine_opt 10.65% : 0.002722s : 1: opt_a 0.42% : 0.000108s : 1: opt_after_cconv 1.96% : 0.000501s : 1: opt_after_jit_grad 1.03% : 0.000262s : 1: opt_b 18.96% : 0.004846s : 1: optimize 0.08% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000022s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000038s : 1: pre_auto_parallel 0.12% : 0.000029s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000016s : 1: remove_dup_value 1.33% : 0.000339s : 1: renormalize.infer 1.19% : 0.000304s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000038s : 1: rewriter_after_opt_a 0.33% : 0.000085s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000081s : 1: symbol_engine_optimizer 0.31% : 0.000078s : 1: tuple_transform 24.52% : 0.006266s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:59.499.699 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:42:59.499.965 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0174808, [21] [bootstrap]: 0.0004703 [type_inference]: 0.00598811 [event_method]: 1.855e-05 [auto_monad]: 6.774e-05 [graph_reusing]: 7.08e-06 [inline]: 2.21e-06 [add_attr]: 0.00390016, [1] [add_attr_with_inline]: 0.00388963, [1] [Cycle 1]: 8.008e-05, [2] [tag_attr]: 1.975e-05 [meta_addattr_fg_expand]: 5.76998e-06 [parallel-infer-symbol]: 3.26001e-06 [pre_auto_parallel]: 3.594e-05 [insert-virtual-dataset]: 2.31998e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.06e-06 [pipeline_split]: 1.67999e-06 [optimize]: 0.00552066, [53] [py_interpret_to_execute]: 3.03e-05 [rewriter_before_opt_a]: 8.786e-05 [opt_a]: 0.00315013, [2] [Cycle 1]: 0.00228455, [45] [expand_dump_flag]: 3.04001e-06 [switch_simplify]: 4.304e-05 [loop_unroll]: 3.037e-05 [a_1]: 0.00063637 [with_stream_mark]: 1.693e-05 [recompute_prepare]: 1.004e-05 [updatestate_depend_eliminate]: 4.20999e-06 [updatestate_assign_eliminate]: 3.58e-06 [updatestate_loads_eliminate]: 3.04999e-06 [parameter_eliminate]: 1.77001e-06 [a_2]: 0.00011704 [accelerated_algorithm]: 7.75e-06 [shard]: 2.01998e-06 [meta_shard_fg_expand]: 1.79998e-06 [shard_inline]: 6.94999e-06 [merge_send_recv]: 9.24998e-06 [auto_parallel]: 7.87e-06 [parallel]: 1.944e-05 [flash_sp]: 8.54e-06 [merge_comm]: 3.70998e-06 [allreduce_fusion]: 3.84002e-06 [matmul_add_comm_reduction]: 9.10001e-06 [allreduce_slice_to_reducescatter]: 9.50007e-07 [virtual_shard_identity]: 8.43999e-06 [virtual_dataset]: 6.94999e-06 [get_grad_eliminate_]: 6.34999e-06 [virtual_output]: 7.58001e-06 [merge_forward]: 4.68999e-06 [cell_reuse_recompute_pass]: 1.62001e-06 [offload_activation]: 1.07e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.365e-05 [merge_recompute_call_nodes]: 1.39998e-06 [before_grad]: 1.065e-05 [set_forward_comm_id_for_comm_node_pass]: 3.63999e-06 [meta_fg_expand]: 3.09999e-06 [flash_sp_send_recv_attached]: 2.47001e-06 [receive_attached]: 2.55002e-06 [after_resolve]: 1.225e-05 [a_after_grad]: 1.106e-05 [renormalize]: 0.0007196 [add_forward_monad_depend]: 6.23002e-06 [auto_monad_grad]: 2.46998e-06 [auto_monad_eliminator]: 1.507e-05 [cse]: 2.658e-05 [a_3]: 6.364e-05 [Cycle 2]: 0.00085114, [45] [expand_dump_flag]: 1.52001e-06 [switch_simplify]: 8.47e-06 [loop_unroll]: 6.53e-06 [a_1]: 0.00014106 [with_stream_mark]: 1.144e-05 [recompute_prepare]: 6.86999e-06 [updatestate_depend_eliminate]: 3.14001e-06 [updatestate_assign_eliminate]: 3.23e-06 [updatestate_loads_eliminate]: 2.59999e-06 [parameter_eliminate]: 1.22e-06 [a_2]: 0.0001077 [accelerated_algorithm]: 6.41e-06 [shard]: 1.44e-06 [meta_shard_fg_expand]: 1.67001e-06 [shard_inline]: 6.42001e-06 [merge_send_recv]: 5.36002e-06 [auto_parallel]: 6.28998e-06 [parallel]: 5.40999e-06 [flash_sp]: 3.64002e-06 [merge_comm]: 3.61999e-06 [allreduce_fusion]: 3.35e-06 [matmul_add_comm_reduction]: 7.00998e-06 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 7.46001e-06 [virtual_dataset]: 6.11e-06 [get_grad_eliminate_]: 6.12999e-06 [virtual_output]: 5.66e-06 [merge_forward]: 2.63e-06 [cell_reuse_recompute_pass]: 2.24001e-06 [offload_activation]: 7.16001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.34e-05 [merge_recompute_call_nodes]: 9.49978e-07 [before_grad]: 9.05999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.51999e-06 [meta_fg_expand]: 1.91e-06 [flash_sp_send_recv_attached]: 1.17999e-06 [receive_attached]: 1.59e-06 [after_resolve]: 1.172e-05 [a_after_grad]: 1.099e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.29e-06 [auto_monad_grad]: 9.89996e-07 [auto_monad_eliminator]: 7.51999e-06 [cse]: 1.564e-05 [a_3]: 5.027e-05 [py_interpret_to_execute_after_opt_a]: 1.365e-05 [slice_cell_reuse_recomputed_activation]: 6.17999e-06 [rewriter_after_opt_a]: 4.096e-05 [convert_after_rewriter]: 1.004e-05 [order_py_execute_after_rewriter]: 8.55999e-06 [mutable_eliminate]: 0.00059285 [opt_b]: 0.0002719, [1] [Cycle 1]: 0.00026164, [7] [b_1]: 0.00016747 [b_2]: 7.87003e-06 [updatestate_depend_eliminate]: 5.86e-06 [updatestate_assign_eliminate]: 2.84999e-06 [updatestate_loads_eliminate]: 2.41e-06 [renormalize]: 7.89994e-07 [cse]: 1.79e-05 [optimize_parallel_all_gather_comm]: 2.055e-05 [overlap_param_gather]: 5.03002e-06 [cconv]: 3.255e-05 [loop_unroll]: 0.00044123 [opt_after_cconv]: 0.00012782, [1] [Cycle 1]: 0.00011834, [7] [c_1]: 3.149e-05 [parameter_eliminate]: 3.62998e-06 [updatestate_depend_eliminate]: 5.14e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 2.27999e-06 [cse]: 1.669e-05 [renormalize]: 5.3001e-07 [remove_dup_value]: 1.796e-05 [tuple_transform]: 9.188e-05, [1] [Cycle 1]: 8.481e-05, [4] [d_1]: 4.459e-05 [none_parameter_eliminate]: 1.70001e-06 [renormalize]: 3.99974e-07 [switch_simplify]: 7.32002e-06 [partial_unused_args_eliminate]: 4.42e-06 [add_recomputation]: 4.948e-05 [cse_after_recomputation]: 2.771e-05, [1] [Cycle 1]: 2.056e-05, [1] [cse]: 1.142e-05 [environ_conv]: 8.57e-06 [swap_dp_allreduce_reducescatter]: 8.05e-06 [bias_add_comm_swap]: 5.40999e-06 [label_micro_interleaved_index]: 7.41001e-06 [label_fine_grained_interleaved_index]: 5.15001e-06 [merge_cast_opt]: 3.65998e-06 [slice_recompute_activation]: 4.43999e-06 [micro_interleaved_order_control]: 4.77e-06 [assign_add_opt]: 3.7e-06 [ForceFp32Comm]: 3.18998e-06 [remove_cast_before_assign_add]: 3.36999e-06 [full_micro_interleaved_order_control]: 5.00001e-06 [reorder_send_recv_between_fp_bp]: 5.39e-06 [comm_op_add_attrs]: 3.30998e-06 [add_comm_op_reuse_tag]: 3.58e-06 [interleave_split_concat_branches]: 3.86999e-06 [interleave_parallel_branches]: 3.35e-06 [overlap_opt_shard_in_pipeline]: 3.39001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.15999e-06 [control_data_broadcast_order]: 1.623e-05 [grouped_pairwise_exchange_alltoall]: 4.07003e-06 [offloading_packed_experts]: 6.46e-06 [overlap_recompute_and_grad_model_parallel]: 7.57998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.68e-06 [overlap_recompute_allgather_and_fa_grad]: 3.78999e-06 [overlap_recompute_comm]: 4.67e-06 [overlap_grad_ring_attention]: 6.59001e-06 [overlap_grad_flash_sp]: 2.119e-05 [begin_end_overlap_inline]: 2.95998e-06 [split_matmul_comm_elemetwise]: 4.80999e-06 [split_layernorm_comm]: 4.41002e-06 [handle_group_info]: 3.5e-06 [symbol_engine_optimizer]: 9.653e-05, [1] [Cycle 1]: 8.924e-05, [6] [build]: 2.84999e-06 [elim_shapecalc]: 9.85002e-06 [elim_not_effective]: 1.286e-05 [opt_reshape]: 7.01001e-06 [fold_const_symbol]: 1.029e-05 [renormalize]: 2.69996e-07 [detach_backward]: 4.22998e-06 [pipeline_parallel_scheduler]: 2.05002e-06 [auto_monad_reorder]: 2.129e-05 [get_jit_bprop_graph]: 1.62001e-06 [rewriter_after_jit_bprop_graph]: 5.56e-06 [opt_after_jit_grad]: 0.00079903 [validate]: 3.976e-05 Sums bootstrap : 0.000470s : 3.98% type_inference : 0.005988s : 50.66% event_method : 0.000019s : 0.16% auto_monad : 0.000068s : 0.57% graph_reusing : 0.000007s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000036s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.26% optimize.rewriter_before_opt_a : 0.000088s : 0.74% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000052s : 0.44% optimize.opt_a.loop_unroll : 0.000037s : 0.31% optimize.opt_a.a_1 : 0.000777s : 6.58% optimize.opt_a.with_stream_mark : 0.000028s : 0.24% optimize.opt_a.recompute_prepare : 0.000017s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000225s : 1.90% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.12% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.11% optimize.opt_a.merge_send_recv : 0.000015s : 0.12% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000007s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.13% optimize.opt_a.virtual_dataset : 0.000013s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000013s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.23% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.17% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000024s : 0.20% optimize.opt_a.a_after_grad : 0.000022s : 0.19% optimize.opt_a.renormalize : 0.000720s : 6.09% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.19% optimize.opt_a.cse : 0.000042s : 0.36% optimize.opt_a.a_3 : 0.000114s : 0.96% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.05% optimize.rewriter_after_opt_a : 0.000041s : 0.35% optimize.convert_after_rewriter : 0.000010s : 0.08% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000593s : 5.02% optimize.opt_b.b_1 : 0.000167s : 1.42% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000018s : 0.15% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000033s : 0.28% optimize.loop_unroll : 0.000441s : 3.73% optimize.opt_after_cconv.c_1 : 0.000031s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.14% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.15% optimize.tuple_transform.d_1 : 0.000045s : 0.38% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000049s : 0.42% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000009s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000016s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000006s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000021s : 0.18% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000021s : 0.18% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000799s : 6.76% validate : 0.000040s : 0.34% Time group info: ------[substitution.] 0.000211 34 15.86% : 0.000034s : 6: substitution.arithmetic_simplify 0.89% : 0.000002s : 2: substitution.elim_not_effective 0.64% : 0.000001s : 2: substitution.fold_const_symbol 2.75% : 0.000006s : 4: substitution.graph_param_transform 65.92% : 0.000139s : 4: substitution.inline 1.77% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.25% : 0.000005s : 4: substitution.remove_not_recompute_node 2.49% : 0.000005s : 4: substitution.replace_old_param 7.43% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005929 2 87.57% : 0.005192s : 1: type_inference.infer 12.43% : 0.000737s : 1: type_inference.specialize ------[replace.] 0.000059 8 62.36% : 0.000037s : 4: replace.inline 37.64% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000151 8 90.83% : 0.000137s : 4: match.inline 9.17% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000206 1278 0.87% : 0.000002s : 13: predicate.accumulaten_eliminater 1.09% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 8: predicate.addn_check_dump 0.85% : 0.000002s : 13: predicate.addn_zero_filter 0.82% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.45% : 0.000005s : 21: predicate.arithmetic_simplify 0.89% : 0.000002s : 13: predicate.cast_eliminate 0.59% : 0.000001s : 8: predicate.check_bprop_eliminate 0.51% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.60% : 0.000001s : 8: predicate.depend_value_elim 0.89% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.06% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.38% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_depend_swap 1.71% : 0.000004s : 25: predicate.environ_get_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.40% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.53% : 0.000005s : 21: predicate.float_depend_g_call 0.51% : 0.000001s : 8: predicate.float_environ_get_switch 0.76% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.80% : 0.000002s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.50% : 0.000001s : 8: predicate.incorporate_call_switch 6.41% : 0.000013s : 58: predicate.inline 0.82% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.78% : 0.000002s : 8: predicate.less_batch_normalization 1.73% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.50% : 0.000005s : 38: predicate.load_eliminater 0.96% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.26% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.54% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 8: predicate.merge_addn 0.56% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 13: predicate.minmaximum_grad 0.98% : 0.000002s : 4: predicate.mutable_eliminate 0.42% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 1.82% : 0.000004s : 21: predicate.partial_defer_inline 1.62% : 0.000003s : 21: predicate.partial_eliminate 0.86% : 0.000002s : 13: predicate.print_const_string_wrapper 0.60% : 0.000001s : 8: predicate.reduce_all_const_elim 1.28% : 0.000003s : 13: predicate.reduce_eliminate 2.48% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 8: predicate.remove_not_recompute_node 1.38% : 0.000003s : 25: predicate.replace_applicator 0.76% : 0.000002s : 8: predicate.replace_old_param 0.36% : 0.000001s : 4: predicate.reset_defer_inline 0.99% : 0.000002s : 13: predicate.reshape_eliminate 0.65% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.87% : 0.000002s : 8: predicate.same_eliminate 0.53% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.01% : 0.000002s : 8: predicate.shard_identity_eliminate 0.67% : 0.000001s : 8: predicate.special_op_eliminate 0.70% : 0.000001s : 8: predicate.specialize_transform 0.95% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 1.14% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.50% : 0.000003s : 21: predicate.switch_defer_inline 2.02% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.09% : 0.000010s : 67: predicate.switch_simplify 0.94% : 0.000002s : 13: predicate.tile_eliminate 1.06% : 0.000002s : 13: predicate.transpose_eliminate 1.51% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.46% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.38% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.13% : 0.000006s : 33: predicate.tuple_list_get_item_eliminator 1.52% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.71% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.38% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.14% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.82% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.67% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000560 11 55.47% : 0.000311s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.53% : 0.000249s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028980 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.49% : 0.003910s : 1: add_attr 13.44% : 0.003894s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.18% : 0.000053s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000078s : 1: auto_monad 0.10% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.78% : 0.000515s : 1: bootstrap 0.12% : 0.000036s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.11% : 0.000031s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000021s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.10% : 0.000030s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.54% : 0.000447s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.07% : 0.000600s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.05% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 4.21% : 0.001220s : 78: opt.transform.opt_a 0.10% : 0.000030s : 1: opt.transform.opt_after_cconv 0.09% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.35% : 0.000102s : 28: opt.transform.opt_b 0.17% : 0.000049s : 2: opt.transform.opt_trans_graph 0.12% : 0.000036s : 4: opt.transform.symbol_engine_opt 10.88% : 0.003154s : 1: opt_a 0.45% : 0.000131s : 1: opt_after_cconv 2.80% : 0.000810s : 1: opt_after_jit_grad 0.95% : 0.000276s : 1: opt_b 20.20% : 0.005854s : 1: optimize 0.08% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.08% : 0.000024s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000044s : 1: pre_auto_parallel 0.12% : 0.000034s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000021s : 1: remove_dup_value 1.39% : 0.000404s : 1: renormalize.infer 1.06% : 0.000308s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000045s : 1: rewriter_after_opt_a 0.32% : 0.000092s : 1: rewriter_before_opt_a 0.03% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000100s : 1: symbol_engine_optimizer 0.33% : 0.000095s : 1: tuple_transform 20.80% : 0.006028s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:00.727.489 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0924921, [21] [bootstrap]: 0.00080437 [type_inference]: 0.00566472 [event_method]: 1.855e-05 [auto_monad]: 6.264e-05 [graph_reusing]: 6.35002e-06 [inline]: 2.17001e-06 [add_attr]: 0.00331595, [1] [add_attr_with_inline]: 0.00330573, [1] [Cycle 1]: 6.946e-05, [2] [tag_attr]: 2.013e-05 [meta_addattr_fg_expand]: 5.90002e-06 [parallel-infer-symbol]: 3.45998e-06 [pre_auto_parallel]: 3.876e-05 [insert-virtual-dataset]: 2.36e-06 [parallel-infer-symbol-second]: 8.09989e-07 [dataset_repeat_opt]: 2.22001e-06 [pipeline_split]: 1.70001e-06 [optimize]: 0.0818274, [53] [py_interpret_to_execute]: 2.83e-05 [rewriter_before_opt_a]: 8.637e-05 [opt_a]: 0.0793429, [2] [Cycle 1]: 0.0785314, [45] [expand_dump_flag]: 3.08e-06 [switch_simplify]: 4.289e-05 [loop_unroll]: 3.103e-05 [a_1]: 0.00065128 [with_stream_mark]: 1.789e-05 [recompute_prepare]: 9.44e-06 [updatestate_depend_eliminate]: 3.97e-06 [updatestate_assign_eliminate]: 3.26001e-06 [updatestate_loads_eliminate]: 3.06001e-06 [parameter_eliminate]: 1.77999e-06 [a_2]: 8.971e-05 [accelerated_algorithm]: 7.13e-06 [shard]: 1.66e-06 [meta_shard_fg_expand]: 2.05002e-06 [shard_inline]: 6.77002e-06 [merge_send_recv]: 9.15001e-06 [auto_parallel]: 7.53999e-06 [parallel]: 1.949e-05 [flash_sp]: 8.28001e-06 [merge_comm]: 4.25e-06 [allreduce_fusion]: 3.48e-06 [matmul_add_comm_reduction]: 9.52001e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 8.01001e-06 [virtual_dataset]: 6.84001e-06 [get_grad_eliminate_]: 6.17999e-06 [virtual_output]: 7.01001e-06 [merge_forward]: 3.93001e-06 [cell_reuse_recompute_pass]: 1.12999e-06 [offload_activation]: 1.102e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.257e-05 [merge_recompute_call_nodes]: 1.65001e-06 [before_grad]: 1.141e-05 [set_forward_comm_id_for_comm_node_pass]: 3.5e-06 [meta_fg_expand]: 2.84999e-06 [flash_sp_send_recv_attached]: 2.46e-06 [receive_attached]: 2.78998e-06 [after_resolve]: 1.175e-05 [a_after_grad]: 9.92001e-06 [renormalize]: 0.0770585 [add_forward_monad_depend]: 1.37e-05 [auto_monad_grad]: 2.98e-06 [auto_monad_eliminator]: 2.51e-05 [cse]: 3.361e-05 [a_3]: 6.739e-05 [Cycle 2]: 0.00079878, [45] [expand_dump_flag]: 2.64999e-06 [switch_simplify]: 9.75002e-06 [loop_unroll]: 7.36999e-06 [a_1]: 0.00016762 [with_stream_mark]: 2.044e-05 [recompute_prepare]: 7.78999e-06 [updatestate_depend_eliminate]: 4.65001e-06 [updatestate_assign_eliminate]: 3.39001e-06 [updatestate_loads_eliminate]: 3.7e-06 [parameter_eliminate]: 2.12999e-06 [a_2]: 8.563e-05 [accelerated_algorithm]: 7.76001e-06 [shard]: 2.40002e-06 [meta_shard_fg_expand]: 2.79999e-06 [shard_inline]: 6.35002e-06 [merge_send_recv]: 9.24e-06 [auto_parallel]: 1.148e-05 [parallel]: 9.28002e-06 [flash_sp]: 4.15999e-06 [merge_comm]: 3.91001e-06 [allreduce_fusion]: 3.79002e-06 [matmul_add_comm_reduction]: 1.056e-05 [allreduce_slice_to_reducescatter]: 1.35001e-06 [virtual_shard_identity]: 8.14002e-06 [virtual_dataset]: 6.58e-06 [get_grad_eliminate_]: 6.64001e-06 [virtual_output]: 6.13002e-06 [merge_forward]: 4.56002e-06 [cell_reuse_recompute_pass]: 3.06001e-06 [offload_activation]: 1.228e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.275e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.102e-05 [set_forward_comm_id_for_comm_node_pass]: 4.07998e-06 [meta_fg_expand]: 2.63e-06 [flash_sp_send_recv_attached]: 2.19001e-06 [receive_attached]: 2.26e-06 [after_resolve]: 1.391e-05 [a_after_grad]: 1.182e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.38002e-06 [auto_monad_grad]: 1.63002e-06 [auto_monad_eliminator]: 1.022e-05 [cse]: 1.834e-05 [a_3]: 3.821e-05 [py_interpret_to_execute_after_opt_a]: 1.986e-05 [slice_cell_reuse_recomputed_activation]: 2.59001e-06 [rewriter_after_opt_a]: 4.469e-05 [convert_after_rewriter]: 7.6e-06 [order_py_execute_after_rewriter]: 6.13002e-06 [mutable_eliminate]: 0.00082511 [opt_b]: 0.00022115, [1] [Cycle 1]: 0.00021301, [7] [b_1]: 0.00012822 [b_2]: 8.51997e-06 [updatestate_depend_eliminate]: 8.40999e-06 [updatestate_assign_eliminate]: 2.78e-06 [updatestate_loads_eliminate]: 2.68e-06 [renormalize]: 1.12e-06 [cse]: 2.374e-05 [optimize_parallel_all_gather_comm]: 1.883e-05 [overlap_param_gather]: 1.81e-06 [cconv]: 3.267e-05 [loop_unroll]: 0.00046228 [opt_after_cconv]: 0.00010917, [1] [Cycle 1]: 0.00010282, [7] [c_1]: 3.262e-05 [parameter_eliminate]: 4.74e-06 [updatestate_depend_eliminate]: 6.36e-06 [updatestate_assign_eliminate]: 2.67001e-06 [updatestate_loads_eliminate]: 2.62001e-06 [cse]: 1.864e-05 [renormalize]: 8.89995e-07 [remove_dup_value]: 1.441e-05 [tuple_transform]: 8.212e-05, [1] [Cycle 1]: 7.742e-05, [4] [d_1]: 4.911e-05 [none_parameter_eliminate]: 1.79e-06 [renormalize]: 2.60014e-07 [switch_simplify]: 7.45e-06 [partial_unused_args_eliminate]: 1.97999e-06 [add_recomputation]: 5.245e-05 [cse_after_recomputation]: 2.219e-05, [1] [Cycle 1]: 1.697e-05, [1] [cse]: 1.09e-05 [environ_conv]: 5.42001e-06 [swap_dp_allreduce_reducescatter]: 6.58998e-06 [bias_add_comm_swap]: 2.96001e-06 [label_micro_interleaved_index]: 4.76002e-06 [label_fine_grained_interleaved_index]: 2.77002e-06 [merge_cast_opt]: 1.32999e-06 [slice_recompute_activation]: 1.99999e-06 [micro_interleaved_order_control]: 2.81e-06 [assign_add_opt]: 1.30001e-06 [ForceFp32Comm]: 1.22999e-06 [remove_cast_before_assign_add]: 1.04e-06 [full_micro_interleaved_order_control]: 2.25002e-06 [reorder_send_recv_between_fp_bp]: 2.91999e-06 [comm_op_add_attrs]: 1.01002e-06 [add_comm_op_reuse_tag]: 1.04998e-06 [interleave_split_concat_branches]: 1.12999e-06 [interleave_parallel_branches]: 1.00999e-06 [overlap_opt_shard_in_pipeline]: 1.37e-06 [overlap_opt_shard_grad_in_pipeline]: 1.866e-05 [control_data_broadcast_order]: 1.419e-05 [grouped_pairwise_exchange_alltoall]: 1.97001e-06 [offloading_packed_experts]: 4.11001e-06 [overlap_recompute_and_grad_model_parallel]: 4.84e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.26002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32e-06 [overlap_recompute_comm]: 2.36e-06 [overlap_grad_ring_attention]: 4.92999e-06 [overlap_grad_flash_sp]: 2.439e-05 [begin_end_overlap_inline]: 5.89993e-07 [split_matmul_comm_elemetwise]: 2.15002e-06 [split_layernorm_comm]: 1.95001e-06 [handle_group_info]: 1.76003e-06 [symbol_engine_optimizer]: 8.482e-05, [1] [Cycle 1]: 7.973e-05, [6] [build]: 4.60001e-06 [elim_shapecalc]: 1.199e-05 [elim_not_effective]: 1.405e-05 [opt_reshape]: 7.64002e-06 [fold_const_symbol]: 1.048e-05 [renormalize]: 2.50002e-07 [detach_backward]: 2.30002e-06 [pipeline_parallel_scheduler]: 1.54e-06 [auto_monad_reorder]: 1.922e-05 [get_jit_bprop_graph]: 2.10002e-06 [rewriter_after_jit_bprop_graph]: 5.81998e-06 [opt_after_jit_grad]: 0.00050121 [validate]: 4.605e-05 Sums bootstrap : 0.000804s : 0.91% type_inference : 0.005665s : 6.43% event_method : 0.000019s : 0.02% auto_monad : 0.000063s : 0.07% graph_reusing : 0.000006s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000039s : 0.04% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000028s : 0.03% optimize.rewriter_before_opt_a : 0.000086s : 0.10% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000053s : 0.06% optimize.opt_a.loop_unroll : 0.000038s : 0.04% optimize.opt_a.a_1 : 0.000819s : 0.93% optimize.opt_a.with_stream_mark : 0.000038s : 0.04% optimize.opt_a.recompute_prepare : 0.000017s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000175s : 0.20% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000018s : 0.02% optimize.opt_a.auto_parallel : 0.000019s : 0.02% optimize.opt_a.parallel : 0.000029s : 0.03% optimize.opt_a.flash_sp : 0.000012s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.02% optimize.opt_a.virtual_dataset : 0.000013s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000026s : 0.03% optimize.opt_a.a_after_grad : 0.000022s : 0.02% optimize.opt_a.renormalize : 0.077059s : 87.45% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.02% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.04% optimize.opt_a.cse : 0.000052s : 0.06% optimize.opt_a.a_3 : 0.000106s : 0.12% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000045s : 0.05% optimize.convert_after_rewriter : 0.000008s : 0.01% optimize.order_py_execute_after_rewriter : 0.000006s : 0.01% optimize.mutable_eliminate : 0.000825s : 0.94% optimize.opt_b.b_1 : 0.000128s : 0.15% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000033s : 0.04% optimize.loop_unroll : 0.000462s : 0.52% optimize.opt_after_cconv.c_1 : 0.000033s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000014s : 0.02% optimize.tuple_transform.d_1 : 0.000049s : 0.06% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000052s : 0.06% optimize.cse_after_recomputation.cse : 0.000011s : 0.01% optimize.environ_conv : 0.000005s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000019s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000024s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000002s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000019s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000501s : 0.57% validate : 0.000046s : 0.05% Time group info: ------[substitution.] 0.000229 34 17.79% : 0.000041s : 6: substitution.arithmetic_simplify 0.90% : 0.000002s : 2: substitution.elim_not_effective 0.77% : 0.000002s : 2: substitution.fold_const_symbol 2.84% : 0.000007s : 4: substitution.graph_param_transform 64.02% : 0.000147s : 4: substitution.inline 2.42% : 0.000006s : 4: substitution.j_node_and_user_rematch 2.22% : 0.000005s : 4: substitution.remove_not_recompute_node 2.69% : 0.000006s : 4: substitution.replace_old_param 6.35% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005604 2 88.11% : 0.004938s : 1: type_inference.infer 11.89% : 0.000666s : 1: type_inference.specialize ------[replace.] 0.000062 8 62.93% : 0.000039s : 4: replace.inline 37.07% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000157 8 91.88% : 0.000144s : 4: match.inline 8.12% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000222 1278 1.11% : 0.000002s : 13: predicate.accumulaten_eliminater 0.90% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 8: predicate.addn_check_dump 0.95% : 0.000002s : 13: predicate.addn_zero_filter 0.83% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.97% : 0.000007s : 21: predicate.arithmetic_simplify 1.00% : 0.000002s : 13: predicate.cast_eliminate 0.62% : 0.000001s : 8: predicate.check_bprop_eliminate 0.60% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 0.92% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.13% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 4: predicate.elim_not_effective 0.41% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_depend_swap 1.64% : 0.000004s : 25: predicate.environ_get_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.38% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.45% : 0.000005s : 21: predicate.float_depend_g_call 0.51% : 0.000001s : 8: predicate.float_environ_get_switch 0.83% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.80% : 0.000002s : 8: predicate.get_grad_eliminate 0.30% : 0.000001s : 4: predicate.graph_param_transform 0.54% : 0.000001s : 8: predicate.incorporate_call 0.44% : 0.000001s : 8: predicate.incorporate_call_switch 6.19% : 0.000014s : 58: predicate.inline 0.86% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.07% : 0.000002s : 8: predicate.less_batch_normalization 1.72% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.37% : 0.000005s : 38: predicate.load_eliminater 0.79% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.28% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.78% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 8: predicate.merge_addn 0.56% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.97% : 0.000002s : 13: predicate.minmaximum_grad 1.12% : 0.000002s : 4: predicate.mutable_eliminate 0.54% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 1.85% : 0.000004s : 21: predicate.partial_defer_inline 1.54% : 0.000003s : 21: predicate.partial_eliminate 0.86% : 0.000002s : 13: predicate.print_const_string_wrapper 0.58% : 0.000001s : 8: predicate.reduce_all_const_elim 1.29% : 0.000003s : 13: predicate.reduce_eliminate 2.45% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 8: predicate.remove_not_recompute_node 1.38% : 0.000003s : 25: predicate.replace_applicator 0.46% : 0.000001s : 8: predicate.replace_old_param 0.42% : 0.000001s : 4: predicate.reset_defer_inline 1.02% : 0.000002s : 13: predicate.reshape_eliminate 0.58% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 4: predicate.row_tensor_eliminate 0.98% : 0.000002s : 8: predicate.same_eliminate 0.49% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.94% : 0.000002s : 8: predicate.shard_identity_eliminate 0.80% : 0.000002s : 8: predicate.special_op_eliminate 0.63% : 0.000001s : 8: predicate.specialize_transform 0.98% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.97% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.48% : 0.000003s : 21: predicate.switch_defer_inline 1.96% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.80% : 0.000011s : 67: predicate.switch_simplify 0.85% : 0.000002s : 13: predicate.tile_eliminate 0.97% : 0.000002s : 13: predicate.transpose_eliminate 1.41% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.44% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.94% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.34% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.61% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.28% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.99% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 0.71% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.81% : 0.000002s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.58% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000610 11 47.47% : 0.000289s : 5: func_graph_cloner_run.FuncGraphClonerGraph 52.53% : 0.000320s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.256108 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.30% : 0.003321s : 1: add_attr 1.29% : 0.003310s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.02% : 0.000057s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.03% : 0.000068s : 1: auto_monad 0.01% : 0.000023s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.33% : 0.000837s : 1: bootstrap 0.01% : 0.000036s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000018s : 1: control_data_broadcast_order 0.00% : 0.000011s : 1: convert_after_rewriter 0.01% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000009s : 1: environ_conv 0.01% : 0.000025s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.18% : 0.000471s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.33% : 0.000836s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000021s : 1: opt.transform.mutable_eliminate 0.50% : 0.001279s : 78: opt.transform.opt_a 0.01% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000103s : 28: opt.transform.opt_b 0.02% : 0.000054s : 2: opt.transform.opt_trans_graph 0.02% : 0.000039s : 4: opt.transform.symbol_engine_opt 30.98% : 0.079347s : 1: opt_a 0.04% : 0.000113s : 1: opt_after_cconv 0.20% : 0.000512s : 1: opt_after_jit_grad 0.09% : 0.000225s : 1: opt_b 31.95% : 0.081833s : 1: optimize 0.01% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000009s : 1: order_py_execute_after_rewriter 0.01% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000022s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000043s : 1: pre_auto_parallel 0.01% : 0.000033s : 1: py_interpret_to_execute 0.01% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000018s : 1: remove_dup_value 29.88% : 0.076514s : 1: renormalize.infer 0.20% : 0.000524s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000050s : 1: rewriter_after_opt_a 0.04% : 0.000091s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000087s : 1: symbol_engine_optimizer 0.03% : 0.000085s : 1: tuple_transform 2.22% : 0.005684s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:01.759.411 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:01.759.688 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0757663, [21] [bootstrap]: 0.00043389 [type_inference]: 0.0077717 [event_method]: 2.453e-05 [auto_monad]: 7.265e-05 [graph_reusing]: 6.08002e-06 [inline]: 3.53e-06 [add_attr]: 0.00421936, [1] [add_attr_with_inline]: 0.00420663, [1] [Cycle 1]: 0.00010455, [2] [tag_attr]: 2.617e-05 [meta_addattr_fg_expand]: 6.33e-06 [parallel-infer-symbol]: 3.48e-06 [pre_auto_parallel]: 4.249e-05 [insert-virtual-dataset]: 2.54999e-06 [parallel-infer-symbol-second]: 9.10019e-07 [dataset_repeat_opt]: 2.36998e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.0618286, [53] [py_interpret_to_execute]: 3.472e-05 [rewriter_before_opt_a]: 0.00010028 [opt_a]: 0.0591956, [2] [Cycle 1]: 0.058221, [45] [expand_dump_flag]: 2.98e-06 [switch_simplify]: 4.457e-05 [loop_unroll]: 3.041e-05 [a_1]: 0.00071128 [with_stream_mark]: 2.873e-05 [recompute_prepare]: 1.746e-05 [updatestate_depend_eliminate]: 4.92999e-06 [updatestate_assign_eliminate]: 3.61001e-06 [updatestate_loads_eliminate]: 3.22002e-06 [parameter_eliminate]: 2.82002e-06 [a_2]: 0.00012787 [accelerated_algorithm]: 9.55001e-06 [shard]: 2.88e-06 [meta_shard_fg_expand]: 2.39001e-06 [shard_inline]: 8.08999e-06 [merge_send_recv]: 1.12e-05 [auto_parallel]: 1.018e-05 [parallel]: 2.24e-05 [flash_sp]: 1.172e-05 [merge_comm]: 5.27999e-06 [allreduce_fusion]: 3.68e-06 [matmul_add_comm_reduction]: 1.326e-05 [allreduce_slice_to_reducescatter]: 8.10018e-07 [virtual_shard_identity]: 1.213e-05 [virtual_dataset]: 7.14001e-06 [get_grad_eliminate_]: 7.43e-06 [virtual_output]: 6.89999e-06 [merge_forward]: 5.24e-06 [cell_reuse_recompute_pass]: 2.02999e-06 [offload_activation]: 1.194e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.076e-05 [merge_recompute_call_nodes]: 2.22999e-06 [before_grad]: 1.354e-05 [set_forward_comm_id_for_comm_node_pass]: 6.41e-06 [meta_fg_expand]: 3.18e-06 [flash_sp_send_recv_attached]: 3.89002e-06 [receive_attached]: 2.32001e-06 [after_resolve]: 1.604e-05 [a_after_grad]: 1.141e-05 [renormalize]: 0.056334 [add_forward_monad_depend]: 1.202e-05 [auto_monad_grad]: 2.62001e-06 [auto_monad_eliminator]: 2.577e-05 [cse]: 3.313e-05 [a_3]: 7.927e-05 [Cycle 2]: 0.00095453, [45] [expand_dump_flag]: 2.67001e-06 [switch_simplify]: 9.81998e-06 [loop_unroll]: 7.26001e-06 [a_1]: 0.00016414 [with_stream_mark]: 1.973e-05 [recompute_prepare]: 7.46999e-06 [updatestate_depend_eliminate]: 4.03999e-06 [updatestate_assign_eliminate]: 3.21999e-06 [updatestate_loads_eliminate]: 2.76e-06 [parameter_eliminate]: 2.62001e-06 [a_2]: 0.00010907 [accelerated_algorithm]: 7.76001e-06 [shard]: 3.07002e-06 [meta_shard_fg_expand]: 2.08998e-06 [shard_inline]: 6.75002e-06 [merge_send_recv]: 8.70001e-06 [auto_parallel]: 9.31002e-06 [parallel]: 8.43999e-06 [flash_sp]: 4.60999e-06 [merge_comm]: 3.71001e-06 [allreduce_fusion]: 3.78001e-06 [matmul_add_comm_reduction]: 1.024e-05 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 8.47e-06 [virtual_dataset]: 6.40997e-06 [get_grad_eliminate_]: 6.73998e-06 [virtual_output]: 6.21e-06 [merge_forward]: 4.19997e-06 [cell_reuse_recompute_pass]: 3.66999e-06 [offload_activation]: 1.349e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.558e-05 [merge_recompute_call_nodes]: 1.69e-06 [before_grad]: 1.131e-05 [set_forward_comm_id_for_comm_node_pass]: 4.59002e-06 [meta_fg_expand]: 2.82002e-06 [flash_sp_send_recv_attached]: 1.94e-06 [receive_attached]: 2.27001e-06 [after_resolve]: 1.439e-05 [a_after_grad]: 1.112e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.44999e-06 [auto_monad_grad]: 1.41002e-06 [auto_monad_eliminator]: 9.47001e-06 [cse]: 1.776e-05 [a_3]: 4.951e-05 [py_interpret_to_execute_after_opt_a]: 2.088e-05 [slice_cell_reuse_recomputed_activation]: 4.80001e-06 [rewriter_after_opt_a]: 4.39e-05 [convert_after_rewriter]: 1.034e-05 [order_py_execute_after_rewriter]: 8.18999e-06 [mutable_eliminate]: 0.00072339 [opt_b]: 0.00028018, [1] [Cycle 1]: 0.00026919, [7] [b_1]: 0.00017056 [b_2]: 8.57e-06 [updatestate_depend_eliminate]: 7.34002e-06 [updatestate_assign_eliminate]: 2.41e-06 [updatestate_loads_eliminate]: 2.49001e-06 [renormalize]: 5.39992e-07 [cse]: 2.076e-05 [optimize_parallel_all_gather_comm]: 2.08e-05 [overlap_param_gather]: 5.62001e-06 [cconv]: 3.301e-05 [loop_unroll]: 0.00047189 [opt_after_cconv]: 0.00014138, [1] [Cycle 1]: 0.00013226, [7] [c_1]: 3.457e-05 [parameter_eliminate]: 4.3e-06 [updatestate_depend_eliminate]: 6.52001e-06 [updatestate_assign_eliminate]: 2.74001e-06 [updatestate_loads_eliminate]: 2.61e-06 [cse]: 2e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 1.747e-05 [tuple_transform]: 9.916e-05, [1] [Cycle 1]: 9.157e-05, [4] [d_1]: 4.909e-05 [none_parameter_eliminate]: 1.94e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 8.02e-06 [partial_unused_args_eliminate]: 5.07e-06 [add_recomputation]: 5.441e-05 [cse_after_recomputation]: 3.001e-05, [1] [Cycle 1]: 2.246e-05, [1] [cse]: 1.223e-05 [environ_conv]: 8.65999e-06 [swap_dp_allreduce_reducescatter]: 8.80001e-06 [bias_add_comm_swap]: 5.72999e-06 [label_micro_interleaved_index]: 7.84002e-06 [label_fine_grained_interleaved_index]: 5.40001e-06 [merge_cast_opt]: 3.93001e-06 [slice_recompute_activation]: 4.53999e-06 [micro_interleaved_order_control]: 5.04998e-06 [assign_add_opt]: 3.86999e-06 [ForceFp32Comm]: 3.68e-06 [remove_cast_before_assign_add]: 3.5e-06 [full_micro_interleaved_order_control]: 4.62e-06 [reorder_send_recv_between_fp_bp]: 6.28e-06 [comm_op_add_attrs]: 3.61999e-06 [add_comm_op_reuse_tag]: 3.27002e-06 [interleave_split_concat_branches]: 3.73001e-06 [interleave_parallel_branches]: 3.73001e-06 [overlap_opt_shard_in_pipeline]: 3.84002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.70999e-06 [control_data_broadcast_order]: 1.746e-05 [grouped_pairwise_exchange_alltoall]: 4.22998e-06 [offloading_packed_experts]: 6.92002e-06 [overlap_recompute_and_grad_model_parallel]: 8.1e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.08999e-06 [overlap_recompute_allgather_and_fa_grad]: 4.07e-06 [overlap_recompute_comm]: 5.27999e-06 [overlap_grad_ring_attention]: 7.51999e-06 [overlap_grad_flash_sp]: 2.592e-05 [begin_end_overlap_inline]: 3.64002e-06 [split_matmul_comm_elemetwise]: 4.83001e-06 [split_layernorm_comm]: 3.96001e-06 [handle_group_info]: 3.48999e-06 [symbol_engine_optimizer]: 0.00010665, [1] [Cycle 1]: 9.898e-05, [6] [build]: 4.22e-06 [elim_shapecalc]: 1.285e-05 [elim_not_effective]: 1.454e-05 [opt_reshape]: 7.58001e-06 [fold_const_symbol]: 1.052e-05 [renormalize]: 3.80009e-07 [detach_backward]: 4.32e-06 [pipeline_parallel_scheduler]: 1.95001e-06 [auto_monad_reorder]: 2.229e-05 [get_jit_bprop_graph]: 1.91e-06 [rewriter_after_jit_bprop_graph]: 5.19998e-06 [opt_after_jit_grad]: 0.00060539 [validate]: 4.413e-05 Sums bootstrap : 0.000434s : 0.62% type_inference : 0.007772s : 11.18% event_method : 0.000025s : 0.04% auto_monad : 0.000073s : 0.10% graph_reusing : 0.000006s : 0.01% inline : 0.000004s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000042s : 0.06% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000035s : 0.05% optimize.rewriter_before_opt_a : 0.000100s : 0.14% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000054s : 0.08% optimize.opt_a.loop_unroll : 0.000038s : 0.05% optimize.opt_a.a_1 : 0.000875s : 1.26% optimize.opt_a.with_stream_mark : 0.000048s : 0.07% optimize.opt_a.recompute_prepare : 0.000025s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000237s : 0.34% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.02% optimize.opt_a.shard : 0.000006s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000015s : 0.02% optimize.opt_a.merge_send_recv : 0.000020s : 0.03% optimize.opt_a.auto_parallel : 0.000019s : 0.03% optimize.opt_a.parallel : 0.000031s : 0.04% optimize.opt_a.flash_sp : 0.000016s : 0.02% optimize.opt_a.merge_comm : 0.000009s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.03% optimize.opt_a.virtual_dataset : 0.000014s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.02% optimize.opt_a.virtual_output : 0.000013s : 0.02% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.01% optimize.opt_a.offload_activation : 0.000025s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.05% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000030s : 0.04% optimize.opt_a.a_after_grad : 0.000023s : 0.03% optimize.opt_a.renormalize : 0.056334s : 81.02% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.05% optimize.opt_a.cse : 0.000051s : 0.07% optimize.opt_a.a_3 : 0.000129s : 0.19% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000044s : 0.06% optimize.convert_after_rewriter : 0.000010s : 0.01% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000723s : 1.04% optimize.opt_b.b_1 : 0.000171s : 0.25% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.03% optimize.overlap_param_gather : 0.000006s : 0.01% optimize.cconv : 0.000033s : 0.05% optimize.loop_unroll : 0.000472s : 0.68% optimize.opt_after_cconv.c_1 : 0.000035s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.03% optimize.tuple_transform.d_1 : 0.000049s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000054s : 0.08% optimize.cse_after_recomputation.cse : 0.000012s : 0.02% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000008s : 0.01% optimize.overlap_grad_flash_sp : 0.000026s : 0.04% optimize.begin_end_overlap_inline : 0.000004s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000605s : 0.87% validate : 0.000044s : 0.06% Time group info: ------[substitution.] 0.000271 34 15.83% : 0.000043s : 6: substitution.arithmetic_simplify 0.90% : 0.000002s : 2: substitution.elim_not_effective 0.48% : 0.000001s : 2: substitution.fold_const_symbol 2.48% : 0.000007s : 4: substitution.graph_param_transform 66.96% : 0.000181s : 4: substitution.inline 2.11% : 0.000006s : 4: substitution.j_node_and_user_rematch 2.42% : 0.000007s : 4: substitution.remove_not_recompute_node 2.68% : 0.000007s : 4: substitution.replace_old_param 6.14% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007697 2 86.61% : 0.006666s : 1: type_inference.infer 13.39% : 0.001031s : 1: type_inference.specialize ------[replace.] 0.000067 8 62.36% : 0.000042s : 4: replace.inline 37.64% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000193 8 92.33% : 0.000178s : 4: match.inline 7.67% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000226 1278 0.96% : 0.000002s : 13: predicate.accumulaten_eliminater 0.80% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.59% : 0.000001s : 8: predicate.addn_check_dump 0.90% : 0.000002s : 13: predicate.addn_zero_filter 0.77% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.92% : 0.000007s : 21: predicate.arithmetic_simplify 1.17% : 0.000003s : 13: predicate.cast_eliminate 0.60% : 0.000001s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.70% : 0.000002s : 8: predicate.depend_value_elim 1.17% : 0.000003s : 13: predicate.dict_get_item_const_eliminator 1.05% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.06% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.48% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.00% : 0.000002s : 17: predicate.environ_get_depend_swap 1.80% : 0.000004s : 25: predicate.environ_get_eliminate 1.14% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.46% : 0.000006s : 21: predicate.float_depend_g_call 0.57% : 0.000001s : 8: predicate.float_environ_get_switch 0.75% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.68% : 0.000002s : 8: predicate.get_grad_eliminate 0.23% : 0.000001s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.45% : 0.000015s : 58: predicate.inline 1.14% : 0.000003s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 8: predicate.less_batch_normalization 1.84% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.24% : 0.000005s : 38: predicate.load_eliminater 0.79% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.37% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.55% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.61% : 0.000001s : 8: predicate.merge_addn 0.56% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.63% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.74% : 0.000002s : 13: predicate.minmaximum_grad 1.01% : 0.000002s : 4: predicate.mutable_eliminate 0.34% : 0.000001s : 4: predicate.opt_reshape 0.30% : 0.000001s : 4: predicate.parallel_virtual_node 1.62% : 0.000004s : 21: predicate.partial_defer_inline 1.49% : 0.000003s : 21: predicate.partial_eliminate 0.85% : 0.000002s : 13: predicate.print_const_string_wrapper 0.65% : 0.000001s : 8: predicate.reduce_all_const_elim 1.28% : 0.000003s : 13: predicate.reduce_eliminate 2.32% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000001s : 8: predicate.remove_not_recompute_node 1.37% : 0.000003s : 25: predicate.replace_applicator 0.91% : 0.000002s : 8: predicate.replace_old_param 0.25% : 0.000001s : 4: predicate.reset_defer_inline 1.02% : 0.000002s : 13: predicate.reshape_eliminate 0.70% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 4: predicate.row_tensor_eliminate 0.82% : 0.000002s : 8: predicate.same_eliminate 0.65% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.25% : 0.000003s : 8: predicate.shard_identity_eliminate 0.69% : 0.000002s : 8: predicate.special_op_eliminate 0.63% : 0.000001s : 8: predicate.specialize_transform 1.03% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.43% : 0.000003s : 21: predicate.switch_defer_inline 1.94% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.77% : 0.000011s : 67: predicate.switch_simplify 0.86% : 0.000002s : 13: predicate.tile_eliminate 0.87% : 0.000002s : 13: predicate.transpose_eliminate 1.36% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.50% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.33% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.28% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.45% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.33% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.70% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.23% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.88% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.70% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.77% : 0.000002s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000754 11 44.88% : 0.000338s : 5: func_graph_cloner_run.FuncGraphClonerGraph 55.12% : 0.000416s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.199657 192 0.00% : 0.000006s : 1: ForceFp32Comm 2.12% : 0.004233s : 1: add_attr 2.11% : 0.004210s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.03% : 0.000058s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.04% : 0.000085s : 1: auto_monad 0.02% : 0.000030s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.24% : 0.000474s : 1: bootstrap 0.02% : 0.000036s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000021s : 1: control_data_broadcast_order 0.01% : 0.000013s : 1: convert_after_rewriter 0.02% : 0.000033s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000023s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.02% : 0.000037s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 0.24% : 0.000479s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.37% : 0.000730s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000019s : 1: opt.transform.mutable_eliminate 0.69% : 0.001373s : 78: opt.transform.opt_a 0.02% : 0.000033s : 1: opt.transform.opt_after_cconv 0.01% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.05% : 0.000105s : 28: opt.transform.opt_b 0.03% : 0.000055s : 2: opt.transform.opt_trans_graph 0.02% : 0.000041s : 4: opt.transform.symbol_engine_opt 29.65% : 0.059199s : 1: opt_a 0.07% : 0.000145s : 1: opt_after_cconv 0.31% : 0.000617s : 1: opt_after_jit_grad 0.14% : 0.000284s : 1: opt_b 31.16% : 0.062220s : 1: optimize 0.01% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000011s : 1: order_py_execute_after_rewriter 0.01% : 0.000030s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000009s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.03% : 0.000051s : 1: pre_auto_parallel 0.02% : 0.000040s : 1: py_interpret_to_execute 0.01% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000021s : 1: remove_dup_value 27.95% : 0.055796s : 1: renormalize.infer 0.26% : 0.000518s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000048s : 1: rewriter_after_opt_a 0.05% : 0.000105s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000110s : 1: symbol_engine_optimizer 0.05% : 0.000102s : 1: tuple_transform 3.92% : 0.007826s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:02.678.553 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0661576, [21] [bootstrap]: 0.00088335 [type_inference]: 0.00605367 [event_method]: 1.943e-05 [auto_monad]: 6.902e-05 [graph_reusing]: 6.86001e-06 [inline]: 2.53003e-06 [add_attr]: 0.00340117, [1] [add_attr_with_inline]: 0.00338989, [1] [Cycle 1]: 7.147e-05, [2] [tag_attr]: 2.2e-05 [meta_addattr_fg_expand]: 5.40001e-06 [parallel-infer-symbol]: 3.66001e-06 [pre_auto_parallel]: 4.026e-05 [insert-virtual-dataset]: 2.24001e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 1.67999e-06 [pipeline_split]: 1.72999e-06 [optimize]: 0.0548643, [53] [py_interpret_to_execute]: 3e-05 [rewriter_before_opt_a]: 9.068e-05 [opt_a]: 0.0523198, [2] [Cycle 1]: 0.0515236, [45] [expand_dump_flag]: 3.5e-06 [switch_simplify]: 4.454e-05 [loop_unroll]: 3.058e-05 [a_1]: 0.00066237 [with_stream_mark]: 1.971e-05 [recompute_prepare]: 9.46e-06 [updatestate_depend_eliminate]: 4.28001e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 2.99001e-06 [parameter_eliminate]: 1.74998e-06 [a_2]: 8.692e-05 [accelerated_algorithm]: 7.95e-06 [shard]: 1.87999e-06 [meta_shard_fg_expand]: 1.87001e-06 [shard_inline]: 6.84999e-06 [merge_send_recv]: 8.55001e-06 [auto_parallel]: 6.74999e-06 [parallel]: 2.011e-05 [flash_sp]: 8.02003e-06 [merge_comm]: 4.1e-06 [allreduce_fusion]: 3.44001e-06 [matmul_add_comm_reduction]: 1.099e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 9.29e-06 [virtual_dataset]: 6.58003e-06 [get_grad_eliminate_]: 6.33e-06 [virtual_output]: 6.81999e-06 [merge_forward]: 5.01002e-06 [cell_reuse_recompute_pass]: 1.54e-06 [offload_activation]: 1.021e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.323e-05 [merge_recompute_call_nodes]: 1.42999e-06 [before_grad]: 1.139e-05 [set_forward_comm_id_for_comm_node_pass]: 3.8e-06 [meta_fg_expand]: 3.32002e-06 [flash_sp_send_recv_attached]: 3.10998e-06 [receive_attached]: 2.59001e-06 [after_resolve]: 1.198e-05 [a_after_grad]: 1.055e-05 [renormalize]: 0.0500387 [add_forward_monad_depend]: 1.123e-05 [auto_monad_grad]: 3.29001e-06 [auto_monad_eliminator]: 2.542e-05 [cse]: 3.406e-05 [a_3]: 6.778e-05 [Cycle 2]: 0.00078156, [45] [expand_dump_flag]: 2.59001e-06 [switch_simplify]: 9.79e-06 [loop_unroll]: 7.46001e-06 [a_1]: 0.00017095 [with_stream_mark]: 2.078e-05 [recompute_prepare]: 7.80998e-06 [updatestate_depend_eliminate]: 4.45e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 3.16001e-06 [parameter_eliminate]: 2.00002e-06 [a_2]: 8.08e-05 [accelerated_algorithm]: 6.88e-06 [shard]: 2.46e-06 [meta_shard_fg_expand]: 2.48e-06 [shard_inline]: 7.39002e-06 [merge_send_recv]: 8.84e-06 [auto_parallel]: 9.54e-06 [parallel]: 9.32999e-06 [flash_sp]: 4.27998e-06 [merge_comm]: 3.71001e-06 [allreduce_fusion]: 3.66001e-06 [matmul_add_comm_reduction]: 9.49e-06 [allreduce_slice_to_reducescatter]: 8.50006e-07 [virtual_shard_identity]: 7.38999e-06 [virtual_dataset]: 6.45002e-06 [get_grad_eliminate_]: 6.14999e-06 [virtual_output]: 6.57002e-06 [merge_forward]: 4.07e-06 [cell_reuse_recompute_pass]: 3.16999e-06 [offload_activation]: 1.098e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.272e-05 [merge_recompute_call_nodes]: 1.89999e-06 [before_grad]: 1.009e-05 [set_forward_comm_id_for_comm_node_pass]: 3.91999e-06 [meta_fg_expand]: 2.59999e-06 [flash_sp_send_recv_attached]: 1.59998e-06 [receive_attached]: 2.48e-06 [after_resolve]: 1.47e-05 [a_after_grad]: 1.019e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.72999e-06 [auto_monad_grad]: 1.12e-06 [auto_monad_eliminator]: 9.47999e-06 [cse]: 1.671e-05 [a_3]: 3.79e-05 [py_interpret_to_execute_after_opt_a]: 1.709e-05 [slice_cell_reuse_recomputed_activation]: 2.12001e-06 [rewriter_after_opt_a]: 4.074e-05 [convert_after_rewriter]: 7.39002e-06 [order_py_execute_after_rewriter]: 5.12e-06 [mutable_eliminate]: 0.00078547 [opt_b]: 0.000221, [1] [Cycle 1]: 0.00021293, [7] [b_1]: 0.00012938 [b_2]: 8.61002e-06 [updatestate_depend_eliminate]: 8.50001e-06 [updatestate_assign_eliminate]: 2.56998e-06 [updatestate_loads_eliminate]: 2.86e-06 [renormalize]: 9.20001e-07 [cse]: 2.401e-05 [optimize_parallel_all_gather_comm]: 2.025e-05 [overlap_param_gather]: 1.77999e-06 [cconv]: 3.525e-05 [loop_unroll]: 0.00054103 [opt_after_cconv]: 0.00011575, [1] [Cycle 1]: 0.0001092, [7] [c_1]: 3.335e-05 [parameter_eliminate]: 5.79e-06 [updatestate_depend_eliminate]: 7.18998e-06 [updatestate_assign_eliminate]: 2.54001e-06 [updatestate_loads_eliminate]: 2.81e-06 [cse]: 2.297e-05 [renormalize]: 5.10016e-07 [remove_dup_value]: 1.456e-05 [tuple_transform]: 8.289e-05, [1] [Cycle 1]: 7.804e-05, [4] [d_1]: 4.902e-05 [none_parameter_eliminate]: 1.71998e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 7.26001e-06 [partial_unused_args_eliminate]: 2.14999e-06 [add_recomputation]: 5.512e-05 [cse_after_recomputation]: 2.337e-05, [1] [Cycle 1]: 1.774e-05, [1] [cse]: 1.229e-05 [environ_conv]: 5.97001e-06 [swap_dp_allreduce_reducescatter]: 5.27999e-06 [bias_add_comm_swap]: 3.31999e-06 [label_micro_interleaved_index]: 4.88001e-06 [label_fine_grained_interleaved_index]: 2.73998e-06 [merge_cast_opt]: 1.55999e-06 [slice_recompute_activation]: 2.35002e-06 [micro_interleaved_order_control]: 2.21e-06 [assign_add_opt]: 1.22e-06 [ForceFp32Comm]: 8.59989e-07 [remove_cast_before_assign_add]: 1.54e-06 [full_micro_interleaved_order_control]: 2.02999e-06 [reorder_send_recv_between_fp_bp]: 2.73998e-06 [comm_op_add_attrs]: 1.37999e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.35999e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.15999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.92999e-06 [control_data_broadcast_order]: 1.296e-05 [grouped_pairwise_exchange_alltoall]: 1.59e-06 [offloading_packed_experts]: 4.18999e-06 [overlap_recompute_and_grad_model_parallel]: 4.63001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.37e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34998e-06 [overlap_recompute_comm]: 2.88998e-06 [overlap_grad_ring_attention]: 4.15e-06 [overlap_grad_flash_sp]: 2.321e-05 [begin_end_overlap_inline]: 5.59987e-07 [split_matmul_comm_elemetwise]: 2.50002e-06 [split_layernorm_comm]: 1.64e-06 [handle_group_info]: 9.10019e-07 [symbol_engine_optimizer]: 8.391e-05, [1] [Cycle 1]: 7.846e-05, [6] [build]: 3.86001e-06 [elim_shapecalc]: 1.14e-05 [elim_not_effective]: 1.402e-05 [opt_reshape]: 7.4e-06 [fold_const_symbol]: 1.13e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.59999e-06 [pipeline_parallel_scheduler]: 1.59998e-06 [auto_monad_reorder]: 1.662e-05 [get_jit_bprop_graph]: 2.06998e-06 [rewriter_after_jit_bprop_graph]: 5.97999e-06 [opt_after_jit_grad]: 0.00055341 [validate]: 4.658e-05 Sums bootstrap : 0.000883s : 1.43% type_inference : 0.006054s : 9.82% event_method : 0.000019s : 0.03% auto_monad : 0.000069s : 0.11% graph_reusing : 0.000007s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000005s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000040s : 0.07% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000030s : 0.05% optimize.rewriter_before_opt_a : 0.000091s : 0.15% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000054s : 0.09% optimize.opt_a.loop_unroll : 0.000038s : 0.06% optimize.opt_a.a_1 : 0.000833s : 1.35% optimize.opt_a.with_stream_mark : 0.000040s : 0.07% optimize.opt_a.recompute_prepare : 0.000017s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000168s : 0.27% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.02% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.02% optimize.opt_a.merge_send_recv : 0.000017s : 0.03% optimize.opt_a.auto_parallel : 0.000016s : 0.03% optimize.opt_a.parallel : 0.000029s : 0.05% optimize.opt_a.flash_sp : 0.000012s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.03% optimize.opt_a.virtual_dataset : 0.000013s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.02% optimize.opt_a.virtual_output : 0.000013s : 0.02% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000021s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000027s : 0.04% optimize.opt_a.a_after_grad : 0.000021s : 0.03% optimize.opt_a.renormalize : 0.050039s : 81.15% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.06% optimize.opt_a.cse : 0.000051s : 0.08% optimize.opt_a.a_3 : 0.000106s : 0.17% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000041s : 0.07% optimize.convert_after_rewriter : 0.000007s : 0.01% optimize.order_py_execute_after_rewriter : 0.000005s : 0.01% optimize.mutable_eliminate : 0.000785s : 1.27% optimize.opt_b.b_1 : 0.000129s : 0.21% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.03% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000035s : 0.06% optimize.loop_unroll : 0.000541s : 0.88% optimize.opt_after_cconv.c_1 : 0.000033s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000023s : 0.04% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.02% optimize.tuple_transform.d_1 : 0.000049s : 0.08% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000055s : 0.09% optimize.cse_after_recomputation.cse : 0.000012s : 0.02% optimize.environ_conv : 0.000006s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000002s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000023s : 0.04% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000017s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000553s : 0.90% validate : 0.000047s : 0.08% Time group info: ------[substitution.] 0.000243 34 17.37% : 0.000042s : 6: substitution.arithmetic_simplify 0.88% : 0.000002s : 2: substitution.elim_not_effective 0.54% : 0.000001s : 2: substitution.fold_const_symbol 2.86% : 0.000007s : 4: substitution.graph_param_transform 64.65% : 0.000157s : 4: substitution.inline 2.19% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.28% : 0.000006s : 4: substitution.remove_not_recompute_node 2.68% : 0.000007s : 4: substitution.replace_old_param 6.55% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005982 2 87.71% : 0.005247s : 1: type_inference.infer 12.29% : 0.000735s : 1: type_inference.specialize ------[replace.] 0.000063 8 64.28% : 0.000041s : 4: replace.inline 35.72% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000168 8 91.65% : 0.000154s : 4: match.inline 8.35% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000225 1278 1.03% : 0.000002s : 13: predicate.accumulaten_eliminater 0.98% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 8: predicate.addn_check_dump 0.89% : 0.000002s : 13: predicate.addn_zero_filter 0.82% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.34% : 0.000005s : 21: predicate.arithmetic_simplify 0.97% : 0.000002s : 13: predicate.cast_eliminate 0.55% : 0.000001s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.63% : 0.000001s : 8: predicate.depend_value_elim 1.06% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.00% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.97% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.36% : 0.000001s : 4: predicate.elim_not_effective 0.50% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_depend_swap 1.65% : 0.000004s : 25: predicate.environ_get_eliminate 0.98% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.36% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.46% : 0.000006s : 21: predicate.float_depend_g_call 0.52% : 0.000001s : 8: predicate.float_environ_get_switch 0.76% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.73% : 0.000002s : 8: predicate.get_grad_eliminate 0.28% : 0.000001s : 4: predicate.graph_param_transform 0.51% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 6.29% : 0.000014s : 58: predicate.inline 0.81% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.91% : 0.000002s : 8: predicate.less_batch_normalization 1.71% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.36% : 0.000005s : 38: predicate.load_eliminater 1.34% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.29% : 0.000005s : 34: predicate.loop_unroll_before_grad 2.14% : 0.000005s : 21: predicate.make_slice_get_slice_eliminator 0.45% : 0.000001s : 8: predicate.merge_addn 0.62% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.76% : 0.000002s : 13: predicate.minmaximum_grad 1.60% : 0.000004s : 4: predicate.mutable_eliminate 0.37% : 0.000001s : 4: predicate.opt_reshape 0.39% : 0.000001s : 4: predicate.parallel_virtual_node 1.73% : 0.000004s : 21: predicate.partial_defer_inline 1.51% : 0.000003s : 21: predicate.partial_eliminate 0.91% : 0.000002s : 13: predicate.print_const_string_wrapper 0.54% : 0.000001s : 8: predicate.reduce_all_const_elim 1.09% : 0.000002s : 13: predicate.reduce_eliminate 2.30% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 8: predicate.remove_not_recompute_node 1.55% : 0.000003s : 25: predicate.replace_applicator 0.54% : 0.000001s : 8: predicate.replace_old_param 0.34% : 0.000001s : 4: predicate.reset_defer_inline 1.21% : 0.000003s : 13: predicate.reshape_eliminate 0.67% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 4: predicate.row_tensor_eliminate 0.84% : 0.000002s : 8: predicate.same_eliminate 0.41% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.77% : 0.000002s : 8: predicate.shard_identity_eliminate 0.69% : 0.000002s : 8: predicate.special_op_eliminate 0.65% : 0.000001s : 8: predicate.specialize_transform 1.11% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.83% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.49% : 0.000003s : 21: predicate.switch_defer_inline 2.04% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.87% : 0.000011s : 67: predicate.switch_simplify 0.86% : 0.000002s : 13: predicate.tile_eliminate 0.96% : 0.000002s : 13: predicate.transpose_eliminate 1.55% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.38% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.38% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.45% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.55% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.42% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.81% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.52% : 0.000001s : 4: predicate.value_based_eliminate 0.71% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.80% : 0.000002s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000656 11 48.19% : 0.000316s : 5: func_graph_cloner_run.FuncGraphClonerGraph 51.81% : 0.000340s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.175889 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.94% : 0.003407s : 1: add_attr 1.93% : 0.003394s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.03% : 0.000059s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000074s : 1: auto_monad 0.01% : 0.000020s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.53% : 0.000930s : 1: bootstrap 0.04% : 0.000063s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000016s : 1: control_data_broadcast_order 0.01% : 0.000011s : 1: convert_after_rewriter 0.02% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.01% : 0.000026s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.31% : 0.000551s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.45% : 0.000799s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000021s : 1: opt.transform.mutable_eliminate 0.73% : 0.001290s : 78: opt.transform.opt_a 0.02% : 0.000032s : 1: opt.transform.opt_after_cconv 0.02% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000103s : 28: opt.transform.opt_b 0.03% : 0.000054s : 2: opt.transform.opt_trans_graph 0.02% : 0.000040s : 4: opt.transform.symbol_engine_opt 29.75% : 0.052323s : 1: opt_a 0.07% : 0.000119s : 1: opt_after_cconv 0.32% : 0.000564s : 1: opt_after_jit_grad 0.13% : 0.000224s : 1: opt_b 31.20% : 0.054870s : 1: optimize 0.01% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.02% : 0.000027s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.03% : 0.000044s : 1: pre_auto_parallel 0.02% : 0.000034s : 1: py_interpret_to_execute 0.01% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 28.14% : 0.049498s : 1: renormalize.infer 0.30% : 0.000520s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000045s : 1: rewriter_after_opt_a 0.05% : 0.000096s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000087s : 1: symbol_engine_optimizer 0.05% : 0.000086s : 1: tuple_transform 3.45% : 0.006074s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:03.416.475 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:03.416.796 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0903116, [21] [bootstrap]: 0.00040855 [type_inference]: 0.078173 [event_method]: 1.979e-05 [auto_monad]: 6.506e-05 [graph_reusing]: 6.44999e-06 [inline]: 2.95002e-06 [add_attr]: 0.00343118, [1] [add_attr_with_inline]: 0.00342092, [1] [Cycle 1]: 8.845e-05, [2] [tag_attr]: 1.999e-05 [meta_addattr_fg_expand]: 5.87999e-06 [parallel-infer-symbol]: 3.56001e-06 [pre_auto_parallel]: 3.927e-05 [insert-virtual-dataset]: 2.31998e-06 [parallel-infer-symbol-second]: 7.60017e-07 [dataset_repeat_opt]: 2.16e-06 [pipeline_split]: 1.71e-06 [optimize]: 0.00689424, [53] [py_interpret_to_execute]: 3.187e-05 [rewriter_before_opt_a]: 9.134e-05 [opt_a]: 0.003574, [2] [Cycle 1]: 0.00266449, [45] [expand_dump_flag]: 3.24001e-06 [switch_simplify]: 4.364e-05 [loop_unroll]: 3.029e-05 [a_1]: 0.00090602 [with_stream_mark]: 2.487e-05 [recompute_prepare]: 1.281e-05 [updatestate_depend_eliminate]: 4.27e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 3.29001e-06 [parameter_eliminate]: 2.13002e-06 [a_2]: 0.0001181 [accelerated_algorithm]: 7.25998e-06 [shard]: 2.19999e-06 [meta_shard_fg_expand]: 2.10002e-06 [shard_inline]: 6.68998e-06 [merge_send_recv]: 8.92e-06 [auto_parallel]: 7.82e-06 [parallel]: 1.944e-05 [flash_sp]: 9.56e-06 [merge_comm]: 3.98999e-06 [allreduce_fusion]: 3.83001e-06 [matmul_add_comm_reduction]: 1.015e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 8.67e-06 [virtual_dataset]: 6.83e-06 [get_grad_eliminate_]: 6.46999e-06 [virtual_output]: 6.43e-06 [merge_forward]: 3.73001e-06 [cell_reuse_recompute_pass]: 1.55001e-06 [offload_activation]: 1.094e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.614e-05 [merge_recompute_call_nodes]: 1.79e-06 [before_grad]: 1.123e-05 [set_forward_comm_id_for_comm_node_pass]: 4.27998e-06 [meta_fg_expand]: 3.18e-06 [flash_sp_send_recv_attached]: 3.18e-06 [receive_attached]: 2.58e-06 [after_resolve]: 1.255e-05 [a_after_grad]: 9.80002e-06 [renormalize]: 0.00077222 [add_forward_monad_depend]: 5.82001e-06 [auto_monad_grad]: 2.21e-06 [auto_monad_eliminator]: 1.758e-05 [cse]: 2.983e-05 [a_3]: 6.69e-05 [Cycle 2]: 0.00089382, [45] [expand_dump_flag]: 1.71e-06 [switch_simplify]: 8.05e-06 [loop_unroll]: 6.16998e-06 [a_1]: 0.00014527 [with_stream_mark]: 1.552e-05 [recompute_prepare]: 7.15003e-06 [updatestate_depend_eliminate]: 3.28998e-06 [updatestate_assign_eliminate]: 2.98e-06 [updatestate_loads_eliminate]: 2.38998e-06 [parameter_eliminate]: 1.82999e-06 [a_2]: 0.00010584 [accelerated_algorithm]: 6.64001e-06 [shard]: 1.71998e-06 [meta_shard_fg_expand]: 1.84e-06 [shard_inline]: 6.06e-06 [merge_send_recv]: 6.45002e-06 [auto_parallel]: 8.08001e-06 [parallel]: 5.91998e-06 [flash_sp]: 4.03001e-06 [merge_comm]: 3.61001e-06 [allreduce_fusion]: 3.83999e-06 [matmul_add_comm_reduction]: 6.97002e-06 [allreduce_slice_to_reducescatter]: 5.3001e-07 [virtual_shard_identity]: 7.53e-06 [virtual_dataset]: 6.11998e-06 [get_grad_eliminate_]: 6.58e-06 [virtual_output]: 5.98998e-06 [merge_forward]: 4.03999e-06 [cell_reuse_recompute_pass]: 1.76e-06 [offload_activation]: 8.17e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.423e-05 [merge_recompute_call_nodes]: 1.22999e-06 [before_grad]: 1.061e-05 [set_forward_comm_id_for_comm_node_pass]: 3.76999e-06 [meta_fg_expand]: 2.14e-06 [flash_sp_send_recv_attached]: 1.54e-06 [receive_attached]: 1.74998e-06 [after_resolve]: 1.288e-05 [a_after_grad]: 1.018e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.90001e-06 [auto_monad_grad]: 1.64e-06 [auto_monad_eliminator]: 1.041e-05 [cse]: 1.753e-05 [a_3]: 5.21e-05 [py_interpret_to_execute_after_opt_a]: 1.583e-05 [slice_cell_reuse_recomputed_activation]: 5.50001e-06 [rewriter_after_opt_a]: 4.482e-05 [convert_after_rewriter]: 9.94001e-06 [order_py_execute_after_rewriter]: 8.31002e-06 [mutable_eliminate]: 0.00064817 [opt_b]: 0.0003295, [1] [Cycle 1]: 0.00031889, [7] [b_1]: 0.0002136 [b_2]: 8.02e-06 [updatestate_depend_eliminate]: 9.36e-06 [updatestate_assign_eliminate]: 2.88998e-06 [updatestate_loads_eliminate]: 2.83998e-06 [renormalize]: 7.60017e-07 [cse]: 2.267e-05 [optimize_parallel_all_gather_comm]: 2.179e-05 [overlap_param_gather]: 5.39e-06 [cconv]: 3.622e-05 [loop_unroll]: 0.00121096 [opt_after_cconv]: 0.0001459, [1] [Cycle 1]: 0.00013505, [7] [c_1]: 3.206e-05 [parameter_eliminate]: 5.87999e-06 [updatestate_depend_eliminate]: 9.24e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 2.63e-06 [cse]: 2.365e-05 [renormalize]: 1.08001e-06 [remove_dup_value]: 1.746e-05 [tuple_transform]: 9.726e-05, [1] [Cycle 1]: 8.991e-05, [4] [d_1]: 4.981e-05 [none_parameter_eliminate]: 1.67001e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 7.55e-06 [partial_unused_args_eliminate]: 4.71002e-06 [add_recomputation]: 5.651e-05 [cse_after_recomputation]: 2.819e-05, [1] [Cycle 1]: 2.089e-05, [1] [cse]: 1.108e-05 [environ_conv]: 9.03002e-06 [swap_dp_allreduce_reducescatter]: 8.65001e-06 [bias_add_comm_swap]: 5.69999e-06 [label_micro_interleaved_index]: 7.85e-06 [label_fine_grained_interleaved_index]: 5.46998e-06 [merge_cast_opt]: 3.76999e-06 [slice_recompute_activation]: 4.64002e-06 [micro_interleaved_order_control]: 4.59998e-06 [assign_add_opt]: 3.82002e-06 [ForceFp32Comm]: 3.26001e-06 [remove_cast_before_assign_add]: 3.4e-06 [full_micro_interleaved_order_control]: 4.91002e-06 [reorder_send_recv_between_fp_bp]: 5.81e-06 [comm_op_add_attrs]: 3.57997e-06 [add_comm_op_reuse_tag]: 3.20998e-06 [interleave_split_concat_branches]: 3.6e-06 [interleave_parallel_branches]: 3.35e-06 [overlap_opt_shard_in_pipeline]: 3.37002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.4e-06 [control_data_broadcast_order]: 1.731e-05 [grouped_pairwise_exchange_alltoall]: 4.15e-06 [offloading_packed_experts]: 7.38999e-06 [overlap_recompute_and_grad_model_parallel]: 7.29001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.57997e-06 [overlap_recompute_allgather_and_fa_grad]: 3.71999e-06 [overlap_recompute_comm]: 4.87e-06 [overlap_grad_ring_attention]: 6.86999e-06 [overlap_grad_flash_sp]: 2.462e-05 [begin_end_overlap_inline]: 3.09999e-06 [split_matmul_comm_elemetwise]: 5.49e-06 [split_layernorm_comm]: 4.29002e-06 [handle_group_info]: 3.25e-06 [symbol_engine_optimizer]: 0.00010048, [1] [Cycle 1]: 9.332e-05, [6] [build]: 4.48999e-06 [elim_shapecalc]: 1.063e-05 [elim_not_effective]: 1.311e-05 [opt_reshape]: 7.18998e-06 [fold_const_symbol]: 1.039e-05 [renormalize]: 1.70025e-07 [detach_backward]: 4.1e-06 [pipeline_parallel_scheduler]: 2.01e-06 [auto_monad_reorder]: 2.088e-05 [get_jit_bprop_graph]: 2.59001e-06 [rewriter_after_jit_bprop_graph]: 5.36002e-06 [opt_after_jit_grad]: 0.00058046 [validate]: 4.265e-05 Sums bootstrap : 0.000409s : 0.48% type_inference : 0.078173s : 91.93% event_method : 0.000020s : 0.02% auto_monad : 0.000065s : 0.08% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000039s : 0.05% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000032s : 0.04% optimize.rewriter_before_opt_a : 0.000091s : 0.11% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000052s : 0.06% optimize.opt_a.loop_unroll : 0.000036s : 0.04% optimize.opt_a.a_1 : 0.001051s : 1.24% optimize.opt_a.with_stream_mark : 0.000040s : 0.05% optimize.opt_a.recompute_prepare : 0.000020s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000224s : 0.26% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000015s : 0.02% optimize.opt_a.auto_parallel : 0.000016s : 0.02% optimize.opt_a.parallel : 0.000025s : 0.03% optimize.opt_a.flash_sp : 0.000014s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.02% optimize.opt_a.virtual_dataset : 0.000013s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.02% optimize.opt_a.virtual_output : 0.000012s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000019s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.03% optimize.opt_a.a_after_grad : 0.000020s : 0.02% optimize.opt_a.renormalize : 0.000772s : 0.91% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.03% optimize.opt_a.cse : 0.000047s : 0.06% optimize.opt_a.a_3 : 0.000119s : 0.14% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.01% optimize.rewriter_after_opt_a : 0.000045s : 0.05% optimize.convert_after_rewriter : 0.000010s : 0.01% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000648s : 0.76% optimize.opt_b.b_1 : 0.000214s : 0.25% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000023s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.03% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000036s : 0.04% optimize.loop_unroll : 0.001211s : 1.42% optimize.opt_after_cconv.c_1 : 0.000032s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.02% optimize.tuple_transform.d_1 : 0.000050s : 0.06% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000057s : 0.07% optimize.cse_after_recomputation.cse : 0.000011s : 0.01% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000003s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000003s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000025s : 0.03% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.02% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000580s : 0.68% validate : 0.000043s : 0.05% Time group info: ------[substitution.] 0.000432 34 39.78% : 0.000172s : 6: substitution.arithmetic_simplify 0.43% : 0.000002s : 2: substitution.elim_not_effective 0.31% : 0.000001s : 2: substitution.fold_const_symbol 1.58% : 0.000007s : 4: substitution.graph_param_transform 50.27% : 0.000217s : 4: substitution.inline 1.16% : 0.000005s : 4: substitution.j_node_and_user_rematch 1.09% : 0.000005s : 4: substitution.remove_not_recompute_node 1.45% : 0.000006s : 4: substitution.replace_old_param 3.93% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.078113 2 99.04% : 0.077366s : 1: type_inference.infer 0.96% : 0.000748s : 1: type_inference.specialize ------[replace.] 0.000074 8 57.55% : 0.000043s : 4: replace.inline 42.45% : 0.000031s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000230 8 93.46% : 0.000215s : 4: match.inline 6.54% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000220 1278 0.84% : 0.000002s : 13: predicate.accumulaten_eliminater 0.65% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 8: predicate.addn_check_dump 1.16% : 0.000003s : 13: predicate.addn_zero_filter 0.80% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.45% : 0.000005s : 21: predicate.arithmetic_simplify 0.98% : 0.000002s : 13: predicate.cast_eliminate 0.67% : 0.000001s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.20% : 0.000000s : 4: predicate.const_output_eliminate 0.56% : 0.000001s : 8: predicate.depend_value_elim 0.87% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.00% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.14% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.16% : 0.000000s : 4: predicate.elim_not_effective 0.59% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.06% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_depend_swap 1.84% : 0.000004s : 25: predicate.environ_get_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.40% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.39% : 0.000005s : 21: predicate.float_depend_g_call 0.65% : 0.000001s : 8: predicate.float_environ_get_switch 0.73% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.91% : 0.000002s : 8: predicate.get_grad_eliminate 0.26% : 0.000001s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.51% : 0.000014s : 58: predicate.inline 0.81% : 0.000002s : 8: predicate.inline_without_move 0.41% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.78% : 0.000002s : 8: predicate.less_batch_normalization 1.70% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.35% : 0.000005s : 38: predicate.load_eliminater 1.49% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.22% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.61% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 8: predicate.merge_addn 0.76% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 13: predicate.minmaximum_grad 1.32% : 0.000003s : 4: predicate.mutable_eliminate 0.33% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 1.86% : 0.000004s : 21: predicate.partial_defer_inline 1.55% : 0.000003s : 21: predicate.partial_eliminate 0.81% : 0.000002s : 13: predicate.print_const_string_wrapper 0.77% : 0.000002s : 8: predicate.reduce_all_const_elim 1.05% : 0.000002s : 13: predicate.reduce_eliminate 2.44% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 8: predicate.remove_not_recompute_node 1.33% : 0.000003s : 25: predicate.replace_applicator 0.50% : 0.000001s : 8: predicate.replace_old_param 0.46% : 0.000001s : 4: predicate.reset_defer_inline 0.91% : 0.000002s : 13: predicate.reshape_eliminate 0.64% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 4: predicate.row_tensor_eliminate 0.88% : 0.000002s : 8: predicate.same_eliminate 0.57% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 8: predicate.shard_identity_eliminate 0.70% : 0.000002s : 8: predicate.special_op_eliminate 0.66% : 0.000001s : 8: predicate.specialize_transform 0.89% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.55% : 0.000003s : 21: predicate.switch_defer_inline 1.99% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.15% : 0.000011s : 67: predicate.switch_simplify 1.07% : 0.000002s : 13: predicate.tile_eliminate 0.88% : 0.000002s : 13: predicate.transpose_eliminate 1.38% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.44% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.38% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.68% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.15% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.65% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.19% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.90% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 4: predicate.value_based_eliminate 0.64% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000580 11 53.05% : 0.000308s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.95% : 0.000272s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.103069 192 0.01% : 0.000006s : 1: ForceFp32Comm 3.34% : 0.003442s : 1: add_attr 3.32% : 0.003425s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.06% : 0.000061s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.07% : 0.000074s : 1: auto_monad 0.03% : 0.000029s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.44% : 0.000450s : 1: bootstrap 0.04% : 0.000039s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.02% : 0.000021s : 1: control_data_broadcast_order 0.01% : 0.000013s : 1: convert_after_rewriter 0.03% : 0.000031s : 1: cse_after_recomputation 0.01% : 0.000009s : 1: dataset_repeat_opt 0.02% : 0.000023s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.03% : 0.000030s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 1.18% : 0.001220s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 0.64% : 0.000656s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000019s : 1: opt.transform.mutable_eliminate 1.46% : 0.001503s : 78: opt.transform.opt_a 0.03% : 0.000030s : 1: opt.transform.opt_after_cconv 0.03% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000105s : 28: opt.transform.opt_b 0.05% : 0.000055s : 2: opt.transform.opt_trans_graph 0.04% : 0.000038s : 4: opt.transform.symbol_engine_opt 3.47% : 0.003577s : 1: opt_a 0.15% : 0.000150s : 1: opt_after_cconv 0.57% : 0.000592s : 1: opt_after_jit_grad 0.32% : 0.000334s : 1: opt_b 7.02% : 0.007238s : 1: optimize 0.02% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.03% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.05% : 0.000046s : 1: pre_auto_parallel 0.03% : 0.000036s : 1: py_interpret_to_execute 0.02% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.02% : 0.000021s : 1: remove_dup_value 0.41% : 0.000419s : 1: renormalize.infer 0.33% : 0.000344s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000049s : 1: rewriter_after_opt_a 0.09% : 0.000095s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.10% : 0.000103s : 1: symbol_engine_optimizer 0.10% : 0.000100s : 1: tuple_transform 75.89% : 0.078220s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:04.288.047 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0259855, [21] [bootstrap]: 0.00042287 [type_inference]: 0.0163464 [event_method]: 1.867e-05 [auto_monad]: 6.212e-05 [graph_reusing]: 6.58e-06 [inline]: 2.49999e-06 [add_attr]: 0.00343894, [1] [add_attr_with_inline]: 0.00342755, [1] [Cycle 1]: 6.709e-05, [2] [tag_attr]: 2.169e-05 [meta_addattr_fg_expand]: 5.58002e-06 [parallel-infer-symbol]: 3.41001e-06 [pre_auto_parallel]: 3.805e-05 [insert-virtual-dataset]: 2.24999e-06 [parallel-infer-symbol-second]: 7.49977e-07 [dataset_repeat_opt]: 2.19001e-06 [pipeline_split]: 1.63002e-06 [optimize]: 0.00496434, [53] [py_interpret_to_execute]: 2.794e-05 [rewriter_before_opt_a]: 8.255e-05 [opt_a]: 0.0029111, [2] [Cycle 1]: 0.00214848, [45] [expand_dump_flag]: 3.55e-06 [switch_simplify]: 4.356e-05 [loop_unroll]: 3.158e-05 [a_1]: 0.00065361 [with_stream_mark]: 1.924e-05 [recompute_prepare]: 1.055e-05 [updatestate_depend_eliminate]: 4.01001e-06 [updatestate_assign_eliminate]: 3.26001e-06 [updatestate_loads_eliminate]: 2.91999e-06 [parameter_eliminate]: 1.97999e-06 [a_2]: 8.939e-05 [accelerated_algorithm]: 7.95e-06 [shard]: 2.12999e-06 [meta_shard_fg_expand]: 2.01e-06 [shard_inline]: 6.36998e-06 [merge_send_recv]: 9.04e-06 [auto_parallel]: 6.88e-06 [parallel]: 1.921e-05 [flash_sp]: 8.67998e-06 [merge_comm]: 3.94997e-06 [allreduce_fusion]: 3.53e-06 [matmul_add_comm_reduction]: 9.31e-06 [allreduce_slice_to_reducescatter]: 1.20001e-06 [virtual_shard_identity]: 7.88001e-06 [virtual_dataset]: 7.03998e-06 [get_grad_eliminate_]: 6.11e-06 [virtual_output]: 6.13002e-06 [merge_forward]: 3.8e-06 [cell_reuse_recompute_pass]: 1.07e-06 [offload_activation]: 1.037e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.365e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.076e-05 [set_forward_comm_id_for_comm_node_pass]: 3.78001e-06 [meta_fg_expand]: 2.73e-06 [flash_sp_send_recv_attached]: 2.59999e-06 [receive_attached]: 2.56e-06 [after_resolve]: 1.268e-05 [a_after_grad]: 1.114e-05 [renormalize]: 0.00074039 [add_forward_monad_depend]: 6.88998e-06 [auto_monad_grad]: 2.70002e-06 [auto_monad_eliminator]: 1.669e-05 [cse]: 2.945e-05 [a_3]: 5.081e-05 [Cycle 2]: 0.00075211, [45] [expand_dump_flag]: 1.40001e-06 [switch_simplify]: 8.42998e-06 [loop_unroll]: 6.53e-06 [a_1]: 0.00019806 [with_stream_mark]: 1.381e-05 [recompute_prepare]: 7.36001e-06 [updatestate_depend_eliminate]: 3.24001e-06 [updatestate_assign_eliminate]: 2.93998e-06 [updatestate_loads_eliminate]: 2.44999e-06 [parameter_eliminate]: 2.01e-06 [a_2]: 9.663e-05 [accelerated_algorithm]: 6.21998e-06 [shard]: 1.42999e-06 [meta_shard_fg_expand]: 1.45001e-06 [shard_inline]: 5.98998e-06 [merge_send_recv]: 5.57001e-06 [auto_parallel]: 7.3e-06 [parallel]: 5.50001e-06 [flash_sp]: 3.63e-06 [merge_comm]: 3.63999e-06 [allreduce_fusion]: 3.26001e-06 [matmul_add_comm_reduction]: 6.36998e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 6.88e-06 [virtual_dataset]: 5.79e-06 [get_grad_eliminate_]: 7.57002e-06 [virtual_output]: 5.68997e-06 [merge_forward]: 2.93e-06 [cell_reuse_recompute_pass]: 2.34999e-06 [offload_activation]: 8.27e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.193e-05 [merge_recompute_call_nodes]: 1.22999e-06 [before_grad]: 9.69e-06 [set_forward_comm_id_for_comm_node_pass]: 3.35e-06 [meta_fg_expand]: 1.97001e-06 [flash_sp_send_recv_attached]: 9.80013e-07 [receive_attached]: 1.84e-06 [after_resolve]: 1.045e-05 [a_after_grad]: 9.56e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.55999e-06 [auto_monad_grad]: 1.00001e-06 [auto_monad_eliminator]: 7.74002e-06 [cse]: 1.468e-05 [a_3]: 3.66e-05 [py_interpret_to_execute_after_opt_a]: 1.134e-05 [slice_cell_reuse_recomputed_activation]: 2.55002e-06 [rewriter_after_opt_a]: 3.685e-05 [convert_after_rewriter]: 6.77002e-06 [order_py_execute_after_rewriter]: 5.81e-06 [mutable_eliminate]: 0.00055525 [opt_b]: 0.00020625, [1] [Cycle 1]: 0.00019961, [7] [b_1]: 0.00012616 [b_2]: 7.78001e-06 [updatestate_depend_eliminate]: 7.14001e-06 [updatestate_assign_eliminate]: 2.49001e-06 [updatestate_loads_eliminate]: 2.47001e-06 [renormalize]: 8.89995e-07 [cse]: 1.772e-05 [optimize_parallel_all_gather_comm]: 1.675e-05 [overlap_param_gather]: 1.91e-06 [cconv]: 2.589e-05 [loop_unroll]: 0.0004218 [opt_after_cconv]: 0.00010076, [1] [Cycle 1]: 9.507e-05, [7] [c_1]: 3.079e-05 [parameter_eliminate]: 3.47002e-06 [updatestate_depend_eliminate]: 5.57999e-06 [updatestate_assign_eliminate]: 2.46998e-06 [updatestate_loads_eliminate]: 2.21998e-06 [cse]: 1.632e-05 [renormalize]: 5.09986e-07 [remove_dup_value]: 1.327e-05 [tuple_transform]: 7.657e-05, [1] [Cycle 1]: 7.22e-05, [4] [d_1]: 4.504e-05 [none_parameter_eliminate]: 1.78002e-06 [renormalize]: 2.20025e-07 [switch_simplify]: 6.66e-06 [partial_unused_args_eliminate]: 1.65001e-06 [add_recomputation]: 4.804e-05 [cse_after_recomputation]: 2.15e-05, [1] [Cycle 1]: 1.691e-05, [1] [cse]: 1.062e-05 [environ_conv]: 5.19e-06 [swap_dp_allreduce_reducescatter]: 5.09e-06 [bias_add_comm_swap]: 3.21001e-06 [label_micro_interleaved_index]: 4.08001e-06 [label_fine_grained_interleaved_index]: 2.91999e-06 [merge_cast_opt]: 1.52001e-06 [slice_recompute_activation]: 2.04999e-06 [micro_interleaved_order_control]: 2.07999e-06 [assign_add_opt]: 1.17e-06 [ForceFp32Comm]: 8.30012e-07 [remove_cast_before_assign_add]: 1.16997e-06 [full_micro_interleaved_order_control]: 2.19999e-06 [reorder_send_recv_between_fp_bp]: 2.93998e-06 [comm_op_add_attrs]: 1.30999e-06 [add_comm_op_reuse_tag]: 1.30001e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.18001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.04e-06 [control_data_broadcast_order]: 1.238e-05 [grouped_pairwise_exchange_alltoall]: 1.72999e-06 [offloading_packed_experts]: 3.65e-06 [overlap_recompute_and_grad_model_parallel]: 4.75999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.22001e-06 [overlap_grad_ring_attention]: 4.17e-06 [overlap_grad_flash_sp]: 2.03e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.47001e-06 [split_layernorm_comm]: 1.70001e-06 [handle_group_info]: 9.10019e-07 [symbol_engine_optimizer]: 7.391e-05, [1] [Cycle 1]: 6.984e-05, [6] [build]: 3.48e-06 [elim_shapecalc]: 9.39e-06 [elim_not_effective]: 1.243e-05 [opt_reshape]: 6.61e-06 [fold_const_symbol]: 1.015e-05 [renormalize]: 2.30008e-07 [detach_backward]: 2.61999e-06 [pipeline_parallel_scheduler]: 1.48002e-06 [auto_monad_reorder]: 1.583e-05 [get_jit_bprop_graph]: 1.57001e-06 [rewriter_after_jit_bprop_graph]: 3.98001e-06 [opt_after_jit_grad]: 0.00045353 [validate]: 4.168e-05 Sums bootstrap : 0.000423s : 1.96% type_inference : 0.016346s : 75.73% event_method : 0.000019s : 0.09% auto_monad : 0.000062s : 0.29% graph_reusing : 0.000007s : 0.03% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.10% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000038s : 0.18% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.13% optimize.rewriter_before_opt_a : 0.000083s : 0.38% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000052s : 0.24% optimize.opt_a.loop_unroll : 0.000038s : 0.18% optimize.opt_a.a_1 : 0.000852s : 3.95% optimize.opt_a.with_stream_mark : 0.000033s : 0.15% optimize.opt_a.recompute_prepare : 0.000018s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000186s : 0.86% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.07% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.02% optimize.opt_a.shard_inline : 0.000012s : 0.06% optimize.opt_a.merge_send_recv : 0.000015s : 0.07% optimize.opt_a.auto_parallel : 0.000014s : 0.07% optimize.opt_a.parallel : 0.000025s : 0.11% optimize.opt_a.flash_sp : 0.000012s : 0.06% optimize.opt_a.merge_comm : 0.000008s : 0.04% optimize.opt_a.allreduce_fusion : 0.000007s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.07% optimize.opt_a.virtual_dataset : 0.000013s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.06% optimize.opt_a.virtual_output : 0.000012s : 0.05% optimize.opt_a.merge_forward : 0.000007s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000020s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.03% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000023s : 0.11% optimize.opt_a.a_after_grad : 0.000021s : 0.10% optimize.opt_a.renormalize : 0.000740s : 3.43% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.11% optimize.opt_a.cse : 0.000044s : 0.20% optimize.opt_a.a_3 : 0.000087s : 0.40% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000037s : 0.17% optimize.convert_after_rewriter : 0.000007s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.03% optimize.mutable_eliminate : 0.000555s : 2.57% optimize.opt_b.b_1 : 0.000126s : 0.58% optimize.opt_b.b_2 : 0.000008s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000018s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.08% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000026s : 0.12% optimize.loop_unroll : 0.000422s : 1.95% optimize.opt_after_cconv.c_1 : 0.000031s : 0.14% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000016s : 0.08% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000013s : 0.06% optimize.tuple_transform.d_1 : 0.000045s : 0.21% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000048s : 0.22% optimize.cse_after_recomputation.cse : 0.000011s : 0.05% optimize.environ_conv : 0.000005s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000012s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000020s : 0.09% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000016s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000454s : 2.10% validate : 0.000042s : 0.19% Time group info: ------[substitution.] 0.000227 34 15.91% : 0.000036s : 6: substitution.arithmetic_simplify 0.88% : 0.000002s : 2: substitution.elim_not_effective 0.60% : 0.000001s : 2: substitution.fold_const_symbol 2.91% : 0.000007s : 4: substitution.graph_param_transform 66.96% : 0.000152s : 4: substitution.inline 1.66% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.30% : 0.000005s : 4: substitution.remove_not_recompute_node 2.06% : 0.000005s : 4: substitution.replace_old_param 6.73% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.016281 2 95.56% : 0.015559s : 1: type_inference.infer 4.44% : 0.000723s : 1: type_inference.specialize ------[replace.] 0.000061 8 63.22% : 0.000039s : 4: replace.inline 36.78% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000163 8 91.78% : 0.000149s : 4: match.inline 8.22% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000206 1278 0.93% : 0.000002s : 13: predicate.accumulaten_eliminater 0.82% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 0.93% : 0.000002s : 13: predicate.addn_zero_filter 0.83% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.39% : 0.000005s : 21: predicate.arithmetic_simplify 0.91% : 0.000002s : 13: predicate.cast_eliminate 0.60% : 0.000001s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000000s : 4: predicate.const_output_eliminate 0.56% : 0.000001s : 8: predicate.depend_value_elim 0.89% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.97% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.82% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.13% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 4: predicate.elim_not_effective 0.42% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.29% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.11% : 0.000002s : 17: predicate.environ_get_depend_swap 1.73% : 0.000004s : 25: predicate.environ_get_eliminate 1.16% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.41% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.65% : 0.000005s : 21: predicate.float_depend_g_call 0.55% : 0.000001s : 8: predicate.float_environ_get_switch 0.92% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.67% : 0.000001s : 8: predicate.get_grad_eliminate 0.30% : 0.000001s : 4: predicate.graph_param_transform 0.62% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.36% : 0.000013s : 58: predicate.inline 0.91% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.96% : 0.000002s : 8: predicate.less_batch_normalization 1.71% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.46% : 0.000005s : 38: predicate.load_eliminater 0.90% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.36% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.57% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.62% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.61% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 13: predicate.minmaximum_grad 1.15% : 0.000002s : 4: predicate.mutable_eliminate 0.32% : 0.000001s : 4: predicate.opt_reshape 0.37% : 0.000001s : 4: predicate.parallel_virtual_node 1.79% : 0.000004s : 21: predicate.partial_defer_inline 1.69% : 0.000003s : 21: predicate.partial_eliminate 0.84% : 0.000002s : 13: predicate.print_const_string_wrapper 0.53% : 0.000001s : 8: predicate.reduce_all_const_elim 1.16% : 0.000002s : 13: predicate.reduce_eliminate 2.47% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.53% : 0.000001s : 8: predicate.remove_not_recompute_node 1.46% : 0.000003s : 25: predicate.replace_applicator 0.50% : 0.000001s : 8: predicate.replace_old_param 0.26% : 0.000001s : 4: predicate.reset_defer_inline 1.05% : 0.000002s : 13: predicate.reshape_eliminate 0.62% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 4: predicate.row_tensor_eliminate 0.90% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.77% : 0.000002s : 8: predicate.shard_identity_eliminate 0.71% : 0.000001s : 8: predicate.special_op_eliminate 0.68% : 0.000001s : 8: predicate.specialize_transform 0.88% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.52% : 0.000003s : 21: predicate.switch_defer_inline 2.04% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.23% : 0.000011s : 67: predicate.switch_simplify 0.88% : 0.000002s : 13: predicate.tile_eliminate 0.97% : 0.000002s : 13: predicate.transpose_eliminate 1.47% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.17% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.40% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.20% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.80% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.40% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.99% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 0.72% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000621 11 56.00% : 0.000348s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.00% : 0.000273s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.036576 192 0.01% : 0.000004s : 1: ForceFp32Comm 9.42% : 0.003445s : 1: add_attr 9.38% : 0.003432s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.14% : 0.000052s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.18% : 0.000067s : 1: auto_monad 0.05% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.24% : 0.000453s : 1: bootstrap 0.08% : 0.000029s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000015s : 1: control_data_broadcast_order 0.03% : 0.000010s : 1: convert_after_rewriter 0.07% : 0.000024s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.02% : 0.000008s : 1: environ_conv 0.07% : 0.000025s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.17% : 0.000430s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.54% : 0.000563s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.04% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000015s : 1: opt.transform.mutable_eliminate 3.58% : 0.001310s : 78: opt.transform.opt_a 0.08% : 0.000030s : 1: opt.transform.opt_after_cconv 0.07% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.28% : 0.000102s : 28: opt.transform.opt_b 0.14% : 0.000050s : 2: opt.transform.opt_trans_graph 0.10% : 0.000035s : 4: opt.transform.symbol_engine_opt 7.97% : 0.002914s : 1: opt_a 0.29% : 0.000105s : 1: opt_after_cconv 1.26% : 0.000462s : 1: opt_after_jit_grad 0.57% : 0.000210s : 1: opt_b 13.59% : 0.004969s : 1: optimize 0.06% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.06% : 0.000024s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000004s : 1: partial_unused_args_eliminate 0.01% : 0.000004s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.12% : 0.000042s : 1: pre_auto_parallel 0.09% : 0.000032s : 1: py_interpret_to_execute 0.04% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000017s : 1: remove_dup_value 1.07% : 0.000390s : 1: renormalize.infer 0.93% : 0.000341s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000041s : 1: rewriter_after_opt_a 0.24% : 0.000087s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.21% : 0.000077s : 1: symbol_engine_optimizer 0.22% : 0.000080s : 1: tuple_transform 44.74% : 0.016365s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:05.107.526 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:05.107.804 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.069069, [21] [bootstrap]: 0.0004542 [type_inference]: 0.0585833 [event_method]: 2.064e-05 [auto_monad]: 6.511e-05 [graph_reusing]: 6.58e-06 [inline]: 2.12999e-06 [add_attr]: 0.00324548, [1] [add_attr_with_inline]: 0.00323558, [1] [Cycle 1]: 7.86e-05, [2] [tag_attr]: 1.983e-05 [meta_addattr_fg_expand]: 5.82001e-06 [parallel-infer-symbol]: 4.07998e-06 [pre_auto_parallel]: 3.418e-05 [insert-virtual-dataset]: 2.53e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 2.06e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00547913, [53] [py_interpret_to_execute]: 3.023e-05 [rewriter_before_opt_a]: 8.538e-05 [opt_a]: 0.00313345, [2] [Cycle 1]: 0.00222726, [45] [expand_dump_flag]: 2.94999e-06 [switch_simplify]: 4.231e-05 [loop_unroll]: 3.078e-05 [a_1]: 0.00063901 [with_stream_mark]: 1.851e-05 [recompute_prepare]: 9.66e-06 [updatestate_depend_eliminate]: 3.75e-06 [updatestate_assign_eliminate]: 3.45e-06 [updatestate_loads_eliminate]: 2.94999e-06 [parameter_eliminate]: 1.93997e-06 [a_2]: 0.00011476 [accelerated_algorithm]: 7.04001e-06 [shard]: 1.77999e-06 [meta_shard_fg_expand]: 1.67999e-06 [shard_inline]: 6.62002e-06 [merge_send_recv]: 8.77e-06 [auto_parallel]: 6.23998e-06 [parallel]: 1.979e-05 [flash_sp]: 9.27999e-06 [merge_comm]: 4.3e-06 [allreduce_fusion]: 3.58999e-06 [matmul_add_comm_reduction]: 9.13002e-06 [allreduce_slice_to_reducescatter]: 9.99979e-07 [virtual_shard_identity]: 8.34998e-06 [virtual_dataset]: 6.66e-06 [get_grad_eliminate_]: 6.24001e-06 [virtual_output]: 7.31001e-06 [merge_forward]: 3.86999e-06 [cell_reuse_recompute_pass]: 1.19e-06 [offload_activation]: 9.89999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.445e-05 [merge_recompute_call_nodes]: 1.57999e-06 [before_grad]: 1.134e-05 [set_forward_comm_id_for_comm_node_pass]: 4.03001e-06 [meta_fg_expand]: 2.76e-06 [flash_sp_send_recv_attached]: 3.21001e-06 [receive_attached]: 2.40002e-06 [after_resolve]: 1.134e-05 [a_after_grad]: 9.76998e-06 [renormalize]: 0.00067649 [add_forward_monad_depend]: 5.62001e-06 [auto_monad_grad]: 2.82002e-06 [auto_monad_eliminator]: 1.547e-05 [cse]: 2.732e-05 [a_3]: 5.997e-05 [Cycle 2]: 0.00089098, [45] [expand_dump_flag]: 1.40999e-06 [switch_simplify]: 8.3e-06 [loop_unroll]: 6.21998e-06 [a_1]: 0.00014188 [with_stream_mark]: 1.346e-05 [recompute_prepare]: 6.67002e-06 [updatestate_depend_eliminate]: 3.28e-06 [updatestate_assign_eliminate]: 2.76e-06 [updatestate_loads_eliminate]: 2.32999e-06 [parameter_eliminate]: 1.15999e-06 [a_2]: 0.00010467 [accelerated_algorithm]: 6.68e-06 [shard]: 1.30001e-06 [meta_shard_fg_expand]: 1.71e-06 [shard_inline]: 6.14999e-06 [merge_send_recv]: 5.37999e-06 [auto_parallel]: 5.70001e-06 [parallel]: 4.31002e-06 [flash_sp]: 3.51999e-06 [merge_comm]: 3.87998e-06 [allreduce_fusion]: 3.07002e-06 [matmul_add_comm_reduction]: 6.04001e-06 [allreduce_slice_to_reducescatter]: 4.10015e-07 [virtual_shard_identity]: 7.35e-06 [virtual_dataset]: 7.58001e-06 [get_grad_eliminate_]: 6.81999e-06 [virtual_output]: 7.1e-06 [merge_forward]: 3.38e-06 [cell_reuse_recompute_pass]: 2.02999e-06 [offload_activation]: 7.38e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.545e-05 [merge_recompute_call_nodes]: 1.09998e-06 [before_grad]: 1.115e-05 [set_forward_comm_id_for_comm_node_pass]: 4.17e-06 [meta_fg_expand]: 2.39001e-06 [flash_sp_send_recv_attached]: 1.19e-06 [receive_attached]: 1.82999e-06 [after_resolve]: 1.153e-05 [a_after_grad]: 9.74999e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.72001e-06 [auto_monad_grad]: 1.32999e-06 [auto_monad_eliminator]: 8.90999e-06 [cse]: 1.453e-05 [a_3]: 4.94e-05 [py_interpret_to_execute_after_opt_a]: 1.372e-05 [slice_cell_reuse_recomputed_activation]: 4.70001e-06 [rewriter_after_opt_a]: 4.308e-05 [convert_after_rewriter]: 1.058e-05 [order_py_execute_after_rewriter]: 7.71999e-06 [mutable_eliminate]: 0.00056661 [opt_b]: 0.00027318, [1] [Cycle 1]: 0.0002631, [7] [b_1]: 0.00016575 [b_2]: 8.05e-06 [updatestate_depend_eliminate]: 6.54999e-06 [updatestate_assign_eliminate]: 2.76999e-06 [updatestate_loads_eliminate]: 2.94999e-06 [renormalize]: 6.19999e-07 [cse]: 1.834e-05 [optimize_parallel_all_gather_comm]: 2.008e-05 [overlap_param_gather]: 5.37001e-06 [cconv]: 3.156e-05 [loop_unroll]: 0.00044925 [opt_after_cconv]: 0.00012607, [1] [Cycle 1]: 0.00011678, [7] [c_1]: 3.118e-05 [parameter_eliminate]: 3.43999e-06 [updatestate_depend_eliminate]: 5.51e-06 [updatestate_assign_eliminate]: 2.52001e-06 [updatestate_loads_eliminate]: 2.49999e-06 [cse]: 1.616e-05 [renormalize]: 4.59986e-07 [remove_dup_value]: 1.652e-05 [tuple_transform]: 9.1e-05, [1] [Cycle 1]: 8.411e-05, [4] [d_1]: 4.473e-05 [none_parameter_eliminate]: 1.64998e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 7.16999e-06 [partial_unused_args_eliminate]: 4.33999e-06 [add_recomputation]: 4.925e-05 [cse_after_recomputation]: 2.621e-05, [1] [Cycle 1]: 1.947e-05, [1] [cse]: 1.059e-05 [environ_conv]: 8.13999e-06 [swap_dp_allreduce_reducescatter]: 8.10999e-06 [bias_add_comm_swap]: 5.34e-06 [label_micro_interleaved_index]: 7.83999e-06 [label_fine_grained_interleaved_index]: 5.23002e-06 [merge_cast_opt]: 3.9e-06 [slice_recompute_activation]: 4.25999e-06 [micro_interleaved_order_control]: 4.89998e-06 [assign_add_opt]: 3.58e-06 [ForceFp32Comm]: 3.15002e-06 [remove_cast_before_assign_add]: 3.58999e-06 [full_micro_interleaved_order_control]: 4.45999e-06 [reorder_send_recv_between_fp_bp]: 5.49e-06 [comm_op_add_attrs]: 3.33998e-06 [add_comm_op_reuse_tag]: 3.24001e-06 [interleave_split_concat_branches]: 3.54002e-06 [interleave_parallel_branches]: 3.34001e-06 [overlap_opt_shard_in_pipeline]: 3.54002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.19002e-06 [control_data_broadcast_order]: 1.57e-05 [grouped_pairwise_exchange_alltoall]: 3.85e-06 [offloading_packed_experts]: 7.03998e-06 [overlap_recompute_and_grad_model_parallel]: 7.01999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.45998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.62002e-06 [overlap_recompute_comm]: 5.19e-06 [overlap_grad_ring_attention]: 6.66e-06 [overlap_grad_flash_sp]: 2.376e-05 [begin_end_overlap_inline]: 2.91e-06 [split_matmul_comm_elemetwise]: 4.57e-06 [split_layernorm_comm]: 3.88999e-06 [handle_group_info]: 3.63999e-06 [symbol_engine_optimizer]: 9.971e-05, [1] [Cycle 1]: 9.327e-05, [6] [build]: 4.07e-06 [elim_shapecalc]: 1.004e-05 [elim_not_effective]: 1.339e-05 [opt_reshape]: 8.32998e-06 [fold_const_symbol]: 1.021e-05 [renormalize]: 2.29978e-07 [detach_backward]: 4.43001e-06 [pipeline_parallel_scheduler]: 2.34001e-06 [auto_monad_reorder]: 2.037e-05 [get_jit_bprop_graph]: 2.06998e-06 [rewriter_after_jit_bprop_graph]: 5.21998e-06 [opt_after_jit_grad]: 0.00050179 [validate]: 3.982e-05 Sums bootstrap : 0.000454s : 0.71% type_inference : 0.058583s : 91.49% event_method : 0.000021s : 0.03% auto_monad : 0.000065s : 0.10% graph_reusing : 0.000007s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000034s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000030s : 0.05% optimize.rewriter_before_opt_a : 0.000085s : 0.13% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000051s : 0.08% optimize.opt_a.loop_unroll : 0.000037s : 0.06% optimize.opt_a.a_1 : 0.000781s : 1.22% optimize.opt_a.with_stream_mark : 0.000032s : 0.05% optimize.opt_a.recompute_prepare : 0.000016s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000219s : 0.34% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.02% optimize.opt_a.merge_send_recv : 0.000014s : 0.02% optimize.opt_a.auto_parallel : 0.000012s : 0.02% optimize.opt_a.parallel : 0.000024s : 0.04% optimize.opt_a.flash_sp : 0.000013s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.02% optimize.opt_a.virtual_dataset : 0.000014s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.02% optimize.opt_a.virtual_output : 0.000014s : 0.02% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000017s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.05% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000023s : 0.04% optimize.opt_a.a_after_grad : 0.000020s : 0.03% optimize.opt_a.renormalize : 0.000677s : 1.06% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.04% optimize.opt_a.cse : 0.000042s : 0.07% optimize.opt_a.a_3 : 0.000109s : 0.17% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000043s : 0.07% optimize.convert_after_rewriter : 0.000011s : 0.02% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000567s : 0.88% optimize.opt_b.b_1 : 0.000166s : 0.26% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000018s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.03% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000032s : 0.05% optimize.loop_unroll : 0.000449s : 0.70% optimize.opt_after_cconv.c_1 : 0.000031s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000016s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.03% optimize.tuple_transform.d_1 : 0.000045s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000004s : 0.01% optimize.add_recomputation : 0.000049s : 0.08% optimize.cse_after_recomputation.cse : 0.000011s : 0.02% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000005s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000004s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000004s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000024s : 0.04% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000502s : 0.78% validate : 0.000040s : 0.06% Time group info: ------[substitution.] 0.000215 34 15.38% : 0.000033s : 6: substitution.arithmetic_simplify 0.93% : 0.000002s : 2: substitution.elim_not_effective 0.64% : 0.000001s : 2: substitution.fold_const_symbol 2.85% : 0.000006s : 4: substitution.graph_param_transform 66.00% : 0.000142s : 4: substitution.inline 2.08% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.61% : 0.000006s : 4: substitution.remove_not_recompute_node 2.36% : 0.000005s : 4: substitution.replace_old_param 7.16% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.058523 2 98.80% : 0.057819s : 1: type_inference.infer 1.20% : 0.000704s : 1: type_inference.specialize ------[replace.] 0.000059 8 62.25% : 0.000037s : 4: replace.inline 37.75% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000153 8 91.16% : 0.000139s : 4: match.inline 8.84% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000206 1278 0.86% : 0.000002s : 13: predicate.accumulaten_eliminater 0.67% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 0.87% : 0.000002s : 13: predicate.addn_zero_filter 0.79% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.51% : 0.000005s : 21: predicate.arithmetic_simplify 0.99% : 0.000002s : 13: predicate.cast_eliminate 0.62% : 0.000001s : 8: predicate.check_bprop_eliminate 0.51% : 0.000001s : 8: predicate.compare_switch_simplify 0.20% : 0.000000s : 4: predicate.const_output_eliminate 0.92% : 0.000002s : 8: predicate.depend_value_elim 0.94% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.01% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.02% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.38% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_depend_swap 1.67% : 0.000003s : 25: predicate.environ_get_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.43% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.47% : 0.000005s : 21: predicate.float_depend_g_call 0.53% : 0.000001s : 8: predicate.float_environ_get_switch 0.78% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.62% : 0.000001s : 8: predicate.get_grad_eliminate 0.29% : 0.000001s : 4: predicate.graph_param_transform 0.59% : 0.000001s : 8: predicate.incorporate_call 0.51% : 0.000001s : 8: predicate.incorporate_call_switch 6.51% : 0.000013s : 58: predicate.inline 0.73% : 0.000002s : 8: predicate.inline_without_move 0.35% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.78% : 0.000002s : 8: predicate.less_batch_normalization 1.83% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.43% : 0.000005s : 38: predicate.load_eliminater 0.97% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.48% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.69% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.53% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.62% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 13: predicate.minmaximum_grad 1.12% : 0.000002s : 4: predicate.mutable_eliminate 0.42% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 1.77% : 0.000004s : 21: predicate.partial_defer_inline 1.63% : 0.000003s : 21: predicate.partial_eliminate 0.87% : 0.000002s : 13: predicate.print_const_string_wrapper 0.53% : 0.000001s : 8: predicate.reduce_all_const_elim 1.20% : 0.000002s : 13: predicate.reduce_eliminate 2.38% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.57% : 0.000001s : 8: predicate.remove_not_recompute_node 1.44% : 0.000003s : 25: predicate.replace_applicator 0.67% : 0.000001s : 8: predicate.replace_old_param 0.37% : 0.000001s : 4: predicate.reset_defer_inline 0.97% : 0.000002s : 13: predicate.reshape_eliminate 0.61% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 4: predicate.row_tensor_eliminate 0.84% : 0.000002s : 8: predicate.same_eliminate 0.75% : 0.000002s : 8: predicate.set_cell_output_no_recompute 0.68% : 0.000001s : 8: predicate.shard_identity_eliminate 0.77% : 0.000002s : 8: predicate.special_op_eliminate 0.68% : 0.000001s : 8: predicate.specialize_transform 0.88% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.52% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.51% : 0.000003s : 21: predicate.switch_defer_inline 2.02% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.24% : 0.000011s : 67: predicate.switch_simplify 0.84% : 0.000002s : 13: predicate.tile_eliminate 0.91% : 0.000002s : 13: predicate.transpose_eliminate 1.51% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.32% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.39% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.37% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.05% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.74% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.34% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.99% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.95% : 0.000002s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000570 11 55.17% : 0.000314s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.83% : 0.000256s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.079835 192 0.01% : 0.000006s : 1: ForceFp32Comm 4.08% : 0.003256s : 1: add_attr 4.06% : 0.003239s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.07% : 0.000053s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.09% : 0.000075s : 1: auto_monad 0.04% : 0.000029s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.62% : 0.000499s : 1: bootstrap 0.04% : 0.000035s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.02% : 0.000019s : 1: control_data_broadcast_order 0.02% : 0.000014s : 1: convert_after_rewriter 0.04% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000020s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.04% : 0.000032s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 0.57% : 0.000455s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 0.72% : 0.000573s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000016s : 1: opt.transform.mutable_eliminate 1.53% : 0.001221s : 78: opt.transform.opt_a 0.04% : 0.000030s : 1: opt.transform.opt_after_cconv 0.03% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.13% : 0.000103s : 28: opt.transform.opt_b 0.06% : 0.000050s : 2: opt.transform.opt_trans_graph 0.05% : 0.000038s : 4: opt.transform.symbol_engine_opt 3.93% : 0.003137s : 1: opt_a 0.16% : 0.000129s : 1: opt_after_cconv 0.64% : 0.000513s : 1: opt_after_jit_grad 0.35% : 0.000277s : 1: opt_b 7.28% : 0.005808s : 1: optimize 0.03% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000011s : 1: order_py_execute_after_rewriter 0.03% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000011s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.05% : 0.000042s : 1: pre_auto_parallel 0.04% : 0.000034s : 1: py_interpret_to_execute 0.02% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.02% : 0.000020s : 1: remove_dup_value 0.44% : 0.000351s : 1: renormalize.infer 0.40% : 0.000317s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000047s : 1: rewriter_after_opt_a 0.11% : 0.000089s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.000103s : 1: symbol_engine_optimizer 0.12% : 0.000094s : 1: tuple_transform 73.44% : 0.058629s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:06.630.46 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.106449, [21] [bootstrap]: 0.00052569 [type_inference]: 0.0558319 [event_method]: 2.312e-05 [auto_monad]: 6.869e-05 [graph_reusing]: 6.41e-06 [inline]: 2.19001e-06 [add_attr]: 0.00376834, [1] [add_attr_with_inline]: 0.0037583, [1] [Cycle 1]: 6.311e-05, [2] [tag_attr]: 2.014e-05 [meta_addattr_fg_expand]: 5.72999e-06 [parallel-infer-symbol]: 3.62002e-06 [pre_auto_parallel]: 3.69e-05 [insert-virtual-dataset]: 2.86999e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 2.38002e-06 [pipeline_split]: 1.60001e-06 [optimize]: 0.00504264, [53] [py_interpret_to_execute]: 2.729e-05 [rewriter_before_opt_a]: 8.348e-05 [opt_a]: 0.00289535, [2] [Cycle 1]: 0.00219598, [45] [expand_dump_flag]: 3.04001e-06 [switch_simplify]: 4.37e-05 [loop_unroll]: 3.03e-05 [a_1]: 0.00069226 [with_stream_mark]: 1.715e-05 [recompute_prepare]: 9.21002e-06 [updatestate_depend_eliminate]: 4.02e-06 [updatestate_assign_eliminate]: 3.13998e-06 [updatestate_loads_eliminate]: 2.86e-06 [parameter_eliminate]: 1.87001e-06 [a_2]: 8.698e-05 [accelerated_algorithm]: 7.8e-06 [shard]: 2.32999e-06 [meta_shard_fg_expand]: 1.79e-06 [shard_inline]: 6.61999e-06 [merge_send_recv]: 8.30999e-06 [auto_parallel]: 6.54001e-06 [parallel]: 1.863e-05 [flash_sp]: 7.80998e-06 [merge_comm]: 3.98001e-06 [allreduce_fusion]: 3.45003e-06 [matmul_add_comm_reduction]: 1.052e-05 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 7.77e-06 [virtual_dataset]: 7e-06 [get_grad_eliminate_]: 6.43e-06 [virtual_output]: 6.44999e-06 [merge_forward]: 4.54002e-06 [cell_reuse_recompute_pass]: 1.05001e-06 [offload_activation]: 1.009e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.245e-05 [merge_recompute_call_nodes]: 1.54998e-06 [before_grad]: 1.129e-05 [set_forward_comm_id_for_comm_node_pass]: 3.70998e-06 [meta_fg_expand]: 2.67001e-06 [flash_sp_send_recv_attached]: 2.54999e-06 [receive_attached]: 2.15002e-06 [after_resolve]: 1.223e-05 [a_after_grad]: 9.77001e-06 [renormalize]: 0.00076944 [add_forward_monad_depend]: 5.71998e-06 [auto_monad_grad]: 2.57001e-06 [auto_monad_eliminator]: 1.567e-05 [cse]: 3.089e-05 [a_3]: 4.907e-05 [Cycle 2]: 0.00068919, [45] [expand_dump_flag]: 1.40999e-06 [switch_simplify]: 7.50998e-06 [loop_unroll]: 6.54001e-06 [a_1]: 0.00014548 [with_stream_mark]: 1.261e-05 [recompute_prepare]: 7.25e-06 [updatestate_depend_eliminate]: 3.11999e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.40002e-06 [parameter_eliminate]: 1.49998e-06 [a_2]: 7.762e-05 [accelerated_algorithm]: 6.34999e-06 [shard]: 1.62001e-06 [meta_shard_fg_expand]: 1.94999e-06 [shard_inline]: 8.83001e-06 [merge_send_recv]: 5.79999e-06 [auto_parallel]: 7.03998e-06 [parallel]: 5.47999e-06 [flash_sp]: 3.81001e-06 [merge_comm]: 3.90998e-06 [allreduce_fusion]: 3.02002e-06 [matmul_add_comm_reduction]: 7.3e-06 [allreduce_slice_to_reducescatter]: 3.39991e-07 [virtual_shard_identity]: 6.70998e-06 [virtual_dataset]: 5.79999e-06 [get_grad_eliminate_]: 5.66e-06 [virtual_output]: 5.64998e-06 [merge_forward]: 3.21999e-06 [cell_reuse_recompute_pass]: 1.59998e-06 [offload_activation]: 8.15e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.158e-05 [merge_recompute_call_nodes]: 1.05001e-06 [before_grad]: 1.081e-05 [set_forward_comm_id_for_comm_node_pass]: 3.14001e-06 [meta_fg_expand]: 2.16998e-06 [flash_sp_send_recv_attached]: 1.30999e-06 [receive_attached]: 1.55001e-06 [after_resolve]: 1.074e-05 [a_after_grad]: 9.29998e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.75001e-06 [auto_monad_grad]: 1.70001e-06 [auto_monad_eliminator]: 8.17e-06 [cse]: 1.572e-05 [a_3]: 3.639e-05 [py_interpret_to_execute_after_opt_a]: 1.001e-05 [slice_cell_reuse_recomputed_activation]: 2.53003e-06 [rewriter_after_opt_a]: 3.497e-05 [convert_after_rewriter]: 6.73998e-06 [order_py_execute_after_rewriter]: 5.32001e-06 [mutable_eliminate]: 0.00059108 [opt_b]: 0.00020814, [1] [Cycle 1]: 0.000201, [7] [b_1]: 0.0001241 [b_2]: 8.25e-06 [updatestate_depend_eliminate]: 6.80002e-06 [updatestate_assign_eliminate]: 2.56e-06 [updatestate_loads_eliminate]: 2.37001e-06 [renormalize]: 7.30011e-07 [cse]: 2.079e-05 [optimize_parallel_all_gather_comm]: 1.8e-05 [overlap_param_gather]: 2.01e-06 [cconv]: 2.912e-05 [loop_unroll]: 0.00045602 [opt_after_cconv]: 0.0001067, [1] [Cycle 1]: 0.00010111, [7] [c_1]: 3.187e-05 [parameter_eliminate]: 4.15e-06 [updatestate_depend_eliminate]: 5.78997e-06 [updatestate_assign_eliminate]: 2.88998e-06 [updatestate_loads_eliminate]: 2.36e-06 [cse]: 1.891e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.334e-05 [tuple_transform]: 7.756e-05, [1] [Cycle 1]: 7.309e-05, [4] [d_1]: 4.52e-05 [none_parameter_eliminate]: 1.54998e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 7.14001e-06 [partial_unused_args_eliminate]: 2.04e-06 [add_recomputation]: 4.947e-05 [cse_after_recomputation]: 2.08e-05, [1] [Cycle 1]: 1.607e-05, [1] [cse]: 1.055e-05 [environ_conv]: 5.29e-06 [swap_dp_allreduce_reducescatter]: 4.67e-06 [bias_add_comm_swap]: 2.66999e-06 [label_micro_interleaved_index]: 4.82998e-06 [label_fine_grained_interleaved_index]: 3.33e-06 [merge_cast_opt]: 1.62999e-06 [slice_recompute_activation]: 1.94999e-06 [micro_interleaved_order_control]: 2.29001e-06 [assign_add_opt]: 1.32e-06 [ForceFp32Comm]: 1.16002e-06 [remove_cast_before_assign_add]: 1.30999e-06 [full_micro_interleaved_order_control]: 2.09e-06 [reorder_send_recv_between_fp_bp]: 2.77002e-06 [comm_op_add_attrs]: 1.22e-06 [add_comm_op_reuse_tag]: 1.06002e-06 [interleave_split_concat_branches]: 1.12999e-06 [interleave_parallel_branches]: 1.12e-06 [overlap_opt_shard_in_pipeline]: 1.35999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.96998e-06 [control_data_broadcast_order]: 1.334e-05 [grouped_pairwise_exchange_alltoall]: 1.89999e-06 [offloading_packed_experts]: 3.99002e-06 [overlap_recompute_and_grad_model_parallel]: 4.97e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22e-06 [overlap_recompute_allgather_and_fa_grad]: 1.47999e-06 [overlap_recompute_comm]: 2.01e-06 [overlap_grad_ring_attention]: 3.91999e-06 [overlap_grad_flash_sp]: 2.016e-05 [begin_end_overlap_inline]: 4.69998e-07 [split_matmul_comm_elemetwise]: 2.49001e-06 [split_layernorm_comm]: 1.61002e-06 [handle_group_info]: 8.99978e-07 [symbol_engine_optimizer]: 7.736e-05, [1] [Cycle 1]: 7.308e-05, [6] [build]: 3.56999e-06 [elim_shapecalc]: 1.034e-05 [elim_not_effective]: 1.327e-05 [opt_reshape]: 7.09001e-06 [fold_const_symbol]: 1.002e-05 [renormalize]: 3.7998e-07 [detach_backward]: 2.14999e-06 [pipeline_parallel_scheduler]: 2.07001e-06 [auto_monad_reorder]: 1.744e-05 [get_jit_bprop_graph]: 1.83002e-06 [rewriter_after_jit_bprop_graph]: 4.84003e-06 [opt_after_jit_grad]: 0.0408645 [validate]: 5.598e-05 Sums bootstrap : 0.000526s : 0.52% type_inference : 0.055832s : 54.91% event_method : 0.000023s : 0.02% auto_monad : 0.000069s : 0.07% graph_reusing : 0.000006s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000037s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000027s : 0.03% optimize.rewriter_before_opt_a : 0.000083s : 0.08% optimize.opt_a.expand_dump_flag : 0.000004s : 0.00% optimize.opt_a.switch_simplify : 0.000051s : 0.05% optimize.opt_a.loop_unroll : 0.000037s : 0.04% optimize.opt_a.a_1 : 0.000838s : 0.82% optimize.opt_a.with_stream_mark : 0.000030s : 0.03% optimize.opt_a.recompute_prepare : 0.000016s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000165s : 0.16% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000015s : 0.02% optimize.opt_a.merge_send_recv : 0.000014s : 0.01% optimize.opt_a.auto_parallel : 0.000014s : 0.01% optimize.opt_a.parallel : 0.000024s : 0.02% optimize.opt_a.flash_sp : 0.000012s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000006s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000012s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000018s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000023s : 0.02% optimize.opt_a.a_after_grad : 0.000019s : 0.02% optimize.opt_a.renormalize : 0.000770s : 0.76% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.02% optimize.opt_a.cse : 0.000047s : 0.05% optimize.opt_a.a_3 : 0.000085s : 0.08% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000035s : 0.03% optimize.convert_after_rewriter : 0.000007s : 0.01% optimize.order_py_execute_after_rewriter : 0.000005s : 0.01% optimize.mutable_eliminate : 0.000591s : 0.58% optimize.opt_b.b_1 : 0.000124s : 0.12% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000029s : 0.03% optimize.loop_unroll : 0.000456s : 0.45% optimize.opt_after_cconv.c_1 : 0.000032s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.01% optimize.tuple_transform.d_1 : 0.000045s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000049s : 0.05% optimize.cse_after_recomputation.cse : 0.000011s : 0.01% optimize.environ_conv : 0.000005s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000020s : 0.02% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000017s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.040864s : 40.19% validate : 0.000056s : 0.06% Time group info: ------[substitution.] 0.000257 34 12.74% : 0.000033s : 6: substitution.arithmetic_simplify 0.72% : 0.000002s : 2: substitution.elim_not_effective 0.52% : 0.000001s : 2: substitution.fold_const_symbol 2.54% : 0.000007s : 4: substitution.graph_param_transform 71.92% : 0.000185s : 4: substitution.inline 1.85% : 0.000005s : 4: substitution.j_node_and_user_rematch 1.99% : 0.000005s : 4: substitution.remove_not_recompute_node 1.71% : 0.000004s : 4: substitution.replace_old_param 6.00% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.055760 2 98.49% : 0.054916s : 1: type_inference.infer 1.51% : 0.000844s : 1: type_inference.specialize ------[replace.] 0.000064 8 64.58% : 0.000041s : 4: replace.inline 35.42% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000195 8 93.01% : 0.000182s : 4: match.inline 6.99% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000209 1278 0.94% : 0.000002s : 13: predicate.accumulaten_eliminater 2.12% : 0.000004s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 0.97% : 0.000002s : 13: predicate.addn_zero_filter 0.78% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.48% : 0.000005s : 21: predicate.arithmetic_simplify 0.87% : 0.000002s : 13: predicate.cast_eliminate 0.56% : 0.000001s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.58% : 0.000001s : 8: predicate.depend_value_elim 0.85% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.99% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.98% : 0.000004s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.38% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.12% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.11% : 0.000002s : 17: predicate.environ_get_depend_swap 1.83% : 0.000004s : 25: predicate.environ_get_eliminate 1.11% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.42% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.43% : 0.000005s : 21: predicate.float_depend_g_call 0.49% : 0.000001s : 8: predicate.float_environ_get_switch 0.80% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.64% : 0.000001s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.59% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 6.32% : 0.000013s : 58: predicate.inline 0.84% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 8: predicate.less_batch_normalization 1.75% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.47% : 0.000005s : 38: predicate.load_eliminater 1.01% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.26% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.55% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.56% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 13: predicate.minmaximum_grad 1.14% : 0.000002s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.32% : 0.000001s : 4: predicate.parallel_virtual_node 1.84% : 0.000004s : 21: predicate.partial_defer_inline 1.59% : 0.000003s : 21: predicate.partial_eliminate 0.86% : 0.000002s : 13: predicate.print_const_string_wrapper 0.60% : 0.000001s : 8: predicate.reduce_all_const_elim 1.06% : 0.000002s : 13: predicate.reduce_eliminate 2.47% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 8: predicate.remove_not_recompute_node 1.38% : 0.000003s : 25: predicate.replace_applicator 0.48% : 0.000001s : 8: predicate.replace_old_param 0.36% : 0.000001s : 4: predicate.reset_defer_inline 0.91% : 0.000002s : 13: predicate.reshape_eliminate 0.56% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 4: predicate.row_tensor_eliminate 0.71% : 0.000001s : 8: predicate.same_eliminate 0.51% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 8: predicate.shard_identity_eliminate 0.76% : 0.000002s : 8: predicate.special_op_eliminate 0.66% : 0.000001s : 8: predicate.specialize_transform 0.76% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.66% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.48% : 0.000003s : 21: predicate.switch_defer_inline 1.96% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.14% : 0.000011s : 67: predicate.switch_simplify 0.88% : 0.000002s : 13: predicate.tile_eliminate 0.94% : 0.000002s : 13: predicate.transpose_eliminate 1.61% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.35% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.15% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.40% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.10% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.71% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.47% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.99% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.60% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.37% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.50% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000694 11 60.03% : 0.000417s : 5: func_graph_cloner_run.FuncGraphClonerGraph 39.97% : 0.000278s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.117450 192 0.00% : 0.000004s : 1: ForceFp32Comm 3.21% : 0.003774s : 1: add_attr 3.20% : 0.003762s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.05% : 0.000053s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.06% : 0.000074s : 1: auto_monad 0.02% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.48% : 0.000560s : 1: bootstrap 0.03% : 0.000033s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000017s : 1: control_data_broadcast_order 0.01% : 0.000010s : 1: convert_after_rewriter 0.02% : 0.000024s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.03% : 0.000030s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.40% : 0.000465s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.51% : 0.000601s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000016s : 1: opt.transform.mutable_eliminate 1.08% : 0.001271s : 78: opt.transform.opt_a 0.03% : 0.000030s : 1: opt.transform.opt_after_cconv 0.04% : 0.000051s : 1: opt.transform.opt_after_jit_grad 0.09% : 0.000101s : 28: opt.transform.opt_b 0.04% : 0.000050s : 2: opt.transform.opt_trans_graph 0.03% : 0.000037s : 4: opt.transform.symbol_engine_opt 2.47% : 0.002899s : 1: opt_a 0.09% : 0.000111s : 1: opt_after_cconv 34.81% : 0.040888s : 1: opt_after_jit_grad 0.18% : 0.000212s : 1: opt_b 4.30% : 0.005048s : 1: optimize 0.02% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.02% : 0.000024s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.04% : 0.000041s : 1: pre_auto_parallel 0.03% : 0.000031s : 1: py_interpret_to_execute 0.01% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000017s : 1: remove_dup_value 0.37% : 0.000434s : 1: renormalize.infer 0.28% : 0.000328s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000039s : 1: rewriter_after_opt_a 0.07% : 0.000087s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000007s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000080s : 1: symbol_engine_optimizer 0.07% : 0.000080s : 1: tuple_transform 47.56% : 0.055854s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:06.995.932 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:06.996.211 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0169355, [21] [bootstrap]: 0.00042258 [type_inference]: 0.00587102 [event_method]: 1.871e-05 [auto_monad]: 6.15e-05 [graph_reusing]: 6.22001e-06 [inline]: 2.21998e-06 [add_attr]: 0.0037352, [1] [add_attr_with_inline]: 0.00372453, [1] [Cycle 1]: 7.801e-05, [2] [tag_attr]: 2.05e-05 [meta_addattr_fg_expand]: 5.78997e-06 [parallel-infer-symbol]: 3.28e-06 [pre_auto_parallel]: 3.567e-05 [insert-virtual-dataset]: 2.65002e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 2.17001e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00559901, [53] [py_interpret_to_execute]: 3.04e-05 [rewriter_before_opt_a]: 9.149e-05 [opt_a]: 0.00319687, [2] [Cycle 1]: 0.00234507, [45] [expand_dump_flag]: 3.16999e-06 [switch_simplify]: 4.143e-05 [loop_unroll]: 3.018e-05 [a_1]: 0.00066649 [with_stream_mark]: 1.946e-05 [recompute_prepare]: 8.97e-06 [updatestate_depend_eliminate]: 5.01002e-06 [updatestate_assign_eliminate]: 3.81999e-06 [updatestate_loads_eliminate]: 2.99999e-06 [parameter_eliminate]: 2.77002e-06 [a_2]: 0.00011628 [accelerated_algorithm]: 8.27998e-06 [shard]: 1.74998e-06 [meta_shard_fg_expand]: 2.21e-06 [shard_inline]: 6.86001e-06 [merge_send_recv]: 9.81e-06 [auto_parallel]: 7.82e-06 [parallel]: 1.979e-05 [flash_sp]: 8.33999e-06 [merge_comm]: 3.68e-06 [allreduce_fusion]: 3.49001e-06 [matmul_add_comm_reduction]: 1.01e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 9.05001e-06 [virtual_dataset]: 6.80002e-06 [get_grad_eliminate_]: 6.42001e-06 [virtual_output]: 6.66999e-06 [merge_forward]: 4.14002e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 1.002e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.493e-05 [merge_recompute_call_nodes]: 1.42999e-06 [before_grad]: 1.074e-05 [set_forward_comm_id_for_comm_node_pass]: 4.00998e-06 [meta_fg_expand]: 2.90002e-06 [flash_sp_send_recv_attached]: 2.94999e-06 [receive_attached]: 2.17999e-06 [after_resolve]: 1.273e-05 [a_after_grad]: 1.028e-05 [renormalize]: 0.00072994 [add_forward_monad_depend]: 5.97001e-06 [auto_monad_grad]: 2.44001e-06 [auto_monad_eliminator]: 1.658e-05 [cse]: 2.551e-05 [a_3]: 6.179e-05 [Cycle 2]: 0.00083752, [45] [expand_dump_flag]: 1.48002e-06 [switch_simplify]: 7.41001e-06 [loop_unroll]: 6.29999e-06 [a_1]: 0.00014398 [with_stream_mark]: 1.226e-05 [recompute_prepare]: 6.59999e-06 [updatestate_depend_eliminate]: 3.01999e-06 [updatestate_assign_eliminate]: 2.71e-06 [updatestate_loads_eliminate]: 2.18002e-06 [parameter_eliminate]: 1.47001e-06 [a_2]: 0.00010448 [accelerated_algorithm]: 6.26998e-06 [shard]: 1.05001e-06 [meta_shard_fg_expand]: 1.54e-06 [shard_inline]: 5.97001e-06 [merge_send_recv]: 5.81e-06 [auto_parallel]: 6.18998e-06 [parallel]: 5.87001e-06 [flash_sp]: 3.32002e-06 [merge_comm]: 3.5e-06 [allreduce_fusion]: 3.52997e-06 [matmul_add_comm_reduction]: 6.17001e-06 [allreduce_slice_to_reducescatter]: 3.20026e-07 [virtual_shard_identity]: 6.88e-06 [virtual_dataset]: 5.97999e-06 [get_grad_eliminate_]: 5.67999e-06 [virtual_output]: 5.74e-06 [merge_forward]: 2.69001e-06 [cell_reuse_recompute_pass]: 1.55999e-06 [offload_activation]: 6.84001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.355e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 9.52999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.17002e-06 [meta_fg_expand]: 2.06e-06 [flash_sp_send_recv_attached]: 7.89994e-07 [receive_attached]: 1.20999e-06 [after_resolve]: 1.106e-05 [a_after_grad]: 9.71e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.34998e-06 [auto_monad_grad]: 9.79984e-07 [auto_monad_eliminator]: 8.27e-06 [cse]: 1.363e-05 [a_3]: 4.788e-05 [py_interpret_to_execute_after_opt_a]: 1.372e-05 [slice_cell_reuse_recomputed_activation]: 4.72e-06 [rewriter_after_opt_a]: 3.963e-05 [convert_after_rewriter]: 1.03e-05 [order_py_execute_after_rewriter]: 8.82999e-06 [mutable_eliminate]: 0.00058247 [opt_b]: 0.00027525, [1] [Cycle 1]: 0.00026542, [7] [b_1]: 0.00016768 [b_2]: 7.86001e-06 [updatestate_depend_eliminate]: 7.8e-06 [updatestate_assign_eliminate]: 2.49001e-06 [updatestate_loads_eliminate]: 2.89999e-06 [renormalize]: 9.70002e-07 [cse]: 2.005e-05 [optimize_parallel_all_gather_comm]: 2.129e-05 [overlap_param_gather]: 5.09e-06 [cconv]: 3.176e-05 [loop_unroll]: 0.00045672 [opt_after_cconv]: 0.00012702, [1] [Cycle 1]: 0.00011859, [7] [c_1]: 3.14e-05 [parameter_eliminate]: 4.15999e-06 [updatestate_depend_eliminate]: 5.87999e-06 [updatestate_assign_eliminate]: 2.43998e-06 [updatestate_loads_eliminate]: 2.28998e-06 [cse]: 1.652e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 2.977e-05 [tuple_transform]: 9.569e-05, [1] [Cycle 1]: 8.833e-05, [4] [d_1]: 4.74e-05 [none_parameter_eliminate]: 1.99999e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 7.22002e-06 [partial_unused_args_eliminate]: 4.49002e-06 [add_recomputation]: 5.323e-05 [cse_after_recomputation]: 2.692e-05, [1] [Cycle 1]: 1.99e-05, [1] [cse]: 1.101e-05 [environ_conv]: 8.23001e-06 [swap_dp_allreduce_reducescatter]: 7.88999e-06 [bias_add_comm_swap]: 5.57001e-06 [label_micro_interleaved_index]: 8.44002e-06 [label_fine_grained_interleaved_index]: 5.12e-06 [merge_cast_opt]: 3.98001e-06 [slice_recompute_activation]: 4.42e-06 [micro_interleaved_order_control]: 4.80001e-06 [assign_add_opt]: 3.72002e-06 [ForceFp32Comm]: 3.09001e-06 [remove_cast_before_assign_add]: 3.41001e-06 [full_micro_interleaved_order_control]: 4.54998e-06 [reorder_send_recv_between_fp_bp]: 5.16002e-06 [comm_op_add_attrs]: 3.36999e-06 [add_comm_op_reuse_tag]: 3.44001e-06 [interleave_split_concat_branches]: 3.51001e-06 [interleave_parallel_branches]: 3.92998e-06 [overlap_opt_shard_in_pipeline]: 3.53e-06 [overlap_opt_shard_grad_in_pipeline]: 4.70999e-06 [control_data_broadcast_order]: 1.643e-05 [grouped_pairwise_exchange_alltoall]: 4.51002e-06 [offloading_packed_experts]: 6.49001e-06 [overlap_recompute_and_grad_model_parallel]: 7.58001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.64002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.86999e-06 [overlap_recompute_comm]: 5.19e-06 [overlap_grad_ring_attention]: 6.64001e-06 [overlap_grad_flash_sp]: 2.382e-05 [begin_end_overlap_inline]: 3.07002e-06 [split_matmul_comm_elemetwise]: 4.45999e-06 [split_layernorm_comm]: 3.90998e-06 [handle_group_info]: 3.77002e-06 [symbol_engine_optimizer]: 9.799e-05, [1] [Cycle 1]: 9.105e-05, [6] [build]: 3.14999e-06 [elim_shapecalc]: 1.066e-05 [elim_not_effective]: 1.341e-05 [opt_reshape]: 7.35e-06 [fold_const_symbol]: 1.039e-05 [renormalize]: 2.20025e-07 [detach_backward]: 4e-06 [pipeline_parallel_scheduler]: 2.17001e-06 [auto_monad_reorder]: 1.924e-05 [get_jit_bprop_graph]: 1.86e-06 [rewriter_after_jit_bprop_graph]: 5.54e-06 [opt_after_jit_grad]: 0.00051418 [validate]: 4.089e-05 Sums bootstrap : 0.000423s : 3.70% type_inference : 0.005871s : 51.37% event_method : 0.000019s : 0.16% auto_monad : 0.000062s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000036s : 0.31% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.27% optimize.rewriter_before_opt_a : 0.000091s : 0.80% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000049s : 0.43% optimize.opt_a.loop_unroll : 0.000036s : 0.32% optimize.opt_a.a_1 : 0.000810s : 7.09% optimize.opt_a.with_stream_mark : 0.000032s : 0.28% optimize.opt_a.recompute_prepare : 0.000016s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000221s : 1.93% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.13% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.11% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000026s : 0.22% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000007s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.14% optimize.opt_a.virtual_dataset : 0.000013s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.21% optimize.opt_a.a_after_grad : 0.000020s : 0.17% optimize.opt_a.renormalize : 0.000730s : 6.39% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.22% optimize.opt_a.cse : 0.000039s : 0.34% optimize.opt_a.a_3 : 0.000110s : 0.96% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000040s : 0.35% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000582s : 5.10% optimize.opt_b.b_1 : 0.000168s : 1.47% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000020s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000032s : 0.28% optimize.loop_unroll : 0.000457s : 4.00% optimize.opt_after_cconv.c_1 : 0.000031s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.14% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000030s : 0.26% optimize.tuple_transform.d_1 : 0.000047s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000053s : 0.47% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000008s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000016s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.04% optimize.offloading_packed_experts : 0.000006s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000024s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000019s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000514s : 4.50% validate : 0.000041s : 0.36% Time group info: ------[substitution.] 0.000227 34 15.03% : 0.000034s : 6: substitution.arithmetic_simplify 0.80% : 0.000002s : 2: substitution.elim_not_effective 0.63% : 0.000001s : 2: substitution.fold_const_symbol 2.69% : 0.000006s : 4: substitution.graph_param_transform 68.20% : 0.000155s : 4: substitution.inline 1.65% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.36% : 0.000005s : 4: substitution.remove_not_recompute_node 2.14% : 0.000005s : 4: substitution.replace_old_param 6.50% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005816 2 88.08% : 0.005123s : 1: type_inference.infer 11.92% : 0.000693s : 1: type_inference.specialize ------[replace.] 0.000064 8 63.77% : 0.000041s : 4: replace.inline 36.23% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000165 8 92.12% : 0.000152s : 4: match.inline 7.88% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000210 1278 0.90% : 0.000002s : 13: predicate.accumulaten_eliminater 0.69% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 8: predicate.addn_check_dump 1.01% : 0.000002s : 13: predicate.addn_zero_filter 0.81% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.38% : 0.000005s : 21: predicate.arithmetic_simplify 0.99% : 0.000002s : 13: predicate.cast_eliminate 0.60% : 0.000001s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.20% : 0.000000s : 4: predicate.const_output_eliminate 0.53% : 0.000001s : 8: predicate.depend_value_elim 0.93% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.99% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.06% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.55% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.31% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.15% : 0.000002s : 17: predicate.environ_get_depend_swap 1.64% : 0.000003s : 25: predicate.environ_get_eliminate 1.00% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.47% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.57% : 0.000005s : 21: predicate.float_depend_g_call 0.51% : 0.000001s : 8: predicate.float_environ_get_switch 0.78% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.67% : 0.000001s : 8: predicate.get_grad_eliminate 0.32% : 0.000001s : 4: predicate.graph_param_transform 0.59% : 0.000001s : 8: predicate.incorporate_call 0.49% : 0.000001s : 8: predicate.incorporate_call_switch 6.60% : 0.000014s : 58: predicate.inline 0.70% : 0.000001s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.82% : 0.000002s : 8: predicate.less_batch_normalization 1.83% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.40% : 0.000005s : 38: predicate.load_eliminater 1.22% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.35% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.61% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 8: predicate.merge_addn 0.61% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 13: predicate.minmaximum_grad 1.49% : 0.000003s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 1.74% : 0.000004s : 21: predicate.partial_defer_inline 1.60% : 0.000003s : 21: predicate.partial_eliminate 0.93% : 0.000002s : 13: predicate.print_const_string_wrapper 0.57% : 0.000001s : 8: predicate.reduce_all_const_elim 1.25% : 0.000003s : 13: predicate.reduce_eliminate 2.37% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 8: predicate.remove_not_recompute_node 1.45% : 0.000003s : 25: predicate.replace_applicator 0.52% : 0.000001s : 8: predicate.replace_old_param 0.38% : 0.000001s : 4: predicate.reset_defer_inline 0.94% : 0.000002s : 13: predicate.reshape_eliminate 0.65% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.81% : 0.000002s : 8: predicate.same_eliminate 0.40% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 8: predicate.shard_identity_eliminate 0.76% : 0.000002s : 8: predicate.special_op_eliminate 0.80% : 0.000002s : 8: predicate.specialize_transform 0.75% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.86% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.51% : 0.000003s : 21: predicate.switch_defer_inline 2.05% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.99% : 0.000010s : 67: predicate.switch_simplify 0.83% : 0.000002s : 13: predicate.tile_eliminate 0.96% : 0.000002s : 13: predicate.transpose_eliminate 1.47% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.37% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.40% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.59% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.06% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.65% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.36% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.98% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 4: predicate.value_based_eliminate 0.60% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.58% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.60% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000528 11 54.30% : 0.000287s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.70% : 0.000241s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028390 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.19% : 0.003745s : 1: add_attr 13.13% : 0.003728s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.20% : 0.000057s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000071s : 1: auto_monad 0.10% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.64% : 0.000466s : 1: bootstrap 0.12% : 0.000035s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.11% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000019s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.10% : 0.000030s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000014s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.63% : 0.000463s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.07% : 0.000589s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 4.38% : 0.001244s : 78: opt.transform.opt_a 0.11% : 0.000030s : 1: opt.transform.opt_after_cconv 0.09% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.36% : 0.000104s : 28: opt.transform.opt_b 0.18% : 0.000052s : 2: opt.transform.opt_trans_graph 0.13% : 0.000038s : 4: opt.transform.symbol_engine_opt 11.27% : 0.003200s : 1: opt_a 0.46% : 0.000130s : 1: opt_after_cconv 1.85% : 0.000526s : 1: opt_after_jit_grad 0.98% : 0.000279s : 1: opt_b 20.90% : 0.005933s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.09% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000043s : 1: pre_auto_parallel 0.12% : 0.000034s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.12% : 0.000034s : 1: remove_dup_value 1.41% : 0.000400s : 1: renormalize.infer 1.13% : 0.000321s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000043s : 1: rewriter_after_opt_a 0.34% : 0.000095s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000101s : 1: symbol_engine_optimizer 0.35% : 0.000099s : 1: tuple_transform 20.82% : 0.005911s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:08.187.205 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0151311, [21] [bootstrap]: 0.00043838 [type_inference]: 0.00572485 [event_method]: 1.846e-05 [auto_monad]: 6.427e-05 [graph_reusing]: 5.89e-06 [inline]: 2.17001e-06 [add_attr]: 0.00326537, [1] [add_attr_with_inline]: 0.00325633, [1] [Cycle 1]: 6.019e-05, [2] [tag_attr]: 1.85e-05 [meta_addattr_fg_expand]: 5.46e-06 [parallel-infer-symbol]: 3.66001e-06 [pre_auto_parallel]: 3.471e-05 [insert-virtual-dataset]: 2.78e-06 [parallel-infer-symbol-second]: 9.49978e-07 [dataset_repeat_opt]: 2.04999e-06 [pipeline_split]: 1.74e-06 [optimize]: 0.00486465, [53] [py_interpret_to_execute]: 2.612e-05 [rewriter_before_opt_a]: 8.1e-05 [opt_a]: 0.00285731, [2] [Cycle 1]: 0.00209733, [45] [expand_dump_flag]: 2.93e-06 [switch_simplify]: 4.319e-05 [loop_unroll]: 2.973e-05 [a_1]: 0.00063994 [with_stream_mark]: 1.653e-05 [recompute_prepare]: 9.14e-06 [updatestate_depend_eliminate]: 3.95e-06 [updatestate_assign_eliminate]: 3.48e-06 [updatestate_loads_eliminate]: 2.81999e-06 [parameter_eliminate]: 1.86e-06 [a_2]: 8.697e-05 [accelerated_algorithm]: 7.35e-06 [shard]: 1.71e-06 [meta_shard_fg_expand]: 2.09e-06 [shard_inline]: 6.52001e-06 [merge_send_recv]: 9.08002e-06 [auto_parallel]: 6.44001e-06 [parallel]: 1.862e-05 [flash_sp]: 8.37998e-06 [merge_comm]: 3.64002e-06 [allreduce_fusion]: 3.51001e-06 [matmul_add_comm_reduction]: 9.09e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 7.51001e-06 [virtual_dataset]: 6.51e-06 [get_grad_eliminate_]: 6.57002e-06 [virtual_output]: 6.18002e-06 [merge_forward]: 3.85998e-06 [cell_reuse_recompute_pass]: 1.12e-06 [offload_activation]: 1.045e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.24e-05 [merge_recompute_call_nodes]: 1.55001e-06 [before_grad]: 1.085e-05 [set_forward_comm_id_for_comm_node_pass]: 3.89002e-06 [meta_fg_expand]: 2.63e-06 [flash_sp_send_recv_attached]: 2.71999e-06 [receive_attached]: 2.29001e-06 [after_resolve]: 1.114e-05 [a_after_grad]: 1.006e-05 [renormalize]: 0.00072997 [add_forward_monad_depend]: 5.59998e-06 [auto_monad_grad]: 2.36e-06 [auto_monad_eliminator]: 1.53e-05 [cse]: 3.026e-05 [a_3]: 5.132e-05 [Cycle 2]: 0.00074951, [45] [expand_dump_flag]: 1.52999e-06 [switch_simplify]: 8.37e-06 [loop_unroll]: 6.68e-06 [a_1]: 0.00014146 [with_stream_mark]: 1.299e-05 [recompute_prepare]: 6.79001e-06 [updatestate_depend_eliminate]: 2.94001e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.52001e-06 [parameter_eliminate]: 1.35001e-06 [a_2]: 0.0001416 [accelerated_algorithm]: 7.75e-06 [shard]: 1.10999e-06 [meta_shard_fg_expand]: 1.55999e-06 [shard_inline]: 6.91999e-06 [merge_send_recv]: 5.46e-06 [auto_parallel]: 6.41998e-06 [parallel]: 4.84e-06 [flash_sp]: 3.50998e-06 [merge_comm]: 3.59002e-06 [allreduce_fusion]: 3.35998e-06 [matmul_add_comm_reduction]: 6.56e-06 [allreduce_slice_to_reducescatter]: 3.89991e-07 [virtual_shard_identity]: 1.062e-05 [virtual_dataset]: 6.12001e-06 [get_grad_eliminate_]: 5.72999e-06 [virtual_output]: 5.65001e-06 [merge_forward]: 3.78999e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 6.79001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.17e-05 [merge_recompute_call_nodes]: 9.29984e-07 [before_grad]: 9.87999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.6e-06 [meta_fg_expand]: 2.11e-06 [flash_sp_send_recv_attached]: 1.25999e-06 [receive_attached]: 1.30999e-06 [after_resolve]: 1.067e-05 [a_after_grad]: 9.20999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.25002e-06 [auto_monad_grad]: 1.06002e-06 [auto_monad_eliminator]: 7.64002e-06 [cse]: 1.529e-05 [a_3]: 3.704e-05 [py_interpret_to_execute_after_opt_a]: 1.022e-05 [slice_cell_reuse_recomputed_activation]: 1.96003e-06 [rewriter_after_opt_a]: 3.347e-05 [convert_after_rewriter]: 6.78e-06 [order_py_execute_after_rewriter]: 5.38002e-06 [mutable_eliminate]: 0.00053026 [opt_b]: 0.00020248, [1] [Cycle 1]: 0.00019619, [7] [b_1]: 0.00012443 [b_2]: 8.47998e-06 [updatestate_depend_eliminate]: 5.67001e-06 [updatestate_assign_eliminate]: 2.39001e-06 [updatestate_loads_eliminate]: 2.98e-06 [renormalize]: 5.3001e-07 [cse]: 1.748e-05 [optimize_parallel_all_gather_comm]: 1.634e-05 [overlap_param_gather]: 1.97999e-06 [cconv]: 2.568e-05 [loop_unroll]: 0.00041832 [opt_after_cconv]: 9.961e-05, [1] [Cycle 1]: 9.412e-05, [7] [c_1]: 3.051e-05 [parameter_eliminate]: 2.79999e-06 [updatestate_depend_eliminate]: 5.34e-06 [updatestate_assign_eliminate]: 2.53e-06 [updatestate_loads_eliminate]: 2.17999e-06 [cse]: 1.625e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 1.265e-05 [tuple_transform]: 7.47e-05, [1] [Cycle 1]: 7.046e-05, [4] [d_1]: 4.418e-05 [none_parameter_eliminate]: 1.55001e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 6.79001e-06 [partial_unused_args_eliminate]: 1.68002e-06 [add_recomputation]: 5.075e-05 [cse_after_recomputation]: 2.081e-05, [1] [Cycle 1]: 1.621e-05, [1] [cse]: 1.084e-05 [environ_conv]: 4.67e-06 [swap_dp_allreduce_reducescatter]: 5.50001e-06 [bias_add_comm_swap]: 2.93e-06 [label_micro_interleaved_index]: 4.89998e-06 [label_fine_grained_interleaved_index]: 3.00002e-06 [merge_cast_opt]: 1.33002e-06 [slice_recompute_activation]: 2.16e-06 [micro_interleaved_order_control]: 2.21e-06 [assign_add_opt]: 1.37e-06 [ForceFp32Comm]: 1.04e-06 [remove_cast_before_assign_add]: 1.02e-06 [full_micro_interleaved_order_control]: 2.32999e-06 [reorder_send_recv_between_fp_bp]: 2.65002e-06 [comm_op_add_attrs]: 1.05999e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.22999e-06 [interleave_parallel_branches]: 1.27999e-06 [overlap_opt_shard_in_pipeline]: 1.18001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.79998e-06 [control_data_broadcast_order]: 1.185e-05 [grouped_pairwise_exchange_alltoall]: 1.51002e-06 [offloading_packed_experts]: 3.86999e-06 [overlap_recompute_and_grad_model_parallel]: 4.98001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.52001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.38002e-06 [overlap_recompute_comm]: 2.58e-06 [overlap_grad_ring_attention]: 4.12998e-06 [overlap_grad_flash_sp]: 1.902e-05 [begin_end_overlap_inline]: 6.09987e-07 [split_matmul_comm_elemetwise]: 2.09999e-06 [split_layernorm_comm]: 1.91e-06 [handle_group_info]: 9.00007e-07 [symbol_engine_optimizer]: 7.297e-05, [1] [Cycle 1]: 6.904e-05, [6] [build]: 2.67001e-06 [elim_shapecalc]: 9.58002e-06 [elim_not_effective]: 1.253e-05 [opt_reshape]: 6.76e-06 [fold_const_symbol]: 9.66e-06 [renormalize]: 2.30008e-07 [detach_backward]: 1.82999e-06 [pipeline_parallel_scheduler]: 1.60999e-06 [auto_monad_reorder]: 1.615e-05 [get_jit_bprop_graph]: 1.69e-06 [rewriter_after_jit_bprop_graph]: 4e-06 [opt_after_jit_grad]: 0.00048379 [validate]: 3.916e-05 Sums bootstrap : 0.000438s : 4.02% type_inference : 0.005725s : 52.49% event_method : 0.000018s : 0.17% auto_monad : 0.000064s : 0.59% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000005s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000035s : 0.32% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000026s : 0.24% optimize.rewriter_before_opt_a : 0.000081s : 0.74% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000052s : 0.47% optimize.opt_a.loop_unroll : 0.000036s : 0.33% optimize.opt_a.a_1 : 0.000781s : 7.16% optimize.opt_a.with_stream_mark : 0.000030s : 0.27% optimize.opt_a.recompute_prepare : 0.000016s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000229s : 2.10% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.12% optimize.opt_a.merge_send_recv : 0.000015s : 0.13% optimize.opt_a.auto_parallel : 0.000013s : 0.12% optimize.opt_a.parallel : 0.000023s : 0.22% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.17% optimize.opt_a.virtual_dataset : 0.000013s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.22% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000022s : 0.20% optimize.opt_a.a_after_grad : 0.000019s : 0.18% optimize.opt_a.renormalize : 0.000730s : 6.69% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.21% optimize.opt_a.cse : 0.000046s : 0.42% optimize.opt_a.a_3 : 0.000088s : 0.81% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000033s : 0.31% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000530s : 4.86% optimize.opt_b.b_1 : 0.000124s : 1.14% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000017s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000026s : 0.24% optimize.loop_unroll : 0.000418s : 3.84% optimize.opt_after_cconv.c_1 : 0.000031s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000016s : 0.15% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.12% optimize.tuple_transform.d_1 : 0.000044s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000051s : 0.47% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000005s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000019s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000016s : 0.15% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000484s : 4.44% validate : 0.000039s : 0.36% Time group info: ------[substitution.] 0.000211 34 14.49% : 0.000031s : 6: substitution.arithmetic_simplify 0.94% : 0.000002s : 2: substitution.elim_not_effective 0.61% : 0.000001s : 2: substitution.fold_const_symbol 2.88% : 0.000006s : 4: substitution.graph_param_transform 67.55% : 0.000142s : 4: substitution.inline 1.93% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.25% : 0.000005s : 4: substitution.remove_not_recompute_node 2.20% : 0.000005s : 4: substitution.replace_old_param 7.15% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005660 2 87.84% : 0.004972s : 1: type_inference.infer 12.16% : 0.000688s : 1: type_inference.specialize ------[replace.] 0.000059 8 62.21% : 0.000037s : 4: replace.inline 37.79% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000153 8 91.40% : 0.000140s : 4: match.inline 8.60% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000264 1278 0.68% : 0.000002s : 13: predicate.accumulaten_eliminater 0.61% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.41% : 0.000001s : 8: predicate.addn_check_dump 0.97% : 0.000003s : 13: predicate.addn_zero_filter 0.61% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.78% : 0.000005s : 21: predicate.arithmetic_simplify 0.77% : 0.000002s : 13: predicate.cast_eliminate 0.55% : 0.000001s : 8: predicate.check_bprop_eliminate 0.38% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.45% : 0.000001s : 8: predicate.depend_value_elim 0.78% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.79% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.65% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.69% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.15% : 0.000000s : 4: predicate.elim_not_effective 0.26% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 0.85% : 0.000002s : 17: predicate.environ_add_const_eliminate 0.85% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.83% : 0.000002s : 17: predicate.environ_get_depend_swap 1.29% : 0.000003s : 25: predicate.environ_get_eliminate 0.87% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.13% : 0.000003s : 21: predicate.exchange_switch_depend_value 1.89% : 0.000005s : 21: predicate.float_depend_g_call 0.44% : 0.000001s : 8: predicate.float_environ_get_switch 0.57% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.66% : 0.000002s : 8: predicate.get_grad_eliminate 0.16% : 0.000000s : 4: predicate.graph_param_transform 0.44% : 0.000001s : 8: predicate.incorporate_call 0.37% : 0.000001s : 8: predicate.incorporate_call_switch 4.80% : 0.000013s : 58: predicate.inline 0.54% : 0.000001s : 8: predicate.inline_without_move 0.25% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.70% : 0.000002s : 8: predicate.less_batch_normalization 1.36% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.05% : 0.000005s : 38: predicate.load_eliminater 0.71% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.95% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.23% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.39% : 0.000001s : 8: predicate.merge_addn 0.41% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.45% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.61% : 0.000002s : 13: predicate.minmaximum_grad 0.70% : 0.000002s : 4: predicate.mutable_eliminate 0.27% : 0.000001s : 4: predicate.opt_reshape 0.26% : 0.000001s : 4: predicate.parallel_virtual_node 1.35% : 0.000004s : 21: predicate.partial_defer_inline 1.25% : 0.000003s : 21: predicate.partial_eliminate 0.82% : 0.000002s : 13: predicate.print_const_string_wrapper 23.39% : 0.000062s : 8: predicate.reduce_all_const_elim 0.91% : 0.000002s : 13: predicate.reduce_eliminate 1.97% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.31% : 0.000001s : 8: predicate.remove_not_recompute_node 1.13% : 0.000003s : 25: predicate.replace_applicator 0.40% : 0.000001s : 8: predicate.replace_old_param 0.23% : 0.000001s : 4: predicate.reset_defer_inline 0.71% : 0.000002s : 13: predicate.reshape_eliminate 0.52% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 4: predicate.row_tensor_eliminate 0.87% : 0.000002s : 8: predicate.same_eliminate 0.33% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.59% : 0.000002s : 8: predicate.shard_identity_eliminate 0.55% : 0.000001s : 8: predicate.special_op_eliminate 0.53% : 0.000001s : 8: predicate.specialize_transform 0.66% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.58% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.27% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.17% : 0.000003s : 21: predicate.switch_defer_inline 1.63% : 0.000004s : 29: predicate.switch_layer_defer_inline 3.97% : 0.000010s : 67: predicate.switch_simplify 0.67% : 0.000002s : 13: predicate.tile_eliminate 0.72% : 0.000002s : 13: predicate.transpose_eliminate 1.17% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.20% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.15% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.49% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.12% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 1.81% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.35% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 1.83% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.34% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.27% : 0.000001s : 4: predicate.value_based_eliminate 0.50% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.47% : 0.000001s : 8: predicate.virtual_output_eliminate 0.29% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.30% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000534 11 54.66% : 0.000292s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.34% : 0.000242s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025408 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.87% : 0.003271s : 1: add_attr 12.83% : 0.003260s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000055s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000069s : 1: auto_monad 0.08% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.84% : 0.000467s : 1: bootstrap 0.11% : 0.000029s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000015s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.09% : 0.000023s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000007s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.68% : 0.000426s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.12% : 0.000538s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 5.03% : 0.001277s : 78: opt.transform.opt_a 0.11% : 0.000029s : 1: opt.transform.opt_after_cconv 0.10% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000101s : 28: opt.transform.opt_b 0.19% : 0.000049s : 2: opt.transform.opt_trans_graph 0.14% : 0.000035s : 4: opt.transform.symbol_engine_opt 11.26% : 0.002860s : 1: opt_a 0.41% : 0.000103s : 1: opt_after_cconv 1.94% : 0.000493s : 1: opt_after_jit_grad 0.81% : 0.000206s : 1: opt_b 19.16% : 0.004869s : 1: optimize 0.08% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000022s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000005s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000039s : 1: pre_auto_parallel 0.12% : 0.000030s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000016s : 1: remove_dup_value 1.45% : 0.000368s : 1: renormalize.infer 1.39% : 0.000353s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000037s : 1: rewriter_after_opt_a 0.34% : 0.000086s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.30% : 0.000076s : 1: symbol_engine_optimizer 0.30% : 0.000077s : 1: tuple_transform 22.61% : 0.005744s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:09.391.517 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:09.391.828 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0626906, [21] [bootstrap]: 0.00045202 [type_inference]: 0.050228 [event_method]: 2.325e-05 [auto_monad]: 6.663e-05 [graph_reusing]: 6.26e-06 [inline]: 3.58999e-06 [add_attr]: 0.00360038, [1] [add_attr_with_inline]: 0.00358799, [1] [Cycle 1]: 9.527e-05, [2] [tag_attr]: 2.224e-05 [meta_addattr_fg_expand]: 6.02999e-06 [parallel-infer-symbol]: 3.41001e-06 [pre_auto_parallel]: 4.109e-05 [insert-virtual-dataset]: 2.62001e-06 [parallel-infer-symbol-second]: 8.89995e-07 [dataset_repeat_opt]: 2.07999e-06 [pipeline_split]: 2.13998e-06 [optimize]: 0.00676339, [53] [py_interpret_to_execute]: 3.327e-05 [rewriter_before_opt_a]: 9.625e-05 [opt_a]: 0.00406925, [2] [Cycle 1]: 0.00297437, [45] [expand_dump_flag]: 3.46001e-06 [switch_simplify]: 4.26e-05 [loop_unroll]: 3.049e-05 [a_1]: 0.00067611 [with_stream_mark]: 2.271e-05 [recompute_prepare]: 9.65002e-06 [updatestate_depend_eliminate]: 4.53001e-06 [updatestate_assign_eliminate]: 3.60998e-06 [updatestate_loads_eliminate]: 3.39001e-06 [parameter_eliminate]: 2.44001e-06 [a_2]: 0.00011721 [accelerated_algorithm]: 7.87e-06 [shard]: 2.29001e-06 [meta_shard_fg_expand]: 1.98002e-06 [shard_inline]: 6.96001e-06 [merge_send_recv]: 9.45001e-06 [auto_parallel]: 8.59e-06 [parallel]: 2.007e-05 [flash_sp]: 9.69e-06 [merge_comm]: 4.05998e-06 [allreduce_fusion]: 3.42002e-06 [matmul_add_comm_reduction]: 1.008e-05 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 8.65999e-06 [virtual_dataset]: 7.25998e-06 [get_grad_eliminate_]: 6.58e-06 [virtual_output]: 6.59001e-06 [merge_forward]: 3.88001e-06 [cell_reuse_recompute_pass]: 1.27999e-06 [offload_activation]: 1.062e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.644e-05 [merge_recompute_call_nodes]: 2.00002e-06 [before_grad]: 1.158e-05 [set_forward_comm_id_for_comm_node_pass]: 4.31002e-06 [meta_fg_expand]: 3.81001e-06 [flash_sp_send_recv_attached]: 3.13998e-06 [receive_attached]: 2.14999e-06 [after_resolve]: 1.226e-05 [a_after_grad]: 1.203e-05 [renormalize]: 0.00130871 [add_forward_monad_depend]: 9.47999e-06 [auto_monad_grad]: 2.22999e-06 [auto_monad_eliminator]: 2.209e-05 [cse]: 3.044e-05 [a_3]: 7.489e-05 [Cycle 2]: 0.00107556, [45] [expand_dump_flag]: 2.69001e-06 [switch_simplify]: 9.42999e-06 [loop_unroll]: 6.88e-06 [a_1]: 0.00016162 [with_stream_mark]: 2.058e-05 [recompute_prepare]: 7.13e-06 [updatestate_depend_eliminate]: 4.39998e-06 [updatestate_assign_eliminate]: 2.88998e-06 [updatestate_loads_eliminate]: 2.64999e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 0.00010966 [accelerated_algorithm]: 7.40003e-06 [shard]: 2.37001e-06 [meta_shard_fg_expand]: 2.58998e-06 [shard_inline]: 6.71999e-06 [merge_send_recv]: 8.14002e-06 [auto_parallel]: 8.74e-06 [parallel]: 5.489e-05 [flash_sp]: 4.42e-06 [merge_comm]: 4.52e-06 [allreduce_fusion]: 3.90998e-06 [matmul_add_comm_reduction]: 8.40001e-06 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 9.17001e-06 [virtual_dataset]: 6.28e-06 [get_grad_eliminate_]: 6.89999e-06 [virtual_output]: 6.61e-06 [merge_forward]: 5.71998e-06 [cell_reuse_recompute_pass]: 3.26001e-06 [offload_activation]: 1.263e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.987e-05 [merge_recompute_call_nodes]: 2.01e-06 [before_grad]: 1.553e-05 [set_forward_comm_id_for_comm_node_pass]: 4.23999e-06 [meta_fg_expand]: 3.61999e-06 [flash_sp_send_recv_attached]: 2.06e-06 [receive_attached]: 1.99999e-06 [after_resolve]: 1.584e-05 [a_after_grad]: 1.194e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 3.41001e-06 [auto_monad_grad]: 2.58e-06 [auto_monad_eliminator]: 1.326e-05 [cse]: 2.08e-05 [a_3]: 5.243e-05 [py_interpret_to_execute_after_opt_a]: 2.207e-05 [slice_cell_reuse_recomputed_activation]: 4.63001e-06 [rewriter_after_opt_a]: 4.505e-05 [convert_after_rewriter]: 1.031e-05 [order_py_execute_after_rewriter]: 8.37998e-06 [mutable_eliminate]: 0.00076449 [opt_b]: 0.000289, [1] [Cycle 1]: 0.00027722, [7] [b_1]: 0.000174 [b_2]: 9.00999e-06 [updatestate_depend_eliminate]: 8.60999e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 2.94999e-06 [renormalize]: 6.50005e-07 [cse]: 2.163e-05 [optimize_parallel_all_gather_comm]: 2.24e-05 [overlap_param_gather]: 5.20999e-06 [cconv]: 3.965e-05 [loop_unroll]: 0.00049405 [opt_after_cconv]: 0.00013813, [1] [Cycle 1]: 0.00012816, [7] [c_1]: 3.273e-05 [parameter_eliminate]: 6.21998e-06 [updatestate_depend_eliminate]: 6.16e-06 [updatestate_assign_eliminate]: 2.59999e-06 [updatestate_loads_eliminate]: 3.07002e-06 [cse]: 1.999e-05 [renormalize]: 5.3001e-07 [remove_dup_value]: 1.853e-05 [tuple_transform]: 9.831e-05, [1] [Cycle 1]: 9.073e-05, [4] [d_1]: 4.961e-05 [none_parameter_eliminate]: 1.86e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 7.83001e-06 [partial_unused_args_eliminate]: 4.38001e-06 [add_recomputation]: 5.668e-05 [cse_after_recomputation]: 2.886e-05, [1] [Cycle 1]: 2.176e-05, [1] [cse]: 1.25e-05 [environ_conv]: 8.40001e-06 [swap_dp_allreduce_reducescatter]: 7.62002e-06 [bias_add_comm_swap]: 6.57002e-06 [label_micro_interleaved_index]: 7.31001e-06 [label_fine_grained_interleaved_index]: 5.47001e-06 [merge_cast_opt]: 3.76001e-06 [slice_recompute_activation]: 4.4e-06 [micro_interleaved_order_control]: 4.77e-06 [assign_add_opt]: 3.71999e-06 [ForceFp32Comm]: 3.26001e-06 [remove_cast_before_assign_add]: 3.51999e-06 [full_micro_interleaved_order_control]: 4.79e-06 [reorder_send_recv_between_fp_bp]: 5.47999e-06 [comm_op_add_attrs]: 3.97e-06 [add_comm_op_reuse_tag]: 3.33e-06 [interleave_split_concat_branches]: 3.53999e-06 [interleave_parallel_branches]: 3.43999e-06 [overlap_opt_shard_in_pipeline]: 3.62002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.79998e-06 [control_data_broadcast_order]: 1.682e-05 [grouped_pairwise_exchange_alltoall]: 3.98999e-06 [offloading_packed_experts]: 6.61999e-06 [overlap_recompute_and_grad_model_parallel]: 7.4e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.93001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.71001e-06 [overlap_recompute_comm]: 4.84003e-06 [overlap_grad_ring_attention]: 6.61e-06 [overlap_grad_flash_sp]: 2.549e-05 [begin_end_overlap_inline]: 2.81e-06 [split_matmul_comm_elemetwise]: 4.3e-06 [split_layernorm_comm]: 4.12e-06 [handle_group_info]: 3.3e-06 [symbol_engine_optimizer]: 0.00010111, [1] [Cycle 1]: 9.414e-05, [6] [build]: 4.17998e-06 [elim_shapecalc]: 1.115e-05 [elim_not_effective]: 1.358e-05 [opt_reshape]: 7.11001e-06 [fold_const_symbol]: 1.064e-05 [renormalize]: 1.8999e-07 [detach_backward]: 4.10998e-06 [pipeline_parallel_scheduler]: 2.18998e-06 [auto_monad_reorder]: 1.995e-05 [get_jit_bprop_graph]: 2.36998e-06 [rewriter_after_jit_bprop_graph]: 6.73998e-06 [opt_after_jit_grad]: 0.00078018 [validate]: 4.614e-05 Sums bootstrap : 0.000452s : 0.79% type_inference : 0.050228s : 87.86% event_method : 0.000023s : 0.04% auto_monad : 0.000067s : 0.12% graph_reusing : 0.000006s : 0.01% inline : 0.000004s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000041s : 0.07% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.06% optimize.rewriter_before_opt_a : 0.000096s : 0.17% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000052s : 0.09% optimize.opt_a.loop_unroll : 0.000037s : 0.07% optimize.opt_a.a_1 : 0.000838s : 1.47% optimize.opt_a.with_stream_mark : 0.000043s : 0.08% optimize.opt_a.recompute_prepare : 0.000017s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000227s : 0.40% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.03% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.02% optimize.opt_a.merge_send_recv : 0.000018s : 0.03% optimize.opt_a.auto_parallel : 0.000017s : 0.03% optimize.opt_a.parallel : 0.000075s : 0.13% optimize.opt_a.flash_sp : 0.000014s : 0.02% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.03% optimize.opt_a.virtual_dataset : 0.000014s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.02% optimize.opt_a.virtual_output : 0.000013s : 0.02% optimize.opt_a.merge_forward : 0.000010s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000028s : 0.05% optimize.opt_a.a_after_grad : 0.000024s : 0.04% optimize.opt_a.renormalize : 0.001309s : 2.29% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.02% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.06% optimize.opt_a.cse : 0.000051s : 0.09% optimize.opt_a.a_3 : 0.000127s : 0.22% optimize.py_interpret_to_execute_after_opt_a : 0.000022s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000045s : 0.08% optimize.convert_after_rewriter : 0.000010s : 0.02% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000764s : 1.34% optimize.opt_b.b_1 : 0.000174s : 0.30% optimize.opt_b.b_2 : 0.000009s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.04% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000040s : 0.07% optimize.loop_unroll : 0.000494s : 0.86% optimize.opt_after_cconv.c_1 : 0.000033s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000020s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.03% optimize.tuple_transform.d_1 : 0.000050s : 0.09% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000004s : 0.01% optimize.add_recomputation : 0.000057s : 0.10% optimize.cse_after_recomputation.cse : 0.000013s : 0.02% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000007s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000004s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000025s : 0.04% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000780s : 1.36% validate : 0.000046s : 0.08% Time group info: ------[substitution.] 0.000244 34 16.07% : 0.000039s : 6: substitution.arithmetic_simplify 0.86% : 0.000002s : 2: substitution.elim_not_effective 0.53% : 0.000001s : 2: substitution.fold_const_symbol 2.65% : 0.000006s : 4: substitution.graph_param_transform 65.58% : 0.000160s : 4: substitution.inline 2.21% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.72% : 0.000007s : 4: substitution.remove_not_recompute_node 2.80% : 0.000007s : 4: substitution.replace_old_param 6.58% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.050165 2 98.27% : 0.049300s : 1: type_inference.infer 1.73% : 0.000866s : 1: type_inference.specialize ------[replace.] 0.000066 8 63.41% : 0.000042s : 4: replace.inline 36.59% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000171 8 91.65% : 0.000157s : 4: match.inline 8.35% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000233 1278 1.07% : 0.000002s : 13: predicate.accumulaten_eliminater 0.72% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.61% : 0.000001s : 8: predicate.addn_check_dump 0.99% : 0.000002s : 13: predicate.addn_zero_filter 0.70% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.65% : 0.000006s : 21: predicate.arithmetic_simplify 0.96% : 0.000002s : 13: predicate.cast_eliminate 0.80% : 0.000002s : 8: predicate.check_bprop_eliminate 0.64% : 0.000002s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.73% : 0.000002s : 8: predicate.depend_value_elim 0.91% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.30% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.07% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 4: predicate.elim_not_effective 0.46% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.19% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 17: predicate.environ_get_depend_swap 1.55% : 0.000004s : 25: predicate.environ_get_eliminate 0.95% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.35% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.37% : 0.000006s : 21: predicate.float_depend_g_call 0.52% : 0.000001s : 8: predicate.float_environ_get_switch 0.77% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.83% : 0.000002s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.54% : 0.000001s : 8: predicate.incorporate_call 0.44% : 0.000001s : 8: predicate.incorporate_call_switch 5.96% : 0.000014s : 58: predicate.inline 0.86% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.10% : 0.000003s : 8: predicate.less_batch_normalization 1.91% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.46% : 0.000006s : 38: predicate.load_eliminater 1.20% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.15% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.79% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.64% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 13: predicate.minmaximum_grad 1.20% : 0.000003s : 4: predicate.mutable_eliminate 0.34% : 0.000001s : 4: predicate.opt_reshape 0.52% : 0.000001s : 4: predicate.parallel_virtual_node 1.70% : 0.000004s : 21: predicate.partial_defer_inline 1.46% : 0.000003s : 21: predicate.partial_eliminate 0.82% : 0.000002s : 13: predicate.print_const_string_wrapper 0.58% : 0.000001s : 8: predicate.reduce_all_const_elim 1.29% : 0.000003s : 13: predicate.reduce_eliminate 2.35% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 8: predicate.remove_not_recompute_node 1.28% : 0.000003s : 25: predicate.replace_applicator 0.63% : 0.000001s : 8: predicate.replace_old_param 0.24% : 0.000001s : 4: predicate.reset_defer_inline 0.94% : 0.000002s : 13: predicate.reshape_eliminate 0.64% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.56% : 0.000001s : 4: predicate.row_tensor_eliminate 0.83% : 0.000002s : 8: predicate.same_eliminate 0.48% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.91% : 0.000002s : 8: predicate.shard_identity_eliminate 0.68% : 0.000002s : 8: predicate.special_op_eliminate 0.62% : 0.000001s : 8: predicate.specialize_transform 1.14% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 1.23% : 0.000003s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.48% : 0.000003s : 21: predicate.switch_defer_inline 1.86% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.66% : 0.000011s : 67: predicate.switch_simplify 1.00% : 0.000002s : 13: predicate.tile_eliminate 0.96% : 0.000002s : 13: predicate.transpose_eliminate 1.65% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.40% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.34% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.19% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.62% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.13% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.95% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 0.73% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.77% : 0.000002s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000720 11 51.59% : 0.000371s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.41% : 0.000349s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.075823 192 0.01% : 0.000006s : 1: ForceFp32Comm 4.76% : 0.003612s : 1: add_attr 4.74% : 0.003592s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.08% : 0.000061s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.10% : 0.000076s : 1: auto_monad 0.04% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.66% : 0.000503s : 1: bootstrap 0.06% : 0.000043s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.03% : 0.000020s : 1: control_data_broadcast_order 0.02% : 0.000014s : 1: convert_after_rewriter 0.04% : 0.000032s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000022s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.05% : 0.000034s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.66% : 0.000501s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 1.02% : 0.000773s : 1: mutable_eliminate 0.01% : 0.000009s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000021s : 1: opt.transform.mutable_eliminate 1.73% : 0.001314s : 78: opt.transform.opt_a 0.04% : 0.000031s : 1: opt.transform.opt_after_cconv 0.04% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.14% : 0.000107s : 28: opt.transform.opt_b 0.07% : 0.000055s : 2: opt.transform.opt_trans_graph 0.05% : 0.000039s : 4: opt.transform.symbol_engine_opt 5.37% : 0.004073s : 1: opt_a 0.19% : 0.000142s : 1: opt_after_cconv 1.05% : 0.000792s : 1: opt_after_jit_grad 0.39% : 0.000293s : 1: opt_b 9.39% : 0.007118s : 1: optimize 0.03% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.04% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000008s : 1: pipeline_split 0.06% : 0.000049s : 1: pre_auto_parallel 0.05% : 0.000037s : 1: py_interpret_to_execute 0.03% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000022s : 1: remove_dup_value 1.12% : 0.000851s : 1: renormalize.infer 0.59% : 0.000445s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000049s : 1: rewriter_after_opt_a 0.13% : 0.000100s : 1: rewriter_before_opt_a 0.01% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.14% : 0.000104s : 1: symbol_engine_optimizer 0.13% : 0.000101s : 1: tuple_transform 66.32% : 0.050282s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:10.718.357 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0159031, [21] [bootstrap]: 0.0004511 [type_inference]: 0.00589847 [event_method]: 1.954e-05 [auto_monad]: 6.469e-05 [graph_reusing]: 5.72001e-06 [inline]: 2.07999e-06 [add_attr]: 0.00342594, [1] [add_attr_with_inline]: 0.00341479, [1] [Cycle 1]: 6.473e-05, [2] [tag_attr]: 2.073e-05 [meta_addattr_fg_expand]: 6.23998e-06 [parallel-infer-symbol]: 3.09999e-06 [pre_auto_parallel]: 3.624e-05 [insert-virtual-dataset]: 2.83998e-06 [parallel-infer-symbol-second]: 7.29982e-07 [dataset_repeat_opt]: 1.87999e-06 [pipeline_split]: 1.94e-06 [optimize]: 0.00518394, [53] [py_interpret_to_execute]: 2.808e-05 [rewriter_before_opt_a]: 8.183e-05 [opt_a]: 0.00285367, [2] [Cycle 1]: 0.00214444, [45] [expand_dump_flag]: 2.99001e-06 [switch_simplify]: 4.362e-05 [loop_unroll]: 3.092e-05 [a_1]: 0.00064925 [with_stream_mark]: 1.715e-05 [recompute_prepare]: 9.23002e-06 [updatestate_depend_eliminate]: 4.22e-06 [updatestate_assign_eliminate]: 3.31999e-06 [updatestate_loads_eliminate]: 2.98e-06 [parameter_eliminate]: 2.07001e-06 [a_2]: 8.828e-05 [accelerated_algorithm]: 7.60998e-06 [shard]: 2.11e-06 [meta_shard_fg_expand]: 1.77999e-06 [shard_inline]: 6.59999e-06 [merge_send_recv]: 8.36002e-06 [auto_parallel]: 6.76e-06 [parallel]: 2.042e-05 [flash_sp]: 8.43999e-06 [merge_comm]: 4.06001e-06 [allreduce_fusion]: 3.41999e-06 [matmul_add_comm_reduction]: 9.91998e-06 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 8.1e-06 [virtual_dataset]: 6.56e-06 [get_grad_eliminate_]: 6.22001e-06 [virtual_output]: 6.64999e-06 [merge_forward]: 3.8e-06 [cell_reuse_recompute_pass]: 1.13001e-06 [offload_activation]: 1.142e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.18e-05 [merge_recompute_call_nodes]: 1.083e-05 [before_grad]: 1.307e-05 [set_forward_comm_id_for_comm_node_pass]: 4.18999e-06 [meta_fg_expand]: 2.99001e-06 [flash_sp_send_recv_attached]: 2.80997e-06 [receive_attached]: 3.03e-06 [after_resolve]: 1.203e-05 [a_after_grad]: 1.071e-05 [renormalize]: 0.00073921 [add_forward_monad_depend]: 5.87001e-06 [auto_monad_grad]: 2.84001e-06 [auto_monad_eliminator]: 1.604e-05 [cse]: 3.06e-05 [a_3]: 4.884e-05 [Cycle 2]: 0.00069808, [45] [expand_dump_flag]: 1.54e-06 [switch_simplify]: 7.93001e-06 [loop_unroll]: 6.84999e-06 [a_1]: 0.00014582 [with_stream_mark]: 1.515e-05 [recompute_prepare]: 6.86999e-06 [updatestate_depend_eliminate]: 3.09999e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 3.06001e-06 [parameter_eliminate]: 1.47001e-06 [a_2]: 7.879e-05 [accelerated_algorithm]: 6.48e-06 [shard]: 1.15001e-06 [meta_shard_fg_expand]: 1.60001e-06 [shard_inline]: 6.06e-06 [merge_send_recv]: 6.15002e-06 [auto_parallel]: 6.97002e-06 [parallel]: 6.20002e-06 [flash_sp]: 3.6e-06 [merge_comm]: 3.71001e-06 [allreduce_fusion]: 3.23e-06 [matmul_add_comm_reduction]: 7.07002e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 6.91999e-06 [virtual_dataset]: 6.14001e-06 [get_grad_eliminate_]: 5.84999e-06 [virtual_output]: 5.89999e-06 [merge_forward]: 3.06999e-06 [cell_reuse_recompute_pass]: 1.77999e-06 [offload_activation]: 7.56999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.208e-05 [merge_recompute_call_nodes]: 9.39996e-07 [before_grad]: 9.74999e-06 [set_forward_comm_id_for_comm_node_pass]: 4e-06 [meta_fg_expand]: 2.21e-06 [flash_sp_send_recv_attached]: 1.30999e-06 [receive_attached]: 1.70001e-06 [after_resolve]: 1.137e-05 [a_after_grad]: 8.95999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.39999e-06 [auto_monad_grad]: 1.30999e-06 [auto_monad_eliminator]: 8.33999e-06 [cse]: 1.556e-05 [a_3]: 3.894e-05 [py_interpret_to_execute_after_opt_a]: 1.291e-05 [slice_cell_reuse_recomputed_activation]: 2.26998e-06 [rewriter_after_opt_a]: 4.05e-05 [convert_after_rewriter]: 7.71999e-06 [order_py_execute_after_rewriter]: 5.29998e-06 [mutable_eliminate]: 0.0007168 [opt_b]: 0.00021045, [1] [Cycle 1]: 0.00020244, [7] [b_1]: 0.00012514 [b_2]: 8.52e-06 [updatestate_depend_eliminate]: 6.50002e-06 [updatestate_assign_eliminate]: 2.39999e-06 [updatestate_loads_eliminate]: 2.32999e-06 [renormalize]: 5.59987e-07 [cse]: 2.13e-05 [optimize_parallel_all_gather_comm]: 1.682e-05 [overlap_param_gather]: 1.94999e-06 [cconv]: 2.983e-05 [loop_unroll]: 0.00048077 [opt_after_cconv]: 0.00010793, [1] [Cycle 1]: 0.0001009, [7] [c_1]: 3.153e-05 [parameter_eliminate]: 3.28e-06 [updatestate_depend_eliminate]: 5.33002e-06 [updatestate_assign_eliminate]: 2.68e-06 [updatestate_loads_eliminate]: 2.37999e-06 [cse]: 1.991e-05 [renormalize]: 6.00005e-07 [remove_dup_value]: 1.528e-05 [tuple_transform]: 8.217e-05, [1] [Cycle 1]: 7.781e-05, [4] [d_1]: 4.822e-05 [none_parameter_eliminate]: 1.71e-06 [renormalize]: 3.20026e-07 [switch_simplify]: 7.52002e-06 [partial_unused_args_eliminate]: 1.87999e-06 [add_recomputation]: 5.071e-05 [cse_after_recomputation]: 2.355e-05, [1] [Cycle 1]: 1.861e-05, [1] [cse]: 1.225e-05 [environ_conv]: 5.66e-06 [swap_dp_allreduce_reducescatter]: 5.49e-06 [bias_add_comm_swap]: 3.19001e-06 [label_micro_interleaved_index]: 5.55001e-06 [label_fine_grained_interleaved_index]: 3.08e-06 [merge_cast_opt]: 2.36e-06 [slice_recompute_activation]: 2.82002e-06 [micro_interleaved_order_control]: 2.41e-06 [assign_add_opt]: 1.31002e-06 [ForceFp32Comm]: 1.02e-06 [remove_cast_before_assign_add]: 1.15001e-06 [full_micro_interleaved_order_control]: 2.43002e-06 [reorder_send_recv_between_fp_bp]: 2.95002e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.29984e-07 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.25999e-06 [overlap_opt_shard_in_pipeline]: 1.42e-06 [overlap_opt_shard_grad_in_pipeline]: 1.72999e-06 [control_data_broadcast_order]: 1.237e-05 [grouped_pairwise_exchange_alltoall]: 2.17001e-06 [offloading_packed_experts]: 3.61001e-06 [overlap_recompute_and_grad_model_parallel]: 5.05001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.58002e-06 [overlap_recompute_comm]: 2.27999e-06 [overlap_grad_ring_attention]: 3.85998e-06 [overlap_grad_flash_sp]: 2.159e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 2.35002e-06 [split_layernorm_comm]: 1.65001e-06 [handle_group_info]: 1.37e-06 [symbol_engine_optimizer]: 7.708e-05, [1] [Cycle 1]: 7.275e-05, [6] [build]: 3.88001e-06 [elim_shapecalc]: 1.038e-05 [elim_not_effective]: 1.312e-05 [opt_reshape]: 7.3e-06 [fold_const_symbol]: 1.001e-05 [renormalize]: 4.00003e-07 [detach_backward]: 2.17001e-06 [pipeline_parallel_scheduler]: 2.01e-06 [auto_monad_reorder]: 1.745e-05 [get_jit_bprop_graph]: 2.11e-06 [rewriter_after_jit_bprop_graph]: 5.22999e-06 [opt_after_jit_grad]: 0.00057648 [validate]: 4.513e-05 Sums bootstrap : 0.000451s : 3.93% type_inference : 0.005898s : 51.35% event_method : 0.000020s : 0.17% auto_monad : 0.000065s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000036s : 0.32% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000028s : 0.24% optimize.rewriter_before_opt_a : 0.000082s : 0.71% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000052s : 0.45% optimize.opt_a.loop_unroll : 0.000038s : 0.33% optimize.opt_a.a_1 : 0.000795s : 6.92% optimize.opt_a.with_stream_mark : 0.000032s : 0.28% optimize.opt_a.recompute_prepare : 0.000016s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000167s : 1.45% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.12% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.11% optimize.opt_a.merge_send_recv : 0.000015s : 0.13% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000027s : 0.23% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.13% optimize.opt_a.virtual_dataset : 0.000013s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000013s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.21% optimize.opt_a.merge_recompute_call_nodes : 0.000012s : 0.10% optimize.opt_a.before_grad : 0.000023s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000023s : 0.20% optimize.opt_a.a_after_grad : 0.000020s : 0.17% optimize.opt_a.renormalize : 0.000739s : 6.44% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.21% optimize.opt_a.cse : 0.000046s : 0.40% optimize.opt_a.a_3 : 0.000088s : 0.76% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000041s : 0.35% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000717s : 6.24% optimize.opt_b.b_1 : 0.000125s : 1.09% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000030s : 0.26% optimize.loop_unroll : 0.000481s : 4.19% optimize.opt_after_cconv.c_1 : 0.000032s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000020s : 0.17% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000015s : 0.13% optimize.tuple_transform.d_1 : 0.000048s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000051s : 0.44% optimize.cse_after_recomputation.cse : 0.000012s : 0.11% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000006s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.02% optimize.slice_recompute_activation : 0.000003s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.03% optimize.overlap_grad_flash_sp : 0.000022s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000017s : 0.15% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000576s : 5.02% validate : 0.000045s : 0.39% Time group info: ------[substitution.] 0.000220 34 14.86% : 0.000033s : 6: substitution.arithmetic_simplify 1.05% : 0.000002s : 2: substitution.elim_not_effective 0.80% : 0.000002s : 2: substitution.fold_const_symbol 2.92% : 0.000006s : 4: substitution.graph_param_transform 66.64% : 0.000147s : 4: substitution.inline 2.02% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.51% : 0.000006s : 4: substitution.remove_not_recompute_node 2.37% : 0.000005s : 4: substitution.replace_old_param 6.82% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005761 2 88.39% : 0.005092s : 1: type_inference.infer 11.61% : 0.000669s : 1: type_inference.specialize ------[replace.] 0.000063 8 62.87% : 0.000039s : 4: replace.inline 37.13% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000157 8 91.60% : 0.000144s : 4: match.inline 8.40% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000210 1278 1.10% : 0.000002s : 13: predicate.accumulaten_eliminater 0.97% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 1.02% : 0.000002s : 13: predicate.addn_zero_filter 0.89% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.23% : 0.000005s : 21: predicate.arithmetic_simplify 0.91% : 0.000002s : 13: predicate.cast_eliminate 0.62% : 0.000001s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.20% : 0.000000s : 4: predicate.const_output_eliminate 0.58% : 0.000001s : 8: predicate.depend_value_elim 0.88% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.99% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.22% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 4: predicate.elim_not_effective 0.41% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.13% : 0.000002s : 17: predicate.environ_get_depend_swap 1.64% : 0.000003s : 25: predicate.environ_get_eliminate 1.26% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.40% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.48% : 0.000005s : 21: predicate.float_depend_g_call 0.52% : 0.000001s : 8: predicate.float_environ_get_switch 0.80% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.60% : 0.000001s : 8: predicate.get_grad_eliminate 0.22% : 0.000000s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 6.52% : 0.000014s : 58: predicate.inline 0.93% : 0.000002s : 8: predicate.inline_without_move 0.41% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.99% : 0.000002s : 8: predicate.less_batch_normalization 1.81% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.54% : 0.000005s : 38: predicate.load_eliminater 0.95% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.69% : 0.000006s : 34: predicate.loop_unroll_before_grad 1.59% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 8: predicate.merge_addn 0.66% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 13: predicate.minmaximum_grad 0.91% : 0.000002s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.41% : 0.000001s : 4: predicate.parallel_virtual_node 1.72% : 0.000004s : 21: predicate.partial_defer_inline 1.58% : 0.000003s : 21: predicate.partial_eliminate 0.89% : 0.000002s : 13: predicate.print_const_string_wrapper 0.52% : 0.000001s : 8: predicate.reduce_all_const_elim 1.06% : 0.000002s : 13: predicate.reduce_eliminate 2.34% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 8: predicate.remove_not_recompute_node 1.32% : 0.000003s : 25: predicate.replace_applicator 0.49% : 0.000001s : 8: predicate.replace_old_param 0.27% : 0.000001s : 4: predicate.reset_defer_inline 0.96% : 0.000002s : 13: predicate.reshape_eliminate 0.59% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 4: predicate.row_tensor_eliminate 0.95% : 0.000002s : 8: predicate.same_eliminate 0.43% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 8: predicate.shard_identity_eliminate 0.90% : 0.000002s : 8: predicate.special_op_eliminate 0.79% : 0.000002s : 8: predicate.specialize_transform 1.02% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.53% : 0.000003s : 21: predicate.switch_defer_inline 1.94% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.99% : 0.000010s : 67: predicate.switch_simplify 0.87% : 0.000002s : 13: predicate.tile_eliminate 0.87% : 0.000002s : 13: predicate.transpose_eliminate 1.40% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.48% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.53% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.55% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.14% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.69% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.57% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.93% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.59% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.68% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000553 11 55.32% : 0.000306s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.68% : 0.000247s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026622 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.89% : 0.003431s : 1: add_attr 12.84% : 0.003419s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.21% : 0.000055s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000069s : 1: auto_monad 0.08% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.80% : 0.000479s : 1: bootstrap 0.13% : 0.000033s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.10% : 0.000027s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.10% : 0.000027s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.84% : 0.000489s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.73% : 0.000728s : 1: mutable_eliminate 0.02% : 0.000006s : 1: offloading_packed_experts 0.05% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 4.63% : 0.001231s : 78: opt.transform.opt_a 0.11% : 0.000030s : 1: opt.transform.opt_after_cconv 0.10% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.38% : 0.000102s : 28: opt.transform.opt_b 0.20% : 0.000054s : 2: opt.transform.opt_trans_graph 0.14% : 0.000037s : 4: opt.transform.symbol_engine_opt 10.73% : 0.002857s : 1: opt_a 0.42% : 0.000112s : 1: opt_after_cconv 2.20% : 0.000586s : 1: opt_after_jit_grad 0.80% : 0.000214s : 1: opt_b 19.50% : 0.005190s : 1: optimize 0.08% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000040s : 1: pre_auto_parallel 0.12% : 0.000033s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.49% : 0.000397s : 1: renormalize.infer 1.25% : 0.000333s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000044s : 1: rewriter_after_opt_a 0.32% : 0.000086s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.30% : 0.000080s : 1: symbol_engine_optimizer 0.32% : 0.000085s : 1: tuple_transform 22.23% : 0.005917s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:11.991.530 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:11.991.821 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.145344, [21] [bootstrap]: 0.00044681 [type_inference]: 0.0767867 [event_method]: 2.173e-05 [auto_monad]: 6.997e-05 [graph_reusing]: 6.37001e-06 [inline]: 3.3e-06 [add_attr]: 0.00407158, [1] [add_attr_with_inline]: 0.00405833, [1] [Cycle 1]: 9.459e-05, [2] [tag_attr]: 2.371e-05 [meta_addattr_fg_expand]: 6.28e-06 [parallel-infer-symbol]: 3.42002e-06 [pre_auto_parallel]: 4.249e-05 [insert-virtual-dataset]: 2.77002e-06 [parallel-infer-symbol-second]: 6.99976e-07 [dataset_repeat_opt]: 2.06e-06 [pipeline_split]: 2.22001e-06 [optimize]: 0.00619282, [53] [py_interpret_to_execute]: 3.589e-05 [rewriter_before_opt_a]: 0.00010004 [opt_a]: 0.00359294, [2] [Cycle 1]: 0.00261801, [45] [expand_dump_flag]: 2.91e-06 [switch_simplify]: 4.391e-05 [loop_unroll]: 3.152e-05 [a_1]: 0.00071078 [with_stream_mark]: 2.249e-05 [recompute_prepare]: 1.06e-05 [updatestate_depend_eliminate]: 4.37e-06 [updatestate_assign_eliminate]: 3.81999e-06 [updatestate_loads_eliminate]: 3.31999e-06 [parameter_eliminate]: 2.62001e-06 [a_2]: 0.00012068 [accelerated_algorithm]: 7.95998e-06 [shard]: 2.74001e-06 [meta_shard_fg_expand]: 2.41998e-06 [shard_inline]: 6.77002e-06 [merge_send_recv]: 1.012e-05 [auto_parallel]: 1.049e-05 [parallel]: 2.095e-05 [flash_sp]: 1.033e-05 [merge_comm]: 4.73001e-06 [allreduce_fusion]: 3.5e-06 [matmul_add_comm_reduction]: 1.085e-05 [allreduce_slice_to_reducescatter]: 1.24e-06 [virtual_shard_identity]: 9.91e-06 [virtual_dataset]: 7.92998e-06 [get_grad_eliminate_]: 7.28999e-06 [virtual_output]: 6.79001e-06 [merge_forward]: 4.54002e-06 [cell_reuse_recompute_pass]: 1.61002e-06 [offload_activation]: 1.108e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.637e-05 [merge_recompute_call_nodes]: 1.64998e-06 [before_grad]: 1.218e-05 [set_forward_comm_id_for_comm_node_pass]: 4.08999e-06 [meta_fg_expand]: 3.47002e-06 [flash_sp_send_recv_attached]: 3.06001e-06 [receive_attached]: 2.63e-06 [after_resolve]: 1.292e-05 [a_after_grad]: 1.064e-05 [renormalize]: 0.00088562 [add_forward_monad_depend]: 9.00001e-06 [auto_monad_grad]: 2.37001e-06 [auto_monad_eliminator]: 1.944e-05 [cse]: 3.112e-05 [a_3]: 7.296e-05 [Cycle 2]: 0.00095591, [45] [expand_dump_flag]: 2.46998e-06 [switch_simplify]: 8.99e-06 [loop_unroll]: 6.36e-06 [a_1]: 0.00017571 [with_stream_mark]: 2.032e-05 [recompute_prepare]: 7.51999e-06 [updatestate_depend_eliminate]: 3.62998e-06 [updatestate_assign_eliminate]: 3.04001e-06 [updatestate_loads_eliminate]: 2.48998e-06 [parameter_eliminate]: 1.72999e-06 [a_2]: 0.00010945 [accelerated_algorithm]: 6.76e-06 [shard]: 2.07001e-06 [meta_shard_fg_expand]: 2.14e-06 [shard_inline]: 6.31e-06 [merge_send_recv]: 7.71001e-06 [auto_parallel]: 9.58002e-06 [parallel]: 9.03002e-06 [flash_sp]: 4.13999e-06 [merge_comm]: 3.48e-06 [allreduce_fusion]: 3.62002e-06 [matmul_add_comm_reduction]: 8.35999e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 8.57e-06 [virtual_dataset]: 6.51e-06 [get_grad_eliminate_]: 6.19999e-06 [virtual_output]: 6.55997e-06 [merge_forward]: 4.15e-06 [cell_reuse_recompute_pass]: 2.84999e-06 [offload_activation]: 9.34e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.52e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.108e-05 [set_forward_comm_id_for_comm_node_pass]: 3.83001e-06 [meta_fg_expand]: 2.98e-06 [flash_sp_send_recv_attached]: 2.11998e-06 [receive_attached]: 1.84e-06 [after_resolve]: 1.544e-05 [a_after_grad]: 1.046e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.64e-06 [auto_monad_grad]: 1.75001e-06 [auto_monad_eliminator]: 8.72e-06 [cse]: 1.782e-05 [a_3]: 5.115e-05 [py_interpret_to_execute_after_opt_a]: 2.029e-05 [slice_cell_reuse_recomputed_activation]: 5.80002e-06 [rewriter_after_opt_a]: 4.395e-05 [convert_after_rewriter]: 1.005e-05 [order_py_execute_after_rewriter]: 7.86001e-06 [mutable_eliminate]: 0.0007467 [opt_b]: 0.00028373, [1] [Cycle 1]: 0.00027237, [7] [b_1]: 0.00016912 [b_2]: 8.75001e-06 [updatestate_depend_eliminate]: 8.33999e-06 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 2.59001e-06 [renormalize]: 5.99975e-07 [cse]: 2.241e-05 [optimize_parallel_all_gather_comm]: 2.242e-05 [overlap_param_gather]: 5.19e-06 [cconv]: 3.722e-05 [loop_unroll]: 0.00045986 [opt_after_cconv]: 0.0001307, [1] [Cycle 1]: 0.00012115, [7] [c_1]: 3.13e-05 [parameter_eliminate]: 4.06001e-06 [updatestate_depend_eliminate]: 5.76e-06 [updatestate_assign_eliminate]: 2.41e-06 [updatestate_loads_eliminate]: 2.54001e-06 [cse]: 1.852e-05 [renormalize]: 8.00006e-07 [remove_dup_value]: 1.676e-05 [tuple_transform]: 9.171e-05, [1] [Cycle 1]: 8.449e-05, [4] [d_1]: 4.49e-05 [none_parameter_eliminate]: 1.62999e-06 [renormalize]: 3.00002e-07 [switch_simplify]: 7.16001e-06 [partial_unused_args_eliminate]: 4.59998e-06 [add_recomputation]: 5.116e-05 [cse_after_recomputation]: 2.697e-05, [1] [Cycle 1]: 2.007e-05, [1] [cse]: 1.099e-05 [environ_conv]: 8.78001e-06 [swap_dp_allreduce_reducescatter]: 8.06001e-06 [bias_add_comm_swap]: 5.52001e-06 [label_micro_interleaved_index]: 7.38999e-06 [label_fine_grained_interleaved_index]: 5.67001e-06 [merge_cast_opt]: 4.18001e-06 [slice_recompute_activation]: 4.48999e-06 [micro_interleaved_order_control]: 5.09e-06 [assign_add_opt]: 3.65e-06 [ForceFp32Comm]: 3.46001e-06 [remove_cast_before_assign_add]: 3.62002e-06 [full_micro_interleaved_order_control]: 4.75999e-06 [reorder_send_recv_between_fp_bp]: 5.76003e-06 [comm_op_add_attrs]: 3.92002e-06 [add_comm_op_reuse_tag]: 3.26999e-06 [interleave_split_concat_branches]: 3.83999e-06 [interleave_parallel_branches]: 3.46999e-06 [overlap_opt_shard_in_pipeline]: 3.45998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.16001e-06 [control_data_broadcast_order]: 1.505e-05 [grouped_pairwise_exchange_alltoall]: 4.48001e-06 [offloading_packed_experts]: 6.73998e-06 [overlap_recompute_and_grad_model_parallel]: 7e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.53e-06 [overlap_recompute_allgather_and_fa_grad]: 3.64002e-06 [overlap_recompute_comm]: 5.07e-06 [overlap_grad_ring_attention]: 6.79001e-06 [overlap_grad_flash_sp]: 2.393e-05 [begin_end_overlap_inline]: 3.57002e-06 [split_matmul_comm_elemetwise]: 4.67998e-06 [split_layernorm_comm]: 4.33001e-06 [handle_group_info]: 3.47002e-06 [symbol_engine_optimizer]: 9.614e-05, [1] [Cycle 1]: 8.948e-05, [6] [build]: 2.98e-06 [elim_shapecalc]: 9.97999e-06 [elim_not_effective]: 1.335e-05 [opt_reshape]: 7.24001e-06 [fold_const_symbol]: 1.026e-05 [renormalize]: 2.19996e-07 [detach_backward]: 3.52002e-06 [pipeline_parallel_scheduler]: 1.79e-06 [auto_monad_reorder]: 1.993e-05 [get_jit_bprop_graph]: 2.44999e-06 [rewriter_after_jit_bprop_graph]: 7.12002e-06 [opt_after_jit_grad]: 0.0569584 [validate]: 5.446e-05 Sums bootstrap : 0.000447s : 0.32% type_inference : 0.076787s : 55.09% event_method : 0.000022s : 0.02% auto_monad : 0.000070s : 0.05% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000042s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000036s : 0.03% optimize.rewriter_before_opt_a : 0.000100s : 0.07% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000053s : 0.04% optimize.opt_a.loop_unroll : 0.000038s : 0.03% optimize.opt_a.a_1 : 0.000886s : 0.64% optimize.opt_a.with_stream_mark : 0.000043s : 0.03% optimize.opt_a.recompute_prepare : 0.000018s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000230s : 0.17% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000018s : 0.01% optimize.opt_a.auto_parallel : 0.000020s : 0.01% optimize.opt_a.parallel : 0.000030s : 0.02% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.01% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000020s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000028s : 0.02% optimize.opt_a.a_after_grad : 0.000021s : 0.02% optimize.opt_a.renormalize : 0.000886s : 0.64% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.02% optimize.opt_a.cse : 0.000049s : 0.04% optimize.opt_a.a_3 : 0.000124s : 0.09% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.00% optimize.rewriter_after_opt_a : 0.000044s : 0.03% optimize.convert_after_rewriter : 0.000010s : 0.01% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000747s : 0.54% optimize.opt_b.b_1 : 0.000169s : 0.12% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.02% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000037s : 0.03% optimize.loop_unroll : 0.000460s : 0.33% optimize.opt_after_cconv.c_1 : 0.000031s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.01% optimize.tuple_transform.d_1 : 0.000045s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000051s : 0.04% optimize.cse_after_recomputation.cse : 0.000011s : 0.01% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000004s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000003s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000024s : 0.02% optimize.begin_end_overlap_inline : 0.000004s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.056958s : 40.86% validate : 0.000054s : 0.04% Time group info: ------[substitution.] 0.000263 34 15.22% : 0.000040s : 6: substitution.arithmetic_simplify 0.70% : 0.000002s : 2: substitution.elim_not_effective 0.52% : 0.000001s : 2: substitution.fold_const_symbol 2.25% : 0.000006s : 4: substitution.graph_param_transform 68.12% : 0.000179s : 4: substitution.inline 1.94% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.54% : 0.000007s : 4: substitution.remove_not_recompute_node 2.75% : 0.000007s : 4: substitution.replace_old_param 5.97% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.076719 2 98.76% : 0.075765s : 1: type_inference.infer 1.24% : 0.000955s : 1: type_inference.specialize ------[replace.] 0.000069 8 61.86% : 0.000043s : 4: replace.inline 38.14% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000190 8 92.71% : 0.000176s : 4: match.inline 7.29% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000234 1278 0.95% : 0.000002s : 13: predicate.accumulaten_eliminater 1.36% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 8: predicate.addn_check_dump 0.96% : 0.000002s : 13: predicate.addn_zero_filter 0.96% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.71% : 0.000006s : 21: predicate.arithmetic_simplify 1.12% : 0.000003s : 13: predicate.cast_eliminate 0.84% : 0.000002s : 8: predicate.check_bprop_eliminate 0.67% : 0.000002s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.67% : 0.000002s : 8: predicate.depend_value_elim 1.05% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.06% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.28% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.10% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_depend_swap 1.60% : 0.000004s : 25: predicate.environ_get_eliminate 0.97% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.35% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.36% : 0.000006s : 21: predicate.float_depend_g_call 0.52% : 0.000001s : 8: predicate.float_environ_get_switch 0.76% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.67% : 0.000002s : 8: predicate.get_grad_eliminate 0.26% : 0.000001s : 4: predicate.graph_param_transform 0.52% : 0.000001s : 8: predicate.incorporate_call 0.44% : 0.000001s : 8: predicate.incorporate_call_switch 5.84% : 0.000014s : 58: predicate.inline 0.73% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.01% : 0.000002s : 8: predicate.less_batch_normalization 1.77% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.39% : 0.000006s : 38: predicate.load_eliminater 0.91% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.13% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.59% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 13: predicate.minmaximum_grad 1.28% : 0.000003s : 4: predicate.mutable_eliminate 0.34% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 1.83% : 0.000004s : 21: predicate.partial_defer_inline 1.42% : 0.000003s : 21: predicate.partial_eliminate 0.84% : 0.000002s : 13: predicate.print_const_string_wrapper 0.65% : 0.000002s : 8: predicate.reduce_all_const_elim 1.54% : 0.000004s : 13: predicate.reduce_eliminate 2.31% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 8: predicate.remove_not_recompute_node 1.40% : 0.000003s : 25: predicate.replace_applicator 0.52% : 0.000001s : 8: predicate.replace_old_param 0.25% : 0.000001s : 4: predicate.reset_defer_inline 1.03% : 0.000002s : 13: predicate.reshape_eliminate 0.73% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.89% : 0.000002s : 8: predicate.same_eliminate 0.52% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 8: predicate.shard_identity_eliminate 0.82% : 0.000002s : 8: predicate.special_op_eliminate 0.70% : 0.000002s : 8: predicate.specialize_transform 0.95% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.42% : 0.000003s : 21: predicate.switch_defer_inline 1.90% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.76% : 0.000011s : 67: predicate.switch_simplify 0.87% : 0.000002s : 13: predicate.tile_eliminate 1.01% : 0.000002s : 13: predicate.transpose_eliminate 1.59% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.47% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.57% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.10% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.24% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.55% : 0.000006s : 29: predicate.tuple_list_set_item_eliminator 1.74% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.20% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.97% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 4: predicate.value_based_eliminate 0.99% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.64% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000773 11 53.27% : 0.000412s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.73% : 0.000361s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.157986 192 0.00% : 0.000006s : 1: ForceFp32Comm 2.59% : 0.004085s : 1: add_attr 2.57% : 0.004062s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.03% : 0.000055s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.05% : 0.000080s : 1: auto_monad 0.02% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.31% : 0.000492s : 1: bootstrap 0.03% : 0.000040s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000018s : 1: control_data_broadcast_order 0.01% : 0.000013s : 1: convert_after_rewriter 0.02% : 0.000030s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000019s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.02% : 0.000035s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.29% : 0.000466s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.48% : 0.000755s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000019s : 1: opt.transform.mutable_eliminate 0.86% : 0.001355s : 78: opt.transform.opt_a 0.02% : 0.000030s : 1: opt.transform.opt_after_cconv 0.03% : 0.000041s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000105s : 28: opt.transform.opt_b 0.03% : 0.000050s : 2: opt.transform.opt_trans_graph 0.02% : 0.000037s : 4: opt.transform.symbol_engine_opt 2.28% : 0.003597s : 1: opt_a 0.08% : 0.000134s : 1: opt_after_cconv 36.06% : 0.056976s : 1: opt_after_jit_grad 0.18% : 0.000287s : 1: opt_b 4.15% : 0.006549s : 1: optimize 0.02% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000011s : 1: order_py_execute_after_rewriter 0.02% : 0.000027s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000009s : 1: pipeline_split 0.03% : 0.000051s : 1: pre_auto_parallel 0.03% : 0.000040s : 1: py_interpret_to_execute 0.02% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 0.29% : 0.000455s : 1: renormalize.infer 0.27% : 0.000419s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000048s : 1: rewriter_after_opt_a 0.07% : 0.000104s : 1: rewriter_before_opt_a 0.01% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.06% : 0.000099s : 1: symbol_engine_optimizer 0.06% : 0.000095s : 1: tuple_transform 48.64% : 0.076844s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:13.266.444 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0775201, [21] [bootstrap]: 0.0004891 [type_inference]: 0.0672501 [event_method]: 2.041e-05 [auto_monad]: 6.459e-05 [graph_reusing]: 6.04999e-06 [inline]: 2.81e-06 [add_attr]: 0.00358848, [1] [add_attr_with_inline]: 0.00357713, [1] [Cycle 1]: 7.192e-05, [2] [tag_attr]: 2.282e-05 [meta_addattr_fg_expand]: 5.77999e-06 [parallel-infer-symbol]: 3.43e-06 [pre_auto_parallel]: 3.748e-05 [insert-virtual-dataset]: 3.03e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 2.21998e-06 [pipeline_split]: 1.62999e-06 [optimize]: 0.00530579, [53] [py_interpret_to_execute]: 2.655e-05 [rewriter_before_opt_a]: 8.897e-05 [opt_a]: 0.00311742, [2] [Cycle 1]: 0.0023965, [45] [expand_dump_flag]: 2.96001e-06 [switch_simplify]: 4.458e-05 [loop_unroll]: 3.047e-05 [a_1]: 0.00071504 [with_stream_mark]: 1.956e-05 [recompute_prepare]: 8.55001e-06 [updatestate_depend_eliminate]: 3.96001e-06 [updatestate_assign_eliminate]: 3.5e-06 [updatestate_loads_eliminate]: 3.01999e-06 [parameter_eliminate]: 2.11e-06 [a_2]: 8.844e-05 [accelerated_algorithm]: 6.86999e-06 [shard]: 2.12999e-06 [meta_shard_fg_expand]: 1.69e-06 [shard_inline]: 6.78e-06 [merge_send_recv]: 9.59e-06 [auto_parallel]: 6.79999e-06 [parallel]: 2.052e-05 [flash_sp]: 8.70001e-06 [merge_comm]: 4.3e-06 [allreduce_fusion]: 3.63e-06 [matmul_add_comm_reduction]: 9.36e-06 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 7.61001e-06 [virtual_dataset]: 6.66e-06 [get_grad_eliminate_]: 6.26e-06 [virtual_output]: 6.51999e-06 [merge_forward]: 4.00998e-06 [cell_reuse_recompute_pass]: 1.12e-06 [offload_activation]: 1.055e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.203e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.063e-05 [set_forward_comm_id_for_comm_node_pass]: 4.02002e-06 [meta_fg_expand]: 2.84001e-06 [flash_sp_send_recv_attached]: 2.36e-06 [receive_attached]: 2.26998e-06 [after_resolve]: 1.159e-05 [a_after_grad]: 1.039e-05 [renormalize]: 0.00077473 [add_forward_monad_depend]: 5.74e-06 [auto_monad_grad]: 2.87002e-06 [auto_monad_eliminator]: 1.657e-05 [cse]: 2.971e-05 [a_3]: 0.00020561 [Cycle 2]: 0.00071024, [45] [expand_dump_flag]: 1.76e-06 [switch_simplify]: 9.04e-06 [loop_unroll]: 6.41e-06 [a_1]: 0.00014862 [with_stream_mark]: 1.525e-05 [recompute_prepare]: 7.38e-06 [updatestate_depend_eliminate]: 3.26001e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 2.98e-06 [parameter_eliminate]: 1.17e-06 [a_2]: 8.166e-05 [accelerated_algorithm]: 6.17001e-06 [shard]: 1.66e-06 [meta_shard_fg_expand]: 1.82999e-06 [shard_inline]: 6.61e-06 [merge_send_recv]: 6.22001e-06 [auto_parallel]: 6.27001e-06 [parallel]: 5.74999e-06 [flash_sp]: 3.46001e-06 [merge_comm]: 3.53e-06 [allreduce_fusion]: 3.41001e-06 [matmul_add_comm_reduction]: 7.1e-06 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 7.56999e-06 [virtual_dataset]: 1.031e-05 [get_grad_eliminate_]: 5.86e-06 [virtual_output]: 6.19001e-06 [merge_forward]: 3.46001e-06 [cell_reuse_recompute_pass]: 1.98997e-06 [offload_activation]: 7.75998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.119e-05 [merge_recompute_call_nodes]: 1.10999e-06 [before_grad]: 9.33002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.64002e-06 [meta_fg_expand]: 2.10002e-06 [flash_sp_send_recv_attached]: 8.29983e-07 [receive_attached]: 1.20001e-06 [after_resolve]: 1.105e-05 [a_after_grad]: 9.81e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.23002e-06 [auto_monad_grad]: 1.02998e-06 [auto_monad_eliminator]: 9.74999e-06 [cse]: 1.582e-05 [a_3]: 3.793e-05 [py_interpret_to_execute_after_opt_a]: 1.135e-05 [slice_cell_reuse_recomputed_activation]: 2.27001e-06 [rewriter_after_opt_a]: 3.67e-05 [convert_after_rewriter]: 6.91999e-06 [order_py_execute_after_rewriter]: 4.93001e-06 [mutable_eliminate]: 0.00061181 [opt_b]: 0.00020295, [1] [Cycle 1]: 0.00019689, [7] [b_1]: 0.00012283 [b_2]: 8.11002e-06 [updatestate_depend_eliminate]: 5.93998e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.52001e-06 [renormalize]: 5.89993e-07 [cse]: 1.899e-05 [optimize_parallel_all_gather_comm]: 1.619e-05 [overlap_param_gather]: 1.84e-06 [cconv]: 2.83e-05 [loop_unroll]: 0.00047067 [opt_after_cconv]: 0.0001049, [1] [Cycle 1]: 9.885e-05, [7] [c_1]: 3.194e-05 [parameter_eliminate]: 3.88999e-06 [updatestate_depend_eliminate]: 6.54999e-06 [updatestate_assign_eliminate]: 2.38002e-06 [updatestate_loads_eliminate]: 2.34001e-06 [cse]: 1.758e-05 [renormalize]: 4.50003e-07 [remove_dup_value]: 1.4e-05 [tuple_transform]: 7.67e-05, [1] [Cycle 1]: 7.227e-05, [4] [d_1]: 4.445e-05 [none_parameter_eliminate]: 1.74998e-06 [renormalize]: 3.19997e-07 [switch_simplify]: 6.93998e-06 [partial_unused_args_eliminate]: 2.02999e-06 [add_recomputation]: 5.008e-05 [cse_after_recomputation]: 2.224e-05, [1] [Cycle 1]: 1.747e-05, [1] [cse]: 1.133e-05 [environ_conv]: 4.89998e-06 [swap_dp_allreduce_reducescatter]: 4.99003e-06 [bias_add_comm_swap]: 2.86e-06 [label_micro_interleaved_index]: 4.89998e-06 [label_fine_grained_interleaved_index]: 2.64999e-06 [merge_cast_opt]: 1.23002e-06 [slice_recompute_activation]: 2.24001e-06 [micro_interleaved_order_control]: 2.17001e-06 [assign_add_opt]: 1.12999e-06 [ForceFp32Comm]: 8.79983e-07 [remove_cast_before_assign_add]: 9.99979e-07 [full_micro_interleaved_order_control]: 2.09e-06 [reorder_send_recv_between_fp_bp]: 2.73e-06 [comm_op_add_attrs]: 1.17999e-06 [add_comm_op_reuse_tag]: 1.20001e-06 [interleave_split_concat_branches]: 1.23002e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.19e-06 [overlap_opt_shard_grad_in_pipeline]: 2.27999e-06 [control_data_broadcast_order]: 1.467e-05 [grouped_pairwise_exchange_alltoall]: 1.62999e-06 [offloading_packed_experts]: 3.76001e-06 [overlap_recompute_and_grad_model_parallel]: 4.70001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.16002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.38002e-06 [overlap_recompute_comm]: 2.41e-06 [overlap_grad_ring_attention]: 3.73001e-06 [overlap_grad_flash_sp]: 1.998e-05 [begin_end_overlap_inline]: 5.49975e-07 [split_matmul_comm_elemetwise]: 2.30002e-06 [split_layernorm_comm]: 2.04e-06 [handle_group_info]: 9.29984e-07 [symbol_engine_optimizer]: 8.31e-05, [1] [Cycle 1]: 7.843e-05, [6] [build]: 3.63e-06 [elim_shapecalc]: 1.108e-05 [elim_not_effective]: 1.333e-05 [opt_reshape]: 7.3e-06 [fold_const_symbol]: 1.145e-05 [renormalize]: 3.10014e-07 [detach_backward]: 2.22999e-06 [pipeline_parallel_scheduler]: 1.50999e-06 [auto_monad_reorder]: 1.713e-05 [get_jit_bprop_graph]: 2.31998e-06 [rewriter_after_jit_bprop_graph]: 4.77e-06 [opt_after_jit_grad]: 0.0005062 [validate]: 4.285e-05 Sums bootstrap : 0.000489s : 0.67% type_inference : 0.067250s : 92.21% event_method : 0.000020s : 0.03% auto_monad : 0.000065s : 0.09% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000037s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000027s : 0.04% optimize.rewriter_before_opt_a : 0.000089s : 0.12% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000054s : 0.07% optimize.opt_a.loop_unroll : 0.000037s : 0.05% optimize.opt_a.a_1 : 0.000864s : 1.18% optimize.opt_a.with_stream_mark : 0.000035s : 0.05% optimize.opt_a.recompute_prepare : 0.000016s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000170s : 0.23% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.02% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.02% optimize.opt_a.merge_send_recv : 0.000016s : 0.02% optimize.opt_a.auto_parallel : 0.000013s : 0.02% optimize.opt_a.parallel : 0.000026s : 0.04% optimize.opt_a.flash_sp : 0.000012s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.02% optimize.opt_a.virtual_dataset : 0.000017s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.02% optimize.opt_a.virtual_output : 0.000013s : 0.02% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000018s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000003s : 0.00% optimize.opt_a.after_resolve : 0.000023s : 0.03% optimize.opt_a.a_after_grad : 0.000020s : 0.03% optimize.opt_a.renormalize : 0.000775s : 1.06% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.04% optimize.opt_a.cse : 0.000046s : 0.06% optimize.opt_a.a_3 : 0.000244s : 0.33% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000037s : 0.05% optimize.convert_after_rewriter : 0.000007s : 0.01% optimize.order_py_execute_after_rewriter : 0.000005s : 0.01% optimize.mutable_eliminate : 0.000612s : 0.84% optimize.opt_b.b_1 : 0.000123s : 0.17% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000019s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000028s : 0.04% optimize.loop_unroll : 0.000471s : 0.65% optimize.opt_after_cconv.c_1 : 0.000032s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000018s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.02% optimize.tuple_transform.d_1 : 0.000044s : 0.06% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000050s : 0.07% optimize.cse_after_recomputation.cse : 0.000011s : 0.02% optimize.environ_conv : 0.000005s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000020s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000017s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000506s : 0.69% validate : 0.000043s : 0.06% Time group info: ------[substitution.] 0.000269 34 12.87% : 0.000035s : 6: substitution.arithmetic_simplify 0.67% : 0.000002s : 2: substitution.elim_not_effective 0.71% : 0.000002s : 2: substitution.fold_const_symbol 2.18% : 0.000006s : 4: substitution.graph_param_transform 72.51% : 0.000195s : 4: substitution.inline 1.40% : 0.000004s : 4: substitution.j_node_and_user_rematch 1.72% : 0.000005s : 4: substitution.remove_not_recompute_node 1.95% : 0.000005s : 4: substitution.replace_old_param 5.99% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.067177 2 98.87% : 0.066419s : 1: type_inference.infer 1.13% : 0.000757s : 1: type_inference.specialize ------[replace.] 0.000065 8 61.87% : 0.000040s : 4: replace.inline 38.13% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000207 8 93.07% : 0.000192s : 4: match.inline 6.93% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000214 1278 0.88% : 0.000002s : 13: predicate.accumulaten_eliminater 0.80% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 8: predicate.addn_check_dump 0.91% : 0.000002s : 13: predicate.addn_zero_filter 0.80% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.39% : 0.000005s : 21: predicate.arithmetic_simplify 0.88% : 0.000002s : 13: predicate.cast_eliminate 0.60% : 0.000001s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 0.91% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.00% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.09% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.62% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_depend_swap 1.88% : 0.000004s : 25: predicate.environ_get_eliminate 1.33% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.52% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.47% : 0.000005s : 21: predicate.float_depend_g_call 0.54% : 0.000001s : 8: predicate.float_environ_get_switch 0.82% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.65% : 0.000001s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.62% : 0.000001s : 8: predicate.incorporate_call 0.49% : 0.000001s : 8: predicate.incorporate_call_switch 6.80% : 0.000015s : 58: predicate.inline 0.91% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.61% : 0.000001s : 8: predicate.less_batch_normalization 1.73% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.54% : 0.000005s : 38: predicate.load_eliminater 0.95% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.18% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.52% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 8: predicate.merge_addn 0.65% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.63% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.93% : 0.000002s : 13: predicate.minmaximum_grad 1.18% : 0.000003s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.45% : 0.000001s : 4: predicate.parallel_virtual_node 1.75% : 0.000004s : 21: predicate.partial_defer_inline 1.58% : 0.000003s : 21: predicate.partial_eliminate 0.83% : 0.000002s : 13: predicate.print_const_string_wrapper 0.57% : 0.000001s : 8: predicate.reduce_all_const_elim 1.11% : 0.000002s : 13: predicate.reduce_eliminate 2.55% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 8: predicate.remove_not_recompute_node 1.50% : 0.000003s : 25: predicate.replace_applicator 0.41% : 0.000001s : 8: predicate.replace_old_param 0.36% : 0.000001s : 4: predicate.reset_defer_inline 0.94% : 0.000002s : 13: predicate.reshape_eliminate 0.80% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.56% : 0.000001s : 4: predicate.row_tensor_eliminate 0.76% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.77% : 0.000002s : 8: predicate.shard_identity_eliminate 0.72% : 0.000002s : 8: predicate.special_op_eliminate 0.73% : 0.000002s : 8: predicate.specialize_transform 0.80% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.96% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.52% : 0.000003s : 21: predicate.switch_defer_inline 2.03% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.99% : 0.000011s : 67: predicate.switch_simplify 0.90% : 0.000002s : 13: predicate.tile_eliminate 1.01% : 0.000002s : 13: predicate.transpose_eliminate 1.57% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.42% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.30% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.73% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.43% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.09% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.61% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000635 11 57.48% : 0.000365s : 5: func_graph_cloner_run.FuncGraphClonerGraph 42.52% : 0.000270s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.088786 192 0.00% : 0.000004s : 1: ForceFp32Comm 4.05% : 0.003595s : 1: add_attr 4.03% : 0.003582s : 1: add_attr_with_inline 0.01% : 0.000005s : 1: add_comm_op_reuse_tag 0.06% : 0.000054s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.08% : 0.000070s : 1: auto_monad 0.02% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.59% : 0.000521s : 1: bootstrap 0.04% : 0.000032s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000018s : 1: control_data_broadcast_order 0.01% : 0.000010s : 1: convert_after_rewriter 0.03% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000008s : 1: environ_conv 0.03% : 0.000027s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.54% : 0.000481s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.70% : 0.000621s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000018s : 1: opt.transform.mutable_eliminate 1.64% : 0.001456s : 78: opt.transform.opt_a 0.03% : 0.000031s : 1: opt.transform.opt_after_cconv 0.03% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.11% : 0.000100s : 28: opt.transform.opt_b 0.06% : 0.000049s : 2: opt.transform.opt_trans_graph 0.04% : 0.000038s : 4: opt.transform.symbol_engine_opt 3.51% : 0.003121s : 1: opt_a 0.12% : 0.000109s : 1: opt_after_cconv 0.58% : 0.000516s : 1: opt_after_jit_grad 0.23% : 0.000206s : 1: opt_b 5.98% : 0.005311s : 1: optimize 0.02% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.03% : 0.000023s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.05% : 0.000041s : 1: pre_auto_parallel 0.03% : 0.000031s : 1: py_interpret_to_execute 0.02% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000018s : 1: remove_dup_value 0.47% : 0.000416s : 1: renormalize.infer 0.40% : 0.000351s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000040s : 1: rewriter_after_opt_a 0.10% : 0.000093s : 1: rewriter_before_opt_a 0.01% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000006s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.10% : 0.000086s : 1: symbol_engine_optimizer 0.09% : 0.000080s : 1: tuple_transform 75.77% : 0.067276s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:14.356.390 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:14.356.670 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0913928, [21] [bootstrap]: 0.0004266 [type_inference]: 0.00616893 [event_method]: 1.934e-05 [auto_monad]: 6.536e-05 [graph_reusing]: 6.11e-06 [inline]: 2.14e-06 [add_attr]: 0.00360003, [1] [add_attr_with_inline]: 0.00358578, [1] [Cycle 1]: 0.00010103, [2] [tag_attr]: 2.458e-05 [meta_addattr_fg_expand]: 6.02001e-06 [parallel-infer-symbol]: 3.95e-06 [pre_auto_parallel]: 4.23e-05 [insert-virtual-dataset]: 2.58998e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 2.26998e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.07963, [53] [py_interpret_to_execute]: 3.515e-05 [rewriter_before_opt_a]: 0.00010296 [opt_a]: 0.0768795, [2] [Cycle 1]: 0.075902, [45] [expand_dump_flag]: 3.33e-06 [switch_simplify]: 4.375e-05 [loop_unroll]: 3.043e-05 [a_1]: 0.0007072 [with_stream_mark]: 2.274e-05 [recompute_prepare]: 1.152e-05 [updatestate_depend_eliminate]: 4.90999e-06 [updatestate_assign_eliminate]: 4.08999e-06 [updatestate_loads_eliminate]: 3.76001e-06 [parameter_eliminate]: 2.17999e-06 [a_2]: 0.0001221 [accelerated_algorithm]: 7.61999e-06 [shard]: 2.22999e-06 [meta_shard_fg_expand]: 2.34999e-06 [shard_inline]: 6.54001e-06 [merge_send_recv]: 1.015e-05 [auto_parallel]: 9.36e-06 [parallel]: 2.096e-05 [flash_sp]: 1.219e-05 [merge_comm]: 5.14e-06 [allreduce_fusion]: 3.74002e-06 [matmul_add_comm_reduction]: 1.126e-05 [allreduce_slice_to_reducescatter]: 1.22e-06 [virtual_shard_identity]: 1.053e-05 [virtual_dataset]: 6.73998e-06 [get_grad_eliminate_]: 6.71999e-06 [virtual_output]: 7.38999e-06 [merge_forward]: 3.87002e-06 [cell_reuse_recompute_pass]: 2.02001e-06 [offload_activation]: 1.081e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.652e-05 [merge_recompute_call_nodes]: 2.16e-06 [before_grad]: 1.398e-05 [set_forward_comm_id_for_comm_node_pass]: 3.60998e-06 [meta_fg_expand]: 3.99002e-06 [flash_sp_send_recv_attached]: 2.78e-06 [receive_attached]: 2.17001e-06 [after_resolve]: 1.314e-05 [a_after_grad]: 1.098e-05 [renormalize]: 0.0741451 [add_forward_monad_depend]: 1.031e-05 [auto_monad_grad]: 2.48e-06 [auto_monad_eliminator]: 2.007e-05 [cse]: 2.868e-05 [a_3]: 7.834e-05 [Cycle 2]: 0.00095845, [45] [expand_dump_flag]: 2.58998e-06 [switch_simplify]: 1.012e-05 [loop_unroll]: 6.98e-06 [a_1]: 0.0001619 [with_stream_mark]: 2.193e-05 [recompute_prepare]: 8.47e-06 [updatestate_depend_eliminate]: 4.89e-06 [updatestate_assign_eliminate]: 3.81999e-06 [updatestate_loads_eliminate]: 3.25e-06 [parameter_eliminate]: 1.99e-06 [a_2]: 0.00010991 [accelerated_algorithm]: 7.16999e-06 [shard]: 2.98998e-06 [meta_shard_fg_expand]: 2.58003e-06 [shard_inline]: 6.42001e-06 [merge_send_recv]: 8.90001e-06 [auto_parallel]: 9.71e-06 [parallel]: 8.97e-06 [flash_sp]: 3.88999e-06 [merge_comm]: 3.8e-06 [allreduce_fusion]: 3.75e-06 [matmul_add_comm_reduction]: 1.129e-05 [allreduce_slice_to_reducescatter]: 9.20001e-07 [virtual_shard_identity]: 7.88001e-06 [virtual_dataset]: 6.15002e-06 [get_grad_eliminate_]: 6.04999e-06 [virtual_output]: 6.52001e-06 [merge_forward]: 4.41002e-06 [cell_reuse_recompute_pass]: 3.18998e-06 [offload_activation]: 1.002e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.582e-05 [merge_recompute_call_nodes]: 1.72999e-06 [before_grad]: 1.069e-05 [set_forward_comm_id_for_comm_node_pass]: 4.21001e-06 [meta_fg_expand]: 3.08e-06 [flash_sp_send_recv_attached]: 1.70001e-06 [receive_attached]: 2.76e-06 [after_resolve]: 1.326e-05 [a_after_grad]: 1.051e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.22999e-06 [auto_monad_grad]: 2.09999e-06 [auto_monad_eliminator]: 1.006e-05 [cse]: 1.719e-05 [a_3]: 4.996e-05 [py_interpret_to_execute_after_opt_a]: 1.934e-05 [slice_cell_reuse_recomputed_activation]: 5.77999e-06 [rewriter_after_opt_a]: 4.561e-05 [convert_after_rewriter]: 9.94001e-06 [order_py_execute_after_rewriter]: 8.50999e-06 [mutable_eliminate]: 0.00073779 [opt_b]: 0.00028231, [1] [Cycle 1]: 0.00027183, [7] [b_1]: 0.00017024 [b_2]: 8.15e-06 [updatestate_depend_eliminate]: 7.75e-06 [updatestate_assign_eliminate]: 2.68e-06 [updatestate_loads_eliminate]: 2.71e-06 [renormalize]: 5.50004e-07 [cse]: 2.288e-05 [optimize_parallel_all_gather_comm]: 2.184e-05 [overlap_param_gather]: 5.16002e-06 [cconv]: 3.915e-05 [loop_unroll]: 0.00053425 [opt_after_cconv]: 0.00013765, [1] [Cycle 1]: 0.00012773, [7] [c_1]: 3.255e-05 [parameter_eliminate]: 4.95001e-06 [updatestate_depend_eliminate]: 6.64001e-06 [updatestate_assign_eliminate]: 2.74001e-06 [updatestate_loads_eliminate]: 2.36998e-06 [cse]: 2.033e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 1.881e-05 [tuple_transform]: 0.00010111, [1] [Cycle 1]: 9.35e-05, [4] [d_1]: 5.017e-05 [none_parameter_eliminate]: 1.64998e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 8.15999e-06 [partial_unused_args_eliminate]: 5.30999e-06 [add_recomputation]: 5.764e-05 [cse_after_recomputation]: 3.076e-05, [1] [Cycle 1]: 2.252e-05, [1] [cse]: 1.285e-05 [environ_conv]: 9.79999e-06 [swap_dp_allreduce_reducescatter]: 8.77e-06 [bias_add_comm_swap]: 5.84e-06 [label_micro_interleaved_index]: 8.00999e-06 [label_fine_grained_interleaved_index]: 5.38002e-06 [merge_cast_opt]: 4.33001e-06 [slice_recompute_activation]: 5.00001e-06 [micro_interleaved_order_control]: 5.38002e-06 [assign_add_opt]: 4.13001e-06 [ForceFp32Comm]: 3.55e-06 [remove_cast_before_assign_add]: 3.34001e-06 [full_micro_interleaved_order_control]: 5.21002e-06 [reorder_send_recv_between_fp_bp]: 5.81e-06 [comm_op_add_attrs]: 3.75998e-06 [add_comm_op_reuse_tag]: 3.85e-06 [interleave_split_concat_branches]: 3.83001e-06 [interleave_parallel_branches]: 4.23001e-06 [overlap_opt_shard_in_pipeline]: 4.62e-06 [overlap_opt_shard_grad_in_pipeline]: 4.37e-06 [control_data_broadcast_order]: 1.763e-05 [grouped_pairwise_exchange_alltoall]: 4.15e-06 [offloading_packed_experts]: 7.61999e-06 [overlap_recompute_and_grad_model_parallel]: 8.64003e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.12e-06 [overlap_recompute_allgather_and_fa_grad]: 4.23001e-06 [overlap_recompute_comm]: 5.17e-06 [overlap_grad_ring_attention]: 7.36001e-06 [overlap_grad_flash_sp]: 2.692e-05 [begin_end_overlap_inline]: 3.38999e-06 [split_matmul_comm_elemetwise]: 5.49e-06 [split_layernorm_comm]: 4.33001e-06 [handle_group_info]: 3.80998e-06 [symbol_engine_optimizer]: 0.000113, [1] [Cycle 1]: 0.00010516, [6] [build]: 4.06001e-06 [elim_shapecalc]: 1.296e-05 [elim_not_effective]: 1.566e-05 [opt_reshape]: 7.96001e-06 [fold_const_symbol]: 1.126e-05 [renormalize]: 5.59987e-07 [detach_backward]: 4.50001e-06 [pipeline_parallel_scheduler]: 2.21e-06 [auto_monad_reorder]: 2.141e-05 [get_jit_bprop_graph]: 2.14e-06 [rewriter_after_jit_bprop_graph]: 6.96001e-06 [opt_after_jit_grad]: 0.00068281 [validate]: 4.907e-05 Sums bootstrap : 0.000427s : 0.50% type_inference : 0.006169s : 7.19% event_method : 0.000019s : 0.02% auto_monad : 0.000065s : 0.08% graph_reusing : 0.000006s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000042s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000035s : 0.04% optimize.rewriter_before_opt_a : 0.000103s : 0.12% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000054s : 0.06% optimize.opt_a.loop_unroll : 0.000037s : 0.04% optimize.opt_a.a_1 : 0.000869s : 1.01% optimize.opt_a.with_stream_mark : 0.000045s : 0.05% optimize.opt_a.recompute_prepare : 0.000020s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000232s : 0.27% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.02% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.02% optimize.opt_a.merge_send_recv : 0.000019s : 0.02% optimize.opt_a.auto_parallel : 0.000019s : 0.02% optimize.opt_a.parallel : 0.000030s : 0.03% optimize.opt_a.flash_sp : 0.000016s : 0.02% optimize.opt_a.merge_comm : 0.000009s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.02% optimize.opt_a.virtual_dataset : 0.000013s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000014s : 0.02% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000025s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000026s : 0.03% optimize.opt_a.a_after_grad : 0.000021s : 0.03% optimize.opt_a.renormalize : 0.074145s : 86.37% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.04% optimize.opt_a.cse : 0.000046s : 0.05% optimize.opt_a.a_3 : 0.000128s : 0.15% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.01% optimize.rewriter_after_opt_a : 0.000046s : 0.05% optimize.convert_after_rewriter : 0.000010s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.01% optimize.mutable_eliminate : 0.000738s : 0.86% optimize.opt_b.b_1 : 0.000170s : 0.20% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000023s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.03% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000039s : 0.05% optimize.loop_unroll : 0.000534s : 0.62% optimize.opt_after_cconv.c_1 : 0.000033s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.02% optimize.tuple_transform.d_1 : 0.000050s : 0.06% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000058s : 0.07% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000010s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000003s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000005s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000018s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000027s : 0.03% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000683s : 0.80% validate : 0.000049s : 0.06% Time group info: ------[substitution.] 0.000267 34 15.54% : 0.000042s : 6: substitution.arithmetic_simplify 0.87% : 0.000002s : 2: substitution.elim_not_effective 0.61% : 0.000002s : 2: substitution.fold_const_symbol 2.71% : 0.000007s : 4: substitution.graph_param_transform 67.05% : 0.000179s : 4: substitution.inline 2.12% : 0.000006s : 4: substitution.j_node_and_user_rematch 2.58% : 0.000007s : 4: substitution.remove_not_recompute_node 2.61% : 0.000007s : 4: substitution.replace_old_param 5.91% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006112 2 88.47% : 0.005408s : 1: type_inference.infer 11.53% : 0.000705s : 1: type_inference.specialize ------[replace.] 0.000068 8 61.38% : 0.000042s : 4: replace.inline 38.62% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000190 8 92.65% : 0.000176s : 4: match.inline 7.35% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000230 1278 0.99% : 0.000002s : 13: predicate.accumulaten_eliminater 0.98% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 8: predicate.addn_check_dump 0.96% : 0.000002s : 13: predicate.addn_zero_filter 0.74% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.79% : 0.000006s : 21: predicate.arithmetic_simplify 0.99% : 0.000002s : 13: predicate.cast_eliminate 0.67% : 0.000002s : 8: predicate.check_bprop_eliminate 0.53% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.61% : 0.000001s : 8: predicate.depend_value_elim 0.93% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.95% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.95% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.99% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 4: predicate.elim_not_effective 0.45% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_depend_swap 1.54% : 0.000004s : 25: predicate.environ_get_eliminate 1.15% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.41% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.46% : 0.000006s : 21: predicate.float_depend_g_call 0.63% : 0.000001s : 8: predicate.float_environ_get_switch 0.76% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.67% : 0.000002s : 8: predicate.get_grad_eliminate 0.24% : 0.000001s : 4: predicate.graph_param_transform 0.54% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 5.95% : 0.000014s : 58: predicate.inline 0.73% : 0.000002s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.92% : 0.000002s : 8: predicate.less_batch_normalization 1.79% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.42% : 0.000006s : 38: predicate.load_eliminater 1.26% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.25% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.95% : 0.000005s : 21: predicate.make_slice_get_slice_eliminator 0.59% : 0.000001s : 8: predicate.merge_addn 0.57% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.48% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 13: predicate.minmaximum_grad 0.86% : 0.000002s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.46% : 0.000001s : 4: predicate.parallel_virtual_node 1.87% : 0.000004s : 21: predicate.partial_defer_inline 1.47% : 0.000003s : 21: predicate.partial_eliminate 0.81% : 0.000002s : 13: predicate.print_const_string_wrapper 0.58% : 0.000001s : 8: predicate.reduce_all_const_elim 1.30% : 0.000003s : 13: predicate.reduce_eliminate 2.39% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 8: predicate.remove_not_recompute_node 1.37% : 0.000003s : 25: predicate.replace_applicator 0.55% : 0.000001s : 8: predicate.replace_old_param 0.33% : 0.000001s : 4: predicate.reset_defer_inline 1.02% : 0.000002s : 13: predicate.reshape_eliminate 0.59% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 4: predicate.row_tensor_eliminate 0.89% : 0.000002s : 8: predicate.same_eliminate 0.48% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 8: predicate.shard_identity_eliminate 0.74% : 0.000002s : 8: predicate.special_op_eliminate 0.65% : 0.000001s : 8: predicate.specialize_transform 1.05% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.86% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.41% : 0.000003s : 21: predicate.switch_defer_inline 2.28% : 0.000005s : 29: predicate.switch_layer_defer_inline 5.13% : 0.000012s : 67: predicate.switch_simplify 0.95% : 0.000002s : 13: predicate.tile_eliminate 0.96% : 0.000002s : 13: predicate.transpose_eliminate 1.52% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.38% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.49% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.79% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.21% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.84% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 0.61% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 8: predicate.virtual_output_eliminate 0.29% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000663 11 54.91% : 0.000364s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.09% : 0.000299s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.250244 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.44% : 0.003613s : 1: add_attr 1.43% : 0.003590s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.02% : 0.000062s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.03% : 0.000075s : 1: auto_monad 0.01% : 0.000029s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.19% : 0.000473s : 1: bootstrap 0.02% : 0.000042s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000022s : 1: control_data_broadcast_order 0.01% : 0.000013s : 1: convert_after_rewriter 0.01% : 0.000034s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000024s : 1: detach_backward 0.01% : 0.000013s : 1: environ_conv 0.01% : 0.000030s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000010s : 1: get_jit_bprop_graph 0.00% : 0.000012s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.00% : 0.000009s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.22% : 0.000541s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.30% : 0.000746s : 1: mutable_eliminate 0.00% : 0.000011s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000017s : 1: opt.transform.mutable_eliminate 0.54% : 0.001341s : 78: opt.transform.opt_a 0.01% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000105s : 28: opt.transform.opt_b 0.02% : 0.000056s : 2: opt.transform.opt_trans_graph 0.02% : 0.000044s : 4: opt.transform.symbol_engine_opt 30.72% : 0.076883s : 1: opt_a 0.06% : 0.000141s : 1: opt_after_cconv 0.28% : 0.000695s : 1: opt_after_jit_grad 0.11% : 0.000286s : 1: opt_b 31.97% : 0.080008s : 1: optimize 0.01% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000012s : 1: order_py_execute_after_rewriter 0.01% : 0.000031s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000011s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000008s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000013s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000009s : 1: overlap_recompute_comm 0.00% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000011s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000051s : 1: pre_auto_parallel 0.02% : 0.000039s : 1: py_interpret_to_execute 0.01% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000022s : 1: remove_dup_value 29.44% : 0.073677s : 1: renormalize.infer 0.18% : 0.000452s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000049s : 1: rewriter_after_opt_a 0.04% : 0.000107s : 1: rewriter_before_opt_a 0.00% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000009s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000009s : 1: split_matmul_comm_elemetwise 0.00% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000116s : 1: symbol_engine_optimizer 0.04% : 0.000104s : 1: tuple_transform 2.48% : 0.006212s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:15.529.098 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0183229, [21] [bootstrap]: 0.00049267 [type_inference]: 0.00743892 [event_method]: 2.198e-05 [auto_monad]: 6.542e-05 [graph_reusing]: 6.06998e-06 [inline]: 3.03e-06 [add_attr]: 0.00431675, [1] [add_attr_with_inline]: 0.00430537, [1] [Cycle 1]: 7.375e-05, [2] [tag_attr]: 2.433e-05 [meta_addattr_fg_expand]: 5.79e-06 [parallel-infer-symbol]: 3.34001e-06 [pre_auto_parallel]: 4.03e-05 [insert-virtual-dataset]: 2.53e-06 [parallel-infer-symbol-second]: 6.50005e-07 [dataset_repeat_opt]: 2.16e-06 [pipeline_split]: 1.74e-06 [optimize]: 0.00519592, [53] [py_interpret_to_execute]: 3.098e-05 [rewriter_before_opt_a]: 9.031e-05 [opt_a]: 0.00297213, [2] [Cycle 1]: 0.00225581, [45] [expand_dump_flag]: 3.17002e-06 [switch_simplify]: 4.423e-05 [loop_unroll]: 3.004e-05 [a_1]: 0.00067538 [with_stream_mark]: 2.191e-05 [recompute_prepare]: 9.61998e-06 [updatestate_depend_eliminate]: 5.35999e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 3.06999e-06 [parameter_eliminate]: 2.06e-06 [a_2]: 8.878e-05 [accelerated_algorithm]: 6.74001e-06 [shard]: 2.05002e-06 [meta_shard_fg_expand]: 2.09e-06 [shard_inline]: 7.02997e-06 [merge_send_recv]: 9.87999e-06 [auto_parallel]: 7.11001e-06 [parallel]: 1.928e-05 [flash_sp]: 8.53001e-06 [merge_comm]: 4.27e-06 [allreduce_fusion]: 3.47002e-06 [matmul_add_comm_reduction]: 1.007e-05 [allreduce_slice_to_reducescatter]: 1.25001e-06 [virtual_shard_identity]: 9.87001e-06 [virtual_dataset]: 6.92002e-06 [get_grad_eliminate_]: 6.32001e-06 [virtual_output]: 6.43e-06 [merge_forward]: 4.1e-06 [cell_reuse_recompute_pass]: 1.22e-06 [offload_activation]: 1.038e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.265e-05 [merge_recompute_call_nodes]: 1.50001e-06 [before_grad]: 1.157e-05 [set_forward_comm_id_for_comm_node_pass]: 3.48999e-06 [meta_fg_expand]: 3.13998e-06 [flash_sp_send_recv_attached]: 2.65997e-06 [receive_attached]: 2.76999e-06 [after_resolve]: 1.205e-05 [a_after_grad]: 1.02e-05 [renormalize]: 0.00081339 [add_forward_monad_depend]: 6.64001e-06 [auto_monad_grad]: 2.48e-06 [auto_monad_eliminator]: 1.804e-05 [cse]: 2.982e-05 [a_3]: 5.252e-05 [Cycle 2]: 0.00070498, [45] [expand_dump_flag]: 2.02999e-06 [switch_simplify]: 8.3e-06 [loop_unroll]: 6.44001e-06 [a_1]: 0.00014608 [with_stream_mark]: 1.697e-05 [recompute_prepare]: 7.05e-06 [updatestate_depend_eliminate]: 3.28998e-06 [updatestate_assign_eliminate]: 2.61e-06 [updatestate_loads_eliminate]: 2.50002e-06 [parameter_eliminate]: 1.22e-06 [a_2]: 7.828e-05 [accelerated_algorithm]: 6.69999e-06 [shard]: 1.66e-06 [meta_shard_fg_expand]: 1.60999e-06 [shard_inline]: 6.12999e-06 [merge_send_recv]: 5.20999e-06 [auto_parallel]: 6.67002e-06 [parallel]: 5.80002e-06 [flash_sp]: 3.66001e-06 [merge_comm]: 3.57997e-06 [allreduce_fusion]: 3.73001e-06 [matmul_add_comm_reduction]: 6.63998e-06 [allreduce_slice_to_reducescatter]: 5.09986e-07 [virtual_shard_identity]: 6.91001e-06 [virtual_dataset]: 6.12999e-06 [get_grad_eliminate_]: 5.97999e-06 [virtual_output]: 5.62999e-06 [merge_forward]: 2.96999e-06 [cell_reuse_recompute_pass]: 2.04e-06 [offload_activation]: 7.48e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.107e-05 [merge_recompute_call_nodes]: 8.79983e-07 [before_grad]: 1.009e-05 [set_forward_comm_id_for_comm_node_pass]: 3.19001e-06 [meta_fg_expand]: 2.26998e-06 [flash_sp_send_recv_attached]: 1.40001e-06 [receive_attached]: 1.22e-06 [after_resolve]: 1.225e-05 [a_after_grad]: 9.10001e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.86e-06 [auto_monad_grad]: 9.00007e-07 [auto_monad_eliminator]: 8.28999e-06 [cse]: 1.388e-05 [a_3]: 5.315e-05 [py_interpret_to_execute_after_opt_a]: 1.215e-05 [slice_cell_reuse_recomputed_activation]: 1.86998e-06 [rewriter_after_opt_a]: 3.852e-05 [convert_after_rewriter]: 6.93e-06 [order_py_execute_after_rewriter]: 5.37999e-06 [mutable_eliminate]: 0.00064181 [opt_b]: 0.00021381, [1] [Cycle 1]: 0.00020591, [7] [b_1]: 0.00012719 [b_2]: 8.09002e-06 [updatestate_depend_eliminate]: 7.48e-06 [updatestate_assign_eliminate]: 2.47001e-06 [updatestate_loads_eliminate]: 2.39001e-06 [renormalize]: 7.59988e-07 [cse]: 2.154e-05 [optimize_parallel_all_gather_comm]: 1.798e-05 [overlap_param_gather]: 1.96e-06 [cconv]: 3.125e-05 [loop_unroll]: 0.00044964 [opt_after_cconv]: 0.00010558, [1] [Cycle 1]: 9.916e-05, [7] [c_1]: 3.121e-05 [parameter_eliminate]: 4.25e-06 [updatestate_depend_eliminate]: 6.41998e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.34001e-06 [cse]: 1.794e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.484e-05 [tuple_transform]: 7.871e-05, [1] [Cycle 1]: 7.405e-05, [4] [d_1]: 4.64e-05 [none_parameter_eliminate]: 1.52001e-06 [renormalize]: 1.70025e-07 [switch_simplify]: 7.10002e-06 [partial_unused_args_eliminate]: 2.33002e-06 [add_recomputation]: 5.101e-05 [cse_after_recomputation]: 2.088e-05, [1] [Cycle 1]: 1.627e-05, [1] [cse]: 1.065e-05 [environ_conv]: 5.86e-06 [swap_dp_allreduce_reducescatter]: 5.10001e-06 [bias_add_comm_swap]: 2.78e-06 [label_micro_interleaved_index]: 4.54002e-06 [label_fine_grained_interleaved_index]: 2.68e-06 [merge_cast_opt]: 1.25999e-06 [slice_recompute_activation]: 2.01e-06 [micro_interleaved_order_control]: 2.28998e-06 [assign_add_opt]: 1.15999e-06 [ForceFp32Comm]: 8.59989e-07 [remove_cast_before_assign_add]: 1.30999e-06 [full_micro_interleaved_order_control]: 2.31998e-06 [reorder_send_recv_between_fp_bp]: 2.69001e-06 [comm_op_add_attrs]: 1.38002e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.29e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.27e-06 [overlap_opt_shard_grad_in_pipeline]: 2.14999e-06 [control_data_broadcast_order]: 1.326e-05 [grouped_pairwise_exchange_alltoall]: 2.24999e-06 [offloading_packed_experts]: 3.62998e-06 [overlap_recompute_and_grad_model_parallel]: 4.61997e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.34e-06 [overlap_recompute_allgather_and_fa_grad]: 1.33002e-06 [overlap_recompute_comm]: 2.39999e-06 [overlap_grad_ring_attention]: 4.4e-06 [overlap_grad_flash_sp]: 2.138e-05 [begin_end_overlap_inline]: 9.70002e-07 [split_matmul_comm_elemetwise]: 2.26e-06 [split_layernorm_comm]: 1.76998e-06 [handle_group_info]: 9.5999e-07 [symbol_engine_optimizer]: 7.897e-05, [1] [Cycle 1]: 7.414e-05, [6] [build]: 4.15e-06 [elim_shapecalc]: 1.095e-05 [elim_not_effective]: 1.313e-05 [opt_reshape]: 7.45e-06 [fold_const_symbol]: 1.022e-05 [renormalize]: 2.30008e-07 [detach_backward]: 2.51e-06 [pipeline_parallel_scheduler]: 1.71998e-06 [auto_monad_reorder]: 1.61e-05 [get_jit_bprop_graph]: 1.94999e-06 [rewriter_after_jit_bprop_graph]: 4.42e-06 [opt_after_jit_grad]: 0.00048778 [validate]: 4.313e-05 Sums bootstrap : 0.000493s : 3.79% type_inference : 0.007439s : 57.20% event_method : 0.000022s : 0.17% auto_monad : 0.000065s : 0.50% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000040s : 0.31% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.24% optimize.rewriter_before_opt_a : 0.000090s : 0.69% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.40% optimize.opt_a.loop_unroll : 0.000036s : 0.28% optimize.opt_a.a_1 : 0.000821s : 6.32% optimize.opt_a.with_stream_mark : 0.000039s : 0.30% optimize.opt_a.recompute_prepare : 0.000017s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000167s : 1.28% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.10% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.10% optimize.opt_a.merge_send_recv : 0.000015s : 0.12% optimize.opt_a.auto_parallel : 0.000014s : 0.11% optimize.opt_a.parallel : 0.000025s : 0.19% optimize.opt_a.flash_sp : 0.000012s : 0.09% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.13% optimize.opt_a.virtual_dataset : 0.000013s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.09% optimize.opt_a.virtual_output : 0.000012s : 0.09% optimize.opt_a.merge_forward : 0.000007s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.18% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.17% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.05% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.19% optimize.opt_a.a_after_grad : 0.000019s : 0.15% optimize.opt_a.renormalize : 0.000813s : 6.26% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.20% optimize.opt_a.cse : 0.000044s : 0.34% optimize.opt_a.a_3 : 0.000106s : 0.81% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000039s : 0.30% optimize.convert_after_rewriter : 0.000007s : 0.05% optimize.order_py_execute_after_rewriter : 0.000005s : 0.04% optimize.mutable_eliminate : 0.000642s : 4.94% optimize.opt_b.b_1 : 0.000127s : 0.98% optimize.opt_b.b_2 : 0.000008s : 0.06% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000022s : 0.17% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.14% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000031s : 0.24% optimize.loop_unroll : 0.000450s : 3.46% optimize.opt_after_cconv.c_1 : 0.000031s : 0.24% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.14% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.11% optimize.tuple_transform.d_1 : 0.000046s : 0.36% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.05% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000051s : 0.39% optimize.cse_after_recomputation.cse : 0.000011s : 0.08% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.04% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.10% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.03% optimize.overlap_grad_flash_sp : 0.000021s : 0.16% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.10% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000016s : 0.12% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000488s : 3.75% validate : 0.000043s : 0.33% Time group info: ------[substitution.] 0.000237 34 14.66% : 0.000035s : 6: substitution.arithmetic_simplify 0.75% : 0.000002s : 2: substitution.elim_not_effective 0.58% : 0.000001s : 2: substitution.fold_const_symbol 2.57% : 0.000006s : 4: substitution.graph_param_transform 68.68% : 0.000163s : 4: substitution.inline 2.02% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.01% : 0.000005s : 4: substitution.remove_not_recompute_node 2.09% : 0.000005s : 4: substitution.replace_old_param 6.65% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007357 2 88.52% : 0.006513s : 1: type_inference.infer 11.48% : 0.000845s : 1: type_inference.specialize ------[replace.] 0.000063 8 64.51% : 0.000041s : 4: replace.inline 35.49% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000174 8 91.96% : 0.000160s : 4: match.inline 8.04% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000219 1278 0.91% : 0.000002s : 13: predicate.accumulaten_eliminater 0.94% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 8: predicate.addn_check_dump 0.88% : 0.000002s : 13: predicate.addn_zero_filter 0.86% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.57% : 0.000006s : 21: predicate.arithmetic_simplify 0.89% : 0.000002s : 13: predicate.cast_eliminate 0.56% : 0.000001s : 8: predicate.check_bprop_eliminate 0.54% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.64% : 0.000001s : 8: predicate.depend_value_elim 1.01% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.94% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.80% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.22% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 4: predicate.elim_not_effective 0.46% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_depend_swap 1.70% : 0.000004s : 25: predicate.environ_get_eliminate 1.23% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.41% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.45% : 0.000005s : 21: predicate.float_depend_g_call 0.58% : 0.000001s : 8: predicate.float_environ_get_switch 0.77% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.60% : 0.000001s : 8: predicate.get_grad_eliminate 0.29% : 0.000001s : 4: predicate.graph_param_transform 0.54% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.36% : 0.000014s : 58: predicate.inline 0.90% : 0.000002s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.73% : 0.000002s : 8: predicate.less_batch_normalization 1.74% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.48% : 0.000005s : 38: predicate.load_eliminater 1.22% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.23% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.69% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.49% : 0.000001s : 8: predicate.merge_addn 0.80% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.68% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 13: predicate.minmaximum_grad 1.36% : 0.000003s : 4: predicate.mutable_eliminate 0.42% : 0.000001s : 4: predicate.opt_reshape 0.46% : 0.000001s : 4: predicate.parallel_virtual_node 1.79% : 0.000004s : 21: predicate.partial_defer_inline 1.55% : 0.000003s : 21: predicate.partial_eliminate 0.95% : 0.000002s : 13: predicate.print_const_string_wrapper 0.53% : 0.000001s : 8: predicate.reduce_all_const_elim 1.17% : 0.000003s : 13: predicate.reduce_eliminate 2.41% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.52% : 0.000001s : 8: predicate.remove_not_recompute_node 1.27% : 0.000003s : 25: predicate.replace_applicator 0.48% : 0.000001s : 8: predicate.replace_old_param 0.36% : 0.000001s : 4: predicate.reset_defer_inline 0.82% : 0.000002s : 13: predicate.reshape_eliminate 0.64% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 4: predicate.row_tensor_eliminate 0.79% : 0.000002s : 8: predicate.same_eliminate 0.56% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.89% : 0.000002s : 8: predicate.shard_identity_eliminate 0.73% : 0.000002s : 8: predicate.special_op_eliminate 0.65% : 0.000001s : 8: predicate.specialize_transform 0.96% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.48% : 0.000003s : 21: predicate.switch_defer_inline 2.05% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.10% : 0.000011s : 67: predicate.switch_simplify 0.87% : 0.000002s : 13: predicate.tile_eliminate 0.87% : 0.000002s : 13: predicate.transpose_eliminate 1.44% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.48% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.76% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.64% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.37% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.83% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.69% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 8: predicate.virtual_output_eliminate 0.29% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000661 11 55.74% : 0.000369s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.26% : 0.000293s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030066 192 0.01% : 0.000003s : 1: ForceFp32Comm 14.38% : 0.004323s : 1: add_attr 14.33% : 0.004310s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.18% : 0.000055s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.23% : 0.000070s : 1: auto_monad 0.07% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.75% : 0.000526s : 1: bootstrap 0.12% : 0.000035s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000017s : 1: control_data_broadcast_order 0.03% : 0.000010s : 1: convert_after_rewriter 0.08% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.10% : 0.000030s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.53% : 0.000459s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.17% : 0.000651s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 4.24% : 0.001274s : 78: opt.transform.opt_a 0.10% : 0.000030s : 1: opt.transform.opt_after_cconv 0.09% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.34% : 0.000103s : 28: opt.transform.opt_b 0.17% : 0.000051s : 2: opt.transform.opt_trans_graph 0.12% : 0.000037s : 4: opt.transform.symbol_engine_opt 9.90% : 0.002976s : 1: opt_a 0.36% : 0.000109s : 1: opt_after_cconv 1.65% : 0.000497s : 1: opt_after_jit_grad 0.72% : 0.000218s : 1: opt_b 17.30% : 0.005201s : 1: optimize 0.07% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.08% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000045s : 1: pre_auto_parallel 0.12% : 0.000035s : 1: py_interpret_to_execute 0.05% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000018s : 1: remove_dup_value 1.49% : 0.000447s : 1: renormalize.infer 1.19% : 0.000357s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000043s : 1: rewriter_after_opt_a 0.31% : 0.000095s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.27% : 0.000082s : 1: symbol_engine_optimizer 0.27% : 0.000082s : 1: tuple_transform 24.84% : 0.007468s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:16.552.385 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:16.552.659 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0199488, [21] [bootstrap]: 0.00049159 [type_inference]: 0.00747206 [event_method]: 2.2e-05 [auto_monad]: 6.832e-05 [graph_reusing]: 5.99999e-06 [inline]: 3.09999e-06 [add_attr]: 0.00435737, [1] [add_attr_with_inline]: 0.00434521, [1] [Cycle 1]: 9.273e-05, [2] [tag_attr]: 2.262e-05 [meta_addattr_fg_expand]: 6.36e-06 [parallel-infer-symbol]: 3.59002e-06 [pre_auto_parallel]: 3.914e-05 [insert-virtual-dataset]: 2.41e-06 [parallel-infer-symbol-second]: 8.09989e-07 [dataset_repeat_opt]: 1.84e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.00611219, [53] [py_interpret_to_execute]: 3.366e-05 [rewriter_before_opt_a]: 9.796e-05 [opt_a]: 0.00345203, [2] [Cycle 1]: 0.00251706, [45] [expand_dump_flag]: 3.33e-06 [switch_simplify]: 4.292e-05 [loop_unroll]: 3.11e-05 [a_1]: 0.00067358 [with_stream_mark]: 2.015e-05 [recompute_prepare]: 9.46e-06 [updatestate_depend_eliminate]: 4.11001e-06 [updatestate_assign_eliminate]: 3.50003e-06 [updatestate_loads_eliminate]: 3.03e-06 [parameter_eliminate]: 2.04e-06 [a_2]: 0.00011896 [accelerated_algorithm]: 7.16999e-06 [shard]: 1.80001e-06 [meta_shard_fg_expand]: 2.21e-06 [shard_inline]: 6.78998e-06 [merge_send_recv]: 8.38999e-06 [auto_parallel]: 8.11002e-06 [parallel]: 1.919e-05 [flash_sp]: 8.95999e-06 [merge_comm]: 4.23999e-06 [allreduce_fusion]: 3.85e-06 [matmul_add_comm_reduction]: 1.053e-05 [allreduce_slice_to_reducescatter]: 1.07998e-06 [virtual_shard_identity]: 7.84002e-06 [virtual_dataset]: 7.15e-06 [get_grad_eliminate_]: 6.33e-06 [virtual_output]: 6.93e-06 [merge_forward]: 4.41002e-06 [cell_reuse_recompute_pass]: 1.79e-06 [offload_activation]: 1.106e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.582e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 1.213e-05 [set_forward_comm_id_for_comm_node_pass]: 4.27998e-06 [meta_fg_expand]: 3.35e-06 [flash_sp_send_recv_attached]: 2.61e-06 [receive_attached]: 2.56e-06 [after_resolve]: 1.311e-05 [a_after_grad]: 1.107e-05 [renormalize]: 0.00087479 [add_forward_monad_depend]: 7.97e-06 [auto_monad_grad]: 3.03e-06 [auto_monad_eliminator]: 1.781e-05 [cse]: 3.153e-05 [a_3]: 6.784e-05 [Cycle 2]: 0.00091859, [45] [expand_dump_flag]: 1.47001e-06 [switch_simplify]: 8.39998e-06 [loop_unroll]: 6.56e-06 [a_1]: 0.00015245 [with_stream_mark]: 1.32e-05 [recompute_prepare]: 7.55e-06 [updatestate_depend_eliminate]: 3.35e-06 [updatestate_assign_eliminate]: 3.06001e-06 [updatestate_loads_eliminate]: 2.14e-06 [parameter_eliminate]: 1.42999e-06 [a_2]: 0.00011149 [accelerated_algorithm]: 7.11001e-06 [shard]: 1.49998e-06 [meta_shard_fg_expand]: 2.14999e-06 [shard_inline]: 6.87002e-06 [merge_send_recv]: 6.14001e-06 [auto_parallel]: 6.48e-06 [parallel]: 7.81001e-06 [flash_sp]: 4.06001e-06 [merge_comm]: 3.5e-06 [allreduce_fusion]: 3.43e-06 [matmul_add_comm_reduction]: 8.65001e-06 [allreduce_slice_to_reducescatter]: 4.39992e-07 [virtual_shard_identity]: 7.2e-06 [virtual_dataset]: 7.03e-06 [get_grad_eliminate_]: 6.93e-06 [virtual_output]: 6.85998e-06 [merge_forward]: 4.08999e-06 [cell_reuse_recompute_pass]: 2.66e-06 [offload_activation]: 8.90001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.756e-05 [merge_recompute_call_nodes]: 1.22999e-06 [before_grad]: 1.132e-05 [set_forward_comm_id_for_comm_node_pass]: 3.97002e-06 [meta_fg_expand]: 2.44001e-06 [flash_sp_send_recv_attached]: 1.09e-06 [receive_attached]: 2.89999e-06 [after_resolve]: 1.187e-05 [a_after_grad]: 9.49999e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.86998e-06 [auto_monad_grad]: 1.76e-06 [auto_monad_eliminator]: 9.67001e-06 [cse]: 1.602e-05 [a_3]: 5.018e-05 [py_interpret_to_execute_after_opt_a]: 1.618e-05 [slice_cell_reuse_recomputed_activation]: 4.70999e-06 [rewriter_after_opt_a]: 4.281e-05 [convert_after_rewriter]: 9.85002e-06 [order_py_execute_after_rewriter]: 8.05e-06 [mutable_eliminate]: 0.00071563 [opt_b]: 0.0003026, [1] [Cycle 1]: 0.00029126, [7] [b_1]: 0.00018848 [b_2]: 1.016e-05 [updatestate_depend_eliminate]: 7.79002e-06 [updatestate_assign_eliminate]: 2.93e-06 [updatestate_loads_eliminate]: 2.70002e-06 [renormalize]: 5.99975e-07 [cse]: 1.802e-05 [optimize_parallel_all_gather_comm]: 2.116e-05 [overlap_param_gather]: 5.14998e-06 [cconv]: 3.366e-05 [loop_unroll]: 0.00046172 [opt_after_cconv]: 0.00013359, [1] [Cycle 1]: 0.00012344, [7] [c_1]: 3.294e-05 [parameter_eliminate]: 3.91001e-06 [updatestate_depend_eliminate]: 5.64e-06 [updatestate_assign_eliminate]: 2.53e-06 [updatestate_loads_eliminate]: 2.39999e-06 [cse]: 1.815e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 1.791e-05 [tuple_transform]: 9.364e-05, [1] [Cycle 1]: 8.596e-05, [4] [d_1]: 4.603e-05 [none_parameter_eliminate]: 1.77999e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 7.33e-06 [partial_unused_args_eliminate]: 4.55999e-06 [add_recomputation]: 5.147e-05 [cse_after_recomputation]: 2.826e-05, [1] [Cycle 1]: 2.092e-05, [1] [cse]: 1.185e-05 [environ_conv]: 8.94998e-06 [swap_dp_allreduce_reducescatter]: 7.59002e-06 [bias_add_comm_swap]: 5.59e-06 [label_micro_interleaved_index]: 7.55998e-06 [label_fine_grained_interleaved_index]: 5.25001e-06 [merge_cast_opt]: 4.18999e-06 [slice_recompute_activation]: 4.53999e-06 [micro_interleaved_order_control]: 4.51002e-06 [assign_add_opt]: 3.51999e-06 [ForceFp32Comm]: 3.35e-06 [remove_cast_before_assign_add]: 3.31999e-06 [full_micro_interleaved_order_control]: 4.84003e-06 [reorder_send_recv_between_fp_bp]: 5.31998e-06 [comm_op_add_attrs]: 3.7e-06 [add_comm_op_reuse_tag]: 3.35e-06 [interleave_split_concat_branches]: 4e-06 [interleave_parallel_branches]: 3.5e-06 [overlap_opt_shard_in_pipeline]: 3.44001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.32998e-06 [control_data_broadcast_order]: 4.425e-05 [grouped_pairwise_exchange_alltoall]: 4.67998e-06 [offloading_packed_experts]: 7.53e-06 [overlap_recompute_and_grad_model_parallel]: 7.62998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.56999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.68999e-06 [overlap_recompute_comm]: 4.53999e-06 [overlap_grad_ring_attention]: 6.67002e-06 [overlap_grad_flash_sp]: 2.458e-05 [begin_end_overlap_inline]: 3.27997e-06 [split_matmul_comm_elemetwise]: 5.13002e-06 [split_layernorm_comm]: 4.65999e-06 [handle_group_info]: 3.99002e-06 [symbol_engine_optimizer]: 0.00014244, [1] [Cycle 1]: 0.00013421, [6] [build]: 3.78001e-06 [elim_shapecalc]: 1.379e-05 [elim_not_effective]: 1.467e-05 [opt_reshape]: 7.3e-06 [fold_const_symbol]: 1.123e-05 [renormalize]: 2.80008e-07 [detach_backward]: 4.70999e-06 [pipeline_parallel_scheduler]: 2.17999e-06 [auto_monad_reorder]: 2.001e-05 [get_jit_bprop_graph]: 1.93002e-06 [rewriter_after_jit_bprop_graph]: 6.22001e-06 [opt_after_jit_grad]: 0.00065391 [validate]: 5.046e-05 Sums bootstrap : 0.000492s : 3.59% type_inference : 0.007472s : 54.54% event_method : 0.000022s : 0.16% auto_monad : 0.000068s : 0.50% graph_reusing : 0.000006s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000039s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000034s : 0.25% optimize.rewriter_before_opt_a : 0.000098s : 0.72% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000051s : 0.37% optimize.opt_a.loop_unroll : 0.000038s : 0.27% optimize.opt_a.a_1 : 0.000826s : 6.03% optimize.opt_a.with_stream_mark : 0.000033s : 0.24% optimize.opt_a.recompute_prepare : 0.000017s : 0.12% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000230s : 1.68% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.10% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.10% optimize.opt_a.merge_send_recv : 0.000015s : 0.11% optimize.opt_a.auto_parallel : 0.000015s : 0.11% optimize.opt_a.parallel : 0.000027s : 0.20% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.11% optimize.opt_a.virtual_dataset : 0.000014s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.10% optimize.opt_a.virtual_output : 0.000014s : 0.10% optimize.opt_a.merge_forward : 0.000009s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.24% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000023s : 0.17% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000025s : 0.18% optimize.opt_a.a_after_grad : 0.000021s : 0.15% optimize.opt_a.renormalize : 0.000875s : 6.39% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.20% optimize.opt_a.cse : 0.000048s : 0.35% optimize.opt_a.a_3 : 0.000118s : 0.86% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.03% optimize.rewriter_after_opt_a : 0.000043s : 0.31% optimize.convert_after_rewriter : 0.000010s : 0.07% optimize.order_py_execute_after_rewriter : 0.000008s : 0.06% optimize.mutable_eliminate : 0.000716s : 5.22% optimize.opt_b.b_1 : 0.000188s : 1.38% optimize.opt_b.b_2 : 0.000010s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000018s : 0.13% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.15% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000034s : 0.25% optimize.loop_unroll : 0.000462s : 3.37% optimize.opt_after_cconv.c_1 : 0.000033s : 0.24% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.13% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.13% optimize.tuple_transform.d_1 : 0.000046s : 0.34% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.05% optimize.partial_unused_args_eliminate : 0.000005s : 0.03% optimize.add_recomputation : 0.000051s : 0.38% optimize.cse_after_recomputation.cse : 0.000012s : 0.09% optimize.environ_conv : 0.000009s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.06% optimize.bias_add_comm_swap : 0.000006s : 0.04% optimize.label_micro_interleaved_index : 0.000008s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.03% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000003s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000044s : 0.32% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.03% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000025s : 0.18% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000005s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.05% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.15% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000654s : 4.77% validate : 0.000050s : 0.37% Time group info: ------[substitution.] 0.000238 34 14.60% : 0.000035s : 6: substitution.arithmetic_simplify 0.87% : 0.000002s : 2: substitution.elim_not_effective 0.57% : 0.000001s : 2: substitution.fold_const_symbol 2.44% : 0.000006s : 4: substitution.graph_param_transform 67.50% : 0.000160s : 4: substitution.inline 1.86% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.12% : 0.000007s : 4: substitution.remove_not_recompute_node 2.39% : 0.000006s : 4: substitution.replace_old_param 6.65% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007403 2 88.52% : 0.006553s : 1: type_inference.infer 11.48% : 0.000850s : 1: type_inference.specialize ------[replace.] 0.000065 8 62.18% : 0.000040s : 4: replace.inline 37.82% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000172 8 91.85% : 0.000158s : 4: match.inline 8.15% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000223 1278 0.83% : 0.000002s : 13: predicate.accumulaten_eliminater 1.02% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 0.97% : 0.000002s : 13: predicate.addn_zero_filter 0.76% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.54% : 0.000006s : 21: predicate.arithmetic_simplify 0.91% : 0.000002s : 13: predicate.cast_eliminate 0.59% : 0.000001s : 8: predicate.check_bprop_eliminate 0.47% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000000s : 4: predicate.const_output_eliminate 0.70% : 0.000002s : 8: predicate.depend_value_elim 0.92% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.93% : 0.000002s : 13: predicate.dict_get_item_eliminator 1.06% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.05% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.64% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_depend_swap 1.74% : 0.000004s : 25: predicate.environ_get_eliminate 1.15% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.30% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.51% : 0.000006s : 21: predicate.float_depend_g_call 0.63% : 0.000001s : 8: predicate.float_environ_get_switch 0.78% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.80% : 0.000002s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 6.15% : 0.000014s : 58: predicate.inline 0.69% : 0.000002s : 8: predicate.inline_without_move 0.39% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.86% : 0.000002s : 8: predicate.less_batch_normalization 1.79% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.54% : 0.000006s : 38: predicate.load_eliminater 0.89% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.22% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.68% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.73% : 0.000002s : 8: predicate.merge_addn 0.58% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.76% : 0.000002s : 13: predicate.minmaximum_grad 1.34% : 0.000003s : 4: predicate.mutable_eliminate 0.34% : 0.000001s : 4: predicate.opt_reshape 0.38% : 0.000001s : 4: predicate.parallel_virtual_node 1.77% : 0.000004s : 21: predicate.partial_defer_inline 1.51% : 0.000003s : 21: predicate.partial_eliminate 0.86% : 0.000002s : 13: predicate.print_const_string_wrapper 0.55% : 0.000001s : 8: predicate.reduce_all_const_elim 1.10% : 0.000002s : 13: predicate.reduce_eliminate 2.61% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 8: predicate.remove_not_recompute_node 1.42% : 0.000003s : 25: predicate.replace_applicator 0.42% : 0.000001s : 8: predicate.replace_old_param 0.35% : 0.000001s : 4: predicate.reset_defer_inline 0.93% : 0.000002s : 13: predicate.reshape_eliminate 0.65% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 4: predicate.row_tensor_eliminate 0.79% : 0.000002s : 8: predicate.same_eliminate 0.43% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 8: predicate.shard_identity_eliminate 0.86% : 0.000002s : 8: predicate.special_op_eliminate 0.69% : 0.000002s : 8: predicate.specialize_transform 0.88% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.94% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.41% : 0.000003s : 21: predicate.switch_defer_inline 1.90% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.80% : 0.000011s : 67: predicate.switch_simplify 0.82% : 0.000002s : 13: predicate.tile_eliminate 0.84% : 0.000002s : 13: predicate.transpose_eliminate 1.56% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.38% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.64% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.60% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.17% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.64% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.36% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.21% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.89% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.66% : 0.000001s : 8: predicate.virtual_output_eliminate 0.29% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000674 11 53.12% : 0.000358s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.88% : 0.000316s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.032730 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.35% : 0.004369s : 1: add_attr 13.29% : 0.004350s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.17% : 0.000055s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.24% : 0.000078s : 1: auto_monad 0.09% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.65% : 0.000540s : 1: bootstrap 0.11% : 0.000037s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.15% : 0.000048s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.10% : 0.000032s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000021s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.10% : 0.000033s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.43% : 0.000468s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 2.21% : 0.000722s : 1: mutable_eliminate 0.03% : 0.000011s : 1: offloading_packed_experts 0.04% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000019s : 1: opt.transform.mutable_eliminate 3.93% : 0.001286s : 78: opt.transform.opt_a 0.10% : 0.000032s : 1: opt.transform.opt_after_cconv 0.10% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.36% : 0.000117s : 28: opt.transform.opt_b 0.15% : 0.000051s : 2: opt.transform.opt_trans_graph 0.13% : 0.000043s : 4: opt.transform.symbol_engine_opt 10.56% : 0.003456s : 1: opt_a 0.42% : 0.000138s : 1: opt_after_cconv 2.04% : 0.000667s : 1: opt_after_jit_grad 0.94% : 0.000306s : 1: opt_b 19.72% : 0.006456s : 1: optimize 0.07% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000007s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000046s : 1: pre_auto_parallel 0.11% : 0.000037s : 1: py_interpret_to_execute 0.06% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000021s : 1: remove_dup_value 1.46% : 0.000478s : 1: renormalize.infer 1.18% : 0.000388s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000047s : 1: rewriter_after_opt_a 0.31% : 0.000102s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000008s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.03% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.45% : 0.000146s : 1: symbol_engine_optimizer 0.30% : 0.000097s : 1: tuple_transform 23.00% : 0.007528s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:17.687.085 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0367707, [21] [bootstrap]: 0.00046996 [type_inference]: 0.00650374 [event_method]: 1.979e-05 [auto_monad]: 6.595e-05 [graph_reusing]: 6.31e-06 [inline]: 2.34999e-06 [add_attr]: 0.0238195, [1] [add_attr_with_inline]: 0.0238033, [1] [Cycle 1]: 8.047e-05, [2] [tag_attr]: 2.501e-05 [meta_addattr_fg_expand]: 5.84e-06 [parallel-infer-symbol]: 3.67002e-06 [pre_auto_parallel]: 4.382e-05 [insert-virtual-dataset]: 2.56998e-06 [parallel-infer-symbol-second]: 8.09989e-07 [dataset_repeat_opt]: 2.19999e-06 [pipeline_split]: 1.69e-06 [optimize]: 0.00512272, [53] [py_interpret_to_execute]: 3.643e-05 [rewriter_before_opt_a]: 9.58e-05 [opt_a]: 0.00287927, [2] [Cycle 1]: 0.00220338, [45] [expand_dump_flag]: 3.16001e-06 [switch_simplify]: 4.318e-05 [loop_unroll]: 3.09e-05 [a_1]: 0.00068916 [with_stream_mark]: 1.775e-05 [recompute_prepare]: 9.69999e-06 [updatestate_depend_eliminate]: 4.31002e-06 [updatestate_assign_eliminate]: 3.23998e-06 [updatestate_loads_eliminate]: 2.81e-06 [parameter_eliminate]: 2.19001e-06 [a_2]: 8.931e-05 [accelerated_algorithm]: 7.75e-06 [shard]: 1.72001e-06 [meta_shard_fg_expand]: 1.86e-06 [shard_inline]: 6.93e-06 [merge_send_recv]: 8.15999e-06 [auto_parallel]: 7.06999e-06 [parallel]: 1.956e-05 [flash_sp]: 9.64e-06 [merge_comm]: 3.94002e-06 [allreduce_fusion]: 3.83001e-06 [matmul_add_comm_reduction]: 9.49e-06 [allreduce_slice_to_reducescatter]: 8.50006e-07 [virtual_shard_identity]: 8.86997e-06 [virtual_dataset]: 6.85998e-06 [get_grad_eliminate_]: 6.16e-06 [virtual_output]: 6.23e-06 [merge_forward]: 3.75e-06 [cell_reuse_recompute_pass]: 1.24998e-06 [offload_activation]: 1.009e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.172e-05 [merge_recompute_call_nodes]: 2.02999e-06 [before_grad]: 1.036e-05 [set_forward_comm_id_for_comm_node_pass]: 3.61001e-06 [meta_fg_expand]: 2.88998e-06 [flash_sp_send_recv_attached]: 2.93e-06 [receive_attached]: 2.37999e-06 [after_resolve]: 1.209e-05 [a_after_grad]: 1.013e-05 [renormalize]: 0.00078054 [add_forward_monad_depend]: 6.06e-06 [auto_monad_grad]: 2.41e-06 [auto_monad_eliminator]: 1.552e-05 [cse]: 3.047e-05 [a_3]: 4.904e-05 [Cycle 2]: 0.00066559, [45] [expand_dump_flag]: 9.40025e-07 [switch_simplify]: 7.71999e-06 [loop_unroll]: 6.41e-06 [a_1]: 0.00014169 [with_stream_mark]: 1.295e-05 [recompute_prepare]: 6.78e-06 [updatestate_depend_eliminate]: 3.25e-06 [updatestate_assign_eliminate]: 2.47001e-06 [updatestate_loads_eliminate]: 2.41e-06 [parameter_eliminate]: 1.25001e-06 [a_2]: 7.772e-05 [accelerated_algorithm]: 6.63998e-06 [shard]: 1.14e-06 [meta_shard_fg_expand]: 1.76998e-06 [shard_inline]: 6.02001e-06 [merge_send_recv]: 5.16998e-06 [auto_parallel]: 5.22e-06 [parallel]: 4.35999e-06 [flash_sp]: 3.16999e-06 [merge_comm]: 6.17999e-06 [allreduce_fusion]: 3.29001e-06 [matmul_add_comm_reduction]: 6.23e-06 [allreduce_slice_to_reducescatter]: 4.09986e-07 [virtual_shard_identity]: 6.68e-06 [virtual_dataset]: 6.75002e-06 [get_grad_eliminate_]: 5.89e-06 [virtual_output]: 5.81e-06 [merge_forward]: 2.82002e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 6.72002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.07e-05 [merge_recompute_call_nodes]: 7.7e-07 [before_grad]: 8.87e-06 [set_forward_comm_id_for_comm_node_pass]: 4.63999e-06 [meta_fg_expand]: 1.82999e-06 [flash_sp_send_recv_attached]: 8.00006e-07 [receive_attached]: 1.14e-06 [after_resolve]: 1.149e-05 [a_after_grad]: 9.72001e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.22e-06 [auto_monad_grad]: 9.39996e-07 [auto_monad_eliminator]: 7.5e-06 [cse]: 1.384e-05 [a_3]: 3.613e-05 [py_interpret_to_execute_after_opt_a]: 9.14998e-06 [slice_cell_reuse_recomputed_activation]: 1.87999e-06 [rewriter_after_opt_a]: 3.32e-05 [convert_after_rewriter]: 6.51e-06 [order_py_execute_after_rewriter]: 4.94003e-06 [mutable_eliminate]: 0.0007191 [opt_b]: 0.00020293, [1] [Cycle 1]: 0.00019602, [7] [b_1]: 0.00012277 [b_2]: 8.46002e-06 [updatestate_depend_eliminate]: 5.47999e-06 [updatestate_assign_eliminate]: 2.26e-06 [updatestate_loads_eliminate]: 2.43e-06 [renormalize]: 6.50005e-07 [cse]: 1.917e-05 [optimize_parallel_all_gather_comm]: 1.66e-05 [overlap_param_gather]: 1.92999e-06 [cconv]: 2.687e-05 [loop_unroll]: 0.00042966 [opt_after_cconv]: 0.00010225, [1] [Cycle 1]: 9.657e-05, [7] [c_1]: 3.14e-05 [parameter_eliminate]: 2.81999e-06 [updatestate_depend_eliminate]: 5.64998e-06 [updatestate_assign_eliminate]: 2.44001e-06 [updatestate_loads_eliminate]: 2.32999e-06 [cse]: 1.706e-05 [renormalize]: 5.00004e-07 [remove_dup_value]: 1.257e-05 [tuple_transform]: 7.759e-05, [1] [Cycle 1]: 7.339e-05, [4] [d_1]: 4.545e-05 [none_parameter_eliminate]: 1.66e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 7.23999e-06 [partial_unused_args_eliminate]: 2.29001e-06 [add_recomputation]: 4.966e-05 [cse_after_recomputation]: 2.159e-05, [1] [Cycle 1]: 1.67e-05, [1] [cse]: 1.113e-05 [environ_conv]: 5.39e-06 [swap_dp_allreduce_reducescatter]: 5.22999e-06 [bias_add_comm_swap]: 3.06001e-06 [label_micro_interleaved_index]: 4.23999e-06 [label_fine_grained_interleaved_index]: 2.84001e-06 [merge_cast_opt]: 1.25999e-06 [slice_recompute_activation]: 2.04999e-06 [micro_interleaved_order_control]: 2.12001e-06 [assign_add_opt]: 1.08001e-06 [ForceFp32Comm]: 1.11002e-06 [remove_cast_before_assign_add]: 1.00001e-06 [full_micro_interleaved_order_control]: 1.96e-06 [reorder_send_recv_between_fp_bp]: 2.69999e-06 [comm_op_add_attrs]: 9.80013e-07 [add_comm_op_reuse_tag]: 9.49978e-07 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.19998e-06 [overlap_opt_shard_in_pipeline]: 1.40999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.01e-06 [control_data_broadcast_order]: 1.329e-05 [grouped_pairwise_exchange_alltoall]: 1.55999e-06 [offloading_packed_experts]: 3.48999e-06 [overlap_recompute_and_grad_model_parallel]: 4.47e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32e-06 [overlap_recompute_comm]: 2.46e-06 [overlap_grad_ring_attention]: 3.93999e-06 [overlap_grad_flash_sp]: 2.07e-05 [begin_end_overlap_inline]: 5.89993e-07 [split_matmul_comm_elemetwise]: 2.17999e-06 [split_layernorm_comm]: 1.86e-06 [handle_group_info]: 9.10019e-07 [symbol_engine_optimizer]: 7.62e-05, [1] [Cycle 1]: 7.146e-05, [6] [build]: 3.46999e-06 [elim_shapecalc]: 9.91e-06 [elim_not_effective]: 1.315e-05 [opt_reshape]: 6.64999e-06 [fold_const_symbol]: 9.87999e-06 [renormalize]: 1.70025e-07 [detach_backward]: 2.42001e-06 [pipeline_parallel_scheduler]: 1.47001e-06 [auto_monad_reorder]: 1.625e-05 [get_jit_bprop_graph]: 1.94e-06 [rewriter_after_jit_bprop_graph]: 4.07e-06 [opt_after_jit_grad]: 0.00047239 [validate]: 4.184e-05 Sums bootstrap : 0.000470s : 3.92% type_inference : 0.006504s : 54.23% event_method : 0.000020s : 0.17% auto_monad : 0.000066s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.21% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000044s : 0.37% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000036s : 0.30% optimize.rewriter_before_opt_a : 0.000096s : 0.80% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000051s : 0.42% optimize.opt_a.loop_unroll : 0.000037s : 0.31% optimize.opt_a.a_1 : 0.000831s : 6.93% optimize.opt_a.with_stream_mark : 0.000031s : 0.26% optimize.opt_a.recompute_prepare : 0.000016s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000167s : 1.39% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.12% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.11% optimize.opt_a.merge_send_recv : 0.000013s : 0.11% optimize.opt_a.auto_parallel : 0.000012s : 0.10% optimize.opt_a.parallel : 0.000024s : 0.20% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.13% optimize.opt_a.virtual_dataset : 0.000014s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.10% optimize.opt_a.virtual_output : 0.000012s : 0.10% optimize.opt_a.merge_forward : 0.000007s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000022s : 0.19% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000019s : 0.16% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.20% optimize.opt_a.a_after_grad : 0.000020s : 0.17% optimize.opt_a.renormalize : 0.000781s : 6.51% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.19% optimize.opt_a.cse : 0.000044s : 0.37% optimize.opt_a.a_3 : 0.000085s : 0.71% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000033s : 0.28% optimize.convert_after_rewriter : 0.000007s : 0.05% optimize.order_py_execute_after_rewriter : 0.000005s : 0.04% optimize.mutable_eliminate : 0.000719s : 6.00% optimize.opt_b.b_1 : 0.000123s : 1.02% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000019s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.14% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000027s : 0.22% optimize.loop_unroll : 0.000430s : 3.58% optimize.opt_after_cconv.c_1 : 0.000031s : 0.26% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.14% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000013s : 0.10% optimize.tuple_transform.d_1 : 0.000045s : 0.38% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000050s : 0.41% optimize.cse_after_recomputation.cse : 0.000011s : 0.09% optimize.environ_conv : 0.000005s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.04% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000003s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.03% optimize.overlap_grad_flash_sp : 0.000021s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000016s : 0.14% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000472s : 3.94% validate : 0.000042s : 0.35% Time group info: ------[substitution.] 0.000243 34 13.97% : 0.000034s : 6: substitution.arithmetic_simplify 0.83% : 0.000002s : 2: substitution.elim_not_effective 0.55% : 0.000001s : 2: substitution.fold_const_symbol 2.68% : 0.000007s : 4: substitution.graph_param_transform 70.47% : 0.000171s : 4: substitution.inline 1.53% : 0.000004s : 4: substitution.j_node_and_user_rematch 1.85% : 0.000004s : 4: substitution.remove_not_recompute_node 1.65% : 0.000004s : 4: substitution.replace_old_param 6.48% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006430 2 87.56% : 0.005630s : 1: type_inference.infer 12.44% : 0.000800s : 1: type_inference.specialize ------[replace.] 0.000066 8 63.99% : 0.000042s : 4: replace.inline 36.01% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000183 8 92.38% : 0.000169s : 4: match.inline 7.62% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000211 1278 0.98% : 0.000002s : 13: predicate.accumulaten_eliminater 0.70% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.61% : 0.000001s : 8: predicate.addn_check_dump 0.93% : 0.000002s : 13: predicate.addn_zero_filter 0.79% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.33% : 0.000005s : 21: predicate.arithmetic_simplify 1.00% : 0.000002s : 13: predicate.cast_eliminate 0.72% : 0.000002s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.64% : 0.000001s : 8: predicate.depend_value_elim 0.94% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.14% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.92% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.99% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.39% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_depend_swap 1.78% : 0.000004s : 25: predicate.environ_get_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.60% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.31% : 0.000005s : 21: predicate.float_depend_g_call 0.52% : 0.000001s : 8: predicate.float_environ_get_switch 0.86% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.73% : 0.000002s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.47% : 0.000001s : 8: predicate.incorporate_call_switch 6.40% : 0.000013s : 58: predicate.inline 0.68% : 0.000001s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.97% : 0.000002s : 8: predicate.less_batch_normalization 1.74% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.45% : 0.000005s : 38: predicate.load_eliminater 0.92% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.31% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.58% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 13: predicate.minmaximum_grad 1.27% : 0.000003s : 4: predicate.mutable_eliminate 0.32% : 0.000001s : 4: predicate.opt_reshape 0.38% : 0.000001s : 4: predicate.parallel_virtual_node 1.86% : 0.000004s : 21: predicate.partial_defer_inline 1.62% : 0.000003s : 21: predicate.partial_eliminate 0.93% : 0.000002s : 13: predicate.print_const_string_wrapper 0.66% : 0.000001s : 8: predicate.reduce_all_const_elim 1.16% : 0.000002s : 13: predicate.reduce_eliminate 2.59% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 8: predicate.remove_not_recompute_node 1.37% : 0.000003s : 25: predicate.replace_applicator 0.61% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 0.92% : 0.000002s : 13: predicate.reshape_eliminate 0.60% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 4: predicate.row_tensor_eliminate 0.75% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 8: predicate.shard_identity_eliminate 0.70% : 0.000001s : 8: predicate.special_op_eliminate 0.67% : 0.000001s : 8: predicate.specialize_transform 1.03% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.49% : 0.000003s : 21: predicate.switch_defer_inline 2.06% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.04% : 0.000011s : 67: predicate.switch_simplify 0.96% : 0.000002s : 13: predicate.tile_eliminate 0.91% : 0.000002s : 13: predicate.transpose_eliminate 1.52% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.49% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.28% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.49% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.43% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.72% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.38% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.98% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.91% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000617 11 50.06% : 0.000309s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.94% : 0.000308s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.067885 192 0.01% : 0.000004s : 1: ForceFp32Comm 35.10% : 0.023827s : 1: add_attr 35.07% : 0.023808s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.08% : 0.000054s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.11% : 0.000072s : 1: auto_monad 0.03% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.74% : 0.000501s : 1: bootstrap 0.04% : 0.000030s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000017s : 1: control_data_broadcast_order 0.01% : 0.000010s : 1: convert_after_rewriter 0.04% : 0.000024s : 1: cse_after_recomputation 0.01% : 0.000007s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.04% : 0.000027s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.64% : 0.000437s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.07% : 0.000728s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000016s : 1: opt.transform.mutable_eliminate 1.86% : 0.001260s : 78: opt.transform.opt_a 0.04% : 0.000030s : 1: opt.transform.opt_after_cconv 0.04% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.15% : 0.000100s : 28: opt.transform.opt_b 0.07% : 0.000050s : 2: opt.transform.opt_trans_graph 0.05% : 0.000036s : 4: opt.transform.symbol_engine_opt 4.25% : 0.002882s : 1: opt_a 0.16% : 0.000106s : 1: opt_after_cconv 0.71% : 0.000481s : 1: opt_after_jit_grad 0.30% : 0.000206s : 1: opt_b 7.55% : 0.005128s : 1: optimize 0.03% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.04% : 0.000024s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000006s : 1: partial_unused_args_eliminate 0.01% : 0.000004s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000048s : 1: pre_auto_parallel 0.06% : 0.000040s : 1: py_interpret_to_execute 0.02% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000003s : 1: remove_cast_before_assign_add 0.02% : 0.000016s : 1: remove_dup_value 0.63% : 0.000426s : 1: renormalize.infer 0.51% : 0.000345s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000038s : 1: rewriter_after_opt_a 0.15% : 0.000100s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.12% : 0.000079s : 1: symbol_engine_optimizer 0.12% : 0.000081s : 1: tuple_transform 9.61% : 0.006526s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:18.799.649 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:18.799.923 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.115177, [21] [bootstrap]: 0.00042612 [type_inference]: 0.0185348 [event_method]: 2.275e-05 [auto_monad]: 6.881e-05 [graph_reusing]: 6.64001e-06 [inline]: 2.43998e-06 [add_attr]: 0.0159167, [1] [add_attr_with_inline]: 0.0159023, [1] [Cycle 1]: 0.00010174, [2] [tag_attr]: 2.391e-05 [meta_addattr_fg_expand]: 5.92999e-06 [parallel-infer-symbol]: 3.86001e-06 [pre_auto_parallel]: 4.334e-05 [insert-virtual-dataset]: 2.37001e-06 [parallel-infer-symbol-second]: 9.20001e-07 [dataset_repeat_opt]: 1.91998e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.0788342, [53] [py_interpret_to_execute]: 3.746e-05 [rewriter_before_opt_a]: 0.00010188 [opt_a]: 0.0761785, [2] [Cycle 1]: 0.0752037, [45] [expand_dump_flag]: 3.34001e-06 [switch_simplify]: 4.624e-05 [loop_unroll]: 3.024e-05 [a_1]: 0.00070918 [with_stream_mark]: 2.606e-05 [recompute_prepare]: 1.358e-05 [updatestate_depend_eliminate]: 4.53001e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 3.04999e-06 [parameter_eliminate]: 2.19999e-06 [a_2]: 0.00012486 [accelerated_algorithm]: 8.88002e-06 [shard]: 1.89e-06 [meta_shard_fg_expand]: 2.13002e-06 [shard_inline]: 9.71e-06 [merge_send_recv]: 9.64999e-06 [auto_parallel]: 8.46002e-06 [parallel]: 1.934e-05 [flash_sp]: 1.03e-05 [merge_comm]: 4.08999e-06 [allreduce_fusion]: 3.3e-06 [matmul_add_comm_reduction]: 1.033e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 9.48002e-06 [virtual_dataset]: 7.23e-06 [get_grad_eliminate_]: 7e-06 [virtual_output]: 6.76999e-06 [merge_forward]: 4.68001e-06 [cell_reuse_recompute_pass]: 1.54e-06 [offload_activation]: 1.207e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.676e-05 [merge_recompute_call_nodes]: 1.74998e-06 [before_grad]: 1.158e-05 [set_forward_comm_id_for_comm_node_pass]: 4e-06 [meta_fg_expand]: 2.96999e-06 [flash_sp_send_recv_attached]: 3.83001e-06 [receive_attached]: 2.27999e-06 [after_resolve]: 1.354e-05 [a_after_grad]: 1.073e-05 [renormalize]: 0.0734168 [add_forward_monad_depend]: 1.191e-05 [auto_monad_grad]: 2.78998e-06 [auto_monad_eliminator]: 2.633e-05 [cse]: 3.206e-05 [a_3]: 8.094e-05 [Cycle 2]: 0.00095569, [45] [expand_dump_flag]: 1.99999e-06 [switch_simplify]: 1.034e-05 [loop_unroll]: 7.4e-06 [a_1]: 0.00016534 [with_stream_mark]: 1.995e-05 [recompute_prepare]: 7.98999e-06 [updatestate_depend_eliminate]: 3.65e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 2.88e-06 [parameter_eliminate]: 2.16e-06 [a_2]: 0.0001095 [accelerated_algorithm]: 7.48e-06 [shard]: 2.68e-06 [meta_shard_fg_expand]: 2.20002e-06 [shard_inline]: 6.81001e-06 [merge_send_recv]: 8.75999e-06 [auto_parallel]: 1.05e-05 [parallel]: 9.29998e-06 [flash_sp]: 4.52998e-06 [merge_comm]: 4.18001e-06 [allreduce_fusion]: 6.28998e-06 [matmul_add_comm_reduction]: 1.088e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 9.20999e-06 [virtual_dataset]: 6.74999e-06 [get_grad_eliminate_]: 6.22001e-06 [virtual_output]: 6.31e-06 [merge_forward]: 4.50001e-06 [cell_reuse_recompute_pass]: 3.45e-06 [offload_activation]: 1.183e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.563e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.187e-05 [set_forward_comm_id_for_comm_node_pass]: 4.87998e-06 [meta_fg_expand]: 2.93e-06 [flash_sp_send_recv_attached]: 2.01e-06 [receive_attached]: 2.82002e-06 [after_resolve]: 1.334e-05 [a_after_grad]: 1.016e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.74e-06 [auto_monad_grad]: 1.77999e-06 [auto_monad_eliminator]: 8.82e-06 [cse]: 1.642e-05 [a_3]: 5.008e-05 [py_interpret_to_execute_after_opt_a]: 2.345e-05 [slice_cell_reuse_recomputed_activation]: 5.12999e-06 [rewriter_after_opt_a]: 4.454e-05 [convert_after_rewriter]: 1.028e-05 [order_py_execute_after_rewriter]: 8.47998e-06 [mutable_eliminate]: 0.00075075 [opt_b]: 0.00028256, [1] [Cycle 1]: 0.00027152, [7] [b_1]: 0.00016838 [b_2]: 8.49998e-06 [updatestate_depend_eliminate]: 8.59e-06 [updatestate_assign_eliminate]: 2.94999e-06 [updatestate_loads_eliminate]: 2.58998e-06 [renormalize]: 7.7e-07 [cse]: 2.211e-05 [optimize_parallel_all_gather_comm]: 2.239e-05 [overlap_param_gather]: 4.4e-06 [cconv]: 3.668e-05 [loop_unroll]: 0.00046553 [opt_after_cconv]: 0.00013185, [1] [Cycle 1]: 0.00012277, [7] [c_1]: 3.18e-05 [parameter_eliminate]: 5.25999e-06 [updatestate_depend_eliminate]: 5.73002e-06 [updatestate_assign_eliminate]: 2.68e-06 [updatestate_loads_eliminate]: 2.48e-06 [cse]: 1.866e-05 [renormalize]: 3.39991e-07 [remove_dup_value]: 1.616e-05 [tuple_transform]: 9.362e-05, [1] [Cycle 1]: 8.591e-05, [4] [d_1]: 4.638e-05 [none_parameter_eliminate]: 1.94e-06 [renormalize]: 1.19995e-07 [switch_simplify]: 7.60998e-06 [partial_unused_args_eliminate]: 4.79e-06 [add_recomputation]: 5.359e-05 [cse_after_recomputation]: 2.829e-05, [1] [Cycle 1]: 2.128e-05, [1] [cse]: 1.139e-05 [environ_conv]: 8.73001e-06 [swap_dp_allreduce_reducescatter]: 8.15e-06 [bias_add_comm_swap]: 5.58002e-06 [label_micro_interleaved_index]: 7.46001e-06 [label_fine_grained_interleaved_index]: 5.22999e-06 [merge_cast_opt]: 3.6e-06 [slice_recompute_activation]: 4.95001e-06 [micro_interleaved_order_control]: 6.43e-06 [assign_add_opt]: 3.68e-06 [ForceFp32Comm]: 3.26001e-06 [remove_cast_before_assign_add]: 3.61999e-06 [full_micro_interleaved_order_control]: 4.41002e-06 [reorder_send_recv_between_fp_bp]: 5.92999e-06 [comm_op_add_attrs]: 3.70998e-06 [add_comm_op_reuse_tag]: 3.26001e-06 [interleave_split_concat_branches]: 3.68999e-06 [interleave_parallel_branches]: 3.53999e-06 [overlap_opt_shard_in_pipeline]: 3.64002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.72998e-06 [control_data_broadcast_order]: 1.73e-05 [grouped_pairwise_exchange_alltoall]: 4.46002e-06 [offloading_packed_experts]: 6.09999e-06 [overlap_recompute_and_grad_model_parallel]: 8.03001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.6e-06 [overlap_recompute_allgather_and_fa_grad]: 3.68e-06 [overlap_recompute_comm]: 5.35999e-06 [overlap_grad_ring_attention]: 6.70002e-06 [overlap_grad_flash_sp]: 2.485e-05 [begin_end_overlap_inline]: 2.96001e-06 [split_matmul_comm_elemetwise]: 5.18002e-06 [split_layernorm_comm]: 3.90998e-06 [handle_group_info]: 3.54002e-06 [symbol_engine_optimizer]: 0.00010542, [1] [Cycle 1]: 9.757e-05, [6] [build]: 4.36002e-06 [elim_shapecalc]: 1.206e-05 [elim_not_effective]: 1.432e-05 [opt_reshape]: 7.63001e-06 [fold_const_symbol]: 1.039e-05 [renormalize]: 2.00002e-07 [detach_backward]: 4.47e-06 [pipeline_parallel_scheduler]: 1.57001e-06 [auto_monad_reorder]: 2.115e-05 [get_jit_bprop_graph]: 2.27001e-06 [rewriter_after_jit_bprop_graph]: 6.66999e-06 [opt_after_jit_grad]: 0.00059357 [validate]: 4.468e-05 Sums bootstrap : 0.000426s : 0.44% type_inference : 0.018535s : 19.04% event_method : 0.000023s : 0.02% auto_monad : 0.000069s : 0.07% graph_reusing : 0.000007s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000043s : 0.04% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000037s : 0.04% optimize.rewriter_before_opt_a : 0.000102s : 0.10% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000057s : 0.06% optimize.opt_a.loop_unroll : 0.000038s : 0.04% optimize.opt_a.a_1 : 0.000875s : 0.90% optimize.opt_a.with_stream_mark : 0.000046s : 0.05% optimize.opt_a.recompute_prepare : 0.000022s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000234s : 0.24% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.02% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000017s : 0.02% optimize.opt_a.merge_send_recv : 0.000018s : 0.02% optimize.opt_a.auto_parallel : 0.000019s : 0.02% optimize.opt_a.parallel : 0.000029s : 0.03% optimize.opt_a.flash_sp : 0.000015s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000010s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.02% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000024s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000027s : 0.03% optimize.opt_a.a_after_grad : 0.000021s : 0.02% optimize.opt_a.renormalize : 0.073417s : 75.43% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.04% optimize.opt_a.cse : 0.000048s : 0.05% optimize.opt_a.a_3 : 0.000131s : 0.13% optimize.py_interpret_to_execute_after_opt_a : 0.000023s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000045s : 0.05% optimize.convert_after_rewriter : 0.000010s : 0.01% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000751s : 0.77% optimize.opt_b.b_1 : 0.000168s : 0.17% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.02% optimize.overlap_param_gather : 0.000004s : 0.00% optimize.cconv : 0.000037s : 0.04% optimize.loop_unroll : 0.000466s : 0.48% optimize.opt_after_cconv.c_1 : 0.000032s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.02% optimize.tuple_transform.d_1 : 0.000046s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000054s : 0.06% optimize.cse_after_recomputation.cse : 0.000011s : 0.01% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000006s : 0.01% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000004s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000025s : 0.03% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000594s : 0.61% validate : 0.000045s : 0.05% Time group info: ------[substitution.] 0.000269 34 16.59% : 0.000045s : 6: substitution.arithmetic_simplify 0.70% : 0.000002s : 2: substitution.elim_not_effective 0.53% : 0.000001s : 2: substitution.fold_const_symbol 2.48% : 0.000007s : 4: substitution.graph_param_transform 67.07% : 0.000181s : 4: substitution.inline 2.14% : 0.000006s : 4: substitution.j_node_and_user_rematch 2.09% : 0.000006s : 4: substitution.remove_not_recompute_node 2.67% : 0.000007s : 4: substitution.replace_old_param 5.74% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.018465 2 94.96% : 0.017534s : 1: type_inference.infer 5.04% : 0.000931s : 1: type_inference.specialize ------[replace.] 0.000068 8 61.22% : 0.000041s : 4: replace.inline 38.78% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000191 8 92.84% : 0.000177s : 4: match.inline 7.16% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000232 1278 0.92% : 0.000002s : 13: predicate.accumulaten_eliminater 0.74% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 8: predicate.addn_check_dump 0.87% : 0.000002s : 13: predicate.addn_zero_filter 0.79% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.73% : 0.000006s : 21: predicate.arithmetic_simplify 0.93% : 0.000002s : 13: predicate.cast_eliminate 0.56% : 0.000001s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.91% : 0.000002s : 8: predicate.depend_value_elim 0.86% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.44% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.08% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.41% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_depend_swap 1.53% : 0.000004s : 25: predicate.environ_get_eliminate 1.12% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.30% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.55% : 0.000006s : 21: predicate.float_depend_g_call 0.59% : 0.000001s : 8: predicate.float_environ_get_switch 0.84% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.60% : 0.000001s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.53% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.25% : 0.000015s : 58: predicate.inline 0.84% : 0.000002s : 8: predicate.inline_without_move 0.38% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.14% : 0.000003s : 8: predicate.less_batch_normalization 1.79% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.33% : 0.000005s : 38: predicate.load_eliminater 0.81% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.23% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.96% : 0.000005s : 21: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 8: predicate.merge_addn 0.71% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.49% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 13: predicate.minmaximum_grad 1.17% : 0.000003s : 4: predicate.mutable_eliminate 0.31% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 1.79% : 0.000004s : 21: predicate.partial_defer_inline 1.46% : 0.000003s : 21: predicate.partial_eliminate 0.86% : 0.000002s : 13: predicate.print_const_string_wrapper 0.59% : 0.000001s : 8: predicate.reduce_all_const_elim 1.17% : 0.000003s : 13: predicate.reduce_eliminate 2.35% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 8: predicate.remove_not_recompute_node 1.32% : 0.000003s : 25: predicate.replace_applicator 0.59% : 0.000001s : 8: predicate.replace_old_param 0.40% : 0.000001s : 4: predicate.reset_defer_inline 1.16% : 0.000003s : 13: predicate.reshape_eliminate 0.62% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 4: predicate.row_tensor_eliminate 0.84% : 0.000002s : 8: predicate.same_eliminate 0.51% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.03% : 0.000002s : 8: predicate.shard_identity_eliminate 0.65% : 0.000001s : 8: predicate.special_op_eliminate 0.71% : 0.000002s : 8: predicate.specialize_transform 1.19% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.37% : 0.000003s : 21: predicate.switch_defer_inline 1.97% : 0.000005s : 29: predicate.switch_layer_defer_inline 5.23% : 0.000012s : 67: predicate.switch_simplify 0.86% : 0.000002s : 13: predicate.tile_eliminate 1.09% : 0.000003s : 13: predicate.transpose_eliminate 1.49% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.48% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.36% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.32% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.33% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.89% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.21% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.78% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 4: predicate.value_based_eliminate 0.68% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.74% : 0.000002s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000711 11 39.46% : 0.000281s : 5: func_graph_cloner_run.FuncGraphClonerGraph 60.54% : 0.000431s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.284835 192 0.00% : 0.000006s : 1: ForceFp32Comm 5.59% : 0.015928s : 1: add_attr 5.58% : 0.015907s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.02% : 0.000057s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.03% : 0.000078s : 1: auto_monad 0.01% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.16% : 0.000468s : 1: bootstrap 0.01% : 0.000040s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000021s : 1: control_data_broadcast_order 0.00% : 0.000014s : 1: convert_after_rewriter 0.01% : 0.000032s : 1: cse_after_recomputation 0.00% : 0.000007s : 1: dataset_repeat_opt 0.01% : 0.000024s : 1: detach_backward 0.00% : 0.000012s : 1: environ_conv 0.01% : 0.000034s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.00% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.17% : 0.000472s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000009s : 1: micro_interleaved_order_control 0.27% : 0.000759s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000021s : 1: opt.transform.mutable_eliminate 0.48% : 0.001360s : 78: opt.transform.opt_a 0.01% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000104s : 28: opt.transform.opt_b 0.02% : 0.000052s : 2: opt.transform.opt_trans_graph 0.01% : 0.000040s : 4: opt.transform.symbol_engine_opt 26.75% : 0.076182s : 1: opt_a 0.05% : 0.000135s : 1: opt_after_cconv 0.21% : 0.000607s : 1: opt_after_jit_grad 0.10% : 0.000286s : 1: opt_b 27.81% : 0.079202s : 1: optimize 0.01% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000012s : 1: order_py_execute_after_rewriter 0.01% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000007s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000051s : 1: pre_auto_parallel 0.01% : 0.000042s : 1: py_interpret_to_execute 0.01% : 0.000026s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 25.58% : 0.072866s : 1: renormalize.infer 0.19% : 0.000532s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000049s : 1: rewriter_after_opt_a 0.04% : 0.000106s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000022s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.00% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000109s : 1: symbol_engine_optimizer 0.03% : 0.000096s : 1: tuple_transform 6.52% : 0.018584s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:20.171.657 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0616126, [21] [bootstrap]: 0.0004408 [type_inference]: 0.0505377 [event_method]: 1.918e-05 [auto_monad]: 6.417e-05 [graph_reusing]: 6.12001e-06 [inline]: 2.68e-06 [add_attr]: 0.00360477, [1] [add_attr_with_inline]: 0.00359405, [1] [Cycle 1]: 7.098e-05, [2] [tag_attr]: 2.181e-05 [meta_addattr_fg_expand]: 6.21e-06 [parallel-infer-symbol]: 3.46999e-06 [pre_auto_parallel]: 3.817e-05 [insert-virtual-dataset]: 2.46e-06 [parallel-infer-symbol-second]: 1.04e-06 [dataset_repeat_opt]: 2.02999e-06 [pipeline_split]: 1.69e-06 [optimize]: 0.00612543, [53] [py_interpret_to_execute]: 2.861e-05 [rewriter_before_opt_a]: 9.135e-05 [opt_a]: 0.00370363, [2] [Cycle 1]: 0.00294177, [45] [expand_dump_flag]: 3.25e-06 [switch_simplify]: 4.654e-05 [loop_unroll]: 3.263e-05 [a_1]: 0.00125366 [with_stream_mark]: 2.403e-05 [recompute_prepare]: 1.129e-05 [updatestate_depend_eliminate]: 4.73001e-06 [updatestate_assign_eliminate]: 3.54002e-06 [updatestate_loads_eliminate]: 2.76e-06 [parameter_eliminate]: 2.25002e-06 [a_2]: 9.311e-05 [accelerated_algorithm]: 7.88999e-06 [shard]: 2.73003e-06 [meta_shard_fg_expand]: 2.32999e-06 [shard_inline]: 7.08e-06 [merge_send_recv]: 9.41e-06 [auto_parallel]: 7.98001e-06 [parallel]: 1.986e-05 [flash_sp]: 9.44e-06 [merge_comm]: 4.13001e-06 [allreduce_fusion]: 3.98999e-06 [matmul_add_comm_reduction]: 1.094e-05 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 8.60999e-06 [virtual_dataset]: 8.74998e-06 [get_grad_eliminate_]: 6.95002e-06 [virtual_output]: 7.60998e-06 [merge_forward]: 4.41002e-06 [cell_reuse_recompute_pass]: 1.87999e-06 [offload_activation]: 1.055e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.386e-05 [merge_recompute_call_nodes]: 1.66e-06 [before_grad]: 1.248e-05 [set_forward_comm_id_for_comm_node_pass]: 4.02e-06 [meta_fg_expand]: 3.15002e-06 [flash_sp_send_recv_attached]: 2.76e-06 [receive_attached]: 2.69001e-06 [after_resolve]: 1.39e-05 [a_after_grad]: 1.106e-05 [renormalize]: 0.00086039 [add_forward_monad_depend]: 6.71999e-06 [auto_monad_grad]: 2.85002e-06 [auto_monad_eliminator]: 1.938e-05 [cse]: 3.362e-05 [a_3]: 5.78e-05 [Cycle 2]: 0.00074962, [45] [expand_dump_flag]: 2.04999e-06 [switch_simplify]: 8.65999e-06 [loop_unroll]: 7.7e-06 [a_1]: 0.00016133 [with_stream_mark]: 1.553e-05 [recompute_prepare]: 7.6e-06 [updatestate_depend_eliminate]: 3.73999e-06 [updatestate_assign_eliminate]: 2.74999e-06 [updatestate_loads_eliminate]: 3.03e-06 [parameter_eliminate]: 1.40001e-06 [a_2]: 8.709e-05 [accelerated_algorithm]: 6.86999e-06 [shard]: 2.36e-06 [meta_shard_fg_expand]: 2.24001e-06 [shard_inline]: 7.15003e-06 [merge_send_recv]: 6.34001e-06 [auto_parallel]: 6.12001e-06 [parallel]: 7.1e-06 [flash_sp]: 4.03001e-06 [merge_comm]: 3.74002e-06 [allreduce_fusion]: 3.54002e-06 [matmul_add_comm_reduction]: 1.397e-05 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 8.01001e-06 [virtual_dataset]: 6.63e-06 [get_grad_eliminate_]: 6.06e-06 [virtual_output]: 6.12999e-06 [merge_forward]: 3.43999e-06 [cell_reuse_recompute_pass]: 1.52999e-06 [offload_activation]: 8.75001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.2e-05 [merge_recompute_call_nodes]: 1.87001e-06 [before_grad]: 1.037e-05 [set_forward_comm_id_for_comm_node_pass]: 3.31999e-06 [meta_fg_expand]: 2.24001e-06 [flash_sp_send_recv_attached]: 9.89996e-07 [receive_attached]: 1.68997e-06 [after_resolve]: 1.214e-05 [a_after_grad]: 1.05e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.34998e-06 [auto_monad_grad]: 1.84998e-06 [auto_monad_eliminator]: 8.55001e-06 [cse]: 1.497e-05 [a_3]: 3.905e-05 [py_interpret_to_execute_after_opt_a]: 1.286e-05 [slice_cell_reuse_recomputed_activation]: 2.39999e-06 [rewriter_after_opt_a]: 4.163e-05 [convert_after_rewriter]: 7.95e-06 [order_py_execute_after_rewriter]: 5.47999e-06 [mutable_eliminate]: 0.00072518 [opt_b]: 0.0002295, [1] [Cycle 1]: 0.0002181, [7] [b_1]: 0.00013488 [b_2]: 8.98002e-06 [updatestate_depend_eliminate]: 8.04002e-06 [updatestate_assign_eliminate]: 2.53e-06 [updatestate_loads_eliminate]: 2.94001e-06 [renormalize]: 6.00005e-07 [cse]: 2.17e-05 [optimize_parallel_all_gather_comm]: 1.812e-05 [overlap_param_gather]: 2.22999e-06 [cconv]: 3.149e-05 [loop_unroll]: 0.00049139 [opt_after_cconv]: 0.00011461, [1] [Cycle 1]: 0.0001079, [7] [c_1]: 3.415e-05 [parameter_eliminate]: 4.95999e-06 [updatestate_depend_eliminate]: 6.06e-06 [updatestate_assign_eliminate]: 3.16999e-06 [updatestate_loads_eliminate]: 2.45002e-06 [cse]: 1.969e-05 [renormalize]: 4.80009e-07 [remove_dup_value]: 1.475e-05 [tuple_transform]: 8.226e-05, [1] [Cycle 1]: 7.698e-05, [4] [d_1]: 4.789e-05 [none_parameter_eliminate]: 1.67001e-06 [renormalize]: 2.80008e-07 [switch_simplify]: 7.28e-06 [partial_unused_args_eliminate]: 2.01e-06 [add_recomputation]: 5.262e-05 [cse_after_recomputation]: 2.345e-05, [1] [Cycle 1]: 1.84e-05, [1] [cse]: 1.232e-05 [environ_conv]: 5.43002e-06 [swap_dp_allreduce_reducescatter]: 5.60001e-06 [bias_add_comm_swap]: 3.46001e-06 [label_micro_interleaved_index]: 4.51002e-06 [label_fine_grained_interleaved_index]: 3.20998e-06 [merge_cast_opt]: 1.60001e-06 [slice_recompute_activation]: 2.16e-06 [micro_interleaved_order_control]: 2.43e-06 [assign_add_opt]: 1.29998e-06 [ForceFp32Comm]: 8.70001e-07 [remove_cast_before_assign_add]: 1.12999e-06 [full_micro_interleaved_order_control]: 2.54001e-06 [reorder_send_recv_between_fp_bp]: 2.93e-06 [comm_op_add_attrs]: 1.19998e-06 [add_comm_op_reuse_tag]: 9.29984e-07 [interleave_split_concat_branches]: 1.20999e-06 [interleave_parallel_branches]: 1.07998e-06 [overlap_opt_shard_in_pipeline]: 1.47001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.20002e-06 [control_data_broadcast_order]: 1.345e-05 [grouped_pairwise_exchange_alltoall]: 1.59998e-06 [offloading_packed_experts]: 3.83999e-06 [overlap_recompute_and_grad_model_parallel]: 5.17e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.25999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.42001e-06 [overlap_grad_ring_attention]: 3.92998e-06 [overlap_grad_flash_sp]: 2.122e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.24001e-06 [split_layernorm_comm]: 1.72001e-06 [handle_group_info]: 9.99979e-07 [symbol_engine_optimizer]: 8.047e-05, [1] [Cycle 1]: 7.586e-05, [6] [build]: 3.42002e-06 [elim_shapecalc]: 1.072e-05 [elim_not_effective]: 1.413e-05 [opt_reshape]: 7.97e-06 [fold_const_symbol]: 1.029e-05 [renormalize]: 1.50001e-07 [detach_backward]: 2.19001e-06 [pipeline_parallel_scheduler]: 1.60001e-06 [auto_monad_reorder]: 1.649e-05 [get_jit_bprop_graph]: 2.17999e-06 [rewriter_after_jit_bprop_graph]: 5.81998e-06 [opt_after_jit_grad]: 0.00052526 [validate]: 4.44e-05 Sums bootstrap : 0.000441s : 0.77% type_inference : 0.050538s : 88.77% event_method : 0.000019s : 0.03% auto_monad : 0.000064s : 0.11% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000038s : 0.07% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000029s : 0.05% optimize.rewriter_before_opt_a : 0.000091s : 0.16% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000055s : 0.10% optimize.opt_a.loop_unroll : 0.000040s : 0.07% optimize.opt_a.a_1 : 0.001415s : 2.49% optimize.opt_a.with_stream_mark : 0.000040s : 0.07% optimize.opt_a.recompute_prepare : 0.000019s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000180s : 0.32% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.03% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.02% optimize.opt_a.merge_send_recv : 0.000016s : 0.03% optimize.opt_a.auto_parallel : 0.000014s : 0.02% optimize.opt_a.parallel : 0.000027s : 0.05% optimize.opt_a.flash_sp : 0.000013s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.03% optimize.opt_a.virtual_dataset : 0.000015s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.02% optimize.opt_a.virtual_output : 0.000014s : 0.02% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.05% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.01% optimize.opt_a.before_grad : 0.000023s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000026s : 0.05% optimize.opt_a.a_after_grad : 0.000022s : 0.04% optimize.opt_a.renormalize : 0.000860s : 1.51% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.05% optimize.opt_a.cse : 0.000049s : 0.09% optimize.opt_a.a_3 : 0.000097s : 0.17% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000042s : 0.07% optimize.convert_after_rewriter : 0.000008s : 0.01% optimize.order_py_execute_after_rewriter : 0.000005s : 0.01% optimize.mutable_eliminate : 0.000725s : 1.27% optimize.opt_b.b_1 : 0.000135s : 0.24% optimize.opt_b.b_2 : 0.000009s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.03% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000031s : 0.06% optimize.loop_unroll : 0.000491s : 0.86% optimize.opt_after_cconv.c_1 : 0.000034s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.03% optimize.tuple_transform.d_1 : 0.000048s : 0.08% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000053s : 0.09% optimize.cse_after_recomputation.cse : 0.000012s : 0.02% optimize.environ_conv : 0.000005s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000021s : 0.04% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000016s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000525s : 0.92% validate : 0.000044s : 0.08% Time group info: ------[substitution.] 0.000239 34 17.00% : 0.000041s : 6: substitution.arithmetic_simplify 0.88% : 0.000002s : 2: substitution.elim_not_effective 0.57% : 0.000001s : 2: substitution.fold_const_symbol 2.79% : 0.000007s : 4: substitution.graph_param_transform 64.76% : 0.000155s : 4: substitution.inline 2.00% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.17% : 0.000005s : 4: substitution.remove_not_recompute_node 2.51% : 0.000006s : 4: substitution.replace_old_param 7.33% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.050468 2 98.56% : 0.049739s : 1: type_inference.infer 1.44% : 0.000729s : 1: type_inference.specialize ------[replace.] 0.000080 8 51.65% : 0.000041s : 4: replace.inline 48.35% : 0.000039s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000167 8 90.74% : 0.000152s : 4: match.inline 9.26% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000748 1278 0.32% : 0.000002s : 13: predicate.accumulaten_eliminater 0.21% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.18% : 0.000001s : 8: predicate.addn_check_dump 0.30% : 0.000002s : 13: predicate.addn_zero_filter 0.26% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 0.84% : 0.000006s : 21: predicate.arithmetic_simplify 0.31% : 0.000002s : 13: predicate.cast_eliminate 0.23% : 0.000002s : 8: predicate.check_bprop_eliminate 0.16% : 0.000001s : 8: predicate.compare_switch_simplify 0.05% : 0.000000s : 4: predicate.const_output_eliminate 0.19% : 0.000001s : 8: predicate.depend_value_elim 0.28% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.35% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.26% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.30% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.05% : 0.000000s : 4: predicate.elim_not_effective 0.11% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 0.35% : 0.000003s : 17: predicate.environ_add_const_eliminate 68.76% : 0.000514s : 17: predicate.environ_get_add_eliminate 0.36% : 0.000003s : 17: predicate.environ_get_depend_swap 0.54% : 0.000004s : 25: predicate.environ_get_eliminate 0.49% : 0.000004s : 17: predicate.environ_get_set_eliminate 0.45% : 0.000003s : 21: predicate.exchange_switch_depend_value 0.75% : 0.000006s : 21: predicate.float_depend_g_call 0.17% : 0.000001s : 8: predicate.float_environ_get_switch 0.24% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.05% : 0.000000s : 4: predicate.fold_const_symbol 0.20% : 0.000001s : 8: predicate.get_grad_eliminate 0.09% : 0.000001s : 4: predicate.graph_param_transform 0.17% : 0.000001s : 8: predicate.incorporate_call 0.14% : 0.000001s : 8: predicate.incorporate_call_switch 2.09% : 0.000016s : 58: predicate.inline 0.27% : 0.000002s : 8: predicate.inline_without_move 0.09% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.27% : 0.000002s : 8: predicate.less_batch_normalization 0.58% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 0.75% : 0.000006s : 38: predicate.load_eliminater 0.30% : 0.000002s : 4: predicate.loop_unroll_after_grad 0.81% : 0.000006s : 34: predicate.loop_unroll_before_grad 0.46% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.19% : 0.000001s : 8: predicate.merge_addn 0.17% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.20% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.26% : 0.000002s : 13: predicate.minmaximum_grad 0.36% : 0.000003s : 4: predicate.mutable_eliminate 0.10% : 0.000001s : 4: predicate.opt_reshape 0.14% : 0.000001s : 4: predicate.parallel_virtual_node 0.62% : 0.000005s : 21: predicate.partial_defer_inline 0.49% : 0.000004s : 21: predicate.partial_eliminate 0.30% : 0.000002s : 13: predicate.print_const_string_wrapper 0.17% : 0.000001s : 8: predicate.reduce_all_const_elim 0.44% : 0.000003s : 13: predicate.reduce_eliminate 0.73% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.14% : 0.000001s : 8: predicate.remove_not_recompute_node 0.42% : 0.000003s : 25: predicate.replace_applicator 0.16% : 0.000001s : 8: predicate.replace_old_param 0.07% : 0.000001s : 4: predicate.reset_defer_inline 0.29% : 0.000002s : 13: predicate.reshape_eliminate 0.21% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.11% : 0.000001s : 4: predicate.row_tensor_eliminate 0.23% : 0.000002s : 8: predicate.same_eliminate 0.13% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.24% : 0.000002s : 8: predicate.shard_identity_eliminate 0.26% : 0.000002s : 8: predicate.special_op_eliminate 0.21% : 0.000002s : 8: predicate.specialize_transform 0.29% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.26% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.11% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.47% : 0.000003s : 21: predicate.switch_defer_inline 0.64% : 0.000005s : 29: predicate.switch_layer_defer_inline 1.56% : 0.000012s : 67: predicate.switch_simplify 0.30% : 0.000002s : 13: predicate.tile_eliminate 0.28% : 0.000002s : 13: predicate.transpose_eliminate 0.45% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 0.45% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 0.40% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 1.06% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 0.51% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 0.73% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 0.59% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 0.74% : 0.000006s : 38: predicate.updatestate_pure_node_eliminater 0.94% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.14% : 0.000001s : 4: predicate.value_based_eliminate 0.20% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.18% : 0.000001s : 8: predicate.virtual_output_eliminate 0.09% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.17% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000611 11 54.58% : 0.000333s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.42% : 0.000277s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.074240 192 0.00% : 0.000004s : 1: ForceFp32Comm 4.86% : 0.003612s : 1: add_attr 4.85% : 0.003598s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.08% : 0.000056s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.09% : 0.000069s : 1: auto_monad 0.03% : 0.000020s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.64% : 0.000473s : 1: bootstrap 0.05% : 0.000035s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000017s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.04% : 0.000027s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.03% : 0.000026s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.67% : 0.000501s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.02% : 0.000757s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000019s : 1: opt.transform.mutable_eliminate 2.54% : 0.001889s : 78: opt.transform.opt_a 0.04% : 0.000032s : 1: opt.transform.opt_after_cconv 0.04% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.15% : 0.000109s : 28: opt.transform.opt_b 0.07% : 0.000053s : 2: opt.transform.opt_trans_graph 0.05% : 0.000039s : 4: opt.transform.symbol_engine_opt 4.99% : 0.003708s : 1: opt_a 0.16% : 0.000119s : 1: opt_after_cconv 0.72% : 0.000536s : 1: opt_after_jit_grad 0.31% : 0.000233s : 1: opt_b 8.26% : 0.006131s : 1: optimize 0.03% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.03% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.06% : 0.000043s : 1: pre_auto_parallel 0.04% : 0.000033s : 1: py_interpret_to_execute 0.02% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000019s : 1: remove_dup_value 0.63% : 0.000466s : 1: renormalize.infer 0.52% : 0.000383s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000046s : 1: rewriter_after_opt_a 0.13% : 0.000096s : 1: rewriter_before_opt_a 0.01% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.11% : 0.000083s : 1: symbol_engine_optimizer 0.11% : 0.000085s : 1: tuple_transform 68.10% : 0.050558s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:21.163.177 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:21.163.455 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0839926, [21] [bootstrap]: 0.00044142 [type_inference]: 0.0724119 [event_method]: 2.364e-05 [auto_monad]: 7.072e-05 [graph_reusing]: 6.34999e-06 [inline]: 2.58e-06 [add_attr]: 0.00355622, [1] [add_attr_with_inline]: 0.00354532, [1] [Cycle 1]: 8.861e-05, [2] [tag_attr]: 2.187e-05 [meta_addattr_fg_expand]: 6.07999e-06 [parallel-infer-symbol]: 3.82998e-06 [pre_auto_parallel]: 4.089e-05 [insert-virtual-dataset]: 2.39999e-06 [parallel-infer-symbol-second]: 7.79983e-07 [dataset_repeat_opt]: 2.10002e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.0062107, [53] [py_interpret_to_execute]: 3.138e-05 [rewriter_before_opt_a]: 0.00037423 [opt_a]: 0.00349395, [2] [Cycle 1]: 0.00238794, [45] [expand_dump_flag]: 3.95e-06 [switch_simplify]: 4.617e-05 [loop_unroll]: 3.052e-05 [a_1]: 0.0006676 [with_stream_mark]: 2.084e-05 [recompute_prepare]: 9.34998e-06 [updatestate_depend_eliminate]: 4e-06 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 2.94999e-06 [parameter_eliminate]: 1.99999e-06 [a_2]: 0.00011795 [accelerated_algorithm]: 7.22002e-06 [shard]: 2.21e-06 [meta_shard_fg_expand]: 1.99999e-06 [shard_inline]: 6.59999e-06 [merge_send_recv]: 8.54e-06 [auto_parallel]: 7.47998e-06 [parallel]: 1.881e-05 [flash_sp]: 8.97e-06 [merge_comm]: 4.28999e-06 [allreduce_fusion]: 4.15e-06 [matmul_add_comm_reduction]: 9.92001e-06 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 8.50001e-06 [virtual_dataset]: 6.77002e-06 [get_grad_eliminate_]: 6.80002e-06 [virtual_output]: 6.57002e-06 [merge_forward]: 4.37e-06 [cell_reuse_recompute_pass]: 1.45001e-06 [offload_activation]: 1.08e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.494e-05 [merge_recompute_call_nodes]: 2.01998e-06 [before_grad]: 1.106e-05 [set_forward_comm_id_for_comm_node_pass]: 4.1e-06 [meta_fg_expand]: 2.64999e-06 [flash_sp_send_recv_attached]: 2.74999e-06 [receive_attached]: 2.17999e-06 [after_resolve]: 1.275e-05 [a_after_grad]: 1e-05 [renormalize]: 0.00077941 [add_forward_monad_depend]: 5.66e-06 [auto_monad_grad]: 2.36998e-06 [auto_monad_eliminator]: 1.614e-05 [cse]: 2.963e-05 [a_3]: 6.167e-05 [Cycle 2]: 0.00107684, [45] [expand_dump_flag]: 2.44999e-06 [switch_simplify]: 8.33001e-06 [loop_unroll]: 6.47001e-06 [a_1]: 0.00031885 [with_stream_mark]: 1.725e-05 [recompute_prepare]: 8.75001e-06 [updatestate_depend_eliminate]: 3.50998e-06 [updatestate_assign_eliminate]: 2.71e-06 [updatestate_loads_eliminate]: 2.51e-06 [parameter_eliminate]: 1.59e-06 [a_2]: 0.00011082 [accelerated_algorithm]: 6.88e-06 [shard]: 1.94e-06 [meta_shard_fg_expand]: 1.77999e-06 [shard_inline]: 6.24001e-06 [merge_send_recv]: 5.94e-06 [auto_parallel]: 6.63e-06 [parallel]: 7.43e-06 [flash_sp]: 4.45999e-06 [merge_comm]: 3.58999e-06 [allreduce_fusion]: 4.52e-06 [matmul_add_comm_reduction]: 7.23999e-06 [allreduce_slice_to_reducescatter]: 6.29982e-07 [virtual_shard_identity]: 7.62002e-06 [virtual_dataset]: 6.90998e-06 [get_grad_eliminate_]: 6.58e-06 [virtual_output]: 6.02999e-06 [merge_forward]: 3.43999e-06 [cell_reuse_recompute_pass]: 1.80001e-06 [offload_activation]: 7.55e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.594e-05 [merge_recompute_call_nodes]: 9.70002e-07 [before_grad]: 1.049e-05 [set_forward_comm_id_for_comm_node_pass]: 4.32e-06 [meta_fg_expand]: 2.17001e-06 [flash_sp_send_recv_attached]: 1.15999e-06 [receive_attached]: 1.52001e-06 [after_resolve]: 1.156e-05 [a_after_grad]: 9.23002e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.88002e-06 [auto_monad_grad]: 1.36002e-06 [auto_monad_eliminator]: 8.62998e-06 [cse]: 1.632e-05 [a_3]: 4.93e-05 [py_interpret_to_execute_after_opt_a]: 1.413e-05 [slice_cell_reuse_recomputed_activation]: 4.80001e-06 [rewriter_after_opt_a]: 4.143e-05 [convert_after_rewriter]: 9.76e-06 [order_py_execute_after_rewriter]: 8.20999e-06 [mutable_eliminate]: 0.00062911 [opt_b]: 0.00026985, [1] [Cycle 1]: 0.00025954, [7] [b_1]: 0.00016675 [b_2]: 8.06001e-06 [updatestate_depend_eliminate]: 6.58e-06 [updatestate_assign_eliminate]: 2.56e-06 [updatestate_loads_eliminate]: 2.41e-06 [renormalize]: 5.39992e-07 [cse]: 1.756e-05 [optimize_parallel_all_gather_comm]: 2.234e-05 [overlap_param_gather]: 5.15999e-06 [cconv]: 3.417e-05 [loop_unroll]: 0.00044795 [opt_after_cconv]: 0.00012736, [1] [Cycle 1]: 0.00011832, [7] [c_1]: 3.138e-05 [parameter_eliminate]: 3.85e-06 [updatestate_depend_eliminate]: 5.52001e-06 [updatestate_assign_eliminate]: 2.61e-06 [updatestate_loads_eliminate]: 2.31e-06 [cse]: 1.732e-05 [renormalize]: 3.19997e-07 [remove_dup_value]: 1.635e-05 [tuple_transform]: 9.361e-05, [1] [Cycle 1]: 8.637e-05, [4] [d_1]: 4.614e-05 [none_parameter_eliminate]: 1.66e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 7.3e-06 [partial_unused_args_eliminate]: 4.47998e-06 [add_recomputation]: 5.289e-05 [cse_after_recomputation]: 2.709e-05, [1] [Cycle 1]: 2.038e-05, [1] [cse]: 1.116e-05 [environ_conv]: 8.51002e-06 [swap_dp_allreduce_reducescatter]: 7.67998e-06 [bias_add_comm_swap]: 5.51998e-06 [label_micro_interleaved_index]: 6.69999e-06 [label_fine_grained_interleaved_index]: 5.52001e-06 [merge_cast_opt]: 3.83999e-06 [slice_recompute_activation]: 4.33999e-06 [micro_interleaved_order_control]: 4.87e-06 [assign_add_opt]: 3.60998e-06 [ForceFp32Comm]: 3.75e-06 [remove_cast_before_assign_add]: 3.58999e-06 [full_micro_interleaved_order_control]: 4.80999e-06 [reorder_send_recv_between_fp_bp]: 5.34998e-06 [comm_op_add_attrs]: 3.55e-06 [add_comm_op_reuse_tag]: 3.39001e-06 [interleave_split_concat_branches]: 3.55998e-06 [interleave_parallel_branches]: 3.57002e-06 [overlap_opt_shard_in_pipeline]: 3.56001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.65999e-06 [control_data_broadcast_order]: 1.616e-05 [grouped_pairwise_exchange_alltoall]: 4.3e-06 [offloading_packed_experts]: 7.11999e-06 [overlap_recompute_and_grad_model_parallel]: 7.29001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.54002e-06 [overlap_recompute_allgather_and_fa_grad]: 4.06001e-06 [overlap_recompute_comm]: 5.42001e-06 [overlap_grad_ring_attention]: 6.61999e-06 [overlap_grad_flash_sp]: 2.367e-05 [begin_end_overlap_inline]: 3.11999e-06 [split_matmul_comm_elemetwise]: 4.72e-06 [split_layernorm_comm]: 4.24002e-06 [handle_group_info]: 3.28998e-06 [symbol_engine_optimizer]: 9.844e-05, [1] [Cycle 1]: 9.183e-05, [6] [build]: 3.46999e-06 [elim_shapecalc]: 1.009e-05 [elim_not_effective]: 1.377e-05 [opt_reshape]: 7.18e-06 [fold_const_symbol]: 1.026e-05 [renormalize]: 2.19996e-07 [detach_backward]: 3.76999e-06 [pipeline_parallel_scheduler]: 1.91e-06 [auto_monad_reorder]: 1.95e-05 [get_jit_bprop_graph]: 1.79e-06 [rewriter_after_jit_bprop_graph]: 4.43001e-06 [opt_after_jit_grad]: 0.00052852 [validate]: 4.002e-05 Sums bootstrap : 0.000441s : 0.56% type_inference : 0.072412s : 92.12% event_method : 0.000024s : 0.03% auto_monad : 0.000071s : 0.09% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000041s : 0.05% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000031s : 0.04% optimize.rewriter_before_opt_a : 0.000374s : 0.48% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000055s : 0.07% optimize.opt_a.loop_unroll : 0.000037s : 0.05% optimize.opt_a.a_1 : 0.000986s : 1.25% optimize.opt_a.with_stream_mark : 0.000038s : 0.05% optimize.opt_a.recompute_prepare : 0.000018s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000229s : 0.29% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.02% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.02% optimize.opt_a.merge_send_recv : 0.000014s : 0.02% optimize.opt_a.auto_parallel : 0.000014s : 0.02% optimize.opt_a.parallel : 0.000026s : 0.03% optimize.opt_a.flash_sp : 0.000013s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000009s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.02% optimize.opt_a.virtual_dataset : 0.000014s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.02% optimize.opt_a.virtual_output : 0.000013s : 0.02% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000018s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000024s : 0.03% optimize.opt_a.a_after_grad : 0.000019s : 0.02% optimize.opt_a.renormalize : 0.000779s : 0.99% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.03% optimize.opt_a.cse : 0.000046s : 0.06% optimize.opt_a.a_3 : 0.000111s : 0.14% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000041s : 0.05% optimize.convert_after_rewriter : 0.000010s : 0.01% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000629s : 0.80% optimize.opt_b.b_1 : 0.000167s : 0.21% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000018s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.03% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000034s : 0.04% optimize.loop_unroll : 0.000448s : 0.57% optimize.opt_after_cconv.c_1 : 0.000031s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000017s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.02% optimize.tuple_transform.d_1 : 0.000046s : 0.06% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000004s : 0.01% optimize.add_recomputation : 0.000053s : 0.07% optimize.cse_after_recomputation.cse : 0.000011s : 0.01% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000004s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000024s : 0.03% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000529s : 0.67% validate : 0.000040s : 0.05% Time group info: ------[substitution.] 0.000235 34 15.22% : 0.000036s : 6: substitution.arithmetic_simplify 0.86% : 0.000002s : 2: substitution.elim_not_effective 0.59% : 0.000001s : 2: substitution.fold_const_symbol 2.73% : 0.000006s : 4: substitution.graph_param_transform 67.45% : 0.000158s : 4: substitution.inline 1.85% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.20% : 0.000005s : 4: substitution.remove_not_recompute_node 2.41% : 0.000006s : 4: substitution.replace_old_param 6.70% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.072336 2 98.67% : 0.071373s : 1: type_inference.infer 1.33% : 0.000963s : 1: type_inference.specialize ------[replace.] 0.000062 8 64.30% : 0.000040s : 4: replace.inline 35.70% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000170 8 91.83% : 0.000156s : 4: match.inline 8.17% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000209 1278 0.87% : 0.000002s : 13: predicate.accumulaten_eliminater 0.72% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.59% : 0.000001s : 8: predicate.addn_check_dump 0.91% : 0.000002s : 13: predicate.addn_zero_filter 0.84% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.63% : 0.000006s : 21: predicate.arithmetic_simplify 1.29% : 0.000003s : 13: predicate.cast_eliminate 0.61% : 0.000001s : 8: predicate.check_bprop_eliminate 0.53% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.61% : 0.000001s : 8: predicate.depend_value_elim 0.91% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.99% : 0.000002s : 13: predicate.dict_get_item_eliminator 1.03% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.13% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.34% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.26% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_depend_swap 1.66% : 0.000003s : 25: predicate.environ_get_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.39% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.56% : 0.000005s : 21: predicate.float_depend_g_call 0.57% : 0.000001s : 8: predicate.float_environ_get_switch 0.87% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.77% : 0.000002s : 8: predicate.get_grad_eliminate 0.44% : 0.000001s : 4: predicate.graph_param_transform 0.60% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 6.35% : 0.000013s : 58: predicate.inline 0.73% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 8: predicate.less_batch_normalization 1.74% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.48% : 0.000005s : 38: predicate.load_eliminater 0.79% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.36% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.68% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 8: predicate.merge_addn 0.75% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 13: predicate.minmaximum_grad 0.95% : 0.000002s : 4: predicate.mutable_eliminate 0.38% : 0.000001s : 4: predicate.opt_reshape 0.45% : 0.000001s : 4: predicate.parallel_virtual_node 1.81% : 0.000004s : 21: predicate.partial_defer_inline 1.61% : 0.000003s : 21: predicate.partial_eliminate 0.83% : 0.000002s : 13: predicate.print_const_string_wrapper 0.58% : 0.000001s : 8: predicate.reduce_all_const_elim 1.08% : 0.000002s : 13: predicate.reduce_eliminate 2.56% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 8: predicate.remove_not_recompute_node 1.40% : 0.000003s : 25: predicate.replace_applicator 0.54% : 0.000001s : 8: predicate.replace_old_param 0.29% : 0.000001s : 4: predicate.reset_defer_inline 1.05% : 0.000002s : 13: predicate.reshape_eliminate 0.63% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.76% : 0.000002s : 8: predicate.same_eliminate 0.43% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 8: predicate.shard_identity_eliminate 0.71% : 0.000001s : 8: predicate.special_op_eliminate 0.72% : 0.000002s : 8: predicate.specialize_transform 0.76% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.45% : 0.000003s : 21: predicate.switch_defer_inline 2.04% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.37% : 0.000011s : 67: predicate.switch_simplify 0.84% : 0.000002s : 13: predicate.tile_eliminate 0.86% : 0.000002s : 13: predicate.transpose_eliminate 1.40% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.39% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.43% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.37% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.73% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.37% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.03% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 4: predicate.value_based_eliminate 0.73% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.58% : 0.000001s : 8: predicate.virtual_output_eliminate 0.31% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000678 11 47.53% : 0.000322s : 5: func_graph_cloner_run.FuncGraphClonerGraph 52.47% : 0.000356s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.096114 192 0.01% : 0.000006s : 1: ForceFp32Comm 3.71% : 0.003566s : 1: add_attr 3.69% : 0.003549s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.06% : 0.000056s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.08% : 0.000080s : 1: auto_monad 0.03% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.51% : 0.000488s : 1: bootstrap 0.04% : 0.000037s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.02% : 0.000019s : 1: control_data_broadcast_order 0.01% : 0.000013s : 1: convert_after_rewriter 0.03% : 0.000030s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000018s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.04% : 0.000036s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.47% : 0.000454s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.66% : 0.000636s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000015s : 1: opt.transform.mutable_eliminate 1.49% : 0.001437s : 78: opt.transform.opt_a 0.03% : 0.000030s : 1: opt.transform.opt_after_cconv 0.03% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.11% : 0.000103s : 28: opt.transform.opt_b 0.05% : 0.000051s : 2: opt.transform.opt_trans_graph 0.04% : 0.000038s : 4: opt.transform.symbol_engine_opt 3.64% : 0.003498s : 1: opt_a 0.14% : 0.000131s : 1: opt_after_cconv 0.56% : 0.000539s : 1: opt_after_jit_grad 0.28% : 0.000273s : 1: opt_b 6.82% : 0.006558s : 1: optimize 0.03% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000011s : 1: order_py_execute_after_rewriter 0.03% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.05% : 0.000049s : 1: pre_auto_parallel 0.04% : 0.000035s : 1: py_interpret_to_execute 0.02% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.02% : 0.000019s : 1: remove_dup_value 0.44% : 0.000427s : 1: renormalize.infer 0.36% : 0.000343s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000045s : 1: rewriter_after_opt_a 0.40% : 0.000383s : 1: rewriter_before_opt_a 0.01% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.11% : 0.000102s : 1: symbol_engine_optimizer 0.10% : 0.000096s : 1: tuple_transform 75.40% : 0.072468s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:22.386.833 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.103187, [21] [bootstrap]: 0.00050833 [type_inference]: 0.00715975 [event_method]: 2.108e-05 [auto_monad]: 7.007e-05 [graph_reusing]: 6.57002e-06 [inline]: 2.59001e-06 [add_attr]: 0.00391124, [1] [add_attr_with_inline]: 0.00389691, [1] [Cycle 1]: 8.33e-05, [2] [tag_attr]: 2.57e-05 [meta_addattr_fg_expand]: 6.57002e-06 [parallel-infer-symbol]: 3.44001e-06 [pre_auto_parallel]: 4.496e-05 [insert-virtual-dataset]: 2.96999e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 2.48002e-06 [pipeline_split]: 1.72999e-06 [optimize]: 0.0905338, [53] [py_interpret_to_execute]: 3.686e-05 [rewriter_before_opt_a]: 9.642e-05 [opt_a]: 0.0879699, [2] [Cycle 1]: 0.0871786, [45] [expand_dump_flag]: 3.31999e-06 [switch_simplify]: 4.651e-05 [loop_unroll]: 3.076e-05 [a_1]: 0.00072675 [with_stream_mark]: 2.37e-05 [recompute_prepare]: 1.367e-05 [updatestate_depend_eliminate]: 5.53002e-06 [updatestate_assign_eliminate]: 3.18998e-06 [updatestate_loads_eliminate]: 2.96999e-06 [parameter_eliminate]: 2.24001e-06 [a_2]: 9.321e-05 [accelerated_algorithm]: 8.3e-06 [shard]: 1.81e-06 [meta_shard_fg_expand]: 1.97999e-06 [shard_inline]: 6.91001e-06 [merge_send_recv]: 8.64e-06 [auto_parallel]: 7.95e-06 [parallel]: 2.045e-05 [flash_sp]: 1.01e-05 [merge_comm]: 4.52e-06 [allreduce_fusion]: 3.45e-06 [matmul_add_comm_reduction]: 1.105e-05 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 9.72999e-06 [virtual_dataset]: 6.90998e-06 [get_grad_eliminate_]: 6.73998e-06 [virtual_output]: 6.84999e-06 [merge_forward]: 4.35e-06 [cell_reuse_recompute_pass]: 2.63e-06 [offload_activation]: 1.2e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.308e-05 [merge_recompute_call_nodes]: 1.58002e-06 [before_grad]: 1.348e-05 [set_forward_comm_id_for_comm_node_pass]: 4.57e-06 [meta_fg_expand]: 3.07002e-06 [flash_sp_send_recv_attached]: 3.37002e-06 [receive_attached]: 2.51e-06 [after_resolve]: 1.341e-05 [a_after_grad]: 1.278e-05 [renormalize]: 0.0855695 [add_forward_monad_depend]: 1.2e-05 [auto_monad_grad]: 3.03998e-06 [auto_monad_eliminator]: 2.549e-05 [cse]: 3.655e-05 [a_3]: 6.962e-05 [Cycle 2]: 0.0007771, [45] [expand_dump_flag]: 2.70002e-06 [switch_simplify]: 1.118e-05 [loop_unroll]: 8.13001e-06 [a_1]: 0.00016745 [with_stream_mark]: 2.082e-05 [recompute_prepare]: 7.27002e-06 [updatestate_depend_eliminate]: 4.13001e-06 [updatestate_assign_eliminate]: 3.17002e-06 [updatestate_loads_eliminate]: 3.14999e-06 [parameter_eliminate]: 2.38998e-06 [a_2]: 8.118e-05 [accelerated_algorithm]: 7.13998e-06 [shard]: 2.28002e-06 [meta_shard_fg_expand]: 2.27999e-06 [shard_inline]: 6.31e-06 [merge_send_recv]: 8.57998e-06 [auto_parallel]: 9.31e-06 [parallel]: 9.54999e-06 [flash_sp]: 4.50999e-06 [merge_comm]: 3.80998e-06 [allreduce_fusion]: 4.05e-06 [matmul_add_comm_reduction]: 1.042e-05 [allreduce_slice_to_reducescatter]: 9.30013e-07 [virtual_shard_identity]: 8.08999e-06 [virtual_dataset]: 6.36e-06 [get_grad_eliminate_]: 5.99e-06 [virtual_output]: 6.24999e-06 [merge_forward]: 4.07e-06 [cell_reuse_recompute_pass]: 3.66999e-06 [offload_activation]: 1.113e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.302e-05 [merge_recompute_call_nodes]: 1.53002e-06 [before_grad]: 1.08e-05 [set_forward_comm_id_for_comm_node_pass]: 3.79002e-06 [meta_fg_expand]: 2.59001e-06 [flash_sp_send_recv_attached]: 2.19999e-06 [receive_attached]: 2.65002e-06 [after_resolve]: 1.322e-05 [a_after_grad]: 1.083e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 2.32999e-06 [auto_monad_grad]: 1.64998e-06 [auto_monad_eliminator]: 8.67e-06 [cse]: 1.606e-05 [a_3]: 3.764e-05 [py_interpret_to_execute_after_opt_a]: 1.854e-05 [slice_cell_reuse_recomputed_activation]: 1.97999e-06 [rewriter_after_opt_a]: 3.866e-05 [convert_after_rewriter]: 7.06999e-06 [order_py_execute_after_rewriter]: 5.24e-06 [mutable_eliminate]: 0.00079561 [opt_b]: 0.0002338, [1] [Cycle 1]: 0.00022394, [7] [b_1]: 0.0001363 [b_2]: 8.68001e-06 [updatestate_depend_eliminate]: 8.28999e-06 [updatestate_assign_eliminate]: 2.63e-06 [updatestate_loads_eliminate]: 2.68e-06 [renormalize]: 1.15001e-06 [cse]: 2.678e-05 [optimize_parallel_all_gather_comm]: 1.889e-05 [overlap_param_gather]: 1.88002e-06 [cconv]: 3.633e-05 [loop_unroll]: 0.00050845 [opt_after_cconv]: 0.00010937, [1] [Cycle 1]: 0.00010273, [7] [c_1]: 3.194e-05 [parameter_eliminate]: 5.00001e-06 [updatestate_depend_eliminate]: 5.40001e-06 [updatestate_assign_eliminate]: 2.38998e-06 [updatestate_loads_eliminate]: 2.26e-06 [cse]: 2.03e-05 [renormalize]: 3.7998e-07 [remove_dup_value]: 1.349e-05 [tuple_transform]: 8.387e-05, [1] [Cycle 1]: 7.933e-05, [4] [d_1]: 5.07e-05 [none_parameter_eliminate]: 1.60999e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 7.18e-06 [partial_unused_args_eliminate]: 1.72999e-06 [add_recomputation]: 5.646e-05 [cse_after_recomputation]: 2.389e-05, [1] [Cycle 1]: 1.895e-05, [1] [cse]: 1.242e-05 [environ_conv]: 6.35002e-06 [swap_dp_allreduce_reducescatter]: 5.05999e-06 [bias_add_comm_swap]: 3.67998e-06 [label_micro_interleaved_index]: 4.93001e-06 [label_fine_grained_interleaved_index]: 3.01001e-06 [merge_cast_opt]: 1.31002e-06 [slice_recompute_activation]: 2.27999e-06 [micro_interleaved_order_control]: 2.49001e-06 [assign_add_opt]: 1.59e-06 [ForceFp32Comm]: 1.20999e-06 [remove_cast_before_assign_add]: 1.10001e-06 [full_micro_interleaved_order_control]: 2.64999e-06 [reorder_send_recv_between_fp_bp]: 2.88e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.16002e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.24e-06 [overlap_opt_shard_grad_in_pipeline]: 1.71e-06 [control_data_broadcast_order]: 1.371e-05 [grouped_pairwise_exchange_alltoall]: 1.94999e-06 [offloading_packed_experts]: 4.38001e-06 [overlap_recompute_and_grad_model_parallel]: 5.72001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.45001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.36998e-06 [overlap_recompute_comm]: 2.69001e-06 [overlap_grad_ring_attention]: 4.63001e-06 [overlap_grad_flash_sp]: 2.233e-05 [begin_end_overlap_inline]: 5.49975e-07 [split_matmul_comm_elemetwise]: 2.27999e-06 [split_layernorm_comm]: 1.67999e-06 [handle_group_info]: 9.10019e-07 [symbol_engine_optimizer]: 9.112e-05, [1] [Cycle 1]: 8.571e-05, [6] [build]: 4.72e-06 [elim_shapecalc]: 1.299e-05 [elim_not_effective]: 1.578e-05 [opt_reshape]: 7.89997e-06 [fold_const_symbol]: 9.92999e-06 [renormalize]: 1.80007e-07 [detach_backward]: 2.71e-06 [pipeline_parallel_scheduler]: 1.57001e-06 [auto_monad_reorder]: 2.03e-05 [get_jit_bprop_graph]: 2.96001e-06 [rewriter_after_jit_bprop_graph]: 6.78e-06 [opt_after_jit_grad]: 0.00064995 [validate]: 5.385e-05 Sums bootstrap : 0.000508s : 0.52% type_inference : 0.007160s : 7.30% event_method : 0.000021s : 0.02% auto_monad : 0.000070s : 0.07% graph_reusing : 0.000007s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000045s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000037s : 0.04% optimize.rewriter_before_opt_a : 0.000096s : 0.10% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000058s : 0.06% optimize.opt_a.loop_unroll : 0.000039s : 0.04% optimize.opt_a.a_1 : 0.000894s : 0.91% optimize.opt_a.with_stream_mark : 0.000045s : 0.05% optimize.opt_a.recompute_prepare : 0.000021s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000174s : 0.18% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000017s : 0.02% optimize.opt_a.auto_parallel : 0.000017s : 0.02% optimize.opt_a.parallel : 0.000030s : 0.03% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.02% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000024s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000027s : 0.03% optimize.opt_a.a_after_grad : 0.000024s : 0.02% optimize.opt_a.renormalize : 0.085570s : 87.19% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.03% optimize.opt_a.cse : 0.000053s : 0.05% optimize.opt_a.a_3 : 0.000107s : 0.11% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000039s : 0.04% optimize.convert_after_rewriter : 0.000007s : 0.01% optimize.order_py_execute_after_rewriter : 0.000005s : 0.01% optimize.mutable_eliminate : 0.000796s : 0.81% optimize.opt_b.b_1 : 0.000136s : 0.14% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000036s : 0.04% optimize.loop_unroll : 0.000508s : 0.52% optimize.opt_after_cconv.c_1 : 0.000032s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.01% optimize.tuple_transform.d_1 : 0.000051s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000056s : 0.06% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000006s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000022s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.02% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000650s : 0.66% validate : 0.000054s : 0.05% Time group info: ------[substitution.] 0.000273 34 16.43% : 0.000045s : 6: substitution.arithmetic_simplify 0.72% : 0.000002s : 2: substitution.elim_not_effective 0.49% : 0.000001s : 2: substitution.fold_const_symbol 2.38% : 0.000007s : 4: substitution.graph_param_transform 67.40% : 0.000184s : 4: substitution.inline 1.95% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.18% : 0.000006s : 4: substitution.remove_not_recompute_node 2.27% : 0.000006s : 4: substitution.replace_old_param 6.19% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007073 2 88.82% : 0.006283s : 1: type_inference.infer 11.18% : 0.000791s : 1: type_inference.specialize ------[replace.] 0.000073 8 64.24% : 0.000047s : 4: replace.inline 35.76% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000196 8 92.29% : 0.000181s : 4: match.inline 7.71% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000236 1278 0.94% : 0.000002s : 13: predicate.accumulaten_eliminater 1.08% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 8: predicate.addn_check_dump 1.03% : 0.000002s : 13: predicate.addn_zero_filter 0.77% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.47% : 0.000006s : 21: predicate.arithmetic_simplify 1.22% : 0.000003s : 13: predicate.cast_eliminate 0.72% : 0.000002s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.75% : 0.000002s : 8: predicate.depend_value_elim 0.88% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.14% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.77% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.21% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 4: predicate.elim_not_effective 0.61% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.32% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 17: predicate.environ_get_add_eliminate 0.97% : 0.000002s : 17: predicate.environ_get_depend_swap 1.55% : 0.000004s : 25: predicate.environ_get_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.43% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.23% : 0.000005s : 21: predicate.float_depend_g_call 0.58% : 0.000001s : 8: predicate.float_environ_get_switch 0.82% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.64% : 0.000002s : 8: predicate.get_grad_eliminate 0.30% : 0.000001s : 4: predicate.graph_param_transform 0.52% : 0.000001s : 8: predicate.incorporate_call 0.41% : 0.000001s : 8: predicate.incorporate_call_switch 5.99% : 0.000014s : 58: predicate.inline 0.80% : 0.000002s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.11% : 0.000003s : 8: predicate.less_batch_normalization 1.81% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.47% : 0.000006s : 38: predicate.load_eliminater 0.93% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.18% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.46% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.48% : 0.000001s : 8: predicate.merge_addn 0.50% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 13: predicate.minmaximum_grad 1.35% : 0.000003s : 4: predicate.mutable_eliminate 0.46% : 0.000001s : 4: predicate.opt_reshape 0.41% : 0.000001s : 4: predicate.parallel_virtual_node 1.74% : 0.000004s : 21: predicate.partial_defer_inline 1.41% : 0.000003s : 21: predicate.partial_eliminate 0.89% : 0.000002s : 13: predicate.print_const_string_wrapper 0.58% : 0.000001s : 8: predicate.reduce_all_const_elim 1.21% : 0.000003s : 13: predicate.reduce_eliminate 2.40% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.56% : 0.000001s : 8: predicate.remove_not_recompute_node 1.38% : 0.000003s : 25: predicate.replace_applicator 0.49% : 0.000001s : 8: predicate.replace_old_param 0.33% : 0.000001s : 4: predicate.reset_defer_inline 0.98% : 0.000002s : 13: predicate.reshape_eliminate 0.60% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.80% : 0.000002s : 8: predicate.same_eliminate 0.55% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.01% : 0.000002s : 8: predicate.shard_identity_eliminate 0.88% : 0.000002s : 8: predicate.special_op_eliminate 0.68% : 0.000002s : 8: predicate.specialize_transform 1.06% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.96% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.35% : 0.000003s : 21: predicate.switch_defer_inline 1.82% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.98% : 0.000012s : 67: predicate.switch_simplify 1.02% : 0.000002s : 13: predicate.tile_eliminate 1.05% : 0.000002s : 13: predicate.transpose_eliminate 1.39% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.50% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.63% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.44% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.37% : 0.000006s : 29: predicate.tuple_list_set_item_eliminator 1.85% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.25% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.79% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 0.68% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000708 11 50.21% : 0.000356s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.79% : 0.000353s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.284710 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.38% : 0.003918s : 1: add_attr 1.37% : 0.003902s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.02% : 0.000061s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.03% : 0.000075s : 1: auto_monad 0.01% : 0.000024s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.19% : 0.000540s : 1: bootstrap 0.01% : 0.000040s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000010s : 1: convert_after_rewriter 0.01% : 0.000027s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000010s : 1: environ_conv 0.01% : 0.000028s : 1: event_method 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.18% : 0.000518s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.28% : 0.000809s : 1: mutable_eliminate 0.01% : 0.000039s : 1: offloading_packed_experts 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000023s : 1: opt.transform.mutable_eliminate 0.48% : 0.001371s : 78: opt.transform.opt_a 0.01% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000109s : 28: opt.transform.opt_b 0.02% : 0.000055s : 2: opt.transform.opt_trans_graph 0.01% : 0.000042s : 4: opt.transform.symbol_engine_opt 30.90% : 0.087974s : 1: opt_a 0.04% : 0.000113s : 1: opt_after_cconv 0.23% : 0.000663s : 1: opt_after_jit_grad 0.08% : 0.000238s : 1: opt_b 31.80% : 0.090541s : 1: optimize 0.01% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.01% : 0.000026s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000049s : 1: pre_auto_parallel 0.01% : 0.000041s : 1: py_interpret_to_execute 0.01% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000017s : 1: remove_dup_value 29.86% : 0.085023s : 1: renormalize.infer 0.18% : 0.000525s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000043s : 1: rewriter_after_opt_a 0.04% : 0.000101s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000007s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000094s : 1: symbol_engine_optimizer 0.03% : 0.000087s : 1: tuple_transform 2.52% : 0.007184s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:23.867.672 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:23.867.948 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.155587, [21] [bootstrap]: 0.00045553 [type_inference]: 0.0175891 [event_method]: 2.488e-05 [auto_monad]: 7.308e-05 [graph_reusing]: 6.63e-06 [inline]: 3.33998e-06 [add_attr]: 0.00401325, [1] [add_attr_with_inline]: 0.00399975, [1] [Cycle 1]: 9.528e-05, [2] [tag_attr]: 2.557e-05 [meta_addattr_fg_expand]: 6.44999e-06 [parallel-infer-symbol]: 3.97002e-06 [pre_auto_parallel]: 4.346e-05 [insert-virtual-dataset]: 2.37999e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.01e-06 [pipeline_split]: 1.97001e-06 [optimize]: 0.132103, [53] [py_interpret_to_execute]: 3.526e-05 [rewriter_before_opt_a]: 0.00010046 [opt_a]: 0.00372206, [2] [Cycle 1]: 0.00274662, [45] [expand_dump_flag]: 8.22e-06 [switch_simplify]: 7.997e-05 [loop_unroll]: 3.202e-05 [a_1]: 0.00076068 [with_stream_mark]: 3.036e-05 [recompute_prepare]: 9.79e-06 [updatestate_depend_eliminate]: 4.95999e-06 [updatestate_assign_eliminate]: 3.58999e-06 [updatestate_loads_eliminate]: 3.16001e-06 [parameter_eliminate]: 2.22999e-06 [a_2]: 0.00012005 [accelerated_algorithm]: 8.49998e-06 [shard]: 2.12999e-06 [meta_shard_fg_expand]: 3.01999e-06 [shard_inline]: 7.9e-06 [merge_send_recv]: 9.68997e-06 [auto_parallel]: 1.185e-05 [parallel]: 2.107e-05 [flash_sp]: 1.126e-05 [merge_comm]: 4.3e-06 [allreduce_fusion]: 3.49001e-06 [matmul_add_comm_reduction]: 1.139e-05 [allreduce_slice_to_reducescatter]: 8.09989e-07 [virtual_shard_identity]: 1.131e-05 [virtual_dataset]: 7.1e-06 [get_grad_eliminate_]: 6.78998e-06 [virtual_output]: 7.07002e-06 [merge_forward]: 4.53999e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 1.207e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.545e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.216e-05 [set_forward_comm_id_for_comm_node_pass]: 4.43001e-06 [meta_fg_expand]: 3.2e-06 [flash_sp_send_recv_attached]: 3.28e-06 [receive_attached]: 2.47001e-06 [after_resolve]: 1.408e-05 [a_after_grad]: 1.125e-05 [renormalize]: 0.00086897 [add_forward_monad_depend]: 8.90999e-06 [auto_monad_grad]: 3.09999e-06 [auto_monad_eliminator]: 1.989e-05 [cse]: 3.191e-05 [a_3]: 7.171e-05 [Cycle 2]: 0.00094852, [45] [expand_dump_flag]: 2.56e-06 [switch_simplify]: 8.37e-06 [loop_unroll]: 7.1e-06 [a_1]: 0.00015779 [with_stream_mark]: 1.954e-05 [recompute_prepare]: 7.28999e-06 [updatestate_depend_eliminate]: 3.43e-06 [updatestate_assign_eliminate]: 3.33998e-06 [updatestate_loads_eliminate]: 2.73e-06 [parameter_eliminate]: 1.65001e-06 [a_2]: 0.00011164 [accelerated_algorithm]: 7.55e-06 [shard]: 1.69998e-06 [meta_shard_fg_expand]: 1.88002e-06 [shard_inline]: 7.27002e-06 [merge_send_recv]: 8.08001e-06 [auto_parallel]: 9.27999e-06 [parallel]: 7.14001e-06 [flash_sp]: 3.91001e-06 [merge_comm]: 4.18999e-06 [allreduce_fusion]: 4.79e-06 [matmul_add_comm_reduction]: 9.96e-06 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 8.02e-06 [virtual_dataset]: 6.93e-06 [get_grad_eliminate_]: 6.28e-06 [virtual_output]: 6.24999e-06 [merge_forward]: 4.55001e-06 [cell_reuse_recompute_pass]: 2.81e-06 [offload_activation]: 9.71e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.661e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.07e-05 [set_forward_comm_id_for_comm_node_pass]: 4.55999e-06 [meta_fg_expand]: 2.37999e-06 [flash_sp_send_recv_attached]: 1.41002e-06 [receive_attached]: 2.14e-06 [after_resolve]: 1.287e-05 [a_after_grad]: 9.97001e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.64001e-06 [auto_monad_grad]: 1.92001e-06 [auto_monad_eliminator]: 1.089e-05 [cse]: 2.056e-05 [a_3]: 5.138e-05 [py_interpret_to_execute_after_opt_a]: 2.002e-05 [slice_cell_reuse_recomputed_activation]: 5.55001e-06 [rewriter_after_opt_a]: 5.068e-05 [convert_after_rewriter]: 1.021e-05 [order_py_execute_after_rewriter]: 8.48999e-06 [mutable_eliminate]: 0.00078912 [opt_b]: 0.00029375, [1] [Cycle 1]: 0.00028251, [7] [b_1]: 0.00017487 [b_2]: 8.60001e-06 [updatestate_depend_eliminate]: 7.93999e-06 [updatestate_assign_eliminate]: 2.93998e-06 [updatestate_loads_eliminate]: 2.96001e-06 [renormalize]: 7.39994e-07 [cse]: 2.581e-05 [optimize_parallel_all_gather_comm]: 2.389e-05 [overlap_param_gather]: 5.24e-06 [cconv]: 3.744e-05 [loop_unroll]: 0.00048714 [opt_after_cconv]: 0.00013475, [1] [Cycle 1]: 0.00012498, [7] [c_1]: 3.135e-05 [parameter_eliminate]: 5.61e-06 [updatestate_depend_eliminate]: 5.94999e-06 [updatestate_assign_eliminate]: 2.62001e-06 [updatestate_loads_eliminate]: 2.37001e-06 [cse]: 2.051e-05 [renormalize]: 5.79981e-07 [remove_dup_value]: 1.795e-05 [tuple_transform]: 9.587e-05, [1] [Cycle 1]: 8.842e-05, [4] [d_1]: 4.91e-05 [none_parameter_eliminate]: 1.92999e-06 [renormalize]: 1.39989e-07 [switch_simplify]: 7.16999e-06 [partial_unused_args_eliminate]: 4.34002e-06 [add_recomputation]: 5.423e-05 [cse_after_recomputation]: 2.796e-05, [1] [Cycle 1]: 2.094e-05, [1] [cse]: 1.185e-05 [environ_conv]: 8.60999e-06 [swap_dp_allreduce_reducescatter]: 8.29002e-06 [bias_add_comm_swap]: 6.04999e-06 [label_micro_interleaved_index]: 7.99002e-06 [label_fine_grained_interleaved_index]: 4.99e-06 [merge_cast_opt]: 3.75998e-06 [slice_recompute_activation]: 4.42998e-06 [micro_interleaved_order_control]: 5.02999e-06 [assign_add_opt]: 3.86999e-06 [ForceFp32Comm]: 3.28e-06 [remove_cast_before_assign_add]: 3.33e-06 [full_micro_interleaved_order_control]: 4.47e-06 [reorder_send_recv_between_fp_bp]: 5.05999e-06 [comm_op_add_attrs]: 3.61999e-06 [add_comm_op_reuse_tag]: 3.23e-06 [interleave_split_concat_branches]: 3.83999e-06 [interleave_parallel_branches]: 3.47997e-06 [overlap_opt_shard_in_pipeline]: 3.7e-06 [overlap_opt_shard_grad_in_pipeline]: 4.23001e-06 [control_data_broadcast_order]: 1.634e-05 [grouped_pairwise_exchange_alltoall]: 4.49002e-06 [offloading_packed_experts]: 6.11998e-06 [overlap_recompute_and_grad_model_parallel]: 8.08001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.52002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.82998e-06 [overlap_recompute_comm]: 4.79998e-06 [overlap_grad_ring_attention]: 7.24001e-06 [overlap_grad_flash_sp]: 2.443e-05 [begin_end_overlap_inline]: 2.91999e-06 [split_matmul_comm_elemetwise]: 4.91002e-06 [split_layernorm_comm]: 4.22e-06 [handle_group_info]: 3.3e-06 [symbol_engine_optimizer]: 9.899e-05, [1] [Cycle 1]: 9.185e-05, [6] [build]: 3.68e-06 [elim_shapecalc]: 1.021e-05 [elim_not_effective]: 1.333e-05 [opt_reshape]: 7.20998e-06 [fold_const_symbol]: 1.045e-05 [renormalize]: 1.70025e-07 [detach_backward]: 3.7e-06 [pipeline_parallel_scheduler]: 1.87001e-06 [auto_monad_reorder]: 1.911e-05 [get_jit_bprop_graph]: 2.14999e-06 [rewriter_after_jit_bprop_graph]: 6.53e-06 [opt_after_jit_grad]: 0.00055599 [validate]: 4.284e-05 Sums bootstrap : 0.000456s : 1.90% type_inference : 0.017589s : 73.37% event_method : 0.000025s : 0.10% auto_monad : 0.000073s : 0.30% graph_reusing : 0.000007s : 0.03% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.11% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000043s : 0.18% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000035s : 0.15% optimize.rewriter_before_opt_a : 0.000100s : 0.42% optimize.opt_a.expand_dump_flag : 0.000011s : 0.04% optimize.opt_a.switch_simplify : 0.000088s : 0.37% optimize.opt_a.loop_unroll : 0.000039s : 0.16% optimize.opt_a.a_1 : 0.000918s : 3.83% optimize.opt_a.with_stream_mark : 0.000050s : 0.21% optimize.opt_a.recompute_prepare : 0.000017s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000232s : 0.97% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.07% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000015s : 0.06% optimize.opt_a.merge_send_recv : 0.000018s : 0.07% optimize.opt_a.auto_parallel : 0.000021s : 0.09% optimize.opt_a.parallel : 0.000028s : 0.12% optimize.opt_a.flash_sp : 0.000015s : 0.06% optimize.opt_a.merge_comm : 0.000008s : 0.04% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.08% optimize.opt_a.virtual_dataset : 0.000014s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.05% optimize.opt_a.virtual_output : 0.000013s : 0.06% optimize.opt_a.merge_forward : 0.000009s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.02% optimize.opt_a.offload_activation : 0.000022s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.13% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000023s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.04% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000027s : 0.11% optimize.opt_a.a_after_grad : 0.000021s : 0.09% optimize.opt_a.renormalize : 0.000869s : 3.62% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.05% optimize.opt_a.auto_monad_grad : 0.000005s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.13% optimize.opt_a.cse : 0.000052s : 0.22% optimize.opt_a.a_3 : 0.000123s : 0.51% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.02% optimize.rewriter_after_opt_a : 0.000051s : 0.21% optimize.convert_after_rewriter : 0.000010s : 0.04% optimize.order_py_execute_after_rewriter : 0.000008s : 0.04% optimize.mutable_eliminate : 0.000789s : 3.29% optimize.opt_b.b_1 : 0.000175s : 0.73% optimize.opt_b.b_2 : 0.000009s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000026s : 0.11% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.10% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000037s : 0.16% optimize.loop_unroll : 0.000487s : 2.03% optimize.opt_after_cconv.c_1 : 0.000031s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000021s : 0.09% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.07% optimize.tuple_transform.d_1 : 0.000049s : 0.20% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000004s : 0.02% optimize.add_recomputation : 0.000054s : 0.23% optimize.cse_after_recomputation.cse : 0.000012s : 0.05% optimize.environ_conv : 0.000009s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.03% optimize.bias_add_comm_swap : 0.000006s : 0.03% optimize.label_micro_interleaved_index : 0.000008s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000004s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000004s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.02% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000006s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.03% optimize.overlap_grad_flash_sp : 0.000024s : 0.10% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.08% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.03% opt_after_jit_grad : 0.000556s : 2.32% validate : 0.000043s : 0.18% Time group info: ------[substitution.] 0.000277 34 15.07% : 0.000042s : 6: substitution.arithmetic_simplify 0.80% : 0.000002s : 2: substitution.elim_not_effective 0.49% : 0.000001s : 2: substitution.fold_const_symbol 2.28% : 0.000006s : 4: substitution.graph_param_transform 69.21% : 0.000192s : 4: substitution.inline 1.80% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.15% : 0.000006s : 4: substitution.remove_not_recompute_node 2.07% : 0.000006s : 4: substitution.replace_old_param 6.11% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.017505 2 91.79% : 0.016069s : 1: type_inference.infer 8.21% : 0.001437s : 1: type_inference.specialize ------[replace.] 0.000072 8 63.12% : 0.000045s : 4: replace.inline 36.88% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000204 8 92.59% : 0.000189s : 4: match.inline 7.41% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000246 1278 0.99% : 0.000002s : 13: predicate.accumulaten_eliminater 0.81% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.61% : 0.000002s : 8: predicate.addn_check_dump 0.87% : 0.000002s : 13: predicate.addn_zero_filter 0.78% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.41% : 0.000006s : 21: predicate.arithmetic_simplify 0.89% : 0.000002s : 13: predicate.cast_eliminate 0.64% : 0.000002s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.57% : 0.000001s : 8: predicate.depend_value_elim 0.88% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.81% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.12% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.35% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 17: predicate.environ_add_const_eliminate 0.94% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 17: predicate.environ_get_depend_swap 1.50% : 0.000004s : 25: predicate.environ_get_eliminate 1.03% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.35% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.89% : 0.000007s : 21: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.71% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.70% : 0.000002s : 8: predicate.get_grad_eliminate 0.25% : 0.000001s : 4: predicate.graph_param_transform 0.56% : 0.000001s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 6.33% : 0.000016s : 58: predicate.inline 0.86% : 0.000002s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.93% : 0.000002s : 8: predicate.less_batch_normalization 1.68% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.18% : 0.000005s : 38: predicate.load_eliminater 0.90% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.30% : 0.000006s : 34: predicate.loop_unroll_before_grad 1.49% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.45% : 0.000001s : 8: predicate.merge_addn 0.63% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.74% : 0.000002s : 13: predicate.minmaximum_grad 1.33% : 0.000003s : 4: predicate.mutable_eliminate 0.27% : 0.000001s : 4: predicate.opt_reshape 0.33% : 0.000001s : 4: predicate.parallel_virtual_node 1.60% : 0.000004s : 21: predicate.partial_defer_inline 1.45% : 0.000004s : 21: predicate.partial_eliminate 0.89% : 0.000002s : 13: predicate.print_const_string_wrapper 0.54% : 0.000001s : 8: predicate.reduce_all_const_elim 1.12% : 0.000003s : 13: predicate.reduce_eliminate 2.27% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 8: predicate.remove_not_recompute_node 1.16% : 0.000003s : 25: predicate.replace_applicator 0.54% : 0.000001s : 8: predicate.replace_old_param 0.41% : 0.000001s : 4: predicate.reset_defer_inline 0.84% : 0.000002s : 13: predicate.reshape_eliminate 0.59% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 4: predicate.row_tensor_eliminate 0.73% : 0.000002s : 8: predicate.same_eliminate 0.45% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.64% : 0.000002s : 8: predicate.shard_identity_eliminate 0.79% : 0.000002s : 8: predicate.special_op_eliminate 0.58% : 0.000001s : 8: predicate.specialize_transform 0.79% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.38% : 0.000003s : 21: predicate.switch_defer_inline 1.92% : 0.000005s : 29: predicate.switch_layer_defer_inline 9.12% : 0.000022s : 67: predicate.switch_simplify 1.01% : 0.000002s : 13: predicate.tile_eliminate 0.98% : 0.000002s : 13: predicate.transpose_eliminate 1.57% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.29% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.99% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.35% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.42% : 0.000006s : 29: predicate.tuple_list_set_item_eliminator 1.62% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.22% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.82% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 4: predicate.value_based_eliminate 0.74% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 8: predicate.virtual_output_eliminate 0.22% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000710 11 47.92% : 0.000340s : 5: func_graph_cloner_run.FuncGraphClonerGraph 52.08% : 0.000370s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.294130 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.37% : 0.004026s : 1: add_attr 1.36% : 0.004004s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.02% : 0.000058s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.03% : 0.000082s : 1: auto_monad 0.01% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.17% : 0.000499s : 1: bootstrap 0.01% : 0.000041s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000019s : 1: control_data_broadcast_order 0.00% : 0.000014s : 1: convert_after_rewriter 0.01% : 0.000031s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000019s : 1: detach_backward 0.00% : 0.000012s : 1: environ_conv 0.01% : 0.000036s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.00% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.17% : 0.000494s : 1: loop_unroll 0.00% : 0.000006s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.27% : 0.000798s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 0.01% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000022s : 1: opt.transform.mutable_eliminate 0.48% : 0.001418s : 78: opt.transform.opt_a 0.01% : 0.000030s : 1: opt.transform.opt_after_cconv 0.01% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000107s : 28: opt.transform.opt_b 0.02% : 0.000053s : 2: opt.transform.opt_trans_graph 0.01% : 0.000037s : 4: opt.transform.symbol_engine_opt 1.27% : 0.003729s : 1: opt_a 0.05% : 0.000138s : 1: opt_after_cconv 0.19% : 0.000566s : 1: opt_after_jit_grad 0.10% : 0.000298s : 1: opt_b 45.03% : 0.132461s : 1: optimize 0.01% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000012s : 1: order_py_execute_after_rewriter 0.01% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000007s : 1: partial_unused_args_eliminate 0.00% : 0.000009s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000051s : 1: pre_auto_parallel 0.01% : 0.000039s : 1: py_interpret_to_execute 0.01% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000021s : 1: remove_dup_value 0.16% : 0.000473s : 1: renormalize.infer 0.13% : 0.000386s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000056s : 1: rewriter_after_opt_a 42.76% : 0.125763s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.00% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000102s : 1: symbol_engine_optimizer 0.03% : 0.000099s : 1: tuple_transform 6.00% : 0.017650s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:25.387.789 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0406847, [21] [bootstrap]: 0.00043176 [type_inference]: 0.0302354 [event_method]: 1.907e-05 [auto_monad]: 6.092e-05 [graph_reusing]: 5.42001e-06 [inline]: 2.64999e-06 [add_attr]: 0.0035286, [1] [add_attr_with_inline]: 0.00351813, [1] [Cycle 1]: 5.7e-05, [2] [tag_attr]: 2.053e-05 [meta_addattr_fg_expand]: 5.66998e-06 [parallel-infer-symbol]: 3.73999e-06 [pre_auto_parallel]: 3.5e-05 [insert-virtual-dataset]: 2.73e-06 [parallel-infer-symbol-second]: 1.04e-06 [dataset_repeat_opt]: 2.01e-06 [pipeline_split]: 1.84e-06 [optimize]: 0.00560207, [53] [py_interpret_to_execute]: 2.623e-05 [rewriter_before_opt_a]: 8.797e-05 [opt_a]: 0.00321446, [2] [Cycle 1]: 0.00226095, [45] [expand_dump_flag]: 3.01999e-06 [switch_simplify]: 3.781e-05 [loop_unroll]: 3.118e-05 [a_1]: 0.00067239 [with_stream_mark]: 1.794e-05 [recompute_prepare]: 9.92001e-06 [updatestate_depend_eliminate]: 4.47e-06 [updatestate_assign_eliminate]: 3.06001e-06 [updatestate_loads_eliminate]: 2.34999e-06 [parameter_eliminate]: 1.42e-06 [a_2]: 9.127e-05 [accelerated_algorithm]: 7.48e-06 [shard]: 2.38998e-06 [meta_shard_fg_expand]: 1.76003e-06 [shard_inline]: 6.76e-06 [merge_send_recv]: 9.19e-06 [auto_parallel]: 7.63001e-06 [parallel]: 2.031e-05 [flash_sp]: 8.79e-06 [merge_comm]: 4.53001e-06 [allreduce_fusion]: 3.45e-06 [matmul_add_comm_reduction]: 9.02e-06 [allreduce_slice_to_reducescatter]: 1.05001e-06 [virtual_shard_identity]: 8.67e-06 [virtual_dataset]: 7.5e-06 [get_grad_eliminate_]: 6.86999e-06 [virtual_output]: 6.69999e-06 [merge_forward]: 4.32e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 1.007e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.317e-05 [merge_recompute_call_nodes]: 1.35001e-06 [before_grad]: 1.124e-05 [set_forward_comm_id_for_comm_node_pass]: 3.83001e-06 [meta_fg_expand]: 2.89001e-06 [flash_sp_send_recv_attached]: 2.64999e-06 [receive_attached]: 2.32001e-06 [after_resolve]: 1.22e-05 [a_after_grad]: 1.052e-05 [renormalize]: 0.00083053 [add_forward_monad_depend]: 6.28e-06 [auto_monad_grad]: 2.49999e-06 [auto_monad_eliminator]: 1.656e-05 [cse]: 2.831e-05 [a_3]: 5.242e-05 [Cycle 2]: 0.00094192, [45] [expand_dump_flag]: 1.78997e-06 [switch_simplify]: 8.23999e-06 [loop_unroll]: 6.31e-06 [a_1]: 0.00014707 [with_stream_mark]: 1.395e-05 [recompute_prepare]: 6.46999e-06 [updatestate_depend_eliminate]: 2.96001e-06 [updatestate_assign_eliminate]: 2.52001e-06 [updatestate_loads_eliminate]: 2.57001e-06 [parameter_eliminate]: 1.45001e-06 [a_2]: 7.808e-05 [accelerated_algorithm]: 6.43e-06 [shard]: 2.02999e-06 [meta_shard_fg_expand]: 1.44998e-06 [shard_inline]: 6.16998e-06 [merge_send_recv]: 6.28e-06 [auto_parallel]: 7.68999e-06 [parallel]: 6.06998e-06 [flash_sp]: 3.73999e-06 [merge_comm]: 3.61001e-06 [allreduce_fusion]: 3.76001e-06 [matmul_add_comm_reduction]: 9.25999e-06 [allreduce_slice_to_reducescatter]: 1.00999e-06 [virtual_shard_identity]: 7.51999e-06 [virtual_dataset]: 6.16e-06 [get_grad_eliminate_]: 5.97999e-06 [virtual_output]: 2.42e-05 [merge_forward]: 4.28001e-06 [cell_reuse_recompute_pass]: 2.18002e-06 [offload_activation]: 7.83999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.252e-05 [merge_recompute_call_nodes]: 1.12e-06 [before_grad]: 9.94001e-06 [set_forward_comm_id_for_comm_node_pass]: 0.00018531 [meta_fg_expand]: 2.97002e-06 [flash_sp_send_recv_attached]: 1.99e-06 [receive_attached]: 1.97001e-06 [after_resolve]: 1.939e-05 [a_after_grad]: 1.036e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 3.58e-06 [auto_monad_grad]: 1.92999e-06 [auto_monad_eliminator]: 1.138e-05 [cse]: 1.864e-05 [a_3]: 4.085e-05 [py_interpret_to_execute_after_opt_a]: 1.284e-05 [slice_cell_reuse_recomputed_activation]: 1.96003e-06 [rewriter_after_opt_a]: 4.193e-05 [convert_after_rewriter]: 7.16001e-06 [order_py_execute_after_rewriter]: 5.17e-06 [mutable_eliminate]: 0.00072529 [opt_b]: 0.00021854, [1] [Cycle 1]: 0.00021073, [7] [b_1]: 0.00012815 [b_2]: 8.81002e-06 [updatestate_depend_eliminate]: 8.12e-06 [updatestate_assign_eliminate]: 2.71e-06 [updatestate_loads_eliminate]: 2.57001e-06 [renormalize]: 6.30011e-07 [cse]: 2.245e-05 [optimize_parallel_all_gather_comm]: 1.946e-05 [overlap_param_gather]: 2.11e-06 [cconv]: 3.3e-05 [loop_unroll]: 0.00049236 [opt_after_cconv]: 0.00011527, [1] [Cycle 1]: 0.00010822, [7] [c_1]: 3.391e-05 [parameter_eliminate]: 5.18002e-06 [updatestate_depend_eliminate]: 6.91001e-06 [updatestate_assign_eliminate]: 2.77002e-06 [updatestate_loads_eliminate]: 3.12002e-06 [cse]: 2.012e-05 [renormalize]: 2.30008e-07 [remove_dup_value]: 1.448e-05 [tuple_transform]: 8.491e-05, [1] [Cycle 1]: 7.996e-05, [4] [d_1]: 4.95e-05 [none_parameter_eliminate]: 1.62999e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 7.45e-06 [partial_unused_args_eliminate]: 1.99e-06 [add_recomputation]: 5.615e-05 [cse_after_recomputation]: 2.295e-05, [1] [Cycle 1]: 1.813e-05, [1] [cse]: 1.181e-05 [environ_conv]: 5.24998e-06 [swap_dp_allreduce_reducescatter]: 5.60001e-06 [bias_add_comm_swap]: 3.08998e-06 [label_micro_interleaved_index]: 4.66002e-06 [label_fine_grained_interleaved_index]: 3.01001e-06 [merge_cast_opt]: 1.52999e-06 [slice_recompute_activation]: 2.12999e-06 [micro_interleaved_order_control]: 2.55997e-06 [assign_add_opt]: 1.40999e-06 [ForceFp32Comm]: 8.60018e-07 [remove_cast_before_assign_add]: 1.25001e-06 [full_micro_interleaved_order_control]: 2.39001e-06 [reorder_send_recv_between_fp_bp]: 2.83003e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 1.05999e-06 [interleave_split_concat_branches]: 1.30999e-06 [interleave_parallel_branches]: 1.34e-06 [overlap_opt_shard_in_pipeline]: 1.15001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.08002e-06 [control_data_broadcast_order]: 1.571e-05 [grouped_pairwise_exchange_alltoall]: 1.52001e-06 [offloading_packed_experts]: 4.35e-06 [overlap_recompute_and_grad_model_parallel]: 4.66997e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.06998e-06 [overlap_grad_ring_attention]: 4.16001e-06 [overlap_grad_flash_sp]: 2.178e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.22999e-06 [split_layernorm_comm]: 1.77999e-06 [handle_group_info]: 9.99979e-07 [symbol_engine_optimizer]: 7.908e-05, [1] [Cycle 1]: 7.448e-05, [6] [build]: 3.62998e-06 [elim_shapecalc]: 1.103e-05 [elim_not_effective]: 1.297e-05 [opt_reshape]: 6.94001e-06 [fold_const_symbol]: 1.012e-05 [renormalize]: 1.60013e-07 [detach_backward]: 2.04e-06 [pipeline_parallel_scheduler]: 1.49e-06 [auto_monad_reorder]: 1.748e-05 [get_jit_bprop_graph]: 1.88002e-06 [rewriter_after_jit_bprop_graph]: 6.00002e-06 [opt_after_jit_grad]: 0.00052036 [validate]: 4.463e-05 Sums bootstrap : 0.000432s : 1.20% type_inference : 0.030235s : 83.69% event_method : 0.000019s : 0.05% auto_monad : 0.000061s : 0.17% graph_reusing : 0.000005s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000035s : 0.10% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000026s : 0.07% optimize.rewriter_before_opt_a : 0.000088s : 0.24% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000046s : 0.13% optimize.opt_a.loop_unroll : 0.000037s : 0.10% optimize.opt_a.a_1 : 0.000819s : 2.27% optimize.opt_a.with_stream_mark : 0.000032s : 0.09% optimize.opt_a.recompute_prepare : 0.000016s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000169s : 0.47% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.04% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.04% optimize.opt_a.merge_send_recv : 0.000015s : 0.04% optimize.opt_a.auto_parallel : 0.000015s : 0.04% optimize.opt_a.parallel : 0.000026s : 0.07% optimize.opt_a.flash_sp : 0.000013s : 0.03% optimize.opt_a.merge_comm : 0.000008s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.04% optimize.opt_a.virtual_dataset : 0.000014s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.04% optimize.opt_a.virtual_output : 0.000031s : 0.09% optimize.opt_a.merge_forward : 0.000009s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000018s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000021s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000189s : 0.52% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000032s : 0.09% optimize.opt_a.a_after_grad : 0.000021s : 0.06% optimize.opt_a.renormalize : 0.000831s : 2.30% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.08% optimize.opt_a.cse : 0.000047s : 0.13% optimize.opt_a.a_3 : 0.000093s : 0.26% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000042s : 0.12% optimize.convert_after_rewriter : 0.000007s : 0.02% optimize.order_py_execute_after_rewriter : 0.000005s : 0.01% optimize.mutable_eliminate : 0.000725s : 2.01% optimize.opt_b.b_1 : 0.000128s : 0.35% optimize.opt_b.b_2 : 0.000009s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.05% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000033s : 0.09% optimize.loop_unroll : 0.000492s : 1.36% optimize.opt_after_cconv.c_1 : 0.000034s : 0.09% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000020s : 0.06% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.04% optimize.tuple_transform.d_1 : 0.000049s : 0.14% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000056s : 0.16% optimize.cse_after_recomputation.cse : 0.000012s : 0.03% optimize.environ_conv : 0.000005s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000022s : 0.06% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000017s : 0.05% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000520s : 1.44% validate : 0.000045s : 0.12% Time group info: ------[substitution.] 0.000225 34 15.95% : 0.000036s : 6: substitution.arithmetic_simplify 0.81% : 0.000002s : 2: substitution.elim_not_effective 0.59% : 0.000001s : 2: substitution.fold_const_symbol 3.05% : 0.000007s : 4: substitution.graph_param_transform 67.22% : 0.000151s : 4: substitution.inline 1.87% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.19% : 0.000005s : 4: substitution.remove_not_recompute_node 2.64% : 0.000006s : 4: substitution.replace_old_param 5.69% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.030174 2 97.69% : 0.029477s : 1: type_inference.infer 2.31% : 0.000697s : 1: type_inference.specialize ------[replace.] 0.000064 8 62.15% : 0.000040s : 4: replace.inline 37.85% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000159 8 93.19% : 0.000148s : 4: match.inline 6.81% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000221 1278 1.09% : 0.000002s : 13: predicate.accumulaten_eliminater 1.04% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 8: predicate.addn_check_dump 1.02% : 0.000002s : 13: predicate.addn_zero_filter 0.81% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.83% : 0.000006s : 21: predicate.arithmetic_simplify 0.92% : 0.000002s : 13: predicate.cast_eliminate 0.68% : 0.000002s : 8: predicate.check_bprop_eliminate 0.53% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.56% : 0.000001s : 8: predicate.depend_value_elim 0.83% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.97% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.18% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.34% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.11% : 0.000002s : 17: predicate.environ_get_depend_swap 1.55% : 0.000003s : 25: predicate.environ_get_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.37% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.32% : 0.000005s : 21: predicate.float_depend_g_call 0.53% : 0.000001s : 8: predicate.float_environ_get_switch 0.69% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.67% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 6.34% : 0.000014s : 58: predicate.inline 0.85% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 8: predicate.less_batch_normalization 1.93% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.45% : 0.000005s : 38: predicate.load_eliminater 1.26% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.23% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.61% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 13: predicate.minmaximum_grad 1.04% : 0.000002s : 4: predicate.mutable_eliminate 0.33% : 0.000001s : 4: predicate.opt_reshape 0.53% : 0.000001s : 4: predicate.parallel_virtual_node 1.76% : 0.000004s : 21: predicate.partial_defer_inline 1.57% : 0.000003s : 21: predicate.partial_eliminate 0.87% : 0.000002s : 13: predicate.print_const_string_wrapper 0.75% : 0.000002s : 8: predicate.reduce_all_const_elim 1.20% : 0.000003s : 13: predicate.reduce_eliminate 2.44% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 8: predicate.remove_not_recompute_node 1.42% : 0.000003s : 25: predicate.replace_applicator 0.62% : 0.000001s : 8: predicate.replace_old_param 0.36% : 0.000001s : 4: predicate.reset_defer_inline 0.97% : 0.000002s : 13: predicate.reshape_eliminate 0.92% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 4: predicate.row_tensor_eliminate 0.94% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.70% : 0.000002s : 8: predicate.shard_identity_eliminate 0.61% : 0.000001s : 8: predicate.special_op_eliminate 0.76% : 0.000002s : 8: predicate.specialize_transform 1.10% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.52% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.66% : 0.000004s : 21: predicate.switch_defer_inline 1.94% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.72% : 0.000010s : 67: predicate.switch_simplify 0.86% : 0.000002s : 13: predicate.tile_eliminate 0.88% : 0.000002s : 13: predicate.transpose_eliminate 1.57% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.43% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.92% : 0.000006s : 33: predicate.tuple_list_get_item_eliminator 1.38% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.61% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.37% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.95% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.72% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.75% : 0.000002s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.65% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000629 11 57.11% : 0.000359s : 5: func_graph_cloner_run.FuncGraphClonerGraph 42.89% : 0.000270s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.052066 192 0.01% : 0.000004s : 1: ForceFp32Comm 6.79% : 0.003534s : 1: add_attr 6.77% : 0.003522s : 1: add_attr_with_inline 0.01% : 0.000005s : 1: add_comm_op_reuse_tag 0.12% : 0.000060s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.13% : 0.000066s : 1: auto_monad 0.04% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.89% : 0.000464s : 1: bootstrap 0.07% : 0.000037s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000019s : 1: control_data_broadcast_order 0.02% : 0.000010s : 1: convert_after_rewriter 0.05% : 0.000026s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.05% : 0.000025s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.96% : 0.000502s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.41% : 0.000736s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.04% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000019s : 1: opt.transform.mutable_eliminate 2.43% : 0.001268s : 78: opt.transform.opt_a 0.06% : 0.000032s : 1: opt.transform.opt_after_cconv 0.05% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000103s : 28: opt.transform.opt_b 0.10% : 0.000054s : 2: opt.transform.opt_trans_graph 0.07% : 0.000037s : 4: opt.transform.symbol_engine_opt 6.18% : 0.003218s : 1: opt_a 0.23% : 0.000119s : 1: opt_after_cconv 1.02% : 0.000531s : 1: opt_after_jit_grad 0.43% : 0.000222s : 1: opt_b 10.77% : 0.005608s : 1: optimize 0.04% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000008s : 1: order_py_execute_after_rewriter 0.05% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000040s : 1: pre_auto_parallel 0.06% : 0.000031s : 1: py_interpret_to_execute 0.03% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000018s : 1: remove_dup_value 0.87% : 0.000455s : 1: renormalize.infer 0.70% : 0.000366s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000046s : 1: rewriter_after_opt_a 0.18% : 0.000092s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.16% : 0.000082s : 1: symbol_engine_optimizer 0.17% : 0.000088s : 1: tuple_transform 58.10% : 0.030252s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:26.123.504 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:26.123.776 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.15487, [21] [bootstrap]: 0.00051053 [type_inference]: 0.109302 [event_method]: 2.47e-05 [auto_monad]: 7.203e-05 [graph_reusing]: 6.15002e-06 [inline]: 2.76999e-06 [add_attr]: 0.00432595, [1] [add_attr_with_inline]: 0.00431129, [1] [Cycle 1]: 0.00010663, [2] [tag_attr]: 2.836e-05 [meta_addattr_fg_expand]: 6.16e-06 [parallel-infer-symbol]: 4.60001e-06 [pre_auto_parallel]: 4.649e-05 [insert-virtual-dataset]: 2.41e-06 [parallel-infer-symbol-second]: 7.79983e-07 [dataset_repeat_opt]: 2.07001e-06 [pipeline_split]: 1.96e-06 [optimize]: 0.0388706, [53] [py_interpret_to_execute]: 3.994e-05 [rewriter_before_opt_a]: 0.00010454 [opt_a]: 0.0361495, [2] [Cycle 1]: 0.0351923, [45] [expand_dump_flag]: 3.51001e-06 [switch_simplify]: 4.471e-05 [loop_unroll]: 3.021e-05 [a_1]: 0.00072684 [with_stream_mark]: 2.585e-05 [recompute_prepare]: 1.457e-05 [updatestate_depend_eliminate]: 5.17e-06 [updatestate_assign_eliminate]: 3.73999e-06 [updatestate_loads_eliminate]: 2.98998e-06 [parameter_eliminate]: 2.34999e-06 [a_2]: 0.00036469 [accelerated_algorithm]: 1.122e-05 [shard]: 3.09999e-06 [meta_shard_fg_expand]: 3.67002e-06 [shard_inline]: 8.04997e-06 [merge_send_recv]: 1.294e-05 [auto_parallel]: 1.4e-05 [parallel]: 3.856e-05 [flash_sp]: 2.2e-05 [merge_comm]: 1.763e-05 [allreduce_fusion]: 4.40999e-06 [matmul_add_comm_reduction]: 1.372e-05 [allreduce_slice_to_reducescatter]: 8.79983e-07 [virtual_shard_identity]: 3.504e-05 [virtual_dataset]: 8.97e-06 [get_grad_eliminate_]: 7.78001e-06 [virtual_output]: 8.02998e-06 [merge_forward]: 5.29e-06 [cell_reuse_recompute_pass]: 3.36001e-06 [offload_activation]: 1.057e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.003e-05 [merge_recompute_call_nodes]: 1.60001e-06 [before_grad]: 1.323e-05 [set_forward_comm_id_for_comm_node_pass]: 4.32e-06 [meta_fg_expand]: 5.95002e-06 [flash_sp_send_recv_attached]: 2.70002e-06 [receive_attached]: 2.68e-06 [after_resolve]: 1.643e-05 [a_after_grad]: 1.163e-05 [renormalize]: 0.00087748 [add_forward_monad_depend]: 8.80999e-06 [auto_monad_grad]: 2.34001e-06 [auto_monad_eliminator]: 2e-05 [cse]: 2.869e-05 [a_3]: 6.855e-05 [Cycle 2]: 0.00093782, [45] [expand_dump_flag]: 2.24999e-06 [switch_simplify]: 8.39002e-06 [loop_unroll]: 6.51e-06 [a_1]: 0.00016257 [with_stream_mark]: 1.842e-05 [recompute_prepare]: 6.71999e-06 [updatestate_depend_eliminate]: 3.36001e-06 [updatestate_assign_eliminate]: 3.56999e-06 [updatestate_loads_eliminate]: 2.74001e-06 [parameter_eliminate]: 1.72001e-06 [a_2]: 0.00010804 [accelerated_algorithm]: 7.14001e-06 [shard]: 2.27001e-06 [meta_shard_fg_expand]: 2.04999e-06 [shard_inline]: 6.64001e-06 [merge_send_recv]: 8.47998e-06 [auto_parallel]: 9.61998e-06 [parallel]: 6.59001e-06 [flash_sp]: 4.48999e-06 [merge_comm]: 3.64002e-06 [allreduce_fusion]: 4.23001e-06 [matmul_add_comm_reduction]: 8.31002e-06 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 7.51001e-06 [virtual_dataset]: 6.63e-06 [get_grad_eliminate_]: 5.87999e-06 [virtual_output]: 6.28e-06 [merge_forward]: 3.91001e-06 [cell_reuse_recompute_pass]: 3.05002e-06 [offload_activation]: 9.17001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.629e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.158e-05 [set_forward_comm_id_for_comm_node_pass]: 4e-06 [meta_fg_expand]: 2.26e-06 [flash_sp_send_recv_attached]: 1.14998e-06 [receive_attached]: 1.69998e-06 [after_resolve]: 1.452e-05 [a_after_grad]: 1.015e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.36e-06 [auto_monad_grad]: 1.82001e-06 [auto_monad_eliminator]: 1.077e-05 [cse]: 1.725e-05 [a_3]: 5.215e-05 [py_interpret_to_execute_after_opt_a]: 1.719e-05 [slice_cell_reuse_recomputed_activation]: 5.04e-06 [rewriter_after_opt_a]: 4.747e-05 [convert_after_rewriter]: 1.046e-05 [order_py_execute_after_rewriter]: 8.13999e-06 [mutable_eliminate]: 0.00074389 [opt_b]: 0.00028768, [1] [Cycle 1]: 0.00027618, [7] [b_1]: 0.00017248 [b_2]: 8.40001e-06 [updatestate_depend_eliminate]: 8.07e-06 [updatestate_assign_eliminate]: 2.76999e-06 [updatestate_loads_eliminate]: 2.69999e-06 [renormalize]: 9.09989e-07 [cse]: 2.325e-05 [optimize_parallel_all_gather_comm]: 2.155e-05 [overlap_param_gather]: 5.04e-06 [cconv]: 3.581e-05 [loop_unroll]: 0.00052674 [opt_after_cconv]: 0.00014107, [1] [Cycle 1]: 0.00013116, [7] [c_1]: 3.419e-05 [parameter_eliminate]: 3.86001e-06 [updatestate_depend_eliminate]: 6.74001e-06 [updatestate_assign_eliminate]: 2.93998e-06 [updatestate_loads_eliminate]: 2.22001e-06 [cse]: 2.268e-05 [renormalize]: 8.00006e-07 [remove_dup_value]: 1.686e-05 [tuple_transform]: 9.557e-05, [1] [Cycle 1]: 8.756e-05, [4] [d_1]: 4.784e-05 [none_parameter_eliminate]: 1.62001e-06 [renormalize]: 4.69998e-07 [switch_simplify]: 6.76e-06 [partial_unused_args_eliminate]: 4.73001e-06 [add_recomputation]: 5.78e-05 [cse_after_recomputation]: 2.982e-05, [1] [Cycle 1]: 2.164e-05, [1] [cse]: 1.211e-05 [environ_conv]: 9.35001e-06 [swap_dp_allreduce_reducescatter]: 8.23999e-06 [bias_add_comm_swap]: 5.40001e-06 [label_micro_interleaved_index]: 8.2e-06 [label_fine_grained_interleaved_index]: 5.57999e-06 [merge_cast_opt]: 3.87002e-06 [slice_recompute_activation]: 4.48999e-06 [micro_interleaved_order_control]: 5.24998e-06 [assign_add_opt]: 3.86999e-06 [ForceFp32Comm]: 3.33998e-06 [remove_cast_before_assign_add]: 3.66001e-06 [full_micro_interleaved_order_control]: 5.01002e-06 [reorder_send_recv_between_fp_bp]: 5.62999e-06 [comm_op_add_attrs]: 3.43e-06 [add_comm_op_reuse_tag]: 3.36999e-06 [interleave_split_concat_branches]: 3.54002e-06 [interleave_parallel_branches]: 3.51001e-06 [overlap_opt_shard_in_pipeline]: 3.4e-06 [overlap_opt_shard_grad_in_pipeline]: 3.95e-06 [control_data_broadcast_order]: 1.734e-05 [grouped_pairwise_exchange_alltoall]: 4.75999e-06 [offloading_packed_experts]: 7.71999e-06 [overlap_recompute_and_grad_model_parallel]: 7.92998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.88001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.86999e-06 [overlap_recompute_comm]: 4.84e-06 [overlap_grad_ring_attention]: 6.65998e-06 [overlap_grad_flash_sp]: 2.534e-05 [begin_end_overlap_inline]: 3.21999e-06 [split_matmul_comm_elemetwise]: 4.18999e-06 [split_layernorm_comm]: 4.19997e-06 [handle_group_info]: 3.58999e-06 [symbol_engine_optimizer]: 0.00010694, [1] [Cycle 1]: 9.906e-05, [6] [build]: 3.63e-06 [elim_shapecalc]: 1.271e-05 [elim_not_effective]: 1.392e-05 [opt_reshape]: 7.21001e-06 [fold_const_symbol]: 1.086e-05 [renormalize]: 2.3999e-07 [detach_backward]: 5.57999e-06 [pipeline_parallel_scheduler]: 1.81998e-06 [auto_monad_reorder]: 2.447e-05 [get_jit_bprop_graph]: 2.18002e-06 [rewriter_after_jit_bprop_graph]: 6.91999e-06 [opt_after_jit_grad]: 0.00063143 [validate]: 4.71e-05 Sums bootstrap : 0.000511s : 0.44% type_inference : 0.109302s : 94.14% event_method : 0.000025s : 0.02% auto_monad : 0.000072s : 0.06% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000005s : 0.00% pre_auto_parallel : 0.000046s : 0.04% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000040s : 0.03% optimize.rewriter_before_opt_a : 0.000105s : 0.09% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000053s : 0.05% optimize.opt_a.loop_unroll : 0.000037s : 0.03% optimize.opt_a.a_1 : 0.000889s : 0.77% optimize.opt_a.with_stream_mark : 0.000044s : 0.04% optimize.opt_a.recompute_prepare : 0.000021s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000473s : 0.41% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.02% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.00% optimize.opt_a.shard_inline : 0.000015s : 0.01% optimize.opt_a.merge_send_recv : 0.000021s : 0.02% optimize.opt_a.auto_parallel : 0.000024s : 0.02% optimize.opt_a.parallel : 0.000045s : 0.04% optimize.opt_a.flash_sp : 0.000026s : 0.02% optimize.opt_a.merge_comm : 0.000021s : 0.02% optimize.opt_a.allreduce_fusion : 0.000009s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000043s : 0.04% optimize.opt_a.virtual_dataset : 0.000016s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.01% optimize.opt_a.virtual_output : 0.000014s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.01% optimize.opt_a.offload_activation : 0.000020s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000046s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000025s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000008s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000031s : 0.03% optimize.opt_a.a_after_grad : 0.000022s : 0.02% optimize.opt_a.renormalize : 0.000878s : 0.76% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.03% optimize.opt_a.cse : 0.000046s : 0.04% optimize.opt_a.a_3 : 0.000121s : 0.10% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000047s : 0.04% optimize.convert_after_rewriter : 0.000010s : 0.01% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000744s : 0.64% optimize.opt_b.b_1 : 0.000172s : 0.15% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000023s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.02% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000036s : 0.03% optimize.loop_unroll : 0.000527s : 0.45% optimize.opt_after_cconv.c_1 : 0.000034s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000023s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.01% optimize.tuple_transform.d_1 : 0.000048s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000058s : 0.05% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000005s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000004s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000003s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000025s : 0.02% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000024s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000631s : 0.54% validate : 0.000047s : 0.04% Time group info: ------[substitution.] 0.000284 34 16.32% : 0.000046s : 6: substitution.arithmetic_simplify 0.70% : 0.000002s : 2: substitution.elim_not_effective 0.47% : 0.000001s : 2: substitution.fold_const_symbol 2.36% : 0.000007s : 4: substitution.graph_param_transform 66.91% : 0.000190s : 4: substitution.inline 1.99% : 0.000006s : 4: substitution.j_node_and_user_rematch 2.09% : 0.000006s : 4: substitution.remove_not_recompute_node 3.08% : 0.000009s : 4: substitution.replace_old_param 6.07% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.109232 2 99.14% : 0.108290s : 1: type_inference.infer 0.86% : 0.000942s : 1: type_inference.specialize ------[replace.] 0.000069 8 63.28% : 0.000044s : 4: replace.inline 36.72% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000202 8 92.53% : 0.000187s : 4: match.inline 7.47% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000239 1278 1.07% : 0.000003s : 13: predicate.accumulaten_eliminater 0.95% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 1.02% : 0.000002s : 13: predicate.addn_zero_filter 0.76% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.79% : 0.000007s : 21: predicate.arithmetic_simplify 1.08% : 0.000003s : 13: predicate.cast_eliminate 0.54% : 0.000001s : 8: predicate.check_bprop_eliminate 0.59% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 0.90% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.10% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.89% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 4: predicate.elim_not_effective 0.33% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.23% : 0.000003s : 17: predicate.environ_get_depend_swap 1.55% : 0.000004s : 25: predicate.environ_get_eliminate 1.08% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.37% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.24% : 0.000005s : 21: predicate.float_depend_g_call 0.69% : 0.000002s : 8: predicate.float_environ_get_switch 0.85% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.66% : 0.000002s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.67% : 0.000002s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 6.03% : 0.000014s : 58: predicate.inline 0.79% : 0.000002s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.06% : 0.000003s : 8: predicate.less_batch_normalization 1.77% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.32% : 0.000006s : 38: predicate.load_eliminater 1.00% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.19% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.59% : 0.000001s : 8: predicate.merge_addn 0.50% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 13: predicate.minmaximum_grad 1.25% : 0.000003s : 4: predicate.mutable_eliminate 0.27% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 1.84% : 0.000004s : 21: predicate.partial_defer_inline 1.39% : 0.000003s : 21: predicate.partial_eliminate 0.82% : 0.000002s : 13: predicate.print_const_string_wrapper 0.70% : 0.000002s : 8: predicate.reduce_all_const_elim 1.14% : 0.000003s : 13: predicate.reduce_eliminate 2.40% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.65% : 0.000002s : 8: predicate.remove_not_recompute_node 1.22% : 0.000003s : 25: predicate.replace_applicator 0.75% : 0.000002s : 8: predicate.replace_old_param 0.44% : 0.000001s : 4: predicate.reset_defer_inline 1.20% : 0.000003s : 13: predicate.reshape_eliminate 0.73% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 4: predicate.row_tensor_eliminate 0.74% : 0.000002s : 8: predicate.same_eliminate 0.61% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.48% : 0.000004s : 8: predicate.shard_identity_eliminate 0.75% : 0.000002s : 8: predicate.special_op_eliminate 0.90% : 0.000002s : 8: predicate.specialize_transform 1.18% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.64% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.31% : 0.000003s : 21: predicate.switch_defer_inline 2.07% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.99% : 0.000012s : 67: predicate.switch_simplify 0.80% : 0.000002s : 13: predicate.tile_eliminate 1.07% : 0.000003s : 13: predicate.transpose_eliminate 1.39% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.85% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.33% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.01% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.36% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.62% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.33% : 0.000006s : 38: predicate.updatestate_pure_node_eliminater 2.76% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.90% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.63% : 0.000002s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000738 11 52.46% : 0.000387s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.54% : 0.000351s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.200711 192 0.00% : 0.000006s : 1: ForceFp32Comm 2.16% : 0.004339s : 1: add_attr 2.15% : 0.004316s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.03% : 0.000061s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.04% : 0.000082s : 1: auto_monad 0.02% : 0.000033s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.28% : 0.000564s : 1: bootstrap 0.02% : 0.000039s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000021s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.02% : 0.000033s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000032s : 1: detach_backward 0.01% : 0.000013s : 1: environ_conv 0.02% : 0.000037s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000012s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000010s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 0.27% : 0.000534s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.37% : 0.000751s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000019s : 1: opt.transform.mutable_eliminate 0.81% : 0.001628s : 78: opt.transform.opt_a 0.02% : 0.000033s : 1: opt.transform.opt_after_cconv 0.02% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.05% : 0.000106s : 28: opt.transform.opt_b 0.03% : 0.000052s : 2: opt.transform.opt_trans_graph 0.02% : 0.000040s : 4: opt.transform.symbol_engine_opt 18.01% : 0.036153s : 1: opt_a 0.07% : 0.000145s : 1: opt_after_cconv 0.32% : 0.000644s : 1: opt_after_jit_grad 0.15% : 0.000291s : 1: opt_b 19.70% : 0.039540s : 1: optimize 0.01% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000011s : 1: order_py_execute_after_rewriter 0.01% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000012s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.03% : 0.000055s : 1: pre_auto_parallel 0.02% : 0.000044s : 1: py_interpret_to_execute 0.01% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 0.23% : 0.000465s : 1: renormalize.infer 0.20% : 0.000401s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000052s : 1: rewriter_after_opt_a 0.05% : 0.000109s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000110s : 1: symbol_engine_optimizer 0.05% : 0.000098s : 1: tuple_transform 54.49% : 0.109363s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:27.574.475 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.172298, [21] [bootstrap]: 0.00052374 [type_inference]: 0.0960601 [event_method]: 2.205e-05 [auto_monad]: 6.834e-05 [graph_reusing]: 6.24999e-06 [inline]: 2.84999e-06 [add_attr]: 0.00444109, [1] [add_attr_with_inline]: 0.00442421, [1] [Cycle 1]: 9.255e-05, [2] [tag_attr]: 2.805e-05 [meta_addattr_fg_expand]: 6.37001e-06 [parallel-infer-symbol]: 4.2e-06 [pre_auto_parallel]: 4.861e-05 [insert-virtual-dataset]: 3.23e-06 [parallel-infer-symbol-second]: 1.00999e-06 [dataset_repeat_opt]: 2.11e-06 [pipeline_split]: 1.86e-06 [optimize]: 0.0702216, [53] [py_interpret_to_execute]: 3.963e-05 [rewriter_before_opt_a]: 0.00010728 [opt_a]: 0.0676624, [2] [Cycle 1]: 0.00262302, [45] [expand_dump_flag]: 3.44001e-06 [switch_simplify]: 5.197e-05 [loop_unroll]: 3.394e-05 [a_1]: 0.00078862 [with_stream_mark]: 2.602e-05 [recompute_prepare]: 1.348e-05 [updatestate_depend_eliminate]: 5.76e-06 [updatestate_assign_eliminate]: 3.51001e-06 [updatestate_loads_eliminate]: 3.38999e-06 [parameter_eliminate]: 2.93e-06 [a_2]: 0.00010328 [accelerated_algorithm]: 9.13002e-06 [shard]: 2.51e-06 [meta_shard_fg_expand]: 2.84001e-06 [shard_inline]: 7.59002e-06 [merge_send_recv]: 1.064e-05 [auto_parallel]: 1.016e-05 [parallel]: 2.153e-05 [flash_sp]: 1.01e-05 [merge_comm]: 5.04e-06 [allreduce_fusion]: 3.8e-06 [matmul_add_comm_reduction]: 1.235e-05 [allreduce_slice_to_reducescatter]: 8.79983e-07 [virtual_shard_identity]: 1.193e-05 [virtual_dataset]: 9.46e-06 [get_grad_eliminate_]: 7.85e-06 [virtual_output]: 7.92e-06 [merge_forward]: 5.20001e-06 [cell_reuse_recompute_pass]: 1.77001e-06 [offload_activation]: 1.164e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.676e-05 [merge_recompute_call_nodes]: 1.57001e-06 [before_grad]: 1.339e-05 [set_forward_comm_id_for_comm_node_pass]: 4.24002e-06 [meta_fg_expand]: 3.98001e-06 [flash_sp_send_recv_attached]: 3.27002e-06 [receive_attached]: 2.57001e-06 [after_resolve]: 1.459e-05 [a_after_grad]: 1.209e-05 [renormalize]: 0.00092459 [add_forward_monad_depend]: 1.004e-05 [auto_monad_grad]: 2.82002e-06 [auto_monad_eliminator]: 1.863e-05 [cse]: 3.074e-05 [a_3]: 6.079e-05 [Cycle 2]: 0.0650241, [45] [expand_dump_flag]: 2.22001e-06 [switch_simplify]: 9.09e-06 [loop_unroll]: 7.35e-06 [a_1]: 0.00016492 [with_stream_mark]: 1.747e-05 [recompute_prepare]: 6.76999e-06 [updatestate_depend_eliminate]: 3.75998e-06 [updatestate_assign_eliminate]: 2.84999e-06 [updatestate_loads_eliminate]: 3.98999e-06 [parameter_eliminate]: 1.71e-06 [a_2]: 8.164e-05 [accelerated_algorithm]: 7.00002e-06 [shard]: 1.67001e-06 [meta_shard_fg_expand]: 2.29999e-06 [shard_inline]: 6.53998e-06 [merge_send_recv]: 8.45999e-06 [auto_parallel]: 5.001e-05 [parallel]: 1.424e-05 [flash_sp]: 7.05e-06 [merge_comm]: 8.88002e-06 [allreduce_fusion]: 4.58999e-06 [matmul_add_comm_reduction]: 1.396e-05 [allreduce_slice_to_reducescatter]: 9.49978e-07 [virtual_shard_identity]: 3.436e-05 [virtual_dataset]: 9.47001e-06 [get_grad_eliminate_]: 7.43999e-06 [virtual_output]: 7.78001e-06 [merge_forward]: 4.90001e-06 [cell_reuse_recompute_pass]: 3.86999e-06 [offload_activation]: 1.165e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.741e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.164e-05 [set_forward_comm_id_for_comm_node_pass]: 4.38999e-06 [meta_fg_expand]: 3.89997e-06 [flash_sp_send_recv_attached]: 1.70001e-06 [receive_attached]: 2.34001e-06 [after_resolve]: 1.382e-05 [a_after_grad]: 1.101e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 7.29001e-06 [auto_monad_grad]: 3.22002e-06 [auto_monad_eliminator]: 1.891e-05 [cse]: 3.631e-05 [a_3]: 4.571e-05 [py_interpret_to_execute_after_opt_a]: 2.058e-05 [slice_cell_reuse_recomputed_activation]: 2.15002e-06 [rewriter_after_opt_a]: 4.287e-05 [convert_after_rewriter]: 7.31001e-06 [order_py_execute_after_rewriter]: 5.50001e-06 [mutable_eliminate]: 0.00077954 [opt_b]: 0.00023632, [1] [Cycle 1]: 0.00022716, [7] [b_1]: 0.00013611 [b_2]: 9.51e-06 [updatestate_depend_eliminate]: 9.28002e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 2.98e-06 [renormalize]: 5.69999e-07 [cse]: 2.793e-05 [optimize_parallel_all_gather_comm]: 1.896e-05 [overlap_param_gather]: 2.30002e-06 [cconv]: 3.742e-05 [loop_unroll]: 0.00053279 [opt_after_cconv]: 0.00011623, [1] [Cycle 1]: 0.000109, [7] [c_1]: 3.508e-05 [parameter_eliminate]: 5.57001e-06 [updatestate_depend_eliminate]: 7.06001e-06 [updatestate_assign_eliminate]: 2.56998e-06 [updatestate_loads_eliminate]: 2.23998e-06 [cse]: 2.15e-05 [renormalize]: 6.40022e-07 [remove_dup_value]: 1.423e-05 [tuple_transform]: 8.163e-05, [1] [Cycle 1]: 7.641e-05, [4] [d_1]: 4.928e-05 [none_parameter_eliminate]: 1.89e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.76999e-06 [partial_unused_args_eliminate]: 1.83002e-06 [add_recomputation]: 5.702e-05 [cse_after_recomputation]: 2.29e-05, [1] [Cycle 1]: 1.796e-05, [1] [cse]: 1.213e-05 [environ_conv]: 6.16e-06 [swap_dp_allreduce_reducescatter]: 4.97e-06 [bias_add_comm_swap]: 3.14001e-06 [label_micro_interleaved_index]: 5.58002e-06 [label_fine_grained_interleaved_index]: 3.02002e-06 [merge_cast_opt]: 1.51002e-06 [slice_recompute_activation]: 2.40002e-06 [micro_interleaved_order_control]: 3.53999e-06 [assign_add_opt]: 1.39e-06 [ForceFp32Comm]: 9.50007e-07 [remove_cast_before_assign_add]: 1.12e-06 [full_micro_interleaved_order_control]: 2.69999e-06 [reorder_send_recv_between_fp_bp]: 2.89999e-06 [comm_op_add_attrs]: 1.03001e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.19e-06 [interleave_parallel_branches]: 1.24e-06 [overlap_opt_shard_in_pipeline]: 1.17e-06 [overlap_opt_shard_grad_in_pipeline]: 2.06998e-06 [control_data_broadcast_order]: 1.334e-05 [grouped_pairwise_exchange_alltoall]: 1.54998e-06 [offloading_packed_experts]: 3.90998e-06 [overlap_recompute_and_grad_model_parallel]: 4.42e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.13001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.16e-06 [overlap_grad_ring_attention]: 4.53999e-06 [overlap_grad_flash_sp]: 2.154e-05 [begin_end_overlap_inline]: 4.69998e-07 [split_matmul_comm_elemetwise]: 2.22999e-06 [split_layernorm_comm]: 1.84e-06 [handle_group_info]: 1.19e-06 [symbol_engine_optimizer]: 8.237e-05, [1] [Cycle 1]: 7.781e-05, [6] [build]: 3.45e-06 [elim_shapecalc]: 1.052e-05 [elim_not_effective]: 1.465e-05 [opt_reshape]: 7.25e-06 [fold_const_symbol]: 1.172e-05 [renormalize]: 2.3999e-07 [detach_backward]: 2.24001e-06 [pipeline_parallel_scheduler]: 1.85001e-06 [auto_monad_reorder]: 1.764e-05 [get_jit_bprop_graph]: 2.00002e-06 [rewriter_after_jit_bprop_graph]: 5.79e-06 [opt_after_jit_grad]: 0.00059604 [validate]: 4.763e-05 Sums bootstrap : 0.000524s : 0.51% type_inference : 0.096060s : 93.61% event_method : 0.000022s : 0.02% auto_monad : 0.000068s : 0.07% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000049s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000040s : 0.04% optimize.rewriter_before_opt_a : 0.000107s : 0.10% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000061s : 0.06% optimize.opt_a.loop_unroll : 0.000041s : 0.04% optimize.opt_a.a_1 : 0.000954s : 0.93% optimize.opt_a.with_stream_mark : 0.000043s : 0.04% optimize.opt_a.recompute_prepare : 0.000020s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000185s : 0.18% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.01% optimize.opt_a.merge_send_recv : 0.000019s : 0.02% optimize.opt_a.auto_parallel : 0.000060s : 0.06% optimize.opt_a.parallel : 0.000036s : 0.03% optimize.opt_a.flash_sp : 0.000017s : 0.02% optimize.opt_a.merge_comm : 0.000014s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000046s : 0.05% optimize.opt_a.virtual_dataset : 0.000019s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.01% optimize.opt_a.virtual_output : 0.000016s : 0.02% optimize.opt_a.merge_forward : 0.000010s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000044s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000025s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.01% optimize.opt_a.meta_fg_expand : 0.000008s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000028s : 0.03% optimize.opt_a.a_after_grad : 0.000023s : 0.02% optimize.opt_a.renormalize : 0.000925s : 0.90% optimize.opt_a.add_forward_monad_depend : 0.000017s : 0.02% optimize.opt_a.auto_monad_grad : 0.000006s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000038s : 0.04% optimize.opt_a.cse : 0.000067s : 0.07% optimize.opt_a.a_3 : 0.000106s : 0.10% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000043s : 0.04% optimize.convert_after_rewriter : 0.000007s : 0.01% optimize.order_py_execute_after_rewriter : 0.000006s : 0.01% optimize.mutable_eliminate : 0.000780s : 0.76% optimize.opt_b.b_1 : 0.000136s : 0.13% optimize.opt_b.b_2 : 0.000010s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000028s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000037s : 0.04% optimize.loop_unroll : 0.000533s : 0.52% optimize.opt_after_cconv.c_1 : 0.000035s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000022s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000014s : 0.01% optimize.tuple_transform.d_1 : 0.000049s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000057s : 0.06% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000006s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000004s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000022s : 0.02% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000596s : 0.58% validate : 0.000048s : 0.05% Time group info: ------[substitution.] 0.000291 34 15.57% : 0.000045s : 6: substitution.arithmetic_simplify 0.79% : 0.000002s : 2: substitution.elim_not_effective 0.60% : 0.000002s : 2: substitution.fold_const_symbol 2.22% : 0.000006s : 4: substitution.graph_param_transform 67.78% : 0.000197s : 4: substitution.inline 1.92% : 0.000006s : 4: substitution.j_node_and_user_rematch 2.48% : 0.000007s : 4: substitution.remove_not_recompute_node 2.54% : 0.000007s : 4: substitution.replace_old_param 6.09% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.095942 2 99.19% : 0.095161s : 1: type_inference.infer 0.81% : 0.000781s : 1: type_inference.specialize ------[replace.] 0.000075 8 63.25% : 0.000048s : 4: replace.inline 36.75% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000210 8 92.51% : 0.000194s : 4: match.inline 7.49% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000257 1278 0.93% : 0.000002s : 13: predicate.accumulaten_eliminater 0.62% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.62% : 0.000002s : 8: predicate.addn_check_dump 0.93% : 0.000002s : 13: predicate.addn_zero_filter 0.82% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.64% : 0.000007s : 21: predicate.arithmetic_simplify 1.05% : 0.000003s : 13: predicate.cast_eliminate 0.55% : 0.000001s : 8: predicate.check_bprop_eliminate 0.60% : 0.000002s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.75% : 0.000002s : 8: predicate.depend_value_elim 1.00% : 0.000003s : 13: predicate.dict_get_item_const_eliminator 1.03% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.16% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 4: predicate.elim_not_effective 0.33% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 17: predicate.environ_get_depend_swap 1.68% : 0.000004s : 25: predicate.environ_get_eliminate 1.10% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.42% : 0.000004s : 21: predicate.exchange_switch_depend_value 2.34% : 0.000006s : 21: predicate.float_depend_g_call 0.56% : 0.000001s : 8: predicate.float_environ_get_switch 0.73% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.81% : 0.000002s : 8: predicate.get_grad_eliminate 0.16% : 0.000000s : 4: predicate.graph_param_transform 0.48% : 0.000001s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 5.99% : 0.000015s : 58: predicate.inline 0.77% : 0.000002s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.09% : 0.000003s : 8: predicate.less_batch_normalization 1.75% : 0.000005s : 25: predicate.list_to_tuple_eliminator_ 2.53% : 0.000006s : 38: predicate.load_eliminater 0.84% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.35% : 0.000006s : 34: predicate.loop_unroll_before_grad 1.43% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.48% : 0.000001s : 8: predicate.merge_addn 0.64% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.63% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 13: predicate.minmaximum_grad 1.22% : 0.000003s : 4: predicate.mutable_eliminate 0.33% : 0.000001s : 4: predicate.opt_reshape 0.28% : 0.000001s : 4: predicate.parallel_virtual_node 2.04% : 0.000005s : 21: predicate.partial_defer_inline 1.41% : 0.000004s : 21: predicate.partial_eliminate 0.87% : 0.000002s : 13: predicate.print_const_string_wrapper 0.68% : 0.000002s : 8: predicate.reduce_all_const_elim 1.27% : 0.000003s : 13: predicate.reduce_eliminate 2.52% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000001s : 8: predicate.remove_not_recompute_node 1.25% : 0.000003s : 25: predicate.replace_applicator 0.60% : 0.000002s : 8: predicate.replace_old_param 0.46% : 0.000001s : 4: predicate.reset_defer_inline 1.03% : 0.000003s : 13: predicate.reshape_eliminate 0.80% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.55% : 0.000001s : 4: predicate.row_tensor_eliminate 0.79% : 0.000002s : 8: predicate.same_eliminate 0.49% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.29% : 0.000003s : 8: predicate.shard_identity_eliminate 0.60% : 0.000002s : 8: predicate.special_op_eliminate 0.58% : 0.000001s : 8: predicate.specialize_transform 1.20% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.55% : 0.000004s : 21: predicate.switch_defer_inline 1.88% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.94% : 0.000013s : 67: predicate.switch_simplify 0.97% : 0.000002s : 13: predicate.tile_eliminate 1.17% : 0.000003s : 13: predicate.transpose_eliminate 1.50% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.38% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.44% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.10% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.62% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.16% : 0.000006s : 38: predicate.updatestate_pure_node_eliminater 2.75% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 1.11% : 0.000003s : 8: predicate.virtual_dataset_eliminate 0.66% : 0.000002s : 8: predicate.virtual_output_eliminate 0.22% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.50% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000680 11 54.91% : 0.000373s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.09% : 0.000307s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.249517 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.78% : 0.004448s : 1: add_attr 1.78% : 0.004429s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.02% : 0.000061s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.03% : 0.000075s : 1: auto_monad 0.01% : 0.000022s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.22% : 0.000561s : 1: bootstrap 0.02% : 0.000041s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000010s : 1: convert_after_rewriter 0.01% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000010s : 1: environ_conv 0.01% : 0.000030s : 1: event_method 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.22% : 0.000543s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000007s : 1: micro_interleaved_order_control 0.32% : 0.000790s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000021s : 1: opt.transform.mutable_eliminate 0.60% : 0.001495s : 78: opt.transform.opt_a 0.01% : 0.000033s : 1: opt.transform.opt_after_cconv 0.01% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000109s : 28: opt.transform.opt_b 0.02% : 0.000054s : 2: opt.transform.opt_trans_graph 0.02% : 0.000039s : 4: opt.transform.symbol_engine_opt 27.12% : 0.067666s : 1: opt_a 0.05% : 0.000120s : 1: opt_after_cconv 0.24% : 0.000607s : 1: opt_after_jit_grad 0.10% : 0.000240s : 1: opt_b 28.15% : 0.070228s : 1: optimize 0.01% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000009s : 1: order_py_execute_after_rewriter 0.01% : 0.000025s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000055s : 1: pre_auto_parallel 0.02% : 0.000045s : 1: py_interpret_to_execute 0.01% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000018s : 1: remove_dup_value 0.19% : 0.000481s : 1: renormalize.infer 0.17% : 0.000429s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000047s : 1: rewriter_after_opt_a 0.05% : 0.000114s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000007s : 1: slice_recompute_activation 0.00% : 0.000006s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000085s : 1: symbol_engine_optimizer 0.03% : 0.000084s : 1: tuple_transform 38.52% : 0.096119s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:28.923.605 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:28.923.892 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0564656, [21] [bootstrap]: 0.00046735 [type_inference]: 0.0437116 [event_method]: 2.183e-05 [auto_monad]: 6.743e-05 [graph_reusing]: 5.69999e-06 [inline]: 3.11001e-06 [add_attr]: 0.00395627, [1] [add_attr_with_inline]: 0.00394425, [1] [Cycle 1]: 9.778e-05, [2] [tag_attr]: 2.525e-05 [meta_addattr_fg_expand]: 6.37001e-06 [parallel-infer-symbol]: 3.81001e-06 [pre_auto_parallel]: 4.189e-05 [insert-virtual-dataset]: 2.71e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 1.87001e-06 [pipeline_split]: 1.74e-06 [optimize]: 0.0060861, [53] [py_interpret_to_execute]: 3.543e-05 [rewriter_before_opt_a]: 9.84e-05 [opt_a]: 0.00344757, [2] [Cycle 1]: 0.00253537, [45] [expand_dump_flag]: 3.18e-06 [switch_simplify]: 4.537e-05 [loop_unroll]: 3.102e-05 [a_1]: 0.00069816 [with_stream_mark]: 2.23e-05 [recompute_prepare]: 1.032e-05 [updatestate_depend_eliminate]: 4.18001e-06 [updatestate_assign_eliminate]: 3.68999e-06 [updatestate_loads_eliminate]: 3.18998e-06 [parameter_eliminate]: 2.15002e-06 [a_2]: 0.00012393 [accelerated_algorithm]: 8.94e-06 [shard]: 2.71e-06 [meta_shard_fg_expand]: 2.41e-06 [shard_inline]: 6.71e-06 [merge_send_recv]: 9.42999e-06 [auto_parallel]: 7.90998e-06 [parallel]: 1.9e-05 [flash_sp]: 9.51998e-06 [merge_comm]: 4.17998e-06 [allreduce_fusion]: 3.5e-06 [matmul_add_comm_reduction]: 1.068e-05 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 9.06002e-06 [virtual_dataset]: 7.31999e-06 [get_grad_eliminate_]: 7.47998e-06 [virtual_output]: 7.4e-06 [merge_forward]: 4.58999e-06 [cell_reuse_recompute_pass]: 1.96998e-06 [offload_activation]: 1.117e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.69e-05 [merge_recompute_call_nodes]: 1.61998e-06 [before_grad]: 1.257e-05 [set_forward_comm_id_for_comm_node_pass]: 3.62002e-06 [meta_fg_expand]: 2.73998e-06 [flash_sp_send_recv_attached]: 3.46999e-06 [receive_attached]: 2.34999e-06 [after_resolve]: 1.141e-05 [a_after_grad]: 1.177e-05 [renormalize]: 0.00084822 [add_forward_monad_depend]: 6.74999e-06 [auto_monad_grad]: 2.39999e-06 [auto_monad_eliminator]: 1.732e-05 [cse]: 3.044e-05 [a_3]: 6.719e-05 [Cycle 2]: 0.00089623, [45] [expand_dump_flag]: 1.82999e-06 [switch_simplify]: 8.08999e-06 [loop_unroll]: 6.71e-06 [a_1]: 0.00014737 [with_stream_mark]: 1.558e-05 [recompute_prepare]: 7.25998e-06 [updatestate_depend_eliminate]: 3.66001e-06 [updatestate_assign_eliminate]: 2.76999e-06 [updatestate_loads_eliminate]: 2.56998e-06 [parameter_eliminate]: 1.20001e-06 [a_2]: 0.00010911 [accelerated_algorithm]: 7.05e-06 [shard]: 1.74e-06 [meta_shard_fg_expand]: 1.72999e-06 [shard_inline]: 6.43998e-06 [merge_send_recv]: 7.08998e-06 [auto_parallel]: 8.50001e-06 [parallel]: 5.28002e-06 [flash_sp]: 4.27e-06 [merge_comm]: 3.67998e-06 [allreduce_fusion]: 3.52002e-06 [matmul_add_comm_reduction]: 8.42e-06 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 7.43999e-06 [virtual_dataset]: 6.29999e-06 [get_grad_eliminate_]: 6.15002e-06 [virtual_output]: 6.46e-06 [merge_forward]: 3.8e-06 [cell_reuse_recompute_pass]: 2.26e-06 [offload_activation]: 8.33001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.501e-05 [merge_recompute_call_nodes]: 1.24e-06 [before_grad]: 9.94999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.39001e-06 [meta_fg_expand]: 2.37001e-06 [flash_sp_send_recv_attached]: 1.43002e-06 [receive_attached]: 2.34001e-06 [after_resolve]: 1.165e-05 [a_after_grad]: 9.98998e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.13002e-06 [auto_monad_grad]: 1.59998e-06 [auto_monad_eliminator]: 1.044e-05 [cse]: 1.641e-05 [a_3]: 5.079e-05 [py_interpret_to_execute_after_opt_a]: 1.612e-05 [slice_cell_reuse_recomputed_activation]: 5.64e-06 [rewriter_after_opt_a]: 4.639e-05 [convert_after_rewriter]: 1.037e-05 [order_py_execute_after_rewriter]: 7.75e-06 [mutable_eliminate]: 0.00072046 [opt_b]: 0.0002801, [1] [Cycle 1]: 0.0002691, [7] [b_1]: 0.0001664 [b_2]: 9.22001e-06 [updatestate_depend_eliminate]: 8.45001e-06 [updatestate_assign_eliminate]: 2.68e-06 [updatestate_loads_eliminate]: 2.81e-06 [renormalize]: 6.40022e-07 [cse]: 2.231e-05 [optimize_parallel_all_gather_comm]: 2.247e-05 [overlap_param_gather]: 5.39e-06 [cconv]: 3.523e-05 [loop_unroll]: 0.0005088 [opt_after_cconv]: 0.00013642, [1] [Cycle 1]: 0.00012689, [7] [c_1]: 3.32e-05 [parameter_eliminate]: 5.04998e-06 [updatestate_depend_eliminate]: 7.50998e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 2.53e-06 [cse]: 2.016e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 1.717e-05 [tuple_transform]: 9.617e-05, [1] [Cycle 1]: 8.828e-05, [4] [d_1]: 4.783e-05 [none_parameter_eliminate]: 1.57999e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 7.74002e-06 [partial_unused_args_eliminate]: 4.87e-06 [add_recomputation]: 5.621e-05 [cse_after_recomputation]: 2.672e-05, [1] [Cycle 1]: 1.989e-05, [1] [cse]: 1.13e-05 [environ_conv]: 8.50001e-06 [swap_dp_allreduce_reducescatter]: 8.18999e-06 [bias_add_comm_swap]: 5.96e-06 [label_micro_interleaved_index]: 7.88999e-06 [label_fine_grained_interleaved_index]: 5.66e-06 [merge_cast_opt]: 3.65e-06 [slice_recompute_activation]: 4.62e-06 [micro_interleaved_order_control]: 4.60999e-06 [assign_add_opt]: 3.83001e-06 [ForceFp32Comm]: 3.18998e-06 [remove_cast_before_assign_add]: 3.46999e-06 [full_micro_interleaved_order_control]: 5.16998e-06 [reorder_send_recv_between_fp_bp]: 5.55001e-06 [comm_op_add_attrs]: 3.74002e-06 [add_comm_op_reuse_tag]: 3.21999e-06 [interleave_split_concat_branches]: 3.61999e-06 [interleave_parallel_branches]: 3.80998e-06 [overlap_opt_shard_in_pipeline]: 3.78999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.16001e-06 [control_data_broadcast_order]: 1.653e-05 [grouped_pairwise_exchange_alltoall]: 3.92002e-06 [offloading_packed_experts]: 6.74001e-06 [overlap_recompute_and_grad_model_parallel]: 7.07002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.75e-06 [overlap_recompute_allgather_and_fa_grad]: 4.53999e-06 [overlap_recompute_comm]: 5.02e-06 [overlap_grad_ring_attention]: 6.73e-06 [overlap_grad_flash_sp]: 2.499e-05 [begin_end_overlap_inline]: 2.99001e-06 [split_matmul_comm_elemetwise]: 4.46002e-06 [split_layernorm_comm]: 4.07003e-06 [handle_group_info]: 3.41999e-06 [symbol_engine_optimizer]: 0.00010345, [1] [Cycle 1]: 9.665e-05, [6] [build]: 3.95e-06 [elim_shapecalc]: 1.04e-05 [elim_not_effective]: 1.391e-05 [opt_reshape]: 8.82e-06 [fold_const_symbol]: 1.061e-05 [renormalize]: 2.30008e-07 [detach_backward]: 5.26998e-06 [pipeline_parallel_scheduler]: 2.44999e-06 [auto_monad_reorder]: 2.491e-05 [get_jit_bprop_graph]: 2.09e-06 [rewriter_after_jit_bprop_graph]: 7.06001e-06 [opt_after_jit_grad]: 0.00064341 [validate]: 4.638e-05 Sums bootstrap : 0.000467s : 0.94% type_inference : 0.043712s : 87.53% event_method : 0.000022s : 0.04% auto_monad : 0.000067s : 0.14% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000042s : 0.08% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000035s : 0.07% optimize.rewriter_before_opt_a : 0.000098s : 0.20% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000053s : 0.11% optimize.opt_a.loop_unroll : 0.000038s : 0.08% optimize.opt_a.a_1 : 0.000846s : 1.69% optimize.opt_a.with_stream_mark : 0.000038s : 0.08% optimize.opt_a.recompute_prepare : 0.000018s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000233s : 0.47% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.03% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.03% optimize.opt_a.merge_send_recv : 0.000017s : 0.03% optimize.opt_a.auto_parallel : 0.000016s : 0.03% optimize.opt_a.parallel : 0.000024s : 0.05% optimize.opt_a.flash_sp : 0.000014s : 0.03% optimize.opt_a.merge_comm : 0.000008s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.03% optimize.opt_a.virtual_dataset : 0.000014s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.03% optimize.opt_a.virtual_output : 0.000014s : 0.03% optimize.opt_a.merge_forward : 0.000008s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000020s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000023s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000023s : 0.05% optimize.opt_a.a_after_grad : 0.000022s : 0.04% optimize.opt_a.renormalize : 0.000848s : 1.70% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.06% optimize.opt_a.cse : 0.000047s : 0.09% optimize.opt_a.a_3 : 0.000118s : 0.24% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.01% optimize.rewriter_after_opt_a : 0.000046s : 0.09% optimize.convert_after_rewriter : 0.000010s : 0.02% optimize.order_py_execute_after_rewriter : 0.000008s : 0.02% optimize.mutable_eliminate : 0.000720s : 1.44% optimize.opt_b.b_1 : 0.000166s : 0.33% optimize.opt_b.b_2 : 0.000009s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.04% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000035s : 0.07% optimize.loop_unroll : 0.000509s : 1.02% optimize.opt_after_cconv.c_1 : 0.000033s : 0.07% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000020s : 0.04% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.03% optimize.tuple_transform.d_1 : 0.000048s : 0.10% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000056s : 0.11% optimize.cse_after_recomputation.cse : 0.000011s : 0.02% optimize.environ_conv : 0.000009s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000005s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000025s : 0.05% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000025s : 0.05% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000643s : 1.29% validate : 0.000046s : 0.09% Time group info: ------[substitution.] 0.000242 34 16.67% : 0.000040s : 6: substitution.arithmetic_simplify 0.92% : 0.000002s : 2: substitution.elim_not_effective 0.62% : 0.000002s : 2: substitution.fold_const_symbol 2.54% : 0.000006s : 4: substitution.graph_param_transform 66.58% : 0.000161s : 4: substitution.inline 1.89% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.30% : 0.000006s : 4: substitution.remove_not_recompute_node 2.14% : 0.000005s : 4: substitution.replace_old_param 6.34% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.043644 2 98.21% : 0.042864s : 1: type_inference.infer 1.79% : 0.000780s : 1: type_inference.specialize ------[replace.] 0.000066 8 62.95% : 0.000041s : 4: replace.inline 37.05% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000172 8 92.13% : 0.000158s : 4: match.inline 7.87% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000221 1278 0.98% : 0.000002s : 13: predicate.accumulaten_eliminater 0.97% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.61% : 0.000001s : 8: predicate.addn_check_dump 0.91% : 0.000002s : 13: predicate.addn_zero_filter 0.77% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.71% : 0.000006s : 21: predicate.arithmetic_simplify 0.91% : 0.000002s : 13: predicate.cast_eliminate 0.58% : 0.000001s : 8: predicate.check_bprop_eliminate 0.54% : 0.000001s : 8: predicate.compare_switch_simplify 0.26% : 0.000001s : 4: predicate.const_output_eliminate 0.58% : 0.000001s : 8: predicate.depend_value_elim 0.92% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.01% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.82% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.31% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.45% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_depend_swap 1.68% : 0.000004s : 25: predicate.environ_get_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.33% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.36% : 0.000005s : 21: predicate.float_depend_g_call 0.54% : 0.000001s : 8: predicate.float_environ_get_switch 0.77% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.74% : 0.000002s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 6.52% : 0.000014s : 58: predicate.inline 0.78% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.16% : 0.000003s : 8: predicate.less_batch_normalization 1.85% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.34% : 0.000005s : 38: predicate.load_eliminater 1.16% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.40% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.56% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 8: predicate.merge_addn 0.57% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.72% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.74% : 0.000002s : 13: predicate.minmaximum_grad 1.49% : 0.000003s : 4: predicate.mutable_eliminate 0.47% : 0.000001s : 4: predicate.opt_reshape 0.40% : 0.000001s : 4: predicate.parallel_virtual_node 1.59% : 0.000004s : 21: predicate.partial_defer_inline 1.52% : 0.000003s : 21: predicate.partial_eliminate 0.94% : 0.000002s : 13: predicate.print_const_string_wrapper 0.61% : 0.000001s : 8: predicate.reduce_all_const_elim 1.27% : 0.000003s : 13: predicate.reduce_eliminate 2.36% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.67% : 0.000001s : 8: predicate.remove_not_recompute_node 1.42% : 0.000003s : 25: predicate.replace_applicator 0.37% : 0.000001s : 8: predicate.replace_old_param 0.35% : 0.000001s : 4: predicate.reset_defer_inline 1.08% : 0.000002s : 13: predicate.reshape_eliminate 0.73% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 4: predicate.row_tensor_eliminate 0.82% : 0.000002s : 8: predicate.same_eliminate 0.41% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.69% : 0.000002s : 8: predicate.shard_identity_eliminate 0.63% : 0.000001s : 8: predicate.special_op_eliminate 0.67% : 0.000001s : 8: predicate.specialize_transform 0.93% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.40% : 0.000003s : 21: predicate.switch_defer_inline 1.88% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.23% : 0.000012s : 67: predicate.switch_simplify 0.87% : 0.000002s : 13: predicate.tile_eliminate 0.82% : 0.000002s : 13: predicate.transpose_eliminate 1.72% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.45% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.17% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.33% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.11% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.63% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.27% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.00% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.66% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.66% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000652 11 55.32% : 0.000361s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.68% : 0.000291s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.068811 192 0.01% : 0.000006s : 1: ForceFp32Comm 5.77% : 0.003968s : 1: add_attr 5.74% : 0.003948s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.09% : 0.000060s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.11% : 0.000076s : 1: auto_monad 0.05% : 0.000032s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.75% : 0.000514s : 1: bootstrap 0.06% : 0.000038s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.03% : 0.000020s : 1: control_data_broadcast_order 0.02% : 0.000014s : 1: convert_after_rewriter 0.04% : 0.000030s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000031s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.05% : 0.000034s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000010s : 1: get_jit_bprop_graph 0.02% : 0.000012s : 1: graph_reusing 0.01% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000011s : 1: label_micro_interleaved_index 0.75% : 0.000515s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 1.06% : 0.000728s : 1: mutable_eliminate 0.01% : 0.000009s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000019s : 1: opt.transform.mutable_eliminate 1.90% : 0.001307s : 78: opt.transform.opt_a 0.05% : 0.000032s : 1: opt.transform.opt_after_cconv 0.05% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.15% : 0.000103s : 28: opt.transform.opt_b 0.08% : 0.000053s : 2: opt.transform.opt_trans_graph 0.06% : 0.000040s : 4: opt.transform.symbol_engine_opt 5.02% : 0.003451s : 1: opt_a 0.20% : 0.000141s : 1: opt_after_cconv 0.95% : 0.000655s : 1: opt_after_jit_grad 0.41% : 0.000284s : 1: opt_b 10.41% : 0.007164s : 1: optimize 0.04% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000011s : 1: order_py_execute_after_rewriter 0.04% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000008s : 1: pipeline_split 0.07% : 0.000050s : 1: pre_auto_parallel 0.06% : 0.000039s : 1: py_interpret_to_execute 0.03% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000021s : 1: remove_dup_value 0.67% : 0.000459s : 1: renormalize.infer 0.55% : 0.000380s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000051s : 1: rewriter_after_opt_a 0.15% : 0.000103s : 1: rewriter_before_opt_a 0.01% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000107s : 1: symbol_engine_optimizer 0.14% : 0.000099s : 1: tuple_transform 63.60% : 0.043765s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:30.126.533 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.025843, [21] [bootstrap]: 0.00043309 [type_inference]: 0.00628527 [event_method]: 2.191e-05 [auto_monad]: 6.706e-05 [graph_reusing]: 6.35002e-06 [inline]: 3.11001e-06 [add_attr]: 0.00349361, [1] [add_attr_with_inline]: 0.0034812, [1] [Cycle 1]: 7.173e-05, [2] [tag_attr]: 2.382e-05 [meta_addattr_fg_expand]: 6.19001e-06 [parallel-infer-symbol]: 3.99997e-06 [pre_auto_parallel]: 3.834e-05 [insert-virtual-dataset]: 2.57001e-06 [parallel-infer-symbol-second]: 8.59989e-07 [dataset_repeat_opt]: 2.30002e-06 [pipeline_split]: 1.79e-06 [optimize]: 0.0147752, [53] [py_interpret_to_execute]: 3.053e-05 [rewriter_before_opt_a]: 9.038e-05 [opt_a]: 0.0124652, [2] [Cycle 1]: 0.0117135, [45] [expand_dump_flag]: 3.38e-06 [switch_simplify]: 4.518e-05 [loop_unroll]: 3.069e-05 [a_1]: 0.00991649 [with_stream_mark]: 4.598e-05 [recompute_prepare]: 1.949e-05 [updatestate_depend_eliminate]: 5.49e-06 [updatestate_assign_eliminate]: 3.36001e-06 [updatestate_loads_eliminate]: 3.31001e-06 [parameter_eliminate]: 2.68e-06 [a_2]: 0.00011734 [accelerated_algorithm]: 8.18999e-06 [shard]: 3.43e-06 [meta_shard_fg_expand]: 3.66999e-06 [shard_inline]: 6.76e-06 [merge_send_recv]: 1.084e-05 [auto_parallel]: 1.055e-05 [parallel]: 2.112e-05 [flash_sp]: 1.209e-05 [merge_comm]: 4.52e-06 [allreduce_fusion]: 3.72998e-06 [matmul_add_comm_reduction]: 1.191e-05 [allreduce_slice_to_reducescatter]: 9.60019e-07 [virtual_shard_identity]: 9.12999e-06 [virtual_dataset]: 7.13e-06 [get_grad_eliminate_]: 6.73e-06 [virtual_output]: 6.93e-06 [merge_forward]: 5.19998e-06 [cell_reuse_recompute_pass]: 3.47002e-06 [offload_activation]: 9.78002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.375e-05 [merge_recompute_call_nodes]: 1.53002e-06 [before_grad]: 1.221e-05 [set_forward_comm_id_for_comm_node_pass]: 4.42e-06 [meta_fg_expand]: 3.44001e-06 [flash_sp_send_recv_attached]: 2.87002e-06 [receive_attached]: 2.14999e-06 [after_resolve]: 1.382e-05 [a_after_grad]: 1.088e-05 [renormalize]: 0.00086405 [add_forward_monad_depend]: 5.74e-06 [auto_monad_grad]: 2.63998e-06 [auto_monad_eliminator]: 1.825e-05 [cse]: 5.221e-05 [a_3]: 5.358e-05 [Cycle 2]: 0.00073877, [45] [expand_dump_flag]: 2.83003e-06 [switch_simplify]: 8.35001e-06 [loop_unroll]: 6.58e-06 [a_1]: 0.00015658 [with_stream_mark]: 1.732e-05 [recompute_prepare]: 7.18e-06 [updatestate_depend_eliminate]: 3.38999e-06 [updatestate_assign_eliminate]: 2.59001e-06 [updatestate_loads_eliminate]: 2.72001e-06 [parameter_eliminate]: 1.29998e-06 [a_2]: 7.926e-05 [accelerated_algorithm]: 8.05999e-06 [shard]: 1.54998e-06 [meta_shard_fg_expand]: 1.80001e-06 [shard_inline]: 6.66e-06 [merge_send_recv]: 7.46999e-06 [auto_parallel]: 7.1e-06 [parallel]: 6.62002e-06 [flash_sp]: 3.83001e-06 [merge_comm]: 3.62002e-06 [allreduce_fusion]: 3.48999e-06 [matmul_add_comm_reduction]: 8.1e-06 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 8.3e-06 [virtual_dataset]: 6.33e-06 [get_grad_eliminate_]: 5.72999e-06 [virtual_output]: 5.97001e-06 [merge_forward]: 3.83001e-06 [cell_reuse_recompute_pass]: 2.24001e-06 [offload_activation]: 7.82002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.277e-05 [merge_recompute_call_nodes]: 1.11002e-06 [before_grad]: 9.63002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.88999e-06 [meta_fg_expand]: 2.71999e-06 [flash_sp_send_recv_attached]: 1.29e-06 [receive_attached]: 1.98002e-06 [after_resolve]: 1.199e-05 [a_after_grad]: 9.54e-06 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 2.02999e-06 [auto_monad_grad]: 1.72001e-06 [auto_monad_eliminator]: 9.34e-06 [cse]: 2.032e-05 [a_3]: 3.655e-05 [py_interpret_to_execute_after_opt_a]: 1.365e-05 [slice_cell_reuse_recomputed_activation]: 2.02999e-06 [rewriter_after_opt_a]: 3.988e-05 [convert_after_rewriter]: 7.53999e-06 [order_py_execute_after_rewriter]: 5.27001e-06 [mutable_eliminate]: 0.00072328 [opt_b]: 0.00021506, [1] [Cycle 1]: 0.00020683, [7] [b_1]: 0.0001248 [b_2]: 8.63001e-06 [updatestate_depend_eliminate]: 7.18e-06 [updatestate_assign_eliminate]: 2.54999e-06 [updatestate_loads_eliminate]: 2.96999e-06 [renormalize]: 6.69999e-07 [cse]: 2.239e-05 [optimize_parallel_all_gather_comm]: 2.021e-05 [overlap_param_gather]: 1.84998e-06 [cconv]: 3.356e-05 [loop_unroll]: 0.00044834 [opt_after_cconv]: 0.00010656, [1] [Cycle 1]: 0.00010081, [7] [c_1]: 3.197e-05 [parameter_eliminate]: 3.97e-06 [updatestate_depend_eliminate]: 5.82999e-06 [updatestate_assign_eliminate]: 2.65002e-06 [updatestate_loads_eliminate]: 2.22001e-06 [cse]: 1.867e-05 [renormalize]: 9.00007e-07 [remove_dup_value]: 1.359e-05 [tuple_transform]: 7.893e-05, [1] [Cycle 1]: 7.408e-05, [4] [d_1]: 4.682e-05 [none_parameter_eliminate]: 1.84e-06 [renormalize]: 3.59985e-07 [switch_simplify]: 6.87002e-06 [partial_unused_args_eliminate]: 1.62999e-06 [add_recomputation]: 5.06e-05 [cse_after_recomputation]: 2.121e-05, [1] [Cycle 1]: 1.64e-05, [1] [cse]: 1.088e-05 [environ_conv]: 4.87998e-06 [swap_dp_allreduce_reducescatter]: 4.90001e-06 [bias_add_comm_swap]: 3.33998e-06 [label_micro_interleaved_index]: 4.05998e-06 [label_fine_grained_interleaved_index]: 2.94001e-06 [merge_cast_opt]: 1.27999e-06 [slice_recompute_activation]: 1.97001e-06 [micro_interleaved_order_control]: 2.27001e-06 [assign_add_opt]: 1.16002e-06 [ForceFp32Comm]: 8.59989e-07 [remove_cast_before_assign_add]: 9.80013e-07 [full_micro_interleaved_order_control]: 2.37999e-06 [reorder_send_recv_between_fp_bp]: 3.11999e-06 [comm_op_add_attrs]: 1.26002e-06 [add_comm_op_reuse_tag]: 1.33002e-06 [interleave_split_concat_branches]: 1.41002e-06 [interleave_parallel_branches]: 1.11002e-06 [overlap_opt_shard_in_pipeline]: 1.29e-06 [overlap_opt_shard_grad_in_pipeline]: 2.02999e-06 [control_data_broadcast_order]: 1.314e-05 [grouped_pairwise_exchange_alltoall]: 1.46998e-06 [offloading_packed_experts]: 3.86001e-06 [overlap_recompute_and_grad_model_parallel]: 4.88001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.53002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.36e-06 [overlap_grad_ring_attention]: 3.78001e-06 [overlap_grad_flash_sp]: 2.171e-05 [begin_end_overlap_inline]: 5.8001e-07 [split_matmul_comm_elemetwise]: 2.21e-06 [split_layernorm_comm]: 2.05002e-06 [handle_group_info]: 9.60019e-07 [symbol_engine_optimizer]: 7.873e-05, [1] [Cycle 1]: 7.395e-05, [6] [build]: 3.26001e-06 [elim_shapecalc]: 1.07e-05 [elim_not_effective]: 1.252e-05 [opt_reshape]: 6.73e-06 [fold_const_symbol]: 1.01e-05 [renormalize]: 8.59989e-07 [detach_backward]: 2.29001e-06 [pipeline_parallel_scheduler]: 1.96e-06 [auto_monad_reorder]: 1.764e-05 [get_jit_bprop_graph]: 2.34999e-06 [rewriter_after_jit_bprop_graph]: 4.82e-06 [opt_after_jit_grad]: 0.00047396 [validate]: 4.26e-05 Sums bootstrap : 0.000433s : 2.03% type_inference : 0.006285s : 29.53% event_method : 0.000022s : 0.10% auto_monad : 0.000067s : 0.32% graph_reusing : 0.000006s : 0.03% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.11% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000038s : 0.18% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.14% optimize.rewriter_before_opt_a : 0.000090s : 0.42% optimize.opt_a.expand_dump_flag : 0.000006s : 0.03% optimize.opt_a.switch_simplify : 0.000054s : 0.25% optimize.opt_a.loop_unroll : 0.000037s : 0.18% optimize.opt_a.a_1 : 0.010073s : 47.32% optimize.opt_a.with_stream_mark : 0.000063s : 0.30% optimize.opt_a.recompute_prepare : 0.000027s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000197s : 0.92% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.08% optimize.opt_a.shard : 0.000005s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.06% optimize.opt_a.merge_send_recv : 0.000018s : 0.09% optimize.opt_a.auto_parallel : 0.000018s : 0.08% optimize.opt_a.parallel : 0.000028s : 0.13% optimize.opt_a.flash_sp : 0.000016s : 0.07% optimize.opt_a.merge_comm : 0.000008s : 0.04% optimize.opt_a.allreduce_fusion : 0.000007s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.08% optimize.opt_a.virtual_dataset : 0.000013s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.06% optimize.opt_a.virtual_output : 0.000013s : 0.06% optimize.opt_a.merge_forward : 0.000009s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000022s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.04% optimize.opt_a.meta_fg_expand : 0.000006s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000026s : 0.12% optimize.opt_a.a_after_grad : 0.000020s : 0.10% optimize.opt_a.renormalize : 0.000864s : 4.06% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.13% optimize.opt_a.cse : 0.000073s : 0.34% optimize.opt_a.a_3 : 0.000090s : 0.42% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000040s : 0.19% optimize.convert_after_rewriter : 0.000008s : 0.04% optimize.order_py_execute_after_rewriter : 0.000005s : 0.02% optimize.mutable_eliminate : 0.000723s : 3.40% optimize.opt_b.b_1 : 0.000125s : 0.59% optimize.opt_b.b_2 : 0.000009s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.11% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.09% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000034s : 0.16% optimize.loop_unroll : 0.000448s : 2.11% optimize.opt_after_cconv.c_1 : 0.000032s : 0.15% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000019s : 0.09% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000014s : 0.06% optimize.tuple_transform.d_1 : 0.000047s : 0.22% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000051s : 0.24% optimize.cse_after_recomputation.cse : 0.000011s : 0.05% optimize.environ_conv : 0.000005s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000013s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000022s : 0.10% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.08% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000474s : 2.23% validate : 0.000043s : 0.20% Time group info: ------[substitution.] 0.000257 34 20.48% : 0.000053s : 6: substitution.arithmetic_simplify 0.80% : 0.000002s : 2: substitution.elim_not_effective 0.55% : 0.000001s : 2: substitution.fold_const_symbol 2.47% : 0.000006s : 4: substitution.graph_param_transform 62.85% : 0.000162s : 4: substitution.inline 2.03% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.14% : 0.000005s : 4: substitution.remove_not_recompute_node 2.65% : 0.000007s : 4: substitution.replace_old_param 6.02% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006214 2 87.31% : 0.005425s : 1: type_inference.infer 12.69% : 0.000788s : 1: type_inference.specialize ------[replace.] 0.000065 8 63.85% : 0.000041s : 4: replace.inline 36.15% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000173 8 92.12% : 0.000159s : 4: match.inline 7.88% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.008945 1278 0.02% : 0.000002s : 13: predicate.accumulaten_eliminater 0.02% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.01% : 0.000001s : 8: predicate.addn_check_dump 0.03% : 0.000002s : 13: predicate.addn_zero_filter 0.02% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 0.07% : 0.000007s : 21: predicate.arithmetic_simplify 0.03% : 0.000003s : 13: predicate.cast_eliminate 0.02% : 0.000002s : 8: predicate.check_bprop_eliminate 0.01% : 0.000001s : 8: predicate.compare_switch_simplify 0.00% : 0.000000s : 4: predicate.const_output_eliminate 0.01% : 0.000001s : 8: predicate.depend_value_elim 0.02% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.08% : 0.000007s : 13: predicate.dict_get_item_eliminator 0.02% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.03% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.01% : 0.000000s : 4: predicate.elim_not_effective 0.01% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 0.03% : 0.000003s : 17: predicate.environ_add_const_eliminate 0.03% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.03% : 0.000002s : 17: predicate.environ_get_depend_swap 0.04% : 0.000004s : 25: predicate.environ_get_eliminate 97.39% : 0.008712s : 17: predicate.environ_get_set_eliminate 0.03% : 0.000003s : 21: predicate.exchange_switch_depend_value 0.07% : 0.000006s : 21: predicate.float_depend_g_call 0.02% : 0.000001s : 8: predicate.float_environ_get_switch 0.02% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.00% : 0.000000s : 4: predicate.fold_const_symbol 0.01% : 0.000001s : 8: predicate.get_grad_eliminate 0.01% : 0.000001s : 4: predicate.graph_param_transform 0.02% : 0.000001s : 8: predicate.incorporate_call 0.01% : 0.000001s : 8: predicate.incorporate_call_switch 0.17% : 0.000015s : 58: predicate.inline 0.02% : 0.000002s : 8: predicate.inline_without_move 0.01% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.02% : 0.000002s : 8: predicate.less_batch_normalization 0.04% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 0.06% : 0.000006s : 38: predicate.load_eliminater 0.03% : 0.000003s : 4: predicate.loop_unroll_after_grad 0.06% : 0.000005s : 34: predicate.loop_unroll_before_grad 0.04% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.02% : 0.000002s : 8: predicate.merge_addn 0.01% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.01% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.02% : 0.000002s : 13: predicate.minmaximum_grad 0.03% : 0.000003s : 4: predicate.mutable_eliminate 0.01% : 0.000001s : 4: predicate.opt_reshape 0.01% : 0.000001s : 4: predicate.parallel_virtual_node 0.04% : 0.000004s : 21: predicate.partial_defer_inline 0.04% : 0.000003s : 21: predicate.partial_eliminate 0.02% : 0.000002s : 13: predicate.print_const_string_wrapper 0.02% : 0.000002s : 8: predicate.reduce_all_const_elim 0.04% : 0.000004s : 13: predicate.reduce_eliminate 0.06% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.01% : 0.000001s : 8: predicate.remove_not_recompute_node 0.07% : 0.000006s : 25: predicate.replace_applicator 0.01% : 0.000001s : 8: predicate.replace_old_param 0.01% : 0.000001s : 4: predicate.reset_defer_inline 0.02% : 0.000002s : 13: predicate.reshape_eliminate 0.02% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.01% : 0.000001s : 4: predicate.row_tensor_eliminate 0.02% : 0.000002s : 8: predicate.same_eliminate 0.01% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.03% : 0.000002s : 8: predicate.shard_identity_eliminate 0.02% : 0.000002s : 8: predicate.special_op_eliminate 0.02% : 0.000002s : 8: predicate.specialize_transform 0.02% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.02% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.01% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.04% : 0.000003s : 21: predicate.switch_defer_inline 0.05% : 0.000004s : 29: predicate.switch_layer_defer_inline 0.12% : 0.000011s : 67: predicate.switch_simplify 0.02% : 0.000002s : 13: predicate.tile_eliminate 0.03% : 0.000003s : 13: predicate.transpose_eliminate 0.04% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 0.04% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 0.04% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 0.08% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 0.04% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 0.06% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 0.04% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 0.06% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 0.07% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.01% : 0.000001s : 4: predicate.value_based_eliminate 0.02% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.01% : 0.000001s : 8: predicate.virtual_output_eliminate 0.01% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.01% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000638 11 52.01% : 0.000332s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.99% : 0.000306s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.055665 192 0.01% : 0.000004s : 1: ForceFp32Comm 6.29% : 0.003499s : 1: add_attr 6.26% : 0.003487s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.10% : 0.000055s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.13% : 0.000073s : 1: auto_monad 0.04% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.83% : 0.000462s : 1: bootstrap 0.07% : 0.000037s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000016s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.04% : 0.000024s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000008s : 1: environ_conv 0.05% : 0.000028s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.82% : 0.000457s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.32% : 0.000734s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000020s : 1: opt.transform.mutable_eliminate 18.95% : 0.010551s : 78: opt.transform.opt_a 0.05% : 0.000031s : 1: opt.transform.opt_after_cconv 0.05% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.18% : 0.000101s : 28: opt.transform.opt_b 0.09% : 0.000052s : 2: opt.transform.opt_trans_graph 0.07% : 0.000036s : 4: opt.transform.symbol_engine_opt 22.40% : 0.012469s : 1: opt_a 0.20% : 0.000110s : 1: opt_after_cconv 0.87% : 0.000482s : 1: opt_after_jit_grad 0.39% : 0.000219s : 1: opt_b 26.55% : 0.014781s : 1: optimize 0.04% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.05% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000004s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000043s : 1: pre_auto_parallel 0.06% : 0.000035s : 1: py_interpret_to_execute 0.03% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000017s : 1: remove_dup_value 0.85% : 0.000472s : 1: renormalize.infer 0.68% : 0.000381s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000044s : 1: rewriter_after_opt_a 0.17% : 0.000095s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000081s : 1: symbol_engine_optimizer 0.15% : 0.000082s : 1: tuple_transform 11.33% : 0.006309s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:31.314.550 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:31.314.835 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.110266, [21] [bootstrap]: 0.00046317 [type_inference]: 0.0852153 [event_method]: 2.261e-05 [auto_monad]: 6.672e-05 [graph_reusing]: 6.62002e-06 [inline]: 3.4e-06 [add_attr]: 0.016432, [1] [add_attr_with_inline]: 0.0164186, [1] [Cycle 1]: 0.00010206, [2] [tag_attr]: 2.671e-05 [meta_addattr_fg_expand]: 6.19999e-06 [parallel-infer-symbol]: 4.20999e-06 [pre_auto_parallel]: 4.313e-05 [insert-virtual-dataset]: 2.31e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 2.19001e-06 [pipeline_split]: 1.52999e-06 [optimize]: 0.00661823, [53] [py_interpret_to_execute]: 3.603e-05 [rewriter_before_opt_a]: 9.924e-05 [opt_a]: 0.00359944, [2] [Cycle 1]: 0.00256777, [45] [expand_dump_flag]: 3.03998e-06 [switch_simplify]: 4.311e-05 [loop_unroll]: 3.058e-05 [a_1]: 0.00069797 [with_stream_mark]: 2.216e-05 [recompute_prepare]: 1.131e-05 [updatestate_depend_eliminate]: 4.60999e-06 [updatestate_assign_eliminate]: 3.63e-06 [updatestate_loads_eliminate]: 2.92002e-06 [parameter_eliminate]: 2.39999e-06 [a_2]: 0.00012515 [accelerated_algorithm]: 7.77e-06 [shard]: 2.32001e-06 [meta_shard_fg_expand]: 2.12999e-06 [shard_inline]: 6.76e-06 [merge_send_recv]: 9.91998e-06 [auto_parallel]: 7.33e-06 [parallel]: 1.947e-05 [flash_sp]: 9.09998e-06 [merge_comm]: 3.71001e-06 [allreduce_fusion]: 3.61001e-06 [matmul_add_comm_reduction]: 1.001e-05 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 9.18002e-06 [virtual_dataset]: 6.81999e-06 [get_grad_eliminate_]: 6.86001e-06 [virtual_output]: 6.81001e-06 [merge_forward]: 3.63999e-06 [cell_reuse_recompute_pass]: 1.18001e-06 [offload_activation]: 1.121e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.58e-05 [merge_recompute_call_nodes]: 1.85001e-06 [before_grad]: 1.177e-05 [set_forward_comm_id_for_comm_node_pass]: 3.71001e-06 [meta_fg_expand]: 2.84999e-06 [flash_sp_send_recv_attached]: 3.43e-06 [receive_attached]: 2.12999e-06 [after_resolve]: 1.356e-05 [a_after_grad]: 1.059e-05 [renormalize]: 0.00087946 [add_forward_monad_depend]: 6.79999e-06 [auto_monad_grad]: 2.94999e-06 [auto_monad_eliminator]: 1.915e-05 [cse]: 3.105e-05 [a_3]: 6.992e-05 [Cycle 2]: 0.00101382, [45] [expand_dump_flag]: 2.05002e-06 [switch_simplify]: 8.65999e-06 [loop_unroll]: 7.43e-06 [a_1]: 0.0001625 [with_stream_mark]: 1.973e-05 [recompute_prepare]: 8.61002e-06 [updatestate_depend_eliminate]: 4.22998e-06 [updatestate_assign_eliminate]: 2.94999e-06 [updatestate_loads_eliminate]: 3.11001e-06 [parameter_eliminate]: 2.66e-06 [a_2]: 0.00011773 [accelerated_algorithm]: 9.09e-06 [shard]: 2.31e-06 [meta_shard_fg_expand]: 2.55002e-06 [shard_inline]: 8.15e-06 [merge_send_recv]: 9.34998e-06 [auto_parallel]: 8.99e-06 [parallel]: 7.68001e-06 [flash_sp]: 3.87998e-06 [merge_comm]: 4.28001e-06 [allreduce_fusion]: 4.34997e-06 [matmul_add_comm_reduction]: 8.77999e-06 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 9.59999e-06 [virtual_dataset]: 7.05e-06 [get_grad_eliminate_]: 6.85998e-06 [virtual_output]: 6.06e-06 [merge_forward]: 4.04997e-06 [cell_reuse_recompute_pass]: 2.63003e-06 [offload_activation]: 1.042e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.54e-05 [merge_recompute_call_nodes]: 1.32999e-06 [before_grad]: 1.113e-05 [set_forward_comm_id_for_comm_node_pass]: 5.12e-06 [meta_fg_expand]: 3.31999e-06 [flash_sp_send_recv_attached]: 1.79998e-06 [receive_attached]: 1.75001e-06 [after_resolve]: 1.421e-05 [a_after_grad]: 1.328e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 4.65001e-06 [auto_monad_grad]: 2.07999e-06 [auto_monad_eliminator]: 1.276e-05 [cse]: 2.231e-05 [a_3]: 5.676e-05 [py_interpret_to_execute_after_opt_a]: 2.03e-05 [slice_cell_reuse_recomputed_activation]: 6.04999e-06 [rewriter_after_opt_a]: 4.908e-05 [convert_after_rewriter]: 1.104e-05 [order_py_execute_after_rewriter]: 8.55999e-06 [mutable_eliminate]: 0.00083774 [opt_b]: 0.00031295, [1] [Cycle 1]: 0.00030023, [7] [b_1]: 0.00018149 [b_2]: 9.27001e-06 [updatestate_depend_eliminate]: 8.95999e-06 [updatestate_assign_eliminate]: 2.75002e-06 [updatestate_loads_eliminate]: 3.21999e-06 [renormalize]: 1.55999e-06 [cse]: 2.936e-05 [optimize_parallel_all_gather_comm]: 2.412e-05 [overlap_param_gather]: 6.16e-06 [cconv]: 4.229e-05 [loop_unroll]: 0.00060718 [opt_after_cconv]: 0.00014843, [1] [Cycle 1]: 0.00013814, [7] [c_1]: 3.684e-05 [parameter_eliminate]: 5.59998e-06 [updatestate_depend_eliminate]: 6.53998e-06 [updatestate_assign_eliminate]: 2.59001e-06 [updatestate_loads_eliminate]: 2.56e-06 [cse]: 2.289e-05 [renormalize]: 5.39992e-07 [remove_dup_value]: 1.954e-05 [tuple_transform]: 0.00010648, [1] [Cycle 1]: 9.854e-05, [4] [d_1]: 5.344e-05 [none_parameter_eliminate]: 1.79998e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 8.19002e-06 [partial_unused_args_eliminate]: 5.09998e-06 [add_recomputation]: 5.825e-05 [cse_after_recomputation]: 3.215e-05, [1] [Cycle 1]: 2.454e-05, [1] [cse]: 1.429e-05 [environ_conv]: 9.97999e-06 [swap_dp_allreduce_reducescatter]: 8.45999e-06 [bias_add_comm_swap]: 6.04001e-06 [label_micro_interleaved_index]: 8.75999e-06 [label_fine_grained_interleaved_index]: 6.12999e-06 [merge_cast_opt]: 3.9e-06 [slice_recompute_activation]: 4.74e-06 [micro_interleaved_order_control]: 5.24998e-06 [assign_add_opt]: 3.81001e-06 [ForceFp32Comm]: 3.28e-06 [remove_cast_before_assign_add]: 3.98001e-06 [full_micro_interleaved_order_control]: 5.10999e-06 [reorder_send_recv_between_fp_bp]: 7.33e-06 [comm_op_add_attrs]: 4.21001e-06 [add_comm_op_reuse_tag]: 3.65e-06 [interleave_split_concat_branches]: 4.19002e-06 [interleave_parallel_branches]: 3.91999e-06 [overlap_opt_shard_in_pipeline]: 4.28001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.97e-06 [control_data_broadcast_order]: 1.924e-05 [grouped_pairwise_exchange_alltoall]: 4.27e-06 [offloading_packed_experts]: 7.43e-06 [overlap_recompute_and_grad_model_parallel]: 7.9e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.83001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.78999e-06 [overlap_recompute_comm]: 5.12e-06 [overlap_grad_ring_attention]: 7.03e-06 [overlap_grad_flash_sp]: 2.64e-05 [begin_end_overlap_inline]: 3.14999e-06 [split_matmul_comm_elemetwise]: 5.17e-06 [split_layernorm_comm]: 4.57e-06 [handle_group_info]: 3.69002e-06 [symbol_engine_optimizer]: 0.00010889, [1] [Cycle 1]: 0.0001007, [6] [build]: 3.8e-06 [elim_shapecalc]: 1.307e-05 [elim_not_effective]: 1.488e-05 [opt_reshape]: 7.7e-06 [fold_const_symbol]: 1.084e-05 [renormalize]: 2.00002e-07 [detach_backward]: 4.17e-06 [pipeline_parallel_scheduler]: 2.20002e-06 [auto_monad_reorder]: 2.329e-05 [get_jit_bprop_graph]: 1.75001e-06 [rewriter_after_jit_bprop_graph]: 6.81999e-06 [opt_after_jit_grad]: 0.0006492 [validate]: 4.534e-05 Sums bootstrap : 0.000463s : 0.50% type_inference : 0.085215s : 92.78% event_method : 0.000023s : 0.02% auto_monad : 0.000067s : 0.07% graph_reusing : 0.000007s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000043s : 0.05% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000036s : 0.04% optimize.rewriter_before_opt_a : 0.000099s : 0.11% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000052s : 0.06% optimize.opt_a.loop_unroll : 0.000038s : 0.04% optimize.opt_a.a_1 : 0.000860s : 0.94% optimize.opt_a.with_stream_mark : 0.000042s : 0.05% optimize.opt_a.recompute_prepare : 0.000020s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000243s : 0.26% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.02% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000015s : 0.02% optimize.opt_a.merge_send_recv : 0.000019s : 0.02% optimize.opt_a.auto_parallel : 0.000016s : 0.02% optimize.opt_a.parallel : 0.000027s : 0.03% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.02% optimize.opt_a.virtual_dataset : 0.000014s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000022s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000028s : 0.03% optimize.opt_a.a_after_grad : 0.000024s : 0.03% optimize.opt_a.renormalize : 0.000880s : 0.96% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.03% optimize.opt_a.cse : 0.000053s : 0.06% optimize.opt_a.a_3 : 0.000127s : 0.14% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.01% optimize.rewriter_after_opt_a : 0.000049s : 0.05% optimize.convert_after_rewriter : 0.000011s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.01% optimize.mutable_eliminate : 0.000838s : 0.91% optimize.opt_b.b_1 : 0.000181s : 0.20% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000002s : 0.00% optimize.opt_b.cse : 0.000029s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.03% optimize.overlap_param_gather : 0.000006s : 0.01% optimize.cconv : 0.000042s : 0.05% optimize.loop_unroll : 0.000607s : 0.66% optimize.opt_after_cconv.c_1 : 0.000037s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000023s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.02% optimize.tuple_transform.d_1 : 0.000053s : 0.06% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000058s : 0.06% optimize.cse_after_recomputation.cse : 0.000014s : 0.02% optimize.environ_conv : 0.000010s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000009s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000007s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000019s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000026s : 0.03% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000005s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000023s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000649s : 0.71% validate : 0.000045s : 0.05% Time group info: ------[substitution.] 0.000260 34 15.30% : 0.000040s : 6: substitution.arithmetic_simplify 0.87% : 0.000002s : 2: substitution.elim_not_effective 0.55% : 0.000001s : 2: substitution.fold_const_symbol 2.74% : 0.000007s : 4: substitution.graph_param_transform 68.48% : 0.000178s : 4: substitution.inline 2.00% : 0.000005s : 4: substitution.j_node_and_user_rematch 1.92% : 0.000005s : 4: substitution.remove_not_recompute_node 2.40% : 0.000006s : 4: substitution.replace_old_param 5.73% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.085151 2 99.08% : 0.084366s : 1: type_inference.infer 0.92% : 0.000784s : 1: type_inference.specialize ------[replace.] 0.000066 8 65.92% : 0.000043s : 4: replace.inline 34.08% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000188 8 93.00% : 0.000175s : 4: match.inline 7.00% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000236 1278 1.07% : 0.000003s : 13: predicate.accumulaten_eliminater 0.84% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 8: predicate.addn_check_dump 0.97% : 0.000002s : 13: predicate.addn_zero_filter 0.80% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.56% : 0.000006s : 21: predicate.arithmetic_simplify 0.93% : 0.000002s : 13: predicate.cast_eliminate 0.71% : 0.000002s : 8: predicate.check_bprop_eliminate 0.56% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.65% : 0.000002s : 8: predicate.depend_value_elim 0.91% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.13% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.99% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.00% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.42% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.10% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_depend_swap 1.57% : 0.000004s : 25: predicate.environ_get_eliminate 0.96% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.35% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.26% : 0.000005s : 21: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.98% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.60% : 0.000001s : 8: predicate.get_grad_eliminate 0.26% : 0.000001s : 4: predicate.graph_param_transform 0.58% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.55% : 0.000015s : 58: predicate.inline 0.71% : 0.000002s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.88% : 0.000002s : 8: predicate.less_batch_normalization 2.19% : 0.000005s : 25: predicate.list_to_tuple_eliminator_ 2.42% : 0.000006s : 38: predicate.load_eliminater 1.17% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.35% : 0.000006s : 34: predicate.loop_unroll_before_grad 1.57% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 8: predicate.merge_addn 0.53% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.74% : 0.000002s : 13: predicate.minmaximum_grad 1.48% : 0.000003s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.48% : 0.000001s : 4: predicate.parallel_virtual_node 1.65% : 0.000004s : 21: predicate.partial_defer_inline 1.45% : 0.000003s : 21: predicate.partial_eliminate 0.99% : 0.000002s : 13: predicate.print_const_string_wrapper 0.64% : 0.000002s : 8: predicate.reduce_all_const_elim 1.23% : 0.000003s : 13: predicate.reduce_eliminate 2.33% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 8: predicate.remove_not_recompute_node 1.31% : 0.000003s : 25: predicate.replace_applicator 0.51% : 0.000001s : 8: predicate.replace_old_param 0.22% : 0.000001s : 4: predicate.reset_defer_inline 0.95% : 0.000002s : 13: predicate.reshape_eliminate 0.81% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.70% : 0.000002s : 8: predicate.same_eliminate 0.39% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.91% : 0.000002s : 8: predicate.shard_identity_eliminate 0.67% : 0.000002s : 8: predicate.special_op_eliminate 0.73% : 0.000002s : 8: predicate.specialize_transform 1.07% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.40% : 0.000003s : 21: predicate.switch_defer_inline 1.83% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.72% : 0.000011s : 67: predicate.switch_simplify 0.89% : 0.000002s : 13: predicate.tile_eliminate 1.14% : 0.000003s : 13: predicate.transpose_eliminate 1.46% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.27% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.42% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.50% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.43% : 0.000006s : 29: predicate.tuple_list_set_item_eliminator 1.87% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.31% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.00% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 4: predicate.value_based_eliminate 0.64% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000658 11 54.84% : 0.000361s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.16% : 0.000297s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.135695 192 0.00% : 0.000006s : 1: ForceFp32Comm 12.12% : 0.016444s : 1: add_attr 12.10% : 0.016423s : 1: add_attr_with_inline 0.01% : 0.000008s : 1: add_comm_op_reuse_tag 0.05% : 0.000063s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.06% : 0.000076s : 1: auto_monad 0.02% : 0.000032s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.38% : 0.000515s : 1: bootstrap 0.03% : 0.000046s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.02% : 0.000023s : 1: control_data_broadcast_order 0.01% : 0.000015s : 1: convert_after_rewriter 0.03% : 0.000035s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000023s : 1: detach_backward 0.01% : 0.000013s : 1: environ_conv 0.02% : 0.000034s : 1: event_method 0.03% : 0.000040s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000008s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000012s : 1: label_micro_interleaved_index 0.45% : 0.000616s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.62% : 0.000848s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.01% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000023s : 1: opt.transform.mutable_eliminate 0.99% : 0.001340s : 78: opt.transform.opt_a 0.03% : 0.000035s : 1: opt.transform.opt_after_cconv 0.02% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.08% : 0.000112s : 28: opt.transform.opt_b 0.04% : 0.000059s : 2: opt.transform.opt_trans_graph 0.03% : 0.000042s : 4: opt.transform.symbol_engine_opt 2.66% : 0.003604s : 1: opt_a 0.11% : 0.000153s : 1: opt_after_cconv 0.49% : 0.000662s : 1: opt_after_jit_grad 0.23% : 0.000317s : 1: opt_b 5.15% : 0.006985s : 1: optimize 0.02% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.02% : 0.000030s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000011s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.04% : 0.000051s : 1: pre_auto_parallel 0.03% : 0.000040s : 1: py_interpret_to_execute 0.02% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.02% : 0.000023s : 1: remove_dup_value 0.34% : 0.000457s : 1: renormalize.infer 0.30% : 0.000414s : 1: renormalize.specialize 0.01% : 0.000010s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000053s : 1: rewriter_after_opt_a 0.08% : 0.000103s : 1: rewriter_before_opt_a 0.01% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.08% : 0.000112s : 1: symbol_engine_optimizer 0.08% : 0.000110s : 1: tuple_transform 62.84% : 0.085266s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:32.786.096 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0494151, [21] [bootstrap]: 0.00048863 [type_inference]: 0.0389702 [event_method]: 2.274e-05 [auto_monad]: 6.512e-05 [graph_reusing]: 6.27001e-06 [inline]: 2.89999e-06 [add_attr]: 0.0036852, [1] [add_attr_with_inline]: 0.00367342, [1] [Cycle 1]: 7.424e-05, [2] [tag_attr]: 2.312e-05 [meta_addattr_fg_expand]: 5.94e-06 [parallel-infer-symbol]: 4.31002e-06 [pre_auto_parallel]: 4.094e-05 [insert-virtual-dataset]: 3.06001e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.30002e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00541637, [53] [py_interpret_to_execute]: 2.815e-05 [rewriter_before_opt_a]: 9.012e-05 [opt_a]: 0.00296311, [2] [Cycle 1]: 0.00224209, [45] [expand_dump_flag]: 2.86e-06 [switch_simplify]: 4.27e-05 [loop_unroll]: 3.009e-05 [a_1]: 0.00067932 [with_stream_mark]: 2.172e-05 [recompute_prepare]: 1.085e-05 [updatestate_depend_eliminate]: 4.48001e-06 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 2.96001e-06 [parameter_eliminate]: 2.29999e-06 [a_2]: 8.824e-05 [accelerated_algorithm]: 7.95e-06 [shard]: 1.69e-06 [meta_shard_fg_expand]: 2.34001e-06 [shard_inline]: 7.34002e-06 [merge_send_recv]: 8.99003e-06 [auto_parallel]: 7.61001e-06 [parallel]: 2.069e-05 [flash_sp]: 8.45999e-06 [merge_comm]: 4.3e-06 [allreduce_fusion]: 3.46001e-06 [matmul_add_comm_reduction]: 1.035e-05 [allreduce_slice_to_reducescatter]: 1.12e-06 [virtual_shard_identity]: 9.42001e-06 [virtual_dataset]: 6.93e-06 [get_grad_eliminate_]: 6.29001e-06 [virtual_output]: 6.48e-06 [merge_forward]: 4.11001e-06 [cell_reuse_recompute_pass]: 1.81e-06 [offload_activation]: 1.04e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.424e-05 [merge_recompute_call_nodes]: 1.67999e-06 [before_grad]: 1.061e-05 [set_forward_comm_id_for_comm_node_pass]: 3.83001e-06 [meta_fg_expand]: 3.23e-06 [flash_sp_send_recv_attached]: 2.52001e-06 [receive_attached]: 2.36e-06 [after_resolve]: 1.328e-05 [a_after_grad]: 1.056e-05 [renormalize]: 0.00079142 [add_forward_monad_depend]: 5.76998e-06 [auto_monad_grad]: 2.74001e-06 [auto_monad_eliminator]: 1.761e-05 [cse]: 3.056e-05 [a_3]: 5.06e-05 [Cycle 2]: 0.00070918, [45] [expand_dump_flag]: 1.72001e-06 [switch_simplify]: 7.88999e-06 [loop_unroll]: 6.30002e-06 [a_1]: 0.00014376 [with_stream_mark]: 1.555e-05 [recompute_prepare]: 6.83e-06 [updatestate_depend_eliminate]: 1.516e-05 [updatestate_assign_eliminate]: 3.28998e-06 [updatestate_loads_eliminate]: 2.53998e-06 [parameter_eliminate]: 1.27e-06 [a_2]: 7.99e-05 [accelerated_algorithm]: 6.85002e-06 [shard]: 1.71002e-06 [meta_shard_fg_expand]: 1.44998e-06 [shard_inline]: 6.43e-06 [merge_send_recv]: 6.86999e-06 [auto_parallel]: 7.15e-06 [parallel]: 6.10002e-06 [flash_sp]: 4.13999e-06 [merge_comm]: 3.28e-06 [allreduce_fusion]: 3.25e-06 [matmul_add_comm_reduction]: 7.21999e-06 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 7.19001e-06 [virtual_dataset]: 5.99e-06 [get_grad_eliminate_]: 5.62001e-06 [virtual_output]: 5.66e-06 [merge_forward]: 3.09999e-06 [cell_reuse_recompute_pass]: 1.37999e-06 [offload_activation]: 7.4e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.121e-05 [merge_recompute_call_nodes]: 1.30999e-06 [before_grad]: 1.146e-05 [set_forward_comm_id_for_comm_node_pass]: 3.32997e-06 [meta_fg_expand]: 2.06e-06 [flash_sp_send_recv_attached]: 1.18001e-06 [receive_attached]: 2.16e-06 [after_resolve]: 1.12e-05 [a_after_grad]: 9.47001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.99999e-06 [auto_monad_grad]: 1.37999e-06 [auto_monad_eliminator]: 8.39002e-06 [cse]: 1.569e-05 [a_3]: 3.537e-05 [py_interpret_to_execute_after_opt_a]: 1.204e-05 [slice_cell_reuse_recomputed_activation]: 2.11e-06 [rewriter_after_opt_a]: 3.793e-05 [convert_after_rewriter]: 7.08998e-06 [order_py_execute_after_rewriter]: 5.13002e-06 [mutable_eliminate]: 0.00060789 [opt_b]: 0.00042189, [1] [Cycle 1]: 0.0002023, [7] [b_1]: 0.00012506 [b_2]: 8.01001e-06 [updatestate_depend_eliminate]: 7.88001e-06 [updatestate_assign_eliminate]: 2.39001e-06 [updatestate_loads_eliminate]: 2.22999e-06 [renormalize]: 9.30013e-07 [cse]: 1.934e-05 [optimize_parallel_all_gather_comm]: 2.161e-05 [overlap_param_gather]: 2.29001e-06 [cconv]: 3.163e-05 [loop_unroll]: 0.00045697 [opt_after_cconv]: 0.00010548, [1] [Cycle 1]: 9.897e-05, [7] [c_1]: 3.181e-05 [parameter_eliminate]: 4.33001e-06 [updatestate_depend_eliminate]: 5.50001e-06 [updatestate_assign_eliminate]: 2.41998e-06 [updatestate_loads_eliminate]: 2.31e-06 [cse]: 1.779e-05 [renormalize]: 4.59986e-07 [remove_dup_value]: 1.338e-05 [tuple_transform]: 7.857e-05, [1] [Cycle 1]: 7.409e-05, [4] [d_1]: 4.653e-05 [none_parameter_eliminate]: 1.63002e-06 [renormalize]: 2.29978e-07 [switch_simplify]: 6.76e-06 [partial_unused_args_eliminate]: 1.72999e-06 [add_recomputation]: 5.095e-05 [cse_after_recomputation]: 2.083e-05, [1] [Cycle 1]: 1.634e-05, [1] [cse]: 1.088e-05 [environ_conv]: 5.40999e-06 [swap_dp_allreduce_reducescatter]: 5.10999e-06 [bias_add_comm_swap]: 2.55002e-06 [label_micro_interleaved_index]: 4.05e-06 [label_fine_grained_interleaved_index]: 2.93e-06 [merge_cast_opt]: 1.21002e-06 [slice_recompute_activation]: 2.11003e-06 [micro_interleaved_order_control]: 2.76999e-06 [assign_add_opt]: 1.14998e-06 [ForceFp32Comm]: 8.39995e-07 [remove_cast_before_assign_add]: 1.00001e-06 [full_micro_interleaved_order_control]: 2.39001e-06 [reorder_send_recv_between_fp_bp]: 2.64001e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 9.49978e-07 [interleave_split_concat_branches]: 1.12999e-06 [interleave_parallel_branches]: 1.26002e-06 [overlap_opt_shard_in_pipeline]: 1.15001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.17001e-06 [control_data_broadcast_order]: 1.219e-05 [grouped_pairwise_exchange_alltoall]: 2.24999e-06 [offloading_packed_experts]: 4.13999e-06 [overlap_recompute_and_grad_model_parallel]: 7.19001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.93002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.58e-06 [overlap_grad_ring_attention]: 4.35e-06 [overlap_grad_flash_sp]: 2.211e-05 [begin_end_overlap_inline]: 6.19999e-07 [split_matmul_comm_elemetwise]: 2.12999e-06 [split_layernorm_comm]: 1.62001e-06 [handle_group_info]: 1.15999e-06 [symbol_engine_optimizer]: 7.973e-05, [1] [Cycle 1]: 7.495e-05, [6] [build]: 3.7e-06 [elim_shapecalc]: 1.083e-05 [elim_not_effective]: 1.334e-05 [opt_reshape]: 7.28999e-06 [fold_const_symbol]: 9.97999e-06 [renormalize]: 2.50002e-07 [detach_backward]: 2.17999e-06 [pipeline_parallel_scheduler]: 1.59e-06 [auto_monad_reorder]: 1.728e-05 [get_jit_bprop_graph]: 1.64e-06 [rewriter_after_jit_bprop_graph]: 3.92002e-06 [opt_after_jit_grad]: 0.00046689 [validate]: 4.133e-05 Sums bootstrap : 0.000489s : 1.10% type_inference : 0.038970s : 87.65% event_method : 0.000023s : 0.05% auto_monad : 0.000065s : 0.15% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000041s : 0.09% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000028s : 0.06% optimize.rewriter_before_opt_a : 0.000090s : 0.20% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000051s : 0.11% optimize.opt_a.loop_unroll : 0.000036s : 0.08% optimize.opt_a.a_1 : 0.000823s : 1.85% optimize.opt_a.with_stream_mark : 0.000037s : 0.08% optimize.opt_a.recompute_prepare : 0.000018s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000020s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000168s : 0.38% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.03% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.03% optimize.opt_a.merge_send_recv : 0.000016s : 0.04% optimize.opt_a.auto_parallel : 0.000015s : 0.03% optimize.opt_a.parallel : 0.000027s : 0.06% optimize.opt_a.flash_sp : 0.000013s : 0.03% optimize.opt_a.merge_comm : 0.000008s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.04% optimize.opt_a.virtual_dataset : 0.000013s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.03% optimize.opt_a.virtual_output : 0.000012s : 0.03% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000018s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000022s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000024s : 0.06% optimize.opt_a.a_after_grad : 0.000020s : 0.05% optimize.opt_a.renormalize : 0.000792s : 1.78% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.06% optimize.opt_a.cse : 0.000046s : 0.10% optimize.opt_a.a_3 : 0.000086s : 0.19% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000038s : 0.09% optimize.convert_after_rewriter : 0.000007s : 0.02% optimize.order_py_execute_after_rewriter : 0.000005s : 0.01% optimize.mutable_eliminate : 0.000608s : 1.37% optimize.opt_b.b_1 : 0.000125s : 0.28% optimize.opt_b.b_2 : 0.000008s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000019s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.05% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000032s : 0.07% optimize.loop_unroll : 0.000457s : 1.03% optimize.opt_after_cconv.c_1 : 0.000032s : 0.07% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000018s : 0.04% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.03% optimize.tuple_transform.d_1 : 0.000047s : 0.10% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000051s : 0.11% optimize.cse_after_recomputation.cse : 0.000011s : 0.02% optimize.environ_conv : 0.000005s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000022s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000017s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000467s : 1.05% validate : 0.000041s : 0.09% Time group info: ------[substitution.] 0.000242 34 14.49% : 0.000035s : 6: substitution.arithmetic_simplify 0.78% : 0.000002s : 2: substitution.elim_not_effective 0.60% : 0.000001s : 2: substitution.fold_const_symbol 2.73% : 0.000007s : 4: substitution.graph_param_transform 69.13% : 0.000167s : 4: substitution.inline 1.64% : 0.000004s : 4: substitution.j_node_and_user_rematch 1.95% : 0.000005s : 4: substitution.remove_not_recompute_node 2.04% : 0.000005s : 4: substitution.replace_old_param 6.63% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.038896 2 97.96% : 0.038101s : 1: type_inference.infer 2.04% : 0.000795s : 1: type_inference.specialize ------[replace.] 0.000068 8 63.24% : 0.000043s : 4: replace.inline 36.76% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000178 8 92.06% : 0.000164s : 4: match.inline 7.94% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000210 1278 0.89% : 0.000002s : 13: predicate.accumulaten_eliminater 0.68% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.58% : 0.000001s : 8: predicate.addn_check_dump 0.96% : 0.000002s : 13: predicate.addn_zero_filter 0.76% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.78% : 0.000006s : 21: predicate.arithmetic_simplify 0.91% : 0.000002s : 13: predicate.cast_eliminate 0.63% : 0.000001s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.50% : 0.000001s : 8: predicate.depend_value_elim 0.93% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.95% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.01% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.39% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.29% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_depend_swap 1.65% : 0.000003s : 25: predicate.environ_get_eliminate 1.14% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.40% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.51% : 0.000005s : 21: predicate.float_depend_g_call 0.48% : 0.000001s : 8: predicate.float_environ_get_switch 0.74% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.55% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.59% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 6.45% : 0.000014s : 58: predicate.inline 0.95% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.13% : 0.000002s : 8: predicate.less_batch_normalization 1.81% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.46% : 0.000005s : 38: predicate.load_eliminater 0.86% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.36% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.49% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 8: predicate.merge_addn 0.54% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 13: predicate.minmaximum_grad 1.46% : 0.000003s : 4: predicate.mutable_eliminate 0.45% : 0.000001s : 4: predicate.opt_reshape 0.43% : 0.000001s : 4: predicate.parallel_virtual_node 1.64% : 0.000003s : 21: predicate.partial_defer_inline 1.60% : 0.000003s : 21: predicate.partial_eliminate 0.87% : 0.000002s : 13: predicate.print_const_string_wrapper 0.55% : 0.000001s : 8: predicate.reduce_all_const_elim 1.39% : 0.000003s : 13: predicate.reduce_eliminate 2.38% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 8: predicate.remove_not_recompute_node 1.33% : 0.000003s : 25: predicate.replace_applicator 0.55% : 0.000001s : 8: predicate.replace_old_param 0.36% : 0.000001s : 4: predicate.reset_defer_inline 0.96% : 0.000002s : 13: predicate.reshape_eliminate 0.59% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 4: predicate.row_tensor_eliminate 0.80% : 0.000002s : 8: predicate.same_eliminate 0.43% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 8: predicate.shard_identity_eliminate 0.72% : 0.000002s : 8: predicate.special_op_eliminate 0.69% : 0.000001s : 8: predicate.specialize_transform 0.89% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.91% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.51% : 0.000003s : 21: predicate.switch_defer_inline 2.05% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.24% : 0.000011s : 67: predicate.switch_simplify 0.89% : 0.000002s : 13: predicate.tile_eliminate 0.91% : 0.000002s : 13: predicate.transpose_eliminate 1.48% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.34% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.42% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.19% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.62% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.32% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.13% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 4: predicate.value_based_eliminate 0.66% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.52% : 0.000001s : 8: predicate.virtual_output_eliminate 0.32% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.68% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000625 11 55.40% : 0.000347s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.60% : 0.000279s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.060665 192 0.01% : 0.000004s : 1: ForceFp32Comm 6.09% : 0.003692s : 1: add_attr 6.06% : 0.003678s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000055s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.12% : 0.000071s : 1: auto_monad 0.03% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.86% : 0.000524s : 1: bootstrap 0.06% : 0.000035s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000015s : 1: control_data_broadcast_order 0.02% : 0.000010s : 1: convert_after_rewriter 0.04% : 0.000024s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000008s : 1: environ_conv 0.05% : 0.000030s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.77% : 0.000465s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.02% : 0.000618s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000018s : 1: opt.transform.mutable_eliminate 2.08% : 0.001262s : 78: opt.transform.opt_a 0.05% : 0.000030s : 1: opt.transform.opt_after_cconv 0.04% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.17% : 0.000102s : 28: opt.transform.opt_b 0.08% : 0.000051s : 2: opt.transform.opt_trans_graph 0.06% : 0.000038s : 4: opt.transform.symbol_engine_opt 4.89% : 0.002966s : 1: opt_a 0.18% : 0.000109s : 1: opt_after_cconv 0.78% : 0.000476s : 1: opt_after_jit_grad 0.71% : 0.000429s : 1: opt_b 8.94% : 0.005422s : 1: optimize 0.04% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.04% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000014s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000004s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.07% : 0.000045s : 1: pre_auto_parallel 0.05% : 0.000032s : 1: py_interpret_to_execute 0.03% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000017s : 1: remove_dup_value 0.72% : 0.000434s : 1: renormalize.infer 0.57% : 0.000348s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000042s : 1: rewriter_after_opt_a 0.16% : 0.000095s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.14% : 0.000082s : 1: symbol_engine_optimizer 0.13% : 0.000081s : 1: tuple_transform 64.28% : 0.038995s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:33.775.573 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:33.775.877 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0894541, [21] [bootstrap]: 0.0004315 [type_inference]: 0.078193 [event_method]: 1.904e-05 [auto_monad]: 6.45e-05 [graph_reusing]: 6.28e-06 [inline]: 2.32001e-06 [add_attr]: 0.00346344, [1] [add_attr_with_inline]: 0.0034521, [1] [Cycle 1]: 8.99e-05, [2] [tag_attr]: 2.302e-05 [meta_addattr_fg_expand]: 6.04999e-06 [parallel-infer-symbol]: 3.71001e-06 [pre_auto_parallel]: 3.902e-05 [insert-virtual-dataset]: 2.27999e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 1.92001e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.00601221, [53] [py_interpret_to_execute]: 3.203e-05 [rewriter_before_opt_a]: 9.282e-05 [opt_a]: 0.00349883, [2] [Cycle 1]: 0.00258994, [45] [expand_dump_flag]: 3.63e-06 [switch_simplify]: 4.34e-05 [loop_unroll]: 3.136e-05 [a_1]: 0.00067094 [with_stream_mark]: 1.81e-05 [recompute_prepare]: 1.073e-05 [updatestate_depend_eliminate]: 4.83001e-06 [updatestate_assign_eliminate]: 3.48e-06 [updatestate_loads_eliminate]: 3.28e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 0.00013389 [accelerated_algorithm]: 7.77e-06 [shard]: 2.09e-06 [meta_shard_fg_expand]: 1.87999e-06 [shard_inline]: 6.83998e-06 [merge_send_recv]: 8.2e-06 [auto_parallel]: 6.95002e-06 [parallel]: 2.03e-05 [flash_sp]: 5.06e-05 [merge_comm]: 5.92001e-06 [allreduce_fusion]: 3.92002e-06 [matmul_add_comm_reduction]: 1.048e-05 [allreduce_slice_to_reducescatter]: 9.89996e-07 [virtual_shard_identity]: 1.081e-05 [virtual_dataset]: 7.16001e-06 [get_grad_eliminate_]: 6.81001e-06 [virtual_output]: 9.063e-05 [merge_forward]: 5.20001e-06 [cell_reuse_recompute_pass]: 1.50001e-06 [offload_activation]: 1.191e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.931e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.21e-05 [set_forward_comm_id_for_comm_node_pass]: 4.52e-06 [meta_fg_expand]: 3.06001e-06 [flash_sp_send_recv_attached]: 3.23e-06 [receive_attached]: 2.36e-06 [after_resolve]: 1.346e-05 [a_after_grad]: 1.034e-05 [renormalize]: 0.00079839 [add_forward_monad_depend]: 6.58e-06 [auto_monad_grad]: 2.54001e-06 [auto_monad_eliminator]: 1.647e-05 [cse]: 2.857e-05 [a_3]: 6.319e-05 [Cycle 2]: 0.00089294, [45] [expand_dump_flag]: 1.79e-06 [switch_simplify]: 7.93001e-06 [loop_unroll]: 6.39001e-06 [a_1]: 0.00014874 [with_stream_mark]: 1.453e-05 [recompute_prepare]: 6.41e-06 [updatestate_depend_eliminate]: 3.53999e-06 [updatestate_assign_eliminate]: 2.75997e-06 [updatestate_loads_eliminate]: 2.66999e-06 [parameter_eliminate]: 1.52999e-06 [a_2]: 0.00010843 [accelerated_algorithm]: 6.51999e-06 [shard]: 1.30999e-06 [meta_shard_fg_expand]: 2.05002e-06 [shard_inline]: 6.11e-06 [merge_send_recv]: 5.84e-06 [auto_parallel]: 6.14999e-06 [parallel]: 5.79999e-06 [flash_sp]: 3.7e-06 [merge_comm]: 4.09002e-06 [allreduce_fusion]: 3.78999e-06 [matmul_add_comm_reduction]: 7.35e-06 [allreduce_slice_to_reducescatter]: 3.30008e-07 [virtual_shard_identity]: 1.047e-05 [virtual_dataset]: 6.21998e-06 [get_grad_eliminate_]: 6.30997e-06 [virtual_output]: 5.86e-06 [merge_forward]: 4.74e-06 [cell_reuse_recompute_pass]: 2.13002e-06 [offload_activation]: 7.86001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.552e-05 [merge_recompute_call_nodes]: 1.36998e-06 [before_grad]: 9.76e-06 [set_forward_comm_id_for_comm_node_pass]: 3.57002e-06 [meta_fg_expand]: 2.34001e-06 [flash_sp_send_recv_attached]: 9.09989e-07 [receive_attached]: 1.41002e-06 [after_resolve]: 1.364e-05 [a_after_grad]: 1.107e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.01003e-06 [auto_monad_grad]: 1.60999e-06 [auto_monad_eliminator]: 9.20999e-06 [cse]: 1.705e-05 [a_3]: 5.382e-05 [py_interpret_to_execute_after_opt_a]: 1.609e-05 [slice_cell_reuse_recomputed_activation]: 5.14e-06 [rewriter_after_opt_a]: 4.253e-05 [convert_after_rewriter]: 1.064e-05 [order_py_execute_after_rewriter]: 7.77e-06 [mutable_eliminate]: 0.00063309 [opt_b]: 0.0002794, [1] [Cycle 1]: 0.00026921, [7] [b_1]: 0.00016817 [b_2]: 9.06002e-06 [updatestate_depend_eliminate]: 8.36002e-06 [updatestate_assign_eliminate]: 2.81999e-06 [updatestate_loads_eliminate]: 3.03e-06 [renormalize]: 1.01997e-06 [cse]: 2e-05 [optimize_parallel_all_gather_comm]: 2.214e-05 [overlap_param_gather]: 5.52001e-06 [cconv]: 3.612e-05 [loop_unroll]: 0.00048047 [opt_after_cconv]: 0.00013381, [1] [Cycle 1]: 0.00012475, [7] [c_1]: 3.366e-05 [parameter_eliminate]: 4.17e-06 [updatestate_depend_eliminate]: 7.2e-06 [updatestate_assign_eliminate]: 2.44999e-06 [updatestate_loads_eliminate]: 2.52001e-06 [cse]: 1.886e-05 [renormalize]: 4.2998e-07 [remove_dup_value]: 1.675e-05 [tuple_transform]: 9.281e-05, [1] [Cycle 1]: 8.49e-05, [4] [d_1]: 4.55e-05 [none_parameter_eliminate]: 1.85001e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 7.49002e-06 [partial_unused_args_eliminate]: 4.23999e-06 [add_recomputation]: 5.237e-05 [cse_after_recomputation]: 2.857e-05, [1] [Cycle 1]: 2.101e-05, [1] [cse]: 1.157e-05 [environ_conv]: 9.37001e-06 [swap_dp_allreduce_reducescatter]: 8.17e-06 [bias_add_comm_swap]: 5.59998e-06 [label_micro_interleaved_index]: 7.54002e-06 [label_fine_grained_interleaved_index]: 5.54998e-06 [merge_cast_opt]: 3.9e-06 [slice_recompute_activation]: 4.78001e-06 [micro_interleaved_order_control]: 5.35999e-06 [assign_add_opt]: 3.91999e-06 [ForceFp32Comm]: 4.03001e-06 [remove_cast_before_assign_add]: 3.58999e-06 [full_micro_interleaved_order_control]: 5.52999e-06 [reorder_send_recv_between_fp_bp]: 6.93e-06 [comm_op_add_attrs]: 3.66999e-06 [add_comm_op_reuse_tag]: 3.58999e-06 [interleave_split_concat_branches]: 3.81999e-06 [interleave_parallel_branches]: 3.63999e-06 [overlap_opt_shard_in_pipeline]: 3.91999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.97999e-06 [control_data_broadcast_order]: 1.712e-05 [grouped_pairwise_exchange_alltoall]: 4.58999e-06 [offloading_packed_experts]: 8.03999e-06 [overlap_recompute_and_grad_model_parallel]: 7.8e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.68999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.72998e-06 [overlap_recompute_comm]: 4.67998e-06 [overlap_grad_ring_attention]: 6.69001e-06 [overlap_grad_flash_sp]: 2.474e-05 [begin_end_overlap_inline]: 2.94999e-06 [split_matmul_comm_elemetwise]: 4.62998e-06 [split_layernorm_comm]: 4.28001e-06 [handle_group_info]: 3.58e-06 [symbol_engine_optimizer]: 0.00010747, [1] [Cycle 1]: 0.00010014, [6] [build]: 3.99002e-06 [elim_shapecalc]: 1.238e-05 [elim_not_effective]: 1.463e-05 [opt_reshape]: 8.06001e-06 [fold_const_symbol]: 1.147e-05 [renormalize]: 2.50002e-07 [detach_backward]: 4.19002e-06 [pipeline_parallel_scheduler]: 1.79e-06 [auto_monad_reorder]: 2.177e-05 [get_jit_bprop_graph]: 2.26e-06 [rewriter_after_jit_bprop_graph]: 5.61e-06 [opt_after_jit_grad]: 0.00053906 [validate]: 4.055e-05 Sums bootstrap : 0.000431s : 0.51% type_inference : 0.078193s : 92.89% event_method : 0.000019s : 0.02% auto_monad : 0.000064s : 0.08% graph_reusing : 0.000006s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000039s : 0.05% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000032s : 0.04% optimize.rewriter_before_opt_a : 0.000093s : 0.11% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000051s : 0.06% optimize.opt_a.loop_unroll : 0.000038s : 0.04% optimize.opt_a.a_1 : 0.000820s : 0.97% optimize.opt_a.with_stream_mark : 0.000033s : 0.04% optimize.opt_a.recompute_prepare : 0.000017s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000242s : 0.29% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.02% optimize.opt_a.merge_send_recv : 0.000014s : 0.02% optimize.opt_a.auto_parallel : 0.000013s : 0.02% optimize.opt_a.parallel : 0.000026s : 0.03% optimize.opt_a.flash_sp : 0.000054s : 0.06% optimize.opt_a.merge_comm : 0.000010s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.03% optimize.opt_a.virtual_dataset : 0.000013s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.02% optimize.opt_a.virtual_output : 0.000096s : 0.11% optimize.opt_a.merge_forward : 0.000010s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000020s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000027s : 0.03% optimize.opt_a.a_after_grad : 0.000021s : 0.03% optimize.opt_a.renormalize : 0.000798s : 0.95% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.03% optimize.opt_a.cse : 0.000046s : 0.05% optimize.opt_a.a_3 : 0.000117s : 0.14% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000043s : 0.05% optimize.convert_after_rewriter : 0.000011s : 0.01% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000633s : 0.75% optimize.opt_b.b_1 : 0.000168s : 0.20% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000020s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.03% optimize.overlap_param_gather : 0.000006s : 0.01% optimize.cconv : 0.000036s : 0.04% optimize.loop_unroll : 0.000480s : 0.57% optimize.opt_after_cconv.c_1 : 0.000034s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.02% optimize.tuple_transform.d_1 : 0.000045s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000004s : 0.01% optimize.add_recomputation : 0.000052s : 0.06% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000006s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000007s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000025s : 0.03% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000539s : 0.64% validate : 0.000041s : 0.05% Time group info: ------[substitution.] 0.000236 34 14.89% : 0.000035s : 6: substitution.arithmetic_simplify 0.76% : 0.000002s : 2: substitution.elim_not_effective 0.69% : 0.000002s : 2: substitution.fold_const_symbol 2.60% : 0.000006s : 4: substitution.graph_param_transform 68.15% : 0.000161s : 4: substitution.inline 1.92% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.15% : 0.000005s : 4: substitution.remove_not_recompute_node 2.41% : 0.000006s : 4: substitution.replace_old_param 6.42% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.078135 2 99.05% : 0.077395s : 1: type_inference.infer 0.95% : 0.000739s : 1: type_inference.specialize ------[replace.] 0.000063 8 63.28% : 0.000040s : 4: replace.inline 36.72% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000171 8 92.21% : 0.000158s : 4: match.inline 7.79% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000211 1278 0.98% : 0.000002s : 13: predicate.accumulaten_eliminater 0.75% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 8: predicate.addn_check_dump 0.94% : 0.000002s : 13: predicate.addn_zero_filter 0.77% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.28% : 0.000005s : 21: predicate.arithmetic_simplify 0.89% : 0.000002s : 13: predicate.cast_eliminate 0.67% : 0.000001s : 8: predicate.check_bprop_eliminate 0.53% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.68% : 0.000001s : 8: predicate.depend_value_elim 0.94% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.07% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.81% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.21% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.36% : 0.000001s : 4: predicate.elim_not_effective 0.34% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_depend_swap 1.67% : 0.000004s : 25: predicate.environ_get_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.39% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.58% : 0.000005s : 21: predicate.float_depend_g_call 0.55% : 0.000001s : 8: predicate.float_environ_get_switch 0.80% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 4: predicate.fold_const_symbol 0.60% : 0.000001s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.60% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 6.34% : 0.000013s : 58: predicate.inline 0.83% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.78% : 0.000002s : 8: predicate.less_batch_normalization 1.88% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.40% : 0.000005s : 38: predicate.load_eliminater 1.05% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.51% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.54% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 8: predicate.merge_addn 0.60% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.66% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.76% : 0.000002s : 13: predicate.minmaximum_grad 1.28% : 0.000003s : 4: predicate.mutable_eliminate 0.37% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 1.74% : 0.000004s : 21: predicate.partial_defer_inline 1.59% : 0.000003s : 21: predicate.partial_eliminate 0.86% : 0.000002s : 13: predicate.print_const_string_wrapper 0.70% : 0.000001s : 8: predicate.reduce_all_const_elim 1.20% : 0.000003s : 13: predicate.reduce_eliminate 2.47% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 8: predicate.remove_not_recompute_node 1.50% : 0.000003s : 25: predicate.replace_applicator 0.68% : 0.000001s : 8: predicate.replace_old_param 0.24% : 0.000001s : 4: predicate.reset_defer_inline 1.06% : 0.000002s : 13: predicate.reshape_eliminate 0.62% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 4: predicate.row_tensor_eliminate 0.80% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.86% : 0.000002s : 8: predicate.shard_identity_eliminate 0.67% : 0.000001s : 8: predicate.special_op_eliminate 1.03% : 0.000002s : 8: predicate.specialize_transform 0.83% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.47% : 0.000003s : 21: predicate.switch_defer_inline 1.98% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.17% : 0.000011s : 67: predicate.switch_simplify 1.03% : 0.000002s : 13: predicate.tile_eliminate 0.90% : 0.000002s : 13: predicate.transpose_eliminate 1.55% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.46% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.02% : 0.000006s : 33: predicate.tuple_list_get_item_eliminator 1.32% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 1.99% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.97% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.34% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.03% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 0.60% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.79% : 0.000002s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.59% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000622 11 56.91% : 0.000354s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.09% : 0.000268s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.101258 192 0.01% : 0.000007s : 1: ForceFp32Comm 3.43% : 0.003474s : 1: add_attr 3.41% : 0.003456s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.06% : 0.000056s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.07% : 0.000073s : 1: auto_monad 0.03% : 0.000030s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.47% : 0.000480s : 1: bootstrap 0.04% : 0.000039s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.02% : 0.000020s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.03% : 0.000032s : 1: cse_after_recomputation 0.01% : 0.000007s : 1: dataset_repeat_opt 0.02% : 0.000020s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.03% : 0.000030s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.48% : 0.000487s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.63% : 0.000640s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000017s : 1: opt.transform.mutable_eliminate 1.36% : 0.001378s : 78: opt.transform.opt_a 0.03% : 0.000032s : 1: opt.transform.opt_after_cconv 0.03% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000105s : 28: opt.transform.opt_b 0.05% : 0.000051s : 2: opt.transform.opt_trans_graph 0.04% : 0.000043s : 4: opt.transform.symbol_engine_opt 3.46% : 0.003503s : 1: opt_a 0.14% : 0.000137s : 1: opt_after_cconv 0.54% : 0.000550s : 1: opt_after_jit_grad 0.28% : 0.000284s : 1: opt_b 6.27% : 0.006349s : 1: optimize 0.03% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000011s : 1: order_py_execute_after_rewriter 0.03% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000007s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.05% : 0.000046s : 1: pre_auto_parallel 0.04% : 0.000036s : 1: py_interpret_to_execute 0.02% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.02% : 0.000020s : 1: remove_dup_value 0.43% : 0.000436s : 1: renormalize.infer 0.35% : 0.000353s : 1: renormalize.specialize 0.01% : 0.000010s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000046s : 1: rewriter_after_opt_a 0.10% : 0.000097s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.11% : 0.000111s : 1: symbol_engine_optimizer 0.09% : 0.000096s : 1: tuple_transform 77.27% : 0.078238s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:34.914.438 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0445832, [21] [bootstrap]: 0.00039786 [type_inference]: 0.0059605 [event_method]: 1.891e-05 [auto_monad]: 0.00323832 [graph_reusing]: 7.45998e-06 [inline]: 3.23998e-06 [add_attr]: 0.00769664, [1] [add_attr_with_inline]: 0.00768317, [1] [Cycle 1]: 7.912e-05, [2] [tag_attr]: 2.422e-05 [meta_addattr_fg_expand]: 6.15002e-06 [parallel-infer-symbol]: 3.75e-06 [pre_auto_parallel]: 4.21e-05 [insert-virtual-dataset]: 2.79001e-06 [parallel-infer-symbol-second]: 1.03001e-06 [dataset_repeat_opt]: 2.54001e-06 [pipeline_split]: 1.94e-06 [optimize]: 0.0262879, [53] [py_interpret_to_execute]: 2.986e-05 [rewriter_before_opt_a]: 0.00010366 [opt_a]: 0.0235074, [2] [Cycle 1]: 0.0226401, [45] [expand_dump_flag]: 3.56001e-06 [switch_simplify]: 4.521e-05 [loop_unroll]: 3.087e-05 [a_1]: 0.00068081 [with_stream_mark]: 2.204e-05 [recompute_prepare]: 1.18e-05 [updatestate_depend_eliminate]: 4.28001e-06 [updatestate_assign_eliminate]: 3.36999e-06 [updatestate_loads_eliminate]: 2.91e-06 [parameter_eliminate]: 1.77999e-06 [a_2]: 9.047e-05 [accelerated_algorithm]: 8.50001e-06 [shard]: 2.19999e-06 [meta_shard_fg_expand]: 1.86e-06 [shard_inline]: 6.65998e-06 [merge_send_recv]: 8.80999e-06 [auto_parallel]: 7.53999e-06 [parallel]: 2.014e-05 [flash_sp]: 1.085e-05 [merge_comm]: 3.84002e-06 [allreduce_fusion]: 3.55998e-06 [matmul_add_comm_reduction]: 1.164e-05 [allreduce_slice_to_reducescatter]: 7.79983e-07 [virtual_shard_identity]: 8.80999e-06 [virtual_dataset]: 6.49001e-06 [get_grad_eliminate_]: 6.27001e-06 [virtual_output]: 6.71999e-06 [merge_forward]: 4.55001e-06 [cell_reuse_recompute_pass]: 1.17e-06 [offload_activation]: 1.038e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.378e-05 [merge_recompute_call_nodes]: 1.56998e-06 [before_grad]: 1.189e-05 [set_forward_comm_id_for_comm_node_pass]: 4.55001e-06 [meta_fg_expand]: 3.2e-06 [flash_sp_send_recv_attached]: 3.35e-06 [receive_attached]: 2.85002e-06 [after_resolve]: 1.381e-05 [a_after_grad]: 1.109e-05 [renormalize]: 0.0211007 [add_forward_monad_depend]: 1.222e-05 [auto_monad_grad]: 2.98e-06 [auto_monad_eliminator]: 2.581e-05 [cse]: 3.607e-05 [a_3]: 6.95e-05 [Cycle 2]: 0.00085181, [45] [expand_dump_flag]: 2.47001e-06 [switch_simplify]: 1.079e-05 [loop_unroll]: 8.17e-06 [a_1]: 0.00017135 [with_stream_mark]: 2.312e-05 [recompute_prepare]: 8.55001e-06 [updatestate_depend_eliminate]: 4.33999e-06 [updatestate_assign_eliminate]: 3.03e-06 [updatestate_loads_eliminate]: 3.75e-06 [parameter_eliminate]: 2.66e-06 [a_2]: 8.475e-05 [accelerated_algorithm]: 7.43e-06 [shard]: 2.99999e-06 [meta_shard_fg_expand]: 2.54001e-06 [shard_inline]: 6.90998e-06 [merge_send_recv]: 1.185e-05 [auto_parallel]: 9.66e-06 [parallel]: 9.53002e-06 [flash_sp]: 4.50001e-06 [merge_comm]: 3.93001e-06 [allreduce_fusion]: 4.3e-06 [matmul_add_comm_reduction]: 1.072e-05 [allreduce_slice_to_reducescatter]: 9.60019e-07 [virtual_shard_identity]: 9.69e-06 [virtual_dataset]: 7.21001e-06 [get_grad_eliminate_]: 6.69001e-06 [virtual_output]: 6.35002e-06 [merge_forward]: 4.63001e-06 [cell_reuse_recompute_pass]: 3.71001e-06 [offload_activation]: 1.16e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.567e-05 [merge_recompute_call_nodes]: 1.67999e-06 [before_grad]: 1.231e-05 [set_forward_comm_id_for_comm_node_pass]: 4.06001e-06 [meta_fg_expand]: 2.91e-06 [flash_sp_send_recv_attached]: 1.69e-06 [receive_attached]: 2.71e-06 [after_resolve]: 1.38e-05 [a_after_grad]: 1.078e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 2.81999e-06 [auto_monad_grad]: 2.10002e-06 [auto_monad_eliminator]: 1.216e-05 [cse]: 2.128e-05 [a_3]: 4.432e-05 [py_interpret_to_execute_after_opt_a]: 2.064e-05 [slice_cell_reuse_recomputed_activation]: 2.62001e-06 [rewriter_after_opt_a]: 4.628e-05 [convert_after_rewriter]: 8.07e-06 [order_py_execute_after_rewriter]: 5.55001e-06 [mutable_eliminate]: 0.00084175 [opt_b]: 0.00024909, [1] [Cycle 1]: 0.00023968, [7] [b_1]: 0.00013768 [b_2]: 1.017e-05 [updatestate_depend_eliminate]: 1.061e-05 [updatestate_assign_eliminate]: 3.34001e-06 [updatestate_loads_eliminate]: 3.5e-06 [renormalize]: 9.5999e-07 [cse]: 3.38e-05 [optimize_parallel_all_gather_comm]: 2.249e-05 [overlap_param_gather]: 2.34001e-06 [cconv]: 3.64e-05 [loop_unroll]: 0.00063372 [opt_after_cconv]: 0.0001267, [1] [Cycle 1]: 0.00011851, [7] [c_1]: 3.728e-05 [parameter_eliminate]: 5.49e-06 [updatestate_depend_eliminate]: 7.25998e-06 [updatestate_assign_eliminate]: 2.70997e-06 [updatestate_loads_eliminate]: 2.79999e-06 [cse]: 2.591e-05 [renormalize]: 7.30011e-07 [remove_dup_value]: 1.478e-05 [tuple_transform]: 8.33e-05, [1] [Cycle 1]: 7.825e-05, [4] [d_1]: 4.808e-05 [none_parameter_eliminate]: 1.71e-06 [renormalize]: 3.89991e-07 [switch_simplify]: 8.36002e-06 [partial_unused_args_eliminate]: 1.87001e-06 [add_recomputation]: 5.657e-05 [cse_after_recomputation]: 2.425e-05, [1] [Cycle 1]: 1.902e-05, [1] [cse]: 1.239e-05 [environ_conv]: 5.92999e-06 [swap_dp_allreduce_reducescatter]: 5.67999e-06 [bias_add_comm_swap]: 3.41001e-06 [label_micro_interleaved_index]: 5.61e-06 [label_fine_grained_interleaved_index]: 3.23e-06 [merge_cast_opt]: 1.62999e-06 [slice_recompute_activation]: 2.07999e-06 [micro_interleaved_order_control]: 2.79999e-06 [assign_add_opt]: 1.72001e-06 [ForceFp32Comm]: 1.37999e-06 [remove_cast_before_assign_add]: 1.10999e-06 [full_micro_interleaved_order_control]: 2.12999e-06 [reorder_send_recv_between_fp_bp]: 2.86e-06 [comm_op_add_attrs]: 1.06002e-06 [add_comm_op_reuse_tag]: 1.00999e-06 [interleave_split_concat_branches]: 1.34e-06 [interleave_parallel_branches]: 1.14e-06 [overlap_opt_shard_in_pipeline]: 1.85001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.89e-06 [control_data_broadcast_order]: 1.427e-05 [grouped_pairwise_exchange_alltoall]: 2.11e-06 [offloading_packed_experts]: 4.3e-06 [overlap_recompute_and_grad_model_parallel]: 5.31002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.23002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.43002e-06 [overlap_grad_ring_attention]: 5.44998e-06 [overlap_grad_flash_sp]: 2.402e-05 [begin_end_overlap_inline]: 6.89994e-07 [split_matmul_comm_elemetwise]: 2.15002e-06 [split_layernorm_comm]: 1.58002e-06 [handle_group_info]: 1.00001e-06 [symbol_engine_optimizer]: 8.928e-05, [1] [Cycle 1]: 8.471e-05, [6] [build]: 3.94002e-06 [elim_shapecalc]: 1.356e-05 [elim_not_effective]: 1.421e-05 [opt_reshape]: 8.18999e-06 [fold_const_symbol]: 1.096e-05 [renormalize]: 1.60013e-07 [detach_backward]: 2.37999e-06 [pipeline_parallel_scheduler]: 1.69e-06 [auto_monad_reorder]: 1.771e-05 [get_jit_bprop_graph]: 1.92999e-06 [rewriter_after_jit_bprop_graph]: 4.16001e-06 [opt_after_jit_grad]: 0.00065775 [validate]: 4.376e-05 Sums bootstrap : 0.000398s : 1.11% type_inference : 0.005960s : 16.69% event_method : 0.000019s : 0.05% auto_monad : 0.003238s : 9.07% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000042s : 0.12% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.08% optimize.rewriter_before_opt_a : 0.000104s : 0.29% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000056s : 0.16% optimize.opt_a.loop_unroll : 0.000039s : 0.11% optimize.opt_a.a_1 : 0.000852s : 2.39% optimize.opt_a.with_stream_mark : 0.000045s : 0.13% optimize.opt_a.recompute_prepare : 0.000020s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000175s : 0.49% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.04% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.04% optimize.opt_a.merge_send_recv : 0.000021s : 0.06% optimize.opt_a.auto_parallel : 0.000017s : 0.05% optimize.opt_a.parallel : 0.000030s : 0.08% optimize.opt_a.flash_sp : 0.000015s : 0.04% optimize.opt_a.merge_comm : 0.000008s : 0.02% optimize.opt_a.allreduce_fusion : 0.000008s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.05% optimize.opt_a.virtual_dataset : 0.000014s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.04% optimize.opt_a.virtual_output : 0.000013s : 0.04% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000022s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000024s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000006s : 0.02% optimize.opt_a.after_resolve : 0.000028s : 0.08% optimize.opt_a.a_after_grad : 0.000022s : 0.06% optimize.opt_a.renormalize : 0.021101s : 59.07% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.04% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000038s : 0.11% optimize.opt_a.cse : 0.000057s : 0.16% optimize.opt_a.a_3 : 0.000114s : 0.32% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000046s : 0.13% optimize.convert_after_rewriter : 0.000008s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000842s : 2.36% optimize.opt_b.b_1 : 0.000138s : 0.39% optimize.opt_b.b_2 : 0.000010s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000034s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000036s : 0.10% optimize.loop_unroll : 0.000634s : 1.77% optimize.opt_after_cconv.c_1 : 0.000037s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000026s : 0.07% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.04% optimize.tuple_transform.d_1 : 0.000048s : 0.13% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000057s : 0.16% optimize.cse_after_recomputation.cse : 0.000012s : 0.03% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000006s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000024s : 0.07% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.05% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000658s : 1.84% validate : 0.000044s : 0.12% Time group info: ------[substitution.] 0.000252 34 16.75% : 0.000042s : 6: substitution.arithmetic_simplify 0.76% : 0.000002s : 2: substitution.elim_not_effective 0.71% : 0.000002s : 2: substitution.fold_const_symbol 2.53% : 0.000006s : 4: substitution.graph_param_transform 65.82% : 0.000166s : 4: substitution.inline 2.32% : 0.000006s : 4: substitution.j_node_and_user_rematch 2.07% : 0.000005s : 4: substitution.remove_not_recompute_node 2.60% : 0.000007s : 4: substitution.replace_old_param 6.44% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005895 2 87.88% : 0.005181s : 1: type_inference.infer 12.12% : 0.000715s : 1: type_inference.specialize ------[replace.] 0.000067 8 62.42% : 0.000042s : 4: replace.inline 37.58% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000178 8 91.91% : 0.000163s : 4: match.inline 8.09% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000232 1278 0.97% : 0.000002s : 13: predicate.accumulaten_eliminater 0.72% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.60% : 0.000001s : 8: predicate.addn_check_dump 0.97% : 0.000002s : 13: predicate.addn_zero_filter 0.75% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.56% : 0.000006s : 21: predicate.arithmetic_simplify 0.87% : 0.000002s : 13: predicate.cast_eliminate 0.71% : 0.000002s : 8: predicate.check_bprop_eliminate 0.46% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.65% : 0.000002s : 8: predicate.depend_value_elim 0.95% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.81% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.99% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 4: predicate.elim_not_effective 0.49% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 17: predicate.environ_get_add_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_depend_swap 1.62% : 0.000004s : 25: predicate.environ_get_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.29% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.34% : 0.000005s : 21: predicate.float_depend_g_call 0.66% : 0.000002s : 8: predicate.float_environ_get_switch 0.71% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 4: predicate.fold_const_symbol 0.69% : 0.000002s : 8: predicate.get_grad_eliminate 0.16% : 0.000000s : 4: predicate.graph_param_transform 0.51% : 0.000001s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 6.01% : 0.000014s : 58: predicate.inline 1.02% : 0.000002s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.06% : 0.000002s : 8: predicate.less_batch_normalization 1.72% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.33% : 0.000005s : 38: predicate.load_eliminater 1.10% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.52% : 0.000006s : 34: predicate.loop_unroll_before_grad 1.63% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 8: predicate.merge_addn 0.62% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.69% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.76% : 0.000002s : 13: predicate.minmaximum_grad 1.33% : 0.000003s : 4: predicate.mutable_eliminate 0.41% : 0.000001s : 4: predicate.opt_reshape 0.38% : 0.000001s : 4: predicate.parallel_virtual_node 1.63% : 0.000004s : 21: predicate.partial_defer_inline 1.47% : 0.000003s : 21: predicate.partial_eliminate 0.94% : 0.000002s : 13: predicate.print_const_string_wrapper 0.57% : 0.000001s : 8: predicate.reduce_all_const_elim 1.07% : 0.000002s : 13: predicate.reduce_eliminate 2.30% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 8: predicate.remove_not_recompute_node 1.49% : 0.000003s : 25: predicate.replace_applicator 0.55% : 0.000001s : 8: predicate.replace_old_param 0.38% : 0.000001s : 4: predicate.reset_defer_inline 1.10% : 0.000003s : 13: predicate.reshape_eliminate 0.69% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 4: predicate.row_tensor_eliminate 1.14% : 0.000003s : 8: predicate.same_eliminate 0.45% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.85% : 0.000002s : 8: predicate.shard_identity_eliminate 0.81% : 0.000002s : 8: predicate.special_op_eliminate 0.62% : 0.000001s : 8: predicate.specialize_transform 1.21% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.38% : 0.000003s : 21: predicate.switch_defer_inline 1.85% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.89% : 0.000011s : 67: predicate.switch_simplify 0.82% : 0.000002s : 13: predicate.tile_eliminate 1.07% : 0.000002s : 13: predicate.transpose_eliminate 1.75% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.46% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.49% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.35% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.25% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.71% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.28% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.07% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 4: predicate.value_based_eliminate 0.66% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.74% : 0.000002s : 8: predicate.virtual_output_eliminate 0.34% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.48% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001295 11 23.66% : 0.000306s : 5: func_graph_cloner_run.FuncGraphClonerGraph 76.34% : 0.000988s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.101148 192 0.00% : 0.000004s : 1: ForceFp32Comm 7.62% : 0.007704s : 1: add_attr 7.60% : 0.007687s : 1: add_attr_with_inline 0.01% : 0.000005s : 1: add_comm_op_reuse_tag 0.06% : 0.000061s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 3.23% : 0.003263s : 1: auto_monad 0.02% : 0.000022s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.42% : 0.000426s : 1: bootstrap 0.04% : 0.000041s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000018s : 1: control_data_broadcast_order 0.01% : 0.000012s : 1: convert_after_rewriter 0.03% : 0.000027s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.03% : 0.000026s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000014s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000007s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.01% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000009s : 1: label_micro_interleaved_index 0.64% : 0.000644s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 0.85% : 0.000860s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000026s : 1: opt.transform.mutable_eliminate 1.32% : 0.001333s : 78: opt.transform.opt_a 0.03% : 0.000035s : 1: opt.transform.opt_after_cconv 0.03% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.11% : 0.000111s : 28: opt.transform.opt_b 0.05% : 0.000053s : 2: opt.transform.opt_trans_graph 0.04% : 0.000042s : 4: opt.transform.symbol_engine_opt 23.24% : 0.023512s : 1: opt_a 0.13% : 0.000131s : 1: opt_after_cconv 0.66% : 0.000669s : 1: opt_after_jit_grad 0.25% : 0.000253s : 1: opt_b 26.00% : 0.026294s : 1: optimize 0.03% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.03% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.05% : 0.000046s : 1: pre_auto_parallel 0.03% : 0.000034s : 1: py_interpret_to_execute 0.02% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000018s : 1: remove_dup_value 19.66% : 0.019887s : 1: renormalize.infer 1.18% : 0.001193s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000054s : 1: rewriter_after_opt_a 0.11% : 0.000110s : 1: rewriter_before_opt_a 0.01% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000006s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000093s : 1: symbol_engine_optimizer 0.09% : 0.000086s : 1: tuple_transform 5.91% : 0.005980s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:35.739.089 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:35.739.363 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.120669, [21] [bootstrap]: 0.00042705 [type_inference]: 0.0886784 [event_method]: 2.406e-05 [auto_monad]: 6.863e-05 [graph_reusing]: 5.94999e-06 [inline]: 2.63e-06 [add_attr]: 0.00366975, [1] [add_attr_with_inline]: 0.00365726, [1] [Cycle 1]: 9.384e-05, [2] [tag_attr]: 2.298e-05 [meta_addattr_fg_expand]: 6.14001e-06 [parallel-infer-symbol]: 4.1e-06 [pre_auto_parallel]: 4.187e-05 [insert-virtual-dataset]: 2.34999e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 1.99999e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.00601202, [53] [py_interpret_to_execute]: 3.476e-05 [rewriter_before_opt_a]: 9.487e-05 [opt_a]: 0.0033783, [2] [Cycle 1]: 0.00246426, [45] [expand_dump_flag]: 3.05002e-06 [switch_simplify]: 4.309e-05 [loop_unroll]: 2.98e-05 [a_1]: 0.00067202 [with_stream_mark]: 2.22e-05 [recompute_prepare]: 1.081e-05 [updatestate_depend_eliminate]: 4.4e-06 [updatestate_assign_eliminate]: 3.56001e-06 [updatestate_loads_eliminate]: 3.11999e-06 [parameter_eliminate]: 2.53003e-06 [a_2]: 0.00011791 [accelerated_algorithm]: 7.55998e-06 [shard]: 2.41e-06 [meta_shard_fg_expand]: 1.99999e-06 [shard_inline]: 7.13e-06 [merge_send_recv]: 8.48999e-06 [auto_parallel]: 8.05e-06 [parallel]: 1.909e-05 [flash_sp]: 8.33001e-06 [merge_comm]: 4.46002e-06 [allreduce_fusion]: 3.97e-06 [matmul_add_comm_reduction]: 1.038e-05 [allreduce_slice_to_reducescatter]: 1.14e-06 [virtual_shard_identity]: 9.18002e-06 [virtual_dataset]: 7.20003e-06 [get_grad_eliminate_]: 6.38e-06 [virtual_output]: 7.41999e-06 [merge_forward]: 4.25999e-06 [cell_reuse_recompute_pass]: 1.99e-06 [offload_activation]: 1.043e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.444e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.137e-05 [set_forward_comm_id_for_comm_node_pass]: 4.26001e-06 [meta_fg_expand]: 2.92002e-06 [flash_sp_send_recv_attached]: 3.45998e-06 [receive_attached]: 2.32001e-06 [after_resolve]: 1.208e-05 [a_after_grad]: 1.067e-05 [renormalize]: 0.00079471 [add_forward_monad_depend]: 2.529e-05 [auto_monad_grad]: 2.84001e-06 [auto_monad_eliminator]: 1.956e-05 [cse]: 2.965e-05 [a_3]: 7.178e-05 [Cycle 2]: 0.00089649, [45] [expand_dump_flag]: 2.31998e-06 [switch_simplify]: 8.97e-06 [loop_unroll]: 6.04001e-06 [a_1]: 0.00014821 [with_stream_mark]: 1.697e-05 [recompute_prepare]: 7.13998e-06 [updatestate_depend_eliminate]: 3.97002e-06 [updatestate_assign_eliminate]: 2.55997e-06 [updatestate_loads_eliminate]: 2.31e-06 [parameter_eliminate]: 1.58002e-06 [a_2]: 0.00010828 [accelerated_algorithm]: 7.26999e-06 [shard]: 1.92001e-06 [meta_shard_fg_expand]: 1.82001e-06 [shard_inline]: 6.61999e-06 [merge_send_recv]: 7.15e-06 [auto_parallel]: 7.88001e-06 [parallel]: 8.46002e-06 [flash_sp]: 4.14002e-06 [merge_comm]: 3.48e-06 [allreduce_fusion]: 3.77002e-06 [matmul_add_comm_reduction]: 7.38999e-06 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 6.79001e-06 [virtual_dataset]: 6.45997e-06 [get_grad_eliminate_]: 5.87001e-06 [virtual_output]: 5.79e-06 [merge_forward]: 3.71001e-06 [cell_reuse_recompute_pass]: 2.22001e-06 [offload_activation]: 8.93002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.469e-05 [merge_recompute_call_nodes]: 1.44998e-06 [before_grad]: 1.021e-05 [set_forward_comm_id_for_comm_node_pass]: 3.75e-06 [meta_fg_expand]: 2.68998e-06 [flash_sp_send_recv_attached]: 1.49998e-06 [receive_attached]: 2.14e-06 [after_resolve]: 1.246e-05 [a_after_grad]: 9.52999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.01e-06 [auto_monad_grad]: 1.89999e-06 [auto_monad_eliminator]: 9.20999e-06 [cse]: 1.688e-05 [a_3]: 5.269e-05 [py_interpret_to_execute_after_opt_a]: 1.822e-05 [slice_cell_reuse_recomputed_activation]: 4.85001e-06 [rewriter_after_opt_a]: 4.586e-05 [convert_after_rewriter]: 1.099e-05 [order_py_execute_after_rewriter]: 8.26002e-06 [mutable_eliminate]: 0.00073587 [opt_b]: 0.00028583, [1] [Cycle 1]: 0.00027421, [7] [b_1]: 0.00016945 [b_2]: 8.35001e-06 [updatestate_depend_eliminate]: 9.22001e-06 [updatestate_assign_eliminate]: 2.97002e-06 [updatestate_loads_eliminate]: 2.61999e-06 [renormalize]: 7.7e-07 [cse]: 2.212e-05 [optimize_parallel_all_gather_comm]: 2.336e-05 [overlap_param_gather]: 5.55001e-06 [cconv]: 3.668e-05 [loop_unroll]: 0.00047638 [opt_after_cconv]: 0.00013588, [1] [Cycle 1]: 0.00012597, [7] [c_1]: 3.219e-05 [parameter_eliminate]: 5.57001e-06 [updatestate_depend_eliminate]: 5.99e-06 [updatestate_assign_eliminate]: 3.04999e-06 [updatestate_loads_eliminate]: 2.66999e-06 [cse]: 1.852e-05 [renormalize]: 8.30012e-07 [remove_dup_value]: 1.707e-05 [tuple_transform]: 9.403e-05, [1] [Cycle 1]: 8.635e-05, [4] [d_1]: 4.665e-05 [none_parameter_eliminate]: 1.44e-06 [renormalize]: 4.19997e-07 [switch_simplify]: 7.45e-06 [partial_unused_args_eliminate]: 4.52e-06 [add_recomputation]: 5.349e-05 [cse_after_recomputation]: 2.991e-05, [1] [Cycle 1]: 2.174e-05, [1] [cse]: 1.207e-05 [environ_conv]: 8.32e-06 [swap_dp_allreduce_reducescatter]: 8.74998e-06 [bias_add_comm_swap]: 5.65001e-06 [label_micro_interleaved_index]: 7.16999e-06 [label_fine_grained_interleaved_index]: 6.30002e-06 [merge_cast_opt]: 3.83001e-06 [slice_recompute_activation]: 4.82e-06 [micro_interleaved_order_control]: 5.14e-06 [assign_add_opt]: 3.85e-06 [ForceFp32Comm]: 3.38e-06 [remove_cast_before_assign_add]: 3.58e-06 [full_micro_interleaved_order_control]: 4.68999e-06 [reorder_send_recv_between_fp_bp]: 6.11e-06 [comm_op_add_attrs]: 4.08999e-06 [add_comm_op_reuse_tag]: 3.17002e-06 [interleave_split_concat_branches]: 4.07e-06 [interleave_parallel_branches]: 3.48999e-06 [overlap_opt_shard_in_pipeline]: 3.93001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.60999e-06 [control_data_broadcast_order]: 1.634e-05 [grouped_pairwise_exchange_alltoall]: 3.91001e-06 [offloading_packed_experts]: 6.42001e-06 [overlap_recompute_and_grad_model_parallel]: 8.34998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.98001e-06 [overlap_recompute_allgather_and_fa_grad]: 4.63001e-06 [overlap_recompute_comm]: 5.59e-06 [overlap_grad_ring_attention]: 7.50998e-06 [overlap_grad_flash_sp]: 2.6e-05 [begin_end_overlap_inline]: 3.35003e-06 [split_matmul_comm_elemetwise]: 5.04e-06 [split_layernorm_comm]: 4.01001e-06 [handle_group_info]: 3.26001e-06 [symbol_engine_optimizer]: 9.907e-05, [1] [Cycle 1]: 9.17e-05, [6] [build]: 3.46999e-06 [elim_shapecalc]: 9.92001e-06 [elim_not_effective]: 1.354e-05 [opt_reshape]: 7.49002e-06 [fold_const_symbol]: 1.093e-05 [renormalize]: 3.09985e-07 [detach_backward]: 3.28e-06 [pipeline_parallel_scheduler]: 2.61e-06 [auto_monad_reorder]: 2.085e-05 [get_jit_bprop_graph]: 1.943e-05 [rewriter_after_jit_bprop_graph]: 1.268e-05 [opt_after_jit_grad]: 0.00087145 [validate]: 5.603e-05 Sums bootstrap : 0.000427s : 0.45% type_inference : 0.088678s : 93.31% event_method : 0.000024s : 0.03% auto_monad : 0.000069s : 0.07% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000042s : 0.04% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000035s : 0.04% optimize.rewriter_before_opt_a : 0.000095s : 0.10% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000052s : 0.05% optimize.opt_a.loop_unroll : 0.000036s : 0.04% optimize.opt_a.a_1 : 0.000820s : 0.86% optimize.opt_a.with_stream_mark : 0.000039s : 0.04% optimize.opt_a.recompute_prepare : 0.000018s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000226s : 0.24% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.01% optimize.opt_a.merge_send_recv : 0.000016s : 0.02% optimize.opt_a.auto_parallel : 0.000016s : 0.02% optimize.opt_a.parallel : 0.000028s : 0.03% optimize.opt_a.flash_sp : 0.000012s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.02% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000019s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000025s : 0.03% optimize.opt_a.a_after_grad : 0.000020s : 0.02% optimize.opt_a.renormalize : 0.000795s : 0.84% optimize.opt_a.add_forward_monad_depend : 0.000027s : 0.03% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.03% optimize.opt_a.cse : 0.000047s : 0.05% optimize.opt_a.a_3 : 0.000124s : 0.13% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000046s : 0.05% optimize.convert_after_rewriter : 0.000011s : 0.01% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000736s : 0.77% optimize.opt_b.b_1 : 0.000169s : 0.18% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.02% optimize.overlap_param_gather : 0.000006s : 0.01% optimize.cconv : 0.000037s : 0.04% optimize.loop_unroll : 0.000476s : 0.50% optimize.opt_after_cconv.c_1 : 0.000032s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.02% optimize.tuple_transform.d_1 : 0.000047s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000053s : 0.06% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000003s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000016s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000005s : 0.00% optimize.overlap_recompute_comm : 0.000006s : 0.01% optimize.overlap_grad_ring_attention : 0.000008s : 0.01% optimize.overlap_grad_flash_sp : 0.000026s : 0.03% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000003s : 0.00% auto_monad_reorder : 0.000021s : 0.02% get_jit_bprop_graph : 0.000019s : 0.02% rewriter_after_jit_bprop_graph : 0.000013s : 0.01% opt_after_jit_grad : 0.000871s : 0.92% validate : 0.000056s : 0.06% Time group info: ------[substitution.] 0.000244 34 15.36% : 0.000037s : 6: substitution.arithmetic_simplify 0.78% : 0.000002s : 2: substitution.elim_not_effective 0.69% : 0.000002s : 2: substitution.fold_const_symbol 2.68% : 0.000007s : 4: substitution.graph_param_transform 68.00% : 0.000166s : 4: substitution.inline 1.91% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.09% : 0.000005s : 4: substitution.remove_not_recompute_node 2.42% : 0.000006s : 4: substitution.replace_old_param 6.08% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.088607 2 98.92% : 0.087650s : 1: type_inference.infer 1.08% : 0.000958s : 1: type_inference.specialize ------[replace.] 0.000063 8 62.87% : 0.000039s : 4: replace.inline 37.13% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000176 8 92.61% : 0.000163s : 4: match.inline 7.39% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000218 1278 0.86% : 0.000002s : 13: predicate.accumulaten_eliminater 1.73% : 0.000004s : 4: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 8: predicate.addn_check_dump 0.97% : 0.000002s : 13: predicate.addn_zero_filter 0.76% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.47% : 0.000005s : 21: predicate.arithmetic_simplify 0.94% : 0.000002s : 13: predicate.cast_eliminate 0.78% : 0.000002s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.56% : 0.000001s : 8: predicate.depend_value_elim 0.86% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.06% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.54% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.33% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_depend_swap 1.69% : 0.000004s : 25: predicate.environ_get_eliminate 1.00% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.43% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.53% : 0.000006s : 21: predicate.float_depend_g_call 0.48% : 0.000001s : 8: predicate.float_environ_get_switch 0.74% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.57% : 0.000001s : 8: predicate.get_grad_eliminate 0.29% : 0.000001s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 6.63% : 0.000014s : 58: predicate.inline 0.70% : 0.000002s : 8: predicate.inline_without_move 0.50% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.80% : 0.000002s : 8: predicate.less_batch_normalization 1.90% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.39% : 0.000005s : 38: predicate.load_eliminater 0.99% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.21% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.54% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.61% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.75% : 0.000002s : 13: predicate.minmaximum_grad 1.37% : 0.000003s : 4: predicate.mutable_eliminate 0.38% : 0.000001s : 4: predicate.opt_reshape 0.46% : 0.000001s : 4: predicate.parallel_virtual_node 1.65% : 0.000004s : 21: predicate.partial_defer_inline 1.56% : 0.000003s : 21: predicate.partial_eliminate 0.81% : 0.000002s : 13: predicate.print_const_string_wrapper 0.72% : 0.000002s : 8: predicate.reduce_all_const_elim 1.15% : 0.000003s : 13: predicate.reduce_eliminate 2.50% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 8: predicate.remove_not_recompute_node 1.41% : 0.000003s : 25: predicate.replace_applicator 0.61% : 0.000001s : 8: predicate.replace_old_param 0.26% : 0.000001s : 4: predicate.reset_defer_inline 0.83% : 0.000002s : 13: predicate.reshape_eliminate 0.78% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 1.02% : 0.000002s : 8: predicate.same_eliminate 0.50% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 8: predicate.shard_identity_eliminate 0.70% : 0.000002s : 8: predicate.special_op_eliminate 0.67% : 0.000001s : 8: predicate.specialize_transform 1.05% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.43% : 0.000003s : 21: predicate.switch_defer_inline 2.06% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.99% : 0.000011s : 67: predicate.switch_simplify 0.84% : 0.000002s : 13: predicate.tile_eliminate 0.84% : 0.000002s : 13: predicate.transpose_eliminate 1.67% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.47% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.38% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.04% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.81% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.26% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.93% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 4: predicate.value_based_eliminate 0.61% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.64% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000652 11 45.98% : 0.000300s : 5: func_graph_cloner_run.FuncGraphClonerGraph 54.02% : 0.000352s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.132532 192 0.00% : 0.000006s : 1: ForceFp32Comm 2.78% : 0.003681s : 1: add_attr 2.76% : 0.003662s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.04% : 0.000058s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.06% : 0.000077s : 1: auto_monad 15.14% : 0.020063s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.36% : 0.000471s : 1: bootstrap 0.03% : 0.000040s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000019s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.03% : 0.000033s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000020s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.03% : 0.000036s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.07% : 0.000093s : 1: get_jit_bprop_graph 0.01% : 0.000012s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.36% : 0.000482s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.56% : 0.000745s : 1: mutable_eliminate 0.01% : 0.000009s : 1: offloading_packed_experts 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000018s : 1: opt.transform.mutable_eliminate 0.96% : 0.001275s : 78: opt.transform.opt_a 0.02% : 0.000031s : 1: opt.transform.opt_after_cconv 0.03% : 0.000039s : 1: opt.transform.opt_after_jit_grad 0.08% : 0.000105s : 28: opt.transform.opt_b 0.04% : 0.000052s : 2: opt.transform.opt_trans_graph 0.03% : 0.000039s : 4: opt.transform.symbol_engine_opt 2.55% : 0.003382s : 1: opt_a 0.11% : 0.000139s : 1: opt_after_cconv 0.67% : 0.000884s : 1: opt_after_jit_grad 0.22% : 0.000290s : 1: opt_b 4.79% : 0.006350s : 1: optimize 0.02% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000011s : 1: order_py_execute_after_rewriter 0.02% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000009s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.04% : 0.000050s : 1: pre_auto_parallel 0.03% : 0.000039s : 1: py_interpret_to_execute 0.02% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.02% : 0.000021s : 1: remove_dup_value 0.32% : 0.000419s : 1: renormalize.infer 0.28% : 0.000367s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000020s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000050s : 1: rewriter_after_opt_a 0.08% : 0.000099s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.08% : 0.000102s : 1: symbol_engine_optimizer 0.07% : 0.000097s : 1: tuple_transform 66.95% : 0.088733s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:36.646.487 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0169429, [21] [bootstrap]: 0.0004335 [type_inference]: 0.00640537 [event_method]: 2.032e-05 [auto_monad]: 6.531e-05 [graph_reusing]: 5.44e-06 [inline]: 3.01999e-06 [add_attr]: 0.00387424, [1] [add_attr_with_inline]: 0.00386287, [1] [Cycle 1]: 7.195e-05, [2] [tag_attr]: 2.557e-05 [meta_addattr_fg_expand]: 5.55001e-06 [parallel-infer-symbol]: 4.08001e-06 [pre_auto_parallel]: 4.013e-05 [insert-virtual-dataset]: 2.55997e-06 [parallel-infer-symbol-second]: 8.09989e-07 [dataset_repeat_opt]: 2.44999e-06 [pipeline_split]: 2.01e-06 [optimize]: 0.00530622, [53] [py_interpret_to_execute]: 2.987e-05 [rewriter_before_opt_a]: 9.123e-05 [opt_a]: 0.00294551, [2] [Cycle 1]: 0.00223318, [45] [expand_dump_flag]: 2.88e-06 [switch_simplify]: 4.524e-05 [loop_unroll]: 3.059e-05 [a_1]: 0.00067808 [with_stream_mark]: 1.813e-05 [recompute_prepare]: 9.36e-06 [updatestate_depend_eliminate]: 4.95999e-06 [updatestate_assign_eliminate]: 3.47002e-06 [updatestate_loads_eliminate]: 3.33998e-06 [parameter_eliminate]: 2.01e-06 [a_2]: 8.926e-05 [accelerated_algorithm]: 7.53e-06 [shard]: 2.12001e-06 [meta_shard_fg_expand]: 1.84998e-06 [shard_inline]: 6.63998e-06 [merge_send_recv]: 8.99998e-06 [auto_parallel]: 6.94999e-06 [parallel]: 1.88e-05 [flash_sp]: 8.82e-06 [merge_comm]: 4.37e-06 [allreduce_fusion]: 3.63e-06 [matmul_add_comm_reduction]: 1.015e-05 [allreduce_slice_to_reducescatter]: 7.90023e-07 [virtual_shard_identity]: 8.02e-06 [virtual_dataset]: 6.86999e-06 [get_grad_eliminate_]: 6.76999e-06 [virtual_output]: 6.62002e-06 [merge_forward]: 3.68e-06 [cell_reuse_recompute_pass]: 1.57001e-06 [offload_activation]: 9.91e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.23e-05 [merge_recompute_call_nodes]: 1.55999e-06 [before_grad]: 1.063e-05 [set_forward_comm_id_for_comm_node_pass]: 3.63e-06 [meta_fg_expand]: 3.4e-06 [flash_sp_send_recv_attached]: 2.66e-06 [receive_attached]: 1.96e-06 [after_resolve]: 1.237e-05 [a_after_grad]: 1.017e-05 [renormalize]: 0.00080634 [add_forward_monad_depend]: 6.81999e-06 [auto_monad_grad]: 2.87002e-06 [auto_monad_eliminator]: 1.645e-05 [cse]: 2.998e-05 [a_3]: 5.336e-05 [Cycle 2]: 0.00070151, [45] [expand_dump_flag]: 1.64e-06 [switch_simplify]: 8.60999e-06 [loop_unroll]: 7.21999e-06 [a_1]: 0.00015149 [with_stream_mark]: 1.434e-05 [recompute_prepare]: 7.25e-06 [updatestate_depend_eliminate]: 3.24001e-06 [updatestate_assign_eliminate]: 2.39999e-06 [updatestate_loads_eliminate]: 2.63e-06 [parameter_eliminate]: 1.40999e-06 [a_2]: 7.846e-05 [accelerated_algorithm]: 6.41e-06 [shard]: 1.09998e-06 [meta_shard_fg_expand]: 1.50999e-06 [shard_inline]: 6.44999e-06 [merge_send_recv]: 5.37999e-06 [auto_parallel]: 5.86e-06 [parallel]: 4.75999e-06 [flash_sp]: 3.41001e-06 [merge_comm]: 3.78999e-06 [allreduce_fusion]: 3.61999e-06 [matmul_add_comm_reduction]: 5.66e-06 [allreduce_slice_to_reducescatter]: 5.79981e-07 [virtual_shard_identity]: 7.11001e-06 [virtual_dataset]: 6.19999e-06 [get_grad_eliminate_]: 6.20002e-06 [virtual_output]: 5.74999e-06 [merge_forward]: 3.23998e-06 [cell_reuse_recompute_pass]: 2.26998e-06 [offload_activation]: 6.73e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.19e-05 [merge_recompute_call_nodes]: 7.00005e-07 [before_grad]: 9.32001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.43999e-06 [meta_fg_expand]: 2.12999e-06 [flash_sp_send_recv_attached]: 9.79984e-07 [receive_attached]: 1.32e-06 [after_resolve]: 1.128e-05 [a_after_grad]: 8.88002e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.43002e-06 [auto_monad_grad]: 1.19e-06 [auto_monad_eliminator]: 7.59002e-06 [cse]: 1.484e-05 [a_3]: 4.247e-05 [py_interpret_to_execute_after_opt_a]: 1.147e-05 [slice_cell_reuse_recomputed_activation]: 2.09e-06 [rewriter_after_opt_a]: 7.71e-05 [convert_after_rewriter]: 7.65998e-06 [order_py_execute_after_rewriter]: 5.49998e-06 [mutable_eliminate]: 0.00068738 [opt_b]: 0.00021113, [1] [Cycle 1]: 0.00020395, [7] [b_1]: 0.00012888 [b_2]: 8.61997e-06 [updatestate_depend_eliminate]: 6.04999e-06 [updatestate_assign_eliminate]: 2.39001e-06 [updatestate_loads_eliminate]: 2.32999e-06 [renormalize]: 7.30011e-07 [cse]: 1.917e-05 [optimize_parallel_all_gather_comm]: 1.705e-05 [overlap_param_gather]: 1.89e-06 [cconv]: 2.891e-05 [loop_unroll]: 0.00049113 [opt_after_cconv]: 0.00011177, [1] [Cycle 1]: 0.0001046, [7] [c_1]: 3.354e-05 [parameter_eliminate]: 3.26001e-06 [updatestate_depend_eliminate]: 6.05002e-06 [updatestate_assign_eliminate]: 2.54001e-06 [updatestate_loads_eliminate]: 2.54999e-06 [cse]: 1.948e-05 [renormalize]: 6.39993e-07 [remove_dup_value]: 1.367e-05 [tuple_transform]: 8.215e-05, [1] [Cycle 1]: 7.739e-05, [4] [d_1]: 4.892e-05 [none_parameter_eliminate]: 1.61002e-06 [renormalize]: 3.39991e-07 [switch_simplify]: 7.18998e-06 [partial_unused_args_eliminate]: 2.06003e-06 [add_recomputation]: 5.145e-05 [cse_after_recomputation]: 2.426e-05, [1] [Cycle 1]: 1.876e-05, [1] [cse]: 1.252e-05 [environ_conv]: 5.71003e-06 [swap_dp_allreduce_reducescatter]: 5.47001e-06 [bias_add_comm_swap]: 2.84001e-06 [label_micro_interleaved_index]: 4.93001e-06 [label_fine_grained_interleaved_index]: 2.77002e-06 [merge_cast_opt]: 1.24e-06 [slice_recompute_activation]: 2.41e-06 [micro_interleaved_order_control]: 2.63e-06 [assign_add_opt]: 1.40999e-06 [ForceFp32Comm]: 8.80013e-07 [remove_cast_before_assign_add]: 1.25999e-06 [full_micro_interleaved_order_control]: 2.54999e-06 [reorder_send_recv_between_fp_bp]: 2.98e-06 [comm_op_add_attrs]: 1.05001e-06 [add_comm_op_reuse_tag]: 1.31002e-06 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.07998e-06 [overlap_opt_shard_in_pipeline]: 1.37999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.74e-06 [control_data_broadcast_order]: 1.283e-05 [grouped_pairwise_exchange_alltoall]: 1.86e-06 [offloading_packed_experts]: 3.88999e-06 [overlap_recompute_and_grad_model_parallel]: 4.70999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.42e-06 [overlap_recompute_comm]: 2.75997e-06 [overlap_grad_ring_attention]: 4.02998e-06 [overlap_grad_flash_sp]: 2.075e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.11e-06 [split_layernorm_comm]: 1.55999e-06 [handle_group_info]: 9.20001e-07 [symbol_engine_optimizer]: 7.871e-05, [1] [Cycle 1]: 7.37e-05, [6] [build]: 3.55998e-06 [elim_shapecalc]: 9.75002e-06 [elim_not_effective]: 1.37e-05 [opt_reshape]: 7.33e-06 [fold_const_symbol]: 1.019e-05 [renormalize]: 2.79979e-07 [detach_backward]: 2.56e-06 [pipeline_parallel_scheduler]: 1.55999e-06 [auto_monad_reorder]: 1.68e-05 [get_jit_bprop_graph]: 1.94999e-06 [rewriter_after_jit_bprop_graph]: 4.07e-06 [opt_after_jit_grad]: 0.00054391 [validate]: 4.468e-05 Sums bootstrap : 0.000433s : 3.59% type_inference : 0.006405s : 53.05% event_method : 0.000020s : 0.17% auto_monad : 0.000065s : 0.54% graph_reusing : 0.000005s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.21% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000040s : 0.33% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000030s : 0.25% optimize.rewriter_before_opt_a : 0.000091s : 0.76% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000054s : 0.45% optimize.opt_a.loop_unroll : 0.000038s : 0.31% optimize.opt_a.a_1 : 0.000830s : 6.87% optimize.opt_a.with_stream_mark : 0.000032s : 0.27% optimize.opt_a.recompute_prepare : 0.000017s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000168s : 1.39% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.12% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.11% optimize.opt_a.merge_send_recv : 0.000014s : 0.12% optimize.opt_a.auto_parallel : 0.000013s : 0.11% optimize.opt_a.parallel : 0.000024s : 0.20% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.13% optimize.opt_a.virtual_dataset : 0.000013s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.10% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.20% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.17% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.20% optimize.opt_a.a_after_grad : 0.000019s : 0.16% optimize.opt_a.renormalize : 0.000806s : 6.68% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.20% optimize.opt_a.cse : 0.000045s : 0.37% optimize.opt_a.a_3 : 0.000096s : 0.79% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000077s : 0.64% optimize.convert_after_rewriter : 0.000008s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000687s : 5.69% optimize.opt_b.b_1 : 0.000129s : 1.07% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000019s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.14% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000029s : 0.24% optimize.loop_unroll : 0.000491s : 4.07% optimize.opt_after_cconv.c_1 : 0.000034s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.16% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000014s : 0.11% optimize.tuple_transform.d_1 : 0.000049s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000051s : 0.43% optimize.cse_after_recomputation.cse : 0.000013s : 0.10% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000013s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.03% optimize.overlap_grad_flash_sp : 0.000021s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000017s : 0.14% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000544s : 4.51% validate : 0.000045s : 0.37% Time group info: ------[substitution.] 0.000239 34 14.56% : 0.000035s : 6: substitution.arithmetic_simplify 0.77% : 0.000002s : 2: substitution.elim_not_effective 0.66% : 0.000002s : 2: substitution.fold_const_symbol 2.63% : 0.000006s : 4: substitution.graph_param_transform 68.05% : 0.000163s : 4: substitution.inline 1.80% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.52% : 0.000006s : 4: substitution.remove_not_recompute_node 2.26% : 0.000005s : 4: substitution.replace_old_param 6.75% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006336 2 87.82% : 0.005564s : 1: type_inference.infer 12.18% : 0.000772s : 1: type_inference.specialize ------[replace.] 0.000066 8 62.38% : 0.000041s : 4: replace.inline 37.62% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000174 8 91.79% : 0.000160s : 4: match.inline 8.21% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000216 1278 0.94% : 0.000002s : 13: predicate.accumulaten_eliminater 0.73% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 8: predicate.addn_check_dump 0.91% : 0.000002s : 13: predicate.addn_zero_filter 0.81% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.60% : 0.000006s : 21: predicate.arithmetic_simplify 0.90% : 0.000002s : 13: predicate.cast_eliminate 0.61% : 0.000001s : 8: predicate.check_bprop_eliminate 0.54% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.61% : 0.000001s : 8: predicate.depend_value_elim 0.88% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.02% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.98% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.45% : 0.000001s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.12% : 0.000002s : 17: predicate.environ_get_depend_swap 1.81% : 0.000004s : 25: predicate.environ_get_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.43% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.41% : 0.000005s : 21: predicate.float_depend_g_call 0.52% : 0.000001s : 8: predicate.float_environ_get_switch 0.86% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.59% : 0.000001s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.44% : 0.000001s : 8: predicate.incorporate_call_switch 6.10% : 0.000013s : 58: predicate.inline 0.66% : 0.000001s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.01% : 0.000002s : 8: predicate.less_batch_normalization 1.82% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.43% : 0.000005s : 38: predicate.load_eliminater 1.06% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.32% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.52% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.72% : 0.000002s : 8: predicate.merge_addn 0.63% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.65% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 13: predicate.minmaximum_grad 1.31% : 0.000003s : 4: predicate.mutable_eliminate 0.34% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 1.73% : 0.000004s : 21: predicate.partial_defer_inline 1.54% : 0.000003s : 21: predicate.partial_eliminate 0.89% : 0.000002s : 13: predicate.print_const_string_wrapper 0.63% : 0.000001s : 8: predicate.reduce_all_const_elim 1.14% : 0.000002s : 13: predicate.reduce_eliminate 2.64% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 8: predicate.remove_not_recompute_node 1.47% : 0.000003s : 25: predicate.replace_applicator 0.51% : 0.000001s : 8: predicate.replace_old_param 0.25% : 0.000001s : 4: predicate.reset_defer_inline 1.04% : 0.000002s : 13: predicate.reshape_eliminate 0.63% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.96% : 0.000002s : 8: predicate.same_eliminate 0.43% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.82% : 0.000002s : 8: predicate.shard_identity_eliminate 0.68% : 0.000001s : 8: predicate.special_op_eliminate 0.63% : 0.000001s : 8: predicate.specialize_transform 0.82% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.48% : 0.000003s : 21: predicate.switch_defer_inline 2.00% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.31% : 0.000011s : 67: predicate.switch_simplify 0.96% : 0.000002s : 13: predicate.tile_eliminate 1.09% : 0.000002s : 13: predicate.transpose_eliminate 1.68% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.19% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.47% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.13% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.69% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.36% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.08% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 4: predicate.value_based_eliminate 0.69% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000574 11 50.04% : 0.000287s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.96% : 0.000287s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028348 192 0.01% : 0.000004s : 1: ForceFp32Comm 13.69% : 0.003880s : 1: add_attr 13.64% : 0.003867s : 1: add_attr_with_inline 0.02% : 0.000005s : 1: add_comm_op_reuse_tag 0.20% : 0.000056s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.25% : 0.000070s : 1: auto_monad 0.07% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.63% : 0.000461s : 1: bootstrap 0.11% : 0.000032s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.10% : 0.000027s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.10% : 0.000027s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.77% : 0.000500s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 2.46% : 0.000696s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 4.50% : 0.001274s : 78: opt.transform.opt_a 0.11% : 0.000032s : 1: opt.transform.opt_after_cconv 0.10% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.37% : 0.000104s : 28: opt.transform.opt_b 0.19% : 0.000054s : 2: opt.transform.opt_trans_graph 0.13% : 0.000037s : 4: opt.transform.symbol_engine_opt 10.40% : 0.002949s : 1: opt_a 0.41% : 0.000115s : 1: opt_after_cconv 1.95% : 0.000554s : 1: opt_after_jit_grad 0.76% : 0.000215s : 1: opt_b 18.74% : 0.005312s : 1: optimize 0.07% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000024s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000006s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.16% : 0.000045s : 1: pre_auto_parallel 0.12% : 0.000034s : 1: py_interpret_to_execute 0.05% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000017s : 1: remove_dup_value 1.55% : 0.000439s : 1: renormalize.infer 1.26% : 0.000358s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.29% : 0.000082s : 1: rewriter_after_opt_a 0.34% : 0.000096s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000006s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.29% : 0.000081s : 1: symbol_engine_optimizer 0.30% : 0.000085s : 1: tuple_transform 22.68% : 0.006429s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:37.458.972 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:37.459.257 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0312978, [21] [bootstrap]: 0.0004294 [type_inference]: 0.00607714 [event_method]: 2.286e-05 [auto_monad]: 6.509e-05 [graph_reusing]: 6.66e-06 [inline]: 2.79001e-06 [add_attr]: 0.0162853, [1] [add_attr_with_inline]: 0.0162715, [1] [Cycle 1]: 0.00011268, [2] [tag_attr]: 2.772e-05 [meta_addattr_fg_expand]: 6.38003e-06 [parallel-infer-symbol]: 3.85e-06 [pre_auto_parallel]: 4.819e-05 [insert-virtual-dataset]: 3.26001e-06 [parallel-infer-symbol-second]: 1.00001e-06 [dataset_repeat_opt]: 1.91e-06 [pipeline_split]: 2.11e-06 [optimize]: 0.00685003, [53] [py_interpret_to_execute]: 3.74e-05 [rewriter_before_opt_a]: 0.00010909 [opt_a]: 0.00390733, [2] [Cycle 1]: 0.00295972, [45] [expand_dump_flag]: 5.29e-06 [switch_simplify]: 5.316e-05 [loop_unroll]: 3.143e-05 [a_1]: 0.00073267 [with_stream_mark]: 2.57e-05 [recompute_prepare]: 1.017e-05 [updatestate_depend_eliminate]: 4.82998e-06 [updatestate_assign_eliminate]: 3.51999e-06 [updatestate_loads_eliminate]: 2.89999e-06 [parameter_eliminate]: 2.44999e-06 [a_2]: 0.00012044 [accelerated_algorithm]: 7.98999e-06 [shard]: 2.17999e-06 [meta_shard_fg_expand]: 2.29001e-06 [shard_inline]: 6.84999e-06 [merge_send_recv]: 9.46e-06 [auto_parallel]: 9.20001e-06 [parallel]: 2.137e-05 [flash_sp]: 9.52001e-06 [merge_comm]: 4.33999e-06 [allreduce_fusion]: 3.86001e-06 [matmul_add_comm_reduction]: 1.089e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 8.47e-06 [virtual_dataset]: 6.90002e-06 [get_grad_eliminate_]: 6.58e-06 [virtual_output]: 6.84999e-06 [merge_forward]: 4.15999e-06 [cell_reuse_recompute_pass]: 1.38002e-06 [offload_activation]: 1.126e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.447e-05 [merge_recompute_call_nodes]: 1.76e-06 [before_grad]: 1.208e-05 [set_forward_comm_id_for_comm_node_pass]: 3.70998e-06 [meta_fg_expand]: 4.05e-06 [flash_sp_send_recv_attached]: 2.64001e-06 [receive_attached]: 2.43e-06 [after_resolve]: 2.039e-05 [a_after_grad]: 1.084e-05 [renormalize]: 0.00094572 [add_forward_monad_depend]: 7.55998e-06 [auto_monad_grad]: 2.68003e-06 [auto_monad_eliminator]: 1.946e-05 [cse]: 3.154e-05 [a_3]: 7.029e-05 [Cycle 2]: 0.00093118, [45] [expand_dump_flag]: 2.01e-06 [switch_simplify]: 8.84e-06 [loop_unroll]: 7.45e-06 [a_1]: 0.00015955 [with_stream_mark]: 1.785e-05 [recompute_prepare]: 7.11001e-06 [updatestate_depend_eliminate]: 3.29001e-06 [updatestate_assign_eliminate]: 2.81e-06 [updatestate_loads_eliminate]: 2.44999e-06 [parameter_eliminate]: 2.14999e-06 [a_2]: 0.00010906 [accelerated_algorithm]: 7.27002e-06 [shard]: 2.04e-06 [meta_shard_fg_expand]: 2.14999e-06 [shard_inline]: 6.63e-06 [merge_send_recv]: 7.54002e-06 [auto_parallel]: 8.2e-06 [parallel]: 7.26001e-06 [flash_sp]: 3.75998e-06 [merge_comm]: 3.84002e-06 [allreduce_fusion]: 3.9e-06 [matmul_add_comm_reduction]: 7.71001e-06 [allreduce_slice_to_reducescatter]: 4.59986e-07 [virtual_shard_identity]: 7.63999e-06 [virtual_dataset]: 6.44001e-06 [get_grad_eliminate_]: 5.96e-06 [virtual_output]: 6.36e-06 [merge_forward]: 3.56001e-06 [cell_reuse_recompute_pass]: 2.84999e-06 [offload_activation]: 9.15001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.46e-05 [merge_recompute_call_nodes]: 1.27999e-06 [before_grad]: 1.368e-05 [set_forward_comm_id_for_comm_node_pass]: 3.83999e-06 [meta_fg_expand]: 2.60002e-06 [flash_sp_send_recv_attached]: 1.35999e-06 [receive_attached]: 2.37001e-06 [after_resolve]: 1.25e-05 [a_after_grad]: 1.016e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.96003e-06 [auto_monad_grad]: 1.17999e-06 [auto_monad_eliminator]: 1.099e-05 [cse]: 1.871e-05 [a_3]: 5.276e-05 [py_interpret_to_execute_after_opt_a]: 2.05e-05 [slice_cell_reuse_recomputed_activation]: 4.90999e-06 [rewriter_after_opt_a]: 4.665e-05 [convert_after_rewriter]: 1.044e-05 [order_py_execute_after_rewriter]: 7.90998e-06 [mutable_eliminate]: 0.00079 [opt_b]: 0.00030116, [1] [Cycle 1]: 0.00028876, [7] [b_1]: 0.0001795 [b_2]: 9.77999e-06 [updatestate_depend_eliminate]: 9.15999e-06 [updatestate_assign_eliminate]: 2.56998e-06 [updatestate_loads_eliminate]: 2.61e-06 [renormalize]: 1.03001e-06 [cse]: 2.52e-05 [optimize_parallel_all_gather_comm]: 2.362e-05 [overlap_param_gather]: 5.99e-06 [cconv]: 3.84e-05 [loop_unroll]: 0.00062875 [opt_after_cconv]: 0.00014578, [1] [Cycle 1]: 0.00013439, [7] [c_1]: 3.431e-05 [parameter_eliminate]: 5.59e-06 [updatestate_depend_eliminate]: 7.21001e-06 [updatestate_assign_eliminate]: 2.68e-06 [updatestate_loads_eliminate]: 2.72001e-06 [cse]: 2.432e-05 [renormalize]: 6.59988e-07 [remove_dup_value]: 3.8e-05 [tuple_transform]: 0.00010413, [1] [Cycle 1]: 9.5e-05, [4] [d_1]: 5.377e-05 [none_parameter_eliminate]: 2.14e-06 [renormalize]: 2.99973e-07 [switch_simplify]: 7.8e-06 [partial_unused_args_eliminate]: 5.02999e-06 [add_recomputation]: 5.758e-05 [cse_after_recomputation]: 3.151e-05, [1] [Cycle 1]: 2.422e-05, [1] [cse]: 1.433e-05 [environ_conv]: 9.91998e-06 [swap_dp_allreduce_reducescatter]: 8.25999e-06 [bias_add_comm_swap]: 5.96998e-06 [label_micro_interleaved_index]: 9.71998e-06 [label_fine_grained_interleaved_index]: 5.72999e-06 [merge_cast_opt]: 4.15999e-06 [slice_recompute_activation]: 4.54002e-06 [micro_interleaved_order_control]: 5.08002e-06 [assign_add_opt]: 3.70998e-06 [ForceFp32Comm]: 3.39001e-06 [remove_cast_before_assign_add]: 3.68e-06 [full_micro_interleaved_order_control]: 4.69002e-06 [reorder_send_recv_between_fp_bp]: 5.68002e-06 [comm_op_add_attrs]: 3.75e-06 [add_comm_op_reuse_tag]: 3.17002e-06 [interleave_split_concat_branches]: 4.03001e-06 [interleave_parallel_branches]: 3.56001e-06 [overlap_opt_shard_in_pipeline]: 3.56999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.65001e-06 [control_data_broadcast_order]: 1.808e-05 [grouped_pairwise_exchange_alltoall]: 4.44002e-06 [offloading_packed_experts]: 7.85998e-06 [overlap_recompute_and_grad_model_parallel]: 7.88999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.9e-06 [overlap_recompute_allgather_and_fa_grad]: 3.93001e-06 [overlap_recompute_comm]: 4.95001e-06 [overlap_grad_ring_attention]: 6.74001e-06 [overlap_grad_flash_sp]: 2.587e-05 [begin_end_overlap_inline]: 3.48999e-06 [split_matmul_comm_elemetwise]: 4.90001e-06 [split_layernorm_comm]: 4.01001e-06 [handle_group_info]: 3.63999e-06 [symbol_engine_optimizer]: 0.0001065, [1] [Cycle 1]: 9.953e-05, [6] [build]: 4.32e-06 [elim_shapecalc]: 1.15e-05 [elim_not_effective]: 1.583e-05 [opt_reshape]: 8.40999e-06 [fold_const_symbol]: 1.104e-05 [renormalize]: 1.59984e-07 [detach_backward]: 5.34e-06 [pipeline_parallel_scheduler]: 1.69e-06 [auto_monad_reorder]: 2.162e-05 [get_jit_bprop_graph]: 1.99e-06 [rewriter_after_jit_bprop_graph]: 6.90998e-06 [opt_after_jit_grad]: 0.00075651 [validate]: 5.31e-05 Sums bootstrap : 0.000429s : 3.34% type_inference : 0.006077s : 47.28% event_method : 0.000023s : 0.18% auto_monad : 0.000065s : 0.51% graph_reusing : 0.000007s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.22% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000048s : 0.37% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000037s : 0.29% optimize.rewriter_before_opt_a : 0.000109s : 0.85% optimize.opt_a.expand_dump_flag : 0.000007s : 0.06% optimize.opt_a.switch_simplify : 0.000062s : 0.48% optimize.opt_a.loop_unroll : 0.000039s : 0.30% optimize.opt_a.a_1 : 0.000892s : 6.94% optimize.opt_a.with_stream_mark : 0.000044s : 0.34% optimize.opt_a.recompute_prepare : 0.000017s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_a.parameter_eliminate : 0.000005s : 0.04% optimize.opt_a.a_2 : 0.000230s : 1.79% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.12% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.10% optimize.opt_a.merge_send_recv : 0.000017s : 0.13% optimize.opt_a.auto_parallel : 0.000017s : 0.14% optimize.opt_a.parallel : 0.000029s : 0.22% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000008s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.13% optimize.opt_a.virtual_dataset : 0.000013s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.10% optimize.opt_a.virtual_output : 0.000013s : 0.10% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.23% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.06% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000033s : 0.26% optimize.opt_a.a_after_grad : 0.000021s : 0.16% optimize.opt_a.renormalize : 0.000946s : 7.36% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.24% optimize.opt_a.cse : 0.000050s : 0.39% optimize.opt_a.a_3 : 0.000123s : 0.96% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.16% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000047s : 0.36% optimize.convert_after_rewriter : 0.000010s : 0.08% optimize.order_py_execute_after_rewriter : 0.000008s : 0.06% optimize.mutable_eliminate : 0.000790s : 6.15% optimize.opt_b.b_1 : 0.000179s : 1.40% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000025s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.18% optimize.overlap_param_gather : 0.000006s : 0.05% optimize.cconv : 0.000038s : 0.30% optimize.loop_unroll : 0.000629s : 4.89% optimize.opt_after_cconv.c_1 : 0.000034s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000024s : 0.19% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000038s : 0.30% optimize.tuple_transform.d_1 : 0.000054s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000058s : 0.45% optimize.cse_after_recomputation.cse : 0.000014s : 0.11% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.06% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000010s : 0.08% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000026s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000007s : 0.05% opt_after_jit_grad : 0.000757s : 5.89% validate : 0.000053s : 0.41% Time group info: ------[substitution.] 0.000280 34 15.27% : 0.000043s : 6: substitution.arithmetic_simplify 0.67% : 0.000002s : 2: substitution.elim_not_effective 0.69% : 0.000002s : 2: substitution.fold_const_symbol 2.62% : 0.000007s : 4: substitution.graph_param_transform 68.17% : 0.000191s : 4: substitution.inline 1.83% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.10% : 0.000006s : 4: substitution.remove_not_recompute_node 2.28% : 0.000006s : 4: substitution.replace_old_param 6.37% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006012 2 87.30% : 0.005249s : 1: type_inference.infer 12.70% : 0.000763s : 1: type_inference.specialize ------[replace.] 0.000071 8 62.90% : 0.000045s : 4: replace.inline 37.10% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000204 8 92.25% : 0.000188s : 4: match.inline 7.75% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000238 1278 0.96% : 0.000002s : 13: predicate.accumulaten_eliminater 0.91% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 8: predicate.addn_check_dump 1.15% : 0.000003s : 13: predicate.addn_zero_filter 0.81% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.85% : 0.000007s : 21: predicate.arithmetic_simplify 0.95% : 0.000002s : 13: predicate.cast_eliminate 0.58% : 0.000001s : 8: predicate.check_bprop_eliminate 0.53% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.67% : 0.000002s : 8: predicate.depend_value_elim 0.90% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.92% : 0.000002s : 13: predicate.dict_get_item_eliminator 1.00% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.18% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 4: predicate.elim_not_effective 0.41% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.48% : 0.000004s : 17: predicate.environ_add_const_eliminate 0.97% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 17: predicate.environ_get_depend_swap 1.44% : 0.000003s : 25: predicate.environ_get_eliminate 1.06% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.30% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.53% : 0.000006s : 21: predicate.float_depend_g_call 0.58% : 0.000001s : 8: predicate.float_environ_get_switch 0.84% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.71% : 0.000002s : 8: predicate.get_grad_eliminate 0.28% : 0.000001s : 4: predicate.graph_param_transform 0.53% : 0.000001s : 8: predicate.incorporate_call 0.40% : 0.000001s : 8: predicate.incorporate_call_switch 5.73% : 0.000014s : 58: predicate.inline 0.63% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.87% : 0.000002s : 8: predicate.less_batch_normalization 1.83% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.27% : 0.000005s : 38: predicate.load_eliminater 1.08% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.21% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.81% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 8: predicate.merge_addn 0.66% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.61% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.76% : 0.000002s : 13: predicate.minmaximum_grad 1.29% : 0.000003s : 4: predicate.mutable_eliminate 0.34% : 0.000001s : 4: predicate.opt_reshape 0.57% : 0.000001s : 4: predicate.parallel_virtual_node 1.58% : 0.000004s : 21: predicate.partial_defer_inline 1.41% : 0.000003s : 21: predicate.partial_eliminate 0.94% : 0.000002s : 13: predicate.print_const_string_wrapper 0.58% : 0.000001s : 8: predicate.reduce_all_const_elim 1.36% : 0.000003s : 13: predicate.reduce_eliminate 2.29% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 8: predicate.remove_not_recompute_node 1.27% : 0.000003s : 25: predicate.replace_applicator 0.40% : 0.000001s : 8: predicate.replace_old_param 0.40% : 0.000001s : 4: predicate.reset_defer_inline 0.96% : 0.000002s : 13: predicate.reshape_eliminate 0.80% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.79% : 0.000002s : 8: predicate.same_eliminate 0.38% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 8: predicate.shard_identity_eliminate 0.87% : 0.000002s : 8: predicate.special_op_eliminate 0.60% : 0.000001s : 8: predicate.specialize_transform 1.22% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.26% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.49% : 0.000004s : 21: predicate.switch_defer_inline 2.02% : 0.000005s : 29: predicate.switch_layer_defer_inline 5.92% : 0.000014s : 67: predicate.switch_simplify 0.85% : 0.000002s : 13: predicate.tile_eliminate 0.86% : 0.000002s : 13: predicate.transpose_eliminate 1.77% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.71% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.03% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.38% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.69% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.16% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.80% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 4: predicate.value_based_eliminate 0.61% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.69% : 0.000002s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.51% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000599 11 51.67% : 0.000309s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.33% : 0.000290s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.056899 192 0.01% : 0.000006s : 1: ForceFp32Comm 28.64% : 0.016298s : 1: add_attr 28.60% : 0.016276s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.11% : 0.000061s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.13% : 0.000075s : 1: auto_monad 0.05% : 0.000030s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000009s : 1: bias_add_comm_swap 0.83% : 0.000473s : 1: bootstrap 0.07% : 0.000042s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.04% : 0.000021s : 1: control_data_broadcast_order 0.02% : 0.000013s : 1: convert_after_rewriter 0.06% : 0.000035s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.05% : 0.000026s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.06% : 0.000033s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.02% : 0.000009s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000012s : 1: label_micro_interleaved_index 1.12% : 0.000636s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.41% : 0.000800s : 1: mutable_eliminate 0.02% : 0.000011s : 1: offloading_packed_experts 0.03% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000022s : 1: opt.transform.mutable_eliminate 2.41% : 0.001370s : 78: opt.transform.opt_a 0.06% : 0.000033s : 1: opt.transform.opt_after_cconv 0.06% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000112s : 28: opt.transform.opt_b 0.10% : 0.000059s : 2: opt.transform.opt_trans_graph 0.08% : 0.000043s : 4: opt.transform.symbol_engine_opt 6.87% : 0.003911s : 1: opt_a 0.26% : 0.000149s : 1: opt_after_cconv 1.36% : 0.000772s : 1: opt_after_jit_grad 0.54% : 0.000305s : 1: opt_b 12.69% : 0.007220s : 1: optimize 0.05% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000011s : 1: order_py_execute_after_rewriter 0.05% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.10% : 0.000055s : 1: pre_auto_parallel 0.07% : 0.000041s : 1: py_interpret_to_execute 0.04% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000042s : 1: remove_dup_value 0.90% : 0.000513s : 1: renormalize.infer 0.74% : 0.000421s : 1: renormalize.specialize 0.02% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000050s : 1: rewriter_after_opt_a 0.20% : 0.000113s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000009s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.02% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.19% : 0.000109s : 1: symbol_engine_optimizer 0.19% : 0.000107s : 1: tuple_transform 10.77% : 0.006127s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:38.610.331 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.130804, [21] [bootstrap]: 0.00046125 [type_inference]: 0.120807 [event_method]: 2.251e-05 [auto_monad]: 6.91e-05 [graph_reusing]: 5.87001e-06 [inline]: 3.08998e-06 [add_attr]: 0.00363724, [1] [add_attr_with_inline]: 0.00362464, [1] [Cycle 1]: 7.051e-05, [2] [tag_attr]: 2.305e-05 [meta_addattr_fg_expand]: 5.62999e-06 [parallel-infer-symbol]: 3.56001e-06 [pre_auto_parallel]: 3.816e-05 [insert-virtual-dataset]: 2.96999e-06 [parallel-infer-symbol-second]: 8.70001e-07 [dataset_repeat_opt]: 2.45002e-06 [pipeline_split]: 1.84e-06 [optimize]: 0.00502861, [53] [py_interpret_to_execute]: 2.794e-05 [rewriter_before_opt_a]: 8.715e-05 [opt_a]: 0.00283983, [2] [Cycle 1]: 0.00214861, [45] [expand_dump_flag]: 3.46001e-06 [switch_simplify]: 4.38e-05 [loop_unroll]: 3.036e-05 [a_1]: 0.00066056 [with_stream_mark]: 1.933e-05 [recompute_prepare]: 9.66e-06 [updatestate_depend_eliminate]: 4.43999e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 3.14999e-06 [parameter_eliminate]: 1.88002e-06 [a_2]: 8.838e-05 [accelerated_algorithm]: 7.53e-06 [shard]: 1.81998e-06 [meta_shard_fg_expand]: 1.85001e-06 [shard_inline]: 6.47001e-06 [merge_send_recv]: 8.55999e-06 [auto_parallel]: 7.46001e-06 [parallel]: 2.014e-05 [flash_sp]: 9.14998e-06 [merge_comm]: 3.90998e-06 [allreduce_fusion]: 3.65e-06 [matmul_add_comm_reduction]: 9.53002e-06 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 8.31002e-06 [virtual_dataset]: 7.1e-06 [get_grad_eliminate_]: 6.06e-06 [virtual_output]: 6.58e-06 [merge_forward]: 3.98001e-06 [cell_reuse_recompute_pass]: 1.38002e-06 [offload_activation]: 1.021e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.205e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.062e-05 [set_forward_comm_id_for_comm_node_pass]: 3.65e-06 [meta_fg_expand]: 2.64001e-06 [flash_sp_send_recv_attached]: 2.41e-06 [receive_attached]: 2.12999e-06 [after_resolve]: 1.154e-05 [a_after_grad]: 1.014e-05 [renormalize]: 0.00075215 [add_forward_monad_depend]: 5.09e-06 [auto_monad_grad]: 2.50002e-06 [auto_monad_eliminator]: 1.52e-05 [cse]: 3.017e-05 [a_3]: 4.741e-05 [Cycle 2]: 0.00068136, [45] [expand_dump_flag]: 1.14e-06 [switch_simplify]: 7.65998e-06 [loop_unroll]: 6.24999e-06 [a_1]: 0.0001412 [with_stream_mark]: 1.228e-05 [recompute_prepare]: 6.63e-06 [updatestate_depend_eliminate]: 3.09001e-06 [updatestate_assign_eliminate]: 2.74001e-06 [updatestate_loads_eliminate]: 2.51998e-06 [parameter_eliminate]: 1.20999e-06 [a_2]: 7.957e-05 [accelerated_algorithm]: 6.93e-06 [shard]: 1.25001e-06 [meta_shard_fg_expand]: 1.40999e-06 [shard_inline]: 6.46999e-06 [merge_send_recv]: 5.35001e-06 [auto_parallel]: 6.14001e-06 [parallel]: 4.80001e-06 [flash_sp]: 3.55e-06 [merge_comm]: 3.06001e-06 [allreduce_fusion]: 3.16001e-06 [matmul_add_comm_reduction]: 5.97001e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 6.51999e-06 [virtual_dataset]: 7.10998e-06 [get_grad_eliminate_]: 6.49001e-06 [virtual_output]: 6.24999e-06 [merge_forward]: 2.89999e-06 [cell_reuse_recompute_pass]: 1.59e-06 [offload_activation]: 7.21999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.229e-05 [merge_recompute_call_nodes]: 8.39995e-07 [before_grad]: 9.24998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.48e-06 [meta_fg_expand]: 2.08002e-06 [flash_sp_send_recv_attached]: 1.01002e-06 [receive_attached]: 1.04998e-06 [after_resolve]: 1.124e-05 [a_after_grad]: 9.76e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.31998e-06 [auto_monad_grad]: 8.40024e-07 [auto_monad_eliminator]: 7.13998e-06 [cse]: 1.35e-05 [a_3]: 3.567e-05 [py_interpret_to_execute_after_opt_a]: 1.043e-05 [slice_cell_reuse_recomputed_activation]: 2.36e-06 [rewriter_after_opt_a]: 3.837e-05 [convert_after_rewriter]: 6.73e-06 [order_py_execute_after_rewriter]: 5.34e-06 [mutable_eliminate]: 0.00059495 [opt_b]: 0.00021214, [1] [Cycle 1]: 0.00020508, [7] [b_1]: 0.0001285 [b_2]: 9.02e-06 [updatestate_depend_eliminate]: 6.58e-06 [updatestate_assign_eliminate]: 2.49999e-06 [updatestate_loads_eliminate]: 2.53e-06 [renormalize]: 6.39993e-07 [cse]: 1.822e-05 [optimize_parallel_all_gather_comm]: 1.67e-05 [overlap_param_gather]: 1.99e-06 [cconv]: 2.728e-05 [loop_unroll]: 0.0004582 [opt_after_cconv]: 0.00010819, [1] [Cycle 1]: 0.00010221, [7] [c_1]: 3.214e-05 [parameter_eliminate]: 3.78999e-06 [updatestate_depend_eliminate]: 6.16998e-06 [updatestate_assign_eliminate]: 2.49001e-06 [updatestate_loads_eliminate]: 2.51e-06 [cse]: 1.877e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 1.395e-05 [tuple_transform]: 8.041e-05, [1] [Cycle 1]: 7.574e-05, [4] [d_1]: 4.662e-05 [none_parameter_eliminate]: 1.60001e-06 [renormalize]: 2.60014e-07 [switch_simplify]: 8.1e-06 [partial_unused_args_eliminate]: 1.73002e-06 [add_recomputation]: 4.938e-05 [cse_after_recomputation]: 2.207e-05, [1] [Cycle 1]: 1.749e-05, [1] [cse]: 1.165e-05 [environ_conv]: 5.72001e-06 [swap_dp_allreduce_reducescatter]: 5.41998e-06 [bias_add_comm_swap]: 2.54999e-06 [label_micro_interleaved_index]: 4.43999e-06 [label_fine_grained_interleaved_index]: 2.59001e-06 [merge_cast_opt]: 1.34998e-06 [slice_recompute_activation]: 2.15002e-06 [micro_interleaved_order_control]: 2.71e-06 [assign_add_opt]: 1.44e-06 [ForceFp32Comm]: 9.00007e-07 [remove_cast_before_assign_add]: 1.04e-06 [full_micro_interleaved_order_control]: 2.19001e-06 [reorder_send_recv_between_fp_bp]: 2.48e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.12999e-06 [interleave_parallel_branches]: 1.17999e-06 [overlap_opt_shard_in_pipeline]: 1.57001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.26e-06 [control_data_broadcast_order]: 1.297e-05 [grouped_pairwise_exchange_alltoall]: 1.51998e-06 [offloading_packed_experts]: 3.95e-06 [overlap_recompute_and_grad_model_parallel]: 4.66002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.27999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40999e-06 [overlap_recompute_comm]: 2.29001e-06 [overlap_grad_ring_attention]: 5.14e-06 [overlap_grad_flash_sp]: 2.095e-05 [begin_end_overlap_inline]: 6.50005e-07 [split_matmul_comm_elemetwise]: 2.24999e-06 [split_layernorm_comm]: 1.70001e-06 [handle_group_info]: 1.09998e-06 [symbol_engine_optimizer]: 7.794e-05, [1] [Cycle 1]: 7.336e-05, [6] [build]: 3.04999e-06 [elim_shapecalc]: 1.045e-05 [elim_not_effective]: 1.297e-05 [opt_reshape]: 6.79001e-06 [fold_const_symbol]: 1.025e-05 [renormalize]: 2.30008e-07 [detach_backward]: 2.07999e-06 [pipeline_parallel_scheduler]: 1.54e-06 [auto_monad_reorder]: 1.663e-05 [get_jit_bprop_graph]: 1.98002e-06 [rewriter_after_jit_bprop_graph]: 4.45999e-06 [opt_after_jit_grad]: 0.00048565 [validate]: 4.275e-05 Sums bootstrap : 0.000461s : 0.37% type_inference : 0.120807s : 95.76% event_method : 0.000023s : 0.02% auto_monad : 0.000069s : 0.05% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000038s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000028s : 0.02% optimize.rewriter_before_opt_a : 0.000087s : 0.07% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000051s : 0.04% optimize.opt_a.loop_unroll : 0.000037s : 0.03% optimize.opt_a.a_1 : 0.000802s : 0.64% optimize.opt_a.with_stream_mark : 0.000032s : 0.03% optimize.opt_a.recompute_prepare : 0.000016s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000168s : 0.13% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.01% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000014s : 0.01% optimize.opt_a.auto_parallel : 0.000014s : 0.01% optimize.opt_a.parallel : 0.000025s : 0.02% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.01% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000017s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000003s : 0.00% optimize.opt_a.after_resolve : 0.000023s : 0.02% optimize.opt_a.a_after_grad : 0.000020s : 0.02% optimize.opt_a.renormalize : 0.000752s : 0.60% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.02% optimize.opt_a.cse : 0.000044s : 0.03% optimize.opt_a.a_3 : 0.000083s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000038s : 0.03% optimize.convert_after_rewriter : 0.000007s : 0.01% optimize.order_py_execute_after_rewriter : 0.000005s : 0.00% optimize.mutable_eliminate : 0.000595s : 0.47% optimize.opt_b.b_1 : 0.000128s : 0.10% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000018s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000027s : 0.02% optimize.loop_unroll : 0.000458s : 0.36% optimize.opt_after_cconv.c_1 : 0.000032s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.01% optimize.tuple_transform.d_1 : 0.000047s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000049s : 0.04% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000006s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000021s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000017s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000486s : 0.38% validate : 0.000043s : 0.03% Time group info: ------[substitution.] 0.000228 34 14.48% : 0.000033s : 6: substitution.arithmetic_simplify 0.79% : 0.000002s : 2: substitution.elim_not_effective 0.57% : 0.000001s : 2: substitution.fold_const_symbol 2.85% : 0.000007s : 4: substitution.graph_param_transform 68.69% : 0.000157s : 4: substitution.inline 1.50% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.08% : 0.000005s : 4: substitution.remove_not_recompute_node 2.06% : 0.000005s : 4: substitution.replace_old_param 6.99% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.120724 2 99.21% : 0.119772s : 1: type_inference.infer 0.79% : 0.000952s : 1: type_inference.specialize ------[replace.] 0.000062 8 63.31% : 0.000039s : 4: replace.inline 36.69% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000168 8 91.58% : 0.000154s : 4: match.inline 8.42% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000204 1278 0.92% : 0.000002s : 13: predicate.accumulaten_eliminater 0.77% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 8: predicate.addn_check_dump 0.93% : 0.000002s : 13: predicate.addn_zero_filter 0.83% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.55% : 0.000005s : 21: predicate.arithmetic_simplify 0.86% : 0.000002s : 13: predicate.cast_eliminate 0.56% : 0.000001s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.60% : 0.000001s : 8: predicate.depend_value_elim 0.90% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.08% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.82% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.13% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 4: predicate.elim_not_effective 0.46% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_depend_swap 1.64% : 0.000003s : 25: predicate.environ_get_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.48% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.49% : 0.000005s : 21: predicate.float_depend_g_call 0.55% : 0.000001s : 8: predicate.float_environ_get_switch 0.80% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.62% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.59% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 6.65% : 0.000014s : 58: predicate.inline 0.80% : 0.000002s : 8: predicate.inline_without_move 0.35% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.90% : 0.000002s : 8: predicate.less_batch_normalization 1.79% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.49% : 0.000005s : 38: predicate.load_eliminater 1.09% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.32% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.73% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.57% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.64% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 13: predicate.minmaximum_grad 1.30% : 0.000003s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.37% : 0.000001s : 4: predicate.parallel_virtual_node 1.75% : 0.000004s : 21: predicate.partial_defer_inline 1.64% : 0.000003s : 21: predicate.partial_eliminate 0.85% : 0.000002s : 13: predicate.print_const_string_wrapper 0.58% : 0.000001s : 8: predicate.reduce_all_const_elim 1.20% : 0.000002s : 13: predicate.reduce_eliminate 2.48% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 8: predicate.remove_not_recompute_node 1.34% : 0.000003s : 25: predicate.replace_applicator 0.50% : 0.000001s : 8: predicate.replace_old_param 0.27% : 0.000001s : 4: predicate.reset_defer_inline 0.93% : 0.000002s : 13: predicate.reshape_eliminate 0.69% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 4: predicate.row_tensor_eliminate 0.71% : 0.000001s : 8: predicate.same_eliminate 0.45% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.69% : 0.000001s : 8: predicate.shard_identity_eliminate 0.74% : 0.000002s : 8: predicate.special_op_eliminate 0.68% : 0.000001s : 8: predicate.specialize_transform 0.86% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.56% : 0.000003s : 21: predicate.switch_defer_inline 2.00% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.30% : 0.000011s : 67: predicate.switch_simplify 0.87% : 0.000002s : 13: predicate.tile_eliminate 0.90% : 0.000002s : 13: predicate.transpose_eliminate 1.55% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.47% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.03% : 0.000006s : 33: predicate.tuple_list_get_item_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.25% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.72% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.39% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.17% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 0.83% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.68% : 0.000001s : 8: predicate.virtual_output_eliminate 0.36% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000662 11 47.28% : 0.000313s : 5: func_graph_cloner_run.FuncGraphClonerGraph 52.72% : 0.000349s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.141605 192 0.00% : 0.000004s : 1: ForceFp32Comm 2.57% : 0.003643s : 1: add_attr 2.56% : 0.003629s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.04% : 0.000053s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.05% : 0.000075s : 1: auto_monad 0.01% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.35% : 0.000490s : 1: bootstrap 0.02% : 0.000031s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000016s : 1: control_data_broadcast_order 0.01% : 0.000010s : 1: convert_after_rewriter 0.02% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.02% : 0.000030s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.33% : 0.000467s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.43% : 0.000605s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000017s : 1: opt.transform.mutable_eliminate 0.87% : 0.001236s : 78: opt.transform.opt_a 0.02% : 0.000031s : 1: opt.transform.opt_after_cconv 0.02% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000106s : 28: opt.transform.opt_b 0.04% : 0.000052s : 2: opt.transform.opt_trans_graph 0.03% : 0.000037s : 4: opt.transform.symbol_engine_opt 2.01% : 0.002843s : 1: opt_a 0.08% : 0.000112s : 1: opt_after_cconv 0.35% : 0.000495s : 1: opt_after_jit_grad 0.15% : 0.000215s : 1: opt_b 3.55% : 0.005033s : 1: optimize 0.01% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.02% : 0.000024s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000042s : 1: pre_auto_parallel 0.02% : 0.000032s : 1: py_interpret_to_execute 0.01% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000017s : 1: remove_dup_value 0.29% : 0.000410s : 1: renormalize.infer 0.24% : 0.000334s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000043s : 1: rewriter_after_opt_a 0.06% : 0.000092s : 1: rewriter_before_opt_a 0.01% : 0.000021s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.06% : 0.000081s : 1: symbol_engine_optimizer 0.06% : 0.000083s : 1: tuple_transform 85.33% : 0.120835s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:39.859.397 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:39.859.649 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0381163, [21] [bootstrap]: 0.00041122 [type_inference]: 0.00576546 [event_method]: 2.007e-05 [auto_monad]: 6.338e-05 [graph_reusing]: 6.21e-06 [inline]: 2.33998e-06 [add_attr]: 0.00345419, [1] [add_attr_with_inline]: 0.00344274, [1] [Cycle 1]: 9.441e-05, [2] [tag_attr]: 2.319e-05 [meta_addattr_fg_expand]: 5.94e-06 [parallel-infer-symbol]: 3.54002e-06 [pre_auto_parallel]: 3.927e-05 [insert-virtual-dataset]: 2.46e-06 [parallel-infer-symbol-second]: 9.20001e-07 [dataset_repeat_opt]: 2.64001e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.0267443, [53] [py_interpret_to_execute]: 3.42e-05 [rewriter_before_opt_a]: 9.651e-05 [opt_a]: 0.0239071, [2] [Cycle 1]: 0.0229418, [45] [expand_dump_flag]: 2.89999e-06 [switch_simplify]: 4.483e-05 [loop_unroll]: 3.077e-05 [a_1]: 0.0209126 [with_stream_mark]: 4.308e-05 [recompute_prepare]: 1.738e-05 [updatestate_depend_eliminate]: 5.07e-06 [updatestate_assign_eliminate]: 3.5e-06 [updatestate_loads_eliminate]: 3.02002e-06 [parameter_eliminate]: 2.61e-06 [a_2]: 0.00012633 [accelerated_algorithm]: 8.47998e-06 [shard]: 3.91001e-06 [meta_shard_fg_expand]: 3.99002e-06 [shard_inline]: 7.23e-06 [merge_send_recv]: 1.159e-05 [auto_parallel]: 1.171e-05 [parallel]: 2.215e-05 [flash_sp]: 1.316e-05 [merge_comm]: 4.08001e-06 [allreduce_fusion]: 3.73999e-06 [matmul_add_comm_reduction]: 1.15e-05 [allreduce_slice_to_reducescatter]: 1.19e-06 [virtual_shard_identity]: 8.82e-06 [virtual_dataset]: 7.66999e-06 [get_grad_eliminate_]: 6.46999e-06 [virtual_output]: 7.10002e-06 [merge_forward]: 4.36002e-06 [cell_reuse_recompute_pass]: 3.86999e-06 [offload_activation]: 1.142e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.747e-05 [merge_recompute_call_nodes]: 1.44998e-06 [before_grad]: 1.183e-05 [set_forward_comm_id_for_comm_node_pass]: 5.56e-06 [meta_fg_expand]: 4.03001e-06 [flash_sp_send_recv_attached]: 2.88e-06 [receive_attached]: 2.21e-06 [after_resolve]: 1.491e-05 [a_after_grad]: 1.191e-05 [renormalize]: 0.00090054 [add_forward_monad_depend]: 8.48999e-06 [auto_monad_grad]: 2.79999e-06 [auto_monad_eliminator]: 2.144e-05 [cse]: 3.08e-05 [a_3]: 7.341e-05 [Cycle 2]: 0.00094588, [45] [expand_dump_flag]: 2.92002e-06 [switch_simplify]: 8.77999e-06 [loop_unroll]: 6.69001e-06 [a_1]: 0.00015495 [with_stream_mark]: 1.854e-05 [recompute_prepare]: 7.68999e-06 [updatestate_depend_eliminate]: 3.48e-06 [updatestate_assign_eliminate]: 2.93998e-06 [updatestate_loads_eliminate]: 2.69999e-06 [parameter_eliminate]: 1.96e-06 [a_2]: 0.00010833 [accelerated_algorithm]: 6.85998e-06 [shard]: 2.01e-06 [meta_shard_fg_expand]: 2.09999e-06 [shard_inline]: 6.34001e-06 [merge_send_recv]: 8.62e-06 [auto_parallel]: 9.88002e-06 [parallel]: 8.82e-06 [flash_sp]: 4.90001e-06 [merge_comm]: 4.40999e-06 [allreduce_fusion]: 3.89002e-06 [matmul_add_comm_reduction]: 9.59e-06 [allreduce_slice_to_reducescatter]: 5.09986e-07 [virtual_shard_identity]: 7.92003e-06 [virtual_dataset]: 7.18998e-06 [get_grad_eliminate_]: 6.46999e-06 [virtual_output]: 6.77002e-06 [merge_forward]: 4.92e-06 [cell_reuse_recompute_pass]: 2.83998e-06 [offload_activation]: 1.015e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.572e-05 [merge_recompute_call_nodes]: 1.72001e-06 [before_grad]: 1.137e-05 [set_forward_comm_id_for_comm_node_pass]: 3.88999e-06 [meta_fg_expand]: 2.69999e-06 [flash_sp_send_recv_attached]: 1.57001e-06 [receive_attached]: 1.87001e-06 [after_resolve]: 1.324e-05 [a_after_grad]: 1.055e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 2.86999e-06 [auto_monad_grad]: 2.02001e-06 [auto_monad_eliminator]: 1.058e-05 [cse]: 1.729e-05 [a_3]: 5.14e-05 [py_interpret_to_execute_after_opt_a]: 1.813e-05 [slice_cell_reuse_recomputed_activation]: 5.09998e-06 [rewriter_after_opt_a]: 4.633e-05 [convert_after_rewriter]: 1.033e-05 [order_py_execute_after_rewriter]: 8.09997e-06 [mutable_eliminate]: 0.00081073 [opt_b]: 0.00030174, [1] [Cycle 1]: 0.00028965, [7] [b_1]: 0.00017399 [b_2]: 8.46002e-06 [updatestate_depend_eliminate]: 9.41e-06 [updatestate_assign_eliminate]: 3.7e-06 [updatestate_loads_eliminate]: 2.91e-06 [renormalize]: 5.69999e-07 [cse]: 2.849e-05 [optimize_parallel_all_gather_comm]: 2.205e-05 [overlap_param_gather]: 4.74002e-06 [cconv]: 4.064e-05 [loop_unroll]: 0.00055049 [opt_after_cconv]: 0.0001417, [1] [Cycle 1]: 0.00013195, [7] [c_1]: 3.295e-05 [parameter_eliminate]: 5.81e-06 [updatestate_depend_eliminate]: 6.09999e-06 [updatestate_assign_eliminate]: 2.54999e-06 [updatestate_loads_eliminate]: 2.69999e-06 [cse]: 2.336e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 1.856e-05 [tuple_transform]: 9.989e-05, [1] [Cycle 1]: 9.195e-05, [4] [d_1]: 5.029e-05 [none_parameter_eliminate]: 2.22001e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.17998e-06 [partial_unused_args_eliminate]: 4.33999e-06 [add_recomputation]: 5.694e-05 [cse_after_recomputation]: 2.941e-05, [1] [Cycle 1]: 2.218e-05, [1] [cse]: 1.298e-05 [environ_conv]: 8.45999e-06 [swap_dp_allreduce_reducescatter]: 8.70001e-06 [bias_add_comm_swap]: 5.84e-06 [label_micro_interleaved_index]: 8.35001e-06 [label_fine_grained_interleaved_index]: 5.99e-06 [merge_cast_opt]: 3.98999e-06 [slice_recompute_activation]: 4.73001e-06 [micro_interleaved_order_control]: 4.85999e-06 [assign_add_opt]: 3.48999e-06 [ForceFp32Comm]: 3.3e-06 [remove_cast_before_assign_add]: 3.4e-06 [full_micro_interleaved_order_control]: 4.70999e-06 [reorder_send_recv_between_fp_bp]: 5.99999e-06 [comm_op_add_attrs]: 3.56001e-06 [add_comm_op_reuse_tag]: 3.21999e-06 [interleave_split_concat_branches]: 3.78001e-06 [interleave_parallel_branches]: 3.62002e-06 [overlap_opt_shard_in_pipeline]: 3.53e-06 [overlap_opt_shard_grad_in_pipeline]: 4.95999e-06 [control_data_broadcast_order]: 1.903e-05 [grouped_pairwise_exchange_alltoall]: 4.25e-06 [offloading_packed_experts]: 6.56e-06 [overlap_recompute_and_grad_model_parallel]: 7.69002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.58e-06 [overlap_recompute_allgather_and_fa_grad]: 4.18001e-06 [overlap_recompute_comm]: 4.84998e-06 [overlap_grad_ring_attention]: 7.61001e-06 [overlap_grad_flash_sp]: 2.509e-05 [begin_end_overlap_inline]: 3.02002e-06 [split_matmul_comm_elemetwise]: 4.85001e-06 [split_layernorm_comm]: 4.23999e-06 [handle_group_info]: 3.68e-06 [symbol_engine_optimizer]: 0.00011299, [1] [Cycle 1]: 0.00010478, [6] [build]: 4.61002e-06 [elim_shapecalc]: 1.271e-05 [elim_not_effective]: 1.392e-05 [opt_reshape]: 7.61001e-06 [fold_const_symbol]: 1.122e-05 [renormalize]: 3.89991e-07 [detach_backward]: 5.92999e-06 [pipeline_parallel_scheduler]: 1.95001e-06 [auto_monad_reorder]: 2.695e-05 [get_jit_bprop_graph]: 1.72999e-06 [rewriter_after_jit_bprop_graph]: 6.14001e-06 [opt_after_jit_grad]: 0.00064451 [validate]: 4.744e-05 Sums bootstrap : 0.000411s : 1.27% type_inference : 0.005765s : 17.76% event_method : 0.000020s : 0.06% auto_monad : 0.000063s : 0.20% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000039s : 0.12% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000034s : 0.11% optimize.rewriter_before_opt_a : 0.000097s : 0.30% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000054s : 0.17% optimize.opt_a.loop_unroll : 0.000037s : 0.12% optimize.opt_a.a_1 : 0.021068s : 64.90% optimize.opt_a.with_stream_mark : 0.000062s : 0.19% optimize.opt_a.recompute_prepare : 0.000025s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000235s : 0.72% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.05% optimize.opt_a.shard : 0.000006s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.02% optimize.opt_a.shard_inline : 0.000014s : 0.04% optimize.opt_a.merge_send_recv : 0.000020s : 0.06% optimize.opt_a.auto_parallel : 0.000022s : 0.07% optimize.opt_a.parallel : 0.000031s : 0.10% optimize.opt_a.flash_sp : 0.000018s : 0.06% optimize.opt_a.merge_comm : 0.000008s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.05% optimize.opt_a.virtual_dataset : 0.000015s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.04% optimize.opt_a.virtual_output : 0.000014s : 0.04% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000007s : 0.02% optimize.opt_a.offload_activation : 0.000022s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000023s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000028s : 0.09% optimize.opt_a.a_after_grad : 0.000022s : 0.07% optimize.opt_a.renormalize : 0.000901s : 2.77% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.03% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.10% optimize.opt_a.cse : 0.000048s : 0.15% optimize.opt_a.a_3 : 0.000125s : 0.38% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000046s : 0.14% optimize.convert_after_rewriter : 0.000010s : 0.03% optimize.order_py_execute_after_rewriter : 0.000008s : 0.02% optimize.mutable_eliminate : 0.000811s : 2.50% optimize.opt_b.b_1 : 0.000174s : 0.54% optimize.opt_b.b_2 : 0.000008s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000028s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.07% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000041s : 0.13% optimize.loop_unroll : 0.000550s : 1.70% optimize.opt_after_cconv.c_1 : 0.000033s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000023s : 0.07% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.06% optimize.tuple_transform.d_1 : 0.000050s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000004s : 0.01% optimize.add_recomputation : 0.000057s : 0.18% optimize.cse_after_recomputation.cse : 0.000013s : 0.04% optimize.environ_conv : 0.000008s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000008s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000003s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.02% optimize.control_data_broadcast_order : 0.000019s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000008s : 0.02% optimize.overlap_grad_flash_sp : 0.000025s : 0.08% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000027s : 0.08% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000645s : 1.99% validate : 0.000047s : 0.15% Time group info: ------[substitution.] 0.000269 34 21.89% : 0.000059s : 6: substitution.arithmetic_simplify 0.82% : 0.000002s : 2: substitution.elim_not_effective 0.60% : 0.000002s : 2: substitution.fold_const_symbol 2.64% : 0.000007s : 4: substitution.graph_param_transform 60.56% : 0.000163s : 4: substitution.inline 2.01% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.70% : 0.000007s : 4: substitution.remove_not_recompute_node 2.78% : 0.000007s : 4: substitution.replace_old_param 5.99% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005707 2 87.16% : 0.004974s : 1: type_inference.infer 12.84% : 0.000733s : 1: type_inference.specialize ------[replace.] 0.000070 8 65.03% : 0.000046s : 4: replace.inline 34.97% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000174 8 91.88% : 0.000160s : 4: match.inline 8.12% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.020374 1278 0.01% : 0.000002s : 13: predicate.accumulaten_eliminater 0.01% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.01% : 0.000001s : 8: predicate.addn_check_dump 0.01% : 0.000002s : 13: predicate.addn_zero_filter 0.01% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 0.04% : 0.000008s : 21: predicate.arithmetic_simplify 0.02% : 0.000003s : 13: predicate.cast_eliminate 0.01% : 0.000001s : 8: predicate.check_bprop_eliminate 0.01% : 0.000002s : 8: predicate.compare_switch_simplify 0.00% : 0.000000s : 4: predicate.const_output_eliminate 0.01% : 0.000002s : 8: predicate.depend_value_elim 0.01% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.01% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.01% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.01% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.00% : 0.000001s : 4: predicate.elim_not_effective 0.00% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 0.01% : 0.000003s : 17: predicate.environ_add_const_eliminate 0.01% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.01% : 0.000002s : 17: predicate.environ_get_depend_swap 0.02% : 0.000004s : 25: predicate.environ_get_eliminate 0.01% : 0.000002s : 17: predicate.environ_get_set_eliminate 0.02% : 0.000003s : 21: predicate.exchange_switch_depend_value 0.03% : 0.000006s : 21: predicate.float_depend_g_call 0.01% : 0.000001s : 8: predicate.float_environ_get_switch 0.01% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.00% : 0.000000s : 4: predicate.fold_const_symbol 0.01% : 0.000002s : 8: predicate.get_grad_eliminate 0.00% : 0.000001s : 4: predicate.graph_param_transform 0.01% : 0.000001s : 8: predicate.incorporate_call 0.00% : 0.000001s : 8: predicate.incorporate_call_switch 0.09% : 0.000018s : 58: predicate.inline 0.01% : 0.000002s : 8: predicate.inline_without_move 0.00% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.01% : 0.000002s : 8: predicate.less_batch_normalization 98.81% : 0.020132s : 25: predicate.list_to_tuple_eliminator_ 0.03% : 0.000006s : 38: predicate.load_eliminater 0.01% : 0.000002s : 4: predicate.loop_unroll_after_grad 0.03% : 0.000005s : 34: predicate.loop_unroll_before_grad 0.02% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.01% : 0.000001s : 8: predicate.merge_addn 0.01% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.01% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.01% : 0.000002s : 13: predicate.minmaximum_grad 0.02% : 0.000003s : 4: predicate.mutable_eliminate 0.00% : 0.000001s : 4: predicate.opt_reshape 0.00% : 0.000001s : 4: predicate.parallel_virtual_node 0.02% : 0.000004s : 21: predicate.partial_defer_inline 0.02% : 0.000003s : 21: predicate.partial_eliminate 0.01% : 0.000002s : 13: predicate.print_const_string_wrapper 0.01% : 0.000002s : 8: predicate.reduce_all_const_elim 0.01% : 0.000003s : 13: predicate.reduce_eliminate 0.03% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.01% : 0.000001s : 8: predicate.remove_not_recompute_node 0.02% : 0.000003s : 25: predicate.replace_applicator 0.01% : 0.000001s : 8: predicate.replace_old_param 0.00% : 0.000001s : 4: predicate.reset_defer_inline 0.01% : 0.000003s : 13: predicate.reshape_eliminate 0.01% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.00% : 0.000001s : 4: predicate.row_tensor_eliminate 0.01% : 0.000002s : 8: predicate.same_eliminate 0.01% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.01% : 0.000002s : 8: predicate.shard_identity_eliminate 0.01% : 0.000002s : 8: predicate.special_op_eliminate 0.01% : 0.000002s : 8: predicate.specialize_transform 0.01% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.01% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.00% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.02% : 0.000003s : 21: predicate.switch_defer_inline 0.02% : 0.000005s : 29: predicate.switch_layer_defer_inline 0.05% : 0.000011s : 67: predicate.switch_simplify 0.01% : 0.000002s : 13: predicate.tile_eliminate 0.01% : 0.000002s : 13: predicate.transpose_eliminate 0.02% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 0.02% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 0.02% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 0.05% : 0.000010s : 33: predicate.tuple_list_get_item_eliminator 0.02% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 0.03% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 0.02% : 0.000005s : 25: predicate.tuple_to_list_eliminator_ 0.03% : 0.000006s : 38: predicate.updatestate_pure_node_eliminater 0.04% : 0.000008s : 46: predicate.updatestate_useless_node_eliminater 0.00% : 0.000001s : 4: predicate.value_based_eliminate 0.01% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.01% : 0.000001s : 8: predicate.virtual_output_eliminate 0.00% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.00% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000579 11 51.83% : 0.000300s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.17% : 0.000279s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.090910 192 0.01% : 0.000006s : 1: ForceFp32Comm 3.81% : 0.003466s : 1: add_attr 3.79% : 0.003447s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.07% : 0.000061s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.08% : 0.000072s : 1: auto_monad 0.04% : 0.000035s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.50% : 0.000455s : 1: bootstrap 0.05% : 0.000044s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.02% : 0.000022s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.04% : 0.000033s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000033s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.03% : 0.000031s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000010s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 0.61% : 0.000559s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 0.90% : 0.000821s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000023s : 1: opt.transform.mutable_eliminate 23.70% : 0.021546s : 78: opt.transform.opt_a 0.03% : 0.000032s : 1: opt.transform.opt_after_cconv 0.03% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.12% : 0.000106s : 28: opt.transform.opt_b 0.06% : 0.000056s : 2: opt.transform.opt_trans_graph 0.05% : 0.000041s : 4: opt.transform.symbol_engine_opt 26.30% : 0.023911s : 1: opt_a 0.16% : 0.000145s : 1: opt_after_cconv 0.72% : 0.000657s : 1: opt_after_jit_grad 0.34% : 0.000306s : 1: opt_b 30.06% : 0.027323s : 1: optimize 0.03% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000011s : 1: order_py_execute_after_rewriter 0.03% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000020s : 1: pipeline_split 0.05% : 0.000047s : 1: pre_auto_parallel 0.04% : 0.000038s : 1: py_interpret_to_execute 0.02% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.02% : 0.000022s : 1: remove_dup_value 0.52% : 0.000477s : 1: renormalize.infer 0.45% : 0.000410s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000050s : 1: rewriter_after_opt_a 0.11% : 0.000100s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.000116s : 1: symbol_engine_optimizer 0.11% : 0.000103s : 1: tuple_transform 6.39% : 0.005808s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:41.703.9 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.08692, [21] [bootstrap]: 0.00041978 [type_inference]: 0.00613884 [event_method]: 1.945e-05 [auto_monad]: 6.184e-05 [graph_reusing]: 5.69999e-06 [inline]: 2.36998e-06 [add_attr]: 0.0743623, [1] [add_attr_with_inline]: 0.0743498, [1] [Cycle 1]: 7.085e-05, [2] [tag_attr]: 2.264e-05 [meta_addattr_fg_expand]: 5.64e-06 [parallel-infer-symbol]: 3.93999e-06 [pre_auto_parallel]: 4.122e-05 [insert-virtual-dataset]: 2.40002e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 1.81e-06 [pipeline_split]: 1.61998e-06 [optimize]: 0.00515081, [53] [py_interpret_to_execute]: 2.849e-05 [rewriter_before_opt_a]: 8.933e-05 [opt_a]: 0.00287998, [2] [Cycle 1]: 0.00221003, [45] [expand_dump_flag]: 3.2e-06 [switch_simplify]: 4.267e-05 [loop_unroll]: 3.101e-05 [a_1]: 0.00068733 [with_stream_mark]: 2.051e-05 [recompute_prepare]: 9.27001e-06 [updatestate_depend_eliminate]: 3.83001e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 2.76999e-06 [parameter_eliminate]: 1.93002e-06 [a_2]: 9.051e-05 [accelerated_algorithm]: 7.53999e-06 [shard]: 2.12001e-06 [meta_shard_fg_expand]: 2.41e-06 [shard_inline]: 6.46999e-06 [merge_send_recv]: 8.74e-06 [auto_parallel]: 6.63998e-06 [parallel]: 2.122e-05 [flash_sp]: 1.034e-05 [merge_comm]: 4.02e-06 [allreduce_fusion]: 3.7e-06 [matmul_add_comm_reduction]: 1.037e-05 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 9.64e-06 [virtual_dataset]: 8.10999e-06 [get_grad_eliminate_]: 6.67002e-06 [virtual_output]: 6.91001e-06 [merge_forward]: 4.15999e-06 [cell_reuse_recompute_pass]: 1.31002e-06 [offload_activation]: 1.048e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.416e-05 [merge_recompute_call_nodes]: 1.40999e-06 [before_grad]: 1.204e-05 [set_forward_comm_id_for_comm_node_pass]: 4.33999e-06 [meta_fg_expand]: 3.03998e-06 [flash_sp_send_recv_attached]: 2.48002e-06 [receive_attached]: 2.37999e-06 [after_resolve]: 1.263e-05 [a_after_grad]: 9.96998e-06 [renormalize]: 0.00075919 [add_forward_monad_depend]: 4.90999e-06 [auto_monad_grad]: 2.46e-06 [auto_monad_eliminator]: 1.492e-05 [cse]: 2.99e-05 [a_3]: 4.859e-05 [Cycle 2]: 0.00065995, [45] [expand_dump_flag]: 1.68002e-06 [switch_simplify]: 7.41001e-06 [loop_unroll]: 6.26998e-06 [a_1]: 0.00014163 [with_stream_mark]: 1.177e-05 [recompute_prepare]: 6.74001e-06 [updatestate_depend_eliminate]: 2.84999e-06 [updatestate_assign_eliminate]: 2.41998e-06 [updatestate_loads_eliminate]: 2.59999e-06 [parameter_eliminate]: 1.19e-06 [a_2]: 7.703e-05 [accelerated_algorithm]: 6.30002e-06 [shard]: 1.17999e-06 [meta_shard_fg_expand]: 1.21997e-06 [shard_inline]: 8.97e-06 [merge_send_recv]: 4.63999e-06 [auto_parallel]: 5.19998e-06 [parallel]: 4.57e-06 [flash_sp]: 3.61001e-06 [merge_comm]: 3.28e-06 [allreduce_fusion]: 3.20998e-06 [matmul_add_comm_reduction]: 6.11e-06 [allreduce_slice_to_reducescatter]: 3.00002e-07 [virtual_shard_identity]: 6.70002e-06 [virtual_dataset]: 5.92999e-06 [get_grad_eliminate_]: 5.57001e-06 [virtual_output]: 5.56002e-06 [merge_forward]: 2.66e-06 [cell_reuse_recompute_pass]: 1.68002e-06 [offload_activation]: 6.81001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.16e-05 [merge_recompute_call_nodes]: 8.89995e-07 [before_grad]: 9.65002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.52002e-06 [meta_fg_expand]: 1.96e-06 [flash_sp_send_recv_attached]: 9.30013e-07 [receive_attached]: 1.17e-06 [after_resolve]: 1.019e-05 [a_after_grad]: 9.12001e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.29e-06 [auto_monad_grad]: 1.08001e-06 [auto_monad_eliminator]: 7.2e-06 [cse]: 1.26e-05 [a_3]: 3.584e-05 [py_interpret_to_execute_after_opt_a]: 9.53997e-06 [slice_cell_reuse_recomputed_activation]: 1.91e-06 [rewriter_after_opt_a]: 3.389e-05 [convert_after_rewriter]: 6.64001e-06 [order_py_execute_after_rewriter]: 4.99e-06 [mutable_eliminate]: 0.00056444 [opt_b]: 0.00019875, [1] [Cycle 1]: 0.00019321, [7] [b_1]: 0.00012281 [b_2]: 7.9e-06 [updatestate_depend_eliminate]: 5.97001e-06 [updatestate_assign_eliminate]: 2.39999e-06 [updatestate_loads_eliminate]: 2.44999e-06 [renormalize]: 3.69997e-07 [cse]: 1.625e-05 [optimize_parallel_all_gather_comm]: 1.695e-05 [overlap_param_gather]: 1.89e-06 [cconv]: 2.49e-05 [loop_unroll]: 0.00041883 [opt_after_cconv]: 0.00030039, [1] [Cycle 1]: 0.00029478, [7] [c_1]: 3.181e-05 [parameter_eliminate]: 3.21999e-06 [updatestate_depend_eliminate]: 6.56e-06 [updatestate_assign_eliminate]: 2.82002e-06 [updatestate_loads_eliminate]: 2.29999e-06 [cse]: 1.708e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 1.318e-05 [tuple_transform]: 8.378e-05, [1] [Cycle 1]: 7.829e-05, [4] [d_1]: 4.885e-05 [none_parameter_eliminate]: 1.72001e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 7.52002e-06 [partial_unused_args_eliminate]: 1.79e-06 [add_recomputation]: 4.675e-05 [cse_after_recomputation]: 2.198e-05, [1] [Cycle 1]: 1.705e-05, [1] [cse]: 1.14e-05 [environ_conv]: 5.99e-06 [swap_dp_allreduce_reducescatter]: 5.29e-06 [bias_add_comm_swap]: 2.85002e-06 [label_micro_interleaved_index]: 4.28001e-06 [label_fine_grained_interleaved_index]: 2.76e-06 [merge_cast_opt]: 1.22e-06 [slice_recompute_activation]: 2.06e-06 [micro_interleaved_order_control]: 2.27001e-06 [assign_add_opt]: 1.16002e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 1.13001e-06 [full_micro_interleaved_order_control]: 2.16e-06 [reorder_send_recv_between_fp_bp]: 2.84999e-06 [comm_op_add_attrs]: 1.15999e-06 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.30001e-06 [interleave_parallel_branches]: 1.06002e-06 [overlap_opt_shard_in_pipeline]: 1.37999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.96e-06 [control_data_broadcast_order]: 1.255e-05 [grouped_pairwise_exchange_alltoall]: 1.60001e-06 [offloading_packed_experts]: 3.86001e-06 [overlap_recompute_and_grad_model_parallel]: 4.75001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.47001e-06 [overlap_recompute_comm]: 2.68998e-06 [overlap_grad_ring_attention]: 4.43001e-06 [overlap_grad_flash_sp]: 1.995e-05 [begin_end_overlap_inline]: 6.79982e-07 [split_matmul_comm_elemetwise]: 2.60002e-06 [split_layernorm_comm]: 1.66e-06 [handle_group_info]: 9.80013e-07 [symbol_engine_optimizer]: 8.081e-05, [1] [Cycle 1]: 7.661e-05, [6] [build]: 3.38e-06 [elim_shapecalc]: 1.087e-05 [elim_not_effective]: 1.443e-05 [opt_reshape]: 6.85002e-06 [fold_const_symbol]: 1.002e-05 [renormalize]: 2.50002e-07 [detach_backward]: 2.02001e-06 [pipeline_parallel_scheduler]: 1.52999e-06 [auto_monad_reorder]: 1.62e-05 [get_jit_bprop_graph]: 2.27999e-06 [rewriter_after_jit_bprop_graph]: 3.82998e-06 [opt_after_jit_grad]: 0.00048457 [validate]: 4.039e-05 Sums bootstrap : 0.000420s : 3.69% type_inference : 0.006139s : 53.93% event_method : 0.000019s : 0.17% auto_monad : 0.000062s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000041s : 0.36% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.25% optimize.rewriter_before_opt_a : 0.000089s : 0.78% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000050s : 0.44% optimize.opt_a.loop_unroll : 0.000037s : 0.33% optimize.opt_a.a_1 : 0.000829s : 7.28% optimize.opt_a.with_stream_mark : 0.000032s : 0.28% optimize.opt_a.recompute_prepare : 0.000016s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000168s : 1.47% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.12% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.14% optimize.opt_a.merge_send_recv : 0.000013s : 0.12% optimize.opt_a.auto_parallel : 0.000012s : 0.10% optimize.opt_a.parallel : 0.000026s : 0.23% optimize.opt_a.flash_sp : 0.000014s : 0.12% optimize.opt_a.merge_comm : 0.000007s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.14% optimize.opt_a.virtual_dataset : 0.000014s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.23% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000023s : 0.20% optimize.opt_a.a_after_grad : 0.000019s : 0.17% optimize.opt_a.renormalize : 0.000759s : 6.67% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.05% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.19% optimize.opt_a.cse : 0.000043s : 0.37% optimize.opt_a.a_3 : 0.000084s : 0.74% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000034s : 0.30% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.04% optimize.mutable_eliminate : 0.000564s : 4.96% optimize.opt_b.b_1 : 0.000123s : 1.08% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000016s : 0.14% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000025s : 0.22% optimize.loop_unroll : 0.000419s : 3.68% optimize.opt_after_cconv.c_1 : 0.000032s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.15% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.12% optimize.tuple_transform.d_1 : 0.000049s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000047s : 0.41% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000020s : 0.18% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000016s : 0.14% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000485s : 4.26% validate : 0.000040s : 0.35% Time group info: ------[substitution.] 0.000234 34 13.80% : 0.000032s : 6: substitution.arithmetic_simplify 0.88% : 0.000002s : 2: substitution.elim_not_effective 0.58% : 0.000001s : 2: substitution.fold_const_symbol 2.57% : 0.000006s : 4: substitution.graph_param_transform 69.07% : 0.000161s : 4: substitution.inline 1.90% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.12% : 0.000005s : 4: substitution.remove_not_recompute_node 2.13% : 0.000005s : 4: substitution.replace_old_param 6.95% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006072 2 88.06% : 0.005347s : 1: type_inference.infer 11.94% : 0.000725s : 1: type_inference.specialize ------[replace.] 0.000077 8 68.78% : 0.000053s : 4: replace.inline 31.22% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000173 8 92.03% : 0.000159s : 4: match.inline 7.97% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000204 1278 0.99% : 0.000002s : 13: predicate.accumulaten_eliminater 0.88% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 0.95% : 0.000002s : 13: predicate.addn_zero_filter 0.85% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.67% : 0.000005s : 21: predicate.arithmetic_simplify 1.15% : 0.000002s : 13: predicate.cast_eliminate 0.59% : 0.000001s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.20% : 0.000000s : 4: predicate.const_output_eliminate 0.67% : 0.000001s : 8: predicate.depend_value_elim 0.93% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.03% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.88% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 4: predicate.elim_not_effective 0.41% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_depend_swap 1.69% : 0.000003s : 25: predicate.environ_get_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.45% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.45% : 0.000005s : 21: predicate.float_depend_g_call 0.49% : 0.000001s : 8: predicate.float_environ_get_switch 0.87% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.66% : 0.000001s : 8: predicate.get_grad_eliminate 0.28% : 0.000001s : 4: predicate.graph_param_transform 0.59% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 6.37% : 0.000013s : 58: predicate.inline 0.76% : 0.000002s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.72% : 0.000001s : 8: predicate.less_batch_normalization 1.76% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.52% : 0.000005s : 38: predicate.load_eliminater 0.97% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.42% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.55% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 8: predicate.merge_addn 0.60% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.92% : 0.000002s : 13: predicate.minmaximum_grad 0.92% : 0.000002s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 1.80% : 0.000004s : 21: predicate.partial_defer_inline 1.64% : 0.000003s : 21: predicate.partial_eliminate 0.90% : 0.000002s : 13: predicate.print_const_string_wrapper 0.53% : 0.000001s : 8: predicate.reduce_all_const_elim 1.13% : 0.000002s : 13: predicate.reduce_eliminate 2.45% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 8: predicate.remove_not_recompute_node 1.37% : 0.000003s : 25: predicate.replace_applicator 0.54% : 0.000001s : 8: predicate.replace_old_param 0.27% : 0.000001s : 4: predicate.reset_defer_inline 0.93% : 0.000002s : 13: predicate.reshape_eliminate 0.58% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 4: predicate.row_tensor_eliminate 0.70% : 0.000001s : 8: predicate.same_eliminate 0.53% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.91% : 0.000002s : 8: predicate.shard_identity_eliminate 0.68% : 0.000001s : 8: predicate.special_op_eliminate 0.69% : 0.000001s : 8: predicate.specialize_transform 0.89% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.73% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.51% : 0.000003s : 21: predicate.switch_defer_inline 2.03% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.32% : 0.000011s : 67: predicate.switch_simplify 0.95% : 0.000002s : 13: predicate.tile_eliminate 0.86% : 0.000002s : 13: predicate.transpose_eliminate 1.67% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.66% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.12% : 0.000006s : 33: predicate.tuple_list_get_item_eliminator 1.57% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.71% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.42% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.10% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 0.64% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 8: predicate.virtual_output_eliminate 0.30% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000562 11 53.22% : 0.000299s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.78% : 0.000263s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.168599 192 0.00% : 0.000004s : 1: ForceFp32Comm 44.11% : 0.074369s : 1: add_attr 44.10% : 0.074354s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000050s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000067s : 1: auto_monad 0.01% : 0.000020s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.27% : 0.000450s : 1: bootstrap 0.02% : 0.000028s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000016s : 1: control_data_broadcast_order 0.01% : 0.000010s : 1: convert_after_rewriter 0.01% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.02% : 0.000026s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.25% : 0.000427s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.34% : 0.000572s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000015s : 1: opt.transform.mutable_eliminate 0.75% : 0.001267s : 78: opt.transform.opt_a 0.02% : 0.000030s : 1: opt.transform.opt_after_cconv 0.02% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000100s : 28: opt.transform.opt_b 0.03% : 0.000054s : 2: opt.transform.opt_trans_graph 0.02% : 0.000038s : 4: opt.transform.symbol_engine_opt 1.71% : 0.002883s : 1: opt_a 0.18% : 0.000304s : 1: opt_after_cconv 0.29% : 0.000493s : 1: opt_after_jit_grad 0.12% : 0.000202s : 1: opt_b 3.06% : 0.005155s : 1: optimize 0.01% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.01% : 0.000024s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000045s : 1: pre_auto_parallel 0.02% : 0.000033s : 1: py_interpret_to_execute 0.01% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000017s : 1: remove_dup_value 0.25% : 0.000415s : 1: renormalize.infer 0.20% : 0.000335s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000038s : 1: rewriter_after_opt_a 0.06% : 0.000094s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000006s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000084s : 1: symbol_engine_optimizer 0.05% : 0.000087s : 1: tuple_transform 3.65% : 0.006158s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:41.939.248 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:41.939.538 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0178292, [21] [bootstrap]: 0.00048141 [type_inference]: 0.00596324 [event_method]: 1.875e-05 [auto_monad]: 6.29e-05 [graph_reusing]: 5.92999e-06 [inline]: 2.17999e-06 [add_attr]: 0.00353078, [1] [add_attr_with_inline]: 0.00351874, [1] [Cycle 1]: 8.855e-05, [2] [tag_attr]: 2.338e-05 [meta_addattr_fg_expand]: 6.22001e-06 [parallel-infer-symbol]: 3.55e-06 [pre_auto_parallel]: 4.008e-05 [insert-virtual-dataset]: 2.31e-06 [parallel-infer-symbol-second]: 9.80013e-07 [dataset_repeat_opt]: 2.11998e-06 [pipeline_split]: 1.74e-06 [optimize]: 0.00631781, [53] [py_interpret_to_execute]: 3.26e-05 [rewriter_before_opt_a]: 9.613e-05 [opt_a]: 0.00341281, [2] [Cycle 1]: 0.00248581, [45] [expand_dump_flag]: 3.29001e-06 [switch_simplify]: 4.224e-05 [loop_unroll]: 3.117e-05 [a_1]: 0.00067087 [with_stream_mark]: 1.804e-05 [recompute_prepare]: 9.93002e-06 [updatestate_depend_eliminate]: 4.32998e-06 [updatestate_assign_eliminate]: 3.51001e-06 [updatestate_loads_eliminate]: 2.96001e-06 [parameter_eliminate]: 2.29001e-06 [a_2]: 0.00011886 [accelerated_algorithm]: 7.16001e-06 [shard]: 2.04e-06 [meta_shard_fg_expand]: 2.13002e-06 [shard_inline]: 6.81001e-06 [merge_send_recv]: 9.27999e-06 [auto_parallel]: 8.3e-06 [parallel]: 2.004e-05 [flash_sp]: 8e-06 [merge_comm]: 4.03001e-06 [allreduce_fusion]: 3.58e-06 [matmul_add_comm_reduction]: 9.97999e-06 [allreduce_slice_to_reducescatter]: 8.79983e-07 [virtual_shard_identity]: 9.20999e-06 [virtual_dataset]: 7.08998e-06 [get_grad_eliminate_]: 6.39999e-06 [virtual_output]: 6.59001e-06 [merge_forward]: 3.95e-06 [cell_reuse_recompute_pass]: 1.45999e-06 [offload_activation]: 1.049e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.381e-05 [merge_recompute_call_nodes]: 1.57999e-06 [before_grad]: 1.076e-05 [set_forward_comm_id_for_comm_node_pass]: 3.6e-06 [meta_fg_expand]: 2.88e-06 [flash_sp_send_recv_attached]: 2.35002e-06 [receive_attached]: 2.06e-06 [after_resolve]: 1.196e-05 [a_after_grad]: 1.068e-05 [renormalize]: 0.00086481 [add_forward_monad_depend]: 6.88e-06 [auto_monad_grad]: 2.69001e-06 [auto_monad_eliminator]: 1.712e-05 [cse]: 2.94e-05 [a_3]: 6.713e-05 [Cycle 2]: 0.00091086, [45] [expand_dump_flag]: 2.06998e-06 [switch_simplify]: 8.05999e-06 [loop_unroll]: 6.65002e-06 [a_1]: 0.0001486 [with_stream_mark]: 1.577e-05 [recompute_prepare]: 6.99001e-06 [updatestate_depend_eliminate]: 3.32997e-06 [updatestate_assign_eliminate]: 2.81e-06 [updatestate_loads_eliminate]: 2.41998e-06 [parameter_eliminate]: 1.63002e-06 [a_2]: 0.00011064 [accelerated_algorithm]: 7.56999e-06 [shard]: 1.72999e-06 [meta_shard_fg_expand]: 1.74998e-06 [shard_inline]: 6.14001e-06 [merge_send_recv]: 6.34001e-06 [auto_parallel]: 6.78e-06 [parallel]: 6.63e-06 [flash_sp]: 3.48e-06 [merge_comm]: 3.93999e-06 [allreduce_fusion]: 3.68e-06 [matmul_add_comm_reduction]: 8.05e-06 [allreduce_slice_to_reducescatter]: 1.07e-06 [virtual_shard_identity]: 8.80999e-06 [virtual_dataset]: 6.21e-06 [get_grad_eliminate_]: 6.73e-06 [virtual_output]: 5.92001e-06 [merge_forward]: 3.88001e-06 [cell_reuse_recompute_pass]: 2.73e-06 [offload_activation]: 9.03002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.418e-05 [merge_recompute_call_nodes]: 1.61998e-06 [before_grad]: 1.001e-05 [set_forward_comm_id_for_comm_node_pass]: 4.13001e-06 [meta_fg_expand]: 2.92002e-06 [flash_sp_send_recv_attached]: 1.49e-06 [receive_attached]: 1.40001e-06 [after_resolve]: 1.254e-05 [a_after_grad]: 1.015e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.74999e-06 [auto_monad_grad]: 1.96998e-06 [auto_monad_eliminator]: 1.015e-05 [cse]: 1.731e-05 [a_3]: 5.305e-05 [py_interpret_to_execute_after_opt_a]: 1.646e-05 [slice_cell_reuse_recomputed_activation]: 4.72e-06 [rewriter_after_opt_a]: 4.425e-05 [convert_after_rewriter]: 1.098e-05 [order_py_execute_after_rewriter]: 1.044e-05 [mutable_eliminate]: 0.00074098 [opt_b]: 0.00030131, [1] [Cycle 1]: 0.00029015, [7] [b_1]: 0.00017566 [b_2]: 8.99998e-06 [updatestate_depend_eliminate]: 8.99998e-06 [updatestate_assign_eliminate]: 3.01001e-06 [updatestate_loads_eliminate]: 2.89001e-06 [renormalize]: 9.39996e-07 [cse]: 2.786e-05 [optimize_parallel_all_gather_comm]: 2.329e-05 [overlap_param_gather]: 6.02999e-06 [cconv]: 3.932e-05 [loop_unroll]: 0.00052668 [opt_after_cconv]: 0.00014656, [1] [Cycle 1]: 0.00013604, [7] [c_1]: 3.235e-05 [parameter_eliminate]: 5.50001e-06 [updatestate_depend_eliminate]: 8.22e-06 [updatestate_assign_eliminate]: 3.03998e-06 [updatestate_loads_eliminate]: 2.44999e-06 [cse]: 2.17e-05 [renormalize]: 7.09988e-07 [remove_dup_value]: 1.926e-05 [tuple_transform]: 0.00010704, [1] [Cycle 1]: 9.904e-05, [4] [d_1]: 5.363e-05 [none_parameter_eliminate]: 2.66e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 8.12e-06 [partial_unused_args_eliminate]: 4.58999e-06 [add_recomputation]: 6.183e-05 [cse_after_recomputation]: 3.296e-05, [1] [Cycle 1]: 2.515e-05, [1] [cse]: 1.464e-05 [environ_conv]: 9.90002e-06 [swap_dp_allreduce_reducescatter]: 9.07001e-06 [bias_add_comm_swap]: 6.01e-06 [label_micro_interleaved_index]: 8.90001e-06 [label_fine_grained_interleaved_index]: 6.35002e-06 [merge_cast_opt]: 4.58999e-06 [slice_recompute_activation]: 2.317e-05 [micro_interleaved_order_control]: 5.83002e-06 [assign_add_opt]: 4.50001e-06 [ForceFp32Comm]: 4.15999e-06 [remove_cast_before_assign_add]: 3.53e-06 [full_micro_interleaved_order_control]: 4.95999e-06 [reorder_send_recv_between_fp_bp]: 6.02999e-06 [comm_op_add_attrs]: 9.082e-05 [add_comm_op_reuse_tag]: 4.79e-06 [interleave_split_concat_branches]: 4.85999e-06 [interleave_parallel_branches]: 3.78999e-06 [overlap_opt_shard_in_pipeline]: 3.8e-06 [overlap_opt_shard_grad_in_pipeline]: 4.70999e-06 [control_data_broadcast_order]: 2.155e-05 [grouped_pairwise_exchange_alltoall]: 4.29002e-06 [offloading_packed_experts]: 7.51999e-06 [overlap_recompute_and_grad_model_parallel]: 8.15e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.83001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.71999e-06 [overlap_recompute_comm]: 4.80999e-06 [overlap_grad_ring_attention]: 7.93999e-06 [overlap_grad_flash_sp]: 2.753e-05 [begin_end_overlap_inline]: 3.01999e-06 [split_matmul_comm_elemetwise]: 4.68001e-06 [split_layernorm_comm]: 4.38999e-06 [handle_group_info]: 3.57002e-06 [symbol_engine_optimizer]: 0.0001174, [1] [Cycle 1]: 0.0001085, [6] [build]: 5.24e-06 [elim_shapecalc]: 1.498e-05 [elim_not_effective]: 1.566e-05 [opt_reshape]: 8.33001e-06 [fold_const_symbol]: 1.081e-05 [renormalize]: 2.69996e-07 [detach_backward]: 4.27e-06 [pipeline_parallel_scheduler]: 1.74e-06 [auto_monad_reorder]: 2.224e-05 [get_jit_bprop_graph]: 2.06e-06 [rewriter_after_jit_bprop_graph]: 7.56999e-06 [opt_after_jit_grad]: 0.00068054 [validate]: 4.743e-05 Sums bootstrap : 0.000481s : 3.88% type_inference : 0.005963s : 48.07% event_method : 0.000019s : 0.15% auto_monad : 0.000063s : 0.51% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000040s : 0.32% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.26% optimize.rewriter_before_opt_a : 0.000096s : 0.77% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000050s : 0.41% optimize.opt_a.loop_unroll : 0.000038s : 0.30% optimize.opt_a.a_1 : 0.000819s : 6.61% optimize.opt_a.with_stream_mark : 0.000034s : 0.27% optimize.opt_a.recompute_prepare : 0.000017s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000230s : 1.85% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.12% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.10% optimize.opt_a.merge_send_recv : 0.000016s : 0.13% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000027s : 0.21% optimize.opt_a.flash_sp : 0.000011s : 0.09% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.02% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.15% optimize.opt_a.virtual_dataset : 0.000013s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.11% optimize.opt_a.virtual_output : 0.000013s : 0.10% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.23% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000021s : 0.17% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.20% optimize.opt_a.a_after_grad : 0.000021s : 0.17% optimize.opt_a.renormalize : 0.000865s : 6.97% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.08% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.22% optimize.opt_a.cse : 0.000047s : 0.38% optimize.opt_a.a_3 : 0.000120s : 0.97% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000044s : 0.36% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000010s : 0.08% optimize.mutable_eliminate : 0.000741s : 5.97% optimize.opt_b.b_1 : 0.000176s : 1.42% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000028s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.19% optimize.overlap_param_gather : 0.000006s : 0.05% optimize.cconv : 0.000039s : 0.32% optimize.loop_unroll : 0.000527s : 4.25% optimize.opt_after_cconv.c_1 : 0.000032s : 0.26% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000022s : 0.17% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000019s : 0.16% optimize.tuple_transform.d_1 : 0.000054s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000062s : 0.50% optimize.cse_after_recomputation.cse : 0.000015s : 0.12% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000009s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000005s : 0.04% optimize.slice_recompute_activation : 0.000023s : 0.19% optimize.micro_interleaved_order_control : 0.000006s : 0.05% optimize.assign_add_opt : 0.000005s : 0.04% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000091s : 0.73% optimize.add_comm_op_reuse_tag : 0.000005s : 0.04% optimize.interleave_split_concat_branches : 0.000005s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000022s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.06% optimize.overlap_grad_flash_sp : 0.000028s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000005s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000008s : 0.06% opt_after_jit_grad : 0.000681s : 5.49% validate : 0.000047s : 0.38% Time group info: ------[substitution.] 0.000235 34 15.27% : 0.000036s : 6: substitution.arithmetic_simplify 0.82% : 0.000002s : 2: substitution.elim_not_effective 0.56% : 0.000001s : 2: substitution.fold_const_symbol 2.95% : 0.000007s : 4: substitution.graph_param_transform 67.80% : 0.000159s : 4: substitution.inline 2.01% : 0.000005s : 4: substitution.j_node_and_user_rematch 1.97% : 0.000005s : 4: substitution.remove_not_recompute_node 2.22% : 0.000005s : 4: substitution.replace_old_param 6.41% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005907 2 87.94% : 0.005194s : 1: type_inference.infer 12.06% : 0.000712s : 1: type_inference.specialize ------[replace.] 0.000065 8 63.31% : 0.000041s : 4: replace.inline 36.69% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000170 8 92.25% : 0.000157s : 4: match.inline 7.75% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000223 1278 0.89% : 0.000002s : 13: predicate.accumulaten_eliminater 0.89% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.58% : 0.000001s : 8: predicate.addn_check_dump 0.93% : 0.000002s : 13: predicate.addn_zero_filter 0.74% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.78% : 0.000006s : 21: predicate.arithmetic_simplify 1.03% : 0.000002s : 13: predicate.cast_eliminate 0.59% : 0.000001s : 8: predicate.check_bprop_eliminate 0.47% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.58% : 0.000001s : 8: predicate.depend_value_elim 0.99% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.77% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.30% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 4: predicate.elim_not_effective 0.57% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.26% : 0.000003s : 17: predicate.environ_add_const_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_depend_swap 1.59% : 0.000004s : 25: predicate.environ_get_eliminate 0.96% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.35% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.79% : 0.000006s : 21: predicate.float_depend_g_call 0.57% : 0.000001s : 8: predicate.float_environ_get_switch 0.86% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.63% : 0.000001s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.56% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.22% : 0.000014s : 58: predicate.inline 0.72% : 0.000002s : 8: predicate.inline_without_move 0.33% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.94% : 0.000002s : 8: predicate.less_batch_normalization 1.79% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.53% : 0.000006s : 38: predicate.load_eliminater 1.60% : 0.000004s : 4: predicate.loop_unroll_after_grad 2.32% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.69% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.67% : 0.000001s : 8: predicate.merge_addn 0.67% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.63% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 13: predicate.minmaximum_grad 1.51% : 0.000003s : 4: predicate.mutable_eliminate 0.40% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 1.70% : 0.000004s : 21: predicate.partial_defer_inline 1.51% : 0.000003s : 21: predicate.partial_eliminate 0.84% : 0.000002s : 13: predicate.print_const_string_wrapper 0.68% : 0.000002s : 8: predicate.reduce_all_const_elim 1.20% : 0.000003s : 13: predicate.reduce_eliminate 2.41% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.54% : 0.000001s : 8: predicate.remove_not_recompute_node 1.27% : 0.000003s : 25: predicate.replace_applicator 0.55% : 0.000001s : 8: predicate.replace_old_param 0.26% : 0.000001s : 4: predicate.reset_defer_inline 0.87% : 0.000002s : 13: predicate.reshape_eliminate 0.64% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 4: predicate.row_tensor_eliminate 0.72% : 0.000002s : 8: predicate.same_eliminate 0.46% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 8: predicate.shard_identity_eliminate 0.71% : 0.000002s : 8: predicate.special_op_eliminate 0.69% : 0.000002s : 8: predicate.specialize_transform 1.31% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.86% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.38% : 0.000003s : 21: predicate.switch_defer_inline 1.97% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.86% : 0.000011s : 67: predicate.switch_simplify 0.80% : 0.000002s : 13: predicate.tile_eliminate 0.85% : 0.000002s : 13: predicate.transpose_eliminate 1.49% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.44% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.01% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.37% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.09% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.63% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.24% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.85% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.59% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.67% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.71% : 0.000002s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000569 11 52.08% : 0.000296s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.92% : 0.000272s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.029972 192 0.02% : 0.000007s : 1: ForceFp32Comm 11.81% : 0.003541s : 1: add_attr 11.76% : 0.003523s : 1: add_attr_with_inline 0.03% : 0.000009s : 1: add_comm_op_reuse_tag 0.22% : 0.000066s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.24% : 0.000072s : 1: auto_monad 0.10% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.76% : 0.000527s : 1: bootstrap 0.14% : 0.000043s : 1: cconv 0.33% : 0.000098s : 1: comm_op_add_attrs 0.08% : 0.000025s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000037s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000022s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.10% : 0.000029s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000008s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.04% : 0.000012s : 1: label_micro_interleaved_index 1.78% : 0.000534s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000009s : 1: micro_interleaved_order_control 2.50% : 0.000750s : 1: mutable_eliminate 0.04% : 0.000011s : 1: offloading_packed_experts 0.06% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000022s : 1: opt.transform.mutable_eliminate 4.25% : 0.001273s : 78: opt.transform.opt_a 0.10% : 0.000031s : 1: opt.transform.opt_after_cconv 0.11% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.36% : 0.000108s : 28: opt.transform.opt_b 0.20% : 0.000059s : 2: opt.transform.opt_trans_graph 0.15% : 0.000045s : 4: opt.transform.symbol_engine_opt 11.40% : 0.003416s : 1: opt_a 0.50% : 0.000151s : 1: opt_after_cconv 2.31% : 0.000693s : 1: opt_after_jit_grad 1.02% : 0.000305s : 1: opt_b 22.29% : 0.006681s : 1: optimize 0.09% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000015s : 1: order_py_execute_after_rewriter 0.11% : 0.000032s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000011s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000010s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.16% : 0.000048s : 1: pre_auto_parallel 0.12% : 0.000036s : 1: py_interpret_to_execute 0.07% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000023s : 1: remove_dup_value 1.61% : 0.000482s : 1: renormalize.infer 1.25% : 0.000373s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.05% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000049s : 1: rewriter_after_opt_a 0.34% : 0.000100s : 1: rewriter_before_opt_a 0.02% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.09% : 0.000028s : 1: slice_recompute_activation 0.03% : 0.000008s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000120s : 1: symbol_engine_optimizer 0.37% : 0.000110s : 1: tuple_transform 20.04% : 0.006006s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:43.266.696 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0685528, [21] [bootstrap]: 0.00046771 [type_inference]: 0.0583304 [event_method]: 2.071e-05 [auto_monad]: 6.667e-05 [graph_reusing]: 5.74e-06 [inline]: 3.04001e-06 [add_attr]: 0.00386387, [1] [add_attr_with_inline]: 0.00385342, [1] [Cycle 1]: 6.571e-05, [2] [tag_attr]: 2.16e-05 [meta_addattr_fg_expand]: 6.43e-06 [parallel-infer-symbol]: 3.65e-06 [pre_auto_parallel]: 3.725e-05 [insert-virtual-dataset]: 2.71e-06 [parallel-infer-symbol-second]: 9.49978e-07 [dataset_repeat_opt]: 2.11e-06 [pipeline_split]: 2.04999e-06 [optimize]: 0.00503802, [53] [py_interpret_to_execute]: 2.779e-05 [rewriter_before_opt_a]: 8.521e-05 [opt_a]: 0.00287615, [2] [Cycle 1]: 0.00217508, [45] [expand_dump_flag]: 3.04999e-06 [switch_simplify]: 4.563e-05 [loop_unroll]: 3.022e-05 [a_1]: 0.00065848 [with_stream_mark]: 1.756e-05 [recompute_prepare]: 9.04e-06 [updatestate_depend_eliminate]: 3.76999e-06 [updatestate_assign_eliminate]: 3.91999e-06 [updatestate_loads_eliminate]: 2.82002e-06 [parameter_eliminate]: 2.09e-06 [a_2]: 8.797e-05 [accelerated_algorithm]: 7.44002e-06 [shard]: 1.92999e-06 [meta_shard_fg_expand]: 2.01e-06 [shard_inline]: 6.69001e-06 [merge_send_recv]: 9.35001e-06 [auto_parallel]: 6.28e-06 [parallel]: 2.006e-05 [flash_sp]: 8.50001e-06 [merge_comm]: 4.06001e-06 [allreduce_fusion]: 3.75998e-06 [matmul_add_comm_reduction]: 9.20001e-06 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 8.23001e-06 [virtual_dataset]: 6.69001e-06 [get_grad_eliminate_]: 6.07999e-06 [virtual_output]: 6.82002e-06 [merge_forward]: 4.28999e-06 [cell_reuse_recompute_pass]: 1.34003e-06 [offload_activation]: 1.064e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.148e-05 [merge_recompute_call_nodes]: 1.71e-06 [before_grad]: 1.031e-05 [set_forward_comm_id_for_comm_node_pass]: 3.97e-06 [meta_fg_expand]: 2.81e-06 [flash_sp_send_recv_attached]: 2.53e-06 [receive_attached]: 2.44001e-06 [after_resolve]: 1.141e-05 [a_after_grad]: 1.009e-05 [renormalize]: 0.0007793 [add_forward_monad_depend]: 6.19001e-06 [auto_monad_grad]: 2.47001e-06 [auto_monad_eliminator]: 1.61e-05 [cse]: 2.978e-05 [a_3]: 4.893e-05 [Cycle 2]: 0.00068951, [45] [expand_dump_flag]: 1.76e-06 [switch_simplify]: 7.61999e-06 [loop_unroll]: 6.23e-06 [a_1]: 0.00014564 [with_stream_mark]: 1.294e-05 [recompute_prepare]: 6.75002e-06 [updatestate_depend_eliminate]: 3.38999e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.37999e-06 [parameter_eliminate]: 1.45001e-06 [a_2]: 7.908e-05 [accelerated_algorithm]: 6.04999e-06 [shard]: 1.62001e-06 [meta_shard_fg_expand]: 1.72001e-06 [shard_inline]: 6.09001e-06 [merge_send_recv]: 5.35999e-06 [auto_parallel]: 6.07001e-06 [parallel]: 7e-06 [flash_sp]: 3.78001e-06 [merge_comm]: 3.72998e-06 [allreduce_fusion]: 3.58999e-06 [matmul_add_comm_reduction]: 7.65e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 6.71e-06 [virtual_dataset]: 6.45002e-06 [get_grad_eliminate_]: 5.75001e-06 [virtual_output]: 5.81e-06 [merge_forward]: 3.65e-06 [cell_reuse_recompute_pass]: 2.02001e-06 [offload_activation]: 7.16999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.159e-05 [merge_recompute_call_nodes]: 8.90024e-07 [before_grad]: 9.90002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.48e-06 [meta_fg_expand]: 2.35002e-06 [flash_sp_send_recv_attached]: 1.19e-06 [receive_attached]: 1.82999e-06 [after_resolve]: 1.188e-05 [a_after_grad]: 9.25999e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.39e-06 [auto_monad_grad]: 2.07999e-06 [auto_monad_eliminator]: 7.34002e-06 [cse]: 1.488e-05 [a_3]: 3.834e-05 [py_interpret_to_execute_after_opt_a]: 1.142e-05 [slice_cell_reuse_recomputed_activation]: 2.19001e-06 [rewriter_after_opt_a]: 3.672e-05 [convert_after_rewriter]: 6.73998e-06 [order_py_execute_after_rewriter]: 5.04e-06 [mutable_eliminate]: 0.00060306 [opt_b]: 0.00020776, [1] [Cycle 1]: 0.0002013, [7] [b_1]: 0.00012534 [b_2]: 7.75e-06 [updatestate_depend_eliminate]: 6.84999e-06 [updatestate_assign_eliminate]: 2.63e-06 [updatestate_loads_eliminate]: 2.26998e-06 [renormalize]: 7.59988e-07 [cse]: 2.031e-05 [optimize_parallel_all_gather_comm]: 1.772e-05 [overlap_param_gather]: 1.99999e-06 [cconv]: 3.114e-05 [loop_unroll]: 0.0004504 [opt_after_cconv]: 0.00010515, [1] [Cycle 1]: 9.896e-05, [7] [c_1]: 3.228e-05 [parameter_eliminate]: 3.86001e-06 [updatestate_depend_eliminate]: 5.10001e-06 [updatestate_assign_eliminate]: 2.88e-06 [updatestate_loads_eliminate]: 2.14e-06 [cse]: 1.8e-05 [renormalize]: 3.30008e-07 [remove_dup_value]: 1.362e-05 [tuple_transform]: 7.907e-05, [1] [Cycle 1]: 7.456e-05, [4] [d_1]: 4.648e-05 [none_parameter_eliminate]: 1.76e-06 [renormalize]: 4.40021e-07 [switch_simplify]: 7.4e-06 [partial_unused_args_eliminate]: 2.11998e-06 [add_recomputation]: 5.132e-05 [cse_after_recomputation]: 2.213e-05, [1] [Cycle 1]: 1.738e-05, [1] [cse]: 1.189e-05 [environ_conv]: 5.02999e-06 [swap_dp_allreduce_reducescatter]: 4.73001e-06 [bias_add_comm_swap]: 2.91e-06 [label_micro_interleaved_index]: 4.63999e-06 [label_fine_grained_interleaved_index]: 2.94999e-06 [merge_cast_opt]: 1.24e-06 [slice_recompute_activation]: 2.03002e-06 [micro_interleaved_order_control]: 2.41e-06 [assign_add_opt]: 1.39003e-06 [ForceFp32Comm]: 1.04e-06 [remove_cast_before_assign_add]: 1.00001e-06 [full_micro_interleaved_order_control]: 2.26998e-06 [reorder_send_recv_between_fp_bp]: 2.66e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 9.90025e-07 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.02998e-06 [overlap_opt_shard_in_pipeline]: 1.20001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.25002e-06 [control_data_broadcast_order]: 1.344e-05 [grouped_pairwise_exchange_alltoall]: 1.54998e-06 [offloading_packed_experts]: 3.98999e-06 [overlap_recompute_and_grad_model_parallel]: 5.05001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.02001e-06 [overlap_grad_ring_attention]: 3.95e-06 [overlap_grad_flash_sp]: 2.012e-05 [begin_end_overlap_inline]: 5.89993e-07 [split_matmul_comm_elemetwise]: 2.03002e-06 [split_layernorm_comm]: 1.91998e-06 [handle_group_info]: 1.44998e-06 [symbol_engine_optimizer]: 7.858e-05, [1] [Cycle 1]: 7.45e-05, [6] [build]: 3.55998e-06 [elim_shapecalc]: 1.007e-05 [elim_not_effective]: 1.429e-05 [opt_reshape]: 7.76001e-06 [fold_const_symbol]: 1.037e-05 [renormalize]: 1.8999e-07 [detach_backward]: 2.37999e-06 [pipeline_parallel_scheduler]: 1.89999e-06 [auto_monad_reorder]: 1.713e-05 [get_jit_bprop_graph]: 1.72001e-06 [rewriter_after_jit_bprop_graph]: 4.03001e-06 [opt_after_jit_grad]: 0.00047905 [validate]: 4.145e-05 Sums bootstrap : 0.000468s : 0.73% type_inference : 0.058330s : 91.55% event_method : 0.000021s : 0.03% auto_monad : 0.000067s : 0.10% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000037s : 0.06% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000028s : 0.04% optimize.rewriter_before_opt_a : 0.000085s : 0.13% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000053s : 0.08% optimize.opt_a.loop_unroll : 0.000036s : 0.06% optimize.opt_a.a_1 : 0.000804s : 1.26% optimize.opt_a.with_stream_mark : 0.000030s : 0.05% optimize.opt_a.recompute_prepare : 0.000016s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000167s : 0.26% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.02% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.02% optimize.opt_a.merge_send_recv : 0.000015s : 0.02% optimize.opt_a.auto_parallel : 0.000012s : 0.02% optimize.opt_a.parallel : 0.000027s : 0.04% optimize.opt_a.flash_sp : 0.000012s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.02% optimize.opt_a.virtual_dataset : 0.000013s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.02% optimize.opt_a.virtual_output : 0.000013s : 0.02% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000018s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000023s : 0.04% optimize.opt_a.a_after_grad : 0.000019s : 0.03% optimize.opt_a.renormalize : 0.000779s : 1.22% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.04% optimize.opt_a.cse : 0.000045s : 0.07% optimize.opt_a.a_3 : 0.000087s : 0.14% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000037s : 0.06% optimize.convert_after_rewriter : 0.000007s : 0.01% optimize.order_py_execute_after_rewriter : 0.000005s : 0.01% optimize.mutable_eliminate : 0.000603s : 0.95% optimize.opt_b.b_1 : 0.000125s : 0.20% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000020s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.03% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000031s : 0.05% optimize.loop_unroll : 0.000450s : 0.71% optimize.opt_after_cconv.c_1 : 0.000032s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000018s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.02% optimize.tuple_transform.d_1 : 0.000046s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000051s : 0.08% optimize.cse_after_recomputation.cse : 0.000012s : 0.02% optimize.environ_conv : 0.000005s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000020s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000017s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000479s : 0.75% validate : 0.000041s : 0.07% Time group info: ------[substitution.] 0.000221 34 15.08% : 0.000033s : 6: substitution.arithmetic_simplify 1.18% : 0.000003s : 2: substitution.elim_not_effective 0.59% : 0.000001s : 2: substitution.fold_const_symbol 2.87% : 0.000006s : 4: substitution.graph_param_transform 67.36% : 0.000149s : 4: substitution.inline 1.66% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.03% : 0.000004s : 4: substitution.remove_not_recompute_node 2.19% : 0.000005s : 4: substitution.replace_old_param 7.05% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.058260 2 98.72% : 0.057517s : 1: type_inference.infer 1.28% : 0.000743s : 1: type_inference.specialize ------[replace.] 0.000064 8 62.64% : 0.000040s : 4: replace.inline 37.36% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000160 8 91.42% : 0.000146s : 4: match.inline 8.58% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000211 1278 0.93% : 0.000002s : 13: predicate.accumulaten_eliminater 1.05% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 8: predicate.addn_check_dump 0.96% : 0.000002s : 13: predicate.addn_zero_filter 0.79% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.62% : 0.000006s : 21: predicate.arithmetic_simplify 0.93% : 0.000002s : 13: predicate.cast_eliminate 0.57% : 0.000001s : 8: predicate.check_bprop_eliminate 0.47% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.62% : 0.000001s : 8: predicate.depend_value_elim 0.93% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.16% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.01% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.43% : 0.000001s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.06% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_depend_swap 1.63% : 0.000003s : 25: predicate.environ_get_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.41% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.30% : 0.000005s : 21: predicate.float_depend_g_call 0.47% : 0.000001s : 8: predicate.float_environ_get_switch 0.79% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.61% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.47% : 0.000001s : 8: predicate.incorporate_call_switch 6.39% : 0.000013s : 58: predicate.inline 0.92% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.94% : 0.000002s : 8: predicate.less_batch_normalization 1.89% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.51% : 0.000005s : 38: predicate.load_eliminater 1.13% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.29% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.68% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 8: predicate.merge_addn 0.61% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.76% : 0.000002s : 13: predicate.minmaximum_grad 1.35% : 0.000003s : 4: predicate.mutable_eliminate 0.39% : 0.000001s : 4: predicate.opt_reshape 0.46% : 0.000001s : 4: predicate.parallel_virtual_node 1.74% : 0.000004s : 21: predicate.partial_defer_inline 1.63% : 0.000003s : 21: predicate.partial_eliminate 0.90% : 0.000002s : 13: predicate.print_const_string_wrapper 0.68% : 0.000001s : 8: predicate.reduce_all_const_elim 1.13% : 0.000002s : 13: predicate.reduce_eliminate 2.40% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 8: predicate.remove_not_recompute_node 1.33% : 0.000003s : 25: predicate.replace_applicator 0.64% : 0.000001s : 8: predicate.replace_old_param 0.25% : 0.000001s : 4: predicate.reset_defer_inline 0.88% : 0.000002s : 13: predicate.reshape_eliminate 0.63% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.82% : 0.000002s : 8: predicate.same_eliminate 0.55% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.71% : 0.000001s : 8: predicate.shard_identity_eliminate 0.77% : 0.000002s : 8: predicate.special_op_eliminate 0.66% : 0.000001s : 8: predicate.specialize_transform 0.87% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.70% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.45% : 0.000003s : 21: predicate.switch_defer_inline 1.98% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.28% : 0.000011s : 67: predicate.switch_simplify 0.89% : 0.000002s : 13: predicate.tile_eliminate 1.10% : 0.000002s : 13: predicate.transpose_eliminate 1.50% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.54% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.39% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.09% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.83% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.43% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.02% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.70% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000593 11 55.50% : 0.000329s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.50% : 0.000264s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.079615 192 0.00% : 0.000004s : 1: ForceFp32Comm 4.86% : 0.003869s : 1: add_attr 4.85% : 0.003858s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.07% : 0.000055s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.09% : 0.000072s : 1: auto_monad 0.03% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.63% : 0.000499s : 1: bootstrap 0.04% : 0.000035s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000016s : 1: control_data_broadcast_order 0.01% : 0.000010s : 1: convert_after_rewriter 0.03% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000008s : 1: environ_conv 0.03% : 0.000028s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.58% : 0.000459s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.77% : 0.000613s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000018s : 1: opt.transform.mutable_eliminate 1.55% : 0.001236s : 78: opt.transform.opt_a 0.04% : 0.000031s : 1: opt.transform.opt_after_cconv 0.03% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.13% : 0.000102s : 28: opt.transform.opt_b 0.06% : 0.000051s : 2: opt.transform.opt_trans_graph 0.05% : 0.000038s : 4: opt.transform.symbol_engine_opt 3.62% : 0.002879s : 1: opt_a 0.14% : 0.000109s : 1: opt_after_cconv 0.61% : 0.000489s : 1: opt_after_jit_grad 0.27% : 0.000211s : 1: opt_b 6.33% : 0.005043s : 1: optimize 0.03% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.03% : 0.000023s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000006s : 1: pipeline_split 0.05% : 0.000041s : 1: pre_auto_parallel 0.04% : 0.000032s : 1: py_interpret_to_execute 0.02% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000017s : 1: remove_dup_value 0.52% : 0.000411s : 1: renormalize.infer 0.45% : 0.000360s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000041s : 1: rewriter_after_opt_a 0.11% : 0.000089s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.10% : 0.000081s : 1: symbol_engine_optimizer 0.10% : 0.000082s : 1: tuple_transform 73.29% : 0.058350s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:44.157.646 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:44.157.920 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0313997, [21] [bootstrap]: 0.00044192 [type_inference]: 0.0067254 [event_method]: 2.119e-05 [auto_monad]: 6.667e-05 [graph_reusing]: 6.21e-06 [inline]: 2.63e-06 [add_attr]: 0.00376881, [1] [add_attr_with_inline]: 0.00375683, [1] [Cycle 1]: 9.519e-05, [2] [tag_attr]: 2.401e-05 [meta_addattr_fg_expand]: 6.02999e-06 [parallel-infer-symbol]: 3.55003e-06 [pre_auto_parallel]: 4.057e-05 [insert-virtual-dataset]: 2.32999e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 1.99e-06 [pipeline_split]: 1.63002e-06 [optimize]: 0.019144, [53] [py_interpret_to_execute]: 3.477e-05 [rewriter_before_opt_a]: 9.735e-05 [opt_a]: 0.016537, [2] [Cycle 1]: 0.0155844, [45] [expand_dump_flag]: 2.93e-06 [switch_simplify]: 4.266e-05 [loop_unroll]: 2.995e-05 [a_1]: 0.00067485 [with_stream_mark]: 2.133e-05 [recompute_prepare]: 1.016e-05 [updatestate_depend_eliminate]: 4.28001e-06 [updatestate_assign_eliminate]: 3.50998e-06 [updatestate_loads_eliminate]: 3.5e-06 [parameter_eliminate]: 2.28002e-06 [a_2]: 0.0001184 [accelerated_algorithm]: 8.90999e-06 [shard]: 1.82001e-06 [meta_shard_fg_expand]: 2.09e-06 [shard_inline]: 6.68998e-06 [merge_send_recv]: 9.14e-06 [auto_parallel]: 9.02e-06 [parallel]: 2.046e-05 [flash_sp]: 9.22001e-06 [merge_comm]: 4.12e-06 [allreduce_fusion]: 4.18999e-06 [matmul_add_comm_reduction]: 1.09e-05 [allreduce_slice_to_reducescatter]: 8.00006e-07 [virtual_shard_identity]: 9.77001e-06 [virtual_dataset]: 6.95002e-06 [get_grad_eliminate_]: 6.22001e-06 [virtual_output]: 6.77002e-06 [merge_forward]: 4.60001e-06 [cell_reuse_recompute_pass]: 1.44998e-06 [offload_activation]: 1.053e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.669e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 1.752e-05 [set_forward_comm_id_for_comm_node_pass]: 4.99e-06 [meta_fg_expand]: 3.69002e-06 [flash_sp_send_recv_attached]: 2.83e-06 [receive_attached]: 2.48e-06 [after_resolve]: 1.265e-05 [a_after_grad]: 1.109e-05 [renormalize]: 0.0138677 [add_forward_monad_depend]: 1.242e-05 [auto_monad_grad]: 2.54001e-06 [auto_monad_eliminator]: 2.617e-05 [cse]: 3.267e-05 [a_3]: 7.92e-05 [Cycle 2]: 0.0009341, [45] [expand_dump_flag]: 2.35002e-06 [switch_simplify]: 1.013e-05 [loop_unroll]: 7.35e-06 [a_1]: 0.00016524 [with_stream_mark]: 1.951e-05 [recompute_prepare]: 7.1e-06 [updatestate_depend_eliminate]: 4.02002e-06 [updatestate_assign_eliminate]: 3.06999e-06 [updatestate_loads_eliminate]: 2.88e-06 [parameter_eliminate]: 1.76e-06 [a_2]: 0.00010738 [accelerated_algorithm]: 6.44999e-06 [shard]: 2.49999e-06 [meta_shard_fg_expand]: 2.78e-06 [shard_inline]: 6.59001e-06 [merge_send_recv]: 8.85001e-06 [auto_parallel]: 9.52999e-06 [parallel]: 9.70002e-06 [flash_sp]: 4.03999e-06 [merge_comm]: 4.06001e-06 [allreduce_fusion]: 3.91999e-06 [matmul_add_comm_reduction]: 1.204e-05 [allreduce_slice_to_reducescatter]: 8.10018e-07 [virtual_shard_identity]: 7.23e-06 [virtual_dataset]: 6.31e-06 [get_grad_eliminate_]: 5.77001e-06 [virtual_output]: 6.64999e-06 [merge_forward]: 4.16001e-06 [cell_reuse_recompute_pass]: 3.49001e-06 [offload_activation]: 1.082e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.478e-05 [merge_recompute_call_nodes]: 1.62999e-06 [before_grad]: 1.032e-05 [set_forward_comm_id_for_comm_node_pass]: 4e-06 [meta_fg_expand]: 2.59001e-06 [flash_sp_send_recv_attached]: 1.99999e-06 [receive_attached]: 2.53e-06 [after_resolve]: 1.285e-05 [a_after_grad]: 1.145e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.71e-06 [auto_monad_grad]: 1.21002e-06 [auto_monad_eliminator]: 8.48999e-06 [cse]: 1.539e-05 [a_3]: 4.86e-05 [py_interpret_to_execute_after_opt_a]: 2.088e-05 [slice_cell_reuse_recomputed_activation]: 4.55001e-06 [rewriter_after_opt_a]: 4.315e-05 [convert_after_rewriter]: 1.047e-05 [order_py_execute_after_rewriter]: 8e-06 [mutable_eliminate]: 0.0007337 [opt_b]: 0.00028164, [1] [Cycle 1]: 0.0002707, [7] [b_1]: 0.0001705 [b_2]: 9.14e-06 [updatestate_depend_eliminate]: 6.90998e-06 [updatestate_assign_eliminate]: 2.68e-06 [updatestate_loads_eliminate]: 2.51998e-06 [renormalize]: 6.50005e-07 [cse]: 2.037e-05 [optimize_parallel_all_gather_comm]: 2.031e-05 [overlap_param_gather]: 4.90001e-06 [cconv]: 3.349e-05 [loop_unroll]: 0.00045418 [opt_after_cconv]: 0.00013362, [1] [Cycle 1]: 0.00012436, [7] [c_1]: 3.25e-05 [parameter_eliminate]: 4.89e-06 [updatestate_depend_eliminate]: 5.64998e-06 [updatestate_assign_eliminate]: 2.74999e-06 [updatestate_loads_eliminate]: 2.44001e-06 [cse]: 1.755e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.658e-05 [tuple_transform]: 9.182e-05, [1] [Cycle 1]: 8.495e-05, [4] [d_1]: 4.497e-05 [none_parameter_eliminate]: 1.94e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 7.04001e-06 [partial_unused_args_eliminate]: 4.37e-06 [add_recomputation]: 5.491e-05 [cse_after_recomputation]: 2.821e-05, [1] [Cycle 1]: 2.093e-05, [1] [cse]: 1.177e-05 [environ_conv]: 8.18001e-06 [swap_dp_allreduce_reducescatter]: 7.97998e-06 [bias_add_comm_swap]: 5.63997e-06 [label_micro_interleaved_index]: 7.82998e-06 [label_fine_grained_interleaved_index]: 4.94e-06 [merge_cast_opt]: 3.96001e-06 [slice_recompute_activation]: 4.63001e-06 [micro_interleaved_order_control]: 3.289e-05 [assign_add_opt]: 3.62002e-06 [ForceFp32Comm]: 3.21999e-06 [remove_cast_before_assign_add]: 3.31999e-06 [full_micro_interleaved_order_control]: 4.38001e-06 [reorder_send_recv_between_fp_bp]: 5.44e-06 [comm_op_add_attrs]: 3.61999e-06 [add_comm_op_reuse_tag]: 3.57002e-06 [interleave_split_concat_branches]: 3.54002e-06 [interleave_parallel_branches]: 3.50998e-06 [overlap_opt_shard_in_pipeline]: 3.55003e-06 [overlap_opt_shard_grad_in_pipeline]: 4.34997e-06 [control_data_broadcast_order]: 1.664e-05 [grouped_pairwise_exchange_alltoall]: 3.88001e-06 [offloading_packed_experts]: 6.79999e-06 [overlap_recompute_and_grad_model_parallel]: 7.33e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.93999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.8e-06 [overlap_recompute_comm]: 4.86002e-06 [overlap_grad_ring_attention]: 6.58e-06 [overlap_grad_flash_sp]: 2.444e-05 [begin_end_overlap_inline]: 3.16999e-06 [split_matmul_comm_elemetwise]: 4.77e-06 [split_layernorm_comm]: 4.08001e-06 [handle_group_info]: 3.31999e-06 [symbol_engine_optimizer]: 9.997e-05, [1] [Cycle 1]: 9.265e-05, [6] [build]: 4.03001e-06 [elim_shapecalc]: 1.069e-05 [elim_not_effective]: 1.334e-05 [opt_reshape]: 7.57002e-06 [fold_const_symbol]: 1.016e-05 [renormalize]: 2.10013e-07 [detach_backward]: 3.71999e-06 [pipeline_parallel_scheduler]: 1.60999e-06 [auto_monad_reorder]: 1.896e-05 [get_jit_bprop_graph]: 2.24999e-06 [rewriter_after_jit_bprop_graph]: 5.09e-06 [opt_after_jit_grad]: 0.00051446 [validate]: 4.111e-05 Sums bootstrap : 0.000442s : 1.71% type_inference : 0.006725s : 26.07% event_method : 0.000021s : 0.08% auto_monad : 0.000067s : 0.26% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.09% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000041s : 0.16% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000035s : 0.13% optimize.rewriter_before_opt_a : 0.000097s : 0.38% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000053s : 0.20% optimize.opt_a.loop_unroll : 0.000037s : 0.14% optimize.opt_a.a_1 : 0.000840s : 3.26% optimize.opt_a.with_stream_mark : 0.000041s : 0.16% optimize.opt_a.recompute_prepare : 0.000017s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000226s : 0.88% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.06% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000013s : 0.05% optimize.opt_a.merge_send_recv : 0.000018s : 0.07% optimize.opt_a.auto_parallel : 0.000019s : 0.07% optimize.opt_a.parallel : 0.000030s : 0.12% optimize.opt_a.flash_sp : 0.000013s : 0.05% optimize.opt_a.merge_comm : 0.000008s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.07% optimize.opt_a.virtual_dataset : 0.000013s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.05% optimize.opt_a.virtual_output : 0.000013s : 0.05% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000021s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000028s : 0.11% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000026s : 0.10% optimize.opt_a.a_after_grad : 0.000023s : 0.09% optimize.opt_a.renormalize : 0.013868s : 53.75% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.05% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.13% optimize.opt_a.cse : 0.000048s : 0.19% optimize.opt_a.a_3 : 0.000128s : 0.50% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000043s : 0.17% optimize.convert_after_rewriter : 0.000010s : 0.04% optimize.order_py_execute_after_rewriter : 0.000008s : 0.03% optimize.mutable_eliminate : 0.000734s : 2.84% optimize.opt_b.b_1 : 0.000170s : 0.66% optimize.opt_b.b_2 : 0.000009s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000020s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.08% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000033s : 0.13% optimize.loop_unroll : 0.000454s : 1.76% optimize.opt_after_cconv.c_1 : 0.000032s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000018s : 0.07% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.06% optimize.tuple_transform.d_1 : 0.000045s : 0.17% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000004s : 0.02% optimize.add_recomputation : 0.000055s : 0.21% optimize.cse_after_recomputation.cse : 0.000012s : 0.05% optimize.environ_conv : 0.000008s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.03% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000008s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000033s : 0.13% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000004s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000017s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000007s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.03% optimize.overlap_grad_flash_sp : 0.000024s : 0.09% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000514s : 1.99% validate : 0.000041s : 0.16% Time group info: ------[substitution.] 0.000254 34 17.15% : 0.000044s : 6: substitution.arithmetic_simplify 0.73% : 0.000002s : 2: substitution.elim_not_effective 0.70% : 0.000002s : 2: substitution.fold_const_symbol 2.63% : 0.000007s : 4: substitution.graph_param_transform 65.93% : 0.000167s : 4: substitution.inline 1.75% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.09% : 0.000005s : 4: substitution.remove_not_recompute_node 2.58% : 0.000007s : 4: substitution.replace_old_param 6.44% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006660 2 87.91% : 0.005854s : 1: type_inference.infer 12.09% : 0.000805s : 1: type_inference.specialize ------[replace.] 0.000062 8 63.71% : 0.000040s : 4: replace.inline 36.29% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000179 8 91.87% : 0.000165s : 4: match.inline 8.13% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000220 1278 0.99% : 0.000002s : 13: predicate.accumulaten_eliminater 0.88% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.60% : 0.000001s : 8: predicate.addn_check_dump 1.12% : 0.000002s : 13: predicate.addn_zero_filter 0.73% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.50% : 0.000006s : 21: predicate.arithmetic_simplify 1.05% : 0.000002s : 13: predicate.cast_eliminate 0.61% : 0.000001s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.64% : 0.000001s : 8: predicate.depend_value_elim 0.91% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.14% : 0.000003s : 13: predicate.dict_get_item_eliminator 1.02% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.04% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 4: predicate.elim_not_effective 0.43% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.28% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_depend_swap 1.58% : 0.000003s : 25: predicate.environ_get_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.50% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.37% : 0.000005s : 21: predicate.float_depend_g_call 0.61% : 0.000001s : 8: predicate.float_environ_get_switch 0.73% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.61% : 0.000001s : 8: predicate.get_grad_eliminate 0.28% : 0.000001s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.44% : 0.000014s : 58: predicate.inline 0.99% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.03% : 0.000002s : 8: predicate.less_batch_normalization 1.74% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.38% : 0.000005s : 38: predicate.load_eliminater 0.82% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.23% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.73% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 8: predicate.merge_addn 0.67% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 13: predicate.minmaximum_grad 0.99% : 0.000002s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 1.71% : 0.000004s : 21: predicate.partial_defer_inline 1.54% : 0.000003s : 21: predicate.partial_eliminate 0.83% : 0.000002s : 13: predicate.print_const_string_wrapper 0.60% : 0.000001s : 8: predicate.reduce_all_const_elim 1.44% : 0.000003s : 13: predicate.reduce_eliminate 2.58% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 8: predicate.remove_not_recompute_node 1.46% : 0.000003s : 25: predicate.replace_applicator 0.49% : 0.000001s : 8: predicate.replace_old_param 0.41% : 0.000001s : 4: predicate.reset_defer_inline 0.99% : 0.000002s : 13: predicate.reshape_eliminate 0.60% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 4: predicate.row_tensor_eliminate 0.96% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.04% : 0.000002s : 8: predicate.shard_identity_eliminate 0.57% : 0.000001s : 8: predicate.special_op_eliminate 0.75% : 0.000002s : 8: predicate.specialize_transform 0.96% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.92% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.45% : 0.000003s : 21: predicate.switch_defer_inline 1.97% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.69% : 0.000010s : 67: predicate.switch_simplify 0.88% : 0.000002s : 13: predicate.tile_eliminate 1.04% : 0.000002s : 13: predicate.transpose_eliminate 1.51% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.50% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.37% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.36% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.39% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.72% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.22% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.88% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 4: predicate.value_based_eliminate 0.67% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.67% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.51% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000776 11 44.67% : 0.000347s : 5: func_graph_cloner_run.FuncGraphClonerGraph 55.33% : 0.000429s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.069614 192 0.01% : 0.000006s : 1: ForceFp32Comm 5.43% : 0.003780s : 1: add_attr 5.40% : 0.003761s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.08% : 0.000058s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.11% : 0.000076s : 1: auto_monad 0.04% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.70% : 0.000490s : 1: bootstrap 0.05% : 0.000037s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.03% : 0.000020s : 1: control_data_broadcast_order 0.02% : 0.000014s : 1: convert_after_rewriter 0.04% : 0.000031s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000019s : 1: detach_backward 0.02% : 0.000012s : 1: environ_conv 0.05% : 0.000033s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000008s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.66% : 0.000461s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.05% : 0.000036s : 1: micro_interleaved_order_control 1.06% : 0.000741s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000017s : 1: opt.transform.mutable_eliminate 1.87% : 0.001305s : 78: opt.transform.opt_a 0.04% : 0.000031s : 1: opt.transform.opt_after_cconv 0.04% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.15% : 0.000106s : 28: opt.transform.opt_b 0.07% : 0.000050s : 2: opt.transform.opt_trans_graph 0.05% : 0.000038s : 4: opt.transform.symbol_engine_opt 23.76% : 0.016541s : 1: opt_a 0.20% : 0.000137s : 1: opt_after_cconv 0.76% : 0.000526s : 1: opt_after_jit_grad 0.41% : 0.000285s : 1: opt_b 27.95% : 0.019459s : 1: optimize 0.03% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000011s : 1: order_py_execute_after_rewriter 0.04% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.07% : 0.000048s : 1: pre_auto_parallel 0.06% : 0.000039s : 1: py_interpret_to_execute 0.03% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000020s : 1: remove_dup_value 19.04% : 0.013255s : 1: renormalize.infer 0.85% : 0.000594s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000046s : 1: rewriter_after_opt_a 0.15% : 0.000101s : 1: rewriter_before_opt_a 0.01% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000103s : 1: symbol_engine_optimizer 0.14% : 0.000095s : 1: tuple_transform 9.74% : 0.006779s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:45.625.928 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0651361, [21] [bootstrap]: 0.0485324 [type_inference]: 0.00663698 [event_method]: 1.987e-05 [auto_monad]: 6.464e-05 [graph_reusing]: 5.82999e-06 [inline]: 2.84001e-06 [add_attr]: 0.00368287, [1] [add_attr_with_inline]: 0.00367088, [1] [Cycle 1]: 7.119e-05, [2] [tag_attr]: 2.295e-05 [meta_addattr_fg_expand]: 5.82999e-06 [parallel-infer-symbol]: 3.84002e-06 [pre_auto_parallel]: 4.071e-05 [insert-virtual-dataset]: 2.48e-06 [parallel-infer-symbol-second]: 1.04e-06 [dataset_repeat_opt]: 2.27001e-06 [pipeline_split]: 2.05002e-06 [optimize]: 0.00538514, [53] [py_interpret_to_execute]: 2.982e-05 [rewriter_before_opt_a]: 8.917e-05 [opt_a]: 0.00301949, [2] [Cycle 1]: 0.00229294, [45] [expand_dump_flag]: 3.17002e-06 [switch_simplify]: 4.385e-05 [loop_unroll]: 3.125e-05 [a_1]: 0.00066895 [with_stream_mark]: 2.135e-05 [recompute_prepare]: 1.106e-05 [updatestate_depend_eliminate]: 4.2e-06 [updatestate_assign_eliminate]: 3.03998e-06 [updatestate_loads_eliminate]: 3.21001e-06 [parameter_eliminate]: 1.92001e-06 [a_2]: 8.916e-05 [accelerated_algorithm]: 7.41999e-06 [shard]: 2.27001e-06 [meta_shard_fg_expand]: 2.16e-06 [shard_inline]: 6.56999e-06 [merge_send_recv]: 9.05999e-06 [auto_parallel]: 7.35e-06 [parallel]: 2.272e-05 [flash_sp]: 8.49998e-06 [merge_comm]: 4.15e-06 [allreduce_fusion]: 3.61999e-06 [matmul_add_comm_reduction]: 9.76e-06 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 8.49002e-06 [virtual_dataset]: 6.93e-06 [get_grad_eliminate_]: 6.23e-06 [virtual_output]: 6.84999e-06 [merge_forward]: 4.22998e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 1.042e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.577e-05 [merge_recompute_call_nodes]: 1.60999e-06 [before_grad]: 1.221e-05 [set_forward_comm_id_for_comm_node_pass]: 3.85e-06 [meta_fg_expand]: 3.39001e-06 [flash_sp_send_recv_attached]: 3.06001e-06 [receive_attached]: 2.53e-06 [after_resolve]: 1.298e-05 [a_after_grad]: 1.046e-05 [renormalize]: 0.00083246 [add_forward_monad_depend]: 7.08e-06 [auto_monad_grad]: 2.50002e-06 [auto_monad_eliminator]: 1.764e-05 [cse]: 3.187e-05 [a_3]: 5.587e-05 [Cycle 2]: 0.00071365, [45] [expand_dump_flag]: 1.40999e-06 [switch_simplify]: 8.84e-06 [loop_unroll]: 6.54999e-06 [a_1]: 0.0001518 [with_stream_mark]: 1.541e-05 [recompute_prepare]: 7.65e-06 [updatestate_depend_eliminate]: 3.41999e-06 [updatestate_assign_eliminate]: 3.03e-06 [updatestate_loads_eliminate]: 2.68e-06 [parameter_eliminate]: 1.57001e-06 [a_2]: 8.103e-05 [accelerated_algorithm]: 7.09001e-06 [shard]: 1.47001e-06 [meta_shard_fg_expand]: 1.77999e-06 [shard_inline]: 6.18002e-06 [merge_send_recv]: 6.53998e-06 [auto_parallel]: 7.36999e-06 [parallel]: 6.18002e-06 [flash_sp]: 4.2e-06 [merge_comm]: 7.35e-06 [allreduce_fusion]: 3.43e-06 [matmul_add_comm_reduction]: 6.90998e-06 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 6.99001e-06 [virtual_dataset]: 6.66999e-06 [get_grad_eliminate_]: 5.56e-06 [virtual_output]: 5.61e-06 [merge_forward]: 3.2e-06 [cell_reuse_recompute_pass]: 2.03002e-06 [offload_activation]: 7.33e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.111e-05 [merge_recompute_call_nodes]: 1.20001e-06 [before_grad]: 9.46998e-06 [set_forward_comm_id_for_comm_node_pass]: 6.14001e-06 [meta_fg_expand]: 2.03997e-06 [flash_sp_send_recv_attached]: 1.34e-06 [receive_attached]: 2.14e-06 [after_resolve]: 1.235e-05 [a_after_grad]: 9.80002e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.99e-06 [auto_monad_grad]: 1.59e-06 [auto_monad_eliminator]: 8.28999e-06 [cse]: 1.683e-05 [a_3]: 3.623e-05 [py_interpret_to_execute_after_opt_a]: 1.308e-05 [slice_cell_reuse_recomputed_activation]: 2.32999e-06 [rewriter_after_opt_a]: 3.999e-05 [convert_after_rewriter]: 7.08e-06 [order_py_execute_after_rewriter]: 5.50001e-06 [mutable_eliminate]: 0.00073194 [opt_b]: 0.00021467, [1] [Cycle 1]: 0.0002073, [7] [b_1]: 0.0001267 [b_2]: 8.37998e-06 [updatestate_depend_eliminate]: 8.18999e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.39001e-06 [renormalize]: 6.29982e-07 [cse]: 2.218e-05 [optimize_parallel_all_gather_comm]: 1.898e-05 [overlap_param_gather]: 2.12999e-06 [cconv]: 3.015e-05 [loop_unroll]: 0.00046423 [opt_after_cconv]: 0.00010548, [1] [Cycle 1]: 9.925e-05, [7] [c_1]: 3.149e-05 [parameter_eliminate]: 4.07e-06 [updatestate_depend_eliminate]: 5.30999e-06 [updatestate_assign_eliminate]: 2.35002e-06 [updatestate_loads_eliminate]: 2.29001e-06 [cse]: 1.972e-05 [renormalize]: 3.00002e-07 [remove_dup_value]: 1.399e-05 [tuple_transform]: 7.975e-05, [1] [Cycle 1]: 7.461e-05, [4] [d_1]: 4.625e-05 [none_parameter_eliminate]: 1.64998e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.98e-06 [partial_unused_args_eliminate]: 2.18998e-06 [add_recomputation]: 5.303e-05 [cse_after_recomputation]: 2.241e-05, [1] [Cycle 1]: 1.778e-05, [1] [cse]: 1.192e-05 [environ_conv]: 6.16e-06 [swap_dp_allreduce_reducescatter]: 5.03002e-06 [bias_add_comm_swap]: 3.22002e-06 [label_micro_interleaved_index]: 4.75001e-06 [label_fine_grained_interleaved_index]: 2.74999e-06 [merge_cast_opt]: 1.46998e-06 [slice_recompute_activation]: 2.69999e-06 [micro_interleaved_order_control]: 2.49001e-06 [assign_add_opt]: 1.39e-06 [ForceFp32Comm]: 8.79983e-07 [remove_cast_before_assign_add]: 1.04998e-06 [full_micro_interleaved_order_control]: 2.69001e-06 [reorder_send_recv_between_fp_bp]: 3.13e-06 [comm_op_add_attrs]: 1.04003e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.16997e-06 [interleave_parallel_branches]: 1.17e-06 [overlap_opt_shard_in_pipeline]: 1.17e-06 [overlap_opt_shard_grad_in_pipeline]: 2.04999e-06 [control_data_broadcast_order]: 1.382e-05 [grouped_pairwise_exchange_alltoall]: 2.16e-06 [offloading_packed_experts]: 4.29002e-06 [overlap_recompute_and_grad_model_parallel]: 5.05001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.45999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.56998e-06 [overlap_grad_ring_attention]: 4.11001e-06 [overlap_grad_flash_sp]: 2.199e-05 [begin_end_overlap_inline]: 5.90022e-07 [split_matmul_comm_elemetwise]: 2.11998e-06 [split_layernorm_comm]: 1.76e-06 [handle_group_info]: 8.89995e-07 [symbol_engine_optimizer]: 0.00010232, [1] [Cycle 1]: 9.718e-05, [6] [build]: 3.21999e-06 [elim_shapecalc]: 1.167e-05 [elim_not_effective]: 2.948e-05 [opt_reshape]: 7.45e-06 [fold_const_symbol]: 1.076e-05 [renormalize]: 1.8999e-07 [detach_backward]: 2.71e-06 [pipeline_parallel_scheduler]: 1.95001e-06 [auto_monad_reorder]: 1.753e-05 [get_jit_bprop_graph]: 1.99999e-06 [rewriter_after_jit_bprop_graph]: 5.54e-06 [opt_after_jit_grad]: 0.00049913 [validate]: 4.407e-05 Sums bootstrap : 0.048532s : 80.31% type_inference : 0.006637s : 10.98% event_method : 0.000020s : 0.03% auto_monad : 0.000065s : 0.11% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000041s : 0.07% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000030s : 0.05% optimize.rewriter_before_opt_a : 0.000089s : 0.15% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000053s : 0.09% optimize.opt_a.loop_unroll : 0.000038s : 0.06% optimize.opt_a.a_1 : 0.000821s : 1.36% optimize.opt_a.with_stream_mark : 0.000037s : 0.06% optimize.opt_a.recompute_prepare : 0.000019s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000170s : 0.28% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.02% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.02% optimize.opt_a.merge_send_recv : 0.000016s : 0.03% optimize.opt_a.auto_parallel : 0.000015s : 0.02% optimize.opt_a.parallel : 0.000029s : 0.05% optimize.opt_a.flash_sp : 0.000013s : 0.02% optimize.opt_a.merge_comm : 0.000012s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.03% optimize.opt_a.virtual_dataset : 0.000014s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.02% optimize.opt_a.virtual_output : 0.000012s : 0.02% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000018s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.04% optimize.opt_a.a_after_grad : 0.000020s : 0.03% optimize.opt_a.renormalize : 0.000833s : 1.38% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.04% optimize.opt_a.cse : 0.000049s : 0.08% optimize.opt_a.a_3 : 0.000092s : 0.15% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000040s : 0.07% optimize.convert_after_rewriter : 0.000007s : 0.01% optimize.order_py_execute_after_rewriter : 0.000006s : 0.01% optimize.mutable_eliminate : 0.000732s : 1.21% optimize.opt_b.b_1 : 0.000127s : 0.21% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.03% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000030s : 0.05% optimize.loop_unroll : 0.000464s : 0.77% optimize.opt_after_cconv.c_1 : 0.000031s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.02% optimize.tuple_transform.d_1 : 0.000046s : 0.08% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000053s : 0.09% optimize.cse_after_recomputation.cse : 0.000012s : 0.02% optimize.environ_conv : 0.000006s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000022s : 0.04% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000029s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000499s : 0.83% validate : 0.000044s : 0.07% Time group info: ------[substitution.] 0.000238 34 15.33% : 0.000037s : 6: substitution.arithmetic_simplify 1.53% : 0.000004s : 2: substitution.elim_not_effective 0.60% : 0.000001s : 2: substitution.fold_const_symbol 2.94% : 0.000007s : 4: substitution.graph_param_transform 67.18% : 0.000160s : 4: substitution.inline 1.90% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.08% : 0.000005s : 4: substitution.remove_not_recompute_node 2.31% : 0.000006s : 4: substitution.replace_old_param 6.14% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006565 2 88.32% : 0.005798s : 1: type_inference.infer 11.68% : 0.000767s : 1: type_inference.specialize ------[replace.] 0.000065 8 63.14% : 0.000041s : 4: replace.inline 36.86% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000170 8 92.51% : 0.000158s : 4: match.inline 7.49% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000214 1278 0.93% : 0.000002s : 13: predicate.accumulaten_eliminater 0.67% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 0.84% : 0.000002s : 13: predicate.addn_zero_filter 0.84% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.47% : 0.000005s : 21: predicate.arithmetic_simplify 0.91% : 0.000002s : 13: predicate.cast_eliminate 0.59% : 0.000001s : 8: predicate.check_bprop_eliminate 0.47% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.70% : 0.000002s : 8: predicate.depend_value_elim 0.89% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.09% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.02% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 4: predicate.elim_not_effective 0.47% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.19% : 0.000003s : 17: predicate.environ_get_depend_swap 1.62% : 0.000003s : 25: predicate.environ_get_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.46% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.43% : 0.000005s : 21: predicate.float_depend_g_call 0.57% : 0.000001s : 8: predicate.float_environ_get_switch 0.77% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.62% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.47% : 0.000001s : 8: predicate.incorporate_call_switch 6.60% : 0.000014s : 58: predicate.inline 0.79% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.91% : 0.000002s : 8: predicate.less_batch_normalization 1.89% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.47% : 0.000005s : 38: predicate.load_eliminater 0.97% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.46% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.74% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 8: predicate.merge_addn 0.70% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 13: predicate.minmaximum_grad 1.88% : 0.000004s : 4: predicate.mutable_eliminate 0.37% : 0.000001s : 4: predicate.opt_reshape 0.61% : 0.000001s : 4: predicate.parallel_virtual_node 1.79% : 0.000004s : 21: predicate.partial_defer_inline 1.58% : 0.000003s : 21: predicate.partial_eliminate 0.89% : 0.000002s : 13: predicate.print_const_string_wrapper 0.50% : 0.000001s : 8: predicate.reduce_all_const_elim 1.19% : 0.000003s : 13: predicate.reduce_eliminate 2.37% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.57% : 0.000001s : 8: predicate.remove_not_recompute_node 1.31% : 0.000003s : 25: predicate.replace_applicator 0.52% : 0.000001s : 8: predicate.replace_old_param 0.34% : 0.000001s : 4: predicate.reset_defer_inline 0.91% : 0.000002s : 13: predicate.reshape_eliminate 0.62% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 4: predicate.row_tensor_eliminate 0.93% : 0.000002s : 8: predicate.same_eliminate 0.55% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 8: predicate.shard_identity_eliminate 0.72% : 0.000002s : 8: predicate.special_op_eliminate 0.66% : 0.000001s : 8: predicate.specialize_transform 0.96% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.42% : 0.000003s : 21: predicate.switch_defer_inline 2.02% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.82% : 0.000010s : 67: predicate.switch_simplify 0.86% : 0.000002s : 13: predicate.tile_eliminate 0.82% : 0.000002s : 13: predicate.transpose_eliminate 1.56% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.32% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.34% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.15% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.71% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.34% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.09% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.67% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000619 11 54.81% : 0.000339s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.19% : 0.000280s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.076470 192 0.00% : 0.000004s : 1: ForceFp32Comm 4.82% : 0.003689s : 1: add_attr 4.81% : 0.003675s : 1: add_attr_with_inline 0.01% : 0.000005s : 1: add_comm_op_reuse_tag 0.08% : 0.000058s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.09% : 0.000070s : 1: auto_monad 0.03% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 63.52% : 0.048577s : 1: bootstrap 0.04% : 0.000034s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000017s : 1: control_data_broadcast_order 0.01% : 0.000011s : 1: convert_after_rewriter 0.03% : 0.000026s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.03% : 0.000026s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.62% : 0.000473s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.97% : 0.000743s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000019s : 1: opt.transform.mutable_eliminate 1.67% : 0.001279s : 78: opt.transform.opt_a 0.04% : 0.000030s : 1: opt.transform.opt_after_cconv 0.03% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.13% : 0.000103s : 28: opt.transform.opt_b 0.07% : 0.000051s : 2: opt.transform.opt_trans_graph 0.07% : 0.000055s : 4: opt.transform.symbol_engine_opt 3.95% : 0.003023s : 1: opt_a 0.14% : 0.000109s : 1: opt_after_cconv 0.66% : 0.000508s : 1: opt_after_jit_grad 0.29% : 0.000218s : 1: opt_b 7.05% : 0.005391s : 1: optimize 0.03% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.03% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.06% : 0.000046s : 1: pre_auto_parallel 0.04% : 0.000034s : 1: py_interpret_to_execute 0.02% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000017s : 1: remove_dup_value 0.60% : 0.000457s : 1: renormalize.infer 0.48% : 0.000366s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000044s : 1: rewriter_after_opt_a 0.12% : 0.000094s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.14% : 0.000105s : 1: symbol_engine_optimizer 0.11% : 0.000083s : 1: tuple_transform 8.71% : 0.006661s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:46.943.318 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:46.943.581 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.138768, [21] [bootstrap]: 0.00045066 [type_inference]: 0.126581 [event_method]: 2.321e-05 [auto_monad]: 6.719e-05 [graph_reusing]: 6.17999e-06 [inline]: 2.93e-06 [add_attr]: 0.00367746, [1] [add_attr_with_inline]: 0.0036656, [1] [Cycle 1]: 9.044e-05, [2] [tag_attr]: 2.374e-05 [meta_addattr_fg_expand]: 5.93998e-06 [parallel-infer-symbol]: 3.56001e-06 [pre_auto_parallel]: 4.215e-05 [insert-virtual-dataset]: 2.26e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 2.46e-06 [pipeline_split]: 1.89999e-06 [optimize]: 0.00626277, [53] [py_interpret_to_execute]: 3.267e-05 [rewriter_before_opt_a]: 9.605e-05 [opt_a]: 0.00361437, [2] [Cycle 1]: 0.00270479, [45] [expand_dump_flag]: 3.58999e-06 [switch_simplify]: 4.258e-05 [loop_unroll]: 3.035e-05 [a_1]: 0.00069335 [with_stream_mark]: 2.414e-05 [recompute_prepare]: 1.144e-05 [updatestate_depend_eliminate]: 5.17999e-06 [updatestate_assign_eliminate]: 3.43e-06 [updatestate_loads_eliminate]: 3.25e-06 [parameter_eliminate]: 2.93998e-06 [a_2]: 0.00012347 [accelerated_algorithm]: 7.69002e-06 [shard]: 2.56e-06 [meta_shard_fg_expand]: 1.95001e-06 [shard_inline]: 6.86001e-06 [merge_send_recv]: 9.25001e-06 [auto_parallel]: 7.38999e-06 [parallel]: 1.98e-05 [flash_sp]: 9.64999e-06 [merge_comm]: 3.98001e-06 [allreduce_fusion]: 3.65e-06 [matmul_add_comm_reduction]: 1.038e-05 [allreduce_slice_to_reducescatter]: 8.90024e-07 [virtual_shard_identity]: 9.56998e-06 [virtual_dataset]: 7.05e-06 [get_grad_eliminate_]: 6.59999e-06 [virtual_output]: 6.78e-06 [merge_forward]: 3.92998e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 1.05e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.469e-05 [merge_recompute_call_nodes]: 1.65001e-06 [before_grad]: 1.884e-05 [set_forward_comm_id_for_comm_node_pass]: 4.13001e-06 [meta_fg_expand]: 3.04999e-06 [flash_sp_send_recv_attached]: 2.86e-06 [receive_attached]: 2.19001e-06 [after_resolve]: 1.381e-05 [a_after_grad]: 1.222e-05 [renormalize]: 0.00087456 [add_forward_monad_depend]: 8.15e-06 [auto_monad_grad]: 2.83e-06 [auto_monad_eliminator]: 2.171e-05 [cse]: 3.25e-05 [a_3]: 7.087e-05 [Cycle 2]: 0.00089292, [45] [expand_dump_flag]: 2.61e-06 [switch_simplify]: 1.004e-05 [loop_unroll]: 7.28999e-06 [a_1]: 0.00015755 [with_stream_mark]: 1.566e-05 [recompute_prepare]: 7.8e-06 [updatestate_depend_eliminate]: 3.33e-06 [updatestate_assign_eliminate]: 2.99999e-06 [updatestate_loads_eliminate]: 2.41e-06 [parameter_eliminate]: 1.59e-06 [a_2]: 0.00010647 [accelerated_algorithm]: 6.78e-06 [shard]: 1.91998e-06 [meta_shard_fg_expand]: 1.67001e-06 [shard_inline]: 6.29001e-06 [merge_send_recv]: 6.48e-06 [auto_parallel]: 7.48e-06 [parallel]: 6.36e-06 [flash_sp]: 3.04999e-06 [merge_comm]: 3.41001e-06 [allreduce_fusion]: 3.33998e-06 [matmul_add_comm_reduction]: 7.45e-06 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 8.02e-06 [virtual_dataset]: 6.29999e-06 [get_grad_eliminate_]: 5.66e-06 [virtual_output]: 6.07001e-06 [merge_forward]: 6.01e-06 [cell_reuse_recompute_pass]: 2.68998e-06 [offload_activation]: 8.42e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.361e-05 [merge_recompute_call_nodes]: 1.32e-06 [before_grad]: 1.016e-05 [set_forward_comm_id_for_comm_node_pass]: 3.78999e-06 [meta_fg_expand]: 2.54001e-06 [flash_sp_send_recv_attached]: 1.41998e-06 [receive_attached]: 1.77001e-06 [after_resolve]: 1.257e-05 [a_after_grad]: 1e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.69e-06 [auto_monad_grad]: 1.48002e-06 [auto_monad_eliminator]: 8.65001e-06 [cse]: 1.471e-05 [a_3]: 4.939e-05 [py_interpret_to_execute_after_opt_a]: 1.827e-05 [slice_cell_reuse_recomputed_activation]: 4.97999e-06 [rewriter_after_opt_a]: 4.365e-05 [convert_after_rewriter]: 1.029e-05 [order_py_execute_after_rewriter]: 8.05e-06 [mutable_eliminate]: 0.00074088 [opt_b]: 0.00028015, [1] [Cycle 1]: 0.00026956, [7] [b_1]: 0.00016924 [b_2]: 8.98002e-06 [updatestate_depend_eliminate]: 6.02999e-06 [updatestate_assign_eliminate]: 2.74999e-06 [updatestate_loads_eliminate]: 3.31999e-06 [renormalize]: 8.39995e-07 [cse]: 2.142e-05 [optimize_parallel_all_gather_comm]: 2.091e-05 [overlap_param_gather]: 5.01997e-06 [cconv]: 3.583e-05 [loop_unroll]: 0.0004918 [opt_after_cconv]: 0.00013347, [1] [Cycle 1]: 0.00012435, [7] [c_1]: 3.287e-05 [parameter_eliminate]: 3.56999e-06 [updatestate_depend_eliminate]: 5.42001e-06 [updatestate_assign_eliminate]: 2.66999e-06 [updatestate_loads_eliminate]: 2.74999e-06 [cse]: 1.929e-05 [renormalize]: 4.10015e-07 [remove_dup_value]: 1.75e-05 [tuple_transform]: 9.681e-05, [1] [Cycle 1]: 8.953e-05, [4] [d_1]: 4.924e-05 [none_parameter_eliminate]: 1.72001e-06 [renormalize]: 1.40019e-07 [switch_simplify]: 7.65998e-06 [partial_unused_args_eliminate]: 4.58001e-06 [add_recomputation]: 5.545e-05 [cse_after_recomputation]: 2.829e-05, [1] [Cycle 1]: 2.119e-05, [1] [cse]: 1.21e-05 [environ_conv]: 8.82999e-06 [swap_dp_allreduce_reducescatter]: 7.97e-06 [bias_add_comm_swap]: 5.39998e-06 [label_micro_interleaved_index]: 7.43e-06 [label_fine_grained_interleaved_index]: 5.45001e-06 [merge_cast_opt]: 4e-06 [slice_recompute_activation]: 4.76002e-06 [micro_interleaved_order_control]: 4.82998e-06 [assign_add_opt]: 3.81999e-06 [ForceFp32Comm]: 3.76999e-06 [remove_cast_before_assign_add]: 3.88999e-06 [full_micro_interleaved_order_control]: 4.82e-06 [reorder_send_recv_between_fp_bp]: 6.06998e-06 [comm_op_add_attrs]: 3.43999e-06 [add_comm_op_reuse_tag]: 3.48e-06 [interleave_split_concat_branches]: 3.69002e-06 [interleave_parallel_branches]: 3.38999e-06 [overlap_opt_shard_in_pipeline]: 3.76999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.33001e-06 [control_data_broadcast_order]: 1.596e-05 [grouped_pairwise_exchange_alltoall]: 3.78001e-06 [offloading_packed_experts]: 6.49001e-06 [overlap_recompute_and_grad_model_parallel]: 8.1e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.8e-06 [overlap_recompute_allgather_and_fa_grad]: 4.18001e-06 [overlap_recompute_comm]: 4.69002e-06 [overlap_grad_ring_attention]: 6.51e-06 [overlap_grad_flash_sp]: 2.514e-05 [begin_end_overlap_inline]: 2.91999e-06 [split_matmul_comm_elemetwise]: 4.53999e-06 [split_layernorm_comm]: 3.93001e-06 [handle_group_info]: 3.63e-06 [symbol_engine_optimizer]: 0.00011813, [1] [Cycle 1]: 0.0001108, [6] [build]: 3.85e-06 [elim_shapecalc]: 9.60001e-06 [elim_not_effective]: 1.488e-05 [opt_reshape]: 7.48999e-06 [fold_const_symbol]: 1.142e-05 [renormalize]: 2.19996e-07 [detach_backward]: 4.25999e-06 [pipeline_parallel_scheduler]: 1.91998e-06 [auto_monad_reorder]: 2.071e-05 [get_jit_bprop_graph]: 1.76003e-06 [rewriter_after_jit_bprop_graph]: 6.48e-06 [opt_after_jit_grad]: 0.00059737 [validate]: 4.299e-05 Sums bootstrap : 0.000451s : 0.34% type_inference : 0.126581s : 95.34% event_method : 0.000023s : 0.02% auto_monad : 0.000067s : 0.05% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000042s : 0.03% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.02% optimize.rewriter_before_opt_a : 0.000096s : 0.07% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000053s : 0.04% optimize.opt_a.loop_unroll : 0.000038s : 0.03% optimize.opt_a.a_1 : 0.000851s : 0.64% optimize.opt_a.with_stream_mark : 0.000040s : 0.03% optimize.opt_a.recompute_prepare : 0.000019s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000230s : 0.17% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000016s : 0.01% optimize.opt_a.auto_parallel : 0.000015s : 0.01% optimize.opt_a.parallel : 0.000026s : 0.02% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000010s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000019s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000029s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000026s : 0.02% optimize.opt_a.a_after_grad : 0.000022s : 0.02% optimize.opt_a.renormalize : 0.000875s : 0.66% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.02% optimize.opt_a.cse : 0.000047s : 0.04% optimize.opt_a.a_3 : 0.000120s : 0.09% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000044s : 0.03% optimize.convert_after_rewriter : 0.000010s : 0.01% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000741s : 0.56% optimize.opt_b.b_1 : 0.000169s : 0.13% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.02% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000036s : 0.03% optimize.loop_unroll : 0.000492s : 0.37% optimize.opt_after_cconv.c_1 : 0.000033s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.01% optimize.tuple_transform.d_1 : 0.000049s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000055s : 0.04% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000005s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.00% optimize.comm_op_add_attrs : 0.000003s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000003s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000016s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000025s : 0.02% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000597s : 0.45% validate : 0.000043s : 0.03% Time group info: ------[substitution.] 0.000248 34 14.93% : 0.000037s : 6: substitution.arithmetic_simplify 0.80% : 0.000002s : 2: substitution.elim_not_effective 0.67% : 0.000002s : 2: substitution.fold_const_symbol 3.00% : 0.000007s : 4: substitution.graph_param_transform 67.72% : 0.000168s : 4: substitution.inline 1.71% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.12% : 0.000005s : 4: substitution.remove_not_recompute_node 2.37% : 0.000006s : 4: substitution.replace_old_param 6.68% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.126517 2 99.34% : 0.125686s : 1: type_inference.infer 0.66% : 0.000830s : 1: type_inference.specialize ------[replace.] 0.000065 8 65.03% : 0.000042s : 4: replace.inline 34.97% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000180 8 91.88% : 0.000165s : 4: match.inline 8.12% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000223 1278 0.99% : 0.000002s : 13: predicate.accumulaten_eliminater 1.05% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.58% : 0.000001s : 8: predicate.addn_check_dump 0.95% : 0.000002s : 13: predicate.addn_zero_filter 0.92% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.50% : 0.000006s : 21: predicate.arithmetic_simplify 1.12% : 0.000003s : 13: predicate.cast_eliminate 0.62% : 0.000001s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 0.86% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.89% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.00% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 4: predicate.elim_not_effective 0.36% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.11% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_depend_swap 1.81% : 0.000004s : 25: predicate.environ_get_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.42% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.50% : 0.000006s : 21: predicate.float_depend_g_call 0.58% : 0.000001s : 8: predicate.float_environ_get_switch 0.73% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.59% : 0.000001s : 8: predicate.get_grad_eliminate 0.27% : 0.000001s : 4: predicate.graph_param_transform 0.58% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.37% : 0.000014s : 58: predicate.inline 0.73% : 0.000002s : 8: predicate.inline_without_move 0.38% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.82% : 0.000002s : 8: predicate.less_batch_normalization 1.85% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.38% : 0.000005s : 38: predicate.load_eliminater 1.07% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.27% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.68% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.67% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.61% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 13: predicate.minmaximum_grad 1.31% : 0.000003s : 4: predicate.mutable_eliminate 0.37% : 0.000001s : 4: predicate.opt_reshape 0.39% : 0.000001s : 4: predicate.parallel_virtual_node 1.80% : 0.000004s : 21: predicate.partial_defer_inline 1.55% : 0.000003s : 21: predicate.partial_eliminate 0.86% : 0.000002s : 13: predicate.print_const_string_wrapper 0.54% : 0.000001s : 8: predicate.reduce_all_const_elim 1.12% : 0.000002s : 13: predicate.reduce_eliminate 2.44% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.52% : 0.000001s : 8: predicate.remove_not_recompute_node 1.40% : 0.000003s : 25: predicate.replace_applicator 0.46% : 0.000001s : 8: predicate.replace_old_param 0.26% : 0.000001s : 4: predicate.reset_defer_inline 0.83% : 0.000002s : 13: predicate.reshape_eliminate 0.63% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.55% : 0.000001s : 4: predicate.row_tensor_eliminate 0.71% : 0.000002s : 8: predicate.same_eliminate 0.45% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.05% : 0.000002s : 8: predicate.shard_identity_eliminate 0.62% : 0.000001s : 8: predicate.special_op_eliminate 0.71% : 0.000002s : 8: predicate.specialize_transform 0.91% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.53% : 0.000003s : 21: predicate.switch_defer_inline 2.07% : 0.000005s : 29: predicate.switch_layer_defer_inline 5.10% : 0.000011s : 67: predicate.switch_simplify 0.85% : 0.000002s : 13: predicate.tile_eliminate 1.12% : 0.000002s : 13: predicate.transpose_eliminate 1.62% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.34% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.96% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.45% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.72% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.35% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.01% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 4: predicate.value_based_eliminate 0.67% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000671 11 46.37% : 0.000311s : 5: func_graph_cloner_run.FuncGraphClonerGraph 53.63% : 0.000360s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.151044 192 0.00% : 0.000006s : 1: ForceFp32Comm 2.44% : 0.003689s : 1: add_attr 2.43% : 0.003670s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.04% : 0.000059s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.05% : 0.000076s : 1: auto_monad 0.02% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.33% : 0.000493s : 1: bootstrap 0.03% : 0.000039s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000019s : 1: control_data_broadcast_order 0.01% : 0.000013s : 1: convert_after_rewriter 0.02% : 0.000031s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000024s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.02% : 0.000034s : 1: event_method 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000012s : 1: graph_reusing 0.00% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.33% : 0.000498s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000007s : 1: micro_interleaved_order_control 0.50% : 0.000748s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000018s : 1: opt.transform.mutable_eliminate 0.87% : 0.001316s : 78: opt.transform.opt_a 0.02% : 0.000031s : 1: opt.transform.opt_after_cconv 0.02% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000105s : 28: opt.transform.opt_b 0.04% : 0.000054s : 2: opt.transform.opt_trans_graph 0.03% : 0.000040s : 4: opt.transform.symbol_engine_opt 2.40% : 0.003618s : 1: opt_a 0.09% : 0.000137s : 1: opt_after_cconv 0.40% : 0.000608s : 1: opt_after_jit_grad 0.19% : 0.000284s : 1: opt_b 4.61% : 0.006962s : 1: optimize 0.02% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000011s : 1: order_py_execute_after_rewriter 0.02% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000007s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.03% : 0.000049s : 1: pre_auto_parallel 0.02% : 0.000036s : 1: py_interpret_to_execute 0.01% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000021s : 1: remove_dup_value 0.29% : 0.000436s : 1: renormalize.infer 0.28% : 0.000429s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000048s : 1: rewriter_after_opt_a 0.07% : 0.000099s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000009s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.08% : 0.000121s : 1: symbol_engine_optimizer 0.07% : 0.000100s : 1: tuple_transform 83.84% : 0.126633s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:48.155.342 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.109948, [21] [bootstrap]: 0.00043918 [type_inference]: 0.00614841 [event_method]: 1.907e-05 [auto_monad]: 6.166e-05 [graph_reusing]: 6.19001e-06 [inline]: 2.64999e-06 [add_attr]: 0.00328242, [1] [add_attr_with_inline]: 0.00327103, [1] [Cycle 1]: 0.00011355, [2] [tag_attr]: 2.184e-05 [meta_addattr_fg_expand]: 6.06e-06 [parallel-infer-symbol]: 3.31001e-06 [pre_auto_parallel]: 3.793e-05 [insert-virtual-dataset]: 2.51e-06 [parallel-infer-symbol-second]: 7.00005e-07 [dataset_repeat_opt]: 1.99e-06 [pipeline_split]: 1.72001e-06 [optimize]: 0.0992073, [53] [py_interpret_to_execute]: 2.751e-05 [rewriter_before_opt_a]: 8.709e-05 [opt_a]: 0.096783, [2] [Cycle 1]: 0.0959616, [45] [expand_dump_flag]: 3.16999e-06 [switch_simplify]: 4.46e-05 [loop_unroll]: 3.078e-05 [a_1]: 0.00066216 [with_stream_mark]: 1.868e-05 [recompute_prepare]: 9.64e-06 [updatestate_depend_eliminate]: 4.25e-06 [updatestate_assign_eliminate]: 3.28998e-06 [updatestate_loads_eliminate]: 3.01999e-06 [parameter_eliminate]: 1.88002e-06 [a_2]: 8.997e-05 [accelerated_algorithm]: 7.02002e-06 [shard]: 1.67001e-06 [meta_shard_fg_expand]: 1.90001e-06 [shard_inline]: 6.44001e-06 [merge_send_recv]: 8.83001e-06 [auto_parallel]: 7.27997e-06 [parallel]: 2.053e-05 [flash_sp]: 8.80999e-06 [merge_comm]: 4.38001e-06 [allreduce_fusion]: 3.4e-06 [matmul_add_comm_reduction]: 9.89001e-06 [allreduce_slice_to_reducescatter]: 1.04998e-06 [virtual_shard_identity]: 9.97001e-06 [virtual_dataset]: 6.69999e-06 [get_grad_eliminate_]: 6.59001e-06 [virtual_output]: 6.29999e-06 [merge_forward]: 3.61999e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 1.066e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.299e-05 [merge_recompute_call_nodes]: 1.79e-06 [before_grad]: 1.147e-05 [set_forward_comm_id_for_comm_node_pass]: 3.83999e-06 [meta_fg_expand]: 2.88e-06 [flash_sp_send_recv_attached]: 2.79001e-06 [receive_attached]: 2.76999e-06 [after_resolve]: 1.249e-05 [a_after_grad]: 1.024e-05 [renormalize]: 0.0944663 [add_forward_monad_depend]: 1.296e-05 [auto_monad_grad]: 2.68998e-06 [auto_monad_eliminator]: 2.581e-05 [cse]: 3.761e-05 [a_3]: 6.819e-05 [Cycle 2]: 0.00080645, [45] [expand_dump_flag]: 2.79001e-06 [switch_simplify]: 1.045e-05 [loop_unroll]: 7.23e-06 [a_1]: 0.00016785 [with_stream_mark]: 2.201e-05 [recompute_prepare]: 7.23999e-06 [updatestate_depend_eliminate]: 4.08001e-06 [updatestate_assign_eliminate]: 3.11999e-06 [updatestate_loads_eliminate]: 3.24001e-06 [parameter_eliminate]: 2.27001e-06 [a_2]: 8.472e-05 [accelerated_algorithm]: 7.67998e-06 [shard]: 3.01001e-06 [meta_shard_fg_expand]: 2.63998e-06 [shard_inline]: 6.27001e-06 [merge_send_recv]: 9.70002e-06 [auto_parallel]: 1.071e-05 [parallel]: 9.86998e-06 [flash_sp]: 4.31002e-06 [merge_comm]: 3.83999e-06 [allreduce_fusion]: 3.76001e-06 [matmul_add_comm_reduction]: 1.077e-05 [allreduce_slice_to_reducescatter]: 8.89995e-07 [virtual_shard_identity]: 1.03e-05 [virtual_dataset]: 8.13999e-06 [get_grad_eliminate_]: 6.94001e-06 [virtual_output]: 5.94e-06 [merge_forward]: 4.58999e-06 [cell_reuse_recompute_pass]: 3.3e-06 [offload_activation]: 1.203e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.29e-05 [merge_recompute_call_nodes]: 1.44998e-06 [before_grad]: 1.129e-05 [set_forward_comm_id_for_comm_node_pass]: 4.13999e-06 [meta_fg_expand]: 3.41999e-06 [flash_sp_send_recv_attached]: 1.96998e-06 [receive_attached]: 2.09e-06 [after_resolve]: 1.345e-05 [a_after_grad]: 1.063e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.34999e-06 [auto_monad_grad]: 1.52001e-06 [auto_monad_eliminator]: 9.87999e-06 [cse]: 1.738e-05 [a_3]: 3.867e-05 [py_interpret_to_execute_after_opt_a]: 1.886e-05 [slice_cell_reuse_recomputed_activation]: 2.26e-06 [rewriter_after_opt_a]: 4.3e-05 [convert_after_rewriter]: 7.95998e-06 [order_py_execute_after_rewriter]: 5.49998e-06 [mutable_eliminate]: 0.00079424 [opt_b]: 0.0002352, [1] [Cycle 1]: 0.00022645, [7] [b_1]: 0.00014076 [b_2]: 9.84001e-06 [updatestate_depend_eliminate]: 7.28999e-06 [updatestate_assign_eliminate]: 2.91999e-06 [updatestate_loads_eliminate]: 2.53998e-06 [renormalize]: 9.20001e-07 [cse]: 2.198e-05 [optimize_parallel_all_gather_comm]: 1.727e-05 [overlap_param_gather]: 1.89999e-06 [cconv]: 3.125e-05 [loop_unroll]: 0.00045474 [opt_after_cconv]: 0.00010633, [1] [Cycle 1]: 0.00010011, [7] [c_1]: 3.208e-05 [parameter_eliminate]: 4.55999e-06 [updatestate_depend_eliminate]: 5.57999e-06 [updatestate_assign_eliminate]: 2.37001e-06 [updatestate_loads_eliminate]: 2.41e-06 [cse]: 1.833e-05 [renormalize]: 5.09986e-07 [remove_dup_value]: 1.463e-05 [tuple_transform]: 7.996e-05, [1] [Cycle 1]: 7.469e-05, [4] [d_1]: 4.642e-05 [none_parameter_eliminate]: 1.57999e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 7.61001e-06 [partial_unused_args_eliminate]: 1.91e-06 [add_recomputation]: 5.35e-05 [cse_after_recomputation]: 2.281e-05, [1] [Cycle 1]: 1.754e-05, [1] [cse]: 1.189e-05 [environ_conv]: 5.88002e-06 [swap_dp_allreduce_reducescatter]: 5.67999e-06 [bias_add_comm_swap]: 2.73e-06 [label_micro_interleaved_index]: 4.43999e-06 [label_fine_grained_interleaved_index]: 2.89999e-06 [merge_cast_opt]: 1.28002e-06 [slice_recompute_activation]: 2.14e-06 [micro_interleaved_order_control]: 2.86e-06 [assign_add_opt]: 1.16997e-06 [ForceFp32Comm]: 1.15999e-06 [remove_cast_before_assign_add]: 1.09e-06 [full_micro_interleaved_order_control]: 2.21e-06 [reorder_send_recv_between_fp_bp]: 2.66999e-06 [comm_op_add_attrs]: 9.79984e-07 [add_comm_op_reuse_tag]: 1.05999e-06 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.00999e-06 [overlap_opt_shard_in_pipeline]: 1.29003e-06 [overlap_opt_shard_grad_in_pipeline]: 2.38002e-06 [control_data_broadcast_order]: 1.378e-05 [grouped_pairwise_exchange_alltoall]: 1.55001e-06 [offloading_packed_experts]: 4.06001e-06 [overlap_recompute_and_grad_model_parallel]: 4.53999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.47001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.57001e-06 [overlap_recompute_comm]: 2.59001e-06 [overlap_grad_ring_attention]: 4.59998e-06 [overlap_grad_flash_sp]: 2.215e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.16e-06 [split_layernorm_comm]: 1.64998e-06 [handle_group_info]: 9.50007e-07 [symbol_engine_optimizer]: 8.125e-05, [1] [Cycle 1]: 7.652e-05, [6] [build]: 3.84002e-06 [elim_shapecalc]: 1.082e-05 [elim_not_effective]: 1.451e-05 [opt_reshape]: 7.24001e-06 [fold_const_symbol]: 1.06e-05 [renormalize]: 2.20025e-07 [detach_backward]: 2.14e-06 [pipeline_parallel_scheduler]: 1.54e-06 [auto_monad_reorder]: 1.658e-05 [get_jit_bprop_graph]: 2.14e-06 [rewriter_after_jit_bprop_graph]: 4.99998e-06 [opt_after_jit_grad]: 0.00048882 [validate]: 6.001e-05 Sums bootstrap : 0.000439s : 0.42% type_inference : 0.006148s : 5.82% event_method : 0.000019s : 0.02% auto_monad : 0.000062s : 0.06% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000038s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000028s : 0.03% optimize.rewriter_before_opt_a : 0.000087s : 0.08% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000055s : 0.05% optimize.opt_a.loop_unroll : 0.000038s : 0.04% optimize.opt_a.a_1 : 0.000830s : 0.79% optimize.opt_a.with_stream_mark : 0.000041s : 0.04% optimize.opt_a.recompute_prepare : 0.000017s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000175s : 0.17% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000019s : 0.02% optimize.opt_a.auto_parallel : 0.000018s : 0.02% optimize.opt_a.parallel : 0.000030s : 0.03% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.02% optimize.opt_a.virtual_dataset : 0.000015s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.01% optimize.opt_a.virtual_output : 0.000012s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000026s : 0.02% optimize.opt_a.a_after_grad : 0.000021s : 0.02% optimize.opt_a.renormalize : 0.094466s : 89.45% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000036s : 0.03% optimize.opt_a.cse : 0.000055s : 0.05% optimize.opt_a.a_3 : 0.000107s : 0.10% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000043s : 0.04% optimize.convert_after_rewriter : 0.000008s : 0.01% optimize.order_py_execute_after_rewriter : 0.000005s : 0.01% optimize.mutable_eliminate : 0.000794s : 0.75% optimize.opt_b.b_1 : 0.000141s : 0.13% optimize.opt_b.b_2 : 0.000010s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000031s : 0.03% optimize.loop_unroll : 0.000455s : 0.43% optimize.opt_after_cconv.c_1 : 0.000032s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000018s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.01% optimize.tuple_transform.d_1 : 0.000046s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000053s : 0.05% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000006s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000022s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000017s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000489s : 0.46% validate : 0.000060s : 0.06% Time group info: ------[substitution.] 0.000238 34 16.78% : 0.000040s : 6: substitution.arithmetic_simplify 0.87% : 0.000002s : 2: substitution.elim_not_effective 0.64% : 0.000002s : 2: substitution.fold_const_symbol 2.71% : 0.000006s : 4: substitution.graph_param_transform 65.29% : 0.000156s : 4: substitution.inline 2.12% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.31% : 0.000006s : 4: substitution.remove_not_recompute_node 2.71% : 0.000006s : 4: substitution.replace_old_param 6.56% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006080 2 88.08% : 0.005355s : 1: type_inference.infer 11.92% : 0.000725s : 1: type_inference.specialize ------[replace.] 0.000064 8 63.09% : 0.000040s : 4: replace.inline 36.91% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000167 8 91.66% : 0.000153s : 4: match.inline 8.34% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000224 1278 0.90% : 0.000002s : 13: predicate.accumulaten_eliminater 0.80% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 8: predicate.addn_check_dump 0.94% : 0.000002s : 13: predicate.addn_zero_filter 0.74% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.86% : 0.000006s : 21: predicate.arithmetic_simplify 0.90% : 0.000002s : 13: predicate.cast_eliminate 0.55% : 0.000001s : 8: predicate.check_bprop_eliminate 0.86% : 0.000002s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.64% : 0.000001s : 8: predicate.depend_value_elim 0.98% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.00% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.19% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.42% : 0.000001s : 4: predicate.elim_not_effective 0.57% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_depend_swap 1.66% : 0.000004s : 25: predicate.environ_get_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.37% : 0.000005s : 21: predicate.float_depend_g_call 0.53% : 0.000001s : 8: predicate.float_environ_get_switch 0.77% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.63% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.58% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 5.99% : 0.000013s : 58: predicate.inline 0.83% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.89% : 0.000002s : 8: predicate.less_batch_normalization 1.83% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.38% : 0.000005s : 38: predicate.load_eliminater 1.02% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.32% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.54% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.62% : 0.000001s : 8: predicate.merge_addn 0.56% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.93% : 0.000002s : 13: predicate.minmaximum_grad 1.31% : 0.000003s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.43% : 0.000001s : 4: predicate.parallel_virtual_node 1.57% : 0.000004s : 21: predicate.partial_defer_inline 1.51% : 0.000003s : 21: predicate.partial_eliminate 0.85% : 0.000002s : 13: predicate.print_const_string_wrapper 0.60% : 0.000001s : 8: predicate.reduce_all_const_elim 1.23% : 0.000003s : 13: predicate.reduce_eliminate 2.42% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 8: predicate.remove_not_recompute_node 1.68% : 0.000004s : 25: predicate.replace_applicator 0.66% : 0.000001s : 8: predicate.replace_old_param 0.29% : 0.000001s : 4: predicate.reset_defer_inline 0.95% : 0.000002s : 13: predicate.reshape_eliminate 0.58% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 4: predicate.row_tensor_eliminate 0.92% : 0.000002s : 8: predicate.same_eliminate 0.37% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 8: predicate.shard_identity_eliminate 0.68% : 0.000002s : 8: predicate.special_op_eliminate 0.59% : 0.000001s : 8: predicate.specialize_transform 0.95% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.42% : 0.000003s : 21: predicate.switch_defer_inline 1.99% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.11% : 0.000011s : 67: predicate.switch_simplify 0.88% : 0.000002s : 13: predicate.tile_eliminate 0.86% : 0.000002s : 13: predicate.transpose_eliminate 1.53% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.48% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.38% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.19% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.47% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.80% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.37% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.04% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.49% : 0.000001s : 4: predicate.value_based_eliminate 0.79% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.64% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000635 11 47.97% : 0.000305s : 5: func_graph_cloner_run.FuncGraphClonerGraph 52.03% : 0.000330s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.308328 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.07% : 0.003288s : 1: add_attr 1.06% : 0.003276s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.02% : 0.000057s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.02% : 0.000067s : 1: auto_monad 0.01% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.15% : 0.000467s : 1: bootstrap 0.01% : 0.000035s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000011s : 1: convert_after_rewriter 0.01% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000009s : 1: environ_conv 0.01% : 0.000026s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.15% : 0.000463s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.26% : 0.000806s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000020s : 1: opt.transform.mutable_eliminate 0.42% : 0.001295s : 78: opt.transform.opt_a 0.01% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000113s : 28: opt.transform.opt_b 0.02% : 0.000052s : 2: opt.transform.opt_trans_graph 0.01% : 0.000040s : 4: opt.transform.symbol_engine_opt 31.39% : 0.096787s : 1: opt_a 0.04% : 0.000110s : 1: opt_after_cconv 0.16% : 0.000498s : 1: opt_after_jit_grad 0.08% : 0.000239s : 1: opt_b 32.18% : 0.099214s : 1: optimize 0.01% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000009s : 1: order_py_execute_after_rewriter 0.01% : 0.000025s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000042s : 1: pre_auto_parallel 0.01% : 0.000032s : 1: py_interpret_to_execute 0.01% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000018s : 1: remove_dup_value 30.45% : 0.093894s : 1: renormalize.infer 0.18% : 0.000554s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000048s : 1: rewriter_after_opt_a 0.03% : 0.000091s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000084s : 1: symbol_engine_optimizer 0.03% : 0.000083s : 1: tuple_transform 2.00% : 0.006168s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:49.243.623 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:49.243.884 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.103166, [21] [bootstrap]: 0.00043762 [type_inference]: 0.0915744 [event_method]: 2.316e-05 [auto_monad]: 6.85e-05 [graph_reusing]: 5.92999e-06 [inline]: 2.86999e-06 [add_attr]: 0.00369342, [1] [add_attr_with_inline]: 0.00368145, [1] [Cycle 1]: 9.346e-05, [2] [tag_attr]: 2.199e-05 [meta_addattr_fg_expand]: 6.53e-06 [parallel-infer-symbol]: 3.38e-06 [pre_auto_parallel]: 4.091e-05 [insert-virtual-dataset]: 2.63e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.94e-06 [pipeline_split]: 1.70001e-06 [optimize]: 0.00602803, [53] [py_interpret_to_execute]: 3.242e-05 [rewriter_before_opt_a]: 9.815e-05 [opt_a]: 0.0032618, [2] [Cycle 1]: 0.00236367, [45] [expand_dump_flag]: 3.2e-06 [switch_simplify]: 4.282e-05 [loop_unroll]: 3.003e-05 [a_1]: 0.00067462 [with_stream_mark]: 1.975e-05 [recompute_prepare]: 1.008e-05 [updatestate_depend_eliminate]: 4.2e-06 [updatestate_assign_eliminate]: 3.56999e-06 [updatestate_loads_eliminate]: 3.25e-06 [parameter_eliminate]: 1.90001e-06 [a_2]: 0.0001158 [accelerated_algorithm]: 7.05998e-06 [shard]: 1.76e-06 [meta_shard_fg_expand]: 2.06998e-06 [shard_inline]: 6.49001e-06 [merge_send_recv]: 8.46002e-06 [auto_parallel]: 6.96999e-06 [parallel]: 1.917e-05 [flash_sp]: 8.47998e-06 [merge_comm]: 4.04002e-06 [allreduce_fusion]: 4.16001e-06 [matmul_add_comm_reduction]: 9.32999e-06 [allreduce_slice_to_reducescatter]: 1.08001e-06 [virtual_shard_identity]: 9.10999e-06 [virtual_dataset]: 7.38e-06 [get_grad_eliminate_]: 6.48e-06 [virtual_output]: 6.59999e-06 [merge_forward]: 3.83001e-06 [cell_reuse_recompute_pass]: 1.17e-06 [offload_activation]: 1.088e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.491e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 1.123e-05 [set_forward_comm_id_for_comm_node_pass]: 4.03001e-06 [meta_fg_expand]: 3.03e-06 [flash_sp_send_recv_attached]: 2.79999e-06 [receive_attached]: 2.63e-06 [after_resolve]: 1.322e-05 [a_after_grad]: 1.022e-05 [renormalize]: 0.00076032 [add_forward_monad_depend]: 6.07001e-06 [auto_monad_grad]: 2.61999e-06 [auto_monad_eliminator]: 1.586e-05 [cse]: 2.734e-05 [a_3]: 6.324e-05 [Cycle 2]: 0.00088221, [45] [expand_dump_flag]: 2.07001e-06 [switch_simplify]: 7.58999e-06 [loop_unroll]: 6.34001e-06 [a_1]: 0.00014418 [with_stream_mark]: 1.305e-05 [recompute_prepare]: 6.46e-06 [updatestate_depend_eliminate]: 3.06999e-06 [updatestate_assign_eliminate]: 2.36e-06 [updatestate_loads_eliminate]: 2.16e-06 [parameter_eliminate]: 1.52001e-06 [a_2]: 0.00010576 [accelerated_algorithm]: 6.36e-06 [shard]: 1.52999e-06 [meta_shard_fg_expand]: 1.42e-06 [shard_inline]: 6.84999e-06 [merge_send_recv]: 5.82001e-06 [auto_parallel]: 6.98998e-06 [parallel]: 6.09999e-06 [flash_sp]: 3.69002e-06 [merge_comm]: 7.16001e-06 [allreduce_fusion]: 3.55e-06 [matmul_add_comm_reduction]: 7.05e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 7.18998e-06 [virtual_dataset]: 7.35e-06 [get_grad_eliminate_]: 5.74e-06 [virtual_output]: 6.04001e-06 [merge_forward]: 3.3e-06 [cell_reuse_recompute_pass]: 1.93002e-06 [offload_activation]: 7.17002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.35e-05 [merge_recompute_call_nodes]: 9.19972e-07 [before_grad]: 9.81998e-06 [set_forward_comm_id_for_comm_node_pass]: 5.72001e-06 [meta_fg_expand]: 2.07001e-06 [flash_sp_send_recv_attached]: 1.15999e-06 [receive_attached]: 1.44e-06 [after_resolve]: 1.267e-05 [a_after_grad]: 1.037e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.81e-06 [auto_monad_grad]: 1.54e-06 [auto_monad_eliminator]: 8.24002e-06 [cse]: 1.546e-05 [a_3]: 4.943e-05 [py_interpret_to_execute_after_opt_a]: 1.551e-05 [slice_cell_reuse_recomputed_activation]: 5.23002e-06 [rewriter_after_opt_a]: 4.627e-05 [convert_after_rewriter]: 1.105e-05 [order_py_execute_after_rewriter]: 8.50999e-06 [mutable_eliminate]: 0.00089217 [opt_b]: 0.00027368, [1] [Cycle 1]: 0.00026335, [7] [b_1]: 0.00016719 [b_2]: 8.59998e-06 [updatestate_depend_eliminate]: 6.71999e-06 [updatestate_assign_eliminate]: 2.61999e-06 [updatestate_loads_eliminate]: 2.31998e-06 [renormalize]: 6.00005e-07 [cse]: 1.923e-05 [optimize_parallel_all_gather_comm]: 2.064e-05 [overlap_param_gather]: 5.18002e-06 [cconv]: 3.573e-05 [loop_unroll]: 0.00046977 [opt_after_cconv]: 0.00013088, [1] [Cycle 1]: 0.00012067, [7] [c_1]: 3.198e-05 [parameter_eliminate]: 3.81999e-06 [updatestate_depend_eliminate]: 5.61998e-06 [updatestate_assign_eliminate]: 2.54001e-06 [updatestate_loads_eliminate]: 2.26003e-06 [cse]: 1.781e-05 [renormalize]: 3.70026e-07 [remove_dup_value]: 1.734e-05 [tuple_transform]: 9.64e-05, [1] [Cycle 1]: 8.866e-05, [4] [d_1]: 4.639e-05 [none_parameter_eliminate]: 2.07001e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 7.76001e-06 [partial_unused_args_eliminate]: 4.74e-06 [add_recomputation]: 5.548e-05 [cse_after_recomputation]: 3.096e-05, [1] [Cycle 1]: 2.294e-05, [1] [cse]: 1.29e-05 [environ_conv]: 8.80999e-06 [swap_dp_allreduce_reducescatter]: 8.84e-06 [bias_add_comm_swap]: 6.16e-06 [label_micro_interleaved_index]: 8.13999e-06 [label_fine_grained_interleaved_index]: 5.37999e-06 [merge_cast_opt]: 4.72e-06 [slice_recompute_activation]: 5.07e-06 [micro_interleaved_order_control]: 5.22999e-06 [assign_add_opt]: 3.98001e-06 [ForceFp32Comm]: 3.78999e-06 [remove_cast_before_assign_add]: 3.46001e-06 [full_micro_interleaved_order_control]: 5.00001e-06 [reorder_send_recv_between_fp_bp]: 5.24e-06 [comm_op_add_attrs]: 4.38999e-06 [add_comm_op_reuse_tag]: 3.51001e-06 [interleave_split_concat_branches]: 3.46999e-06 [interleave_parallel_branches]: 3.66999e-06 [overlap_opt_shard_in_pipeline]: 3.85998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.84e-06 [control_data_broadcast_order]: 1.612e-05 [grouped_pairwise_exchange_alltoall]: 4.03001e-06 [offloading_packed_experts]: 6.44001e-06 [overlap_recompute_and_grad_model_parallel]: 7.66999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.63e-06 [overlap_recompute_allgather_and_fa_grad]: 3.85e-06 [overlap_recompute_comm]: 5.09e-06 [overlap_grad_ring_attention]: 7.02002e-06 [overlap_grad_flash_sp]: 2.495e-05 [begin_end_overlap_inline]: 2.91e-06 [split_matmul_comm_elemetwise]: 5.11002e-06 [split_layernorm_comm]: 4.78001e-06 [handle_group_info]: 3.40998e-06 [symbol_engine_optimizer]: 0.00010202, [1] [Cycle 1]: 9.446e-05, [6] [build]: 4.11001e-06 [elim_shapecalc]: 1.063e-05 [elim_not_effective]: 1.338e-05 [opt_reshape]: 7.56001e-06 [fold_const_symbol]: 1.083e-05 [renormalize]: 2.10013e-07 [detach_backward]: 4.03001e-06 [pipeline_parallel_scheduler]: 2.22001e-06 [auto_monad_reorder]: 1.979e-05 [get_jit_bprop_graph]: 1.99e-06 [rewriter_after_jit_bprop_graph]: 5.39998e-06 [opt_after_jit_grad]: 0.00060043 [validate]: 4.311e-05 Sums bootstrap : 0.000438s : 0.45% type_inference : 0.091574s : 93.76% event_method : 0.000023s : 0.02% auto_monad : 0.000068s : 0.07% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000041s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000032s : 0.03% optimize.rewriter_before_opt_a : 0.000098s : 0.10% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000050s : 0.05% optimize.opt_a.loop_unroll : 0.000036s : 0.04% optimize.opt_a.a_1 : 0.000819s : 0.84% optimize.opt_a.with_stream_mark : 0.000033s : 0.03% optimize.opt_a.recompute_prepare : 0.000017s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000222s : 0.23% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.01% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000014s : 0.01% optimize.opt_a.auto_parallel : 0.000014s : 0.01% optimize.opt_a.parallel : 0.000025s : 0.03% optimize.opt_a.flash_sp : 0.000012s : 0.01% optimize.opt_a.merge_comm : 0.000011s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.02% optimize.opt_a.virtual_dataset : 0.000015s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000018s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000026s : 0.03% optimize.opt_a.a_after_grad : 0.000021s : 0.02% optimize.opt_a.renormalize : 0.000760s : 0.78% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.02% optimize.opt_a.cse : 0.000043s : 0.04% optimize.opt_a.a_3 : 0.000113s : 0.12% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000046s : 0.05% optimize.convert_after_rewriter : 0.000011s : 0.01% optimize.order_py_execute_after_rewriter : 0.000009s : 0.01% optimize.mutable_eliminate : 0.000892s : 0.91% optimize.opt_b.b_1 : 0.000167s : 0.17% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000019s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.02% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000036s : 0.04% optimize.loop_unroll : 0.000470s : 0.48% optimize.opt_after_cconv.c_1 : 0.000032s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000018s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.02% optimize.tuple_transform.d_1 : 0.000046s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000055s : 0.06% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000005s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000003s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000003s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000016s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000025s : 0.03% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000005s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000600s : 0.61% validate : 0.000043s : 0.04% Time group info: ------[substitution.] 0.000237 34 14.43% : 0.000034s : 6: substitution.arithmetic_simplify 0.95% : 0.000002s : 2: substitution.elim_not_effective 0.71% : 0.000002s : 2: substitution.fold_const_symbol 2.58% : 0.000006s : 4: substitution.graph_param_transform 68.16% : 0.000161s : 4: substitution.inline 1.88% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.14% : 0.000005s : 4: substitution.remove_not_recompute_node 2.54% : 0.000006s : 4: substitution.replace_old_param 6.60% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.091503 2 98.96% : 0.090556s : 1: type_inference.infer 1.04% : 0.000947s : 1: type_inference.specialize ------[replace.] 0.000064 8 64.56% : 0.000041s : 4: replace.inline 35.44% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000172 8 91.99% : 0.000159s : 4: match.inline 8.01% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000212 1278 0.91% : 0.000002s : 13: predicate.accumulaten_eliminater 0.82% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 0.95% : 0.000002s : 13: predicate.addn_zero_filter 0.85% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.36% : 0.000005s : 21: predicate.arithmetic_simplify 0.96% : 0.000002s : 13: predicate.cast_eliminate 0.69% : 0.000001s : 8: predicate.check_bprop_eliminate 0.57% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 0.92% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.15% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.94% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 4: predicate.elim_not_effective 0.41% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.14% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.13% : 0.000002s : 17: predicate.environ_get_depend_swap 1.64% : 0.000003s : 25: predicate.environ_get_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.39% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.41% : 0.000005s : 21: predicate.float_depend_g_call 0.60% : 0.000001s : 8: predicate.float_environ_get_switch 0.74% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.65% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.60% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 6.42% : 0.000014s : 58: predicate.inline 0.92% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 8: predicate.less_batch_normalization 1.83% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.46% : 0.000005s : 38: predicate.load_eliminater 0.96% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.31% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.48% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 8: predicate.merge_addn 0.61% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 13: predicate.minmaximum_grad 1.29% : 0.000003s : 4: predicate.mutable_eliminate 0.37% : 0.000001s : 4: predicate.opt_reshape 0.33% : 0.000001s : 4: predicate.parallel_virtual_node 1.71% : 0.000004s : 21: predicate.partial_defer_inline 1.58% : 0.000003s : 21: predicate.partial_eliminate 0.88% : 0.000002s : 13: predicate.print_const_string_wrapper 0.64% : 0.000001s : 8: predicate.reduce_all_const_elim 1.11% : 0.000002s : 13: predicate.reduce_eliminate 2.38% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 8: predicate.remove_not_recompute_node 1.40% : 0.000003s : 25: predicate.replace_applicator 0.51% : 0.000001s : 8: predicate.replace_old_param 0.43% : 0.000001s : 4: predicate.reset_defer_inline 0.91% : 0.000002s : 13: predicate.reshape_eliminate 0.67% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.79% : 0.000002s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.82% : 0.000002s : 8: predicate.shard_identity_eliminate 0.78% : 0.000002s : 8: predicate.special_op_eliminate 0.65% : 0.000001s : 8: predicate.specialize_transform 1.11% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.48% : 0.000003s : 21: predicate.switch_defer_inline 1.95% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.11% : 0.000011s : 67: predicate.switch_simplify 0.90% : 0.000002s : 13: predicate.tile_eliminate 0.98% : 0.000002s : 13: predicate.transpose_eliminate 1.73% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.68% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.34% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.69% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.54% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.01% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 4: predicate.value_based_eliminate 0.79% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000655 11 46.06% : 0.000302s : 5: func_graph_cloner_run.FuncGraphClonerGraph 53.94% : 0.000354s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.115048 192 0.01% : 0.000006s : 1: ForceFp32Comm 3.22% : 0.003705s : 1: add_attr 3.20% : 0.003686s : 1: add_attr_with_inline 0.01% : 0.000008s : 1: add_comm_op_reuse_tag 0.05% : 0.000059s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.07% : 0.000078s : 1: auto_monad 0.02% : 0.000028s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.42% : 0.000485s : 1: bootstrap 0.03% : 0.000039s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.02% : 0.000019s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.03% : 0.000034s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000021s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.03% : 0.000034s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000012s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 0.41% : 0.000476s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.78% : 0.000899s : 1: mutable_eliminate 0.01% : 0.000009s : 1: offloading_packed_experts 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000018s : 1: opt.transform.mutable_eliminate 1.10% : 0.001262s : 78: opt.transform.opt_a 0.03% : 0.000031s : 1: opt.transform.opt_after_cconv 0.02% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.09% : 0.000103s : 28: opt.transform.opt_b 0.05% : 0.000052s : 2: opt.transform.opt_trans_graph 0.03% : 0.000039s : 4: opt.transform.symbol_engine_opt 2.84% : 0.003266s : 1: opt_a 0.12% : 0.000134s : 1: opt_after_cconv 0.53% : 0.000611s : 1: opt_after_jit_grad 0.24% : 0.000277s : 1: opt_b 5.53% : 0.006359s : 1: optimize 0.02% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.02% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.04% : 0.000049s : 1: pre_auto_parallel 0.03% : 0.000036s : 1: py_interpret_to_execute 0.02% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.02% : 0.000021s : 1: remove_dup_value 0.35% : 0.000405s : 1: renormalize.infer 0.30% : 0.000348s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000050s : 1: rewriter_after_opt_a 0.09% : 0.000102s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000009s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000105s : 1: symbol_engine_optimizer 0.09% : 0.000100s : 1: tuple_transform 79.64% : 0.091628s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:50.610.327 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0614033, [21] [bootstrap]: 0.00043628 [type_inference]: 0.0506114 [event_method]: 2.093e-05 [auto_monad]: 7.116e-05 [graph_reusing]: 6.24001e-06 [inline]: 3.03e-06 [add_attr]: 0.00352031, [1] [add_attr_with_inline]: 0.00350811, [1] [Cycle 1]: 7.576e-05, [2] [tag_attr]: 2.289e-05 [meta_addattr_fg_expand]: 6.34999e-06 [parallel-infer-symbol]: 3.36999e-06 [pre_auto_parallel]: 4.106e-05 [insert-virtual-dataset]: 2.38002e-06 [parallel-infer-symbol-second]: 7.90023e-07 [dataset_repeat_opt]: 2.39999e-06 [pipeline_split]: 1.59998e-06 [optimize]: 0.00589025, [53] [py_interpret_to_execute]: 3.035e-05 [rewriter_before_opt_a]: 0.00010901 [opt_a]: 0.00333518, [2] [Cycle 1]: 0.00259275, [45] [expand_dump_flag]: 3.55e-06 [switch_simplify]: 0.00022096 [loop_unroll]: 3.739e-05 [a_1]: 0.00069627 [with_stream_mark]: 2.499e-05 [recompute_prepare]: 1.188e-05 [updatestate_depend_eliminate]: 4.57e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 3.06999e-06 [parameter_eliminate]: 2.34999e-06 [a_2]: 9.128e-05 [accelerated_algorithm]: 8.54e-06 [shard]: 2.04e-06 [meta_shard_fg_expand]: 2.39001e-06 [shard_inline]: 6.74999e-06 [merge_send_recv]: 1.075e-05 [auto_parallel]: 9.57999e-06 [parallel]: 2.181e-05 [flash_sp]: 1.057e-05 [merge_comm]: 4.18999e-06 [allreduce_fusion]: 3.36001e-06 [matmul_add_comm_reduction]: 1.081e-05 [allreduce_slice_to_reducescatter]: 8.89995e-07 [virtual_shard_identity]: 8.61002e-06 [virtual_dataset]: 7.35e-06 [get_grad_eliminate_]: 6.39001e-06 [virtual_output]: 6.64001e-06 [merge_forward]: 4.40999e-06 [cell_reuse_recompute_pass]: 1.17999e-06 [offload_activation]: 1.17e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.338e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.173e-05 [set_forward_comm_id_for_comm_node_pass]: 4.07e-06 [meta_fg_expand]: 3.28e-06 [flash_sp_send_recv_attached]: 2.69999e-06 [receive_attached]: 2.07999e-06 [after_resolve]: 1.247e-05 [a_after_grad]: 1.161e-05 [renormalize]: 0.00088862 [add_forward_monad_depend]: 6.89001e-06 [auto_monad_grad]: 3.2e-06 [auto_monad_eliminator]: 1.867e-05 [cse]: 3.246e-05 [a_3]: 5.344e-05 [Cycle 2]: 0.00072915, [45] [expand_dump_flag]: 2.91e-06 [switch_simplify]: 7.95e-06 [loop_unroll]: 6.54001e-06 [a_1]: 0.00015481 [with_stream_mark]: 1.789e-05 [recompute_prepare]: 6.89001e-06 [updatestate_depend_eliminate]: 3.88001e-06 [updatestate_assign_eliminate]: 2.77002e-06 [updatestate_loads_eliminate]: 3.3e-06 [parameter_eliminate]: 1.17e-06 [a_2]: 7.963e-05 [accelerated_algorithm]: 6.76e-06 [shard]: 1.56002e-06 [meta_shard_fg_expand]: 1.69998e-06 [shard_inline]: 6.29001e-06 [merge_send_recv]: 6.96999e-06 [auto_parallel]: 7.7e-06 [parallel]: 6.89999e-06 [flash_sp]: 4.14002e-06 [merge_comm]: 6.12999e-06 [allreduce_fusion]: 3.3e-06 [matmul_add_comm_reduction]: 6.28998e-06 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 7.25e-06 [virtual_dataset]: 6.93998e-06 [get_grad_eliminate_]: 6.17999e-06 [virtual_output]: 5.72001e-06 [merge_forward]: 3.78001e-06 [cell_reuse_recompute_pass]: 2.11003e-06 [offload_activation]: 9.12001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.21e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 9.36e-06 [set_forward_comm_id_for_comm_node_pass]: 5.59e-06 [meta_fg_expand]: 2.66e-06 [flash_sp_send_recv_attached]: 1.76e-06 [receive_attached]: 1.66e-06 [after_resolve]: 1.255e-05 [a_after_grad]: 1.015e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.60001e-06 [auto_monad_grad]: 1.40999e-06 [auto_monad_eliminator]: 8.82e-06 [cse]: 1.616e-05 [a_3]: 3.661e-05 [py_interpret_to_execute_after_opt_a]: 1.501e-05 [slice_cell_reuse_recomputed_activation]: 1.79998e-06 [rewriter_after_opt_a]: 3.949e-05 [convert_after_rewriter]: 7.98999e-06 [order_py_execute_after_rewriter]: 5.69999e-06 [mutable_eliminate]: 0.00081789 [opt_b]: 0.00022702, [1] [Cycle 1]: 0.00021834, [7] [b_1]: 0.00012915 [b_2]: 8.89e-06 [updatestate_depend_eliminate]: 7.85e-06 [updatestate_assign_eliminate]: 3.14001e-06 [updatestate_loads_eliminate]: 3.33e-06 [renormalize]: 1.05001e-06 [cse]: 2.578e-05 [optimize_parallel_all_gather_comm]: 1.997e-05 [overlap_param_gather]: 2.26e-06 [cconv]: 3.83e-05 [loop_unroll]: 0.00052695 [opt_after_cconv]: 0.00010852, [1] [Cycle 1]: 0.00010201, [7] [c_1]: 3.234e-05 [parameter_eliminate]: 4.22998e-06 [updatestate_depend_eliminate]: 6.33e-06 [updatestate_assign_eliminate]: 2.41e-06 [updatestate_loads_eliminate]: 2.51998e-06 [cse]: 1.951e-05 [renormalize]: 2.59985e-07 [remove_dup_value]: 1.514e-05 [tuple_transform]: 8.237e-05, [1] [Cycle 1]: 7.711e-05, [4] [d_1]: 4.873e-05 [none_parameter_eliminate]: 1.99e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 7.3e-06 [partial_unused_args_eliminate]: 2.02001e-06 [add_recomputation]: 5.43e-05 [cse_after_recomputation]: 2.322e-05, [1] [Cycle 1]: 1.847e-05, [1] [cse]: 1.271e-05 [environ_conv]: 5.91e-06 [swap_dp_allreduce_reducescatter]: 5.64e-06 [bias_add_comm_swap]: 3.4e-06 [label_micro_interleaved_index]: 4.53999e-06 [label_fine_grained_interleaved_index]: 2.69999e-06 [merge_cast_opt]: 1.38002e-06 [slice_recompute_activation]: 2.36e-06 [micro_interleaved_order_control]: 2.41998e-06 [assign_add_opt]: 1.26997e-06 [ForceFp32Comm]: 8.30012e-07 [remove_cast_before_assign_add]: 1.10999e-06 [full_micro_interleaved_order_control]: 1.97999e-06 [reorder_send_recv_between_fp_bp]: 2.69999e-06 [comm_op_add_attrs]: 1.17e-06 [add_comm_op_reuse_tag]: 9.79984e-07 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.00999e-06 [overlap_opt_shard_in_pipeline]: 1.39003e-06 [overlap_opt_shard_grad_in_pipeline]: 2.19001e-06 [control_data_broadcast_order]: 1.299e-05 [grouped_pairwise_exchange_alltoall]: 1.66e-06 [offloading_packed_experts]: 4.18001e-06 [overlap_recompute_and_grad_model_parallel]: 4.57e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.42999e-06 [overlap_recompute_comm]: 2.47001e-06 [overlap_grad_ring_attention]: 4.48001e-06 [overlap_grad_flash_sp]: 2.208e-05 [begin_end_overlap_inline]: 5.09986e-07 [split_matmul_comm_elemetwise]: 2.34999e-06 [split_layernorm_comm]: 1.91e-06 [handle_group_info]: 1.00999e-06 [symbol_engine_optimizer]: 8.226e-05, [1] [Cycle 1]: 7.758e-05, [6] [build]: 3.76001e-06 [elim_shapecalc]: 1.056e-05 [elim_not_effective]: 1.345e-05 [opt_reshape]: 8.11002e-06 [fold_const_symbol]: 1.075e-05 [renormalize]: 2.10013e-07 [detach_backward]: 2.64001e-06 [pipeline_parallel_scheduler]: 1.74e-06 [auto_monad_reorder]: 1.905e-05 [get_jit_bprop_graph]: 1.94e-06 [rewriter_after_jit_bprop_graph]: 6.81999e-06 [opt_after_jit_grad]: 0.00054775 [validate]: 4.805e-05 Sums bootstrap : 0.000436s : 0.77% type_inference : 0.050611s : 89.06% event_method : 0.000021s : 0.04% auto_monad : 0.000071s : 0.13% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000041s : 0.07% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000030s : 0.05% optimize.rewriter_before_opt_a : 0.000109s : 0.19% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000229s : 0.40% optimize.opt_a.loop_unroll : 0.000044s : 0.08% optimize.opt_a.a_1 : 0.000851s : 1.50% optimize.opt_a.with_stream_mark : 0.000043s : 0.08% optimize.opt_a.recompute_prepare : 0.000019s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000171s : 0.30% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.03% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.02% optimize.opt_a.merge_send_recv : 0.000018s : 0.03% optimize.opt_a.auto_parallel : 0.000017s : 0.03% optimize.opt_a.parallel : 0.000029s : 0.05% optimize.opt_a.flash_sp : 0.000015s : 0.03% optimize.opt_a.merge_comm : 0.000010s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.03% optimize.opt_a.virtual_dataset : 0.000014s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.02% optimize.opt_a.virtual_output : 0.000012s : 0.02% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.04% optimize.opt_a.a_after_grad : 0.000022s : 0.04% optimize.opt_a.renormalize : 0.000889s : 1.56% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.05% optimize.opt_a.cse : 0.000049s : 0.09% optimize.opt_a.a_3 : 0.000090s : 0.16% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000039s : 0.07% optimize.convert_after_rewriter : 0.000008s : 0.01% optimize.order_py_execute_after_rewriter : 0.000006s : 0.01% optimize.mutable_eliminate : 0.000818s : 1.44% optimize.opt_b.b_1 : 0.000129s : 0.23% optimize.opt_b.b_2 : 0.000009s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000026s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000038s : 0.07% optimize.loop_unroll : 0.000527s : 0.93% optimize.opt_after_cconv.c_1 : 0.000032s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.03% optimize.tuple_transform.d_1 : 0.000049s : 0.09% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000054s : 0.10% optimize.cse_after_recomputation.cse : 0.000013s : 0.02% optimize.environ_conv : 0.000006s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000022s : 0.04% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000019s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000548s : 0.96% validate : 0.000048s : 0.08% Time group info: ------[substitution.] 0.000254 34 15.39% : 0.000039s : 6: substitution.arithmetic_simplify 0.82% : 0.000002s : 2: substitution.elim_not_effective 0.62% : 0.000002s : 2: substitution.fold_const_symbol 2.96% : 0.000008s : 4: substitution.graph_param_transform 67.87% : 0.000172s : 4: substitution.inline 1.68% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.08% : 0.000005s : 4: substitution.remove_not_recompute_node 2.30% : 0.000006s : 4: substitution.replace_old_param 6.28% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.050535 2 98.36% : 0.049705s : 1: type_inference.infer 1.64% : 0.000830s : 1: type_inference.specialize ------[replace.] 0.000068 8 62.91% : 0.000043s : 4: replace.inline 37.09% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000184 8 92.24% : 0.000169s : 4: match.inline 7.76% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000223 1278 0.99% : 0.000002s : 13: predicate.accumulaten_eliminater 0.73% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 8: predicate.addn_check_dump 1.19% : 0.000003s : 13: predicate.addn_zero_filter 0.73% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.39% : 0.000005s : 21: predicate.arithmetic_simplify 1.22% : 0.000003s : 13: predicate.cast_eliminate 0.56% : 0.000001s : 8: predicate.check_bprop_eliminate 0.54% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.60% : 0.000001s : 8: predicate.depend_value_elim 0.84% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.19% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.92% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 4: predicate.elim_not_effective 0.42% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.00% : 0.000002s : 17: predicate.environ_get_depend_swap 1.60% : 0.000004s : 25: predicate.environ_get_eliminate 0.95% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.45% : 0.000005s : 21: predicate.float_depend_g_call 0.51% : 0.000001s : 8: predicate.float_environ_get_switch 0.80% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.61% : 0.000001s : 8: predicate.get_grad_eliminate 0.27% : 0.000001s : 4: predicate.graph_param_transform 0.65% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.10% : 0.000014s : 58: predicate.inline 0.89% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.05% : 0.000002s : 8: predicate.less_batch_normalization 2.05% : 0.000005s : 25: predicate.list_to_tuple_eliminator_ 2.37% : 0.000005s : 38: predicate.load_eliminater 1.25% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.80% : 0.000006s : 34: predicate.loop_unroll_before_grad 1.89% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 8: predicate.merge_addn 0.58% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.67% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.73% : 0.000002s : 13: predicate.minmaximum_grad 1.41% : 0.000003s : 4: predicate.mutable_eliminate 0.34% : 0.000001s : 4: predicate.opt_reshape 0.39% : 0.000001s : 4: predicate.parallel_virtual_node 1.69% : 0.000004s : 21: predicate.partial_defer_inline 1.56% : 0.000003s : 21: predicate.partial_eliminate 0.90% : 0.000002s : 13: predicate.print_const_string_wrapper 0.56% : 0.000001s : 8: predicate.reduce_all_const_elim 1.22% : 0.000003s : 13: predicate.reduce_eliminate 2.54% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.63% : 0.000001s : 8: predicate.remove_not_recompute_node 1.32% : 0.000003s : 25: predicate.replace_applicator 0.48% : 0.000001s : 8: predicate.replace_old_param 0.26% : 0.000001s : 4: predicate.reset_defer_inline 0.95% : 0.000002s : 13: predicate.reshape_eliminate 0.64% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 4: predicate.row_tensor_eliminate 0.74% : 0.000002s : 8: predicate.same_eliminate 0.64% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 8: predicate.shard_identity_eliminate 0.77% : 0.000002s : 8: predicate.special_op_eliminate 0.76% : 0.000002s : 8: predicate.specialize_transform 0.99% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.46% : 0.000003s : 21: predicate.switch_defer_inline 1.99% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.88% : 0.000011s : 67: predicate.switch_simplify 0.90% : 0.000002s : 13: predicate.tile_eliminate 0.84% : 0.000002s : 13: predicate.transpose_eliminate 1.58% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.41% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.59% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.06% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.55% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.07% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.56% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.33% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.76% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 4: predicate.value_based_eliminate 0.79% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.69% : 0.000002s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000623 11 52.40% : 0.000326s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.60% : 0.000296s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.073323 192 0.00% : 0.000004s : 1: ForceFp32Comm 4.81% : 0.003526s : 1: add_attr 4.79% : 0.003513s : 1: add_attr_with_inline 0.01% : 0.000005s : 1: add_comm_op_reuse_tag 0.08% : 0.000059s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.10% : 0.000077s : 1: auto_monad 0.03% : 0.000023s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.63% : 0.000463s : 1: bootstrap 0.06% : 0.000042s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000016s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.04% : 0.000026s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.04% : 0.000028s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.73% : 0.000535s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 1.13% : 0.000830s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000020s : 1: opt.transform.mutable_eliminate 2.02% : 0.001480s : 78: opt.transform.opt_a 0.04% : 0.000031s : 1: opt.transform.opt_after_cconv 0.04% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.14% : 0.000105s : 28: opt.transform.opt_b 0.07% : 0.000054s : 2: opt.transform.opt_trans_graph 0.05% : 0.000039s : 4: opt.transform.symbol_engine_opt 4.55% : 0.003339s : 1: opt_a 0.15% : 0.000112s : 1: opt_after_cconv 0.76% : 0.000560s : 1: opt_after_jit_grad 0.32% : 0.000231s : 1: opt_b 8.04% : 0.005896s : 1: optimize 0.03% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.06% : 0.000046s : 1: pre_auto_parallel 0.05% : 0.000034s : 1: py_interpret_to_execute 0.03% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000019s : 1: remove_dup_value 0.67% : 0.000490s : 1: renormalize.infer 0.53% : 0.000387s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000044s : 1: rewriter_after_opt_a 0.16% : 0.000114s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000006s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.12% : 0.000085s : 1: symbol_engine_optimizer 0.12% : 0.000085s : 1: tuple_transform 69.06% : 0.050634s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:51.459.413 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:51.459.671 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0187146, [21] [bootstrap]: 0.00045046 [type_inference]: 0.00640866 [event_method]: 2.089e-05 [auto_monad]: 6.488e-05 [graph_reusing]: 6.31e-06 [inline]: 2.58e-06 [add_attr]: 0.00364391, [1] [add_attr_with_inline]: 0.00363148, [1] [Cycle 1]: 9.473e-05, [2] [tag_attr]: 2.363e-05 [meta_addattr_fg_expand]: 5.66998e-06 [parallel-infer-symbol]: 3.61999e-06 [pre_auto_parallel]: 4.274e-05 [insert-virtual-dataset]: 2.43002e-06 [parallel-infer-symbol-second]: 8.30012e-07 [dataset_repeat_opt]: 2.07999e-06 [pipeline_split]: 1.86e-06 [optimize]: 0.00623769, [53] [py_interpret_to_execute]: 3.514e-05 [rewriter_before_opt_a]: 0.0001 [opt_a]: 0.00349869, [2] [Cycle 1]: 0.00258636, [45] [expand_dump_flag]: 3.75998e-06 [switch_simplify]: 4.564e-05 [loop_unroll]: 3.335e-05 [a_1]: 0.00069757 [with_stream_mark]: 2.303e-05 [recompute_prepare]: 1.023e-05 [updatestate_depend_eliminate]: 4.80001e-06 [updatestate_assign_eliminate]: 3.56999e-06 [updatestate_loads_eliminate]: 3.32997e-06 [parameter_eliminate]: 2.51e-06 [a_2]: 0.00013718 [accelerated_algorithm]: 9.47001e-06 [shard]: 2.31e-06 [meta_shard_fg_expand]: 2.51e-06 [shard_inline]: 7.11001e-06 [merge_send_recv]: 9.12001e-06 [auto_parallel]: 1.012e-05 [parallel]: 2.176e-05 [flash_sp]: 1.176e-05 [merge_comm]: 4.32e-06 [allreduce_fusion]: 3.65e-06 [matmul_add_comm_reduction]: 1.019e-05 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 1.115e-05 [virtual_dataset]: 7.04001e-06 [get_grad_eliminate_]: 7.25e-06 [virtual_output]: 7.01001e-06 [merge_forward]: 4.43999e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 1.087e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.753e-05 [merge_recompute_call_nodes]: 1.73002e-06 [before_grad]: 1.211e-05 [set_forward_comm_id_for_comm_node_pass]: 4.35e-06 [meta_fg_expand]: 3.31001e-06 [flash_sp_send_recv_attached]: 3.06999e-06 [receive_attached]: 2.67001e-06 [after_resolve]: 1.31e-05 [a_after_grad]: 1.222e-05 [renormalize]: 0.00085655 [add_forward_monad_depend]: 7.9e-06 [auto_monad_grad]: 2.36e-06 [auto_monad_eliminator]: 1.856e-05 [cse]: 3.057e-05 [a_3]: 6.795e-05 [Cycle 2]: 0.00089345, [45] [expand_dump_flag]: 2.58e-06 [switch_simplify]: 9.10999e-06 [loop_unroll]: 6.31998e-06 [a_1]: 0.00014855 [with_stream_mark]: 1.598e-05 [recompute_prepare]: 6.62002e-06 [updatestate_depend_eliminate]: 4.07998e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 2.74999e-06 [parameter_eliminate]: 1.72001e-06 [a_2]: 0.00010762 [accelerated_algorithm]: 6.72002e-06 [shard]: 2.08002e-06 [meta_shard_fg_expand]: 1.91003e-06 [shard_inline]: 6.62002e-06 [merge_send_recv]: 6.41e-06 [auto_parallel]: 7.42998e-06 [parallel]: 7.23e-06 [flash_sp]: 4.27998e-06 [merge_comm]: 3.88999e-06 [allreduce_fusion]: 3.20002e-06 [matmul_add_comm_reduction]: 7.51999e-06 [allreduce_slice_to_reducescatter]: 9.89996e-07 [virtual_shard_identity]: 8.25e-06 [virtual_dataset]: 6.19001e-06 [get_grad_eliminate_]: 5.77999e-06 [virtual_output]: 6.09999e-06 [merge_forward]: 4.13001e-06 [cell_reuse_recompute_pass]: 2.21e-06 [offload_activation]: 7.98999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.524e-05 [merge_recompute_call_nodes]: 1.20999e-06 [before_grad]: 1.136e-05 [set_forward_comm_id_for_comm_node_pass]: 4.00998e-06 [meta_fg_expand]: 2.31e-06 [flash_sp_send_recv_attached]: 1.42e-06 [receive_attached]: 1.61998e-06 [after_resolve]: 1.136e-05 [a_after_grad]: 9.80002e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.12001e-06 [auto_monad_grad]: 1.49e-06 [auto_monad_eliminator]: 8.25999e-06 [cse]: 1.786e-05 [a_3]: 5.09e-05 [py_interpret_to_execute_after_opt_a]: 1.742e-05 [slice_cell_reuse_recomputed_activation]: 5.24e-06 [rewriter_after_opt_a]: 4.408e-05 [convert_after_rewriter]: 1.048e-05 [order_py_execute_after_rewriter]: 8.86997e-06 [mutable_eliminate]: 0.0007399 [opt_b]: 0.00028918, [1] [Cycle 1]: 0.00027784, [7] [b_1]: 0.00017138 [b_2]: 8.59998e-06 [updatestate_depend_eliminate]: 8.99e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 3.21999e-06 [renormalize]: 8.89995e-07 [cse]: 2.328e-05 [optimize_parallel_all_gather_comm]: 2.304e-05 [overlap_param_gather]: 5.18002e-06 [cconv]: 3.701e-05 [loop_unroll]: 0.00054083 [opt_after_cconv]: 0.00013562, [1] [Cycle 1]: 0.00012553, [7] [c_1]: 3.196e-05 [parameter_eliminate]: 4.72998e-06 [updatestate_depend_eliminate]: 7.45003e-06 [updatestate_assign_eliminate]: 2.78e-06 [updatestate_loads_eliminate]: 2.39999e-06 [cse]: 2.067e-05 [renormalize]: 2.60014e-07 [remove_dup_value]: 1.708e-05 [tuple_transform]: 0.00010083, [1] [Cycle 1]: 9.223e-05, [4] [d_1]: 4.974e-05 [none_parameter_eliminate]: 1.96e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 7.66999e-06 [partial_unused_args_eliminate]: 4.99e-06 [add_recomputation]: 5.81e-05 [cse_after_recomputation]: 3.023e-05, [1] [Cycle 1]: 2.294e-05, [1] [cse]: 1.246e-05 [environ_conv]: 9.55001e-06 [swap_dp_allreduce_reducescatter]: 1.013e-05 [bias_add_comm_swap]: 5.66998e-06 [label_micro_interleaved_index]: 8.28001e-06 [label_fine_grained_interleaved_index]: 6.06e-06 [merge_cast_opt]: 3.68999e-06 [slice_recompute_activation]: 4.76002e-06 [micro_interleaved_order_control]: 5.22999e-06 [assign_add_opt]: 4.22e-06 [ForceFp32Comm]: 3.29001e-06 [remove_cast_before_assign_add]: 3.42997e-06 [full_micro_interleaved_order_control]: 5.25001e-06 [reorder_send_recv_between_fp_bp]: 5.27001e-06 [comm_op_add_attrs]: 4.57998e-06 [add_comm_op_reuse_tag]: 3.41999e-06 [interleave_split_concat_branches]: 3.58e-06 [interleave_parallel_branches]: 3.63e-06 [overlap_opt_shard_in_pipeline]: 3.93999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.60001e-06 [control_data_broadcast_order]: 1.694e-05 [grouped_pairwise_exchange_alltoall]: 4.08999e-06 [offloading_packed_experts]: 7.6e-06 [overlap_recompute_and_grad_model_parallel]: 8.05e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.76999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.87998e-06 [overlap_recompute_comm]: 5.10001e-06 [overlap_grad_ring_attention]: 7.03e-06 [overlap_grad_flash_sp]: 2.669e-05 [begin_end_overlap_inline]: 3.04001e-06 [split_matmul_comm_elemetwise]: 4.58999e-06 [split_layernorm_comm]: 4.35e-06 [handle_group_info]: 3.72002e-06 [symbol_engine_optimizer]: 0.00010848, [1] [Cycle 1]: 0.00010021, [6] [build]: 3.66999e-06 [elim_shapecalc]: 1.136e-05 [elim_not_effective]: 1.429e-05 [opt_reshape]: 7.6e-06 [fold_const_symbol]: 1.027e-05 [renormalize]: 1.99972e-07 [detach_backward]: 4.42e-06 [pipeline_parallel_scheduler]: 1.74998e-06 [auto_monad_reorder]: 2.332e-05 [get_jit_bprop_graph]: 2.19001e-06 [rewriter_after_jit_bprop_graph]: 6.36e-06 [opt_after_jit_grad]: 0.00061468 [validate]: 4.636e-05 Sums bootstrap : 0.000450s : 3.55% type_inference : 0.006409s : 50.47% event_method : 0.000021s : 0.16% auto_monad : 0.000065s : 0.51% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000043s : 0.34% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000035s : 0.28% optimize.rewriter_before_opt_a : 0.000100s : 0.79% optimize.opt_a.expand_dump_flag : 0.000006s : 0.05% optimize.opt_a.switch_simplify : 0.000055s : 0.43% optimize.opt_a.loop_unroll : 0.000040s : 0.31% optimize.opt_a.a_1 : 0.000846s : 6.66% optimize.opt_a.with_stream_mark : 0.000039s : 0.31% optimize.opt_a.recompute_prepare : 0.000017s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000245s : 1.93% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.11% optimize.opt_a.merge_send_recv : 0.000016s : 0.12% optimize.opt_a.auto_parallel : 0.000018s : 0.14% optimize.opt_a.parallel : 0.000029s : 0.23% optimize.opt_a.flash_sp : 0.000016s : 0.13% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.15% optimize.opt_a.virtual_dataset : 0.000013s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.10% optimize.opt_a.virtual_output : 0.000013s : 0.10% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000023s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.07% optimize.opt_a.meta_fg_expand : 0.000006s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.19% optimize.opt_a.a_after_grad : 0.000022s : 0.17% optimize.opt_a.renormalize : 0.000857s : 6.75% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.21% optimize.opt_a.cse : 0.000048s : 0.38% optimize.opt_a.a_3 : 0.000119s : 0.94% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000044s : 0.35% optimize.convert_after_rewriter : 0.000010s : 0.08% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000740s : 5.83% optimize.opt_b.b_1 : 0.000171s : 1.35% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000037s : 0.29% optimize.loop_unroll : 0.000541s : 4.26% optimize.opt_after_cconv.c_1 : 0.000032s : 0.25% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000021s : 0.16% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.13% optimize.tuple_transform.d_1 : 0.000050s : 0.39% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000058s : 0.46% optimize.cse_after_recomputation.cse : 0.000012s : 0.10% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.08% optimize.bias_add_comm_swap : 0.000006s : 0.04% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000005s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000027s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000023s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000615s : 4.84% validate : 0.000046s : 0.37% Time group info: ------[substitution.] 0.000250 34 15.40% : 0.000038s : 6: substitution.arithmetic_simplify 0.77% : 0.000002s : 2: substitution.elim_not_effective 0.51% : 0.000001s : 2: substitution.fold_const_symbol 2.83% : 0.000007s : 4: substitution.graph_param_transform 67.11% : 0.000168s : 4: substitution.inline 1.92% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.78% : 0.000007s : 4: substitution.remove_not_recompute_node 2.14% : 0.000005s : 4: substitution.replace_old_param 6.55% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006349 2 88.06% : 0.005591s : 1: type_inference.infer 11.94% : 0.000758s : 1: type_inference.specialize ------[replace.] 0.000070 8 64.56% : 0.000045s : 4: replace.inline 35.44% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000179 8 91.93% : 0.000165s : 4: match.inline 8.07% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000225 1278 0.84% : 0.000002s : 13: predicate.accumulaten_eliminater 0.78% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.61% : 0.000001s : 8: predicate.addn_check_dump 0.95% : 0.000002s : 13: predicate.addn_zero_filter 0.80% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.87% : 0.000006s : 21: predicate.arithmetic_simplify 0.90% : 0.000002s : 13: predicate.cast_eliminate 0.59% : 0.000001s : 8: predicate.check_bprop_eliminate 0.61% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.75% : 0.000002s : 8: predicate.depend_value_elim 0.80% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.91% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.27% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.44% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000002s : 17: predicate.environ_add_const_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_depend_swap 1.78% : 0.000004s : 25: predicate.environ_get_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.38% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.54% : 0.000006s : 21: predicate.float_depend_g_call 0.51% : 0.000001s : 8: predicate.float_environ_get_switch 0.83% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.92% : 0.000002s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.47% : 0.000001s : 8: predicate.incorporate_call_switch 6.41% : 0.000014s : 58: predicate.inline 1.18% : 0.000003s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.83% : 0.000002s : 8: predicate.less_batch_normalization 1.83% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.37% : 0.000005s : 38: predicate.load_eliminater 1.18% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.42% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.57% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 8: predicate.merge_addn 0.52% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 13: predicate.minmaximum_grad 1.54% : 0.000003s : 4: predicate.mutable_eliminate 0.43% : 0.000001s : 4: predicate.opt_reshape 0.57% : 0.000001s : 4: predicate.parallel_virtual_node 1.89% : 0.000004s : 21: predicate.partial_defer_inline 1.50% : 0.000003s : 21: predicate.partial_eliminate 0.81% : 0.000002s : 13: predicate.print_const_string_wrapper 0.55% : 0.000001s : 8: predicate.reduce_all_const_elim 1.13% : 0.000003s : 13: predicate.reduce_eliminate 2.27% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.59% : 0.000001s : 8: predicate.remove_not_recompute_node 1.34% : 0.000003s : 25: predicate.replace_applicator 0.54% : 0.000001s : 8: predicate.replace_old_param 0.25% : 0.000001s : 4: predicate.reset_defer_inline 0.96% : 0.000002s : 13: predicate.reshape_eliminate 0.59% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.52% : 0.000001s : 4: predicate.row_tensor_eliminate 0.98% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.07% : 0.000002s : 8: predicate.shard_identity_eliminate 0.61% : 0.000001s : 8: predicate.special_op_eliminate 0.77% : 0.000002s : 8: predicate.specialize_transform 0.99% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.37% : 0.000003s : 21: predicate.switch_defer_inline 2.05% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.94% : 0.000011s : 67: predicate.switch_simplify 0.83% : 0.000002s : 13: predicate.tile_eliminate 0.81% : 0.000002s : 13: predicate.transpose_eliminate 1.48% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.28% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.91% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.62% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.25% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.54% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.31% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.94% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.51% : 0.000001s : 4: predicate.value_based_eliminate 0.59% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.55% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000618 11 54.33% : 0.000336s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.67% : 0.000282s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030917 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.82% : 0.003656s : 1: add_attr 11.76% : 0.003636s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.20% : 0.000062s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.24% : 0.000075s : 1: auto_monad 0.10% : 0.000032s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 3.27% : 0.001012s : 1: bootstrap 0.13% : 0.000040s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.11% : 0.000034s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000024s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.10% : 0.000032s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000008s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.77% : 0.000548s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.42% : 0.000749s : 1: mutable_eliminate 0.03% : 0.000011s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000020s : 1: opt.transform.mutable_eliminate 4.26% : 0.001318s : 78: opt.transform.opt_a 0.10% : 0.000030s : 1: opt.transform.opt_after_cconv 0.10% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.35% : 0.000107s : 28: opt.transform.opt_b 0.18% : 0.000055s : 2: opt.transform.opt_trans_graph 0.13% : 0.000040s : 4: opt.transform.symbol_engine_opt 11.33% : 0.003502s : 1: opt_a 0.45% : 0.000140s : 1: opt_after_cconv 2.03% : 0.000626s : 1: opt_after_jit_grad 0.95% : 0.000293s : 1: opt_b 21.25% : 0.006570s : 1: optimize 0.09% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000030s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.17% : 0.000051s : 1: pre_auto_parallel 0.13% : 0.000039s : 1: py_interpret_to_execute 0.07% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.51% : 0.000467s : 1: renormalize.infer 1.23% : 0.000379s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000048s : 1: rewriter_after_opt_a 0.34% : 0.000105s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000008s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000111s : 1: symbol_engine_optimizer 0.34% : 0.000104s : 1: tuple_transform 20.90% : 0.006460s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:52.360.343 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0178395, [21] [bootstrap]: 0.00043555 [type_inference]: 0.00572259 [event_method]: 1.914e-05 [auto_monad]: 6.251e-05 [graph_reusing]: 5.61e-06 [inline]: 2.27001e-06 [add_attr]: 0.0056813, [1] [add_attr_with_inline]: 0.00566785, [1] [Cycle 1]: 7.088e-05, [2] [tag_attr]: 2.266e-05 [meta_addattr_fg_expand]: 6.58998e-06 [parallel-infer-symbol]: 3.60998e-06 [pre_auto_parallel]: 4.155e-05 [insert-virtual-dataset]: 2.39001e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 2.06e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.0051319, [53] [py_interpret_to_execute]: 2.928e-05 [rewriter_before_opt_a]: 9.093e-05 [opt_a]: 0.00292939, [2] [Cycle 1]: 0.0022418, [45] [expand_dump_flag]: 2.97002e-06 [switch_simplify]: 4.43e-05 [loop_unroll]: 3.004e-05 [a_1]: 0.00066407 [with_stream_mark]: 1.746e-05 [recompute_prepare]: 9.12999e-06 [updatestate_depend_eliminate]: 4.15e-06 [updatestate_assign_eliminate]: 3.53e-06 [updatestate_loads_eliminate]: 2.86e-06 [parameter_eliminate]: 1.96e-06 [a_2]: 8.85e-05 [accelerated_algorithm]: 7.23e-06 [shard]: 1.59e-06 [meta_shard_fg_expand]: 1.81e-06 [shard_inline]: 6.55002e-06 [merge_send_recv]: 7.85e-06 [auto_parallel]: 6.48998e-06 [parallel]: 1.915e-05 [flash_sp]: 8.84998e-06 [merge_comm]: 3.93999e-06 [allreduce_fusion]: 3.43e-06 [matmul_add_comm_reduction]: 9.49999e-06 [allreduce_slice_to_reducescatter]: 8.00006e-07 [virtual_shard_identity]: 7.92e-06 [virtual_dataset]: 6.82002e-06 [get_grad_eliminate_]: 6.17999e-06 [virtual_output]: 6.48998e-06 [merge_forward]: 3.84002e-06 [cell_reuse_recompute_pass]: 1.36002e-06 [offload_activation]: 1.075e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.242e-05 [merge_recompute_call_nodes]: 1.91998e-06 [before_grad]: 1.07e-05 [set_forward_comm_id_for_comm_node_pass]: 3.81001e-06 [meta_fg_expand]: 2.99999e-06 [flash_sp_send_recv_attached]: 2.68e-06 [receive_attached]: 2.22999e-06 [after_resolve]: 1.216e-05 [a_after_grad]: 1.07e-05 [renormalize]: 0.00083581 [add_forward_monad_depend]: 6.71e-06 [auto_monad_grad]: 2.78003e-06 [auto_monad_eliminator]: 1.694e-05 [cse]: 3.096e-05 [a_3]: 4.856e-05 [Cycle 2]: 0.00067578, [45] [expand_dump_flag]: 1.52001e-06 [switch_simplify]: 7.4e-06 [loop_unroll]: 6.29999e-06 [a_1]: 0.00014264 [with_stream_mark]: 1.354e-05 [recompute_prepare]: 6.60002e-06 [updatestate_depend_eliminate]: 3.31001e-06 [updatestate_assign_eliminate]: 2.43002e-06 [updatestate_loads_eliminate]: 2.46998e-06 [parameter_eliminate]: 1.20999e-06 [a_2]: 7.946e-05 [accelerated_algorithm]: 6.03002e-06 [shard]: 1.62999e-06 [meta_shard_fg_expand]: 1.79998e-06 [shard_inline]: 6.02999e-06 [merge_send_recv]: 5.21998e-06 [auto_parallel]: 5.87999e-06 [parallel]: 5.79999e-06 [flash_sp]: 3.5e-06 [merge_comm]: 3.55998e-06 [allreduce_fusion]: 3.55998e-06 [matmul_add_comm_reduction]: 6.50997e-06 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 9.64e-06 [virtual_dataset]: 5.72001e-06 [get_grad_eliminate_]: 5.67001e-06 [virtual_output]: 5.61003e-06 [merge_forward]: 3.13e-06 [cell_reuse_recompute_pass]: 1.96e-06 [offload_activation]: 6.95998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.141e-05 [merge_recompute_call_nodes]: 8.49977e-07 [before_grad]: 9.15999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.20002e-06 [meta_fg_expand]: 1.97999e-06 [flash_sp_send_recv_attached]: 1.09998e-06 [receive_attached]: 1.37e-06 [after_resolve]: 1.042e-05 [a_after_grad]: 9.15001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.52001e-06 [auto_monad_grad]: 1.27999e-06 [auto_monad_eliminator]: 8.17e-06 [cse]: 1.431e-05 [a_3]: 3.593e-05 [py_interpret_to_execute_after_opt_a]: 1.074e-05 [slice_cell_reuse_recomputed_activation]: 2.31e-06 [rewriter_after_opt_a]: 3.638e-05 [convert_after_rewriter]: 6.98e-06 [order_py_execute_after_rewriter]: 5.24998e-06 [mutable_eliminate]: 0.00064461 [opt_b]: 0.00020443, [1] [Cycle 1]: 0.00019733, [7] [b_1]: 0.00012377 [b_2]: 7.87e-06 [updatestate_depend_eliminate]: 5.90002e-06 [updatestate_assign_eliminate]: 2.69999e-06 [updatestate_loads_eliminate]: 2.35002e-06 [renormalize]: 6.80011e-07 [cse]: 1.871e-05 [optimize_parallel_all_gather_comm]: 1.703e-05 [overlap_param_gather]: 1.95001e-06 [cconv]: 2.782e-05 [loop_unroll]: 0.00044877 [opt_after_cconv]: 0.00010445, [1] [Cycle 1]: 9.87e-05, [7] [c_1]: 3.145e-05 [parameter_eliminate]: 3.79002e-06 [updatestate_depend_eliminate]: 5.57001e-06 [updatestate_assign_eliminate]: 2.81999e-06 [updatestate_loads_eliminate]: 2.31e-06 [cse]: 1.778e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 1.364e-05 [tuple_transform]: 7.739e-05, [1] [Cycle 1]: 7.268e-05, [4] [d_1]: 4.512e-05 [none_parameter_eliminate]: 1.52999e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 6.96999e-06 [partial_unused_args_eliminate]: 1.99e-06 [add_recomputation]: 4.943e-05 [cse_after_recomputation]: 2.296e-05, [1] [Cycle 1]: 1.803e-05, [1] [cse]: 1.168e-05 [environ_conv]: 5.61e-06 [swap_dp_allreduce_reducescatter]: 5.76998e-06 [bias_add_comm_swap]: 2.83998e-06 [label_micro_interleaved_index]: 5.17e-06 [label_fine_grained_interleaved_index]: 2.76e-06 [merge_cast_opt]: 1.52001e-06 [slice_recompute_activation]: 2.01e-06 [micro_interleaved_order_control]: 2.38002e-06 [assign_add_opt]: 1.37e-06 [ForceFp32Comm]: 1.00001e-06 [remove_cast_before_assign_add]: 1.10999e-06 [full_micro_interleaved_order_control]: 1.96e-06 [reorder_send_recv_between_fp_bp]: 2.68e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.35001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.04e-06 [control_data_broadcast_order]: 1.343e-05 [grouped_pairwise_exchange_alltoall]: 1.62999e-06 [offloading_packed_experts]: 4.3e-06 [overlap_recompute_and_grad_model_parallel]: 4.60001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.25001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.31998e-06 [overlap_recompute_comm]: 2.31e-06 [overlap_grad_ring_attention]: 4.51002e-06 [overlap_grad_flash_sp]: 2.31e-05 [begin_end_overlap_inline]: 5.10016e-07 [split_matmul_comm_elemetwise]: 2.06e-06 [split_layernorm_comm]: 1.60001e-06 [handle_group_info]: 9.20001e-07 [symbol_engine_optimizer]: 7.965e-05, [1] [Cycle 1]: 7.52e-05, [6] [build]: 3.36001e-06 [elim_shapecalc]: 1.119e-05 [elim_not_effective]: 1.393e-05 [opt_reshape]: 7.18e-06 [fold_const_symbol]: 1.022e-05 [renormalize]: 2.00002e-07 [detach_backward]: 2.19001e-06 [pipeline_parallel_scheduler]: 1.64e-06 [auto_monad_reorder]: 1.832e-05 [get_jit_bprop_graph]: 1.71002e-06 [rewriter_after_jit_bprop_graph]: 5.14998e-06 [opt_after_jit_grad]: 0.00050189 [validate]: 4.361e-05 Sums bootstrap : 0.000436s : 3.89% type_inference : 0.005723s : 51.14% event_method : 0.000019s : 0.17% auto_monad : 0.000063s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000042s : 0.37% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.26% optimize.rewriter_before_opt_a : 0.000091s : 0.81% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000052s : 0.46% optimize.opt_a.loop_unroll : 0.000036s : 0.32% optimize.opt_a.a_1 : 0.000807s : 7.21% optimize.opt_a.with_stream_mark : 0.000031s : 0.28% optimize.opt_a.recompute_prepare : 0.000016s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000168s : 1.50% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.12% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.11% optimize.opt_a.merge_send_recv : 0.000013s : 0.12% optimize.opt_a.auto_parallel : 0.000012s : 0.11% optimize.opt_a.parallel : 0.000025s : 0.22% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.16% optimize.opt_a.virtual_dataset : 0.000013s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.21% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000023s : 0.20% optimize.opt_a.a_after_grad : 0.000020s : 0.18% optimize.opt_a.renormalize : 0.000836s : 7.47% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.22% optimize.opt_a.cse : 0.000045s : 0.40% optimize.opt_a.a_3 : 0.000084s : 0.76% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000036s : 0.33% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000645s : 5.76% optimize.opt_b.b_1 : 0.000124s : 1.11% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000019s : 0.17% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000028s : 0.25% optimize.loop_unroll : 0.000449s : 4.01% optimize.opt_after_cconv.c_1 : 0.000031s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.16% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.12% optimize.tuple_transform.d_1 : 0.000045s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000049s : 0.44% optimize.cse_after_recomputation.cse : 0.000012s : 0.10% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000023s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.16% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000502s : 4.49% validate : 0.000044s : 0.39% Time group info: ------[substitution.] 0.000227 34 14.47% : 0.000033s : 6: substitution.arithmetic_simplify 0.91% : 0.000002s : 2: substitution.elim_not_effective 0.59% : 0.000001s : 2: substitution.fold_const_symbol 2.83% : 0.000006s : 4: substitution.graph_param_transform 68.80% : 0.000156s : 4: substitution.inline 1.70% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.22% : 0.000005s : 4: substitution.remove_not_recompute_node 1.96% : 0.000004s : 4: substitution.replace_old_param 6.53% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005658 2 87.62% : 0.004957s : 1: type_inference.infer 12.38% : 0.000700s : 1: type_inference.specialize ------[replace.] 0.000063 8 62.80% : 0.000039s : 4: replace.inline 37.20% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000167 8 92.18% : 0.000154s : 4: match.inline 7.82% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000210 1278 0.83% : 0.000002s : 13: predicate.accumulaten_eliminater 0.97% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 8: predicate.addn_check_dump 1.04% : 0.000002s : 13: predicate.addn_zero_filter 0.85% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.46% : 0.000005s : 21: predicate.arithmetic_simplify 0.96% : 0.000002s : 13: predicate.cast_eliminate 0.61% : 0.000001s : 8: predicate.check_bprop_eliminate 0.54% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.61% : 0.000001s : 8: predicate.depend_value_elim 0.91% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.20% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.16% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.56% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_depend_swap 1.72% : 0.000004s : 25: predicate.environ_get_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.43% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.52% : 0.000005s : 21: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.80% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.66% : 0.000001s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.58% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 6.40% : 0.000013s : 58: predicate.inline 0.83% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.81% : 0.000002s : 8: predicate.less_batch_normalization 1.77% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.42% : 0.000005s : 38: predicate.load_eliminater 0.94% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.31% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.60% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.71% : 0.000001s : 8: predicate.merge_addn 0.56% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 13: predicate.minmaximum_grad 1.36% : 0.000003s : 4: predicate.mutable_eliminate 0.47% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 1.69% : 0.000004s : 21: predicate.partial_defer_inline 1.60% : 0.000003s : 21: predicate.partial_eliminate 0.86% : 0.000002s : 13: predicate.print_const_string_wrapper 0.68% : 0.000001s : 8: predicate.reduce_all_const_elim 1.19% : 0.000002s : 13: predicate.reduce_eliminate 2.44% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000001s : 8: predicate.remove_not_recompute_node 1.31% : 0.000003s : 25: predicate.replace_applicator 0.52% : 0.000001s : 8: predicate.replace_old_param 0.27% : 0.000001s : 4: predicate.reset_defer_inline 0.91% : 0.000002s : 13: predicate.reshape_eliminate 0.58% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 4: predicate.row_tensor_eliminate 0.81% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.86% : 0.000002s : 8: predicate.shard_identity_eliminate 0.71% : 0.000001s : 8: predicate.special_op_eliminate 0.65% : 0.000001s : 8: predicate.specialize_transform 1.00% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.44% : 0.000003s : 21: predicate.switch_defer_inline 1.99% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.13% : 0.000011s : 67: predicate.switch_simplify 0.83% : 0.000002s : 13: predicate.tile_eliminate 0.91% : 0.000002s : 13: predicate.transpose_eliminate 1.67% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000006s : 33: predicate.tuple_list_get_item_eliminator 1.53% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.73% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.35% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.15% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.64% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.68% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000556 11 53.75% : 0.000299s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.25% : 0.000257s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030867 192 0.01% : 0.000004s : 1: ForceFp32Comm 18.43% : 0.005688s : 1: add_attr 18.38% : 0.005673s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.17% : 0.000054s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.22% : 0.000068s : 1: auto_monad 0.07% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.50% : 0.000462s : 1: bootstrap 0.10% : 0.000031s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.05% : 0.000017s : 1: control_data_broadcast_order 0.03% : 0.000010s : 1: convert_after_rewriter 0.08% : 0.000026s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.08% : 0.000026s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.48% : 0.000457s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.12% : 0.000654s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 4.02% : 0.001239s : 78: opt.transform.opt_a 0.10% : 0.000030s : 1: opt.transform.opt_after_cconv 0.09% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.33% : 0.000101s : 28: opt.transform.opt_b 0.16% : 0.000050s : 2: opt.transform.opt_trans_graph 0.13% : 0.000039s : 4: opt.transform.symbol_engine_opt 9.50% : 0.002933s : 1: opt_a 0.35% : 0.000108s : 1: opt_after_cconv 1.66% : 0.000511s : 1: opt_after_jit_grad 0.67% : 0.000208s : 1: opt_b 16.64% : 0.005137s : 1: optimize 0.07% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.15% : 0.000046s : 1: pre_auto_parallel 0.11% : 0.000034s : 1: py_interpret_to_execute 0.05% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000017s : 1: remove_dup_value 1.55% : 0.000479s : 1: renormalize.infer 1.13% : 0.000348s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.13% : 0.000040s : 1: rewriter_after_opt_a 0.31% : 0.000095s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.27% : 0.000082s : 1: symbol_engine_optimizer 0.26% : 0.000080s : 1: tuple_transform 18.60% : 0.005741s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:53.447.212 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:53.447.496 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.115496, [21] [bootstrap]: 0.00043927 [type_inference]: 0.00613154 [event_method]: 2.056e-05 [auto_monad]: 6.48e-05 [graph_reusing]: 6.11998e-06 [inline]: 2.31998e-06 [add_attr]: 0.00348938, [1] [add_attr_with_inline]: 0.00347749, [1] [Cycle 1]: 9.173e-05, [2] [tag_attr]: 2.471e-05 [meta_addattr_fg_expand]: 6.13998e-06 [parallel-infer-symbol]: 4.12003e-06 [pre_auto_parallel]: 4.185e-05 [insert-virtual-dataset]: 2.34999e-06 [parallel-infer-symbol-second]: 8.50006e-07 [dataset_repeat_opt]: 2.06e-06 [pipeline_split]: 1.69998e-06 [optimize]: 0.103669, [53] [py_interpret_to_execute]: 3.383e-05 [rewriter_before_opt_a]: 9.611e-05 [opt_a]: 0.100897, [2] [Cycle 1]: 0.0999145, [45] [expand_dump_flag]: 3.65998e-06 [switch_simplify]: 4.367e-05 [loop_unroll]: 3.165e-05 [a_1]: 0.00069008 [with_stream_mark]: 2.142e-05 [recompute_prepare]: 9.80002e-06 [updatestate_depend_eliminate]: 4.53001e-06 [updatestate_assign_eliminate]: 3.98999e-06 [updatestate_loads_eliminate]: 2.83e-06 [parameter_eliminate]: 2.19001e-06 [a_2]: 0.00011824 [accelerated_algorithm]: 7.62002e-06 [shard]: 1.87999e-06 [meta_shard_fg_expand]: 2.04999e-06 [shard_inline]: 6.54001e-06 [merge_send_recv]: 9.05001e-06 [auto_parallel]: 7.03e-06 [parallel]: 1.92e-05 [flash_sp]: 8.70999e-06 [merge_comm]: 3.93999e-06 [allreduce_fusion]: 3.75e-06 [matmul_add_comm_reduction]: 1.055e-05 [allreduce_slice_to_reducescatter]: 8.30012e-07 [virtual_shard_identity]: 8.59e-06 [virtual_dataset]: 7.19001e-06 [get_grad_eliminate_]: 6.64999e-06 [virtual_output]: 6.64999e-06 [merge_forward]: 4.15999e-06 [cell_reuse_recompute_pass]: 2.00002e-06 [offload_activation]: 1.065e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.544e-05 [merge_recompute_call_nodes]: 1.43002e-06 [before_grad]: 1.059e-05 [set_forward_comm_id_for_comm_node_pass]: 3.68e-06 [meta_fg_expand]: 3.03e-06 [flash_sp_send_recv_attached]: 2.32001e-06 [receive_attached]: 2.19999e-06 [after_resolve]: 1.228e-05 [a_after_grad]: 1.067e-05 [renormalize]: 0.0981996 [add_forward_monad_depend]: 1.202e-05 [auto_monad_grad]: 2.79001e-06 [auto_monad_eliminator]: 2.426e-05 [cse]: 3.203e-05 [a_3]: 8.056e-05 [Cycle 2]: 0.00096357, [45] [expand_dump_flag]: 2.44001e-06 [switch_simplify]: 9.66998e-06 [loop_unroll]: 7.16001e-06 [a_1]: 0.0001672 [with_stream_mark]: 1.99e-05 [recompute_prepare]: 7.13e-06 [updatestate_depend_eliminate]: 4.51002e-06 [updatestate_assign_eliminate]: 3.50003e-06 [updatestate_loads_eliminate]: 2.94001e-06 [parameter_eliminate]: 2.09e-06 [a_2]: 0.00011033 [accelerated_algorithm]: 8.15e-06 [shard]: 2.39001e-06 [meta_shard_fg_expand]: 2.62001e-06 [shard_inline]: 6.69001e-06 [merge_send_recv]: 8.32e-06 [auto_parallel]: 1.099e-05 [parallel]: 1.029e-05 [flash_sp]: 4.00998e-06 [merge_comm]: 4.32e-06 [allreduce_fusion]: 6.55002e-06 [matmul_add_comm_reduction]: 9.47999e-06 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 8.63001e-06 [virtual_dataset]: 6.83e-06 [get_grad_eliminate_]: 6.26e-06 [virtual_output]: 6.16e-06 [merge_forward]: 4.2e-06 [cell_reuse_recompute_pass]: 3.80998e-06 [offload_activation]: 1.035e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.591e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.069e-05 [set_forward_comm_id_for_comm_node_pass]: 4.1e-06 [meta_fg_expand]: 3.02002e-06 [flash_sp_send_recv_attached]: 2.34999e-06 [receive_attached]: 2.52001e-06 [after_resolve]: 1.476e-05 [a_after_grad]: 1.149e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.62999e-06 [auto_monad_grad]: 1.30999e-06 [auto_monad_eliminator]: 9.39e-06 [cse]: 1.661e-05 [a_3]: 5.209e-05 [py_interpret_to_execute_after_opt_a]: 2.317e-05 [slice_cell_reuse_recomputed_activation]: 5.49e-06 [rewriter_after_opt_a]: 4.792e-05 [convert_after_rewriter]: 1.026e-05 [order_py_execute_after_rewriter]: 7.98001e-06 [mutable_eliminate]: 0.00075966 [opt_b]: 0.00029471, [1] [Cycle 1]: 0.00028399, [7] [b_1]: 0.00017597 [b_2]: 9.14e-06 [updatestate_depend_eliminate]: 8.95001e-06 [updatestate_assign_eliminate]: 2.71e-06 [updatestate_loads_eliminate]: 2.89999e-06 [renormalize]: 7.7e-07 [cse]: 2.523e-05 [optimize_parallel_all_gather_comm]: 2.287e-05 [overlap_param_gather]: 5.27001e-06 [cconv]: 3.912e-05 [loop_unroll]: 0.0005193 [opt_after_cconv]: 0.00014027, [1] [Cycle 1]: 0.00012977, [7] [c_1]: 3.316e-05 [parameter_eliminate]: 5.57001e-06 [updatestate_depend_eliminate]: 6.89001e-06 [updatestate_assign_eliminate]: 2.59999e-06 [updatestate_loads_eliminate]: 2.99001e-06 [cse]: 1.943e-05 [renormalize]: 4.89992e-07 [remove_dup_value]: 1.829e-05 [tuple_transform]: 9.778e-05, [1] [Cycle 1]: 8.975e-05, [4] [d_1]: 4.818e-05 [none_parameter_eliminate]: 2.07999e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 7.78001e-06 [partial_unused_args_eliminate]: 4.99e-06 [add_recomputation]: 8.81e-05 [cse_after_recomputation]: 3.241e-05, [1] [Cycle 1]: 2.447e-05, [1] [cse]: 1.393e-05 [environ_conv]: 9.37999e-06 [swap_dp_allreduce_reducescatter]: 7.95998e-06 [bias_add_comm_swap]: 5.46998e-06 [label_micro_interleaved_index]: 7.35e-06 [label_fine_grained_interleaved_index]: 5.40999e-06 [merge_cast_opt]: 3.88001e-06 [slice_recompute_activation]: 4.80999e-06 [micro_interleaved_order_control]: 5.15999e-06 [assign_add_opt]: 3.71999e-06 [ForceFp32Comm]: 3.78001e-06 [remove_cast_before_assign_add]: 3.55e-06 [full_micro_interleaved_order_control]: 4.68999e-06 [reorder_send_recv_between_fp_bp]: 5.55001e-06 [comm_op_add_attrs]: 3.97002e-06 [add_comm_op_reuse_tag]: 3.23e-06 [interleave_split_concat_branches]: 3.58e-06 [interleave_parallel_branches]: 3.6e-06 [overlap_opt_shard_in_pipeline]: 3.7e-06 [overlap_opt_shard_grad_in_pipeline]: 4.36002e-06 [control_data_broadcast_order]: 1.78e-05 [grouped_pairwise_exchange_alltoall]: 4.04002e-06 [offloading_packed_experts]: 7.53e-06 [overlap_recompute_and_grad_model_parallel]: 7.63001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.65e-06 [overlap_recompute_allgather_and_fa_grad]: 3.8e-06 [overlap_recompute_comm]: 5.34e-06 [overlap_grad_ring_attention]: 7.26999e-06 [overlap_grad_flash_sp]: 2.517e-05 [begin_end_overlap_inline]: 3.03e-06 [split_matmul_comm_elemetwise]: 4.59998e-06 [split_layernorm_comm]: 3.92002e-06 [handle_group_info]: 3.49001e-06 [symbol_engine_optimizer]: 0.0001081, [1] [Cycle 1]: 0.00010039, [6] [build]: 4.75999e-06 [elim_shapecalc]: 1.247e-05 [elim_not_effective]: 1.459e-05 [opt_reshape]: 7.33e-06 [fold_const_symbol]: 1.097e-05 [renormalize]: 2.19996e-07 [detach_backward]: 5.09e-06 [pipeline_parallel_scheduler]: 2.25002e-06 [auto_monad_reorder]: 2.463e-05 [get_jit_bprop_graph]: 2.39001e-06 [rewriter_after_jit_bprop_graph]: 7.01999e-06 [opt_after_jit_grad]: 0.00061476 [validate]: 4.652e-05 Sums bootstrap : 0.000439s : 0.40% type_inference : 0.006132s : 5.58% event_method : 0.000021s : 0.02% auto_monad : 0.000065s : 0.06% graph_reusing : 0.000006s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000042s : 0.04% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000034s : 0.03% optimize.rewriter_before_opt_a : 0.000096s : 0.09% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000053s : 0.05% optimize.opt_a.loop_unroll : 0.000039s : 0.04% optimize.opt_a.a_1 : 0.000857s : 0.78% optimize.opt_a.with_stream_mark : 0.000041s : 0.04% optimize.opt_a.recompute_prepare : 0.000017s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000229s : 0.21% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000017s : 0.02% optimize.opt_a.auto_parallel : 0.000018s : 0.02% optimize.opt_a.parallel : 0.000029s : 0.03% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000010s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.02% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000027s : 0.02% optimize.opt_a.a_after_grad : 0.000022s : 0.02% optimize.opt_a.renormalize : 0.098200s : 89.42% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.03% optimize.opt_a.cse : 0.000049s : 0.04% optimize.opt_a.a_3 : 0.000133s : 0.12% optimize.py_interpret_to_execute_after_opt_a : 0.000023s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000048s : 0.04% optimize.convert_after_rewriter : 0.000010s : 0.01% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000760s : 0.69% optimize.opt_b.b_1 : 0.000176s : 0.16% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000025s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.02% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000039s : 0.04% optimize.loop_unroll : 0.000519s : 0.47% optimize.opt_after_cconv.c_1 : 0.000033s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.02% optimize.tuple_transform.d_1 : 0.000048s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000088s : 0.08% optimize.cse_after_recomputation.cse : 0.000014s : 0.01% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000005s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000025s : 0.02% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000025s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000615s : 0.56% validate : 0.000047s : 0.04% Time group info: ------[substitution.] 0.000251 34 16.54% : 0.000042s : 6: substitution.arithmetic_simplify 0.89% : 0.000002s : 2: substitution.elim_not_effective 0.58% : 0.000001s : 2: substitution.fold_const_symbol 2.48% : 0.000006s : 4: substitution.graph_param_transform 66.23% : 0.000166s : 4: substitution.inline 1.83% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.15% : 0.000005s : 4: substitution.remove_not_recompute_node 2.75% : 0.000007s : 4: substitution.replace_old_param 6.55% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006069 2 87.80% : 0.005329s : 1: type_inference.infer 12.20% : 0.000741s : 1: type_inference.specialize ------[replace.] 0.000067 8 64.61% : 0.000044s : 4: replace.inline 35.39% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000178 8 91.78% : 0.000164s : 4: match.inline 8.22% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000232 1278 0.99% : 0.000002s : 13: predicate.accumulaten_eliminater 0.99% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 0.86% : 0.000002s : 13: predicate.addn_zero_filter 0.76% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.49% : 0.000006s : 21: predicate.arithmetic_simplify 0.89% : 0.000002s : 13: predicate.cast_eliminate 0.65% : 0.000002s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.74% : 0.000002s : 8: predicate.depend_value_elim 0.84% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.03% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.99% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 4: predicate.elim_not_effective 0.62% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.24% : 0.000003s : 17: predicate.environ_get_depend_swap 1.65% : 0.000004s : 25: predicate.environ_get_eliminate 0.97% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.38% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.39% : 0.000006s : 21: predicate.float_depend_g_call 0.57% : 0.000001s : 8: predicate.float_environ_get_switch 0.87% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.64% : 0.000001s : 8: predicate.get_grad_eliminate 0.28% : 0.000001s : 4: predicate.graph_param_transform 0.52% : 0.000001s : 8: predicate.incorporate_call 0.44% : 0.000001s : 8: predicate.incorporate_call_switch 6.00% : 0.000014s : 58: predicate.inline 0.83% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.97% : 0.000002s : 8: predicate.less_batch_normalization 1.94% : 0.000005s : 25: predicate.list_to_tuple_eliminator_ 2.43% : 0.000006s : 38: predicate.load_eliminater 1.15% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.21% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 8: predicate.merge_addn 0.56% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 13: predicate.minmaximum_grad 1.48% : 0.000003s : 4: predicate.mutable_eliminate 0.40% : 0.000001s : 4: predicate.opt_reshape 0.50% : 0.000001s : 4: predicate.parallel_virtual_node 1.74% : 0.000004s : 21: predicate.partial_defer_inline 1.50% : 0.000003s : 21: predicate.partial_eliminate 0.85% : 0.000002s : 13: predicate.print_const_string_wrapper 0.57% : 0.000001s : 8: predicate.reduce_all_const_elim 1.32% : 0.000003s : 13: predicate.reduce_eliminate 2.62% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 8: predicate.remove_not_recompute_node 1.39% : 0.000003s : 25: predicate.replace_applicator 0.59% : 0.000001s : 8: predicate.replace_old_param 0.32% : 0.000001s : 4: predicate.reset_defer_inline 0.81% : 0.000002s : 13: predicate.reshape_eliminate 0.69% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 4: predicate.row_tensor_eliminate 0.76% : 0.000002s : 8: predicate.same_eliminate 0.48% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.02% : 0.000002s : 8: predicate.shard_identity_eliminate 0.66% : 0.000002s : 8: predicate.special_op_eliminate 0.65% : 0.000002s : 8: predicate.specialize_transform 1.15% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.44% : 0.000003s : 21: predicate.switch_defer_inline 1.86% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.77% : 0.000011s : 67: predicate.switch_simplify 0.81% : 0.000002s : 13: predicate.tile_eliminate 0.96% : 0.000002s : 13: predicate.transpose_eliminate 1.50% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.42% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.65% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.33% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.61% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.33% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.71% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.26% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.93% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 0.75% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.64% : 0.000001s : 8: predicate.virtual_output_eliminate 0.22% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000679 11 47.97% : 0.000326s : 5: func_graph_cloner_run.FuncGraphClonerGraph 52.03% : 0.000354s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.322321 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.09% : 0.003500s : 1: add_attr 1.08% : 0.003482s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.03% : 0.000092s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.02% : 0.000074s : 1: auto_monad 0.01% : 0.000033s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.15% : 0.000484s : 1: bootstrap 0.01% : 0.000043s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000021s : 1: control_data_broadcast_order 0.00% : 0.000014s : 1: convert_after_rewriter 0.01% : 0.000036s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000029s : 1: detach_backward 0.00% : 0.000012s : 1: environ_conv 0.01% : 0.000031s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.00% : 0.000012s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.16% : 0.000526s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.24% : 0.000768s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000020s : 1: opt.transform.mutable_eliminate 0.41% : 0.001325s : 78: opt.transform.opt_a 0.01% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000110s : 28: opt.transform.opt_b 0.02% : 0.000054s : 2: opt.transform.opt_trans_graph 0.01% : 0.000041s : 4: opt.transform.symbol_engine_opt 31.30% : 0.100900s : 1: opt_a 0.04% : 0.000143s : 1: opt_after_cconv 0.19% : 0.000627s : 1: opt_after_jit_grad 0.09% : 0.000298s : 1: opt_b 32.36% : 0.104316s : 1: optimize 0.01% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000011s : 1: order_py_execute_after_rewriter 0.01% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000050s : 1: pre_auto_parallel 0.01% : 0.000038s : 1: py_interpret_to_execute 0.01% : 0.000027s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000022s : 1: remove_dup_value 30.29% : 0.097647s : 1: renormalize.infer 0.17% : 0.000532s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000052s : 1: rewriter_after_opt_a 0.03% : 0.000100s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.00% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000111s : 1: symbol_engine_optimizer 0.03% : 0.000101s : 1: tuple_transform 1.92% : 0.006176s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:54.538.034 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.10084, [21] [bootstrap]: 0.00047532 [type_inference]: 0.0906587 [event_method]: 1.925e-05 [auto_monad]: 6.207e-05 [graph_reusing]: 6.48e-06 [inline]: 2.07999e-06 [add_attr]: 0.0035755, [1] [add_attr_with_inline]: 0.00356346, [1] [Cycle 1]: 6.963e-05, [2] [tag_attr]: 2.258e-05 [meta_addattr_fg_expand]: 5.97001e-06 [parallel-infer-symbol]: 3.33e-06 [pre_auto_parallel]: 3.938e-05 [insert-virtual-dataset]: 2.53998e-06 [parallel-infer-symbol-second]: 9.89996e-07 [dataset_repeat_opt]: 1.77999e-06 [pipeline_split]: 1.76e-06 [optimize]: 0.00524995, [53] [py_interpret_to_execute]: 2.76e-05 [rewriter_before_opt_a]: 8.655e-05 [opt_a]: 0.00306697, [2] [Cycle 1]: 0.0023661, [45] [expand_dump_flag]: 2.93e-06 [switch_simplify]: 4.329e-05 [loop_unroll]: 2.994e-05 [a_1]: 0.00065735 [with_stream_mark]: 1.7e-05 [recompute_prepare]: 9.42999e-06 [updatestate_depend_eliminate]: 4.39002e-06 [updatestate_assign_eliminate]: 3.13998e-06 [updatestate_loads_eliminate]: 2.85002e-06 [parameter_eliminate]: 2.25002e-06 [a_2]: 9.255e-05 [accelerated_algorithm]: 7.44002e-06 [shard]: 2.31e-06 [meta_shard_fg_expand]: 1.91e-06 [shard_inline]: 6.88e-06 [merge_send_recv]: 8.27e-06 [auto_parallel]: 7.75998e-06 [parallel]: 1.842e-05 [flash_sp]: 9.50001e-06 [merge_comm]: 3.81999e-06 [allreduce_fusion]: 3.39001e-06 [matmul_add_comm_reduction]: 1.02e-05 [allreduce_slice_to_reducescatter]: 1.02e-06 [virtual_shard_identity]: 8.50001e-06 [virtual_dataset]: 6.89999e-06 [get_grad_eliminate_]: 6.37001e-06 [virtual_output]: 6.51999e-06 [merge_forward]: 3.88001e-06 [cell_reuse_recompute_pass]: 1.25001e-06 [offload_activation]: 1.047e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.25e-05 [merge_recompute_call_nodes]: 1.40999e-06 [before_grad]: 1.098e-05 [set_forward_comm_id_for_comm_node_pass]: 3.86999e-06 [meta_fg_expand]: 2.93e-06 [flash_sp_send_recv_attached]: 2.56998e-06 [receive_attached]: 2.98e-06 [after_resolve]: 1.227e-05 [a_after_grad]: 9.86e-06 [renormalize]: 0.00096289 [add_forward_monad_depend]: 6.27001e-06 [auto_monad_grad]: 2.83998e-06 [auto_monad_eliminator]: 1.504e-05 [cse]: 2.93e-05 [a_3]: 4.889e-05 [Cycle 2]: 0.00068978, [45] [expand_dump_flag]: 2.21e-06 [switch_simplify]: 7.61999e-06 [loop_unroll]: 6.34001e-06 [a_1]: 0.00014529 [with_stream_mark]: 1.292e-05 [recompute_prepare]: 6.31e-06 [updatestate_depend_eliminate]: 3.13e-06 [updatestate_assign_eliminate]: 2.16e-06 [updatestate_loads_eliminate]: 2.64001e-06 [parameter_eliminate]: 1.22999e-06 [a_2]: 7.847e-05 [accelerated_algorithm]: 7.35e-06 [shard]: 1.72999e-06 [meta_shard_fg_expand]: 1.63002e-06 [shard_inline]: 6.94999e-06 [merge_send_recv]: 5.39e-06 [auto_parallel]: 6.84999e-06 [parallel]: 5.59998e-06 [flash_sp]: 3.58999e-06 [merge_comm]: 7.51001e-06 [allreduce_fusion]: 3.16999e-06 [matmul_add_comm_reduction]: 6.24001e-06 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 8.46002e-06 [virtual_dataset]: 6.86999e-06 [get_grad_eliminate_]: 5.62001e-06 [virtual_output]: 5.74e-06 [merge_forward]: 3.08e-06 [cell_reuse_recompute_pass]: 1.89999e-06 [offload_activation]: 6.55002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.097e-05 [merge_recompute_call_nodes]: 8.99978e-07 [before_grad]: 1.023e-05 [set_forward_comm_id_for_comm_node_pass]: 5.07e-06 [meta_fg_expand]: 2.21e-06 [flash_sp_send_recv_attached]: 1.05001e-06 [receive_attached]: 1.67999e-06 [after_resolve]: 1.172e-05 [a_after_grad]: 9.93002e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.29e-06 [auto_monad_grad]: 1.15999e-06 [auto_monad_eliminator]: 7.14001e-06 [cse]: 1.31e-05 [a_3]: 3.641e-05 [py_interpret_to_execute_after_opt_a]: 1.094e-05 [slice_cell_reuse_recomputed_activation]: 1.86e-06 [rewriter_after_opt_a]: 3.581e-05 [convert_after_rewriter]: 6.31e-06 [order_py_execute_after_rewriter]: 5.04e-06 [mutable_eliminate]: 0.00063021 [opt_b]: 0.00020832, [1] [Cycle 1]: 0.00020119, [7] [b_1]: 0.00012518 [b_2]: 8.28001e-06 [updatestate_depend_eliminate]: 6.17999e-06 [updatestate_assign_eliminate]: 2.58e-06 [updatestate_loads_eliminate]: 2.49001e-06 [renormalize]: 5.79981e-07 [cse]: 1.922e-05 [optimize_parallel_all_gather_comm]: 1.654e-05 [overlap_param_gather]: 1.92001e-06 [cconv]: 2.805e-05 [loop_unroll]: 0.00045584 [opt_after_cconv]: 0.00010202, [1] [Cycle 1]: 9.578e-05, [7] [c_1]: 3.082e-05 [parameter_eliminate]: 3.18e-06 [updatestate_depend_eliminate]: 5.24e-06 [updatestate_assign_eliminate]: 2.68998e-06 [updatestate_loads_eliminate]: 2.27999e-06 [cse]: 1.705e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 1.418e-05 [tuple_transform]: 7.891e-05, [1] [Cycle 1]: 7.401e-05, [4] [d_1]: 4.624e-05 [none_parameter_eliminate]: 1.81003e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 6.79001e-06 [partial_unused_args_eliminate]: 1.74e-06 [add_recomputation]: 4.832e-05 [cse_after_recomputation]: 2.166e-05, [1] [Cycle 1]: 1.695e-05, [1] [cse]: 1.114e-05 [environ_conv]: 5.72001e-06 [swap_dp_allreduce_reducescatter]: 5.19e-06 [bias_add_comm_swap]: 2.67001e-06 [label_micro_interleaved_index]: 4.76002e-06 [label_fine_grained_interleaved_index]: 2.56e-06 [merge_cast_opt]: 1.34e-06 [slice_recompute_activation]: 2.43e-06 [micro_interleaved_order_control]: 2.71e-06 [assign_add_opt]: 1.15999e-06 [ForceFp32Comm]: 7.89994e-07 [remove_cast_before_assign_add]: 1.01997e-06 [full_micro_interleaved_order_control]: 2.02999e-06 [reorder_send_recv_between_fp_bp]: 2.69999e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.24998e-06 [interleave_parallel_branches]: 1.08001e-06 [overlap_opt_shard_in_pipeline]: 1.25001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.69e-06 [control_data_broadcast_order]: 1.287e-05 [grouped_pairwise_exchange_alltoall]: 1.50001e-06 [offloading_packed_experts]: 4.32e-06 [overlap_recompute_and_grad_model_parallel]: 5.09e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19003e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.86e-06 [overlap_grad_ring_attention]: 4.26001e-06 [overlap_grad_flash_sp]: 2.034e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.06e-06 [split_layernorm_comm]: 1.94999e-06 [handle_group_info]: 9.89996e-07 [symbol_engine_optimizer]: 7.977e-05, [1] [Cycle 1]: 7.557e-05, [6] [build]: 3.29001e-06 [elim_shapecalc]: 1.04e-05 [elim_not_effective]: 1.378e-05 [opt_reshape]: 7.65998e-06 [fold_const_symbol]: 1.005e-05 [renormalize]: 2.09984e-07 [detach_backward]: 2.22001e-06 [pipeline_parallel_scheduler]: 1.72001e-06 [auto_monad_reorder]: 1.639e-05 [get_jit_bprop_graph]: 1.81e-06 [rewriter_after_jit_bprop_graph]: 3.72002e-06 [opt_after_jit_grad]: 0.00050846 [validate]: 4.454e-05 Sums bootstrap : 0.000475s : 0.49% type_inference : 0.090659s : 94.16% event_method : 0.000019s : 0.02% auto_monad : 0.000062s : 0.06% graph_reusing : 0.000006s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000039s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000028s : 0.03% optimize.rewriter_before_opt_a : 0.000087s : 0.09% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000051s : 0.05% optimize.opt_a.loop_unroll : 0.000036s : 0.04% optimize.opt_a.a_1 : 0.000803s : 0.83% optimize.opt_a.with_stream_mark : 0.000030s : 0.03% optimize.opt_a.recompute_prepare : 0.000016s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000171s : 0.18% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.01% optimize.opt_a.merge_send_recv : 0.000014s : 0.01% optimize.opt_a.auto_parallel : 0.000015s : 0.02% optimize.opt_a.parallel : 0.000024s : 0.02% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000011s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.02% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000012s : 0.01% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000017s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000024s : 0.02% optimize.opt_a.a_after_grad : 0.000020s : 0.02% optimize.opt_a.renormalize : 0.000963s : 1.00% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.02% optimize.opt_a.cse : 0.000042s : 0.04% optimize.opt_a.a_3 : 0.000085s : 0.09% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000036s : 0.04% optimize.convert_after_rewriter : 0.000006s : 0.01% optimize.order_py_execute_after_rewriter : 0.000005s : 0.01% optimize.mutable_eliminate : 0.000630s : 0.65% optimize.opt_b.b_1 : 0.000125s : 0.13% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000019s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000028s : 0.03% optimize.loop_unroll : 0.000456s : 0.47% optimize.opt_after_cconv.c_1 : 0.000031s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000017s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.01% optimize.tuple_transform.d_1 : 0.000046s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000048s : 0.05% optimize.cse_after_recomputation.cse : 0.000011s : 0.01% optimize.environ_conv : 0.000006s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000020s : 0.02% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000016s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000508s : 0.53% validate : 0.000045s : 0.05% Time group info: ------[substitution.] 0.000228 34 14.64% : 0.000033s : 6: substitution.arithmetic_simplify 0.97% : 0.000002s : 2: substitution.elim_not_effective 0.65% : 0.000001s : 2: substitution.fold_const_symbol 2.84% : 0.000006s : 4: substitution.graph_param_transform 68.01% : 0.000155s : 4: substitution.inline 1.56% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.07% : 0.000005s : 4: substitution.remove_not_recompute_node 2.30% : 0.000005s : 4: substitution.replace_old_param 6.97% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.090592 2 99.20% : 0.089868s : 1: type_inference.infer 0.80% : 0.000725s : 1: type_inference.specialize ------[replace.] 0.000062 8 63.41% : 0.000040s : 4: replace.inline 36.59% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000167 8 91.60% : 0.000153s : 4: match.inline 8.40% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000206 1278 1.02% : 0.000002s : 13: predicate.accumulaten_eliminater 0.78% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 1.36% : 0.000003s : 13: predicate.addn_zero_filter 0.79% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.59% : 0.000005s : 21: predicate.arithmetic_simplify 0.95% : 0.000002s : 13: predicate.cast_eliminate 0.60% : 0.000001s : 8: predicate.check_bprop_eliminate 0.58% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 0.90% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.96% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.06% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 4: predicate.elim_not_effective 0.44% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.13% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_depend_swap 1.64% : 0.000003s : 25: predicate.environ_get_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.47% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.54% : 0.000005s : 21: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.79% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.65% : 0.000001s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.63% : 0.000001s : 8: predicate.incorporate_call 0.49% : 0.000001s : 8: predicate.incorporate_call_switch 6.32% : 0.000013s : 58: predicate.inline 0.76% : 0.000002s : 8: predicate.inline_without_move 0.36% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.83% : 0.000002s : 8: predicate.less_batch_normalization 1.78% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.46% : 0.000005s : 38: predicate.load_eliminater 0.89% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.37% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.61% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 8: predicate.merge_addn 0.60% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 13: predicate.minmaximum_grad 1.01% : 0.000002s : 4: predicate.mutable_eliminate 0.37% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 1.70% : 0.000003s : 21: predicate.partial_defer_inline 1.61% : 0.000003s : 21: predicate.partial_eliminate 0.89% : 0.000002s : 13: predicate.print_const_string_wrapper 0.63% : 0.000001s : 8: predicate.reduce_all_const_elim 1.15% : 0.000002s : 13: predicate.reduce_eliminate 2.46% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 8: predicate.remove_not_recompute_node 1.51% : 0.000003s : 25: predicate.replace_applicator 0.50% : 0.000001s : 8: predicate.replace_old_param 0.27% : 0.000001s : 4: predicate.reset_defer_inline 1.00% : 0.000002s : 13: predicate.reshape_eliminate 0.64% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 4: predicate.row_tensor_eliminate 0.84% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.82% : 0.000002s : 8: predicate.shard_identity_eliminate 0.66% : 0.000001s : 8: predicate.special_op_eliminate 0.74% : 0.000002s : 8: predicate.specialize_transform 1.03% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.55% : 0.000003s : 21: predicate.switch_defer_inline 2.05% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.98% : 0.000010s : 67: predicate.switch_simplify 0.85% : 0.000002s : 13: predicate.tile_eliminate 0.84% : 0.000002s : 13: predicate.transpose_eliminate 1.54% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.28% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.52% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.76% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.45% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.16% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 4: predicate.value_based_eliminate 0.65% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000621 11 57.04% : 0.000354s : 5: func_graph_cloner_run.FuncGraphClonerGraph 42.96% : 0.000267s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.112004 192 0.00% : 0.000004s : 1: ForceFp32Comm 3.20% : 0.003581s : 1: add_attr 3.19% : 0.003568s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.05% : 0.000052s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.06% : 0.000067s : 1: auto_monad 0.02% : 0.000020s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.45% : 0.000509s : 1: bootstrap 0.03% : 0.000031s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000016s : 1: control_data_broadcast_order 0.01% : 0.000009s : 1: convert_after_rewriter 0.02% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.02% : 0.000026s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.41% : 0.000464s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.57% : 0.000639s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000015s : 1: opt.transform.mutable_eliminate 1.11% : 0.001240s : 78: opt.transform.opt_a 0.03% : 0.000029s : 1: opt.transform.opt_after_cconv 0.02% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.09% : 0.000102s : 28: opt.transform.opt_b 0.04% : 0.000050s : 2: opt.transform.opt_trans_graph 0.03% : 0.000038s : 4: opt.transform.symbol_engine_opt 2.74% : 0.003070s : 1: opt_a 0.09% : 0.000106s : 1: opt_after_cconv 0.46% : 0.000518s : 1: opt_after_jit_grad 0.19% : 0.000212s : 1: opt_b 4.69% : 0.005256s : 1: optimize 0.02% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.02% : 0.000024s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.04% : 0.000043s : 1: pre_auto_parallel 0.03% : 0.000032s : 1: py_interpret_to_execute 0.01% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000018s : 1: remove_dup_value 0.37% : 0.000417s : 1: renormalize.infer 0.48% : 0.000537s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000040s : 1: rewriter_after_opt_a 0.08% : 0.000091s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000082s : 1: symbol_engine_optimizer 0.07% : 0.000082s : 1: tuple_transform 80.96% : 0.090680s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:55.519.115 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:55.519.415 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0300088, [21] [bootstrap]: 0.00044492 [type_inference]: 0.00627467 [event_method]: 2.137e-05 [auto_monad]: 6.582e-05 [graph_reusing]: 5.84e-06 [inline]: 2.51e-06 [add_attr]: 0.00358339, [1] [add_attr_with_inline]: 0.00357228, [1] [Cycle 1]: 9.19e-05, [2] [tag_attr]: 2.315e-05 [meta_addattr_fg_expand]: 6.09001e-06 [parallel-infer-symbol]: 3.68e-06 [pre_auto_parallel]: 4.082e-05 [insert-virtual-dataset]: 2.35002e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.11e-06 [pipeline_split]: 1.77001e-06 [optimize]: 0.0174703, [53] [py_interpret_to_execute]: 3.247e-05 [rewriter_before_opt_a]: 9.673e-05 [opt_a]: 0.0145925, [2] [Cycle 1]: 0.0135759, [45] [expand_dump_flag]: 2.96999e-06 [switch_simplify]: 4.283e-05 [loop_unroll]: 3.123e-05 [a_1]: 0.0006918 [with_stream_mark]: 1.991e-05 [recompute_prepare]: 1.032e-05 [updatestate_depend_eliminate]: 4.70001e-06 [updatestate_assign_eliminate]: 3.72002e-06 [updatestate_loads_eliminate]: 2.96001e-06 [parameter_eliminate]: 2.24999e-06 [a_2]: 0.00011922 [accelerated_algorithm]: 8.40999e-06 [shard]: 2.42001e-06 [meta_shard_fg_expand]: 1.88002e-06 [shard_inline]: 6.91001e-06 [merge_send_recv]: 9.84001e-06 [auto_parallel]: 7.53999e-06 [parallel]: 2.485e-05 [flash_sp]: 9.52001e-06 [merge_comm]: 4.19002e-06 [allreduce_fusion]: 3.65e-06 [matmul_add_comm_reduction]: 1.034e-05 [allreduce_slice_to_reducescatter]: 8.79983e-07 [virtual_shard_identity]: 9.62999e-06 [virtual_dataset]: 6.90998e-06 [get_grad_eliminate_]: 6.25002e-06 [virtual_output]: 6.70998e-06 [merge_forward]: 4.02002e-06 [cell_reuse_recompute_pass]: 1.42999e-06 [offload_activation]: 1.062e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.583e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.161e-05 [set_forward_comm_id_for_comm_node_pass]: 3.7e-06 [meta_fg_expand]: 3.13e-06 [flash_sp_send_recv_attached]: 3.09999e-06 [receive_attached]: 2.71999e-06 [after_resolve]: 1.196e-05 [a_after_grad]: 1.213e-05 [renormalize]: 0.0118472 [add_forward_monad_depend]: 1.219e-05 [auto_monad_grad]: 2.86e-06 [auto_monad_eliminator]: 2.628e-05 [cse]: 3.298e-05 [a_3]: 8.106e-05 [Cycle 2]: 0.00099633, [45] [expand_dump_flag]: 2.50002e-06 [switch_simplify]: 9.10001e-06 [loop_unroll]: 7.15e-06 [a_1]: 0.00017614 [with_stream_mark]: 2.053e-05 [recompute_prepare]: 7.65e-06 [updatestate_depend_eliminate]: 3.98999e-06 [updatestate_assign_eliminate]: 3.37002e-06 [updatestate_loads_eliminate]: 3.26999e-06 [parameter_eliminate]: 1.90001e-06 [a_2]: 0.00011579 [accelerated_algorithm]: 7.96001e-06 [shard]: 2.46e-06 [meta_shard_fg_expand]: 2.15002e-06 [shard_inline]: 7.2e-06 [merge_send_recv]: 8.21002e-06 [auto_parallel]: 9.69999e-06 [parallel]: 1.022e-05 [flash_sp]: 4.17003e-06 [merge_comm]: 3.98999e-06 [allreduce_fusion]: 3.81001e-06 [matmul_add_comm_reduction]: 9.57001e-06 [allreduce_slice_to_reducescatter]: 1.09e-06 [virtual_shard_identity]: 1.299e-05 [virtual_dataset]: 8.02003e-06 [get_grad_eliminate_]: 6.83e-06 [virtual_output]: 6.68998e-06 [merge_forward]: 5.04e-06 [cell_reuse_recompute_pass]: 3.55998e-06 [offload_activation]: 1.12e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.554e-05 [merge_recompute_call_nodes]: 1.48002e-06 [before_grad]: 1.082e-05 [set_forward_comm_id_for_comm_node_pass]: 3.78001e-06 [meta_fg_expand]: 3.7e-06 [flash_sp_send_recv_attached]: 1.99e-06 [receive_attached]: 2.48e-06 [after_resolve]: 1.47e-05 [a_after_grad]: 1.192e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.81998e-06 [auto_monad_grad]: 1.40999e-06 [auto_monad_eliminator]: 9.62999e-06 [cse]: 1.675e-05 [a_3]: 5.518e-05 [py_interpret_to_execute_after_opt_a]: 2.369e-05 [slice_cell_reuse_recomputed_activation]: 5.51e-06 [rewriter_after_opt_a]: 4.654e-05 [convert_after_rewriter]: 1.123e-05 [order_py_execute_after_rewriter]: 8.79e-06 [mutable_eliminate]: 0.00078188 [opt_b]: 0.00030507, [1] [Cycle 1]: 0.0002928, [7] [b_1]: 0.00018829 [b_2]: 9.05999e-06 [updatestate_depend_eliminate]: 7.28e-06 [updatestate_assign_eliminate]: 2.88998e-06 [updatestate_loads_eliminate]: 2.75002e-06 [renormalize]: 6.59988e-07 [cse]: 2.053e-05 [optimize_parallel_all_gather_comm]: 2.162e-05 [overlap_param_gather]: 1.349e-05 [cconv]: 3.707e-05 [loop_unroll]: 0.00052496 [opt_after_cconv]: 0.00014194, [1] [Cycle 1]: 0.0001312, [7] [c_1]: 3.516e-05 [parameter_eliminate]: 4.33001e-06 [updatestate_depend_eliminate]: 6.07999e-06 [updatestate_assign_eliminate]: 2.81e-06 [updatestate_loads_eliminate]: 2.81999e-06 [cse]: 1.971e-05 [renormalize]: 3.30008e-07 [remove_dup_value]: 7.313e-05 [tuple_transform]: 0.00010443, [1] [Cycle 1]: 9.571e-05, [4] [d_1]: 5.155e-05 [none_parameter_eliminate]: 2.44999e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.45999e-06 [partial_unused_args_eliminate]: 4.55001e-06 [add_recomputation]: 6.13e-05 [cse_after_recomputation]: 3.064e-05, [1] [Cycle 1]: 2.338e-05, [1] [cse]: 1.357e-05 [environ_conv]: 9.02e-06 [swap_dp_allreduce_reducescatter]: 8.72e-06 [bias_add_comm_swap]: 6.53e-06 [label_micro_interleaved_index]: 8.38999e-06 [label_fine_grained_interleaved_index]: 5.59e-06 [merge_cast_opt]: 3.92998e-06 [slice_recompute_activation]: 5.23002e-06 [micro_interleaved_order_control]: 5.51002e-06 [assign_add_opt]: 4.13001e-06 [ForceFp32Comm]: 3.6e-06 [remove_cast_before_assign_add]: 3.73001e-06 [full_micro_interleaved_order_control]: 5.00001e-06 [reorder_send_recv_between_fp_bp]: 6.14999e-06 [comm_op_add_attrs]: 4.1e-06 [add_comm_op_reuse_tag]: 3.36999e-06 [interleave_split_concat_branches]: 3.9e-06 [interleave_parallel_branches]: 4.02e-06 [overlap_opt_shard_in_pipeline]: 8.65001e-06 [overlap_opt_shard_grad_in_pipeline]: 5.10001e-06 [control_data_broadcast_order]: 1.813e-05 [grouped_pairwise_exchange_alltoall]: 4.37e-06 [offloading_packed_experts]: 7.05998e-06 [overlap_recompute_and_grad_model_parallel]: 7.85998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.81999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.8e-06 [overlap_recompute_comm]: 4.92e-06 [overlap_grad_ring_attention]: 6.49999e-06 [overlap_grad_flash_sp]: 2.76e-05 [begin_end_overlap_inline]: 3.18e-06 [split_matmul_comm_elemetwise]: 4.99e-06 [split_layernorm_comm]: 4.48001e-06 [handle_group_info]: 3.5e-06 [symbol_engine_optimizer]: 0.00010891, [1] [Cycle 1]: 0.00010068, [6] [build]: 3.80998e-06 [elim_shapecalc]: 1.134e-05 [elim_not_effective]: 1.435e-05 [opt_reshape]: 8.67e-06 [fold_const_symbol]: 1.18e-05 [renormalize]: 1.80007e-07 [detach_backward]: 6.12999e-06 [pipeline_parallel_scheduler]: 2.62001e-06 [auto_monad_reorder]: 2.753e-05 [get_jit_bprop_graph]: 2.02999e-06 [rewriter_after_jit_bprop_graph]: 7.48e-06 [opt_after_jit_grad]: 0.00072255 [validate]: 4.882e-05 Sums bootstrap : 0.000445s : 1.87% type_inference : 0.006275s : 26.30% event_method : 0.000021s : 0.09% auto_monad : 0.000066s : 0.28% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.10% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000041s : 0.17% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000032s : 0.14% optimize.rewriter_before_opt_a : 0.000097s : 0.41% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000052s : 0.22% optimize.opt_a.loop_unroll : 0.000038s : 0.16% optimize.opt_a.a_1 : 0.000868s : 3.64% optimize.opt_a.with_stream_mark : 0.000040s : 0.17% optimize.opt_a.recompute_prepare : 0.000018s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000235s : 0.99% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.07% optimize.opt_a.shard : 0.000005s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000014s : 0.06% optimize.opt_a.merge_send_recv : 0.000018s : 0.08% optimize.opt_a.auto_parallel : 0.000017s : 0.07% optimize.opt_a.parallel : 0.000035s : 0.15% optimize.opt_a.flash_sp : 0.000014s : 0.06% optimize.opt_a.merge_comm : 0.000008s : 0.03% optimize.opt_a.allreduce_fusion : 0.000007s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.09% optimize.opt_a.virtual_dataset : 0.000015s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.05% optimize.opt_a.virtual_output : 0.000013s : 0.06% optimize.opt_a.merge_forward : 0.000009s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000022s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.13% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000022s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000027s : 0.11% optimize.opt_a.a_after_grad : 0.000024s : 0.10% optimize.opt_a.renormalize : 0.011847s : 49.67% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000036s : 0.15% optimize.opt_a.cse : 0.000050s : 0.21% optimize.opt_a.a_3 : 0.000136s : 0.57% optimize.py_interpret_to_execute_after_opt_a : 0.000024s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.02% optimize.rewriter_after_opt_a : 0.000047s : 0.20% optimize.convert_after_rewriter : 0.000011s : 0.05% optimize.order_py_execute_after_rewriter : 0.000009s : 0.04% optimize.mutable_eliminate : 0.000782s : 3.28% optimize.opt_b.b_1 : 0.000188s : 0.79% optimize.opt_b.b_2 : 0.000009s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.09% optimize.overlap_param_gather : 0.000013s : 0.06% optimize.cconv : 0.000037s : 0.16% optimize.loop_unroll : 0.000525s : 2.20% optimize.opt_after_cconv.c_1 : 0.000035s : 0.15% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000020s : 0.08% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000073s : 0.31% optimize.tuple_transform.d_1 : 0.000052s : 0.22% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.04% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000061s : 0.26% optimize.cse_after_recomputation.cse : 0.000014s : 0.06% optimize.environ_conv : 0.000009s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.04% optimize.bias_add_comm_swap : 0.000007s : 0.03% optimize.label_micro_interleaved_index : 0.000008s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000006s : 0.02% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000004s : 0.02% optimize.remove_cast_before_assign_add : 0.000004s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.03% optimize.comm_op_add_attrs : 0.000004s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.02% optimize.interleave_parallel_branches : 0.000004s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000009s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.02% optimize.control_data_broadcast_order : 0.000018s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000007s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000006s : 0.03% optimize.overlap_grad_flash_sp : 0.000028s : 0.12% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.03% pipeline_parallel_scheduler : 0.000003s : 0.01% auto_monad_reorder : 0.000028s : 0.12% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.03% opt_after_jit_grad : 0.000723s : 3.03% validate : 0.000049s : 0.20% Time group info: ------[substitution.] 0.000264 34 16.01% : 0.000042s : 6: substitution.arithmetic_simplify 0.73% : 0.000002s : 2: substitution.elim_not_effective 0.68% : 0.000002s : 2: substitution.fold_const_symbol 2.85% : 0.000008s : 4: substitution.graph_param_transform 60.34% : 0.000160s : 4: substitution.inline 1.64% : 0.000004s : 4: substitution.j_node_and_user_rematch 1.93% : 0.000005s : 4: substitution.remove_not_recompute_node 2.41% : 0.000006s : 4: substitution.replace_old_param 13.41% : 0.000035s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006214 2 87.68% : 0.005448s : 1: type_inference.infer 12.32% : 0.000766s : 1: type_inference.specialize ------[replace.] 0.000065 8 63.14% : 0.000041s : 4: replace.inline 36.86% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000190 8 82.34% : 0.000157s : 4: match.inline 17.66% : 0.000034s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000236 1278 0.97% : 0.000002s : 13: predicate.accumulaten_eliminater 0.95% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.60% : 0.000001s : 8: predicate.addn_check_dump 0.86% : 0.000002s : 13: predicate.addn_zero_filter 0.74% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.67% : 0.000006s : 21: predicate.arithmetic_simplify 0.91% : 0.000002s : 13: predicate.cast_eliminate 0.55% : 0.000001s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.70% : 0.000002s : 8: predicate.depend_value_elim 0.85% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.07% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.81% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.56% : 0.000004s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 4: predicate.elim_not_effective 0.44% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.21% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 17: predicate.environ_get_depend_swap 1.63% : 0.000004s : 25: predicate.environ_get_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.32% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.34% : 0.000006s : 21: predicate.float_depend_g_call 0.62% : 0.000001s : 8: predicate.float_environ_get_switch 0.89% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.72% : 0.000002s : 8: predicate.get_grad_eliminate 0.25% : 0.000001s : 4: predicate.graph_param_transform 0.53% : 0.000001s : 8: predicate.incorporate_call 0.44% : 0.000001s : 8: predicate.incorporate_call_switch 5.99% : 0.000014s : 58: predicate.inline 0.88% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.04% : 0.000002s : 8: predicate.less_batch_normalization 1.90% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.42% : 0.000006s : 38: predicate.load_eliminater 0.78% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.19% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.60% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 8: predicate.merge_addn 0.74% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.75% : 0.000002s : 13: predicate.minmaximum_grad 1.20% : 0.000003s : 4: predicate.mutable_eliminate 0.40% : 0.000001s : 4: predicate.opt_reshape 0.52% : 0.000001s : 4: predicate.parallel_virtual_node 2.01% : 0.000005s : 21: predicate.partial_defer_inline 1.50% : 0.000004s : 21: predicate.partial_eliminate 0.88% : 0.000002s : 13: predicate.print_const_string_wrapper 0.56% : 0.000001s : 8: predicate.reduce_all_const_elim 1.15% : 0.000003s : 13: predicate.reduce_eliminate 2.36% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 8: predicate.remove_not_recompute_node 1.42% : 0.000003s : 25: predicate.replace_applicator 0.47% : 0.000001s : 8: predicate.replace_old_param 0.37% : 0.000001s : 4: predicate.reset_defer_inline 0.89% : 0.000002s : 13: predicate.reshape_eliminate 0.54% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.50% : 0.000001s : 4: predicate.row_tensor_eliminate 0.94% : 0.000002s : 8: predicate.same_eliminate 0.40% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.95% : 0.000002s : 8: predicate.shard_identity_eliminate 0.83% : 0.000002s : 8: predicate.special_op_eliminate 0.65% : 0.000002s : 8: predicate.specialize_transform 1.05% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 1.11% : 0.000003s : 8: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.41% : 0.000003s : 21: predicate.switch_defer_inline 1.90% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.86% : 0.000011s : 67: predicate.switch_simplify 0.92% : 0.000002s : 13: predicate.tile_eliminate 0.78% : 0.000002s : 13: predicate.transpose_eliminate 1.49% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.41% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.38% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.60% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.71% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.12% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.85% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.49% : 0.000001s : 4: predicate.value_based_eliminate 0.85% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 8: predicate.virtual_output_eliminate 0.22% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.63% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000651 11 47.92% : 0.000312s : 5: func_graph_cloner_run.FuncGraphClonerGraph 52.08% : 0.000339s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.064406 192 0.01% : 0.000006s : 1: ForceFp32Comm 5.58% : 0.003594s : 1: add_attr 5.55% : 0.003576s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.10% : 0.000065s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.12% : 0.000075s : 1: auto_monad 0.06% : 0.000036s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000010s : 1: bias_add_comm_swap 0.77% : 0.000496s : 1: bootstrap 0.06% : 0.000040s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.03% : 0.000021s : 1: control_data_broadcast_order 0.02% : 0.000015s : 1: convert_after_rewriter 0.05% : 0.000034s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.05% : 0.000029s : 1: detach_backward 0.02% : 0.000012s : 1: environ_conv 0.05% : 0.000032s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000012s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000007s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.02% : 0.000011s : 1: label_micro_interleaved_index 0.83% : 0.000532s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.23% : 0.000790s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000018s : 1: opt.transform.mutable_eliminate 2.10% : 0.001351s : 78: opt.transform.opt_a 0.05% : 0.000034s : 1: opt.transform.opt_after_cconv 0.05% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.18% : 0.000116s : 28: opt.transform.opt_b 0.09% : 0.000057s : 2: opt.transform.opt_trans_graph 0.06% : 0.000042s : 4: opt.transform.symbol_engine_opt 22.66% : 0.014597s : 1: opt_a 0.23% : 0.000146s : 1: opt_after_cconv 1.14% : 0.000735s : 1: opt_after_jit_grad 0.48% : 0.000309s : 1: opt_b 28.66% : 0.018457s : 1: optimize 0.04% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.05% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000012s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000017s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000012s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.08% : 0.000049s : 1: pre_auto_parallel 0.06% : 0.000036s : 1: py_interpret_to_execute 0.04% : 0.000027s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.12% : 0.000077s : 1: remove_dup_value 17.57% : 0.011315s : 1: renormalize.infer 0.80% : 0.000514s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000015s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000051s : 1: rewriter_after_opt_a 0.16% : 0.000101s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000010s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.17% : 0.000112s : 1: symbol_engine_optimizer 0.17% : 0.000107s : 1: tuple_transform 9.81% : 0.006321s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:56.885.216 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0692082, [21] [bootstrap]: 0.00042599 [type_inference]: 0.045284 [event_method]: 2.232e-05 [auto_monad]: 7.013e-05 [graph_reusing]: 6.34001e-06 [inline]: 3.25e-06 [add_attr]: 0.00379864, [1] [add_attr_with_inline]: 0.00378519, [1] [Cycle 1]: 7.436e-05, [2] [tag_attr]: 2.34e-05 [meta_addattr_fg_expand]: 6.21e-06 [parallel-infer-symbol]: 3.32002e-06 [pre_auto_parallel]: 4.257e-05 [insert-virtual-dataset]: 3.25e-06 [parallel-infer-symbol-second]: 9.09989e-07 [dataset_repeat_opt]: 2.44001e-06 [pipeline_split]: 2.11998e-06 [optimize]: 0.018779, [53] [py_interpret_to_execute]: 3.018e-05 [rewriter_before_opt_a]: 9.182e-05 [opt_a]: 0.0164059, [2] [Cycle 1]: 0.0156131, [45] [expand_dump_flag]: 3.08998e-06 [switch_simplify]: 4.52e-05 [loop_unroll]: 3.121e-05 [a_1]: 0.00070675 [with_stream_mark]: 2.157e-05 [recompute_prepare]: 1.093e-05 [updatestate_depend_eliminate]: 4.54002e-06 [updatestate_assign_eliminate]: 3.10998e-06 [updatestate_loads_eliminate]: 2.93e-06 [parameter_eliminate]: 1.87999e-06 [a_2]: 9.395e-05 [accelerated_algorithm]: 7.33999e-06 [shard]: 2.02999e-06 [meta_shard_fg_expand]: 1.91e-06 [shard_inline]: 6.71999e-06 [merge_send_recv]: 9.48002e-06 [auto_parallel]: 8.13001e-06 [parallel]: 1.881e-05 [flash_sp]: 9.17001e-06 [merge_comm]: 3.98001e-06 [allreduce_fusion]: 3.42997e-06 [matmul_add_comm_reduction]: 9.93002e-06 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 9.04e-06 [virtual_dataset]: 6.71e-06 [get_grad_eliminate_]: 6.29999e-06 [virtual_output]: 7.08998e-06 [merge_forward]: 4.20999e-06 [cell_reuse_recompute_pass]: 1.67999e-06 [offload_activation]: 1.205e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.237e-05 [merge_recompute_call_nodes]: 1.86e-06 [before_grad]: 1.119e-05 [set_forward_comm_id_for_comm_node_pass]: 4.12e-06 [meta_fg_expand]: 2.96001e-06 [flash_sp_send_recv_attached]: 3.06001e-06 [receive_attached]: 2.40002e-06 [after_resolve]: 1.313e-05 [a_after_grad]: 1.059e-05 [renormalize]: 0.0140745 [add_forward_monad_depend]: 1.293e-05 [auto_monad_grad]: 2.63e-06 [auto_monad_eliminator]: 2.635e-05 [cse]: 3.293e-05 [a_3]: 6.34e-05 [Cycle 2]: 0.00077833, [45] [expand_dump_flag]: 2.39999e-06 [switch_simplify]: 9.22001e-06 [loop_unroll]: 7.35003e-06 [a_1]: 0.00017152 [with_stream_mark]: 2.068e-05 [recompute_prepare]: 6.68003e-06 [updatestate_depend_eliminate]: 3.83001e-06 [updatestate_assign_eliminate]: 3.01999e-06 [updatestate_loads_eliminate]: 3.34001e-06 [parameter_eliminate]: 2.50002e-06 [a_2]: 8.37e-05 [accelerated_algorithm]: 6.32001e-06 [shard]: 2.84001e-06 [meta_shard_fg_expand]: 2.15002e-06 [shard_inline]: 6.20002e-06 [merge_send_recv]: 8.23999e-06 [auto_parallel]: 1.058e-05 [parallel]: 9.94001e-06 [flash_sp]: 3.88001e-06 [merge_comm]: 3.61999e-06 [allreduce_fusion]: 3.7e-06 [matmul_add_comm_reduction]: 9.15001e-06 [allreduce_slice_to_reducescatter]: 8.79983e-07 [virtual_shard_identity]: 8.94998e-06 [virtual_dataset]: 5.95002e-06 [get_grad_eliminate_]: 6.02999e-06 [virtual_output]: 6.12999e-06 [merge_forward]: 4.50001e-06 [cell_reuse_recompute_pass]: 3.33e-06 [offload_activation]: 1.14e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.259e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.095e-05 [set_forward_comm_id_for_comm_node_pass]: 4.16001e-06 [meta_fg_expand]: 2.61e-06 [flash_sp_send_recv_attached]: 2.39999e-06 [receive_attached]: 2.42001e-06 [after_resolve]: 1.42e-05 [a_after_grad]: 9.97999e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.55001e-06 [auto_monad_grad]: 1.89e-06 [auto_monad_eliminator]: 8.94e-06 [cse]: 1.61e-05 [a_3]: 3.747e-05 [py_interpret_to_execute_after_opt_a]: 1.744e-05 [slice_cell_reuse_recomputed_activation]: 2.05002e-06 [rewriter_after_opt_a]: 4.09e-05 [convert_after_rewriter]: 7.31999e-06 [order_py_execute_after_rewriter]: 4.90999e-06 [mutable_eliminate]: 0.00074289 [opt_b]: 0.00022334, [1] [Cycle 1]: 0.00021504, [7] [b_1]: 0.00013154 [b_2]: 9.42001e-06 [updatestate_depend_eliminate]: 8.24002e-06 [updatestate_assign_eliminate]: 2.66e-06 [updatestate_loads_eliminate]: 2.44001e-06 [renormalize]: 8.39995e-07 [cse]: 2.284e-05 [optimize_parallel_all_gather_comm]: 1.854e-05 [overlap_param_gather]: 2.26e-06 [cconv]: 3.027e-05 [loop_unroll]: 0.00046967 [opt_after_cconv]: 0.00010749, [1] [Cycle 1]: 0.00010131, [7] [c_1]: 3.267e-05 [parameter_eliminate]: 3.69002e-06 [updatestate_depend_eliminate]: 5.67999e-06 [updatestate_assign_eliminate]: 2.57001e-06 [updatestate_loads_eliminate]: 2.24999e-06 [cse]: 1.903e-05 [renormalize]: 3.60014e-07 [remove_dup_value]: 1.505e-05 [tuple_transform]: 8.051e-05, [1] [Cycle 1]: 7.553e-05, [4] [d_1]: 4.754e-05 [none_parameter_eliminate]: 1.66998e-06 [renormalize]: 2.29978e-07 [switch_simplify]: 6.89999e-06 [partial_unused_args_eliminate]: 1.71e-06 [add_recomputation]: 5.4e-05 [cse_after_recomputation]: 2.252e-05, [1] [Cycle 1]: 1.73e-05, [1] [cse]: 1.184e-05 [environ_conv]: 5.96998e-06 [swap_dp_allreduce_reducescatter]: 4.73001e-06 [bias_add_comm_swap]: 2.74999e-06 [label_micro_interleaved_index]: 4.41002e-06 [label_fine_grained_interleaved_index]: 2.61e-06 [merge_cast_opt]: 1.34998e-06 [slice_recompute_activation]: 2.11e-06 [micro_interleaved_order_control]: 2.44001e-06 [assign_add_opt]: 1.49e-06 [ForceFp32Comm]: 8.89995e-07 [remove_cast_before_assign_add]: 1.05001e-06 [full_micro_interleaved_order_control]: 2.24001e-06 [reorder_send_recv_between_fp_bp]: 2.89001e-06 [comm_op_add_attrs]: 1.15001e-06 [add_comm_op_reuse_tag]: 1.18001e-06 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.05999e-06 [overlap_opt_shard_in_pipeline]: 1.39e-06 [overlap_opt_shard_grad_in_pipeline]: 1.72999e-06 [control_data_broadcast_order]: 1.339e-05 [grouped_pairwise_exchange_alltoall]: 2.16e-06 [offloading_packed_experts]: 4.32e-06 [overlap_recompute_and_grad_model_parallel]: 4.98001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.32e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32999e-06 [overlap_recompute_comm]: 2.36998e-06 [overlap_grad_ring_attention]: 4.25e-06 [overlap_grad_flash_sp]: 2.126e-05 [begin_end_overlap_inline]: 4.7998e-07 [split_matmul_comm_elemetwise]: 2.14999e-06 [split_layernorm_comm]: 1.79e-06 [handle_group_info]: 9.39996e-07 [symbol_engine_optimizer]: 7.881e-05, [1] [Cycle 1]: 7.423e-05, [6] [build]: 3.36999e-06 [elim_shapecalc]: 1.046e-05 [elim_not_effective]: 1.268e-05 [opt_reshape]: 7.2e-06 [fold_const_symbol]: 1.054e-05 [renormalize]: 1.8999e-07 [detach_backward]: 2.44001e-06 [pipeline_parallel_scheduler]: 1.49e-06 [auto_monad_reorder]: 1.647e-05 [get_jit_bprop_graph]: 2.16e-06 [rewriter_after_jit_bprop_graph]: 5.47999e-06 [opt_after_jit_grad]: 0.00049768 [validate]: 4.648e-05 Sums bootstrap : 0.000426s : 0.66% type_inference : 0.045284s : 70.38% event_method : 0.000022s : 0.03% auto_monad : 0.000070s : 0.11% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000043s : 0.07% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000030s : 0.05% optimize.rewriter_before_opt_a : 0.000092s : 0.14% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000054s : 0.08% optimize.opt_a.loop_unroll : 0.000039s : 0.06% optimize.opt_a.a_1 : 0.000878s : 1.37% optimize.opt_a.with_stream_mark : 0.000042s : 0.07% optimize.opt_a.recompute_prepare : 0.000018s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000178s : 0.28% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.02% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.02% optimize.opt_a.merge_send_recv : 0.000018s : 0.03% optimize.opt_a.auto_parallel : 0.000019s : 0.03% optimize.opt_a.parallel : 0.000029s : 0.04% optimize.opt_a.flash_sp : 0.000013s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.03% optimize.opt_a.virtual_dataset : 0.000013s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.02% optimize.opt_a.virtual_output : 0.000013s : 0.02% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000022s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000027s : 0.04% optimize.opt_a.a_after_grad : 0.000021s : 0.03% optimize.opt_a.renormalize : 0.014075s : 21.88% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.02% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.05% optimize.opt_a.cse : 0.000049s : 0.08% optimize.opt_a.a_3 : 0.000101s : 0.16% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000041s : 0.06% optimize.convert_after_rewriter : 0.000007s : 0.01% optimize.order_py_execute_after_rewriter : 0.000005s : 0.01% optimize.mutable_eliminate : 0.000743s : 1.15% optimize.opt_b.b_1 : 0.000132s : 0.20% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000023s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.03% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000030s : 0.05% optimize.loop_unroll : 0.000470s : 0.73% optimize.opt_after_cconv.c_1 : 0.000033s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.02% optimize.tuple_transform.d_1 : 0.000048s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000054s : 0.08% optimize.cse_after_recomputation.cse : 0.000012s : 0.02% optimize.environ_conv : 0.000006s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000021s : 0.03% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000016s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000498s : 0.77% validate : 0.000046s : 0.07% Time group info: ------[substitution.] 0.000259 34 17.15% : 0.000044s : 6: substitution.arithmetic_simplify 0.71% : 0.000002s : 2: substitution.elim_not_effective 0.64% : 0.000002s : 2: substitution.fold_const_symbol 2.43% : 0.000006s : 4: substitution.graph_param_transform 65.98% : 0.000171s : 4: substitution.inline 1.86% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.13% : 0.000006s : 4: substitution.remove_not_recompute_node 2.61% : 0.000007s : 4: substitution.replace_old_param 6.49% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.045196 2 97.88% : 0.044238s : 1: type_inference.infer 2.12% : 0.000958s : 1: type_inference.specialize ------[replace.] 0.000067 8 64.16% : 0.000043s : 4: replace.inline 35.84% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000183 8 91.83% : 0.000168s : 4: match.inline 8.17% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000231 1278 0.85% : 0.000002s : 13: predicate.accumulaten_eliminater 0.79% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 8: predicate.addn_check_dump 0.93% : 0.000002s : 13: predicate.addn_zero_filter 0.84% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.63% : 0.000006s : 21: predicate.arithmetic_simplify 1.00% : 0.000002s : 13: predicate.cast_eliminate 0.59% : 0.000001s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.75% : 0.000002s : 8: predicate.depend_value_elim 0.96% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.11% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.88% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.40% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.06% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_depend_swap 1.77% : 0.000004s : 25: predicate.environ_get_eliminate 1.17% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.40% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.33% : 0.000005s : 21: predicate.float_depend_g_call 0.52% : 0.000001s : 8: predicate.float_environ_get_switch 0.75% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.65% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.56% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.45% : 0.000015s : 58: predicate.inline 0.74% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 8: predicate.less_batch_normalization 1.69% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.43% : 0.000006s : 38: predicate.load_eliminater 0.83% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.41% : 0.000006s : 34: predicate.loop_unroll_before_grad 1.51% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 8: predicate.merge_addn 0.54% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 1.08% : 0.000003s : 13: predicate.minmaximum_grad 1.63% : 0.000004s : 4: predicate.mutable_eliminate 0.32% : 0.000001s : 4: predicate.opt_reshape 0.42% : 0.000001s : 4: predicate.parallel_virtual_node 1.93% : 0.000004s : 21: predicate.partial_defer_inline 1.53% : 0.000004s : 21: predicate.partial_eliminate 1.04% : 0.000002s : 13: predicate.print_const_string_wrapper 0.65% : 0.000001s : 8: predicate.reduce_all_const_elim 1.34% : 0.000003s : 13: predicate.reduce_eliminate 2.45% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.53% : 0.000001s : 8: predicate.remove_not_recompute_node 1.36% : 0.000003s : 25: predicate.replace_applicator 0.55% : 0.000001s : 8: predicate.replace_old_param 0.24% : 0.000001s : 4: predicate.reset_defer_inline 1.04% : 0.000002s : 13: predicate.reshape_eliminate 0.68% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 4: predicate.row_tensor_eliminate 0.76% : 0.000002s : 8: predicate.same_eliminate 0.40% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.10% : 0.000003s : 8: predicate.shard_identity_eliminate 0.64% : 0.000001s : 8: predicate.special_op_eliminate 0.60% : 0.000001s : 8: predicate.specialize_transform 1.00% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.43% : 0.000003s : 21: predicate.switch_defer_inline 1.94% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.15% : 0.000012s : 67: predicate.switch_simplify 0.83% : 0.000002s : 13: predicate.tile_eliminate 1.03% : 0.000002s : 13: predicate.transpose_eliminate 1.62% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.44% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.32% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.41% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.13% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.65% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.33% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.04% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.68% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.84% : 0.000002s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000775 11 46.00% : 0.000356s : 5: func_graph_cloner_run.FuncGraphClonerGraph 54.00% : 0.000418s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.107331 192 0.00% : 0.000004s : 1: ForceFp32Comm 3.55% : 0.003805s : 1: add_attr 3.53% : 0.003791s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000058s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.07% : 0.000076s : 1: auto_monad 0.02% : 0.000020s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.43% : 0.000456s : 1: bootstrap 0.03% : 0.000034s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000016s : 1: control_data_broadcast_order 0.01% : 0.000011s : 1: convert_after_rewriter 0.02% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.03% : 0.000031s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.44% : 0.000477s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.70% : 0.000753s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000020s : 1: opt.transform.mutable_eliminate 1.25% : 0.001341s : 78: opt.transform.opt_a 0.03% : 0.000031s : 1: opt.transform.opt_after_cconv 0.02% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000106s : 28: opt.transform.opt_b 0.05% : 0.000052s : 2: opt.transform.opt_trans_graph 0.03% : 0.000037s : 4: opt.transform.symbol_engine_opt 15.29% : 0.016409s : 1: opt_a 0.10% : 0.000111s : 1: opt_after_cconv 0.47% : 0.000508s : 1: opt_after_jit_grad 0.21% : 0.000227s : 1: opt_b 17.50% : 0.018785s : 1: optimize 0.02% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.02% : 0.000025s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.04% : 0.000047s : 1: pre_auto_parallel 0.03% : 0.000034s : 1: py_interpret_to_execute 0.02% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000018s : 1: remove_dup_value 12.64% : 0.013566s : 1: renormalize.infer 0.46% : 0.000490s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000026s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000045s : 1: rewriter_after_opt_a 0.09% : 0.000096s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.08% : 0.000082s : 1: symbol_engine_optimizer 0.08% : 0.000083s : 1: tuple_transform 42.22% : 0.045312s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:57.903.125 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:57.903.403 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0210864, [21] [bootstrap]: 0.00043423 [type_inference]: 0.00724924 [event_method]: 2.289e-05 [auto_monad]: 6.952e-05 [graph_reusing]: 6.26998e-06 [inline]: 3.06999e-06 [add_attr]: 0.00403913, [1] [add_attr_with_inline]: 0.00402451, [1] [Cycle 1]: 0.00010123, [2] [tag_attr]: 2.607e-05 [meta_addattr_fg_expand]: 6.06e-06 [parallel-infer-symbol]: 3.81001e-06 [pre_auto_parallel]: 4.389e-05 [insert-virtual-dataset]: 2.69999e-06 [parallel-infer-symbol-second]: 1.00001e-06 [dataset_repeat_opt]: 1.96e-06 [pipeline_split]: 1.84998e-06 [optimize]: 0.00696315, [53] [py_interpret_to_execute]: 3.771e-05 [rewriter_before_opt_a]: 0.00010614 [opt_a]: 0.00386437, [2] [Cycle 1]: 0.00284954, [45] [expand_dump_flag]: 3.26001e-06 [switch_simplify]: 4.788e-05 [loop_unroll]: 3.575e-05 [a_1]: 0.00074603 [with_stream_mark]: 2.514e-05 [recompute_prepare]: 1.187e-05 [updatestate_depend_eliminate]: 5.02e-06 [updatestate_assign_eliminate]: 3.46999e-06 [updatestate_loads_eliminate]: 3.20998e-06 [parameter_eliminate]: 2.60002e-06 [a_2]: 0.00012893 [accelerated_algorithm]: 9.05999e-06 [shard]: 2.79001e-06 [meta_shard_fg_expand]: 2.31e-06 [shard_inline]: 7.87e-06 [merge_send_recv]: 1.056e-05 [auto_parallel]: 8.23001e-06 [parallel]: 2.157e-05 [flash_sp]: 1.102e-05 [merge_comm]: 4.35999e-06 [allreduce_fusion]: 3.60998e-06 [matmul_add_comm_reduction]: 1.081e-05 [allreduce_slice_to_reducescatter]: 9.30013e-07 [virtual_shard_identity]: 1.075e-05 [virtual_dataset]: 4.954e-05 [get_grad_eliminate_]: 8.77999e-06 [virtual_output]: 6.95002e-06 [merge_forward]: 5.47999e-06 [cell_reuse_recompute_pass]: 1.88997e-06 [offload_activation]: 1.284e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.804e-05 [merge_recompute_call_nodes]: 1.97999e-06 [before_grad]: 1.201e-05 [set_forward_comm_id_for_comm_node_pass]: 4.85001e-06 [meta_fg_expand]: 3.65e-06 [flash_sp_send_recv_attached]: 3.61999e-06 [receive_attached]: 2.57001e-06 [after_resolve]: 1.448e-05 [a_after_grad]: 1.147e-05 [renormalize]: 0.0009676 [add_forward_monad_depend]: 9.42001e-06 [auto_monad_grad]: 2.44001e-06 [auto_monad_eliminator]: 1.992e-05 [cse]: 3.208e-05 [a_3]: 7.434e-05 [Cycle 2]: 0.00099498, [45] [expand_dump_flag]: 2.98e-06 [switch_simplify]: 8.90999e-06 [loop_unroll]: 6.94999e-06 [a_1]: 0.0001712 [with_stream_mark]: 2.117e-05 [recompute_prepare]: 7.03e-06 [updatestate_depend_eliminate]: 4.45e-06 [updatestate_assign_eliminate]: 3.24001e-06 [updatestate_loads_eliminate]: 2.98e-06 [parameter_eliminate]: 1.81e-06 [a_2]: 0.00011041 [accelerated_algorithm]: 8.52e-06 [shard]: 1.94e-06 [meta_shard_fg_expand]: 2.37001e-06 [shard_inline]: 7.08e-06 [merge_send_recv]: 8.55999e-06 [auto_parallel]: 1.035e-05 [parallel]: 7.98001e-06 [flash_sp]: 4.80001e-06 [merge_comm]: 4.15e-06 [allreduce_fusion]: 3.90998e-06 [matmul_add_comm_reduction]: 8.76002e-06 [allreduce_slice_to_reducescatter]: 9.20001e-07 [virtual_shard_identity]: 9.25001e-06 [virtual_dataset]: 6.51999e-06 [get_grad_eliminate_]: 6.17001e-06 [virtual_output]: 6.29001e-06 [merge_forward]: 3.83999e-06 [cell_reuse_recompute_pass]: 3.48e-06 [offload_activation]: 1.021e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.939e-05 [merge_recompute_call_nodes]: 1.30001e-06 [before_grad]: 1.146e-05 [set_forward_comm_id_for_comm_node_pass]: 4.45999e-06 [meta_fg_expand]: 2.53e-06 [flash_sp_send_recv_attached]: 1.49e-06 [receive_attached]: 2.02999e-06 [after_resolve]: 1.473e-05 [a_after_grad]: 1.14e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 3.36999e-06 [auto_monad_grad]: 1.97001e-06 [auto_monad_eliminator]: 1.291e-05 [cse]: 1.993e-05 [a_3]: 5.309e-05 [py_interpret_to_execute_after_opt_a]: 1.979e-05 [slice_cell_reuse_recomputed_activation]: 4.95999e-06 [rewriter_after_opt_a]: 4.814e-05 [convert_after_rewriter]: 1.146e-05 [order_py_execute_after_rewriter]: 8.42e-06 [mutable_eliminate]: 0.00082459 [opt_b]: 0.00031703, [1] [Cycle 1]: 0.00030306, [7] [b_1]: 0.0001834 [b_2]: 1.052e-05 [updatestate_depend_eliminate]: 1.191e-05 [updatestate_assign_eliminate]: 3.04999e-06 [updatestate_loads_eliminate]: 3.11001e-06 [renormalize]: 9.50007e-07 [cse]: 3.122e-05 [optimize_parallel_all_gather_comm]: 2.556e-05 [overlap_param_gather]: 5.86e-06 [cconv]: 4.248e-05 [loop_unroll]: 0.00069484 [opt_after_cconv]: 0.0001548, [1] [Cycle 1]: 0.00014311, [7] [c_1]: 3.636e-05 [parameter_eliminate]: 5.71e-06 [updatestate_depend_eliminate]: 7.66001e-06 [updatestate_assign_eliminate]: 2.94999e-06 [updatestate_loads_eliminate]: 2.73e-06 [cse]: 2.832e-05 [renormalize]: 4.89992e-07 [remove_dup_value]: 1.912e-05 [tuple_transform]: 0.00010888, [1] [Cycle 1]: 0.00010115, [4] [d_1]: 5.628e-05 [none_parameter_eliminate]: 1.69e-06 [renormalize]: 7.89994e-07 [switch_simplify]: 8.67998e-06 [partial_unused_args_eliminate]: 4.84e-06 [add_recomputation]: 6.437e-05 [cse_after_recomputation]: 3.123e-05, [1] [Cycle 1]: 2.305e-05, [1] [cse]: 1.285e-05 [environ_conv]: 9.49e-06 [swap_dp_allreduce_reducescatter]: 8.96998e-06 [bias_add_comm_swap]: 6.55997e-06 [label_micro_interleaved_index]: 9.70002e-06 [label_fine_grained_interleaved_index]: 5.30001e-06 [merge_cast_opt]: 4.30999e-06 [slice_recompute_activation]: 5.40001e-06 [micro_interleaved_order_control]: 5.21002e-06 [assign_add_opt]: 4.16001e-06 [ForceFp32Comm]: 3.53999e-06 [remove_cast_before_assign_add]: 3.64002e-06 [full_micro_interleaved_order_control]: 5.17999e-06 [reorder_send_recv_between_fp_bp]: 6.46e-06 [comm_op_add_attrs]: 3.60998e-06 [add_comm_op_reuse_tag]: 3.83001e-06 [interleave_split_concat_branches]: 3.83999e-06 [interleave_parallel_branches]: 3.58999e-06 [overlap_opt_shard_in_pipeline]: 4.45e-06 [overlap_opt_shard_grad_in_pipeline]: 4.65001e-06 [control_data_broadcast_order]: 1.784e-05 [grouped_pairwise_exchange_alltoall]: 4.1e-06 [offloading_packed_experts]: 6.84999e-06 [overlap_recompute_and_grad_model_parallel]: 8.03999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.75998e-06 [overlap_recompute_allgather_and_fa_grad]: 4.06001e-06 [overlap_recompute_comm]: 4.95999e-06 [overlap_grad_ring_attention]: 7.55e-06 [overlap_grad_flash_sp]: 2.623e-05 [begin_end_overlap_inline]: 3.56001e-06 [split_matmul_comm_elemetwise]: 5.52999e-06 [split_layernorm_comm]: 4.12e-06 [handle_group_info]: 3.85e-06 [symbol_engine_optimizer]: 0.00011999, [1] [Cycle 1]: 0.00011117, [6] [build]: 4.63999e-06 [elim_shapecalc]: 1.386e-05 [elim_not_effective]: 1.684e-05 [opt_reshape]: 8.85999e-06 [fold_const_symbol]: 1.179e-05 [renormalize]: 2.20025e-07 [detach_backward]: 6.49001e-06 [pipeline_parallel_scheduler]: 2.09e-06 [auto_monad_reorder]: 2.756e-05 [get_jit_bprop_graph]: 1.92001e-06 [rewriter_after_jit_bprop_graph]: 8.05999e-06 [opt_after_jit_grad]: 0.00080532 [validate]: 5.214e-05 Sums bootstrap : 0.000434s : 3.02% type_inference : 0.007249s : 50.49% event_method : 0.000023s : 0.16% auto_monad : 0.000070s : 0.48% graph_reusing : 0.000006s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000044s : 0.31% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000038s : 0.26% optimize.rewriter_before_opt_a : 0.000106s : 0.74% optimize.opt_a.expand_dump_flag : 0.000006s : 0.04% optimize.opt_a.switch_simplify : 0.000057s : 0.40% optimize.opt_a.loop_unroll : 0.000043s : 0.30% optimize.opt_a.a_1 : 0.000917s : 6.39% optimize.opt_a.with_stream_mark : 0.000046s : 0.32% optimize.opt_a.recompute_prepare : 0.000019s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.04% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000239s : 1.67% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.12% optimize.opt_a.shard : 0.000005s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.10% optimize.opt_a.merge_send_recv : 0.000019s : 0.13% optimize.opt_a.auto_parallel : 0.000019s : 0.13% optimize.opt_a.parallel : 0.000030s : 0.21% optimize.opt_a.flash_sp : 0.000016s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.06% optimize.opt_a.allreduce_fusion : 0.000008s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.14% optimize.opt_a.virtual_dataset : 0.000056s : 0.39% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.10% optimize.opt_a.virtual_output : 0.000013s : 0.09% optimize.opt_a.merge_forward : 0.000009s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.04% optimize.opt_a.offload_activation : 0.000023s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000023s : 0.16% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000005s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.20% optimize.opt_a.a_after_grad : 0.000023s : 0.16% optimize.opt_a.renormalize : 0.000968s : 6.74% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.09% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.23% optimize.opt_a.cse : 0.000052s : 0.36% optimize.opt_a.a_3 : 0.000127s : 0.89% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.03% optimize.rewriter_after_opt_a : 0.000048s : 0.34% optimize.convert_after_rewriter : 0.000011s : 0.08% optimize.order_py_execute_after_rewriter : 0.000008s : 0.06% optimize.mutable_eliminate : 0.000825s : 5.74% optimize.opt_b.b_1 : 0.000183s : 1.28% optimize.opt_b.b_2 : 0.000011s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000031s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.18% optimize.overlap_param_gather : 0.000006s : 0.04% optimize.cconv : 0.000042s : 0.30% optimize.loop_unroll : 0.000695s : 4.84% optimize.opt_after_cconv.c_1 : 0.000036s : 0.25% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000028s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.13% optimize.tuple_transform.d_1 : 0.000056s : 0.39% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000001s : 0.01% optimize.tuple_transform.switch_simplify : 0.000009s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.03% optimize.add_recomputation : 0.000064s : 0.45% optimize.cse_after_recomputation.cse : 0.000013s : 0.09% optimize.environ_conv : 0.000009s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.06% optimize.bias_add_comm_swap : 0.000007s : 0.05% optimize.label_micro_interleaved_index : 0.000010s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.02% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.03% optimize.control_data_broadcast_order : 0.000018s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.03% optimize.overlap_grad_ring_attention : 0.000008s : 0.05% optimize.overlap_grad_flash_sp : 0.000026s : 0.18% optimize.begin_end_overlap_inline : 0.000004s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000006s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000005s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.05% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000028s : 0.19% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000008s : 0.06% opt_after_jit_grad : 0.000805s : 5.61% validate : 0.000052s : 0.36% Time group info: ------[substitution.] 0.000279 34 15.58% : 0.000044s : 6: substitution.arithmetic_simplify 0.74% : 0.000002s : 2: substitution.elim_not_effective 0.60% : 0.000002s : 2: substitution.fold_const_symbol 2.50% : 0.000007s : 4: substitution.graph_param_transform 66.85% : 0.000187s : 4: substitution.inline 1.69% : 0.000005s : 4: substitution.j_node_and_user_rematch 3.43% : 0.000010s : 4: substitution.remove_not_recompute_node 2.53% : 0.000007s : 4: substitution.replace_old_param 6.08% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007181 2 88.37% : 0.006346s : 1: type_inference.infer 11.63% : 0.000835s : 1: type_inference.specialize ------[replace.] 0.000071 8 63.28% : 0.000045s : 4: replace.inline 36.72% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000199 8 92.39% : 0.000184s : 4: match.inline 7.61% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000259 1278 0.88% : 0.000002s : 13: predicate.accumulaten_eliminater 1.25% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 8: predicate.addn_check_dump 0.97% : 0.000003s : 13: predicate.addn_zero_filter 1.14% : 0.000003s : 13: predicate.adjust_all_reduce_mul_add 2.79% : 0.000007s : 21: predicate.arithmetic_simplify 1.05% : 0.000003s : 13: predicate.cast_eliminate 0.51% : 0.000001s : 8: predicate.check_bprop_eliminate 0.51% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.54% : 0.000001s : 8: predicate.depend_value_elim 0.91% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.06% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.06% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 4: predicate.elim_not_effective 0.45% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.00% : 0.000003s : 17: predicate.environ_get_add_eliminate 0.95% : 0.000002s : 17: predicate.environ_get_depend_swap 1.67% : 0.000004s : 25: predicate.environ_get_eliminate 0.99% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.29% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.29% : 0.000006s : 21: predicate.float_depend_g_call 0.68% : 0.000002s : 8: predicate.float_environ_get_switch 0.87% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.58% : 0.000002s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.49% : 0.000001s : 8: predicate.incorporate_call 0.40% : 0.000001s : 8: predicate.incorporate_call_switch 5.99% : 0.000016s : 58: predicate.inline 0.96% : 0.000002s : 8: predicate.inline_without_move 0.26% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 8: predicate.less_batch_normalization 1.68% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.48% : 0.000006s : 38: predicate.load_eliminater 1.18% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.31% : 0.000006s : 34: predicate.loop_unroll_before_grad 1.72% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 13: predicate.minmaximum_grad 1.83% : 0.000005s : 4: predicate.mutable_eliminate 0.39% : 0.000001s : 4: predicate.opt_reshape 0.38% : 0.000001s : 4: predicate.parallel_virtual_node 1.54% : 0.000004s : 21: predicate.partial_defer_inline 1.38% : 0.000004s : 21: predicate.partial_eliminate 0.79% : 0.000002s : 13: predicate.print_const_string_wrapper 0.64% : 0.000002s : 8: predicate.reduce_all_const_elim 1.12% : 0.000003s : 13: predicate.reduce_eliminate 2.33% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.56% : 0.000001s : 8: predicate.remove_not_recompute_node 1.10% : 0.000003s : 25: predicate.replace_applicator 0.54% : 0.000001s : 8: predicate.replace_old_param 0.31% : 0.000001s : 4: predicate.reset_defer_inline 1.07% : 0.000003s : 13: predicate.reshape_eliminate 0.60% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.52% : 0.000001s : 4: predicate.row_tensor_eliminate 1.02% : 0.000003s : 8: predicate.same_eliminate 0.36% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.87% : 0.000002s : 8: predicate.shard_identity_eliminate 0.71% : 0.000002s : 8: predicate.special_op_eliminate 0.69% : 0.000002s : 8: predicate.specialize_transform 0.91% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.33% : 0.000003s : 21: predicate.switch_defer_inline 1.95% : 0.000005s : 29: predicate.switch_layer_defer_inline 5.00% : 0.000013s : 67: predicate.switch_simplify 0.85% : 0.000002s : 13: predicate.tile_eliminate 0.90% : 0.000002s : 13: predicate.transpose_eliminate 1.65% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.44% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.65% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.45% : 0.000009s : 33: predicate.tuple_list_get_item_eliminator 1.91% : 0.000005s : 21: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000006s : 29: predicate.tuple_list_set_item_eliminator 1.75% : 0.000005s : 25: predicate.tuple_to_list_eliminator_ 2.25% : 0.000006s : 38: predicate.updatestate_pure_node_eliminater 2.81% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.73% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.55% : 0.000001s : 8: predicate.virtual_output_eliminate 0.22% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.78% : 0.000002s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000754 11 56.68% : 0.000427s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.32% : 0.000326s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.034684 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.68% : 0.004051s : 1: add_attr 11.62% : 0.004029s : 1: add_attr_with_inline 0.02% : 0.000008s : 1: add_comm_op_reuse_tag 0.20% : 0.000068s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.23% : 0.000079s : 1: auto_monad 0.10% : 0.000036s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000010s : 1: bias_add_comm_swap 1.38% : 0.000479s : 1: bootstrap 0.13% : 0.000046s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.06% : 0.000021s : 1: control_data_broadcast_order 0.04% : 0.000015s : 1: convert_after_rewriter 0.10% : 0.000035s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.11% : 0.000038s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.10% : 0.000034s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000013s : 1: label_micro_interleaved_index 2.03% : 0.000704s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 2.41% : 0.000835s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000022s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000027s : 1: opt.transform.mutable_eliminate 4.21% : 0.001459s : 78: opt.transform.opt_a 0.10% : 0.000034s : 1: opt.transform.opt_after_cconv 0.11% : 0.000039s : 1: opt.transform.opt_after_jit_grad 0.33% : 0.000116s : 28: opt.transform.opt_b 0.18% : 0.000062s : 2: opt.transform.opt_trans_graph 0.14% : 0.000047s : 4: opt.transform.symbol_engine_opt 11.15% : 0.003868s : 1: opt_a 0.46% : 0.000159s : 1: opt_after_cconv 2.37% : 0.000821s : 1: opt_after_jit_grad 0.93% : 0.000321s : 1: opt_b 23.08% : 0.008005s : 1: optimize 0.08% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000011s : 1: order_py_execute_after_rewriter 0.08% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000008s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000013s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000011s : 1: pipeline_parallel_scheduler 0.02% : 0.000008s : 1: pipeline_split 0.15% : 0.000052s : 1: pre_auto_parallel 0.12% : 0.000042s : 1: py_interpret_to_execute 0.07% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000023s : 1: remove_dup_value 1.48% : 0.000513s : 1: renormalize.infer 1.27% : 0.000441s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000053s : 1: rewriter_after_opt_a 0.32% : 0.000111s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000009s : 1: slice_recompute_activation 0.02% : 0.000009s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000123s : 1: symbol_engine_optimizer 0.32% : 0.000112s : 1: tuple_transform 21.05% : 0.007302s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:43:59.114.393 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0897736, [21] [bootstrap]: 0.00043565 [type_inference]: 0.0788527 [event_method]: 2.071e-05 [auto_monad]: 6.633e-05 [graph_reusing]: 6.38e-06 [inline]: 2.66e-06 [add_attr]: 0.00385579, [1] [add_attr_with_inline]: 0.00384193, [1] [Cycle 1]: 7.983e-05, [2] [tag_attr]: 2.624e-05 [meta_addattr_fg_expand]: 6.12999e-06 [parallel-infer-symbol]: 3.68999e-06 [pre_auto_parallel]: 4.471e-05 [insert-virtual-dataset]: 2.94999e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 2.78e-06 [pipeline_split]: 1.77001e-06 [optimize]: 0.00573921, [53] [py_interpret_to_execute]: 3.166e-05 [rewriter_before_opt_a]: 9.58e-05 [opt_a]: 0.00327798, [2] [Cycle 1]: 0.00245713, [45] [expand_dump_flag]: 3.28998e-06 [switch_simplify]: 4.449e-05 [loop_unroll]: 3.002e-05 [a_1]: 0.00070626 [with_stream_mark]: 2.362e-05 [recompute_prepare]: 1.196e-05 [updatestate_depend_eliminate]: 5.91e-06 [updatestate_assign_eliminate]: 4.25999e-06 [updatestate_loads_eliminate]: 2.86e-06 [parameter_eliminate]: 2.11998e-06 [a_2]: 9.303e-05 [accelerated_algorithm]: 8.04002e-06 [shard]: 2.94999e-06 [meta_shard_fg_expand]: 2.00002e-06 [shard_inline]: 7.48e-06 [merge_send_recv]: 1.032e-05 [auto_parallel]: 8.89e-06 [parallel]: 2.104e-05 [flash_sp]: 1.076e-05 [merge_comm]: 4.02e-06 [allreduce_fusion]: 3.55e-06 [matmul_add_comm_reduction]: 1.123e-05 [allreduce_slice_to_reducescatter]: 8.39995e-07 [virtual_shard_identity]: 9.22999e-06 [virtual_dataset]: 7.42002e-06 [get_grad_eliminate_]: 6.59999e-06 [virtual_output]: 6.51e-06 [merge_forward]: 5.10999e-06 [cell_reuse_recompute_pass]: 1.44998e-06 [offload_activation]: 1.073e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.45e-05 [merge_recompute_call_nodes]: 1.96998e-06 [before_grad]: 1.198e-05 [set_forward_comm_id_for_comm_node_pass]: 4.08999e-06 [meta_fg_expand]: 3.13e-06 [flash_sp_send_recv_attached]: 3.46001e-06 [receive_attached]: 2.43e-06 [after_resolve]: 1.513e-05 [a_after_grad]: 1.089e-05 [renormalize]: 0.00091747 [add_forward_monad_depend]: 9.03002e-06 [auto_monad_grad]: 2.54001e-06 [auto_monad_eliminator]: 1.911e-05 [cse]: 3.481e-05 [a_3]: 5.767e-05 [Cycle 2]: 0.00080605, [45] [expand_dump_flag]: 2.38002e-06 [switch_simplify]: 9.19e-06 [loop_unroll]: 6.47001e-06 [a_1]: 0.00015558 [with_stream_mark]: 2.064e-05 [recompute_prepare]: 7.7e-06 [updatestate_depend_eliminate]: 4.74e-06 [updatestate_assign_eliminate]: 3.23e-06 [updatestate_loads_eliminate]: 2.93e-06 [parameter_eliminate]: 1.74e-06 [a_2]: 8.298e-05 [accelerated_algorithm]: 6.74999e-06 [shard]: 2.60002e-06 [meta_shard_fg_expand]: 1.88002e-06 [shard_inline]: 6.19999e-06 [merge_send_recv]: 8.70001e-06 [auto_parallel]: 9.21002e-06 [parallel]: 9.07999e-06 [flash_sp]: 4.12e-06 [merge_comm]: 3.97998e-06 [allreduce_fusion]: 4e-06 [matmul_add_comm_reduction]: 9.39e-06 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 8.32998e-06 [virtual_dataset]: 6.29001e-06 [get_grad_eliminate_]: 8.80001e-06 [virtual_output]: 6.10002e-06 [merge_forward]: 3.48e-06 [cell_reuse_recompute_pass]: 3.25e-06 [offload_activation]: 1.013e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.342e-05 [merge_recompute_call_nodes]: 1.96998e-06 [before_grad]: 1.046e-05 [set_forward_comm_id_for_comm_node_pass]: 3.58e-06 [meta_fg_expand]: 2.45002e-06 [flash_sp_send_recv_attached]: 1.46002e-06 [receive_attached]: 2.02999e-06 [after_resolve]: 1.297e-05 [a_after_grad]: 9.37001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 3.27002e-06 [auto_monad_grad]: 1.92999e-06 [auto_monad_eliminator]: 1.136e-05 [cse]: 2.35e-05 [a_3]: 4.028e-05 [py_interpret_to_execute_after_opt_a]: 1.599e-05 [slice_cell_reuse_recomputed_activation]: 2.41998e-06 [rewriter_after_opt_a]: 4.295e-05 [convert_after_rewriter]: 7.26001e-06 [order_py_execute_after_rewriter]: 5.34e-06 [mutable_eliminate]: 0.00076008 [opt_b]: 0.00023258, [1] [Cycle 1]: 0.00022354, [7] [b_1]: 0.000129 [b_2]: 8.98002e-06 [updatestate_depend_eliminate]: 1.2e-05 [updatestate_assign_eliminate]: 3.02002e-06 [updatestate_loads_eliminate]: 3.46999e-06 [renormalize]: 1.42e-06 [cse]: 2.737e-05 [optimize_parallel_all_gather_comm]: 2.008e-05 [overlap_param_gather]: 2.64999e-06 [cconv]: 3.763e-05 [loop_unroll]: 0.00048239 [opt_after_cconv]: 0.00011244, [1] [Cycle 1]: 0.00010572, [7] [c_1]: 3.209e-05 [parameter_eliminate]: 5.91e-06 [updatestate_depend_eliminate]: 6.58998e-06 [updatestate_assign_eliminate]: 2.86999e-06 [updatestate_loads_eliminate]: 2.28002e-06 [cse]: 2.019e-05 [renormalize]: 6.39993e-07 [remove_dup_value]: 1.492e-05 [tuple_transform]: 8.1e-05, [1] [Cycle 1]: 7.629e-05, [4] [d_1]: 4.928e-05 [none_parameter_eliminate]: 1.60999e-06 [renormalize]: 2.40019e-07 [switch_simplify]: 6.64001e-06 [partial_unused_args_eliminate]: 2.18998e-06 [add_recomputation]: 5.568e-05 [cse_after_recomputation]: 2.369e-05, [1] [Cycle 1]: 1.848e-05, [1] [cse]: 1.237e-05 [environ_conv]: 6.39999e-06 [swap_dp_allreduce_reducescatter]: 5.04e-06 [bias_add_comm_swap]: 2.91999e-06 [label_micro_interleaved_index]: 4.52998e-06 [label_fine_grained_interleaved_index]: 3.31001e-06 [merge_cast_opt]: 1.34e-06 [slice_recompute_activation]: 2.04e-06 [micro_interleaved_order_control]: 2.76e-06 [assign_add_opt]: 1.49998e-06 [ForceFp32Comm]: 8.30012e-07 [remove_cast_before_assign_add]: 1.37e-06 [full_micro_interleaved_order_control]: 2.44999e-06 [reorder_send_recv_between_fp_bp]: 3.33e-06 [comm_op_add_attrs]: 1.29e-06 [add_comm_op_reuse_tag]: 1.15999e-06 [interleave_split_concat_branches]: 1.20999e-06 [interleave_parallel_branches]: 1.10001e-06 [overlap_opt_shard_in_pipeline]: 1.29e-06 [overlap_opt_shard_grad_in_pipeline]: 2.67001e-06 [control_data_broadcast_order]: 1.391e-05 [grouped_pairwise_exchange_alltoall]: 1.62999e-06 [offloading_packed_experts]: 3.93999e-06 [overlap_recompute_and_grad_model_parallel]: 4.92999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.73003e-06 [overlap_grad_ring_attention]: 4.03999e-06 [overlap_grad_flash_sp]: 2.196e-05 [begin_end_overlap_inline]: 5.89993e-07 [split_matmul_comm_elemetwise]: 2.21e-06 [split_layernorm_comm]: 1.80001e-06 [handle_group_info]: 9.20001e-07 [symbol_engine_optimizer]: 8.335e-05, [1] [Cycle 1]: 7.825e-05, [6] [build]: 3.7e-06 [elim_shapecalc]: 1.179e-05 [elim_not_effective]: 1.38e-05 [opt_reshape]: 7.09001e-06 [fold_const_symbol]: 1.058e-05 [renormalize]: 2.69996e-07 [detach_backward]: 2.18002e-06 [pipeline_parallel_scheduler]: 1.62999e-06 [auto_monad_reorder]: 1.761e-05 [get_jit_bprop_graph]: 1.99e-06 [rewriter_after_jit_bprop_graph]: 5.97999e-06 [opt_after_jit_grad]: 0.00050302 [validate]: 4.487e-05 Sums bootstrap : 0.000436s : 0.51% type_inference : 0.078853s : 92.95% event_method : 0.000021s : 0.02% auto_monad : 0.000066s : 0.08% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000045s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000032s : 0.04% optimize.rewriter_before_opt_a : 0.000096s : 0.11% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000054s : 0.06% optimize.opt_a.loop_unroll : 0.000036s : 0.04% optimize.opt_a.a_1 : 0.000862s : 1.02% optimize.opt_a.with_stream_mark : 0.000044s : 0.05% optimize.opt_a.recompute_prepare : 0.000020s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000176s : 0.21% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.02% optimize.opt_a.shard : 0.000006s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.02% optimize.opt_a.merge_send_recv : 0.000019s : 0.02% optimize.opt_a.auto_parallel : 0.000018s : 0.02% optimize.opt_a.parallel : 0.000030s : 0.04% optimize.opt_a.flash_sp : 0.000015s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.02% optimize.opt_a.virtual_dataset : 0.000014s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.02% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000028s : 0.03% optimize.opt_a.a_after_grad : 0.000020s : 0.02% optimize.opt_a.renormalize : 0.000918s : 1.08% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.04% optimize.opt_a.cse : 0.000058s : 0.07% optimize.opt_a.a_3 : 0.000098s : 0.12% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000043s : 0.05% optimize.convert_after_rewriter : 0.000007s : 0.01% optimize.order_py_execute_after_rewriter : 0.000005s : 0.01% optimize.mutable_eliminate : 0.000760s : 0.90% optimize.opt_b.b_1 : 0.000129s : 0.15% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.02% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000038s : 0.04% optimize.loop_unroll : 0.000482s : 0.57% optimize.opt_after_cconv.c_1 : 0.000032s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.02% optimize.tuple_transform.d_1 : 0.000049s : 0.06% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000056s : 0.07% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000006s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000022s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000503s : 0.59% validate : 0.000045s : 0.05% Time group info: ------[substitution.] 0.000263 34 15.76% : 0.000042s : 6: substitution.arithmetic_simplify 0.83% : 0.000002s : 2: substitution.elim_not_effective 0.63% : 0.000002s : 2: substitution.fold_const_symbol 2.66% : 0.000007s : 4: substitution.graph_param_transform 67.45% : 0.000178s : 4: substitution.inline 1.59% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.13% : 0.000006s : 4: substitution.remove_not_recompute_node 2.90% : 0.000008s : 4: substitution.replace_old_param 6.06% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.078777 2 99.05% : 0.078027s : 1: type_inference.infer 0.95% : 0.000750s : 1: type_inference.specialize ------[replace.] 0.000070 8 65.00% : 0.000045s : 4: replace.inline 35.00% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000189 8 92.56% : 0.000175s : 4: match.inline 7.44% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000223 1278 0.90% : 0.000002s : 13: predicate.accumulaten_eliminater 0.72% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.44% : 0.000001s : 8: predicate.addn_check_dump 0.92% : 0.000002s : 13: predicate.addn_zero_filter 0.91% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.77% : 0.000006s : 21: predicate.arithmetic_simplify 0.96% : 0.000002s : 13: predicate.cast_eliminate 0.61% : 0.000001s : 8: predicate.check_bprop_eliminate 0.51% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.56% : 0.000001s : 8: predicate.depend_value_elim 0.91% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.93% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.03% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.34% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.97% : 0.000002s : 17: predicate.environ_get_depend_swap 1.54% : 0.000003s : 25: predicate.environ_get_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.60% : 0.000004s : 21: predicate.exchange_switch_depend_value 2.63% : 0.000006s : 21: predicate.float_depend_g_call 0.58% : 0.000001s : 8: predicate.float_environ_get_switch 0.76% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.62% : 0.000001s : 8: predicate.get_grad_eliminate 0.29% : 0.000001s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.44% : 0.000001s : 8: predicate.incorporate_call_switch 6.74% : 0.000015s : 58: predicate.inline 0.86% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.93% : 0.000002s : 8: predicate.less_batch_normalization 1.58% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.64% : 0.000006s : 38: predicate.load_eliminater 0.91% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.14% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.62% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 8: predicate.merge_addn 0.72% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.65% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 13: predicate.minmaximum_grad 1.71% : 0.000004s : 4: predicate.mutable_eliminate 0.34% : 0.000001s : 4: predicate.opt_reshape 0.50% : 0.000001s : 4: predicate.parallel_virtual_node 1.68% : 0.000004s : 21: predicate.partial_defer_inline 1.51% : 0.000003s : 21: predicate.partial_eliminate 0.95% : 0.000002s : 13: predicate.print_const_string_wrapper 0.61% : 0.000001s : 8: predicate.reduce_all_const_elim 1.19% : 0.000003s : 13: predicate.reduce_eliminate 2.21% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.56% : 0.000001s : 8: predicate.remove_not_recompute_node 1.37% : 0.000003s : 25: predicate.replace_applicator 0.56% : 0.000001s : 8: predicate.replace_old_param 0.36% : 0.000001s : 4: predicate.reset_defer_inline 0.94% : 0.000002s : 13: predicate.reshape_eliminate 0.59% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.55% : 0.000001s : 4: predicate.row_tensor_eliminate 0.83% : 0.000002s : 8: predicate.same_eliminate 0.40% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 8: predicate.shard_identity_eliminate 0.72% : 0.000002s : 8: predicate.special_op_eliminate 0.77% : 0.000002s : 8: predicate.specialize_transform 1.07% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.39% : 0.000003s : 21: predicate.switch_defer_inline 2.09% : 0.000005s : 29: predicate.switch_layer_defer_inline 5.11% : 0.000011s : 67: predicate.switch_simplify 0.87% : 0.000002s : 13: predicate.tile_eliminate 1.05% : 0.000002s : 13: predicate.transpose_eliminate 1.51% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.47% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.04% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.42% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.57% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.22% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.82% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 4: predicate.value_based_eliminate 0.69% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.66% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.56% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000651 11 55.67% : 0.000362s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.33% : 0.000289s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.101757 192 0.00% : 0.000004s : 1: ForceFp32Comm 3.80% : 0.003862s : 1: add_attr 3.78% : 0.003847s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.06% : 0.000061s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.07% : 0.000072s : 1: auto_monad 0.02% : 0.000022s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.46% : 0.000465s : 1: bootstrap 0.04% : 0.000042s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000017s : 1: control_data_broadcast_order 0.01% : 0.000011s : 1: convert_after_rewriter 0.03% : 0.000027s : 1: cse_after_recomputation 0.01% : 0.000006s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.03% : 0.000027s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.48% : 0.000492s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 0.76% : 0.000774s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000023s : 1: opt.transform.mutable_eliminate 1.31% : 0.001328s : 78: opt.transform.opt_a 0.03% : 0.000031s : 1: opt.transform.opt_after_cconv 0.03% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000105s : 28: opt.transform.opt_b 0.05% : 0.000053s : 2: opt.transform.opt_trans_graph 0.04% : 0.000039s : 4: opt.transform.symbol_engine_opt 3.23% : 0.003282s : 1: opt_a 0.11% : 0.000116s : 1: opt_after_cconv 0.50% : 0.000512s : 1: opt_after_jit_grad 0.23% : 0.000237s : 1: opt_b 5.65% : 0.005746s : 1: optimize 0.02% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.03% : 0.000026s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.05% : 0.000049s : 1: pre_auto_parallel 0.04% : 0.000036s : 1: py_interpret_to_execute 0.02% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000019s : 1: remove_dup_value 0.49% : 0.000502s : 1: renormalize.infer 0.40% : 0.000403s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000048s : 1: rewriter_after_opt_a 0.10% : 0.000101s : 1: rewriter_before_opt_a 0.01% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.08% : 0.000086s : 1: symbol_engine_optimizer 0.08% : 0.000084s : 1: tuple_transform 77.51% : 0.078873s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:00.264.857 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:00.265.125 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0307377, [21] [bootstrap]: 0.00045329 [type_inference]: 0.00682455 [event_method]: 2.198e-05 [auto_monad]: 6.79e-05 [graph_reusing]: 5.74e-06 [inline]: 2.84999e-06 [add_attr]: 0.00391254, [1] [add_attr_with_inline]: 0.00390014, [1] [Cycle 1]: 0.00010129, [2] [tag_attr]: 2.655e-05 [meta_addattr_fg_expand]: 5.97999e-06 [parallel-infer-symbol]: 3.71001e-06 [pre_auto_parallel]: 4.566e-05 [insert-virtual-dataset]: 2.88998e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 2.46e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.0179895, [53] [py_interpret_to_execute]: 3.651e-05 [rewriter_before_opt_a]: 9.848e-05 [opt_a]: 0.0150276, [2] [Cycle 1]: 0.0139866, [45] [expand_dump_flag]: 3.3e-06 [switch_simplify]: 4.36e-05 [loop_unroll]: 3.011e-05 [a_1]: 0.00069856 [with_stream_mark]: 2.216e-05 [recompute_prepare]: 1.08e-05 [updatestate_depend_eliminate]: 4.65999e-06 [updatestate_assign_eliminate]: 3.95e-06 [updatestate_loads_eliminate]: 3.22002e-06 [parameter_eliminate]: 2.06e-06 [a_2]: 0.00018863 [accelerated_algorithm]: 8.29002e-06 [shard]: 2.73e-06 [meta_shard_fg_expand]: 2.59001e-06 [shard_inline]: 6.46999e-06 [merge_send_recv]: 9.87999e-06 [auto_parallel]: 1.005e-05 [parallel]: 2.199e-05 [flash_sp]: 9.99001e-06 [merge_comm]: 4.15999e-06 [allreduce_fusion]: 3.86001e-06 [matmul_add_comm_reduction]: 1.073e-05 [allreduce_slice_to_reducescatter]: 8.89995e-07 [virtual_shard_identity]: 8.80001e-06 [virtual_dataset]: 7.02002e-06 [get_grad_eliminate_]: 7.66999e-06 [virtual_output]: 6.94999e-06 [merge_forward]: 4.12998e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 1.108e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.566e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.17e-05 [set_forward_comm_id_for_comm_node_pass]: 4.17998e-06 [meta_fg_expand]: 3.32002e-06 [flash_sp_send_recv_attached]: 3.00998e-06 [receive_attached]: 2.47001e-06 [after_resolve]: 1.224e-05 [a_after_grad]: 1.07e-05 [renormalize]: 0.0121633 [add_forward_monad_depend]: 1.273e-05 [auto_monad_grad]: 3.18998e-06 [auto_monad_eliminator]: 2.527e-05 [cse]: 3.308e-05 [a_3]: 8.223e-05 [Cycle 2]: 0.00102078, [45] [expand_dump_flag]: 2.41e-06 [switch_simplify]: 9.86e-06 [loop_unroll]: 7.23e-06 [a_1]: 0.00016755 [with_stream_mark]: 2.199e-05 [recompute_prepare]: 9.14998e-06 [updatestate_depend_eliminate]: 4.30999e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 2.72001e-06 [parameter_eliminate]: 2.02001e-06 [a_2]: 0.00011492 [accelerated_algorithm]: 8.72e-06 [shard]: 3.03e-06 [meta_shard_fg_expand]: 2.21e-06 [shard_inline]: 6.78998e-06 [merge_send_recv]: 1.058e-05 [auto_parallel]: 1.022e-05 [parallel]: 9.50001e-06 [flash_sp]: 4.14002e-06 [merge_comm]: 4.38001e-06 [allreduce_fusion]: 3.46999e-06 [matmul_add_comm_reduction]: 1.066e-05 [allreduce_slice_to_reducescatter]: 8.00006e-07 [virtual_shard_identity]: 9.67001e-06 [virtual_dataset]: 6.78e-06 [get_grad_eliminate_]: 6.55997e-06 [virtual_output]: 6.46999e-06 [merge_forward]: 4.79e-06 [cell_reuse_recompute_pass]: 3.13e-06 [offload_activation]: 1.105e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.732e-05 [merge_recompute_call_nodes]: 1.77001e-06 [before_grad]: 1.211e-05 [set_forward_comm_id_for_comm_node_pass]: 5.09e-06 [meta_fg_expand]: 3.03e-06 [flash_sp_send_recv_attached]: 2.02001e-06 [receive_attached]: 2.88998e-06 [after_resolve]: 1.607e-05 [a_after_grad]: 1.074e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 4.36002e-06 [auto_monad_grad]: 1.66e-06 [auto_monad_eliminator]: 1.321e-05 [cse]: 2.258e-05 [a_3]: 5.283e-05 [py_interpret_to_execute_after_opt_a]: 2.549e-05 [slice_cell_reuse_recomputed_activation]: 5.21998e-06 [rewriter_after_opt_a]: 5.078e-05 [convert_after_rewriter]: 1.125e-05 [order_py_execute_after_rewriter]: 8.66002e-06 [mutable_eliminate]: 0.00082137 [opt_b]: 0.00031141, [1] [Cycle 1]: 0.00029869, [7] [b_1]: 0.0001768 [b_2]: 1.023e-05 [updatestate_depend_eliminate]: 1.173e-05 [updatestate_assign_eliminate]: 2.78e-06 [updatestate_loads_eliminate]: 2.97002e-06 [renormalize]: 1.14e-06 [cse]: 3.298e-05 [optimize_parallel_all_gather_comm]: 2.608e-05 [overlap_param_gather]: 5.19998e-06 [cconv]: 4.146e-05 [loop_unroll]: 0.00060279 [opt_after_cconv]: 0.00015147, [1] [Cycle 1]: 0.0001412, [7] [c_1]: 3.539e-05 [parameter_eliminate]: 6.22001e-06 [updatestate_depend_eliminate]: 8.33999e-06 [updatestate_assign_eliminate]: 2.76e-06 [updatestate_loads_eliminate]: 3.33e-06 [cse]: 2.509e-05 [renormalize]: 1.37e-06 [remove_dup_value]: 1.897e-05 [tuple_transform]: 0.00010002, [1] [Cycle 1]: 9.193e-05, [4] [d_1]: 5.114e-05 [none_parameter_eliminate]: 1.98997e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 7.50998e-06 [partial_unused_args_eliminate]: 4.78001e-06 [add_recomputation]: 6.021e-05 [cse_after_recomputation]: 2.983e-05, [1] [Cycle 1]: 2.213e-05, [1] [cse]: 1.199e-05 [environ_conv]: 9.61998e-06 [swap_dp_allreduce_reducescatter]: 7.89997e-06 [bias_add_comm_swap]: 5.75001e-06 [label_micro_interleaved_index]: 9.28002e-06 [label_fine_grained_interleaved_index]: 5.94e-06 [merge_cast_opt]: 4.12e-06 [slice_recompute_activation]: 4.82e-06 [micro_interleaved_order_control]: 5.22e-06 [assign_add_opt]: 3.73999e-06 [ForceFp32Comm]: 3.3e-06 [remove_cast_before_assign_add]: 4e-06 [full_micro_interleaved_order_control]: 4.60999e-06 [reorder_send_recv_between_fp_bp]: 5.97999e-06 [comm_op_add_attrs]: 3.5e-06 [add_comm_op_reuse_tag]: 3.36001e-06 [interleave_split_concat_branches]: 3.48999e-06 [interleave_parallel_branches]: 3.68e-06 [overlap_opt_shard_in_pipeline]: 3.90998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.1e-06 [control_data_broadcast_order]: 1.919e-05 [grouped_pairwise_exchange_alltoall]: 4.02998e-06 [offloading_packed_experts]: 7.4e-06 [overlap_recompute_and_grad_model_parallel]: 7.42002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.71001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.8e-06 [overlap_recompute_comm]: 4.71002e-06 [overlap_grad_ring_attention]: 7.40003e-06 [overlap_grad_flash_sp]: 2.84e-05 [begin_end_overlap_inline]: 3.03998e-06 [split_matmul_comm_elemetwise]: 4.51002e-06 [split_layernorm_comm]: 5.04e-06 [handle_group_info]: 3.63e-06 [symbol_engine_optimizer]: 0.00011253, [1] [Cycle 1]: 0.0001037, [6] [build]: 4.90999e-06 [elim_shapecalc]: 1.373e-05 [elim_not_effective]: 1.434e-05 [opt_reshape]: 8.37998e-06 [fold_const_symbol]: 1.129e-05 [renormalize]: 2.10013e-07 [detach_backward]: 5.24e-06 [pipeline_parallel_scheduler]: 1.76e-06 [auto_monad_reorder]: 2.435e-05 [get_jit_bprop_graph]: 2.51998e-06 [rewriter_after_jit_bprop_graph]: 6.77002e-06 [opt_after_jit_grad]: 0.00067975 [validate]: 4.831e-05 Sums bootstrap : 0.000453s : 1.82% type_inference : 0.006825s : 27.45% event_method : 0.000022s : 0.09% auto_monad : 0.000068s : 0.27% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.11% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000046s : 0.18% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000037s : 0.15% optimize.rewriter_before_opt_a : 0.000098s : 0.40% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000053s : 0.21% optimize.opt_a.loop_unroll : 0.000037s : 0.15% optimize.opt_a.a_1 : 0.000866s : 3.48% optimize.opt_a.with_stream_mark : 0.000044s : 0.18% optimize.opt_a.recompute_prepare : 0.000020s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000304s : 1.22% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.07% optimize.opt_a.shard : 0.000006s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000013s : 0.05% optimize.opt_a.merge_send_recv : 0.000020s : 0.08% optimize.opt_a.auto_parallel : 0.000020s : 0.08% optimize.opt_a.parallel : 0.000031s : 0.13% optimize.opt_a.flash_sp : 0.000014s : 0.06% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000007s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.07% optimize.opt_a.virtual_dataset : 0.000014s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.06% optimize.opt_a.virtual_output : 0.000013s : 0.05% optimize.opt_a.merge_forward : 0.000009s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000022s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.13% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000024s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.04% optimize.opt_a.meta_fg_expand : 0.000006s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000028s : 0.11% optimize.opt_a.a_after_grad : 0.000021s : 0.09% optimize.opt_a.renormalize : 0.012163s : 48.92% optimize.opt_a.add_forward_monad_depend : 0.000017s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000038s : 0.15% optimize.opt_a.cse : 0.000056s : 0.22% optimize.opt_a.a_3 : 0.000135s : 0.54% optimize.py_interpret_to_execute_after_opt_a : 0.000025s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000051s : 0.20% optimize.convert_after_rewriter : 0.000011s : 0.05% optimize.order_py_execute_after_rewriter : 0.000009s : 0.03% optimize.mutable_eliminate : 0.000821s : 3.30% optimize.opt_b.b_1 : 0.000177s : 0.71% optimize.opt_b.b_2 : 0.000010s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000033s : 0.13% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.10% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000041s : 0.17% optimize.loop_unroll : 0.000603s : 2.42% optimize.opt_after_cconv.c_1 : 0.000035s : 0.14% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000025s : 0.10% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000019s : 0.08% optimize.tuple_transform.d_1 : 0.000051s : 0.21% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000060s : 0.24% optimize.cse_after_recomputation.cse : 0.000012s : 0.05% optimize.environ_conv : 0.000010s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.03% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000009s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.02% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000003s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000019s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000007s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.03% optimize.overlap_grad_flash_sp : 0.000028s : 0.11% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000005s : 0.02% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.10% get_jit_bprop_graph : 0.000003s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.03% opt_after_jit_grad : 0.000680s : 2.73% validate : 0.000048s : 0.19% Time group info: ------[substitution.] 0.000267 34 17.28% : 0.000046s : 6: substitution.arithmetic_simplify 0.86% : 0.000002s : 2: substitution.elim_not_effective 0.63% : 0.000002s : 2: substitution.fold_const_symbol 2.48% : 0.000007s : 4: substitution.graph_param_transform 65.83% : 0.000176s : 4: substitution.inline 1.93% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.11% : 0.000006s : 4: substitution.remove_not_recompute_node 2.85% : 0.000008s : 4: substitution.replace_old_param 6.04% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006761 2 88.23% : 0.005965s : 1: type_inference.infer 11.77% : 0.000796s : 1: type_inference.specialize ------[replace.] 0.000070 8 63.36% : 0.000044s : 4: replace.inline 36.64% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000187 8 92.33% : 0.000173s : 4: match.inline 7.67% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000235 1278 1.07% : 0.000003s : 13: predicate.accumulaten_eliminater 0.91% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 0.93% : 0.000002s : 13: predicate.addn_zero_filter 0.73% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.61% : 0.000006s : 21: predicate.arithmetic_simplify 1.01% : 0.000002s : 13: predicate.cast_eliminate 0.62% : 0.000001s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.64% : 0.000001s : 8: predicate.depend_value_elim 0.91% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.00% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.33% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.36% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 17: predicate.environ_get_depend_swap 1.87% : 0.000004s : 25: predicate.environ_get_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.37% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.22% : 0.000005s : 21: predicate.float_depend_g_call 0.56% : 0.000001s : 8: predicate.float_environ_get_switch 0.88% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 4: predicate.fold_const_symbol 0.68% : 0.000002s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.50% : 0.000001s : 8: predicate.incorporate_call 0.57% : 0.000001s : 8: predicate.incorporate_call_switch 6.32% : 0.000015s : 58: predicate.inline 0.68% : 0.000002s : 8: predicate.inline_without_move 0.27% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.87% : 0.000002s : 8: predicate.less_batch_normalization 1.91% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.43% : 0.000006s : 38: predicate.load_eliminater 1.30% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.18% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.54% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.82% : 0.000002s : 8: predicate.merge_addn 0.54% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.96% : 0.000002s : 13: predicate.minmaximum_grad 1.80% : 0.000004s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.40% : 0.000001s : 4: predicate.parallel_virtual_node 1.55% : 0.000004s : 21: predicate.partial_defer_inline 1.42% : 0.000003s : 21: predicate.partial_eliminate 0.85% : 0.000002s : 13: predicate.print_const_string_wrapper 0.65% : 0.000002s : 8: predicate.reduce_all_const_elim 1.07% : 0.000003s : 13: predicate.reduce_eliminate 2.46% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.72% : 0.000002s : 8: predicate.remove_not_recompute_node 1.32% : 0.000003s : 25: predicate.replace_applicator 0.56% : 0.000001s : 8: predicate.replace_old_param 0.40% : 0.000001s : 4: predicate.reset_defer_inline 0.88% : 0.000002s : 13: predicate.reshape_eliminate 0.61% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 4: predicate.row_tensor_eliminate 0.90% : 0.000002s : 8: predicate.same_eliminate 0.39% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 8: predicate.shard_identity_eliminate 0.64% : 0.000002s : 8: predicate.special_op_eliminate 0.65% : 0.000002s : 8: predicate.specialize_transform 1.02% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.95% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.37% : 0.000003s : 21: predicate.switch_defer_inline 1.83% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.65% : 0.000011s : 67: predicate.switch_simplify 0.82% : 0.000002s : 13: predicate.tile_eliminate 0.89% : 0.000002s : 13: predicate.transpose_eliminate 1.55% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.29% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.53% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.50% : 0.000006s : 29: predicate.tuple_list_set_item_eliminator 1.72% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.16% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.97% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.73% : 0.000002s : 4: predicate.value_based_eliminate 0.61% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.79% : 0.000002s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000756 11 52.08% : 0.000394s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.92% : 0.000362s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.066301 192 0.01% : 0.000006s : 1: ForceFp32Comm 5.92% : 0.003925s : 1: add_attr 5.89% : 0.003904s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.10% : 0.000065s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.12% : 0.000078s : 1: auto_monad 0.05% : 0.000033s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.75% : 0.000499s : 1: bootstrap 0.07% : 0.000045s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.03% : 0.000023s : 1: control_data_broadcast_order 0.02% : 0.000014s : 1: convert_after_rewriter 0.05% : 0.000033s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000028s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.05% : 0.000033s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000012s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.02% : 0.000012s : 1: label_micro_interleaved_index 0.92% : 0.000611s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.25% : 0.000831s : 1: mutable_eliminate 0.02% : 0.000011s : 1: offloading_packed_experts 0.03% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000025s : 1: opt.transform.mutable_eliminate 2.04% : 0.001352s : 78: opt.transform.opt_a 0.05% : 0.000034s : 1: opt.transform.opt_after_cconv 0.05% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.17% : 0.000111s : 28: opt.transform.opt_b 0.08% : 0.000056s : 2: opt.transform.opt_trans_graph 0.06% : 0.000043s : 4: opt.transform.symbol_engine_opt 22.67% : 0.015032s : 1: opt_a 0.23% : 0.000155s : 1: opt_after_cconv 1.04% : 0.000693s : 1: opt_after_jit_grad 0.48% : 0.000316s : 1: opt_b 27.65% : 0.018335s : 1: optimize 0.04% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.05% : 0.000032s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.08% : 0.000053s : 1: pre_auto_parallel 0.06% : 0.000040s : 1: py_interpret_to_execute 0.05% : 0.000030s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.03% : 0.000023s : 1: remove_dup_value 17.52% : 0.011617s : 1: renormalize.infer 0.79% : 0.000526s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000055s : 1: rewriter_after_opt_a 0.15% : 0.000102s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000009s : 1: slice_recompute_activation 0.01% : 0.000009s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.17% : 0.000116s : 1: symbol_engine_optimizer 0.16% : 0.000103s : 1: tuple_transform 10.38% : 0.006879s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:01.671.060 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0781499, [21] [bootstrap]: 0.00042975 [type_inference]: 0.0667331 [event_method]: 1.858e-05 [auto_monad]: 6.476e-05 [graph_reusing]: 6.51e-06 [inline]: 2.71e-06 [add_attr]: 0.00443173, [1] [add_attr_with_inline]: 0.00441819, [1] [Cycle 1]: 7.824e-05, [2] [tag_attr]: 2.786e-05 [meta_addattr_fg_expand]: 6.11e-06 [parallel-infer-symbol]: 3.8e-06 [pre_auto_parallel]: 4.423e-05 [insert-virtual-dataset]: 2.74001e-06 [parallel-infer-symbol-second]: 9.69972e-07 [dataset_repeat_opt]: 2.11998e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.00557685, [53] [py_interpret_to_execute]: 3.49e-05 [rewriter_before_opt_a]: 9.854e-05 [opt_a]: 0.00323283, [2] [Cycle 1]: 0.00251904, [45] [expand_dump_flag]: 2.96001e-06 [switch_simplify]: 4.472e-05 [loop_unroll]: 3.074e-05 [a_1]: 0.00070066 [with_stream_mark]: 1.985e-05 [recompute_prepare]: 9.98002e-06 [updatestate_depend_eliminate]: 4.08001e-06 [updatestate_assign_eliminate]: 3.35998e-06 [updatestate_loads_eliminate]: 2.77002e-06 [parameter_eliminate]: 2.06e-06 [a_2]: 8.801e-05 [accelerated_algorithm]: 7.97e-06 [shard]: 1.84e-06 [meta_shard_fg_expand]: 2.14999e-06 [shard_inline]: 7.49002e-06 [merge_send_recv]: 9.43997e-06 [auto_parallel]: 7.45998e-06 [parallel]: 1.973e-05 [flash_sp]: 9.26002e-06 [merge_comm]: 3.88999e-06 [allreduce_fusion]: 3.53999e-06 [matmul_add_comm_reduction]: 1.058e-05 [allreduce_slice_to_reducescatter]: 8.60018e-07 [virtual_shard_identity]: 8.25e-06 [virtual_dataset]: 7.33999e-06 [get_grad_eliminate_]: 6.93998e-06 [virtual_output]: 6.91001e-06 [merge_forward]: 3.89002e-06 [cell_reuse_recompute_pass]: 1.21002e-06 [offload_activation]: 1.004e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.365e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.114e-05 [set_forward_comm_id_for_comm_node_pass]: 3.78001e-06 [meta_fg_expand]: 3.09001e-06 [flash_sp_send_recv_attached]: 2.71e-06 [receive_attached]: 2.49999e-06 [after_resolve]: 1.225e-05 [a_after_grad]: 1.067e-05 [renormalize]: 0.00104069 [add_forward_monad_depend]: 6.95998e-06 [auto_monad_grad]: 2.24999e-06 [auto_monad_eliminator]: 1.948e-05 [cse]: 3.193e-05 [a_3]: 5.323e-05 [Cycle 2]: 0.00070068, [45] [expand_dump_flag]: 2.32999e-06 [switch_simplify]: 7.95e-06 [loop_unroll]: 6.57002e-06 [a_1]: 0.00015042 [with_stream_mark]: 1.571e-05 [recompute_prepare]: 6.88998e-06 [updatestate_depend_eliminate]: 3.68999e-06 [updatestate_assign_eliminate]: 3.29001e-06 [updatestate_loads_eliminate]: 2.69999e-06 [parameter_eliminate]: 1.56998e-06 [a_2]: 7.915e-05 [accelerated_algorithm]: 6.36e-06 [shard]: 1.39998e-06 [meta_shard_fg_expand]: 1.84e-06 [shard_inline]: 5.97999e-06 [merge_send_recv]: 6.59999e-06 [auto_parallel]: 6.81001e-06 [parallel]: 5.89e-06 [flash_sp]: 6.94001e-06 [merge_comm]: 3.51999e-06 [allreduce_fusion]: 3.15002e-06 [matmul_add_comm_reduction]: 6.77002e-06 [allreduce_slice_to_reducescatter]: 4.50003e-07 [virtual_shard_identity]: 7.85e-06 [virtual_dataset]: 5.87999e-06 [get_grad_eliminate_]: 5.76003e-06 [virtual_output]: 5.91e-06 [merge_forward]: 3.23e-06 [cell_reuse_recompute_pass]: 1.84998e-06 [offload_activation]: 8.3e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.318e-05 [merge_recompute_call_nodes]: 1.20001e-06 [before_grad]: 1.042e-05 [set_forward_comm_id_for_comm_node_pass]: 3.37002e-06 [meta_fg_expand]: 2.49999e-06 [flash_sp_send_recv_attached]: 1.04998e-06 [receive_attached]: 1.57999e-06 [after_resolve]: 1.285e-05 [a_after_grad]: 1.01e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.44e-06 [auto_monad_grad]: 8.49977e-07 [auto_monad_eliminator]: 7.57998e-06 [cse]: 1.391e-05 [a_3]: 3.671e-05 [py_interpret_to_execute_after_opt_a]: 1.226e-05 [slice_cell_reuse_recomputed_activation]: 2.27999e-06 [rewriter_after_opt_a]: 3.904e-05 [convert_after_rewriter]: 7.56999e-06 [order_py_execute_after_rewriter]: 5.07999e-06 [mutable_eliminate]: 0.00072402 [opt_b]: 0.00021679, [1] [Cycle 1]: 0.00020917, [7] [b_1]: 0.00012863 [b_2]: 8.42e-06 [updatestate_depend_eliminate]: 7.98999e-06 [updatestate_assign_eliminate]: 2.99001e-06 [updatestate_loads_eliminate]: 2.59001e-06 [renormalize]: 8.29983e-07 [cse]: 2.157e-05 [optimize_parallel_all_gather_comm]: 1.791e-05 [overlap_param_gather]: 2.24999e-06 [cconv]: 2.977e-05 [loop_unroll]: 0.00046189 [opt_after_cconv]: 0.00011034, [1] [Cycle 1]: 0.00010362, [7] [c_1]: 3.211e-05 [parameter_eliminate]: 4.84e-06 [updatestate_depend_eliminate]: 6.66e-06 [updatestate_assign_eliminate]: 2.63998e-06 [updatestate_loads_eliminate]: 2.39001e-06 [cse]: 1.925e-05 [renormalize]: 5.29981e-07 [remove_dup_value]: 1.384e-05 [tuple_transform]: 8.31e-05, [1] [Cycle 1]: 7.793e-05, [4] [d_1]: 4.86e-05 [none_parameter_eliminate]: 1.94999e-06 [renormalize]: 3.30008e-07 [switch_simplify]: 7.72002e-06 [partial_unused_args_eliminate]: 1.91998e-06 [add_recomputation]: 5.185e-05 [cse_after_recomputation]: 2.198e-05, [1] [Cycle 1]: 1.707e-05, [1] [cse]: 1.136e-05 [environ_conv]: 5.55001e-06 [swap_dp_allreduce_reducescatter]: 5.15999e-06 [bias_add_comm_swap]: 2.74999e-06 [label_micro_interleaved_index]: 5.20999e-06 [label_fine_grained_interleaved_index]: 2.86e-06 [merge_cast_opt]: 1.37e-06 [slice_recompute_activation]: 2.46e-06 [micro_interleaved_order_control]: 2.20002e-06 [assign_add_opt]: 1.14003e-06 [ForceFp32Comm]: 8.59989e-07 [remove_cast_before_assign_add]: 9.70002e-07 [full_micro_interleaved_order_control]: 2.09e-06 [reorder_send_recv_between_fp_bp]: 3.01999e-06 [comm_op_add_attrs]: 1.07e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.26002e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.75001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.10002e-06 [control_data_broadcast_order]: 1.237e-05 [grouped_pairwise_exchange_alltoall]: 1.57001e-06 [offloading_packed_experts]: 3.97e-06 [overlap_recompute_and_grad_model_parallel]: 4.45e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.31002e-06 [overlap_recompute_comm]: 2.18998e-06 [overlap_grad_ring_attention]: 4.12e-06 [overlap_grad_flash_sp]: 2.039e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.76e-06 [split_layernorm_comm]: 1.78002e-06 [handle_group_info]: 1.11002e-06 [symbol_engine_optimizer]: 7.978e-05, [1] [Cycle 1]: 7.502e-05, [6] [build]: 4.02998e-06 [elim_shapecalc]: 1.067e-05 [elim_not_effective]: 1.398e-05 [opt_reshape]: 7.04001e-06 [fold_const_symbol]: 1.018e-05 [renormalize]: 2.00002e-07 [detach_backward]: 2.48e-06 [pipeline_parallel_scheduler]: 1.74998e-06 [auto_monad_reorder]: 1.617e-05 [get_jit_bprop_graph]: 2.70002e-06 [rewriter_after_jit_bprop_graph]: 5.91e-06 [opt_after_jit_grad]: 0.00059065 [validate]: 4.764e-05 Sums bootstrap : 0.000430s : 0.59% type_inference : 0.066733s : 91.77% event_method : 0.000019s : 0.03% auto_monad : 0.000065s : 0.09% graph_reusing : 0.000007s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000044s : 0.06% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000035s : 0.05% optimize.rewriter_before_opt_a : 0.000099s : 0.14% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000053s : 0.07% optimize.opt_a.loop_unroll : 0.000037s : 0.05% optimize.opt_a.a_1 : 0.000851s : 1.17% optimize.opt_a.with_stream_mark : 0.000036s : 0.05% optimize.opt_a.recompute_prepare : 0.000017s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000167s : 0.23% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.02% optimize.opt_a.merge_send_recv : 0.000016s : 0.02% optimize.opt_a.auto_parallel : 0.000014s : 0.02% optimize.opt_a.parallel : 0.000026s : 0.04% optimize.opt_a.flash_sp : 0.000016s : 0.02% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.02% optimize.opt_a.virtual_dataset : 0.000013s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.02% optimize.opt_a.virtual_output : 0.000013s : 0.02% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000018s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.03% optimize.opt_a.a_after_grad : 0.000021s : 0.03% optimize.opt_a.renormalize : 0.001041s : 1.43% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.04% optimize.opt_a.cse : 0.000046s : 0.06% optimize.opt_a.a_3 : 0.000090s : 0.12% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000039s : 0.05% optimize.convert_after_rewriter : 0.000008s : 0.01% optimize.order_py_execute_after_rewriter : 0.000005s : 0.01% optimize.mutable_eliminate : 0.000724s : 1.00% optimize.opt_b.b_1 : 0.000129s : 0.18% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000030s : 0.04% optimize.loop_unroll : 0.000462s : 0.64% optimize.opt_after_cconv.c_1 : 0.000032s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000014s : 0.02% optimize.tuple_transform.d_1 : 0.000049s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000052s : 0.07% optimize.cse_after_recomputation.cse : 0.000011s : 0.02% optimize.environ_conv : 0.000006s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000020s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000016s : 0.02% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000591s : 0.81% validate : 0.000048s : 0.07% Time group info: ------[substitution.] 0.000258 34 14.01% : 0.000036s : 6: substitution.arithmetic_simplify 0.87% : 0.000002s : 2: substitution.elim_not_effective 0.52% : 0.000001s : 2: substitution.fold_const_symbol 2.31% : 0.000006s : 4: substitution.graph_param_transform 68.87% : 0.000178s : 4: substitution.inline 1.79% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.62% : 0.000007s : 4: substitution.remove_not_recompute_node 2.28% : 0.000006s : 4: substitution.replace_old_param 6.73% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.066665 2 98.89% : 0.065928s : 1: type_inference.infer 1.11% : 0.000737s : 1: type_inference.specialize ------[replace.] 0.000069 8 62.32% : 0.000043s : 4: replace.inline 37.68% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000191 8 91.90% : 0.000175s : 4: match.inline 8.10% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000213 1278 0.96% : 0.000002s : 13: predicate.accumulaten_eliminater 0.81% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 8: predicate.addn_check_dump 1.00% : 0.000002s : 13: predicate.addn_zero_filter 0.77% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.74% : 0.000006s : 21: predicate.arithmetic_simplify 0.99% : 0.000002s : 13: predicate.cast_eliminate 0.61% : 0.000001s : 8: predicate.check_bprop_eliminate 0.55% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.66% : 0.000001s : 8: predicate.depend_value_elim 0.95% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.03% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.09% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 4: predicate.elim_not_effective 0.39% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.16% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_depend_swap 1.69% : 0.000004s : 25: predicate.environ_get_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.43% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.49% : 0.000005s : 21: predicate.float_depend_g_call 0.51% : 0.000001s : 8: predicate.float_environ_get_switch 0.78% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.68% : 0.000001s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.47% : 0.000001s : 8: predicate.incorporate_call_switch 6.18% : 0.000013s : 58: predicate.inline 0.80% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.87% : 0.000002s : 8: predicate.less_batch_normalization 1.71% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.47% : 0.000005s : 38: predicate.load_eliminater 0.87% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.32% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.90% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 8: predicate.merge_addn 0.53% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.63% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 13: predicate.minmaximum_grad 1.55% : 0.000003s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.44% : 0.000001s : 4: predicate.parallel_virtual_node 1.89% : 0.000004s : 21: predicate.partial_defer_inline 1.57% : 0.000003s : 21: predicate.partial_eliminate 0.84% : 0.000002s : 13: predicate.print_const_string_wrapper 0.59% : 0.000001s : 8: predicate.reduce_all_const_elim 1.26% : 0.000003s : 13: predicate.reduce_eliminate 2.44% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000001s : 8: predicate.remove_not_recompute_node 1.33% : 0.000003s : 25: predicate.replace_applicator 0.43% : 0.000001s : 8: predicate.replace_old_param 0.27% : 0.000001s : 4: predicate.reset_defer_inline 0.92% : 0.000002s : 13: predicate.reshape_eliminate 0.62% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 4: predicate.row_tensor_eliminate 0.77% : 0.000002s : 8: predicate.same_eliminate 0.47% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 8: predicate.shard_identity_eliminate 0.70% : 0.000001s : 8: predicate.special_op_eliminate 0.67% : 0.000001s : 8: predicate.specialize_transform 0.73% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.51% : 0.000003s : 21: predicate.switch_defer_inline 2.01% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.59% : 0.000012s : 67: predicate.switch_simplify 0.90% : 0.000002s : 13: predicate.tile_eliminate 0.83% : 0.000002s : 13: predicate.transpose_eliminate 1.44% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.32% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.15% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.78% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.31% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.07% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 4: predicate.value_based_eliminate 0.61% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000660 11 57.26% : 0.000378s : 5: func_graph_cloner_run.FuncGraphClonerGraph 42.74% : 0.000282s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.090636 192 0.00% : 0.000004s : 1: ForceFp32Comm 4.90% : 0.004438s : 1: add_attr 4.88% : 0.004422s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.06% : 0.000056s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.08% : 0.000070s : 1: auto_monad 0.02% : 0.000020s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.51% : 0.000459s : 1: bootstrap 0.04% : 0.000033s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000015s : 1: control_data_broadcast_order 0.01% : 0.000011s : 1: convert_after_rewriter 0.03% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.03% : 0.000025s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.52% : 0.000471s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.81% : 0.000735s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000020s : 1: opt.transform.mutable_eliminate 1.43% : 0.001298s : 78: opt.transform.opt_a 0.03% : 0.000031s : 1: opt.transform.opt_after_cconv 0.03% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.11% : 0.000104s : 28: opt.transform.opt_b 0.06% : 0.000054s : 2: opt.transform.opt_trans_graph 0.04% : 0.000038s : 4: opt.transform.symbol_engine_opt 3.57% : 0.003236s : 1: opt_a 0.13% : 0.000114s : 1: opt_after_cconv 0.66% : 0.000601s : 1: opt_after_jit_grad 0.24% : 0.000220s : 1: opt_b 6.16% : 0.005583s : 1: optimize 0.02% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.03% : 0.000024s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.05% : 0.000049s : 1: pre_auto_parallel 0.04% : 0.000039s : 1: py_interpret_to_execute 0.02% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000018s : 1: remove_dup_value 0.70% : 0.000638s : 1: renormalize.infer 0.43% : 0.000390s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000043s : 1: rewriter_after_opt_a 0.12% : 0.000105s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000083s : 1: symbol_engine_optimizer 0.09% : 0.000086s : 1: tuple_transform 73.65% : 0.066754s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:02.765.037 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:02.765.305 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.051475, [21] [bootstrap]: 0.00043537 [type_inference]: 0.00652692 [event_method]: 2.051e-05 [auto_monad]: 6.595e-05 [graph_reusing]: 5.47999e-06 [inline]: 2.49001e-06 [add_attr]: 0.00369068, [1] [add_attr_with_inline]: 0.00367727, [1] [Cycle 1]: 0.0001001, [2] [tag_attr]: 2.613e-05 [meta_addattr_fg_expand]: 6.17999e-06 [parallel-infer-symbol]: 3.41999e-06 [pre_auto_parallel]: 4.282e-05 [insert-virtual-dataset]: 2.39999e-06 [parallel-infer-symbol-second]: 8.60018e-07 [dataset_repeat_opt]: 2.39001e-06 [pipeline_split]: 1.61002e-06 [optimize]: 0.0390824, [53] [py_interpret_to_execute]: 3.463e-05 [rewriter_before_opt_a]: 9.753e-05 [opt_a]: 0.0361496, [2] [Cycle 1]: 0.0350915, [45] [expand_dump_flag]: 3.42997e-06 [switch_simplify]: 4.323e-05 [loop_unroll]: 3.027e-05 [a_1]: 0.00068846 [with_stream_mark]: 2.486e-05 [recompute_prepare]: 1.15e-05 [updatestate_depend_eliminate]: 5.05001e-06 [updatestate_assign_eliminate]: 3.81999e-06 [updatestate_loads_eliminate]: 3.23e-06 [parameter_eliminate]: 1.96e-06 [a_2]: 0.00011893 [accelerated_algorithm]: 8.69003e-06 [shard]: 2.44999e-06 [meta_shard_fg_expand]: 2.07001e-06 [shard_inline]: 7.06001e-06 [merge_send_recv]: 8.87e-06 [auto_parallel]: 8.72998e-06 [parallel]: 2.022e-05 [flash_sp]: 1.102e-05 [merge_comm]: 4.45999e-06 [allreduce_fusion]: 3.45003e-06 [matmul_add_comm_reduction]: 1.104e-05 [allreduce_slice_to_reducescatter]: 8.30012e-07 [virtual_shard_identity]: 1.041e-05 [virtual_dataset]: 7.23999e-06 [get_grad_eliminate_]: 6.91001e-06 [virtual_output]: 7.09001e-06 [merge_forward]: 4.52998e-06 [cell_reuse_recompute_pass]: 2.02999e-06 [offload_activation]: 1.146e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.718e-05 [merge_recompute_call_nodes]: 1.71998e-06 [before_grad]: 1.323e-05 [set_forward_comm_id_for_comm_node_pass]: 5.17e-06 [meta_fg_expand]: 3.44001e-06 [flash_sp_send_recv_attached]: 3.72002e-06 [receive_attached]: 2.54001e-06 [after_resolve]: 1.52e-05 [a_after_grad]: 1.23e-05 [renormalize]: 0.033326 [add_forward_monad_depend]: 1.254e-05 [auto_monad_grad]: 2.96001e-06 [auto_monad_eliminator]: 2.619e-05 [cse]: 3.289e-05 [a_3]: 8.24e-05 [Cycle 2]: 0.00103795, [45] [expand_dump_flag]: 2.47001e-06 [switch_simplify]: 9.67999e-06 [loop_unroll]: 6.76e-06 [a_1]: 0.00017644 [with_stream_mark]: 2.366e-05 [recompute_prepare]: 9.14e-06 [updatestate_depend_eliminate]: 5.40001e-06 [updatestate_assign_eliminate]: 3.58999e-06 [updatestate_loads_eliminate]: 3.26001e-06 [parameter_eliminate]: 2.79001e-06 [a_2]: 0.00011706 [accelerated_algorithm]: 8.27e-06 [shard]: 2.57001e-06 [meta_shard_fg_expand]: 2.48e-06 [shard_inline]: 7.05e-06 [merge_send_recv]: 9.62001e-06 [auto_parallel]: 1.255e-05 [parallel]: 1.052e-05 [flash_sp]: 5.22e-06 [merge_comm]: 4.84998e-06 [allreduce_fusion]: 3.91001e-06 [matmul_add_comm_reduction]: 1.027e-05 [allreduce_slice_to_reducescatter]: 1.47001e-06 [virtual_shard_identity]: 1.308e-05 [virtual_dataset]: 7.38999e-06 [get_grad_eliminate_]: 6.28e-06 [virtual_output]: 6.62002e-06 [merge_forward]: 5.70001e-06 [cell_reuse_recompute_pass]: 3.92998e-06 [offload_activation]: 1.274e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.752e-05 [merge_recompute_call_nodes]: 1.60001e-06 [before_grad]: 1.247e-05 [set_forward_comm_id_for_comm_node_pass]: 3.66001e-06 [meta_fg_expand]: 3.14001e-06 [flash_sp_send_recv_attached]: 2.08998e-06 [receive_attached]: 2.45002e-06 [after_resolve]: 1.488e-05 [a_after_grad]: 1.104e-05 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 3.63999e-06 [auto_monad_grad]: 2.22999e-06 [auto_monad_eliminator]: 1.122e-05 [cse]: 1.986e-05 [a_3]: 5.355e-05 [py_interpret_to_execute_after_opt_a]: 2.354e-05 [slice_cell_reuse_recomputed_activation]: 5.25999e-06 [rewriter_after_opt_a]: 4.81e-05 [convert_after_rewriter]: 1.108e-05 [order_py_execute_after_rewriter]: 8.80001e-06 [mutable_eliminate]: 0.00078997 [opt_b]: 0.00029665, [1] [Cycle 1]: 0.00028565, [7] [b_1]: 0.00017237 [b_2]: 8.53001e-06 [updatestate_depend_eliminate]: 9.59e-06 [updatestate_assign_eliminate]: 2.96999e-06 [updatestate_loads_eliminate]: 2.76999e-06 [renormalize]: 6.50005e-07 [cse]: 2.814e-05 [optimize_parallel_all_gather_comm]: 2.37e-05 [overlap_param_gather]: 5.40001e-06 [cconv]: 4.077e-05 [loop_unroll]: 0.00058476 [opt_after_cconv]: 0.00019667, [1] [Cycle 1]: 0.00018575, [7] [c_1]: 7.805e-05 [parameter_eliminate]: 6.93e-06 [updatestate_depend_eliminate]: 9.39998e-06 [updatestate_assign_eliminate]: 2.98e-06 [updatestate_loads_eliminate]: 2.68e-06 [cse]: 2.215e-05 [renormalize]: 5.89993e-07 [remove_dup_value]: 1.953e-05 [tuple_transform]: 0.00010278, [1] [Cycle 1]: 9.477e-05, [4] [d_1]: 5.352e-05 [none_parameter_eliminate]: 1.80001e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 7.16001e-06 [partial_unused_args_eliminate]: 4.80999e-06 [add_recomputation]: 6.025e-05 [cse_after_recomputation]: 3.131e-05, [1] [Cycle 1]: 2.302e-05, [1] [cse]: 1.263e-05 [environ_conv]: 9.27999e-06 [swap_dp_allreduce_reducescatter]: 8.44998e-06 [bias_add_comm_swap]: 6.23998e-06 [label_micro_interleaved_index]: 8.68001e-06 [label_fine_grained_interleaved_index]: 6.21e-06 [merge_cast_opt]: 4.23999e-06 [slice_recompute_activation]: 4.97999e-06 [micro_interleaved_order_control]: 6.83e-06 [assign_add_opt]: 4.28999e-06 [ForceFp32Comm]: 3.37002e-06 [remove_cast_before_assign_add]: 3.85998e-06 [full_micro_interleaved_order_control]: 4.77e-06 [reorder_send_recv_between_fp_bp]: 5.62999e-06 [comm_op_add_attrs]: 3.61001e-06 [add_comm_op_reuse_tag]: 3.21001e-06 [interleave_split_concat_branches]: 3.86999e-06 [interleave_parallel_branches]: 3.45e-06 [overlap_opt_shard_in_pipeline]: 3.7e-06 [overlap_opt_shard_grad_in_pipeline]: 4.52e-06 [control_data_broadcast_order]: 1.687e-05 [grouped_pairwise_exchange_alltoall]: 4.37e-06 [offloading_packed_experts]: 6.79001e-06 [overlap_recompute_and_grad_model_parallel]: 7.96001e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.02e-06 [overlap_recompute_allgather_and_fa_grad]: 3.84002e-06 [overlap_recompute_comm]: 5.77001e-06 [overlap_grad_ring_attention]: 7.7e-06 [overlap_grad_flash_sp]: 2.746e-05 [begin_end_overlap_inline]: 3.31999e-06 [split_matmul_comm_elemetwise]: 4.75001e-06 [split_layernorm_comm]: 4.21001e-06 [handle_group_info]: 3.48e-06 [symbol_engine_optimizer]: 0.00011057, [1] [Cycle 1]: 0.00010222, [6] [build]: 3.81001e-06 [elim_shapecalc]: 1.217e-05 [elim_not_effective]: 1.595e-05 [opt_reshape]: 8.45001e-06 [fold_const_symbol]: 1.111e-05 [renormalize]: 2.10013e-07 [detach_backward]: 5.04e-06 [pipeline_parallel_scheduler]: 2.01998e-06 [auto_monad_reorder]: 2.288e-05 [get_jit_bprop_graph]: 1.95001e-06 [rewriter_after_jit_bprop_graph]: 6.68e-06 [opt_after_jit_grad]: 0.00074813 [validate]: 5.051e-05 Sums bootstrap : 0.000435s : 0.95% type_inference : 0.006527s : 14.28% event_method : 0.000021s : 0.04% auto_monad : 0.000066s : 0.14% graph_reusing : 0.000005s : 0.01% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000043s : 0.09% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000035s : 0.08% optimize.rewriter_before_opt_a : 0.000098s : 0.21% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000053s : 0.12% optimize.opt_a.loop_unroll : 0.000037s : 0.08% optimize.opt_a.a_1 : 0.000865s : 1.89% optimize.opt_a.with_stream_mark : 0.000049s : 0.11% optimize.opt_a.recompute_prepare : 0.000021s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000236s : 0.52% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.04% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.03% optimize.opt_a.merge_send_recv : 0.000018s : 0.04% optimize.opt_a.auto_parallel : 0.000021s : 0.05% optimize.opt_a.parallel : 0.000031s : 0.07% optimize.opt_a.flash_sp : 0.000016s : 0.04% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.05% optimize.opt_a.virtual_dataset : 0.000015s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.03% optimize.opt_a.virtual_output : 0.000014s : 0.03% optimize.opt_a.merge_forward : 0.000010s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.01% optimize.opt_a.offload_activation : 0.000024s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.02% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000030s : 0.07% optimize.opt_a.a_after_grad : 0.000023s : 0.05% optimize.opt_a.renormalize : 0.033326s : 72.93% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.04% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000037s : 0.08% optimize.opt_a.cse : 0.000053s : 0.12% optimize.opt_a.a_3 : 0.000136s : 0.30% optimize.py_interpret_to_execute_after_opt_a : 0.000024s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000048s : 0.11% optimize.convert_after_rewriter : 0.000011s : 0.02% optimize.order_py_execute_after_rewriter : 0.000009s : 0.02% optimize.mutable_eliminate : 0.000790s : 1.73% optimize.opt_b.b_1 : 0.000172s : 0.38% optimize.opt_b.b_2 : 0.000009s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000028s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.05% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000041s : 0.09% optimize.loop_unroll : 0.000585s : 1.28% optimize.opt_after_cconv.c_1 : 0.000078s : 0.17% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000022s : 0.05% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.04% optimize.tuple_transform.d_1 : 0.000054s : 0.12% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000060s : 0.13% optimize.cse_after_recomputation.cse : 0.000013s : 0.03% optimize.environ_conv : 0.000009s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000009s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000007s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000006s : 0.01% optimize.overlap_grad_ring_attention : 0.000008s : 0.02% optimize.overlap_grad_flash_sp : 0.000027s : 0.06% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000023s : 0.05% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000748s : 1.64% validate : 0.000051s : 0.11% Time group info: ------[substitution.] 0.000262 34 17.90% : 0.000047s : 6: substitution.arithmetic_simplify 0.86% : 0.000002s : 2: substitution.elim_not_effective 0.69% : 0.000002s : 2: substitution.fold_const_symbol 2.54% : 0.000007s : 4: substitution.graph_param_transform 64.66% : 0.000169s : 4: substitution.inline 2.26% : 0.000006s : 4: substitution.j_node_and_user_rematch 2.32% : 0.000006s : 4: substitution.remove_not_recompute_node 2.56% : 0.000007s : 4: substitution.replace_old_param 6.21% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006464 2 88.01% : 0.005689s : 1: type_inference.infer 11.99% : 0.000775s : 1: type_inference.specialize ------[replace.] 0.000066 8 64.44% : 0.000043s : 4: replace.inline 35.56% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000181 8 92.04% : 0.000167s : 4: match.inline 7.96% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000238 1278 1.01% : 0.000002s : 13: predicate.accumulaten_eliminater 1.09% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 8: predicate.addn_check_dump 0.83% : 0.000002s : 13: predicate.addn_zero_filter 0.76% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.78% : 0.000007s : 21: predicate.arithmetic_simplify 0.92% : 0.000002s : 13: predicate.cast_eliminate 0.72% : 0.000002s : 8: predicate.check_bprop_eliminate 0.51% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.61% : 0.000001s : 8: predicate.depend_value_elim 0.88% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.09% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.80% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.26% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.49% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 17: predicate.environ_add_const_eliminate 0.95% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.97% : 0.000002s : 17: predicate.environ_get_depend_swap 1.65% : 0.000004s : 25: predicate.environ_get_eliminate 0.97% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.28% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.37% : 0.000006s : 21: predicate.float_depend_g_call 0.58% : 0.000001s : 8: predicate.float_environ_get_switch 0.84% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.67% : 0.000002s : 8: predicate.get_grad_eliminate 0.30% : 0.000001s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.41% : 0.000001s : 8: predicate.incorporate_call_switch 6.13% : 0.000015s : 58: predicate.inline 1.12% : 0.000003s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.00% : 0.000002s : 8: predicate.less_batch_normalization 1.60% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.35% : 0.000006s : 38: predicate.load_eliminater 1.44% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.14% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.74% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 8: predicate.merge_addn 0.51% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.76% : 0.000002s : 13: predicate.minmaximum_grad 1.80% : 0.000004s : 4: predicate.mutable_eliminate 0.49% : 0.000001s : 4: predicate.opt_reshape 0.33% : 0.000001s : 4: predicate.parallel_virtual_node 1.77% : 0.000004s : 21: predicate.partial_defer_inline 1.44% : 0.000003s : 21: predicate.partial_eliminate 0.81% : 0.000002s : 13: predicate.print_const_string_wrapper 0.62% : 0.000001s : 8: predicate.reduce_all_const_elim 1.11% : 0.000003s : 13: predicate.reduce_eliminate 2.27% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.57% : 0.000001s : 8: predicate.remove_not_recompute_node 1.47% : 0.000004s : 25: predicate.replace_applicator 0.69% : 0.000002s : 8: predicate.replace_old_param 0.23% : 0.000001s : 4: predicate.reset_defer_inline 0.96% : 0.000002s : 13: predicate.reshape_eliminate 0.57% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.96% : 0.000002s : 8: predicate.same_eliminate 0.59% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 8: predicate.shard_identity_eliminate 0.64% : 0.000002s : 8: predicate.special_op_eliminate 0.77% : 0.000002s : 8: predicate.specialize_transform 1.06% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.91% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.27% : 0.000003s : 21: predicate.switch_defer_inline 1.76% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.47% : 0.000011s : 67: predicate.switch_simplify 1.11% : 0.000003s : 13: predicate.tile_eliminate 0.87% : 0.000002s : 13: predicate.transpose_eliminate 1.61% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.29% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.52% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.14% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.82% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.18% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.21% : 0.000008s : 46: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 4: predicate.value_based_eliminate 0.65% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.73% : 0.000002s : 8: predicate.virtual_output_eliminate 0.29% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000694 11 47.18% : 0.000328s : 5: func_graph_cloner_run.FuncGraphClonerGraph 52.82% : 0.000367s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.129119 192 0.00% : 0.000006s : 1: ForceFp32Comm 2.87% : 0.003703s : 1: add_attr 2.85% : 0.003681s : 1: add_attr_with_inline 0.01% : 0.000008s : 1: add_comm_op_reuse_tag 0.05% : 0.000064s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.06% : 0.000076s : 1: auto_monad 0.02% : 0.000031s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.37% : 0.000479s : 1: bootstrap 0.03% : 0.000044s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.02% : 0.000020s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.03% : 0.000035s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000027s : 1: detach_backward 0.01% : 0.000013s : 1: environ_conv 0.03% : 0.000032s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000012s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000012s : 1: label_micro_interleaved_index 0.46% : 0.000592s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000010s : 1: micro_interleaved_order_control 0.62% : 0.000798s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000024s : 1: opt.transform.mutable_eliminate 1.05% : 0.001358s : 78: opt.transform.opt_a 0.06% : 0.000076s : 1: opt.transform.opt_after_cconv 0.03% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.08% : 0.000106s : 28: opt.transform.opt_b 0.04% : 0.000058s : 2: opt.transform.opt_trans_graph 0.03% : 0.000043s : 4: opt.transform.symbol_engine_opt 28.00% : 0.036154s : 1: opt_a 0.16% : 0.000201s : 1: opt_after_cconv 0.59% : 0.000761s : 1: opt_after_jit_grad 0.23% : 0.000300s : 1: opt_b 30.64% : 0.039560s : 1: optimize 0.02% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000013s : 1: order_py_execute_after_rewriter 0.02% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000009s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.04% : 0.000050s : 1: pre_auto_parallel 0.03% : 0.000039s : 1: py_interpret_to_execute 0.02% : 0.000027s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.02% : 0.000023s : 1: remove_dup_value 25.37% : 0.032752s : 1: renormalize.infer 0.43% : 0.000553s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000052s : 1: rewriter_after_opt_a 0.08% : 0.000102s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000009s : 1: slice_recompute_activation 0.01% : 0.000009s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000114s : 1: symbol_engine_optimizer 0.08% : 0.000106s : 1: tuple_transform 5.09% : 0.006577s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:03.702.465 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0284435, [21] [bootstrap]: 0.00042881 [type_inference]: 0.0176095 [event_method]: 2.384e-05 [auto_monad]: 7.211e-05 [graph_reusing]: 6.53e-06 [inline]: 2.89001e-06 [add_attr]: 0.00411321, [1] [add_attr_with_inline]: 0.00409958, [1] [Cycle 1]: 8.034e-05, [2] [tag_attr]: 2.798e-05 [meta_addattr_fg_expand]: 6.36e-06 [parallel-infer-symbol]: 3.65e-06 [pre_auto_parallel]: 4.372e-05 [insert-virtual-dataset]: 2.64001e-06 [parallel-infer-symbol-second]: 8.49977e-07 [dataset_repeat_opt]: 2.26e-06 [pipeline_split]: 1.72001e-06 [optimize]: 0.00540004, [53] [py_interpret_to_execute]: 3.343e-05 [rewriter_before_opt_a]: 9.494e-05 [opt_a]: 0.00307518, [2] [Cycle 1]: 0.00235641, [45] [expand_dump_flag]: 2.96999e-06 [switch_simplify]: 4.541e-05 [loop_unroll]: 3.023e-05 [a_1]: 0.00068494 [with_stream_mark]: 2.19e-05 [recompute_prepare]: 9.56998e-06 [updatestate_depend_eliminate]: 4.25e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 3.05998e-06 [parameter_eliminate]: 2.01998e-06 [a_2]: 8.824e-05 [accelerated_algorithm]: 7.39002e-06 [shard]: 1.99e-06 [meta_shard_fg_expand]: 1.80001e-06 [shard_inline]: 7.17002e-06 [merge_send_recv]: 8.60001e-06 [auto_parallel]: 8.24998e-06 [parallel]: 1.999e-05 [flash_sp]: 9.27001e-06 [merge_comm]: 4.3e-06 [allreduce_fusion]: 3.51999e-06 [matmul_add_comm_reduction]: 1.101e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 9.25001e-06 [virtual_dataset]: 7e-06 [get_grad_eliminate_]: 6.82002e-06 [virtual_output]: 6.83e-06 [merge_forward]: 3.65998e-06 [cell_reuse_recompute_pass]: 1.15001e-06 [offload_activation]: 1.031e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.232e-05 [merge_recompute_call_nodes]: 1.97999e-06 [before_grad]: 1.09e-05 [set_forward_comm_id_for_comm_node_pass]: 4.17e-06 [meta_fg_expand]: 3.14999e-06 [flash_sp_send_recv_attached]: 2.71e-06 [receive_attached]: 2.53e-06 [after_resolve]: 1.14e-05 [a_after_grad]: 1.081e-05 [renormalize]: 0.00089126 [add_forward_monad_depend]: 8.02e-06 [auto_monad_grad]: 2.69001e-06 [auto_monad_eliminator]: 1.98e-05 [cse]: 3.216e-05 [a_3]: 5.239e-05 [Cycle 2]: 0.00070643, [45] [expand_dump_flag]: 2.26e-06 [switch_simplify]: 8.32e-06 [loop_unroll]: 6.37001e-06 [a_1]: 0.00015263 [with_stream_mark]: 1.548e-05 [recompute_prepare]: 7.05e-06 [updatestate_depend_eliminate]: 3.35e-06 [updatestate_assign_eliminate]: 2.19999e-06 [updatestate_loads_eliminate]: 2.93e-06 [parameter_eliminate]: 1.42e-06 [a_2]: 7.777e-05 [accelerated_algorithm]: 6.91001e-06 [shard]: 2.36e-06 [meta_shard_fg_expand]: 1.67001e-06 [shard_inline]: 6.30002e-06 [merge_send_recv]: 6.39999e-06 [auto_parallel]: 7.43999e-06 [parallel]: 6.21998e-06 [flash_sp]: 3.49001e-06 [merge_comm]: 3.36001e-06 [allreduce_fusion]: 3.61999e-06 [matmul_add_comm_reduction]: 7.26999e-06 [allreduce_slice_to_reducescatter]: 5.3001e-07 [virtual_shard_identity]: 7.61999e-06 [virtual_dataset]: 9.28002e-06 [get_grad_eliminate_]: 5.94999e-06 [virtual_output]: 5.59e-06 [merge_forward]: 3.71001e-06 [cell_reuse_recompute_pass]: 2.68e-06 [offload_activation]: 7.88001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.116e-05 [merge_recompute_call_nodes]: 1.14e-06 [before_grad]: 9.41003e-06 [set_forward_comm_id_for_comm_node_pass]: 5.37999e-06 [meta_fg_expand]: 2.21e-06 [flash_sp_send_recv_attached]: 1.17e-06 [receive_attached]: 2.13998e-06 [after_resolve]: 1.303e-05 [a_after_grad]: 1.015e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.04999e-06 [auto_monad_grad]: 1.31002e-06 [auto_monad_eliminator]: 8.15e-06 [cse]: 1.583e-05 [a_3]: 3.732e-05 [py_interpret_to_execute_after_opt_a]: 1.207e-05 [slice_cell_reuse_recomputed_activation]: 2.51998e-06 [rewriter_after_opt_a]: 3.72e-05 [convert_after_rewriter]: 7.20998e-06 [order_py_execute_after_rewriter]: 5.21002e-06 [mutable_eliminate]: 0.00073157 [opt_b]: 0.00021085, [1] [Cycle 1]: 0.00020401, [7] [b_1]: 0.00012751 [b_2]: 8.80001e-06 [updatestate_depend_eliminate]: 7.33999e-06 [updatestate_assign_eliminate]: 2.44001e-06 [updatestate_loads_eliminate]: 2.62001e-06 [renormalize]: 5.89993e-07 [cse]: 1.926e-05 [optimize_parallel_all_gather_comm]: 1.712e-05 [overlap_param_gather]: 1.94999e-06 [cconv]: 3.133e-05 [loop_unroll]: 0.0004516 [opt_after_cconv]: 0.00010661, [1] [Cycle 1]: 0.00010059, [7] [c_1]: 3.152e-05 [parameter_eliminate]: 4.17e-06 [updatestate_depend_eliminate]: 5.67001e-06 [updatestate_assign_eliminate]: 2.47001e-06 [updatestate_loads_eliminate]: 2.42001e-06 [cse]: 1.91e-05 [renormalize]: 7.79983e-07 [remove_dup_value]: 1.458e-05 [tuple_transform]: 7.835e-05, [1] [Cycle 1]: 7.348e-05, [4] [d_1]: 4.571e-05 [none_parameter_eliminate]: 1.79e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 6.96001e-06 [partial_unused_args_eliminate]: 1.96e-06 [add_recomputation]: 5.12e-05 [cse_after_recomputation]: 2.193e-05, [1] [Cycle 1]: 1.705e-05, [1] [cse]: 1.126e-05 [environ_conv]: 5.69e-06 [swap_dp_allreduce_reducescatter]: 5.21002e-06 [bias_add_comm_swap]: 3.18e-06 [label_micro_interleaved_index]: 4.84e-06 [label_fine_grained_interleaved_index]: 2.86999e-06 [merge_cast_opt]: 1.30999e-06 [slice_recompute_activation]: 2.29001e-06 [micro_interleaved_order_control]: 2.41998e-06 [assign_add_opt]: 1.45999e-06 [ForceFp32Comm]: 1.22e-06 [remove_cast_before_assign_add]: 1.05999e-06 [full_micro_interleaved_order_control]: 2.14e-06 [reorder_send_recv_between_fp_bp]: 3.27002e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.30001e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.26002e-06 [overlap_opt_shard_grad_in_pipeline]: 1.79e-06 [control_data_broadcast_order]: 1.388e-05 [grouped_pairwise_exchange_alltoall]: 1.55999e-06 [offloading_packed_experts]: 3.54002e-06 [overlap_recompute_and_grad_model_parallel]: 5.07999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.42999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.37001e-06 [overlap_grad_ring_attention]: 4.16001e-06 [overlap_grad_flash_sp]: 2.255e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.11003e-06 [split_layernorm_comm]: 1.57001e-06 [handle_group_info]: 1.05001e-06 [symbol_engine_optimizer]: 8.145e-05, [1] [Cycle 1]: 7.712e-05, [6] [build]: 3.48999e-06 [elim_shapecalc]: 1.174e-05 [elim_not_effective]: 1.398e-05 [opt_reshape]: 7.27002e-06 [fold_const_symbol]: 1.083e-05 [renormalize]: 1.80007e-07 [detach_backward]: 2.17999e-06 [pipeline_parallel_scheduler]: 1.89e-06 [auto_monad_reorder]: 1.785e-05 [get_jit_bprop_graph]: 2.31e-06 [rewriter_after_jit_bprop_graph]: 5.52001e-06 [opt_after_jit_grad]: 0.00048569 [validate]: 4.327e-05 Sums bootstrap : 0.000429s : 1.84% type_inference : 0.017609s : 75.48% event_method : 0.000024s : 0.10% auto_monad : 0.000072s : 0.31% graph_reusing : 0.000007s : 0.03% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.12% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000044s : 0.19% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.14% optimize.rewriter_before_opt_a : 0.000095s : 0.41% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000054s : 0.23% optimize.opt_a.loop_unroll : 0.000037s : 0.16% optimize.opt_a.a_1 : 0.000838s : 3.59% optimize.opt_a.with_stream_mark : 0.000037s : 0.16% optimize.opt_a.recompute_prepare : 0.000017s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000166s : 0.71% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.06% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.06% optimize.opt_a.merge_send_recv : 0.000015s : 0.06% optimize.opt_a.auto_parallel : 0.000016s : 0.07% optimize.opt_a.parallel : 0.000026s : 0.11% optimize.opt_a.flash_sp : 0.000013s : 0.05% optimize.opt_a.merge_comm : 0.000008s : 0.03% optimize.opt_a.allreduce_fusion : 0.000007s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.07% optimize.opt_a.virtual_dataset : 0.000016s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.05% optimize.opt_a.virtual_output : 0.000012s : 0.05% optimize.opt_a.merge_forward : 0.000007s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000020s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.04% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000024s : 0.10% optimize.opt_a.a_after_grad : 0.000021s : 0.09% optimize.opt_a.renormalize : 0.000891s : 3.82% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.12% optimize.opt_a.cse : 0.000048s : 0.21% optimize.opt_a.a_3 : 0.000090s : 0.38% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000037s : 0.16% optimize.convert_after_rewriter : 0.000007s : 0.03% optimize.order_py_execute_after_rewriter : 0.000005s : 0.02% optimize.mutable_eliminate : 0.000732s : 3.14% optimize.opt_b.b_1 : 0.000128s : 0.55% optimize.opt_b.b_2 : 0.000009s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000019s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000031s : 0.13% optimize.loop_unroll : 0.000452s : 1.94% optimize.opt_after_cconv.c_1 : 0.000032s : 0.14% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000019s : 0.08% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.06% optimize.tuple_transform.d_1 : 0.000046s : 0.20% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000051s : 0.22% optimize.cse_after_recomputation.cse : 0.000011s : 0.05% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000023s : 0.10% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.08% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000486s : 2.08% validate : 0.000043s : 0.19% Time group info: ------[substitution.] 0.000247 34 14.79% : 0.000036s : 6: substitution.arithmetic_simplify 0.89% : 0.000002s : 2: substitution.elim_not_effective 0.67% : 0.000002s : 2: substitution.fold_const_symbol 2.62% : 0.000006s : 4: substitution.graph_param_transform 68.59% : 0.000169s : 4: substitution.inline 1.83% : 0.000004s : 4: substitution.j_node_and_user_rematch 1.85% : 0.000005s : 4: substitution.remove_not_recompute_node 2.30% : 0.000006s : 4: substitution.replace_old_param 6.46% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.017516 2 93.14% : 0.016315s : 1: type_inference.infer 6.86% : 0.001202s : 1: type_inference.specialize ------[replace.] 0.000065 8 64.47% : 0.000042s : 4: replace.inline 35.53% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000181 8 92.16% : 0.000166s : 4: match.inline 7.84% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000219 1278 0.93% : 0.000002s : 13: predicate.accumulaten_eliminater 0.75% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.63% : 0.000001s : 8: predicate.addn_check_dump 0.92% : 0.000002s : 13: predicate.addn_zero_filter 0.83% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.55% : 0.000006s : 21: predicate.arithmetic_simplify 0.83% : 0.000002s : 13: predicate.cast_eliminate 0.66% : 0.000001s : 8: predicate.check_bprop_eliminate 0.53% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.64% : 0.000001s : 8: predicate.depend_value_elim 0.91% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.19% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.94% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 4: predicate.elim_not_effective 0.69% : 0.000002s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.12% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.97% : 0.000002s : 17: predicate.environ_get_depend_swap 1.59% : 0.000003s : 25: predicate.environ_get_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.36% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.37% : 0.000005s : 21: predicate.float_depend_g_call 0.62% : 0.000001s : 8: predicate.float_environ_get_switch 0.74% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.62% : 0.000001s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.66% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 6.44% : 0.000014s : 58: predicate.inline 0.73% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.17% : 0.000003s : 8: predicate.less_batch_normalization 1.85% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.41% : 0.000005s : 38: predicate.load_eliminater 0.94% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.20% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.63% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 8: predicate.merge_addn 0.54% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.74% : 0.000002s : 13: predicate.minmaximum_grad 1.41% : 0.000003s : 4: predicate.mutable_eliminate 0.39% : 0.000001s : 4: predicate.opt_reshape 0.37% : 0.000001s : 4: predicate.parallel_virtual_node 1.90% : 0.000004s : 21: predicate.partial_defer_inline 1.60% : 0.000003s : 21: predicate.partial_eliminate 0.95% : 0.000002s : 13: predicate.print_const_string_wrapper 0.63% : 0.000001s : 8: predicate.reduce_all_const_elim 1.25% : 0.000003s : 13: predicate.reduce_eliminate 2.58% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 8: predicate.remove_not_recompute_node 1.40% : 0.000003s : 25: predicate.replace_applicator 0.50% : 0.000001s : 8: predicate.replace_old_param 0.36% : 0.000001s : 4: predicate.reset_defer_inline 0.83% : 0.000002s : 13: predicate.reshape_eliminate 0.63% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 4: predicate.row_tensor_eliminate 0.71% : 0.000002s : 8: predicate.same_eliminate 0.41% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.28% : 0.000003s : 8: predicate.shard_identity_eliminate 0.69% : 0.000002s : 8: predicate.special_op_eliminate 0.64% : 0.000001s : 8: predicate.specialize_transform 0.89% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.44% : 0.000003s : 21: predicate.switch_defer_inline 1.86% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.19% : 0.000011s : 67: predicate.switch_simplify 1.11% : 0.000002s : 13: predicate.tile_eliminate 1.12% : 0.000002s : 13: predicate.transpose_eliminate 1.38% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.46% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.37% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.07% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.60% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.27% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.65% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.38% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.90% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 4: predicate.value_based_eliminate 0.70% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000740 11 48.28% : 0.000357s : 5: func_graph_cloner_run.FuncGraphClonerGraph 51.72% : 0.000383s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.040266 192 0.01% : 0.000004s : 1: ForceFp32Comm 10.23% : 0.004120s : 1: add_attr 10.19% : 0.004104s : 1: add_attr_with_inline 0.01% : 0.000005s : 1: add_comm_op_reuse_tag 0.14% : 0.000055s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.19% : 0.000077s : 1: auto_monad 0.05% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.15% : 0.000462s : 1: bootstrap 0.09% : 0.000035s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000017s : 1: control_data_broadcast_order 0.03% : 0.000010s : 1: convert_after_rewriter 0.06% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000006s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.08% : 0.000030s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 1.14% : 0.000461s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.84% : 0.000742s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.04% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000017s : 1: opt.transform.mutable_eliminate 3.18% : 0.001281s : 78: opt.transform.opt_a 0.07% : 0.000030s : 1: opt.transform.opt_after_cconv 0.06% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.26% : 0.000104s : 28: opt.transform.opt_b 0.13% : 0.000051s : 2: opt.transform.opt_trans_graph 0.10% : 0.000040s : 4: opt.transform.symbol_engine_opt 7.65% : 0.003079s : 1: opt_a 0.27% : 0.000110s : 1: opt_after_cconv 1.23% : 0.000494s : 1: opt_after_jit_grad 0.53% : 0.000214s : 1: opt_b 13.42% : 0.005406s : 1: optimize 0.05% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000008s : 1: order_py_execute_after_rewriter 0.06% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.12% : 0.000048s : 1: pre_auto_parallel 0.09% : 0.000038s : 1: py_interpret_to_execute 0.04% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000018s : 1: remove_dup_value 1.19% : 0.000481s : 1: renormalize.infer 0.99% : 0.000400s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000041s : 1: rewriter_after_opt_a 0.25% : 0.000099s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.21% : 0.000084s : 1: symbol_engine_optimizer 0.20% : 0.000081s : 1: tuple_transform 43.80% : 0.017638s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:04.667.329 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:04.667.607 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0706305, [21] [bootstrap]: 0.00042522 [type_inference]: 0.0585627 [event_method]: 2.409e-05 [auto_monad]: 6.873e-05 [graph_reusing]: 6.12999e-06 [inline]: 3.75e-06 [add_attr]: 0.00377451, [1] [add_attr_with_inline]: 0.0037627, [1] [Cycle 1]: 9.669e-05, [2] [tag_attr]: 2.458e-05 [meta_addattr_fg_expand]: 6.14001e-06 [parallel-infer-symbol]: 3.73001e-06 [pre_auto_parallel]: 4.321e-05 [insert-virtual-dataset]: 2.44001e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 1.94e-06 [pipeline_split]: 1.62999e-06 [optimize]: 0.00631582, [53] [py_interpret_to_execute]: 3.47e-05 [rewriter_before_opt_a]: 9.612e-05 [opt_a]: 0.00355449, [2] [Cycle 1]: 0.00242364, [45] [expand_dump_flag]: 3.03998e-06 [switch_simplify]: 4.408e-05 [loop_unroll]: 3.108e-05 [a_1]: 0.00065892 [with_stream_mark]: 2.202e-05 [recompute_prepare]: 1.047e-05 [updatestate_depend_eliminate]: 4.57e-06 [updatestate_assign_eliminate]: 3.56999e-06 [updatestate_loads_eliminate]: 3.03e-06 [parameter_eliminate]: 2.11e-06 [a_2]: 0.00011194 [accelerated_algorithm]: 7.58999e-06 [shard]: 2.24999e-06 [meta_shard_fg_expand]: 2.24001e-06 [shard_inline]: 7.05002e-06 [merge_send_recv]: 8.67e-06 [auto_parallel]: 7.97e-06 [parallel]: 1.874e-05 [flash_sp]: 8.85999e-06 [merge_comm]: 4.20999e-06 [allreduce_fusion]: 3.67002e-06 [matmul_add_comm_reduction]: 1.071e-05 [allreduce_slice_to_reducescatter]: 1.04e-06 [virtual_shard_identity]: 8.85999e-06 [virtual_dataset]: 7.16001e-06 [get_grad_eliminate_]: 6.40997e-06 [virtual_output]: 6.66e-06 [merge_forward]: 3.95998e-06 [cell_reuse_recompute_pass]: 1.22e-06 [offload_activation]: 1.124e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.511e-05 [merge_recompute_call_nodes]: 1.41002e-06 [before_grad]: 1.087e-05 [set_forward_comm_id_for_comm_node_pass]: 3.49001e-06 [meta_fg_expand]: 3.4e-06 [flash_sp_send_recv_attached]: 3.65e-06 [receive_attached]: 2.73e-06 [after_resolve]: 1.328e-05 [a_after_grad]: 1.009e-05 [renormalize]: 0.00080584 [add_forward_monad_depend]: 6.78003e-06 [auto_monad_grad]: 2.42001e-06 [auto_monad_eliminator]: 1.761e-05 [cse]: 3.156e-05 [a_3]: 6.519e-05 [Cycle 2]: 0.00111344, [45] [expand_dump_flag]: 1.96e-06 [switch_simplify]: 8.17e-06 [loop_unroll]: 6.30002e-06 [a_1]: 0.00013372 [with_stream_mark]: 1.423e-05 [recompute_prepare]: 6.71e-06 [updatestate_depend_eliminate]: 3.28998e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 2.22999e-06 [parameter_eliminate]: 1.54e-06 [a_2]: 0.00027354 [accelerated_algorithm]: 8.84e-06 [shard]: 2.81999e-06 [meta_shard_fg_expand]: 2.30002e-06 [shard_inline]: 6.81999e-06 [merge_send_recv]: 9.51e-06 [auto_parallel]: 8.28999e-06 [parallel]: 7.12002e-06 [flash_sp]: 4.79e-06 [merge_comm]: 3.72002e-06 [allreduce_fusion]: 3.51001e-06 [matmul_add_comm_reduction]: 9.07001e-06 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 8.17e-06 [virtual_dataset]: 6.53998e-06 [get_grad_eliminate_]: 7.53e-06 [virtual_output]: 5.91e-06 [merge_forward]: 4.42998e-06 [cell_reuse_recompute_pass]: 2.74001e-06 [offload_activation]: 1.128e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.926e-05 [merge_recompute_call_nodes]: 1.30001e-06 [before_grad]: 1.135e-05 [set_forward_comm_id_for_comm_node_pass]: 3.84002e-06 [meta_fg_expand]: 2.26998e-06 [flash_sp_send_recv_attached]: 1.85001e-06 [receive_attached]: 1.86e-06 [after_resolve]: 1.289e-05 [a_after_grad]: 9.77001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.71e-06 [auto_monad_grad]: 1.72999e-06 [auto_monad_eliminator]: 1.334e-05 [cse]: 2.143e-05 [a_3]: 5.629e-05 [py_interpret_to_execute_after_opt_a]: 1.756e-05 [slice_cell_reuse_recomputed_activation]: 5.20001e-06 [rewriter_after_opt_a]: 4.782e-05 [convert_after_rewriter]: 1.078e-05 [order_py_execute_after_rewriter]: 8.62998e-06 [mutable_eliminate]: 0.0007539 [opt_b]: 0.00031695, [1] [Cycle 1]: 0.00030412, [7] [b_1]: 0.00018984 [b_2]: 9.20999e-06 [updatestate_depend_eliminate]: 9.22999e-06 [updatestate_assign_eliminate]: 3.26999e-06 [updatestate_loads_eliminate]: 2.93998e-06 [renormalize]: 7.30011e-07 [cse]: 2.744e-05 [optimize_parallel_all_gather_comm]: 2.168e-05 [overlap_param_gather]: 4.62e-06 [cconv]: 3.812e-05 [loop_unroll]: 0.00053063 [opt_after_cconv]: 0.00014198, [1] [Cycle 1]: 0.0001316, [7] [c_1]: 3.495e-05 [parameter_eliminate]: 6.07999e-06 [updatestate_depend_eliminate]: 6.34999e-06 [updatestate_assign_eliminate]: 2.68e-06 [updatestate_loads_eliminate]: 2.32999e-06 [cse]: 2.046e-05 [renormalize]: 5.00004e-07 [remove_dup_value]: 1.828e-05 [tuple_transform]: 9.65e-05, [1] [Cycle 1]: 8.877e-05, [4] [d_1]: 4.768e-05 [none_parameter_eliminate]: 1.82999e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 7.55e-06 [partial_unused_args_eliminate]: 4.83001e-06 [add_recomputation]: 5.715e-05 [cse_after_recomputation]: 3.077e-05, [1] [Cycle 1]: 2.303e-05, [1] [cse]: 1.3e-05 [environ_conv]: 9.67001e-06 [swap_dp_allreduce_reducescatter]: 8.18999e-06 [bias_add_comm_swap]: 6.44999e-06 [label_micro_interleaved_index]: 8.16002e-06 [label_fine_grained_interleaved_index]: 5.61e-06 [merge_cast_opt]: 3.97e-06 [slice_recompute_activation]: 5.15001e-06 [micro_interleaved_order_control]: 5.09e-06 [assign_add_opt]: 3.71999e-06 [ForceFp32Comm]: 3.38999e-06 [remove_cast_before_assign_add]: 3.41999e-06 [full_micro_interleaved_order_control]: 4.67e-06 [reorder_send_recv_between_fp_bp]: 5.56e-06 [comm_op_add_attrs]: 3.21001e-06 [add_comm_op_reuse_tag]: 3.80998e-06 [interleave_split_concat_branches]: 3.84002e-06 [interleave_parallel_branches]: 3.37002e-06 [overlap_opt_shard_in_pipeline]: 4.06001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.52e-06 [control_data_broadcast_order]: 1.615e-05 [grouped_pairwise_exchange_alltoall]: 4.15999e-06 [offloading_packed_experts]: 7.62998e-06 [overlap_recompute_and_grad_model_parallel]: 7.45e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.76001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.64002e-06 [overlap_recompute_comm]: 4.60999e-06 [overlap_grad_ring_attention]: 6.55002e-06 [overlap_grad_flash_sp]: 2.415e-05 [begin_end_overlap_inline]: 2.93e-06 [split_matmul_comm_elemetwise]: 4.50999e-06 [split_layernorm_comm]: 4.70001e-06 [handle_group_info]: 3.85e-06 [symbol_engine_optimizer]: 0.00010532, [1] [Cycle 1]: 9.794e-05, [6] [build]: 4.30999e-06 [elim_shapecalc]: 1.221e-05 [elim_not_effective]: 1.376e-05 [opt_reshape]: 7.46001e-06 [fold_const_symbol]: 1.111e-05 [renormalize]: 3.69997e-07 [detach_backward]: 4.30999e-06 [pipeline_parallel_scheduler]: 2.75002e-06 [auto_monad_reorder]: 2.277e-05 [get_jit_bprop_graph]: 2.16e-06 [rewriter_after_jit_bprop_graph]: 7.29001e-06 [opt_after_jit_grad]: 0.000704 [validate]: 4.481e-05 Sums bootstrap : 0.000425s : 0.65% type_inference : 0.058563s : 90.12% event_method : 0.000024s : 0.04% auto_monad : 0.000069s : 0.11% graph_reusing : 0.000006s : 0.01% inline : 0.000004s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000043s : 0.07% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000035s : 0.05% optimize.rewriter_before_opt_a : 0.000096s : 0.15% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000052s : 0.08% optimize.opt_a.loop_unroll : 0.000037s : 0.06% optimize.opt_a.a_1 : 0.000793s : 1.22% optimize.opt_a.with_stream_mark : 0.000036s : 0.06% optimize.opt_a.recompute_prepare : 0.000017s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000385s : 0.59% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.03% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.02% optimize.opt_a.merge_send_recv : 0.000018s : 0.03% optimize.opt_a.auto_parallel : 0.000016s : 0.03% optimize.opt_a.parallel : 0.000026s : 0.04% optimize.opt_a.flash_sp : 0.000014s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.03% optimize.opt_a.virtual_dataset : 0.000014s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.02% optimize.opt_a.virtual_output : 0.000013s : 0.02% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.05% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000026s : 0.04% optimize.opt_a.a_after_grad : 0.000020s : 0.03% optimize.opt_a.renormalize : 0.000806s : 1.24% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.05% optimize.opt_a.cse : 0.000053s : 0.08% optimize.opt_a.a_3 : 0.000121s : 0.19% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000048s : 0.07% optimize.convert_after_rewriter : 0.000011s : 0.02% optimize.order_py_execute_after_rewriter : 0.000009s : 0.01% optimize.mutable_eliminate : 0.000754s : 1.16% optimize.opt_b.b_1 : 0.000190s : 0.29% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.03% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000038s : 0.06% optimize.loop_unroll : 0.000531s : 0.82% optimize.opt_after_cconv.c_1 : 0.000035s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.03% optimize.tuple_transform.d_1 : 0.000048s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000057s : 0.09% optimize.cse_after_recomputation.cse : 0.000013s : 0.02% optimize.environ_conv : 0.000010s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000003s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000024s : 0.04% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000005s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000003s : 0.00% auto_monad_reorder : 0.000023s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000704s : 1.08% validate : 0.000045s : 0.07% Time group info: ------[substitution.] 0.000219 29 0.83% : 0.000002s : 2: substitution.elim_not_effective 0.66% : 0.000001s : 2: substitution.fold_const_symbol 2.85% : 0.000006s : 4: substitution.graph_param_transform 77.32% : 0.000169s : 4: substitution.inline 2.01% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.53% : 0.000006s : 4: substitution.remove_not_recompute_node 3.09% : 0.000007s : 4: substitution.replace_old_param 7.23% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator 3.48% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.058486 2 98.36% : 0.057528s : 1: type_inference.infer 1.64% : 0.000959s : 1: type_inference.specialize ------[replace.] 0.000066 8 65.15% : 0.000043s : 4: replace.inline 34.85% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000180 8 92.37% : 0.000166s : 4: match.inline 7.63% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000224 1278 0.78% : 0.000002s : 13: predicate.accumulaten_eliminater 0.86% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 1.00% : 0.000002s : 13: predicate.addn_zero_filter 0.75% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.95% : 0.000004s : 21: predicate.arithmetic_simplify 1.06% : 0.000002s : 13: predicate.cast_eliminate 0.64% : 0.000001s : 8: predicate.check_bprop_eliminate 0.45% : 0.000001s : 8: predicate.compare_switch_simplify 0.26% : 0.000001s : 4: predicate.const_output_eliminate 0.54% : 0.000001s : 8: predicate.depend_value_elim 0.87% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.91% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.15% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 4: predicate.elim_not_effective 0.68% : 0.000002s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_depend_swap 1.51% : 0.000003s : 25: predicate.environ_get_eliminate 0.98% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.35% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.35% : 0.000005s : 21: predicate.float_depend_g_call 0.65% : 0.000001s : 8: predicate.float_environ_get_switch 0.74% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.79% : 0.000002s : 8: predicate.get_grad_eliminate 0.28% : 0.000001s : 4: predicate.graph_param_transform 0.58% : 0.000001s : 8: predicate.incorporate_call 0.44% : 0.000001s : 8: predicate.incorporate_call_switch 6.42% : 0.000014s : 58: predicate.inline 0.82% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.76% : 0.000002s : 8: predicate.less_batch_normalization 1.82% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.38% : 0.000005s : 38: predicate.load_eliminater 1.21% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.31% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.61% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.49% : 0.000001s : 8: predicate.merge_addn 0.74% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.73% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 13: predicate.minmaximum_grad 2.08% : 0.000005s : 4: predicate.mutable_eliminate 0.39% : 0.000001s : 4: predicate.opt_reshape 0.37% : 0.000001s : 4: predicate.parallel_virtual_node 1.58% : 0.000004s : 21: predicate.partial_defer_inline 1.49% : 0.000003s : 21: predicate.partial_eliminate 0.86% : 0.000002s : 13: predicate.print_const_string_wrapper 0.56% : 0.000001s : 8: predicate.reduce_all_const_elim 1.23% : 0.000003s : 13: predicate.reduce_eliminate 2.27% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.54% : 0.000001s : 8: predicate.remove_not_recompute_node 1.38% : 0.000003s : 25: predicate.replace_applicator 0.41% : 0.000001s : 8: predicate.replace_old_param 0.27% : 0.000001s : 4: predicate.reset_defer_inline 0.87% : 0.000002s : 13: predicate.reshape_eliminate 0.66% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 4: predicate.row_tensor_eliminate 0.68% : 0.000002s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 8: predicate.shard_identity_eliminate 0.64% : 0.000001s : 8: predicate.special_op_eliminate 0.89% : 0.000002s : 8: predicate.specialize_transform 1.47% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.73% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.39% : 0.000003s : 21: predicate.switch_defer_inline 1.91% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.07% : 0.000011s : 67: predicate.switch_simplify 0.96% : 0.000002s : 13: predicate.tile_eliminate 0.84% : 0.000002s : 13: predicate.transpose_eliminate 1.48% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.80% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.32% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.42% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.07% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.66% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.24% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.98% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.54% : 0.000001s : 4: predicate.value_based_eliminate 0.80% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.48% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000733 11 51.25% : 0.000376s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.75% : 0.000357s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.083103 192 0.01% : 0.000006s : 1: ForceFp32Comm 4.56% : 0.003786s : 1: add_attr 4.53% : 0.003767s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.07% : 0.000062s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.09% : 0.000079s : 1: auto_monad 0.04% : 0.000032s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.57% : 0.000471s : 1: bootstrap 0.05% : 0.000042s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.02% : 0.000019s : 1: control_data_broadcast_order 0.02% : 0.000014s : 1: convert_after_rewriter 0.04% : 0.000034s : 1: cse_after_recomputation 0.01% : 0.000007s : 1: dataset_repeat_opt 0.02% : 0.000021s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.04% : 0.000035s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000010s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 0.65% : 0.000538s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.92% : 0.000763s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000022s : 1: opt.transform.mutable_eliminate 1.69% : 0.001408s : 78: opt.transform.opt_a 0.04% : 0.000033s : 1: opt.transform.opt_after_cconv 0.04% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.15% : 0.000124s : 28: opt.transform.opt_b 0.06% : 0.000053s : 2: opt.transform.opt_trans_graph 0.05% : 0.000041s : 4: opt.transform.symbol_engine_opt 4.28% : 0.003558s : 1: opt_a 0.18% : 0.000146s : 1: opt_after_cconv 0.86% : 0.000716s : 1: opt_after_jit_grad 0.39% : 0.000321s : 1: opt_b 8.00% : 0.006648s : 1: optimize 0.03% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.03% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000007s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000011s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.06% : 0.000051s : 1: pre_auto_parallel 0.05% : 0.000039s : 1: py_interpret_to_execute 0.02% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000022s : 1: remove_dup_value 0.52% : 0.000434s : 1: renormalize.infer 0.43% : 0.000361s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000053s : 1: rewriter_after_opt_a 0.12% : 0.000100s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000009s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.000108s : 1: symbol_engine_optimizer 0.12% : 0.000100s : 1: tuple_transform 70.53% : 0.058615s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:05.770.430 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0166542, [21] [bootstrap]: 0.00058516 [type_inference]: 0.0064208 [event_method]: 1.937e-05 [auto_monad]: 6.324e-05 [graph_reusing]: 5.62999e-06 [inline]: 2.93e-06 [add_attr]: 0.00346031, [1] [add_attr_with_inline]: 0.00344887, [1] [Cycle 1]: 7.234e-05, [2] [tag_attr]: 2.272e-05 [meta_addattr_fg_expand]: 5.84e-06 [parallel-infer-symbol]: 3.38e-06 [pre_auto_parallel]: 3.857e-05 [insert-virtual-dataset]: 2.32999e-06 [parallel-infer-symbol-second]: 7.00005e-07 [dataset_repeat_opt]: 2.40997e-06 [pipeline_split]: 1.77001e-06 [optimize]: 0.00530945, [53] [py_interpret_to_execute]: 2.99e-05 [rewriter_before_opt_a]: 9.236e-05 [opt_a]: 0.00303894, [2] [Cycle 1]: 0.00233845, [45] [expand_dump_flag]: 2.91999e-06 [switch_simplify]: 4.341e-05 [loop_unroll]: 3.118e-05 [a_1]: 0.00066149 [with_stream_mark]: 2.275e-05 [recompute_prepare]: 1.175e-05 [updatestate_depend_eliminate]: 4.57e-06 [updatestate_assign_eliminate]: 3.75e-06 [updatestate_loads_eliminate]: 3.07002e-06 [parameter_eliminate]: 2.26e-06 [a_2]: 8.428e-05 [accelerated_algorithm]: 7.69002e-06 [shard]: 1.99999e-06 [meta_shard_fg_expand]: 2.06e-06 [shard_inline]: 6.54999e-06 [merge_send_recv]: 8.92e-06 [auto_parallel]: 7.98001e-06 [parallel]: 2.101e-05 [flash_sp]: 8.74e-06 [merge_comm]: 4.57e-06 [allreduce_fusion]: 3.54002e-06 [matmul_add_comm_reduction]: 9.74e-06 [allreduce_slice_to_reducescatter]: 8.10018e-07 [virtual_shard_identity]: 9.32001e-06 [virtual_dataset]: 7.03e-06 [get_grad_eliminate_]: 6.09001e-06 [virtual_output]: 6.68e-06 [merge_forward]: 4.50001e-06 [cell_reuse_recompute_pass]: 1.25999e-06 [offload_activation]: 1.094e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.466e-05 [merge_recompute_call_nodes]: 1.57001e-06 [before_grad]: 1.013e-05 [set_forward_comm_id_for_comm_node_pass]: 3.7e-06 [meta_fg_expand]: 2.93e-06 [flash_sp_send_recv_attached]: 2.75997e-06 [receive_attached]: 2.07999e-06 [after_resolve]: 1.316e-05 [a_after_grad]: 9.88002e-06 [renormalize]: 0.00089114 [add_forward_monad_depend]: 7.83001e-06 [auto_monad_grad]: 2.37999e-06 [auto_monad_eliminator]: 1.961e-05 [cse]: 3.233e-05 [a_3]: 5.555e-05 [Cycle 2]: 0.00068914, [45] [expand_dump_flag]: 2.15002e-06 [switch_simplify]: 8.89e-06 [loop_unroll]: 6.61999e-06 [a_1]: 0.00013991 [with_stream_mark]: 1.511e-05 [recompute_prepare]: 6.51e-06 [updatestate_depend_eliminate]: 3.61999e-06 [updatestate_assign_eliminate]: 2.86e-06 [updatestate_loads_eliminate]: 2.79999e-06 [parameter_eliminate]: 1.17999e-06 [a_2]: 7.296e-05 [accelerated_algorithm]: 6.41e-06 [shard]: 2.34001e-06 [meta_shard_fg_expand]: 1.96e-06 [shard_inline]: 6.19999e-06 [merge_send_recv]: 6.60997e-06 [auto_parallel]: 7.29001e-06 [parallel]: 6.56999e-06 [flash_sp]: 3.54002e-06 [merge_comm]: 3.66001e-06 [allreduce_fusion]: 3.80998e-06 [matmul_add_comm_reduction]: 7.46999e-06 [allreduce_slice_to_reducescatter]: 1.19998e-06 [virtual_shard_identity]: 7.46001e-06 [virtual_dataset]: 6.04999e-06 [get_grad_eliminate_]: 5.87001e-06 [virtual_output]: 5.89e-06 [merge_forward]: 3.40998e-06 [cell_reuse_recompute_pass]: 2.34999e-06 [offload_activation]: 9.02e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.466e-05 [merge_recompute_call_nodes]: 1.34e-06 [before_grad]: 9.51e-06 [set_forward_comm_id_for_comm_node_pass]: 3.16999e-06 [meta_fg_expand]: 2.25002e-06 [flash_sp_send_recv_attached]: 1.22e-06 [receive_attached]: 1.86998e-06 [after_resolve]: 1.085e-05 [a_after_grad]: 9.04e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.37999e-06 [auto_monad_grad]: 1.41998e-06 [auto_monad_eliminator]: 7.61999e-06 [cse]: 1.428e-05 [a_3]: 3.668e-05 [py_interpret_to_execute_after_opt_a]: 1.311e-05 [slice_cell_reuse_recomputed_activation]: 1.90001e-06 [rewriter_after_opt_a]: 3.757e-05 [convert_after_rewriter]: 6.66e-06 [order_py_execute_after_rewriter]: 5.24e-06 [mutable_eliminate]: 0.00068493 [opt_b]: 0.0002289, [1] [Cycle 1]: 0.00022119, [7] [b_1]: 0.00014356 [b_2]: 8.76002e-06 [updatestate_depend_eliminate]: 6.74999e-06 [updatestate_assign_eliminate]: 2.43e-06 [updatestate_loads_eliminate]: 2.30002e-06 [renormalize]: 8.2e-07 [cse]: 1.954e-05 [optimize_parallel_all_gather_comm]: 2.019e-05 [overlap_param_gather]: 2.18998e-06 [cconv]: 3.066e-05 [loop_unroll]: 0.00044483 [opt_after_cconv]: 0.00010417, [1] [Cycle 1]: 9.795e-05, [7] [c_1]: 3.171e-05 [parameter_eliminate]: 3.61999e-06 [updatestate_depend_eliminate]: 5.35001e-06 [updatestate_assign_eliminate]: 2.55002e-06 [updatestate_loads_eliminate]: 2.31e-06 [cse]: 1.767e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 1.41e-05 [tuple_transform]: 7.815e-05, [1] [Cycle 1]: 7.36e-05, [4] [d_1]: 4.521e-05 [none_parameter_eliminate]: 1.65001e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 7.31999e-06 [partial_unused_args_eliminate]: 1.72999e-06 [add_recomputation]: 5.064e-05 [cse_after_recomputation]: 2.192e-05, [1] [Cycle 1]: 1.705e-05, [1] [cse]: 1.115e-05 [environ_conv]: 5.82001e-06 [swap_dp_allreduce_reducescatter]: 5.61003e-06 [bias_add_comm_swap]: 2.93998e-06 [label_micro_interleaved_index]: 4.37e-06 [label_fine_grained_interleaved_index]: 2.62001e-06 [merge_cast_opt]: 1.25999e-06 [slice_recompute_activation]: 2.12999e-06 [micro_interleaved_order_control]: 2.69999e-06 [assign_add_opt]: 1.20001e-06 [ForceFp32Comm]: 8.30012e-07 [remove_cast_before_assign_add]: 1.00001e-06 [full_micro_interleaved_order_control]: 2.15002e-06 [reorder_send_recv_between_fp_bp]: 2.75997e-06 [comm_op_add_attrs]: 1.14998e-06 [add_comm_op_reuse_tag]: 9.20001e-07 [interleave_split_concat_branches]: 1.11002e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.14998e-06 [overlap_opt_shard_grad_in_pipeline]: 2.14e-06 [control_data_broadcast_order]: 1.284e-05 [grouped_pairwise_exchange_alltoall]: 1.59e-06 [offloading_packed_experts]: 4.23999e-06 [overlap_recompute_and_grad_model_parallel]: 4.93001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.53002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30999e-06 [overlap_recompute_comm]: 2.24001e-06 [overlap_grad_ring_attention]: 4.05e-06 [overlap_grad_flash_sp]: 2.154e-05 [begin_end_overlap_inline]: 4.69998e-07 [split_matmul_comm_elemetwise]: 2.24001e-06 [split_layernorm_comm]: 1.84e-06 [handle_group_info]: 9.20001e-07 [symbol_engine_optimizer]: 7.927e-05, [1] [Cycle 1]: 7.483e-05, [6] [build]: 3.31999e-06 [elim_shapecalc]: 1.078e-05 [elim_not_effective]: 1.294e-05 [opt_reshape]: 7.48e-06 [fold_const_symbol]: 1.06e-05 [renormalize]: 2.60014e-07 [detach_backward]: 2.22999e-06 [pipeline_parallel_scheduler]: 1.50001e-06 [auto_monad_reorder]: 1.683e-05 [get_jit_bprop_graph]: 2.37999e-06 [rewriter_after_jit_bprop_graph]: 4.50001e-06 [opt_after_jit_grad]: 0.00050668 [validate]: 4.767e-05 Sums bootstrap : 0.000585s : 4.80% type_inference : 0.006421s : 52.63% event_method : 0.000019s : 0.16% auto_monad : 0.000063s : 0.52% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000039s : 0.32% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.25% optimize.rewriter_before_opt_a : 0.000092s : 0.76% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000052s : 0.43% optimize.opt_a.loop_unroll : 0.000038s : 0.31% optimize.opt_a.a_1 : 0.000801s : 6.57% optimize.opt_a.with_stream_mark : 0.000038s : 0.31% optimize.opt_a.recompute_prepare : 0.000018s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000157s : 1.29% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.12% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.10% optimize.opt_a.merge_send_recv : 0.000016s : 0.13% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000028s : 0.23% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.02% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.14% optimize.opt_a.virtual_dataset : 0.000013s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.10% optimize.opt_a.virtual_output : 0.000013s : 0.10% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.24% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.16% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.20% optimize.opt_a.a_after_grad : 0.000019s : 0.16% optimize.opt_a.renormalize : 0.000891s : 7.31% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.22% optimize.opt_a.cse : 0.000047s : 0.38% optimize.opt_a.a_3 : 0.000092s : 0.76% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000038s : 0.31% optimize.convert_after_rewriter : 0.000007s : 0.05% optimize.order_py_execute_after_rewriter : 0.000005s : 0.04% optimize.mutable_eliminate : 0.000685s : 5.61% optimize.opt_b.b_1 : 0.000144s : 1.18% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000020s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000031s : 0.25% optimize.loop_unroll : 0.000445s : 3.65% optimize.opt_after_cconv.c_1 : 0.000032s : 0.26% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.14% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.12% optimize.tuple_transform.d_1 : 0.000045s : 0.37% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000051s : 0.42% optimize.cse_after_recomputation.cse : 0.000011s : 0.09% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.03% optimize.overlap_grad_flash_sp : 0.000022s : 0.18% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000017s : 0.14% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000507s : 4.15% validate : 0.000048s : 0.39% Time group info: ------[substitution.] 0.000214 29 0.89% : 0.000002s : 2: substitution.elim_not_effective 0.58% : 0.000001s : 2: substitution.fold_const_symbol 2.96% : 0.000006s : 4: substitution.graph_param_transform 77.37% : 0.000166s : 4: substitution.inline 1.76% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.78% : 0.000006s : 4: substitution.remove_not_recompute_node 2.42% : 0.000005s : 4: substitution.replace_old_param 7.69% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator 3.55% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006352 2 88.43% : 0.005617s : 1: type_inference.infer 11.57% : 0.000735s : 1: type_inference.specialize ------[replace.] 0.000066 8 62.45% : 0.000041s : 4: replace.inline 37.55% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000177 8 92.05% : 0.000163s : 4: match.inline 7.95% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000221 1278 1.09% : 0.000002s : 13: predicate.accumulaten_eliminater 0.92% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.62% : 0.000001s : 8: predicate.addn_check_dump 0.98% : 0.000002s : 13: predicate.addn_zero_filter 0.83% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.60% : 0.000006s : 21: predicate.arithmetic_simplify 0.98% : 0.000002s : 13: predicate.cast_eliminate 0.57% : 0.000001s : 8: predicate.check_bprop_eliminate 0.46% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.58% : 0.000001s : 8: predicate.depend_value_elim 0.84% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.05% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.10% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 4: predicate.elim_not_effective 0.35% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_depend_swap 1.71% : 0.000004s : 25: predicate.environ_get_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.41% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.41% : 0.000005s : 21: predicate.float_depend_g_call 0.55% : 0.000001s : 8: predicate.float_environ_get_switch 0.76% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.62% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.54% : 0.000001s : 8: predicate.incorporate_call 0.47% : 0.000001s : 8: predicate.incorporate_call_switch 6.11% : 0.000013s : 58: predicate.inline 0.65% : 0.000001s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.80% : 0.000002s : 8: predicate.less_batch_normalization 1.77% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.43% : 0.000005s : 38: predicate.load_eliminater 1.04% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.60% : 0.000006s : 34: predicate.loop_unroll_before_grad 1.72% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.61% : 0.000001s : 8: predicate.merge_addn 0.53% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.63% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 13: predicate.minmaximum_grad 1.40% : 0.000003s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.38% : 0.000001s : 4: predicate.parallel_virtual_node 1.83% : 0.000004s : 21: predicate.partial_defer_inline 1.56% : 0.000003s : 21: predicate.partial_eliminate 0.90% : 0.000002s : 13: predicate.print_const_string_wrapper 0.59% : 0.000001s : 8: predicate.reduce_all_const_elim 1.28% : 0.000003s : 13: predicate.reduce_eliminate 2.45% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 8: predicate.remove_not_recompute_node 1.38% : 0.000003s : 25: predicate.replace_applicator 0.49% : 0.000001s : 8: predicate.replace_old_param 0.50% : 0.000001s : 4: predicate.reset_defer_inline 0.97% : 0.000002s : 13: predicate.reshape_eliminate 0.61% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.82% : 0.000002s : 8: predicate.same_eliminate 0.49% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.96% : 0.000002s : 8: predicate.shard_identity_eliminate 0.76% : 0.000002s : 8: predicate.special_op_eliminate 0.65% : 0.000001s : 8: predicate.specialize_transform 0.91% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.70% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.52% : 0.000003s : 21: predicate.switch_defer_inline 1.98% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.83% : 0.000011s : 67: predicate.switch_simplify 0.87% : 0.000002s : 13: predicate.tile_eliminate 0.94% : 0.000002s : 13: predicate.transpose_eliminate 1.46% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.47% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.12% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.47% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.65% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.34% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.08% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.62% : 0.000001s : 4: predicate.value_based_eliminate 0.63% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.71% : 0.000002s : 8: predicate.virtual_output_eliminate 0.31% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000612 11 54.70% : 0.000335s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.30% : 0.000277s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027711 192 0.01% : 0.000003s : 1: ForceFp32Comm 12.51% : 0.003466s : 1: add_attr 12.46% : 0.003453s : 1: add_attr_with_inline 0.02% : 0.000005s : 1: add_comm_op_reuse_tag 0.20% : 0.000054s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.25% : 0.000068s : 1: auto_monad 0.07% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 2.21% : 0.000612s : 1: bootstrap 0.12% : 0.000034s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.09% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.10% : 0.000027s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.63% : 0.000453s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.51% : 0.000694s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 4.47% : 0.001237s : 78: opt.transform.opt_a 0.11% : 0.000030s : 1: opt.transform.opt_after_cconv 0.11% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.43% : 0.000119s : 28: opt.transform.opt_b 0.18% : 0.000050s : 2: opt.transform.opt_trans_graph 0.14% : 0.000038s : 4: opt.transform.symbol_engine_opt 10.98% : 0.003043s : 1: opt_a 0.39% : 0.000108s : 1: opt_after_cconv 1.86% : 0.000517s : 1: opt_after_jit_grad 0.84% : 0.000232s : 1: opt_b 19.18% : 0.005315s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000043s : 1: pre_auto_parallel 0.12% : 0.000034s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000018s : 1: remove_dup_value 1.78% : 0.000493s : 1: renormalize.infer 1.40% : 0.000388s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000042s : 1: rewriter_after_opt_a 0.35% : 0.000097s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.30% : 0.000082s : 1: symbol_engine_optimizer 0.29% : 0.000081s : 1: tuple_transform 23.24% : 0.006440s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:06.383.451 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:06.383.717 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0349575, [21] [bootstrap]: 0.00043102 [type_inference]: 0.00626813 [event_method]: 2.153e-05 [auto_monad]: 6.723e-05 [graph_reusing]: 6.83e-06 [inline]: 2.71e-06 [add_attr]: 0.00346832, [1] [add_attr_with_inline]: 0.00345604, [1] [Cycle 1]: 8.839e-05, [2] [tag_attr]: 2.359e-05 [meta_addattr_fg_expand]: 6.17001e-06 [parallel-infer-symbol]: 3.01001e-06 [pre_auto_parallel]: 4.034e-05 [insert-virtual-dataset]: 2.78e-06 [parallel-infer-symbol-second]: 8.79983e-07 [dataset_repeat_opt]: 2.19001e-06 [pipeline_split]: 1.64998e-06 [optimize]: 0.0230411, [53] [py_interpret_to_execute]: 3.446e-05 [rewriter_before_opt_a]: 9.622e-05 [opt_a]: 0.00379053, [2] [Cycle 1]: 0.00277158, [45] [expand_dump_flag]: 2.89999e-06 [switch_simplify]: 4.321e-05 [loop_unroll]: 3.153e-05 [a_1]: 0.00071073 [with_stream_mark]: 2.167e-05 [recompute_prepare]: 1.366e-05 [updatestate_depend_eliminate]: 5.31002e-06 [updatestate_assign_eliminate]: 4.47e-06 [updatestate_loads_eliminate]: 3.83999e-06 [parameter_eliminate]: 2.33998e-06 [a_2]: 0.0001337 [accelerated_algorithm]: 9.63997e-06 [shard]: 2.27999e-06 [meta_shard_fg_expand]: 2.29999e-06 [shard_inline]: 9.00001e-06 [merge_send_recv]: 1.011e-05 [auto_parallel]: 9.17999e-06 [parallel]: 2.082e-05 [flash_sp]: 9.81e-06 [merge_comm]: 5.15001e-06 [allreduce_fusion]: 4.23999e-06 [matmul_add_comm_reduction]: 1.147e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 1.117e-05 [virtual_dataset]: 8.62998e-06 [get_grad_eliminate_]: 7.87e-06 [virtual_output]: 8.37e-06 [merge_forward]: 4.40999e-06 [cell_reuse_recompute_pass]: 1.50999e-06 [offload_activation]: 1.214e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.959e-05 [merge_recompute_call_nodes]: 1.60999e-06 [before_grad]: 1.366e-05 [set_forward_comm_id_for_comm_node_pass]: 4.77e-06 [meta_fg_expand]: 3.2e-06 [flash_sp_send_recv_attached]: 2.79999e-06 [receive_attached]: 2.29999e-06 [after_resolve]: 1.409e-05 [a_after_grad]: 1.23e-05 [renormalize]: 0.00101 [add_forward_monad_depend]: 8.41002e-06 [auto_monad_grad]: 2.88e-06 [auto_monad_eliminator]: 2.094e-05 [cse]: 3.722e-05 [a_3]: 8.148e-05 [Cycle 2]: 0.00100007, [45] [expand_dump_flag]: 1.97999e-06 [switch_simplify]: 1.042e-05 [loop_unroll]: 8.00999e-06 [a_1]: 0.00018721 [with_stream_mark]: 1.667e-05 [recompute_prepare]: 8.80999e-06 [updatestate_depend_eliminate]: 4.94e-06 [updatestate_assign_eliminate]: 3.48999e-06 [updatestate_loads_eliminate]: 3.31001e-06 [parameter_eliminate]: 1.76e-06 [a_2]: 0.00012348 [accelerated_algorithm]: 8.75999e-06 [shard]: 2.73e-06 [meta_shard_fg_expand]: 1.93002e-06 [shard_inline]: 7.56999e-06 [merge_send_recv]: 8.33001e-06 [auto_parallel]: 8.48999e-06 [parallel]: 7.99002e-06 [flash_sp]: 4.32e-06 [merge_comm]: 4.22e-06 [allreduce_fusion]: 4.01001e-06 [matmul_add_comm_reduction]: 1.15e-05 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 9.54999e-06 [virtual_dataset]: 7.43999e-06 [get_grad_eliminate_]: 7.45e-06 [virtual_output]: 7.1e-06 [merge_forward]: 4.28999e-06 [cell_reuse_recompute_pass]: 2.76999e-06 [offload_activation]: 1.13e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.776e-05 [merge_recompute_call_nodes]: 1.10999e-06 [before_grad]: 1.313e-05 [set_forward_comm_id_for_comm_node_pass]: 4.28001e-06 [meta_fg_expand]: 3.31999e-06 [flash_sp_send_recv_attached]: 1.50999e-06 [receive_attached]: 1.86e-06 [after_resolve]: 1.351e-05 [a_after_grad]: 1.244e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.63002e-06 [auto_monad_grad]: 1.29e-06 [auto_monad_eliminator]: 1.195e-05 [cse]: 2.102e-05 [a_3]: 5.996e-05 [py_interpret_to_execute_after_opt_a]: 2.076e-05 [slice_cell_reuse_recomputed_activation]: 4.95001e-06 [rewriter_after_opt_a]: 5.46e-05 [convert_after_rewriter]: 1.181e-05 [order_py_execute_after_rewriter]: 8.98002e-06 [mutable_eliminate]: 0.00078345 [opt_b]: 0.00036345, [1] [Cycle 1]: 0.00035036, [7] [b_1]: 0.00022183 [b_2]: 1.153e-05 [updatestate_depend_eliminate]: 1.013e-05 [updatestate_assign_eliminate]: 3.60998e-06 [updatestate_loads_eliminate]: 3.66999e-06 [renormalize]: 1.04e-06 [cse]: 3.448e-05 [optimize_parallel_all_gather_comm]: 2.751e-05 [overlap_param_gather]: 4.90999e-06 [cconv]: 3.977e-05 [loop_unroll]: 0.00063145 [opt_after_cconv]: 0.0163208, [1] [Cycle 1]: 0.0163022, [7] [c_1]: 0.0160831 [parameter_eliminate]: 8.42e-06 [updatestate_depend_eliminate]: 1.797e-05 [updatestate_assign_eliminate]: 4.42e-06 [updatestate_loads_eliminate]: 3.93001e-06 [cse]: 5.464e-05 [renormalize]: 1.00999e-06 [remove_dup_value]: 2.505e-05 [tuple_transform]: 0.00013746, [1] [Cycle 1]: 0.00012399, [4] [d_1]: 7.701e-05 [none_parameter_eliminate]: 1.76e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 9.71998e-06 [partial_unused_args_eliminate]: 5.04998e-06 [add_recomputation]: 7.418e-05 [cse_after_recomputation]: 3.339e-05, [1] [Cycle 1]: 2.639e-05, [1] [cse]: 1.694e-05 [environ_conv]: 1.192e-05 [swap_dp_allreduce_reducescatter]: 8.90001e-06 [bias_add_comm_swap]: 6.01e-06 [label_micro_interleaved_index]: 1.198e-05 [label_fine_grained_interleaved_index]: 4.99e-06 [merge_cast_opt]: 4.15999e-06 [slice_recompute_activation]: 4.50999e-06 [micro_interleaved_order_control]: 5.20999e-06 [assign_add_opt]: 4.07e-06 [ForceFp32Comm]: 3.38999e-06 [remove_cast_before_assign_add]: 3.71001e-06 [full_micro_interleaved_order_control]: 4.87998e-06 [reorder_send_recv_between_fp_bp]: 6.22001e-06 [comm_op_add_attrs]: 3.71001e-06 [add_comm_op_reuse_tag]: 3.45998e-06 [interleave_split_concat_branches]: 3.48999e-06 [interleave_parallel_branches]: 3.5e-06 [overlap_opt_shard_in_pipeline]: 3.80998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.28999e-06 [control_data_broadcast_order]: 2.121e-05 [grouped_pairwise_exchange_alltoall]: 4.03999e-06 [offloading_packed_experts]: 7.7e-06 [overlap_recompute_and_grad_model_parallel]: 8.12e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.70998e-06 [overlap_recompute_allgather_and_fa_grad]: 4.34002e-06 [overlap_recompute_comm]: 5.45001e-06 [overlap_grad_ring_attention]: 7.46999e-06 [overlap_grad_flash_sp]: 2.914e-05 [begin_end_overlap_inline]: 2.94999e-06 [split_matmul_comm_elemetwise]: 4.45e-06 [split_layernorm_comm]: 4.58999e-06 [handle_group_info]: 3.41999e-06 [symbol_engine_optimizer]: 0.00012334, [1] [Cycle 1]: 0.00011484, [6] [build]: 5.47999e-06 [elim_shapecalc]: 1.549e-05 [elim_not_effective]: 1.939e-05 [opt_reshape]: 9.84001e-06 [fold_const_symbol]: 1.367e-05 [renormalize]: 2.30008e-07 [detach_backward]: 4.53999e-06 [pipeline_parallel_scheduler]: 2.64001e-06 [auto_monad_reorder]: 2.956e-05 [get_jit_bprop_graph]: 2.58998e-06 [rewriter_after_jit_bprop_graph]: 6.28e-06 [opt_after_jit_grad]: 0.00082147 [validate]: 5.568e-05 Sums bootstrap : 0.000431s : 1.46% type_inference : 0.006268s : 21.27% event_method : 0.000022s : 0.07% auto_monad : 0.000067s : 0.23% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000040s : 0.14% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000034s : 0.12% optimize.rewriter_before_opt_a : 0.000096s : 0.33% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000054s : 0.18% optimize.opt_a.loop_unroll : 0.000040s : 0.13% optimize.opt_a.a_1 : 0.000898s : 3.05% optimize.opt_a.with_stream_mark : 0.000038s : 0.13% optimize.opt_a.recompute_prepare : 0.000022s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000257s : 0.87% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.06% optimize.opt_a.shard : 0.000005s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000017s : 0.06% optimize.opt_a.merge_send_recv : 0.000018s : 0.06% optimize.opt_a.auto_parallel : 0.000018s : 0.06% optimize.opt_a.parallel : 0.000029s : 0.10% optimize.opt_a.flash_sp : 0.000014s : 0.05% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.07% optimize.opt_a.virtual_dataset : 0.000016s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.05% optimize.opt_a.virtual_output : 0.000015s : 0.05% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.13% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000028s : 0.09% optimize.opt_a.a_after_grad : 0.000025s : 0.08% optimize.opt_a.renormalize : 0.001010s : 3.43% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.11% optimize.opt_a.cse : 0.000058s : 0.20% optimize.opt_a.a_3 : 0.000141s : 0.48% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000055s : 0.19% optimize.convert_after_rewriter : 0.000012s : 0.04% optimize.order_py_execute_after_rewriter : 0.000009s : 0.03% optimize.mutable_eliminate : 0.000783s : 2.66% optimize.opt_b.b_1 : 0.000222s : 0.75% optimize.opt_b.b_2 : 0.000012s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000034s : 0.12% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.09% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000040s : 0.13% optimize.loop_unroll : 0.000631s : 2.14% optimize.opt_after_cconv.c_1 : 0.016083s : 54.57% optimize.opt_after_cconv.parameter_eliminate : 0.000008s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000018s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000055s : 0.19% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000025s : 0.08% optimize.tuple_transform.d_1 : 0.000077s : 0.26% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000074s : 0.25% optimize.cse_after_recomputation.cse : 0.000017s : 0.06% optimize.environ_conv : 0.000012s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000012s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000003s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000021s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.03% optimize.overlap_grad_flash_sp : 0.000029s : 0.10% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.02% optimize.split_layernorm_comm : 0.000005s : 0.02% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.02% pipeline_parallel_scheduler : 0.000003s : 0.01% auto_monad_reorder : 0.000030s : 0.10% get_jit_bprop_graph : 0.000003s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000821s : 2.79% validate : 0.000056s : 0.19% Time group info: ------[substitution.] 0.000248 39 12.35% : 0.000031s : 3: substitution.cast_eliminate 1.05% : 0.000003s : 3: substitution.elim_not_effective 0.70% : 0.000002s : 3: substitution.fold_const_symbol 3.68% : 0.000009s : 5: substitution.graph_param_transform 66.80% : 0.000166s : 4: substitution.inline 2.12% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.55% : 0.000006s : 6: substitution.remove_not_recompute_node 2.66% : 0.000007s : 4: substitution.replace_old_param 5.17% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator 2.92% : 0.000007s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006205 2 87.53% : 0.005432s : 1: type_inference.infer 12.47% : 0.000774s : 1: type_inference.specialize ------[replace.] 0.000069 8 62.46% : 0.000043s : 4: replace.inline 37.54% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000174 8 93.67% : 0.000163s : 4: match.inline 6.33% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000284 1504 0.77% : 0.000002s : 15: predicate.accumulaten_eliminater 0.95% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 10: predicate.addn_check_dump 0.84% : 0.000002s : 15: predicate.addn_zero_filter 0.72% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.84% : 0.000005s : 25: predicate.arithmetic_simplify 0.88% : 0.000002s : 15: predicate.cast_eliminate 0.57% : 0.000002s : 10: predicate.check_bprop_eliminate 0.53% : 0.000002s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.58% : 0.000002s : 10: predicate.depend_value_elim 0.79% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.86% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.90% : 0.000003s : 15: predicate.dict_set_item_eliminator 1.17% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.16% : 0.000000s : 5: predicate.elim_not_effective 0.46% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.19% : 0.000003s : 20: predicate.environ_get_add_eliminate 0.93% : 0.000003s : 20: predicate.environ_get_depend_swap 1.58% : 0.000004s : 30: predicate.environ_get_eliminate 0.92% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.19% : 0.000003s : 23: predicate.exchange_switch_depend_value 1.98% : 0.000006s : 23: predicate.float_depend_g_call 0.53% : 0.000002s : 10: predicate.float_environ_get_switch 0.79% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.61% : 0.000002s : 10: predicate.get_grad_eliminate 0.27% : 0.000001s : 5: predicate.graph_param_transform 0.53% : 0.000002s : 10: predicate.incorporate_call 0.46% : 0.000001s : 10: predicate.incorporate_call_switch 6.20% : 0.000018s : 68: predicate.inline 0.70% : 0.000002s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.86% : 0.000002s : 10: predicate.less_batch_normalization 1.78% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.36% : 0.000007s : 44: predicate.load_eliminater 1.05% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.87% : 0.000005s : 36: predicate.loop_unroll_before_grad 2.14% : 0.000006s : 25: predicate.make_slice_get_slice_eliminator 0.55% : 0.000002s : 10: predicate.merge_addn 0.55% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.49% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 15: predicate.minmaximum_grad 1.14% : 0.000003s : 5: predicate.mutable_eliminate 0.46% : 0.000001s : 5: predicate.opt_reshape 0.39% : 0.000001s : 5: predicate.parallel_virtual_node 1.44% : 0.000004s : 23: predicate.partial_defer_inline 1.43% : 0.000004s : 24: predicate.partial_eliminate 0.82% : 0.000002s : 15: predicate.print_const_string_wrapper 0.56% : 0.000002s : 10: predicate.reduce_all_const_elim 1.20% : 0.000003s : 15: predicate.reduce_eliminate 2.47% : 0.000007s : 44: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000001s : 10: predicate.remove_not_recompute_node 1.12% : 0.000003s : 29: predicate.replace_applicator 0.50% : 0.000001s : 10: predicate.replace_old_param 0.43% : 0.000001s : 5: predicate.reset_defer_inline 0.91% : 0.000003s : 15: predicate.reshape_eliminate 0.60% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 5: predicate.row_tensor_eliminate 0.74% : 0.000002s : 10: predicate.same_eliminate 0.37% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 10: predicate.shard_identity_eliminate 0.74% : 0.000002s : 10: predicate.special_op_eliminate 0.73% : 0.000002s : 10: predicate.specialize_transform 0.88% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.73% : 0.000002s : 10: predicate.stack_unstack_eliminate 1.27% : 0.000004s : 5: predicate.switch_call_monad_eliminater 1.21% : 0.000003s : 23: predicate.switch_defer_inline 1.73% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.53% : 0.000013s : 74: predicate.switch_simplify 0.79% : 0.000002s : 15: predicate.tile_eliminate 0.79% : 0.000002s : 15: predicate.transpose_eliminate 1.35% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.34% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 2.88% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.51% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.66% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.58% : 0.000007s : 44: predicate.updatestate_pure_node_eliminater 8.34% : 0.000024s : 54: predicate.updatestate_useless_node_eliminater 0.53% : 0.000002s : 5: predicate.value_based_eliminate 0.59% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.57% : 0.000002s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000616 11 53.06% : 0.000327s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.94% : 0.000289s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.080205 192 0.01% : 0.000006s : 1: ForceFp32Comm 4.34% : 0.003478s : 1: add_attr 4.31% : 0.003460s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.10% : 0.000078s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.10% : 0.000078s : 1: auto_monad 0.05% : 0.000037s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.60% : 0.000481s : 1: bootstrap 0.05% : 0.000044s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.03% : 0.000024s : 1: control_data_broadcast_order 0.02% : 0.000015s : 1: convert_after_rewriter 0.05% : 0.000037s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000026s : 1: detach_backward 0.02% : 0.000015s : 1: environ_conv 0.04% : 0.000033s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000014s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000015s : 1: label_micro_interleaved_index 0.80% : 0.000639s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.99% : 0.000793s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.03% : 0.000022s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000023s : 1: opt.transform.mutable_eliminate 1.80% : 0.001447s : 78: opt.transform.opt_a 20.05% : 0.016077s : 1: opt.transform.opt_after_cconv 0.05% : 0.000040s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000156s : 28: opt.transform.opt_b 0.10% : 0.000082s : 2: opt.transform.opt_trans_graph 0.07% : 0.000053s : 4: opt.transform.symbol_engine_opt 4.73% : 0.003794s : 1: opt_a 20.36% : 0.016327s : 1: opt_after_cconv 1.04% : 0.000835s : 1: opt_after_jit_grad 0.46% : 0.000368s : 1: opt_b 29.22% : 0.023439s : 1: optimize 0.04% : 0.000031s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.04% : 0.000032s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000013s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.06% : 0.000048s : 1: pre_auto_parallel 0.05% : 0.000039s : 1: py_interpret_to_execute 0.03% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.04% : 0.000029s : 1: remove_dup_value 0.74% : 0.000592s : 1: renormalize.infer 0.51% : 0.000406s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000059s : 1: rewriter_after_opt_a 0.12% : 0.000100s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.16% : 0.000127s : 1: symbol_engine_optimizer 0.18% : 0.000141s : 1: tuple_transform 7.87% : 0.006312s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:06.946.087 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0326703, [21] [bootstrap]: 0.00042841 [type_inference]: 0.0205294 [event_method]: 2.396e-05 [auto_monad]: 7.215e-05 [graph_reusing]: 6.48e-06 [inline]: 3.60998e-06 [add_attr]: 0.00466191, [1] [add_attr_with_inline]: 0.00464707, [1] [Cycle 1]: 8.055e-05, [2] [tag_attr]: 2.8e-05 [meta_addattr_fg_expand]: 5.85002e-06 [parallel-infer-symbol]: 3.48999e-06 [pre_auto_parallel]: 4.558e-05 [insert-virtual-dataset]: 2.36e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 2.29001e-06 [pipeline_split]: 1.74e-06 [optimize]: 0.0061036, [53] [py_interpret_to_execute]: 3.463e-05 [rewriter_before_opt_a]: 0.00010421 [opt_a]: 0.00354704, [2] [Cycle 1]: 0.00260938, [45] [expand_dump_flag]: 3.11999e-06 [switch_simplify]: 4.451e-05 [loop_unroll]: 3.183e-05 [a_1]: 0.00073588 [with_stream_mark]: 2.12e-05 [recompute_prepare]: 1.189e-05 [updatestate_depend_eliminate]: 5.41998e-06 [updatestate_assign_eliminate]: 4.13001e-06 [updatestate_loads_eliminate]: 3.70998e-06 [parameter_eliminate]: 1.94999e-06 [a_2]: 0.0001051 [accelerated_algorithm]: 9.70002e-06 [shard]: 2.99999e-06 [meta_shard_fg_expand]: 2.19001e-06 [shard_inline]: 8.40999e-06 [merge_send_recv]: 9.37001e-06 [auto_parallel]: 9.93002e-06 [parallel]: 2.15e-05 [flash_sp]: 1.043e-05 [merge_comm]: 4.87998e-06 [allreduce_fusion]: 4.38001e-06 [matmul_add_comm_reduction]: 1.117e-05 [allreduce_slice_to_reducescatter]: 1.20999e-06 [virtual_shard_identity]: 9.89001e-06 [virtual_dataset]: 8.92e-06 [get_grad_eliminate_]: 7.4e-06 [virtual_output]: 8.16002e-06 [merge_forward]: 4.50999e-06 [cell_reuse_recompute_pass]: 1.56002e-06 [offload_activation]: 1.194e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.717e-05 [merge_recompute_call_nodes]: 1.48002e-06 [before_grad]: 1.473e-05 [set_forward_comm_id_for_comm_node_pass]: 5.52999e-06 [meta_fg_expand]: 3.73999e-06 [flash_sp_send_recv_attached]: 2.71e-06 [receive_attached]: 2.78e-06 [after_resolve]: 1.361e-05 [a_after_grad]: 1.246e-05 [renormalize]: 0.00100909 [add_forward_monad_depend]: 6.91999e-06 [auto_monad_grad]: 2.91999e-06 [auto_monad_eliminator]: 2.009e-05 [cse]: 3.969e-05 [a_3]: 6.563e-05 [Cycle 2]: 0.00092565, [45] [expand_dump_flag]: 2.51e-06 [switch_simplify]: 1.275e-05 [loop_unroll]: 9.50001e-06 [a_1]: 0.00020351 [with_stream_mark]: 1.981e-05 [recompute_prepare]: 8.13999e-06 [updatestate_depend_eliminate]: 4.15e-06 [updatestate_assign_eliminate]: 3.40998e-06 [updatestate_loads_eliminate]: 3.52997e-06 [parameter_eliminate]: 1.44e-06 [a_2]: 9.418e-05 [accelerated_algorithm]: 8.25e-06 [shard]: 1.79998e-06 [meta_shard_fg_expand]: 2.09e-06 [shard_inline]: 7.77e-06 [merge_send_recv]: 8.47e-06 [auto_parallel]: 9.44e-06 [parallel]: 8.13001e-06 [flash_sp]: 4.23001e-06 [merge_comm]: 4.87e-06 [allreduce_fusion]: 3.95e-06 [matmul_add_comm_reduction]: 8.12998e-06 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 8.79e-06 [virtual_dataset]: 7.49002e-06 [get_grad_eliminate_]: 7.01999e-06 [virtual_output]: 8.95001e-06 [merge_forward]: 4.45999e-06 [cell_reuse_recompute_pass]: 2.37999e-06 [offload_activation]: 9.77999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.542e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.172e-05 [set_forward_comm_id_for_comm_node_pass]: 4.97999e-06 [meta_fg_expand]: 3.19001e-06 [flash_sp_send_recv_attached]: 1.02e-06 [receive_attached]: 1.64e-06 [after_resolve]: 1.336e-05 [a_after_grad]: 1.149e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.69e-06 [auto_monad_grad]: 1.52001e-06 [auto_monad_eliminator]: 1.003e-05 [cse]: 2.186e-05 [a_3]: 4.656e-05 [py_interpret_to_execute_after_opt_a]: 1.677e-05 [slice_cell_reuse_recomputed_activation]: 1.86e-06 [rewriter_after_opt_a]: 4.485e-05 [convert_after_rewriter]: 8.28001e-06 [order_py_execute_after_rewriter]: 5.61e-06 [mutable_eliminate]: 0.00071912 [opt_b]: 0.00027572, [1] [Cycle 1]: 0.00026787, [7] [b_1]: 0.00017309 [b_2]: 1.027e-05 [updatestate_depend_eliminate]: 9.54e-06 [updatestate_assign_eliminate]: 3.35e-06 [updatestate_loads_eliminate]: 2.94001e-06 [renormalize]: 1.00999e-06 [cse]: 2.789e-05 [optimize_parallel_all_gather_comm]: 1.923e-05 [overlap_param_gather]: 2.14999e-06 [cconv]: 3.177e-05 [loop_unroll]: 0.00048007 [opt_after_cconv]: 0.00012448, [1] [Cycle 1]: 0.00011825, [7] [c_1]: 3.85e-05 [parameter_eliminate]: 4.38999e-06 [updatestate_depend_eliminate]: 6.88e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 2.93e-06 [cse]: 2.524e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 1.548e-05 [tuple_transform]: 9.017e-05, [1] [Cycle 1]: 8.537e-05, [4] [d_1]: 5.488e-05 [none_parameter_eliminate]: 2.02999e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 8.72998e-06 [partial_unused_args_eliminate]: 2.22001e-06 [add_recomputation]: 6.577e-05 [cse_after_recomputation]: 2.737e-05, [1] [Cycle 1]: 2.238e-05, [1] [cse]: 1.647e-05 [environ_conv]: 7.38e-06 [swap_dp_allreduce_reducescatter]: 6.16998e-06 [bias_add_comm_swap]: 2.84001e-06 [label_micro_interleaved_index]: 4.24002e-06 [label_fine_grained_interleaved_index]: 2.76e-06 [merge_cast_opt]: 1.80001e-06 [slice_recompute_activation]: 2.02001e-06 [micro_interleaved_order_control]: 2.59001e-06 [assign_add_opt]: 1.39e-06 [ForceFp32Comm]: 1.19e-06 [remove_cast_before_assign_add]: 1.49e-06 [full_micro_interleaved_order_control]: 2.09e-06 [reorder_send_recv_between_fp_bp]: 2.74001e-06 [comm_op_add_attrs]: 9.39996e-07 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.13001e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.52999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.79998e-06 [control_data_broadcast_order]: 1.655e-05 [grouped_pairwise_exchange_alltoall]: 1.76e-06 [offloading_packed_experts]: 5.22e-06 [overlap_recompute_and_grad_model_parallel]: 6.16e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.53998e-06 [overlap_grad_ring_attention]: 4.63999e-06 [overlap_grad_flash_sp]: 2.556e-05 [begin_end_overlap_inline]: 5.8001e-07 [split_matmul_comm_elemetwise]: 2.29001e-06 [split_layernorm_comm]: 1.84e-06 [handle_group_info]: 1.42999e-06 [symbol_engine_optimizer]: 8.975e-05, [1] [Cycle 1]: 8.516e-05, [6] [build]: 4.32e-06 [elim_shapecalc]: 1.314e-05 [elim_not_effective]: 1.572e-05 [opt_reshape]: 8.72e-06 [fold_const_symbol]: 1.264e-05 [renormalize]: 2.80008e-07 [detach_backward]: 2.10002e-06 [pipeline_parallel_scheduler]: 1.98002e-06 [auto_monad_reorder]: 2.178e-05 [get_jit_bprop_graph]: 2.34001e-06 [rewriter_after_jit_bprop_graph]: 5.04e-06 [opt_after_jit_grad]: 0.00052837 [validate]: 5.217e-05 Sums bootstrap : 0.000428s : 1.60% type_inference : 0.020529s : 76.47% event_method : 0.000024s : 0.09% auto_monad : 0.000072s : 0.27% graph_reusing : 0.000006s : 0.02% inline : 0.000004s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.10% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000046s : 0.17% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000035s : 0.13% optimize.rewriter_before_opt_a : 0.000104s : 0.39% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000057s : 0.21% optimize.opt_a.loop_unroll : 0.000041s : 0.15% optimize.opt_a.a_1 : 0.000939s : 3.50% optimize.opt_a.with_stream_mark : 0.000041s : 0.15% optimize.opt_a.recompute_prepare : 0.000020s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000199s : 0.74% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.07% optimize.opt_a.shard : 0.000005s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000016s : 0.06% optimize.opt_a.merge_send_recv : 0.000018s : 0.07% optimize.opt_a.auto_parallel : 0.000019s : 0.07% optimize.opt_a.parallel : 0.000030s : 0.11% optimize.opt_a.flash_sp : 0.000015s : 0.05% optimize.opt_a.merge_comm : 0.000010s : 0.04% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.07% optimize.opt_a.virtual_dataset : 0.000016s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.05% optimize.opt_a.virtual_output : 0.000017s : 0.06% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000022s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.04% optimize.opt_a.meta_fg_expand : 0.000007s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000027s : 0.10% optimize.opt_a.a_after_grad : 0.000024s : 0.09% optimize.opt_a.renormalize : 0.001009s : 3.76% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.11% optimize.opt_a.cse : 0.000062s : 0.23% optimize.opt_a.a_3 : 0.000112s : 0.42% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000045s : 0.17% optimize.convert_after_rewriter : 0.000008s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000719s : 2.68% optimize.opt_b.b_1 : 0.000173s : 0.64% optimize.opt_b.b_2 : 0.000010s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000028s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000032s : 0.12% optimize.loop_unroll : 0.000480s : 1.79% optimize.opt_after_cconv.c_1 : 0.000039s : 0.14% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000025s : 0.09% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.06% optimize.tuple_transform.d_1 : 0.000055s : 0.20% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000066s : 0.24% optimize.cse_after_recomputation.cse : 0.000016s : 0.06% optimize.environ_conv : 0.000007s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000026s : 0.10% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.08% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000528s : 1.97% validate : 0.000052s : 0.19% Time group info: ------[substitution.] 0.000264 39 12.56% : 0.000033s : 3: substitution.cast_eliminate 0.85% : 0.000002s : 3: substitution.elim_not_effective 0.63% : 0.000002s : 3: substitution.fold_const_symbol 2.73% : 0.000007s : 5: substitution.graph_param_transform 68.48% : 0.000181s : 4: substitution.inline 2.22% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.65% : 0.000007s : 6: substitution.remove_not_recompute_node 2.24% : 0.000006s : 4: substitution.replace_old_param 4.90% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator 2.74% : 0.000007s : 1: substitution.value_based_eliminate ------[type_inference.] 0.020439 2 95.08% : 0.019434s : 1: type_inference.infer 4.92% : 0.001005s : 1: type_inference.specialize ------[replace.] 0.000067 8 63.16% : 0.000043s : 4: replace.inline 36.84% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000189 8 94.15% : 0.000178s : 4: match.inline 5.85% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000262 1504 0.86% : 0.000002s : 15: predicate.accumulaten_eliminater 0.94% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 10: predicate.addn_check_dump 0.96% : 0.000003s : 15: predicate.addn_zero_filter 0.77% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.21% : 0.000006s : 25: predicate.arithmetic_simplify 1.09% : 0.000003s : 15: predicate.cast_eliminate 0.56% : 0.000001s : 10: predicate.check_bprop_eliminate 0.62% : 0.000002s : 10: predicate.compare_switch_simplify 0.19% : 0.000001s : 5: predicate.const_output_eliminate 0.64% : 0.000002s : 10: predicate.depend_value_elim 0.90% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.10% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.94% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 5: predicate.elim_not_effective 0.43% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.03% : 0.000003s : 20: predicate.environ_get_depend_swap 1.91% : 0.000005s : 30: predicate.environ_get_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.27% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.20% : 0.000006s : 23: predicate.float_depend_g_call 0.67% : 0.000002s : 10: predicate.float_environ_get_switch 0.76% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.63% : 0.000002s : 10: predicate.get_grad_eliminate 0.27% : 0.000001s : 5: predicate.graph_param_transform 0.59% : 0.000002s : 10: predicate.incorporate_call 0.54% : 0.000001s : 10: predicate.incorporate_call_switch 6.31% : 0.000017s : 68: predicate.inline 0.75% : 0.000002s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.98% : 0.000003s : 10: predicate.less_batch_normalization 1.86% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.41% : 0.000006s : 44: predicate.load_eliminater 1.27% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.03% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.80% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 0.58% : 0.000002s : 10: predicate.merge_addn 0.63% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 15: predicate.minmaximum_grad 1.28% : 0.000003s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.48% : 0.000001s : 5: predicate.parallel_virtual_node 1.78% : 0.000005s : 23: predicate.partial_defer_inline 1.50% : 0.000004s : 24: predicate.partial_eliminate 0.91% : 0.000002s : 15: predicate.print_const_string_wrapper 0.63% : 0.000002s : 10: predicate.reduce_all_const_elim 1.27% : 0.000003s : 15: predicate.reduce_eliminate 2.37% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 10: predicate.remove_not_recompute_node 1.36% : 0.000004s : 29: predicate.replace_applicator 0.52% : 0.000001s : 10: predicate.replace_old_param 0.32% : 0.000001s : 5: predicate.reset_defer_inline 0.99% : 0.000003s : 15: predicate.reshape_eliminate 0.63% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 5: predicate.row_tensor_eliminate 0.92% : 0.000002s : 10: predicate.same_eliminate 0.40% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 10: predicate.shard_identity_eliminate 0.71% : 0.000002s : 10: predicate.special_op_eliminate 0.70% : 0.000002s : 10: predicate.specialize_transform 1.08% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.34% : 0.000004s : 23: predicate.switch_defer_inline 2.00% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.68% : 0.000012s : 74: predicate.switch_simplify 0.95% : 0.000003s : 15: predicate.tile_eliminate 1.06% : 0.000003s : 15: predicate.transpose_eliminate 1.79% : 0.000005s : 25: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.03% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.44% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.49% : 0.000007s : 35: predicate.tuple_list_set_item_eliminator 1.66% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.31% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.17% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.49% : 0.000001s : 5: predicate.value_based_eliminate 0.84% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000708 11 47.37% : 0.000335s : 5: func_graph_cloner_run.FuncGraphClonerGraph 52.63% : 0.000373s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.046089 192 0.01% : 0.000004s : 1: ForceFp32Comm 10.13% : 0.004668s : 1: add_attr 10.09% : 0.004652s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.15% : 0.000071s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.17% : 0.000079s : 1: auto_monad 0.06% : 0.000026s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.00% : 0.000459s : 1: bootstrap 0.08% : 0.000035s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000020s : 1: control_data_broadcast_order 0.03% : 0.000012s : 1: convert_after_rewriter 0.07% : 0.000031s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.07% : 0.000031s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000007s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 1.06% : 0.000489s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 1.58% : 0.000729s : 1: mutable_eliminate 0.02% : 0.000008s : 1: offloading_packed_experts 0.04% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000021s : 1: opt.transform.mutable_eliminate 3.21% : 0.001481s : 78: opt.transform.opt_a 0.08% : 0.000037s : 1: opt.transform.opt_after_cconv 0.07% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.32% : 0.000149s : 28: opt.transform.opt_b 0.13% : 0.000061s : 2: opt.transform.opt_trans_graph 0.10% : 0.000046s : 4: opt.transform.symbol_engine_opt 7.70% : 0.003550s : 1: opt_a 0.28% : 0.000128s : 1: opt_after_cconv 1.17% : 0.000539s : 1: opt_after_jit_grad 0.61% : 0.000280s : 1: opt_b 13.25% : 0.006109s : 1: optimize 0.05% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.06% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.11% : 0.000050s : 1: pre_auto_parallel 0.08% : 0.000039s : 1: py_interpret_to_execute 0.04% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000005s : 1: remove_cast_before_assign_add 0.04% : 0.000020s : 1: remove_dup_value 1.28% : 0.000589s : 1: renormalize.infer 0.89% : 0.000408s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000049s : 1: rewriter_after_opt_a 0.24% : 0.000109s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.20% : 0.000092s : 1: symbol_engine_optimizer 0.20% : 0.000093s : 1: tuple_transform 44.60% : 0.020556s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:07.585.776 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:07.586.081 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0975777, [21] [bootstrap]: 0.00045297 [type_inference]: 0.0658009 [event_method]: 2.319e-05 [auto_monad]: 7.064e-05 [graph_reusing]: 5.79e-06 [inline]: 2.54001e-06 [add_attr]: 0.00390158, [1] [add_attr_with_inline]: 0.00388981, [1] [Cycle 1]: 9.3e-05, [2] [tag_attr]: 2.528e-05 [meta_addattr_fg_expand]: 6.29001e-06 [parallel-infer-symbol]: 4.12e-06 [pre_auto_parallel]: 4.269e-05 [insert-virtual-dataset]: 2.86999e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 2.00002e-06 [pipeline_split]: 1.63002e-06 [optimize]: 0.0197107, [53] [py_interpret_to_execute]: 4.501e-05 [rewriter_before_opt_a]: 0.00010622 [opt_a]: 0.0042394, [2] [Cycle 1]: 0.00307185, [45] [expand_dump_flag]: 3.51999e-06 [switch_simplify]: 4.448e-05 [loop_unroll]: 3.262e-05 [a_1]: 0.0007477 [with_stream_mark]: 2.276e-05 [recompute_prepare]: 1.236e-05 [updatestate_depend_eliminate]: 5.57001e-06 [updatestate_assign_eliminate]: 4.94998e-06 [updatestate_loads_eliminate]: 4.37e-06 [parameter_eliminate]: 2.02001e-06 [a_2]: 0.00017241 [accelerated_algorithm]: 1.061e-05 [shard]: 2.21e-06 [meta_shard_fg_expand]: 2.63998e-06 [shard_inline]: 1.013e-05 [merge_send_recv]: 1.148e-05 [auto_parallel]: 1.053e-05 [parallel]: 2.263e-05 [flash_sp]: 1.131e-05 [merge_comm]: 6.08998e-06 [allreduce_fusion]: 5.79999e-06 [matmul_add_comm_reduction]: 1.172e-05 [allreduce_slice_to_reducescatter]: 9.50007e-07 [virtual_shard_identity]: 1.4e-05 [virtual_dataset]: 1.005e-05 [get_grad_eliminate_]: 9.54e-06 [virtual_output]: 1.545e-05 [merge_forward]: 6.60002e-06 [cell_reuse_recompute_pass]: 1.69e-06 [offload_activation]: 1.374e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.424e-05 [merge_recompute_call_nodes]: 1.76e-06 [before_grad]: 1.642e-05 [set_forward_comm_id_for_comm_node_pass]: 5.56002e-06 [meta_fg_expand]: 4.54002e-06 [flash_sp_send_recv_attached]: 4.02e-06 [receive_attached]: 2.88e-06 [after_resolve]: 1.649e-05 [a_after_grad]: 1.462e-05 [renormalize]: 0.00113112 [add_forward_monad_depend]: 1.002e-05 [auto_monad_grad]: 2.39001e-06 [auto_monad_eliminator]: 2.629e-05 [cse]: 4.769e-05 [a_3]: 9.384e-05 [Cycle 2]: 0.0011475, [45] [expand_dump_flag]: 2.37001e-06 [switch_simplify]: 1.166e-05 [loop_unroll]: 9.72999e-06 [a_1]: 0.00022777 [with_stream_mark]: 2.191e-05 [recompute_prepare]: 1.013e-05 [updatestate_depend_eliminate]: 6.37001e-06 [updatestate_assign_eliminate]: 4.28999e-06 [updatestate_loads_eliminate]: 4.78001e-06 [parameter_eliminate]: 1.71998e-06 [a_2]: 0.00014176 [accelerated_algorithm]: 9.82001e-06 [shard]: 2.35002e-06 [meta_shard_fg_expand]: 2.94001e-06 [shard_inline]: 9.94001e-06 [merge_send_recv]: 9.47001e-06 [auto_parallel]: 1.104e-05 [parallel]: 1.044e-05 [flash_sp]: 3.93001e-06 [merge_comm]: 4.98001e-06 [allreduce_fusion]: 5.27001e-06 [matmul_add_comm_reduction]: 1.16e-05 [allreduce_slice_to_reducescatter]: 7.10017e-07 [virtual_shard_identity]: 1.013e-05 [virtual_dataset]: 9.40001e-06 [get_grad_eliminate_]: 9.02e-06 [virtual_output]: 8.95001e-06 [merge_forward]: 5.67001e-06 [cell_reuse_recompute_pass]: 3.78001e-06 [offload_activation]: 1.332e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.201e-05 [merge_recompute_call_nodes]: 1.69998e-06 [before_grad]: 1.525e-05 [set_forward_comm_id_for_comm_node_pass]: 6.17999e-06 [meta_fg_expand]: 4.08999e-06 [flash_sp_send_recv_attached]: 1.92999e-06 [receive_attached]: 2.51e-06 [after_resolve]: 1.552e-05 [a_after_grad]: 1.363e-05 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 2.71e-06 [auto_monad_grad]: 1.81998e-06 [auto_monad_eliminator]: 1.435e-05 [cse]: 3.061e-05 [a_3]: 7.056e-05 [py_interpret_to_execute_after_opt_a]: 2.712e-05 [slice_cell_reuse_recomputed_activation]: 5.34e-06 [rewriter_after_opt_a]: 6.307e-05 [convert_after_rewriter]: 1.281e-05 [order_py_execute_after_rewriter]: 9.87001e-06 [mutable_eliminate]: 0.00089761 [opt_b]: 0.00297036, [1] [Cycle 1]: 0.00295032, [7] [b_1]: 0.0017696 [b_2]: 0.000172 [updatestate_depend_eliminate]: 0.00017063 [updatestate_assign_eliminate]: 8.05e-06 [updatestate_loads_eliminate]: 1.096e-05 [renormalize]: 9.80013e-07 [cse]: 0.00020935 [optimize_parallel_all_gather_comm]: 0.00019035 [overlap_param_gather]: 8.03001e-06 [cconv]: 0.00019461 [loop_unroll]: 0.00384627 [opt_after_cconv]: 0.00122462, [1] [Cycle 1]: 0.00120339, [7] [c_1]: 0.00037012 [parameter_eliminate]: 8.05e-06 [updatestate_depend_eliminate]: 0.00015646 [updatestate_assign_eliminate]: 7.80998e-06 [updatestate_loads_eliminate]: 0.00014318 [cse]: 0.00021155 [renormalize]: 8.39995e-07 [remove_dup_value]: 0.0003663 [tuple_transform]: 0.00079659, [1] [Cycle 1]: 0.00077796, [4] [d_1]: 0.00038198 [none_parameter_eliminate]: 5.04998e-06 [renormalize]: 3.89991e-07 [switch_simplify]: 0.00016854 [partial_unused_args_eliminate]: 0.00015468 [add_recomputation]: 0.00039274 [cse_after_recomputation]: 0.00038092, [1] [Cycle 1]: 0.00036253, [1] [cse]: 0.00019183 [environ_conv]: 1.89e-05 [swap_dp_allreduce_reducescatter]: 0.00016895 [bias_add_comm_swap]: 8.72e-06 [label_micro_interleaved_index]: 0.00015434 [label_fine_grained_interleaved_index]: 1.267e-05 [merge_cast_opt]: 4.63999e-06 [slice_recompute_activation]: 5.34e-06 [micro_interleaved_order_control]: 6.11998e-06 [assign_add_opt]: 1.347e-05 [ForceFp32Comm]: 4.97e-06 [remove_cast_before_assign_add]: 3.70998e-06 [full_micro_interleaved_order_control]: 6.36998e-06 [reorder_send_recv_between_fp_bp]: 0.00014372 [comm_op_add_attrs]: 9.27001e-06 [add_comm_op_reuse_tag]: 4.05e-06 [interleave_split_concat_branches]: 5.66e-06 [interleave_parallel_branches]: 3.78999e-06 [overlap_opt_shard_in_pipeline]: 1.417e-05 [overlap_opt_shard_grad_in_pipeline]: 7.08e-06 [control_data_broadcast_order]: 0.00018502 [grouped_pairwise_exchange_alltoall]: 6.11e-06 [offloading_packed_experts]: 1.202e-05 [overlap_recompute_and_grad_model_parallel]: 2.488e-05 [overlap_grad_matmul_and_grad_allreduce]: 4.89e-06 [overlap_recompute_allgather_and_fa_grad]: 4.26001e-06 [overlap_recompute_comm]: 1.826e-05 [overlap_grad_ring_attention]: 1.639e-05 [overlap_grad_flash_sp]: 0.00019835 [begin_end_overlap_inline]: 0.0001477 [split_matmul_comm_elemetwise]: 9.44998e-06 [split_layernorm_comm]: 4.67998e-06 [handle_group_info]: 4.34002e-06 [symbol_engine_optimizer]: 0.00138433, [1] [Cycle 1]: 0.00135202, [6] [build]: 1.095e-05 [elim_shapecalc]: 0.00043029 [elim_not_effective]: 4.873e-05 [opt_reshape]: 2.304e-05 [fold_const_symbol]: 0.00016565 [renormalize]: 1.40999e-06 [detach_backward]: 1.217e-05 [pipeline_parallel_scheduler]: 6.63e-06 [auto_monad_reorder]: 0.0001881 [get_jit_bprop_graph]: 4.53999e-06 [rewriter_after_jit_bprop_graph]: 1.346e-05 [opt_after_jit_grad]: 0.00417944 [validate]: 0.0002255 Sums bootstrap : 0.000453s : 0.52% type_inference : 0.065801s : 75.89% event_method : 0.000023s : 0.03% auto_monad : 0.000071s : 0.08% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000043s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000045s : 0.05% optimize.rewriter_before_opt_a : 0.000106s : 0.12% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000056s : 0.06% optimize.opt_a.loop_unroll : 0.000042s : 0.05% optimize.opt_a.a_1 : 0.000975s : 1.12% optimize.opt_a.with_stream_mark : 0.000045s : 0.05% optimize.opt_a.recompute_prepare : 0.000022s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000012s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000314s : 0.36% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.02% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.01% optimize.opt_a.shard_inline : 0.000020s : 0.02% optimize.opt_a.merge_send_recv : 0.000021s : 0.02% optimize.opt_a.auto_parallel : 0.000022s : 0.02% optimize.opt_a.parallel : 0.000033s : 0.04% optimize.opt_a.flash_sp : 0.000015s : 0.02% optimize.opt_a.merge_comm : 0.000011s : 0.01% optimize.opt_a.allreduce_fusion : 0.000011s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.03% optimize.opt_a.virtual_dataset : 0.000019s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.02% optimize.opt_a.virtual_output : 0.000024s : 0.03% optimize.opt_a.merge_forward : 0.000012s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000027s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000046s : 0.05% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000032s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.01% optimize.opt_a.meta_fg_expand : 0.000009s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000032s : 0.04% optimize.opt_a.a_after_grad : 0.000028s : 0.03% optimize.opt_a.renormalize : 0.001131s : 1.30% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000041s : 0.05% optimize.opt_a.cse : 0.000078s : 0.09% optimize.opt_a.a_3 : 0.000164s : 0.19% optimize.py_interpret_to_execute_after_opt_a : 0.000027s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000063s : 0.07% optimize.convert_after_rewriter : 0.000013s : 0.01% optimize.order_py_execute_after_rewriter : 0.000010s : 0.01% optimize.mutable_eliminate : 0.000898s : 1.04% optimize.opt_b.b_1 : 0.001770s : 2.04% optimize.opt_b.b_2 : 0.000172s : 0.20% optimize.opt_b.updatestate_depend_eliminate : 0.000171s : 0.20% optimize.opt_b.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000011s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000209s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000190s : 0.22% optimize.overlap_param_gather : 0.000008s : 0.01% optimize.cconv : 0.000195s : 0.22% optimize.loop_unroll : 0.003846s : 4.44% optimize.opt_after_cconv.c_1 : 0.000370s : 0.43% optimize.opt_after_cconv.parameter_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000156s : 0.18% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000143s : 0.17% optimize.opt_after_cconv.cse : 0.000212s : 0.24% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000366s : 0.42% optimize.tuple_transform.d_1 : 0.000382s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000005s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000169s : 0.19% optimize.partial_unused_args_eliminate : 0.000155s : 0.18% optimize.add_recomputation : 0.000393s : 0.45% optimize.cse_after_recomputation.cse : 0.000192s : 0.22% optimize.environ_conv : 0.000019s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000169s : 0.19% optimize.bias_add_comm_swap : 0.000009s : 0.01% optimize.label_micro_interleaved_index : 0.000154s : 0.18% optimize.label_fine_grained_interleaved_index : 0.000013s : 0.01% optimize.merge_cast_opt : 0.000005s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000006s : 0.01% optimize.assign_add_opt : 0.000013s : 0.02% optimize.ForceFp32Comm : 0.000005s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000006s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000144s : 0.17% optimize.comm_op_add_attrs : 0.000009s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000006s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000014s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000007s : 0.01% optimize.control_data_broadcast_order : 0.000185s : 0.21% optimize.grouped_pairwise_exchange_alltoall : 0.000006s : 0.01% optimize.offloading_packed_experts : 0.000012s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000025s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000005s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000018s : 0.02% optimize.overlap_grad_ring_attention : 0.000016s : 0.02% optimize.overlap_grad_flash_sp : 0.000198s : 0.23% optimize.begin_end_overlap_inline : 0.000148s : 0.17% optimize.split_matmul_comm_elemetwise : 0.000009s : 0.01% optimize.split_layernorm_comm : 0.000005s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000430s : 0.50% optimize.symbol_engine_optimizer.elim_not_effective : 0.000049s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000023s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000166s : 0.19% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000012s : 0.01% pipeline_parallel_scheduler : 0.000007s : 0.01% auto_monad_reorder : 0.000188s : 0.22% get_jit_bprop_graph : 0.000005s : 0.01% rewriter_after_jit_bprop_graph : 0.000013s : 0.02% opt_after_jit_grad : 0.004179s : 4.82% validate : 0.000225s : 0.26% Time group info: ------[substitution.] 0.000276 49 14.57% : 0.000040s : 6: substitution.cast_eliminate 1.39% : 0.000004s : 4: substitution.elim_not_effective 1.27% : 0.000004s : 4: substitution.fold_const_symbol 5.15% : 0.000014s : 6: substitution.graph_param_transform 61.36% : 0.000169s : 4: substitution.inline 2.43% : 0.000007s : 8: substitution.j_node_and_user_rematch 3.15% : 0.000009s : 8: substitution.remove_not_recompute_node 2.72% : 0.000008s : 4: substitution.replace_old_param 4.65% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator 3.31% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.065736 2 98.68% : 0.064870s : 1: type_inference.infer 1.32% : 0.000866s : 1: type_inference.specialize ------[replace.] 0.000066 8 63.61% : 0.000042s : 4: replace.inline 36.39% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000177 8 93.86% : 0.000166s : 4: match.inline 6.14% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000885 1730 0.28% : 0.000003s : 17: predicate.accumulaten_eliminater 0.66% : 0.000006s : 6: predicate.ad_related_special_op_eliminate 0.20% : 0.000002s : 12: predicate.addn_check_dump 0.30% : 0.000003s : 17: predicate.addn_zero_filter 0.26% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 0.79% : 0.000007s : 29: predicate.arithmetic_simplify 0.35% : 0.000003s : 17: predicate.cast_eliminate 0.22% : 0.000002s : 12: predicate.check_bprop_eliminate 0.25% : 0.000002s : 12: predicate.compare_switch_simplify 0.09% : 0.000001s : 6: predicate.const_output_eliminate 0.25% : 0.000002s : 12: predicate.depend_value_elim 0.28% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.35% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.28% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.70% : 0.000006s : 12: predicate.dumpgradient_eliminate 0.12% : 0.000001s : 6: predicate.elim_not_effective 43.59% : 0.000386s : 6: predicate.elim_shapecalc_of_broadcastargs 0.48% : 0.000004s : 23: predicate.environ_add_const_eliminate 0.37% : 0.000003s : 23: predicate.environ_get_add_eliminate 0.53% : 0.000005s : 23: predicate.environ_get_depend_swap 0.62% : 0.000005s : 35: predicate.environ_get_eliminate 0.47% : 0.000004s : 23: predicate.environ_get_set_eliminate 0.40% : 0.000004s : 25: predicate.exchange_switch_depend_value 0.68% : 0.000006s : 25: predicate.float_depend_g_call 0.20% : 0.000002s : 12: predicate.float_environ_get_switch 0.34% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.09% : 0.000001s : 6: predicate.fold_const_symbol 0.28% : 0.000002s : 12: predicate.get_grad_eliminate 0.13% : 0.000001s : 6: predicate.graph_param_transform 0.25% : 0.000002s : 12: predicate.incorporate_call 0.19% : 0.000002s : 12: predicate.incorporate_call_switch 19.49% : 0.000173s : 78: predicate.inline 0.26% : 0.000002s : 12: predicate.inline_without_move 0.10% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.30% : 0.000003s : 12: predicate.less_batch_normalization 0.80% : 0.000007s : 33: predicate.list_to_tuple_eliminator_ 0.84% : 0.000007s : 50: predicate.load_eliminater 0.93% : 0.000008s : 6: predicate.loop_unroll_after_grad 0.64% : 0.000006s : 38: predicate.loop_unroll_before_grad 0.83% : 0.000007s : 29: predicate.make_slice_get_slice_eliminator 0.22% : 0.000002s : 12: predicate.merge_addn 0.21% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.25% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.25% : 0.000002s : 17: predicate.minmaximum_grad 0.75% : 0.000007s : 6: predicate.mutable_eliminate 0.35% : 0.000003s : 6: predicate.opt_reshape 0.24% : 0.000002s : 6: predicate.parallel_virtual_node 0.50% : 0.000004s : 25: predicate.partial_defer_inline 0.50% : 0.000004s : 27: predicate.partial_eliminate 0.29% : 0.000003s : 17: predicate.print_const_string_wrapper 0.22% : 0.000002s : 12: predicate.reduce_all_const_elim 0.43% : 0.000004s : 17: predicate.reduce_eliminate 0.86% : 0.000008s : 50: predicate.redundant_stop_gradient_eliminater 0.13% : 0.000001s : 12: predicate.remove_not_recompute_node 0.44% : 0.000004s : 33: predicate.replace_applicator 0.18% : 0.000002s : 12: predicate.replace_old_param 0.15% : 0.000001s : 6: predicate.reset_defer_inline 0.37% : 0.000003s : 17: predicate.reshape_eliminate 0.24% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.47% : 0.000004s : 6: predicate.row_tensor_eliminate 0.28% : 0.000002s : 12: predicate.same_eliminate 0.14% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.33% : 0.000003s : 12: predicate.shard_identity_eliminate 0.56% : 0.000005s : 12: predicate.special_op_eliminate 0.26% : 0.000002s : 12: predicate.specialize_transform 0.37% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 0.27% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.19% : 0.000002s : 6: predicate.switch_call_monad_eliminater 0.43% : 0.000004s : 25: predicate.switch_defer_inline 0.61% : 0.000005s : 37: predicate.switch_layer_defer_inline 1.98% : 0.000017s : 81: predicate.switch_simplify 0.28% : 0.000003s : 17: predicate.tile_eliminate 0.32% : 0.000003s : 17: predicate.transpose_eliminate 0.72% : 0.000006s : 29: predicate.tuple_list_convert_item_index_to_positive 0.76% : 0.000007s : 29: predicate.tuple_list_get_item_const_eliminator 0.60% : 0.000005s : 29: predicate.tuple_list_get_item_depend_reorder 1.18% : 0.000010s : 45: predicate.tuple_list_get_item_eliminator 0.55% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 0.84% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 0.57% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 0.98% : 0.000009s : 50: predicate.updatestate_pure_node_eliminater 1.29% : 0.000011s : 62: predicate.updatestate_useless_node_eliminater 0.41% : 0.000004s : 6: predicate.value_based_eliminate 0.26% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.29% : 0.000003s : 12: predicate.virtual_output_eliminate 0.13% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.15% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000700 11 53.82% : 0.000377s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.18% : 0.000323s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.125970 192 0.01% : 0.000008s : 1: ForceFp32Comm 3.11% : 0.003913s : 1: add_attr 3.09% : 0.003894s : 1: add_attr_with_inline 0.01% : 0.000008s : 1: add_comm_op_reuse_tag 0.32% : 0.000403s : 1: add_recomputation 0.02% : 0.000020s : 1: assign_add_opt 0.06% : 0.000080s : 1: auto_monad 0.28% : 0.000346s : 1: auto_monad_reorder 0.13% : 0.000159s : 1: begin_end_overlap_inline 0.01% : 0.000013s : 1: bias_add_comm_swap 0.39% : 0.000497s : 1: bootstrap 0.16% : 0.000206s : 1: cconv 0.01% : 0.000014s : 1: comm_op_add_attrs 0.15% : 0.000194s : 1: control_data_broadcast_order 0.01% : 0.000016s : 1: convert_after_rewriter 0.31% : 0.000389s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.17% : 0.000211s : 1: detach_backward 0.02% : 0.000024s : 1: environ_conv 0.03% : 0.000034s : 1: event_method 0.01% : 0.000009s : 1: full_micro_interleaved_order_control 0.02% : 0.000021s : 1: get_jit_bprop_graph 0.01% : 0.000012s : 1: graph_reusing 0.01% : 0.000010s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000007s : 1: interleave_parallel_branches 0.01% : 0.000009s : 1: interleave_split_concat_branches 0.01% : 0.000018s : 1: label_fine_grained_interleaved_index 0.13% : 0.000164s : 1: label_micro_interleaved_index 3.07% : 0.003869s : 1: loop_unroll 0.01% : 0.000008s : 1: merge_cast_opt 0.12% : 0.000157s : 1: micro_interleaved_order_control 0.72% : 0.000908s : 1: mutable_eliminate 0.12% : 0.000155s : 1: offloading_packed_experts 0.04% : 0.000048s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000033s : 1: opt.transform.mutable_eliminate 1.31% : 0.001649s : 78: opt.transform.opt_a 0.17% : 0.000217s : 1: opt.transform.opt_after_cconv 0.16% : 0.000195s : 1: opt.transform.opt_after_jit_grad 1.04% : 0.001309s : 28: opt.transform.opt_b 0.43% : 0.000540s : 2: opt.transform.opt_trans_graph 0.52% : 0.000651s : 4: opt.transform.symbol_engine_opt 3.37% : 0.004244s : 1: opt_a 0.98% : 0.001232s : 1: opt_after_cconv 3.45% : 0.004352s : 1: opt_after_jit_grad 2.36% : 0.002977s : 1: opt_b 17.00% : 0.021410s : 1: optimize 0.16% : 0.000200s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000013s : 1: order_py_execute_after_rewriter 0.16% : 0.000207s : 1: overlap_grad_flash_sp 0.01% : 0.000008s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000020s : 1: overlap_grad_ring_attention 0.01% : 0.000010s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000024s : 1: overlap_opt_shard_in_pipeline 0.12% : 0.000152s : 1: overlap_param_gather 0.11% : 0.000142s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000031s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000026s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.13% : 0.000165s : 1: partial_unused_args_eliminate 0.02% : 0.000031s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.04% : 0.000050s : 1: pre_auto_parallel 0.04% : 0.000049s : 1: py_interpret_to_execute 0.02% : 0.000031s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.30% : 0.000378s : 1: remove_dup_value 0.53% : 0.000662s : 1: renormalize.infer 0.36% : 0.000456s : 1: renormalize.specialize 0.12% : 0.000157s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000023s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000069s : 1: rewriter_after_opt_a 0.09% : 0.000110s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.01% : 0.000014s : 1: split_matmul_comm_elemetwise 0.14% : 0.000177s : 1: swap_dp_allreduce_reducescatter 1.11% : 0.001393s : 1: symbol_engine_optimizer 0.64% : 0.000803s : 1: tuple_transform 52.27% : 0.065848s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:08.526.675 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0523818, [21] [bootstrap]: 0.00043526 [type_inference]: 0.0262632 [event_method]: 2.481e-05 [auto_monad]: 7.818e-05 [graph_reusing]: 5.81e-06 [inline]: 3.13e-06 [add_attr]: 0.0064455, [1] [add_attr_with_inline]: 0.00642904, [1] [Cycle 1]: 8.17e-05, [2] [tag_attr]: 2.749e-05 [meta_addattr_fg_expand]: 6.09001e-06 [parallel-infer-symbol]: 3.75998e-06 [pre_auto_parallel]: 4.654e-05 [insert-virtual-dataset]: 2.42001e-06 [parallel-infer-symbol-second]: 8.49977e-07 [dataset_repeat_opt]: 1.78002e-06 [pipeline_split]: 2.20002e-06 [optimize]: 0.0182752, [53] [py_interpret_to_execute]: 3.742e-05 [rewriter_before_opt_a]: 0.00011158 [opt_a]: 0.015546, [2] [Cycle 1]: 0.0144782, [45] [expand_dump_flag]: 3.59002e-06 [switch_simplify]: 4.599e-05 [loop_unroll]: 3.299e-05 [a_1]: 0.00084651 [with_stream_mark]: 2.759e-05 [recompute_prepare]: 1.496e-05 [updatestate_depend_eliminate]: 6.01998e-06 [updatestate_assign_eliminate]: 1.906e-05 [updatestate_loads_eliminate]: 5.05999e-06 [parameter_eliminate]: 5.46e-06 [a_2]: 0.00013782 [accelerated_algorithm]: 1.235e-05 [shard]: 3.53999e-06 [meta_shard_fg_expand]: 3.97e-06 [shard_inline]: 9.82001e-06 [merge_send_recv]: 1.275e-05 [auto_parallel]: 1.467e-05 [parallel]: 2.244e-05 [flash_sp]: 1.35e-05 [merge_comm]: 6.58e-06 [allreduce_fusion]: 4.89e-06 [matmul_add_comm_reduction]: 1.381e-05 [allreduce_slice_to_reducescatter]: 8.30012e-07 [virtual_shard_identity]: 1.234e-05 [virtual_dataset]: 9.57001e-06 [get_grad_eliminate_]: 9.76e-06 [virtual_output]: 9.43002e-06 [merge_forward]: 5.94999e-06 [cell_reuse_recompute_pass]: 2.27999e-06 [offload_activation]: 1.296e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.522e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.677e-05 [set_forward_comm_id_for_comm_node_pass]: 5.73002e-06 [meta_fg_expand]: 4.82e-06 [flash_sp_send_recv_attached]: 2.99999e-06 [receive_attached]: 2.53998e-06 [after_resolve]: 1.69e-05 [a_after_grad]: 1.468e-05 [renormalize]: 0.0113818 [add_forward_monad_depend]: 1.269e-05 [auto_monad_grad]: 2.94999e-06 [auto_monad_eliminator]: 3.114e-05 [cse]: 5.34e-05 [a_3]: 9.012e-05 [Cycle 2]: 0.00105306, [45] [expand_dump_flag]: 2.58e-06 [switch_simplify]: 1.333e-05 [loop_unroll]: 1.027e-05 [a_1]: 0.00025374 [with_stream_mark]: 2.377e-05 [recompute_prepare]: 1.119e-05 [updatestate_depend_eliminate]: 6.37001e-06 [updatestate_assign_eliminate]: 8.08001e-06 [updatestate_loads_eliminate]: 4.45e-06 [parameter_eliminate]: 1.92001e-06 [a_2]: 0.00012486 [accelerated_algorithm]: 1.059e-05 [shard]: 2.52001e-06 [meta_shard_fg_expand]: 2.86999e-06 [shard_inline]: 9.66e-06 [merge_send_recv]: 1.098e-05 [auto_parallel]: 1.158e-05 [parallel]: 1.012e-05 [flash_sp]: 4.57003e-06 [merge_comm]: 5.70001e-06 [allreduce_fusion]: 5.25001e-06 [matmul_add_comm_reduction]: 1.413e-05 [allreduce_slice_to_reducescatter]: 9.09989e-07 [virtual_shard_identity]: 1.181e-05 [virtual_dataset]: 9.32001e-06 [get_grad_eliminate_]: 9.36e-06 [virtual_output]: 9.81e-06 [merge_forward]: 6.01e-06 [cell_reuse_recompute_pass]: 4.18999e-06 [offload_activation]: 1.545e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.138e-05 [merge_recompute_call_nodes]: 1.84e-06 [before_grad]: 1.734e-05 [set_forward_comm_id_for_comm_node_pass]: 7.97e-06 [meta_fg_expand]: 3.98999e-06 [flash_sp_send_recv_attached]: 2.04e-06 [receive_attached]: 2.58e-06 [after_resolve]: 1.722e-05 [a_after_grad]: 1.494e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 2.26e-06 [auto_monad_grad]: 1.54e-06 [auto_monad_eliminator]: 1.492e-05 [cse]: 2.738e-05 [a_3]: 6.052e-05 [py_interpret_to_execute_after_opt_a]: 2.268e-05 [slice_cell_reuse_recomputed_activation]: 2.16998e-06 [rewriter_after_opt_a]: 5.525e-05 [convert_after_rewriter]: 9.37999e-06 [order_py_execute_after_rewriter]: 6.68998e-06 [mutable_eliminate]: 0.0008006 [opt_b]: 0.00031912, [1] [Cycle 1]: 0.00031041, [7] [b_1]: 0.00020321 [b_2]: 1.193e-05 [updatestate_depend_eliminate]: 1.141e-05 [updatestate_assign_eliminate]: 4.14002e-06 [updatestate_loads_eliminate]: 3.76999e-06 [renormalize]: 9.90025e-07 [cse]: 3.542e-05 [optimize_parallel_all_gather_comm]: 2.149e-05 [overlap_param_gather]: 1.94999e-06 [cconv]: 3.464e-05 [loop_unroll]: 0.00046159 [opt_after_cconv]: 0.00013784, [1] [Cycle 1]: 0.00013082, [7] [c_1]: 4.453e-05 [parameter_eliminate]: 5.21998e-06 [updatestate_depend_eliminate]: 7.28999e-06 [updatestate_assign_eliminate]: 3.78001e-06 [updatestate_loads_eliminate]: 3.45e-06 [cse]: 3.015e-05 [renormalize]: 5.79981e-07 [remove_dup_value]: 5.156e-05 [tuple_transform]: 0.00010036, [1] [Cycle 1]: 9.489e-05, [4] [d_1]: 6.38e-05 [none_parameter_eliminate]: 1.89999e-06 [renormalize]: 2.40019e-07 [switch_simplify]: 9.81e-06 [partial_unused_args_eliminate]: 1.96003e-06 [add_recomputation]: 6.714e-05 [cse_after_recomputation]: 3.013e-05, [1] [Cycle 1]: 2.47e-05, [1] [cse]: 1.85e-05 [environ_conv]: 6.77002e-06 [swap_dp_allreduce_reducescatter]: 6.91999e-06 [bias_add_comm_swap]: 3.01999e-06 [label_micro_interleaved_index]: 5.77999e-06 [label_fine_grained_interleaved_index]: 2.92002e-06 [merge_cast_opt]: 1.39e-06 [slice_recompute_activation]: 2.00002e-06 [micro_interleaved_order_control]: 2.49001e-06 [assign_add_opt]: 1.37999e-06 [ForceFp32Comm]: 1.29e-06 [remove_cast_before_assign_add]: 9.89996e-07 [full_micro_interleaved_order_control]: 2.31e-06 [reorder_send_recv_between_fp_bp]: 2.59001e-06 [comm_op_add_attrs]: 9.70002e-07 [add_comm_op_reuse_tag]: 9.30013e-07 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.24998e-06 [overlap_opt_shard_in_pipeline]: 1.52999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.07001e-06 [control_data_broadcast_order]: 1.782e-05 [grouped_pairwise_exchange_alltoall]: 1.88002e-06 [offloading_packed_experts]: 4.90999e-06 [overlap_recompute_and_grad_model_parallel]: 6.12001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.24999e-06 [overlap_grad_ring_attention]: 4.90999e-06 [overlap_grad_flash_sp]: 2.89e-05 [begin_end_overlap_inline]: 5.89993e-07 [split_matmul_comm_elemetwise]: 2.28002e-06 [split_layernorm_comm]: 1.76e-06 [handle_group_info]: 1.44998e-06 [symbol_engine_optimizer]: 9.838e-05, [1] [Cycle 1]: 9.293e-05, [6] [build]: 4.77998e-06 [elim_shapecalc]: 1.384e-05 [elim_not_effective]: 1.775e-05 [opt_reshape]: 9.59e-06 [fold_const_symbol]: 1.453e-05 [renormalize]: 1.60013e-07 [detach_backward]: 2.37001e-06 [pipeline_parallel_scheduler]: 1.67999e-06 [auto_monad_reorder]: 2.412e-05 [get_jit_bprop_graph]: 1.96e-06 [rewriter_after_jit_bprop_graph]: 5.51e-06 [opt_after_jit_grad]: 0.00052766 [validate]: 5.331e-05 Sums bootstrap : 0.000435s : 1.00% type_inference : 0.026263s : 60.17% event_method : 0.000025s : 0.06% auto_monad : 0.000078s : 0.18% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000047s : 0.11% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000037s : 0.09% optimize.rewriter_before_opt_a : 0.000112s : 0.26% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000059s : 0.14% optimize.opt_a.loop_unroll : 0.000043s : 0.10% optimize.opt_a.a_1 : 0.001100s : 2.52% optimize.opt_a.with_stream_mark : 0.000051s : 0.12% optimize.opt_a.recompute_prepare : 0.000026s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000012s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000027s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000010s : 0.02% optimize.opt_a.parameter_eliminate : 0.000007s : 0.02% optimize.opt_a.a_2 : 0.000263s : 0.60% optimize.opt_a.accelerated_algorithm : 0.000023s : 0.05% optimize.opt_a.shard : 0.000006s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000007s : 0.02% optimize.opt_a.shard_inline : 0.000019s : 0.04% optimize.opt_a.merge_send_recv : 0.000024s : 0.05% optimize.opt_a.auto_parallel : 0.000026s : 0.06% optimize.opt_a.parallel : 0.000033s : 0.07% optimize.opt_a.flash_sp : 0.000018s : 0.04% optimize.opt_a.merge_comm : 0.000012s : 0.03% optimize.opt_a.allreduce_fusion : 0.000010s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000028s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.06% optimize.opt_a.virtual_dataset : 0.000019s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.04% optimize.opt_a.virtual_output : 0.000019s : 0.04% optimize.opt_a.merge_forward : 0.000012s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.01% optimize.opt_a.offload_activation : 0.000028s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000047s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000034s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000014s : 0.03% optimize.opt_a.meta_fg_expand : 0.000009s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000034s : 0.08% optimize.opt_a.a_after_grad : 0.000030s : 0.07% optimize.opt_a.renormalize : 0.011382s : 26.08% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000046s : 0.11% optimize.opt_a.cse : 0.000081s : 0.19% optimize.opt_a.a_3 : 0.000151s : 0.35% optimize.py_interpret_to_execute_after_opt_a : 0.000023s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000055s : 0.13% optimize.convert_after_rewriter : 0.000009s : 0.02% optimize.order_py_execute_after_rewriter : 0.000007s : 0.02% optimize.mutable_eliminate : 0.000801s : 1.83% optimize.opt_b.b_1 : 0.000203s : 0.47% optimize.opt_b.b_2 : 0.000012s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000035s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.05% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000035s : 0.08% optimize.loop_unroll : 0.000462s : 1.06% optimize.opt_after_cconv.c_1 : 0.000045s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000030s : 0.07% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000052s : 0.12% optimize.tuple_transform.d_1 : 0.000064s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000067s : 0.15% optimize.cse_after_recomputation.cse : 0.000018s : 0.04% optimize.environ_conv : 0.000007s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000006s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000029s : 0.07% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000024s : 0.06% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000528s : 1.21% validate : 0.000053s : 0.12% Time group info: ------[substitution.] 0.000299 49 14.80% : 0.000044s : 6: substitution.cast_eliminate 0.88% : 0.000003s : 4: substitution.elim_not_effective 0.66% : 0.000002s : 4: substitution.fold_const_symbol 2.84% : 0.000008s : 6: substitution.graph_param_transform 65.35% : 0.000196s : 4: substitution.inline 2.33% : 0.000007s : 8: substitution.j_node_and_user_rematch 3.10% : 0.000009s : 8: substitution.remove_not_recompute_node 2.63% : 0.000008s : 4: substitution.replace_old_param 4.59% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator 2.83% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.026164 2 86.99% : 0.022760s : 1: type_inference.infer 13.01% : 0.003404s : 1: type_inference.specialize ------[replace.] 0.000103 8 73.22% : 0.000075s : 4: replace.inline 26.78% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000205 8 94.22% : 0.000193s : 4: match.inline 5.78% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000312 1730 0.93% : 0.000003s : 17: predicate.accumulaten_eliminater 0.76% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.60% : 0.000002s : 12: predicate.addn_check_dump 0.96% : 0.000003s : 17: predicate.addn_zero_filter 0.88% : 0.000003s : 17: predicate.adjust_all_reduce_mul_add 2.36% : 0.000007s : 29: predicate.arithmetic_simplify 1.15% : 0.000004s : 17: predicate.cast_eliminate 0.61% : 0.000002s : 12: predicate.check_bprop_eliminate 0.57% : 0.000002s : 12: predicate.compare_switch_simplify 0.16% : 0.000000s : 6: predicate.const_output_eliminate 0.67% : 0.000002s : 12: predicate.depend_value_elim 0.96% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.03% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.87% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.01% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 6: predicate.elim_not_effective 0.37% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000004s : 23: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 23: predicate.environ_get_depend_swap 1.68% : 0.000005s : 35: predicate.environ_get_eliminate 1.01% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.20% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.13% : 0.000007s : 25: predicate.float_depend_g_call 0.60% : 0.000002s : 12: predicate.float_environ_get_switch 0.87% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 6: predicate.fold_const_symbol 0.76% : 0.000002s : 12: predicate.get_grad_eliminate 0.27% : 0.000001s : 6: predicate.graph_param_transform 0.70% : 0.000002s : 12: predicate.incorporate_call 0.55% : 0.000002s : 12: predicate.incorporate_call_switch 6.21% : 0.000019s : 78: predicate.inline 0.78% : 0.000002s : 12: predicate.inline_without_move 0.32% : 0.000001s : 12: predicate.j_node_and_user_rematch 1.04% : 0.000003s : 12: predicate.less_batch_normalization 1.88% : 0.000006s : 33: predicate.list_to_tuple_eliminator_ 2.34% : 0.000007s : 50: predicate.load_eliminater 0.90% : 0.000003s : 6: predicate.loop_unroll_after_grad 1.96% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.97% : 0.000006s : 29: predicate.make_slice_get_slice_eliminator 0.60% : 0.000002s : 12: predicate.merge_addn 0.70% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.57% : 0.000002s : 12: predicate.mini_step_allgather_replace 1.01% : 0.000003s : 17: predicate.minmaximum_grad 1.35% : 0.000004s : 6: predicate.mutable_eliminate 0.33% : 0.000001s : 6: predicate.opt_reshape 0.37% : 0.000001s : 6: predicate.parallel_virtual_node 1.58% : 0.000005s : 25: predicate.partial_defer_inline 1.46% : 0.000005s : 27: predicate.partial_eliminate 0.88% : 0.000003s : 17: predicate.print_const_string_wrapper 0.63% : 0.000002s : 12: predicate.reduce_all_const_elim 1.39% : 0.000004s : 17: predicate.reduce_eliminate 2.41% : 0.000008s : 50: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 12: predicate.remove_not_recompute_node 1.36% : 0.000004s : 33: predicate.replace_applicator 0.50% : 0.000002s : 12: predicate.replace_old_param 0.23% : 0.000001s : 6: predicate.reset_defer_inline 1.09% : 0.000003s : 17: predicate.reshape_eliminate 0.66% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.55% : 0.000002s : 6: predicate.row_tensor_eliminate 0.88% : 0.000003s : 12: predicate.same_eliminate 0.41% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.82% : 0.000003s : 12: predicate.shard_identity_eliminate 0.69% : 0.000002s : 12: predicate.special_op_eliminate 0.91% : 0.000003s : 12: predicate.specialize_transform 1.05% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 0.93% : 0.000003s : 12: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.29% : 0.000004s : 25: predicate.switch_defer_inline 1.93% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.53% : 0.000014s : 81: predicate.switch_simplify 0.88% : 0.000003s : 17: predicate.tile_eliminate 0.90% : 0.000003s : 17: predicate.transpose_eliminate 1.47% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.62% : 0.000005s : 29: predicate.tuple_list_get_item_depend_reorder 3.13% : 0.000010s : 45: predicate.tuple_list_get_item_eliminator 1.50% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 2.55% : 0.000008s : 41: predicate.tuple_list_set_item_eliminator 1.76% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.30% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 2.98% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.52% : 0.000002s : 6: predicate.value_based_eliminate 0.73% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.73% : 0.000002s : 12: predicate.virtual_output_eliminate 0.29% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000811 11 42.98% : 0.000348s : 5: func_graph_cloner_run.FuncGraphClonerGraph 57.02% : 0.000462s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.090522 192 0.00% : 0.000004s : 1: ForceFp32Comm 7.13% : 0.006452s : 1: add_attr 7.11% : 0.006434s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.08% : 0.000071s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.09% : 0.000083s : 1: auto_monad 0.03% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.52% : 0.000468s : 1: bootstrap 0.04% : 0.000039s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000021s : 1: control_data_broadcast_order 0.01% : 0.000013s : 1: convert_after_rewriter 0.04% : 0.000033s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.04% : 0.000033s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000009s : 1: label_micro_interleaved_index 0.52% : 0.000470s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.90% : 0.000811s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.02% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000027s : 1: opt.transform.mutable_eliminate 1.97% : 0.001783s : 78: opt.transform.opt_a 0.05% : 0.000043s : 1: opt.transform.opt_after_cconv 0.04% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000179s : 28: opt.transform.opt_b 0.08% : 0.000071s : 2: opt.transform.opt_trans_graph 0.06% : 0.000051s : 4: opt.transform.symbol_engine_opt 17.18% : 0.015550s : 1: opt_a 0.16% : 0.000142s : 1: opt_after_cconv 0.59% : 0.000538s : 1: opt_after_jit_grad 0.36% : 0.000324s : 1: opt_b 20.20% : 0.018281s : 1: optimize 0.03% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000010s : 1: order_py_execute_after_rewriter 0.04% : 0.000032s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.06% : 0.000051s : 1: pre_auto_parallel 0.05% : 0.000043s : 1: py_interpret_to_execute 0.03% : 0.000026s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000056s : 1: remove_dup_value 11.90% : 0.010772s : 1: renormalize.infer 0.65% : 0.000587s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000060s : 1: rewriter_after_opt_a 0.13% : 0.000117s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.11% : 0.000101s : 1: symbol_engine_optimizer 0.11% : 0.000103s : 1: tuple_transform 29.04% : 0.026291s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:09.644.391 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:09.644.666 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0323824, [21] [bootstrap]: 0.00046207 [type_inference]: 0.0186903 [event_method]: 2.218e-05 [auto_monad]: 7.055e-05 [graph_reusing]: 6.76999e-06 [inline]: 3.21001e-06 [add_attr]: 0.00370357, [1] [add_attr_with_inline]: 0.00369055, [1] [Cycle 1]: 9.584e-05, [2] [tag_attr]: 2.574e-05 [meta_addattr_fg_expand]: 5.95002e-06 [parallel-infer-symbol]: 3.61001e-06 [pre_auto_parallel]: 4.106e-05 [insert-virtual-dataset]: 2.39001e-06 [parallel-infer-symbol-second]: 1.15001e-06 [dataset_repeat_opt]: 1.91e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00768988, [53] [py_interpret_to_execute]: 3.703e-05 [rewriter_before_opt_a]: 0.00010146 [opt_a]: 0.0042597, [2] [Cycle 1]: 0.00314165, [45] [expand_dump_flag]: 3.3e-06 [switch_simplify]: 4.409e-05 [loop_unroll]: 3.174e-05 [a_1]: 0.00079614 [with_stream_mark]: 2.514e-05 [recompute_prepare]: 1.491e-05 [updatestate_depend_eliminate]: 6.61e-06 [updatestate_assign_eliminate]: 4.23001e-06 [updatestate_loads_eliminate]: 3.71001e-06 [parameter_eliminate]: 2.22001e-06 [a_2]: 0.00013536 [accelerated_algorithm]: 1.076e-05 [shard]: 2.79999e-06 [meta_shard_fg_expand]: 2.14e-06 [shard_inline]: 9.25999e-06 [merge_send_recv]: 1.13e-05 [auto_parallel]: 1.107e-05 [parallel]: 2.19e-05 [flash_sp]: 1.071e-05 [merge_comm]: 5.23002e-06 [allreduce_fusion]: 4.53001e-06 [matmul_add_comm_reduction]: 1.233e-05 [allreduce_slice_to_reducescatter]: 8.99978e-07 [virtual_shard_identity]: 1.181e-05 [virtual_dataset]: 8.24002e-06 [get_grad_eliminate_]: 8.04002e-06 [virtual_output]: 8.25999e-06 [merge_forward]: 5.44998e-06 [cell_reuse_recompute_pass]: 2.02999e-06 [offload_activation]: 1.191e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.225e-05 [merge_recompute_call_nodes]: 2.03002e-06 [before_grad]: 1.452e-05 [set_forward_comm_id_for_comm_node_pass]: 5.34998e-06 [meta_fg_expand]: 3.76999e-06 [flash_sp_send_recv_attached]: 3.31999e-06 [receive_attached]: 2.19999e-06 [after_resolve]: 1.46e-05 [a_after_grad]: 1.378e-05 [renormalize]: 0.00120156 [add_forward_monad_depend]: 9.77999e-06 [auto_monad_grad]: 2.79999e-06 [auto_monad_eliminator]: 2.542e-05 [cse]: 3.931e-05 [a_3]: 8.721e-05 [Cycle 2]: 0.0010966, [45] [expand_dump_flag]: 2.53e-06 [switch_simplify]: 1.051e-05 [loop_unroll]: 7.93001e-06 [a_1]: 0.00019862 [with_stream_mark]: 2.151e-05 [recompute_prepare]: 7.69002e-06 [updatestate_depend_eliminate]: 5.50001e-06 [updatestate_assign_eliminate]: 3.91999e-06 [updatestate_loads_eliminate]: 3.53e-06 [parameter_eliminate]: 2.36998e-06 [a_2]: 0.00012262 [accelerated_algorithm]: 9.49999e-06 [shard]: 2.93998e-06 [meta_shard_fg_expand]: 2.74999e-06 [shard_inline]: 2.088e-05 [merge_send_recv]: 1.014e-05 [auto_parallel]: 1.09e-05 [parallel]: 9.44e-06 [flash_sp]: 3.81001e-06 [merge_comm]: 4.85001e-06 [allreduce_fusion]: 4.53999e-06 [matmul_add_comm_reduction]: 1.102e-05 [allreduce_slice_to_reducescatter]: 8.80013e-07 [virtual_shard_identity]: 1.103e-05 [virtual_dataset]: 7.74002e-06 [get_grad_eliminate_]: 7.77002e-06 [virtual_output]: 7.61999e-06 [merge_forward]: 5.65001e-06 [cell_reuse_recompute_pass]: 2.88e-06 [offload_activation]: 1.174e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.13e-05 [merge_recompute_call_nodes]: 1.74998e-06 [before_grad]: 1.412e-05 [set_forward_comm_id_for_comm_node_pass]: 5.94e-06 [meta_fg_expand]: 3.65e-06 [flash_sp_send_recv_attached]: 1.86e-06 [receive_attached]: 2.56e-06 [after_resolve]: 1.572e-05 [a_after_grad]: 1.271e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 3.51001e-06 [auto_monad_grad]: 2.06e-06 [auto_monad_eliminator]: 1.619e-05 [cse]: 2.824e-05 [a_3]: 6.347e-05 [py_interpret_to_execute_after_opt_a]: 2.317e-05 [slice_cell_reuse_recomputed_activation]: 5.41998e-06 [rewriter_after_opt_a]: 5.967e-05 [convert_after_rewriter]: 1.163e-05 [order_py_execute_after_rewriter]: 1.02e-05 [mutable_eliminate]: 0.00086072 [opt_b]: 0.00039096, [1] [Cycle 1]: 0.00037736, [7] [b_1]: 0.00023354 [b_2]: 1.293e-05 [updatestate_depend_eliminate]: 1.213e-05 [updatestate_assign_eliminate]: 4.16001e-06 [updatestate_loads_eliminate]: 3.99002e-06 [renormalize]: 1.04998e-06 [cse]: 4.158e-05 [optimize_parallel_all_gather_comm]: 3.071e-05 [overlap_param_gather]: 5.28002e-06 [cconv]: 4.149e-05 [loop_unroll]: 0.00082451 [opt_after_cconv]: 0.00017709, [1] [Cycle 1]: 0.00016382, [7] [c_1]: 4.26e-05 [parameter_eliminate]: 6.21998e-06 [updatestate_depend_eliminate]: 1.013e-05 [updatestate_assign_eliminate]: 3.51999e-06 [updatestate_loads_eliminate]: 3.58e-06 [cse]: 3.776e-05 [renormalize]: 8.10018e-07 [remove_dup_value]: 2.333e-05 [tuple_transform]: 0.00011479, [1] [Cycle 1]: 0.00010645, [4] [d_1]: 6.26e-05 [none_parameter_eliminate]: 1.71998e-06 [renormalize]: 1.49972e-07 [switch_simplify]: 9.27001e-06 [partial_unused_args_eliminate]: 4.97999e-06 [add_recomputation]: 7.441e-05 [cse_after_recomputation]: 3.96e-05, [1] [Cycle 1]: 3.145e-05, [1] [cse]: 2.019e-05 [environ_conv]: 1.169e-05 [swap_dp_allreduce_reducescatter]: 9.14e-06 [bias_add_comm_swap]: 6.28998e-06 [label_micro_interleaved_index]: 1.084e-05 [label_fine_grained_interleaved_index]: 5.15999e-06 [merge_cast_opt]: 4.66002e-06 [slice_recompute_activation]: 4.92e-06 [micro_interleaved_order_control]: 5.02999e-06 [assign_add_opt]: 3.86999e-06 [ForceFp32Comm]: 3.88001e-06 [remove_cast_before_assign_add]: 3.88999e-06 [full_micro_interleaved_order_control]: 4.76002e-06 [reorder_send_recv_between_fp_bp]: 5.54998e-06 [comm_op_add_attrs]: 4.25999e-06 [add_comm_op_reuse_tag]: 3.28e-06 [interleave_split_concat_branches]: 3.44001e-06 [interleave_parallel_branches]: 3.55e-06 [overlap_opt_shard_in_pipeline]: 3.58e-06 [overlap_opt_shard_grad_in_pipeline]: 4.40999e-06 [control_data_broadcast_order]: 2.189e-05 [grouped_pairwise_exchange_alltoall]: 4.67e-06 [offloading_packed_experts]: 8.1e-06 [overlap_recompute_and_grad_model_parallel]: 8.18999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.59002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.67002e-06 [overlap_recompute_comm]: 5.05001e-06 [overlap_grad_ring_attention]: 7.29001e-06 [overlap_grad_flash_sp]: 3.137e-05 [begin_end_overlap_inline]: 3.25e-06 [split_matmul_comm_elemetwise]: 5.52001e-06 [split_layernorm_comm]: 4.60001e-06 [handle_group_info]: 3.52002e-06 [symbol_engine_optimizer]: 0.00012979, [1] [Cycle 1]: 0.0001211, [6] [build]: 5.79999e-06 [elim_shapecalc]: 1.648e-05 [elim_not_effective]: 1.916e-05 [opt_reshape]: 1.094e-05 [fold_const_symbol]: 1.376e-05 [renormalize]: 2.00002e-07 [detach_backward]: 6.59999e-06 [pipeline_parallel_scheduler]: 2.21e-06 [auto_monad_reorder]: 3.051e-05 [get_jit_bprop_graph]: 2.36998e-06 [rewriter_after_jit_bprop_graph]: 8.97e-06 [opt_after_jit_grad]: 0.00086435 [validate]: 6.188e-05 Sums bootstrap : 0.000462s : 1.74% type_inference : 0.018690s : 70.22% event_method : 0.000022s : 0.08% auto_monad : 0.000071s : 0.27% graph_reusing : 0.000007s : 0.03% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.10% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000041s : 0.15% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000037s : 0.14% optimize.rewriter_before_opt_a : 0.000101s : 0.38% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000055s : 0.21% optimize.opt_a.loop_unroll : 0.000040s : 0.15% optimize.opt_a.a_1 : 0.000995s : 3.74% optimize.opt_a.with_stream_mark : 0.000047s : 0.18% optimize.opt_a.recompute_prepare : 0.000023s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000012s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000005s : 0.02% optimize.opt_a.a_2 : 0.000258s : 0.97% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.08% optimize.opt_a.shard : 0.000006s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000030s : 0.11% optimize.opt_a.merge_send_recv : 0.000021s : 0.08% optimize.opt_a.auto_parallel : 0.000022s : 0.08% optimize.opt_a.parallel : 0.000031s : 0.12% optimize.opt_a.flash_sp : 0.000015s : 0.05% optimize.opt_a.merge_comm : 0.000010s : 0.04% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.09% optimize.opt_a.virtual_dataset : 0.000016s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.06% optimize.opt_a.virtual_output : 0.000016s : 0.06% optimize.opt_a.merge_forward : 0.000011s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000024s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000044s : 0.16% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.01% optimize.opt_a.before_grad : 0.000029s : 0.11% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.04% optimize.opt_a.meta_fg_expand : 0.000007s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000030s : 0.11% optimize.opt_a.a_after_grad : 0.000026s : 0.10% optimize.opt_a.renormalize : 0.001202s : 4.51% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.05% optimize.opt_a.auto_monad_grad : 0.000005s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000042s : 0.16% optimize.opt_a.cse : 0.000068s : 0.25% optimize.opt_a.a_3 : 0.000151s : 0.57% optimize.py_interpret_to_execute_after_opt_a : 0.000023s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000060s : 0.22% optimize.convert_after_rewriter : 0.000012s : 0.04% optimize.order_py_execute_after_rewriter : 0.000010s : 0.04% optimize.mutable_eliminate : 0.000861s : 3.23% optimize.opt_b.b_1 : 0.000234s : 0.88% optimize.opt_b.b_2 : 0.000013s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000042s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000031s : 0.12% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000041s : 0.16% optimize.loop_unroll : 0.000825s : 3.10% optimize.opt_after_cconv.c_1 : 0.000043s : 0.16% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000010s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000038s : 0.14% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000023s : 0.09% optimize.tuple_transform.d_1 : 0.000063s : 0.24% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000074s : 0.28% optimize.cse_after_recomputation.cse : 0.000020s : 0.08% optimize.environ_conv : 0.000012s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000011s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000005s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000003s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000022s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.02% optimize.offloading_packed_experts : 0.000008s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.03% optimize.overlap_grad_flash_sp : 0.000031s : 0.12% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000006s : 0.02% optimize.split_layernorm_comm : 0.000005s : 0.02% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000006s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000007s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000031s : 0.11% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000009s : 0.03% opt_after_jit_grad : 0.000864s : 3.25% validate : 0.000062s : 0.23% Time group info: ------[substitution.] 0.000266 39 11.62% : 0.000031s : 3: substitution.cast_eliminate 0.95% : 0.000003s : 3: substitution.elim_not_effective 0.76% : 0.000002s : 3: substitution.fold_const_symbol 3.01% : 0.000008s : 5: substitution.graph_param_transform 66.85% : 0.000178s : 4: substitution.inline 2.39% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.80% : 0.000007s : 6: substitution.remove_not_recompute_node 2.65% : 0.000007s : 4: substitution.replace_old_param 5.88% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator 3.09% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.018623 2 95.25% : 0.017739s : 1: type_inference.infer 4.75% : 0.000884s : 1: type_inference.specialize ------[replace.] 0.000072 8 60.20% : 0.000043s : 4: replace.inline 39.80% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000189 8 92.81% : 0.000175s : 4: match.inline 7.19% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000290 1596 1.05% : 0.000003s : 17: predicate.accumulaten_eliminater 0.90% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 10: predicate.addn_check_dump 0.86% : 0.000002s : 17: predicate.addn_zero_filter 0.76% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.02% : 0.000006s : 27: predicate.arithmetic_simplify 1.11% : 0.000003s : 17: predicate.cast_eliminate 0.52% : 0.000002s : 10: predicate.check_bprop_eliminate 0.65% : 0.000002s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.72% : 0.000002s : 10: predicate.depend_value_elim 0.92% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.09% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.88% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.32% : 0.000004s : 10: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 5: predicate.elim_not_effective 0.38% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000004s : 22: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_depend_swap 1.69% : 0.000005s : 32: predicate.environ_get_eliminate 1.05% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.23% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.12% : 0.000006s : 25: predicate.float_depend_g_call 0.58% : 0.000002s : 10: predicate.float_environ_get_switch 0.76% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.63% : 0.000002s : 10: predicate.get_grad_eliminate 0.23% : 0.000001s : 5: predicate.graph_param_transform 0.53% : 0.000002s : 10: predicate.incorporate_call 0.45% : 0.000001s : 10: predicate.incorporate_call_switch 6.35% : 0.000018s : 72: predicate.inline 0.86% : 0.000002s : 10: predicate.inline_without_move 0.28% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.94% : 0.000003s : 10: predicate.less_batch_normalization 1.97% : 0.000006s : 31: predicate.list_to_tuple_eliminator_ 2.45% : 0.000007s : 48: predicate.load_eliminater 1.44% : 0.000004s : 5: predicate.loop_unroll_after_grad 1.83% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.66% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.57% : 0.000002s : 10: predicate.merge_addn 0.57% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.63% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 17: predicate.minmaximum_grad 1.71% : 0.000005s : 5: predicate.mutable_eliminate 0.62% : 0.000002s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.55% : 0.000004s : 25: predicate.partial_defer_inline 1.53% : 0.000004s : 26: predicate.partial_eliminate 0.84% : 0.000002s : 17: predicate.print_const_string_wrapper 0.57% : 0.000002s : 10: predicate.reduce_all_const_elim 1.13% : 0.000003s : 17: predicate.reduce_eliminate 2.45% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.53% : 0.000002s : 10: predicate.remove_not_recompute_node 1.25% : 0.000004s : 31: predicate.replace_applicator 0.45% : 0.000001s : 10: predicate.replace_old_param 0.39% : 0.000001s : 5: predicate.reset_defer_inline 0.97% : 0.000003s : 17: predicate.reshape_eliminate 0.62% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 5: predicate.row_tensor_eliminate 0.82% : 0.000002s : 10: predicate.same_eliminate 0.50% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 10: predicate.shard_identity_eliminate 0.82% : 0.000002s : 10: predicate.special_op_eliminate 0.72% : 0.000002s : 10: predicate.specialize_transform 1.04% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.48% : 0.000004s : 25: predicate.switch_defer_inline 1.88% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.34% : 0.000013s : 76: predicate.switch_simplify 0.92% : 0.000003s : 17: predicate.tile_eliminate 1.04% : 0.000003s : 17: predicate.transpose_eliminate 1.62% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.83% : 0.000005s : 27: predicate.tuple_list_get_item_depend_reorder 3.02% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.56% : 0.000005s : 27: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.76% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.26% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 2.86% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.76% : 0.000002s : 5: predicate.value_based_eliminate 0.70% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 10: predicate.virtual_output_eliminate 0.31% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000674 11 47.34% : 0.000319s : 5: func_graph_cloner_run.FuncGraphClonerGraph 52.66% : 0.000355s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.046801 192 0.01% : 0.000006s : 1: ForceFp32Comm 7.94% : 0.003715s : 1: add_attr 7.89% : 0.003695s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.17% : 0.000078s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.17% : 0.000082s : 1: auto_monad 0.08% : 0.000039s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000009s : 1: bias_add_comm_swap 1.09% : 0.000509s : 1: bootstrap 0.10% : 0.000046s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.05% : 0.000025s : 1: control_data_broadcast_order 0.03% : 0.000016s : 1: convert_after_rewriter 0.09% : 0.000043s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000034s : 1: detach_backward 0.03% : 0.000015s : 1: environ_conv 0.07% : 0.000033s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000009s : 1: get_jit_bprop_graph 0.03% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.02% : 0.000009s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000014s : 1: label_micro_interleaved_index 1.79% : 0.000836s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 1.87% : 0.000874s : 1: mutable_eliminate 0.02% : 0.000011s : 1: offloading_packed_experts 0.06% : 0.000030s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000029s : 1: opt.transform.mutable_eliminate 3.36% : 0.001572s : 78: opt.transform.opt_a 0.09% : 0.000041s : 1: opt.transform.opt_after_cconv 0.09% : 0.000043s : 1: opt.transform.opt_after_jit_grad 0.35% : 0.000165s : 28: opt.transform.opt_b 0.15% : 0.000069s : 2: opt.transform.opt_trans_graph 0.12% : 0.000054s : 4: opt.transform.symbol_engine_opt 9.11% : 0.004264s : 1: opt_a 0.39% : 0.000181s : 1: opt_after_cconv 1.88% : 0.000880s : 1: opt_after_jit_grad 0.84% : 0.000395s : 1: opt_b 17.33% : 0.008108s : 1: optimize 0.07% : 0.000034s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000014s : 1: order_py_execute_after_rewriter 0.08% : 0.000036s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000011s : 1: overlap_grad_ring_attention 0.02% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.11% : 0.000050s : 1: pre_auto_parallel 0.09% : 0.000041s : 1: py_interpret_to_execute 0.06% : 0.000027s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000008s : 1: remove_cast_before_assign_add 0.06% : 0.000027s : 1: remove_dup_value 1.54% : 0.000721s : 1: renormalize.infer 1.00% : 0.000469s : 1: renormalize.specialize 0.02% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000015s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000065s : 1: rewriter_after_opt_a 0.23% : 0.000106s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.03% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.28% : 0.000133s : 1: symbol_engine_optimizer 0.25% : 0.000118s : 1: tuple_transform 40.03% : 0.018735s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:10.394.734 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0333063, [21] [bootstrap]: 0.00050132 [type_inference]: 0.0222204 [event_method]: 2.273e-05 [auto_monad]: 7.263e-05 [graph_reusing]: 6.09999e-06 [inline]: 3.34001e-06 [add_attr]: 0.00367343, [1] [add_attr_with_inline]: 0.0036612, [1] [Cycle 1]: 7.277e-05, [2] [tag_attr]: 2.459e-05 [meta_addattr_fg_expand]: 6.31998e-06 [parallel-infer-symbol]: 4.05e-06 [pre_auto_parallel]: 3.972e-05 [insert-virtual-dataset]: 2.49999e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.07001e-06 [pipeline_split]: 1.89e-06 [optimize]: 0.00598302, [53] [py_interpret_to_execute]: 3.024e-05 [rewriter_before_opt_a]: 9.444e-05 [opt_a]: 0.00336643, [2] [Cycle 1]: 0.00253158, [45] [expand_dump_flag]: 3.35e-06 [switch_simplify]: 4.654e-05 [loop_unroll]: 3.13e-05 [a_1]: 0.00077261 [with_stream_mark]: 1.925e-05 [recompute_prepare]: 1.188e-05 [updatestate_depend_eliminate]: 5.03002e-06 [updatestate_assign_eliminate]: 3.95e-06 [updatestate_loads_eliminate]: 3.60998e-06 [parameter_eliminate]: 1.82001e-06 [a_2]: 0.00010325 [accelerated_algorithm]: 8.57998e-06 [shard]: 1.89999e-06 [meta_shard_fg_expand]: 2.34999e-06 [shard_inline]: 8.74e-06 [merge_send_recv]: 1.008e-05 [auto_parallel]: 7.91001e-06 [parallel]: 2.037e-05 [flash_sp]: 9.92001e-06 [merge_comm]: 5.09e-06 [allreduce_fusion]: 4.3e-06 [matmul_add_comm_reduction]: 1.196e-05 [allreduce_slice_to_reducescatter]: 1.12e-06 [virtual_shard_identity]: 9.76e-06 [virtual_dataset]: 8.36002e-06 [get_grad_eliminate_]: 7.66001e-06 [virtual_output]: 7.97e-06 [merge_forward]: 4.48999e-06 [cell_reuse_recompute_pass]: 1.15001e-06 [offload_activation]: 1.2e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.744e-05 [merge_recompute_call_nodes]: 1.72001e-06 [before_grad]: 1.374e-05 [set_forward_comm_id_for_comm_node_pass]: 4.50999e-06 [meta_fg_expand]: 3.36001e-06 [flash_sp_send_recv_attached]: 3.19001e-06 [receive_attached]: 2.10002e-06 [after_resolve]: 1.281e-05 [a_after_grad]: 1.27e-05 [renormalize]: 0.00091901 [add_forward_monad_depend]: 7e-06 [auto_monad_grad]: 2.99001e-06 [auto_monad_eliminator]: 2.045e-05 [cse]: 3.778e-05 [a_3]: 6.401e-05 [Cycle 2]: 0.00082285, [45] [expand_dump_flag]: 1.66e-06 [switch_simplify]: 1.019e-05 [loop_unroll]: 7.61999e-06 [a_1]: 0.00020126 [with_stream_mark]: 1.774e-05 [recompute_prepare]: 8.43001e-06 [updatestate_depend_eliminate]: 3.66999e-06 [updatestate_assign_eliminate]: 3.21999e-06 [updatestate_loads_eliminate]: 3.10002e-06 [parameter_eliminate]: 2.09e-06 [a_2]: 9.16e-05 [accelerated_algorithm]: 8.32e-06 [shard]: 1.69e-06 [meta_shard_fg_expand]: 2.18002e-06 [shard_inline]: 1.023e-05 [merge_send_recv]: 7.45e-06 [auto_parallel]: 7.21999e-06 [parallel]: 7.05e-06 [flash_sp]: 4.23999e-06 [merge_comm]: 4.11001e-06 [allreduce_fusion]: 4.08999e-06 [matmul_add_comm_reduction]: 8.57e-06 [allreduce_slice_to_reducescatter]: 3.89991e-07 [virtual_shard_identity]: 8.59002e-06 [virtual_dataset]: 7.14001e-06 [get_grad_eliminate_]: 6.91999e-06 [virtual_output]: 7.21001e-06 [merge_forward]: 4.23001e-06 [cell_reuse_recompute_pass]: 2.24001e-06 [offload_activation]: 1.04e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.599e-05 [merge_recompute_call_nodes]: 1.72999e-06 [before_grad]: 1.223e-05 [set_forward_comm_id_for_comm_node_pass]: 4.83001e-06 [meta_fg_expand]: 2.73998e-06 [flash_sp_send_recv_attached]: 1.07e-06 [receive_attached]: 2.07001e-06 [after_resolve]: 1.21e-05 [a_after_grad]: 1.138e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.27999e-06 [auto_monad_grad]: 1.67001e-06 [auto_monad_eliminator]: 1.022e-05 [cse]: 2.223e-05 [a_3]: 4.503e-05 [py_interpret_to_execute_after_opt_a]: 1.616e-05 [slice_cell_reuse_recomputed_activation]: 2.21e-06 [rewriter_after_opt_a]: 4.667e-05 [convert_after_rewriter]: 7.85e-06 [order_py_execute_after_rewriter]: 6.00002e-06 [mutable_eliminate]: 0.00079687 [opt_b]: 0.00028021, [1] [Cycle 1]: 0.00027304, [7] [b_1]: 0.00017467 [b_2]: 1.068e-05 [updatestate_depend_eliminate]: 8.69e-06 [updatestate_assign_eliminate]: 3.45998e-06 [updatestate_loads_eliminate]: 3.33e-06 [renormalize]: 8.50006e-07 [cse]: 3.146e-05 [optimize_parallel_all_gather_comm]: 1.969e-05 [overlap_param_gather]: 2.16e-06 [cconv]: 3.336e-05 [loop_unroll]: 0.00050742 [opt_after_cconv]: 0.00012796, [1] [Cycle 1]: 0.0001212, [7] [c_1]: 3.858e-05 [parameter_eliminate]: 5.29998e-06 [updatestate_depend_eliminate]: 7.68001e-06 [updatestate_assign_eliminate]: 3.26999e-06 [updatestate_loads_eliminate]: 2.91999e-06 [cse]: 2.609e-05 [renormalize]: 8.99978e-07 [remove_dup_value]: 1.862e-05 [tuple_transform]: 9.174e-05, [1] [Cycle 1]: 8.636e-05, [4] [d_1]: 5.516e-05 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 9.10999e-06 [partial_unused_args_eliminate]: 2.26998e-06 [add_recomputation]: 6.429e-05 [cse_after_recomputation]: 2.789e-05, [1] [Cycle 1]: 2.204e-05, [1] [cse]: 1.585e-05 [environ_conv]: 7.92e-06 [swap_dp_allreduce_reducescatter]: 6.34999e-06 [bias_add_comm_swap]: 3.06001e-06 [label_micro_interleaved_index]: 4.82e-06 [label_fine_grained_interleaved_index]: 3.03e-06 [merge_cast_opt]: 1.25999e-06 [slice_recompute_activation]: 2.03997e-06 [micro_interleaved_order_control]: 2.68e-06 [assign_add_opt]: 1.32999e-06 [ForceFp32Comm]: 7.80012e-07 [remove_cast_before_assign_add]: 1.09003e-06 [full_micro_interleaved_order_control]: 2.11998e-06 [reorder_send_recv_between_fp_bp]: 3.01001e-06 [comm_op_add_attrs]: 1.10001e-06 [add_comm_op_reuse_tag]: 1.30999e-06 [interleave_split_concat_branches]: 1.22e-06 [interleave_parallel_branches]: 1.09003e-06 [overlap_opt_shard_in_pipeline]: 1.43002e-06 [overlap_opt_shard_grad_in_pipeline]: 1.76e-06 [control_data_broadcast_order]: 1.588e-05 [grouped_pairwise_exchange_alltoall]: 2.11e-06 [offloading_packed_experts]: 4.38001e-06 [overlap_recompute_and_grad_model_parallel]: 5.13002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.18001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.44998e-06 [overlap_recompute_comm]: 2.49001e-06 [overlap_grad_ring_attention]: 4.82998e-06 [overlap_grad_flash_sp]: 2.523e-05 [begin_end_overlap_inline]: 6.10016e-07 [split_matmul_comm_elemetwise]: 2.49001e-06 [split_layernorm_comm]: 1.72001e-06 [handle_group_info]: 1.62001e-06 [symbol_engine_optimizer]: 8.932e-05, [1] [Cycle 1]: 8.498e-05, [6] [build]: 4.12e-06 [elim_shapecalc]: 1.357e-05 [elim_not_effective]: 1.568e-05 [opt_reshape]: 8.38999e-06 [fold_const_symbol]: 1.229e-05 [renormalize]: 2.30008e-07 [detach_backward]: 2.26e-06 [pipeline_parallel_scheduler]: 1.55999e-06 [auto_monad_reorder]: 2.042e-05 [get_jit_bprop_graph]: 2.19001e-06 [rewriter_after_jit_bprop_graph]: 5.37001e-06 [opt_after_jit_grad]: 0.00051576 [validate]: 5.146e-05 Sums bootstrap : 0.000501s : 1.75% type_inference : 0.022220s : 77.68% event_method : 0.000023s : 0.08% auto_monad : 0.000073s : 0.25% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.09% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000040s : 0.14% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.11% optimize.rewriter_before_opt_a : 0.000094s : 0.33% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000057s : 0.20% optimize.opt_a.loop_unroll : 0.000039s : 0.14% optimize.opt_a.a_1 : 0.000974s : 3.40% optimize.opt_a.with_stream_mark : 0.000037s : 0.13% optimize.opt_a.recompute_prepare : 0.000020s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000195s : 0.68% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.06% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000019s : 0.07% optimize.opt_a.merge_send_recv : 0.000018s : 0.06% optimize.opt_a.auto_parallel : 0.000015s : 0.05% optimize.opt_a.parallel : 0.000027s : 0.10% optimize.opt_a.flash_sp : 0.000014s : 0.05% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.06% optimize.opt_a.virtual_dataset : 0.000016s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.05% optimize.opt_a.virtual_output : 0.000015s : 0.05% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000022s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.09% optimize.opt_a.a_after_grad : 0.000024s : 0.08% optimize.opt_a.renormalize : 0.000919s : 3.21% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.03% optimize.opt_a.auto_monad_grad : 0.000005s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.11% optimize.opt_a.cse : 0.000060s : 0.21% optimize.opt_a.a_3 : 0.000109s : 0.38% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000047s : 0.16% optimize.convert_after_rewriter : 0.000008s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000797s : 2.79% optimize.opt_b.b_1 : 0.000175s : 0.61% optimize.opt_b.b_2 : 0.000011s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000031s : 0.11% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000033s : 0.12% optimize.loop_unroll : 0.000507s : 1.77% optimize.opt_after_cconv.c_1 : 0.000039s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000026s : 0.09% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.07% optimize.tuple_transform.d_1 : 0.000055s : 0.19% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000064s : 0.22% optimize.cse_after_recomputation.cse : 0.000016s : 0.06% optimize.environ_conv : 0.000008s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000025s : 0.09% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000002s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000516s : 1.80% validate : 0.000051s : 0.18% Time group info: ------[substitution.] 0.000261 39 16.59% : 0.000043s : 3: substitution.cast_eliminate 0.86% : 0.000002s : 3: substitution.elim_not_effective 0.69% : 0.000002s : 3: substitution.fold_const_symbol 2.78% : 0.000007s : 5: substitution.graph_param_transform 63.93% : 0.000167s : 4: substitution.inline 2.03% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.47% : 0.000006s : 6: substitution.remove_not_recompute_node 1.75% : 0.000005s : 4: substitution.replace_old_param 5.77% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator 3.12% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.022136 2 95.63% : 0.021169s : 1: type_inference.infer 4.37% : 0.000967s : 1: type_inference.specialize ------[replace.] 0.000066 8 61.61% : 0.000041s : 4: replace.inline 38.39% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000177 8 92.60% : 0.000164s : 4: match.inline 7.40% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000265 1596 0.90% : 0.000002s : 17: predicate.accumulaten_eliminater 0.76% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 10: predicate.addn_check_dump 0.90% : 0.000002s : 17: predicate.addn_zero_filter 0.83% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.07% : 0.000005s : 27: predicate.arithmetic_simplify 1.01% : 0.000003s : 17: predicate.cast_eliminate 0.71% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000002s : 10: predicate.compare_switch_simplify 0.27% : 0.000001s : 5: predicate.const_output_eliminate 0.64% : 0.000002s : 10: predicate.depend_value_elim 0.99% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.01% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.04% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 5: predicate.elim_not_effective 0.37% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.09% : 0.000003s : 22: predicate.environ_get_depend_swap 1.67% : 0.000004s : 32: predicate.environ_get_eliminate 1.09% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.33% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.20% : 0.000006s : 25: predicate.float_depend_g_call 0.51% : 0.000001s : 10: predicate.float_environ_get_switch 0.86% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.69% : 0.000002s : 10: predicate.get_grad_eliminate 0.28% : 0.000001s : 5: predicate.graph_param_transform 0.57% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.08% : 0.000016s : 72: predicate.inline 0.91% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.90% : 0.000002s : 10: predicate.less_batch_normalization 1.90% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.54% : 0.000007s : 48: predicate.load_eliminater 0.90% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.92% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.67% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.64% : 0.000002s : 10: predicate.merge_addn 0.56% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.90% : 0.000002s : 17: predicate.minmaximum_grad 1.52% : 0.000004s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.78% : 0.000005s : 25: predicate.partial_defer_inline 1.64% : 0.000004s : 26: predicate.partial_eliminate 0.98% : 0.000003s : 17: predicate.print_const_string_wrapper 0.59% : 0.000002s : 10: predicate.reduce_all_const_elim 1.17% : 0.000003s : 17: predicate.reduce_eliminate 2.58% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 10: predicate.remove_not_recompute_node 1.38% : 0.000004s : 31: predicate.replace_applicator 0.53% : 0.000001s : 10: predicate.replace_old_param 0.49% : 0.000001s : 5: predicate.reset_defer_inline 0.98% : 0.000003s : 17: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 5: predicate.row_tensor_eliminate 0.82% : 0.000002s : 10: predicate.same_eliminate 0.36% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.79% : 0.000002s : 10: predicate.shard_identity_eliminate 0.77% : 0.000002s : 10: predicate.special_op_eliminate 0.65% : 0.000002s : 10: predicate.specialize_transform 0.94% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.46% : 0.000004s : 25: predicate.switch_defer_inline 1.98% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.70% : 0.000012s : 76: predicate.switch_simplify 0.95% : 0.000003s : 17: predicate.tile_eliminate 1.02% : 0.000003s : 17: predicate.transpose_eliminate 1.67% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.74% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.20% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.64% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.43% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.69% : 0.000004s : 31: predicate.tuple_to_list_eliminator_ 2.41% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.29% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.53% : 0.000001s : 5: predicate.value_based_eliminate 0.57% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.74% : 0.000002s : 10: predicate.virtual_output_eliminate 0.31% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000752 11 51.97% : 0.000391s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.03% : 0.000361s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.045601 192 0.01% : 0.000004s : 1: ForceFp32Comm 8.07% : 0.003679s : 1: add_attr 8.04% : 0.003665s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.15% : 0.000069s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.17% : 0.000080s : 1: auto_monad 0.05% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.17% : 0.000536s : 1: bootstrap 0.08% : 0.000037s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000019s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.07% : 0.000031s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.07% : 0.000030s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 1.13% : 0.000517s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.78% : 0.000811s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.04% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000024s : 1: opt.transform.mutable_eliminate 3.30% : 0.001504s : 78: opt.transform.opt_a 0.08% : 0.000037s : 1: opt.transform.opt_after_cconv 0.07% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.33% : 0.000151s : 28: opt.transform.opt_b 0.14% : 0.000062s : 2: opt.transform.opt_trans_graph 0.10% : 0.000045s : 4: opt.transform.symbol_engine_opt 7.39% : 0.003370s : 1: opt_a 0.29% : 0.000132s : 1: opt_after_cconv 1.15% : 0.000525s : 1: opt_after_jit_grad 0.62% : 0.000284s : 1: opt_b 13.13% : 0.005989s : 1: optimize 0.05% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.06% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000006s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.10% : 0.000044s : 1: pre_auto_parallel 0.08% : 0.000034s : 1: py_interpret_to_execute 0.04% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000022s : 1: remove_dup_value 1.16% : 0.000531s : 1: renormalize.infer 0.83% : 0.000378s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000052s : 1: rewriter_after_opt_a 0.22% : 0.000099s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.20% : 0.000092s : 1: symbol_engine_optimizer 0.21% : 0.000095s : 1: tuple_transform 48.79% : 0.022247s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:11.167.497 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:11.167.797 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.034872, [21] [bootstrap]: 0.00053476 [type_inference]: 0.00779504 [event_method]: 2.313e-05 [auto_monad]: 7.639e-05 [graph_reusing]: 6.49001e-06 [inline]: 3.03e-06 [add_attr]: 0.00418002, [1] [add_attr_with_inline]: 0.00416625, [1] [Cycle 1]: 0.00010467, [2] [tag_attr]: 2.688e-05 [meta_addattr_fg_expand]: 6.75002e-06 [parallel-infer-symbol]: 3.51001e-06 [pre_auto_parallel]: 4.62e-05 [insert-virtual-dataset]: 2.30002e-06 [parallel-infer-symbol-second]: 8.99978e-07 [dataset_repeat_opt]: 1.99999e-06 [pipeline_split]: 1.82001e-06 [optimize]: 0.0197436, [53] [py_interpret_to_execute]: 3.959e-05 [rewriter_before_opt_a]: 0.00011033 [opt_a]: 0.0165292, [2] [Cycle 1]: 0.0154156, [45] [expand_dump_flag]: 3.85e-06 [switch_simplify]: 4.521e-05 [loop_unroll]: 3.215e-05 [a_1]: 0.00083765 [with_stream_mark]: 2.43e-05 [recompute_prepare]: 1.48e-05 [updatestate_depend_eliminate]: 5.59e-06 [updatestate_assign_eliminate]: 4.12e-06 [updatestate_loads_eliminate]: 3.59002e-06 [parameter_eliminate]: 2.53e-06 [a_2]: 0.0001354 [accelerated_algorithm]: 9.86998e-06 [shard]: 2.86999e-06 [meta_shard_fg_expand]: 2.99999e-06 [shard_inline]: 8.57e-06 [merge_send_recv]: 1.092e-05 [auto_parallel]: 1.182e-05 [parallel]: 2.181e-05 [flash_sp]: 1.193e-05 [merge_comm]: 5.44e-06 [allreduce_fusion]: 4.43999e-06 [matmul_add_comm_reduction]: 1.304e-05 [allreduce_slice_to_reducescatter]: 1.05999e-06 [virtual_shard_identity]: 1.125e-05 [virtual_dataset]: 8.13999e-06 [get_grad_eliminate_]: 7.46999e-06 [virtual_output]: 7.96001e-06 [merge_forward]: 5.18002e-06 [cell_reuse_recompute_pass]: 1.73002e-06 [offload_activation]: 1.179e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.319e-05 [merge_recompute_call_nodes]: 1.82999e-06 [before_grad]: 2.057e-05 [set_forward_comm_id_for_comm_node_pass]: 5.24e-06 [meta_fg_expand]: 3.71999e-06 [flash_sp_send_recv_attached]: 3.28e-06 [receive_attached]: 2.25002e-06 [after_resolve]: 1.658e-05 [a_after_grad]: 1.311e-05 [renormalize]: 0.0134019 [add_forward_monad_depend]: 1.33e-05 [auto_monad_grad]: 2.90002e-06 [auto_monad_eliminator]: 2.751e-05 [cse]: 4.399e-05 [a_3]: 9.249e-05 [Cycle 2]: 0.00109291, [45] [expand_dump_flag]: 2.76e-06 [switch_simplify]: 1.207e-05 [loop_unroll]: 7.8e-06 [a_1]: 0.00020634 [with_stream_mark]: 2.366e-05 [recompute_prepare]: 9.64e-06 [updatestate_depend_eliminate]: 5.47001e-06 [updatestate_assign_eliminate]: 3.80998e-06 [updatestate_loads_eliminate]: 3.54002e-06 [parameter_eliminate]: 2.50002e-06 [a_2]: 0.00012472 [accelerated_algorithm]: 9.43002e-06 [shard]: 2.62001e-06 [meta_shard_fg_expand]: 2.45002e-06 [shard_inline]: 8.28999e-06 [merge_send_recv]: 1.466e-05 [auto_parallel]: 1.122e-05 [parallel]: 1.125e-05 [flash_sp]: 4.97e-06 [merge_comm]: 5.89e-06 [allreduce_fusion]: 4.33001e-06 [matmul_add_comm_reduction]: 1.122e-05 [allreduce_slice_to_reducescatter]: 1.23002e-06 [virtual_shard_identity]: 1.072e-05 [virtual_dataset]: 7.90998e-06 [get_grad_eliminate_]: 7.23e-06 [virtual_output]: 7.21001e-06 [merge_forward]: 5.59998e-06 [cell_reuse_recompute_pass]: 3.08e-06 [offload_activation]: 1.282e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.03e-05 [merge_recompute_call_nodes]: 1.67001e-06 [before_grad]: 1.377e-05 [set_forward_comm_id_for_comm_node_pass]: 5.52001e-06 [meta_fg_expand]: 3.11001e-06 [flash_sp_send_recv_attached]: 1.72001e-06 [receive_attached]: 2.41998e-06 [after_resolve]: 1.554e-05 [a_after_grad]: 1.221e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 3.33998e-06 [auto_monad_grad]: 1.88997e-06 [auto_monad_eliminator]: 1.313e-05 [cse]: 2.404e-05 [a_3]: 6.332e-05 [py_interpret_to_execute_after_opt_a]: 2.681e-05 [slice_cell_reuse_recomputed_activation]: 5.66e-06 [rewriter_after_opt_a]: 5.783e-05 [convert_after_rewriter]: 1.322e-05 [order_py_execute_after_rewriter]: 9.64e-06 [mutable_eliminate]: 0.00084507 [opt_b]: 0.0003827, [1] [Cycle 1]: 0.00036939, [7] [b_1]: 0.00022778 [b_2]: 1.213e-05 [updatestate_depend_eliminate]: 1.142e-05 [updatestate_assign_eliminate]: 4.72998e-06 [updatestate_loads_eliminate]: 3.94002e-06 [renormalize]: 9.00007e-07 [cse]: 4.055e-05 [optimize_parallel_all_gather_comm]: 2.72e-05 [overlap_param_gather]: 4.67e-06 [cconv]: 4.239e-05 [loop_unroll]: 0.00063838 [opt_after_cconv]: 0.00017128, [1] [Cycle 1]: 0.00016003, [7] [c_1]: 4.2e-05 [parameter_eliminate]: 6.66999e-06 [updatestate_depend_eliminate]: 9.31e-06 [updatestate_assign_eliminate]: 4.33999e-06 [updatestate_loads_eliminate]: 3.14001e-06 [cse]: 3.556e-05 [renormalize]: 8.09989e-07 [remove_dup_value]: 2.005e-05 [tuple_transform]: 0.00011423, [1] [Cycle 1]: 0.00010584, [4] [d_1]: 6.11e-05 [none_parameter_eliminate]: 1.54e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 1.011e-05 [partial_unused_args_eliminate]: 4.60001e-06 [add_recomputation]: 7.468e-05 [cse_after_recomputation]: 3.707e-05, [1] [Cycle 1]: 2.923e-05, [1] [cse]: 1.744e-05 [environ_conv]: 1.281e-05 [swap_dp_allreduce_reducescatter]: 9.59999e-06 [bias_add_comm_swap]: 6.64001e-06 [label_micro_interleaved_index]: 1.084e-05 [label_fine_grained_interleaved_index]: 5.77999e-06 [merge_cast_opt]: 4.22e-06 [slice_recompute_activation]: 4.60001e-06 [micro_interleaved_order_control]: 5.25001e-06 [assign_add_opt]: 3.88999e-06 [ForceFp32Comm]: 3.65e-06 [remove_cast_before_assign_add]: 3.51001e-06 [full_micro_interleaved_order_control]: 4.97999e-06 [reorder_send_recv_between_fp_bp]: 5.77999e-06 [comm_op_add_attrs]: 3.84997e-06 [add_comm_op_reuse_tag]: 3.40003e-06 [interleave_split_concat_branches]: 3.68e-06 [interleave_parallel_branches]: 3.73001e-06 [overlap_opt_shard_in_pipeline]: 3.93001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.62e-06 [control_data_broadcast_order]: 2.39e-05 [grouped_pairwise_exchange_alltoall]: 4.56002e-06 [offloading_packed_experts]: 7.95998e-06 [overlap_recompute_and_grad_model_parallel]: 9.00999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.53999e-06 [overlap_recompute_allgather_and_fa_grad]: 4.38999e-06 [overlap_recompute_comm]: 4.99e-06 [overlap_grad_ring_attention]: 7.61001e-06 [overlap_grad_flash_sp]: 3.059e-05 [begin_end_overlap_inline]: 3.26999e-06 [split_matmul_comm_elemetwise]: 4.90001e-06 [split_layernorm_comm]: 4.08001e-06 [handle_group_info]: 3.98001e-06 [symbol_engine_optimizer]: 0.00012877, [1] [Cycle 1]: 0.00011982, [6] [build]: 5.56998e-06 [elim_shapecalc]: 1.611e-05 [elim_not_effective]: 2.045e-05 [opt_reshape]: 1.004e-05 [fold_const_symbol]: 1.323e-05 [renormalize]: 1.09e-06 [detach_backward]: 9.90002e-06 [pipeline_parallel_scheduler]: 2.51e-06 [auto_monad_reorder]: 3.976e-05 [get_jit_bprop_graph]: 2.76e-06 [rewriter_after_jit_bprop_graph]: 1.093e-05 [opt_after_jit_grad]: 0.00090774 [validate]: 6.564e-05 Sums bootstrap : 0.000535s : 1.91% type_inference : 0.007795s : 27.91% event_method : 0.000023s : 0.08% auto_monad : 0.000076s : 0.27% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.10% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000046s : 0.17% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000040s : 0.14% optimize.rewriter_before_opt_a : 0.000110s : 0.39% optimize.opt_a.expand_dump_flag : 0.000007s : 0.02% optimize.opt_a.switch_simplify : 0.000057s : 0.21% optimize.opt_a.loop_unroll : 0.000040s : 0.14% optimize.opt_a.a_1 : 0.001044s : 3.74% optimize.opt_a.with_stream_mark : 0.000048s : 0.17% optimize.opt_a.recompute_prepare : 0.000024s : 0.09% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000005s : 0.02% optimize.opt_a.a_2 : 0.000260s : 0.93% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.07% optimize.opt_a.shard : 0.000005s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000017s : 0.06% optimize.opt_a.merge_send_recv : 0.000026s : 0.09% optimize.opt_a.auto_parallel : 0.000023s : 0.08% optimize.opt_a.parallel : 0.000033s : 0.12% optimize.opt_a.flash_sp : 0.000017s : 0.06% optimize.opt_a.merge_comm : 0.000011s : 0.04% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.08% optimize.opt_a.virtual_dataset : 0.000016s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.05% optimize.opt_a.virtual_output : 0.000015s : 0.05% optimize.opt_a.merge_forward : 0.000011s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000025s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000043s : 0.16% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000034s : 0.12% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.04% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000032s : 0.11% optimize.opt_a.a_after_grad : 0.000025s : 0.09% optimize.opt_a.renormalize : 0.013402s : 47.98% optimize.opt_a.add_forward_monad_depend : 0.000017s : 0.06% optimize.opt_a.auto_monad_grad : 0.000005s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000041s : 0.15% optimize.opt_a.cse : 0.000068s : 0.24% optimize.opt_a.a_3 : 0.000156s : 0.56% optimize.py_interpret_to_execute_after_opt_a : 0.000027s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.02% optimize.rewriter_after_opt_a : 0.000058s : 0.21% optimize.convert_after_rewriter : 0.000013s : 0.05% optimize.order_py_execute_after_rewriter : 0.000010s : 0.03% optimize.mutable_eliminate : 0.000845s : 3.03% optimize.opt_b.b_1 : 0.000228s : 0.82% optimize.opt_b.b_2 : 0.000012s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000041s : 0.15% optimize.optimize_parallel_all_gather_comm : 0.000027s : 0.10% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000042s : 0.15% optimize.loop_unroll : 0.000638s : 2.29% optimize.opt_after_cconv.c_1 : 0.000042s : 0.15% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000036s : 0.13% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.07% optimize.tuple_transform.d_1 : 0.000061s : 0.22% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.04% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000075s : 0.27% optimize.cse_after_recomputation.cse : 0.000017s : 0.06% optimize.environ_conv : 0.000013s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.03% optimize.bias_add_comm_swap : 0.000007s : 0.02% optimize.label_micro_interleaved_index : 0.000011s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.02% optimize.control_data_broadcast_order : 0.000024s : 0.09% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.02% optimize.offloading_packed_experts : 0.000008s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000008s : 0.03% optimize.overlap_grad_flash_sp : 0.000031s : 0.11% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000006s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000010s : 0.04% pipeline_parallel_scheduler : 0.000003s : 0.01% auto_monad_reorder : 0.000040s : 0.14% get_jit_bprop_graph : 0.000003s : 0.01% rewriter_after_jit_bprop_graph : 0.000011s : 0.04% opt_after_jit_grad : 0.000908s : 3.25% validate : 0.000066s : 0.23% Time group info: ------[substitution.] 0.000301 39 11.27% : 0.000034s : 3: substitution.cast_eliminate 0.84% : 0.000003s : 3: substitution.elim_not_effective 0.64% : 0.000002s : 3: substitution.fold_const_symbol 2.57% : 0.000008s : 5: substitution.graph_param_transform 66.65% : 0.000201s : 4: substitution.inline 3.97% : 0.000012s : 6: substitution.j_node_and_user_rematch 2.63% : 0.000008s : 6: substitution.remove_not_recompute_node 2.71% : 0.000008s : 4: substitution.replace_old_param 5.98% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator 2.72% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.007725 2 88.95% : 0.006872s : 1: type_inference.infer 11.05% : 0.000853s : 1: type_inference.specialize ------[replace.] 0.000071 8 60.95% : 0.000043s : 4: replace.inline 39.05% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000213 8 92.58% : 0.000197s : 4: match.inline 7.42% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000295 1596 0.92% : 0.000003s : 17: predicate.accumulaten_eliminater 1.01% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.65% : 0.000002s : 10: predicate.addn_check_dump 0.97% : 0.000003s : 17: predicate.addn_zero_filter 0.80% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.12% : 0.000006s : 27: predicate.arithmetic_simplify 1.17% : 0.000003s : 17: predicate.cast_eliminate 0.60% : 0.000002s : 10: predicate.check_bprop_eliminate 0.67% : 0.000002s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.71% : 0.000002s : 10: predicate.depend_value_elim 0.91% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.37% : 0.000004s : 17: predicate.dict_get_item_eliminator 0.86% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.10% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.31% : 0.000001s : 5: predicate.elim_not_effective 0.37% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000004s : 22: predicate.environ_add_const_eliminate 1.06% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.07% : 0.000003s : 22: predicate.environ_get_depend_swap 1.74% : 0.000005s : 32: predicate.environ_get_eliminate 1.08% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.24% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.27% : 0.000007s : 25: predicate.float_depend_g_call 0.49% : 0.000001s : 10: predicate.float_environ_get_switch 0.73% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.59% : 0.000002s : 10: predicate.get_grad_eliminate 0.25% : 0.000001s : 5: predicate.graph_param_transform 0.59% : 0.000002s : 10: predicate.incorporate_call 0.45% : 0.000001s : 10: predicate.incorporate_call_switch 6.43% : 0.000019s : 72: predicate.inline 0.84% : 0.000002s : 10: predicate.inline_without_move 0.27% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.98% : 0.000003s : 10: predicate.less_batch_normalization 1.69% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.27% : 0.000007s : 48: predicate.load_eliminater 1.46% : 0.000004s : 5: predicate.loop_unroll_after_grad 1.73% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.86% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.58% : 0.000002s : 10: predicate.merge_addn 0.52% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.52% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 17: predicate.minmaximum_grad 1.56% : 0.000005s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.33% : 0.000001s : 5: predicate.parallel_virtual_node 1.60% : 0.000005s : 25: predicate.partial_defer_inline 1.48% : 0.000004s : 26: predicate.partial_eliminate 0.87% : 0.000003s : 17: predicate.print_const_string_wrapper 0.54% : 0.000002s : 10: predicate.reduce_all_const_elim 1.26% : 0.000004s : 17: predicate.reduce_eliminate 2.38% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 10: predicate.remove_not_recompute_node 1.51% : 0.000004s : 31: predicate.replace_applicator 0.72% : 0.000002s : 10: predicate.replace_old_param 0.37% : 0.000001s : 5: predicate.reset_defer_inline 1.15% : 0.000003s : 17: predicate.reshape_eliminate 0.64% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 5: predicate.row_tensor_eliminate 0.91% : 0.000003s : 10: predicate.same_eliminate 0.37% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 10: predicate.shard_identity_eliminate 0.63% : 0.000002s : 10: predicate.special_op_eliminate 0.90% : 0.000003s : 10: predicate.specialize_transform 1.16% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.43% : 0.000004s : 25: predicate.switch_defer_inline 1.92% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.18% : 0.000012s : 76: predicate.switch_simplify 0.95% : 0.000003s : 17: predicate.tile_eliminate 1.06% : 0.000003s : 17: predicate.transpose_eliminate 1.74% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.34% : 0.000010s : 41: predicate.tuple_list_get_item_eliminator 1.74% : 0.000005s : 27: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.70% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.46% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 2.82% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.51% : 0.000001s : 5: predicate.value_based_eliminate 0.62% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.60% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000772 11 49.90% : 0.000385s : 5: func_graph_cloner_run.FuncGraphClonerGraph 50.10% : 0.000387s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.074045 192 0.01% : 0.000006s : 1: ForceFp32Comm 5.66% : 0.004193s : 1: add_attr 5.63% : 0.004170s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.11% : 0.000079s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.12% : 0.000087s : 1: auto_monad 0.06% : 0.000048s : 1: auto_monad_reorder 0.01% : 0.000007s : 1: begin_end_overlap_inline 0.01% : 0.000010s : 1: bias_add_comm_swap 0.80% : 0.000592s : 1: bootstrap 0.06% : 0.000048s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.04% : 0.000027s : 1: control_data_broadcast_order 0.02% : 0.000017s : 1: convert_after_rewriter 0.05% : 0.000040s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000051s : 1: detach_backward 0.02% : 0.000016s : 1: environ_conv 0.05% : 0.000036s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.02% : 0.000014s : 1: label_micro_interleaved_index 0.87% : 0.000647s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.15% : 0.000855s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.04% : 0.000026s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000029s : 1: opt.transform.mutable_eliminate 2.19% : 0.001621s : 78: opt.transform.opt_a 0.05% : 0.000040s : 1: opt.transform.opt_after_cconv 0.06% : 0.000048s : 1: opt.transform.opt_after_jit_grad 0.22% : 0.000160s : 28: opt.transform.opt_b 0.09% : 0.000069s : 2: opt.transform.opt_trans_graph 0.07% : 0.000055s : 4: opt.transform.symbol_engine_opt 22.33% : 0.016533s : 1: opt_a 0.24% : 0.000175s : 1: opt_after_cconv 1.25% : 0.000926s : 1: opt_after_jit_grad 0.52% : 0.000387s : 1: opt_b 28.12% : 0.020824s : 1: optimize 0.04% : 0.000031s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000013s : 1: order_py_execute_after_rewriter 0.05% : 0.000034s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000011s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.07% : 0.000054s : 1: pre_auto_parallel 0.06% : 0.000043s : 1: py_interpret_to_execute 0.04% : 0.000031s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.03% : 0.000024s : 1: remove_dup_value 17.31% : 0.012817s : 1: renormalize.infer 0.76% : 0.000565s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000017s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000063s : 1: rewriter_after_opt_a 0.15% : 0.000114s : 1: rewriter_before_opt_a 0.01% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.18% : 0.000132s : 1: symbol_engine_optimizer 0.16% : 0.000117s : 1: tuple_transform 10.60% : 0.007851s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:12.941.40 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0385306, [21] [bootstrap]: 0.00048081 [type_inference]: 0.00743587 [event_method]: 2.108e-05 [auto_monad]: 6.337e-05 [graph_reusing]: 5.76e-06 [inline]: 2.77002e-06 [add_attr]: 0.00355927, [1] [add_attr_with_inline]: 0.0035485, [1] [Cycle 1]: 6.735e-05, [2] [tag_attr]: 2.211e-05 [meta_addattr_fg_expand]: 5.99999e-06 [parallel-infer-symbol]: 3.76001e-06 [pre_auto_parallel]: 3.938e-05 [insert-virtual-dataset]: 2.20002e-06 [parallel-infer-symbol-second]: 6.59988e-07 [dataset_repeat_opt]: 2.31e-06 [pipeline_split]: 2.09e-06 [optimize]: 0.026146, [53] [py_interpret_to_execute]: 2.878e-05 [rewriter_before_opt_a]: 9.138e-05 [opt_a]: 0.0235951, [2] [Cycle 1]: 0.0225594, [45] [expand_dump_flag]: 3.03e-06 [switch_simplify]: 4.385e-05 [loop_unroll]: 3.068e-05 [a_1]: 0.00075893 [with_stream_mark]: 1.847e-05 [recompute_prepare]: 1.125e-05 [updatestate_depend_eliminate]: 4.85001e-06 [updatestate_assign_eliminate]: 3.72998e-06 [updatestate_loads_eliminate]: 3.8e-06 [parameter_eliminate]: 1.79998e-06 [a_2]: 0.00010308 [accelerated_algorithm]: 9.22001e-06 [shard]: 1.94999e-06 [meta_shard_fg_expand]: 1.98997e-06 [shard_inline]: 8.03999e-06 [merge_send_recv]: 1.041e-05 [auto_parallel]: 7.48e-06 [parallel]: 1.983e-05 [flash_sp]: 8.89e-06 [merge_comm]: 5.00999e-06 [allreduce_fusion]: 4.27998e-06 [matmul_add_comm_reduction]: 1.102e-05 [allreduce_slice_to_reducescatter]: 8.30012e-07 [virtual_shard_identity]: 9.87999e-06 [virtual_dataset]: 7.7e-06 [get_grad_eliminate_]: 7.33e-06 [virtual_output]: 7.65e-06 [merge_forward]: 4.28999e-06 [cell_reuse_recompute_pass]: 1.35999e-06 [offload_activation]: 1.126e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.726e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.248e-05 [set_forward_comm_id_for_comm_node_pass]: 4.75999e-06 [meta_fg_expand]: 3.38e-06 [flash_sp_send_recv_attached]: 2.49001e-06 [receive_attached]: 2.27999e-06 [after_resolve]: 1.304e-05 [a_after_grad]: 1.209e-05 [renormalize]: 0.00086146 [add_forward_monad_depend]: 5.66e-06 [auto_monad_grad]: 2.64001e-06 [auto_monad_eliminator]: 1.783e-05 [cse]: 3.896e-05 [a_3]: 0.020148 [Cycle 2]: 0.00101859, [45] [expand_dump_flag]: 4.72e-06 [switch_simplify]: 1.598e-05 [loop_unroll]: 8.45001e-06 [a_1]: 0.0002318 [with_stream_mark]: 3.498e-05 [recompute_prepare]: 8.42e-06 [updatestate_depend_eliminate]: 5.45001e-06 [updatestate_assign_eliminate]: 3.81001e-06 [updatestate_loads_eliminate]: 3.91999e-06 [parameter_eliminate]: 2.61e-06 [a_2]: 9.734e-05 [accelerated_algorithm]: 7.96001e-06 [shard]: 2.97002e-06 [meta_shard_fg_expand]: 3.25e-06 [shard_inline]: 7.36999e-06 [merge_send_recv]: 1.138e-05 [auto_parallel]: 1.05e-05 [parallel]: 2.312e-05 [flash_sp]: 4.92999e-06 [merge_comm]: 4.94998e-06 [allreduce_fusion]: 4.07e-06 [matmul_add_comm_reduction]: 1.307e-05 [allreduce_slice_to_reducescatter]: 8.59989e-07 [virtual_shard_identity]: 9.74e-06 [virtual_dataset]: 7.43e-06 [get_grad_eliminate_]: 8.22e-06 [virtual_output]: 8.42e-06 [merge_forward]: 4.85001e-06 [cell_reuse_recompute_pass]: 3.86999e-06 [offload_activation]: 1.201e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.382e-05 [merge_recompute_call_nodes]: 1.50001e-06 [before_grad]: 1.299e-05 [set_forward_comm_id_for_comm_node_pass]: 5.17999e-06 [meta_fg_expand]: 3.66999e-06 [flash_sp_send_recv_attached]: 1.92001e-06 [receive_attached]: 2.46e-06 [after_resolve]: 1.595e-05 [a_after_grad]: 1.271e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.89999e-06 [auto_monad_grad]: 3.28e-06 [auto_monad_eliminator]: 1.955e-05 [cse]: 4.469e-05 [a_3]: 4.767e-05 [py_interpret_to_execute_after_opt_a]: 2.275e-05 [slice_cell_reuse_recomputed_activation]: 2.05002e-06 [rewriter_after_opt_a]: 5.129e-05 [convert_after_rewriter]: 7.97e-06 [order_py_execute_after_rewriter]: 5.74e-06 [mutable_eliminate]: 0.00077828 [opt_b]: 0.00027943, [1] [Cycle 1]: 0.00027182, [7] [b_1]: 0.00017369 [b_2]: 1.141e-05 [updatestate_depend_eliminate]: 9.30001e-06 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 3.35998e-06 [renormalize]: 7.80012e-07 [cse]: 3.101e-05 [optimize_parallel_all_gather_comm]: 2.076e-05 [overlap_param_gather]: 1.92999e-06 [cconv]: 3.535e-05 [loop_unroll]: 0.00047959 [opt_after_cconv]: 0.0001234, [1] [Cycle 1]: 0.00011699, [7] [c_1]: 3.894e-05 [parameter_eliminate]: 4.03001e-06 [updatestate_depend_eliminate]: 6.51e-06 [updatestate_assign_eliminate]: 3.42002e-06 [updatestate_loads_eliminate]: 2.93e-06 [cse]: 2.553e-05 [renormalize]: 3.49974e-07 [remove_dup_value]: 1.706e-05 [tuple_transform]: 8.735e-05, [1] [Cycle 1]: 8.248e-05, [4] [d_1]: 5.393e-05 [none_parameter_eliminate]: 1.82999e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 8.08999e-06 [partial_unused_args_eliminate]: 1.94e-06 [add_recomputation]: 6.354e-05 [cse_after_recomputation]: 2.651e-05, [1] [Cycle 1]: 2.158e-05, [1] [cse]: 1.605e-05 [environ_conv]: 7.04001e-06 [swap_dp_allreduce_reducescatter]: 5.70001e-06 [bias_add_comm_swap]: 3.46001e-06 [label_micro_interleaved_index]: 4.45e-06 [label_fine_grained_interleaved_index]: 2.71e-06 [merge_cast_opt]: 1.53002e-06 [slice_recompute_activation]: 2.06998e-06 [micro_interleaved_order_control]: 3.01999e-06 [assign_add_opt]: 1.14e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 1.20999e-06 [full_micro_interleaved_order_control]: 2.32999e-06 [reorder_send_recv_between_fp_bp]: 3.03e-06 [comm_op_add_attrs]: 9.80013e-07 [add_comm_op_reuse_tag]: 1.04003e-06 [interleave_split_concat_branches]: 1.44998e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.19003e-06 [overlap_opt_shard_grad_in_pipeline]: 1.72999e-06 [control_data_broadcast_order]: 1.49e-05 [grouped_pairwise_exchange_alltoall]: 1.62001e-06 [offloading_packed_experts]: 4.32e-06 [overlap_recompute_and_grad_model_parallel]: 5.17e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40001e-06 [overlap_recompute_comm]: 1.95001e-06 [overlap_grad_ring_attention]: 4.85999e-06 [overlap_grad_flash_sp]: 2.362e-05 [begin_end_overlap_inline]: 5.59987e-07 [split_matmul_comm_elemetwise]: 2.23002e-06 [split_layernorm_comm]: 2.21998e-06 [handle_group_info]: 9.39996e-07 [symbol_engine_optimizer]: 8.584e-05, [1] [Cycle 1]: 8.146e-05, [6] [build]: 4.17e-06 [elim_shapecalc]: 1.162e-05 [elim_not_effective]: 1.514e-05 [opt_reshape]: 8.45001e-06 [fold_const_symbol]: 1.273e-05 [renormalize]: 3.69997e-07 [detach_backward]: 2.27999e-06 [pipeline_parallel_scheduler]: 1.59998e-06 [auto_monad_reorder]: 2.049e-05 [get_jit_bprop_graph]: 1.96e-06 [rewriter_after_jit_bprop_graph]: 5.01997e-06 [opt_after_jit_grad]: 0.00051549 [validate]: 5.175e-05 Sums bootstrap : 0.000481s : 1.42% type_inference : 0.007436s : 21.95% event_method : 0.000021s : 0.06% auto_monad : 0.000063s : 0.19% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000039s : 0.12% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.08% optimize.rewriter_before_opt_a : 0.000091s : 0.27% optimize.opt_a.expand_dump_flag : 0.000008s : 0.02% optimize.opt_a.switch_simplify : 0.000060s : 0.18% optimize.opt_a.loop_unroll : 0.000039s : 0.12% optimize.opt_a.a_1 : 0.000991s : 2.92% optimize.opt_a.with_stream_mark : 0.000053s : 0.16% optimize.opt_a.recompute_prepare : 0.000020s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000200s : 0.59% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.05% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000015s : 0.05% optimize.opt_a.merge_send_recv : 0.000022s : 0.06% optimize.opt_a.auto_parallel : 0.000018s : 0.05% optimize.opt_a.parallel : 0.000043s : 0.13% optimize.opt_a.flash_sp : 0.000014s : 0.04% optimize.opt_a.merge_comm : 0.000010s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.06% optimize.opt_a.virtual_dataset : 0.000015s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.05% optimize.opt_a.virtual_output : 0.000016s : 0.05% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000023s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000041s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000029s : 0.09% optimize.opt_a.a_after_grad : 0.000025s : 0.07% optimize.opt_a.renormalize : 0.000862s : 2.54% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.03% optimize.opt_a.auto_monad_grad : 0.000006s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000037s : 0.11% optimize.opt_a.cse : 0.000084s : 0.25% optimize.opt_a.a_3 : 0.020196s : 59.60% optimize.py_interpret_to_execute_after_opt_a : 0.000023s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000051s : 0.15% optimize.convert_after_rewriter : 0.000008s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000778s : 2.30% optimize.opt_b.b_1 : 0.000174s : 0.51% optimize.opt_b.b_2 : 0.000011s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000031s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000035s : 0.10% optimize.loop_unroll : 0.000480s : 1.42% optimize.opt_after_cconv.c_1 : 0.000039s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000026s : 0.08% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.05% optimize.tuple_transform.d_1 : 0.000054s : 0.16% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000064s : 0.19% optimize.cse_after_recomputation.cse : 0.000016s : 0.05% optimize.environ_conv : 0.000007s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000015s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000024s : 0.07% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.06% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000515s : 1.52% validate : 0.000052s : 0.15% Time group info: ------[substitution.] 0.000255 39 14.14% : 0.000036s : 3: substitution.cast_eliminate 0.87% : 0.000002s : 3: substitution.elim_not_effective 0.79% : 0.000002s : 3: substitution.fold_const_symbol 2.85% : 0.000007s : 5: substitution.graph_param_transform 62.04% : 0.000158s : 4: substitution.inline 2.09% : 0.000005s : 6: substitution.j_node_and_user_rematch 5.60% : 0.000014s : 6: substitution.remove_not_recompute_node 2.66% : 0.000007s : 4: substitution.replace_old_param 5.95% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator 3.02% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.007366 2 89.38% : 0.006584s : 1: type_inference.infer 10.62% : 0.000782s : 1: type_inference.specialize ------[replace.] 0.000067 8 59.07% : 0.000040s : 4: replace.inline 40.93% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000169 8 92.15% : 0.000155s : 4: match.inline 7.85% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000285 1596 1.09% : 0.000003s : 17: predicate.accumulaten_eliminater 0.80% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000002s : 10: predicate.addn_check_dump 0.90% : 0.000003s : 17: predicate.addn_zero_filter 0.79% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.31% : 0.000007s : 27: predicate.arithmetic_simplify 1.04% : 0.000003s : 17: predicate.cast_eliminate 0.59% : 0.000002s : 10: predicate.check_bprop_eliminate 0.59% : 0.000002s : 10: predicate.compare_switch_simplify 0.23% : 0.000001s : 5: predicate.const_output_eliminate 0.63% : 0.000002s : 10: predicate.depend_value_elim 0.95% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.22% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.95% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.31% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.09% : 0.000003s : 22: predicate.environ_get_depend_swap 1.66% : 0.000005s : 32: predicate.environ_get_eliminate 1.28% : 0.000004s : 22: predicate.environ_get_set_eliminate 1.37% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.54% : 0.000007s : 25: predicate.float_depend_g_call 0.50% : 0.000001s : 10: predicate.float_environ_get_switch 0.79% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 5: predicate.fold_const_symbol 0.70% : 0.000002s : 10: predicate.get_grad_eliminate 0.18% : 0.000001s : 5: predicate.graph_param_transform 0.54% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 5.92% : 0.000017s : 72: predicate.inline 0.90% : 0.000003s : 10: predicate.inline_without_move 0.28% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.93% : 0.000003s : 10: predicate.less_batch_normalization 1.71% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.60% : 0.000007s : 48: predicate.load_eliminater 0.97% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.77% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.54% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.54% : 0.000002s : 10: predicate.merge_addn 0.56% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.56% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 17: predicate.minmaximum_grad 1.13% : 0.000003s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.38% : 0.000001s : 5: predicate.parallel_virtual_node 1.63% : 0.000005s : 25: predicate.partial_defer_inline 1.52% : 0.000004s : 26: predicate.partial_eliminate 0.95% : 0.000003s : 17: predicate.print_const_string_wrapper 0.53% : 0.000002s : 10: predicate.reduce_all_const_elim 1.21% : 0.000003s : 17: predicate.reduce_eliminate 2.44% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 10: predicate.remove_not_recompute_node 1.23% : 0.000004s : 31: predicate.replace_applicator 0.52% : 0.000001s : 10: predicate.replace_old_param 0.23% : 0.000001s : 5: predicate.reset_defer_inline 0.96% : 0.000003s : 17: predicate.reshape_eliminate 2.99% : 0.000009s : 10: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 5: predicate.row_tensor_eliminate 0.81% : 0.000002s : 10: predicate.same_eliminate 0.37% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 10: predicate.shard_identity_eliminate 0.71% : 0.000002s : 10: predicate.special_op_eliminate 0.62% : 0.000002s : 10: predicate.specialize_transform 1.10% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.54% : 0.000004s : 25: predicate.switch_defer_inline 1.98% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.53% : 0.000013s : 76: predicate.switch_simplify 0.90% : 0.000003s : 17: predicate.tile_eliminate 1.01% : 0.000003s : 17: predicate.transpose_eliminate 1.57% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.67% : 0.000005s : 27: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.82% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.29% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 2.99% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.49% : 0.000001s : 5: predicate.value_based_eliminate 0.56% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.57% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000645 11 55.85% : 0.000360s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.15% : 0.000285s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.070878 192 0.01% : 0.000004s : 1: ForceFp32Comm 5.03% : 0.003565s : 1: add_attr 5.01% : 0.003553s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.10% : 0.000068s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.10% : 0.000070s : 1: auto_monad 0.03% : 0.000024s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.73% : 0.000515s : 1: bootstrap 0.06% : 0.000039s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000018s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.04% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.04% : 0.000028s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.69% : 0.000488s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 1.11% : 0.000789s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000021s : 1: opt.transform.mutable_eliminate 2.21% : 0.001568s : 78: opt.transform.opt_a 0.05% : 0.000037s : 1: opt.transform.opt_after_cconv 0.04% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.21% : 0.000151s : 28: opt.transform.opt_b 0.08% : 0.000060s : 2: opt.transform.opt_trans_graph 0.06% : 0.000044s : 4: opt.transform.symbol_engine_opt 33.29% : 0.023598s : 1: opt_a 0.18% : 0.000127s : 1: opt_after_cconv 0.74% : 0.000525s : 1: opt_after_jit_grad 0.40% : 0.000283s : 1: opt_b 36.90% : 0.026152s : 1: optimize 0.03% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.06% : 0.000044s : 1: pre_auto_parallel 0.05% : 0.000033s : 1: py_interpret_to_execute 0.04% : 0.000027s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000005s : 1: remove_cast_before_assign_add 0.03% : 0.000021s : 1: remove_dup_value 0.68% : 0.000485s : 1: renormalize.infer 0.52% : 0.000368s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000056s : 1: rewriter_after_opt_a 0.13% : 0.000095s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.000089s : 1: symbol_engine_optimizer 0.13% : 0.000090s : 1: tuple_transform 10.52% : 0.007457s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:12.715.659 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:12.715.959 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0730526, [21] [bootstrap]: 0.00045404 [type_inference]: 0.00633908 [event_method]: 2.007e-05 [auto_monad]: 6.16e-05 [graph_reusing]: 6.24001e-06 [inline]: 2.10002e-06 [add_attr]: 0.00331717, [1] [add_attr_with_inline]: 0.00330617, [1] [Cycle 1]: 8.452e-05, [2] [tag_attr]: 2.135e-05 [meta_addattr_fg_expand]: 5.68002e-06 [parallel-infer-symbol]: 3.76999e-06 [pre_auto_parallel]: 4.047e-05 [insert-virtual-dataset]: 2.44001e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 2.09e-06 [pipeline_split]: 1.49e-06 [optimize]: 0.0615855, [53] [py_interpret_to_execute]: 3.242e-05 [rewriter_before_opt_a]: 9.072e-05 [opt_a]: 0.0589451, [2] [Cycle 1]: 0.0580399, [45] [expand_dump_flag]: 2.99001e-06 [switch_simplify]: 4.295e-05 [loop_unroll]: 3.015e-05 [a_1]: 0.0560458 [with_stream_mark]: 3.343e-05 [recompute_prepare]: 1.473e-05 [updatestate_depend_eliminate]: 5.69e-06 [updatestate_assign_eliminate]: 3.81001e-06 [updatestate_loads_eliminate]: 3.35998e-06 [parameter_eliminate]: 2.73e-06 [a_2]: 0.00011266 [accelerated_algorithm]: 7.67998e-06 [shard]: 2.31e-06 [meta_shard_fg_expand]: 2.89001e-06 [shard_inline]: 6.51999e-06 [merge_send_recv]: 1.031e-05 [auto_parallel]: 1.066e-05 [parallel]: 2.134e-05 [flash_sp]: 1.216e-05 [merge_comm]: 4.41002e-06 [allreduce_fusion]: 3.63999e-06 [matmul_add_comm_reduction]: 1.144e-05 [allreduce_slice_to_reducescatter]: 8.00006e-07 [virtual_shard_identity]: 9.00999e-06 [virtual_dataset]: 7.04001e-06 [get_grad_eliminate_]: 7.1e-06 [virtual_output]: 7.24001e-06 [merge_forward]: 4.65999e-06 [cell_reuse_recompute_pass]: 1.59e-06 [offload_activation]: 1.179e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.96e-05 [merge_recompute_call_nodes]: 1.72001e-06 [before_grad]: 1.275e-05 [set_forward_comm_id_for_comm_node_pass]: 4.17e-06 [meta_fg_expand]: 3.11999e-06 [flash_sp_send_recv_attached]: 2.76999e-06 [receive_attached]: 2.46998e-06 [after_resolve]: 1.437e-05 [a_after_grad]: 1.062e-05 [renormalize]: 0.0009291 [add_forward_monad_depend]: 7.25998e-06 [auto_monad_grad]: 2.82002e-06 [auto_monad_eliminator]: 1.963e-05 [cse]: 2.971e-05 [a_3]: 6.964e-05 [Cycle 2]: 0.00088701, [45] [expand_dump_flag]: 2.36998e-06 [switch_simplify]: 8.97e-06 [loop_unroll]: 6.63998e-06 [a_1]: 0.00014111 [with_stream_mark]: 1.771e-05 [recompute_prepare]: 7.59002e-06 [updatestate_depend_eliminate]: 4.37e-06 [updatestate_assign_eliminate]: 3.19001e-06 [updatestate_loads_eliminate]: 2.64001e-06 [parameter_eliminate]: 2.08998e-06 [a_2]: 0.00010068 [accelerated_algorithm]: 6.99001e-06 [shard]: 2.02999e-06 [meta_shard_fg_expand]: 2.36e-06 [shard_inline]: 6.76e-06 [merge_send_recv]: 7.88001e-06 [auto_parallel]: 9.44e-06 [parallel]: 7.08e-06 [flash_sp]: 4.51002e-06 [merge_comm]: 3.38e-06 [allreduce_fusion]: 3.65e-06 [matmul_add_comm_reduction]: 6.67002e-06 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 7.23e-06 [virtual_dataset]: 6.34999e-06 [get_grad_eliminate_]: 6.09001e-06 [virtual_output]: 5.97999e-06 [merge_forward]: 3.76001e-06 [cell_reuse_recompute_pass]: 2.64999e-06 [offload_activation]: 8.80001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.804e-05 [merge_recompute_call_nodes]: 1.56998e-06 [before_grad]: 1.092e-05 [set_forward_comm_id_for_comm_node_pass]: 4.19997e-06 [meta_fg_expand]: 2.51e-06 [flash_sp_send_recv_attached]: 9.89996e-07 [receive_attached]: 1.45001e-06 [after_resolve]: 1.279e-05 [a_after_grad]: 1.014e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.79998e-06 [auto_monad_grad]: 1.70001e-06 [auto_monad_eliminator]: 8.70001e-06 [cse]: 1.51e-05 [a_3]: 5.045e-05 [py_interpret_to_execute_after_opt_a]: 1.84e-05 [slice_cell_reuse_recomputed_activation]: 5.37999e-06 [rewriter_after_opt_a]: 4.681e-05 [convert_after_rewriter]: 1.021e-05 [order_py_execute_after_rewriter]: 8.08001e-06 [mutable_eliminate]: 0.00073489 [opt_b]: 0.00029586, [1] [Cycle 1]: 0.00028466, [7] [b_1]: 0.00018203 [b_2]: 9.34e-06 [updatestate_depend_eliminate]: 7.43999e-06 [updatestate_assign_eliminate]: 2.83e-06 [updatestate_loads_eliminate]: 2.60002e-06 [renormalize]: 7.7e-07 [cse]: 2.171e-05 [optimize_parallel_all_gather_comm]: 2.102e-05 [overlap_param_gather]: 4.49002e-06 [cconv]: 3.627e-05 [loop_unroll]: 0.00047459 [opt_after_cconv]: 0.0001339, [1] [Cycle 1]: 0.00012398, [7] [c_1]: 3.249e-05 [parameter_eliminate]: 4.09002e-06 [updatestate_depend_eliminate]: 6.41e-06 [updatestate_assign_eliminate]: 2.72001e-06 [updatestate_loads_eliminate]: 2.36e-06 [cse]: 1.876e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.758e-05 [tuple_transform]: 9.036e-05, [1] [Cycle 1]: 8.314e-05, [4] [d_1]: 4.47e-05 [none_parameter_eliminate]: 1.57001e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 7.2e-06 [partial_unused_args_eliminate]: 4.35e-06 [add_recomputation]: 5.32e-05 [cse_after_recomputation]: 2.694e-05, [1] [Cycle 1]: 2.014e-05, [1] [cse]: 1.125e-05 [environ_conv]: 8.99e-06 [swap_dp_allreduce_reducescatter]: 7.85e-06 [bias_add_comm_swap]: 5.37999e-06 [label_micro_interleaved_index]: 7.88001e-06 [label_fine_grained_interleaved_index]: 5.42001e-06 [merge_cast_opt]: 3.84002e-06 [slice_recompute_activation]: 4.80001e-06 [micro_interleaved_order_control]: 4.85999e-06 [assign_add_opt]: 3.79002e-06 [ForceFp32Comm]: 3.63e-06 [remove_cast_before_assign_add]: 3.63e-06 [full_micro_interleaved_order_control]: 4.74e-06 [reorder_send_recv_between_fp_bp]: 5.41998e-06 [comm_op_add_attrs]: 3.46999e-06 [add_comm_op_reuse_tag]: 3.21999e-06 [interleave_split_concat_branches]: 3.71001e-06 [interleave_parallel_branches]: 3.6e-06 [overlap_opt_shard_in_pipeline]: 4.40999e-06 [overlap_opt_shard_grad_in_pipeline]: 4e-06 [control_data_broadcast_order]: 1.723e-05 [grouped_pairwise_exchange_alltoall]: 4.2e-06 [offloading_packed_experts]: 6.33e-06 [overlap_recompute_and_grad_model_parallel]: 8.05e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.76001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.81999e-06 [overlap_recompute_comm]: 5.22e-06 [overlap_grad_ring_attention]: 6.94999e-06 [overlap_grad_flash_sp]: 2.505e-05 [begin_end_overlap_inline]: 3.24001e-06 [split_matmul_comm_elemetwise]: 4.63001e-06 [split_layernorm_comm]: 4.1e-06 [handle_group_info]: 3.46001e-06 [symbol_engine_optimizer]: 0.00013046, [1] [Cycle 1]: 0.00012336, [6] [build]: 3.81999e-06 [elim_shapecalc]: 1.014e-05 [elim_not_effective]: 1.308e-05 [opt_reshape]: 7.26001e-06 [fold_const_symbol]: 3.987e-05 [renormalize]: 3.50003e-07 [detach_backward]: 4.22998e-06 [pipeline_parallel_scheduler]: 1.69e-06 [auto_monad_reorder]: 1.991e-05 [get_jit_bprop_graph]: 1.89999e-06 [rewriter_after_jit_bprop_graph]: 6.28002e-06 [opt_after_jit_grad]: 0.00054959 [validate]: 4.305e-05 Sums bootstrap : 0.000454s : 0.67% type_inference : 0.006339s : 9.34% event_method : 0.000020s : 0.03% auto_monad : 0.000062s : 0.09% graph_reusing : 0.000006s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000040s : 0.06% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000001s : 0.00% optimize.py_interpret_to_execute : 0.000032s : 0.05% optimize.rewriter_before_opt_a : 0.000091s : 0.13% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000052s : 0.08% optimize.opt_a.loop_unroll : 0.000037s : 0.05% optimize.opt_a.a_1 : 0.056187s : 82.77% optimize.opt_a.with_stream_mark : 0.000051s : 0.08% optimize.opt_a.recompute_prepare : 0.000022s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000213s : 0.31% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.02% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.02% optimize.opt_a.merge_send_recv : 0.000018s : 0.03% optimize.opt_a.auto_parallel : 0.000020s : 0.03% optimize.opt_a.parallel : 0.000028s : 0.04% optimize.opt_a.flash_sp : 0.000017s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.02% optimize.opt_a.virtual_dataset : 0.000013s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.02% optimize.opt_a.virtual_output : 0.000013s : 0.02% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000024s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000027s : 0.04% optimize.opt_a.a_after_grad : 0.000021s : 0.03% optimize.opt_a.renormalize : 0.000929s : 1.37% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.04% optimize.opt_a.cse : 0.000045s : 0.07% optimize.opt_a.a_3 : 0.000120s : 0.18% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000047s : 0.07% optimize.convert_after_rewriter : 0.000010s : 0.02% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000735s : 1.08% optimize.opt_b.b_1 : 0.000182s : 0.27% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.03% optimize.overlap_param_gather : 0.000004s : 0.01% optimize.cconv : 0.000036s : 0.05% optimize.loop_unroll : 0.000475s : 0.70% optimize.opt_after_cconv.c_1 : 0.000032s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.03% optimize.tuple_transform.d_1 : 0.000045s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000004s : 0.01% optimize.add_recomputation : 0.000053s : 0.08% optimize.cse_after_recomputation.cse : 0.000011s : 0.02% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000005s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000006s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000025s : 0.04% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000040s : 0.06% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000550s : 0.81% validate : 0.000043s : 0.06% Time group info: ------[substitution.] 0.000292 29 0.67% : 0.000002s : 2: substitution.elim_not_effective 0.46% : 0.000001s : 2: substitution.fold_const_symbol 2.28% : 0.000007s : 4: substitution.graph_param_transform 80.93% : 0.000236s : 4: substitution.inline 1.91% : 0.000006s : 4: substitution.j_node_and_user_rematch 2.02% : 0.000006s : 4: substitution.remove_not_recompute_node 2.45% : 0.000007s : 4: substitution.replace_old_param 6.75% : 0.000020s : 4: substitution.tuple_list_get_item_eliminator 2.55% : 0.000007s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006281 2 88.96% : 0.005587s : 1: type_inference.infer 11.04% : 0.000694s : 1: type_inference.specialize ------[replace.] 0.000073 8 64.16% : 0.000047s : 4: replace.inline 35.84% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000250 8 92.96% : 0.000233s : 4: match.inline 7.04% : 0.000018s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000247 1278 0.89% : 0.000002s : 13: predicate.accumulaten_eliminater 0.62% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 1.02% : 0.000003s : 13: predicate.addn_zero_filter 0.77% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.36% : 0.000006s : 21: predicate.arithmetic_simplify 1.02% : 0.000003s : 13: predicate.cast_eliminate 0.53% : 0.000001s : 8: predicate.check_bprop_eliminate 0.55% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.57% : 0.000001s : 8: predicate.depend_value_elim 0.89% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.16% : 0.000003s : 13: predicate.dict_get_item_eliminator 1.06% : 0.000003s : 13: predicate.dict_set_item_eliminator 1.14% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 4: predicate.elim_not_effective 0.31% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.43% : 0.000004s : 17: predicate.environ_add_const_eliminate 0.95% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_depend_swap 1.49% : 0.000004s : 25: predicate.environ_get_eliminate 0.96% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.22% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.37% : 0.000006s : 21: predicate.float_depend_g_call 0.55% : 0.000001s : 8: predicate.float_environ_get_switch 0.74% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.13% : 0.000000s : 4: predicate.fold_const_symbol 0.65% : 0.000002s : 8: predicate.get_grad_eliminate 0.16% : 0.000000s : 4: predicate.graph_param_transform 0.50% : 0.000001s : 8: predicate.incorporate_call 0.40% : 0.000001s : 8: predicate.incorporate_call_switch 6.93% : 0.000017s : 58: predicate.inline 0.74% : 0.000002s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.87% : 0.000002s : 8: predicate.less_batch_normalization 1.90% : 0.000005s : 25: predicate.list_to_tuple_eliminator_ 2.23% : 0.000006s : 38: predicate.load_eliminater 0.91% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.06% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.62% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.46% : 0.000001s : 8: predicate.merge_addn 0.59% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 1.35% : 0.000003s : 13: predicate.minmaximum_grad 1.14% : 0.000003s : 4: predicate.mutable_eliminate 0.30% : 0.000001s : 4: predicate.opt_reshape 0.44% : 0.000001s : 4: predicate.parallel_virtual_node 2.59% : 0.000006s : 21: predicate.partial_defer_inline 1.37% : 0.000003s : 21: predicate.partial_eliminate 0.80% : 0.000002s : 13: predicate.print_const_string_wrapper 0.63% : 0.000002s : 8: predicate.reduce_all_const_elim 3.77% : 0.000009s : 13: predicate.reduce_eliminate 2.24% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 8: predicate.remove_not_recompute_node 1.25% : 0.000003s : 25: predicate.replace_applicator 0.43% : 0.000001s : 8: predicate.replace_old_param 0.30% : 0.000001s : 4: predicate.reset_defer_inline 0.86% : 0.000002s : 13: predicate.reshape_eliminate 0.63% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 4: predicate.row_tensor_eliminate 0.62% : 0.000002s : 8: predicate.same_eliminate 0.45% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.67% : 0.000002s : 8: predicate.shard_identity_eliminate 0.72% : 0.000002s : 8: predicate.special_op_eliminate 0.55% : 0.000001s : 8: predicate.specialize_transform 1.04% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.47% : 0.000004s : 21: predicate.switch_defer_inline 1.81% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.34% : 0.000011s : 67: predicate.switch_simplify 2.15% : 0.000005s : 13: predicate.tile_eliminate 1.10% : 0.000003s : 13: predicate.transpose_eliminate 1.38% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.27% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 2.96% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.38% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.56% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.07% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.70% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.46% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.78% : 0.000002s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000608 11 53.31% : 0.000324s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.69% : 0.000284s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.195706 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.70% : 0.003328s : 1: add_attr 1.69% : 0.003310s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.03% : 0.000057s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.04% : 0.000070s : 1: auto_monad 0.01% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.25% : 0.000498s : 1: bootstrap 0.02% : 0.000039s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000020s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.02% : 0.000030s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.01% : 0.000019s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.02% : 0.000032s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 0.25% : 0.000482s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.38% : 0.000743s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000019s : 1: opt.transform.mutable_eliminate 28.94% : 0.056639s : 78: opt.transform.opt_a 0.02% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000118s : 28: opt.transform.opt_b 0.03% : 0.000050s : 2: opt.transform.opt_trans_graph 0.03% : 0.000067s : 4: opt.transform.symbol_engine_opt 30.12% : 0.058948s : 1: opt_a 0.07% : 0.000138s : 1: opt_after_cconv 0.29% : 0.000560s : 1: opt_after_jit_grad 0.15% : 0.000299s : 1: opt_b 31.64% : 0.061919s : 1: optimize 0.01% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000011s : 1: order_py_execute_after_rewriter 0.01% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000007s : 1: partial_unused_args_eliminate 0.00% : 0.000009s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000048s : 1: pre_auto_parallel 0.02% : 0.000036s : 1: py_interpret_to_execute 0.01% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000021s : 1: remove_dup_value 0.26% : 0.000506s : 1: renormalize.infer 0.21% : 0.000411s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000051s : 1: rewriter_after_opt_a 0.05% : 0.000095s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000133s : 1: symbol_engine_optimizer 0.05% : 0.000093s : 1: tuple_transform 3.26% : 0.006381s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:13.383.980 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0415273, [21] [bootstrap]: 0.00045573 [type_inference]: 0.0154199 [event_method]: 2.393e-05 [auto_monad]: 7.015e-05 [graph_reusing]: 6.65002e-06 [inline]: 2.94001e-06 [add_attr]: 0.00382178, [1] [add_attr_with_inline]: 0.00380945, [1] [Cycle 1]: 7.391e-05, [2] [tag_attr]: 2.336e-05 [meta_addattr_fg_expand]: 5.69e-06 [parallel-infer-symbol]: 4.18001e-06 [pre_auto_parallel]: 4.317e-05 [insert-virtual-dataset]: 2.30002e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 2.36e-06 [pipeline_split]: 1.87001e-06 [optimize]: 0.0148637, [53] [py_interpret_to_execute]: 2.948e-05 [rewriter_before_opt_a]: 9.139e-05 [opt_a]: 0.0119549, [2] [Cycle 1]: 0.0110945, [45] [expand_dump_flag]: 3.26999e-06 [switch_simplify]: 4.338e-05 [loop_unroll]: 3.002e-05 [a_1]: 0.00065974 [with_stream_mark]: 2.111e-05 [recompute_prepare]: 1.054e-05 [updatestate_depend_eliminate]: 4.08999e-06 [updatestate_assign_eliminate]: 3.35e-06 [updatestate_loads_eliminate]: 2.93e-06 [parameter_eliminate]: 2.04999e-06 [a_2]: 8.198e-05 [accelerated_algorithm]: 8.03001e-06 [shard]: 2.16e-06 [meta_shard_fg_expand]: 1.94999e-06 [shard_inline]: 6.80998e-06 [merge_send_recv]: 9.24e-06 [auto_parallel]: 6.56999e-06 [parallel]: 2.051e-05 [flash_sp]: 9.36e-06 [merge_comm]: 4.02e-06 [allreduce_fusion]: 3.45e-06 [matmul_add_comm_reduction]: 1.051e-05 [allreduce_slice_to_reducescatter]: 8.49977e-07 [virtual_shard_identity]: 8.34998e-06 [virtual_dataset]: 6.72002e-06 [get_grad_eliminate_]: 6.26e-06 [virtual_output]: 6.39001e-06 [merge_forward]: 4.32e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 1.075e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.359e-05 [merge_recompute_call_nodes]: 1.72999e-06 [before_grad]: 1.088e-05 [set_forward_comm_id_for_comm_node_pass]: 4.22e-06 [meta_fg_expand]: 3.54002e-06 [flash_sp_send_recv_attached]: 2.58e-06 [receive_attached]: 2.53e-06 [after_resolve]: 1.227e-05 [a_after_grad]: 1.045e-05 [renormalize]: 0.00957509 [add_forward_monad_depend]: 1.489e-05 [auto_monad_grad]: 3.91001e-06 [auto_monad_eliminator]: 3.122e-05 [cse]: 3.591e-05 [a_3]: 7.38e-05 [Cycle 2]: 0.0008463, [45] [expand_dump_flag]: 3.46001e-06 [switch_simplify]: 9.80002e-06 [loop_unroll]: 7.26001e-06 [a_1]: 0.00014832 [with_stream_mark]: 2.312e-05 [recompute_prepare]: 8.07e-06 [updatestate_depend_eliminate]: 4.24002e-06 [updatestate_assign_eliminate]: 3.31001e-06 [updatestate_loads_eliminate]: 3.9e-06 [parameter_eliminate]: 2.90002e-06 [a_2]: 7.688e-05 [accelerated_algorithm]: 7.66999e-06 [shard]: 2.78e-06 [meta_shard_fg_expand]: 2.64001e-06 [shard_inline]: 6.42001e-06 [merge_send_recv]: 1.01e-05 [auto_parallel]: 9.59999e-06 [parallel]: 1.126e-05 [flash_sp]: 4.82998e-06 [merge_comm]: 4.55999e-06 [allreduce_fusion]: 1.12e-05 [matmul_add_comm_reduction]: 1.14e-05 [allreduce_slice_to_reducescatter]: 9.10019e-07 [virtual_shard_identity]: 9.94001e-06 [virtual_dataset]: 7.3e-06 [get_grad_eliminate_]: 6.24999e-06 [virtual_output]: 6.29001e-06 [merge_forward]: 5.24998e-06 [cell_reuse_recompute_pass]: 3.26001e-06 [offload_activation]: 1.19e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.373e-05 [merge_recompute_call_nodes]: 1.96003e-06 [before_grad]: 1.161e-05 [set_forward_comm_id_for_comm_node_pass]: 4.75001e-06 [meta_fg_expand]: 3.18e-06 [flash_sp_send_recv_attached]: 2.81999e-06 [receive_attached]: 2.39999e-06 [after_resolve]: 1.56e-05 [a_after_grad]: 1.238e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.81999e-06 [auto_monad_grad]: 1.69998e-06 [auto_monad_eliminator]: 1.344e-05 [cse]: 2.239e-05 [a_3]: 4.093e-05 [py_interpret_to_execute_after_opt_a]: 2.376e-05 [slice_cell_reuse_recomputed_activation]: 2.26e-06 [rewriter_after_opt_a]: 5.029e-05 [convert_after_rewriter]: 8.09997e-06 [order_py_execute_after_rewriter]: 5.84e-06 [mutable_eliminate]: 0.00087045 [opt_b]: 0.00027541, [1] [Cycle 1]: 0.00026533, [7] [b_1]: 0.00015911 [b_2]: 9.49999e-06 [updatestate_depend_eliminate]: 1.189e-05 [updatestate_assign_eliminate]: 3.49001e-06 [updatestate_loads_eliminate]: 3.21001e-06 [renormalize]: 9.5999e-07 [cse]: 3.543e-05 [optimize_parallel_all_gather_comm]: 2.542e-05 [overlap_param_gather]: 1.87999e-06 [cconv]: 4.039e-05 [loop_unroll]: 0.00065113 [opt_after_cconv]: 0.000134, [1] [Cycle 1]: 0.00012503, [7] [c_1]: 3.613e-05 [parameter_eliminate]: 5.67999e-06 [updatestate_depend_eliminate]: 1.008e-05 [updatestate_assign_eliminate]: 2.86e-06 [updatestate_loads_eliminate]: 3.29001e-06 [cse]: 2.999e-05 [renormalize]: 7.30011e-07 [remove_dup_value]: 1.586e-05 [tuple_transform]: 8.964e-05, [1] [Cycle 1]: 8.428e-05, [4] [d_1]: 5.32e-05 [none_parameter_eliminate]: 2.42001e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 7.94002e-06 [partial_unused_args_eliminate]: 2.15002e-06 [add_recomputation]: 6.066e-05 [cse_after_recomputation]: 2.483e-05, [1] [Cycle 1]: 1.939e-05, [1] [cse]: 1.322e-05 [environ_conv]: 6.52001e-06 [swap_dp_allreduce_reducescatter]: 5.69e-06 [bias_add_comm_swap]: 3.83999e-06 [label_micro_interleaved_index]: 7.66999e-06 [label_fine_grained_interleaved_index]: 2.78e-06 [merge_cast_opt]: 1.72999e-06 [slice_recompute_activation]: 2.31e-06 [micro_interleaved_order_control]: 2.59999e-06 [assign_add_opt]: 1.76e-06 [ForceFp32Comm]: 9.30013e-07 [remove_cast_before_assign_add]: 1.44e-06 [full_micro_interleaved_order_control]: 2.31e-06 [reorder_send_recv_between_fp_bp]: 3.23e-06 [comm_op_add_attrs]: 1.13001e-06 [add_comm_op_reuse_tag]: 1.23002e-06 [interleave_split_concat_branches]: 1.20999e-06 [interleave_parallel_branches]: 1.09e-06 [overlap_opt_shard_in_pipeline]: 1.82999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.12999e-06 [control_data_broadcast_order]: 1.578e-05 [grouped_pairwise_exchange_alltoall]: 1.49e-06 [offloading_packed_experts]: 4.99e-06 [overlap_recompute_and_grad_model_parallel]: 5.12999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.32e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32999e-06 [overlap_recompute_comm]: 2.39001e-06 [overlap_grad_ring_attention]: 4.52e-06 [overlap_grad_flash_sp]: 2.376e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.19001e-06 [split_layernorm_comm]: 1.87999e-06 [handle_group_info]: 1.03001e-06 [symbol_engine_optimizer]: 0.00012248, [1] [Cycle 1]: 0.000117, [6] [build]: 4.41002e-06 [elim_shapecalc]: 4.082e-05 [elim_not_effective]: 1.679e-05 [opt_reshape]: 7.55998e-06 [fold_const_symbol]: 1.091e-05 [renormalize]: 4.10015e-07 [detach_backward]: 2.85998e-06 [pipeline_parallel_scheduler]: 1.49998e-06 [auto_monad_reorder]: 1.999e-05 [get_jit_bprop_graph]: 2.54001e-06 [rewriter_after_jit_bprop_graph]: 7.75e-06 [opt_after_jit_grad]: 0.00622165 [validate]: 0.00034481 Sums bootstrap : 0.000456s : 1.25% type_inference : 0.015420s : 42.24% event_method : 0.000024s : 0.07% auto_monad : 0.000070s : 0.19% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000043s : 0.12% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.08% optimize.rewriter_before_opt_a : 0.000091s : 0.25% optimize.opt_a.expand_dump_flag : 0.000007s : 0.02% optimize.opt_a.switch_simplify : 0.000053s : 0.15% optimize.opt_a.loop_unroll : 0.000037s : 0.10% optimize.opt_a.a_1 : 0.000808s : 2.21% optimize.opt_a.with_stream_mark : 0.000044s : 0.12% optimize.opt_a.recompute_prepare : 0.000019s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000159s : 0.44% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.04% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.04% optimize.opt_a.merge_send_recv : 0.000019s : 0.05% optimize.opt_a.auto_parallel : 0.000016s : 0.04% optimize.opt_a.parallel : 0.000032s : 0.09% optimize.opt_a.flash_sp : 0.000014s : 0.04% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000015s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.05% optimize.opt_a.virtual_dataset : 0.000014s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.03% optimize.opt_a.virtual_output : 0.000013s : 0.03% optimize.opt_a.merge_forward : 0.000010s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.01% optimize.opt_a.before_grad : 0.000022s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.02% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000028s : 0.08% optimize.opt_a.a_after_grad : 0.000023s : 0.06% optimize.opt_a.renormalize : 0.009575s : 26.23% optimize.opt_a.add_forward_monad_depend : 0.000018s : 0.05% optimize.opt_a.auto_monad_grad : 0.000006s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000045s : 0.12% optimize.opt_a.cse : 0.000058s : 0.16% optimize.opt_a.a_3 : 0.000115s : 0.31% optimize.py_interpret_to_execute_after_opt_a : 0.000024s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000050s : 0.14% optimize.convert_after_rewriter : 0.000008s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000870s : 2.38% optimize.opt_b.b_1 : 0.000159s : 0.44% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000035s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000040s : 0.11% optimize.loop_unroll : 0.000651s : 1.78% optimize.opt_after_cconv.c_1 : 0.000036s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000030s : 0.08% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.04% optimize.tuple_transform.d_1 : 0.000053s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000061s : 0.17% optimize.cse_after_recomputation.cse : 0.000013s : 0.04% optimize.environ_conv : 0.000007s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000004s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000024s : 0.07% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000041s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000020s : 0.05% get_jit_bprop_graph : 0.000003s : 0.01% rewriter_after_jit_bprop_graph : 0.000008s : 0.02% opt_after_jit_grad : 0.006222s : 17.04% validate : 0.000345s : 0.94% Time group info: ------[substitution.] 0.000220 29 1.01% : 0.000002s : 2: substitution.elim_not_effective 0.78% : 0.000002s : 2: substitution.fold_const_symbol 2.98% : 0.000007s : 4: substitution.graph_param_transform 76.38% : 0.000168s : 4: substitution.inline 2.22% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.66% : 0.000006s : 4: substitution.remove_not_recompute_node 3.12% : 0.000007s : 4: substitution.replace_old_param 7.29% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator 3.56% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.015331 2 93.59% : 0.014348s : 1: type_inference.infer 6.41% : 0.000983s : 1: type_inference.specialize ------[replace.] 0.000065 8 65.31% : 0.000043s : 4: replace.inline 34.69% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000180 8 92.17% : 0.000165s : 4: match.inline 7.83% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000243 1278 0.81% : 0.000002s : 13: predicate.accumulaten_eliminater 1.83% : 0.000004s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 0.85% : 0.000002s : 13: predicate.addn_zero_filter 0.73% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.93% : 0.000005s : 21: predicate.arithmetic_simplify 0.90% : 0.000002s : 13: predicate.cast_eliminate 0.78% : 0.000002s : 8: predicate.check_bprop_eliminate 0.47% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 0.85% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.06% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.76% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.89% : 0.000005s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 4: predicate.elim_not_effective 0.39% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.04% : 0.000003s : 17: predicate.environ_get_add_eliminate 0.94% : 0.000002s : 17: predicate.environ_get_depend_swap 1.60% : 0.000004s : 25: predicate.environ_get_eliminate 1.29% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.26% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.31% : 0.000006s : 21: predicate.float_depend_g_call 0.49% : 0.000001s : 8: predicate.float_environ_get_switch 0.70% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.65% : 0.000002s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.51% : 0.000001s : 8: predicate.incorporate_call 0.41% : 0.000001s : 8: predicate.incorporate_call_switch 6.52% : 0.000016s : 58: predicate.inline 1.04% : 0.000003s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.12% : 0.000003s : 8: predicate.less_batch_normalization 1.87% : 0.000005s : 25: predicate.list_to_tuple_eliminator_ 2.44% : 0.000006s : 38: predicate.load_eliminater 1.54% : 0.000004s : 4: predicate.loop_unroll_after_grad 2.18% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.60% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 8: predicate.merge_addn 0.67% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.67% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.72% : 0.000002s : 13: predicate.minmaximum_grad 2.07% : 0.000005s : 4: predicate.mutable_eliminate 0.31% : 0.000001s : 4: predicate.opt_reshape 0.51% : 0.000001s : 4: predicate.parallel_virtual_node 1.70% : 0.000004s : 21: predicate.partial_defer_inline 1.38% : 0.000003s : 21: predicate.partial_eliminate 0.79% : 0.000002s : 13: predicate.print_const_string_wrapper 0.55% : 0.000001s : 8: predicate.reduce_all_const_elim 1.09% : 0.000003s : 13: predicate.reduce_eliminate 2.26% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 8: predicate.remove_not_recompute_node 1.42% : 0.000003s : 25: predicate.replace_applicator 0.41% : 0.000001s : 8: predicate.replace_old_param 0.22% : 0.000001s : 4: predicate.reset_defer_inline 0.93% : 0.000002s : 13: predicate.reshape_eliminate 0.56% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 4: predicate.row_tensor_eliminate 0.75% : 0.000002s : 8: predicate.same_eliminate 0.45% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.68% : 0.000002s : 8: predicate.shard_identity_eliminate 0.88% : 0.000002s : 8: predicate.special_op_eliminate 0.60% : 0.000001s : 8: predicate.specialize_transform 1.18% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.91% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.32% : 0.000003s : 21: predicate.switch_defer_inline 1.69% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.60% : 0.000011s : 67: predicate.switch_simplify 0.84% : 0.000002s : 13: predicate.tile_eliminate 0.86% : 0.000002s : 13: predicate.transpose_eliminate 1.61% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.58% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.32% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.59% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.07% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.83% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.14% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.00% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.49% : 0.000001s : 4: predicate.value_based_eliminate 0.84% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.51% : 0.000001s : 8: predicate.virtual_output_eliminate 0.31% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000725 11 50.16% : 0.000364s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.84% : 0.000361s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.071098 192 0.01% : 0.000004s : 1: ForceFp32Comm 5.38% : 0.003828s : 1: add_attr 5.36% : 0.003814s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000066s : 1: add_recomputation 0.01% : 0.000005s : 1: assign_add_opt 0.11% : 0.000076s : 1: auto_monad 0.03% : 0.000024s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.68% : 0.000486s : 1: bootstrap 0.06% : 0.000045s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000019s : 1: control_data_broadcast_order 0.02% : 0.000012s : 1: convert_after_rewriter 0.04% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.04% : 0.000031s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000005s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 0.93% : 0.000664s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 1.25% : 0.000888s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.03% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000030s : 1: opt.transform.mutable_eliminate 1.79% : 0.001275s : 78: opt.transform.opt_a 0.05% : 0.000034s : 1: opt.transform.opt_after_cconv 0.27% : 0.000193s : 1: opt.transform.opt_after_jit_grad 0.18% : 0.000129s : 28: opt.transform.opt_b 0.08% : 0.000059s : 2: opt.transform.opt_trans_graph 0.06% : 0.000044s : 4: opt.transform.symbol_engine_opt 16.82% : 0.011960s : 1: opt_a 0.19% : 0.000138s : 1: opt_after_cconv 8.79% : 0.006250s : 1: opt_after_jit_grad 0.39% : 0.000280s : 1: opt_b 20.91% : 0.014869s : 1: optimize 0.04% : 0.000030s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000003s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000004s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000048s : 1: pre_auto_parallel 0.05% : 0.000034s : 1: py_interpret_to_execute 0.05% : 0.000033s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000020s : 1: remove_dup_value 0.65% : 0.000463s : 1: renormalize.infer 12.79% : 0.009097s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000056s : 1: rewriter_after_opt_a 0.13% : 0.000096s : 1: rewriter_before_opt_a 0.01% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.18% : 0.000125s : 1: symbol_engine_optimizer 0.13% : 0.000093s : 1: tuple_transform 21.73% : 0.015450s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:14.227.959 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:14.228.236 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0443502, [21] [bootstrap]: 0.00044758 [type_inference]: 0.0269834 [event_method]: 2.196e-05 [auto_monad]: 6.962e-05 [graph_reusing]: 6.27001e-06 [inline]: 3.12002e-06 [add_attr]: 0.00356198, [1] [add_attr_with_inline]: 0.00355004, [1] [Cycle 1]: 0.00010072, [2] [tag_attr]: 2.53e-05 [meta_addattr_fg_expand]: 6.29999e-06 [parallel-infer-symbol]: 3.98001e-06 [pre_auto_parallel]: 4.351e-05 [insert-virtual-dataset]: 2.73e-06 [parallel-infer-symbol-second]: 9.89996e-07 [dataset_repeat_opt]: 2.17999e-06 [pipeline_split]: 1.81e-06 [optimize]: 0.00691541, [53] [py_interpret_to_execute]: 3.588e-05 [rewriter_before_opt_a]: 0.00010095 [opt_a]: 0.00399983, [2] [Cycle 1]: 0.00299462, [45] [expand_dump_flag]: 3.71001e-06 [switch_simplify]: 4.402e-05 [loop_unroll]: 3.124e-05 [a_1]: 0.00078274 [with_stream_mark]: 2.083e-05 [recompute_prepare]: 1.172e-05 [updatestate_depend_eliminate]: 6.16e-06 [updatestate_assign_eliminate]: 4.28999e-06 [updatestate_loads_eliminate]: 3.83999e-06 [parameter_eliminate]: 2.65002e-06 [a_2]: 0.00013278 [accelerated_algorithm]: 1.036e-05 [shard]: 2.34001e-06 [meta_shard_fg_expand]: 2.04999e-06 [shard_inline]: 8.04002e-06 [merge_send_recv]: 1.031e-05 [auto_parallel]: 9.64e-06 [parallel]: 2.076e-05 [flash_sp]: 1.083e-05 [merge_comm]: 5.54e-06 [allreduce_fusion]: 4.15999e-06 [matmul_add_comm_reduction]: 1.207e-05 [allreduce_slice_to_reducescatter]: 9.39996e-07 [virtual_shard_identity]: 1.082e-05 [virtual_dataset]: 8.35999e-06 [get_grad_eliminate_]: 7.95e-06 [virtual_output]: 7.71999e-06 [merge_forward]: 5.47001e-06 [cell_reuse_recompute_pass]: 1.56998e-06 [offload_activation]: 1.191e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.17e-05 [merge_recompute_call_nodes]: 1.96998e-06 [before_grad]: 1.631e-05 [set_forward_comm_id_for_comm_node_pass]: 5.86998e-06 [meta_fg_expand]: 3.99002e-06 [flash_sp_send_recv_attached]: 2.77002e-06 [receive_attached]: 2.70002e-06 [after_resolve]: 1.512e-05 [a_after_grad]: 1.284e-05 [renormalize]: 0.00113912 [add_forward_monad_depend]: 7.38e-06 [auto_monad_grad]: 2.69001e-06 [auto_monad_eliminator]: 1.973e-05 [cse]: 3.924e-05 [a_3]: 7.901e-05 [Cycle 2]: 0.00098706, [45] [expand_dump_flag]: 2.64999e-06 [switch_simplify]: 1.048e-05 [loop_unroll]: 8.15e-06 [a_1]: 0.00018175 [with_stream_mark]: 1.653e-05 [recompute_prepare]: 8.71002e-06 [updatestate_depend_eliminate]: 4.10998e-06 [updatestate_assign_eliminate]: 3.44001e-06 [updatestate_loads_eliminate]: 3.43e-06 [parameter_eliminate]: 1.37999e-06 [a_2]: 0.00012574 [accelerated_algorithm]: 8.08999e-06 [shard]: 1.66e-06 [meta_shard_fg_expand]: 2.64001e-06 [shard_inline]: 7.55998e-06 [merge_send_recv]: 8.77e-06 [auto_parallel]: 9.70002e-06 [parallel]: 7.18e-06 [flash_sp]: 4.28001e-06 [merge_comm]: 4.80001e-06 [allreduce_fusion]: 4.64002e-06 [matmul_add_comm_reduction]: 9.23002e-06 [allreduce_slice_to_reducescatter]: 6.40022e-07 [virtual_shard_identity]: 8.70001e-06 [virtual_dataset]: 7.75e-06 [get_grad_eliminate_]: 6.89001e-06 [virtual_output]: 6.89001e-06 [merge_forward]: 4.96997e-06 [cell_reuse_recompute_pass]: 2.52001e-06 [offload_activation]: 1.01e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.734e-05 [merge_recompute_call_nodes]: 1.01997e-06 [before_grad]: 1.226e-05 [set_forward_comm_id_for_comm_node_pass]: 4.42e-06 [meta_fg_expand]: 3.2e-06 [flash_sp_send_recv_attached]: 1.38002e-06 [receive_attached]: 1.72999e-06 [after_resolve]: 1.301e-05 [a_after_grad]: 1.202e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.76e-06 [auto_monad_grad]: 1.23002e-06 [auto_monad_eliminator]: 1.061e-05 [cse]: 2.116e-05 [a_3]: 5.933e-05 [py_interpret_to_execute_after_opt_a]: 1.971e-05 [slice_cell_reuse_recomputed_activation]: 5.87001e-06 [rewriter_after_opt_a]: 5.096e-05 [convert_after_rewriter]: 1.07e-05 [order_py_execute_after_rewriter]: 9.31e-06 [mutable_eliminate]: 0.00074691 [opt_b]: 0.00035149, [1] [Cycle 1]: 0.00034008, [7] [b_1]: 0.00021773 [b_2]: 1.038e-05 [updatestate_depend_eliminate]: 1.047e-05 [updatestate_assign_eliminate]: 3.21001e-06 [updatestate_loads_eliminate]: 3.3e-06 [renormalize]: 6.29982e-07 [cse]: 3.329e-05 [optimize_parallel_all_gather_comm]: 2.437e-05 [overlap_param_gather]: 4.70999e-06 [cconv]: 3.923e-05 [loop_unroll]: 0.00055084 [opt_after_cconv]: 0.00016417, [1] [Cycle 1]: 0.00015303, [7] [c_1]: 4.047e-05 [parameter_eliminate]: 6.69999e-06 [updatestate_depend_eliminate]: 7.88001e-06 [updatestate_assign_eliminate]: 3.45e-06 [updatestate_loads_eliminate]: 3.32002e-06 [cse]: 3.059e-05 [renormalize]: 7.40023e-07 [remove_dup_value]: 1.986e-05 [tuple_transform]: 0.00011289, [1] [Cycle 1]: 0.0001043, [4] [d_1]: 5.917e-05 [none_parameter_eliminate]: 1.93002e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 9.80002e-06 [partial_unused_args_eliminate]: 4.91002e-06 [add_recomputation]: 6.683e-05 [cse_after_recomputation]: 3.495e-05, [1] [Cycle 1]: 2.651e-05, [1] [cse]: 1.628e-05 [environ_conv]: 1.16e-05 [swap_dp_allreduce_reducescatter]: 9.94999e-06 [bias_add_comm_swap]: 5.43002e-06 [label_micro_interleaved_index]: 8.77999e-06 [label_fine_grained_interleaved_index]: 5.34998e-06 [merge_cast_opt]: 4.32998e-06 [slice_recompute_activation]: 5.23002e-06 [micro_interleaved_order_control]: 5.29998e-06 [assign_add_opt]: 4.47e-06 [ForceFp32Comm]: 3.73999e-06 [remove_cast_before_assign_add]: 3.61999e-06 [full_micro_interleaved_order_control]: 4.65999e-06 [reorder_send_recv_between_fp_bp]: 5.38002e-06 [comm_op_add_attrs]: 3.63e-06 [add_comm_op_reuse_tag]: 3.23e-06 [interleave_split_concat_branches]: 3.76999e-06 [interleave_parallel_branches]: 3.39001e-06 [overlap_opt_shard_in_pipeline]: 3.82002e-06 [overlap_opt_shard_grad_in_pipeline]: 3.99002e-06 [control_data_broadcast_order]: 2.272e-05 [grouped_pairwise_exchange_alltoall]: 3.78999e-06 [offloading_packed_experts]: 6.99001e-06 [overlap_recompute_and_grad_model_parallel]: 7.51999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.86001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.86999e-06 [overlap_recompute_comm]: 5.62001e-06 [overlap_grad_ring_attention]: 7.18e-06 [overlap_grad_flash_sp]: 2.93e-05 [begin_end_overlap_inline]: 3.11999e-06 [split_matmul_comm_elemetwise]: 5.02e-06 [split_layernorm_comm]: 4.18999e-06 [handle_group_info]: 3.36001e-06 [symbol_engine_optimizer]: 0.00012221, [1] [Cycle 1]: 0.00011359, [6] [build]: 6.48e-06 [elim_shapecalc]: 1.569e-05 [elim_not_effective]: 1.625e-05 [opt_reshape]: 9.92001e-06 [fold_const_symbol]: 1.331e-05 [renormalize]: 2.10013e-07 [detach_backward]: 5.06997e-06 [pipeline_parallel_scheduler]: 1.91e-06 [auto_monad_reorder]: 2.886e-05 [get_jit_bprop_graph]: 2.11003e-06 [rewriter_after_jit_bprop_graph]: 6.39999e-06 [opt_after_jit_grad]: 0.00551463 [validate]: 6.132e-05 Sums bootstrap : 0.000448s : 1.15% type_inference : 0.026983s : 69.43% event_method : 0.000022s : 0.06% auto_monad : 0.000070s : 0.18% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000044s : 0.11% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000036s : 0.09% optimize.rewriter_before_opt_a : 0.000101s : 0.26% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000055s : 0.14% optimize.opt_a.loop_unroll : 0.000039s : 0.10% optimize.opt_a.a_1 : 0.000964s : 2.48% optimize.opt_a.with_stream_mark : 0.000037s : 0.10% optimize.opt_a.recompute_prepare : 0.000020s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000259s : 0.67% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.05% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.04% optimize.opt_a.merge_send_recv : 0.000019s : 0.05% optimize.opt_a.auto_parallel : 0.000019s : 0.05% optimize.opt_a.parallel : 0.000028s : 0.07% optimize.opt_a.flash_sp : 0.000015s : 0.04% optimize.opt_a.merge_comm : 0.000010s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.05% optimize.opt_a.virtual_dataset : 0.000016s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000015s : 0.04% optimize.opt_a.merge_forward : 0.000010s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000022s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000029s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000028s : 0.07% optimize.opt_a.a_after_grad : 0.000025s : 0.06% optimize.opt_a.renormalize : 0.001139s : 2.93% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.08% optimize.opt_a.cse : 0.000060s : 0.16% optimize.opt_a.a_3 : 0.000138s : 0.36% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.02% optimize.rewriter_after_opt_a : 0.000051s : 0.13% optimize.convert_after_rewriter : 0.000011s : 0.03% optimize.order_py_execute_after_rewriter : 0.000009s : 0.02% optimize.mutable_eliminate : 0.000747s : 1.92% optimize.opt_b.b_1 : 0.000218s : 0.56% optimize.opt_b.b_2 : 0.000010s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000033s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.06% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000039s : 0.10% optimize.loop_unroll : 0.000551s : 1.42% optimize.opt_after_cconv.c_1 : 0.000040s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000031s : 0.08% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.05% optimize.tuple_transform.d_1 : 0.000059s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000067s : 0.17% optimize.cse_after_recomputation.cse : 0.000016s : 0.04% optimize.environ_conv : 0.000012s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.03% optimize.bias_add_comm_swap : 0.000005s : 0.01% optimize.label_micro_interleaved_index : 0.000009s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000023s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000006s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000029s : 0.08% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000006s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000029s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.005515s : 14.19% validate : 0.000061s : 0.16% Time group info: ------[substitution.] 0.000252 39 11.06% : 0.000028s : 3: substitution.cast_eliminate 1.09% : 0.000003s : 3: substitution.elim_not_effective 0.74% : 0.000002s : 3: substitution.fold_const_symbol 2.76% : 0.000007s : 5: substitution.graph_param_transform 67.26% : 0.000170s : 4: substitution.inline 2.45% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.85% : 0.000007s : 6: substitution.remove_not_recompute_node 2.39% : 0.000006s : 4: substitution.replace_old_param 6.28% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator 3.12% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.026919 2 97.01% : 0.026115s : 1: type_inference.infer 2.99% : 0.000804s : 1: type_inference.specialize ------[replace.] 0.000066 8 62.79% : 0.000042s : 4: replace.inline 37.21% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000181 8 92.42% : 0.000167s : 4: match.inline 7.58% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000275 1596 0.86% : 0.000002s : 17: predicate.accumulaten_eliminater 1.80% : 0.000005s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 10: predicate.addn_check_dump 0.83% : 0.000002s : 17: predicate.addn_zero_filter 0.88% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.98% : 0.000005s : 27: predicate.arithmetic_simplify 0.99% : 0.000003s : 17: predicate.cast_eliminate 0.75% : 0.000002s : 10: predicate.check_bprop_eliminate 0.56% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.66% : 0.000002s : 10: predicate.depend_value_elim 0.93% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.07% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.91% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.68% : 0.000005s : 10: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 5: predicate.elim_not_effective 0.45% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.09% : 0.000003s : 22: predicate.environ_get_depend_swap 1.91% : 0.000005s : 32: predicate.environ_get_eliminate 1.25% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.29% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.26% : 0.000006s : 25: predicate.float_depend_g_call 0.54% : 0.000001s : 10: predicate.float_environ_get_switch 0.76% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.60% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000002s : 10: predicate.incorporate_call 0.46% : 0.000001s : 10: predicate.incorporate_call_switch 6.18% : 0.000017s : 72: predicate.inline 0.90% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 10: predicate.less_batch_normalization 2.06% : 0.000006s : 31: predicate.list_to_tuple_eliminator_ 2.56% : 0.000007s : 48: predicate.load_eliminater 0.99% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.04% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.59% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.59% : 0.000002s : 10: predicate.merge_addn 0.58% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.93% : 0.000003s : 17: predicate.minmaximum_grad 1.18% : 0.000003s : 5: predicate.mutable_eliminate 0.53% : 0.000001s : 5: predicate.opt_reshape 0.41% : 0.000001s : 5: predicate.parallel_virtual_node 1.61% : 0.000004s : 25: predicate.partial_defer_inline 1.63% : 0.000004s : 26: predicate.partial_eliminate 0.89% : 0.000002s : 17: predicate.print_const_string_wrapper 0.55% : 0.000001s : 10: predicate.reduce_all_const_elim 1.17% : 0.000003s : 17: predicate.reduce_eliminate 2.51% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 10: predicate.remove_not_recompute_node 1.26% : 0.000003s : 31: predicate.replace_applicator 0.53% : 0.000001s : 10: predicate.replace_old_param 0.32% : 0.000001s : 5: predicate.reset_defer_inline 1.00% : 0.000003s : 17: predicate.reshape_eliminate 0.58% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 5: predicate.row_tensor_eliminate 0.75% : 0.000002s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.79% : 0.000002s : 10: predicate.shard_identity_eliminate 0.67% : 0.000002s : 10: predicate.special_op_eliminate 0.74% : 0.000002s : 10: predicate.specialize_transform 0.93% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.64% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.36% : 0.000004s : 25: predicate.switch_defer_inline 1.87% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.60% : 0.000013s : 76: predicate.switch_simplify 0.92% : 0.000003s : 17: predicate.tile_eliminate 0.91% : 0.000003s : 17: predicate.transpose_eliminate 1.60% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.15% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.47% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.38% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.73% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.41% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.09% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.51% : 0.000001s : 5: predicate.value_based_eliminate 0.67% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.54% : 0.000001s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.020732 11 98.56% : 0.020434s : 5: func_graph_cloner_run.FuncGraphClonerGraph 1.44% : 0.000298s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.057706 192 0.01% : 0.000006s : 1: ForceFp32Comm 6.19% : 0.003573s : 1: add_attr 6.16% : 0.003554s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.12% : 0.000071s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.14% : 0.000080s : 1: auto_monad 0.07% : 0.000038s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.85% : 0.000493s : 1: bootstrap 0.07% : 0.000042s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.05% : 0.000027s : 1: control_data_broadcast_order 0.03% : 0.000015s : 1: convert_after_rewriter 0.07% : 0.000038s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000025s : 1: detach_backward 0.03% : 0.000016s : 1: environ_conv 0.06% : 0.000033s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000011s : 1: label_micro_interleaved_index 0.97% : 0.000559s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.31% : 0.000756s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.04% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000023s : 1: opt.transform.mutable_eliminate 2.61% : 0.001506s : 78: opt.transform.opt_a 0.07% : 0.000039s : 1: opt.transform.opt_after_cconv 0.09% : 0.000053s : 1: opt.transform.opt_after_jit_grad 0.26% : 0.000152s : 28: opt.transform.opt_b 0.12% : 0.000067s : 2: opt.transform.opt_trans_graph 0.09% : 0.000050s : 4: opt.transform.symbol_engine_opt 6.94% : 0.004004s : 1: opt_a 0.29% : 0.000168s : 1: opt_after_cconv 9.59% : 0.005535s : 1: opt_after_jit_grad 0.62% : 0.000356s : 1: opt_b 12.65% : 0.007297s : 1: optimize 0.05% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000013s : 1: order_py_execute_after_rewriter 0.06% : 0.000034s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000009s : 1: overlap_recompute_comm 0.02% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000008s : 1: pipeline_split 0.09% : 0.000051s : 1: pre_auto_parallel 0.07% : 0.000040s : 1: py_interpret_to_execute 0.04% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.04% : 0.000023s : 1: remove_dup_value 1.27% : 0.000733s : 1: renormalize.infer 0.68% : 0.000395s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000055s : 1: rewriter_after_opt_a 0.18% : 0.000105s : 1: rewriter_before_opt_a 0.02% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.02% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.22% : 0.000125s : 1: symbol_engine_optimizer 0.20% : 0.000116s : 1: tuple_transform 46.84% : 0.027029s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:14.986.371 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0793257, [21] [bootstrap]: 0.00044964 [type_inference]: 0.00696209 [event_method]: 2.041e-05 [auto_monad]: 6.704e-05 [graph_reusing]: 5.82999e-06 [inline]: 2.83e-06 [add_attr]: 0.00358371, [1] [add_attr_with_inline]: 0.00357103, [1] [Cycle 1]: 7.597e-05, [2] [tag_attr]: 2.392e-05 [meta_addattr_fg_expand]: 6.23002e-06 [parallel-infer-symbol]: 3.33998e-06 [pre_auto_parallel]: 4.075e-05 [insert-virtual-dataset]: 2.41e-06 [parallel-infer-symbol-second]: 8.60018e-07 [dataset_repeat_opt]: 1.83002e-06 [pipeline_split]: 1.86e-06 [optimize]: 0.0670212, [53] [py_interpret_to_execute]: 2.927e-05 [rewriter_before_opt_a]: 9.421e-05 [opt_a]: 0.00359895, [2] [Cycle 1]: 0.0026867, [45] [expand_dump_flag]: 2.89001e-06 [switch_simplify]: 4.444e-05 [loop_unroll]: 3.117e-05 [a_1]: 0.00076601 [with_stream_mark]: 1.921e-05 [recompute_prepare]: 1.096e-05 [updatestate_depend_eliminate]: 5.74e-06 [updatestate_assign_eliminate]: 4.1e-06 [updatestate_loads_eliminate]: 3.88001e-06 [parameter_eliminate]: 1.72999e-06 [a_2]: 0.00010229 [accelerated_algorithm]: 9.49e-06 [shard]: 1.82999e-06 [meta_shard_fg_expand]: 2.63998e-06 [shard_inline]: 8.17e-06 [merge_send_recv]: 9.76e-06 [auto_parallel]: 7.82e-06 [parallel]: 1.878e-05 [flash_sp]: 9.86998e-06 [merge_comm]: 4.58001e-06 [allreduce_fusion]: 4.22e-06 [matmul_add_comm_reduction]: 1.153e-05 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 1.009e-05 [virtual_dataset]: 8.82e-06 [get_grad_eliminate_]: 7.31999e-06 [virtual_output]: 8.02e-06 [merge_forward]: 4.87e-06 [cell_reuse_recompute_pass]: 1.15001e-06 [offload_activation]: 1.175e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.836e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 1.419e-05 [set_forward_comm_id_for_comm_node_pass]: 4.39998e-06 [meta_fg_expand]: 3.63e-06 [flash_sp_send_recv_attached]: 2.77002e-06 [receive_attached]: 2.36e-06 [after_resolve]: 1.317e-05 [a_after_grad]: 1.233e-05 [renormalize]: 0.00105791 [add_forward_monad_depend]: 8.1e-06 [auto_monad_grad]: 2.64001e-06 [auto_monad_eliminator]: 2.094e-05 [cse]: 4.173e-05 [a_3]: 7.178e-05 [Cycle 2]: 0.00089881, [45] [expand_dump_flag]: 2.66e-06 [switch_simplify]: 1.016e-05 [loop_unroll]: 7.91001e-06 [a_1]: 0.0001974 [with_stream_mark]: 2.248e-05 [recompute_prepare]: 8.48999e-06 [updatestate_depend_eliminate]: 4.70001e-06 [updatestate_assign_eliminate]: 3.81999e-06 [updatestate_loads_eliminate]: 3.9e-06 [parameter_eliminate]: 2.51998e-06 [a_2]: 9.71e-05 [accelerated_algorithm]: 8.66002e-06 [shard]: 2.75997e-06 [meta_shard_fg_expand]: 2.86999e-06 [shard_inline]: 8.08001e-06 [merge_send_recv]: 1.017e-05 [auto_parallel]: 1.205e-05 [parallel]: 9.33002e-06 [flash_sp]: 4e-06 [merge_comm]: 4.38001e-06 [allreduce_fusion]: 1.047e-05 [matmul_add_comm_reduction]: 1.111e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 9.88002e-06 [virtual_dataset]: 7.26999e-06 [get_grad_eliminate_]: 7.71999e-06 [virtual_output]: 7.23e-06 [merge_forward]: 5.39998e-06 [cell_reuse_recompute_pass]: 3.4e-06 [offload_activation]: 1.16e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.779e-05 [merge_recompute_call_nodes]: 1.29e-06 [before_grad]: 1.28e-05 [set_forward_comm_id_for_comm_node_pass]: 4.55001e-06 [meta_fg_expand]: 3.36001e-06 [flash_sp_send_recv_attached]: 1.79998e-06 [receive_attached]: 2.28002e-06 [after_resolve]: 1.613e-05 [a_after_grad]: 1.179e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.94e-06 [auto_monad_grad]: 1.57999e-06 [auto_monad_eliminator]: 1.163e-05 [cse]: 2.661e-05 [a_3]: 5.061e-05 [py_interpret_to_execute_after_opt_a]: 2.209e-05 [slice_cell_reuse_recomputed_activation]: 2.31998e-06 [rewriter_after_opt_a]: 5.293e-05 [convert_after_rewriter]: 8.27998e-06 [order_py_execute_after_rewriter]: 6.59999e-06 [mutable_eliminate]: 0.00083884 [opt_b]: 0.00031264, [1] [Cycle 1]: 0.00030257, [7] [b_1]: 0.00018924 [b_2]: 1.183e-05 [updatestate_depend_eliminate]: 1.199e-05 [updatestate_assign_eliminate]: 3.83999e-06 [updatestate_loads_eliminate]: 3.86001e-06 [renormalize]: 8.50006e-07 [cse]: 4.004e-05 [optimize_parallel_all_gather_comm]: 2.374e-05 [overlap_param_gather]: 2.27999e-06 [cconv]: 3.988e-05 [loop_unroll]: 0.0610719 [opt_after_cconv]: 0.00018773, [1] [Cycle 1]: 0.00017615, [7] [c_1]: 4.742e-05 [parameter_eliminate]: 7.86001e-06 [updatestate_depend_eliminate]: 1.497e-05 [updatestate_assign_eliminate]: 4.63999e-06 [updatestate_loads_eliminate]: 4.27998e-06 [cse]: 5.731e-05 [renormalize]: 8.29983e-07 [remove_dup_value]: 2.092e-05 [tuple_transform]: 0.00010443, [1] [Cycle 1]: 9.951e-05, [4] [d_1]: 6.807e-05 [none_parameter_eliminate]: 1.94e-06 [renormalize]: 1.79978e-07 [switch_simplify]: 8.62e-06 [partial_unused_args_eliminate]: 3.06001e-06 [add_recomputation]: 7.687e-05 [cse_after_recomputation]: 2.92e-05, [1] [Cycle 1]: 2.408e-05, [1] [cse]: 1.707e-05 [environ_conv]: 9.25001e-06 [swap_dp_allreduce_reducescatter]: 6.04999e-06 [bias_add_comm_swap]: 3.32002e-06 [label_micro_interleaved_index]: 8.45999e-06 [label_fine_grained_interleaved_index]: 2.71e-06 [merge_cast_opt]: 1.60001e-06 [slice_recompute_activation]: 2.44999e-06 [micro_interleaved_order_control]: 3.04999e-06 [assign_add_opt]: 1.51998e-06 [ForceFp32Comm]: 1.09e-06 [remove_cast_before_assign_add]: 1.24e-06 [full_micro_interleaved_order_control]: 2.73003e-06 [reorder_send_recv_between_fp_bp]: 3.01999e-06 [comm_op_add_attrs]: 1.40001e-06 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.27999e-06 [interleave_parallel_branches]: 1.35999e-06 [overlap_opt_shard_in_pipeline]: 1.37e-06 [overlap_opt_shard_grad_in_pipeline]: 1.83002e-06 [control_data_broadcast_order]: 1.852e-05 [grouped_pairwise_exchange_alltoall]: 1.74e-06 [offloading_packed_experts]: 4.67e-06 [overlap_recompute_and_grad_model_parallel]: 5.52999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.45001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.26e-06 [overlap_grad_ring_attention]: 4.94e-06 [overlap_grad_flash_sp]: 2.671e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 2.21998e-06 [split_layernorm_comm]: 2.09999e-06 [handle_group_info]: 1.07e-06 [symbol_engine_optimizer]: 9.785e-05, [1] [Cycle 1]: 9.184e-05, [6] [build]: 5.00001e-06 [elim_shapecalc]: 1.504e-05 [elim_not_effective]: 1.727e-05 [opt_reshape]: 9.09998e-06 [fold_const_symbol]: 1.333e-05 [renormalize]: 1.59984e-07 [detach_backward]: 2.33998e-06 [pipeline_parallel_scheduler]: 1.59998e-06 [auto_monad_reorder]: 2.348e-05 [get_jit_bprop_graph]: 2.93e-06 [rewriter_after_jit_bprop_graph]: 7.41001e-06 [opt_after_jit_grad]: 0.00088171 [validate]: 6.15e-05 Sums bootstrap : 0.000450s : 0.60% type_inference : 0.006962s : 9.33% event_method : 0.000020s : 0.03% auto_monad : 0.000067s : 0.09% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000041s : 0.05% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000029s : 0.04% optimize.rewriter_before_opt_a : 0.000094s : 0.13% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000055s : 0.07% optimize.opt_a.loop_unroll : 0.000039s : 0.05% optimize.opt_a.a_1 : 0.000963s : 1.29% optimize.opt_a.with_stream_mark : 0.000042s : 0.06% optimize.opt_a.recompute_prepare : 0.000019s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000199s : 0.27% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.02% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.02% optimize.opt_a.merge_send_recv : 0.000020s : 0.03% optimize.opt_a.auto_parallel : 0.000020s : 0.03% optimize.opt_a.parallel : 0.000028s : 0.04% optimize.opt_a.flash_sp : 0.000014s : 0.02% optimize.opt_a.merge_comm : 0.000009s : 0.01% optimize.opt_a.allreduce_fusion : 0.000015s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.03% optimize.opt_a.virtual_dataset : 0.000016s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.02% optimize.opt_a.virtual_output : 0.000015s : 0.02% optimize.opt_a.merge_forward : 0.000010s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.05% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000027s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000029s : 0.04% optimize.opt_a.a_after_grad : 0.000024s : 0.03% optimize.opt_a.renormalize : 0.001058s : 1.42% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.04% optimize.opt_a.cse : 0.000068s : 0.09% optimize.opt_a.a_3 : 0.000122s : 0.16% optimize.py_interpret_to_execute_after_opt_a : 0.000022s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000053s : 0.07% optimize.convert_after_rewriter : 0.000008s : 0.01% optimize.order_py_execute_after_rewriter : 0.000007s : 0.01% optimize.mutable_eliminate : 0.000839s : 1.12% optimize.opt_b.b_1 : 0.000189s : 0.25% optimize.opt_b.b_2 : 0.000012s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000040s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.03% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000040s : 0.05% optimize.loop_unroll : 0.061072s : 81.85% optimize.opt_after_cconv.c_1 : 0.000047s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000015s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000057s : 0.08% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000021s : 0.03% optimize.tuple_transform.d_1 : 0.000068s : 0.09% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.01% optimize.partial_unused_args_eliminate : 0.000003s : 0.00% optimize.add_recomputation : 0.000077s : 0.10% optimize.cse_after_recomputation.cse : 0.000017s : 0.02% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000019s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000027s : 0.04% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000023s : 0.03% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000882s : 1.18% validate : 0.000062s : 0.08% Time group info: ------[substitution.] 0.000247 39 12.31% : 0.000030s : 3: substitution.cast_eliminate 0.98% : 0.000002s : 3: substitution.elim_not_effective 0.66% : 0.000002s : 3: substitution.fold_const_symbol 3.29% : 0.000008s : 5: substitution.graph_param_transform 65.64% : 0.000162s : 4: substitution.inline 2.13% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.69% : 0.000007s : 6: substitution.remove_not_recompute_node 2.70% : 0.000007s : 4: substitution.replace_old_param 6.22% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator 3.36% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006884 2 88.55% : 0.006095s : 1: type_inference.infer 11.45% : 0.000789s : 1: type_inference.specialize ------[replace.] 0.000065 8 61.91% : 0.000040s : 4: replace.inline 38.09% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000173 8 92.25% : 0.000160s : 4: match.inline 7.75% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000291 1596 0.86% : 0.000003s : 17: predicate.accumulaten_eliminater 1.29% : 0.000004s : 5: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 10: predicate.addn_check_dump 0.90% : 0.000003s : 17: predicate.addn_zero_filter 0.85% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.21% : 0.000006s : 27: predicate.arithmetic_simplify 1.06% : 0.000003s : 17: predicate.cast_eliminate 0.59% : 0.000002s : 10: predicate.check_bprop_eliminate 0.54% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.65% : 0.000002s : 10: predicate.depend_value_elim 0.90% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.06% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.35% : 0.000004s : 10: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 5: predicate.elim_not_effective 0.56% : 0.000002s : 5: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000004s : 22: predicate.environ_add_const_eliminate 1.02% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.03% : 0.000003s : 22: predicate.environ_get_depend_swap 1.58% : 0.000005s : 32: predicate.environ_get_eliminate 1.04% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.26% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.05% : 0.000006s : 25: predicate.float_depend_g_call 0.45% : 0.000001s : 10: predicate.float_environ_get_switch 0.72% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 5: predicate.fold_const_symbol 0.59% : 0.000002s : 10: predicate.get_grad_eliminate 0.23% : 0.000001s : 5: predicate.graph_param_transform 0.55% : 0.000002s : 10: predicate.incorporate_call 0.44% : 0.000001s : 10: predicate.incorporate_call_switch 6.47% : 0.000019s : 72: predicate.inline 0.68% : 0.000002s : 10: predicate.inline_without_move 0.28% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.86% : 0.000003s : 10: predicate.less_batch_normalization 1.76% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.47% : 0.000007s : 48: predicate.load_eliminater 2.74% : 0.000008s : 5: predicate.loop_unroll_after_grad 1.90% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.90% : 0.000006s : 27: predicate.make_slice_get_slice_eliminator 0.56% : 0.000002s : 10: predicate.merge_addn 0.65% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 17: predicate.minmaximum_grad 1.44% : 0.000004s : 5: predicate.mutable_eliminate 0.44% : 0.000001s : 5: predicate.opt_reshape 0.46% : 0.000001s : 5: predicate.parallel_virtual_node 1.54% : 0.000004s : 25: predicate.partial_defer_inline 1.46% : 0.000004s : 26: predicate.partial_eliminate 0.97% : 0.000003s : 17: predicate.print_const_string_wrapper 0.55% : 0.000002s : 10: predicate.reduce_all_const_elim 1.17% : 0.000003s : 17: predicate.reduce_eliminate 2.44% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 10: predicate.remove_not_recompute_node 1.27% : 0.000004s : 31: predicate.replace_applicator 0.51% : 0.000001s : 10: predicate.replace_old_param 0.23% : 0.000001s : 5: predicate.reset_defer_inline 1.09% : 0.000003s : 17: predicate.reshape_eliminate 0.64% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 5: predicate.row_tensor_eliminate 0.78% : 0.000002s : 10: predicate.same_eliminate 0.35% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.68% : 0.000002s : 10: predicate.shard_identity_eliminate 0.88% : 0.000003s : 10: predicate.special_op_eliminate 0.62% : 0.000002s : 10: predicate.specialize_transform 1.08% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.41% : 0.000004s : 25: predicate.switch_defer_inline 1.82% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.17% : 0.000012s : 76: predicate.switch_simplify 0.90% : 0.000003s : 17: predicate.tile_eliminate 0.88% : 0.000003s : 17: predicate.transpose_eliminate 1.53% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.74% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.39% : 0.000010s : 41: predicate.tuple_list_get_item_eliminator 1.65% : 0.000005s : 27: predicate.tuple_list_get_set_item_eliminator 2.37% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.82% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.43% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 2.87% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 5: predicate.value_based_eliminate 0.63% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.60% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000645 11 51.87% : 0.000334s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.13% : 0.000310s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.152781 192 0.00% : 0.000004s : 1: ForceFp32Comm 2.35% : 0.003591s : 1: add_attr 2.34% : 0.003576s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000082s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.05% : 0.000075s : 1: auto_monad 0.02% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.32% : 0.000481s : 1: bootstrap 0.03% : 0.000044s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000022s : 1: control_data_broadcast_order 0.01% : 0.000012s : 1: convert_after_rewriter 0.02% : 0.000032s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000013s : 1: environ_conv 0.02% : 0.000027s : 1: event_method 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000005s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 39.99% : 0.061098s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.56% : 0.000855s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000047s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000029s : 1: opt.transform.mutable_eliminate 0.99% : 0.001507s : 78: opt.transform.opt_a 0.03% : 0.000045s : 1: opt.transform.opt_after_cconv 0.03% : 0.000048s : 1: opt.transform.opt_after_jit_grad 0.11% : 0.000162s : 28: opt.transform.opt_b 0.05% : 0.000074s : 2: opt.transform.opt_trans_graph 0.03% : 0.000050s : 4: opt.transform.symbol_engine_opt 2.36% : 0.003603s : 1: opt_a 0.13% : 0.000193s : 1: opt_after_cconv 0.59% : 0.000900s : 1: opt_after_jit_grad 0.21% : 0.000316s : 1: opt_b 43.87% : 0.067027s : 1: optimize 0.02% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000010s : 1: order_py_execute_after_rewriter 0.02% : 0.000030s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000046s : 1: pre_auto_parallel 0.02% : 0.000033s : 1: py_interpret_to_execute 0.02% : 0.000026s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.02% : 0.000025s : 1: remove_dup_value 0.39% : 0.000599s : 1: renormalize.infer 0.29% : 0.000448s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000059s : 1: rewriter_after_opt_a 0.06% : 0.000098s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000101s : 1: symbol_engine_optimizer 0.07% : 0.000108s : 1: tuple_transform 4.57% : 0.006984s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:15.750.715 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:15.751.037 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0706914, [21] [bootstrap]: 0.00047427 [type_inference]: 0.0303932 [event_method]: 2.094e-05 [auto_monad]: 6.837e-05 [graph_reusing]: 5.99e-06 [inline]: 2.51e-06 [add_attr]: 0.00396194, [1] [add_attr_with_inline]: 0.00394777, [1] [Cycle 1]: 0.00010724, [2] [tag_attr]: 2.791e-05 [meta_addattr_fg_expand]: 6.30997e-06 [parallel-infer-symbol]: 3.87998e-06 [pre_auto_parallel]: 4.774e-05 [insert-virtual-dataset]: 2.73998e-06 [parallel-infer-symbol-second]: 8.09989e-07 [dataset_repeat_opt]: 2.17001e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.034297, [53] [py_interpret_to_execute]: 3.95e-05 [rewriter_before_opt_a]: 0.00010856 [opt_a]: 0.00417564, [2] [Cycle 1]: 0.00307708, [45] [expand_dump_flag]: 3.41001e-06 [switch_simplify]: 4.659e-05 [loop_unroll]: 3.23e-05 [a_1]: 0.00082665 [with_stream_mark]: 2.619e-05 [recompute_prepare]: 1.327e-05 [updatestate_depend_eliminate]: 5.64e-06 [updatestate_assign_eliminate]: 4.26001e-06 [updatestate_loads_eliminate]: 3.6e-06 [parameter_eliminate]: 2.63e-06 [a_2]: 0.0001378 [accelerated_algorithm]: 1.017e-05 [shard]: 2.59999e-06 [meta_shard_fg_expand]: 2.80002e-06 [shard_inline]: 8.23999e-06 [merge_send_recv]: 1.096e-05 [auto_parallel]: 1.186e-05 [parallel]: 2.218e-05 [flash_sp]: 1.249e-05 [merge_comm]: 5.64e-06 [allreduce_fusion]: 4.49998e-06 [matmul_add_comm_reduction]: 1.175e-05 [allreduce_slice_to_reducescatter]: 1.25001e-06 [virtual_shard_identity]: 1.167e-05 [virtual_dataset]: 1.196e-05 [get_grad_eliminate_]: 9.79e-06 [virtual_output]: 8.62e-06 [merge_forward]: 6.06e-06 [cell_reuse_recompute_pass]: 2.01998e-06 [offload_activation]: 1.33e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.609e-05 [merge_recompute_call_nodes]: 1.59e-06 [before_grad]: 1.568e-05 [set_forward_comm_id_for_comm_node_pass]: 5.18002e-06 [meta_fg_expand]: 4.05e-06 [flash_sp_send_recv_attached]: 3.38e-06 [receive_attached]: 3.11001e-06 [after_resolve]: 1.528e-05 [a_after_grad]: 1.255e-05 [renormalize]: 0.00108668 [add_forward_monad_depend]: 1.022e-05 [auto_monad_grad]: 2.69001e-06 [auto_monad_eliminator]: 2.454e-05 [cse]: 4.028e-05 [a_3]: 8.633e-05 [Cycle 2]: 0.00107789, [45] [expand_dump_flag]: 3.04999e-06 [switch_simplify]: 1.146e-05 [loop_unroll]: 8.02998e-06 [a_1]: 0.00019941 [with_stream_mark]: 2.202e-05 [recompute_prepare]: 9.14e-06 [updatestate_depend_eliminate]: 4.68001e-06 [updatestate_assign_eliminate]: 3.75998e-06 [updatestate_loads_eliminate]: 3.41999e-06 [parameter_eliminate]: 1.66e-06 [a_2]: 0.00012363 [accelerated_algorithm]: 8.03999e-06 [shard]: 2.24001e-06 [meta_shard_fg_expand]: 1.94e-06 [shard_inline]: 7.87e-06 [merge_send_recv]: 9.02999e-06 [auto_parallel]: 1.111e-05 [parallel]: 1.071e-05 [flash_sp]: 4.50999e-06 [merge_comm]: 5.23002e-06 [allreduce_fusion]: 5.22e-06 [matmul_add_comm_reduction]: 1.099e-05 [allreduce_slice_to_reducescatter]: 9.30013e-07 [virtual_shard_identity]: 9.67999e-06 [virtual_dataset]: 7.83001e-06 [get_grad_eliminate_]: 7.5e-06 [virtual_output]: 7.59002e-06 [merge_forward]: 4.91002e-06 [cell_reuse_recompute_pass]: 3.24001e-06 [offload_activation]: 1.222e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.047e-05 [merge_recompute_call_nodes]: 1.36002e-06 [before_grad]: 1.418e-05 [set_forward_comm_id_for_comm_node_pass]: 5.63002e-06 [meta_fg_expand]: 3.47002e-06 [flash_sp_send_recv_attached]: 2.37999e-06 [receive_attached]: 2.53998e-06 [after_resolve]: 1.431e-05 [a_after_grad]: 1.222e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.11e-06 [auto_monad_grad]: 1.72001e-06 [auto_monad_eliminator]: 1.643e-05 [cse]: 2.652e-05 [a_3]: 6.162e-05 [py_interpret_to_execute_after_opt_a]: 2.719e-05 [slice_cell_reuse_recomputed_activation]: 5.44e-06 [rewriter_after_opt_a]: 5.888e-05 [convert_after_rewriter]: 1.219e-05 [order_py_execute_after_rewriter]: 1.006e-05 [mutable_eliminate]: 0.00085961 [opt_b]: 0.0271723, [1] [Cycle 1]: 0.027152, [7] [b_1]: 0.00023486 [b_2]: 0.0266967 [updatestate_depend_eliminate]: 2.03e-05 [updatestate_assign_eliminate]: 4.18999e-06 [updatestate_loads_eliminate]: 3.98001e-06 [renormalize]: 1.34998e-06 [cse]: 4.989e-05 [optimize_parallel_all_gather_comm]: 3.443e-05 [overlap_param_gather]: 4.55001e-06 [cconv]: 4.523e-05 [loop_unroll]: 0.00076119 [opt_after_cconv]: 0.00016228, [1] [Cycle 1]: 0.00015063, [7] [c_1]: 4.399e-05 [parameter_eliminate]: 7.01001e-06 [updatestate_depend_eliminate]: 7.77e-06 [updatestate_assign_eliminate]: 3.28998e-06 [updatestate_loads_eliminate]: 3.35998e-06 [cse]: 2.776e-05 [renormalize]: 2.60014e-07 [remove_dup_value]: 1.918e-05 [tuple_transform]: 0.00013254, [1] [Cycle 1]: 0.00012485, [4] [d_1]: 7.915e-05 [none_parameter_eliminate]: 2.66999e-06 [renormalize]: 6.59988e-07 [switch_simplify]: 9.45001e-06 [partial_unused_args_eliminate]: 4.81002e-06 [add_recomputation]: 6.902e-05 [cse_after_recomputation]: 3.568e-05, [1] [Cycle 1]: 2.753e-05, [1] [cse]: 1.708e-05 [environ_conv]: 1.075e-05 [swap_dp_allreduce_reducescatter]: 9.42001e-06 [bias_add_comm_swap]: 6.74999e-06 [label_micro_interleaved_index]: 8.27998e-06 [label_fine_grained_interleaved_index]: 5.09e-06 [merge_cast_opt]: 4.28999e-06 [slice_recompute_activation]: 5.05001e-06 [micro_interleaved_order_control]: 5.55001e-06 [assign_add_opt]: 4.06001e-06 [ForceFp32Comm]: 3.7e-06 [remove_cast_before_assign_add]: 3.41999e-06 [full_micro_interleaved_order_control]: 4.74e-06 [reorder_send_recv_between_fp_bp]: 5.64e-06 [comm_op_add_attrs]: 3.59002e-06 [add_comm_op_reuse_tag]: 4.01001e-06 [interleave_split_concat_branches]: 3.58e-06 [interleave_parallel_branches]: 3.63999e-06 [overlap_opt_shard_in_pipeline]: 4.03999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.62e-06 [control_data_broadcast_order]: 1.885e-05 [grouped_pairwise_exchange_alltoall]: 3.94002e-06 [offloading_packed_experts]: 7.77998e-06 [overlap_recompute_and_grad_model_parallel]: 8.94003e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.70998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.85998e-06 [overlap_recompute_comm]: 4.95999e-06 [overlap_grad_ring_attention]: 7.51001e-06 [overlap_grad_flash_sp]: 2.944e-05 [begin_end_overlap_inline]: 2.91e-06 [split_matmul_comm_elemetwise]: 5.08002e-06 [split_layernorm_comm]: 4.26001e-06 [handle_group_info]: 3.55998e-06 [symbol_engine_optimizer]: 0.00011937, [1] [Cycle 1]: 0.00011243, [6] [build]: 5.20001e-06 [elim_shapecalc]: 1.456e-05 [elim_not_effective]: 1.746e-05 [opt_reshape]: 9.10999e-06 [fold_const_symbol]: 1.36e-05 [renormalize]: 2.09984e-07 [detach_backward]: 3.84002e-06 [pipeline_parallel_scheduler]: 1.96e-06 [auto_monad_reorder]: 2.608e-05 [get_jit_bprop_graph]: 1.97999e-06 [rewriter_after_jit_bprop_graph]: 6.12001e-06 [opt_after_jit_grad]: 0.00064917 [validate]: 5.076e-05 Sums bootstrap : 0.000474s : 0.73% type_inference : 0.030393s : 47.01% event_method : 0.000021s : 0.03% auto_monad : 0.000068s : 0.11% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000048s : 0.07% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000040s : 0.06% optimize.rewriter_before_opt_a : 0.000109s : 0.17% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000058s : 0.09% optimize.opt_a.loop_unroll : 0.000040s : 0.06% optimize.opt_a.a_1 : 0.001026s : 1.59% optimize.opt_a.with_stream_mark : 0.000048s : 0.07% optimize.opt_a.recompute_prepare : 0.000022s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000261s : 0.40% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.03% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.02% optimize.opt_a.merge_send_recv : 0.000020s : 0.03% optimize.opt_a.auto_parallel : 0.000023s : 0.04% optimize.opt_a.parallel : 0.000033s : 0.05% optimize.opt_a.flash_sp : 0.000017s : 0.03% optimize.opt_a.merge_comm : 0.000011s : 0.02% optimize.opt_a.allreduce_fusion : 0.000010s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.03% optimize.opt_a.virtual_dataset : 0.000020s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.03% optimize.opt_a.virtual_output : 0.000016s : 0.03% optimize.opt_a.merge_forward : 0.000011s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000026s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000047s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000030s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.02% optimize.opt_a.meta_fg_expand : 0.000008s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.01% optimize.opt_a.receive_attached : 0.000006s : 0.01% optimize.opt_a.after_resolve : 0.000030s : 0.05% optimize.opt_a.a_after_grad : 0.000025s : 0.04% optimize.opt_a.renormalize : 0.001087s : 1.68% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000041s : 0.06% optimize.opt_a.cse : 0.000067s : 0.10% optimize.opt_a.a_3 : 0.000148s : 0.23% optimize.py_interpret_to_execute_after_opt_a : 0.000027s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000059s : 0.09% optimize.convert_after_rewriter : 0.000012s : 0.02% optimize.order_py_execute_after_rewriter : 0.000010s : 0.02% optimize.mutable_eliminate : 0.000860s : 1.33% optimize.opt_b.b_1 : 0.000235s : 0.36% optimize.opt_b.b_2 : 0.026697s : 41.29% optimize.opt_b.updatestate_depend_eliminate : 0.000020s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000050s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000034s : 0.05% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000045s : 0.07% optimize.loop_unroll : 0.000761s : 1.18% optimize.opt_after_cconv.c_1 : 0.000044s : 0.07% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000028s : 0.04% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.03% optimize.tuple_transform.d_1 : 0.000079s : 0.12% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.00% optimize.tuple_transform.renormalize : 0.000001s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000069s : 0.11% optimize.cse_after_recomputation.cse : 0.000017s : 0.03% optimize.environ_conv : 0.000011s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.01% optimize.bias_add_comm_swap : 0.000007s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000006s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000019s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000008s : 0.01% optimize.overlap_grad_flash_sp : 0.000029s : 0.05% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000026s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000649s : 1.00% validate : 0.000051s : 0.08% Time group info: ------[substitution.] 0.000288 39 11.56% : 0.000033s : 3: substitution.cast_eliminate 0.95% : 0.000003s : 3: substitution.elim_not_effective 0.76% : 0.000002s : 3: substitution.fold_const_symbol 2.68% : 0.000008s : 5: substitution.graph_param_transform 67.52% : 0.000195s : 4: substitution.inline 2.14% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.56% : 0.000007s : 6: substitution.remove_not_recompute_node 2.42% : 0.000007s : 4: substitution.replace_old_param 6.20% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator 3.23% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.030333 2 97.26% : 0.029501s : 1: type_inference.infer 2.74% : 0.000832s : 1: type_inference.specialize ------[replace.] 0.000073 8 60.92% : 0.000045s : 4: replace.inline 39.08% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000208 8 92.39% : 0.000192s : 4: match.inline 7.61% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000294 1596 0.95% : 0.000003s : 17: predicate.accumulaten_eliminater 0.68% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000002s : 10: predicate.addn_check_dump 0.92% : 0.000003s : 17: predicate.addn_zero_filter 0.74% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.06% : 0.000006s : 27: predicate.arithmetic_simplify 1.01% : 0.000003s : 17: predicate.cast_eliminate 0.56% : 0.000002s : 10: predicate.check_bprop_eliminate 0.54% : 0.000002s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.73% : 0.000002s : 10: predicate.depend_value_elim 0.92% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.98% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.99% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 5: predicate.elim_not_effective 0.43% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.18% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.17% : 0.000003s : 22: predicate.environ_get_depend_swap 1.58% : 0.000005s : 32: predicate.environ_get_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.27% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.33% : 0.000007s : 25: predicate.float_depend_g_call 0.55% : 0.000002s : 10: predicate.float_environ_get_switch 0.82% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.66% : 0.000002s : 10: predicate.get_grad_eliminate 0.24% : 0.000001s : 5: predicate.graph_param_transform 0.60% : 0.000002s : 10: predicate.incorporate_call 0.46% : 0.000001s : 10: predicate.incorporate_call_switch 5.87% : 0.000017s : 72: predicate.inline 0.72% : 0.000002s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.80% : 0.000002s : 10: predicate.less_batch_normalization 1.78% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.48% : 0.000007s : 48: predicate.load_eliminater 0.72% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.82% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.78% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.53% : 0.000002s : 10: predicate.merge_addn 0.59% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 17: predicate.minmaximum_grad 1.66% : 0.000005s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.46% : 0.000001s : 5: predicate.parallel_virtual_node 1.70% : 0.000005s : 25: predicate.partial_defer_inline 1.45% : 0.000004s : 26: predicate.partial_eliminate 0.90% : 0.000003s : 17: predicate.print_const_string_wrapper 0.65% : 0.000002s : 10: predicate.reduce_all_const_elim 1.18% : 0.000003s : 17: predicate.reduce_eliminate 2.39% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 10: predicate.remove_not_recompute_node 1.22% : 0.000004s : 31: predicate.replace_applicator 0.46% : 0.000001s : 10: predicate.replace_old_param 0.31% : 0.000001s : 5: predicate.reset_defer_inline 1.10% : 0.000003s : 17: predicate.reshape_eliminate 0.70% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 2.24% : 0.000007s : 5: predicate.row_tensor_eliminate 0.75% : 0.000002s : 10: predicate.same_eliminate 0.42% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.85% : 0.000002s : 10: predicate.shard_identity_eliminate 0.68% : 0.000002s : 10: predicate.special_op_eliminate 0.94% : 0.000003s : 10: predicate.specialize_transform 1.22% : 0.000004s : 10: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.44% : 0.000004s : 25: predicate.switch_defer_inline 1.88% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.48% : 0.000013s : 76: predicate.switch_simplify 0.84% : 0.000002s : 17: predicate.tile_eliminate 0.90% : 0.000003s : 17: predicate.transpose_eliminate 1.66% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.61% : 0.000005s : 27: predicate.tuple_list_get_item_depend_reorder 3.46% : 0.000010s : 41: predicate.tuple_list_get_item_eliminator 1.50% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 2.09% : 0.000006s : 31: predicate.tuple_to_list_eliminator_ 2.32% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 2.90% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 5: predicate.value_based_eliminate 0.68% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.74% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000673 11 52.52% : 0.000353s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.48% : 0.000320s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.138577 192 0.00% : 0.000006s : 1: ForceFp32Comm 2.87% : 0.003974s : 1: add_attr 2.85% : 0.003952s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.05% : 0.000073s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.06% : 0.000078s : 1: auto_monad 0.02% : 0.000034s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.38% : 0.000520s : 1: bootstrap 0.04% : 0.000049s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.02% : 0.000022s : 1: control_data_broadcast_order 0.01% : 0.000015s : 1: convert_after_rewriter 0.03% : 0.000039s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000022s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.02% : 0.000032s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000010s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 0.55% : 0.000768s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.63% : 0.000870s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.02% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000028s : 1: opt.transform.mutable_eliminate 1.15% : 0.001599s : 78: opt.transform.opt_a 0.03% : 0.000042s : 1: opt.transform.opt_after_cconv 0.02% : 0.000033s : 1: opt.transform.opt_after_jit_grad 19.37% : 0.026848s : 28: opt.transform.opt_b 0.06% : 0.000086s : 2: opt.transform.opt_trans_graph 0.04% : 0.000049s : 4: opt.transform.symbol_engine_opt 3.02% : 0.004180s : 1: opt_a 0.12% : 0.000166s : 1: opt_after_cconv 0.48% : 0.000661s : 1: opt_after_jit_grad 19.61% : 0.027178s : 1: opt_b 25.04% : 0.034697s : 1: optimize 0.03% : 0.000038s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000013s : 1: order_py_execute_after_rewriter 0.02% : 0.000033s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.04% : 0.000056s : 1: pre_auto_parallel 0.03% : 0.000044s : 1: py_interpret_to_execute 0.02% : 0.000031s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.02% : 0.000022s : 1: remove_dup_value 0.43% : 0.000599s : 1: renormalize.infer 0.34% : 0.000473s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000064s : 1: rewriter_after_opt_a 0.08% : 0.000112s : 1: rewriter_before_opt_a 0.01% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000122s : 1: symbol_engine_optimizer 0.10% : 0.000136s : 1: tuple_transform 21.96% : 0.030438s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:16.574.585 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0459873, [21] [bootstrap]: 0.00051557 [type_inference]: 0.00961973 [event_method]: 2.274e-05 [auto_monad]: 7.524e-05 [graph_reusing]: 7.08998e-06 [inline]: 2.88e-06 [add_attr]: 0.00407047, [1] [add_attr_with_inline]: 0.00405683, [1] [Cycle 1]: 7.857e-05, [2] [tag_attr]: 2.472e-05 [meta_addattr_fg_expand]: 6.08998e-06 [parallel-infer-symbol]: 4.20999e-06 [pre_auto_parallel]: 4.624e-05 [insert-virtual-dataset]: 2.46998e-06 [parallel-infer-symbol-second]: 7.00005e-07 [dataset_repeat_opt]: 2.26998e-06 [pipeline_split]: 1.74e-06 [optimize]: 0.0305636, [53] [py_interpret_to_execute]: 3.199e-05 [rewriter_before_opt_a]: 9.808e-05 [opt_a]: 0.0276675, [2] [Cycle 1]: 0.0267597, [45] [expand_dump_flag]: 3.33e-06 [switch_simplify]: 4.494e-05 [loop_unroll]: 3.075e-05 [a_1]: 0.00079463 [with_stream_mark]: 2.397e-05 [recompute_prepare]: 1.204e-05 [updatestate_depend_eliminate]: 5.58002e-06 [updatestate_assign_eliminate]: 3.88999e-06 [updatestate_loads_eliminate]: 3.66001e-06 [parameter_eliminate]: 1.87001e-06 [a_2]: 0.00010171 [accelerated_algorithm]: 8.52e-06 [shard]: 1.64e-06 [meta_shard_fg_expand]: 2.20002e-06 [shard_inline]: 7.95e-06 [merge_send_recv]: 1.015e-05 [auto_parallel]: 1.005e-05 [parallel]: 2.041e-05 [flash_sp]: 1.103e-05 [merge_comm]: 5.06002e-06 [allreduce_fusion]: 4.55001e-06 [matmul_add_comm_reduction]: 1.142e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 1.008e-05 [virtual_dataset]: 8.18999e-06 [get_grad_eliminate_]: 7.73001e-06 [virtual_output]: 7.71001e-06 [merge_forward]: 5.04e-06 [cell_reuse_recompute_pass]: 1.65001e-06 [offload_activation]: 1.171e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.67e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.306e-05 [set_forward_comm_id_for_comm_node_pass]: 4.72998e-06 [meta_fg_expand]: 3.6e-06 [flash_sp_send_recv_attached]: 2.66e-06 [receive_attached]: 2.88998e-06 [after_resolve]: 1.385e-05 [a_after_grad]: 1.2e-05 [renormalize]: 0.0250683 [add_forward_monad_depend]: 1.022e-05 [auto_monad_grad]: 3.26999e-06 [auto_monad_eliminator]: 2.314e-05 [cse]: 4.374e-05 [a_3]: 7.73e-05 [Cycle 2]: 0.00089465, [45] [expand_dump_flag]: 2.60002e-06 [switch_simplify]: 1.099e-05 [loop_unroll]: 8.2e-06 [a_1]: 0.00020205 [with_stream_mark]: 2.406e-05 [recompute_prepare]: 9.46e-06 [updatestate_depend_eliminate]: 4.80999e-06 [updatestate_assign_eliminate]: 3.65003e-06 [updatestate_loads_eliminate]: 3.81999e-06 [parameter_eliminate]: 2.35002e-06 [a_2]: 9.502e-05 [accelerated_algorithm]: 7.77002e-06 [shard]: 3.23e-06 [meta_shard_fg_expand]: 2.74999e-06 [shard_inline]: 7.3e-06 [merge_send_recv]: 1.059e-05 [auto_parallel]: 1.068e-05 [parallel]: 9.94999e-06 [flash_sp]: 4.25e-06 [merge_comm]: 9.84999e-06 [allreduce_fusion]: 4.21001e-06 [matmul_add_comm_reduction]: 1.141e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 9.47999e-06 [virtual_dataset]: 8.67e-06 [get_grad_eliminate_]: 7.15003e-06 [virtual_output]: 7.1e-06 [merge_forward]: 5.25999e-06 [cell_reuse_recompute_pass]: 3.27002e-06 [offload_activation]: 1.175e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.508e-05 [merge_recompute_call_nodes]: 2.21e-06 [before_grad]: 1.406e-05 [set_forward_comm_id_for_comm_node_pass]: 8.18999e-06 [meta_fg_expand]: 3.88001e-06 [flash_sp_send_recv_attached]: 1.87001e-06 [receive_attached]: 2.46998e-06 [after_resolve]: 1.641e-05 [a_after_grad]: 1.29e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.99e-06 [auto_monad_grad]: 1.47999e-06 [auto_monad_eliminator]: 1.065e-05 [cse]: 2.37e-05 [a_3]: 4.895e-05 [py_interpret_to_execute_after_opt_a]: 1.81e-05 [slice_cell_reuse_recomputed_activation]: 2.50997e-06 [rewriter_after_opt_a]: 4.759e-05 [convert_after_rewriter]: 7.97e-06 [order_py_execute_after_rewriter]: 6.21e-06 [mutable_eliminate]: 0.00082671 [opt_b]: 0.00030221, [1] [Cycle 1]: 0.00029237, [7] [b_1]: 0.00018465 [b_2]: 1.078e-05 [updatestate_depend_eliminate]: 1.172e-05 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 3.44001e-06 [renormalize]: 9.70002e-07 [cse]: 3.789e-05 [optimize_parallel_all_gather_comm]: 2.381e-05 [overlap_param_gather]: 2.02001e-06 [cconv]: 3.541e-05 [loop_unroll]: 0.00065801 [opt_after_cconv]: 0.00014518, [1] [Cycle 1]: 0.00013627, [7] [c_1]: 4.145e-05 [parameter_eliminate]: 6.57002e-06 [updatestate_depend_eliminate]: 8.67e-06 [updatestate_assign_eliminate]: 3.23998e-06 [updatestate_loads_eliminate]: 3.03e-06 [cse]: 3.516e-05 [renormalize]: 9.70002e-07 [remove_dup_value]: 1.909e-05 [tuple_transform]: 0.00010043, [1] [Cycle 1]: 9.5e-05, [4] [d_1]: 6.312e-05 [none_parameter_eliminate]: 2.39999e-06 [renormalize]: 3.89991e-07 [switch_simplify]: 8.76002e-06 [partial_unused_args_eliminate]: 2.06e-06 [add_recomputation]: 6.782e-05 [cse_after_recomputation]: 3.104e-05, [1] [Cycle 1]: 2.527e-05, [1] [cse]: 1.861e-05 [environ_conv]: 8.88002e-06 [swap_dp_allreduce_reducescatter]: 6.94999e-06 [bias_add_comm_swap]: 3.51999e-06 [label_micro_interleaved_index]: 6.76e-06 [label_fine_grained_interleaved_index]: 2.83998e-06 [merge_cast_opt]: 1.74998e-06 [slice_recompute_activation]: 2.76999e-06 [micro_interleaved_order_control]: 3.08e-06 [assign_add_opt]: 1.27999e-06 [ForceFp32Comm]: 8.29983e-07 [remove_cast_before_assign_add]: 1.58002e-06 [full_micro_interleaved_order_control]: 2.12001e-06 [reorder_send_recv_between_fp_bp]: 2.71e-06 [comm_op_add_attrs]: 1.04998e-06 [add_comm_op_reuse_tag]: 1.07e-06 [interleave_split_concat_branches]: 1.14003e-06 [interleave_parallel_branches]: 1.27999e-06 [overlap_opt_shard_in_pipeline]: 1.77001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.96e-06 [control_data_broadcast_order]: 1.744e-05 [grouped_pairwise_exchange_alltoall]: 1.76e-06 [offloading_packed_experts]: 5.15999e-06 [overlap_recompute_and_grad_model_parallel]: 5.64e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.49999e-06 [overlap_grad_ring_attention]: 4.74e-06 [overlap_grad_flash_sp]: 2.775e-05 [begin_end_overlap_inline]: 6.09987e-07 [split_matmul_comm_elemetwise]: 2.44001e-06 [split_layernorm_comm]: 1.54e-06 [handle_group_info]: 1.37e-06 [symbol_engine_optimizer]: 9.362e-05, [1] [Cycle 1]: 8.889e-05, [6] [build]: 4.83001e-06 [elim_shapecalc]: 1.394e-05 [elim_not_effective]: 1.76e-05 [opt_reshape]: 9.14e-06 [fold_const_symbol]: 1.273e-05 [renormalize]: 3.50003e-07 [detach_backward]: 2.12999e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 2.105e-05 [get_jit_bprop_graph]: 2.34999e-06 [rewriter_after_jit_bprop_graph]: 6.46999e-06 [opt_after_jit_grad]: 0.00077301 [validate]: 5.83e-05 Sums bootstrap : 0.000516s : 1.26% type_inference : 0.009620s : 23.57% event_method : 0.000023s : 0.06% auto_monad : 0.000075s : 0.18% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000046s : 0.11% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000032s : 0.08% optimize.rewriter_before_opt_a : 0.000098s : 0.24% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000056s : 0.14% optimize.opt_a.loop_unroll : 0.000039s : 0.10% optimize.opt_a.a_1 : 0.000997s : 2.44% optimize.opt_a.with_stream_mark : 0.000048s : 0.12% optimize.opt_a.recompute_prepare : 0.000021s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000197s : 0.48% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.04% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000015s : 0.04% optimize.opt_a.merge_send_recv : 0.000021s : 0.05% optimize.opt_a.auto_parallel : 0.000021s : 0.05% optimize.opt_a.parallel : 0.000030s : 0.07% optimize.opt_a.flash_sp : 0.000015s : 0.04% optimize.opt_a.merge_comm : 0.000015s : 0.04% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.05% optimize.opt_a.virtual_dataset : 0.000017s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000015s : 0.04% optimize.opt_a.merge_forward : 0.000010s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000013s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000030s : 0.07% optimize.opt_a.a_after_grad : 0.000025s : 0.06% optimize.opt_a.renormalize : 0.025068s : 61.43% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.03% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.08% optimize.opt_a.cse : 0.000067s : 0.17% optimize.opt_a.a_3 : 0.000126s : 0.31% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000048s : 0.12% optimize.convert_after_rewriter : 0.000008s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000827s : 2.03% optimize.opt_b.b_1 : 0.000185s : 0.45% optimize.opt_b.b_2 : 0.000011s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000038s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000035s : 0.09% optimize.loop_unroll : 0.000658s : 1.61% optimize.opt_after_cconv.c_1 : 0.000041s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000035s : 0.09% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.05% optimize.tuple_transform.d_1 : 0.000063s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000068s : 0.17% optimize.cse_after_recomputation.cse : 0.000019s : 0.05% optimize.environ_conv : 0.000009s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.02% optimize.bias_add_comm_swap : 0.000004s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000002s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000028s : 0.07% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.05% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000773s : 1.89% validate : 0.000058s : 0.14% Time group info: ------[substitution.] 0.000266 39 11.38% : 0.000030s : 3: substitution.cast_eliminate 1.12% : 0.000003s : 3: substitution.elim_not_effective 0.68% : 0.000002s : 3: substitution.fold_const_symbol 2.99% : 0.000008s : 5: substitution.graph_param_transform 66.98% : 0.000178s : 4: substitution.inline 2.40% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.60% : 0.000007s : 6: substitution.remove_not_recompute_node 2.57% : 0.000007s : 4: substitution.replace_old_param 6.34% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator 2.95% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.009534 2 90.84% : 0.008661s : 1: type_inference.infer 9.16% : 0.000874s : 1: type_inference.specialize ------[replace.] 0.000068 8 60.78% : 0.000041s : 4: replace.inline 39.22% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000190 8 92.47% : 0.000175s : 4: match.inline 7.53% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000288 1596 0.93% : 0.000003s : 17: predicate.accumulaten_eliminater 1.13% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 10: predicate.addn_check_dump 1.07% : 0.000003s : 17: predicate.addn_zero_filter 0.79% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.24% : 0.000006s : 27: predicate.arithmetic_simplify 0.99% : 0.000003s : 17: predicate.cast_eliminate 0.87% : 0.000003s : 10: predicate.check_bprop_eliminate 0.53% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.61% : 0.000002s : 10: predicate.depend_value_elim 0.88% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.07% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.03% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.34% : 0.000001s : 5: predicate.elim_not_effective 0.41% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.30% : 0.000004s : 22: predicate.environ_add_const_eliminate 1.06% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.04% : 0.000003s : 22: predicate.environ_get_depend_swap 1.79% : 0.000005s : 32: predicate.environ_get_eliminate 1.18% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.27% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.23% : 0.000006s : 25: predicate.float_depend_g_call 0.58% : 0.000002s : 10: predicate.float_environ_get_switch 0.83% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 5: predicate.fold_const_symbol 0.62% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.54% : 0.000002s : 10: predicate.incorporate_call 0.44% : 0.000001s : 10: predicate.incorporate_call_switch 5.83% : 0.000017s : 72: predicate.inline 0.65% : 0.000002s : 10: predicate.inline_without_move 0.27% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.77% : 0.000002s : 10: predicate.less_batch_normalization 1.75% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.56% : 0.000007s : 48: predicate.load_eliminater 1.42% : 0.000004s : 5: predicate.loop_unroll_after_grad 1.79% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.91% : 0.000006s : 27: predicate.make_slice_get_slice_eliminator 0.60% : 0.000002s : 10: predicate.merge_addn 0.59% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.57% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000003s : 17: predicate.minmaximum_grad 1.73% : 0.000005s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.30% : 0.000001s : 5: predicate.parallel_virtual_node 1.71% : 0.000005s : 25: predicate.partial_defer_inline 1.51% : 0.000004s : 26: predicate.partial_eliminate 0.96% : 0.000003s : 17: predicate.print_const_string_wrapper 0.58% : 0.000002s : 10: predicate.reduce_all_const_elim 1.31% : 0.000004s : 17: predicate.reduce_eliminate 2.54% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 10: predicate.remove_not_recompute_node 1.33% : 0.000004s : 31: predicate.replace_applicator 0.51% : 0.000001s : 10: predicate.replace_old_param 0.50% : 0.000001s : 5: predicate.reset_defer_inline 0.97% : 0.000003s : 17: predicate.reshape_eliminate 0.63% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 5: predicate.row_tensor_eliminate 0.84% : 0.000002s : 10: predicate.same_eliminate 0.35% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.61% : 0.000002s : 10: predicate.shard_identity_eliminate 0.79% : 0.000002s : 10: predicate.special_op_eliminate 0.61% : 0.000002s : 10: predicate.specialize_transform 0.98% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.88% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.33% : 0.000004s : 25: predicate.switch_defer_inline 1.89% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.35% : 0.000013s : 76: predicate.switch_simplify 0.88% : 0.000003s : 17: predicate.tile_eliminate 0.90% : 0.000003s : 17: predicate.transpose_eliminate 1.52% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.71% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.67% : 0.000005s : 27: predicate.tuple_list_get_item_depend_reorder 4.14% : 0.000012s : 41: predicate.tuple_list_get_item_eliminator 1.86% : 0.000005s : 27: predicate.tuple_list_get_set_item_eliminator 2.36% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.72% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.27% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 2.94% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 5: predicate.value_based_eliminate 0.56% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.57% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000792 11 53.88% : 0.000427s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.12% : 0.000365s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.107463 192 0.00% : 0.000004s : 1: ForceFp32Comm 3.79% : 0.004077s : 1: add_attr 3.78% : 0.004061s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.07% : 0.000073s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.08% : 0.000082s : 1: auto_monad 0.02% : 0.000025s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.51% : 0.000551s : 1: bootstrap 0.04% : 0.000039s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000020s : 1: control_data_broadcast_order 0.01% : 0.000011s : 1: convert_after_rewriter 0.03% : 0.000035s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000013s : 1: environ_conv 0.03% : 0.000030s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.62% : 0.000671s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 0.78% : 0.000842s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.02% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000029s : 1: opt.transform.mutable_eliminate 1.43% : 0.001540s : 78: opt.transform.opt_a 0.04% : 0.000040s : 1: opt.transform.opt_after_cconv 0.04% : 0.000039s : 1: opt.transform.opt_after_jit_grad 0.15% : 0.000157s : 28: opt.transform.opt_b 0.06% : 0.000069s : 2: opt.transform.opt_trans_graph 0.05% : 0.000049s : 4: opt.transform.symbol_engine_opt 25.75% : 0.027671s : 1: opt_a 0.14% : 0.000149s : 1: opt_after_cconv 0.73% : 0.000789s : 1: opt_after_jit_grad 0.29% : 0.000307s : 1: opt_b 28.45% : 0.030569s : 1: optimize 0.03% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.03% : 0.000031s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000003s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.05% : 0.000050s : 1: pre_auto_parallel 0.03% : 0.000036s : 1: py_interpret_to_execute 0.02% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.02% : 0.000023s : 1: remove_dup_value 22.82% : 0.024527s : 1: renormalize.infer 0.49% : 0.000526s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000053s : 1: rewriter_after_opt_a 0.10% : 0.000103s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000097s : 1: symbol_engine_optimizer 0.10% : 0.000103s : 1: tuple_transform 8.98% : 0.009647s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:17.331.562 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:17.331.827 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.138886, [21] [bootstrap]: 0.00175849 [type_inference]: 0.0486404 [event_method]: 0.00015921 [auto_monad]: 0.00022431 [graph_reusing]: 1.009e-05 [inline]: 3.21999e-06 [add_attr]: 0.0175009, [1] [add_attr_with_inline]: 0.0174793, [1] [Cycle 1]: 0.00037977, [2] [tag_attr]: 0.00015233 [meta_addattr_fg_expand]: 1.215e-05 [parallel-infer-symbol]: 5.90002e-06 [pre_auto_parallel]: 0.00020071 [insert-virtual-dataset]: 0.00013528 [parallel-infer-symbol-second]: 2.21e-06 [dataset_repeat_opt]: 3.16001e-06 [pipeline_split]: 1.79e-06 [optimize]: 0.0682011, [53] [py_interpret_to_execute]: 0.00018986 [rewriter_before_opt_a]: 0.00040215 [opt_a]: 0.0239679, [2] [Cycle 1]: 0.0177532, [45] [expand_dump_flag]: 5.34e-06 [switch_simplify]: 0.00019121 [loop_unroll]: 0.00017092 [a_1]: 0.00611897 [with_stream_mark]: 0.0001607 [recompute_prepare]: 2.794e-05 [updatestate_depend_eliminate]: 0.00014212 [updatestate_assign_eliminate]: 7.8e-06 [updatestate_loads_eliminate]: 4.72e-06 [parameter_eliminate]: 0.00013314 [a_2]: 0.00060005 [accelerated_algorithm]: 1.973e-05 [shard]: 3.68e-06 [meta_shard_fg_expand]: 6.09001e-06 [shard_inline]: 1.724e-05 [merge_send_recv]: 1.972e-05 [auto_parallel]: 0.00015051 [parallel]: 2.541e-05 [flash_sp]: 2.198e-05 [merge_comm]: 0.00015239 [allreduce_fusion]: 8.13001e-06 [matmul_add_comm_reduction]: 0.00014901 [allreduce_slice_to_reducescatter]: 1.27999e-06 [virtual_shard_identity]: 0.00015766 [virtual_dataset]: 1.882e-05 [get_grad_eliminate_]: 0.00014901 [virtual_output]: 1.419e-05 [merge_forward]: 1.424e-05 [cell_reuse_recompute_pass]: 5.06997e-06 [offload_activation]: 0.00015957 [cell_reuse_handle_not_recompute_node_pass]: 0.00019229 [merge_recompute_call_nodes]: 2.29999e-06 [before_grad]: 0.000165 [set_forward_comm_id_for_comm_node_pass]: 0.00014335 [meta_fg_expand]: 7.2e-06 [flash_sp_send_recv_attached]: 7.68001e-06 [receive_attached]: 3.57002e-06 [after_resolve]: 0.00015293 [a_after_grad]: 2.525e-05 [renormalize]: 0.00448808 [add_forward_monad_depend]: 0.00013816 [auto_monad_grad]: 4.05e-06 [auto_monad_eliminator]: 0.00016146 [cse]: 0.00018421 [a_3]: 0.00039554 [Cycle 2]: 0.00618808, [45] [expand_dump_flag]: 3.57997e-06 [switch_simplify]: 0.00014769 [loop_unroll]: 1.367e-05 [a_1]: 0.00089412 [with_stream_mark]: 0.00016323 [recompute_prepare]: 1.853e-05 [updatestate_depend_eliminate]: 1.232e-05 [updatestate_assign_eliminate]: 6.23e-06 [updatestate_loads_eliminate]: 5.02e-06 [parameter_eliminate]: 4.68999e-06 [a_2]: 0.00058012 [accelerated_algorithm]: 1.904e-05 [shard]: 3.83999e-06 [meta_shard_fg_expand]: 5.47001e-06 [shard_inline]: 1.736e-05 [merge_send_recv]: 1.978e-05 [auto_parallel]: 1.585e-05 [parallel]: 1.297e-05 [flash_sp]: 6.14001e-06 [merge_comm]: 0.00014116 [allreduce_fusion]: 7.85998e-06 [matmul_add_comm_reduction]: 0.00014584 [allreduce_slice_to_reducescatter]: 1.13001e-06 [virtual_shard_identity]: 1.957e-05 [virtual_dataset]: 1.876e-05 [get_grad_eliminate_]: 1.071e-05 [virtual_output]: 2.282e-05 [merge_forward]: 1.011e-05 [cell_reuse_recompute_pass]: 4.92999e-06 [offload_activation]: 2.264e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.135e-05 [merge_recompute_call_nodes]: 2.17001e-06 [before_grad]: 2.833e-05 [set_forward_comm_id_for_comm_node_pass]: 1.461e-05 [meta_fg_expand]: 5.59e-06 [flash_sp_send_recv_attached]: 1.94e-06 [receive_attached]: 3.81999e-06 [after_resolve]: 0.00017236 [a_after_grad]: 0.0001579 [renormalize]: 2.69996e-07 [add_forward_monad_depend]: 7.87003e-06 [auto_monad_grad]: 2.73e-06 [auto_monad_eliminator]: 0.00015768 [cse]: 0.00018355 [a_3]: 0.00023502 [py_interpret_to_execute_after_opt_a]: 0.00016767 [slice_cell_reuse_recomputed_activation]: 6.95002e-06 [rewriter_after_opt_a]: 0.00034572 [convert_after_rewriter]: 2.776e-05 [order_py_execute_after_rewriter]: 0.00015581 [mutable_eliminate]: 0.00320678 [opt_b]: 0.0021512, [1] [Cycle 1]: 0.00212906, [7] [b_1]: 0.00111773 [b_2]: 2.12e-05 [updatestate_depend_eliminate]: 1.806e-05 [updatestate_assign_eliminate]: 5.47001e-06 [updatestate_loads_eliminate]: 1.004e-05 [renormalize]: 1.08001e-06 [cse]: 0.00019368 [optimize_parallel_all_gather_comm]: 0.00017155 [overlap_param_gather]: 8.57998e-06 [cconv]: 0.00017989 [loop_unroll]: 0.0358689 [opt_after_cconv]: 0.00021824, [1] [Cycle 1]: 0.00020574, [7] [c_1]: 5.122e-05 [parameter_eliminate]: 6.87002e-06 [updatestate_depend_eliminate]: 1.474e-05 [updatestate_assign_eliminate]: 4.85999e-06 [updatestate_loads_eliminate]: 4.13999e-06 [cse]: 6.058e-05 [renormalize]: 1.42e-06 [remove_dup_value]: 9.722e-05 [tuple_transform]: 0.00012861, [1] [Cycle 1]: 0.0001207, [4] [d_1]: 7.607e-05 [none_parameter_eliminate]: 2.09999e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 1.043e-05 [partial_unused_args_eliminate]: 5.52001e-06 [add_recomputation]: 8.416e-05 [cse_after_recomputation]: 4.12e-05, [1] [Cycle 1]: 3.272e-05, [1] [cse]: 2.232e-05 [environ_conv]: 1.043e-05 [swap_dp_allreduce_reducescatter]: 1.071e-05 [bias_add_comm_swap]: 6.28002e-06 [label_micro_interleaved_index]: 1.082e-05 [label_fine_grained_interleaved_index]: 5.22999e-06 [merge_cast_opt]: 3.90998e-06 [slice_recompute_activation]: 5.27001e-06 [micro_interleaved_order_control]: 5.54998e-06 [assign_add_opt]: 4.08999e-06 [ForceFp32Comm]: 3.62002e-06 [remove_cast_before_assign_add]: 3.55e-06 [full_micro_interleaved_order_control]: 4.60999e-06 [reorder_send_recv_between_fp_bp]: 5.71e-06 [comm_op_add_attrs]: 3.91999e-06 [add_comm_op_reuse_tag]: 3.77998e-06 [interleave_split_concat_branches]: 3.46999e-06 [interleave_parallel_branches]: 3.65e-06 [overlap_opt_shard_in_pipeline]: 4.1e-06 [overlap_opt_shard_grad_in_pipeline]: 4.07e-06 [control_data_broadcast_order]: 2.41e-05 [grouped_pairwise_exchange_alltoall]: 4e-06 [offloading_packed_experts]: 7.86001e-06 [overlap_recompute_and_grad_model_parallel]: 8.65001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.51001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.54002e-06 [overlap_recompute_comm]: 4.96002e-06 [overlap_grad_ring_attention]: 8.47e-06 [overlap_grad_flash_sp]: 3.094e-05 [begin_end_overlap_inline]: 2.98e-06 [split_matmul_comm_elemetwise]: 4.57e-06 [split_layernorm_comm]: 4.17998e-06 [handle_group_info]: 3.4e-06 [symbol_engine_optimizer]: 0.00012268, [1] [Cycle 1]: 0.00011546, [6] [build]: 4.98001e-06 [elim_shapecalc]: 1.638e-05 [elim_not_effective]: 1.969e-05 [opt_reshape]: 1.072e-05 [fold_const_symbol]: 1.47e-05 [renormalize]: 1.8999e-07 [detach_backward]: 3.93001e-06 [pipeline_parallel_scheduler]: 2.00002e-06 [auto_monad_reorder]: 2.699e-05 [get_jit_bprop_graph]: 1.94e-06 [rewriter_after_jit_bprop_graph]: 8.18999e-06 [opt_after_jit_grad]: 0.00068142 [validate]: 4.87e-05 Sums bootstrap : 0.001758s : 1.56% type_inference : 0.048640s : 43.04% event_method : 0.000159s : 0.14% auto_monad : 0.000224s : 0.20% graph_reusing : 0.000010s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000152s : 0.13% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000012s : 0.01% parallel-infer-symbol : 0.000006s : 0.01% pre_auto_parallel : 0.000201s : 0.18% insert-virtual-dataset : 0.000135s : 0.12% parallel-infer-symbol-second : 0.000002s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000190s : 0.17% optimize.rewriter_before_opt_a : 0.000402s : 0.36% optimize.opt_a.expand_dump_flag : 0.000009s : 0.01% optimize.opt_a.switch_simplify : 0.000339s : 0.30% optimize.opt_a.loop_unroll : 0.000185s : 0.16% optimize.opt_a.a_1 : 0.007013s : 6.21% optimize.opt_a.with_stream_mark : 0.000324s : 0.29% optimize.opt_a.recompute_prepare : 0.000046s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000154s : 0.14% optimize.opt_a.updatestate_assign_eliminate : 0.000014s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000010s : 0.01% optimize.opt_a.parameter_eliminate : 0.000138s : 0.12% optimize.opt_a.a_2 : 0.001180s : 1.04% optimize.opt_a.accelerated_algorithm : 0.000039s : 0.03% optimize.opt_a.shard : 0.000008s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000012s : 0.01% optimize.opt_a.shard_inline : 0.000035s : 0.03% optimize.opt_a.merge_send_recv : 0.000040s : 0.03% optimize.opt_a.auto_parallel : 0.000166s : 0.15% optimize.opt_a.parallel : 0.000038s : 0.03% optimize.opt_a.flash_sp : 0.000028s : 0.02% optimize.opt_a.merge_comm : 0.000294s : 0.26% optimize.opt_a.allreduce_fusion : 0.000016s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000295s : 0.26% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000177s : 0.16% optimize.opt_a.virtual_dataset : 0.000038s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000160s : 0.14% optimize.opt_a.virtual_output : 0.000037s : 0.03% optimize.opt_a.merge_forward : 0.000024s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000010s : 0.01% optimize.opt_a.offload_activation : 0.000182s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000234s : 0.21% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000193s : 0.17% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000158s : 0.14% optimize.opt_a.meta_fg_expand : 0.000013s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000010s : 0.01% optimize.opt_a.receive_attached : 0.000007s : 0.01% optimize.opt_a.after_resolve : 0.000325s : 0.29% optimize.opt_a.a_after_grad : 0.000183s : 0.16% optimize.opt_a.renormalize : 0.004488s : 3.97% optimize.opt_a.add_forward_monad_depend : 0.000146s : 0.13% optimize.opt_a.auto_monad_grad : 0.000007s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000319s : 0.28% optimize.opt_a.cse : 0.000368s : 0.33% optimize.opt_a.a_3 : 0.000631s : 0.56% optimize.py_interpret_to_execute_after_opt_a : 0.000168s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000007s : 0.01% optimize.rewriter_after_opt_a : 0.000346s : 0.31% optimize.convert_after_rewriter : 0.000028s : 0.02% optimize.order_py_execute_after_rewriter : 0.000156s : 0.14% optimize.mutable_eliminate : 0.003207s : 2.84% optimize.opt_b.b_1 : 0.001118s : 0.99% optimize.opt_b.b_2 : 0.000021s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000018s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000010s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000194s : 0.17% optimize.optimize_parallel_all_gather_comm : 0.000172s : 0.15% optimize.overlap_param_gather : 0.000009s : 0.01% optimize.cconv : 0.000180s : 0.16% optimize.loop_unroll : 0.035869s : 31.74% optimize.opt_after_cconv.c_1 : 0.000051s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000015s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000061s : 0.05% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000097s : 0.09% optimize.tuple_transform.d_1 : 0.000076s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.01% optimize.partial_unused_args_eliminate : 0.000006s : 0.00% optimize.add_recomputation : 0.000084s : 0.07% optimize.cse_after_recomputation.cse : 0.000022s : 0.02% optimize.environ_conv : 0.000010s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000011s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000011s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000005s : 0.00% optimize.micro_interleaved_order_control : 0.000006s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000003s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000024s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000008s : 0.01% optimize.overlap_grad_flash_sp : 0.000031s : 0.03% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000027s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000008s : 0.01% opt_after_jit_grad : 0.000681s : 0.60% validate : 0.000049s : 0.04% Time group info: ------[substitution.] 0.002332 49 3.03% : 0.000071s : 6: substitution.cast_eliminate 0.14% : 0.000003s : 4: substitution.elim_not_effective 0.08% : 0.000002s : 4: substitution.fold_const_symbol 0.37% : 0.000009s : 6: substitution.graph_param_transform 76.95% : 0.001794s : 4: substitution.inline 0.41% : 0.000010s : 8: substitution.j_node_and_user_rematch 0.63% : 0.000015s : 8: substitution.remove_not_recompute_node 6.27% : 0.000146s : 4: substitution.replace_old_param 6.43% : 0.000150s : 4: substitution.tuple_list_get_item_eliminator 5.70% : 0.000133s : 1: substitution.value_based_eliminate ------[type_inference.] 0.048424 2 90.41% : 0.043778s : 1: type_inference.infer 9.59% : 0.004646s : 1: type_inference.specialize ------[replace.] 0.002005 8 90.11% : 0.001806s : 4: replace.inline 9.89% : 0.000198s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.001934 8 92.44% : 0.001788s : 4: match.inline 7.56% : 0.000146s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000951 1730 0.34% : 0.000003s : 17: predicate.accumulaten_eliminater 0.25% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.18% : 0.000002s : 12: predicate.addn_check_dump 0.41% : 0.000004s : 17: predicate.addn_zero_filter 0.38% : 0.000004s : 17: predicate.adjust_all_reduce_mul_add 1.13% : 0.000011s : 29: predicate.arithmetic_simplify 0.52% : 0.000005s : 17: predicate.cast_eliminate 0.26% : 0.000002s : 12: predicate.check_bprop_eliminate 0.21% : 0.000002s : 12: predicate.compare_switch_simplify 0.08% : 0.000001s : 6: predicate.const_output_eliminate 0.26% : 0.000002s : 12: predicate.depend_value_elim 0.38% : 0.000004s : 17: predicate.dict_get_item_const_eliminator 0.62% : 0.000006s : 17: predicate.dict_get_item_eliminator 0.33% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.34% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.08% : 0.000001s : 6: predicate.elim_not_effective 0.13% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 0.45% : 0.000004s : 23: predicate.environ_add_const_eliminate 0.67% : 0.000006s : 23: predicate.environ_get_add_eliminate 0.42% : 0.000004s : 23: predicate.environ_get_depend_swap 0.85% : 0.000008s : 35: predicate.environ_get_eliminate 0.45% : 0.000004s : 23: predicate.environ_get_set_eliminate 0.49% : 0.000005s : 25: predicate.exchange_switch_depend_value 1.34% : 0.000013s : 25: predicate.float_depend_g_call 0.23% : 0.000002s : 12: predicate.float_environ_get_switch 0.43% : 0.000004s : 18: predicate.float_tuple_getitem_switch 0.06% : 0.000001s : 6: predicate.fold_const_symbol 0.38% : 0.000004s : 12: predicate.get_grad_eliminate 0.08% : 0.000001s : 6: predicate.graph_param_transform 0.34% : 0.000003s : 12: predicate.incorporate_call 0.30% : 0.000003s : 12: predicate.incorporate_call_switch 16.38% : 0.000156s : 78: predicate.inline 0.66% : 0.000006s : 12: predicate.inline_without_move 0.17% : 0.000002s : 12: predicate.j_node_and_user_rematch 0.57% : 0.000005s : 12: predicate.less_batch_normalization 0.87% : 0.000008s : 33: predicate.list_to_tuple_eliminator_ 12.91% : 0.000123s : 50: predicate.load_eliminater 0.84% : 0.000008s : 6: predicate.loop_unroll_after_grad 0.96% : 0.000009s : 38: predicate.loop_unroll_before_grad 14.54% : 0.000138s : 29: predicate.make_slice_get_slice_eliminator 0.48% : 0.000005s : 12: predicate.merge_addn 0.23% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.21% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.30% : 0.000003s : 17: predicate.minmaximum_grad 0.79% : 0.000008s : 6: predicate.mutable_eliminate 0.14% : 0.000001s : 6: predicate.opt_reshape 0.23% : 0.000002s : 6: predicate.parallel_virtual_node 1.76% : 0.000017s : 25: predicate.partial_defer_inline 0.66% : 0.000006s : 27: predicate.partial_eliminate 0.33% : 0.000003s : 17: predicate.print_const_string_wrapper 0.23% : 0.000002s : 12: predicate.reduce_all_const_elim 0.63% : 0.000006s : 17: predicate.reduce_eliminate 1.27% : 0.000012s : 50: predicate.redundant_stop_gradient_eliminater 0.27% : 0.000003s : 12: predicate.remove_not_recompute_node 0.57% : 0.000005s : 33: predicate.replace_applicator 0.21% : 0.000002s : 12: predicate.replace_old_param 0.14% : 0.000001s : 6: predicate.reset_defer_inline 0.45% : 0.000004s : 17: predicate.reshape_eliminate 0.31% : 0.000003s : 12: predicate.row_tensor_add_zeros_like 0.24% : 0.000002s : 6: predicate.row_tensor_eliminate 0.61% : 0.000006s : 12: predicate.same_eliminate 0.20% : 0.000002s : 12: predicate.set_cell_output_no_recompute 0.39% : 0.000004s : 12: predicate.shard_identity_eliminate 0.29% : 0.000003s : 12: predicate.special_op_eliminate 0.38% : 0.000004s : 12: predicate.specialize_transform 0.55% : 0.000005s : 12: predicate.split_environ_get_set_with_tuple_value 0.43% : 0.000004s : 12: predicate.stack_unstack_eliminate 0.12% : 0.000001s : 6: predicate.switch_call_monad_eliminater 0.49% : 0.000005s : 25: predicate.switch_defer_inline 1.07% : 0.000010s : 37: predicate.switch_layer_defer_inline 2.34% : 0.000022s : 81: predicate.switch_simplify 12.99% : 0.000124s : 17: predicate.tile_eliminate 0.48% : 0.000005s : 17: predicate.transpose_eliminate 0.68% : 0.000007s : 29: predicate.tuple_list_convert_item_index_to_positive 0.85% : 0.000008s : 29: predicate.tuple_list_get_item_const_eliminator 0.61% : 0.000006s : 29: predicate.tuple_list_get_item_depend_reorder 1.53% : 0.000015s : 45: predicate.tuple_list_get_item_eliminator 0.66% : 0.000006s : 29: predicate.tuple_list_get_set_item_eliminator 1.13% : 0.000011s : 41: predicate.tuple_list_set_item_eliminator 0.73% : 0.000007s : 33: predicate.tuple_to_list_eliminator_ 0.81% : 0.000008s : 50: predicate.updatestate_pure_node_eliminater 1.36% : 0.000013s : 62: predicate.updatestate_useless_node_eliminater 0.27% : 0.000003s : 6: predicate.value_based_eliminate 0.56% : 0.000005s : 12: predicate.virtual_dataset_eliminate 0.35% : 0.000003s : 12: predicate.virtual_output_eliminate 0.09% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.28% : 0.000003s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003869 11 56.86% : 0.002200s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.14% : 0.001669s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.239494 192 0.00% : 0.000006s : 1: ForceFp32Comm 7.38% : 0.017675s : 1: add_attr 7.30% : 0.017489s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.04% : 0.000088s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.15% : 0.000367s : 1: auto_monad 0.01% : 0.000035s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.81% : 0.001939s : 1: bootstrap 0.08% : 0.000188s : 1: cconv 0.00% : 0.000007s : 1: comm_op_add_attrs 0.01% : 0.000027s : 1: control_data_broadcast_order 0.01% : 0.000033s : 1: convert_after_rewriter 0.02% : 0.000045s : 1: cse_after_recomputation 0.00% : 0.000010s : 1: dataset_repeat_opt 0.01% : 0.000019s : 1: detach_backward 0.01% : 0.000013s : 1: environ_conv 0.08% : 0.000185s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000021s : 1: graph_reusing 0.00% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000009s : 1: inline 0.07% : 0.000156s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000014s : 1: label_micro_interleaved_index 14.98% : 0.035884s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 1.35% : 0.003224s : 1: mutable_eliminate 0.00% : 0.000011s : 1: offloading_packed_experts 0.02% : 0.000046s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000167s : 1: opt.transform.mutable_eliminate 4.05% : 0.009693s : 78: opt.transform.opt_a 0.02% : 0.000049s : 1: opt.transform.opt_after_cconv 0.02% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.26% : 0.000618s : 28: opt.transform.opt_b 0.03% : 0.000083s : 2: opt.transform.opt_trans_graph 0.02% : 0.000057s : 4: opt.transform.symbol_engine_opt 10.01% : 0.023973s : 1: opt_a 0.09% : 0.000223s : 1: opt_after_cconv 0.29% : 0.000692s : 1: opt_after_jit_grad 0.90% : 0.002158s : 1: opt_b 28.62% : 0.068547s : 1: optimize 0.08% : 0.000180s : 1: optimize_parallel_all_gather_comm 0.07% : 0.000164s : 1: order_py_execute_after_rewriter 0.01% : 0.000034s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000011s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000013s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000021s : 1: parallel-infer-symbol 0.00% : 0.000010s : 1: parallel-infer-symbol-second 0.00% : 0.000009s : 1: partial_unused_args_eliminate 0.00% : 0.000010s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.09% : 0.000221s : 1: pre_auto_parallel 0.14% : 0.000324s : 1: py_interpret_to_execute 0.07% : 0.000177s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.04% : 0.000101s : 1: remove_dup_value 1.03% : 0.002457s : 1: renormalize.infer 0.79% : 0.001884s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000354s : 1: rewriter_after_opt_a 0.17% : 0.000412s : 1: rewriter_before_opt_a 0.00% : 0.000011s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000008s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000014s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000126s : 1: symbol_engine_optimizer 0.05% : 0.000132s : 1: tuple_transform 20.39% : 0.048840s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:18.262.747 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0417294, [21] [bootstrap]: 0.00043977 [type_inference]: 0.00666102 [event_method]: 2.216e-05 [auto_monad]: 6.946e-05 [graph_reusing]: 6.11e-06 [inline]: 2.79001e-06 [add_attr]: 0.00349157, [1] [add_attr_with_inline]: 0.00347954, [1] [Cycle 1]: 7.622e-05, [2] [tag_attr]: 2.466e-05 [meta_addattr_fg_expand]: 6.20002e-06 [parallel-infer-symbol]: 4.05e-06 [pre_auto_parallel]: 4.239e-05 [insert-virtual-dataset]: 2.64999e-06 [parallel-infer-symbol-second]: 8.30012e-07 [dataset_repeat_opt]: 1.99e-06 [pipeline_split]: 1.69e-06 [optimize]: 0.0244959, [53] [py_interpret_to_execute]: 3.204e-05 [rewriter_before_opt_a]: 0.00010022 [opt_a]: 0.00884593, [2] [Cycle 1]: 0.00268174, [45] [expand_dump_flag]: 3.26001e-06 [switch_simplify]: 4.908e-05 [loop_unroll]: 3.468e-05 [a_1]: 0.0007716 [with_stream_mark]: 2.288e-05 [recompute_prepare]: 1.322e-05 [updatestate_depend_eliminate]: 5.46e-06 [updatestate_assign_eliminate]: 4.79e-06 [updatestate_loads_eliminate]: 4.2e-06 [parameter_eliminate]: 2.42001e-06 [a_2]: 0.00012223 [accelerated_algorithm]: 1.023e-05 [shard]: 1.92999e-06 [meta_shard_fg_expand]: 2.58e-06 [shard_inline]: 9.27001e-06 [merge_send_recv]: 1.054e-05 [auto_parallel]: 9.51e-06 [parallel]: 2.113e-05 [flash_sp]: 9.57001e-06 [merge_comm]: 5.25999e-06 [allreduce_fusion]: 5.12999e-06 [matmul_add_comm_reduction]: 1.207e-05 [allreduce_slice_to_reducescatter]: 7.60017e-07 [virtual_shard_identity]: 1.151e-05 [virtual_dataset]: 9.28002e-06 [get_grad_eliminate_]: 8.46002e-06 [virtual_output]: 8.52e-06 [merge_forward]: 5.57999e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 1.166e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.949e-05 [merge_recompute_call_nodes]: 1.74e-06 [before_grad]: 1.567e-05 [set_forward_comm_id_for_comm_node_pass]: 5.52999e-06 [meta_fg_expand]: 4e-06 [flash_sp_send_recv_attached]: 2.89001e-06 [receive_attached]: 2.62001e-06 [after_resolve]: 1.541e-05 [a_after_grad]: 1.424e-05 [renormalize]: 0.00094916 [add_forward_monad_depend]: 7.76001e-06 [auto_monad_grad]: 2.68e-06 [auto_monad_eliminator]: 2.23e-05 [cse]: 4.582e-05 [a_3]: 0.00011261 [Cycle 2]: 0.00614945, [45] [expand_dump_flag]: 2.38998e-06 [switch_simplify]: 1.12e-05 [loop_unroll]: 8.37e-06 [a_1]: 0.00106707 [with_stream_mark]: 0.00017317 [recompute_prepare]: 0.0001588 [updatestate_depend_eliminate]: 1.257e-05 [updatestate_assign_eliminate]: 5.40999e-06 [updatestate_loads_eliminate]: 0.00014128 [parameter_eliminate]: 4.74002e-06 [a_2]: 0.00055827 [accelerated_algorithm]: 0.00014926 [shard]: 5.40999e-06 [meta_shard_fg_expand]: 5.73002e-06 [shard_inline]: 0.00015357 [merge_send_recv]: 1.826e-05 [auto_parallel]: 2.327e-05 [parallel]: 1.493e-05 [flash_sp]: 9.37999e-06 [merge_comm]: 1.192e-05 [allreduce_fusion]: 0.00014334 [matmul_add_comm_reduction]: 2.009e-05 [allreduce_slice_to_reducescatter]: 1.19e-06 [virtual_shard_identity]: 0.0001577 [virtual_dataset]: 1.531e-05 [get_grad_eliminate_]: 2.107e-05 [virtual_output]: 0.00016548 [merge_forward]: 1.398e-05 [cell_reuse_recompute_pass]: 4.67998e-06 [offload_activation]: 0.00017488 [cell_reuse_handle_not_recompute_node_pass]: 0.00018721 [merge_recompute_call_nodes]: 4.3e-06 [before_grad]: 3.184e-05 [set_forward_comm_id_for_comm_node_pass]: 0.00015144 [meta_fg_expand]: 8.06001e-06 [flash_sp_send_recv_attached]: 3.53e-06 [receive_attached]: 0.00013492 [after_resolve]: 3.553e-05 [a_after_grad]: 2.814e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 0.00013728 [auto_monad_grad]: 5.52001e-06 [auto_monad_eliminator]: 0.00016227 [cse]: 0.00020143 [a_3]: 0.00035349 [py_interpret_to_execute_after_opt_a]: 0.00016877 [slice_cell_reuse_recomputed_activation]: 3.85e-06 [rewriter_after_opt_a]: 0.00037883 [convert_after_rewriter]: 2.555e-05 [order_py_execute_after_rewriter]: 1.046e-05 [mutable_eliminate]: 0.00401007 [opt_b]: 0.0018217, [1] [Cycle 1]: 0.00180191, [7] [b_1]: 0.0010602 [b_2]: 0.00015343 [updatestate_depend_eliminate]: 1.678e-05 [updatestate_assign_eliminate]: 1.033e-05 [updatestate_loads_eliminate]: 5.72001e-06 [renormalize]: 1.05999e-06 [cse]: 0.00019098 [optimize_parallel_all_gather_comm]: 3.389e-05 [overlap_param_gather]: 4.89e-06 [cconv]: 4.293e-05 [loop_unroll]: 0.00283769 [opt_after_cconv]: 0.00093786, [1] [Cycle 1]: 0.00092045, [7] [c_1]: 0.0001864 [parameter_eliminate]: 7.77998e-06 [updatestate_depend_eliminate]: 1.616e-05 [updatestate_assign_eliminate]: 0.00014021 [updatestate_loads_eliminate]: 8.94e-06 [cse]: 0.00020092 [renormalize]: 5.79981e-07 [remove_dup_value]: 0.00020133 [tuple_transform]: 0.00110414, [1] [Cycle 1]: 0.00096252, [4] [d_1]: 0.00075008 [none_parameter_eliminate]: 5.46e-06 [renormalize]: 4.10015e-07 [switch_simplify]: 2.078e-05 [partial_unused_args_eliminate]: 5.14e-06 [add_recomputation]: 0.00036819 [cse_after_recomputation]: 0.00021371, [1] [Cycle 1]: 0.00020366, [1] [cse]: 0.00018371 [environ_conv]: 1.122e-05 [swap_dp_allreduce_reducescatter]: 1.736e-05 [bias_add_comm_swap]: 3.25e-06 [label_micro_interleaved_index]: 8.87e-06 [label_fine_grained_interleaved_index]: 3.21999e-06 [merge_cast_opt]: 3.7e-06 [slice_recompute_activation]: 3.18e-06 [micro_interleaved_order_control]: 5.24e-06 [assign_add_opt]: 1.74998e-06 [ForceFp32Comm]: 9.39996e-07 [remove_cast_before_assign_add]: 1.19e-06 [full_micro_interleaved_order_control]: 2.32999e-06 [reorder_send_recv_between_fp_bp]: 0.00013875 [comm_op_add_attrs]: 1.42e-06 [add_comm_op_reuse_tag]: 1.13001e-06 [interleave_split_concat_branches]: 3.03e-06 [interleave_parallel_branches]: 1.15999e-06 [overlap_opt_shard_in_pipeline]: 1.44e-06 [overlap_opt_shard_grad_in_pipeline]: 2.06998e-06 [control_data_broadcast_order]: 0.00016888 [grouped_pairwise_exchange_alltoall]: 2.01998e-06 [offloading_packed_experts]: 9.55001e-06 [overlap_recompute_and_grad_model_parallel]: 0.0001585 [overlap_grad_matmul_and_grad_allreduce]: 3.28998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.62001e-06 [overlap_recompute_comm]: 3.34001e-06 [overlap_grad_ring_attention]: 8.65999e-06 [overlap_grad_flash_sp]: 0.00019369 [begin_end_overlap_inline]: 2.22999e-06 [split_matmul_comm_elemetwise]: 3.21999e-06 [split_layernorm_comm]: 1.85001e-06 [handle_group_info]: 1.28002e-06 [symbol_engine_optimizer]: 0.00098927, [1] [Cycle 1]: 0.00083918, [6] [build]: 1.059e-05 [elim_shapecalc]: 0.00019001 [elim_not_effective]: 0.00019703 [opt_reshape]: 1.884e-05 [fold_const_symbol]: 3.002e-05 [renormalize]: 8.30012e-07 [detach_backward]: 3.72998e-06 [pipeline_parallel_scheduler]: 2.27999e-06 [auto_monad_reorder]: 0.00017662 [get_jit_bprop_graph]: 2.44001e-06 [rewriter_after_jit_bprop_graph]: 0.00013938 [opt_after_jit_grad]: 0.00585484 [validate]: 6.737e-05 Sums bootstrap : 0.000440s : 1.33% type_inference : 0.006661s : 20.16% event_method : 0.000022s : 0.07% auto_monad : 0.000069s : 0.21% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000042s : 0.13% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000032s : 0.10% optimize.rewriter_before_opt_a : 0.000100s : 0.30% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000060s : 0.18% optimize.opt_a.loop_unroll : 0.000043s : 0.13% optimize.opt_a.a_1 : 0.001839s : 5.56% optimize.opt_a.with_stream_mark : 0.000196s : 0.59% optimize.opt_a.recompute_prepare : 0.000172s : 0.52% optimize.opt_a.updatestate_depend_eliminate : 0.000018s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000010s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000145s : 0.44% optimize.opt_a.parameter_eliminate : 0.000007s : 0.02% optimize.opt_a.a_2 : 0.000681s : 2.06% optimize.opt_a.accelerated_algorithm : 0.000159s : 0.48% optimize.opt_a.shard : 0.000007s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000008s : 0.03% optimize.opt_a.shard_inline : 0.000163s : 0.49% optimize.opt_a.merge_send_recv : 0.000029s : 0.09% optimize.opt_a.auto_parallel : 0.000033s : 0.10% optimize.opt_a.parallel : 0.000036s : 0.11% optimize.opt_a.flash_sp : 0.000019s : 0.06% optimize.opt_a.merge_comm : 0.000017s : 0.05% optimize.opt_a.allreduce_fusion : 0.000148s : 0.45% optimize.opt_a.matmul_add_comm_reduction : 0.000032s : 0.10% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000169s : 0.51% optimize.opt_a.virtual_dataset : 0.000025s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000030s : 0.09% optimize.opt_a.virtual_output : 0.000174s : 0.53% optimize.opt_a.merge_forward : 0.000020s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.02% optimize.opt_a.offload_activation : 0.000187s : 0.56% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000207s : 0.63% optimize.opt_a.merge_recompute_call_nodes : 0.000006s : 0.02% optimize.opt_a.before_grad : 0.000048s : 0.14% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000157s : 0.48% optimize.opt_a.meta_fg_expand : 0.000012s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.02% optimize.opt_a.receive_attached : 0.000138s : 0.42% optimize.opt_a.after_resolve : 0.000051s : 0.15% optimize.opt_a.a_after_grad : 0.000042s : 0.13% optimize.opt_a.renormalize : 0.000949s : 2.87% optimize.opt_a.add_forward_monad_depend : 0.000145s : 0.44% optimize.opt_a.auto_monad_grad : 0.000008s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000185s : 0.56% optimize.opt_a.cse : 0.000247s : 0.75% optimize.opt_a.a_3 : 0.000466s : 1.41% optimize.py_interpret_to_execute_after_opt_a : 0.000169s : 0.51% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.01% optimize.rewriter_after_opt_a : 0.000379s : 1.15% optimize.convert_after_rewriter : 0.000026s : 0.08% optimize.order_py_execute_after_rewriter : 0.000010s : 0.03% optimize.mutable_eliminate : 0.004010s : 12.14% optimize.opt_b.b_1 : 0.001060s : 3.21% optimize.opt_b.b_2 : 0.000153s : 0.46% optimize.opt_b.updatestate_depend_eliminate : 0.000017s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000010s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000191s : 0.58% optimize.optimize_parallel_all_gather_comm : 0.000034s : 0.10% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000043s : 0.13% optimize.loop_unroll : 0.002838s : 8.59% optimize.opt_after_cconv.c_1 : 0.000186s : 0.56% optimize.opt_after_cconv.parameter_eliminate : 0.000008s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000016s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000140s : 0.42% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000009s : 0.03% optimize.opt_after_cconv.cse : 0.000201s : 0.61% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000201s : 0.61% optimize.tuple_transform.d_1 : 0.000750s : 2.27% optimize.tuple_transform.none_parameter_eliminate : 0.000005s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000021s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000368s : 1.11% optimize.cse_after_recomputation.cse : 0.000184s : 0.56% optimize.environ_conv : 0.000011s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000017s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000009s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000003s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000139s : 0.42% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000003s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000169s : 0.51% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000010s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000158s : 0.48% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000009s : 0.03% optimize.overlap_grad_flash_sp : 0.000194s : 0.59% optimize.begin_end_overlap_inline : 0.000002s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000011s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000190s : 0.58% optimize.symbol_engine_optimizer.elim_not_effective : 0.000197s : 0.60% optimize.symbol_engine_optimizer.opt_reshape : 0.000019s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000030s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000177s : 0.53% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000139s : 0.42% opt_after_jit_grad : 0.005855s : 17.72% validate : 0.000067s : 0.20% Time group info: ------[substitution.] 0.000583 49 37.07% : 0.000216s : 6: substitution.cast_eliminate 0.82% : 0.000005s : 4: substitution.elim_not_effective 0.45% : 0.000003s : 4: substitution.fold_const_symbol 2.15% : 0.000013s : 6: substitution.graph_param_transform 29.76% : 0.000174s : 4: substitution.inline 1.33% : 0.000008s : 8: substitution.j_node_and_user_rematch 2.34% : 0.000014s : 8: substitution.remove_not_recompute_node 1.58% : 0.000009s : 4: substitution.replace_old_param 2.24% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator 22.26% : 0.000130s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006584 2 87.84% : 0.005784s : 1: type_inference.infer 12.16% : 0.000800s : 1: type_inference.specialize ------[replace.] 0.000064 8 62.87% : 0.000040s : 4: replace.inline 37.13% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000181 8 94.01% : 0.000171s : 4: match.inline 5.99% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.001116 1730 0.25% : 0.000003s : 17: predicate.accumulaten_eliminater 0.47% : 0.000005s : 6: predicate.ad_related_special_op_eliminate 0.17% : 0.000002s : 12: predicate.addn_check_dump 0.36% : 0.000004s : 17: predicate.addn_zero_filter 0.22% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 0.96% : 0.000011s : 29: predicate.arithmetic_simplify 0.33% : 0.000004s : 17: predicate.cast_eliminate 0.18% : 0.000002s : 12: predicate.check_bprop_eliminate 0.15% : 0.000002s : 12: predicate.compare_switch_simplify 0.07% : 0.000001s : 6: predicate.const_output_eliminate 0.20% : 0.000002s : 12: predicate.depend_value_elim 0.24% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.26% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.22% : 0.000002s : 17: predicate.dict_set_item_eliminator 11.06% : 0.000123s : 12: predicate.dumpgradient_eliminate 0.12% : 0.000001s : 6: predicate.elim_not_effective 0.34% : 0.000004s : 6: predicate.elim_shapecalc_of_broadcastargs 0.32% : 0.000004s : 23: predicate.environ_add_const_eliminate 0.31% : 0.000003s : 23: predicate.environ_get_add_eliminate 0.31% : 0.000004s : 23: predicate.environ_get_depend_swap 11.06% : 0.000124s : 35: predicate.environ_get_eliminate 0.30% : 0.000003s : 23: predicate.environ_get_set_eliminate 0.34% : 0.000004s : 25: predicate.exchange_switch_depend_value 0.58% : 0.000006s : 25: predicate.float_depend_g_call 0.18% : 0.000002s : 12: predicate.float_environ_get_switch 0.24% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.06% : 0.000001s : 6: predicate.fold_const_symbol 0.30% : 0.000003s : 12: predicate.get_grad_eliminate 0.11% : 0.000001s : 6: predicate.graph_param_transform 0.17% : 0.000002s : 12: predicate.incorporate_call 0.15% : 0.000002s : 12: predicate.incorporate_call_switch 2.49% : 0.000028s : 78: predicate.inline 0.45% : 0.000005s : 12: predicate.inline_without_move 0.11% : 0.000001s : 12: predicate.j_node_and_user_rematch 11.64% : 0.000130s : 12: predicate.less_batch_normalization 0.51% : 0.000006s : 33: predicate.list_to_tuple_eliminator_ 0.70% : 0.000008s : 50: predicate.load_eliminater 0.52% : 0.000006s : 6: predicate.loop_unroll_after_grad 0.52% : 0.000006s : 38: predicate.loop_unroll_before_grad 0.54% : 0.000006s : 29: predicate.make_slice_get_slice_eliminator 0.18% : 0.000002s : 12: predicate.merge_addn 0.38% : 0.000004s : 12: predicate.micro_step_allgather_replace 0.19% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.23% : 0.000003s : 17: predicate.minmaximum_grad 11.62% : 0.000130s : 6: predicate.mutable_eliminate 0.27% : 0.000003s : 6: predicate.opt_reshape 11.87% : 0.000132s : 6: predicate.parallel_virtual_node 0.44% : 0.000005s : 25: predicate.partial_defer_inline 0.40% : 0.000005s : 27: predicate.partial_eliminate 0.23% : 0.000003s : 17: predicate.print_const_string_wrapper 0.17% : 0.000002s : 12: predicate.reduce_all_const_elim 0.35% : 0.000004s : 17: predicate.reduce_eliminate 0.68% : 0.000008s : 50: predicate.redundant_stop_gradient_eliminater 0.17% : 0.000002s : 12: predicate.remove_not_recompute_node 0.36% : 0.000004s : 33: predicate.replace_applicator 0.15% : 0.000002s : 12: predicate.replace_old_param 0.12% : 0.000001s : 6: predicate.reset_defer_inline 0.27% : 0.000003s : 17: predicate.reshape_eliminate 0.19% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.23% : 0.000003s : 6: predicate.row_tensor_eliminate 0.39% : 0.000004s : 12: predicate.same_eliminate 0.15% : 0.000002s : 12: predicate.set_cell_output_no_recompute 0.45% : 0.000005s : 12: predicate.shard_identity_eliminate 0.23% : 0.000003s : 12: predicate.special_op_eliminate 0.31% : 0.000003s : 12: predicate.specialize_transform 0.36% : 0.000004s : 12: predicate.split_environ_get_set_with_tuple_value 0.30% : 0.000003s : 12: predicate.stack_unstack_eliminate 0.11% : 0.000001s : 6: predicate.switch_call_monad_eliminater 0.34% : 0.000004s : 25: predicate.switch_defer_inline 0.52% : 0.000006s : 37: predicate.switch_layer_defer_inline 1.43% : 0.000016s : 81: predicate.switch_simplify 0.24% : 0.000003s : 17: predicate.tile_eliminate 0.22% : 0.000002s : 17: predicate.transpose_eliminate 0.46% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 0.46% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 0.45% : 0.000005s : 29: predicate.tuple_list_get_item_depend_reorder 1.03% : 0.000011s : 45: predicate.tuple_list_get_item_eliminator 0.42% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 0.82% : 0.000009s : 41: predicate.tuple_list_set_item_eliminator 12.38% : 0.000138s : 33: predicate.tuple_to_list_eliminator_ 0.64% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 1.11% : 0.000012s : 62: predicate.updatestate_useless_node_eliminater 0.27% : 0.000003s : 6: predicate.value_based_eliminate 0.25% : 0.000003s : 12: predicate.virtual_dataset_eliminate 0.20% : 0.000002s : 12: predicate.virtual_output_eliminate 0.08% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.34% : 0.000004s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000609 11 52.53% : 0.000320s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.47% : 0.000289s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.075926 192 0.01% : 0.000004s : 1: ForceFp32Comm 4.61% : 0.003497s : 1: add_attr 4.59% : 0.003484s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.50% : 0.000381s : 1: add_recomputation 0.01% : 0.000005s : 1: assign_add_opt 0.10% : 0.000075s : 1: auto_monad 0.25% : 0.000190s : 1: auto_monad_reorder 0.02% : 0.000014s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.62% : 0.000471s : 1: bootstrap 0.24% : 0.000180s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.24% : 0.000181s : 1: control_data_broadcast_order 0.23% : 0.000177s : 1: convert_after_rewriter 0.29% : 0.000219s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000008s : 1: detach_backward 0.19% : 0.000147s : 1: environ_conv 0.04% : 0.000030s : 1: event_method 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000007s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.01% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000007s : 1: label_fine_grained_interleaved_index 0.02% : 0.000012s : 1: label_micro_interleaved_index 3.76% : 0.002857s : 1: loop_unroll 0.02% : 0.000016s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 5.32% : 0.004039s : 1: mutable_eliminate 0.20% : 0.000151s : 1: offloading_packed_experts 0.05% : 0.000036s : 1: opt.transform.loop_unroll_optimizer 0.22% : 0.000171s : 1: opt.transform.mutable_eliminate 5.28% : 0.004007s : 78: opt.transform.opt_a 0.24% : 0.000183s : 1: opt.transform.opt_after_cconv 0.08% : 0.000064s : 1: opt.transform.opt_after_jit_grad 0.84% : 0.000634s : 28: opt.transform.opt_b 0.34% : 0.000256s : 2: opt.transform.opt_trans_graph 0.55% : 0.000421s : 4: opt.transform.symbol_engine_opt 11.66% : 0.008850s : 1: opt_a 1.25% : 0.000946s : 1: opt_after_cconv 7.77% : 0.005900s : 1: opt_after_jit_grad 2.41% : 0.001832s : 1: opt_b 32.27% : 0.024505s : 1: optimize 0.05% : 0.000040s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000016s : 1: order_py_execute_after_rewriter 0.64% : 0.000483s : 1: overlap_grad_flash_sp 0.01% : 0.000009s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000012s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000016s : 1: overlap_param_gather 0.01% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.23% : 0.000173s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000007s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000010s : 1: partial_unused_args_eliminate 0.01% : 0.000006s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.06% : 0.000047s : 1: pre_auto_parallel 0.05% : 0.000037s : 1: py_interpret_to_execute 0.24% : 0.000183s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.28% : 0.000215s : 1: remove_dup_value 0.71% : 0.000542s : 1: renormalize.infer 0.52% : 0.000397s : 1: renormalize.specialize 0.20% : 0.000152s : 1: reorder_send_recv_between_fp_bp 0.20% : 0.000153s : 1: rewriter_after_jit_bprop_graph 0.52% : 0.000395s : 1: rewriter_after_opt_a 0.14% : 0.000105s : 1: rewriter_before_opt_a 0.01% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.04% : 0.000028s : 1: swap_dp_allreduce_reducescatter 1.32% : 0.001003s : 1: symbol_engine_optimizer 1.47% : 0.001116s : 1: tuple_transform 8.80% : 0.006683s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:18.790.535 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:18.790.868 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0220408, [21] [bootstrap]: 0.0004303 [type_inference]: 0.00859254 [event_method]: 2.234e-05 [auto_monad]: 7.109e-05 [graph_reusing]: 6.06998e-06 [inline]: 2.88003e-06 [add_attr]: 0.0036573, [1] [add_attr_with_inline]: 0.00364537, [1] [Cycle 1]: 9.175e-05, [2] [tag_attr]: 2.325e-05 [meta_addattr_fg_expand]: 5.91e-06 [parallel-infer-symbol]: 3.58999e-06 [pre_auto_parallel]: 4.175e-05 [insert-virtual-dataset]: 2.96001e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 2.44001e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.0075544, [53] [py_interpret_to_execute]: 0.00016436 [rewriter_before_opt_a]: 0.00010679 [opt_a]: 0.00379899, [2] [Cycle 1]: 0.00271676, [45] [expand_dump_flag]: 3.70998e-06 [switch_simplify]: 4.604e-05 [loop_unroll]: 3.164e-05 [a_1]: 0.00071754 [with_stream_mark]: 2.153e-05 [recompute_prepare]: 1.086e-05 [updatestate_depend_eliminate]: 4.92999e-06 [updatestate_assign_eliminate]: 4.25e-06 [updatestate_loads_eliminate]: 4.16001e-06 [parameter_eliminate]: 2.58e-06 [a_2]: 0.00013332 [accelerated_algorithm]: 9.52999e-06 [shard]: 2.53e-06 [meta_shard_fg_expand]: 2.39001e-06 [shard_inline]: 8.57e-06 [merge_send_recv]: 9.87001e-06 [auto_parallel]: 8.64e-06 [parallel]: 2.022e-05 [flash_sp]: 1.038e-05 [merge_comm]: 5.44e-06 [allreduce_fusion]: 4.3e-06 [matmul_add_comm_reduction]: 1.17e-05 [allreduce_slice_to_reducescatter]: 8.50006e-07 [virtual_shard_identity]: 1.052e-05 [virtual_dataset]: 8.38999e-06 [get_grad_eliminate_]: 7.67998e-06 [virtual_output]: 8.15e-06 [merge_forward]: 4.94e-06 [cell_reuse_recompute_pass]: 1.25999e-06 [offload_activation]: 1.197e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.912e-05 [merge_recompute_call_nodes]: 1.71e-06 [before_grad]: 1.34e-05 [set_forward_comm_id_for_comm_node_pass]: 4.99e-06 [meta_fg_expand]: 3.61999e-06 [flash_sp_send_recv_attached]: 2.71e-06 [receive_attached]: 2.53e-06 [after_resolve]: 1.427e-05 [a_after_grad]: 1.375e-05 [renormalize]: 0.00095366 [add_forward_monad_depend]: 8.40999e-06 [auto_monad_grad]: 3.06999e-06 [auto_monad_eliminator]: 2.13e-05 [cse]: 3.999e-05 [a_3]: 8.001e-05 [Cycle 2]: 0.0010631, [45] [expand_dump_flag]: 2.44999e-06 [switch_simplify]: 1.048e-05 [loop_unroll]: 7.75e-06 [a_1]: 0.00019574 [with_stream_mark]: 1.868e-05 [recompute_prepare]: 9.32001e-06 [updatestate_depend_eliminate]: 4.46002e-06 [updatestate_assign_eliminate]: 3.26001e-06 [updatestate_loads_eliminate]: 3.25e-06 [parameter_eliminate]: 1.66e-06 [a_2]: 0.00012567 [accelerated_algorithm]: 9.02e-06 [shard]: 2.83e-06 [meta_shard_fg_expand]: 2.46998e-06 [shard_inline]: 7.7e-06 [merge_send_recv]: 8.27998e-06 [auto_parallel]: 9.88998e-06 [parallel]: 9.41e-06 [flash_sp]: 4.34002e-06 [merge_comm]: 5.67001e-06 [allreduce_fusion]: 4.22998e-06 [matmul_add_comm_reduction]: 9.66998e-06 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 9.74e-06 [virtual_dataset]: 1.168e-05 [get_grad_eliminate_]: 7.60998e-06 [virtual_output]: 7.92e-06 [merge_forward]: 4.87998e-06 [cell_reuse_recompute_pass]: 3.07002e-06 [offload_activation]: 1.135e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.992e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.467e-05 [set_forward_comm_id_for_comm_node_pass]: 4.95999e-06 [meta_fg_expand]: 3.86999e-06 [flash_sp_send_recv_attached]: 2.10002e-06 [receive_attached]: 2.16e-06 [after_resolve]: 1.519e-05 [a_after_grad]: 1.2e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 3.63999e-06 [auto_monad_grad]: 2.37999e-06 [auto_monad_eliminator]: 1.479e-05 [cse]: 3.018e-05 [a_3]: 6.339e-05 [py_interpret_to_execute_after_opt_a]: 2.333e-05 [slice_cell_reuse_recomputed_activation]: 5.35999e-06 [rewriter_after_opt_a]: 6.002e-05 [convert_after_rewriter]: 1.164e-05 [order_py_execute_after_rewriter]: 9.67001e-06 [mutable_eliminate]: 0.00108815 [opt_b]: 0.0004012, [1] [Cycle 1]: 0.00038694, [7] [b_1]: 0.00023871 [b_2]: 1.236e-05 [updatestate_depend_eliminate]: 1.447e-05 [updatestate_assign_eliminate]: 3.80998e-06 [updatestate_loads_eliminate]: 3.55e-06 [renormalize]: 1.35001e-06 [cse]: 4.695e-05 [optimize_parallel_all_gather_comm]: 3.11e-05 [overlap_param_gather]: 4.50999e-06 [cconv]: 4.457e-05 [loop_unroll]: 0.00076283 [opt_after_cconv]: 0.00020999, [1] [Cycle 1]: 0.00019871, [7] [c_1]: 7.403e-05 [parameter_eliminate]: 7.1e-06 [updatestate_depend_eliminate]: 1.124e-05 [updatestate_assign_eliminate]: 3.57002e-06 [updatestate_loads_eliminate]: 3.33e-06 [cse]: 3.519e-05 [renormalize]: 1.10999e-06 [remove_dup_value]: 2.263e-05 [tuple_transform]: 0.00011413, [1] [Cycle 1]: 0.00010619, [4] [d_1]: 6.104e-05 [none_parameter_eliminate]: 2.56e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 1.01e-05 [partial_unused_args_eliminate]: 5.25999e-06 [add_recomputation]: 7.373e-05 [cse_after_recomputation]: 3.664e-05, [1] [Cycle 1]: 2.951e-05, [1] [cse]: 1.776e-05 [environ_conv]: 1.175e-05 [swap_dp_allreduce_reducescatter]: 1.021e-05 [bias_add_comm_swap]: 6.39999e-06 [label_micro_interleaved_index]: 1.021e-05 [label_fine_grained_interleaved_index]: 5.57999e-06 [merge_cast_opt]: 4.05998e-06 [slice_recompute_activation]: 4.70999e-06 [micro_interleaved_order_control]: 5.29e-06 [assign_add_opt]: 3.9e-06 [ForceFp32Comm]: 3.45003e-06 [remove_cast_before_assign_add]: 3.68e-06 [full_micro_interleaved_order_control]: 4.74998e-06 [reorder_send_recv_between_fp_bp]: 5.61998e-06 [comm_op_add_attrs]: 4.33001e-06 [add_comm_op_reuse_tag]: 3.36999e-06 [interleave_split_concat_branches]: 3.80998e-06 [interleave_parallel_branches]: 3.78001e-06 [overlap_opt_shard_in_pipeline]: 3.86001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.13001e-06 [control_data_broadcast_order]: 2.177e-05 [grouped_pairwise_exchange_alltoall]: 4.35e-06 [offloading_packed_experts]: 8.29002e-06 [overlap_recompute_and_grad_model_parallel]: 8.62998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.84002e-06 [overlap_recompute_allgather_and_fa_grad]: 4.1e-06 [overlap_recompute_comm]: 5.12e-06 [overlap_grad_ring_attention]: 7.8e-06 [overlap_grad_flash_sp]: 2.903e-05 [begin_end_overlap_inline]: 2.79999e-06 [split_matmul_comm_elemetwise]: 4.60999e-06 [split_layernorm_comm]: 4.51002e-06 [handle_group_info]: 3.58999e-06 [symbol_engine_optimizer]: 0.00012007, [1] [Cycle 1]: 0.00011243, [6] [build]: 4.60001e-06 [elim_shapecalc]: 1.616e-05 [elim_not_effective]: 1.746e-05 [opt_reshape]: 9.29e-06 [fold_const_symbol]: 1.38e-05 [renormalize]: 2.9002e-07 [detach_backward]: 5.80002e-06 [pipeline_parallel_scheduler]: 2.14999e-06 [auto_monad_reorder]: 2.991e-05 [get_jit_bprop_graph]: 2.18998e-06 [rewriter_after_jit_bprop_graph]: 7.92e-06 [opt_after_jit_grad]: 0.00083429 [validate]: 5.889e-05 Sums bootstrap : 0.000430s : 2.62% type_inference : 0.008593s : 52.40% event_method : 0.000022s : 0.14% auto_monad : 0.000071s : 0.43% graph_reusing : 0.000006s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000042s : 0.25% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000164s : 1.00% optimize.rewriter_before_opt_a : 0.000107s : 0.65% optimize.opt_a.expand_dump_flag : 0.000006s : 0.04% optimize.opt_a.switch_simplify : 0.000057s : 0.34% optimize.opt_a.loop_unroll : 0.000039s : 0.24% optimize.opt_a.a_1 : 0.000913s : 5.57% optimize.opt_a.with_stream_mark : 0.000040s : 0.25% optimize.opt_a.recompute_prepare : 0.000020s : 0.12% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000259s : 1.58% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.11% optimize.opt_a.shard : 0.000005s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.10% optimize.opt_a.merge_send_recv : 0.000018s : 0.11% optimize.opt_a.auto_parallel : 0.000019s : 0.11% optimize.opt_a.parallel : 0.000030s : 0.18% optimize.opt_a.flash_sp : 0.000015s : 0.09% optimize.opt_a.merge_comm : 0.000011s : 0.07% optimize.opt_a.allreduce_fusion : 0.000009s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.12% optimize.opt_a.virtual_dataset : 0.000020s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.09% optimize.opt_a.virtual_output : 0.000016s : 0.10% optimize.opt_a.merge_forward : 0.000010s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000023s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.24% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000028s : 0.17% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.06% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.03% optimize.opt_a.receive_attached : 0.000005s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.18% optimize.opt_a.a_after_grad : 0.000026s : 0.16% optimize.opt_a.renormalize : 0.000954s : 5.82% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000036s : 0.22% optimize.opt_a.cse : 0.000070s : 0.43% optimize.opt_a.a_3 : 0.000143s : 0.87% optimize.py_interpret_to_execute_after_opt_a : 0.000023s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.03% optimize.rewriter_after_opt_a : 0.000060s : 0.37% optimize.convert_after_rewriter : 0.000012s : 0.07% optimize.order_py_execute_after_rewriter : 0.000010s : 0.06% optimize.mutable_eliminate : 0.001088s : 6.64% optimize.opt_b.b_1 : 0.000239s : 1.46% optimize.opt_b.b_2 : 0.000012s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000014s : 0.09% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000047s : 0.29% optimize.optimize_parallel_all_gather_comm : 0.000031s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.03% optimize.cconv : 0.000045s : 0.27% optimize.loop_unroll : 0.000763s : 4.65% optimize.opt_after_cconv.c_1 : 0.000074s : 0.45% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000011s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000035s : 0.21% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000023s : 0.14% optimize.tuple_transform.d_1 : 0.000061s : 0.37% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.03% optimize.add_recomputation : 0.000074s : 0.45% optimize.cse_after_recomputation.cse : 0.000018s : 0.11% optimize.environ_conv : 0.000012s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.06% optimize.bias_add_comm_swap : 0.000006s : 0.04% optimize.label_micro_interleaved_index : 0.000010s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.03% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.03% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000004s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.03% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.02% optimize.interleave_parallel_branches : 0.000004s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000022s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.03% optimize.overlap_grad_ring_attention : 0.000008s : 0.05% optimize.overlap_grad_flash_sp : 0.000029s : 0.18% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.03% optimize.split_layernorm_comm : 0.000005s : 0.03% optimize.handle_group_info : 0.000004s : 0.02% optimize.symbol_engine_optimizer.build : 0.000005s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000030s : 0.18% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000008s : 0.05% opt_after_jit_grad : 0.000834s : 5.09% validate : 0.000059s : 0.36% Time group info: ------[substitution.] 0.000260 39 11.99% : 0.000031s : 3: substitution.cast_eliminate 0.92% : 0.000002s : 3: substitution.elim_not_effective 0.70% : 0.000002s : 3: substitution.fold_const_symbol 2.83% : 0.000007s : 5: substitution.graph_param_transform 67.44% : 0.000175s : 4: substitution.inline 2.37% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.61% : 0.000007s : 6: substitution.remove_not_recompute_node 2.66% : 0.000007s : 4: substitution.replace_old_param 5.12% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator 3.37% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.008531 2 90.97% : 0.007761s : 1: type_inference.infer 9.03% : 0.000770s : 1: type_inference.specialize ------[replace.] 0.000065 8 62.87% : 0.000041s : 4: replace.inline 37.13% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000183 8 93.78% : 0.000172s : 4: match.inline 6.22% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000276 1504 0.89% : 0.000002s : 15: predicate.accumulaten_eliminater 1.42% : 0.000004s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 10: predicate.addn_check_dump 0.95% : 0.000003s : 15: predicate.addn_zero_filter 0.76% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.96% : 0.000005s : 25: predicate.arithmetic_simplify 1.00% : 0.000003s : 15: predicate.cast_eliminate 0.63% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000001s : 5: predicate.const_output_eliminate 0.65% : 0.000002s : 10: predicate.depend_value_elim 0.86% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.10% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.79% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.30% : 0.000004s : 10: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 5: predicate.elim_not_effective 0.62% : 0.000002s : 5: predicate.elim_shapecalc_of_broadcastargs 1.42% : 0.000004s : 20: predicate.environ_add_const_eliminate 1.07% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_depend_swap 1.63% : 0.000004s : 30: predicate.environ_get_eliminate 1.12% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.28% : 0.000004s : 23: predicate.exchange_switch_depend_value 2.19% : 0.000006s : 23: predicate.float_depend_g_call 0.54% : 0.000001s : 10: predicate.float_environ_get_switch 0.79% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.69% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.57% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.29% : 0.000017s : 68: predicate.inline 0.84% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.81% : 0.000002s : 10: predicate.less_batch_normalization 1.69% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.42% : 0.000007s : 44: predicate.load_eliminater 2.00% : 0.000006s : 5: predicate.loop_unroll_after_grad 1.83% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.82% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 0.57% : 0.000002s : 10: predicate.merge_addn 0.72% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.91% : 0.000003s : 15: predicate.minmaximum_grad 2.09% : 0.000006s : 5: predicate.mutable_eliminate 0.44% : 0.000001s : 5: predicate.opt_reshape 0.38% : 0.000001s : 5: predicate.parallel_virtual_node 1.39% : 0.000004s : 23: predicate.partial_defer_inline 1.43% : 0.000004s : 24: predicate.partial_eliminate 0.79% : 0.000002s : 15: predicate.print_const_string_wrapper 0.61% : 0.000002s : 10: predicate.reduce_all_const_elim 1.25% : 0.000003s : 15: predicate.reduce_eliminate 2.37% : 0.000007s : 44: predicate.redundant_stop_gradient_eliminater 0.31% : 0.000001s : 10: predicate.remove_not_recompute_node 1.24% : 0.000003s : 29: predicate.replace_applicator 0.42% : 0.000001s : 10: predicate.replace_old_param 0.42% : 0.000001s : 5: predicate.reset_defer_inline 0.84% : 0.000002s : 15: predicate.reshape_eliminate 0.56% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.65% : 0.000002s : 5: predicate.row_tensor_eliminate 0.82% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.86% : 0.000002s : 10: predicate.shard_identity_eliminate 0.88% : 0.000002s : 10: predicate.special_op_eliminate 0.74% : 0.000002s : 10: predicate.specialize_transform 1.04% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.34% : 0.000004s : 23: predicate.switch_defer_inline 1.78% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.57% : 0.000013s : 74: predicate.switch_simplify 0.92% : 0.000003s : 15: predicate.tile_eliminate 0.93% : 0.000003s : 15: predicate.transpose_eliminate 1.38% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.44% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.30% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.41% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.69% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.15% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 2.95% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.67% : 0.000002s : 5: predicate.value_based_eliminate 0.71% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.50% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000621 11 54.75% : 0.000340s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.25% : 0.000281s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.035961 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.20% : 0.003668s : 1: add_attr 10.15% : 0.003649s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000078s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.23% : 0.000082s : 1: auto_monad 0.11% : 0.000038s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.32% : 0.000476s : 1: bootstrap 0.13% : 0.000048s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000025s : 1: control_data_broadcast_order 0.04% : 0.000015s : 1: convert_after_rewriter 0.11% : 0.000040s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000030s : 1: detach_backward 0.04% : 0.000015s : 1: environ_conv 0.09% : 0.000033s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.02% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000013s : 1: label_micro_interleaved_index 2.15% : 0.000774s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 3.06% : 0.001100s : 1: mutable_eliminate 0.03% : 0.000011s : 1: offloading_packed_experts 0.08% : 0.000027s : 1: opt.transform.loop_unroll_optimizer 0.10% : 0.000035s : 1: opt.transform.mutable_eliminate 4.08% : 0.001469s : 78: opt.transform.opt_a 0.20% : 0.000071s : 1: opt.transform.opt_after_cconv 0.12% : 0.000042s : 1: opt.transform.opt_after_jit_grad 0.46% : 0.000166s : 28: opt.transform.opt_b 0.19% : 0.000068s : 2: opt.transform.opt_trans_graph 0.14% : 0.000052s : 4: opt.transform.symbol_engine_opt 10.57% : 0.003803s : 1: opt_a 0.59% : 0.000214s : 1: opt_after_cconv 2.36% : 0.000850s : 1: opt_after_jit_grad 1.13% : 0.000406s : 1: opt_b 22.20% : 0.007982s : 1: optimize 0.10% : 0.000035s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.09% : 0.000032s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000011s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000009s : 1: partial_unused_args_eliminate 0.03% : 0.000011s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000049s : 1: pre_auto_parallel 0.48% : 0.000172s : 1: py_interpret_to_execute 0.08% : 0.000027s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.07% : 0.000026s : 1: remove_dup_value 1.50% : 0.000541s : 1: renormalize.infer 1.12% : 0.000402s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000066s : 1: rewriter_after_opt_a 0.31% : 0.000111s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000123s : 1: symbol_engine_optimizer 0.33% : 0.000117s : 1: tuple_transform 24.02% : 0.008638s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:19.414.765 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0749117, [21] [bootstrap]: 0.00045126 [type_inference]: 0.0560594 [event_method]: 2.532e-05 [auto_monad]: 7.291e-05 [graph_reusing]: 6.82002e-06 [inline]: 3.71001e-06 [add_attr]: 0.0112847, [1] [add_attr_with_inline]: 0.0112714, [1] [Cycle 1]: 8.125e-05, [2] [tag_attr]: 2.734e-05 [meta_addattr_fg_expand]: 6.12001e-06 [parallel-infer-symbol]: 3.88001e-06 [pre_auto_parallel]: 4.295e-05 [insert-virtual-dataset]: 2.57001e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 2.35002e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.0061609, [53] [py_interpret_to_execute]: 3.254e-05 [rewriter_before_opt_a]: 9.854e-05 [opt_a]: 0.00358872, [2] [Cycle 1]: 0.00269273, [45] [expand_dump_flag]: 3.11999e-06 [switch_simplify]: 4.568e-05 [loop_unroll]: 3.202e-05 [a_1]: 0.0007534 [with_stream_mark]: 2.207e-05 [recompute_prepare]: 1.478e-05 [updatestate_depend_eliminate]: 5.79999e-06 [updatestate_assign_eliminate]: 4.00998e-06 [updatestate_loads_eliminate]: 3.69002e-06 [parameter_eliminate]: 2.14e-06 [a_2]: 0.00010627 [accelerated_algorithm]: 1.014e-05 [shard]: 1.99999e-06 [meta_shard_fg_expand]: 2.31998e-06 [shard_inline]: 9.00001e-06 [merge_send_recv]: 1.062e-05 [auto_parallel]: 9.79999e-06 [parallel]: 2.352e-05 [flash_sp]: 1.136e-05 [merge_comm]: 5.32999e-06 [allreduce_fusion]: 4.81002e-06 [matmul_add_comm_reduction]: 1.189e-05 [allreduce_slice_to_reducescatter]: 8.59989e-07 [virtual_shard_identity]: 1.297e-05 [virtual_dataset]: 9.10999e-06 [get_grad_eliminate_]: 8.82999e-06 [virtual_output]: 8.84998e-06 [merge_forward]: 5.45001e-06 [cell_reuse_recompute_pass]: 2.04e-06 [offload_activation]: 1.228e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.006e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.483e-05 [set_forward_comm_id_for_comm_node_pass]: 5.05001e-06 [meta_fg_expand]: 3.55e-06 [flash_sp_send_recv_attached]: 3.27997e-06 [receive_attached]: 2.84001e-06 [after_resolve]: 1.451e-05 [a_after_grad]: 1.313e-05 [renormalize]: 0.0010125 [add_forward_monad_depend]: 9.69e-06 [auto_monad_grad]: 2.58e-06 [auto_monad_eliminator]: 2.394e-05 [cse]: 4.198e-05 [a_3]: 6.917e-05 [Cycle 2]: 0.00088304, [45] [expand_dump_flag]: 2.39999e-06 [switch_simplify]: 1.081e-05 [loop_unroll]: 8.05999e-06 [a_1]: 0.00019512 [with_stream_mark]: 1.904e-05 [recompute_prepare]: 9.03002e-06 [updatestate_depend_eliminate]: 5.49e-06 [updatestate_assign_eliminate]: 4.22e-06 [updatestate_loads_eliminate]: 3.43999e-06 [parameter_eliminate]: 1.82001e-06 [a_2]: 0.00010325 [accelerated_algorithm]: 9.11998e-06 [shard]: 2.19001e-06 [meta_shard_fg_expand]: 2.00002e-06 [shard_inline]: 8.62998e-06 [merge_send_recv]: 9.61e-06 [auto_parallel]: 8.68001e-06 [parallel]: 8.50999e-06 [flash_sp]: 4.60999e-06 [merge_comm]: 8.40001e-06 [allreduce_fusion]: 4.00998e-06 [matmul_add_comm_reduction]: 9.99999e-06 [allreduce_slice_to_reducescatter]: 8.69972e-07 [virtual_shard_identity]: 1.108e-05 [virtual_dataset]: 8.57e-06 [get_grad_eliminate_]: 7.70998e-06 [virtual_output]: 6.98e-06 [merge_forward]: 4.95001e-06 [cell_reuse_recompute_pass]: 2.94001e-06 [offload_activation]: 1.14e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.578e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.296e-05 [set_forward_comm_id_for_comm_node_pass]: 6.89999e-06 [meta_fg_expand]: 3.27002e-06 [flash_sp_send_recv_attached]: 1.54e-06 [receive_attached]: 2.71e-06 [after_resolve]: 1.574e-05 [a_after_grad]: 1.274e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 2.12001e-06 [auto_monad_grad]: 2.26998e-06 [auto_monad_eliminator]: 1.263e-05 [cse]: 2.482e-05 [a_3]: 4.785e-05 [py_interpret_to_execute_after_opt_a]: 1.988e-05 [slice_cell_reuse_recomputed_activation]: 2.37001e-06 [rewriter_after_opt_a]: 5.447e-05 [convert_after_rewriter]: 8.42998e-06 [order_py_execute_after_rewriter]: 5.96e-06 [mutable_eliminate]: 0.0007628 [opt_b]: 0.00028697, [1] [Cycle 1]: 0.00027726, [7] [b_1]: 0.00017586 [b_2]: 1.144e-05 [updatestate_depend_eliminate]: 9.77001e-06 [updatestate_assign_eliminate]: 3.7e-06 [updatestate_loads_eliminate]: 3.47002e-06 [renormalize]: 9.29984e-07 [cse]: 3.381e-05 [optimize_parallel_all_gather_comm]: 2.225e-05 [overlap_param_gather]: 1.99e-06 [cconv]: 3.571e-05 [loop_unroll]: 0.00047113 [opt_after_cconv]: 0.00013045, [1] [Cycle 1]: 0.00012323, [7] [c_1]: 3.87e-05 [parameter_eliminate]: 5.05999e-06 [updatestate_depend_eliminate]: 8.70999e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 3.03e-06 [cse]: 2.76e-05 [renormalize]: 8.59989e-07 [remove_dup_value]: 1.62e-05 [tuple_transform]: 8.703e-05, [1] [Cycle 1]: 8.233e-05, [4] [d_1]: 5.255e-05 [none_parameter_eliminate]: 1.77999e-06 [renormalize]: 2.80008e-07 [switch_simplify]: 8.50001e-06 [partial_unused_args_eliminate]: 1.76e-06 [add_recomputation]: 6.511e-05 [cse_after_recomputation]: 2.644e-05, [1] [Cycle 1]: 2.15e-05, [1] [cse]: 1.575e-05 [environ_conv]: 6.68e-06 [swap_dp_allreduce_reducescatter]: 6.11e-06 [bias_add_comm_swap]: 3.06999e-06 [label_micro_interleaved_index]: 4.87e-06 [label_fine_grained_interleaved_index]: 2.69999e-06 [merge_cast_opt]: 1.46002e-06 [slice_recompute_activation]: 2.46e-06 [micro_interleaved_order_control]: 2.73e-06 [assign_add_opt]: 1.16002e-06 [ForceFp32Comm]: 8.30012e-07 [remove_cast_before_assign_add]: 9.80013e-07 [full_micro_interleaved_order_control]: 2.67001e-06 [reorder_send_recv_between_fp_bp]: 2.64001e-06 [comm_op_add_attrs]: 9.70002e-07 [add_comm_op_reuse_tag]: 1.17e-06 [interleave_split_concat_branches]: 1.31998e-06 [interleave_parallel_branches]: 1.12e-06 [overlap_opt_shard_in_pipeline]: 1.18001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.41998e-06 [control_data_broadcast_order]: 1.51e-05 [grouped_pairwise_exchange_alltoall]: 1.92999e-06 [offloading_packed_experts]: 4.38999e-06 [overlap_recompute_and_grad_model_parallel]: 5.30999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.17999e-06 [overlap_grad_ring_attention]: 4.68999e-06 [overlap_grad_flash_sp]: 2.589e-05 [begin_end_overlap_inline]: 4.69998e-07 [split_matmul_comm_elemetwise]: 2.90998e-06 [split_layernorm_comm]: 1.74998e-06 [handle_group_info]: 9.89996e-07 [symbol_engine_optimizer]: 8.732e-05, [1] [Cycle 1]: 8.278e-05, [6] [build]: 3.91001e-06 [elim_shapecalc]: 1.206e-05 [elim_not_effective]: 1.567e-05 [opt_reshape]: 8.53001e-06 [fold_const_symbol]: 1.296e-05 [renormalize]: 2.10013e-07 [detach_backward]: 2.27999e-06 [pipeline_parallel_scheduler]: 1.55999e-06 [auto_monad_reorder]: 2.091e-05 [get_jit_bprop_graph]: 2.63998e-06 [rewriter_after_jit_bprop_graph]: 5.96998e-06 [opt_after_jit_grad]: 0.00053722 [validate]: 4.849e-05 Sums bootstrap : 0.000451s : 0.72% type_inference : 0.056059s : 89.63% event_method : 0.000025s : 0.04% auto_monad : 0.000073s : 0.12% graph_reusing : 0.000007s : 0.01% inline : 0.000004s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000043s : 0.07% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.05% optimize.rewriter_before_opt_a : 0.000099s : 0.16% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000056s : 0.09% optimize.opt_a.loop_unroll : 0.000040s : 0.06% optimize.opt_a.a_1 : 0.000949s : 1.52% optimize.opt_a.with_stream_mark : 0.000041s : 0.07% optimize.opt_a.recompute_prepare : 0.000024s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000210s : 0.33% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.03% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000018s : 0.03% optimize.opt_a.merge_send_recv : 0.000020s : 0.03% optimize.opt_a.auto_parallel : 0.000018s : 0.03% optimize.opt_a.parallel : 0.000032s : 0.05% optimize.opt_a.flash_sp : 0.000016s : 0.03% optimize.opt_a.merge_comm : 0.000014s : 0.02% optimize.opt_a.allreduce_fusion : 0.000009s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.04% optimize.opt_a.virtual_dataset : 0.000018s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.03% optimize.opt_a.virtual_output : 0.000016s : 0.03% optimize.opt_a.merge_forward : 0.000010s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000024s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000028s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.02% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000006s : 0.01% optimize.opt_a.after_resolve : 0.000030s : 0.05% optimize.opt_a.a_after_grad : 0.000026s : 0.04% optimize.opt_a.renormalize : 0.001013s : 1.62% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.02% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000037s : 0.06% optimize.opt_a.cse : 0.000067s : 0.11% optimize.opt_a.a_3 : 0.000117s : 0.19% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000054s : 0.09% optimize.convert_after_rewriter : 0.000008s : 0.01% optimize.order_py_execute_after_rewriter : 0.000006s : 0.01% optimize.mutable_eliminate : 0.000763s : 1.22% optimize.opt_b.b_1 : 0.000176s : 0.28% optimize.opt_b.b_2 : 0.000011s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000034s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000036s : 0.06% optimize.loop_unroll : 0.000471s : 0.75% optimize.opt_after_cconv.c_1 : 0.000039s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000028s : 0.04% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.03% optimize.tuple_transform.d_1 : 0.000053s : 0.08% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000065s : 0.10% optimize.cse_after_recomputation.cse : 0.000016s : 0.03% optimize.environ_conv : 0.000007s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000026s : 0.04% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.03% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000537s : 0.86% validate : 0.000048s : 0.08% Time group info: ------[substitution.] 0.000266 39 12.07% : 0.000032s : 3: substitution.cast_eliminate 0.89% : 0.000002s : 3: substitution.elim_not_effective 0.62% : 0.000002s : 3: substitution.fold_const_symbol 2.59% : 0.000007s : 5: substitution.graph_param_transform 68.72% : 0.000183s : 4: substitution.inline 2.04% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.72% : 0.000007s : 6: substitution.remove_not_recompute_node 2.36% : 0.000006s : 4: substitution.replace_old_param 5.04% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator 2.94% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.055970 2 98.31% : 0.055024s : 1: type_inference.infer 1.69% : 0.000945s : 1: type_inference.specialize ------[replace.] 0.000070 8 64.03% : 0.000045s : 4: replace.inline 35.97% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000191 8 93.97% : 0.000180s : 4: match.inline 6.03% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000270 1504 0.92% : 0.000002s : 15: predicate.accumulaten_eliminater 1.10% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.69% : 0.000002s : 10: predicate.addn_check_dump 0.99% : 0.000003s : 15: predicate.addn_zero_filter 0.76% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.41% : 0.000006s : 25: predicate.arithmetic_simplify 1.10% : 0.000003s : 15: predicate.cast_eliminate 0.55% : 0.000001s : 10: predicate.check_bprop_eliminate 0.57% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.68% : 0.000002s : 10: predicate.depend_value_elim 0.91% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.95% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.80% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.32% : 0.000004s : 10: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 5: predicate.elim_not_effective 0.54% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.03% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.06% : 0.000003s : 20: predicate.environ_get_depend_swap 1.66% : 0.000004s : 30: predicate.environ_get_eliminate 1.20% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.40% : 0.000004s : 23: predicate.exchange_switch_depend_value 2.26% : 0.000006s : 23: predicate.float_depend_g_call 0.53% : 0.000001s : 10: predicate.float_environ_get_switch 0.76% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.72% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.59% : 0.000002s : 10: predicate.incorporate_call 0.51% : 0.000001s : 10: predicate.incorporate_call_switch 6.48% : 0.000017s : 68: predicate.inline 0.93% : 0.000003s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.15% : 0.000003s : 10: predicate.less_batch_normalization 1.65% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.35% : 0.000006s : 44: predicate.load_eliminater 0.80% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.94% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.58% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.73% : 0.000002s : 10: predicate.merge_addn 0.56% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.60% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 15: predicate.minmaximum_grad 1.38% : 0.000004s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.40% : 0.000001s : 5: predicate.parallel_virtual_node 2.07% : 0.000006s : 23: predicate.partial_defer_inline 1.50% : 0.000004s : 24: predicate.partial_eliminate 0.93% : 0.000003s : 15: predicate.print_const_string_wrapper 0.78% : 0.000002s : 10: predicate.reduce_all_const_elim 1.15% : 0.000003s : 15: predicate.reduce_eliminate 2.37% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 10: predicate.remove_not_recompute_node 1.32% : 0.000004s : 29: predicate.replace_applicator 0.61% : 0.000002s : 10: predicate.replace_old_param 0.39% : 0.000001s : 5: predicate.reset_defer_inline 0.91% : 0.000002s : 15: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 5: predicate.row_tensor_eliminate 0.89% : 0.000002s : 10: predicate.same_eliminate 0.49% : 0.000001s : 10: predicate.set_cell_output_no_recompute 1.33% : 0.000004s : 10: predicate.shard_identity_eliminate 0.67% : 0.000002s : 10: predicate.special_op_eliminate 0.71% : 0.000002s : 10: predicate.specialize_transform 1.19% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.92% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.34% : 0.000004s : 23: predicate.switch_defer_inline 1.85% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.80% : 0.000013s : 74: predicate.switch_simplify 0.95% : 0.000003s : 15: predicate.tile_eliminate 0.88% : 0.000002s : 15: predicate.transpose_eliminate 1.55% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.33% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.29% : 0.000003s : 25: predicate.tuple_list_get_set_item_eliminator 2.12% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.53% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.20% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 2.84% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.51% : 0.000001s : 5: predicate.value_based_eliminate 0.68% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 10: predicate.virtual_output_eliminate 0.23% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000768 11 54.85% : 0.000422s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.15% : 0.000347s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.095105 192 0.00% : 0.000004s : 1: ForceFp32Comm 11.87% : 0.011292s : 1: add_attr 11.86% : 0.011276s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.07% : 0.000069s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.08% : 0.000080s : 1: auto_monad 0.03% : 0.000025s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.50% : 0.000479s : 1: bootstrap 0.04% : 0.000040s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000018s : 1: control_data_broadcast_order 0.01% : 0.000012s : 1: convert_after_rewriter 0.03% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.03% : 0.000033s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000007s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.50% : 0.000480s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.82% : 0.000775s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000027s : 1: opt.transform.mutable_eliminate 1.60% : 0.001520s : 78: opt.transform.opt_a 0.04% : 0.000037s : 1: opt.transform.opt_after_cconv 0.04% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.16% : 0.000153s : 28: opt.transform.opt_b 0.06% : 0.000059s : 2: opt.transform.opt_trans_graph 0.05% : 0.000045s : 4: opt.transform.symbol_engine_opt 3.78% : 0.003592s : 1: opt_a 0.14% : 0.000134s : 1: opt_after_cconv 0.58% : 0.000548s : 1: opt_after_jit_grad 0.31% : 0.000291s : 1: opt_b 6.48% : 0.006167s : 1: optimize 0.03% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000010s : 1: order_py_execute_after_rewriter 0.03% : 0.000030s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.05% : 0.000047s : 1: pre_auto_parallel 0.04% : 0.000037s : 1: py_interpret_to_execute 0.03% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000020s : 1: remove_dup_value 0.61% : 0.000580s : 1: renormalize.infer 0.44% : 0.000420s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000060s : 1: rewriter_after_opt_a 0.11% : 0.000103s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000090s : 1: symbol_engine_optimizer 0.09% : 0.000090s : 1: tuple_transform 58.97% : 0.056087s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:20.247.761 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:20.248.049 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0661638, [21] [bootstrap]: 0.00044445 [type_inference]: 0.00614963 [event_method]: 2.052e-05 [auto_monad]: 6.446e-05 [graph_reusing]: 6.74999e-06 [inline]: 2.47001e-06 [add_attr]: 0.00339737, [1] [add_attr_with_inline]: 0.00338673, [1] [Cycle 1]: 9.147e-05, [2] [tag_attr]: 2.157e-05 [meta_addattr_fg_expand]: 6.27001e-06 [parallel-infer-symbol]: 3.97e-06 [pre_auto_parallel]: 3.981e-05 [insert-virtual-dataset]: 2.78e-06 [parallel-infer-symbol-second]: 7.29982e-07 [dataset_repeat_opt]: 1.85001e-06 [pipeline_split]: 1.62999e-06 [optimize]: 0.0548932, [53] [py_interpret_to_execute]: 3.361e-05 [rewriter_before_opt_a]: 9.315e-05 [opt_a]: 0.0522833, [2] [Cycle 1]: 0.0513491, [45] [expand_dump_flag]: 3.7e-06 [switch_simplify]: 4.381e-05 [loop_unroll]: 3.056e-05 [a_1]: 0.00064876 [with_stream_mark]: 1.956e-05 [recompute_prepare]: 9.19998e-06 [updatestate_depend_eliminate]: 4.05e-06 [updatestate_assign_eliminate]: 3.17002e-06 [updatestate_loads_eliminate]: 3.44001e-06 [parameter_eliminate]: 1.94999e-06 [a_2]: 0.00010987 [accelerated_algorithm]: 7.3e-06 [shard]: 2.27001e-06 [meta_shard_fg_expand]: 2.06998e-06 [shard_inline]: 6.83998e-06 [merge_send_recv]: 8.65999e-06 [auto_parallel]: 7.26001e-06 [parallel]: 2.126e-05 [flash_sp]: 9.09e-06 [merge_comm]: 4.12e-06 [allreduce_fusion]: 3.69002e-06 [matmul_add_comm_reduction]: 1.003e-05 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 9.31e-06 [virtual_dataset]: 7.46999e-06 [get_grad_eliminate_]: 6.73e-06 [virtual_output]: 6.71999e-06 [merge_forward]: 4.57e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 1.076e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.485e-05 [merge_recompute_call_nodes]: 1.99e-06 [before_grad]: 1.132e-05 [set_forward_comm_id_for_comm_node_pass]: 4.07e-06 [meta_fg_expand]: 3.58e-06 [flash_sp_send_recv_attached]: 2.64999e-06 [receive_attached]: 3.00002e-06 [after_resolve]: 1.201e-05 [a_after_grad]: 1.008e-05 [renormalize]: 0.0496819 [add_forward_monad_depend]: 1.259e-05 [auto_monad_grad]: 2.69001e-06 [auto_monad_eliminator]: 2.653e-05 [cse]: 3.381e-05 [a_3]: 7.887e-05 [Cycle 2]: 0.00091624, [45] [expand_dump_flag]: 2.12999e-06 [switch_simplify]: 9.60001e-06 [loop_unroll]: 8.35001e-06 [a_1]: 0.00014286 [with_stream_mark]: 2.195e-05 [recompute_prepare]: 6.71999e-06 [updatestate_depend_eliminate]: 4.37e-06 [updatestate_assign_eliminate]: 3.04999e-06 [updatestate_loads_eliminate]: 2.93998e-06 [parameter_eliminate]: 1.93002e-06 [a_2]: 0.00010166 [accelerated_algorithm]: 8.03001e-06 [shard]: 2.36e-06 [meta_shard_fg_expand]: 2.43e-06 [shard_inline]: 6.38998e-06 [merge_send_recv]: 9.56998e-06 [auto_parallel]: 1.026e-05 [parallel]: 8.97999e-06 [flash_sp]: 4.28001e-06 [merge_comm]: 3.63e-06 [allreduce_fusion]: 3.49001e-06 [matmul_add_comm_reduction]: 9.35001e-06 [allreduce_slice_to_reducescatter]: 7.49977e-07 [virtual_shard_identity]: 7.45e-06 [virtual_dataset]: 6.37001e-06 [get_grad_eliminate_]: 6.24999e-06 [virtual_output]: 6.14999e-06 [merge_forward]: 5.86998e-06 [cell_reuse_recompute_pass]: 3.51999e-06 [offload_activation]: 1.058e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.97e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.157e-05 [set_forward_comm_id_for_comm_node_pass]: 4.37e-06 [meta_fg_expand]: 3.05002e-06 [flash_sp_send_recv_attached]: 1.88002e-06 [receive_attached]: 2.90998e-06 [after_resolve]: 1.296e-05 [a_after_grad]: 1.058e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.23002e-06 [auto_monad_grad]: 1.32999e-06 [auto_monad_eliminator]: 8.72998e-06 [cse]: 1.617e-05 [a_3]: 4.917e-05 [py_interpret_to_execute_after_opt_a]: 2.194e-05 [slice_cell_reuse_recomputed_activation]: 5.47999e-06 [rewriter_after_opt_a]: 4.399e-05 [convert_after_rewriter]: 9.99001e-06 [order_py_execute_after_rewriter]: 8.3e-06 [mutable_eliminate]: 0.00074302 [opt_b]: 0.00029615, [1] [Cycle 1]: 0.00028607, [7] [b_1]: 0.00018527 [b_2]: 1.002e-05 [updatestate_depend_eliminate]: 6.46e-06 [updatestate_assign_eliminate]: 3.04999e-06 [updatestate_loads_eliminate]: 2.43998e-06 [renormalize]: 5.69999e-07 [cse]: 1.988e-05 [optimize_parallel_all_gather_comm]: 2.064e-05 [overlap_param_gather]: 4.58999e-06 [cconv]: 3.469e-05 [loop_unroll]: 0.00045631 [opt_after_cconv]: 0.00013008, [1] [Cycle 1]: 0.00012071, [7] [c_1]: 3.2e-05 [parameter_eliminate]: 3.98999e-06 [updatestate_depend_eliminate]: 6.01e-06 [updatestate_assign_eliminate]: 2.68e-06 [updatestate_loads_eliminate]: 2.41998e-06 [cse]: 1.721e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.67e-05 [tuple_transform]: 8.938e-05, [1] [Cycle 1]: 8.192e-05, [4] [d_1]: 4.328e-05 [none_parameter_eliminate]: 1.67001e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 7.13e-06 [partial_unused_args_eliminate]: 4.35e-06 [add_recomputation]: 5.052e-05 [cse_after_recomputation]: 2.685e-05, [1] [Cycle 1]: 1.998e-05, [1] [cse]: 1.064e-05 [environ_conv]: 8.77e-06 [swap_dp_allreduce_reducescatter]: 7.77e-06 [bias_add_comm_swap]: 5.34e-06 [label_micro_interleaved_index]: 7.23e-06 [label_fine_grained_interleaved_index]: 5.66e-06 [merge_cast_opt]: 3.81999e-06 [slice_recompute_activation]: 4.47998e-06 [micro_interleaved_order_control]: 4.94e-06 [assign_add_opt]: 3.62998e-06 [ForceFp32Comm]: 3.59002e-06 [remove_cast_before_assign_add]: 3.5e-06 [full_micro_interleaved_order_control]: 4.52998e-06 [reorder_send_recv_between_fp_bp]: 5.92001e-06 [comm_op_add_attrs]: 3.71001e-06 [add_comm_op_reuse_tag]: 1.756e-05 [interleave_split_concat_branches]: 4.62998e-06 [interleave_parallel_branches]: 3.78001e-06 [overlap_opt_shard_in_pipeline]: 3.76001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.2e-06 [control_data_broadcast_order]: 1.549e-05 [grouped_pairwise_exchange_alltoall]: 3.88001e-06 [offloading_packed_experts]: 7.23e-06 [overlap_recompute_and_grad_model_parallel]: 8.18001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.75998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.69002e-06 [overlap_recompute_comm]: 5.05999e-06 [overlap_grad_ring_attention]: 6.81001e-06 [overlap_grad_flash_sp]: 2.531e-05 [begin_end_overlap_inline]: 2.88998e-06 [split_matmul_comm_elemetwise]: 4.48001e-06 [split_layernorm_comm]: 3.93001e-06 [handle_group_info]: 3.39001e-06 [symbol_engine_optimizer]: 0.00010029, [1] [Cycle 1]: 9.278e-05, [6] [build]: 3.6e-06 [elim_shapecalc]: 9.94001e-06 [elim_not_effective]: 1.377e-05 [opt_reshape]: 7.93999e-06 [fold_const_symbol]: 1.105e-05 [renormalize]: 2.3999e-07 [detach_backward]: 4.16001e-06 [pipeline_parallel_scheduler]: 1.92001e-06 [auto_monad_reorder]: 1.943e-05 [get_jit_bprop_graph]: 1.77999e-06 [rewriter_after_jit_bprop_graph]: 5.15001e-06 [opt_after_jit_grad]: 0.0004866 [validate]: 4.236e-05 Sums bootstrap : 0.000444s : 0.73% type_inference : 0.006150s : 10.09% event_method : 0.000021s : 0.03% auto_monad : 0.000064s : 0.11% graph_reusing : 0.000007s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000040s : 0.07% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000034s : 0.06% optimize.rewriter_before_opt_a : 0.000093s : 0.15% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000053s : 0.09% optimize.opt_a.loop_unroll : 0.000039s : 0.06% optimize.opt_a.a_1 : 0.000792s : 1.30% optimize.opt_a.with_stream_mark : 0.000042s : 0.07% optimize.opt_a.recompute_prepare : 0.000016s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000212s : 0.35% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.03% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.02% optimize.opt_a.merge_send_recv : 0.000018s : 0.03% optimize.opt_a.auto_parallel : 0.000018s : 0.03% optimize.opt_a.parallel : 0.000030s : 0.05% optimize.opt_a.flash_sp : 0.000013s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.03% optimize.opt_a.virtual_dataset : 0.000014s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.02% optimize.opt_a.virtual_output : 0.000013s : 0.02% optimize.opt_a.merge_forward : 0.000010s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000023s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000006s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.04% optimize.opt_a.a_after_grad : 0.000021s : 0.03% optimize.opt_a.renormalize : 0.049682s : 81.51% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.06% optimize.opt_a.cse : 0.000050s : 0.08% optimize.opt_a.a_3 : 0.000128s : 0.21% optimize.py_interpret_to_execute_after_opt_a : 0.000022s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000044s : 0.07% optimize.convert_after_rewriter : 0.000010s : 0.02% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000743s : 1.22% optimize.opt_b.b_1 : 0.000185s : 0.30% optimize.opt_b.b_2 : 0.000010s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000020s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.03% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000035s : 0.06% optimize.loop_unroll : 0.000456s : 0.75% optimize.opt_after_cconv.c_1 : 0.000032s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000017s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.03% optimize.tuple_transform.d_1 : 0.000043s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000004s : 0.01% optimize.add_recomputation : 0.000051s : 0.08% optimize.cse_after_recomputation.cse : 0.000011s : 0.02% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000005s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000004s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000018s : 0.03% optimize.interleave_split_concat_branches : 0.000005s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000015s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000025s : 0.04% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000019s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000487s : 0.80% validate : 0.000042s : 0.07% Time group info: ------[substitution.] 0.000211 29 0.83% : 0.000002s : 2: substitution.elim_not_effective 0.70% : 0.000001s : 2: substitution.fold_const_symbol 2.88% : 0.000006s : 4: substitution.graph_param_transform 76.01% : 0.000160s : 4: substitution.inline 2.54% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.57% : 0.000005s : 4: substitution.remove_not_recompute_node 3.37% : 0.000007s : 4: substitution.replace_old_param 7.55% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator 3.55% : 0.000007s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006087 2 88.05% : 0.005360s : 1: type_inference.infer 11.95% : 0.000727s : 1: type_inference.specialize ------[replace.] 0.000061 8 64.64% : 0.000040s : 4: replace.inline 35.36% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000171 8 91.89% : 0.000157s : 4: match.inline 8.11% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000217 1278 0.96% : 0.000002s : 13: predicate.accumulaten_eliminater 0.69% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 0.83% : 0.000002s : 13: predicate.addn_zero_filter 0.78% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.30% : 0.000005s : 21: predicate.arithmetic_simplify 1.10% : 0.000002s : 13: predicate.cast_eliminate 0.60% : 0.000001s : 8: predicate.check_bprop_eliminate 0.64% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.68% : 0.000001s : 8: predicate.depend_value_elim 1.02% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.09% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 4: predicate.elim_not_effective 0.32% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.14% : 0.000002s : 17: predicate.environ_get_depend_swap 1.67% : 0.000004s : 25: predicate.environ_get_eliminate 1.14% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.39% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.47% : 0.000005s : 21: predicate.float_depend_g_call 0.51% : 0.000001s : 8: predicate.float_environ_get_switch 0.78% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.66% : 0.000001s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.56% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.18% : 0.000013s : 58: predicate.inline 0.74% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.95% : 0.000002s : 8: predicate.less_batch_normalization 1.83% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.33% : 0.000005s : 38: predicate.load_eliminater 0.76% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.38% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.66% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.47% : 0.000001s : 8: predicate.merge_addn 0.54% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 13: predicate.minmaximum_grad 1.12% : 0.000002s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.47% : 0.000001s : 4: predicate.parallel_virtual_node 1.70% : 0.000004s : 21: predicate.partial_defer_inline 1.58% : 0.000003s : 21: predicate.partial_eliminate 0.91% : 0.000002s : 13: predicate.print_const_string_wrapper 0.62% : 0.000001s : 8: predicate.reduce_all_const_elim 1.17% : 0.000003s : 13: predicate.reduce_eliminate 2.46% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 8: predicate.remove_not_recompute_node 1.46% : 0.000003s : 25: predicate.replace_applicator 0.58% : 0.000001s : 8: predicate.replace_old_param 0.36% : 0.000001s : 4: predicate.reset_defer_inline 1.19% : 0.000003s : 13: predicate.reshape_eliminate 0.66% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 4: predicate.row_tensor_eliminate 0.92% : 0.000002s : 8: predicate.same_eliminate 0.43% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 8: predicate.shard_identity_eliminate 0.65% : 0.000001s : 8: predicate.special_op_eliminate 0.65% : 0.000001s : 8: predicate.specialize_transform 0.99% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 1.06% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.55% : 0.000003s : 21: predicate.switch_defer_inline 2.02% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.00% : 0.000011s : 67: predicate.switch_simplify 0.89% : 0.000002s : 13: predicate.tile_eliminate 0.97% : 0.000002s : 13: predicate.transpose_eliminate 1.76% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.44% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.04% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.42% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.20% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.76% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.35% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.03% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.66% : 0.000001s : 4: predicate.value_based_eliminate 0.77% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.68% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000660 11 48.70% : 0.000321s : 5: func_graph_cloner_run.FuncGraphClonerGraph 51.30% : 0.000339s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.175518 192 0.00% : 0.000006s : 1: ForceFp32Comm 1.94% : 0.003408s : 1: add_attr 1.93% : 0.003391s : 1: add_attr_with_inline 0.01% : 0.000021s : 1: add_comm_op_reuse_tag 0.03% : 0.000054s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.04% : 0.000073s : 1: auto_monad 0.02% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.28% : 0.000493s : 1: bootstrap 0.02% : 0.000038s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000019s : 1: control_data_broadcast_order 0.01% : 0.000013s : 1: convert_after_rewriter 0.02% : 0.000030s : 1: cse_after_recomputation 0.00% : 0.000007s : 1: dataset_repeat_opt 0.01% : 0.000018s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.02% : 0.000031s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000008s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.26% : 0.000462s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.43% : 0.000751s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000018s : 1: opt.transform.mutable_eliminate 0.71% : 0.001242s : 78: opt.transform.opt_a 0.02% : 0.000030s : 1: opt.transform.opt_after_cconv 0.01% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000119s : 28: opt.transform.opt_b 0.03% : 0.000048s : 2: opt.transform.opt_trans_graph 0.02% : 0.000039s : 4: opt.transform.symbol_engine_opt 29.79% : 0.052287s : 1: opt_a 0.08% : 0.000134s : 1: opt_after_cconv 0.28% : 0.000497s : 1: opt_after_jit_grad 0.17% : 0.000300s : 1: opt_b 31.45% : 0.055208s : 1: optimize 0.01% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000011s : 1: order_py_execute_after_rewriter 0.02% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.03% : 0.000047s : 1: pre_auto_parallel 0.02% : 0.000038s : 1: py_interpret_to_execute 0.01% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 28.00% : 0.049138s : 1: renormalize.infer 0.30% : 0.000523s : 1: renormalize.specialize 0.00% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000048s : 1: rewriter_after_opt_a 0.06% : 0.000098s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000007s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.06% : 0.000103s : 1: symbol_engine_optimizer 0.05% : 0.000092s : 1: tuple_transform 3.53% : 0.006193s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:21.299.476 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0178766, [21] [bootstrap]: 0.00047759 [type_inference]: 0.00755845 [event_method]: 2.027e-05 [auto_monad]: 6.688e-05 [graph_reusing]: 6.33e-06 [inline]: 2.44999e-06 [add_attr]: 0.00381122, [1] [add_attr_with_inline]: 0.00379867, [1] [Cycle 1]: 7.352e-05, [2] [tag_attr]: 2.531e-05 [meta_addattr_fg_expand]: 7.31999e-06 [parallel-infer-symbol]: 4.05998e-06 [pre_auto_parallel]: 3.798e-05 [insert-virtual-dataset]: 2.59001e-06 [parallel-infer-symbol-second]: 1.00001e-06 [dataset_repeat_opt]: 2.24001e-06 [pipeline_split]: 1.69e-06 [optimize]: 0.00512125, [53] [py_interpret_to_execute]: 2.672e-05 [rewriter_before_opt_a]: 9.112e-05 [opt_a]: 0.00283202, [2] [Cycle 1]: 0.00212255, [45] [expand_dump_flag]: 3.09999e-06 [switch_simplify]: 4.364e-05 [loop_unroll]: 3.143e-05 [a_1]: 0.00063646 [with_stream_mark]: 1.643e-05 [recompute_prepare]: 9.84001e-06 [updatestate_depend_eliminate]: 3.70998e-06 [updatestate_assign_eliminate]: 3.33e-06 [updatestate_loads_eliminate]: 3.09999e-06 [parameter_eliminate]: 1.85001e-06 [a_2]: 8.232e-05 [accelerated_algorithm]: 7.4e-06 [shard]: 2.10002e-06 [meta_shard_fg_expand]: 1.82999e-06 [shard_inline]: 6.43e-06 [merge_send_recv]: 8.42e-06 [auto_parallel]: 7.16999e-06 [parallel]: 2.253e-05 [flash_sp]: 8.62e-06 [merge_comm]: 3.85998e-06 [allreduce_fusion]: 3.48e-06 [matmul_add_comm_reduction]: 1.113e-05 [allreduce_slice_to_reducescatter]: 1.04e-06 [virtual_shard_identity]: 8.29002e-06 [virtual_dataset]: 6.78998e-06 [get_grad_eliminate_]: 6.11998e-06 [virtual_output]: 6.66e-06 [merge_forward]: 4.26001e-06 [cell_reuse_recompute_pass]: 1.52001e-06 [offload_activation]: 9.76e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.438e-05 [merge_recompute_call_nodes]: 1.67999e-06 [before_grad]: 1.084e-05 [set_forward_comm_id_for_comm_node_pass]: 4.08001e-06 [meta_fg_expand]: 2.88998e-06 [flash_sp_send_recv_attached]: 2.55997e-06 [receive_attached]: 2.80002e-06 [after_resolve]: 1.227e-05 [a_after_grad]: 1.088e-05 [renormalize]: 0.00074035 [add_forward_monad_depend]: 5.87999e-06 [auto_monad_grad]: 2.61e-06 [auto_monad_eliminator]: 1.498e-05 [cse]: 3.093e-05 [a_3]: 5.007e-05 [Cycle 2]: 0.00069802, [45] [expand_dump_flag]: 1.96e-06 [switch_simplify]: 8.06001e-06 [loop_unroll]: 6.23998e-06 [a_1]: 0.00015488 [with_stream_mark]: 1.369e-05 [recompute_prepare]: 7.11001e-06 [updatestate_depend_eliminate]: 2.94999e-06 [updatestate_assign_eliminate]: 2.52001e-06 [updatestate_loads_eliminate]: 2.59001e-06 [parameter_eliminate]: 1.17e-06 [a_2]: 7.316e-05 [accelerated_algorithm]: 6.81001e-06 [shard]: 1.24e-06 [meta_shard_fg_expand]: 1.61998e-06 [shard_inline]: 6.10002e-06 [merge_send_recv]: 6.19001e-06 [auto_parallel]: 6.38e-06 [parallel]: 5.87999e-06 [flash_sp]: 3.71999e-06 [merge_comm]: 3.7e-06 [allreduce_fusion]: 3.88001e-06 [matmul_add_comm_reduction]: 5.72001e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 1.091e-05 [virtual_dataset]: 6.26e-06 [get_grad_eliminate_]: 5.81003e-06 [virtual_output]: 5.94e-06 [merge_forward]: 3.95e-06 [cell_reuse_recompute_pass]: 2.09e-06 [offload_activation]: 7.62002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.345e-05 [merge_recompute_call_nodes]: 1.06002e-06 [before_grad]: 9.51e-06 [set_forward_comm_id_for_comm_node_pass]: 3.58999e-06 [meta_fg_expand]: 2.56e-06 [flash_sp_send_recv_attached]: 1.26002e-06 [receive_attached]: 1.48002e-06 [after_resolve]: 1.124e-05 [a_after_grad]: 1.017e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.23002e-06 [auto_monad_grad]: 1.19e-06 [auto_monad_eliminator]: 8.07998e-06 [cse]: 1.488e-05 [a_3]: 3.795e-05 [py_interpret_to_execute_after_opt_a]: 1.129e-05 [slice_cell_reuse_recomputed_activation]: 2.12001e-06 [rewriter_after_opt_a]: 3.923e-05 [convert_after_rewriter]: 7.15e-06 [order_py_execute_after_rewriter]: 5.10001e-06 [mutable_eliminate]: 0.00068268 [opt_b]: 0.00022394, [1] [Cycle 1]: 0.00021686, [7] [b_1]: 0.00013977 [b_2]: 9.42001e-06 [updatestate_depend_eliminate]: 5.79e-06 [updatestate_assign_eliminate]: 2.33998e-06 [updatestate_loads_eliminate]: 2.41e-06 [renormalize]: 7.49977e-07 [cse]: 1.893e-05 [optimize_parallel_all_gather_comm]: 1.778e-05 [overlap_param_gather]: 2.43998e-06 [cconv]: 2.974e-05 [loop_unroll]: 0.00045595 [opt_after_cconv]: 0.00010559, [1] [Cycle 1]: 9.926e-05, [7] [c_1]: 3.183e-05 [parameter_eliminate]: 4.02998e-06 [updatestate_depend_eliminate]: 5.01002e-06 [updatestate_assign_eliminate]: 2.76999e-06 [updatestate_loads_eliminate]: 2.34001e-06 [cse]: 1.773e-05 [renormalize]: 5.40022e-07 [remove_dup_value]: 1.479e-05 [tuple_transform]: 7.773e-05, [1] [Cycle 1]: 7.223e-05, [4] [d_1]: 4.51e-05 [none_parameter_eliminate]: 1.51998e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 6.76999e-06 [partial_unused_args_eliminate]: 2.14999e-06 [add_recomputation]: 5.434e-05 [cse_after_recomputation]: 2.317e-05, [1] [Cycle 1]: 1.798e-05, [1] [cse]: 1.198e-05 [environ_conv]: 5.32999e-06 [swap_dp_allreduce_reducescatter]: 5.97001e-06 [bias_add_comm_swap]: 3.26999e-06 [label_micro_interleaved_index]: 4.50001e-06 [label_fine_grained_interleaved_index]: 2.84999e-06 [merge_cast_opt]: 1.56998e-06 [slice_recompute_activation]: 2.17999e-06 [micro_interleaved_order_control]: 2.58e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 8.60018e-07 [remove_cast_before_assign_add]: 1.38002e-06 [full_micro_interleaved_order_control]: 2.14e-06 [reorder_send_recv_between_fp_bp]: 3.23e-06 [comm_op_add_attrs]: 1.08001e-06 [add_comm_op_reuse_tag]: 1.25999e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.19003e-06 [overlap_opt_shard_in_pipeline]: 1.20001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.21998e-06 [control_data_broadcast_order]: 1.316e-05 [grouped_pairwise_exchange_alltoall]: 1.75001e-06 [offloading_packed_experts]: 4.33999e-06 [overlap_recompute_and_grad_model_parallel]: 5.24e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.52001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.60999e-06 [overlap_recompute_comm]: 2.23002e-06 [overlap_grad_ring_attention]: 4.66002e-06 [overlap_grad_flash_sp]: 2.316e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.58e-06 [split_layernorm_comm]: 1.86998e-06 [handle_group_info]: 9.10019e-07 [symbol_engine_optimizer]: 8.235e-05, [1] [Cycle 1]: 7.786e-05, [6] [build]: 4.63001e-06 [elim_shapecalc]: 1.118e-05 [elim_not_effective]: 1.362e-05 [opt_reshape]: 7.82002e-06 [fold_const_symbol]: 1.07e-05 [renormalize]: 2.3999e-07 [detach_backward]: 2.29999e-06 [pipeline_parallel_scheduler]: 1.67001e-06 [auto_monad_reorder]: 1.778e-05 [get_jit_bprop_graph]: 1.86e-06 [rewriter_after_jit_bprop_graph]: 5.49e-06 [opt_after_jit_grad]: 0.00052122 [validate]: 4.566e-05 Sums bootstrap : 0.000478s : 3.65% type_inference : 0.007558s : 57.81% event_method : 0.000020s : 0.16% auto_monad : 0.000067s : 0.51% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000038s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000027s : 0.20% optimize.rewriter_before_opt_a : 0.000091s : 0.70% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000052s : 0.40% optimize.opt_a.loop_unroll : 0.000038s : 0.29% optimize.opt_a.a_1 : 0.000791s : 6.05% optimize.opt_a.with_stream_mark : 0.000030s : 0.23% optimize.opt_a.recompute_prepare : 0.000017s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000155s : 1.19% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.11% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.10% optimize.opt_a.merge_send_recv : 0.000015s : 0.11% optimize.opt_a.auto_parallel : 0.000014s : 0.10% optimize.opt_a.parallel : 0.000028s : 0.22% optimize.opt_a.flash_sp : 0.000012s : 0.09% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.15% optimize.opt_a.virtual_dataset : 0.000013s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.09% optimize.opt_a.virtual_output : 0.000013s : 0.10% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.13% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.21% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.16% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.18% optimize.opt_a.a_after_grad : 0.000021s : 0.16% optimize.opt_a.renormalize : 0.000740s : 5.66% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.05% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.18% optimize.opt_a.cse : 0.000046s : 0.35% optimize.opt_a.a_3 : 0.000088s : 0.67% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000039s : 0.30% optimize.convert_after_rewriter : 0.000007s : 0.05% optimize.order_py_execute_after_rewriter : 0.000005s : 0.04% optimize.mutable_eliminate : 0.000683s : 5.22% optimize.opt_b.b_1 : 0.000140s : 1.07% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000019s : 0.14% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.14% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000030s : 0.23% optimize.loop_unroll : 0.000456s : 3.49% optimize.opt_after_cconv.c_1 : 0.000032s : 0.24% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.14% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.11% optimize.tuple_transform.d_1 : 0.000045s : 0.34% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.05% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000054s : 0.42% optimize.cse_after_recomputation.cse : 0.000012s : 0.09% optimize.environ_conv : 0.000005s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.10% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000023s : 0.18% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.10% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.14% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000521s : 3.99% validate : 0.000046s : 0.35% Time group info: ------[substitution.] 0.000199 29 0.92% : 0.000002s : 2: substitution.elim_not_effective 0.84% : 0.000002s : 2: substitution.fold_const_symbol 3.50% : 0.000007s : 4: substitution.graph_param_transform 75.38% : 0.000150s : 4: substitution.inline 1.90% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.84% : 0.000006s : 4: substitution.remove_not_recompute_node 2.28% : 0.000005s : 4: substitution.replace_old_param 8.35% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator 3.98% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.007483 2 89.42% : 0.006691s : 1: type_inference.infer 10.58% : 0.000792s : 1: type_inference.specialize ------[replace.] 0.000061 8 62.08% : 0.000038s : 4: replace.inline 37.92% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000162 8 90.89% : 0.000147s : 4: match.inline 9.11% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000230 1278 0.78% : 0.000002s : 13: predicate.accumulaten_eliminater 0.86% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 8: predicate.addn_check_dump 0.83% : 0.000002s : 13: predicate.addn_zero_filter 0.98% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 10.09% : 0.000023s : 21: predicate.arithmetic_simplify 1.05% : 0.000002s : 13: predicate.cast_eliminate 0.50% : 0.000001s : 8: predicate.check_bprop_eliminate 0.47% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.48% : 0.000001s : 8: predicate.depend_value_elim 0.82% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.96% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.92% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 4: predicate.elim_not_effective 0.54% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 0.96% : 0.000002s : 17: predicate.environ_add_const_eliminate 0.93% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.94% : 0.000002s : 17: predicate.environ_get_depend_swap 1.56% : 0.000004s : 25: predicate.environ_get_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.32% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.18% : 0.000005s : 21: predicate.float_depend_g_call 0.46% : 0.000001s : 8: predicate.float_environ_get_switch 0.72% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.65% : 0.000001s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.64% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 5.54% : 0.000013s : 58: predicate.inline 0.71% : 0.000002s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.00% : 0.000002s : 8: predicate.less_batch_normalization 1.67% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.34% : 0.000005s : 38: predicate.load_eliminater 0.87% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.16% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.49% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.49% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.71% : 0.000002s : 13: predicate.minmaximum_grad 1.16% : 0.000003s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 1.70% : 0.000004s : 21: predicate.partial_defer_inline 1.47% : 0.000003s : 21: predicate.partial_eliminate 0.87% : 0.000002s : 13: predicate.print_const_string_wrapper 0.50% : 0.000001s : 8: predicate.reduce_all_const_elim 1.14% : 0.000003s : 13: predicate.reduce_eliminate 2.17% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 8: predicate.remove_not_recompute_node 1.23% : 0.000003s : 25: predicate.replace_applicator 0.48% : 0.000001s : 8: predicate.replace_old_param 0.24% : 0.000001s : 4: predicate.reset_defer_inline 0.97% : 0.000002s : 13: predicate.reshape_eliminate 0.49% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 4: predicate.row_tensor_eliminate 0.78% : 0.000002s : 8: predicate.same_eliminate 0.41% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.70% : 0.000002s : 8: predicate.shard_identity_eliminate 0.63% : 0.000001s : 8: predicate.special_op_eliminate 0.70% : 0.000002s : 8: predicate.specialize_transform 0.98% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.38% : 0.000003s : 21: predicate.switch_defer_inline 1.87% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.73% : 0.000011s : 67: predicate.switch_simplify 0.75% : 0.000002s : 13: predicate.tile_eliminate 0.85% : 0.000002s : 13: predicate.transpose_eliminate 1.47% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.40% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.25% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.01% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.38% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 1.98% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.71% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.13% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.90% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.48% : 0.000001s : 4: predicate.value_based_eliminate 0.68% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.57% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000657 11 58.42% : 0.000384s : 5: func_graph_cloner_run.FuncGraphClonerGraph 41.58% : 0.000273s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028925 192 0.01% : 0.000004s : 1: ForceFp32Comm 13.20% : 0.003817s : 1: add_attr 13.15% : 0.003803s : 1: add_attr_with_inline 0.02% : 0.000005s : 1: add_comm_op_reuse_tag 0.20% : 0.000059s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.25% : 0.000072s : 1: auto_monad 0.07% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.77% : 0.000513s : 1: bootstrap 0.11% : 0.000033s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.09% : 0.000026s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.09% : 0.000027s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.61% : 0.000465s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.39% : 0.000692s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 4.23% : 0.001224s : 78: opt.transform.opt_a 0.10% : 0.000030s : 1: opt.transform.opt_after_cconv 0.10% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000116s : 28: opt.transform.opt_b 0.17% : 0.000050s : 2: opt.transform.opt_trans_graph 0.13% : 0.000039s : 4: opt.transform.symbol_engine_opt 9.80% : 0.002836s : 1: opt_a 0.38% : 0.000109s : 1: opt_after_cconv 1.83% : 0.000531s : 1: opt_after_jit_grad 0.79% : 0.000227s : 1: opt_b 17.73% : 0.005127s : 1: optimize 0.07% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000042s : 1: pre_auto_parallel 0.11% : 0.000031s : 1: py_interpret_to_execute 0.05% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000018s : 1: remove_dup_value 1.36% : 0.000393s : 1: renormalize.infer 1.17% : 0.000340s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000043s : 1: rewriter_after_opt_a 0.33% : 0.000095s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000006s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.30% : 0.000085s : 1: symbol_engine_optimizer 0.28% : 0.000081s : 1: tuple_transform 26.20% : 0.007580s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:22.136.836 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:22.137.159 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0441839, [21] [bootstrap]: 0.00043902 [type_inference]: 0.0312826 [event_method]: 2.173e-05 [auto_monad]: 7.108e-05 [graph_reusing]: 6.44001e-06 [inline]: 2.96001e-06 [add_attr]: 0.00432696, [1] [add_attr_with_inline]: 0.00431295, [1] [Cycle 1]: 0.00010024, [2] [tag_attr]: 2.489e-05 [meta_addattr_fg_expand]: 6.91999e-06 [parallel-infer-symbol]: 3.91999e-06 [pre_auto_parallel]: 4.408e-05 [insert-virtual-dataset]: 2.43e-06 [parallel-infer-symbol-second]: 9.09989e-07 [dataset_repeat_opt]: 2.01003e-06 [pipeline_split]: 2.05002e-06 [optimize]: 0.00675997, [53] [py_interpret_to_execute]: 5.202e-05 [rewriter_before_opt_a]: 0.00010549 [opt_a]: 0.00396809, [2] [Cycle 1]: 0.0029243, [45] [expand_dump_flag]: 3.23998e-06 [switch_simplify]: 4.631e-05 [loop_unroll]: 3.188e-05 [a_1]: 0.00086168 [with_stream_mark]: 2.394e-05 [recompute_prepare]: 1.383e-05 [updatestate_depend_eliminate]: 5.11002e-06 [updatestate_assign_eliminate]: 4.28999e-06 [updatestate_loads_eliminate]: 3.40998e-06 [parameter_eliminate]: 2.45002e-06 [a_2]: 0.00013234 [accelerated_algorithm]: 8.55999e-06 [shard]: 2.50002e-06 [meta_shard_fg_expand]: 2.91e-06 [shard_inline]: 8.2e-06 [merge_send_recv]: 1.106e-05 [auto_parallel]: 9.85002e-06 [parallel]: 2.068e-05 [flash_sp]: 1.191e-05 [merge_comm]: 5.44e-06 [allreduce_fusion]: 4.50999e-06 [matmul_add_comm_reduction]: 1.183e-05 [allreduce_slice_to_reducescatter]: 8.29983e-07 [virtual_shard_identity]: 1.448e-05 [virtual_dataset]: 7.88001e-06 [get_grad_eliminate_]: 7.78001e-06 [virtual_output]: 8.49998e-06 [merge_forward]: 4.75999e-06 [cell_reuse_recompute_pass]: 1.25999e-06 [offload_activation]: 1.18e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.003e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.344e-05 [set_forward_comm_id_for_comm_node_pass]: 4.47e-06 [meta_fg_expand]: 3.52002e-06 [flash_sp_send_recv_attached]: 2.91999e-06 [receive_attached]: 2.33998e-06 [after_resolve]: 1.416e-05 [a_after_grad]: 1.279e-05 [renormalize]: 0.00098594 [add_forward_monad_depend]: 8.12003e-06 [auto_monad_grad]: 2.46e-06 [auto_monad_eliminator]: 2.4e-05 [cse]: 3.933e-05 [a_3]: 8.2e-05 [Cycle 2]: 0.00102501, [45] [expand_dump_flag]: 2.61e-06 [switch_simplify]: 1.034e-05 [loop_unroll]: 8.23999e-06 [a_1]: 0.00018841 [with_stream_mark]: 2.026e-05 [recompute_prepare]: 8.37e-06 [updatestate_depend_eliminate]: 5.14e-06 [updatestate_assign_eliminate]: 3.41001e-06 [updatestate_loads_eliminate]: 3.06001e-06 [parameter_eliminate]: 2.02001e-06 [a_2]: 0.00012044 [accelerated_algorithm]: 7.88001e-06 [shard]: 1.99e-06 [meta_shard_fg_expand]: 2.44001e-06 [shard_inline]: 7.87e-06 [merge_send_recv]: 1.164e-05 [auto_parallel]: 8.76002e-06 [parallel]: 7.56001e-06 [flash_sp]: 3.98001e-06 [merge_comm]: 5.39e-06 [allreduce_fusion]: 4.06001e-06 [matmul_add_comm_reduction]: 9.82999e-06 [allreduce_slice_to_reducescatter]: 8.39995e-07 [virtual_shard_identity]: 9.77001e-06 [virtual_dataset]: 8.02e-06 [get_grad_eliminate_]: 7.16001e-06 [virtual_output]: 7.01001e-06 [merge_forward]: 4.77998e-06 [cell_reuse_recompute_pass]: 2.34001e-06 [offload_activation]: 1.111e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.782e-05 [merge_recompute_call_nodes]: 1.30999e-06 [before_grad]: 1.379e-05 [set_forward_comm_id_for_comm_node_pass]: 5.39e-06 [meta_fg_expand]: 3.58e-06 [flash_sp_send_recv_attached]: 1.60999e-06 [receive_attached]: 2.09999e-06 [after_resolve]: 1.417e-05 [a_after_grad]: 1.213e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.58998e-06 [auto_monad_grad]: 1.74e-06 [auto_monad_eliminator]: 1.152e-05 [cse]: 2.494e-05 [a_3]: 6.035e-05 [py_interpret_to_execute_after_opt_a]: 2.005e-05 [slice_cell_reuse_recomputed_activation]: 5.02e-06 [rewriter_after_opt_a]: 5.615e-05 [convert_after_rewriter]: 1.235e-05 [order_py_execute_after_rewriter]: 9.84999e-06 [mutable_eliminate]: 0.00076158 [opt_b]: 0.00034662, [1] [Cycle 1]: 0.00033669, [7] [b_1]: 0.00022239 [b_2]: 9.73002e-06 [updatestate_depend_eliminate]: 9.97999e-06 [updatestate_assign_eliminate]: 3.50998e-06 [updatestate_loads_eliminate]: 3.24001e-06 [renormalize]: 6.39993e-07 [cse]: 2.831e-05 [optimize_parallel_all_gather_comm]: 2.336e-05 [overlap_param_gather]: 4.53999e-06 [cconv]: 3.758e-05 [loop_unroll]: 0.00046233 [opt_after_cconv]: 0.00014823, [1] [Cycle 1]: 0.0001391, [7] [c_1]: 3.865e-05 [parameter_eliminate]: 5.37999e-06 [updatestate_depend_eliminate]: 7.03e-06 [updatestate_assign_eliminate]: 3.35e-06 [updatestate_loads_eliminate]: 3.03e-06 [cse]: 2.439e-05 [renormalize]: 6.89994e-07 [remove_dup_value]: 1.892e-05 [tuple_transform]: 0.00010118, [1] [Cycle 1]: 9.408e-05, [4] [d_1]: 5.308e-05 [none_parameter_eliminate]: 1.71e-06 [renormalize]: 3.50003e-07 [switch_simplify]: 8.32e-06 [partial_unused_args_eliminate]: 4.33999e-06 [add_recomputation]: 6.52e-05 [cse_after_recomputation]: 3.13e-05, [1] [Cycle 1]: 2.419e-05, [1] [cse]: 1.545e-05 [environ_conv]: 1.037e-05 [swap_dp_allreduce_reducescatter]: 8.75999e-06 [bias_add_comm_swap]: 5.42999e-06 [label_micro_interleaved_index]: 7.6e-06 [label_fine_grained_interleaved_index]: 5.44e-06 [merge_cast_opt]: 3.98999e-06 [slice_recompute_activation]: 4.87998e-06 [micro_interleaved_order_control]: 4.89998e-06 [assign_add_opt]: 3.72998e-06 [ForceFp32Comm]: 3.48e-06 [remove_cast_before_assign_add]: 3.45e-06 [full_micro_interleaved_order_control]: 4.44002e-06 [reorder_send_recv_between_fp_bp]: 5.09e-06 [comm_op_add_attrs]: 3.73999e-06 [add_comm_op_reuse_tag]: 3.28e-06 [interleave_split_concat_branches]: 3.59002e-06 [interleave_parallel_branches]: 3.41001e-06 [overlap_opt_shard_in_pipeline]: 3.49001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.26001e-06 [control_data_broadcast_order]: 1.837e-05 [grouped_pairwise_exchange_alltoall]: 3.91999e-06 [offloading_packed_experts]: 7.26999e-06 [overlap_recompute_and_grad_model_parallel]: 8.17e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.79002e-06 [overlap_recompute_allgather_and_fa_grad]: 4.41002e-06 [overlap_recompute_comm]: 6.14999e-06 [overlap_grad_ring_attention]: 6.98998e-06 [overlap_grad_flash_sp]: 2.78e-05 [begin_end_overlap_inline]: 2.81999e-06 [split_matmul_comm_elemetwise]: 5.09e-06 [split_layernorm_comm]: 4.45999e-06 [handle_group_info]: 3.36999e-06 [symbol_engine_optimizer]: 0.00010814, [1] [Cycle 1]: 0.00010111, [6] [build]: 3.56999e-06 [elim_shapecalc]: 1.286e-05 [elim_not_effective]: 1.656e-05 [opt_reshape]: 8.86002e-06 [fold_const_symbol]: 1.257e-05 [renormalize]: 1.79978e-07 [detach_backward]: 4.12998e-06 [pipeline_parallel_scheduler]: 1.79998e-06 [auto_monad_reorder]: 2.33e-05 [get_jit_bprop_graph]: 2.71999e-06 [rewriter_after_jit_bprop_graph]: 5.77001e-06 [opt_after_jit_grad]: 0.00051463 [validate]: 4.596e-05 Sums bootstrap : 0.000439s : 1.16% type_inference : 0.031283s : 82.31% event_method : 0.000022s : 0.06% auto_monad : 0.000071s : 0.19% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000044s : 0.12% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000052s : 0.14% optimize.rewriter_before_opt_a : 0.000105s : 0.28% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000057s : 0.15% optimize.opt_a.loop_unroll : 0.000040s : 0.11% optimize.opt_a.a_1 : 0.001050s : 2.76% optimize.opt_a.with_stream_mark : 0.000044s : 0.12% optimize.opt_a.recompute_prepare : 0.000022s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000253s : 0.67% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.04% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.04% optimize.opt_a.merge_send_recv : 0.000023s : 0.06% optimize.opt_a.auto_parallel : 0.000019s : 0.05% optimize.opt_a.parallel : 0.000028s : 0.07% optimize.opt_a.flash_sp : 0.000016s : 0.04% optimize.opt_a.merge_comm : 0.000011s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.06% optimize.opt_a.virtual_dataset : 0.000016s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000016s : 0.04% optimize.opt_a.merge_forward : 0.000010s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000028s : 0.07% optimize.opt_a.a_after_grad : 0.000025s : 0.07% optimize.opt_a.renormalize : 0.000986s : 2.59% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000036s : 0.09% optimize.opt_a.cse : 0.000064s : 0.17% optimize.opt_a.a_3 : 0.000142s : 0.37% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000056s : 0.15% optimize.convert_after_rewriter : 0.000012s : 0.03% optimize.order_py_execute_after_rewriter : 0.000010s : 0.03% optimize.mutable_eliminate : 0.000762s : 2.00% optimize.opt_b.b_1 : 0.000222s : 0.59% optimize.opt_b.b_2 : 0.000010s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000028s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.06% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000038s : 0.10% optimize.loop_unroll : 0.000462s : 1.22% optimize.opt_after_cconv.c_1 : 0.000039s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000024s : 0.06% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.05% optimize.tuple_transform.d_1 : 0.000053s : 0.14% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000004s : 0.01% optimize.add_recomputation : 0.000065s : 0.17% optimize.cse_after_recomputation.cse : 0.000015s : 0.04% optimize.environ_conv : 0.000010s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.02% optimize.bias_add_comm_swap : 0.000005s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000004s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000018s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000006s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000028s : 0.07% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000023s : 0.06% get_jit_bprop_graph : 0.000003s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000515s : 1.35% validate : 0.000046s : 0.12% Time group info: ------[substitution.] 0.000279 39 13.54% : 0.000038s : 3: substitution.cast_eliminate 0.92% : 0.000003s : 3: substitution.elim_not_effective 0.64% : 0.000002s : 3: substitution.fold_const_symbol 2.35% : 0.000007s : 5: substitution.graph_param_transform 66.94% : 0.000187s : 4: substitution.inline 1.99% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.53% : 0.000007s : 6: substitution.remove_not_recompute_node 2.42% : 0.000007s : 4: substitution.replace_old_param 6.10% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator 2.58% : 0.000007s : 1: substitution.value_based_eliminate ------[type_inference.] 0.031218 2 97.33% : 0.030386s : 1: type_inference.infer 2.67% : 0.000833s : 1: type_inference.specialize ------[replace.] 0.000072 8 60.98% : 0.000044s : 4: replace.inline 39.02% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000199 8 92.57% : 0.000184s : 4: match.inline 7.43% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000276 1596 0.93% : 0.000003s : 17: predicate.accumulaten_eliminater 0.76% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 10: predicate.addn_check_dump 0.99% : 0.000003s : 17: predicate.addn_zero_filter 0.86% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.13% : 0.000006s : 27: predicate.arithmetic_simplify 1.06% : 0.000003s : 17: predicate.cast_eliminate 0.67% : 0.000002s : 10: predicate.check_bprop_eliminate 0.54% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.63% : 0.000002s : 10: predicate.depend_value_elim 0.95% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.18% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.95% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.92% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 5: predicate.elim_not_effective 0.41% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.10% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.10% : 0.000003s : 22: predicate.environ_get_depend_swap 1.96% : 0.000005s : 32: predicate.environ_get_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.31% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.31% : 0.000006s : 25: predicate.float_depend_g_call 0.53% : 0.000001s : 10: predicate.float_environ_get_switch 0.76% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.66% : 0.000002s : 10: predicate.get_grad_eliminate 0.18% : 0.000001s : 5: predicate.graph_param_transform 0.54% : 0.000001s : 10: predicate.incorporate_call 0.45% : 0.000001s : 10: predicate.incorporate_call_switch 6.33% : 0.000017s : 72: predicate.inline 0.88% : 0.000002s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.78% : 0.000002s : 10: predicate.less_batch_normalization 1.79% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.47% : 0.000007s : 48: predicate.load_eliminater 0.77% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.11% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.86% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.56% : 0.000002s : 10: predicate.merge_addn 0.62% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.94% : 0.000003s : 17: predicate.minmaximum_grad 1.25% : 0.000003s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.38% : 0.000001s : 5: predicate.parallel_virtual_node 1.80% : 0.000005s : 25: predicate.partial_defer_inline 1.64% : 0.000005s : 26: predicate.partial_eliminate 0.98% : 0.000003s : 17: predicate.print_const_string_wrapper 0.69% : 0.000002s : 10: predicate.reduce_all_const_elim 1.22% : 0.000003s : 17: predicate.reduce_eliminate 2.62% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000001s : 10: predicate.remove_not_recompute_node 1.47% : 0.000004s : 31: predicate.replace_applicator 0.54% : 0.000001s : 10: predicate.replace_old_param 0.33% : 0.000001s : 5: predicate.reset_defer_inline 1.03% : 0.000003s : 17: predicate.reshape_eliminate 0.63% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 5: predicate.row_tensor_eliminate 0.76% : 0.000002s : 10: predicate.same_eliminate 0.46% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 10: predicate.shard_identity_eliminate 0.65% : 0.000002s : 10: predicate.special_op_eliminate 0.67% : 0.000002s : 10: predicate.specialize_transform 1.08% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.55% : 0.000004s : 25: predicate.switch_defer_inline 1.99% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.61% : 0.000013s : 76: predicate.switch_simplify 1.04% : 0.000003s : 17: predicate.tile_eliminate 1.13% : 0.000003s : 17: predicate.transpose_eliminate 1.53% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.60% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.51% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.65% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.45% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 2.97% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.54% : 0.000001s : 5: predicate.value_based_eliminate 0.59% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.60% : 0.000002s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000661 11 54.63% : 0.000361s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.37% : 0.000300s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.058071 192 0.01% : 0.000006s : 1: ForceFp32Comm 7.47% : 0.004339s : 1: add_attr 7.43% : 0.004317s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.12% : 0.000069s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.14% : 0.000082s : 1: auto_monad 0.05% : 0.000031s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.83% : 0.000481s : 1: bootstrap 0.07% : 0.000041s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.04% : 0.000022s : 1: control_data_broadcast_order 0.03% : 0.000016s : 1: convert_after_rewriter 0.06% : 0.000034s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000021s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.06% : 0.000032s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000014s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 0.81% : 0.000468s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.32% : 0.000769s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.03% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000023s : 1: opt.transform.mutable_eliminate 2.75% : 0.001598s : 78: opt.transform.opt_a 0.06% : 0.000037s : 1: opt.transform.opt_after_cconv 0.05% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.27% : 0.000155s : 28: opt.transform.opt_b 0.10% : 0.000059s : 2: opt.transform.opt_trans_graph 0.08% : 0.000047s : 4: opt.transform.symbol_engine_opt 6.84% : 0.003971s : 1: opt_a 0.26% : 0.000152s : 1: opt_after_cconv 0.90% : 0.000524s : 1: opt_after_jit_grad 0.60% : 0.000350s : 1: opt_b 12.24% : 0.007110s : 1: optimize 0.05% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000013s : 1: order_py_execute_after_rewriter 0.05% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000009s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.02% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000008s : 1: pipeline_split 0.09% : 0.000052s : 1: pre_auto_parallel 0.10% : 0.000057s : 1: py_interpret_to_execute 0.04% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.04% : 0.000022s : 1: remove_dup_value 0.97% : 0.000562s : 1: renormalize.infer 0.71% : 0.000413s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000061s : 1: rewriter_after_opt_a 0.19% : 0.000110s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.19% : 0.000111s : 1: symbol_engine_optimizer 0.18% : 0.000104s : 1: tuple_transform 53.97% : 0.031338s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:22.938.931 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0296713, [21] [bootstrap]: 0.00045138 [type_inference]: 0.0196774 [event_method]: 2.274e-05 [auto_monad]: 7.086e-05 [graph_reusing]: 6.48e-06 [inline]: 2.89001e-06 [add_attr]: 0.00349347, [1] [add_attr_with_inline]: 0.00348307, [1] [Cycle 1]: 6.134e-05, [2] [tag_attr]: 1.95e-05 [meta_addattr_fg_expand]: 6.02001e-06 [parallel-infer-symbol]: 3.41001e-06 [pre_auto_parallel]: 3.408e-05 [insert-virtual-dataset]: 2.78998e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 1.87999e-06 [pipeline_split]: 1.84e-06 [optimize]: 0.00521007, [53] [py_interpret_to_execute]: 2.678e-05 [rewriter_before_opt_a]: 8.67e-05 [opt_a]: 0.00310211, [2] [Cycle 1]: 0.00232194, [45] [expand_dump_flag]: 2.76999e-06 [switch_simplify]: 4.562e-05 [loop_unroll]: 3.09e-05 [a_1]: 0.00073897 [with_stream_mark]: 1.742e-05 [recompute_prepare]: 1.007e-05 [updatestate_depend_eliminate]: 4.65999e-06 [updatestate_assign_eliminate]: 3.93001e-06 [updatestate_loads_eliminate]: 3.60998e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 0.00010046 [accelerated_algorithm]: 8.22e-06 [shard]: 1.76e-06 [meta_shard_fg_expand]: 2.12999e-06 [shard_inline]: 7.79002e-06 [merge_send_recv]: 1.038e-05 [auto_parallel]: 7.86001e-06 [parallel]: 1.926e-05 [flash_sp]: 8.47e-06 [merge_comm]: 5.42001e-06 [allreduce_fusion]: 4.33999e-06 [matmul_add_comm_reduction]: 1.099e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 9.74e-06 [virtual_dataset]: 7.75e-06 [get_grad_eliminate_]: 7.9e-06 [virtual_output]: 7.46999e-06 [merge_forward]: 4.37e-06 [cell_reuse_recompute_pass]: 1.35999e-06 [offload_activation]: 1.096e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.508e-05 [merge_recompute_call_nodes]: 1.39998e-06 [before_grad]: 1.257e-05 [set_forward_comm_id_for_comm_node_pass]: 4.48999e-06 [meta_fg_expand]: 3.38999e-06 [flash_sp_send_recv_attached]: 2.65002e-06 [receive_attached]: 2.36e-06 [after_resolve]: 1.351e-05 [a_after_grad]: 1.181e-05 [renormalize]: 0.00079584 [add_forward_monad_depend]: 5.19e-06 [auto_monad_grad]: 2.19001e-06 [auto_monad_eliminator]: 1.715e-05 [cse]: 3.545e-05 [a_3]: 5.789e-05 [Cycle 2]: 0.00077079, [45] [expand_dump_flag]: 1.25999e-06 [switch_simplify]: 9.10001e-06 [loop_unroll]: 7.3e-06 [a_1]: 0.00017682 [with_stream_mark]: 1.282e-05 [recompute_prepare]: 7.66999e-06 [updatestate_depend_eliminate]: 3.85998e-06 [updatestate_assign_eliminate]: 3.10998e-06 [updatestate_loads_eliminate]: 2.91e-06 [parameter_eliminate]: 1.04e-06 [a_2]: 9.153e-05 [accelerated_algorithm]: 7.69002e-06 [shard]: 1.25999e-06 [meta_shard_fg_expand]: 1.74998e-06 [shard_inline]: 7.31999e-06 [merge_send_recv]: 5.82001e-06 [auto_parallel]: 1.37e-05 [parallel]: 5.24998e-06 [flash_sp]: 3.50003e-06 [merge_comm]: 4.48001e-06 [allreduce_fusion]: 3.89002e-06 [matmul_add_comm_reduction]: 6.89001e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 8.91002e-06 [virtual_dataset]: 7.65998e-06 [get_grad_eliminate_]: 7.03e-06 [virtual_output]: 6.71999e-06 [merge_forward]: 3.44001e-06 [cell_reuse_recompute_pass]: 1.62001e-06 [offload_activation]: 8e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.45e-05 [merge_recompute_call_nodes]: 1.27e-06 [before_grad]: 1.457e-05 [set_forward_comm_id_for_comm_node_pass]: 4.01001e-06 [meta_fg_expand]: 2.69999e-06 [flash_sp_send_recv_attached]: 9.40025e-07 [receive_attached]: 1.29e-06 [after_resolve]: 1.242e-05 [a_after_grad]: 1.123e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.52001e-06 [auto_monad_grad]: 1.07e-06 [auto_monad_eliminator]: 8.38999e-06 [cse]: 1.902e-05 [a_3]: 4.574e-05 [py_interpret_to_execute_after_opt_a]: 1.031e-05 [slice_cell_reuse_recomputed_activation]: 2.26e-06 [rewriter_after_opt_a]: 4.075e-05 [convert_after_rewriter]: 7.66999e-06 [order_py_execute_after_rewriter]: 5.97001e-06 [mutable_eliminate]: 0.00051523 [opt_b]: 0.0002513, [1] [Cycle 1]: 0.00024523, [7] [b_1]: 0.00016603 [b_2]: 9.49999e-06 [updatestate_depend_eliminate]: 6.32001e-06 [updatestate_assign_eliminate]: 2.83e-06 [updatestate_loads_eliminate]: 2.76e-06 [renormalize]: 6.39993e-07 [cse]: 2.205e-05 [optimize_parallel_all_gather_comm]: 1.77e-05 [overlap_param_gather]: 2.19999e-06 [cconv]: 2.602e-05 [loop_unroll]: 0.00041836 [opt_after_cconv]: 0.00011213, [1] [Cycle 1]: 0.00010664, [7] [c_1]: 3.611e-05 [parameter_eliminate]: 2.61e-06 [updatestate_depend_eliminate]: 6.07001e-06 [updatestate_assign_eliminate]: 3.09001e-06 [updatestate_loads_eliminate]: 2.89999e-06 [cse]: 2.176e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 1.432e-05 [tuple_transform]: 8.442e-05, [1] [Cycle 1]: 8e-05, [4] [d_1]: 5.178e-05 [none_parameter_eliminate]: 1.64998e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 7.83999e-06 [partial_unused_args_eliminate]: 1.69e-06 [add_recomputation]: 5.523e-05 [cse_after_recomputation]: 2.428e-05, [1] [Cycle 1]: 1.985e-05, [1] [cse]: 1.442e-05 [environ_conv]: 6.19001e-06 [swap_dp_allreduce_reducescatter]: 5.54e-06 [bias_add_comm_swap]: 3.23998e-06 [label_micro_interleaved_index]: 4.07998e-06 [label_fine_grained_interleaved_index]: 2.99001e-06 [merge_cast_opt]: 1.23002e-06 [slice_recompute_activation]: 2.12999e-06 [micro_interleaved_order_control]: 2.58e-06 [assign_add_opt]: 1.37999e-06 [ForceFp32Comm]: 1.00001e-06 [remove_cast_before_assign_add]: 1.17e-06 [full_micro_interleaved_order_control]: 2.00002e-06 [reorder_send_recv_between_fp_bp]: 2.58e-06 [comm_op_add_attrs]: 1.01002e-06 [add_comm_op_reuse_tag]: 9.20001e-07 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.01997e-06 [overlap_opt_shard_in_pipeline]: 1.22e-06 [overlap_opt_shard_grad_in_pipeline]: 2.08002e-06 [control_data_broadcast_order]: 1.528e-05 [grouped_pairwise_exchange_alltoall]: 1.78997e-06 [offloading_packed_experts]: 4.28001e-06 [overlap_recompute_and_grad_model_parallel]: 4.94e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.30999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.55001e-06 [overlap_recompute_comm]: 2.37999e-06 [overlap_grad_ring_attention]: 4.90001e-06 [overlap_grad_flash_sp]: 2.196e-05 [begin_end_overlap_inline]: 6.09987e-07 [split_matmul_comm_elemetwise]: 2.11e-06 [split_layernorm_comm]: 1.66e-06 [handle_group_info]: 9.39996e-07 [symbol_engine_optimizer]: 8.25e-05, [1] [Cycle 1]: 7.832e-05, [6] [build]: 2.93998e-06 [elim_shapecalc]: 1.175e-05 [elim_not_effective]: 1.548e-05 [opt_reshape]: 8.01001e-06 [fold_const_symbol]: 1.214e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.12999e-06 [pipeline_parallel_scheduler]: 1.61002e-06 [auto_monad_reorder]: 1.925e-05 [get_jit_bprop_graph]: 1.57999e-06 [rewriter_after_jit_bprop_graph]: 3.73999e-06 [opt_after_jit_grad]: 0.00045831 [validate]: 4.404e-05 Sums bootstrap : 0.000451s : 1.79% type_inference : 0.019677s : 78.02% event_method : 0.000023s : 0.09% auto_monad : 0.000071s : 0.28% graph_reusing : 0.000006s : 0.03% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000034s : 0.14% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000027s : 0.11% optimize.rewriter_before_opt_a : 0.000087s : 0.34% optimize.opt_a.expand_dump_flag : 0.000004s : 0.02% optimize.opt_a.switch_simplify : 0.000055s : 0.22% optimize.opt_a.loop_unroll : 0.000038s : 0.15% optimize.opt_a.a_1 : 0.000916s : 3.63% optimize.opt_a.with_stream_mark : 0.000030s : 0.12% optimize.opt_a.recompute_prepare : 0.000018s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000192s : 0.76% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.06% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000015s : 0.06% optimize.opt_a.merge_send_recv : 0.000016s : 0.06% optimize.opt_a.auto_parallel : 0.000022s : 0.09% optimize.opt_a.parallel : 0.000025s : 0.10% optimize.opt_a.flash_sp : 0.000012s : 0.05% optimize.opt_a.merge_comm : 0.000010s : 0.04% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.07% optimize.opt_a.virtual_dataset : 0.000015s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.06% optimize.opt_a.virtual_output : 0.000014s : 0.06% optimize.opt_a.merge_forward : 0.000008s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.11% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000026s : 0.10% optimize.opt_a.a_after_grad : 0.000023s : 0.09% optimize.opt_a.renormalize : 0.000796s : 3.16% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.03% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.10% optimize.opt_a.cse : 0.000054s : 0.22% optimize.opt_a.a_3 : 0.000104s : 0.41% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000041s : 0.16% optimize.convert_after_rewriter : 0.000008s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000515s : 2.04% optimize.opt_b.b_1 : 0.000166s : 0.66% optimize.opt_b.b_2 : 0.000009s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000026s : 0.10% optimize.loop_unroll : 0.000418s : 1.66% optimize.opt_after_cconv.c_1 : 0.000036s : 0.14% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000022s : 0.09% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.06% optimize.tuple_transform.d_1 : 0.000052s : 0.21% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000055s : 0.22% optimize.cse_after_recomputation.cse : 0.000014s : 0.06% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000015s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000022s : 0.09% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.08% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000458s : 1.82% validate : 0.000044s : 0.17% Time group info: ------[substitution.] 0.000215 39 11.04% : 0.000024s : 3: substitution.cast_eliminate 0.99% : 0.000002s : 3: substitution.elim_not_effective 0.78% : 0.000002s : 3: substitution.fold_const_symbol 3.24% : 0.000007s : 5: substitution.graph_param_transform 66.23% : 0.000143s : 4: substitution.inline 2.14% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.92% : 0.000006s : 6: substitution.remove_not_recompute_node 2.02% : 0.000004s : 4: substitution.replace_old_param 7.17% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator 3.48% : 0.000007s : 1: substitution.value_based_eliminate ------[type_inference.] 0.019599 2 95.15% : 0.018649s : 1: type_inference.infer 4.85% : 0.000950s : 1: type_inference.specialize ------[replace.] 0.000064 8 61.81% : 0.000039s : 4: replace.inline 38.19% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000154 8 91.27% : 0.000140s : 4: match.inline 8.73% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000251 1596 0.97% : 0.000002s : 17: predicate.accumulaten_eliminater 0.70% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 10: predicate.addn_check_dump 1.05% : 0.000003s : 17: predicate.addn_zero_filter 1.02% : 0.000003s : 17: predicate.adjust_all_reduce_mul_add 1.88% : 0.000005s : 27: predicate.arithmetic_simplify 1.04% : 0.000003s : 17: predicate.cast_eliminate 0.59% : 0.000001s : 10: predicate.check_bprop_eliminate 0.60% : 0.000001s : 10: predicate.compare_switch_simplify 0.21% : 0.000001s : 5: predicate.const_output_eliminate 0.64% : 0.000002s : 10: predicate.depend_value_elim 0.98% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.11% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.85% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 5: predicate.elim_not_effective 0.36% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.17% : 0.000003s : 22: predicate.environ_get_depend_swap 1.79% : 0.000004s : 32: predicate.environ_get_eliminate 1.18% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.45% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.44% : 0.000006s : 25: predicate.float_depend_g_call 0.52% : 0.000001s : 10: predicate.float_environ_get_switch 0.79% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.63% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000000s : 5: predicate.graph_param_transform 0.61% : 0.000002s : 10: predicate.incorporate_call 0.53% : 0.000001s : 10: predicate.incorporate_call_switch 6.29% : 0.000016s : 72: predicate.inline 0.77% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.94% : 0.000002s : 10: predicate.less_batch_normalization 1.87% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.62% : 0.000007s : 48: predicate.load_eliminater 0.76% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.97% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.66% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 10: predicate.merge_addn 0.58% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 17: predicate.minmaximum_grad 0.81% : 0.000002s : 5: predicate.mutable_eliminate 0.32% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.87% : 0.000005s : 25: predicate.partial_defer_inline 1.72% : 0.000004s : 26: predicate.partial_eliminate 0.94% : 0.000002s : 17: predicate.print_const_string_wrapper 0.57% : 0.000001s : 10: predicate.reduce_all_const_elim 1.24% : 0.000003s : 17: predicate.reduce_eliminate 2.61% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 10: predicate.remove_not_recompute_node 1.37% : 0.000003s : 31: predicate.replace_applicator 0.41% : 0.000001s : 10: predicate.replace_old_param 0.28% : 0.000001s : 5: predicate.reset_defer_inline 0.99% : 0.000002s : 17: predicate.reshape_eliminate 0.61% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 5: predicate.row_tensor_eliminate 0.75% : 0.000002s : 10: predicate.same_eliminate 0.43% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 10: predicate.shard_identity_eliminate 0.70% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 1.18% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.50% : 0.000004s : 25: predicate.switch_defer_inline 2.09% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.81% : 0.000012s : 76: predicate.switch_simplify 1.04% : 0.000003s : 17: predicate.tile_eliminate 0.99% : 0.000002s : 17: predicate.transpose_eliminate 1.54% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.69% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.62% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.33% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.73% : 0.000004s : 31: predicate.tuple_to_list_eliminator_ 2.57% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.33% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 5: predicate.value_based_eliminate 0.59% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.58% : 0.000001s : 10: predicate.virtual_output_eliminate 0.31% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000658 11 48.69% : 0.000320s : 5: func_graph_cloner_run.FuncGraphClonerGraph 51.31% : 0.000338s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.040805 192 0.01% : 0.000004s : 1: ForceFp32Comm 8.57% : 0.003499s : 1: add_attr 8.55% : 0.003487s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.14% : 0.000059s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.19% : 0.000078s : 1: auto_monad 0.06% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.18% : 0.000481s : 1: bootstrap 0.07% : 0.000029s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000018s : 1: control_data_broadcast_order 0.03% : 0.000011s : 1: convert_after_rewriter 0.07% : 0.000027s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000010s : 1: environ_conv 0.07% : 0.000029s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.04% : 0.000426s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.28% : 0.000524s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.04% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000016s : 1: opt.transform.mutable_eliminate 3.51% : 0.001431s : 78: opt.transform.opt_a 0.09% : 0.000035s : 1: opt.transform.opt_after_cconv 0.07% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.35% : 0.000144s : 28: opt.transform.opt_b 0.14% : 0.000058s : 2: opt.transform.opt_trans_graph 0.11% : 0.000044s : 4: opt.transform.symbol_engine_opt 7.61% : 0.003105s : 1: opt_a 0.28% : 0.000116s : 1: opt_after_cconv 1.14% : 0.000467s : 1: opt_after_jit_grad 0.62% : 0.000255s : 1: opt_b 12.78% : 0.005214s : 1: optimize 0.05% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.06% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.09% : 0.000038s : 1: pre_auto_parallel 0.08% : 0.000031s : 1: py_interpret_to_execute 0.03% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000018s : 1: remove_dup_value 1.14% : 0.000466s : 1: renormalize.infer 0.79% : 0.000322s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000045s : 1: rewriter_after_opt_a 0.22% : 0.000091s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.21% : 0.000085s : 1: symbol_engine_optimizer 0.21% : 0.000087s : 1: tuple_transform 48.28% : 0.019702s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:23.728.551 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:23.728.908 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0441398, [21] [bootstrap]: 0.00049205 [type_inference]: 0.0314962 [event_method]: 2.095e-05 [auto_monad]: 6.512e-05 [graph_reusing]: 6.54999e-06 [inline]: 3.25998e-06 [add_attr]: 0.00352121, [1] [add_attr_with_inline]: 0.00351138, [1] [Cycle 1]: 8.138e-05, [2] [tag_attr]: 2.166e-05 [meta_addattr_fg_expand]: 6.34999e-06 [parallel-infer-symbol]: 3.38999e-06 [pre_auto_parallel]: 3.836e-05 [insert-virtual-dataset]: 2.22999e-06 [parallel-infer-symbol-second]: 8.50006e-07 [dataset_repeat_opt]: 2.06998e-06 [pipeline_split]: 1.55001e-06 [optimize]: 0.0061938, [53] [py_interpret_to_execute]: 2.992e-05 [rewriter_before_opt_a]: 9.242e-05 [opt_a]: 0.00361761, [2] [Cycle 1]: 0.0026456, [45] [expand_dump_flag]: 3.68e-06 [switch_simplify]: 4.372e-05 [loop_unroll]: 3.094e-05 [a_1]: 0.00068098 [with_stream_mark]: 1.695e-05 [recompute_prepare]: 1.017e-05 [updatestate_depend_eliminate]: 4.70999e-06 [updatestate_assign_eliminate]: 4.32e-06 [updatestate_loads_eliminate]: 3.55e-06 [parameter_eliminate]: 1.94999e-06 [a_2]: 0.0001282 [accelerated_algorithm]: 8.28999e-06 [shard]: 2.01003e-06 [meta_shard_fg_expand]: 2.21e-06 [shard_inline]: 7.85e-06 [merge_send_recv]: 9.87999e-06 [auto_parallel]: 7.41999e-06 [parallel]: 1.855e-05 [flash_sp]: 8.52998e-06 [merge_comm]: 4.83001e-06 [allreduce_fusion]: 4.26001e-06 [matmul_add_comm_reduction]: 1.01e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 1.033e-05 [virtual_dataset]: 8.18001e-06 [get_grad_eliminate_]: 8.10999e-06 [virtual_output]: 7.58001e-06 [merge_forward]: 4.48001e-06 [cell_reuse_recompute_pass]: 1.11002e-06 [offload_activation]: 1.209e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.703e-05 [merge_recompute_call_nodes]: 1.55001e-06 [before_grad]: 1.302e-05 [set_forward_comm_id_for_comm_node_pass]: 4.74e-06 [meta_fg_expand]: 3.26001e-06 [flash_sp_send_recv_attached]: 2.40002e-06 [receive_attached]: 2.31e-06 [after_resolve]: 3.011e-05 [a_after_grad]: 1.242e-05 [renormalize]: 0.00096402 [add_forward_monad_depend]: 5.95002e-06 [auto_monad_grad]: 2.37001e-06 [auto_monad_eliminator]: 1.935e-05 [cse]: 3.799e-05 [a_3]: 7.298e-05 [Cycle 2]: 0.00095673, [45] [expand_dump_flag]: 1.06002e-06 [switch_simplify]: 1.019e-05 [loop_unroll]: 7.92998e-06 [a_1]: 0.00017889 [with_stream_mark]: 1.381e-05 [recompute_prepare]: 7.90998e-06 [updatestate_depend_eliminate]: 4.13999e-06 [updatestate_assign_eliminate]: 3.46999e-06 [updatestate_loads_eliminate]: 2.99999e-06 [parameter_eliminate]: 1.49003e-06 [a_2]: 0.00012012 [accelerated_algorithm]: 7.49002e-06 [shard]: 1.34e-06 [meta_shard_fg_expand]: 1.52001e-06 [shard_inline]: 7.82998e-06 [merge_send_recv]: 7.11999e-06 [auto_parallel]: 6.91999e-06 [parallel]: 1.002e-05 [flash_sp]: 4.27e-06 [merge_comm]: 4.27e-06 [allreduce_fusion]: 4.03999e-06 [matmul_add_comm_reduction]: 7.68001e-06 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 8.55001e-06 [virtual_dataset]: 7.54002e-06 [get_grad_eliminate_]: 7.29001e-06 [virtual_output]: 8.80001e-06 [merge_forward]: 4.12e-06 [cell_reuse_recompute_pass]: 1.54e-06 [offload_activation]: 1.042e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.67e-05 [merge_recompute_call_nodes]: 1.29998e-06 [before_grad]: 1.225e-05 [set_forward_comm_id_for_comm_node_pass]: 5.02999e-06 [meta_fg_expand]: 2.94001e-06 [flash_sp_send_recv_attached]: 1.39e-06 [receive_attached]: 1.44e-06 [after_resolve]: 1.195e-05 [a_after_grad]: 1.182e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.29e-06 [auto_monad_grad]: 1.27e-06 [auto_monad_eliminator]: 9.00001e-06 [cse]: 1.878e-05 [a_3]: 5.904e-05 [py_interpret_to_execute_after_opt_a]: 1.73e-05 [slice_cell_reuse_recomputed_activation]: 4.87998e-06 [rewriter_after_opt_a]: 4.683e-05 [convert_after_rewriter]: 1.113e-05 [order_py_execute_after_rewriter]: 8.59e-06 [mutable_eliminate]: 0.00063057 [opt_b]: 0.00033338, [1] [Cycle 1]: 0.00032337, [7] [b_1]: 0.00021321 [b_2]: 9.81998e-06 [updatestate_depend_eliminate]: 8.35001e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 3.16001e-06 [renormalize]: 1.16002e-06 [cse]: 2.668e-05 [optimize_parallel_all_gather_comm]: 2.184e-05 [overlap_param_gather]: 4.53999e-06 [cconv]: 3.489e-05 [loop_unroll]: 0.00045934 [opt_after_cconv]: 0.00014189, [1] [Cycle 1]: 0.00013307, [7] [c_1]: 3.738e-05 [parameter_eliminate]: 5.17e-06 [updatestate_depend_eliminate]: 6.82002e-06 [updatestate_assign_eliminate]: 3.33998e-06 [updatestate_loads_eliminate]: 3.04999e-06 [cse]: 2.22e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 2.084e-05 [tuple_transform]: 9.95e-05, [1] [Cycle 1]: 9.196e-05, [4] [d_1]: 5.122e-05 [none_parameter_eliminate]: 2.09e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 8.15e-06 [partial_unused_args_eliminate]: 5.22e-06 [add_recomputation]: 6.36e-05 [cse_after_recomputation]: 3.253e-05, [1] [Cycle 1]: 2.538e-05, [1] [cse]: 1.552e-05 [environ_conv]: 9.51e-06 [swap_dp_allreduce_reducescatter]: 8.77999e-06 [bias_add_comm_swap]: 5.80002e-06 [label_micro_interleaved_index]: 6.78998e-06 [label_fine_grained_interleaved_index]: 5.29e-06 [merge_cast_opt]: 3.69002e-06 [slice_recompute_activation]: 4.74998e-06 [micro_interleaved_order_control]: 4.75999e-06 [assign_add_opt]: 3.76001e-06 [ForceFp32Comm]: 3.21999e-06 [remove_cast_before_assign_add]: 3.75e-06 [full_micro_interleaved_order_control]: 4.67998e-06 [reorder_send_recv_between_fp_bp]: 5.62999e-06 [comm_op_add_attrs]: 3.33e-06 [add_comm_op_reuse_tag]: 3.58e-06 [interleave_split_concat_branches]: 3.45003e-06 [interleave_parallel_branches]: 3.38999e-06 [overlap_opt_shard_in_pipeline]: 3.48e-06 [overlap_opt_shard_grad_in_pipeline]: 4e-06 [control_data_broadcast_order]: 1.833e-05 [grouped_pairwise_exchange_alltoall]: 4.37e-06 [offloading_packed_experts]: 6.96001e-06 [overlap_recompute_and_grad_model_parallel]: 7.65e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.63999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.73001e-06 [overlap_recompute_comm]: 4.77998e-06 [overlap_grad_ring_attention]: 7.39002e-06 [overlap_grad_flash_sp]: 2.742e-05 [begin_end_overlap_inline]: 2.90998e-06 [split_matmul_comm_elemetwise]: 4.60999e-06 [split_layernorm_comm]: 4.23001e-06 [handle_group_info]: 3.86999e-06 [symbol_engine_optimizer]: 0.0001086, [1] [Cycle 1]: 0.00010126, [6] [build]: 4.08999e-06 [elim_shapecalc]: 1.18e-05 [elim_not_effective]: 1.68e-05 [opt_reshape]: 8.66997e-06 [fold_const_symbol]: 1.26e-05 [renormalize]: 1.80007e-07 [detach_backward]: 6.84001e-06 [pipeline_parallel_scheduler]: 1.99e-06 [auto_monad_reorder]: 3.256e-05 [get_jit_bprop_graph]: 2.19001e-06 [rewriter_after_jit_bprop_graph]: 7.31999e-06 [opt_after_jit_grad]: 0.00061837 [validate]: 5.043e-05 Sums bootstrap : 0.000492s : 1.30% type_inference : 0.031496s : 83.16% event_method : 0.000021s : 0.06% auto_monad : 0.000065s : 0.17% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000038s : 0.10% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000030s : 0.08% optimize.rewriter_before_opt_a : 0.000092s : 0.24% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000054s : 0.14% optimize.opt_a.loop_unroll : 0.000039s : 0.10% optimize.opt_a.a_1 : 0.000860s : 2.27% optimize.opt_a.with_stream_mark : 0.000031s : 0.08% optimize.opt_a.recompute_prepare : 0.000018s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000248s : 0.66% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.04% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.04% optimize.opt_a.merge_send_recv : 0.000017s : 0.04% optimize.opt_a.auto_parallel : 0.000014s : 0.04% optimize.opt_a.parallel : 0.000029s : 0.08% optimize.opt_a.flash_sp : 0.000013s : 0.03% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000008s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.05% optimize.opt_a.virtual_dataset : 0.000016s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000016s : 0.04% optimize.opt_a.merge_forward : 0.000009s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.03% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000042s : 0.11% optimize.opt_a.a_after_grad : 0.000024s : 0.06% optimize.opt_a.renormalize : 0.000964s : 2.55% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.07% optimize.opt_a.cse : 0.000057s : 0.15% optimize.opt_a.a_3 : 0.000132s : 0.35% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000047s : 0.12% optimize.convert_after_rewriter : 0.000011s : 0.03% optimize.order_py_execute_after_rewriter : 0.000009s : 0.02% optimize.mutable_eliminate : 0.000631s : 1.67% optimize.opt_b.b_1 : 0.000213s : 0.56% optimize.opt_b.b_2 : 0.000010s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.06% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000035s : 0.09% optimize.loop_unroll : 0.000459s : 1.21% optimize.opt_after_cconv.c_1 : 0.000037s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000022s : 0.06% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000021s : 0.06% optimize.tuple_transform.d_1 : 0.000051s : 0.14% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000064s : 0.17% optimize.cse_after_recomputation.cse : 0.000016s : 0.04% optimize.environ_conv : 0.000010s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000003s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000018s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000027s : 0.07% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000007s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000033s : 0.09% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.02% opt_after_jit_grad : 0.000618s : 1.63% validate : 0.000050s : 0.13% Time group info: ------[substitution.] 0.000228 39 11.81% : 0.000027s : 3: substitution.cast_eliminate 1.02% : 0.000002s : 3: substitution.elim_not_effective 0.75% : 0.000002s : 3: substitution.fold_const_symbol 2.91% : 0.000007s : 5: substitution.graph_param_transform 67.43% : 0.000154s : 4: substitution.inline 2.20% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.66% : 0.000006s : 6: substitution.remove_not_recompute_node 2.28% : 0.000005s : 4: substitution.replace_old_param 5.62% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator 3.31% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.031435 2 97.42% : 0.030623s : 1: type_inference.infer 2.58% : 0.000812s : 1: type_inference.specialize ------[replace.] 0.000061 8 61.64% : 0.000037s : 4: replace.inline 38.36% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000162 8 93.29% : 0.000151s : 4: match.inline 6.71% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000243 1504 0.93% : 0.000002s : 15: predicate.accumulaten_eliminater 0.80% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.60% : 0.000001s : 10: predicate.addn_check_dump 0.83% : 0.000002s : 15: predicate.addn_zero_filter 0.78% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.97% : 0.000005s : 25: predicate.arithmetic_simplify 0.98% : 0.000002s : 15: predicate.cast_eliminate 0.67% : 0.000002s : 10: predicate.check_bprop_eliminate 0.59% : 0.000001s : 10: predicate.compare_switch_simplify 0.28% : 0.000001s : 5: predicate.const_output_eliminate 0.65% : 0.000002s : 10: predicate.depend_value_elim 1.03% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.36% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 5: predicate.elim_not_effective 0.37% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_depend_swap 1.80% : 0.000004s : 30: predicate.environ_get_eliminate 1.06% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.33% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.27% : 0.000006s : 23: predicate.float_depend_g_call 0.57% : 0.000001s : 10: predicate.float_environ_get_switch 0.83% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 5: predicate.fold_const_symbol 0.74% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.63% : 0.000002s : 10: predicate.incorporate_call 0.55% : 0.000001s : 10: predicate.incorporate_call_switch 6.49% : 0.000016s : 68: predicate.inline 0.95% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.81% : 0.000002s : 10: predicate.less_batch_normalization 1.82% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.48% : 0.000006s : 44: predicate.load_eliminater 1.05% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.11% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.68% : 0.000002s : 10: predicate.merge_addn 0.61% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 15: predicate.minmaximum_grad 1.34% : 0.000003s : 5: predicate.mutable_eliminate 0.38% : 0.000001s : 5: predicate.opt_reshape 0.42% : 0.000001s : 5: predicate.parallel_virtual_node 1.62% : 0.000004s : 23: predicate.partial_defer_inline 1.60% : 0.000004s : 24: predicate.partial_eliminate 0.85% : 0.000002s : 15: predicate.print_const_string_wrapper 0.61% : 0.000001s : 10: predicate.reduce_all_const_elim 1.10% : 0.000003s : 15: predicate.reduce_eliminate 2.46% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 10: predicate.remove_not_recompute_node 1.39% : 0.000003s : 29: predicate.replace_applicator 0.49% : 0.000001s : 10: predicate.replace_old_param 0.35% : 0.000001s : 5: predicate.reset_defer_inline 0.92% : 0.000002s : 15: predicate.reshape_eliminate 0.68% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 5: predicate.row_tensor_eliminate 0.80% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 10: predicate.shard_identity_eliminate 0.75% : 0.000002s : 10: predicate.special_op_eliminate 0.82% : 0.000002s : 10: predicate.specialize_transform 0.96% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.38% : 0.000003s : 23: predicate.switch_defer_inline 1.96% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.91% : 0.000012s : 74: predicate.switch_simplify 1.02% : 0.000002s : 15: predicate.tile_eliminate 0.94% : 0.000002s : 15: predicate.transpose_eliminate 1.46% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.50% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 2.92% : 0.000007s : 39: predicate.tuple_list_get_item_eliminator 1.51% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.71% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.38% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.38% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.52% : 0.000001s : 5: predicate.value_based_eliminate 0.61% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.66% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000670 11 53.68% : 0.000359s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.32% : 0.000310s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.056426 192 0.01% : 0.000006s : 1: ForceFp32Comm 6.26% : 0.003532s : 1: add_attr 6.23% : 0.003515s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.12% : 0.000067s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.13% : 0.000076s : 1: auto_monad 0.07% : 0.000040s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.96% : 0.000541s : 1: bootstrap 0.07% : 0.000038s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.04% : 0.000021s : 1: control_data_broadcast_order 0.03% : 0.000014s : 1: convert_after_rewriter 0.06% : 0.000036s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.05% : 0.000031s : 1: detach_backward 0.02% : 0.000012s : 1: environ_conv 0.06% : 0.000032s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.02% : 0.000009s : 1: inline 0.01% : 0.000008s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 0.82% : 0.000465s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.13% : 0.000639s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.03% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000022s : 1: opt.transform.mutable_eliminate 2.48% : 0.001402s : 78: opt.transform.opt_a 0.06% : 0.000036s : 1: opt.transform.opt_after_cconv 0.06% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.26% : 0.000149s : 28: opt.transform.opt_b 0.10% : 0.000057s : 2: opt.transform.opt_trans_graph 0.08% : 0.000046s : 4: opt.transform.symbol_engine_opt 6.42% : 0.003621s : 1: opt_a 0.26% : 0.000145s : 1: opt_after_cconv 1.12% : 0.000630s : 1: opt_after_jit_grad 0.60% : 0.000337s : 1: opt_b 13.27% : 0.007489s : 1: optimize 0.04% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000011s : 1: order_py_execute_after_rewriter 0.05% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.08% : 0.000046s : 1: pre_auto_parallel 0.06% : 0.000034s : 1: py_interpret_to_execute 0.04% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.04% : 0.000025s : 1: remove_dup_value 1.03% : 0.000579s : 1: renormalize.infer 0.67% : 0.000376s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000051s : 1: rewriter_after_opt_a 0.17% : 0.000096s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.20% : 0.000111s : 1: symbol_engine_optimizer 0.18% : 0.000103s : 1: tuple_transform 55.90% : 0.031540s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:24.250.587 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0161871, [21] [bootstrap]: 0.00043329 [type_inference]: 0.00598459 [event_method]: 2.121e-05 [auto_monad]: 6.507e-05 [graph_reusing]: 5.97001e-06 [inline]: 2.37999e-06 [add_attr]: 0.00321104, [1] [add_attr_with_inline]: 0.0032014, [1] [Cycle 1]: 6.547e-05, [2] [tag_attr]: 2.061e-05 [meta_addattr_fg_expand]: 6.04999e-06 [parallel-infer-symbol]: 3.5e-06 [pre_auto_parallel]: 3.822e-05 [insert-virtual-dataset]: 2.64999e-06 [parallel-infer-symbol-second]: 8.70001e-07 [dataset_repeat_opt]: 2.04999e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.00569025, [53] [py_interpret_to_execute]: 2.923e-05 [rewriter_before_opt_a]: 8.587e-05 [opt_a]: 0.00342789, [2] [Cycle 1]: 0.00262704, [45] [expand_dump_flag]: 2.282e-05 [switch_simplify]: 4.537e-05 [loop_unroll]: 3.225e-05 [a_1]: 0.00069814 [with_stream_mark]: 1.95e-05 [recompute_prepare]: 1.064e-05 [updatestate_depend_eliminate]: 4.53999e-06 [updatestate_assign_eliminate]: 3.83001e-06 [updatestate_loads_eliminate]: 3.46999e-06 [parameter_eliminate]: 1.82999e-06 [a_2]: 0.00010338 [accelerated_algorithm]: 8.43001e-06 [shard]: 1.72999e-06 [meta_shard_fg_expand]: 2.12999e-06 [shard_inline]: 7.55e-06 [merge_send_recv]: 9.39e-06 [auto_parallel]: 7.43999e-06 [parallel]: 1.897e-05 [flash_sp]: 9.95002e-06 [merge_comm]: 4.45999e-06 [allreduce_fusion]: 4.13001e-06 [matmul_add_comm_reduction]: 1.107e-05 [allreduce_slice_to_reducescatter]: 6.40022e-07 [virtual_shard_identity]: 9.74999e-06 [virtual_dataset]: 7.74002e-06 [get_grad_eliminate_]: 7.22002e-06 [virtual_output]: 7.36001e-06 [merge_forward]: 4.50001e-06 [cell_reuse_recompute_pass]: 1.96e-06 [offload_activation]: 1.152e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.526e-05 [merge_recompute_call_nodes]: 1.89e-06 [before_grad]: 1.233e-05 [set_forward_comm_id_for_comm_node_pass]: 4.71002e-06 [meta_fg_expand]: 3.46001e-06 [flash_sp_send_recv_attached]: 3.03e-06 [receive_attached]: 1.94e-06 [after_resolve]: 1.298e-05 [a_after_grad]: 1.182e-05 [renormalize]: 0.00109841 [add_forward_monad_depend]: 6.11e-06 [auto_monad_grad]: 2.31e-06 [auto_monad_eliminator]: 1.793e-05 [cse]: 3.665e-05 [a_3]: 5.976e-05 [Cycle 2]: 0.00079057, [45] [expand_dump_flag]: 1.67999e-06 [switch_simplify]: 9.61e-06 [loop_unroll]: 7.45e-06 [a_1]: 0.00017619 [with_stream_mark]: 1.313e-05 [recompute_prepare]: 7.82e-06 [updatestate_depend_eliminate]: 4.44998e-06 [updatestate_assign_eliminate]: 3.36001e-06 [updatestate_loads_eliminate]: 2.89999e-06 [parameter_eliminate]: 1.42999e-06 [a_2]: 9.225e-05 [accelerated_algorithm]: 8.13999e-06 [shard]: 1.40999e-06 [meta_shard_fg_expand]: 1.94e-06 [shard_inline]: 7.63999e-06 [merge_send_recv]: 1.018e-05 [auto_parallel]: 6.76e-06 [parallel]: 6.45002e-06 [flash_sp]: 3.45e-06 [merge_comm]: 4.60999e-06 [allreduce_fusion]: 4.02998e-06 [matmul_add_comm_reduction]: 8.77999e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 8.60999e-06 [virtual_dataset]: 7.68001e-06 [get_grad_eliminate_]: 6.96999e-06 [virtual_output]: 6.92002e-06 [merge_forward]: 4.22998e-06 [cell_reuse_recompute_pass]: 2.70997e-06 [offload_activation]: 8.36002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.476e-05 [merge_recompute_call_nodes]: 1.05001e-06 [before_grad]: 1.179e-05 [set_forward_comm_id_for_comm_node_pass]: 6.24999e-06 [meta_fg_expand]: 2.73e-06 [flash_sp_send_recv_attached]: 1.40001e-06 [receive_attached]: 1.79e-06 [after_resolve]: 1.42e-05 [a_after_grad]: 1.181e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.89999e-06 [auto_monad_grad]: 1.49e-06 [auto_monad_eliminator]: 1.034e-05 [cse]: 2.093e-05 [a_3]: 4.567e-05 [py_interpret_to_execute_after_opt_a]: 1.221e-05 [slice_cell_reuse_recomputed_activation]: 2.26e-06 [rewriter_after_opt_a]: 4.033e-05 [convert_after_rewriter]: 7.66001e-06 [order_py_execute_after_rewriter]: 6.22001e-06 [mutable_eliminate]: 0.00057035 [opt_b]: 0.00026393, [1] [Cycle 1]: 0.00025749, [7] [b_1]: 0.00017049 [b_2]: 9.51998e-06 [updatestate_depend_eliminate]: 6.09999e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 3.06001e-06 [renormalize]: 6.19999e-07 [cse]: 2.594e-05 [optimize_parallel_all_gather_comm]: 1.923e-05 [overlap_param_gather]: 1.99e-06 [cconv]: 2.831e-05 [loop_unroll]: 0.00045218 [opt_after_cconv]: 0.00012178, [1] [Cycle 1]: 0.00011566, [7] [c_1]: 4.156e-05 [parameter_eliminate]: 3.18e-06 [updatestate_depend_eliminate]: 6.14001e-06 [updatestate_assign_eliminate]: 3.23998e-06 [updatestate_loads_eliminate]: 2.87002e-06 [cse]: 2.319e-05 [renormalize]: 3.59985e-07 [remove_dup_value]: 1.616e-05 [tuple_transform]: 9.041e-05, [1] [Cycle 1]: 8.581e-05, [4] [d_1]: 5.479e-05 [none_parameter_eliminate]: 2.18002e-06 [renormalize]: 7.2e-07 [switch_simplify]: 8.90001e-06 [partial_unused_args_eliminate]: 2.04e-06 [add_recomputation]: 5.773e-05 [cse_after_recomputation]: 2.725e-05, [1] [Cycle 1]: 2.244e-05, [1] [cse]: 1.62e-05 [environ_conv]: 5.93002e-06 [swap_dp_allreduce_reducescatter]: 6.11998e-06 [bias_add_comm_swap]: 3.08e-06 [label_micro_interleaved_index]: 4.1e-06 [label_fine_grained_interleaved_index]: 3.26001e-06 [merge_cast_opt]: 1.25001e-06 [slice_recompute_activation]: 2.39001e-06 [micro_interleaved_order_control]: 2.64999e-06 [assign_add_opt]: 1.25999e-06 [ForceFp32Comm]: 8.79983e-07 [remove_cast_before_assign_add]: 1.14e-06 [full_micro_interleaved_order_control]: 2.43002e-06 [reorder_send_recv_between_fp_bp]: 2.81e-06 [comm_op_add_attrs]: 1.25001e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.11002e-06 [overlap_opt_shard_in_pipeline]: 1.71e-06 [overlap_opt_shard_grad_in_pipeline]: 2.12999e-06 [control_data_broadcast_order]: 1.588e-05 [grouped_pairwise_exchange_alltoall]: 1.56002e-06 [offloading_packed_experts]: 4.10998e-06 [overlap_recompute_and_grad_model_parallel]: 5.94999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.25001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.51e-06 [overlap_grad_ring_attention]: 4.86002e-06 [overlap_grad_flash_sp]: 2.426e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 2.07999e-06 [split_layernorm_comm]: 1.57001e-06 [handle_group_info]: 8.99978e-07 [symbol_engine_optimizer]: 8.976e-05, [1] [Cycle 1]: 8.435e-05, [6] [build]: 3.9e-06 [elim_shapecalc]: 1.213e-05 [elim_not_effective]: 1.641e-05 [opt_reshape]: 8.87999e-06 [fold_const_symbol]: 1.29e-05 [renormalize]: 2.3999e-07 [detach_backward]: 2.22001e-06 [pipeline_parallel_scheduler]: 2.23998e-06 [auto_monad_reorder]: 2.22e-05 [get_jit_bprop_graph]: 2.27999e-06 [rewriter_after_jit_bprop_graph]: 4.10998e-06 [opt_after_jit_grad]: 0.00049052 [validate]: 4.47e-05 Sums bootstrap : 0.000433s : 3.62% type_inference : 0.005985s : 49.94% event_method : 0.000021s : 0.18% auto_monad : 0.000065s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000038s : 0.32% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.24% optimize.rewriter_before_opt_a : 0.000086s : 0.72% optimize.opt_a.expand_dump_flag : 0.000024s : 0.20% optimize.opt_a.switch_simplify : 0.000055s : 0.46% optimize.opt_a.loop_unroll : 0.000040s : 0.33% optimize.opt_a.a_1 : 0.000874s : 7.30% optimize.opt_a.with_stream_mark : 0.000033s : 0.27% optimize.opt_a.recompute_prepare : 0.000018s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000196s : 1.63% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.13% optimize.opt_a.merge_send_recv : 0.000020s : 0.16% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.21% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.15% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.12% optimize.opt_a.virtual_output : 0.000014s : 0.12% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.04% optimize.opt_a.offload_activation : 0.000020s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.23% optimize.opt_a.a_after_grad : 0.000024s : 0.20% optimize.opt_a.renormalize : 0.001098s : 9.17% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.24% optimize.opt_a.cse : 0.000058s : 0.48% optimize.opt_a.a_3 : 0.000105s : 0.88% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.34% optimize.convert_after_rewriter : 0.000008s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000570s : 4.76% optimize.opt_b.b_1 : 0.000170s : 1.42% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000026s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000028s : 0.24% optimize.loop_unroll : 0.000452s : 3.77% optimize.opt_after_cconv.c_1 : 0.000042s : 0.35% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000023s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.13% optimize.tuple_transform.d_1 : 0.000055s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000001s : 0.01% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000058s : 0.48% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000024s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000022s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000491s : 4.09% validate : 0.000045s : 0.37% Time group info: ------[substitution.] 0.000228 39 10.68% : 0.000024s : 3: substitution.cast_eliminate 1.06% : 0.000002s : 3: substitution.elim_not_effective 0.81% : 0.000002s : 3: substitution.fold_const_symbol 2.92% : 0.000007s : 5: substitution.graph_param_transform 67.88% : 0.000155s : 4: substitution.inline 2.06% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.80% : 0.000006s : 6: substitution.remove_not_recompute_node 2.67% : 0.000006s : 4: substitution.replace_old_param 5.73% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator 3.40% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005921 2 87.45% : 0.005177s : 1: type_inference.infer 12.55% : 0.000743s : 1: type_inference.specialize ------[replace.] 0.000064 8 62.07% : 0.000040s : 4: replace.inline 37.93% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000164 8 93.11% : 0.000153s : 4: match.inline 6.89% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000241 1504 0.84% : 0.000002s : 15: predicate.accumulaten_eliminater 0.85% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 10: predicate.addn_check_dump 0.83% : 0.000002s : 15: predicate.addn_zero_filter 0.83% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.12% : 0.000005s : 25: predicate.arithmetic_simplify 1.21% : 0.000003s : 15: predicate.cast_eliminate 0.59% : 0.000001s : 10: predicate.check_bprop_eliminate 0.57% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.62% : 0.000001s : 10: predicate.depend_value_elim 0.89% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.02% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 5: predicate.elim_not_effective 0.41% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.07% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.16% : 0.000003s : 20: predicate.environ_get_depend_swap 1.68% : 0.000004s : 30: predicate.environ_get_eliminate 1.12% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.44% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.14% : 0.000005s : 23: predicate.float_depend_g_call 0.56% : 0.000001s : 10: predicate.float_environ_get_switch 0.84% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.21% : 0.000001s : 5: predicate.fold_const_symbol 0.66% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.66% : 0.000002s : 10: predicate.incorporate_call 0.55% : 0.000001s : 10: predicate.incorporate_call_switch 6.43% : 0.000015s : 68: predicate.inline 1.05% : 0.000003s : 10: predicate.inline_without_move 0.35% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.89% : 0.000002s : 10: predicate.less_batch_normalization 1.87% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.52% : 0.000006s : 44: predicate.load_eliminater 0.89% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.19% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.70% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.61% : 0.000001s : 10: predicate.merge_addn 0.58% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 15: predicate.minmaximum_grad 0.97% : 0.000002s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.34% : 0.000001s : 5: predicate.parallel_virtual_node 1.76% : 0.000004s : 23: predicate.partial_defer_inline 1.69% : 0.000004s : 24: predicate.partial_eliminate 0.89% : 0.000002s : 15: predicate.print_const_string_wrapper 0.60% : 0.000001s : 10: predicate.reduce_all_const_elim 1.14% : 0.000003s : 15: predicate.reduce_eliminate 2.56% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 10: predicate.remove_not_recompute_node 1.36% : 0.000003s : 29: predicate.replace_applicator 0.53% : 0.000001s : 10: predicate.replace_old_param 0.37% : 0.000001s : 5: predicate.reset_defer_inline 0.95% : 0.000002s : 15: predicate.reshape_eliminate 0.63% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 5: predicate.row_tensor_eliminate 0.74% : 0.000002s : 10: predicate.same_eliminate 0.40% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 10: predicate.shard_identity_eliminate 0.67% : 0.000002s : 10: predicate.special_op_eliminate 0.84% : 0.000002s : 10: predicate.specialize_transform 1.00% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.49% : 0.000004s : 23: predicate.switch_defer_inline 2.06% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.29% : 0.000013s : 74: predicate.switch_simplify 0.83% : 0.000002s : 15: predicate.tile_eliminate 0.91% : 0.000002s : 15: predicate.transpose_eliminate 1.52% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.11% : 0.000007s : 39: predicate.tuple_list_get_item_eliminator 1.49% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.72% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.44% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.17% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.48% : 0.000001s : 5: predicate.value_based_eliminate 0.61% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.63% : 0.000002s : 10: predicate.virtual_output_eliminate 0.30% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000545 11 54.08% : 0.000295s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.92% : 0.000250s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027806 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.56% : 0.003216s : 1: add_attr 11.53% : 0.003205s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000062s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.26% : 0.000072s : 1: auto_monad 0.09% : 0.000026s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.66% : 0.000463s : 1: bootstrap 0.11% : 0.000032s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.10% : 0.000028s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.66% : 0.000462s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 2.08% : 0.000580s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000020s : 1: opt.transform.mutable_eliminate 5.02% : 0.001395s : 78: opt.transform.opt_a 0.14% : 0.000040s : 1: opt.transform.opt_after_cconv 0.11% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.53% : 0.000147s : 28: opt.transform.opt_b 0.22% : 0.000061s : 2: opt.transform.opt_trans_graph 0.17% : 0.000046s : 4: opt.transform.symbol_engine_opt 12.34% : 0.003431s : 1: opt_a 0.45% : 0.000125s : 1: opt_after_cconv 1.80% : 0.000500s : 1: opt_after_jit_grad 0.96% : 0.000268s : 1: opt_b 20.48% : 0.005695s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000042s : 1: pre_auto_parallel 0.12% : 0.000033s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 2.75% : 0.000766s : 1: renormalize.infer 1.16% : 0.000323s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000045s : 1: rewriter_after_opt_a 0.33% : 0.000091s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000093s : 1: symbol_engine_optimizer 0.34% : 0.000093s : 1: tuple_transform 21.59% : 0.006004s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:24.491.454 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:24.491.710 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0185034, [21] [bootstrap]: 0.0004566 [type_inference]: 0.00646026 [event_method]: 2.189e-05 [auto_monad]: 6.674e-05 [graph_reusing]: 6.11e-06 [inline]: 2.51e-06 [add_attr]: 0.00363682, [1] [add_attr_with_inline]: 0.00362555, [1] [Cycle 1]: 9.445e-05, [2] [tag_attr]: 2.513e-05 [meta_addattr_fg_expand]: 6.59999e-06 [parallel-infer-symbol]: 3.19001e-06 [pre_auto_parallel]: 4.16e-05 [insert-virtual-dataset]: 2.34001e-06 [parallel-infer-symbol-second]: 6.69999e-07 [dataset_repeat_opt]: 2.16e-06 [pipeline_split]: 1.58002e-06 [optimize]: 0.00634604, [53] [py_interpret_to_execute]: 3.229e-05 [rewriter_before_opt_a]: 0.00011678 [opt_a]: 0.00371895, [2] [Cycle 1]: 0.00274595, [45] [expand_dump_flag]: 3.63e-06 [switch_simplify]: 4.684e-05 [loop_unroll]: 3.134e-05 [a_1]: 0.00071159 [with_stream_mark]: 1.745e-05 [recompute_prepare]: 1.054e-05 [updatestate_depend_eliminate]: 4.89e-06 [updatestate_assign_eliminate]: 4.35999e-06 [updatestate_loads_eliminate]: 3.54002e-06 [parameter_eliminate]: 2.53e-06 [a_2]: 0.00013361 [accelerated_algorithm]: 8.57998e-06 [shard]: 2.16998e-06 [meta_shard_fg_expand]: 2.31e-06 [shard_inline]: 7.79002e-06 [merge_send_recv]: 9.57999e-06 [auto_parallel]: 7.11999e-06 [parallel]: 1.995e-05 [flash_sp]: 1.004e-05 [merge_comm]: 5.13002e-06 [allreduce_fusion]: 4.23999e-06 [matmul_add_comm_reduction]: 1.092e-05 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 1.076e-05 [virtual_dataset]: 7.92e-06 [get_grad_eliminate_]: 7.69002e-06 [virtual_output]: 8.07e-06 [merge_forward]: 4.65001e-06 [cell_reuse_recompute_pass]: 1.21002e-06 [offload_activation]: 1.181e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.741e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.32e-05 [set_forward_comm_id_for_comm_node_pass]: 4.74998e-06 [meta_fg_expand]: 3.37002e-06 [flash_sp_send_recv_attached]: 2.74999e-06 [receive_attached]: 2.36e-06 [after_resolve]: 1.293e-05 [a_after_grad]: 1.199e-05 [renormalize]: 0.00104415 [add_forward_monad_depend]: 6.01998e-06 [auto_monad_grad]: 2.37999e-06 [auto_monad_eliminator]: 1.751e-05 [cse]: 3.476e-05 [a_3]: 7.318e-05 [Cycle 2]: 0.00095801, [45] [expand_dump_flag]: 1.75001e-06 [switch_simplify]: 9.05001e-06 [loop_unroll]: 7.61999e-06 [a_1]: 0.00017697 [with_stream_mark]: 1.373e-05 [recompute_prepare]: 8.28999e-06 [updatestate_depend_eliminate]: 3.83999e-06 [updatestate_assign_eliminate]: 3.33998e-06 [updatestate_loads_eliminate]: 3.18e-06 [parameter_eliminate]: 1.24e-06 [a_2]: 0.00012135 [accelerated_algorithm]: 8.04002e-06 [shard]: 1.46002e-06 [meta_shard_fg_expand]: 2.30002e-06 [shard_inline]: 7.6e-06 [merge_send_recv]: 7.33e-06 [auto_parallel]: 7.88001e-06 [parallel]: 5.51002e-06 [flash_sp]: 4.08001e-06 [merge_comm]: 4.13999e-06 [allreduce_fusion]: 4.2e-06 [matmul_add_comm_reduction]: 7.76001e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 8.75999e-06 [virtual_dataset]: 7.59002e-06 [get_grad_eliminate_]: 7.51999e-06 [virtual_output]: 7.57002e-06 [merge_forward]: 3.86999e-06 [cell_reuse_recompute_pass]: 2.06e-06 [offload_activation]: 8.94998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.959e-05 [merge_recompute_call_nodes]: 1.07998e-06 [before_grad]: 1.3e-05 [set_forward_comm_id_for_comm_node_pass]: 4.99003e-06 [meta_fg_expand]: 3.06999e-06 [flash_sp_send_recv_attached]: 1.27e-06 [receive_attached]: 1.52001e-06 [after_resolve]: 1.237e-05 [a_after_grad]: 1.128e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.72001e-06 [auto_monad_grad]: 1.17e-06 [auto_monad_eliminator]: 9.52999e-06 [cse]: 1.969e-05 [a_3]: 5.821e-05 [py_interpret_to_execute_after_opt_a]: 1.683e-05 [slice_cell_reuse_recomputed_activation]: 4.99e-06 [rewriter_after_opt_a]: 4.841e-05 [convert_after_rewriter]: 1.099e-05 [order_py_execute_after_rewriter]: 9.00999e-06 [mutable_eliminate]: 0.00067772 [opt_b]: 0.00032975, [1] [Cycle 1]: 0.0003199, [7] [b_1]: 0.0002127 [b_2]: 1.001e-05 [updatestate_depend_eliminate]: 7.85e-06 [updatestate_assign_eliminate]: 3.31001e-06 [updatestate_loads_eliminate]: 3.38e-06 [renormalize]: 7.60017e-07 [cse]: 2.491e-05 [optimize_parallel_all_gather_comm]: 2.124e-05 [overlap_param_gather]: 4.43999e-06 [cconv]: 3.307e-05 [loop_unroll]: 0.00045863 [opt_after_cconv]: 0.00014203, [1] [Cycle 1]: 0.00013361, [7] [c_1]: 3.846e-05 [parameter_eliminate]: 3.27002e-06 [updatestate_depend_eliminate]: 6.65998e-06 [updatestate_assign_eliminate]: 3.15998e-06 [updatestate_loads_eliminate]: 3.15998e-06 [cse]: 2.189e-05 [renormalize]: 5.59987e-07 [remove_dup_value]: 1.803e-05 [tuple_transform]: 0.0001002, [1] [Cycle 1]: 9.314e-05, [4] [d_1]: 5.21e-05 [none_parameter_eliminate]: 1.69998e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 8.47e-06 [partial_unused_args_eliminate]: 4.24002e-06 [add_recomputation]: 6.007e-05 [cse_after_recomputation]: 3.19e-05, [1] [Cycle 1]: 2.522e-05, [1] [cse]: 1.548e-05 [environ_conv]: 9.91998e-06 [swap_dp_allreduce_reducescatter]: 8.55999e-06 [bias_add_comm_swap]: 5.15999e-06 [label_micro_interleaved_index]: 7.3e-06 [label_fine_grained_interleaved_index]: 5.15001e-06 [merge_cast_opt]: 3.73999e-06 [slice_recompute_activation]: 4.68999e-06 [micro_interleaved_order_control]: 4.89998e-06 [assign_add_opt]: 3.55e-06 [ForceFp32Comm]: 3.66999e-06 [remove_cast_before_assign_add]: 3.31999e-06 [full_micro_interleaved_order_control]: 4.32e-06 [reorder_send_recv_between_fp_bp]: 5.69e-06 [comm_op_add_attrs]: 3.48e-06 [add_comm_op_reuse_tag]: 3.28998e-06 [interleave_split_concat_branches]: 3.69002e-06 [interleave_parallel_branches]: 3.45e-06 [overlap_opt_shard_in_pipeline]: 3.44001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.13001e-06 [control_data_broadcast_order]: 1.835e-05 [grouped_pairwise_exchange_alltoall]: 3.86999e-06 [offloading_packed_experts]: 7.76001e-06 [overlap_recompute_and_grad_model_parallel]: 7.48e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.61001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.75998e-06 [overlap_recompute_comm]: 4.50999e-06 [overlap_grad_ring_attention]: 7.36999e-06 [overlap_grad_flash_sp]: 2.635e-05 [begin_end_overlap_inline]: 3.05998e-06 [split_matmul_comm_elemetwise]: 4.83001e-06 [split_layernorm_comm]: 4.18001e-06 [handle_group_info]: 3.25e-06 [symbol_engine_optimizer]: 0.00010377, [1] [Cycle 1]: 9.698e-05, [6] [build]: 3.66999e-06 [elim_shapecalc]: 1.078e-05 [elim_not_effective]: 1.536e-05 [opt_reshape]: 9.12001e-06 [fold_const_symbol]: 1.207e-05 [renormalize]: 2.00002e-07 [detach_backward]: 3.70998e-06 [pipeline_parallel_scheduler]: 1.87001e-06 [auto_monad_reorder]: 2.504e-05 [get_jit_bprop_graph]: 1.89999e-06 [rewriter_after_jit_bprop_graph]: 4.85001e-06 [opt_after_jit_grad]: 0.00074015 [validate]: 4.285e-05 Sums bootstrap : 0.000457s : 3.49% type_inference : 0.006460s : 49.43% event_method : 0.000022s : 0.17% auto_monad : 0.000067s : 0.51% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000042s : 0.32% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000032s : 0.25% optimize.rewriter_before_opt_a : 0.000117s : 0.89% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000056s : 0.43% optimize.opt_a.loop_unroll : 0.000039s : 0.30% optimize.opt_a.a_1 : 0.000889s : 6.80% optimize.opt_a.with_stream_mark : 0.000031s : 0.24% optimize.opt_a.recompute_prepare : 0.000019s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000255s : 1.95% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.12% optimize.opt_a.merge_send_recv : 0.000017s : 0.13% optimize.opt_a.auto_parallel : 0.000015s : 0.11% optimize.opt_a.parallel : 0.000025s : 0.19% optimize.opt_a.flash_sp : 0.000014s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.07% optimize.opt_a.allreduce_fusion : 0.000008s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.15% optimize.opt_a.virtual_dataset : 0.000016s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000016s : 0.12% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.07% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.19% optimize.opt_a.a_after_grad : 0.000023s : 0.18% optimize.opt_a.renormalize : 0.001044s : 7.99% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.21% optimize.opt_a.cse : 0.000054s : 0.42% optimize.opt_a.a_3 : 0.000131s : 1.01% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000048s : 0.37% optimize.convert_after_rewriter : 0.000011s : 0.08% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000678s : 5.19% optimize.opt_b.b_1 : 0.000213s : 1.63% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000025s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.16% optimize.overlap_param_gather : 0.000004s : 0.03% optimize.cconv : 0.000033s : 0.25% optimize.loop_unroll : 0.000459s : 3.51% optimize.opt_after_cconv.c_1 : 0.000038s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000022s : 0.17% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.14% optimize.tuple_transform.d_1 : 0.000052s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.06% optimize.partial_unused_args_eliminate : 0.000004s : 0.03% optimize.add_recomputation : 0.000060s : 0.46% optimize.cse_after_recomputation.cse : 0.000015s : 0.12% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000018s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.03% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000026s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.02% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000025s : 0.19% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000740s : 5.66% validate : 0.000043s : 0.33% Time group info: ------[substitution.] 0.000238 39 10.95% : 0.000026s : 3: substitution.cast_eliminate 0.90% : 0.000002s : 3: substitution.elim_not_effective 0.66% : 0.000002s : 3: substitution.fold_const_symbol 2.75% : 0.000007s : 5: substitution.graph_param_transform 68.70% : 0.000164s : 4: substitution.inline 2.15% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.44% : 0.000008s : 6: substitution.remove_not_recompute_node 2.09% : 0.000005s : 4: substitution.replace_old_param 5.41% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator 2.95% : 0.000007s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006400 2 88.23% : 0.005646s : 1: type_inference.infer 11.77% : 0.000753s : 1: type_inference.specialize ------[replace.] 0.000065 8 63.13% : 0.000041s : 4: replace.inline 36.87% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000172 8 93.59% : 0.000161s : 4: match.inline 6.41% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000249 1504 0.99% : 0.000002s : 15: predicate.accumulaten_eliminater 0.90% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 10: predicate.addn_check_dump 0.99% : 0.000002s : 15: predicate.addn_zero_filter 0.87% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.07% : 0.000005s : 25: predicate.arithmetic_simplify 1.00% : 0.000002s : 15: predicate.cast_eliminate 0.75% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.64% : 0.000002s : 10: predicate.depend_value_elim 0.90% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.14% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.11% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 0.40% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.21% : 0.000003s : 20: predicate.environ_get_depend_swap 1.78% : 0.000004s : 30: predicate.environ_get_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.14% : 0.000005s : 23: predicate.float_depend_g_call 0.68% : 0.000002s : 10: predicate.float_environ_get_switch 0.83% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.73% : 0.000002s : 10: predicate.get_grad_eliminate 0.23% : 0.000001s : 5: predicate.graph_param_transform 0.63% : 0.000002s : 10: predicate.incorporate_call 0.55% : 0.000001s : 10: predicate.incorporate_call_switch 6.18% : 0.000015s : 68: predicate.inline 0.79% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 10: predicate.less_batch_normalization 1.88% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.49% : 0.000006s : 44: predicate.load_eliminater 0.87% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.06% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.66% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 10: predicate.merge_addn 0.55% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 15: predicate.minmaximum_grad 1.19% : 0.000003s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.38% : 0.000001s : 5: predicate.parallel_virtual_node 1.69% : 0.000004s : 23: predicate.partial_defer_inline 1.58% : 0.000004s : 24: predicate.partial_eliminate 0.83% : 0.000002s : 15: predicate.print_const_string_wrapper 0.58% : 0.000001s : 10: predicate.reduce_all_const_elim 1.38% : 0.000003s : 15: predicate.reduce_eliminate 2.51% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 10: predicate.remove_not_recompute_node 1.39% : 0.000003s : 29: predicate.replace_applicator 0.49% : 0.000001s : 10: predicate.replace_old_param 0.36% : 0.000001s : 5: predicate.reset_defer_inline 0.91% : 0.000002s : 15: predicate.reshape_eliminate 0.61% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.69% : 0.000002s : 10: predicate.same_eliminate 0.42% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.85% : 0.000002s : 10: predicate.shard_identity_eliminate 0.71% : 0.000002s : 10: predicate.special_op_eliminate 0.76% : 0.000002s : 10: predicate.specialize_transform 0.95% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.46% : 0.000004s : 23: predicate.switch_defer_inline 1.93% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.84% : 0.000012s : 74: predicate.switch_simplify 0.99% : 0.000002s : 15: predicate.tile_eliminate 1.00% : 0.000002s : 15: predicate.transpose_eliminate 1.67% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.34% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.48% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.92% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.38% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.08% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.46% : 0.000001s : 5: predicate.value_based_eliminate 0.69% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.87% : 0.000002s : 10: predicate.virtual_output_eliminate 0.33% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.50% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000593 11 53.84% : 0.000319s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.16% : 0.000274s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.031166 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.71% : 0.003648s : 1: add_attr 11.65% : 0.003630s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.20% : 0.000063s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000077s : 1: auto_monad 0.11% : 0.000033s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.61% : 0.000503s : 1: bootstrap 0.12% : 0.000036s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.11% : 0.000035s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000019s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.10% : 0.000033s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.49% : 0.000465s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 2.20% : 0.000685s : 1: mutable_eliminate 0.03% : 0.000011s : 1: offloading_packed_experts 0.05% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000019s : 1: opt.transform.mutable_eliminate 4.57% : 0.001423s : 78: opt.transform.opt_a 0.12% : 0.000037s : 1: opt.transform.opt_after_cconv 0.10% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.48% : 0.000149s : 28: opt.transform.opt_b 0.19% : 0.000058s : 2: opt.transform.opt_trans_graph 0.14% : 0.000044s : 4: opt.transform.symbol_engine_opt 11.94% : 0.003722s : 1: opt_a 0.47% : 0.000145s : 1: opt_after_cconv 2.41% : 0.000751s : 1: opt_after_jit_grad 1.07% : 0.000333s : 1: opt_b 21.48% : 0.006695s : 1: optimize 0.08% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.09% : 0.000030s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000007s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000007s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000008s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.16% : 0.000049s : 1: pre_auto_parallel 0.12% : 0.000036s : 1: py_interpret_to_execute 0.07% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000021s : 1: remove_dup_value 2.14% : 0.000667s : 1: renormalize.infer 1.18% : 0.000368s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000052s : 1: rewriter_after_opt_a 0.39% : 0.000121s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000107s : 1: symbol_engine_optimizer 0.33% : 0.000103s : 1: tuple_transform 20.96% : 0.006532s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:24.833.086 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0166221, [21] [bootstrap]: 0.00044966 [type_inference]: 0.00622824 [event_method]: 2.007e-05 [auto_monad]: 6.526e-05 [graph_reusing]: 6.38e-06 [inline]: 2.74001e-06 [add_attr]: 0.00363474, [1] [add_attr_with_inline]: 0.00362416, [1] [Cycle 1]: 7.236e-05, [2] [tag_attr]: 2.3e-05 [meta_addattr_fg_expand]: 6.36e-06 [parallel-infer-symbol]: 3.35e-06 [pre_auto_parallel]: 3.743e-05 [insert-virtual-dataset]: 2.74001e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 1.82999e-06 [pipeline_split]: 1.69e-06 [optimize]: 0.00540505, [53] [py_interpret_to_execute]: 2.846e-05 [rewriter_before_opt_a]: 8.905e-05 [opt_a]: 0.0031323, [2] [Cycle 1]: 0.00231838, [45] [expand_dump_flag]: 3.13e-06 [switch_simplify]: 4.555e-05 [loop_unroll]: 3.177e-05 [a_1]: 0.0006865 [with_stream_mark]: 1.739e-05 [recompute_prepare]: 1.168e-05 [updatestate_depend_eliminate]: 4.48999e-06 [updatestate_assign_eliminate]: 3.95e-06 [updatestate_loads_eliminate]: 3.66001e-06 [parameter_eliminate]: 1.79e-06 [a_2]: 0.0001026 [accelerated_algorithm]: 8.68001e-06 [shard]: 1.69998e-06 [meta_shard_fg_expand]: 2.18002e-06 [shard_inline]: 7.66999e-06 [merge_send_recv]: 9.19e-06 [auto_parallel]: 7.42998e-06 [parallel]: 2.027e-05 [flash_sp]: 8.59e-06 [merge_comm]: 4.68999e-06 [allreduce_fusion]: 4.42e-06 [matmul_add_comm_reduction]: 1.123e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 9.54e-06 [virtual_dataset]: 7.82998e-06 [get_grad_eliminate_]: 7.44002e-06 [virtual_output]: 7.38e-06 [merge_forward]: 4.70001e-06 [cell_reuse_recompute_pass]: 1.07998e-06 [offload_activation]: 1.118e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.636e-05 [merge_recompute_call_nodes]: 1.43002e-06 [before_grad]: 1.308e-05 [set_forward_comm_id_for_comm_node_pass]: 4.80001e-06 [meta_fg_expand]: 3.66999e-06 [flash_sp_send_recv_attached]: 2.66e-06 [receive_attached]: 2.34999e-06 [after_resolve]: 1.291e-05 [a_after_grad]: 1.209e-05 [renormalize]: 0.00083021 [add_forward_monad_depend]: 6.39999e-06 [auto_monad_grad]: 2.42001e-06 [auto_monad_eliminator]: 1.74e-05 [cse]: 3.813e-05 [a_3]: 5.921e-05 [Cycle 2]: 0.00080338, [45] [expand_dump_flag]: 1.38002e-06 [switch_simplify]: 9.44e-06 [loop_unroll]: 7.41999e-06 [a_1]: 0.00017766 [with_stream_mark]: 1.327e-05 [recompute_prepare]: 7.60998e-06 [updatestate_depend_eliminate]: 3.81001e-06 [updatestate_assign_eliminate]: 3.09999e-06 [updatestate_loads_eliminate]: 2.79999e-06 [parameter_eliminate]: 1.02e-06 [a_2]: 9.328e-05 [accelerated_algorithm]: 7.42002e-06 [shard]: 1.25001e-06 [meta_shard_fg_expand]: 1.69998e-06 [shard_inline]: 7.71999e-06 [merge_send_recv]: 6.36e-06 [auto_parallel]: 5.90002e-06 [parallel]: 5.15999e-06 [flash_sp]: 3.61999e-06 [merge_comm]: 5.20999e-06 [allreduce_fusion]: 3.93999e-06 [matmul_add_comm_reduction]: 7.34002e-06 [allreduce_slice_to_reducescatter]: 3.89991e-07 [virtual_shard_identity]: 8.85999e-06 [virtual_dataset]: 7.43999e-06 [get_grad_eliminate_]: 7.01001e-06 [virtual_output]: 7.62998e-06 [merge_forward]: 3.51999e-06 [cell_reuse_recompute_pass]: 1.92001e-06 [offload_activation]: 8.10999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.654e-05 [merge_recompute_call_nodes]: 8.79983e-07 [before_grad]: 1.195e-05 [set_forward_comm_id_for_comm_node_pass]: 4.12e-06 [meta_fg_expand]: 3.08998e-06 [flash_sp_send_recv_attached]: 9.89996e-07 [receive_attached]: 1.14e-06 [after_resolve]: 1.227e-05 [a_after_grad]: 1.161e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.62999e-06 [auto_monad_grad]: 1.47001e-06 [auto_monad_eliminator]: 9.12999e-06 [cse]: 1.975e-05 [a_3]: 4.6e-05 [py_interpret_to_execute_after_opt_a]: 1.192e-05 [slice_cell_reuse_recomputed_activation]: 2.06e-06 [rewriter_after_opt_a]: 4.305e-05 [convert_after_rewriter]: 8.61002e-06 [order_py_execute_after_rewriter]: 6.38e-06 [mutable_eliminate]: 0.00058122 [opt_b]: 0.00026663, [1] [Cycle 1]: 0.00025969, [7] [b_1]: 0.00016769 [b_2]: 9.54e-06 [updatestate_depend_eliminate]: 7.82e-06 [updatestate_assign_eliminate]: 3.48e-06 [updatestate_loads_eliminate]: 3.25002e-06 [renormalize]: 5.50004e-07 [cse]: 2.821e-05 [optimize_parallel_all_gather_comm]: 1.952e-05 [overlap_param_gather]: 2.09e-06 [cconv]: 2.865e-05 [loop_unroll]: 0.00044644 [opt_after_cconv]: 0.00011452, [1] [Cycle 1]: 0.00010865, [7] [c_1]: 3.72e-05 [parameter_eliminate]: 3.98999e-06 [updatestate_depend_eliminate]: 6.21998e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 2.91e-06 [cse]: 2.1e-05 [renormalize]: 5.99975e-07 [remove_dup_value]: 1.482e-05 [tuple_transform]: 8.843e-05, [1] [Cycle 1]: 8.376e-05, [4] [d_1]: 5.511e-05 [none_parameter_eliminate]: 1.71e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 8.13999e-06 [partial_unused_args_eliminate]: 1.89999e-06 [add_recomputation]: 5.802e-05 [cse_after_recomputation]: 2.552e-05, [1] [Cycle 1]: 2.019e-05, [1] [cse]: 1.454e-05 [environ_conv]: 6.66e-06 [swap_dp_allreduce_reducescatter]: 5.82999e-06 [bias_add_comm_swap]: 3.01999e-06 [label_micro_interleaved_index]: 4.35e-06 [label_fine_grained_interleaved_index]: 2.66e-06 [merge_cast_opt]: 1.30999e-06 [slice_recompute_activation]: 2.18998e-06 [micro_interleaved_order_control]: 2.09999e-06 [assign_add_opt]: 1.22999e-06 [ForceFp32Comm]: 7.60017e-07 [remove_cast_before_assign_add]: 9.89996e-07 [full_micro_interleaved_order_control]: 2.61e-06 [reorder_send_recv_between_fp_bp]: 2.98e-06 [comm_op_add_attrs]: 1.29e-06 [add_comm_op_reuse_tag]: 1.05001e-06 [interleave_split_concat_branches]: 1.34e-06 [interleave_parallel_branches]: 1.12e-06 [overlap_opt_shard_in_pipeline]: 1.32999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.61e-06 [control_data_broadcast_order]: 1.574e-05 [grouped_pairwise_exchange_alltoall]: 1.55999e-06 [offloading_packed_experts]: 4.25e-06 [overlap_recompute_and_grad_model_parallel]: 5.57999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.23002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.41002e-06 [overlap_recompute_comm]: 2.24001e-06 [overlap_grad_ring_attention]: 4.53001e-06 [overlap_grad_flash_sp]: 2.401e-05 [begin_end_overlap_inline]: 5.59987e-07 [split_matmul_comm_elemetwise]: 2.31e-06 [split_layernorm_comm]: 1.71e-06 [handle_group_info]: 1.35001e-06 [symbol_engine_optimizer]: 9.74e-05, [1] [Cycle 1]: 9.236e-05, [6] [build]: 3.75e-06 [elim_shapecalc]: 1.357e-05 [elim_not_effective]: 1.761e-05 [opt_reshape]: 1.043e-05 [fold_const_symbol]: 1.431e-05 [renormalize]: 5.00004e-07 [detach_backward]: 2.11e-06 [pipeline_parallel_scheduler]: 1.49998e-06 [auto_monad_reorder]: 2.235e-05 [get_jit_bprop_graph]: 1.96e-06 [rewriter_after_jit_bprop_graph]: 4.42e-06 [opt_after_jit_grad]: 0.00052202 [validate]: 4.827e-05 Sums bootstrap : 0.000450s : 3.75% type_inference : 0.006228s : 52.01% event_method : 0.000020s : 0.17% auto_monad : 0.000065s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000037s : 0.31% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.24% optimize.rewriter_before_opt_a : 0.000089s : 0.74% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.46% optimize.opt_a.loop_unroll : 0.000039s : 0.33% optimize.opt_a.a_1 : 0.000864s : 7.22% optimize.opt_a.with_stream_mark : 0.000031s : 0.26% optimize.opt_a.recompute_prepare : 0.000019s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000196s : 1.64% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.13% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.13% optimize.opt_a.merge_send_recv : 0.000016s : 0.13% optimize.opt_a.auto_parallel : 0.000013s : 0.11% optimize.opt_a.parallel : 0.000025s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.15% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.21% optimize.opt_a.a_after_grad : 0.000024s : 0.20% optimize.opt_a.renormalize : 0.000830s : 6.93% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.22% optimize.opt_a.cse : 0.000058s : 0.48% optimize.opt_a.a_3 : 0.000105s : 0.88% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000043s : 0.36% optimize.convert_after_rewriter : 0.000009s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000581s : 4.85% optimize.opt_b.b_1 : 0.000168s : 1.40% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000028s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000029s : 0.24% optimize.loop_unroll : 0.000446s : 3.73% optimize.opt_after_cconv.c_1 : 0.000037s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000021s : 0.18% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000015s : 0.12% optimize.tuple_transform.d_1 : 0.000055s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000058s : 0.48% optimize.cse_after_recomputation.cse : 0.000015s : 0.12% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000024s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.09% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000022s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000522s : 4.36% validate : 0.000048s : 0.40% Time group info: ------[substitution.] 0.000231 39 11.16% : 0.000026s : 3: substitution.cast_eliminate 0.99% : 0.000002s : 3: substitution.elim_not_effective 0.78% : 0.000002s : 3: substitution.fold_const_symbol 3.23% : 0.000007s : 5: substitution.graph_param_transform 67.78% : 0.000157s : 4: substitution.inline 2.10% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.64% : 0.000008s : 6: substitution.remove_not_recompute_node 2.00% : 0.000005s : 4: substitution.replace_old_param 5.21% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator 3.13% : 0.000007s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006157 2 87.61% : 0.005394s : 1: type_inference.infer 12.39% : 0.000763s : 1: type_inference.specialize ------[replace.] 0.000062 8 64.05% : 0.000040s : 4: replace.inline 35.95% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000164 8 93.78% : 0.000154s : 4: match.inline 6.22% : 0.000010s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000240 1504 0.95% : 0.000002s : 15: predicate.accumulaten_eliminater 0.94% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.57% : 0.000001s : 10: predicate.addn_check_dump 0.87% : 0.000002s : 15: predicate.addn_zero_filter 0.79% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.19% : 0.000005s : 25: predicate.arithmetic_simplify 1.05% : 0.000003s : 15: predicate.cast_eliminate 0.62% : 0.000001s : 10: predicate.check_bprop_eliminate 0.58% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.62% : 0.000001s : 10: predicate.depend_value_elim 0.89% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.96% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.89% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.30% : 0.000001s : 5: predicate.elim_not_effective 0.61% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.05% : 0.000003s : 20: predicate.environ_get_depend_swap 1.74% : 0.000004s : 30: predicate.environ_get_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.33% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.46% : 0.000006s : 23: predicate.float_depend_g_call 0.57% : 0.000001s : 10: predicate.float_environ_get_switch 0.82% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.23% : 0.000001s : 5: predicate.fold_const_symbol 0.70% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.70% : 0.000002s : 10: predicate.incorporate_call 0.57% : 0.000001s : 10: predicate.incorporate_call_switch 6.46% : 0.000016s : 68: predicate.inline 0.85% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 10: predicate.less_batch_normalization 1.70% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.47% : 0.000006s : 44: predicate.load_eliminater 0.87% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.11% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.66% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.60% : 0.000001s : 10: predicate.merge_addn 0.64% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 15: predicate.minmaximum_grad 1.18% : 0.000003s : 5: predicate.mutable_eliminate 0.48% : 0.000001s : 5: predicate.opt_reshape 0.49% : 0.000001s : 5: predicate.parallel_virtual_node 1.59% : 0.000004s : 23: predicate.partial_defer_inline 1.62% : 0.000004s : 24: predicate.partial_eliminate 0.89% : 0.000002s : 15: predicate.print_const_string_wrapper 0.58% : 0.000001s : 10: predicate.reduce_all_const_elim 1.31% : 0.000003s : 15: predicate.reduce_eliminate 2.50% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 10: predicate.remove_not_recompute_node 1.34% : 0.000003s : 29: predicate.replace_applicator 0.51% : 0.000001s : 10: predicate.replace_old_param 0.35% : 0.000001s : 5: predicate.reset_defer_inline 0.86% : 0.000002s : 15: predicate.reshape_eliminate 0.60% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 5: predicate.row_tensor_eliminate 0.93% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 10: predicate.shard_identity_eliminate 0.69% : 0.000002s : 10: predicate.special_op_eliminate 0.77% : 0.000002s : 10: predicate.specialize_transform 0.82% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.42% : 0.000003s : 23: predicate.switch_defer_inline 1.97% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.01% : 0.000012s : 74: predicate.switch_simplify 0.88% : 0.000002s : 15: predicate.tile_eliminate 0.89% : 0.000002s : 15: predicate.transpose_eliminate 1.49% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.49% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.33% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.55% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.33% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.68% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.47% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.14% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.55% : 0.000001s : 5: predicate.value_based_eliminate 0.67% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000570 11 51.94% : 0.000296s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.06% : 0.000274s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028101 192 0.01% : 0.000003s : 1: ForceFp32Comm 12.95% : 0.003640s : 1: add_attr 12.91% : 0.003629s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000063s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.25% : 0.000071s : 1: auto_monad 0.10% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.70% : 0.000479s : 1: bootstrap 0.11% : 0.000032s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.10% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000010s : 1: environ_conv 0.09% : 0.000026s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000006s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.62% : 0.000455s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.10% : 0.000590s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000021s : 1: opt.transform.mutable_eliminate 4.94% : 0.001387s : 78: opt.transform.opt_a 0.13% : 0.000036s : 1: opt.transform.opt_after_cconv 0.12% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.52% : 0.000145s : 28: opt.transform.opt_b 0.22% : 0.000061s : 2: opt.transform.opt_trans_graph 0.18% : 0.000051s : 4: opt.transform.symbol_engine_opt 11.16% : 0.003135s : 1: opt_a 0.42% : 0.000118s : 1: opt_after_cconv 1.90% : 0.000533s : 1: opt_after_jit_grad 0.96% : 0.000271s : 1: opt_b 19.25% : 0.005410s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000042s : 1: pre_auto_parallel 0.12% : 0.000033s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000018s : 1: remove_dup_value 1.69% : 0.000476s : 1: renormalize.infer 1.23% : 0.000346s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000048s : 1: rewriter_after_opt_a 0.33% : 0.000093s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000101s : 1: symbol_engine_optimizer 0.32% : 0.000091s : 1: tuple_transform 22.24% : 0.006249s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:25.256.109 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:25.256.413 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0334478, [21] [bootstrap]: 0.00070256 [type_inference]: 0.00667066 [event_method]: 2.266e-05 [auto_monad]: 0.00010032 [graph_reusing]: 6.74999e-06 [inline]: 2.79999e-06 [add_attr]: 0.00369131, [1] [add_attr_with_inline]: 0.00368014, [1] [Cycle 1]: 9.543e-05, [2] [tag_attr]: 2.397e-05 [meta_addattr_fg_expand]: 6.29001e-06 [parallel-infer-symbol]: 4.29002e-06 [pre_auto_parallel]: 4.146e-05 [insert-virtual-dataset]: 2.31998e-06 [parallel-infer-symbol-second]: 8.09989e-07 [dataset_repeat_opt]: 2.20002e-06 [pipeline_split]: 1.63002e-06 [optimize]: 0.0201852, [53] [py_interpret_to_execute]: 3.458e-05 [rewriter_before_opt_a]: 0.00011531 [opt_a]: 0.0172134, [2] [Cycle 1]: 0.0161991, [45] [expand_dump_flag]: 3.53999e-06 [switch_simplify]: 4.761e-05 [loop_unroll]: 3.405e-05 [a_1]: 0.0137596 [with_stream_mark]: 2.86e-05 [recompute_prepare]: 1.458e-05 [updatestate_depend_eliminate]: 5.76998e-06 [updatestate_assign_eliminate]: 4.10998e-06 [updatestate_loads_eliminate]: 3.94002e-06 [parameter_eliminate]: 2.55997e-06 [a_2]: 0.00040068 [accelerated_algorithm]: 1.066e-05 [shard]: 2.45997e-06 [meta_shard_fg_expand]: 3.79002e-06 [shard_inline]: 8.52e-06 [merge_send_recv]: 1.398e-05 [auto_parallel]: 1.352e-05 [parallel]: 2.375e-05 [flash_sp]: 1.224e-05 [merge_comm]: 4.73001e-06 [allreduce_fusion]: 4.32003e-06 [matmul_add_comm_reduction]: 1.17e-05 [allreduce_slice_to_reducescatter]: 9.89996e-07 [virtual_shard_identity]: 9.53002e-06 [virtual_dataset]: 8.57e-06 [get_grad_eliminate_]: 7.92e-06 [virtual_output]: 8.22e-06 [merge_forward]: 4.97e-06 [cell_reuse_recompute_pass]: 1.95001e-06 [offload_activation]: 1.164e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.069e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 1.439e-05 [set_forward_comm_id_for_comm_node_pass]: 4.89e-06 [meta_fg_expand]: 3.89002e-06 [flash_sp_send_recv_attached]: 2.43e-06 [receive_attached]: 2.67001e-06 [after_resolve]: 1.481e-05 [a_after_grad]: 1.294e-05 [renormalize]: 0.00100496 [add_forward_monad_depend]: 7.14001e-06 [auto_monad_grad]: 3.04999e-06 [auto_monad_eliminator]: 2.282e-05 [cse]: 4.206e-05 [a_3]: 8.181e-05 [Cycle 2]: 0.00099665, [45] [expand_dump_flag]: 2.37999e-06 [switch_simplify]: 1.05e-05 [loop_unroll]: 7.83999e-06 [a_1]: 0.00019134 [with_stream_mark]: 1.66e-05 [recompute_prepare]: 8.54e-06 [updatestate_depend_eliminate]: 3.94002e-06 [updatestate_assign_eliminate]: 3.55e-06 [updatestate_loads_eliminate]: 3.19001e-06 [parameter_eliminate]: 1.54e-06 [a_2]: 0.00012143 [accelerated_algorithm]: 7.82998e-06 [shard]: 1.63002e-06 [meta_shard_fg_expand]: 2.58003e-06 [shard_inline]: 1.563e-05 [merge_send_recv]: 6.94999e-06 [auto_parallel]: 8.41002e-06 [parallel]: 6.93e-06 [flash_sp]: 3.87998e-06 [merge_comm]: 3.94002e-06 [allreduce_fusion]: 4.35e-06 [matmul_add_comm_reduction]: 9.10001e-06 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 9.20001e-06 [virtual_dataset]: 7.75e-06 [get_grad_eliminate_]: 7.56999e-06 [virtual_output]: 7.38999e-06 [merge_forward]: 4.20999e-06 [cell_reuse_recompute_pass]: 2.13002e-06 [offload_activation]: 9.64e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.756e-05 [merge_recompute_call_nodes]: 8.80013e-07 [before_grad]: 1.24e-05 [set_forward_comm_id_for_comm_node_pass]: 4.2e-06 [meta_fg_expand]: 3.25e-06 [flash_sp_send_recv_attached]: 1.13001e-06 [receive_attached]: 1.49998e-06 [after_resolve]: 1.268e-05 [a_after_grad]: 1.161e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.95001e-06 [auto_monad_grad]: 1.65001e-06 [auto_monad_eliminator]: 1.071e-05 [cse]: 2.12e-05 [a_3]: 5.949e-05 [py_interpret_to_execute_after_opt_a]: 1.894e-05 [slice_cell_reuse_recomputed_activation]: 5.32001e-06 [rewriter_after_opt_a]: 4.923e-05 [convert_after_rewriter]: 1.131e-05 [order_py_execute_after_rewriter]: 8.66997e-06 [mutable_eliminate]: 0.00078941 [opt_b]: 0.00034789, [1] [Cycle 1]: 0.00033598, [7] [b_1]: 0.00022482 [b_2]: 1.05e-05 [updatestate_depend_eliminate]: 7.11001e-06 [updatestate_assign_eliminate]: 3.05998e-06 [updatestate_loads_eliminate]: 3.41999e-06 [renormalize]: 7.39994e-07 [cse]: 2.741e-05 [optimize_parallel_all_gather_comm]: 2.306e-05 [overlap_param_gather]: 5.23002e-06 [cconv]: 3.321e-05 [loop_unroll]: 0.00054756 [opt_after_cconv]: 0.00016185, [1] [Cycle 1]: 0.0001506, [7] [c_1]: 4.383e-05 [parameter_eliminate]: 4.02e-06 [updatestate_depend_eliminate]: 7.50998e-06 [updatestate_assign_eliminate]: 3.95e-06 [updatestate_loads_eliminate]: 3.31001e-06 [cse]: 2.683e-05 [renormalize]: 3.29979e-07 [remove_dup_value]: 2.052e-05 [tuple_transform]: 0.00011584, [1] [Cycle 1]: 0.00010686, [4] [d_1]: 6.057e-05 [none_parameter_eliminate]: 1.67999e-06 [renormalize]: 1.70025e-07 [switch_simplify]: 1.04e-05 [partial_unused_args_eliminate]: 4.31002e-06 [add_recomputation]: 6.543e-05 [cse_after_recomputation]: 3.497e-05, [1] [Cycle 1]: 2.774e-05, [1] [cse]: 1.693e-05 [environ_conv]: 1.033e-05 [swap_dp_allreduce_reducescatter]: 9.42999e-06 [bias_add_comm_swap]: 6.04999e-06 [label_micro_interleaved_index]: 9.16998e-06 [label_fine_grained_interleaved_index]: 5.72999e-06 [merge_cast_opt]: 4.65001e-06 [slice_recompute_activation]: 4.70999e-06 [micro_interleaved_order_control]: 5.44e-06 [assign_add_opt]: 4.28001e-06 [ForceFp32Comm]: 3.83001e-06 [remove_cast_before_assign_add]: 3.7e-06 [full_micro_interleaved_order_control]: 5.10001e-06 [reorder_send_recv_between_fp_bp]: 5.16002e-06 [comm_op_add_attrs]: 3.98999e-06 [add_comm_op_reuse_tag]: 3.25e-06 [interleave_split_concat_branches]: 3.89002e-06 [interleave_parallel_branches]: 3.68e-06 [overlap_opt_shard_in_pipeline]: 3.71001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.1e-06 [control_data_broadcast_order]: 2.038e-05 [grouped_pairwise_exchange_alltoall]: 4.40999e-06 [offloading_packed_experts]: 8.05e-06 [overlap_recompute_and_grad_model_parallel]: 8.59002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.63999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.95998e-06 [overlap_recompute_comm]: 5.41002e-06 [overlap_grad_ring_attention]: 7.64002e-06 [overlap_grad_flash_sp]: 2.765e-05 [begin_end_overlap_inline]: 3.55e-06 [split_matmul_comm_elemetwise]: 4.87e-06 [split_layernorm_comm]: 3.90998e-06 [handle_group_info]: 3.41999e-06 [symbol_engine_optimizer]: 0.00014438, [1] [Cycle 1]: 0.0001366, [6] [build]: 4.24002e-06 [elim_shapecalc]: 1.724e-05 [elim_not_effective]: 1.725e-05 [opt_reshape]: 8.86997e-06 [fold_const_symbol]: 3.97e-05 [renormalize]: 3.89991e-07 [detach_backward]: 6.53e-06 [pipeline_parallel_scheduler]: 2.19001e-06 [auto_monad_reorder]: 3.268e-05 [get_jit_bprop_graph]: 1.97001e-06 [rewriter_after_jit_bprop_graph]: 7.27002e-06 [opt_after_jit_grad]: 0.00072272 [validate]: 5.132e-05 Sums bootstrap : 0.000703s : 2.58% type_inference : 0.006671s : 24.48% event_method : 0.000023s : 0.08% auto_monad : 0.000100s : 0.37% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.09% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000041s : 0.15% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000035s : 0.13% optimize.rewriter_before_opt_a : 0.000115s : 0.42% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000058s : 0.21% optimize.opt_a.loop_unroll : 0.000042s : 0.15% optimize.opt_a.a_1 : 0.013951s : 51.19% optimize.opt_a.with_stream_mark : 0.000045s : 0.17% optimize.opt_a.recompute_prepare : 0.000023s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000522s : 1.92% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.07% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.02% optimize.opt_a.shard_inline : 0.000024s : 0.09% optimize.opt_a.merge_send_recv : 0.000021s : 0.08% optimize.opt_a.auto_parallel : 0.000022s : 0.08% optimize.opt_a.parallel : 0.000031s : 0.11% optimize.opt_a.flash_sp : 0.000016s : 0.06% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.07% optimize.opt_a.virtual_dataset : 0.000016s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.06% optimize.opt_a.virtual_output : 0.000016s : 0.06% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.14% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000027s : 0.10% optimize.opt_a.a_after_grad : 0.000025s : 0.09% optimize.opt_a.renormalize : 0.001005s : 3.69% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.03% optimize.opt_a.auto_monad_grad : 0.000005s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.12% optimize.opt_a.cse : 0.000063s : 0.23% optimize.opt_a.a_3 : 0.000141s : 0.52% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000049s : 0.18% optimize.convert_after_rewriter : 0.000011s : 0.04% optimize.order_py_execute_after_rewriter : 0.000009s : 0.03% optimize.mutable_eliminate : 0.000789s : 2.90% optimize.opt_b.b_1 : 0.000225s : 0.82% optimize.opt_b.b_2 : 0.000010s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.08% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000033s : 0.12% optimize.loop_unroll : 0.000548s : 2.01% optimize.opt_after_cconv.c_1 : 0.000044s : 0.16% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000027s : 0.10% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000021s : 0.08% optimize.tuple_transform.d_1 : 0.000061s : 0.22% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.04% optimize.partial_unused_args_eliminate : 0.000004s : 0.02% optimize.add_recomputation : 0.000065s : 0.24% optimize.cse_after_recomputation.cse : 0.000017s : 0.06% optimize.environ_conv : 0.000010s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000009s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000005s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000020s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000008s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000008s : 0.03% optimize.overlap_grad_flash_sp : 0.000028s : 0.10% optimize.begin_end_overlap_inline : 0.000004s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000040s : 0.15% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000007s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000033s : 0.12% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.03% opt_after_jit_grad : 0.000723s : 2.65% validate : 0.000051s : 0.19% Time group info: ------[substitution.] 0.000388 39 8.72% : 0.000034s : 3: substitution.cast_eliminate 0.75% : 0.000003s : 3: substitution.elim_not_effective 0.44% : 0.000002s : 3: substitution.fold_const_symbol 1.94% : 0.000008s : 5: substitution.graph_param_transform 77.14% : 0.000299s : 4: substitution.inline 1.30% : 0.000005s : 6: substitution.j_node_and_user_rematch 1.82% : 0.000007s : 6: substitution.remove_not_recompute_node 1.57% : 0.000006s : 4: substitution.replace_old_param 4.16% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator 2.15% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006606 2 87.95% : 0.005810s : 1: type_inference.infer 12.05% : 0.000796s : 1: type_inference.specialize ------[replace.] 0.000237 8 88.13% : 0.000209s : 4: replace.inline 11.87% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000309 8 95.42% : 0.000295s : 4: match.inline 4.58% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000528 1504 0.56% : 0.000003s : 15: predicate.accumulaten_eliminater 0.54% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.28% : 0.000001s : 10: predicate.addn_check_dump 0.59% : 0.000003s : 15: predicate.addn_zero_filter 0.57% : 0.000003s : 15: predicate.adjust_all_reduce_mul_add 1.27% : 0.000007s : 25: predicate.arithmetic_simplify 0.56% : 0.000003s : 15: predicate.cast_eliminate 0.31% : 0.000002s : 10: predicate.check_bprop_eliminate 0.40% : 0.000002s : 10: predicate.compare_switch_simplify 0.09% : 0.000000s : 5: predicate.const_output_eliminate 0.38% : 0.000002s : 10: predicate.depend_value_elim 0.44% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.60% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.45% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.53% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.12% : 0.000001s : 5: predicate.elim_not_effective 0.31% : 0.000002s : 5: predicate.elim_shapecalc_of_broadcastargs 0.70% : 0.000004s : 20: predicate.environ_add_const_eliminate 0.56% : 0.000003s : 20: predicate.environ_get_add_eliminate 0.62% : 0.000003s : 20: predicate.environ_get_depend_swap 0.88% : 0.000005s : 30: predicate.environ_get_eliminate 0.53% : 0.000003s : 20: predicate.environ_get_set_eliminate 0.66% : 0.000003s : 23: predicate.exchange_switch_depend_value 1.52% : 0.000008s : 23: predicate.float_depend_g_call 0.29% : 0.000002s : 10: predicate.float_environ_get_switch 0.41% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.09% : 0.000000s : 5: predicate.fold_const_symbol 0.39% : 0.000002s : 10: predicate.get_grad_eliminate 0.12% : 0.000001s : 5: predicate.graph_param_transform 0.36% : 0.000002s : 10: predicate.incorporate_call 0.27% : 0.000001s : 10: predicate.incorporate_call_switch 3.26% : 0.000017s : 68: predicate.inline 0.36% : 0.000002s : 10: predicate.inline_without_move 0.17% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.50% : 0.000003s : 10: predicate.less_batch_normalization 1.14% : 0.000006s : 29: predicate.list_to_tuple_eliminator_ 1.43% : 0.000008s : 44: predicate.load_eliminater 0.53% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.11% : 0.000006s : 36: predicate.loop_unroll_before_grad 0.87% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 45.68% : 0.000241s : 10: predicate.merge_addn 0.28% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.31% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.40% : 0.000002s : 15: predicate.minmaximum_grad 0.47% : 0.000002s : 5: predicate.mutable_eliminate 0.18% : 0.000001s : 5: predicate.opt_reshape 0.20% : 0.000001s : 5: predicate.parallel_virtual_node 1.30% : 0.000007s : 23: predicate.partial_defer_inline 0.77% : 0.000004s : 24: predicate.partial_eliminate 0.53% : 0.000003s : 15: predicate.print_const_string_wrapper 0.35% : 0.000002s : 10: predicate.reduce_all_const_elim 0.82% : 0.000004s : 15: predicate.reduce_eliminate 1.36% : 0.000007s : 44: predicate.redundant_stop_gradient_eliminater 0.18% : 0.000001s : 10: predicate.remove_not_recompute_node 0.65% : 0.000003s : 29: predicate.replace_applicator 0.28% : 0.000001s : 10: predicate.replace_old_param 0.16% : 0.000001s : 5: predicate.reset_defer_inline 0.54% : 0.000003s : 15: predicate.reshape_eliminate 0.44% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.22% : 0.000001s : 5: predicate.row_tensor_eliminate 0.38% : 0.000002s : 10: predicate.same_eliminate 0.20% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.39% : 0.000002s : 10: predicate.shard_identity_eliminate 0.32% : 0.000002s : 10: predicate.special_op_eliminate 0.35% : 0.000002s : 10: predicate.specialize_transform 0.50% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.37% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.17% : 0.000001s : 5: predicate.switch_call_monad_eliminater 0.68% : 0.000004s : 23: predicate.switch_defer_inline 1.88% : 0.000010s : 33: predicate.switch_layer_defer_inline 2.39% : 0.000013s : 74: predicate.switch_simplify 0.46% : 0.000002s : 15: predicate.tile_eliminate 0.54% : 0.000003s : 15: predicate.transpose_eliminate 1.07% : 0.000006s : 25: predicate.tuple_list_convert_item_index_to_positive 0.75% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 0.78% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 1.90% : 0.000010s : 39: predicate.tuple_list_get_item_eliminator 0.82% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 1.22% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 0.82% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 1.17% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 1.54% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.26% : 0.000001s : 5: predicate.value_based_eliminate 0.36% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.36% : 0.000002s : 10: predicate.virtual_output_eliminate 0.14% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.24% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000633 11 51.89% : 0.000328s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.11% : 0.000305s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.073322 192 0.01% : 0.000007s : 1: ForceFp32Comm 5.05% : 0.003702s : 1: add_attr 5.02% : 0.003684s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.09% : 0.000070s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.15% : 0.000113s : 1: auto_monad 0.06% : 0.000041s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 1.02% : 0.000751s : 1: bootstrap 0.05% : 0.000037s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.03% : 0.000023s : 1: control_data_broadcast_order 0.02% : 0.000014s : 1: convert_after_rewriter 0.05% : 0.000038s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.05% : 0.000038s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.05% : 0.000034s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.02% : 0.000012s : 1: label_micro_interleaved_index 0.76% : 0.000555s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000009s : 1: micro_interleaved_order_control 1.09% : 0.000797s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.03% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000020s : 1: opt.transform.mutable_eliminate 20.13% : 0.014763s : 78: opt.transform.opt_a 0.06% : 0.000042s : 1: opt.transform.opt_after_cconv 0.05% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.21% : 0.000157s : 28: opt.transform.opt_b 0.09% : 0.000068s : 2: opt.transform.opt_trans_graph 0.07% : 0.000053s : 4: opt.transform.symbol_engine_opt 23.48% : 0.017217s : 1: opt_a 0.23% : 0.000166s : 1: opt_after_cconv 1.00% : 0.000735s : 1: opt_after_jit_grad 0.48% : 0.000352s : 1: opt_b 28.75% : 0.021079s : 1: optimize 0.04% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.04% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000011s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.07% : 0.000049s : 1: pre_auto_parallel 0.05% : 0.000038s : 1: py_interpret_to_execute 0.03% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.03% : 0.000025s : 1: remove_dup_value 0.79% : 0.000577s : 1: renormalize.infer 0.57% : 0.000417s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000053s : 1: rewriter_after_opt_a 0.16% : 0.000120s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.20% : 0.000147s : 1: symbol_engine_optimizer 0.16% : 0.000119s : 1: tuple_transform 9.16% : 0.006718s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:26.358.642 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0285345, [21] [bootstrap]: 0.00045658 [type_inference]: 0.0183487 [event_method]: 2.001e-05 [auto_monad]: 6.507e-05 [graph_reusing]: 6.04999e-06 [inline]: 2.04e-06 [add_attr]: 0.00320145, [1] [add_attr_with_inline]: 0.00319258, [1] [Cycle 1]: 5.848e-05, [2] [tag_attr]: 1.896e-05 [meta_addattr_fg_expand]: 5.98998e-06 [parallel-infer-symbol]: 3.41999e-06 [pre_auto_parallel]: 3.226e-05 [insert-virtual-dataset]: 3.04999e-06 [parallel-infer-symbol-second]: 8.49977e-07 [dataset_repeat_opt]: 2.27001e-06 [pipeline_split]: 2.06998e-06 [optimize]: 0.00565941, [53] [py_interpret_to_execute]: 2.407e-05 [rewriter_before_opt_a]: 8.244e-05 [opt_a]: 0.00329452, [2] [Cycle 1]: 0.00248976, [45] [expand_dump_flag]: 2.81e-06 [switch_simplify]: 4.337e-05 [loop_unroll]: 3.124e-05 [a_1]: 0.00066911 [with_stream_mark]: 1.526e-05 [recompute_prepare]: 1.009e-05 [updatestate_depend_eliminate]: 4.50001e-06 [updatestate_assign_eliminate]: 4.33001e-06 [updatestate_loads_eliminate]: 3.85e-06 [parameter_eliminate]: 1.88002e-06 [a_2]: 0.00010204 [accelerated_algorithm]: 8.3e-06 [shard]: 1.81e-06 [meta_shard_fg_expand]: 2.18002e-06 [shard_inline]: 7.87e-06 [merge_send_recv]: 9.47999e-06 [auto_parallel]: 6.76e-06 [parallel]: 1.834e-05 [flash_sp]: 7.41001e-06 [merge_comm]: 4.60999e-06 [allreduce_fusion]: 4.28001e-06 [matmul_add_comm_reduction]: 1.611e-05 [allreduce_slice_to_reducescatter]: 8.00006e-07 [virtual_shard_identity]: 9.30001e-06 [virtual_dataset]: 7.97e-06 [get_grad_eliminate_]: 7.51001e-06 [virtual_output]: 7.70998e-06 [merge_forward]: 4.37e-06 [cell_reuse_recompute_pass]: 1.25001e-06 [offload_activation]: 1.134e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.568e-05 [merge_recompute_call_nodes]: 1.50001e-06 [before_grad]: 1.276e-05 [set_forward_comm_id_for_comm_node_pass]: 4.92e-06 [meta_fg_expand]: 3.38e-06 [flash_sp_send_recv_attached]: 3.54002e-06 [receive_attached]: 2.71999e-06 [after_resolve]: 1.238e-05 [a_after_grad]: 1.203e-05 [renormalize]: 0.0010145 [add_forward_monad_depend]: 5.61e-06 [auto_monad_grad]: 2.22001e-06 [auto_monad_eliminator]: 1.782e-05 [cse]: 3.56e-05 [a_3]: 7.474e-05 [Cycle 2]: 0.0007932, [45] [expand_dump_flag]: 1.64e-06 [switch_simplify]: 9.12001e-06 [loop_unroll]: 7.72002e-06 [a_1]: 0.00017869 [with_stream_mark]: 1.463e-05 [recompute_prepare]: 7.88999e-06 [updatestate_depend_eliminate]: 3.95e-06 [updatestate_assign_eliminate]: 3.43e-06 [updatestate_loads_eliminate]: 2.94999e-06 [parameter_eliminate]: 1.30999e-06 [a_2]: 9.445e-05 [accelerated_algorithm]: 7.76001e-06 [shard]: 1.67999e-06 [meta_shard_fg_expand]: 2.27999e-06 [shard_inline]: 1.126e-05 [merge_send_recv]: 7.02002e-06 [auto_parallel]: 6.86999e-06 [parallel]: 6.84999e-06 [flash_sp]: 3.86999e-06 [merge_comm]: 4.02e-06 [allreduce_fusion]: 4.05e-06 [matmul_add_comm_reduction]: 8.18001e-06 [allreduce_slice_to_reducescatter]: 6.79982e-07 [virtual_shard_identity]: 8.65999e-06 [virtual_dataset]: 7.65e-06 [get_grad_eliminate_]: 7.21999e-06 [virtual_output]: 7.08e-06 [merge_forward]: 3.84002e-06 [cell_reuse_recompute_pass]: 1.85001e-06 [offload_activation]: 8.82e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.538e-05 [merge_recompute_call_nodes]: 9.39996e-07 [before_grad]: 1.276e-05 [set_forward_comm_id_for_comm_node_pass]: 4.63999e-06 [meta_fg_expand]: 3.79002e-06 [flash_sp_send_recv_attached]: 1.00999e-06 [receive_attached]: 1.51002e-06 [after_resolve]: 1.375e-05 [a_after_grad]: 1.123e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.49998e-06 [auto_monad_grad]: 1.45999e-06 [auto_monad_eliminator]: 8.91002e-06 [cse]: 2.036e-05 [a_3]: 4.517e-05 [py_interpret_to_execute_after_opt_a]: 1.525e-05 [slice_cell_reuse_recomputed_activation]: 1.91e-06 [rewriter_after_opt_a]: 4.263e-05 [convert_after_rewriter]: 8.32e-06 [order_py_execute_after_rewriter]: 5.87999e-06 [mutable_eliminate]: 0.00064857 [opt_b]: 0.00027257, [1] [Cycle 1]: 0.00026529, [7] [b_1]: 0.00017256 [b_2]: 1.012e-05 [updatestate_depend_eliminate]: 8.69003e-06 [updatestate_assign_eliminate]: 3.06001e-06 [updatestate_loads_eliminate]: 3.18e-06 [renormalize]: 6.59988e-07 [cse]: 2.934e-05 [optimize_parallel_all_gather_comm]: 1.955e-05 [overlap_param_gather]: 1.87999e-06 [cconv]: 3.151e-05 [loop_unroll]: 0.00047826 [opt_after_cconv]: 0.0001213, [1] [Cycle 1]: 0.00011554, [7] [c_1]: 3.784e-05 [parameter_eliminate]: 4.44998e-06 [updatestate_depend_eliminate]: 6.71e-06 [updatestate_assign_eliminate]: 3.22002e-06 [updatestate_loads_eliminate]: 2.88e-06 [cse]: 2.515e-05 [renormalize]: 5.8001e-07 [remove_dup_value]: 1.561e-05 [tuple_transform]: 8.639e-05, [1] [Cycle 1]: 8.193e-05, [4] [d_1]: 5.353e-05 [none_parameter_eliminate]: 1.82999e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 7.94002e-06 [partial_unused_args_eliminate]: 2.46998e-06 [add_recomputation]: 5.998e-05 [cse_after_recomputation]: 2.628e-05, [1] [Cycle 1]: 2.141e-05, [1] [cse]: 1.516e-05 [environ_conv]: 6.96999e-06 [swap_dp_allreduce_reducescatter]: 5.59998e-06 [bias_add_comm_swap]: 2.84999e-06 [label_micro_interleaved_index]: 4.89998e-06 [label_fine_grained_interleaved_index]: 2.82002e-06 [merge_cast_opt]: 1.39e-06 [slice_recompute_activation]: 2.93e-06 [micro_interleaved_order_control]: 2.49999e-06 [assign_add_opt]: 1.49e-06 [ForceFp32Comm]: 8.59989e-07 [remove_cast_before_assign_add]: 1.15999e-06 [full_micro_interleaved_order_control]: 2.43998e-06 [reorder_send_recv_between_fp_bp]: 2.79999e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.32e-06 [overlap_opt_shard_in_pipeline]: 1.14003e-06 [overlap_opt_shard_grad_in_pipeline]: 1.82001e-06 [control_data_broadcast_order]: 1.523e-05 [grouped_pairwise_exchange_alltoall]: 1.79e-06 [offloading_packed_experts]: 4.46002e-06 [overlap_recompute_and_grad_model_parallel]: 5.69e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.57001e-06 [overlap_recompute_comm]: 2.27001e-06 [overlap_grad_ring_attention]: 4.53001e-06 [overlap_grad_flash_sp]: 2.287e-05 [begin_end_overlap_inline]: 4.70027e-07 [split_matmul_comm_elemetwise]: 2.11e-06 [split_layernorm_comm]: 2.09e-06 [handle_group_info]: 1.10001e-06 [symbol_engine_optimizer]: 8.467e-05, [1] [Cycle 1]: 8.01e-05, [6] [build]: 3.48e-06 [elim_shapecalc]: 1.14e-05 [elim_not_effective]: 1.516e-05 [opt_reshape]: 8.74998e-06 [fold_const_symbol]: 1.275e-05 [renormalize]: 3.00002e-07 [detach_backward]: 2.02001e-06 [pipeline_parallel_scheduler]: 1.62001e-06 [auto_monad_reorder]: 2.051e-05 [get_jit_bprop_graph]: 2.22001e-06 [rewriter_after_jit_bprop_graph]: 4.67e-06 [opt_after_jit_grad]: 0.00050645 [validate]: 4.779e-05 Sums bootstrap : 0.000457s : 1.87% type_inference : 0.018349s : 75.31% event_method : 0.000020s : 0.08% auto_monad : 0.000065s : 0.27% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000032s : 0.13% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000024s : 0.10% optimize.rewriter_before_opt_a : 0.000082s : 0.34% optimize.opt_a.expand_dump_flag : 0.000004s : 0.02% optimize.opt_a.switch_simplify : 0.000052s : 0.22% optimize.opt_a.loop_unroll : 0.000039s : 0.16% optimize.opt_a.a_1 : 0.000848s : 3.48% optimize.opt_a.with_stream_mark : 0.000030s : 0.12% optimize.opt_a.recompute_prepare : 0.000018s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000196s : 0.81% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.07% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000019s : 0.08% optimize.opt_a.merge_send_recv : 0.000017s : 0.07% optimize.opt_a.auto_parallel : 0.000014s : 0.06% optimize.opt_a.parallel : 0.000025s : 0.10% optimize.opt_a.flash_sp : 0.000011s : 0.05% optimize.opt_a.merge_comm : 0.000009s : 0.04% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.10% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.07% optimize.opt_a.virtual_dataset : 0.000016s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.06% optimize.opt_a.virtual_output : 0.000015s : 0.06% optimize.opt_a.merge_forward : 0.000008s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000020s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.13% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.04% optimize.opt_a.meta_fg_expand : 0.000007s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000026s : 0.11% optimize.opt_a.a_after_grad : 0.000023s : 0.10% optimize.opt_a.renormalize : 0.001015s : 4.16% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.11% optimize.opt_a.cse : 0.000056s : 0.23% optimize.opt_a.a_3 : 0.000120s : 0.49% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000043s : 0.17% optimize.convert_after_rewriter : 0.000008s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000649s : 2.66% optimize.opt_b.b_1 : 0.000173s : 0.71% optimize.opt_b.b_2 : 0.000010s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000029s : 0.12% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.08% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000032s : 0.13% optimize.loop_unroll : 0.000478s : 1.96% optimize.opt_after_cconv.c_1 : 0.000038s : 0.16% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000025s : 0.10% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.06% optimize.tuple_transform.d_1 : 0.000054s : 0.22% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000060s : 0.25% optimize.cse_after_recomputation.cse : 0.000015s : 0.06% optimize.environ_conv : 0.000007s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000003s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000015s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000023s : 0.09% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.08% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000506s : 2.08% validate : 0.000048s : 0.20% Time group info: ------[substitution.] 0.000213 39 11.30% : 0.000024s : 3: substitution.cast_eliminate 1.04% : 0.000002s : 3: substitution.elim_not_effective 0.78% : 0.000002s : 3: substitution.fold_const_symbol 3.32% : 0.000007s : 5: substitution.graph_param_transform 66.09% : 0.000141s : 4: substitution.inline 2.34% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.12% : 0.000007s : 6: substitution.remove_not_recompute_node 2.62% : 0.000006s : 4: substitution.replace_old_param 5.73% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator 3.66% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.018281 2 96.01% : 0.017551s : 1: type_inference.infer 3.99% : 0.000730s : 1: type_inference.specialize ------[replace.] 0.000061 8 62.50% : 0.000038s : 4: replace.inline 37.50% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000149 8 92.99% : 0.000138s : 4: match.inline 7.01% : 0.000010s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000240 1504 0.90% : 0.000002s : 15: predicate.accumulaten_eliminater 0.78% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.57% : 0.000001s : 10: predicate.addn_check_dump 0.86% : 0.000002s : 15: predicate.addn_zero_filter 0.80% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.98% : 0.000005s : 25: predicate.arithmetic_simplify 1.14% : 0.000003s : 15: predicate.cast_eliminate 0.60% : 0.000001s : 10: predicate.check_bprop_eliminate 0.57% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.63% : 0.000002s : 10: predicate.depend_value_elim 0.91% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.05% : 0.000003s : 15: predicate.dict_get_item_eliminator 1.00% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.92% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.28% : 0.000001s : 5: predicate.elim_not_effective 0.35% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.05% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_depend_swap 1.76% : 0.000004s : 30: predicate.environ_get_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.32% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.37% : 0.000006s : 23: predicate.float_depend_g_call 0.56% : 0.000001s : 10: predicate.float_environ_get_switch 0.98% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.72% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.67% : 0.000002s : 10: predicate.incorporate_call 0.57% : 0.000001s : 10: predicate.incorporate_call_switch 6.72% : 0.000016s : 68: predicate.inline 0.82% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.73% : 0.000002s : 10: predicate.less_batch_normalization 1.90% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.48% : 0.000006s : 44: predicate.load_eliminater 1.00% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.09% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.67% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.64% : 0.000002s : 10: predicate.merge_addn 0.62% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 15: predicate.minmaximum_grad 1.25% : 0.000003s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.40% : 0.000001s : 5: predicate.parallel_virtual_node 1.80% : 0.000004s : 23: predicate.partial_defer_inline 1.59% : 0.000004s : 24: predicate.partial_eliminate 1.01% : 0.000002s : 15: predicate.print_const_string_wrapper 0.60% : 0.000001s : 10: predicate.reduce_all_const_elim 1.26% : 0.000003s : 15: predicate.reduce_eliminate 2.43% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 10: predicate.remove_not_recompute_node 1.37% : 0.000003s : 29: predicate.replace_applicator 0.45% : 0.000001s : 10: predicate.replace_old_param 0.42% : 0.000001s : 5: predicate.reset_defer_inline 0.88% : 0.000002s : 15: predicate.reshape_eliminate 0.65% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 5: predicate.row_tensor_eliminate 0.92% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 10: predicate.shard_identity_eliminate 0.67% : 0.000002s : 10: predicate.special_op_eliminate 0.79% : 0.000002s : 10: predicate.specialize_transform 1.06% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.73% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.43% : 0.000003s : 23: predicate.switch_defer_inline 1.95% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.80% : 0.000012s : 74: predicate.switch_simplify 0.87% : 0.000002s : 15: predicate.tile_eliminate 0.91% : 0.000002s : 15: predicate.transpose_eliminate 1.51% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.17% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.45% : 0.000003s : 25: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.73% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.37% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.12% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.51% : 0.000001s : 5: predicate.value_based_eliminate 0.65% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 10: predicate.virtual_output_eliminate 0.35% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000579 11 56.62% : 0.000328s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.38% : 0.000251s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.040016 192 0.01% : 0.000004s : 1: ForceFp32Comm 8.01% : 0.003206s : 1: add_attr 7.99% : 0.003196s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.16% : 0.000064s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.18% : 0.000071s : 1: auto_monad 0.06% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.20% : 0.000480s : 1: bootstrap 0.09% : 0.000035s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.05% : 0.000018s : 1: control_data_broadcast_order 0.03% : 0.000011s : 1: convert_after_rewriter 0.07% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.03% : 0.000010s : 1: environ_conv 0.07% : 0.000026s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 1.22% : 0.000488s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.65% : 0.000659s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.04% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000023s : 1: opt.transform.mutable_eliminate 3.46% : 0.001384s : 78: opt.transform.opt_a 0.09% : 0.000037s : 1: opt.transform.opt_after_cconv 0.08% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.37% : 0.000149s : 28: opt.transform.opt_b 0.15% : 0.000059s : 2: opt.transform.opt_trans_graph 0.11% : 0.000044s : 4: opt.transform.symbol_engine_opt 8.24% : 0.003298s : 1: opt_a 0.31% : 0.000125s : 1: opt_after_cconv 1.29% : 0.000517s : 1: opt_after_jit_grad 0.69% : 0.000276s : 1: opt_b 14.16% : 0.005665s : 1: optimize 0.06% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.07% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.09% : 0.000037s : 1: pre_auto_parallel 0.07% : 0.000028s : 1: py_interpret_to_execute 0.05% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000019s : 1: remove_dup_value 1.72% : 0.000689s : 1: renormalize.infer 0.79% : 0.000315s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.12% : 0.000047s : 1: rewriter_after_opt_a 0.22% : 0.000086s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.22% : 0.000087s : 1: symbol_engine_optimizer 0.22% : 0.000089s : 1: tuple_transform 45.90% : 0.018367s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:26.768.121 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:26.768.404 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0168391, [21] [bootstrap]: 0.00042347 [type_inference]: 0.00591043 [event_method]: 1.76e-05 [auto_monad]: 6.097e-05 [graph_reusing]: 5.86e-06 [inline]: 1.83002e-06 [add_attr]: 0.00318067, [1] [add_attr_with_inline]: 0.00317266, [1] [Cycle 1]: 6.786e-05, [2] [tag_attr]: 1.765e-05 [meta_addattr_fg_expand]: 5.87001e-06 [parallel-infer-symbol]: 3.08e-06 [pre_auto_parallel]: 3.212e-05 [insert-virtual-dataset]: 2.71e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 2.15002e-06 [pipeline_split]: 1.74998e-06 [optimize]: 0.00579621, [53] [py_interpret_to_execute]: 2.716e-05 [rewriter_before_opt_a]: 8.274e-05 [opt_a]: 0.00309782, [2] [Cycle 1]: 0.00226457, [45] [expand_dump_flag]: 2.69001e-06 [switch_simplify]: 3.799e-05 [loop_unroll]: 2.971e-05 [a_1]: 0.00059145 [with_stream_mark]: 1.619e-05 [recompute_prepare]: 8.87e-06 [updatestate_depend_eliminate]: 4.55001e-06 [updatestate_assign_eliminate]: 3.70998e-06 [updatestate_loads_eliminate]: 3.45998e-06 [parameter_eliminate]: 2.29999e-06 [a_2]: 0.00011258 [accelerated_algorithm]: 6.93e-06 [shard]: 1.85001e-06 [meta_shard_fg_expand]: 2.15002e-06 [shard_inline]: 6.93e-06 [merge_send_recv]: 8.63001e-06 [auto_parallel]: 6.69001e-06 [parallel]: 1.876e-05 [flash_sp]: 8.11002e-06 [merge_comm]: 3.96001e-06 [allreduce_fusion]: 3.41001e-06 [matmul_add_comm_reduction]: 1.033e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 7.66001e-06 [virtual_dataset]: 6.92002e-06 [get_grad_eliminate_]: 6.83e-06 [virtual_output]: 6.76999e-06 [merge_forward]: 4.03001e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 9.76998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.515e-05 [merge_recompute_call_nodes]: 1.73002e-06 [before_grad]: 1.099e-05 [set_forward_comm_id_for_comm_node_pass]: 4.4e-06 [meta_fg_expand]: 2.98e-06 [flash_sp_send_recv_attached]: 2.70002e-06 [receive_attached]: 2.26998e-06 [after_resolve]: 1.181e-05 [a_after_grad]: 1.022e-05 [renormalize]: 0.00076977 [add_forward_monad_depend]: 5.62001e-06 [auto_monad_grad]: 2.86999e-06 [auto_monad_eliminator]: 1.566e-05 [cse]: 2.83e-05 [a_3]: 6.066e-05 [Cycle 2]: 0.00081952, [45] [expand_dump_flag]: 1.11002e-06 [switch_simplify]: 7.77002e-06 [loop_unroll]: 6.31e-06 [a_1]: 0.00013073 [with_stream_mark]: 1.241e-05 [recompute_prepare]: 6.33e-06 [updatestate_depend_eliminate]: 2.98e-06 [updatestate_assign_eliminate]: 2.68998e-06 [updatestate_loads_eliminate]: 2.17001e-06 [parameter_eliminate]: 1.34e-06 [a_2]: 9.975e-05 [accelerated_algorithm]: 6.41e-06 [shard]: 1.14e-06 [meta_shard_fg_expand]: 1.47001e-06 [shard_inline]: 6.41e-06 [merge_send_recv]: 5.18002e-06 [auto_parallel]: 5.35001e-06 [parallel]: 9.06002e-06 [flash_sp]: 3.46999e-06 [merge_comm]: 3.31001e-06 [allreduce_fusion]: 3.13e-06 [matmul_add_comm_reduction]: 6.39001e-06 [allreduce_slice_to_reducescatter]: 5.10016e-07 [virtual_shard_identity]: 6.89999e-06 [virtual_dataset]: 6.21e-06 [get_grad_eliminate_]: 5.74e-06 [virtual_output]: 7.67998e-06 [merge_forward]: 2.95002e-06 [cell_reuse_recompute_pass]: 1.59e-06 [offload_activation]: 6.46999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.542e-05 [merge_recompute_call_nodes]: 7.7e-07 [before_grad]: 9.62001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.31001e-06 [meta_fg_expand]: 2.01e-06 [flash_sp_send_recv_attached]: 9.00007e-07 [receive_attached]: 1.24998e-06 [after_resolve]: 1.055e-05 [a_after_grad]: 9.24e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.59998e-06 [auto_monad_grad]: 8.79983e-07 [auto_monad_eliminator]: 7.2e-06 [cse]: 1.305e-05 [a_3]: 4.966e-05 [py_interpret_to_execute_after_opt_a]: 1.287e-05 [slice_cell_reuse_recomputed_activation]: 4.68001e-06 [rewriter_after_opt_a]: 3.815e-05 [convert_after_rewriter]: 9.72999e-06 [order_py_execute_after_rewriter]: 8.08001e-06 [mutable_eliminate]: 0.00055307 [opt_b]: 0.00028728, [1] [Cycle 1]: 0.00027811, [7] [b_1]: 0.00018487 [b_2]: 8.17e-06 [updatestate_depend_eliminate]: 5.82001e-06 [updatestate_assign_eliminate]: 2.68e-06 [updatestate_loads_eliminate]: 2.49999e-06 [renormalize]: 3.80009e-07 [cse]: 1.625e-05 [optimize_parallel_all_gather_comm]: 1.977e-05 [overlap_param_gather]: 4.79e-06 [cconv]: 2.899e-05 [loop_unroll]: 0.00077929 [opt_after_cconv]: 0.00014406, [1] [Cycle 1]: 0.00013367, [7] [c_1]: 3.832e-05 [parameter_eliminate]: 3.86999e-06 [updatestate_depend_eliminate]: 6.63003e-06 [updatestate_assign_eliminate]: 2.68e-06 [updatestate_loads_eliminate]: 2.22999e-06 [cse]: 1.943e-05 [renormalize]: 8.70001e-07 [remove_dup_value]: 1.782e-05 [tuple_transform]: 9.275e-05, [1] [Cycle 1]: 8.55e-05, [4] [d_1]: 4.643e-05 [none_parameter_eliminate]: 1.60999e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 7.14001e-06 [partial_unused_args_eliminate]: 4.32e-06 [add_recomputation]: 5.415e-05 [cse_after_recomputation]: 2.63e-05, [1] [Cycle 1]: 1.922e-05, [1] [cse]: 1.044e-05 [environ_conv]: 8.97e-06 [swap_dp_allreduce_reducescatter]: 7.83001e-06 [bias_add_comm_swap]: 5.34998e-06 [label_micro_interleaved_index]: 8.05999e-06 [label_fine_grained_interleaved_index]: 5.12e-06 [merge_cast_opt]: 3.8e-06 [slice_recompute_activation]: 4.53001e-06 [micro_interleaved_order_control]: 5.04e-06 [assign_add_opt]: 3.81001e-06 [ForceFp32Comm]: 3.38e-06 [remove_cast_before_assign_add]: 3.55998e-06 [full_micro_interleaved_order_control]: 4.85999e-06 [reorder_send_recv_between_fp_bp]: 5.17999e-06 [comm_op_add_attrs]: 3.56999e-06 [add_comm_op_reuse_tag]: 3.31999e-06 [interleave_split_concat_branches]: 3.52002e-06 [interleave_parallel_branches]: 3.59002e-06 [overlap_opt_shard_in_pipeline]: 3.65e-06 [overlap_opt_shard_grad_in_pipeline]: 4.70001e-06 [control_data_broadcast_order]: 1.721e-05 [grouped_pairwise_exchange_alltoall]: 4.45999e-06 [offloading_packed_experts]: 6.16e-06 [overlap_recompute_and_grad_model_parallel]: 7.15e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.61001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.75998e-06 [overlap_recompute_comm]: 4.88001e-06 [overlap_grad_ring_attention]: 6.88e-06 [overlap_grad_flash_sp]: 2.501e-05 [begin_end_overlap_inline]: 2.98998e-06 [split_matmul_comm_elemetwise]: 5.15999e-06 [split_layernorm_comm]: 4.35e-06 [handle_group_info]: 3.23e-06 [symbol_engine_optimizer]: 9.866e-05, [1] [Cycle 1]: 9.096e-05, [6] [build]: 3.45e-06 [elim_shapecalc]: 1.049e-05 [elim_not_effective]: 1.347e-05 [opt_reshape]: 7.13e-06 [fold_const_symbol]: 1.059e-05 [renormalize]: 2.09984e-07 [detach_backward]: 4.17e-06 [pipeline_parallel_scheduler]: 2.07999e-06 [auto_monad_reorder]: 2.201e-05 [get_jit_bprop_graph]: 1.93002e-06 [rewriter_after_jit_bprop_graph]: 6.93e-06 [opt_after_jit_grad]: 0.0006975 [validate]: 4.935e-05 Sums bootstrap : 0.000423s : 3.57% type_inference : 0.005910s : 49.76% event_method : 0.000018s : 0.15% auto_monad : 0.000061s : 0.51% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000032s : 0.27% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000027s : 0.23% optimize.rewriter_before_opt_a : 0.000083s : 0.70% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000046s : 0.39% optimize.opt_a.loop_unroll : 0.000036s : 0.30% optimize.opt_a.a_1 : 0.000722s : 6.08% optimize.opt_a.with_stream_mark : 0.000029s : 0.24% optimize.opt_a.recompute_prepare : 0.000015s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000212s : 1.79% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.11% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.11% optimize.opt_a.merge_send_recv : 0.000014s : 0.12% optimize.opt_a.auto_parallel : 0.000012s : 0.10% optimize.opt_a.parallel : 0.000028s : 0.23% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000007s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.12% optimize.opt_a.virtual_dataset : 0.000013s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.11% optimize.opt_a.virtual_output : 0.000014s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000016s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.17% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000022s : 0.19% optimize.opt_a.a_after_grad : 0.000019s : 0.16% optimize.opt_a.renormalize : 0.000770s : 6.48% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.19% optimize.opt_a.cse : 0.000041s : 0.35% optimize.opt_a.a_3 : 0.000110s : 0.93% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000038s : 0.32% optimize.convert_after_rewriter : 0.000010s : 0.08% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000553s : 4.66% optimize.opt_b.b_1 : 0.000185s : 1.56% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000016s : 0.14% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000029s : 0.24% optimize.loop_unroll : 0.000779s : 6.56% optimize.opt_after_cconv.c_1 : 0.000038s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.16% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.15% optimize.tuple_transform.d_1 : 0.000046s : 0.39% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000054s : 0.46% optimize.cse_after_recomputation.cse : 0.000010s : 0.09% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000006s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000025s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000022s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000007s : 0.06% opt_after_jit_grad : 0.000697s : 5.87% validate : 0.000049s : 0.42% Time group info: ------[substitution.] 0.000153 29 1.50% : 0.000002s : 2: substitution.elim_not_effective 0.97% : 0.000001s : 2: substitution.fold_const_symbol 3.58% : 0.000005s : 4: substitution.graph_param_transform 72.63% : 0.000111s : 4: substitution.inline 2.32% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.36% : 0.000005s : 4: substitution.remove_not_recompute_node 3.45% : 0.000005s : 4: substitution.replace_old_param 7.53% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator 4.66% : 0.000007s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005860 2 87.95% : 0.005154s : 1: type_inference.infer 12.05% : 0.000706s : 1: type_inference.specialize ------[replace.] 0.000056 8 60.12% : 0.000033s : 4: replace.inline 39.88% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000118 8 91.90% : 0.000108s : 4: match.inline 8.10% : 0.000010s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000221 1278 0.87% : 0.000002s : 13: predicate.accumulaten_eliminater 1.19% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 8: predicate.addn_check_dump 1.05% : 0.000002s : 13: predicate.addn_zero_filter 0.73% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.87% : 0.000004s : 21: predicate.arithmetic_simplify 0.83% : 0.000002s : 13: predicate.cast_eliminate 0.54% : 0.000001s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.58% : 0.000001s : 8: predicate.depend_value_elim 0.85% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.77% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.57% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.47% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.48% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.00% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_depend_swap 1.56% : 0.000003s : 25: predicate.environ_get_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.33% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.14% : 0.000005s : 21: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.73% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.70% : 0.000002s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.54% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.12% : 0.000013s : 58: predicate.inline 0.66% : 0.000001s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.72% : 0.000002s : 8: predicate.less_batch_normalization 1.91% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.31% : 0.000005s : 38: predicate.load_eliminater 0.80% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.09% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.48% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 8: predicate.merge_addn 0.57% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.74% : 0.000002s : 13: predicate.minmaximum_grad 0.84% : 0.000002s : 4: predicate.mutable_eliminate 0.39% : 0.000001s : 4: predicate.opt_reshape 0.39% : 0.000001s : 4: predicate.parallel_virtual_node 1.56% : 0.000003s : 21: predicate.partial_defer_inline 7.79% : 0.000017s : 21: predicate.partial_eliminate 0.80% : 0.000002s : 13: predicate.print_const_string_wrapper 0.53% : 0.000001s : 8: predicate.reduce_all_const_elim 1.42% : 0.000003s : 13: predicate.reduce_eliminate 2.28% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 8: predicate.remove_not_recompute_node 1.31% : 0.000003s : 25: predicate.replace_applicator 0.40% : 0.000001s : 8: predicate.replace_old_param 0.26% : 0.000001s : 4: predicate.reset_defer_inline 0.83% : 0.000002s : 13: predicate.reshape_eliminate 0.58% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 4: predicate.row_tensor_eliminate 0.71% : 0.000002s : 8: predicate.same_eliminate 0.39% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.71% : 0.000002s : 8: predicate.shard_identity_eliminate 0.70% : 0.000002s : 8: predicate.special_op_eliminate 0.67% : 0.000001s : 8: predicate.specialize_transform 0.75% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.38% : 0.000003s : 21: predicate.switch_defer_inline 1.87% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.62% : 0.000010s : 67: predicate.switch_simplify 0.82% : 0.000002s : 13: predicate.tile_eliminate 0.82% : 0.000002s : 13: predicate.transpose_eliminate 1.52% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.42% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.81% : 0.000006s : 33: predicate.tuple_list_get_item_eliminator 1.40% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.19% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.62% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.20% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.26% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 4: predicate.value_based_eliminate 0.66% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.58% : 0.000001s : 8: predicate.virtual_output_eliminate 0.22% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000525 11 53.56% : 0.000281s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.44% : 0.000244s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027892 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.44% : 0.003190s : 1: add_attr 11.39% : 0.003176s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000058s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000069s : 1: auto_monad 0.11% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.68% : 0.000468s : 1: bootstrap 0.12% : 0.000032s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.11% : 0.000029s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.09% : 0.000024s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.10% : 0.000028s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000007s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 2.83% : 0.000789s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.01% : 0.000560s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.05% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000015s : 1: opt.transform.mutable_eliminate 4.11% : 0.001146s : 78: opt.transform.opt_a 0.13% : 0.000036s : 1: opt.transform.opt_after_cconv 0.12% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.42% : 0.000118s : 28: opt.transform.opt_b 0.18% : 0.000051s : 2: opt.transform.opt_trans_graph 0.14% : 0.000038s : 4: opt.transform.symbol_engine_opt 11.12% : 0.003101s : 1: opt_a 0.53% : 0.000148s : 1: opt_after_cconv 2.55% : 0.000712s : 1: opt_after_jit_grad 1.04% : 0.000291s : 1: opt_b 22.07% : 0.006156s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000008s : 1: pipeline_split 0.14% : 0.000040s : 1: pre_auto_parallel 0.11% : 0.000031s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000021s : 1: remove_dup_value 1.62% : 0.000451s : 1: renormalize.infer 1.11% : 0.000310s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.05% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000041s : 1: rewriter_after_opt_a 0.31% : 0.000086s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000008s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000102s : 1: symbol_engine_optimizer 0.34% : 0.000096s : 1: tuple_transform 21.32% : 0.005947s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:27.170.472 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0186591, [21] [bootstrap]: 0.00050326 [type_inference]: 0.00656147 [event_method]: 1.883e-05 [auto_monad]: 6.402e-05 [graph_reusing]: 6.33e-06 [inline]: 2.25002e-06 [add_attr]: 0.00321238, [1] [add_attr_with_inline]: 0.00320212, [1] [Cycle 1]: 6.02e-05, [2] [tag_attr]: 1.95e-05 [meta_addattr_fg_expand]: 6.09001e-06 [parallel-infer-symbol]: 3.04999e-06 [pre_auto_parallel]: 3.427e-05 [insert-virtual-dataset]: 2.36e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.26e-06 [pipeline_split]: 1.79998e-06 [optimize]: 0.00752561, [53] [py_interpret_to_execute]: 2.477e-05 [rewriter_before_opt_a]: 8.161e-05 [opt_a]: 0.00528374, [2] [Cycle 1]: 0.00456478, [45] [expand_dump_flag]: 3.03e-06 [switch_simplify]: 4.217e-05 [loop_unroll]: 3.008e-05 [a_1]: 0.0006138 [with_stream_mark]: 1.774e-05 [recompute_prepare]: 1.022e-05 [updatestate_depend_eliminate]: 3.75998e-06 [updatestate_assign_eliminate]: 3.16001e-06 [updatestate_loads_eliminate]: 2.94001e-06 [parameter_eliminate]: 1.82001e-06 [a_2]: 8.18e-05 [accelerated_algorithm]: 6.86001e-06 [shard]: 1.89e-06 [meta_shard_fg_expand]: 1.99999e-06 [shard_inline]: 6.48e-06 [merge_send_recv]: 8.08001e-06 [auto_parallel]: 6.46999e-06 [parallel]: 1.801e-05 [flash_sp]: 8.17e-06 [merge_comm]: 3.81999e-06 [allreduce_fusion]: 3.84002e-06 [matmul_add_comm_reduction]: 1.028e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 8.38001e-06 [virtual_dataset]: 6.64001e-06 [get_grad_eliminate_]: 6.17001e-06 [virtual_output]: 6.38998e-06 [merge_forward]: 4.03001e-06 [cell_reuse_recompute_pass]: 1.05001e-06 [offload_activation]: 1.1e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.284e-05 [merge_recompute_call_nodes]: 1.71002e-06 [before_grad]: 1.036e-05 [set_forward_comm_id_for_comm_node_pass]: 3.80998e-06 [meta_fg_expand]: 2.88998e-06 [flash_sp_send_recv_attached]: 2.79001e-06 [receive_attached]: 2.34001e-06 [after_resolve]: 1.192e-05 [a_after_grad]: 9.89001e-06 [renormalize]: 0.00318739 [add_forward_monad_depend]: 7.48999e-06 [auto_monad_grad]: 3.02002e-06 [auto_monad_eliminator]: 1.855e-05 [cse]: 3.131e-05 [a_3]: 5.496e-05 [Cycle 2]: 0.00070714, [45] [expand_dump_flag]: 1.81e-06 [switch_simplify]: 8.89e-06 [loop_unroll]: 6.62002e-06 [a_1]: 0.00014547 [with_stream_mark]: 1.51e-05 [recompute_prepare]: 6.81999e-06 [updatestate_depend_eliminate]: 3.18e-06 [updatestate_assign_eliminate]: 3.05002e-06 [updatestate_loads_eliminate]: 2.80002e-06 [parameter_eliminate]: 1.97999e-06 [a_2]: 7.311e-05 [accelerated_algorithm]: 7.16001e-06 [shard]: 2.29999e-06 [meta_shard_fg_expand]: 1.74e-06 [shard_inline]: 6.09001e-06 [merge_send_recv]: 6.59001e-06 [auto_parallel]: 7.19001e-06 [parallel]: 7.30003e-06 [flash_sp]: 4.07998e-06 [merge_comm]: 3.61999e-06 [allreduce_fusion]: 3.21001e-06 [matmul_add_comm_reduction]: 7.73001e-06 [allreduce_slice_to_reducescatter]: 5.90022e-07 [virtual_shard_identity]: 7.62002e-06 [virtual_dataset]: 5.99e-06 [get_grad_eliminate_]: 5.67001e-06 [virtual_output]: 5.73997e-06 [merge_forward]: 3.86001e-06 [cell_reuse_recompute_pass]: 2.29999e-06 [offload_activation]: 8.99e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.452e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 1.036e-05 [set_forward_comm_id_for_comm_node_pass]: 4.38001e-06 [meta_fg_expand]: 2.17001e-06 [flash_sp_send_recv_attached]: 1.34e-06 [receive_attached]: 1.70001e-06 [after_resolve]: 1.147e-05 [a_after_grad]: 9.87001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.79998e-06 [auto_monad_grad]: 1.52999e-06 [auto_monad_eliminator]: 7.5e-06 [cse]: 1.615e-05 [a_3]: 3.757e-05 [py_interpret_to_execute_after_opt_a]: 1.257e-05 [slice_cell_reuse_recomputed_activation]: 1.86e-06 [rewriter_after_opt_a]: 3.719e-05 [convert_after_rewriter]: 6.85002e-06 [order_py_execute_after_rewriter]: 5.69999e-06 [mutable_eliminate]: 0.00066603 [opt_b]: 0.00022621, [1] [Cycle 1]: 0.00021857, [7] [b_1]: 0.00013847 [b_2]: 8.17e-06 [updatestate_depend_eliminate]: 8.40001e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.37999e-06 [renormalize]: 7.30011e-07 [cse]: 2.058e-05 [optimize_parallel_all_gather_comm]: 1.665e-05 [overlap_param_gather]: 2.00002e-06 [cconv]: 2.779e-05 [loop_unroll]: 0.00045828 [opt_after_cconv]: 0.00010565, [1] [Cycle 1]: 9.873e-05, [7] [c_1]: 3.182e-05 [parameter_eliminate]: 3.4e-06 [updatestate_depend_eliminate]: 6.25002e-06 [updatestate_assign_eliminate]: 2.54999e-06 [updatestate_loads_eliminate]: 2.62001e-06 [cse]: 1.853e-05 [renormalize]: 5.00004e-07 [remove_dup_value]: 1.466e-05 [tuple_transform]: 7.79e-05, [1] [Cycle 1]: 7.364e-05, [4] [d_1]: 4.489e-05 [none_parameter_eliminate]: 2.09e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 7.03e-06 [partial_unused_args_eliminate]: 1.69998e-06 [add_recomputation]: 4.868e-05 [cse_after_recomputation]: 2.122e-05, [1] [Cycle 1]: 1.665e-05, [1] [cse]: 1.104e-05 [environ_conv]: 5.31002e-06 [swap_dp_allreduce_reducescatter]: 4.95999e-06 [bias_add_comm_swap]: 2.74999e-06 [label_micro_interleaved_index]: 4.84e-06 [label_fine_grained_interleaved_index]: 2.80997e-06 [merge_cast_opt]: 1.33002e-06 [slice_recompute_activation]: 1.92001e-06 [micro_interleaved_order_control]: 2.56e-06 [assign_add_opt]: 1.20999e-06 [ForceFp32Comm]: 8.59989e-07 [remove_cast_before_assign_add]: 1.04e-06 [full_micro_interleaved_order_control]: 2.31e-06 [reorder_send_recv_between_fp_bp]: 2.98e-06 [comm_op_add_attrs]: 1.23002e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.30001e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.12e-06 [overlap_opt_shard_grad_in_pipeline]: 1.75001e-06 [control_data_broadcast_order]: 1.347e-05 [grouped_pairwise_exchange_alltoall]: 1.66e-06 [offloading_packed_experts]: 4.22e-06 [overlap_recompute_and_grad_model_parallel]: 4.45e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.31e-06 [overlap_grad_ring_attention]: 4.13001e-06 [overlap_grad_flash_sp]: 2.041e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.15002e-06 [split_layernorm_comm]: 1.57001e-06 [handle_group_info]: 9.20001e-07 [symbol_engine_optimizer]: 7.947e-05, [1] [Cycle 1]: 7.493e-05, [6] [build]: 3.28998e-06 [elim_shapecalc]: 1.06e-05 [elim_not_effective]: 1.336e-05 [opt_reshape]: 7.06001e-06 [fold_const_symbol]: 1.064e-05 [renormalize]: 2.50002e-07 [detach_backward]: 2.09e-06 [pipeline_parallel_scheduler]: 1.81e-06 [auto_monad_reorder]: 1.833e-05 [get_jit_bprop_graph]: 1.64e-06 [rewriter_after_jit_bprop_graph]: 4.42e-06 [opt_after_jit_grad]: 0.0004918 [validate]: 4.072e-05 Sums bootstrap : 0.000503s : 3.49% type_inference : 0.006561s : 45.47% event_method : 0.000019s : 0.13% auto_monad : 0.000064s : 0.44% graph_reusing : 0.000006s : 0.04% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000034s : 0.24% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000025s : 0.17% optimize.rewriter_before_opt_a : 0.000082s : 0.57% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000051s : 0.35% optimize.opt_a.loop_unroll : 0.000037s : 0.25% optimize.opt_a.a_1 : 0.000759s : 5.26% optimize.opt_a.with_stream_mark : 0.000033s : 0.23% optimize.opt_a.recompute_prepare : 0.000017s : 0.12% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.04% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000155s : 1.07% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.10% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.09% optimize.opt_a.merge_send_recv : 0.000015s : 0.10% optimize.opt_a.auto_parallel : 0.000014s : 0.09% optimize.opt_a.parallel : 0.000025s : 0.18% optimize.opt_a.flash_sp : 0.000012s : 0.08% optimize.opt_a.merge_comm : 0.000007s : 0.05% optimize.opt_a.allreduce_fusion : 0.000007s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.12% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.11% optimize.opt_a.virtual_dataset : 0.000013s : 0.09% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.08% optimize.opt_a.virtual_output : 0.000012s : 0.08% optimize.opt_a.merge_forward : 0.000008s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.19% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.14% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000023s : 0.16% optimize.opt_a.a_after_grad : 0.000020s : 0.14% optimize.opt_a.renormalize : 0.003187s : 22.09% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.06% optimize.opt_a.auto_monad_grad : 0.000005s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.18% optimize.opt_a.cse : 0.000047s : 0.33% optimize.opt_a.a_3 : 0.000093s : 0.64% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000037s : 0.26% optimize.convert_after_rewriter : 0.000007s : 0.05% optimize.order_py_execute_after_rewriter : 0.000006s : 0.04% optimize.mutable_eliminate : 0.000666s : 4.62% optimize.opt_b.b_1 : 0.000138s : 0.96% optimize.opt_b.b_2 : 0.000008s : 0.06% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000021s : 0.14% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.12% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000028s : 0.19% optimize.loop_unroll : 0.000458s : 3.18% optimize.opt_after_cconv.c_1 : 0.000032s : 0.22% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.13% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.10% optimize.tuple_transform.d_1 : 0.000045s : 0.31% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.05% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000049s : 0.34% optimize.cse_after_recomputation.cse : 0.000011s : 0.08% optimize.environ_conv : 0.000005s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.03% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000013s : 0.09% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.03% optimize.overlap_grad_flash_sp : 0.000020s : 0.14% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.07% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.09% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.05% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.07% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.13% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000492s : 3.41% validate : 0.000041s : 0.28% Time group info: ------[substitution.] 0.000184 29 1.01% : 0.000002s : 2: substitution.elim_not_effective 0.71% : 0.000001s : 2: substitution.fold_const_symbol 3.28% : 0.000006s : 4: substitution.graph_param_transform 74.78% : 0.000138s : 4: substitution.inline 2.39% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.84% : 0.000005s : 4: substitution.remove_not_recompute_node 2.95% : 0.000005s : 4: substitution.replace_old_param 7.92% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator 4.13% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006498 2 89.28% : 0.005801s : 1: type_inference.infer 10.72% : 0.000696s : 1: type_inference.specialize ------[replace.] 0.000062 8 62.94% : 0.000039s : 4: replace.inline 37.06% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000148 8 91.42% : 0.000135s : 4: match.inline 8.58% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000205 1278 0.91% : 0.000002s : 13: predicate.accumulaten_eliminater 0.77% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 0.91% : 0.000002s : 13: predicate.addn_zero_filter 0.90% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.24% : 0.000005s : 21: predicate.arithmetic_simplify 0.91% : 0.000002s : 13: predicate.cast_eliminate 0.62% : 0.000001s : 8: predicate.check_bprop_eliminate 0.51% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.57% : 0.000001s : 8: predicate.depend_value_elim 0.92% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.99% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.92% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.03% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.41% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.12% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_depend_swap 1.62% : 0.000003s : 25: predicate.environ_get_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.53% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.43% : 0.000005s : 21: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.75% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.62% : 0.000001s : 8: predicate.get_grad_eliminate 0.22% : 0.000000s : 4: predicate.graph_param_transform 0.59% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 6.15% : 0.000013s : 58: predicate.inline 0.83% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.86% : 0.000002s : 8: predicate.less_batch_normalization 1.97% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.49% : 0.000005s : 38: predicate.load_eliminater 0.92% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.36% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.68% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 13: predicate.minmaximum_grad 1.13% : 0.000002s : 4: predicate.mutable_eliminate 0.38% : 0.000001s : 4: predicate.opt_reshape 0.39% : 0.000001s : 4: predicate.parallel_virtual_node 1.84% : 0.000004s : 21: predicate.partial_defer_inline 1.66% : 0.000003s : 21: predicate.partial_eliminate 0.88% : 0.000002s : 13: predicate.print_const_string_wrapper 0.57% : 0.000001s : 8: predicate.reduce_all_const_elim 1.12% : 0.000002s : 13: predicate.reduce_eliminate 2.55% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000001s : 8: predicate.remove_not_recompute_node 1.53% : 0.000003s : 25: predicate.replace_applicator 0.52% : 0.000001s : 8: predicate.replace_old_param 0.41% : 0.000001s : 4: predicate.reset_defer_inline 0.94% : 0.000002s : 13: predicate.reshape_eliminate 0.62% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 4: predicate.row_tensor_eliminate 0.99% : 0.000002s : 8: predicate.same_eliminate 0.41% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 8: predicate.shard_identity_eliminate 0.82% : 0.000002s : 8: predicate.special_op_eliminate 0.72% : 0.000001s : 8: predicate.specialize_transform 0.92% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.52% : 0.000003s : 21: predicate.switch_defer_inline 2.03% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.09% : 0.000010s : 67: predicate.switch_simplify 0.85% : 0.000002s : 13: predicate.tile_eliminate 1.09% : 0.000002s : 13: predicate.transpose_eliminate 1.43% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.39% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.43% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.44% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.82% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.42% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.13% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 4: predicate.value_based_eliminate 0.69% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000557 11 56.61% : 0.000315s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.39% : 0.000242s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.033927 192 0.01% : 0.000004s : 1: ForceFp32Comm 9.48% : 0.003217s : 1: add_attr 9.45% : 0.003206s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.15% : 0.000053s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.20% : 0.000069s : 1: auto_monad 0.07% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.58% : 0.000537s : 1: bootstrap 0.09% : 0.000032s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.05% : 0.000016s : 1: control_data_broadcast_order 0.03% : 0.000010s : 1: convert_after_rewriter 0.07% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.02% : 0.000008s : 1: environ_conv 0.08% : 0.000026s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 1.38% : 0.000467s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.99% : 0.000677s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000018s : 1: opt.transform.mutable_eliminate 3.50% : 0.001187s : 78: opt.transform.opt_a 0.09% : 0.000030s : 1: opt.transform.opt_after_cconv 0.08% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.34% : 0.000114s : 28: opt.transform.opt_b 0.15% : 0.000050s : 2: opt.transform.opt_trans_graph 0.11% : 0.000038s : 4: opt.transform.symbol_engine_opt 15.58% : 0.005287s : 1: opt_a 0.32% : 0.000110s : 1: opt_after_cconv 1.48% : 0.000502s : 1: opt_after_jit_grad 0.68% : 0.000230s : 1: opt_b 22.20% : 0.007530s : 1: optimize 0.06% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.07% : 0.000024s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.11% : 0.000038s : 1: pre_auto_parallel 0.09% : 0.000029s : 1: py_interpret_to_execute 0.05% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000018s : 1: remove_dup_value 8.44% : 0.002862s : 1: renormalize.infer 0.93% : 0.000315s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.12% : 0.000041s : 1: rewriter_after_opt_a 0.25% : 0.000086s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.24% : 0.000082s : 1: symbol_engine_optimizer 0.24% : 0.000081s : 1: tuple_transform 19.39% : 0.006580s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:27.600.160 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:27.600.556 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0235326, [21] [bootstrap]: 0.00046376 [type_inference]: 0.0111651 [event_method]: 2.452e-05 [auto_monad]: 7.011e-05 [graph_reusing]: 7.5e-06 [inline]: 3.43e-06 [add_attr]: 0.00434344, [1] [add_attr_with_inline]: 0.00433019, [1] [Cycle 1]: 9.531e-05, [2] [tag_attr]: 2.457e-05 [meta_addattr_fg_expand]: 5.52001e-06 [parallel-infer-symbol]: 3.95e-06 [pre_auto_parallel]: 4.348e-05 [insert-virtual-dataset]: 2.52001e-06 [parallel-infer-symbol-second]: 8.30012e-07 [dataset_repeat_opt]: 2.77002e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00609192, [53] [py_interpret_to_execute]: 3.478e-05 [rewriter_before_opt_a]: 9.914e-05 [opt_a]: 0.00344591, [2] [Cycle 1]: 0.00253032, [45] [expand_dump_flag]: 2.90998e-06 [switch_simplify]: 4.5e-05 [loop_unroll]: 3.059e-05 [a_1]: 0.00067529 [with_stream_mark]: 2.18e-05 [recompute_prepare]: 1.143e-05 [updatestate_depend_eliminate]: 4.54998e-06 [updatestate_assign_eliminate]: 3.61999e-06 [updatestate_loads_eliminate]: 2.96999e-06 [parameter_eliminate]: 2.45002e-06 [a_2]: 0.00011381 [accelerated_algorithm]: 8.85001e-06 [shard]: 2.36e-06 [meta_shard_fg_expand]: 2.48e-06 [shard_inline]: 7.86001e-06 [merge_send_recv]: 1.028e-05 [auto_parallel]: 9.67999e-06 [parallel]: 2.057e-05 [flash_sp]: 1.094e-05 [merge_comm]: 4.60001e-06 [allreduce_fusion]: 3.68e-06 [matmul_add_comm_reduction]: 1.2e-05 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 1.323e-05 [virtual_dataset]: 7.3e-06 [get_grad_eliminate_]: 6.63e-06 [virtual_output]: 6.37001e-06 [merge_forward]: 5.05999e-06 [cell_reuse_recompute_pass]: 1.40999e-06 [offload_activation]: 1.076e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.785e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.165e-05 [set_forward_comm_id_for_comm_node_pass]: 5.82999e-06 [meta_fg_expand]: 2.91e-06 [flash_sp_send_recv_attached]: 3.18e-06 [receive_attached]: 2.90002e-06 [after_resolve]: 1.461e-05 [a_after_grad]: 1.195e-05 [renormalize]: 0.00079796 [add_forward_monad_depend]: 7.51001e-06 [auto_monad_grad]: 2.81e-06 [auto_monad_eliminator]: 2.073e-05 [cse]: 2.973e-05 [a_3]: 0.00010017 [Cycle 2]: 0.0008967, [45] [expand_dump_flag]: 2.32001e-06 [switch_simplify]: 9.04998e-06 [loop_unroll]: 6.37001e-06 [a_1]: 0.00013421 [with_stream_mark]: 1.683e-05 [recompute_prepare]: 7.01001e-06 [updatestate_depend_eliminate]: 3.5e-06 [updatestate_assign_eliminate]: 2.96001e-06 [updatestate_loads_eliminate]: 2.98e-06 [parameter_eliminate]: 2.19001e-06 [a_2]: 0.00010427 [accelerated_algorithm]: 7.99002e-06 [shard]: 2.39999e-06 [meta_shard_fg_expand]: 1.97999e-06 [shard_inline]: 7.14001e-06 [merge_send_recv]: 9.26002e-06 [auto_parallel]: 7.96001e-06 [parallel]: 7.75e-06 [flash_sp]: 4.72e-06 [merge_comm]: 3.46999e-06 [allreduce_fusion]: 3.83999e-06 [matmul_add_comm_reduction]: 9.46e-06 [allreduce_slice_to_reducescatter]: 5.8001e-07 [virtual_shard_identity]: 9.32001e-06 [virtual_dataset]: 6.18002e-06 [get_grad_eliminate_]: 6.12999e-06 [virtual_output]: 7.20003e-06 [merge_forward]: 3.59002e-06 [cell_reuse_recompute_pass]: 2.58998e-06 [offload_activation]: 8.94e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.779e-05 [merge_recompute_call_nodes]: 1.37e-06 [before_grad]: 1.028e-05 [set_forward_comm_id_for_comm_node_pass]: 4.16001e-06 [meta_fg_expand]: 2.43e-06 [flash_sp_send_recv_attached]: 1.42999e-06 [receive_attached]: 2.44001e-06 [after_resolve]: 1.304e-05 [a_after_grad]: 9.55001e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.63003e-06 [auto_monad_grad]: 1.72001e-06 [auto_monad_eliminator]: 1.097e-05 [cse]: 2.026e-05 [a_3]: 5.075e-05 [py_interpret_to_execute_after_opt_a]: 1.721e-05 [slice_cell_reuse_recomputed_activation]: 5.56998e-06 [rewriter_after_opt_a]: 4.714e-05 [convert_after_rewriter]: 1.007e-05 [order_py_execute_after_rewriter]: 8.26002e-06 [mutable_eliminate]: 0.00068321 [opt_b]: 0.00031539, [1] [Cycle 1]: 0.00030389, [7] [b_1]: 0.0001898 [b_2]: 1.034e-05 [updatestate_depend_eliminate]: 1.028e-05 [updatestate_assign_eliminate]: 2.89001e-06 [updatestate_loads_eliminate]: 2.70997e-06 [renormalize]: 6.40022e-07 [cse]: 2.619e-05 [optimize_parallel_all_gather_comm]: 2.492e-05 [overlap_param_gather]: 4.52e-06 [cconv]: 3.761e-05 [loop_unroll]: 0.00051202 [opt_after_cconv]: 0.00013793, [1] [Cycle 1]: 0.00012781, [7] [c_1]: 3.356e-05 [parameter_eliminate]: 4.82e-06 [updatestate_depend_eliminate]: 7e-06 [updatestate_assign_eliminate]: 2.84999e-06 [updatestate_loads_eliminate]: 2.47001e-06 [cse]: 2.008e-05 [renormalize]: 5.09986e-07 [remove_dup_value]: 1.796e-05 [tuple_transform]: 9.314e-05, [1] [Cycle 1]: 8.512e-05, [4] [d_1]: 4.504e-05 [none_parameter_eliminate]: 2.37001e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 7.28e-06 [partial_unused_args_eliminate]: 4.52e-06 [add_recomputation]: 5.596e-05 [cse_after_recomputation]: 2.865e-05, [1] [Cycle 1]: 2.143e-05, [1] [cse]: 1.21e-05 [environ_conv]: 7.77998e-06 [swap_dp_allreduce_reducescatter]: 7.98001e-06 [bias_add_comm_swap]: 5.24e-06 [label_micro_interleaved_index]: 8.19002e-06 [label_fine_grained_interleaved_index]: 5.05999e-06 [merge_cast_opt]: 3.94002e-06 [slice_recompute_activation]: 4.81002e-06 [micro_interleaved_order_control]: 5.22e-06 [assign_add_opt]: 3.66001e-06 [ForceFp32Comm]: 3.44001e-06 [remove_cast_before_assign_add]: 3.36999e-06 [full_micro_interleaved_order_control]: 4.55001e-06 [reorder_send_recv_between_fp_bp]: 5.71e-06 [comm_op_add_attrs]: 3.53e-06 [add_comm_op_reuse_tag]: 3.28e-06 [interleave_split_concat_branches]: 3.85998e-06 [interleave_parallel_branches]: 3.57002e-06 [overlap_opt_shard_in_pipeline]: 3.54002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.42998e-06 [control_data_broadcast_order]: 1.648e-05 [grouped_pairwise_exchange_alltoall]: 4.23999e-06 [offloading_packed_experts]: 6.68e-06 [overlap_recompute_and_grad_model_parallel]: 7.44002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.53e-06 [overlap_recompute_allgather_and_fa_grad]: 3.65e-06 [overlap_recompute_comm]: 4.87e-06 [overlap_grad_ring_attention]: 6.49001e-06 [overlap_grad_flash_sp]: 2.504e-05 [begin_end_overlap_inline]: 3.03998e-06 [split_matmul_comm_elemetwise]: 4.54998e-06 [split_layernorm_comm]: 4.08001e-06 [handle_group_info]: 3.35003e-06 [symbol_engine_optimizer]: 0.00010126, [1] [Cycle 1]: 9.425e-05, [6] [build]: 3.45e-06 [elim_shapecalc]: 1.027e-05 [elim_not_effective]: 1.283e-05 [opt_reshape]: 7.2e-06 [fold_const_symbol]: 1.075e-05 [renormalize]: 4.19997e-07 [detach_backward]: 5.02e-06 [pipeline_parallel_scheduler]: 2.46998e-06 [auto_monad_reorder]: 2.199e-05 [get_jit_bprop_graph]: 1.89999e-06 [rewriter_after_jit_bprop_graph]: 5.92001e-06 [opt_after_jit_grad]: 0.00057763 [validate]: 4.521e-05 Sums bootstrap : 0.000464s : 2.68% type_inference : 0.011165s : 64.56% event_method : 0.000025s : 0.14% auto_monad : 0.000070s : 0.41% graph_reusing : 0.000007s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000043s : 0.25% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000035s : 0.20% optimize.rewriter_before_opt_a : 0.000099s : 0.57% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000054s : 0.31% optimize.opt_a.loop_unroll : 0.000037s : 0.21% optimize.opt_a.a_1 : 0.000809s : 4.68% optimize.opt_a.with_stream_mark : 0.000039s : 0.22% optimize.opt_a.recompute_prepare : 0.000018s : 0.11% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000005s : 0.03% optimize.opt_a.a_2 : 0.000218s : 1.26% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.10% optimize.opt_a.shard : 0.000005s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.09% optimize.opt_a.merge_send_recv : 0.000020s : 0.11% optimize.opt_a.auto_parallel : 0.000018s : 0.10% optimize.opt_a.parallel : 0.000028s : 0.16% optimize.opt_a.flash_sp : 0.000016s : 0.09% optimize.opt_a.merge_comm : 0.000008s : 0.05% optimize.opt_a.allreduce_fusion : 0.000008s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.12% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.13% optimize.opt_a.virtual_dataset : 0.000013s : 0.08% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.07% optimize.opt_a.virtual_output : 0.000014s : 0.08% optimize.opt_a.merge_forward : 0.000009s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.11% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.21% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.13% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.03% optimize.opt_a.receive_attached : 0.000005s : 0.03% optimize.opt_a.after_resolve : 0.000028s : 0.16% optimize.opt_a.a_after_grad : 0.000022s : 0.12% optimize.opt_a.renormalize : 0.000798s : 4.61% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.06% optimize.opt_a.auto_monad_grad : 0.000005s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.18% optimize.opt_a.cse : 0.000050s : 0.29% optimize.opt_a.a_3 : 0.000151s : 0.87% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.03% optimize.rewriter_after_opt_a : 0.000047s : 0.27% optimize.convert_after_rewriter : 0.000010s : 0.06% optimize.order_py_execute_after_rewriter : 0.000008s : 0.05% optimize.mutable_eliminate : 0.000683s : 3.95% optimize.opt_b.b_1 : 0.000190s : 1.10% optimize.opt_b.b_2 : 0.000010s : 0.06% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000026s : 0.15% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.14% optimize.overlap_param_gather : 0.000005s : 0.03% optimize.cconv : 0.000038s : 0.22% optimize.loop_unroll : 0.000512s : 2.96% optimize.opt_after_cconv.c_1 : 0.000034s : 0.19% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000020s : 0.12% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.10% optimize.tuple_transform.d_1 : 0.000045s : 0.26% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.04% optimize.partial_unused_args_eliminate : 0.000005s : 0.03% optimize.add_recomputation : 0.000056s : 0.32% optimize.cse_after_recomputation.cse : 0.000012s : 0.07% optimize.environ_conv : 0.000008s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.05% optimize.bias_add_comm_swap : 0.000005s : 0.03% optimize.label_micro_interleaved_index : 0.000008s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.03% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.03% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000003s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.03% optimize.comm_op_add_attrs : 0.000004s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.02% optimize.interleave_parallel_branches : 0.000004s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000016s : 0.10% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000007s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000005s : 0.03% optimize.overlap_grad_ring_attention : 0.000006s : 0.04% optimize.overlap_grad_flash_sp : 0.000025s : 0.14% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.03% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.02% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.06% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.13% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.03% opt_after_jit_grad : 0.000578s : 3.34% validate : 0.000045s : 0.26% Time group info: ------[substitution.] 0.000224 29 0.82% : 0.000002s : 2: substitution.elim_not_effective 0.67% : 0.000002s : 2: substitution.fold_const_symbol 2.63% : 0.000006s : 4: substitution.graph_param_transform 77.43% : 0.000174s : 4: substitution.inline 1.91% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.39% : 0.000005s : 4: substitution.remove_not_recompute_node 3.19% : 0.000007s : 4: substitution.replace_old_param 7.12% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator 3.85% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.011090 2 91.91% : 0.010193s : 1: type_inference.infer 8.09% : 0.000897s : 1: type_inference.specialize ------[replace.] 0.000067 8 65.25% : 0.000043s : 4: replace.inline 34.75% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000184 8 92.59% : 0.000171s : 4: match.inline 7.41% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000225 1278 0.79% : 0.000002s : 13: predicate.accumulaten_eliminater 0.86% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.57% : 0.000001s : 8: predicate.addn_check_dump 1.02% : 0.000002s : 13: predicate.addn_zero_filter 0.73% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.97% : 0.000004s : 21: predicate.arithmetic_simplify 0.98% : 0.000002s : 13: predicate.cast_eliminate 0.55% : 0.000001s : 8: predicate.check_bprop_eliminate 0.75% : 0.000002s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.67% : 0.000002s : 8: predicate.depend_value_elim 0.91% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.95% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.74% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.40% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000002s : 17: predicate.environ_add_const_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_depend_swap 1.63% : 0.000004s : 25: predicate.environ_get_eliminate 0.97% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.77% : 0.000004s : 21: predicate.exchange_switch_depend_value 2.48% : 0.000006s : 21: predicate.float_depend_g_call 0.52% : 0.000001s : 8: predicate.float_environ_get_switch 0.79% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.60% : 0.000001s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.54% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.64% : 0.000015s : 58: predicate.inline 0.83% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.70% : 0.000002s : 8: predicate.less_batch_normalization 1.71% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.37% : 0.000005s : 38: predicate.load_eliminater 1.02% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.13% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.82% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 8: predicate.merge_addn 0.62% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.71% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.74% : 0.000002s : 13: predicate.minmaximum_grad 1.28% : 0.000003s : 4: predicate.mutable_eliminate 0.29% : 0.000001s : 4: predicate.opt_reshape 0.58% : 0.000001s : 4: predicate.parallel_virtual_node 1.87% : 0.000004s : 21: predicate.partial_defer_inline 1.50% : 0.000003s : 21: predicate.partial_eliminate 0.99% : 0.000002s : 13: predicate.print_const_string_wrapper 0.52% : 0.000001s : 8: predicate.reduce_all_const_elim 1.01% : 0.000002s : 13: predicate.reduce_eliminate 2.52% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 8: predicate.remove_not_recompute_node 1.32% : 0.000003s : 25: predicate.replace_applicator 0.58% : 0.000001s : 8: predicate.replace_old_param 0.52% : 0.000001s : 4: predicate.reset_defer_inline 0.88% : 0.000002s : 13: predicate.reshape_eliminate 0.72% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.71% : 0.000002s : 4: predicate.row_tensor_eliminate 0.76% : 0.000002s : 8: predicate.same_eliminate 0.52% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.89% : 0.000002s : 8: predicate.shard_identity_eliminate 0.71% : 0.000002s : 8: predicate.special_op_eliminate 0.84% : 0.000002s : 8: predicate.specialize_transform 1.42% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.96% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.39% : 0.000003s : 21: predicate.switch_defer_inline 1.96% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.27% : 0.000012s : 67: predicate.switch_simplify 0.89% : 0.000002s : 13: predicate.tile_eliminate 1.05% : 0.000002s : 13: predicate.transpose_eliminate 1.38% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.44% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.15% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.25% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.13% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.59% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.21% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.96% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.56% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.66% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000727 11 53.36% : 0.000388s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.64% : 0.000339s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.036200 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.03% : 0.004356s : 1: add_attr 11.97% : 0.004334s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.16% : 0.000059s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.22% : 0.000080s : 1: auto_monad 0.08% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.41% : 0.000512s : 1: bootstrap 0.11% : 0.000041s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.05% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.09% : 0.000032s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000024s : 1: detach_backward 0.03% : 0.000011s : 1: environ_conv 0.10% : 0.000037s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000014s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000011s : 1: label_micro_interleaved_index 1.43% : 0.000518s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 1.91% : 0.000690s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.05% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000020s : 1: opt.transform.mutable_eliminate 3.53% : 0.001276s : 78: opt.transform.opt_a 0.09% : 0.000032s : 1: opt.transform.opt_after_cconv 0.08% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.35% : 0.000125s : 28: opt.transform.opt_b 0.14% : 0.000050s : 2: opt.transform.opt_trans_graph 0.10% : 0.000038s : 4: opt.transform.symbol_engine_opt 9.53% : 0.003450s : 1: opt_a 0.39% : 0.000142s : 1: opt_after_cconv 1.63% : 0.000591s : 1: opt_after_jit_grad 0.88% : 0.000319s : 1: opt_b 17.83% : 0.006453s : 1: optimize 0.08% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000011s : 1: order_py_execute_after_rewriter 0.08% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000051s : 1: pre_auto_parallel 0.11% : 0.000038s : 1: py_interpret_to_execute 0.06% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.06% : 0.000021s : 1: remove_dup_value 1.20% : 0.000434s : 1: renormalize.infer 0.97% : 0.000353s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000051s : 1: rewriter_after_opt_a 0.29% : 0.000103s : 1: rewriter_before_opt_a 0.03% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.03% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.29% : 0.000104s : 1: symbol_engine_optimizer 0.27% : 0.000096s : 1: tuple_transform 31.00% : 0.011223s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:28.294.21 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0269448, [21] [bootstrap]: 0.00045185 [type_inference]: 0.00673218 [event_method]: 2.161e-05 [auto_monad]: 7.085e-05 [graph_reusing]: 5.45001e-06 [inline]: 3.43e-06 [add_attr]: 0.00415087, [1] [add_attr_with_inline]: 0.00413835, [1] [Cycle 1]: 7.97e-05, [2] [tag_attr]: 2.459e-05 [meta_addattr_fg_expand]: 7.06999e-06 [parallel-infer-symbol]: 3.73001e-06 [pre_auto_parallel]: 4.228e-05 [insert-virtual-dataset]: 2.72001e-06 [parallel-infer-symbol-second]: 1.01002e-06 [dataset_repeat_opt]: 2.62001e-06 [pipeline_split]: 1.78002e-06 [optimize]: 0.0146907, [53] [py_interpret_to_execute]: 3.211e-05 [rewriter_before_opt_a]: 9.595e-05 [opt_a]: 0.0123061, [2] [Cycle 1]: 0.0115138, [45] [expand_dump_flag]: 3.33e-06 [switch_simplify]: 4.861e-05 [loop_unroll]: 3.136e-05 [a_1]: 0.00068239 [with_stream_mark]: 2.177e-05 [recompute_prepare]: 9.92001e-06 [updatestate_depend_eliminate]: 4.43999e-06 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 3.43e-06 [parameter_eliminate]: 2.22999e-06 [a_2]: 8.483e-05 [accelerated_algorithm]: 7.33e-06 [shard]: 2.27999e-06 [meta_shard_fg_expand]: 1.97999e-06 [shard_inline]: 6.90998e-06 [merge_send_recv]: 9.53997e-06 [auto_parallel]: 7.56999e-06 [parallel]: 2.118e-05 [flash_sp]: 1.015e-05 [merge_comm]: 3.89002e-06 [allreduce_fusion]: 3.65998e-06 [matmul_add_comm_reduction]: 1.052e-05 [allreduce_slice_to_reducescatter]: 7.10017e-07 [virtual_shard_identity]: 8.64998e-06 [virtual_dataset]: 6.80002e-06 [get_grad_eliminate_]: 6.17001e-06 [virtual_output]: 6.98998e-06 [merge_forward]: 3.93001e-06 [cell_reuse_recompute_pass]: 1.19e-06 [offload_activation]: 1.109e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.492e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.096e-05 [set_forward_comm_id_for_comm_node_pass]: 3.8e-06 [meta_fg_expand]: 3.2e-06 [flash_sp_send_recv_attached]: 2.69999e-06 [receive_attached]: 2.56998e-06 [after_resolve]: 1.347e-05 [a_after_grad]: 1.11e-05 [renormalize]: 0.00989451 [add_forward_monad_depend]: 1.232e-05 [auto_monad_grad]: 2.41e-06 [auto_monad_eliminator]: 2.579e-05 [cse]: 3.925e-05 [a_3]: 7.044e-05 [Cycle 2]: 0.00077919, [45] [expand_dump_flag]: 2.69999e-06 [switch_simplify]: 1.011e-05 [loop_unroll]: 8.03999e-06 [a_1]: 0.00015247 [with_stream_mark]: 2.219e-05 [recompute_prepare]: 7.97e-06 [updatestate_depend_eliminate]: 4.12e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 3.21999e-06 [parameter_eliminate]: 2.00002e-06 [a_2]: 7.925e-05 [accelerated_algorithm]: 6.84999e-06 [shard]: 2.46e-06 [meta_shard_fg_expand]: 2.12001e-06 [shard_inline]: 6.88e-06 [merge_send_recv]: 1.046e-05 [auto_parallel]: 9.91998e-06 [parallel]: 9.64e-06 [flash_sp]: 1.054e-05 [merge_comm]: 3.61999e-06 [allreduce_fusion]: 3.21001e-06 [matmul_add_comm_reduction]: 9.59999e-06 [allreduce_slice_to_reducescatter]: 9.29984e-07 [virtual_shard_identity]: 8.10999e-06 [virtual_dataset]: 6.16e-06 [get_grad_eliminate_]: 6.11e-06 [virtual_output]: 6.48e-06 [merge_forward]: 4.23999e-06 [cell_reuse_recompute_pass]: 3.48999e-06 [offload_activation]: 1.142e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.017e-05 [merge_recompute_call_nodes]: 1.60999e-06 [before_grad]: 1.175e-05 [set_forward_comm_id_for_comm_node_pass]: 3.67998e-06 [meta_fg_expand]: 2.49999e-06 [flash_sp_send_recv_attached]: 2.06e-06 [receive_attached]: 2.48e-06 [after_resolve]: 1.378e-05 [a_after_grad]: 1.006e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 2.50002e-06 [auto_monad_grad]: 1.69998e-06 [auto_monad_eliminator]: 9.36e-06 [cse]: 1.677e-05 [a_3]: 3.872e-05 [py_interpret_to_execute_after_opt_a]: 1.949e-05 [slice_cell_reuse_recomputed_activation]: 1.85001e-06 [rewriter_after_opt_a]: 4.043e-05 [convert_after_rewriter]: 6.58e-06 [order_py_execute_after_rewriter]: 5.25999e-06 [mutable_eliminate]: 0.00074381 [opt_b]: 0.00023542, [1] [Cycle 1]: 0.00022722, [7] [b_1]: 0.0001423 [b_2]: 8.95999e-06 [updatestate_depend_eliminate]: 8.59002e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.92002e-06 [renormalize]: 7.39994e-07 [cse]: 2.24e-05 [optimize_parallel_all_gather_comm]: 1.774e-05 [overlap_param_gather]: 1.91e-06 [cconv]: 3.397e-05 [loop_unroll]: 0.00045569 [opt_after_cconv]: 0.00011141, [1] [Cycle 1]: 0.00010463, [7] [c_1]: 3.215e-05 [parameter_eliminate]: 4.76002e-06 [updatestate_depend_eliminate]: 7.24001e-06 [updatestate_assign_eliminate]: 2.53998e-06 [updatestate_loads_eliminate]: 2.37001e-06 [cse]: 1.985e-05 [renormalize]: 5.39992e-07 [remove_dup_value]: 1.312e-05 [tuple_transform]: 7.811e-05, [1] [Cycle 1]: 7.361e-05, [4] [d_1]: 4.527e-05 [none_parameter_eliminate]: 2.09999e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 6.89001e-06 [partial_unused_args_eliminate]: 2.46e-06 [add_recomputation]: 5.32e-05 [cse_after_recomputation]: 2.207e-05, [1] [Cycle 1]: 1.733e-05, [1] [cse]: 1.15e-05 [environ_conv]: 5.99e-06 [swap_dp_allreduce_reducescatter]: 5.50001e-06 [bias_add_comm_swap]: 2.98e-06 [label_micro_interleaved_index]: 4.90999e-06 [label_fine_grained_interleaved_index]: 2.72001e-06 [merge_cast_opt]: 1.30999e-06 [slice_recompute_activation]: 2.06998e-06 [micro_interleaved_order_control]: 2.17999e-06 [assign_add_opt]: 1.14998e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 1.02998e-06 [full_micro_interleaved_order_control]: 2.29999e-06 [reorder_send_recv_between_fp_bp]: 2.63e-06 [comm_op_add_attrs]: 1.19e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.17999e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.15999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.77001e-06 [control_data_broadcast_order]: 1.355e-05 [grouped_pairwise_exchange_alltoall]: 1.76003e-06 [offloading_packed_experts]: 4.31002e-06 [overlap_recompute_and_grad_model_parallel]: 4.93001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.42e-06 [overlap_recompute_comm]: 2.60002e-06 [overlap_grad_ring_attention]: 4.22e-06 [overlap_grad_flash_sp]: 2.201e-05 [begin_end_overlap_inline]: 5.89993e-07 [split_matmul_comm_elemetwise]: 1.97999e-06 [split_layernorm_comm]: 1.74e-06 [handle_group_info]: 9.39996e-07 [symbol_engine_optimizer]: 8.308e-05, [1] [Cycle 1]: 7.87e-05, [6] [build]: 4.14002e-06 [elim_shapecalc]: 1.201e-05 [elim_not_effective]: 1.347e-05 [opt_reshape]: 7.05002e-06 [fold_const_symbol]: 1.015e-05 [renormalize]: 2.30008e-07 [detach_backward]: 1.575e-05 [pipeline_parallel_scheduler]: 1.89999e-06 [auto_monad_reorder]: 1.877e-05 [get_jit_bprop_graph]: 2.54001e-06 [rewriter_after_jit_bprop_graph]: 5.76e-06 [opt_after_jit_grad]: 0.00049942 [validate]: 4.703e-05 Sums bootstrap : 0.000452s : 2.09% type_inference : 0.006732s : 31.09% event_method : 0.000022s : 0.10% auto_monad : 0.000071s : 0.33% graph_reusing : 0.000005s : 0.03% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.11% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.03% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000042s : 0.20% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000032s : 0.15% optimize.rewriter_before_opt_a : 0.000096s : 0.44% optimize.opt_a.expand_dump_flag : 0.000006s : 0.03% optimize.opt_a.switch_simplify : 0.000059s : 0.27% optimize.opt_a.loop_unroll : 0.000039s : 0.18% optimize.opt_a.a_1 : 0.000835s : 3.86% optimize.opt_a.with_stream_mark : 0.000044s : 0.20% optimize.opt_a.recompute_prepare : 0.000018s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000164s : 0.76% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.07% optimize.opt_a.shard : 0.000005s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000014s : 0.06% optimize.opt_a.merge_send_recv : 0.000020s : 0.09% optimize.opt_a.auto_parallel : 0.000017s : 0.08% optimize.opt_a.parallel : 0.000031s : 0.14% optimize.opt_a.flash_sp : 0.000021s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.03% optimize.opt_a.allreduce_fusion : 0.000007s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.08% optimize.opt_a.virtual_dataset : 0.000013s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.06% optimize.opt_a.virtual_output : 0.000013s : 0.06% optimize.opt_a.merge_forward : 0.000008s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000023s : 0.10% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.16% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000023s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.03% optimize.opt_a.meta_fg_expand : 0.000006s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000027s : 0.13% optimize.opt_a.a_after_grad : 0.000021s : 0.10% optimize.opt_a.renormalize : 0.009895s : 45.70% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.16% optimize.opt_a.cse : 0.000056s : 0.26% optimize.opt_a.a_3 : 0.000109s : 0.50% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000040s : 0.19% optimize.convert_after_rewriter : 0.000007s : 0.03% optimize.order_py_execute_after_rewriter : 0.000005s : 0.02% optimize.mutable_eliminate : 0.000744s : 3.44% optimize.opt_b.b_1 : 0.000142s : 0.66% optimize.opt_b.b_2 : 0.000009s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.08% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000034s : 0.16% optimize.loop_unroll : 0.000456s : 2.10% optimize.opt_after_cconv.c_1 : 0.000032s : 0.15% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000020s : 0.09% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000013s : 0.06% optimize.tuple_transform.d_1 : 0.000045s : 0.21% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000053s : 0.25% optimize.cse_after_recomputation.cse : 0.000012s : 0.05% optimize.environ_conv : 0.000006s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.03% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000022s : 0.10% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000016s : 0.07% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.09% get_jit_bprop_graph : 0.000003s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.03% opt_after_jit_grad : 0.000499s : 2.31% validate : 0.000047s : 0.22% Time group info: ------[substitution.] 0.000225 29 0.83% : 0.000002s : 2: substitution.elim_not_effective 0.72% : 0.000002s : 2: substitution.fold_const_symbol 2.60% : 0.000006s : 4: substitution.graph_param_transform 75.65% : 0.000170s : 4: substitution.inline 2.13% : 0.000005s : 4: substitution.j_node_and_user_rematch 3.64% : 0.000008s : 4: substitution.remove_not_recompute_node 3.16% : 0.000007s : 4: substitution.replace_old_param 7.48% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator 3.78% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006657 2 88.39% : 0.005884s : 1: type_inference.infer 11.61% : 0.000773s : 1: type_inference.specialize ------[replace.] 0.000067 8 65.78% : 0.000044s : 4: replace.inline 34.22% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000183 8 91.81% : 0.000168s : 4: match.inline 8.19% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000229 1278 0.99% : 0.000002s : 13: predicate.accumulaten_eliminater 0.85% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 1.15% : 0.000003s : 13: predicate.addn_zero_filter 0.77% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.10% : 0.000005s : 21: predicate.arithmetic_simplify 1.19% : 0.000003s : 13: predicate.cast_eliminate 0.64% : 0.000001s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.55% : 0.000001s : 8: predicate.depend_value_elim 0.92% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.12% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.95% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.96% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 4: predicate.elim_not_effective 0.39% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 17: predicate.environ_get_depend_swap 1.70% : 0.000004s : 25: predicate.environ_get_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.49% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.47% : 0.000006s : 21: predicate.float_depend_g_call 0.62% : 0.000001s : 8: predicate.float_environ_get_switch 0.77% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.58% : 0.000001s : 8: predicate.get_grad_eliminate 0.25% : 0.000001s : 4: predicate.graph_param_transform 0.52% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.20% : 0.000014s : 58: predicate.inline 1.08% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 8: predicate.less_batch_normalization 1.76% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.44% : 0.000006s : 38: predicate.load_eliminater 1.10% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.23% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.83% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.61% : 0.000001s : 8: predicate.merge_addn 0.53% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 13: predicate.minmaximum_grad 1.19% : 0.000003s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.50% : 0.000001s : 4: predicate.parallel_virtual_node 1.76% : 0.000004s : 21: predicate.partial_defer_inline 1.48% : 0.000003s : 21: predicate.partial_eliminate 0.93% : 0.000002s : 13: predicate.print_const_string_wrapper 0.63% : 0.000001s : 8: predicate.reduce_all_const_elim 1.41% : 0.000003s : 13: predicate.reduce_eliminate 2.35% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 8: predicate.remove_not_recompute_node 1.37% : 0.000003s : 25: predicate.replace_applicator 0.56% : 0.000001s : 8: predicate.replace_old_param 0.25% : 0.000001s : 4: predicate.reset_defer_inline 0.95% : 0.000002s : 13: predicate.reshape_eliminate 0.63% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 4: predicate.row_tensor_eliminate 0.96% : 0.000002s : 8: predicate.same_eliminate 0.50% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.04% : 0.000002s : 8: predicate.shard_identity_eliminate 0.66% : 0.000002s : 8: predicate.special_op_eliminate 0.61% : 0.000001s : 8: predicate.specialize_transform 0.95% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.73% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.41% : 0.000003s : 21: predicate.switch_defer_inline 2.06% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.83% : 0.000011s : 67: predicate.switch_simplify 1.05% : 0.000002s : 13: predicate.tile_eliminate 0.86% : 0.000002s : 13: predicate.transpose_eliminate 1.43% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.69% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.43% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.41% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.62% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.28% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.87% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.46% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.73% : 0.000002s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000781 11 56.04% : 0.000438s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.96% : 0.000343s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.057126 192 0.01% : 0.000004s : 1: ForceFp32Comm 7.28% : 0.004157s : 1: add_attr 7.25% : 0.004143s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.10% : 0.000057s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.13% : 0.000076s : 1: auto_monad 0.04% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.85% : 0.000484s : 1: bootstrap 0.07% : 0.000038s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000017s : 1: control_data_broadcast_order 0.02% : 0.000010s : 1: convert_after_rewriter 0.04% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000006s : 1: dataset_repeat_opt 0.04% : 0.000021s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.05% : 0.000029s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.02% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.81% : 0.000465s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.32% : 0.000755s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000019s : 1: opt.transform.mutable_eliminate 2.28% : 0.001304s : 78: opt.transform.opt_a 0.05% : 0.000031s : 1: opt.transform.opt_after_cconv 0.05% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000117s : 28: opt.transform.opt_b 0.09% : 0.000049s : 2: opt.transform.opt_trans_graph 0.07% : 0.000038s : 4: opt.transform.symbol_engine_opt 21.55% : 0.012310s : 1: opt_a 0.20% : 0.000115s : 1: opt_after_cconv 0.89% : 0.000509s : 1: opt_after_jit_grad 0.42% : 0.000239s : 1: opt_b 25.73% : 0.014697s : 1: optimize 0.04% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.05% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000046s : 1: pre_auto_parallel 0.06% : 0.000037s : 1: py_interpret_to_execute 0.04% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000017s : 1: remove_dup_value 16.34% : 0.009334s : 1: renormalize.infer 0.95% : 0.000540s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000044s : 1: rewriter_after_opt_a 0.18% : 0.000100s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000086s : 1: symbol_engine_optimizer 0.14% : 0.000081s : 1: tuple_transform 11.83% : 0.006756s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:28.859.954 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:28.860.218 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0495755, [21] [bootstrap]: 0.00047184 [type_inference]: 0.015997 [event_method]: 2.423e-05 [auto_monad]: 7.178e-05 [graph_reusing]: 6.38003e-06 [inline]: 2.76e-06 [add_attr]: 0.00367434, [1] [add_attr_with_inline]: 0.00366361, [1] [Cycle 1]: 8.496e-05, [2] [tag_attr]: 2.071e-05 [meta_addattr_fg_expand]: 6.12999e-06 [parallel-infer-symbol]: 3.38999e-06 [pre_auto_parallel]: 3.831e-05 [insert-virtual-dataset]: 2.20002e-06 [parallel-infer-symbol-second]: 9.50007e-07 [dataset_repeat_opt]: 2.02999e-06 [pipeline_split]: 1.68002e-06 [optimize]: 0.0277967, [53] [py_interpret_to_execute]: 3.293e-05 [rewriter_before_opt_a]: 9.107e-05 [opt_a]: 0.0248059, [2] [Cycle 1]: 0.0236618, [45] [expand_dump_flag]: 2.94999e-06 [switch_simplify]: 4.412e-05 [loop_unroll]: 3.197e-05 [a_1]: 0.00068458 [with_stream_mark]: 1.736e-05 [recompute_prepare]: 1.116e-05 [updatestate_depend_eliminate]: 5.46002e-06 [updatestate_assign_eliminate]: 4.21001e-06 [updatestate_loads_eliminate]: 4.08999e-06 [parameter_eliminate]: 1.94e-06 [a_2]: 0.00012862 [accelerated_algorithm]: 8.57998e-06 [shard]: 1.67999e-06 [meta_shard_fg_expand]: 2.02999e-06 [shard_inline]: 7.98999e-06 [merge_send_recv]: 1.035e-05 [auto_parallel]: 7.84002e-06 [parallel]: 1.827e-05 [flash_sp]: 1.029e-05 [merge_comm]: 5.10999e-06 [allreduce_fusion]: 4.28999e-06 [matmul_add_comm_reduction]: 1.156e-05 [allreduce_slice_to_reducescatter]: 9.09989e-07 [virtual_shard_identity]: 1.048e-05 [virtual_dataset]: 8.05e-06 [get_grad_eliminate_]: 7.77998e-06 [virtual_output]: 7.71001e-06 [merge_forward]: 4.75999e-06 [cell_reuse_recompute_pass]: 1.15001e-06 [offload_activation]: 1.198e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.871e-05 [merge_recompute_call_nodes]: 1.56998e-06 [before_grad]: 1.426e-05 [set_forward_comm_id_for_comm_node_pass]: 5.60001e-06 [meta_fg_expand]: 3.36001e-06 [flash_sp_send_recv_attached]: 2.43e-06 [receive_attached]: 2.54999e-06 [after_resolve]: 1.411e-05 [a_after_grad]: 1.24e-05 [renormalize]: 0.0218946 [add_forward_monad_depend]: 1.162e-05 [auto_monad_grad]: 2.48e-06 [auto_monad_eliminator]: 3.011e-05 [cse]: 4.335e-05 [a_3]: 9.313e-05 [Cycle 2]: 0.00112442, [45] [expand_dump_flag]: 2.63e-06 [switch_simplify]: 1.026e-05 [loop_unroll]: 8.25e-06 [a_1]: 0.00019999 [with_stream_mark]: 2.523e-05 [recompute_prepare]: 1.022e-05 [updatestate_depend_eliminate]: 5.71e-06 [updatestate_assign_eliminate]: 3.77998e-06 [updatestate_loads_eliminate]: 3.7e-06 [parameter_eliminate]: 2.71e-06 [a_2]: 0.00012662 [accelerated_algorithm]: 9.81e-06 [shard]: 3.06999e-06 [meta_shard_fg_expand]: 2.27999e-06 [shard_inline]: 7.94002e-06 [merge_send_recv]: 1.151e-05 [auto_parallel]: 1.034e-05 [parallel]: 1.155e-05 [flash_sp]: 4.33999e-06 [merge_comm]: 4.91002e-06 [allreduce_fusion]: 4.34002e-06 [matmul_add_comm_reduction]: 1.144e-05 [allreduce_slice_to_reducescatter]: 9.80013e-07 [virtual_shard_identity]: 1.086e-05 [virtual_dataset]: 1.004e-05 [get_grad_eliminate_]: 8.57998e-06 [virtual_output]: 7.7e-06 [merge_forward]: 6.24999e-06 [cell_reuse_recompute_pass]: 3.28e-06 [offload_activation]: 1.305e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.058e-05 [merge_recompute_call_nodes]: 1.76e-06 [before_grad]: 1.501e-05 [set_forward_comm_id_for_comm_node_pass]: 6.22001e-06 [meta_fg_expand]: 3.78999e-06 [flash_sp_send_recv_attached]: 2.01e-06 [receive_attached]: 2.44999e-06 [after_resolve]: 1.65e-05 [a_after_grad]: 1.27e-05 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 3.57002e-06 [auto_monad_grad]: 2.32001e-06 [auto_monad_eliminator]: 1.495e-05 [cse]: 2.66e-05 [a_3]: 6.339e-05 [py_interpret_to_execute_after_opt_a]: 2.516e-05 [slice_cell_reuse_recomputed_activation]: 5.84999e-06 [rewriter_after_opt_a]: 5.848e-05 [convert_after_rewriter]: 1.387e-05 [order_py_execute_after_rewriter]: 8.97e-06 [mutable_eliminate]: 0.00080049 [opt_b]: 0.00037284, [1] [Cycle 1]: 0.00035971, [7] [b_1]: 0.00022602 [b_2]: 1.065e-05 [updatestate_depend_eliminate]: 1.191e-05 [updatestate_assign_eliminate]: 3.76001e-06 [updatestate_loads_eliminate]: 3.45003e-06 [renormalize]: 5.90022e-07 [cse]: 3.663e-05 [optimize_parallel_all_gather_comm]: 2.587e-05 [overlap_param_gather]: 4.38999e-06 [cconv]: 3.93e-05 [loop_unroll]: 0.00053689 [opt_after_cconv]: 0.00016188, [1] [Cycle 1]: 0.00015099, [7] [c_1]: 3.997e-05 [parameter_eliminate]: 6.33e-06 [updatestate_depend_eliminate]: 9.22999e-06 [updatestate_assign_eliminate]: 3.33e-06 [updatestate_loads_eliminate]: 3.43e-06 [cse]: 3.041e-05 [renormalize]: 8.09989e-07 [remove_dup_value]: 2.144e-05 [tuple_transform]: 0.00011675, [1] [Cycle 1]: 0.0001083, [4] [d_1]: 5.939e-05 [none_parameter_eliminate]: 1.89999e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 1.048e-05 [partial_unused_args_eliminate]: 4.68001e-06 [add_recomputation]: 7.185e-05 [cse_after_recomputation]: 3.827e-05, [1] [Cycle 1]: 3.035e-05, [1] [cse]: 1.815e-05 [environ_conv]: 1.083e-05 [swap_dp_allreduce_reducescatter]: 9.52001e-06 [bias_add_comm_swap]: 6.24999e-06 [label_micro_interleaved_index]: 8.65999e-06 [label_fine_grained_interleaved_index]: 5.17999e-06 [merge_cast_opt]: 4.05998e-06 [slice_recompute_activation]: 4.60999e-06 [micro_interleaved_order_control]: 4.65001e-06 [assign_add_opt]: 3.84002e-06 [ForceFp32Comm]: 3.48e-06 [remove_cast_before_assign_add]: 3.84002e-06 [full_micro_interleaved_order_control]: 4.80001e-06 [reorder_send_recv_between_fp_bp]: 5.82001e-06 [comm_op_add_attrs]: 3.58e-06 [add_comm_op_reuse_tag]: 3.23e-06 [interleave_split_concat_branches]: 3.66999e-06 [interleave_parallel_branches]: 3.95e-06 [overlap_opt_shard_in_pipeline]: 4.15999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.50999e-06 [control_data_broadcast_order]: 2.143e-05 [grouped_pairwise_exchange_alltoall]: 3.78001e-06 [offloading_packed_experts]: 7.36999e-06 [overlap_recompute_and_grad_model_parallel]: 8e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.89002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.74002e-06 [overlap_recompute_comm]: 5.47001e-06 [overlap_grad_ring_attention]: 7.34002e-06 [overlap_grad_flash_sp]: 2.869e-05 [begin_end_overlap_inline]: 3.13e-06 [split_matmul_comm_elemetwise]: 4.90001e-06 [split_layernorm_comm]: 3.98999e-06 [handle_group_info]: 3.7e-06 [symbol_engine_optimizer]: 0.0001209, [1] [Cycle 1]: 0.00011211, [6] [build]: 4.52e-06 [elim_shapecalc]: 1.513e-05 [elim_not_effective]: 1.778e-05 [opt_reshape]: 9.54999e-06 [fold_const_symbol]: 1.366e-05 [renormalize]: 3.9002e-07 [detach_backward]: 5.10999e-06 [pipeline_parallel_scheduler]: 2.59999e-06 [auto_monad_reorder]: 2.617e-05 [get_jit_bprop_graph]: 2.07001e-06 [rewriter_after_jit_bprop_graph]: 7.33999e-06 [opt_after_jit_grad]: 0.00057936 [validate]: 5.103e-05 Sums bootstrap : 0.000472s : 1.08% type_inference : 0.015997s : 36.55% event_method : 0.000024s : 0.06% auto_monad : 0.000072s : 0.16% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000038s : 0.09% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.08% optimize.rewriter_before_opt_a : 0.000091s : 0.21% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000054s : 0.12% optimize.opt_a.loop_unroll : 0.000040s : 0.09% optimize.opt_a.a_1 : 0.000885s : 2.02% optimize.opt_a.with_stream_mark : 0.000043s : 0.10% optimize.opt_a.recompute_prepare : 0.000021s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.02% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000255s : 0.58% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.04% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.04% optimize.opt_a.merge_send_recv : 0.000022s : 0.05% optimize.opt_a.auto_parallel : 0.000018s : 0.04% optimize.opt_a.parallel : 0.000030s : 0.07% optimize.opt_a.flash_sp : 0.000015s : 0.03% optimize.opt_a.merge_comm : 0.000010s : 0.02% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.05% optimize.opt_a.virtual_dataset : 0.000018s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.04% optimize.opt_a.virtual_output : 0.000015s : 0.04% optimize.opt_a.merge_forward : 0.000011s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000025s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000029s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000031s : 0.07% optimize.opt_a.a_after_grad : 0.000025s : 0.06% optimize.opt_a.renormalize : 0.021895s : 50.03% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.03% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000045s : 0.10% optimize.opt_a.cse : 0.000070s : 0.16% optimize.opt_a.a_3 : 0.000157s : 0.36% optimize.py_interpret_to_execute_after_opt_a : 0.000025s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.01% optimize.rewriter_after_opt_a : 0.000058s : 0.13% optimize.convert_after_rewriter : 0.000014s : 0.03% optimize.order_py_execute_after_rewriter : 0.000009s : 0.02% optimize.mutable_eliminate : 0.000800s : 1.83% optimize.opt_b.b_1 : 0.000226s : 0.52% optimize.opt_b.b_2 : 0.000011s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000037s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.06% optimize.overlap_param_gather : 0.000004s : 0.01% optimize.cconv : 0.000039s : 0.09% optimize.loop_unroll : 0.000537s : 1.23% optimize.opt_after_cconv.c_1 : 0.000040s : 0.09% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000030s : 0.07% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000021s : 0.05% optimize.tuple_transform.d_1 : 0.000059s : 0.14% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.02% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000072s : 0.16% optimize.cse_after_recomputation.cse : 0.000018s : 0.04% optimize.environ_conv : 0.000011s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000009s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000021s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000029s : 0.07% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000003s : 0.01% auto_monad_reorder : 0.000026s : 0.06% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.02% opt_after_jit_grad : 0.000579s : 1.32% validate : 0.000051s : 0.12% Time group info: ------[substitution.] 0.000242 39 13.52% : 0.000033s : 3: substitution.cast_eliminate 1.07% : 0.000003s : 3: substitution.elim_not_effective 0.84% : 0.000002s : 3: substitution.fold_const_symbol 3.07% : 0.000007s : 5: substitution.graph_param_transform 64.03% : 0.000155s : 4: substitution.inline 2.35% : 0.000006s : 6: substitution.j_node_and_user_rematch 3.31% : 0.000008s : 6: substitution.remove_not_recompute_node 3.00% : 0.000007s : 4: substitution.replace_old_param 5.13% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator 3.67% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.015922 2 93.13% : 0.014828s : 1: type_inference.infer 6.87% : 0.001094s : 1: type_inference.specialize ------[replace.] 0.000063 8 63.87% : 0.000040s : 4: replace.inline 36.13% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000163 8 93.49% : 0.000152s : 4: match.inline 6.51% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000260 1504 1.00% : 0.000003s : 15: predicate.accumulaten_eliminater 0.87% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.58% : 0.000001s : 10: predicate.addn_check_dump 0.90% : 0.000002s : 15: predicate.addn_zero_filter 0.87% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.06% : 0.000005s : 25: predicate.arithmetic_simplify 1.18% : 0.000003s : 15: predicate.cast_eliminate 0.83% : 0.000002s : 10: predicate.check_bprop_eliminate 0.62% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.61% : 0.000002s : 10: predicate.depend_value_elim 0.86% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.14% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.01% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 5: predicate.elim_not_effective 0.39% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.03% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.03% : 0.000003s : 20: predicate.environ_get_depend_swap 1.75% : 0.000005s : 30: predicate.environ_get_eliminate 1.03% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.23% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.27% : 0.000006s : 23: predicate.float_depend_g_call 0.56% : 0.000001s : 10: predicate.float_environ_get_switch 0.79% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.64% : 0.000002s : 10: predicate.get_grad_eliminate 0.28% : 0.000001s : 5: predicate.graph_param_transform 0.63% : 0.000002s : 10: predicate.incorporate_call 0.53% : 0.000001s : 10: predicate.incorporate_call_switch 6.00% : 0.000016s : 68: predicate.inline 1.12% : 0.000003s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.95% : 0.000002s : 10: predicate.less_batch_normalization 1.84% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.37% : 0.000006s : 44: predicate.load_eliminater 1.28% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.06% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.71% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.59% : 0.000002s : 10: predicate.merge_addn 0.68% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.76% : 0.000002s : 15: predicate.minmaximum_grad 1.57% : 0.000004s : 5: predicate.mutable_eliminate 0.38% : 0.000001s : 5: predicate.opt_reshape 0.41% : 0.000001s : 5: predicate.parallel_virtual_node 1.54% : 0.000004s : 23: predicate.partial_defer_inline 1.49% : 0.000004s : 24: predicate.partial_eliminate 0.96% : 0.000003s : 15: predicate.print_const_string_wrapper 0.61% : 0.000002s : 10: predicate.reduce_all_const_elim 1.14% : 0.000003s : 15: predicate.reduce_eliminate 2.40% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 10: predicate.remove_not_recompute_node 1.49% : 0.000004s : 29: predicate.replace_applicator 0.57% : 0.000001s : 10: predicate.replace_old_param 0.41% : 0.000001s : 5: predicate.reset_defer_inline 0.91% : 0.000002s : 15: predicate.reshape_eliminate 0.65% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 5: predicate.row_tensor_eliminate 1.04% : 0.000003s : 10: predicate.same_eliminate 0.48% : 0.000001s : 10: predicate.set_cell_output_no_recompute 1.05% : 0.000003s : 10: predicate.shard_identity_eliminate 0.85% : 0.000002s : 10: predicate.special_op_eliminate 0.70% : 0.000002s : 10: predicate.specialize_transform 1.21% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.51% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.34% : 0.000004s : 23: predicate.switch_defer_inline 1.87% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.60% : 0.000012s : 74: predicate.switch_simplify 0.91% : 0.000002s : 15: predicate.tile_eliminate 0.88% : 0.000002s : 15: predicate.transpose_eliminate 1.50% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.43% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.42% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.42% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.62% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.19% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 2.94% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.53% : 0.000001s : 5: predicate.value_based_eliminate 0.76% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.55% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000871 11 37.77% : 0.000329s : 5: func_graph_cloner_run.FuncGraphClonerGraph 62.23% : 0.000542s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.104621 192 0.01% : 0.000006s : 1: ForceFp32Comm 3.52% : 0.003684s : 1: add_attr 3.51% : 0.003668s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.07% : 0.000077s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.08% : 0.000082s : 1: auto_monad 0.03% : 0.000035s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.50% : 0.000521s : 1: bootstrap 0.04% : 0.000043s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.02% : 0.000025s : 1: control_data_broadcast_order 0.02% : 0.000018s : 1: convert_after_rewriter 0.04% : 0.000043s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000030s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.04% : 0.000037s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000012s : 1: graph_reusing 0.01% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000007s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000012s : 1: label_micro_interleaved_index 0.52% : 0.000545s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 0.77% : 0.000809s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000027s : 1: opt.transform.mutable_eliminate 1.38% : 0.001447s : 78: opt.transform.opt_a 0.04% : 0.000038s : 1: opt.transform.opt_after_cconv 0.03% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.15% : 0.000159s : 28: opt.transform.opt_b 0.06% : 0.000067s : 2: opt.transform.opt_trans_graph 0.05% : 0.000051s : 4: opt.transform.symbol_engine_opt 23.71% : 0.024811s : 1: opt_a 0.16% : 0.000166s : 1: opt_after_cconv 0.57% : 0.000593s : 1: opt_after_jit_grad 0.36% : 0.000378s : 1: opt_b 27.06% : 0.028310s : 1: optimize 0.03% : 0.000030s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.03% : 0.000032s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000011s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.04% : 0.000046s : 1: pre_auto_parallel 0.04% : 0.000037s : 1: py_interpret_to_execute 0.03% : 0.000029s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.02% : 0.000025s : 1: remove_dup_value 20.36% : 0.021301s : 1: renormalize.infer 0.55% : 0.000574s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000064s : 1: rewriter_after_opt_a 0.09% : 0.000095s : 1: rewriter_before_opt_a 0.01% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.12% : 0.000124s : 1: symbol_engine_optimizer 0.11% : 0.000120s : 1: tuple_transform 15.34% : 0.016054s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:29.678.310 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0289501, [21] [bootstrap]: 0.00042169 [type_inference]: 0.00631327 [event_method]: 1.888e-05 [auto_monad]: 6.287e-05 [graph_reusing]: 6.21e-06 [inline]: 2.98e-06 [add_attr]: 0.0159247, [1] [add_attr_with_inline]: 0.0159128, [1] [Cycle 1]: 7.811e-05, [2] [tag_attr]: 2.54e-05 [meta_addattr_fg_expand]: 5.94e-06 [parallel-infer-symbol]: 3.91999e-06 [pre_auto_parallel]: 4.236e-05 [insert-virtual-dataset]: 2.71e-06 [parallel-infer-symbol-second]: 8.80013e-07 [dataset_repeat_opt]: 2.17001e-06 [pipeline_split]: 2.09e-06 [optimize]: 0.00546505, [53] [py_interpret_to_execute]: 3.447e-05 [rewriter_before_opt_a]: 0.00010836 [opt_a]: 0.003261, [2] [Cycle 1]: 0.00244973, [45] [expand_dump_flag]: 3.15998e-06 [switch_simplify]: 4.811e-05 [loop_unroll]: 3.134e-05 [a_1]: 0.00073309 [with_stream_mark]: 1.726e-05 [recompute_prepare]: 1.043e-05 [updatestate_depend_eliminate]: 4.42998e-06 [updatestate_assign_eliminate]: 3.73999e-06 [updatestate_loads_eliminate]: 3.83001e-06 [parameter_eliminate]: 1.88002e-06 [a_2]: 0.00011079 [accelerated_algorithm]: 8.02e-06 [shard]: 2.04999e-06 [meta_shard_fg_expand]: 1.95001e-06 [shard_inline]: 7.65e-06 [merge_send_recv]: 9.71e-06 [auto_parallel]: 8.14997e-06 [parallel]: 2.036e-05 [flash_sp]: 8.72e-06 [merge_comm]: 4.55001e-06 [allreduce_fusion]: 4.17e-06 [matmul_add_comm_reduction]: 1.077e-05 [allreduce_slice_to_reducescatter]: 7.30011e-07 [virtual_shard_identity]: 1.04e-05 [virtual_dataset]: 8.31002e-06 [get_grad_eliminate_]: 7.63999e-06 [virtual_output]: 8.90001e-06 [merge_forward]: 5.37001e-06 [cell_reuse_recompute_pass]: 1.30001e-06 [offload_activation]: 1.133e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.695e-05 [merge_recompute_call_nodes]: 1.57001e-06 [before_grad]: 1.345e-05 [set_forward_comm_id_for_comm_node_pass]: 4.39998e-06 [meta_fg_expand]: 3.35e-06 [flash_sp_send_recv_attached]: 2.51e-06 [receive_attached]: 2.04999e-06 [after_resolve]: 1.288e-05 [a_after_grad]: 1.197e-05 [renormalize]: 0.00088545 [add_forward_monad_depend]: 6.52001e-06 [auto_monad_grad]: 2.47001e-06 [auto_monad_eliminator]: 1.875e-05 [cse]: 3.938e-05 [a_3]: 6.004e-05 [Cycle 2]: 0.00080015, [45] [expand_dump_flag]: 1.44e-06 [switch_simplify]: 9.05999e-06 [loop_unroll]: 7.43e-06 [a_1]: 0.00019047 [with_stream_mark]: 1.445e-05 [recompute_prepare]: 8.40999e-06 [updatestate_depend_eliminate]: 4.05e-06 [updatestate_assign_eliminate]: 3.03e-06 [updatestate_loads_eliminate]: 3.32002e-06 [parameter_eliminate]: 1.68002e-06 [a_2]: 9.936e-05 [accelerated_algorithm]: 8.28001e-06 [shard]: 2.21998e-06 [meta_shard_fg_expand]: 1.55999e-06 [shard_inline]: 8.18999e-06 [merge_send_recv]: 6.62002e-06 [auto_parallel]: 6.70002e-06 [parallel]: 6.14001e-06 [flash_sp]: 3.8e-06 [merge_comm]: 8.03999e-06 [allreduce_fusion]: 4.39002e-06 [matmul_add_comm_reduction]: 7.71999e-06 [allreduce_slice_to_reducescatter]: 8.50006e-07 [virtual_shard_identity]: 9.39998e-06 [virtual_dataset]: 8.13999e-06 [get_grad_eliminate_]: 7.25998e-06 [virtual_output]: 7e-06 [merge_forward]: 4.17998e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 9.10999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.444e-05 [merge_recompute_call_nodes]: 1.04e-06 [before_grad]: 1.111e-05 [set_forward_comm_id_for_comm_node_pass]: 4.52e-06 [meta_fg_expand]: 3.12002e-06 [flash_sp_send_recv_attached]: 9.70002e-07 [receive_attached]: 1.33002e-06 [after_resolve]: 1.253e-05 [a_after_grad]: 1.132e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.45999e-06 [auto_monad_grad]: 1.15999e-06 [auto_monad_eliminator]: 8.74e-06 [cse]: 1.817e-05 [a_3]: 4.535e-05 [py_interpret_to_execute_after_opt_a]: 1.261e-05 [slice_cell_reuse_recomputed_activation]: 1.86998e-06 [rewriter_after_opt_a]: 4.125e-05 [convert_after_rewriter]: 7.77e-06 [order_py_execute_after_rewriter]: 5.97001e-06 [mutable_eliminate]: 0.00056068 [opt_b]: 0.00025609, [1] [Cycle 1]: 0.00024989, [7] [b_1]: 0.00016677 [b_2]: 9.70002e-06 [updatestate_depend_eliminate]: 6.82002e-06 [updatestate_assign_eliminate]: 2.89999e-06 [updatestate_loads_eliminate]: 2.90998e-06 [renormalize]: 5.60016e-07 [cse]: 2.412e-05 [optimize_parallel_all_gather_comm]: 1.896e-05 [overlap_param_gather]: 1.85001e-06 [cconv]: 2.643e-05 [loop_unroll]: 0.0004238 [opt_after_cconv]: 0.00011419, [1] [Cycle 1]: 0.00010782, [7] [c_1]: 3.669e-05 [parameter_eliminate]: 3.48e-06 [updatestate_depend_eliminate]: 6.34001e-06 [updatestate_assign_eliminate]: 3.03e-06 [updatestate_loads_eliminate]: 3.03e-06 [cse]: 2.201e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.497e-05 [tuple_transform]: 8.328e-05, [1] [Cycle 1]: 7.901e-05, [4] [d_1]: 5.144e-05 [none_parameter_eliminate]: 1.59e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 7.97e-06 [partial_unused_args_eliminate]: 1.70001e-06 [add_recomputation]: 5.721e-05 [cse_after_recomputation]: 2.475e-05, [1] [Cycle 1]: 2.028e-05, [1] [cse]: 1.496e-05 [environ_conv]: 6.40002e-06 [swap_dp_allreduce_reducescatter]: 6.24001e-06 [bias_add_comm_swap]: 2.33998e-06 [label_micro_interleaved_index]: 4.38999e-06 [label_fine_grained_interleaved_index]: 2.61e-06 [merge_cast_opt]: 1.40999e-06 [slice_recompute_activation]: 2.36998e-06 [micro_interleaved_order_control]: 2.44999e-06 [assign_add_opt]: 1.43002e-06 [ForceFp32Comm]: 8.30012e-07 [remove_cast_before_assign_add]: 1.02998e-06 [full_micro_interleaved_order_control]: 1.96003e-06 [reorder_send_recv_between_fp_bp]: 2.63998e-06 [comm_op_add_attrs]: 1.39998e-06 [add_comm_op_reuse_tag]: 9.29984e-07 [interleave_split_concat_branches]: 1.44e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.14e-06 [overlap_opt_shard_grad_in_pipeline]: 2.46998e-06 [control_data_broadcast_order]: 1.549e-05 [grouped_pairwise_exchange_alltoall]: 1.63002e-06 [offloading_packed_experts]: 4.75999e-06 [overlap_recompute_and_grad_model_parallel]: 5.10001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.44e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30001e-06 [overlap_recompute_comm]: 2.62001e-06 [overlap_grad_ring_attention]: 4.47e-06 [overlap_grad_flash_sp]: 2.304e-05 [begin_end_overlap_inline]: 7.7e-07 [split_matmul_comm_elemetwise]: 2.12999e-06 [split_layernorm_comm]: 1.84e-06 [handle_group_info]: 1.32e-06 [symbol_engine_optimizer]: 8.22e-05, [1] [Cycle 1]: 7.75e-05, [6] [build]: 2.98e-06 [elim_shapecalc]: 1.095e-05 [elim_not_effective]: 1.45e-05 [opt_reshape]: 8.33999e-06 [fold_const_symbol]: 1.226e-05 [renormalize]: 2.10013e-07 [detach_backward]: 2.00002e-06 [pipeline_parallel_scheduler]: 1.47001e-06 [auto_monad_reorder]: 1.973e-05 [get_jit_bprop_graph]: 2.01e-06 [rewriter_after_jit_bprop_graph]: 4.35e-06 [opt_after_jit_grad]: 0.00046228 [validate]: 4.57e-05 Sums bootstrap : 0.000422s : 3.49% type_inference : 0.006313s : 52.30% event_method : 0.000019s : 0.16% auto_monad : 0.000063s : 0.52% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.21% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000042s : 0.35% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000034s : 0.29% optimize.rewriter_before_opt_a : 0.000108s : 0.90% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000057s : 0.47% optimize.opt_a.loop_unroll : 0.000039s : 0.32% optimize.opt_a.a_1 : 0.000924s : 7.65% optimize.opt_a.with_stream_mark : 0.000032s : 0.26% optimize.opt_a.recompute_prepare : 0.000019s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000210s : 1.74% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.14% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.13% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000027s : 0.22% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000013s : 0.10% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.16% optimize.opt_a.virtual_dataset : 0.000016s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000016s : 0.13% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.21% optimize.opt_a.a_after_grad : 0.000023s : 0.19% optimize.opt_a.renormalize : 0.000886s : 7.34% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.23% optimize.opt_a.cse : 0.000058s : 0.48% optimize.opt_a.a_3 : 0.000105s : 0.87% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000041s : 0.34% optimize.convert_after_rewriter : 0.000008s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000561s : 4.65% optimize.opt_b.b_1 : 0.000167s : 1.38% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000026s : 0.22% optimize.loop_unroll : 0.000424s : 3.51% optimize.opt_after_cconv.c_1 : 0.000037s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.12% optimize.tuple_transform.d_1 : 0.000051s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000057s : 0.47% optimize.cse_after_recomputation.cse : 0.000015s : 0.12% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000023s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000020s : 0.16% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000462s : 3.83% validate : 0.000046s : 0.38% Time group info: ------[substitution.] 0.000251 39 11.34% : 0.000028s : 3: substitution.cast_eliminate 0.91% : 0.000002s : 3: substitution.elim_not_effective 0.67% : 0.000002s : 3: substitution.fold_const_symbol 2.62% : 0.000007s : 5: substitution.graph_param_transform 68.83% : 0.000172s : 4: substitution.inline 1.98% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.38% : 0.000006s : 6: substitution.remove_not_recompute_node 2.71% : 0.000007s : 4: substitution.replace_old_param 5.54% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator 3.05% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006251 2 88.34% : 0.005522s : 1: type_inference.infer 11.66% : 0.000729s : 1: type_inference.specialize ------[replace.] 0.000066 8 62.42% : 0.000041s : 4: replace.inline 37.58% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000182 8 93.42% : 0.000170s : 4: match.inline 6.58% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000245 1504 0.89% : 0.000002s : 15: predicate.accumulaten_eliminater 0.63% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.61% : 0.000001s : 10: predicate.addn_check_dump 0.90% : 0.000002s : 15: predicate.addn_zero_filter 0.83% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.12% : 0.000005s : 25: predicate.arithmetic_simplify 1.07% : 0.000003s : 15: predicate.cast_eliminate 0.60% : 0.000001s : 10: predicate.check_bprop_eliminate 0.58% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.71% : 0.000002s : 10: predicate.depend_value_elim 1.08% : 0.000003s : 15: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 5: predicate.elim_not_effective 0.43% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.07% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_depend_swap 1.82% : 0.000004s : 30: predicate.environ_get_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.41% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.18% : 0.000005s : 23: predicate.float_depend_g_call 0.60% : 0.000001s : 10: predicate.float_environ_get_switch 0.86% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.74% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.68% : 0.000002s : 10: predicate.incorporate_call 0.58% : 0.000001s : 10: predicate.incorporate_call_switch 6.64% : 0.000016s : 68: predicate.inline 0.80% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.93% : 0.000002s : 10: predicate.less_batch_normalization 1.75% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.55% : 0.000006s : 44: predicate.load_eliminater 0.72% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.05% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.61% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.63% : 0.000002s : 10: predicate.merge_addn 0.59% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.62% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 15: predicate.minmaximum_grad 0.86% : 0.000002s : 5: predicate.mutable_eliminate 0.44% : 0.000001s : 5: predicate.opt_reshape 0.42% : 0.000001s : 5: predicate.parallel_virtual_node 1.62% : 0.000004s : 23: predicate.partial_defer_inline 1.67% : 0.000004s : 24: predicate.partial_eliminate 0.90% : 0.000002s : 15: predicate.print_const_string_wrapper 0.78% : 0.000002s : 10: predicate.reduce_all_const_elim 1.17% : 0.000003s : 15: predicate.reduce_eliminate 2.58% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 10: predicate.remove_not_recompute_node 1.39% : 0.000003s : 29: predicate.replace_applicator 0.40% : 0.000001s : 10: predicate.replace_old_param 0.25% : 0.000001s : 5: predicate.reset_defer_inline 1.04% : 0.000003s : 15: predicate.reshape_eliminate 0.72% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 5: predicate.row_tensor_eliminate 0.79% : 0.000002s : 10: predicate.same_eliminate 0.46% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 10: predicate.shard_identity_eliminate 0.75% : 0.000002s : 10: predicate.special_op_eliminate 0.79% : 0.000002s : 10: predicate.specialize_transform 0.82% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.43% : 0.000004s : 23: predicate.switch_defer_inline 2.08% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.07% : 0.000012s : 74: predicate.switch_simplify 0.89% : 0.000002s : 15: predicate.tile_eliminate 0.94% : 0.000002s : 15: predicate.transpose_eliminate 1.66% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.46% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.72% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.52% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.19% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.54% : 0.000001s : 5: predicate.value_based_eliminate 0.68% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.66% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000601 11 52.12% : 0.000313s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.88% : 0.000288s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.052891 192 0.01% : 0.000004s : 1: ForceFp32Comm 30.12% : 0.015931s : 1: add_attr 30.09% : 0.015917s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.12% : 0.000061s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.13% : 0.000069s : 1: auto_monad 0.04% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.84% : 0.000445s : 1: bootstrap 0.06% : 0.000030s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000019s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.05% : 0.000028s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000010s : 1: environ_conv 0.05% : 0.000024s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.82% : 0.000431s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.08% : 0.000569s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.03% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000016s : 1: opt.transform.mutable_eliminate 2.76% : 0.001461s : 78: opt.transform.opt_a 0.07% : 0.000035s : 1: opt.transform.opt_after_cconv 0.05% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.27% : 0.000145s : 28: opt.transform.opt_b 0.11% : 0.000057s : 2: opt.transform.opt_trans_graph 0.08% : 0.000042s : 4: opt.transform.symbol_engine_opt 6.17% : 0.003264s : 1: opt_a 0.22% : 0.000118s : 1: opt_after_cconv 0.89% : 0.000471s : 1: opt_after_jit_grad 0.49% : 0.000260s : 1: opt_b 10.34% : 0.005470s : 1: optimize 0.04% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.05% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.09% : 0.000047s : 1: pre_auto_parallel 0.07% : 0.000039s : 1: py_interpret_to_execute 0.03% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000018s : 1: remove_dup_value 0.90% : 0.000477s : 1: renormalize.infer 0.76% : 0.000400s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000046s : 1: rewriter_after_opt_a 0.21% : 0.000113s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.16% : 0.000085s : 1: symbol_engine_optimizer 0.16% : 0.000086s : 1: tuple_transform 11.97% : 0.006329s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:30.383.671 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:30.383.945 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0200053, [21] [bootstrap]: 0.00050661 [type_inference]: 0.00635642 [event_method]: 2.192e-05 [auto_monad]: 7.104e-05 [graph_reusing]: 5.85002e-06 [inline]: 2.24001e-06 [add_attr]: 0.0037479, [1] [add_attr_with_inline]: 0.0037343, [1] [Cycle 1]: 9.523e-05, [2] [tag_attr]: 2.494e-05 [meta_addattr_fg_expand]: 6.50002e-06 [parallel-infer-symbol]: 3.5e-06 [pre_auto_parallel]: 4.145e-05 [insert-virtual-dataset]: 2.60002e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.17001e-06 [pipeline_split]: 1.80001e-06 [optimize]: 0.00761167, [53] [py_interpret_to_execute]: 4.241e-05 [rewriter_before_opt_a]: 0.00011034 [opt_a]: 0.00421903, [2] [Cycle 1]: 0.003021, [45] [expand_dump_flag]: 3.88001e-06 [switch_simplify]: 4.549e-05 [loop_unroll]: 3.252e-05 [a_1]: 0.00076489 [with_stream_mark]: 2.359e-05 [recompute_prepare]: 1.309e-05 [updatestate_depend_eliminate]: 6.74001e-06 [updatestate_assign_eliminate]: 5.02e-06 [updatestate_loads_eliminate]: 4.72998e-06 [parameter_eliminate]: 2.66999e-06 [a_2]: 0.00015045 [accelerated_algorithm]: 1.08e-05 [shard]: 2.91e-06 [meta_shard_fg_expand]: 2.57001e-06 [shard_inline]: 9.12001e-06 [merge_send_recv]: 1.126e-05 [auto_parallel]: 1.154e-05 [parallel]: 2.175e-05 [flash_sp]: 1.009e-05 [merge_comm]: 5.22999e-06 [allreduce_fusion]: 5.07999e-06 [matmul_add_comm_reduction]: 1.225e-05 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 1.172e-05 [virtual_dataset]: 1.537e-05 [get_grad_eliminate_]: 1.064e-05 [virtual_output]: 9.17999e-06 [merge_forward]: 6.39999e-06 [cell_reuse_recompute_pass]: 1.49998e-06 [offload_activation]: 1.343e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.49e-05 [merge_recompute_call_nodes]: 1.64998e-06 [before_grad]: 1.737e-05 [set_forward_comm_id_for_comm_node_pass]: 5.30001e-06 [meta_fg_expand]: 4.98001e-06 [flash_sp_send_recv_attached]: 3.26999e-06 [receive_attached]: 2.63e-06 [after_resolve]: 1.595e-05 [a_after_grad]: 1.412e-05 [renormalize]: 0.00107058 [add_forward_monad_depend]: 7.99002e-06 [auto_monad_grad]: 3.06001e-06 [auto_monad_eliminator]: 2.505e-05 [cse]: 4.877e-05 [a_3]: 9.276e-05 [Cycle 2]: 0.00117963, [45] [expand_dump_flag]: 2.22999e-06 [switch_simplify]: 1.192e-05 [loop_unroll]: 9.66e-06 [a_1]: 0.00023869 [with_stream_mark]: 2.087e-05 [recompute_prepare]: 1.068e-05 [updatestate_depend_eliminate]: 6.39999e-06 [updatestate_assign_eliminate]: 4.15e-06 [updatestate_loads_eliminate]: 4.05e-06 [parameter_eliminate]: 2.00002e-06 [a_2]: 0.00014355 [accelerated_algorithm]: 9.92999e-06 [shard]: 2.43e-06 [meta_shard_fg_expand]: 2.75002e-06 [shard_inline]: 9.39998e-06 [merge_send_recv]: 1.137e-05 [auto_parallel]: 1.141e-05 [parallel]: 8.42e-06 [flash_sp]: 4.43001e-06 [merge_comm]: 8.59e-06 [allreduce_fusion]: 5.05001e-06 [matmul_add_comm_reduction]: 1.184e-05 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 1.119e-05 [virtual_dataset]: 9.79e-06 [get_grad_eliminate_]: 9.87999e-06 [virtual_output]: 9.20001e-06 [merge_forward]: 6.46999e-06 [cell_reuse_recompute_pass]: 2.86999e-06 [offload_activation]: 1.276e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.307e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 1.666e-05 [set_forward_comm_id_for_comm_node_pass]: 5.76998e-06 [meta_fg_expand]: 4.75001e-06 [flash_sp_send_recv_attached]: 1.84e-06 [receive_attached]: 2.04e-06 [after_resolve]: 1.564e-05 [a_after_grad]: 1.395e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 2.52001e-06 [auto_monad_grad]: 1.91e-06 [auto_monad_eliminator]: 1.521e-05 [cse]: 3.464e-05 [a_3]: 7.051e-05 [py_interpret_to_execute_after_opt_a]: 2.431e-05 [slice_cell_reuse_recomputed_activation]: 6.00002e-06 [rewriter_after_opt_a]: 6.405e-05 [convert_after_rewriter]: 1.363e-05 [order_py_execute_after_rewriter]: 9.83002e-06 [mutable_eliminate]: 0.00085524 [opt_b]: 0.00041924, [1] [Cycle 1]: 0.00040564, [7] [b_1]: 0.00025877 [b_2]: 1.128e-05 [updatestate_depend_eliminate]: 1.408e-05 [updatestate_assign_eliminate]: 4.01001e-06 [updatestate_loads_eliminate]: 4.76002e-06 [renormalize]: 7.50006e-07 [cse]: 4.87e-05 [optimize_parallel_all_gather_comm]: 2.814e-05 [overlap_param_gather]: 4.50001e-06 [cconv]: 4.065e-05 [loop_unroll]: 0.00069835 [opt_after_cconv]: 0.00019192, [1] [Cycle 1]: 0.00017993, [7] [c_1]: 4.808e-05 [parameter_eliminate]: 5.68997e-06 [updatestate_depend_eliminate]: 1.063e-05 [updatestate_assign_eliminate]: 3.82998e-06 [updatestate_loads_eliminate]: 4.23999e-06 [cse]: 4.446e-05 [renormalize]: 5.89993e-07 [remove_dup_value]: 5.926e-05 [tuple_transform]: 0.00012264, [1] [Cycle 1]: 0.0001144, [4] [d_1]: 6.806e-05 [none_parameter_eliminate]: 1.75001e-06 [renormalize]: 6.59988e-07 [switch_simplify]: 1.081e-05 [partial_unused_args_eliminate]: 4.74002e-06 [add_recomputation]: 7.601e-05 [cse_after_recomputation]: 4e-05, [1] [Cycle 1]: 3.181e-05, [1] [cse]: 2.131e-05 [environ_conv]: 1.124e-05 [swap_dp_allreduce_reducescatter]: 1.045e-05 [bias_add_comm_swap]: 5.32999e-06 [label_micro_interleaved_index]: 9.46e-06 [label_fine_grained_interleaved_index]: 5.94e-06 [merge_cast_opt]: 4.49998e-06 [slice_recompute_activation]: 4.82998e-06 [micro_interleaved_order_control]: 5.24e-06 [assign_add_opt]: 4.27003e-06 [ForceFp32Comm]: 3.34001e-06 [remove_cast_before_assign_add]: 3.75e-06 [full_micro_interleaved_order_control]: 4.84e-06 [reorder_send_recv_between_fp_bp]: 5.15999e-06 [comm_op_add_attrs]: 3.52002e-06 [add_comm_op_reuse_tag]: 3.38999e-06 [interleave_split_concat_branches]: 3.77002e-06 [interleave_parallel_branches]: 3.73999e-06 [overlap_opt_shard_in_pipeline]: 4.20999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.55001e-06 [control_data_broadcast_order]: 2.32e-05 [grouped_pairwise_exchange_alltoall]: 4.35999e-06 [offloading_packed_experts]: 9.02e-06 [overlap_recompute_and_grad_model_parallel]: 8.40999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.76001e-06 [overlap_recompute_allgather_and_fa_grad]: 4.23999e-06 [overlap_recompute_comm]: 5.49e-06 [overlap_grad_ring_attention]: 7.82998e-06 [overlap_grad_flash_sp]: 3.306e-05 [begin_end_overlap_inline]: 2.89001e-06 [split_matmul_comm_elemetwise]: 4.87998e-06 [split_layernorm_comm]: 4.18001e-06 [handle_group_info]: 4.22e-06 [symbol_engine_optimizer]: 0.00012658, [1] [Cycle 1]: 0.00011902, [6] [build]: 5.54e-06 [elim_shapecalc]: 1.606e-05 [elim_not_effective]: 2.094e-05 [opt_reshape]: 1.042e-05 [fold_const_symbol]: 1.467e-05 [renormalize]: 2.19996e-07 [detach_backward]: 5.50001e-06 [pipeline_parallel_scheduler]: 2.14999e-06 [auto_monad_reorder]: 3.179e-05 [get_jit_bprop_graph]: 2.03002e-06 [rewriter_after_jit_bprop_graph]: 7.08e-06 [opt_after_jit_grad]: 0.00083361 [validate]: 5.896e-05 Sums bootstrap : 0.000507s : 3.56% type_inference : 0.006356s : 44.61% event_method : 0.000022s : 0.15% auto_monad : 0.000071s : 0.50% graph_reusing : 0.000006s : 0.04% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000041s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000042s : 0.30% optimize.rewriter_before_opt_a : 0.000110s : 0.77% optimize.opt_a.expand_dump_flag : 0.000006s : 0.04% optimize.opt_a.switch_simplify : 0.000057s : 0.40% optimize.opt_a.loop_unroll : 0.000042s : 0.30% optimize.opt_a.a_1 : 0.001004s : 7.04% optimize.opt_a.with_stream_mark : 0.000044s : 0.31% optimize.opt_a.recompute_prepare : 0.000024s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000013s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.06% optimize.opt_a.parameter_eliminate : 0.000005s : 0.03% optimize.opt_a.a_2 : 0.000294s : 2.06% optimize.opt_a.accelerated_algorithm : 0.000021s : 0.15% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000019s : 0.13% optimize.opt_a.merge_send_recv : 0.000023s : 0.16% optimize.opt_a.auto_parallel : 0.000023s : 0.16% optimize.opt_a.parallel : 0.000030s : 0.21% optimize.opt_a.flash_sp : 0.000015s : 0.10% optimize.opt_a.merge_comm : 0.000014s : 0.10% optimize.opt_a.allreduce_fusion : 0.000010s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.16% optimize.opt_a.virtual_dataset : 0.000025s : 0.18% optimize.opt_a.get_grad_eliminate_ : 0.000021s : 0.14% optimize.opt_a.virtual_output : 0.000018s : 0.13% optimize.opt_a.merge_forward : 0.000013s : 0.09% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000026s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000048s : 0.34% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000034s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.08% optimize.opt_a.meta_fg_expand : 0.000010s : 0.07% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000005s : 0.03% optimize.opt_a.after_resolve : 0.000032s : 0.22% optimize.opt_a.a_after_grad : 0.000028s : 0.20% optimize.opt_a.renormalize : 0.001071s : 7.51% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000040s : 0.28% optimize.opt_a.cse : 0.000083s : 0.59% optimize.opt_a.a_3 : 0.000163s : 1.15% optimize.py_interpret_to_execute_after_opt_a : 0.000024s : 0.17% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.04% optimize.rewriter_after_opt_a : 0.000064s : 0.45% optimize.convert_after_rewriter : 0.000014s : 0.10% optimize.order_py_execute_after_rewriter : 0.000010s : 0.07% optimize.mutable_eliminate : 0.000855s : 6.00% optimize.opt_b.b_1 : 0.000259s : 1.82% optimize.opt_b.b_2 : 0.000011s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000014s : 0.10% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000005s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000049s : 0.34% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.03% optimize.cconv : 0.000041s : 0.29% optimize.loop_unroll : 0.000698s : 4.90% optimize.opt_after_cconv.c_1 : 0.000048s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000011s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000044s : 0.31% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000059s : 0.42% optimize.tuple_transform.d_1 : 0.000068s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000001s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.03% optimize.add_recomputation : 0.000076s : 0.53% optimize.cse_after_recomputation.cse : 0.000021s : 0.15% optimize.environ_conv : 0.000011s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000009s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.03% optimize.control_data_broadcast_order : 0.000023s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000009s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.05% optimize.overlap_grad_flash_sp : 0.000033s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.03% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000006s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000021s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000032s : 0.22% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.05% opt_after_jit_grad : 0.000834s : 5.85% validate : 0.000059s : 0.41% Time group info: ------[substitution.] 0.000277 49 15.55% : 0.000043s : 6: substitution.cast_eliminate 1.12% : 0.000003s : 4: substitution.elim_not_effective 0.74% : 0.000002s : 4: substitution.fold_const_symbol 2.87% : 0.000008s : 6: substitution.graph_param_transform 63.32% : 0.000175s : 4: substitution.inline 2.49% : 0.000007s : 8: substitution.j_node_and_user_rematch 3.16% : 0.000009s : 8: substitution.remove_not_recompute_node 2.55% : 0.000007s : 4: substitution.replace_old_param 5.12% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator 3.07% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006293 2 88.23% : 0.005552s : 1: type_inference.infer 11.77% : 0.000741s : 1: type_inference.specialize ------[replace.] 0.000068 8 64.48% : 0.000044s : 4: replace.inline 35.52% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000184 8 93.57% : 0.000172s : 4: match.inline 6.43% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000313 1730 0.93% : 0.000003s : 17: predicate.accumulaten_eliminater 1.42% : 0.000004s : 6: predicate.ad_related_special_op_eliminate 0.54% : 0.000002s : 12: predicate.addn_check_dump 0.85% : 0.000003s : 17: predicate.addn_zero_filter 0.72% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.07% : 0.000006s : 29: predicate.arithmetic_simplify 1.06% : 0.000003s : 17: predicate.cast_eliminate 0.61% : 0.000002s : 12: predicate.check_bprop_eliminate 0.53% : 0.000002s : 12: predicate.compare_switch_simplify 0.17% : 0.000001s : 6: predicate.const_output_eliminate 0.70% : 0.000002s : 12: predicate.depend_value_elim 0.96% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.94% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.80% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.18% : 0.000004s : 12: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 6: predicate.elim_not_effective 0.61% : 0.000002s : 6: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000004s : 23: predicate.environ_add_const_eliminate 1.15% : 0.000004s : 23: predicate.environ_get_add_eliminate 1.07% : 0.000003s : 23: predicate.environ_get_depend_swap 1.76% : 0.000005s : 35: predicate.environ_get_eliminate 1.23% : 0.000004s : 23: predicate.environ_get_set_eliminate 1.24% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.08% : 0.000007s : 25: predicate.float_depend_g_call 0.58% : 0.000002s : 12: predicate.float_environ_get_switch 0.93% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 6: predicate.fold_const_symbol 0.79% : 0.000002s : 12: predicate.get_grad_eliminate 0.27% : 0.000001s : 6: predicate.graph_param_transform 0.61% : 0.000002s : 12: predicate.incorporate_call 0.51% : 0.000002s : 12: predicate.incorporate_call_switch 6.11% : 0.000019s : 78: predicate.inline 0.83% : 0.000003s : 12: predicate.inline_without_move 0.33% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.92% : 0.000003s : 12: predicate.less_batch_normalization 1.80% : 0.000006s : 33: predicate.list_to_tuple_eliminator_ 2.68% : 0.000008s : 50: predicate.load_eliminater 1.24% : 0.000004s : 6: predicate.loop_unroll_after_grad 1.90% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.62% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.76% : 0.000002s : 12: predicate.merge_addn 0.63% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 17: predicate.minmaximum_grad 1.67% : 0.000005s : 6: predicate.mutable_eliminate 0.38% : 0.000001s : 6: predicate.opt_reshape 0.47% : 0.000001s : 6: predicate.parallel_virtual_node 1.49% : 0.000005s : 25: predicate.partial_defer_inline 1.44% : 0.000004s : 27: predicate.partial_eliminate 0.88% : 0.000003s : 17: predicate.print_const_string_wrapper 0.67% : 0.000002s : 12: predicate.reduce_all_const_elim 1.06% : 0.000003s : 17: predicate.reduce_eliminate 2.35% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 12: predicate.remove_not_recompute_node 1.35% : 0.000004s : 33: predicate.replace_applicator 0.43% : 0.000001s : 12: predicate.replace_old_param 0.35% : 0.000001s : 6: predicate.reset_defer_inline 1.10% : 0.000003s : 17: predicate.reshape_eliminate 0.65% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 6: predicate.row_tensor_eliminate 0.80% : 0.000002s : 12: predicate.same_eliminate 0.39% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.93% : 0.000003s : 12: predicate.shard_identity_eliminate 0.81% : 0.000003s : 12: predicate.special_op_eliminate 0.75% : 0.000002s : 12: predicate.specialize_transform 1.25% : 0.000004s : 12: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.21% : 0.000004s : 25: predicate.switch_defer_inline 1.82% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.45% : 0.000014s : 81: predicate.switch_simplify 0.78% : 0.000002s : 17: predicate.tile_eliminate 0.99% : 0.000003s : 17: predicate.transpose_eliminate 1.69% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.43% : 0.000004s : 29: predicate.tuple_list_get_item_const_eliminator 1.62% : 0.000005s : 29: predicate.tuple_list_get_item_depend_reorder 2.89% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.60% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 2.38% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.80% : 0.000006s : 33: predicate.tuple_to_list_eliminator_ 2.29% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.07% : 0.000010s : 62: predicate.updatestate_useless_node_eliminater 0.52% : 0.000002s : 6: predicate.value_based_eliminate 0.78% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.81% : 0.000003s : 12: predicate.virtual_output_eliminate 0.35% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000618 11 51.88% : 0.000321s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.12% : 0.000297s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.034395 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.93% : 0.003760s : 1: add_attr 10.87% : 0.003739s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000081s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.23% : 0.000080s : 1: auto_monad 0.12% : 0.000042s : 1: auto_monad_reorder 0.02% : 0.000007s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.61% : 0.000553s : 1: bootstrap 0.13% : 0.000044s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000026s : 1: control_data_broadcast_order 0.05% : 0.000017s : 1: convert_after_rewriter 0.13% : 0.000043s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000028s : 1: detach_backward 0.04% : 0.000014s : 1: environ_conv 0.10% : 0.000033s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.04% : 0.000012s : 1: label_micro_interleaved_index 2.05% : 0.000707s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 2.52% : 0.000866s : 1: mutable_eliminate 0.04% : 0.000012s : 1: offloading_packed_experts 0.07% : 0.000025s : 1: opt.transform.loop_unroll_optimizer 0.10% : 0.000033s : 1: opt.transform.mutable_eliminate 4.82% : 0.001658s : 78: opt.transform.opt_a 0.13% : 0.000046s : 1: opt.transform.opt_after_cconv 0.14% : 0.000048s : 1: opt.transform.opt_after_jit_grad 0.55% : 0.000190s : 28: opt.transform.opt_b 0.22% : 0.000075s : 2: opt.transform.opt_trans_graph 0.17% : 0.000058s : 4: opt.transform.symbol_engine_opt 12.28% : 0.004223s : 1: opt_a 0.57% : 0.000195s : 1: opt_after_cconv 2.47% : 0.000849s : 1: opt_after_jit_grad 1.23% : 0.000424s : 1: opt_b 23.31% : 0.008017s : 1: optimize 0.09% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.11% : 0.000037s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000011s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000011s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000049s : 1: pre_auto_parallel 0.14% : 0.000047s : 1: py_interpret_to_execute 0.08% : 0.000028s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.19% : 0.000064s : 1: remove_dup_value 1.69% : 0.000580s : 1: renormalize.infer 1.39% : 0.000478s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000069s : 1: rewriter_after_opt_a 0.33% : 0.000114s : 1: rewriter_before_opt_a 0.03% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000129s : 1: symbol_engine_optimizer 0.36% : 0.000125s : 1: tuple_transform 18.61% : 0.006402s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:30.898.574 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0184282, [21] [bootstrap]: 0.00043491 [type_inference]: 0.00612838 [event_method]: 2.035e-05 [auto_monad]: 7.144e-05 [graph_reusing]: 7.69002e-06 [inline]: 2.30002e-06 [add_attr]: 0.00369486, [1] [add_attr_with_inline]: 0.00368261, [1] [Cycle 1]: 7.541e-05, [2] [tag_attr]: 2.436e-05 [meta_addattr_fg_expand]: 6.54001e-06 [parallel-infer-symbol]: 3.77002e-06 [pre_auto_parallel]: 4.213e-05 [insert-virtual-dataset]: 2.45002e-06 [parallel-infer-symbol-second]: 9.20001e-07 [dataset_repeat_opt]: 2.22999e-06 [pipeline_split]: 1.94e-06 [optimize]: 0.00692862, [53] [py_interpret_to_execute]: 3.504e-05 [rewriter_before_opt_a]: 0.00012364 [opt_a]: 0.00386604, [2] [Cycle 1]: 0.00282857, [45] [expand_dump_flag]: 3.08998e-06 [switch_simplify]: 4.67e-05 [loop_unroll]: 3.535e-05 [a_1]: 0.0007695 [with_stream_mark]: 2.16e-05 [recompute_prepare]: 1.225e-05 [updatestate_depend_eliminate]: 6.01e-06 [updatestate_assign_eliminate]: 5.20001e-06 [updatestate_loads_eliminate]: 4.05e-06 [parameter_eliminate]: 2.43998e-06 [a_2]: 0.00012417 [accelerated_algorithm]: 1.045e-05 [shard]: 2.34001e-06 [meta_shard_fg_expand]: 2.61e-06 [shard_inline]: 8.84e-06 [merge_send_recv]: 1.248e-05 [auto_parallel]: 9.99999e-06 [parallel]: 2.056e-05 [flash_sp]: 1.094e-05 [merge_comm]: 5.76998e-06 [allreduce_fusion]: 4.82e-06 [matmul_add_comm_reduction]: 1.294e-05 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 1.187e-05 [virtual_dataset]: 9.41e-06 [get_grad_eliminate_]: 8.94e-06 [virtual_output]: 9.71e-06 [merge_forward]: 6.14001e-06 [cell_reuse_recompute_pass]: 1.53002e-06 [offload_activation]: 1.338e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.987e-05 [merge_recompute_call_nodes]: 1.84e-06 [before_grad]: 1.676e-05 [set_forward_comm_id_for_comm_node_pass]: 5.87001e-06 [meta_fg_expand]: 4.75999e-06 [flash_sp_send_recv_attached]: 2.99001e-06 [receive_attached]: 2.11003e-06 [after_resolve]: 1.587e-05 [a_after_grad]: 1.457e-05 [renormalize]: 0.00108672 [add_forward_monad_depend]: 1.027e-05 [auto_monad_grad]: 2.65002e-06 [auto_monad_eliminator]: 2.764e-05 [cse]: 5.158e-05 [a_3]: 8.184e-05 [Cycle 2]: 0.00102264, [45] [expand_dump_flag]: 2.49001e-06 [switch_simplify]: 1.297e-05 [loop_unroll]: 1.018e-05 [a_1]: 0.00024228 [with_stream_mark]: 2.401e-05 [recompute_prepare]: 1.139e-05 [updatestate_depend_eliminate]: 6.34001e-06 [updatestate_assign_eliminate]: 4.18999e-06 [updatestate_loads_eliminate]: 4.20999e-06 [parameter_eliminate]: 2.01998e-06 [a_2]: 0.00011484 [accelerated_algorithm]: 1.079e-05 [shard]: 2.20002e-06 [meta_shard_fg_expand]: 3.16999e-06 [shard_inline]: 9.56e-06 [merge_send_recv]: 1.12e-05 [auto_parallel]: 1.071e-05 [parallel]: 8.43001e-06 [flash_sp]: 4.12e-06 [merge_comm]: 5.17e-06 [allreduce_fusion]: 9.67999e-06 [matmul_add_comm_reduction]: 1.356e-05 [allreduce_slice_to_reducescatter]: 6.90023e-07 [virtual_shard_identity]: 1.233e-05 [virtual_dataset]: 9.40001e-06 [get_grad_eliminate_]: 9.92999e-06 [virtual_output]: 9.17999e-06 [merge_forward]: 5.04e-06 [cell_reuse_recompute_pass]: 2.87002e-06 [offload_activation]: 1.41e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.887e-05 [merge_recompute_call_nodes]: 1.76e-06 [before_grad]: 1.579e-05 [set_forward_comm_id_for_comm_node_pass]: 5.84999e-06 [meta_fg_expand]: 4.42e-06 [flash_sp_send_recv_attached]: 1.99e-06 [receive_attached]: 1.79e-06 [after_resolve]: 1.775e-05 [a_after_grad]: 1.388e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.42001e-06 [auto_monad_grad]: 1.76998e-06 [auto_monad_eliminator]: 1.449e-05 [cse]: 3.533e-05 [a_3]: 5.687e-05 [py_interpret_to_execute_after_opt_a]: 1.973e-05 [slice_cell_reuse_recomputed_activation]: 2.59001e-06 [rewriter_after_opt_a]: 7.086e-05 [convert_after_rewriter]: 1.07e-05 [order_py_execute_after_rewriter]: 6.99001e-06 [mutable_eliminate]: 0.00080463 [opt_b]: 0.00034456, [1] [Cycle 1]: 0.00033473, [7] [b_1]: 0.00021164 [b_2]: 1.289e-05 [updatestate_depend_eliminate]: 1.252e-05 [updatestate_assign_eliminate]: 4.32998e-06 [updatestate_loads_eliminate]: 4.02998e-06 [renormalize]: 1.40999e-06 [cse]: 4.775e-05 [optimize_parallel_all_gather_comm]: 2.528e-05 [overlap_param_gather]: 2.12999e-06 [cconv]: 3.95e-05 [loop_unroll]: 0.0006461 [opt_after_cconv]: 0.00015956, [1] [Cycle 1]: 0.00015064, [7] [c_1]: 4.525e-05 [parameter_eliminate]: 6.36998e-06 [updatestate_depend_eliminate]: 1.06e-05 [updatestate_assign_eliminate]: 3.95998e-06 [updatestate_loads_eliminate]: 3.86001e-06 [cse]: 4.095e-05 [renormalize]: 5.69999e-07 [remove_dup_value]: 5.573e-05 [tuple_transform]: 0.00010718, [1] [Cycle 1]: 0.00010198, [4] [d_1]: 6.834e-05 [none_parameter_eliminate]: 2.56998e-06 [renormalize]: 3.00002e-07 [switch_simplify]: 1.058e-05 [partial_unused_args_eliminate]: 2.41e-06 [add_recomputation]: 7.462e-05 [cse_after_recomputation]: 3.276e-05, [1] [Cycle 1]: 2.753e-05, [1] [cse]: 2.082e-05 [environ_conv]: 8.55999e-06 [swap_dp_allreduce_reducescatter]: 7.95e-06 [bias_add_comm_swap]: 3.09001e-06 [label_micro_interleaved_index]: 6.01e-06 [label_fine_grained_interleaved_index]: 3.28998e-06 [merge_cast_opt]: 1.82999e-06 [slice_recompute_activation]: 2.12999e-06 [micro_interleaved_order_control]: 2.32001e-06 [assign_add_opt]: 1.44e-06 [ForceFp32Comm]: 1.27999e-06 [remove_cast_before_assign_add]: 1.10001e-06 [full_micro_interleaved_order_control]: 2.46e-06 [reorder_send_recv_between_fp_bp]: 2.42001e-06 [comm_op_add_attrs]: 1.35001e-06 [add_comm_op_reuse_tag]: 1.33002e-06 [interleave_split_concat_branches]: 1.34998e-06 [interleave_parallel_branches]: 1.21002e-06 [overlap_opt_shard_in_pipeline]: 1.37e-06 [overlap_opt_shard_grad_in_pipeline]: 1.77999e-06 [control_data_broadcast_order]: 1.986e-05 [grouped_pairwise_exchange_alltoall]: 1.55999e-06 [offloading_packed_experts]: 5.36998e-06 [overlap_recompute_and_grad_model_parallel]: 5.77001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.29e-06 [overlap_recompute_allgather_and_fa_grad]: 1.54e-06 [overlap_recompute_comm]: 2.53e-06 [overlap_grad_ring_attention]: 5.17e-06 [overlap_grad_flash_sp]: 2.949e-05 [begin_end_overlap_inline]: 5.79981e-07 [split_matmul_comm_elemetwise]: 2.44001e-06 [split_layernorm_comm]: 1.82999e-06 [handle_group_info]: 1.14e-06 [symbol_engine_optimizer]: 0.00010996, [1] [Cycle 1]: 0.00010417, [6] [build]: 5.40001e-06 [elim_shapecalc]: 1.805e-05 [elim_not_effective]: 2.017e-05 [opt_reshape]: 1.087e-05 [fold_const_symbol]: 1.538e-05 [renormalize]: 2.09984e-07 [detach_backward]: 2.35002e-06 [pipeline_parallel_scheduler]: 2.05002e-06 [auto_monad_reorder]: 2.635e-05 [get_jit_bprop_graph]: 2.11e-06 [rewriter_after_jit_bprop_graph]: 8.25999e-06 [opt_after_jit_grad]: 0.0007095 [validate]: 5.861e-05 Sums bootstrap : 0.000435s : 3.22% type_inference : 0.006128s : 45.37% event_method : 0.000020s : 0.15% auto_monad : 0.000071s : 0.53% graph_reusing : 0.000008s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000042s : 0.31% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000035s : 0.26% optimize.rewriter_before_opt_a : 0.000124s : 0.92% optimize.opt_a.expand_dump_flag : 0.000006s : 0.04% optimize.opt_a.switch_simplify : 0.000060s : 0.44% optimize.opt_a.loop_unroll : 0.000046s : 0.34% optimize.opt_a.a_1 : 0.001012s : 7.49% optimize.opt_a.with_stream_mark : 0.000046s : 0.34% optimize.opt_a.recompute_prepare : 0.000024s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000012s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000239s : 1.77% optimize.opt_a.accelerated_algorithm : 0.000021s : 0.16% optimize.opt_a.shard : 0.000005s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.04% optimize.opt_a.shard_inline : 0.000018s : 0.14% optimize.opt_a.merge_send_recv : 0.000024s : 0.18% optimize.opt_a.auto_parallel : 0.000021s : 0.15% optimize.opt_a.parallel : 0.000029s : 0.21% optimize.opt_a.flash_sp : 0.000015s : 0.11% optimize.opt_a.merge_comm : 0.000011s : 0.08% optimize.opt_a.allreduce_fusion : 0.000014s : 0.11% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.20% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.18% optimize.opt_a.virtual_dataset : 0.000019s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.14% optimize.opt_a.virtual_output : 0.000019s : 0.14% optimize.opt_a.merge_forward : 0.000011s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000027s : 0.20% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.03% optimize.opt_a.before_grad : 0.000033s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.09% optimize.opt_a.meta_fg_expand : 0.000009s : 0.07% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000034s : 0.25% optimize.opt_a.a_after_grad : 0.000028s : 0.21% optimize.opt_a.renormalize : 0.001087s : 8.05% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.09% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000042s : 0.31% optimize.opt_a.cse : 0.000087s : 0.64% optimize.opt_a.a_3 : 0.000139s : 1.03% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.02% optimize.rewriter_after_opt_a : 0.000071s : 0.52% optimize.convert_after_rewriter : 0.000011s : 0.08% optimize.order_py_execute_after_rewriter : 0.000007s : 0.05% optimize.mutable_eliminate : 0.000805s : 5.96% optimize.opt_b.b_1 : 0.000212s : 1.57% optimize.opt_b.b_2 : 0.000013s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000013s : 0.09% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000048s : 0.35% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.19% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000040s : 0.29% optimize.loop_unroll : 0.000646s : 4.78% optimize.opt_after_cconv.c_1 : 0.000045s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000011s : 0.08% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000041s : 0.30% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000056s : 0.41% optimize.tuple_transform.d_1 : 0.000068s : 0.51% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000075s : 0.55% optimize.cse_after_recomputation.cse : 0.000021s : 0.15% optimize.environ_conv : 0.000009s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000006s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000020s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000029s : 0.22% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000018s : 0.13% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000026s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000008s : 0.06% opt_after_jit_grad : 0.000709s : 5.25% validate : 0.000059s : 0.43% Time group info: ------[substitution.] 0.000280 49 15.50% : 0.000043s : 6: substitution.cast_eliminate 1.06% : 0.000003s : 4: substitution.elim_not_effective 0.90% : 0.000003s : 4: substitution.fold_const_symbol 2.85% : 0.000008s : 6: substitution.graph_param_transform 63.58% : 0.000178s : 4: substitution.inline 2.34% : 0.000007s : 8: substitution.j_node_and_user_rematch 2.91% : 0.000008s : 8: substitution.remove_not_recompute_node 2.84% : 0.000008s : 4: substitution.replace_old_param 4.68% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator 3.35% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006063 2 87.74% : 0.005319s : 1: type_inference.infer 12.26% : 0.000743s : 1: type_inference.specialize ------[replace.] 0.000067 8 62.31% : 0.000042s : 4: replace.inline 37.69% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000186 8 93.99% : 0.000175s : 4: match.inline 6.01% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000306 1730 1.03% : 0.000003s : 17: predicate.accumulaten_eliminater 0.87% : 0.000003s : 6: predicate.ad_related_special_op_eliminate 0.60% : 0.000002s : 12: predicate.addn_check_dump 0.84% : 0.000003s : 17: predicate.addn_zero_filter 0.77% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.14% : 0.000007s : 29: predicate.arithmetic_simplify 1.11% : 0.000003s : 17: predicate.cast_eliminate 0.64% : 0.000002s : 12: predicate.check_bprop_eliminate 0.68% : 0.000002s : 12: predicate.compare_switch_simplify 0.17% : 0.000001s : 6: predicate.const_output_eliminate 0.70% : 0.000002s : 12: predicate.depend_value_elim 0.86% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.09% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.79% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.22% : 0.000004s : 12: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 6: predicate.elim_not_effective 0.51% : 0.000002s : 6: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.17% : 0.000004s : 23: predicate.environ_get_depend_swap 1.68% : 0.000005s : 35: predicate.environ_get_eliminate 1.10% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.27% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.13% : 0.000007s : 25: predicate.float_depend_g_call 0.59% : 0.000002s : 12: predicate.float_environ_get_switch 0.91% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.16% : 0.000001s : 6: predicate.fold_const_symbol 0.78% : 0.000002s : 12: predicate.get_grad_eliminate 0.27% : 0.000001s : 6: predicate.graph_param_transform 0.61% : 0.000002s : 12: predicate.incorporate_call 0.61% : 0.000002s : 12: predicate.incorporate_call_switch 6.01% : 0.000018s : 78: predicate.inline 0.91% : 0.000003s : 12: predicate.inline_without_move 0.30% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.94% : 0.000003s : 12: predicate.less_batch_normalization 1.67% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.36% : 0.000007s : 50: predicate.load_eliminater 1.46% : 0.000004s : 6: predicate.loop_unroll_after_grad 1.96% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.54% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.63% : 0.000002s : 12: predicate.merge_addn 0.62% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.63% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.89% : 0.000003s : 17: predicate.minmaximum_grad 1.44% : 0.000004s : 6: predicate.mutable_eliminate 0.50% : 0.000002s : 6: predicate.opt_reshape 0.46% : 0.000001s : 6: predicate.parallel_virtual_node 1.69% : 0.000005s : 25: predicate.partial_defer_inline 1.48% : 0.000005s : 27: predicate.partial_eliminate 0.92% : 0.000003s : 17: predicate.print_const_string_wrapper 0.64% : 0.000002s : 12: predicate.reduce_all_const_elim 1.19% : 0.000004s : 17: predicate.reduce_eliminate 2.30% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 12: predicate.remove_not_recompute_node 1.32% : 0.000004s : 33: predicate.replace_applicator 0.46% : 0.000001s : 12: predicate.replace_old_param 0.30% : 0.000001s : 6: predicate.reset_defer_inline 0.91% : 0.000003s : 17: predicate.reshape_eliminate 0.63% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.50% : 0.000002s : 6: predicate.row_tensor_eliminate 0.86% : 0.000003s : 12: predicate.same_eliminate 0.38% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.79% : 0.000002s : 12: predicate.shard_identity_eliminate 0.69% : 0.000002s : 12: predicate.special_op_eliminate 0.83% : 0.000003s : 12: predicate.specialize_transform 1.20% : 0.000004s : 12: predicate.split_environ_get_set_with_tuple_value 0.94% : 0.000003s : 12: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.18% : 0.000004s : 25: predicate.switch_defer_inline 1.91% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.37% : 0.000013s : 81: predicate.switch_simplify 0.92% : 0.000003s : 17: predicate.tile_eliminate 1.06% : 0.000003s : 17: predicate.transpose_eliminate 1.78% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000005s : 29: predicate.tuple_list_get_item_depend_reorder 3.21% : 0.000010s : 45: predicate.tuple_list_get_item_eliminator 1.51% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 2.54% : 0.000008s : 41: predicate.tuple_list_set_item_eliminator 1.83% : 0.000006s : 33: predicate.tuple_to_list_eliminator_ 2.23% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 2.83% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.54% : 0.000002s : 6: predicate.value_based_eliminate 0.68% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.75% : 0.000002s : 12: predicate.virtual_output_eliminate 0.30% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000593 11 49.57% : 0.000294s : 5: func_graph_cloner_run.FuncGraphClonerGraph 50.43% : 0.000299s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.032099 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.53% : 0.003701s : 1: add_attr 11.49% : 0.003687s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.25% : 0.000080s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.58% : 0.000187s : 1: auto_monad 0.09% : 0.000030s : 1: auto_monad_reorder 0.01% : 0.000005s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.44% : 0.000462s : 1: bootstrap 0.13% : 0.000043s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000024s : 1: control_data_broadcast_order 0.04% : 0.000014s : 1: convert_after_rewriter 0.11% : 0.000036s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.08% : 0.000027s : 1: event_method 0.02% : 0.000006s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 2.05% : 0.000658s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 2.56% : 0.000820s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.08% : 0.000027s : 1: opt.transform.loop_unroll_optimizer 0.09% : 0.000030s : 1: opt.transform.mutable_eliminate 5.18% : 0.001663s : 78: opt.transform.opt_a 0.14% : 0.000044s : 1: opt.transform.opt_after_cconv 0.13% : 0.000043s : 1: opt.transform.opt_after_jit_grad 0.58% : 0.000186s : 28: opt.transform.opt_b 0.24% : 0.000076s : 2: opt.transform.opt_trans_graph 0.19% : 0.000060s : 4: opt.transform.symbol_engine_opt 12.06% : 0.003870s : 1: opt_a 0.51% : 0.000163s : 1: opt_after_cconv 2.25% : 0.000723s : 1: opt_after_jit_grad 1.09% : 0.000349s : 1: opt_b 21.60% : 0.006934s : 1: optimize 0.09% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000010s : 1: order_py_execute_after_rewriter 0.10% : 0.000033s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000006s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.14% : 0.000046s : 1: pre_auto_parallel 0.12% : 0.000039s : 1: py_interpret_to_execute 0.07% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.19% : 0.000061s : 1: remove_dup_value 1.86% : 0.000596s : 1: renormalize.infer 1.49% : 0.000478s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.24% : 0.000077s : 1: rewriter_after_opt_a 0.40% : 0.000129s : 1: rewriter_before_opt_a 0.02% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000113s : 1: symbol_engine_optimizer 0.34% : 0.000110s : 1: tuple_transform 19.15% : 0.006147s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:31.299.624 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:31.299.950 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0219704, [21] [bootstrap]: 0.00049959 [type_inference]: 0.00788074 [event_method]: 2.204e-05 [auto_monad]: 7.006e-05 [graph_reusing]: 6.38e-06 [inline]: 2.47001e-06 [add_attr]: 0.00408832, [1] [add_attr_with_inline]: 0.00407529, [1] [Cycle 1]: 9.751e-05, [2] [tag_attr]: 2.504e-05 [meta_addattr_fg_expand]: 6.51e-06 [parallel-infer-symbol]: 3.78001e-06 [pre_auto_parallel]: 4.215e-05 [insert-virtual-dataset]: 2.66999e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 2.37001e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00689556, [53] [py_interpret_to_execute]: 3.808e-05 [rewriter_before_opt_a]: 0.00010226 [opt_a]: 0.00385528, [2] [Cycle 1]: 0.00283244, [45] [expand_dump_flag]: 3.09999e-06 [switch_simplify]: 4.473e-05 [loop_unroll]: 3.122e-05 [a_1]: 0.00078957 [with_stream_mark]: 2.227e-05 [recompute_prepare]: 1.202e-05 [updatestate_depend_eliminate]: 4.94e-06 [updatestate_assign_eliminate]: 4.13999e-06 [updatestate_loads_eliminate]: 3.81999e-06 [parameter_eliminate]: 2.21998e-06 [a_2]: 0.00013249 [accelerated_algorithm]: 9.42999e-06 [shard]: 2.31e-06 [meta_shard_fg_expand]: 2.27999e-06 [shard_inline]: 8.54e-06 [merge_send_recv]: 1.086e-05 [auto_parallel]: 9.31e-06 [parallel]: 2.122e-05 [flash_sp]: 1.037e-05 [merge_comm]: 4.65001e-06 [allreduce_fusion]: 4.32998e-06 [matmul_add_comm_reduction]: 1.026e-05 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 9.99999e-06 [virtual_dataset]: 8.33999e-06 [get_grad_eliminate_]: 7.7e-06 [virtual_output]: 8.23001e-06 [merge_forward]: 5.12e-06 [cell_reuse_recompute_pass]: 1.35001e-06 [offload_activation]: 1.2e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.97e-05 [merge_recompute_call_nodes]: 1.49998e-06 [before_grad]: 1.368e-05 [set_forward_comm_id_for_comm_node_pass]: 4.65999e-06 [meta_fg_expand]: 3.84002e-06 [flash_sp_send_recv_attached]: 2.71e-06 [receive_attached]: 2.32001e-06 [after_resolve]: 1.402e-05 [a_after_grad]: 1.257e-05 [renormalize]: 0.00099386 [add_forward_monad_depend]: 7.21001e-06 [auto_monad_grad]: 2.83003e-06 [auto_monad_eliminator]: 2.265e-05 [cse]: 3.913e-05 [a_3]: 8.161e-05 [Cycle 2]: 0.0010042, [45] [expand_dump_flag]: 1.89e-06 [switch_simplify]: 9.59e-06 [loop_unroll]: 7.77998e-06 [a_1]: 0.00018978 [with_stream_mark]: 1.719e-05 [recompute_prepare]: 8.04002e-06 [updatestate_depend_eliminate]: 4.05e-06 [updatestate_assign_eliminate]: 3.43e-06 [updatestate_loads_eliminate]: 3.51001e-06 [parameter_eliminate]: 1.59e-06 [a_2]: 0.00012007 [accelerated_algorithm]: 7.7e-06 [shard]: 2.33002e-06 [meta_shard_fg_expand]: 2.41998e-06 [shard_inline]: 7.43e-06 [merge_send_recv]: 8.31002e-06 [auto_parallel]: 7.99002e-06 [parallel]: 8.17998e-06 [flash_sp]: 4e-06 [merge_comm]: 4.87e-06 [allreduce_fusion]: 4.27e-06 [matmul_add_comm_reduction]: 9.66e-06 [allreduce_slice_to_reducescatter]: 7.80012e-07 [virtual_shard_identity]: 8.48001e-06 [virtual_dataset]: 7.41999e-06 [get_grad_eliminate_]: 7.15e-06 [virtual_output]: 7.11999e-06 [merge_forward]: 3.77998e-06 [cell_reuse_recompute_pass]: 2.32999e-06 [offload_activation]: 9.92999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.66e-05 [merge_recompute_call_nodes]: 1.12e-06 [before_grad]: 1.536e-05 [set_forward_comm_id_for_comm_node_pass]: 4.63001e-06 [meta_fg_expand]: 3.86999e-06 [flash_sp_send_recv_attached]: 1.30001e-06 [receive_attached]: 1.91e-06 [after_resolve]: 1.424e-05 [a_after_grad]: 1.141e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.65001e-06 [auto_monad_grad]: 1.94e-06 [auto_monad_eliminator]: 1.217e-05 [cse]: 2.426e-05 [a_3]: 5.884e-05 [py_interpret_to_execute_after_opt_a]: 2.151e-05 [slice_cell_reuse_recomputed_activation]: 5.20001e-06 [rewriter_after_opt_a]: 5.313e-05 [convert_after_rewriter]: 1.131e-05 [order_py_execute_after_rewriter]: 8.97e-06 [mutable_eliminate]: 0.00078239 [opt_b]: 0.00035356, [1] [Cycle 1]: 0.00034063, [7] [b_1]: 0.00022325 [b_2]: 1.019e-05 [updatestate_depend_eliminate]: 8.87e-06 [updatestate_assign_eliminate]: 2.96999e-06 [updatestate_loads_eliminate]: 3.14001e-06 [renormalize]: 1.00999e-06 [cse]: 3.265e-05 [optimize_parallel_all_gather_comm]: 2.352e-05 [overlap_param_gather]: 4.65999e-06 [cconv]: 3.464e-05 [loop_unroll]: 0.00062931 [opt_after_cconv]: 0.00016398, [1] [Cycle 1]: 0.00015368, [7] [c_1]: 4.459e-05 [parameter_eliminate]: 4.57e-06 [updatestate_depend_eliminate]: 8.77e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 3.19001e-06 [cse]: 3.098e-05 [renormalize]: 5.19998e-07 [remove_dup_value]: 2.062e-05 [tuple_transform]: 0.00010912, [1] [Cycle 1]: 0.0001009, [4] [d_1]: 5.721e-05 [none_parameter_eliminate]: 2.51e-06 [renormalize]: 7.90023e-07 [switch_simplify]: 8.59e-06 [partial_unused_args_eliminate]: 4.63999e-06 [add_recomputation]: 6.709e-05 [cse_after_recomputation]: 3.482e-05, [1] [Cycle 1]: 2.763e-05, [1] [cse]: 1.805e-05 [environ_conv]: 1.006e-05 [swap_dp_allreduce_reducescatter]: 9.02e-06 [bias_add_comm_swap]: 5.47999e-06 [label_micro_interleaved_index]: 7.73999e-06 [label_fine_grained_interleaved_index]: 5.82001e-06 [merge_cast_opt]: 4.1e-06 [slice_recompute_activation]: 4.92e-06 [micro_interleaved_order_control]: 5.20999e-06 [assign_add_opt]: 3.93001e-06 [ForceFp32Comm]: 3.49001e-06 [remove_cast_before_assign_add]: 4.27998e-06 [full_micro_interleaved_order_control]: 5.07e-06 [reorder_send_recv_between_fp_bp]: 5.42999e-06 [comm_op_add_attrs]: 3.7e-06 [add_comm_op_reuse_tag]: 3.34001e-06 [interleave_split_concat_branches]: 3.75998e-06 [interleave_parallel_branches]: 3.43e-06 [overlap_opt_shard_in_pipeline]: 3.74002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.45e-06 [control_data_broadcast_order]: 1.924e-05 [grouped_pairwise_exchange_alltoall]: 3.88001e-06 [offloading_packed_experts]: 6.74001e-06 [overlap_recompute_and_grad_model_parallel]: 8.03001e-06 [overlap_grad_matmul_and_grad_allreduce]: 5.04e-06 [overlap_recompute_allgather_and_fa_grad]: 3.82998e-06 [overlap_recompute_comm]: 4.94e-06 [overlap_grad_ring_attention]: 9.89001e-06 [overlap_grad_flash_sp]: 2.815e-05 [begin_end_overlap_inline]: 3.44001e-06 [split_matmul_comm_elemetwise]: 5.02e-06 [split_layernorm_comm]: 4.37e-06 [handle_group_info]: 3.48999e-06 [symbol_engine_optimizer]: 0.00011323, [1] [Cycle 1]: 0.00010559, [6] [build]: 4.53001e-06 [elim_shapecalc]: 1.389e-05 [elim_not_effective]: 1.682e-05 [opt_reshape]: 8.79e-06 [fold_const_symbol]: 1.236e-05 [renormalize]: 2.00002e-07 [detach_backward]: 7.46001e-06 [pipeline_parallel_scheduler]: 2.29001e-06 [auto_monad_reorder]: 3.85e-05 [get_jit_bprop_graph]: 2.23002e-06 [rewriter_after_jit_bprop_graph]: 9.59e-06 [opt_after_jit_grad]: 0.00087458 [validate]: 5.821e-05 Sums bootstrap : 0.000500s : 3.30% type_inference : 0.007881s : 51.98% event_method : 0.000022s : 0.15% auto_monad : 0.000070s : 0.46% graph_reusing : 0.000006s : 0.04% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.04% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000042s : 0.28% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000038s : 0.25% optimize.rewriter_before_opt_a : 0.000102s : 0.67% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000054s : 0.36% optimize.opt_a.loop_unroll : 0.000039s : 0.26% optimize.opt_a.a_1 : 0.000979s : 6.46% optimize.opt_a.with_stream_mark : 0.000039s : 0.26% optimize.opt_a.recompute_prepare : 0.000020s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000253s : 1.67% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.11% optimize.opt_a.shard : 0.000005s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.11% optimize.opt_a.merge_send_recv : 0.000019s : 0.13% optimize.opt_a.auto_parallel : 0.000017s : 0.11% optimize.opt_a.parallel : 0.000029s : 0.19% optimize.opt_a.flash_sp : 0.000014s : 0.09% optimize.opt_a.merge_comm : 0.000010s : 0.06% optimize.opt_a.allreduce_fusion : 0.000009s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.12% optimize.opt_a.virtual_dataset : 0.000016s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.10% optimize.opt_a.virtual_output : 0.000015s : 0.10% optimize.opt_a.merge_forward : 0.000009s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.02% optimize.opt_a.offload_activation : 0.000022s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.24% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000029s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.06% optimize.opt_a.meta_fg_expand : 0.000008s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000028s : 0.19% optimize.opt_a.a_after_grad : 0.000024s : 0.16% optimize.opt_a.renormalize : 0.000994s : 6.56% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.06% optimize.opt_a.auto_monad_grad : 0.000005s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.23% optimize.opt_a.cse : 0.000063s : 0.42% optimize.opt_a.a_3 : 0.000140s : 0.93% optimize.py_interpret_to_execute_after_opt_a : 0.000022s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.03% optimize.rewriter_after_opt_a : 0.000053s : 0.35% optimize.convert_after_rewriter : 0.000011s : 0.07% optimize.order_py_execute_after_rewriter : 0.000009s : 0.06% optimize.mutable_eliminate : 0.000782s : 5.16% optimize.opt_b.b_1 : 0.000223s : 1.47% optimize.opt_b.b_2 : 0.000010s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000033s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.16% optimize.overlap_param_gather : 0.000005s : 0.03% optimize.cconv : 0.000035s : 0.23% optimize.loop_unroll : 0.000629s : 4.15% optimize.opt_after_cconv.c_1 : 0.000045s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000031s : 0.20% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000021s : 0.14% optimize.tuple_transform.d_1 : 0.000057s : 0.38% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.02% optimize.tuple_transform.renormalize : 0.000001s : 0.01% optimize.tuple_transform.switch_simplify : 0.000009s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.03% optimize.add_recomputation : 0.000067s : 0.44% optimize.cse_after_recomputation.cse : 0.000018s : 0.12% optimize.environ_conv : 0.000010s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.06% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000008s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.03% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.02% optimize.interleave_parallel_branches : 0.000003s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000019s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000005s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.03% optimize.overlap_grad_ring_attention : 0.000010s : 0.07% optimize.overlap_grad_flash_sp : 0.000028s : 0.19% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.03% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.02% optimize.symbol_engine_optimizer.build : 0.000005s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000007s : 0.05% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000039s : 0.25% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000010s : 0.06% opt_after_jit_grad : 0.000875s : 5.77% validate : 0.000058s : 0.38% Time group info: ------[substitution.] 0.000253 39 10.80% : 0.000027s : 3: substitution.cast_eliminate 0.91% : 0.000002s : 3: substitution.elim_not_effective 0.67% : 0.000002s : 3: substitution.fold_const_symbol 2.83% : 0.000007s : 5: substitution.graph_param_transform 68.21% : 0.000173s : 4: substitution.inline 2.02% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.47% : 0.000006s : 6: substitution.remove_not_recompute_node 2.43% : 0.000006s : 4: substitution.replace_old_param 6.40% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator 3.26% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.007802 2 88.44% : 0.006901s : 1: type_inference.infer 11.56% : 0.000902s : 1: type_inference.specialize ------[replace.] 0.000071 8 59.25% : 0.000042s : 4: replace.inline 40.75% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000184 8 92.35% : 0.000170s : 4: match.inline 7.65% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000280 1596 1.05% : 0.000003s : 17: predicate.accumulaten_eliminater 1.29% : 0.000004s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000002s : 10: predicate.addn_check_dump 0.84% : 0.000002s : 17: predicate.addn_zero_filter 0.90% : 0.000003s : 17: predicate.adjust_all_reduce_mul_add 2.02% : 0.000006s : 27: predicate.arithmetic_simplify 1.16% : 0.000003s : 17: predicate.cast_eliminate 0.64% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000002s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.66% : 0.000002s : 10: predicate.depend_value_elim 1.01% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.16% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.95% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.33% : 0.000004s : 10: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 5: predicate.elim_not_effective 0.34% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.43% : 0.000004s : 22: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 22: predicate.environ_get_depend_swap 1.70% : 0.000005s : 32: predicate.environ_get_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.35% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.16% : 0.000006s : 25: predicate.float_depend_g_call 0.60% : 0.000002s : 10: predicate.float_environ_get_switch 0.84% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.66% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.59% : 0.000002s : 10: predicate.incorporate_call 0.46% : 0.000001s : 10: predicate.incorporate_call_switch 5.92% : 0.000017s : 72: predicate.inline 0.70% : 0.000002s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 10: predicate.less_batch_normalization 1.77% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.71% : 0.000008s : 48: predicate.load_eliminater 0.99% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.88% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.78% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 10: predicate.merge_addn 0.58% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.56% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.93% : 0.000003s : 17: predicate.minmaximum_grad 1.35% : 0.000004s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.49% : 0.000001s : 5: predicate.parallel_virtual_node 1.73% : 0.000005s : 25: predicate.partial_defer_inline 1.60% : 0.000004s : 26: predicate.partial_eliminate 0.94% : 0.000003s : 17: predicate.print_const_string_wrapper 0.63% : 0.000002s : 10: predicate.reduce_all_const_elim 1.23% : 0.000003s : 17: predicate.reduce_eliminate 2.53% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 10: predicate.remove_not_recompute_node 1.30% : 0.000004s : 31: predicate.replace_applicator 0.49% : 0.000001s : 10: predicate.replace_old_param 0.36% : 0.000001s : 5: predicate.reset_defer_inline 1.08% : 0.000003s : 17: predicate.reshape_eliminate 0.63% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 5: predicate.row_tensor_eliminate 0.87% : 0.000002s : 10: predicate.same_eliminate 0.37% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 10: predicate.shard_identity_eliminate 0.90% : 0.000003s : 10: predicate.special_op_eliminate 0.62% : 0.000002s : 10: predicate.specialize_transform 0.90% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.46% : 0.000004s : 25: predicate.switch_defer_inline 1.93% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.39% : 0.000012s : 76: predicate.switch_simplify 0.86% : 0.000002s : 17: predicate.tile_eliminate 0.91% : 0.000003s : 17: predicate.transpose_eliminate 1.67% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.12% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.53% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.19% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.67% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.36% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.11% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.51% : 0.000001s : 5: predicate.value_based_eliminate 0.61% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.75% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.56% : 0.000002s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000692 11 54.59% : 0.000378s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.41% : 0.000314s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.035688 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.49% : 0.004100s : 1: add_attr 11.43% : 0.004079s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.20% : 0.000071s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.22% : 0.000079s : 1: auto_monad 0.13% : 0.000046s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.53% : 0.000547s : 1: bootstrap 0.11% : 0.000038s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.06% : 0.000022s : 1: control_data_broadcast_order 0.04% : 0.000014s : 1: convert_after_rewriter 0.11% : 0.000038s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.12% : 0.000041s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.09% : 0.000032s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.02% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000009s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.79% : 0.000637s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 2.22% : 0.000791s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000023s : 1: opt.transform.mutable_eliminate 4.25% : 0.001518s : 78: opt.transform.opt_a 0.12% : 0.000043s : 1: opt.transform.opt_after_cconv 0.13% : 0.000048s : 1: opt.transform.opt_after_jit_grad 0.44% : 0.000156s : 28: opt.transform.opt_b 0.18% : 0.000063s : 2: opt.transform.opt_trans_graph 0.13% : 0.000048s : 4: opt.transform.symbol_engine_opt 10.81% : 0.003859s : 1: opt_a 0.47% : 0.000167s : 1: opt_after_cconv 2.49% : 0.000890s : 1: opt_after_jit_grad 1.00% : 0.000358s : 1: opt_b 22.56% : 0.008050s : 1: optimize 0.07% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000012s : 1: order_py_execute_after_rewriter 0.09% : 0.000031s : 1: overlap_grad_flash_sp 0.03% : 0.000010s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000014s : 1: overlap_grad_ring_attention 0.02% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.10% : 0.000036s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000011s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000049s : 1: pre_auto_parallel 0.12% : 0.000042s : 1: py_interpret_to_execute 0.07% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000008s : 1: remove_cast_before_assign_add 0.07% : 0.000024s : 1: remove_dup_value 1.51% : 0.000537s : 1: renormalize.infer 1.25% : 0.000447s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000016s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000057s : 1: rewriter_after_opt_a 0.30% : 0.000107s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.03% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000116s : 1: symbol_engine_optimizer 0.31% : 0.000112s : 1: tuple_transform 22.22% : 0.007931s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:31.682.405 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0170513, [21] [bootstrap]: 0.00053084 [type_inference]: 0.00617393 [event_method]: 2.128e-05 [auto_monad]: 6.517e-05 [graph_reusing]: 5.94999e-06 [inline]: 2.65002e-06 [add_attr]: 0.00323523, [1] [add_attr_with_inline]: 0.00322461, [1] [Cycle 1]: 6.417e-05, [2] [tag_attr]: 2.016e-05 [meta_addattr_fg_expand]: 6.09999e-06 [parallel-infer-symbol]: 3.4e-06 [pre_auto_parallel]: 3.379e-05 [insert-virtual-dataset]: 2.96001e-06 [parallel-infer-symbol-second]: 8.30012e-07 [dataset_repeat_opt]: 2.21998e-06 [pipeline_split]: 1.93002e-06 [optimize]: 0.00591317, [53] [py_interpret_to_execute]: 2.594e-05 [rewriter_before_opt_a]: 8.466e-05 [opt_a]: 0.00312252, [2] [Cycle 1]: 0.00229173, [45] [expand_dump_flag]: 3.3e-06 [switch_simplify]: 4.427e-05 [loop_unroll]: 3.143e-05 [a_1]: 0.00074357 [with_stream_mark]: 1.576e-05 [recompute_prepare]: 1.148e-05 [updatestate_depend_eliminate]: 5.46998e-06 [updatestate_assign_eliminate]: 3.86999e-06 [updatestate_loads_eliminate]: 4.11001e-06 [parameter_eliminate]: 2.32999e-06 [a_2]: 0.00010202 [accelerated_algorithm]: 9.36e-06 [shard]: 2.04999e-06 [meta_shard_fg_expand]: 2.19001e-06 [shard_inline]: 7.48999e-06 [merge_send_recv]: 8.95999e-06 [auto_parallel]: 7.97e-06 [parallel]: 1.83e-05 [flash_sp]: 9.34e-06 [merge_comm]: 5.13002e-06 [allreduce_fusion]: 4.32e-06 [matmul_add_comm_reduction]: 1.071e-05 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 1.153e-05 [virtual_dataset]: 8.07e-06 [get_grad_eliminate_]: 7.92e-06 [virtual_output]: 8.08999e-06 [merge_forward]: 4.45e-06 [cell_reuse_recompute_pass]: 1.72001e-06 [offload_activation]: 1.107e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.713e-05 [merge_recompute_call_nodes]: 1.37999e-06 [before_grad]: 1.296e-05 [set_forward_comm_id_for_comm_node_pass]: 5.64998e-06 [meta_fg_expand]: 3.35e-06 [flash_sp_send_recv_attached]: 3.09999e-06 [receive_attached]: 2.79999e-06 [after_resolve]: 1.419e-05 [a_after_grad]: 1.181e-05 [renormalize]: 0.0007021 [add_forward_monad_depend]: 7.1e-06 [auto_monad_grad]: 2.91999e-06 [auto_monad_eliminator]: 1.879e-05 [cse]: 3.898e-05 [a_3]: 6.06e-05 [Cycle 2]: 0.00081925, [45] [expand_dump_flag]: 2.17999e-06 [switch_simplify]: 9.26998e-06 [loop_unroll]: 7.50998e-06 [a_1]: 0.00018257 [with_stream_mark]: 1.715e-05 [recompute_prepare]: 8.37998e-06 [updatestate_depend_eliminate]: 4.27e-06 [updatestate_assign_eliminate]: 3.31999e-06 [updatestate_loads_eliminate]: 2.83998e-06 [parameter_eliminate]: 1.79998e-06 [a_2]: 9.198e-05 [accelerated_algorithm]: 8.18001e-06 [shard]: 1.95001e-06 [meta_shard_fg_expand]: 2.09e-06 [shard_inline]: 7.42002e-06 [merge_send_recv]: 9.00999e-06 [auto_parallel]: 8.57e-06 [parallel]: 7.85998e-06 [flash_sp]: 3.6e-06 [merge_comm]: 8.17e-06 [allreduce_fusion]: 3.98999e-06 [matmul_add_comm_reduction]: 8.12e-06 [allreduce_slice_to_reducescatter]: 4.39992e-07 [virtual_shard_identity]: 8.65001e-06 [virtual_dataset]: 8.1e-06 [get_grad_eliminate_]: 7.21999e-06 [virtual_output]: 6.93e-06 [merge_forward]: 4.43999e-06 [cell_reuse_recompute_pass]: 2.86e-06 [offload_activation]: 9.69999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.53e-05 [merge_recompute_call_nodes]: 1.09e-06 [before_grad]: 1.249e-05 [set_forward_comm_id_for_comm_node_pass]: 7.64002e-06 [meta_fg_expand]: 2.98e-06 [flash_sp_send_recv_attached]: 1.17e-06 [receive_attached]: 1.94e-06 [after_resolve]: 1.469e-05 [a_after_grad]: 1.236e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.74e-06 [auto_monad_grad]: 2.24999e-06 [auto_monad_eliminator]: 9.94001e-06 [cse]: 2.232e-05 [a_3]: 4.682e-05 [py_interpret_to_execute_after_opt_a]: 1.876e-05 [slice_cell_reuse_recomputed_activation]: 1.96998e-06 [rewriter_after_opt_a]: 4.89e-05 [convert_after_rewriter]: 8.49998e-06 [order_py_execute_after_rewriter]: 5.57001e-06 [mutable_eliminate]: 0.00080045 [opt_b]: 0.00029733, [1] [Cycle 1]: 0.00028754, [7] [b_1]: 0.00018225 [b_2]: 1.089e-05 [updatestate_depend_eliminate]: 1.056e-05 [updatestate_assign_eliminate]: 3.11999e-06 [updatestate_loads_eliminate]: 3.43999e-06 [renormalize]: 1.10999e-06 [cse]: 3.67e-05 [optimize_parallel_all_gather_comm]: 2.416e-05 [overlap_param_gather]: 2.16998e-06 [cconv]: 3.592e-05 [loop_unroll]: 0.00062152 [opt_after_cconv]: 0.00013853, [1] [Cycle 1]: 0.00013053, [7] [c_1]: 4.118e-05 [parameter_eliminate]: 6.64001e-06 [updatestate_depend_eliminate]: 8.70999e-06 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 3.13998e-06 [cse]: 3.092e-05 [renormalize]: 7.09988e-07 [remove_dup_value]: 1.895e-05 [tuple_transform]: 9.516e-05, [1] [Cycle 1]: 9.035e-05, [4] [d_1]: 5.889e-05 [none_parameter_eliminate]: 2.07999e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 8.74e-06 [partial_unused_args_eliminate]: 1.82001e-06 [add_recomputation]: 6.915e-05 [cse_after_recomputation]: 2.991e-05, [1] [Cycle 1]: 2.518e-05, [1] [cse]: 1.839e-05 [environ_conv]: 7.97e-06 [swap_dp_allreduce_reducescatter]: 6.38e-06 [bias_add_comm_swap]: 3.61999e-06 [label_micro_interleaved_index]: 6.04999e-06 [label_fine_grained_interleaved_index]: 2.71999e-06 [merge_cast_opt]: 1.44998e-06 [slice_recompute_activation]: 2.48e-06 [micro_interleaved_order_control]: 2.32999e-06 [assign_add_opt]: 1.27e-06 [ForceFp32Comm]: 8.89995e-07 [remove_cast_before_assign_add]: 1.04998e-06 [full_micro_interleaved_order_control]: 2.63998e-06 [reorder_send_recv_between_fp_bp]: 2.99999e-06 [comm_op_add_attrs]: 1.27e-06 [add_comm_op_reuse_tag]: 1.12e-06 [interleave_split_concat_branches]: 1.37999e-06 [interleave_parallel_branches]: 1.09e-06 [overlap_opt_shard_in_pipeline]: 1.55001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.04e-06 [control_data_broadcast_order]: 1.789e-05 [grouped_pairwise_exchange_alltoall]: 1.89e-06 [offloading_packed_experts]: 4.70999e-06 [overlap_recompute_and_grad_model_parallel]: 5.30001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.35999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.74999e-06 [overlap_grad_ring_attention]: 4.97999e-06 [overlap_grad_flash_sp]: 2.66e-05 [begin_end_overlap_inline]: 5.90022e-07 [split_matmul_comm_elemetwise]: 2.14e-06 [split_layernorm_comm]: 1.68002e-06 [handle_group_info]: 1.02e-06 [symbol_engine_optimizer]: 9.501e-05, [1] [Cycle 1]: 9.046e-05, [6] [build]: 3.88999e-06 [elim_shapecalc]: 1.36e-05 [elim_not_effective]: 1.736e-05 [opt_reshape]: 9.76e-06 [fold_const_symbol]: 1.329e-05 [renormalize]: 2.60014e-07 [detach_backward]: 2.14999e-06 [pipeline_parallel_scheduler]: 1.48002e-06 [auto_monad_reorder]: 2.261e-05 [get_jit_bprop_graph]: 2.11998e-06 [rewriter_after_jit_bprop_graph]: 6.97002e-06 [opt_after_jit_grad]: 0.00080538 [validate]: 5.437e-05 Sums bootstrap : 0.000531s : 4.16% type_inference : 0.006174s : 48.39% event_method : 0.000021s : 0.17% auto_monad : 0.000065s : 0.51% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.26% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000026s : 0.20% optimize.rewriter_before_opt_a : 0.000085s : 0.66% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000054s : 0.42% optimize.opt_a.loop_unroll : 0.000039s : 0.31% optimize.opt_a.a_1 : 0.000926s : 7.26% optimize.opt_a.with_stream_mark : 0.000033s : 0.26% optimize.opt_a.recompute_prepare : 0.000020s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000194s : 1.52% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.14% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.12% optimize.opt_a.merge_send_recv : 0.000018s : 0.14% optimize.opt_a.auto_parallel : 0.000017s : 0.13% optimize.opt_a.parallel : 0.000026s : 0.21% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000013s : 0.10% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.16% optimize.opt_a.virtual_dataset : 0.000016s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.12% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.04% optimize.opt_a.offload_activation : 0.000021s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000013s : 0.10% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000029s : 0.23% optimize.opt_a.a_after_grad : 0.000024s : 0.19% optimize.opt_a.renormalize : 0.000702s : 5.50% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.23% optimize.opt_a.cse : 0.000061s : 0.48% optimize.opt_a.a_3 : 0.000107s : 0.84% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000049s : 0.38% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.04% optimize.mutable_eliminate : 0.000800s : 6.27% optimize.opt_b.b_1 : 0.000182s : 1.43% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000037s : 0.29% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.19% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000036s : 0.28% optimize.loop_unroll : 0.000622s : 4.87% optimize.opt_after_cconv.c_1 : 0.000041s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000031s : 0.24% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000019s : 0.15% optimize.tuple_transform.d_1 : 0.000059s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000069s : 0.54% optimize.cse_after_recomputation.cse : 0.000018s : 0.14% optimize.environ_conv : 0.000008s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000004s : 0.03% optimize.label_micro_interleaved_index : 0.000006s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000018s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000027s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000023s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000007s : 0.05% opt_after_jit_grad : 0.000805s : 6.31% validate : 0.000054s : 0.43% Time group info: ------[substitution.] 0.000222 39 10.73% : 0.000024s : 3: substitution.cast_eliminate 1.09% : 0.000002s : 3: substitution.elim_not_effective 0.85% : 0.000002s : 3: substitution.fold_const_symbol 3.44% : 0.000008s : 5: substitution.graph_param_transform 64.79% : 0.000144s : 4: substitution.inline 2.25% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.92% : 0.000006s : 6: substitution.remove_not_recompute_node 2.67% : 0.000006s : 4: substitution.replace_old_param 7.23% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator 4.03% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006108 2 88.41% : 0.005400s : 1: type_inference.infer 11.59% : 0.000708s : 1: type_inference.specialize ------[replace.] 0.000064 8 60.75% : 0.000039s : 4: replace.inline 39.25% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000155 8 91.12% : 0.000141s : 4: match.inline 8.88% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000270 1596 0.93% : 0.000002s : 17: predicate.accumulaten_eliminater 1.16% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 10: predicate.addn_check_dump 1.05% : 0.000003s : 17: predicate.addn_zero_filter 0.86% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.14% : 0.000006s : 27: predicate.arithmetic_simplify 1.00% : 0.000003s : 17: predicate.cast_eliminate 0.55% : 0.000001s : 10: predicate.check_bprop_eliminate 0.53% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.63% : 0.000002s : 10: predicate.depend_value_elim 0.97% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.98% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.21% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 5: predicate.elim_not_effective 0.45% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.36% : 0.000004s : 22: predicate.environ_add_const_eliminate 1.20% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_depend_swap 1.61% : 0.000004s : 32: predicate.environ_get_eliminate 1.10% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.33% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.13% : 0.000006s : 25: predicate.float_depend_g_call 0.50% : 0.000001s : 10: predicate.float_environ_get_switch 0.74% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.62% : 0.000002s : 10: predicate.get_grad_eliminate 0.30% : 0.000001s : 5: predicate.graph_param_transform 0.59% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 5.82% : 0.000016s : 72: predicate.inline 0.69% : 0.000002s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.71% : 0.000002s : 10: predicate.less_batch_normalization 2.02% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.56% : 0.000007s : 48: predicate.load_eliminater 1.24% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.99% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.70% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.61% : 0.000002s : 10: predicate.merge_addn 0.55% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.72% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 17: predicate.minmaximum_grad 1.53% : 0.000004s : 5: predicate.mutable_eliminate 0.61% : 0.000002s : 5: predicate.opt_reshape 0.58% : 0.000002s : 5: predicate.parallel_virtual_node 1.55% : 0.000004s : 25: predicate.partial_defer_inline 1.60% : 0.000004s : 26: predicate.partial_eliminate 0.90% : 0.000002s : 17: predicate.print_const_string_wrapper 0.64% : 0.000002s : 10: predicate.reduce_all_const_elim 1.12% : 0.000003s : 17: predicate.reduce_eliminate 2.54% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 10: predicate.remove_not_recompute_node 1.24% : 0.000003s : 31: predicate.replace_applicator 0.60% : 0.000002s : 10: predicate.replace_old_param 0.39% : 0.000001s : 5: predicate.reset_defer_inline 0.92% : 0.000002s : 17: predicate.reshape_eliminate 0.56% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 5: predicate.row_tensor_eliminate 0.71% : 0.000002s : 10: predicate.same_eliminate 0.51% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.77% : 0.000002s : 10: predicate.shard_identity_eliminate 0.69% : 0.000002s : 10: predicate.special_op_eliminate 0.70% : 0.000002s : 10: predicate.specialize_transform 0.95% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.67% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.50% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.40% : 0.000004s : 25: predicate.switch_defer_inline 2.05% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.74% : 0.000013s : 76: predicate.switch_simplify 0.83% : 0.000002s : 17: predicate.tile_eliminate 0.94% : 0.000003s : 17: predicate.transpose_eliminate 1.58% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.61% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.19% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.76% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.45% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.09% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.51% : 0.000001s : 5: predicate.value_based_eliminate 0.67% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.57% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000548 11 57.19% : 0.000313s : 5: func_graph_cloner_run.FuncGraphClonerGraph 42.81% : 0.000235s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028604 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.33% : 0.003240s : 1: add_attr 11.29% : 0.003229s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.26% : 0.000074s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.25% : 0.000071s : 1: auto_monad 0.09% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000007s : 1: bias_add_comm_swap 1.96% : 0.000561s : 1: bootstrap 0.14% : 0.000041s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000021s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.11% : 0.000033s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.10% : 0.000028s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000006s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 2.21% : 0.000633s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.85% : 0.000816s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.08% : 0.000024s : 1: opt.transform.loop_unroll_optimizer 0.09% : 0.000026s : 1: opt.transform.mutable_eliminate 5.09% : 0.001456s : 78: opt.transform.opt_a 0.14% : 0.000040s : 1: opt.transform.opt_after_cconv 0.14% : 0.000041s : 1: opt.transform.opt_after_jit_grad 0.55% : 0.000156s : 28: opt.transform.opt_b 0.23% : 0.000065s : 2: opt.transform.opt_trans_graph 0.17% : 0.000050s : 4: opt.transform.symbol_engine_opt 10.93% : 0.003126s : 1: opt_a 0.50% : 0.000142s : 1: opt_after_cconv 2.87% : 0.000820s : 1: opt_after_jit_grad 1.05% : 0.000301s : 1: opt_b 20.69% : 0.005919s : 1: optimize 0.10% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.11% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.13% : 0.000038s : 1: pre_auto_parallel 0.11% : 0.000030s : 1: py_interpret_to_execute 0.08% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000005s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.31% : 0.000373s : 1: renormalize.infer 1.12% : 0.000319s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000054s : 1: rewriter_after_opt_a 0.31% : 0.000089s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000098s : 1: symbol_engine_optimizer 0.34% : 0.000098s : 1: tuple_transform 21.65% : 0.006192s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:32.399.48 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:32.402.43 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0425602, [21] [bootstrap]: 0.00041822 [type_inference]: 0.00610438 [event_method]: 2.064e-05 [auto_monad]: 6.577e-05 [graph_reusing]: 5.81e-06 [inline]: 1.97999e-06 [add_attr]: 0.00325115, [1] [add_attr_with_inline]: 0.00324138, [1] [Cycle 1]: 8.124e-05, [2] [tag_attr]: 2.01e-05 [meta_addattr_fg_expand]: 6.22001e-06 [parallel-infer-symbol]: 3.31001e-06 [pre_auto_parallel]: 3.723e-05 [insert-virtual-dataset]: 2.41e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 2.67001e-06 [pipeline_split]: 1.71e-06 [optimize]: 0.0307159, [53] [py_interpret_to_execute]: 2.964e-05 [rewriter_before_opt_a]: 8.936e-05 [opt_a]: 0.00352383, [2] [Cycle 1]: 0.00248205, [45] [expand_dump_flag]: 3.24001e-06 [switch_simplify]: 4.361e-05 [loop_unroll]: 3.165e-05 [a_1]: 0.00073185 [with_stream_mark]: 1.63e-05 [recompute_prepare]: 1.116e-05 [updatestate_depend_eliminate]: 4.65001e-06 [updatestate_assign_eliminate]: 4.16001e-06 [updatestate_loads_eliminate]: 3.66001e-06 [parameter_eliminate]: 1.86e-06 [a_2]: 0.00015974 [accelerated_algorithm]: 1.06e-05 [shard]: 2.26e-06 [meta_shard_fg_expand]: 2.36e-06 [shard_inline]: 8.60999e-06 [merge_send_recv]: 1.097e-05 [auto_parallel]: 9.02e-06 [parallel]: 2.106e-05 [flash_sp]: 1.069e-05 [merge_comm]: 4.80001e-06 [allreduce_fusion]: 4.50999e-06 [matmul_add_comm_reduction]: 1.052e-05 [allreduce_slice_to_reducescatter]: 7.90023e-07 [virtual_shard_identity]: 1.077e-05 [virtual_dataset]: 8.55001e-06 [get_grad_eliminate_]: 8.84e-06 [virtual_output]: 8.94e-06 [merge_forward]: 5.17e-06 [cell_reuse_recompute_pass]: 1.76e-06 [offload_activation]: 1.12e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.021e-05 [merge_recompute_call_nodes]: 1.49998e-06 [before_grad]: 1.401e-05 [set_forward_comm_id_for_comm_node_pass]: 5.08002e-06 [meta_fg_expand]: 3.58e-06 [flash_sp_send_recv_attached]: 3.45e-06 [receive_attached]: 2.28002e-06 [after_resolve]: 1.588e-05 [a_after_grad]: 1.209e-05 [renormalize]: 0.00068864 [add_forward_monad_depend]: 5.20999e-06 [auto_monad_grad]: 2.31e-06 [auto_monad_eliminator]: 1.668e-05 [cse]: 3.661e-05 [a_3]: 7.52e-05 [Cycle 2]: 0.00102494, [45] [expand_dump_flag]: 2.89999e-06 [switch_simplify]: 9.27001e-06 [loop_unroll]: 7.8e-06 [a_1]: 0.00017887 [with_stream_mark]: 1.851e-05 [recompute_prepare]: 8.80999e-06 [updatestate_depend_eliminate]: 3.76999e-06 [updatestate_assign_eliminate]: 3.86999e-06 [updatestate_loads_eliminate]: 4.03001e-06 [parameter_eliminate]: 2.04e-06 [a_2]: 0.00011929 [accelerated_algorithm]: 8.84998e-06 [shard]: 2.20002e-06 [meta_shard_fg_expand]: 2.30002e-06 [shard_inline]: 7.96001e-06 [merge_send_recv]: 9.20999e-06 [auto_parallel]: 8.80999e-06 [parallel]: 7.63999e-06 [flash_sp]: 3.98001e-06 [merge_comm]: 4.35999e-06 [allreduce_fusion]: 4.03999e-06 [matmul_add_comm_reduction]: 1.007e-05 [allreduce_slice_to_reducescatter]: 4.89992e-07 [virtual_shard_identity]: 1.297e-05 [virtual_dataset]: 7.04001e-06 [get_grad_eliminate_]: 7.05e-06 [virtual_output]: 7.82e-06 [merge_forward]: 5.84e-06 [cell_reuse_recompute_pass]: 2.21e-06 [offload_activation]: 9.52999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.908e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.282e-05 [set_forward_comm_id_for_comm_node_pass]: 5.76e-06 [meta_fg_expand]: 2.88e-06 [flash_sp_send_recv_attached]: 1.67999e-06 [receive_attached]: 1.94e-06 [after_resolve]: 1.374e-05 [a_after_grad]: 1.228e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 3.04999e-06 [auto_monad_grad]: 1.64e-06 [auto_monad_eliminator]: 1.382e-05 [cse]: 2.493e-05 [a_3]: 6.29e-05 [py_interpret_to_execute_after_opt_a]: 1.558e-05 [slice_cell_reuse_recomputed_activation]: 5.73997e-06 [rewriter_after_opt_a]: 5.003e-05 [convert_after_rewriter]: 1.191e-05 [order_py_execute_after_rewriter]: 9.32001e-06 [mutable_eliminate]: 0.00062896 [opt_b]: 0.00036775, [1] [Cycle 1]: 0.00035501, [7] [b_1]: 0.00022414 [b_2]: 1.239e-05 [updatestate_depend_eliminate]: 9.03002e-06 [updatestate_assign_eliminate]: 3.66001e-06 [updatestate_loads_eliminate]: 3.31999e-06 [renormalize]: 7.89994e-07 [cse]: 3.241e-05 [optimize_parallel_all_gather_comm]: 2.501e-05 [overlap_param_gather]: 5.27001e-06 [cconv]: 3.846e-05 [loop_unroll]: 0.0248811 [opt_after_cconv]: 0.00020077, [1] [Cycle 1]: 0.00018739, [7] [c_1]: 4.517e-05 [parameter_eliminate]: 6.91001e-06 [updatestate_depend_eliminate]: 1.46e-05 [updatestate_assign_eliminate]: 4.07e-06 [updatestate_loads_eliminate]: 3.88001e-06 [cse]: 5.155e-05 [renormalize]: 8.60018e-07 [remove_dup_value]: 2.287e-05 [tuple_transform]: 0.00011654, [1] [Cycle 1]: 0.00010794, [4] [d_1]: 6.452e-05 [none_parameter_eliminate]: 1.92001e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 9.21998e-06 [partial_unused_args_eliminate]: 5.05001e-06 [add_recomputation]: 7.479e-05 [cse_after_recomputation]: 3.733e-05, [1] [Cycle 1]: 2.953e-05, [1] [cse]: 1.782e-05 [environ_conv]: 1.249e-05 [swap_dp_allreduce_reducescatter]: 8.35001e-06 [bias_add_comm_swap]: 5.79e-06 [label_micro_interleaved_index]: 1.084e-05 [label_fine_grained_interleaved_index]: 5.61e-06 [merge_cast_opt]: 3.98999e-06 [slice_recompute_activation]: 4.71002e-06 [micro_interleaved_order_control]: 5.09e-06 [assign_add_opt]: 4.14002e-06 [ForceFp32Comm]: 3.56999e-06 [remove_cast_before_assign_add]: 3.48e-06 [full_micro_interleaved_order_control]: 5.01002e-06 [reorder_send_recv_between_fp_bp]: 5.36002e-06 [comm_op_add_attrs]: 4.10998e-06 [add_comm_op_reuse_tag]: 3.36001e-06 [interleave_split_concat_branches]: 3.50998e-06 [interleave_parallel_branches]: 3.48e-06 [overlap_opt_shard_in_pipeline]: 3.66001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.95999e-06 [control_data_broadcast_order]: 2.175e-05 [grouped_pairwise_exchange_alltoall]: 4.03999e-06 [offloading_packed_experts]: 7.74002e-06 [overlap_recompute_and_grad_model_parallel]: 8.24002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.51001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.66001e-06 [overlap_recompute_comm]: 5.20001e-06 [overlap_grad_ring_attention]: 7.98999e-06 [overlap_grad_flash_sp]: 2.996e-05 [begin_end_overlap_inline]: 2.93e-06 [split_matmul_comm_elemetwise]: 5.14998e-06 [split_layernorm_comm]: 3.94002e-06 [handle_group_info]: 3.48e-06 [symbol_engine_optimizer]: 0.00011857, [1] [Cycle 1]: 0.00011009, [6] [build]: 5.35001e-06 [elim_shapecalc]: 1.516e-05 [elim_not_effective]: 1.79e-05 [opt_reshape]: 8.87999e-06 [fold_const_symbol]: 1.3e-05 [renormalize]: 3.10014e-07 [detach_backward]: 6.11e-06 [pipeline_parallel_scheduler]: 1.84998e-06 [auto_monad_reorder]: 2.976e-05 [get_jit_bprop_graph]: 2.33998e-06 [rewriter_after_jit_bprop_graph]: 9.54e-06 [opt_after_jit_grad]: 0.00080768 [validate]: 5.763e-05 Sums bootstrap : 0.000418s : 1.13% type_inference : 0.006104s : 16.50% event_method : 0.000021s : 0.06% auto_monad : 0.000066s : 0.18% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000037s : 0.10% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000030s : 0.08% optimize.rewriter_before_opt_a : 0.000089s : 0.24% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000053s : 0.14% optimize.opt_a.loop_unroll : 0.000039s : 0.11% optimize.opt_a.a_1 : 0.000911s : 2.46% optimize.opt_a.with_stream_mark : 0.000035s : 0.09% optimize.opt_a.recompute_prepare : 0.000020s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000279s : 0.75% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.05% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000017s : 0.04% optimize.opt_a.merge_send_recv : 0.000020s : 0.05% optimize.opt_a.auto_parallel : 0.000018s : 0.05% optimize.opt_a.parallel : 0.000029s : 0.08% optimize.opt_a.flash_sp : 0.000015s : 0.04% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.06% optimize.opt_a.virtual_dataset : 0.000016s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.04% optimize.opt_a.virtual_output : 0.000017s : 0.05% optimize.opt_a.merge_forward : 0.000011s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.03% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000030s : 0.08% optimize.opt_a.a_after_grad : 0.000024s : 0.07% optimize.opt_a.renormalize : 0.000689s : 1.86% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.08% optimize.opt_a.cse : 0.000062s : 0.17% optimize.opt_a.a_3 : 0.000138s : 0.37% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.02% optimize.rewriter_after_opt_a : 0.000050s : 0.14% optimize.convert_after_rewriter : 0.000012s : 0.03% optimize.order_py_execute_after_rewriter : 0.000009s : 0.03% optimize.mutable_eliminate : 0.000629s : 1.70% optimize.opt_b.b_1 : 0.000224s : 0.61% optimize.opt_b.b_2 : 0.000012s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000032s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.07% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000038s : 0.10% optimize.loop_unroll : 0.024881s : 67.24% optimize.opt_after_cconv.c_1 : 0.000045s : 0.12% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000015s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000052s : 0.14% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000023s : 0.06% optimize.tuple_transform.d_1 : 0.000065s : 0.17% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.02% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000075s : 0.20% optimize.cse_after_recomputation.cse : 0.000018s : 0.05% optimize.environ_conv : 0.000012s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000011s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000022s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000008s : 0.02% optimize.overlap_grad_flash_sp : 0.000030s : 0.08% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000030s : 0.08% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000010s : 0.03% opt_after_jit_grad : 0.000808s : 2.18% validate : 0.000058s : 0.16% Time group info: ------[substitution.] 0.000217 39 12.43% : 0.000027s : 3: substitution.cast_eliminate 1.20% : 0.000003s : 3: substitution.elim_not_effective 0.83% : 0.000002s : 3: substitution.fold_const_symbol 3.46% : 0.000008s : 5: substitution.graph_param_transform 62.51% : 0.000136s : 4: substitution.inline 2.16% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.08% : 0.000007s : 6: substitution.remove_not_recompute_node 2.82% : 0.000006s : 4: substitution.replace_old_param 7.20% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator 4.31% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006048 2 88.16% : 0.005331s : 1: type_inference.infer 11.84% : 0.000716s : 1: type_inference.specialize ------[replace.] 0.000062 8 59.40% : 0.000037s : 4: replace.inline 40.60% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000147 8 90.75% : 0.000133s : 4: match.inline 9.25% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000277 1596 0.83% : 0.000002s : 17: predicate.accumulaten_eliminater 1.02% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 10: predicate.addn_check_dump 0.83% : 0.000002s : 17: predicate.addn_zero_filter 0.80% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.80% : 0.000005s : 27: predicate.arithmetic_simplify 0.94% : 0.000003s : 17: predicate.cast_eliminate 0.55% : 0.000002s : 10: predicate.check_bprop_eliminate 0.59% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000001s : 5: predicate.const_output_eliminate 0.61% : 0.000002s : 10: predicate.depend_value_elim 0.89% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 0.93% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.31% : 0.000004s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 0.51% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.03% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.16% : 0.000003s : 22: predicate.environ_get_depend_swap 1.72% : 0.000005s : 32: predicate.environ_get_eliminate 1.02% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.32% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.23% : 0.000006s : 25: predicate.float_depend_g_call 0.56% : 0.000002s : 10: predicate.float_environ_get_switch 0.77% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.77% : 0.000002s : 10: predicate.get_grad_eliminate 0.27% : 0.000001s : 5: predicate.graph_param_transform 0.58% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.38% : 0.000018s : 72: predicate.inline 1.02% : 0.000003s : 10: predicate.inline_without_move 0.39% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.72% : 0.000002s : 10: predicate.less_batch_normalization 1.92% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.48% : 0.000007s : 48: predicate.load_eliminater 2.79% : 0.000008s : 5: predicate.loop_unroll_after_grad 1.95% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.90% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.56% : 0.000002s : 10: predicate.merge_addn 0.56% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 17: predicate.minmaximum_grad 0.96% : 0.000003s : 5: predicate.mutable_eliminate 0.46% : 0.000001s : 5: predicate.opt_reshape 0.47% : 0.000001s : 5: predicate.parallel_virtual_node 1.60% : 0.000004s : 25: predicate.partial_defer_inline 1.59% : 0.000004s : 26: predicate.partial_eliminate 0.85% : 0.000002s : 17: predicate.print_const_string_wrapper 0.62% : 0.000002s : 10: predicate.reduce_all_const_elim 1.11% : 0.000003s : 17: predicate.reduce_eliminate 2.48% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 10: predicate.remove_not_recompute_node 1.26% : 0.000003s : 31: predicate.replace_applicator 0.55% : 0.000002s : 10: predicate.replace_old_param 0.31% : 0.000001s : 5: predicate.reset_defer_inline 1.01% : 0.000003s : 17: predicate.reshape_eliminate 0.53% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 5: predicate.row_tensor_eliminate 0.77% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 10: predicate.shard_identity_eliminate 0.71% : 0.000002s : 10: predicate.special_op_eliminate 0.74% : 0.000002s : 10: predicate.specialize_transform 0.99% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.45% : 0.000004s : 25: predicate.switch_defer_inline 1.90% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.34% : 0.000012s : 76: predicate.switch_simplify 0.82% : 0.000002s : 17: predicate.tile_eliminate 0.92% : 0.000003s : 17: predicate.transpose_eliminate 1.67% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.28% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.64% : 0.000005s : 27: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.70% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.32% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.23% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 5: predicate.value_based_eliminate 0.57% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.68% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.48% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000550 11 55.02% : 0.000303s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.98% : 0.000247s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.078943 192 0.01% : 0.000006s : 1: ForceFp32Comm 4.13% : 0.003261s : 1: add_attr 4.11% : 0.003245s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.10% : 0.000079s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.09% : 0.000075s : 1: auto_monad 0.05% : 0.000039s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.58% : 0.000461s : 1: bootstrap 0.05% : 0.000042s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.03% : 0.000026s : 1: control_data_broadcast_order 0.02% : 0.000016s : 1: convert_after_rewriter 0.05% : 0.000041s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000033s : 1: detach_backward 0.02% : 0.000016s : 1: environ_conv 0.04% : 0.000031s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000012s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000014s : 1: label_micro_interleaved_index 31.54% : 0.024896s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.81% : 0.000639s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000045s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000024s : 1: opt.transform.mutable_eliminate 1.85% : 0.001461s : 78: opt.transform.opt_a 0.05% : 0.000043s : 1: opt.transform.opt_after_cconv 0.05% : 0.000043s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000158s : 28: opt.transform.opt_b 0.09% : 0.000071s : 2: opt.transform.opt_trans_graph 0.06% : 0.000051s : 4: opt.transform.symbol_engine_opt 4.47% : 0.003527s : 1: opt_a 0.26% : 0.000205s : 1: opt_after_cconv 1.04% : 0.000823s : 1: opt_after_jit_grad 0.47% : 0.000373s : 1: opt_b 39.85% : 0.031457s : 1: optimize 0.04% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000013s : 1: order_py_execute_after_rewriter 0.04% : 0.000033s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000012s : 1: overlap_grad_ring_attention 0.01% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.06% : 0.000045s : 1: pre_auto_parallel 0.04% : 0.000034s : 1: py_interpret_to_execute 0.02% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.03% : 0.000026s : 1: remove_dup_value 0.45% : 0.000353s : 1: renormalize.infer 0.42% : 0.000328s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000015s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000055s : 1: rewriter_after_opt_a 0.12% : 0.000094s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000122s : 1: symbol_engine_optimizer 0.15% : 0.000119s : 1: tuple_transform 7.78% : 0.006144s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:32.654.307 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0331105, [21] [bootstrap]: 0.00038289 [type_inference]: 0.00587003 [event_method]: 1.96e-05 [auto_monad]: 7.221e-05 [graph_reusing]: 6.06e-06 [inline]: 2.59001e-06 [add_attr]: 0.0197739, [1] [add_attr_with_inline]: 0.0197591, [1] [Cycle 1]: 7.152e-05, [2] [tag_attr]: 2.171e-05 [meta_addattr_fg_expand]: 6.44001e-06 [parallel-infer-symbol]: 3.75998e-06 [pre_auto_parallel]: 3.773e-05 [insert-virtual-dataset]: 2.39999e-06 [parallel-infer-symbol-second]: 8.10018e-07 [dataset_repeat_opt]: 2.17001e-06 [pipeline_split]: 1.91003e-06 [optimize]: 0.00602217, [53] [py_interpret_to_execute]: 2.696e-05 [rewriter_before_opt_a]: 9.453e-05 [opt_a]: 0.00317354, [2] [Cycle 1]: 0.00236923, [45] [expand_dump_flag]: 3.53e-06 [switch_simplify]: 4.535e-05 [loop_unroll]: 3.247e-05 [a_1]: 0.00078594 [with_stream_mark]: 1.725e-05 [recompute_prepare]: 1.263e-05 [updatestate_depend_eliminate]: 4.62e-06 [updatestate_assign_eliminate]: 4.35999e-06 [updatestate_loads_eliminate]: 4.22e-06 [parameter_eliminate]: 2.09e-06 [a_2]: 0.00010293 [accelerated_algorithm]: 9.15999e-06 [shard]: 1.76e-06 [meta_shard_fg_expand]: 2.02001e-06 [shard_inline]: 7.66999e-06 [merge_send_recv]: 9.80002e-06 [auto_parallel]: 8.70999e-06 [parallel]: 1.827e-05 [flash_sp]: 9.51003e-06 [merge_comm]: 4.69998e-06 [allreduce_fusion]: 4.60999e-06 [matmul_add_comm_reduction]: 1.13e-05 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 1.037e-05 [virtual_dataset]: 7.78001e-06 [get_grad_eliminate_]: 7.88001e-06 [virtual_output]: 7.77e-06 [merge_forward]: 4.64002e-06 [cell_reuse_recompute_pass]: 1.72001e-06 [offload_activation]: 1.114e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.619e-05 [merge_recompute_call_nodes]: 1.55999e-06 [before_grad]: 1.317e-05 [set_forward_comm_id_for_comm_node_pass]: 4.83001e-06 [meta_fg_expand]: 3.38e-06 [flash_sp_send_recv_attached]: 3.71001e-06 [receive_attached]: 2.59001e-06 [after_resolve]: 1.431e-05 [a_after_grad]: 1.243e-05 [renormalize]: 0.00075183 [add_forward_monad_depend]: 6.22001e-06 [auto_monad_grad]: 2.58998e-06 [auto_monad_eliminator]: 2.052e-05 [cse]: 3.975e-05 [a_3]: 5.956e-05 [Cycle 2]: 0.0007928, [45] [expand_dump_flag]: 2.03002e-06 [switch_simplify]: 9.59999e-06 [loop_unroll]: 7.4e-06 [a_1]: 0.00017902 [with_stream_mark]: 1.677e-05 [recompute_prepare]: 7.88999e-06 [updatestate_depend_eliminate]: 3.93001e-06 [updatestate_assign_eliminate]: 3.21999e-06 [updatestate_loads_eliminate]: 2.88e-06 [parameter_eliminate]: 1.50999e-06 [a_2]: 9.213e-05 [accelerated_algorithm]: 7.96001e-06 [shard]: 1.23002e-06 [meta_shard_fg_expand]: 1.71e-06 [shard_inline]: 7.27002e-06 [merge_send_recv]: 9.76e-06 [auto_parallel]: 6.78e-06 [parallel]: 6.33e-06 [flash_sp]: 3.33e-06 [merge_comm]: 4.46002e-06 [allreduce_fusion]: 3.85e-06 [matmul_add_comm_reduction]: 8.66002e-06 [allreduce_slice_to_reducescatter]: 4.19997e-07 [virtual_shard_identity]: 8.78001e-06 [virtual_dataset]: 7.5e-06 [get_grad_eliminate_]: 7.09001e-06 [virtual_output]: 6.66999e-06 [merge_forward]: 4.12e-06 [cell_reuse_recompute_pass]: 2.07001e-06 [offload_activation]: 8.57998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.545e-05 [merge_recompute_call_nodes]: 7.80012e-07 [before_grad]: 1.175e-05 [set_forward_comm_id_for_comm_node_pass]: 6.19999e-06 [meta_fg_expand]: 2.84001e-06 [flash_sp_send_recv_attached]: 1.10001e-06 [receive_attached]: 1.92001e-06 [after_resolve]: 1.346e-05 [a_after_grad]: 1.122e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.19001e-06 [auto_monad_grad]: 1.30001e-06 [auto_monad_eliminator]: 9.86e-06 [cse]: 2.01e-05 [a_3]: 4.608e-05 [py_interpret_to_execute_after_opt_a]: 1.467e-05 [slice_cell_reuse_recomputed_activation]: 1.87001e-06 [rewriter_after_opt_a]: 4.344e-05 [convert_after_rewriter]: 7.78001e-06 [order_py_execute_after_rewriter]: 6.24999e-06 [mutable_eliminate]: 0.00089311 [opt_b]: 0.00029418, [1] [Cycle 1]: 0.00028459, [7] [b_1]: 0.00018461 [b_2]: 1.075e-05 [updatestate_depend_eliminate]: 1.06e-05 [updatestate_assign_eliminate]: 3.5e-06 [updatestate_loads_eliminate]: 3.51999e-06 [renormalize]: 5.10016e-07 [cse]: 3.375e-05 [optimize_parallel_all_gather_comm]: 2.314e-05 [overlap_param_gather]: 1.86e-06 [cconv]: 3.41e-05 [loop_unroll]: 0.00061337 [opt_after_cconv]: 0.00013571, [1] [Cycle 1]: 0.00012741, [7] [c_1]: 4.017e-05 [parameter_eliminate]: 5.30999e-06 [updatestate_depend_eliminate]: 7.87e-06 [updatestate_assign_eliminate]: 3.01001e-06 [updatestate_loads_eliminate]: 3.47997e-06 [cse]: 2.956e-05 [renormalize]: 5.89993e-07 [remove_dup_value]: 1.756e-05 [tuple_transform]: 9.259e-05, [1] [Cycle 1]: 8.738e-05, [4] [d_1]: 5.803e-05 [none_parameter_eliminate]: 1.62999e-06 [renormalize]: 3.69997e-07 [switch_simplify]: 8.44998e-06 [partial_unused_args_eliminate]: 2.47001e-06 [add_recomputation]: 6.581e-05 [cse_after_recomputation]: 2.859e-05, [1] [Cycle 1]: 2.301e-05, [1] [cse]: 1.725e-05 [environ_conv]: 7.63001e-06 [swap_dp_allreduce_reducescatter]: 6.29001e-06 [bias_add_comm_swap]: 3.13998e-06 [label_micro_interleaved_index]: 6.08998e-06 [label_fine_grained_interleaved_index]: 2.94001e-06 [merge_cast_opt]: 1.47999e-06 [slice_recompute_activation]: 2.49001e-06 [micro_interleaved_order_control]: 2.50002e-06 [assign_add_opt]: 1.50001e-06 [ForceFp32Comm]: 1.02e-06 [remove_cast_before_assign_add]: 1.12e-06 [full_micro_interleaved_order_control]: 2.14e-06 [reorder_send_recv_between_fp_bp]: 3.42002e-06 [comm_op_add_attrs]: 1.51998e-06 [add_comm_op_reuse_tag]: 9.49978e-07 [interleave_split_concat_branches]: 1.22e-06 [interleave_parallel_branches]: 1.32e-06 [overlap_opt_shard_in_pipeline]: 1.50999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.81e-06 [control_data_broadcast_order]: 1.726e-05 [grouped_pairwise_exchange_alltoall]: 1.47001e-06 [offloading_packed_experts]: 5.15001e-06 [overlap_recompute_and_grad_model_parallel]: 5.18002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.32e-06 [overlap_recompute_allgather_and_fa_grad]: 1.44e-06 [overlap_recompute_comm]: 2.51e-06 [overlap_grad_ring_attention]: 4.82e-06 [overlap_grad_flash_sp]: 2.405e-05 [begin_end_overlap_inline]: 6.00005e-07 [split_matmul_comm_elemetwise]: 2.91999e-06 [split_layernorm_comm]: 1.74e-06 [handle_group_info]: 1.22e-06 [symbol_engine_optimizer]: 9.161e-05, [1] [Cycle 1]: 8.674e-05, [6] [build]: 4.15999e-06 [elim_shapecalc]: 1.294e-05 [elim_not_effective]: 1.659e-05 [opt_reshape]: 8.89e-06 [fold_const_symbol]: 1.229e-05 [renormalize]: 2.50002e-07 [detach_backward]: 2.69001e-06 [pipeline_parallel_scheduler]: 1.71998e-06 [auto_monad_reorder]: 2.222e-05 [get_jit_bprop_graph]: 1.99e-06 [rewriter_after_jit_bprop_graph]: 5.85002e-06 [opt_after_jit_grad]: 0.00065579 [validate]: 5.617e-05 Sums bootstrap : 0.000383s : 3.11% type_inference : 0.005870s : 47.70% event_method : 0.000020s : 0.16% auto_monad : 0.000072s : 0.59% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000038s : 0.31% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000027s : 0.22% optimize.rewriter_before_opt_a : 0.000095s : 0.77% optimize.opt_a.expand_dump_flag : 0.000006s : 0.05% optimize.opt_a.switch_simplify : 0.000055s : 0.45% optimize.opt_a.loop_unroll : 0.000040s : 0.32% optimize.opt_a.a_1 : 0.000965s : 7.84% optimize.opt_a.with_stream_mark : 0.000034s : 0.28% optimize.opt_a.recompute_prepare : 0.000021s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000195s : 1.59% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.14% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.12% optimize.opt_a.merge_send_recv : 0.000020s : 0.16% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000025s : 0.20% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.07% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.16% optimize.opt_a.virtual_dataset : 0.000015s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000014s : 0.12% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000028s : 0.23% optimize.opt_a.a_after_grad : 0.000024s : 0.19% optimize.opt_a.renormalize : 0.000752s : 6.11% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.25% optimize.opt_a.cse : 0.000060s : 0.49% optimize.opt_a.a_3 : 0.000106s : 0.86% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000043s : 0.35% optimize.convert_after_rewriter : 0.000008s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000893s : 7.26% optimize.opt_b.b_1 : 0.000185s : 1.50% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.09% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000034s : 0.27% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.19% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000034s : 0.28% optimize.loop_unroll : 0.000613s : 4.98% optimize.opt_after_cconv.c_1 : 0.000040s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000030s : 0.24% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.14% optimize.tuple_transform.d_1 : 0.000058s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000066s : 0.53% optimize.cse_after_recomputation.cse : 0.000017s : 0.14% optimize.environ_conv : 0.000008s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000006s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000002s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000024s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000656s : 5.33% validate : 0.000056s : 0.46% Time group info: ------[substitution.] 0.000236 39 10.19% : 0.000024s : 3: substitution.cast_eliminate 1.33% : 0.000003s : 3: substitution.elim_not_effective 0.75% : 0.000002s : 3: substitution.fold_const_symbol 3.34% : 0.000008s : 5: substitution.graph_param_transform 66.67% : 0.000157s : 4: substitution.inline 1.99% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.01% : 0.000007s : 6: substitution.remove_not_recompute_node 2.15% : 0.000005s : 4: substitution.replace_old_param 6.99% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator 3.58% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005804 2 87.22% : 0.005062s : 1: type_inference.infer 12.78% : 0.000742s : 1: type_inference.specialize ------[replace.] 0.000067 8 61.33% : 0.000041s : 4: replace.inline 38.67% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000169 8 91.59% : 0.000155s : 4: match.inline 8.41% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000272 1596 0.96% : 0.000003s : 17: predicate.accumulaten_eliminater 0.78% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 10: predicate.addn_check_dump 0.89% : 0.000002s : 17: predicate.addn_zero_filter 0.80% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.16% : 0.000006s : 27: predicate.arithmetic_simplify 1.23% : 0.000003s : 17: predicate.cast_eliminate 0.65% : 0.000002s : 10: predicate.check_bprop_eliminate 0.58% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.67% : 0.000002s : 10: predicate.depend_value_elim 1.00% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.07% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.94% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 5: predicate.elim_not_effective 0.42% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.21% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.18% : 0.000003s : 22: predicate.environ_get_depend_swap 1.71% : 0.000005s : 32: predicate.environ_get_eliminate 1.07% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.36% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.26% : 0.000006s : 25: predicate.float_depend_g_call 0.57% : 0.000002s : 10: predicate.float_environ_get_switch 0.80% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.61% : 0.000002s : 10: predicate.get_grad_eliminate 0.25% : 0.000001s : 5: predicate.graph_param_transform 0.55% : 0.000002s : 10: predicate.incorporate_call 0.49% : 0.000001s : 10: predicate.incorporate_call_switch 6.19% : 0.000017s : 72: predicate.inline 0.76% : 0.000002s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 10: predicate.less_batch_normalization 1.84% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.54% : 0.000007s : 48: predicate.load_eliminater 0.93% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.94% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.76% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.70% : 0.000002s : 10: predicate.merge_addn 0.53% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.56% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 17: predicate.minmaximum_grad 1.51% : 0.000004s : 5: predicate.mutable_eliminate 0.45% : 0.000001s : 5: predicate.opt_reshape 0.40% : 0.000001s : 5: predicate.parallel_virtual_node 1.64% : 0.000004s : 25: predicate.partial_defer_inline 1.58% : 0.000004s : 26: predicate.partial_eliminate 0.98% : 0.000003s : 17: predicate.print_const_string_wrapper 0.62% : 0.000002s : 10: predicate.reduce_all_const_elim 1.23% : 0.000003s : 17: predicate.reduce_eliminate 2.50% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 10: predicate.remove_not_recompute_node 1.39% : 0.000004s : 31: predicate.replace_applicator 0.56% : 0.000002s : 10: predicate.replace_old_param 0.37% : 0.000001s : 5: predicate.reset_defer_inline 0.95% : 0.000003s : 17: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 5: predicate.row_tensor_eliminate 0.71% : 0.000002s : 10: predicate.same_eliminate 0.48% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 10: predicate.shard_identity_eliminate 0.64% : 0.000002s : 10: predicate.special_op_eliminate 0.66% : 0.000002s : 10: predicate.specialize_transform 0.93% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.42% : 0.000004s : 25: predicate.switch_defer_inline 1.97% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.65% : 0.000013s : 76: predicate.switch_simplify 0.89% : 0.000002s : 17: predicate.tile_eliminate 1.00% : 0.000003s : 17: predicate.transpose_eliminate 1.68% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.63% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.70% : 0.000005s : 27: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.84% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.54% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.10% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.55% : 0.000001s : 5: predicate.value_based_eliminate 0.62% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.57% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.54% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000522 11 52.77% : 0.000276s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.23% : 0.000247s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.061379 192 0.01% : 0.000004s : 1: ForceFp32Comm 32.23% : 0.019780s : 1: add_attr 32.20% : 0.019763s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.11% : 0.000070s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.13% : 0.000078s : 1: auto_monad 0.04% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.67% : 0.000412s : 1: bootstrap 0.06% : 0.000038s : 1: cconv 0.01% : 0.000005s : 1: comm_op_add_attrs 0.03% : 0.000021s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.05% : 0.000032s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.04% : 0.000026s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000009s : 1: label_micro_interleaved_index 1.02% : 0.000623s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.48% : 0.000908s : 1: mutable_eliminate 0.01% : 0.000009s : 1: offloading_packed_experts 0.03% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000026s : 1: opt.transform.mutable_eliminate 2.43% : 0.001492s : 78: opt.transform.opt_a 0.06% : 0.000038s : 1: opt.transform.opt_after_cconv 0.06% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.26% : 0.000158s : 28: opt.transform.opt_b 0.10% : 0.000064s : 2: opt.transform.opt_trans_graph 0.08% : 0.000047s : 4: opt.transform.symbol_engine_opt 5.18% : 0.003177s : 1: opt_a 0.23% : 0.000140s : 1: opt_after_cconv 1.09% : 0.000668s : 1: opt_after_jit_grad 0.49% : 0.000298s : 1: opt_b 9.82% : 0.006028s : 1: optimize 0.04% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000006s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000042s : 1: pre_auto_parallel 0.05% : 0.000031s : 1: py_interpret_to_execute 0.03% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000005s : 1: remove_cast_before_assign_add 0.03% : 0.000021s : 1: remove_dup_value 0.67% : 0.000408s : 1: renormalize.infer 0.55% : 0.000336s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000048s : 1: rewriter_after_opt_a 0.16% : 0.000099s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000094s : 1: symbol_engine_optimizer 0.16% : 0.000096s : 1: tuple_transform 9.60% : 0.005892s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:33.170.349 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:33.170.652 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0176828, [21] [bootstrap]: 0.00039281 [type_inference]: 0.00583709 [event_method]: 1.852e-05 [auto_monad]: 6.274e-05 [graph_reusing]: 5.99999e-06 [inline]: 2.41998e-06 [add_attr]: 0.0037125, [1] [add_attr_with_inline]: 0.00369984, [1] [Cycle 1]: 9.237e-05, [2] [tag_attr]: 2.248e-05 [meta_addattr_fg_expand]: 6.19001e-06 [parallel-infer-symbol]: 3.73001e-06 [pre_auto_parallel]: 4.128e-05 [insert-virtual-dataset]: 2.63e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 2.37999e-06 [pipeline_split]: 1.75001e-06 [optimize]: 0.00625403, [53] [py_interpret_to_execute]: 3.382e-05 [rewriter_before_opt_a]: 9.9e-05 [opt_a]: 0.00348448, [2] [Cycle 1]: 0.00259928, [45] [expand_dump_flag]: 3.71001e-06 [switch_simplify]: 4.252e-05 [loop_unroll]: 3.016e-05 [a_1]: 0.00078685 [with_stream_mark]: 2.594e-05 [recompute_prepare]: 1.16e-05 [updatestate_depend_eliminate]: 4.42e-06 [updatestate_assign_eliminate]: 3.55e-06 [updatestate_loads_eliminate]: 3.03998e-06 [parameter_eliminate]: 2.19999e-06 [a_2]: 0.00011428 [accelerated_algorithm]: 7.46999e-06 [shard]: 2.29999e-06 [meta_shard_fg_expand]: 2.53e-06 [shard_inline]: 7.33999e-06 [merge_send_recv]: 8.91002e-06 [auto_parallel]: 9.02999e-06 [parallel]: 1.99e-05 [flash_sp]: 9.86e-06 [merge_comm]: 3.98001e-06 [allreduce_fusion]: 3.9e-06 [matmul_add_comm_reduction]: 1.018e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 9.76e-06 [virtual_dataset]: 7.27002e-06 [get_grad_eliminate_]: 7.27002e-06 [virtual_output]: 6.67002e-06 [merge_forward]: 4.59002e-06 [cell_reuse_recompute_pass]: 1.66e-06 [offload_activation]: 1.06e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.812e-05 [merge_recompute_call_nodes]: 1.87001e-06 [before_grad]: 1.143e-05 [set_forward_comm_id_for_comm_node_pass]: 4.28001e-06 [meta_fg_expand]: 3.13998e-06 [flash_sp_send_recv_attached]: 2.99001e-06 [receive_attached]: 2.37001e-06 [after_resolve]: 1.322e-05 [a_after_grad]: 1.053e-05 [renormalize]: 0.00082265 [add_forward_monad_depend]: 7.32002e-06 [auto_monad_grad]: 2.61999e-06 [auto_monad_eliminator]: 1.919e-05 [cse]: 3.127e-05 [a_3]: 6.748e-05 [Cycle 2]: 0.00086938, [45] [expand_dump_flag]: 1.49e-06 [switch_simplify]: 8.23001e-06 [loop_unroll]: 6.68e-06 [a_1]: 0.00013642 [with_stream_mark]: 1.395e-05 [recompute_prepare]: 6.59001e-06 [updatestate_depend_eliminate]: 3.16999e-06 [updatestate_assign_eliminate]: 2.72001e-06 [updatestate_loads_eliminate]: 2.69001e-06 [parameter_eliminate]: 1.49998e-06 [a_2]: 0.00010106 [accelerated_algorithm]: 7.03e-06 [shard]: 1.76e-06 [meta_shard_fg_expand]: 1.80001e-06 [shard_inline]: 6.33998e-06 [merge_send_recv]: 6.68e-06 [auto_parallel]: 7.45998e-06 [parallel]: 6.08998e-06 [flash_sp]: 4.15999e-06 [merge_comm]: 3.51999e-06 [allreduce_fusion]: 3.3e-06 [matmul_add_comm_reduction]: 1.111e-05 [allreduce_slice_to_reducescatter]: 4.69998e-07 [virtual_shard_identity]: 7.37002e-06 [virtual_dataset]: 6.78e-06 [get_grad_eliminate_]: 6.04001e-06 [virtual_output]: 6.01e-06 [merge_forward]: 2.90998e-06 [cell_reuse_recompute_pass]: 2.91999e-06 [offload_activation]: 8.65001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.642e-05 [merge_recompute_call_nodes]: 1.03001e-06 [before_grad]: 9.67999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.35e-06 [meta_fg_expand]: 2.50002e-06 [flash_sp_send_recv_attached]: 1.07e-06 [receive_attached]: 1.14998e-06 [after_resolve]: 1.27e-05 [a_after_grad]: 1.025e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.99e-06 [auto_monad_grad]: 1.32e-06 [auto_monad_eliminator]: 8.87e-06 [cse]: 1.612e-05 [a_3]: 5.096e-05 [py_interpret_to_execute_after_opt_a]: 1.69e-05 [slice_cell_reuse_recomputed_activation]: 4.75001e-06 [rewriter_after_opt_a]: 4.338e-05 [convert_after_rewriter]: 1.043e-05 [order_py_execute_after_rewriter]: 8.67e-06 [mutable_eliminate]: 0.00071019 [opt_b]: 0.00031068, [1] [Cycle 1]: 0.00029916, [7] [b_1]: 0.00019059 [b_2]: 8.85999e-06 [updatestate_depend_eliminate]: 7.85e-06 [updatestate_assign_eliminate]: 2.71e-06 [updatestate_loads_eliminate]: 2.64999e-06 [renormalize]: 8.70001e-07 [cse]: 2.674e-05 [optimize_parallel_all_gather_comm]: 2.291e-05 [overlap_param_gather]: 4.94003e-06 [cconv]: 5.819e-05 [loop_unroll]: 0.00058168 [opt_after_cconv]: 0.00014314, [1] [Cycle 1]: 0.00013256, [7] [c_1]: 3.631e-05 [parameter_eliminate]: 4.74e-06 [updatestate_depend_eliminate]: 6.58e-06 [updatestate_assign_eliminate]: 2.58e-06 [updatestate_loads_eliminate]: 2.70002e-06 [cse]: 2.183e-05 [renormalize]: 5.8001e-07 [remove_dup_value]: 1.855e-05 [tuple_transform]: 9.846e-05, [1] [Cycle 1]: 9.057e-05, [4] [d_1]: 5.077e-05 [none_parameter_eliminate]: 1.80001e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 7.38e-06 [partial_unused_args_eliminate]: 4.82998e-06 [add_recomputation]: 5.514e-05 [cse_after_recomputation]: 3.069e-05, [1] [Cycle 1]: 2.332e-05, [1] [cse]: 1.401e-05 [environ_conv]: 9.59e-06 [swap_dp_allreduce_reducescatter]: 8.82e-06 [bias_add_comm_swap]: 5.99999e-06 [label_micro_interleaved_index]: 8.32e-06 [label_fine_grained_interleaved_index]: 5.21998e-06 [merge_cast_opt]: 3.81001e-06 [slice_recompute_activation]: 4.84e-06 [micro_interleaved_order_control]: 5.33002e-06 [assign_add_opt]: 3.65e-06 [ForceFp32Comm]: 3.4e-06 [remove_cast_before_assign_add]: 3.56001e-06 [full_micro_interleaved_order_control]: 4.42998e-06 [reorder_send_recv_between_fp_bp]: 5.86e-06 [comm_op_add_attrs]: 3.53e-06 [add_comm_op_reuse_tag]: 3.34001e-06 [interleave_split_concat_branches]: 3.68999e-06 [interleave_parallel_branches]: 3.56001e-06 [overlap_opt_shard_in_pipeline]: 3.85998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.84e-06 [control_data_broadcast_order]: 1.76e-05 [grouped_pairwise_exchange_alltoall]: 3.92998e-06 [offloading_packed_experts]: 6.47001e-06 [overlap_recompute_and_grad_model_parallel]: 7.28e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.93001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.8e-06 [overlap_recompute_comm]: 4.90001e-06 [overlap_grad_ring_attention]: 6.71999e-06 [overlap_grad_flash_sp]: 2.285e-05 [begin_end_overlap_inline]: 3.03998e-06 [split_matmul_comm_elemetwise]: 4.55999e-06 [split_layernorm_comm]: 4.37998e-06 [handle_group_info]: 3.55e-06 [symbol_engine_optimizer]: 0.00010167, [1] [Cycle 1]: 9.433e-05, [6] [build]: 3.75e-06 [elim_shapecalc]: 1.035e-05 [elim_not_effective]: 1.449e-05 [opt_reshape]: 7.31001e-06 [fold_const_symbol]: 1.116e-05 [renormalize]: 5.19998e-07 [detach_backward]: 4.07003e-06 [pipeline_parallel_scheduler]: 1.69e-06 [auto_monad_reorder]: 1.953e-05 [get_jit_bprop_graph]: 2.02999e-06 [rewriter_after_jit_bprop_graph]: 6.23e-06 [opt_after_jit_grad]: 0.0006641 [validate]: 4.939e-05 Sums bootstrap : 0.000393s : 3.23% type_inference : 0.005837s : 48.04% event_method : 0.000019s : 0.15% auto_monad : 0.000063s : 0.52% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000041s : 0.34% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000034s : 0.28% optimize.rewriter_before_opt_a : 0.000099s : 0.81% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000051s : 0.42% optimize.opt_a.loop_unroll : 0.000037s : 0.30% optimize.opt_a.a_1 : 0.000923s : 7.60% optimize.opt_a.with_stream_mark : 0.000040s : 0.33% optimize.opt_a.recompute_prepare : 0.000018s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000215s : 1.77% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.12% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000014s : 0.11% optimize.opt_a.merge_send_recv : 0.000016s : 0.13% optimize.opt_a.auto_parallel : 0.000016s : 0.14% optimize.opt_a.parallel : 0.000026s : 0.21% optimize.opt_a.flash_sp : 0.000014s : 0.12% optimize.opt_a.merge_comm : 0.000007s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.14% optimize.opt_a.virtual_dataset : 0.000014s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.11% optimize.opt_a.virtual_output : 0.000013s : 0.10% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.04% optimize.opt_a.offload_activation : 0.000019s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.17% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.21% optimize.opt_a.a_after_grad : 0.000021s : 0.17% optimize.opt_a.renormalize : 0.000823s : 6.77% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.23% optimize.opt_a.cse : 0.000047s : 0.39% optimize.opt_a.a_3 : 0.000118s : 0.97% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000043s : 0.36% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000710s : 5.85% optimize.opt_b.b_1 : 0.000191s : 1.57% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000027s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000058s : 0.48% optimize.loop_unroll : 0.000582s : 4.79% optimize.opt_after_cconv.c_1 : 0.000036s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000022s : 0.18% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.15% optimize.tuple_transform.d_1 : 0.000051s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000055s : 0.45% optimize.cse_after_recomputation.cse : 0.000014s : 0.12% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000006s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000023s : 0.19% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.16% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000664s : 5.47% validate : 0.000049s : 0.41% Time group info: ------[substitution.] 0.000210 29 1.02% : 0.000002s : 2: substitution.elim_not_effective 0.89% : 0.000002s : 2: substitution.fold_const_symbol 3.02% : 0.000006s : 4: substitution.graph_param_transform 76.67% : 0.000161s : 4: substitution.inline 2.06% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.22% : 0.000005s : 4: substitution.remove_not_recompute_node 2.99% : 0.000006s : 4: substitution.replace_old_param 7.26% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator 3.88% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005790 2 88.21% : 0.005107s : 1: type_inference.infer 11.79% : 0.000683s : 1: type_inference.specialize ------[replace.] 0.000066 8 63.15% : 0.000042s : 4: replace.inline 36.85% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000172 8 92.24% : 0.000158s : 4: match.inline 7.76% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000222 1278 0.97% : 0.000002s : 13: predicate.accumulaten_eliminater 0.74% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.73% : 0.000002s : 8: predicate.addn_check_dump 0.90% : 0.000002s : 13: predicate.addn_zero_filter 0.79% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.16% : 0.000005s : 21: predicate.arithmetic_simplify 0.94% : 0.000002s : 13: predicate.cast_eliminate 0.57% : 0.000001s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.61% : 0.000001s : 8: predicate.depend_value_elim 0.86% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.05% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.80% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.94% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.36% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_depend_swap 1.64% : 0.000004s : 25: predicate.environ_get_eliminate 1.24% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.43% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.24% : 0.000005s : 21: predicate.float_depend_g_call 0.56% : 0.000001s : 8: predicate.float_environ_get_switch 0.82% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.82% : 0.000002s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.54% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 6.11% : 0.000014s : 58: predicate.inline 0.71% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.95% : 0.000002s : 8: predicate.less_batch_normalization 1.90% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.45% : 0.000005s : 38: predicate.load_eliminater 1.06% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.26% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.87% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 8: predicate.merge_addn 0.59% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 13: predicate.minmaximum_grad 1.38% : 0.000003s : 4: predicate.mutable_eliminate 0.38% : 0.000001s : 4: predicate.opt_reshape 0.45% : 0.000001s : 4: predicate.parallel_virtual_node 1.73% : 0.000004s : 21: predicate.partial_defer_inline 1.51% : 0.000003s : 21: predicate.partial_eliminate 0.94% : 0.000002s : 13: predicate.print_const_string_wrapper 0.73% : 0.000002s : 8: predicate.reduce_all_const_elim 1.26% : 0.000003s : 13: predicate.reduce_eliminate 2.50% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.54% : 0.000001s : 8: predicate.remove_not_recompute_node 1.49% : 0.000003s : 25: predicate.replace_applicator 0.40% : 0.000001s : 8: predicate.replace_old_param 0.31% : 0.000001s : 4: predicate.reset_defer_inline 1.02% : 0.000002s : 13: predicate.reshape_eliminate 0.63% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.82% : 0.000002s : 8: predicate.same_eliminate 0.40% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.82% : 0.000002s : 8: predicate.shard_identity_eliminate 0.64% : 0.000001s : 8: predicate.special_op_eliminate 0.75% : 0.000002s : 8: predicate.specialize_transform 0.83% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.43% : 0.000003s : 21: predicate.switch_defer_inline 2.00% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.95% : 0.000011s : 67: predicate.switch_simplify 0.98% : 0.000002s : 13: predicate.tile_eliminate 0.99% : 0.000002s : 13: predicate.transpose_eliminate 1.64% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.40% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.42% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.98% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.29% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.07% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.51% : 0.000001s : 4: predicate.value_based_eliminate 0.73% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.64% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.59% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000530 11 50.08% : 0.000265s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.92% : 0.000264s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030008 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.41% : 0.003723s : 1: add_attr 12.34% : 0.003704s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.20% : 0.000059s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.24% : 0.000071s : 1: auto_monad 0.09% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.44% : 0.000433s : 1: bootstrap 0.21% : 0.000062s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000021s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.11% : 0.000034s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000022s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.09% : 0.000028s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.96% : 0.000589s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.39% : 0.000718s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000020s : 1: opt.transform.mutable_eliminate 4.57% : 0.001372s : 78: opt.transform.opt_a 0.12% : 0.000035s : 1: opt.transform.opt_after_cconv 0.10% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.41% : 0.000124s : 28: opt.transform.opt_b 0.19% : 0.000056s : 2: opt.transform.opt_trans_graph 0.13% : 0.000040s : 4: opt.transform.symbol_engine_opt 11.62% : 0.003488s : 1: opt_a 0.49% : 0.000147s : 1: opt_after_cconv 2.26% : 0.000679s : 1: opt_after_jit_grad 1.05% : 0.000314s : 1: opt_b 22.01% : 0.006605s : 1: optimize 0.09% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.09% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.16% : 0.000049s : 1: pre_auto_parallel 0.13% : 0.000038s : 1: py_interpret_to_execute 0.07% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000022s : 1: remove_dup_value 1.42% : 0.000426s : 1: renormalize.infer 1.28% : 0.000385s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000047s : 1: rewriter_after_opt_a 0.34% : 0.000103s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000009s : 1: slice_recompute_activation 0.03% : 0.000008s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000105s : 1: symbol_engine_optimizer 0.34% : 0.000101s : 1: tuple_transform 19.56% : 0.005869s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:33.650.451 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0186946, [21] [bootstrap]: 0.00048023 [type_inference]: 0.00753084 [event_method]: 2.109e-05 [auto_monad]: 7.099e-05 [graph_reusing]: 5.89e-06 [inline]: 2.88e-06 [add_attr]: 0.00380897, [1] [add_attr_with_inline]: 0.00379596, [1] [Cycle 1]: 7.147e-05, [2] [tag_attr]: 2.344e-05 [meta_addattr_fg_expand]: 6.11e-06 [parallel-infer-symbol]: 3.93001e-06 [pre_auto_parallel]: 3.728e-05 [insert-virtual-dataset]: 2.89001e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 2.13002e-06 [pipeline_split]: 1.72001e-06 [optimize]: 0.00581165, [53] [py_interpret_to_execute]: 2.82e-05 [rewriter_before_opt_a]: 8.797e-05 [opt_a]: 0.00298545, [2] [Cycle 1]: 0.00226402, [45] [expand_dump_flag]: 3.43999e-06 [switch_simplify]: 4.429e-05 [loop_unroll]: 3.175e-05 [a_1]: 0.00065397 [with_stream_mark]: 1.861e-05 [recompute_prepare]: 1.008e-05 [updatestate_depend_eliminate]: 4.25e-06 [updatestate_assign_eliminate]: 3.58e-06 [updatestate_loads_eliminate]: 3.00002e-06 [parameter_eliminate]: 1.81e-06 [a_2]: 8.458e-05 [accelerated_algorithm]: 7.58001e-06 [shard]: 1.87001e-06 [meta_shard_fg_expand]: 2.12999e-06 [shard_inline]: 6.49999e-06 [merge_send_recv]: 9.12999e-06 [auto_parallel]: 8.32998e-06 [parallel]: 1.956e-05 [flash_sp]: 9.24e-06 [merge_comm]: 3.98001e-06 [allreduce_fusion]: 3.61999e-06 [matmul_add_comm_reduction]: 1.07e-05 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 8.18001e-06 [virtual_dataset]: 6.89999e-06 [get_grad_eliminate_]: 6.91999e-06 [virtual_output]: 6.58e-06 [merge_forward]: 4.53999e-06 [cell_reuse_recompute_pass]: 1.34003e-06 [offload_activation]: 1.077e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.354e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.183e-05 [set_forward_comm_id_for_comm_node_pass]: 3.86001e-06 [meta_fg_expand]: 3.48e-06 [flash_sp_send_recv_attached]: 3.16999e-06 [receive_attached]: 2.21998e-06 [after_resolve]: 1.266e-05 [a_after_grad]: 9.69e-06 [renormalize]: 0.00079712 [add_forward_monad_depend]: 7.38e-06 [auto_monad_grad]: 2.93e-06 [auto_monad_eliminator]: 1.905e-05 [cse]: 3.124e-05 [a_3]: 5.424e-05 [Cycle 2]: 0.00071, [45] [expand_dump_flag]: 1.24998e-06 [switch_simplify]: 8.38999e-06 [loop_unroll]: 6.82002e-06 [a_1]: 0.00013601 [with_stream_mark]: 1.521e-05 [recompute_prepare]: 6.53998e-06 [updatestate_depend_eliminate]: 3.31999e-06 [updatestate_assign_eliminate]: 2.54001e-06 [updatestate_loads_eliminate]: 2.93e-06 [parameter_eliminate]: 1.60999e-06 [a_2]: 7.358e-05 [accelerated_algorithm]: 6.89999e-06 [shard]: 1.73997e-06 [meta_shard_fg_expand]: 1.93002e-06 [shard_inline]: 6.49999e-06 [merge_send_recv]: 6.84999e-06 [auto_parallel]: 7.45e-06 [parallel]: 7.48e-06 [flash_sp]: 3.5e-06 [merge_comm]: 4.07998e-06 [allreduce_fusion]: 3.38e-06 [matmul_add_comm_reduction]: 7.88001e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 6.68998e-06 [virtual_dataset]: 6.57002e-06 [get_grad_eliminate_]: 6.09999e-06 [virtual_output]: 5.81998e-06 [merge_forward]: 4.03001e-06 [cell_reuse_recompute_pass]: 2.58e-06 [offload_activation]: 9.37001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.617e-05 [merge_recompute_call_nodes]: 1.23002e-06 [before_grad]: 1.101e-05 [set_forward_comm_id_for_comm_node_pass]: 3.66999e-06 [meta_fg_expand]: 2.39001e-06 [flash_sp_send_recv_attached]: 1.39e-06 [receive_attached]: 1.47999e-06 [after_resolve]: 1.389e-05 [a_after_grad]: 9.29998e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.15002e-06 [auto_monad_grad]: 1.47001e-06 [auto_monad_eliminator]: 9.36e-06 [cse]: 1.741e-05 [a_3]: 4.015e-05 [py_interpret_to_execute_after_opt_a]: 1.461e-05 [slice_cell_reuse_recomputed_activation]: 1.85001e-06 [rewriter_after_opt_a]: 4.172e-05 [convert_after_rewriter]: 7.56999e-06 [order_py_execute_after_rewriter]: 5.30999e-06 [mutable_eliminate]: 0.00075802 [opt_b]: 0.00026575, [1] [Cycle 1]: 0.00025638, [7] [b_1]: 0.00015974 [b_2]: 1.015e-05 [updatestate_depend_eliminate]: 9.19e-06 [updatestate_assign_eliminate]: 3.09001e-06 [updatestate_loads_eliminate]: 3.04999e-06 [renormalize]: 6.89994e-07 [cse]: 2.881e-05 [optimize_parallel_all_gather_comm]: 2.034e-05 [overlap_param_gather]: 1.94999e-06 [cconv]: 3.417e-05 [loop_unroll]: 0.00062569 [opt_after_cconv]: 0.00011956, [1] [Cycle 1]: 0.00011198, [7] [c_1]: 3.479e-05 [parameter_eliminate]: 5.30001e-06 [updatestate_depend_eliminate]: 7.33999e-06 [updatestate_assign_eliminate]: 2.59001e-06 [updatestate_loads_eliminate]: 3.14001e-06 [cse]: 2.308e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.458e-05 [tuple_transform]: 8.195e-05, [1] [Cycle 1]: 7.71e-05, [4] [d_1]: 4.853e-05 [none_parameter_eliminate]: 1.52999e-06 [renormalize]: 4.19997e-07 [switch_simplify]: 7.51001e-06 [partial_unused_args_eliminate]: 1.89e-06 [add_recomputation]: 5.682e-05 [cse_after_recomputation]: 2.339e-05, [1] [Cycle 1]: 1.82e-05, [1] [cse]: 1.219e-05 [environ_conv]: 5.26998e-06 [swap_dp_allreduce_reducescatter]: 6.53e-06 [bias_add_comm_swap]: 3.34001e-06 [label_micro_interleaved_index]: 6.44001e-06 [label_fine_grained_interleaved_index]: 2.72001e-06 [merge_cast_opt]: 1.50001e-06 [slice_recompute_activation]: 2.36e-06 [micro_interleaved_order_control]: 2.86e-06 [assign_add_opt]: 1.45001e-06 [ForceFp32Comm]: 8.90024e-07 [remove_cast_before_assign_add]: 1.17e-06 [full_micro_interleaved_order_control]: 2.46e-06 [reorder_send_recv_between_fp_bp]: 2.79001e-06 [comm_op_add_attrs]: 1.07e-06 [add_comm_op_reuse_tag]: 1.15999e-06 [interleave_split_concat_branches]: 1.29e-06 [interleave_parallel_branches]: 1.34e-06 [overlap_opt_shard_in_pipeline]: 1.29e-06 [overlap_opt_shard_grad_in_pipeline]: 2.02999e-06 [control_data_broadcast_order]: 3.634e-05 [grouped_pairwise_exchange_alltoall]: 1.55999e-06 [offloading_packed_experts]: 5.05999e-06 [overlap_recompute_and_grad_model_parallel]: 5.51e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.18001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.42001e-06 [overlap_grad_ring_attention]: 0.00011897 [overlap_grad_flash_sp]: 2.62e-05 [begin_end_overlap_inline]: 5.10016e-07 [split_matmul_comm_elemetwise]: 2.50002e-06 [split_layernorm_comm]: 1.82999e-06 [handle_group_info]: 9.79984e-07 [symbol_engine_optimizer]: 9.401e-05, [1] [Cycle 1]: 8.82e-05, [6] [build]: 4.57e-06 [elim_shapecalc]: 1.47e-05 [elim_not_effective]: 1.521e-05 [opt_reshape]: 7.85998e-06 [fold_const_symbol]: 1.146e-05 [renormalize]: 2.69996e-07 [detach_backward]: 2.89001e-06 [pipeline_parallel_scheduler]: 1.61998e-06 [auto_monad_reorder]: 2.128e-05 [get_jit_bprop_graph]: 2.33002e-06 [rewriter_after_jit_bprop_graph]: 6.39999e-06 [opt_after_jit_grad]: 0.00064976 [validate]: 4.969e-05 Sums bootstrap : 0.000480s : 3.49% type_inference : 0.007531s : 54.80% event_method : 0.000021s : 0.15% auto_monad : 0.000071s : 0.52% graph_reusing : 0.000006s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000037s : 0.27% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.21% optimize.rewriter_before_opt_a : 0.000088s : 0.64% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000053s : 0.38% optimize.opt_a.loop_unroll : 0.000039s : 0.28% optimize.opt_a.a_1 : 0.000790s : 5.75% optimize.opt_a.with_stream_mark : 0.000034s : 0.25% optimize.opt_a.recompute_prepare : 0.000017s : 0.12% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000158s : 1.15% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.11% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.09% optimize.opt_a.merge_send_recv : 0.000016s : 0.12% optimize.opt_a.auto_parallel : 0.000016s : 0.11% optimize.opt_a.parallel : 0.000027s : 0.20% optimize.opt_a.flash_sp : 0.000013s : 0.09% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.11% optimize.opt_a.virtual_dataset : 0.000013s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.09% optimize.opt_a.virtual_output : 0.000012s : 0.09% optimize.opt_a.merge_forward : 0.000009s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.22% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000023s : 0.17% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.05% optimize.opt_a.meta_fg_expand : 0.000006s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.19% optimize.opt_a.a_after_grad : 0.000019s : 0.14% optimize.opt_a.renormalize : 0.000797s : 5.80% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.21% optimize.opt_a.cse : 0.000049s : 0.35% optimize.opt_a.a_3 : 0.000094s : 0.69% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000042s : 0.30% optimize.convert_after_rewriter : 0.000008s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.04% optimize.mutable_eliminate : 0.000758s : 5.52% optimize.opt_b.b_1 : 0.000160s : 1.16% optimize.opt_b.b_2 : 0.000010s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000029s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000034s : 0.25% optimize.loop_unroll : 0.000626s : 4.55% optimize.opt_after_cconv.c_1 : 0.000035s : 0.25% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000023s : 0.17% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.11% optimize.tuple_transform.d_1 : 0.000049s : 0.35% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.05% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000057s : 0.41% optimize.cse_after_recomputation.cse : 0.000012s : 0.09% optimize.environ_conv : 0.000005s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000006s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000036s : 0.26% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000119s : 0.87% optimize.overlap_grad_flash_sp : 0.000026s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.15% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000650s : 4.73% validate : 0.000050s : 0.36% Time group info: ------[substitution.] 0.000208 29 1.07% : 0.000002s : 2: substitution.elim_not_effective 0.95% : 0.000002s : 2: substitution.fold_const_symbol 3.19% : 0.000007s : 4: substitution.graph_param_transform 74.99% : 0.000156s : 4: substitution.inline 2.36% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.65% : 0.000006s : 4: substitution.remove_not_recompute_node 2.78% : 0.000006s : 4: substitution.replace_old_param 7.61% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator 4.41% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.007450 2 88.71% : 0.006609s : 1: type_inference.infer 11.29% : 0.000841s : 1: type_inference.specialize ------[replace.] 0.000063 8 63.51% : 0.000040s : 4: replace.inline 36.49% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000168 8 91.65% : 0.000154s : 4: match.inline 8.35% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000228 1278 0.92% : 0.000002s : 13: predicate.accumulaten_eliminater 0.89% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 8: predicate.addn_check_dump 0.89% : 0.000002s : 13: predicate.addn_zero_filter 0.82% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.97% : 0.000004s : 21: predicate.arithmetic_simplify 1.07% : 0.000002s : 13: predicate.cast_eliminate 0.57% : 0.000001s : 8: predicate.check_bprop_eliminate 0.57% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.60% : 0.000001s : 8: predicate.depend_value_elim 0.89% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.81% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.05% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 4: predicate.elim_not_effective 0.43% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.42% : 0.000003s : 17: predicate.environ_get_depend_swap 1.84% : 0.000004s : 25: predicate.environ_get_eliminate 0.97% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.36% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.24% : 0.000005s : 21: predicate.float_depend_g_call 0.53% : 0.000001s : 8: predicate.float_environ_get_switch 0.81% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.75% : 0.000002s : 8: predicate.get_grad_eliminate 0.26% : 0.000001s : 4: predicate.graph_param_transform 0.54% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 5.76% : 0.000013s : 58: predicate.inline 0.61% : 0.000001s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.82% : 0.000002s : 8: predicate.less_batch_normalization 1.86% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.55% : 0.000006s : 38: predicate.load_eliminater 1.60% : 0.000004s : 4: predicate.loop_unroll_after_grad 2.31% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.77% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.53% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.76% : 0.000002s : 13: predicate.minmaximum_grad 2.09% : 0.000005s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.68% : 0.000002s : 4: predicate.parallel_virtual_node 1.64% : 0.000004s : 21: predicate.partial_defer_inline 1.47% : 0.000003s : 21: predicate.partial_eliminate 0.85% : 0.000002s : 13: predicate.print_const_string_wrapper 0.60% : 0.000001s : 8: predicate.reduce_all_const_elim 1.10% : 0.000003s : 13: predicate.reduce_eliminate 2.47% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 8: predicate.remove_not_recompute_node 1.31% : 0.000003s : 25: predicate.replace_applicator 0.47% : 0.000001s : 8: predicate.replace_old_param 0.36% : 0.000001s : 4: predicate.reset_defer_inline 0.92% : 0.000002s : 13: predicate.reshape_eliminate 0.60% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 4: predicate.row_tensor_eliminate 0.74% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.82% : 0.000002s : 8: predicate.shard_identity_eliminate 0.76% : 0.000002s : 8: predicate.special_op_eliminate 0.63% : 0.000001s : 8: predicate.specialize_transform 0.78% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.46% : 0.000003s : 21: predicate.switch_defer_inline 2.08% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.84% : 0.000011s : 67: predicate.switch_simplify 1.07% : 0.000002s : 13: predicate.tile_eliminate 0.96% : 0.000002s : 13: predicate.transpose_eliminate 1.59% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.64% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.14% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.70% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.37% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.23% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.60% : 0.000001s : 4: predicate.value_based_eliminate 0.77% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.58% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000672 11 54.61% : 0.000367s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.39% : 0.000305s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030488 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.51% : 0.003815s : 1: add_attr 12.47% : 0.003801s : 1: add_attr_with_inline 0.02% : 0.000005s : 1: add_comm_op_reuse_tag 0.20% : 0.000061s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.25% : 0.000076s : 1: auto_monad 0.08% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.69% : 0.000515s : 1: bootstrap 0.12% : 0.000038s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.14% : 0.000042s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.09% : 0.000027s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.09% : 0.000029s : 1: event_method 0.02% : 0.000006s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000007s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 2.09% : 0.000636s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 2.53% : 0.000772s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000025s : 1: opt.transform.mutable_eliminate 4.04% : 0.001230s : 78: opt.transform.opt_a 0.11% : 0.000033s : 1: opt.transform.opt_after_cconv 0.10% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.43% : 0.000131s : 28: opt.transform.opt_b 0.18% : 0.000054s : 2: opt.transform.opt_trans_graph 0.15% : 0.000044s : 4: opt.transform.symbol_engine_opt 9.80% : 0.002989s : 1: opt_a 0.40% : 0.000123s : 1: opt_after_cconv 2.17% : 0.000662s : 1: opt_after_jit_grad 0.89% : 0.000270s : 1: opt_b 19.08% : 0.005818s : 1: optimize 0.08% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.42% : 0.000127s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.14% : 0.000042s : 1: pre_auto_parallel 0.11% : 0.000032s : 1: py_interpret_to_execute 0.06% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000018s : 1: remove_dup_value 1.38% : 0.000421s : 1: renormalize.infer 1.20% : 0.000366s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000046s : 1: rewriter_after_opt_a 0.30% : 0.000092s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000006s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000097s : 1: symbol_engine_optimizer 0.28% : 0.000085s : 1: tuple_transform 24.79% : 0.007558s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:34.195.406 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:34.195.733 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.105032, [21] [bootstrap]: 0.00045072 [type_inference]: 0.0185764 [event_method]: 2.328e-05 [auto_monad]: 7.21e-05 [graph_reusing]: 6.23998e-06 [inline]: 2.51e-06 [add_attr]: 0.0763963, [1] [add_attr_with_inline]: 0.0763791, [1] [Cycle 1]: 0.0731403, [2] [tag_attr]: 0.0730195 [meta_addattr_fg_expand]: 1.352e-05 [parallel-infer-symbol]: 5.19e-06 [pre_auto_parallel]: 5.039e-05 [insert-virtual-dataset]: 2.51e-06 [parallel-infer-symbol-second]: 8.39995e-07 [dataset_repeat_opt]: 2.37999e-06 [pipeline_split]: 1.72001e-06 [optimize]: 0.00683598, [53] [py_interpret_to_execute]: 4.464e-05 [rewriter_before_opt_a]: 0.0001126 [opt_a]: 0.00393404, [2] [Cycle 1]: 0.0028865, [45] [expand_dump_flag]: 3.23e-06 [switch_simplify]: 4.503e-05 [loop_unroll]: 3.118e-05 [a_1]: 0.00082163 [with_stream_mark]: 2.277e-05 [recompute_prepare]: 1.358e-05 [updatestate_depend_eliminate]: 5.51e-06 [updatestate_assign_eliminate]: 4.25999e-06 [updatestate_loads_eliminate]: 3.75998e-06 [parameter_eliminate]: 2.19999e-06 [a_2]: 0.00013405 [accelerated_algorithm]: 9.25001e-06 [shard]: 2.73e-06 [meta_shard_fg_expand]: 2.59001e-06 [shard_inline]: 8.23001e-06 [merge_send_recv]: 1.069e-05 [auto_parallel]: 9.44e-06 [parallel]: 2.062e-05 [flash_sp]: 1.038e-05 [merge_comm]: 5.05001e-06 [allreduce_fusion]: 4.44002e-06 [matmul_add_comm_reduction]: 1.142e-05 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 1.155e-05 [virtual_dataset]: 8.92e-06 [get_grad_eliminate_]: 8.22e-06 [virtual_output]: 8.89e-06 [merge_forward]: 4.63999e-06 [cell_reuse_recompute_pass]: 2.11e-06 [offload_activation]: 1.258e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.027e-05 [merge_recompute_call_nodes]: 1.74e-06 [before_grad]: 1.568e-05 [set_forward_comm_id_for_comm_node_pass]: 5.40999e-06 [meta_fg_expand]: 3.9e-06 [flash_sp_send_recv_attached]: 2.94999e-06 [receive_attached]: 2.43e-06 [after_resolve]: 1.626e-05 [a_after_grad]: 1.485e-05 [renormalize]: 0.00097866 [add_forward_monad_depend]: 6.89999e-06 [auto_monad_grad]: 2.84001e-06 [auto_monad_eliminator]: 2.062e-05 [cse]: 4.012e-05 [a_3]: 8.016e-05 [Cycle 2]: 0.00102752, [45] [expand_dump_flag]: 2.07999e-06 [switch_simplify]: 1.004e-05 [loop_unroll]: 7.65e-06 [a_1]: 0.0001865 [with_stream_mark]: 1.533e-05 [recompute_prepare]: 8.90999e-06 [updatestate_depend_eliminate]: 3.81999e-06 [updatestate_assign_eliminate]: 3.33e-06 [updatestate_loads_eliminate]: 3.08998e-06 [parameter_eliminate]: 1.50999e-06 [a_2]: 0.0001297 [accelerated_algorithm]: 9.25999e-06 [shard]: 2.12999e-06 [meta_shard_fg_expand]: 2.29001e-06 [shard_inline]: 8.41002e-06 [merge_send_recv]: 9.10001e-06 [auto_parallel]: 7.41999e-06 [parallel]: 6.65002e-06 [flash_sp]: 3.58999e-06 [merge_comm]: 4.3e-06 [allreduce_fusion]: 3.95998e-06 [matmul_add_comm_reduction]: 9.49e-06 [allreduce_slice_to_reducescatter]: 3.89991e-07 [virtual_shard_identity]: 1.06e-05 [virtual_dataset]: 7.25e-06 [get_grad_eliminate_]: 7.11001e-06 [virtual_output]: 6.94999e-06 [merge_forward]: 4.27998e-06 [cell_reuse_recompute_pass]: 2.43e-06 [offload_activation]: 9.32001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.761e-05 [merge_recompute_call_nodes]: 9.39996e-07 [before_grad]: 1.428e-05 [set_forward_comm_id_for_comm_node_pass]: 5.15999e-06 [meta_fg_expand]: 2.96999e-06 [flash_sp_send_recv_attached]: 1.44998e-06 [receive_attached]: 1.67001e-06 [after_resolve]: 1.522e-05 [a_after_grad]: 1.341e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 2.09e-06 [auto_monad_grad]: 1.43002e-06 [auto_monad_eliminator]: 1.259e-05 [cse]: 2.365e-05 [a_3]: 6.392e-05 [py_interpret_to_execute_after_opt_a]: 1.862e-05 [slice_cell_reuse_recomputed_activation]: 5.17e-06 [rewriter_after_opt_a]: 5.391e-05 [convert_after_rewriter]: 1.139e-05 [order_py_execute_after_rewriter]: 9.94001e-06 [mutable_eliminate]: 0.0007847 [opt_b]: 0.00035101, [1] [Cycle 1]: 0.00033916, [7] [b_1]: 0.00021828 [b_2]: 1.127e-05 [updatestate_depend_eliminate]: 9.12999e-06 [updatestate_assign_eliminate]: 3.81001e-06 [updatestate_loads_eliminate]: 4.10998e-06 [renormalize]: 1.14e-06 [cse]: 3.022e-05 [optimize_parallel_all_gather_comm]: 2.398e-05 [overlap_param_gather]: 4.97999e-06 [cconv]: 3.842e-05 [loop_unroll]: 0.00050972 [opt_after_cconv]: 0.00015187, [1] [Cycle 1]: 0.00014205, [7] [c_1]: 3.909e-05 [parameter_eliminate]: 4.62998e-06 [updatestate_depend_eliminate]: 8.11002e-06 [updatestate_assign_eliminate]: 3.56001e-06 [updatestate_loads_eliminate]: 3.28998e-06 [cse]: 2.44e-05 [renormalize]: 5.3001e-07 [remove_dup_value]: 2.056e-05 [tuple_transform]: 0.00010132, [1] [Cycle 1]: 9.353e-05, [4] [d_1]: 5.262e-05 [none_parameter_eliminate]: 1.72001e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.35001e-06 [partial_unused_args_eliminate]: 4.75999e-06 [add_recomputation]: 6.521e-05 [cse_after_recomputation]: 3.51e-05, [1] [Cycle 1]: 2.801e-05, [1] [cse]: 1.753e-05 [environ_conv]: 1.053e-05 [swap_dp_allreduce_reducescatter]: 1.004e-05 [bias_add_comm_swap]: 5.97001e-06 [label_micro_interleaved_index]: 7.83999e-06 [label_fine_grained_interleaved_index]: 5.37999e-06 [merge_cast_opt]: 4.30999e-06 [slice_recompute_activation]: 4.38999e-06 [micro_interleaved_order_control]: 4.48999e-06 [assign_add_opt]: 3.55998e-06 [ForceFp32Comm]: 3.35e-06 [remove_cast_before_assign_add]: 3.28e-06 [full_micro_interleaved_order_control]: 4.38001e-06 [reorder_send_recv_between_fp_bp]: 5.56e-06 [comm_op_add_attrs]: 3.48e-06 [add_comm_op_reuse_tag]: 3.49001e-06 [interleave_split_concat_branches]: 3.41001e-06 [interleave_parallel_branches]: 3.25998e-06 [overlap_opt_shard_in_pipeline]: 3.41001e-06 [overlap_opt_shard_grad_in_pipeline]: 3.98001e-06 [control_data_broadcast_order]: 2.118e-05 [grouped_pairwise_exchange_alltoall]: 4.15e-06 [offloading_packed_experts]: 8.70001e-06 [overlap_recompute_and_grad_model_parallel]: 8.17e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.55e-06 [overlap_recompute_allgather_and_fa_grad]: 3.82998e-06 [overlap_recompute_comm]: 4.53001e-06 [overlap_grad_ring_attention]: 7.70998e-06 [overlap_grad_flash_sp]: 2.778e-05 [begin_end_overlap_inline]: 3.2e-06 [split_matmul_comm_elemetwise]: 4.57e-06 [split_layernorm_comm]: 4.40999e-06 [handle_group_info]: 3.74002e-06 [symbol_engine_optimizer]: 0.00012834, [1] [Cycle 1]: 0.00011994, [6] [build]: 3.90998e-06 [elim_shapecalc]: 1.292e-05 [elim_not_effective]: 1.724e-05 [opt_reshape]: 8.60999e-06 [fold_const_symbol]: 1.378e-05 [renormalize]: 6.29982e-07 [detach_backward]: 6.55002e-06 [pipeline_parallel_scheduler]: 2.17999e-06 [auto_monad_reorder]: 3.429e-05 [get_jit_bprop_graph]: 2.68003e-06 [rewriter_after_jit_bprop_graph]: 7.01001e-06 [opt_after_jit_grad]: 0.00060475 [validate]: 5.042e-05 Sums bootstrap : 0.000451s : 0.46% type_inference : 0.018576s : 18.87% event_method : 0.000023s : 0.02% auto_monad : 0.000072s : 0.07% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.073019s : 74.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000014s : 0.01% parallel-infer-symbol : 0.000005s : 0.01% pre_auto_parallel : 0.000050s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000045s : 0.05% optimize.rewriter_before_opt_a : 0.000113s : 0.11% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000055s : 0.06% optimize.opt_a.loop_unroll : 0.000039s : 0.04% optimize.opt_a.a_1 : 0.001008s : 1.02% optimize.opt_a.with_stream_mark : 0.000038s : 0.04% optimize.opt_a.recompute_prepare : 0.000022s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000264s : 0.27% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.02% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000017s : 0.02% optimize.opt_a.merge_send_recv : 0.000020s : 0.02% optimize.opt_a.auto_parallel : 0.000017s : 0.02% optimize.opt_a.parallel : 0.000027s : 0.03% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000009s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.02% optimize.opt_a.virtual_dataset : 0.000016s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.02% optimize.opt_a.virtual_output : 0.000016s : 0.02% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000022s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000030s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000031s : 0.03% optimize.opt_a.a_after_grad : 0.000028s : 0.03% optimize.opt_a.renormalize : 0.000979s : 0.99% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.03% optimize.opt_a.cse : 0.000064s : 0.06% optimize.opt_a.a_3 : 0.000144s : 0.15% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000054s : 0.05% optimize.convert_after_rewriter : 0.000011s : 0.01% optimize.order_py_execute_after_rewriter : 0.000010s : 0.01% optimize.mutable_eliminate : 0.000785s : 0.80% optimize.opt_b.b_1 : 0.000218s : 0.22% optimize.opt_b.b_2 : 0.000011s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.02% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000038s : 0.04% optimize.loop_unroll : 0.000510s : 0.52% optimize.opt_after_cconv.c_1 : 0.000039s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000021s : 0.02% optimize.tuple_transform.d_1 : 0.000053s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000065s : 0.07% optimize.cse_after_recomputation.cse : 0.000018s : 0.02% optimize.environ_conv : 0.000011s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000004s : 0.00% optimize.micro_interleaved_order_control : 0.000004s : 0.00% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000003s : 0.00% optimize.full_micro_interleaved_order_control : 0.000004s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000003s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000003s : 0.00% optimize.interleave_parallel_branches : 0.000003s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000021s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000009s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000008s : 0.01% optimize.overlap_grad_flash_sp : 0.000028s : 0.03% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000007s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000034s : 0.03% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000605s : 0.61% validate : 0.000050s : 0.05% Time group info: ------[substitution.] 0.000266 39 10.41% : 0.000028s : 3: substitution.cast_eliminate 0.78% : 0.000002s : 3: substitution.elim_not_effective 0.83% : 0.000002s : 3: substitution.fold_const_symbol 2.57% : 0.000007s : 5: substitution.graph_param_transform 68.79% : 0.000183s : 4: substitution.inline 1.93% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.39% : 0.000006s : 6: substitution.remove_not_recompute_node 2.47% : 0.000007s : 4: substitution.replace_old_param 6.58% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator 3.26% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.018511 2 95.53% : 0.017684s : 1: type_inference.infer 4.47% : 0.000827s : 1: type_inference.specialize ------[replace.] 0.000068 8 61.24% : 0.000042s : 4: replace.inline 38.76% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000196 8 92.22% : 0.000180s : 4: match.inline 7.78% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000284 1596 0.89% : 0.000003s : 17: predicate.accumulaten_eliminater 0.84% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000002s : 10: predicate.addn_check_dump 0.92% : 0.000003s : 17: predicate.addn_zero_filter 0.84% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.10% : 0.000006s : 27: predicate.arithmetic_simplify 1.02% : 0.000003s : 17: predicate.cast_eliminate 0.77% : 0.000002s : 10: predicate.check_bprop_eliminate 3.23% : 0.000009s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.67% : 0.000002s : 10: predicate.depend_value_elim 0.98% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.88% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.90% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 5: predicate.elim_not_effective 0.41% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.07% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.09% : 0.000003s : 22: predicate.environ_get_depend_swap 1.75% : 0.000005s : 32: predicate.environ_get_eliminate 1.05% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.35% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.35% : 0.000007s : 25: predicate.float_depend_g_call 0.65% : 0.000002s : 10: predicate.float_environ_get_switch 0.75% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.61% : 0.000002s : 10: predicate.get_grad_eliminate 0.17% : 0.000000s : 5: predicate.graph_param_transform 0.57% : 0.000002s : 10: predicate.incorporate_call 0.44% : 0.000001s : 10: predicate.incorporate_call_switch 6.31% : 0.000018s : 72: predicate.inline 1.09% : 0.000003s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.75% : 0.000002s : 10: predicate.less_batch_normalization 1.67% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.49% : 0.000007s : 48: predicate.load_eliminater 0.77% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.83% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.82% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.56% : 0.000002s : 10: predicate.merge_addn 0.77% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.56% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 17: predicate.minmaximum_grad 0.91% : 0.000003s : 5: predicate.mutable_eliminate 0.32% : 0.000001s : 5: predicate.opt_reshape 0.31% : 0.000001s : 5: predicate.parallel_virtual_node 1.73% : 0.000005s : 25: predicate.partial_defer_inline 1.58% : 0.000004s : 26: predicate.partial_eliminate 0.87% : 0.000002s : 17: predicate.print_const_string_wrapper 0.56% : 0.000002s : 10: predicate.reduce_all_const_elim 1.20% : 0.000003s : 17: predicate.reduce_eliminate 2.41% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 10: predicate.remove_not_recompute_node 1.33% : 0.000004s : 31: predicate.replace_applicator 0.52% : 0.000001s : 10: predicate.replace_old_param 0.38% : 0.000001s : 5: predicate.reset_defer_inline 0.96% : 0.000003s : 17: predicate.reshape_eliminate 0.64% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 5: predicate.row_tensor_eliminate 0.72% : 0.000002s : 10: predicate.same_eliminate 0.42% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.98% : 0.000003s : 10: predicate.shard_identity_eliminate 0.62% : 0.000002s : 10: predicate.special_op_eliminate 0.76% : 0.000002s : 10: predicate.specialize_transform 0.92% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.94% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.48% : 0.000004s : 25: predicate.switch_defer_inline 1.95% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.45% : 0.000013s : 76: predicate.switch_simplify 0.88% : 0.000002s : 17: predicate.tile_eliminate 0.97% : 0.000003s : 17: predicate.transpose_eliminate 1.47% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.47% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.10% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.76% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.33% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.08% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 5: predicate.value_based_eliminate 0.64% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.58% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.012108 11 97.57% : 0.011813s : 5: func_graph_cloner_run.FuncGraphClonerGraph 2.43% : 0.000294s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.191034 192 0.00% : 0.000006s : 1: ForceFp32Comm 40.00% : 0.076410s : 1: add_attr 39.98% : 0.076383s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.04% : 0.000070s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.04% : 0.000081s : 1: auto_monad 0.02% : 0.000043s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.26% : 0.000496s : 1: bootstrap 0.02% : 0.000042s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.01% : 0.000025s : 1: control_data_broadcast_order 0.01% : 0.000015s : 1: convert_after_rewriter 0.02% : 0.000038s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000038s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.02% : 0.000035s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000010s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 0.27% : 0.000517s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000007s : 1: micro_interleaved_order_control 0.42% : 0.000793s : 1: mutable_eliminate 0.01% : 0.000012s : 1: offloading_packed_experts 0.01% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000022s : 1: opt.transform.mutable_eliminate 0.83% : 0.001580s : 78: opt.transform.opt_a 0.02% : 0.000038s : 1: opt.transform.opt_after_cconv 0.02% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.08% : 0.000153s : 28: opt.transform.opt_b 0.03% : 0.000059s : 2: opt.transform.opt_trans_graph 0.03% : 0.000049s : 4: opt.transform.symbol_engine_opt 2.06% : 0.003938s : 1: opt_a 0.08% : 0.000156s : 1: opt_after_cconv 0.32% : 0.000616s : 1: opt_after_jit_grad 0.19% : 0.000355s : 1: opt_b 4.42% : 0.008443s : 1: optimize 0.01% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000013s : 1: order_py_execute_after_rewriter 0.02% : 0.000032s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000007s : 1: overlap_recompute_comm 0.01% : 0.000013s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000012s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.03% : 0.000059s : 1: pre_auto_parallel 0.03% : 0.000049s : 1: py_interpret_to_execute 0.01% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.01% : 0.000024s : 1: remove_dup_value 0.28% : 0.000527s : 1: renormalize.infer 0.23% : 0.000441s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000058s : 1: rewriter_after_opt_a 0.06% : 0.000117s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000007s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000132s : 1: symbol_engine_optimizer 0.05% : 0.000104s : 1: tuple_transform 9.75% : 0.018622s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:34.902.672 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0996143, [21] [bootstrap]: 0.0004659 [type_inference]: 0.0774979 [event_method]: 2.2e-05 [auto_monad]: 7.345e-05 [graph_reusing]: 5.81998e-06 [inline]: 3.24001e-06 [add_attr]: 0.00377882, [1] [add_attr_with_inline]: 0.00376561, [1] [Cycle 1]: 8.126e-05, [2] [tag_attr]: 2.659e-05 [meta_addattr_fg_expand]: 6.15002e-06 [parallel-infer-symbol]: 3.78001e-06 [pre_auto_parallel]: 4.522e-05 [insert-virtual-dataset]: 2.55997e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 2.50002e-06 [pipeline_split]: 1.59998e-06 [optimize]: 0.0169097, [53] [py_interpret_to_execute]: 3.412e-05 [rewriter_before_opt_a]: 9.981e-05 [opt_a]: 0.0143408, [2] [Cycle 1]: 0.0133886, [45] [expand_dump_flag]: 3.27002e-06 [switch_simplify]: 4.663e-05 [loop_unroll]: 3.377e-05 [a_1]: 0.00084332 [with_stream_mark]: 2.703e-05 [recompute_prepare]: 1.629e-05 [updatestate_depend_eliminate]: 5.40001e-06 [updatestate_assign_eliminate]: 4.09002e-06 [updatestate_loads_eliminate]: 3.8e-06 [parameter_eliminate]: 1.99e-06 [a_2]: 0.00011044 [accelerated_algorithm]: 1.072e-05 [shard]: 3.21999e-06 [meta_shard_fg_expand]: 2.50002e-06 [shard_inline]: 8.38999e-06 [merge_send_recv]: 1.1e-05 [auto_parallel]: 9.37999e-06 [parallel]: 2.143e-05 [flash_sp]: 1.315e-05 [merge_comm]: 5.64e-06 [allreduce_fusion]: 5.10999e-06 [matmul_add_comm_reduction]: 1.224e-05 [allreduce_slice_to_reducescatter]: 8.90024e-07 [virtual_shard_identity]: 1.321e-05 [virtual_dataset]: 9.39e-06 [get_grad_eliminate_]: 8.79e-06 [virtual_output]: 8.46002e-06 [merge_forward]: 5.00001e-06 [cell_reuse_recompute_pass]: 1.96e-06 [offload_activation]: 1.327e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.202e-05 [merge_recompute_call_nodes]: 2.03002e-06 [before_grad]: 1.607e-05 [set_forward_comm_id_for_comm_node_pass]: 5.52999e-06 [meta_fg_expand]: 3.95e-06 [flash_sp_send_recv_attached]: 3.5e-06 [receive_attached]: 1.96e-06 [after_resolve]: 1.579e-05 [a_after_grad]: 1.462e-05 [renormalize]: 0.011576 [add_forward_monad_depend]: 1.163e-05 [auto_monad_grad]: 3.11001e-06 [auto_monad_eliminator]: 2.577e-05 [cse]: 3.871e-05 [a_3]: 7.338e-05 [Cycle 2]: 0.00093771, [45] [expand_dump_flag]: 2.19999e-06 [switch_simplify]: 1.255e-05 [loop_unroll]: 8.03999e-06 [a_1]: 0.00019731 [with_stream_mark]: 2.33e-05 [recompute_prepare]: 9.47999e-06 [updatestate_depend_eliminate]: 5.00001e-06 [updatestate_assign_eliminate]: 3.61999e-06 [updatestate_loads_eliminate]: 3.78999e-06 [parameter_eliminate]: 2.06e-06 [a_2]: 9.444e-05 [accelerated_algorithm]: 8.98002e-06 [shard]: 2.53e-06 [meta_shard_fg_expand]: 2.32999e-06 [shard_inline]: 7.25998e-06 [merge_send_recv]: 1.188e-05 [auto_parallel]: 1.142e-05 [parallel]: 8.58001e-06 [flash_sp]: 3.287e-05 [merge_comm]: 5.64e-06 [allreduce_fusion]: 4.49002e-06 [matmul_add_comm_reduction]: 1.319e-05 [allreduce_slice_to_reducescatter]: 1.30999e-06 [virtual_shard_identity]: 1.242e-05 [virtual_dataset]: 8.80999e-06 [get_grad_eliminate_]: 7.54002e-06 [virtual_output]: 8.15e-06 [merge_forward]: 5.63002e-06 [cell_reuse_recompute_pass]: 3.34001e-06 [offload_activation]: 1.207e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.846e-05 [merge_recompute_call_nodes]: 1.50001e-06 [before_grad]: 1.344e-05 [set_forward_comm_id_for_comm_node_pass]: 5.30999e-06 [meta_fg_expand]: 3.75e-06 [flash_sp_send_recv_attached]: 1.82001e-06 [receive_attached]: 3.14999e-06 [after_resolve]: 1.637e-05 [a_after_grad]: 1.245e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.64001e-06 [auto_monad_grad]: 1.69998e-06 [auto_monad_eliminator]: 1.403e-05 [cse]: 2.488e-05 [a_3]: 4.706e-05 [py_interpret_to_execute_after_opt_a]: 1.921e-05 [slice_cell_reuse_recomputed_activation]: 2.01e-06 [rewriter_after_opt_a]: 4.91e-05 [convert_after_rewriter]: 7.75e-06 [order_py_execute_after_rewriter]: 5.71e-06 [mutable_eliminate]: 0.00073844 [opt_b]: 0.00028417, [1] [Cycle 1]: 0.00027661, [7] [b_1]: 0.00017458 [b_2]: 1.038e-05 [updatestate_depend_eliminate]: 1.075e-05 [updatestate_assign_eliminate]: 3.34001e-06 [updatestate_loads_eliminate]: 3.60998e-06 [renormalize]: 9.30013e-07 [cse]: 3.321e-05 [optimize_parallel_all_gather_comm]: 2.165e-05 [overlap_param_gather]: 1.81998e-06 [cconv]: 3.183e-05 [loop_unroll]: 0.00049166 [opt_after_cconv]: 0.00012941, [1] [Cycle 1]: 0.00012298, [7] [c_1]: 3.85e-05 [parameter_eliminate]: 5.22e-06 [updatestate_depend_eliminate]: 7.68001e-06 [updatestate_assign_eliminate]: 2.88998e-06 [updatestate_loads_eliminate]: 3.25998e-06 [cse]: 2.829e-05 [renormalize]: 5.19998e-07 [remove_dup_value]: 1.602e-05 [tuple_transform]: 9.148e-05, [1] [Cycle 1]: 8.612e-05, [4] [d_1]: 5.698e-05 [none_parameter_eliminate]: 1.83002e-06 [renormalize]: 1.99972e-07 [switch_simplify]: 8.62e-06 [partial_unused_args_eliminate]: 1.82999e-06 [add_recomputation]: 6.194e-05 [cse_after_recomputation]: 2.874e-05, [1] [Cycle 1]: 2.343e-05, [1] [cse]: 1.625e-05 [environ_conv]: 8.23999e-06 [swap_dp_allreduce_reducescatter]: 6.76e-06 [bias_add_comm_swap]: 4e-06 [label_micro_interleaved_index]: 5.47001e-06 [label_fine_grained_interleaved_index]: 2.81e-06 [merge_cast_opt]: 1.27e-06 [slice_recompute_activation]: 2.36e-06 [micro_interleaved_order_control]: 2.64001e-06 [assign_add_opt]: 1.39e-06 [ForceFp32Comm]: 1.17e-06 [remove_cast_before_assign_add]: 1.39e-06 [full_micro_interleaved_order_control]: 2.04e-06 [reorder_send_recv_between_fp_bp]: 2.62001e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.10999e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.69e-06 [overlap_opt_shard_grad_in_pipeline]: 1.85001e-06 [control_data_broadcast_order]: 1.714e-05 [grouped_pairwise_exchange_alltoall]: 2.21e-06 [offloading_packed_experts]: 4.64998e-06 [overlap_recompute_and_grad_model_parallel]: 5.47001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.24e-06 [overlap_recompute_allgather_and_fa_grad]: 1.38002e-06 [overlap_recompute_comm]: 2.19001e-06 [overlap_grad_ring_attention]: 4.60001e-06 [overlap_grad_flash_sp]: 2.528e-05 [begin_end_overlap_inline]: 7.50006e-07 [split_matmul_comm_elemetwise]: 2.01e-06 [split_layernorm_comm]: 1.91e-06 [handle_group_info]: 1.04e-06 [symbol_engine_optimizer]: 9.831e-05, [1] [Cycle 1]: 9.353e-05, [6] [build]: 4.37998e-06 [elim_shapecalc]: 1.538e-05 [elim_not_effective]: 1.781e-05 [opt_reshape]: 9.55001e-06 [fold_const_symbol]: 1.272e-05 [renormalize]: 2.3999e-07 [detach_backward]: 2.23998e-06 [pipeline_parallel_scheduler]: 1.76e-06 [auto_monad_reorder]: 2.331e-05 [get_jit_bprop_graph]: 1.92999e-06 [rewriter_after_jit_bprop_graph]: 5.72999e-06 [opt_after_jit_grad]: 0.00054626 [validate]: 5.063e-05 Sums bootstrap : 0.000466s : 0.49% type_inference : 0.077498s : 81.81% event_method : 0.000022s : 0.02% auto_monad : 0.000073s : 0.08% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000045s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000034s : 0.04% optimize.rewriter_before_opt_a : 0.000100s : 0.11% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000059s : 0.06% optimize.opt_a.loop_unroll : 0.000042s : 0.04% optimize.opt_a.a_1 : 0.001041s : 1.10% optimize.opt_a.with_stream_mark : 0.000050s : 0.05% optimize.opt_a.recompute_prepare : 0.000026s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000205s : 0.22% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.02% optimize.opt_a.shard : 0.000006s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.02% optimize.opt_a.merge_send_recv : 0.000023s : 0.02% optimize.opt_a.auto_parallel : 0.000021s : 0.02% optimize.opt_a.parallel : 0.000030s : 0.03% optimize.opt_a.flash_sp : 0.000046s : 0.05% optimize.opt_a.merge_comm : 0.000011s : 0.01% optimize.opt_a.allreduce_fusion : 0.000010s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000026s : 0.03% optimize.opt_a.virtual_dataset : 0.000018s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.02% optimize.opt_a.virtual_output : 0.000017s : 0.02% optimize.opt_a.merge_forward : 0.000011s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000025s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000040s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000030s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.01% optimize.opt_a.meta_fg_expand : 0.000008s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000032s : 0.03% optimize.opt_a.a_after_grad : 0.000027s : 0.03% optimize.opt_a.renormalize : 0.011576s : 12.22% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.02% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000040s : 0.04% optimize.opt_a.cse : 0.000064s : 0.07% optimize.opt_a.a_3 : 0.000120s : 0.13% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000049s : 0.05% optimize.convert_after_rewriter : 0.000008s : 0.01% optimize.order_py_execute_after_rewriter : 0.000006s : 0.01% optimize.mutable_eliminate : 0.000738s : 0.78% optimize.opt_b.b_1 : 0.000175s : 0.18% optimize.opt_b.b_2 : 0.000010s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000033s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000032s : 0.03% optimize.loop_unroll : 0.000492s : 0.52% optimize.opt_after_cconv.c_1 : 0.000039s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000028s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.02% optimize.tuple_transform.d_1 : 0.000057s : 0.06% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000062s : 0.07% optimize.cse_after_recomputation.cse : 0.000016s : 0.02% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.01% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000025s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000023s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000546s : 0.58% validate : 0.000051s : 0.05% Time group info: ------[substitution.] 0.000273 39 11.71% : 0.000032s : 3: substitution.cast_eliminate 0.84% : 0.000002s : 3: substitution.elim_not_effective 0.72% : 0.000002s : 3: substitution.fold_const_symbol 2.63% : 0.000007s : 5: substitution.graph_param_transform 66.78% : 0.000183s : 4: substitution.inline 2.25% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.92% : 0.000008s : 6: substitution.remove_not_recompute_node 2.61% : 0.000007s : 4: substitution.replace_old_param 6.34% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator 3.21% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.077423 2 98.94% : 0.076603s : 1: type_inference.infer 1.06% : 0.000820s : 1: type_inference.specialize ------[replace.] 0.000078 8 57.39% : 0.000045s : 4: replace.inline 42.61% : 0.000033s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000195 8 92.23% : 0.000180s : 4: match.inline 7.77% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000284 1596 0.96% : 0.000003s : 17: predicate.accumulaten_eliminater 0.99% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 10: predicate.addn_check_dump 0.98% : 0.000003s : 17: predicate.addn_zero_filter 0.83% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.25% : 0.000006s : 27: predicate.arithmetic_simplify 1.05% : 0.000003s : 17: predicate.cast_eliminate 0.58% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000001s : 5: predicate.const_output_eliminate 0.65% : 0.000002s : 10: predicate.depend_value_elim 1.01% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.13% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.93% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.09% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 5: predicate.elim_not_effective 0.48% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.06% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.21% : 0.000003s : 22: predicate.environ_get_depend_swap 1.76% : 0.000005s : 32: predicate.environ_get_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.37% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.16% : 0.000006s : 25: predicate.float_depend_g_call 0.56% : 0.000002s : 10: predicate.float_environ_get_switch 0.93% : 0.000003s : 15: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 5: predicate.fold_const_symbol 0.65% : 0.000002s : 10: predicate.get_grad_eliminate 0.28% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000002s : 10: predicate.incorporate_call 0.45% : 0.000001s : 10: predicate.incorporate_call_switch 6.00% : 0.000017s : 72: predicate.inline 0.82% : 0.000002s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.91% : 0.000003s : 10: predicate.less_batch_normalization 1.80% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.39% : 0.000007s : 48: predicate.load_eliminater 1.13% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.10% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.81% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.54% : 0.000002s : 10: predicate.merge_addn 0.51% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.54% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.97% : 0.000003s : 17: predicate.minmaximum_grad 1.10% : 0.000003s : 5: predicate.mutable_eliminate 0.45% : 0.000001s : 5: predicate.opt_reshape 0.31% : 0.000001s : 5: predicate.parallel_virtual_node 1.69% : 0.000005s : 25: predicate.partial_defer_inline 1.62% : 0.000005s : 26: predicate.partial_eliminate 0.86% : 0.000002s : 17: predicate.print_const_string_wrapper 0.60% : 0.000002s : 10: predicate.reduce_all_const_elim 1.24% : 0.000004s : 17: predicate.reduce_eliminate 2.54% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 10: predicate.remove_not_recompute_node 1.30% : 0.000004s : 31: predicate.replace_applicator 0.59% : 0.000002s : 10: predicate.replace_old_param 0.37% : 0.000001s : 5: predicate.reset_defer_inline 0.97% : 0.000003s : 17: predicate.reshape_eliminate 0.62% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 5: predicate.row_tensor_eliminate 0.91% : 0.000003s : 10: predicate.same_eliminate 0.37% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 10: predicate.shard_identity_eliminate 0.59% : 0.000002s : 10: predicate.special_op_eliminate 0.66% : 0.000002s : 10: predicate.specialize_transform 1.00% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 1.04% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.46% : 0.000004s : 25: predicate.switch_defer_inline 2.05% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.60% : 0.000013s : 76: predicate.switch_simplify 0.94% : 0.000003s : 17: predicate.tile_eliminate 0.93% : 0.000003s : 17: predicate.transpose_eliminate 1.84% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.33% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.50% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.75% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.46% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 2.98% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 5: predicate.value_based_eliminate 0.76% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.56% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000724 11 52.46% : 0.000380s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.54% : 0.000344s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.133718 192 0.00% : 0.000004s : 1: ForceFp32Comm 2.83% : 0.003786s : 1: add_attr 2.82% : 0.003770s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000066s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.06% : 0.000079s : 1: auto_monad 0.02% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.37% : 0.000498s : 1: bootstrap 0.03% : 0.000035s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000021s : 1: control_data_broadcast_order 0.01% : 0.000011s : 1: convert_after_rewriter 0.02% : 0.000032s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.02% : 0.000029s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.37% : 0.000501s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.56% : 0.000749s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.02% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000024s : 1: opt.transform.mutable_eliminate 1.21% : 0.001618s : 78: opt.transform.opt_a 0.03% : 0.000037s : 1: opt.transform.opt_after_cconv 0.03% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.11% : 0.000150s : 28: opt.transform.opt_b 0.05% : 0.000063s : 2: opt.transform.opt_trans_graph 0.04% : 0.000051s : 4: opt.transform.symbol_engine_opt 10.73% : 0.014344s : 1: opt_a 0.10% : 0.000133s : 1: opt_after_cconv 0.42% : 0.000557s : 1: opt_after_jit_grad 0.22% : 0.000289s : 1: opt_b 12.65% : 0.016916s : 1: optimize 0.02% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.02% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.04% : 0.000050s : 1: pre_auto_parallel 0.03% : 0.000039s : 1: py_interpret_to_execute 0.02% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 8.28% : 0.011069s : 1: renormalize.infer 0.37% : 0.000491s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000055s : 1: rewriter_after_opt_a 0.08% : 0.000105s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.08% : 0.000102s : 1: symbol_engine_optimizer 0.07% : 0.000095s : 1: tuple_transform 57.97% : 0.077518s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:35.651.580 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:35.651.856 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0353571, [21] [bootstrap]: 0.00050653 [type_inference]: 0.00615839 [event_method]: 2.051e-05 [auto_monad]: 7.091e-05 [graph_reusing]: 6.72002e-06 [inline]: 2.27999e-06 [add_attr]: 0.00327184, [1] [add_attr_with_inline]: 0.0032607, [1] [Cycle 1]: 8.042e-05, [2] [tag_attr]: 2.113e-05 [meta_addattr_fg_expand]: 6.46999e-06 [parallel-infer-symbol]: 3.63999e-06 [pre_auto_parallel]: 3.819e-05 [insert-virtual-dataset]: 2.60997e-06 [parallel-infer-symbol-second]: 6.90023e-07 [dataset_repeat_opt]: 2.07999e-06 [pipeline_split]: 1.63002e-06 [optimize]: 0.0240545, [53] [py_interpret_to_execute]: 3.187e-05 [rewriter_before_opt_a]: 9.519e-05 [opt_a]: 0.0213059, [2] [Cycle 1]: 0.0202303, [45] [expand_dump_flag]: 3.09999e-06 [switch_simplify]: 4.537e-05 [loop_unroll]: 3.144e-05 [a_1]: 0.00078096 [with_stream_mark]: 2.065e-05 [recompute_prepare]: 1.311e-05 [updatestate_depend_eliminate]: 5.09e-06 [updatestate_assign_eliminate]: 4.09002e-06 [updatestate_loads_eliminate]: 3.91999e-06 [parameter_eliminate]: 2.08998e-06 [a_2]: 0.0001326 [accelerated_algorithm]: 9.54e-06 [shard]: 1.92999e-06 [meta_shard_fg_expand]: 1.94999e-06 [shard_inline]: 9.10999e-06 [merge_send_recv]: 9.90002e-06 [auto_parallel]: 8.48999e-06 [parallel]: 1.833e-05 [flash_sp]: 1.023e-05 [merge_comm]: 5.42999e-06 [allreduce_fusion]: 4.17e-06 [matmul_add_comm_reduction]: 1.117e-05 [allreduce_slice_to_reducescatter]: 8.39995e-07 [virtual_shard_identity]: 1.12e-05 [virtual_dataset]: 8.25e-06 [get_grad_eliminate_]: 7.18e-06 [virtual_output]: 8.77e-06 [merge_forward]: 5.77001e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 1.13e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.068e-05 [merge_recompute_call_nodes]: 1.94999e-06 [before_grad]: 1.329e-05 [set_forward_comm_id_for_comm_node_pass]: 5.32999e-06 [meta_fg_expand]: 3.58999e-06 [flash_sp_send_recv_attached]: 3.03998e-06 [receive_attached]: 2.48e-06 [after_resolve]: 1.393e-05 [a_after_grad]: 1.192e-05 [renormalize]: 0.0183667 [add_forward_monad_depend]: 8.97999e-06 [auto_monad_grad]: 2.59999e-06 [auto_monad_eliminator]: 2.185e-05 [cse]: 3.966e-05 [a_3]: 8.782e-05 [Cycle 2]: 0.00105651, [45] [expand_dump_flag]: 2.47001e-06 [switch_simplify]: 1.04e-05 [loop_unroll]: 8.23999e-06 [a_1]: 0.00019501 [with_stream_mark]: 1.905e-05 [recompute_prepare]: 1.015e-05 [updatestate_depend_eliminate]: 4.98001e-06 [updatestate_assign_eliminate]: 3.92002e-06 [updatestate_loads_eliminate]: 3.7e-06 [parameter_eliminate]: 2.28002e-06 [a_2]: 0.00012335 [accelerated_algorithm]: 9.55001e-06 [shard]: 2.21e-06 [meta_shard_fg_expand]: 2.48e-06 [shard_inline]: 7.87003e-06 [merge_send_recv]: 9.51998e-06 [auto_parallel]: 1.061e-05 [parallel]: 9.64999e-06 [flash_sp]: 4.87e-06 [merge_comm]: 4.42998e-06 [allreduce_fusion]: 4.84e-06 [matmul_add_comm_reduction]: 1.048e-05 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 1.082e-05 [virtual_dataset]: 7.97e-06 [get_grad_eliminate_]: 7.95e-06 [virtual_output]: 7.68999e-06 [merge_forward]: 5.57001e-06 [cell_reuse_recompute_pass]: 3.2e-06 [offload_activation]: 1.163e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.007e-05 [merge_recompute_call_nodes]: 1.71e-06 [before_grad]: 1.311e-05 [set_forward_comm_id_for_comm_node_pass]: 5.12999e-06 [meta_fg_expand]: 3.73001e-06 [flash_sp_send_recv_attached]: 1.47999e-06 [receive_attached]: 2.86e-06 [after_resolve]: 1.339e-05 [a_after_grad]: 1.274e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.78e-06 [auto_monad_grad]: 1.82999e-06 [auto_monad_eliminator]: 1.219e-05 [cse]: 2.356e-05 [a_3]: 5.937e-05 [py_interpret_to_execute_after_opt_a]: 1.84e-05 [slice_cell_reuse_recomputed_activation]: 4.60001e-06 [rewriter_after_opt_a]: 4.886e-05 [convert_after_rewriter]: 1.163e-05 [order_py_execute_after_rewriter]: 8.47998e-06 [mutable_eliminate]: 0.00077085 [opt_b]: 0.00033836, [1] [Cycle 1]: 0.00032807, [7] [b_1]: 0.00021415 [b_2]: 9.99001e-06 [updatestate_depend_eliminate]: 8.12e-06 [updatestate_assign_eliminate]: 3.33e-06 [updatestate_loads_eliminate]: 3.48999e-06 [renormalize]: 6.00005e-07 [cse]: 3.004e-05 [optimize_parallel_all_gather_comm]: 2.26e-05 [overlap_param_gather]: 5.05001e-06 [cconv]: 3.558e-05 [loop_unroll]: 0.00047839 [opt_after_cconv]: 0.00014544, [1] [Cycle 1]: 0.00013637, [7] [c_1]: 3.767e-05 [parameter_eliminate]: 3.14999e-06 [updatestate_depend_eliminate]: 6.78998e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 3.21999e-06 [cse]: 2.567e-05 [renormalize]: 8.50006e-07 [remove_dup_value]: 1.864e-05 [tuple_transform]: 0.00010282, [1] [Cycle 1]: 9.495e-05, [4] [d_1]: 5.4e-05 [none_parameter_eliminate]: 1.96e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 8.33001e-06 [partial_unused_args_eliminate]: 4.63001e-06 [add_recomputation]: 6.035e-05 [cse_after_recomputation]: 3.238e-05, [1] [Cycle 1]: 2.524e-05, [1] [cse]: 1.556e-05 [environ_conv]: 9.86998e-06 [swap_dp_allreduce_reducescatter]: 8.79e-06 [bias_add_comm_swap]: 5.29998e-06 [label_micro_interleaved_index]: 7.83001e-06 [label_fine_grained_interleaved_index]: 5.47001e-06 [merge_cast_opt]: 3.71999e-06 [slice_recompute_activation]: 4.45e-06 [micro_interleaved_order_control]: 4.67e-06 [assign_add_opt]: 3.75998e-06 [ForceFp32Comm]: 3.73001e-06 [remove_cast_before_assign_add]: 3.5e-06 [full_micro_interleaved_order_control]: 5.00001e-06 [reorder_send_recv_between_fp_bp]: 5.20999e-06 [comm_op_add_attrs]: 4.21001e-06 [add_comm_op_reuse_tag]: 3.45003e-06 [interleave_split_concat_branches]: 3.78999e-06 [interleave_parallel_branches]: 3.31001e-06 [overlap_opt_shard_in_pipeline]: 3.34001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.28001e-06 [control_data_broadcast_order]: 1.845e-05 [grouped_pairwise_exchange_alltoall]: 4.27e-06 [offloading_packed_experts]: 6.70002e-06 [overlap_recompute_and_grad_model_parallel]: 7.38e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.6e-06 [overlap_recompute_allgather_and_fa_grad]: 3.66999e-06 [overlap_recompute_comm]: 4.94998e-06 [overlap_grad_ring_attention]: 6.94999e-06 [overlap_grad_flash_sp]: 2.852e-05 [begin_end_overlap_inline]: 3.00998e-06 [split_matmul_comm_elemetwise]: 4.99e-06 [split_layernorm_comm]: 4.52e-06 [handle_group_info]: 3.43e-06 [symbol_engine_optimizer]: 0.00010636, [1] [Cycle 1]: 9.928e-05, [6] [build]: 3.5e-06 [elim_shapecalc]: 1.245e-05 [elim_not_effective]: 1.642e-05 [opt_reshape]: 8.28999e-06 [fold_const_symbol]: 1.237e-05 [renormalize]: 2.09984e-07 [detach_backward]: 4.24002e-06 [pipeline_parallel_scheduler]: 1.77999e-06 [auto_monad_reorder]: 2.246e-05 [get_jit_bprop_graph]: 1.84998e-06 [rewriter_after_jit_bprop_graph]: 4.2e-06 [opt_after_jit_grad]: 0.0005183 [validate]: 4.293e-05 Sums bootstrap : 0.000507s : 1.68% type_inference : 0.006158s : 20.38% event_method : 0.000021s : 0.07% auto_monad : 0.000071s : 0.23% graph_reusing : 0.000007s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000038s : 0.13% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000032s : 0.11% optimize.rewriter_before_opt_a : 0.000095s : 0.32% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000056s : 0.18% optimize.opt_a.loop_unroll : 0.000040s : 0.13% optimize.opt_a.a_1 : 0.000976s : 3.23% optimize.opt_a.with_stream_mark : 0.000040s : 0.13% optimize.opt_a.recompute_prepare : 0.000023s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000256s : 0.85% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.06% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000017s : 0.06% optimize.opt_a.merge_send_recv : 0.000019s : 0.06% optimize.opt_a.auto_parallel : 0.000019s : 0.06% optimize.opt_a.parallel : 0.000028s : 0.09% optimize.opt_a.flash_sp : 0.000015s : 0.05% optimize.opt_a.merge_comm : 0.000010s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.07% optimize.opt_a.virtual_dataset : 0.000016s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.05% optimize.opt_a.virtual_output : 0.000016s : 0.05% optimize.opt_a.merge_forward : 0.000011s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000023s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000041s : 0.13% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000027s : 0.09% optimize.opt_a.a_after_grad : 0.000025s : 0.08% optimize.opt_a.renormalize : 0.018367s : 60.79% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.11% optimize.opt_a.cse : 0.000063s : 0.21% optimize.opt_a.a_3 : 0.000147s : 0.49% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000049s : 0.16% optimize.convert_after_rewriter : 0.000012s : 0.04% optimize.order_py_execute_after_rewriter : 0.000008s : 0.03% optimize.mutable_eliminate : 0.000771s : 2.55% optimize.opt_b.b_1 : 0.000214s : 0.71% optimize.opt_b.b_2 : 0.000010s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.07% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000036s : 0.12% optimize.loop_unroll : 0.000478s : 1.58% optimize.opt_after_cconv.c_1 : 0.000038s : 0.12% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000026s : 0.08% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.06% optimize.tuple_transform.d_1 : 0.000054s : 0.18% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000060s : 0.20% optimize.cse_after_recomputation.cse : 0.000016s : 0.05% optimize.environ_conv : 0.000010s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000005s : 0.02% optimize.label_micro_interleaved_index : 0.000008s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000004s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000018s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000029s : 0.09% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000005s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000518s : 1.72% validate : 0.000043s : 0.14% Time group info: ------[substitution.] 0.000236 39 11.41% : 0.000027s : 3: substitution.cast_eliminate 0.95% : 0.000002s : 3: substitution.elim_not_effective 0.74% : 0.000002s : 3: substitution.fold_const_symbol 2.79% : 0.000007s : 5: substitution.graph_param_transform 66.07% : 0.000156s : 4: substitution.inline 2.17% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.86% : 0.000007s : 6: substitution.remove_not_recompute_node 2.68% : 0.000006s : 4: substitution.replace_old_param 7.09% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator 3.24% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006098 2 87.87% : 0.005358s : 1: type_inference.infer 12.13% : 0.000740s : 1: type_inference.specialize ------[replace.] 0.000068 8 57.77% : 0.000040s : 4: replace.inline 42.23% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000167 8 91.32% : 0.000153s : 4: match.inline 8.68% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000274 1596 0.91% : 0.000002s : 17: predicate.accumulaten_eliminater 0.67% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000002s : 10: predicate.addn_check_dump 0.91% : 0.000002s : 17: predicate.addn_zero_filter 0.82% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.30% : 0.000006s : 27: predicate.arithmetic_simplify 1.17% : 0.000003s : 17: predicate.cast_eliminate 0.65% : 0.000002s : 10: predicate.check_bprop_eliminate 0.56% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.60% : 0.000002s : 10: predicate.depend_value_elim 1.03% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.11% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 5: predicate.elim_not_effective 0.37% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.27% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.17% : 0.000003s : 22: predicate.environ_get_depend_swap 1.77% : 0.000005s : 32: predicate.environ_get_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.42% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.31% : 0.000006s : 25: predicate.float_depend_g_call 0.54% : 0.000001s : 10: predicate.float_environ_get_switch 0.84% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 5: predicate.fold_const_symbol 0.60% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.60% : 0.000002s : 10: predicate.incorporate_call 0.47% : 0.000001s : 10: predicate.incorporate_call_switch 5.86% : 0.000016s : 72: predicate.inline 0.80% : 0.000002s : 10: predicate.inline_without_move 0.43% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 10: predicate.less_batch_normalization 1.93% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.57% : 0.000007s : 48: predicate.load_eliminater 0.79% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.02% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.73% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.55% : 0.000002s : 10: predicate.merge_addn 0.53% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 17: predicate.minmaximum_grad 0.96% : 0.000003s : 5: predicate.mutable_eliminate 0.38% : 0.000001s : 5: predicate.opt_reshape 0.35% : 0.000001s : 5: predicate.parallel_virtual_node 1.70% : 0.000005s : 25: predicate.partial_defer_inline 1.63% : 0.000004s : 26: predicate.partial_eliminate 0.96% : 0.000003s : 17: predicate.print_const_string_wrapper 0.56% : 0.000002s : 10: predicate.reduce_all_const_elim 1.25% : 0.000003s : 17: predicate.reduce_eliminate 2.63% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 10: predicate.remove_not_recompute_node 1.34% : 0.000004s : 31: predicate.replace_applicator 0.50% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 0.99% : 0.000003s : 17: predicate.reshape_eliminate 0.64% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.50% : 0.000001s : 5: predicate.row_tensor_eliminate 0.82% : 0.000002s : 10: predicate.same_eliminate 0.40% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 10: predicate.shard_identity_eliminate 0.62% : 0.000002s : 10: predicate.special_op_eliminate 0.76% : 0.000002s : 10: predicate.specialize_transform 1.01% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.48% : 0.000004s : 25: predicate.switch_defer_inline 2.13% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.68% : 0.000013s : 76: predicate.switch_simplify 0.87% : 0.000002s : 17: predicate.tile_eliminate 1.09% : 0.000003s : 17: predicate.transpose_eliminate 1.54% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.82% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.60% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.19% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.52% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.29% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.67% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.59% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.20% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.49% : 0.000001s : 5: predicate.value_based_eliminate 0.87% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.78% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000614 11 50.89% : 0.000312s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.11% : 0.000301s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.082798 192 0.01% : 0.000006s : 1: ForceFp32Comm 3.96% : 0.003282s : 1: add_attr 3.94% : 0.003264s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.08% : 0.000064s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.10% : 0.000080s : 1: auto_monad 0.04% : 0.000030s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.67% : 0.000552s : 1: bootstrap 0.05% : 0.000039s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.03% : 0.000021s : 1: control_data_broadcast_order 0.02% : 0.000015s : 1: convert_after_rewriter 0.04% : 0.000035s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000020s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.04% : 0.000030s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 0.58% : 0.000484s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 0.94% : 0.000779s : 1: mutable_eliminate 0.01% : 0.000009s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000023s : 1: opt.transform.mutable_eliminate 1.85% : 0.001533s : 78: opt.transform.opt_a 0.04% : 0.000036s : 1: opt.transform.opt_after_cconv 0.04% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.18% : 0.000149s : 28: opt.transform.opt_b 0.07% : 0.000060s : 2: opt.transform.opt_trans_graph 0.06% : 0.000046s : 4: opt.transform.symbol_engine_opt 25.74% : 0.021310s : 1: opt_a 0.18% : 0.000149s : 1: opt_after_cconv 0.64% : 0.000528s : 1: opt_after_jit_grad 0.41% : 0.000342s : 1: opt_b 29.50% : 0.024422s : 1: optimize 0.03% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.04% : 0.000032s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.06% : 0.000046s : 1: pre_auto_parallel 0.04% : 0.000036s : 1: py_interpret_to_execute 0.03% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000022s : 1: remove_dup_value 21.64% : 0.017915s : 1: renormalize.infer 0.53% : 0.000437s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000053s : 1: rewriter_after_opt_a 0.12% : 0.000099s : 1: rewriter_before_opt_a 0.01% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.000109s : 1: symbol_engine_optimizer 0.13% : 0.000106s : 1: tuple_transform 7.49% : 0.006202s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:36.589.32 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0292173, [21] [bootstrap]: 0.00048637 [type_inference]: 0.00637998 [event_method]: 2.097e-05 [auto_monad]: 6.677e-05 [graph_reusing]: 6.44001e-06 [inline]: 2.94001e-06 [add_attr]: 0.003296, [1] [add_attr_with_inline]: 0.00328603, [1] [Cycle 1]: 6.365e-05, [2] [tag_attr]: 2.085e-05 [meta_addattr_fg_expand]: 6.36e-06 [parallel-infer-symbol]: 3.59002e-06 [pre_auto_parallel]: 3.67e-05 [insert-virtual-dataset]: 2.44001e-06 [parallel-infer-symbol-second]: 8.50006e-07 [dataset_repeat_opt]: 0.00284846 [pipeline_split]: 2.91e-06 [optimize]: 0.015275, [53] [py_interpret_to_execute]: 0.00918374 [rewriter_before_opt_a]: 0.00012632 [opt_a]: 0.00350189, [2] [Cycle 1]: 0.00266857, [45] [expand_dump_flag]: 6.49999e-06 [switch_simplify]: 5.057e-05 [loop_unroll]: 3.166e-05 [a_1]: 0.00082401 [with_stream_mark]: 2.88e-05 [recompute_prepare]: 1.5e-05 [updatestate_depend_eliminate]: 5.64e-06 [updatestate_assign_eliminate]: 3.83001e-06 [updatestate_loads_eliminate]: 3.71001e-06 [parameter_eliminate]: 2.40002e-06 [a_2]: 0.00010351 [accelerated_algorithm]: 9.17001e-06 [shard]: 2.06e-06 [meta_shard_fg_expand]: 2.71e-06 [shard_inline]: 7.53e-06 [merge_send_recv]: 1.01e-05 [auto_parallel]: 1.153e-05 [parallel]: 2.049e-05 [flash_sp]: 1.152e-05 [merge_comm]: 5.23002e-06 [allreduce_fusion]: 4.18999e-06 [matmul_add_comm_reduction]: 1.124e-05 [allreduce_slice_to_reducescatter]: 8.59989e-07 [virtual_shard_identity]: 1.07e-05 [virtual_dataset]: 8.15999e-06 [get_grad_eliminate_]: 9.00999e-06 [virtual_output]: 8.18999e-06 [merge_forward]: 5.11997e-06 [cell_reuse_recompute_pass]: 1.50999e-06 [offload_activation]: 1.247e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.954e-05 [merge_recompute_call_nodes]: 1.79998e-06 [before_grad]: 1.384e-05 [set_forward_comm_id_for_comm_node_pass]: 5.76998e-06 [meta_fg_expand]: 3.59002e-06 [flash_sp_send_recv_attached]: 3.14999e-06 [receive_attached]: 1.142e-05 [after_resolve]: 1.583e-05 [a_after_grad]: 1.255e-05 [renormalize]: 0.00093587 [add_forward_monad_depend]: 7.53999e-06 [auto_monad_grad]: 2.52001e-06 [auto_monad_eliminator]: 2.049e-05 [cse]: 3.969e-05 [a_3]: 6.604e-05 [Cycle 2]: 0.00081709, [45] [expand_dump_flag]: 1.58002e-06 [switch_simplify]: 9.23002e-06 [loop_unroll]: 7.45e-06 [a_1]: 0.00018389 [with_stream_mark]: 1.553e-05 [recompute_prepare]: 8.80999e-06 [updatestate_depend_eliminate]: 4.13999e-06 [updatestate_assign_eliminate]: 2.98e-06 [updatestate_loads_eliminate]: 2.93998e-06 [parameter_eliminate]: 2.06e-06 [a_2]: 9.258e-05 [accelerated_algorithm]: 8.87e-06 [shard]: 1.27e-06 [meta_shard_fg_expand]: 2.34001e-06 [shard_inline]: 7.27997e-06 [merge_send_recv]: 9.16998e-06 [auto_parallel]: 7.85e-06 [parallel]: 6.40002e-06 [flash_sp]: 3.73001e-06 [merge_comm]: 4.62e-06 [allreduce_fusion]: 4.42998e-06 [matmul_add_comm_reduction]: 7.18998e-06 [allreduce_slice_to_reducescatter]: 5.79981e-07 [virtual_shard_identity]: 1.024e-05 [virtual_dataset]: 7.6e-06 [get_grad_eliminate_]: 7.24001e-06 [virtual_output]: 7.43e-06 [merge_forward]: 4.25e-06 [cell_reuse_recompute_pass]: 1.99e-06 [offload_activation]: 1.013e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.629e-05 [merge_recompute_call_nodes]: 1.19e-06 [before_grad]: 1.25e-05 [set_forward_comm_id_for_comm_node_pass]: 4.15e-06 [meta_fg_expand]: 2.88e-06 [flash_sp_send_recv_attached]: 1.33002e-06 [receive_attached]: 1.34998e-06 [after_resolve]: 1.337e-05 [a_after_grad]: 1.215e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.53e-06 [auto_monad_grad]: 1.48002e-06 [auto_monad_eliminator]: 1.175e-05 [cse]: 2.316e-05 [a_3]: 4.774e-05 [py_interpret_to_execute_after_opt_a]: 1.549e-05 [slice_cell_reuse_recomputed_activation]: 2.26e-06 [rewriter_after_opt_a]: 4.825e-05 [convert_after_rewriter]: 8.77e-06 [order_py_execute_after_rewriter]: 5.80002e-06 [mutable_eliminate]: 0.00075445 [opt_b]: 0.00027793, [1] [Cycle 1]: 0.0002695, [7] [b_1]: 0.00017169 [b_2]: 1.05e-05 [updatestate_depend_eliminate]: 9.16002e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 3.34001e-06 [renormalize]: 6.69999e-07 [cse]: 3.293e-05 [optimize_parallel_all_gather_comm]: 2.156e-05 [overlap_param_gather]: 2.04999e-06 [cconv]: 3.216e-05 [loop_unroll]: 0.00048413 [opt_after_cconv]: 0.00012658, [1] [Cycle 1]: 0.00011963, [7] [c_1]: 3.792e-05 [parameter_eliminate]: 5.14998e-06 [updatestate_depend_eliminate]: 7.58999e-06 [updatestate_assign_eliminate]: 3.3e-06 [updatestate_loads_eliminate]: 3.18e-06 [cse]: 2.622e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 1.537e-05 [tuple_transform]: 8.754e-05, [1] [Cycle 1]: 8.281e-05, [4] [d_1]: 5.427e-05 [none_parameter_eliminate]: 1.67999e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 7.96001e-06 [partial_unused_args_eliminate]: 1.89999e-06 [add_recomputation]: 7.663e-05 [cse_after_recomputation]: 2.82e-05, [1] [Cycle 1]: 2.304e-05, [1] [cse]: 1.703e-05 [environ_conv]: 7.21999e-06 [swap_dp_allreduce_reducescatter]: 6.16998e-06 [bias_add_comm_swap]: 3.39001e-06 [label_micro_interleaved_index]: 5.05001e-06 [label_fine_grained_interleaved_index]: 2.53998e-06 [merge_cast_opt]: 1.32999e-06 [slice_recompute_activation]: 2.32999e-06 [micro_interleaved_order_control]: 2.56e-06 [assign_add_opt]: 1.34e-06 [ForceFp32Comm]: 1.32999e-06 [remove_cast_before_assign_add]: 1.01002e-06 [full_micro_interleaved_order_control]: 2.11e-06 [reorder_send_recv_between_fp_bp]: 2.91e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.10999e-06 [interleave_parallel_branches]: 1.05999e-06 [overlap_opt_shard_in_pipeline]: 1.33002e-06 [overlap_opt_shard_grad_in_pipeline]: 2.12999e-06 [control_data_broadcast_order]: 1.56e-05 [grouped_pairwise_exchange_alltoall]: 1.90001e-06 [offloading_packed_experts]: 4.85999e-06 [overlap_recompute_and_grad_model_parallel]: 5.80002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.51002e-06 [overlap_recompute_comm]: 2.52001e-06 [overlap_grad_ring_attention]: 4.68001e-06 [overlap_grad_flash_sp]: 2.386e-05 [begin_end_overlap_inline]: 4.69998e-07 [split_matmul_comm_elemetwise]: 2.38002e-06 [split_layernorm_comm]: 1.61998e-06 [handle_group_info]: 1.02e-06 [symbol_engine_optimizer]: 9.01e-05, [1] [Cycle 1]: 8.517e-05, [6] [build]: 3.42002e-06 [elim_shapecalc]: 1.404e-05 [elim_not_effective]: 1.581e-05 [opt_reshape]: 8.43999e-06 [fold_const_symbol]: 1.21e-05 [renormalize]: 3.39991e-07 [detach_backward]: 2.50002e-06 [pipeline_parallel_scheduler]: 1.44e-06 [auto_monad_reorder]: 2.131e-05 [get_jit_bprop_graph]: 1.86998e-06 [rewriter_after_jit_bprop_graph]: 6.29001e-06 [opt_after_jit_grad]: 0.00051009 [validate]: 5.132e-05 Sums bootstrap : 0.000486s : 1.96% type_inference : 0.006380s : 25.72% event_method : 0.000021s : 0.08% auto_monad : 0.000067s : 0.27% graph_reusing : 0.000006s : 0.03% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000037s : 0.15% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.002848s : 11.48% pipeline_split : 0.000003s : 0.01% optimize.py_interpret_to_execute : 0.009184s : 37.02% optimize.rewriter_before_opt_a : 0.000126s : 0.51% optimize.opt_a.expand_dump_flag : 0.000008s : 0.03% optimize.opt_a.switch_simplify : 0.000060s : 0.24% optimize.opt_a.loop_unroll : 0.000039s : 0.16% optimize.opt_a.a_1 : 0.001008s : 4.06% optimize.opt_a.with_stream_mark : 0.000044s : 0.18% optimize.opt_a.recompute_prepare : 0.000024s : 0.10% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000196s : 0.79% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.07% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000015s : 0.06% optimize.opt_a.merge_send_recv : 0.000019s : 0.08% optimize.opt_a.auto_parallel : 0.000019s : 0.08% optimize.opt_a.parallel : 0.000027s : 0.11% optimize.opt_a.flash_sp : 0.000015s : 0.06% optimize.opt_a.merge_comm : 0.000010s : 0.04% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.08% optimize.opt_a.virtual_dataset : 0.000016s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.07% optimize.opt_a.virtual_output : 0.000016s : 0.06% optimize.opt_a.merge_forward : 0.000009s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.14% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.11% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.04% optimize.opt_a.meta_fg_expand : 0.000006s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000013s : 0.05% optimize.opt_a.after_resolve : 0.000029s : 0.12% optimize.opt_a.a_after_grad : 0.000025s : 0.10% optimize.opt_a.renormalize : 0.000936s : 3.77% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.13% optimize.opt_a.cse : 0.000063s : 0.25% optimize.opt_a.a_3 : 0.000114s : 0.46% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000048s : 0.19% optimize.convert_after_rewriter : 0.000009s : 0.04% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000754s : 3.04% optimize.opt_b.b_1 : 0.000172s : 0.69% optimize.opt_b.b_2 : 0.000011s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000033s : 0.13% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.09% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000032s : 0.13% optimize.loop_unroll : 0.000484s : 1.95% optimize.opt_after_cconv.c_1 : 0.000038s : 0.15% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000026s : 0.11% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.06% optimize.tuple_transform.d_1 : 0.000054s : 0.22% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000077s : 0.31% optimize.cse_after_recomputation.cse : 0.000017s : 0.07% optimize.environ_conv : 0.000007s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000024s : 0.10% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000021s : 0.09% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.03% opt_after_jit_grad : 0.000510s : 2.06% validate : 0.000051s : 0.21% Time group info: ------[substitution.] 0.000273 39 10.48% : 0.000029s : 3: substitution.cast_eliminate 0.78% : 0.000002s : 3: substitution.elim_not_effective 0.71% : 0.000002s : 3: substitution.fold_const_symbol 2.82% : 0.000008s : 5: substitution.graph_param_transform 69.14% : 0.000189s : 4: substitution.inline 1.66% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.63% : 0.000007s : 6: substitution.remove_not_recompute_node 2.22% : 0.000006s : 4: substitution.replace_old_param 6.36% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator 3.19% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006314 2 88.60% : 0.005595s : 1: type_inference.infer 11.40% : 0.000720s : 1: type_inference.specialize ------[replace.] 0.000072 8 58.96% : 0.000042s : 4: replace.inline 41.04% : 0.000030s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000202 8 92.40% : 0.000186s : 4: match.inline 7.60% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000266 1596 0.98% : 0.000003s : 17: predicate.accumulaten_eliminater 0.70% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 10: predicate.addn_check_dump 0.91% : 0.000002s : 17: predicate.addn_zero_filter 0.84% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.03% : 0.000005s : 27: predicate.arithmetic_simplify 1.27% : 0.000003s : 17: predicate.cast_eliminate 0.55% : 0.000001s : 10: predicate.check_bprop_eliminate 0.58% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.73% : 0.000002s : 10: predicate.depend_value_elim 0.97% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.08% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.99% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.95% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 5: predicate.elim_not_effective 0.46% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.07% : 0.000003s : 22: predicate.environ_get_depend_swap 1.74% : 0.000005s : 32: predicate.environ_get_eliminate 1.11% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.37% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.31% : 0.000006s : 25: predicate.float_depend_g_call 0.53% : 0.000001s : 10: predicate.float_environ_get_switch 0.73% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.64% : 0.000002s : 10: predicate.get_grad_eliminate 0.18% : 0.000000s : 5: predicate.graph_param_transform 0.61% : 0.000002s : 10: predicate.incorporate_call 0.52% : 0.000001s : 10: predicate.incorporate_call_switch 6.09% : 0.000016s : 72: predicate.inline 0.82% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.95% : 0.000003s : 10: predicate.less_batch_normalization 1.77% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.54% : 0.000007s : 48: predicate.load_eliminater 0.94% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.97% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.67% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 10: predicate.merge_addn 0.54% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.86% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.91% : 0.000002s : 17: predicate.minmaximum_grad 1.20% : 0.000003s : 5: predicate.mutable_eliminate 0.38% : 0.000001s : 5: predicate.opt_reshape 0.39% : 0.000001s : 5: predicate.parallel_virtual_node 1.59% : 0.000004s : 25: predicate.partial_defer_inline 1.63% : 0.000004s : 26: predicate.partial_eliminate 0.94% : 0.000003s : 17: predicate.print_const_string_wrapper 0.76% : 0.000002s : 10: predicate.reduce_all_const_elim 1.27% : 0.000003s : 17: predicate.reduce_eliminate 2.52% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 10: predicate.remove_not_recompute_node 1.36% : 0.000004s : 31: predicate.replace_applicator 0.47% : 0.000001s : 10: predicate.replace_old_param 0.28% : 0.000001s : 5: predicate.reset_defer_inline 1.20% : 0.000003s : 17: predicate.reshape_eliminate 0.64% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 5: predicate.row_tensor_eliminate 0.95% : 0.000003s : 10: predicate.same_eliminate 0.40% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 10: predicate.shard_identity_eliminate 0.65% : 0.000002s : 10: predicate.special_op_eliminate 0.68% : 0.000002s : 10: predicate.specialize_transform 0.95% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.48% : 0.000004s : 25: predicate.switch_defer_inline 2.01% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.74% : 0.000013s : 76: predicate.switch_simplify 0.91% : 0.000002s : 17: predicate.tile_eliminate 0.91% : 0.000002s : 17: predicate.transpose_eliminate 1.50% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.57% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.58% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.74% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.52% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.10% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.50% : 0.000001s : 5: predicate.value_based_eliminate 0.61% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.61% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.33% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000582 11 54.28% : 0.000316s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.72% : 0.000266s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.050484 192 0.01% : 0.000004s : 1: ForceFp32Comm 6.54% : 0.003301s : 1: add_attr 6.52% : 0.003290s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.16% : 0.000082s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.14% : 0.000072s : 1: auto_monad 0.05% : 0.000026s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 1.02% : 0.000516s : 1: bootstrap 0.07% : 0.000036s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000019s : 1: control_data_broadcast_order 0.02% : 0.000012s : 1: convert_after_rewriter 0.06% : 0.000031s : 1: cse_after_recomputation 5.70% : 0.002876s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000010s : 1: environ_conv 0.06% : 0.000028s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 0.98% : 0.000493s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.52% : 0.000765s : 1: mutable_eliminate 0.02% : 0.000008s : 1: offloading_packed_experts 0.04% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000023s : 1: opt.transform.mutable_eliminate 3.08% : 0.001555s : 78: opt.transform.opt_a 0.07% : 0.000036s : 1: opt.transform.opt_after_cconv 0.06% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.29% : 0.000149s : 28: opt.transform.opt_b 0.12% : 0.000060s : 2: opt.transform.opt_trans_graph 0.09% : 0.000046s : 4: opt.transform.symbol_engine_opt 6.94% : 0.003506s : 1: opt_a 0.26% : 0.000130s : 1: opt_after_cconv 1.03% : 0.000521s : 1: opt_after_jit_grad 0.56% : 0.000282s : 1: opt_b 30.27% : 0.015280s : 1: optimize 0.05% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.05% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000008s : 1: pipeline_split 0.08% : 0.000041s : 1: pre_auto_parallel 18.25% : 0.009215s : 1: py_interpret_to_execute 0.04% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000019s : 1: remove_dup_value 1.07% : 0.000542s : 1: renormalize.infer 0.76% : 0.000383s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000053s : 1: rewriter_after_opt_a 0.27% : 0.000135s : 1: rewriter_before_opt_a 0.01% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.18% : 0.000093s : 1: symbol_engine_optimizer 0.18% : 0.000090s : 1: tuple_transform 12.68% : 0.006401s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:36.494.249 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:36.494.535 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0317027, [21] [bootstrap]: 0.00046302 [type_inference]: 0.0071517 [event_method]: 2.037e-05 [auto_monad]: 6.86e-05 [graph_reusing]: 5.89999e-06 [inline]: 3.02002e-06 [add_attr]: 0.0159341, [1] [add_attr_with_inline]: 0.0159201, [1] [Cycle 1]: 9.513e-05, [2] [tag_attr]: 2.562e-05 [meta_addattr_fg_expand]: 6.52001e-06 [parallel-infer-symbol]: 4.42e-06 [pre_auto_parallel]: 4.28e-05 [insert-virtual-dataset]: 2.85998e-06 [parallel-infer-symbol-second]: 7.99977e-07 [dataset_repeat_opt]: 2.05002e-06 [pipeline_split]: 1.57999e-06 [optimize]: 0.00668732, [53] [py_interpret_to_execute]: 3.869e-05 [rewriter_before_opt_a]: 0.00010354 [opt_a]: 0.00390533, [2] [Cycle 1]: 0.00273116, [45] [expand_dump_flag]: 3.14001e-06 [switch_simplify]: 4.599e-05 [loop_unroll]: 3.298e-05 [a_1]: 0.00074515 [with_stream_mark]: 2.203e-05 [recompute_prepare]: 1.7e-05 [updatestate_depend_eliminate]: 5.92001e-06 [updatestate_assign_eliminate]: 4.89998e-06 [updatestate_loads_eliminate]: 4.36002e-06 [parameter_eliminate]: 2.07999e-06 [a_2]: 0.00015218 [accelerated_algorithm]: 1.07e-05 [shard]: 2.41e-06 [meta_shard_fg_expand]: 2.50002e-06 [shard_inline]: 9.50001e-06 [merge_send_recv]: 1.052e-05 [auto_parallel]: 1.145e-05 [parallel]: 1.954e-05 [flash_sp]: 9.71e-06 [merge_comm]: 6.28e-06 [allreduce_fusion]: 5.71e-06 [matmul_add_comm_reduction]: 1.249e-05 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 1.405e-05 [virtual_dataset]: 9.30001e-06 [get_grad_eliminate_]: 8.99e-06 [virtual_output]: 9.59e-06 [merge_forward]: 5.63002e-06 [cell_reuse_recompute_pass]: 1.39998e-06 [offload_activation]: 1.148e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.381e-05 [merge_recompute_call_nodes]: 1.57999e-06 [before_grad]: 1.621e-05 [set_forward_comm_id_for_comm_node_pass]: 6.23e-06 [meta_fg_expand]: 4.53999e-06 [flash_sp_send_recv_attached]: 3.16999e-06 [receive_attached]: 1.93002e-06 [after_resolve]: 1.612e-05 [a_after_grad]: 1.514e-05 [renormalize]: 0.00084134 [add_forward_monad_depend]: 7.41999e-06 [auto_monad_grad]: 2.73e-06 [auto_monad_eliminator]: 2.319e-05 [cse]: 4.656e-05 [a_3]: 8.773e-05 [Cycle 2]: 0.00115811, [45] [expand_dump_flag]: 2.12999e-06 [switch_simplify]: 1.139e-05 [loop_unroll]: 8.52e-06 [a_1]: 0.00021836 [with_stream_mark]: 1.811e-05 [recompute_prepare]: 1.009e-05 [updatestate_depend_eliminate]: 5.30999e-06 [updatestate_assign_eliminate]: 4.23001e-06 [updatestate_loads_eliminate]: 4.3e-06 [parameter_eliminate]: 1.71998e-06 [a_2]: 0.00017694 [accelerated_algorithm]: 1.082e-05 [shard]: 1.79e-06 [meta_shard_fg_expand]: 2.56e-06 [shard_inline]: 9.31e-06 [merge_send_recv]: 1.09e-05 [auto_parallel]: 9.94001e-06 [parallel]: 7.73999e-06 [flash_sp]: 4.05e-06 [merge_comm]: 5.27999e-06 [allreduce_fusion]: 7.13998e-06 [matmul_add_comm_reduction]: 1.079e-05 [allreduce_slice_to_reducescatter]: 4.49974e-07 [virtual_shard_identity]: 1.148e-05 [virtual_dataset]: 8.94e-06 [get_grad_eliminate_]: 8.40001e-06 [virtual_output]: 8.48001e-06 [merge_forward]: 5.35999e-06 [cell_reuse_recompute_pass]: 2.30002e-06 [offload_activation]: 1.144e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.15e-05 [merge_recompute_call_nodes]: 1.07e-06 [before_grad]: 1.46e-05 [set_forward_comm_id_for_comm_node_pass]: 5.61e-06 [meta_fg_expand]: 4.15e-06 [flash_sp_send_recv_attached]: 1.05999e-06 [receive_attached]: 1.39e-06 [after_resolve]: 1.427e-05 [a_after_grad]: 1.391e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.69001e-06 [auto_monad_grad]: 1.79e-06 [auto_monad_eliminator]: 1.539e-05 [cse]: 3.134e-05 [a_3]: 7.019e-05 [py_interpret_to_execute_after_opt_a]: 1.881e-05 [slice_cell_reuse_recomputed_activation]: 4.99998e-06 [rewriter_after_opt_a]: 5.837e-05 [convert_after_rewriter]: 1.187e-05 [order_py_execute_after_rewriter]: 9.64e-06 [mutable_eliminate]: 0.00061781 [opt_b]: 0.00037166, [1] [Cycle 1]: 0.00036138, [7] [b_1]: 0.00024474 [b_2]: 1.066e-05 [updatestate_depend_eliminate]: 8.37e-06 [updatestate_assign_eliminate]: 3.56999e-06 [updatestate_loads_eliminate]: 3.85998e-06 [renormalize]: 4.39992e-07 [cse]: 3.124e-05 [optimize_parallel_all_gather_comm]: 2.385e-05 [overlap_param_gather]: 4.74e-06 [cconv]: 3.103e-05 [loop_unroll]: 0.00048601 [opt_after_cconv]: 0.00016019, [1] [Cycle 1]: 0.00015078, [7] [c_1]: 4.381e-05 [parameter_eliminate]: 3.93001e-06 [updatestate_depend_eliminate]: 7.6e-06 [updatestate_assign_eliminate]: 3.81001e-06 [updatestate_loads_eliminate]: 4.16001e-06 [cse]: 2.908e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 4.824e-05 [tuple_transform]: 0.00011547, [1] [Cycle 1]: 0.00010803, [4] [d_1]: 6.397e-05 [none_parameter_eliminate]: 1.86e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 1.009e-05 [partial_unused_args_eliminate]: 4.73001e-06 [add_recomputation]: 6.821e-05 [cse_after_recomputation]: 3.595e-05, [1] [Cycle 1]: 2.829e-05, [1] [cse]: 1.842e-05 [environ_conv]: 1.13e-05 [swap_dp_allreduce_reducescatter]: 9.79999e-06 [bias_add_comm_swap]: 5.20001e-06 [label_micro_interleaved_index]: 7.63999e-06 [label_fine_grained_interleaved_index]: 5.47999e-06 [merge_cast_opt]: 4.48999e-06 [slice_recompute_activation]: 4.77e-06 [micro_interleaved_order_control]: 4.32e-06 [assign_add_opt]: 3.76001e-06 [ForceFp32Comm]: 3.61001e-06 [remove_cast_before_assign_add]: 3.57002e-06 [full_micro_interleaved_order_control]: 4.37e-06 [reorder_send_recv_between_fp_bp]: 5.17e-06 [comm_op_add_attrs]: 4.09002e-06 [add_comm_op_reuse_tag]: 3.43999e-06 [interleave_split_concat_branches]: 3.56999e-06 [interleave_parallel_branches]: 3.35e-06 [overlap_opt_shard_in_pipeline]: 4.27003e-06 [overlap_opt_shard_grad_in_pipeline]: 4.66002e-06 [control_data_broadcast_order]: 2.09e-05 [grouped_pairwise_exchange_alltoall]: 3.93999e-06 [offloading_packed_experts]: 8.08999e-06 [overlap_recompute_and_grad_model_parallel]: 8.84998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.61001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.65998e-06 [overlap_recompute_comm]: 4.85999e-06 [overlap_grad_ring_attention]: 7.5e-06 [overlap_grad_flash_sp]: 4.586e-05 [begin_end_overlap_inline]: 3.38e-06 [split_matmul_comm_elemetwise]: 4.71002e-06 [split_layernorm_comm]: 4.05e-06 [handle_group_info]: 3.47002e-06 [symbol_engine_optimizer]: 0.000125, [1] [Cycle 1]: 0.00011751, [6] [build]: 4.87e-06 [elim_shapecalc]: 1.63e-05 [elim_not_effective]: 1.981e-05 [opt_reshape]: 1.068e-05 [fold_const_symbol]: 1.529e-05 [renormalize]: 2.10013e-07 [detach_backward]: 4.33001e-06 [pipeline_parallel_scheduler]: 1.79e-06 [auto_monad_reorder]: 2.418e-05 [get_jit_bprop_graph]: 1.65001e-06 [rewriter_after_jit_bprop_graph]: 3.7e-06 [opt_after_jit_grad]: 0.00057892 [validate]: 5.116e-05 Sums bootstrap : 0.000463s : 3.34% type_inference : 0.007152s : 51.58% event_method : 0.000020s : 0.15% auto_monad : 0.000069s : 0.49% graph_reusing : 0.000006s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000043s : 0.31% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000039s : 0.28% optimize.rewriter_before_opt_a : 0.000104s : 0.75% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000057s : 0.41% optimize.opt_a.loop_unroll : 0.000041s : 0.30% optimize.opt_a.a_1 : 0.000964s : 6.95% optimize.opt_a.with_stream_mark : 0.000040s : 0.29% optimize.opt_a.recompute_prepare : 0.000027s : 0.20% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000329s : 2.37% optimize.opt_a.accelerated_algorithm : 0.000022s : 0.16% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000019s : 0.14% optimize.opt_a.merge_send_recv : 0.000021s : 0.15% optimize.opt_a.auto_parallel : 0.000021s : 0.15% optimize.opt_a.parallel : 0.000027s : 0.20% optimize.opt_a.flash_sp : 0.000014s : 0.10% optimize.opt_a.merge_comm : 0.000012s : 0.08% optimize.opt_a.allreduce_fusion : 0.000013s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000026s : 0.18% optimize.opt_a.virtual_dataset : 0.000018s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.13% optimize.opt_a.virtual_output : 0.000018s : 0.13% optimize.opt_a.merge_forward : 0.000011s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000023s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000045s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000031s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.09% optimize.opt_a.meta_fg_expand : 0.000009s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.02% optimize.opt_a.after_resolve : 0.000030s : 0.22% optimize.opt_a.a_after_grad : 0.000029s : 0.21% optimize.opt_a.renormalize : 0.000841s : 6.07% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000039s : 0.28% optimize.opt_a.cse : 0.000078s : 0.56% optimize.opt_a.a_3 : 0.000158s : 1.14% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000058s : 0.42% optimize.convert_after_rewriter : 0.000012s : 0.09% optimize.order_py_execute_after_rewriter : 0.000010s : 0.07% optimize.mutable_eliminate : 0.000618s : 4.46% optimize.opt_b.b_1 : 0.000245s : 1.77% optimize.opt_b.b_2 : 0.000011s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000031s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.03% optimize.cconv : 0.000031s : 0.22% optimize.loop_unroll : 0.000486s : 3.51% optimize.opt_after_cconv.c_1 : 0.000044s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000029s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000048s : 0.35% optimize.tuple_transform.d_1 : 0.000064s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.03% optimize.add_recomputation : 0.000068s : 0.49% optimize.cse_after_recomputation.cse : 0.000018s : 0.13% optimize.environ_conv : 0.000011s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000008s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.03% optimize.micro_interleaved_order_control : 0.000004s : 0.03% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.03% optimize.control_data_broadcast_order : 0.000021s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000046s : 0.33% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.03% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000005s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.17% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000579s : 4.18% validate : 0.000051s : 0.37% Time group info: ------[substitution.] 0.000250 49 14.04% : 0.000035s : 6: substitution.cast_eliminate 1.07% : 0.000003s : 4: substitution.elim_not_effective 0.82% : 0.000002s : 4: substitution.fold_const_symbol 2.89% : 0.000007s : 6: substitution.graph_param_transform 64.18% : 0.000160s : 4: substitution.inline 2.33% : 0.000006s : 8: substitution.j_node_and_user_rematch 3.17% : 0.000008s : 8: substitution.remove_not_recompute_node 2.38% : 0.000006s : 4: substitution.replace_old_param 5.61% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator 3.52% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.007064 2 88.55% : 0.006255s : 1: type_inference.infer 11.45% : 0.000809s : 1: type_inference.specialize ------[replace.] 0.000065 8 61.95% : 0.000040s : 4: replace.inline 38.05% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000170 8 92.84% : 0.000158s : 4: match.inline 7.16% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000308 1730 0.83% : 0.000003s : 17: predicate.accumulaten_eliminater 0.84% : 0.000003s : 6: predicate.ad_related_special_op_eliminate 0.54% : 0.000002s : 12: predicate.addn_check_dump 0.96% : 0.000003s : 17: predicate.addn_zero_filter 0.83% : 0.000003s : 17: predicate.adjust_all_reduce_mul_add 2.11% : 0.000006s : 29: predicate.arithmetic_simplify 0.95% : 0.000003s : 17: predicate.cast_eliminate 0.56% : 0.000002s : 12: predicate.check_bprop_eliminate 0.60% : 0.000002s : 12: predicate.compare_switch_simplify 0.18% : 0.000001s : 6: predicate.const_output_eliminate 0.60% : 0.000002s : 12: predicate.depend_value_elim 0.91% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.98% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.82% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.00% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 6: predicate.elim_not_effective 0.37% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.05% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.15% : 0.000004s : 23: predicate.environ_get_depend_swap 1.94% : 0.000006s : 35: predicate.environ_get_eliminate 1.01% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.21% : 0.000004s : 25: predicate.exchange_switch_depend_value 1.89% : 0.000006s : 25: predicate.float_depend_g_call 0.75% : 0.000002s : 12: predicate.float_environ_get_switch 0.85% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.64% : 0.000002s : 12: predicate.get_grad_eliminate 0.21% : 0.000001s : 6: predicate.graph_param_transform 0.60% : 0.000002s : 12: predicate.incorporate_call 7.69% : 0.000024s : 12: predicate.incorporate_call_switch 5.64% : 0.000017s : 78: predicate.inline 0.78% : 0.000002s : 12: predicate.inline_without_move 0.32% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.67% : 0.000002s : 12: predicate.less_batch_normalization 1.62% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.23% : 0.000007s : 50: predicate.load_eliminater 0.69% : 0.000002s : 6: predicate.loop_unroll_after_grad 1.82% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.47% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.60% : 0.000002s : 12: predicate.merge_addn 0.55% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.60% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.74% : 0.000002s : 17: predicate.minmaximum_grad 0.75% : 0.000002s : 6: predicate.mutable_eliminate 0.40% : 0.000001s : 6: predicate.opt_reshape 0.36% : 0.000001s : 6: predicate.parallel_virtual_node 1.35% : 0.000004s : 25: predicate.partial_defer_inline 1.43% : 0.000004s : 27: predicate.partial_eliminate 0.79% : 0.000002s : 17: predicate.print_const_string_wrapper 0.56% : 0.000002s : 12: predicate.reduce_all_const_elim 1.12% : 0.000003s : 17: predicate.reduce_eliminate 2.21% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 12: predicate.remove_not_recompute_node 1.14% : 0.000004s : 33: predicate.replace_applicator 0.44% : 0.000001s : 12: predicate.replace_old_param 0.21% : 0.000001s : 6: predicate.reset_defer_inline 0.85% : 0.000003s : 17: predicate.reshape_eliminate 0.61% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 6: predicate.row_tensor_eliminate 0.93% : 0.000003s : 12: predicate.same_eliminate 0.63% : 0.000002s : 12: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 12: predicate.shard_identity_eliminate 0.62% : 0.000002s : 12: predicate.special_op_eliminate 0.69% : 0.000002s : 12: predicate.specialize_transform 0.96% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.22% : 0.000004s : 25: predicate.switch_defer_inline 1.82% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.49% : 0.000014s : 81: predicate.switch_simplify 0.81% : 0.000002s : 17: predicate.tile_eliminate 0.87% : 0.000003s : 17: predicate.transpose_eliminate 1.39% : 0.000004s : 29: predicate.tuple_list_convert_item_index_to_positive 1.42% : 0.000004s : 29: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 2.92% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.50% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 2.03% : 0.000006s : 41: predicate.tuple_list_set_item_eliminator 1.53% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.16% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 2.84% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 1.39% : 0.000004s : 6: predicate.value_based_eliminate 0.75% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 12: predicate.virtual_output_eliminate 0.33% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.52% : 0.000002s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000557 11 55.73% : 0.000310s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.27% : 0.000247s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.057063 192 0.01% : 0.000006s : 1: ForceFp32Comm 27.94% : 0.015945s : 1: add_attr 27.91% : 0.015925s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.13% : 0.000072s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.14% : 0.000078s : 1: auto_monad 0.06% : 0.000032s : 1: auto_monad_reorder 0.01% : 0.000007s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.89% : 0.000509s : 1: bootstrap 0.06% : 0.000034s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.04% : 0.000024s : 1: control_data_broadcast_order 0.03% : 0.000015s : 1: convert_after_rewriter 0.07% : 0.000039s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000023s : 1: detach_backward 0.03% : 0.000014s : 1: environ_conv 0.05% : 0.000031s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.02% : 0.000009s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000011s : 1: label_micro_interleaved_index 0.86% : 0.000492s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 1.09% : 0.000624s : 1: mutable_eliminate 0.02% : 0.000011s : 1: offloading_packed_experts 0.03% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000020s : 1: opt.transform.mutable_eliminate 2.86% : 0.001631s : 78: opt.transform.opt_a 0.07% : 0.000042s : 1: opt.transform.opt_after_cconv 0.07% : 0.000039s : 1: opt.transform.opt_after_jit_grad 0.32% : 0.000181s : 28: opt.transform.opt_b 0.13% : 0.000072s : 2: opt.transform.opt_trans_graph 0.10% : 0.000057s : 4: opt.transform.symbol_engine_opt 6.85% : 0.003908s : 1: opt_a 0.29% : 0.000164s : 1: opt_after_cconv 1.04% : 0.000591s : 1: opt_after_jit_grad 0.66% : 0.000375s : 1: opt_b 12.39% : 0.007072s : 1: optimize 0.05% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.09% : 0.000049s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.09% : 0.000050s : 1: pre_auto_parallel 0.08% : 0.000043s : 1: py_interpret_to_execute 0.04% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000052s : 1: remove_dup_value 0.79% : 0.000453s : 1: renormalize.infer 0.66% : 0.000378s : 1: renormalize.specialize 0.02% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000063s : 1: rewriter_after_opt_a 0.19% : 0.000108s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.22% : 0.000128s : 1: symbol_engine_optimizer 0.21% : 0.000118s : 1: tuple_transform 12.61% : 0.007194s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:37.304.22 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0165302, [21] [bootstrap]: 0.00037802 [type_inference]: 0.00593088 [event_method]: 2.089e-05 [auto_monad]: 6.849e-05 [graph_reusing]: 5.52001e-06 [inline]: 2.73e-06 [add_attr]: 0.00323254, [1] [add_attr_with_inline]: 0.0032218, [1] [Cycle 1]: 6.931e-05, [2] [tag_attr]: 2.159e-05 [meta_addattr_fg_expand]: 6.91001e-06 [parallel-infer-symbol]: 3.84002e-06 [pre_auto_parallel]: 3.536e-05 [insert-virtual-dataset]: 2.53e-06 [parallel-infer-symbol-second]: 9.40025e-07 [dataset_repeat_opt]: 2.12999e-06 [pipeline_split]: 1.64998e-06 [optimize]: 0.00604853, [53] [py_interpret_to_execute]: 2.765e-05 [rewriter_before_opt_a]: 9.45e-05 [opt_a]: 0.00353746, [2] [Cycle 1]: 0.0025229, [45] [expand_dump_flag]: 2.84001e-06 [switch_simplify]: 4.942e-05 [loop_unroll]: 3.691e-05 [a_1]: 0.00078274 [with_stream_mark]: 2.013e-05 [recompute_prepare]: 1.408e-05 [updatestate_depend_eliminate]: 5.61998e-06 [updatestate_assign_eliminate]: 4.58999e-06 [updatestate_loads_eliminate]: 4.80999e-06 [parameter_eliminate]: 1.92001e-06 [a_2]: 0.00012978 [accelerated_algorithm]: 1.138e-05 [shard]: 2.21998e-06 [meta_shard_fg_expand]: 2.35002e-06 [shard_inline]: 1.042e-05 [merge_send_recv]: 1.052e-05 [auto_parallel]: 1.018e-05 [parallel]: 1.966e-05 [flash_sp]: 1.019e-05 [merge_comm]: 6.48e-06 [allreduce_fusion]: 5.21002e-06 [matmul_add_comm_reduction]: 1.189e-05 [allreduce_slice_to_reducescatter]: 8.09989e-07 [virtual_shard_identity]: 1.267e-05 [virtual_dataset]: 9.91e-06 [get_grad_eliminate_]: 9.14998e-06 [virtual_output]: 9.56e-06 [merge_forward]: 6.24001e-06 [cell_reuse_recompute_pass]: 1.27999e-06 [offload_activation]: 1.365e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.022e-05 [merge_recompute_call_nodes]: 1.72999e-06 [before_grad]: 1.491e-05 [set_forward_comm_id_for_comm_node_pass]: 6.06e-06 [meta_fg_expand]: 4.48999e-06 [flash_sp_send_recv_attached]: 2.48002e-06 [receive_attached]: 1.99e-06 [after_resolve]: 1.636e-05 [a_after_grad]: 1.558e-05 [renormalize]: 0.00080223 [add_forward_monad_depend]: 5.66003e-06 [auto_monad_grad]: 2.66e-06 [auto_monad_eliminator]: 2.06e-05 [cse]: 4.637e-05 [a_3]: 7.46e-05 [Cycle 2]: 0.00100371, [45] [expand_dump_flag]: 2.40002e-06 [switch_simplify]: 1.165e-05 [loop_unroll]: 9.20999e-06 [a_1]: 0.00023732 [with_stream_mark]: 1.681e-05 [recompute_prepare]: 1.14e-05 [updatestate_depend_eliminate]: 5.64e-06 [updatestate_assign_eliminate]: 3.61001e-06 [updatestate_loads_eliminate]: 4.43999e-06 [parameter_eliminate]: 1.35001e-06 [a_2]: 0.00012524 [accelerated_algorithm]: 1.096e-05 [shard]: 1.35999e-06 [meta_shard_fg_expand]: 2.34001e-06 [shard_inline]: 9.71e-06 [merge_send_recv]: 7.56999e-06 [auto_parallel]: 9.02e-06 [parallel]: 5.61e-06 [flash_sp]: 3.86999e-06 [merge_comm]: 5.04998e-06 [allreduce_fusion]: 4.70999e-06 [matmul_add_comm_reduction]: 8.49002e-06 [allreduce_slice_to_reducescatter]: 3.60014e-07 [virtual_shard_identity]: 1.122e-05 [virtual_dataset]: 8.89e-06 [get_grad_eliminate_]: 9.97999e-06 [virtual_output]: 9.14e-06 [merge_forward]: 4.67998e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 1.029e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.972e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 1.515e-05 [set_forward_comm_id_for_comm_node_pass]: 5.07e-06 [meta_fg_expand]: 3.98001e-06 [flash_sp_send_recv_attached]: 1.18001e-06 [receive_attached]: 1.15001e-06 [after_resolve]: 1.411e-05 [a_after_grad]: 1.461e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.64e-06 [auto_monad_grad]: 1.81e-06 [auto_monad_eliminator]: 1.298e-05 [cse]: 5.236e-05 [a_3]: 5.969e-05 [py_interpret_to_execute_after_opt_a]: 1.343e-05 [slice_cell_reuse_recomputed_activation]: 2.16e-06 [rewriter_after_opt_a]: 4.939e-05 [convert_after_rewriter]: 9.72999e-06 [order_py_execute_after_rewriter]: 6.53e-06 [mutable_eliminate]: 0.00055402 [opt_b]: 0.00032356, [1] [Cycle 1]: 0.00031585, [7] [b_1]: 0.00021186 [b_2]: 1.257e-05 [updatestate_depend_eliminate]: 8.35001e-06 [updatestate_assign_eliminate]: 3.68999e-06 [updatestate_loads_eliminate]: 4.33001e-06 [renormalize]: 4.30009e-07 [cse]: 3.35e-05 [optimize_parallel_all_gather_comm]: 2.042e-05 [overlap_param_gather]: 2.02999e-06 [cconv]: 2.726e-05 [loop_unroll]: 0.00050304 [opt_after_cconv]: 0.00014764, [1] [Cycle 1]: 0.0001413, [7] [c_1]: 4.658e-05 [parameter_eliminate]: 3.6e-06 [updatestate_depend_eliminate]: 8.59e-06 [updatestate_assign_eliminate]: 3.97998e-06 [updatestate_loads_eliminate]: 3.88999e-06 [cse]: 3.401e-05 [renormalize]: 6.39993e-07 [remove_dup_value]: 4.261e-05 [tuple_transform]: 0.00010858, [1] [Cycle 1]: 0.00010341, [4] [d_1]: 6.783e-05 [none_parameter_eliminate]: 1.87999e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 1.147e-05 [partial_unused_args_eliminate]: 2.39001e-06 [add_recomputation]: 6.792e-05 [cse_after_recomputation]: 3.19e-05, [1] [Cycle 1]: 2.643e-05, [1] [cse]: 1.976e-05 [environ_conv]: 6.96001e-06 [swap_dp_allreduce_reducescatter]: 7.63001e-06 [bias_add_comm_swap]: 2.96999e-06 [label_micro_interleaved_index]: 4.92e-06 [label_fine_grained_interleaved_index]: 2.79999e-06 [merge_cast_opt]: 1.35999e-06 [slice_recompute_activation]: 2.55002e-06 [micro_interleaved_order_control]: 2.17001e-06 [assign_add_opt]: 1.24e-06 [ForceFp32Comm]: 1.35001e-06 [remove_cast_before_assign_add]: 1.12999e-06 [full_micro_interleaved_order_control]: 2.21e-06 [reorder_send_recv_between_fp_bp]: 2.79001e-06 [comm_op_add_attrs]: 1.41002e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.25999e-06 [interleave_parallel_branches]: 1.19e-06 [overlap_opt_shard_in_pipeline]: 1.29998e-06 [overlap_opt_shard_grad_in_pipeline]: 2.49001e-06 [control_data_broadcast_order]: 1.913e-05 [grouped_pairwise_exchange_alltoall]: 1.69e-06 [offloading_packed_experts]: 4.89e-06 [overlap_recompute_and_grad_model_parallel]: 6.26998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.32999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.46002e-06 [overlap_recompute_comm]: 2.74001e-06 [overlap_grad_ring_attention]: 5.39e-06 [overlap_grad_flash_sp]: 2.755e-05 [begin_end_overlap_inline]: 8.2e-07 [split_matmul_comm_elemetwise]: 2.36e-06 [split_layernorm_comm]: 1.91e-06 [handle_group_info]: 1.17e-06 [symbol_engine_optimizer]: 0.00010855, [1] [Cycle 1]: 0.00010319, [6] [build]: 4.43001e-06 [elim_shapecalc]: 1.734e-05 [elim_not_effective]: 2.043e-05 [opt_reshape]: 1.077e-05 [fold_const_symbol]: 1.502e-05 [renormalize]: 1.90019e-07 [detach_backward]: 1.99999e-06 [pipeline_parallel_scheduler]: 1.61998e-06 [auto_monad_reorder]: 2.347e-05 [get_jit_bprop_graph]: 2.40002e-06 [rewriter_after_jit_bprop_graph]: 4.65001e-06 [opt_after_jit_grad]: 0.00055792 [validate]: 5.176e-05 Sums bootstrap : 0.000378s : 3.09% type_inference : 0.005931s : 48.49% event_method : 0.000021s : 0.17% auto_monad : 0.000068s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000035s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.23% optimize.rewriter_before_opt_a : 0.000095s : 0.77% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000061s : 0.50% optimize.opt_a.loop_unroll : 0.000046s : 0.38% optimize.opt_a.a_1 : 0.001020s : 8.34% optimize.opt_a.with_stream_mark : 0.000037s : 0.30% optimize.opt_a.recompute_prepare : 0.000025s : 0.21% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.08% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000255s : 2.09% optimize.opt_a.accelerated_algorithm : 0.000022s : 0.18% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000020s : 0.16% optimize.opt_a.merge_send_recv : 0.000018s : 0.15% optimize.opt_a.auto_parallel : 0.000019s : 0.16% optimize.opt_a.parallel : 0.000025s : 0.21% optimize.opt_a.flash_sp : 0.000014s : 0.11% optimize.opt_a.merge_comm : 0.000012s : 0.09% optimize.opt_a.allreduce_fusion : 0.000010s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.20% optimize.opt_a.virtual_dataset : 0.000019s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.16% optimize.opt_a.virtual_output : 0.000019s : 0.15% optimize.opt_a.merge_forward : 0.000011s : 0.09% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000024s : 0.20% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000040s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000030s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000008s : 0.07% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000030s : 0.25% optimize.opt_a.a_after_grad : 0.000030s : 0.25% optimize.opt_a.renormalize : 0.000802s : 6.56% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.27% optimize.opt_a.cse : 0.000099s : 0.81% optimize.opt_a.a_3 : 0.000134s : 1.10% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000049s : 0.40% optimize.convert_after_rewriter : 0.000010s : 0.08% optimize.order_py_execute_after_rewriter : 0.000007s : 0.05% optimize.mutable_eliminate : 0.000554s : 4.53% optimize.opt_b.b_1 : 0.000212s : 1.73% optimize.opt_b.b_2 : 0.000013s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.04% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000033s : 0.27% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000027s : 0.22% optimize.loop_unroll : 0.000503s : 4.11% optimize.opt_after_cconv.c_1 : 0.000047s : 0.38% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000034s : 0.28% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000043s : 0.35% optimize.tuple_transform.d_1 : 0.000068s : 0.55% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.09% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000068s : 0.56% optimize.cse_after_recomputation.cse : 0.000020s : 0.16% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000003s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000019s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000028s : 0.23% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.14% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.17% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.09% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000023s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000558s : 4.56% validate : 0.000052s : 0.42% Time group info: ------[substitution.] 0.000245 49 14.32% : 0.000035s : 6: substitution.cast_eliminate 1.19% : 0.000003s : 4: substitution.elim_not_effective 0.88% : 0.000002s : 4: substitution.fold_const_symbol 3.28% : 0.000008s : 6: substitution.graph_param_transform 63.85% : 0.000157s : 4: substitution.inline 2.09% : 0.000005s : 8: substitution.j_node_and_user_rematch 3.26% : 0.000008s : 8: substitution.remove_not_recompute_node 2.10% : 0.000005s : 4: substitution.replace_old_param 5.17% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator 3.86% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005863 2 87.10% : 0.005107s : 1: type_inference.infer 12.90% : 0.000756s : 1: type_inference.specialize ------[replace.] 0.000069 8 58.58% : 0.000041s : 4: replace.inline 41.42% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000164 8 93.40% : 0.000153s : 4: match.inline 6.60% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000304 1730 0.90% : 0.000003s : 17: predicate.accumulaten_eliminater 0.93% : 0.000003s : 6: predicate.ad_related_special_op_eliminate 0.59% : 0.000002s : 12: predicate.addn_check_dump 0.84% : 0.000003s : 17: predicate.addn_zero_filter 0.79% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.14% : 0.000007s : 29: predicate.arithmetic_simplify 1.19% : 0.000004s : 17: predicate.cast_eliminate 0.65% : 0.000002s : 12: predicate.check_bprop_eliminate 0.69% : 0.000002s : 12: predicate.compare_switch_simplify 0.18% : 0.000001s : 6: predicate.const_output_eliminate 0.72% : 0.000002s : 12: predicate.depend_value_elim 0.90% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.02% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.92% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.04% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 6: predicate.elim_not_effective 0.60% : 0.000002s : 6: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000004s : 23: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 23: predicate.environ_get_depend_swap 1.78% : 0.000005s : 35: predicate.environ_get_eliminate 1.13% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.35% : 0.000004s : 25: predicate.exchange_switch_depend_value 1.97% : 0.000006s : 25: predicate.float_depend_g_call 0.62% : 0.000002s : 12: predicate.float_environ_get_switch 0.88% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 6: predicate.fold_const_symbol 0.67% : 0.000002s : 12: predicate.get_grad_eliminate 0.20% : 0.000001s : 6: predicate.graph_param_transform 0.65% : 0.000002s : 12: predicate.incorporate_call 0.55% : 0.000002s : 12: predicate.incorporate_call_switch 5.79% : 0.000018s : 78: predicate.inline 0.88% : 0.000003s : 12: predicate.inline_without_move 0.31% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.92% : 0.000003s : 12: predicate.less_batch_normalization 1.82% : 0.000006s : 33: predicate.list_to_tuple_eliminator_ 2.54% : 0.000008s : 50: predicate.load_eliminater 0.97% : 0.000003s : 6: predicate.loop_unroll_after_grad 2.08% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.75% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.67% : 0.000002s : 12: predicate.merge_addn 0.65% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.65% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.83% : 0.000003s : 17: predicate.minmaximum_grad 0.98% : 0.000003s : 6: predicate.mutable_eliminate 0.35% : 0.000001s : 6: predicate.opt_reshape 0.37% : 0.000001s : 6: predicate.parallel_virtual_node 1.67% : 0.000005s : 25: predicate.partial_defer_inline 1.59% : 0.000005s : 27: predicate.partial_eliminate 0.86% : 0.000003s : 17: predicate.print_const_string_wrapper 0.62% : 0.000002s : 12: predicate.reduce_all_const_elim 1.29% : 0.000004s : 17: predicate.reduce_eliminate 2.50% : 0.000008s : 50: predicate.redundant_stop_gradient_eliminater 0.56% : 0.000002s : 12: predicate.remove_not_recompute_node 1.21% : 0.000004s : 33: predicate.replace_applicator 0.47% : 0.000001s : 12: predicate.replace_old_param 0.28% : 0.000001s : 6: predicate.reset_defer_inline 0.94% : 0.000003s : 17: predicate.reshape_eliminate 0.69% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 6: predicate.row_tensor_eliminate 0.70% : 0.000002s : 12: predicate.same_eliminate 0.42% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 12: predicate.shard_identity_eliminate 0.83% : 0.000003s : 12: predicate.special_op_eliminate 0.75% : 0.000002s : 12: predicate.specialize_transform 0.97% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000003s : 12: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.38% : 0.000004s : 25: predicate.switch_defer_inline 1.95% : 0.000006s : 37: predicate.switch_layer_defer_inline 5.17% : 0.000016s : 81: predicate.switch_simplify 0.86% : 0.000003s : 17: predicate.tile_eliminate 0.90% : 0.000003s : 17: predicate.transpose_eliminate 1.48% : 0.000004s : 29: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.11% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.58% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 2.48% : 0.000008s : 41: predicate.tuple_list_set_item_eliminator 1.80% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.44% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.12% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 6: predicate.value_based_eliminate 0.70% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.70% : 0.000002s : 12: predicate.virtual_output_eliminate 0.29% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000546 11 50.77% : 0.000277s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.23% : 0.000269s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028603 192 0.02% : 0.000004s : 1: ForceFp32Comm 11.32% : 0.003238s : 1: add_attr 11.28% : 0.003226s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.26% : 0.000073s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.26% : 0.000074s : 1: auto_monad 0.10% : 0.000028s : 1: auto_monad_reorder 0.02% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.41% : 0.000402s : 1: bootstrap 0.11% : 0.000032s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000023s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.12% : 0.000035s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.09% : 0.000026s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000006s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.79% : 0.000511s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.97% : 0.000564s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.08% : 0.000022s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000022s : 1: opt.transform.mutable_eliminate 5.91% : 0.001690s : 78: opt.transform.opt_a 0.16% : 0.000045s : 1: opt.transform.opt_after_cconv 0.14% : 0.000039s : 1: opt.transform.opt_after_jit_grad 0.66% : 0.000190s : 28: opt.transform.opt_b 0.27% : 0.000076s : 2: opt.transform.opt_trans_graph 0.21% : 0.000059s : 4: opt.transform.symbol_engine_opt 12.38% : 0.003541s : 1: opt_a 0.53% : 0.000151s : 1: opt_after_cconv 1.99% : 0.000569s : 1: opt_after_jit_grad 1.15% : 0.000328s : 1: opt_b 21.16% : 0.006054s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000010s : 1: order_py_execute_after_rewriter 0.11% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000006s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.14% : 0.000040s : 1: pre_auto_parallel 0.11% : 0.000032s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.17% : 0.000047s : 1: remove_dup_value 1.51% : 0.000431s : 1: renormalize.infer 1.26% : 0.000360s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000054s : 1: rewriter_after_opt_a 0.35% : 0.000099s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000006s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000112s : 1: symbol_engine_optimizer 0.39% : 0.000112s : 1: tuple_transform 20.79% : 0.005947s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:37.403.394 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:37.403.654 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0176234, [21] [bootstrap]: 0.00043511 [type_inference]: 0.00606326 [event_method]: 2.094e-05 [auto_monad]: 6.634e-05 [graph_reusing]: 5.70001e-06 [inline]: 2.37001e-06 [add_attr]: 0.00319304, [1] [add_attr_with_inline]: 0.00318349, [1] [Cycle 1]: 7.904e-05, [2] [tag_attr]: 2.051e-05 [meta_addattr_fg_expand]: 6.22001e-06 [parallel-infer-symbol]: 3.16999e-06 [pre_auto_parallel]: 3.298e-05 [insert-virtual-dataset]: 2.87002e-06 [parallel-infer-symbol-second]: 7.00005e-07 [dataset_repeat_opt]: 1.93997e-06 [pipeline_split]: 1.67999e-06 [optimize]: 0.00630275, [53] [py_interpret_to_execute]: 3.026e-05 [rewriter_before_opt_a]: 8.836e-05 [opt_a]: 0.00367961, [2] [Cycle 1]: 0.00264192, [45] [expand_dump_flag]: 2.79001e-06 [switch_simplify]: 4.278e-05 [loop_unroll]: 3.166e-05 [a_1]: 0.00067824 [with_stream_mark]: 1.89e-05 [recompute_prepare]: 1.395e-05 [updatestate_depend_eliminate]: 5.97999e-06 [updatestate_assign_eliminate]: 4.18001e-06 [updatestate_loads_eliminate]: 3.80998e-06 [parameter_eliminate]: 0.00018416 [a_2]: 0.00014161 [accelerated_algorithm]: 8.95001e-06 [shard]: 2.56998e-06 [meta_shard_fg_expand]: 2.63e-06 [shard_inline]: 8.2e-06 [merge_send_recv]: 1.374e-05 [auto_parallel]: 1.065e-05 [parallel]: 1.913e-05 [flash_sp]: 1.053e-05 [merge_comm]: 4.95001e-06 [allreduce_fusion]: 4.50001e-06 [matmul_add_comm_reduction]: 1.135e-05 [allreduce_slice_to_reducescatter]: 8.79983e-07 [virtual_shard_identity]: 1.089e-05 [virtual_dataset]: 8.82e-06 [get_grad_eliminate_]: 7.64002e-06 [virtual_output]: 7.65e-06 [merge_forward]: 4.60001e-06 [cell_reuse_recompute_pass]: 1.80001e-06 [offload_activation]: 1.072e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.134e-05 [merge_recompute_call_nodes]: 1.64998e-06 [before_grad]: 1.355e-05 [set_forward_comm_id_for_comm_node_pass]: 5.24998e-06 [meta_fg_expand]: 3.76999e-06 [flash_sp_send_recv_attached]: 3.09999e-06 [receive_attached]: 2.48e-06 [after_resolve]: 1.397e-05 [a_after_grad]: 1.246e-05 [renormalize]: 0.00072525 [add_forward_monad_depend]: 6.33998e-06 [auto_monad_grad]: 2.51e-06 [auto_monad_eliminator]: 1.802e-05 [cse]: 3.576e-05 [a_3]: 7.226e-05 [Cycle 2]: 0.00102282, [45] [expand_dump_flag]: 1.29e-06 [switch_simplify]: 9.02e-06 [loop_unroll]: 8.10999e-06 [a_1]: 0.0001913 [with_stream_mark]: 1.465e-05 [recompute_prepare]: 8.40001e-06 [updatestate_depend_eliminate]: 4.57e-06 [updatestate_assign_eliminate]: 3.14001e-06 [updatestate_loads_eliminate]: 3.27002e-06 [parameter_eliminate]: 1.30001e-06 [a_2]: 0.00012314 [accelerated_algorithm]: 9.31e-06 [shard]: 1.47001e-06 [meta_shard_fg_expand]: 2.64001e-06 [shard_inline]: 7.73001e-06 [merge_send_recv]: 8.07e-06 [auto_parallel]: 7.65e-06 [parallel]: 6.38e-06 [flash_sp]: 3.78999e-06 [merge_comm]: 4.42e-06 [allreduce_fusion]: 4.37e-06 [matmul_add_comm_reduction]: 8.13001e-06 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 1.161e-05 [virtual_dataset]: 8.23999e-06 [get_grad_eliminate_]: 7.11001e-06 [virtual_output]: 9.45001e-06 [merge_forward]: 4.87e-06 [cell_reuse_recompute_pass]: 2.09e-06 [offload_activation]: 9.31002e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.007e-05 [merge_recompute_call_nodes]: 9.79984e-07 [before_grad]: 1.145e-05 [set_forward_comm_id_for_comm_node_pass]: 5.17999e-06 [meta_fg_expand]: 3.18e-06 [flash_sp_send_recv_attached]: 1.77999e-06 [receive_attached]: 1.09e-06 [after_resolve]: 1.249e-05 [a_after_grad]: 1.116e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 2.51e-06 [auto_monad_grad]: 1.42999e-06 [auto_monad_eliminator]: 1.279e-05 [cse]: 2.228e-05 [a_3]: 6.124e-05 [py_interpret_to_execute_after_opt_a]: 1.874e-05 [slice_cell_reuse_recomputed_activation]: 5.00001e-06 [rewriter_after_opt_a]: 4.877e-05 [convert_after_rewriter]: 1.17e-05 [order_py_execute_after_rewriter]: 8.97999e-06 [mutable_eliminate]: 0.00054159 [opt_b]: 0.00034264, [1] [Cycle 1]: 0.00033179, [7] [b_1]: 0.00021847 [b_2]: 1.019e-05 [updatestate_depend_eliminate]: 8.37e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 3.3e-06 [renormalize]: 1.15001e-06 [cse]: 2.624e-05 [optimize_parallel_all_gather_comm]: 2.19e-05 [overlap_param_gather]: 5.10999e-06 [cconv]: 3.184e-05 [loop_unroll]: 0.00055134 [opt_after_cconv]: 0.00015374, [1] [Cycle 1]: 0.00014371, [7] [c_1]: 3.799e-05 [parameter_eliminate]: 4.84998e-06 [updatestate_depend_eliminate]: 7.51001e-06 [updatestate_assign_eliminate]: 3.32002e-06 [updatestate_loads_eliminate]: 3.9e-06 [cse]: 2.773e-05 [renormalize]: 5.29981e-07 [remove_dup_value]: 1.928e-05 [tuple_transform]: 0.0001086, [1] [Cycle 1]: 0.00010093, [4] [d_1]: 5.667e-05 [none_parameter_eliminate]: 1.96003e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 9.22999e-06 [partial_unused_args_eliminate]: 5.24e-06 [add_recomputation]: 6.355e-05 [cse_after_recomputation]: 3.294e-05, [1] [Cycle 1]: 2.587e-05, [1] [cse]: 1.608e-05 [environ_conv]: 1.07e-05 [swap_dp_allreduce_reducescatter]: 9.12001e-06 [bias_add_comm_swap]: 5.13002e-06 [label_micro_interleaved_index]: 6.92002e-06 [label_fine_grained_interleaved_index]: 5.37001e-06 [merge_cast_opt]: 3.65e-06 [slice_recompute_activation]: 4.48001e-06 [micro_interleaved_order_control]: 4.82e-06 [assign_add_opt]: 3.65e-06 [ForceFp32Comm]: 3.33e-06 [remove_cast_before_assign_add]: 3.38e-06 [full_micro_interleaved_order_control]: 4.80999e-06 [reorder_send_recv_between_fp_bp]: 5.20001e-06 [comm_op_add_attrs]: 3.66999e-06 [add_comm_op_reuse_tag]: 3.24001e-06 [interleave_split_concat_branches]: 3.41999e-06 [interleave_parallel_branches]: 3.36001e-06 [overlap_opt_shard_in_pipeline]: 3.96001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.63001e-06 [control_data_broadcast_order]: 2.044e-05 [grouped_pairwise_exchange_alltoall]: 4.24002e-06 [offloading_packed_experts]: 7.56001e-06 [overlap_recompute_and_grad_model_parallel]: 7.83001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.88001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.63e-06 [overlap_recompute_comm]: 4.58001e-06 [overlap_grad_ring_attention]: 7.31001e-06 [overlap_grad_flash_sp]: 2.861e-05 [begin_end_overlap_inline]: 3.01001e-06 [split_matmul_comm_elemetwise]: 4.92e-06 [split_layernorm_comm]: 4.17e-06 [handle_group_info]: 4.08999e-06 [symbol_engine_optimizer]: 0.00011438, [1] [Cycle 1]: 0.00010691, [6] [build]: 4.68999e-06 [elim_shapecalc]: 1.346e-05 [elim_not_effective]: 1.748e-05 [opt_reshape]: 8.43001e-06 [fold_const_symbol]: 1.277e-05 [renormalize]: 1.79978e-07 [detach_backward]: 4.75999e-06 [pipeline_parallel_scheduler]: 1.99999e-06 [auto_monad_reorder]: 2.561e-05 [get_jit_bprop_graph]: 1.79e-06 [rewriter_after_jit_bprop_graph]: 5.24998e-06 [opt_after_jit_grad]: 0.0005659 [validate]: 4.843e-05 Sums bootstrap : 0.000435s : 3.53% type_inference : 0.006063s : 49.16% event_method : 0.000021s : 0.17% auto_monad : 0.000066s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.27% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.25% optimize.rewriter_before_opt_a : 0.000088s : 0.72% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000052s : 0.42% optimize.opt_a.loop_unroll : 0.000040s : 0.32% optimize.opt_a.a_1 : 0.000870s : 7.05% optimize.opt_a.with_stream_mark : 0.000034s : 0.27% optimize.opt_a.recompute_prepare : 0.000022s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000185s : 1.50% optimize.opt_a.a_2 : 0.000265s : 2.15% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.15% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.13% optimize.opt_a.merge_send_recv : 0.000022s : 0.18% optimize.opt_a.auto_parallel : 0.000018s : 0.15% optimize.opt_a.parallel : 0.000026s : 0.21% optimize.opt_a.flash_sp : 0.000014s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.18% optimize.opt_a.virtual_dataset : 0.000017s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000017s : 0.14% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000041s : 0.34% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.21% optimize.opt_a.a_after_grad : 0.000024s : 0.19% optimize.opt_a.renormalize : 0.000725s : 5.88% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.25% optimize.opt_a.cse : 0.000058s : 0.47% optimize.opt_a.a_3 : 0.000134s : 1.08% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000049s : 0.40% optimize.convert_after_rewriter : 0.000012s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000542s : 4.39% optimize.opt_b.b_1 : 0.000218s : 1.77% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000026s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000032s : 0.26% optimize.loop_unroll : 0.000551s : 4.47% optimize.opt_after_cconv.c_1 : 0.000038s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000028s : 0.22% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.16% optimize.tuple_transform.d_1 : 0.000057s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000064s : 0.52% optimize.cse_after_recomputation.cse : 0.000016s : 0.13% optimize.environ_conv : 0.000011s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000020s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000029s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000005s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000026s : 0.21% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000566s : 4.59% validate : 0.000048s : 0.39% Time group info: ------[substitution.] 0.000218 39 10.79% : 0.000024s : 3: substitution.cast_eliminate 1.10% : 0.000002s : 3: substitution.elim_not_effective 0.79% : 0.000002s : 3: substitution.fold_const_symbol 3.24% : 0.000007s : 5: substitution.graph_param_transform 66.87% : 0.000146s : 4: substitution.inline 1.95% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.19% : 0.000007s : 6: substitution.remove_not_recompute_node 2.46% : 0.000005s : 4: substitution.replace_old_param 5.70% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator 3.92% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006006 2 87.82% : 0.005275s : 1: type_inference.infer 12.18% : 0.000731s : 1: type_inference.specialize ------[replace.] 0.000064 8 61.02% : 0.000039s : 4: replace.inline 38.98% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000154 8 93.10% : 0.000143s : 4: match.inline 6.90% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000246 1504 0.90% : 0.000002s : 15: predicate.accumulaten_eliminater 0.99% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.68% : 0.000002s : 10: predicate.addn_check_dump 0.84% : 0.000002s : 15: predicate.addn_zero_filter 0.84% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.18% : 0.000005s : 25: predicate.arithmetic_simplify 0.99% : 0.000002s : 15: predicate.cast_eliminate 0.65% : 0.000002s : 10: predicate.check_bprop_eliminate 0.58% : 0.000001s : 10: predicate.compare_switch_simplify 0.21% : 0.000001s : 5: predicate.const_output_eliminate 0.67% : 0.000002s : 10: predicate.depend_value_elim 0.91% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.14% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.02% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 5: predicate.elim_not_effective 0.53% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 20: predicate.environ_get_depend_swap 1.83% : 0.000004s : 30: predicate.environ_get_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.38% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.18% : 0.000005s : 23: predicate.float_depend_g_call 0.55% : 0.000001s : 10: predicate.float_environ_get_switch 0.87% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.63% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000000s : 5: predicate.graph_param_transform 0.67% : 0.000002s : 10: predicate.incorporate_call 0.55% : 0.000001s : 10: predicate.incorporate_call_switch 5.89% : 0.000014s : 68: predicate.inline 0.83% : 0.000002s : 10: predicate.inline_without_move 0.35% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 10: predicate.less_batch_normalization 1.88% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.45% : 0.000006s : 44: predicate.load_eliminater 0.98% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.26% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.64% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.69% : 0.000002s : 10: predicate.merge_addn 0.56% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 15: predicate.minmaximum_grad 0.95% : 0.000002s : 5: predicate.mutable_eliminate 0.35% : 0.000001s : 5: predicate.opt_reshape 0.41% : 0.000001s : 5: predicate.parallel_virtual_node 1.77% : 0.000004s : 23: predicate.partial_defer_inline 1.59% : 0.000004s : 24: predicate.partial_eliminate 0.98% : 0.000002s : 15: predicate.print_const_string_wrapper 0.60% : 0.000001s : 10: predicate.reduce_all_const_elim 1.23% : 0.000003s : 15: predicate.reduce_eliminate 2.52% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 10: predicate.remove_not_recompute_node 1.31% : 0.000003s : 29: predicate.replace_applicator 0.56% : 0.000001s : 10: predicate.replace_old_param 0.23% : 0.000001s : 5: predicate.reset_defer_inline 0.93% : 0.000002s : 15: predicate.reshape_eliminate 0.70% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 5: predicate.row_tensor_eliminate 0.73% : 0.000002s : 10: predicate.same_eliminate 0.52% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.97% : 0.000002s : 10: predicate.shard_identity_eliminate 0.70% : 0.000002s : 10: predicate.special_op_eliminate 0.88% : 0.000002s : 10: predicate.specialize_transform 0.92% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.43% : 0.000004s : 23: predicate.switch_defer_inline 1.97% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.25% : 0.000013s : 74: predicate.switch_simplify 0.93% : 0.000002s : 15: predicate.tile_eliminate 0.94% : 0.000002s : 15: predicate.transpose_eliminate 1.53% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.21% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.54% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.76% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.35% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.08% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 5: predicate.value_based_eliminate 0.65% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000556 11 55.03% : 0.000306s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.97% : 0.000250s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.029487 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.86% : 0.003203s : 1: add_attr 10.81% : 0.003187s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000068s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000076s : 1: auto_monad 0.11% : 0.000034s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.63% : 0.000482s : 1: bootstrap 0.12% : 0.000036s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000023s : 1: control_data_broadcast_order 0.05% : 0.000015s : 1: convert_after_rewriter 0.12% : 0.000036s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.09% : 0.000026s : 1: detach_backward 0.05% : 0.000014s : 1: environ_conv 0.11% : 0.000032s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.89% : 0.000558s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.86% : 0.000548s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.07% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 4.80% : 0.001416s : 78: opt.transform.opt_a 0.12% : 0.000036s : 1: opt.transform.opt_after_cconv 0.12% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.52% : 0.000153s : 28: opt.transform.opt_b 0.21% : 0.000063s : 2: opt.transform.opt_trans_graph 0.16% : 0.000048s : 4: opt.transform.symbol_engine_opt 12.49% : 0.003683s : 1: opt_a 0.53% : 0.000157s : 1: opt_after_cconv 1.96% : 0.000578s : 1: opt_after_jit_grad 1.17% : 0.000346s : 1: opt_b 23.32% : 0.006876s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000032s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000011s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000041s : 1: pre_auto_parallel 0.11% : 0.000034s : 1: py_interpret_to_execute 0.08% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000023s : 1: remove_dup_value 1.33% : 0.000391s : 1: renormalize.infer 1.10% : 0.000324s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000053s : 1: rewriter_after_opt_a 0.31% : 0.000092s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000118s : 1: symbol_engine_optimizer 0.38% : 0.000111s : 1: tuple_transform 20.71% : 0.006106s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:37.718.399 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0162228, [21] [bootstrap]: 0.00037327 [type_inference]: 0.00596697 [event_method]: 1.988e-05 [auto_monad]: 6.856e-05 [graph_reusing]: 6.35002e-06 [inline]: 2.48998e-06 [add_attr]: 0.00354821, [1] [add_attr_with_inline]: 0.00353695, [1] [Cycle 1]: 6.748e-05, [2] [tag_attr]: 2.122e-05 [meta_addattr_fg_expand]: 6.57002e-06 [parallel-infer-symbol]: 3.6e-06 [pre_auto_parallel]: 3.841e-05 [insert-virtual-dataset]: 2.96999e-06 [parallel-infer-symbol-second]: 7.99977e-07 [dataset_repeat_opt]: 2.03002e-06 [pipeline_split]: 1.80001e-06 [optimize]: 0.00548118, [53] [py_interpret_to_execute]: 2.937e-05 [rewriter_before_opt_a]: 9.141e-05 [opt_a]: 0.00322855, [2] [Cycle 1]: 0.00237934, [45] [expand_dump_flag]: 3.25e-06 [switch_simplify]: 4.516e-05 [loop_unroll]: 3.178e-05 [a_1]: 0.00070464 [with_stream_mark]: 2.091e-05 [recompute_prepare]: 1.36e-05 [updatestate_depend_eliminate]: 5.27001e-06 [updatestate_assign_eliminate]: 3.88999e-06 [updatestate_loads_eliminate]: 4.05e-06 [parameter_eliminate]: 2.02001e-06 [a_2]: 0.00010344 [accelerated_algorithm]: 8.25e-06 [shard]: 2.35002e-06 [meta_shard_fg_expand]: 2.16e-06 [shard_inline]: 8.11002e-06 [merge_send_recv]: 9.74e-06 [auto_parallel]: 8.06001e-06 [parallel]: 1.919e-05 [flash_sp]: 9.57001e-06 [merge_comm]: 5.06002e-06 [allreduce_fusion]: 4.25999e-06 [matmul_add_comm_reduction]: 1.198e-05 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 1.139e-05 [virtual_dataset]: 8.47e-06 [get_grad_eliminate_]: 7.6e-06 [virtual_output]: 7.84002e-06 [merge_forward]: 5.22999e-06 [cell_reuse_recompute_pass]: 1.09e-06 [offload_activation]: 1.224e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.899e-05 [merge_recompute_call_nodes]: 1.76e-06 [before_grad]: 1.34e-05 [set_forward_comm_id_for_comm_node_pass]: 5.59e-06 [meta_fg_expand]: 3.60998e-06 [flash_sp_send_recv_attached]: 2.96001e-06 [receive_attached]: 2.62001e-06 [after_resolve]: 1.42e-05 [a_after_grad]: 1.296e-05 [renormalize]: 0.00082491 [add_forward_monad_depend]: 6.93e-06 [auto_monad_grad]: 3.52002e-06 [auto_monad_eliminator]: 2.017e-05 [cse]: 4.048e-05 [a_3]: 6.231e-05 [Cycle 2]: 0.00083835, [45] [expand_dump_flag]: 1.62001e-06 [switch_simplify]: 9.61e-06 [loop_unroll]: 7.56999e-06 [a_1]: 0.00017894 [with_stream_mark]: 1.724e-05 [recompute_prepare]: 8.63001e-06 [updatestate_depend_eliminate]: 4.12e-06 [updatestate_assign_eliminate]: 3.33e-06 [updatestate_loads_eliminate]: 3.38e-06 [parameter_eliminate]: 1.21002e-06 [a_2]: 9.287e-05 [accelerated_algorithm]: 9.00999e-06 [shard]: 1.52999e-06 [meta_shard_fg_expand]: 1.89e-06 [shard_inline]: 7.67998e-06 [merge_send_recv]: 7.01001e-06 [auto_parallel]: 1.082e-05 [parallel]: 6.14999e-06 [flash_sp]: 3.83001e-06 [merge_comm]: 5.04e-06 [allreduce_fusion]: 4.04002e-06 [matmul_add_comm_reduction]: 7.78999e-06 [allreduce_slice_to_reducescatter]: 5.19998e-07 [virtual_shard_identity]: 1.053e-05 [virtual_dataset]: 7.5e-06 [get_grad_eliminate_]: 7.33e-06 [virtual_output]: 7.21001e-06 [merge_forward]: 3.86999e-06 [cell_reuse_recompute_pass]: 1.82001e-06 [offload_activation]: 1.278e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.796e-05 [merge_recompute_call_nodes]: 1.13001e-06 [before_grad]: 2.578e-05 [set_forward_comm_id_for_comm_node_pass]: 5.20999e-06 [meta_fg_expand]: 3.25e-06 [flash_sp_send_recv_attached]: 1.20001e-06 [receive_attached]: 1.44e-06 [after_resolve]: 1.482e-05 [a_after_grad]: 1.272e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.63e-06 [auto_monad_grad]: 1.59e-06 [auto_monad_eliminator]: 1.37e-05 [cse]: 2.447e-05 [a_3]: 4.678e-05 [py_interpret_to_execute_after_opt_a]: 1.499e-05 [slice_cell_reuse_recomputed_activation]: 2.06e-06 [rewriter_after_opt_a]: 4.475e-05 [convert_after_rewriter]: 7.87e-06 [order_py_execute_after_rewriter]: 5.72999e-06 [mutable_eliminate]: 0.00057467 [opt_b]: 0.00026242, [1] [Cycle 1]: 0.000255, [7] [b_1]: 0.00016998 [b_2]: 9.51e-06 [updatestate_depend_eliminate]: 7.31999e-06 [updatestate_assign_eliminate]: 2.96001e-06 [updatestate_loads_eliminate]: 3.3e-06 [renormalize]: 4.80009e-07 [cse]: 2.461e-05 [optimize_parallel_all_gather_comm]: 1.735e-05 [overlap_param_gather]: 2.12999e-06 [cconv]: 2.921e-05 [loop_unroll]: 0.00044443 [opt_after_cconv]: 0.0001161, [1] [Cycle 1]: 0.00010982, [7] [c_1]: 3.78e-05 [parameter_eliminate]: 2.14999e-06 [updatestate_depend_eliminate]: 5.64e-06 [updatestate_assign_eliminate]: 3.16999e-06 [updatestate_loads_eliminate]: 3.03998e-06 [cse]: 2.337e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.511e-05 [tuple_transform]: 8.716e-05, [1] [Cycle 1]: 8.251e-05, [4] [d_1]: 5.312e-05 [none_parameter_eliminate]: 1.59e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 8.52998e-06 [partial_unused_args_eliminate]: 1.88997e-06 [add_recomputation]: 5.742e-05 [cse_after_recomputation]: 2.593e-05, [1] [Cycle 1]: 2.125e-05, [1] [cse]: 1.556e-05 [environ_conv]: 6.21998e-06 [swap_dp_allreduce_reducescatter]: 5.88998e-06 [bias_add_comm_swap]: 2.48998e-06 [label_micro_interleaved_index]: 4.14997e-06 [label_fine_grained_interleaved_index]: 2.79001e-06 [merge_cast_opt]: 1.31002e-06 [slice_recompute_activation]: 2.02999e-06 [micro_interleaved_order_control]: 2.30002e-06 [assign_add_opt]: 1.60001e-06 [ForceFp32Comm]: 8.70001e-07 [remove_cast_before_assign_add]: 1.12999e-06 [full_micro_interleaved_order_control]: 2.04999e-06 [reorder_send_recv_between_fp_bp]: 2.89001e-06 [comm_op_add_attrs]: 1.07e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.11002e-06 [overlap_opt_shard_in_pipeline]: 1.22e-06 [overlap_opt_shard_grad_in_pipeline]: 1.83002e-06 [control_data_broadcast_order]: 1.535e-05 [grouped_pairwise_exchange_alltoall]: 2.29999e-06 [offloading_packed_experts]: 4.58999e-06 [overlap_recompute_and_grad_model_parallel]: 5.62999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.55001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.54e-06 [overlap_recompute_comm]: 2.18002e-06 [overlap_grad_ring_attention]: 4.33001e-06 [overlap_grad_flash_sp]: 2.442e-05 [begin_end_overlap_inline]: 7.7e-07 [split_matmul_comm_elemetwise]: 2.29999e-06 [split_layernorm_comm]: 1.87999e-06 [handle_group_info]: 1.47001e-06 [symbol_engine_optimizer]: 8.864e-05, [1] [Cycle 1]: 8.356e-05, [6] [build]: 3.73001e-06 [elim_shapecalc]: 1.359e-05 [elim_not_effective]: 1.51e-05 [opt_reshape]: 8.52998e-06 [fold_const_symbol]: 1.206e-05 [renormalize]: 1.90019e-07 [detach_backward]: 2.22001e-06 [pipeline_parallel_scheduler]: 1.69998e-06 [auto_monad_reorder]: 2.067e-05 [get_jit_bprop_graph]: 1.57001e-06 [rewriter_after_jit_bprop_graph]: 3.46999e-06 [opt_after_jit_grad]: 0.0004834 [validate]: 4.198e-05 Sums bootstrap : 0.000373s : 3.20% type_inference : 0.005967s : 51.09% event_method : 0.000020s : 0.17% auto_monad : 0.000069s : 0.59% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000038s : 0.33% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000029s : 0.25% optimize.rewriter_before_opt_a : 0.000091s : 0.78% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.47% optimize.opt_a.loop_unroll : 0.000039s : 0.34% optimize.opt_a.a_1 : 0.000884s : 7.57% optimize.opt_a.with_stream_mark : 0.000038s : 0.33% optimize.opt_a.recompute_prepare : 0.000022s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000196s : 1.68% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.15% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.14% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000019s : 0.16% optimize.opt_a.parallel : 0.000025s : 0.22% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.19% optimize.opt_a.virtual_dataset : 0.000016s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000025s : 0.21% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000039s : 0.34% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.25% optimize.opt_a.a_after_grad : 0.000026s : 0.22% optimize.opt_a.renormalize : 0.000825s : 7.06% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.08% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.29% optimize.opt_a.cse : 0.000065s : 0.56% optimize.opt_a.a_3 : 0.000109s : 0.93% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000045s : 0.38% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000575s : 4.92% optimize.opt_b.b_1 : 0.000170s : 1.46% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000025s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000029s : 0.25% optimize.loop_unroll : 0.000444s : 3.81% optimize.opt_after_cconv.c_1 : 0.000038s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.13% optimize.tuple_transform.d_1 : 0.000053s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000057s : 0.49% optimize.cse_after_recomputation.cse : 0.000016s : 0.13% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000024s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.18% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.03% opt_after_jit_grad : 0.000483s : 4.14% validate : 0.000042s : 0.36% Time group info: ------[substitution.] 0.000239 39 11.82% : 0.000028s : 3: substitution.cast_eliminate 0.94% : 0.000002s : 3: substitution.elim_not_effective 0.70% : 0.000002s : 3: substitution.fold_const_symbol 2.98% : 0.000007s : 5: substitution.graph_param_transform 67.48% : 0.000161s : 4: substitution.inline 1.93% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.88% : 0.000007s : 6: substitution.remove_not_recompute_node 2.14% : 0.000005s : 4: substitution.replace_old_param 5.17% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator 3.95% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005903 2 87.46% : 0.005163s : 1: type_inference.infer 12.54% : 0.000740s : 1: type_inference.specialize ------[replace.] 0.000068 8 61.39% : 0.000042s : 4: replace.inline 38.61% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000169 8 93.74% : 0.000158s : 4: match.inline 6.26% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000242 1504 0.90% : 0.000002s : 15: predicate.accumulaten_eliminater 0.69% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.57% : 0.000001s : 10: predicate.addn_check_dump 0.87% : 0.000002s : 15: predicate.addn_zero_filter 0.82% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.99% : 0.000005s : 25: predicate.arithmetic_simplify 1.26% : 0.000003s : 15: predicate.cast_eliminate 0.60% : 0.000001s : 10: predicate.check_bprop_eliminate 0.58% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.64% : 0.000002s : 10: predicate.depend_value_elim 0.96% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.97% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.87% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.42% : 0.000001s : 5: predicate.elim_not_effective 0.38% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_depend_swap 1.79% : 0.000004s : 30: predicate.environ_get_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.35% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.45% : 0.000006s : 23: predicate.float_depend_g_call 0.65% : 0.000002s : 10: predicate.float_environ_get_switch 0.86% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.64% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.71% : 0.000002s : 10: predicate.incorporate_call 0.56% : 0.000001s : 10: predicate.incorporate_call_switch 6.32% : 0.000015s : 68: predicate.inline 0.98% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.82% : 0.000002s : 10: predicate.less_batch_normalization 2.04% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.51% : 0.000006s : 44: predicate.load_eliminater 0.77% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.15% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.80% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.63% : 0.000002s : 10: predicate.merge_addn 0.58% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.64% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 15: predicate.minmaximum_grad 1.04% : 0.000003s : 5: predicate.mutable_eliminate 0.38% : 0.000001s : 5: predicate.opt_reshape 0.40% : 0.000001s : 5: predicate.parallel_virtual_node 1.63% : 0.000004s : 23: predicate.partial_defer_inline 1.62% : 0.000004s : 24: predicate.partial_eliminate 0.84% : 0.000002s : 15: predicate.print_const_string_wrapper 0.59% : 0.000001s : 10: predicate.reduce_all_const_elim 1.07% : 0.000003s : 15: predicate.reduce_eliminate 2.39% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 10: predicate.remove_not_recompute_node 1.37% : 0.000003s : 29: predicate.replace_applicator 0.51% : 0.000001s : 10: predicate.replace_old_param 0.25% : 0.000001s : 5: predicate.reset_defer_inline 0.94% : 0.000002s : 15: predicate.reshape_eliminate 0.64% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.74% : 0.000002s : 10: predicate.same_eliminate 0.49% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.89% : 0.000002s : 10: predicate.shard_identity_eliminate 0.72% : 0.000002s : 10: predicate.special_op_eliminate 0.76% : 0.000002s : 10: predicate.specialize_transform 1.04% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 1.16% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.42% : 0.000003s : 23: predicate.switch_defer_inline 1.96% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.82% : 0.000012s : 74: predicate.switch_simplify 0.80% : 0.000002s : 15: predicate.tile_eliminate 0.90% : 0.000002s : 15: predicate.transpose_eliminate 1.56% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.41% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.55% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.24% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.77% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.45% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.08% : 0.000007s : 54: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 5: predicate.value_based_eliminate 0.69% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.62% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000535 11 50.61% : 0.000271s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.39% : 0.000264s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027729 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.82% : 0.003554s : 1: add_attr 12.77% : 0.003541s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000061s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000075s : 1: auto_monad 0.09% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.44% : 0.000398s : 1: bootstrap 0.12% : 0.000033s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.10% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000010s : 1: environ_conv 0.10% : 0.000027s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.63% : 0.000453s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.10% : 0.000583s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 5.18% : 0.001437s : 78: opt.transform.opt_a 0.13% : 0.000036s : 1: opt.transform.opt_after_cconv 0.10% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.53% : 0.000148s : 28: opt.transform.opt_b 0.21% : 0.000059s : 2: opt.transform.opt_trans_graph 0.16% : 0.000045s : 4: opt.transform.symbol_engine_opt 11.65% : 0.003232s : 1: opt_a 0.43% : 0.000120s : 1: opt_after_cconv 1.78% : 0.000493s : 1: opt_after_jit_grad 0.96% : 0.000266s : 1: opt_b 19.79% : 0.005487s : 1: optimize 0.07% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000042s : 1: pre_auto_parallel 0.12% : 0.000034s : 1: py_interpret_to_execute 0.07% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.64% : 0.000455s : 1: renormalize.infer 1.30% : 0.000359s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000049s : 1: rewriter_after_opt_a 0.35% : 0.000097s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000091s : 1: symbol_engine_optimizer 0.32% : 0.000090s : 1: tuple_transform 21.59% : 0.005986s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:38.434.91 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:38.437.36 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0169608, [21] [bootstrap]: 0.00044884 [type_inference]: 0.00624208 [event_method]: 2.034e-05 [auto_monad]: 6.746e-05 [graph_reusing]: 6.96001e-06 [inline]: 2.93e-06 [add_attr]: 0.00320007, [1] [add_attr_with_inline]: 0.0031913, [1] [Cycle 1]: 7.422e-05, [2] [tag_attr]: 1.879e-05 [meta_addattr_fg_expand]: 6.09001e-06 [parallel-infer-symbol]: 3.18998e-06 [pre_auto_parallel]: 3.036e-05 [insert-virtual-dataset]: 2.48e-06 [parallel-infer-symbol-second]: 6.69999e-07 [dataset_repeat_opt]: 1.72999e-06 [pipeline_split]: 1.58002e-06 [optimize]: 0.00568436, [53] [py_interpret_to_execute]: 2.7e-05 [rewriter_before_opt_a]: 8.452e-05 [opt_a]: 0.00302043, [2] [Cycle 1]: 0.00210253, [45] [expand_dump_flag]: 2.94001e-06 [switch_simplify]: 4.266e-05 [loop_unroll]: 3.054e-05 [a_1]: 0.00061779 [with_stream_mark]: 1.591e-05 [recompute_prepare]: 1.193e-05 [updatestate_depend_eliminate]: 4.18999e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 3.2e-06 [parameter_eliminate]: 1.95001e-06 [a_2]: 0.00011029 [accelerated_algorithm]: 7.03e-06 [shard]: 1.75001e-06 [meta_shard_fg_expand]: 1.74e-06 [shard_inline]: 6.84001e-06 [merge_send_recv]: 9.07001e-06 [auto_parallel]: 6.96001e-06 [parallel]: 1.945e-05 [flash_sp]: 8.38001e-06 [merge_comm]: 4.17e-06 [allreduce_fusion]: 3.64002e-06 [matmul_add_comm_reduction]: 9.62001e-06 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 9.15001e-06 [virtual_dataset]: 6.64999e-06 [get_grad_eliminate_]: 6.33e-06 [virtual_output]: 7.11001e-06 [merge_forward]: 4.50001e-06 [cell_reuse_recompute_pass]: 1.59998e-06 [offload_activation]: 1.103e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.731e-05 [merge_recompute_call_nodes]: 2.09999e-06 [before_grad]: 1.05e-05 [set_forward_comm_id_for_comm_node_pass]: 4.20999e-06 [meta_fg_expand]: 2.96999e-06 [flash_sp_send_recv_attached]: 2.88e-06 [receive_attached]: 2.31e-06 [after_resolve]: 1.14e-05 [a_after_grad]: 1.034e-05 [renormalize]: 0.00053317 [add_forward_monad_depend]: 5.17999e-06 [auto_monad_grad]: 2.32999e-06 [auto_monad_eliminator]: 1.51e-05 [cse]: 2.884e-05 [a_3]: 6.519e-05 [Cycle 2]: 0.00090225, [45] [expand_dump_flag]: 1.22e-06 [switch_simplify]: 7.7e-06 [loop_unroll]: 6.68e-06 [a_1]: 0.0001292 [with_stream_mark]: 1.429e-05 [recompute_prepare]: 7.11001e-06 [updatestate_depend_eliminate]: 3.52002e-06 [updatestate_assign_eliminate]: 2.44999e-06 [updatestate_loads_eliminate]: 2.73003e-06 [parameter_eliminate]: 1.49e-06 [a_2]: 0.00010137 [accelerated_algorithm]: 7.01001e-06 [shard]: 1.84e-06 [meta_shard_fg_expand]: 1.67001e-06 [shard_inline]: 7.01999e-06 [merge_send_recv]: 8.52e-06 [auto_parallel]: 8.02e-06 [parallel]: 6.56999e-06 [flash_sp]: 4.38999e-06 [merge_comm]: 4.82e-06 [allreduce_fusion]: 3.58e-06 [matmul_add_comm_reduction]: 8.78001e-06 [allreduce_slice_to_reducescatter]: 7.60017e-07 [virtual_shard_identity]: 9.44e-06 [virtual_dataset]: 5.95002e-06 [get_grad_eliminate_]: 5.92001e-06 [virtual_output]: 6.71e-06 [merge_forward]: 3.98999e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 9.61e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.996e-05 [merge_recompute_call_nodes]: 1.19e-06 [before_grad]: 1.079e-05 [set_forward_comm_id_for_comm_node_pass]: 4.62e-06 [meta_fg_expand]: 2.56e-06 [flash_sp_send_recv_attached]: 1.42999e-06 [receive_attached]: 1.74e-06 [after_resolve]: 1.213e-05 [a_after_grad]: 1.009e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 3.3e-06 [auto_monad_grad]: 1.84e-06 [auto_monad_eliminator]: 1.328e-05 [cse]: 2.204e-05 [a_3]: 5.367e-05 [py_interpret_to_execute_after_opt_a]: 1.934e-05 [slice_cell_reuse_recomputed_activation]: 5.28002e-06 [rewriter_after_opt_a]: 4.913e-05 [convert_after_rewriter]: 1.173e-05 [order_py_execute_after_rewriter]: 8.24998e-06 [mutable_eliminate]: 0.00064564 [opt_b]: 0.00030526, [1] [Cycle 1]: 0.0002944, [7] [b_1]: 0.00018745 [b_2]: 9.19998e-06 [updatestate_depend_eliminate]: 8.23999e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 2.91e-06 [renormalize]: 6.19999e-07 [cse]: 2.332e-05 [optimize_parallel_all_gather_comm]: 2.342e-05 [overlap_param_gather]: 4.44002e-06 [cconv]: 3.403e-05 [loop_unroll]: 0.00050041 [opt_after_cconv]: 0.00021063, [1] [Cycle 1]: 0.00020046, [7] [c_1]: 9.407e-05 [parameter_eliminate]: 5.30999e-06 [updatestate_depend_eliminate]: 8.89e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.64001e-06 [cse]: 2.125e-05 [renormalize]: 4.39992e-07 [remove_dup_value]: 1.809e-05 [tuple_transform]: 9.795e-05, [1] [Cycle 1]: 8.962e-05, [4] [d_1]: 4.79e-05 [none_parameter_eliminate]: 2.06e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 7.41001e-06 [partial_unused_args_eliminate]: 4.74e-06 [add_recomputation]: 5.333e-05 [cse_after_recomputation]: 2.951e-05, [1] [Cycle 1]: 2.228e-05, [1] [cse]: 1.19e-05 [environ_conv]: 9.25999e-06 [swap_dp_allreduce_reducescatter]: 8.43999e-06 [bias_add_comm_swap]: 5.81e-06 [label_micro_interleaved_index]: 8.43999e-06 [label_fine_grained_interleaved_index]: 5.14998e-06 [merge_cast_opt]: 4.08999e-06 [slice_recompute_activation]: 4.77e-06 [micro_interleaved_order_control]: 4.51002e-06 [assign_add_opt]: 4.32e-06 [ForceFp32Comm]: 3.45e-06 [remove_cast_before_assign_add]: 3.35e-06 [full_micro_interleaved_order_control]: 4.50001e-06 [reorder_send_recv_between_fp_bp]: 5.51998e-06 [comm_op_add_attrs]: 3.47002e-06 [add_comm_op_reuse_tag]: 3.38999e-06 [interleave_split_concat_branches]: 3.34001e-06 [interleave_parallel_branches]: 3.75e-06 [overlap_opt_shard_in_pipeline]: 3.46001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.35999e-06 [control_data_broadcast_order]: 1.712e-05 [grouped_pairwise_exchange_alltoall]: 4.03001e-06 [offloading_packed_experts]: 6.24999e-06 [overlap_recompute_and_grad_model_parallel]: 8.13999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.88999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.97e-06 [overlap_recompute_comm]: 4.88001e-06 [overlap_grad_ring_attention]: 7.03e-06 [overlap_grad_flash_sp]: 2.579e-05 [begin_end_overlap_inline]: 3.32002e-06 [split_matmul_comm_elemetwise]: 5.17e-06 [split_layernorm_comm]: 4.3e-06 [handle_group_info]: 4.25999e-06 [symbol_engine_optimizer]: 0.000108, [1] [Cycle 1]: 0.00010078, [6] [build]: 4.34997e-06 [elim_shapecalc]: 1.188e-05 [elim_not_effective]: 1.542e-05 [opt_reshape]: 7.41999e-06 [fold_const_symbol]: 1.062e-05 [renormalize]: 2.19996e-07 [detach_backward]: 4.3e-06 [pipeline_parallel_scheduler]: 1.95001e-06 [auto_monad_reorder]: 2.166e-05 [get_jit_bprop_graph]: 1.92001e-06 [rewriter_after_jit_bprop_graph]: 5.41002e-06 [opt_after_jit_grad]: 0.00056351 [validate]: 4.602e-05 Sums bootstrap : 0.000449s : 3.77% type_inference : 0.006242s : 52.48% event_method : 0.000020s : 0.17% auto_monad : 0.000067s : 0.57% graph_reusing : 0.000007s : 0.06% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000030s : 0.26% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000027s : 0.23% optimize.rewriter_before_opt_a : 0.000085s : 0.71% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000050s : 0.42% optimize.opt_a.loop_unroll : 0.000037s : 0.31% optimize.opt_a.a_1 : 0.000747s : 6.28% optimize.opt_a.with_stream_mark : 0.000030s : 0.25% optimize.opt_a.recompute_prepare : 0.000019s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000212s : 1.78% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.12% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.12% optimize.opt_a.merge_send_recv : 0.000018s : 0.15% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000026s : 0.22% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.16% optimize.opt_a.virtual_dataset : 0.000013s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.10% optimize.opt_a.virtual_output : 0.000014s : 0.12% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000021s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000021s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.20% optimize.opt_a.a_after_grad : 0.000020s : 0.17% optimize.opt_a.renormalize : 0.000533s : 4.48% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.24% optimize.opt_a.cse : 0.000051s : 0.43% optimize.opt_a.a_3 : 0.000119s : 1.00% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.16% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000049s : 0.41% optimize.convert_after_rewriter : 0.000012s : 0.10% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000646s : 5.43% optimize.opt_b.b_1 : 0.000187s : 1.58% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.20% optimize.overlap_param_gather : 0.000004s : 0.04% optimize.cconv : 0.000034s : 0.29% optimize.loop_unroll : 0.000500s : 4.21% optimize.opt_after_cconv.c_1 : 0.000094s : 0.79% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000021s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.15% optimize.tuple_transform.d_1 : 0.000048s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000053s : 0.45% optimize.cse_after_recomputation.cse : 0.000012s : 0.10% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000006s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000026s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000022s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000564s : 4.74% validate : 0.000046s : 0.39% Time group info: ------[substitution.] 0.000185 29 1.23% : 0.000002s : 2: substitution.elim_not_effective 0.69% : 0.000001s : 2: substitution.fold_const_symbol 3.46% : 0.000006s : 4: substitution.graph_param_transform 73.71% : 0.000136s : 4: substitution.inline 1.91% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.18% : 0.000006s : 4: substitution.remove_not_recompute_node 2.70% : 0.000005s : 4: substitution.replace_old_param 8.36% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator 4.75% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006182 2 88.02% : 0.005442s : 1: type_inference.infer 11.98% : 0.000741s : 1: type_inference.specialize ------[replace.] 0.000060 8 62.80% : 0.000037s : 4: replace.inline 37.20% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000147 8 90.87% : 0.000134s : 4: match.inline 9.13% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000209 1278 0.87% : 0.000002s : 13: predicate.accumulaten_eliminater 1.03% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 8: predicate.addn_check_dump 0.89% : 0.000002s : 13: predicate.addn_zero_filter 0.81% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.03% : 0.000004s : 21: predicate.arithmetic_simplify 0.87% : 0.000002s : 13: predicate.cast_eliminate 0.63% : 0.000001s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.63% : 0.000001s : 8: predicate.depend_value_elim 0.87% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.90% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.81% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.08% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 4: predicate.elim_not_effective 0.33% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.20% : 0.000002s : 17: predicate.environ_get_depend_swap 1.74% : 0.000004s : 25: predicate.environ_get_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.40% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.36% : 0.000005s : 21: predicate.float_depend_g_call 0.53% : 0.000001s : 8: predicate.float_environ_get_switch 0.75% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.74% : 0.000002s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.62% : 0.000001s : 8: predicate.incorporate_call 0.50% : 0.000001s : 8: predicate.incorporate_call_switch 6.57% : 0.000014s : 58: predicate.inline 0.88% : 0.000002s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.74% : 0.000002s : 8: predicate.less_batch_normalization 1.81% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.45% : 0.000005s : 38: predicate.load_eliminater 0.85% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.44% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.67% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 8: predicate.merge_addn 0.56% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 13: predicate.minmaximum_grad 0.88% : 0.000002s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.38% : 0.000001s : 4: predicate.parallel_virtual_node 1.65% : 0.000003s : 21: predicate.partial_defer_inline 1.64% : 0.000003s : 21: predicate.partial_eliminate 0.98% : 0.000002s : 13: predicate.print_const_string_wrapper 0.58% : 0.000001s : 8: predicate.reduce_all_const_elim 1.11% : 0.000002s : 13: predicate.reduce_eliminate 2.40% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.78% : 0.000002s : 8: predicate.remove_not_recompute_node 1.46% : 0.000003s : 25: predicate.replace_applicator 0.67% : 0.000001s : 8: predicate.replace_old_param 0.25% : 0.000001s : 4: predicate.reset_defer_inline 0.97% : 0.000002s : 13: predicate.reshape_eliminate 0.60% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.54% : 0.000001s : 4: predicate.row_tensor_eliminate 0.97% : 0.000002s : 8: predicate.same_eliminate 0.47% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.94% : 0.000002s : 8: predicate.shard_identity_eliminate 0.89% : 0.000002s : 8: predicate.special_op_eliminate 0.68% : 0.000001s : 8: predicate.specialize_transform 1.11% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.83% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.47% : 0.000003s : 21: predicate.switch_defer_inline 2.36% : 0.000005s : 29: predicate.switch_layer_defer_inline 5.25% : 0.000011s : 67: predicate.switch_simplify 0.82% : 0.000002s : 13: predicate.tile_eliminate 0.82% : 0.000002s : 13: predicate.transpose_eliminate 1.47% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.49% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.35% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.42% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.14% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.65% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.42% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.15% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.49% : 0.000001s : 4: predicate.value_based_eliminate 0.60% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.92% : 0.000002s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000546 11 57.75% : 0.000316s : 5: func_graph_cloner_run.FuncGraphClonerGraph 42.25% : 0.000231s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027796 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.54% : 0.003209s : 1: add_attr 11.49% : 0.003195s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.20% : 0.000057s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.28% : 0.000076s : 1: auto_monad 0.10% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.78% : 0.000494s : 1: bootstrap 0.13% : 0.000037s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000015s : 1: convert_after_rewriter 0.12% : 0.000033s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.08% : 0.000021s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000031s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000014s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000008s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.83% : 0.000508s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.36% : 0.000655s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000021s : 1: opt.transform.mutable_eliminate 4.29% : 0.001193s : 78: opt.transform.opt_a 0.33% : 0.000091s : 1: opt.transform.opt_after_cconv 0.10% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.43% : 0.000120s : 28: opt.transform.opt_b 0.19% : 0.000053s : 2: opt.transform.opt_trans_graph 0.15% : 0.000041s : 4: opt.transform.symbol_engine_opt 10.88% : 0.003025s : 1: opt_a 0.77% : 0.000214s : 1: opt_after_cconv 2.07% : 0.000576s : 1: opt_after_jit_grad 1.11% : 0.000310s : 1: opt_b 21.70% : 0.006032s : 1: optimize 0.10% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000029s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000038s : 1: pre_auto_parallel 0.11% : 0.000031s : 1: py_interpret_to_execute 0.09% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 0.94% : 0.000261s : 1: renormalize.infer 0.95% : 0.000264s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000054s : 1: rewriter_after_opt_a 0.32% : 0.000089s : 1: rewriter_before_opt_a 0.03% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000111s : 1: symbol_engine_optimizer 0.36% : 0.000101s : 1: tuple_transform 22.61% : 0.006284s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:38.366.791 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0163586, [21] [bootstrap]: 0.00046498 [type_inference]: 0.00671929 [event_method]: 2.092e-05 [auto_monad]: 6.756e-05 [graph_reusing]: 7.03998e-06 [inline]: 3.09001e-06 [add_attr]: 0.00325595, [1] [add_attr_with_inline]: 0.00324546, [1] [Cycle 1]: 6.1e-05, [2] [tag_attr]: 2.027e-05 [meta_addattr_fg_expand]: 5.61e-06 [parallel-infer-symbol]: 3.29001e-06 [pre_auto_parallel]: 3.492e-05 [insert-virtual-dataset]: 2.51e-06 [parallel-infer-symbol-second]: 8.50006e-07 [dataset_repeat_opt]: 2.28002e-06 [pipeline_split]: 1.99e-06 [optimize]: 0.00506958, [53] [py_interpret_to_execute]: 3.918e-05 [rewriter_before_opt_a]: 8.439e-05 [opt_a]: 0.0028222, [2] [Cycle 1]: 0.00211475, [45] [expand_dump_flag]: 3.28e-06 [switch_simplify]: 4.31e-05 [loop_unroll]: 3.073e-05 [a_1]: 0.00062936 [with_stream_mark]: 1.787e-05 [recompute_prepare]: 1.358e-05 [updatestate_depend_eliminate]: 4.23001e-06 [updatestate_assign_eliminate]: 2.96999e-06 [updatestate_loads_eliminate]: 3.18e-06 [parameter_eliminate]: 1.96e-06 [a_2]: 8.13e-05 [accelerated_algorithm]: 7.63001e-06 [shard]: 1.84998e-06 [meta_shard_fg_expand]: 2.06e-06 [shard_inline]: 7.03e-06 [merge_send_recv]: 9.15999e-06 [auto_parallel]: 7.65e-06 [parallel]: 1.921e-05 [flash_sp]: 8.60999e-06 [merge_comm]: 4.04997e-06 [allreduce_fusion]: 3.52997e-06 [matmul_add_comm_reduction]: 9.67999e-06 [allreduce_slice_to_reducescatter]: 6.79982e-07 [virtual_shard_identity]: 8.87e-06 [virtual_dataset]: 6.49999e-06 [get_grad_eliminate_]: 6.12001e-06 [virtual_output]: 6.51e-06 [merge_forward]: 4.17e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 1.043e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.4e-05 [merge_recompute_call_nodes]: 1.59e-06 [before_grad]: 1.068e-05 [set_forward_comm_id_for_comm_node_pass]: 3.64002e-06 [meta_fg_expand]: 2.96001e-06 [flash_sp_send_recv_attached]: 3.11001e-06 [receive_attached]: 2.21998e-06 [after_resolve]: 1.425e-05 [a_after_grad]: 1.056e-05 [renormalize]: 0.00072701 [add_forward_monad_depend]: 6.59001e-06 [auto_monad_grad]: 2.28002e-06 [auto_monad_eliminator]: 1.909e-05 [cse]: 3.224e-05 [a_3]: 5.016e-05 [Cycle 2]: 0.00069653, [45] [expand_dump_flag]: 1.44e-06 [switch_simplify]: 8.37998e-06 [loop_unroll]: 6.02001e-06 [a_1]: 0.00013031 [with_stream_mark]: 1.437e-05 [recompute_prepare]: 6.95998e-06 [updatestate_depend_eliminate]: 3.53999e-06 [updatestate_assign_eliminate]: 3.52997e-06 [updatestate_loads_eliminate]: 2.85002e-06 [parameter_eliminate]: 1.49998e-06 [a_2]: 7.172e-05 [accelerated_algorithm]: 7.45998e-06 [shard]: 2.22001e-06 [meta_shard_fg_expand]: 1.97001e-06 [shard_inline]: 6.47001e-06 [merge_send_recv]: 6.86001e-06 [auto_parallel]: 6.98e-06 [parallel]: 6.51e-06 [flash_sp]: 3.45998e-06 [merge_comm]: 4.72e-06 [allreduce_fusion]: 3.27002e-06 [matmul_add_comm_reduction]: 6.49999e-06 [allreduce_slice_to_reducescatter]: 5.50004e-07 [virtual_shard_identity]: 7.55e-06 [virtual_dataset]: 5.89e-06 [get_grad_eliminate_]: 5.57999e-06 [virtual_output]: 6.32001e-06 [merge_forward]: 3.65e-06 [cell_reuse_recompute_pass]: 1.76e-06 [offload_activation]: 8.58001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.665e-05 [merge_recompute_call_nodes]: 1.25999e-06 [before_grad]: 1.005e-05 [set_forward_comm_id_for_comm_node_pass]: 4.02e-06 [meta_fg_expand]: 2.73e-06 [flash_sp_send_recv_attached]: 1.49e-06 [receive_attached]: 1.40999e-06 [after_resolve]: 1.322e-05 [a_after_grad]: 9.71998e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.69999e-06 [auto_monad_grad]: 1.64e-06 [auto_monad_eliminator]: 1.098e-05 [cse]: 1.847e-05 [a_3]: 3.678e-05 [py_interpret_to_execute_after_opt_a]: 1.379e-05 [slice_cell_reuse_recomputed_activation]: 2.66e-06 [rewriter_after_opt_a]: 4.059e-05 [convert_after_rewriter]: 7.2e-06 [order_py_execute_after_rewriter]: 5.18002e-06 [mutable_eliminate]: 0.00057293 [opt_b]: 0.00023713, [1] [Cycle 1]: 0.00022932, [7] [b_1]: 0.0001417 [b_2]: 9.13002e-06 [updatestate_depend_eliminate]: 8.34998e-06 [updatestate_assign_eliminate]: 2.53e-06 [updatestate_loads_eliminate]: 3.26999e-06 [renormalize]: 1.07e-06 [cse]: 2.435e-05 [optimize_parallel_all_gather_comm]: 1.888e-05 [overlap_param_gather]: 1.92001e-06 [cconv]: 3.187e-05 [loop_unroll]: 0.0004824 [opt_after_cconv]: 0.00010603, [1] [Cycle 1]: 0.00010034, [7] [c_1]: 3.137e-05 [parameter_eliminate]: 4.31002e-06 [updatestate_depend_eliminate]: 6.71999e-06 [updatestate_assign_eliminate]: 2.44001e-06 [updatestate_loads_eliminate]: 2.49001e-06 [cse]: 1.918e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.33e-05 [tuple_transform]: 7.7e-05, [1] [Cycle 1]: 7.232e-05, [4] [d_1]: 4.524e-05 [none_parameter_eliminate]: 1.86e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 6.61e-06 [partial_unused_args_eliminate]: 2.50002e-06 [add_recomputation]: 5.187e-05 [cse_after_recomputation]: 2.351e-05, [1] [Cycle 1]: 1.882e-05, [1] [cse]: 1.275e-05 [environ_conv]: 5.62001e-06 [swap_dp_allreduce_reducescatter]: 5.71e-06 [bias_add_comm_swap]: 2.53e-06 [label_micro_interleaved_index]: 4.79998e-06 [label_fine_grained_interleaved_index]: 2.93e-06 [merge_cast_opt]: 1.49e-06 [slice_recompute_activation]: 2.20002e-06 [micro_interleaved_order_control]: 2.31998e-06 [assign_add_opt]: 1.28002e-06 [ForceFp32Comm]: 8.50006e-07 [remove_cast_before_assign_add]: 1.08001e-06 [full_micro_interleaved_order_control]: 2.25002e-06 [reorder_send_recv_between_fp_bp]: 2.58998e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 9.60019e-07 [interleave_split_concat_branches]: 1.05999e-06 [interleave_parallel_branches]: 1.13001e-06 [overlap_opt_shard_in_pipeline]: 1.34e-06 [overlap_opt_shard_grad_in_pipeline]: 1.86003e-06 [control_data_broadcast_order]: 1.334e-05 [grouped_pairwise_exchange_alltoall]: 1.77999e-06 [offloading_packed_experts]: 3.93001e-06 [overlap_recompute_and_grad_model_parallel]: 5.19e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.32e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.17001e-06 [overlap_grad_ring_attention]: 4.37e-06 [overlap_grad_flash_sp]: 2.087e-05 [begin_end_overlap_inline]: 8.79983e-07 [split_matmul_comm_elemetwise]: 2.21003e-06 [split_layernorm_comm]: 1.57001e-06 [handle_group_info]: 1.02e-06 [symbol_engine_optimizer]: 7.631e-05, [1] [Cycle 1]: 7.156e-05, [6] [build]: 3.52002e-06 [elim_shapecalc]: 9.99001e-06 [elim_not_effective]: 1.29e-05 [opt_reshape]: 6.86001e-06 [fold_const_symbol]: 9.82001e-06 [renormalize]: 2.00002e-07 [detach_backward]: 2.51e-06 [pipeline_parallel_scheduler]: 1.50999e-06 [auto_monad_reorder]: 1.67e-05 [get_jit_bprop_graph]: 1.96003e-06 [rewriter_after_jit_bprop_graph]: 5.66003e-06 [opt_after_jit_grad]: 0.0004726 [validate]: 3.742e-05 Sums bootstrap : 0.000465s : 3.85% type_inference : 0.006719s : 55.68% event_method : 0.000021s : 0.17% auto_monad : 0.000068s : 0.56% graph_reusing : 0.000007s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000035s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000039s : 0.32% optimize.rewriter_before_opt_a : 0.000084s : 0.70% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000051s : 0.43% optimize.opt_a.loop_unroll : 0.000037s : 0.30% optimize.opt_a.a_1 : 0.000760s : 6.30% optimize.opt_a.with_stream_mark : 0.000032s : 0.27% optimize.opt_a.recompute_prepare : 0.000021s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000153s : 1.27% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.11% optimize.opt_a.merge_send_recv : 0.000016s : 0.13% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000026s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.14% optimize.opt_a.virtual_dataset : 0.000012s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.10% optimize.opt_a.virtual_output : 0.000013s : 0.11% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.17% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.23% optimize.opt_a.a_after_grad : 0.000020s : 0.17% optimize.opt_a.renormalize : 0.000727s : 6.03% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.25% optimize.opt_a.cse : 0.000051s : 0.42% optimize.opt_a.a_3 : 0.000087s : 0.72% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.02% optimize.rewriter_after_opt_a : 0.000041s : 0.34% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.04% optimize.mutable_eliminate : 0.000573s : 4.75% optimize.opt_b.b_1 : 0.000142s : 1.17% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000024s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000032s : 0.26% optimize.loop_unroll : 0.000482s : 4.00% optimize.opt_after_cconv.c_1 : 0.000031s : 0.26% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.16% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.11% optimize.tuple_transform.d_1 : 0.000045s : 0.37% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.05% optimize.partial_unused_args_eliminate : 0.000003s : 0.02% optimize.add_recomputation : 0.000052s : 0.43% optimize.cse_after_recomputation.cse : 0.000013s : 0.11% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000017s : 0.14% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000473s : 3.92% validate : 0.000037s : 0.31% Time group info: ------[substitution.] 0.000195 29 0.89% : 0.000002s : 2: substitution.elim_not_effective 0.64% : 0.000001s : 2: substitution.fold_const_symbol 3.42% : 0.000007s : 4: substitution.graph_param_transform 74.53% : 0.000145s : 4: substitution.inline 2.20% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.65% : 0.000005s : 4: substitution.remove_not_recompute_node 2.94% : 0.000006s : 4: substitution.replace_old_param 8.08% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator 4.65% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006645 2 87.58% : 0.005820s : 1: type_inference.infer 12.42% : 0.000825s : 1: type_inference.specialize ------[replace.] 0.000062 8 62.50% : 0.000039s : 4: replace.inline 37.50% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000156 8 91.04% : 0.000142s : 4: match.inline 8.96% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000204 1278 0.91% : 0.000002s : 13: predicate.accumulaten_eliminater 0.76% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 8: predicate.addn_check_dump 0.88% : 0.000002s : 13: predicate.addn_zero_filter 0.83% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.21% : 0.000005s : 21: predicate.arithmetic_simplify 0.93% : 0.000002s : 13: predicate.cast_eliminate 0.55% : 0.000001s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.55% : 0.000001s : 8: predicate.depend_value_elim 0.93% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.11% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.93% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 4: predicate.elim_not_effective 0.32% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_depend_swap 1.62% : 0.000003s : 25: predicate.environ_get_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.45% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.44% : 0.000005s : 21: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.76% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.67% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.58% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.22% : 0.000013s : 58: predicate.inline 0.84% : 0.000002s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.00% : 0.000002s : 8: predicate.less_batch_normalization 1.76% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.51% : 0.000005s : 38: predicate.load_eliminater 0.98% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.38% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.53% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 8: predicate.merge_addn 0.61% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 13: predicate.minmaximum_grad 1.26% : 0.000003s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.49% : 0.000001s : 4: predicate.parallel_virtual_node 1.86% : 0.000004s : 21: predicate.partial_defer_inline 1.70% : 0.000003s : 21: predicate.partial_eliminate 0.92% : 0.000002s : 13: predicate.print_const_string_wrapper 0.53% : 0.000001s : 8: predicate.reduce_all_const_elim 1.15% : 0.000002s : 13: predicate.reduce_eliminate 2.52% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.57% : 0.000001s : 8: predicate.remove_not_recompute_node 1.42% : 0.000003s : 25: predicate.replace_applicator 0.41% : 0.000001s : 8: predicate.replace_old_param 0.38% : 0.000001s : 4: predicate.reset_defer_inline 0.90% : 0.000002s : 13: predicate.reshape_eliminate 0.64% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 4: predicate.row_tensor_eliminate 0.71% : 0.000001s : 8: predicate.same_eliminate 0.54% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.86% : 0.000002s : 8: predicate.shard_identity_eliminate 0.67% : 0.000001s : 8: predicate.special_op_eliminate 0.72% : 0.000001s : 8: predicate.specialize_transform 0.89% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.54% : 0.000003s : 21: predicate.switch_defer_inline 2.10% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.12% : 0.000010s : 67: predicate.switch_simplify 0.92% : 0.000002s : 13: predicate.tile_eliminate 1.14% : 0.000002s : 13: predicate.transpose_eliminate 1.44% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.44% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.85% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.44% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.07% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.61% : 0.000001s : 4: predicate.value_based_eliminate 0.63% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000639 11 55.54% : 0.000355s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.46% : 0.000284s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026736 192 0.01% : 0.000003s : 1: ForceFp32Comm 12.20% : 0.003261s : 1: add_attr 12.15% : 0.003250s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.21% : 0.000056s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000073s : 1: auto_monad 0.08% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.88% : 0.000502s : 1: bootstrap 0.13% : 0.000035s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000017s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.10% : 0.000026s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.11% : 0.000028s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.84% : 0.000491s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.18% : 0.000584s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 4.46% : 0.001194s : 78: opt.transform.opt_a 0.11% : 0.000030s : 1: opt.transform.opt_after_cconv 0.10% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.44% : 0.000118s : 28: opt.transform.opt_b 0.19% : 0.000050s : 2: opt.transform.opt_trans_graph 0.13% : 0.000036s : 4: opt.transform.symbol_engine_opt 10.57% : 0.002826s : 1: opt_a 0.41% : 0.000109s : 1: opt_after_cconv 1.80% : 0.000482s : 1: opt_after_jit_grad 0.90% : 0.000242s : 1: opt_b 18.98% : 0.005075s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000024s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000039s : 1: pre_auto_parallel 0.16% : 0.000044s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000017s : 1: remove_dup_value 1.47% : 0.000394s : 1: renormalize.infer 1.20% : 0.000322s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000046s : 1: rewriter_after_opt_a 0.33% : 0.000089s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000006s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.30% : 0.000079s : 1: symbol_engine_optimizer 0.30% : 0.000080s : 1: tuple_transform 25.21% : 0.006741s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:38.643.895 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:38.644.159 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0190593, [21] [bootstrap]: 0.0004774 [type_inference]: 0.00716261 [event_method]: 1.994e-05 [auto_monad]: 6.953e-05 [graph_reusing]: 5.91e-06 [inline]: 2.67001e-06 [add_attr]: 0.00371927, [1] [add_attr_with_inline]: 0.00369386, [1] [Cycle 1]: 9.156e-05, [2] [tag_attr]: 2.423e-05 [meta_addattr_fg_expand]: 6.69999e-06 [parallel-infer-symbol]: 3.69002e-06 [pre_auto_parallel]: 4.432e-05 [insert-virtual-dataset]: 2.54999e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 1.91e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00625122, [53] [py_interpret_to_execute]: 4.715e-05 [rewriter_before_opt_a]: 0.0001004 [opt_a]: 0.00363569, [2] [Cycle 1]: 0.00265065, [45] [expand_dump_flag]: 3.21999e-06 [switch_simplify]: 4.358e-05 [loop_unroll]: 3.075e-05 [a_1]: 0.00078437 [with_stream_mark]: 2.278e-05 [recompute_prepare]: 1.301e-05 [updatestate_depend_eliminate]: 5.24998e-06 [updatestate_assign_eliminate]: 4.47e-06 [updatestate_loads_eliminate]: 3.57997e-06 [parameter_eliminate]: 2.06998e-06 [a_2]: 0.00013106 [accelerated_algorithm]: 8.32e-06 [shard]: 2.28998e-06 [meta_shard_fg_expand]: 2.01e-06 [shard_inline]: 8.93002e-06 [merge_send_recv]: 1.012e-05 [auto_parallel]: 8.1e-06 [parallel]: 1.853e-05 [flash_sp]: 1.003e-05 [merge_comm]: 4.84998e-06 [allreduce_fusion]: 4.07003e-06 [matmul_add_comm_reduction]: 1.066e-05 [allreduce_slice_to_reducescatter]: 8.79983e-07 [virtual_shard_identity]: 1.13e-05 [virtual_dataset]: 8.57998e-06 [get_grad_eliminate_]: 8.05999e-06 [virtual_output]: 7.83001e-06 [merge_forward]: 5.60001e-06 [cell_reuse_recompute_pass]: 1.50001e-06 [offload_activation]: 1.192e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.126e-05 [merge_recompute_call_nodes]: 1.63002e-06 [before_grad]: 1.369e-05 [set_forward_comm_id_for_comm_node_pass]: 5.17999e-06 [meta_fg_expand]: 3.98001e-06 [flash_sp_send_recv_attached]: 3.09001e-06 [receive_attached]: 2.76999e-06 [after_resolve]: 1.489e-05 [a_after_grad]: 1.22e-05 [renormalize]: 0.00083198 [add_forward_monad_depend]: 6.69999e-06 [auto_monad_grad]: 2.85002e-06 [auto_monad_eliminator]: 2.006e-05 [cse]: 3.644e-05 [a_3]: 7.365e-05 [Cycle 2]: 0.00096992, [45] [expand_dump_flag]: 2.86e-06 [switch_simplify]: 9.17001e-06 [loop_unroll]: 7.83999e-06 [a_1]: 0.00017536 [with_stream_mark]: 1.672e-05 [recompute_prepare]: 9.04998e-06 [updatestate_depend_eliminate]: 4.22998e-06 [updatestate_assign_eliminate]: 3.27002e-06 [updatestate_loads_eliminate]: 3.18e-06 [parameter_eliminate]: 1.30001e-06 [a_2]: 0.00011922 [accelerated_algorithm]: 8.03001e-06 [shard]: 1.32e-06 [meta_shard_fg_expand]: 1.61998e-06 [shard_inline]: 8.17998e-06 [merge_send_recv]: 7.3e-06 [auto_parallel]: 7.50998e-06 [parallel]: 6.37001e-06 [flash_sp]: 3.2e-06 [merge_comm]: 4.17e-06 [allreduce_fusion]: 4.02998e-06 [matmul_add_comm_reduction]: 8.03001e-06 [allreduce_slice_to_reducescatter]: 3.19997e-07 [virtual_shard_identity]: 9.49e-06 [virtual_dataset]: 7.65998e-06 [get_grad_eliminate_]: 7.13e-06 [virtual_output]: 7.28999e-06 [merge_forward]: 4.23001e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 1.018e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.875e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 1.233e-05 [set_forward_comm_id_for_comm_node_pass]: 4.77e-06 [meta_fg_expand]: 2.76e-06 [flash_sp_send_recv_attached]: 1.12999e-06 [receive_attached]: 1.24e-06 [after_resolve]: 1.306e-05 [a_after_grad]: 1.086e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.79001e-06 [auto_monad_grad]: 1.34e-06 [auto_monad_eliminator]: 1.247e-05 [cse]: 2.121e-05 [a_3]: 5.916e-05 [py_interpret_to_execute_after_opt_a]: 1.715e-05 [slice_cell_reuse_recomputed_activation]: 4.60001e-06 [rewriter_after_opt_a]: 6.16e-05 [convert_after_rewriter]: 1.24e-05 [order_py_execute_after_rewriter]: 8.95999e-06 [mutable_eliminate]: 0.00065272 [opt_b]: 0.00032834, [1] [Cycle 1]: 0.00031737, [7] [b_1]: 0.00021021 [b_2]: 9.58002e-06 [updatestate_depend_eliminate]: 7.78001e-06 [updatestate_assign_eliminate]: 3.14001e-06 [updatestate_loads_eliminate]: 3.31001e-06 [renormalize]: 5.89993e-07 [cse]: 2.56e-05 [optimize_parallel_all_gather_comm]: 2.263e-05 [overlap_param_gather]: 4.43001e-06 [cconv]: 3.403e-05 [loop_unroll]: 0.00045497 [opt_after_cconv]: 0.00014046, [1] [Cycle 1]: 0.00013165, [7] [c_1]: 3.784e-05 [parameter_eliminate]: 3.92002e-06 [updatestate_depend_eliminate]: 6.51e-06 [updatestate_assign_eliminate]: 3.02002e-06 [updatestate_loads_eliminate]: 2.86e-06 [cse]: 2.178e-05 [renormalize]: 5.09986e-07 [remove_dup_value]: 1.961e-05 [tuple_transform]: 0.00010113, [1] [Cycle 1]: 9.4e-05, [4] [d_1]: 5.135e-05 [none_parameter_eliminate]: 1.91e-06 [renormalize]: 2.40019e-07 [switch_simplify]: 8.74e-06 [partial_unused_args_eliminate]: 4.90999e-06 [add_recomputation]: 5.958e-05 [cse_after_recomputation]: 3.239e-05, [1] [Cycle 1]: 2.563e-05, [1] [cse]: 1.614e-05 [environ_conv]: 9.17001e-06 [swap_dp_allreduce_reducescatter]: 9.00001e-06 [bias_add_comm_swap]: 5.39e-06 [label_micro_interleaved_index]: 7.27002e-06 [label_fine_grained_interleaved_index]: 5.02999e-06 [merge_cast_opt]: 3.63e-06 [slice_recompute_activation]: 4.57998e-06 [micro_interleaved_order_control]: 4.33001e-06 [assign_add_opt]: 3.63e-06 [ForceFp32Comm]: 3.14001e-06 [remove_cast_before_assign_add]: 3.68e-06 [full_micro_interleaved_order_control]: 4.63999e-06 [reorder_send_recv_between_fp_bp]: 5.68997e-06 [comm_op_add_attrs]: 3.36999e-06 [add_comm_op_reuse_tag]: 3.31001e-06 [interleave_split_concat_branches]: 3.63999e-06 [interleave_parallel_branches]: 3.37002e-06 [overlap_opt_shard_in_pipeline]: 3.43e-06 [overlap_opt_shard_grad_in_pipeline]: 4.02e-06 [control_data_broadcast_order]: 1.821e-05 [grouped_pairwise_exchange_alltoall]: 3.86001e-06 [offloading_packed_experts]: 6.95002e-06 [overlap_recompute_and_grad_model_parallel]: 8.13001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.61001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.77002e-06 [overlap_recompute_comm]: 4.65999e-06 [overlap_grad_ring_attention]: 7.58001e-06 [overlap_grad_flash_sp]: 2.71e-05 [begin_end_overlap_inline]: 3.12002e-06 [split_matmul_comm_elemetwise]: 4.82e-06 [split_layernorm_comm]: 4.25999e-06 [handle_group_info]: 3.63e-06 [symbol_engine_optimizer]: 0.00010584, [1] [Cycle 1]: 9.87e-05, [6] [build]: 2.89001e-06 [elim_shapecalc]: 1.202e-05 [elim_not_effective]: 1.499e-05 [opt_reshape]: 8.62e-06 [fold_const_symbol]: 1.322e-05 [renormalize]: 2.80008e-07 [detach_backward]: 3.6e-06 [pipeline_parallel_scheduler]: 1.94e-06 [auto_monad_reorder]: 2.274e-05 [get_jit_bprop_graph]: 1.76e-06 [rewriter_after_jit_bprop_graph]: 5.04003e-06 [opt_after_jit_grad]: 0.0005056 [validate]: 4.631e-05 Sums bootstrap : 0.000477s : 3.55% type_inference : 0.007163s : 53.32% event_method : 0.000020s : 0.15% auto_monad : 0.000070s : 0.52% graph_reusing : 0.000006s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000044s : 0.33% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000047s : 0.35% optimize.rewriter_before_opt_a : 0.000100s : 0.75% optimize.opt_a.expand_dump_flag : 0.000006s : 0.05% optimize.opt_a.switch_simplify : 0.000053s : 0.39% optimize.opt_a.loop_unroll : 0.000039s : 0.29% optimize.opt_a.a_1 : 0.000960s : 7.14% optimize.opt_a.with_stream_mark : 0.000040s : 0.29% optimize.opt_a.recompute_prepare : 0.000022s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000250s : 1.86% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.12% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000017s : 0.13% optimize.opt_a.merge_send_recv : 0.000017s : 0.13% optimize.opt_a.auto_parallel : 0.000016s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.19% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.07% optimize.opt_a.allreduce_fusion : 0.000008s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.15% optimize.opt_a.virtual_dataset : 0.000016s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.11% optimize.opt_a.virtual_output : 0.000015s : 0.11% optimize.opt_a.merge_forward : 0.000010s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000022s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000040s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.07% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000028s : 0.21% optimize.opt_a.a_after_grad : 0.000023s : 0.17% optimize.opt_a.renormalize : 0.000832s : 6.19% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.24% optimize.opt_a.cse : 0.000058s : 0.43% optimize.opt_a.a_3 : 0.000133s : 0.99% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.03% optimize.rewriter_after_opt_a : 0.000062s : 0.46% optimize.convert_after_rewriter : 0.000012s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000653s : 4.86% optimize.opt_b.b_1 : 0.000210s : 1.56% optimize.opt_b.b_2 : 0.000010s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000026s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.17% optimize.overlap_param_gather : 0.000004s : 0.03% optimize.cconv : 0.000034s : 0.25% optimize.loop_unroll : 0.000455s : 3.39% optimize.opt_after_cconv.c_1 : 0.000038s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000022s : 0.16% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.15% optimize.tuple_transform.d_1 : 0.000051s : 0.38% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000060s : 0.44% optimize.cse_after_recomputation.cse : 0.000016s : 0.12% optimize.environ_conv : 0.000009s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.03% optimize.micro_interleaved_order_control : 0.000004s : 0.03% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000018s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.03% optimize.overlap_grad_ring_attention : 0.000008s : 0.06% optimize.overlap_grad_flash_sp : 0.000027s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000023s : 0.17% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000506s : 3.76% validate : 0.000046s : 0.34% Time group info: ------[substitution.] 0.000245 39 9.96% : 0.000024s : 3: substitution.cast_eliminate 0.92% : 0.000002s : 3: substitution.elim_not_effective 0.80% : 0.000002s : 3: substitution.fold_const_symbol 2.53% : 0.000006s : 5: substitution.graph_param_transform 69.30% : 0.000170s : 4: substitution.inline 2.11% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.66% : 0.000007s : 6: substitution.remove_not_recompute_node 2.14% : 0.000005s : 4: substitution.replace_old_param 6.32% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator 3.26% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.007099 2 88.89% : 0.006311s : 1: type_inference.infer 11.11% : 0.000789s : 1: type_inference.specialize ------[replace.] 0.000071 8 59.82% : 0.000043s : 4: replace.inline 40.18% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000181 8 92.58% : 0.000167s : 4: match.inline 7.42% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000255 1596 0.91% : 0.000002s : 17: predicate.accumulaten_eliminater 0.75% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 10: predicate.addn_check_dump 0.90% : 0.000002s : 17: predicate.addn_zero_filter 0.85% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.99% : 0.000005s : 27: predicate.arithmetic_simplify 1.00% : 0.000003s : 17: predicate.cast_eliminate 0.61% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.64% : 0.000002s : 10: predicate.depend_value_elim 0.98% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.08% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.89% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.37% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.15% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_depend_swap 1.71% : 0.000004s : 32: predicate.environ_get_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.39% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.29% : 0.000006s : 25: predicate.float_depend_g_call 0.57% : 0.000001s : 10: predicate.float_environ_get_switch 0.85% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.75% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.64% : 0.000002s : 10: predicate.incorporate_call 0.51% : 0.000001s : 10: predicate.incorporate_call_switch 6.26% : 0.000016s : 72: predicate.inline 0.74% : 0.000002s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.75% : 0.000002s : 10: predicate.less_batch_normalization 1.93% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.60% : 0.000007s : 48: predicate.load_eliminater 0.82% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.99% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.64% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 10: predicate.merge_addn 0.56% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.60% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 17: predicate.minmaximum_grad 1.01% : 0.000003s : 5: predicate.mutable_eliminate 0.45% : 0.000001s : 5: predicate.opt_reshape 0.54% : 0.000001s : 5: predicate.parallel_virtual_node 1.74% : 0.000004s : 25: predicate.partial_defer_inline 1.69% : 0.000004s : 26: predicate.partial_eliminate 0.98% : 0.000003s : 17: predicate.print_const_string_wrapper 0.71% : 0.000002s : 10: predicate.reduce_all_const_elim 1.17% : 0.000003s : 17: predicate.reduce_eliminate 2.54% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 10: predicate.remove_not_recompute_node 1.34% : 0.000003s : 31: predicate.replace_applicator 0.50% : 0.000001s : 10: predicate.replace_old_param 0.32% : 0.000001s : 5: predicate.reset_defer_inline 0.97% : 0.000002s : 17: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 5: predicate.row_tensor_eliminate 0.82% : 0.000002s : 10: predicate.same_eliminate 0.48% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.93% : 0.000002s : 10: predicate.shard_identity_eliminate 0.73% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 1.00% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.48% : 0.000004s : 25: predicate.switch_defer_inline 2.01% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.66% : 0.000012s : 76: predicate.switch_simplify 0.96% : 0.000002s : 17: predicate.tile_eliminate 0.94% : 0.000002s : 17: predicate.transpose_eliminate 1.54% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.38% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.45% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.20% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.95% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.51% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.21% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.53% : 0.000001s : 5: predicate.value_based_eliminate 0.65% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.62% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000628 11 57.66% : 0.000362s : 5: func_graph_cloner_run.FuncGraphClonerGraph 42.34% : 0.000266s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.031544 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.83% : 0.003733s : 1: add_attr 11.72% : 0.003698s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.20% : 0.000063s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000080s : 1: auto_monad 0.10% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.67% : 0.000527s : 1: bootstrap 0.12% : 0.000037s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000016s : 1: convert_after_rewriter 0.11% : 0.000036s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000020s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.10% : 0.000030s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.46% : 0.000461s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 2.09% : 0.000659s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.05% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000017s : 1: opt.transform.mutable_eliminate 4.74% : 0.001494s : 78: opt.transform.opt_a 0.11% : 0.000036s : 1: opt.transform.opt_after_cconv 0.09% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.47% : 0.000147s : 28: opt.transform.opt_b 0.18% : 0.000058s : 2: opt.transform.opt_trans_graph 0.14% : 0.000045s : 4: opt.transform.symbol_engine_opt 11.54% : 0.003639s : 1: opt_a 0.46% : 0.000144s : 1: opt_after_cconv 1.64% : 0.000516s : 1: opt_after_jit_grad 1.05% : 0.000332s : 1: opt_b 21.22% : 0.006693s : 1: optimize 0.08% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000031s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000007s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.16% : 0.000051s : 1: pre_auto_parallel 0.16% : 0.000052s : 1: py_interpret_to_execute 0.06% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000023s : 1: remove_dup_value 1.48% : 0.000467s : 1: renormalize.infer 1.12% : 0.000354s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.21% : 0.000066s : 1: rewriter_after_opt_a 0.33% : 0.000105s : 1: rewriter_before_opt_a 0.02% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000109s : 1: symbol_engine_optimizer 0.33% : 0.000104s : 1: tuple_transform 22.85% : 0.007208s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:39.466.90 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0164235, [21] [bootstrap]: 0.00040951 [type_inference]: 0.00649336 [event_method]: 2.236e-05 [auto_monad]: 7.056e-05 [graph_reusing]: 5.59e-06 [inline]: 2.66e-06 [add_attr]: 0.00327367, [1] [add_attr_with_inline]: 0.00326246, [1] [Cycle 1]: 6.613e-05, [2] [tag_attr]: 2.102e-05 [meta_addattr_fg_expand]: 6.59999e-06 [parallel-infer-symbol]: 3.73999e-06 [pre_auto_parallel]: 3.867e-05 [insert-virtual-dataset]: 2.67001e-06 [parallel-infer-symbol-second]: 8.70001e-07 [dataset_repeat_opt]: 2.36e-06 [pipeline_split]: 1.59998e-06 [optimize]: 0.00534817, [53] [py_interpret_to_execute]: 2.775e-05 [rewriter_before_opt_a]: 8.866e-05 [opt_a]: 0.00311449, [2] [Cycle 1]: 0.00231723, [45] [expand_dump_flag]: 3.33998e-06 [switch_simplify]: 4.523e-05 [loop_unroll]: 3.206e-05 [a_1]: 0.00075835 [with_stream_mark]: 1.984e-05 [recompute_prepare]: 1.206e-05 [updatestate_depend_eliminate]: 5.22e-06 [updatestate_assign_eliminate]: 3.91999e-06 [updatestate_loads_eliminate]: 3.55e-06 [parameter_eliminate]: 1.95001e-06 [a_2]: 0.00010245 [accelerated_algorithm]: 9.82999e-06 [shard]: 2.05002e-06 [meta_shard_fg_expand]: 1.99999e-06 [shard_inline]: 7.33e-06 [merge_send_recv]: 9.69e-06 [auto_parallel]: 8.47e-06 [parallel]: 1.866e-05 [flash_sp]: 9.24998e-06 [merge_comm]: 4.93001e-06 [allreduce_fusion]: 4.23999e-06 [matmul_add_comm_reduction]: 1.024e-05 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 1.127e-05 [virtual_dataset]: 7.73001e-06 [get_grad_eliminate_]: 7.44002e-06 [virtual_output]: 7.83999e-06 [merge_forward]: 5.24e-06 [cell_reuse_recompute_pass]: 1.09e-06 [offload_activation]: 1.169e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.784e-05 [merge_recompute_call_nodes]: 1.82001e-06 [before_grad]: 1.245e-05 [set_forward_comm_id_for_comm_node_pass]: 4.57e-06 [meta_fg_expand]: 3.5e-06 [flash_sp_send_recv_attached]: 3.38e-06 [receive_attached]: 2.39001e-06 [after_resolve]: 1.48e-05 [a_after_grad]: 1.272e-05 [renormalize]: 0.00073183 [add_forward_monad_depend]: 6.32001e-06 [auto_monad_grad]: 2.12999e-06 [auto_monad_eliminator]: 1.83e-05 [cse]: 3.721e-05 [a_3]: 6.038e-05 [Cycle 2]: 0.00078666, [45] [expand_dump_flag]: 1.84998e-06 [switch_simplify]: 9.42999e-06 [loop_unroll]: 7.36999e-06 [a_1]: 0.00017547 [with_stream_mark]: 1.515e-05 [recompute_prepare]: 8.35001e-06 [updatestate_depend_eliminate]: 3.85998e-06 [updatestate_assign_eliminate]: 2.71999e-06 [updatestate_loads_eliminate]: 3.25e-06 [parameter_eliminate]: 1.16002e-06 [a_2]: 9.21e-05 [accelerated_algorithm]: 8.33999e-06 [shard]: 1.47001e-06 [meta_shard_fg_expand]: 1.62001e-06 [shard_inline]: 8.29998e-06 [merge_send_recv]: 6.71999e-06 [auto_parallel]: 9.72999e-06 [parallel]: 4.79998e-06 [flash_sp]: 3.42997e-06 [merge_comm]: 4.71002e-06 [allreduce_fusion]: 3.83999e-06 [matmul_add_comm_reduction]: 6.69001e-06 [allreduce_slice_to_reducescatter]: 3.60014e-07 [virtual_shard_identity]: 1.067e-05 [virtual_dataset]: 7.29001e-06 [get_grad_eliminate_]: 7.49002e-06 [virtual_output]: 6.80998e-06 [merge_forward]: 3.76999e-06 [cell_reuse_recompute_pass]: 1.35001e-06 [offload_activation]: 1.033e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.599e-05 [merge_recompute_call_nodes]: 8.59989e-07 [before_grad]: 1.156e-05 [set_forward_comm_id_for_comm_node_pass]: 4.65999e-06 [meta_fg_expand]: 2.91999e-06 [flash_sp_send_recv_attached]: 9.89996e-07 [receive_attached]: 1.12999e-06 [after_resolve]: 1.305e-05 [a_after_grad]: 1.14e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.66999e-06 [auto_monad_grad]: 1.45001e-06 [auto_monad_eliminator]: 9.66e-06 [cse]: 1.958e-05 [a_3]: 4.504e-05 [py_interpret_to_execute_after_opt_a]: 1.128e-05 [slice_cell_reuse_recomputed_activation]: 2.44999e-06 [rewriter_after_opt_a]: 4.212e-05 [convert_after_rewriter]: 7.36001e-06 [order_py_execute_after_rewriter]: 5.74999e-06 [mutable_eliminate]: 0.00051338 [opt_b]: 0.00028271, [1] [Cycle 1]: 0.00027504, [7] [b_1]: 0.00018367 [b_2]: 1.042e-05 [updatestate_depend_eliminate]: 7.65998e-06 [updatestate_assign_eliminate]: 2.88e-06 [updatestate_loads_eliminate]: 3.43e-06 [renormalize]: 5.49975e-07 [cse]: 2.62e-05 [optimize_parallel_all_gather_comm]: 2.006e-05 [overlap_param_gather]: 2.06e-06 [cconv]: 2.735e-05 [loop_unroll]: 0.00046548 [opt_after_cconv]: 0.00012247, [1] [Cycle 1]: 0.00011519, [7] [c_1]: 3.73e-05 [parameter_eliminate]: 3.93001e-06 [updatestate_depend_eliminate]: 6.88e-06 [updatestate_assign_eliminate]: 2.94001e-06 [updatestate_loads_eliminate]: 3.18998e-06 [cse]: 2.571e-05 [renormalize]: 6.39993e-07 [remove_dup_value]: 1.639e-05 [tuple_transform]: 8.875e-05, [1] [Cycle 1]: 8.402e-05, [4] [d_1]: 5.445e-05 [none_parameter_eliminate]: 1.79998e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 8.35001e-06 [partial_unused_args_eliminate]: 1.84e-06 [add_recomputation]: 5.973e-05 [cse_after_recomputation]: 2.649e-05, [1] [Cycle 1]: 2.126e-05, [1] [cse]: 1.561e-05 [environ_conv]: 7.05002e-06 [swap_dp_allreduce_reducescatter]: 6.09001e-06 [bias_add_comm_swap]: 2.44999e-06 [label_micro_interleaved_index]: 4.02998e-06 [label_fine_grained_interleaved_index]: 2.69999e-06 [merge_cast_opt]: 1.37999e-06 [slice_recompute_activation]: 2.24001e-06 [micro_interleaved_order_control]: 2.14999e-06 [assign_add_opt]: 1.60999e-06 [ForceFp32Comm]: 8.60018e-07 [remove_cast_before_assign_add]: 1.00001e-06 [full_micro_interleaved_order_control]: 2.17999e-06 [reorder_send_recv_between_fp_bp]: 2.68e-06 [comm_op_add_attrs]: 1.03001e-06 [add_comm_op_reuse_tag]: 9.49978e-07 [interleave_split_concat_branches]: 1.05999e-06 [interleave_parallel_branches]: 1.00999e-06 [overlap_opt_shard_in_pipeline]: 1.70001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.91e-06 [control_data_broadcast_order]: 1.599e-05 [grouped_pairwise_exchange_alltoall]: 1.54998e-06 [offloading_packed_experts]: 4.83001e-06 [overlap_recompute_and_grad_model_parallel]: 5.09e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.46002e-06 [overlap_recompute_comm]: 2.03997e-06 [overlap_grad_ring_attention]: 4.94e-06 [overlap_grad_flash_sp]: 2.198e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.20002e-06 [split_layernorm_comm]: 1.92001e-06 [handle_group_info]: 9.89996e-07 [symbol_engine_optimizer]: 8.976e-05, [1] [Cycle 1]: 8.463e-05, [6] [build]: 3.45e-06 [elim_shapecalc]: 1.37e-05 [elim_not_effective]: 1.609e-05 [opt_reshape]: 7.93001e-06 [fold_const_symbol]: 1.268e-05 [renormalize]: 2.3999e-07 [detach_backward]: 1.96998e-06 [pipeline_parallel_scheduler]: 1.43002e-06 [auto_monad_reorder]: 2.156e-05 [get_jit_bprop_graph]: 1.72001e-06 [rewriter_after_jit_bprop_graph]: 4.18999e-06 [opt_after_jit_grad]: 0.00049875 [validate]: 4.907e-05 Sums bootstrap : 0.000410s : 3.37% type_inference : 0.006493s : 53.50% event_method : 0.000022s : 0.18% auto_monad : 0.000071s : 0.58% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000039s : 0.32% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.23% optimize.rewriter_before_opt_a : 0.000089s : 0.73% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.45% optimize.opt_a.loop_unroll : 0.000039s : 0.32% optimize.opt_a.a_1 : 0.000934s : 7.69% optimize.opt_a.with_stream_mark : 0.000035s : 0.29% optimize.opt_a.recompute_prepare : 0.000020s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000195s : 1.60% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.15% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.13% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000018s : 0.15% optimize.opt_a.parallel : 0.000023s : 0.19% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.18% optimize.opt_a.virtual_dataset : 0.000015s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.12% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.02% optimize.opt_a.offload_activation : 0.000022s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000028s : 0.23% optimize.opt_a.a_after_grad : 0.000024s : 0.20% optimize.opt_a.renormalize : 0.000732s : 6.03% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.23% optimize.opt_a.cse : 0.000057s : 0.47% optimize.opt_a.a_3 : 0.000105s : 0.87% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000042s : 0.35% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000513s : 4.23% optimize.opt_b.b_1 : 0.000184s : 1.51% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000026s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000027s : 0.23% optimize.loop_unroll : 0.000465s : 3.84% optimize.opt_after_cconv.c_1 : 0.000037s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000026s : 0.21% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000016s : 0.14% optimize.tuple_transform.d_1 : 0.000054s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000060s : 0.49% optimize.cse_after_recomputation.cse : 0.000016s : 0.13% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000022s : 0.18% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000022s : 0.18% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000499s : 4.11% validate : 0.000049s : 0.40% Time group info: ------[substitution.] 0.000225 39 10.11% : 0.000023s : 3: substitution.cast_eliminate 0.99% : 0.000002s : 3: substitution.elim_not_effective 0.73% : 0.000002s : 3: substitution.fold_const_symbol 3.26% : 0.000007s : 5: substitution.graph_param_transform 66.88% : 0.000150s : 4: substitution.inline 1.72% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.67% : 0.000006s : 6: substitution.remove_not_recompute_node 2.30% : 0.000005s : 4: substitution.replace_old_param 7.47% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator 3.87% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006420 2 87.38% : 0.005609s : 1: type_inference.infer 12.62% : 0.000810s : 1: type_inference.specialize ------[replace.] 0.000066 8 56.49% : 0.000037s : 4: replace.inline 43.51% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000162 8 91.03% : 0.000147s : 4: match.inline 8.97% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000268 1596 0.88% : 0.000002s : 17: predicate.accumulaten_eliminater 0.98% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 10: predicate.addn_check_dump 0.88% : 0.000002s : 17: predicate.addn_zero_filter 0.81% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.99% : 0.000005s : 27: predicate.arithmetic_simplify 1.00% : 0.000003s : 17: predicate.cast_eliminate 0.56% : 0.000002s : 10: predicate.check_bprop_eliminate 0.56% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.57% : 0.000002s : 10: predicate.depend_value_elim 0.96% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.99% : 0.000003s : 17: predicate.dict_get_item_eliminator 1.00% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.95% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 5: predicate.elim_not_effective 0.31% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.05% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.08% : 0.000003s : 22: predicate.environ_get_depend_swap 1.66% : 0.000004s : 32: predicate.environ_get_eliminate 1.09% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.33% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.00% : 0.000005s : 25: predicate.float_depend_g_call 0.50% : 0.000001s : 10: predicate.float_environ_get_switch 0.79% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.59% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.60% : 0.000002s : 10: predicate.incorporate_call 0.49% : 0.000001s : 10: predicate.incorporate_call_switch 6.24% : 0.000017s : 72: predicate.inline 0.67% : 0.000002s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.90% : 0.000002s : 10: predicate.less_batch_normalization 1.70% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.40% : 0.000006s : 48: predicate.load_eliminater 1.01% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.92% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.60% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 10: predicate.merge_addn 0.55% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 17: predicate.minmaximum_grad 1.08% : 0.000003s : 5: predicate.mutable_eliminate 0.32% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.71% : 0.000005s : 25: predicate.partial_defer_inline 1.62% : 0.000004s : 26: predicate.partial_eliminate 0.86% : 0.000002s : 17: predicate.print_const_string_wrapper 0.58% : 0.000002s : 10: predicate.reduce_all_const_elim 1.23% : 0.000003s : 17: predicate.reduce_eliminate 2.56% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000001s : 10: predicate.remove_not_recompute_node 1.43% : 0.000004s : 31: predicate.replace_applicator 0.42% : 0.000001s : 10: predicate.replace_old_param 0.24% : 0.000001s : 5: predicate.reset_defer_inline 0.94% : 0.000003s : 17: predicate.reshape_eliminate 0.60% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 5: predicate.row_tensor_eliminate 0.67% : 0.000002s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 10: predicate.shard_identity_eliminate 0.71% : 0.000002s : 10: predicate.special_op_eliminate 0.67% : 0.000002s : 10: predicate.specialize_transform 0.81% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.37% : 0.000004s : 25: predicate.switch_defer_inline 1.95% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.42% : 0.000012s : 76: predicate.switch_simplify 0.85% : 0.000002s : 17: predicate.tile_eliminate 0.90% : 0.000002s : 17: predicate.transpose_eliminate 1.45% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.25% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 5.88% : 0.000016s : 27: predicate.tuple_list_get_set_item_eliminator 2.11% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.70% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.45% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.09% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 5: predicate.value_based_eliminate 0.59% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.58% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.34% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000606 11 52.71% : 0.000319s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.29% : 0.000286s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027466 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.94% : 0.003279s : 1: add_attr 11.89% : 0.003266s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000064s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.28% : 0.000076s : 1: auto_monad 0.09% : 0.000026s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.58% : 0.000435s : 1: bootstrap 0.11% : 0.000031s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.10% : 0.000028s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.73% : 0.000475s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.90% : 0.000522s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 5.32% : 0.001462s : 78: opt.transform.opt_a 0.13% : 0.000036s : 1: opt.transform.opt_after_cconv 0.12% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.59% : 0.000162s : 28: opt.transform.opt_b 0.22% : 0.000060s : 2: opt.transform.opt_trans_graph 0.17% : 0.000046s : 4: opt.transform.symbol_engine_opt 11.35% : 0.003118s : 1: opt_a 0.46% : 0.000126s : 1: opt_after_cconv 1.85% : 0.000509s : 1: opt_after_jit_grad 1.04% : 0.000286s : 1: opt_b 19.49% : 0.005353s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.08% : 0.000022s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.16% : 0.000044s : 1: pre_auto_parallel 0.12% : 0.000032s : 1: py_interpret_to_execute 0.05% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.43% : 0.000393s : 1: renormalize.infer 1.20% : 0.000330s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000047s : 1: rewriter_after_opt_a 0.34% : 0.000093s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000092s : 1: symbol_engine_optimizer 0.33% : 0.000092s : 1: tuple_transform 23.71% : 0.006513s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:39.632.030 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:39.632.317 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0386303, [21] [bootstrap]: 0.0004583 [type_inference]: 0.0174985 [event_method]: 2.449e-05 [auto_monad]: 7.308e-05 [graph_reusing]: 6.23e-06 [inline]: 2.54001e-06 [add_attr]: 0.00359455, [1] [add_attr_with_inline]: 0.00358333, [1] [Cycle 1]: 8.219e-05, [2] [tag_attr]: 2.158e-05 [meta_addattr_fg_expand]: 6.14999e-06 [parallel-infer-symbol]: 3.71999e-06 [pre_auto_parallel]: 3.791e-05 [insert-virtual-dataset]: 2.36e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 2.22999e-06 [pipeline_split]: 1.74e-06 [optimize]: 0.0156622, [53] [py_interpret_to_execute]: 3.552e-05 [rewriter_before_opt_a]: 9.311e-05 [opt_a]: 0.012775, [2] [Cycle 1]: 0.0117027, [45] [expand_dump_flag]: 3.87002e-06 [switch_simplify]: 4.287e-05 [loop_unroll]: 3.157e-05 [a_1]: 0.00071745 [with_stream_mark]: 2.034e-05 [recompute_prepare]: 1.294e-05 [updatestate_depend_eliminate]: 4.84998e-06 [updatestate_assign_eliminate]: 3.88999e-06 [updatestate_loads_eliminate]: 3.83999e-06 [parameter_eliminate]: 2.36e-06 [a_2]: 0.00013085 [accelerated_algorithm]: 8.18001e-06 [shard]: 2.33002e-06 [meta_shard_fg_expand]: 2.22999e-06 [shard_inline]: 8.2e-06 [merge_send_recv]: 1.048e-05 [auto_parallel]: 8.70999e-06 [parallel]: 1.932e-05 [flash_sp]: 9.56e-06 [merge_comm]: 4.99e-06 [allreduce_fusion]: 4.20999e-06 [matmul_add_comm_reduction]: 1.05e-05 [allreduce_slice_to_reducescatter]: 6.29982e-07 [virtual_shard_identity]: 1.071e-05 [virtual_dataset]: 8.48999e-06 [get_grad_eliminate_]: 7.87e-06 [virtual_output]: 7.83001e-06 [merge_forward]: 5.19e-06 [cell_reuse_recompute_pass]: 1.20001e-06 [offload_activation]: 1.224e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.985e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 1.405e-05 [set_forward_comm_id_for_comm_node_pass]: 5.27999e-06 [meta_fg_expand]: 3.94002e-06 [flash_sp_send_recv_attached]: 3.16001e-06 [receive_attached]: 2.06e-06 [after_resolve]: 1.369e-05 [a_after_grad]: 1.273e-05 [renormalize]: 0.00990681 [add_forward_monad_depend]: 1.229e-05 [auto_monad_grad]: 3.31001e-06 [auto_monad_eliminator]: 2.707e-05 [cse]: 4.11e-05 [a_3]: 8.822e-05 [Cycle 2]: 0.00105281, [45] [expand_dump_flag]: 2.24999e-06 [switch_simplify]: 1.026e-05 [loop_unroll]: 7.75e-06 [a_1]: 0.00019331 [with_stream_mark]: 2.289e-05 [recompute_prepare]: 1.027e-05 [updatestate_depend_eliminate]: 6.23e-06 [updatestate_assign_eliminate]: 3.6e-06 [updatestate_loads_eliminate]: 3.80998e-06 [parameter_eliminate]: 2.16e-06 [a_2]: 0.0001217 [accelerated_algorithm]: 1.009e-05 [shard]: 2.74999e-06 [meta_shard_fg_expand]: 2.72001e-06 [shard_inline]: 8.10999e-06 [merge_send_recv]: 1.15e-05 [auto_parallel]: 1.032e-05 [parallel]: 8.98002e-06 [flash_sp]: 4.28001e-06 [merge_comm]: 5.62999e-06 [allreduce_fusion]: 4.28001e-06 [matmul_add_comm_reduction]: 1.109e-05 [allreduce_slice_to_reducescatter]: 9.49978e-07 [virtual_shard_identity]: 1.097e-05 [virtual_dataset]: 8.08999e-06 [get_grad_eliminate_]: 7.61999e-06 [virtual_output]: 7.32997e-06 [merge_forward]: 4.92e-06 [cell_reuse_recompute_pass]: 3.33e-06 [offload_activation]: 1.197e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.907e-05 [merge_recompute_call_nodes]: 1.76003e-06 [before_grad]: 1.297e-05 [set_forward_comm_id_for_comm_node_pass]: 5.34e-06 [meta_fg_expand]: 3.36001e-06 [flash_sp_send_recv_attached]: 2.22999e-06 [receive_attached]: 2.52001e-06 [after_resolve]: 1.463e-05 [a_after_grad]: 1.264e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 3.24001e-06 [auto_monad_grad]: 1.45999e-06 [auto_monad_eliminator]: 1.283e-05 [cse]: 2.785e-05 [a_3]: 6.093e-05 [py_interpret_to_execute_after_opt_a]: 2.12e-05 [slice_cell_reuse_recomputed_activation]: 4.92e-06 [rewriter_after_opt_a]: 5.053e-05 [convert_after_rewriter]: 1.101e-05 [order_py_execute_after_rewriter]: 8.27e-06 [mutable_eliminate]: 0.00074318 [opt_b]: 0.00033568, [1] [Cycle 1]: 0.00032453, [7] [b_1]: 0.00021491 [b_2]: 9.61e-06 [updatestate_depend_eliminate]: 9.00001e-06 [updatestate_assign_eliminate]: 3.23e-06 [updatestate_loads_eliminate]: 3.20998e-06 [renormalize]: 8.09989e-07 [cse]: 2.64e-05 [optimize_parallel_all_gather_comm]: 2.294e-05 [overlap_param_gather]: 5.04998e-06 [cconv]: 3.601e-05 [loop_unroll]: 0.00046221 [opt_after_cconv]: 0.00014624, [1] [Cycle 1]: 0.00013688, [7] [c_1]: 3.683e-05 [parameter_eliminate]: 4.14997e-06 [updatestate_depend_eliminate]: 7.71001e-06 [updatestate_assign_eliminate]: 3.29001e-06 [updatestate_loads_eliminate]: 2.99999e-06 [cse]: 2.556e-05 [renormalize]: 3.4002e-07 [remove_dup_value]: 1.957e-05 [tuple_transform]: 0.00010117, [1] [Cycle 1]: 9.355e-05, [4] [d_1]: 5.204e-05 [none_parameter_eliminate]: 2.19999e-06 [renormalize]: 1.79978e-07 [switch_simplify]: 8.67998e-06 [partial_unused_args_eliminate]: 4.67e-06 [add_recomputation]: 7.075e-05 [cse_after_recomputation]: 3.464e-05, [1] [Cycle 1]: 2.704e-05, [1] [cse]: 1.753e-05 [environ_conv]: 0.00014822 [swap_dp_allreduce_reducescatter]: 1.448e-05 [bias_add_comm_swap]: 6.04999e-06 [label_micro_interleaved_index]: 7.21001e-06 [label_fine_grained_interleaved_index]: 5.48002e-06 [merge_cast_opt]: 3.73001e-06 [slice_recompute_activation]: 5.20999e-06 [micro_interleaved_order_control]: 4.65999e-06 [assign_add_opt]: 3.87998e-06 [ForceFp32Comm]: 3.25998e-06 [remove_cast_before_assign_add]: 3.10002e-06 [full_micro_interleaved_order_control]: 5.04003e-06 [reorder_send_recv_between_fp_bp]: 5.19e-06 [comm_op_add_attrs]: 3.98001e-06 [add_comm_op_reuse_tag]: 3.41001e-06 [interleave_split_concat_branches]: 3.46999e-06 [interleave_parallel_branches]: 3.41001e-06 [overlap_opt_shard_in_pipeline]: 3.49001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.07e-06 [control_data_broadcast_order]: 1.965e-05 [grouped_pairwise_exchange_alltoall]: 4.32e-06 [offloading_packed_experts]: 6.99001e-06 [overlap_recompute_and_grad_model_parallel]: 7.83999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.54002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.68e-06 [overlap_recompute_comm]: 4.63001e-06 [overlap_grad_ring_attention]: 7.01001e-06 [overlap_grad_flash_sp]: 2.872e-05 [begin_end_overlap_inline]: 2.96001e-06 [split_matmul_comm_elemetwise]: 5.22e-06 [split_layernorm_comm]: 4.74002e-06 [handle_group_info]: 3.38999e-06 [symbol_engine_optimizer]: 0.00011608, [1] [Cycle 1]: 0.00010846, [6] [build]: 4.28001e-06 [elim_shapecalc]: 1.548e-05 [elim_not_effective]: 1.787e-05 [opt_reshape]: 8.49998e-06 [fold_const_symbol]: 1.308e-05 [renormalize]: 2.89991e-07 [detach_backward]: 3.94002e-06 [pipeline_parallel_scheduler]: 1.81998e-06 [auto_monad_reorder]: 2.417e-05 [get_jit_bprop_graph]: 1.93002e-06 [rewriter_after_jit_bprop_graph]: 5.04e-06 [opt_after_jit_grad]: 0.00053571 [validate]: 4.783e-05 Sums bootstrap : 0.000458s : 1.38% type_inference : 0.017499s : 52.79% event_method : 0.000024s : 0.07% auto_monad : 0.000073s : 0.22% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000038s : 0.11% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000036s : 0.11% optimize.rewriter_before_opt_a : 0.000093s : 0.28% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000053s : 0.16% optimize.opt_a.loop_unroll : 0.000039s : 0.12% optimize.opt_a.a_1 : 0.000911s : 2.75% optimize.opt_a.with_stream_mark : 0.000043s : 0.13% optimize.opt_a.recompute_prepare : 0.000023s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.02% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000253s : 0.76% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.06% optimize.opt_a.shard : 0.000005s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.05% optimize.opt_a.merge_send_recv : 0.000022s : 0.07% optimize.opt_a.auto_parallel : 0.000019s : 0.06% optimize.opt_a.parallel : 0.000028s : 0.09% optimize.opt_a.flash_sp : 0.000014s : 0.04% optimize.opt_a.merge_comm : 0.000011s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.07% optimize.opt_a.virtual_dataset : 0.000017s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.05% optimize.opt_a.virtual_output : 0.000015s : 0.05% optimize.opt_a.merge_forward : 0.000010s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000024s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000028s : 0.09% optimize.opt_a.a_after_grad : 0.000025s : 0.08% optimize.opt_a.renormalize : 0.009907s : 29.89% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.05% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000040s : 0.12% optimize.opt_a.cse : 0.000069s : 0.21% optimize.opt_a.a_3 : 0.000149s : 0.45% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000051s : 0.15% optimize.convert_after_rewriter : 0.000011s : 0.03% optimize.order_py_execute_after_rewriter : 0.000008s : 0.02% optimize.mutable_eliminate : 0.000743s : 2.24% optimize.opt_b.b_1 : 0.000215s : 0.65% optimize.opt_b.b_2 : 0.000010s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000026s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.07% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000036s : 0.11% optimize.loop_unroll : 0.000462s : 1.39% optimize.opt_after_cconv.c_1 : 0.000037s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000026s : 0.08% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.06% optimize.tuple_transform.d_1 : 0.000052s : 0.16% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000071s : 0.21% optimize.cse_after_recomputation.cse : 0.000018s : 0.05% optimize.environ_conv : 0.000148s : 0.45% optimize.swap_dp_allreduce_reducescatter : 0.000014s : 0.04% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000003s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000020s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000029s : 0.09% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000005s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000536s : 1.62% validate : 0.000048s : 0.14% Time group info: ------[substitution.] 0.000241 39 13.62% : 0.000033s : 3: substitution.cast_eliminate 0.97% : 0.000002s : 3: substitution.elim_not_effective 0.73% : 0.000002s : 3: substitution.fold_const_symbol 2.70% : 0.000007s : 5: substitution.graph_param_transform 65.54% : 0.000158s : 4: substitution.inline 2.06% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.85% : 0.000007s : 6: substitution.remove_not_recompute_node 2.28% : 0.000006s : 4: substitution.replace_old_param 5.59% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator 3.67% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.017425 2 94.59% : 0.016483s : 1: type_inference.infer 5.41% : 0.000942s : 1: type_inference.specialize ------[replace.] 0.000067 8 63.17% : 0.000042s : 4: replace.inline 36.83% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000167 8 93.07% : 0.000155s : 4: match.inline 6.93% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000253 1504 0.89% : 0.000002s : 15: predicate.accumulaten_eliminater 0.71% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.58% : 0.000001s : 10: predicate.addn_check_dump 0.94% : 0.000002s : 15: predicate.addn_zero_filter 0.83% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.91% : 0.000005s : 25: predicate.arithmetic_simplify 1.04% : 0.000003s : 15: predicate.cast_eliminate 0.62% : 0.000002s : 10: predicate.check_bprop_eliminate 0.59% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000001s : 5: predicate.const_output_eliminate 0.66% : 0.000002s : 10: predicate.depend_value_elim 1.03% : 0.000003s : 15: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.92% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.93% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.29% : 0.000001s : 5: predicate.elim_not_effective 0.40% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 20: predicate.environ_get_depend_swap 1.78% : 0.000004s : 30: predicate.environ_get_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.38% : 0.000004s : 23: predicate.exchange_switch_depend_value 2.31% : 0.000006s : 23: predicate.float_depend_g_call 0.56% : 0.000001s : 10: predicate.float_environ_get_switch 0.82% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.80% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000000s : 5: predicate.graph_param_transform 0.60% : 0.000002s : 10: predicate.incorporate_call 0.55% : 0.000001s : 10: predicate.incorporate_call_switch 6.16% : 0.000016s : 68: predicate.inline 0.78% : 0.000002s : 10: predicate.inline_without_move 0.34% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.78% : 0.000002s : 10: predicate.less_batch_normalization 1.74% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.32% : 0.000006s : 44: predicate.load_eliminater 1.01% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.04% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.74% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.60% : 0.000002s : 10: predicate.merge_addn 0.61% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.66% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 15: predicate.minmaximum_grad 0.94% : 0.000002s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.54% : 0.000001s : 5: predicate.parallel_virtual_node 1.57% : 0.000004s : 23: predicate.partial_defer_inline 1.56% : 0.000004s : 24: predicate.partial_eliminate 0.92% : 0.000002s : 15: predicate.print_const_string_wrapper 0.72% : 0.000002s : 10: predicate.reduce_all_const_elim 1.43% : 0.000004s : 15: predicate.reduce_eliminate 2.63% : 0.000007s : 44: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 10: predicate.remove_not_recompute_node 1.42% : 0.000004s : 29: predicate.replace_applicator 0.47% : 0.000001s : 10: predicate.replace_old_param 0.32% : 0.000001s : 5: predicate.reset_defer_inline 1.00% : 0.000003s : 15: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 5: predicate.row_tensor_eliminate 0.97% : 0.000002s : 10: predicate.same_eliminate 0.49% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 10: predicate.shard_identity_eliminate 0.65% : 0.000002s : 10: predicate.special_op_eliminate 0.85% : 0.000002s : 10: predicate.specialize_transform 1.11% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.86% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.44% : 0.000004s : 23: predicate.switch_defer_inline 1.99% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.73% : 0.000012s : 74: predicate.switch_simplify 0.82% : 0.000002s : 15: predicate.tile_eliminate 0.93% : 0.000002s : 15: predicate.transpose_eliminate 1.62% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.28% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.49% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.73% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.43% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.14% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.58% : 0.000001s : 5: predicate.value_based_eliminate 1.02% : 0.000003s : 10: predicate.virtual_dataset_eliminate 0.71% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000729 11 44.79% : 0.000326s : 5: func_graph_cloner_run.FuncGraphClonerGraph 55.21% : 0.000402s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.069459 192 0.01% : 0.000006s : 1: ForceFp32Comm 5.19% : 0.003604s : 1: add_attr 5.16% : 0.003588s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.11% : 0.000075s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.12% : 0.000085s : 1: auto_monad 0.05% : 0.000032s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.73% : 0.000508s : 1: bootstrap 0.06% : 0.000039s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.03% : 0.000023s : 1: control_data_broadcast_order 0.02% : 0.000014s : 1: convert_after_rewriter 0.05% : 0.000038s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000020s : 1: detach_backward 0.22% : 0.000155s : 1: environ_conv 0.05% : 0.000036s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000012s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.67% : 0.000469s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 1.08% : 0.000750s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.03% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000019s : 1: opt.transform.mutable_eliminate 2.11% : 0.001462s : 78: opt.transform.opt_a 0.05% : 0.000035s : 1: opt.transform.opt_after_cconv 0.04% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.22% : 0.000150s : 28: opt.transform.opt_b 0.08% : 0.000058s : 2: opt.transform.opt_trans_graph 0.07% : 0.000050s : 4: opt.transform.symbol_engine_opt 18.40% : 0.012779s : 1: opt_a 0.22% : 0.000150s : 1: opt_after_cconv 0.79% : 0.000547s : 1: opt_after_jit_grad 0.49% : 0.000339s : 1: opt_b 23.08% : 0.016032s : 1: optimize 0.04% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000011s : 1: order_py_execute_after_rewriter 0.05% : 0.000032s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.07% : 0.000046s : 1: pre_auto_parallel 0.06% : 0.000039s : 1: py_interpret_to_execute 0.04% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000023s : 1: remove_dup_value 13.54% : 0.009408s : 1: renormalize.infer 0.69% : 0.000482s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000055s : 1: rewriter_after_opt_a 0.14% : 0.000098s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.03% : 0.000018s : 1: swap_dp_allreduce_reducescatter 0.17% : 0.000119s : 1: symbol_engine_optimizer 0.15% : 0.000104s : 1: tuple_transform 25.27% : 0.017550s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:40.222.325 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0165345, [21] [bootstrap]: 0.00044676 [type_inference]: 0.00630821 [event_method]: 2.088e-05 [auto_monad]: 6.952e-05 [graph_reusing]: 5.88998e-06 [inline]: 2.78e-06 [add_attr]: 0.00333798, [1] [add_attr_with_inline]: 0.00332678, [1] [Cycle 1]: 7.26e-05, [2] [tag_attr]: 2.207e-05 [meta_addattr_fg_expand]: 6.09999e-06 [parallel-infer-symbol]: 3.91001e-06 [pre_auto_parallel]: 4.109e-05 [insert-virtual-dataset]: 2.82002e-06 [parallel-infer-symbol-second]: 8.89995e-07 [dataset_repeat_opt]: 2.39001e-06 [pipeline_split]: 1.77999e-06 [optimize]: 0.00556014, [53] [py_interpret_to_execute]: 2.981e-05 [rewriter_before_opt_a]: 9.05e-05 [opt_a]: 0.00331334, [2] [Cycle 1]: 0.00249051, [45] [expand_dump_flag]: 3.38e-06 [switch_simplify]: 4.572e-05 [loop_unroll]: 3.506e-05 [a_1]: 0.00071223 [with_stream_mark]: 2.274e-05 [recompute_prepare]: 1.444e-05 [updatestate_depend_eliminate]: 5.92999e-06 [updatestate_assign_eliminate]: 3.86999e-06 [updatestate_loads_eliminate]: 4.40999e-06 [parameter_eliminate]: 2.54999e-06 [a_2]: 0.0001048 [accelerated_algorithm]: 1.094e-05 [shard]: 2.14e-06 [meta_shard_fg_expand]: 2.46e-06 [shard_inline]: 9.09e-06 [merge_send_recv]: 1.107e-05 [auto_parallel]: 9.86998e-06 [parallel]: 2.028e-05 [flash_sp]: 1.043e-05 [merge_comm]: 4.65001e-06 [allreduce_fusion]: 4.53001e-06 [matmul_add_comm_reduction]: 1.11e-05 [allreduce_slice_to_reducescatter]: 7.09988e-07 [virtual_shard_identity]: 1.071e-05 [virtual_dataset]: 7.93001e-06 [get_grad_eliminate_]: 7.8e-06 [virtual_output]: 7.82002e-06 [merge_forward]: 5.14e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 1.231e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.931e-05 [merge_recompute_call_nodes]: 1.48002e-06 [before_grad]: 1.369e-05 [set_forward_comm_id_for_comm_node_pass]: 5.56e-06 [meta_fg_expand]: 4.2e-06 [flash_sp_send_recv_attached]: 3.26001e-06 [receive_attached]: 2.29999e-06 [after_resolve]: 1.412e-05 [a_after_grad]: 1.222e-05 [renormalize]: 0.00082619 [add_forward_monad_depend]: 7.31001e-06 [auto_monad_grad]: 3.17002e-06 [auto_monad_eliminator]: 2.481e-05 [cse]: 7.659e-05 [a_3]: 6.74e-05 [Cycle 2]: 0.00081153, [45] [expand_dump_flag]: 2.42001e-06 [switch_simplify]: 9.86e-06 [loop_unroll]: 7.85998e-06 [a_1]: 0.00018125 [with_stream_mark]: 1.624e-05 [recompute_prepare]: 8.22e-06 [updatestate_depend_eliminate]: 4.50001e-06 [updatestate_assign_eliminate]: 3.24001e-06 [updatestate_loads_eliminate]: 3.16001e-06 [parameter_eliminate]: 1.42e-06 [a_2]: 9.503e-05 [accelerated_algorithm]: 8.10999e-06 [shard]: 1.60001e-06 [meta_shard_fg_expand]: 2.02999e-06 [shard_inline]: 8.02998e-06 [merge_send_recv]: 7.73999e-06 [auto_parallel]: 7.88001e-06 [parallel]: 6.13002e-06 [flash_sp]: 3.73001e-06 [merge_comm]: 4.89e-06 [allreduce_fusion]: 3.98999e-06 [matmul_add_comm_reduction]: 7.78001e-06 [allreduce_slice_to_reducescatter]: 4.50003e-07 [virtual_shard_identity]: 1.009e-05 [virtual_dataset]: 7.2e-06 [get_grad_eliminate_]: 7.04001e-06 [virtual_output]: 8.70999e-06 [merge_forward]: 4.01001e-06 [cell_reuse_recompute_pass]: 1.59e-06 [offload_activation]: 9.46e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.618e-05 [merge_recompute_call_nodes]: 1.20999e-06 [before_grad]: 1.285e-05 [set_forward_comm_id_for_comm_node_pass]: 5.00001e-06 [meta_fg_expand]: 3.14999e-06 [flash_sp_send_recv_attached]: 9.50007e-07 [receive_attached]: 1.49e-06 [after_resolve]: 1.328e-05 [a_after_grad]: 1.128e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.57999e-06 [auto_monad_grad]: 1.39e-06 [auto_monad_eliminator]: 1.157e-05 [cse]: 2.218e-05 [a_3]: 4.698e-05 [py_interpret_to_execute_after_opt_a]: 1.421e-05 [slice_cell_reuse_recomputed_activation]: 2.14e-06 [rewriter_after_opt_a]: 4.473e-05 [convert_after_rewriter]: 8.40001e-06 [order_py_execute_after_rewriter]: 6.17999e-06 [mutable_eliminate]: 0.00057288 [opt_b]: 0.00026042, [1] [Cycle 1]: 0.00025367, [7] [b_1]: 0.00016988 [b_2]: 9.34e-06 [updatestate_depend_eliminate]: 6.31998e-06 [updatestate_assign_eliminate]: 2.83e-06 [updatestate_loads_eliminate]: 3.13998e-06 [renormalize]: 7.89994e-07 [cse]: 2.475e-05 [optimize_parallel_all_gather_comm]: 1.8e-05 [overlap_param_gather]: 2.17999e-06 [cconv]: 2.402e-05 [loop_unroll]: 0.0004441 [opt_after_cconv]: 0.00012082, [1] [Cycle 1]: 0.00011498, [7] [c_1]: 3.781e-05 [parameter_eliminate]: 3.46001e-06 [updatestate_depend_eliminate]: 7.31001e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 3.39001e-06 [cse]: 2.537e-05 [renormalize]: 2.69996e-07 [remove_dup_value]: 1.542e-05 [tuple_transform]: 8.523e-05, [1] [Cycle 1]: 8.09e-05, [4] [d_1]: 5.262e-05 [none_parameter_eliminate]: 1.60999e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 8.08001e-06 [partial_unused_args_eliminate]: 1.96e-06 [add_recomputation]: 5.675e-05 [cse_after_recomputation]: 2.641e-05, [1] [Cycle 1]: 2.181e-05, [1] [cse]: 1.585e-05 [environ_conv]: 6.39999e-06 [swap_dp_allreduce_reducescatter]: 6.44001e-06 [bias_add_comm_swap]: 3.63e-06 [label_micro_interleaved_index]: 4.27e-06 [label_fine_grained_interleaved_index]: 2.77002e-06 [merge_cast_opt]: 1.27e-06 [slice_recompute_activation]: 1.94999e-06 [micro_interleaved_order_control]: 2.14e-06 [assign_add_opt]: 1.28002e-06 [ForceFp32Comm]: 8.00006e-07 [remove_cast_before_assign_add]: 9.89996e-07 [full_micro_interleaved_order_control]: 2.01998e-06 [reorder_send_recv_between_fp_bp]: 2.74001e-06 [comm_op_add_attrs]: 1.69998e-06 [add_comm_op_reuse_tag]: 1.31002e-06 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.00999e-06 [overlap_opt_shard_in_pipeline]: 1.16002e-06 [overlap_opt_shard_grad_in_pipeline]: 1.71998e-06 [control_data_broadcast_order]: 1.538e-05 [grouped_pairwise_exchange_alltoall]: 1.81e-06 [offloading_packed_experts]: 4.33999e-06 [overlap_recompute_and_grad_model_parallel]: 5.39e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34998e-06 [overlap_recompute_comm]: 2.36e-06 [overlap_grad_ring_attention]: 4.79e-06 [overlap_grad_flash_sp]: 2.285e-05 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 2.10002e-06 [split_layernorm_comm]: 1.99e-06 [handle_group_info]: 9.70002e-07 [symbol_engine_optimizer]: 9.122e-05, [1] [Cycle 1]: 8.613e-05, [6] [build]: 4.11001e-06 [elim_shapecalc]: 1.334e-05 [elim_not_effective]: 1.68e-05 [opt_reshape]: 8.40001e-06 [fold_const_symbol]: 1.195e-05 [renormalize]: 2.50002e-07 [detach_backward]: 1.95001e-06 [pipeline_parallel_scheduler]: 1.51002e-06 [auto_monad_reorder]: 2.065e-05 [get_jit_bprop_graph]: 1.91998e-06 [rewriter_after_jit_bprop_graph]: 3.73001e-06 [opt_after_jit_grad]: 0.00049348 [validate]: 4.619e-05 Sums bootstrap : 0.000447s : 3.68% type_inference : 0.006308s : 51.89% event_method : 0.000021s : 0.17% auto_monad : 0.000070s : 0.57% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000041s : 0.34% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.25% optimize.rewriter_before_opt_a : 0.000091s : 0.74% optimize.opt_a.expand_dump_flag : 0.000006s : 0.05% optimize.opt_a.switch_simplify : 0.000056s : 0.46% optimize.opt_a.loop_unroll : 0.000043s : 0.35% optimize.opt_a.a_1 : 0.000893s : 7.35% optimize.opt_a.with_stream_mark : 0.000039s : 0.32% optimize.opt_a.recompute_prepare : 0.000023s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000200s : 1.64% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.16% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000017s : 0.14% optimize.opt_a.merge_send_recv : 0.000019s : 0.15% optimize.opt_a.auto_parallel : 0.000018s : 0.15% optimize.opt_a.parallel : 0.000026s : 0.22% optimize.opt_a.flash_sp : 0.000014s : 0.12% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.17% optimize.opt_a.virtual_dataset : 0.000015s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000017s : 0.14% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000022s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.23% optimize.opt_a.a_after_grad : 0.000024s : 0.19% optimize.opt_a.renormalize : 0.000826s : 6.80% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000036s : 0.30% optimize.opt_a.cse : 0.000099s : 0.81% optimize.opt_a.a_3 : 0.000114s : 0.94% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000045s : 0.37% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000573s : 4.71% optimize.opt_b.b_1 : 0.000170s : 1.40% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000025s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.20% optimize.loop_unroll : 0.000444s : 3.65% optimize.opt_after_cconv.c_1 : 0.000038s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000025s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.13% optimize.tuple_transform.d_1 : 0.000053s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000057s : 0.47% optimize.cse_after_recomputation.cse : 0.000016s : 0.13% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000004s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000002s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000015s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000023s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000493s : 4.06% validate : 0.000046s : 0.38% Time group info: ------[substitution.] 0.000241 39 12.09% : 0.000029s : 3: substitution.cast_eliminate 1.03% : 0.000002s : 3: substitution.elim_not_effective 0.65% : 0.000002s : 3: substitution.fold_const_symbol 2.77% : 0.000007s : 5: substitution.graph_param_transform 67.44% : 0.000162s : 4: substitution.inline 1.92% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.93% : 0.000007s : 6: substitution.remove_not_recompute_node 2.27% : 0.000005s : 4: substitution.replace_old_param 5.46% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator 3.44% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006240 2 86.34% : 0.005388s : 1: type_inference.infer 13.66% : 0.000853s : 1: type_inference.specialize ------[replace.] 0.000068 8 60.87% : 0.000041s : 4: replace.inline 39.13% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000170 8 93.31% : 0.000159s : 4: match.inline 6.69% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000244 1504 0.89% : 0.000002s : 15: predicate.accumulaten_eliminater 1.01% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 10: predicate.addn_check_dump 0.84% : 0.000002s : 15: predicate.addn_zero_filter 0.79% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.06% : 0.000005s : 25: predicate.arithmetic_simplify 1.22% : 0.000003s : 15: predicate.cast_eliminate 0.64% : 0.000002s : 10: predicate.check_bprop_eliminate 0.64% : 0.000002s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.59% : 0.000001s : 10: predicate.depend_value_elim 0.96% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.95% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.95% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.32% : 0.000001s : 5: predicate.elim_not_effective 0.38% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.06% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_depend_swap 1.68% : 0.000004s : 30: predicate.environ_get_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.32% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.46% : 0.000006s : 23: predicate.float_depend_g_call 0.53% : 0.000001s : 10: predicate.float_environ_get_switch 0.98% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.64% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.65% : 0.000002s : 10: predicate.incorporate_call 0.54% : 0.000001s : 10: predicate.incorporate_call_switch 5.99% : 0.000015s : 68: predicate.inline 0.82% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.86% : 0.000002s : 10: predicate.less_batch_normalization 1.74% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.55% : 0.000006s : 44: predicate.load_eliminater 0.86% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.41% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.63% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.60% : 0.000001s : 10: predicate.merge_addn 0.65% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 15: predicate.minmaximum_grad 0.85% : 0.000002s : 5: predicate.mutable_eliminate 0.39% : 0.000001s : 5: predicate.opt_reshape 0.54% : 0.000001s : 5: predicate.parallel_virtual_node 1.85% : 0.000005s : 23: predicate.partial_defer_inline 1.61% : 0.000004s : 24: predicate.partial_eliminate 0.89% : 0.000002s : 15: predicate.print_const_string_wrapper 0.63% : 0.000002s : 10: predicate.reduce_all_const_elim 1.27% : 0.000003s : 15: predicate.reduce_eliminate 2.43% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 10: predicate.remove_not_recompute_node 1.29% : 0.000003s : 29: predicate.replace_applicator 0.56% : 0.000001s : 10: predicate.replace_old_param 0.32% : 0.000001s : 5: predicate.reset_defer_inline 0.90% : 0.000002s : 15: predicate.reshape_eliminate 0.67% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 5: predicate.row_tensor_eliminate 0.98% : 0.000002s : 10: predicate.same_eliminate 0.41% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.79% : 0.000002s : 10: predicate.shard_identity_eliminate 0.73% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 1.02% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.38% : 0.000003s : 23: predicate.switch_defer_inline 1.98% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.39% : 0.000013s : 74: predicate.switch_simplify 0.84% : 0.000002s : 15: predicate.tile_eliminate 0.91% : 0.000002s : 15: predicate.transpose_eliminate 1.42% : 0.000003s : 25: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.54% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.46% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.29% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.70% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.37% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.13% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.51% : 0.000001s : 5: predicate.value_based_eliminate 0.61% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000648 11 44.07% : 0.000285s : 5: func_graph_cloner_run.FuncGraphClonerGraph 55.93% : 0.000362s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027911 192 0.01% : 0.000003s : 1: ForceFp32Comm 11.98% : 0.003344s : 1: add_attr 11.93% : 0.003331s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000061s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.27% : 0.000076s : 1: auto_monad 0.09% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000007s : 1: bias_add_comm_swap 1.70% : 0.000476s : 1: bootstrap 0.10% : 0.000027s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.11% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000006s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000010s : 1: environ_conv 0.10% : 0.000029s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.62% : 0.000452s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.08% : 0.000581s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 5.17% : 0.001444s : 78: opt.transform.opt_a 0.13% : 0.000036s : 1: opt.transform.opt_after_cconv 0.11% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.53% : 0.000148s : 28: opt.transform.opt_b 0.21% : 0.000059s : 2: opt.transform.opt_trans_graph 0.17% : 0.000046s : 4: opt.transform.symbol_engine_opt 11.88% : 0.003317s : 1: opt_a 0.44% : 0.000124s : 1: opt_after_cconv 1.80% : 0.000503s : 1: opt_after_jit_grad 0.95% : 0.000264s : 1: opt_b 19.94% : 0.005565s : 1: optimize 0.08% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.16% : 0.000046s : 1: pre_auto_parallel 0.12% : 0.000034s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.65% : 0.000462s : 1: renormalize.infer 1.26% : 0.000352s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000049s : 1: rewriter_after_opt_a 0.34% : 0.000095s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000094s : 1: symbol_engine_optimizer 0.32% : 0.000088s : 1: tuple_transform 22.67% : 0.006327s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:40.751.430 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:40.751.700 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0170188, [21] [bootstrap]: 0.0003816 [type_inference]: 0.00563554 [event_method]: 2.05e-05 [auto_monad]: 6.559e-05 [graph_reusing]: 6.49001e-06 [inline]: 3.04999e-06 [add_attr]: 0.00336408, [1] [add_attr_with_inline]: 0.00335385, [1] [Cycle 1]: 8.775e-05, [2] [tag_attr]: 2.284e-05 [meta_addattr_fg_expand]: 5.63002e-06 [parallel-infer-symbol]: 3.3e-06 [pre_auto_parallel]: 3.881e-05 [insert-virtual-dataset]: 2.43e-06 [parallel-infer-symbol-second]: 6.59988e-07 [dataset_repeat_opt]: 2.10002e-06 [pipeline_split]: 1.63997e-06 [optimize]: 0.00623595, [53] [py_interpret_to_execute]: 3.404e-05 [rewriter_before_opt_a]: 9.904e-05 [opt_a]: 0.00354316, [2] [Cycle 1]: 0.00256942, [45] [expand_dump_flag]: 3.14001e-06 [switch_simplify]: 4.351e-05 [loop_unroll]: 3.153e-05 [a_1]: 0.00073038 [with_stream_mark]: 2.075e-05 [recompute_prepare]: 1.213e-05 [updatestate_depend_eliminate]: 4.79998e-06 [updatestate_assign_eliminate]: 4.92999e-06 [updatestate_loads_eliminate]: 3.8e-06 [parameter_eliminate]: 2.12001e-06 [a_2]: 0.00013144 [accelerated_algorithm]: 8.90999e-06 [shard]: 2.07001e-06 [meta_shard_fg_expand]: 2.14999e-06 [shard_inline]: 7.8e-06 [merge_send_recv]: 9.59e-06 [auto_parallel]: 8.03999e-06 [parallel]: 1.924e-05 [flash_sp]: 9.52999e-06 [merge_comm]: 4.89e-06 [allreduce_fusion]: 4.35999e-06 [matmul_add_comm_reduction]: 1.056e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 1.008e-05 [virtual_dataset]: 7.97003e-06 [get_grad_eliminate_]: 7.66999e-06 [virtual_output]: 7.63999e-06 [merge_forward]: 4.70999e-06 [cell_reuse_recompute_pass]: 1.72001e-06 [offload_activation]: 1.125e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.822e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.325e-05 [set_forward_comm_id_for_comm_node_pass]: 4.64002e-06 [meta_fg_expand]: 3.24001e-06 [flash_sp_send_recv_attached]: 2.43998e-06 [receive_attached]: 2.22999e-06 [after_resolve]: 1.346e-05 [a_after_grad]: 1.193e-05 [renormalize]: 0.0008393 [add_forward_monad_depend]: 6.24001e-06 [auto_monad_grad]: 2.84999e-06 [auto_monad_eliminator]: 1.658e-05 [cse]: 3.617e-05 [a_3]: 7.279e-05 [Cycle 2]: 0.00095806, [45] [expand_dump_flag]: 1.32e-06 [switch_simplify]: 8.75999e-06 [loop_unroll]: 7.79002e-06 [a_1]: 0.00017908 [with_stream_mark]: 1.297e-05 [recompute_prepare]: 7.55e-06 [updatestate_depend_eliminate]: 4.2e-06 [updatestate_assign_eliminate]: 3.43e-06 [updatestate_loads_eliminate]: 3.11001e-06 [parameter_eliminate]: 1.45001e-06 [a_2]: 0.00012523 [accelerated_algorithm]: 7.70998e-06 [shard]: 1.94e-06 [meta_shard_fg_expand]: 1.79e-06 [shard_inline]: 7.32002e-06 [merge_send_recv]: 6.81001e-06 [auto_parallel]: 7.37002e-06 [parallel]: 6.05002e-06 [flash_sp]: 3.73999e-06 [merge_comm]: 4.19997e-06 [allreduce_fusion]: 4.33001e-06 [matmul_add_comm_reduction]: 8.15999e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 8.42e-06 [virtual_dataset]: 7.35e-06 [get_grad_eliminate_]: 6.99001e-06 [virtual_output]: 6.89999e-06 [merge_forward]: 3.9e-06 [cell_reuse_recompute_pass]: 2.56998e-06 [offload_activation]: 8.40999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.813e-05 [merge_recompute_call_nodes]: 1.57001e-06 [before_grad]: 1.346e-05 [set_forward_comm_id_for_comm_node_pass]: 4.74e-06 [meta_fg_expand]: 3.19001e-06 [flash_sp_send_recv_attached]: 1.17e-06 [receive_attached]: 1.30001e-06 [after_resolve]: 1.172e-05 [a_after_grad]: 1.16e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.23002e-06 [auto_monad_grad]: 9.70002e-07 [auto_monad_eliminator]: 1.092e-05 [cse]: 2.066e-05 [a_3]: 5.953e-05 [py_interpret_to_execute_after_opt_a]: 1.71e-05 [slice_cell_reuse_recomputed_activation]: 4.60999e-06 [rewriter_after_opt_a]: 4.571e-05 [convert_after_rewriter]: 1.076e-05 [order_py_execute_after_rewriter]: 9.12001e-06 [mutable_eliminate]: 0.00069539 [opt_b]: 0.00035616, [1] [Cycle 1]: 0.00034501, [7] [b_1]: 0.00023059 [b_2]: 1.042e-05 [updatestate_depend_eliminate]: 8.02e-06 [updatestate_assign_eliminate]: 2.85002e-06 [updatestate_loads_eliminate]: 3.08e-06 [renormalize]: 5.89993e-07 [cse]: 2.929e-05 [optimize_parallel_all_gather_comm]: 2.257e-05 [overlap_param_gather]: 4.55999e-06 [cconv]: 3.644e-05 [loop_unroll]: 0.00048481 [opt_after_cconv]: 0.00014381, [1] [Cycle 1]: 0.00013441, [7] [c_1]: 3.69e-05 [parameter_eliminate]: 3.80998e-06 [updatestate_depend_eliminate]: 6.59999e-06 [updatestate_assign_eliminate]: 3.75e-06 [updatestate_loads_eliminate]: 3.01999e-06 [cse]: 2.435e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 1.811e-05 [tuple_transform]: 0.00010162, [1] [Cycle 1]: 9.391e-05, [4] [d_1]: 5.272e-05 [none_parameter_eliminate]: 1.60999e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 8.15999e-06 [partial_unused_args_eliminate]: 5.02999e-06 [add_recomputation]: 6.203e-05 [cse_after_recomputation]: 3.318e-05, [1] [Cycle 1]: 2.582e-05, [1] [cse]: 1.608e-05 [environ_conv]: 9.77999e-06 [swap_dp_allreduce_reducescatter]: 8.58001e-06 [bias_add_comm_swap]: 5.37999e-06 [label_micro_interleaved_index]: 7.56999e-06 [label_fine_grained_interleaved_index]: 5.35001e-06 [merge_cast_opt]: 3.73001e-06 [slice_recompute_activation]: 4.37e-06 [micro_interleaved_order_control]: 4.55001e-06 [assign_add_opt]: 3.67002e-06 [ForceFp32Comm]: 3.28e-06 [remove_cast_before_assign_add]: 3.38e-06 [full_micro_interleaved_order_control]: 4.42e-06 [reorder_send_recv_between_fp_bp]: 5.29e-06 [comm_op_add_attrs]: 4.06001e-06 [add_comm_op_reuse_tag]: 3.21999e-06 [interleave_split_concat_branches]: 3.43e-06 [interleave_parallel_branches]: 3.33998e-06 [overlap_opt_shard_in_pipeline]: 3.56999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.19002e-06 [control_data_broadcast_order]: 1.798e-05 [grouped_pairwise_exchange_alltoall]: 3.75e-06 [offloading_packed_experts]: 7.3e-06 [overlap_recompute_and_grad_model_parallel]: 7.83999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.7e-06 [overlap_recompute_allgather_and_fa_grad]: 3.98001e-06 [overlap_recompute_comm]: 4.74e-06 [overlap_grad_ring_attention]: 7.00998e-06 [overlap_grad_flash_sp]: 2.569e-05 [begin_end_overlap_inline]: 2.91e-06 [split_matmul_comm_elemetwise]: 4.83001e-06 [split_layernorm_comm]: 3.83999e-06 [handle_group_info]: 3.31001e-06 [symbol_engine_optimizer]: 0.00010776, [1] [Cycle 1]: 0.00010103, [6] [build]: 3.98999e-06 [elim_shapecalc]: 1.162e-05 [elim_not_effective]: 1.594e-05 [opt_reshape]: 8.75001e-06 [fold_const_symbol]: 1.33e-05 [renormalize]: 2.50002e-07 [detach_backward]: 3.64002e-06 [pipeline_parallel_scheduler]: 1.84998e-06 [auto_monad_reorder]: 2.317e-05 [get_jit_bprop_graph]: 2.43e-06 [rewriter_after_jit_bprop_graph]: 4.87998e-06 [opt_after_jit_grad]: 0.00056186 [validate]: 4.895e-05 Sums bootstrap : 0.000382s : 3.22% type_inference : 0.005636s : 47.51% event_method : 0.000021s : 0.17% auto_monad : 0.000066s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000039s : 0.33% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000034s : 0.29% optimize.rewriter_before_opt_a : 0.000099s : 0.83% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000052s : 0.44% optimize.opt_a.loop_unroll : 0.000039s : 0.33% optimize.opt_a.a_1 : 0.000909s : 7.67% optimize.opt_a.with_stream_mark : 0.000034s : 0.28% optimize.opt_a.recompute_prepare : 0.000020s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000257s : 2.16% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.14% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.13% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000025s : 0.21% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.16% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.12% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.04% optimize.opt_a.offload_activation : 0.000020s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000027s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.21% optimize.opt_a.a_after_grad : 0.000024s : 0.20% optimize.opt_a.renormalize : 0.000839s : 7.08% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.23% optimize.opt_a.cse : 0.000057s : 0.48% optimize.opt_a.a_3 : 0.000132s : 1.12% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000046s : 0.39% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000695s : 5.86% optimize.opt_b.b_1 : 0.000231s : 1.94% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000029s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000036s : 0.31% optimize.loop_unroll : 0.000485s : 4.09% optimize.opt_after_cconv.c_1 : 0.000037s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.15% optimize.tuple_transform.d_1 : 0.000053s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000062s : 0.52% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000026s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000562s : 4.74% validate : 0.000049s : 0.41% Time group info: ------[substitution.] 0.000263 39 21.19% : 0.000056s : 3: substitution.cast_eliminate 0.85% : 0.000002s : 3: substitution.elim_not_effective 0.88% : 0.000002s : 3: substitution.fold_const_symbol 2.58% : 0.000007s : 5: substitution.graph_param_transform 60.06% : 0.000158s : 4: substitution.inline 2.04% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.55% : 0.000007s : 6: substitution.remove_not_recompute_node 1.99% : 0.000005s : 4: substitution.replace_old_param 4.85% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator 3.02% : 0.000008s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005579 2 86.89% : 0.004848s : 1: type_inference.infer 13.11% : 0.000731s : 1: type_inference.specialize ------[replace.] 0.000068 8 60.43% : 0.000041s : 4: replace.inline 39.57% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000166 8 93.48% : 0.000155s : 4: match.inline 6.52% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000259 1504 0.87% : 0.000002s : 15: predicate.accumulaten_eliminater 0.79% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.63% : 0.000002s : 10: predicate.addn_check_dump 0.86% : 0.000002s : 15: predicate.addn_zero_filter 0.74% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.05% : 0.000005s : 25: predicate.arithmetic_simplify 1.13% : 0.000003s : 15: predicate.cast_eliminate 0.60% : 0.000002s : 10: predicate.check_bprop_eliminate 0.71% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.65% : 0.000002s : 10: predicate.depend_value_elim 0.98% : 0.000003s : 15: predicate.dict_get_item_const_eliminator 0.95% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.78% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.84% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.36% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.06% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.01% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.02% : 0.000003s : 20: predicate.environ_get_depend_swap 1.64% : 0.000004s : 30: predicate.environ_get_eliminate 1.03% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.27% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.17% : 0.000006s : 23: predicate.float_depend_g_call 0.58% : 0.000002s : 10: predicate.float_environ_get_switch 0.90% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 5: predicate.fold_const_symbol 0.70% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.60% : 0.000002s : 10: predicate.incorporate_call 0.52% : 0.000001s : 10: predicate.incorporate_call_switch 6.07% : 0.000016s : 68: predicate.inline 0.78% : 0.000002s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.90% : 0.000002s : 10: predicate.less_batch_normalization 1.80% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.33% : 0.000006s : 44: predicate.load_eliminater 0.85% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.09% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.51% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.59% : 0.000002s : 10: predicate.merge_addn 0.58% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 15: predicate.minmaximum_grad 1.08% : 0.000003s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 5.61% : 0.000015s : 5: predicate.parallel_virtual_node 1.54% : 0.000004s : 23: predicate.partial_defer_inline 1.52% : 0.000004s : 24: predicate.partial_eliminate 0.82% : 0.000002s : 15: predicate.print_const_string_wrapper 0.57% : 0.000001s : 10: predicate.reduce_all_const_elim 1.07% : 0.000003s : 15: predicate.reduce_eliminate 2.32% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 10: predicate.remove_not_recompute_node 1.28% : 0.000003s : 29: predicate.replace_applicator 0.48% : 0.000001s : 10: predicate.replace_old_param 0.25% : 0.000001s : 5: predicate.reset_defer_inline 0.87% : 0.000002s : 15: predicate.reshape_eliminate 0.60% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 5: predicate.row_tensor_eliminate 0.85% : 0.000002s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 10: predicate.shard_identity_eliminate 0.72% : 0.000002s : 10: predicate.special_op_eliminate 0.74% : 0.000002s : 10: predicate.specialize_transform 0.80% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.73% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.37% : 0.000004s : 23: predicate.switch_defer_inline 1.92% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.74% : 0.000012s : 74: predicate.switch_simplify 0.80% : 0.000002s : 15: predicate.tile_eliminate 0.88% : 0.000002s : 15: predicate.transpose_eliminate 1.62% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.47% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 2.85% : 0.000007s : 39: predicate.tuple_list_get_item_eliminator 1.36% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.13% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.66% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.24% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 2.90% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.68% : 0.000002s : 5: predicate.value_based_eliminate 0.61% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.59% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000523 11 50.58% : 0.000265s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.42% : 0.000259s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.029122 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.59% : 0.003374s : 1: add_attr 11.53% : 0.003358s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000066s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000075s : 1: auto_monad 0.11% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.44% : 0.000419s : 1: bootstrap 0.14% : 0.000040s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000036s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000019s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.11% : 0.000031s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.69% : 0.000491s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 2.41% : 0.000702s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000020s : 1: opt.transform.mutable_eliminate 4.93% : 0.001436s : 78: opt.transform.opt_a 0.12% : 0.000035s : 1: opt.transform.opt_after_cconv 0.11% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.57% : 0.000166s : 28: opt.transform.opt_b 0.20% : 0.000059s : 2: opt.transform.opt_trans_graph 0.16% : 0.000046s : 4: opt.transform.symbol_engine_opt 12.18% : 0.003547s : 1: opt_a 0.51% : 0.000147s : 1: opt_after_cconv 1.97% : 0.000573s : 1: opt_after_jit_grad 1.24% : 0.000361s : 1: opt_b 22.66% : 0.006600s : 1: optimize 0.09% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.16% : 0.000047s : 1: pre_auto_parallel 0.13% : 0.000038s : 1: py_interpret_to_execute 0.07% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000021s : 1: remove_dup_value 1.61% : 0.000469s : 1: renormalize.infer 1.24% : 0.000361s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000049s : 1: rewriter_after_opt_a 0.35% : 0.000103s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000006s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000111s : 1: symbol_engine_optimizer 0.36% : 0.000104s : 1: tuple_transform 19.49% : 0.005676s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:41.279.038 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0416734, [21] [bootstrap]: 0.00044478 [type_inference]: 0.0182738 [event_method]: 2.189e-05 [auto_monad]: 6.567e-05 [graph_reusing]: 5.86e-06 [inline]: 2.12001e-06 [add_attr]: 0.00350674, [1] [add_attr_with_inline]: 0.00349387, [1] [Cycle 1]: 7.576e-05, [2] [tag_attr]: 2.392e-05 [meta_addattr_fg_expand]: 6.73e-06 [parallel-infer-symbol]: 3.72002e-06 [pre_auto_parallel]: 4.259e-05 [insert-virtual-dataset]: 2.58e-06 [parallel-infer-symbol-second]: 7.79983e-07 [dataset_repeat_opt]: 2.24001e-06 [pipeline_split]: 2.55002e-06 [optimize]: 0.0184345, [53] [py_interpret_to_execute]: 3.352e-05 [rewriter_before_opt_a]: 9.634e-05 [opt_a]: 0.0156945, [2] [Cycle 1]: 0.00260074, [45] [expand_dump_flag]: 3.11999e-06 [switch_simplify]: 4.565e-05 [loop_unroll]: 3.12e-05 [a_1]: 0.00074374 [with_stream_mark]: 2.526e-05 [recompute_prepare]: 1.569e-05 [updatestate_depend_eliminate]: 5.34998e-06 [updatestate_assign_eliminate]: 3.88001e-06 [updatestate_loads_eliminate]: 4.13001e-06 [parameter_eliminate]: 2.54999e-06 [a_2]: 0.00010693 [accelerated_algorithm]: 9.51e-06 [shard]: 2.34999e-06 [meta_shard_fg_expand]: 2.69999e-06 [shard_inline]: 8.65999e-06 [merge_send_recv]: 1.061e-05 [auto_parallel]: 1.189e-05 [parallel]: 2.206e-05 [flash_sp]: 1.124e-05 [merge_comm]: 4.82e-06 [allreduce_fusion]: 4.10998e-06 [matmul_add_comm_reduction]: 1.138e-05 [allreduce_slice_to_reducescatter]: 9.09989e-07 [virtual_shard_identity]: 1.308e-05 [virtual_dataset]: 9.27999e-06 [get_grad_eliminate_]: 7.85998e-06 [virtual_output]: 8.33999e-06 [merge_forward]: 6.02001e-06 [cell_reuse_recompute_pass]: 1.41998e-06 [offload_activation]: 1.387e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.22e-05 [merge_recompute_call_nodes]: 2.11e-06 [before_grad]: 1.528e-05 [set_forward_comm_id_for_comm_node_pass]: 7.23e-06 [meta_fg_expand]: 3.68999e-06 [flash_sp_send_recv_attached]: 4.44998e-06 [receive_attached]: 2.39001e-06 [after_resolve]: 1.685e-05 [a_after_grad]: 1.447e-05 [renormalize]: 0.00091852 [add_forward_monad_depend]: 7.95e-06 [auto_monad_grad]: 3.16999e-06 [auto_monad_eliminator]: 2.323e-05 [cse]: 4.171e-05 [a_3]: 6.861e-05 [Cycle 2]: 0.0130804, [45] [expand_dump_flag]: 2.78998e-06 [switch_simplify]: 1.058e-05 [loop_unroll]: 7.84002e-06 [a_1]: 0.012248 [with_stream_mark]: 4.064e-05 [recompute_prepare]: 2.023e-05 [updatestate_depend_eliminate]: 6.21e-06 [updatestate_assign_eliminate]: 3.9e-06 [updatestate_loads_eliminate]: 4.15999e-06 [parameter_eliminate]: 2.49001e-06 [a_2]: 0.0001049 [accelerated_algorithm]: 8.65999e-06 [shard]: 3.75e-06 [meta_shard_fg_expand]: 3.66001e-06 [shard_inline]: 8.45001e-06 [merge_send_recv]: 1.179e-05 [auto_parallel]: 1.01e-05 [parallel]: 1.012e-05 [flash_sp]: 4.57e-06 [merge_comm]: 4.70999e-06 [allreduce_fusion]: 1.262e-05 [matmul_add_comm_reduction]: 1.393e-05 [allreduce_slice_to_reducescatter]: 8.39995e-07 [virtual_shard_identity]: 1.094e-05 [virtual_dataset]: 7.68999e-06 [get_grad_eliminate_]: 7.65e-06 [virtual_output]: 8.33001e-06 [merge_forward]: 6.16e-06 [cell_reuse_recompute_pass]: 3.91999e-06 [offload_activation]: 1.287e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.398e-05 [merge_recompute_call_nodes]: 1.55001e-06 [before_grad]: 1.31e-05 [set_forward_comm_id_for_comm_node_pass]: 5.79e-06 [meta_fg_expand]: 4.08001e-06 [flash_sp_send_recv_attached]: 2.44001e-06 [receive_attached]: 2.21e-06 [after_resolve]: 1.616e-05 [a_after_grad]: 1.259e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 4.07e-06 [auto_monad_grad]: 3.4e-06 [auto_monad_eliminator]: 2.2e-05 [cse]: 4.367e-05 [a_3]: 5.123e-05 [py_interpret_to_execute_after_opt_a]: 2.157e-05 [slice_cell_reuse_recomputed_activation]: 2.32001e-06 [rewriter_after_opt_a]: 5.469e-05 [convert_after_rewriter]: 8.12e-06 [order_py_execute_after_rewriter]: 6.66999e-06 [mutable_eliminate]: 0.00079087 [opt_b]: 0.00029792, [1] [Cycle 1]: 0.00028803, [7] [b_1]: 0.00018134 [b_2]: 1.228e-05 [updatestate_depend_eliminate]: 9.35001e-06 [updatestate_assign_eliminate]: 3.49001e-06 [updatestate_loads_eliminate]: 3.47997e-06 [renormalize]: 6.59988e-07 [cse]: 3.634e-05 [optimize_parallel_all_gather_comm]: 2.29e-05 [overlap_param_gather]: 2.14999e-06 [cconv]: 3.664e-05 [loop_unroll]: 0.00056814 [opt_after_cconv]: 0.000133, [1] [Cycle 1]: 0.00012501, [7] [c_1]: 3.778e-05 [parameter_eliminate]: 4.91002e-06 [updatestate_depend_eliminate]: 8.33999e-06 [updatestate_assign_eliminate]: 3.43e-06 [updatestate_loads_eliminate]: 3.69002e-06 [cse]: 3.04e-05 [renormalize]: 4.7998e-07 [remove_dup_value]: 1.782e-05 [tuple_transform]: 9.196e-05, [1] [Cycle 1]: 8.698e-05, [4] [d_1]: 5.694e-05 [none_parameter_eliminate]: 2.21e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 8.56997e-06 [partial_unused_args_eliminate]: 1.91e-06 [add_recomputation]: 6.607e-05 [cse_after_recomputation]: 2.852e-05, [1] [Cycle 1]: 2.352e-05, [1] [cse]: 1.632e-05 [environ_conv]: 7.67002e-06 [swap_dp_allreduce_reducescatter]: 6.85998e-06 [bias_add_comm_swap]: 3.37997e-06 [label_micro_interleaved_index]: 4.84e-06 [label_fine_grained_interleaved_index]: 2.83e-06 [merge_cast_opt]: 1.66e-06 [slice_recompute_activation]: 2.12001e-06 [micro_interleaved_order_control]: 2.69001e-06 [assign_add_opt]: 1.32e-06 [ForceFp32Comm]: 1.10999e-06 [remove_cast_before_assign_add]: 1.37e-06 [full_micro_interleaved_order_control]: 2.37001e-06 [reorder_send_recv_between_fp_bp]: 2.88e-06 [comm_op_add_attrs]: 1.25999e-06 [add_comm_op_reuse_tag]: 1.29e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.05999e-06 [overlap_opt_shard_in_pipeline]: 1.54998e-06 [overlap_opt_shard_grad_in_pipeline]: 1.77999e-06 [control_data_broadcast_order]: 1.623e-05 [grouped_pairwise_exchange_alltoall]: 1.69e-06 [offloading_packed_experts]: 4.55999e-06 [overlap_recompute_and_grad_model_parallel]: 5.97999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.49e-06 [overlap_recompute_comm]: 2.21e-06 [overlap_grad_ring_attention]: 4.79998e-06 [overlap_grad_flash_sp]: 2.605e-05 [begin_end_overlap_inline]: 6.89994e-07 [split_matmul_comm_elemetwise]: 2.29001e-06 [split_layernorm_comm]: 1.97999e-06 [handle_group_info]: 1.02e-06 [symbol_engine_optimizer]: 9.976e-05, [1] [Cycle 1]: 9.486e-05, [6] [build]: 5.64998e-06 [elim_shapecalc]: 1.607e-05 [elim_not_effective]: 1.746e-05 [opt_reshape]: 8.99e-06 [fold_const_symbol]: 1.281e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.45002e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 2.214e-05 [get_jit_bprop_graph]: 1.81e-06 [rewriter_after_jit_bprop_graph]: 7.31001e-06 [opt_after_jit_grad]: 0.00061134 [validate]: 5.553e-05 Sums bootstrap : 0.000445s : 1.20% type_inference : 0.018274s : 49.34% event_method : 0.000022s : 0.06% auto_monad : 0.000066s : 0.18% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000043s : 0.11% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000003s : 0.01% optimize.py_interpret_to_execute : 0.000034s : 0.09% optimize.rewriter_before_opt_a : 0.000096s : 0.26% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000056s : 0.15% optimize.opt_a.loop_unroll : 0.000039s : 0.11% optimize.opt_a.a_1 : 0.012992s : 35.07% optimize.opt_a.with_stream_mark : 0.000066s : 0.18% optimize.opt_a.recompute_prepare : 0.000036s : 0.10% optimize.opt_a.updatestate_depend_eliminate : 0.000012s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.02% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000212s : 0.57% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.05% optimize.opt_a.shard : 0.000006s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.02% optimize.opt_a.shard_inline : 0.000017s : 0.05% optimize.opt_a.merge_send_recv : 0.000022s : 0.06% optimize.opt_a.auto_parallel : 0.000022s : 0.06% optimize.opt_a.parallel : 0.000032s : 0.09% optimize.opt_a.flash_sp : 0.000016s : 0.04% optimize.opt_a.merge_comm : 0.000010s : 0.03% optimize.opt_a.allreduce_fusion : 0.000017s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.06% optimize.opt_a.virtual_dataset : 0.000017s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.04% optimize.opt_a.virtual_output : 0.000017s : 0.05% optimize.opt_a.merge_forward : 0.000012s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000027s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000046s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.01% optimize.opt_a.before_grad : 0.000028s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000013s : 0.04% optimize.opt_a.meta_fg_expand : 0.000008s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000033s : 0.09% optimize.opt_a.a_after_grad : 0.000027s : 0.07% optimize.opt_a.renormalize : 0.000919s : 2.48% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.03% optimize.opt_a.auto_monad_grad : 0.000007s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000045s : 0.12% optimize.opt_a.cse : 0.000085s : 0.23% optimize.opt_a.a_3 : 0.000120s : 0.32% optimize.py_interpret_to_execute_after_opt_a : 0.000022s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000055s : 0.15% optimize.convert_after_rewriter : 0.000008s : 0.02% optimize.order_py_execute_after_rewriter : 0.000007s : 0.02% optimize.mutable_eliminate : 0.000791s : 2.14% optimize.opt_b.b_1 : 0.000181s : 0.49% optimize.opt_b.b_2 : 0.000012s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000036s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000037s : 0.10% optimize.loop_unroll : 0.000568s : 1.53% optimize.opt_after_cconv.c_1 : 0.000038s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000030s : 0.08% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.05% optimize.tuple_transform.d_1 : 0.000057s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000066s : 0.18% optimize.cse_after_recomputation.cse : 0.000016s : 0.04% optimize.environ_conv : 0.000008s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000016s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000026s : 0.07% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000006s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.06% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.02% opt_after_jit_grad : 0.000611s : 1.65% validate : 0.000056s : 0.15% Time group info: ------[substitution.] 0.000263 39 12.31% : 0.000032s : 3: substitution.cast_eliminate 0.97% : 0.000003s : 3: substitution.elim_not_effective 0.73% : 0.000002s : 3: substitution.fold_const_symbol 2.83% : 0.000007s : 5: substitution.graph_param_transform 66.90% : 0.000176s : 4: substitution.inline 2.28% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.99% : 0.000008s : 6: substitution.remove_not_recompute_node 2.53% : 0.000007s : 4: substitution.replace_old_param 4.91% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator 3.54% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.018206 2 96.29% : 0.017530s : 1: type_inference.infer 3.71% : 0.000676s : 1: type_inference.specialize ------[replace.] 0.000076 8 58.44% : 0.000045s : 4: replace.inline 41.56% : 0.000032s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000184 8 93.96% : 0.000173s : 4: match.inline 6.04% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000260 1504 0.78% : 0.000002s : 15: predicate.accumulaten_eliminater 0.96% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.58% : 0.000002s : 10: predicate.addn_check_dump 0.82% : 0.000002s : 15: predicate.addn_zero_filter 0.75% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.11% : 0.000005s : 25: predicate.arithmetic_simplify 1.16% : 0.000003s : 15: predicate.cast_eliminate 0.63% : 0.000002s : 10: predicate.check_bprop_eliminate 0.56% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.65% : 0.000002s : 10: predicate.depend_value_elim 0.83% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.88% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.97% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.28% : 0.000001s : 5: predicate.elim_not_effective 0.48% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 20: predicate.environ_add_const_eliminate 0.99% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.02% : 0.000003s : 20: predicate.environ_get_depend_swap 1.60% : 0.000004s : 30: predicate.environ_get_eliminate 1.07% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.32% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.18% : 0.000006s : 23: predicate.float_depend_g_call 0.57% : 0.000001s : 10: predicate.float_environ_get_switch 0.97% : 0.000003s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.75% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.76% : 0.000002s : 10: predicate.incorporate_call 0.51% : 0.000001s : 10: predicate.incorporate_call_switch 6.65% : 0.000017s : 68: predicate.inline 0.95% : 0.000002s : 10: predicate.inline_without_move 0.36% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.99% : 0.000003s : 10: predicate.less_batch_normalization 1.77% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.29% : 0.000006s : 44: predicate.load_eliminater 0.98% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.97% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.66% : 0.000002s : 10: predicate.merge_addn 0.58% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.62% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 15: predicate.minmaximum_grad 1.32% : 0.000003s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.42% : 0.000001s : 5: predicate.parallel_virtual_node 1.89% : 0.000005s : 23: predicate.partial_defer_inline 1.49% : 0.000004s : 24: predicate.partial_eliminate 0.86% : 0.000002s : 15: predicate.print_const_string_wrapper 0.62% : 0.000002s : 10: predicate.reduce_all_const_elim 1.35% : 0.000004s : 15: predicate.reduce_eliminate 2.28% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000001s : 10: predicate.remove_not_recompute_node 1.34% : 0.000003s : 29: predicate.replace_applicator 0.54% : 0.000001s : 10: predicate.replace_old_param 0.22% : 0.000001s : 5: predicate.reset_defer_inline 1.35% : 0.000004s : 15: predicate.reshape_eliminate 0.78% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 5: predicate.row_tensor_eliminate 0.82% : 0.000002s : 10: predicate.same_eliminate 0.51% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 10: predicate.shard_identity_eliminate 0.83% : 0.000002s : 10: predicate.special_op_eliminate 0.81% : 0.000002s : 10: predicate.specialize_transform 1.23% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 1.08% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.40% : 0.000004s : 23: predicate.switch_defer_inline 1.80% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.78% : 0.000012s : 74: predicate.switch_simplify 0.85% : 0.000002s : 15: predicate.tile_eliminate 0.79% : 0.000002s : 15: predicate.transpose_eliminate 1.47% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.48% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.33% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.45% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.65% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.24% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.00% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.62% : 0.000002s : 5: predicate.value_based_eliminate 0.78% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.80% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000534 11 51.23% : 0.000273s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.77% : 0.000260s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.078333 192 0.00% : 0.000004s : 1: ForceFp32Comm 4.48% : 0.003513s : 1: add_attr 4.47% : 0.003498s : 1: add_attr_with_inline 0.01% : 0.000005s : 1: add_comm_op_reuse_tag 0.09% : 0.000071s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.09% : 0.000072s : 1: auto_monad 0.03% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.60% : 0.000472s : 1: bootstrap 0.05% : 0.000041s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000020s : 1: control_data_broadcast_order 0.01% : 0.000011s : 1: convert_after_rewriter 0.04% : 0.000031s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.04% : 0.000029s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.74% : 0.000578s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.02% : 0.000802s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000023s : 1: opt.transform.mutable_eliminate 17.33% : 0.013577s : 78: opt.transform.opt_a 0.05% : 0.000036s : 1: opt.transform.opt_after_cconv 0.05% : 0.000037s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000158s : 28: opt.transform.opt_b 0.08% : 0.000063s : 2: opt.transform.opt_trans_graph 0.06% : 0.000050s : 4: opt.transform.symbol_engine_opt 20.04% : 0.015698s : 1: opt_a 0.17% : 0.000137s : 1: opt_after_cconv 0.79% : 0.000623s : 1: opt_after_jit_grad 0.39% : 0.000302s : 1: opt_b 23.54% : 0.018441s : 1: optimize 0.03% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000010s : 1: order_py_execute_after_rewriter 0.04% : 0.000030s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000004s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.06% : 0.000047s : 1: pre_auto_parallel 0.05% : 0.000039s : 1: py_interpret_to_execute 0.03% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000005s : 1: remove_cast_before_assign_add 0.03% : 0.000021s : 1: remove_dup_value 0.65% : 0.000512s : 1: renormalize.infer 0.50% : 0.000395s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000059s : 1: rewriter_after_opt_a 0.13% : 0.000101s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.000103s : 1: symbol_engine_optimizer 0.12% : 0.000095s : 1: tuple_transform 23.35% : 0.018294s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:41.831.288 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:41.831.564 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0426461, [21] [bootstrap]: 0.00045043 [type_inference]: 0.00634451 [event_method]: 2.173e-05 [auto_monad]: 6.781e-05 [graph_reusing]: 6.88e-06 [inline]: 2.75002e-06 [add_attr]: 0.00331072, [1] [add_attr_with_inline]: 0.00329912, [1] [Cycle 1]: 8.636e-05, [2] [tag_attr]: 2.242e-05 [meta_addattr_fg_expand]: 6.51e-06 [parallel-infer-symbol]: 3.79002e-06 [pre_auto_parallel]: 3.909e-05 [insert-virtual-dataset]: 3.08e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 2.11e-06 [pipeline_split]: 1.71e-06 [optimize]: 0.0310213, [53] [py_interpret_to_execute]: 3.307e-05 [rewriter_before_opt_a]: 9.596e-05 [opt_a]: 0.00375002, [2] [Cycle 1]: 0.00267003, [45] [expand_dump_flag]: 3.24001e-06 [switch_simplify]: 4.455e-05 [loop_unroll]: 3.162e-05 [a_1]: 0.00071643 [with_stream_mark]: 2.477e-05 [recompute_prepare]: 1.411e-05 [updatestate_depend_eliminate]: 5.12e-06 [updatestate_assign_eliminate]: 3.95e-06 [updatestate_loads_eliminate]: 3.97e-06 [parameter_eliminate]: 2.09e-06 [a_2]: 0.00013211 [accelerated_algorithm]: 8.52e-06 [shard]: 2.41e-06 [meta_shard_fg_expand]: 2.31998e-06 [shard_inline]: 8.65999e-06 [merge_send_recv]: 1.003e-05 [auto_parallel]: 8.86002e-06 [parallel]: 1.922e-05 [flash_sp]: 9.81998e-06 [merge_comm]: 4.95001e-06 [allreduce_fusion]: 4.35999e-06 [matmul_add_comm_reduction]: 1.115e-05 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 1.141e-05 [virtual_dataset]: 8.50999e-06 [get_grad_eliminate_]: 7.97e-06 [virtual_output]: 8.30999e-06 [merge_forward]: 5.24e-06 [cell_reuse_recompute_pass]: 1.05001e-06 [offload_activation]: 1.393e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.11e-05 [merge_recompute_call_nodes]: 2.03997e-06 [before_grad]: 1.509e-05 [set_forward_comm_id_for_comm_node_pass]: 5.79e-06 [meta_fg_expand]: 3.8e-06 [flash_sp_send_recv_attached]: 3.89002e-06 [receive_attached]: 2.63998e-06 [after_resolve]: 1.646e-05 [a_after_grad]: 1.335e-05 [renormalize]: 0.00085898 [add_forward_monad_depend]: 7.86001e-06 [auto_monad_grad]: 2.96999e-06 [auto_monad_eliminator]: 2.161e-05 [cse]: 3.771e-05 [a_3]: 7.935e-05 [Cycle 2]: 0.00106102, [45] [expand_dump_flag]: 2.19001e-06 [switch_simplify]: 1.08e-05 [loop_unroll]: 7.5e-06 [a_1]: 0.00018549 [with_stream_mark]: 1.849e-05 [recompute_prepare]: 9.41e-06 [updatestate_depend_eliminate]: 5.05999e-06 [updatestate_assign_eliminate]: 3.66001e-06 [updatestate_loads_eliminate]: 3.83999e-06 [parameter_eliminate]: 1.76e-06 [a_2]: 0.00012204 [accelerated_algorithm]: 9.79e-06 [shard]: 3.08e-06 [meta_shard_fg_expand]: 2.40002e-06 [shard_inline]: 1.093e-05 [merge_send_recv]: 8.45001e-06 [auto_parallel]: 1.06e-05 [parallel]: 7.41999e-06 [flash_sp]: 4.03999e-06 [merge_comm]: 5.29998e-06 [allreduce_fusion]: 4.28999e-06 [matmul_add_comm_reduction]: 9.00001e-06 [allreduce_slice_to_reducescatter]: 3.39991e-07 [virtual_shard_identity]: 1.083e-05 [virtual_dataset]: 7.34002e-06 [get_grad_eliminate_]: 7.13998e-06 [virtual_output]: 6.98998e-06 [merge_forward]: 5.06002e-06 [cell_reuse_recompute_pass]: 2.19999e-06 [offload_activation]: 1.158e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.181e-05 [merge_recompute_call_nodes]: 1.52999e-06 [before_grad]: 1.346e-05 [set_forward_comm_id_for_comm_node_pass]: 7.30998e-06 [meta_fg_expand]: 3.21999e-06 [flash_sp_send_recv_attached]: 1.30001e-06 [receive_attached]: 1.84e-06 [after_resolve]: 1.371e-05 [a_after_grad]: 1.272e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 4.18001e-06 [auto_monad_grad]: 1.80001e-06 [auto_monad_eliminator]: 1.453e-05 [cse]: 2.755e-05 [a_3]: 6.396e-05 [py_interpret_to_execute_after_opt_a]: 2.249e-05 [slice_cell_reuse_recomputed_activation]: 5.46e-06 [rewriter_after_opt_a]: 5.901e-05 [convert_after_rewriter]: 1.198e-05 [order_py_execute_after_rewriter]: 1.004e-05 [mutable_eliminate]: 0.00073791 [opt_b]: 0.00037142, [1] [Cycle 1]: 0.00035829, [7] [b_1]: 0.00022367 [b_2]: 1.071e-05 [updatestate_depend_eliminate]: 1.131e-05 [updatestate_assign_eliminate]: 4.03999e-06 [updatestate_loads_eliminate]: 4.07e-06 [renormalize]: 5.10016e-07 [cse]: 3.537e-05 [optimize_parallel_all_gather_comm]: 2.908e-05 [overlap_param_gather]: 4.94e-06 [cconv]: 4.062e-05 [loop_unroll]: 0.0248359 [opt_after_cconv]: 0.00019567, [1] [Cycle 1]: 0.00018241, [7] [c_1]: 4.354e-05 [parameter_eliminate]: 7.21999e-06 [updatestate_depend_eliminate]: 1.4e-05 [updatestate_assign_eliminate]: 4.27998e-06 [updatestate_loads_eliminate]: 3.95e-06 [cse]: 4.883e-05 [renormalize]: 5.79981e-07 [remove_dup_value]: 1.991e-05 [tuple_transform]: 0.00011506, [1] [Cycle 1]: 0.00010696, [4] [d_1]: 6.419e-05 [none_parameter_eliminate]: 1.86998e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 9.50001e-06 [partial_unused_args_eliminate]: 4.95999e-06 [add_recomputation]: 7.294e-05 [cse_after_recomputation]: 3.719e-05, [1] [Cycle 1]: 2.901e-05, [1] [cse]: 1.723e-05 [environ_conv]: 1.089e-05 [swap_dp_allreduce_reducescatter]: 9.73998e-06 [bias_add_comm_swap]: 5.62999e-06 [label_micro_interleaved_index]: 1.021e-05 [label_fine_grained_interleaved_index]: 5.56998e-06 [merge_cast_opt]: 3.80998e-06 [slice_recompute_activation]: 5.07e-06 [micro_interleaved_order_control]: 5.18002e-06 [assign_add_opt]: 3.93001e-06 [ForceFp32Comm]: 3.53999e-06 [remove_cast_before_assign_add]: 3.65998e-06 [full_micro_interleaved_order_control]: 4.84e-06 [reorder_send_recv_between_fp_bp]: 5.37001e-06 [comm_op_add_attrs]: 4.1e-06 [add_comm_op_reuse_tag]: 3.25998e-06 [interleave_split_concat_branches]: 3.45e-06 [interleave_parallel_branches]: 3.4e-06 [overlap_opt_shard_in_pipeline]: 3.66999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.2e-06 [control_data_broadcast_order]: 2.085e-05 [grouped_pairwise_exchange_alltoall]: 3.8e-06 [offloading_packed_experts]: 8.08001e-06 [overlap_recompute_and_grad_model_parallel]: 8.31002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.58e-06 [overlap_recompute_allgather_and_fa_grad]: 3.73001e-06 [overlap_recompute_comm]: 4.86002e-06 [overlap_grad_ring_attention]: 7.28e-06 [overlap_grad_flash_sp]: 2.968e-05 [begin_end_overlap_inline]: 2.94999e-06 [split_matmul_comm_elemetwise]: 4.54002e-06 [split_layernorm_comm]: 4.35e-06 [handle_group_info]: 3.57997e-06 [symbol_engine_optimizer]: 0.00011781, [1] [Cycle 1]: 0.00010988, [6] [build]: 5.14e-06 [elim_shapecalc]: 1.507e-05 [elim_not_effective]: 1.677e-05 [opt_reshape]: 1.061e-05 [fold_const_symbol]: 1.35e-05 [renormalize]: 1.80007e-07 [detach_backward]: 4.47e-06 [pipeline_parallel_scheduler]: 1.87999e-06 [auto_monad_reorder]: 2.572e-05 [get_jit_bprop_graph]: 2.58e-06 [rewriter_after_jit_bprop_graph]: 6.96001e-06 [opt_after_jit_grad]: 0.00063056 [validate]: 5.028e-05 Sums bootstrap : 0.000450s : 1.21% type_inference : 0.006345s : 16.97% event_method : 0.000022s : 0.06% auto_monad : 0.000068s : 0.18% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000039s : 0.10% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.09% optimize.rewriter_before_opt_a : 0.000096s : 0.26% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000055s : 0.15% optimize.opt_a.loop_unroll : 0.000039s : 0.10% optimize.opt_a.a_1 : 0.000902s : 2.41% optimize.opt_a.with_stream_mark : 0.000043s : 0.12% optimize.opt_a.recompute_prepare : 0.000024s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000254s : 0.68% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.05% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000020s : 0.05% optimize.opt_a.merge_send_recv : 0.000018s : 0.05% optimize.opt_a.auto_parallel : 0.000019s : 0.05% optimize.opt_a.parallel : 0.000027s : 0.07% optimize.opt_a.flash_sp : 0.000014s : 0.04% optimize.opt_a.merge_comm : 0.000010s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.06% optimize.opt_a.virtual_dataset : 0.000016s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000015s : 0.04% optimize.opt_a.merge_forward : 0.000010s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000026s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000063s : 0.17% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.01% optimize.opt_a.before_grad : 0.000029s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000013s : 0.04% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000030s : 0.08% optimize.opt_a.a_after_grad : 0.000026s : 0.07% optimize.opt_a.renormalize : 0.000859s : 2.30% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.03% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000036s : 0.10% optimize.opt_a.cse : 0.000065s : 0.17% optimize.opt_a.a_3 : 0.000143s : 0.38% optimize.py_interpret_to_execute_after_opt_a : 0.000022s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000059s : 0.16% optimize.convert_after_rewriter : 0.000012s : 0.03% optimize.order_py_execute_after_rewriter : 0.000010s : 0.03% optimize.mutable_eliminate : 0.000738s : 1.97% optimize.opt_b.b_1 : 0.000224s : 0.60% optimize.opt_b.b_2 : 0.000011s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000035s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000029s : 0.08% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000041s : 0.11% optimize.loop_unroll : 0.024836s : 66.44% optimize.opt_after_cconv.c_1 : 0.000044s : 0.12% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000014s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000049s : 0.13% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.05% optimize.tuple_transform.d_1 : 0.000064s : 0.17% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000073s : 0.20% optimize.cse_after_recomputation.cse : 0.000017s : 0.05% optimize.environ_conv : 0.000011s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.03% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000010s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000003s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000021s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000030s : 0.08% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000026s : 0.07% get_jit_bprop_graph : 0.000003s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.02% opt_after_jit_grad : 0.000631s : 1.69% validate : 0.000050s : 0.13% Time group info: ------[substitution.] 0.000252 39 11.84% : 0.000030s : 3: substitution.cast_eliminate 0.93% : 0.000002s : 3: substitution.elim_not_effective 0.84% : 0.000002s : 3: substitution.fold_const_symbol 3.28% : 0.000008s : 5: substitution.graph_param_transform 67.01% : 0.000169s : 4: substitution.inline 2.37% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.81% : 0.000007s : 6: substitution.remove_not_recompute_node 2.31% : 0.000006s : 4: substitution.replace_old_param 5.04% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator 3.57% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006284 2 87.59% : 0.005504s : 1: type_inference.infer 12.41% : 0.000780s : 1: type_inference.specialize ------[replace.] 0.000065 8 60.71% : 0.000040s : 4: replace.inline 39.29% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000176 8 93.81% : 0.000165s : 4: match.inline 6.19% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000259 1504 0.81% : 0.000002s : 15: predicate.accumulaten_eliminater 0.87% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 10: predicate.addn_check_dump 0.89% : 0.000002s : 15: predicate.addn_zero_filter 0.75% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.13% : 0.000006s : 25: predicate.arithmetic_simplify 1.13% : 0.000003s : 15: predicate.cast_eliminate 0.58% : 0.000001s : 10: predicate.check_bprop_eliminate 0.54% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.66% : 0.000002s : 10: predicate.depend_value_elim 0.82% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.93% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.75% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.99% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 5: predicate.elim_not_effective 0.53% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.05% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.03% : 0.000003s : 20: predicate.environ_get_depend_swap 1.71% : 0.000004s : 30: predicate.environ_get_eliminate 1.05% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.29% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.13% : 0.000005s : 23: predicate.float_depend_g_call 0.54% : 0.000001s : 10: predicate.float_environ_get_switch 0.81% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.67% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000000s : 5: predicate.graph_param_transform 0.60% : 0.000002s : 10: predicate.incorporate_call 0.54% : 0.000001s : 10: predicate.incorporate_call_switch 6.36% : 0.000016s : 68: predicate.inline 0.88% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.77% : 0.000002s : 10: predicate.less_batch_normalization 1.80% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.42% : 0.000006s : 44: predicate.load_eliminater 2.96% : 0.000008s : 5: predicate.loop_unroll_after_grad 1.92% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.77% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 10: predicate.merge_addn 0.58% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.61% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 15: predicate.minmaximum_grad 1.30% : 0.000003s : 5: predicate.mutable_eliminate 0.43% : 0.000001s : 5: predicate.opt_reshape 0.49% : 0.000001s : 5: predicate.parallel_virtual_node 1.74% : 0.000005s : 23: predicate.partial_defer_inline 1.51% : 0.000004s : 24: predicate.partial_eliminate 0.80% : 0.000002s : 15: predicate.print_const_string_wrapper 0.58% : 0.000001s : 10: predicate.reduce_all_const_elim 1.07% : 0.000003s : 15: predicate.reduce_eliminate 2.44% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.75% : 0.000002s : 10: predicate.remove_not_recompute_node 1.29% : 0.000003s : 29: predicate.replace_applicator 0.51% : 0.000001s : 10: predicate.replace_old_param 0.32% : 0.000001s : 5: predicate.reset_defer_inline 0.99% : 0.000003s : 15: predicate.reshape_eliminate 0.66% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.75% : 0.000002s : 10: predicate.same_eliminate 0.51% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.73% : 0.000002s : 10: predicate.shard_identity_eliminate 0.68% : 0.000002s : 10: predicate.special_op_eliminate 0.77% : 0.000002s : 10: predicate.specialize_transform 1.18% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.31% : 0.000003s : 23: predicate.switch_defer_inline 2.04% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.65% : 0.000012s : 74: predicate.switch_simplify 0.85% : 0.000002s : 15: predicate.tile_eliminate 0.84% : 0.000002s : 15: predicate.transpose_eliminate 1.56% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.38% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.35% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.46% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.42% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.75% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.33% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.05% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.54% : 0.000001s : 5: predicate.value_based_eliminate 0.66% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 10: predicate.virtual_output_eliminate 0.30% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000609 11 52.57% : 0.000320s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.43% : 0.000289s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.079564 192 0.01% : 0.000006s : 1: ForceFp32Comm 4.17% : 0.003321s : 1: add_attr 4.15% : 0.003304s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.10% : 0.000077s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.10% : 0.000079s : 1: auto_monad 0.04% : 0.000034s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.62% : 0.000493s : 1: bootstrap 0.06% : 0.000044s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.03% : 0.000024s : 1: control_data_broadcast_order 0.02% : 0.000015s : 1: convert_after_rewriter 0.05% : 0.000040s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000023s : 1: detach_backward 0.02% : 0.000014s : 1: environ_conv 0.04% : 0.000032s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000010s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000013s : 1: label_micro_interleaved_index 31.23% : 0.024850s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.94% : 0.000747s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.05% : 0.000039s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000026s : 1: opt.transform.mutable_eliminate 1.84% : 0.001466s : 78: opt.transform.opt_a 0.05% : 0.000042s : 1: opt.transform.opt_after_cconv 0.04% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000156s : 28: opt.transform.opt_b 0.09% : 0.000071s : 2: opt.transform.opt_trans_graph 0.06% : 0.000052s : 4: opt.transform.symbol_engine_opt 4.72% : 0.003754s : 1: opt_a 0.25% : 0.000200s : 1: opt_after_cconv 0.81% : 0.000642s : 1: opt_after_jit_grad 0.47% : 0.000376s : 1: opt_b 39.48% : 0.031412s : 1: optimize 0.04% : 0.000034s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000013s : 1: order_py_execute_after_rewriter 0.04% : 0.000033s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.06% : 0.000047s : 1: pre_auto_parallel 0.05% : 0.000038s : 1: py_interpret_to_execute 0.03% : 0.000027s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.03% : 0.000023s : 1: remove_dup_value 0.58% : 0.000464s : 1: renormalize.infer 0.48% : 0.000383s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000064s : 1: rewriter_after_opt_a 0.13% : 0.000100s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000121s : 1: symbol_engine_optimizer 0.15% : 0.000118s : 1: tuple_transform 8.03% : 0.006387s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:42.390.744 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0457747, [21] [bootstrap]: 0.0004539 [type_inference]: 0.0350766 [event_method]: 2.198e-05 [auto_monad]: 6.793e-05 [graph_reusing]: 6.19001e-06 [inline]: 2.77002e-06 [add_attr]: 0.00353864, [1] [add_attr_with_inline]: 0.00352603, [1] [Cycle 1]: 6.699e-05, [2] [tag_attr]: 2.354e-05 [meta_addattr_fg_expand]: 6.12001e-06 [parallel-infer-symbol]: 3.61999e-06 [pre_auto_parallel]: 3.959e-05 [insert-virtual-dataset]: 2.58998e-06 [parallel-infer-symbol-second]: 6.69999e-07 [dataset_repeat_opt]: 1.99e-06 [pipeline_split]: 1.82001e-06 [optimize]: 0.00575557, [53] [py_interpret_to_execute]: 3.123e-05 [rewriter_before_opt_a]: 9.458e-05 [opt_a]: 0.00326426, [2] [Cycle 1]: 0.0023925, [45] [expand_dump_flag]: 3.6e-06 [switch_simplify]: 4.683e-05 [loop_unroll]: 3.209e-05 [a_1]: 0.00070533 [with_stream_mark]: 2.016e-05 [recompute_prepare]: 1.39e-05 [updatestate_depend_eliminate]: 4.85999e-06 [updatestate_assign_eliminate]: 3.68e-06 [updatestate_loads_eliminate]: 3.65e-06 [parameter_eliminate]: 2.06998e-06 [a_2]: 0.00010413 [accelerated_algorithm]: 9.84001e-06 [shard]: 1.81998e-06 [meta_shard_fg_expand]: 2.19001e-06 [shard_inline]: 8.27998e-06 [merge_send_recv]: 1.162e-05 [auto_parallel]: 8.39998e-06 [parallel]: 1.868e-05 [flash_sp]: 1.024e-05 [merge_comm]: 4.94e-06 [allreduce_fusion]: 4.37998e-06 [matmul_add_comm_reduction]: 9.80002e-06 [allreduce_slice_to_reducescatter]: 9.09989e-07 [virtual_shard_identity]: 1.064e-05 [virtual_dataset]: 8.17e-06 [get_grad_eliminate_]: 7.49002e-06 [virtual_output]: 8.10999e-06 [merge_forward]: 5.12999e-06 [cell_reuse_recompute_pass]: 1.27999e-06 [offload_activation]: 1.118e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.869e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 1.267e-05 [set_forward_comm_id_for_comm_node_pass]: 4.94e-06 [meta_fg_expand]: 4.17e-06 [flash_sp_send_recv_attached]: 3.26999e-06 [receive_attached]: 2.32001e-06 [after_resolve]: 1.409e-05 [a_after_grad]: 1.201e-05 [renormalize]: 0.00083855 [add_forward_monad_depend]: 7.95e-06 [auto_monad_grad]: 2.52001e-06 [auto_monad_eliminator]: 1.97e-05 [cse]: 3.994e-05 [a_3]: 6.199e-05 [Cycle 2]: 0.00085975, [45] [expand_dump_flag]: 1.67001e-06 [switch_simplify]: 9.59999e-06 [loop_unroll]: 7.73001e-06 [a_1]: 0.0001797 [with_stream_mark]: 1.74e-05 [recompute_prepare]: 9.09e-06 [updatestate_depend_eliminate]: 4.17998e-06 [updatestate_assign_eliminate]: 3.04999e-06 [updatestate_loads_eliminate]: 3.04001e-06 [parameter_eliminate]: 1.37e-06 [a_2]: 9.48e-05 [accelerated_algorithm]: 8.12e-06 [shard]: 1.94999e-06 [meta_shard_fg_expand]: 2.48998e-06 [shard_inline]: 7.46999e-06 [merge_send_recv]: 2.244e-05 [auto_parallel]: 9.17001e-06 [parallel]: 6.68e-06 [flash_sp]: 4.82998e-06 [merge_comm]: 5.05999e-06 [allreduce_fusion]: 3.85e-06 [matmul_add_comm_reduction]: 9.57001e-06 [allreduce_slice_to_reducescatter]: 7.39994e-07 [virtual_shard_identity]: 1.059e-05 [virtual_dataset]: 8.27998e-06 [get_grad_eliminate_]: 7.48999e-06 [virtual_output]: 7.43e-06 [merge_forward]: 5.13002e-06 [cell_reuse_recompute_pass]: 1.77001e-06 [offload_activation]: 1.132e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.682e-05 [merge_recompute_call_nodes]: 1.35999e-06 [before_grad]: 1.232e-05 [set_forward_comm_id_for_comm_node_pass]: 4.90999e-06 [meta_fg_expand]: 3.60998e-06 [flash_sp_send_recv_attached]: 1.02e-06 [receive_attached]: 1.44998e-06 [after_resolve]: 1.381e-05 [a_after_grad]: 1.172e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 3.30998e-06 [auto_monad_grad]: 1.89999e-06 [auto_monad_eliminator]: 1.339e-05 [cse]: 2.537e-05 [a_3]: 4.887e-05 [py_interpret_to_execute_after_opt_a]: 1.569e-05 [slice_cell_reuse_recomputed_activation]: 2.56e-06 [rewriter_after_opt_a]: 4.593e-05 [convert_after_rewriter]: 8.37e-06 [order_py_execute_after_rewriter]: 5.82001e-06 [mutable_eliminate]: 0.00066601 [opt_b]: 0.00028237, [1] [Cycle 1]: 0.00027357, [7] [b_1]: 0.00017424 [b_2]: 1.052e-05 [updatestate_depend_eliminate]: 9.25999e-06 [updatestate_assign_eliminate]: 3.03998e-06 [updatestate_loads_eliminate]: 4e-06 [renormalize]: 9.00007e-07 [cse]: 3.286e-05 [optimize_parallel_all_gather_comm]: 1.918e-05 [overlap_param_gather]: 1.94e-06 [cconv]: 3.193e-05 [loop_unroll]: 0.00051142 [opt_after_cconv]: 0.00013321, [1] [Cycle 1]: 0.00012531, [7] [c_1]: 3.886e-05 [parameter_eliminate]: 4.93001e-06 [updatestate_depend_eliminate]: 8.60999e-06 [updatestate_assign_eliminate]: 3.02002e-06 [updatestate_loads_eliminate]: 3.18998e-06 [cse]: 2.929e-05 [renormalize]: 5.90022e-07 [remove_dup_value]: 1.723e-05 [tuple_transform]: 9.009e-05, [1] [Cycle 1]: 8.529e-05, [4] [d_1]: 5.531e-05 [none_parameter_eliminate]: 2.09999e-06 [renormalize]: 2.80008e-07 [switch_simplify]: 8.17e-06 [partial_unused_args_eliminate]: 1.92001e-06 [add_recomputation]: 6.381e-05 [cse_after_recomputation]: 2.678e-05, [1] [Cycle 1]: 2.171e-05, [1] [cse]: 1.567e-05 [environ_conv]: 6.98e-06 [swap_dp_allreduce_reducescatter]: 6.44999e-06 [bias_add_comm_swap]: 3.14999e-06 [label_micro_interleaved_index]: 4.3e-06 [label_fine_grained_interleaved_index]: 2.78e-06 [merge_cast_opt]: 1.54998e-06 [slice_recompute_activation]: 2.04999e-06 [micro_interleaved_order_control]: 2.66e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 8.30012e-07 [remove_cast_before_assign_add]: 1.33002e-06 [full_micro_interleaved_order_control]: 2.37001e-06 [reorder_send_recv_between_fp_bp]: 2.59999e-06 [comm_op_add_attrs]: 1.09998e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.10999e-06 [interleave_parallel_branches]: 1.12999e-06 [overlap_opt_shard_in_pipeline]: 1.49e-06 [overlap_opt_shard_grad_in_pipeline]: 1.92999e-06 [control_data_broadcast_order]: 1.636e-05 [grouped_pairwise_exchange_alltoall]: 1.66e-06 [offloading_packed_experts]: 4.57998e-06 [overlap_recompute_and_grad_model_parallel]: 5.29e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.44e-06 [overlap_recompute_comm]: 2.34001e-06 [overlap_grad_ring_attention]: 5.10999e-06 [overlap_grad_flash_sp]: 2.421e-05 [begin_end_overlap_inline]: 7.29982e-07 [split_matmul_comm_elemetwise]: 2.39999e-06 [split_layernorm_comm]: 1.86e-06 [handle_group_info]: 9.80013e-07 [symbol_engine_optimizer]: 9.438e-05, [1] [Cycle 1]: 8.934e-05, [6] [build]: 5.50001e-06 [elim_shapecalc]: 1.437e-05 [elim_not_effective]: 1.774e-05 [opt_reshape]: 8.58001e-06 [fold_const_symbol]: 1.284e-05 [renormalize]: 2.00002e-07 [detach_backward]: 2.27999e-06 [pipeline_parallel_scheduler]: 1.52999e-06 [auto_monad_reorder]: 2.097e-05 [get_jit_bprop_graph]: 2.16998e-06 [rewriter_after_jit_bprop_graph]: 6.10002e-06 [opt_after_jit_grad]: 0.00055212 [validate]: 5.109e-05 Sums bootstrap : 0.000454s : 1.10% type_inference : 0.035077s : 85.15% event_method : 0.000022s : 0.05% auto_monad : 0.000068s : 0.16% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000040s : 0.10% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000031s : 0.08% optimize.rewriter_before_opt_a : 0.000095s : 0.23% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000056s : 0.14% optimize.opt_a.loop_unroll : 0.000040s : 0.10% optimize.opt_a.a_1 : 0.000885s : 2.15% optimize.opt_a.with_stream_mark : 0.000038s : 0.09% optimize.opt_a.recompute_prepare : 0.000023s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000199s : 0.48% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.04% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.04% optimize.opt_a.merge_send_recv : 0.000034s : 0.08% optimize.opt_a.auto_parallel : 0.000018s : 0.04% optimize.opt_a.parallel : 0.000025s : 0.06% optimize.opt_a.flash_sp : 0.000015s : 0.04% optimize.opt_a.merge_comm : 0.000010s : 0.02% optimize.opt_a.allreduce_fusion : 0.000008s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.05% optimize.opt_a.virtual_dataset : 0.000016s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000016s : 0.04% optimize.opt_a.merge_forward : 0.000010s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000022s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.02% optimize.opt_a.meta_fg_expand : 0.000008s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000028s : 0.07% optimize.opt_a.a_after_grad : 0.000024s : 0.06% optimize.opt_a.renormalize : 0.000839s : 2.04% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.08% optimize.opt_a.cse : 0.000065s : 0.16% optimize.opt_a.a_3 : 0.000111s : 0.27% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000046s : 0.11% optimize.convert_after_rewriter : 0.000008s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.01% optimize.mutable_eliminate : 0.000666s : 1.62% optimize.opt_b.b_1 : 0.000174s : 0.42% optimize.opt_b.b_2 : 0.000011s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000033s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.05% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000032s : 0.08% optimize.loop_unroll : 0.000511s : 1.24% optimize.opt_after_cconv.c_1 : 0.000039s : 0.09% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000029s : 0.07% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.04% optimize.tuple_transform.d_1 : 0.000055s : 0.13% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000064s : 0.15% optimize.cse_after_recomputation.cse : 0.000016s : 0.04% optimize.environ_conv : 0.000007s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000016s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000024s : 0.06% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000006s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.05% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000552s : 1.34% validate : 0.000051s : 0.12% Time group info: ------[substitution.] 0.000238 39 11.53% : 0.000027s : 3: substitution.cast_eliminate 0.96% : 0.000002s : 3: substitution.elim_not_effective 0.89% : 0.000002s : 3: substitution.fold_const_symbol 3.10% : 0.000007s : 5: substitution.graph_param_transform 67.85% : 0.000161s : 4: substitution.inline 1.90% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.49% : 0.000006s : 6: substitution.remove_not_recompute_node 2.32% : 0.000006s : 4: substitution.replace_old_param 5.32% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator 3.66% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.035006 2 97.69% : 0.034196s : 1: type_inference.infer 2.31% : 0.000810s : 1: type_inference.specialize ------[replace.] 0.000066 8 61.91% : 0.000041s : 4: replace.inline 38.09% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000170 8 93.55% : 0.000159s : 4: match.inline 6.45% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000247 1504 0.85% : 0.000002s : 15: predicate.accumulaten_eliminater 0.81% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 10: predicate.addn_check_dump 0.85% : 0.000002s : 15: predicate.addn_zero_filter 0.77% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.21% : 0.000005s : 25: predicate.arithmetic_simplify 1.16% : 0.000003s : 15: predicate.cast_eliminate 0.70% : 0.000002s : 10: predicate.check_bprop_eliminate 0.59% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.62% : 0.000002s : 10: predicate.depend_value_elim 0.90% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.00% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.94% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.08% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 5: predicate.elim_not_effective 0.61% : 0.000002s : 5: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_depend_swap 1.66% : 0.000004s : 30: predicate.environ_get_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.30% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.23% : 0.000006s : 23: predicate.float_depend_g_call 0.62% : 0.000002s : 10: predicate.float_environ_get_switch 0.83% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.67% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000000s : 5: predicate.graph_param_transform 0.62% : 0.000002s : 10: predicate.incorporate_call 0.55% : 0.000001s : 10: predicate.incorporate_call_switch 6.15% : 0.000015s : 68: predicate.inline 0.85% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 10: predicate.less_batch_normalization 1.82% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.44% : 0.000006s : 44: predicate.load_eliminater 1.11% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.14% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.64% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.62% : 0.000002s : 10: predicate.merge_addn 0.67% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 15: predicate.minmaximum_grad 1.31% : 0.000003s : 5: predicate.mutable_eliminate 0.41% : 0.000001s : 5: predicate.opt_reshape 0.49% : 0.000001s : 5: predicate.parallel_virtual_node 1.85% : 0.000005s : 23: predicate.partial_defer_inline 1.59% : 0.000004s : 24: predicate.partial_eliminate 0.90% : 0.000002s : 15: predicate.print_const_string_wrapper 0.58% : 0.000001s : 10: predicate.reduce_all_const_elim 1.12% : 0.000003s : 15: predicate.reduce_eliminate 2.45% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.55% : 0.000001s : 10: predicate.remove_not_recompute_node 1.28% : 0.000003s : 29: predicate.replace_applicator 0.51% : 0.000001s : 10: predicate.replace_old_param 0.24% : 0.000001s : 5: predicate.reset_defer_inline 0.95% : 0.000002s : 15: predicate.reshape_eliminate 0.59% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 5: predicate.row_tensor_eliminate 0.74% : 0.000002s : 10: predicate.same_eliminate 0.49% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 10: predicate.shard_identity_eliminate 0.73% : 0.000002s : 10: predicate.special_op_eliminate 0.82% : 0.000002s : 10: predicate.specialize_transform 0.90% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.70% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.42% : 0.000004s : 23: predicate.switch_defer_inline 1.95% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.85% : 0.000012s : 74: predicate.switch_simplify 0.88% : 0.000002s : 15: predicate.tile_eliminate 0.85% : 0.000002s : 15: predicate.transpose_eliminate 1.50% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.65% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.81% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.40% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.08% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.56% : 0.000001s : 5: predicate.value_based_eliminate 0.80% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.77% : 0.000002s : 10: predicate.virtual_output_eliminate 0.34% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000645 11 56.48% : 0.000364s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.52% : 0.000281s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.057556 192 0.01% : 0.000003s : 1: ForceFp32Comm 6.16% : 0.003545s : 1: add_attr 6.13% : 0.003530s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.12% : 0.000068s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.13% : 0.000075s : 1: auto_monad 0.04% : 0.000026s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.84% : 0.000482s : 1: bootstrap 0.06% : 0.000036s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000020s : 1: control_data_broadcast_order 0.02% : 0.000012s : 1: convert_after_rewriter 0.05% : 0.000030s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000010s : 1: environ_conv 0.05% : 0.000029s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.91% : 0.000522s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.18% : 0.000678s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.04% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000021s : 1: opt.transform.mutable_eliminate 2.48% : 0.001426s : 78: opt.transform.opt_a 0.06% : 0.000037s : 1: opt.transform.opt_after_cconv 0.06% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.26% : 0.000151s : 28: opt.transform.opt_b 0.11% : 0.000061s : 2: opt.transform.opt_trans_graph 0.08% : 0.000049s : 4: opt.transform.symbol_engine_opt 5.68% : 0.003267s : 1: opt_a 0.24% : 0.000137s : 1: opt_after_cconv 0.98% : 0.000564s : 1: opt_after_jit_grad 0.50% : 0.000287s : 1: opt_b 10.01% : 0.005761s : 1: optimize 0.04% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.05% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000044s : 1: pre_auto_parallel 0.06% : 0.000036s : 1: py_interpret_to_execute 0.03% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000020s : 1: remove_dup_value 0.81% : 0.000465s : 1: renormalize.infer 0.63% : 0.000364s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000052s : 1: rewriter_after_opt_a 0.17% : 0.000099s : 1: rewriter_before_opt_a 0.01% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.17% : 0.000097s : 1: symbol_engine_optimizer 0.16% : 0.000093s : 1: tuple_transform 60.98% : 0.035097s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:42.991.647 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:42.991.942 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0455835, [21] [bootstrap]: 0.00041822 [type_inference]: 0.0340382 [event_method]: 2.032e-05 [auto_monad]: 6.344e-05 [graph_reusing]: 6.94999e-06 [inline]: 2.14999e-06 [add_attr]: 0.00342902, [1] [add_attr_with_inline]: 0.00341722, [1] [Cycle 1]: 9.394e-05, [2] [tag_attr]: 2.348e-05 [meta_addattr_fg_expand]: 5.64998e-06 [parallel-infer-symbol]: 3.56001e-06 [pre_auto_parallel]: 3.969e-05 [insert-virtual-dataset]: 2.56e-06 [parallel-infer-symbol-second]: 7.80012e-07 [dataset_repeat_opt]: 2.58003e-06 [pipeline_split]: 1.80001e-06 [optimize]: 0.00617392, [53] [py_interpret_to_execute]: 3.457e-05 [rewriter_before_opt_a]: 9.24e-05 [opt_a]: 0.0034776, [2] [Cycle 1]: 0.00246126, [45] [expand_dump_flag]: 3.39001e-06 [switch_simplify]: 4.288e-05 [loop_unroll]: 3.076e-05 [a_1]: 0.00064806 [with_stream_mark]: 2.335e-05 [recompute_prepare]: 1.114e-05 [updatestate_depend_eliminate]: 5.04e-06 [updatestate_assign_eliminate]: 3.23e-06 [updatestate_loads_eliminate]: 3.41001e-06 [parameter_eliminate]: 2.91999e-06 [a_2]: 0.00011252 [accelerated_algorithm]: 8.48999e-06 [shard]: 2.45002e-06 [meta_shard_fg_expand]: 2.03997e-06 [shard_inline]: 7.6e-06 [merge_send_recv]: 1.025e-05 [auto_parallel]: 8.43001e-06 [parallel]: 2.183e-05 [flash_sp]: 1.116e-05 [merge_comm]: 4.35e-06 [allreduce_fusion]: 3.73999e-06 [matmul_add_comm_reduction]: 1.02e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 1.163e-05 [virtual_dataset]: 7.05e-06 [get_grad_eliminate_]: 6.21998e-06 [virtual_output]: 6.79001e-06 [merge_forward]: 4.90001e-06 [cell_reuse_recompute_pass]: 2.06e-06 [offload_activation]: 1.214e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.808e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.268e-05 [set_forward_comm_id_for_comm_node_pass]: 3.89002e-06 [meta_fg_expand]: 3.21999e-06 [flash_sp_send_recv_attached]: 3.5e-06 [receive_attached]: 2.68e-06 [after_resolve]: 1.454e-05 [a_after_grad]: 1.137e-05 [renormalize]: 0.00080636 [add_forward_monad_depend]: 7.77998e-06 [auto_monad_grad]: 2.63998e-06 [auto_monad_eliminator]: 1.898e-05 [cse]: 3.019e-05 [a_3]: 6.818e-05 [Cycle 2]: 0.00100105, [45] [expand_dump_flag]: 2.17001e-06 [switch_simplify]: 8.88002e-06 [loop_unroll]: 6.09999e-06 [a_1]: 0.00019696 [with_stream_mark]: 2.032e-05 [recompute_prepare]: 9.15999e-06 [updatestate_depend_eliminate]: 3.90998e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 2.56e-06 [parameter_eliminate]: 1.55001e-06 [a_2]: 0.00010461 [accelerated_algorithm]: 7.81001e-06 [shard]: 2.48e-06 [meta_shard_fg_expand]: 2.14e-06 [shard_inline]: 7.03e-06 [merge_send_recv]: 7.66999e-06 [auto_parallel]: 7.49002e-06 [parallel]: 7.23e-06 [flash_sp]: 4.15e-06 [merge_comm]: 1.066e-05 [allreduce_fusion]: 3.42002e-06 [matmul_add_comm_reduction]: 8.3e-06 [allreduce_slice_to_reducescatter]: 5.00004e-07 [virtual_shard_identity]: 9.02e-06 [virtual_dataset]: 7.01999e-06 [get_grad_eliminate_]: 6.28e-06 [virtual_output]: 5.91998e-06 [merge_forward]: 3.91999e-06 [cell_reuse_recompute_pass]: 2.02001e-06 [offload_activation]: 9.17999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.864e-05 [merge_recompute_call_nodes]: 1.12999e-06 [before_grad]: 1.063e-05 [set_forward_comm_id_for_comm_node_pass]: 6.21e-06 [meta_fg_expand]: 3.28e-06 [flash_sp_send_recv_attached]: 1.39998e-06 [receive_attached]: 1.73002e-06 [after_resolve]: 1.477e-05 [a_after_grad]: 1.087e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.81e-06 [auto_monad_grad]: 2.29999e-06 [auto_monad_eliminator]: 1.23e-05 [cse]: 2.142e-05 [a_3]: 5.384e-05 [py_interpret_to_execute_after_opt_a]: 1.832e-05 [slice_cell_reuse_recomputed_activation]: 5.19e-06 [rewriter_after_opt_a]: 4.473e-05 [convert_after_rewriter]: 9.81998e-06 [order_py_execute_after_rewriter]: 7.72002e-06 [mutable_eliminate]: 0.00069583 [opt_b]: 0.00031593, [1] [Cycle 1]: 0.00030359, [7] [b_1]: 0.00019472 [b_2]: 9.00999e-06 [updatestate_depend_eliminate]: 9.76e-06 [updatestate_assign_eliminate]: 2.58998e-06 [updatestate_loads_eliminate]: 2.98e-06 [renormalize]: 7.30011e-07 [cse]: 2.428e-05 [optimize_parallel_all_gather_comm]: 2.216e-05 [overlap_param_gather]: 4.78001e-06 [cconv]: 3.635e-05 [loop_unroll]: 0.00051846 [opt_after_cconv]: 0.00014033, [1] [Cycle 1]: 0.00013032, [7] [c_1]: 3.125e-05 [parameter_eliminate]: 4.71002e-06 [updatestate_depend_eliminate]: 8.00999e-06 [updatestate_assign_eliminate]: 2.86999e-06 [updatestate_loads_eliminate]: 3.01001e-06 [cse]: 2.348e-05 [renormalize]: 7.10017e-07 [remove_dup_value]: 1.808e-05 [tuple_transform]: 9.77e-05, [1] [Cycle 1]: 8.96e-05, [4] [d_1]: 4.792e-05 [none_parameter_eliminate]: 1.93997e-06 [renormalize]: 2.60014e-07 [switch_simplify]: 7.08e-06 [partial_unused_args_eliminate]: 4.68999e-06 [add_recomputation]: 5.711e-05 [cse_after_recomputation]: 3.274e-05, [1] [Cycle 1]: 2.497e-05, [1] [cse]: 1.291e-05 [environ_conv]: 9.87001e-06 [swap_dp_allreduce_reducescatter]: 9.56e-06 [bias_add_comm_swap]: 5.59e-06 [label_micro_interleaved_index]: 7.22002e-06 [label_fine_grained_interleaved_index]: 5.51e-06 [merge_cast_opt]: 3.83001e-06 [slice_recompute_activation]: 4.53999e-06 [micro_interleaved_order_control]: 4.93001e-06 [assign_add_opt]: 3.93999e-06 [ForceFp32Comm]: 3.58999e-06 [remove_cast_before_assign_add]: 4.32e-06 [full_micro_interleaved_order_control]: 4.75001e-06 [reorder_send_recv_between_fp_bp]: 5.42999e-06 [comm_op_add_attrs]: 3.57997e-06 [add_comm_op_reuse_tag]: 3.16001e-06 [interleave_split_concat_branches]: 3.53999e-06 [interleave_parallel_branches]: 3.4e-06 [overlap_opt_shard_in_pipeline]: 3.91001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.07e-06 [control_data_broadcast_order]: 2.071e-05 [grouped_pairwise_exchange_alltoall]: 4.38999e-06 [offloading_packed_experts]: 8.17e-06 [overlap_recompute_and_grad_model_parallel]: 8.33001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.48e-06 [overlap_recompute_allgather_and_fa_grad]: 3.66999e-06 [overlap_recompute_comm]: 4.84e-06 [overlap_grad_ring_attention]: 6.96999e-06 [overlap_grad_flash_sp]: 2.679e-05 [begin_end_overlap_inline]: 3.90998e-06 [split_matmul_comm_elemetwise]: 4.77e-06 [split_layernorm_comm]: 4.47e-06 [handle_group_info]: 3.81999e-06 [symbol_engine_optimizer]: 0.00011403, [1] [Cycle 1]: 0.0001059, [6] [build]: 4.38001e-06 [elim_shapecalc]: 1.566e-05 [elim_not_effective]: 1.534e-05 [opt_reshape]: 7.55998e-06 [fold_const_symbol]: 1.055e-05 [renormalize]: 1.69995e-07 [detach_backward]: 5.89e-06 [pipeline_parallel_scheduler]: 2.42001e-06 [auto_monad_reorder]: 2.488e-05 [get_jit_bprop_graph]: 1.69e-06 [rewriter_after_jit_bprop_graph]: 7.48e-06 [opt_after_jit_grad]: 0.00065862 [validate]: 5.095e-05 Sums bootstrap : 0.000418s : 1.04% type_inference : 0.034038s : 84.55% event_method : 0.000020s : 0.05% auto_monad : 0.000063s : 0.16% graph_reusing : 0.000007s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000040s : 0.10% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000035s : 0.09% optimize.rewriter_before_opt_a : 0.000092s : 0.23% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000052s : 0.13% optimize.opt_a.loop_unroll : 0.000037s : 0.09% optimize.opt_a.a_1 : 0.000845s : 2.10% optimize.opt_a.with_stream_mark : 0.000044s : 0.11% optimize.opt_a.recompute_prepare : 0.000020s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000217s : 0.54% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.04% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000015s : 0.04% optimize.opt_a.merge_send_recv : 0.000018s : 0.04% optimize.opt_a.auto_parallel : 0.000016s : 0.04% optimize.opt_a.parallel : 0.000029s : 0.07% optimize.opt_a.flash_sp : 0.000015s : 0.04% optimize.opt_a.merge_comm : 0.000015s : 0.04% optimize.opt_a.allreduce_fusion : 0.000007s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.05% optimize.opt_a.virtual_dataset : 0.000014s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.03% optimize.opt_a.virtual_output : 0.000013s : 0.03% optimize.opt_a.merge_forward : 0.000009s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000023s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.03% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000029s : 0.07% optimize.opt_a.a_after_grad : 0.000022s : 0.06% optimize.opt_a.renormalize : 0.000806s : 2.00% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.03% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.08% optimize.opt_a.cse : 0.000052s : 0.13% optimize.opt_a.a_3 : 0.000122s : 0.30% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000045s : 0.11% optimize.convert_after_rewriter : 0.000010s : 0.02% optimize.order_py_execute_after_rewriter : 0.000008s : 0.02% optimize.mutable_eliminate : 0.000696s : 1.73% optimize.opt_b.b_1 : 0.000195s : 0.48% optimize.opt_b.b_2 : 0.000009s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.06% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000036s : 0.09% optimize.loop_unroll : 0.000518s : 1.29% optimize.opt_after_cconv.c_1 : 0.000031s : 0.08% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000023s : 0.06% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.04% optimize.tuple_transform.d_1 : 0.000048s : 0.12% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000057s : 0.14% optimize.cse_after_recomputation.cse : 0.000013s : 0.03% optimize.environ_conv : 0.000010s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000021s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000027s : 0.07% optimize.begin_end_overlap_inline : 0.000004s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000025s : 0.06% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.02% opt_after_jit_grad : 0.000659s : 1.64% validate : 0.000051s : 0.13% Time group info: ------[substitution.] 0.000211 29 0.89% : 0.000002s : 2: substitution.elim_not_effective 0.66% : 0.000001s : 2: substitution.fold_const_symbol 2.89% : 0.000006s : 4: substitution.graph_param_transform 76.08% : 0.000161s : 4: substitution.inline 2.31% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.28% : 0.000005s : 4: substitution.remove_not_recompute_node 3.06% : 0.000006s : 4: substitution.replace_old_param 7.66% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator 4.17% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.033979 2 97.93% : 0.033276s : 1: type_inference.infer 2.07% : 0.000702s : 1: type_inference.specialize ------[replace.] 0.000065 8 64.44% : 0.000042s : 4: replace.inline 35.56% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000172 8 91.70% : 0.000158s : 4: match.inline 8.30% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000281 1278 0.65% : 0.000002s : 13: predicate.accumulaten_eliminater 1.02% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.43% : 0.000001s : 8: predicate.addn_check_dump 0.72% : 0.000002s : 13: predicate.addn_zero_filter 0.62% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.67% : 0.000005s : 21: predicate.arithmetic_simplify 0.65% : 0.000002s : 13: predicate.cast_eliminate 0.56% : 0.000002s : 8: predicate.check_bprop_eliminate 0.44% : 0.000001s : 8: predicate.compare_switch_simplify 0.14% : 0.000000s : 4: predicate.const_output_eliminate 0.47% : 0.000001s : 8: predicate.depend_value_elim 0.68% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.84% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.64% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.97% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 4: predicate.elim_not_effective 0.55% : 0.000002s : 4: predicate.elim_shapecalc_of_broadcastargs 0.82% : 0.000002s : 17: predicate.environ_add_const_eliminate 0.83% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.75% : 0.000002s : 17: predicate.environ_get_depend_swap 1.25% : 0.000004s : 25: predicate.environ_get_eliminate 0.83% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.11% : 0.000003s : 21: predicate.exchange_switch_depend_value 1.89% : 0.000005s : 21: predicate.float_depend_g_call 0.43% : 0.000001s : 8: predicate.float_environ_get_switch 0.61% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 4: predicate.fold_const_symbol 0.48% : 0.000001s : 8: predicate.get_grad_eliminate 0.16% : 0.000000s : 4: predicate.graph_param_transform 0.44% : 0.000001s : 8: predicate.incorporate_call 0.36% : 0.000001s : 8: predicate.incorporate_call_switch 4.79% : 0.000013s : 58: predicate.inline 0.81% : 0.000002s : 8: predicate.inline_without_move 0.23% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.53% : 0.000001s : 8: predicate.less_batch_normalization 1.44% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 1.84% : 0.000005s : 38: predicate.load_eliminater 1.32% : 0.000004s : 4: predicate.loop_unroll_after_grad 1.75% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.23% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.41% : 0.000001s : 8: predicate.merge_addn 0.45% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.47% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.58% : 0.000002s : 13: predicate.minmaximum_grad 1.15% : 0.000003s : 4: predicate.mutable_eliminate 0.32% : 0.000001s : 4: predicate.opt_reshape 0.37% : 0.000001s : 4: predicate.parallel_virtual_node 1.45% : 0.000004s : 21: predicate.partial_defer_inline 1.17% : 0.000003s : 21: predicate.partial_eliminate 0.63% : 0.000002s : 13: predicate.print_const_string_wrapper 0.45% : 0.000001s : 8: predicate.reduce_all_const_elim 0.79% : 0.000002s : 13: predicate.reduce_eliminate 1.79% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 8: predicate.remove_not_recompute_node 1.05% : 0.000003s : 25: predicate.replace_applicator 0.44% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 0.63% : 0.000002s : 13: predicate.reshape_eliminate 0.56% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 4: predicate.row_tensor_eliminate 0.58% : 0.000002s : 8: predicate.same_eliminate 0.33% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.89% : 0.000002s : 8: predicate.shard_identity_eliminate 0.58% : 0.000002s : 8: predicate.special_op_eliminate 0.55% : 0.000002s : 8: predicate.specialize_transform 0.81% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.23% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.10% : 0.000003s : 21: predicate.switch_defer_inline 1.53% : 0.000004s : 29: predicate.switch_layer_defer_inline 25.91% : 0.000073s : 67: predicate.switch_simplify 0.64% : 0.000002s : 13: predicate.tile_eliminate 0.64% : 0.000002s : 13: predicate.transpose_eliminate 1.16% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.13% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.02% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.76% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.04% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 1.69% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.27% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 1.82% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.33% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 4: predicate.value_based_eliminate 0.52% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.52% : 0.000001s : 8: predicate.virtual_output_eliminate 0.23% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000545 11 51.30% : 0.000279s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.70% : 0.000265s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.057474 192 0.01% : 0.000006s : 1: ForceFp32Comm 5.98% : 0.003440s : 1: add_attr 5.95% : 0.003421s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.11% : 0.000061s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.13% : 0.000072s : 1: auto_monad 0.06% : 0.000033s : 1: auto_monad_reorder 0.01% : 0.000007s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.80% : 0.000461s : 1: bootstrap 0.07% : 0.000040s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.04% : 0.000024s : 1: control_data_broadcast_order 0.02% : 0.000013s : 1: convert_after_rewriter 0.06% : 0.000037s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.05% : 0.000029s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.05% : 0.000031s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000009s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 0.91% : 0.000526s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.22% : 0.000703s : 1: mutable_eliminate 0.02% : 0.000011s : 1: offloading_packed_experts 0.04% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000019s : 1: opt.transform.mutable_eliminate 2.28% : 0.001311s : 78: opt.transform.opt_a 0.05% : 0.000030s : 1: opt.transform.opt_after_cconv 0.06% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.22% : 0.000125s : 28: opt.transform.opt_b 0.09% : 0.000053s : 2: opt.transform.opt_trans_graph 0.08% : 0.000045s : 4: opt.transform.symbol_engine_opt 6.06% : 0.003481s : 1: opt_a 0.25% : 0.000144s : 1: opt_after_cconv 1.17% : 0.000674s : 1: opt_after_jit_grad 0.56% : 0.000320s : 1: opt_b 11.37% : 0.006534s : 1: optimize 0.04% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000011s : 1: order_py_execute_after_rewriter 0.05% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000011s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.08% : 0.000048s : 1: pre_auto_parallel 0.07% : 0.000039s : 1: py_interpret_to_execute 0.04% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.04% : 0.000021s : 1: remove_dup_value 0.74% : 0.000425s : 1: renormalize.infer 0.64% : 0.000369s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000049s : 1: rewriter_after_opt_a 0.17% : 0.000096s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.02% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.20% : 0.000117s : 1: symbol_engine_optimizer 0.18% : 0.000101s : 1: tuple_transform 59.30% : 0.034082s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:43.526.441 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0298732, [21] [bootstrap]: 0.00047143 [type_inference]: 0.0187722 [event_method]: 2.025e-05 [auto_monad]: 6.628e-05 [graph_reusing]: 5.91e-06 [inline]: 2.89001e-06 [add_attr]: 0.00379132, [1] [add_attr_with_inline]: 0.00377702, [1] [Cycle 1]: 7.757e-05, [2] [tag_attr]: 2.356e-05 [meta_addattr_fg_expand]: 6.41998e-06 [parallel-infer-symbol]: 3.76999e-06 [pre_auto_parallel]: 4.068e-05 [insert-virtual-dataset]: 2.54001e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 2.12999e-06 [pipeline_split]: 1.66998e-06 [optimize]: 0.00589924, [53] [py_interpret_to_execute]: 3.052e-05 [rewriter_before_opt_a]: 9.681e-05 [opt_a]: 0.00342421, [2] [Cycle 1]: 0.00270693, [45] [expand_dump_flag]: 3.33998e-06 [switch_simplify]: 4.697e-05 [loop_unroll]: 3.095e-05 [a_1]: 0.00095154 [with_stream_mark]: 2.178e-05 [recompute_prepare]: 1.282e-05 [updatestate_depend_eliminate]: 4.26001e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 3.44001e-06 [parameter_eliminate]: 2.14e-06 [a_2]: 8.463e-05 [accelerated_algorithm]: 7.55e-06 [shard]: 2.69999e-06 [meta_shard_fg_expand]: 2.54001e-06 [shard_inline]: 6.91001e-06 [merge_send_recv]: 1.028e-05 [auto_parallel]: 8.84998e-06 [parallel]: 1.96e-05 [flash_sp]: 1.035e-05 [merge_comm]: 4.45999e-06 [allreduce_fusion]: 3.42002e-06 [matmul_add_comm_reduction]: 9.71e-06 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 1.055e-05 [virtual_dataset]: 7.63999e-06 [get_grad_eliminate_]: 6.25997e-06 [virtual_output]: 6.69999e-06 [merge_forward]: 4.97999e-06 [cell_reuse_recompute_pass]: 1.71e-06 [offload_activation]: 1.139e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.456e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.117e-05 [set_forward_comm_id_for_comm_node_pass]: 5.04003e-06 [meta_fg_expand]: 0.000117 [flash_sp_send_recv_attached]: 4.41002e-06 [receive_attached]: 3.01001e-06 [after_resolve]: 2.375e-05 [a_after_grad]: 1.47e-05 [renormalize]: 0.0007549 [add_forward_monad_depend]: 4.68001e-06 [auto_monad_grad]: 3.25e-06 [auto_monad_eliminator]: 1.503e-05 [cse]: 2.943e-05 [a_3]: 5.655e-05 [Cycle 2]: 0.0007039, [45] [expand_dump_flag]: 1.87999e-06 [switch_simplify]: 9.05999e-06 [loop_unroll]: 6.96999e-06 [a_1]: 0.00013789 [with_stream_mark]: 1.581e-05 [recompute_prepare]: 6.95002e-06 [updatestate_depend_eliminate]: 3.38999e-06 [updatestate_assign_eliminate]: 2.87002e-06 [updatestate_loads_eliminate]: 2.83998e-06 [parameter_eliminate]: 1.67001e-06 [a_2]: 7.398e-05 [accelerated_algorithm]: 7.25e-06 [shard]: 2.01e-06 [meta_shard_fg_expand]: 1.82001e-06 [shard_inline]: 6.29001e-06 [merge_send_recv]: 7.05e-06 [auto_parallel]: 8.60001e-06 [parallel]: 7.98001e-06 [flash_sp]: 4.48999e-06 [merge_comm]: 3.56001e-06 [allreduce_fusion]: 3.76999e-06 [matmul_add_comm_reduction]: 7.66001e-06 [allreduce_slice_to_reducescatter]: 5.59987e-07 [virtual_shard_identity]: 7.43e-06 [virtual_dataset]: 6.65002e-06 [get_grad_eliminate_]: 7.16001e-06 [virtual_output]: 5.91e-06 [merge_forward]: 3.93001e-06 [cell_reuse_recompute_pass]: 3.01001e-06 [offload_activation]: 8.80999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.414e-05 [merge_recompute_call_nodes]: 2.04e-06 [before_grad]: 1.033e-05 [set_forward_comm_id_for_comm_node_pass]: 3.52002e-06 [meta_fg_expand]: 2.69001e-06 [flash_sp_send_recv_attached]: 1.12e-06 [receive_attached]: 2.06003e-06 [after_resolve]: 1.237e-05 [a_after_grad]: 9.49e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.27999e-06 [auto_monad_grad]: 1.42e-06 [auto_monad_eliminator]: 7.78001e-06 [cse]: 1.46e-05 [a_3]: 3.69e-05 [py_interpret_to_execute_after_opt_a]: 1.295e-05 [slice_cell_reuse_recomputed_activation]: 1.84998e-06 [rewriter_after_opt_a]: 3.754e-05 [convert_after_rewriter]: 6.72002e-06 [order_py_execute_after_rewriter]: 5.14998e-06 [mutable_eliminate]: 0.00068457 [opt_b]: 0.00031689, [1] [Cycle 1]: 0.00030894, [7] [b_1]: 0.0002229 [b_2]: 9.29e-06 [updatestate_depend_eliminate]: 5.57001e-06 [updatestate_assign_eliminate]: 2.54001e-06 [updatestate_loads_eliminate]: 2.52001e-06 [renormalize]: 5.59987e-07 [cse]: 2.396e-05 [optimize_parallel_all_gather_comm]: 1.756e-05 [overlap_param_gather]: 2.01003e-06 [cconv]: 2.607e-05 [loop_unroll]: 0.00054155 [opt_after_cconv]: 0.00010998, [1] [Cycle 1]: 0.00010338, [7] [c_1]: 3.282e-05 [parameter_eliminate]: 3.13e-06 [updatestate_depend_eliminate]: 5.27999e-06 [updatestate_assign_eliminate]: 2.76999e-06 [updatestate_loads_eliminate]: 2.37001e-06 [cse]: 2.105e-05 [renormalize]: 5.19998e-07 [remove_dup_value]: 1.364e-05 [tuple_transform]: 7.946e-05, [1] [Cycle 1]: 7.476e-05, [4] [d_1]: 4.664e-05 [none_parameter_eliminate]: 1.60001e-06 [renormalize]: 2.89991e-07 [switch_simplify]: 6.87002e-06 [partial_unused_args_eliminate]: 1.87001e-06 [add_recomputation]: 5.147e-05 [cse_after_recomputation]: 2.352e-05, [1] [Cycle 1]: 1.829e-05, [1] [cse]: 1.231e-05 [environ_conv]: 5.54e-06 [swap_dp_allreduce_reducescatter]: 5.79e-06 [bias_add_comm_swap]: 3.21999e-06 [label_micro_interleaved_index]: 5.00999e-06 [label_fine_grained_interleaved_index]: 3.28e-06 [merge_cast_opt]: 1.39e-06 [slice_recompute_activation]: 2.29001e-06 [micro_interleaved_order_control]: 2.38002e-06 [assign_add_opt]: 1.57001e-06 [ForceFp32Comm]: 8.10018e-07 [remove_cast_before_assign_add]: 1.09e-06 [full_micro_interleaved_order_control]: 2.37999e-06 [reorder_send_recv_between_fp_bp]: 2.73e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 1.00999e-06 [interleave_split_concat_branches]: 1.05001e-06 [interleave_parallel_branches]: 1.22e-06 [overlap_opt_shard_in_pipeline]: 1.54e-06 [overlap_opt_shard_grad_in_pipeline]: 1.96998e-06 [control_data_broadcast_order]: 1.305e-05 [grouped_pairwise_exchange_alltoall]: 1.52999e-06 [offloading_packed_experts]: 3.83001e-06 [overlap_recompute_and_grad_model_parallel]: 5.30001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.41e-06 [overlap_grad_ring_attention]: 4.83001e-06 [overlap_grad_flash_sp]: 2.116e-05 [begin_end_overlap_inline]: 6.29982e-07 [split_matmul_comm_elemetwise]: 2.26e-06 [split_layernorm_comm]: 1.79e-06 [handle_group_info]: 1.24e-06 [symbol_engine_optimizer]: 7.973e-05, [1] [Cycle 1]: 7.501e-05, [6] [build]: 3.43e-06 [elim_shapecalc]: 1.087e-05 [elim_not_effective]: 1.272e-05 [opt_reshape]: 7.73001e-06 [fold_const_symbol]: 1.091e-05 [renormalize]: 2.00002e-07 [detach_backward]: 2.04e-06 [pipeline_parallel_scheduler]: 1.50999e-06 [auto_monad_reorder]: 1.792e-05 [get_jit_bprop_graph]: 1.80001e-06 [rewriter_after_jit_bprop_graph]: 4.00998e-06 [opt_after_jit_grad]: 0.00055124 [validate]: 4.575e-05 Sums bootstrap : 0.000471s : 1.89% type_inference : 0.018772s : 75.16% event_method : 0.000020s : 0.08% auto_monad : 0.000066s : 0.27% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.09% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000041s : 0.16% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.12% optimize.rewriter_before_opt_a : 0.000097s : 0.39% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000056s : 0.22% optimize.opt_a.loop_unroll : 0.000038s : 0.15% optimize.opt_a.a_1 : 0.001089s : 4.36% optimize.opt_a.with_stream_mark : 0.000038s : 0.15% optimize.opt_a.recompute_prepare : 0.000020s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000159s : 0.64% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.06% optimize.opt_a.shard : 0.000005s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000013s : 0.05% optimize.opt_a.merge_send_recv : 0.000017s : 0.07% optimize.opt_a.auto_parallel : 0.000017s : 0.07% optimize.opt_a.parallel : 0.000028s : 0.11% optimize.opt_a.flash_sp : 0.000015s : 0.06% optimize.opt_a.merge_comm : 0.000008s : 0.03% optimize.opt_a.allreduce_fusion : 0.000007s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.07% optimize.opt_a.virtual_dataset : 0.000014s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.05% optimize.opt_a.virtual_output : 0.000013s : 0.05% optimize.opt_a.merge_forward : 0.000009s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.01% optimize.opt_a.before_grad : 0.000021s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000120s : 0.48% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000036s : 0.14% optimize.opt_a.a_after_grad : 0.000024s : 0.10% optimize.opt_a.renormalize : 0.000755s : 3.02% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.02% optimize.opt_a.auto_monad_grad : 0.000005s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.09% optimize.opt_a.cse : 0.000044s : 0.18% optimize.opt_a.a_3 : 0.000093s : 0.37% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000038s : 0.15% optimize.convert_after_rewriter : 0.000007s : 0.03% optimize.order_py_execute_after_rewriter : 0.000005s : 0.02% optimize.mutable_eliminate : 0.000685s : 2.74% optimize.opt_b.b_1 : 0.000223s : 0.89% optimize.opt_b.b_2 : 0.000009s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000026s : 0.10% optimize.loop_unroll : 0.000542s : 2.17% optimize.opt_after_cconv.c_1 : 0.000033s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000021s : 0.08% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000014s : 0.05% optimize.tuple_transform.d_1 : 0.000047s : 0.19% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000051s : 0.21% optimize.cse_after_recomputation.cse : 0.000012s : 0.05% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000013s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000021s : 0.08% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000551s : 2.21% validate : 0.000046s : 0.18% Time group info: ------[substitution.] 0.000263 29 0.71% : 0.000002s : 2: substitution.elim_not_effective 0.69% : 0.000002s : 2: substitution.fold_const_symbol 2.50% : 0.000007s : 4: substitution.graph_param_transform 79.59% : 0.000209s : 4: substitution.inline 1.70% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.10% : 0.000006s : 4: substitution.remove_not_recompute_node 2.68% : 0.000007s : 4: substitution.replace_old_param 6.64% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator 3.39% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.018699 2 96.03% : 0.017957s : 1: type_inference.infer 3.97% : 0.000742s : 1: type_inference.specialize ------[replace.] 0.000291 8 91.18% : 0.000265s : 4: replace.inline 8.82% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000221 8 93.19% : 0.000206s : 4: match.inline 6.81% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000229 1278 0.86% : 0.000002s : 13: predicate.accumulaten_eliminater 0.80% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 1.20% : 0.000003s : 13: predicate.addn_zero_filter 0.91% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.10% : 0.000005s : 21: predicate.arithmetic_simplify 1.13% : 0.000003s : 13: predicate.cast_eliminate 0.57% : 0.000001s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.60% : 0.000001s : 8: predicate.depend_value_elim 0.94% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.28% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.82% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.08% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.39% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.29% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.00% : 0.000002s : 17: predicate.environ_get_depend_swap 1.62% : 0.000004s : 25: predicate.environ_get_eliminate 1.17% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.50% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.24% : 0.000005s : 21: predicate.float_depend_g_call 0.55% : 0.000001s : 8: predicate.float_environ_get_switch 0.79% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.68% : 0.000002s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.71% : 0.000002s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 6.13% : 0.000014s : 58: predicate.inline 1.13% : 0.000003s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.93% : 0.000002s : 8: predicate.less_batch_normalization 1.89% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.42% : 0.000006s : 38: predicate.load_eliminater 0.81% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.14% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.80% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 8: predicate.merge_addn 0.58% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 13: predicate.minmaximum_grad 0.72% : 0.000002s : 4: predicate.mutable_eliminate 0.39% : 0.000001s : 4: predicate.opt_reshape 0.41% : 0.000001s : 4: predicate.parallel_virtual_node 2.05% : 0.000005s : 21: predicate.partial_defer_inline 1.48% : 0.000003s : 21: predicate.partial_eliminate 0.85% : 0.000002s : 13: predicate.print_const_string_wrapper 0.61% : 0.000001s : 8: predicate.reduce_all_const_elim 1.29% : 0.000003s : 13: predicate.reduce_eliminate 2.42% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 8: predicate.remove_not_recompute_node 1.35% : 0.000003s : 25: predicate.replace_applicator 0.56% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 0.88% : 0.000002s : 13: predicate.reshape_eliminate 0.72% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 4: predicate.row_tensor_eliminate 0.69% : 0.000002s : 8: predicate.same_eliminate 0.39% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.87% : 0.000002s : 8: predicate.shard_identity_eliminate 0.74% : 0.000002s : 8: predicate.special_op_eliminate 0.66% : 0.000002s : 8: predicate.specialize_transform 1.03% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.83% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.43% : 0.000003s : 21: predicate.switch_defer_inline 1.91% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.88% : 0.000011s : 67: predicate.switch_simplify 1.07% : 0.000002s : 13: predicate.tile_eliminate 0.89% : 0.000002s : 13: predicate.transpose_eliminate 1.65% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.84% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.34% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.58% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.34% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.62% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.25% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.98% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.58% : 0.000001s : 4: predicate.value_based_eliminate 0.91% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.50% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000575 11 55.00% : 0.000316s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.00% : 0.000259s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.042043 192 0.01% : 0.000004s : 1: ForceFp32Comm 9.03% : 0.003798s : 1: add_attr 9.00% : 0.003782s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.13% : 0.000056s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.17% : 0.000072s : 1: auto_monad 0.05% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.20% : 0.000503s : 1: bootstrap 0.07% : 0.000029s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000016s : 1: control_data_broadcast_order 0.02% : 0.000010s : 1: convert_after_rewriter 0.06% : 0.000026s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.06% : 0.000027s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 1.31% : 0.000551s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.65% : 0.000695s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.04% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000015s : 1: opt.transform.mutable_eliminate 3.68% : 0.001546s : 78: opt.transform.opt_a 0.07% : 0.000031s : 1: opt.transform.opt_after_cconv 0.07% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.35% : 0.000148s : 28: opt.transform.opt_b 0.12% : 0.000051s : 2: opt.transform.opt_trans_graph 0.09% : 0.000038s : 4: opt.transform.symbol_engine_opt 8.15% : 0.003427s : 1: opt_a 0.27% : 0.000114s : 1: opt_after_cconv 1.34% : 0.000562s : 1: opt_after_jit_grad 0.76% : 0.000321s : 1: opt_b 14.04% : 0.005905s : 1: optimize 0.05% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000008s : 1: order_py_execute_after_rewriter 0.06% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.11% : 0.000045s : 1: pre_auto_parallel 0.08% : 0.000035s : 1: py_interpret_to_execute 0.04% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000017s : 1: remove_dup_value 0.90% : 0.000380s : 1: renormalize.infer 0.87% : 0.000365s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000042s : 1: rewriter_after_opt_a 0.24% : 0.000103s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.20% : 0.000082s : 1: symbol_engine_optimizer 0.20% : 0.000083s : 1: tuple_transform 44.70% : 0.018795s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:44.139.573 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:44.139.867 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0361574, [21] [bootstrap]: 0.00043886 [type_inference]: 0.00933476 [event_method]: 1.915e-05 [auto_monad]: 6.615e-05 [graph_reusing]: 6.86999e-06 [inline]: 2.52001e-06 [add_attr]: 0.00375023, [1] [add_attr_with_inline]: 0.00373449, [1] [Cycle 1]: 7.853e-05, [2] [tag_attr]: 2.11e-05 [meta_addattr_fg_expand]: 6.16e-06 [parallel-infer-symbol]: 4.20999e-06 [pre_auto_parallel]: 3.514e-05 [insert-virtual-dataset]: 2.64999e-06 [parallel-infer-symbol-second]: 9.29984e-07 [dataset_repeat_opt]: 1.97999e-06 [pipeline_split]: 1.57999e-06 [optimize]: 0.0210598, [53] [py_interpret_to_execute]: 3.038e-05 [rewriter_before_opt_a]: 9.176e-05 [opt_a]: 0.0181891, [2] [Cycle 1]: 0.0172453, [45] [expand_dump_flag]: 3.03e-06 [switch_simplify]: 4.247e-05 [loop_unroll]: 3.02e-05 [a_1]: 0.00070647 [with_stream_mark]: 1.76e-05 [recompute_prepare]: 8.70999e-06 [updatestate_depend_eliminate]: 3.88001e-06 [updatestate_assign_eliminate]: 3.6e-06 [updatestate_loads_eliminate]: 3.26999e-06 [parameter_eliminate]: 2.01998e-06 [a_2]: 0.00011418 [accelerated_algorithm]: 8.3e-06 [shard]: 1.96e-06 [meta_shard_fg_expand]: 2.16998e-06 [shard_inline]: 7.36001e-06 [merge_send_recv]: 8.64e-06 [auto_parallel]: 6.59999e-06 [parallel]: 1.968e-05 [flash_sp]: 8.44998e-06 [merge_comm]: 4.38001e-06 [allreduce_fusion]: 3.55e-06 [matmul_add_comm_reduction]: 1.104e-05 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 8.95999e-06 [virtual_dataset]: 7.26001e-06 [get_grad_eliminate_]: 6.91999e-06 [virtual_output]: 6.91999e-06 [merge_forward]: 4.13001e-06 [cell_reuse_recompute_pass]: 1.35999e-06 [offload_activation]: 1.077e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.469e-05 [merge_recompute_call_nodes]: 1.55999e-06 [before_grad]: 1.133e-05 [set_forward_comm_id_for_comm_node_pass]: 4.33999e-06 [meta_fg_expand]: 3.22002e-06 [flash_sp_send_recv_attached]: 2.77002e-06 [receive_attached]: 2.48998e-06 [after_resolve]: 1.278e-05 [a_after_grad]: 1.052e-05 [renormalize]: 0.015526 [add_forward_monad_depend]: 1.298e-05 [auto_monad_grad]: 2.84999e-06 [auto_monad_eliminator]: 2.664e-05 [cse]: 3.176e-05 [a_3]: 8.077e-05 [Cycle 2]: 0.00092457, [45] [expand_dump_flag]: 2.81e-06 [switch_simplify]: 9.98002e-06 [loop_unroll]: 7.41001e-06 [a_1]: 0.00014901 [with_stream_mark]: 2.296e-05 [recompute_prepare]: 6.50002e-06 [updatestate_depend_eliminate]: 3.83001e-06 [updatestate_assign_eliminate]: 3.55998e-06 [updatestate_loads_eliminate]: 3.35003e-06 [parameter_eliminate]: 2.17001e-06 [a_2]: 0.00010425 [accelerated_algorithm]: 7.14001e-06 [shard]: 2.83e-06 [meta_shard_fg_expand]: 2.59999e-06 [shard_inline]: 6.99001e-06 [merge_send_recv]: 9.00001e-06 [auto_parallel]: 9.75002e-06 [parallel]: 9.96998e-06 [flash_sp]: 4.38999e-06 [merge_comm]: 3.57002e-06 [allreduce_fusion]: 3.4e-06 [matmul_add_comm_reduction]: 9.77999e-06 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 8.57e-06 [virtual_dataset]: 6.46e-06 [get_grad_eliminate_]: 6.59001e-06 [virtual_output]: 6.35002e-06 [merge_forward]: 5.05001e-06 [cell_reuse_recompute_pass]: 3.26001e-06 [offload_activation]: 1.106e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.067e-05 [merge_recompute_call_nodes]: 1.49998e-06 [before_grad]: 1.19e-05 [set_forward_comm_id_for_comm_node_pass]: 3.56001e-06 [meta_fg_expand]: 3.23e-06 [flash_sp_send_recv_attached]: 1.62001e-06 [receive_attached]: 2.43002e-06 [after_resolve]: 1.346e-05 [a_after_grad]: 9.78002e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.94999e-06 [auto_monad_grad]: 1.15999e-06 [auto_monad_eliminator]: 1.003e-05 [cse]: 1.706e-05 [a_3]: 5.161e-05 [py_interpret_to_execute_after_opt_a]: 2.118e-05 [slice_cell_reuse_recomputed_activation]: 4.80999e-06 [rewriter_after_opt_a]: 5.362e-05 [convert_after_rewriter]: 1.149e-05 [order_py_execute_after_rewriter]: 9.02999e-06 [mutable_eliminate]: 0.00077287 [opt_b]: 0.00029753, [1] [Cycle 1]: 0.00028516, [7] [b_1]: 0.00017855 [b_2]: 9.40001e-06 [updatestate_depend_eliminate]: 8.21002e-06 [updatestate_assign_eliminate]: 2.71e-06 [updatestate_loads_eliminate]: 2.61e-06 [renormalize]: 6.19999e-07 [cse]: 2.356e-05 [optimize_parallel_all_gather_comm]: 2.492e-05 [overlap_param_gather]: 6.22001e-06 [cconv]: 3.641e-05 [loop_unroll]: 0.0005944 [opt_after_cconv]: 0.00014037, [1] [Cycle 1]: 0.0001306, [7] [c_1]: 3.452e-05 [parameter_eliminate]: 3.76999e-06 [updatestate_depend_eliminate]: 6.82002e-06 [updatestate_assign_eliminate]: 3.26001e-06 [updatestate_loads_eliminate]: 2.42001e-06 [cse]: 2.171e-05 [renormalize]: 5.00004e-07 [remove_dup_value]: 1.759e-05 [tuple_transform]: 9.806e-05, [1] [Cycle 1]: 8.899e-05, [4] [d_1]: 4.782e-05 [none_parameter_eliminate]: 1.69e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 7.83001e-06 [partial_unused_args_eliminate]: 4.60999e-06 [add_recomputation]: 7.276e-05 [cse_after_recomputation]: 3.319e-05, [1] [Cycle 1]: 2.485e-05, [1] [cse]: 1.415e-05 [environ_conv]: 9.54e-06 [swap_dp_allreduce_reducescatter]: 8.54e-06 [bias_add_comm_swap]: 6.26e-06 [label_micro_interleaved_index]: 9.31e-06 [label_fine_grained_interleaved_index]: 5.72001e-06 [merge_cast_opt]: 4.84998e-06 [slice_recompute_activation]: 5.33002e-06 [micro_interleaved_order_control]: 5.39998e-06 [assign_add_opt]: 4.38999e-06 [ForceFp32Comm]: 3.81001e-06 [remove_cast_before_assign_add]: 3.48e-06 [full_micro_interleaved_order_control]: 4.70001e-06 [reorder_send_recv_between_fp_bp]: 6.12999e-06 [comm_op_add_attrs]: 4.22e-06 [add_comm_op_reuse_tag]: 3.91999e-06 [interleave_split_concat_branches]: 3.70998e-06 [interleave_parallel_branches]: 3.87002e-06 [overlap_opt_shard_in_pipeline]: 4.11001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.68001e-06 [control_data_broadcast_order]: 1.917e-05 [grouped_pairwise_exchange_alltoall]: 4.08001e-06 [offloading_packed_experts]: 7.03e-06 [overlap_recompute_and_grad_model_parallel]: 7.55e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.81999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.85e-06 [overlap_recompute_comm]: 5.52001e-06 [overlap_grad_ring_attention]: 6.74999e-06 [overlap_grad_flash_sp]: 2.567e-05 [begin_end_overlap_inline]: 3.06001e-06 [split_matmul_comm_elemetwise]: 4.80001e-06 [split_layernorm_comm]: 4.82e-06 [handle_group_info]: 3.55e-06 [symbol_engine_optimizer]: 0.00011516, [1] [Cycle 1]: 0.00010679, [6] [build]: 4.47003e-06 [elim_shapecalc]: 1.297e-05 [elim_not_effective]: 1.722e-05 [opt_reshape]: 8.96002e-06 [fold_const_symbol]: 1.244e-05 [renormalize]: 2.3999e-07 [detach_backward]: 4.41002e-06 [pipeline_parallel_scheduler]: 1.88002e-06 [auto_monad_reorder]: 2.173e-05 [get_jit_bprop_graph]: 2.15002e-06 [rewriter_after_jit_bprop_graph]: 6.66999e-06 [opt_after_jit_grad]: 0.00068909 [validate]: 4.817e-05 Sums bootstrap : 0.000439s : 1.44% type_inference : 0.009335s : 30.62% event_method : 0.000019s : 0.06% auto_monad : 0.000066s : 0.22% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000035s : 0.12% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.10% optimize.rewriter_before_opt_a : 0.000092s : 0.30% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000052s : 0.17% optimize.opt_a.loop_unroll : 0.000038s : 0.12% optimize.opt_a.a_1 : 0.000855s : 2.81% optimize.opt_a.with_stream_mark : 0.000041s : 0.13% optimize.opt_a.recompute_prepare : 0.000015s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000218s : 0.72% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.05% optimize.opt_a.shard : 0.000005s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000014s : 0.05% optimize.opt_a.merge_send_recv : 0.000018s : 0.06% optimize.opt_a.auto_parallel : 0.000016s : 0.05% optimize.opt_a.parallel : 0.000030s : 0.10% optimize.opt_a.flash_sp : 0.000013s : 0.04% optimize.opt_a.merge_comm : 0.000008s : 0.03% optimize.opt_a.allreduce_fusion : 0.000007s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.06% optimize.opt_a.virtual_dataset : 0.000014s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.04% optimize.opt_a.virtual_output : 0.000013s : 0.04% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000022s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000023s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.03% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000026s : 0.09% optimize.opt_a.a_after_grad : 0.000020s : 0.07% optimize.opt_a.renormalize : 0.015526s : 50.92% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.05% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000037s : 0.12% optimize.opt_a.cse : 0.000049s : 0.16% optimize.opt_a.a_3 : 0.000132s : 0.43% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000054s : 0.18% optimize.convert_after_rewriter : 0.000011s : 0.04% optimize.order_py_execute_after_rewriter : 0.000009s : 0.03% optimize.mutable_eliminate : 0.000773s : 2.53% optimize.opt_b.b_1 : 0.000179s : 0.59% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.08% optimize.overlap_param_gather : 0.000006s : 0.02% optimize.cconv : 0.000036s : 0.12% optimize.loop_unroll : 0.000594s : 1.95% optimize.opt_after_cconv.c_1 : 0.000035s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000022s : 0.07% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.06% optimize.tuple_transform.d_1 : 0.000048s : 0.16% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000073s : 0.24% optimize.cse_after_recomputation.cse : 0.000014s : 0.05% optimize.environ_conv : 0.000010s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000009s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000005s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.02% optimize.control_data_broadcast_order : 0.000019s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000006s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000026s : 0.08% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000005s : 0.02% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.02% opt_after_jit_grad : 0.000689s : 2.26% validate : 0.000048s : 0.16% Time group info: ------[substitution.] 0.000252 28 0.87% : 0.000002s : 2: substitution.elim_not_effective 0.73% : 0.000002s : 2: substitution.fold_const_symbol 2.57% : 0.000006s : 4: substitution.graph_param_transform 81.80% : 0.000206s : 4: substitution.inline 2.07% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.65% : 0.000007s : 4: substitution.remove_not_recompute_node 2.85% : 0.000007s : 4: substitution.replace_old_param 6.46% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.009278 2 91.85% : 0.008521s : 1: type_inference.infer 8.15% : 0.000756s : 1: type_inference.specialize ------[replace.] 0.000065 8 65.03% : 0.000042s : 4: replace.inline 34.97% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000217 8 93.38% : 0.000203s : 4: match.inline 6.62% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000232 1278 0.95% : 0.000002s : 13: predicate.accumulaten_eliminater 0.93% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 0.85% : 0.000002s : 13: predicate.addn_zero_filter 0.89% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.24% : 0.000005s : 21: predicate.arithmetic_simplify 1.11% : 0.000003s : 13: predicate.cast_eliminate 0.67% : 0.000002s : 8: predicate.check_bprop_eliminate 0.70% : 0.000002s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.68% : 0.000002s : 8: predicate.depend_value_elim 0.86% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.11% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.88% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.29% : 0.000001s : 4: predicate.elim_not_effective 0.49% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_depend_swap 1.69% : 0.000004s : 25: predicate.environ_get_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.35% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.44% : 0.000006s : 21: predicate.float_depend_g_call 0.69% : 0.000002s : 8: predicate.float_environ_get_switch 1.04% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.75% : 0.000002s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.64% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 5.81% : 0.000014s : 58: predicate.inline 0.71% : 0.000002s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.77% : 0.000002s : 8: predicate.less_batch_normalization 1.91% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.59% : 0.000006s : 38: predicate.load_eliminater 0.96% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.09% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.71% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.68% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.76% : 0.000002s : 13: predicate.minmaximum_grad 1.44% : 0.000003s : 4: predicate.mutable_eliminate 0.47% : 0.000001s : 4: predicate.opt_reshape 0.47% : 0.000001s : 4: predicate.parallel_virtual_node 1.79% : 0.000004s : 21: predicate.partial_defer_inline 1.51% : 0.000004s : 21: predicate.partial_eliminate 0.86% : 0.000002s : 13: predicate.print_const_string_wrapper 0.61% : 0.000001s : 8: predicate.reduce_all_const_elim 1.23% : 0.000003s : 13: predicate.reduce_eliminate 2.31% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 8: predicate.remove_not_recompute_node 1.43% : 0.000003s : 25: predicate.replace_applicator 0.57% : 0.000001s : 8: predicate.replace_old_param 0.29% : 0.000001s : 4: predicate.reset_defer_inline 1.08% : 0.000003s : 13: predicate.reshape_eliminate 0.60% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.56% : 0.000001s : 4: predicate.row_tensor_eliminate 0.93% : 0.000002s : 8: predicate.same_eliminate 0.38% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 8: predicate.shard_identity_eliminate 0.77% : 0.000002s : 8: predicate.special_op_eliminate 0.62% : 0.000001s : 8: predicate.specialize_transform 1.20% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.42% : 0.000003s : 21: predicate.switch_defer_inline 1.95% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.69% : 0.000011s : 67: predicate.switch_simplify 1.02% : 0.000002s : 13: predicate.tile_eliminate 0.98% : 0.000002s : 13: predicate.transpose_eliminate 1.92% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.34% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.91% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.25% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.90% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 4: predicate.value_based_eliminate 0.64% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000650 11 49.61% : 0.000322s : 5: func_graph_cloner_run.FuncGraphClonerGraph 50.39% : 0.000328s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.077954 192 0.01% : 0.000007s : 1: ForceFp32Comm 4.82% : 0.003761s : 1: add_attr 4.80% : 0.003739s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.10% : 0.000077s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.10% : 0.000075s : 1: auto_monad 0.04% : 0.000030s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.62% : 0.000484s : 1: bootstrap 0.05% : 0.000040s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.03% : 0.000022s : 1: control_data_broadcast_order 0.02% : 0.000015s : 1: convert_after_rewriter 0.05% : 0.000037s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000023s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.04% : 0.000030s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000007s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.02% : 0.000013s : 1: label_micro_interleaved_index 0.77% : 0.000602s : 1: loop_unroll 0.01% : 0.000008s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.00% : 0.000781s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000020s : 1: opt.transform.mutable_eliminate 1.69% : 0.001320s : 78: opt.transform.opt_a 0.04% : 0.000032s : 1: opt.transform.opt_after_cconv 0.04% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.14% : 0.000111s : 28: opt.transform.opt_b 0.07% : 0.000053s : 2: opt.transform.opt_trans_graph 0.06% : 0.000047s : 4: opt.transform.symbol_engine_opt 23.34% : 0.018193s : 1: opt_a 0.18% : 0.000144s : 1: opt_after_cconv 0.90% : 0.000701s : 1: opt_after_jit_grad 0.39% : 0.000302s : 1: opt_b 27.51% : 0.021447s : 1: optimize 0.04% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000013s : 1: order_py_execute_after_rewriter 0.04% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000010s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.06% : 0.000043s : 1: pre_auto_parallel 0.04% : 0.000034s : 1: py_interpret_to_execute 0.03% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000021s : 1: remove_dup_value 19.27% : 0.015022s : 1: renormalize.infer 0.62% : 0.000485s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000058s : 1: rewriter_after_opt_a 0.12% : 0.000095s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000118s : 1: symbol_engine_optimizer 0.13% : 0.000101s : 1: tuple_transform 12.04% : 0.009384s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:44.686.454 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0363262, [21] [bootstrap]: 0.00048609 [type_inference]: 0.0195242 [event_method]: 1.917e-05 [auto_monad]: 6.832e-05 [graph_reusing]: 5.75001e-06 [inline]: 3.52002e-06 [add_attr]: 0.00406192, [1] [add_attr_with_inline]: 0.00404761, [1] [Cycle 1]: 7.299e-05, [2] [tag_attr]: 2.377e-05 [meta_addattr_fg_expand]: 5.91003e-06 [parallel-infer-symbol]: 3.71999e-06 [pre_auto_parallel]: 3.921e-05 [insert-virtual-dataset]: 2.87002e-06 [parallel-infer-symbol-second]: 7.10017e-07 [dataset_repeat_opt]: 1.77999e-06 [pipeline_split]: 1.82001e-06 [optimize]: 0.0111161, [53] [py_interpret_to_execute]: 3.013e-05 [rewriter_before_opt_a]: 9.272e-05 [opt_a]: 0.00311491, [2] [Cycle 1]: 0.0023317, [45] [expand_dump_flag]: 3.41999e-06 [switch_simplify]: 4.406e-05 [loop_unroll]: 2.998e-05 [a_1]: 0.00069299 [with_stream_mark]: 2.235e-05 [recompute_prepare]: 1.148e-05 [updatestate_depend_eliminate]: 4.15e-06 [updatestate_assign_eliminate]: 3.25998e-06 [updatestate_loads_eliminate]: 3.30998e-06 [parameter_eliminate]: 2.41e-06 [a_2]: 8.49e-05 [accelerated_algorithm]: 7.51001e-06 [shard]: 1.89e-06 [meta_shard_fg_expand]: 2.26e-06 [shard_inline]: 6.66e-06 [merge_send_recv]: 9.52999e-06 [auto_parallel]: 8.04002e-06 [parallel]: 2.084e-05 [flash_sp]: 1.023e-05 [merge_comm]: 4e-06 [allreduce_fusion]: 3.55e-06 [matmul_add_comm_reduction]: 1.063e-05 [allreduce_slice_to_reducescatter]: 7.80012e-07 [virtual_shard_identity]: 8.84e-06 [virtual_dataset]: 7.10002e-06 [get_grad_eliminate_]: 6.44999e-06 [virtual_output]: 7.08e-06 [merge_forward]: 4.53001e-06 [cell_reuse_recompute_pass]: 1.15999e-06 [offload_activation]: 1.063e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.593e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.133e-05 [set_forward_comm_id_for_comm_node_pass]: 3.40003e-06 [meta_fg_expand]: 3.41999e-06 [flash_sp_send_recv_attached]: 2.72001e-06 [receive_attached]: 2.06e-06 [after_resolve]: 1.302e-05 [a_after_grad]: 1.123e-05 [renormalize]: 0.00083769 [add_forward_monad_depend]: 9.71e-06 [auto_monad_grad]: 2.94999e-06 [auto_monad_eliminator]: 1.912e-05 [cse]: 2.923e-05 [a_3]: 5.964e-05 [Cycle 2]: 0.00076865, [45] [expand_dump_flag]: 2.14e-06 [switch_simplify]: 1.018e-05 [loop_unroll]: 6.64999e-06 [a_1]: 0.0001428 [with_stream_mark]: 1.767e-05 [recompute_prepare]: 6.86001e-06 [updatestate_depend_eliminate]: 3.04001e-06 [updatestate_assign_eliminate]: 2.68e-06 [updatestate_loads_eliminate]: 3.93999e-06 [parameter_eliminate]: 1.91998e-06 [a_2]: 7.917e-05 [accelerated_algorithm]: 7.66999e-06 [shard]: 2.26e-06 [meta_shard_fg_expand]: 2.40002e-06 [shard_inline]: 6.71e-06 [merge_send_recv]: 7.8e-06 [auto_parallel]: 8.58001e-06 [parallel]: 9.06002e-06 [flash_sp]: 4.46002e-06 [merge_comm]: 3.70998e-06 [allreduce_fusion]: 3.35998e-06 [matmul_add_comm_reduction]: 8.65001e-06 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 8.57e-06 [virtual_dataset]: 6.12999e-06 [get_grad_eliminate_]: 6.06e-06 [virtual_output]: 6.59999e-06 [merge_forward]: 4.45e-06 [cell_reuse_recompute_pass]: 3.48999e-06 [offload_activation]: 9.87001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.821e-05 [merge_recompute_call_nodes]: 1.25999e-06 [before_grad]: 1.489e-05 [set_forward_comm_id_for_comm_node_pass]: 4.33999e-06 [meta_fg_expand]: 3.09999e-06 [flash_sp_send_recv_attached]: 1.64e-06 [receive_attached]: 1.75001e-06 [after_resolve]: 1.433e-05 [a_after_grad]: 1.115e-05 [renormalize]: 1.09983e-07 [add_forward_monad_depend]: 3.16999e-06 [auto_monad_grad]: 2.20002e-06 [auto_monad_eliminator]: 9.96e-06 [cse]: 1.83e-05 [a_3]: 3.925e-05 [py_interpret_to_execute_after_opt_a]: 1.905e-05 [slice_cell_reuse_recomputed_activation]: 2.27999e-06 [rewriter_after_opt_a]: 4.204e-05 [convert_after_rewriter]: 9.64e-06 [order_py_execute_after_rewriter]: 5.69e-06 [mutable_eliminate]: 0.00086477 [opt_b]: 0.00024021, [1] [Cycle 1]: 0.00022933, [7] [b_1]: 0.00013601 [b_2]: 9.89999e-06 [updatestate_depend_eliminate]: 8.75999e-06 [updatestate_assign_eliminate]: 2.71e-06 [updatestate_loads_eliminate]: 3.16999e-06 [renormalize]: 1.01002e-06 [cse]: 2.928e-05 [optimize_parallel_all_gather_comm]: 0.00502293 [overlap_param_gather]: 6.83e-06 [cconv]: 4.224e-05 [loop_unroll]: 0.0008066 [opt_after_cconv]: 0.00014019, [1] [Cycle 1]: 0.0001302, [7] [c_1]: 3.668e-05 [parameter_eliminate]: 6.50002e-06 [updatestate_depend_eliminate]: 1.067e-05 [updatestate_assign_eliminate]: 3.30003e-06 [updatestate_loads_eliminate]: 3.04999e-06 [cse]: 3.207e-05 [renormalize]: 9.89996e-07 [remove_dup_value]: 1.596e-05 [tuple_transform]: 9.17e-05, [1] [Cycle 1]: 8.655e-05, [4] [d_1]: 5.557e-05 [none_parameter_eliminate]: 1.61002e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 7.66999e-06 [partial_unused_args_eliminate]: 2.17999e-06 [add_recomputation]: 5.889e-05 [cse_after_recomputation]: 2.418e-05, [1] [Cycle 1]: 1.91e-05, [1] [cse]: 1.249e-05 [environ_conv]: 6.45002e-06 [swap_dp_allreduce_reducescatter]: 5.92999e-06 [bias_add_comm_swap]: 3.37997e-06 [label_micro_interleaved_index]: 5.58002e-06 [label_fine_grained_interleaved_index]: 2.76999e-06 [merge_cast_opt]: 1.37e-06 [slice_recompute_activation]: 2.14e-06 [micro_interleaved_order_control]: 2.46e-06 [assign_add_opt]: 1.41998e-06 [ForceFp32Comm]: 9.40025e-07 [remove_cast_before_assign_add]: 1.33002e-06 [full_micro_interleaved_order_control]: 2.16e-06 [reorder_send_recv_between_fp_bp]: 2.82002e-06 [comm_op_add_attrs]: 1.10001e-06 [add_comm_op_reuse_tag]: 1.25999e-06 [interleave_split_concat_branches]: 1.09e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.50001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.29001e-06 [control_data_broadcast_order]: 1.531e-05 [grouped_pairwise_exchange_alltoall]: 1.58002e-06 [offloading_packed_experts]: 3.86001e-06 [overlap_recompute_and_grad_model_parallel]: 4.89e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.26e-06 [overlap_grad_ring_attention]: 4.43999e-06 [overlap_grad_flash_sp]: 2.32e-05 [begin_end_overlap_inline]: 5.8001e-07 [split_matmul_comm_elemetwise]: 2.33998e-06 [split_layernorm_comm]: 1.83002e-06 [handle_group_info]: 1.07e-06 [symbol_engine_optimizer]: 8.498e-05, [1] [Cycle 1]: 8.013e-05, [6] [build]: 4.12998e-06 [elim_shapecalc]: 1.11e-05 [elim_not_effective]: 1.536e-05 [opt_reshape]: 7.66001e-06 [fold_const_symbol]: 1.068e-05 [renormalize]: 5.69999e-07 [detach_backward]: 2.61999e-06 [pipeline_parallel_scheduler]: 2.01e-06 [auto_monad_reorder]: 1.827e-05 [get_jit_bprop_graph]: 2.19001e-06 [rewriter_after_jit_bprop_graph]: 7.53e-06 [opt_after_jit_grad]: 0.00073297 [validate]: 4.961e-05 Sums bootstrap : 0.000486s : 1.56% type_inference : 0.019524s : 62.71% event_method : 0.000019s : 0.06% auto_monad : 0.000068s : 0.22% graph_reusing : 0.000006s : 0.02% inline : 0.000004s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000039s : 0.13% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.10% optimize.rewriter_before_opt_a : 0.000093s : 0.30% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000054s : 0.17% optimize.opt_a.loop_unroll : 0.000037s : 0.12% optimize.opt_a.a_1 : 0.000836s : 2.68% optimize.opt_a.with_stream_mark : 0.000040s : 0.13% optimize.opt_a.recompute_prepare : 0.000018s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000164s : 0.53% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.05% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.04% optimize.opt_a.merge_send_recv : 0.000017s : 0.06% optimize.opt_a.auto_parallel : 0.000017s : 0.05% optimize.opt_a.parallel : 0.000030s : 0.10% optimize.opt_a.flash_sp : 0.000015s : 0.05% optimize.opt_a.merge_comm : 0.000008s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.06% optimize.opt_a.virtual_dataset : 0.000013s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.04% optimize.opt_a.virtual_output : 0.000014s : 0.04% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.02% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000027s : 0.09% optimize.opt_a.a_after_grad : 0.000022s : 0.07% optimize.opt_a.renormalize : 0.000838s : 2.69% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.04% optimize.opt_a.auto_monad_grad : 0.000005s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.09% optimize.opt_a.cse : 0.000048s : 0.15% optimize.opt_a.a_3 : 0.000099s : 0.32% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000042s : 0.14% optimize.convert_after_rewriter : 0.000010s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000865s : 2.78% optimize.opt_b.b_1 : 0.000136s : 0.44% optimize.opt_b.b_2 : 0.000010s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000029s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.005023s : 16.13% optimize.overlap_param_gather : 0.000007s : 0.02% optimize.cconv : 0.000042s : 0.14% optimize.loop_unroll : 0.000807s : 2.59% optimize.opt_after_cconv.c_1 : 0.000037s : 0.12% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000032s : 0.10% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.05% optimize.tuple_transform.d_1 : 0.000056s : 0.18% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000059s : 0.19% optimize.cse_after_recomputation.cse : 0.000012s : 0.04% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000006s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000015s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000023s : 0.07% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.06% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000008s : 0.02% opt_after_jit_grad : 0.000733s : 2.35% validate : 0.000050s : 0.16% Time group info: ------[substitution.] 0.000219 28 1.27% : 0.000003s : 2: substitution.elim_not_effective 0.62% : 0.000001s : 2: substitution.fold_const_symbol 3.32% : 0.000007s : 4: substitution.graph_param_transform 79.12% : 0.000173s : 4: substitution.inline 2.29% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.61% : 0.000006s : 4: substitution.remove_not_recompute_node 3.30% : 0.000007s : 4: substitution.replace_old_param 7.46% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.019446 2 95.76% : 0.018622s : 1: type_inference.infer 4.24% : 0.000825s : 1: type_inference.specialize ------[replace.] 0.000068 8 63.02% : 0.000043s : 4: replace.inline 36.98% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000185 8 92.19% : 0.000171s : 4: match.inline 7.81% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000232 1278 0.97% : 0.000002s : 13: predicate.accumulaten_eliminater 1.09% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 0.85% : 0.000002s : 13: predicate.addn_zero_filter 0.88% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.27% : 0.000005s : 21: predicate.arithmetic_simplify 0.90% : 0.000002s : 13: predicate.cast_eliminate 0.66% : 0.000002s : 8: predicate.check_bprop_eliminate 0.61% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.63% : 0.000001s : 8: predicate.depend_value_elim 0.87% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.23% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.11% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.56% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.28% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.25% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 17: predicate.environ_get_depend_swap 1.50% : 0.000003s : 25: predicate.environ_get_eliminate 0.93% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.42% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.25% : 0.000005s : 21: predicate.float_depend_g_call 0.63% : 0.000001s : 8: predicate.float_environ_get_switch 0.83% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.13% : 0.000000s : 4: predicate.fold_const_symbol 0.58% : 0.000001s : 8: predicate.get_grad_eliminate 0.25% : 0.000001s : 4: predicate.graph_param_transform 0.50% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 5.86% : 0.000014s : 58: predicate.inline 0.74% : 0.000002s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.81% : 0.000002s : 8: predicate.less_batch_normalization 1.85% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.46% : 0.000006s : 38: predicate.load_eliminater 1.43% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.19% : 0.000005s : 34: predicate.loop_unroll_before_grad 2.04% : 0.000005s : 21: predicate.make_slice_get_slice_eliminator 0.70% : 0.000002s : 8: predicate.merge_addn 0.60% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.61% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 13: predicate.minmaximum_grad 1.36% : 0.000003s : 4: predicate.mutable_eliminate 0.39% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 1.69% : 0.000004s : 21: predicate.partial_defer_inline 1.57% : 0.000004s : 21: predicate.partial_eliminate 0.78% : 0.000002s : 13: predicate.print_const_string_wrapper 0.63% : 0.000001s : 8: predicate.reduce_all_const_elim 1.14% : 0.000003s : 13: predicate.reduce_eliminate 2.45% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.59% : 0.000001s : 8: predicate.remove_not_recompute_node 1.29% : 0.000003s : 25: predicate.replace_applicator 0.55% : 0.000001s : 8: predicate.replace_old_param 0.31% : 0.000001s : 4: predicate.reset_defer_inline 1.05% : 0.000002s : 13: predicate.reshape_eliminate 0.66% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.52% : 0.000001s : 4: predicate.row_tensor_eliminate 0.81% : 0.000002s : 8: predicate.same_eliminate 0.37% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.68% : 0.000002s : 8: predicate.shard_identity_eliminate 0.80% : 0.000002s : 8: predicate.special_op_eliminate 0.66% : 0.000002s : 8: predicate.specialize_transform 1.26% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.93% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.40% : 0.000003s : 21: predicate.switch_defer_inline 1.91% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.99% : 0.000012s : 67: predicate.switch_simplify 0.85% : 0.000002s : 13: predicate.tile_eliminate 0.97% : 0.000002s : 13: predicate.transpose_eliminate 1.45% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.38% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.36% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.50% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.79% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.17% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.94% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.79% : 0.000002s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000660 11 53.63% : 0.000354s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.37% : 0.000306s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.053801 192 0.01% : 0.000004s : 1: ForceFp32Comm 7.56% : 0.004068s : 1: add_attr 7.53% : 0.004052s : 1: add_attr_with_inline 0.01% : 0.000005s : 1: add_comm_op_reuse_tag 0.12% : 0.000064s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.14% : 0.000074s : 1: auto_monad 0.04% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.96% : 0.000519s : 1: bootstrap 0.09% : 0.000047s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000018s : 1: control_data_broadcast_order 0.02% : 0.000013s : 1: convert_after_rewriter 0.05% : 0.000027s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000010s : 1: environ_conv 0.05% : 0.000026s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000007s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000009s : 1: label_micro_interleaved_index 1.52% : 0.000819s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.64% : 0.000881s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000025s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000028s : 1: opt.transform.mutable_eliminate 2.41% : 0.001298s : 78: opt.transform.opt_a 0.06% : 0.000035s : 1: opt.transform.opt_after_cconv 0.06% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000108s : 28: opt.transform.opt_b 0.11% : 0.000060s : 2: opt.transform.opt_trans_graph 0.07% : 0.000040s : 4: opt.transform.symbol_engine_opt 5.80% : 0.003118s : 1: opt_a 0.27% : 0.000144s : 1: opt_after_cconv 1.39% : 0.000746s : 1: opt_after_jit_grad 0.45% : 0.000245s : 1: opt_b 20.67% : 0.011122s : 1: optimize 9.38% : 0.005048s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.05% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000014s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000006s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000044s : 1: pre_auto_parallel 0.06% : 0.000034s : 1: py_interpret_to_execute 0.04% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000019s : 1: remove_dup_value 0.79% : 0.000423s : 1: renormalize.infer 0.75% : 0.000404s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000047s : 1: rewriter_after_opt_a 0.18% : 0.000097s : 1: rewriter_before_opt_a 0.01% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000006s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.16% : 0.000088s : 1: symbol_engine_optimizer 0.18% : 0.000095s : 1: tuple_transform 36.34% : 0.019550s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:45.198.054 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:45.198.375 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0513709, [21] [bootstrap]: 0.00051118 [type_inference]: 0.0167097 [event_method]: 2.437e-05 [auto_monad]: 7.136e-05 [graph_reusing]: 6.06e-06 [inline]: 2.97002e-06 [add_attr]: 0.0041746, [1] [add_attr_with_inline]: 0.00416075, [1] [Cycle 1]: 0.00010266, [2] [tag_attr]: 2.738e-05 [meta_addattr_fg_expand]: 6.72002e-06 [parallel-infer-symbol]: 4.29002e-06 [pre_auto_parallel]: 4.472e-05 [insert-virtual-dataset]: 2.32001e-06 [parallel-infer-symbol-second]: 7.00005e-07 [dataset_repeat_opt]: 2.16e-06 [pipeline_split]: 1.65001e-06 [optimize]: 0.0281813, [53] [py_interpret_to_execute]: 4.481e-05 [rewriter_before_opt_a]: 0.00010217 [opt_a]: 0.0246266, [2] [Cycle 1]: 0.0234946, [45] [expand_dump_flag]: 3.95e-06 [switch_simplify]: 4.348e-05 [loop_unroll]: 3.761e-05 [a_1]: 0.00073871 [with_stream_mark]: 2.289e-05 [recompute_prepare]: 1.205e-05 [updatestate_depend_eliminate]: 4.90001e-06 [updatestate_assign_eliminate]: 4.32e-06 [updatestate_loads_eliminate]: 3.68999e-06 [parameter_eliminate]: 2.00002e-06 [a_2]: 0.00013956 [accelerated_algorithm]: 9.15001e-06 [shard]: 2.11998e-06 [meta_shard_fg_expand]: 2.88e-06 [shard_inline]: 8.05999e-06 [merge_send_recv]: 9.72999e-06 [auto_parallel]: 8.84998e-06 [parallel]: 2.125e-05 [flash_sp]: 1.044e-05 [merge_comm]: 4.93001e-06 [allreduce_fusion]: 4.28999e-06 [matmul_add_comm_reduction]: 1.14e-05 [allreduce_slice_to_reducescatter]: 1.10999e-06 [virtual_shard_identity]: 9.36e-06 [virtual_dataset]: 8.37e-06 [get_grad_eliminate_]: 8.55001e-06 [virtual_output]: 8.17e-06 [merge_forward]: 4.94003e-06 [cell_reuse_recompute_pass]: 1.82999e-06 [offload_activation]: 1.3e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.882e-05 [merge_recompute_call_nodes]: 1.50001e-06 [before_grad]: 1.416e-05 [set_forward_comm_id_for_comm_node_pass]: 4.87998e-06 [meta_fg_expand]: 4.30999e-06 [flash_sp_send_recv_attached]: 2.86999e-06 [receive_attached]: 2.12999e-06 [after_resolve]: 1.454e-05 [a_after_grad]: 1.286e-05 [renormalize]: 0.0214767 [add_forward_monad_depend]: 1.223e-05 [auto_monad_grad]: 2.88e-06 [auto_monad_eliminator]: 2.962e-05 [cse]: 4.484e-05 [a_3]: 0.00010011 [Cycle 2]: 0.00111132, [45] [expand_dump_flag]: 2.48e-06 [switch_simplify]: 1.15e-05 [loop_unroll]: 9.88002e-06 [a_1]: 0.00021269 [with_stream_mark]: 2.32e-05 [recompute_prepare]: 9.04e-06 [updatestate_depend_eliminate]: 5.59e-06 [updatestate_assign_eliminate]: 4.23001e-06 [updatestate_loads_eliminate]: 3.80998e-06 [parameter_eliminate]: 2.11998e-06 [a_2]: 0.00012775 [accelerated_algorithm]: 8.68001e-06 [shard]: 2.19001e-06 [meta_shard_fg_expand]: 2.73e-06 [shard_inline]: 7.97e-06 [merge_send_recv]: 1.08e-05 [auto_parallel]: 1.174e-05 [parallel]: 9.11998e-06 [flash_sp]: 4.37e-06 [merge_comm]: 4.81002e-06 [allreduce_fusion]: 4.29997e-06 [matmul_add_comm_reduction]: 1.05e-05 [allreduce_slice_to_reducescatter]: 9.80013e-07 [virtual_shard_identity]: 8.80999e-06 [virtual_dataset]: 7.83999e-06 [get_grad_eliminate_]: 7.55998e-06 [virtual_output]: 7.87e-06 [merge_forward]: 5.37001e-06 [cell_reuse_recompute_pass]: 3.76001e-06 [offload_activation]: 1.23e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.942e-05 [merge_recompute_call_nodes]: 1.76e-06 [before_grad]: 1.402e-05 [set_forward_comm_id_for_comm_node_pass]: 5.24998e-06 [meta_fg_expand]: 4.17998e-06 [flash_sp_send_recv_attached]: 1.94e-06 [receive_attached]: 3.35e-06 [after_resolve]: 2.11e-05 [a_after_grad]: 1.43e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.99e-06 [auto_monad_grad]: 2.61e-06 [auto_monad_eliminator]: 1.389e-05 [cse]: 2.717e-05 [a_3]: 6.456e-05 [py_interpret_to_execute_after_opt_a]: 2.784e-05 [slice_cell_reuse_recomputed_activation]: 5.18002e-06 [rewriter_after_opt_a]: 6.163e-05 [convert_after_rewriter]: 1.223e-05 [order_py_execute_after_rewriter]: 9.81e-06 [mutable_eliminate]: 0.00086275 [opt_b]: 0.00037815, [1] [Cycle 1]: 0.00036348, [7] [b_1]: 0.00022509 [b_2]: 1.278e-05 [updatestate_depend_eliminate]: 1.152e-05 [updatestate_assign_eliminate]: 4.34002e-06 [updatestate_loads_eliminate]: 4.00998e-06 [renormalize]: 9.60019e-07 [cse]: 4.145e-05 [optimize_parallel_all_gather_comm]: 3.077e-05 [overlap_param_gather]: 5.14e-06 [cconv]: 8.706e-05 [loop_unroll]: 0.00092114 [opt_after_cconv]: 0.0001779, [1] [Cycle 1]: 0.0001658, [7] [c_1]: 4.279e-05 [parameter_eliminate]: 6.46999e-06 [updatestate_depend_eliminate]: 1.041e-05 [updatestate_assign_eliminate]: 3.59002e-06 [updatestate_loads_eliminate]: 3.08e-06 [cse]: 3.882e-05 [renormalize]: 8.89995e-07 [remove_dup_value]: 2.169e-05 [tuple_transform]: 0.0001177, [1] [Cycle 1]: 0.00010958, [4] [d_1]: 6.525e-05 [none_parameter_eliminate]: 1.67001e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 9.62999e-06 [partial_unused_args_eliminate]: 4.75999e-06 [add_recomputation]: 7.42e-05 [cse_after_recomputation]: 3.484e-05, [1] [Cycle 1]: 2.715e-05, [1] [cse]: 1.723e-05 [environ_conv]: 1.198e-05 [swap_dp_allreduce_reducescatter]: 9.54e-06 [bias_add_comm_swap]: 5.64e-06 [label_micro_interleaved_index]: 1.09e-05 [label_fine_grained_interleaved_index]: 5.23002e-06 [merge_cast_opt]: 4.13999e-06 [slice_recompute_activation]: 5.17e-06 [micro_interleaved_order_control]: 4.97e-06 [assign_add_opt]: 4.33999e-06 [ForceFp32Comm]: 3.55e-06 [remove_cast_before_assign_add]: 3.56999e-06 [full_micro_interleaved_order_control]: 4.69002e-06 [reorder_send_recv_between_fp_bp]: 5.51e-06 [comm_op_add_attrs]: 3.43999e-06 [add_comm_op_reuse_tag]: 3.64002e-06 [interleave_split_concat_branches]: 3.6e-06 [interleave_parallel_branches]: 3.53999e-06 [overlap_opt_shard_in_pipeline]: 4.40999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.63001e-06 [control_data_broadcast_order]: 2.146e-05 [grouped_pairwise_exchange_alltoall]: 4.30999e-06 [offloading_packed_experts]: 8.08001e-06 [overlap_recompute_and_grad_model_parallel]: 9.64e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.13999e-06 [overlap_recompute_allgather_and_fa_grad]: 4.15e-06 [overlap_recompute_comm]: 5.57001e-06 [overlap_grad_ring_attention]: 7.50998e-06 [overlap_grad_flash_sp]: 3.019e-05 [begin_end_overlap_inline]: 2.99001e-06 [split_matmul_comm_elemetwise]: 4.62e-06 [split_layernorm_comm]: 4.32e-06 [handle_group_info]: 3.88999e-06 [symbol_engine_optimizer]: 0.00011977, [1] [Cycle 1]: 0.00011154, [6] [build]: 4.98001e-06 [elim_shapecalc]: 1.602e-05 [elim_not_effective]: 1.739e-05 [opt_reshape]: 9.79e-06 [fold_const_symbol]: 1.331e-05 [renormalize]: 4.00003e-07 [detach_backward]: 5.64998e-06 [pipeline_parallel_scheduler]: 2.49001e-06 [auto_monad_reorder]: 2.904e-05 [get_jit_bprop_graph]: 2.31e-06 [rewriter_after_jit_bprop_graph]: 8.09997e-06 [opt_after_jit_grad]: 0.00080239 [validate]: 5.857e-05 Sums bootstrap : 0.000511s : 1.14% type_inference : 0.016710s : 37.12% event_method : 0.000024s : 0.05% auto_monad : 0.000071s : 0.16% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000045s : 0.10% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000045s : 0.10% optimize.rewriter_before_opt_a : 0.000102s : 0.23% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000055s : 0.12% optimize.opt_a.loop_unroll : 0.000047s : 0.11% optimize.opt_a.a_1 : 0.000951s : 2.11% optimize.opt_a.with_stream_mark : 0.000046s : 0.10% optimize.opt_a.recompute_prepare : 0.000021s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000267s : 0.59% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.04% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.04% optimize.opt_a.merge_send_recv : 0.000021s : 0.05% optimize.opt_a.auto_parallel : 0.000021s : 0.05% optimize.opt_a.parallel : 0.000030s : 0.07% optimize.opt_a.flash_sp : 0.000015s : 0.03% optimize.opt_a.merge_comm : 0.000010s : 0.02% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.04% optimize.opt_a.virtual_dataset : 0.000016s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.04% optimize.opt_a.virtual_output : 0.000016s : 0.04% optimize.opt_a.merge_forward : 0.000010s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.01% optimize.opt_a.offload_activation : 0.000025s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000028s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.02% optimize.opt_a.meta_fg_expand : 0.000008s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000036s : 0.08% optimize.opt_a.a_after_grad : 0.000027s : 0.06% optimize.opt_a.renormalize : 0.021477s : 47.71% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.03% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000044s : 0.10% optimize.opt_a.cse : 0.000072s : 0.16% optimize.opt_a.a_3 : 0.000165s : 0.37% optimize.py_interpret_to_execute_after_opt_a : 0.000028s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000062s : 0.14% optimize.convert_after_rewriter : 0.000012s : 0.03% optimize.order_py_execute_after_rewriter : 0.000010s : 0.02% optimize.mutable_eliminate : 0.000863s : 1.92% optimize.opt_b.b_1 : 0.000225s : 0.50% optimize.opt_b.b_2 : 0.000013s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000041s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000031s : 0.07% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000087s : 0.19% optimize.loop_unroll : 0.000921s : 2.05% optimize.opt_after_cconv.c_1 : 0.000043s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000039s : 0.09% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000022s : 0.05% optimize.tuple_transform.d_1 : 0.000065s : 0.14% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.02% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000074s : 0.16% optimize.cse_after_recomputation.cse : 0.000017s : 0.04% optimize.environ_conv : 0.000012s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000011s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000021s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000010s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000006s : 0.01% optimize.overlap_grad_ring_attention : 0.000008s : 0.02% optimize.overlap_grad_flash_sp : 0.000030s : 0.07% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000029s : 0.06% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000008s : 0.02% opt_after_jit_grad : 0.000802s : 1.78% validate : 0.000059s : 0.13% Time group info: ------[substitution.] 0.000266 38 14.37% : 0.000038s : 3: substitution.cast_eliminate 0.90% : 0.000002s : 3: substitution.elim_not_effective 0.75% : 0.000002s : 3: substitution.fold_const_symbol 3.04% : 0.000008s : 5: substitution.graph_param_transform 68.13% : 0.000181s : 4: substitution.inline 2.05% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.60% : 0.000007s : 6: substitution.remove_not_recompute_node 2.99% : 0.000008s : 4: substitution.replace_old_param 5.18% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.016634 2 41.70% : 0.006937s : 1: type_inference.infer 58.30% : 0.009697s : 1: type_inference.specialize ------[replace.] 0.000068 8 62.34% : 0.000043s : 4: replace.inline 37.66% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000191 8 93.67% : 0.000178s : 4: match.inline 6.33% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000292 1504 0.87% : 0.000003s : 15: predicate.accumulaten_eliminater 1.31% : 0.000004s : 5: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 10: predicate.addn_check_dump 0.91% : 0.000003s : 15: predicate.addn_zero_filter 0.74% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.03% : 0.000006s : 25: predicate.arithmetic_simplify 1.08% : 0.000003s : 15: predicate.cast_eliminate 0.73% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.82% : 0.000002s : 10: predicate.depend_value_elim 0.89% : 0.000003s : 15: predicate.dict_get_item_const_eliminator 0.95% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.75% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.34% : 0.000004s : 10: predicate.dumpgradient_eliminate 0.18% : 0.000001s : 5: predicate.elim_not_effective 0.58% : 0.000002s : 5: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000004s : 20: predicate.environ_add_const_eliminate 1.02% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.01% : 0.000003s : 20: predicate.environ_get_depend_swap 1.61% : 0.000005s : 30: predicate.environ_get_eliminate 1.12% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.22% : 0.000004s : 23: predicate.exchange_switch_depend_value 2.16% : 0.000006s : 23: predicate.float_depend_g_call 0.52% : 0.000002s : 10: predicate.float_environ_get_switch 0.76% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 5: predicate.fold_const_symbol 0.73% : 0.000002s : 10: predicate.get_grad_eliminate 0.32% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000002s : 10: predicate.incorporate_call 0.51% : 0.000001s : 10: predicate.incorporate_call_switch 6.10% : 0.000018s : 68: predicate.inline 0.85% : 0.000002s : 10: predicate.inline_without_move 0.28% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 10: predicate.less_batch_normalization 1.90% : 0.000006s : 29: predicate.list_to_tuple_eliminator_ 2.31% : 0.000007s : 44: predicate.load_eliminater 2.04% : 0.000006s : 5: predicate.loop_unroll_after_grad 2.19% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.56% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 0.60% : 0.000002s : 10: predicate.merge_addn 0.67% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.61% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 15: predicate.minmaximum_grad 1.95% : 0.000006s : 5: predicate.mutable_eliminate 0.47% : 0.000001s : 5: predicate.opt_reshape 0.43% : 0.000001s : 5: predicate.parallel_virtual_node 1.47% : 0.000004s : 23: predicate.partial_defer_inline 1.36% : 0.000004s : 24: predicate.partial_eliminate 0.84% : 0.000002s : 15: predicate.print_const_string_wrapper 0.67% : 0.000002s : 10: predicate.reduce_all_const_elim 1.33% : 0.000004s : 15: predicate.reduce_eliminate 2.32% : 0.000007s : 44: predicate.redundant_stop_gradient_eliminater 0.32% : 0.000001s : 10: predicate.remove_not_recompute_node 1.25% : 0.000004s : 29: predicate.replace_applicator 0.51% : 0.000001s : 10: predicate.replace_old_param 0.42% : 0.000001s : 5: predicate.reset_defer_inline 1.18% : 0.000003s : 15: predicate.reshape_eliminate 0.70% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.74% : 0.000002s : 5: predicate.row_tensor_eliminate 1.10% : 0.000003s : 10: predicate.same_eliminate 0.51% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 10: predicate.shard_identity_eliminate 0.67% : 0.000002s : 10: predicate.special_op_eliminate 0.70% : 0.000002s : 10: predicate.specialize_transform 1.13% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.91% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.25% : 0.000004s : 23: predicate.switch_defer_inline 1.89% : 0.000006s : 33: predicate.switch_layer_defer_inline 4.17% : 0.000012s : 74: predicate.switch_simplify 0.84% : 0.000002s : 15: predicate.tile_eliminate 0.81% : 0.000002s : 15: predicate.transpose_eliminate 1.44% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000005s : 25: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 2.93% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.47% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000007s : 35: predicate.tuple_list_set_item_eliminator 1.77% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.12% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.02% : 0.000009s : 54: predicate.updatestate_useless_node_eliminater 0.48% : 0.000001s : 5: predicate.value_based_eliminate 0.62% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.63% : 0.000002s : 10: predicate.virtual_output_eliminate 0.23% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.55% : 0.000002s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000902 11 41.71% : 0.000376s : 5: func_graph_cloner_run.FuncGraphClonerGraph 58.29% : 0.000526s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.106984 192 0.01% : 0.000006s : 1: ForceFp32Comm 3.91% : 0.004187s : 1: add_attr 3.89% : 0.004165s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.07% : 0.000079s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.08% : 0.000084s : 1: auto_monad 0.04% : 0.000039s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.53% : 0.000565s : 1: bootstrap 0.09% : 0.000092s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.02% : 0.000025s : 1: control_data_broadcast_order 0.01% : 0.000015s : 1: convert_after_rewriter 0.04% : 0.000038s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000028s : 1: detach_backward 0.01% : 0.000015s : 1: environ_conv 0.03% : 0.000036s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000014s : 1: label_micro_interleaved_index 0.87% : 0.000932s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.82% : 0.000874s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.03% : 0.000031s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000033s : 1: opt.transform.mutable_eliminate 1.44% : 0.001536s : 78: opt.transform.opt_a 0.04% : 0.000041s : 1: opt.transform.opt_after_cconv 0.04% : 0.000047s : 1: opt.transform.opt_after_jit_grad 0.14% : 0.000153s : 28: opt.transform.opt_b 0.07% : 0.000072s : 2: opt.transform.opt_trans_graph 0.05% : 0.000051s : 4: opt.transform.symbol_engine_opt 23.02% : 0.024630s : 1: opt_a 0.17% : 0.000182s : 1: opt_after_cconv 0.76% : 0.000817s : 1: opt_after_jit_grad 0.36% : 0.000382s : 1: opt_b 26.73% : 0.028600s : 1: optimize 0.03% : 0.000035s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000013s : 1: order_py_execute_after_rewriter 0.03% : 0.000034s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000012s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000011s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.05% : 0.000053s : 1: pre_auto_parallel 0.05% : 0.000049s : 1: py_interpret_to_execute 0.03% : 0.000031s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.02% : 0.000026s : 1: remove_dup_value 19.53% : 0.020889s : 1: renormalize.infer 0.53% : 0.000568s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000015s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000066s : 1: rewriter_after_opt_a 0.10% : 0.000106s : 1: rewriter_before_opt_a 0.01% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.11% : 0.000123s : 1: symbol_engine_optimizer 0.11% : 0.000121s : 1: tuple_transform 15.67% : 0.016767s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:45.926.552 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0175918, [21] [bootstrap]: 0.00044796 [type_inference]: 0.00667861 [event_method]: 2.17e-05 [auto_monad]: 6.641e-05 [graph_reusing]: 6.29001e-06 [inline]: 2.63e-06 [add_attr]: 0.00351913, [1] [add_attr_with_inline]: 0.00350849, [1] [Cycle 1]: 7.59e-05, [2] [tag_attr]: 2.32e-05 [meta_addattr_fg_expand]: 6.19001e-06 [parallel-infer-symbol]: 4.15e-06 [pre_auto_parallel]: 5.976e-05 [insert-virtual-dataset]: 2.82002e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 2.12999e-06 [pipeline_split]: 1.76e-06 [optimize]: 0.00587411, [53] [py_interpret_to_execute]: 2.987e-05 [rewriter_before_opt_a]: 9.552e-05 [opt_a]: 0.00325908, [2] [Cycle 1]: 0.00245035, [45] [expand_dump_flag]: 3.10002e-06 [switch_simplify]: 4.539e-05 [loop_unroll]: 3.083e-05 [a_1]: 0.00070193 [with_stream_mark]: 1.999e-05 [recompute_prepare]: 1.171e-05 [updatestate_depend_eliminate]: 4.69998e-06 [updatestate_assign_eliminate]: 3.81001e-06 [updatestate_loads_eliminate]: 3.68e-06 [parameter_eliminate]: 2.14e-06 [a_2]: 0.0001038 [accelerated_algorithm]: 8.87999e-06 [shard]: 2.04e-06 [meta_shard_fg_expand]: 2.34001e-06 [shard_inline]: 8.47e-06 [merge_send_recv]: 9.97001e-06 [auto_parallel]: 7.68999e-06 [parallel]: 2.095e-05 [flash_sp]: 1.019e-05 [merge_comm]: 4.99998e-06 [allreduce_fusion]: 4.16001e-06 [matmul_add_comm_reduction]: 1.003e-05 [allreduce_slice_to_reducescatter]: 7.99977e-07 [virtual_shard_identity]: 9.17001e-06 [virtual_dataset]: 8.07e-06 [get_grad_eliminate_]: 7.93999e-06 [virtual_output]: 7.78999e-06 [merge_forward]: 4.51002e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 1.231e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.763e-05 [merge_recompute_call_nodes]: 1.40999e-06 [before_grad]: 1.37e-05 [set_forward_comm_id_for_comm_node_pass]: 4.64998e-06 [meta_fg_expand]: 3.88999e-06 [flash_sp_send_recv_attached]: 3.03e-06 [receive_attached]: 2.53e-06 [after_resolve]: 1.35e-05 [a_after_grad]: 1.222e-05 [renormalize]: 0.00092412 [add_forward_monad_depend]: 7.11999e-06 [auto_monad_grad]: 2.53e-06 [auto_monad_eliminator]: 1.909e-05 [cse]: 3.699e-05 [a_3]: 6.103e-05 [Cycle 2]: 0.00079782, [45] [expand_dump_flag]: 2.32999e-06 [switch_simplify]: 9.94001e-06 [loop_unroll]: 8.20999e-06 [a_1]: 0.0001794 [with_stream_mark]: 1.778e-05 [recompute_prepare]: 8.40999e-06 [updatestate_depend_eliminate]: 4.63001e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 3.33998e-06 [parameter_eliminate]: 1.47999e-06 [a_2]: 9.341e-05 [accelerated_algorithm]: 7.71001e-06 [shard]: 2.07999e-06 [meta_shard_fg_expand]: 2.10002e-06 [shard_inline]: 7.47002e-06 [merge_send_recv]: 7.36999e-06 [auto_parallel]: 7.66001e-06 [parallel]: 5.63002e-06 [flash_sp]: 3.45998e-06 [merge_comm]: 4.56002e-06 [allreduce_fusion]: 4.22998e-06 [matmul_add_comm_reduction]: 7.49002e-06 [allreduce_slice_to_reducescatter]: 3.70026e-07 [virtual_shard_identity]: 8.72e-06 [virtual_dataset]: 7.35e-06 [get_grad_eliminate_]: 7.06999e-06 [virtual_output]: 6.83e-06 [merge_forward]: 3.66001e-06 [cell_reuse_recompute_pass]: 1.92999e-06 [offload_activation]: 8.54e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.522e-05 [merge_recompute_call_nodes]: 1.17e-06 [before_grad]: 1.198e-05 [set_forward_comm_id_for_comm_node_pass]: 4.12e-06 [meta_fg_expand]: 3.01001e-06 [flash_sp_send_recv_attached]: 1.10001e-06 [receive_attached]: 1.66e-06 [after_resolve]: 1.177e-05 [a_after_grad]: 1.133e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.87999e-06 [auto_monad_grad]: 8.50006e-07 [auto_monad_eliminator]: 1.044e-05 [cse]: 2.059e-05 [a_3]: 4.789e-05 [py_interpret_to_execute_after_opt_a]: 1.364e-05 [slice_cell_reuse_recomputed_activation]: 2.26e-06 [rewriter_after_opt_a]: 4.506e-05 [convert_after_rewriter]: 8.47e-06 [order_py_execute_after_rewriter]: 5.84e-06 [mutable_eliminate]: 0.00070409 [opt_b]: 0.00027474, [1] [Cycle 1]: 0.0002667, [7] [b_1]: 0.00016385 [b_2]: 1.029e-05 [updatestate_depend_eliminate]: 1.008e-05 [updatestate_assign_eliminate]: 3.58e-06 [updatestate_loads_eliminate]: 3.05002e-06 [renormalize]: 7.2e-07 [cse]: 3.671e-05 [optimize_parallel_all_gather_comm]: 2.213e-05 [overlap_param_gather]: 1.99999e-06 [cconv]: 3.656e-05 [loop_unroll]: 0.00058558 [opt_after_cconv]: 0.00013532, [1] [Cycle 1]: 0.00012662, [7] [c_1]: 4.006e-05 [parameter_eliminate]: 5.62001e-06 [updatestate_depend_eliminate]: 7.77e-06 [updatestate_assign_eliminate]: 3.08998e-06 [updatestate_loads_eliminate]: 2.96001e-06 [cse]: 2.966e-05 [renormalize]: 1.00999e-06 [remove_dup_value]: 1.759e-05 [tuple_transform]: 9.869e-05, [1] [Cycle 1]: 9.337e-05, [4] [d_1]: 6.14e-05 [none_parameter_eliminate]: 1.62001e-06 [renormalize]: 5.19998e-07 [switch_simplify]: 9.36998e-06 [partial_unused_args_eliminate]: 2.19999e-06 [add_recomputation]: 6.605e-05 [cse_after_recomputation]: 2.819e-05, [1] [Cycle 1]: 2.308e-05, [1] [cse]: 1.696e-05 [environ_conv]: 7.80998e-06 [swap_dp_allreduce_reducescatter]: 6.53e-06 [bias_add_comm_swap]: 3.41001e-06 [label_micro_interleaved_index]: 5.32001e-06 [label_fine_grained_interleaved_index]: 2.91999e-06 [merge_cast_opt]: 1.49e-06 [slice_recompute_activation]: 2.16e-06 [micro_interleaved_order_control]: 2.26e-06 [assign_add_opt]: 1.80001e-06 [ForceFp32Comm]: 9.70002e-07 [remove_cast_before_assign_add]: 1.14e-06 [full_micro_interleaved_order_control]: 2.04999e-06 [reorder_send_recv_between_fp_bp]: 3.00998e-06 [comm_op_add_attrs]: 1.40001e-06 [add_comm_op_reuse_tag]: 1.00999e-06 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.37999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.26998e-06 [control_data_broadcast_order]: 1.555e-05 [grouped_pairwise_exchange_alltoall]: 1.51998e-06 [offloading_packed_experts]: 5.25999e-06 [overlap_recompute_and_grad_model_parallel]: 5.80002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.52001e-06 [overlap_recompute_comm]: 2.38998e-06 [overlap_grad_ring_attention]: 4.63001e-06 [overlap_grad_flash_sp]: 2.458e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.33998e-06 [split_layernorm_comm]: 1.58002e-06 [handle_group_info]: 1.10999e-06 [symbol_engine_optimizer]: 9.025e-05, [1] [Cycle 1]: 8.575e-05, [6] [build]: 4.2e-06 [elim_shapecalc]: 1.214e-05 [elim_not_effective]: 1.722e-05 [opt_reshape]: 9.03002e-06 [fold_const_symbol]: 1.29e-05 [renormalize]: 2.59985e-07 [detach_backward]: 2.41e-06 [pipeline_parallel_scheduler]: 1.64e-06 [auto_monad_reorder]: 1.996e-05 [get_jit_bprop_graph]: 2.83998e-06 [rewriter_after_jit_bprop_graph]: 6.14001e-06 [opt_after_jit_grad]: 0.00064863 [validate]: 5.217e-05 Sums bootstrap : 0.000448s : 3.43% type_inference : 0.006679s : 51.17% event_method : 0.000022s : 0.17% auto_monad : 0.000066s : 0.51% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000060s : 0.46% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.23% optimize.rewriter_before_opt_a : 0.000096s : 0.73% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.42% optimize.opt_a.loop_unroll : 0.000039s : 0.30% optimize.opt_a.a_1 : 0.000881s : 6.75% optimize.opt_a.with_stream_mark : 0.000038s : 0.29% optimize.opt_a.recompute_prepare : 0.000020s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000197s : 1.51% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.12% optimize.opt_a.merge_send_recv : 0.000017s : 0.13% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000027s : 0.20% optimize.opt_a.flash_sp : 0.000014s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.07% optimize.opt_a.allreduce_fusion : 0.000008s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.14% optimize.opt_a.virtual_dataset : 0.000015s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.11% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000021s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.19% optimize.opt_a.a_after_grad : 0.000024s : 0.18% optimize.opt_a.renormalize : 0.000924s : 7.08% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.23% optimize.opt_a.cse : 0.000058s : 0.44% optimize.opt_a.a_3 : 0.000109s : 0.83% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000045s : 0.35% optimize.convert_after_rewriter : 0.000008s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.04% optimize.mutable_eliminate : 0.000704s : 5.39% optimize.opt_b.b_1 : 0.000164s : 1.26% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000037s : 0.28% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000037s : 0.28% optimize.loop_unroll : 0.000586s : 4.49% optimize.opt_after_cconv.c_1 : 0.000040s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000030s : 0.23% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.13% optimize.tuple_transform.d_1 : 0.000061s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000001s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000066s : 0.51% optimize.cse_after_recomputation.cse : 0.000017s : 0.13% optimize.environ_conv : 0.000008s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000025s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.15% get_jit_bprop_graph : 0.000003s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000649s : 4.97% validate : 0.000052s : 0.40% Time group info: ------[substitution.] 0.000235 38 11.99% : 0.000028s : 3: substitution.cast_eliminate 1.13% : 0.000003s : 3: substitution.elim_not_effective 0.91% : 0.000002s : 3: substitution.fold_const_symbol 3.28% : 0.000008s : 5: substitution.graph_param_transform 69.81% : 0.000164s : 4: substitution.inline 2.07% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.15% : 0.000007s : 6: substitution.remove_not_recompute_node 2.32% : 0.000005s : 4: substitution.replace_old_param 5.34% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006607 2 87.77% : 0.005799s : 1: type_inference.infer 12.23% : 0.000808s : 1: type_inference.specialize ------[replace.] 0.000064 8 63.10% : 0.000040s : 4: replace.inline 36.90% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000172 8 93.79% : 0.000162s : 4: match.inline 6.21% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000254 1504 0.82% : 0.000002s : 15: predicate.accumulaten_eliminater 0.98% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.60% : 0.000002s : 10: predicate.addn_check_dump 0.84% : 0.000002s : 15: predicate.addn_zero_filter 0.79% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.05% : 0.000005s : 25: predicate.arithmetic_simplify 1.08% : 0.000003s : 15: predicate.cast_eliminate 0.59% : 0.000001s : 10: predicate.check_bprop_eliminate 0.65% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.62% : 0.000002s : 10: predicate.depend_value_elim 0.88% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.95% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.20% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.31% : 0.000001s : 5: predicate.elim_not_effective 0.46% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_depend_swap 1.76% : 0.000004s : 30: predicate.environ_get_eliminate 1.02% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.26% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.27% : 0.000006s : 23: predicate.float_depend_g_call 0.55% : 0.000001s : 10: predicate.float_environ_get_switch 0.82% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 5: predicate.fold_const_symbol 0.78% : 0.000002s : 10: predicate.get_grad_eliminate 0.27% : 0.000001s : 5: predicate.graph_param_transform 0.62% : 0.000002s : 10: predicate.incorporate_call 0.53% : 0.000001s : 10: predicate.incorporate_call_switch 6.71% : 0.000017s : 68: predicate.inline 0.77% : 0.000002s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.89% : 0.000002s : 10: predicate.less_batch_normalization 1.80% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.39% : 0.000006s : 44: predicate.load_eliminater 1.34% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.12% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.85% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 0.59% : 0.000001s : 10: predicate.merge_addn 0.56% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.68% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 15: predicate.minmaximum_grad 1.38% : 0.000004s : 5: predicate.mutable_eliminate 0.40% : 0.000001s : 5: predicate.opt_reshape 0.47% : 0.000001s : 5: predicate.parallel_virtual_node 1.54% : 0.000004s : 23: predicate.partial_defer_inline 1.52% : 0.000004s : 24: predicate.partial_eliminate 0.85% : 0.000002s : 15: predicate.print_const_string_wrapper 0.56% : 0.000001s : 10: predicate.reduce_all_const_elim 1.39% : 0.000004s : 15: predicate.reduce_eliminate 2.36% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 10: predicate.remove_not_recompute_node 1.26% : 0.000003s : 29: predicate.replace_applicator 0.49% : 0.000001s : 10: predicate.replace_old_param 0.38% : 0.000001s : 5: predicate.reset_defer_inline 1.01% : 0.000003s : 15: predicate.reshape_eliminate 0.61% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 5: predicate.row_tensor_eliminate 0.90% : 0.000002s : 10: predicate.same_eliminate 0.42% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.82% : 0.000002s : 10: predicate.shard_identity_eliminate 0.69% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 0.84% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.91% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.39% : 0.000004s : 23: predicate.switch_defer_inline 1.97% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.78% : 0.000012s : 74: predicate.switch_simplify 0.81% : 0.000002s : 15: predicate.tile_eliminate 0.84% : 0.000002s : 15: predicate.transpose_eliminate 1.72% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.77% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.31% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.05% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.59% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.66% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.24% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.00% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 5: predicate.value_based_eliminate 0.74% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.56% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000611 11 53.90% : 0.000329s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.10% : 0.000282s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.029537 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.93% : 0.003525s : 1: add_attr 11.89% : 0.003513s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000071s : 1: add_recomputation 0.02% : 0.000005s : 1: assign_add_opt 0.25% : 0.000073s : 1: auto_monad 0.08% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.61% : 0.000476s : 1: bootstrap 0.14% : 0.000040s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.11% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.10% : 0.000029s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000006s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000007s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 2.01% : 0.000595s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.42% : 0.000715s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000022s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000022s : 1: opt.transform.mutable_eliminate 4.77% : 0.001409s : 78: opt.transform.opt_a 0.13% : 0.000038s : 1: opt.transform.opt_after_cconv 0.12% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.47% : 0.000140s : 28: opt.transform.opt_b 0.23% : 0.000068s : 2: opt.transform.opt_trans_graph 0.16% : 0.000047s : 4: opt.transform.symbol_engine_opt 11.05% : 0.003263s : 1: opt_a 0.47% : 0.000139s : 1: opt_after_cconv 2.24% : 0.000662s : 1: opt_after_jit_grad 0.94% : 0.000279s : 1: opt_b 19.91% : 0.005880s : 1: optimize 0.09% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.22% : 0.000065s : 1: pre_auto_parallel 0.12% : 0.000035s : 1: py_interpret_to_execute 0.06% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000021s : 1: remove_dup_value 1.82% : 0.000539s : 1: renormalize.infer 1.27% : 0.000375s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000050s : 1: rewriter_after_opt_a 0.34% : 0.000100s : 1: rewriter_before_opt_a 0.02% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.31% : 0.000093s : 1: symbol_engine_optimizer 0.35% : 0.000102s : 1: tuple_transform 22.69% : 0.006702s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:46.447.613 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:46.447.947 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0368333, [21] [bootstrap]: 0.00042026 [type_inference]: 0.0237805 [event_method]: 2.32e-05 [auto_monad]: 7.096e-05 [graph_reusing]: 6.54999e-06 [inline]: 2.33002e-06 [add_attr]: 0.00372467, [1] [add_attr_with_inline]: 0.00371219, [1] [Cycle 1]: 0.00010096, [2] [tag_attr]: 2.499e-05 [meta_addattr_fg_expand]: 6.86001e-06 [parallel-infer-symbol]: 3.71001e-06 [pre_auto_parallel]: 4.412e-05 [insert-virtual-dataset]: 2.61e-06 [parallel-infer-symbol-second]: 8.70001e-07 [dataset_repeat_opt]: 2.39999e-06 [pipeline_split]: 2.02999e-06 [optimize]: 0.00711697, [53] [py_interpret_to_execute]: 4.456e-05 [rewriter_before_opt_a]: 0.00011327 [opt_a]: 0.00409722, [2] [Cycle 1]: 0.00294639, [45] [expand_dump_flag]: 3.26999e-06 [switch_simplify]: 4.672e-05 [loop_unroll]: 3.231e-05 [a_1]: 0.00077012 [with_stream_mark]: 2.358e-05 [recompute_prepare]: 1.423e-05 [updatestate_depend_eliminate]: 6.47001e-06 [updatestate_assign_eliminate]: 5.04e-06 [updatestate_loads_eliminate]: 3.98001e-06 [parameter_eliminate]: 2.07999e-06 [a_2]: 0.00015737 [accelerated_algorithm]: 9.59999e-06 [shard]: 1.82001e-06 [meta_shard_fg_expand]: 2.57001e-06 [shard_inline]: 9.24998e-06 [merge_send_recv]: 1.143e-05 [auto_parallel]: 1.038e-05 [parallel]: 2.102e-05 [flash_sp]: 1.152e-05 [merge_comm]: 5.65001e-06 [allreduce_fusion]: 5.22e-06 [matmul_add_comm_reduction]: 1.191e-05 [allreduce_slice_to_reducescatter]: 6.79982e-07 [virtual_shard_identity]: 1.181e-05 [virtual_dataset]: 9.31002e-06 [get_grad_eliminate_]: 9.30001e-06 [virtual_output]: 9.34e-06 [merge_forward]: 5.91998e-06 [cell_reuse_recompute_pass]: 1.48002e-06 [offload_activation]: 1.333e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.097e-05 [merge_recompute_call_nodes]: 1.66002e-06 [before_grad]: 1.58e-05 [set_forward_comm_id_for_comm_node_pass]: 5.32999e-06 [meta_fg_expand]: 3.98001e-06 [flash_sp_send_recv_attached]: 2.71e-06 [receive_attached]: 2.64999e-06 [after_resolve]: 1.497e-05 [a_after_grad]: 1.398e-05 [renormalize]: 0.00102554 [add_forward_monad_depend]: 8.52e-06 [auto_monad_grad]: 3.16999e-06 [auto_monad_eliminator]: 2.249e-05 [cse]: 4.648e-05 [a_3]: 8.577e-05 [Cycle 2]: 0.00113334, [45] [expand_dump_flag]: 2.03002e-06 [switch_simplify]: 1.119e-05 [loop_unroll]: 9.10999e-06 [a_1]: 0.0002265 [with_stream_mark]: 1.706e-05 [recompute_prepare]: 9.59e-06 [updatestate_depend_eliminate]: 4.66002e-06 [updatestate_assign_eliminate]: 4.05e-06 [updatestate_loads_eliminate]: 3.77002e-06 [parameter_eliminate]: 1.69e-06 [a_2]: 0.0001413 [accelerated_algorithm]: 9.08002e-06 [shard]: 1.96e-06 [meta_shard_fg_expand]: 2.70997e-06 [shard_inline]: 9.04e-06 [merge_send_recv]: 9.64999e-06 [auto_parallel]: 9.86e-06 [parallel]: 7.35e-06 [flash_sp]: 3.97e-06 [merge_comm]: 5.40001e-06 [allreduce_fusion]: 4.68001e-06 [matmul_add_comm_reduction]: 9.72999e-06 [allreduce_slice_to_reducescatter]: 5.69999e-07 [virtual_shard_identity]: 9.99999e-06 [virtual_dataset]: 8.92e-06 [get_grad_eliminate_]: 9.01002e-06 [virtual_output]: 9.60001e-06 [merge_forward]: 4.96002e-06 [cell_reuse_recompute_pass]: 2.58e-06 [offload_activation]: 1.106e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.031e-05 [merge_recompute_call_nodes]: 1.32e-06 [before_grad]: 1.533e-05 [set_forward_comm_id_for_comm_node_pass]: 6.05002e-06 [meta_fg_expand]: 3.94002e-06 [flash_sp_send_recv_attached]: 1.92999e-06 [receive_attached]: 2.13002e-06 [after_resolve]: 1.518e-05 [a_after_grad]: 1.358e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.12001e-06 [auto_monad_grad]: 1.12e-06 [auto_monad_eliminator]: 1.308e-05 [cse]: 2.921e-05 [a_3]: 8.883e-05 [py_interpret_to_execute_after_opt_a]: 2.146e-05 [slice_cell_reuse_recomputed_activation]: 5.40999e-06 [rewriter_after_opt_a]: 6.069e-05 [convert_after_rewriter]: 1.303e-05 [order_py_execute_after_rewriter]: 9.66e-06 [mutable_eliminate]: 0.0007611 [opt_b]: 0.00037339, [1] [Cycle 1]: 0.00036193, [7] [b_1]: 0.00023704 [b_2]: 1.137e-05 [updatestate_depend_eliminate]: 9.56e-06 [updatestate_assign_eliminate]: 4.10998e-06 [updatestate_loads_eliminate]: 4e-06 [renormalize]: 1.27999e-06 [cse]: 3.561e-05 [optimize_parallel_all_gather_comm]: 2.521e-05 [overlap_param_gather]: 5.16002e-06 [cconv]: 3.664e-05 [loop_unroll]: 0.00050477 [opt_after_cconv]: 0.00016664, [1] [Cycle 1]: 0.00015617, [7] [c_1]: 4.574e-05 [parameter_eliminate]: 5.05999e-06 [updatestate_depend_eliminate]: 8.32e-06 [updatestate_assign_eliminate]: 3.9e-06 [updatestate_loads_eliminate]: 3.61001e-06 [cse]: 2.826e-05 [renormalize]: 9.70002e-07 [remove_dup_value]: 5.732e-05 [tuple_transform]: 0.00012234, [1] [Cycle 1]: 0.00011399, [4] [d_1]: 6.874e-05 [none_parameter_eliminate]: 1.73002e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 9.74999e-06 [partial_unused_args_eliminate]: 5.05001e-06 [add_recomputation]: 7.201e-05 [cse_after_recomputation]: 3.876e-05, [1] [Cycle 1]: 3.009e-05, [1] [cse]: 2.013e-05 [environ_conv]: 1.169e-05 [swap_dp_allreduce_reducescatter]: 1.002e-05 [bias_add_comm_swap]: 7.8e-06 [label_micro_interleaved_index]: 8.35001e-06 [label_fine_grained_interleaved_index]: 5.74999e-06 [merge_cast_opt]: 4.33001e-06 [slice_recompute_activation]: 4.92999e-06 [micro_interleaved_order_control]: 5.46e-06 [assign_add_opt]: 4.2e-06 [ForceFp32Comm]: 4.22e-06 [remove_cast_before_assign_add]: 4.00998e-06 [full_micro_interleaved_order_control]: 4.86002e-06 [reorder_send_recv_between_fp_bp]: 5.40999e-06 [comm_op_add_attrs]: 3.78001e-06 [add_comm_op_reuse_tag]: 3.63e-06 [interleave_split_concat_branches]: 3.78999e-06 [interleave_parallel_branches]: 3.71999e-06 [overlap_opt_shard_in_pipeline]: 4.15e-06 [overlap_opt_shard_grad_in_pipeline]: 4.58001e-06 [control_data_broadcast_order]: 2.44e-05 [grouped_pairwise_exchange_alltoall]: 4.86002e-06 [offloading_packed_experts]: 8.97e-06 [overlap_recompute_and_grad_model_parallel]: 8.94e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.63e-06 [overlap_recompute_allgather_and_fa_grad]: 3.65e-06 [overlap_recompute_comm]: 5.24e-06 [overlap_grad_ring_attention]: 7.78999e-06 [overlap_grad_flash_sp]: 3.18e-05 [begin_end_overlap_inline]: 3.71001e-06 [split_matmul_comm_elemetwise]: 5.32001e-06 [split_layernorm_comm]: 4.67998e-06 [handle_group_info]: 3.53e-06 [symbol_engine_optimizer]: 0.0001249, [1] [Cycle 1]: 0.00011653, [6] [build]: 4.72e-06 [elim_shapecalc]: 1.722e-05 [elim_not_effective]: 1.824e-05 [opt_reshape]: 1.012e-05 [fold_const_symbol]: 1.518e-05 [renormalize]: 2.09984e-07 [detach_backward]: 4.59998e-06 [pipeline_parallel_scheduler]: 2.21e-06 [auto_monad_reorder]: 2.773e-05 [get_jit_bprop_graph]: 2.15002e-06 [rewriter_after_jit_bprop_graph]: 7.51001e-06 [opt_after_jit_grad]: 0.00072345 [validate]: 5.563e-05 Sums bootstrap : 0.000420s : 1.36% type_inference : 0.023780s : 76.68% event_method : 0.000023s : 0.07% auto_monad : 0.000071s : 0.23% graph_reusing : 0.000007s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000044s : 0.14% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000045s : 0.14% optimize.rewriter_before_opt_a : 0.000113s : 0.37% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000058s : 0.19% optimize.opt_a.loop_unroll : 0.000041s : 0.13% optimize.opt_a.a_1 : 0.000997s : 3.21% optimize.opt_a.with_stream_mark : 0.000041s : 0.13% optimize.opt_a.recompute_prepare : 0.000024s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000299s : 0.96% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.06% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000018s : 0.06% optimize.opt_a.merge_send_recv : 0.000021s : 0.07% optimize.opt_a.auto_parallel : 0.000020s : 0.07% optimize.opt_a.parallel : 0.000028s : 0.09% optimize.opt_a.flash_sp : 0.000015s : 0.05% optimize.opt_a.merge_comm : 0.000011s : 0.04% optimize.opt_a.allreduce_fusion : 0.000010s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.07% optimize.opt_a.virtual_dataset : 0.000018s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.06% optimize.opt_a.virtual_output : 0.000019s : 0.06% optimize.opt_a.merge_forward : 0.000011s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000024s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000041s : 0.13% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000031s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.04% optimize.opt_a.meta_fg_expand : 0.000008s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000030s : 0.10% optimize.opt_a.a_after_grad : 0.000028s : 0.09% optimize.opt_a.renormalize : 0.001026s : 3.31% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000036s : 0.11% optimize.opt_a.cse : 0.000076s : 0.24% optimize.opt_a.a_3 : 0.000175s : 0.56% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000061s : 0.20% optimize.convert_after_rewriter : 0.000013s : 0.04% optimize.order_py_execute_after_rewriter : 0.000010s : 0.03% optimize.mutable_eliminate : 0.000761s : 2.45% optimize.opt_b.b_1 : 0.000237s : 0.76% optimize.opt_b.b_2 : 0.000011s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000036s : 0.11% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.08% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000037s : 0.12% optimize.loop_unroll : 0.000505s : 1.63% optimize.opt_after_cconv.c_1 : 0.000046s : 0.15% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000028s : 0.09% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000057s : 0.18% optimize.tuple_transform.d_1 : 0.000069s : 0.22% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000072s : 0.23% optimize.cse_after_recomputation.cse : 0.000020s : 0.06% optimize.environ_conv : 0.000012s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.03% optimize.bias_add_comm_swap : 0.000008s : 0.03% optimize.label_micro_interleaved_index : 0.000008s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000024s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.02% optimize.offloading_packed_experts : 0.000009s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000008s : 0.03% optimize.overlap_grad_flash_sp : 0.000032s : 0.10% optimize.begin_end_overlap_inline : 0.000004s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000005s : 0.02% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000028s : 0.09% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000008s : 0.02% opt_after_jit_grad : 0.000723s : 2.33% validate : 0.000056s : 0.18% Time group info: ------[substitution.] 0.000264 48 14.61% : 0.000039s : 6: substitution.cast_eliminate 1.00% : 0.000003s : 4: substitution.elim_not_effective 0.71% : 0.000002s : 4: substitution.fold_const_symbol 3.28% : 0.000009s : 6: substitution.graph_param_transform 67.46% : 0.000178s : 4: substitution.inline 2.45% : 0.000006s : 8: substitution.j_node_and_user_rematch 3.07% : 0.000008s : 8: substitution.remove_not_recompute_node 2.18% : 0.000006s : 4: substitution.replace_old_param 5.24% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007669 2 89.22% : 0.006842s : 1: type_inference.infer 10.78% : 0.000827s : 1: type_inference.specialize ------[replace.] 0.000066 8 62.70% : 0.000042s : 4: replace.inline 37.30% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000187 8 93.65% : 0.000175s : 4: match.inline 6.35% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000298 1730 1.10% : 0.000003s : 17: predicate.accumulaten_eliminater 1.19% : 0.000004s : 6: predicate.ad_related_special_op_eliminate 0.61% : 0.000002s : 12: predicate.addn_check_dump 0.87% : 0.000003s : 17: predicate.addn_zero_filter 0.80% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.02% : 0.000006s : 29: predicate.arithmetic_simplify 0.96% : 0.000003s : 17: predicate.cast_eliminate 0.71% : 0.000002s : 12: predicate.check_bprop_eliminate 0.64% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.65% : 0.000002s : 12: predicate.depend_value_elim 1.08% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.98% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.91% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.13% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.18% : 0.000001s : 6: predicate.elim_not_effective 0.68% : 0.000002s : 6: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.31% : 0.000004s : 23: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 23: predicate.environ_get_depend_swap 1.91% : 0.000006s : 35: predicate.environ_get_eliminate 1.12% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.28% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.06% : 0.000006s : 25: predicate.float_depend_g_call 0.61% : 0.000002s : 12: predicate.float_environ_get_switch 0.94% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 6: predicate.fold_const_symbol 0.75% : 0.000002s : 12: predicate.get_grad_eliminate 0.22% : 0.000001s : 6: predicate.graph_param_transform 0.63% : 0.000002s : 12: predicate.incorporate_call 0.58% : 0.000002s : 12: predicate.incorporate_call_switch 6.02% : 0.000018s : 78: predicate.inline 0.84% : 0.000002s : 12: predicate.inline_without_move 0.33% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.81% : 0.000002s : 12: predicate.less_batch_normalization 1.80% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.32% : 0.000007s : 50: predicate.load_eliminater 1.05% : 0.000003s : 6: predicate.loop_unroll_after_grad 1.86% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.63% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.62% : 0.000002s : 12: predicate.merge_addn 0.68% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.61% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.86% : 0.000003s : 17: predicate.minmaximum_grad 1.08% : 0.000003s : 6: predicate.mutable_eliminate 0.36% : 0.000001s : 6: predicate.opt_reshape 0.45% : 0.000001s : 6: predicate.parallel_virtual_node 1.77% : 0.000005s : 25: predicate.partial_defer_inline 1.54% : 0.000005s : 27: predicate.partial_eliminate 0.91% : 0.000003s : 17: predicate.print_const_string_wrapper 0.64% : 0.000002s : 12: predicate.reduce_all_const_elim 1.21% : 0.000004s : 17: predicate.reduce_eliminate 2.64% : 0.000008s : 50: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 12: predicate.remove_not_recompute_node 1.25% : 0.000004s : 33: predicate.replace_applicator 0.51% : 0.000002s : 12: predicate.replace_old_param 0.24% : 0.000001s : 6: predicate.reset_defer_inline 0.88% : 0.000003s : 17: predicate.reshape_eliminate 0.72% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 6: predicate.row_tensor_eliminate 0.99% : 0.000003s : 12: predicate.same_eliminate 0.45% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.77% : 0.000002s : 12: predicate.shard_identity_eliminate 0.69% : 0.000002s : 12: predicate.special_op_eliminate 0.85% : 0.000003s : 12: predicate.specialize_transform 1.09% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000003s : 12: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.27% : 0.000004s : 25: predicate.switch_defer_inline 1.87% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.39% : 0.000013s : 81: predicate.switch_simplify 0.83% : 0.000002s : 17: predicate.tile_eliminate 0.92% : 0.000003s : 17: predicate.transpose_eliminate 1.58% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.70% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 2.47% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.78% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.40% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.02% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 6: predicate.value_based_eliminate 0.69% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.86% : 0.000003s : 12: predicate.virtual_output_eliminate 0.29% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000695 11 56.18% : 0.000390s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.82% : 0.000304s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.050614 192 0.02% : 0.000008s : 1: ForceFp32Comm 7.38% : 0.003737s : 1: add_attr 7.34% : 0.003716s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.15% : 0.000076s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.16% : 0.000081s : 1: auto_monad 0.07% : 0.000037s : 1: auto_monad_reorder 0.01% : 0.000007s : 1: begin_end_overlap_inline 0.02% : 0.000011s : 1: bias_add_comm_swap 0.92% : 0.000465s : 1: bootstrap 0.08% : 0.000040s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.06% : 0.000028s : 1: control_data_broadcast_order 0.03% : 0.000016s : 1: convert_after_rewriter 0.08% : 0.000042s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.05% : 0.000027s : 1: detach_backward 0.03% : 0.000015s : 1: environ_conv 0.07% : 0.000035s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.02% : 0.000009s : 1: get_jit_bprop_graph 0.03% : 0.000013s : 1: graph_reusing 0.02% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000007s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.02% : 0.000009s : 1: label_fine_grained_interleaved_index 0.02% : 0.000011s : 1: label_micro_interleaved_index 1.01% : 0.000512s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 1.52% : 0.000769s : 1: mutable_eliminate 0.02% : 0.000012s : 1: offloading_packed_experts 0.04% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000023s : 1: opt.transform.mutable_eliminate 3.26% : 0.001650s : 78: opt.transform.opt_a 0.09% : 0.000044s : 1: opt.transform.opt_after_cconv 0.09% : 0.000043s : 1: opt.transform.opt_after_jit_grad 0.34% : 0.000171s : 28: opt.transform.opt_b 0.15% : 0.000076s : 2: opt.transform.opt_trans_graph 0.11% : 0.000056s : 4: opt.transform.symbol_engine_opt 8.10% : 0.004101s : 1: opt_a 0.34% : 0.000171s : 1: opt_after_cconv 1.46% : 0.000738s : 1: opt_after_jit_grad 0.75% : 0.000377s : 1: opt_b 15.09% : 0.007639s : 1: optimize 0.06% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000013s : 1: order_py_execute_after_rewriter 0.07% : 0.000036s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000011s : 1: pipeline_parallel_scheduler 0.02% : 0.000008s : 1: pipeline_split 0.10% : 0.000052s : 1: pre_auto_parallel 0.10% : 0.000050s : 1: py_interpret_to_execute 0.05% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.12% : 0.000062s : 1: remove_dup_value 1.14% : 0.000577s : 1: renormalize.infer 0.87% : 0.000438s : 1: renormalize.specialize 0.02% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.13% : 0.000065s : 1: rewriter_after_opt_a 0.23% : 0.000118s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.03% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.25% : 0.000128s : 1: symbol_engine_optimizer 0.25% : 0.000125s : 1: tuple_transform 47.09% : 0.023832s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:47.502.95 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0742167, [21] [bootstrap]: 0.00042325 [type_inference]: 0.0475407 [event_method]: 2.162e-05 [auto_monad]: 7.091e-05 [graph_reusing]: 6.07999e-06 [inline]: 3.00002e-06 [add_attr]: 0.00346814, [1] [add_attr_with_inline]: 0.00345665, [1] [Cycle 1]: 6.887e-05, [2] [tag_attr]: 2.333e-05 [meta_addattr_fg_expand]: 6.28998e-06 [parallel-infer-symbol]: 3.83999e-06 [pre_auto_parallel]: 3.957e-05 [insert-virtual-dataset]: 2.97002e-06 [parallel-infer-symbol-second]: 1.10001e-06 [dataset_repeat_opt]: 1.88002e-06 [pipeline_split]: 1.95001e-06 [optimize]: 0.0213712, [53] [py_interpret_to_execute]: 3.056e-05 [rewriter_before_opt_a]: 9.578e-05 [opt_a]: 0.00344826, [2] [Cycle 1]: 0.0025494, [45] [expand_dump_flag]: 3.16999e-06 [switch_simplify]: 4.534e-05 [loop_unroll]: 3.25e-05 [a_1]: 0.00073384 [with_stream_mark]: 1.918e-05 [recompute_prepare]: 1.268e-05 [updatestate_depend_eliminate]: 6.00002e-06 [updatestate_assign_eliminate]: 4.32e-06 [updatestate_loads_eliminate]: 4.15e-06 [parameter_eliminate]: 2.31e-06 [a_2]: 0.00012307 [accelerated_algorithm]: 1.07e-05 [shard]: 2.12999e-06 [meta_shard_fg_expand]: 2.40002e-06 [shard_inline]: 9.14998e-06 [merge_send_recv]: 1.036e-05 [auto_parallel]: 8.72e-06 [parallel]: 1.831e-05 [flash_sp]: 9.82999e-06 [merge_comm]: 5.65001e-06 [allreduce_fusion]: 5.10999e-06 [matmul_add_comm_reduction]: 1.152e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 1.122e-05 [virtual_dataset]: 9.38002e-06 [get_grad_eliminate_]: 8.77999e-06 [virtual_output]: 8.79e-06 [merge_forward]: 5.60001e-06 [cell_reuse_recompute_pass]: 1.35999e-06 [offload_activation]: 1.197e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.831e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.543e-05 [set_forward_comm_id_for_comm_node_pass]: 5.52999e-06 [meta_fg_expand]: 4.33999e-06 [flash_sp_send_recv_attached]: 2.59999e-06 [receive_attached]: 2.37001e-06 [after_resolve]: 1.463e-05 [a_after_grad]: 1.394e-05 [renormalize]: 0.00092664 [add_forward_monad_depend]: 6.13002e-06 [auto_monad_grad]: 2.34001e-06 [auto_monad_eliminator]: 2.045e-05 [cse]: 4.569e-05 [a_3]: 6.861e-05 [Cycle 2]: 0.00088828, [45] [expand_dump_flag]: 1.65001e-06 [switch_simplify]: 1.102e-05 [loop_unroll]: 9.04e-06 [a_1]: 0.00021469 [with_stream_mark]: 1.649e-05 [recompute_prepare]: 9.92999e-06 [updatestate_depend_eliminate]: 4.62e-06 [updatestate_assign_eliminate]: 3.52002e-06 [updatestate_loads_eliminate]: 3.65e-06 [parameter_eliminate]: 1.69998e-06 [a_2]: 0.00011437 [accelerated_algorithm]: 9.07001e-06 [shard]: 2.16e-06 [meta_shard_fg_expand]: 1.90001e-06 [shard_inline]: 9.05001e-06 [merge_send_recv]: 7.76001e-06 [auto_parallel]: 7.98001e-06 [parallel]: 5.74e-06 [flash_sp]: 3.90998e-06 [merge_comm]: 4.81002e-06 [allreduce_fusion]: 4.43999e-06 [matmul_add_comm_reduction]: 9.89001e-06 [allreduce_slice_to_reducescatter]: 5.3001e-07 [virtual_shard_identity]: 9.86e-06 [virtual_dataset]: 8.76002e-06 [get_grad_eliminate_]: 8.03001e-06 [virtual_output]: 8.15e-06 [merge_forward]: 5.04998e-06 [cell_reuse_recompute_pass]: 2.31998e-06 [offload_activation]: 1.041e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.72e-05 [merge_recompute_call_nodes]: 1.12999e-06 [before_grad]: 1.405e-05 [set_forward_comm_id_for_comm_node_pass]: 5.27999e-06 [meta_fg_expand]: 3.3e-06 [flash_sp_send_recv_attached]: 1.24e-06 [receive_attached]: 1.49e-06 [after_resolve]: 1.369e-05 [a_after_grad]: 1.437e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.48002e-06 [auto_monad_grad]: 1.52001e-06 [auto_monad_eliminator]: 1.073e-05 [cse]: 2.501e-05 [a_3]: 5.533e-05 [py_interpret_to_execute_after_opt_a]: 1.375e-05 [slice_cell_reuse_recomputed_activation]: 2.07001e-06 [rewriter_after_opt_a]: 4.96e-05 [convert_after_rewriter]: 8.67998e-06 [order_py_execute_after_rewriter]: 6.51999e-06 [mutable_eliminate]: 0.00063408 [opt_b]: 0.00029815, [1] [Cycle 1]: 0.00028988, [7] [b_1]: 0.00019075 [b_2]: 1.246e-05 [updatestate_depend_eliminate]: 8.77e-06 [updatestate_assign_eliminate]: 3.76999e-06 [updatestate_loads_eliminate]: 3.65998e-06 [renormalize]: 7.2e-07 [cse]: 3.262e-05 [optimize_parallel_all_gather_comm]: 2.088e-05 [overlap_param_gather]: 1.79e-06 [cconv]: 2.933e-05 [loop_unroll]: 0.0004539 [opt_after_cconv]: 0.00013415, [1] [Cycle 1]: 0.00012763, [7] [c_1]: 4.308e-05 [parameter_eliminate]: 3.83999e-06 [updatestate_depend_eliminate]: 6.88e-06 [updatestate_assign_eliminate]: 3.81999e-06 [updatestate_loads_eliminate]: 3.61001e-06 [cse]: 2.978e-05 [renormalize]: 2.50002e-07 [remove_dup_value]: 4.518e-05 [tuple_transform]: 0.0001015, [1] [Cycle 1]: 9.665e-05, [4] [d_1]: 6.436e-05 [none_parameter_eliminate]: 1.87999e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 9.82999e-06 [partial_unused_args_eliminate]: 1.88997e-06 [add_recomputation]: 6.583e-05 [cse_after_recomputation]: 2.844e-05, [1] [Cycle 1]: 2.36e-05, [1] [cse]: 1.705e-05 [environ_conv]: 6.96001e-06 [swap_dp_allreduce_reducescatter]: 7.08e-06 [bias_add_comm_swap]: 2.98e-06 [label_micro_interleaved_index]: 5.09e-06 [label_fine_grained_interleaved_index]: 2.69999e-06 [merge_cast_opt]: 1.30999e-06 [slice_recompute_activation]: 2.14999e-06 [micro_interleaved_order_control]: 2.19001e-06 [assign_add_opt]: 1.32e-06 [ForceFp32Comm]: 8.70001e-07 [remove_cast_before_assign_add]: 1.05999e-06 [full_micro_interleaved_order_control]: 2.41e-06 [reorder_send_recv_between_fp_bp]: 3.03e-06 [comm_op_add_attrs]: 1.01002e-06 [add_comm_op_reuse_tag]: 1.19998e-06 [interleave_split_concat_branches]: 1.09e-06 [interleave_parallel_branches]: 1.36002e-06 [overlap_opt_shard_in_pipeline]: 1.52999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.01e-06 [control_data_broadcast_order]: 1.758e-05 [grouped_pairwise_exchange_alltoall]: 1.60999e-06 [offloading_packed_experts]: 4.73001e-06 [overlap_recompute_and_grad_model_parallel]: 5.78997e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.29e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.06e-06 [overlap_grad_ring_attention]: 5.02999e-06 [overlap_grad_flash_sp]: 2.63e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 2.14e-06 [split_layernorm_comm]: 1.82999e-06 [handle_group_info]: 9.79984e-07 [symbol_engine_optimizer]: 0.00010224, [1] [Cycle 1]: 9.675e-05, [6] [build]: 4.82998e-06 [elim_shapecalc]: 1.668e-05 [elim_not_effective]: 1.813e-05 [opt_reshape]: 9.62001e-06 [fold_const_symbol]: 1.432e-05 [renormalize]: 2.00002e-07 [detach_backward]: 5.23002e-06 [pipeline_parallel_scheduler]: 2.09e-06 [auto_monad_reorder]: 5.124e-05 [get_jit_bprop_graph]: 2.54001e-06 [rewriter_after_jit_bprop_graph]: 1.334e-05 [opt_after_jit_grad]: 0.00089612 [validate]: 6.324e-05 Sums bootstrap : 0.000423s : 0.78% type_inference : 0.047541s : 87.68% event_method : 0.000022s : 0.04% auto_monad : 0.000071s : 0.13% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000040s : 0.07% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000031s : 0.06% optimize.rewriter_before_opt_a : 0.000096s : 0.18% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000056s : 0.10% optimize.opt_a.loop_unroll : 0.000042s : 0.08% optimize.opt_a.a_1 : 0.000949s : 1.75% optimize.opt_a.with_stream_mark : 0.000036s : 0.07% optimize.opt_a.recompute_prepare : 0.000023s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000237s : 0.44% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.04% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000018s : 0.03% optimize.opt_a.merge_send_recv : 0.000018s : 0.03% optimize.opt_a.auto_parallel : 0.000017s : 0.03% optimize.opt_a.parallel : 0.000024s : 0.04% optimize.opt_a.flash_sp : 0.000014s : 0.03% optimize.opt_a.merge_comm : 0.000010s : 0.02% optimize.opt_a.allreduce_fusion : 0.000010s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.04% optimize.opt_a.virtual_dataset : 0.000018s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.03% optimize.opt_a.virtual_output : 0.000017s : 0.03% optimize.opt_a.merge_forward : 0.000011s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000022s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000029s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.02% optimize.opt_a.meta_fg_expand : 0.000008s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000028s : 0.05% optimize.opt_a.a_after_grad : 0.000028s : 0.05% optimize.opt_a.renormalize : 0.000927s : 1.71% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.06% optimize.opt_a.cse : 0.000071s : 0.13% optimize.opt_a.a_3 : 0.000124s : 0.23% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000050s : 0.09% optimize.convert_after_rewriter : 0.000009s : 0.02% optimize.order_py_execute_after_rewriter : 0.000007s : 0.01% optimize.mutable_eliminate : 0.000634s : 1.17% optimize.opt_b.b_1 : 0.000191s : 0.35% optimize.opt_b.b_2 : 0.000012s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000033s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000029s : 0.05% optimize.loop_unroll : 0.000454s : 0.84% optimize.opt_after_cconv.c_1 : 0.000043s : 0.08% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000030s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000045s : 0.08% optimize.tuple_transform.d_1 : 0.000064s : 0.12% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000066s : 0.12% optimize.cse_after_recomputation.cse : 0.000017s : 0.03% optimize.environ_conv : 0.000007s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000026s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000051s : 0.09% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000013s : 0.02% opt_after_jit_grad : 0.000896s : 1.65% validate : 0.000063s : 0.12% Time group info: ------[substitution.] 0.000238 48 15.01% : 0.000036s : 6: substitution.cast_eliminate 1.05% : 0.000002s : 4: substitution.elim_not_effective 0.83% : 0.000002s : 4: substitution.fold_const_symbol 3.23% : 0.000008s : 6: substitution.graph_param_transform 66.39% : 0.000158s : 4: substitution.inline 2.30% : 0.000005s : 8: substitution.j_node_and_user_rematch 3.33% : 0.000008s : 8: substitution.remove_not_recompute_node 2.25% : 0.000005s : 4: substitution.replace_old_param 5.61% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.047470 2 98.32% : 0.046674s : 1: type_inference.infer 1.68% : 0.000796s : 1: type_inference.specialize ------[replace.] 0.000063 8 62.76% : 0.000039s : 4: replace.inline 37.24% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000167 8 93.10% : 0.000155s : 4: match.inline 6.90% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000283 1730 0.94% : 0.000003s : 17: predicate.accumulaten_eliminater 1.35% : 0.000004s : 6: predicate.ad_related_special_op_eliminate 0.60% : 0.000002s : 12: predicate.addn_check_dump 0.99% : 0.000003s : 17: predicate.addn_zero_filter 0.83% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.10% : 0.000006s : 29: predicate.arithmetic_simplify 1.18% : 0.000003s : 17: predicate.cast_eliminate 0.64% : 0.000002s : 12: predicate.check_bprop_eliminate 0.67% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.65% : 0.000002s : 12: predicate.depend_value_elim 0.89% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.00% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.14% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 6: predicate.elim_not_effective 0.57% : 0.000002s : 6: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.24% : 0.000004s : 23: predicate.environ_get_add_eliminate 1.25% : 0.000004s : 23: predicate.environ_get_depend_swap 1.84% : 0.000005s : 35: predicate.environ_get_eliminate 1.12% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.27% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.01% : 0.000006s : 25: predicate.float_depend_g_call 0.58% : 0.000002s : 12: predicate.float_environ_get_switch 0.83% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.71% : 0.000002s : 12: predicate.get_grad_eliminate 0.24% : 0.000001s : 6: predicate.graph_param_transform 0.68% : 0.000002s : 12: predicate.incorporate_call 0.57% : 0.000002s : 12: predicate.incorporate_call_switch 5.91% : 0.000017s : 78: predicate.inline 1.15% : 0.000003s : 12: predicate.inline_without_move 0.35% : 0.000001s : 12: predicate.j_node_and_user_rematch 1.07% : 0.000003s : 12: predicate.less_batch_normalization 1.91% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.54% : 0.000007s : 50: predicate.load_eliminater 1.01% : 0.000003s : 6: predicate.loop_unroll_after_grad 1.97% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.48% : 0.000004s : 29: predicate.make_slice_get_slice_eliminator 0.63% : 0.000002s : 12: predicate.merge_addn 0.67% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.68% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 17: predicate.minmaximum_grad 0.83% : 0.000002s : 6: predicate.mutable_eliminate 0.38% : 0.000001s : 6: predicate.opt_reshape 0.38% : 0.000001s : 6: predicate.parallel_virtual_node 1.49% : 0.000004s : 25: predicate.partial_defer_inline 1.56% : 0.000004s : 27: predicate.partial_eliminate 1.00% : 0.000003s : 17: predicate.print_const_string_wrapper 0.63% : 0.000002s : 12: predicate.reduce_all_const_elim 1.14% : 0.000003s : 17: predicate.reduce_eliminate 2.40% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 12: predicate.remove_not_recompute_node 1.25% : 0.000004s : 33: predicate.replace_applicator 0.58% : 0.000002s : 12: predicate.replace_old_param 0.24% : 0.000001s : 6: predicate.reset_defer_inline 0.98% : 0.000003s : 17: predicate.reshape_eliminate 0.70% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 6: predicate.row_tensor_eliminate 0.85% : 0.000002s : 12: predicate.same_eliminate 0.51% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.77% : 0.000002s : 12: predicate.shard_identity_eliminate 0.80% : 0.000002s : 12: predicate.special_op_eliminate 0.85% : 0.000002s : 12: predicate.specialize_transform 0.82% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.32% : 0.000004s : 25: predicate.switch_defer_inline 1.99% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.62% : 0.000013s : 81: predicate.switch_simplify 0.86% : 0.000002s : 17: predicate.tile_eliminate 0.94% : 0.000003s : 17: predicate.transpose_eliminate 1.56% : 0.000004s : 29: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.21% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.50% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.45% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.74% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.37% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.02% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 6: predicate.value_based_eliminate 0.69% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.68% : 0.000002s : 12: predicate.virtual_output_eliminate 0.33% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000680 11 57.59% : 0.000392s : 5: func_graph_cloner_run.FuncGraphClonerGraph 42.41% : 0.000288s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.086351 192 0.00% : 0.000004s : 1: ForceFp32Comm 4.02% : 0.003474s : 1: add_attr 4.01% : 0.003461s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.08% : 0.000070s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.09% : 0.000077s : 1: auto_monad 0.07% : 0.000057s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.52% : 0.000452s : 1: bootstrap 0.04% : 0.000033s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000021s : 1: control_data_broadcast_order 0.01% : 0.000012s : 1: convert_after_rewriter 0.04% : 0.000031s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000013s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.03% : 0.000029s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.54% : 0.000462s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.75% : 0.000644s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.02% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000020s : 1: opt.transform.mutable_eliminate 1.81% : 0.001567s : 78: opt.transform.opt_a 0.05% : 0.000041s : 1: opt.transform.opt_after_cconv 0.06% : 0.000050s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000170s : 28: opt.transform.opt_b 0.08% : 0.000072s : 2: opt.transform.opt_trans_graph 0.06% : 0.000053s : 4: opt.transform.symbol_engine_opt 4.00% : 0.003451s : 1: opt_a 0.16% : 0.000138s : 1: opt_after_cconv 1.05% : 0.000910s : 1: opt_after_jit_grad 0.35% : 0.000302s : 1: opt_b 24.80% : 0.021417s : 1: optimize 0.03% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000010s : 1: order_py_execute_after_rewriter 0.04% : 0.000031s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.05% : 0.000044s : 1: pre_auto_parallel 0.04% : 0.000035s : 1: py_interpret_to_execute 0.02% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000051s : 1: remove_dup_value 0.61% : 0.000528s : 1: renormalize.infer 0.45% : 0.000390s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000018s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000054s : 1: rewriter_after_opt_a 0.12% : 0.000100s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.12% : 0.000105s : 1: symbol_engine_optimizer 0.12% : 0.000104s : 1: tuple_transform 55.08% : 0.047563s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:47.563.416 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:47.563.686 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0342922, [21] [bootstrap]: 0.00042509 [type_inference]: 0.00948519 [event_method]: 2.588e-05 [auto_monad]: 0.00023983 [graph_reusing]: 7.75e-06 [inline]: 3.58e-06 [add_attr]: 0.00407069, [1] [add_attr_with_inline]: 0.00405768, [1] [Cycle 1]: 0.00010867, [2] [tag_attr]: 2.644e-05 [meta_addattr_fg_expand]: 6.53e-06 [parallel-infer-symbol]: 4.27998e-06 [pre_auto_parallel]: 4.463e-05 [insert-virtual-dataset]: 2.42001e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 1.99e-06 [pipeline_split]: 1.66998e-06 [optimize]: 0.0182055, [53] [py_interpret_to_execute]: 4.091e-05 [rewriter_before_opt_a]: 0.00010871 [opt_a]: 0.0132276, [2] [Cycle 1]: 0.0121117, [45] [expand_dump_flag]: 3.51999e-06 [switch_simplify]: 4.488e-05 [loop_unroll]: 3.224e-05 [a_1]: 0.00081225 [with_stream_mark]: 2.389e-05 [recompute_prepare]: 1.174e-05 [updatestate_depend_eliminate]: 5.24e-06 [updatestate_assign_eliminate]: 4.05998e-06 [updatestate_loads_eliminate]: 3.93001e-06 [parameter_eliminate]: 2.26e-06 [a_2]: 0.00013496 [accelerated_algorithm]: 1.025e-05 [shard]: 2.32001e-06 [meta_shard_fg_expand]: 2.89999e-06 [shard_inline]: 8.45999e-06 [merge_send_recv]: 1.083e-05 [auto_parallel]: 1.067e-05 [parallel]: 2.198e-05 [flash_sp]: 1.119e-05 [merge_comm]: 4.94e-06 [allreduce_fusion]: 4.33001e-06 [matmul_add_comm_reduction]: 1.195e-05 [allreduce_slice_to_reducescatter]: 7.59988e-07 [virtual_shard_identity]: 1.167e-05 [virtual_dataset]: 8.57e-06 [get_grad_eliminate_]: 8.77999e-06 [virtual_output]: 8.13999e-06 [merge_forward]: 5.37999e-06 [cell_reuse_recompute_pass]: 1.76e-06 [offload_activation]: 1.269e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.048e-05 [merge_recompute_call_nodes]: 1.72999e-06 [before_grad]: 1.428e-05 [set_forward_comm_id_for_comm_node_pass]: 4.92e-06 [meta_fg_expand]: 3.8e-06 [flash_sp_send_recv_attached]: 3.00998e-06 [receive_attached]: 2.63e-06 [after_resolve]: 1.528e-05 [a_after_grad]: 1.303e-05 [renormalize]: 0.0101558 [add_forward_monad_depend]: 1.158e-05 [auto_monad_grad]: 3.31001e-06 [auto_monad_eliminator]: 2.692e-05 [cse]: 4.212e-05 [a_3]: 8.877e-05 [Cycle 2]: 0.00108493, [45] [expand_dump_flag]: 2.77002e-06 [switch_simplify]: 1.13e-05 [loop_unroll]: 8.05e-06 [a_1]: 0.00020587 [with_stream_mark]: 2.273e-05 [recompute_prepare]: 8.87e-06 [updatestate_depend_eliminate]: 6.31e-06 [updatestate_assign_eliminate]: 4.03001e-06 [updatestate_loads_eliminate]: 4.03999e-06 [parameter_eliminate]: 2.56e-06 [a_2]: 0.00012523 [accelerated_algorithm]: 8.70001e-06 [shard]: 2.54999e-06 [meta_shard_fg_expand]: 2.69999e-06 [shard_inline]: 8.42e-06 [merge_send_recv]: 1.097e-05 [auto_parallel]: 1.15e-05 [parallel]: 1.001e-05 [flash_sp]: 4.08999e-06 [merge_comm]: 5.39e-06 [allreduce_fusion]: 4.46002e-06 [matmul_add_comm_reduction]: 1.293e-05 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 1.042e-05 [virtual_dataset]: 9.44e-06 [get_grad_eliminate_]: 7.83001e-06 [virtual_output]: 7.56001e-06 [merge_forward]: 5.64998e-06 [cell_reuse_recompute_pass]: 3.78001e-06 [offload_activation]: 1.312e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.88e-05 [merge_recompute_call_nodes]: 1.74998e-06 [before_grad]: 1.345e-05 [set_forward_comm_id_for_comm_node_pass]: 5.97001e-06 [meta_fg_expand]: 3.8e-06 [flash_sp_send_recv_attached]: 1.77001e-06 [receive_attached]: 2.39999e-06 [after_resolve]: 1.565e-05 [a_after_grad]: 1.258e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.52001e-06 [auto_monad_grad]: 2.86e-06 [auto_monad_eliminator]: 1.501e-05 [cse]: 2.771e-05 [a_3]: 6.181e-05 [py_interpret_to_execute_after_opt_a]: 2.628e-05 [slice_cell_reuse_recomputed_activation]: 6.09001e-06 [rewriter_after_opt_a]: 6.107e-05 [convert_after_rewriter]: 1.244e-05 [order_py_execute_after_rewriter]: 9.74e-06 [mutable_eliminate]: 0.0023445 [opt_b]: 0.00040172, [1] [Cycle 1]: 0.00038854, [7] [b_1]: 0.00023359 [b_2]: 1.312e-05 [updatestate_depend_eliminate]: 1.563e-05 [updatestate_assign_eliminate]: 4.09997e-06 [updatestate_loads_eliminate]: 3.43e-06 [renormalize]: 1.20999e-06 [cse]: 5.204e-05 [optimize_parallel_all_gather_comm]: 3.155e-05 [overlap_param_gather]: 6.86999e-06 [cconv]: 4.661e-05 [loop_unroll]: 0.00079192 [opt_after_cconv]: 0.000194, [1] [Cycle 1]: 0.00018053, [7] [c_1]: 4.54e-05 [parameter_eliminate]: 7.73001e-06 [updatestate_depend_eliminate]: 1.127e-05 [updatestate_assign_eliminate]: 3.78001e-06 [updatestate_loads_eliminate]: 3.95e-06 [cse]: 4.17e-05 [renormalize]: 7.50006e-07 [remove_dup_value]: 2.416e-05 [tuple_transform]: 0.0001227, [1] [Cycle 1]: 0.00011361, [4] [d_1]: 6.806e-05 [none_parameter_eliminate]: 1.92001e-06 [renormalize]: 4.00003e-07 [switch_simplify]: 9.18002e-06 [partial_unused_args_eliminate]: 5.52999e-06 [add_recomputation]: 7.869e-05 [cse_after_recomputation]: 4.011e-05, [1] [Cycle 1]: 3.158e-05, [1] [cse]: 1.964e-05 [environ_conv]: 1.212e-05 [swap_dp_allreduce_reducescatter]: 9.59999e-06 [bias_add_comm_swap]: 6.29999e-06 [label_micro_interleaved_index]: 1.129e-05 [label_fine_grained_interleaved_index]: 5.72001e-06 [merge_cast_opt]: 4.47e-06 [slice_recompute_activation]: 5.61e-06 [micro_interleaved_order_control]: 4.96002e-06 [assign_add_opt]: 4.13001e-06 [ForceFp32Comm]: 3.52997e-06 [remove_cast_before_assign_add]: 3.6e-06 [full_micro_interleaved_order_control]: 4.86002e-06 [reorder_send_recv_between_fp_bp]: 6.57002e-06 [comm_op_add_attrs]: 3.81999e-06 [add_comm_op_reuse_tag]: 3.78999e-06 [interleave_split_concat_branches]: 3.8e-06 [interleave_parallel_branches]: 3.88001e-06 [overlap_opt_shard_in_pipeline]: 4.27e-06 [overlap_opt_shard_grad_in_pipeline]: 5.15001e-06 [control_data_broadcast_order]: 2.348e-05 [grouped_pairwise_exchange_alltoall]: 4.07003e-06 [offloading_packed_experts]: 8.52e-06 [overlap_recompute_and_grad_model_parallel]: 9.15999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.91999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.98999e-06 [overlap_recompute_comm]: 5.07999e-06 [overlap_grad_ring_attention]: 8.06001e-06 [overlap_grad_flash_sp]: 3.155e-05 [begin_end_overlap_inline]: 3.2e-06 [split_matmul_comm_elemetwise]: 5.20001e-06 [split_layernorm_comm]: 4.68999e-06 [handle_group_info]: 3.71001e-06 [symbol_engine_optimizer]: 0.00013609, [1] [Cycle 1]: 0.00012774, [6] [build]: 6.11998e-06 [elim_shapecalc]: 1.815e-05 [elim_not_effective]: 2.092e-05 [opt_reshape]: 1.13e-05 [fold_const_symbol]: 1.476e-05 [renormalize]: 9.80013e-07 [detach_backward]: 6.41e-06 [pipeline_parallel_scheduler]: 2.46e-06 [auto_monad_reorder]: 2.991e-05 [get_jit_bprop_graph]: 2.23002e-06 [rewriter_after_jit_bprop_graph]: 9.31e-06 [opt_after_jit_grad]: 0.00086296 [validate]: 5.92e-05 Sums bootstrap : 0.000425s : 1.52% type_inference : 0.009485s : 33.82% event_method : 0.000026s : 0.09% auto_monad : 0.000240s : 0.86% graph_reusing : 0.000008s : 0.03% inline : 0.000004s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.09% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000045s : 0.16% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000041s : 0.15% optimize.rewriter_before_opt_a : 0.000109s : 0.39% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000056s : 0.20% optimize.opt_a.loop_unroll : 0.000040s : 0.14% optimize.opt_a.a_1 : 0.001018s : 3.63% optimize.opt_a.with_stream_mark : 0.000047s : 0.17% optimize.opt_a.recompute_prepare : 0.000021s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000012s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.03% optimize.opt_a.parameter_eliminate : 0.000005s : 0.02% optimize.opt_a.a_2 : 0.000260s : 0.93% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.07% optimize.opt_a.shard : 0.000005s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.02% optimize.opt_a.shard_inline : 0.000017s : 0.06% optimize.opt_a.merge_send_recv : 0.000022s : 0.08% optimize.opt_a.auto_parallel : 0.000022s : 0.08% optimize.opt_a.parallel : 0.000032s : 0.11% optimize.opt_a.flash_sp : 0.000015s : 0.05% optimize.opt_a.merge_comm : 0.000010s : 0.04% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.08% optimize.opt_a.virtual_dataset : 0.000018s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.06% optimize.opt_a.virtual_output : 0.000016s : 0.06% optimize.opt_a.merge_forward : 0.000011s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.02% optimize.opt_a.offload_activation : 0.000026s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.14% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000028s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.04% optimize.opt_a.meta_fg_expand : 0.000008s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000031s : 0.11% optimize.opt_a.a_after_grad : 0.000026s : 0.09% optimize.opt_a.renormalize : 0.010156s : 36.21% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.05% optimize.opt_a.auto_monad_grad : 0.000006s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000042s : 0.15% optimize.opt_a.cse : 0.000070s : 0.25% optimize.opt_a.a_3 : 0.000151s : 0.54% optimize.py_interpret_to_execute_after_opt_a : 0.000026s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.02% optimize.rewriter_after_opt_a : 0.000061s : 0.22% optimize.convert_after_rewriter : 0.000012s : 0.04% optimize.order_py_execute_after_rewriter : 0.000010s : 0.03% optimize.mutable_eliminate : 0.002345s : 8.36% optimize.opt_b.b_1 : 0.000234s : 0.83% optimize.opt_b.b_2 : 0.000013s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000016s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000052s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000032s : 0.11% optimize.overlap_param_gather : 0.000007s : 0.02% optimize.cconv : 0.000047s : 0.17% optimize.loop_unroll : 0.000792s : 2.82% optimize.opt_after_cconv.c_1 : 0.000045s : 0.16% optimize.opt_after_cconv.parameter_eliminate : 0.000008s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000011s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000042s : 0.15% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000024s : 0.09% optimize.tuple_transform.d_1 : 0.000068s : 0.24% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000006s : 0.02% optimize.add_recomputation : 0.000079s : 0.28% optimize.cse_after_recomputation.cse : 0.000020s : 0.07% optimize.environ_conv : 0.000012s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.03% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000011s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000006s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000007s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.02% optimize.control_data_broadcast_order : 0.000023s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000009s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000008s : 0.03% optimize.overlap_grad_flash_sp : 0.000032s : 0.11% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000005s : 0.02% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000006s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000018s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000021s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000006s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000030s : 0.11% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000009s : 0.03% opt_after_jit_grad : 0.000863s : 3.08% validate : 0.000059s : 0.21% Time group info: ------[substitution.] 0.000265 38 13.05% : 0.000035s : 3: substitution.cast_eliminate 1.07% : 0.000003s : 3: substitution.elim_not_effective 0.83% : 0.000002s : 3: substitution.fold_const_symbol 2.94% : 0.000008s : 5: substitution.graph_param_transform 68.33% : 0.000181s : 4: substitution.inline 2.15% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.58% : 0.000007s : 6: substitution.remove_not_recompute_node 3.00% : 0.000008s : 4: substitution.replace_old_param 6.07% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.009402 2 76.00% : 0.007146s : 1: type_inference.infer 24.00% : 0.002256s : 1: type_inference.specialize ------[replace.] 0.000071 8 60.84% : 0.000043s : 4: replace.inline 39.16% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000193 8 92.70% : 0.000179s : 4: match.inline 7.30% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000302 1596 1.14% : 0.000003s : 17: predicate.accumulaten_eliminater 1.05% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.58% : 0.000002s : 10: predicate.addn_check_dump 0.86% : 0.000003s : 17: predicate.addn_zero_filter 0.89% : 0.000003s : 17: predicate.adjust_all_reduce_mul_add 2.24% : 0.000007s : 27: predicate.arithmetic_simplify 1.08% : 0.000003s : 17: predicate.cast_eliminate 0.54% : 0.000002s : 10: predicate.check_bprop_eliminate 0.53% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000001s : 5: predicate.const_output_eliminate 0.59% : 0.000002s : 10: predicate.depend_value_elim 1.05% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.99% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.87% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.04% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 5: predicate.elim_not_effective 0.54% : 0.000002s : 5: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000004s : 22: predicate.environ_add_const_eliminate 1.18% : 0.000004s : 22: predicate.environ_get_add_eliminate 1.19% : 0.000004s : 22: predicate.environ_get_depend_swap 1.68% : 0.000005s : 32: predicate.environ_get_eliminate 1.15% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.26% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.08% : 0.000006s : 25: predicate.float_depend_g_call 0.52% : 0.000002s : 10: predicate.float_environ_get_switch 0.70% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 5: predicate.fold_const_symbol 0.79% : 0.000002s : 10: predicate.get_grad_eliminate 0.18% : 0.000001s : 5: predicate.graph_param_transform 0.57% : 0.000002s : 10: predicate.incorporate_call 0.43% : 0.000001s : 10: predicate.incorporate_call_switch 5.79% : 0.000017s : 72: predicate.inline 0.78% : 0.000002s : 10: predicate.inline_without_move 0.26% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.02% : 0.000003s : 10: predicate.less_batch_normalization 1.84% : 0.000006s : 31: predicate.list_to_tuple_eliminator_ 2.59% : 0.000008s : 48: predicate.load_eliminater 1.77% : 0.000005s : 5: predicate.loop_unroll_after_grad 1.72% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.82% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.53% : 0.000002s : 10: predicate.merge_addn 0.79% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.74% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.89% : 0.000003s : 17: predicate.minmaximum_grad 2.04% : 0.000006s : 5: predicate.mutable_eliminate 0.42% : 0.000001s : 5: predicate.opt_reshape 0.36% : 0.000001s : 5: predicate.parallel_virtual_node 1.59% : 0.000005s : 25: predicate.partial_defer_inline 1.49% : 0.000005s : 26: predicate.partial_eliminate 1.07% : 0.000003s : 17: predicate.print_const_string_wrapper 0.60% : 0.000002s : 10: predicate.reduce_all_const_elim 1.18% : 0.000004s : 17: predicate.reduce_eliminate 2.42% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 10: predicate.remove_not_recompute_node 1.23% : 0.000004s : 31: predicate.replace_applicator 0.49% : 0.000001s : 10: predicate.replace_old_param 0.42% : 0.000001s : 5: predicate.reset_defer_inline 0.90% : 0.000003s : 17: predicate.reshape_eliminate 0.69% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 5: predicate.row_tensor_eliminate 0.89% : 0.000003s : 10: predicate.same_eliminate 0.42% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.84% : 0.000003s : 10: predicate.shard_identity_eliminate 0.70% : 0.000002s : 10: predicate.special_op_eliminate 0.62% : 0.000002s : 10: predicate.specialize_transform 1.01% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.33% : 0.000004s : 25: predicate.switch_defer_inline 1.89% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.00% : 0.000012s : 76: predicate.switch_simplify 0.93% : 0.000003s : 17: predicate.tile_eliminate 0.88% : 0.000003s : 17: predicate.transpose_eliminate 1.88% : 0.000006s : 27: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000005s : 27: predicate.tuple_list_get_item_depend_reorder 3.47% : 0.000010s : 41: predicate.tuple_list_get_item_eliminator 1.58% : 0.000005s : 27: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.72% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.26% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 2.88% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.49% : 0.000001s : 5: predicate.value_based_eliminate 0.66% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.61% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000736 11 45.29% : 0.000333s : 5: func_graph_cloner_run.FuncGraphClonerGraph 54.71% : 0.000403s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.068565 192 0.01% : 0.000008s : 1: ForceFp32Comm 5.96% : 0.004083s : 1: add_attr 5.92% : 0.004062s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.12% : 0.000083s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.41% : 0.000281s : 1: auto_monad 0.06% : 0.000039s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.68% : 0.000469s : 1: bootstrap 0.07% : 0.000050s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.04% : 0.000028s : 1: control_data_broadcast_order 0.02% : 0.000016s : 1: convert_after_rewriter 0.06% : 0.000043s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.05% : 0.000034s : 1: detach_backward 0.02% : 0.000015s : 1: environ_conv 0.06% : 0.000039s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000016s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000007s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.02% : 0.000015s : 1: label_micro_interleaved_index 1.17% : 0.000804s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 3.44% : 0.002359s : 1: mutable_eliminate 0.02% : 0.000011s : 1: offloading_packed_experts 0.05% : 0.000031s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000040s : 1: opt.transform.mutable_eliminate 2.31% : 0.001582s : 78: opt.transform.opt_a 0.06% : 0.000043s : 1: opt.transform.opt_after_cconv 0.06% : 0.000044s : 1: opt.transform.opt_after_jit_grad 0.23% : 0.000159s : 28: opt.transform.opt_b 0.11% : 0.000074s : 2: opt.transform.opt_trans_graph 0.09% : 0.000060s : 4: opt.transform.symbol_engine_opt 19.30% : 0.013231s : 1: opt_a 0.29% : 0.000199s : 1: opt_after_cconv 1.28% : 0.000877s : 1: opt_after_jit_grad 0.59% : 0.000407s : 1: opt_b 27.22% : 0.018666s : 1: optimize 0.05% : 0.000035s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000013s : 1: order_py_execute_after_rewriter 0.05% : 0.000035s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000008s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000010s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000009s : 1: overlap_recompute_comm 0.02% : 0.000012s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000009s : 1: partial_unused_args_eliminate 0.02% : 0.000011s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.08% : 0.000053s : 1: pre_auto_parallel 0.07% : 0.000046s : 1: py_interpret_to_execute 0.05% : 0.000031s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.04% : 0.000028s : 1: remove_dup_value 14.00% : 0.009597s : 1: renormalize.infer 0.79% : 0.000540s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000016s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000067s : 1: rewriter_after_opt_a 0.16% : 0.000113s : 1: rewriter_before_opt_a 0.01% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.02% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.20% : 0.000140s : 1: symbol_engine_optimizer 0.18% : 0.000126s : 1: tuple_transform 13.92% : 0.009547s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:48.222.372 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0486118, [21] [bootstrap]: 0.0205922 [type_inference]: 0.00762238 [event_method]: 2.246e-05 [auto_monad]: 7.069e-05 [graph_reusing]: 6.59999e-06 [inline]: 3.08998e-06 [add_attr]: 0.0120308, [1] [add_attr_with_inline]: 0.0120058, [1] [Cycle 1]: 0.00801893, [2] [tag_attr]: 0.00793001 [meta_addattr_fg_expand]: 1.021e-05 [parallel-infer-symbol]: 5.29e-06 [pre_auto_parallel]: 5.378e-05 [insert-virtual-dataset]: 2.84999e-06 [parallel-infer-symbol-second]: 9.30013e-07 [dataset_repeat_opt]: 2.61e-06 [pipeline_split]: 2.12001e-06 [optimize]: 0.00703886, [53] [py_interpret_to_execute]: 4.158e-05 [rewriter_before_opt_a]: 0.00010716 [opt_a]: 0.00389629, [2] [Cycle 1]: 0.00293058, [45] [expand_dump_flag]: 3.37997e-06 [switch_simplify]: 4.66e-05 [loop_unroll]: 3.151e-05 [a_1]: 0.00085065 [with_stream_mark]: 2.904e-05 [recompute_prepare]: 1.723e-05 [updatestate_depend_eliminate]: 5.69e-06 [updatestate_assign_eliminate]: 4.08001e-06 [updatestate_loads_eliminate]: 3.69002e-06 [parameter_eliminate]: 2.24001e-06 [a_2]: 0.00010795 [accelerated_algorithm]: 9.64e-06 [shard]: 2.63e-06 [meta_shard_fg_expand]: 3.18e-06 [shard_inline]: 8.97999e-06 [merge_send_recv]: 1.146e-05 [auto_parallel]: 1.133e-05 [parallel]: 2.298e-05 [flash_sp]: 1.174e-05 [merge_comm]: 5.44e-06 [allreduce_fusion]: 4.22e-06 [matmul_add_comm_reduction]: 1.16e-05 [allreduce_slice_to_reducescatter]: 9.5999e-07 [virtual_shard_identity]: 1.209e-05 [virtual_dataset]: 8.28001e-06 [get_grad_eliminate_]: 7.93001e-06 [virtual_output]: 8.15e-06 [merge_forward]: 5.40001e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 1.202e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.046e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.452e-05 [set_forward_comm_id_for_comm_node_pass]: 4.85999e-06 [meta_fg_expand]: 4.13001e-06 [flash_sp_send_recv_attached]: 2.98e-06 [receive_attached]: 2.45002e-06 [after_resolve]: 1.546e-05 [a_after_grad]: 1.248e-05 [renormalize]: 0.00111293 [add_forward_monad_depend]: 1.083e-05 [auto_monad_grad]: 2.64999e-06 [auto_monad_eliminator]: 2.474e-05 [cse]: 4.168e-05 [a_3]: 7.469e-05 [Cycle 2]: 0.00094787, [45] [expand_dump_flag]: 2.59999e-06 [switch_simplify]: 1.087e-05 [loop_unroll]: 7.83001e-06 [a_1]: 0.00020074 [with_stream_mark]: 2.108e-05 [recompute_prepare]: 8.91002e-06 [updatestate_depend_eliminate]: 5.20999e-06 [updatestate_assign_eliminate]: 3.65e-06 [updatestate_loads_eliminate]: 3.43999e-06 [parameter_eliminate]: 2.53003e-06 [a_2]: 9.655e-05 [accelerated_algorithm]: 9.31e-06 [shard]: 2.71999e-06 [meta_shard_fg_expand]: 3.19001e-06 [shard_inline]: 7.69002e-06 [merge_send_recv]: 1.07e-05 [auto_parallel]: 1.083e-05 [parallel]: 1.01e-05 [flash_sp]: 4.03999e-06 [merge_comm]: 4.82e-06 [allreduce_fusion]: 4.42998e-06 [matmul_add_comm_reduction]: 1.101e-05 [allreduce_slice_to_reducescatter]: 1.07e-06 [virtual_shard_identity]: 1.009e-05 [virtual_dataset]: 7.49002e-06 [get_grad_eliminate_]: 7.99002e-06 [virtual_output]: 8.27e-06 [merge_forward]: 5.84e-06 [cell_reuse_recompute_pass]: 3.35998e-06 [offload_activation]: 1.194e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.918e-05 [merge_recompute_call_nodes]: 1.15999e-06 [before_grad]: 1.331e-05 [set_forward_comm_id_for_comm_node_pass]: 5.20001e-06 [meta_fg_expand]: 4.03999e-06 [flash_sp_send_recv_attached]: 1.97999e-06 [receive_attached]: 2.58e-06 [after_resolve]: 1.54e-05 [a_after_grad]: 1.557e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.82002e-06 [auto_monad_grad]: 2.37999e-06 [auto_monad_eliminator]: 1.775e-05 [cse]: 3.066e-05 [a_3]: 4.912e-05 [py_interpret_to_execute_after_opt_a]: 2.29e-05 [slice_cell_reuse_recomputed_activation]: 2.39999e-06 [rewriter_after_opt_a]: 6.302e-05 [convert_after_rewriter]: 9.82999e-06 [order_py_execute_after_rewriter]: 7.18e-06 [mutable_eliminate]: 0.00090658 [opt_b]: 0.00029892, [1] [Cycle 1]: 0.00028734, [7] [b_1]: 0.00017057 [b_2]: 1.092e-05 [updatestate_depend_eliminate]: 1.185e-05 [updatestate_assign_eliminate]: 3.58e-06 [updatestate_loads_eliminate]: 3.51001e-06 [renormalize]: 7.39994e-07 [cse]: 4.614e-05 [optimize_parallel_all_gather_comm]: 2.438e-05 [overlap_param_gather]: 2.01e-06 [cconv]: 4.147e-05 [loop_unroll]: 0.00071083 [opt_after_cconv]: 0.00015757, [1] [Cycle 1]: 0.00014853, [7] [c_1]: 4.317e-05 [parameter_eliminate]: 6.51999e-06 [updatestate_depend_eliminate]: 1.162e-05 [updatestate_assign_eliminate]: 3.85998e-06 [updatestate_loads_eliminate]: 3.44001e-06 [cse]: 4.048e-05 [renormalize]: 8.09989e-07 [remove_dup_value]: 2.168e-05 [tuple_transform]: 0.00010647, [1] [Cycle 1]: 0.00010115, [4] [d_1]: 6.769e-05 [none_parameter_eliminate]: 2.00002e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 9.39998e-06 [partial_unused_args_eliminate]: 2.36998e-06 [add_recomputation]: 7.448e-05 [cse_after_recomputation]: 3.227e-05, [1] [Cycle 1]: 2.592e-05, [1] [cse]: 1.902e-05 [environ_conv]: 8.18001e-06 [swap_dp_allreduce_reducescatter]: 6.47001e-06 [bias_add_comm_swap]: 3.86001e-06 [label_micro_interleaved_index]: 6.98e-06 [label_fine_grained_interleaved_index]: 2.76999e-06 [merge_cast_opt]: 1.61002e-06 [slice_recompute_activation]: 2.78e-06 [micro_interleaved_order_control]: 3.01999e-06 [assign_add_opt]: 1.37999e-06 [ForceFp32Comm]: 9.70002e-07 [remove_cast_before_assign_add]: 1.21002e-06 [full_micro_interleaved_order_control]: 2.12999e-06 [reorder_send_recv_between_fp_bp]: 2.74999e-06 [comm_op_add_attrs]: 1.47001e-06 [add_comm_op_reuse_tag]: 1.34998e-06 [interleave_split_concat_branches]: 1.24e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.32999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.04e-06 [control_data_broadcast_order]: 1.925e-05 [grouped_pairwise_exchange_alltoall]: 1.70001e-06 [offloading_packed_experts]: 5.57999e-06 [overlap_recompute_and_grad_model_parallel]: 5.90002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40001e-06 [overlap_recompute_comm]: 2.86999e-06 [overlap_grad_ring_attention]: 5.30001e-06 [overlap_grad_flash_sp]: 2.578e-05 [begin_end_overlap_inline]: 8.80013e-07 [split_matmul_comm_elemetwise]: 2.94001e-06 [split_layernorm_comm]: 1.61002e-06 [handle_group_info]: 1.30999e-06 [symbol_engine_optimizer]: 0.00010246, [1] [Cycle 1]: 9.755e-05, [6] [build]: 5.42001e-06 [elim_shapecalc]: 1.641e-05 [elim_not_effective]: 1.855e-05 [opt_reshape]: 9.52001e-06 [fold_const_symbol]: 1.263e-05 [renormalize]: 2.30008e-07 [detach_backward]: 2.37001e-06 [pipeline_parallel_scheduler]: 1.74e-06 [auto_monad_reorder]: 2.43e-05 [get_jit_bprop_graph]: 2.29999e-06 [rewriter_after_jit_bprop_graph]: 8.99e-06 [opt_after_jit_grad]: 0.00084054 [validate]: 6.554e-05 Sums bootstrap : 0.020592s : 47.62% type_inference : 0.007622s : 17.63% event_method : 0.000022s : 0.05% auto_monad : 0.000071s : 0.16% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.007930s : 18.34% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000010s : 0.02% parallel-infer-symbol : 0.000005s : 0.01% pre_auto_parallel : 0.000054s : 0.12% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000042s : 0.10% optimize.rewriter_before_opt_a : 0.000107s : 0.25% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000057s : 0.13% optimize.opt_a.loop_unroll : 0.000039s : 0.09% optimize.opt_a.a_1 : 0.001051s : 2.43% optimize.opt_a.with_stream_mark : 0.000050s : 0.12% optimize.opt_a.recompute_prepare : 0.000026s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000204s : 0.47% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.04% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.01% optimize.opt_a.shard_inline : 0.000017s : 0.04% optimize.opt_a.merge_send_recv : 0.000022s : 0.05% optimize.opt_a.auto_parallel : 0.000022s : 0.05% optimize.opt_a.parallel : 0.000033s : 0.08% optimize.opt_a.flash_sp : 0.000016s : 0.04% optimize.opt_a.merge_comm : 0.000010s : 0.02% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.05% optimize.opt_a.virtual_dataset : 0.000016s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.04% optimize.opt_a.virtual_output : 0.000016s : 0.04% optimize.opt_a.merge_forward : 0.000011s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000024s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000040s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000028s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.02% optimize.opt_a.meta_fg_expand : 0.000008s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000031s : 0.07% optimize.opt_a.a_after_grad : 0.000028s : 0.06% optimize.opt_a.renormalize : 0.001113s : 2.57% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.03% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000042s : 0.10% optimize.opt_a.cse : 0.000072s : 0.17% optimize.opt_a.a_3 : 0.000124s : 0.29% optimize.py_interpret_to_execute_after_opt_a : 0.000023s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000063s : 0.15% optimize.convert_after_rewriter : 0.000010s : 0.02% optimize.order_py_execute_after_rewriter : 0.000007s : 0.02% optimize.mutable_eliminate : 0.000907s : 2.10% optimize.opt_b.b_1 : 0.000171s : 0.39% optimize.opt_b.b_2 : 0.000011s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000046s : 0.11% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000041s : 0.10% optimize.loop_unroll : 0.000711s : 1.64% optimize.opt_after_cconv.c_1 : 0.000043s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000012s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000040s : 0.09% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000022s : 0.05% optimize.tuple_transform.d_1 : 0.000068s : 0.16% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000074s : 0.17% optimize.cse_after_recomputation.cse : 0.000019s : 0.04% optimize.environ_conv : 0.000008s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000004s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000019s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000026s : 0.06% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000024s : 0.06% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000009s : 0.02% opt_after_jit_grad : 0.000841s : 1.94% validate : 0.000066s : 0.15% Time group info: ------[substitution.] 0.000289 38 11.33% : 0.000033s : 3: substitution.cast_eliminate 0.92% : 0.000003s : 3: substitution.elim_not_effective 0.65% : 0.000002s : 3: substitution.fold_const_symbol 2.84% : 0.000008s : 5: substitution.graph_param_transform 70.05% : 0.000203s : 4: substitution.inline 2.00% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.98% : 0.000009s : 6: substitution.remove_not_recompute_node 2.95% : 0.000009s : 4: substitution.replace_old_param 6.28% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007538 2 88.39% : 0.006663s : 1: type_inference.infer 11.61% : 0.000875s : 1: type_inference.specialize ------[replace.] 0.000074 8 60.86% : 0.000045s : 4: replace.inline 39.14% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000216 8 92.51% : 0.000199s : 4: match.inline 7.49% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000295 1596 1.04% : 0.000003s : 17: predicate.accumulaten_eliminater 1.37% : 0.000004s : 5: predicate.ad_related_special_op_eliminate 0.58% : 0.000002s : 10: predicate.addn_check_dump 1.20% : 0.000004s : 17: predicate.addn_zero_filter 0.78% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.08% : 0.000006s : 27: predicate.arithmetic_simplify 1.24% : 0.000004s : 17: predicate.cast_eliminate 0.70% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000002s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.75% : 0.000002s : 10: predicate.depend_value_elim 0.96% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.30% : 0.000004s : 17: predicate.dict_get_item_eliminator 0.90% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.13% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 5: predicate.elim_not_effective 0.46% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000004s : 22: predicate.environ_add_const_eliminate 1.06% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.09% : 0.000003s : 22: predicate.environ_get_depend_swap 1.83% : 0.000005s : 32: predicate.environ_get_eliminate 1.04% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.29% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.09% : 0.000006s : 25: predicate.float_depend_g_call 0.50% : 0.000001s : 10: predicate.float_environ_get_switch 0.80% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.60% : 0.000002s : 10: predicate.get_grad_eliminate 0.17% : 0.000001s : 5: predicate.graph_param_transform 0.52% : 0.000002s : 10: predicate.incorporate_call 0.43% : 0.000001s : 10: predicate.incorporate_call_switch 5.90% : 0.000017s : 72: predicate.inline 0.82% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.93% : 0.000003s : 10: predicate.less_batch_normalization 1.68% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.56% : 0.000008s : 48: predicate.load_eliminater 1.14% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.84% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.75% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.59% : 0.000002s : 10: predicate.merge_addn 0.60% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 17: predicate.minmaximum_grad 1.55% : 0.000005s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.36% : 0.000001s : 5: predicate.parallel_virtual_node 1.93% : 0.000006s : 25: predicate.partial_defer_inline 1.46% : 0.000004s : 26: predicate.partial_eliminate 0.87% : 0.000003s : 17: predicate.print_const_string_wrapper 0.87% : 0.000003s : 10: predicate.reduce_all_const_elim 1.36% : 0.000004s : 17: predicate.reduce_eliminate 2.47% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 10: predicate.remove_not_recompute_node 1.21% : 0.000004s : 31: predicate.replace_applicator 0.49% : 0.000001s : 10: predicate.replace_old_param 0.34% : 0.000001s : 5: predicate.reset_defer_inline 0.89% : 0.000003s : 17: predicate.reshape_eliminate 0.67% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 5: predicate.row_tensor_eliminate 0.70% : 0.000002s : 10: predicate.same_eliminate 0.43% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.66% : 0.000002s : 10: predicate.shard_identity_eliminate 0.71% : 0.000002s : 10: predicate.special_op_eliminate 0.71% : 0.000002s : 10: predicate.specialize_transform 1.08% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.33% : 0.000004s : 25: predicate.switch_defer_inline 1.90% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.34% : 0.000013s : 76: predicate.switch_simplify 0.89% : 0.000003s : 17: predicate.tile_eliminate 1.00% : 0.000003s : 17: predicate.transpose_eliminate 1.67% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.36% : 0.000010s : 41: predicate.tuple_list_get_item_eliminator 1.50% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.47% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.87% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.38% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.02% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 5: predicate.value_based_eliminate 0.66% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.59% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000718 11 51.79% : 0.000372s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.21% : 0.000346s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.070644 192 0.01% : 0.000004s : 1: ForceFp32Comm 17.04% : 0.012040s : 1: add_attr 17.00% : 0.012011s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.11% : 0.000080s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.11% : 0.000078s : 1: auto_monad 0.04% : 0.000030s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 29.21% : 0.020637s : 1: bootstrap 0.07% : 0.000046s : 1: cconv 0.01% : 0.000005s : 1: comm_op_add_attrs 0.03% : 0.000023s : 1: control_data_broadcast_order 0.02% : 0.000014s : 1: convert_after_rewriter 0.05% : 0.000036s : 1: cse_after_recomputation 0.01% : 0.000006s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000012s : 1: environ_conv 0.04% : 0.000030s : 1: event_method 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 1.03% : 0.000726s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 1.31% : 0.000926s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.04% : 0.000027s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000034s : 1: opt.transform.mutable_eliminate 2.30% : 0.001622s : 78: opt.transform.opt_a 0.06% : 0.000041s : 1: opt.transform.opt_after_cconv 0.09% : 0.000060s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000143s : 28: opt.transform.opt_b 0.10% : 0.000074s : 2: opt.transform.opt_trans_graph 0.07% : 0.000053s : 4: opt.transform.symbol_engine_opt 5.52% : 0.003901s : 1: opt_a 0.23% : 0.000162s : 1: opt_after_cconv 1.22% : 0.000860s : 1: opt_after_jit_grad 0.43% : 0.000303s : 1: opt_b 9.97% : 0.007045s : 1: optimize 0.04% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000011s : 1: order_py_execute_after_rewriter 0.04% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000006s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000058s : 1: pre_auto_parallel 0.07% : 0.000046s : 1: py_interpret_to_execute 0.04% : 0.000027s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000005s : 1: remove_cast_before_assign_add 0.04% : 0.000025s : 1: remove_dup_value 0.85% : 0.000603s : 1: renormalize.infer 0.70% : 0.000495s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000070s : 1: rewriter_after_opt_a 0.16% : 0.000112s : 1: rewriter_before_opt_a 0.01% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000105s : 1: symbol_engine_optimizer 0.16% : 0.000110s : 1: tuple_transform 10.83% : 0.007651s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:48.684.198 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:48.684.458 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0211895, [21] [bootstrap]: 0.00041986 [type_inference]: 0.00675832 [event_method]: 2.204e-05 [auto_monad]: 7.016e-05 [graph_reusing]: 6.48e-06 [inline]: 2.50997e-06 [add_attr]: 0.00413301, [1] [add_attr_with_inline]: 0.00411869, [1] [Cycle 1]: 0.00010829, [2] [tag_attr]: 2.812e-05 [meta_addattr_fg_expand]: 6.76999e-06 [parallel-infer-symbol]: 3.56999e-06 [pre_auto_parallel]: 4.602e-05 [insert-virtual-dataset]: 2.78e-06 [parallel-infer-symbol-second]: 7.40023e-07 [dataset_repeat_opt]: 2.14999e-06 [pipeline_split]: 1.83002e-06 [optimize]: 0.00798856, [53] [py_interpret_to_execute]: 4.012e-05 [rewriter_before_opt_a]: 0.00010835 [opt_a]: 0.00442988, [2] [Cycle 1]: 0.003195, [45] [expand_dump_flag]: 3.31001e-06 [switch_simplify]: 4.631e-05 [loop_unroll]: 3.241e-05 [a_1]: 0.0008419 [with_stream_mark]: 2.853e-05 [recompute_prepare]: 1.545e-05 [updatestate_depend_eliminate]: 5.23002e-06 [updatestate_assign_eliminate]: 4.29997e-06 [updatestate_loads_eliminate]: 3.78001e-06 [parameter_eliminate]: 2.53e-06 [a_2]: 0.00013627 [accelerated_algorithm]: 1.119e-05 [shard]: 2.58e-06 [meta_shard_fg_expand]: 2.48e-06 [shard_inline]: 9.39e-06 [merge_send_recv]: 1.176e-05 [auto_parallel]: 1.11e-05 [parallel]: 2.124e-05 [flash_sp]: 1.267e-05 [merge_comm]: 5.20001e-06 [allreduce_fusion]: 4.34002e-06 [matmul_add_comm_reduction]: 1.236e-05 [allreduce_slice_to_reducescatter]: 8.99978e-07 [virtual_shard_identity]: 1.315e-05 [virtual_dataset]: 8.20999e-06 [get_grad_eliminate_]: 8.3e-06 [virtual_output]: 8.42e-06 [merge_forward]: 4.82e-06 [cell_reuse_recompute_pass]: 2.34001e-06 [offload_activation]: 1.25e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.329e-05 [merge_recompute_call_nodes]: 1.92001e-06 [before_grad]: 1.544e-05 [set_forward_comm_id_for_comm_node_pass]: 5.86e-06 [meta_fg_expand]: 4.53001e-06 [flash_sp_send_recv_attached]: 3.2e-06 [receive_attached]: 2.31998e-06 [after_resolve]: 1.535e-05 [a_after_grad]: 1.252e-05 [renormalize]: 0.00116547 [add_forward_monad_depend]: 1.071e-05 [auto_monad_grad]: 3.40998e-06 [auto_monad_eliminator]: 2.647e-05 [cse]: 4.386e-05 [a_3]: 9.189e-05 [Cycle 2]: 0.00121315, [45] [expand_dump_flag]: 2.36e-06 [switch_simplify]: 1.152e-05 [loop_unroll]: 8.72e-06 [a_1]: 0.00021583 [with_stream_mark]: 2.559e-05 [recompute_prepare]: 1.136e-05 [updatestate_depend_eliminate]: 6.04999e-06 [updatestate_assign_eliminate]: 3.53e-06 [updatestate_loads_eliminate]: 3.98999e-06 [parameter_eliminate]: 2.49001e-06 [a_2]: 0.0001274 [accelerated_algorithm]: 9.29e-06 [shard]: 2.89001e-06 [meta_shard_fg_expand]: 2.32999e-06 [shard_inline]: 8.97e-06 [merge_send_recv]: 1.216e-05 [auto_parallel]: 1.298e-05 [parallel]: 1.19e-05 [flash_sp]: 4.79998e-06 [merge_comm]: 6.38e-06 [allreduce_fusion]: 5.10001e-06 [matmul_add_comm_reduction]: 1.265e-05 [allreduce_slice_to_reducescatter]: 1.25999e-06 [virtual_shard_identity]: 1.365e-05 [virtual_dataset]: 8.17e-06 [get_grad_eliminate_]: 8.39002e-06 [virtual_output]: 8.25e-06 [merge_forward]: 5.96e-06 [cell_reuse_recompute_pass]: 3.16001e-06 [offload_activation]: 1.289e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.525e-05 [merge_recompute_call_nodes]: 1.74998e-06 [before_grad]: 1.716e-05 [set_forward_comm_id_for_comm_node_pass]: 7.17997e-06 [meta_fg_expand]: 4.05e-06 [flash_sp_send_recv_attached]: 2.06e-06 [receive_attached]: 2.29001e-06 [after_resolve]: 1.865e-05 [a_after_grad]: 1.205e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 4.90999e-06 [auto_monad_grad]: 2.97002e-06 [auto_monad_eliminator]: 2.076e-05 [cse]: 3.504e-05 [a_3]: 7.258e-05 [py_interpret_to_execute_after_opt_a]: 3.102e-05 [slice_cell_reuse_recomputed_activation]: 5.60001e-06 [rewriter_after_opt_a]: 6.451e-05 [convert_after_rewriter]: 1.414e-05 [order_py_execute_after_rewriter]: 9.50001e-06 [mutable_eliminate]: 0.00091013 [opt_b]: 0.00038875, [1] [Cycle 1]: 0.00037236, [7] [b_1]: 0.00021819 [b_2]: 1.169e-05 [updatestate_depend_eliminate]: 1.469e-05 [updatestate_assign_eliminate]: 4.02e-06 [updatestate_loads_eliminate]: 4.3e-06 [renormalize]: 1.25999e-06 [cse]: 5.096e-05 [optimize_parallel_all_gather_comm]: 7.962e-05 [overlap_param_gather]: 5.73002e-06 [cconv]: 4.722e-05 [loop_unroll]: 0.00077586 [opt_after_cconv]: 0.00019115, [1] [Cycle 1]: 0.00017828, [7] [c_1]: 4.575e-05 [parameter_eliminate]: 7.11999e-06 [updatestate_depend_eliminate]: 1.245e-05 [updatestate_assign_eliminate]: 4.42e-06 [updatestate_loads_eliminate]: 3.59002e-06 [cse]: 4.218e-05 [renormalize]: 7.60017e-07 [remove_dup_value]: 2.106e-05 [tuple_transform]: 0.00012234, [1] [Cycle 1]: 0.00011423, [4] [d_1]: 6.649e-05 [none_parameter_eliminate]: 2.99999e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 9.73002e-06 [partial_unused_args_eliminate]: 5.19e-06 [add_recomputation]: 7.659e-05 [cse_after_recomputation]: 4.043e-05, [1] [Cycle 1]: 3.2e-05, [1] [cse]: 2.1e-05 [environ_conv]: 1.191e-05 [swap_dp_allreduce_reducescatter]: 1.016e-05 [bias_add_comm_swap]: 6.39001e-06 [label_micro_interleaved_index]: 1.227e-05 [label_fine_grained_interleaved_index]: 5.30001e-06 [merge_cast_opt]: 4.16001e-06 [slice_recompute_activation]: 4.74e-06 [micro_interleaved_order_control]: 4.70999e-06 [assign_add_opt]: 4.03001e-06 [ForceFp32Comm]: 3.56999e-06 [remove_cast_before_assign_add]: 3.85e-06 [full_micro_interleaved_order_control]: 4.70999e-06 [reorder_send_recv_between_fp_bp]: 5.48002e-06 [comm_op_add_attrs]: 4.17e-06 [add_comm_op_reuse_tag]: 3.24001e-06 [interleave_split_concat_branches]: 3.79002e-06 [interleave_parallel_branches]: 3.82002e-06 [overlap_opt_shard_in_pipeline]: 4.33001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.58001e-06 [control_data_broadcast_order]: 2.266e-05 [grouped_pairwise_exchange_alltoall]: 4.21001e-06 [offloading_packed_experts]: 9.49e-06 [overlap_recompute_and_grad_model_parallel]: 8.99e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.9e-06 [overlap_recompute_allgather_and_fa_grad]: 4.55999e-06 [overlap_recompute_comm]: 4.43999e-06 [overlap_grad_ring_attention]: 8.52e-06 [overlap_grad_flash_sp]: 3.274e-05 [begin_end_overlap_inline]: 3.25e-06 [split_matmul_comm_elemetwise]: 5.25999e-06 [split_layernorm_comm]: 4.74e-06 [handle_group_info]: 3.41999e-06 [symbol_engine_optimizer]: 0.00013862, [1] [Cycle 1]: 0.00012868, [6] [build]: 6.86999e-06 [elim_shapecalc]: 1.716e-05 [elim_not_effective]: 2.078e-05 [opt_reshape]: 1.122e-05 [fold_const_symbol]: 1.391e-05 [renormalize]: 2.19996e-07 [detach_backward]: 7.15003e-06 [pipeline_parallel_scheduler]: 2.53998e-06 [auto_monad_reorder]: 3.174e-05 [get_jit_bprop_graph]: 2.22999e-06 [rewriter_after_jit_bprop_graph]: 9.36002e-06 [opt_after_jit_grad]: 0.00086193 [validate]: 6.103e-05 Sums bootstrap : 0.000420s : 2.83% type_inference : 0.006758s : 45.48% event_method : 0.000022s : 0.15% auto_monad : 0.000070s : 0.47% graph_reusing : 0.000006s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000046s : 0.31% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000040s : 0.27% optimize.rewriter_before_opt_a : 0.000108s : 0.73% optimize.opt_a.expand_dump_flag : 0.000006s : 0.04% optimize.opt_a.switch_simplify : 0.000058s : 0.39% optimize.opt_a.loop_unroll : 0.000041s : 0.28% optimize.opt_a.a_1 : 0.001058s : 7.12% optimize.opt_a.with_stream_mark : 0.000054s : 0.36% optimize.opt_a.recompute_prepare : 0.000027s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.05% optimize.opt_a.parameter_eliminate : 0.000005s : 0.03% optimize.opt_a.a_2 : 0.000264s : 1.77% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.14% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.03% optimize.opt_a.shard_inline : 0.000018s : 0.12% optimize.opt_a.merge_send_recv : 0.000024s : 0.16% optimize.opt_a.auto_parallel : 0.000024s : 0.16% optimize.opt_a.parallel : 0.000033s : 0.22% optimize.opt_a.flash_sp : 0.000017s : 0.12% optimize.opt_a.merge_comm : 0.000012s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000027s : 0.18% optimize.opt_a.virtual_dataset : 0.000016s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.11% optimize.opt_a.virtual_output : 0.000017s : 0.11% optimize.opt_a.merge_forward : 0.000011s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.04% optimize.opt_a.offload_activation : 0.000025s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000049s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.02% optimize.opt_a.before_grad : 0.000033s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000013s : 0.09% optimize.opt_a.meta_fg_expand : 0.000009s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000005s : 0.03% optimize.opt_a.after_resolve : 0.000034s : 0.23% optimize.opt_a.a_after_grad : 0.000025s : 0.17% optimize.opt_a.renormalize : 0.001166s : 7.84% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.11% optimize.opt_a.auto_monad_grad : 0.000006s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000047s : 0.32% optimize.opt_a.cse : 0.000079s : 0.53% optimize.opt_a.a_3 : 0.000164s : 1.11% optimize.py_interpret_to_execute_after_opt_a : 0.000031s : 0.21% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.04% optimize.rewriter_after_opt_a : 0.000065s : 0.43% optimize.convert_after_rewriter : 0.000014s : 0.10% optimize.order_py_execute_after_rewriter : 0.000010s : 0.06% optimize.mutable_eliminate : 0.000910s : 6.12% optimize.opt_b.b_1 : 0.000218s : 1.47% optimize.opt_b.b_2 : 0.000012s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000015s : 0.10% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000051s : 0.34% optimize.optimize_parallel_all_gather_comm : 0.000080s : 0.54% optimize.overlap_param_gather : 0.000006s : 0.04% optimize.cconv : 0.000047s : 0.32% optimize.loop_unroll : 0.000776s : 5.22% optimize.opt_after_cconv.c_1 : 0.000046s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000012s : 0.08% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.cse : 0.000042s : 0.28% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000021s : 0.14% optimize.tuple_transform.d_1 : 0.000066s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.03% optimize.add_recomputation : 0.000077s : 0.52% optimize.cse_after_recomputation.cse : 0.000021s : 0.14% optimize.environ_conv : 0.000012s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.04% optimize.label_micro_interleaved_index : 0.000012s : 0.08% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.03% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.02% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.03% optimize.control_data_broadcast_order : 0.000023s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000009s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000005s : 0.03% optimize.overlap_recompute_comm : 0.000004s : 0.03% optimize.overlap_grad_ring_attention : 0.000009s : 0.06% optimize.overlap_grad_flash_sp : 0.000033s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000005s : 0.03% optimize.handle_group_info : 0.000003s : 0.02% optimize.symbol_engine_optimizer.build : 0.000007s : 0.05% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000021s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000007s : 0.05% pipeline_parallel_scheduler : 0.000003s : 0.02% auto_monad_reorder : 0.000032s : 0.21% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000009s : 0.06% opt_after_jit_grad : 0.000862s : 5.80% validate : 0.000061s : 0.41% Time group info: ------[substitution.] 0.000282 38 12.41% : 0.000035s : 3: substitution.cast_eliminate 1.08% : 0.000003s : 3: substitution.elim_not_effective 0.62% : 0.000002s : 3: substitution.fold_const_symbol 2.84% : 0.000008s : 5: substitution.graph_param_transform 67.81% : 0.000191s : 4: substitution.inline 2.31% : 0.000007s : 6: substitution.j_node_and_user_rematch 3.32% : 0.000009s : 6: substitution.remove_not_recompute_node 3.06% : 0.000009s : 4: substitution.replace_old_param 6.55% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006696 2 88.18% : 0.005904s : 1: type_inference.infer 11.82% : 0.000792s : 1: type_inference.specialize ------[replace.] 0.000079 8 56.25% : 0.000044s : 4: replace.inline 43.75% : 0.000035s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000203 8 92.14% : 0.000187s : 4: match.inline 7.86% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000308 1596 0.87% : 0.000003s : 17: predicate.accumulaten_eliminater 0.99% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.55% : 0.000002s : 10: predicate.addn_check_dump 1.04% : 0.000003s : 17: predicate.addn_zero_filter 0.82% : 0.000003s : 17: predicate.adjust_all_reduce_mul_add 2.25% : 0.000007s : 27: predicate.arithmetic_simplify 0.95% : 0.000003s : 17: predicate.cast_eliminate 0.68% : 0.000002s : 10: predicate.check_bprop_eliminate 0.50% : 0.000002s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.59% : 0.000002s : 10: predicate.depend_value_elim 0.93% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.87% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.39% : 0.000004s : 10: predicate.dumpgradient_eliminate 0.18% : 0.000001s : 5: predicate.elim_not_effective 0.51% : 0.000002s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000004s : 22: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.05% : 0.000003s : 22: predicate.environ_get_depend_swap 1.63% : 0.000005s : 32: predicate.environ_get_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.19% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.14% : 0.000007s : 25: predicate.float_depend_g_call 0.52% : 0.000002s : 10: predicate.float_environ_get_switch 0.77% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.72% : 0.000002s : 10: predicate.get_grad_eliminate 0.17% : 0.000001s : 5: predicate.graph_param_transform 0.62% : 0.000002s : 10: predicate.incorporate_call 0.45% : 0.000001s : 10: predicate.incorporate_call_switch 6.30% : 0.000019s : 72: predicate.inline 0.69% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.85% : 0.000003s : 10: predicate.less_batch_normalization 1.80% : 0.000006s : 31: predicate.list_to_tuple_eliminator_ 2.43% : 0.000007s : 48: predicate.load_eliminater 1.45% : 0.000004s : 5: predicate.loop_unroll_after_grad 1.74% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.86% : 0.000006s : 27: predicate.make_slice_get_slice_eliminator 0.61% : 0.000002s : 10: predicate.merge_addn 0.67% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.61% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.90% : 0.000003s : 17: predicate.minmaximum_grad 1.72% : 0.000005s : 5: predicate.mutable_eliminate 0.44% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.90% : 0.000006s : 25: predicate.partial_defer_inline 1.39% : 0.000004s : 26: predicate.partial_eliminate 0.95% : 0.000003s : 17: predicate.print_const_string_wrapper 0.65% : 0.000002s : 10: predicate.reduce_all_const_elim 1.10% : 0.000003s : 17: predicate.reduce_eliminate 2.46% : 0.000008s : 48: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 10: predicate.remove_not_recompute_node 1.43% : 0.000004s : 31: predicate.replace_applicator 0.48% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 1.08% : 0.000003s : 17: predicate.reshape_eliminate 0.84% : 0.000003s : 10: predicate.row_tensor_add_zeros_like 0.55% : 0.000002s : 5: predicate.row_tensor_eliminate 0.83% : 0.000003s : 10: predicate.same_eliminate 0.36% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.82% : 0.000003s : 10: predicate.shard_identity_eliminate 0.65% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 1.08% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.55% : 0.000005s : 25: predicate.switch_defer_inline 1.92% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.40% : 0.000014s : 76: predicate.switch_simplify 0.91% : 0.000003s : 17: predicate.tile_eliminate 0.89% : 0.000003s : 17: predicate.transpose_eliminate 1.55% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.39% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000005s : 27: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000010s : 41: predicate.tuple_list_get_item_eliminator 1.71% : 0.000005s : 27: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.77% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.30% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 2.83% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 5: predicate.value_based_eliminate 0.56% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.72% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.72% : 0.000002s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000665 11 50.72% : 0.000337s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.28% : 0.000328s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.036378 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.40% : 0.004148s : 1: add_attr 11.33% : 0.004123s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000081s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.22% : 0.000081s : 1: auto_monad 0.11% : 0.000040s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000010s : 1: bias_add_comm_swap 1.28% : 0.000466s : 1: bootstrap 0.14% : 0.000051s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000027s : 1: control_data_broadcast_order 0.05% : 0.000018s : 1: convert_after_rewriter 0.12% : 0.000044s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.10% : 0.000035s : 1: detach_backward 0.04% : 0.000015s : 1: environ_conv 0.09% : 0.000033s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000015s : 1: label_micro_interleaved_index 2.17% : 0.000788s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 2.54% : 0.000923s : 1: mutable_eliminate 0.04% : 0.000013s : 1: offloading_packed_experts 0.09% : 0.000032s : 1: opt.transform.loop_unroll_optimizer 0.10% : 0.000036s : 1: opt.transform.mutable_eliminate 4.56% : 0.001659s : 78: opt.transform.opt_a 0.12% : 0.000044s : 1: opt.transform.opt_after_cconv 0.12% : 0.000045s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000146s : 28: opt.transform.opt_b 0.20% : 0.000074s : 2: opt.transform.opt_trans_graph 0.16% : 0.000057s : 4: opt.transform.symbol_engine_opt 12.19% : 0.004435s : 1: opt_a 0.54% : 0.000196s : 1: opt_after_cconv 2.42% : 0.000880s : 1: opt_after_jit_grad 1.08% : 0.000394s : 1: opt_b 23.21% : 0.008442s : 1: optimize 0.23% : 0.000085s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.10% : 0.000037s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000012s : 1: overlap_grad_ring_attention 0.02% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000007s : 1: overlap_recompute_comm 0.03% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000012s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000054s : 1: pre_auto_parallel 0.12% : 0.000044s : 1: py_interpret_to_execute 0.10% : 0.000036s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000008s : 1: remove_cast_before_assign_add 0.07% : 0.000024s : 1: remove_dup_value 1.81% : 0.000657s : 1: renormalize.infer 1.35% : 0.000491s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000015s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000070s : 1: rewriter_after_opt_a 0.31% : 0.000113s : 1: rewriter_before_opt_a 0.02% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000142s : 1: symbol_engine_optimizer 0.34% : 0.000125s : 1: tuple_transform 18.72% : 0.006809s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:49.186.525 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0364888, [21] [bootstrap]: 0.00054206 [type_inference]: 0.0242308 [event_method]: 2.274e-05 [auto_monad]: 6.895e-05 [graph_reusing]: 6.20002e-06 [inline]: 3.23e-06 [add_attr]: 0.00442111, [1] [add_attr_with_inline]: 0.00440541, [1] [Cycle 1]: 0.00011157, [2] [tag_attr]: 2.668e-05 [meta_addattr_fg_expand]: 7.1e-06 [parallel-infer-symbol]: 4.54002e-06 [pre_auto_parallel]: 4.547e-05 [insert-virtual-dataset]: 2.78e-06 [parallel-infer-symbol-second]: 8.79983e-07 [dataset_repeat_opt]: 2.39999e-06 [pipeline_split]: 1.92001e-06 [optimize]: 0.00630737, [53] [py_interpret_to_execute]: 3.504e-05 [rewriter_before_opt_a]: 0.00010409 [opt_a]: 0.00363817, [2] [Cycle 1]: 0.00275954, [45] [expand_dump_flag]: 3.29001e-06 [switch_simplify]: 4.92e-05 [loop_unroll]: 3.208e-05 [a_1]: 0.0008048 [with_stream_mark]: 2.144e-05 [recompute_prepare]: 1.179e-05 [updatestate_depend_eliminate]: 5.57001e-06 [updatestate_assign_eliminate]: 3.71001e-06 [updatestate_loads_eliminate]: 4.02998e-06 [parameter_eliminate]: 2.53e-06 [a_2]: 0.00011019 [accelerated_algorithm]: 1.017e-05 [shard]: 2.58e-06 [meta_shard_fg_expand]: 2.76999e-06 [shard_inline]: 8.3e-06 [merge_send_recv]: 1.061e-05 [auto_parallel]: 9.57001e-06 [parallel]: 2.12e-05 [flash_sp]: 1.17e-05 [merge_comm]: 5.92001e-06 [allreduce_fusion]: 4.28999e-06 [matmul_add_comm_reduction]: 1.104e-05 [allreduce_slice_to_reducescatter]: 9.70002e-07 [virtual_shard_identity]: 1.074e-05 [virtual_dataset]: 8.65999e-06 [get_grad_eliminate_]: 7.95e-06 [virtual_output]: 8.42998e-06 [merge_forward]: 4.68999e-06 [cell_reuse_recompute_pass]: 1.60001e-06 [offload_activation]: 1.315e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.727e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 1.396e-05 [set_forward_comm_id_for_comm_node_pass]: 4.42e-06 [meta_fg_expand]: 3.81999e-06 [flash_sp_send_recv_attached]: 3.48e-06 [receive_attached]: 2.37999e-06 [after_resolve]: 1.566e-05 [a_after_grad]: 1.268e-05 [renormalize]: 0.00105204 [add_forward_monad_depend]: 8.80999e-06 [auto_monad_grad]: 2.68003e-06 [auto_monad_eliminator]: 2.175e-05 [cse]: 4.003e-05 [a_3]: 6.872e-05 [Cycle 2]: 0.00086491, [45] [expand_dump_flag]: 2.16e-06 [switch_simplify]: 1.019e-05 [loop_unroll]: 7.87e-06 [a_1]: 0.00019781 [with_stream_mark]: 2.028e-05 [recompute_prepare]: 7.68999e-06 [updatestate_depend_eliminate]: 5.10001e-06 [updatestate_assign_eliminate]: 3.53999e-06 [updatestate_loads_eliminate]: 3.26001e-06 [parameter_eliminate]: 1.61002e-06 [a_2]: 9.486e-05 [accelerated_algorithm]: 7.95e-06 [shard]: 1.65001e-06 [meta_shard_fg_expand]: 2.76999e-06 [shard_inline]: 7.81001e-06 [merge_send_recv]: 8.87e-06 [auto_parallel]: 9.99999e-06 [parallel]: 1.49e-05 [flash_sp]: 3.97e-06 [merge_comm]: 4.35e-06 [allreduce_fusion]: 4.25e-06 [matmul_add_comm_reduction]: 9.56e-06 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 9.33002e-06 [virtual_dataset]: 7.11001e-06 [get_grad_eliminate_]: 7.51999e-06 [virtual_output]: 8.1e-06 [merge_forward]: 4.95001e-06 [cell_reuse_recompute_pass]: 2.66e-06 [offload_activation]: 9.86e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.015e-05 [merge_recompute_call_nodes]: 1.59e-06 [before_grad]: 1.238e-05 [set_forward_comm_id_for_comm_node_pass]: 5.09e-06 [meta_fg_expand]: 3.33998e-06 [flash_sp_send_recv_attached]: 1.85001e-06 [receive_attached]: 2.16e-06 [after_resolve]: 1.436e-05 [a_after_grad]: 1.214e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.11e-06 [auto_monad_grad]: 1.91e-06 [auto_monad_eliminator]: 1.283e-05 [cse]: 2.523e-05 [a_3]: 4.673e-05 [py_interpret_to_execute_after_opt_a]: 1.912e-05 [slice_cell_reuse_recomputed_activation]: 2.58e-06 [rewriter_after_opt_a]: 7.691e-05 [convert_after_rewriter]: 9.79999e-06 [order_py_execute_after_rewriter]: 6.11e-06 [mutable_eliminate]: 0.00076426 [opt_b]: 0.00026922, [1] [Cycle 1]: 0.00026032, [7] [b_1]: 0.00016346 [b_2]: 1.096e-05 [updatestate_depend_eliminate]: 8.30999e-06 [updatestate_assign_eliminate]: 3.81001e-06 [updatestate_loads_eliminate]: 3.03e-06 [renormalize]: 3.9002e-07 [cse]: 3.197e-05 [optimize_parallel_all_gather_comm]: 2.197e-05 [overlap_param_gather]: 1.96e-06 [cconv]: 3.483e-05 [loop_unroll]: 0.00054245 [opt_after_cconv]: 0.00012506, [1] [Cycle 1]: 0.00011827, [7] [c_1]: 3.842e-05 [parameter_eliminate]: 5.71e-06 [updatestate_depend_eliminate]: 6.39999e-06 [updatestate_assign_eliminate]: 3.04999e-06 [updatestate_loads_eliminate]: 2.98e-06 [cse]: 2.607e-05 [renormalize]: 5.39992e-07 [remove_dup_value]: 1.7e-05 [tuple_transform]: 9.399e-05, [1] [Cycle 1]: 8.894e-05, [4] [d_1]: 5.93e-05 [none_parameter_eliminate]: 1.98002e-06 [renormalize]: 2.29978e-07 [switch_simplify]: 8.84e-06 [partial_unused_args_eliminate]: 2.24999e-06 [add_recomputation]: 7.34e-05 [cse_after_recomputation]: 2.729e-05, [1] [Cycle 1]: 2.261e-05, [1] [cse]: 1.662e-05 [environ_conv]: 7.53e-06 [swap_dp_allreduce_reducescatter]: 6.41998e-06 [bias_add_comm_swap]: 3.30003e-06 [label_micro_interleaved_index]: 5.34e-06 [label_fine_grained_interleaved_index]: 2.58998e-06 [merge_cast_opt]: 1.44e-06 [slice_recompute_activation]: 2.16998e-06 [micro_interleaved_order_control]: 2.27001e-06 [assign_add_opt]: 1.49e-06 [ForceFp32Comm]: 8.80013e-07 [remove_cast_before_assign_add]: 1.18001e-06 [full_micro_interleaved_order_control]: 2.34999e-06 [reorder_send_recv_between_fp_bp]: 2.70002e-06 [comm_op_add_attrs]: 1.02998e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.07e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.35999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.82999e-06 [control_data_broadcast_order]: 1.649e-05 [grouped_pairwise_exchange_alltoall]: 1.72001e-06 [offloading_packed_experts]: 4.53001e-06 [overlap_recompute_and_grad_model_parallel]: 5.92999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.18001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.21e-06 [overlap_grad_ring_attention]: 4.55999e-06 [overlap_grad_flash_sp]: 2.496e-05 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 2.12999e-06 [split_layernorm_comm]: 1.82999e-06 [handle_group_info]: 1.57001e-06 [symbol_engine_optimizer]: 8.959e-05, [1] [Cycle 1]: 8.464e-05, [6] [build]: 4.1e-06 [elim_shapecalc]: 1.199e-05 [elim_not_effective]: 1.679e-05 [opt_reshape]: 8.67e-06 [fold_const_symbol]: 1.278e-05 [renormalize]: 2.00002e-07 [detach_backward]: 2.37001e-06 [pipeline_parallel_scheduler]: 1.89e-06 [auto_monad_reorder]: 2.055e-05 [get_jit_bprop_graph]: 2.73e-06 [rewriter_after_jit_bprop_graph]: 5.90002e-06 [opt_after_jit_grad]: 0.0005521 [validate]: 5.503e-05 Sums bootstrap : 0.000542s : 1.75% type_inference : 0.024231s : 78.18% event_method : 0.000023s : 0.07% auto_monad : 0.000069s : 0.22% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.09% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000005s : 0.01% pre_auto_parallel : 0.000045s : 0.15% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000035s : 0.11% optimize.rewriter_before_opt_a : 0.000104s : 0.34% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000059s : 0.19% optimize.opt_a.loop_unroll : 0.000040s : 0.13% optimize.opt_a.a_1 : 0.001003s : 3.23% optimize.opt_a.with_stream_mark : 0.000042s : 0.13% optimize.opt_a.recompute_prepare : 0.000019s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000205s : 0.66% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.06% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.02% optimize.opt_a.shard_inline : 0.000016s : 0.05% optimize.opt_a.merge_send_recv : 0.000019s : 0.06% optimize.opt_a.auto_parallel : 0.000020s : 0.06% optimize.opt_a.parallel : 0.000036s : 0.12% optimize.opt_a.flash_sp : 0.000016s : 0.05% optimize.opt_a.merge_comm : 0.000010s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.06% optimize.opt_a.virtual_dataset : 0.000016s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.05% optimize.opt_a.virtual_output : 0.000017s : 0.05% optimize.opt_a.merge_forward : 0.000010s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000030s : 0.10% optimize.opt_a.a_after_grad : 0.000025s : 0.08% optimize.opt_a.renormalize : 0.001052s : 3.39% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.04% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.11% optimize.opt_a.cse : 0.000065s : 0.21% optimize.opt_a.a_3 : 0.000115s : 0.37% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000077s : 0.25% optimize.convert_after_rewriter : 0.000010s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000764s : 2.47% optimize.opt_b.b_1 : 0.000163s : 0.53% optimize.opt_b.b_2 : 0.000011s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000032s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000035s : 0.11% optimize.loop_unroll : 0.000542s : 1.75% optimize.opt_after_cconv.c_1 : 0.000038s : 0.12% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000026s : 0.08% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.05% optimize.tuple_transform.d_1 : 0.000059s : 0.19% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000073s : 0.24% optimize.cse_after_recomputation.cse : 0.000017s : 0.05% optimize.environ_conv : 0.000008s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000025s : 0.08% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000002s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.07% get_jit_bprop_graph : 0.000003s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000552s : 1.78% validate : 0.000055s : 0.18% Time group info: ------[substitution.] 0.000266 38 11.70% : 0.000031s : 3: substitution.cast_eliminate 0.84% : 0.000002s : 3: substitution.elim_not_effective 0.64% : 0.000002s : 3: substitution.fold_const_symbol 2.95% : 0.000008s : 5: substitution.graph_param_transform 68.70% : 0.000183s : 4: substitution.inline 2.23% : 0.000006s : 6: substitution.j_node_and_user_rematch 4.17% : 0.000011s : 6: substitution.remove_not_recompute_node 2.73% : 0.000007s : 4: substitution.replace_old_param 6.05% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.024149 2 96.54% : 0.023312s : 1: type_inference.infer 3.46% : 0.000836s : 1: type_inference.specialize ------[replace.] 0.000072 8 61.63% : 0.000044s : 4: replace.inline 38.37% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000194 8 92.72% : 0.000180s : 4: match.inline 7.28% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000282 1596 1.03% : 0.000003s : 17: predicate.accumulaten_eliminater 0.80% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.57% : 0.000002s : 10: predicate.addn_check_dump 1.02% : 0.000003s : 17: predicate.addn_zero_filter 0.88% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.10% : 0.000006s : 27: predicate.arithmetic_simplify 1.07% : 0.000003s : 17: predicate.cast_eliminate 0.52% : 0.000001s : 10: predicate.check_bprop_eliminate 0.62% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.64% : 0.000002s : 10: predicate.depend_value_elim 0.96% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.16% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.00% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 5: predicate.elim_not_effective 0.49% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.31% : 0.000004s : 22: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_depend_swap 1.70% : 0.000005s : 32: predicate.environ_get_eliminate 1.23% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.45% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.25% : 0.000006s : 25: predicate.float_depend_g_call 0.57% : 0.000002s : 10: predicate.float_environ_get_switch 0.85% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.68% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.53% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 5.76% : 0.000016s : 72: predicate.inline 0.75% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.92% : 0.000003s : 10: predicate.less_batch_normalization 1.85% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.48% : 0.000007s : 48: predicate.load_eliminater 0.99% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.93% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.66% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.56% : 0.000002s : 10: predicate.merge_addn 0.59% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 17: predicate.minmaximum_grad 1.47% : 0.000004s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.36% : 0.000001s : 5: predicate.parallel_virtual_node 1.88% : 0.000005s : 25: predicate.partial_defer_inline 1.55% : 0.000004s : 26: predicate.partial_eliminate 1.01% : 0.000003s : 17: predicate.print_const_string_wrapper 0.67% : 0.000002s : 10: predicate.reduce_all_const_elim 1.26% : 0.000004s : 17: predicate.reduce_eliminate 2.46% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 10: predicate.remove_not_recompute_node 1.28% : 0.000004s : 31: predicate.replace_applicator 0.38% : 0.000001s : 10: predicate.replace_old_param 0.29% : 0.000001s : 5: predicate.reset_defer_inline 1.20% : 0.000003s : 17: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.85% : 0.000002s : 10: predicate.same_eliminate 0.38% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.88% : 0.000002s : 10: predicate.shard_identity_eliminate 0.67% : 0.000002s : 10: predicate.special_op_eliminate 0.67% : 0.000002s : 10: predicate.specialize_transform 1.00% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.93% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.39% : 0.000004s : 25: predicate.switch_defer_inline 1.96% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.59% : 0.000013s : 76: predicate.switch_simplify 0.95% : 0.000003s : 17: predicate.tile_eliminate 0.86% : 0.000002s : 17: predicate.transpose_eliminate 1.61% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.69% : 0.000005s : 27: predicate.tuple_list_get_item_depend_reorder 3.59% : 0.000010s : 41: predicate.tuple_list_get_item_eliminator 1.40% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.51% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.92% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.45% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.03% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 5: predicate.value_based_eliminate 0.75% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.59% : 0.000002s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000728 11 55.05% : 0.000401s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.95% : 0.000327s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.050015 192 0.01% : 0.000004s : 1: ForceFp32Comm 8.85% : 0.004428s : 1: add_attr 8.82% : 0.004410s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.16% : 0.000078s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.15% : 0.000076s : 1: auto_monad 0.05% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.16% : 0.000580s : 1: bootstrap 0.08% : 0.000039s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000020s : 1: control_data_broadcast_order 0.03% : 0.000014s : 1: convert_after_rewriter 0.06% : 0.000030s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.06% : 0.000030s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 1.10% : 0.000552s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.55% : 0.000777s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.04% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000026s : 1: opt.transform.mutable_eliminate 3.10% : 0.001551s : 78: opt.transform.opt_a 0.07% : 0.000037s : 1: opt.transform.opt_after_cconv 0.06% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.28% : 0.000139s : 28: opt.transform.opt_b 0.13% : 0.000065s : 2: opt.transform.opt_trans_graph 0.09% : 0.000046s : 4: opt.transform.symbol_engine_opt 7.28% : 0.003642s : 1: opt_a 0.26% : 0.000129s : 1: opt_after_cconv 1.12% : 0.000561s : 1: opt_after_jit_grad 0.55% : 0.000273s : 1: opt_b 12.62% : 0.006314s : 1: optimize 0.05% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.06% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000009s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.10% : 0.000050s : 1: pre_auto_parallel 0.08% : 0.000039s : 1: py_interpret_to_execute 0.05% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000021s : 1: remove_dup_value 1.14% : 0.000572s : 1: renormalize.infer 0.94% : 0.000468s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000083s : 1: rewriter_after_opt_a 0.22% : 0.000108s : 1: rewriter_before_opt_a 0.01% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.18% : 0.000092s : 1: symbol_engine_optimizer 0.19% : 0.000097s : 1: tuple_transform 48.51% : 0.024261s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:49.791.164 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:49.791.460 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0308576, [21] [bootstrap]: 0.00043486 [type_inference]: 0.00612095 [event_method]: 1.961e-05 [auto_monad]: 6.122e-05 [graph_reusing]: 5.54e-06 [inline]: 1.94999e-06 [add_attr]: 0.00353587, [1] [add_attr_with_inline]: 0.00352401, [1] [Cycle 1]: 8.871e-05, [2] [tag_attr]: 2.301e-05 [meta_addattr_fg_expand]: 5.77999e-06 [parallel-infer-symbol]: 4.13001e-06 [pre_auto_parallel]: 4.084e-05 [insert-virtual-dataset]: 2.66e-06 [parallel-infer-symbol-second]: 9.50007e-07 [dataset_repeat_opt]: 2.11e-06 [pipeline_split]: 1.67999e-06 [optimize]: 0.0189273, [53] [py_interpret_to_execute]: 3.433e-05 [rewriter_before_opt_a]: 9.841e-05 [opt_a]: 0.0157513, [2] [Cycle 1]: 0.0147435, [45] [expand_dump_flag]: 2.73e-06 [switch_simplify]: 4.327e-05 [loop_unroll]: 3.086e-05 [a_1]: 0.00064162 [with_stream_mark]: 1.909e-05 [recompute_prepare]: 9.35001e-06 [updatestate_depend_eliminate]: 4.32e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 3.54002e-06 [parameter_eliminate]: 1.94e-06 [a_2]: 0.00010982 [accelerated_algorithm]: 7.66001e-06 [shard]: 1.67999e-06 [meta_shard_fg_expand]: 1.91003e-06 [shard_inline]: 6.84999e-06 [merge_send_recv]: 9.00001e-06 [auto_parallel]: 6.81001e-06 [parallel]: 1.92e-05 [flash_sp]: 9.16002e-06 [merge_comm]: 4.13001e-06 [allreduce_fusion]: 4e-06 [matmul_add_comm_reduction]: 9.71e-06 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 7.78001e-06 [virtual_dataset]: 6.83e-06 [get_grad_eliminate_]: 6.49001e-06 [virtual_output]: 7.53999e-06 [merge_forward]: 4.09002e-06 [cell_reuse_recompute_pass]: 1.15999e-06 [offload_activation]: 1.058e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.617e-05 [merge_recompute_call_nodes]: 1.79e-06 [before_grad]: 5.37e-05 [set_forward_comm_id_for_comm_node_pass]: 2.136e-05 [meta_fg_expand]: 6.01e-06 [flash_sp_send_recv_attached]: 8.99e-06 [receive_attached]: 2.94001e-06 [after_resolve]: 1.983e-05 [a_after_grad]: 1.326e-05 [renormalize]: 0.00093038 [add_forward_monad_depend]: 9.42001e-06 [auto_monad_grad]: 2.58e-06 [auto_monad_eliminator]: 1.868e-05 [cse]: 3.164e-05 [a_3]: 7.407e-05 [Cycle 2]: 0.00098947, [45] [expand_dump_flag]: 3.60998e-06 [switch_simplify]: 9.49999e-06 [loop_unroll]: 8.13999e-06 [a_1]: 0.00014725 [with_stream_mark]: 1.96e-05 [recompute_prepare]: 7.68999e-06 [updatestate_depend_eliminate]: 3.93999e-06 [updatestate_assign_eliminate]: 3.60003e-06 [updatestate_loads_eliminate]: 3.14999e-06 [parameter_eliminate]: 2.06998e-06 [a_2]: 0.00010958 [accelerated_algorithm]: 7.36999e-06 [shard]: 2.76e-06 [meta_shard_fg_expand]: 2.53998e-06 [shard_inline]: 7.2e-06 [merge_send_recv]: 7.83999e-06 [auto_parallel]: 1.229e-05 [parallel]: 1.47e-05 [flash_sp]: 4.60001e-06 [merge_comm]: 3.9e-06 [allreduce_fusion]: 3.69002e-06 [matmul_add_comm_reduction]: 1.047e-05 [allreduce_slice_to_reducescatter]: 1.07e-06 [virtual_shard_identity]: 1.005e-05 [virtual_dataset]: 6.71999e-06 [get_grad_eliminate_]: 6.99001e-06 [virtual_output]: 8.54e-06 [merge_forward]: 4.63999e-06 [cell_reuse_recompute_pass]: 3.42002e-06 [offload_activation]: 1.189e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.963e-05 [merge_recompute_call_nodes]: 1.74998e-06 [before_grad]: 1.242e-05 [set_forward_comm_id_for_comm_node_pass]: 4.63001e-06 [meta_fg_expand]: 3.27997e-06 [flash_sp_send_recv_attached]: 1.37999e-06 [receive_attached]: 3.18e-06 [after_resolve]: 1.362e-05 [a_after_grad]: 1.03e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.26e-06 [auto_monad_grad]: 1.25999e-06 [auto_monad_eliminator]: 1.115e-05 [cse]: 1.921e-05 [a_3]: 5.688e-05 [py_interpret_to_execute_after_opt_a]: 2.153e-05 [slice_cell_reuse_recomputed_activation]: 5.55001e-06 [rewriter_after_opt_a]: 5.512e-05 [convert_after_rewriter]: 1.166e-05 [order_py_execute_after_rewriter]: 8.76997e-06 [mutable_eliminate]: 0.00083876 [opt_b]: 0.00031861, [1] [Cycle 1]: 0.00030528, [7] [b_1]: 0.00017858 [b_2]: 1.157e-05 [updatestate_depend_eliminate]: 1.025e-05 [updatestate_assign_eliminate]: 3.51999e-06 [updatestate_loads_eliminate]: 3.14999e-06 [renormalize]: 1.03001e-06 [cse]: 3.295e-05 [optimize_parallel_all_gather_comm]: 2.741e-05 [overlap_param_gather]: 5.73997e-06 [cconv]: 4.296e-05 [loop_unroll]: 0.00074264 [opt_after_cconv]: 0.000166, [1] [Cycle 1]: 0.00015427, [7] [c_1]: 3.895e-05 [parameter_eliminate]: 7.18e-06 [updatestate_depend_eliminate]: 8.18001e-06 [updatestate_assign_eliminate]: 3.5e-06 [updatestate_loads_eliminate]: 2.39999e-06 [cse]: 3.068e-05 [renormalize]: 8.49977e-07 [remove_dup_value]: 2.034e-05 [tuple_transform]: 0.00010751, [1] [Cycle 1]: 9.878e-05, [4] [d_1]: 5.456e-05 [none_parameter_eliminate]: 1.97001e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.09002e-06 [partial_unused_args_eliminate]: 4.95001e-06 [add_recomputation]: 6.505e-05 [cse_after_recomputation]: 3.06e-05, [1] [Cycle 1]: 2.319e-05, [1] [cse]: 1.297e-05 [environ_conv]: 9.46003e-06 [swap_dp_allreduce_reducescatter]: 8.99e-06 [bias_add_comm_swap]: 6.29999e-06 [label_micro_interleaved_index]: 9.64e-06 [label_fine_grained_interleaved_index]: 5.66e-06 [merge_cast_opt]: 3.96001e-06 [slice_recompute_activation]: 5.07999e-06 [micro_interleaved_order_control]: 5.22e-06 [assign_add_opt]: 4.27e-06 [ForceFp32Comm]: 3.71001e-06 [remove_cast_before_assign_add]: 3.78999e-06 [full_micro_interleaved_order_control]: 4.63999e-06 [reorder_send_recv_between_fp_bp]: 5.66003e-06 [comm_op_add_attrs]: 3.75998e-06 [add_comm_op_reuse_tag]: 3.63e-06 [interleave_split_concat_branches]: 3.55e-06 [interleave_parallel_branches]: 3.78001e-06 [overlap_opt_shard_in_pipeline]: 4.42e-06 [overlap_opt_shard_grad_in_pipeline]: 4.58999e-06 [control_data_broadcast_order]: 2.043e-05 [grouped_pairwise_exchange_alltoall]: 3.91001e-06 [offloading_packed_experts]: 6.87002e-06 [overlap_recompute_and_grad_model_parallel]: 8.05e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.63e-06 [overlap_recompute_allgather_and_fa_grad]: 4.02998e-06 [overlap_recompute_comm]: 5.54e-06 [overlap_grad_ring_attention]: 6.95002e-06 [overlap_grad_flash_sp]: 2.687e-05 [begin_end_overlap_inline]: 3.04001e-06 [split_matmul_comm_elemetwise]: 4.91997e-06 [split_layernorm_comm]: 3.93999e-06 [handle_group_info]: 3.86999e-06 [symbol_engine_optimizer]: 0.00011676, [1] [Cycle 1]: 0.0001078, [6] [build]: 4.79002e-06 [elim_shapecalc]: 1.396e-05 [elim_not_effective]: 1.734e-05 [opt_reshape]: 8.26002e-06 [fold_const_symbol]: 1.151e-05 [renormalize]: 4.00003e-07 [detach_backward]: 6.44001e-06 [pipeline_parallel_scheduler]: 1.81e-06 [auto_monad_reorder]: 2.517e-05 [get_jit_bprop_graph]: 1.81e-06 [rewriter_after_jit_bprop_graph]: 9.10001e-06 [opt_after_jit_grad]: 0.00090463 [validate]: 5.832e-05 Sums bootstrap : 0.000435s : 3.29% type_inference : 0.006121s : 46.37% event_method : 0.000020s : 0.15% auto_monad : 0.000061s : 0.46% graph_reusing : 0.000006s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000041s : 0.31% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000034s : 0.26% optimize.rewriter_before_opt_a : 0.000098s : 0.75% optimize.opt_a.expand_dump_flag : 0.000006s : 0.05% optimize.opt_a.switch_simplify : 0.000053s : 0.40% optimize.opt_a.loop_unroll : 0.000039s : 0.30% optimize.opt_a.a_1 : 0.000789s : 5.98% optimize.opt_a.with_stream_mark : 0.000039s : 0.29% optimize.opt_a.recompute_prepare : 0.000017s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000219s : 1.66% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.11% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.11% optimize.opt_a.merge_send_recv : 0.000017s : 0.13% optimize.opt_a.auto_parallel : 0.000019s : 0.14% optimize.opt_a.parallel : 0.000034s : 0.26% optimize.opt_a.flash_sp : 0.000014s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000008s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.14% optimize.opt_a.virtual_dataset : 0.000014s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.10% optimize.opt_a.virtual_output : 0.000016s : 0.12% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.03% optimize.opt_a.offload_activation : 0.000022s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.03% optimize.opt_a.before_grad : 0.000066s : 0.50% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000026s : 0.20% optimize.opt_a.meta_fg_expand : 0.000009s : 0.07% optimize.opt_a.flash_sp_send_recv_attached : 0.000010s : 0.08% optimize.opt_a.receive_attached : 0.000006s : 0.05% optimize.opt_a.after_resolve : 0.000033s : 0.25% optimize.opt_a.a_after_grad : 0.000024s : 0.18% optimize.opt_a.renormalize : 0.000930s : 7.05% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.09% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.23% optimize.opt_a.cse : 0.000051s : 0.39% optimize.opt_a.a_3 : 0.000131s : 0.99% optimize.py_interpret_to_execute_after_opt_a : 0.000022s : 0.16% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.04% optimize.rewriter_after_opt_a : 0.000055s : 0.42% optimize.convert_after_rewriter : 0.000012s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000839s : 6.35% optimize.opt_b.b_1 : 0.000179s : 1.35% optimize.opt_b.b_2 : 0.000012s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000033s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000027s : 0.21% optimize.overlap_param_gather : 0.000006s : 0.04% optimize.cconv : 0.000043s : 0.33% optimize.loop_unroll : 0.000743s : 5.63% optimize.opt_after_cconv.c_1 : 0.000039s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000031s : 0.23% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000020s : 0.15% optimize.tuple_transform.d_1 : 0.000055s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000065s : 0.49% optimize.cse_after_recomputation.cse : 0.000013s : 0.10% optimize.environ_conv : 0.000009s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000010s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.03% optimize.control_data_broadcast_order : 0.000020s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000006s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000027s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000005s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.05% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000025s : 0.19% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000009s : 0.07% opt_after_jit_grad : 0.000905s : 6.85% validate : 0.000058s : 0.44% Time group info: ------[substitution.] 0.000210 28 0.97% : 0.000002s : 2: substitution.elim_not_effective 0.85% : 0.000002s : 2: substitution.fold_const_symbol 3.38% : 0.000007s : 4: substitution.graph_param_transform 76.09% : 0.000160s : 4: substitution.inline 4.30% : 0.000009s : 4: substitution.j_node_and_user_rematch 2.78% : 0.000006s : 4: substitution.remove_not_recompute_node 4.33% : 0.000009s : 4: substitution.replace_old_param 7.31% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006069 2 88.32% : 0.005360s : 1: type_inference.infer 11.68% : 0.000709s : 1: type_inference.specialize ------[replace.] 0.000064 8 64.42% : 0.000041s : 4: replace.inline 35.58% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000171 8 92.18% : 0.000157s : 4: match.inline 7.82% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000236 1278 0.86% : 0.000002s : 13: predicate.accumulaten_eliminater 1.33% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 8: predicate.addn_check_dump 0.91% : 0.000002s : 13: predicate.addn_zero_filter 0.75% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.00% : 0.000005s : 21: predicate.arithmetic_simplify 0.94% : 0.000002s : 13: predicate.cast_eliminate 0.64% : 0.000002s : 8: predicate.check_bprop_eliminate 0.45% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.53% : 0.000001s : 8: predicate.depend_value_elim 1.00% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.96% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.41% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 4: predicate.elim_not_effective 0.56% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.00% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.95% : 0.000002s : 17: predicate.environ_get_depend_swap 1.66% : 0.000004s : 25: predicate.environ_get_eliminate 1.26% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.33% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.21% : 0.000005s : 21: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.74% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 4: predicate.fold_const_symbol 0.72% : 0.000002s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.56% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.06% : 0.000014s : 58: predicate.inline 0.83% : 0.000002s : 8: predicate.inline_without_move 0.74% : 0.000002s : 8: predicate.j_node_and_user_rematch 0.92% : 0.000002s : 8: predicate.less_batch_normalization 1.90% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.38% : 0.000006s : 38: predicate.load_eliminater 1.78% : 0.000004s : 4: predicate.loop_unroll_after_grad 2.15% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.96% : 0.000005s : 21: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 8: predicate.merge_addn 0.62% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.71% : 0.000002s : 13: predicate.minmaximum_grad 2.05% : 0.000005s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.42% : 0.000001s : 4: predicate.parallel_virtual_node 1.51% : 0.000004s : 21: predicate.partial_defer_inline 1.50% : 0.000004s : 21: predicate.partial_eliminate 0.90% : 0.000002s : 13: predicate.print_const_string_wrapper 0.74% : 0.000002s : 8: predicate.reduce_all_const_elim 1.05% : 0.000002s : 13: predicate.reduce_eliminate 2.29% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000001s : 8: predicate.remove_not_recompute_node 1.19% : 0.000003s : 25: predicate.replace_applicator 0.57% : 0.000001s : 8: predicate.replace_old_param 0.38% : 0.000001s : 4: predicate.reset_defer_inline 0.80% : 0.000002s : 13: predicate.reshape_eliminate 0.75% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.55% : 0.000001s : 4: predicate.row_tensor_eliminate 1.07% : 0.000003s : 8: predicate.same_eliminate 0.40% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 8: predicate.shard_identity_eliminate 0.85% : 0.000002s : 8: predicate.special_op_eliminate 0.64% : 0.000002s : 8: predicate.specialize_transform 1.42% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.92% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.46% : 0.000003s : 21: predicate.switch_defer_inline 1.92% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.56% : 0.000011s : 67: predicate.switch_simplify 0.82% : 0.000002s : 13: predicate.tile_eliminate 0.89% : 0.000002s : 13: predicate.transpose_eliminate 1.27% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.39% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.17% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.36% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.43% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.08% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.56% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.26% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.08% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 4: predicate.value_based_eliminate 0.67% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 8: predicate.virtual_output_eliminate 0.42% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000600 11 52.93% : 0.000318s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.07% : 0.000282s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.055717 192 0.01% : 0.000007s : 1: ForceFp32Comm 6.37% : 0.003548s : 1: add_attr 6.33% : 0.003528s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.13% : 0.000070s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.12% : 0.000069s : 1: auto_monad 0.06% : 0.000034s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000009s : 1: bias_add_comm_swap 0.87% : 0.000485s : 1: bootstrap 0.08% : 0.000047s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.04% : 0.000024s : 1: control_data_broadcast_order 0.03% : 0.000016s : 1: convert_after_rewriter 0.06% : 0.000034s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000033s : 1: detach_backward 0.02% : 0.000012s : 1: environ_conv 0.05% : 0.000030s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000007s : 1: inline 0.02% : 0.000010s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000013s : 1: label_micro_interleaved_index 1.35% : 0.000753s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.52% : 0.000849s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.05% : 0.000025s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000024s : 1: opt.transform.mutable_eliminate 2.33% : 0.001300s : 78: opt.transform.opt_a 0.07% : 0.000037s : 1: opt.transform.opt_after_cconv 0.08% : 0.000044s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000111s : 28: opt.transform.opt_b 0.11% : 0.000060s : 2: opt.transform.opt_trans_graph 0.08% : 0.000047s : 4: opt.transform.symbol_engine_opt 28.28% : 0.015757s : 1: opt_a 0.30% : 0.000170s : 1: opt_after_cconv 1.66% : 0.000924s : 1: opt_after_jit_grad 0.58% : 0.000323s : 1: opt_b 34.70% : 0.019332s : 1: optimize 0.06% : 0.000031s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.05% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000011s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.09% : 0.000049s : 1: pre_auto_parallel 0.07% : 0.000039s : 1: py_interpret_to_execute 0.05% : 0.000026s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.04% : 0.000024s : 1: remove_dup_value 0.89% : 0.000495s : 1: renormalize.infer 0.76% : 0.000422s : 1: renormalize.specialize 0.02% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000015s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000060s : 1: rewriter_after_opt_a 0.18% : 0.000102s : 1: rewriter_before_opt_a 0.02% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000009s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.22% : 0.000120s : 1: symbol_engine_optimizer 0.20% : 0.000111s : 1: tuple_transform 11.06% : 0.006160s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:50.246.452 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0388645, [21] [bootstrap]: 0.00049864 [type_inference]: 0.00734772 [event_method]: 2.172e-05 [auto_monad]: 9.84e-05 [graph_reusing]: 7.13998e-06 [inline]: 3.65998e-06 [add_attr]: 0.00408866, [1] [add_attr_with_inline]: 0.00407364, [1] [Cycle 1]: 8.205e-05, [2] [tag_attr]: 2.558e-05 [meta_addattr_fg_expand]: 6.31e-06 [parallel-infer-symbol]: 4.08001e-06 [pre_auto_parallel]: 4.463e-05 [insert-virtual-dataset]: 3.01001e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 2.48998e-06 [pipeline_split]: 1.89e-06 [optimize]: 0.0259965, [53] [py_interpret_to_execute]: 3.523e-05 [rewriter_before_opt_a]: 9.942e-05 [opt_a]: 0.00326036, [2] [Cycle 1]: 0.00247519, [45] [expand_dump_flag]: 3.31999e-06 [switch_simplify]: 4.478e-05 [loop_unroll]: 3.019e-05 [a_1]: 0.00070379 [with_stream_mark]: 2.525e-05 [recompute_prepare]: 1.375e-05 [updatestate_depend_eliminate]: 5.10999e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 3.41001e-06 [parameter_eliminate]: 2.68e-06 [a_2]: 8.49e-05 [accelerated_algorithm]: 8.53001e-06 [shard]: 2.71999e-06 [meta_shard_fg_expand]: 2.49001e-06 [shard_inline]: 6.54001e-06 [merge_send_recv]: 9.39e-06 [auto_parallel]: 1.022e-05 [parallel]: 2.208e-05 [flash_sp]: 1.09e-05 [merge_comm]: 4.86002e-06 [allreduce_fusion]: 3.80998e-06 [matmul_add_comm_reduction]: 1.121e-05 [allreduce_slice_to_reducescatter]: 7.39994e-07 [virtual_shard_identity]: 1.072e-05 [virtual_dataset]: 6.68e-06 [get_grad_eliminate_]: 6.79999e-06 [virtual_output]: 7.1e-06 [merge_forward]: 4.68001e-06 [cell_reuse_recompute_pass]: 1.58002e-06 [offload_activation]: 1.104e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.645e-05 [merge_recompute_call_nodes]: 1.89e-06 [before_grad]: 1.233e-05 [set_forward_comm_id_for_comm_node_pass]: 4.26001e-06 [meta_fg_expand]: 3.29001e-06 [flash_sp_send_recv_attached]: 3.14001e-06 [receive_attached]: 2.51e-06 [after_resolve]: 1.422e-05 [a_after_grad]: 1.099e-05 [renormalize]: 0.00090728 [add_forward_monad_depend]: 1.042e-05 [auto_monad_grad]: 2.87002e-06 [auto_monad_eliminator]: 2.055e-05 [cse]: 3.516e-05 [a_3]: 6.309e-05 [Cycle 2]: 0.00077129, [45] [expand_dump_flag]: 3.21999e-06 [switch_simplify]: 9.76e-06 [loop_unroll]: 6.84001e-06 [a_1]: 0.00014502 [with_stream_mark]: 2.15e-05 [recompute_prepare]: 7.53e-06 [updatestate_depend_eliminate]: 4.3e-06 [updatestate_assign_eliminate]: 2.56998e-06 [updatestate_loads_eliminate]: 3.55003e-06 [parameter_eliminate]: 1.86998e-06 [a_2]: 7.487e-05 [accelerated_algorithm]: 7.43e-06 [shard]: 2.79001e-06 [meta_shard_fg_expand]: 2.65002e-06 [shard_inline]: 6.04999e-06 [merge_send_recv]: 7.65e-06 [auto_parallel]: 9.51e-06 [parallel]: 8.32e-06 [flash_sp]: 4.38001e-06 [merge_comm]: 3.83001e-06 [allreduce_fusion]: 4.38999e-06 [matmul_add_comm_reduction]: 1.041e-05 [allreduce_slice_to_reducescatter]: 1.20999e-06 [virtual_shard_identity]: 8.25e-06 [virtual_dataset]: 6.33e-06 [get_grad_eliminate_]: 5.99e-06 [virtual_output]: 6.08998e-06 [merge_forward]: 3.86001e-06 [cell_reuse_recompute_pass]: 3.43e-06 [offload_activation]: 9.89001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.792e-05 [merge_recompute_call_nodes]: 1.60999e-06 [before_grad]: 1.062e-05 [set_forward_comm_id_for_comm_node_pass]: 3.66999e-06 [meta_fg_expand]: 3.23e-06 [flash_sp_send_recv_attached]: 2.01e-06 [receive_attached]: 2.11998e-06 [after_resolve]: 1.193e-05 [a_after_grad]: 9.94001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.68e-06 [auto_monad_grad]: 1.66e-06 [auto_monad_eliminator]: 1.167e-05 [cse]: 1.859e-05 [a_3]: 3.901e-05 [py_interpret_to_execute_after_opt_a]: 1.813e-05 [slice_cell_reuse_recomputed_activation]: 2.21998e-06 [rewriter_after_opt_a]: 4.236e-05 [convert_after_rewriter]: 7.53999e-06 [order_py_execute_after_rewriter]: 5.30001e-06 [mutable_eliminate]: 0.0209707 [opt_b]: 0.00025384, [1] [Cycle 1]: 0.00024353, [7] [b_1]: 0.00013809 [b_2]: 8.89998e-06 [updatestate_depend_eliminate]: 1.093e-05 [updatestate_assign_eliminate]: 3.06001e-06 [updatestate_loads_eliminate]: 3.32997e-06 [renormalize]: 1.12e-06 [cse]: 4.024e-05 [optimize_parallel_all_gather_comm]: 2.457e-05 [overlap_param_gather]: 2.09e-06 [cconv]: 3.854e-05 [loop_unroll]: 0.00050589 [opt_after_cconv]: 0.00011336, [1] [Cycle 1]: 0.00010703, [7] [c_1]: 3.276e-05 [parameter_eliminate]: 6.31e-06 [updatestate_depend_eliminate]: 6.36e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.41e-06 [cse]: 2.116e-05 [renormalize]: 4.39992e-07 [remove_dup_value]: 1.642e-05 [tuple_transform]: 8.392e-05, [1] [Cycle 1]: 7.942e-05, [4] [d_1]: 5.163e-05 [none_parameter_eliminate]: 1.62999e-06 [renormalize]: 2.40019e-07 [switch_simplify]: 7.40003e-06 [partial_unused_args_eliminate]: 1.94999e-06 [add_recomputation]: 5.63e-05 [cse_after_recomputation]: 2.271e-05, [1] [Cycle 1]: 1.742e-05, [1] [cse]: 1.188e-05 [environ_conv]: 5.67001e-06 [swap_dp_allreduce_reducescatter]: 5.56002e-06 [bias_add_comm_swap]: 3.71001e-06 [label_micro_interleaved_index]: 5.67999e-06 [label_fine_grained_interleaved_index]: 3.05998e-06 [merge_cast_opt]: 1.44e-06 [slice_recompute_activation]: 2.17999e-06 [micro_interleaved_order_control]: 2.53e-06 [assign_add_opt]: 1.37e-06 [ForceFp32Comm]: 1.13001e-06 [remove_cast_before_assign_add]: 1.22e-06 [full_micro_interleaved_order_control]: 2.29001e-06 [reorder_send_recv_between_fp_bp]: 2.81e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.02998e-06 [interleave_parallel_branches]: 1.09e-06 [overlap_opt_shard_in_pipeline]: 1.20001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.37999e-06 [control_data_broadcast_order]: 1.409e-05 [grouped_pairwise_exchange_alltoall]: 1.59e-06 [offloading_packed_experts]: 4.17e-06 [overlap_recompute_and_grad_model_parallel]: 4.89003e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.25001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.59998e-06 [overlap_recompute_comm]: 2.31e-06 [overlap_grad_ring_attention]: 4.18999e-06 [overlap_grad_flash_sp]: 2.341e-05 [begin_end_overlap_inline]: 6.19999e-07 [split_matmul_comm_elemetwise]: 2.24999e-06 [split_layernorm_comm]: 1.67999e-06 [handle_group_info]: 1.32e-06 [symbol_engine_optimizer]: 8.093e-05, [1] [Cycle 1]: 7.616e-05, [6] [build]: 4.84e-06 [elim_shapecalc]: 1.098e-05 [elim_not_effective]: 1.33e-05 [opt_reshape]: 7.36999e-06 [fold_const_symbol]: 1.058e-05 [renormalize]: 2.69996e-07 [detach_backward]: 2.21e-06 [pipeline_parallel_scheduler]: 1.77999e-06 [auto_monad_reorder]: 1.799e-05 [get_jit_bprop_graph]: 1.87001e-06 [rewriter_after_jit_bprop_graph]: 6.54999e-06 [opt_after_jit_grad]: 0.00048979 [validate]: 4.693e-05 Sums bootstrap : 0.000499s : 1.48% type_inference : 0.007348s : 21.82% event_method : 0.000022s : 0.06% auto_monad : 0.000098s : 0.29% graph_reusing : 0.000007s : 0.02% inline : 0.000004s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000045s : 0.13% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000035s : 0.10% optimize.rewriter_before_opt_a : 0.000099s : 0.30% optimize.opt_a.expand_dump_flag : 0.000007s : 0.02% optimize.opt_a.switch_simplify : 0.000055s : 0.16% optimize.opt_a.loop_unroll : 0.000037s : 0.11% optimize.opt_a.a_1 : 0.000849s : 2.52% optimize.opt_a.with_stream_mark : 0.000047s : 0.14% optimize.opt_a.recompute_prepare : 0.000021s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000160s : 0.47% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.05% optimize.opt_a.shard : 0.000006s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000013s : 0.04% optimize.opt_a.merge_send_recv : 0.000017s : 0.05% optimize.opt_a.auto_parallel : 0.000020s : 0.06% optimize.opt_a.parallel : 0.000030s : 0.09% optimize.opt_a.flash_sp : 0.000015s : 0.05% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.06% optimize.opt_a.virtual_dataset : 0.000013s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.04% optimize.opt_a.virtual_output : 0.000013s : 0.04% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000023s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.02% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000026s : 0.08% optimize.opt_a.a_after_grad : 0.000021s : 0.06% optimize.opt_a.renormalize : 0.000907s : 2.69% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.04% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.10% optimize.opt_a.cse : 0.000054s : 0.16% optimize.opt_a.a_3 : 0.000102s : 0.30% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000042s : 0.13% optimize.convert_after_rewriter : 0.000008s : 0.02% optimize.order_py_execute_after_rewriter : 0.000005s : 0.02% optimize.mutable_eliminate : 0.020971s : 62.29% optimize.opt_b.b_1 : 0.000138s : 0.41% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000040s : 0.12% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000039s : 0.11% optimize.loop_unroll : 0.000506s : 1.50% optimize.opt_after_cconv.c_1 : 0.000033s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000021s : 0.06% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.05% optimize.tuple_transform.d_1 : 0.000052s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000056s : 0.17% optimize.cse_after_recomputation.cse : 0.000012s : 0.04% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000004s : 0.01% optimize.label_micro_interleaved_index : 0.000006s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000023s : 0.07% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.05% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.02% opt_after_jit_grad : 0.000490s : 1.45% validate : 0.000047s : 0.14% Time group info: ------[substitution.] 0.000231 28 0.80% : 0.000002s : 2: substitution.elim_not_effective 0.66% : 0.000002s : 2: substitution.fold_const_symbol 3.08% : 0.000007s : 4: substitution.graph_param_transform 79.95% : 0.000185s : 4: substitution.inline 2.32% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.80% : 0.000006s : 4: substitution.remove_not_recompute_node 2.69% : 0.000006s : 4: substitution.replace_old_param 7.70% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007271 2 88.91% : 0.006464s : 1: type_inference.infer 11.09% : 0.000807s : 1: type_inference.specialize ------[replace.] 0.000071 8 61.83% : 0.000044s : 4: replace.inline 38.17% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000198 8 91.98% : 0.000182s : 4: match.inline 8.02% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000234 1278 0.95% : 0.000002s : 13: predicate.accumulaten_eliminater 0.71% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.44% : 0.000001s : 8: predicate.addn_check_dump 1.00% : 0.000002s : 13: predicate.addn_zero_filter 0.77% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.20% : 0.000005s : 21: predicate.arithmetic_simplify 0.98% : 0.000002s : 13: predicate.cast_eliminate 0.51% : 0.000001s : 8: predicate.check_bprop_eliminate 0.41% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 0.97% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 13: predicate.dict_get_item_eliminator 1.03% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.94% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.34% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.36% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.00% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_depend_swap 1.68% : 0.000004s : 25: predicate.environ_get_eliminate 1.15% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.46% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.37% : 0.000006s : 21: predicate.float_depend_g_call 0.51% : 0.000001s : 8: predicate.float_environ_get_switch 0.75% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.67% : 0.000002s : 8: predicate.get_grad_eliminate 0.27% : 0.000001s : 4: predicate.graph_param_transform 0.52% : 0.000001s : 8: predicate.incorporate_call 0.47% : 0.000001s : 8: predicate.incorporate_call_switch 5.64% : 0.000013s : 58: predicate.inline 0.89% : 0.000002s : 8: predicate.inline_without_move 0.27% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.09% : 0.000003s : 8: predicate.less_batch_normalization 1.85% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.38% : 0.000006s : 38: predicate.load_eliminater 0.90% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.13% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.62% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.49% : 0.000001s : 8: predicate.merge_addn 0.50% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.72% : 0.000002s : 8: predicate.mini_step_allgather_replace 1.12% : 0.000003s : 13: predicate.minmaximum_grad 2.34% : 0.000005s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.39% : 0.000001s : 4: predicate.parallel_virtual_node 1.83% : 0.000004s : 21: predicate.partial_defer_inline 1.50% : 0.000004s : 21: predicate.partial_eliminate 0.98% : 0.000002s : 13: predicate.print_const_string_wrapper 0.65% : 0.000002s : 8: predicate.reduce_all_const_elim 1.21% : 0.000003s : 13: predicate.reduce_eliminate 2.26% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000001s : 8: predicate.remove_not_recompute_node 1.27% : 0.000003s : 25: predicate.replace_applicator 0.47% : 0.000001s : 8: predicate.replace_old_param 0.43% : 0.000001s : 4: predicate.reset_defer_inline 1.03% : 0.000002s : 13: predicate.reshape_eliminate 0.70% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 4: predicate.row_tensor_eliminate 1.00% : 0.000002s : 8: predicate.same_eliminate 0.49% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.05% : 0.000002s : 8: predicate.shard_identity_eliminate 0.74% : 0.000002s : 8: predicate.special_op_eliminate 0.71% : 0.000002s : 8: predicate.specialize_transform 1.18% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.59% : 0.000004s : 21: predicate.switch_defer_inline 1.86% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.78% : 0.000011s : 67: predicate.switch_simplify 0.86% : 0.000002s : 13: predicate.tile_eliminate 1.15% : 0.000003s : 13: predicate.transpose_eliminate 1.46% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.65% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.51% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.44% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.11% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.67% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.12% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.84% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 4: predicate.value_based_eliminate 0.57% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.87% : 0.000002s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000664 11 52.84% : 0.000351s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.16% : 0.000313s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.071314 192 0.01% : 0.000004s : 1: ForceFp32Comm 5.74% : 0.004095s : 1: add_attr 5.72% : 0.004079s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.08% : 0.000060s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.15% : 0.000105s : 1: auto_monad 0.03% : 0.000022s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.74% : 0.000530s : 1: bootstrap 0.06% : 0.000043s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000017s : 1: control_data_broadcast_order 0.01% : 0.000011s : 1: convert_after_rewriter 0.04% : 0.000026s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.04% : 0.000029s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000007s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.72% : 0.000514s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 29.43% : 0.020989s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000034s : 1: opt.transform.mutable_eliminate 1.83% : 0.001307s : 78: opt.transform.opt_a 0.04% : 0.000031s : 1: opt.transform.opt_after_cconv 0.04% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.15% : 0.000108s : 28: opt.transform.opt_b 0.08% : 0.000057s : 2: opt.transform.opt_trans_graph 0.05% : 0.000038s : 4: opt.transform.symbol_engine_opt 4.58% : 0.003265s : 1: opt_a 0.16% : 0.000117s : 1: opt_after_cconv 0.70% : 0.000499s : 1: opt_after_jit_grad 0.36% : 0.000258s : 1: opt_b 36.46% : 0.026003s : 1: optimize 0.04% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000049s : 1: pre_auto_parallel 0.06% : 0.000040s : 1: py_interpret_to_execute 0.03% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000020s : 1: remove_dup_value 0.64% : 0.000457s : 1: renormalize.infer 0.61% : 0.000437s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000048s : 1: rewriter_after_opt_a 0.15% : 0.000104s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.12% : 0.000084s : 1: symbol_engine_optimizer 0.12% : 0.000087s : 1: tuple_transform 10.35% : 0.007378s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:50.711.161 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:50.711.470 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0445579, [21] [bootstrap]: 0.00048342 [type_inference]: 0.0074645 [event_method]: 2.358e-05 [auto_monad]: 7.14e-05 [graph_reusing]: 5.93002e-06 [inline]: 3.43e-06 [add_attr]: 0.00408917, [1] [add_attr_with_inline]: 0.00407574, [1] [Cycle 1]: 9.46e-05, [2] [tag_attr]: 2.408e-05 [meta_addattr_fg_expand]: 6.54001e-06 [parallel-infer-symbol]: 3.47002e-06 [pre_auto_parallel]: 4.23e-05 [insert-virtual-dataset]: 2.40002e-06 [parallel-infer-symbol-second]: 6.90023e-07 [dataset_repeat_opt]: 2.49001e-06 [pipeline_split]: 1.58002e-06 [optimize]: 0.0306296, [53] [py_interpret_to_execute]: 3.653e-05 [rewriter_before_opt_a]: 0.00010622 [opt_a]: 0.027482, [2] [Cycle 1]: 0.0264145, [45] [expand_dump_flag]: 3.11999e-06 [switch_simplify]: 4.513e-05 [loop_unroll]: 3.122e-05 [a_1]: 0.00080075 [with_stream_mark]: 2.345e-05 [recompute_prepare]: 1.386e-05 [updatestate_depend_eliminate]: 4.92999e-06 [updatestate_assign_eliminate]: 3.84002e-06 [updatestate_loads_eliminate]: 3.55998e-06 [parameter_eliminate]: 2.41998e-06 [a_2]: 0.00013191 [accelerated_algorithm]: 9.49e-06 [shard]: 2.46e-06 [meta_shard_fg_expand]: 2.49001e-06 [shard_inline]: 8.48999e-06 [merge_send_recv]: 1.13e-05 [auto_parallel]: 9.61e-06 [parallel]: 1.963e-05 [flash_sp]: 1.154e-05 [merge_comm]: 4.98001e-06 [allreduce_fusion]: 4.2e-06 [matmul_add_comm_reduction]: 1.174e-05 [allreduce_slice_to_reducescatter]: 9.39996e-07 [virtual_shard_identity]: 1.221e-05 [virtual_dataset]: 8e-06 [get_grad_eliminate_]: 7.66001e-06 [virtual_output]: 8.36002e-06 [merge_forward]: 4.50999e-06 [cell_reuse_recompute_pass]: 2.02001e-06 [offload_activation]: 1.287e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.143e-05 [merge_recompute_call_nodes]: 1.86e-06 [before_grad]: 1.444e-05 [set_forward_comm_id_for_comm_node_pass]: 5.69999e-06 [meta_fg_expand]: 3.56999e-06 [flash_sp_send_recv_attached]: 4.67998e-06 [receive_attached]: 2.06e-06 [after_resolve]: 1.623e-05 [a_after_grad]: 1.29e-05 [renormalize]: 0.0244638 [add_forward_monad_depend]: 1.186e-05 [auto_monad_grad]: 3.11999e-06 [auto_monad_eliminator]: 2.628e-05 [cse]: 4.211e-05 [a_3]: 8.682e-05 [Cycle 2]: 0.00104823, [45] [expand_dump_flag]: 2.24999e-06 [switch_simplify]: 1.019e-05 [loop_unroll]: 8.35999e-06 [a_1]: 0.00019644 [with_stream_mark]: 2.139e-05 [recompute_prepare]: 8.2e-06 [updatestate_depend_eliminate]: 4.46002e-06 [updatestate_assign_eliminate]: 3.76999e-06 [updatestate_loads_eliminate]: 3.66999e-06 [parameter_eliminate]: 1.89e-06 [a_2]: 0.00012037 [accelerated_algorithm]: 8.41002e-06 [shard]: 2.48e-06 [meta_shard_fg_expand]: 2.38998e-06 [shard_inline]: 7.65e-06 [merge_send_recv]: 1.061e-05 [auto_parallel]: 1.105e-05 [parallel]: 8.90999e-06 [flash_sp]: 4.42e-06 [merge_comm]: 4.62e-06 [allreduce_fusion]: 7.23e-06 [matmul_add_comm_reduction]: 1.073e-05 [allreduce_slice_to_reducescatter]: 8.09989e-07 [virtual_shard_identity]: 9.82999e-06 [virtual_dataset]: 7.48999e-06 [get_grad_eliminate_]: 7.33999e-06 [virtual_output]: 7.4e-06 [merge_forward]: 5.77999e-06 [cell_reuse_recompute_pass]: 3.23e-06 [offload_activation]: 1.078e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.892e-05 [merge_recompute_call_nodes]: 1.61998e-06 [before_grad]: 1.38e-05 [set_forward_comm_id_for_comm_node_pass]: 5.76e-06 [meta_fg_expand]: 4.03999e-06 [flash_sp_send_recv_attached]: 1.63002e-06 [receive_attached]: 2.96001e-06 [after_resolve]: 1.665e-05 [a_after_grad]: 1.302e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.59001e-06 [auto_monad_grad]: 1.67001e-06 [auto_monad_eliminator]: 1.207e-05 [cse]: 2.3e-05 [a_3]: 6.012e-05 [py_interpret_to_execute_after_opt_a]: 2.321e-05 [slice_cell_reuse_recomputed_activation]: 4.63999e-06 [rewriter_after_opt_a]: 5.227e-05 [convert_after_rewriter]: 1.086e-05 [order_py_execute_after_rewriter]: 9.04e-06 [mutable_eliminate]: 0.00076932 [opt_b]: 0.00033486, [1] [Cycle 1]: 0.00032401, [7] [b_1]: 0.00020705 [b_2]: 9.87999e-06 [updatestate_depend_eliminate]: 8.81002e-06 [updatestate_assign_eliminate]: 3.54002e-06 [updatestate_loads_eliminate]: 3.36001e-06 [renormalize]: 6.00005e-07 [cse]: 3.285e-05 [optimize_parallel_all_gather_comm]: 2.606e-05 [overlap_param_gather]: 5.47001e-06 [cconv]: 3.769e-05 [loop_unroll]: 0.00052208 [opt_after_cconv]: 0.00015296, [1] [Cycle 1]: 0.00014224, [7] [c_1]: 4.101e-05 [parameter_eliminate]: 4.75001e-06 [updatestate_depend_eliminate]: 6.78e-06 [updatestate_assign_eliminate]: 3.32002e-06 [updatestate_loads_eliminate]: 3.01001e-06 [cse]: 2.502e-05 [renormalize]: 4.89992e-07 [remove_dup_value]: 1.966e-05 [tuple_transform]: 0.00036018, [1] [Cycle 1]: 0.00035102, [4] [d_1]: 0.00029277 [none_parameter_eliminate]: 2.94001e-06 [renormalize]: 3.29979e-07 [switch_simplify]: 1.059e-05 [partial_unused_args_eliminate]: 5.29e-06 [add_recomputation]: 9.205e-05 [cse_after_recomputation]: 4.027e-05, [1] [Cycle 1]: 3.164e-05, [1] [cse]: 2.172e-05 [environ_conv]: 1.192e-05 [swap_dp_allreduce_reducescatter]: 9.73002e-06 [bias_add_comm_swap]: 6.49999e-06 [label_micro_interleaved_index]: 8.22e-06 [label_fine_grained_interleaved_index]: 5.99999e-06 [merge_cast_opt]: 4.84e-06 [slice_recompute_activation]: 4.58001e-06 [micro_interleaved_order_control]: 4.84e-06 [assign_add_opt]: 3.61001e-06 [ForceFp32Comm]: 3.48999e-06 [remove_cast_before_assign_add]: 4.16001e-06 [full_micro_interleaved_order_control]: 4.77e-06 [reorder_send_recv_between_fp_bp]: 5.34998e-06 [comm_op_add_attrs]: 3.69002e-06 [add_comm_op_reuse_tag]: 3.45e-06 [interleave_split_concat_branches]: 3.65e-06 [interleave_parallel_branches]: 3.53e-06 [overlap_opt_shard_in_pipeline]: 3.81999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.25e-06 [control_data_broadcast_order]: 1.89e-05 [grouped_pairwise_exchange_alltoall]: 4.32e-06 [offloading_packed_experts]: 7.66001e-06 [overlap_recompute_and_grad_model_parallel]: 8.22e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.71999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.71001e-06 [overlap_recompute_comm]: 5.29998e-06 [overlap_grad_ring_attention]: 7.56999e-06 [overlap_grad_flash_sp]: 2.777e-05 [begin_end_overlap_inline]: 2.90002e-06 [split_matmul_comm_elemetwise]: 4.72e-06 [split_layernorm_comm]: 4.31002e-06 [handle_group_info]: 3.81999e-06 [symbol_engine_optimizer]: 0.00010879, [1] [Cycle 1]: 0.00010189, [6] [build]: 4.11001e-06 [elim_shapecalc]: 1.304e-05 [elim_not_effective]: 1.675e-05 [opt_reshape]: 8.52e-06 [fold_const_symbol]: 1.27e-05 [renormalize]: 2.50002e-07 [detach_backward]: 3.86001e-06 [pipeline_parallel_scheduler]: 1.89e-06 [auto_monad_reorder]: 2.441e-05 [get_jit_bprop_graph]: 2.12001e-06 [rewriter_after_jit_bprop_graph]: 6.84999e-06 [opt_after_jit_grad]: 0.00066101 [validate]: 4.986e-05 Sums bootstrap : 0.000483s : 1.27% type_inference : 0.007464s : 19.56% event_method : 0.000024s : 0.06% auto_monad : 0.000071s : 0.19% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000042s : 0.11% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000037s : 0.10% optimize.rewriter_before_opt_a : 0.000106s : 0.28% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000055s : 0.14% optimize.opt_a.loop_unroll : 0.000040s : 0.10% optimize.opt_a.a_1 : 0.000997s : 2.61% optimize.opt_a.with_stream_mark : 0.000045s : 0.12% optimize.opt_a.recompute_prepare : 0.000022s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000252s : 0.66% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.05% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.04% optimize.opt_a.merge_send_recv : 0.000022s : 0.06% optimize.opt_a.auto_parallel : 0.000021s : 0.05% optimize.opt_a.parallel : 0.000029s : 0.07% optimize.opt_a.flash_sp : 0.000016s : 0.04% optimize.opt_a.merge_comm : 0.000010s : 0.03% optimize.opt_a.allreduce_fusion : 0.000011s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.06% optimize.opt_a.virtual_dataset : 0.000015s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000016s : 0.04% optimize.opt_a.merge_forward : 0.000010s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000024s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000040s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000028s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.03% optimize.opt_a.meta_fg_expand : 0.000008s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000033s : 0.09% optimize.opt_a.a_after_grad : 0.000026s : 0.07% optimize.opt_a.renormalize : 0.024464s : 64.10% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.04% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000038s : 0.10% optimize.opt_a.cse : 0.000065s : 0.17% optimize.opt_a.a_3 : 0.000147s : 0.38% optimize.py_interpret_to_execute_after_opt_a : 0.000023s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000052s : 0.14% optimize.convert_after_rewriter : 0.000011s : 0.03% optimize.order_py_execute_after_rewriter : 0.000009s : 0.02% optimize.mutable_eliminate : 0.000769s : 2.02% optimize.opt_b.b_1 : 0.000207s : 0.54% optimize.opt_b.b_2 : 0.000010s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000033s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.07% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000038s : 0.10% optimize.loop_unroll : 0.000522s : 1.37% optimize.opt_after_cconv.c_1 : 0.000041s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000025s : 0.07% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.05% optimize.tuple_transform.d_1 : 0.000293s : 0.77% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000092s : 0.24% optimize.cse_after_recomputation.cse : 0.000022s : 0.06% optimize.environ_conv : 0.000012s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.03% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000008s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000005s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000019s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000008s : 0.02% optimize.overlap_grad_flash_sp : 0.000028s : 0.07% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000024s : 0.06% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.02% opt_after_jit_grad : 0.000661s : 1.73% validate : 0.000050s : 0.13% Time group info: ------[substitution.] 0.000261 38 12.59% : 0.000033s : 3: substitution.cast_eliminate 0.88% : 0.000002s : 3: substitution.elim_not_effective 0.76% : 0.000002s : 3: substitution.fold_const_symbol 3.38% : 0.000009s : 5: substitution.graph_param_transform 67.35% : 0.000176s : 4: substitution.inline 2.39% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.99% : 0.000008s : 6: substitution.remove_not_recompute_node 3.21% : 0.000008s : 4: substitution.replace_old_param 6.45% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007394 2 88.62% : 0.006553s : 1: type_inference.infer 11.38% : 0.000841s : 1: type_inference.specialize ------[replace.] 0.000073 8 61.46% : 0.000045s : 4: replace.inline 38.54% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000188 8 92.06% : 0.000173s : 4: match.inline 7.94% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000278 1596 1.06% : 0.000003s : 17: predicate.accumulaten_eliminater 0.76% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000002s : 10: predicate.addn_check_dump 1.02% : 0.000003s : 17: predicate.addn_zero_filter 0.84% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.23% : 0.000006s : 27: predicate.arithmetic_simplify 1.14% : 0.000003s : 17: predicate.cast_eliminate 0.58% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000002s : 10: predicate.compare_switch_simplify 0.19% : 0.000001s : 5: predicate.const_output_eliminate 0.58% : 0.000002s : 10: predicate.depend_value_elim 0.99% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.05% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.92% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.93% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.34% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.10% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 22: predicate.environ_get_depend_swap 1.83% : 0.000005s : 32: predicate.environ_get_eliminate 1.25% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.46% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.42% : 0.000007s : 25: predicate.float_depend_g_call 0.58% : 0.000002s : 10: predicate.float_environ_get_switch 0.78% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.66% : 0.000002s : 10: predicate.get_grad_eliminate 0.30% : 0.000001s : 5: predicate.graph_param_transform 0.57% : 0.000002s : 10: predicate.incorporate_call 0.49% : 0.000001s : 10: predicate.incorporate_call_switch 5.95% : 0.000017s : 72: predicate.inline 0.90% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.77% : 0.000002s : 10: predicate.less_batch_normalization 2.10% : 0.000006s : 31: predicate.list_to_tuple_eliminator_ 2.51% : 0.000007s : 48: predicate.load_eliminater 0.97% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.96% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.71% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.70% : 0.000002s : 10: predicate.merge_addn 0.62% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.57% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.90% : 0.000002s : 17: predicate.minmaximum_grad 1.28% : 0.000004s : 5: predicate.mutable_eliminate 0.28% : 0.000001s : 5: predicate.opt_reshape 0.35% : 0.000001s : 5: predicate.parallel_virtual_node 1.57% : 0.000004s : 25: predicate.partial_defer_inline 1.58% : 0.000004s : 26: predicate.partial_eliminate 0.93% : 0.000003s : 17: predicate.print_const_string_wrapper 0.62% : 0.000002s : 10: predicate.reduce_all_const_elim 1.23% : 0.000003s : 17: predicate.reduce_eliminate 2.60% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 10: predicate.remove_not_recompute_node 1.40% : 0.000004s : 31: predicate.replace_applicator 0.46% : 0.000001s : 10: predicate.replace_old_param 0.32% : 0.000001s : 5: predicate.reset_defer_inline 1.07% : 0.000003s : 17: predicate.reshape_eliminate 0.56% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 5: predicate.row_tensor_eliminate 0.79% : 0.000002s : 10: predicate.same_eliminate 0.37% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.87% : 0.000002s : 10: predicate.shard_identity_eliminate 0.81% : 0.000002s : 10: predicate.special_op_eliminate 0.67% : 0.000002s : 10: predicate.specialize_transform 0.94% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.52% : 0.000004s : 25: predicate.switch_defer_inline 1.97% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.41% : 0.000012s : 76: predicate.switch_simplify 0.99% : 0.000003s : 17: predicate.tile_eliminate 0.97% : 0.000003s : 17: predicate.transpose_eliminate 1.73% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.64% : 0.000005s : 27: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.42% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.39% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.73% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.37% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.00% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 5: predicate.value_based_eliminate 0.64% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.75% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000706 11 49.13% : 0.000347s : 5: func_graph_cloner_run.FuncGraphClonerGraph 50.87% : 0.000359s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.105728 192 0.01% : 0.000006s : 1: ForceFp32Comm 3.88% : 0.004101s : 1: add_attr 3.86% : 0.004080s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.09% : 0.000096s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.08% : 0.000082s : 1: auto_monad 0.03% : 0.000031s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000010s : 1: bias_add_comm_swap 0.50% : 0.000531s : 1: bootstrap 0.04% : 0.000041s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.02% : 0.000022s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.04% : 0.000043s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000021s : 1: detach_backward 0.01% : 0.000015s : 1: environ_conv 0.03% : 0.000034s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 0.50% : 0.000529s : 1: loop_unroll 0.01% : 0.000008s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 0.74% : 0.000778s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000023s : 1: opt.transform.mutable_eliminate 1.47% : 0.001553s : 78: opt.transform.opt_a 0.04% : 0.000039s : 1: opt.transform.opt_after_cconv 0.03% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.13% : 0.000139s : 28: opt.transform.opt_b 0.28% : 0.000300s : 2: opt.transform.opt_trans_graph 0.04% : 0.000047s : 4: opt.transform.symbol_engine_opt 26.00% : 0.027486s : 1: opt_a 0.15% : 0.000156s : 1: opt_after_cconv 0.64% : 0.000672s : 1: opt_after_jit_grad 0.32% : 0.000339s : 1: opt_b 29.64% : 0.031338s : 1: optimize 0.03% : 0.000030s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.03% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000009s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.05% : 0.000050s : 1: pre_auto_parallel 0.04% : 0.000041s : 1: py_interpret_to_execute 0.03% : 0.000027s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.02% : 0.000023s : 1: remove_dup_value 22.63% : 0.023921s : 1: renormalize.infer 0.50% : 0.000524s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000056s : 1: rewriter_after_opt_a 0.10% : 0.000110s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.11% : 0.000112s : 1: symbol_engine_optimizer 0.34% : 0.000364s : 1: tuple_transform 7.11% : 0.007519s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:51.171.354 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0389116, [21] [bootstrap]: 0.00042576 [type_inference]: 0.0274246 [event_method]: 2.086e-05 [auto_monad]: 6.795e-05 [graph_reusing]: 5.84999e-06 [inline]: 2.63e-06 [add_attr]: 0.00359522, [1] [add_attr_with_inline]: 0.003584, [1] [Cycle 1]: 7.074e-05, [2] [tag_attr]: 2.222e-05 [meta_addattr_fg_expand]: 5.99e-06 [parallel-infer-symbol]: 3.3e-06 [pre_auto_parallel]: 3.994e-05 [insert-virtual-dataset]: 2.89999e-06 [parallel-infer-symbol-second]: 9.5999e-07 [dataset_repeat_opt]: 2.07001e-06 [pipeline_split]: 1.94999e-06 [optimize]: 0.00640142, [53] [py_interpret_to_execute]: 2.95e-05 [rewriter_before_opt_a]: 9.495e-05 [opt_a]: 0.00359503, [2] [Cycle 1]: 0.00270312, [45] [expand_dump_flag]: 2.91e-06 [switch_simplify]: 4.539e-05 [loop_unroll]: 3.141e-05 [a_1]: 0.00077834 [with_stream_mark]: 1.791e-05 [recompute_prepare]: 1.302e-05 [updatestate_depend_eliminate]: 4.68001e-06 [updatestate_assign_eliminate]: 4.03001e-06 [updatestate_loads_eliminate]: 3.95e-06 [parameter_eliminate]: 2.27001e-06 [a_2]: 0.00010383 [accelerated_algorithm]: 8.80999e-06 [shard]: 1.79998e-06 [meta_shard_fg_expand]: 2.05002e-06 [shard_inline]: 7.63001e-06 [merge_send_recv]: 9.84001e-06 [auto_parallel]: 9.20999e-06 [parallel]: 2.011e-05 [flash_sp]: 9.43002e-06 [merge_comm]: 4.89e-06 [allreduce_fusion]: 4.68999e-06 [matmul_add_comm_reduction]: 1.044e-05 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 9.90002e-06 [virtual_dataset]: 7.95998e-06 [get_grad_eliminate_]: 7.90998e-06 [virtual_output]: 7.82e-06 [merge_forward]: 5.15999e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 1.176e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.584e-05 [merge_recompute_call_nodes]: 1.40001e-06 [before_grad]: 1.37e-05 [set_forward_comm_id_for_comm_node_pass]: 4.75999e-06 [meta_fg_expand]: 3.37002e-06 [flash_sp_send_recv_attached]: 2.60002e-06 [receive_attached]: 2.80002e-06 [after_resolve]: 1.34e-05 [a_after_grad]: 1.198e-05 [renormalize]: 0.00106221 [add_forward_monad_depend]: 8.97999e-06 [auto_monad_grad]: 2.51998e-06 [auto_monad_eliminator]: 2.284e-05 [cse]: 3.946e-05 [a_3]: 7.435e-05 [Cycle 2]: 0.00087819, [45] [expand_dump_flag]: 2.48002e-06 [switch_simplify]: 1.121e-05 [loop_unroll]: 8.23001e-06 [a_1]: 0.00019817 [with_stream_mark]: 2.047e-05 [recompute_prepare]: 8.75999e-06 [updatestate_depend_eliminate]: 5.22999e-06 [updatestate_assign_eliminate]: 3.75e-06 [updatestate_loads_eliminate]: 2.94001e-06 [parameter_eliminate]: 1.56998e-06 [a_2]: 9.449e-05 [accelerated_algorithm]: 8.94e-06 [shard]: 2.66e-06 [meta_shard_fg_expand]: 2.44001e-06 [shard_inline]: 8e-06 [merge_send_recv]: 9.41e-06 [auto_parallel]: 9.71e-06 [parallel]: 7.48e-06 [flash_sp]: 8.77e-06 [merge_comm]: 4.3e-06 [allreduce_fusion]: 4.05e-06 [matmul_add_comm_reduction]: 9.31e-06 [allreduce_slice_to_reducescatter]: 8.10018e-07 [virtual_shard_identity]: 9.93002e-06 [virtual_dataset]: 7.36999e-06 [get_grad_eliminate_]: 7.5e-06 [virtual_output]: 7.26999e-06 [merge_forward]: 4.53001e-06 [cell_reuse_recompute_pass]: 3.68999e-06 [offload_activation]: 1.102e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.39e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.335e-05 [set_forward_comm_id_for_comm_node_pass]: 4.58001e-06 [meta_fg_expand]: 3.58e-06 [flash_sp_send_recv_attached]: 1.82001e-06 [receive_attached]: 2.81e-06 [after_resolve]: 1.478e-05 [a_after_grad]: 1.193e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.38002e-06 [auto_monad_grad]: 1.91e-06 [auto_monad_eliminator]: 1.439e-05 [cse]: 2.627e-05 [a_3]: 4.729e-05 [py_interpret_to_execute_after_opt_a]: 1.893e-05 [slice_cell_reuse_recomputed_activation]: 1.96e-06 [rewriter_after_opt_a]: 4.915e-05 [convert_after_rewriter]: 7.5e-06 [order_py_execute_after_rewriter]: 5.74999e-06 [mutable_eliminate]: 0.00080011 [opt_b]: 0.00028557, [1] [Cycle 1]: 0.00027645, [7] [b_1]: 0.00016671 [b_2]: 1.096e-05 [updatestate_depend_eliminate]: 1.167e-05 [updatestate_assign_eliminate]: 3.53999e-06 [updatestate_loads_eliminate]: 4.09997e-06 [renormalize]: 9.50007e-07 [cse]: 3.898e-05 [optimize_parallel_all_gather_comm]: 2.385e-05 [overlap_param_gather]: 2.39999e-06 [cconv]: 3.869e-05 [loop_unroll]: 0.00061093 [opt_after_cconv]: 0.0001394, [1] [Cycle 1]: 0.00013148, [7] [c_1]: 4.016e-05 [parameter_eliminate]: 6.29999e-06 [updatestate_depend_eliminate]: 8.28001e-06 [updatestate_assign_eliminate]: 3.51999e-06 [updatestate_loads_eliminate]: 3.28e-06 [cse]: 3.284e-05 [renormalize]: 6.30011e-07 [remove_dup_value]: 1.738e-05 [tuple_transform]: 0.00011829, [1] [Cycle 1]: 0.00011376, [4] [d_1]: 6.33e-05 [none_parameter_eliminate]: 1.90001e-06 [renormalize]: 3.19997e-07 [switch_simplify]: 9.43002e-06 [partial_unused_args_eliminate]: 2.31e-06 [add_recomputation]: 6.844e-05 [cse_after_recomputation]: 2.906e-05, [1] [Cycle 1]: 2.387e-05, [1] [cse]: 1.809e-05 [environ_conv]: 7.37002e-06 [swap_dp_allreduce_reducescatter]: 6.38998e-06 [bias_add_comm_swap]: 3.35e-06 [label_micro_interleaved_index]: 5.91998e-06 [label_fine_grained_interleaved_index]: 2.98e-06 [merge_cast_opt]: 1.81e-06 [slice_recompute_activation]: 2.39999e-06 [micro_interleaved_order_control]: 3.33e-06 [assign_add_opt]: 1.45999e-06 [ForceFp32Comm]: 1.25001e-06 [remove_cast_before_assign_add]: 1.30999e-06 [full_micro_interleaved_order_control]: 2.16998e-06 [reorder_send_recv_between_fp_bp]: 3.01001e-06 [comm_op_add_attrs]: 1.29003e-06 [add_comm_op_reuse_tag]: 1.35999e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.05999e-06 [overlap_opt_shard_in_pipeline]: 1.27999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.12999e-06 [control_data_broadcast_order]: 1.701e-05 [grouped_pairwise_exchange_alltoall]: 1.92999e-06 [offloading_packed_experts]: 4.58999e-06 [overlap_recompute_and_grad_model_parallel]: 5.80002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.24998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.44e-06 [overlap_recompute_comm]: 3.08e-06 [overlap_grad_ring_attention]: 5.15001e-06 [overlap_grad_flash_sp]: 2.663e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 2.37001e-06 [split_layernorm_comm]: 1.69e-06 [handle_group_info]: 1.72001e-06 [symbol_engine_optimizer]: 9.25e-05, [1] [Cycle 1]: 8.726e-05, [6] [build]: 4.83001e-06 [elim_shapecalc]: 1.38e-05 [elim_not_effective]: 1.59e-05 [opt_reshape]: 8.97999e-06 [fold_const_symbol]: 1.304e-05 [renormalize]: 6.99976e-07 [detach_backward]: 2.39001e-06 [pipeline_parallel_scheduler]: 1.59e-06 [auto_monad_reorder]: 2.19e-05 [get_jit_bprop_graph]: 2.07001e-06 [rewriter_after_jit_bprop_graph]: 6.74001e-06 [opt_after_jit_grad]: 0.00065367 [validate]: 5.788e-05 Sums bootstrap : 0.000426s : 1.24% type_inference : 0.027425s : 80.10% event_method : 0.000021s : 0.06% auto_monad : 0.000068s : 0.20% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000040s : 0.12% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.09% optimize.rewriter_before_opt_a : 0.000095s : 0.28% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000057s : 0.17% optimize.opt_a.loop_unroll : 0.000040s : 0.12% optimize.opt_a.a_1 : 0.000977s : 2.85% optimize.opt_a.with_stream_mark : 0.000038s : 0.11% optimize.opt_a.recompute_prepare : 0.000022s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000198s : 0.58% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.05% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.05% optimize.opt_a.merge_send_recv : 0.000019s : 0.06% optimize.opt_a.auto_parallel : 0.000019s : 0.06% optimize.opt_a.parallel : 0.000028s : 0.08% optimize.opt_a.flash_sp : 0.000018s : 0.05% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.06% optimize.opt_a.virtual_dataset : 0.000015s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.05% optimize.opt_a.virtual_output : 0.000015s : 0.04% optimize.opt_a.merge_forward : 0.000010s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000040s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000006s : 0.02% optimize.opt_a.after_resolve : 0.000028s : 0.08% optimize.opt_a.a_after_grad : 0.000024s : 0.07% optimize.opt_a.renormalize : 0.001062s : 3.10% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000037s : 0.11% optimize.opt_a.cse : 0.000066s : 0.19% optimize.opt_a.a_3 : 0.000122s : 0.36% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000049s : 0.14% optimize.convert_after_rewriter : 0.000007s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000800s : 2.34% optimize.opt_b.b_1 : 0.000167s : 0.49% optimize.opt_b.b_2 : 0.000011s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000039s : 0.11% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000039s : 0.11% optimize.loop_unroll : 0.000611s : 1.78% optimize.opt_after_cconv.c_1 : 0.000040s : 0.12% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000033s : 0.10% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.05% optimize.tuple_transform.d_1 : 0.000063s : 0.18% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000068s : 0.20% optimize.cse_after_recomputation.cse : 0.000018s : 0.05% optimize.environ_conv : 0.000007s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000006s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000027s : 0.08% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000002s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.06% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.02% opt_after_jit_grad : 0.000654s : 1.91% validate : 0.000058s : 0.17% Time group info: ------[substitution.] 0.000245 38 13.20% : 0.000032s : 3: substitution.cast_eliminate 0.91% : 0.000002s : 3: substitution.elim_not_effective 0.81% : 0.000002s : 3: substitution.fold_const_symbol 3.18% : 0.000008s : 5: substitution.graph_param_transform 66.19% : 0.000162s : 4: substitution.inline 2.47% : 0.000006s : 6: substitution.j_node_and_user_rematch 4.29% : 0.000010s : 6: substitution.remove_not_recompute_node 2.57% : 0.000006s : 4: substitution.replace_old_param 6.38% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.027356 2 97.18% : 0.026584s : 1: type_inference.infer 2.82% : 0.000771s : 1: type_inference.specialize ------[replace.] 0.000068 8 61.78% : 0.000042s : 4: replace.inline 38.22% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000173 8 92.25% : 0.000159s : 4: match.inline 7.75% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000279 1596 0.99% : 0.000003s : 17: predicate.accumulaten_eliminater 0.83% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 10: predicate.addn_check_dump 0.94% : 0.000003s : 17: predicate.addn_zero_filter 0.87% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.11% : 0.000006s : 27: predicate.arithmetic_simplify 1.19% : 0.000003s : 17: predicate.cast_eliminate 0.58% : 0.000002s : 10: predicate.check_bprop_eliminate 0.63% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.62% : 0.000002s : 10: predicate.depend_value_elim 0.96% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.02% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.96% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.93% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 5: predicate.elim_not_effective 0.33% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.57% : 0.000004s : 22: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.20% : 0.000003s : 22: predicate.environ_get_depend_swap 1.62% : 0.000005s : 32: predicate.environ_get_eliminate 1.10% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.31% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.21% : 0.000006s : 25: predicate.float_depend_g_call 0.52% : 0.000001s : 10: predicate.float_environ_get_switch 0.90% : 0.000003s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.70% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000002s : 10: predicate.incorporate_call 0.51% : 0.000001s : 10: predicate.incorporate_call_switch 6.04% : 0.000017s : 72: predicate.inline 0.78% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 10: predicate.less_batch_normalization 1.82% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.52% : 0.000007s : 48: predicate.load_eliminater 1.10% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.91% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.67% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.54% : 0.000002s : 10: predicate.merge_addn 0.68% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.60% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 17: predicate.minmaximum_grad 1.64% : 0.000005s : 5: predicate.mutable_eliminate 0.38% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.63% : 0.000005s : 25: predicate.partial_defer_inline 1.59% : 0.000004s : 26: predicate.partial_eliminate 0.95% : 0.000003s : 17: predicate.print_const_string_wrapper 0.58% : 0.000002s : 10: predicate.reduce_all_const_elim 1.27% : 0.000004s : 17: predicate.reduce_eliminate 2.54% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 10: predicate.remove_not_recompute_node 1.26% : 0.000004s : 31: predicate.replace_applicator 0.54% : 0.000002s : 10: predicate.replace_old_param 0.21% : 0.000001s : 5: predicate.reset_defer_inline 0.99% : 0.000003s : 17: predicate.reshape_eliminate 0.64% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 5: predicate.row_tensor_eliminate 0.88% : 0.000002s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.88% : 0.000002s : 10: predicate.shard_identity_eliminate 0.76% : 0.000002s : 10: predicate.special_op_eliminate 0.67% : 0.000002s : 10: predicate.specialize_transform 1.23% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.73% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.43% : 0.000004s : 25: predicate.switch_defer_inline 1.97% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.81% : 0.000013s : 76: predicate.switch_simplify 0.98% : 0.000003s : 17: predicate.tile_eliminate 1.03% : 0.000003s : 17: predicate.transpose_eliminate 1.61% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.54% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.46% : 0.000010s : 41: predicate.tuple_list_get_item_eliminator 1.52% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.37% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.72% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.29% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 2.97% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 5: predicate.value_based_eliminate 0.62% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.62% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000705 11 57.64% : 0.000406s : 5: func_graph_cloner_run.FuncGraphClonerGraph 42.36% : 0.000299s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.051711 192 0.01% : 0.000004s : 1: ForceFp32Comm 6.96% : 0.003602s : 1: add_attr 6.94% : 0.003588s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.14% : 0.000073s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.14% : 0.000074s : 1: auto_monad 0.05% : 0.000026s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.88% : 0.000454s : 1: bootstrap 0.08% : 0.000042s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000020s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.06% : 0.000032s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.05% : 0.000028s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000009s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000009s : 1: label_micro_interleaved_index 1.20% : 0.000622s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 1.58% : 0.000815s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.04% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000029s : 1: opt.transform.mutable_eliminate 2.95% : 0.001527s : 78: opt.transform.opt_a 0.07% : 0.000038s : 1: opt.transform.opt_after_cconv 0.07% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.27% : 0.000140s : 28: opt.transform.opt_b 0.14% : 0.000070s : 2: opt.transform.opt_trans_graph 0.09% : 0.000048s : 4: opt.transform.symbol_engine_opt 6.96% : 0.003599s : 1: opt_a 0.28% : 0.000143s : 1: opt_after_cconv 1.29% : 0.000669s : 1: opt_after_jit_grad 0.56% : 0.000290s : 1: opt_b 12.39% : 0.006407s : 1: optimize 0.05% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.06% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.09% : 0.000044s : 1: pre_auto_parallel 0.07% : 0.000034s : 1: py_interpret_to_execute 0.04% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000005s : 1: remove_cast_before_assign_add 0.04% : 0.000021s : 1: remove_dup_value 1.17% : 0.000608s : 1: renormalize.infer 0.86% : 0.000443s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000054s : 1: rewriter_after_opt_a 0.19% : 0.000100s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.18% : 0.000095s : 1: symbol_engine_optimizer 0.24% : 0.000122s : 1: tuple_transform 53.08% : 0.027447s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:51.643.599 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:51.643.908 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0338521, [21] [bootstrap]: 0.00048922 [type_inference]: 0.0201306 [event_method]: 2.561e-05 [auto_monad]: 7.201e-05 [graph_reusing]: 6.49999e-06 [inline]: 3.2e-06 [add_attr]: 0.00416823, [1] [add_attr_with_inline]: 0.00415495, [1] [Cycle 1]: 0.0001049, [2] [tag_attr]: 2.651e-05 [meta_addattr_fg_expand]: 6.78998e-06 [parallel-infer-symbol]: 3.68999e-06 [pre_auto_parallel]: 4.326e-05 [insert-virtual-dataset]: 2.79999e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 2.02999e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.00743506, [53] [py_interpret_to_execute]: 4.008e-05 [rewriter_before_opt_a]: 0.00010655 [opt_a]: 0.00438327, [2] [Cycle 1]: 0.00316973, [45] [expand_dump_flag]: 3.38e-06 [switch_simplify]: 4.409e-05 [loop_unroll]: 3.194e-05 [a_1]: 0.00085222 [with_stream_mark]: 2.557e-05 [recompute_prepare]: 1.57e-05 [updatestate_depend_eliminate]: 5.54e-06 [updatestate_assign_eliminate]: 4.13999e-06 [updatestate_loads_eliminate]: 4.32e-06 [parameter_eliminate]: 2.61e-06 [a_2]: 0.00013801 [accelerated_algorithm]: 1.136e-05 [shard]: 2.96999e-06 [meta_shard_fg_expand]: 3.78999e-06 [shard_inline]: 9.22001e-06 [merge_send_recv]: 1.069e-05 [auto_parallel]: 1.236e-05 [parallel]: 2.121e-05 [flash_sp]: 1.304e-05 [merge_comm]: 5.84e-06 [allreduce_fusion]: 4.70001e-06 [matmul_add_comm_reduction]: 1.134e-05 [allreduce_slice_to_reducescatter]: 8.39995e-07 [virtual_shard_identity]: 1.358e-05 [virtual_dataset]: 8.43999e-06 [get_grad_eliminate_]: 7.68001e-06 [virtual_output]: 8.42998e-06 [merge_forward]: 5.69e-06 [cell_reuse_recompute_pass]: 1.87001e-06 [offload_activation]: 1.296e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.559e-05 [merge_recompute_call_nodes]: 1.72001e-06 [before_grad]: 1.382e-05 [set_forward_comm_id_for_comm_node_pass]: 5.49e-06 [meta_fg_expand]: 4.85999e-06 [flash_sp_send_recv_attached]: 3.45e-06 [receive_attached]: 2.72001e-06 [after_resolve]: 1.438e-05 [a_after_grad]: 1.364e-05 [renormalize]: 0.00112677 [add_forward_monad_depend]: 1.014e-05 [auto_monad_grad]: 3.09999e-06 [auto_monad_eliminator]: 2.729e-05 [cse]: 4.518e-05 [a_3]: 8.964e-05 [Cycle 2]: 0.00117632, [45] [expand_dump_flag]: 2.41e-06 [switch_simplify]: 1.196e-05 [loop_unroll]: 9.39e-06 [a_1]: 0.00020742 [with_stream_mark]: 2.343e-05 [recompute_prepare]: 1.099e-05 [updatestate_depend_eliminate]: 5.19e-06 [updatestate_assign_eliminate]: 4.22e-06 [updatestate_loads_eliminate]: 3.83001e-06 [parameter_eliminate]: 2.91e-06 [a_2]: 0.00013507 [accelerated_algorithm]: 1.034e-05 [shard]: 2.88e-06 [meta_shard_fg_expand]: 2.31e-06 [shard_inline]: 8.93002e-06 [merge_send_recv]: 1.151e-05 [auto_parallel]: 1.169e-05 [parallel]: 9.21002e-06 [flash_sp]: 4.28001e-06 [merge_comm]: 5.47001e-06 [allreduce_fusion]: 5.47999e-06 [matmul_add_comm_reduction]: 1.031e-05 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 1.284e-05 [virtual_dataset]: 7.79002e-06 [get_grad_eliminate_]: 8.97999e-06 [virtual_output]: 8.18999e-06 [merge_forward]: 6.06e-06 [cell_reuse_recompute_pass]: 4.07998e-06 [offload_activation]: 1.295e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.364e-05 [merge_recompute_call_nodes]: 1.60001e-06 [before_grad]: 1.606e-05 [set_forward_comm_id_for_comm_node_pass]: 6.50997e-06 [meta_fg_expand]: 3.43e-06 [flash_sp_send_recv_attached]: 1.82001e-06 [receive_attached]: 2.06998e-06 [after_resolve]: 1.686e-05 [a_after_grad]: 1.262e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 4.71002e-06 [auto_monad_grad]: 2.57001e-06 [auto_monad_eliminator]: 1.898e-05 [cse]: 3.412e-05 [a_3]: 6.629e-05 [py_interpret_to_execute_after_opt_a]: 3.014e-05 [slice_cell_reuse_recomputed_activation]: 6.23998e-06 [rewriter_after_opt_a]: 5.986e-05 [convert_after_rewriter]: 1.224e-05 [order_py_execute_after_rewriter]: 9.15001e-06 [mutable_eliminate]: 0.00081144 [opt_b]: 0.00035711, [1] [Cycle 1]: 0.00034322, [7] [b_1]: 0.00020835 [b_2]: 1.104e-05 [updatestate_depend_eliminate]: 1.187e-05 [updatestate_assign_eliminate]: 3.82002e-06 [updatestate_loads_eliminate]: 3.91999e-06 [renormalize]: 5.50004e-07 [cse]: 4.126e-05 [optimize_parallel_all_gather_comm]: 3.007e-05 [overlap_param_gather]: 5.77001e-06 [cconv]: 4.452e-05 [loop_unroll]: 0.00055031 [opt_after_cconv]: 0.0001612, [1] [Cycle 1]: 0.00015006, [7] [c_1]: 3.972e-05 [parameter_eliminate]: 5.47001e-06 [updatestate_depend_eliminate]: 8.72e-06 [updatestate_assign_eliminate]: 3.31999e-06 [updatestate_loads_eliminate]: 3.66001e-06 [cse]: 2.993e-05 [renormalize]: 6.09987e-07 [remove_dup_value]: 2.131e-05 [tuple_transform]: 0.00011424, [1] [Cycle 1]: 0.00010572, [4] [d_1]: 5.92e-05 [none_parameter_eliminate]: 2.18998e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 9.41998e-06 [partial_unused_args_eliminate]: 5.35001e-06 [add_recomputation]: 7.124e-05 [cse_after_recomputation]: 3.752e-05, [1] [Cycle 1]: 2.912e-05, [1] [cse]: 1.834e-05 [environ_conv]: 1.017e-05 [swap_dp_allreduce_reducescatter]: 9.31e-06 [bias_add_comm_swap]: 6.41998e-06 [label_micro_interleaved_index]: 8.03001e-06 [label_fine_grained_interleaved_index]: 5.37001e-06 [merge_cast_opt]: 4.4e-06 [slice_recompute_activation]: 4.92e-06 [micro_interleaved_order_control]: 5.14e-06 [assign_add_opt]: 3.70003e-06 [ForceFp32Comm]: 3.88001e-06 [remove_cast_before_assign_add]: 3.58999e-06 [full_micro_interleaved_order_control]: 5.58002e-06 [reorder_send_recv_between_fp_bp]: 5.74999e-06 [comm_op_add_attrs]: 3.8e-06 [add_comm_op_reuse_tag]: 4.05e-06 [interleave_split_concat_branches]: 3.88001e-06 [interleave_parallel_branches]: 3.76999e-06 [overlap_opt_shard_in_pipeline]: 3.93001e-06 [overlap_opt_shard_grad_in_pipeline]: 5.15999e-06 [control_data_broadcast_order]: 2.159e-05 [grouped_pairwise_exchange_alltoall]: 4.30999e-06 [offloading_packed_experts]: 8.3e-06 [overlap_recompute_and_grad_model_parallel]: 8.90999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.73001e-06 [overlap_recompute_allgather_and_fa_grad]: 4.37e-06 [overlap_recompute_comm]: 4.92999e-06 [overlap_grad_ring_attention]: 7.21999e-06 [overlap_grad_flash_sp]: 2.955e-05 [begin_end_overlap_inline]: 3.13e-06 [split_matmul_comm_elemetwise]: 4.72e-06 [split_layernorm_comm]: 4.33999e-06 [handle_group_info]: 3.61999e-06 [symbol_engine_optimizer]: 0.00012497, [1] [Cycle 1]: 0.00011618, [6] [build]: 5.12e-06 [elim_shapecalc]: 1.73e-05 [elim_not_effective]: 1.919e-05 [opt_reshape]: 9.04e-06 [fold_const_symbol]: 1.349e-05 [renormalize]: 2.50002e-07 [detach_backward]: 5.25001e-06 [pipeline_parallel_scheduler]: 2.11e-06 [auto_monad_reorder]: 2.858e-05 [get_jit_bprop_graph]: 1.92999e-06 [rewriter_after_jit_bprop_graph]: 7.64002e-06 [opt_after_jit_grad]: 0.00066784 [validate]: 5.658e-05 Sums bootstrap : 0.000489s : 1.77% type_inference : 0.020131s : 73.00% event_method : 0.000026s : 0.09% auto_monad : 0.000072s : 0.26% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.10% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000043s : 0.16% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000040s : 0.15% optimize.rewriter_before_opt_a : 0.000107s : 0.39% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000056s : 0.20% optimize.opt_a.loop_unroll : 0.000041s : 0.15% optimize.opt_a.a_1 : 0.001060s : 3.84% optimize.opt_a.with_stream_mark : 0.000049s : 0.18% optimize.opt_a.recompute_prepare : 0.000027s : 0.10% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.03% optimize.opt_a.parameter_eliminate : 0.000006s : 0.02% optimize.opt_a.a_2 : 0.000273s : 0.99% optimize.opt_a.accelerated_algorithm : 0.000022s : 0.08% optimize.opt_a.shard : 0.000006s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.02% optimize.opt_a.shard_inline : 0.000018s : 0.07% optimize.opt_a.merge_send_recv : 0.000022s : 0.08% optimize.opt_a.auto_parallel : 0.000024s : 0.09% optimize.opt_a.parallel : 0.000030s : 0.11% optimize.opt_a.flash_sp : 0.000017s : 0.06% optimize.opt_a.merge_comm : 0.000011s : 0.04% optimize.opt_a.allreduce_fusion : 0.000010s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000026s : 0.10% optimize.opt_a.virtual_dataset : 0.000016s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.06% optimize.opt_a.virtual_output : 0.000017s : 0.06% optimize.opt_a.merge_forward : 0.000012s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.02% optimize.opt_a.offload_activation : 0.000026s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000049s : 0.18% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000030s : 0.11% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.04% optimize.opt_a.meta_fg_expand : 0.000008s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000031s : 0.11% optimize.opt_a.a_after_grad : 0.000026s : 0.10% optimize.opt_a.renormalize : 0.001127s : 4.09% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.05% optimize.opt_a.auto_monad_grad : 0.000006s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000046s : 0.17% optimize.opt_a.cse : 0.000079s : 0.29% optimize.opt_a.a_3 : 0.000156s : 0.57% optimize.py_interpret_to_execute_after_opt_a : 0.000030s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.02% optimize.rewriter_after_opt_a : 0.000060s : 0.22% optimize.convert_after_rewriter : 0.000012s : 0.04% optimize.order_py_execute_after_rewriter : 0.000009s : 0.03% optimize.mutable_eliminate : 0.000811s : 2.94% optimize.opt_b.b_1 : 0.000208s : 0.76% optimize.opt_b.b_2 : 0.000011s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000041s : 0.15% optimize.optimize_parallel_all_gather_comm : 0.000030s : 0.11% optimize.overlap_param_gather : 0.000006s : 0.02% optimize.cconv : 0.000045s : 0.16% optimize.loop_unroll : 0.000550s : 2.00% optimize.opt_after_cconv.c_1 : 0.000040s : 0.14% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000030s : 0.11% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000021s : 0.08% optimize.tuple_transform.d_1 : 0.000059s : 0.21% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000071s : 0.26% optimize.cse_after_recomputation.cse : 0.000018s : 0.07% optimize.environ_conv : 0.000010s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000008s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000006s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.02% optimize.control_data_broadcast_order : 0.000022s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000008s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.03% optimize.overlap_grad_flash_sp : 0.000030s : 0.11% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000029s : 0.10% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000008s : 0.03% opt_after_jit_grad : 0.000668s : 2.42% validate : 0.000057s : 0.21% Time group info: ------[substitution.] 0.000270 38 12.30% : 0.000033s : 3: substitution.cast_eliminate 0.78% : 0.000002s : 3: substitution.elim_not_effective 0.65% : 0.000002s : 3: substitution.fold_const_symbol 2.64% : 0.000007s : 5: substitution.graph_param_transform 68.26% : 0.000185s : 4: substitution.inline 2.15% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.82% : 0.000008s : 6: substitution.remove_not_recompute_node 3.45% : 0.000009s : 4: substitution.replace_old_param 6.96% : 0.000019s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.020055 2 95.04% : 0.019061s : 1: type_inference.infer 4.96% : 0.000994s : 1: type_inference.specialize ------[replace.] 0.000075 8 57.71% : 0.000043s : 4: replace.inline 42.29% : 0.000032s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000198 8 91.77% : 0.000182s : 4: match.inline 8.23% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000292 1596 0.98% : 0.000003s : 17: predicate.accumulaten_eliminater 0.89% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000002s : 10: predicate.addn_check_dump 1.14% : 0.000003s : 17: predicate.addn_zero_filter 0.85% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.11% : 0.000006s : 27: predicate.arithmetic_simplify 1.18% : 0.000003s : 17: predicate.cast_eliminate 0.71% : 0.000002s : 10: predicate.check_bprop_eliminate 0.69% : 0.000002s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.71% : 0.000002s : 10: predicate.depend_value_elim 0.98% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 17: predicate.dict_get_item_eliminator 1.01% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.85% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.41% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.16% : 0.000003s : 22: predicate.environ_get_depend_swap 1.78% : 0.000005s : 32: predicate.environ_get_eliminate 1.25% : 0.000004s : 22: predicate.environ_get_set_eliminate 1.29% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.20% : 0.000006s : 25: predicate.float_depend_g_call 0.52% : 0.000002s : 10: predicate.float_environ_get_switch 0.83% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 5: predicate.fold_const_symbol 0.81% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.54% : 0.000002s : 10: predicate.incorporate_call 0.44% : 0.000001s : 10: predicate.incorporate_call_switch 6.36% : 0.000019s : 72: predicate.inline 0.75% : 0.000002s : 10: predicate.inline_without_move 0.28% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.07% : 0.000003s : 10: predicate.less_batch_normalization 1.77% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.30% : 0.000007s : 48: predicate.load_eliminater 1.10% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.01% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.68% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.79% : 0.000002s : 10: predicate.merge_addn 0.52% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000003s : 17: predicate.minmaximum_grad 1.48% : 0.000004s : 5: predicate.mutable_eliminate 0.43% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.82% : 0.000005s : 25: predicate.partial_defer_inline 1.55% : 0.000005s : 26: predicate.partial_eliminate 0.87% : 0.000003s : 17: predicate.print_const_string_wrapper 0.71% : 0.000002s : 10: predicate.reduce_all_const_elim 1.20% : 0.000003s : 17: predicate.reduce_eliminate 2.56% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 10: predicate.remove_not_recompute_node 1.43% : 0.000004s : 31: predicate.replace_applicator 0.43% : 0.000001s : 10: predicate.replace_old_param 0.28% : 0.000001s : 5: predicate.reset_defer_inline 1.16% : 0.000003s : 17: predicate.reshape_eliminate 0.65% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 5: predicate.row_tensor_eliminate 0.79% : 0.000002s : 10: predicate.same_eliminate 0.35% : 0.000001s : 10: predicate.set_cell_output_no_recompute 1.02% : 0.000003s : 10: predicate.shard_identity_eliminate 0.65% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 1.05% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.39% : 0.000004s : 25: predicate.switch_defer_inline 1.93% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.65% : 0.000014s : 76: predicate.switch_simplify 0.90% : 0.000003s : 17: predicate.tile_eliminate 0.95% : 0.000003s : 17: predicate.transpose_eliminate 1.49% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.37% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.22% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.52% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.19% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.61% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.38% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 2.97% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.46% : 0.000001s : 5: predicate.value_based_eliminate 0.62% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.73% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000795 11 48.06% : 0.000382s : 5: func_graph_cloner_run.FuncGraphClonerGraph 51.94% : 0.000413s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.048472 192 0.01% : 0.000007s : 1: ForceFp32Comm 8.62% : 0.004181s : 1: add_attr 8.58% : 0.004160s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.16% : 0.000076s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.17% : 0.000085s : 1: auto_monad 0.08% : 0.000038s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000009s : 1: bias_add_comm_swap 1.12% : 0.000542s : 1: bootstrap 0.10% : 0.000048s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.05% : 0.000025s : 1: control_data_broadcast_order 0.03% : 0.000016s : 1: convert_after_rewriter 0.08% : 0.000041s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000030s : 1: detach_backward 0.03% : 0.000013s : 1: environ_conv 0.07% : 0.000036s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.03% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.02% : 0.000009s : 1: inline 0.02% : 0.000010s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000011s : 1: label_micro_interleaved_index 1.15% : 0.000558s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 1.70% : 0.000822s : 1: mutable_eliminate 0.02% : 0.000011s : 1: offloading_packed_experts 0.04% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000029s : 1: opt.transform.mutable_eliminate 3.41% : 0.001655s : 78: opt.transform.opt_a 0.08% : 0.000038s : 1: opt.transform.opt_after_cconv 0.13% : 0.000063s : 1: opt.transform.opt_after_jit_grad 0.29% : 0.000141s : 28: opt.transform.opt_b 0.14% : 0.000066s : 2: opt.transform.opt_trans_graph 0.11% : 0.000054s : 4: opt.transform.symbol_engine_opt 9.06% : 0.004390s : 1: opt_a 0.34% : 0.000165s : 1: opt_after_cconv 1.41% : 0.000681s : 1: opt_after_jit_grad 0.75% : 0.000362s : 1: opt_b 16.15% : 0.007826s : 1: optimize 0.07% : 0.000034s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000012s : 1: order_py_execute_after_rewriter 0.07% : 0.000033s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000011s : 1: overlap_grad_ring_attention 0.02% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000008s : 1: pipeline_split 0.11% : 0.000051s : 1: pre_auto_parallel 0.09% : 0.000044s : 1: py_interpret_to_execute 0.07% : 0.000035s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.05% : 0.000025s : 1: remove_dup_value 1.28% : 0.000622s : 1: renormalize.infer 1.01% : 0.000490s : 1: renormalize.specialize 0.02% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.13% : 0.000064s : 1: rewriter_after_opt_a 0.23% : 0.000111s : 1: rewriter_before_opt_a 0.02% : 0.000010s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.03% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.27% : 0.000129s : 1: symbol_engine_optimizer 0.24% : 0.000117s : 1: tuple_transform 41.66% : 0.020191s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:52.114.008 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0345385, [21] [bootstrap]: 0.00045297 [type_inference]: 0.00688449 [event_method]: 2.088e-05 [auto_monad]: 8.254e-05 [graph_reusing]: 6.40997e-06 [inline]: 2.46e-06 [add_attr]: 0.00351852, [1] [add_attr_with_inline]: 0.00350681, [1] [Cycle 1]: 7.336e-05, [2] [tag_attr]: 2.403e-05 [meta_addattr_fg_expand]: 6.17999e-06 [parallel-infer-symbol]: 3.36001e-06 [pre_auto_parallel]: 4.068e-05 [insert-virtual-dataset]: 2.49999e-06 [parallel-infer-symbol-second]: 8.89995e-07 [dataset_repeat_opt]: 2.04e-06 [pipeline_split]: 1.95001e-06 [optimize]: 0.0225545, [53] [py_interpret_to_execute]: 3.095e-05 [rewriter_before_opt_a]: 9.535e-05 [opt_a]: 0.0196905, [2] [Cycle 1]: 0.0188308, [45] [expand_dump_flag]: 3.30998e-06 [switch_simplify]: 4.882e-05 [loop_unroll]: 3.62e-05 [a_1]: 0.0168864 [with_stream_mark]: 2.848e-05 [recompute_prepare]: 1.494e-05 [updatestate_depend_eliminate]: 4.79e-06 [updatestate_assign_eliminate]: 3.77002e-06 [updatestate_loads_eliminate]: 4.08001e-06 [parameter_eliminate]: 2.51e-06 [a_2]: 0.00010416 [accelerated_algorithm]: 9.59999e-06 [shard]: 2.52001e-06 [meta_shard_fg_expand]: 3.21999e-06 [shard_inline]: 8e-06 [merge_send_recv]: 1.005e-05 [auto_parallel]: 1.075e-05 [parallel]: 2.005e-05 [flash_sp]: 1.149e-05 [merge_comm]: 4.63001e-06 [allreduce_fusion]: 4.43001e-06 [matmul_add_comm_reduction]: 1.146e-05 [allreduce_slice_to_reducescatter]: 7.29982e-07 [virtual_shard_identity]: 1.115e-05 [virtual_dataset]: 8.11002e-06 [get_grad_eliminate_]: 8.17e-06 [virtual_output]: 8.12e-06 [merge_forward]: 5.56e-06 [cell_reuse_recompute_pass]: 1.82001e-06 [offload_activation]: 1.139e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.71e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.391e-05 [set_forward_comm_id_for_comm_node_pass]: 4.67998e-06 [meta_fg_expand]: 3.88001e-06 [flash_sp_send_recv_attached]: 2.82002e-06 [receive_attached]: 3.18998e-06 [after_resolve]: 1.376e-05 [a_after_grad]: 1.291e-05 [renormalize]: 0.00102813 [add_forward_monad_depend]: 8.60999e-06 [auto_monad_grad]: 2.98e-06 [auto_monad_eliminator]: 2.279e-05 [cse]: 3.987e-05 [a_3]: 6.436e-05 [Cycle 2]: 0.00084459, [45] [expand_dump_flag]: 2.43002e-06 [switch_simplify]: 9.81e-06 [loop_unroll]: 8.15e-06 [a_1]: 0.00019379 [with_stream_mark]: 1.738e-05 [recompute_prepare]: 8.42e-06 [updatestate_depend_eliminate]: 4.2e-06 [updatestate_assign_eliminate]: 3.35003e-06 [updatestate_loads_eliminate]: 3.5e-06 [parameter_eliminate]: 1.84998e-06 [a_2]: 9.461e-05 [accelerated_algorithm]: 8.2e-06 [shard]: 1.76e-06 [meta_shard_fg_expand]: 2.26e-06 [shard_inline]: 7.51999e-06 [merge_send_recv]: 8.75001e-06 [auto_parallel]: 9.32999e-06 [parallel]: 6.58e-06 [flash_sp]: 9.95002e-06 [merge_comm]: 4.30999e-06 [allreduce_fusion]: 4e-06 [matmul_add_comm_reduction]: 7.95998e-06 [allreduce_slice_to_reducescatter]: 5.69999e-07 [virtual_shard_identity]: 8.90001e-06 [virtual_dataset]: 7.28999e-06 [get_grad_eliminate_]: 7.01001e-06 [virtual_output]: 7.13e-06 [merge_forward]: 4.42e-06 [cell_reuse_recompute_pass]: 2.91e-06 [offload_activation]: 9.90002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.864e-05 [merge_recompute_call_nodes]: 9.30013e-07 [before_grad]: 1.206e-05 [set_forward_comm_id_for_comm_node_pass]: 4.78001e-06 [meta_fg_expand]: 3.31999e-06 [flash_sp_send_recv_attached]: 1.60999e-06 [receive_attached]: 2.01003e-06 [after_resolve]: 1.356e-05 [a_after_grad]: 1.115e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.42e-06 [auto_monad_grad]: 1.51998e-06 [auto_monad_eliminator]: 1.119e-05 [cse]: 2.622e-05 [a_3]: 5.013e-05 [py_interpret_to_execute_after_opt_a]: 1.758e-05 [slice_cell_reuse_recomputed_activation]: 2.39001e-06 [rewriter_after_opt_a]: 4.733e-05 [convert_after_rewriter]: 8.48001e-06 [order_py_execute_after_rewriter]: 6.19001e-06 [mutable_eliminate]: 0.0008911 [opt_b]: 0.00028162, [1] [Cycle 1]: 0.00027309, [7] [b_1]: 0.0001667 [b_2]: 1.04e-05 [updatestate_depend_eliminate]: 1.142e-05 [updatestate_assign_eliminate]: 3.51999e-06 [updatestate_loads_eliminate]: 3.56999e-06 [renormalize]: 8.60018e-07 [cse]: 3.786e-05 [optimize_parallel_all_gather_comm]: 2.103e-05 [overlap_param_gather]: 2.07999e-06 [cconv]: 3.285e-05 [loop_unroll]: 0.0006265 [opt_after_cconv]: 0.0001363, [1] [Cycle 1]: 0.00012853, [7] [c_1]: 4.007e-05 [parameter_eliminate]: 5.59998e-06 [updatestate_depend_eliminate]: 8.96998e-06 [updatestate_assign_eliminate]: 3.38999e-06 [updatestate_loads_eliminate]: 3.06999e-06 [cse]: 3.13e-05 [renormalize]: 8.70001e-07 [remove_dup_value]: 1.787e-05 [tuple_transform]: 9.811e-05, [1] [Cycle 1]: 9.335e-05, [4] [d_1]: 6.265e-05 [none_parameter_eliminate]: 1.87999e-06 [renormalize]: 3.00002e-07 [switch_simplify]: 8.82e-06 [partial_unused_args_eliminate]: 2.14e-06 [add_recomputation]: 6.669e-05 [cse_after_recomputation]: 2.659e-05, [1] [Cycle 1]: 2.16e-05, [1] [cse]: 1.593e-05 [environ_conv]: 7.56001e-06 [swap_dp_allreduce_reducescatter]: 6.51e-06 [bias_add_comm_swap]: 3.40998e-06 [label_micro_interleaved_index]: 6.31998e-06 [label_fine_grained_interleaved_index]: 2.63e-06 [merge_cast_opt]: 1.37e-06 [slice_recompute_activation]: 2.48e-06 [micro_interleaved_order_control]: 2.61e-06 [assign_add_opt]: 1.29998e-06 [ForceFp32Comm]: 1.16002e-06 [remove_cast_before_assign_add]: 1.09e-06 [full_micro_interleaved_order_control]: 2.15002e-06 [reorder_send_recv_between_fp_bp]: 2.43998e-06 [comm_op_add_attrs]: 1.37999e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.01997e-06 [overlap_opt_shard_in_pipeline]: 1.93002e-06 [overlap_opt_shard_grad_in_pipeline]: 2.09999e-06 [control_data_broadcast_order]: 1.613e-05 [grouped_pairwise_exchange_alltoall]: 1.82999e-06 [offloading_packed_experts]: 4.95001e-06 [overlap_recompute_and_grad_model_parallel]: 5.66e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.36998e-06 [overlap_recompute_comm]: 2.24001e-06 [overlap_grad_ring_attention]: 4.70001e-06 [overlap_grad_flash_sp]: 2.352e-05 [begin_end_overlap_inline]: 7.09988e-07 [split_matmul_comm_elemetwise]: 2.16e-06 [split_layernorm_comm]: 1.54998e-06 [handle_group_info]: 1.00001e-06 [symbol_engine_optimizer]: 9.086e-05, [1] [Cycle 1]: 8.641e-05, [6] [build]: 4.37e-06 [elim_shapecalc]: 1.257e-05 [elim_not_effective]: 1.742e-05 [opt_reshape]: 9.50001e-06 [fold_const_symbol]: 1.317e-05 [renormalize]: 2.69996e-07 [detach_backward]: 2.26e-06 [pipeline_parallel_scheduler]: 2.14e-06 [auto_monad_reorder]: 2.095e-05 [get_jit_bprop_graph]: 2.10002e-06 [rewriter_after_jit_bprop_graph]: 6.33e-06 [opt_after_jit_grad]: 0.00070011 [validate]: 5.432e-05 Sums bootstrap : 0.000453s : 1.51% type_inference : 0.006884s : 23.00% event_method : 0.000021s : 0.07% auto_monad : 0.000083s : 0.28% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000041s : 0.14% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.10% optimize.rewriter_before_opt_a : 0.000095s : 0.32% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000059s : 0.20% optimize.opt_a.loop_unroll : 0.000044s : 0.15% optimize.opt_a.a_1 : 0.017080s : 57.06% optimize.opt_a.with_stream_mark : 0.000046s : 0.15% optimize.opt_a.recompute_prepare : 0.000023s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000199s : 0.66% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.06% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000016s : 0.05% optimize.opt_a.merge_send_recv : 0.000019s : 0.06% optimize.opt_a.auto_parallel : 0.000020s : 0.07% optimize.opt_a.parallel : 0.000027s : 0.09% optimize.opt_a.flash_sp : 0.000021s : 0.07% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.07% optimize.opt_a.virtual_dataset : 0.000015s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.05% optimize.opt_a.virtual_output : 0.000015s : 0.05% optimize.opt_a.merge_forward : 0.000010s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000021s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000027s : 0.09% optimize.opt_a.a_after_grad : 0.000024s : 0.08% optimize.opt_a.renormalize : 0.001028s : 3.43% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.11% optimize.opt_a.cse : 0.000066s : 0.22% optimize.opt_a.a_3 : 0.000114s : 0.38% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000047s : 0.16% optimize.convert_after_rewriter : 0.000008s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000891s : 2.98% optimize.opt_b.b_1 : 0.000167s : 0.56% optimize.opt_b.b_2 : 0.000010s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000038s : 0.13% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000033s : 0.11% optimize.loop_unroll : 0.000626s : 2.09% optimize.opt_after_cconv.c_1 : 0.000040s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000031s : 0.10% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.06% optimize.tuple_transform.d_1 : 0.000063s : 0.21% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000067s : 0.22% optimize.cse_after_recomputation.cse : 0.000016s : 0.05% optimize.environ_conv : 0.000008s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000006s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000024s : 0.08% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000700s : 2.34% validate : 0.000054s : 0.18% Time group info: ------[substitution.] 0.016261 38 0.18% : 0.000029s : 3: substitution.cast_eliminate 0.02% : 0.000003s : 3: substitution.elim_not_effective 0.01% : 0.000002s : 3: substitution.fold_const_symbol 0.05% : 0.000008s : 5: substitution.graph_param_transform 99.49% : 0.016179s : 4: substitution.inline 0.03% : 0.000005s : 6: substitution.j_node_and_user_rematch 0.07% : 0.000011s : 6: substitution.remove_not_recompute_node 0.04% : 0.000006s : 4: substitution.replace_old_param 0.11% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006815 2 88.77% : 0.006050s : 1: type_inference.infer 11.23% : 0.000765s : 1: type_inference.specialize ------[replace.] 0.000074 8 62.10% : 0.000046s : 4: replace.inline 37.90% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.016189 8 99.90% : 0.016174s : 4: match.inline 0.10% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000290 1596 0.94% : 0.000003s : 17: predicate.accumulaten_eliminater 0.76% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 10: predicate.addn_check_dump 1.01% : 0.000003s : 17: predicate.addn_zero_filter 0.92% : 0.000003s : 17: predicate.adjust_all_reduce_mul_add 2.26% : 0.000007s : 27: predicate.arithmetic_simplify 1.01% : 0.000003s : 17: predicate.cast_eliminate 0.75% : 0.000002s : 10: predicate.check_bprop_eliminate 0.52% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000001s : 5: predicate.const_output_eliminate 0.77% : 0.000002s : 10: predicate.depend_value_elim 0.92% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.97% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.88% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.99% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.18% : 0.000001s : 5: predicate.elim_not_effective 0.40% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.35% : 0.000004s : 22: predicate.environ_add_const_eliminate 1.23% : 0.000004s : 22: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_depend_swap 1.63% : 0.000005s : 32: predicate.environ_get_eliminate 1.07% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.28% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.42% : 0.000007s : 25: predicate.float_depend_g_call 0.50% : 0.000001s : 10: predicate.float_environ_get_switch 0.79% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.78% : 0.000002s : 10: predicate.get_grad_eliminate 0.18% : 0.000001s : 5: predicate.graph_param_transform 0.54% : 0.000002s : 10: predicate.incorporate_call 0.43% : 0.000001s : 10: predicate.incorporate_call_switch 5.90% : 0.000017s : 72: predicate.inline 0.71% : 0.000002s : 10: predicate.inline_without_move 0.28% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.11% : 0.000003s : 10: predicate.less_batch_normalization 1.84% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.45% : 0.000007s : 48: predicate.load_eliminater 1.03% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.05% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.85% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.58% : 0.000002s : 10: predicate.merge_addn 0.63% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.60% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 17: predicate.minmaximum_grad 1.27% : 0.000004s : 5: predicate.mutable_eliminate 0.39% : 0.000001s : 5: predicate.opt_reshape 0.44% : 0.000001s : 5: predicate.parallel_virtual_node 1.93% : 0.000006s : 25: predicate.partial_defer_inline 1.53% : 0.000004s : 26: predicate.partial_eliminate 0.96% : 0.000003s : 17: predicate.print_const_string_wrapper 0.62% : 0.000002s : 10: predicate.reduce_all_const_elim 1.27% : 0.000004s : 17: predicate.reduce_eliminate 2.55% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000001s : 10: predicate.remove_not_recompute_node 1.17% : 0.000003s : 31: predicate.replace_applicator 0.42% : 0.000001s : 10: predicate.replace_old_param 0.29% : 0.000001s : 5: predicate.reset_defer_inline 1.13% : 0.000003s : 17: predicate.reshape_eliminate 0.58% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 5: predicate.row_tensor_eliminate 0.70% : 0.000002s : 10: predicate.same_eliminate 0.36% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 10: predicate.shard_identity_eliminate 0.77% : 0.000002s : 10: predicate.special_op_eliminate 0.62% : 0.000002s : 10: predicate.specialize_transform 0.92% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.38% : 0.000004s : 25: predicate.switch_defer_inline 2.03% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.66% : 0.000014s : 76: predicate.switch_simplify 1.22% : 0.000004s : 17: predicate.tile_eliminate 1.01% : 0.000003s : 17: predicate.transpose_eliminate 1.55% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.92% : 0.000006s : 27: predicate.tuple_list_get_item_depend_reorder 3.65% : 0.000011s : 41: predicate.tuple_list_get_item_eliminator 1.64% : 0.000005s : 27: predicate.tuple_list_get_set_item_eliminator 2.35% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.65% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.38% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 2.82% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 5: predicate.value_based_eliminate 0.62% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.56% : 0.000002s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000641 11 53.18% : 0.000341s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.82% : 0.000300s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.079492 192 0.00% : 0.000004s : 1: ForceFp32Comm 4.43% : 0.003524s : 1: add_attr 4.42% : 0.003511s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000071s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.11% : 0.000089s : 1: auto_monad 0.03% : 0.000025s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.61% : 0.000486s : 1: bootstrap 0.05% : 0.000037s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000020s : 1: control_data_broadcast_order 0.02% : 0.000012s : 1: convert_after_rewriter 0.04% : 0.000030s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.04% : 0.000028s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000009s : 1: label_micro_interleaved_index 0.80% : 0.000638s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 1.14% : 0.000907s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.03% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000027s : 1: opt.transform.mutable_eliminate 22.18% : 0.017629s : 78: opt.transform.opt_a 0.05% : 0.000038s : 1: opt.transform.opt_after_cconv 0.05% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.18% : 0.000141s : 28: opt.transform.opt_b 0.09% : 0.000069s : 2: opt.transform.opt_trans_graph 0.06% : 0.000048s : 4: opt.transform.symbol_engine_opt 24.78% : 0.019695s : 1: opt_a 0.18% : 0.000140s : 1: opt_after_cconv 0.90% : 0.000715s : 1: opt_after_jit_grad 0.36% : 0.000286s : 1: opt_b 28.38% : 0.022561s : 1: optimize 0.03% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.03% : 0.000027s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.06% : 0.000045s : 1: pre_auto_parallel 0.04% : 0.000035s : 1: py_interpret_to_execute 0.03% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000005s : 1: remove_cast_before_assign_add 0.03% : 0.000022s : 1: remove_dup_value 0.73% : 0.000579s : 1: renormalize.infer 0.55% : 0.000439s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000052s : 1: rewriter_after_opt_a 0.13% : 0.000100s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.12% : 0.000094s : 1: symbol_engine_optimizer 0.13% : 0.000101s : 1: tuple_transform 8.69% : 0.006908s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:52.569.459 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:52.569.752 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0358623, [21] [bootstrap]: 0.00053727 [type_inference]: 0.00745523 [event_method]: 2.396e-05 [auto_monad]: 7.681e-05 [graph_reusing]: 6.79999e-06 [inline]: 3.08e-06 [add_attr]: 0.0042299, [1] [add_attr_with_inline]: 0.00421544, [1] [Cycle 1]: 0.00010357, [2] [tag_attr]: 2.769e-05 [meta_addattr_fg_expand]: 6.49999e-06 [parallel-infer-symbol]: 3.7e-06 [pre_auto_parallel]: 4.704e-05 [insert-virtual-dataset]: 2.42001e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 2.17001e-06 [pipeline_split]: 1.64998e-06 [optimize]: 0.0213488, [53] [py_interpret_to_execute]: 3.986e-05 [rewriter_before_opt_a]: 0.0001327 [opt_a]: 0.0180814, [2] [Cycle 1]: 0.0168943, [45] [expand_dump_flag]: 4.33001e-06 [switch_simplify]: 4.74e-05 [loop_unroll]: 3.276e-05 [a_1]: 0.0007887 [with_stream_mark]: 2.662e-05 [recompute_prepare]: 1.758e-05 [updatestate_depend_eliminate]: 7.15e-06 [updatestate_assign_eliminate]: 4.79e-06 [updatestate_loads_eliminate]: 4.73001e-06 [parameter_eliminate]: 2.24999e-06 [a_2]: 0.00015513 [accelerated_algorithm]: 1.162e-05 [shard]: 2.96001e-06 [meta_shard_fg_expand]: 3.4e-06 [shard_inline]: 1.091e-05 [merge_send_recv]: 1.153e-05 [auto_parallel]: 1.263e-05 [parallel]: 2.028e-05 [flash_sp]: 1.462e-05 [merge_comm]: 6.04001e-06 [allreduce_fusion]: 5.01997e-06 [matmul_add_comm_reduction]: 1.329e-05 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 1.351e-05 [virtual_dataset]: 9.94999e-06 [get_grad_eliminate_]: 9.51e-06 [virtual_output]: 9.23002e-06 [merge_forward]: 6.17001e-06 [cell_reuse_recompute_pass]: 2.49999e-06 [offload_activation]: 1.53e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.83e-05 [merge_recompute_call_nodes]: 1.75001e-06 [before_grad]: 1.836e-05 [set_forward_comm_id_for_comm_node_pass]: 6.87002e-06 [meta_fg_expand]: 5.34998e-06 [flash_sp_send_recv_attached]: 4.37e-06 [receive_attached]: 2.66999e-06 [after_resolve]: 1.733e-05 [a_after_grad]: 1.531e-05 [renormalize]: 0.014851 [add_forward_monad_depend]: 1.203e-05 [auto_monad_grad]: 2.61999e-06 [auto_monad_eliminator]: 2.976e-05 [cse]: 5.293e-05 [a_3]: 9.777e-05 [Cycle 2]: 0.00116731, [45] [expand_dump_flag]: 3.31001e-06 [switch_simplify]: 1.206e-05 [loop_unroll]: 9.29e-06 [a_1]: 0.00023247 [with_stream_mark]: 2.24e-05 [recompute_prepare]: 9.37999e-06 [updatestate_depend_eliminate]: 6.17999e-06 [updatestate_assign_eliminate]: 4.47003e-06 [updatestate_loads_eliminate]: 4.41002e-06 [parameter_eliminate]: 2.51998e-06 [a_2]: 0.00014165 [accelerated_algorithm]: 9.54999e-06 [shard]: 2.71e-06 [meta_shard_fg_expand]: 3.56999e-06 [shard_inline]: 8.98002e-06 [merge_send_recv]: 1.085e-05 [auto_parallel]: 1.231e-05 [parallel]: 9.96e-06 [flash_sp]: 5.28002e-06 [merge_comm]: 6.50997e-06 [allreduce_fusion]: 5.54e-06 [matmul_add_comm_reduction]: 1.208e-05 [allreduce_slice_to_reducescatter]: 7.39994e-07 [virtual_shard_identity]: 1.164e-05 [virtual_dataset]: 8.73001e-06 [get_grad_eliminate_]: 9.20999e-06 [virtual_output]: 8.97999e-06 [merge_forward]: 7e-06 [cell_reuse_recompute_pass]: 3.84002e-06 [offload_activation]: 1.412e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.148e-05 [merge_recompute_call_nodes]: 1.65001e-06 [before_grad]: 1.63e-05 [set_forward_comm_id_for_comm_node_pass]: 5.99999e-06 [meta_fg_expand]: 4.47e-06 [flash_sp_send_recv_attached]: 2.07999e-06 [receive_attached]: 2.68998e-06 [after_resolve]: 1.584e-05 [a_after_grad]: 1.496e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 3.35998e-06 [auto_monad_grad]: 2.21e-06 [auto_monad_eliminator]: 1.369e-05 [cse]: 3.122e-05 [a_3]: 6.918e-05 [py_interpret_to_execute_after_opt_a]: 2.59e-05 [slice_cell_reuse_recomputed_activation]: 5.30001e-06 [rewriter_after_opt_a]: 6.105e-05 [convert_after_rewriter]: 1.262e-05 [order_py_execute_after_rewriter]: 9.44e-06 [mutable_eliminate]: 0.0008015 [opt_b]: 0.00038901, [1] [Cycle 1]: 0.00037655, [7] [b_1]: 0.00023934 [b_2]: 1.12e-05 [updatestate_depend_eliminate]: 1.241e-05 [updatestate_assign_eliminate]: 4.27998e-06 [updatestate_loads_eliminate]: 3.86999e-06 [renormalize]: 1.05001e-06 [cse]: 4.361e-05 [optimize_parallel_all_gather_comm]: 2.837e-05 [overlap_param_gather]: 5.70001e-06 [cconv]: 4.181e-05 [loop_unroll]: 0.00068435 [opt_after_cconv]: 0.00017468, [1] [Cycle 1]: 0.00016412, [7] [c_1]: 4.659e-05 [parameter_eliminate]: 6.19999e-06 [updatestate_depend_eliminate]: 8.87999e-06 [updatestate_assign_eliminate]: 3.92998e-06 [updatestate_loads_eliminate]: 3.7e-06 [cse]: 3.617e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 6.006e-05 [tuple_transform]: 0.00012517, [1] [Cycle 1]: 0.00011704, [4] [d_1]: 7.239e-05 [none_parameter_eliminate]: 1.92001e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 1.063e-05 [partial_unused_args_eliminate]: 5.12e-06 [add_recomputation]: 7.451e-05 [cse_after_recomputation]: 3.647e-05, [1] [Cycle 1]: 2.929e-05, [1] [cse]: 1.93e-05 [environ_conv]: 1.191e-05 [swap_dp_allreduce_reducescatter]: 9.67999e-06 [bias_add_comm_swap]: 5.94999e-06 [label_micro_interleaved_index]: 8.84e-06 [label_fine_grained_interleaved_index]: 5.44e-06 [merge_cast_opt]: 3.97e-06 [slice_recompute_activation]: 4.77e-06 [micro_interleaved_order_control]: 5.06002e-06 [assign_add_opt]: 3.64002e-06 [ForceFp32Comm]: 3.37002e-06 [remove_cast_before_assign_add]: 3.55e-06 [full_micro_interleaved_order_control]: 4.50999e-06 [reorder_send_recv_between_fp_bp]: 4.92999e-06 [comm_op_add_attrs]: 3.50998e-06 [add_comm_op_reuse_tag]: 3.46001e-06 [interleave_split_concat_branches]: 3.36999e-06 [interleave_parallel_branches]: 3.41999e-06 [overlap_opt_shard_in_pipeline]: 3.8e-06 [overlap_opt_shard_grad_in_pipeline]: 4.25e-06 [control_data_broadcast_order]: 2.203e-05 [grouped_pairwise_exchange_alltoall]: 4.22e-06 [offloading_packed_experts]: 8.28001e-06 [overlap_recompute_and_grad_model_parallel]: 8.00999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.56001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.71999e-06 [overlap_recompute_comm]: 4.95999e-06 [overlap_grad_ring_attention]: 7.92e-06 [overlap_grad_flash_sp]: 3.104e-05 [begin_end_overlap_inline]: 2.98998e-06 [split_matmul_comm_elemetwise]: 4.57998e-06 [split_layernorm_comm]: 4.38001e-06 [handle_group_info]: 3.68e-06 [symbol_engine_optimizer]: 0.00011502, [1] [Cycle 1]: 0.00010823, [6] [build]: 4.28999e-06 [elim_shapecalc]: 1.362e-05 [elim_not_effective]: 1.814e-05 [opt_reshape]: 1.1e-05 [fold_const_symbol]: 1.508e-05 [renormalize]: 2.80008e-07 [detach_backward]: 6.88998e-06 [pipeline_parallel_scheduler]: 1.83002e-06 [auto_monad_reorder]: 3.263e-05 [get_jit_bprop_graph]: 2.02001e-06 [rewriter_after_jit_bprop_graph]: 7.56999e-06 [opt_after_jit_grad]: 0.00076135 [validate]: 5.291e-05 Sums bootstrap : 0.000537s : 1.85% type_inference : 0.007455s : 25.65% event_method : 0.000024s : 0.08% auto_monad : 0.000077s : 0.26% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.10% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000047s : 0.16% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000040s : 0.14% optimize.rewriter_before_opt_a : 0.000133s : 0.46% optimize.opt_a.expand_dump_flag : 0.000008s : 0.03% optimize.opt_a.switch_simplify : 0.000059s : 0.20% optimize.opt_a.loop_unroll : 0.000042s : 0.14% optimize.opt_a.a_1 : 0.001021s : 3.51% optimize.opt_a.with_stream_mark : 0.000049s : 0.17% optimize.opt_a.recompute_prepare : 0.000027s : 0.09% optimize.opt_a.updatestate_depend_eliminate : 0.000013s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.03% optimize.opt_a.parameter_eliminate : 0.000005s : 0.02% optimize.opt_a.a_2 : 0.000297s : 1.02% optimize.opt_a.accelerated_algorithm : 0.000021s : 0.07% optimize.opt_a.shard : 0.000006s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000007s : 0.02% optimize.opt_a.shard_inline : 0.000020s : 0.07% optimize.opt_a.merge_send_recv : 0.000022s : 0.08% optimize.opt_a.auto_parallel : 0.000025s : 0.09% optimize.opt_a.parallel : 0.000030s : 0.10% optimize.opt_a.flash_sp : 0.000020s : 0.07% optimize.opt_a.merge_comm : 0.000013s : 0.04% optimize.opt_a.allreduce_fusion : 0.000011s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000025s : 0.09% optimize.opt_a.virtual_dataset : 0.000019s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.06% optimize.opt_a.virtual_output : 0.000018s : 0.06% optimize.opt_a.merge_forward : 0.000013s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.02% optimize.opt_a.offload_activation : 0.000029s : 0.10% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000050s : 0.17% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000035s : 0.12% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000013s : 0.04% optimize.opt_a.meta_fg_expand : 0.000010s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000033s : 0.11% optimize.opt_a.a_after_grad : 0.000030s : 0.10% optimize.opt_a.renormalize : 0.014851s : 51.09% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.05% optimize.opt_a.auto_monad_grad : 0.000005s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000043s : 0.15% optimize.opt_a.cse : 0.000084s : 0.29% optimize.opt_a.a_3 : 0.000167s : 0.57% optimize.py_interpret_to_execute_after_opt_a : 0.000026s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000061s : 0.21% optimize.convert_after_rewriter : 0.000013s : 0.04% optimize.order_py_execute_after_rewriter : 0.000009s : 0.03% optimize.mutable_eliminate : 0.000802s : 2.76% optimize.opt_b.b_1 : 0.000239s : 0.82% optimize.opt_b.b_2 : 0.000011s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000044s : 0.15% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.10% optimize.overlap_param_gather : 0.000006s : 0.02% optimize.cconv : 0.000042s : 0.14% optimize.loop_unroll : 0.000684s : 2.35% optimize.opt_after_cconv.c_1 : 0.000047s : 0.16% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000036s : 0.12% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000060s : 0.21% optimize.tuple_transform.d_1 : 0.000072s : 0.25% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.04% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000075s : 0.26% optimize.cse_after_recomputation.cse : 0.000019s : 0.07% optimize.environ_conv : 0.000012s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.03% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000009s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000003s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000022s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000008s : 0.03% optimize.overlap_grad_flash_sp : 0.000031s : 0.11% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000007s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000033s : 0.11% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000008s : 0.03% opt_after_jit_grad : 0.000761s : 2.62% validate : 0.000053s : 0.18% Time group info: ------[substitution.] 0.000280 48 15.23% : 0.000043s : 6: substitution.cast_eliminate 0.89% : 0.000002s : 4: substitution.elim_not_effective 0.87% : 0.000002s : 4: substitution.fold_const_symbol 3.21% : 0.000009s : 6: substitution.graph_param_transform 65.80% : 0.000185s : 4: substitution.inline 2.67% : 0.000008s : 8: substitution.j_node_and_user_rematch 3.31% : 0.000009s : 8: substitution.remove_not_recompute_node 2.76% : 0.000008s : 4: substitution.replace_old_param 5.26% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007383 2 87.27% : 0.006443s : 1: type_inference.infer 12.73% : 0.000940s : 1: type_inference.specialize ------[replace.] 0.000071 8 62.67% : 0.000045s : 4: replace.inline 37.33% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000194 8 93.35% : 0.000181s : 4: match.inline 6.65% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000305 1730 0.85% : 0.000003s : 17: predicate.accumulaten_eliminater 1.05% : 0.000003s : 6: predicate.ad_related_special_op_eliminate 0.60% : 0.000002s : 12: predicate.addn_check_dump 0.92% : 0.000003s : 17: predicate.addn_zero_filter 0.85% : 0.000003s : 17: predicate.adjust_all_reduce_mul_add 2.13% : 0.000006s : 29: predicate.arithmetic_simplify 0.95% : 0.000003s : 17: predicate.cast_eliminate 0.67% : 0.000002s : 12: predicate.check_bprop_eliminate 0.56% : 0.000002s : 12: predicate.compare_switch_simplify 0.18% : 0.000001s : 6: predicate.const_output_eliminate 0.68% : 0.000002s : 12: predicate.depend_value_elim 0.91% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.14% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.90% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.08% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 6: predicate.elim_not_effective 0.56% : 0.000002s : 6: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000004s : 23: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.18% : 0.000004s : 23: predicate.environ_get_depend_swap 2.01% : 0.000006s : 35: predicate.environ_get_eliminate 1.13% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.20% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.00% : 0.000006s : 25: predicate.float_depend_g_call 0.74% : 0.000002s : 12: predicate.float_environ_get_switch 0.89% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 6: predicate.fold_const_symbol 0.74% : 0.000002s : 12: predicate.get_grad_eliminate 0.28% : 0.000001s : 6: predicate.graph_param_transform 0.63% : 0.000002s : 12: predicate.incorporate_call 0.54% : 0.000002s : 12: predicate.incorporate_call_switch 6.09% : 0.000019s : 78: predicate.inline 0.95% : 0.000003s : 12: predicate.inline_without_move 0.31% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.97% : 0.000003s : 12: predicate.less_batch_normalization 1.79% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.27% : 0.000007s : 50: predicate.load_eliminater 1.02% : 0.000003s : 6: predicate.loop_unroll_after_grad 1.95% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.71% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.65% : 0.000002s : 12: predicate.merge_addn 0.62% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.68% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.84% : 0.000003s : 17: predicate.minmaximum_grad 1.36% : 0.000004s : 6: predicate.mutable_eliminate 0.40% : 0.000001s : 6: predicate.opt_reshape 0.50% : 0.000002s : 6: predicate.parallel_virtual_node 1.47% : 0.000004s : 25: predicate.partial_defer_inline 1.47% : 0.000004s : 27: predicate.partial_eliminate 0.99% : 0.000003s : 17: predicate.print_const_string_wrapper 0.70% : 0.000002s : 12: predicate.reduce_all_const_elim 1.17% : 0.000004s : 17: predicate.reduce_eliminate 2.48% : 0.000008s : 50: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 12: predicate.remove_not_recompute_node 1.24% : 0.000004s : 33: predicate.replace_applicator 0.51% : 0.000002s : 12: predicate.replace_old_param 0.38% : 0.000001s : 6: predicate.reset_defer_inline 1.12% : 0.000003s : 17: predicate.reshape_eliminate 0.65% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 6: predicate.row_tensor_eliminate 0.89% : 0.000003s : 12: predicate.same_eliminate 0.49% : 0.000002s : 12: predicate.set_cell_output_no_recompute 1.09% : 0.000003s : 12: predicate.shard_identity_eliminate 0.73% : 0.000002s : 12: predicate.special_op_eliminate 0.85% : 0.000003s : 12: predicate.specialize_transform 0.97% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000003s : 12: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.35% : 0.000004s : 25: predicate.switch_defer_inline 1.84% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.32% : 0.000013s : 81: predicate.switch_simplify 0.92% : 0.000003s : 17: predicate.tile_eliminate 0.92% : 0.000003s : 17: predicate.transpose_eliminate 1.48% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000005s : 29: predicate.tuple_list_get_item_depend_reorder 3.25% : 0.000010s : 45: predicate.tuple_list_get_item_eliminator 1.65% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 2.34% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.88% : 0.000006s : 33: predicate.tuple_to_list_eliminator_ 2.25% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 2.84% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 6: predicate.value_based_eliminate 0.71% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 12: predicate.virtual_output_eliminate 0.28% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000778 11 50.21% : 0.000391s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.79% : 0.000387s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.078241 192 0.01% : 0.000006s : 1: ForceFp32Comm 5.42% : 0.004242s : 1: add_attr 5.39% : 0.004220s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.10% : 0.000078s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.11% : 0.000087s : 1: auto_monad 0.05% : 0.000041s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.74% : 0.000583s : 1: bootstrap 0.06% : 0.000045s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.03% : 0.000025s : 1: control_data_broadcast_order 0.02% : 0.000016s : 1: convert_after_rewriter 0.05% : 0.000040s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000034s : 1: detach_backward 0.02% : 0.000015s : 1: environ_conv 0.05% : 0.000036s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000012s : 1: label_micro_interleaved_index 0.88% : 0.000692s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.04% : 0.000812s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.03% : 0.000022s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000028s : 1: opt.transform.mutable_eliminate 2.15% : 0.001686s : 78: opt.transform.opt_a 0.06% : 0.000044s : 1: opt.transform.opt_after_cconv 0.05% : 0.000041s : 1: opt.transform.opt_after_jit_grad 0.22% : 0.000172s : 28: opt.transform.opt_b 0.10% : 0.000081s : 2: opt.transform.opt_trans_graph 0.07% : 0.000054s : 4: opt.transform.symbol_engine_opt 23.11% : 0.018085s : 1: opt_a 0.23% : 0.000179s : 1: opt_after_cconv 0.99% : 0.000773s : 1: opt_after_jit_grad 0.50% : 0.000393s : 1: opt_b 28.51% : 0.022309s : 1: optimize 0.04% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.04% : 0.000034s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000012s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000011s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.07% : 0.000055s : 1: pre_auto_parallel 0.06% : 0.000044s : 1: py_interpret_to_execute 0.04% : 0.000030s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000065s : 1: remove_dup_value 18.25% : 0.014275s : 1: renormalize.infer 0.71% : 0.000554s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000065s : 1: rewriter_after_opt_a 0.18% : 0.000137s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000118s : 1: symbol_engine_optimizer 0.16% : 0.000128s : 1: tuple_transform 9.60% : 0.007514s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:53.126.416 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0478825, [21] [bootstrap]: 0.00047772 [type_inference]: 0.0353738 [event_method]: 2.313e-05 [auto_monad]: 7.442e-05 [graph_reusing]: 5.97001e-06 [inline]: 2.69999e-06 [add_attr]: 0.00381299, [1] [add_attr_with_inline]: 0.00379876, [1] [Cycle 1]: 8.274e-05, [2] [tag_attr]: 2.49e-05 [meta_addattr_fg_expand]: 6.14999e-06 [parallel-infer-symbol]: 4.18999e-06 [pre_auto_parallel]: 4.529e-05 [insert-virtual-dataset]: 3.26999e-06 [parallel-infer-symbol-second]: 9.99979e-07 [dataset_repeat_opt]: 2.14e-06 [pipeline_split]: 2.06e-06 [optimize]: 0.00707316, [53] [py_interpret_to_execute]: 4.099e-05 [rewriter_before_opt_a]: 0.00010663 [opt_a]: 0.0040088, [2] [Cycle 1]: 0.00300739, [45] [expand_dump_flag]: 3.58999e-06 [switch_simplify]: 4.707e-05 [loop_unroll]: 3.29e-05 [a_1]: 0.00079721 [with_stream_mark]: 2.307e-05 [recompute_prepare]: 1.424e-05 [updatestate_depend_eliminate]: 5.63997e-06 [updatestate_assign_eliminate]: 4.52998e-06 [updatestate_loads_eliminate]: 4.75001e-06 [parameter_eliminate]: 2.18002e-06 [a_2]: 0.00012472 [accelerated_algorithm]: 1.109e-05 [shard]: 2.93003e-06 [meta_shard_fg_expand]: 2.33998e-06 [shard_inline]: 9.62999e-06 [merge_send_recv]: 1.151e-05 [auto_parallel]: 1.062e-05 [parallel]: 2.116e-05 [flash_sp]: 1.033e-05 [merge_comm]: 5.57999e-06 [allreduce_fusion]: 5.07e-06 [matmul_add_comm_reduction]: 1.195e-05 [allreduce_slice_to_reducescatter]: 9.39996e-07 [virtual_shard_identity]: 1.174e-05 [virtual_dataset]: 9.94001e-06 [get_grad_eliminate_]: 9.02999e-06 [virtual_output]: 8.79e-06 [merge_forward]: 5.92999e-06 [cell_reuse_recompute_pass]: 1.67001e-06 [offload_activation]: 1.398e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.083e-05 [merge_recompute_call_nodes]: 1.51998e-06 [before_grad]: 1.637e-05 [set_forward_comm_id_for_comm_node_pass]: 5.25001e-06 [meta_fg_expand]: 3.96001e-06 [flash_sp_send_recv_attached]: 2.66999e-06 [receive_attached]: 2.94001e-06 [after_resolve]: 1.55e-05 [a_after_grad]: 1.662e-05 [renormalize]: 0.00125236 [add_forward_monad_depend]: 8.97e-06 [auto_monad_grad]: 2.68e-06 [auto_monad_eliminator]: 2.411e-05 [cse]: 4.813e-05 [a_3]: 7.602e-05 [Cycle 2]: 0.00098801, [45] [expand_dump_flag]: 2.01003e-06 [switch_simplify]: 1.195e-05 [loop_unroll]: 9.44998e-06 [a_1]: 0.00023387 [with_stream_mark]: 2.031e-05 [recompute_prepare]: 9.89999e-06 [updatestate_depend_eliminate]: 6.24999e-06 [updatestate_assign_eliminate]: 4.47e-06 [updatestate_loads_eliminate]: 4.85001e-06 [parameter_eliminate]: 2.15002e-06 [a_2]: 0.00011951 [accelerated_algorithm]: 9.64999e-06 [shard]: 2.89001e-06 [meta_shard_fg_expand]: 3.36999e-06 [shard_inline]: 8.94003e-06 [merge_send_recv]: 1.05e-05 [auto_parallel]: 1.147e-05 [parallel]: 8.87e-06 [flash_sp]: 4.38001e-06 [merge_comm]: 5.14998e-06 [allreduce_fusion]: 5.44e-06 [matmul_add_comm_reduction]: 1.436e-05 [allreduce_slice_to_reducescatter]: 4.39992e-07 [virtual_shard_identity]: 1.12e-05 [virtual_dataset]: 9.12001e-06 [get_grad_eliminate_]: 9.40001e-06 [virtual_output]: 8.90001e-06 [merge_forward]: 6.07999e-06 [cell_reuse_recompute_pass]: 3.46999e-06 [offload_activation]: 1.254e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.866e-05 [merge_recompute_call_nodes]: 1.29e-06 [before_grad]: 1.532e-05 [set_forward_comm_id_for_comm_node_pass]: 5.47999e-06 [meta_fg_expand]: 4.22e-06 [flash_sp_send_recv_attached]: 2.04e-06 [receive_attached]: 2.06e-06 [after_resolve]: 1.52e-05 [a_after_grad]: 1.393e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.22999e-06 [auto_monad_grad]: 2.46998e-06 [auto_monad_eliminator]: 1.467e-05 [cse]: 3.339e-05 [a_3]: 5.764e-05 [py_interpret_to_execute_after_opt_a]: 2.014e-05 [slice_cell_reuse_recomputed_activation]: 2.24001e-06 [rewriter_after_opt_a]: 5.36e-05 [convert_after_rewriter]: 9.51e-06 [order_py_execute_after_rewriter]: 7.20998e-06 [mutable_eliminate]: 0.00079403 [opt_b]: 0.00033093, [1] [Cycle 1]: 0.00032115, [7] [b_1]: 0.0001982 [b_2]: 1.239e-05 [updatestate_depend_eliminate]: 1.196e-05 [updatestate_assign_eliminate]: 4.50999e-06 [updatestate_loads_eliminate]: 5.37999e-06 [renormalize]: 1.05001e-06 [cse]: 4.507e-05 [optimize_parallel_all_gather_comm]: 2.527e-05 [overlap_param_gather]: 2.26e-06 [cconv]: 3.727e-05 [loop_unroll]: 0.00070406 [opt_after_cconv]: 0.00016116, [1] [Cycle 1]: 0.00015336, [7] [c_1]: 4.673e-05 [parameter_eliminate]: 6.38003e-06 [updatestate_depend_eliminate]: 9.82001e-06 [updatestate_assign_eliminate]: 3.8e-06 [updatestate_loads_eliminate]: 3.91999e-06 [cse]: 4.234e-05 [renormalize]: 8.89995e-07 [remove_dup_value]: 5.78e-05 [tuple_transform]: 0.00010803, [1] [Cycle 1]: 0.0001032, [4] [d_1]: 7.049e-05 [none_parameter_eliminate]: 2.11e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 1.019e-05 [partial_unused_args_eliminate]: 2.76e-06 [add_recomputation]: 7.394e-05 [cse_after_recomputation]: 3.16e-05, [1] [Cycle 1]: 2.575e-05, [1] [cse]: 1.941e-05 [environ_conv]: 8.00999e-06 [swap_dp_allreduce_reducescatter]: 7.47002e-06 [bias_add_comm_swap]: 3.71999e-06 [label_micro_interleaved_index]: 5.95002e-06 [label_fine_grained_interleaved_index]: 2.64001e-06 [merge_cast_opt]: 1.50001e-06 [slice_recompute_activation]: 2.47001e-06 [micro_interleaved_order_control]: 2.48e-06 [assign_add_opt]: 1.33002e-06 [ForceFp32Comm]: 1.52001e-06 [remove_cast_before_assign_add]: 1.05999e-06 [full_micro_interleaved_order_control]: 2.15002e-06 [reorder_send_recv_between_fp_bp]: 2.96999e-06 [comm_op_add_attrs]: 1.03001e-06 [add_comm_op_reuse_tag]: 1.09003e-06 [interleave_split_concat_branches]: 1.66998e-06 [interleave_parallel_branches]: 1.01002e-06 [overlap_opt_shard_in_pipeline]: 1.32e-06 [overlap_opt_shard_grad_in_pipeline]: 1.86e-06 [control_data_broadcast_order]: 2.01e-05 [grouped_pairwise_exchange_alltoall]: 2.34999e-06 [offloading_packed_experts]: 5.92999e-06 [overlap_recompute_and_grad_model_parallel]: 6.54999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.92001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.47999e-06 [overlap_recompute_comm]: 2.24999e-06 [overlap_grad_ring_attention]: 5.95002e-06 [overlap_grad_flash_sp]: 2.73e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.61999e-06 [split_layernorm_comm]: 1.76e-06 [handle_group_info]: 1.22e-06 [symbol_engine_optimizer]: 0.00010556, [1] [Cycle 1]: 0.00010045, [6] [build]: 4.52e-06 [elim_shapecalc]: 1.686e-05 [elim_not_effective]: 1.973e-05 [opt_reshape]: 1.084e-05 [fold_const_symbol]: 1.565e-05 [renormalize]: 3.9002e-07 [detach_backward]: 3.09999e-06 [pipeline_parallel_scheduler]: 1.64e-06 [auto_monad_reorder]: 2.266e-05 [get_jit_bprop_graph]: 2.41e-06 [rewriter_after_jit_bprop_graph]: 7.11999e-06 [opt_after_jit_grad]: 0.0007041 [validate]: 5.673e-05 Sums bootstrap : 0.000478s : 1.11% type_inference : 0.035374s : 82.35% event_method : 0.000023s : 0.05% auto_monad : 0.000074s : 0.17% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000045s : 0.11% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000041s : 0.10% optimize.rewriter_before_opt_a : 0.000107s : 0.25% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000059s : 0.14% optimize.opt_a.loop_unroll : 0.000042s : 0.10% optimize.opt_a.a_1 : 0.001031s : 2.40% optimize.opt_a.with_stream_mark : 0.000043s : 0.10% optimize.opt_a.recompute_prepare : 0.000024s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000012s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000010s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000244s : 0.57% optimize.opt_a.accelerated_algorithm : 0.000021s : 0.05% optimize.opt_a.shard : 0.000006s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.01% optimize.opt_a.shard_inline : 0.000019s : 0.04% optimize.opt_a.merge_send_recv : 0.000022s : 0.05% optimize.opt_a.auto_parallel : 0.000022s : 0.05% optimize.opt_a.parallel : 0.000030s : 0.07% optimize.opt_a.flash_sp : 0.000015s : 0.03% optimize.opt_a.merge_comm : 0.000011s : 0.02% optimize.opt_a.allreduce_fusion : 0.000011s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.05% optimize.opt_a.virtual_dataset : 0.000019s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.04% optimize.opt_a.virtual_output : 0.000018s : 0.04% optimize.opt_a.merge_forward : 0.000012s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000027s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000032s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.02% optimize.opt_a.meta_fg_expand : 0.000008s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000031s : 0.07% optimize.opt_a.a_after_grad : 0.000031s : 0.07% optimize.opt_a.renormalize : 0.001252s : 2.92% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.03% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000039s : 0.09% optimize.opt_a.cse : 0.000082s : 0.19% optimize.opt_a.a_3 : 0.000134s : 0.31% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000054s : 0.12% optimize.convert_after_rewriter : 0.000010s : 0.02% optimize.order_py_execute_after_rewriter : 0.000007s : 0.02% optimize.mutable_eliminate : 0.000794s : 1.85% optimize.opt_b.b_1 : 0.000198s : 0.46% optimize.opt_b.b_2 : 0.000012s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000045s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000037s : 0.09% optimize.loop_unroll : 0.000704s : 1.64% optimize.opt_after_cconv.c_1 : 0.000047s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000042s : 0.10% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000058s : 0.13% optimize.tuple_transform.d_1 : 0.000070s : 0.16% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.02% optimize.partial_unused_args_eliminate : 0.000003s : 0.01% optimize.add_recomputation : 0.000074s : 0.17% optimize.cse_after_recomputation.cse : 0.000019s : 0.05% optimize.environ_conv : 0.000008s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.02% optimize.bias_add_comm_swap : 0.000004s : 0.01% optimize.label_micro_interleaved_index : 0.000006s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000002s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000020s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000006s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000006s : 0.01% optimize.overlap_grad_flash_sp : 0.000027s : 0.06% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000023s : 0.05% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.02% opt_after_jit_grad : 0.000704s : 1.64% validate : 0.000057s : 0.13% Time group info: ------[substitution.] 0.000271 48 14.66% : 0.000040s : 6: substitution.cast_eliminate 1.07% : 0.000003s : 4: substitution.elim_not_effective 0.72% : 0.000002s : 4: substitution.fold_const_symbol 3.34% : 0.000009s : 6: substitution.graph_param_transform 67.15% : 0.000182s : 4: substitution.inline 2.46% : 0.000007s : 8: substitution.j_node_and_user_rematch 3.30% : 0.000009s : 8: substitution.remove_not_recompute_node 2.30% : 0.000006s : 4: substitution.replace_old_param 5.00% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.035298 2 97.57% : 0.034441s : 1: type_inference.infer 2.43% : 0.000857s : 1: type_inference.specialize ------[replace.] 0.000069 8 61.17% : 0.000042s : 4: replace.inline 38.83% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000190 8 93.90% : 0.000179s : 4: match.inline 6.10% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000310 1730 0.93% : 0.000003s : 17: predicate.accumulaten_eliminater 0.88% : 0.000003s : 6: predicate.ad_related_special_op_eliminate 0.57% : 0.000002s : 12: predicate.addn_check_dump 0.99% : 0.000003s : 17: predicate.addn_zero_filter 0.82% : 0.000003s : 17: predicate.adjust_all_reduce_mul_add 2.08% : 0.000006s : 29: predicate.arithmetic_simplify 1.22% : 0.000004s : 17: predicate.cast_eliminate 0.61% : 0.000002s : 12: predicate.check_bprop_eliminate 0.53% : 0.000002s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.67% : 0.000002s : 12: predicate.depend_value_elim 0.88% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.02% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.85% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.07% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 6: predicate.elim_not_effective 0.53% : 0.000002s : 6: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000004s : 23: predicate.environ_add_const_eliminate 1.05% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.21% : 0.000004s : 23: predicate.environ_get_depend_swap 1.68% : 0.000005s : 35: predicate.environ_get_eliminate 1.12% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.20% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.04% : 0.000006s : 25: predicate.float_depend_g_call 0.58% : 0.000002s : 12: predicate.float_environ_get_switch 0.86% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.77% : 0.000002s : 12: predicate.get_grad_eliminate 0.24% : 0.000001s : 6: predicate.graph_param_transform 0.63% : 0.000002s : 12: predicate.incorporate_call 0.53% : 0.000002s : 12: predicate.incorporate_call_switch 6.20% : 0.000019s : 78: predicate.inline 1.11% : 0.000003s : 12: predicate.inline_without_move 0.32% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.92% : 0.000003s : 12: predicate.less_batch_normalization 1.88% : 0.000006s : 33: predicate.list_to_tuple_eliminator_ 2.61% : 0.000008s : 50: predicate.load_eliminater 1.19% : 0.000004s : 6: predicate.loop_unroll_after_grad 1.85% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.69% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.62% : 0.000002s : 12: predicate.merge_addn 0.63% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.60% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 17: predicate.minmaximum_grad 1.38% : 0.000004s : 6: predicate.mutable_eliminate 0.48% : 0.000001s : 6: predicate.opt_reshape 0.60% : 0.000002s : 6: predicate.parallel_virtual_node 1.68% : 0.000005s : 25: predicate.partial_defer_inline 1.46% : 0.000005s : 27: predicate.partial_eliminate 0.98% : 0.000003s : 17: predicate.print_const_string_wrapper 0.65% : 0.000002s : 12: predicate.reduce_all_const_elim 1.21% : 0.000004s : 17: predicate.reduce_eliminate 2.41% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 12: predicate.remove_not_recompute_node 1.19% : 0.000004s : 33: predicate.replace_applicator 0.43% : 0.000001s : 12: predicate.replace_old_param 0.29% : 0.000001s : 6: predicate.reset_defer_inline 0.99% : 0.000003s : 17: predicate.reshape_eliminate 0.63% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 6: predicate.row_tensor_eliminate 0.73% : 0.000002s : 12: predicate.same_eliminate 0.42% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 12: predicate.shard_identity_eliminate 0.94% : 0.000003s : 12: predicate.special_op_eliminate 0.79% : 0.000002s : 12: predicate.specialize_transform 1.10% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 1.01% : 0.000003s : 12: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.32% : 0.000004s : 25: predicate.switch_defer_inline 1.91% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.52% : 0.000014s : 81: predicate.switch_simplify 0.89% : 0.000003s : 17: predicate.tile_eliminate 0.91% : 0.000003s : 17: predicate.transpose_eliminate 1.61% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000005s : 29: predicate.tuple_list_get_item_depend_reorder 3.11% : 0.000010s : 45: predicate.tuple_list_get_item_eliminator 1.70% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.75% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.34% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.01% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 6: predicate.value_based_eliminate 0.67% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.69% : 0.000002s : 12: predicate.virtual_output_eliminate 0.29% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000695 11 51.54% : 0.000358s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.46% : 0.000337s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.061980 192 0.01% : 0.000004s : 1: ForceFp32Comm 6.16% : 0.003820s : 1: add_attr 6.14% : 0.003804s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.13% : 0.000078s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.13% : 0.000080s : 1: auto_monad 0.04% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000005s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.82% : 0.000510s : 1: bootstrap 0.07% : 0.000041s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000023s : 1: control_data_broadcast_order 0.02% : 0.000013s : 1: convert_after_rewriter 0.06% : 0.000035s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000007s : 1: detach_backward 0.02% : 0.000012s : 1: environ_conv 0.05% : 0.000030s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000009s : 1: label_micro_interleaved_index 1.16% : 0.000717s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.31% : 0.000809s : 1: mutable_eliminate 0.01% : 0.000009s : 1: offloading_packed_experts 0.04% : 0.000026s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000032s : 1: opt.transform.mutable_eliminate 2.71% : 0.001680s : 78: opt.transform.opt_a 0.07% : 0.000045s : 1: opt.transform.opt_after_cconv 0.07% : 0.000042s : 1: opt.transform.opt_after_jit_grad 0.28% : 0.000173s : 28: opt.transform.opt_b 0.13% : 0.000078s : 2: opt.transform.opt_trans_graph 0.09% : 0.000058s : 4: opt.transform.symbol_engine_opt 6.47% : 0.004013s : 1: opt_a 0.27% : 0.000165s : 1: opt_after_cconv 1.16% : 0.000719s : 1: opt_after_jit_grad 0.54% : 0.000335s : 1: opt_b 11.42% : 0.007080s : 1: optimize 0.05% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000010s : 1: order_py_execute_after_rewriter 0.05% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000006s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000050s : 1: pre_auto_parallel 0.07% : 0.000045s : 1: py_interpret_to_execute 0.04% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.10% : 0.000064s : 1: remove_dup_value 1.24% : 0.000767s : 1: renormalize.infer 0.76% : 0.000473s : 1: renormalize.specialize 0.01% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000058s : 1: rewriter_after_opt_a 0.18% : 0.000111s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.02% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.18% : 0.000109s : 1: symbol_engine_optimizer 0.18% : 0.000111s : 1: tuple_transform 57.12% : 0.035401s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:53.588.124 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:53.588.394 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0452067, [21] [bootstrap]: 0.00048293 [type_inference]: 0.0196671 [event_method]: 2.281e-05 [auto_monad]: 6.661e-05 [graph_reusing]: 6.38e-06 [inline]: 2.56998e-06 [add_attr]: 0.00373027, [1] [add_attr_with_inline]: 0.00372016, [1] [Cycle 1]: 8.22e-05, [2] [tag_attr]: 2.208e-05 [meta_addattr_fg_expand]: 6.04999e-06 [parallel-infer-symbol]: 3.41001e-06 [pre_auto_parallel]: 3.803e-05 [insert-virtual-dataset]: 2.43998e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.31e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.0185184, [53] [py_interpret_to_execute]: 3.224e-05 [rewriter_before_opt_a]: 9.749e-05 [opt_a]: 0.00357267, [2] [Cycle 1]: 0.0026019, [45] [expand_dump_flag]: 3.39001e-06 [switch_simplify]: 4.238e-05 [loop_unroll]: 3.26e-05 [a_1]: 0.00069989 [with_stream_mark]: 1.99e-05 [recompute_prepare]: 1.17e-05 [updatestate_depend_eliminate]: 4.94e-06 [updatestate_assign_eliminate]: 3.98001e-06 [updatestate_loads_eliminate]: 4.05998e-06 [parameter_eliminate]: 2.19999e-06 [a_2]: 0.00013506 [accelerated_algorithm]: 9.54999e-06 [shard]: 2.39001e-06 [meta_shard_fg_expand]: 2.69001e-06 [shard_inline]: 8.18001e-06 [merge_send_recv]: 9.69e-06 [auto_parallel]: 8.32e-06 [parallel]: 1.973e-05 [flash_sp]: 1.033e-05 [merge_comm]: 4.61002e-06 [allreduce_fusion]: 4.22003e-06 [matmul_add_comm_reduction]: 1.098e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 9.69e-06 [virtual_dataset]: 8.13999e-06 [get_grad_eliminate_]: 7.86001e-06 [virtual_output]: 8.59e-06 [merge_forward]: 4.77e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 1.215e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.853e-05 [merge_recompute_call_nodes]: 1.60999e-06 [before_grad]: 1.38e-05 [set_forward_comm_id_for_comm_node_pass]: 4.65001e-06 [meta_fg_expand]: 4.02998e-06 [flash_sp_send_recv_attached]: 3.23e-06 [receive_attached]: 2.12001e-06 [after_resolve]: 1.455e-05 [a_after_grad]: 1.322e-05 [renormalize]: 0.00087041 [add_forward_monad_depend]: 6.60997e-06 [auto_monad_grad]: 2.74001e-06 [auto_monad_eliminator]: 1.771e-05 [cse]: 3.759e-05 [a_3]: 7.499e-05 [Cycle 2]: 0.00095628, [45] [expand_dump_flag]: 2.00002e-06 [switch_simplify]: 9.14e-06 [loop_unroll]: 7.53e-06 [a_1]: 0.0001779 [with_stream_mark]: 1.408e-05 [recompute_prepare]: 8.05e-06 [updatestate_depend_eliminate]: 3.97e-06 [updatestate_assign_eliminate]: 2.84999e-06 [updatestate_loads_eliminate]: 3.14999e-06 [parameter_eliminate]: 1.22e-06 [a_2]: 0.00012032 [accelerated_algorithm]: 7.72002e-06 [shard]: 2.04e-06 [meta_shard_fg_expand]: 1.89999e-06 [shard_inline]: 7.82002e-06 [merge_send_recv]: 6.26e-06 [auto_parallel]: 7.23e-06 [parallel]: 6.23002e-06 [flash_sp]: 3.63e-06 [merge_comm]: 4.42e-06 [allreduce_fusion]: 4.13999e-06 [matmul_add_comm_reduction]: 7.80998e-06 [allreduce_slice_to_reducescatter]: 5.50004e-07 [virtual_shard_identity]: 8.57998e-06 [virtual_dataset]: 7.55e-06 [get_grad_eliminate_]: 7.13998e-06 [virtual_output]: 7.23999e-06 [merge_forward]: 3.86999e-06 [cell_reuse_recompute_pass]: 2.18998e-06 [offload_activation]: 8.43001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.736e-05 [merge_recompute_call_nodes]: 8.89995e-07 [before_grad]: 1.246e-05 [set_forward_comm_id_for_comm_node_pass]: 4.48999e-06 [meta_fg_expand]: 3.11999e-06 [flash_sp_send_recv_attached]: 9.99979e-07 [receive_attached]: 1.67001e-06 [after_resolve]: 1.23e-05 [a_after_grad]: 1.151e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.41998e-06 [auto_monad_grad]: 1.35999e-06 [auto_monad_eliminator]: 9.14e-06 [cse]: 2.002e-05 [a_3]: 5.955e-05 [py_interpret_to_execute_after_opt_a]: 1.636e-05 [slice_cell_reuse_recomputed_activation]: 5.44e-06 [rewriter_after_opt_a]: 4.653e-05 [convert_after_rewriter]: 1.047e-05 [order_py_execute_after_rewriter]: 8.57e-06 [mutable_eliminate]: 0.00066151 [opt_b]: 0.00031656, [1] [Cycle 1]: 0.00030585, [7] [b_1]: 0.00020075 [b_2]: 9.49999e-06 [updatestate_depend_eliminate]: 6.64001e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 2.96999e-06 [renormalize]: 5.8001e-07 [cse]: 2.524e-05 [optimize_parallel_all_gather_comm]: 2.332e-05 [overlap_param_gather]: 5.87001e-06 [cconv]: 3.486e-05 [loop_unroll]: 0.00048474 [opt_after_cconv]: 0.0001457, [1] [Cycle 1]: 0.00013696, [7] [c_1]: 3.798e-05 [parameter_eliminate]: 3.60998e-06 [updatestate_depend_eliminate]: 6.53998e-06 [updatestate_assign_eliminate]: 3.35e-06 [updatestate_loads_eliminate]: 2.89999e-06 [cse]: 2.465e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 1.896e-05 [tuple_transform]: 0.00022866, [1] [Cycle 1]: 0.00020171, [4] [d_1]: 0.00010568 [none_parameter_eliminate]: 6.23e-06 [renormalize]: 1.27e-06 [switch_simplify]: 1.16e-05 [partial_unused_args_eliminate]: 5.51e-06 [add_recomputation]: 9.255e-05 [cse_after_recomputation]: 6.264e-05, [1] [Cycle 1]: 5.444e-05, [1] [cse]: 4.355e-05 [environ_conv]: 1.267e-05 [swap_dp_allreduce_reducescatter]: 1.166e-05 [bias_add_comm_swap]: 6.74999e-06 [label_micro_interleaved_index]: 1.239e-05 [label_fine_grained_interleaved_index]: 5.62001e-06 [merge_cast_opt]: 4.22998e-06 [slice_recompute_activation]: 4.65999e-06 [micro_interleaved_order_control]: 5.20999e-06 [assign_add_opt]: 4.43999e-06 [ForceFp32Comm]: 3.38999e-06 [remove_cast_before_assign_add]: 3.5e-06 [full_micro_interleaved_order_control]: 4.52003e-06 [reorder_send_recv_between_fp_bp]: 5.25001e-06 [comm_op_add_attrs]: 3.7e-06 [add_comm_op_reuse_tag]: 3.23e-06 [interleave_split_concat_branches]: 3.4e-06 [interleave_parallel_branches]: 3.5e-06 [overlap_opt_shard_in_pipeline]: 3.86001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.28001e-06 [control_data_broadcast_order]: 2.38e-05 [grouped_pairwise_exchange_alltoall]: 3.87002e-06 [offloading_packed_experts]: 8.28999e-06 [overlap_recompute_and_grad_model_parallel]: 8.68001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.95e-06 [overlap_recompute_allgather_and_fa_grad]: 3.89002e-06 [overlap_recompute_comm]: 5.40999e-06 [overlap_grad_ring_attention]: 7.6e-06 [overlap_grad_flash_sp]: 3.224e-05 [begin_end_overlap_inline]: 3.14001e-06 [split_matmul_comm_elemetwise]: 4.55001e-06 [split_layernorm_comm]: 4.2e-06 [handle_group_info]: 4.15e-06 [symbol_engine_optimizer]: 0.00012898, [1] [Cycle 1]: 0.00012073, [6] [build]: 5.05001e-06 [elim_shapecalc]: 1.959e-05 [elim_not_effective]: 2.007e-05 [opt_reshape]: 9.59e-06 [fold_const_symbol]: 1.353e-05 [renormalize]: 3.30008e-07 [detach_backward]: 6.41e-06 [pipeline_parallel_scheduler]: 2.11e-06 [auto_monad_reorder]: 3.372e-05 [get_jit_bprop_graph]: 2.22001e-06 [rewriter_after_jit_bprop_graph]: 8.64e-06 [opt_after_jit_grad]: 0.00080987 [validate]: 5.779e-05 Sums bootstrap : 0.000483s : 1.83% type_inference : 0.019667s : 74.56% event_method : 0.000023s : 0.09% auto_monad : 0.000067s : 0.25% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000038s : 0.14% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000032s : 0.12% optimize.rewriter_before_opt_a : 0.000097s : 0.37% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000052s : 0.20% optimize.opt_a.loop_unroll : 0.000040s : 0.15% optimize.opt_a.a_1 : 0.000878s : 3.33% optimize.opt_a.with_stream_mark : 0.000034s : 0.13% optimize.opt_a.recompute_prepare : 0.000020s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000255s : 0.97% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.07% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000016s : 0.06% optimize.opt_a.merge_send_recv : 0.000016s : 0.06% optimize.opt_a.auto_parallel : 0.000016s : 0.06% optimize.opt_a.parallel : 0.000026s : 0.10% optimize.opt_a.flash_sp : 0.000014s : 0.05% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.07% optimize.opt_a.virtual_dataset : 0.000016s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.06% optimize.opt_a.virtual_output : 0.000016s : 0.06% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.14% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000027s : 0.10% optimize.opt_a.a_after_grad : 0.000025s : 0.09% optimize.opt_a.renormalize : 0.000870s : 3.30% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.10% optimize.opt_a.cse : 0.000058s : 0.22% optimize.opt_a.a_3 : 0.000135s : 0.51% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000047s : 0.18% optimize.convert_after_rewriter : 0.000010s : 0.04% optimize.order_py_execute_after_rewriter : 0.000009s : 0.03% optimize.mutable_eliminate : 0.000662s : 2.51% optimize.opt_b.b_1 : 0.000201s : 0.76% optimize.opt_b.b_2 : 0.000009s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000025s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.09% optimize.overlap_param_gather : 0.000006s : 0.02% optimize.cconv : 0.000035s : 0.13% optimize.loop_unroll : 0.000485s : 1.84% optimize.opt_after_cconv.c_1 : 0.000038s : 0.14% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000025s : 0.09% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.07% optimize.tuple_transform.d_1 : 0.000106s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000006s : 0.02% optimize.tuple_transform.renormalize : 0.000001s : 0.00% optimize.tuple_transform.switch_simplify : 0.000012s : 0.04% optimize.partial_unused_args_eliminate : 0.000006s : 0.02% optimize.add_recomputation : 0.000093s : 0.35% optimize.cse_after_recomputation.cse : 0.000044s : 0.17% optimize.environ_conv : 0.000013s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000012s : 0.04% optimize.bias_add_comm_swap : 0.000007s : 0.03% optimize.label_micro_interleaved_index : 0.000012s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000003s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000024s : 0.09% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000008s : 0.03% optimize.overlap_grad_flash_sp : 0.000032s : 0.12% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000004s : 0.02% optimize.symbol_engine_optimizer.build : 0.000005s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000020s : 0.07% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.08% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000034s : 0.13% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000009s : 0.03% opt_after_jit_grad : 0.000810s : 3.07% validate : 0.000058s : 0.22% Time group info: ------[substitution.] 0.000235 38 10.73% : 0.000025s : 3: substitution.cast_eliminate 1.29% : 0.000003s : 3: substitution.elim_not_effective 0.70% : 0.000002s : 3: substitution.fold_const_symbol 5.91% : 0.000014s : 5: substitution.graph_param_transform 68.54% : 0.000161s : 4: substitution.inline 2.09% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.66% : 0.000006s : 6: substitution.remove_not_recompute_node 2.46% : 0.000006s : 4: substitution.replace_old_param 5.63% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.019603 2 95.88% : 0.018795s : 1: type_inference.infer 4.12% : 0.000808s : 1: type_inference.specialize ------[replace.] 0.000064 8 62.77% : 0.000040s : 4: replace.inline 37.23% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000169 8 93.31% : 0.000158s : 4: match.inline 6.69% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000254 1504 0.84% : 0.000002s : 15: predicate.accumulaten_eliminater 1.11% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 10: predicate.addn_check_dump 0.86% : 0.000002s : 15: predicate.addn_zero_filter 0.77% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.09% : 0.000005s : 25: predicate.arithmetic_simplify 1.08% : 0.000003s : 15: predicate.cast_eliminate 0.61% : 0.000002s : 10: predicate.check_bprop_eliminate 0.58% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.69% : 0.000002s : 10: predicate.depend_value_elim 0.90% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.00% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.14% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.30% : 0.000001s : 5: predicate.elim_not_effective 0.62% : 0.000002s : 5: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.06% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.02% : 0.000003s : 20: predicate.environ_get_depend_swap 1.66% : 0.000004s : 30: predicate.environ_get_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.11% : 0.000005s : 23: predicate.float_depend_g_call 0.55% : 0.000001s : 10: predicate.float_environ_get_switch 0.87% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.72% : 0.000002s : 10: predicate.get_grad_eliminate 0.57% : 0.000001s : 5: predicate.graph_param_transform 0.63% : 0.000002s : 10: predicate.incorporate_call 0.53% : 0.000001s : 10: predicate.incorporate_call_switch 6.03% : 0.000015s : 68: predicate.inline 0.87% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.88% : 0.000002s : 10: predicate.less_batch_normalization 2.22% : 0.000006s : 29: predicate.list_to_tuple_eliminator_ 2.37% : 0.000006s : 44: predicate.load_eliminater 0.96% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.20% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.52% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.61% : 0.000002s : 10: predicate.merge_addn 0.58% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.63% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 15: predicate.minmaximum_grad 1.17% : 0.000003s : 5: predicate.mutable_eliminate 0.52% : 0.000001s : 5: predicate.opt_reshape 0.33% : 0.000001s : 5: predicate.parallel_virtual_node 1.71% : 0.000004s : 23: predicate.partial_defer_inline 1.53% : 0.000004s : 24: predicate.partial_eliminate 0.87% : 0.000002s : 15: predicate.print_const_string_wrapper 0.81% : 0.000002s : 10: predicate.reduce_all_const_elim 1.18% : 0.000003s : 15: predicate.reduce_eliminate 2.53% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 10: predicate.remove_not_recompute_node 1.28% : 0.000003s : 29: predicate.replace_applicator 0.50% : 0.000001s : 10: predicate.replace_old_param 0.25% : 0.000001s : 5: predicate.reset_defer_inline 1.01% : 0.000003s : 15: predicate.reshape_eliminate 0.62% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 5: predicate.row_tensor_eliminate 0.81% : 0.000002s : 10: predicate.same_eliminate 0.41% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.71% : 0.000002s : 10: predicate.shard_identity_eliminate 0.70% : 0.000002s : 10: predicate.special_op_eliminate 0.73% : 0.000002s : 10: predicate.specialize_transform 0.83% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.91% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.46% : 0.000004s : 23: predicate.switch_defer_inline 1.93% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.89% : 0.000012s : 74: predicate.switch_simplify 0.84% : 0.000002s : 15: predicate.tile_eliminate 0.85% : 0.000002s : 15: predicate.transpose_eliminate 1.62% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.66% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.53% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.66% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.78% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.33% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 2.97% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.56% : 0.000001s : 5: predicate.value_based_eliminate 0.65% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.68% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000680 11 57.23% : 0.000389s : 5: func_graph_cloner_run.FuncGraphClonerGraph 42.77% : 0.000291s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.057923 192 0.01% : 0.000006s : 1: ForceFp32Comm 6.46% : 0.003741s : 1: add_attr 6.43% : 0.003724s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.17% : 0.000097s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.13% : 0.000077s : 1: auto_monad 0.07% : 0.000042s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000010s : 1: bias_add_comm_swap 0.92% : 0.000530s : 1: bootstrap 0.07% : 0.000038s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.05% : 0.000028s : 1: control_data_broadcast_order 0.02% : 0.000014s : 1: convert_after_rewriter 0.11% : 0.000066s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000044s : 1: detach_backward 0.03% : 0.000016s : 1: environ_conv 0.06% : 0.000034s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000015s : 1: label_micro_interleaved_index 0.85% : 0.000492s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.15% : 0.000669s : 1: mutable_eliminate 0.02% : 0.000011s : 1: offloading_packed_experts 0.03% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000019s : 1: opt.transform.mutable_eliminate 2.44% : 0.001413s : 78: opt.transform.opt_a 0.06% : 0.000036s : 1: opt.transform.opt_after_cconv 0.07% : 0.000041s : 1: opt.transform.opt_after_jit_grad 0.24% : 0.000136s : 28: opt.transform.opt_b 0.19% : 0.000108s : 2: opt.transform.opt_trans_graph 0.10% : 0.000057s : 4: opt.transform.symbol_engine_opt 6.17% : 0.003576s : 1: opt_a 0.26% : 0.000149s : 1: opt_after_cconv 1.42% : 0.000824s : 1: opt_after_jit_grad 0.55% : 0.000320s : 1: opt_b 34.45% : 0.019955s : 1: optimize 0.05% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.06% : 0.000036s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000010s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000009s : 1: partial_unused_args_eliminate 0.02% : 0.000011s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.08% : 0.000045s : 1: pre_auto_parallel 0.06% : 0.000036s : 1: py_interpret_to_execute 0.03% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000008s : 1: remove_cast_before_assign_add 0.04% : 0.000022s : 1: remove_dup_value 0.84% : 0.000489s : 1: renormalize.infer 0.64% : 0.000372s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000015s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000050s : 1: rewriter_after_opt_a 0.18% : 0.000102s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.03% : 0.000015s : 1: swap_dp_allreduce_reducescatter 0.23% : 0.000133s : 1: symbol_engine_optimizer 0.41% : 0.000239s : 1: tuple_transform 34.04% : 0.019719s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:54.544.40 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0470353, [21] [bootstrap]: 0.0004957 [type_inference]: 0.0357936 [event_method]: 2.365e-05 [auto_monad]: 7.16e-05 [graph_reusing]: 6.31e-06 [inline]: 3.18e-06 [add_attr]: 0.00410401, [1] [add_attr_with_inline]: 0.00409249, [1] [Cycle 1]: 7.215e-05, [2] [tag_attr]: 2.377e-05 [meta_addattr_fg_expand]: 5.93998e-06 [parallel-infer-symbol]: 3.18e-06 [pre_auto_parallel]: 3.918e-05 [insert-virtual-dataset]: 2.79999e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 2.06e-06 [pipeline_split]: 1.62999e-06 [optimize]: 0.00565041, [53] [py_interpret_to_execute]: 2.927e-05 [rewriter_before_opt_a]: 9.705e-05 [opt_a]: 0.00324919, [2] [Cycle 1]: 0.00244969, [45] [expand_dump_flag]: 3.21001e-06 [switch_simplify]: 4.465e-05 [loop_unroll]: 3.041e-05 [a_1]: 0.00069905 [with_stream_mark]: 2.139e-05 [recompute_prepare]: 1.024e-05 [updatestate_depend_eliminate]: 4.89e-06 [updatestate_assign_eliminate]: 4e-06 [updatestate_loads_eliminate]: 3.7e-06 [parameter_eliminate]: 2.08002e-06 [a_2]: 0.00010292 [accelerated_algorithm]: 8.74998e-06 [shard]: 1.94999e-06 [meta_shard_fg_expand]: 2.26e-06 [shard_inline]: 7.92e-06 [merge_send_recv]: 1.089e-05 [auto_parallel]: 7.83001e-06 [parallel]: 1.961e-05 [flash_sp]: 9.31998e-06 [merge_comm]: 4.57e-06 [allreduce_fusion]: 4.33001e-06 [matmul_add_comm_reduction]: 1.009e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 1.024e-05 [virtual_dataset]: 7.51999e-06 [get_grad_eliminate_]: 8.80001e-06 [virtual_output]: 7.36001e-06 [merge_forward]: 5.12999e-06 [cell_reuse_recompute_pass]: 1.24003e-06 [offload_activation]: 1.126e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.589e-05 [merge_recompute_call_nodes]: 1.54998e-06 [before_grad]: 1.378e-05 [set_forward_comm_id_for_comm_node_pass]: 4.75001e-06 [meta_fg_expand]: 3.3e-06 [flash_sp_send_recv_attached]: 2.68998e-06 [receive_attached]: 2.06998e-06 [after_resolve]: 1.342e-05 [a_after_grad]: 1.201e-05 [renormalize]: 0.00092253 [add_forward_monad_depend]: 6.23998e-06 [auto_monad_grad]: 2.48e-06 [auto_monad_eliminator]: 1.932e-05 [cse]: 3.982e-05 [a_3]: 6.275e-05 [Cycle 2]: 0.00078913, [45] [expand_dump_flag]: 1.50001e-06 [switch_simplify]: 9.83002e-06 [loop_unroll]: 7.43e-06 [a_1]: 0.00017808 [with_stream_mark]: 1.605e-05 [recompute_prepare]: 7.85e-06 [updatestate_depend_eliminate]: 3.75e-06 [updatestate_assign_eliminate]: 2.99999e-06 [updatestate_loads_eliminate]: 3.91999e-06 [parameter_eliminate]: 1.52001e-06 [a_2]: 9.278e-05 [accelerated_algorithm]: 7.9e-06 [shard]: 1.96998e-06 [meta_shard_fg_expand]: 1.73002e-06 [shard_inline]: 7.46999e-06 [merge_send_recv]: 1.036e-05 [auto_parallel]: 8.33001e-06 [parallel]: 5.59998e-06 [flash_sp]: 3.40998e-06 [merge_comm]: 5.27001e-06 [allreduce_fusion]: 4.32e-06 [matmul_add_comm_reduction]: 8.20999e-06 [allreduce_slice_to_reducescatter]: 7.29982e-07 [virtual_shard_identity]: 8.75001e-06 [virtual_dataset]: 9.54e-06 [get_grad_eliminate_]: 7.02002e-06 [virtual_output]: 6.81999e-06 [merge_forward]: 4.36002e-06 [cell_reuse_recompute_pass]: 1.74e-06 [offload_activation]: 9.88002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.466e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 1.151e-05 [set_forward_comm_id_for_comm_node_pass]: 4.02e-06 [meta_fg_expand]: 3.16999e-06 [flash_sp_send_recv_attached]: 1.40001e-06 [receive_attached]: 1.37999e-06 [after_resolve]: 1.276e-05 [a_after_grad]: 1.114e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.75001e-06 [auto_monad_grad]: 1.57001e-06 [auto_monad_eliminator]: 9.62001e-06 [cse]: 2.035e-05 [a_3]: 4.597e-05 [py_interpret_to_execute_after_opt_a]: 1.514e-05 [slice_cell_reuse_recomputed_activation]: 2.29001e-06 [rewriter_after_opt_a]: 4.279e-05 [convert_after_rewriter]: 7.53999e-06 [order_py_execute_after_rewriter]: 5.94e-06 [mutable_eliminate]: 0.00067649 [opt_b]: 0.00026273, [1] [Cycle 1]: 0.00025486, [7] [b_1]: 0.00016494 [b_2]: 9.66e-06 [updatestate_depend_eliminate]: 8.70999e-06 [updatestate_assign_eliminate]: 2.98e-06 [updatestate_loads_eliminate]: 3.30998e-06 [renormalize]: 6.80011e-07 [cse]: 2.851e-05 [optimize_parallel_all_gather_comm]: 1.825e-05 [overlap_param_gather]: 2.36e-06 [cconv]: 3.145e-05 [loop_unroll]: 0.00046601 [opt_after_cconv]: 0.00012157, [1] [Cycle 1]: 0.00011467, [7] [c_1]: 3.629e-05 [parameter_eliminate]: 3.76999e-06 [updatestate_depend_eliminate]: 6.96001e-06 [updatestate_assign_eliminate]: 3.21999e-06 [updatestate_loads_eliminate]: 3.26001e-06 [cse]: 2.479e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.683e-05 [tuple_transform]: 9.224e-05, [1] [Cycle 1]: 8.731e-05, [4] [d_1]: 5.63e-05 [none_parameter_eliminate]: 1.66002e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 9.28002e-06 [partial_unused_args_eliminate]: 2.36e-06 [add_recomputation]: 5.97e-05 [cse_after_recomputation]: 2.936e-05, [1] [Cycle 1]: 2.374e-05, [1] [cse]: 1.739e-05 [environ_conv]: 7.15998e-06 [swap_dp_allreduce_reducescatter]: 5.82999e-06 [bias_add_comm_swap]: 3.04999e-06 [label_micro_interleaved_index]: 4.07e-06 [label_fine_grained_interleaved_index]: 3.01001e-06 [merge_cast_opt]: 1.32999e-06 [slice_recompute_activation]: 2.16e-06 [micro_interleaved_order_control]: 2.14999e-06 [assign_add_opt]: 1.20999e-06 [ForceFp32Comm]: 8.89995e-07 [remove_cast_before_assign_add]: 7.7e-07 [full_micro_interleaved_order_control]: 2.37999e-06 [reorder_send_recv_between_fp_bp]: 2.56998e-06 [comm_op_add_attrs]: 1.05001e-06 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.45001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.27999e-06 [control_data_broadcast_order]: 1.598e-05 [grouped_pairwise_exchange_alltoall]: 1.50999e-06 [offloading_packed_experts]: 4.32e-06 [overlap_recompute_and_grad_model_parallel]: 5.29e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.34e-06 [overlap_recompute_allgather_and_fa_grad]: 1.50001e-06 [overlap_recompute_comm]: 2.23998e-06 [overlap_grad_ring_attention]: 4.63999e-06 [overlap_grad_flash_sp]: 2.319e-05 [begin_end_overlap_inline]: 5.8001e-07 [split_matmul_comm_elemetwise]: 2.12999e-06 [split_layernorm_comm]: 1.83002e-06 [handle_group_info]: 9.70002e-07 [symbol_engine_optimizer]: 8.765e-05, [1] [Cycle 1]: 8.317e-05, [6] [build]: 3.39001e-06 [elim_shapecalc]: 1.157e-05 [elim_not_effective]: 1.574e-05 [opt_reshape]: 8.50001e-06 [fold_const_symbol]: 1.286e-05 [renormalize]: 1.8999e-07 [detach_backward]: 2.41e-06 [pipeline_parallel_scheduler]: 1.69998e-06 [auto_monad_reorder]: 2.062e-05 [get_jit_bprop_graph]: 1.87001e-06 [rewriter_after_jit_bprop_graph]: 5.05001e-06 [opt_after_jit_grad]: 0.00057671 [validate]: 5.493e-05 Sums bootstrap : 0.000496s : 1.18% type_inference : 0.035794s : 85.38% event_method : 0.000024s : 0.06% auto_monad : 0.000072s : 0.17% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000039s : 0.09% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000029s : 0.07% optimize.rewriter_before_opt_a : 0.000097s : 0.23% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000054s : 0.13% optimize.opt_a.loop_unroll : 0.000038s : 0.09% optimize.opt_a.a_1 : 0.000877s : 2.09% optimize.opt_a.with_stream_mark : 0.000037s : 0.09% optimize.opt_a.recompute_prepare : 0.000018s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000196s : 0.47% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.04% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000015s : 0.04% optimize.opt_a.merge_send_recv : 0.000021s : 0.05% optimize.opt_a.auto_parallel : 0.000016s : 0.04% optimize.opt_a.parallel : 0.000025s : 0.06% optimize.opt_a.flash_sp : 0.000013s : 0.03% optimize.opt_a.merge_comm : 0.000010s : 0.02% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.05% optimize.opt_a.virtual_dataset : 0.000017s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.04% optimize.opt_a.virtual_output : 0.000014s : 0.03% optimize.opt_a.merge_forward : 0.000009s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000026s : 0.06% optimize.opt_a.a_after_grad : 0.000023s : 0.06% optimize.opt_a.renormalize : 0.000923s : 2.20% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.07% optimize.opt_a.cse : 0.000060s : 0.14% optimize.opt_a.a_3 : 0.000109s : 0.26% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000043s : 0.10% optimize.convert_after_rewriter : 0.000008s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.01% optimize.mutable_eliminate : 0.000676s : 1.61% optimize.opt_b.b_1 : 0.000165s : 0.39% optimize.opt_b.b_2 : 0.000010s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000029s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000031s : 0.08% optimize.loop_unroll : 0.000466s : 1.11% optimize.opt_after_cconv.c_1 : 0.000036s : 0.09% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000025s : 0.06% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.04% optimize.tuple_transform.d_1 : 0.000056s : 0.13% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000060s : 0.14% optimize.cse_after_recomputation.cse : 0.000017s : 0.04% optimize.environ_conv : 0.000007s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000023s : 0.06% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.05% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000577s : 1.38% validate : 0.000055s : 0.13% Time group info: ------[substitution.] 0.000234 38 12.15% : 0.000028s : 3: substitution.cast_eliminate 0.93% : 0.000002s : 3: substitution.elim_not_effective 0.68% : 0.000002s : 3: substitution.fold_const_symbol 2.82% : 0.000007s : 5: substitution.graph_param_transform 70.66% : 0.000166s : 4: substitution.inline 2.10% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.67% : 0.000006s : 6: substitution.remove_not_recompute_node 2.47% : 0.000006s : 4: substitution.replace_old_param 5.53% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.035707 2 97.47% : 0.034803s : 1: type_inference.infer 2.53% : 0.000905s : 1: type_inference.specialize ------[replace.] 0.000063 8 64.57% : 0.000041s : 4: replace.inline 35.43% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000174 8 93.58% : 0.000163s : 4: match.inline 6.42% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000243 1504 0.86% : 0.000002s : 15: predicate.accumulaten_eliminater 0.79% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 10: predicate.addn_check_dump 0.96% : 0.000002s : 15: predicate.addn_zero_filter 0.79% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.02% : 0.000005s : 25: predicate.arithmetic_simplify 1.07% : 0.000003s : 15: predicate.cast_eliminate 0.61% : 0.000001s : 10: predicate.check_bprop_eliminate 0.61% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.76% : 0.000002s : 10: predicate.depend_value_elim 0.93% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.97% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 5: predicate.elim_not_effective 0.42% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.27% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.07% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.07% : 0.000003s : 20: predicate.environ_get_depend_swap 1.69% : 0.000004s : 30: predicate.environ_get_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.21% : 0.000005s : 23: predicate.float_depend_g_call 0.57% : 0.000001s : 10: predicate.float_environ_get_switch 0.84% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.91% : 0.000002s : 10: predicate.get_grad_eliminate 0.31% : 0.000001s : 5: predicate.graph_param_transform 0.65% : 0.000002s : 10: predicate.incorporate_call 0.53% : 0.000001s : 10: predicate.incorporate_call_switch 6.14% : 0.000015s : 68: predicate.inline 0.77% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.86% : 0.000002s : 10: predicate.less_batch_normalization 1.84% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.45% : 0.000006s : 44: predicate.load_eliminater 1.24% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.10% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.68% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.60% : 0.000001s : 10: predicate.merge_addn 0.58% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.61% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 15: predicate.minmaximum_grad 1.25% : 0.000003s : 5: predicate.mutable_eliminate 0.38% : 0.000001s : 5: predicate.opt_reshape 0.44% : 0.000001s : 5: predicate.parallel_virtual_node 1.82% : 0.000004s : 23: predicate.partial_defer_inline 1.58% : 0.000004s : 24: predicate.partial_eliminate 0.84% : 0.000002s : 15: predicate.print_const_string_wrapper 0.60% : 0.000001s : 10: predicate.reduce_all_const_elim 1.09% : 0.000003s : 15: predicate.reduce_eliminate 2.43% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 10: predicate.remove_not_recompute_node 1.36% : 0.000003s : 29: predicate.replace_applicator 0.55% : 0.000001s : 10: predicate.replace_old_param 0.26% : 0.000001s : 5: predicate.reset_defer_inline 0.92% : 0.000002s : 15: predicate.reshape_eliminate 0.64% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 5: predicate.row_tensor_eliminate 0.82% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 10: predicate.shard_identity_eliminate 0.72% : 0.000002s : 10: predicate.special_op_eliminate 0.79% : 0.000002s : 10: predicate.specialize_transform 1.07% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.42% : 0.000003s : 23: predicate.switch_defer_inline 1.97% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.06% : 0.000012s : 74: predicate.switch_simplify 0.85% : 0.000002s : 15: predicate.tile_eliminate 0.90% : 0.000002s : 15: predicate.transpose_eliminate 1.62% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.84% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.58% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.61% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.77% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.34% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.02% : 0.000007s : 54: predicate.updatestate_useless_node_eliminater 0.49% : 0.000001s : 5: predicate.value_based_eliminate 0.61% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.62% : 0.000002s : 10: predicate.virtual_output_eliminate 0.31% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.51% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000693 11 52.78% : 0.000366s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.22% : 0.000327s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.059310 192 0.01% : 0.000004s : 1: ForceFp32Comm 6.93% : 0.004110s : 1: add_attr 6.91% : 0.004097s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.11% : 0.000064s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.13% : 0.000079s : 1: auto_monad 0.04% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.89% : 0.000529s : 1: bootstrap 0.06% : 0.000035s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000019s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.05% : 0.000032s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000010s : 1: environ_conv 0.05% : 0.000031s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.80% : 0.000474s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.16% : 0.000686s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000020s : 1: opt.transform.mutable_eliminate 2.36% : 0.001399s : 78: opt.transform.opt_a 0.06% : 0.000035s : 1: opt.transform.opt_after_cconv 0.06% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.24% : 0.000140s : 28: opt.transform.opt_b 0.11% : 0.000063s : 2: opt.transform.opt_trans_graph 0.07% : 0.000044s : 4: opt.transform.symbol_engine_opt 5.48% : 0.003252s : 1: opt_a 0.21% : 0.000125s : 1: opt_after_cconv 0.99% : 0.000588s : 1: opt_after_jit_grad 0.45% : 0.000266s : 1: opt_b 9.54% : 0.005655s : 1: optimize 0.04% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.05% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000003s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.07% : 0.000043s : 1: pre_auto_parallel 0.06% : 0.000034s : 1: py_interpret_to_execute 0.03% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000021s : 1: remove_dup_value 0.90% : 0.000533s : 1: renormalize.infer 0.64% : 0.000379s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000047s : 1: rewriter_after_opt_a 0.17% : 0.000102s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000090s : 1: symbol_engine_optimizer 0.16% : 0.000096s : 1: tuple_transform 60.40% : 0.035821s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:54.590.580 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:54.590.897 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0523612, [21] [bootstrap]: 0.00044051 [type_inference]: 0.0235793 [event_method]: 2.231e-05 [auto_monad]: 6.844e-05 [graph_reusing]: 6.45002e-06 [inline]: 3.13e-06 [add_attr]: 0.00383691, [1] [add_attr_with_inline]: 0.00382621, [1] [Cycle 1]: 9.137e-05, [2] [tag_attr]: 2.288e-05 [meta_addattr_fg_expand]: 6.17001e-06 [parallel-infer-symbol]: 3.56999e-06 [pre_auto_parallel]: 3.893e-05 [insert-virtual-dataset]: 2.37001e-06 [parallel-infer-symbol-second]: 6.90023e-07 [dataset_repeat_opt]: 2.19001e-06 [pipeline_split]: 1.80001e-06 [optimize]: 0.0227662, [53] [py_interpret_to_execute]: 3.087e-05 [rewriter_before_opt_a]: 9.118e-05 [opt_a]: 0.0196472, [2] [Cycle 1]: 0.0186634, [45] [expand_dump_flag]: 2.94001e-06 [switch_simplify]: 4.293e-05 [loop_unroll]: 3.257e-05 [a_1]: 0.00064656 [with_stream_mark]: 1.783e-05 [recompute_prepare]: 9.17999e-06 [updatestate_depend_eliminate]: 4.1e-06 [updatestate_assign_eliminate]: 4.25999e-06 [updatestate_loads_eliminate]: 3.06001e-06 [parameter_eliminate]: 1.92999e-06 [a_2]: 0.00011065 [accelerated_algorithm]: 6.93e-06 [shard]: 1.74e-06 [meta_shard_fg_expand]: 1.76003e-06 [shard_inline]: 6.59999e-06 [merge_send_recv]: 8.67e-06 [auto_parallel]: 6.68998e-06 [parallel]: 1.921e-05 [flash_sp]: 8.42998e-06 [merge_comm]: 4.52e-06 [allreduce_fusion]: 3.79002e-06 [matmul_add_comm_reduction]: 9.96e-06 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 8.37e-06 [virtual_dataset]: 6.75002e-06 [get_grad_eliminate_]: 6.22001e-06 [virtual_output]: 6.39001e-06 [merge_forward]: 4.08001e-06 [cell_reuse_recompute_pass]: 1.20001e-06 [offload_activation]: 1.022e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.558e-05 [merge_recompute_call_nodes]: 1.54998e-06 [before_grad]: 1.12e-05 [set_forward_comm_id_for_comm_node_pass]: 3.60003e-06 [meta_fg_expand]: 2.86e-06 [flash_sp_send_recv_attached]: 2.46998e-06 [receive_attached]: 2.43e-06 [after_resolve]: 1.236e-05 [a_after_grad]: 1.127e-05 [renormalize]: 0.0170262 [add_forward_monad_depend]: 9.73002e-06 [auto_monad_grad]: 3.17002e-06 [auto_monad_eliminator]: 2.187e-05 [cse]: 3.237e-05 [a_3]: 7.989e-05 [Cycle 2]: 0.00096449, [45] [expand_dump_flag]: 2.17999e-06 [switch_simplify]: 1.009e-05 [loop_unroll]: 7.16001e-06 [a_1]: 0.00014853 [with_stream_mark]: 1.822e-05 [recompute_prepare]: 7.92e-06 [updatestate_depend_eliminate]: 4.03001e-06 [updatestate_assign_eliminate]: 3.86999e-06 [updatestate_loads_eliminate]: 3.30998e-06 [parameter_eliminate]: 1.76e-06 [a_2]: 0.00010747 [accelerated_algorithm]: 6.66999e-06 [shard]: 2.83e-06 [meta_shard_fg_expand]: 2.20002e-06 [shard_inline]: 6.39001e-06 [merge_send_recv]: 8.84e-06 [auto_parallel]: 1.098e-05 [parallel]: 9.27001e-06 [flash_sp]: 4.43999e-06 [merge_comm]: 4.01001e-06 [allreduce_fusion]: 3.52002e-06 [matmul_add_comm_reduction]: 9.62001e-06 [allreduce_slice_to_reducescatter]: 8.30012e-07 [virtual_shard_identity]: 9.46e-06 [virtual_dataset]: 7.3e-06 [get_grad_eliminate_]: 6.24001e-06 [virtual_output]: 6.93e-06 [merge_forward]: 5.17e-06 [cell_reuse_recompute_pass]: 3.75e-06 [offload_activation]: 1.047e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.014e-05 [merge_recompute_call_nodes]: 1.54998e-06 [before_grad]: 1.059e-05 [set_forward_comm_id_for_comm_node_pass]: 4.60001e-06 [meta_fg_expand]: 3.06999e-06 [flash_sp_send_recv_attached]: 2.48e-06 [receive_attached]: 2.51998e-06 [after_resolve]: 1.407e-05 [a_after_grad]: 1.006e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.94999e-06 [auto_monad_grad]: 1.72999e-06 [auto_monad_eliminator]: 1.059e-05 [cse]: 1.802e-05 [a_3]: 5.471e-05 [py_interpret_to_execute_after_opt_a]: 2.151e-05 [slice_cell_reuse_recomputed_activation]: 5.04e-06 [rewriter_after_opt_a]: 4.68e-05 [convert_after_rewriter]: 1.097e-05 [order_py_execute_after_rewriter]: 9.37999e-06 [mutable_eliminate]: 0.000846 [opt_b]: 0.00031046, [1] [Cycle 1]: 0.00029746, [7] [b_1]: 0.00018507 [b_2]: 8.58001e-06 [updatestate_depend_eliminate]: 1.009e-05 [updatestate_assign_eliminate]: 3.16999e-06 [updatestate_loads_eliminate]: 2.73e-06 [renormalize]: 1.12999e-06 [cse]: 2.724e-05 [optimize_parallel_all_gather_comm]: 2.52e-05 [overlap_param_gather]: 4.50001e-06 [cconv]: 3.996e-05 [loop_unroll]: 0.00074178 [opt_after_cconv]: 0.00015784, [1] [Cycle 1]: 0.00014552, [7] [c_1]: 3.791e-05 [parameter_eliminate]: 6.25002e-06 [updatestate_depend_eliminate]: 8.14002e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 2.63998e-06 [cse]: 2.781e-05 [renormalize]: 3.19997e-07 [remove_dup_value]: 2.012e-05 [tuple_transform]: 0.00010692, [1] [Cycle 1]: 9.888e-05, [4] [d_1]: 5.608e-05 [none_parameter_eliminate]: 1.77999e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 7.8e-06 [partial_unused_args_eliminate]: 4.79e-06 [add_recomputation]: 6.335e-05 [cse_after_recomputation]: 3.016e-05, [1] [Cycle 1]: 2.25e-05, [1] [cse]: 1.268e-05 [environ_conv]: 1.009e-05 [swap_dp_allreduce_reducescatter]: 8.28999e-06 [bias_add_comm_swap]: 5.84e-06 [label_micro_interleaved_index]: 9.97999e-06 [label_fine_grained_interleaved_index]: 5.75001e-06 [merge_cast_opt]: 4.43999e-06 [slice_recompute_activation]: 5.09998e-06 [micro_interleaved_order_control]: 5.22999e-06 [assign_add_opt]: 4e-06 [ForceFp32Comm]: 3.71999e-06 [remove_cast_before_assign_add]: 3.98001e-06 [full_micro_interleaved_order_control]: 5.39e-06 [reorder_send_recv_between_fp_bp]: 5.76003e-06 [comm_op_add_attrs]: 3.63999e-06 [add_comm_op_reuse_tag]: 3.36999e-06 [interleave_split_concat_branches]: 3.58999e-06 [interleave_parallel_branches]: 3.7e-06 [overlap_opt_shard_in_pipeline]: 3.97998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.77998e-06 [control_data_broadcast_order]: 1.866e-05 [grouped_pairwise_exchange_alltoall]: 4.90999e-06 [offloading_packed_experts]: 7.69002e-06 [overlap_recompute_and_grad_model_parallel]: 7.83001e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.32e-06 [overlap_recompute_allgather_and_fa_grad]: 4.13999e-06 [overlap_recompute_comm]: 5.28002e-06 [overlap_grad_ring_attention]: 7.51999e-06 [overlap_grad_flash_sp]: 2.751e-05 [begin_end_overlap_inline]: 3.41999e-06 [split_matmul_comm_elemetwise]: 4.79e-06 [split_layernorm_comm]: 4.57e-06 [handle_group_info]: 3.51999e-06 [symbol_engine_optimizer]: 0.0001082, [1] [Cycle 1]: 0.00010091, [6] [build]: 4.97999e-06 [elim_shapecalc]: 1.249e-05 [elim_not_effective]: 1.503e-05 [opt_reshape]: 8.03999e-06 [fold_const_symbol]: 1.085e-05 [renormalize]: 2.80008e-07 [detach_backward]: 5.49e-06 [pipeline_parallel_scheduler]: 1.96998e-06 [auto_monad_reorder]: 2.321e-05 [get_jit_bprop_graph]: 2.02999e-06 [rewriter_after_jit_bprop_graph]: 7.10002e-06 [opt_after_jit_grad]: 0.00083067 [validate]: 5.144e-05 Sums bootstrap : 0.000441s : 0.95% type_inference : 0.023579s : 50.64% event_method : 0.000022s : 0.05% auto_monad : 0.000068s : 0.15% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000039s : 0.08% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000031s : 0.07% optimize.rewriter_before_opt_a : 0.000091s : 0.20% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000053s : 0.11% optimize.opt_a.loop_unroll : 0.000040s : 0.09% optimize.opt_a.a_1 : 0.000795s : 1.71% optimize.opt_a.with_stream_mark : 0.000036s : 0.08% optimize.opt_a.recompute_prepare : 0.000017s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000218s : 0.47% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.03% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.03% optimize.opt_a.merge_send_recv : 0.000018s : 0.04% optimize.opt_a.auto_parallel : 0.000018s : 0.04% optimize.opt_a.parallel : 0.000028s : 0.06% optimize.opt_a.flash_sp : 0.000013s : 0.03% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.04% optimize.opt_a.virtual_dataset : 0.000014s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.03% optimize.opt_a.virtual_output : 0.000013s : 0.03% optimize.opt_a.merge_forward : 0.000009s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000022s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000026s : 0.06% optimize.opt_a.a_after_grad : 0.000021s : 0.05% optimize.opt_a.renormalize : 0.017026s : 36.56% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.03% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.07% optimize.opt_a.cse : 0.000050s : 0.11% optimize.opt_a.a_3 : 0.000135s : 0.29% optimize.py_interpret_to_execute_after_opt_a : 0.000022s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000047s : 0.10% optimize.convert_after_rewriter : 0.000011s : 0.02% optimize.order_py_execute_after_rewriter : 0.000009s : 0.02% optimize.mutable_eliminate : 0.000846s : 1.82% optimize.opt_b.b_1 : 0.000185s : 0.40% optimize.opt_b.b_2 : 0.000009s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.05% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000040s : 0.09% optimize.loop_unroll : 0.000742s : 1.59% optimize.opt_after_cconv.c_1 : 0.000038s : 0.08% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000028s : 0.06% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.04% optimize.tuple_transform.d_1 : 0.000056s : 0.12% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000063s : 0.14% optimize.cse_after_recomputation.cse : 0.000013s : 0.03% optimize.environ_conv : 0.000010s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000010s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000019s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000008s : 0.02% optimize.overlap_grad_flash_sp : 0.000028s : 0.06% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000005s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000023s : 0.05% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.02% opt_after_jit_grad : 0.000831s : 1.78% validate : 0.000051s : 0.11% Time group info: ------[substitution.] 0.000204 28 1.16% : 0.000002s : 2: substitution.elim_not_effective 0.88% : 0.000002s : 2: substitution.fold_const_symbol 3.34% : 0.000007s : 4: substitution.graph_param_transform 78.49% : 0.000160s : 4: substitution.inline 2.22% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.87% : 0.000006s : 4: substitution.remove_not_recompute_node 3.26% : 0.000007s : 4: substitution.replace_old_param 7.77% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.023511 2 96.52% : 0.022693s : 1: type_inference.infer 3.48% : 0.000818s : 1: type_inference.specialize ------[replace.] 0.000061 8 64.66% : 0.000040s : 4: replace.inline 35.34% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000171 8 92.12% : 0.000157s : 4: match.inline 7.88% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000237 1278 0.91% : 0.000002s : 13: predicate.accumulaten_eliminater 1.08% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 0.93% : 0.000002s : 13: predicate.addn_zero_filter 0.88% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.95% : 0.000005s : 21: predicate.arithmetic_simplify 1.06% : 0.000003s : 13: predicate.cast_eliminate 0.65% : 0.000002s : 8: predicate.check_bprop_eliminate 0.51% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.91% : 0.000002s : 8: predicate.depend_value_elim 0.87% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.05% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.77% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.13% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.16% : 0.000000s : 4: predicate.elim_not_effective 0.32% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.35% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.18% : 0.000003s : 17: predicate.environ_get_depend_swap 1.57% : 0.000004s : 25: predicate.environ_get_eliminate 1.10% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.26% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.18% : 0.000005s : 21: predicate.float_depend_g_call 0.58% : 0.000001s : 8: predicate.float_environ_get_switch 0.77% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.64% : 0.000002s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.51% : 0.000001s : 8: predicate.incorporate_call 0.41% : 0.000001s : 8: predicate.incorporate_call_switch 6.11% : 0.000014s : 58: predicate.inline 0.62% : 0.000001s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.74% : 0.000002s : 8: predicate.less_batch_normalization 1.92% : 0.000005s : 25: predicate.list_to_tuple_eliminator_ 2.40% : 0.000006s : 38: predicate.load_eliminater 1.31% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.23% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.75% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 8: predicate.merge_addn 0.57% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.62% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.91% : 0.000002s : 13: predicate.minmaximum_grad 1.70% : 0.000004s : 4: predicate.mutable_eliminate 0.38% : 0.000001s : 4: predicate.opt_reshape 0.32% : 0.000001s : 4: predicate.parallel_virtual_node 1.57% : 0.000004s : 21: predicate.partial_defer_inline 1.42% : 0.000003s : 21: predicate.partial_eliminate 0.82% : 0.000002s : 13: predicate.print_const_string_wrapper 0.62% : 0.000001s : 8: predicate.reduce_all_const_elim 1.29% : 0.000003s : 13: predicate.reduce_eliminate 2.40% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 8: predicate.remove_not_recompute_node 1.35% : 0.000003s : 25: predicate.replace_applicator 0.51% : 0.000001s : 8: predicate.replace_old_param 0.36% : 0.000001s : 4: predicate.reset_defer_inline 1.14% : 0.000003s : 13: predicate.reshape_eliminate 0.75% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 4: predicate.row_tensor_eliminate 1.00% : 0.000002s : 8: predicate.same_eliminate 0.46% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.68% : 0.000002s : 8: predicate.shard_identity_eliminate 0.71% : 0.000002s : 8: predicate.special_op_eliminate 0.73% : 0.000002s : 8: predicate.specialize_transform 1.05% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.40% : 0.000003s : 21: predicate.switch_defer_inline 1.96% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.70% : 0.000011s : 67: predicate.switch_simplify 0.94% : 0.000002s : 13: predicate.tile_eliminate 1.00% : 0.000002s : 13: predicate.transpose_eliminate 1.89% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.48% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.99% : 0.000005s : 21: predicate.tuple_list_get_set_item_eliminator 2.05% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.93% : 0.000005s : 25: predicate.tuple_to_list_eliminator_ 2.34% : 0.000006s : 38: predicate.updatestate_pure_node_eliminater 2.82% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.30% : 0.000001s : 4: predicate.value_based_eliminate 0.65% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.73% : 0.000002s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.67% : 0.000002s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000720 11 52.07% : 0.000375s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.93% : 0.000345s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.097477 192 0.01% : 0.000006s : 1: ForceFp32Comm 3.95% : 0.003847s : 1: add_attr 3.93% : 0.003830s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.07% : 0.000067s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.08% : 0.000078s : 1: auto_monad 0.03% : 0.000032s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.50% : 0.000485s : 1: bootstrap 0.04% : 0.000043s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.02% : 0.000022s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.03% : 0.000033s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000026s : 1: detach_backward 0.01% : 0.000013s : 1: environ_conv 0.03% : 0.000032s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000014s : 1: graph_reusing 0.01% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000013s : 1: label_micro_interleaved_index 0.77% : 0.000753s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.88% : 0.000857s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.09% : 0.000089s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000022s : 1: opt.transform.mutable_eliminate 1.29% : 0.001258s : 78: opt.transform.opt_a 0.04% : 0.000036s : 1: opt.transform.opt_after_cconv 0.04% : 0.000037s : 1: opt.transform.opt_after_jit_grad 0.12% : 0.000114s : 28: opt.transform.opt_b 0.06% : 0.000061s : 2: opt.transform.opt_trans_graph 0.04% : 0.000042s : 4: opt.transform.symbol_engine_opt 20.16% : 0.019651s : 1: opt_a 0.17% : 0.000162s : 1: opt_after_cconv 0.87% : 0.000846s : 1: opt_after_jit_grad 0.32% : 0.000315s : 1: opt_b 23.75% : 0.023148s : 1: optimize 0.03% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.03% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.05% : 0.000047s : 1: pre_auto_parallel 0.04% : 0.000034s : 1: py_interpret_to_execute 0.03% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.02% : 0.000023s : 1: remove_dup_value 16.96% : 0.016532s : 1: renormalize.infer 0.49% : 0.000479s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000051s : 1: rewriter_after_opt_a 0.10% : 0.000095s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000009s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.11% : 0.000111s : 1: symbol_engine_optimizer 0.11% : 0.000110s : 1: tuple_transform 24.25% : 0.023634s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:55.190.600 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0348369, [21] [bootstrap]: 0.00045094 [type_inference]: 0.00728626 [event_method]: 2.25e-05 [auto_monad]: 7.284e-05 [graph_reusing]: 6.79999e-06 [inline]: 3.14999e-06 [add_attr]: 0.00409117, [1] [add_attr_with_inline]: 0.00407732, [1] [Cycle 1]: 8.273e-05, [2] [tag_attr]: 2.449e-05 [meta_addattr_fg_expand]: 6.23998e-06 [parallel-infer-symbol]: 3.74002e-06 [pre_auto_parallel]: 4.51e-05 [insert-virtual-dataset]: 2.65002e-06 [parallel-infer-symbol-second]: 8.39995e-07 [dataset_repeat_opt]: 2.44999e-06 [pipeline_split]: 1.68002e-06 [optimize]: 0.00575548, [53] [py_interpret_to_execute]: 3.442e-05 [rewriter_before_opt_a]: 9.538e-05 [opt_a]: 0.00311011, [2] [Cycle 1]: 0.00236908, [45] [expand_dump_flag]: 3.3e-06 [switch_simplify]: 4.488e-05 [loop_unroll]: 3.019e-05 [a_1]: 0.00067479 [with_stream_mark]: 2.235e-05 [recompute_prepare]: 1.131e-05 [updatestate_depend_eliminate]: 3.80998e-06 [updatestate_assign_eliminate]: 3.52002e-06 [updatestate_loads_eliminate]: 2.88e-06 [parameter_eliminate]: 1.97999e-06 [a_2]: 8.413e-05 [accelerated_algorithm]: 7.4e-06 [shard]: 2.06e-06 [meta_shard_fg_expand]: 2.02001e-06 [shard_inline]: 6.53e-06 [merge_send_recv]: 9.10001e-06 [auto_parallel]: 9.00001e-06 [parallel]: 2.041e-05 [flash_sp]: 9.36998e-06 [merge_comm]: 4.14997e-06 [allreduce_fusion]: 3.56999e-06 [matmul_add_comm_reduction]: 1.09e-05 [allreduce_slice_to_reducescatter]: 1.35001e-06 [virtual_shard_identity]: 8.46002e-06 [virtual_dataset]: 7.92e-06 [get_grad_eliminate_]: 6.86999e-06 [virtual_output]: 8.17e-06 [merge_forward]: 4.80999e-06 [cell_reuse_recompute_pass]: 1.55999e-06 [offload_activation]: 1.072e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.395e-05 [merge_recompute_call_nodes]: 1.60001e-06 [before_grad]: 1.086e-05 [set_forward_comm_id_for_comm_node_pass]: 3.95e-06 [meta_fg_expand]: 3.53e-06 [flash_sp_send_recv_attached]: 3.03998e-06 [receive_attached]: 2.04999e-06 [after_resolve]: 1.305e-05 [a_after_grad]: 1.105e-05 [renormalize]: 0.00089799 [add_forward_monad_depend]: 8.46002e-06 [auto_monad_grad]: 2.71e-06 [auto_monad_eliminator]: 1.834e-05 [cse]: 3.189e-05 [a_3]: 5.544e-05 [Cycle 2]: 0.00072629, [45] [expand_dump_flag]: 2.01e-06 [switch_simplify]: 8.3e-06 [loop_unroll]: 6.32001e-06 [a_1]: 0.00013861 [with_stream_mark]: 1.629e-05 [recompute_prepare]: 8.20999e-06 [updatestate_depend_eliminate]: 3.11001e-06 [updatestate_assign_eliminate]: 2.92002e-06 [updatestate_loads_eliminate]: 3.00998e-06 [parameter_eliminate]: 1.69e-06 [a_2]: 7.235e-05 [accelerated_algorithm]: 6.58e-06 [shard]: 2.64001e-06 [meta_shard_fg_expand]: 2.06003e-06 [shard_inline]: 6.18998e-06 [merge_send_recv]: 8.28999e-06 [auto_parallel]: 9.20001e-06 [parallel]: 1.732e-05 [flash_sp]: 3.86999e-06 [merge_comm]: 3.58e-06 [allreduce_fusion]: 3.4e-06 [matmul_add_comm_reduction]: 8.65001e-06 [allreduce_slice_to_reducescatter]: 5.60016e-07 [virtual_shard_identity]: 7.51999e-06 [virtual_dataset]: 5.96e-06 [get_grad_eliminate_]: 6.34001e-06 [virtual_output]: 8.67998e-06 [merge_forward]: 4.14002e-06 [cell_reuse_recompute_pass]: 3.21001e-06 [offload_activation]: 1.094e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.669e-05 [merge_recompute_call_nodes]: 1.17e-06 [before_grad]: 1.039e-05 [set_forward_comm_id_for_comm_node_pass]: 3.55998e-06 [meta_fg_expand]: 2.41e-06 [flash_sp_send_recv_attached]: 1.82999e-06 [receive_attached]: 2.01e-06 [after_resolve]: 1.203e-05 [a_after_grad]: 1.008e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.10002e-06 [auto_monad_grad]: 1.77001e-06 [auto_monad_eliminator]: 1.116e-05 [cse]: 1.779e-05 [a_3]: 3.766e-05 [py_interpret_to_execute_after_opt_a]: 1.554e-05 [slice_cell_reuse_recomputed_activation]: 2.43002e-06 [rewriter_after_opt_a]: 4.083e-05 [convert_after_rewriter]: 7.77e-06 [order_py_execute_after_rewriter]: 5.09e-06 [mutable_eliminate]: 0.00078676 [opt_b]: 0.00023628, [1] [Cycle 1]: 0.00022689, [7] [b_1]: 0.00013469 [b_2]: 8.97e-06 [updatestate_depend_eliminate]: 9.46998e-06 [updatestate_assign_eliminate]: 2.64001e-06 [updatestate_loads_eliminate]: 2.68e-06 [renormalize]: 1.00001e-06 [cse]: 2.869e-05 [optimize_parallel_all_gather_comm]: 2.253e-05 [overlap_param_gather]: 2.14e-06 [cconv]: 3.412e-05 [loop_unroll]: 0.00061697 [opt_after_cconv]: 0.00012365, [1] [Cycle 1]: 0.00011559, [7] [c_1]: 3.416e-05 [parameter_eliminate]: 5.77999e-06 [updatestate_depend_eliminate]: 8.96998e-06 [updatestate_assign_eliminate]: 3.11999e-06 [updatestate_loads_eliminate]: 2.21e-06 [cse]: 2.513e-05 [renormalize]: 7.00005e-07 [remove_dup_value]: 1.537e-05 [tuple_transform]: 8.801e-05, [1] [Cycle 1]: 8.269e-05, [4] [d_1]: 5.453e-05 [none_parameter_eliminate]: 1.69e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 7.16001e-06 [partial_unused_args_eliminate]: 1.91e-06 [add_recomputation]: 5.639e-05 [cse_after_recomputation]: 2.42e-05, [1] [Cycle 1]: 1.921e-05, [1] [cse]: 1.246e-05 [environ_conv]: 6.81001e-06 [swap_dp_allreduce_reducescatter]: 5.13002e-06 [bias_add_comm_swap]: 3.51001e-06 [label_micro_interleaved_index]: 6.94001e-06 [label_fine_grained_interleaved_index]: 2.83e-06 [merge_cast_opt]: 1.38002e-06 [slice_recompute_activation]: 2.43e-06 [micro_interleaved_order_control]: 2.45002e-06 [assign_add_opt]: 1.40999e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 1.09e-06 [full_micro_interleaved_order_control]: 2.49001e-06 [reorder_send_recv_between_fp_bp]: 2.69999e-06 [comm_op_add_attrs]: 1.01002e-06 [add_comm_op_reuse_tag]: 1.29e-06 [interleave_split_concat_branches]: 1.11002e-06 [interleave_parallel_branches]: 1.13001e-06 [overlap_opt_shard_in_pipeline]: 1.30999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.14e-06 [control_data_broadcast_order]: 1.447e-05 [grouped_pairwise_exchange_alltoall]: 1.62001e-06 [offloading_packed_experts]: 4.79e-06 [overlap_recompute_and_grad_model_parallel]: 4.82e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.27e-06 [overlap_recompute_allgather_and_fa_grad]: 1.42999e-06 [overlap_recompute_comm]: 2.41998e-06 [overlap_grad_ring_attention]: 4.67998e-06 [overlap_grad_flash_sp]: 2.156e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.45002e-06 [split_layernorm_comm]: 1.77999e-06 [handle_group_info]: 1.12e-06 [symbol_engine_optimizer]: 8.274e-05, [1] [Cycle 1]: 7.766e-05, [6] [build]: 4.63999e-06 [elim_shapecalc]: 1.138e-05 [elim_not_effective]: 1.432e-05 [opt_reshape]: 7.87e-06 [fold_const_symbol]: 9.89001e-06 [renormalize]: 1.8999e-07 [detach_backward]: 2.25002e-06 [pipeline_parallel_scheduler]: 1.45999e-06 [auto_monad_reorder]: 1.712e-05 [get_jit_bprop_graph]: 1.91003e-06 [rewriter_after_jit_bprop_graph]: 6.49001e-06 [opt_after_jit_grad]: 0.0168086 [validate]: 6.183e-05 Sums bootstrap : 0.000451s : 1.52% type_inference : 0.007286s : 24.55% event_method : 0.000022s : 0.08% auto_monad : 0.000073s : 0.25% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000045s : 0.15% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000034s : 0.12% optimize.rewriter_before_opt_a : 0.000095s : 0.32% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000053s : 0.18% optimize.opt_a.loop_unroll : 0.000037s : 0.12% optimize.opt_a.a_1 : 0.000813s : 2.74% optimize.opt_a.with_stream_mark : 0.000039s : 0.13% optimize.opt_a.recompute_prepare : 0.000020s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000156s : 0.53% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.05% optimize.opt_a.shard : 0.000005s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.04% optimize.opt_a.merge_send_recv : 0.000017s : 0.06% optimize.opt_a.auto_parallel : 0.000018s : 0.06% optimize.opt_a.parallel : 0.000038s : 0.13% optimize.opt_a.flash_sp : 0.000013s : 0.04% optimize.opt_a.merge_comm : 0.000008s : 0.03% optimize.opt_a.allreduce_fusion : 0.000007s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.05% optimize.opt_a.virtual_dataset : 0.000014s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.04% optimize.opt_a.virtual_output : 0.000017s : 0.06% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000022s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000021s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.03% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.08% optimize.opt_a.a_after_grad : 0.000021s : 0.07% optimize.opt_a.renormalize : 0.000898s : 3.03% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.10% optimize.opt_a.cse : 0.000050s : 0.17% optimize.opt_a.a_3 : 0.000093s : 0.31% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000041s : 0.14% optimize.convert_after_rewriter : 0.000008s : 0.03% optimize.order_py_execute_after_rewriter : 0.000005s : 0.02% optimize.mutable_eliminate : 0.000787s : 2.65% optimize.opt_b.b_1 : 0.000135s : 0.45% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000029s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.08% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000034s : 0.11% optimize.loop_unroll : 0.000617s : 2.08% optimize.opt_after_cconv.c_1 : 0.000034s : 0.12% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000025s : 0.08% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.05% optimize.tuple_transform.d_1 : 0.000055s : 0.18% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000056s : 0.19% optimize.cse_after_recomputation.cse : 0.000012s : 0.04% optimize.environ_conv : 0.000007s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.02% optimize.bias_add_comm_swap : 0.000004s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000022s : 0.07% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000017s : 0.06% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.016809s : 56.64% validate : 0.000062s : 0.21% Time group info: ------[substitution.] 0.000220 28 1.02% : 0.000002s : 2: substitution.elim_not_effective 0.72% : 0.000002s : 2: substitution.fold_const_symbol 3.12% : 0.000007s : 4: substitution.graph_param_transform 79.68% : 0.000175s : 4: substitution.inline 2.34% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.46% : 0.000005s : 4: substitution.remove_not_recompute_node 2.88% : 0.000006s : 4: substitution.replace_old_param 7.78% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007205 2 88.17% : 0.006353s : 1: type_inference.infer 11.83% : 0.000852s : 1: type_inference.specialize ------[replace.] 0.000068 8 61.58% : 0.000042s : 4: replace.inline 38.42% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000187 8 92.06% : 0.000173s : 4: match.inline 7.94% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000229 1278 0.92% : 0.000002s : 13: predicate.accumulaten_eliminater 1.93% : 0.000004s : 4: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 8: predicate.addn_check_dump 0.96% : 0.000002s : 13: predicate.addn_zero_filter 0.73% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.25% : 0.000005s : 21: predicate.arithmetic_simplify 0.92% : 0.000002s : 13: predicate.cast_eliminate 0.64% : 0.000001s : 8: predicate.check_bprop_eliminate 0.47% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.61% : 0.000001s : 8: predicate.depend_value_elim 0.98% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.80% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.81% : 0.000004s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.38% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_depend_swap 1.80% : 0.000004s : 25: predicate.environ_get_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.36% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.43% : 0.000006s : 21: predicate.float_depend_g_call 0.52% : 0.000001s : 8: predicate.float_environ_get_switch 0.93% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 4: predicate.fold_const_symbol 0.79% : 0.000002s : 8: predicate.get_grad_eliminate 0.25% : 0.000001s : 4: predicate.graph_param_transform 0.51% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 5.78% : 0.000013s : 58: predicate.inline 0.83% : 0.000002s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.87% : 0.000002s : 8: predicate.less_batch_normalization 1.78% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.43% : 0.000006s : 38: predicate.load_eliminater 0.99% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.08% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.75% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.51% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.51% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 13: predicate.minmaximum_grad 1.50% : 0.000003s : 4: predicate.mutable_eliminate 0.38% : 0.000001s : 4: predicate.opt_reshape 0.38% : 0.000001s : 4: predicate.parallel_virtual_node 1.62% : 0.000004s : 21: predicate.partial_defer_inline 1.49% : 0.000003s : 21: predicate.partial_eliminate 0.86% : 0.000002s : 13: predicate.print_const_string_wrapper 0.66% : 0.000002s : 8: predicate.reduce_all_const_elim 1.25% : 0.000003s : 13: predicate.reduce_eliminate 2.45% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 8: predicate.remove_not_recompute_node 1.29% : 0.000003s : 25: predicate.replace_applicator 0.60% : 0.000001s : 8: predicate.replace_old_param 0.37% : 0.000001s : 4: predicate.reset_defer_inline 0.98% : 0.000002s : 13: predicate.reshape_eliminate 0.55% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.69% : 0.000002s : 8: predicate.same_eliminate 0.38% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.82% : 0.000002s : 8: predicate.shard_identity_eliminate 0.86% : 0.000002s : 8: predicate.special_op_eliminate 0.62% : 0.000001s : 8: predicate.specialize_transform 0.98% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.41% : 0.000003s : 21: predicate.switch_defer_inline 1.88% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.54% : 0.000010s : 67: predicate.switch_simplify 0.93% : 0.000002s : 13: predicate.tile_eliminate 0.86% : 0.000002s : 13: predicate.transpose_eliminate 1.49% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.36% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.24% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 2.00% : 0.000005s : 25: predicate.tuple_to_list_eliminator_ 2.51% : 0.000006s : 38: predicate.updatestate_pure_node_eliminater 3.13% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.78% : 0.000002s : 8: predicate.virtual_output_eliminate 0.35% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000665 11 52.19% : 0.000347s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.81% : 0.000318s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.047001 192 0.01% : 0.000004s : 1: ForceFp32Comm 8.72% : 0.004098s : 1: add_attr 8.69% : 0.004082s : 1: add_attr_with_inline 0.01% : 0.000005s : 1: add_comm_op_reuse_tag 0.13% : 0.000061s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.17% : 0.000079s : 1: auto_monad 0.04% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.02% : 0.000480s : 1: bootstrap 0.08% : 0.000038s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000018s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.06% : 0.000027s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000010s : 1: environ_conv 0.06% : 0.000030s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 1.34% : 0.000628s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.71% : 0.000801s : 1: mutable_eliminate 0.02% : 0.000008s : 1: offloading_packed_experts 0.04% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000024s : 1: opt.transform.mutable_eliminate 2.68% : 0.001258s : 78: opt.transform.opt_a 0.07% : 0.000033s : 1: opt.transform.opt_after_cconv 0.11% : 0.000051s : 1: opt.transform.opt_after_jit_grad 0.23% : 0.000107s : 28: opt.transform.opt_b 0.13% : 0.000060s : 2: opt.transform.opt_trans_graph 0.08% : 0.000040s : 4: opt.transform.symbol_engine_opt 6.63% : 0.003114s : 1: opt_a 0.27% : 0.000128s : 1: opt_after_cconv 35.81% : 0.016833s : 1: opt_after_jit_grad 0.51% : 0.000241s : 1: opt_b 12.26% : 0.005761s : 1: optimize 0.06% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000008s : 1: order_py_execute_after_rewriter 0.05% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000004s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.11% : 0.000049s : 1: pre_auto_parallel 0.08% : 0.000039s : 1: py_interpret_to_execute 0.04% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000019s : 1: remove_dup_value 0.98% : 0.000463s : 1: renormalize.infer 0.90% : 0.000425s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000045s : 1: rewriter_after_opt_a 0.21% : 0.000100s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.18% : 0.000086s : 1: symbol_engine_optimizer 0.19% : 0.000091s : 1: tuple_transform 15.56% : 0.007315s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:55.679.052 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:55.679.326 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0537303, [21] [bootstrap]: 0.00050153 [type_inference]: 0.0401868 [event_method]: 2.449e-05 [auto_monad]: 7.021e-05 [graph_reusing]: 5.71e-06 [inline]: 2.98e-06 [add_attr]: 0.00418485, [1] [add_attr_with_inline]: 0.0041708, [1] [Cycle 1]: 0.00011045, [2] [tag_attr]: 2.535e-05 [meta_addattr_fg_expand]: 7.66999e-06 [parallel-infer-symbol]: 3.71999e-06 [pre_auto_parallel]: 4.639e-05 [insert-virtual-dataset]: 2.72001e-06 [parallel-infer-symbol-second]: 1.00001e-06 [dataset_repeat_opt]: 2.22001e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.00667807, [53] [py_interpret_to_execute]: 4.002e-05 [rewriter_before_opt_a]: 0.00011199 [opt_a]: 0.00388044, [2] [Cycle 1]: 0.00288058, [45] [expand_dump_flag]: 3.53999e-06 [switch_simplify]: 4.642e-05 [loop_unroll]: 3.308e-05 [a_1]: 0.00084542 [with_stream_mark]: 2.664e-05 [recompute_prepare]: 1.604e-05 [updatestate_depend_eliminate]: 5.87999e-06 [updatestate_assign_eliminate]: 4.81002e-06 [updatestate_loads_eliminate]: 3.71999e-06 [parameter_eliminate]: 2.71e-06 [a_2]: 0.00013697 [accelerated_algorithm]: 9.10001e-06 [shard]: 2.37001e-06 [meta_shard_fg_expand]: 3.3e-06 [shard_inline]: 1.599e-05 [merge_send_recv]: 9.62999e-06 [auto_parallel]: 1.206e-05 [parallel]: 2.095e-05 [flash_sp]: 1.169e-05 [merge_comm]: 4.92e-06 [allreduce_fusion]: 4.25999e-06 [matmul_add_comm_reduction]: 1.186e-05 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 1.223e-05 [virtual_dataset]: 8.18001e-06 [get_grad_eliminate_]: 7.87e-06 [virtual_output]: 8.37e-06 [merge_forward]: 5.08002e-06 [cell_reuse_recompute_pass]: 1.76998e-06 [offload_activation]: 1.215e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.377e-05 [merge_recompute_call_nodes]: 1.77001e-06 [before_grad]: 1.502e-05 [set_forward_comm_id_for_comm_node_pass]: 4.95999e-06 [meta_fg_expand]: 3.69002e-06 [flash_sp_send_recv_attached]: 2.89001e-06 [receive_attached]: 2.48e-06 [after_resolve]: 1.559e-05 [a_after_grad]: 1.409e-05 [renormalize]: 0.00088829 [add_forward_monad_depend]: 7.57002e-06 [auto_monad_grad]: 2.48e-06 [auto_monad_eliminator]: 1.928e-05 [cse]: 3.814e-05 [a_3]: 7.736e-05 [Cycle 2]: 0.00098275, [45] [expand_dump_flag]: 1.87001e-06 [switch_simplify]: 9.97999e-06 [loop_unroll]: 7.61001e-06 [a_1]: 0.00018365 [with_stream_mark]: 1.657e-05 [recompute_prepare]: 8.05999e-06 [updatestate_depend_eliminate]: 5.09e-06 [updatestate_assign_eliminate]: 3.63999e-06 [updatestate_loads_eliminate]: 3.23998e-06 [parameter_eliminate]: 1.82999e-06 [a_2]: 0.0001189 [accelerated_algorithm]: 8.03999e-06 [shard]: 1.97999e-06 [meta_shard_fg_expand]: 2.70997e-06 [shard_inline]: 8.13999e-06 [merge_send_recv]: 7.08e-06 [auto_parallel]: 8.65001e-06 [parallel]: 8.17e-06 [flash_sp]: 3.9e-06 [merge_comm]: 4.23999e-06 [allreduce_fusion]: 4.05998e-06 [matmul_add_comm_reduction]: 9.76e-06 [allreduce_slice_to_reducescatter]: 8.89995e-07 [virtual_shard_identity]: 9.57001e-06 [virtual_dataset]: 7.53999e-06 [get_grad_eliminate_]: 7.91001e-06 [virtual_output]: 7.39002e-06 [merge_forward]: 4.45e-06 [cell_reuse_recompute_pass]: 2.08002e-06 [offload_activation]: 1.13e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.706e-05 [merge_recompute_call_nodes]: 1.40999e-06 [before_grad]: 1.328e-05 [set_forward_comm_id_for_comm_node_pass]: 4.50999e-06 [meta_fg_expand]: 3.12002e-06 [flash_sp_send_recv_attached]: 1.64e-06 [receive_attached]: 1.84e-06 [after_resolve]: 1.176e-05 [a_after_grad]: 1.125e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.77999e-06 [auto_monad_grad]: 1.66e-06 [auto_monad_eliminator]: 1.49e-05 [cse]: 2.087e-05 [a_3]: 5.96e-05 [py_interpret_to_execute_after_opt_a]: 1.912e-05 [slice_cell_reuse_recomputed_activation]: 5.47999e-06 [rewriter_after_opt_a]: 4.833e-05 [convert_after_rewriter]: 1.048e-05 [order_py_execute_after_rewriter]: 8.54e-06 [mutable_eliminate]: 0.00070971 [opt_b]: 0.00032542, [1] [Cycle 1]: 0.00031474, [7] [b_1]: 0.00019957 [b_2]: 1.001e-05 [updatestate_depend_eliminate]: 7.8e-06 [updatestate_assign_eliminate]: 3.57002e-06 [updatestate_loads_eliminate]: 3.28e-06 [renormalize]: 7.09988e-07 [cse]: 3.045e-05 [optimize_parallel_all_gather_comm]: 2.335e-05 [overlap_param_gather]: 5.22e-06 [cconv]: 3.546e-05 [loop_unroll]: 0.00048104 [opt_after_cconv]: 0.00021387, [1] [Cycle 1]: 0.0002043, [7] [c_1]: 3.896e-05 [parameter_eliminate]: 4.80999e-06 [updatestate_depend_eliminate]: 6.40002e-06 [updatestate_assign_eliminate]: 3.11999e-06 [updatestate_loads_eliminate]: 3.14999e-06 [cse]: 8.592e-05 [renormalize]: 5.19998e-07 [remove_dup_value]: 2.045e-05 [tuple_transform]: 0.0001194, [1] [Cycle 1]: 0.00011115, [4] [d_1]: 6.647e-05 [none_parameter_eliminate]: 2.46e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 8.89003e-06 [partial_unused_args_eliminate]: 4.4e-06 [add_recomputation]: 6.05e-05 [cse_after_recomputation]: 3.351e-05, [1] [Cycle 1]: 2.591e-05, [1] [cse]: 1.6e-05 [environ_conv]: 9.90002e-06 [swap_dp_allreduce_reducescatter]: 9.13002e-06 [bias_add_comm_swap]: 5.57001e-06 [label_micro_interleaved_index]: 8.08999e-06 [label_fine_grained_interleaved_index]: 4.98001e-06 [merge_cast_opt]: 4.27e-06 [slice_recompute_activation]: 4.72998e-06 [micro_interleaved_order_control]: 4.90999e-06 [assign_add_opt]: 3.7e-06 [ForceFp32Comm]: 3.25e-06 [remove_cast_before_assign_add]: 3.63e-06 [full_micro_interleaved_order_control]: 4.53999e-06 [reorder_send_recv_between_fp_bp]: 5.19e-06 [comm_op_add_attrs]: 3.91999e-06 [add_comm_op_reuse_tag]: 3.31999e-06 [interleave_split_concat_branches]: 3.41999e-06 [interleave_parallel_branches]: 3.48e-06 [overlap_opt_shard_in_pipeline]: 3.69002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.16001e-06 [control_data_broadcast_order]: 1.846e-05 [grouped_pairwise_exchange_alltoall]: 4.11001e-06 [offloading_packed_experts]: 7.36999e-06 [overlap_recompute_and_grad_model_parallel]: 8.03001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.61001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.86001e-06 [overlap_recompute_comm]: 4.99e-06 [overlap_grad_ring_attention]: 6.93998e-06 [overlap_grad_flash_sp]: 2.606e-05 [begin_end_overlap_inline]: 2.94001e-06 [split_matmul_comm_elemetwise]: 5.27001e-06 [split_layernorm_comm]: 4.60001e-06 [handle_group_info]: 3.88001e-06 [symbol_engine_optimizer]: 0.00010566, [1] [Cycle 1]: 9.86e-05, [6] [build]: 3.34001e-06 [elim_shapecalc]: 1.153e-05 [elim_not_effective]: 1.515e-05 [opt_reshape]: 8.74e-06 [fold_const_symbol]: 1.253e-05 [renormalize]: 1.69995e-07 [detach_backward]: 4.72e-06 [pipeline_parallel_scheduler]: 2.21e-06 [auto_monad_reorder]: 2.438e-05 [get_jit_bprop_graph]: 2.44999e-06 [rewriter_after_jit_bprop_graph]: 6.98e-06 [opt_after_jit_grad]: 0.00064332 [validate]: 5.187e-05 Sums bootstrap : 0.000502s : 1.07% type_inference : 0.040187s : 85.52% event_method : 0.000024s : 0.05% auto_monad : 0.000070s : 0.15% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000046s : 0.10% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000040s : 0.09% optimize.rewriter_before_opt_a : 0.000112s : 0.24% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000056s : 0.12% optimize.opt_a.loop_unroll : 0.000041s : 0.09% optimize.opt_a.a_1 : 0.001029s : 2.19% optimize.opt_a.with_stream_mark : 0.000043s : 0.09% optimize.opt_a.recompute_prepare : 0.000024s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000256s : 0.54% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.04% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.01% optimize.opt_a.shard_inline : 0.000024s : 0.05% optimize.opt_a.merge_send_recv : 0.000017s : 0.04% optimize.opt_a.auto_parallel : 0.000021s : 0.04% optimize.opt_a.parallel : 0.000029s : 0.06% optimize.opt_a.flash_sp : 0.000016s : 0.03% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000008s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.05% optimize.opt_a.virtual_dataset : 0.000016s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.03% optimize.opt_a.virtual_output : 0.000016s : 0.03% optimize.opt_a.merge_forward : 0.000010s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000041s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000028s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.02% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000027s : 0.06% optimize.opt_a.a_after_grad : 0.000025s : 0.05% optimize.opt_a.renormalize : 0.000888s : 1.89% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.07% optimize.opt_a.cse : 0.000059s : 0.13% optimize.opt_a.a_3 : 0.000137s : 0.29% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000048s : 0.10% optimize.convert_after_rewriter : 0.000010s : 0.02% optimize.order_py_execute_after_rewriter : 0.000009s : 0.02% optimize.mutable_eliminate : 0.000710s : 1.51% optimize.opt_b.b_1 : 0.000200s : 0.42% optimize.opt_b.b_2 : 0.000010s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.05% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000035s : 0.08% optimize.loop_unroll : 0.000481s : 1.02% optimize.opt_after_cconv.c_1 : 0.000039s : 0.08% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000086s : 0.18% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.04% optimize.tuple_transform.d_1 : 0.000066s : 0.14% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.02% optimize.partial_unused_args_eliminate : 0.000004s : 0.01% optimize.add_recomputation : 0.000060s : 0.13% optimize.cse_after_recomputation.cse : 0.000016s : 0.03% optimize.environ_conv : 0.000010s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000003s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000018s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000026s : 0.06% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000005s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000024s : 0.05% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000643s : 1.37% validate : 0.000052s : 0.11% Time group info: ------[substitution.] 0.000275 38 10.69% : 0.000029s : 3: substitution.cast_eliminate 0.93% : 0.000003s : 3: substitution.elim_not_effective 0.63% : 0.000002s : 3: substitution.fold_const_symbol 2.97% : 0.000008s : 5: substitution.graph_param_transform 70.83% : 0.000195s : 4: substitution.inline 2.40% : 0.000007s : 6: substitution.j_node_and_user_rematch 2.61% : 0.000007s : 6: substitution.remove_not_recompute_node 2.17% : 0.000006s : 4: substitution.replace_old_param 6.77% : 0.000019s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.040117 2 97.83% : 0.039247s : 1: type_inference.infer 2.17% : 0.000870s : 1: type_inference.specialize ------[replace.] 0.000071 8 60.70% : 0.000043s : 4: replace.inline 39.30% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000207 8 92.43% : 0.000192s : 4: match.inline 7.57% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000276 1596 0.98% : 0.000003s : 17: predicate.accumulaten_eliminater 0.83% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.68% : 0.000002s : 10: predicate.addn_check_dump 0.97% : 0.000003s : 17: predicate.addn_zero_filter 0.86% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.23% : 0.000006s : 27: predicate.arithmetic_simplify 0.97% : 0.000003s : 17: predicate.cast_eliminate 0.66% : 0.000002s : 10: predicate.check_bprop_eliminate 0.69% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.73% : 0.000002s : 10: predicate.depend_value_elim 0.93% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.20% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.92% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.91% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.18% : 0.000001s : 5: predicate.elim_not_effective 0.37% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.06% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.31% : 0.000004s : 22: predicate.environ_get_depend_swap 1.68% : 0.000005s : 32: predicate.environ_get_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.42% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.19% : 0.000006s : 25: predicate.float_depend_g_call 0.64% : 0.000002s : 10: predicate.float_environ_get_switch 0.85% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.64% : 0.000002s : 10: predicate.get_grad_eliminate 0.24% : 0.000001s : 5: predicate.graph_param_transform 0.55% : 0.000002s : 10: predicate.incorporate_call 0.45% : 0.000001s : 10: predicate.incorporate_call_switch 6.08% : 0.000017s : 72: predicate.inline 0.82% : 0.000002s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.78% : 0.000002s : 10: predicate.less_batch_normalization 1.91% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.49% : 0.000007s : 48: predicate.load_eliminater 0.91% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.01% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.76% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.62% : 0.000002s : 10: predicate.merge_addn 0.61% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.75% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 17: predicate.minmaximum_grad 1.09% : 0.000003s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.34% : 0.000001s : 5: predicate.parallel_virtual_node 1.70% : 0.000005s : 25: predicate.partial_defer_inline 1.56% : 0.000004s : 26: predicate.partial_eliminate 0.90% : 0.000002s : 17: predicate.print_const_string_wrapper 0.62% : 0.000002s : 10: predicate.reduce_all_const_elim 1.37% : 0.000004s : 17: predicate.reduce_eliminate 2.46% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 10: predicate.remove_not_recompute_node 1.24% : 0.000003s : 31: predicate.replace_applicator 0.57% : 0.000002s : 10: predicate.replace_old_param 0.22% : 0.000001s : 5: predicate.reset_defer_inline 1.08% : 0.000003s : 17: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 5: predicate.row_tensor_eliminate 0.75% : 0.000002s : 10: predicate.same_eliminate 0.50% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 10: predicate.shard_identity_eliminate 0.78% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 0.91% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.40% : 0.000004s : 25: predicate.switch_defer_inline 1.81% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.84% : 0.000013s : 76: predicate.switch_simplify 0.97% : 0.000003s : 17: predicate.tile_eliminate 1.05% : 0.000003s : 17: predicate.transpose_eliminate 1.68% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.85% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.14% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.71% : 0.000005s : 27: predicate.tuple_list_get_set_item_eliminator 2.38% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.67% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.36% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.20% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 5: predicate.value_based_eliminate 0.66% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.61% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000737 11 57.82% : 0.000426s : 5: func_graph_cloner_run.FuncGraphClonerGraph 42.18% : 0.000311s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.067279 192 0.01% : 0.000006s : 1: ForceFp32Comm 6.24% : 0.004198s : 1: add_attr 6.21% : 0.004175s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.10% : 0.000064s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.12% : 0.000080s : 1: auto_monad 0.05% : 0.000035s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.82% : 0.000551s : 1: bootstrap 0.06% : 0.000039s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.03% : 0.000022s : 1: control_data_broadcast_order 0.02% : 0.000014s : 1: convert_after_rewriter 0.05% : 0.000037s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000026s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.05% : 0.000036s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000012s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.02% : 0.000011s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000011s : 1: label_micro_interleaved_index 0.72% : 0.000487s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.07% : 0.000717s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.03% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000021s : 1: opt.transform.mutable_eliminate 2.37% : 0.001592s : 78: opt.transform.opt_a 0.06% : 0.000037s : 1: opt.transform.opt_after_cconv 0.06% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000134s : 28: opt.transform.opt_b 0.11% : 0.000072s : 2: opt.transform.opt_trans_graph 0.07% : 0.000044s : 4: opt.transform.symbol_engine_opt 5.77% : 0.003885s : 1: opt_a 0.32% : 0.000218s : 1: opt_after_cconv 0.97% : 0.000655s : 1: opt_after_jit_grad 0.49% : 0.000330s : 1: opt_b 11.39% : 0.007662s : 1: optimize 0.04% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000011s : 1: order_py_execute_after_rewriter 0.04% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.02% : 0.000013s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.08% : 0.000055s : 1: pre_auto_parallel 0.07% : 0.000044s : 1: py_interpret_to_execute 0.03% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.04% : 0.000024s : 1: remove_dup_value 0.76% : 0.000508s : 1: renormalize.infer 0.55% : 0.000370s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000015s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000052s : 1: rewriter_after_opt_a 0.17% : 0.000118s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.16% : 0.000109s : 1: symbol_engine_optimizer 0.18% : 0.000123s : 1: tuple_transform 59.81% : 0.040241s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:56.152.898 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0172473, [21] [bootstrap]: 0.00045456 [type_inference]: 0.00652727 [event_method]: 2.156e-05 [auto_monad]: 6.671e-05 [graph_reusing]: 6.16e-06 [inline]: 2.49001e-06 [add_attr]: 0.0034638, [1] [add_attr_with_inline]: 0.00345236, [1] [Cycle 1]: 9.22e-05, [2] [tag_attr]: 2.372e-05 [meta_addattr_fg_expand]: 6.12999e-06 [parallel-infer-symbol]: 3.78999e-06 [pre_auto_parallel]: 4.099e-05 [insert-virtual-dataset]: 2.39001e-06 [parallel-infer-symbol-second]: 8.30012e-07 [dataset_repeat_opt]: 2.32001e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.00587873, [53] [py_interpret_to_execute]: 3.021e-05 [rewriter_before_opt_a]: 9.242e-05 [opt_a]: 0.00341253, [2] [Cycle 1]: 0.0025714, [45] [expand_dump_flag]: 3.23e-06 [switch_simplify]: 4.659e-05 [loop_unroll]: 3.125e-05 [a_1]: 0.00077507 [with_stream_mark]: 2.125e-05 [recompute_prepare]: 1.223e-05 [updatestate_depend_eliminate]: 4.95001e-06 [updatestate_assign_eliminate]: 3.73999e-06 [updatestate_loads_eliminate]: 3.75998e-06 [parameter_eliminate]: 2.34001e-06 [a_2]: 0.00010235 [accelerated_algorithm]: 9.20999e-06 [shard]: 2.23998e-06 [meta_shard_fg_expand]: 2.83e-06 [shard_inline]: 7.83001e-06 [merge_send_recv]: 1.081e-05 [auto_parallel]: 9.17999e-06 [parallel]: 2.202e-05 [flash_sp]: 1.015e-05 [merge_comm]: 5.12e-06 [allreduce_fusion]: 4.33001e-06 [matmul_add_comm_reduction]: 1.108e-05 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 1.092e-05 [virtual_dataset]: 8.27e-06 [get_grad_eliminate_]: 7.88999e-06 [virtual_output]: 7.88001e-06 [merge_forward]: 4.77998e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 1.268e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.783e-05 [merge_recompute_call_nodes]: 2.07999e-06 [before_grad]: 1.306e-05 [set_forward_comm_id_for_comm_node_pass]: 4.39998e-06 [meta_fg_expand]: 3.41999e-06 [flash_sp_send_recv_attached]: 2.88e-06 [receive_attached]: 2.26998e-06 [after_resolve]: 1.358e-05 [a_after_grad]: 1.221e-05 [renormalize]: 0.00093834 [add_forward_monad_depend]: 7.68001e-06 [auto_monad_grad]: 2.77002e-06 [auto_monad_eliminator]: 2.072e-05 [cse]: 3.803e-05 [a_3]: 6.473e-05 [Cycle 2]: 0.00082971, [45] [expand_dump_flag]: 1.72999e-06 [switch_simplify]: 1.026e-05 [loop_unroll]: 7.68001e-06 [a_1]: 0.00018491 [with_stream_mark]: 1.835e-05 [recompute_prepare]: 8.27e-06 [updatestate_depend_eliminate]: 4.62998e-06 [updatestate_assign_eliminate]: 3.23e-06 [updatestate_loads_eliminate]: 3.46001e-06 [parameter_eliminate]: 1.71e-06 [a_2]: 9.326e-05 [accelerated_algorithm]: 8.60999e-06 [shard]: 2.12001e-06 [meta_shard_fg_expand]: 2.06e-06 [shard_inline]: 7.23e-06 [merge_send_recv]: 1.118e-05 [auto_parallel]: 7.75e-06 [parallel]: 6.88e-06 [flash_sp]: 3.66001e-06 [merge_comm]: 4.90999e-06 [allreduce_fusion]: 3.99002e-06 [matmul_add_comm_reduction]: 9.47999e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 1.073e-05 [virtual_dataset]: 8.14002e-06 [get_grad_eliminate_]: 7.08e-06 [virtual_output]: 6.89999e-06 [merge_forward]: 4.26001e-06 [cell_reuse_recompute_pass]: 2.46e-06 [offload_activation]: 1.055e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.643e-05 [merge_recompute_call_nodes]: 1.69998e-06 [before_grad]: 1.344e-05 [set_forward_comm_id_for_comm_node_pass]: 4.80999e-06 [meta_fg_expand]: 2.93e-06 [flash_sp_send_recv_attached]: 1.19e-06 [receive_attached]: 1.34e-06 [after_resolve]: 1.306e-05 [a_after_grad]: 1.153e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 2.29001e-06 [auto_monad_grad]: 1.37999e-06 [auto_monad_eliminator]: 1.126e-05 [cse]: 2.304e-05 [a_3]: 4.767e-05 [py_interpret_to_execute_after_opt_a]: 1.559e-05 [slice_cell_reuse_recomputed_activation]: 2.66e-06 [rewriter_after_opt_a]: 4.504e-05 [convert_after_rewriter]: 8.07998e-06 [order_py_execute_after_rewriter]: 5.64e-06 [mutable_eliminate]: 0.00069884 [opt_b]: 0.00026225, [1] [Cycle 1]: 0.00025501, [7] [b_1]: 0.00015902 [b_2]: 1.067e-05 [updatestate_depend_eliminate]: 9.24e-06 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 3.25e-06 [renormalize]: 7.80012e-07 [cse]: 3.148e-05 [optimize_parallel_all_gather_comm]: 1.962e-05 [overlap_param_gather]: 1.86998e-06 [cconv]: 3.146e-05 [loop_unroll]: 0.00047272 [opt_after_cconv]: 0.00012896, [1] [Cycle 1]: 0.00012276, [7] [c_1]: 3.919e-05 [parameter_eliminate]: 4.38999e-06 [updatestate_depend_eliminate]: 8.60001e-06 [updatestate_assign_eliminate]: 3.31999e-06 [updatestate_loads_eliminate]: 3.29001e-06 [cse]: 2.669e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.735e-05 [tuple_transform]: 9.2e-05, [1] [Cycle 1]: 8.672e-05, [4] [d_1]: 5.582e-05 [none_parameter_eliminate]: 1.72001e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 9.24e-06 [partial_unused_args_eliminate]: 2.22001e-06 [add_recomputation]: 6.555e-05 [cse_after_recomputation]: 2.851e-05, [1] [Cycle 1]: 2.328e-05, [1] [cse]: 1.691e-05 [environ_conv]: 7.86001e-06 [swap_dp_allreduce_reducescatter]: 6.26e-06 [bias_add_comm_swap]: 2.76e-06 [label_micro_interleaved_index]: 4.70999e-06 [label_fine_grained_interleaved_index]: 2.94001e-06 [merge_cast_opt]: 1.40999e-06 [slice_recompute_activation]: 2.06e-06 [micro_interleaved_order_control]: 2.16998e-06 [assign_add_opt]: 1.29e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 9.70002e-07 [full_micro_interleaved_order_control]: 2.69001e-06 [reorder_send_recv_between_fp_bp]: 2.96999e-06 [comm_op_add_attrs]: 1.08001e-06 [add_comm_op_reuse_tag]: 1.00999e-06 [interleave_split_concat_branches]: 1.10999e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.35001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.06998e-06 [control_data_broadcast_order]: 1.564e-05 [grouped_pairwise_exchange_alltoall]: 1.62999e-06 [offloading_packed_experts]: 4.68001e-06 [overlap_recompute_and_grad_model_parallel]: 5.52999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.30999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32999e-06 [overlap_recompute_comm]: 2.38002e-06 [overlap_grad_ring_attention]: 5.22e-06 [overlap_grad_flash_sp]: 2.664e-05 [begin_end_overlap_inline]: 6.39993e-07 [split_matmul_comm_elemetwise]: 2.43e-06 [split_layernorm_comm]: 2.01e-06 [handle_group_info]: 9.89996e-07 [symbol_engine_optimizer]: 9.582e-05, [1] [Cycle 1]: 9.125e-05, [6] [build]: 4.34002e-06 [elim_shapecalc]: 1.475e-05 [elim_not_effective]: 1.792e-05 [opt_reshape]: 9.10999e-06 [fold_const_symbol]: 1.34e-05 [renormalize]: 2.89991e-07 [detach_backward]: 2.25002e-06 [pipeline_parallel_scheduler]: 1.69e-06 [auto_monad_reorder]: 2.272e-05 [get_jit_bprop_graph]: 2.06e-06 [rewriter_after_jit_bprop_graph]: 5.41998e-06 [opt_after_jit_grad]: 0.00052724 [validate]: 4.774e-05 Sums bootstrap : 0.000455s : 3.56% type_inference : 0.006527s : 51.19% event_method : 0.000022s : 0.17% auto_monad : 0.000067s : 0.52% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000041s : 0.32% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.24% optimize.rewriter_before_opt_a : 0.000092s : 0.72% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000057s : 0.45% optimize.opt_a.loop_unroll : 0.000039s : 0.31% optimize.opt_a.a_1 : 0.000960s : 7.53% optimize.opt_a.with_stream_mark : 0.000040s : 0.31% optimize.opt_a.recompute_prepare : 0.000020s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000196s : 1.53% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.14% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.12% optimize.opt_a.merge_send_recv : 0.000022s : 0.17% optimize.opt_a.auto_parallel : 0.000017s : 0.13% optimize.opt_a.parallel : 0.000029s : 0.23% optimize.opt_a.flash_sp : 0.000014s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.17% optimize.opt_a.virtual_dataset : 0.000016s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.12% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000023s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.03% optimize.opt_a.before_grad : 0.000026s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.21% optimize.opt_a.a_after_grad : 0.000024s : 0.19% optimize.opt_a.renormalize : 0.000938s : 7.36% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.25% optimize.opt_a.cse : 0.000061s : 0.48% optimize.opt_a.a_3 : 0.000112s : 0.88% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.02% optimize.rewriter_after_opt_a : 0.000045s : 0.35% optimize.convert_after_rewriter : 0.000008s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.04% optimize.mutable_eliminate : 0.000699s : 5.48% optimize.opt_b.b_1 : 0.000159s : 1.25% optimize.opt_b.b_2 : 0.000011s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000031s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000031s : 0.25% optimize.loop_unroll : 0.000473s : 3.71% optimize.opt_after_cconv.c_1 : 0.000039s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000027s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.14% optimize.tuple_transform.d_1 : 0.000056s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000066s : 0.51% optimize.cse_after_recomputation.cse : 0.000017s : 0.13% optimize.environ_conv : 0.000008s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000027s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000023s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000527s : 4.13% validate : 0.000048s : 0.37% Time group info: ------[substitution.] 0.000236 38 11.35% : 0.000027s : 3: substitution.cast_eliminate 1.12% : 0.000003s : 3: substitution.elim_not_effective 0.76% : 0.000002s : 3: substitution.fold_const_symbol 2.84% : 0.000007s : 5: substitution.graph_param_transform 69.59% : 0.000164s : 4: substitution.inline 2.39% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.71% : 0.000006s : 6: substitution.remove_not_recompute_node 2.24% : 0.000005s : 4: substitution.replace_old_param 7.00% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006456 2 87.88% : 0.005673s : 1: type_inference.infer 12.12% : 0.000783s : 1: type_inference.specialize ------[replace.] 0.000067 8 60.16% : 0.000041s : 4: replace.inline 39.84% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000176 8 91.78% : 0.000161s : 4: match.inline 8.22% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000263 1596 0.98% : 0.000003s : 17: predicate.accumulaten_eliminater 0.76% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 10: predicate.addn_check_dump 1.10% : 0.000003s : 17: predicate.addn_zero_filter 0.85% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.05% : 0.000005s : 27: predicate.arithmetic_simplify 1.07% : 0.000003s : 17: predicate.cast_eliminate 0.58% : 0.000002s : 10: predicate.check_bprop_eliminate 0.55% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.61% : 0.000002s : 10: predicate.depend_value_elim 0.98% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.05% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.95% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.89% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.47% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.31% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.17% : 0.000003s : 22: predicate.environ_get_depend_swap 1.91% : 0.000005s : 32: predicate.environ_get_eliminate 1.10% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.44% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.41% : 0.000006s : 25: predicate.float_depend_g_call 0.55% : 0.000001s : 10: predicate.float_environ_get_switch 0.80% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.60% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.58% : 0.000002s : 10: predicate.incorporate_call 0.49% : 0.000001s : 10: predicate.incorporate_call_switch 6.22% : 0.000016s : 72: predicate.inline 0.76% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.87% : 0.000002s : 10: predicate.less_batch_normalization 1.76% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.54% : 0.000007s : 48: predicate.load_eliminater 0.86% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.96% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.74% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 10: predicate.merge_addn 0.54% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 10: predicate.mini_step_allgather_replace 1.08% : 0.000003s : 17: predicate.minmaximum_grad 1.33% : 0.000003s : 5: predicate.mutable_eliminate 0.32% : 0.000001s : 5: predicate.opt_reshape 0.45% : 0.000001s : 5: predicate.parallel_virtual_node 1.77% : 0.000005s : 25: predicate.partial_defer_inline 1.66% : 0.000004s : 26: predicate.partial_eliminate 0.98% : 0.000003s : 17: predicate.print_const_string_wrapper 0.54% : 0.000001s : 10: predicate.reduce_all_const_elim 1.23% : 0.000003s : 17: predicate.reduce_eliminate 2.55% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.65% : 0.000002s : 10: predicate.remove_not_recompute_node 1.46% : 0.000004s : 31: predicate.replace_applicator 0.48% : 0.000001s : 10: predicate.replace_old_param 0.32% : 0.000001s : 5: predicate.reset_defer_inline 1.05% : 0.000003s : 17: predicate.reshape_eliminate 0.68% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 5: predicate.row_tensor_eliminate 0.87% : 0.000002s : 10: predicate.same_eliminate 0.41% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.68% : 0.000002s : 10: predicate.shard_identity_eliminate 0.79% : 0.000002s : 10: predicate.special_op_eliminate 0.69% : 0.000002s : 10: predicate.specialize_transform 0.90% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.47% : 0.000004s : 25: predicate.switch_defer_inline 2.03% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.68% : 0.000012s : 76: predicate.switch_simplify 0.91% : 0.000002s : 17: predicate.tile_eliminate 0.91% : 0.000002s : 17: predicate.transpose_eliminate 1.59% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.28% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.50% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.19% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.72% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.46% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.15% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 5: predicate.value_based_eliminate 0.68% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.59% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000602 11 51.99% : 0.000313s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.01% : 0.000289s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.029234 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.87% : 0.003470s : 1: add_attr 11.82% : 0.003456s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000070s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.25% : 0.000073s : 1: auto_monad 0.09% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.66% : 0.000485s : 1: bootstrap 0.12% : 0.000035s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.11% : 0.000032s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.10% : 0.000029s : 1: event_method 0.02% : 0.000006s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.65% : 0.000483s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.43% : 0.000710s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.06% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000022s : 1: opt.transform.mutable_eliminate 5.12% : 0.001496s : 78: opt.transform.opt_a 0.13% : 0.000038s : 1: opt.transform.opt_after_cconv 0.10% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.47% : 0.000137s : 28: opt.transform.opt_b 0.21% : 0.000062s : 2: opt.transform.opt_trans_graph 0.17% : 0.000050s : 4: opt.transform.symbol_engine_opt 11.68% : 0.003416s : 1: opt_a 0.45% : 0.000132s : 1: opt_after_cconv 1.84% : 0.000538s : 1: opt_after_jit_grad 0.91% : 0.000266s : 1: opt_b 20.13% : 0.005884s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.11% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000045s : 1: pre_auto_parallel 0.12% : 0.000035s : 1: py_interpret_to_execute 0.06% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000021s : 1: remove_dup_value 1.83% : 0.000535s : 1: renormalize.infer 1.34% : 0.000392s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000049s : 1: rewriter_after_opt_a 0.33% : 0.000097s : 1: rewriter_before_opt_a 0.02% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000098s : 1: symbol_engine_optimizer 0.33% : 0.000095s : 1: tuple_transform 22.40% : 0.006549s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:56.632.285 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:56.632.599 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0510032, [21] [bootstrap]: 0.00045084 [type_inference]: 0.0237715 [event_method]: 2.306e-05 [auto_monad]: 6.968e-05 [graph_reusing]: 6.15002e-06 [inline]: 3.00998e-06 [add_attr]: 0.0037276, [1] [add_attr_with_inline]: 0.00371589, [1] [Cycle 1]: 9.664e-05, [2] [tag_attr]: 2.529e-05 [meta_addattr_fg_expand]: 6.46e-06 [parallel-infer-symbol]: 4.48999e-06 [pre_auto_parallel]: 4.324e-05 [insert-virtual-dataset]: 2.76e-06 [parallel-infer-symbol-second]: 8.60018e-07 [dataset_repeat_opt]: 2.15002e-06 [pipeline_split]: 1.69e-06 [optimize]: 0.0216046, [53] [py_interpret_to_execute]: 3.809e-05 [rewriter_before_opt_a]: 0.00011191 [opt_a]: 0.0187751, [2] [Cycle 1]: 0.0176988, [45] [expand_dump_flag]: 3.11999e-06 [switch_simplify]: 4.612e-05 [loop_unroll]: 3.237e-05 [a_1]: 0.00077417 [with_stream_mark]: 2.689e-05 [recompute_prepare]: 1.616e-05 [updatestate_depend_eliminate]: 5.37999e-06 [updatestate_assign_eliminate]: 4.32e-06 [updatestate_loads_eliminate]: 3.90998e-06 [parameter_eliminate]: 2.46e-06 [a_2]: 0.00013457 [accelerated_algorithm]: 1.088e-05 [shard]: 2.56e-06 [meta_shard_fg_expand]: 3.01001e-06 [shard_inline]: 8.42e-06 [merge_send_recv]: 1.222e-05 [auto_parallel]: 1.214e-05 [parallel]: 2.192e-05 [flash_sp]: 1.204e-05 [merge_comm]: 5.91e-06 [allreduce_fusion]: 4.35999e-06 [matmul_add_comm_reduction]: 1.142e-05 [allreduce_slice_to_reducescatter]: 9.00007e-07 [virtual_shard_identity]: 1.381e-05 [virtual_dataset]: 8.43999e-06 [get_grad_eliminate_]: 8.59e-06 [virtual_output]: 8.85001e-06 [merge_forward]: 5.19998e-06 [cell_reuse_recompute_pass]: 1.64e-06 [offload_activation]: 1.266e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.459e-05 [merge_recompute_call_nodes]: 1.96e-06 [before_grad]: 1.51e-05 [set_forward_comm_id_for_comm_node_pass]: 5.30001e-06 [meta_fg_expand]: 4.26001e-06 [flash_sp_send_recv_attached]: 3.75998e-06 [receive_attached]: 2.35002e-06 [after_resolve]: 1.64e-05 [a_after_grad]: 1.374e-05 [renormalize]: 0.0157602 [add_forward_monad_depend]: 1.24e-05 [auto_monad_grad]: 3.22002e-06 [auto_monad_eliminator]: 2.648e-05 [cse]: 4.135e-05 [a_3]: 8.65e-05 [Cycle 2]: 0.00105659, [45] [expand_dump_flag]: 2.43998e-06 [switch_simplify]: 1.121e-05 [loop_unroll]: 7.95e-06 [a_1]: 0.0001948 [with_stream_mark]: 2.149e-05 [recompute_prepare]: 8.89998e-06 [updatestate_depend_eliminate]: 4.99998e-06 [updatestate_assign_eliminate]: 3.9e-06 [updatestate_loads_eliminate]: 3.71001e-06 [parameter_eliminate]: 2.04999e-06 [a_2]: 0.00012341 [accelerated_algorithm]: 8.37e-06 [shard]: 2.39999e-06 [meta_shard_fg_expand]: 2.21998e-06 [shard_inline]: 7.88001e-06 [merge_send_recv]: 1.011e-05 [auto_parallel]: 1.078e-05 [parallel]: 1.318e-05 [flash_sp]: 4.27e-06 [merge_comm]: 4.82e-06 [allreduce_fusion]: 4.62e-06 [matmul_add_comm_reduction]: 1.208e-05 [allreduce_slice_to_reducescatter]: 8.50006e-07 [virtual_shard_identity]: 1.001e-05 [virtual_dataset]: 7.66999e-06 [get_grad_eliminate_]: 7.51999e-06 [virtual_output]: 7.88001e-06 [merge_forward]: 4.82e-06 [cell_reuse_recompute_pass]: 3.38e-06 [offload_activation]: 1.128e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.241e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 1.434e-05 [set_forward_comm_id_for_comm_node_pass]: 5.98002e-06 [meta_fg_expand]: 3.32002e-06 [flash_sp_send_recv_attached]: 1.98002e-06 [receive_attached]: 2.66e-06 [after_resolve]: 1.547e-05 [a_after_grad]: 1.219e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.59001e-06 [auto_monad_grad]: 2.34999e-06 [auto_monad_eliminator]: 1.298e-05 [cse]: 2.324e-05 [a_3]: 6.086e-05 [py_interpret_to_execute_after_opt_a]: 2.181e-05 [slice_cell_reuse_recomputed_activation]: 5.15001e-06 [rewriter_after_opt_a]: 5.254e-05 [convert_after_rewriter]: 1.16e-05 [order_py_execute_after_rewriter]: 8.54e-06 [mutable_eliminate]: 0.00075584 [opt_b]: 0.00032936, [1] [Cycle 1]: 0.00031901, [7] [b_1]: 0.0002031 [b_2]: 1.062e-05 [updatestate_depend_eliminate]: 9.00999e-06 [updatestate_assign_eliminate]: 3.6e-06 [updatestate_loads_eliminate]: 3.51999e-06 [renormalize]: 8.99978e-07 [cse]: 2.954e-05 [optimize_parallel_all_gather_comm]: 2.338e-05 [overlap_param_gather]: 5.34e-06 [cconv]: 3.775e-05 [loop_unroll]: 0.00049381 [opt_after_cconv]: 0.00014902, [1] [Cycle 1]: 0.00013894, [7] [c_1]: 3.816e-05 [parameter_eliminate]: 5.25999e-06 [updatestate_depend_eliminate]: 7.18998e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 3.53999e-06 [cse]: 2.404e-05 [renormalize]: 7.00005e-07 [remove_dup_value]: 1.92e-05 [tuple_transform]: 0.00013023, [1] [Cycle 1]: 0.00012198, [4] [d_1]: 5.853e-05 [none_parameter_eliminate]: 1.76e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 9.67001e-06 [partial_unused_args_eliminate]: 4.99e-06 [add_recomputation]: 6.802e-05 [cse_after_recomputation]: 3.471e-05, [1] [Cycle 1]: 2.684e-05, [1] [cse]: 1.703e-05 [environ_conv]: 1.022e-05 [swap_dp_allreduce_reducescatter]: 9.12001e-06 [bias_add_comm_swap]: 6.04999e-06 [label_micro_interleaved_index]: 7.7e-06 [label_fine_grained_interleaved_index]: 5.82001e-06 [merge_cast_opt]: 4.18999e-06 [slice_recompute_activation]: 4.89e-06 [micro_interleaved_order_control]: 4.35999e-06 [assign_add_opt]: 3.75e-06 [ForceFp32Comm]: 3.86001e-06 [remove_cast_before_assign_add]: 3.56999e-06 [full_micro_interleaved_order_control]: 4.70001e-06 [reorder_send_recv_between_fp_bp]: 5.32999e-06 [comm_op_add_attrs]: 4.09997e-06 [add_comm_op_reuse_tag]: 3.65e-06 [interleave_split_concat_branches]: 4.02e-06 [interleave_parallel_branches]: 3.41001e-06 [overlap_opt_shard_in_pipeline]: 3.94002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.12998e-06 [control_data_broadcast_order]: 1.949e-05 [grouped_pairwise_exchange_alltoall]: 4.02e-06 [offloading_packed_experts]: 7.31999e-06 [overlap_recompute_and_grad_model_parallel]: 7.7e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.60998e-06 [overlap_recompute_allgather_and_fa_grad]: 4.1e-06 [overlap_recompute_comm]: 5.12e-06 [overlap_grad_ring_attention]: 7.14001e-06 [overlap_grad_flash_sp]: 2.818e-05 [begin_end_overlap_inline]: 2.98e-06 [split_matmul_comm_elemetwise]: 5.52999e-06 [split_layernorm_comm]: 4.50001e-06 [handle_group_info]: 4.27e-06 [symbol_engine_optimizer]: 0.00010511, [1] [Cycle 1]: 9.78e-05, [6] [build]: 3.8e-06 [elim_shapecalc]: 1.191e-05 [elim_not_effective]: 1.522e-05 [opt_reshape]: 8.28999e-06 [fold_const_symbol]: 1.213e-05 [renormalize]: 2.19996e-07 [detach_backward]: 4.60999e-06 [pipeline_parallel_scheduler]: 1.74e-06 [auto_monad_reorder]: 2.439e-05 [get_jit_bprop_graph]: 1.94999e-06 [rewriter_after_jit_bprop_graph]: 7.38e-06 [opt_after_jit_grad]: 0.00056234 [validate]: 4.575e-05 Sums bootstrap : 0.000451s : 0.99% type_inference : 0.023772s : 52.44% event_method : 0.000023s : 0.05% auto_monad : 0.000070s : 0.15% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000043s : 0.10% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000038s : 0.08% optimize.rewriter_before_opt_a : 0.000112s : 0.25% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000057s : 0.13% optimize.opt_a.loop_unroll : 0.000040s : 0.09% optimize.opt_a.a_1 : 0.000969s : 2.14% optimize.opt_a.with_stream_mark : 0.000048s : 0.11% optimize.opt_a.recompute_prepare : 0.000025s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.02% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000258s : 0.57% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.04% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.04% optimize.opt_a.merge_send_recv : 0.000022s : 0.05% optimize.opt_a.auto_parallel : 0.000023s : 0.05% optimize.opt_a.parallel : 0.000035s : 0.08% optimize.opt_a.flash_sp : 0.000016s : 0.04% optimize.opt_a.merge_comm : 0.000011s : 0.02% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.05% optimize.opt_a.virtual_dataset : 0.000016s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.04% optimize.opt_a.virtual_output : 0.000017s : 0.04% optimize.opt_a.merge_forward : 0.000010s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000024s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000047s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000029s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.02% optimize.opt_a.meta_fg_expand : 0.000008s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000032s : 0.07% optimize.opt_a.a_after_grad : 0.000026s : 0.06% optimize.opt_a.renormalize : 0.015760s : 34.77% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.03% optimize.opt_a.auto_monad_grad : 0.000006s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000039s : 0.09% optimize.opt_a.cse : 0.000065s : 0.14% optimize.opt_a.a_3 : 0.000147s : 0.33% optimize.py_interpret_to_execute_after_opt_a : 0.000022s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000053s : 0.12% optimize.convert_after_rewriter : 0.000012s : 0.03% optimize.order_py_execute_after_rewriter : 0.000009s : 0.02% optimize.mutable_eliminate : 0.000756s : 1.67% optimize.opt_b.b_1 : 0.000203s : 0.45% optimize.opt_b.b_2 : 0.000011s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.05% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000038s : 0.08% optimize.loop_unroll : 0.000494s : 1.09% optimize.opt_after_cconv.c_1 : 0.000038s : 0.08% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000024s : 0.05% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.04% optimize.tuple_transform.d_1 : 0.000059s : 0.13% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.02% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000068s : 0.15% optimize.cse_after_recomputation.cse : 0.000017s : 0.04% optimize.environ_conv : 0.000010s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000004s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000019s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000028s : 0.06% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000006s : 0.01% optimize.split_layernorm_comm : 0.000005s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000024s : 0.05% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.02% opt_after_jit_grad : 0.000562s : 1.24% validate : 0.000046s : 0.10% Time group info: ------[substitution.] 0.000275 38 13.49% : 0.000037s : 3: substitution.cast_eliminate 0.90% : 0.000002s : 3: substitution.elim_not_effective 0.60% : 0.000002s : 3: substitution.fold_const_symbol 2.84% : 0.000008s : 5: substitution.graph_param_transform 68.68% : 0.000189s : 4: substitution.inline 2.44% : 0.000007s : 6: substitution.j_node_and_user_rematch 3.23% : 0.000009s : 6: substitution.remove_not_recompute_node 3.22% : 0.000009s : 4: substitution.replace_old_param 4.61% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.023708 2 96.57% : 0.022894s : 1: type_inference.infer 3.43% : 0.000814s : 1: type_inference.specialize ------[replace.] 0.000071 8 62.97% : 0.000045s : 4: replace.inline 37.03% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000197 8 94.48% : 0.000186s : 4: match.inline 5.52% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000257 1504 0.85% : 0.000002s : 15: predicate.accumulaten_eliminater 0.85% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.57% : 0.000001s : 10: predicate.addn_check_dump 0.98% : 0.000003s : 15: predicate.addn_zero_filter 0.77% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.39% : 0.000006s : 25: predicate.arithmetic_simplify 1.20% : 0.000003s : 15: predicate.cast_eliminate 0.66% : 0.000002s : 10: predicate.check_bprop_eliminate 0.55% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.70% : 0.000002s : 10: predicate.depend_value_elim 0.87% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.38% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.26% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.03% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.04% : 0.000003s : 20: predicate.environ_get_depend_swap 1.74% : 0.000004s : 30: predicate.environ_get_eliminate 1.05% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.32% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.26% : 0.000006s : 23: predicate.float_depend_g_call 0.54% : 0.000001s : 10: predicate.float_environ_get_switch 0.78% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.77% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.61% : 0.000002s : 10: predicate.incorporate_call 0.52% : 0.000001s : 10: predicate.incorporate_call_switch 6.40% : 0.000016s : 68: predicate.inline 0.89% : 0.000002s : 10: predicate.inline_without_move 0.35% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.14% : 0.000003s : 10: predicate.less_batch_normalization 1.82% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.38% : 0.000006s : 44: predicate.load_eliminater 0.70% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.02% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.90% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 0.67% : 0.000002s : 10: predicate.merge_addn 0.60% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.69% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 15: predicate.minmaximum_grad 1.05% : 0.000003s : 5: predicate.mutable_eliminate 0.38% : 0.000001s : 5: predicate.opt_reshape 0.34% : 0.000001s : 5: predicate.parallel_virtual_node 1.62% : 0.000004s : 23: predicate.partial_defer_inline 1.51% : 0.000004s : 24: predicate.partial_eliminate 0.92% : 0.000002s : 15: predicate.print_const_string_wrapper 0.71% : 0.000002s : 10: predicate.reduce_all_const_elim 1.12% : 0.000003s : 15: predicate.reduce_eliminate 2.37% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.58% : 0.000001s : 10: predicate.remove_not_recompute_node 1.36% : 0.000003s : 29: predicate.replace_applicator 0.56% : 0.000001s : 10: predicate.replace_old_param 0.28% : 0.000001s : 5: predicate.reset_defer_inline 0.99% : 0.000003s : 15: predicate.reshape_eliminate 0.60% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 5: predicate.row_tensor_eliminate 0.83% : 0.000002s : 10: predicate.same_eliminate 0.42% : 0.000001s : 10: predicate.set_cell_output_no_recompute 1.20% : 0.000003s : 10: predicate.shard_identity_eliminate 0.66% : 0.000002s : 10: predicate.special_op_eliminate 0.79% : 0.000002s : 10: predicate.specialize_transform 1.15% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.39% : 0.000004s : 23: predicate.switch_defer_inline 1.86% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.65% : 0.000012s : 74: predicate.switch_simplify 0.88% : 0.000002s : 15: predicate.tile_eliminate 0.91% : 0.000002s : 15: predicate.transpose_eliminate 1.50% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.58% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.48% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.60% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.76% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.30% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.09% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.52% : 0.000001s : 5: predicate.value_based_eliminate 0.71% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000722 11 51.25% : 0.000370s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.75% : 0.000352s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.093834 192 0.01% : 0.000006s : 1: ForceFp32Comm 3.99% : 0.003740s : 1: add_attr 3.96% : 0.003720s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.08% : 0.000072s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.08% : 0.000080s : 1: auto_monad 0.03% : 0.000032s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.53% : 0.000497s : 1: bootstrap 0.04% : 0.000041s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.02% : 0.000022s : 1: control_data_broadcast_order 0.02% : 0.000015s : 1: convert_after_rewriter 0.04% : 0.000038s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000020s : 1: detach_backward 0.01% : 0.000013s : 1: environ_conv 0.04% : 0.000034s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.53% : 0.000501s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 0.81% : 0.000764s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000021s : 1: opt.transform.mutable_eliminate 1.64% : 0.001543s : 78: opt.transform.opt_a 0.04% : 0.000037s : 1: opt.transform.opt_after_cconv 0.03% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.15% : 0.000138s : 28: opt.transform.opt_b 0.07% : 0.000066s : 2: opt.transform.opt_trans_graph 0.05% : 0.000044s : 4: opt.transform.symbol_engine_opt 20.01% : 0.018779s : 1: opt_a 0.16% : 0.000153s : 1: opt_after_cconv 0.61% : 0.000573s : 1: opt_after_jit_grad 0.35% : 0.000333s : 1: opt_b 23.42% : 0.021980s : 1: optimize 0.03% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000011s : 1: order_py_execute_after_rewriter 0.03% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000012s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.06% : 0.000052s : 1: pre_auto_parallel 0.04% : 0.000042s : 1: py_interpret_to_execute 0.03% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.02% : 0.000023s : 1: remove_dup_value 16.24% : 0.015236s : 1: renormalize.infer 0.54% : 0.000506s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000057s : 1: rewriter_after_opt_a 0.12% : 0.000116s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.12% : 0.000108s : 1: symbol_engine_optimizer 0.14% : 0.000133s : 1: tuple_transform 25.38% : 0.023819s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:57.113.858 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0386858, [21] [bootstrap]: 0.00045265 [type_inference]: 0.00675081 [event_method]: 2.241e-05 [auto_monad]: 7.161e-05 [graph_reusing]: 7.30998e-06 [inline]: 2.97002e-06 [add_attr]: 0.00358142, [1] [add_attr_with_inline]: 0.00357016, [1] [Cycle 1]: 7.841e-05, [2] [tag_attr]: 2.397e-05 [meta_addattr_fg_expand]: 6.14001e-06 [parallel-infer-symbol]: 3.24001e-06 [pre_auto_parallel]: 3.963e-05 [insert-virtual-dataset]: 2.49001e-06 [parallel-infer-symbol-second]: 7.79983e-07 [dataset_repeat_opt]: 1.92999e-06 [pipeline_split]: 1.60001e-06 [optimize]: 0.0266715, [53] [py_interpret_to_execute]: 3.005e-05 [rewriter_before_opt_a]: 9.492e-05 [opt_a]: 0.00351091, [2] [Cycle 1]: 0.00261782, [45] [expand_dump_flag]: 2.88998e-06 [switch_simplify]: 4.482e-05 [loop_unroll]: 3.196e-05 [a_1]: 0.0007467 [with_stream_mark]: 2.39e-05 [recompute_prepare]: 1.314e-05 [updatestate_depend_eliminate]: 5.15999e-06 [updatestate_assign_eliminate]: 3.81001e-06 [updatestate_loads_eliminate]: 3.50003e-06 [parameter_eliminate]: 2.16998e-06 [a_2]: 0.00010648 [accelerated_algorithm]: 8.72e-06 [shard]: 1.89e-06 [meta_shard_fg_expand]: 2.59001e-06 [shard_inline]: 8.62998e-06 [merge_send_recv]: 1.127e-05 [auto_parallel]: 1.012e-05 [parallel]: 2.111e-05 [flash_sp]: 1.05e-05 [merge_comm]: 5.57001e-06 [allreduce_fusion]: 4.33001e-06 [matmul_add_comm_reduction]: 1.203e-05 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 1.099e-05 [virtual_dataset]: 8.31002e-06 [get_grad_eliminate_]: 8.52e-06 [virtual_output]: 8.90999e-06 [merge_forward]: 4.95001e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 1.234e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.795e-05 [merge_recompute_call_nodes]: 1.99999e-06 [before_grad]: 1.518e-05 [set_forward_comm_id_for_comm_node_pass]: 4.87998e-06 [meta_fg_expand]: 3.41999e-06 [flash_sp_send_recv_attached]: 3.06001e-06 [receive_attached]: 2.12999e-06 [after_resolve]: 1.403e-05 [a_after_grad]: 1.307e-05 [renormalize]: 0.00097832 [add_forward_monad_depend]: 7.62998e-06 [auto_monad_grad]: 3.66999e-06 [auto_monad_eliminator]: 2.065e-05 [cse]: 4.101e-05 [a_3]: 7.226e-05 [Cycle 2]: 0.00088028, [45] [expand_dump_flag]: 2.01e-06 [switch_simplify]: 1.034e-05 [loop_unroll]: 8.37e-06 [a_1]: 0.00020217 [with_stream_mark]: 1.843e-05 [recompute_prepare]: 8.52e-06 [updatestate_depend_eliminate]: 5.02e-06 [updatestate_assign_eliminate]: 3.76001e-06 [updatestate_loads_eliminate]: 3.65e-06 [parameter_eliminate]: 2.12001e-06 [a_2]: 9.807e-05 [accelerated_algorithm]: 8.17998e-06 [shard]: 2.44001e-06 [meta_shard_fg_expand]: 2.58e-06 [shard_inline]: 1.174e-05 [merge_send_recv]: 8.37e-06 [auto_parallel]: 9.54999e-06 [parallel]: 7.81001e-06 [flash_sp]: 4.04997e-06 [merge_comm]: 4.57998e-06 [allreduce_fusion]: 4.23001e-06 [matmul_add_comm_reduction]: 8.66002e-06 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 1.018e-05 [virtual_dataset]: 7.38999e-06 [get_grad_eliminate_]: 7.29001e-06 [virtual_output]: 7.16999e-06 [merge_forward]: 7.02002e-06 [cell_reuse_recompute_pass]: 2.95002e-06 [offload_activation]: 1.15e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.847e-05 [merge_recompute_call_nodes]: 1.32e-06 [before_grad]: 1.445e-05 [set_forward_comm_id_for_comm_node_pass]: 5.27999e-06 [meta_fg_expand]: 3.14001e-06 [flash_sp_send_recv_attached]: 1.81e-06 [receive_attached]: 2.01e-06 [after_resolve]: 1.478e-05 [a_after_grad]: 1.177e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.11e-06 [auto_monad_grad]: 1.17999e-06 [auto_monad_eliminator]: 1.202e-05 [cse]: 2.78e-05 [a_3]: 4.893e-05 [py_interpret_to_execute_after_opt_a]: 1.834e-05 [slice_cell_reuse_recomputed_activation]: 2.04999e-06 [rewriter_after_opt_a]: 4.65e-05 [convert_after_rewriter]: 8.70001e-06 [order_py_execute_after_rewriter]: 6.07001e-06 [mutable_eliminate]: 0.00076825 [opt_b]: 0.0204346, [1] [Cycle 1]: 0.0204226, [7] [b_1]: 0.0202555 [b_2]: 1.651e-05 [updatestate_depend_eliminate]: 1.569e-05 [updatestate_assign_eliminate]: 4.03001e-06 [updatestate_loads_eliminate]: 3.81001e-06 [renormalize]: 1.18001e-06 [cse]: 5.068e-05 [optimize_parallel_all_gather_comm]: 2.867e-05 [overlap_param_gather]: 2.01e-06 [cconv]: 4.291e-05 [loop_unroll]: 0.00079844 [opt_after_cconv]: 0.00015311, [1] [Cycle 1]: 0.00014304, [7] [c_1]: 4.374e-05 [parameter_eliminate]: 6.04999e-06 [updatestate_depend_eliminate]: 1.061e-05 [updatestate_assign_eliminate]: 3.43999e-06 [updatestate_loads_eliminate]: 3.25e-06 [cse]: 3.708e-05 [renormalize]: 9.60019e-07 [remove_dup_value]: 1.916e-05 [tuple_transform]: 0.00010429, [1] [Cycle 1]: 9.825e-05, [4] [d_1]: 6.641e-05 [none_parameter_eliminate]: 1.94e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 9.32999e-06 [partial_unused_args_eliminate]: 2.12001e-06 [add_recomputation]: 6.901e-05 [cse_after_recomputation]: 3.047e-05, [1] [Cycle 1]: 2.512e-05, [1] [cse]: 1.834e-05 [environ_conv]: 8.24998e-06 [swap_dp_allreduce_reducescatter]: 7.22002e-06 [bias_add_comm_swap]: 3.58e-06 [label_micro_interleaved_index]: 8.26002e-06 [label_fine_grained_interleaved_index]: 3.45e-06 [merge_cast_opt]: 1.52001e-06 [slice_recompute_activation]: 2.20002e-06 [micro_interleaved_order_control]: 2.54001e-06 [assign_add_opt]: 1.45999e-06 [ForceFp32Comm]: 1.29e-06 [remove_cast_before_assign_add]: 1.19e-06 [full_micro_interleaved_order_control]: 2.69001e-06 [reorder_send_recv_between_fp_bp]: 2.80002e-06 [comm_op_add_attrs]: 1.18001e-06 [add_comm_op_reuse_tag]: 1.00999e-06 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.07998e-06 [overlap_opt_shard_in_pipeline]: 1.54e-06 [overlap_opt_shard_grad_in_pipeline]: 2.53e-06 [control_data_broadcast_order]: 1.912e-05 [grouped_pairwise_exchange_alltoall]: 1.87999e-06 [offloading_packed_experts]: 5.39e-06 [overlap_recompute_and_grad_model_parallel]: 6.17001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.37e-06 [overlap_recompute_allgather_and_fa_grad]: 1.78002e-06 [overlap_recompute_comm]: 2.53998e-06 [overlap_grad_ring_attention]: 5.13002e-06 [overlap_grad_flash_sp]: 2.556e-05 [begin_end_overlap_inline]: 4.90021e-07 [split_matmul_comm_elemetwise]: 2.45002e-06 [split_layernorm_comm]: 2.11e-06 [handle_group_info]: 1.37e-06 [symbol_engine_optimizer]: 0.00010478, [1] [Cycle 1]: 9.98e-05, [6] [build]: 5.02999e-06 [elim_shapecalc]: 1.616e-05 [elim_not_effective]: 1.977e-05 [opt_reshape]: 9.96e-06 [fold_const_symbol]: 1.482e-05 [renormalize]: 2.50002e-07 [detach_backward]: 2.54001e-06 [pipeline_parallel_scheduler]: 1.79e-06 [auto_monad_reorder]: 2.382e-05 [get_jit_bprop_graph]: 2.31e-06 [rewriter_after_jit_bprop_graph]: 7.31001e-06 [opt_after_jit_grad]: 0.0007967 [validate]: 6.18e-05 Sums bootstrap : 0.000453s : 1.33% type_inference : 0.006751s : 19.88% event_method : 0.000022s : 0.07% auto_monad : 0.000072s : 0.21% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000040s : 0.12% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000030s : 0.09% optimize.rewriter_before_opt_a : 0.000095s : 0.28% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000055s : 0.16% optimize.opt_a.loop_unroll : 0.000040s : 0.12% optimize.opt_a.a_1 : 0.000949s : 2.79% optimize.opt_a.with_stream_mark : 0.000042s : 0.12% optimize.opt_a.recompute_prepare : 0.000022s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000205s : 0.60% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.05% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000020s : 0.06% optimize.opt_a.merge_send_recv : 0.000020s : 0.06% optimize.opt_a.auto_parallel : 0.000020s : 0.06% optimize.opt_a.parallel : 0.000029s : 0.09% optimize.opt_a.flash_sp : 0.000015s : 0.04% optimize.opt_a.merge_comm : 0.000010s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.06% optimize.opt_a.virtual_dataset : 0.000016s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.05% optimize.opt_a.virtual_output : 0.000016s : 0.05% optimize.opt_a.merge_forward : 0.000012s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000024s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000030s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000029s : 0.08% optimize.opt_a.a_after_grad : 0.000025s : 0.07% optimize.opt_a.renormalize : 0.000978s : 2.88% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.03% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.10% optimize.opt_a.cse : 0.000069s : 0.20% optimize.opt_a.a_3 : 0.000121s : 0.36% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000047s : 0.14% optimize.convert_after_rewriter : 0.000009s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000768s : 2.26% optimize.opt_b.b_1 : 0.020256s : 59.65% optimize.opt_b.b_2 : 0.000017s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000016s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000051s : 0.15% optimize.optimize_parallel_all_gather_comm : 0.000029s : 0.08% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000043s : 0.13% optimize.loop_unroll : 0.000798s : 2.35% optimize.opt_after_cconv.c_1 : 0.000044s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000037s : 0.11% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.06% optimize.tuple_transform.d_1 : 0.000066s : 0.20% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000069s : 0.20% optimize.cse_after_recomputation.cse : 0.000018s : 0.05% optimize.environ_conv : 0.000008s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.02% optimize.bias_add_comm_swap : 0.000004s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.01% optimize.control_data_broadcast_order : 0.000019s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000026s : 0.08% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.02% opt_after_jit_grad : 0.000797s : 2.35% validate : 0.000062s : 0.18% Time group info: ------[substitution.] 0.000246 38 13.02% : 0.000032s : 3: substitution.cast_eliminate 1.19% : 0.000003s : 3: substitution.elim_not_effective 1.06% : 0.000003s : 3: substitution.fold_const_symbol 3.14% : 0.000008s : 5: substitution.graph_param_transform 68.16% : 0.000168s : 4: substitution.inline 2.52% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.92% : 0.000007s : 6: substitution.remove_not_recompute_node 2.73% : 0.000007s : 4: substitution.replace_old_param 5.24% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006678 2 88.13% : 0.005885s : 1: type_inference.infer 11.87% : 0.000792s : 1: type_inference.specialize ------[replace.] 0.000068 8 60.94% : 0.000041s : 4: replace.inline 39.06% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000176 8 93.76% : 0.000165s : 4: match.inline 6.24% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000276 1504 0.80% : 0.000002s : 15: predicate.accumulaten_eliminater 1.36% : 0.000004s : 5: predicate.ad_related_special_op_eliminate 0.57% : 0.000002s : 10: predicate.addn_check_dump 0.84% : 0.000002s : 15: predicate.addn_zero_filter 0.73% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.01% : 0.000006s : 25: predicate.arithmetic_simplify 0.98% : 0.000003s : 15: predicate.cast_eliminate 0.63% : 0.000002s : 10: predicate.check_bprop_eliminate 0.54% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.65% : 0.000002s : 10: predicate.depend_value_elim 0.97% : 0.000003s : 15: predicate.dict_get_item_const_eliminator 1.08% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.07% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 5: predicate.elim_not_effective 0.49% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.00% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.22% : 0.000003s : 20: predicate.environ_get_depend_swap 1.73% : 0.000005s : 30: predicate.environ_get_eliminate 1.16% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.24% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.05% : 0.000006s : 23: predicate.float_depend_g_call 0.56% : 0.000002s : 10: predicate.float_environ_get_switch 1.37% : 0.000004s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 5: predicate.fold_const_symbol 0.90% : 0.000002s : 10: predicate.get_grad_eliminate 0.26% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.19% : 0.000017s : 68: predicate.inline 0.73% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.88% : 0.000002s : 10: predicate.less_batch_normalization 1.74% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.37% : 0.000007s : 44: predicate.load_eliminater 1.45% : 0.000004s : 5: predicate.loop_unroll_after_grad 1.88% : 0.000005s : 36: predicate.loop_unroll_before_grad 2.23% : 0.000006s : 25: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 10: predicate.merge_addn 0.56% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.60% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 15: predicate.minmaximum_grad 1.11% : 0.000003s : 5: predicate.mutable_eliminate 0.43% : 0.000001s : 5: predicate.opt_reshape 0.40% : 0.000001s : 5: predicate.parallel_virtual_node 1.59% : 0.000004s : 23: predicate.partial_defer_inline 1.44% : 0.000004s : 24: predicate.partial_eliminate 0.89% : 0.000002s : 15: predicate.print_const_string_wrapper 0.65% : 0.000002s : 10: predicate.reduce_all_const_elim 1.13% : 0.000003s : 15: predicate.reduce_eliminate 2.41% : 0.000007s : 44: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 10: predicate.remove_not_recompute_node 1.23% : 0.000003s : 29: predicate.replace_applicator 0.69% : 0.000002s : 10: predicate.replace_old_param 0.54% : 0.000001s : 5: predicate.reset_defer_inline 1.01% : 0.000003s : 15: predicate.reshape_eliminate 0.61% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 5: predicate.row_tensor_eliminate 0.96% : 0.000003s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.79% : 0.000002s : 10: predicate.shard_identity_eliminate 1.10% : 0.000003s : 10: predicate.special_op_eliminate 0.72% : 0.000002s : 10: predicate.specialize_transform 1.01% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 1.07% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.36% : 0.000004s : 23: predicate.switch_defer_inline 1.90% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.35% : 0.000012s : 74: predicate.switch_simplify 0.92% : 0.000003s : 15: predicate.tile_eliminate 0.85% : 0.000002s : 15: predicate.transpose_eliminate 1.69% : 0.000005s : 25: predicate.tuple_list_convert_item_index_to_positive 1.48% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.50% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.29% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.82% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.24% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.00% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.48% : 0.000001s : 5: predicate.value_based_eliminate 0.73% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.56% : 0.000002s : 10: predicate.virtual_output_eliminate 0.33% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000637 11 51.99% : 0.000331s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.01% : 0.000306s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.091741 192 0.00% : 0.000004s : 1: ForceFp32Comm 3.91% : 0.003588s : 1: add_attr 3.90% : 0.003574s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.08% : 0.000074s : 1: add_recomputation 0.01% : 0.000005s : 1: assign_add_opt 0.09% : 0.000079s : 1: auto_monad 0.03% : 0.000029s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.53% : 0.000483s : 1: bootstrap 0.05% : 0.000047s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000023s : 1: control_data_broadcast_order 0.01% : 0.000012s : 1: convert_after_rewriter 0.04% : 0.000034s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.03% : 0.000029s : 1: event_method 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000005s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 0.89% : 0.000814s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 0.86% : 0.000785s : 1: mutable_eliminate 0.01% : 0.000009s : 1: offloading_packed_experts 0.03% : 0.000027s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000029s : 1: opt.transform.mutable_eliminate 1.65% : 0.001512s : 78: opt.transform.opt_a 0.05% : 0.000042s : 1: opt.transform.opt_after_cconv 0.05% : 0.000046s : 1: opt.transform.opt_after_jit_grad 22.03% : 0.020213s : 28: opt.transform.opt_b 0.08% : 0.000073s : 2: opt.transform.opt_trans_graph 0.06% : 0.000056s : 4: opt.transform.symbol_engine_opt 3.83% : 0.003515s : 1: opt_a 0.17% : 0.000157s : 1: opt_after_cconv 0.89% : 0.000814s : 1: opt_after_jit_grad 22.28% : 0.020440s : 1: opt_b 29.08% : 0.026678s : 1: optimize 0.04% : 0.000033s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.03% : 0.000030s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000006s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.05% : 0.000044s : 1: pre_auto_parallel 0.04% : 0.000034s : 1: py_interpret_to_execute 0.02% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000023s : 1: remove_dup_value 0.58% : 0.000536s : 1: renormalize.infer 0.47% : 0.000431s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000051s : 1: rewriter_after_opt_a 0.11% : 0.000099s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.12% : 0.000108s : 1: symbol_engine_optimizer 0.12% : 0.000107s : 1: tuple_transform 7.38% : 0.006773s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:57.551.226 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:57.551.552 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0646974, [21] [bootstrap]: 0.00047858 [type_inference]: 0.0350448 [event_method]: 2.397e-05 [auto_monad]: 7.216e-05 [graph_reusing]: 6.54999e-06 [inline]: 2.88e-06 [add_attr]: 0.00414424, [1] [add_attr_with_inline]: 0.00412952, [1] [Cycle 1]: 0.00010153, [2] [tag_attr]: 2.593e-05 [meta_addattr_fg_expand]: 6.36e-06 [parallel-infer-symbol]: 3.89002e-06 [pre_auto_parallel]: 4.424e-05 [insert-virtual-dataset]: 2.48e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 1.95001e-06 [pipeline_split]: 1.93002e-06 [optimize]: 0.0230238, [53] [py_interpret_to_execute]: 3.742e-05 [rewriter_before_opt_a]: 0.00010545 [opt_a]: 0.00397705, [2] [Cycle 1]: 0.00291818, [45] [expand_dump_flag]: 3.33e-06 [switch_simplify]: 4.477e-05 [loop_unroll]: 3.203e-05 [a_1]: 0.00079875 [with_stream_mark]: 2.446e-05 [recompute_prepare]: 1.341e-05 [updatestate_depend_eliminate]: 5.39998e-06 [updatestate_assign_eliminate]: 4.38999e-06 [updatestate_loads_eliminate]: 3.86001e-06 [parameter_eliminate]: 2.41e-06 [a_2]: 0.0001388 [accelerated_algorithm]: 9.99001e-06 [shard]: 2.24999e-06 [meta_shard_fg_expand]: 3.06001e-06 [shard_inline]: 8.37e-06 [merge_send_recv]: 1.1e-05 [auto_parallel]: 1.018e-05 [parallel]: 2.288e-05 [flash_sp]: 1.061e-05 [merge_comm]: 5.62001e-06 [allreduce_fusion]: 4.58999e-06 [matmul_add_comm_reduction]: 1.108e-05 [allreduce_slice_to_reducescatter]: 9.39996e-07 [virtual_shard_identity]: 1.094e-05 [virtual_dataset]: 8.79e-06 [get_grad_eliminate_]: 8.40001e-06 [virtual_output]: 9.00001e-06 [merge_forward]: 4.72e-06 [cell_reuse_recompute_pass]: 1.74e-06 [offload_activation]: 1.363e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.071e-05 [merge_recompute_call_nodes]: 1.82999e-06 [before_grad]: 1.525e-05 [set_forward_comm_id_for_comm_node_pass]: 4.60001e-06 [meta_fg_expand]: 4.64002e-06 [flash_sp_send_recv_attached]: 2.69001e-06 [receive_attached]: 2.96001e-06 [after_resolve]: 1.466e-05 [a_after_grad]: 1.33e-05 [renormalize]: 0.00100637 [add_forward_monad_depend]: 7.81001e-06 [auto_monad_grad]: 3.4e-06 [auto_monad_eliminator]: 2.158e-05 [cse]: 3.954e-05 [a_3]: 8.358e-05 [Cycle 2]: 0.00103942, [45] [expand_dump_flag]: 2.71999e-06 [switch_simplify]: 1.092e-05 [loop_unroll]: 7.78999e-06 [a_1]: 0.00019472 [with_stream_mark]: 1.855e-05 [recompute_prepare]: 7.8e-06 [updatestate_depend_eliminate]: 5.09998e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 3.32002e-06 [parameter_eliminate]: 1.81e-06 [a_2]: 0.00012327 [accelerated_algorithm]: 8.24002e-06 [shard]: 2.02999e-06 [meta_shard_fg_expand]: 2.92002e-06 [shard_inline]: 8.50001e-06 [merge_send_recv]: 9.71998e-06 [auto_parallel]: 9.19e-06 [parallel]: 7.91001e-06 [flash_sp]: 4.01001e-06 [merge_comm]: 4.47998e-06 [allreduce_fusion]: 4.21001e-06 [matmul_add_comm_reduction]: 8.96998e-06 [allreduce_slice_to_reducescatter]: 8.80013e-07 [virtual_shard_identity]: 1.055e-05 [virtual_dataset]: 7.49002e-06 [get_grad_eliminate_]: 7.45e-06 [virtual_output]: 7.46999e-06 [merge_forward]: 9.04998e-06 [cell_reuse_recompute_pass]: 2.93998e-06 [offload_activation]: 1.05e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.982e-05 [merge_recompute_call_nodes]: 1.00001e-06 [before_grad]: 1.356e-05 [set_forward_comm_id_for_comm_node_pass]: 5.22e-06 [meta_fg_expand]: 3.78001e-06 [flash_sp_send_recv_attached]: 1.66e-06 [receive_attached]: 1.74998e-06 [after_resolve]: 1.388e-05 [a_after_grad]: 1.181e-05 [renormalize]: 1.8999e-07 [add_forward_monad_depend]: 2.19999e-06 [auto_monad_grad]: 2.16e-06 [auto_monad_eliminator]: 1.442e-05 [cse]: 2.523e-05 [a_3]: 6.191e-05 [py_interpret_to_execute_after_opt_a]: 2.2e-05 [slice_cell_reuse_recomputed_activation]: 5.00999e-06 [rewriter_after_opt_a]: 5.109e-05 [convert_after_rewriter]: 1.06e-05 [order_py_execute_after_rewriter]: 8.76997e-06 [mutable_eliminate]: 0.0169069 [opt_b]: 0.00036907, [1] [Cycle 1]: 0.00035579, [7] [b_1]: 0.00021343 [b_2]: 1.077e-05 [updatestate_depend_eliminate]: 1.367e-05 [updatestate_assign_eliminate]: 4.00998e-06 [updatestate_loads_eliminate]: 4.05e-06 [renormalize]: 1.30999e-06 [cse]: 4.721e-05 [optimize_parallel_all_gather_comm]: 2.89e-05 [overlap_param_gather]: 5.30001e-06 [cconv]: 4.191e-05 [loop_unroll]: 0.0005186 [opt_after_cconv]: 0.00015298, [1] [Cycle 1]: 0.00014318, [7] [c_1]: 3.87e-05 [parameter_eliminate]: 6.58e-06 [updatestate_depend_eliminate]: 7.91001e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 3.09999e-06 [cse]: 2.614e-05 [renormalize]: 5.3001e-07 [remove_dup_value]: 1.999e-05 [tuple_transform]: 0.00010746, [1] [Cycle 1]: 9.907e-05, [4] [d_1]: 5.823e-05 [none_parameter_eliminate]: 1.47001e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 8.46002e-06 [partial_unused_args_eliminate]: 4.99998e-06 [add_recomputation]: 6.691e-05 [cse_after_recomputation]: 3.462e-05, [1] [Cycle 1]: 2.671e-05, [1] [cse]: 1.674e-05 [environ_conv]: 1.076e-05 [swap_dp_allreduce_reducescatter]: 9.05001e-06 [bias_add_comm_swap]: 6.42001e-06 [label_micro_interleaved_index]: 7.38e-06 [label_fine_grained_interleaved_index]: 5.40001e-06 [merge_cast_opt]: 4.25e-06 [slice_recompute_activation]: 5.37001e-06 [micro_interleaved_order_control]: 4.67e-06 [assign_add_opt]: 3.7e-06 [ForceFp32Comm]: 3.60998e-06 [remove_cast_before_assign_add]: 3.55998e-06 [full_micro_interleaved_order_control]: 4.79e-06 [reorder_send_recv_between_fp_bp]: 6.07999e-06 [comm_op_add_attrs]: 4.21001e-06 [add_comm_op_reuse_tag]: 3.25e-06 [interleave_split_concat_branches]: 3.54002e-06 [interleave_parallel_branches]: 3.47002e-06 [overlap_opt_shard_in_pipeline]: 3.56999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.60001e-06 [control_data_broadcast_order]: 1.936e-05 [grouped_pairwise_exchange_alltoall]: 4.13999e-06 [offloading_packed_experts]: 7.45e-06 [overlap_recompute_and_grad_model_parallel]: 8.00999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.51001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.88001e-06 [overlap_recompute_comm]: 4.99e-06 [overlap_grad_ring_attention]: 7.26001e-06 [overlap_grad_flash_sp]: 2.884e-05 [begin_end_overlap_inline]: 3.02002e-06 [split_matmul_comm_elemetwise]: 4.99e-06 [split_layernorm_comm]: 4.3e-06 [handle_group_info]: 3.36999e-06 [symbol_engine_optimizer]: 0.00011619, [1] [Cycle 1]: 0.00010848, [6] [build]: 4.92e-06 [elim_shapecalc]: 1.496e-05 [elim_not_effective]: 1.789e-05 [opt_reshape]: 9.05999e-06 [fold_const_symbol]: 1.282e-05 [renormalize]: 2.10013e-07 [detach_backward]: 5.74999e-06 [pipeline_parallel_scheduler]: 1.85001e-06 [auto_monad_reorder]: 2.893e-05 [get_jit_bprop_graph]: 1.94999e-06 [rewriter_after_jit_bprop_graph]: 6.74999e-06 [opt_after_jit_grad]: 0.00073119 [validate]: 5.478e-05 Sums bootstrap : 0.000479s : 0.82% type_inference : 0.035045s : 60.15% event_method : 0.000024s : 0.04% auto_monad : 0.000072s : 0.12% graph_reusing : 0.000007s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000044s : 0.08% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000037s : 0.06% optimize.rewriter_before_opt_a : 0.000105s : 0.18% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000056s : 0.10% optimize.opt_a.loop_unroll : 0.000040s : 0.07% optimize.opt_a.a_1 : 0.000993s : 1.71% optimize.opt_a.with_stream_mark : 0.000043s : 0.07% optimize.opt_a.recompute_prepare : 0.000021s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000262s : 0.45% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.03% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.01% optimize.opt_a.shard_inline : 0.000017s : 0.03% optimize.opt_a.merge_send_recv : 0.000021s : 0.04% optimize.opt_a.auto_parallel : 0.000019s : 0.03% optimize.opt_a.parallel : 0.000031s : 0.05% optimize.opt_a.flash_sp : 0.000015s : 0.03% optimize.opt_a.merge_comm : 0.000010s : 0.02% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.04% optimize.opt_a.virtual_dataset : 0.000016s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.03% optimize.opt_a.virtual_output : 0.000016s : 0.03% optimize.opt_a.merge_forward : 0.000014s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000024s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000041s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000029s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.02% optimize.opt_a.meta_fg_expand : 0.000008s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000029s : 0.05% optimize.opt_a.a_after_grad : 0.000025s : 0.04% optimize.opt_a.renormalize : 0.001007s : 1.73% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.02% optimize.opt_a.auto_monad_grad : 0.000006s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000036s : 0.06% optimize.opt_a.cse : 0.000065s : 0.11% optimize.opt_a.a_3 : 0.000145s : 0.25% optimize.py_interpret_to_execute_after_opt_a : 0.000022s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000051s : 0.09% optimize.convert_after_rewriter : 0.000011s : 0.02% optimize.order_py_execute_after_rewriter : 0.000009s : 0.02% optimize.mutable_eliminate : 0.016907s : 29.02% optimize.opt_b.b_1 : 0.000213s : 0.37% optimize.opt_b.b_2 : 0.000011s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000014s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000047s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000029s : 0.05% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000042s : 0.07% optimize.loop_unroll : 0.000519s : 0.89% optimize.opt_after_cconv.c_1 : 0.000039s : 0.07% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000026s : 0.04% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.03% optimize.tuple_transform.d_1 : 0.000058s : 0.10% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000067s : 0.11% optimize.cse_after_recomputation.cse : 0.000017s : 0.03% optimize.environ_conv : 0.000011s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000019s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000029s : 0.05% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000029s : 0.05% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000731s : 1.25% validate : 0.000055s : 0.09% Time group info: ------[substitution.] 0.000265 38 12.29% : 0.000033s : 3: substitution.cast_eliminate 0.97% : 0.000003s : 3: substitution.elim_not_effective 0.65% : 0.000002s : 3: substitution.fold_const_symbol 2.78% : 0.000007s : 5: substitution.graph_param_transform 70.08% : 0.000185s : 4: substitution.inline 2.55% : 0.000007s : 6: substitution.j_node_and_user_rematch 2.92% : 0.000008s : 6: substitution.remove_not_recompute_node 2.48% : 0.000007s : 4: substitution.replace_old_param 5.27% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.034971 2 97.58% : 0.034125s : 1: type_inference.infer 2.42% : 0.000846s : 1: type_inference.specialize ------[replace.] 0.000071 8 64.47% : 0.000046s : 4: replace.inline 35.53% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000195 8 93.87% : 0.000183s : 4: match.inline 6.13% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000383 1504 0.66% : 0.000003s : 15: predicate.accumulaten_eliminater 28.87% : 0.000111s : 5: predicate.ad_related_special_op_eliminate 0.37% : 0.000001s : 10: predicate.addn_check_dump 0.69% : 0.000003s : 15: predicate.addn_zero_filter 0.57% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.67% : 0.000006s : 25: predicate.arithmetic_simplify 0.74% : 0.000003s : 15: predicate.cast_eliminate 0.53% : 0.000002s : 10: predicate.check_bprop_eliminate 0.39% : 0.000001s : 10: predicate.compare_switch_simplify 0.13% : 0.000000s : 5: predicate.const_output_eliminate 0.49% : 0.000002s : 10: predicate.depend_value_elim 0.69% : 0.000003s : 15: predicate.dict_get_item_const_eliminator 0.81% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.63% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.77% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.14% : 0.000001s : 5: predicate.elim_not_effective 0.28% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 0.86% : 0.000003s : 20: predicate.environ_add_const_eliminate 0.69% : 0.000003s : 20: predicate.environ_get_add_eliminate 0.86% : 0.000003s : 20: predicate.environ_get_depend_swap 1.31% : 0.000005s : 30: predicate.environ_get_eliminate 0.81% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.04% : 0.000004s : 23: predicate.exchange_switch_depend_value 1.61% : 0.000006s : 23: predicate.float_depend_g_call 0.36% : 0.000001s : 10: predicate.float_environ_get_switch 0.63% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.13% : 0.000001s : 5: predicate.fold_const_symbol 0.49% : 0.000002s : 10: predicate.get_grad_eliminate 0.16% : 0.000001s : 5: predicate.graph_param_transform 0.46% : 0.000002s : 10: predicate.incorporate_call 0.36% : 0.000001s : 10: predicate.incorporate_call_switch 4.30% : 0.000016s : 68: predicate.inline 0.60% : 0.000002s : 10: predicate.inline_without_move 0.24% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.73% : 0.000003s : 10: predicate.less_batch_normalization 1.31% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 1.67% : 0.000006s : 44: predicate.load_eliminater 0.58% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.38% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.18% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 0.40% : 0.000002s : 10: predicate.merge_addn 0.45% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.41% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.57% : 0.000002s : 15: predicate.minmaximum_grad 1.57% : 0.000006s : 5: predicate.mutable_eliminate 0.30% : 0.000001s : 5: predicate.opt_reshape 0.32% : 0.000001s : 5: predicate.parallel_virtual_node 1.23% : 0.000005s : 23: predicate.partial_defer_inline 1.03% : 0.000004s : 24: predicate.partial_eliminate 0.68% : 0.000003s : 15: predicate.print_const_string_wrapper 0.48% : 0.000002s : 10: predicate.reduce_all_const_elim 0.92% : 0.000004s : 15: predicate.reduce_eliminate 1.70% : 0.000007s : 44: predicate.redundant_stop_gradient_eliminater 0.25% : 0.000001s : 10: predicate.remove_not_recompute_node 0.85% : 0.000003s : 29: predicate.replace_applicator 0.35% : 0.000001s : 10: predicate.replace_old_param 0.25% : 0.000001s : 5: predicate.reset_defer_inline 0.71% : 0.000003s : 15: predicate.reshape_eliminate 0.49% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.28% : 0.000001s : 5: predicate.row_tensor_eliminate 0.60% : 0.000002s : 10: predicate.same_eliminate 0.32% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.60% : 0.000002s : 10: predicate.shard_identity_eliminate 0.56% : 0.000002s : 10: predicate.special_op_eliminate 0.60% : 0.000002s : 10: predicate.specialize_transform 0.72% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.64% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.26% : 0.000001s : 5: predicate.switch_call_monad_eliminater 0.95% : 0.000004s : 23: predicate.switch_defer_inline 1.39% : 0.000005s : 33: predicate.switch_layer_defer_inline 3.27% : 0.000013s : 74: predicate.switch_simplify 0.68% : 0.000003s : 15: predicate.tile_eliminate 0.63% : 0.000002s : 15: predicate.transpose_eliminate 1.10% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.19% : 0.000005s : 25: predicate.tuple_list_get_item_const_eliminator 1.33% : 0.000005s : 25: predicate.tuple_list_get_item_depend_reorder 2.25% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.04% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 1.54% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.44% : 0.000006s : 29: predicate.tuple_to_list_eliminator_ 1.70% : 0.000007s : 44: predicate.updatestate_pure_node_eliminater 1.98% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.26% : 0.000001s : 5: predicate.value_based_eliminate 0.47% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.54% : 0.000002s : 10: predicate.virtual_output_eliminate 0.19% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.32% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000669 11 51.69% : 0.000346s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.31% : 0.000323s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.094757 192 0.01% : 0.000006s : 1: ForceFp32Comm 4.39% : 0.004157s : 1: add_attr 4.36% : 0.004134s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.07% : 0.000071s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.09% : 0.000084s : 1: auto_monad 0.04% : 0.000037s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.56% : 0.000526s : 1: bootstrap 0.05% : 0.000045s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.02% : 0.000023s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.04% : 0.000038s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000027s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.04% : 0.000035s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.55% : 0.000526s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 17.85% : 0.016918s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000035s : 1: opt.transform.mutable_eliminate 1.64% : 0.001556s : 78: opt.transform.opt_a 0.04% : 0.000037s : 1: opt.transform.opt_after_cconv 0.16% : 0.000148s : 1: opt.transform.opt_after_jit_grad 0.15% : 0.000145s : 28: opt.transform.opt_b 0.07% : 0.000064s : 2: opt.transform.opt_trans_graph 0.05% : 0.000051s : 4: opt.transform.symbol_engine_opt 4.20% : 0.003981s : 1: opt_a 0.17% : 0.000156s : 1: opt_after_cconv 0.79% : 0.000745s : 1: opt_after_jit_grad 0.39% : 0.000374s : 1: opt_b 25.06% : 0.023746s : 1: optimize 0.03% : 0.000033s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.03% : 0.000032s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.06% : 0.000052s : 1: pre_auto_parallel 0.04% : 0.000041s : 1: py_interpret_to_execute 0.03% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.02% : 0.000024s : 1: remove_dup_value 0.59% : 0.000556s : 1: renormalize.infer 0.46% : 0.000439s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000055s : 1: rewriter_after_opt_a 0.12% : 0.000109s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.000119s : 1: symbol_engine_optimizer 0.12% : 0.000110s : 1: tuple_transform 37.05% : 0.035103s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:58.899.44 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.031157, [21] [bootstrap]: 0.00051351 [type_inference]: 0.00753871 [event_method]: 2.179e-05 [auto_monad]: 6.807e-05 [graph_reusing]: 6.29999e-06 [inline]: 2.54999e-06 [add_attr]: 0.0039202, [1] [add_attr_with_inline]: 0.00390621, [1] [Cycle 1]: 7.966e-05, [2] [tag_attr]: 2.66e-05 [meta_addattr_fg_expand]: 6.19001e-06 [parallel-infer-symbol]: 4.11001e-06 [pre_auto_parallel]: 4.111e-05 [insert-virtual-dataset]: 2.61e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 2.11e-06 [pipeline_split]: 1.81e-06 [optimize]: 0.0182644, [53] [py_interpret_to_execute]: 3.122e-05 [rewriter_before_opt_a]: 0.00010025 [opt_a]: 0.0157195, [2] [Cycle 1]: 0.00265153, [45] [expand_dump_flag]: 3.05002e-06 [switch_simplify]: 4.489e-05 [loop_unroll]: 3.154e-05 [a_1]: 0.00073364 [with_stream_mark]: 2.137e-05 [recompute_prepare]: 1.121e-05 [updatestate_depend_eliminate]: 5.23002e-06 [updatestate_assign_eliminate]: 4.07e-06 [updatestate_loads_eliminate]: 4.24002e-06 [parameter_eliminate]: 2.22999e-06 [a_2]: 0.00010558 [accelerated_algorithm]: 9.17001e-06 [shard]: 2.51e-06 [meta_shard_fg_expand]: 3.23e-06 [shard_inline]: 8.13001e-06 [merge_send_recv]: 1.029e-05 [auto_parallel]: 8.64e-06 [parallel]: 2.123e-05 [flash_sp]: 1.09e-05 [merge_comm]: 5.02e-06 [allreduce_fusion]: 4.32e-06 [matmul_add_comm_reduction]: 1.194e-05 [allreduce_slice_to_reducescatter]: 6.79982e-07 [virtual_shard_identity]: 1.086e-05 [virtual_dataset]: 8.00999e-06 [get_grad_eliminate_]: 7.99997e-06 [virtual_output]: 8.60001e-06 [merge_forward]: 4.89e-06 [cell_reuse_recompute_pass]: 1.62001e-06 [offload_activation]: 1.247e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.862e-05 [merge_recompute_call_nodes]: 2.06998e-06 [before_grad]: 1.406e-05 [set_forward_comm_id_for_comm_node_pass]: 4.35e-06 [meta_fg_expand]: 3.9e-06 [flash_sp_send_recv_attached]: 2.99001e-06 [receive_attached]: 2.42001e-06 [after_resolve]: 1.457e-05 [a_after_grad]: 1.296e-05 [renormalize]: 0.00100893 [add_forward_monad_depend]: 9.54e-06 [auto_monad_grad]: 3.2e-06 [auto_monad_eliminator]: 2.431e-05 [cse]: 4.254e-05 [a_3]: 7.274e-05 [Cycle 2]: 0.0130531, [45] [expand_dump_flag]: 2.53e-06 [switch_simplify]: 1.059e-05 [loop_unroll]: 8.42e-06 [a_1]: 0.00019422 [with_stream_mark]: 1.996e-05 [recompute_prepare]: 7.74002e-06 [updatestate_depend_eliminate]: 4.39002e-06 [updatestate_assign_eliminate]: 3.65998e-06 [updatestate_loads_eliminate]: 3.56999e-06 [parameter_eliminate]: 1.66002e-06 [a_2]: 9.642e-05 [accelerated_algorithm]: 7.59002e-06 [shard]: 2.37999e-06 [meta_shard_fg_expand]: 2.71e-06 [shard_inline]: 7.44002e-06 [merge_send_recv]: 1.042e-05 [auto_parallel]: 1.017e-05 [parallel]: 7.8e-06 [flash_sp]: 4.52e-06 [merge_comm]: 4.80999e-06 [allreduce_fusion]: 3.97998e-06 [matmul_add_comm_reduction]: 1.132e-05 [allreduce_slice_to_reducescatter]: 8.10018e-07 [virtual_shard_identity]: 9.39e-06 [virtual_dataset]: 7.26999e-06 [get_grad_eliminate_]: 8.03001e-06 [virtual_output]: 7.5e-06 [merge_forward]: 4.18001e-06 [cell_reuse_recompute_pass]: 3.4e-06 [offload_activation]: 1.293e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.813e-05 [merge_recompute_call_nodes]: 1.94e-06 [before_grad]: 1.345e-05 [set_forward_comm_id_for_comm_node_pass]: 5.43002e-06 [meta_fg_expand]: 3.5e-06 [flash_sp_send_recv_attached]: 1.60001e-06 [receive_attached]: 2.17999e-06 [after_resolve]: 1.372e-05 [a_after_grad]: 1.228e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.58e-06 [auto_monad_grad]: 1.77001e-06 [auto_monad_eliminator]: 4.457e-05 [cse]: 4.493e-05 [a_3]: 7.233e-05 [py_interpret_to_execute_after_opt_a]: 2.338e-05 [slice_cell_reuse_recomputed_activation]: 3.46999e-06 [rewriter_after_opt_a]: 6.605e-05 [convert_after_rewriter]: 8.52e-06 [order_py_execute_after_rewriter]: 6.59001e-06 [mutable_eliminate]: 0.00075488 [opt_b]: 0.00026937, [1] [Cycle 1]: 0.0002605, [7] [b_1]: 0.00016556 [b_2]: 1.004e-05 [updatestate_depend_eliminate]: 9.68002e-06 [updatestate_assign_eliminate]: 3.58999e-06 [updatestate_loads_eliminate]: 3.73001e-06 [renormalize]: 7.30011e-07 [cse]: 2.921e-05 [optimize_parallel_all_gather_comm]: 2.28e-05 [overlap_param_gather]: 2.17999e-06 [cconv]: 3.539e-05 [loop_unroll]: 0.00045899 [opt_after_cconv]: 0.00012212, [1] [Cycle 1]: 0.00011544, [7] [c_1]: 3.686e-05 [parameter_eliminate]: 5.00999e-06 [updatestate_depend_eliminate]: 6.33998e-06 [updatestate_assign_eliminate]: 3.12002e-06 [updatestate_loads_eliminate]: 3.09999e-06 [cse]: 2.544e-05 [renormalize]: 5.19998e-07 [remove_dup_value]: 1.855e-05 [tuple_transform]: 9.079e-05, [1] [Cycle 1]: 8.596e-05, [4] [d_1]: 5.608e-05 [none_parameter_eliminate]: 1.78002e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 8.46002e-06 [partial_unused_args_eliminate]: 1.76e-06 [add_recomputation]: 6.552e-05 [cse_after_recomputation]: 2.731e-05, [1] [Cycle 1]: 2.213e-05, [1] [cse]: 1.604e-05 [environ_conv]: 7.44002e-06 [swap_dp_allreduce_reducescatter]: 6.19001e-06 [bias_add_comm_swap]: 3.58e-06 [label_micro_interleaved_index]: 4.52998e-06 [label_fine_grained_interleaved_index]: 3.25e-06 [merge_cast_opt]: 1.52999e-06 [slice_recompute_activation]: 2.37999e-06 [micro_interleaved_order_control]: 2.57001e-06 [assign_add_opt]: 1.47999e-06 [ForceFp32Comm]: 8.99978e-07 [remove_cast_before_assign_add]: 1.07998e-06 [full_micro_interleaved_order_control]: 1.96998e-06 [reorder_send_recv_between_fp_bp]: 2.61e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 1.04003e-06 [interleave_split_concat_branches]: 1.36002e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.66e-06 [overlap_opt_shard_grad_in_pipeline]: 1.96e-06 [control_data_broadcast_order]: 1.617e-05 [grouped_pairwise_exchange_alltoall]: 1.52999e-06 [offloading_packed_experts]: 4.48001e-06 [overlap_recompute_and_grad_model_parallel]: 5.45001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.16997e-06 [overlap_recompute_allgather_and_fa_grad]: 1.52001e-06 [overlap_recompute_comm]: 2.37999e-06 [overlap_grad_ring_attention]: 4.75999e-06 [overlap_grad_flash_sp]: 2.48e-05 [begin_end_overlap_inline]: 4.69998e-07 [split_matmul_comm_elemetwise]: 2.24001e-06 [split_layernorm_comm]: 1.82001e-06 [handle_group_info]: 9.70002e-07 [symbol_engine_optimizer]: 9.022e-05, [1] [Cycle 1]: 8.493e-05, [6] [build]: 4.18001e-06 [elim_shapecalc]: 1.289e-05 [elim_not_effective]: 1.583e-05 [opt_reshape]: 9.79999e-06 [fold_const_symbol]: 1.254e-05 [renormalize]: 7.59988e-07 [detach_backward]: 2.02001e-06 [pipeline_parallel_scheduler]: 1.68002e-06 [auto_monad_reorder]: 2.075e-05 [get_jit_bprop_graph]: 2.11e-06 [rewriter_after_jit_bprop_graph]: 5.47001e-06 [opt_after_jit_grad]: 0.00051183 [validate]: 5.119e-05 Sums bootstrap : 0.000514s : 3.65% type_inference : 0.007539s : 53.61% event_method : 0.000022s : 0.15% auto_monad : 0.000068s : 0.48% graph_reusing : 0.000006s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000041s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.22% optimize.rewriter_before_opt_a : 0.000100s : 0.71% optimize.opt_a.expand_dump_flag : 0.000006s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.39% optimize.opt_a.loop_unroll : 0.000040s : 0.28% optimize.opt_a.a_1 : 0.000928s : 6.60% optimize.opt_a.with_stream_mark : 0.000041s : 0.29% optimize.opt_a.recompute_prepare : 0.000019s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000202s : 1.44% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.12% optimize.opt_a.shard : 0.000005s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.11% optimize.opt_a.merge_send_recv : 0.000021s : 0.15% optimize.opt_a.auto_parallel : 0.000019s : 0.13% optimize.opt_a.parallel : 0.000029s : 0.21% optimize.opt_a.flash_sp : 0.000015s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.07% optimize.opt_a.allreduce_fusion : 0.000008s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.14% optimize.opt_a.virtual_dataset : 0.000015s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.11% optimize.opt_a.virtual_output : 0.000016s : 0.11% optimize.opt_a.merge_forward : 0.000009s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.04% optimize.opt_a.offload_activation : 0.000025s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.03% optimize.opt_a.before_grad : 0.000028s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.07% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.03% optimize.opt_a.receive_attached : 0.000005s : 0.03% optimize.opt_a.after_resolve : 0.000028s : 0.20% optimize.opt_a.a_after_grad : 0.000025s : 0.18% optimize.opt_a.renormalize : 0.001009s : 7.18% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.09% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000069s : 0.49% optimize.opt_a.cse : 0.000087s : 0.62% optimize.opt_a.a_3 : 0.000145s : 1.03% optimize.py_interpret_to_execute_after_opt_a : 0.000023s : 0.17% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.02% optimize.rewriter_after_opt_a : 0.000066s : 0.47% optimize.convert_after_rewriter : 0.000009s : 0.06% optimize.order_py_execute_after_rewriter : 0.000007s : 0.05% optimize.mutable_eliminate : 0.000755s : 5.37% optimize.opt_b.b_1 : 0.000166s : 1.18% optimize.opt_b.b_2 : 0.000010s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000029s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000035s : 0.25% optimize.loop_unroll : 0.000459s : 3.26% optimize.opt_after_cconv.c_1 : 0.000037s : 0.26% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000025s : 0.18% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.13% optimize.tuple_transform.d_1 : 0.000056s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000066s : 0.47% optimize.cse_after_recomputation.cse : 0.000016s : 0.11% optimize.environ_conv : 0.000007s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.04% optimize.bias_add_comm_swap : 0.000004s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.03% optimize.overlap_grad_flash_sp : 0.000025s : 0.18% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.01% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.15% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000512s : 3.64% validate : 0.000051s : 0.36% Time group info: ------[substitution.] 0.000251 38 12.28% : 0.000031s : 3: substitution.cast_eliminate 0.85% : 0.000002s : 3: substitution.elim_not_effective 0.67% : 0.000002s : 3: substitution.fold_const_symbol 2.83% : 0.000007s : 5: substitution.graph_param_transform 70.05% : 0.000176s : 4: substitution.inline 2.26% : 0.000006s : 6: substitution.j_node_and_user_rematch 3.08% : 0.000008s : 6: substitution.remove_not_recompute_node 2.50% : 0.000006s : 4: substitution.replace_old_param 5.48% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007456 2 88.17% : 0.006574s : 1: type_inference.infer 11.83% : 0.000882s : 1: type_inference.specialize ------[replace.] 0.000069 8 62.55% : 0.000043s : 4: replace.inline 37.45% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000185 8 93.58% : 0.000173s : 4: match.inline 6.42% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000261 1504 0.98% : 0.000003s : 15: predicate.accumulaten_eliminater 0.88% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.57% : 0.000001s : 10: predicate.addn_check_dump 1.02% : 0.000003s : 15: predicate.addn_zero_filter 0.91% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.18% : 0.000006s : 25: predicate.arithmetic_simplify 1.14% : 0.000003s : 15: predicate.cast_eliminate 0.74% : 0.000002s : 10: predicate.check_bprop_eliminate 0.59% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.67% : 0.000002s : 10: predicate.depend_value_elim 0.92% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.19% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.98% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.52% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.29% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.06% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.03% : 0.000003s : 20: predicate.environ_get_depend_swap 1.72% : 0.000004s : 30: predicate.environ_get_eliminate 1.12% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.11% : 0.000006s : 23: predicate.float_depend_g_call 0.57% : 0.000001s : 10: predicate.float_environ_get_switch 0.87% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 5: predicate.fold_const_symbol 0.77% : 0.000002s : 10: predicate.get_grad_eliminate 0.27% : 0.000001s : 5: predicate.graph_param_transform 0.61% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 5.72% : 0.000015s : 68: predicate.inline 0.88% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.03% : 0.000003s : 10: predicate.less_batch_normalization 1.77% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.36% : 0.000006s : 44: predicate.load_eliminater 0.78% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.02% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.60% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.61% : 0.000002s : 10: predicate.merge_addn 0.64% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.75% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 15: predicate.minmaximum_grad 1.07% : 0.000003s : 5: predicate.mutable_eliminate 0.49% : 0.000001s : 5: predicate.opt_reshape 0.48% : 0.000001s : 5: predicate.parallel_virtual_node 1.59% : 0.000004s : 23: predicate.partial_defer_inline 1.51% : 0.000004s : 24: predicate.partial_eliminate 1.06% : 0.000003s : 15: predicate.print_const_string_wrapper 0.59% : 0.000002s : 10: predicate.reduce_all_const_elim 1.18% : 0.000003s : 15: predicate.reduce_eliminate 2.51% : 0.000007s : 44: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 10: predicate.remove_not_recompute_node 1.41% : 0.000004s : 29: predicate.replace_applicator 0.46% : 0.000001s : 10: predicate.replace_old_param 0.32% : 0.000001s : 5: predicate.reset_defer_inline 1.12% : 0.000003s : 15: predicate.reshape_eliminate 0.62% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 5: predicate.row_tensor_eliminate 1.05% : 0.000003s : 10: predicate.same_eliminate 0.43% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 10: predicate.shard_identity_eliminate 0.70% : 0.000002s : 10: predicate.special_op_eliminate 0.72% : 0.000002s : 10: predicate.specialize_transform 1.48% : 0.000004s : 10: predicate.split_environ_get_set_with_tuple_value 0.92% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.38% : 0.000004s : 23: predicate.switch_defer_inline 2.01% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.58% : 0.000012s : 74: predicate.switch_simplify 1.12% : 0.000003s : 15: predicate.tile_eliminate 1.01% : 0.000003s : 15: predicate.transpose_eliminate 1.60% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.14% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.49% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.37% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.82% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.26% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 2.95% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 5: predicate.value_based_eliminate 0.69% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.72% : 0.000002s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.34% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000684 11 52.14% : 0.000357s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.86% : 0.000328s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.056035 192 0.01% : 0.000004s : 1: ForceFp32Comm 7.01% : 0.003926s : 1: add_attr 6.98% : 0.003911s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.13% : 0.000071s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.13% : 0.000075s : 1: auto_monad 0.04% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.98% : 0.000549s : 1: bootstrap 0.07% : 0.000039s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000019s : 1: control_data_broadcast_order 0.02% : 0.000012s : 1: convert_after_rewriter 0.05% : 0.000030s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.05% : 0.000029s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.83% : 0.000468s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.36% : 0.000764s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000023s : 1: opt.transform.mutable_eliminate 2.65% : 0.001487s : 78: opt.transform.opt_a 0.06% : 0.000036s : 1: opt.transform.opt_after_cconv 0.06% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.25% : 0.000139s : 28: opt.transform.opt_b 0.11% : 0.000062s : 2: opt.transform.opt_trans_graph 0.08% : 0.000047s : 4: opt.transform.symbol_engine_opt 28.06% : 0.015723s : 1: opt_a 0.22% : 0.000126s : 1: opt_after_cconv 0.93% : 0.000522s : 1: opt_after_jit_grad 0.49% : 0.000273s : 1: opt_b 32.61% : 0.018270s : 1: optimize 0.05% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000010s : 1: order_py_execute_after_rewriter 0.05% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000046s : 1: pre_auto_parallel 0.06% : 0.000035s : 1: py_interpret_to_execute 0.05% : 0.000027s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000022s : 1: remove_dup_value 0.98% : 0.000548s : 1: renormalize.infer 0.80% : 0.000450s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.13% : 0.000070s : 1: rewriter_after_opt_a 0.19% : 0.000104s : 1: rewriter_before_opt_a 0.01% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.17% : 0.000093s : 1: symbol_engine_optimizer 0.17% : 0.000094s : 1: tuple_transform 13.50% : 0.007563s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:58.500.141 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:58.500.451 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0638496, [21] [bootstrap]: 0.0004394 [type_inference]: 0.0355264 [event_method]: 2.149e-05 [auto_monad]: 6.547e-05 [graph_reusing]: 6.99001e-06 [inline]: 2.48e-06 [add_attr]: 0.00359897, [1] [add_attr_with_inline]: 0.00358822, [1] [Cycle 1]: 8.998e-05, [2] [tag_attr]: 2.329e-05 [meta_addattr_fg_expand]: 6.29001e-06 [parallel-infer-symbol]: 3.46999e-06 [pre_auto_parallel]: 3.872e-05 [insert-virtual-dataset]: 2.68e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 2.06e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.0226425, [53] [py_interpret_to_execute]: 3.003e-05 [rewriter_before_opt_a]: 9.755e-05 [opt_a]: 0.00366592, [2] [Cycle 1]: 0.00263502, [45] [expand_dump_flag]: 3.44001e-06 [switch_simplify]: 4.47e-05 [loop_unroll]: 3.155e-05 [a_1]: 0.00070585 [with_stream_mark]: 1.977e-05 [recompute_prepare]: 1.107e-05 [updatestate_depend_eliminate]: 5.23002e-06 [updatestate_assign_eliminate]: 4.52e-06 [updatestate_loads_eliminate]: 3.79002e-06 [parameter_eliminate]: 2.02001e-06 [a_2]: 0.00013268 [accelerated_algorithm]: 8.79e-06 [shard]: 2.22999e-06 [meta_shard_fg_expand]: 2.12001e-06 [shard_inline]: 7.75e-06 [merge_send_recv]: 9.22001e-06 [auto_parallel]: 8.28999e-06 [parallel]: 1.933e-05 [flash_sp]: 9.56e-06 [merge_comm]: 5.37001e-06 [allreduce_fusion]: 4.25e-06 [matmul_add_comm_reduction]: 1.056e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 1.027e-05 [virtual_dataset]: 8.12e-06 [get_grad_eliminate_]: 7.61999e-06 [virtual_output]: 7.65e-06 [merge_forward]: 4.53001e-06 [cell_reuse_recompute_pass]: 1.55999e-06 [offload_activation]: 1.173e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.779e-05 [merge_recompute_call_nodes]: 1.89e-06 [before_grad]: 1.295e-05 [set_forward_comm_id_for_comm_node_pass]: 4.29997e-06 [meta_fg_expand]: 3.32002e-06 [flash_sp_send_recv_attached]: 2.37999e-06 [receive_attached]: 2.36e-06 [after_resolve]: 1.355e-05 [a_after_grad]: 1.23e-05 [renormalize]: 0.00091165 [add_forward_monad_depend]: 7.63001e-06 [auto_monad_grad]: 2.64999e-06 [auto_monad_eliminator]: 1.971e-05 [cse]: 3.734e-05 [a_3]: 8.043e-05 [Cycle 2]: 0.00101544, [45] [expand_dump_flag]: 2.19001e-06 [switch_simplify]: 1.024e-05 [loop_unroll]: 7.77e-06 [a_1]: 0.000184 [with_stream_mark]: 1.932e-05 [recompute_prepare]: 7.98999e-06 [updatestate_depend_eliminate]: 4.49002e-06 [updatestate_assign_eliminate]: 3.85e-06 [updatestate_loads_eliminate]: 3.52002e-06 [parameter_eliminate]: 1.76998e-06 [a_2]: 0.00012012 [accelerated_algorithm]: 7.82e-06 [shard]: 2.22001e-06 [meta_shard_fg_expand]: 3.14999e-06 [shard_inline]: 7.98001e-06 [merge_send_recv]: 7.30003e-06 [auto_parallel]: 9.57999e-06 [parallel]: 8.40001e-06 [flash_sp]: 3.65998e-06 [merge_comm]: 4.90001e-06 [allreduce_fusion]: 4.48999e-06 [matmul_add_comm_reduction]: 9.47999e-06 [allreduce_slice_to_reducescatter]: 1.15001e-06 [virtual_shard_identity]: 9.36e-06 [virtual_dataset]: 7.80998e-06 [get_grad_eliminate_]: 7.57002e-06 [virtual_output]: 7.45998e-06 [merge_forward]: 5.10001e-06 [cell_reuse_recompute_pass]: 2.02999e-06 [offload_activation]: 1.122e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.805e-05 [merge_recompute_call_nodes]: 1.98002e-06 [before_grad]: 1.278e-05 [set_forward_comm_id_for_comm_node_pass]: 4.73001e-06 [meta_fg_expand]: 2.94999e-06 [flash_sp_send_recv_attached]: 1.35001e-06 [receive_attached]: 1.99999e-06 [after_resolve]: 1.345e-05 [a_after_grad]: 1.234e-05 [renormalize]: 1.50001e-07 [add_forward_monad_depend]: 2.59001e-06 [auto_monad_grad]: 2.27999e-06 [auto_monad_eliminator]: 1.198e-05 [cse]: 2.252e-05 [a_3]: 6.192e-05 [py_interpret_to_execute_after_opt_a]: 2.137e-05 [slice_cell_reuse_recomputed_activation]: 5.51998e-06 [rewriter_after_opt_a]: 5.386e-05 [convert_after_rewriter]: 1.208e-05 [order_py_execute_after_rewriter]: 8.49998e-06 [mutable_eliminate]: 0.00071135 [opt_b]: 0.0003332, [1] [Cycle 1]: 0.00032243, [7] [b_1]: 0.00020425 [b_2]: 9.57001e-06 [updatestate_depend_eliminate]: 9.84001e-06 [updatestate_assign_eliminate]: 3.51001e-06 [updatestate_loads_eliminate]: 3.76001e-06 [renormalize]: 1.15001e-06 [cse]: 3.132e-05 [optimize_parallel_all_gather_comm]: 2.495e-05 [overlap_param_gather]: 5.30001e-06 [cconv]: 3.736e-05 [loop_unroll]: 0.00051218 [opt_after_cconv]: 0.00015584, [1] [Cycle 1]: 0.00014531, [7] [c_1]: 3.902e-05 [parameter_eliminate]: 5.29e-06 [updatestate_depend_eliminate]: 9.44998e-06 [updatestate_assign_eliminate]: 3.48e-06 [updatestate_loads_eliminate]: 2.93998e-06 [cse]: 2.658e-05 [renormalize]: 6.29982e-07 [remove_dup_value]: 1.966e-05 [tuple_transform]: 0.00010852, [1] [Cycle 1]: 0.00010036, [4] [d_1]: 5.699e-05 [none_parameter_eliminate]: 1.89e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 9.32001e-06 [partial_unused_args_eliminate]: 4.76002e-06 [add_recomputation]: 6.678e-05 [cse_after_recomputation]: 0.00013596, [1] [Cycle 1]: 0.00011251, [1] [cse]: 6.56e-05 [environ_conv]: 1.352e-05 [swap_dp_allreduce_reducescatter]: 1.538e-05 [bias_add_comm_swap]: 6.57002e-06 [label_micro_interleaved_index]: 1.068e-05 [label_fine_grained_interleaved_index]: 5.34e-06 [merge_cast_opt]: 4.10998e-06 [slice_recompute_activation]: 4.97e-06 [micro_interleaved_order_control]: 4.82e-06 [assign_add_opt]: 3.90998e-06 [ForceFp32Comm]: 3.51001e-06 [remove_cast_before_assign_add]: 3.5e-06 [full_micro_interleaved_order_control]: 4.91002e-06 [reorder_send_recv_between_fp_bp]: 5.62999e-06 [comm_op_add_attrs]: 3.61999e-06 [add_comm_op_reuse_tag]: 3.38999e-06 [interleave_split_concat_branches]: 3.45003e-06 [interleave_parallel_branches]: 3.68e-06 [overlap_opt_shard_in_pipeline]: 3.86999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.18001e-06 [control_data_broadcast_order]: 2.413e-05 [grouped_pairwise_exchange_alltoall]: 3.99002e-06 [offloading_packed_experts]: 8.65999e-06 [overlap_recompute_and_grad_model_parallel]: 8.37e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.58999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.79002e-06 [overlap_recompute_comm]: 4.90999e-06 [overlap_grad_ring_attention]: 8.21002e-06 [overlap_grad_flash_sp]: 3.058e-05 [begin_end_overlap_inline]: 3.01001e-06 [split_matmul_comm_elemetwise]: 4.49998e-06 [split_layernorm_comm]: 4.38999e-06 [handle_group_info]: 3.18998e-06 [symbol_engine_optimizer]: 0.00013815, [1] [Cycle 1]: 0.00013014, [6] [build]: 5.12999e-06 [elim_shapecalc]: 2.645e-05 [elim_not_effective]: 2.472e-05 [opt_reshape]: 9.41e-06 [fold_const_symbol]: 1.313e-05 [renormalize]: 4.69998e-07 [detach_backward]: 4.43999e-06 [pipeline_parallel_scheduler]: 1.71e-06 [auto_monad_reorder]: 2.807e-05 [get_jit_bprop_graph]: 1.90001e-06 [rewriter_after_jit_bprop_graph]: 7.45e-06 [opt_after_jit_grad]: 0.00075498 [validate]: 4.886e-05 Sums bootstrap : 0.000439s : 1.04% type_inference : 0.035526s : 84.03% event_method : 0.000021s : 0.05% auto_monad : 0.000065s : 0.15% graph_reusing : 0.000007s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000039s : 0.09% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000030s : 0.07% optimize.rewriter_before_opt_a : 0.000098s : 0.23% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000055s : 0.13% optimize.opt_a.loop_unroll : 0.000039s : 0.09% optimize.opt_a.a_1 : 0.000890s : 2.10% optimize.opt_a.with_stream_mark : 0.000039s : 0.09% optimize.opt_a.recompute_prepare : 0.000019s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000253s : 0.60% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.04% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.04% optimize.opt_a.merge_send_recv : 0.000017s : 0.04% optimize.opt_a.auto_parallel : 0.000018s : 0.04% optimize.opt_a.parallel : 0.000028s : 0.07% optimize.opt_a.flash_sp : 0.000013s : 0.03% optimize.opt_a.merge_comm : 0.000010s : 0.02% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.05% optimize.opt_a.virtual_dataset : 0.000016s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000015s : 0.04% optimize.opt_a.merge_forward : 0.000010s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000027s : 0.06% optimize.opt_a.a_after_grad : 0.000025s : 0.06% optimize.opt_a.renormalize : 0.000912s : 2.16% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.02% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.07% optimize.opt_a.cse : 0.000060s : 0.14% optimize.opt_a.a_3 : 0.000142s : 0.34% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.01% optimize.rewriter_after_opt_a : 0.000054s : 0.13% optimize.convert_after_rewriter : 0.000012s : 0.03% optimize.order_py_execute_after_rewriter : 0.000008s : 0.02% optimize.mutable_eliminate : 0.000711s : 1.68% optimize.opt_b.b_1 : 0.000204s : 0.48% optimize.opt_b.b_2 : 0.000010s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000031s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.06% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000037s : 0.09% optimize.loop_unroll : 0.000512s : 1.21% optimize.opt_after_cconv.c_1 : 0.000039s : 0.09% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000027s : 0.06% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.05% optimize.tuple_transform.d_1 : 0.000057s : 0.13% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.02% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000067s : 0.16% optimize.cse_after_recomputation.cse : 0.000066s : 0.16% optimize.environ_conv : 0.000014s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000015s : 0.04% optimize.bias_add_comm_swap : 0.000007s : 0.02% optimize.label_micro_interleaved_index : 0.000011s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000003s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000024s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000009s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000008s : 0.02% optimize.overlap_grad_flash_sp : 0.000031s : 0.07% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000026s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000025s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000028s : 0.07% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.02% opt_after_jit_grad : 0.000755s : 1.79% validate : 0.000049s : 0.12% Time group info: ------[substitution.] 0.000237 38 12.85% : 0.000030s : 3: substitution.cast_eliminate 1.24% : 0.000003s : 3: substitution.elim_not_effective 0.75% : 0.000002s : 3: substitution.fold_const_symbol 2.96% : 0.000007s : 5: substitution.graph_param_transform 69.60% : 0.000165s : 4: substitution.inline 2.16% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.72% : 0.000006s : 6: substitution.remove_not_recompute_node 2.34% : 0.000006s : 4: substitution.replace_old_param 5.38% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.035466 2 97.84% : 0.034699s : 1: type_inference.infer 2.16% : 0.000767s : 1: type_inference.specialize ------[replace.] 0.000064 8 64.04% : 0.000041s : 4: replace.inline 35.96% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000173 8 93.66% : 0.000162s : 4: match.inline 6.34% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000251 1504 0.90% : 0.000002s : 15: predicate.accumulaten_eliminater 0.87% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.64% : 0.000002s : 10: predicate.addn_check_dump 0.95% : 0.000002s : 15: predicate.addn_zero_filter 0.76% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.06% : 0.000005s : 25: predicate.arithmetic_simplify 0.98% : 0.000002s : 15: predicate.cast_eliminate 0.66% : 0.000002s : 10: predicate.check_bprop_eliminate 0.62% : 0.000002s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.71% : 0.000002s : 10: predicate.depend_value_elim 0.89% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.07% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.47% : 0.000001s : 5: predicate.elim_not_effective 0.65% : 0.000002s : 5: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.03% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.04% : 0.000003s : 20: predicate.environ_get_depend_swap 1.66% : 0.000004s : 30: predicate.environ_get_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.28% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.17% : 0.000005s : 23: predicate.float_depend_g_call 0.55% : 0.000001s : 10: predicate.float_environ_get_switch 0.80% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.63% : 0.000002s : 10: predicate.get_grad_eliminate 0.28% : 0.000001s : 5: predicate.graph_param_transform 0.65% : 0.000002s : 10: predicate.incorporate_call 0.54% : 0.000001s : 10: predicate.incorporate_call_switch 6.24% : 0.000016s : 68: predicate.inline 0.77% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.87% : 0.000002s : 10: predicate.less_batch_normalization 1.91% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.52% : 0.000006s : 44: predicate.load_eliminater 1.04% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.07% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.73% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.65% : 0.000002s : 10: predicate.merge_addn 0.58% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.75% : 0.000002s : 15: predicate.minmaximum_grad 1.22% : 0.000003s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.44% : 0.000001s : 5: predicate.parallel_virtual_node 1.68% : 0.000004s : 23: predicate.partial_defer_inline 1.60% : 0.000004s : 24: predicate.partial_eliminate 0.84% : 0.000002s : 15: predicate.print_const_string_wrapper 0.61% : 0.000002s : 10: predicate.reduce_all_const_elim 1.19% : 0.000003s : 15: predicate.reduce_eliminate 2.39% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 10: predicate.remove_not_recompute_node 1.49% : 0.000004s : 29: predicate.replace_applicator 0.39% : 0.000001s : 10: predicate.replace_old_param 0.38% : 0.000001s : 5: predicate.reset_defer_inline 1.03% : 0.000003s : 15: predicate.reshape_eliminate 0.65% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 5: predicate.row_tensor_eliminate 1.02% : 0.000003s : 10: predicate.same_eliminate 0.51% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.73% : 0.000002s : 10: predicate.shard_identity_eliminate 0.77% : 0.000002s : 10: predicate.special_op_eliminate 0.76% : 0.000002s : 10: predicate.specialize_transform 1.03% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.38% : 0.000003s : 23: predicate.switch_defer_inline 1.95% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.94% : 0.000012s : 74: predicate.switch_simplify 0.83% : 0.000002s : 15: predicate.tile_eliminate 0.89% : 0.000002s : 15: predicate.transpose_eliminate 1.38% : 0.000003s : 25: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.44% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.51% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.91% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.50% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.17% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 5: predicate.value_based_eliminate 0.65% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000699 11 59.38% : 0.000415s : 5: func_graph_cloner_run.FuncGraphClonerGraph 40.62% : 0.000284s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.076625 192 0.01% : 0.000006s : 1: ForceFp32Comm 4.71% : 0.003610s : 1: add_attr 4.69% : 0.003592s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.09% : 0.000072s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.10% : 0.000077s : 1: auto_monad 0.05% : 0.000036s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.64% : 0.000487s : 1: bootstrap 0.05% : 0.000041s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.04% : 0.000028s : 1: control_data_broadcast_order 0.02% : 0.000016s : 1: convert_after_rewriter 0.19% : 0.000145s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000021s : 1: detach_backward 0.02% : 0.000016s : 1: environ_conv 0.04% : 0.000033s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000014s : 1: label_micro_interleaved_index 0.68% : 0.000520s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.94% : 0.000720s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.03% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000023s : 1: opt.transform.mutable_eliminate 1.87% : 0.001429s : 78: opt.transform.opt_a 0.05% : 0.000037s : 1: opt.transform.opt_after_cconv 0.04% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.18% : 0.000138s : 28: opt.transform.opt_b 0.08% : 0.000064s : 2: opt.transform.opt_trans_graph 0.09% : 0.000067s : 4: opt.transform.symbol_engine_opt 4.79% : 0.003670s : 1: opt_a 0.21% : 0.000159s : 1: opt_after_cconv 1.00% : 0.000766s : 1: opt_after_jit_grad 0.44% : 0.000337s : 1: opt_b 30.04% : 0.023018s : 1: optimize 0.04% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.04% : 0.000034s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.06% : 0.000046s : 1: pre_auto_parallel 0.04% : 0.000034s : 1: py_interpret_to_execute 0.03% : 0.000026s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.03% : 0.000023s : 1: remove_dup_value 0.67% : 0.000516s : 1: renormalize.infer 0.50% : 0.000386s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000058s : 1: rewriter_after_opt_a 0.13% : 0.000101s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000018s : 1: swap_dp_allreduce_reducescatter 0.18% : 0.000141s : 1: symbol_engine_optimizer 0.15% : 0.000111s : 1: tuple_transform 46.43% : 0.035579s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:59.303.80 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0511695, [21] [bootstrap]: 0.00042503 [type_inference]: 0.0228339 [event_method]: 2.343e-05 [auto_monad]: 7.609e-05 [graph_reusing]: 6.40002e-06 [inline]: 3.31001e-06 [add_attr]: 0.00414942, [1] [add_attr_with_inline]: 0.00413477, [1] [Cycle 1]: 8.34e-05, [2] [tag_attr]: 2.75e-05 [meta_addattr_fg_expand]: 6.57002e-06 [parallel-infer-symbol]: 3.56001e-06 [pre_auto_parallel]: 4.299e-05 [insert-virtual-dataset]: 2.59001e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.26998e-06 [pipeline_split]: 1.72999e-06 [optimize]: 0.0227962, [53] [py_interpret_to_execute]: 3.463e-05 [rewriter_before_opt_a]: 0.00010319 [opt_a]: 0.00366622, [2] [Cycle 1]: 0.00273314, [45] [expand_dump_flag]: 3.11999e-06 [switch_simplify]: 4.497e-05 [loop_unroll]: 3.262e-05 [a_1]: 0.00075039 [with_stream_mark]: 2.446e-05 [recompute_prepare]: 1.321e-05 [updatestate_depend_eliminate]: 5.20001e-06 [updatestate_assign_eliminate]: 4.23001e-06 [updatestate_loads_eliminate]: 3.69002e-06 [parameter_eliminate]: 2.72001e-06 [a_2]: 0.000106 [accelerated_algorithm]: 9.72001e-06 [shard]: 2.34999e-06 [meta_shard_fg_expand]: 2.34001e-06 [shard_inline]: 8.94e-06 [merge_send_recv]: 1.244e-05 [auto_parallel]: 1.118e-05 [parallel]: 2.228e-05 [flash_sp]: 1.197e-05 [merge_comm]: 5.44e-06 [allreduce_fusion]: 4.53999e-06 [matmul_add_comm_reduction]: 1.141e-05 [allreduce_slice_to_reducescatter]: 7.80012e-07 [virtual_shard_identity]: 1.387e-05 [virtual_dataset]: 9.27001e-06 [get_grad_eliminate_]: 7.9e-06 [virtual_output]: 8.73001e-06 [merge_forward]: 5.26998e-06 [cell_reuse_recompute_pass]: 1.54e-06 [offload_activation]: 1.235e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.053e-05 [merge_recompute_call_nodes]: 2.36e-06 [before_grad]: 1.439e-05 [set_forward_comm_id_for_comm_node_pass]: 5.19998e-06 [meta_fg_expand]: 3.93001e-06 [flash_sp_send_recv_attached]: 3.16999e-06 [receive_attached]: 2.07001e-06 [after_resolve]: 1.478e-05 [a_after_grad]: 1.354e-05 [renormalize]: 0.00102126 [add_forward_monad_depend]: 1.058e-05 [auto_monad_grad]: 2.77002e-06 [auto_monad_eliminator]: 2.32e-05 [cse]: 4.124e-05 [a_3]: 6.864e-05 [Cycle 2]: 0.00091902, [45] [expand_dump_flag]: 2.31e-06 [switch_simplify]: 1.055e-05 [loop_unroll]: 7.70998e-06 [a_1]: 0.00019854 [with_stream_mark]: 2.245e-05 [recompute_prepare]: 9.10001e-06 [updatestate_depend_eliminate]: 5.39e-06 [updatestate_assign_eliminate]: 4.07e-06 [updatestate_loads_eliminate]: 3.65e-06 [parameter_eliminate]: 1.97999e-06 [a_2]: 0.0001011 [accelerated_algorithm]: 8.98002e-06 [shard]: 2.83e-06 [meta_shard_fg_expand]: 2.89001e-06 [shard_inline]: 7.88001e-06 [merge_send_recv]: 1.065e-05 [auto_parallel]: 9.86e-06 [parallel]: 9.11998e-06 [flash_sp]: 3.95998e-06 [merge_comm]: 4.62e-06 [allreduce_fusion]: 4.12998e-06 [matmul_add_comm_reduction]: 1.285e-05 [allreduce_slice_to_reducescatter]: 7.39994e-07 [virtual_shard_identity]: 9.76998e-06 [virtual_dataset]: 8.32998e-06 [get_grad_eliminate_]: 8.32e-06 [virtual_output]: 7.66001e-06 [merge_forward]: 5.32999e-06 [cell_reuse_recompute_pass]: 4.07e-06 [offload_activation]: 1.193e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.999e-05 [merge_recompute_call_nodes]: 1.82999e-06 [before_grad]: 1.378e-05 [set_forward_comm_id_for_comm_node_pass]: 5.64e-06 [meta_fg_expand]: 3.48999e-06 [flash_sp_send_recv_attached]: 2.27999e-06 [receive_attached]: 2.71999e-06 [after_resolve]: 1.643e-05 [a_after_grad]: 1.298e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.49001e-06 [auto_monad_grad]: 2.22999e-06 [auto_monad_eliminator]: 1.428e-05 [cse]: 2.772e-05 [a_3]: 5.109e-05 [py_interpret_to_execute_after_opt_a]: 2.08e-05 [slice_cell_reuse_recomputed_activation]: 2.63e-06 [rewriter_after_opt_a]: 5.235e-05 [convert_after_rewriter]: 8.67e-06 [order_py_execute_after_rewriter]: 5.76998e-06 [mutable_eliminate]: 0.00084415 [opt_b]: 0.00029519, [1] [Cycle 1]: 0.00028442, [7] [b_1]: 0.00017031 [b_2]: 1.051e-05 [updatestate_depend_eliminate]: 1.276e-05 [updatestate_assign_eliminate]: 3.56001e-06 [updatestate_loads_eliminate]: 4.32998e-06 [renormalize]: 9.50007e-07 [cse]: 4.223e-05 [optimize_parallel_all_gather_comm]: 2.6e-05 [overlap_param_gather]: 2.16998e-06 [cconv]: 3.898e-05 [loop_unroll]: 0.0168459 [opt_after_cconv]: 0.00016812, [1] [Cycle 1]: 0.00015819, [7] [c_1]: 4.267e-05 [parameter_eliminate]: 6.96999e-06 [updatestate_depend_eliminate]: 1.405e-05 [updatestate_assign_eliminate]: 3.91999e-06 [updatestate_loads_eliminate]: 3.58999e-06 [cse]: 4.907e-05 [renormalize]: 8.10018e-07 [remove_dup_value]: 1.832e-05 [tuple_transform]: 9.707e-05, [1] [Cycle 1]: 9.213e-05, [4] [d_1]: 6.235e-05 [none_parameter_eliminate]: 1.72001e-06 [renormalize]: 2.29978e-07 [switch_simplify]: 9.03002e-06 [partial_unused_args_eliminate]: 2.01e-06 [add_recomputation]: 6.911e-05 [cse_after_recomputation]: 2.822e-05, [1] [Cycle 1]: 2.335e-05, [1] [cse]: 1.793e-05 [environ_conv]: 8.52998e-06 [swap_dp_allreduce_reducescatter]: 6.26e-06 [bias_add_comm_swap]: 3.83001e-06 [label_micro_interleaved_index]: 6.99001e-06 [label_fine_grained_interleaved_index]: 2.63e-06 [merge_cast_opt]: 1.35001e-06 [slice_recompute_activation]: 2.12999e-06 [micro_interleaved_order_control]: 2.64001e-06 [assign_add_opt]: 1.49e-06 [ForceFp32Comm]: 8.79983e-07 [remove_cast_before_assign_add]: 1.37e-06 [full_micro_interleaved_order_control]: 2.05002e-06 [reorder_send_recv_between_fp_bp]: 3.07002e-06 [comm_op_add_attrs]: 1.30999e-06 [add_comm_op_reuse_tag]: 9.5999e-07 [interleave_split_concat_branches]: 1.30001e-06 [interleave_parallel_branches]: 1.11997e-06 [overlap_opt_shard_in_pipeline]: 1.45999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.91e-06 [control_data_broadcast_order]: 1.732e-05 [grouped_pairwise_exchange_alltoall]: 1.88002e-06 [offloading_packed_experts]: 4.60001e-06 [overlap_recompute_and_grad_model_parallel]: 5.17e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.63e-06 [overlap_grad_ring_attention]: 4.58001e-06 [overlap_grad_flash_sp]: 2.409e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.73e-06 [split_layernorm_comm]: 1.53002e-06 [handle_group_info]: 1.30999e-06 [symbol_engine_optimizer]: 8.914e-05, [1] [Cycle 1]: 8.428e-05, [6] [build]: 4.31002e-06 [elim_shapecalc]: 1.316e-05 [elim_not_effective]: 1.669e-05 [opt_reshape]: 8.91997e-06 [fold_const_symbol]: 1.22e-05 [renormalize]: 2.00002e-07 [detach_backward]: 2.56e-06 [pipeline_parallel_scheduler]: 1.81003e-06 [auto_monad_reorder]: 2.128e-05 [get_jit_bprop_graph]: 2.44001e-06 [rewriter_after_jit_bprop_graph]: 7.23e-06 [opt_after_jit_grad]: 0.00054615 [validate]: 4.892e-05 Sums bootstrap : 0.000425s : 0.93% type_inference : 0.022834s : 49.78% event_method : 0.000023s : 0.05% auto_monad : 0.000076s : 0.17% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000043s : 0.09% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000035s : 0.08% optimize.rewriter_before_opt_a : 0.000103s : 0.22% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000056s : 0.12% optimize.opt_a.loop_unroll : 0.000040s : 0.09% optimize.opt_a.a_1 : 0.000949s : 2.07% optimize.opt_a.with_stream_mark : 0.000047s : 0.10% optimize.opt_a.recompute_prepare : 0.000022s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000207s : 0.45% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.04% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000017s : 0.04% optimize.opt_a.merge_send_recv : 0.000023s : 0.05% optimize.opt_a.auto_parallel : 0.000021s : 0.05% optimize.opt_a.parallel : 0.000031s : 0.07% optimize.opt_a.flash_sp : 0.000016s : 0.03% optimize.opt_a.merge_comm : 0.000010s : 0.02% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.05% optimize.opt_a.virtual_dataset : 0.000018s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.04% optimize.opt_a.virtual_output : 0.000016s : 0.04% optimize.opt_a.merge_forward : 0.000011s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.01% optimize.opt_a.offload_activation : 0.000024s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000041s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.01% optimize.opt_a.before_grad : 0.000028s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.02% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000031s : 0.07% optimize.opt_a.a_after_grad : 0.000027s : 0.06% optimize.opt_a.renormalize : 0.001021s : 2.23% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.03% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000037s : 0.08% optimize.opt_a.cse : 0.000069s : 0.15% optimize.opt_a.a_3 : 0.000120s : 0.26% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000052s : 0.11% optimize.convert_after_rewriter : 0.000009s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.01% optimize.mutable_eliminate : 0.000844s : 1.84% optimize.opt_b.b_1 : 0.000170s : 0.37% optimize.opt_b.b_2 : 0.000011s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000013s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000042s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000039s : 0.08% optimize.loop_unroll : 0.016846s : 36.72% optimize.opt_after_cconv.c_1 : 0.000043s : 0.09% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000014s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000049s : 0.11% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.04% optimize.tuple_transform.d_1 : 0.000062s : 0.14% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000069s : 0.15% optimize.cse_after_recomputation.cse : 0.000018s : 0.04% optimize.environ_conv : 0.000009s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000004s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000024s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.05% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.02% opt_after_jit_grad : 0.000546s : 1.19% validate : 0.000049s : 0.11% Time group info: ------[substitution.] 0.000260 38 12.70% : 0.000033s : 3: substitution.cast_eliminate 1.25% : 0.000003s : 3: substitution.elim_not_effective 0.60% : 0.000002s : 3: substitution.fold_const_symbol 3.08% : 0.000008s : 5: substitution.graph_param_transform 69.38% : 0.000180s : 4: substitution.inline 2.00% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.20% : 0.000008s : 6: substitution.remove_not_recompute_node 2.67% : 0.000007s : 4: substitution.replace_old_param 5.11% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.022744 2 95.94% : 0.021820s : 1: type_inference.infer 4.06% : 0.000925s : 1: type_inference.specialize ------[replace.] 0.000070 8 60.43% : 0.000042s : 4: replace.inline 39.57% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000189 8 93.96% : 0.000177s : 4: match.inline 6.04% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000276 1504 1.12% : 0.000003s : 15: predicate.accumulaten_eliminater 0.80% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.63% : 0.000002s : 10: predicate.addn_check_dump 1.06% : 0.000003s : 15: predicate.addn_zero_filter 0.84% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.04% : 0.000006s : 25: predicate.arithmetic_simplify 0.99% : 0.000003s : 15: predicate.cast_eliminate 0.55% : 0.000002s : 10: predicate.check_bprop_eliminate 0.59% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.70% : 0.000002s : 10: predicate.depend_value_elim 0.93% : 0.000003s : 15: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.97% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 0.40% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.06% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.06% : 0.000003s : 20: predicate.environ_get_depend_swap 1.51% : 0.000004s : 30: predicate.environ_get_eliminate 1.01% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.23% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.07% : 0.000006s : 23: predicate.float_depend_g_call 0.55% : 0.000002s : 10: predicate.float_environ_get_switch 0.81% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.69% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.07% : 0.000017s : 68: predicate.inline 1.07% : 0.000003s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.90% : 0.000002s : 10: predicate.less_batch_normalization 1.72% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.48% : 0.000007s : 44: predicate.load_eliminater 2.15% : 0.000006s : 5: predicate.loop_unroll_after_grad 2.01% : 0.000006s : 36: predicate.loop_unroll_before_grad 2.08% : 0.000006s : 25: predicate.make_slice_get_slice_eliminator 0.58% : 0.000002s : 10: predicate.merge_addn 0.61% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 15: predicate.minmaximum_grad 1.97% : 0.000005s : 5: predicate.mutable_eliminate 0.42% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.73% : 0.000005s : 23: predicate.partial_defer_inline 1.42% : 0.000004s : 24: predicate.partial_eliminate 0.81% : 0.000002s : 15: predicate.print_const_string_wrapper 0.64% : 0.000002s : 10: predicate.reduce_all_const_elim 1.38% : 0.000004s : 15: predicate.reduce_eliminate 2.32% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 10: predicate.remove_not_recompute_node 1.21% : 0.000003s : 29: predicate.replace_applicator 0.43% : 0.000001s : 10: predicate.replace_old_param 0.54% : 0.000001s : 5: predicate.reset_defer_inline 0.88% : 0.000002s : 15: predicate.reshape_eliminate 0.60% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 5: predicate.row_tensor_eliminate 0.92% : 0.000003s : 10: predicate.same_eliminate 0.48% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 10: predicate.shard_identity_eliminate 0.62% : 0.000002s : 10: predicate.special_op_eliminate 0.66% : 0.000002s : 10: predicate.specialize_transform 1.00% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.97% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.26% : 0.000003s : 23: predicate.switch_defer_inline 1.86% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.48% : 0.000012s : 74: predicate.switch_simplify 0.97% : 0.000003s : 15: predicate.tile_eliminate 1.04% : 0.000003s : 15: predicate.transpose_eliminate 1.59% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.50% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.73% : 0.000005s : 25: predicate.tuple_list_get_item_depend_reorder 3.28% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.58% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.13% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.66% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.19% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 2.88% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.79% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.63% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000666 11 46.26% : 0.000308s : 5: func_graph_cloner_run.FuncGraphClonerGraph 53.74% : 0.000358s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.080892 192 0.00% : 0.000004s : 1: ForceFp32Comm 5.14% : 0.004156s : 1: add_attr 5.12% : 0.004140s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000073s : 1: add_recomputation 0.01% : 0.000005s : 1: assign_add_opt 0.10% : 0.000083s : 1: auto_monad 0.03% : 0.000026s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.56% : 0.000454s : 1: bootstrap 0.05% : 0.000043s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000020s : 1: control_data_broadcast_order 0.02% : 0.000013s : 1: convert_after_rewriter 0.04% : 0.000031s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.04% : 0.000031s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 20.85% : 0.016863s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.06% : 0.000861s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.04% : 0.000034s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000034s : 1: opt.transform.mutable_eliminate 1.88% : 0.001518s : 78: opt.transform.opt_a 0.05% : 0.000041s : 1: opt.transform.opt_after_cconv 0.04% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.17% : 0.000141s : 28: opt.transform.opt_b 0.09% : 0.000069s : 2: opt.transform.opt_trans_graph 0.06% : 0.000047s : 4: opt.transform.symbol_engine_opt 4.54% : 0.003670s : 1: opt_a 0.21% : 0.000173s : 1: opt_after_cconv 0.69% : 0.000556s : 1: opt_after_jit_grad 0.37% : 0.000299s : 1: opt_b 28.19% : 0.022802s : 1: optimize 0.04% : 0.000030s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.03% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.06% : 0.000047s : 1: pre_auto_parallel 0.05% : 0.000039s : 1: py_interpret_to_execute 0.03% : 0.000026s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000005s : 1: remove_cast_before_assign_add 0.03% : 0.000022s : 1: remove_dup_value 0.69% : 0.000560s : 1: renormalize.infer 0.56% : 0.000450s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000059s : 1: rewriter_after_opt_a 0.13% : 0.000108s : 1: rewriter_before_opt_a 0.01% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.11% : 0.000092s : 1: symbol_engine_optimizer 0.12% : 0.000100s : 1: tuple_transform 28.26% : 0.022860s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:59.491.196 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:59.491.482 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0190165, [21] [bootstrap]: 0.00043709 [type_inference]: 0.00641652 [event_method]: 2.134e-05 [auto_monad]: 6.271e-05 [graph_reusing]: 6.51e-06 [inline]: 3.16999e-06 [add_attr]: 0.00399897, [1] [add_attr_with_inline]: 0.00398579, [1] [Cycle 1]: 0.00010183, [2] [tag_attr]: 2.475e-05 [meta_addattr_fg_expand]: 6.17999e-06 [parallel-infer-symbol]: 4.25e-06 [pre_auto_parallel]: 4.333e-05 [insert-virtual-dataset]: 2.69999e-06 [parallel-infer-symbol-second]: 7.79983e-07 [dataset_repeat_opt]: 1.82999e-06 [pipeline_split]: 2.17999e-06 [optimize]: 0.00642814, [53] [py_interpret_to_execute]: 3.945e-05 [rewriter_before_opt_a]: 0.0001422 [opt_a]: 0.00347224, [2] [Cycle 1]: 0.00256185, [45] [expand_dump_flag]: 3.34001e-06 [switch_simplify]: 4.296e-05 [loop_unroll]: 3.103e-05 [a_1]: 0.00067312 [with_stream_mark]: 2.439e-05 [recompute_prepare]: 1.022e-05 [updatestate_depend_eliminate]: 4.33999e-06 [updatestate_assign_eliminate]: 4.06001e-06 [updatestate_loads_eliminate]: 3.16001e-06 [parameter_eliminate]: 1.89999e-06 [a_2]: 0.00011226 [accelerated_algorithm]: 7.28e-06 [shard]: 2.16e-06 [meta_shard_fg_expand]: 2.06e-06 [shard_inline]: 6.55002e-06 [merge_send_recv]: 9.42001e-06 [auto_parallel]: 9.86e-06 [parallel]: 2.186e-05 [flash_sp]: 1.038e-05 [merge_comm]: 3.96001e-06 [allreduce_fusion]: 3.48999e-06 [matmul_add_comm_reduction]: 1.025e-05 [allreduce_slice_to_reducescatter]: 9.10019e-07 [virtual_shard_identity]: 1.012e-05 [virtual_dataset]: 7.36999e-06 [get_grad_eliminate_]: 6.64001e-06 [virtual_output]: 7.27002e-06 [merge_forward]: 5.07e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 1.1e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.748e-05 [merge_recompute_call_nodes]: 1.70001e-06 [before_grad]: 1.119e-05 [set_forward_comm_id_for_comm_node_pass]: 4.28999e-06 [meta_fg_expand]: 3.36999e-06 [flash_sp_send_recv_attached]: 2.99999e-06 [receive_attached]: 2.14e-06 [after_resolve]: 1.35e-05 [a_after_grad]: 1.117e-05 [renormalize]: 0.00088229 [add_forward_monad_depend]: 8.42998e-06 [auto_monad_grad]: 3.48e-06 [auto_monad_eliminator]: 2.032e-05 [cse]: 3.024e-05 [a_3]: 7.145e-05 [Cycle 2]: 0.00089273, [45] [expand_dump_flag]: 1.82999e-06 [switch_simplify]: 8.87e-06 [loop_unroll]: 6.79999e-06 [a_1]: 0.00013815 [with_stream_mark]: 1.783e-05 [recompute_prepare]: 6.59999e-06 [updatestate_depend_eliminate]: 3.63999e-06 [updatestate_assign_eliminate]: 2.46998e-06 [updatestate_loads_eliminate]: 2.27001e-06 [parameter_eliminate]: 2.02999e-06 [a_2]: 0.00010118 [accelerated_algorithm]: 6.89001e-06 [shard]: 2.59001e-06 [meta_shard_fg_expand]: 2.11e-06 [shard_inline]: 6.77002e-06 [merge_send_recv]: 1.758e-05 [auto_parallel]: 8.44002e-06 [parallel]: 7.88001e-06 [flash_sp]: 4.18999e-06 [merge_comm]: 4.42e-06 [allreduce_fusion]: 3.76999e-06 [matmul_add_comm_reduction]: 7.38e-06 [allreduce_slice_to_reducescatter]: 4.60015e-07 [virtual_shard_identity]: 8.25999e-06 [virtual_dataset]: 9.31e-06 [get_grad_eliminate_]: 6.08002e-06 [virtual_output]: 6.54999e-06 [merge_forward]: 4.11001e-06 [cell_reuse_recompute_pass]: 2.83e-06 [offload_activation]: 9.27001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.806e-05 [merge_recompute_call_nodes]: 8.99978e-07 [before_grad]: 1.117e-05 [set_forward_comm_id_for_comm_node_pass]: 4.00998e-06 [meta_fg_expand]: 2.67001e-06 [flash_sp_send_recv_attached]: 1.29e-06 [receive_attached]: 1.89e-06 [after_resolve]: 1.155e-05 [a_after_grad]: 9.72001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.71e-06 [auto_monad_grad]: 1.89999e-06 [auto_monad_eliminator]: 1.119e-05 [cse]: 1.848e-05 [a_3]: 5.07e-05 [py_interpret_to_execute_after_opt_a]: 2.054e-05 [slice_cell_reuse_recomputed_activation]: 5.00001e-06 [rewriter_after_opt_a]: 4.516e-05 [convert_after_rewriter]: 1.005e-05 [order_py_execute_after_rewriter]: 8e-06 [mutable_eliminate]: 0.00081803 [opt_b]: 0.00029981, [1] [Cycle 1]: 0.00028837, [7] [b_1]: 0.00017704 [b_2]: 1.033e-05 [updatestate_depend_eliminate]: 9.37999e-06 [updatestate_assign_eliminate]: 3.03e-06 [updatestate_loads_eliminate]: 3.08e-06 [renormalize]: 7.50006e-07 [cse]: 2.706e-05 [optimize_parallel_all_gather_comm]: 2.311e-05 [overlap_param_gather]: 5.40999e-06 [cconv]: 3.912e-05 [loop_unroll]: 0.0006081 [opt_after_cconv]: 0.00014279, [1] [Cycle 1]: 0.0001324, [7] [c_1]: 3.359e-05 [parameter_eliminate]: 5.14998e-06 [updatestate_depend_eliminate]: 7.13998e-06 [updatestate_assign_eliminate]: 2.71e-06 [updatestate_loads_eliminate]: 2.49999e-06 [cse]: 2.387e-05 [renormalize]: 5.10016e-07 [remove_dup_value]: 1.921e-05 [tuple_transform]: 0.00010002, [1] [Cycle 1]: 9.192e-05, [4] [d_1]: 5.161e-05 [none_parameter_eliminate]: 1.86e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 7.31999e-06 [partial_unused_args_eliminate]: 4.89e-06 [add_recomputation]: 5.691e-05 [cse_after_recomputation]: 2.938e-05, [1] [Cycle 1]: 2.188e-05, [1] [cse]: 1.201e-05 [environ_conv]: 9.71e-06 [swap_dp_allreduce_reducescatter]: 8.56997e-06 [bias_add_comm_swap]: 5.94999e-06 [label_micro_interleaved_index]: 9.31e-06 [label_fine_grained_interleaved_index]: 5.06002e-06 [merge_cast_opt]: 4.39002e-06 [slice_recompute_activation]: 5.27001e-06 [micro_interleaved_order_control]: 5.52999e-06 [assign_add_opt]: 3.89002e-06 [ForceFp32Comm]: 4.27998e-06 [remove_cast_before_assign_add]: 3.43e-06 [full_micro_interleaved_order_control]: 4.53999e-06 [reorder_send_recv_between_fp_bp]: 5.14e-06 [comm_op_add_attrs]: 3.71001e-06 [add_comm_op_reuse_tag]: 3.56999e-06 [interleave_split_concat_branches]: 3.48e-06 [interleave_parallel_branches]: 3.68999e-06 [overlap_opt_shard_in_pipeline]: 3.66999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.47e-06 [control_data_broadcast_order]: 1.795e-05 [grouped_pairwise_exchange_alltoall]: 3.84002e-06 [offloading_packed_experts]: 7.23e-06 [overlap_recompute_and_grad_model_parallel]: 7.81001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.61999e-06 [overlap_recompute_allgather_and_fa_grad]: 4.03001e-06 [overlap_recompute_comm]: 5.35999e-06 [overlap_grad_ring_attention]: 7.66001e-06 [overlap_grad_flash_sp]: 2.699e-05 [begin_end_overlap_inline]: 3.09999e-06 [split_matmul_comm_elemetwise]: 4.92e-06 [split_layernorm_comm]: 4.45999e-06 [handle_group_info]: 3.81001e-06 [symbol_engine_optimizer]: 0.00011004, [1] [Cycle 1]: 0.00010194, [6] [build]: 4.30999e-06 [elim_shapecalc]: 1.358e-05 [elim_not_effective]: 1.474e-05 [opt_reshape]: 7.26999e-06 [fold_const_symbol]: 1.227e-05 [renormalize]: 2.40019e-07 [detach_backward]: 5.67999e-06 [pipeline_parallel_scheduler]: 1.92001e-06 [auto_monad_reorder]: 2.594e-05 [get_jit_bprop_graph]: 2.43e-06 [rewriter_after_jit_bprop_graph]: 7.38999e-06 [opt_after_jit_grad]: 0.00082037 [validate]: 5.183e-05 Sums bootstrap : 0.000437s : 3.33% type_inference : 0.006417s : 48.94% event_method : 0.000021s : 0.16% auto_monad : 0.000063s : 0.48% graph_reusing : 0.000007s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000043s : 0.33% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000039s : 0.30% optimize.rewriter_before_opt_a : 0.000142s : 1.08% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000052s : 0.40% optimize.opt_a.loop_unroll : 0.000038s : 0.29% optimize.opt_a.a_1 : 0.000811s : 6.19% optimize.opt_a.with_stream_mark : 0.000042s : 0.32% optimize.opt_a.recompute_prepare : 0.000017s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000213s : 1.63% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.11% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.10% optimize.opt_a.merge_send_recv : 0.000027s : 0.21% optimize.opt_a.auto_parallel : 0.000018s : 0.14% optimize.opt_a.parallel : 0.000030s : 0.23% optimize.opt_a.flash_sp : 0.000015s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.14% optimize.opt_a.virtual_dataset : 0.000017s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.10% optimize.opt_a.virtual_output : 0.000014s : 0.11% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.17% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.19% optimize.opt_a.a_after_grad : 0.000021s : 0.16% optimize.opt_a.renormalize : 0.000882s : 6.73% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.08% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.24% optimize.opt_a.cse : 0.000049s : 0.37% optimize.opt_a.a_3 : 0.000122s : 0.93% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.16% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000045s : 0.34% optimize.convert_after_rewriter : 0.000010s : 0.08% optimize.order_py_execute_after_rewriter : 0.000008s : 0.06% optimize.mutable_eliminate : 0.000818s : 6.24% optimize.opt_b.b_1 : 0.000177s : 1.35% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000027s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000039s : 0.30% optimize.loop_unroll : 0.000608s : 4.64% optimize.opt_after_cconv.c_1 : 0.000034s : 0.26% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000024s : 0.18% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.15% optimize.tuple_transform.d_1 : 0.000052s : 0.39% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000057s : 0.43% optimize.cse_after_recomputation.cse : 0.000012s : 0.09% optimize.environ_conv : 0.000010s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000009s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000006s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000018s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.06% optimize.overlap_grad_flash_sp : 0.000027s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000026s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000007s : 0.06% opt_after_jit_grad : 0.000820s : 6.26% validate : 0.000052s : 0.40% Time group info: ------[substitution.] 0.000220 28 0.96% : 0.000002s : 2: substitution.elim_not_effective 0.75% : 0.000002s : 2: substitution.fold_const_symbol 2.98% : 0.000007s : 4: substitution.graph_param_transform 80.37% : 0.000177s : 4: substitution.inline 2.48% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.71% : 0.000006s : 4: substitution.remove_not_recompute_node 2.62% : 0.000006s : 4: substitution.replace_old_param 7.13% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006356 2 88.36% : 0.005616s : 1: type_inference.infer 11.64% : 0.000740s : 1: type_inference.specialize ------[replace.] 0.000067 8 63.38% : 0.000042s : 4: replace.inline 36.62% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000188 8 92.69% : 0.000174s : 4: match.inline 7.31% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000227 1278 1.04% : 0.000002s : 13: predicate.accumulaten_eliminater 1.12% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.67% : 0.000002s : 8: predicate.addn_check_dump 0.83% : 0.000002s : 13: predicate.addn_zero_filter 0.83% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.12% : 0.000005s : 21: predicate.arithmetic_simplify 0.94% : 0.000002s : 13: predicate.cast_eliminate 0.66% : 0.000002s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.70% : 0.000002s : 8: predicate.depend_value_elim 0.91% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.18% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.80% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.02% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.49% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.26% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_depend_swap 1.64% : 0.000004s : 25: predicate.environ_get_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.33% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.31% : 0.000005s : 21: predicate.float_depend_g_call 0.56% : 0.000001s : 8: predicate.float_environ_get_switch 0.96% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.63% : 0.000001s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.53% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 5.93% : 0.000013s : 58: predicate.inline 0.74% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.83% : 0.000002s : 8: predicate.less_batch_normalization 1.79% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.46% : 0.000006s : 38: predicate.load_eliminater 0.93% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.37% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.69% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.47% : 0.000001s : 8: predicate.merge_addn 0.67% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 13: predicate.minmaximum_grad 1.33% : 0.000003s : 4: predicate.mutable_eliminate 0.45% : 0.000001s : 4: predicate.opt_reshape 0.47% : 0.000001s : 4: predicate.parallel_virtual_node 1.75% : 0.000004s : 21: predicate.partial_defer_inline 1.47% : 0.000003s : 21: predicate.partial_eliminate 0.80% : 0.000002s : 13: predicate.print_const_string_wrapper 0.68% : 0.000002s : 8: predicate.reduce_all_const_elim 1.30% : 0.000003s : 13: predicate.reduce_eliminate 2.38% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 8: predicate.remove_not_recompute_node 1.34% : 0.000003s : 25: predicate.replace_applicator 0.55% : 0.000001s : 8: predicate.replace_old_param 0.29% : 0.000001s : 4: predicate.reset_defer_inline 1.09% : 0.000002s : 13: predicate.reshape_eliminate 0.60% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.56% : 0.000001s : 4: predicate.row_tensor_eliminate 0.94% : 0.000002s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.94% : 0.000002s : 8: predicate.shard_identity_eliminate 0.63% : 0.000001s : 8: predicate.special_op_eliminate 0.66% : 0.000001s : 8: predicate.specialize_transform 1.17% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.93% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.41% : 0.000003s : 21: predicate.switch_defer_inline 1.85% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.93% : 0.000011s : 67: predicate.switch_simplify 0.90% : 0.000002s : 13: predicate.tile_eliminate 0.93% : 0.000002s : 13: predicate.transpose_eliminate 1.49% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 2.09% : 0.000005s : 21: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.30% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.20% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 2.01% : 0.000005s : 25: predicate.tuple_to_list_eliminator_ 2.34% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.82% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 4: predicate.value_based_eliminate 0.59% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.71% : 0.000002s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.58% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000614 11 53.65% : 0.000330s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.35% : 0.000285s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.031742 192 0.02% : 0.000007s : 1: ForceFp32Comm 12.64% : 0.004012s : 1: add_attr 12.57% : 0.003990s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.19% : 0.000061s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.23% : 0.000072s : 1: auto_monad 0.11% : 0.000035s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.52% : 0.000483s : 1: bootstrap 0.13% : 0.000042s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000022s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.10% : 0.000032s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.09% : 0.000030s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.10% : 0.000033s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000012s : 1: label_micro_interleaved_index 1.94% : 0.000615s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.60% : 0.000826s : 1: mutable_eliminate 0.03% : 0.000011s : 1: offloading_packed_experts 0.05% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000022s : 1: opt.transform.mutable_eliminate 3.98% : 0.001264s : 78: opt.transform.opt_a 0.10% : 0.000032s : 1: opt.transform.opt_after_cconv 0.11% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.35% : 0.000111s : 28: opt.transform.opt_b 0.18% : 0.000056s : 2: opt.transform.opt_trans_graph 0.14% : 0.000044s : 4: opt.transform.symbol_engine_opt 10.95% : 0.003476s : 1: opt_a 0.46% : 0.000146s : 1: opt_after_cconv 2.63% : 0.000834s : 1: opt_after_jit_grad 0.96% : 0.000304s : 1: opt_b 21.47% : 0.006814s : 1: optimize 0.08% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000031s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000012s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000008s : 1: pipeline_split 0.16% : 0.000052s : 1: pre_auto_parallel 0.14% : 0.000044s : 1: py_interpret_to_execute 0.08% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000023s : 1: remove_dup_value 1.42% : 0.000451s : 1: renormalize.infer 1.33% : 0.000421s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000049s : 1: rewriter_after_opt_a 0.46% : 0.000147s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000009s : 1: slice_recompute_activation 0.03% : 0.000008s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000113s : 1: symbol_engine_optimizer 0.32% : 0.000103s : 1: tuple_transform 20.36% : 0.006464s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:44:59.965.872 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0163002, [21] [bootstrap]: 0.00044583 [type_inference]: 0.00658791 [event_method]: 2.089e-05 [auto_monad]: 6.481e-05 [graph_reusing]: 5.39e-06 [inline]: 1.94999e-06 [add_attr]: 0.00349719, [1] [add_attr_with_inline]: 0.00348595, [1] [Cycle 1]: 7.234e-05, [2] [tag_attr]: 2.359e-05 [meta_addattr_fg_expand]: 5.70001e-06 [parallel-infer-symbol]: 3.58e-06 [pre_auto_parallel]: 3.903e-05 [insert-virtual-dataset]: 2.49999e-06 [parallel-infer-symbol-second]: 1.00001e-06 [dataset_repeat_opt]: 1.94999e-06 [pipeline_split]: 1.65001e-06 [optimize]: 0.00494764, [53] [py_interpret_to_execute]: 2.856e-05 [rewriter_before_opt_a]: 8.629e-05 [opt_a]: 0.0028083, [2] [Cycle 1]: 0.0021218, [45] [expand_dump_flag]: 3.06999e-06 [switch_simplify]: 4.235e-05 [loop_unroll]: 2.965e-05 [a_1]: 0.0006395 [with_stream_mark]: 1.966e-05 [recompute_prepare]: 9.52999e-06 [updatestate_depend_eliminate]: 4.13001e-06 [updatestate_assign_eliminate]: 3.10002e-06 [updatestate_loads_eliminate]: 3.23e-06 [parameter_eliminate]: 1.86e-06 [a_2]: 8.196e-05 [accelerated_algorithm]: 6.93998e-06 [shard]: 1.86998e-06 [meta_shard_fg_expand]: 2.50002e-06 [shard_inline]: 6.59999e-06 [merge_send_recv]: 8.35001e-06 [auto_parallel]: 7.04001e-06 [parallel]: 1.787e-05 [flash_sp]: 8.48999e-06 [merge_comm]: 3.76001e-06 [allreduce_fusion]: 3.85e-06 [matmul_add_comm_reduction]: 9.37001e-06 [allreduce_slice_to_reducescatter]: 8.29983e-07 [virtual_shard_identity]: 9.59e-06 [virtual_dataset]: 6.59001e-06 [get_grad_eliminate_]: 6.22001e-06 [virtual_output]: 6.34999e-06 [merge_forward]: 4.13999e-06 [cell_reuse_recompute_pass]: 1.15999e-06 [offload_activation]: 1.049e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.599e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.124e-05 [set_forward_comm_id_for_comm_node_pass]: 3.98001e-06 [meta_fg_expand]: 3.13e-06 [flash_sp_send_recv_attached]: 2.79999e-06 [receive_attached]: 2.50002e-06 [after_resolve]: 1.336e-05 [a_after_grad]: 1.048e-05 [renormalize]: 0.00073029 [add_forward_monad_depend]: 6.51e-06 [auto_monad_grad]: 2.48002e-06 [auto_monad_eliminator]: 1.605e-05 [cse]: 3.027e-05 [a_3]: 5.148e-05 [Cycle 2]: 0.00067648, [45] [expand_dump_flag]: 1.50999e-06 [switch_simplify]: 8.11002e-06 [loop_unroll]: 6.17999e-06 [a_1]: 0.00012951 [with_stream_mark]: 1.474e-05 [recompute_prepare]: 6.48e-06 [updatestate_depend_eliminate]: 3.43e-06 [updatestate_assign_eliminate]: 2.54999e-06 [updatestate_loads_eliminate]: 2.39999e-06 [parameter_eliminate]: 1.40999e-06 [a_2]: 7.13e-05 [accelerated_algorithm]: 6.69999e-06 [shard]: 1.81e-06 [meta_shard_fg_expand]: 1.86e-06 [shard_inline]: 6.16998e-06 [merge_send_recv]: 6.19999e-06 [auto_parallel]: 6.11e-06 [parallel]: 6.07001e-06 [flash_sp]: 3.46999e-06 [merge_comm]: 4.1e-06 [allreduce_fusion]: 3.15998e-06 [matmul_add_comm_reduction]: 7.37002e-06 [allreduce_slice_to_reducescatter]: 5.50004e-07 [virtual_shard_identity]: 8.54e-06 [virtual_dataset]: 5.63997e-06 [get_grad_eliminate_]: 5.79999e-06 [virtual_output]: 6.06e-06 [merge_forward]: 3.85e-06 [cell_reuse_recompute_pass]: 1.78002e-06 [offload_activation]: 7.85998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.497e-05 [merge_recompute_call_nodes]: 1.37999e-06 [before_grad]: 1.038e-05 [set_forward_comm_id_for_comm_node_pass]: 3.86001e-06 [meta_fg_expand]: 2.48002e-06 [flash_sp_send_recv_attached]: 1.07e-06 [receive_attached]: 1.45001e-06 [after_resolve]: 1.039e-05 [a_after_grad]: 9.66998e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 2.07999e-06 [auto_monad_grad]: 1.22e-06 [auto_monad_eliminator]: 8.72e-06 [cse]: 1.544e-05 [a_3]: 3.64e-05 [py_interpret_to_execute_after_opt_a]: 1.365e-05 [slice_cell_reuse_recomputed_activation]: 1.99e-06 [rewriter_after_opt_a]: 3.911e-05 [convert_after_rewriter]: 7.14001e-06 [order_py_execute_after_rewriter]: 4.92e-06 [mutable_eliminate]: 0.00061531 [opt_b]: 0.00021053, [1] [Cycle 1]: 0.00020372, [7] [b_1]: 0.00012399 [b_2]: 8.52e-06 [updatestate_depend_eliminate]: 7.45e-06 [updatestate_assign_eliminate]: 2.98e-06 [updatestate_loads_eliminate]: 2.58e-06 [renormalize]: 9.79984e-07 [cse]: 2.173e-05 [optimize_parallel_all_gather_comm]: 1.735e-05 [overlap_param_gather]: 2.11998e-06 [cconv]: 2.943e-05 [loop_unroll]: 0.00042963 [opt_after_cconv]: 0.00010221, [1] [Cycle 1]: 9.637e-05, [7] [c_1]: 3.117e-05 [parameter_eliminate]: 3.83001e-06 [updatestate_depend_eliminate]: 5.17e-06 [updatestate_assign_eliminate]: 2.37999e-06 [updatestate_loads_eliminate]: 2.26e-06 [cse]: 1.716e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.408e-05 [tuple_transform]: 7.487e-05, [1] [Cycle 1]: 7.071e-05, [4] [d_1]: 4.321e-05 [none_parameter_eliminate]: 1.44e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 6.82002e-06 [partial_unused_args_eliminate]: 1.92001e-06 [add_recomputation]: 4.935e-05 [cse_after_recomputation]: 2.089e-05, [1] [Cycle 1]: 1.623e-05, [1] [cse]: 1.079e-05 [environ_conv]: 5.31998e-06 [swap_dp_allreduce_reducescatter]: 5.35001e-06 [bias_add_comm_swap]: 3.01001e-06 [label_micro_interleaved_index]: 4.63999e-06 [label_fine_grained_interleaved_index]: 2.86999e-06 [merge_cast_opt]: 1.52999e-06 [slice_recompute_activation]: 2.00002e-06 [micro_interleaved_order_control]: 2.26998e-06 [assign_add_opt]: 1.27e-06 [ForceFp32Comm]: 7.79983e-07 [remove_cast_before_assign_add]: 1.00001e-06 [full_micro_interleaved_order_control]: 2.01998e-06 [reorder_send_recv_between_fp_bp]: 2.74001e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 1.12e-06 [interleave_split_concat_branches]: 1.07e-06 [interleave_parallel_branches]: 9.99979e-07 [overlap_opt_shard_in_pipeline]: 1.15001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.86003e-06 [control_data_broadcast_order]: 1.231e-05 [grouped_pairwise_exchange_alltoall]: 1.48002e-06 [offloading_packed_experts]: 4.01001e-06 [overlap_recompute_and_grad_model_parallel]: 4.63999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.13001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.72999e-06 [overlap_recompute_comm]: 2.26e-06 [overlap_grad_ring_attention]: 4.42e-06 [overlap_grad_flash_sp]: 1.9e-05 [begin_end_overlap_inline]: 6.50005e-07 [split_matmul_comm_elemetwise]: 2.15002e-06 [split_layernorm_comm]: 1.60001e-06 [handle_group_info]: 1.00001e-06 [symbol_engine_optimizer]: 7.317e-05, [1] [Cycle 1]: 6.9e-05, [6] [build]: 2.71999e-06 [elim_shapecalc]: 9.29998e-06 [elim_not_effective]: 1.179e-05 [opt_reshape]: 7.18e-06 [fold_const_symbol]: 1.009e-05 [renormalize]: 2.10013e-07 [detach_backward]: 2.06e-06 [pipeline_parallel_scheduler]: 1.76998e-06 [auto_monad_reorder]: 1.586e-05 [get_jit_bprop_graph]: 2.34001e-06 [rewriter_after_jit_bprop_graph]: 4.62e-06 [opt_after_jit_grad]: 0.00045504 [validate]: 4.052e-05 Sums bootstrap : 0.000446s : 3.77% type_inference : 0.006588s : 55.72% event_method : 0.000021s : 0.18% auto_monad : 0.000065s : 0.55% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000039s : 0.33% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.24% optimize.rewriter_before_opt_a : 0.000086s : 0.73% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000050s : 0.43% optimize.opt_a.loop_unroll : 0.000036s : 0.30% optimize.opt_a.a_1 : 0.000769s : 6.50% optimize.opt_a.with_stream_mark : 0.000034s : 0.29% optimize.opt_a.recompute_prepare : 0.000016s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000153s : 1.30% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.12% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.11% optimize.opt_a.merge_send_recv : 0.000015s : 0.12% optimize.opt_a.auto_parallel : 0.000013s : 0.11% optimize.opt_a.parallel : 0.000024s : 0.20% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.15% optimize.opt_a.virtual_dataset : 0.000012s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.10% optimize.opt_a.virtual_output : 0.000012s : 0.10% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.07% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.20% optimize.opt_a.a_after_grad : 0.000020s : 0.17% optimize.opt_a.renormalize : 0.000730s : 6.18% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.21% optimize.opt_a.cse : 0.000046s : 0.39% optimize.opt_a.a_3 : 0.000088s : 0.74% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000039s : 0.33% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.04% optimize.mutable_eliminate : 0.000615s : 5.20% optimize.opt_b.b_1 : 0.000124s : 1.05% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000022s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000029s : 0.25% optimize.loop_unroll : 0.000430s : 3.63% optimize.opt_after_cconv.c_1 : 0.000031s : 0.26% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.15% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.12% optimize.tuple_transform.d_1 : 0.000043s : 0.37% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000049s : 0.42% optimize.cse_after_recomputation.cse : 0.000011s : 0.09% optimize.environ_conv : 0.000005s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.10% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000019s : 0.16% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.10% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000016s : 0.13% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000455s : 3.85% validate : 0.000041s : 0.34% Time group info: ------[substitution.] 0.000198 28 0.97% : 0.000002s : 2: substitution.elim_not_effective 0.69% : 0.000001s : 2: substitution.fold_const_symbol 3.11% : 0.000006s : 4: substitution.graph_param_transform 80.14% : 0.000158s : 4: substitution.inline 2.19% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.71% : 0.000005s : 4: substitution.remove_not_recompute_node 2.44% : 0.000005s : 4: substitution.replace_old_param 7.74% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006520 2 88.73% : 0.005785s : 1: type_inference.infer 11.27% : 0.000735s : 1: type_inference.specialize ------[replace.] 0.000063 8 64.40% : 0.000041s : 4: replace.inline 35.60% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000169 8 92.10% : 0.000156s : 4: match.inline 7.90% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000203 1278 0.87% : 0.000002s : 13: predicate.accumulaten_eliminater 0.73% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 0.84% : 0.000002s : 13: predicate.addn_zero_filter 0.81% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.93% : 0.000004s : 21: predicate.arithmetic_simplify 0.91% : 0.000002s : 13: predicate.cast_eliminate 0.62% : 0.000001s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.23% : 0.000000s : 4: predicate.const_output_eliminate 0.62% : 0.000001s : 8: predicate.depend_value_elim 0.90% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.01% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.87% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.39% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.12% : 0.000002s : 17: predicate.environ_get_depend_swap 1.71% : 0.000003s : 25: predicate.environ_get_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.45% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.52% : 0.000005s : 21: predicate.float_depend_g_call 0.53% : 0.000001s : 8: predicate.float_environ_get_switch 0.79% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.64% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.67% : 0.000001s : 8: predicate.incorporate_call 0.51% : 0.000001s : 8: predicate.incorporate_call_switch 6.52% : 0.000013s : 58: predicate.inline 0.81% : 0.000002s : 8: predicate.inline_without_move 0.36% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.03% : 0.000002s : 8: predicate.less_batch_normalization 1.78% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.56% : 0.000005s : 38: predicate.load_eliminater 1.10% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.38% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.60% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 13: predicate.minmaximum_grad 1.59% : 0.000003s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.37% : 0.000001s : 4: predicate.parallel_virtual_node 1.83% : 0.000004s : 21: predicate.partial_defer_inline 1.70% : 0.000003s : 21: predicate.partial_eliminate 0.89% : 0.000002s : 13: predicate.print_const_string_wrapper 0.57% : 0.000001s : 8: predicate.reduce_all_const_elim 1.04% : 0.000002s : 13: predicate.reduce_eliminate 2.50% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.74% : 0.000002s : 8: predicate.remove_not_recompute_node 1.44% : 0.000003s : 25: predicate.replace_applicator 0.40% : 0.000001s : 8: predicate.replace_old_param 0.26% : 0.000001s : 4: predicate.reset_defer_inline 0.92% : 0.000002s : 13: predicate.reshape_eliminate 0.65% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.30% : 0.000001s : 4: predicate.row_tensor_eliminate 0.83% : 0.000002s : 8: predicate.same_eliminate 0.53% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.97% : 0.000002s : 8: predicate.shard_identity_eliminate 0.70% : 0.000001s : 8: predicate.special_op_eliminate 0.72% : 0.000001s : 8: predicate.specialize_transform 0.84% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.93% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.50% : 0.000003s : 21: predicate.switch_defer_inline 2.01% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.26% : 0.000011s : 67: predicate.switch_simplify 0.92% : 0.000002s : 13: predicate.tile_eliminate 0.96% : 0.000002s : 13: predicate.transpose_eliminate 1.48% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.38% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.14% : 0.000006s : 33: predicate.tuple_list_get_item_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.70% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.44% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.10% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 4: predicate.value_based_eliminate 0.61% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.74% : 0.000002s : 8: predicate.virtual_output_eliminate 0.29% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000601 11 55.27% : 0.000332s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.73% : 0.000269s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026806 192 0.01% : 0.000003s : 1: ForceFp32Comm 13.07% : 0.003503s : 1: add_attr 13.02% : 0.003490s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.20% : 0.000054s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.26% : 0.000070s : 1: auto_monad 0.07% : 0.000019s : 1: auto_monad_reorder 0.02% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.77% : 0.000476s : 1: bootstrap 0.12% : 0.000033s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000015s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.09% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.10% : 0.000028s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.63% : 0.000437s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.33% : 0.000624s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 4.46% : 0.001196s : 78: opt.transform.opt_a 0.11% : 0.000030s : 1: opt.transform.opt_after_cconv 0.09% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.37% : 0.000100s : 28: opt.transform.opt_b 0.18% : 0.000048s : 2: opt.transform.opt_trans_graph 0.13% : 0.000035s : 4: opt.transform.symbol_engine_opt 10.49% : 0.002812s : 1: opt_a 0.39% : 0.000106s : 1: opt_after_cconv 1.73% : 0.000464s : 1: opt_after_jit_grad 0.80% : 0.000214s : 1: opt_b 18.48% : 0.004953s : 1: optimize 0.08% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.08% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.16% : 0.000044s : 1: pre_auto_parallel 0.12% : 0.000033s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000018s : 1: remove_dup_value 1.44% : 0.000386s : 1: renormalize.infer 1.25% : 0.000335s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000043s : 1: rewriter_after_opt_a 0.34% : 0.000090s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.28% : 0.000076s : 1: symbol_engine_optimizer 0.29% : 0.000078s : 1: tuple_transform 24.65% : 0.006609s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:00.290.711 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:00.290.968 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.021255, [21] [bootstrap]: 0.00045476 [type_inference]: 0.00823458 [event_method]: 2.395e-05 [auto_monad]: 7.135e-05 [graph_reusing]: 6.04001e-06 [inline]: 2.67001e-06 [add_attr]: 0.00403042, [1] [add_attr_with_inline]: 0.00401589, [1] [Cycle 1]: 0.00010412, [2] [tag_attr]: 2.892e-05 [meta_addattr_fg_expand]: 6.24999e-06 [parallel-infer-symbol]: 4.12e-06 [pre_auto_parallel]: 4.441e-05 [insert-virtual-dataset]: 2.74001e-06 [parallel-infer-symbol-second]: 8.10018e-07 [dataset_repeat_opt]: 2.07999e-06 [pipeline_split]: 1.77999e-06 [optimize]: 0.00672886, [53] [py_interpret_to_execute]: 4.027e-05 [rewriter_before_opt_a]: 0.00010335 [opt_a]: 0.00366081, [2] [Cycle 1]: 0.00265469, [45] [expand_dump_flag]: 3.41001e-06 [switch_simplify]: 4.336e-05 [loop_unroll]: 3.111e-05 [a_1]: 0.00069545 [with_stream_mark]: 2.688e-05 [recompute_prepare]: 1.199e-05 [updatestate_depend_eliminate]: 4.98001e-06 [updatestate_assign_eliminate]: 4.07003e-06 [updatestate_loads_eliminate]: 3.27002e-06 [parameter_eliminate]: 2.91e-06 [a_2]: 0.00011579 [accelerated_algorithm]: 9.34998e-06 [shard]: 2.53998e-06 [meta_shard_fg_expand]: 2.80002e-06 [shard_inline]: 7.3e-06 [merge_send_recv]: 1.019e-05 [auto_parallel]: 1.035e-05 [parallel]: 2.06e-05 [flash_sp]: 1.093e-05 [merge_comm]: 4.60001e-06 [allreduce_fusion]: 3.84002e-06 [matmul_add_comm_reduction]: 1.001e-05 [allreduce_slice_to_reducescatter]: 1.10001e-06 [virtual_shard_identity]: 1.246e-05 [virtual_dataset]: 7.36001e-06 [get_grad_eliminate_]: 6.58998e-06 [virtual_output]: 6.83e-06 [merge_forward]: 5.01002e-06 [cell_reuse_recompute_pass]: 1.50999e-06 [offload_activation]: 1.182e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.194e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.327e-05 [set_forward_comm_id_for_comm_node_pass]: 4.75999e-06 [meta_fg_expand]: 2.96999e-06 [flash_sp_send_recv_attached]: 3.18998e-06 [receive_attached]: 2.36e-06 [after_resolve]: 1.499e-05 [a_after_grad]: 1.242e-05 [renormalize]: 0.00089704 [add_forward_monad_depend]: 9.51003e-06 [auto_monad_grad]: 3.13998e-06 [auto_monad_eliminator]: 2.032e-05 [cse]: 3.029e-05 [a_3]: 7.472e-05 [Cycle 2]: 0.00098661, [45] [expand_dump_flag]: 2.37001e-06 [switch_simplify]: 9.41e-06 [loop_unroll]: 6.66e-06 [a_1]: 0.00014189 [with_stream_mark]: 2.263e-05 [recompute_prepare]: 7.82e-06 [updatestate_depend_eliminate]: 4.23999e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 2.58e-06 [parameter_eliminate]: 1.56998e-06 [a_2]: 0.00010314 [accelerated_algorithm]: 7.33e-06 [shard]: 3.11999e-06 [meta_shard_fg_expand]: 1.94e-06 [shard_inline]: 6.81001e-06 [merge_send_recv]: 7.75e-06 [auto_parallel]: 9.56e-06 [parallel]: 9.60001e-06 [flash_sp]: 3.98999e-06 [merge_comm]: 5.92001e-06 [allreduce_fusion]: 3.81001e-06 [matmul_add_comm_reduction]: 1.511e-05 [allreduce_slice_to_reducescatter]: 7.39994e-07 [virtual_shard_identity]: 9.52999e-06 [virtual_dataset]: 7.18e-06 [get_grad_eliminate_]: 6.47001e-06 [virtual_output]: 6.26998e-06 [merge_forward]: 4.87e-06 [cell_reuse_recompute_pass]: 3.18e-06 [offload_activation]: 1.034e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.316e-05 [merge_recompute_call_nodes]: 1.14998e-06 [before_grad]: 1.088e-05 [set_forward_comm_id_for_comm_node_pass]: 4.33001e-06 [meta_fg_expand]: 3.28998e-06 [flash_sp_send_recv_attached]: 1.76e-06 [receive_attached]: 2.41998e-06 [after_resolve]: 1.399e-05 [a_after_grad]: 1.063e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 3.40998e-06 [auto_monad_grad]: 1.69998e-06 [auto_monad_eliminator]: 1.327e-05 [cse]: 2.201e-05 [a_3]: 5.53e-05 [py_interpret_to_execute_after_opt_a]: 2.353e-05 [slice_cell_reuse_recomputed_activation]: 5.46998e-06 [rewriter_after_opt_a]: 5.236e-05 [convert_after_rewriter]: 1.217e-05 [order_py_execute_after_rewriter]: 9.22999e-06 [mutable_eliminate]: 0.00081696 [opt_b]: 0.00032708, [1] [Cycle 1]: 0.00031322, [7] [b_1]: 0.00019042 [b_2]: 1.031e-05 [updatestate_depend_eliminate]: 1.076e-05 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 3.05002e-06 [renormalize]: 8.00006e-07 [cse]: 3.049e-05 [optimize_parallel_all_gather_comm]: 2.723e-05 [overlap_param_gather]: 6.09001e-06 [cconv]: 4.138e-05 [loop_unroll]: 0.00067547 [opt_after_cconv]: 0.00015299, [1] [Cycle 1]: 0.00014137, [7] [c_1]: 3.529e-05 [parameter_eliminate]: 5.92999e-06 [updatestate_depend_eliminate]: 8.87999e-06 [updatestate_assign_eliminate]: 2.98e-06 [updatestate_loads_eliminate]: 2.71e-06 [cse]: 2.676e-05 [renormalize]: 7.00005e-07 [remove_dup_value]: 1.953e-05 [tuple_transform]: 0.00010276, [1] [Cycle 1]: 9.525e-05, [4] [d_1]: 5.336e-05 [none_parameter_eliminate]: 1.74998e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 7.53e-06 [partial_unused_args_eliminate]: 4.95999e-06 [add_recomputation]: 5.967e-05 [cse_after_recomputation]: 3.049e-05, [1] [Cycle 1]: 2.282e-05, [1] [cse]: 1.307e-05 [environ_conv]: 9.01002e-06 [swap_dp_allreduce_reducescatter]: 8.85001e-06 [bias_add_comm_swap]: 6.16e-06 [label_micro_interleaved_index]: 9.22001e-06 [label_fine_grained_interleaved_index]: 5.43002e-06 [merge_cast_opt]: 4.89003e-06 [slice_recompute_activation]: 4.80999e-06 [micro_interleaved_order_control]: 5.14e-06 [assign_add_opt]: 4.11001e-06 [ForceFp32Comm]: 3.51999e-06 [remove_cast_before_assign_add]: 3.73999e-06 [full_micro_interleaved_order_control]: 4.36002e-06 [reorder_send_recv_between_fp_bp]: 5.20001e-06 [comm_op_add_attrs]: 4.10998e-06 [add_comm_op_reuse_tag]: 3.18998e-06 [interleave_split_concat_branches]: 3.99002e-06 [interleave_parallel_branches]: 3.40998e-06 [overlap_opt_shard_in_pipeline]: 3.61001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.61002e-06 [control_data_broadcast_order]: 1.926e-05 [grouped_pairwise_exchange_alltoall]: 4.47e-06 [offloading_packed_experts]: 7.59002e-06 [overlap_recompute_and_grad_model_parallel]: 7.98999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.63999e-06 [overlap_recompute_allgather_and_fa_grad]: 4.05e-06 [overlap_recompute_comm]: 5.34e-06 [overlap_grad_ring_attention]: 7.41001e-06 [overlap_grad_flash_sp]: 2.774e-05 [begin_end_overlap_inline]: 3.17002e-06 [split_matmul_comm_elemetwise]: 4.56002e-06 [split_layernorm_comm]: 4.94e-06 [handle_group_info]: 3.98001e-06 [symbol_engine_optimizer]: 0.00011175, [1] [Cycle 1]: 0.00010395, [6] [build]: 5.33002e-06 [elim_shapecalc]: 1.283e-05 [elim_not_effective]: 1.449e-05 [opt_reshape]: 8.70001e-06 [fold_const_symbol]: 1.083e-05 [renormalize]: 3.60014e-07 [detach_backward]: 4.86002e-06 [pipeline_parallel_scheduler]: 2.01e-06 [auto_monad_reorder]: 2.452e-05 [get_jit_bprop_graph]: 1.96998e-06 [rewriter_after_jit_bprop_graph]: 7.53999e-06 [opt_after_jit_grad]: 0.00088912 [validate]: 5.279e-05 Sums bootstrap : 0.000455s : 2.99% type_inference : 0.008235s : 54.09% event_method : 0.000024s : 0.16% auto_monad : 0.000071s : 0.47% graph_reusing : 0.000006s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000029s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000044s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000040s : 0.26% optimize.rewriter_before_opt_a : 0.000103s : 0.68% optimize.opt_a.expand_dump_flag : 0.000006s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.35% optimize.opt_a.loop_unroll : 0.000038s : 0.25% optimize.opt_a.a_1 : 0.000837s : 5.50% optimize.opt_a.with_stream_mark : 0.000050s : 0.33% optimize.opt_a.recompute_prepare : 0.000020s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.04% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000219s : 1.44% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.11% optimize.opt_a.shard : 0.000006s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.09% optimize.opt_a.merge_send_recv : 0.000018s : 0.12% optimize.opt_a.auto_parallel : 0.000020s : 0.13% optimize.opt_a.parallel : 0.000030s : 0.20% optimize.opt_a.flash_sp : 0.000015s : 0.10% optimize.opt_a.merge_comm : 0.000011s : 0.07% optimize.opt_a.allreduce_fusion : 0.000008s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.14% optimize.opt_a.virtual_dataset : 0.000015s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.09% optimize.opt_a.virtual_output : 0.000013s : 0.09% optimize.opt_a.merge_forward : 0.000010s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.03% optimize.opt_a.offload_activation : 0.000022s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000045s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.16% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.03% optimize.opt_a.receive_attached : 0.000005s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.19% optimize.opt_a.a_after_grad : 0.000023s : 0.15% optimize.opt_a.renormalize : 0.000897s : 5.89% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.08% optimize.opt_a.auto_monad_grad : 0.000005s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.22% optimize.opt_a.cse : 0.000052s : 0.34% optimize.opt_a.a_3 : 0.000130s : 0.85% optimize.py_interpret_to_execute_after_opt_a : 0.000024s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000052s : 0.34% optimize.convert_after_rewriter : 0.000012s : 0.08% optimize.order_py_execute_after_rewriter : 0.000009s : 0.06% optimize.mutable_eliminate : 0.000817s : 5.37% optimize.opt_b.b_1 : 0.000190s : 1.25% optimize.opt_b.b_2 : 0.000010s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000030s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000027s : 0.18% optimize.overlap_param_gather : 0.000006s : 0.04% optimize.cconv : 0.000041s : 0.27% optimize.loop_unroll : 0.000675s : 4.44% optimize.opt_after_cconv.c_1 : 0.000035s : 0.23% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000027s : 0.18% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.13% optimize.tuple_transform.d_1 : 0.000053s : 0.35% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.05% optimize.partial_unused_args_eliminate : 0.000005s : 0.03% optimize.add_recomputation : 0.000060s : 0.39% optimize.cse_after_recomputation.cse : 0.000013s : 0.09% optimize.environ_conv : 0.000009s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.06% optimize.bias_add_comm_swap : 0.000006s : 0.04% optimize.label_micro_interleaved_index : 0.000009s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000005s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.03% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.02% optimize.remove_cast_before_assign_add : 0.000004s : 0.02% optimize.full_micro_interleaved_order_control : 0.000004s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.03% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.03% optimize.control_data_broadcast_order : 0.000019s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000028s : 0.18% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.03% optimize.split_layernorm_comm : 0.000005s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000005s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.10% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.07% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000025s : 0.16% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000008s : 0.05% opt_after_jit_grad : 0.000889s : 5.84% validate : 0.000053s : 0.35% Time group info: ------[substitution.] 0.000224 28 0.93% : 0.000002s : 2: substitution.elim_not_effective 0.75% : 0.000002s : 2: substitution.fold_const_symbol 3.24% : 0.000007s : 4: substitution.graph_param_transform 79.28% : 0.000177s : 4: substitution.inline 2.29% : 0.000005s : 4: substitution.j_node_and_user_rematch 3.02% : 0.000007s : 4: substitution.remove_not_recompute_node 3.28% : 0.000007s : 4: substitution.replace_old_param 7.21% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.008162 2 87.11% : 0.007110s : 1: type_inference.infer 12.89% : 0.001052s : 1: type_inference.specialize ------[replace.] 0.000070 8 63.35% : 0.000044s : 4: replace.inline 36.65% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000189 8 92.51% : 0.000175s : 4: match.inline 7.49% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000244 1278 0.84% : 0.000002s : 13: predicate.accumulaten_eliminater 1.51% : 0.000004s : 4: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 8: predicate.addn_check_dump 1.11% : 0.000003s : 13: predicate.addn_zero_filter 0.91% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.95% : 0.000005s : 21: predicate.arithmetic_simplify 1.09% : 0.000003s : 13: predicate.cast_eliminate 0.61% : 0.000001s : 8: predicate.check_bprop_eliminate 0.59% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.62% : 0.000002s : 8: predicate.depend_value_elim 0.80% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.12% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.75% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.50% : 0.000004s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.63% : 0.000002s : 4: predicate.elim_shapecalc_of_broadcastargs 1.41% : 0.000003s : 17: predicate.environ_add_const_eliminate 0.96% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.05% : 0.000003s : 17: predicate.environ_get_depend_swap 1.78% : 0.000004s : 25: predicate.environ_get_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.38% : 0.000006s : 21: predicate.float_depend_g_call 0.53% : 0.000001s : 8: predicate.float_environ_get_switch 0.75% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.73% : 0.000002s : 8: predicate.get_grad_eliminate 0.27% : 0.000001s : 4: predicate.graph_param_transform 0.48% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 5.94% : 0.000014s : 58: predicate.inline 0.97% : 0.000002s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.15% : 0.000003s : 8: predicate.less_batch_normalization 1.78% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.61% : 0.000006s : 38: predicate.load_eliminater 1.38% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.07% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.60% : 0.000001s : 8: predicate.merge_addn 0.70% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.72% : 0.000002s : 13: predicate.minmaximum_grad 2.15% : 0.000005s : 4: predicate.mutable_eliminate 0.39% : 0.000001s : 4: predicate.opt_reshape 0.32% : 0.000001s : 4: predicate.parallel_virtual_node 1.67% : 0.000004s : 21: predicate.partial_defer_inline 1.38% : 0.000003s : 21: predicate.partial_eliminate 0.78% : 0.000002s : 13: predicate.print_const_string_wrapper 0.66% : 0.000002s : 8: predicate.reduce_all_const_elim 1.17% : 0.000003s : 13: predicate.reduce_eliminate 2.49% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.54% : 0.000001s : 8: predicate.remove_not_recompute_node 1.15% : 0.000003s : 25: predicate.replace_applicator 0.41% : 0.000001s : 8: predicate.replace_old_param 0.46% : 0.000001s : 4: predicate.reset_defer_inline 0.99% : 0.000002s : 13: predicate.reshape_eliminate 0.61% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 4: predicate.row_tensor_eliminate 0.82% : 0.000002s : 8: predicate.same_eliminate 0.51% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 8: predicate.shard_identity_eliminate 0.84% : 0.000002s : 8: predicate.special_op_eliminate 0.66% : 0.000002s : 8: predicate.specialize_transform 1.21% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.36% : 0.000003s : 21: predicate.switch_defer_inline 1.91% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.54% : 0.000011s : 67: predicate.switch_simplify 0.82% : 0.000002s : 13: predicate.tile_eliminate 0.96% : 0.000002s : 13: predicate.transpose_eliminate 1.28% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.48% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.58% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.52% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.84% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.11% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.70% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.48% : 0.000001s : 4: predicate.value_based_eliminate 0.68% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000639 11 51.16% : 0.000327s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.84% : 0.000312s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.034401 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.75% : 0.004043s : 1: add_attr 11.69% : 0.004021s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.19% : 0.000064s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.24% : 0.000081s : 1: auto_monad 0.09% : 0.000033s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.44% : 0.000497s : 1: bootstrap 0.13% : 0.000045s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000024s : 1: control_data_broadcast_order 0.05% : 0.000016s : 1: convert_after_rewriter 0.10% : 0.000034s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.09% : 0.000029s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000036s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.02% : 0.000009s : 1: inline 0.03% : 0.000010s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000012s : 1: label_micro_interleaved_index 1.99% : 0.000684s : 1: loop_unroll 0.02% : 0.000008s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 2.41% : 0.000828s : 1: mutable_eliminate 0.03% : 0.000011s : 1: offloading_packed_experts 0.06% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.09% : 0.000029s : 1: opt.transform.mutable_eliminate 3.83% : 0.001317s : 78: opt.transform.opt_a 0.10% : 0.000033s : 1: opt.transform.opt_after_cconv 0.12% : 0.000040s : 1: opt.transform.opt_after_jit_grad 0.34% : 0.000117s : 28: opt.transform.opt_b 0.17% : 0.000059s : 2: opt.transform.opt_trans_graph 0.12% : 0.000042s : 4: opt.transform.symbol_engine_opt 10.65% : 0.003664s : 1: opt_a 0.46% : 0.000157s : 1: opt_after_cconv 2.64% : 0.000906s : 1: opt_after_jit_grad 0.97% : 0.000332s : 1: opt_b 20.63% : 0.007096s : 1: optimize 0.09% : 0.000031s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.09% : 0.000031s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000010s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000013s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000052s : 1: pre_auto_parallel 0.13% : 0.000045s : 1: py_interpret_to_execute 0.08% : 0.000028s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000023s : 1: remove_dup_value 1.37% : 0.000470s : 1: renormalize.infer 1.21% : 0.000415s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000057s : 1: rewriter_after_opt_a 0.31% : 0.000108s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000009s : 1: slice_recompute_activation 0.03% : 0.000009s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.03% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000115s : 1: symbol_engine_optimizer 0.31% : 0.000106s : 1: tuple_transform 24.11% : 0.008294s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:00.563.046 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0193682, [21] [bootstrap]: 0.00049167 [type_inference]: 0.00747088 [event_method]: 2.447e-05 [auto_monad]: 7.58e-05 [graph_reusing]: 7.40998e-06 [inline]: 3.01001e-06 [add_attr]: 0.00434295, [1] [add_attr_with_inline]: 0.0043279, [1] [Cycle 1]: 8.309e-05, [2] [tag_attr]: 2.686e-05 [meta_addattr_fg_expand]: 6.13002e-06 [parallel-infer-symbol]: 4.27998e-06 [pre_auto_parallel]: 4.644e-05 [insert-virtual-dataset]: 3.13e-06 [parallel-infer-symbol-second]: 9.20001e-07 [dataset_repeat_opt]: 2.17001e-06 [pipeline_split]: 1.76998e-06 [optimize]: 0.00593873, [53] [py_interpret_to_execute]: 3.642e-05 [rewriter_before_opt_a]: 9.918e-05 [opt_a]: 0.00325139, [2] [Cycle 1]: 0.00248498, [45] [expand_dump_flag]: 3.81999e-06 [switch_simplify]: 4.504e-05 [loop_unroll]: 3.055e-05 [a_1]: 0.00074991 [with_stream_mark]: 2.441e-05 [recompute_prepare]: 1.114e-05 [updatestate_depend_eliminate]: 4.28999e-06 [updatestate_assign_eliminate]: 3.31999e-06 [updatestate_loads_eliminate]: 2.99999e-06 [parameter_eliminate]: 2.06e-06 [a_2]: 8.478e-05 [accelerated_algorithm]: 7.97e-06 [shard]: 2.12999e-06 [meta_shard_fg_expand]: 2.22001e-06 [shard_inline]: 6.84999e-06 [merge_send_recv]: 1.142e-05 [auto_parallel]: 8.37e-06 [parallel]: 2.033e-05 [flash_sp]: 1.161e-05 [merge_comm]: 4.52e-06 [allreduce_fusion]: 3.68999e-06 [matmul_add_comm_reduction]: 1.035e-05 [allreduce_slice_to_reducescatter]: 7.59988e-07 [virtual_shard_identity]: 9.27999e-06 [virtual_dataset]: 6.91001e-06 [get_grad_eliminate_]: 6.43e-06 [virtual_output]: 7.06999e-06 [merge_forward]: 3.91999e-06 [cell_reuse_recompute_pass]: 1.69998e-06 [offload_activation]: 1.094e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.607e-05 [merge_recompute_call_nodes]: 1.66e-06 [before_grad]: 1.112e-05 [set_forward_comm_id_for_comm_node_pass]: 4.66002e-06 [meta_fg_expand]: 3.56999e-06 [flash_sp_send_recv_attached]: 3.21999e-06 [receive_attached]: 2.48e-06 [after_resolve]: 1.406e-05 [a_after_grad]: 1.208e-05 [renormalize]: 0.00090586 [add_forward_monad_depend]: 8.90999e-06 [auto_monad_grad]: 2.91e-06 [auto_monad_eliminator]: 1.972e-05 [cse]: 3.337e-05 [a_3]: 5.993e-05 [Cycle 2]: 0.00075286, [45] [expand_dump_flag]: 2.17999e-06 [switch_simplify]: 8.46002e-06 [loop_unroll]: 6.31e-06 [a_1]: 0.00014682 [with_stream_mark]: 1.852e-05 [recompute_prepare]: 8.21002e-06 [updatestate_depend_eliminate]: 3.43e-06 [updatestate_assign_eliminate]: 2.96001e-06 [updatestate_loads_eliminate]: 3.06999e-06 [parameter_eliminate]: 2.01e-06 [a_2]: 7.531e-05 [accelerated_algorithm]: 7.31999e-06 [shard]: 2.18002e-06 [meta_shard_fg_expand]: 2.30002e-06 [shard_inline]: 6.07999e-06 [merge_send_recv]: 7.25e-06 [auto_parallel]: 8.2e-06 [parallel]: 8.24998e-06 [flash_sp]: 4.48001e-06 [merge_comm]: 3.83001e-06 [allreduce_fusion]: 3.84002e-06 [matmul_add_comm_reduction]: 8.93002e-06 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 8.07e-06 [virtual_dataset]: 6.64999e-06 [get_grad_eliminate_]: 7.08998e-06 [virtual_output]: 6.47001e-06 [merge_forward]: 3.54002e-06 [cell_reuse_recompute_pass]: 2.44999e-06 [offload_activation]: 9.39e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.694e-05 [merge_recompute_call_nodes]: 1.22e-06 [before_grad]: 1.128e-05 [set_forward_comm_id_for_comm_node_pass]: 3.86999e-06 [meta_fg_expand]: 2.71e-06 [flash_sp_send_recv_attached]: 1.39e-06 [receive_attached]: 1.88997e-06 [after_resolve]: 1.545e-05 [a_after_grad]: 1.173e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.78998e-06 [auto_monad_grad]: 1.39e-06 [auto_monad_eliminator]: 1.116e-05 [cse]: 2.01e-05 [a_3]: 3.921e-05 [py_interpret_to_execute_after_opt_a]: 1.712e-05 [slice_cell_reuse_recomputed_activation]: 2.14999e-06 [rewriter_after_opt_a]: 4.447e-05 [convert_after_rewriter]: 8.05e-06 [order_py_execute_after_rewriter]: 5.46e-06 [mutable_eliminate]: 0.00077642 [opt_b]: 0.00028186, [1] [Cycle 1]: 0.00027294, [7] [b_1]: 0.00013863 [b_2]: 9.14e-06 [updatestate_depend_eliminate]: 1.022e-05 [updatestate_assign_eliminate]: 2.93e-06 [updatestate_loads_eliminate]: 2.74999e-06 [renormalize]: 1.25001e-06 [cse]: 2.997e-05 [optimize_parallel_all_gather_comm]: 2.392e-05 [overlap_param_gather]: 2.43998e-06 [cconv]: 3.622e-05 [loop_unroll]: 0.00061311 [opt_after_cconv]: 0.00012358, [1] [Cycle 1]: 0.00011582, [7] [c_1]: 3.407e-05 [parameter_eliminate]: 5.96e-06 [updatestate_depend_eliminate]: 7.59002e-06 [updatestate_assign_eliminate]: 2.62001e-06 [updatestate_loads_eliminate]: 2.51e-06 [cse]: 2.731e-05 [renormalize]: 7.7e-07 [remove_dup_value]: 1.575e-05 [tuple_transform]: 8.653e-05, [1] [Cycle 1]: 8.139e-05, [4] [d_1]: 5.267e-05 [none_parameter_eliminate]: 1.64998e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 7.51001e-06 [partial_unused_args_eliminate]: 1.95001e-06 [add_recomputation]: 5.783e-05 [cse_after_recomputation]: 2.494e-05, [1] [Cycle 1]: 1.916e-05, [1] [cse]: 1.261e-05 [environ_conv]: 5.94e-06 [swap_dp_allreduce_reducescatter]: 4.99003e-06 [bias_add_comm_swap]: 3.24001e-06 [label_micro_interleaved_index]: 5.58002e-06 [label_fine_grained_interleaved_index]: 2.72001e-06 [merge_cast_opt]: 1.48002e-06 [slice_recompute_activation]: 2.09e-06 [micro_interleaved_order_control]: 2.48e-06 [assign_add_opt]: 1.52001e-06 [ForceFp32Comm]: 9.00007e-07 [remove_cast_before_assign_add]: 1.17e-06 [full_micro_interleaved_order_control]: 2.06998e-06 [reorder_send_recv_between_fp_bp]: 2.72001e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.18001e-06 [interleave_parallel_branches]: 1.00999e-06 [overlap_opt_shard_in_pipeline]: 1.70001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.71e-06 [control_data_broadcast_order]: 1.456e-05 [grouped_pairwise_exchange_alltoall]: 1.67001e-06 [offloading_packed_experts]: 4.18001e-06 [overlap_recompute_and_grad_model_parallel]: 4.85001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.35002e-06 [overlap_grad_ring_attention]: 4.68001e-06 [overlap_grad_flash_sp]: 2.295e-05 [begin_end_overlap_inline]: 5.8001e-07 [split_matmul_comm_elemetwise]: 2.41998e-06 [split_layernorm_comm]: 1.86998e-06 [handle_group_info]: 1.33002e-06 [symbol_engine_optimizer]: 8.069e-05, [1] [Cycle 1]: 7.618e-05, [6] [build]: 3.8e-06 [elim_shapecalc]: 1.089e-05 [elim_not_effective]: 1.349e-05 [opt_reshape]: 7.77998e-06 [fold_const_symbol]: 1.013e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.54999e-06 [pipeline_parallel_scheduler]: 1.69e-06 [auto_monad_reorder]: 1.975e-05 [get_jit_bprop_graph]: 2.84999e-06 [rewriter_after_jit_bprop_graph]: 6.53e-06 [opt_after_jit_grad]: 0.00068347 [validate]: 4.946e-05 Sums bootstrap : 0.000492s : 3.54% type_inference : 0.007471s : 53.74% event_method : 0.000024s : 0.18% auto_monad : 0.000076s : 0.55% graph_reusing : 0.000007s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000046s : 0.33% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000036s : 0.26% optimize.rewriter_before_opt_a : 0.000099s : 0.71% optimize.opt_a.expand_dump_flag : 0.000006s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.38% optimize.opt_a.loop_unroll : 0.000037s : 0.27% optimize.opt_a.a_1 : 0.000897s : 6.45% optimize.opt_a.with_stream_mark : 0.000043s : 0.31% optimize.opt_a.recompute_prepare : 0.000019s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.04% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000160s : 1.15% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.11% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.09% optimize.opt_a.merge_send_recv : 0.000019s : 0.13% optimize.opt_a.auto_parallel : 0.000017s : 0.12% optimize.opt_a.parallel : 0.000029s : 0.21% optimize.opt_a.flash_sp : 0.000016s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000008s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.12% optimize.opt_a.virtual_dataset : 0.000014s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.10% optimize.opt_a.virtual_output : 0.000014s : 0.10% optimize.opt_a.merge_forward : 0.000007s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.24% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.16% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000030s : 0.21% optimize.opt_a.a_after_grad : 0.000024s : 0.17% optimize.opt_a.renormalize : 0.000906s : 6.52% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.22% optimize.opt_a.cse : 0.000053s : 0.38% optimize.opt_a.a_3 : 0.000099s : 0.71% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000044s : 0.32% optimize.convert_after_rewriter : 0.000008s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.04% optimize.mutable_eliminate : 0.000776s : 5.59% optimize.opt_b.b_1 : 0.000139s : 1.00% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000030s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000036s : 0.26% optimize.loop_unroll : 0.000613s : 4.41% optimize.opt_after_cconv.c_1 : 0.000034s : 0.25% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000027s : 0.20% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000016s : 0.11% optimize.tuple_transform.d_1 : 0.000053s : 0.38% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.05% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000058s : 0.42% optimize.cse_after_recomputation.cse : 0.000013s : 0.09% optimize.environ_conv : 0.000006s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.04% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000006s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000015s : 0.10% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.03% optimize.overlap_grad_flash_sp : 0.000023s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.10% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.07% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.14% get_jit_bprop_graph : 0.000003s : 0.02% rewriter_after_jit_bprop_graph : 0.000007s : 0.05% opt_after_jit_grad : 0.000683s : 4.92% validate : 0.000049s : 0.36% Time group info: ------[substitution.] 0.000228 28 1.03% : 0.000002s : 2: substitution.elim_not_effective 0.56% : 0.000001s : 2: substitution.fold_const_symbol 2.87% : 0.000007s : 4: substitution.graph_param_transform 81.41% : 0.000186s : 4: substitution.inline 2.02% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.37% : 0.000005s : 4: substitution.remove_not_recompute_node 2.96% : 0.000007s : 4: substitution.replace_old_param 6.79% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007380 2 87.74% : 0.006475s : 1: type_inference.infer 12.26% : 0.000905s : 1: type_inference.specialize ------[replace.] 0.000070 8 63.76% : 0.000045s : 4: replace.inline 36.24% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000196 8 93.24% : 0.000183s : 4: match.inline 6.76% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000239 1278 1.11% : 0.000003s : 13: predicate.accumulaten_eliminater 1.15% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.62% : 0.000001s : 8: predicate.addn_check_dump 1.01% : 0.000002s : 13: predicate.addn_zero_filter 0.75% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.97% : 0.000005s : 21: predicate.arithmetic_simplify 1.10% : 0.000003s : 13: predicate.cast_eliminate 0.62% : 0.000001s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.70% : 0.000002s : 8: predicate.depend_value_elim 0.96% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.01% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.81% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.10% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.42% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.05% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_depend_swap 1.90% : 0.000005s : 25: predicate.environ_get_eliminate 1.22% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.50% : 0.000004s : 21: predicate.exchange_switch_depend_value 2.26% : 0.000005s : 21: predicate.float_depend_g_call 0.54% : 0.000001s : 8: predicate.float_environ_get_switch 0.96% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 4: predicate.fold_const_symbol 0.74% : 0.000002s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.48% : 0.000001s : 8: predicate.incorporate_call 0.40% : 0.000001s : 8: predicate.incorporate_call_switch 5.88% : 0.000014s : 58: predicate.inline 0.78% : 0.000002s : 8: predicate.inline_without_move 0.27% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.91% : 0.000002s : 8: predicate.less_batch_normalization 1.79% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.37% : 0.000006s : 38: predicate.load_eliminater 1.23% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.16% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.53% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.47% : 0.000001s : 8: predicate.merge_addn 0.57% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 13: predicate.minmaximum_grad 1.14% : 0.000003s : 4: predicate.mutable_eliminate 0.47% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 1.72% : 0.000004s : 21: predicate.partial_defer_inline 1.44% : 0.000003s : 21: predicate.partial_eliminate 0.89% : 0.000002s : 13: predicate.print_const_string_wrapper 0.74% : 0.000002s : 8: predicate.reduce_all_const_elim 1.54% : 0.000004s : 13: predicate.reduce_eliminate 2.50% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 8: predicate.remove_not_recompute_node 1.32% : 0.000003s : 25: predicate.replace_applicator 0.42% : 0.000001s : 8: predicate.replace_old_param 0.31% : 0.000001s : 4: predicate.reset_defer_inline 0.95% : 0.000002s : 13: predicate.reshape_eliminate 0.68% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.54% : 0.000001s : 4: predicate.row_tensor_eliminate 0.87% : 0.000002s : 8: predicate.same_eliminate 0.68% : 0.000002s : 8: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 8: predicate.shard_identity_eliminate 0.67% : 0.000002s : 8: predicate.special_op_eliminate 0.59% : 0.000001s : 8: predicate.specialize_transform 1.22% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 1.03% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.44% : 0.000003s : 21: predicate.switch_defer_inline 1.86% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.79% : 0.000011s : 67: predicate.switch_simplify 0.97% : 0.000002s : 13: predicate.tile_eliminate 0.90% : 0.000002s : 13: predicate.transpose_eliminate 1.92% : 0.000005s : 21: predicate.tuple_list_convert_item_index_to_positive 1.47% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.33% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.51% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.79% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.19% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.99% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.67% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 8: predicate.virtual_output_eliminate 0.30% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000694 11 52.62% : 0.000365s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.38% : 0.000329s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.032067 192 0.01% : 0.000004s : 1: ForceFp32Comm 13.57% : 0.004350s : 1: add_attr 13.51% : 0.004333s : 1: add_attr_with_inline 0.02% : 0.000005s : 1: add_comm_op_reuse_tag 0.19% : 0.000062s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.26% : 0.000084s : 1: auto_monad 0.07% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.63% : 0.000524s : 1: bootstrap 0.12% : 0.000040s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.05% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.09% : 0.000028s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000010s : 1: environ_conv 0.10% : 0.000033s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000006s : 1: get_jit_bprop_graph 0.03% : 0.000011s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.95% : 0.000624s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.46% : 0.000788s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000021s : 1: opt.transform.mutable_eliminate 4.24% : 0.001358s : 78: opt.transform.opt_a 0.10% : 0.000033s : 1: opt.transform.opt_after_cconv 0.10% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.34% : 0.000111s : 28: opt.transform.opt_b 0.18% : 0.000058s : 2: opt.transform.opt_trans_graph 0.12% : 0.000038s : 4: opt.transform.symbol_engine_opt 10.15% : 0.003255s : 1: opt_a 0.40% : 0.000127s : 1: opt_after_cconv 2.18% : 0.000698s : 1: opt_after_jit_grad 0.89% : 0.000286s : 1: opt_b 18.54% : 0.005944s : 1: optimize 0.09% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.08% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.16% : 0.000051s : 1: pre_auto_parallel 0.13% : 0.000041s : 1: py_interpret_to_execute 0.07% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000019s : 1: remove_dup_value 1.48% : 0.000473s : 1: renormalize.infer 1.31% : 0.000421s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000050s : 1: rewriter_after_opt_a 0.32% : 0.000104s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.26% : 0.000083s : 1: symbol_engine_optimizer 0.28% : 0.000090s : 1: tuple_transform 23.39% : 0.007501s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:00.823.757 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:00.824.058 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0199507, [21] [bootstrap]: 0.00048583 [type_inference]: 0.00725052 [event_method]: 2.102e-05 [auto_monad]: 6.733e-05 [graph_reusing]: 5.59998e-06 [inline]: 3.08e-06 [add_attr]: 0.00352235, [1] [add_attr_with_inline]: 0.00351117, [1] [Cycle 1]: 9.137e-05, [2] [tag_attr]: 2.155e-05 [meta_addattr_fg_expand]: 6.24001e-06 [parallel-infer-symbol]: 4.26001e-06 [pre_auto_parallel]: 3.947e-05 [insert-virtual-dataset]: 2.44999e-06 [parallel-infer-symbol-second]: 1.07998e-06 [dataset_repeat_opt]: 2.24999e-06 [pipeline_split]: 2.12001e-06 [optimize]: 0.00711539, [53] [py_interpret_to_execute]: 3.331e-05 [rewriter_before_opt_a]: 9.698e-05 [opt_a]: 0.0041856, [2] [Cycle 1]: 0.00310103, [45] [expand_dump_flag]: 2.89001e-06 [switch_simplify]: 4.602e-05 [loop_unroll]: 3.198e-05 [a_1]: 0.00071428 [with_stream_mark]: 2.134e-05 [recompute_prepare]: 1.228e-05 [updatestate_depend_eliminate]: 5.35999e-06 [updatestate_assign_eliminate]: 4.84e-06 [updatestate_loads_eliminate]: 4.13001e-06 [parameter_eliminate]: 2.56e-06 [a_2]: 0.00048368 [accelerated_algorithm]: 2.209e-05 [shard]: 3.4e-06 [meta_shard_fg_expand]: 3.59002e-06 [shard_inline]: 9.21002e-06 [merge_send_recv]: 1.362e-05 [auto_parallel]: 1.305e-05 [parallel]: 2.176e-05 [flash_sp]: 1.24e-05 [merge_comm]: 5.84999e-06 [allreduce_fusion]: 4.53001e-06 [matmul_add_comm_reduction]: 1.162e-05 [allreduce_slice_to_reducescatter]: 8.80013e-07 [virtual_shard_identity]: 1.105e-05 [virtual_dataset]: 8.76002e-06 [get_grad_eliminate_]: 7.58001e-06 [virtual_output]: 8.31002e-06 [merge_forward]: 5.25001e-06 [cell_reuse_recompute_pass]: 1.84e-06 [offload_activation]: 1.135e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.322e-05 [merge_recompute_call_nodes]: 1.43002e-06 [before_grad]: 1.532e-05 [set_forward_comm_id_for_comm_node_pass]: 8.07998e-06 [meta_fg_expand]: 4.07e-06 [flash_sp_send_recv_attached]: 3.11001e-06 [receive_attached]: 2.04999e-06 [after_resolve]: 1.743e-05 [a_after_grad]: 1.237e-05 [renormalize]: 0.00090221 [add_forward_monad_depend]: 7.9e-06 [auto_monad_grad]: 2.74999e-06 [auto_monad_eliminator]: 2.19e-05 [cse]: 3.678e-05 [a_3]: 7.842e-05 [Cycle 2]: 0.00106736, [45] [expand_dump_flag]: 1.83002e-06 [switch_simplify]: 1.016e-05 [loop_unroll]: 7.66999e-06 [a_1]: 0.00018274 [with_stream_mark]: 1.802e-05 [recompute_prepare]: 9.71e-06 [updatestate_depend_eliminate]: 5.42001e-06 [updatestate_assign_eliminate]: 3.84002e-06 [updatestate_loads_eliminate]: 4.29997e-06 [parameter_eliminate]: 1.96998e-06 [a_2]: 0.00012363 [accelerated_algorithm]: 9.52999e-06 [shard]: 2.55002e-06 [meta_shard_fg_expand]: 2.35002e-06 [shard_inline]: 1.137e-05 [merge_send_recv]: 9.36998e-06 [auto_parallel]: 9.85002e-06 [parallel]: 7.68001e-06 [flash_sp]: 4.27e-06 [merge_comm]: 5.86998e-06 [allreduce_fusion]: 4.38001e-06 [matmul_add_comm_reduction]: 9.48002e-06 [allreduce_slice_to_reducescatter]: 5.50004e-07 [virtual_shard_identity]: 1.385e-05 [virtual_dataset]: 8.08001e-06 [get_grad_eliminate_]: 8.15999e-06 [virtual_output]: 7.15e-06 [merge_forward]: 4.77e-06 [cell_reuse_recompute_pass]: 2.38998e-06 [offload_activation]: 1.126e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.161e-05 [merge_recompute_call_nodes]: 1.46002e-06 [before_grad]: 1.303e-05 [set_forward_comm_id_for_comm_node_pass]: 5.82001e-06 [meta_fg_expand]: 2.92002e-06 [flash_sp_send_recv_attached]: 2.28002e-06 [receive_attached]: 1.84998e-06 [after_resolve]: 1.371e-05 [a_after_grad]: 1.244e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.72001e-06 [auto_monad_grad]: 1.91003e-06 [auto_monad_eliminator]: 1.412e-05 [cse]: 2.75e-05 [a_3]: 6.255e-05 [py_interpret_to_execute_after_opt_a]: 2.272e-05 [slice_cell_reuse_recomputed_activation]: 4.97999e-06 [rewriter_after_opt_a]: 6.123e-05 [convert_after_rewriter]: 1.179e-05 [order_py_execute_after_rewriter]: 9.21002e-06 [mutable_eliminate]: 0.00072905 [opt_b]: 0.00036556, [1] [Cycle 1]: 0.00035274, [7] [b_1]: 0.00021972 [b_2]: 1.065e-05 [updatestate_depend_eliminate]: 1.088e-05 [updatestate_assign_eliminate]: 3.67002e-06 [updatestate_loads_eliminate]: 3.23e-06 [renormalize]: 9.29984e-07 [cse]: 3.745e-05 [optimize_parallel_all_gather_comm]: 2.727e-05 [overlap_param_gather]: 5.97999e-06 [cconv]: 3.915e-05 [loop_unroll]: 0.00052321 [opt_after_cconv]: 0.00016328, [1] [Cycle 1]: 0.00015309, [7] [c_1]: 4.141e-05 [parameter_eliminate]: 5.34e-06 [updatestate_depend_eliminate]: 8.55999e-06 [updatestate_assign_eliminate]: 3.45e-06 [updatestate_loads_eliminate]: 3.49001e-06 [cse]: 3.087e-05 [renormalize]: 6.40022e-07 [remove_dup_value]: 2.082e-05 [tuple_transform]: 0.00011313, [1] [Cycle 1]: 0.00010512, [4] [d_1]: 5.955e-05 [none_parameter_eliminate]: 1.94999e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 8.40999e-06 [partial_unused_args_eliminate]: 5.55001e-06 [add_recomputation]: 6.707e-05 [cse_after_recomputation]: 3.796e-05, [1] [Cycle 1]: 2.971e-05, [1] [cse]: 1.802e-05 [environ_conv]: 1.093e-05 [swap_dp_allreduce_reducescatter]: 9.47999e-06 [bias_add_comm_swap]: 6.20002e-06 [label_micro_interleaved_index]: 8.66002e-06 [label_fine_grained_interleaved_index]: 5.42999e-06 [merge_cast_opt]: 4.67e-06 [slice_recompute_activation]: 4.77998e-06 [micro_interleaved_order_control]: 4.66002e-06 [assign_add_opt]: 3.95998e-06 [ForceFp32Comm]: 3.20002e-06 [remove_cast_before_assign_add]: 3.51001e-06 [full_micro_interleaved_order_control]: 4.85999e-06 [reorder_send_recv_between_fp_bp]: 5.51998e-06 [comm_op_add_attrs]: 3.88001e-06 [add_comm_op_reuse_tag]: 3.50998e-06 [interleave_split_concat_branches]: 3.75998e-06 [interleave_parallel_branches]: 3.81999e-06 [overlap_opt_shard_in_pipeline]: 3.63999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.19002e-06 [control_data_broadcast_order]: 2.166e-05 [grouped_pairwise_exchange_alltoall]: 4.02e-06 [offloading_packed_experts]: 7.07002e-06 [overlap_recompute_and_grad_model_parallel]: 8.03999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.9e-06 [overlap_recompute_allgather_and_fa_grad]: 4.40999e-06 [overlap_recompute_comm]: 5.62999e-06 [overlap_grad_ring_attention]: 8.84e-06 [overlap_grad_flash_sp]: 2.869e-05 [begin_end_overlap_inline]: 3.2e-06 [split_matmul_comm_elemetwise]: 4.44002e-06 [split_layernorm_comm]: 4.28001e-06 [handle_group_info]: 3.97998e-06 [symbol_engine_optimizer]: 0.00011986, [1] [Cycle 1]: 0.00011101, [6] [build]: 5.72001e-06 [elim_shapecalc]: 1.555e-05 [elim_not_effective]: 1.696e-05 [opt_reshape]: 8.65999e-06 [fold_const_symbol]: 1.223e-05 [renormalize]: 2.3999e-07 [detach_backward]: 5.05001e-06 [pipeline_parallel_scheduler]: 2.06e-06 [auto_monad_reorder]: 2.754e-05 [get_jit_bprop_graph]: 1.96998e-06 [rewriter_after_jit_bprop_graph]: 6.23002e-06 [opt_after_jit_grad]: 0.00061209 [validate]: 5.101e-05 Sums bootstrap : 0.000486s : 3.39% type_inference : 0.007251s : 50.56% event_method : 0.000021s : 0.15% auto_monad : 0.000067s : 0.47% graph_reusing : 0.000006s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000039s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.23% optimize.rewriter_before_opt_a : 0.000097s : 0.68% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000056s : 0.39% optimize.opt_a.loop_unroll : 0.000040s : 0.28% optimize.opt_a.a_1 : 0.000897s : 6.26% optimize.opt_a.with_stream_mark : 0.000039s : 0.27% optimize.opt_a.recompute_prepare : 0.000022s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000005s : 0.03% optimize.opt_a.a_2 : 0.000607s : 4.23% optimize.opt_a.accelerated_algorithm : 0.000032s : 0.22% optimize.opt_a.shard : 0.000006s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.04% optimize.opt_a.shard_inline : 0.000021s : 0.14% optimize.opt_a.merge_send_recv : 0.000023s : 0.16% optimize.opt_a.auto_parallel : 0.000023s : 0.16% optimize.opt_a.parallel : 0.000029s : 0.21% optimize.opt_a.flash_sp : 0.000017s : 0.12% optimize.opt_a.merge_comm : 0.000012s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000025s : 0.17% optimize.opt_a.virtual_dataset : 0.000017s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.11% optimize.opt_a.virtual_output : 0.000015s : 0.11% optimize.opt_a.merge_forward : 0.000010s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000023s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000045s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000028s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000014s : 0.10% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000031s : 0.22% optimize.opt_a.a_after_grad : 0.000025s : 0.17% optimize.opt_a.renormalize : 0.000902s : 6.29% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000036s : 0.25% optimize.opt_a.cse : 0.000064s : 0.45% optimize.opt_a.a_3 : 0.000141s : 0.98% optimize.py_interpret_to_execute_after_opt_a : 0.000023s : 0.16% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.03% optimize.rewriter_after_opt_a : 0.000061s : 0.43% optimize.convert_after_rewriter : 0.000012s : 0.08% optimize.order_py_execute_after_rewriter : 0.000009s : 0.06% optimize.mutable_eliminate : 0.000729s : 5.08% optimize.opt_b.b_1 : 0.000220s : 1.53% optimize.opt_b.b_2 : 0.000011s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000037s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000027s : 0.19% optimize.overlap_param_gather : 0.000006s : 0.04% optimize.cconv : 0.000039s : 0.27% optimize.loop_unroll : 0.000523s : 3.65% optimize.opt_after_cconv.c_1 : 0.000041s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000031s : 0.22% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000021s : 0.15% optimize.tuple_transform.d_1 : 0.000060s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.06% optimize.partial_unused_args_eliminate : 0.000006s : 0.04% optimize.add_recomputation : 0.000067s : 0.47% optimize.cse_after_recomputation.cse : 0.000018s : 0.13% optimize.environ_conv : 0.000011s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.04% optimize.label_micro_interleaved_index : 0.000009s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000005s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.03% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000004s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000022s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000006s : 0.04% optimize.overlap_grad_ring_attention : 0.000009s : 0.06% optimize.overlap_grad_flash_sp : 0.000029s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.03% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000006s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000028s : 0.19% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.04% opt_after_jit_grad : 0.000612s : 4.27% validate : 0.000051s : 0.36% Time group info: ------[substitution.] 0.000239 38 13.22% : 0.000032s : 3: substitution.cast_eliminate 1.03% : 0.000002s : 3: substitution.elim_not_effective 0.69% : 0.000002s : 3: substitution.fold_const_symbol 3.14% : 0.000007s : 5: substitution.graph_param_transform 68.59% : 0.000164s : 4: substitution.inline 2.46% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.94% : 0.000007s : 6: substitution.remove_not_recompute_node 2.43% : 0.000006s : 4: substitution.replace_old_param 5.51% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007184 2 88.71% : 0.006373s : 1: type_inference.infer 11.29% : 0.000811s : 1: type_inference.specialize ------[replace.] 0.000069 8 61.27% : 0.000042s : 4: replace.inline 38.73% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000172 8 93.42% : 0.000161s : 4: match.inline 6.58% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000571 1504 0.37% : 0.000002s : 15: predicate.accumulaten_eliminater 0.41% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.26% : 0.000001s : 10: predicate.addn_check_dump 0.40% : 0.000002s : 15: predicate.addn_zero_filter 0.34% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 0.79% : 0.000005s : 25: predicate.arithmetic_simplify 0.43% : 0.000002s : 15: predicate.cast_eliminate 0.32% : 0.000002s : 10: predicate.check_bprop_eliminate 0.25% : 0.000001s : 10: predicate.compare_switch_simplify 0.09% : 0.000001s : 5: predicate.const_output_eliminate 0.29% : 0.000002s : 10: predicate.depend_value_elim 0.38% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.41% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.35% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.48% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.07% : 0.000000s : 5: predicate.elim_not_effective 0.26% : 0.000002s : 5: predicate.elim_shapecalc_of_broadcastargs 0.49% : 0.000003s : 20: predicate.environ_add_const_eliminate 0.46% : 0.000003s : 20: predicate.environ_get_add_eliminate 0.47% : 0.000003s : 20: predicate.environ_get_depend_swap 0.96% : 0.000005s : 30: predicate.environ_get_eliminate 0.47% : 0.000003s : 20: predicate.environ_get_set_eliminate 0.57% : 0.000003s : 23: predicate.exchange_switch_depend_value 0.97% : 0.000006s : 23: predicate.float_depend_g_call 0.24% : 0.000001s : 10: predicate.float_environ_get_switch 0.37% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.08% : 0.000000s : 5: predicate.fold_const_symbol 0.29% : 0.000002s : 10: predicate.get_grad_eliminate 0.11% : 0.000001s : 5: predicate.graph_param_transform 0.28% : 0.000002s : 10: predicate.incorporate_call 55.98% : 0.000320s : 10: predicate.incorporate_call_switch 2.86% : 0.000016s : 68: predicate.inline 0.38% : 0.000002s : 10: predicate.inline_without_move 0.14% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.50% : 0.000003s : 10: predicate.less_batch_normalization 0.77% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 1.08% : 0.000006s : 44: predicate.load_eliminater 0.47% : 0.000003s : 5: predicate.loop_unroll_after_grad 0.89% : 0.000005s : 36: predicate.loop_unroll_before_grad 0.71% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.28% : 0.000002s : 10: predicate.merge_addn 0.27% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.28% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.35% : 0.000002s : 15: predicate.minmaximum_grad 0.51% : 0.000003s : 5: predicate.mutable_eliminate 0.17% : 0.000001s : 5: predicate.opt_reshape 0.16% : 0.000001s : 5: predicate.parallel_virtual_node 0.80% : 0.000005s : 23: predicate.partial_defer_inline 0.72% : 0.000004s : 24: predicate.partial_eliminate 0.36% : 0.000002s : 15: predicate.print_const_string_wrapper 0.25% : 0.000001s : 10: predicate.reduce_all_const_elim 0.48% : 0.000003s : 15: predicate.reduce_eliminate 1.07% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.22% : 0.000001s : 10: predicate.remove_not_recompute_node 0.57% : 0.000003s : 29: predicate.replace_applicator 0.25% : 0.000001s : 10: predicate.replace_old_param 0.14% : 0.000001s : 5: predicate.reset_defer_inline 0.40% : 0.000002s : 15: predicate.reshape_eliminate 0.25% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.29% : 0.000002s : 5: predicate.row_tensor_eliminate 0.34% : 0.000002s : 10: predicate.same_eliminate 0.18% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.47% : 0.000003s : 10: predicate.shard_identity_eliminate 0.42% : 0.000002s : 10: predicate.special_op_eliminate 0.48% : 0.000003s : 10: predicate.specialize_transform 0.54% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.36% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.17% : 0.000001s : 5: predicate.switch_call_monad_eliminater 0.59% : 0.000003s : 23: predicate.switch_defer_inline 0.85% : 0.000005s : 33: predicate.switch_layer_defer_inline 2.23% : 0.000013s : 74: predicate.switch_simplify 0.39% : 0.000002s : 15: predicate.tile_eliminate 0.37% : 0.000002s : 15: predicate.transpose_eliminate 0.67% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 0.67% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 0.62% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 1.34% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 0.60% : 0.000003s : 25: predicate.tuple_list_get_set_item_eliminator 1.01% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 0.79% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 1.03% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 1.40% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.23% : 0.000001s : 5: predicate.value_based_eliminate 0.33% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.28% : 0.000002s : 10: predicate.virtual_output_eliminate 0.14% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.24% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000671 11 55.64% : 0.000373s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.36% : 0.000298s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.033521 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.54% : 0.003532s : 1: add_attr 10.49% : 0.003516s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000073s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.23% : 0.000078s : 1: auto_monad 0.11% : 0.000035s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.60% : 0.000535s : 1: bootstrap 0.13% : 0.000044s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000026s : 1: control_data_broadcast_order 0.05% : 0.000016s : 1: convert_after_rewriter 0.12% : 0.000041s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.09% : 0.000029s : 1: detach_backward 0.04% : 0.000014s : 1: environ_conv 0.10% : 0.000034s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.03% : 0.000011s : 1: label_micro_interleaved_index 1.58% : 0.000531s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 2.24% : 0.000751s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000024s : 1: opt.transform.mutable_eliminate 5.40% : 0.001810s : 78: opt.transform.opt_a 0.12% : 0.000040s : 1: opt.transform.opt_after_cconv 0.11% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.45% : 0.000149s : 28: opt.transform.opt_b 0.19% : 0.000065s : 2: opt.transform.opt_trans_graph 0.15% : 0.000049s : 4: opt.transform.symbol_engine_opt 12.50% : 0.004189s : 1: opt_a 0.50% : 0.000167s : 1: opt_after_cconv 1.87% : 0.000625s : 1: opt_after_jit_grad 1.11% : 0.000371s : 1: opt_b 22.53% : 0.007554s : 1: optimize 0.10% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000034s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000013s : 1: overlap_grad_ring_attention 0.02% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000010s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000009s : 1: overlap_recompute_comm 0.03% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000009s : 1: partial_unused_args_eliminate 0.03% : 0.000011s : 1: pipeline_parallel_scheduler 0.02% : 0.000008s : 1: pipeline_split 0.14% : 0.000047s : 1: pre_auto_parallel 0.11% : 0.000038s : 1: py_interpret_to_execute 0.08% : 0.000027s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.07% : 0.000025s : 1: remove_dup_value 1.54% : 0.000516s : 1: renormalize.infer 1.12% : 0.000375s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000068s : 1: rewriter_after_opt_a 0.30% : 0.000102s : 1: rewriter_before_opt_a 0.03% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.37% : 0.000123s : 1: symbol_engine_optimizer 0.35% : 0.000117s : 1: tuple_transform 21.77% : 0.007296s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:01.806.86 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0162482, [21] [bootstrap]: 0.00047361 [type_inference]: 0.00612409 [event_method]: 2.116e-05 [auto_monad]: 6.485e-05 [graph_reusing]: 5.66e-06 [inline]: 1.95001e-06 [add_attr]: 0.00327751, [1] [add_attr_with_inline]: 0.00326711, [1] [Cycle 1]: 6.86e-05, [2] [tag_attr]: 2.173e-05 [meta_addattr_fg_expand]: 6.01998e-06 [parallel-infer-symbol]: 3.44001e-06 [pre_auto_parallel]: 3.779e-05 [insert-virtual-dataset]: 2.48e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.17001e-06 [pipeline_split]: 1.86998e-06 [optimize]: 0.005499, [53] [py_interpret_to_execute]: 2.821e-05 [rewriter_before_opt_a]: 0.00011816 [opt_a]: 0.0031186, [2] [Cycle 1]: 0.00232702, [45] [expand_dump_flag]: 3.34001e-06 [switch_simplify]: 4.373e-05 [loop_unroll]: 3.085e-05 [a_1]: 0.00067879 [with_stream_mark]: 1.834e-05 [recompute_prepare]: 1.103e-05 [updatestate_depend_eliminate]: 4.74998e-06 [updatestate_assign_eliminate]: 3.9e-06 [updatestate_loads_eliminate]: 3.6e-06 [parameter_eliminate]: 2.21998e-06 [a_2]: 0.00010276 [accelerated_algorithm]: 8.23999e-06 [shard]: 1.74e-06 [meta_shard_fg_expand]: 2.26e-06 [shard_inline]: 8.08001e-06 [merge_send_recv]: 9.98998e-06 [auto_parallel]: 7.64002e-06 [parallel]: 1.969e-05 [flash_sp]: 8.40999e-06 [merge_comm]: 4.94998e-06 [allreduce_fusion]: 4.60001e-06 [matmul_add_comm_reduction]: 1.098e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 1.03e-05 [virtual_dataset]: 8.77999e-06 [get_grad_eliminate_]: 7.66999e-06 [virtual_output]: 7.95998e-06 [merge_forward]: 4.72e-06 [cell_reuse_recompute_pass]: 1.17e-06 [offload_activation]: 1.204e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.563e-05 [merge_recompute_call_nodes]: 1.67999e-06 [before_grad]: 1.344e-05 [set_forward_comm_id_for_comm_node_pass]: 4.49998e-06 [meta_fg_expand]: 3.39001e-06 [flash_sp_send_recv_attached]: 2.37999e-06 [receive_attached]: 2.14e-06 [after_resolve]: 1.369e-05 [a_after_grad]: 1.207e-05 [renormalize]: 0.00083021 [add_forward_monad_depend]: 6.44001e-06 [auto_monad_grad]: 2.67001e-06 [auto_monad_eliminator]: 1.936e-05 [cse]: 3.834e-05 [a_3]: 6.24e-05 [Cycle 2]: 0.0007806, [45] [expand_dump_flag]: 1.82999e-06 [switch_simplify]: 9.02999e-06 [loop_unroll]: 7.51999e-06 [a_1]: 0.00018084 [with_stream_mark]: 1.352e-05 [recompute_prepare]: 8.21002e-06 [updatestate_depend_eliminate]: 3.98999e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 2.73e-06 [parameter_eliminate]: 1.25999e-06 [a_2]: 9.389e-05 [accelerated_algorithm]: 7.53999e-06 [shard]: 1.54e-06 [meta_shard_fg_expand]: 1.74e-06 [shard_inline]: 7.26001e-06 [merge_send_recv]: 6.53e-06 [auto_parallel]: 6.59999e-06 [parallel]: 5.11002e-06 [flash_sp]: 4.92e-06 [merge_comm]: 4.17e-06 [allreduce_fusion]: 4.22e-06 [matmul_add_comm_reduction]: 7.23999e-06 [allreduce_slice_to_reducescatter]: 3.60014e-07 [virtual_shard_identity]: 8.45001e-06 [virtual_dataset]: 7.54002e-06 [get_grad_eliminate_]: 7.71001e-06 [virtual_output]: 7.25e-06 [merge_forward]: 3.38e-06 [cell_reuse_recompute_pass]: 1.98002e-06 [offload_activation]: 8.77e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.424e-05 [merge_recompute_call_nodes]: 9.40025e-07 [before_grad]: 1.176e-05 [set_forward_comm_id_for_comm_node_pass]: 4.42e-06 [meta_fg_expand]: 3.08e-06 [flash_sp_send_recv_attached]: 1.17999e-06 [receive_attached]: 1.69e-06 [after_resolve]: 1.181e-05 [a_after_grad]: 1.114e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.57999e-06 [auto_monad_grad]: 1.01002e-06 [auto_monad_eliminator]: 1.016e-05 [cse]: 1.969e-05 [a_3]: 4.528e-05 [py_interpret_to_execute_after_opt_a]: 1.41e-05 [slice_cell_reuse_recomputed_activation]: 1.82001e-06 [rewriter_after_opt_a]: 4.198e-05 [convert_after_rewriter]: 7.65e-06 [order_py_execute_after_rewriter]: 5.82001e-06 [mutable_eliminate]: 0.00063108 [opt_b]: 0.00029599, [1] [Cycle 1]: 0.0002888, [7] [b_1]: 0.00019769 [b_2]: 1.051e-05 [updatestate_depend_eliminate]: 8.84998e-06 [updatestate_assign_eliminate]: 3.31001e-06 [updatestate_loads_eliminate]: 3.14001e-06 [renormalize]: 7.99977e-07 [cse]: 2.722e-05 [optimize_parallel_all_gather_comm]: 2e-05 [overlap_param_gather]: 1.85001e-06 [cconv]: 3.111e-05 [loop_unroll]: 0.00045115 [opt_after_cconv]: 0.00011998, [1] [Cycle 1]: 0.00011368, [7] [c_1]: 3.773e-05 [parameter_eliminate]: 3.86999e-06 [updatestate_depend_eliminate]: 6.84999e-06 [updatestate_assign_eliminate]: 3.36999e-06 [updatestate_loads_eliminate]: 2.76e-06 [cse]: 2.409e-05 [renormalize]: 6.39993e-07 [remove_dup_value]: 1.575e-05 [tuple_transform]: 8.734e-05, [1] [Cycle 1]: 8.291e-05, [4] [d_1]: 5.379e-05 [none_parameter_eliminate]: 1.62001e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 8.17e-06 [partial_unused_args_eliminate]: 1.75001e-06 [add_recomputation]: 5.979e-05 [cse_after_recomputation]: 2.729e-05, [1] [Cycle 1]: 2.257e-05, [1] [cse]: 1.643e-05 [environ_conv]: 6.91999e-06 [swap_dp_allreduce_reducescatter]: 5.95002e-06 [bias_add_comm_swap]: 2.59999e-06 [label_micro_interleaved_index]: 4.97e-06 [label_fine_grained_interleaved_index]: 2.62001e-06 [merge_cast_opt]: 1.27999e-06 [slice_recompute_activation]: 1.99999e-06 [micro_interleaved_order_control]: 2.19001e-06 [assign_add_opt]: 1.49998e-06 [ForceFp32Comm]: 1.04e-06 [remove_cast_before_assign_add]: 1.04e-06 [full_micro_interleaved_order_control]: 2.01998e-06 [reorder_send_recv_between_fp_bp]: 2.70002e-06 [comm_op_add_attrs]: 9.90025e-07 [add_comm_op_reuse_tag]: 9.79984e-07 [interleave_split_concat_branches]: 1.02998e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.27e-06 [overlap_opt_shard_grad_in_pipeline]: 1.66e-06 [control_data_broadcast_order]: 1.581e-05 [grouped_pairwise_exchange_alltoall]: 1.55001e-06 [offloading_packed_experts]: 4.74e-06 [overlap_recompute_and_grad_model_parallel]: 5.10001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.35001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.14999e-06 [overlap_grad_ring_attention]: 4.48001e-06 [overlap_grad_flash_sp]: 2.476e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 2.24001e-06 [split_layernorm_comm]: 1.59e-06 [handle_group_info]: 9.69972e-07 [symbol_engine_optimizer]: 8.565e-05, [1] [Cycle 1]: 8.105e-05, [6] [build]: 3.80998e-06 [elim_shapecalc]: 1.117e-05 [elim_not_effective]: 1.524e-05 [opt_reshape]: 8.28999e-06 [fold_const_symbol]: 1.251e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.36e-06 [pipeline_parallel_scheduler]: 1.53002e-06 [auto_monad_reorder]: 1.987e-05 [get_jit_bprop_graph]: 1.86003e-06 [rewriter_after_jit_bprop_graph]: 4.82e-06 [opt_after_jit_grad]: 0.00049083 [validate]: 4.665e-05 Sums bootstrap : 0.000474s : 3.95% type_inference : 0.006124s : 51.13% event_method : 0.000021s : 0.18% auto_monad : 0.000065s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000038s : 0.32% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000028s : 0.24% optimize.rewriter_before_opt_a : 0.000118s : 0.99% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.44% optimize.opt_a.loop_unroll : 0.000038s : 0.32% optimize.opt_a.a_1 : 0.000860s : 7.18% optimize.opt_a.with_stream_mark : 0.000032s : 0.27% optimize.opt_a.recompute_prepare : 0.000019s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000197s : 1.64% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.13% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.21% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.16% optimize.opt_a.virtual_dataset : 0.000016s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.21% optimize.opt_a.a_after_grad : 0.000023s : 0.19% optimize.opt_a.renormalize : 0.000830s : 6.93% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.25% optimize.opt_a.cse : 0.000058s : 0.48% optimize.opt_a.a_3 : 0.000108s : 0.90% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000042s : 0.35% optimize.convert_after_rewriter : 0.000008s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000631s : 5.27% optimize.opt_b.b_1 : 0.000198s : 1.65% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000027s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000031s : 0.26% optimize.loop_unroll : 0.000451s : 3.77% optimize.opt_after_cconv.c_1 : 0.000038s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000024s : 0.20% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000016s : 0.13% optimize.tuple_transform.d_1 : 0.000054s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000060s : 0.50% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000025s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000491s : 4.10% validate : 0.000047s : 0.39% Time group info: ------[substitution.] 0.000217 38 12.28% : 0.000027s : 3: substitution.cast_eliminate 1.07% : 0.000002s : 3: substitution.elim_not_effective 0.72% : 0.000002s : 3: substitution.fold_const_symbol 3.11% : 0.000007s : 5: substitution.graph_param_transform 69.68% : 0.000151s : 4: substitution.inline 2.19% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.79% : 0.000006s : 6: substitution.remove_not_recompute_node 2.41% : 0.000005s : 4: substitution.replace_old_param 5.76% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006055 2 87.39% : 0.005291s : 1: type_inference.infer 12.61% : 0.000763s : 1: type_inference.specialize ------[replace.] 0.000062 8 61.80% : 0.000038s : 4: replace.inline 38.20% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000159 8 93.28% : 0.000148s : 4: match.inline 6.72% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000241 1504 0.86% : 0.000002s : 15: predicate.accumulaten_eliminater 0.76% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.60% : 0.000001s : 10: predicate.addn_check_dump 0.93% : 0.000002s : 15: predicate.addn_zero_filter 0.80% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.99% : 0.000005s : 25: predicate.arithmetic_simplify 1.04% : 0.000002s : 15: predicate.cast_eliminate 0.69% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.63% : 0.000002s : 10: predicate.depend_value_elim 0.99% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.02% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.95% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 5: predicate.elim_not_effective 0.44% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.17% : 0.000003s : 20: predicate.environ_get_depend_swap 1.93% : 0.000005s : 30: predicate.environ_get_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.33% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.14% : 0.000005s : 23: predicate.float_depend_g_call 0.63% : 0.000002s : 10: predicate.float_environ_get_switch 0.83% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 5: predicate.fold_const_symbol 0.68% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.64% : 0.000002s : 10: predicate.incorporate_call 0.57% : 0.000001s : 10: predicate.incorporate_call_switch 6.36% : 0.000015s : 68: predicate.inline 0.84% : 0.000002s : 10: predicate.inline_without_move 0.34% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 10: predicate.less_batch_normalization 1.90% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.63% : 0.000006s : 44: predicate.load_eliminater 1.04% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.22% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.66% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.65% : 0.000002s : 10: predicate.merge_addn 0.67% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 15: predicate.minmaximum_grad 1.26% : 0.000003s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.46% : 0.000001s : 5: predicate.parallel_virtual_node 1.67% : 0.000004s : 23: predicate.partial_defer_inline 1.63% : 0.000004s : 24: predicate.partial_eliminate 0.83% : 0.000002s : 15: predicate.print_const_string_wrapper 0.66% : 0.000002s : 10: predicate.reduce_all_const_elim 1.19% : 0.000003s : 15: predicate.reduce_eliminate 2.48% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 10: predicate.remove_not_recompute_node 1.40% : 0.000003s : 29: predicate.replace_applicator 0.51% : 0.000001s : 10: predicate.replace_old_param 0.26% : 0.000001s : 5: predicate.reset_defer_inline 1.01% : 0.000002s : 15: predicate.reshape_eliminate 0.64% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 5: predicate.row_tensor_eliminate 0.97% : 0.000002s : 10: predicate.same_eliminate 0.43% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.87% : 0.000002s : 10: predicate.shard_identity_eliminate 0.78% : 0.000002s : 10: predicate.special_op_eliminate 0.78% : 0.000002s : 10: predicate.specialize_transform 0.97% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.45% : 0.000003s : 23: predicate.switch_defer_inline 1.98% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.73% : 0.000011s : 74: predicate.switch_simplify 0.85% : 0.000002s : 15: predicate.tile_eliminate 0.86% : 0.000002s : 15: predicate.transpose_eliminate 1.51% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.38% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.37% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.43% : 0.000003s : 25: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.71% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.39% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.01% : 0.000007s : 54: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 5: predicate.value_based_eliminate 0.84% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 10: predicate.virtual_output_eliminate 0.30% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000587 11 51.18% : 0.000300s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.82% : 0.000287s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027476 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.95% : 0.003283s : 1: add_attr 11.91% : 0.003271s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000064s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000071s : 1: auto_monad 0.09% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.84% : 0.000506s : 1: bootstrap 0.13% : 0.000034s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000030s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.10% : 0.000029s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.67% : 0.000460s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.33% : 0.000641s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000021s : 1: opt.transform.mutable_eliminate 5.03% : 0.001382s : 78: opt.transform.opt_a 0.13% : 0.000036s : 1: opt.transform.opt_after_cconv 0.11% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.64% : 0.000176s : 28: opt.transform.opt_b 0.22% : 0.000060s : 2: opt.transform.opt_trans_graph 0.16% : 0.000043s : 4: opt.transform.symbol_engine_opt 11.36% : 0.003122s : 1: opt_a 0.45% : 0.000124s : 1: opt_after_cconv 1.82% : 0.000501s : 1: opt_after_jit_grad 1.09% : 0.000300s : 1: opt_b 20.03% : 0.005504s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000042s : 1: pre_auto_parallel 0.12% : 0.000033s : 1: py_interpret_to_execute 0.06% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.73% : 0.000474s : 1: renormalize.infer 1.26% : 0.000347s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000046s : 1: rewriter_after_opt_a 0.45% : 0.000123s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000088s : 1: symbol_engine_optimizer 0.33% : 0.000090s : 1: tuple_transform 22.36% : 0.006144s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:01.487.739 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:01.488.046 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0204892, [21] [bootstrap]: 0.00045671 [type_inference]: 0.0068398 [event_method]: 2.316e-05 [auto_monad]: 7.115e-05 [graph_reusing]: 6.41e-06 [inline]: 2.66999e-06 [add_attr]: 0.00385639, [1] [add_attr_with_inline]: 0.00384407, [1] [Cycle 1]: 9.483e-05, [2] [tag_attr]: 2.508e-05 [meta_addattr_fg_expand]: 6.59001e-06 [parallel-infer-symbol]: 3.08e-06 [pre_auto_parallel]: 4.245e-05 [insert-virtual-dataset]: 2.51e-06 [parallel-infer-symbol-second]: 1.00999e-06 [dataset_repeat_opt]: 2.27001e-06 [pipeline_split]: 1.78002e-06 [optimize]: 0.00736404, [53] [py_interpret_to_execute]: 3.832e-05 [rewriter_before_opt_a]: 0.00010841 [opt_a]: 0.00413908, [2] [Cycle 1]: 0.00299488, [45] [expand_dump_flag]: 3.45e-06 [switch_simplify]: 4.586e-05 [loop_unroll]: 3.375e-05 [a_1]: 0.00077601 [with_stream_mark]: 2.315e-05 [recompute_prepare]: 1.295e-05 [updatestate_depend_eliminate]: 5.84e-06 [updatestate_assign_eliminate]: 4.92e-06 [updatestate_loads_eliminate]: 4.72998e-06 [parameter_eliminate]: 2.26e-06 [a_2]: 0.00015755 [accelerated_algorithm]: 1.18e-05 [shard]: 1.83002e-06 [meta_shard_fg_expand]: 3.04001e-06 [shard_inline]: 1.075e-05 [merge_send_recv]: 1.227e-05 [auto_parallel]: 9.89001e-06 [parallel]: 2.056e-05 [flash_sp]: 1.075e-05 [merge_comm]: 5.79e-06 [allreduce_fusion]: 4.94e-06 [matmul_add_comm_reduction]: 1.202e-05 [allreduce_slice_to_reducescatter]: 6.79982e-07 [virtual_shard_identity]: 1.186e-05 [virtual_dataset]: 9.54e-06 [get_grad_eliminate_]: 8.94e-06 [virtual_output]: 9.49e-06 [merge_forward]: 5.25999e-06 [cell_reuse_recompute_pass]: 2.22999e-06 [offload_activation]: 1.291e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.348e-05 [merge_recompute_call_nodes]: 2.01e-06 [before_grad]: 1.623e-05 [set_forward_comm_id_for_comm_node_pass]: 5.34e-06 [meta_fg_expand]: 4.43001e-06 [flash_sp_send_recv_attached]: 3.55e-06 [receive_attached]: 2.12999e-06 [after_resolve]: 1.465e-05 [a_after_grad]: 1.43e-05 [renormalize]: 0.00106206 [add_forward_monad_depend]: 8.85001e-06 [auto_monad_grad]: 2.78e-06 [auto_monad_eliminator]: 2.289e-05 [cse]: 4.757e-05 [a_3]: 8.956e-05 [Cycle 2]: 0.00112658, [45] [expand_dump_flag]: 1.76e-06 [switch_simplify]: 1.123e-05 [loop_unroll]: 9.07001e-06 [a_1]: 0.00022643 [with_stream_mark]: 1.993e-05 [recompute_prepare]: 9.44e-06 [updatestate_depend_eliminate]: 1.041e-05 [updatestate_assign_eliminate]: 4.28999e-06 [updatestate_loads_eliminate]: 3.88999e-06 [parameter_eliminate]: 1.66e-06 [a_2]: 0.00014164 [accelerated_algorithm]: 9.10999e-06 [shard]: 2.64001e-06 [meta_shard_fg_expand]: 2.83e-06 [shard_inline]: 9.00001e-06 [merge_send_recv]: 9.74999e-06 [auto_parallel]: 1.091e-05 [parallel]: 7.88001e-06 [flash_sp]: 4.05998e-06 [merge_comm]: 5.56998e-06 [allreduce_fusion]: 5.00001e-06 [matmul_add_comm_reduction]: 1.068e-05 [allreduce_slice_to_reducescatter]: 4.69998e-07 [virtual_shard_identity]: 1.034e-05 [virtual_dataset]: 8.81002e-06 [get_grad_eliminate_]: 9.49999e-06 [virtual_output]: 9.07999e-06 [merge_forward]: 5.49998e-06 [cell_reuse_recompute_pass]: 2.58e-06 [offload_activation]: 1.177e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.137e-05 [merge_recompute_call_nodes]: 1.07e-06 [before_grad]: 1.504e-05 [set_forward_comm_id_for_comm_node_pass]: 5.62001e-06 [meta_fg_expand]: 3.86001e-06 [flash_sp_send_recv_attached]: 1.69e-06 [receive_attached]: 1.98002e-06 [after_resolve]: 1.516e-05 [a_after_grad]: 1.403e-05 [renormalize]: 1.30007e-07 [add_forward_monad_depend]: 1.51002e-06 [auto_monad_grad]: 2.24999e-06 [auto_monad_eliminator]: 1.372e-05 [cse]: 3.088e-05 [a_3]: 6.926e-05 [py_interpret_to_execute_after_opt_a]: 2.126e-05 [slice_cell_reuse_recomputed_activation]: 5.20001e-06 [rewriter_after_opt_a]: 5.525e-05 [convert_after_rewriter]: 1.205e-05 [order_py_execute_after_rewriter]: 1.009e-05 [mutable_eliminate]: 0.00076553 [opt_b]: 0.00038972, [1] [Cycle 1]: 0.00037806, [7] [b_1]: 0.00024068 [b_2]: 1.216e-05 [updatestate_depend_eliminate]: 1.105e-05 [updatestate_assign_eliminate]: 4.03001e-06 [updatestate_loads_eliminate]: 3.64002e-06 [renormalize]: 5.60016e-07 [cse]: 4.519e-05 [optimize_parallel_all_gather_comm]: 2.831e-05 [overlap_param_gather]: 6.66999e-06 [cconv]: 3.883e-05 [loop_unroll]: 0.00069168 [opt_after_cconv]: 0.00018171, [1] [Cycle 1]: 0.00017071, [7] [c_1]: 4.625e-05 [parameter_eliminate]: 5.72001e-06 [updatestate_depend_eliminate]: 8.47e-06 [updatestate_assign_eliminate]: 4.37e-06 [updatestate_loads_eliminate]: 4.03999e-06 [cse]: 3.979e-05 [renormalize]: 1.04003e-06 [remove_dup_value]: 5.787e-05 [tuple_transform]: 0.00012444, [1] [Cycle 1]: 0.00011578, [4] [d_1]: 6.936e-05 [none_parameter_eliminate]: 2.01e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 1.011e-05 [partial_unused_args_eliminate]: 5.11002e-06 [add_recomputation]: 7.632e-05 [cse_after_recomputation]: 3.809e-05, [1] [Cycle 1]: 3.068e-05, [1] [cse]: 2.05e-05 [environ_conv]: 1.108e-05 [swap_dp_allreduce_reducescatter]: 1.04e-05 [bias_add_comm_swap]: 6.09999e-06 [label_micro_interleaved_index]: 8.45999e-06 [label_fine_grained_interleaved_index]: 5.84e-06 [merge_cast_opt]: 4.2e-06 [slice_recompute_activation]: 4.68001e-06 [micro_interleaved_order_control]: 4.94e-06 [assign_add_opt]: 4.1e-06 [ForceFp32Comm]: 3.38e-06 [remove_cast_before_assign_add]: 3.94002e-06 [full_micro_interleaved_order_control]: 4.79e-06 [reorder_send_recv_between_fp_bp]: 5.61e-06 [comm_op_add_attrs]: 3.91001e-06 [add_comm_op_reuse_tag]: 3.48e-06 [interleave_split_concat_branches]: 4.1e-06 [interleave_parallel_branches]: 3.8e-06 [overlap_opt_shard_in_pipeline]: 4e-06 [overlap_opt_shard_grad_in_pipeline]: 4.67e-06 [control_data_broadcast_order]: 2.156e-05 [grouped_pairwise_exchange_alltoall]: 3.94997e-06 [offloading_packed_experts]: 8.28999e-06 [overlap_recompute_and_grad_model_parallel]: 8.35999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.68999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.83001e-06 [overlap_recompute_comm]: 5.47001e-06 [overlap_grad_ring_attention]: 7.43e-06 [overlap_grad_flash_sp]: 2.983e-05 [begin_end_overlap_inline]: 3.32997e-06 [split_matmul_comm_elemetwise]: 4.62998e-06 [split_layernorm_comm]: 4.12e-06 [handle_group_info]: 3.67002e-06 [symbol_engine_optimizer]: 0.00012324, [1] [Cycle 1]: 0.00011573, [6] [build]: 4.64002e-06 [elim_shapecalc]: 1.56e-05 [elim_not_effective]: 1.911e-05 [opt_reshape]: 1.058e-05 [fold_const_symbol]: 1.629e-05 [renormalize]: 6.10016e-07 [detach_backward]: 6.04001e-06 [pipeline_parallel_scheduler]: 1.87999e-06 [auto_monad_reorder]: 2.935e-05 [get_jit_bprop_graph]: 2.52001e-06 [rewriter_after_jit_bprop_graph]: 7.66999e-06 [opt_after_jit_grad]: 0.0007735 [validate]: 5.512e-05 Sums bootstrap : 0.000457s : 3.17% type_inference : 0.006840s : 47.45% event_method : 0.000023s : 0.16% auto_monad : 0.000071s : 0.49% graph_reusing : 0.000006s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000042s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000038s : 0.27% optimize.rewriter_before_opt_a : 0.000108s : 0.75% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000057s : 0.40% optimize.opt_a.loop_unroll : 0.000043s : 0.30% optimize.opt_a.a_1 : 0.001002s : 6.95% optimize.opt_a.with_stream_mark : 0.000043s : 0.30% optimize.opt_a.recompute_prepare : 0.000022s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000016s : 0.11% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000299s : 2.08% optimize.opt_a.accelerated_algorithm : 0.000021s : 0.15% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.04% optimize.opt_a.shard_inline : 0.000020s : 0.14% optimize.opt_a.merge_send_recv : 0.000022s : 0.15% optimize.opt_a.auto_parallel : 0.000021s : 0.14% optimize.opt_a.parallel : 0.000028s : 0.20% optimize.opt_a.flash_sp : 0.000015s : 0.10% optimize.opt_a.merge_comm : 0.000011s : 0.08% optimize.opt_a.allreduce_fusion : 0.000010s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.15% optimize.opt_a.virtual_dataset : 0.000018s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.13% optimize.opt_a.virtual_output : 0.000019s : 0.13% optimize.opt_a.merge_forward : 0.000011s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.03% optimize.opt_a.offload_activation : 0.000025s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000045s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000031s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.08% optimize.opt_a.meta_fg_expand : 0.000008s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000030s : 0.21% optimize.opt_a.a_after_grad : 0.000028s : 0.20% optimize.opt_a.renormalize : 0.001062s : 7.37% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000037s : 0.25% optimize.opt_a.cse : 0.000078s : 0.54% optimize.opt_a.a_3 : 0.000159s : 1.10% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000055s : 0.38% optimize.convert_after_rewriter : 0.000012s : 0.08% optimize.order_py_execute_after_rewriter : 0.000010s : 0.07% optimize.mutable_eliminate : 0.000766s : 5.31% optimize.opt_b.b_1 : 0.000241s : 1.67% optimize.opt_b.b_2 : 0.000012s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000045s : 0.31% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.20% optimize.overlap_param_gather : 0.000007s : 0.05% optimize.cconv : 0.000039s : 0.27% optimize.loop_unroll : 0.000692s : 4.80% optimize.opt_after_cconv.c_1 : 0.000046s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000040s : 0.28% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000058s : 0.40% optimize.tuple_transform.d_1 : 0.000069s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000076s : 0.53% optimize.cse_after_recomputation.cse : 0.000020s : 0.14% optimize.environ_conv : 0.000011s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.04% optimize.label_micro_interleaved_index : 0.000008s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.03% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.03% optimize.control_data_broadcast_order : 0.000022s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000030s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.03% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000005s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000006s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000029s : 0.20% get_jit_bprop_graph : 0.000003s : 0.02% rewriter_after_jit_bprop_graph : 0.000008s : 0.05% opt_after_jit_grad : 0.000774s : 5.37% validate : 0.000055s : 0.38% Time group info: ------[substitution.] 0.000262 48 14.84% : 0.000039s : 6: substitution.cast_eliminate 1.10% : 0.000003s : 4: substitution.elim_not_effective 0.80% : 0.000002s : 4: substitution.fold_const_symbol 3.14% : 0.000008s : 6: substitution.graph_param_transform 67.07% : 0.000176s : 4: substitution.inline 2.48% : 0.000006s : 8: substitution.j_node_and_user_rematch 3.20% : 0.000008s : 8: substitution.remove_not_recompute_node 2.37% : 0.000006s : 4: substitution.replace_old_param 5.00% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006774 2 87.62% : 0.005935s : 1: type_inference.infer 12.38% : 0.000839s : 1: type_inference.specialize ------[replace.] 0.000066 8 62.37% : 0.000041s : 4: replace.inline 37.63% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000184 8 93.93% : 0.000173s : 4: match.inline 6.07% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000308 1730 0.89% : 0.000003s : 17: predicate.accumulaten_eliminater 1.06% : 0.000003s : 6: predicate.ad_related_special_op_eliminate 0.61% : 0.000002s : 12: predicate.addn_check_dump 0.88% : 0.000003s : 17: predicate.addn_zero_filter 0.79% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.11% : 0.000006s : 29: predicate.arithmetic_simplify 1.07% : 0.000003s : 17: predicate.cast_eliminate 0.65% : 0.000002s : 12: predicate.check_bprop_eliminate 0.66% : 0.000002s : 12: predicate.compare_switch_simplify 0.18% : 0.000001s : 6: predicate.const_output_eliminate 0.66% : 0.000002s : 12: predicate.depend_value_elim 0.97% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.01% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.89% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.02% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.18% : 0.000001s : 6: predicate.elim_not_effective 0.44% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000004s : 23: predicate.environ_add_const_eliminate 1.15% : 0.000004s : 23: predicate.environ_get_add_eliminate 1.29% : 0.000004s : 23: predicate.environ_get_depend_swap 1.73% : 0.000005s : 35: predicate.environ_get_eliminate 1.19% : 0.000004s : 23: predicate.environ_get_set_eliminate 1.23% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.04% : 0.000006s : 25: predicate.float_depend_g_call 0.64% : 0.000002s : 12: predicate.float_environ_get_switch 0.94% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 6: predicate.fold_const_symbol 0.71% : 0.000002s : 12: predicate.get_grad_eliminate 0.22% : 0.000001s : 6: predicate.graph_param_transform 0.65% : 0.000002s : 12: predicate.incorporate_call 0.54% : 0.000002s : 12: predicate.incorporate_call_switch 5.84% : 0.000018s : 78: predicate.inline 0.85% : 0.000003s : 12: predicate.inline_without_move 0.31% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.94% : 0.000003s : 12: predicate.less_batch_normalization 1.82% : 0.000006s : 33: predicate.list_to_tuple_eliminator_ 2.30% : 0.000007s : 50: predicate.load_eliminater 1.05% : 0.000003s : 6: predicate.loop_unroll_after_grad 1.88% : 0.000006s : 38: predicate.loop_unroll_before_grad 2.17% : 0.000007s : 29: predicate.make_slice_get_slice_eliminator 0.60% : 0.000002s : 12: predicate.merge_addn 0.68% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 17: predicate.minmaximum_grad 1.38% : 0.000004s : 6: predicate.mutable_eliminate 0.34% : 0.000001s : 6: predicate.opt_reshape 0.36% : 0.000001s : 6: predicate.parallel_virtual_node 1.57% : 0.000005s : 25: predicate.partial_defer_inline 1.49% : 0.000005s : 27: predicate.partial_eliminate 0.83% : 0.000003s : 17: predicate.print_const_string_wrapper 0.64% : 0.000002s : 12: predicate.reduce_all_const_elim 1.35% : 0.000004s : 17: predicate.reduce_eliminate 2.57% : 0.000008s : 50: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 12: predicate.remove_not_recompute_node 1.18% : 0.000004s : 33: predicate.replace_applicator 0.53% : 0.000002s : 12: predicate.replace_old_param 0.35% : 0.000001s : 6: predicate.reset_defer_inline 0.97% : 0.000003s : 17: predicate.reshape_eliminate 0.66% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 6: predicate.row_tensor_eliminate 0.79% : 0.000002s : 12: predicate.same_eliminate 0.40% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.86% : 0.000003s : 12: predicate.shard_identity_eliminate 0.85% : 0.000003s : 12: predicate.special_op_eliminate 0.78% : 0.000002s : 12: predicate.specialize_transform 1.20% : 0.000004s : 12: predicate.split_environ_get_set_with_tuple_value 0.92% : 0.000003s : 12: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.44% : 0.000004s : 25: predicate.switch_defer_inline 1.90% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.38% : 0.000014s : 81: predicate.switch_simplify 0.87% : 0.000003s : 17: predicate.tile_eliminate 0.93% : 0.000003s : 17: predicate.transpose_eliminate 1.64% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.79% : 0.000006s : 29: predicate.tuple_list_get_item_depend_reorder 3.28% : 0.000010s : 45: predicate.tuple_list_get_item_eliminator 1.63% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 2.33% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.66% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.22% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 2.98% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 6: predicate.value_based_eliminate 0.66% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.84% : 0.000003s : 12: predicate.virtual_output_eliminate 0.31% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.51% : 0.000002s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000674 11 53.55% : 0.000361s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.45% : 0.000313s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.034699 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.15% : 0.003867s : 1: add_attr 11.09% : 0.003849s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.23% : 0.000080s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.23% : 0.000080s : 1: auto_monad 0.11% : 0.000038s : 1: auto_monad_reorder 0.02% : 0.000007s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.44% : 0.000500s : 1: bootstrap 0.12% : 0.000042s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000025s : 1: control_data_broadcast_order 0.04% : 0.000015s : 1: convert_after_rewriter 0.12% : 0.000041s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.09% : 0.000032s : 1: detach_backward 0.04% : 0.000014s : 1: environ_conv 0.10% : 0.000034s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000007s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.02% : 0.000009s : 1: label_fine_grained_interleaved_index 0.03% : 0.000012s : 1: label_micro_interleaved_index 2.02% : 0.000701s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 2.24% : 0.000777s : 1: mutable_eliminate 0.03% : 0.000011s : 1: offloading_packed_experts 0.07% : 0.000025s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000027s : 1: opt.transform.mutable_eliminate 4.74% : 0.001646s : 78: opt.transform.opt_a 0.13% : 0.000044s : 1: opt.transform.opt_after_cconv 0.12% : 0.000043s : 1: opt.transform.opt_after_jit_grad 0.50% : 0.000173s : 28: opt.transform.opt_b 0.22% : 0.000077s : 2: opt.transform.opt_trans_graph 0.16% : 0.000057s : 4: opt.transform.symbol_engine_opt 11.94% : 0.004143s : 1: opt_a 0.54% : 0.000186s : 1: opt_after_cconv 2.27% : 0.000788s : 1: opt_after_jit_grad 1.14% : 0.000394s : 1: opt_b 23.11% : 0.008018s : 1: optimize 0.09% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.10% : 0.000033s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000010s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000008s : 1: pipeline_split 0.14% : 0.000050s : 1: pre_auto_parallel 0.12% : 0.000043s : 1: py_interpret_to_execute 0.07% : 0.000026s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.18% : 0.000062s : 1: remove_dup_value 1.75% : 0.000607s : 1: renormalize.infer 1.28% : 0.000445s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000059s : 1: rewriter_after_opt_a 0.32% : 0.000113s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000126s : 1: symbol_engine_optimizer 0.37% : 0.000128s : 1: tuple_transform 19.86% : 0.006892s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:01.826.559 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0178713, [21] [bootstrap]: 0.00050276 [type_inference]: 0.00670607 [event_method]: 2.175e-05 [auto_monad]: 7.012e-05 [graph_reusing]: 5.77001e-06 [inline]: 3.03e-06 [add_attr]: 0.00380089, [1] [add_attr_with_inline]: 0.00378834, [1] [Cycle 1]: 8.168e-05, [2] [tag_attr]: 2.568e-05 [meta_addattr_fg_expand]: 6.65002e-06 [parallel-infer-symbol]: 3.71001e-06 [pre_auto_parallel]: 4.117e-05 [insert-virtual-dataset]: 2.54999e-06 [parallel-infer-symbol-second]: 8.50006e-07 [dataset_repeat_opt]: 2.07001e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00597176, [53] [py_interpret_to_execute]: 3.423e-05 [rewriter_before_opt_a]: 0.00010349 [opt_a]: 0.00346818, [2] [Cycle 1]: 0.0025832, [45] [expand_dump_flag]: 3.41999e-06 [switch_simplify]: 4.705e-05 [loop_unroll]: 3.222e-05 [a_1]: 0.00077982 [with_stream_mark]: 1.936e-05 [recompute_prepare]: 1.265e-05 [updatestate_depend_eliminate]: 5.74e-06 [updatestate_assign_eliminate]: 4.51002e-06 [updatestate_loads_eliminate]: 4.22e-06 [parameter_eliminate]: 1.89e-06 [a_2]: 0.00012175 [accelerated_algorithm]: 1.097e-05 [shard]: 1.76003e-06 [meta_shard_fg_expand]: 2.58e-06 [shard_inline]: 9.38002e-06 [merge_send_recv]: 1.124e-05 [auto_parallel]: 8.83001e-06 [parallel]: 1.972e-05 [flash_sp]: 1.013e-05 [merge_comm]: 5.15999e-06 [allreduce_fusion]: 5.12e-06 [matmul_add_comm_reduction]: 1.186e-05 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 1.12e-05 [virtual_dataset]: 9.42999e-06 [get_grad_eliminate_]: 8.84998e-06 [virtual_output]: 9.21998e-06 [merge_forward]: 4.90999e-06 [cell_reuse_recompute_pass]: 1.20001e-06 [offload_activation]: 1.283e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.834e-05 [merge_recompute_call_nodes]: 1.67999e-06 [before_grad]: 1.572e-05 [set_forward_comm_id_for_comm_node_pass]: 5.32999e-06 [meta_fg_expand]: 4.35999e-06 [flash_sp_send_recv_attached]: 2.64001e-06 [receive_attached]: 2.24999e-06 [after_resolve]: 1.478e-05 [a_after_grad]: 1.414e-05 [renormalize]: 0.00091296 [add_forward_monad_depend]: 5.65001e-06 [auto_monad_grad]: 2.96001e-06 [auto_monad_eliminator]: 1.955e-05 [cse]: 4.354e-05 [a_3]: 6.935e-05 [Cycle 2]: 0.000875, [45] [expand_dump_flag]: 1.21997e-06 [switch_simplify]: 1.01e-05 [loop_unroll]: 8.47e-06 [a_1]: 0.00021232 [with_stream_mark]: 1.388e-05 [recompute_prepare]: 8.77e-06 [updatestate_depend_eliminate]: 4.55001e-06 [updatestate_assign_eliminate]: 3.7e-06 [updatestate_loads_eliminate]: 3.61999e-06 [parameter_eliminate]: 1.25001e-06 [a_2]: 0.00011026 [accelerated_algorithm]: 8.85999e-06 [shard]: 1.69e-06 [meta_shard_fg_expand]: 2.21998e-06 [shard_inline]: 1.193e-05 [merge_send_recv]: 7.35998e-06 [auto_parallel]: 7.48e-06 [parallel]: 6.28e-06 [flash_sp]: 4.03001e-06 [merge_comm]: 4.77e-06 [allreduce_fusion]: 4.53001e-06 [matmul_add_comm_reduction]: 9.19e-06 [allreduce_slice_to_reducescatter]: 8.00006e-07 [virtual_shard_identity]: 9.97999e-06 [virtual_dataset]: 8.45001e-06 [get_grad_eliminate_]: 8.23999e-06 [virtual_output]: 8.03999e-06 [merge_forward]: 4.25999e-06 [cell_reuse_recompute_pass]: 2.01e-06 [offload_activation]: 9.57001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.697e-05 [merge_recompute_call_nodes]: 1.09998e-06 [before_grad]: 1.52e-05 [set_forward_comm_id_for_comm_node_pass]: 5.37001e-06 [meta_fg_expand]: 3.76999e-06 [flash_sp_send_recv_attached]: 9.39996e-07 [receive_attached]: 1.42e-06 [after_resolve]: 1.27e-05 [a_after_grad]: 1.371e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.21e-06 [auto_monad_grad]: 9.50007e-07 [auto_monad_eliminator]: 1.097e-05 [cse]: 2.415e-05 [a_3]: 5.472e-05 [py_interpret_to_execute_after_opt_a]: 1.367e-05 [slice_cell_reuse_recomputed_activation]: 2.20002e-06 [rewriter_after_opt_a]: 4.562e-05 [convert_after_rewriter]: 8.36002e-06 [order_py_execute_after_rewriter]: 6.56999e-06 [mutable_eliminate]: 0.00066308 [opt_b]: 0.00031486, [1] [Cycle 1]: 0.000308, [7] [b_1]: 0.00018351 [b_2]: 1.042e-05 [updatestate_depend_eliminate]: 8.08999e-06 [updatestate_assign_eliminate]: 3.66999e-06 [updatestate_loads_eliminate]: 3.92002e-06 [renormalize]: 5.59987e-07 [cse]: 3.095e-05 [optimize_parallel_all_gather_comm]: 1.985e-05 [overlap_param_gather]: 1.81998e-06 [cconv]: 3.109e-05 [loop_unroll]: 0.00046135 [opt_after_cconv]: 0.00013078, [1] [Cycle 1]: 0.00012459, [7] [c_1]: 4.262e-05 [parameter_eliminate]: 3.51001e-06 [updatestate_depend_eliminate]: 7.15e-06 [updatestate_assign_eliminate]: 3.90998e-06 [updatestate_loads_eliminate]: 3.67002e-06 [cse]: 2.765e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 4.222e-05 [tuple_transform]: 9.856e-05, [1] [Cycle 1]: 9.341e-05, [4] [d_1]: 6.233e-05 [none_parameter_eliminate]: 1.81e-06 [renormalize]: 2.29978e-07 [switch_simplify]: 9.46998e-06 [partial_unused_args_eliminate]: 1.71e-06 [add_recomputation]: 6.295e-05 [cse_after_recomputation]: 2.753e-05, [1] [Cycle 1]: 2.269e-05, [1] [cse]: 1.71e-05 [environ_conv]: 6.53e-06 [swap_dp_allreduce_reducescatter]: 6.83e-06 [bias_add_comm_swap]: 3.51001e-06 [label_micro_interleaved_index]: 4.59998e-06 [label_fine_grained_interleaved_index]: 2.84001e-06 [merge_cast_opt]: 1.40999e-06 [slice_recompute_activation]: 2.00002e-06 [micro_interleaved_order_control]: 1.99e-06 [assign_add_opt]: 1.60001e-06 [ForceFp32Comm]: 9.80013e-07 [remove_cast_before_assign_add]: 1.02e-06 [full_micro_interleaved_order_control]: 2.18002e-06 [reorder_send_recv_between_fp_bp]: 3.04999e-06 [comm_op_add_attrs]: 9.90025e-07 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.06002e-06 [interleave_parallel_branches]: 1.00999e-06 [overlap_opt_shard_in_pipeline]: 1.10001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.81e-06 [control_data_broadcast_order]: 1.675e-05 [grouped_pairwise_exchange_alltoall]: 1.52001e-06 [offloading_packed_experts]: 5.02999e-06 [overlap_recompute_and_grad_model_parallel]: 5.51e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.49998e-06 [overlap_recompute_comm]: 2.41e-06 [overlap_grad_ring_attention]: 5.15001e-06 [overlap_grad_flash_sp]: 2.592e-05 [begin_end_overlap_inline]: 5.60016e-07 [split_matmul_comm_elemetwise]: 2.50002e-06 [split_layernorm_comm]: 2.02999e-06 [handle_group_info]: 9.5999e-07 [symbol_engine_optimizer]: 9.312e-05, [1] [Cycle 1]: 8.854e-05, [6] [build]: 4.57e-06 [elim_shapecalc]: 1.297e-05 [elim_not_effective]: 1.667e-05 [opt_reshape]: 9.78002e-06 [fold_const_symbol]: 1.428e-05 [renormalize]: 2.10013e-07 [detach_backward]: 1.99e-06 [pipeline_parallel_scheduler]: 1.76e-06 [auto_monad_reorder]: 2.12e-05 [get_jit_bprop_graph]: 1.72001e-06 [rewriter_after_jit_bprop_graph]: 4.75001e-06 [opt_after_jit_grad]: 0.00048923 [validate]: 4.71e-05 Sums bootstrap : 0.000503s : 3.86% type_inference : 0.006706s : 51.43% event_method : 0.000022s : 0.17% auto_monad : 0.000070s : 0.54% graph_reusing : 0.000006s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000041s : 0.32% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000034s : 0.26% optimize.rewriter_before_opt_a : 0.000103s : 0.79% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000057s : 0.44% optimize.opt_a.loop_unroll : 0.000041s : 0.31% optimize.opt_a.a_1 : 0.000992s : 7.61% optimize.opt_a.with_stream_mark : 0.000033s : 0.25% optimize.opt_a.recompute_prepare : 0.000021s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000232s : 1.78% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000021s : 0.16% optimize.opt_a.merge_send_recv : 0.000019s : 0.14% optimize.opt_a.auto_parallel : 0.000016s : 0.13% optimize.opt_a.parallel : 0.000026s : 0.20% optimize.opt_a.flash_sp : 0.000014s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000010s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.16% optimize.opt_a.virtual_dataset : 0.000018s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.13% optimize.opt_a.virtual_output : 0.000017s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000022s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000031s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.08% optimize.opt_a.meta_fg_expand : 0.000008s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.21% optimize.opt_a.a_after_grad : 0.000028s : 0.21% optimize.opt_a.renormalize : 0.000913s : 7.00% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.23% optimize.opt_a.cse : 0.000068s : 0.52% optimize.opt_a.a_3 : 0.000124s : 0.95% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000046s : 0.35% optimize.convert_after_rewriter : 0.000008s : 0.06% optimize.order_py_execute_after_rewriter : 0.000007s : 0.05% optimize.mutable_eliminate : 0.000663s : 5.09% optimize.opt_b.b_1 : 0.000184s : 1.41% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000031s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000031s : 0.24% optimize.loop_unroll : 0.000461s : 3.54% optimize.opt_after_cconv.c_1 : 0.000043s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000028s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000042s : 0.32% optimize.tuple_transform.d_1 : 0.000062s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000063s : 0.48% optimize.cse_after_recomputation.cse : 0.000017s : 0.13% optimize.environ_conv : 0.000007s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.05% optimize.bias_add_comm_swap : 0.000004s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000026s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.16% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000489s : 3.75% validate : 0.000047s : 0.36% Time group info: ------[substitution.] 0.000250 48 14.37% : 0.000036s : 6: substitution.cast_eliminate 0.99% : 0.000002s : 4: substitution.elim_not_effective 0.78% : 0.000002s : 4: substitution.fold_const_symbol 3.09% : 0.000008s : 6: substitution.graph_param_transform 67.78% : 0.000169s : 4: substitution.inline 2.59% : 0.000006s : 8: substitution.j_node_and_user_rematch 3.08% : 0.000008s : 8: substitution.remove_not_recompute_node 2.16% : 0.000005s : 4: substitution.replace_old_param 5.15% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006636 2 87.95% : 0.005836s : 1: type_inference.infer 12.05% : 0.000799s : 1: type_inference.specialize ------[replace.] 0.000082 8 69.15% : 0.000056s : 4: replace.inline 30.85% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000178 8 93.87% : 0.000167s : 4: match.inline 6.13% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000282 1730 0.91% : 0.000003s : 17: predicate.accumulaten_eliminater 0.67% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.59% : 0.000002s : 12: predicate.addn_check_dump 0.92% : 0.000003s : 17: predicate.addn_zero_filter 0.82% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.13% : 0.000006s : 29: predicate.arithmetic_simplify 1.11% : 0.000003s : 17: predicate.cast_eliminate 0.66% : 0.000002s : 12: predicate.check_bprop_eliminate 0.62% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.65% : 0.000002s : 12: predicate.depend_value_elim 0.95% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.05% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.91% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.93% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 6: predicate.elim_not_effective 0.39% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.22% : 0.000003s : 23: predicate.environ_get_depend_swap 1.80% : 0.000005s : 35: predicate.environ_get_eliminate 1.14% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.32% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.13% : 0.000006s : 25: predicate.float_depend_g_call 0.61% : 0.000002s : 12: predicate.float_environ_get_switch 0.94% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 6: predicate.fold_const_symbol 0.73% : 0.000002s : 12: predicate.get_grad_eliminate 0.21% : 0.000001s : 6: predicate.graph_param_transform 0.65% : 0.000002s : 12: predicate.incorporate_call 0.57% : 0.000002s : 12: predicate.incorporate_call_switch 6.10% : 0.000017s : 78: predicate.inline 0.89% : 0.000003s : 12: predicate.inline_without_move 0.35% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.98% : 0.000003s : 12: predicate.less_batch_normalization 1.80% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.46% : 0.000007s : 50: predicate.load_eliminater 0.77% : 0.000002s : 6: predicate.loop_unroll_after_grad 1.92% : 0.000005s : 38: predicate.loop_unroll_before_grad 1.75% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.64% : 0.000002s : 12: predicate.merge_addn 0.60% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.78% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 17: predicate.minmaximum_grad 0.96% : 0.000003s : 6: predicate.mutable_eliminate 0.43% : 0.000001s : 6: predicate.opt_reshape 0.38% : 0.000001s : 6: predicate.parallel_virtual_node 1.80% : 0.000005s : 25: predicate.partial_defer_inline 1.58% : 0.000004s : 27: predicate.partial_eliminate 0.96% : 0.000003s : 17: predicate.print_const_string_wrapper 0.61% : 0.000002s : 12: predicate.reduce_all_const_elim 1.31% : 0.000004s : 17: predicate.reduce_eliminate 2.63% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 12: predicate.remove_not_recompute_node 1.32% : 0.000004s : 33: predicate.replace_applicator 0.46% : 0.000001s : 12: predicate.replace_old_param 0.25% : 0.000001s : 6: predicate.reset_defer_inline 1.09% : 0.000003s : 17: predicate.reshape_eliminate 0.73% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 6: predicate.row_tensor_eliminate 0.81% : 0.000002s : 12: predicate.same_eliminate 0.49% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.88% : 0.000002s : 12: predicate.shard_identity_eliminate 0.82% : 0.000002s : 12: predicate.special_op_eliminate 0.80% : 0.000002s : 12: predicate.specialize_transform 0.87% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.96% : 0.000003s : 12: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.40% : 0.000004s : 25: predicate.switch_defer_inline 1.94% : 0.000005s : 37: predicate.switch_layer_defer_inline 4.60% : 0.000013s : 81: predicate.switch_simplify 0.97% : 0.000003s : 17: predicate.tile_eliminate 0.88% : 0.000002s : 17: predicate.transpose_eliminate 1.60% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000004s : 29: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.17% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.62% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 2.41% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.73% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.41% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.10% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 6: predicate.value_based_eliminate 0.67% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.77% : 0.000002s : 12: predicate.virtual_output_eliminate 0.30% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000619 11 53.46% : 0.000331s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.54% : 0.000288s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030413 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.52% : 0.003807s : 1: add_attr 12.47% : 0.003793s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000067s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.25% : 0.000076s : 1: auto_monad 0.08% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.76% : 0.000535s : 1: bootstrap 0.11% : 0.000035s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.10% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000010s : 1: environ_conv 0.09% : 0.000029s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.54% : 0.000469s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.21% : 0.000673s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.06% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000020s : 1: opt.transform.mutable_eliminate 5.29% : 0.001608s : 78: opt.transform.opt_a 0.14% : 0.000041s : 1: opt.transform.opt_after_cconv 0.11% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.53% : 0.000162s : 28: opt.transform.opt_b 0.23% : 0.000070s : 2: opt.transform.opt_trans_graph 0.16% : 0.000050s : 4: opt.transform.symbol_engine_opt 11.41% : 0.003471s : 1: opt_a 0.44% : 0.000134s : 1: opt_after_cconv 1.64% : 0.000499s : 1: opt_after_jit_grad 1.05% : 0.000319s : 1: opt_b 19.65% : 0.005977s : 1: optimize 0.08% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000010s : 1: order_py_execute_after_rewriter 0.10% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.15% : 0.000045s : 1: pre_auto_parallel 0.13% : 0.000039s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.15% : 0.000047s : 1: remove_dup_value 1.71% : 0.000519s : 1: renormalize.infer 1.27% : 0.000385s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000050s : 1: rewriter_after_opt_a 0.36% : 0.000108s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000096s : 1: symbol_engine_optimizer 0.33% : 0.000102s : 1: tuple_transform 22.14% : 0.006732s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:02.179.578 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:02.179.869 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0176068, [21] [bootstrap]: 0.00047424 [type_inference]: 0.00648241 [event_method]: 2.151e-05 [auto_monad]: 6.799e-05 [graph_reusing]: 5.70001e-06 [inline]: 2.18002e-06 [add_attr]: 0.00325783, [1] [add_attr_with_inline]: 0.00324694, [1] [Cycle 1]: 7.684e-05, [2] [tag_attr]: 2.055e-05 [meta_addattr_fg_expand]: 6.66e-06 [parallel-infer-symbol]: 3.34001e-06 [pre_auto_parallel]: 3.519e-05 [insert-virtual-dataset]: 2.44001e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 1.87001e-06 [pipeline_split]: 1.66998e-06 [optimize]: 0.00607459, [53] [py_interpret_to_execute]: 2.956e-05 [rewriter_before_opt_a]: 8.946e-05 [opt_a]: 0.00358833, [2] [Cycle 1]: 0.00263301, [45] [expand_dump_flag]: 2.88998e-06 [switch_simplify]: 4.452e-05 [loop_unroll]: 3.126e-05 [a_1]: 0.00085375 [with_stream_mark]: 1.924e-05 [recompute_prepare]: 1.072e-05 [updatestate_depend_eliminate]: 4.55001e-06 [updatestate_assign_eliminate]: 4.35999e-06 [updatestate_loads_eliminate]: 3.48e-06 [parameter_eliminate]: 2.11998e-06 [a_2]: 0.00012888 [accelerated_algorithm]: 8.32e-06 [shard]: 2.27999e-06 [meta_shard_fg_expand]: 2.15002e-06 [shard_inline]: 8.57e-06 [merge_send_recv]: 9.50001e-06 [auto_parallel]: 8.42998e-06 [parallel]: 1.892e-05 [flash_sp]: 8.82e-06 [merge_comm]: 4.59998e-06 [allreduce_fusion]: 4.50999e-06 [matmul_add_comm_reduction]: 1.09e-05 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 9.99001e-06 [virtual_dataset]: 7.96001e-06 [get_grad_eliminate_]: 7.53e-06 [virtual_output]: 7.82002e-06 [merge_forward]: 4.51002e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 1.048e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.797e-05 [merge_recompute_call_nodes]: 1.67001e-06 [before_grad]: 1.289e-05 [set_forward_comm_id_for_comm_node_pass]: 4.58001e-06 [meta_fg_expand]: 3.38e-06 [flash_sp_send_recv_attached]: 2.53003e-06 [receive_attached]: 1.96e-06 [after_resolve]: 1.363e-05 [a_after_grad]: 1.228e-05 [renormalize]: 0.00079365 [add_forward_monad_depend]: 5.56998e-06 [auto_monad_grad]: 2.53e-06 [auto_monad_eliminator]: 1.695e-05 [cse]: 3.758e-05 [a_3]: 7.209e-05 [Cycle 2]: 0.00094136, [45] [expand_dump_flag]: 1.39998e-06 [switch_simplify]: 8.73001e-06 [loop_unroll]: 7.28999e-06 [a_1]: 0.00017803 [with_stream_mark]: 1.396e-05 [recompute_prepare]: 7.86001e-06 [updatestate_depend_eliminate]: 3.85998e-06 [updatestate_assign_eliminate]: 3.13998e-06 [updatestate_loads_eliminate]: 2.91e-06 [parameter_eliminate]: 1.24998e-06 [a_2]: 0.00011777 [accelerated_algorithm]: 7.49002e-06 [shard]: 1.42e-06 [meta_shard_fg_expand]: 1.77001e-06 [shard_inline]: 7.85998e-06 [merge_send_recv]: 6.44001e-06 [auto_parallel]: 6.63e-06 [parallel]: 8.37e-06 [flash_sp]: 3.83999e-06 [merge_comm]: 4.02998e-06 [allreduce_fusion]: 4.28999e-06 [matmul_add_comm_reduction]: 6.93e-06 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 8.62e-06 [virtual_dataset]: 7.26999e-06 [get_grad_eliminate_]: 6.98e-06 [virtual_output]: 7.25003e-06 [merge_forward]: 3.39001e-06 [cell_reuse_recompute_pass]: 1.29998e-06 [offload_activation]: 8.35001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.825e-05 [merge_recompute_call_nodes]: 1.33002e-06 [before_grad]: 1.179e-05 [set_forward_comm_id_for_comm_node_pass]: 4.33999e-06 [meta_fg_expand]: 2.84999e-06 [flash_sp_send_recv_attached]: 1.13001e-06 [receive_attached]: 9.89996e-07 [after_resolve]: 1.314e-05 [a_after_grad]: 1.136e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.55999e-06 [auto_monad_grad]: 1.00999e-06 [auto_monad_eliminator]: 9.02999e-06 [cse]: 2.072e-05 [a_3]: 5.758e-05 [py_interpret_to_execute_after_opt_a]: 1.52e-05 [slice_cell_reuse_recomputed_activation]: 4.68001e-06 [rewriter_after_opt_a]: 4.523e-05 [convert_after_rewriter]: 1.086e-05 [order_py_execute_after_rewriter]: 8.97999e-06 [mutable_eliminate]: 0.00058791 [opt_b]: 0.00031295, [1] [Cycle 1]: 0.00030377, [7] [b_1]: 0.00019845 [b_2]: 1.034e-05 [updatestate_depend_eliminate]: 7.7e-06 [updatestate_assign_eliminate]: 2.91999e-06 [updatestate_loads_eliminate]: 2.99999e-06 [renormalize]: 5.59987e-07 [cse]: 2.462e-05 [optimize_parallel_all_gather_comm]: 2.101e-05 [overlap_param_gather]: 4.90999e-06 [cconv]: 2.839e-05 [loop_unroll]: 0.00046583 [opt_after_cconv]: 0.00014178, [1] [Cycle 1]: 0.00013335, [7] [c_1]: 3.748e-05 [parameter_eliminate]: 3.49001e-06 [updatestate_depend_eliminate]: 6.27001e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 3.06001e-06 [cse]: 2.383e-05 [renormalize]: 4.80009e-07 [remove_dup_value]: 1.923e-05 [tuple_transform]: 0.00010024, [1] [Cycle 1]: 9.298e-05, [4] [d_1]: 5.262e-05 [none_parameter_eliminate]: 1.82999e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 8.54e-06 [partial_unused_args_eliminate]: 4.65001e-06 [add_recomputation]: 5.983e-05 [cse_after_recomputation]: 3.165e-05, [1] [Cycle 1]: 2.499e-05, [1] [cse]: 1.571e-05 [environ_conv]: 1.055e-05 [swap_dp_allreduce_reducescatter]: 8.37e-06 [bias_add_comm_swap]: 5.19e-06 [label_micro_interleaved_index]: 7.55e-06 [label_fine_grained_interleaved_index]: 5.41002e-06 [merge_cast_opt]: 3.80998e-06 [slice_recompute_activation]: 4.31002e-06 [micro_interleaved_order_control]: 4.51002e-06 [assign_add_opt]: 3.60998e-06 [ForceFp32Comm]: 3.48999e-06 [remove_cast_before_assign_add]: 3.26999e-06 [full_micro_interleaved_order_control]: 4.38999e-06 [reorder_send_recv_between_fp_bp]: 5.39e-06 [comm_op_add_attrs]: 3.58999e-06 [add_comm_op_reuse_tag]: 3.36999e-06 [interleave_split_concat_branches]: 3.75e-06 [interleave_parallel_branches]: 3.72002e-06 [overlap_opt_shard_in_pipeline]: 3.45998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.1e-06 [control_data_broadcast_order]: 1.83e-05 [grouped_pairwise_exchange_alltoall]: 4.07998e-06 [offloading_packed_experts]: 7.33e-06 [overlap_recompute_and_grad_model_parallel]: 7.83001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.48e-06 [overlap_recompute_allgather_and_fa_grad]: 3.63e-06 [overlap_recompute_comm]: 4.45999e-06 [overlap_grad_ring_attention]: 7.45e-06 [overlap_grad_flash_sp]: 2.63e-05 [begin_end_overlap_inline]: 2.99001e-06 [split_matmul_comm_elemetwise]: 4.62998e-06 [split_layernorm_comm]: 4.22e-06 [handle_group_info]: 3.63e-06 [symbol_engine_optimizer]: 0.00010403, [1] [Cycle 1]: 9.664e-05, [6] [build]: 3.18e-06 [elim_shapecalc]: 1.121e-05 [elim_not_effective]: 1.544e-05 [opt_reshape]: 8.43999e-06 [fold_const_symbol]: 1.216e-05 [renormalize]: 3.60014e-07 [detach_backward]: 4.04002e-06 [pipeline_parallel_scheduler]: 2.03002e-06 [auto_monad_reorder]: 2.133e-05 [get_jit_bprop_graph]: 2.17001e-06 [rewriter_after_jit_bprop_graph]: 4.68999e-06 [opt_after_jit_grad]: 0.00050285 [validate]: 4.406e-05 Sums bootstrap : 0.000474s : 3.77% type_inference : 0.006482s : 51.47% event_method : 0.000022s : 0.17% auto_monad : 0.000068s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000035s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.23% optimize.rewriter_before_opt_a : 0.000089s : 0.71% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000053s : 0.42% optimize.opt_a.loop_unroll : 0.000039s : 0.31% optimize.opt_a.a_1 : 0.001032s : 8.19% optimize.opt_a.with_stream_mark : 0.000033s : 0.26% optimize.opt_a.recompute_prepare : 0.000019s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000247s : 1.96% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.13% optimize.opt_a.merge_send_recv : 0.000016s : 0.13% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000027s : 0.22% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.07% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.15% optimize.opt_a.virtual_dataset : 0.000015s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.12% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.02% optimize.opt_a.after_resolve : 0.000027s : 0.21% optimize.opt_a.a_after_grad : 0.000024s : 0.19% optimize.opt_a.renormalize : 0.000794s : 6.30% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.21% optimize.opt_a.cse : 0.000058s : 0.46% optimize.opt_a.a_3 : 0.000130s : 1.03% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000045s : 0.36% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000588s : 4.67% optimize.opt_b.b_1 : 0.000198s : 1.58% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000025s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000028s : 0.23% optimize.loop_unroll : 0.000466s : 3.70% optimize.opt_after_cconv.c_1 : 0.000037s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000024s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.15% optimize.tuple_transform.d_1 : 0.000053s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000060s : 0.48% optimize.cse_after_recomputation.cse : 0.000016s : 0.12% optimize.environ_conv : 0.000011s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000008s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000018s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000004s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000026s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000021s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000503s : 3.99% validate : 0.000044s : 0.35% Time group info: ------[substitution.] 0.000241 38 10.05% : 0.000024s : 3: substitution.cast_eliminate 0.89% : 0.000002s : 3: substitution.elim_not_effective 0.73% : 0.000002s : 3: substitution.fold_const_symbol 2.82% : 0.000007s : 5: substitution.graph_param_transform 71.72% : 0.000173s : 4: substitution.inline 1.88% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.25% : 0.000008s : 6: substitution.remove_not_recompute_node 2.02% : 0.000005s : 4: substitution.replace_old_param 6.65% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006427 2 88.19% : 0.005668s : 1: type_inference.infer 11.81% : 0.000759s : 1: type_inference.specialize ------[replace.] 0.000066 8 61.11% : 0.000040s : 4: replace.inline 38.89% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000173 8 91.94% : 0.000159s : 4: match.inline 8.06% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000332 1596 0.75% : 0.000002s : 17: predicate.accumulaten_eliminater 0.59% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.43% : 0.000001s : 10: predicate.addn_check_dump 0.77% : 0.000003s : 17: predicate.addn_zero_filter 0.67% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.64% : 0.000005s : 27: predicate.arithmetic_simplify 0.88% : 0.000003s : 17: predicate.cast_eliminate 0.58% : 0.000002s : 10: predicate.check_bprop_eliminate 0.46% : 0.000002s : 10: predicate.compare_switch_simplify 0.15% : 0.000001s : 5: predicate.const_output_eliminate 0.50% : 0.000002s : 10: predicate.depend_value_elim 0.76% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.88% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.76% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.69% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.15% : 0.000001s : 5: predicate.elim_not_effective 0.34% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 0.93% : 0.000003s : 22: predicate.environ_add_const_eliminate 0.87% : 0.000003s : 22: predicate.environ_get_add_eliminate 0.97% : 0.000003s : 22: predicate.environ_get_depend_swap 1.43% : 0.000005s : 32: predicate.environ_get_eliminate 23.05% : 0.000076s : 22: predicate.environ_get_set_eliminate 1.05% : 0.000003s : 25: predicate.exchange_switch_depend_value 1.77% : 0.000006s : 25: predicate.float_depend_g_call 0.41% : 0.000001s : 10: predicate.float_environ_get_switch 0.61% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 5: predicate.fold_const_symbol 0.50% : 0.000002s : 10: predicate.get_grad_eliminate 0.15% : 0.000000s : 5: predicate.graph_param_transform 0.46% : 0.000002s : 10: predicate.incorporate_call 0.38% : 0.000001s : 10: predicate.incorporate_call_switch 4.88% : 0.000016s : 72: predicate.inline 0.64% : 0.000002s : 10: predicate.inline_without_move 0.25% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.57% : 0.000002s : 10: predicate.less_batch_normalization 1.41% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.10% : 0.000007s : 48: predicate.load_eliminater 0.61% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.54% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.30% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.43% : 0.000001s : 10: predicate.merge_addn 0.47% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.47% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.67% : 0.000002s : 17: predicate.minmaximum_grad 0.73% : 0.000002s : 5: predicate.mutable_eliminate 0.29% : 0.000001s : 5: predicate.opt_reshape 0.32% : 0.000001s : 5: predicate.parallel_virtual_node 1.50% : 0.000005s : 25: predicate.partial_defer_inline 1.30% : 0.000004s : 26: predicate.partial_eliminate 0.71% : 0.000002s : 17: predicate.print_const_string_wrapper 0.44% : 0.000001s : 10: predicate.reduce_all_const_elim 0.99% : 0.000003s : 17: predicate.reduce_eliminate 1.98% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.28% : 0.000001s : 10: predicate.remove_not_recompute_node 1.10% : 0.000004s : 31: predicate.replace_applicator 0.46% : 0.000002s : 10: predicate.replace_old_param 0.19% : 0.000001s : 5: predicate.reset_defer_inline 0.75% : 0.000003s : 17: predicate.reshape_eliminate 0.45% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 5: predicate.row_tensor_eliminate 0.66% : 0.000002s : 10: predicate.same_eliminate 0.33% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.56% : 0.000002s : 10: predicate.shard_identity_eliminate 0.56% : 0.000002s : 10: predicate.special_op_eliminate 0.58% : 0.000002s : 10: predicate.specialize_transform 0.62% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.56% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.27% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.11% : 0.000004s : 25: predicate.switch_defer_inline 1.55% : 0.000005s : 35: predicate.switch_layer_defer_inline 3.71% : 0.000012s : 76: predicate.switch_simplify 0.73% : 0.000002s : 17: predicate.tile_eliminate 0.72% : 0.000002s : 17: predicate.transpose_eliminate 1.20% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.26% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.19% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 2.62% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.34% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 1.81% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.36% : 0.000004s : 31: predicate.tuple_to_list_eliminator_ 1.99% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 2.56% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.26% : 0.000001s : 5: predicate.value_based_eliminate 0.49% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.46% : 0.000002s : 10: predicate.virtual_output_eliminate 0.20% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000612 11 56.11% : 0.000344s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.89% : 0.000269s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.029485 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.09% : 0.003269s : 1: add_attr 11.03% : 0.003251s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000064s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000077s : 1: auto_monad 0.10% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.77% : 0.000521s : 1: bootstrap 0.11% : 0.000031s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000035s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000019s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.14% : 0.000042s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.60% : 0.000472s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 2.02% : 0.000595s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.05% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 5.27% : 0.001554s : 78: opt.transform.opt_a 0.12% : 0.000036s : 1: opt.transform.opt_after_cconv 0.10% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.46% : 0.000135s : 28: opt.transform.opt_b 0.20% : 0.000059s : 2: opt.transform.opt_trans_graph 0.15% : 0.000043s : 4: opt.transform.symbol_engine_opt 12.18% : 0.003592s : 1: opt_a 0.49% : 0.000145s : 1: opt_after_cconv 1.74% : 0.000513s : 1: opt_after_jit_grad 1.07% : 0.000316s : 1: opt_b 21.73% : 0.006408s : 1: optimize 0.08% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000007s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000042s : 1: pre_auto_parallel 0.11% : 0.000033s : 1: py_interpret_to_execute 0.06% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000023s : 1: remove_dup_value 1.51% : 0.000445s : 1: renormalize.infer 1.15% : 0.000340s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000049s : 1: rewriter_after_opt_a 0.32% : 0.000093s : 1: rewriter_before_opt_a 0.02% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000107s : 1: symbol_engine_optimizer 0.35% : 0.000103s : 1: tuple_transform 22.12% : 0.006521s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:02.510.421 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0169883, [21] [bootstrap]: 0.00043893 [type_inference]: 0.00592502 [event_method]: 2.024e-05 [auto_monad]: 6.697e-05 [graph_reusing]: 5.56998e-06 [inline]: 2.29999e-06 [add_attr]: 0.00326428, [1] [add_attr_with_inline]: 0.00325411, [1] [Cycle 1]: 6.647e-05, [2] [tag_attr]: 2.158e-05 [meta_addattr_fg_expand]: 6.28e-06 [parallel-infer-symbol]: 3.55e-06 [pre_auto_parallel]: 3.743e-05 [insert-virtual-dataset]: 2.70002e-06 [parallel-infer-symbol-second]: 7.99977e-07 [dataset_repeat_opt]: 1.86e-06 [pipeline_split]: 1.69e-06 [optimize]: 0.00630643, [53] [py_interpret_to_execute]: 2.919e-05 [rewriter_before_opt_a]: 9.205e-05 [opt_a]: 0.00353947, [2] [Cycle 1]: 0.00264776, [45] [expand_dump_flag]: 3.03998e-06 [switch_simplify]: 4.457e-05 [loop_unroll]: 3.419e-05 [a_1]: 0.00079716 [with_stream_mark]: 2.262e-05 [recompute_prepare]: 1.201e-05 [updatestate_depend_eliminate]: 6.39001e-06 [updatestate_assign_eliminate]: 4.07e-06 [updatestate_loads_eliminate]: 3.80998e-06 [parameter_eliminate]: 2.29001e-06 [a_2]: 0.00011022 [accelerated_algorithm]: 8.75999e-06 [shard]: 2.79999e-06 [meta_shard_fg_expand]: 2.39001e-06 [shard_inline]: 7.65e-06 [merge_send_recv]: 1.031e-05 [auto_parallel]: 8.53001e-06 [parallel]: 2.048e-05 [flash_sp]: 1.033e-05 [merge_comm]: 5.27001e-06 [allreduce_fusion]: 4.59998e-06 [matmul_add_comm_reduction]: 1.115e-05 [allreduce_slice_to_reducescatter]: 6.79982e-07 [virtual_shard_identity]: 1.171e-05 [virtual_dataset]: 8.54998e-06 [get_grad_eliminate_]: 7.65998e-06 [virtual_output]: 7.66999e-06 [merge_forward]: 4.66002e-06 [cell_reuse_recompute_pass]: 1.32999e-06 [offload_activation]: 1.205e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.756e-05 [merge_recompute_call_nodes]: 1.84e-06 [before_grad]: 1.377e-05 [set_forward_comm_id_for_comm_node_pass]: 4.65999e-06 [meta_fg_expand]: 3.95998e-06 [flash_sp_send_recv_attached]: 2.48e-06 [receive_attached]: 2.42001e-06 [after_resolve]: 1.466e-05 [a_after_grad]: 1.231e-05 [renormalize]: 0.0009716 [add_forward_monad_depend]: 8.12e-06 [auto_monad_grad]: 2.69001e-06 [auto_monad_eliminator]: 2.132e-05 [cse]: 4.027e-05 [a_3]: 6.795e-05 [Cycle 2]: 0.00087895, [45] [expand_dump_flag]: 2.16e-06 [switch_simplify]: 1.028e-05 [loop_unroll]: 7.61001e-06 [a_1]: 0.00019434 [with_stream_mark]: 1.756e-05 [recompute_prepare]: 7.68999e-06 [updatestate_depend_eliminate]: 4.31002e-06 [updatestate_assign_eliminate]: 3.24001e-06 [updatestate_loads_eliminate]: 3.04999e-06 [parameter_eliminate]: 1.72999e-06 [a_2]: 9.406e-05 [accelerated_algorithm]: 7.95e-06 [shard]: 1.76003e-06 [meta_shard_fg_expand]: 2.52001e-06 [shard_inline]: 8.17e-06 [merge_send_recv]: 7.71001e-06 [auto_parallel]: 9.04e-06 [parallel]: 7.64002e-06 [flash_sp]: 3.86999e-06 [merge_comm]: 4.65999e-06 [allreduce_fusion]: 4.35e-06 [matmul_add_comm_reduction]: 8.67998e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 9.19998e-06 [virtual_dataset]: 8.18999e-06 [get_grad_eliminate_]: 7.48e-06 [virtual_output]: 7.97003e-06 [merge_forward]: 4.77e-06 [cell_reuse_recompute_pass]: 3.13e-06 [offload_activation]: 9.98998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.74e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.285e-05 [set_forward_comm_id_for_comm_node_pass]: 4.75999e-06 [meta_fg_expand]: 3.03998e-06 [flash_sp_send_recv_attached]: 1.34e-06 [receive_attached]: 2.01e-06 [after_resolve]: 1.372e-05 [a_after_grad]: 1.128e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.64e-06 [auto_monad_grad]: 1.53002e-06 [auto_monad_eliminator]: 1.159e-05 [cse]: 2.448e-05 [a_3]: 7.776e-05 [py_interpret_to_execute_after_opt_a]: 1.734e-05 [slice_cell_reuse_recomputed_activation]: 2.12999e-06 [rewriter_after_opt_a]: 4.72e-05 [convert_after_rewriter]: 8.57e-06 [order_py_execute_after_rewriter]: 6.32001e-06 [mutable_eliminate]: 0.0007902 [opt_b]: 0.00028058, [1] [Cycle 1]: 0.00027252, [7] [b_1]: 0.00017143 [b_2]: 1.139e-05 [updatestate_depend_eliminate]: 9.22999e-06 [updatestate_assign_eliminate]: 3.5e-06 [updatestate_loads_eliminate]: 3.06001e-06 [renormalize]: 8.30012e-07 [cse]: 3.291e-05 [optimize_parallel_all_gather_comm]: 2.233e-05 [overlap_param_gather]: 1.96e-06 [cconv]: 3.317e-05 [loop_unroll]: 0.00061883 [opt_after_cconv]: 0.0001351, [1] [Cycle 1]: 0.00012775, [7] [c_1]: 4.046e-05 [parameter_eliminate]: 4.88001e-06 [updatestate_depend_eliminate]: 7.05e-06 [updatestate_assign_eliminate]: 3.6e-06 [updatestate_loads_eliminate]: 2.89999e-06 [cse]: 3.072e-05 [renormalize]: 8.09989e-07 [remove_dup_value]: 1.726e-05 [tuple_transform]: 0.00010196, [1] [Cycle 1]: 9.691e-05, [4] [d_1]: 6.5e-05 [none_parameter_eliminate]: 1.69998e-06 [renormalize]: 2.80008e-07 [switch_simplify]: 9.09998e-06 [partial_unused_args_eliminate]: 2.24001e-06 [add_recomputation]: 6.753e-05 [cse_after_recomputation]: 2.939e-05, [1] [Cycle 1]: 2.37e-05, [1] [cse]: 1.737e-05 [environ_conv]: 8.21002e-06 [swap_dp_allreduce_reducescatter]: 6.71e-06 [bias_add_comm_swap]: 3.28998e-06 [label_micro_interleaved_index]: 5.25999e-06 [label_fine_grained_interleaved_index]: 2.83998e-06 [merge_cast_opt]: 1.94e-06 [slice_recompute_activation]: 2.16e-06 [micro_interleaved_order_control]: 2.34999e-06 [assign_add_opt]: 1.39998e-06 [ForceFp32Comm]: 1.05999e-06 [remove_cast_before_assign_add]: 1.20001e-06 [full_micro_interleaved_order_control]: 2.16e-06 [reorder_send_recv_between_fp_bp]: 2.74999e-06 [comm_op_add_attrs]: 1.05001e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.17e-06 [overlap_opt_shard_in_pipeline]: 1.64998e-06 [overlap_opt_shard_grad_in_pipeline]: 2.12999e-06 [control_data_broadcast_order]: 1.641e-05 [grouped_pairwise_exchange_alltoall]: 1.89999e-06 [offloading_packed_experts]: 5.27001e-06 [overlap_recompute_and_grad_model_parallel]: 5.83002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.27e-06 [overlap_recompute_allgather_and_fa_grad]: 1.41002e-06 [overlap_recompute_comm]: 2.78e-06 [overlap_grad_ring_attention]: 4.61002e-06 [overlap_grad_flash_sp]: 2.549e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 2.44999e-06 [split_layernorm_comm]: 1.73002e-06 [handle_group_info]: 1.15001e-06 [symbol_engine_optimizer]: 9.297e-05, [1] [Cycle 1]: 8.809e-05, [6] [build]: 3.63e-06 [elim_shapecalc]: 1.307e-05 [elim_not_effective]: 1.605e-05 [opt_reshape]: 9.47999e-06 [fold_const_symbol]: 1.387e-05 [renormalize]: 3.30008e-07 [detach_backward]: 2.16998e-06 [pipeline_parallel_scheduler]: 1.69e-06 [auto_monad_reorder]: 2.15e-05 [get_jit_bprop_graph]: 2.99001e-06 [rewriter_after_jit_bprop_graph]: 5.69999e-06 [opt_after_jit_grad]: 0.00065984 [validate]: 5.241e-05 Sums bootstrap : 0.000439s : 3.47% type_inference : 0.005925s : 46.83% event_method : 0.000020s : 0.16% auto_monad : 0.000067s : 0.53% graph_reusing : 0.000006s : 0.04% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000037s : 0.30% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.23% optimize.rewriter_before_opt_a : 0.000092s : 0.73% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.43% optimize.opt_a.loop_unroll : 0.000042s : 0.33% optimize.opt_a.a_1 : 0.000991s : 7.84% optimize.opt_a.with_stream_mark : 0.000040s : 0.32% optimize.opt_a.recompute_prepare : 0.000020s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000204s : 1.61% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.13% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.13% optimize.opt_a.merge_send_recv : 0.000018s : 0.14% optimize.opt_a.auto_parallel : 0.000018s : 0.14% optimize.opt_a.parallel : 0.000028s : 0.22% optimize.opt_a.flash_sp : 0.000014s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.17% optimize.opt_a.virtual_dataset : 0.000017s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000016s : 0.12% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.04% optimize.opt_a.offload_activation : 0.000022s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000027s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000028s : 0.22% optimize.opt_a.a_after_grad : 0.000024s : 0.19% optimize.opt_a.renormalize : 0.000972s : 7.68% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.26% optimize.opt_a.cse : 0.000065s : 0.51% optimize.opt_a.a_3 : 0.000146s : 1.15% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000047s : 0.37% optimize.convert_after_rewriter : 0.000009s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000790s : 6.25% optimize.opt_b.b_1 : 0.000171s : 1.35% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000033s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000033s : 0.26% optimize.loop_unroll : 0.000619s : 4.89% optimize.opt_after_cconv.c_1 : 0.000040s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000031s : 0.24% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000017s : 0.14% optimize.tuple_transform.d_1 : 0.000065s : 0.51% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000068s : 0.53% optimize.cse_after_recomputation.cse : 0.000017s : 0.14% optimize.environ_conv : 0.000008s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000002s : 0.02% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000025s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.17% get_jit_bprop_graph : 0.000003s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000660s : 5.22% validate : 0.000052s : 0.41% Time group info: ------[substitution.] 0.000237 38 11.73% : 0.000028s : 3: substitution.cast_eliminate 1.03% : 0.000002s : 3: substitution.elim_not_effective 0.84% : 0.000002s : 3: substitution.fold_const_symbol 3.43% : 0.000008s : 5: substitution.graph_param_transform 67.27% : 0.000159s : 4: substitution.inline 2.39% : 0.000006s : 6: substitution.j_node_and_user_rematch 3.37% : 0.000008s : 6: substitution.remove_not_recompute_node 2.99% : 0.000007s : 4: substitution.replace_old_param 6.96% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005858 2 87.35% : 0.005117s : 1: type_inference.infer 12.65% : 0.000741s : 1: type_inference.specialize ------[replace.] 0.000067 8 60.01% : 0.000040s : 4: replace.inline 39.99% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000171 8 91.67% : 0.000157s : 4: match.inline 8.33% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000287 1596 0.99% : 0.000003s : 17: predicate.accumulaten_eliminater 1.00% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000002s : 10: predicate.addn_check_dump 0.98% : 0.000003s : 17: predicate.addn_zero_filter 0.86% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.97% : 0.000006s : 27: predicate.arithmetic_simplify 1.12% : 0.000003s : 17: predicate.cast_eliminate 0.62% : 0.000002s : 10: predicate.check_bprop_eliminate 0.61% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000001s : 5: predicate.const_output_eliminate 0.77% : 0.000002s : 10: predicate.depend_value_elim 1.05% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.07% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.89% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.99% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 0.40% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.38% : 0.000004s : 22: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_depend_swap 1.65% : 0.000005s : 32: predicate.environ_get_eliminate 1.05% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.30% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.45% : 0.000007s : 25: predicate.float_depend_g_call 0.62% : 0.000002s : 10: predicate.float_environ_get_switch 0.78% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.58% : 0.000002s : 10: predicate.get_grad_eliminate 0.17% : 0.000000s : 5: predicate.graph_param_transform 0.57% : 0.000002s : 10: predicate.incorporate_call 0.47% : 0.000001s : 10: predicate.incorporate_call_switch 5.84% : 0.000017s : 72: predicate.inline 0.67% : 0.000002s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.83% : 0.000002s : 10: predicate.less_batch_normalization 1.82% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.60% : 0.000007s : 48: predicate.load_eliminater 0.98% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.86% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.64% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.55% : 0.000002s : 10: predicate.merge_addn 0.60% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.65% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 17: predicate.minmaximum_grad 1.22% : 0.000004s : 5: predicate.mutable_eliminate 0.43% : 0.000001s : 5: predicate.opt_reshape 0.43% : 0.000001s : 5: predicate.parallel_virtual_node 1.73% : 0.000005s : 25: predicate.partial_defer_inline 1.56% : 0.000004s : 26: predicate.partial_eliminate 0.95% : 0.000003s : 17: predicate.print_const_string_wrapper 0.53% : 0.000002s : 10: predicate.reduce_all_const_elim 1.39% : 0.000004s : 17: predicate.reduce_eliminate 2.48% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.32% : 0.000001s : 10: predicate.remove_not_recompute_node 1.27% : 0.000004s : 31: predicate.replace_applicator 0.52% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 1.03% : 0.000003s : 17: predicate.reshape_eliminate 0.74% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 5: predicate.row_tensor_eliminate 1.00% : 0.000003s : 10: predicate.same_eliminate 0.35% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.98% : 0.000003s : 10: predicate.shard_identity_eliminate 0.80% : 0.000002s : 10: predicate.special_op_eliminate 0.63% : 0.000002s : 10: predicate.specialize_transform 1.05% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.45% : 0.000004s : 25: predicate.switch_defer_inline 1.91% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.40% : 0.000013s : 76: predicate.switch_simplify 0.90% : 0.000003s : 17: predicate.tile_eliminate 0.94% : 0.000003s : 17: predicate.transpose_eliminate 1.63% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.67% : 0.000005s : 27: predicate.tuple_list_get_item_depend_reorder 3.63% : 0.000010s : 41: predicate.tuple_list_get_item_eliminator 1.53% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.55% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.88% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.38% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.12% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 5: predicate.value_based_eliminate 0.69% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000593 11 51.71% : 0.000307s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.29% : 0.000286s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.029330 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.15% : 0.003271s : 1: add_attr 11.11% : 0.003258s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.25% : 0.000072s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.25% : 0.000072s : 1: auto_monad 0.09% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.59% : 0.000467s : 1: bootstrap 0.13% : 0.000037s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.11% : 0.000033s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.09% : 0.000027s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000006s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000005s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 2.15% : 0.000631s : 1: loop_unroll 0.02% : 0.000005s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.74% : 0.000803s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000023s : 1: opt.transform.mutable_eliminate 5.35% : 0.001569s : 78: opt.transform.opt_a 0.13% : 0.000039s : 1: opt.transform.opt_after_cconv 0.13% : 0.000039s : 1: opt.transform.opt_after_jit_grad 0.49% : 0.000145s : 28: opt.transform.opt_b 0.24% : 0.000071s : 2: opt.transform.opt_trans_graph 0.16% : 0.000048s : 4: opt.transform.symbol_engine_opt 12.08% : 0.003544s : 1: opt_a 0.47% : 0.000139s : 1: opt_after_cconv 2.30% : 0.000674s : 1: opt_after_jit_grad 0.97% : 0.000285s : 1: opt_b 21.52% : 0.006312s : 1: optimize 0.09% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000010s : 1: order_py_execute_after_rewriter 0.10% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000041s : 1: pre_auto_parallel 0.11% : 0.000033s : 1: py_interpret_to_execute 0.07% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000005s : 1: remove_cast_before_assign_add 0.07% : 0.000021s : 1: remove_dup_value 1.84% : 0.000541s : 1: renormalize.infer 1.43% : 0.000421s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000051s : 1: rewriter_after_opt_a 0.33% : 0.000096s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000096s : 1: symbol_engine_optimizer 0.36% : 0.000105s : 1: tuple_transform 20.27% : 0.005945s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:02.879.586 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:02.879.876 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0182456, [21] [bootstrap]: 0.00049956 [type_inference]: 0.00647711 [event_method]: 2.117e-05 [auto_monad]: 6.653e-05 [graph_reusing]: 5.61e-06 [inline]: 3.3e-06 [add_attr]: 0.00338342, [1] [add_attr_with_inline]: 0.00337398, [1] [Cycle 1]: 7.69e-05, [2] [tag_attr]: 2.13e-05 [meta_addattr_fg_expand]: 5.87001e-06 [parallel-infer-symbol]: 3.23998e-06 [pre_auto_parallel]: 3.603e-05 [insert-virtual-dataset]: 2.75997e-06 [parallel-infer-symbol-second]: 7.79983e-07 [dataset_repeat_opt]: 2.22999e-06 [pipeline_split]: 1.74998e-06 [optimize]: 0.00623307, [53] [py_interpret_to_execute]: 3.04e-05 [rewriter_before_opt_a]: 9.259e-05 [opt_a]: 0.00345735, [2] [Cycle 1]: 0.00249957, [45] [expand_dump_flag]: 2.96001e-06 [switch_simplify]: 4.401e-05 [loop_unroll]: 3.156e-05 [a_1]: 0.00073414 [with_stream_mark]: 1.579e-05 [recompute_prepare]: 1.011e-05 [updatestate_depend_eliminate]: 4.25e-06 [updatestate_assign_eliminate]: 3.94002e-06 [updatestate_loads_eliminate]: 3.76999e-06 [parameter_eliminate]: 1.99999e-06 [a_2]: 0.00012965 [accelerated_algorithm]: 8.47e-06 [shard]: 2.19001e-06 [meta_shard_fg_expand]: 1.98002e-06 [shard_inline]: 7.5e-06 [merge_send_recv]: 9.86e-06 [auto_parallel]: 6.63998e-06 [parallel]: 1.836e-05 [flash_sp]: 8.66002e-06 [merge_comm]: 4.94998e-06 [allreduce_fusion]: 4.25e-06 [matmul_add_comm_reduction]: 1.033e-05 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 9.47999e-06 [virtual_dataset]: 8.18001e-06 [get_grad_eliminate_]: 8.22e-06 [virtual_output]: 8.05999e-06 [merge_forward]: 4.62998e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 1.153e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.917e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 1.301e-05 [set_forward_comm_id_for_comm_node_pass]: 4.75999e-06 [meta_fg_expand]: 3.55e-06 [flash_sp_send_recv_attached]: 2.88e-06 [receive_attached]: 2.32999e-06 [after_resolve]: 1.333e-05 [a_after_grad]: 1.287e-05 [renormalize]: 0.0007887 [add_forward_monad_depend]: 5.18002e-06 [auto_monad_grad]: 2.79001e-06 [auto_monad_eliminator]: 1.641e-05 [cse]: 3.576e-05 [a_3]: 7.044e-05 [Cycle 2]: 0.00094427, [45] [expand_dump_flag]: 1.06002e-06 [switch_simplify]: 9.22001e-06 [loop_unroll]: 7.8e-06 [a_1]: 0.00017676 [with_stream_mark]: 1.275e-05 [recompute_prepare]: 7.66999e-06 [updatestate_depend_eliminate]: 4.13001e-06 [updatestate_assign_eliminate]: 3.10998e-06 [updatestate_loads_eliminate]: 2.99001e-06 [parameter_eliminate]: 1.15999e-06 [a_2]: 0.00011979 [accelerated_algorithm]: 7.85998e-06 [shard]: 1.49003e-06 [meta_shard_fg_expand]: 1.91003e-06 [shard_inline]: 1.067e-05 [merge_send_recv]: 5.84e-06 [auto_parallel]: 6.88e-06 [parallel]: 4.67e-06 [flash_sp]: 3.81999e-06 [merge_comm]: 4.15999e-06 [allreduce_fusion]: 4.35999e-06 [matmul_add_comm_reduction]: 7.12002e-06 [allreduce_slice_to_reducescatter]: 5.00004e-07 [virtual_shard_identity]: 8.70001e-06 [virtual_dataset]: 7.41999e-06 [get_grad_eliminate_]: 7.16999e-06 [virtual_output]: 7.02002e-06 [merge_forward]: 5.43002e-06 [cell_reuse_recompute_pass]: 1.47001e-06 [offload_activation]: 7.63999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.675e-05 [merge_recompute_call_nodes]: 1.29998e-06 [before_grad]: 1.288e-05 [set_forward_comm_id_for_comm_node_pass]: 4.68001e-06 [meta_fg_expand]: 2.88e-06 [flash_sp_send_recv_attached]: 9.50007e-07 [receive_attached]: 1.03001e-06 [after_resolve]: 1.19e-05 [a_after_grad]: 1.139e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.42999e-06 [auto_monad_grad]: 1.05999e-06 [auto_monad_eliminator]: 9.63997e-06 [cse]: 2.042e-05 [a_3]: 5.894e-05 [py_interpret_to_execute_after_opt_a]: 1.59e-05 [slice_cell_reuse_recomputed_activation]: 5.30999e-06 [rewriter_after_opt_a]: 4.813e-05 [convert_after_rewriter]: 1.051e-05 [order_py_execute_after_rewriter]: 8.99998e-06 [mutable_eliminate]: 0.00064697 [opt_b]: 0.00032858, [1] [Cycle 1]: 0.00031657, [7] [b_1]: 0.00020412 [b_2]: 9.74999e-06 [updatestate_depend_eliminate]: 7.41001e-06 [updatestate_assign_eliminate]: 3.48999e-06 [updatestate_loads_eliminate]: 3.65998e-06 [renormalize]: 5.89993e-07 [cse]: 3.031e-05 [optimize_parallel_all_gather_comm]: 2.4e-05 [overlap_param_gather]: 5.43002e-06 [cconv]: 3.37e-05 [loop_unroll]: 0.00055268 [opt_after_cconv]: 0.00015615, [1] [Cycle 1]: 0.00014535, [7] [c_1]: 3.987e-05 [parameter_eliminate]: 4.85001e-06 [updatestate_depend_eliminate]: 6.73e-06 [updatestate_assign_eliminate]: 3.53e-06 [updatestate_loads_eliminate]: 3.33e-06 [cse]: 2.916e-05 [renormalize]: 3.30008e-07 [remove_dup_value]: 2.032e-05 [tuple_transform]: 0.00010937, [1] [Cycle 1]: 0.00010125, [4] [d_1]: 5.938e-05 [none_parameter_eliminate]: 1.87999e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 8.69e-06 [partial_unused_args_eliminate]: 4.73001e-06 [add_recomputation]: 6.586e-05 [cse_after_recomputation]: 6.74e-05, [1] [Cycle 1]: 5.906e-05, [1] [cse]: 4.683e-05 [environ_conv]: 1.119e-05 [swap_dp_allreduce_reducescatter]: 1.024e-05 [bias_add_comm_swap]: 5.59998e-06 [label_micro_interleaved_index]: 8.54e-06 [label_fine_grained_interleaved_index]: 5.58002e-06 [merge_cast_opt]: 3.94002e-06 [slice_recompute_activation]: 4.47998e-06 [micro_interleaved_order_control]: 4.87e-06 [assign_add_opt]: 3.89002e-06 [ForceFp32Comm]: 3.23998e-06 [remove_cast_before_assign_add]: 3.7e-06 [full_micro_interleaved_order_control]: 5.00999e-06 [reorder_send_recv_between_fp_bp]: 5.64e-06 [comm_op_add_attrs]: 3.38999e-06 [add_comm_op_reuse_tag]: 3.28998e-06 [interleave_split_concat_branches]: 3.68e-06 [interleave_parallel_branches]: 3.63999e-06 [overlap_opt_shard_in_pipeline]: 3.97e-06 [overlap_opt_shard_grad_in_pipeline]: 4.53999e-06 [control_data_broadcast_order]: 2.033e-05 [grouped_pairwise_exchange_alltoall]: 4.17e-06 [offloading_packed_experts]: 7.01001e-06 [overlap_recompute_and_grad_model_parallel]: 7.78001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.91999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.64002e-06 [overlap_recompute_comm]: 4.84e-06 [overlap_grad_ring_attention]: 7.36999e-06 [overlap_grad_flash_sp]: 2.863e-05 [begin_end_overlap_inline]: 2.97002e-06 [split_matmul_comm_elemetwise]: 4.57e-06 [split_layernorm_comm]: 4.04002e-06 [handle_group_info]: 3.66001e-06 [symbol_engine_optimizer]: 0.00012175, [1] [Cycle 1]: 0.00011343, [6] [build]: 3.65998e-06 [elim_shapecalc]: 1.433e-05 [elim_not_effective]: 1.91e-05 [opt_reshape]: 9.86e-06 [fold_const_symbol]: 1.372e-05 [renormalize]: 2.9002e-07 [detach_backward]: 5.02999e-06 [pipeline_parallel_scheduler]: 1.77999e-06 [auto_monad_reorder]: 2.675e-05 [get_jit_bprop_graph]: 2.31998e-06 [rewriter_after_jit_bprop_graph]: 7.15e-06 [opt_after_jit_grad]: 0.00073172 [validate]: 5.525e-05 Sums bootstrap : 0.000500s : 3.85% type_inference : 0.006477s : 49.88% event_method : 0.000021s : 0.16% auto_monad : 0.000067s : 0.51% graph_reusing : 0.000006s : 0.04% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000036s : 0.28% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.23% optimize.rewriter_before_opt_a : 0.000093s : 0.71% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000053s : 0.41% optimize.opt_a.loop_unroll : 0.000039s : 0.30% optimize.opt_a.a_1 : 0.000911s : 7.01% optimize.opt_a.with_stream_mark : 0.000029s : 0.22% optimize.opt_a.recompute_prepare : 0.000018s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000249s : 1.92% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000018s : 0.14% optimize.opt_a.merge_send_recv : 0.000016s : 0.12% optimize.opt_a.auto_parallel : 0.000014s : 0.10% optimize.opt_a.parallel : 0.000023s : 0.18% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.07% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.14% optimize.opt_a.virtual_dataset : 0.000016s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.12% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.19% optimize.opt_a.a_after_grad : 0.000024s : 0.19% optimize.opt_a.renormalize : 0.000789s : 6.07% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.05% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.20% optimize.opt_a.cse : 0.000056s : 0.43% optimize.opt_a.a_3 : 0.000129s : 1.00% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000048s : 0.37% optimize.convert_after_rewriter : 0.000011s : 0.08% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000647s : 4.98% optimize.opt_b.b_1 : 0.000204s : 1.57% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000034s : 0.26% optimize.loop_unroll : 0.000553s : 4.26% optimize.opt_after_cconv.c_1 : 0.000040s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000029s : 0.22% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.16% optimize.tuple_transform.d_1 : 0.000059s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000066s : 0.51% optimize.cse_after_recomputation.cse : 0.000047s : 0.36% optimize.environ_conv : 0.000011s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.08% optimize.bias_add_comm_swap : 0.000006s : 0.04% optimize.label_micro_interleaved_index : 0.000009s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.03% optimize.control_data_broadcast_order : 0.000020s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000029s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000027s : 0.21% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000007s : 0.06% opt_after_jit_grad : 0.000732s : 5.63% validate : 0.000055s : 0.43% Time group info: ------[substitution.] 0.000209 38 11.20% : 0.000023s : 3: substitution.cast_eliminate 1.22% : 0.000003s : 3: substitution.elim_not_effective 0.83% : 0.000002s : 3: substitution.fold_const_symbol 3.43% : 0.000007s : 5: substitution.graph_param_transform 68.82% : 0.000144s : 4: substitution.inline 2.19% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.03% : 0.000006s : 6: substitution.remove_not_recompute_node 2.26% : 0.000005s : 4: substitution.replace_old_param 7.01% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006421 2 88.72% : 0.005696s : 1: type_inference.infer 11.28% : 0.000725s : 1: type_inference.specialize ------[replace.] 0.000062 8 60.31% : 0.000037s : 4: replace.inline 39.69% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000154 8 91.80% : 0.000141s : 4: match.inline 8.20% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000266 1596 0.86% : 0.000002s : 17: predicate.accumulaten_eliminater 1.27% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 10: predicate.addn_check_dump 0.86% : 0.000002s : 17: predicate.addn_zero_filter 0.85% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.04% : 0.000005s : 27: predicate.arithmetic_simplify 1.07% : 0.000003s : 17: predicate.cast_eliminate 0.64% : 0.000002s : 10: predicate.check_bprop_eliminate 0.53% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.58% : 0.000002s : 10: predicate.depend_value_elim 0.96% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.04% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.52% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_depend_swap 1.68% : 0.000004s : 32: predicate.environ_get_eliminate 1.11% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.35% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.23% : 0.000006s : 25: predicate.float_depend_g_call 0.56% : 0.000001s : 10: predicate.float_environ_get_switch 0.93% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.21% : 0.000001s : 5: predicate.fold_const_symbol 0.62% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.58% : 0.000002s : 10: predicate.incorporate_call 0.49% : 0.000001s : 10: predicate.incorporate_call_switch 6.11% : 0.000016s : 72: predicate.inline 0.77% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 10: predicate.less_batch_normalization 1.85% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.52% : 0.000007s : 48: predicate.load_eliminater 1.15% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.08% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.73% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 10: predicate.merge_addn 0.62% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.68% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 17: predicate.minmaximum_grad 1.21% : 0.000003s : 5: predicate.mutable_eliminate 0.45% : 0.000001s : 5: predicate.opt_reshape 0.42% : 0.000001s : 5: predicate.parallel_virtual_node 1.59% : 0.000004s : 25: predicate.partial_defer_inline 1.65% : 0.000004s : 26: predicate.partial_eliminate 0.91% : 0.000002s : 17: predicate.print_const_string_wrapper 0.58% : 0.000002s : 10: predicate.reduce_all_const_elim 1.23% : 0.000003s : 17: predicate.reduce_eliminate 2.59% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 10: predicate.remove_not_recompute_node 1.24% : 0.000003s : 31: predicate.replace_applicator 0.47% : 0.000001s : 10: predicate.replace_old_param 0.42% : 0.000001s : 5: predicate.reset_defer_inline 1.06% : 0.000003s : 17: predicate.reshape_eliminate 0.58% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 5: predicate.row_tensor_eliminate 0.84% : 0.000002s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.70% : 0.000002s : 10: predicate.shard_identity_eliminate 0.90% : 0.000002s : 10: predicate.special_op_eliminate 0.68% : 0.000002s : 10: predicate.specialize_transform 0.79% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.47% : 0.000004s : 25: predicate.switch_defer_inline 2.14% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.65% : 0.000012s : 76: predicate.switch_simplify 0.93% : 0.000002s : 17: predicate.tile_eliminate 0.95% : 0.000003s : 17: predicate.transpose_eliminate 1.76% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.33% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.59% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.75% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.44% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.09% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.57% : 0.000002s : 5: predicate.value_based_eliminate 0.61% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.61% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000581 11 55.59% : 0.000323s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.41% : 0.000258s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030304 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.20% : 0.003394s : 1: add_attr 11.15% : 0.003378s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000069s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.25% : 0.000075s : 1: auto_monad 0.11% : 0.000034s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.85% : 0.000559s : 1: bootstrap 0.12% : 0.000037s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000023s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.23% : 0.000071s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000023s : 1: detach_backward 0.05% : 0.000015s : 1: environ_conv 0.10% : 0.000031s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000012s : 1: label_micro_interleaved_index 1.85% : 0.000560s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.16% : 0.000655s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000021s : 1: opt.transform.mutable_eliminate 4.74% : 0.001436s : 78: opt.transform.opt_a 0.13% : 0.000038s : 1: opt.transform.opt_after_cconv 0.13% : 0.000039s : 1: opt.transform.opt_after_jit_grad 0.46% : 0.000139s : 28: opt.transform.opt_b 0.22% : 0.000066s : 2: opt.transform.opt_trans_graph 0.17% : 0.000053s : 4: opt.transform.symbol_engine_opt 11.42% : 0.003461s : 1: opt_a 0.53% : 0.000160s : 1: opt_after_cconv 2.46% : 0.000746s : 1: opt_after_jit_grad 1.10% : 0.000332s : 1: opt_b 21.88% : 0.006631s : 1: optimize 0.09% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000032s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000012s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000043s : 1: pre_auto_parallel 0.11% : 0.000034s : 1: py_interpret_to_execute 0.06% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.08% : 0.000024s : 1: remove_dup_value 1.45% : 0.000439s : 1: renormalize.infer 1.13% : 0.000342s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000052s : 1: rewriter_after_opt_a 0.32% : 0.000096s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.41% : 0.000125s : 1: symbol_engine_optimizer 0.37% : 0.000112s : 1: tuple_transform 21.52% : 0.006521s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:03.182.446 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0176311, [21] [bootstrap]: 0.00045969 [type_inference]: 0.00661283 [event_method]: 2.122e-05 [auto_monad]: 6.798e-05 [graph_reusing]: 5.66e-06 [inline]: 2.79001e-06 [add_attr]: 0.00382574, [1] [add_attr_with_inline]: 0.00381193, [1] [Cycle 1]: 7.787e-05, [2] [tag_attr]: 2.334e-05 [meta_addattr_fg_expand]: 6.36998e-06 [parallel-infer-symbol]: 3.86999e-06 [pre_auto_parallel]: 4.244e-05 [insert-virtual-dataset]: 2.53998e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 2.08002e-06 [pipeline_split]: 1.81003e-06 [optimize]: 0.00578827, [53] [py_interpret_to_execute]: 3.073e-05 [rewriter_before_opt_a]: 9.79e-05 [opt_a]: 0.00331401, [2] [Cycle 1]: 0.00250863, [45] [expand_dump_flag]: 3.33e-06 [switch_simplify]: 4.54e-05 [loop_unroll]: 3.165e-05 [a_1]: 0.00081253 [with_stream_mark]: 1.997e-05 [recompute_prepare]: 1.242e-05 [updatestate_depend_eliminate]: 4.72e-06 [updatestate_assign_eliminate]: 3.73001e-06 [updatestate_loads_eliminate]: 3.81999e-06 [parameter_eliminate]: 1.99999e-06 [a_2]: 0.00010351 [accelerated_algorithm]: 9.04003e-06 [shard]: 2.09e-06 [meta_shard_fg_expand]: 2.32999e-06 [shard_inline]: 7.76001e-06 [merge_send_recv]: 9.29e-06 [auto_parallel]: 7.45e-06 [parallel]: 2e-05 [flash_sp]: 9.15999e-06 [merge_comm]: 4.58999e-06 [allreduce_fusion]: 4.24002e-06 [matmul_add_comm_reduction]: 1.083e-05 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 1.016e-05 [virtual_dataset]: 8.42e-06 [get_grad_eliminate_]: 7.53e-06 [virtual_output]: 7.91001e-06 [merge_forward]: 4.77998e-06 [cell_reuse_recompute_pass]: 1.25001e-06 [offload_activation]: 1.182e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.703e-05 [merge_recompute_call_nodes]: 1.89999e-06 [before_grad]: 1.319e-05 [set_forward_comm_id_for_comm_node_pass]: 5.05999e-06 [meta_fg_expand]: 3.50998e-06 [flash_sp_send_recv_attached]: 2.38998e-06 [receive_attached]: 2.32001e-06 [after_resolve]: 1.353e-05 [a_after_grad]: 1.202e-05 [renormalize]: 0.00087229 [add_forward_monad_depend]: 5.76998e-06 [auto_monad_grad]: 2.50997e-06 [auto_monad_eliminator]: 1.784e-05 [cse]: 3.858e-05 [a_3]: 6.046e-05 [Cycle 2]: 0.0007934, [45] [expand_dump_flag]: 1.44998e-06 [switch_simplify]: 9.92999e-06 [loop_unroll]: 7.41001e-06 [a_1]: 0.00018218 [with_stream_mark]: 1.505e-05 [recompute_prepare]: 8.1e-06 [updatestate_depend_eliminate]: 3.94002e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 3.60998e-06 [parameter_eliminate]: 1.45999e-06 [a_2]: 9.545e-05 [accelerated_algorithm]: 7.68999e-06 [shard]: 1.37e-06 [meta_shard_fg_expand]: 1.69e-06 [shard_inline]: 7.81001e-06 [merge_send_recv]: 7.05e-06 [auto_parallel]: 7.21999e-06 [parallel]: 6.35997e-06 [flash_sp]: 3.48999e-06 [merge_comm]: 4.57e-06 [allreduce_fusion]: 5.10001e-06 [matmul_add_comm_reduction]: 6.81001e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 8.45999e-06 [virtual_dataset]: 7.75e-06 [get_grad_eliminate_]: 6.76e-06 [virtual_output]: 6.74999e-06 [merge_forward]: 3.32002e-06 [cell_reuse_recompute_pass]: 1.92999e-06 [offload_activation]: 9.39998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.512e-05 [merge_recompute_call_nodes]: 1.10001e-06 [before_grad]: 1.366e-05 [set_forward_comm_id_for_comm_node_pass]: 4.29997e-06 [meta_fg_expand]: 3.15998e-06 [flash_sp_send_recv_attached]: 1.05001e-06 [receive_attached]: 1.30001e-06 [after_resolve]: 1.168e-05 [a_after_grad]: 1.198e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.02999e-06 [auto_monad_grad]: 1.00001e-06 [auto_monad_eliminator]: 9.17999e-06 [cse]: 1.978e-05 [a_3]: 4.555e-05 [py_interpret_to_execute_after_opt_a]: 1.366e-05 [slice_cell_reuse_recomputed_activation]: 2.02001e-06 [rewriter_after_opt_a]: 4.228e-05 [convert_after_rewriter]: 7.75998e-06 [order_py_execute_after_rewriter]: 6.07001e-06 [mutable_eliminate]: 0.0006659 [opt_b]: 0.00028339, [1] [Cycle 1]: 0.00027642, [7] [b_1]: 0.00017279 [b_2]: 1.131e-05 [updatestate_depend_eliminate]: 7.98001e-06 [updatestate_assign_eliminate]: 3.3e-06 [updatestate_loads_eliminate]: 3.13e-06 [renormalize]: 4.19997e-07 [cse]: 2.836e-05 [optimize_parallel_all_gather_comm]: 1.918e-05 [overlap_param_gather]: 1.87001e-06 [cconv]: 2.898e-05 [loop_unroll]: 0.00049795 [opt_after_cconv]: 0.00013085, [1] [Cycle 1]: 0.00012458, [7] [c_1]: 3.999e-05 [parameter_eliminate]: 5.18002e-06 [updatestate_depend_eliminate]: 7.25e-06 [updatestate_assign_eliminate]: 3.75e-06 [updatestate_loads_eliminate]: 3.41999e-06 [cse]: 2.715e-05 [renormalize]: 7.29982e-07 [remove_dup_value]: 1.649e-05 [tuple_transform]: 9.195e-05, [1] [Cycle 1]: 8.682e-05, [4] [d_1]: 5.527e-05 [none_parameter_eliminate]: 2.37001e-06 [renormalize]: 2.79979e-07 [switch_simplify]: 9.20001e-06 [partial_unused_args_eliminate]: 2.36e-06 [add_recomputation]: 6.553e-05 [cse_after_recomputation]: 2.759e-05, [1] [Cycle 1]: 2.216e-05, [1] [cse]: 1.63e-05 [environ_conv]: 7.18998e-06 [swap_dp_allreduce_reducescatter]: 5.91e-06 [bias_add_comm_swap]: 3.16999e-06 [label_micro_interleaved_index]: 5.14e-06 [label_fine_grained_interleaved_index]: 2.99001e-06 [merge_cast_opt]: 1.42e-06 [slice_recompute_activation]: 2.25002e-06 [micro_interleaved_order_control]: 2.52001e-06 [assign_add_opt]: 1.29998e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 9.89996e-07 [full_micro_interleaved_order_control]: 2.14e-06 [reorder_send_recv_between_fp_bp]: 3.33e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.07e-06 [interleave_parallel_branches]: 1.24e-06 [overlap_opt_shard_in_pipeline]: 1.22e-06 [overlap_opt_shard_grad_in_pipeline]: 2.09999e-06 [control_data_broadcast_order]: 1.656e-05 [grouped_pairwise_exchange_alltoall]: 1.50999e-06 [offloading_packed_experts]: 4.72e-06 [overlap_recompute_and_grad_model_parallel]: 5.39e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.46998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.50999e-06 [overlap_recompute_comm]: 2.46e-06 [overlap_grad_ring_attention]: 4.50001e-06 [overlap_grad_flash_sp]: 2.514e-05 [begin_end_overlap_inline]: 6.00005e-07 [split_matmul_comm_elemetwise]: 2.54999e-06 [split_layernorm_comm]: 1.64998e-06 [handle_group_info]: 1.05999e-06 [symbol_engine_optimizer]: 9.313e-05, [1] [Cycle 1]: 8.747e-05, [6] [build]: 4.70001e-06 [elim_shapecalc]: 1.468e-05 [elim_not_effective]: 1.542e-05 [opt_reshape]: 8.78001e-06 [fold_const_symbol]: 1.251e-05 [renormalize]: 3.09985e-07 [detach_backward]: 2.08002e-06 [pipeline_parallel_scheduler]: 1.54e-06 [auto_monad_reorder]: 2.23e-05 [get_jit_bprop_graph]: 1.82999e-06 [rewriter_after_jit_bprop_graph]: 4.98001e-06 [opt_after_jit_grad]: 0.00053984 [validate]: 4.987e-05 Sums bootstrap : 0.000460s : 3.60% type_inference : 0.006613s : 51.77% event_method : 0.000021s : 0.17% auto_monad : 0.000068s : 0.53% graph_reusing : 0.000006s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000042s : 0.33% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.24% optimize.rewriter_before_opt_a : 0.000098s : 0.77% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.43% optimize.opt_a.loop_unroll : 0.000039s : 0.31% optimize.opt_a.a_1 : 0.000995s : 7.79% optimize.opt_a.with_stream_mark : 0.000035s : 0.27% optimize.opt_a.recompute_prepare : 0.000021s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000199s : 1.56% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.12% optimize.opt_a.merge_send_recv : 0.000016s : 0.13% optimize.opt_a.auto_parallel : 0.000015s : 0.11% optimize.opt_a.parallel : 0.000026s : 0.21% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.07% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.15% optimize.opt_a.virtual_dataset : 0.000016s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.11% optimize.opt_a.virtual_output : 0.000015s : 0.11% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000021s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.20% optimize.opt_a.a_after_grad : 0.000024s : 0.19% optimize.opt_a.renormalize : 0.000872s : 6.83% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.21% optimize.opt_a.cse : 0.000058s : 0.46% optimize.opt_a.a_3 : 0.000106s : 0.83% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000042s : 0.33% optimize.convert_after_rewriter : 0.000008s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000666s : 5.21% optimize.opt_b.b_1 : 0.000173s : 1.35% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000028s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000029s : 0.23% optimize.loop_unroll : 0.000498s : 3.90% optimize.opt_after_cconv.c_1 : 0.000040s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000027s : 0.21% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000016s : 0.13% optimize.tuple_transform.d_1 : 0.000055s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000066s : 0.51% optimize.cse_after_recomputation.cse : 0.000016s : 0.13% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000017s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000025s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.17% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000540s : 4.23% validate : 0.000050s : 0.39% Time group info: ------[substitution.] 0.000241 38 10.40% : 0.000025s : 3: substitution.cast_eliminate 0.91% : 0.000002s : 3: substitution.elim_not_effective 0.68% : 0.000002s : 3: substitution.fold_const_symbol 2.83% : 0.000007s : 5: substitution.graph_param_transform 71.56% : 0.000173s : 4: substitution.inline 2.23% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.71% : 0.000007s : 6: substitution.remove_not_recompute_node 2.03% : 0.000005s : 4: substitution.replace_old_param 6.66% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006538 2 87.67% : 0.005732s : 1: type_inference.infer 12.33% : 0.000806s : 1: type_inference.specialize ------[replace.] 0.000069 8 61.09% : 0.000042s : 4: replace.inline 38.91% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000184 8 92.44% : 0.000170s : 4: match.inline 7.56% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000305 1596 0.87% : 0.000003s : 17: predicate.accumulaten_eliminater 0.73% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.50% : 0.000002s : 10: predicate.addn_check_dump 0.82% : 0.000003s : 17: predicate.addn_zero_filter 0.77% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.92% : 0.000006s : 27: predicate.arithmetic_simplify 1.02% : 0.000003s : 17: predicate.cast_eliminate 0.51% : 0.000002s : 10: predicate.check_bprop_eliminate 0.47% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000001s : 5: predicate.const_output_eliminate 0.51% : 0.000002s : 10: predicate.depend_value_elim 0.85% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.92% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.78% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.88% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.16% : 0.000000s : 5: predicate.elim_not_effective 0.44% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.00% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.01% : 0.000003s : 22: predicate.environ_get_depend_swap 1.50% : 0.000005s : 32: predicate.environ_get_eliminate 0.97% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.20% : 0.000004s : 25: predicate.exchange_switch_depend_value 1.95% : 0.000006s : 25: predicate.float_depend_g_call 0.48% : 0.000001s : 10: predicate.float_environ_get_switch 0.67% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 5: predicate.fold_const_symbol 0.53% : 0.000002s : 10: predicate.get_grad_eliminate 0.17% : 0.000001s : 5: predicate.graph_param_transform 0.53% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 13.31% : 0.000041s : 72: predicate.inline 0.79% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 10: predicate.less_batch_normalization 5.73% : 0.000017s : 31: predicate.list_to_tuple_eliminator_ 2.24% : 0.000007s : 48: predicate.load_eliminater 0.83% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.65% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.49% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.49% : 0.000002s : 10: predicate.merge_addn 0.60% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.49% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 17: predicate.minmaximum_grad 1.07% : 0.000003s : 5: predicate.mutable_eliminate 0.28% : 0.000001s : 5: predicate.opt_reshape 0.36% : 0.000001s : 5: predicate.parallel_virtual_node 1.45% : 0.000004s : 25: predicate.partial_defer_inline 1.44% : 0.000004s : 26: predicate.partial_eliminate 0.82% : 0.000003s : 17: predicate.print_const_string_wrapper 0.46% : 0.000001s : 10: predicate.reduce_all_const_elim 1.10% : 0.000003s : 17: predicate.reduce_eliminate 2.24% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.32% : 0.000001s : 10: predicate.remove_not_recompute_node 1.19% : 0.000004s : 31: predicate.replace_applicator 0.42% : 0.000001s : 10: predicate.replace_old_param 0.28% : 0.000001s : 5: predicate.reset_defer_inline 0.85% : 0.000003s : 17: predicate.reshape_eliminate 0.50% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 5: predicate.row_tensor_eliminate 0.64% : 0.000002s : 10: predicate.same_eliminate 0.33% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.65% : 0.000002s : 10: predicate.shard_identity_eliminate 0.69% : 0.000002s : 10: predicate.special_op_eliminate 0.58% : 0.000002s : 10: predicate.specialize_transform 0.75% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.63% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.31% : 0.000004s : 25: predicate.switch_defer_inline 1.82% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.34% : 0.000013s : 76: predicate.switch_simplify 0.86% : 0.000003s : 17: predicate.tile_eliminate 0.85% : 0.000003s : 17: predicate.transpose_eliminate 1.42% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.46% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.33% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 2.76% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.32% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.19% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.58% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.25% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.00% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 5: predicate.value_based_eliminate 0.67% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.54% : 0.000002s : 10: predicate.virtual_output_eliminate 0.30% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.31% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000615 11 53.29% : 0.000328s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.71% : 0.000287s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.029856 192 0.01% : 0.000003s : 1: ForceFp32Comm 12.83% : 0.003832s : 1: add_attr 12.78% : 0.003817s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000070s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.25% : 0.000074s : 1: auto_monad 0.09% : 0.000026s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.65% : 0.000492s : 1: bootstrap 0.11% : 0.000032s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.10% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000010s : 1: environ_conv 0.09% : 0.000028s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.70% : 0.000508s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.26% : 0.000676s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000020s : 1: opt.transform.mutable_eliminate 5.09% : 0.001518s : 78: opt.transform.opt_a 0.13% : 0.000039s : 1: opt.transform.opt_after_cconv 0.11% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.49% : 0.000148s : 28: opt.transform.opt_b 0.21% : 0.000062s : 2: opt.transform.opt_trans_graph 0.16% : 0.000047s : 4: opt.transform.symbol_engine_opt 11.11% : 0.003317s : 1: opt_a 0.45% : 0.000135s : 1: opt_after_cconv 1.84% : 0.000550s : 1: opt_after_jit_grad 0.98% : 0.000292s : 1: opt_b 19.41% : 0.005794s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.16% : 0.000047s : 1: pre_auto_parallel 0.12% : 0.000035s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000005s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.64% : 0.000490s : 1: renormalize.infer 1.25% : 0.000374s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000046s : 1: rewriter_after_opt_a 0.34% : 0.000103s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000096s : 1: symbol_engine_optimizer 0.32% : 0.000095s : 1: tuple_transform 22.23% : 0.006637s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:03.535.435 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:03.535.742 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0201763, [21] [bootstrap]: 0.00052894 [type_inference]: 0.00728376 [event_method]: 2.117e-05 [auto_monad]: 7.053e-05 [graph_reusing]: 6.58e-06 [inline]: 3.13e-06 [add_attr]: 0.00403261, [1] [add_attr_with_inline]: 0.00401864, [1] [Cycle 1]: 9.339e-05, [2] [tag_attr]: 2.393e-05 [meta_addattr_fg_expand]: 6.24001e-06 [parallel-infer-symbol]: 3.35e-06 [pre_auto_parallel]: 4.411e-05 [insert-virtual-dataset]: 2.98e-06 [parallel-infer-symbol-second]: 7.99977e-07 [dataset_repeat_opt]: 2.14e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.00642505, [53] [py_interpret_to_execute]: 3.694e-05 [rewriter_before_opt_a]: 9.599e-05 [opt_a]: 0.00351082, [2] [Cycle 1]: 0.00257832, [45] [expand_dump_flag]: 3.25998e-06 [switch_simplify]: 4.333e-05 [loop_unroll]: 3.029e-05 [a_1]: 0.00070991 [with_stream_mark]: 2.621e-05 [recompute_prepare]: 1.003e-05 [updatestate_depend_eliminate]: 4.87998e-06 [updatestate_assign_eliminate]: 3.58e-06 [updatestate_loads_eliminate]: 3.83001e-06 [parameter_eliminate]: 2.26e-06 [a_2]: 0.00011579 [accelerated_algorithm]: 8.58001e-06 [shard]: 2.22999e-06 [meta_shard_fg_expand]: 2.22999e-06 [shard_inline]: 6.68e-06 [merge_send_recv]: 9.54999e-06 [auto_parallel]: 9.25001e-06 [parallel]: 2.235e-05 [flash_sp]: 1.071e-05 [merge_comm]: 4.03999e-06 [allreduce_fusion]: 4.28001e-06 [matmul_add_comm_reduction]: 1.015e-05 [allreduce_slice_to_reducescatter]: 8.30012e-07 [virtual_shard_identity]: 9.57001e-06 [virtual_dataset]: 7.33e-06 [get_grad_eliminate_]: 6.83e-06 [virtual_output]: 7.23999e-06 [merge_forward]: 4.67e-06 [cell_reuse_recompute_pass]: 1.25001e-06 [offload_activation]: 1.061e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.723e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.243e-05 [set_forward_comm_id_for_comm_node_pass]: 4.04002e-06 [meta_fg_expand]: 3.46001e-06 [flash_sp_send_recv_attached]: 3.21999e-06 [receive_attached]: 2.66999e-06 [after_resolve]: 1.378e-05 [a_after_grad]: 1.121e-05 [renormalize]: 0.00086105 [add_forward_monad_depend]: 7.75e-06 [auto_monad_grad]: 3.03e-06 [auto_monad_eliminator]: 1.981e-05 [cse]: 2.959e-05 [a_3]: 6.951e-05 [Cycle 2]: 0.00091421, [45] [expand_dump_flag]: 2.46998e-06 [switch_simplify]: 9.39e-06 [loop_unroll]: 6.54001e-06 [a_1]: 0.0001395 [with_stream_mark]: 1.602e-05 [recompute_prepare]: 6.81001e-06 [updatestate_depend_eliminate]: 3.74002e-06 [updatestate_assign_eliminate]: 2.89001e-06 [updatestate_loads_eliminate]: 2.29001e-06 [parameter_eliminate]: 1.74e-06 [a_2]: 0.00010304 [accelerated_algorithm]: 6.54001e-06 [shard]: 1.84e-06 [meta_shard_fg_expand]: 2.34001e-06 [shard_inline]: 6.88e-06 [merge_send_recv]: 7.37997e-06 [auto_parallel]: 7.53e-06 [parallel]: 8.08999e-06 [flash_sp]: 3.48e-06 [merge_comm]: 4.12e-06 [allreduce_fusion]: 3.91001e-06 [matmul_add_comm_reduction]: 9.59999e-06 [allreduce_slice_to_reducescatter]: 5.59987e-07 [virtual_shard_identity]: 8.3e-06 [virtual_dataset]: 7.14001e-06 [get_grad_eliminate_]: 7.61001e-06 [virtual_output]: 6.04999e-06 [merge_forward]: 3.4e-06 [cell_reuse_recompute_pass]: 2.98e-06 [offload_activation]: 1.172e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.943e-05 [merge_recompute_call_nodes]: 1.09e-06 [before_grad]: 1.091e-05 [set_forward_comm_id_for_comm_node_pass]: 3.68999e-06 [meta_fg_expand]: 2.58e-06 [flash_sp_send_recv_attached]: 1.72001e-06 [receive_attached]: 1.61998e-06 [after_resolve]: 1.26e-05 [a_after_grad]: 9.87999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.47001e-06 [auto_monad_grad]: 2.06e-06 [auto_monad_eliminator]: 1.04e-05 [cse]: 1.835e-05 [a_3]: 5.188e-05 [py_interpret_to_execute_after_opt_a]: 1.786e-05 [slice_cell_reuse_recomputed_activation]: 4.58001e-06 [rewriter_after_opt_a]: 4.659e-05 [convert_after_rewriter]: 1.024e-05 [order_py_execute_after_rewriter]: 8.35001e-06 [mutable_eliminate]: 0.00077456 [opt_b]: 0.00033525, [1] [Cycle 1]: 0.00032308, [7] [b_1]: 0.00017695 [b_2]: 9.27001e-06 [updatestate_depend_eliminate]: 9.46e-06 [updatestate_assign_eliminate]: 2.86e-06 [updatestate_loads_eliminate]: 2.98e-06 [renormalize]: 9.39996e-07 [cse]: 5.742e-05 [optimize_parallel_all_gather_comm]: 2.619e-05 [overlap_param_gather]: 5.37001e-06 [cconv]: 3.812e-05 [loop_unroll]: 0.00063277 [opt_after_cconv]: 0.00014492, [1] [Cycle 1]: 0.00013433, [7] [c_1]: 3.34e-05 [parameter_eliminate]: 6.01e-06 [updatestate_depend_eliminate]: 7.53e-06 [updatestate_assign_eliminate]: 2.84999e-06 [updatestate_loads_eliminate]: 2.80002e-06 [cse]: 2.438e-05 [renormalize]: 6.60017e-07 [remove_dup_value]: 1.812e-05 [tuple_transform]: 0.00010006, [1] [Cycle 1]: 9.253e-05, [4] [d_1]: 5.14e-05 [none_parameter_eliminate]: 1.43002e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 7.13e-06 [partial_unused_args_eliminate]: 4.87e-06 [add_recomputation]: 5.686e-05 [cse_after_recomputation]: 2.964e-05, [1] [Cycle 1]: 2.169e-05, [1] [cse]: 1.208e-05 [environ_conv]: 1.036e-05 [swap_dp_allreduce_reducescatter]: 8.63001e-06 [bias_add_comm_swap]: 5.87001e-06 [label_micro_interleaved_index]: 8.80001e-06 [label_fine_grained_interleaved_index]: 5.82001e-06 [merge_cast_opt]: 3.95998e-06 [slice_recompute_activation]: 5.00999e-06 [micro_interleaved_order_control]: 4.51002e-06 [assign_add_opt]: 3.85e-06 [ForceFp32Comm]: 3.75998e-06 [remove_cast_before_assign_add]: 3.84002e-06 [full_micro_interleaved_order_control]: 4.99e-06 [reorder_send_recv_between_fp_bp]: 5.62999e-06 [comm_op_add_attrs]: 4.07e-06 [add_comm_op_reuse_tag]: 3.35e-06 [interleave_split_concat_branches]: 3.48999e-06 [interleave_parallel_branches]: 3.46999e-06 [overlap_opt_shard_in_pipeline]: 3.68999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.52998e-06 [control_data_broadcast_order]: 1.712e-05 [grouped_pairwise_exchange_alltoall]: 4.80001e-06 [offloading_packed_experts]: 6.94999e-06 [overlap_recompute_and_grad_model_parallel]: 8.22003e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.44001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.98999e-06 [overlap_recompute_comm]: 5.02e-06 [overlap_grad_ring_attention]: 6.76e-06 [overlap_grad_flash_sp]: 2.528e-05 [begin_end_overlap_inline]: 2.99001e-06 [split_matmul_comm_elemetwise]: 4.38001e-06 [split_layernorm_comm]: 4.46002e-06 [handle_group_info]: 3.41001e-06 [symbol_engine_optimizer]: 0.00010469, [1] [Cycle 1]: 9.72e-05, [6] [build]: 3.5e-06 [elim_shapecalc]: 1.126e-05 [elim_not_effective]: 1.454e-05 [opt_reshape]: 8.17998e-06 [fold_const_symbol]: 1.126e-05 [renormalize]: 2.59985e-07 [detach_backward]: 5.35999e-06 [pipeline_parallel_scheduler]: 2.21e-06 [auto_monad_reorder]: 2.279e-05 [get_jit_bprop_graph]: 1.88002e-06 [rewriter_after_jit_bprop_graph]: 8.54e-06 [opt_after_jit_grad]: 0.00071644 [validate]: 4.817e-05 Sums bootstrap : 0.000529s : 3.79% type_inference : 0.007284s : 52.25% event_method : 0.000021s : 0.15% auto_monad : 0.000071s : 0.51% graph_reusing : 0.000007s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000044s : 0.32% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000037s : 0.26% optimize.rewriter_before_opt_a : 0.000096s : 0.69% optimize.opt_a.expand_dump_flag : 0.000006s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.38% optimize.opt_a.loop_unroll : 0.000037s : 0.26% optimize.opt_a.a_1 : 0.000849s : 6.09% optimize.opt_a.with_stream_mark : 0.000042s : 0.30% optimize.opt_a.recompute_prepare : 0.000017s : 0.12% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.04% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000219s : 1.57% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.11% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.10% optimize.opt_a.merge_send_recv : 0.000017s : 0.12% optimize.opt_a.auto_parallel : 0.000017s : 0.12% optimize.opt_a.parallel : 0.000030s : 0.22% optimize.opt_a.flash_sp : 0.000014s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000008s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.13% optimize.opt_a.virtual_dataset : 0.000014s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.10% optimize.opt_a.virtual_output : 0.000013s : 0.10% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000022s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000023s : 0.17% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.19% optimize.opt_a.a_after_grad : 0.000021s : 0.15% optimize.opt_a.renormalize : 0.000861s : 6.18% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.22% optimize.opt_a.cse : 0.000048s : 0.34% optimize.opt_a.a_3 : 0.000121s : 0.87% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.03% optimize.rewriter_after_opt_a : 0.000047s : 0.33% optimize.convert_after_rewriter : 0.000010s : 0.07% optimize.order_py_execute_after_rewriter : 0.000008s : 0.06% optimize.mutable_eliminate : 0.000775s : 5.56% optimize.opt_b.b_1 : 0.000177s : 1.27% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000057s : 0.41% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000038s : 0.27% optimize.loop_unroll : 0.000633s : 4.54% optimize.opt_after_cconv.c_1 : 0.000033s : 0.24% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000024s : 0.17% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.13% optimize.tuple_transform.d_1 : 0.000051s : 0.37% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.05% optimize.partial_unused_args_eliminate : 0.000005s : 0.03% optimize.add_recomputation : 0.000057s : 0.41% optimize.cse_after_recomputation.cse : 0.000012s : 0.09% optimize.environ_conv : 0.000010s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.06% optimize.bias_add_comm_swap : 0.000006s : 0.04% optimize.label_micro_interleaved_index : 0.000009s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.03% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.03% optimize.control_data_broadcast_order : 0.000017s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000025s : 0.18% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.03% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.02% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.10% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.16% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000009s : 0.06% opt_after_jit_grad : 0.000716s : 5.14% validate : 0.000048s : 0.35% Time group info: ------[substitution.] 0.000244 28 0.78% : 0.000002s : 2: substitution.elim_not_effective 0.86% : 0.000002s : 2: substitution.fold_const_symbol 2.66% : 0.000006s : 4: substitution.graph_param_transform 71.84% : 0.000176s : 4: substitution.inline 2.15% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.26% : 0.000006s : 4: substitution.remove_not_recompute_node 2.46% : 0.000006s : 4: substitution.replace_old_param 16.99% : 0.000042s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007219 2 87.38% : 0.006308s : 1: type_inference.infer 12.62% : 0.000911s : 1: type_inference.specialize ------[replace.] 0.000068 8 60.92% : 0.000041s : 4: replace.inline 39.08% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000189 8 91.56% : 0.000173s : 4: match.inline 8.44% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000228 1278 0.89% : 0.000002s : 13: predicate.accumulaten_eliminater 1.20% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 8: predicate.addn_check_dump 0.94% : 0.000002s : 13: predicate.addn_zero_filter 0.78% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.12% : 0.000005s : 21: predicate.arithmetic_simplify 1.00% : 0.000002s : 13: predicate.cast_eliminate 0.54% : 0.000001s : 8: predicate.check_bprop_eliminate 0.54% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.64% : 0.000001s : 8: predicate.depend_value_elim 0.85% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.19% : 0.000003s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.01% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.30% : 0.000001s : 4: predicate.elim_not_effective 0.35% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_depend_swap 1.55% : 0.000004s : 25: predicate.environ_get_eliminate 0.97% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.38% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.43% : 0.000006s : 21: predicate.float_depend_g_call 0.62% : 0.000001s : 8: predicate.float_environ_get_switch 0.88% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.69% : 0.000002s : 8: predicate.get_grad_eliminate 0.27% : 0.000001s : 4: predicate.graph_param_transform 0.61% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.27% : 0.000014s : 58: predicate.inline 0.75% : 0.000002s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.93% : 0.000002s : 8: predicate.less_batch_normalization 1.73% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.47% : 0.000006s : 38: predicate.load_eliminater 1.07% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.22% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.75% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.64% : 0.000001s : 8: predicate.merge_addn 0.53% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.76% : 0.000002s : 13: predicate.minmaximum_grad 1.48% : 0.000003s : 4: predicate.mutable_eliminate 0.41% : 0.000001s : 4: predicate.opt_reshape 0.49% : 0.000001s : 4: predicate.parallel_virtual_node 1.83% : 0.000004s : 21: predicate.partial_defer_inline 1.50% : 0.000003s : 21: predicate.partial_eliminate 0.80% : 0.000002s : 13: predicate.print_const_string_wrapper 0.65% : 0.000001s : 8: predicate.reduce_all_const_elim 1.29% : 0.000003s : 13: predicate.reduce_eliminate 2.46% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 8: predicate.remove_not_recompute_node 1.30% : 0.000003s : 25: predicate.replace_applicator 0.44% : 0.000001s : 8: predicate.replace_old_param 0.49% : 0.000001s : 4: predicate.reset_defer_inline 0.94% : 0.000002s : 13: predicate.reshape_eliminate 0.64% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 4: predicate.row_tensor_eliminate 0.83% : 0.000002s : 8: predicate.same_eliminate 0.39% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.87% : 0.000002s : 8: predicate.shard_identity_eliminate 0.81% : 0.000002s : 8: predicate.special_op_eliminate 0.63% : 0.000001s : 8: predicate.specialize_transform 0.95% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.58% : 0.000004s : 21: predicate.switch_defer_inline 2.05% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.96% : 0.000011s : 67: predicate.switch_simplify 0.87% : 0.000002s : 13: predicate.tile_eliminate 1.02% : 0.000002s : 13: predicate.transpose_eliminate 1.63% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.31% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.49% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.75% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.27% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.05% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.54% : 0.000001s : 4: predicate.value_based_eliminate 0.64% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.66% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000709 11 55.80% : 0.000396s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.20% : 0.000314s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.032952 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.28% : 0.004045s : 1: add_attr 12.21% : 0.004023s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.18% : 0.000061s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.24% : 0.000080s : 1: auto_monad 0.10% : 0.000032s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.75% : 0.000578s : 1: bootstrap 0.13% : 0.000042s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.06% : 0.000020s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.10% : 0.000033s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000027s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.10% : 0.000033s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.03% : 0.000011s : 1: label_micro_interleaved_index 1.95% : 0.000641s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 2.38% : 0.000783s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000022s : 1: opt.transform.mutable_eliminate 3.97% : 0.001310s : 78: opt.transform.opt_a 0.10% : 0.000032s : 1: opt.transform.opt_after_cconv 0.10% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.33% : 0.000109s : 28: opt.transform.opt_b 0.17% : 0.000056s : 2: opt.transform.opt_trans_graph 0.12% : 0.000041s : 4: opt.transform.symbol_engine_opt 10.67% : 0.003515s : 1: opt_a 0.45% : 0.000149s : 1: opt_after_cconv 2.22% : 0.000731s : 1: opt_after_jit_grad 1.03% : 0.000339s : 1: opt_b 21.47% : 0.007074s : 1: optimize 0.09% : 0.000030s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000011s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.16% : 0.000052s : 1: pre_auto_parallel 0.12% : 0.000041s : 1: py_interpret_to_execute 0.06% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000021s : 1: remove_dup_value 1.35% : 0.000445s : 1: renormalize.infer 1.23% : 0.000406s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000015s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000051s : 1: rewriter_after_opt_a 0.30% : 0.000100s : 1: rewriter_before_opt_a 0.02% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000108s : 1: symbol_engine_optimizer 0.31% : 0.000103s : 1: tuple_transform 22.27% : 0.007338s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:03.838.542 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0182954, [21] [bootstrap]: 0.00046846 [type_inference]: 0.00677057 [event_method]: 2.277e-05 [auto_monad]: 6.654e-05 [graph_reusing]: 6.14999e-06 [inline]: 2.81e-06 [add_attr]: 0.00390507, [1] [add_attr_with_inline]: 0.00389193, [1] [Cycle 1]: 7.949e-05, [2] [tag_attr]: 2.406e-05 [meta_addattr_fg_expand]: 6.23e-06 [parallel-infer-symbol]: 3.73001e-06 [pre_auto_parallel]: 4.326e-05 [insert-virtual-dataset]: 2.49001e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 2.26e-06 [pipeline_split]: 1.95001e-06 [optimize]: 0.00603141, [53] [py_interpret_to_execute]: 3.224e-05 [rewriter_before_opt_a]: 9.598e-05 [opt_a]: 0.00332091, [2] [Cycle 1]: 0.00251592, [45] [expand_dump_flag]: 2.98998e-06 [switch_simplify]: 4.587e-05 [loop_unroll]: 3.08e-05 [a_1]: 0.00069842 [with_stream_mark]: 2.522e-05 [recompute_prepare]: 1.423e-05 [updatestate_depend_eliminate]: 4.84e-06 [updatestate_assign_eliminate]: 3.43e-06 [updatestate_loads_eliminate]: 3.10998e-06 [parameter_eliminate]: 2.26998e-06 [a_2]: 8.842e-05 [accelerated_algorithm]: 7.86001e-06 [shard]: 2.02999e-06 [meta_shard_fg_expand]: 2.36e-06 [shard_inline]: 7.18e-06 [merge_send_recv]: 1.126e-05 [auto_parallel]: 1.06e-05 [parallel]: 2.315e-05 [flash_sp]: 1.037e-05 [merge_comm]: 5.47001e-06 [allreduce_fusion]: 3.58e-06 [matmul_add_comm_reduction]: 1.062e-05 [allreduce_slice_to_reducescatter]: 7.30011e-07 [virtual_shard_identity]: 1.121e-05 [virtual_dataset]: 7.26001e-06 [get_grad_eliminate_]: 6.76e-06 [virtual_output]: 6.89001e-06 [merge_forward]: 4.31002e-06 [cell_reuse_recompute_pass]: 1.68002e-06 [offload_activation]: 1.159e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.849e-05 [merge_recompute_call_nodes]: 1.74998e-06 [before_grad]: 1.294e-05 [set_forward_comm_id_for_comm_node_pass]: 4.05e-06 [meta_fg_expand]: 3.42002e-06 [flash_sp_send_recv_attached]: 3.96001e-06 [receive_attached]: 2.36e-06 [after_resolve]: 1.414e-05 [a_after_grad]: 1.148e-05 [renormalize]: 0.00094542 [add_forward_monad_depend]: 1.038e-05 [auto_monad_grad]: 2.81e-06 [auto_monad_eliminator]: 2.215e-05 [cse]: 3.449e-05 [a_3]: 6.121e-05 [Cycle 2]: 0.00079029, [45] [expand_dump_flag]: 1.80001e-06 [switch_simplify]: 9.44998e-06 [loop_unroll]: 6.86001e-06 [a_1]: 0.00014542 [with_stream_mark]: 2.184e-05 [recompute_prepare]: 8.43001e-06 [updatestate_depend_eliminate]: 4.48001e-06 [updatestate_assign_eliminate]: 2.96001e-06 [updatestate_loads_eliminate]: 3.66001e-06 [parameter_eliminate]: 1.60999e-06 [a_2]: 7.624e-05 [accelerated_algorithm]: 7.52002e-06 [shard]: 3.03e-06 [meta_shard_fg_expand]: 2.28998e-06 [shard_inline]: 6.56999e-06 [merge_send_recv]: 8.68001e-06 [auto_parallel]: 9.96e-06 [parallel]: 9.04e-06 [flash_sp]: 3.58e-06 [merge_comm]: 3.98001e-06 [allreduce_fusion]: 3.92998e-06 [matmul_add_comm_reduction]: 1e-05 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 9.34e-06 [virtual_dataset]: 6.44999e-06 [get_grad_eliminate_]: 5.96e-06 [virtual_output]: 6.82002e-06 [merge_forward]: 4.38999e-06 [cell_reuse_recompute_pass]: 3.13e-06 [offload_activation]: 1.166e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.044e-05 [merge_recompute_call_nodes]: 2.21e-06 [before_grad]: 1.108e-05 [set_forward_comm_id_for_comm_node_pass]: 3.75998e-06 [meta_fg_expand]: 3.01999e-06 [flash_sp_send_recv_attached]: 1.92999e-06 [receive_attached]: 1.89e-06 [after_resolve]: 1.4e-05 [a_after_grad]: 1.13e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 3.24001e-06 [auto_monad_grad]: 2.54001e-06 [auto_monad_eliminator]: 1.195e-05 [cse]: 2.23e-05 [a_3]: 3.887e-05 [py_interpret_to_execute_after_opt_a]: 1.856e-05 [slice_cell_reuse_recomputed_activation]: 2.42001e-06 [rewriter_after_opt_a]: 4.526e-05 [convert_after_rewriter]: 8.08001e-06 [order_py_execute_after_rewriter]: 6.14001e-06 [mutable_eliminate]: 0.00078804 [opt_b]: 0.00024933, [1] [Cycle 1]: 0.0002404, [7] [b_1]: 0.00014347 [b_2]: 9.42001e-06 [updatestate_depend_eliminate]: 1.084e-05 [updatestate_assign_eliminate]: 2.93998e-06 [updatestate_loads_eliminate]: 2.79001e-06 [renormalize]: 7.59988e-07 [cse]: 3.119e-05 [optimize_parallel_all_gather_comm]: 2.158e-05 [overlap_param_gather]: 2.06e-06 [cconv]: 3.893e-05 [loop_unroll]: 0.00062869 [opt_after_cconv]: 0.00012845, [1] [Cycle 1]: 0.00012065, [7] [c_1]: 3.386e-05 [parameter_eliminate]: 6.53e-06 [updatestate_depend_eliminate]: 8.97e-06 [updatestate_assign_eliminate]: 2.88e-06 [updatestate_loads_eliminate]: 2.44999e-06 [cse]: 2.84e-05 [renormalize]: 7.7e-07 [remove_dup_value]: 1.452e-05 [tuple_transform]: 8.943e-05, [1] [Cycle 1]: 8.42e-05, [4] [d_1]: 5.331e-05 [none_parameter_eliminate]: 1.83002e-06 [renormalize]: 5.3001e-07 [switch_simplify]: 7.5e-06 [partial_unused_args_eliminate]: 2.29999e-06 [add_recomputation]: 5.841e-05 [cse_after_recomputation]: 2.458e-05, [1] [Cycle 1]: 1.973e-05, [1] [cse]: 1.263e-05 [environ_conv]: 6.86999e-06 [swap_dp_allreduce_reducescatter]: 5.90002e-06 [bias_add_comm_swap]: 3.46001e-06 [label_micro_interleaved_index]: 6.21998e-06 [label_fine_grained_interleaved_index]: 2.74001e-06 [merge_cast_opt]: 1.67001e-06 [slice_recompute_activation]: 2.12999e-06 [micro_interleaved_order_control]: 2.14e-06 [assign_add_opt]: 1.34e-06 [ForceFp32Comm]: 9.20001e-07 [remove_cast_before_assign_add]: 1.15001e-06 [full_micro_interleaved_order_control]: 2.29001e-06 [reorder_send_recv_between_fp_bp]: 2.83e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 9.69972e-07 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.41998e-06 [overlap_opt_shard_grad_in_pipeline]: 1.71e-06 [control_data_broadcast_order]: 1.426e-05 [grouped_pairwise_exchange_alltoall]: 1.53002e-06 [offloading_packed_experts]: 4.25e-06 [overlap_recompute_and_grad_model_parallel]: 4.99998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.88e-06 [overlap_grad_ring_attention]: 4.30999e-06 [overlap_grad_flash_sp]: 2.243e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 2.61999e-06 [split_layernorm_comm]: 1.70001e-06 [handle_group_info]: 1.04e-06 [symbol_engine_optimizer]: 8.739e-05, [1] [Cycle 1]: 8.234e-05, [6] [build]: 3.54002e-06 [elim_shapecalc]: 1.411e-05 [elim_not_effective]: 1.401e-05 [opt_reshape]: 7.85998e-06 [fold_const_symbol]: 1.106e-05 [renormalize]: 4.19997e-07 [detach_backward]: 2.64001e-06 [pipeline_parallel_scheduler]: 1.91e-06 [auto_monad_reorder]: 1.911e-05 [get_jit_bprop_graph]: 2.14999e-06 [rewriter_after_jit_bprop_graph]: 6.45002e-06 [opt_after_jit_grad]: 0.0007071 [validate]: 4.88e-05 Sums bootstrap : 0.000468s : 3.53% type_inference : 0.006771s : 51.07% event_method : 0.000023s : 0.17% auto_monad : 0.000067s : 0.50% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000043s : 0.33% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000032s : 0.24% optimize.rewriter_before_opt_a : 0.000096s : 0.72% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.42% optimize.opt_a.loop_unroll : 0.000038s : 0.28% optimize.opt_a.a_1 : 0.000844s : 6.36% optimize.opt_a.with_stream_mark : 0.000047s : 0.35% optimize.opt_a.recompute_prepare : 0.000023s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000165s : 1.24% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.12% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000014s : 0.10% optimize.opt_a.merge_send_recv : 0.000020s : 0.15% optimize.opt_a.auto_parallel : 0.000021s : 0.16% optimize.opt_a.parallel : 0.000032s : 0.24% optimize.opt_a.flash_sp : 0.000014s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.07% optimize.opt_a.allreduce_fusion : 0.000008s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.16% optimize.opt_a.virtual_dataset : 0.000014s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.10% optimize.opt_a.virtual_output : 0.000014s : 0.10% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.04% optimize.opt_a.offload_activation : 0.000023s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.03% optimize.opt_a.before_grad : 0.000024s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000028s : 0.21% optimize.opt_a.a_after_grad : 0.000023s : 0.17% optimize.opt_a.renormalize : 0.000946s : 7.13% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.10% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.26% optimize.opt_a.cse : 0.000057s : 0.43% optimize.opt_a.a_3 : 0.000100s : 0.75% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000045s : 0.34% optimize.convert_after_rewriter : 0.000008s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000788s : 5.94% optimize.opt_b.b_1 : 0.000143s : 1.08% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000031s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000039s : 0.29% optimize.loop_unroll : 0.000629s : 4.74% optimize.opt_after_cconv.c_1 : 0.000034s : 0.26% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000028s : 0.21% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000015s : 0.11% optimize.tuple_transform.d_1 : 0.000053s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000001s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000058s : 0.44% optimize.cse_after_recomputation.cse : 0.000013s : 0.10% optimize.environ_conv : 0.000007s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.04% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000006s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.03% optimize.overlap_grad_flash_sp : 0.000022s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.14% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000707s : 5.33% validate : 0.000049s : 0.37% Time group info: ------[substitution.] 0.000230 28 0.89% : 0.000002s : 2: substitution.elim_not_effective 0.57% : 0.000001s : 2: substitution.fold_const_symbol 2.80% : 0.000006s : 4: substitution.graph_param_transform 80.08% : 0.000184s : 4: substitution.inline 2.28% : 0.000005s : 4: substitution.j_node_and_user_rematch 3.01% : 0.000007s : 4: substitution.remove_not_recompute_node 3.55% : 0.000008s : 4: substitution.replace_old_param 6.82% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006692 2 87.70% : 0.005869s : 1: type_inference.infer 12.30% : 0.000823s : 1: type_inference.specialize ------[replace.] 0.000070 8 62.95% : 0.000044s : 4: replace.inline 37.05% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000195 8 92.94% : 0.000181s : 4: match.inline 7.06% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000235 1278 0.93% : 0.000002s : 13: predicate.accumulaten_eliminater 1.17% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 1.04% : 0.000002s : 13: predicate.addn_zero_filter 0.73% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.11% : 0.000005s : 21: predicate.arithmetic_simplify 1.07% : 0.000003s : 13: predicate.cast_eliminate 0.55% : 0.000001s : 8: predicate.check_bprop_eliminate 0.61% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000001s : 4: predicate.const_output_eliminate 0.53% : 0.000001s : 8: predicate.depend_value_elim 0.87% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.03% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.95% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.16% : 0.000000s : 4: predicate.elim_not_effective 0.69% : 0.000002s : 4: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000003s : 17: predicate.environ_add_const_eliminate 0.98% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 17: predicate.environ_get_depend_swap 1.80% : 0.000004s : 25: predicate.environ_get_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.33% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.46% : 0.000006s : 21: predicate.float_depend_g_call 0.64% : 0.000002s : 8: predicate.float_environ_get_switch 0.82% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.62% : 0.000001s : 8: predicate.get_grad_eliminate 0.17% : 0.000000s : 4: predicate.graph_param_transform 0.50% : 0.000001s : 8: predicate.incorporate_call 0.41% : 0.000001s : 8: predicate.incorporate_call_switch 5.64% : 0.000013s : 58: predicate.inline 0.81% : 0.000002s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.06% : 0.000002s : 8: predicate.less_batch_normalization 1.88% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.44% : 0.000006s : 38: predicate.load_eliminater 1.52% : 0.000004s : 4: predicate.loop_unroll_after_grad 2.14% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.79% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.51% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 13: predicate.minmaximum_grad 1.66% : 0.000004s : 4: predicate.mutable_eliminate 0.33% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 1.87% : 0.000004s : 21: predicate.partial_defer_inline 1.45% : 0.000003s : 21: predicate.partial_eliminate 0.88% : 0.000002s : 13: predicate.print_const_string_wrapper 0.80% : 0.000002s : 8: predicate.reduce_all_const_elim 1.26% : 0.000003s : 13: predicate.reduce_eliminate 2.22% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 8: predicate.remove_not_recompute_node 1.31% : 0.000003s : 25: predicate.replace_applicator 0.71% : 0.000002s : 8: predicate.replace_old_param 0.32% : 0.000001s : 4: predicate.reset_defer_inline 1.16% : 0.000003s : 13: predicate.reshape_eliminate 0.54% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 4: predicate.row_tensor_eliminate 0.69% : 0.000002s : 8: predicate.same_eliminate 0.57% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.98% : 0.000002s : 8: predicate.shard_identity_eliminate 0.69% : 0.000002s : 8: predicate.special_op_eliminate 0.70% : 0.000002s : 8: predicate.specialize_transform 0.95% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.36% : 0.000003s : 21: predicate.switch_defer_inline 1.86% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.68% : 0.000011s : 67: predicate.switch_simplify 1.03% : 0.000002s : 13: predicate.tile_eliminate 0.87% : 0.000002s : 13: predicate.transpose_eliminate 1.41% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.65% : 0.000004s : 21: predicate.tuple_list_get_item_depend_reorder 3.37% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.44% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.86% : 0.000007s : 29: predicate.tuple_list_set_item_eliminator 1.80% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.21% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.78% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 4: predicate.value_based_eliminate 0.65% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000643 11 52.18% : 0.000335s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.82% : 0.000307s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030655 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.76% : 0.003912s : 1: add_attr 12.71% : 0.003896s : 1: add_attr_with_inline 0.01% : 0.000005s : 1: add_comm_op_reuse_tag 0.21% : 0.000064s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.24% : 0.000073s : 1: auto_monad 0.08% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000007s : 1: bias_add_comm_swap 1.64% : 0.000502s : 1: bootstrap 0.14% : 0.000043s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000017s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.09% : 0.000027s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000010s : 1: environ_conv 0.10% : 0.000030s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 2.09% : 0.000641s : 1: loop_unroll 0.02% : 0.000005s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.63% : 0.000805s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000025s : 1: opt.transform.mutable_eliminate 4.28% : 0.001312s : 78: opt.transform.opt_a 0.10% : 0.000032s : 1: opt.transform.opt_after_cconv 0.11% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.37% : 0.000113s : 28: opt.transform.opt_b 0.19% : 0.000058s : 2: opt.transform.opt_trans_graph 0.14% : 0.000042s : 4: opt.transform.symbol_engine_opt 10.85% : 0.003325s : 1: opt_a 0.43% : 0.000132s : 1: opt_after_cconv 2.35% : 0.000721s : 1: opt_after_jit_grad 0.83% : 0.000254s : 1: opt_b 19.69% : 0.006037s : 1: optimize 0.08% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.16% : 0.000048s : 1: pre_auto_parallel 0.12% : 0.000037s : 1: py_interpret_to_execute 0.07% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000020s : 1: remove_dup_value 1.59% : 0.000487s : 1: renormalize.infer 1.46% : 0.000446s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000050s : 1: rewriter_after_opt_a 0.33% : 0.000102s : 1: rewriter_before_opt_a 0.02% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000006s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.30% : 0.000091s : 1: symbol_engine_optimizer 0.30% : 0.000093s : 1: tuple_transform 22.17% : 0.006796s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:04.159.515 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:04.159.821 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0191861, [21] [bootstrap]: 0.00046797 [type_inference]: 0.00649609 [event_method]: 2.017e-05 [auto_monad]: 6.971e-05 [graph_reusing]: 5.97999e-06 [inline]: 2.86999e-06 [add_attr]: 0.00374591, [1] [add_attr_with_inline]: 0.00373171, [1] [Cycle 1]: 9.259e-05, [2] [tag_attr]: 2.439e-05 [meta_addattr_fg_expand]: 6.44999e-06 [parallel-infer-symbol]: 3.91999e-06 [pre_auto_parallel]: 4.204e-05 [insert-virtual-dataset]: 2.49999e-06 [parallel-infer-symbol-second]: 9.20001e-07 [dataset_repeat_opt]: 2.09999e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.00685508, [53] [py_interpret_to_execute]: 3.735e-05 [rewriter_before_opt_a]: 9.943e-05 [opt_a]: 0.00391685, [2] [Cycle 1]: 0.00279022, [45] [expand_dump_flag]: 3.16001e-06 [switch_simplify]: 4.513e-05 [loop_unroll]: 3.184e-05 [a_1]: 0.00077731 [with_stream_mark]: 2.176e-05 [recompute_prepare]: 1.112e-05 [updatestate_depend_eliminate]: 4.94998e-06 [updatestate_assign_eliminate]: 4.12998e-06 [updatestate_loads_eliminate]: 4.17e-06 [parameter_eliminate]: 2.07999e-06 [a_2]: 0.00013221 [accelerated_algorithm]: 9.27999e-06 [shard]: 3.17002e-06 [meta_shard_fg_expand]: 3.03998e-06 [shard_inline]: 7.73999e-06 [merge_send_recv]: 1.105e-05 [auto_parallel]: 8.71002e-06 [parallel]: 2.092e-05 [flash_sp]: 1.038e-05 [merge_comm]: 5.09e-06 [allreduce_fusion]: 4.25999e-06 [matmul_add_comm_reduction]: 1.189e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 1.007e-05 [virtual_dataset]: 8.47e-06 [get_grad_eliminate_]: 7.61001e-06 [virtual_output]: 7.93999e-06 [merge_forward]: 4.78001e-06 [cell_reuse_recompute_pass]: 1.59998e-06 [offload_activation]: 1.192e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.88e-05 [merge_recompute_call_nodes]: 1.92999e-06 [before_grad]: 1.366e-05 [set_forward_comm_id_for_comm_node_pass]: 4.47e-06 [meta_fg_expand]: 3.49001e-06 [flash_sp_send_recv_attached]: 2.81e-06 [receive_attached]: 2.44001e-06 [after_resolve]: 1.41e-05 [a_after_grad]: 1.233e-05 [renormalize]: 0.0009641 [add_forward_monad_depend]: 8.02998e-06 [auto_monad_grad]: 2.76e-06 [auto_monad_eliminator]: 2.157e-05 [cse]: 3.942e-05 [a_3]: 8.017e-05 [Cycle 2]: 0.00110869, [45] [expand_dump_flag]: 1.55999e-06 [switch_simplify]: 9.79e-06 [loop_unroll]: 7.58999e-06 [a_1]: 0.00019045 [with_stream_mark]: 1.757e-05 [recompute_prepare]: 7.9e-06 [updatestate_depend_eliminate]: 4.59998e-06 [updatestate_assign_eliminate]: 3.20998e-06 [updatestate_loads_eliminate]: 3.04999e-06 [parameter_eliminate]: 1.54e-06 [a_2]: 0.00012093 [accelerated_algorithm]: 7.96001e-06 [shard]: 2.58e-06 [meta_shard_fg_expand]: 1.91e-06 [shard_inline]: 8.55001e-06 [merge_send_recv]: 7.8e-06 [auto_parallel]: 7.77998e-06 [parallel]: 7.25e-06 [flash_sp]: 3.6e-06 [merge_comm]: 4.40999e-06 [allreduce_fusion]: 4.35999e-06 [matmul_add_comm_reduction]: 9.34998e-06 [allreduce_slice_to_reducescatter]: 8.00006e-07 [virtual_shard_identity]: 8.94e-06 [virtual_dataset]: 7.84002e-06 [get_grad_eliminate_]: 7.23e-06 [virtual_output]: 2.087e-05 [merge_forward]: 5.84e-06 [cell_reuse_recompute_pass]: 2.49001e-06 [offload_activation]: 1.023e-05 [cell_reuse_handle_not_recompute_node_pass]: 7.183e-05 [merge_recompute_call_nodes]: 1.29e-06 [before_grad]: 1.749e-05 [set_forward_comm_id_for_comm_node_pass]: 6.74999e-06 [meta_fg_expand]: 3.78999e-06 [flash_sp_send_recv_attached]: 1.40001e-06 [receive_attached]: 2.02001e-06 [after_resolve]: 1.401e-05 [a_after_grad]: 1.289e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 3.24001e-06 [auto_monad_grad]: 1.62999e-06 [auto_monad_eliminator]: 1.326e-05 [cse]: 2.582e-05 [a_3]: 6.284e-05 [py_interpret_to_execute_after_opt_a]: 2.021e-05 [slice_cell_reuse_recomputed_activation]: 5.18002e-06 [rewriter_after_opt_a]: 5.105e-05 [convert_after_rewriter]: 1.19e-05 [order_py_execute_after_rewriter]: 9.82999e-06 [mutable_eliminate]: 0.00076395 [opt_b]: 0.00033566, [1] [Cycle 1]: 0.00032325, [7] [b_1]: 0.0002053 [b_2]: 1.011e-05 [updatestate_depend_eliminate]: 9.01998e-06 [updatestate_assign_eliminate]: 3.48e-06 [updatestate_loads_eliminate]: 3.14001e-06 [renormalize]: 6.00005e-07 [cse]: 3.182e-05 [optimize_parallel_all_gather_comm]: 2.34e-05 [overlap_param_gather]: 6.04001e-06 [cconv]: 3.685e-05 [loop_unroll]: 0.00060473 [opt_after_cconv]: 0.00016083, [1] [Cycle 1]: 0.00014998, [7] [c_1]: 4.259e-05 [parameter_eliminate]: 4.84e-06 [updatestate_depend_eliminate]: 8.3e-06 [updatestate_assign_eliminate]: 3.33e-06 [updatestate_loads_eliminate]: 3.31999e-06 [cse]: 2.961e-05 [renormalize]: 7.89994e-07 [remove_dup_value]: 2.195e-05 [tuple_transform]: 0.00011113, [1] [Cycle 1]: 0.00010336, [4] [d_1]: 6.144e-05 [none_parameter_eliminate]: 1.60999e-06 [renormalize]: 5.19998e-07 [switch_simplify]: 8.47e-06 [partial_unused_args_eliminate]: 5.20001e-06 [add_recomputation]: 6.6e-05 [cse_after_recomputation]: 3.262e-05, [1] [Cycle 1]: 2.516e-05, [1] [cse]: 1.554e-05 [environ_conv]: 1.165e-05 [swap_dp_allreduce_reducescatter]: 1.009e-05 [bias_add_comm_swap]: 6.14001e-06 [label_micro_interleaved_index]: 8.79998e-06 [label_fine_grained_interleaved_index]: 5.03002e-06 [merge_cast_opt]: 3.79002e-06 [slice_recompute_activation]: 4.66002e-06 [micro_interleaved_order_control]: 4.50001e-06 [assign_add_opt]: 3.96001e-06 [ForceFp32Comm]: 3.53999e-06 [remove_cast_before_assign_add]: 3.56001e-06 [full_micro_interleaved_order_control]: 4.65001e-06 [reorder_send_recv_between_fp_bp]: 5.25999e-06 [comm_op_add_attrs]: 3.65e-06 [add_comm_op_reuse_tag]: 3.46999e-06 [interleave_split_concat_branches]: 3.45e-06 [interleave_parallel_branches]: 3.41001e-06 [overlap_opt_shard_in_pipeline]: 4.15e-06 [overlap_opt_shard_grad_in_pipeline]: 4.73001e-06 [control_data_broadcast_order]: 1.868e-05 [grouped_pairwise_exchange_alltoall]: 3.8e-06 [offloading_packed_experts]: 7.41999e-06 [overlap_recompute_and_grad_model_parallel]: 8.55999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.53e-06 [overlap_recompute_allgather_and_fa_grad]: 3.93001e-06 [overlap_recompute_comm]: 5.50001e-06 [overlap_grad_ring_attention]: 6.92002e-06 [overlap_grad_flash_sp]: 2.927e-05 [begin_end_overlap_inline]: 2.99001e-06 [split_matmul_comm_elemetwise]: 5.11997e-06 [split_layernorm_comm]: 4.12e-06 [handle_group_info]: 4.13001e-06 [symbol_engine_optimizer]: 0.00010699, [1] [Cycle 1]: 9.997e-05, [6] [build]: 4.11001e-06 [elim_shapecalc]: 1.2e-05 [elim_not_effective]: 1.64e-05 [opt_reshape]: 8.65001e-06 [fold_const_symbol]: 1.245e-05 [renormalize]: 1.69995e-07 [detach_backward]: 4.68001e-06 [pipeline_parallel_scheduler]: 2.20002e-06 [auto_monad_reorder]: 2.449e-05 [get_jit_bprop_graph]: 2.02999e-06 [rewriter_after_jit_bprop_graph]: 6.08002e-06 [opt_after_jit_grad]: 0.00071487 [validate]: 4.97e-05 Sums bootstrap : 0.000468s : 3.46% type_inference : 0.006496s : 48.02% event_method : 0.000020s : 0.15% auto_monad : 0.000070s : 0.52% graph_reusing : 0.000006s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000042s : 0.31% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000037s : 0.28% optimize.rewriter_before_opt_a : 0.000099s : 0.74% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000055s : 0.41% optimize.opt_a.loop_unroll : 0.000039s : 0.29% optimize.opt_a.a_1 : 0.000968s : 7.15% optimize.opt_a.with_stream_mark : 0.000039s : 0.29% optimize.opt_a.recompute_prepare : 0.000019s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000253s : 1.87% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.13% optimize.opt_a.shard : 0.000006s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.12% optimize.opt_a.merge_send_recv : 0.000019s : 0.14% optimize.opt_a.auto_parallel : 0.000016s : 0.12% optimize.opt_a.parallel : 0.000028s : 0.21% optimize.opt_a.flash_sp : 0.000014s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.07% optimize.opt_a.allreduce_fusion : 0.000009s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.14% optimize.opt_a.virtual_dataset : 0.000016s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.11% optimize.opt_a.virtual_output : 0.000029s : 0.21% optimize.opt_a.merge_forward : 0.000011s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000022s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000091s : 0.67% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000031s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000028s : 0.21% optimize.opt_a.a_after_grad : 0.000025s : 0.19% optimize.opt_a.renormalize : 0.000964s : 7.13% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.26% optimize.opt_a.cse : 0.000065s : 0.48% optimize.opt_a.a_3 : 0.000143s : 1.06% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000051s : 0.38% optimize.convert_after_rewriter : 0.000012s : 0.09% optimize.order_py_execute_after_rewriter : 0.000010s : 0.07% optimize.mutable_eliminate : 0.000764s : 5.65% optimize.opt_b.b_1 : 0.000205s : 1.52% optimize.opt_b.b_2 : 0.000010s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000032s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.17% optimize.overlap_param_gather : 0.000006s : 0.04% optimize.cconv : 0.000037s : 0.27% optimize.loop_unroll : 0.000605s : 4.47% optimize.opt_after_cconv.c_1 : 0.000043s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000030s : 0.22% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000022s : 0.16% optimize.tuple_transform.d_1 : 0.000061s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000001s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000066s : 0.49% optimize.cse_after_recomputation.cse : 0.000016s : 0.11% optimize.environ_conv : 0.000012s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000009s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.03% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.03% optimize.control_data_broadcast_order : 0.000019s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000006s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000029s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000024s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.04% opt_after_jit_grad : 0.000715s : 5.28% validate : 0.000050s : 0.37% Time group info: ------[substitution.] 0.000240 38 11.42% : 0.000027s : 3: substitution.cast_eliminate 0.95% : 0.000002s : 3: substitution.elim_not_effective 0.71% : 0.000002s : 3: substitution.fold_const_symbol 3.16% : 0.000008s : 5: substitution.graph_param_transform 68.92% : 0.000165s : 4: substitution.inline 2.33% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.95% : 0.000007s : 6: substitution.remove_not_recompute_node 2.58% : 0.000006s : 4: substitution.replace_old_param 6.97% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006436 2 87.21% : 0.005612s : 1: type_inference.infer 12.79% : 0.000823s : 1: type_inference.specialize ------[replace.] 0.000067 8 61.97% : 0.000042s : 4: replace.inline 38.03% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000177 8 91.72% : 0.000163s : 4: match.inline 8.28% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000282 1596 0.93% : 0.000003s : 17: predicate.accumulaten_eliminater 1.04% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 10: predicate.addn_check_dump 1.01% : 0.000003s : 17: predicate.addn_zero_filter 0.84% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.23% : 0.000006s : 27: predicate.arithmetic_simplify 1.05% : 0.000003s : 17: predicate.cast_eliminate 0.57% : 0.000002s : 10: predicate.check_bprop_eliminate 0.54% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000001s : 5: predicate.const_output_eliminate 0.59% : 0.000002s : 10: predicate.depend_value_elim 0.96% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.98% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 5: predicate.elim_not_effective 0.37% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.23% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_depend_swap 1.83% : 0.000005s : 32: predicate.environ_get_eliminate 1.25% : 0.000004s : 22: predicate.environ_get_set_eliminate 1.31% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.12% : 0.000006s : 25: predicate.float_depend_g_call 0.70% : 0.000002s : 10: predicate.float_environ_get_switch 0.91% : 0.000003s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.64% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.57% : 0.000002s : 10: predicate.incorporate_call 0.46% : 0.000001s : 10: predicate.incorporate_call_switch 6.07% : 0.000017s : 72: predicate.inline 0.83% : 0.000002s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.98% : 0.000003s : 10: predicate.less_batch_normalization 1.84% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.47% : 0.000007s : 48: predicate.load_eliminater 1.17% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.93% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.73% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.61% : 0.000002s : 10: predicate.merge_addn 0.60% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 17: predicate.minmaximum_grad 1.32% : 0.000004s : 5: predicate.mutable_eliminate 0.40% : 0.000001s : 5: predicate.opt_reshape 0.35% : 0.000001s : 5: predicate.parallel_virtual_node 1.56% : 0.000004s : 25: predicate.partial_defer_inline 1.55% : 0.000004s : 26: predicate.partial_eliminate 0.88% : 0.000002s : 17: predicate.print_const_string_wrapper 0.54% : 0.000002s : 10: predicate.reduce_all_const_elim 1.27% : 0.000004s : 17: predicate.reduce_eliminate 2.40% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 10: predicate.remove_not_recompute_node 1.23% : 0.000003s : 31: predicate.replace_applicator 0.51% : 0.000001s : 10: predicate.replace_old_param 0.45% : 0.000001s : 5: predicate.reset_defer_inline 1.15% : 0.000003s : 17: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.89% : 0.000003s : 10: predicate.same_eliminate 0.35% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 10: predicate.shard_identity_eliminate 0.81% : 0.000002s : 10: predicate.special_op_eliminate 0.65% : 0.000002s : 10: predicate.specialize_transform 1.07% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.92% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.43% : 0.000004s : 25: predicate.switch_defer_inline 1.90% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.40% : 0.000012s : 76: predicate.switch_simplify 0.95% : 0.000003s : 17: predicate.tile_eliminate 0.91% : 0.000003s : 17: predicate.transpose_eliminate 1.72% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.59% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.37% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.58% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.43% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.87% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.41% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.29% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 5: predicate.value_based_eliminate 0.64% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.58% : 0.000002s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000651 11 52.07% : 0.000339s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.93% : 0.000312s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.032492 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.56% : 0.003757s : 1: add_attr 11.50% : 0.003736s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000070s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.24% : 0.000079s : 1: auto_monad 0.10% : 0.000033s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.59% : 0.000516s : 1: bootstrap 0.12% : 0.000040s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000022s : 1: control_data_broadcast_order 0.05% : 0.000016s : 1: convert_after_rewriter 0.11% : 0.000036s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000024s : 1: detach_backward 0.04% : 0.000015s : 1: environ_conv 0.09% : 0.000031s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000012s : 1: label_micro_interleaved_index 1.89% : 0.000613s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 2.38% : 0.000773s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000024s : 1: opt.transform.mutable_eliminate 4.70% : 0.001527s : 78: opt.transform.opt_a 0.13% : 0.000041s : 1: opt.transform.opt_after_cconv 0.12% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.43% : 0.000140s : 28: opt.transform.opt_b 0.21% : 0.000067s : 2: opt.transform.opt_trans_graph 0.14% : 0.000046s : 4: opt.transform.symbol_engine_opt 12.07% : 0.003920s : 1: opt_a 0.51% : 0.000164s : 1: opt_after_cconv 2.24% : 0.000728s : 1: opt_after_jit_grad 1.05% : 0.000340s : 1: opt_b 22.28% : 0.007241s : 1: optimize 0.08% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.10% : 0.000032s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000050s : 1: pre_auto_parallel 0.13% : 0.000041s : 1: py_interpret_to_execute 0.07% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.08% : 0.000025s : 1: remove_dup_value 1.60% : 0.000520s : 1: renormalize.infer 1.33% : 0.000433s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000055s : 1: rewriter_after_opt_a 0.32% : 0.000103s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000110s : 1: symbol_engine_optimizer 0.35% : 0.000114s : 1: tuple_transform 20.15% : 0.006546s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:04.446.051 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0182432, [21] [bootstrap]: 0.00047873 [type_inference]: 0.00668037 [event_method]: 2.32e-05 [auto_monad]: 6.8e-05 [graph_reusing]: 6.01e-06 [inline]: 3.38999e-06 [add_attr]: 0.00391346, [1] [add_attr_with_inline]: 0.00390006, [1] [Cycle 1]: 7.802e-05, [2] [tag_attr]: 2.554e-05 [meta_addattr_fg_expand]: 6.74999e-06 [parallel-infer-symbol]: 3.81001e-06 [pre_auto_parallel]: 4.426e-05 [insert-virtual-dataset]: 2.83998e-06 [parallel-infer-symbol-second]: 7.79983e-07 [dataset_repeat_opt]: 2.37999e-06 [pipeline_split]: 1.94e-06 [optimize]: 0.00624574, [53] [py_interpret_to_execute]: 3.29e-05 [rewriter_before_opt_a]: 0.0001019 [opt_a]: 0.0036014, [2] [Cycle 1]: 0.00272742, [45] [expand_dump_flag]: 3.16001e-06 [switch_simplify]: 4.655e-05 [loop_unroll]: 6.576e-05 [a_1]: 0.00081227 [with_stream_mark]: 2.119e-05 [recompute_prepare]: 1.149e-05 [updatestate_depend_eliminate]: 5.30999e-06 [updatestate_assign_eliminate]: 4.03999e-06 [updatestate_loads_eliminate]: 3.94002e-06 [parameter_eliminate]: 2.27999e-06 [a_2]: 0.00010739 [accelerated_algorithm]: 9.32001e-06 [shard]: 2.94999e-06 [meta_shard_fg_expand]: 2.19001e-06 [shard_inline]: 8.58001e-06 [merge_send_recv]: 1.14e-05 [auto_parallel]: 8.80001e-06 [parallel]: 2.015e-05 [flash_sp]: 1.067e-05 [merge_comm]: 4.73001e-06 [allreduce_fusion]: 4.18001e-06 [matmul_add_comm_reduction]: 1.156e-05 [allreduce_slice_to_reducescatter]: 1.02e-06 [virtual_shard_identity]: 1.085e-05 [virtual_dataset]: 8.60999e-06 [get_grad_eliminate_]: 8.01001e-06 [virtual_output]: 8.13999e-06 [merge_forward]: 4.72e-06 [cell_reuse_recompute_pass]: 1.40001e-06 [offload_activation]: 1.123e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.652e-05 [merge_recompute_call_nodes]: 1.76e-06 [before_grad]: 1.368e-05 [set_forward_comm_id_for_comm_node_pass]: 4.76002e-06 [meta_fg_expand]: 3.75e-06 [flash_sp_send_recv_attached]: 3.13998e-06 [receive_attached]: 2.56e-06 [after_resolve]: 1.421e-05 [a_after_grad]: 1.299e-05 [renormalize]: 0.00099099 [add_forward_monad_depend]: 8.67e-06 [auto_monad_grad]: 3.29001e-06 [auto_monad_eliminator]: 2.101e-05 [cse]: 4.213e-05 [a_3]: 7.396e-05 [Cycle 2]: 0.00086084, [45] [expand_dump_flag]: 2.27001e-06 [switch_simplify]: 1.076e-05 [loop_unroll]: 8.72998e-06 [a_1]: 0.00020544 [with_stream_mark]: 1.642e-05 [recompute_prepare]: 9.00999e-06 [updatestate_depend_eliminate]: 4.15e-06 [updatestate_assign_eliminate]: 3.35e-06 [updatestate_loads_eliminate]: 3.48999e-06 [parameter_eliminate]: 1.94999e-06 [a_2]: 9.669e-05 [accelerated_algorithm]: 8.23001e-06 [shard]: 2.58998e-06 [meta_shard_fg_expand]: 2.51998e-06 [shard_inline]: 8.57e-06 [merge_send_recv]: 9.14e-06 [auto_parallel]: 7.88001e-06 [parallel]: 8.77999e-06 [flash_sp]: 4.28999e-06 [merge_comm]: 4.90001e-06 [allreduce_fusion]: 5.07e-06 [matmul_add_comm_reduction]: 8.92e-06 [allreduce_slice_to_reducescatter]: 5.3001e-07 [virtual_shard_identity]: 9.52001e-06 [virtual_dataset]: 7.65e-06 [get_grad_eliminate_]: 7.36001e-06 [virtual_output]: 6.84999e-06 [merge_forward]: 4.15999e-06 [cell_reuse_recompute_pass]: 2.71e-06 [offload_activation]: 9.37001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.499e-05 [merge_recompute_call_nodes]: 1.10999e-06 [before_grad]: 1.41e-05 [set_forward_comm_id_for_comm_node_pass]: 4.85001e-06 [meta_fg_expand]: 3.21999e-06 [flash_sp_send_recv_attached]: 2.26e-06 [receive_attached]: 1.81e-06 [after_resolve]: 1.298e-05 [a_after_grad]: 1.199e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.97999e-06 [auto_monad_grad]: 1.15001e-06 [auto_monad_eliminator]: 1.064e-05 [cse]: 2.052e-05 [a_3]: 4.659e-05 [py_interpret_to_execute_after_opt_a]: 1.552e-05 [slice_cell_reuse_recomputed_activation]: 2.16e-06 [rewriter_after_opt_a]: 4.799e-05 [convert_after_rewriter]: 8.77e-06 [order_py_execute_after_rewriter]: 6.79999e-06 [mutable_eliminate]: 0.00086764 [opt_b]: 0.00027673, [1] [Cycle 1]: 0.00026911, [7] [b_1]: 0.00017081 [b_2]: 1.068e-05 [updatestate_depend_eliminate]: 8.21002e-06 [updatestate_assign_eliminate]: 3.78001e-06 [updatestate_loads_eliminate]: 3.46001e-06 [renormalize]: 6.30011e-07 [cse]: 3.138e-05 [optimize_parallel_all_gather_comm]: 2.111e-05 [overlap_param_gather]: 1.89e-06 [cconv]: 3.278e-05 [loop_unroll]: 0.000476 [opt_after_cconv]: 0.00012322, [1] [Cycle 1]: 0.00011711, [7] [c_1]: 3.94e-05 [parameter_eliminate]: 4.20999e-06 [updatestate_depend_eliminate]: 6.98e-06 [updatestate_assign_eliminate]: 3.01999e-06 [updatestate_loads_eliminate]: 3.13e-06 [cse]: 2.557e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.593e-05 [tuple_transform]: 8.969e-05, [1] [Cycle 1]: 8.481e-05, [4] [d_1]: 5.592e-05 [none_parameter_eliminate]: 1.63002e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 8.54e-06 [partial_unused_args_eliminate]: 1.97999e-06 [add_recomputation]: 5.979e-05 [cse_after_recomputation]: 2.671e-05, [1] [Cycle 1]: 2.142e-05, [1] [cse]: 1.575e-05 [environ_conv]: 7.34002e-06 [swap_dp_allreduce_reducescatter]: 6.01998e-06 [bias_add_comm_swap]: 2.86e-06 [label_micro_interleaved_index]: 4.58001e-06 [label_fine_grained_interleaved_index]: 3.01001e-06 [merge_cast_opt]: 1.29998e-06 [slice_recompute_activation]: 2.46e-06 [micro_interleaved_order_control]: 2.53003e-06 [assign_add_opt]: 1.17e-06 [ForceFp32Comm]: 8.70001e-07 [remove_cast_before_assign_add]: 1.05999e-06 [full_micro_interleaved_order_control]: 2.29999e-06 [reorder_send_recv_between_fp_bp]: 2.63998e-06 [comm_op_add_attrs]: 1.05001e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.11002e-06 [interleave_parallel_branches]: 1.05999e-06 [overlap_opt_shard_in_pipeline]: 1.15999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.21e-06 [control_data_broadcast_order]: 1.597e-05 [grouped_pairwise_exchange_alltoall]: 1.66e-06 [offloading_packed_experts]: 4.55001e-06 [overlap_recompute_and_grad_model_parallel]: 5.67999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.24998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.55999e-06 [overlap_recompute_comm]: 2.16e-06 [overlap_grad_ring_attention]: 5.37001e-06 [overlap_grad_flash_sp]: 2.459e-05 [begin_end_overlap_inline]: 6.30011e-07 [split_matmul_comm_elemetwise]: 2.32999e-06 [split_layernorm_comm]: 1.74e-06 [handle_group_info]: 1.27e-06 [symbol_engine_optimizer]: 8.79e-05, [1] [Cycle 1]: 8.357e-05, [6] [build]: 4.22998e-06 [elim_shapecalc]: 1.305e-05 [elim_not_effective]: 1.589e-05 [opt_reshape]: 8.27e-06 [fold_const_symbol]: 1.273e-05 [renormalize]: 1.90019e-07 [detach_backward]: 1.99e-06 [pipeline_parallel_scheduler]: 1.68002e-06 [auto_monad_reorder]: 1.998e-05 [get_jit_bprop_graph]: 2.21e-06 [rewriter_after_jit_bprop_graph]: 5.13002e-06 [opt_after_jit_grad]: 0.00051522 [validate]: 4.874e-05 Sums bootstrap : 0.000479s : 3.61% type_inference : 0.006680s : 50.33% event_method : 0.000023s : 0.17% auto_monad : 0.000068s : 0.51% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000044s : 0.33% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.25% optimize.rewriter_before_opt_a : 0.000102s : 0.77% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000057s : 0.43% optimize.opt_a.loop_unroll : 0.000074s : 0.56% optimize.opt_a.a_1 : 0.001018s : 7.67% optimize.opt_a.with_stream_mark : 0.000038s : 0.28% optimize.opt_a.recompute_prepare : 0.000021s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000204s : 1.54% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.13% optimize.opt_a.shard : 0.000006s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000017s : 0.13% optimize.opt_a.merge_send_recv : 0.000021s : 0.15% optimize.opt_a.auto_parallel : 0.000017s : 0.13% optimize.opt_a.parallel : 0.000029s : 0.22% optimize.opt_a.flash_sp : 0.000015s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.07% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.15% optimize.opt_a.virtual_dataset : 0.000016s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.11% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.24% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000028s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.07% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.20% optimize.opt_a.a_after_grad : 0.000025s : 0.19% optimize.opt_a.renormalize : 0.000991s : 7.47% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.24% optimize.opt_a.cse : 0.000063s : 0.47% optimize.opt_a.a_3 : 0.000121s : 0.91% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000048s : 0.36% optimize.convert_after_rewriter : 0.000009s : 0.07% optimize.order_py_execute_after_rewriter : 0.000007s : 0.05% optimize.mutable_eliminate : 0.000868s : 6.54% optimize.opt_b.b_1 : 0.000171s : 1.29% optimize.opt_b.b_2 : 0.000011s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000031s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000033s : 0.25% optimize.loop_unroll : 0.000476s : 3.59% optimize.opt_after_cconv.c_1 : 0.000039s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000026s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.12% optimize.tuple_transform.d_1 : 0.000056s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000060s : 0.45% optimize.cse_after_recomputation.cse : 0.000016s : 0.12% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000025s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.15% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000515s : 3.88% validate : 0.000049s : 0.37% Time group info: ------[substitution.] 0.000245 38 11.75% : 0.000029s : 3: substitution.cast_eliminate 0.86% : 0.000002s : 3: substitution.elim_not_effective 0.97% : 0.000002s : 3: substitution.fold_const_symbol 3.11% : 0.000008s : 5: substitution.graph_param_transform 69.10% : 0.000169s : 4: substitution.inline 2.55% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.71% : 0.000007s : 6: substitution.remove_not_recompute_node 2.28% : 0.000006s : 4: substitution.replace_old_param 6.68% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006603 2 87.59% : 0.005783s : 1: type_inference.infer 12.41% : 0.000819s : 1: type_inference.specialize ------[replace.] 0.000070 8 61.37% : 0.000043s : 4: replace.inline 38.63% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000181 8 92.09% : 0.000166s : 4: match.inline 7.91% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000285 1596 0.99% : 0.000003s : 17: predicate.accumulaten_eliminater 0.61% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 10: predicate.addn_check_dump 0.98% : 0.000003s : 17: predicate.addn_zero_filter 0.85% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.17% : 0.000006s : 27: predicate.arithmetic_simplify 1.03% : 0.000003s : 17: predicate.cast_eliminate 0.60% : 0.000002s : 10: predicate.check_bprop_eliminate 0.52% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000001s : 5: predicate.const_output_eliminate 0.55% : 0.000002s : 10: predicate.depend_value_elim 1.12% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.12% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.92% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.87% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 5: predicate.elim_not_effective 0.36% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 22: predicate.environ_get_depend_swap 1.63% : 0.000005s : 32: predicate.environ_get_eliminate 1.16% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.39% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.26% : 0.000006s : 25: predicate.float_depend_g_call 0.63% : 0.000002s : 10: predicate.float_environ_get_switch 0.80% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.76% : 0.000002s : 10: predicate.get_grad_eliminate 0.18% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000002s : 10: predicate.incorporate_call 0.47% : 0.000001s : 10: predicate.incorporate_call_switch 6.24% : 0.000018s : 72: predicate.inline 0.75% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.97% : 0.000003s : 10: predicate.less_batch_normalization 1.93% : 0.000006s : 31: predicate.list_to_tuple_eliminator_ 2.54% : 0.000007s : 48: predicate.load_eliminater 0.72% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.93% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.84% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.61% : 0.000002s : 10: predicate.merge_addn 0.68% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.65% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.90% : 0.000003s : 17: predicate.minmaximum_grad 1.31% : 0.000004s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.87% : 0.000005s : 25: predicate.partial_defer_inline 1.55% : 0.000004s : 26: predicate.partial_eliminate 0.97% : 0.000003s : 17: predicate.print_const_string_wrapper 0.59% : 0.000002s : 10: predicate.reduce_all_const_elim 1.39% : 0.000004s : 17: predicate.reduce_eliminate 2.58% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 10: predicate.remove_not_recompute_node 1.37% : 0.000004s : 31: predicate.replace_applicator 0.39% : 0.000001s : 10: predicate.replace_old_param 0.29% : 0.000001s : 5: predicate.reset_defer_inline 1.02% : 0.000003s : 17: predicate.reshape_eliminate 0.55% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 5: predicate.row_tensor_eliminate 0.75% : 0.000002s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 10: predicate.shard_identity_eliminate 0.71% : 0.000002s : 10: predicate.special_op_eliminate 0.78% : 0.000002s : 10: predicate.specialize_transform 0.91% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.98% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.57% : 0.000004s : 25: predicate.switch_defer_inline 2.17% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.95% : 0.000014s : 76: predicate.switch_simplify 0.94% : 0.000003s : 17: predicate.tile_eliminate 0.95% : 0.000003s : 17: predicate.transpose_eliminate 1.58% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.74% : 0.000005s : 27: predicate.tuple_list_get_set_item_eliminator 2.20% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.68% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.42% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.14% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.62% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.56% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000637 11 51.97% : 0.000331s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.03% : 0.000306s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.031208 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.56% : 0.003920s : 1: add_attr 12.51% : 0.003905s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.20% : 0.000064s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.24% : 0.000073s : 1: auto_monad 0.08% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.63% : 0.000508s : 1: bootstrap 0.12% : 0.000036s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.10% : 0.000030s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000011s : 1: environ_conv 0.10% : 0.000031s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.55% : 0.000485s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.82% : 0.000879s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000025s : 1: opt.transform.mutable_eliminate 5.14% : 0.001604s : 78: opt.transform.opt_a 0.12% : 0.000038s : 1: opt.transform.opt_after_cconv 0.10% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.47% : 0.000145s : 28: opt.transform.opt_b 0.20% : 0.000062s : 2: opt.transform.opt_trans_graph 0.15% : 0.000046s : 4: opt.transform.symbol_engine_opt 11.55% : 0.003606s : 1: opt_a 0.41% : 0.000127s : 1: opt_after_cconv 1.68% : 0.000526s : 1: opt_after_jit_grad 0.90% : 0.000280s : 1: opt_b 20.03% : 0.006251s : 1: optimize 0.08% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000010s : 1: order_py_execute_after_rewriter 0.09% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.16% : 0.000049s : 1: pre_auto_parallel 0.12% : 0.000037s : 1: py_interpret_to_execute 0.06% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000005s : 1: remove_cast_before_assign_add 0.06% : 0.000020s : 1: remove_dup_value 1.76% : 0.000549s : 1: renormalize.infer 1.38% : 0.000431s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000053s : 1: rewriter_after_opt_a 0.34% : 0.000107s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.29% : 0.000091s : 1: symbol_engine_optimizer 0.30% : 0.000093s : 1: tuple_transform 21.49% : 0.006707s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:04.775.693 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:04.775.998 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0218804, [21] [bootstrap]: 0.00122618 [type_inference]: 0.00697791 [event_method]: 2.203e-05 [auto_monad]: 7.273e-05 [graph_reusing]: 6.17999e-06 [inline]: 2.68998e-06 [add_attr]: 0.00412352, [1] [add_attr_with_inline]: 0.00411047, [1] [Cycle 1]: 9.781e-05, [2] [tag_attr]: 2.596e-05 [meta_addattr_fg_expand]: 6.19001e-06 [parallel-infer-symbol]: 4.53001e-06 [pre_auto_parallel]: 4.372e-05 [insert-virtual-dataset]: 2.93e-06 [parallel-infer-symbol-second]: 1.37e-06 [dataset_repeat_opt]: 2.17001e-06 [pipeline_split]: 1.54998e-06 [optimize]: 0.00746151, [53] [py_interpret_to_execute]: 3.937e-05 [rewriter_before_opt_a]: 0.00010754 [opt_a]: 0.00419255, [2] [Cycle 1]: 0.00303183, [45] [expand_dump_flag]: 3.46999e-06 [switch_simplify]: 4.555e-05 [loop_unroll]: 3.186e-05 [a_1]: 0.00082766 [with_stream_mark]: 2.716e-05 [recompute_prepare]: 1.539e-05 [updatestate_depend_eliminate]: 5.34e-06 [updatestate_assign_eliminate]: 4.89e-06 [updatestate_loads_eliminate]: 4.03001e-06 [parameter_eliminate]: 1.86003e-06 [a_2]: 0.00013209 [accelerated_algorithm]: 9.67999e-06 [shard]: 2.44001e-06 [meta_shard_fg_expand]: 2.75002e-06 [shard_inline]: 8.47998e-06 [merge_send_recv]: 1.218e-05 [auto_parallel]: 1.063e-05 [parallel]: 2.215e-05 [flash_sp]: 1.2e-05 [merge_comm]: 5.25999e-06 [allreduce_fusion]: 4.55001e-06 [matmul_add_comm_reduction]: 1.331e-05 [allreduce_slice_to_reducescatter]: 9.20001e-07 [virtual_shard_identity]: 1.057e-05 [virtual_dataset]: 8.63001e-06 [get_grad_eliminate_]: 9.09e-06 [virtual_output]: 8.38001e-06 [merge_forward]: 5.62001e-06 [cell_reuse_recompute_pass]: 2.29001e-06 [offload_activation]: 1.18e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.227e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.467e-05 [set_forward_comm_id_for_comm_node_pass]: 5.72001e-06 [meta_fg_expand]: 3.99002e-06 [flash_sp_send_recv_attached]: 3.93999e-06 [receive_attached]: 2.19001e-06 [after_resolve]: 1.845e-05 [a_after_grad]: 1.421e-05 [renormalize]: 0.00105597 [add_forward_monad_depend]: 9.67999e-06 [auto_monad_grad]: 2.86999e-06 [auto_monad_eliminator]: 2.604e-05 [cse]: 4.065e-05 [a_3]: 8.617e-05 [Cycle 2]: 0.00114096, [45] [expand_dump_flag]: 2.53e-06 [switch_simplify]: 1.031e-05 [loop_unroll]: 7.98999e-06 [a_1]: 0.0002167 [with_stream_mark]: 2.375e-05 [recompute_prepare]: 1.164e-05 [updatestate_depend_eliminate]: 5.27001e-06 [updatestate_assign_eliminate]: 3.9e-06 [updatestate_loads_eliminate]: 3.86001e-06 [parameter_eliminate]: 2.07999e-06 [a_2]: 0.00012636 [accelerated_algorithm]: 9.42999e-06 [shard]: 2.81e-06 [meta_shard_fg_expand]: 2.76999e-06 [shard_inline]: 7.66001e-06 [merge_send_recv]: 1.009e-05 [auto_parallel]: 1.155e-05 [parallel]: 1.596e-05 [flash_sp]: 4.48001e-06 [merge_comm]: 4.58001e-06 [allreduce_fusion]: 4.65999e-06 [matmul_add_comm_reduction]: 1.22e-05 [allreduce_slice_to_reducescatter]: 7.90023e-07 [virtual_shard_identity]: 1.153e-05 [virtual_dataset]: 7.83999e-06 [get_grad_eliminate_]: 7.35e-06 [virtual_output]: 8.93002e-06 [merge_forward]: 4.93001e-06 [cell_reuse_recompute_pass]: 3.53e-06 [offload_activation]: 1.221e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.759e-05 [merge_recompute_call_nodes]: 2.02999e-06 [before_grad]: 1.473e-05 [set_forward_comm_id_for_comm_node_pass]: 5.85002e-06 [meta_fg_expand]: 3.60998e-06 [flash_sp_send_recv_attached]: 1.91998e-06 [receive_attached]: 2.14999e-06 [after_resolve]: 1.644e-05 [a_after_grad]: 1.192e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 4e-06 [auto_monad_grad]: 2.46998e-06 [auto_monad_eliminator]: 1.548e-05 [cse]: 2.358e-05 [a_3]: 6.28e-05 [py_interpret_to_execute_after_opt_a]: 2.424e-05 [slice_cell_reuse_recomputed_activation]: 5.27001e-06 [rewriter_after_opt_a]: 5.429e-05 [convert_after_rewriter]: 1.327e-05 [order_py_execute_after_rewriter]: 9.67999e-06 [mutable_eliminate]: 0.00085711 [opt_b]: 0.00036431, [1] [Cycle 1]: 0.00035081, [7] [b_1]: 0.00021463 [b_2]: 1.28e-05 [updatestate_depend_eliminate]: 9.66e-06 [updatestate_assign_eliminate]: 3.93999e-06 [updatestate_loads_eliminate]: 3.79002e-06 [renormalize]: 1.25999e-06 [cse]: 4.064e-05 [optimize_parallel_all_gather_comm]: 2.837e-05 [overlap_param_gather]: 5.43997e-06 [cconv]: 4.365e-05 [loop_unroll]: 0.00069763 [opt_after_cconv]: 0.0001642, [1] [Cycle 1]: 0.00015217, [7] [c_1]: 4.036e-05 [parameter_eliminate]: 4.01001e-06 [updatestate_depend_eliminate]: 8.73001e-06 [updatestate_assign_eliminate]: 3.41999e-06 [updatestate_loads_eliminate]: 3.31001e-06 [cse]: 3.325e-05 [renormalize]: 9.49978e-07 [remove_dup_value]: 2.17e-05 [tuple_transform]: 0.00011142, [1] [Cycle 1]: 0.00010361, [4] [d_1]: 6.117e-05 [none_parameter_eliminate]: 2.17999e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 8.90999e-06 [partial_unused_args_eliminate]: 4.74998e-06 [add_recomputation]: 6.621e-05 [cse_after_recomputation]: 3.542e-05, [1] [Cycle 1]: 2.76e-05, [1] [cse]: 1.782e-05 [environ_conv]: 1.119e-05 [swap_dp_allreduce_reducescatter]: 9.57001e-06 [bias_add_comm_swap]: 6.36e-06 [label_micro_interleaved_index]: 9.17999e-06 [label_fine_grained_interleaved_index]: 5.19998e-06 [merge_cast_opt]: 4.06001e-06 [slice_recompute_activation]: 4.48001e-06 [micro_interleaved_order_control]: 5.44e-06 [assign_add_opt]: 3.75998e-06 [ForceFp32Comm]: 3.51001e-06 [remove_cast_before_assign_add]: 3.5e-06 [full_micro_interleaved_order_control]: 4.68999e-06 [reorder_send_recv_between_fp_bp]: 5.80002e-06 [comm_op_add_attrs]: 3.3e-06 [add_comm_op_reuse_tag]: 3.43999e-06 [interleave_split_concat_branches]: 3.41999e-06 [interleave_parallel_branches]: 3.55e-06 [overlap_opt_shard_in_pipeline]: 3.78999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.62998e-06 [control_data_broadcast_order]: 2.093e-05 [grouped_pairwise_exchange_alltoall]: 3.9e-06 [offloading_packed_experts]: 7.88001e-06 [overlap_recompute_and_grad_model_parallel]: 7.92e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.64002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.66001e-06 [overlap_recompute_comm]: 5.67001e-06 [overlap_grad_ring_attention]: 7.58001e-06 [overlap_grad_flash_sp]: 2.688e-05 [begin_end_overlap_inline]: 2.96001e-06 [split_matmul_comm_elemetwise]: 4.88001e-06 [split_layernorm_comm]: 3.97002e-06 [handle_group_info]: 3.46001e-06 [symbol_engine_optimizer]: 0.00017365, [1] [Cycle 1]: 0.00016535, [6] [build]: 3.63999e-06 [elim_shapecalc]: 1.252e-05 [elim_not_effective]: 1.775e-05 [opt_reshape]: 9.47999e-06 [fold_const_symbol]: 6.936e-05 [renormalize]: 2.30008e-07 [detach_backward]: 5.92999e-06 [pipeline_parallel_scheduler]: 2.74999e-06 [auto_monad_reorder]: 2.851e-05 [get_jit_bprop_graph]: 2.19001e-06 [rewriter_after_jit_bprop_graph]: 7.88999e-06 [opt_after_jit_grad]: 0.00080313 [validate]: 5.718e-05 Sums bootstrap : 0.001226s : 7.97% type_inference : 0.006978s : 45.34% event_method : 0.000022s : 0.14% auto_monad : 0.000073s : 0.47% graph_reusing : 0.000006s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000005s : 0.03% pre_auto_parallel : 0.000044s : 0.28% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000039s : 0.26% optimize.rewriter_before_opt_a : 0.000108s : 0.70% optimize.opt_a.expand_dump_flag : 0.000006s : 0.04% optimize.opt_a.switch_simplify : 0.000056s : 0.36% optimize.opt_a.loop_unroll : 0.000040s : 0.26% optimize.opt_a.a_1 : 0.001044s : 6.79% optimize.opt_a.with_stream_mark : 0.000051s : 0.33% optimize.opt_a.recompute_prepare : 0.000027s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000258s : 1.68% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.12% optimize.opt_a.shard : 0.000005s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.10% optimize.opt_a.merge_send_recv : 0.000022s : 0.14% optimize.opt_a.auto_parallel : 0.000022s : 0.14% optimize.opt_a.parallel : 0.000038s : 0.25% optimize.opt_a.flash_sp : 0.000016s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.06% optimize.opt_a.allreduce_fusion : 0.000009s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.14% optimize.opt_a.virtual_dataset : 0.000016s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.11% optimize.opt_a.virtual_output : 0.000017s : 0.11% optimize.opt_a.merge_forward : 0.000011s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.04% optimize.opt_a.offload_activation : 0.000024s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000050s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.02% optimize.opt_a.before_grad : 0.000029s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.08% optimize.opt_a.meta_fg_expand : 0.000008s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000035s : 0.23% optimize.opt_a.a_after_grad : 0.000026s : 0.17% optimize.opt_a.renormalize : 0.001056s : 6.86% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.09% optimize.opt_a.auto_monad_grad : 0.000005s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000042s : 0.27% optimize.opt_a.cse : 0.000064s : 0.42% optimize.opt_a.a_3 : 0.000149s : 0.97% optimize.py_interpret_to_execute_after_opt_a : 0.000024s : 0.16% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.03% optimize.rewriter_after_opt_a : 0.000054s : 0.35% optimize.convert_after_rewriter : 0.000013s : 0.09% optimize.order_py_execute_after_rewriter : 0.000010s : 0.06% optimize.mutable_eliminate : 0.000857s : 5.57% optimize.opt_b.b_1 : 0.000215s : 1.39% optimize.opt_b.b_2 : 0.000013s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000041s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000044s : 0.28% optimize.loop_unroll : 0.000698s : 4.53% optimize.opt_after_cconv.c_1 : 0.000040s : 0.26% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000033s : 0.22% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000022s : 0.14% optimize.tuple_transform.d_1 : 0.000061s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.03% optimize.add_recomputation : 0.000066s : 0.43% optimize.cse_after_recomputation.cse : 0.000018s : 0.12% optimize.environ_conv : 0.000011s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.06% optimize.bias_add_comm_swap : 0.000006s : 0.04% optimize.label_micro_interleaved_index : 0.000009s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.03% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000004s : 0.02% optimize.remove_cast_before_assign_add : 0.000003s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000003s : 0.02% optimize.interleave_parallel_branches : 0.000004s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.03% optimize.control_data_broadcast_order : 0.000021s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000006s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.05% optimize.overlap_grad_flash_sp : 0.000027s : 0.17% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.03% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.02% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000069s : 0.45% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.04% pipeline_parallel_scheduler : 0.000003s : 0.02% auto_monad_reorder : 0.000029s : 0.19% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000008s : 0.05% opt_after_jit_grad : 0.000803s : 5.22% validate : 0.000057s : 0.37% Time group info: ------[substitution.] 0.000322 38 10.23% : 0.000033s : 3: substitution.cast_eliminate 0.79% : 0.000003s : 3: substitution.elim_not_effective 17.09% : 0.000055s : 3: substitution.fold_const_symbol 2.56% : 0.000008s : 5: substitution.graph_param_transform 56.62% : 0.000182s : 4: substitution.inline 1.85% : 0.000006s : 6: substitution.j_node_and_user_rematch 3.32% : 0.000011s : 6: substitution.remove_not_recompute_node 2.46% : 0.000008s : 4: substitution.replace_old_param 5.08% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006911 2 87.73% : 0.006063s : 1: type_inference.infer 12.27% : 0.000848s : 1: type_inference.specialize ------[replace.] 0.000080 8 59.57% : 0.000047s : 4: replace.inline 40.43% : 0.000032s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000194 8 92.57% : 0.000179s : 4: match.inline 7.43% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000290 1596 0.95% : 0.000003s : 17: predicate.accumulaten_eliminater 1.07% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.63% : 0.000002s : 10: predicate.addn_check_dump 1.04% : 0.000003s : 17: predicate.addn_zero_filter 0.78% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.04% : 0.000006s : 27: predicate.arithmetic_simplify 1.08% : 0.000003s : 17: predicate.cast_eliminate 0.67% : 0.000002s : 10: predicate.check_bprop_eliminate 0.55% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000001s : 5: predicate.const_output_eliminate 0.70% : 0.000002s : 10: predicate.depend_value_elim 0.90% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.07% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.96% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 5: predicate.elim_not_effective 0.43% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000004s : 22: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_depend_swap 1.67% : 0.000005s : 32: predicate.environ_get_eliminate 1.23% : 0.000004s : 22: predicate.environ_get_set_eliminate 1.32% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.37% : 0.000007s : 25: predicate.float_depend_g_call 0.53% : 0.000002s : 10: predicate.float_environ_get_switch 0.77% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 5: predicate.fold_const_symbol 0.57% : 0.000002s : 10: predicate.get_grad_eliminate 0.25% : 0.000001s : 5: predicate.graph_param_transform 0.55% : 0.000002s : 10: predicate.incorporate_call 0.45% : 0.000001s : 10: predicate.incorporate_call_switch 6.23% : 0.000018s : 72: predicate.inline 0.86% : 0.000003s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.88% : 0.000003s : 10: predicate.less_batch_normalization 1.75% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.48% : 0.000007s : 48: predicate.load_eliminater 0.99% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.85% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.65% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 10: predicate.merge_addn 0.58% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000002s : 10: predicate.mini_step_allgather_replace 1.07% : 0.000003s : 17: predicate.minmaximum_grad 1.14% : 0.000003s : 5: predicate.mutable_eliminate 0.38% : 0.000001s : 5: predicate.opt_reshape 0.40% : 0.000001s : 5: predicate.parallel_virtual_node 1.74% : 0.000005s : 25: predicate.partial_defer_inline 1.54% : 0.000004s : 26: predicate.partial_eliminate 1.13% : 0.000003s : 17: predicate.print_const_string_wrapper 0.54% : 0.000002s : 10: predicate.reduce_all_const_elim 1.31% : 0.000004s : 17: predicate.reduce_eliminate 2.47% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 10: predicate.remove_not_recompute_node 1.47% : 0.000004s : 31: predicate.replace_applicator 0.51% : 0.000001s : 10: predicate.replace_old_param 0.40% : 0.000001s : 5: predicate.reset_defer_inline 0.98% : 0.000003s : 17: predicate.reshape_eliminate 0.60% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 5: predicate.row_tensor_eliminate 0.86% : 0.000002s : 10: predicate.same_eliminate 0.63% : 0.000002s : 10: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 10: predicate.shard_identity_eliminate 0.63% : 0.000002s : 10: predicate.special_op_eliminate 0.74% : 0.000002s : 10: predicate.specialize_transform 1.01% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.32% : 0.000004s : 25: predicate.switch_defer_inline 1.90% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.54% : 0.000013s : 76: predicate.switch_simplify 0.86% : 0.000002s : 17: predicate.tile_eliminate 0.92% : 0.000003s : 17: predicate.transpose_eliminate 1.76% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.73% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.60% : 0.000005s : 27: predicate.tuple_list_get_item_depend_reorder 3.31% : 0.000010s : 41: predicate.tuple_list_get_item_eliminator 1.67% : 0.000005s : 27: predicate.tuple_list_get_set_item_eliminator 2.49% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.63% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.34% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.05% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.71% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.74% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000660 11 51.44% : 0.000340s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.56% : 0.000321s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.036422 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.36% : 0.004137s : 1: add_attr 11.30% : 0.004115s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.19% : 0.000070s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.23% : 0.000082s : 1: auto_monad 0.10% : 0.000037s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000009s : 1: bias_add_comm_swap 3.50% : 0.001274s : 1: bootstrap 0.13% : 0.000047s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000024s : 1: control_data_broadcast_order 0.05% : 0.000018s : 1: convert_after_rewriter 0.11% : 0.000039s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000028s : 1: detach_backward 0.04% : 0.000014s : 1: environ_conv 0.09% : 0.000033s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.02% : 0.000009s : 1: inline 0.03% : 0.000010s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000012s : 1: label_micro_interleaved_index 1.94% : 0.000707s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 2.39% : 0.000869s : 1: mutable_eliminate 0.03% : 0.000011s : 1: offloading_packed_experts 0.07% : 0.000024s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000030s : 1: opt.transform.mutable_eliminate 4.46% : 0.001625s : 78: opt.transform.opt_a 0.11% : 0.000039s : 1: opt.transform.opt_after_cconv 0.11% : 0.000039s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000146s : 28: opt.transform.opt_b 0.19% : 0.000068s : 2: opt.transform.opt_trans_graph 0.29% : 0.000105s : 4: opt.transform.symbol_engine_opt 11.52% : 0.004197s : 1: opt_a 0.46% : 0.000168s : 1: opt_after_cconv 2.25% : 0.000818s : 1: opt_after_jit_grad 1.01% : 0.000368s : 1: opt_b 22.49% : 0.008191s : 1: optimize 0.09% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.08% : 0.000030s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000011s : 1: overlap_grad_ring_attention 0.02% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000009s : 1: overlap_recompute_comm 0.03% : 0.000012s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000011s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000052s : 1: pre_auto_parallel 0.12% : 0.000043s : 1: py_interpret_to_execute 0.08% : 0.000029s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000025s : 1: remove_dup_value 1.60% : 0.000584s : 1: renormalize.infer 1.26% : 0.000460s : 1: renormalize.specialize 0.02% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000059s : 1: rewriter_after_opt_a 0.31% : 0.000113s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.03% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.49% : 0.000177s : 1: symbol_engine_optimizer 0.31% : 0.000114s : 1: tuple_transform 19.31% : 0.007032s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:05.984.53 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0182997, [21] [bootstrap]: 0.00046886 [type_inference]: 0.00641481 [event_method]: 2.08e-05 [auto_monad]: 6.789e-05 [graph_reusing]: 6.09001e-06 [inline]: 2.68003e-06 [add_attr]: 0.00371038, [1] [add_attr_with_inline]: 0.0036978, [1] [Cycle 1]: 7.508e-05, [2] [tag_attr]: 2.413e-05 [meta_addattr_fg_expand]: 6.94001e-06 [parallel-infer-symbol]: 3.8e-06 [pre_auto_parallel]: 4.164e-05 [insert-virtual-dataset]: 2.76e-06 [parallel-infer-symbol-second]: 8.60018e-07 [dataset_repeat_opt]: 2.18002e-06 [pipeline_split]: 1.71e-06 [optimize]: 0.00659003, [53] [py_interpret_to_execute]: 2.937e-05 [rewriter_before_opt_a]: 9.346e-05 [opt_a]: 0.00366385, [2] [Cycle 1]: 0.00276614, [45] [expand_dump_flag]: 3.25e-06 [switch_simplify]: 4.547e-05 [loop_unroll]: 3.132e-05 [a_1]: 0.00085446 [with_stream_mark]: 2.54e-05 [recompute_prepare]: 1.277e-05 [updatestate_depend_eliminate]: 5.46e-06 [updatestate_assign_eliminate]: 3.81999e-06 [updatestate_loads_eliminate]: 3.66999e-06 [parameter_eliminate]: 2.61e-06 [a_2]: 0.00010462 [accelerated_algorithm]: 9.44998e-06 [shard]: 2.02999e-06 [meta_shard_fg_expand]: 2.58e-06 [shard_inline]: 8.37e-06 [merge_send_recv]: 1.014e-05 [auto_parallel]: 9.19e-06 [parallel]: 1.96e-05 [flash_sp]: 1.16e-05 [merge_comm]: 5.03002e-06 [allreduce_fusion]: 4.18999e-06 [matmul_add_comm_reduction]: 1.153e-05 [allreduce_slice_to_reducescatter]: 8.89995e-07 [virtual_shard_identity]: 1.143e-05 [virtual_dataset]: 8.1e-06 [get_grad_eliminate_]: 7.70998e-06 [virtual_output]: 7.95998e-06 [merge_forward]: 4.62e-06 [cell_reuse_recompute_pass]: 1.63002e-06 [offload_activation]: 1.171e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.859e-05 [merge_recompute_call_nodes]: 2.09999e-06 [before_grad]: 1.419e-05 [set_forward_comm_id_for_comm_node_pass]: 4.70999e-06 [meta_fg_expand]: 3.75998e-06 [flash_sp_send_recv_attached]: 3.19001e-06 [receive_attached]: 2.46e-06 [after_resolve]: 1.419e-05 [a_after_grad]: 1.292e-05 [renormalize]: 0.00101423 [add_forward_monad_depend]: 8.2e-06 [auto_monad_grad]: 2.81e-06 [auto_monad_eliminator]: 2.422e-05 [cse]: 4.266e-05 [a_3]: 6.947e-05 [Cycle 2]: 0.00088265, [45] [expand_dump_flag]: 2.81e-06 [switch_simplify]: 1.04e-05 [loop_unroll]: 7.71999e-06 [a_1]: 0.00020005 [with_stream_mark]: 2.034e-05 [recompute_prepare]: 8.60999e-06 [updatestate_depend_eliminate]: 4.50001e-06 [updatestate_assign_eliminate]: 3.91001e-06 [updatestate_loads_eliminate]: 3.2e-06 [parameter_eliminate]: 1.60001e-06 [a_2]: 9.409e-05 [accelerated_algorithm]: 8.47998e-06 [shard]: 2.37999e-06 [meta_shard_fg_expand]: 2.79001e-06 [shard_inline]: 7.63001e-06 [merge_send_recv]: 9.27999e-06 [auto_parallel]: 1.015e-05 [parallel]: 7.75998e-06 [flash_sp]: 4.08999e-06 [merge_comm]: 4.99e-06 [allreduce_fusion]: 4.47998e-06 [matmul_add_comm_reduction]: 9.82999e-06 [allreduce_slice_to_reducescatter]: 1.03001e-06 [virtual_shard_identity]: 9.72001e-06 [virtual_dataset]: 7.46999e-06 [get_grad_eliminate_]: 8.54e-06 [virtual_output]: 6.94999e-06 [merge_forward]: 4.50999e-06 [cell_reuse_recompute_pass]: 2.68998e-06 [offload_activation]: 1.255e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.743e-05 [merge_recompute_call_nodes]: 1.31998e-06 [before_grad]: 1.371e-05 [set_forward_comm_id_for_comm_node_pass]: 4.50001e-06 [meta_fg_expand]: 3.68999e-06 [flash_sp_send_recv_attached]: 1.34e-06 [receive_attached]: 2.06e-06 [after_resolve]: 1.417e-05 [a_after_grad]: 1.197e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.95998e-06 [auto_monad_grad]: 1.79e-06 [auto_monad_eliminator]: 1.306e-05 [cse]: 2.781e-05 [a_3]: 4.932e-05 [py_interpret_to_execute_after_opt_a]: 1.829e-05 [slice_cell_reuse_recomputed_activation]: 2.49001e-06 [rewriter_after_opt_a]: 4.933e-05 [convert_after_rewriter]: 7.94997e-06 [order_py_execute_after_rewriter]: 6.23e-06 [mutable_eliminate]: 0.00086316 [opt_b]: 0.00028424, [1] [Cycle 1]: 0.00027504, [7] [b_1]: 0.00016395 [b_2]: 1.113e-05 [updatestate_depend_eliminate]: 1.008e-05 [updatestate_assign_eliminate]: 3.61001e-06 [updatestate_loads_eliminate]: 3.53e-06 [renormalize]: 1.31002e-06 [cse]: 3.998e-05 [optimize_parallel_all_gather_comm]: 2.34e-05 [overlap_param_gather]: 2.37001e-06 [cconv]: 3.658e-05 [loop_unroll]: 0.00067633 [opt_after_cconv]: 0.00014616, [1] [Cycle 1]: 0.00013683, [7] [c_1]: 4.141e-05 [parameter_eliminate]: 6.06998e-06 [updatestate_depend_eliminate]: 8.94998e-06 [updatestate_assign_eliminate]: 3.45e-06 [updatestate_loads_eliminate]: 3.04001e-06 [cse]: 3.524e-05 [renormalize]: 8.89995e-07 [remove_dup_value]: 1.876e-05 [tuple_transform]: 9.901e-05, [1] [Cycle 1]: 9.397e-05, [4] [d_1]: 6.214e-05 [none_parameter_eliminate]: 1.96e-06 [renormalize]: 3.00002e-07 [switch_simplify]: 8.79e-06 [partial_unused_args_eliminate]: 2.35002e-06 [add_recomputation]: 7.267e-05 [cse_after_recomputation]: 3.197e-05, [1] [Cycle 1]: 2.67e-05, [1] [cse]: 1.924e-05 [environ_conv]: 8.03001e-06 [swap_dp_allreduce_reducescatter]: 6.09999e-06 [bias_add_comm_swap]: 3.29001e-06 [label_micro_interleaved_index]: 5.24e-06 [label_fine_grained_interleaved_index]: 2.71e-06 [merge_cast_opt]: 1.47001e-06 [slice_recompute_activation]: 2.02999e-06 [micro_interleaved_order_control]: 2.31e-06 [assign_add_opt]: 1.66e-06 [ForceFp32Comm]: 9.60019e-07 [remove_cast_before_assign_add]: 9.50007e-07 [full_micro_interleaved_order_control]: 2.67001e-06 [reorder_send_recv_between_fp_bp]: 2.91e-06 [comm_op_add_attrs]: 1.12999e-06 [add_comm_op_reuse_tag]: 1.15999e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.28002e-06 [overlap_opt_shard_in_pipeline]: 1.39e-06 [overlap_opt_shard_grad_in_pipeline]: 2.34001e-06 [control_data_broadcast_order]: 1.86e-05 [grouped_pairwise_exchange_alltoall]: 2.19001e-06 [offloading_packed_experts]: 4.88001e-06 [overlap_recompute_and_grad_model_parallel]: 5.70001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.27e-06 [overlap_recompute_allgather_and_fa_grad]: 1.62999e-06 [overlap_recompute_comm]: 2.26e-06 [overlap_grad_ring_attention]: 4.88001e-06 [overlap_grad_flash_sp]: 2.475e-05 [begin_end_overlap_inline]: 4.69998e-07 [split_matmul_comm_elemetwise]: 2.28002e-06 [split_layernorm_comm]: 1.72999e-06 [handle_group_info]: 1.05001e-06 [symbol_engine_optimizer]: 9.794e-05, [1] [Cycle 1]: 9.192e-05, [6] [build]: 4.79998e-06 [elim_shapecalc]: 1.561e-05 [elim_not_effective]: 1.789e-05 [opt_reshape]: 9.12001e-06 [fold_const_symbol]: 1.261e-05 [renormalize]: 3.10014e-07 [detach_backward]: 2.29001e-06 [pipeline_parallel_scheduler]: 1.66e-06 [auto_monad_reorder]: 2.435e-05 [get_jit_bprop_graph]: 2.07001e-06 [rewriter_after_jit_bprop_graph]: 5.89e-06 [opt_after_jit_grad]: 0.00069577 [validate]: 5.744e-05 Sums bootstrap : 0.000469s : 3.48% type_inference : 0.006415s : 47.55% event_method : 0.000021s : 0.15% auto_monad : 0.000068s : 0.50% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000042s : 0.31% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.22% optimize.rewriter_before_opt_a : 0.000093s : 0.69% optimize.opt_a.expand_dump_flag : 0.000006s : 0.04% optimize.opt_a.switch_simplify : 0.000056s : 0.41% optimize.opt_a.loop_unroll : 0.000039s : 0.29% optimize.opt_a.a_1 : 0.001055s : 7.82% optimize.opt_a.with_stream_mark : 0.000046s : 0.34% optimize.opt_a.recompute_prepare : 0.000021s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000199s : 1.47% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.12% optimize.opt_a.merge_send_recv : 0.000019s : 0.14% optimize.opt_a.auto_parallel : 0.000019s : 0.14% optimize.opt_a.parallel : 0.000027s : 0.20% optimize.opt_a.flash_sp : 0.000016s : 0.12% optimize.opt_a.merge_comm : 0.000010s : 0.07% optimize.opt_a.allreduce_fusion : 0.000009s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.16% optimize.opt_a.virtual_dataset : 0.000016s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.11% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000024s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000028s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.03% optimize.opt_a.receive_attached : 0.000005s : 0.03% optimize.opt_a.after_resolve : 0.000028s : 0.21% optimize.opt_a.a_after_grad : 0.000025s : 0.18% optimize.opt_a.renormalize : 0.001014s : 7.52% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.08% optimize.opt_a.auto_monad_grad : 0.000005s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000037s : 0.28% optimize.opt_a.cse : 0.000070s : 0.52% optimize.opt_a.a_3 : 0.000119s : 0.88% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000049s : 0.37% optimize.convert_after_rewriter : 0.000008s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000863s : 6.40% optimize.opt_b.b_1 : 0.000164s : 1.22% optimize.opt_b.b_2 : 0.000011s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000040s : 0.30% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000037s : 0.27% optimize.loop_unroll : 0.000676s : 5.01% optimize.opt_after_cconv.c_1 : 0.000041s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000035s : 0.26% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000019s : 0.14% optimize.tuple_transform.d_1 : 0.000062s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000073s : 0.54% optimize.cse_after_recomputation.cse : 0.000019s : 0.14% optimize.environ_conv : 0.000008s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000019s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000025s : 0.18% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.04% opt_after_jit_grad : 0.000696s : 5.16% validate : 0.000057s : 0.43% Time group info: ------[substitution.] 0.000260 38 12.22% : 0.000032s : 3: substitution.cast_eliminate 0.98% : 0.000003s : 3: substitution.elim_not_effective 0.62% : 0.000002s : 3: substitution.fold_const_symbol 3.13% : 0.000008s : 5: substitution.graph_param_transform 69.84% : 0.000181s : 4: substitution.inline 2.41% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.63% : 0.000007s : 6: substitution.remove_not_recompute_node 2.23% : 0.000006s : 4: substitution.replace_old_param 5.94% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006345 2 87.72% : 0.005566s : 1: type_inference.infer 12.28% : 0.000779s : 1: type_inference.specialize ------[replace.] 0.000071 8 59.18% : 0.000042s : 4: replace.inline 40.82% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000192 8 92.99% : 0.000178s : 4: match.inline 7.01% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000287 1596 0.97% : 0.000003s : 17: predicate.accumulaten_eliminater 0.88% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000002s : 10: predicate.addn_check_dump 1.19% : 0.000003s : 17: predicate.addn_zero_filter 0.77% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.13% : 0.000006s : 27: predicate.arithmetic_simplify 1.39% : 0.000004s : 17: predicate.cast_eliminate 0.56% : 0.000002s : 10: predicate.check_bprop_eliminate 0.55% : 0.000002s : 10: predicate.compare_switch_simplify 0.19% : 0.000001s : 5: predicate.const_output_eliminate 0.57% : 0.000002s : 10: predicate.depend_value_elim 0.92% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.09% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.16% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 5: predicate.elim_not_effective 0.52% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.06% : 0.000003s : 22: predicate.environ_get_depend_swap 1.73% : 0.000005s : 32: predicate.environ_get_eliminate 1.28% : 0.000004s : 22: predicate.environ_get_set_eliminate 1.36% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.11% : 0.000006s : 25: predicate.float_depend_g_call 0.50% : 0.000001s : 10: predicate.float_environ_get_switch 0.75% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.23% : 0.000001s : 5: predicate.fold_const_symbol 0.72% : 0.000002s : 10: predicate.get_grad_eliminate 0.29% : 0.000001s : 5: predicate.graph_param_transform 0.54% : 0.000002s : 10: predicate.incorporate_call 0.49% : 0.000001s : 10: predicate.incorporate_call_switch 6.13% : 0.000018s : 72: predicate.inline 0.85% : 0.000002s : 10: predicate.inline_without_move 0.28% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.99% : 0.000003s : 10: predicate.less_batch_normalization 1.68% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.56% : 0.000007s : 48: predicate.load_eliminater 1.54% : 0.000004s : 5: predicate.loop_unroll_after_grad 1.82% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.64% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.54% : 0.000002s : 10: predicate.merge_addn 0.69% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.51% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 17: predicate.minmaximum_grad 1.58% : 0.000005s : 5: predicate.mutable_eliminate 0.40% : 0.000001s : 5: predicate.opt_reshape 0.36% : 0.000001s : 5: predicate.parallel_virtual_node 1.72% : 0.000005s : 25: predicate.partial_defer_inline 1.52% : 0.000004s : 26: predicate.partial_eliminate 0.94% : 0.000003s : 17: predicate.print_const_string_wrapper 0.54% : 0.000002s : 10: predicate.reduce_all_const_elim 1.22% : 0.000003s : 17: predicate.reduce_eliminate 2.41% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 10: predicate.remove_not_recompute_node 1.38% : 0.000004s : 31: predicate.replace_applicator 0.48% : 0.000001s : 10: predicate.replace_old_param 0.32% : 0.000001s : 5: predicate.reset_defer_inline 0.98% : 0.000003s : 17: predicate.reshape_eliminate 0.52% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 5: predicate.row_tensor_eliminate 0.78% : 0.000002s : 10: predicate.same_eliminate 0.43% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.79% : 0.000002s : 10: predicate.shard_identity_eliminate 0.93% : 0.000003s : 10: predicate.special_op_eliminate 0.82% : 0.000002s : 10: predicate.specialize_transform 1.09% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.47% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.37% : 0.000004s : 25: predicate.switch_defer_inline 1.82% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.49% : 0.000013s : 76: predicate.switch_simplify 0.98% : 0.000003s : 17: predicate.tile_eliminate 0.90% : 0.000003s : 17: predicate.transpose_eliminate 1.55% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.60% : 0.000005s : 27: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.72% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.33% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.05% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 5: predicate.value_based_eliminate 0.63% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.62% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000617 11 50.21% : 0.000310s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.79% : 0.000307s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.031441 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.82% : 0.003718s : 1: add_attr 11.77% : 0.003702s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.25% : 0.000078s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.23% : 0.000073s : 1: auto_monad 0.09% : 0.000029s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000007s : 1: bias_add_comm_swap 1.60% : 0.000503s : 1: bootstrap 0.13% : 0.000040s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000022s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000035s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.09% : 0.000028s : 1: event_method 0.02% : 0.000006s : 1: full_micro_interleaved_order_control 0.02% : 0.000006s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 2.19% : 0.000688s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.79% : 0.000877s : 1: mutable_eliminate 0.02% : 0.000008s : 1: offloading_packed_experts 0.08% : 0.000026s : 1: opt.transform.loop_unroll_optimizer 0.09% : 0.000027s : 1: opt.transform.mutable_eliminate 5.09% : 0.001601s : 78: opt.transform.opt_a 0.13% : 0.000040s : 1: opt.transform.opt_after_cconv 0.13% : 0.000039s : 1: opt.transform.opt_after_jit_grad 0.44% : 0.000139s : 28: opt.transform.opt_b 0.22% : 0.000068s : 2: opt.transform.opt_trans_graph 0.16% : 0.000050s : 4: opt.transform.symbol_engine_opt 11.66% : 0.003668s : 1: opt_a 0.48% : 0.000150s : 1: opt_after_cconv 2.26% : 0.000709s : 1: opt_after_jit_grad 0.92% : 0.000289s : 1: opt_b 20.98% : 0.006596s : 1: optimize 0.09% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000006s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.15% : 0.000046s : 1: pre_auto_parallel 0.11% : 0.000034s : 1: py_interpret_to_execute 0.07% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000005s : 1: remove_cast_before_assign_add 0.07% : 0.000023s : 1: remove_dup_value 1.77% : 0.000556s : 1: renormalize.infer 1.42% : 0.000446s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000054s : 1: rewriter_after_opt_a 0.31% : 0.000098s : 1: rewriter_before_opt_a 0.02% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000101s : 1: symbol_engine_optimizer 0.32% : 0.000102s : 1: tuple_transform 20.48% : 0.006438s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:05.419.589 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:05.419.890 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0181272, [21] [bootstrap]: 0.00043957 [type_inference]: 0.00577919 [event_method]: 2.197e-05 [auto_monad]: 7.198e-05 [graph_reusing]: 6.64999e-06 [inline]: 2.39001e-06 [add_attr]: 0.00327641, [1] [add_attr_with_inline]: 0.00326612, [1] [Cycle 1]: 7.709e-05, [2] [tag_attr]: 2.042e-05 [meta_addattr_fg_expand]: 6.47001e-06 [parallel-infer-symbol]: 4e-06 [pre_auto_parallel]: 3.75e-05 [insert-virtual-dataset]: 2.59001e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.84e-06 [pipeline_split]: 1.55001e-06 [optimize]: 0.00701291, [53] [py_interpret_to_execute]: 3.377e-05 [rewriter_before_opt_a]: 9.465e-05 [opt_a]: 0.00389803, [2] [Cycle 1]: 0.00277121, [45] [expand_dump_flag]: 3.45998e-06 [switch_simplify]: 4.381e-05 [loop_unroll]: 3.125e-05 [a_1]: 0.00071825 [with_stream_mark]: 1.735e-05 [recompute_prepare]: 1.234e-05 [updatestate_depend_eliminate]: 5.55001e-06 [updatestate_assign_eliminate]: 5.15999e-06 [updatestate_loads_eliminate]: 4.43999e-06 [parameter_eliminate]: 2.19001e-06 [a_2]: 0.00014918 [accelerated_algorithm]: 1.016e-05 [shard]: 1.99e-06 [meta_shard_fg_expand]: 2.21e-06 [shard_inline]: 8.87e-06 [merge_send_recv]: 1.131e-05 [auto_parallel]: 8.01001e-06 [parallel]: 1.955e-05 [flash_sp]: 8.40001e-06 [merge_comm]: 6.21e-06 [allreduce_fusion]: 5.17999e-06 [matmul_add_comm_reduction]: 1.062e-05 [allreduce_slice_to_reducescatter]: 8.00006e-07 [virtual_shard_identity]: 1.111e-05 [virtual_dataset]: 9.27001e-06 [get_grad_eliminate_]: 9.02e-06 [virtual_output]: 8.87e-06 [merge_forward]: 5.32001e-06 [cell_reuse_recompute_pass]: 1.50999e-06 [offload_activation]: 1.162e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.018e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.665e-05 [set_forward_comm_id_for_comm_node_pass]: 6.41998e-06 [meta_fg_expand]: 4.07003e-06 [flash_sp_send_recv_attached]: 3.14999e-06 [receive_attached]: 3.05002e-06 [after_resolve]: 1.471e-05 [a_after_grad]: 1.447e-05 [renormalize]: 0.00095579 [add_forward_monad_depend]: 7.98001e-06 [auto_monad_grad]: 2.98e-06 [auto_monad_eliminator]: 2.271e-05 [cse]: 4.697e-05 [a_3]: 8.986e-05 [Cycle 2]: 0.00111042, [45] [expand_dump_flag]: 1.77999e-06 [switch_simplify]: 1.095e-05 [loop_unroll]: 8.80001e-06 [a_1]: 0.00022779 [with_stream_mark]: 1.835e-05 [recompute_prepare]: 8.94e-06 [updatestate_depend_eliminate]: 5.52001e-06 [updatestate_assign_eliminate]: 4.05e-06 [updatestate_loads_eliminate]: 4.15e-06 [parameter_eliminate]: 1.44e-06 [a_2]: 0.00013942 [accelerated_algorithm]: 9.00001e-06 [shard]: 2.24999e-06 [meta_shard_fg_expand]: 2.94999e-06 [shard_inline]: 9.45001e-06 [merge_send_recv]: 8.60001e-06 [auto_parallel]: 8.60999e-06 [parallel]: 6.85002e-06 [flash_sp]: 4.94e-06 [merge_comm]: 5.23002e-06 [allreduce_fusion]: 4.65999e-06 [matmul_add_comm_reduction]: 9.81e-06 [allreduce_slice_to_reducescatter]: 8.50006e-07 [virtual_shard_identity]: 1.215e-05 [virtual_dataset]: 8.99e-06 [get_grad_eliminate_]: 9.62001e-06 [virtual_output]: 8.68001e-06 [merge_forward]: 5.35999e-06 [cell_reuse_recompute_pass]: 2.44001e-06 [offload_activation]: 1.111e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.015e-05 [merge_recompute_call_nodes]: 9.5999e-07 [before_grad]: 1.56e-05 [set_forward_comm_id_for_comm_node_pass]: 5.42999e-06 [meta_fg_expand]: 4.47e-06 [flash_sp_send_recv_attached]: 1.19e-06 [receive_attached]: 1.76e-06 [after_resolve]: 1.548e-05 [a_after_grad]: 1.355e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.59998e-06 [auto_monad_grad]: 1.86e-06 [auto_monad_eliminator]: 1.322e-05 [cse]: 2.877e-05 [a_3]: 6.918e-05 [py_interpret_to_execute_after_opt_a]: 2.063e-05 [slice_cell_reuse_recomputed_activation]: 5.47999e-06 [rewriter_after_opt_a]: 5.357e-05 [convert_after_rewriter]: 1.211e-05 [order_py_execute_after_rewriter]: 1.005e-05 [mutable_eliminate]: 0.00077169 [opt_b]: 0.00037753, [1] [Cycle 1]: 0.00036593, [7] [b_1]: 0.00023838 [b_2]: 1.12e-05 [updatestate_depend_eliminate]: 9.56e-06 [updatestate_assign_eliminate]: 4.08999e-06 [updatestate_loads_eliminate]: 4.03999e-06 [renormalize]: 6.50005e-07 [cse]: 4.061e-05 [optimize_parallel_all_gather_comm]: 2.611e-05 [overlap_param_gather]: 5.38002e-06 [cconv]: 3.995e-05 [loop_unroll]: 0.00064643 [opt_after_cconv]: 0.00017604, [1] [Cycle 1]: 0.00016481, [7] [c_1]: 4.604e-05 [parameter_eliminate]: 4.23999e-06 [updatestate_depend_eliminate]: 8.59e-06 [updatestate_assign_eliminate]: 4.21001e-06 [updatestate_loads_eliminate]: 4.16001e-06 [cse]: 3.734e-05 [renormalize]: 5.8001e-07 [remove_dup_value]: 5.557e-05 [tuple_transform]: 0.00011858, [1] [Cycle 1]: 0.0001108, [4] [d_1]: 6.75e-05 [none_parameter_eliminate]: 1.82001e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 1.03e-05 [partial_unused_args_eliminate]: 5.03002e-06 [add_recomputation]: 7.583e-05 [cse_after_recomputation]: 3.634e-05, [1] [Cycle 1]: 2.92e-05, [1] [cse]: 1.926e-05 [environ_conv]: 1.049e-05 [swap_dp_allreduce_reducescatter]: 1.011e-05 [bias_add_comm_swap]: 5.37999e-06 [label_micro_interleaved_index]: 8.43001e-06 [label_fine_grained_interleaved_index]: 5.24e-06 [merge_cast_opt]: 3.97e-06 [slice_recompute_activation]: 4.54002e-06 [micro_interleaved_order_control]: 5.00001e-06 [assign_add_opt]: 3.86001e-06 [ForceFp32Comm]: 3.73001e-06 [remove_cast_before_assign_add]: 3.65e-06 [full_micro_interleaved_order_control]: 4.51002e-06 [reorder_send_recv_between_fp_bp]: 5.04e-06 [comm_op_add_attrs]: 4.17e-06 [add_comm_op_reuse_tag]: 3.63999e-06 [interleave_split_concat_branches]: 3.75e-06 [interleave_parallel_branches]: 3.58999e-06 [overlap_opt_shard_in_pipeline]: 3.97998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.63001e-06 [control_data_broadcast_order]: 2.083e-05 [grouped_pairwise_exchange_alltoall]: 4.29002e-06 [offloading_packed_experts]: 8.13001e-06 [overlap_recompute_and_grad_model_parallel]: 9.05001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.6e-06 [overlap_recompute_allgather_and_fa_grad]: 3.63999e-06 [overlap_recompute_comm]: 4.97999e-06 [overlap_grad_ring_attention]: 7.53e-06 [overlap_grad_flash_sp]: 3.026e-05 [begin_end_overlap_inline]: 2.83e-06 [split_matmul_comm_elemetwise]: 4.68001e-06 [split_layernorm_comm]: 4.80001e-06 [handle_group_info]: 3.58999e-06 [symbol_engine_optimizer]: 0.00011952, [1] [Cycle 1]: 0.00011251, [6] [build]: 4.12e-06 [elim_shapecalc]: 1.39e-05 [elim_not_effective]: 1.837e-05 [opt_reshape]: 1.065e-05 [fold_const_symbol]: 1.566e-05 [renormalize]: 2.80008e-07 [detach_backward]: 3.83001e-06 [pipeline_parallel_scheduler]: 1.99999e-06 [auto_monad_reorder]: 2.687e-05 [get_jit_bprop_graph]: 1.80001e-06 [rewriter_after_jit_bprop_graph]: 6.48e-06 [opt_after_jit_grad]: 0.00070969 [validate]: 5.417e-05 Sums bootstrap : 0.000440s : 3.39% type_inference : 0.005779s : 44.63% event_method : 0.000022s : 0.17% auto_monad : 0.000072s : 0.56% graph_reusing : 0.000007s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000038s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000034s : 0.26% optimize.rewriter_before_opt_a : 0.000095s : 0.73% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.42% optimize.opt_a.loop_unroll : 0.000040s : 0.31% optimize.opt_a.a_1 : 0.000946s : 7.31% optimize.opt_a.with_stream_mark : 0.000036s : 0.28% optimize.opt_a.recompute_prepare : 0.000021s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.07% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000289s : 2.23% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.15% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000018s : 0.14% optimize.opt_a.merge_send_recv : 0.000020s : 0.15% optimize.opt_a.auto_parallel : 0.000017s : 0.13% optimize.opt_a.parallel : 0.000026s : 0.20% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000011s : 0.09% optimize.opt_a.allreduce_fusion : 0.000010s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.18% optimize.opt_a.virtual_dataset : 0.000018s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.14% optimize.opt_a.virtual_output : 0.000018s : 0.14% optimize.opt_a.merge_forward : 0.000011s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000023s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000040s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000032s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.09% optimize.opt_a.meta_fg_expand : 0.000009s : 0.07% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000030s : 0.23% optimize.opt_a.a_after_grad : 0.000028s : 0.22% optimize.opt_a.renormalize : 0.000956s : 7.38% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000036s : 0.28% optimize.opt_a.cse : 0.000076s : 0.58% optimize.opt_a.a_3 : 0.000159s : 1.23% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.16% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000054s : 0.41% optimize.convert_after_rewriter : 0.000012s : 0.09% optimize.order_py_execute_after_rewriter : 0.000010s : 0.08% optimize.mutable_eliminate : 0.000772s : 5.96% optimize.opt_b.b_1 : 0.000238s : 1.84% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000041s : 0.31% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000040s : 0.31% optimize.loop_unroll : 0.000646s : 4.99% optimize.opt_after_cconv.c_1 : 0.000046s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000037s : 0.29% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000056s : 0.43% optimize.tuple_transform.d_1 : 0.000068s : 0.52% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000076s : 0.59% optimize.cse_after_recomputation.cse : 0.000019s : 0.15% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000021s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.06% optimize.overlap_grad_flash_sp : 0.000030s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000005s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000027s : 0.21% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000710s : 5.48% validate : 0.000054s : 0.42% Time group info: ------[substitution.] 0.000232 48 15.50% : 0.000036s : 6: substitution.cast_eliminate 1.22% : 0.000003s : 4: substitution.elim_not_effective 1.03% : 0.000002s : 4: substitution.fold_const_symbol 3.35% : 0.000008s : 6: substitution.graph_param_transform 64.42% : 0.000149s : 4: substitution.inline 2.86% : 0.000007s : 8: substitution.j_node_and_user_rematch 3.39% : 0.000008s : 8: substitution.remove_not_recompute_node 2.81% : 0.000007s : 4: substitution.replace_old_param 5.43% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005722 2 86.97% : 0.004976s : 1: type_inference.infer 13.03% : 0.000745s : 1: type_inference.specialize ------[replace.] 0.000061 8 62.03% : 0.000038s : 4: replace.inline 37.97% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000157 8 93.12% : 0.000147s : 4: match.inline 6.88% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000291 1730 0.84% : 0.000002s : 17: predicate.accumulaten_eliminater 0.87% : 0.000003s : 6: predicate.ad_related_special_op_eliminate 0.55% : 0.000002s : 12: predicate.addn_check_dump 0.88% : 0.000003s : 17: predicate.addn_zero_filter 0.79% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.10% : 0.000006s : 29: predicate.arithmetic_simplify 1.16% : 0.000003s : 17: predicate.cast_eliminate 0.64% : 0.000002s : 12: predicate.check_bprop_eliminate 0.63% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.63% : 0.000002s : 12: predicate.depend_value_elim 0.95% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.05% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.81% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.01% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 6: predicate.elim_not_effective 0.41% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.31% : 0.000004s : 23: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.10% : 0.000003s : 23: predicate.environ_get_depend_swap 1.82% : 0.000005s : 35: predicate.environ_get_eliminate 1.08% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.26% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.13% : 0.000006s : 25: predicate.float_depend_g_call 0.58% : 0.000002s : 12: predicate.float_environ_get_switch 0.92% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.76% : 0.000002s : 12: predicate.get_grad_eliminate 0.20% : 0.000001s : 6: predicate.graph_param_transform 0.64% : 0.000002s : 12: predicate.incorporate_call 0.57% : 0.000002s : 12: predicate.incorporate_call_switch 6.53% : 0.000019s : 78: predicate.inline 0.91% : 0.000003s : 12: predicate.inline_without_move 0.34% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 12: predicate.less_batch_normalization 1.82% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.50% : 0.000007s : 50: predicate.load_eliminater 1.05% : 0.000003s : 6: predicate.loop_unroll_after_grad 1.88% : 0.000005s : 38: predicate.loop_unroll_before_grad 1.71% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.65% : 0.000002s : 12: predicate.merge_addn 0.70% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.65% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 17: predicate.minmaximum_grad 1.41% : 0.000004s : 6: predicate.mutable_eliminate 0.44% : 0.000001s : 6: predicate.opt_reshape 0.49% : 0.000001s : 6: predicate.parallel_virtual_node 1.51% : 0.000004s : 25: predicate.partial_defer_inline 1.56% : 0.000005s : 27: predicate.partial_eliminate 0.89% : 0.000003s : 17: predicate.print_const_string_wrapper 0.70% : 0.000002s : 12: predicate.reduce_all_const_elim 1.28% : 0.000004s : 17: predicate.reduce_eliminate 2.41% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 12: predicate.remove_not_recompute_node 1.25% : 0.000004s : 33: predicate.replace_applicator 0.53% : 0.000002s : 12: predicate.replace_old_param 0.31% : 0.000001s : 6: predicate.reset_defer_inline 0.94% : 0.000003s : 17: predicate.reshape_eliminate 0.69% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 6: predicate.row_tensor_eliminate 0.84% : 0.000002s : 12: predicate.same_eliminate 0.39% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.92% : 0.000003s : 12: predicate.shard_identity_eliminate 0.78% : 0.000002s : 12: predicate.special_op_eliminate 0.84% : 0.000002s : 12: predicate.specialize_transform 0.92% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 0.83% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.33% : 0.000004s : 25: predicate.switch_defer_inline 1.90% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.41% : 0.000013s : 81: predicate.switch_simplify 0.86% : 0.000003s : 17: predicate.tile_eliminate 0.90% : 0.000003s : 17: predicate.transpose_eliminate 1.75% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.25% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.55% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 2.49% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.77% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.30% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.06% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 6: predicate.value_based_eliminate 0.69% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.71% : 0.000002s : 12: predicate.virtual_output_eliminate 0.30% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000583 11 50.94% : 0.000297s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.06% : 0.000286s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.031223 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.52% : 0.003286s : 1: add_attr 10.47% : 0.003270s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.25% : 0.000079s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.26% : 0.000082s : 1: auto_monad 0.11% : 0.000035s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.55% : 0.000483s : 1: bootstrap 0.14% : 0.000043s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000024s : 1: control_data_broadcast_order 0.05% : 0.000015s : 1: convert_after_rewriter 0.13% : 0.000040s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000021s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.10% : 0.000032s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000014s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 2.09% : 0.000654s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 2.50% : 0.000780s : 1: mutable_eliminate 0.03% : 0.000011s : 1: offloading_packed_experts 0.07% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000026s : 1: opt.transform.mutable_eliminate 5.04% : 0.001574s : 78: opt.transform.opt_a 0.14% : 0.000045s : 1: opt.transform.opt_after_cconv 0.13% : 0.000040s : 1: opt.transform.opt_after_jit_grad 0.55% : 0.000171s : 28: opt.transform.opt_b 0.24% : 0.000075s : 2: opt.transform.opt_trans_graph 0.18% : 0.000055s : 4: opt.transform.symbol_engine_opt 12.50% : 0.003902s : 1: opt_a 0.58% : 0.000180s : 1: opt_after_cconv 2.31% : 0.000722s : 1: opt_after_jit_grad 1.22% : 0.000382s : 1: opt_b 23.77% : 0.007420s : 1: optimize 0.09% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.11% : 0.000034s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000045s : 1: pre_auto_parallel 0.12% : 0.000037s : 1: py_interpret_to_execute 0.08% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.19% : 0.000060s : 1: remove_dup_value 1.68% : 0.000526s : 1: renormalize.infer 1.35% : 0.000420s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000057s : 1: rewriter_after_opt_a 0.31% : 0.000098s : 1: rewriter_before_opt_a 0.03% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000122s : 1: symbol_engine_optimizer 0.39% : 0.000121s : 1: tuple_transform 18.64% : 0.005820s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:05.766.217 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0188698, [21] [bootstrap]: 0.00048986 [type_inference]: 0.00727533 [event_method]: 2.432e-05 [auto_monad]: 7.616e-05 [graph_reusing]: 7.01999e-06 [inline]: 2.48e-06 [add_attr]: 0.00383494, [1] [add_attr_with_inline]: 0.00382241, [1] [Cycle 1]: 7.537e-05, [2] [tag_attr]: 2.567e-05 [meta_addattr_fg_expand]: 6.36e-06 [parallel-infer-symbol]: 4.07e-06 [pre_auto_parallel]: 4.323e-05 [insert-virtual-dataset]: 2.53003e-06 [parallel-infer-symbol-second]: 9.39996e-07 [dataset_repeat_opt]: 1.96e-06 [pipeline_split]: 1.69e-06 [optimize]: 0.0062946, [53] [py_interpret_to_execute]: 3.263e-05 [rewriter_before_opt_a]: 0.00010055 [opt_a]: 0.0036341, [2] [Cycle 1]: 0.00268991, [45] [expand_dump_flag]: 3.07002e-06 [switch_simplify]: 4.804e-05 [loop_unroll]: 3.298e-05 [a_1]: 0.00075696 [with_stream_mark]: 2.309e-05 [recompute_prepare]: 1.434e-05 [updatestate_depend_eliminate]: 5.59e-06 [updatestate_assign_eliminate]: 4.17998e-06 [updatestate_loads_eliminate]: 4.27e-06 [parameter_eliminate]: 2.24001e-06 [a_2]: 0.00012249 [accelerated_algorithm]: 1.004e-05 [shard]: 2.43002e-06 [meta_shard_fg_expand]: 2.71e-06 [shard_inline]: 9.08002e-06 [merge_send_recv]: 1.024e-05 [auto_parallel]: 9.61e-06 [parallel]: 2.014e-05 [flash_sp]: 1.063e-05 [merge_comm]: 5.87999e-06 [allreduce_fusion]: 4.98001e-06 [matmul_add_comm_reduction]: 1.214e-05 [allreduce_slice_to_reducescatter]: 9.49978e-07 [virtual_shard_identity]: 1.423e-05 [virtual_dataset]: 9.39e-06 [get_grad_eliminate_]: 8.72e-06 [virtual_output]: 8.95999e-06 [merge_forward]: 5.87001e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 1.389e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.158e-05 [merge_recompute_call_nodes]: 1.83002e-06 [before_grad]: 1.579e-05 [set_forward_comm_id_for_comm_node_pass]: 5.44e-06 [meta_fg_expand]: 4.51002e-06 [flash_sp_send_recv_attached]: 2.99001e-06 [receive_attached]: 2.37999e-06 [after_resolve]: 1.558e-05 [a_after_grad]: 1.393e-05 [renormalize]: 0.0009585 [add_forward_monad_depend]: 7.38e-06 [auto_monad_grad]: 2.98998e-06 [auto_monad_eliminator]: 2.274e-05 [cse]: 4.623e-05 [a_3]: 7.126e-05 [Cycle 2]: 0.00093201, [45] [expand_dump_flag]: 1.59998e-06 [switch_simplify]: 1.061e-05 [loop_unroll]: 8.77999e-06 [a_1]: 0.0002191 [with_stream_mark]: 1.824e-05 [recompute_prepare]: 9.56e-06 [updatestate_depend_eliminate]: 9.04e-06 [updatestate_assign_eliminate]: 3.65e-06 [updatestate_loads_eliminate]: 3.49001e-06 [parameter_eliminate]: 1.83002e-06 [a_2]: 0.00011308 [accelerated_algorithm]: 9.20999e-06 [shard]: 2.93e-06 [meta_shard_fg_expand]: 2.67001e-06 [shard_inline]: 9.91e-06 [merge_send_recv]: 8.18999e-06 [auto_parallel]: 8.35999e-06 [parallel]: 5.97999e-06 [flash_sp]: 3.86001e-06 [merge_comm]: 5.22999e-06 [allreduce_fusion]: 5.19e-06 [matmul_add_comm_reduction]: 1.032e-05 [allreduce_slice_to_reducescatter]: 7.39994e-07 [virtual_shard_identity]: 1.072e-05 [virtual_dataset]: 8.85999e-06 [get_grad_eliminate_]: 9.99999e-06 [virtual_output]: 8.40001e-06 [merge_forward]: 5.44e-06 [cell_reuse_recompute_pass]: 1.72001e-06 [offload_activation]: 1.097e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.806e-05 [merge_recompute_call_nodes]: 1.48002e-06 [before_grad]: 1.443e-05 [set_forward_comm_id_for_comm_node_pass]: 5.61e-06 [meta_fg_expand]: 3.86001e-06 [flash_sp_send_recv_attached]: 1.10999e-06 [receive_attached]: 1.60001e-06 [after_resolve]: 1.345e-05 [a_after_grad]: 1.367e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.59001e-06 [auto_monad_grad]: 1.81998e-06 [auto_monad_eliminator]: 1.407e-05 [cse]: 2.957e-05 [a_3]: 5.6e-05 [py_interpret_to_execute_after_opt_a]: 1.627e-05 [slice_cell_reuse_recomputed_activation]: 1.91e-06 [rewriter_after_opt_a]: 5.276e-05 [convert_after_rewriter]: 8.17e-06 [order_py_execute_after_rewriter]: 6.63003e-06 [mutable_eliminate]: 0.00069992 [opt_b]: 0.00029459, [1] [Cycle 1]: 0.00028725, [7] [b_1]: 0.00018363 [b_2]: 1.095e-05 [updatestate_depend_eliminate]: 9.66998e-06 [updatestate_assign_eliminate]: 4.48999e-06 [updatestate_loads_eliminate]: 3.78999e-06 [renormalize]: 5.00004e-07 [cse]: 3.636e-05 [optimize_parallel_all_gather_comm]: 2.101e-05 [overlap_param_gather]: 2.32999e-06 [cconv]: 3.401e-05 [loop_unroll]: 0.00054078 [opt_after_cconv]: 0.00014596, [1] [Cycle 1]: 0.00013882, [7] [c_1]: 4.397e-05 [parameter_eliminate]: 6.01998e-06 [updatestate_depend_eliminate]: 8.97999e-06 [updatestate_assign_eliminate]: 4.1e-06 [updatestate_loads_eliminate]: 3.63999e-06 [cse]: 3.395e-05 [renormalize]: 6.29982e-07 [remove_dup_value]: 5.093e-05 [tuple_transform]: 0.00010186, [1] [Cycle 1]: 9.698e-05, [4] [d_1]: 6.497e-05 [none_parameter_eliminate]: 1.87999e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 9.96e-06 [partial_unused_args_eliminate]: 1.96e-06 [add_recomputation]: 6.713e-05 [cse_after_recomputation]: 2.896e-05, [1] [Cycle 1]: 2.36e-05, [1] [cse]: 1.774e-05 [environ_conv]: 7.55003e-06 [swap_dp_allreduce_reducescatter]: 6.66999e-06 [bias_add_comm_swap]: 2.93e-06 [label_micro_interleaved_index]: 5.57999e-06 [label_fine_grained_interleaved_index]: 2.59999e-06 [merge_cast_opt]: 1.30999e-06 [slice_recompute_activation]: 2.09999e-06 [micro_interleaved_order_control]: 2.17999e-06 [assign_add_opt]: 1.44998e-06 [ForceFp32Comm]: 1.11002e-06 [remove_cast_before_assign_add]: 1.27e-06 [full_micro_interleaved_order_control]: 2.22001e-06 [reorder_send_recv_between_fp_bp]: 2.88e-06 [comm_op_add_attrs]: 9.80013e-07 [add_comm_op_reuse_tag]: 1.01002e-06 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.29e-06 [overlap_opt_shard_grad_in_pipeline]: 1.79e-06 [control_data_broadcast_order]: 1.684e-05 [grouped_pairwise_exchange_alltoall]: 1.52001e-06 [offloading_packed_experts]: 5.04e-06 [overlap_recompute_and_grad_model_parallel]: 5.81e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32e-06 [overlap_recompute_comm]: 2.34001e-06 [overlap_grad_ring_attention]: 4.80001e-06 [overlap_grad_flash_sp]: 2.641e-05 [begin_end_overlap_inline]: 5.29981e-07 [split_matmul_comm_elemetwise]: 2.39001e-06 [split_layernorm_comm]: 1.94e-06 [handle_group_info]: 1.29998e-06 [symbol_engine_optimizer]: 9.895e-05, [1] [Cycle 1]: 9.414e-05, [6] [build]: 4.06001e-06 [elim_shapecalc]: 1.553e-05 [elim_not_effective]: 1.832e-05 [opt_reshape]: 1.062e-05 [fold_const_symbol]: 1.457e-05 [renormalize]: 3.7998e-07 [detach_backward]: 2.71999e-06 [pipeline_parallel_scheduler]: 1.67999e-06 [auto_monad_reorder]: 2.318e-05 [get_jit_bprop_graph]: 2.49999e-06 [rewriter_after_jit_bprop_graph]: 5.84e-06 [opt_after_jit_grad]: 0.00055608 [validate]: 5.288e-05 Sums bootstrap : 0.000490s : 3.51% type_inference : 0.007275s : 52.16% event_method : 0.000024s : 0.17% auto_monad : 0.000076s : 0.55% graph_reusing : 0.000007s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000043s : 0.31% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.23% optimize.rewriter_before_opt_a : 0.000101s : 0.72% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000059s : 0.42% optimize.opt_a.loop_unroll : 0.000042s : 0.30% optimize.opt_a.a_1 : 0.000976s : 7.00% optimize.opt_a.with_stream_mark : 0.000041s : 0.30% optimize.opt_a.recompute_prepare : 0.000024s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000015s : 0.10% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000236s : 1.69% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.14% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000019s : 0.14% optimize.opt_a.merge_send_recv : 0.000018s : 0.13% optimize.opt_a.auto_parallel : 0.000018s : 0.13% optimize.opt_a.parallel : 0.000026s : 0.19% optimize.opt_a.flash_sp : 0.000014s : 0.10% optimize.opt_a.merge_comm : 0.000011s : 0.08% optimize.opt_a.allreduce_fusion : 0.000010s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000025s : 0.18% optimize.opt_a.virtual_dataset : 0.000018s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.13% optimize.opt_a.virtual_output : 0.000017s : 0.12% optimize.opt_a.merge_forward : 0.000011s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000025s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000040s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000030s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.08% optimize.opt_a.meta_fg_expand : 0.000008s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.21% optimize.opt_a.a_after_grad : 0.000028s : 0.20% optimize.opt_a.renormalize : 0.000959s : 6.87% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000037s : 0.26% optimize.opt_a.cse : 0.000076s : 0.54% optimize.opt_a.a_3 : 0.000127s : 0.91% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000053s : 0.38% optimize.convert_after_rewriter : 0.000008s : 0.06% optimize.order_py_execute_after_rewriter : 0.000007s : 0.05% optimize.mutable_eliminate : 0.000700s : 5.02% optimize.opt_b.b_1 : 0.000184s : 1.32% optimize.opt_b.b_2 : 0.000011s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000036s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000034s : 0.24% optimize.loop_unroll : 0.000541s : 3.88% optimize.opt_after_cconv.c_1 : 0.000044s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000034s : 0.24% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000051s : 0.37% optimize.tuple_transform.d_1 : 0.000065s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000067s : 0.48% optimize.cse_after_recomputation.cse : 0.000018s : 0.13% optimize.environ_conv : 0.000008s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000006s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.03% optimize.overlap_grad_flash_sp : 0.000026s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000023s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.04% opt_after_jit_grad : 0.000556s : 3.99% validate : 0.000053s : 0.38% Time group info: ------[substitution.] 0.000250 48 15.25% : 0.000038s : 6: substitution.cast_eliminate 1.02% : 0.000003s : 4: substitution.elim_not_effective 0.86% : 0.000002s : 4: substitution.fold_const_symbol 3.15% : 0.000008s : 6: substitution.graph_param_transform 66.98% : 0.000168s : 4: substitution.inline 2.28% : 0.000006s : 8: substitution.j_node_and_user_rematch 3.03% : 0.000008s : 8: substitution.remove_not_recompute_node 2.05% : 0.000005s : 4: substitution.replace_old_param 5.38% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007189 2 87.13% : 0.006264s : 1: type_inference.infer 12.87% : 0.000925s : 1: type_inference.specialize ------[replace.] 0.000067 8 62.61% : 0.000042s : 4: replace.inline 37.39% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000176 8 93.39% : 0.000165s : 4: match.inline 6.61% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000288 1730 0.86% : 0.000002s : 17: predicate.accumulaten_eliminater 0.85% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.56% : 0.000002s : 12: predicate.addn_check_dump 0.97% : 0.000003s : 17: predicate.addn_zero_filter 0.81% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.19% : 0.000006s : 29: predicate.arithmetic_simplify 1.10% : 0.000003s : 17: predicate.cast_eliminate 0.68% : 0.000002s : 12: predicate.check_bprop_eliminate 0.66% : 0.000002s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.67% : 0.000002s : 12: predicate.depend_value_elim 0.88% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.97% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.05% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 6: predicate.elim_not_effective 0.48% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.21% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.07% : 0.000003s : 23: predicate.environ_get_depend_swap 1.71% : 0.000005s : 35: predicate.environ_get_eliminate 1.07% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.26% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.18% : 0.000006s : 25: predicate.float_depend_g_call 0.56% : 0.000002s : 12: predicate.float_environ_get_switch 0.82% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.79% : 0.000002s : 12: predicate.get_grad_eliminate 0.28% : 0.000001s : 6: predicate.graph_param_transform 0.67% : 0.000002s : 12: predicate.incorporate_call 0.57% : 0.000002s : 12: predicate.incorporate_call_switch 6.36% : 0.000018s : 78: predicate.inline 0.83% : 0.000002s : 12: predicate.inline_without_move 0.33% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.93% : 0.000003s : 12: predicate.less_batch_normalization 1.91% : 0.000006s : 33: predicate.list_to_tuple_eliminator_ 2.35% : 0.000007s : 50: predicate.load_eliminater 1.26% : 0.000004s : 6: predicate.loop_unroll_after_grad 2.13% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.66% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.62% : 0.000002s : 12: predicate.merge_addn 0.67% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.61% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 17: predicate.minmaximum_grad 1.60% : 0.000005s : 6: predicate.mutable_eliminate 0.46% : 0.000001s : 6: predicate.opt_reshape 0.38% : 0.000001s : 6: predicate.parallel_virtual_node 1.68% : 0.000005s : 25: predicate.partial_defer_inline 1.57% : 0.000005s : 27: predicate.partial_eliminate 0.86% : 0.000002s : 17: predicate.print_const_string_wrapper 0.60% : 0.000002s : 12: predicate.reduce_all_const_elim 1.06% : 0.000003s : 17: predicate.reduce_eliminate 2.52% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 12: predicate.remove_not_recompute_node 1.19% : 0.000003s : 33: predicate.replace_applicator 0.47% : 0.000001s : 12: predicate.replace_old_param 0.24% : 0.000001s : 6: predicate.reset_defer_inline 0.89% : 0.000003s : 17: predicate.reshape_eliminate 0.67% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 6: predicate.row_tensor_eliminate 0.97% : 0.000003s : 12: predicate.same_eliminate 0.56% : 0.000002s : 12: predicate.set_cell_output_no_recompute 1.05% : 0.000003s : 12: predicate.shard_identity_eliminate 0.76% : 0.000002s : 12: predicate.special_op_eliminate 0.77% : 0.000002s : 12: predicate.specialize_transform 0.81% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.31% : 0.000004s : 25: predicate.switch_defer_inline 1.85% : 0.000005s : 37: predicate.switch_layer_defer_inline 4.85% : 0.000014s : 81: predicate.switch_simplify 0.89% : 0.000003s : 17: predicate.tile_eliminate 1.05% : 0.000003s : 17: predicate.transpose_eliminate 1.52% : 0.000004s : 29: predicate.tuple_list_convert_item_index_to_positive 1.50% : 0.000004s : 29: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.44% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.66% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.38% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.04% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 6: predicate.value_based_eliminate 0.64% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.68% : 0.000002s : 12: predicate.virtual_output_eliminate 0.35% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000658 11 51.11% : 0.000336s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.89% : 0.000322s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.031828 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.07% : 0.003842s : 1: add_attr 12.02% : 0.003827s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000072s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.26% : 0.000082s : 1: auto_monad 0.09% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.63% : 0.000519s : 1: bootstrap 0.12% : 0.000038s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000020s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.10% : 0.000032s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000011s : 1: environ_conv 0.10% : 0.000031s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000006s : 1: get_jit_bprop_graph 0.03% : 0.000011s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.73% : 0.000550s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.23% : 0.000710s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000026s : 1: opt.transform.mutable_eliminate 5.04% : 0.001605s : 78: opt.transform.opt_a 0.13% : 0.000042s : 1: opt.transform.opt_after_cconv 0.12% : 0.000039s : 1: opt.transform.opt_after_jit_grad 0.51% : 0.000161s : 28: opt.transform.opt_b 0.23% : 0.000072s : 2: opt.transform.opt_trans_graph 0.17% : 0.000054s : 4: opt.transform.symbol_engine_opt 11.43% : 0.003637s : 1: opt_a 0.47% : 0.000150s : 1: opt_after_cconv 1.78% : 0.000566s : 1: opt_after_jit_grad 0.94% : 0.000298s : 1: opt_b 19.79% : 0.006300s : 1: optimize 0.08% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000010s : 1: order_py_execute_after_rewriter 0.09% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.15% : 0.000047s : 1: pre_auto_parallel 0.12% : 0.000037s : 1: py_interpret_to_execute 0.06% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.18% : 0.000056s : 1: remove_dup_value 1.72% : 0.000547s : 1: renormalize.infer 1.26% : 0.000401s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000057s : 1: rewriter_after_opt_a 0.33% : 0.000106s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000102s : 1: symbol_engine_optimizer 0.33% : 0.000105s : 1: tuple_transform 22.94% : 0.007300s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:06.103.598 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:06.103.899 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0181835, [21] [bootstrap]: 0.0004852 [type_inference]: 0.00667679 [event_method]: 2.026e-05 [auto_monad]: 6.41e-05 [graph_reusing]: 5.91e-06 [inline]: 2.14e-06 [add_attr]: 0.00338949, [1] [add_attr_with_inline]: 0.0033803, [1] [Cycle 1]: 7.38e-05, [2] [tag_attr]: 2.061e-05 [meta_addattr_fg_expand]: 5.99e-06 [parallel-infer-symbol]: 3.40998e-06 [pre_auto_parallel]: 3.429e-05 [insert-virtual-dataset]: 2.42001e-06 [parallel-infer-symbol-second]: 6.99976e-07 [dataset_repeat_opt]: 2.12999e-06 [pipeline_split]: 1.63997e-06 [optimize]: 0.005902, [53] [py_interpret_to_execute]: 3.006e-05 [rewriter_before_opt_a]: 9.44e-05 [opt_a]: 0.00342973, [2] [Cycle 1]: 0.00246388, [45] [expand_dump_flag]: 3.13e-06 [switch_simplify]: 4.446e-05 [loop_unroll]: 3.443e-05 [a_1]: 0.00070306 [with_stream_mark]: 1.796e-05 [recompute_prepare]: 1.162e-05 [updatestate_depend_eliminate]: 5.00001e-06 [updatestate_assign_eliminate]: 4.45e-06 [updatestate_loads_eliminate]: 3.6e-06 [parameter_eliminate]: 2.32999e-06 [a_2]: 0.00014176 [accelerated_algorithm]: 9.15999e-06 [shard]: 2.01e-06 [meta_shard_fg_expand]: 2.01e-06 [shard_inline]: 7.98001e-06 [merge_send_recv]: 1.065e-05 [auto_parallel]: 7.4e-06 [parallel]: 1.932e-05 [flash_sp]: 8.93002e-06 [merge_comm]: 5.39e-06 [allreduce_fusion]: 4.37998e-06 [matmul_add_comm_reduction]: 1.043e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 1.045e-05 [virtual_dataset]: 8.74e-06 [get_grad_eliminate_]: 7.71999e-06 [virtual_output]: 8.45999e-06 [merge_forward]: 4.74e-06 [cell_reuse_recompute_pass]: 1.40999e-06 [offload_activation]: 1.095e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.977e-05 [merge_recompute_call_nodes]: 1.59e-06 [before_grad]: 1.466e-05 [set_forward_comm_id_for_comm_node_pass]: 4.40999e-06 [meta_fg_expand]: 3.69002e-06 [flash_sp_send_recv_attached]: 2.66999e-06 [receive_attached]: 2.22001e-06 [after_resolve]: 1.366e-05 [a_after_grad]: 1.199e-05 [renormalize]: 0.00071514 [add_forward_monad_depend]: 5.76e-06 [auto_monad_grad]: 2.59999e-06 [auto_monad_eliminator]: 1.649e-05 [cse]: 3.648e-05 [a_3]: 0.00010246 [Cycle 2]: 0.00095003, [45] [expand_dump_flag]: 1.15999e-06 [switch_simplify]: 9.52001e-06 [loop_unroll]: 7.89002e-06 [a_1]: 0.00017839 [with_stream_mark]: 1.314e-05 [recompute_prepare]: 7.78001e-06 [updatestate_depend_eliminate]: 4.05e-06 [updatestate_assign_eliminate]: 3.01001e-06 [updatestate_loads_eliminate]: 3.2e-06 [parameter_eliminate]: 1.43002e-06 [a_2]: 0.00012163 [accelerated_algorithm]: 7.51999e-06 [shard]: 1.82001e-06 [meta_shard_fg_expand]: 1.54998e-06 [shard_inline]: 7.6e-06 [merge_send_recv]: 6.63e-06 [auto_parallel]: 1.024e-05 [parallel]: 4.64998e-06 [flash_sp]: 4.13999e-06 [merge_comm]: 4.33001e-06 [allreduce_fusion]: 4.08001e-06 [matmul_add_comm_reduction]: 7.17002e-06 [allreduce_slice_to_reducescatter]: 4.80009e-07 [virtual_shard_identity]: 8.45001e-06 [virtual_dataset]: 7.48e-06 [get_grad_eliminate_]: 7.60998e-06 [virtual_output]: 6.95998e-06 [merge_forward]: 3.35e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 9.70002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.712e-05 [merge_recompute_call_nodes]: 7.99977e-07 [before_grad]: 1.286e-05 [set_forward_comm_id_for_comm_node_pass]: 4.50001e-06 [meta_fg_expand]: 2.94001e-06 [flash_sp_send_recv_attached]: 8.70001e-07 [receive_attached]: 1.19e-06 [after_resolve]: 1.182e-05 [a_after_grad]: 1.154e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.39998e-06 [auto_monad_grad]: 1.42e-06 [auto_monad_eliminator]: 9.82999e-06 [cse]: 1.976e-05 [a_3]: 5.887e-05 [py_interpret_to_execute_after_opt_a]: 1.537e-05 [slice_cell_reuse_recomputed_activation]: 5.05001e-06 [rewriter_after_opt_a]: 4.609e-05 [convert_after_rewriter]: 1.019e-05 [order_py_execute_after_rewriter]: 8.55999e-06 [mutable_eliminate]: 0.00055147 [opt_b]: 0.0003126, [1] [Cycle 1]: 0.00030356, [7] [b_1]: 0.00019955 [b_2]: 9.36002e-06 [updatestate_depend_eliminate]: 6.84001e-06 [updatestate_assign_eliminate]: 3.21999e-06 [updatestate_loads_eliminate]: 3.16001e-06 [renormalize]: 4.59986e-07 [cse]: 2.38e-05 [optimize_parallel_all_gather_comm]: 2.229e-05 [overlap_param_gather]: 5.21002e-06 [cconv]: 2.97e-05 [loop_unroll]: 0.00047091 [opt_after_cconv]: 0.00014091, [1] [Cycle 1]: 0.00013192, [7] [c_1]: 3.698e-05 [parameter_eliminate]: 3.6e-06 [updatestate_depend_eliminate]: 7.38999e-06 [updatestate_assign_eliminate]: 3.22002e-06 [updatestate_loads_eliminate]: 3.02002e-06 [cse]: 2.242e-05 [renormalize]: 4.50003e-07 [remove_dup_value]: 1.943e-05 [tuple_transform]: 0.00010179, [1] [Cycle 1]: 9.472e-05, [4] [d_1]: 5.378e-05 [none_parameter_eliminate]: 1.50999e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 8.38001e-06 [partial_unused_args_eliminate]: 4.56002e-06 [add_recomputation]: 6.188e-05 [cse_after_recomputation]: 3.21e-05, [1] [Cycle 1]: 2.491e-05, [1] [cse]: 1.566e-05 [environ_conv]: 1.001e-05 [swap_dp_allreduce_reducescatter]: 8.72e-06 [bias_add_comm_swap]: 5.49e-06 [label_micro_interleaved_index]: 7.90998e-06 [label_fine_grained_interleaved_index]: 5.32001e-06 [merge_cast_opt]: 3.98001e-06 [slice_recompute_activation]: 4.84e-06 [micro_interleaved_order_control]: 4.79e-06 [assign_add_opt]: 4.12e-06 [ForceFp32Comm]: 3.76001e-06 [remove_cast_before_assign_add]: 3.4e-06 [full_micro_interleaved_order_control]: 5.06002e-06 [reorder_send_recv_between_fp_bp]: 5.40001e-06 [comm_op_add_attrs]: 3.53e-06 [add_comm_op_reuse_tag]: 3.28e-06 [interleave_split_concat_branches]: 3.9e-06 [interleave_parallel_branches]: 3.45998e-06 [overlap_opt_shard_in_pipeline]: 3.66999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.43001e-06 [control_data_broadcast_order]: 1.755e-05 [grouped_pairwise_exchange_alltoall]: 4.25e-06 [offloading_packed_experts]: 7.21001e-06 [overlap_recompute_and_grad_model_parallel]: 8.45999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.63999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.62002e-06 [overlap_recompute_comm]: 4.60999e-06 [overlap_grad_ring_attention]: 7e-06 [overlap_grad_flash_sp]: 2.458e-05 [begin_end_overlap_inline]: 2.92002e-06 [split_matmul_comm_elemetwise]: 5.07e-06 [split_layernorm_comm]: 4.17e-06 [handle_group_info]: 3.93001e-06 [symbol_engine_optimizer]: 0.00010483, [1] [Cycle 1]: 9.823e-05, [6] [build]: 3.49001e-06 [elim_shapecalc]: 1.129e-05 [elim_not_effective]: 1.562e-05 [opt_reshape]: 8.52e-06 [fold_const_symbol]: 1.305e-05 [renormalize]: 3.19997e-07 [detach_backward]: 3.37002e-06 [pipeline_parallel_scheduler]: 1.81998e-06 [auto_monad_reorder]: 2.518e-05 [get_jit_bprop_graph]: 1.59e-06 [rewriter_after_jit_bprop_graph]: 5.13002e-06 [opt_after_jit_grad]: 0.00061687 [validate]: 4.896e-05 Sums bootstrap : 0.000485s : 3.81% type_inference : 0.006677s : 52.46% event_method : 0.000020s : 0.16% auto_monad : 0.000064s : 0.50% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.27% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.24% optimize.rewriter_before_opt_a : 0.000094s : 0.74% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000054s : 0.42% optimize.opt_a.loop_unroll : 0.000042s : 0.33% optimize.opt_a.a_1 : 0.000881s : 6.92% optimize.opt_a.with_stream_mark : 0.000031s : 0.24% optimize.opt_a.recompute_prepare : 0.000019s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000263s : 2.07% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.12% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000018s : 0.14% optimize.opt_a.parallel : 0.000024s : 0.19% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.15% optimize.opt_a.virtual_dataset : 0.000016s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.12% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000021s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000028s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.20% optimize.opt_a.a_after_grad : 0.000024s : 0.18% optimize.opt_a.renormalize : 0.000715s : 5.62% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.21% optimize.opt_a.cse : 0.000056s : 0.44% optimize.opt_a.a_3 : 0.000161s : 1.27% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000046s : 0.36% optimize.convert_after_rewriter : 0.000010s : 0.08% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000551s : 4.33% optimize.opt_b.b_1 : 0.000200s : 1.57% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000030s : 0.23% optimize.loop_unroll : 0.000471s : 3.70% optimize.opt_after_cconv.c_1 : 0.000037s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000022s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.15% optimize.tuple_transform.d_1 : 0.000054s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000062s : 0.49% optimize.cse_after_recomputation.cse : 0.000016s : 0.12% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000008s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000018s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000025s : 0.19% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000025s : 0.20% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000617s : 4.85% validate : 0.000049s : 0.38% Time group info: ------[substitution.] 0.000212 38 13.06% : 0.000028s : 3: substitution.cast_eliminate 1.09% : 0.000002s : 3: substitution.elim_not_effective 0.80% : 0.000002s : 3: substitution.fold_const_symbol 3.18% : 0.000007s : 5: substitution.graph_param_transform 68.00% : 0.000144s : 4: substitution.inline 2.33% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.23% : 0.000007s : 6: substitution.remove_not_recompute_node 2.24% : 0.000005s : 4: substitution.replace_old_param 6.08% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006619 2 88.84% : 0.005880s : 1: type_inference.infer 11.16% : 0.000738s : 1: type_inference.specialize ------[replace.] 0.000064 8 61.72% : 0.000040s : 4: replace.inline 38.28% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000152 8 92.91% : 0.000142s : 4: match.inline 7.09% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000249 1504 0.82% : 0.000002s : 15: predicate.accumulaten_eliminater 1.13% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.65% : 0.000002s : 10: predicate.addn_check_dump 0.92% : 0.000002s : 15: predicate.addn_zero_filter 0.80% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.28% : 0.000006s : 25: predicate.arithmetic_simplify 0.96% : 0.000002s : 15: predicate.cast_eliminate 0.64% : 0.000002s : 10: predicate.check_bprop_eliminate 0.60% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.64% : 0.000002s : 10: predicate.depend_value_elim 0.94% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.98% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.41% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.03% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.07% : 0.000003s : 20: predicate.environ_get_depend_swap 1.73% : 0.000004s : 30: predicate.environ_get_eliminate 1.07% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.39% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.40% : 0.000006s : 23: predicate.float_depend_g_call 0.59% : 0.000001s : 10: predicate.float_environ_get_switch 0.89% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.68% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.67% : 0.000002s : 10: predicate.incorporate_call 0.57% : 0.000001s : 10: predicate.incorporate_call_switch 6.19% : 0.000015s : 68: predicate.inline 0.81% : 0.000002s : 10: predicate.inline_without_move 0.36% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.77% : 0.000002s : 10: predicate.less_batch_normalization 1.83% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.58% : 0.000006s : 44: predicate.load_eliminater 0.97% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.22% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.63% : 0.000002s : 10: predicate.merge_addn 0.61% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.62% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 15: predicate.minmaximum_grad 1.08% : 0.000003s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.40% : 0.000001s : 5: predicate.parallel_virtual_node 1.70% : 0.000004s : 23: predicate.partial_defer_inline 1.59% : 0.000004s : 24: predicate.partial_eliminate 0.91% : 0.000002s : 15: predicate.print_const_string_wrapper 0.82% : 0.000002s : 10: predicate.reduce_all_const_elim 1.09% : 0.000003s : 15: predicate.reduce_eliminate 2.53% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 10: predicate.remove_not_recompute_node 1.24% : 0.000003s : 29: predicate.replace_applicator 0.48% : 0.000001s : 10: predicate.replace_old_param 0.25% : 0.000001s : 5: predicate.reset_defer_inline 0.92% : 0.000002s : 15: predicate.reshape_eliminate 0.65% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 5: predicate.row_tensor_eliminate 0.73% : 0.000002s : 10: predicate.same_eliminate 0.41% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.82% : 0.000002s : 10: predicate.shard_identity_eliminate 0.66% : 0.000002s : 10: predicate.special_op_eliminate 0.79% : 0.000002s : 10: predicate.specialize_transform 1.13% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.46% : 0.000004s : 23: predicate.switch_defer_inline 2.04% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.96% : 0.000012s : 74: predicate.switch_simplify 0.86% : 0.000002s : 15: predicate.tile_eliminate 0.99% : 0.000002s : 15: predicate.transpose_eliminate 1.48% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.52% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.79% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.40% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.21% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 5: predicate.value_based_eliminate 0.69% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.72% : 0.000002s : 10: predicate.virtual_output_eliminate 0.31% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000648 11 61.80% : 0.000400s : 5: func_graph_cloner_run.FuncGraphClonerGraph 38.20% : 0.000247s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.029845 192 0.02% : 0.000007s : 1: ForceFp32Comm 11.39% : 0.003399s : 1: add_attr 11.34% : 0.003384s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000066s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.25% : 0.000074s : 1: auto_monad 0.11% : 0.000033s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.79% : 0.000534s : 1: bootstrap 0.11% : 0.000033s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000021s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.12% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000022s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.10% : 0.000031s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.60% : 0.000477s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.87% : 0.000558s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 4.87% : 0.001453s : 78: opt.transform.opt_a 0.12% : 0.000036s : 1: opt.transform.opt_after_cconv 0.12% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.46% : 0.000136s : 28: opt.transform.opt_b 0.20% : 0.000060s : 2: opt.transform.opt_trans_graph 0.15% : 0.000045s : 4: opt.transform.symbol_engine_opt 11.50% : 0.003433s : 1: opt_a 0.48% : 0.000145s : 1: opt_after_cconv 2.11% : 0.000630s : 1: opt_after_jit_grad 1.06% : 0.000316s : 1: opt_b 21.69% : 0.006473s : 1: optimize 0.09% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000042s : 1: pre_auto_parallel 0.11% : 0.000034s : 1: py_interpret_to_execute 0.06% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000023s : 1: remove_dup_value 1.32% : 0.000394s : 1: renormalize.infer 1.05% : 0.000313s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.20% : 0.000060s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000049s : 1: rewriter_after_opt_a 0.33% : 0.000099s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000108s : 1: symbol_engine_optimizer 0.35% : 0.000105s : 1: tuple_transform 22.52% : 0.006722s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:06.434.401 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0164856, [21] [bootstrap]: 0.00047373 [type_inference]: 0.00625333 [event_method]: 2.058e-05 [auto_monad]: 6.742e-05 [graph_reusing]: 5.78002e-06 [inline]: 2.61e-06 [add_attr]: 0.00346365, [1] [add_attr_with_inline]: 0.00345309, [1] [Cycle 1]: 7.17e-05, [2] [tag_attr]: 2.217e-05 [meta_addattr_fg_expand]: 5.79999e-06 [parallel-infer-symbol]: 4.07e-06 [pre_auto_parallel]: 3.769e-05 [insert-virtual-dataset]: 2.69999e-06 [parallel-infer-symbol-second]: 9.10019e-07 [dataset_repeat_opt]: 2.39001e-06 [pipeline_split]: 1.97001e-06 [optimize]: 0.00543237, [53] [py_interpret_to_execute]: 2.844e-05 [rewriter_before_opt_a]: 8.797e-05 [opt_a]: 0.00316994, [2] [Cycle 1]: 0.00234122, [45] [expand_dump_flag]: 3.06001e-06 [switch_simplify]: 4.488e-05 [loop_unroll]: 3.147e-05 [a_1]: 0.00068143 [with_stream_mark]: 1.884e-05 [recompute_prepare]: 1.131e-05 [updatestate_depend_eliminate]: 4.94e-06 [updatestate_assign_eliminate]: 3.75998e-06 [updatestate_loads_eliminate]: 3.66001e-06 [parameter_eliminate]: 2.29999e-06 [a_2]: 0.00010161 [accelerated_algorithm]: 8.67998e-06 [shard]: 1.72001e-06 [meta_shard_fg_expand]: 2.16e-06 [shard_inline]: 8.12e-06 [merge_send_recv]: 1.058e-05 [auto_parallel]: 7.67002e-06 [parallel]: 1.883e-05 [flash_sp]: 9.75002e-06 [merge_comm]: 4.77e-06 [allreduce_fusion]: 4.11001e-06 [matmul_add_comm_reduction]: 1.172e-05 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 1.046e-05 [virtual_dataset]: 8.28999e-06 [get_grad_eliminate_]: 7.56999e-06 [virtual_output]: 7.91001e-06 [merge_forward]: 4.48999e-06 [cell_reuse_recompute_pass]: 1.17e-06 [offload_activation]: 1.178e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.672e-05 [merge_recompute_call_nodes]: 1.86e-06 [before_grad]: 1.253e-05 [set_forward_comm_id_for_comm_node_pass]: 4.69002e-06 [meta_fg_expand]: 3.21999e-06 [flash_sp_send_recv_attached]: 2.83e-06 [receive_attached]: 2.04999e-06 [after_resolve]: 1.337e-05 [a_after_grad]: 1.217e-05 [renormalize]: 0.00083902 [add_forward_monad_depend]: 6.25002e-06 [auto_monad_grad]: 3.11001e-06 [auto_monad_eliminator]: 1.858e-05 [cse]: 3.761e-05 [a_3]: 6.009e-05 [Cycle 2]: 0.00081778, [45] [expand_dump_flag]: 1.40999e-06 [switch_simplify]: 9.24998e-06 [loop_unroll]: 7.23e-06 [a_1]: 0.00019087 [with_stream_mark]: 1.647e-05 [recompute_prepare]: 8.62998e-06 [updatestate_depend_eliminate]: 4.42003e-06 [updatestate_assign_eliminate]: 3.23e-06 [updatestate_loads_eliminate]: 3.20998e-06 [parameter_eliminate]: 1.44e-06 [a_2]: 9.448e-05 [accelerated_algorithm]: 7.82e-06 [shard]: 1.34998e-06 [meta_shard_fg_expand]: 1.86e-06 [shard_inline]: 7.40998e-06 [merge_send_recv]: 7.56999e-06 [auto_parallel]: 7.33e-06 [parallel]: 6.52001e-06 [flash_sp]: 3.44001e-06 [merge_comm]: 4.39998e-06 [allreduce_fusion]: 3.98999e-06 [matmul_add_comm_reduction]: 8.88002e-06 [allreduce_slice_to_reducescatter]: 5.10016e-07 [virtual_shard_identity]: 9.97999e-06 [virtual_dataset]: 7.26999e-06 [get_grad_eliminate_]: 7.46001e-06 [virtual_output]: 6.88e-06 [merge_forward]: 3.80998e-06 [cell_reuse_recompute_pass]: 1.60999e-06 [offload_activation]: 9.22999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.508e-05 [merge_recompute_call_nodes]: 1.22e-06 [before_grad]: 1.187e-05 [set_forward_comm_id_for_comm_node_pass]: 4.98001e-06 [meta_fg_expand]: 2.89001e-06 [flash_sp_send_recv_attached]: 1.20999e-06 [receive_attached]: 1.63002e-06 [after_resolve]: 1.346e-05 [a_after_grad]: 1.24e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.07001e-06 [auto_monad_grad]: 1.52999e-06 [auto_monad_eliminator]: 1.135e-05 [cse]: 2.095e-05 [a_3]: 4.683e-05 [py_interpret_to_execute_after_opt_a]: 1.226e-05 [slice_cell_reuse_recomputed_activation]: 1.86e-06 [rewriter_after_opt_a]: 4.322e-05 [convert_after_rewriter]: 7.82e-06 [order_py_execute_after_rewriter]: 5.67001e-06 [mutable_eliminate]: 0.00059446 [opt_b]: 0.00025052, [1] [Cycle 1]: 0.00024375, [7] [b_1]: 0.0001561 [b_2]: 9.86e-06 [updatestate_depend_eliminate]: 8.75001e-06 [updatestate_assign_eliminate]: 3.66001e-06 [updatestate_loads_eliminate]: 3.03e-06 [renormalize]: 9.20001e-07 [cse]: 2.535e-05 [optimize_parallel_all_gather_comm]: 1.952e-05 [overlap_param_gather]: 2.19001e-06 [cconv]: 2.867e-05 [loop_unroll]: 0.0004561 [opt_after_cconv]: 0.00011977, [1] [Cycle 1]: 0.00011364, [7] [c_1]: 3.773e-05 [parameter_eliminate]: 4.41002e-06 [updatestate_depend_eliminate]: 6.93e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 2.81999e-06 [cse]: 2.394e-05 [renormalize]: 5.8001e-07 [remove_dup_value]: 1.563e-05 [tuple_transform]: 8.712e-05, [1] [Cycle 1]: 8.229e-05, [4] [d_1]: 5.349e-05 [none_parameter_eliminate]: 1.73002e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 8.1e-06 [partial_unused_args_eliminate]: 2.11e-06 [add_recomputation]: 5.775e-05 [cse_after_recomputation]: 2.529e-05, [1] [Cycle 1]: 2.061e-05, [1] [cse]: 1.517e-05 [environ_conv]: 7.67998e-06 [swap_dp_allreduce_reducescatter]: 6.12001e-06 [bias_add_comm_swap]: 2.79999e-06 [label_micro_interleaved_index]: 4.15e-06 [label_fine_grained_interleaved_index]: 2.68998e-06 [merge_cast_opt]: 1.27e-06 [slice_recompute_activation]: 2.30002e-06 [micro_interleaved_order_control]: 2.34999e-06 [assign_add_opt]: 1.62001e-06 [ForceFp32Comm]: 9.30013e-07 [remove_cast_before_assign_add]: 9.89996e-07 [full_micro_interleaved_order_control]: 2.04e-06 [reorder_send_recv_between_fp_bp]: 2.69001e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 9.80013e-07 [interleave_split_concat_branches]: 1.09e-06 [interleave_parallel_branches]: 1.12999e-06 [overlap_opt_shard_in_pipeline]: 1.23002e-06 [overlap_opt_shard_grad_in_pipeline]: 1.84e-06 [control_data_broadcast_order]: 1.576e-05 [grouped_pairwise_exchange_alltoall]: 1.57001e-06 [offloading_packed_experts]: 4.68001e-06 [overlap_recompute_and_grad_model_parallel]: 5.49e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.53003e-06 [overlap_grad_ring_attention]: 4.75999e-06 [overlap_grad_flash_sp]: 2.229e-05 [begin_end_overlap_inline]: 6.10016e-07 [split_matmul_comm_elemetwise]: 2.32999e-06 [split_layernorm_comm]: 1.83002e-06 [handle_group_info]: 1.29998e-06 [symbol_engine_optimizer]: 8.388e-05, [1] [Cycle 1]: 7.929e-05, [6] [build]: 3.26001e-06 [elim_shapecalc]: 1.089e-05 [elim_not_effective]: 1.479e-05 [opt_reshape]: 9.05999e-06 [fold_const_symbol]: 1.245e-05 [renormalize]: 1.80007e-07 [detach_backward]: 2.39001e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 2.026e-05 [get_jit_bprop_graph]: 1.89999e-06 [rewriter_after_jit_bprop_graph]: 5.12999e-06 [opt_after_jit_grad]: 0.00048585 [validate]: 4.401e-05 Sums bootstrap : 0.000474s : 3.94% type_inference : 0.006253s : 51.98% event_method : 0.000021s : 0.17% auto_monad : 0.000067s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000038s : 0.31% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000028s : 0.24% optimize.rewriter_before_opt_a : 0.000088s : 0.73% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000054s : 0.45% optimize.opt_a.loop_unroll : 0.000039s : 0.32% optimize.opt_a.a_1 : 0.000872s : 7.25% optimize.opt_a.with_stream_mark : 0.000035s : 0.29% optimize.opt_a.recompute_prepare : 0.000020s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000196s : 1.63% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.13% optimize.opt_a.merge_send_recv : 0.000018s : 0.15% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.21% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.17% optimize.opt_a.virtual_dataset : 0.000016s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.12% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000021s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000024s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.22% optimize.opt_a.a_after_grad : 0.000025s : 0.20% optimize.opt_a.renormalize : 0.000839s : 6.97% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.25% optimize.opt_a.cse : 0.000059s : 0.49% optimize.opt_a.a_3 : 0.000107s : 0.89% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000043s : 0.36% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000594s : 4.94% optimize.opt_b.b_1 : 0.000156s : 1.30% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000025s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000029s : 0.24% optimize.loop_unroll : 0.000456s : 3.79% optimize.opt_after_cconv.c_1 : 0.000038s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000024s : 0.20% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.13% optimize.tuple_transform.d_1 : 0.000053s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000058s : 0.48% optimize.cse_after_recomputation.cse : 0.000015s : 0.13% optimize.environ_conv : 0.000008s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000022s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000486s : 4.04% validate : 0.000044s : 0.37% Time group info: ------[substitution.] 0.000213 38 11.64% : 0.000025s : 3: substitution.cast_eliminate 1.15% : 0.000002s : 3: substitution.elim_not_effective 0.78% : 0.000002s : 3: substitution.fold_const_symbol 3.22% : 0.000007s : 5: substitution.graph_param_transform 70.15% : 0.000150s : 4: substitution.inline 1.93% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.00% : 0.000006s : 6: substitution.remove_not_recompute_node 2.51% : 0.000005s : 4: substitution.replace_old_param 5.62% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006180 2 87.60% : 0.005414s : 1: type_inference.infer 12.40% : 0.000766s : 1: type_inference.specialize ------[replace.] 0.000063 8 62.69% : 0.000040s : 4: replace.inline 37.31% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000157 8 93.64% : 0.000147s : 4: match.inline 6.36% : 0.000010s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000241 1504 0.90% : 0.000002s : 15: predicate.accumulaten_eliminater 0.85% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.61% : 0.000001s : 10: predicate.addn_check_dump 0.91% : 0.000002s : 15: predicate.addn_zero_filter 0.80% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.02% : 0.000005s : 25: predicate.arithmetic_simplify 1.04% : 0.000003s : 15: predicate.cast_eliminate 0.62% : 0.000001s : 10: predicate.check_bprop_eliminate 0.60% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.63% : 0.000002s : 10: predicate.depend_value_elim 0.94% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.03% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.27% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 5: predicate.elim_not_effective 0.39% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.07% : 0.000003s : 20: predicate.environ_get_depend_swap 1.72% : 0.000004s : 30: predicate.environ_get_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.36% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.24% : 0.000005s : 23: predicate.float_depend_g_call 0.61% : 0.000001s : 10: predicate.float_environ_get_switch 0.85% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.21% : 0.000001s : 5: predicate.fold_const_symbol 0.64% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.63% : 0.000002s : 10: predicate.incorporate_call 0.56% : 0.000001s : 10: predicate.incorporate_call_switch 6.22% : 0.000015s : 68: predicate.inline 0.92% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 10: predicate.less_batch_normalization 1.72% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.60% : 0.000006s : 44: predicate.load_eliminater 1.02% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.09% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.60% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.60% : 0.000001s : 10: predicate.merge_addn 0.61% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 15: predicate.minmaximum_grad 1.21% : 0.000003s : 5: predicate.mutable_eliminate 0.43% : 0.000001s : 5: predicate.opt_reshape 0.36% : 0.000001s : 5: predicate.parallel_virtual_node 1.72% : 0.000004s : 23: predicate.partial_defer_inline 1.64% : 0.000004s : 24: predicate.partial_eliminate 0.85% : 0.000002s : 15: predicate.print_const_string_wrapper 0.62% : 0.000001s : 10: predicate.reduce_all_const_elim 1.27% : 0.000003s : 15: predicate.reduce_eliminate 2.45% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000001s : 10: predicate.remove_not_recompute_node 1.40% : 0.000003s : 29: predicate.replace_applicator 0.58% : 0.000001s : 10: predicate.replace_old_param 0.33% : 0.000001s : 5: predicate.reset_defer_inline 0.92% : 0.000002s : 15: predicate.reshape_eliminate 0.64% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.82% : 0.000002s : 10: predicate.same_eliminate 0.40% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 10: predicate.shard_identity_eliminate 0.82% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 0.85% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.83% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.49% : 0.000004s : 23: predicate.switch_defer_inline 1.94% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.01% : 0.000012s : 74: predicate.switch_simplify 0.88% : 0.000002s : 15: predicate.tile_eliminate 0.89% : 0.000002s : 15: predicate.transpose_eliminate 1.63% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.69% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.17% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.45% : 0.000003s : 25: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.73% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.39% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.21% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 5: predicate.value_based_eliminate 0.74% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000617 11 54.41% : 0.000336s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.59% : 0.000281s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027823 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.47% : 0.003469s : 1: add_attr 12.43% : 0.003457s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000062s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000073s : 1: auto_monad 0.09% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.80% : 0.000502s : 1: bootstrap 0.12% : 0.000032s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.10% : 0.000028s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.10% : 0.000028s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.67% : 0.000465s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.17% : 0.000604s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000020s : 1: opt.transform.mutable_eliminate 5.03% : 0.001398s : 78: opt.transform.opt_a 0.13% : 0.000036s : 1: opt.transform.opt_after_cconv 0.11% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.48% : 0.000134s : 28: opt.transform.opt_b 0.21% : 0.000059s : 2: opt.transform.opt_trans_graph 0.16% : 0.000043s : 4: opt.transform.symbol_engine_opt 11.40% : 0.003173s : 1: opt_a 0.44% : 0.000123s : 1: opt_after_cconv 1.78% : 0.000495s : 1: opt_after_jit_grad 0.92% : 0.000255s : 1: opt_b 19.54% : 0.005438s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000042s : 1: pre_auto_parallel 0.12% : 0.000033s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.71% : 0.000476s : 1: renormalize.infer 1.27% : 0.000354s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000047s : 1: rewriter_after_opt_a 0.33% : 0.000093s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.31% : 0.000087s : 1: symbol_engine_optimizer 0.32% : 0.000090s : 1: tuple_transform 22.54% : 0.006272s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:06.775.532 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:06.775.848 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.017148, [21] [bootstrap]: 0.00050108 [type_inference]: 0.00613281 [event_method]: 1.994e-05 [auto_monad]: 6.44e-05 [graph_reusing]: 5.68002e-06 [inline]: 1.97999e-06 [add_attr]: 0.00335209, [1] [add_attr_with_inline]: 0.00334111, [1] [Cycle 1]: 8.043e-05, [2] [tag_attr]: 2.054e-05 [meta_addattr_fg_expand]: 5.82001e-06 [parallel-infer-symbol]: 3.2e-06 [pre_auto_parallel]: 3.778e-05 [insert-virtual-dataset]: 2.41998e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 1.89e-06 [pipeline_split]: 1.96998e-06 [optimize]: 0.00580321, [53] [py_interpret_to_execute]: 2.996e-05 [rewriter_before_opt_a]: 8.908e-05 [opt_a]: 0.00333079, [2] [Cycle 1]: 0.00224781, [45] [expand_dump_flag]: 3.53999e-06 [switch_simplify]: 4.37e-05 [loop_unroll]: 3.073e-05 [a_1]: 0.00062423 [with_stream_mark]: 1.805e-05 [recompute_prepare]: 9.25001e-06 [updatestate_depend_eliminate]: 4.14002e-06 [updatestate_assign_eliminate]: 3.45e-06 [updatestate_loads_eliminate]: 2.98998e-06 [parameter_eliminate]: 1.99e-06 [a_2]: 0.0001099 [accelerated_algorithm]: 7.21001e-06 [shard]: 1.98002e-06 [meta_shard_fg_expand]: 2.54999e-06 [shard_inline]: 7.03e-06 [merge_send_recv]: 8.54e-06 [auto_parallel]: 6.60002e-06 [parallel]: 1.882e-05 [flash_sp]: 8.40001e-06 [merge_comm]: 4.02e-06 [allreduce_fusion]: 3.61001e-06 [matmul_add_comm_reduction]: 9.54999e-06 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 7.95e-06 [virtual_dataset]: 6.67002e-06 [get_grad_eliminate_]: 6.13998e-06 [virtual_output]: 6.38e-06 [merge_forward]: 3.80998e-06 [cell_reuse_recompute_pass]: 1.54e-06 [offload_activation]: 9.95002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.518e-05 [merge_recompute_call_nodes]: 1.60999e-06 [before_grad]: 1.079e-05 [set_forward_comm_id_for_comm_node_pass]: 3.4e-06 [meta_fg_expand]: 3.33e-06 [flash_sp_send_recv_attached]: 2.83998e-06 [receive_attached]: 1.86998e-06 [after_resolve]: 1.203e-05 [a_after_grad]: 1.055e-05 [renormalize]: 0.00070931 [add_forward_monad_depend]: 5.93002e-06 [auto_monad_grad]: 2.78998e-06 [auto_monad_eliminator]: 1.583e-05 [cse]: 2.866e-05 [a_3]: 6.344e-05 [Cycle 2]: 0.00106717, [45] [expand_dump_flag]: 1.37999e-06 [switch_simplify]: 7.92e-06 [loop_unroll]: 6.19001e-06 [a_1]: 0.00013088 [with_stream_mark]: 1.286e-05 [recompute_prepare]: 6.45002e-06 [updatestate_depend_eliminate]: 3.61999e-06 [updatestate_assign_eliminate]: 2.61999e-06 [updatestate_loads_eliminate]: 2.19001e-06 [parameter_eliminate]: 1.40999e-06 [a_2]: 0.00010007 [accelerated_algorithm]: 6.28e-06 [shard]: 1.26997e-06 [meta_shard_fg_expand]: 1.42999e-06 [shard_inline]: 6.32001e-06 [merge_send_recv]: 4.95999e-06 [auto_parallel]: 6.14001e-06 [parallel]: 5.43002e-06 [flash_sp]: 3.58999e-06 [merge_comm]: 3.43e-06 [allreduce_fusion]: 9.57999e-06 [matmul_add_comm_reduction]: 6.81001e-06 [allreduce_slice_to_reducescatter]: 1.05999e-06 [virtual_shard_identity]: 7.14001e-06 [virtual_dataset]: 6.06003e-06 [get_grad_eliminate_]: 5.87999e-06 [virtual_output]: 5.83002e-06 [merge_forward]: 3.32002e-06 [cell_reuse_recompute_pass]: 1.31002e-06 [offload_activation]: 7.11001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.564e-05 [merge_recompute_call_nodes]: 8.2e-07 [before_grad]: 1.03e-05 [set_forward_comm_id_for_comm_node_pass]: 3.5e-06 [meta_fg_expand]: 2.24001e-06 [flash_sp_send_recv_attached]: 9.50007e-07 [receive_attached]: 1.69e-06 [after_resolve]: 1.187e-05 [a_after_grad]: 1.042e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.60999e-06 [auto_monad_grad]: 1.09e-06 [auto_monad_eliminator]: 7.93001e-06 [cse]: 1.488e-05 [a_3]: 0.00026979 [py_interpret_to_execute_after_opt_a]: 1.671e-05 [slice_cell_reuse_recomputed_activation]: 5.07999e-06 [rewriter_after_opt_a]: 4.529e-05 [convert_after_rewriter]: 9.92001e-06 [order_py_execute_after_rewriter]: 7.98999e-06 [mutable_eliminate]: 0.00062276 [opt_b]: 0.00027611, [1] [Cycle 1]: 0.00026623, [7] [b_1]: 0.00017126 [b_2]: 8.50999e-06 [updatestate_depend_eliminate]: 6.01e-06 [updatestate_assign_eliminate]: 2.43e-06 [updatestate_loads_eliminate]: 2.31e-06 [renormalize]: 5.39992e-07 [cse]: 1.993e-05 [optimize_parallel_all_gather_comm]: 2.06e-05 [overlap_param_gather]: 5.31002e-06 [cconv]: 3.291e-05 [loop_unroll]: 0.00048841 [opt_after_cconv]: 0.00013262, [1] [Cycle 1]: 0.00012242, [7] [c_1]: 3.271e-05 [parameter_eliminate]: 3.73999e-06 [updatestate_depend_eliminate]: 6.06e-06 [updatestate_assign_eliminate]: 2.71999e-06 [updatestate_loads_eliminate]: 2.53998e-06 [cse]: 1.806e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.769e-05 [tuple_transform]: 9.286e-05, [1] [Cycle 1]: 8.539e-05, [4] [d_1]: 4.506e-05 [none_parameter_eliminate]: 1.63002e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 7.51999e-06 [partial_unused_args_eliminate]: 4.61002e-06 [add_recomputation]: 5.145e-05 [cse_after_recomputation]: 2.825e-05, [1] [Cycle 1]: 2.112e-05, [1] [cse]: 1.217e-05 [environ_conv]: 8.55001e-06 [swap_dp_allreduce_reducescatter]: 7.87e-06 [bias_add_comm_swap]: 5.13002e-06 [label_micro_interleaved_index]: 7.3e-06 [label_fine_grained_interleaved_index]: 5.52001e-06 [merge_cast_opt]: 4.10998e-06 [slice_recompute_activation]: 4.67998e-06 [micro_interleaved_order_control]: 4.83001e-06 [assign_add_opt]: 3.85e-06 [ForceFp32Comm]: 3.17002e-06 [remove_cast_before_assign_add]: 3.41001e-06 [full_micro_interleaved_order_control]: 4.75999e-06 [reorder_send_recv_between_fp_bp]: 5.10001e-06 [comm_op_add_attrs]: 3.63e-06 [add_comm_op_reuse_tag]: 3.23e-06 [interleave_split_concat_branches]: 3.46999e-06 [interleave_parallel_branches]: 3.5e-06 [overlap_opt_shard_in_pipeline]: 3.91001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.42e-06 [control_data_broadcast_order]: 1.5e-05 [grouped_pairwise_exchange_alltoall]: 4.15e-06 [offloading_packed_experts]: 6.53e-06 [overlap_recompute_and_grad_model_parallel]: 7.31001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.54002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.7e-06 [overlap_recompute_comm]: 4.89e-06 [overlap_grad_ring_attention]: 6.93998e-06 [overlap_grad_flash_sp]: 2.195e-05 [begin_end_overlap_inline]: 2.99999e-06 [split_matmul_comm_elemetwise]: 4.62e-06 [split_layernorm_comm]: 4.16001e-06 [handle_group_info]: 3.36999e-06 [symbol_engine_optimizer]: 9.809e-05, [1] [Cycle 1]: 9.112e-05, [6] [build]: 4.02e-06 [elim_shapecalc]: 1.052e-05 [elim_not_effective]: 1.319e-05 [opt_reshape]: 7.23e-06 [fold_const_symbol]: 1.033e-05 [renormalize]: 2.29978e-07 [detach_backward]: 4.16001e-06 [pipeline_parallel_scheduler]: 1.89e-06 [auto_monad_reorder]: 1.892e-05 [get_jit_bprop_graph]: 1.48002e-06 [rewriter_after_jit_bprop_graph]: 5.22999e-06 [opt_after_jit_grad]: 0.00054638 [validate]: 4.086e-05 Sums bootstrap : 0.000501s : 4.17% type_inference : 0.006133s : 51.03% event_method : 0.000020s : 0.17% auto_monad : 0.000064s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000038s : 0.31% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000030s : 0.25% optimize.rewriter_before_opt_a : 0.000089s : 0.74% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000052s : 0.43% optimize.opt_a.loop_unroll : 0.000037s : 0.31% optimize.opt_a.a_1 : 0.000755s : 6.28% optimize.opt_a.with_stream_mark : 0.000031s : 0.26% optimize.opt_a.recompute_prepare : 0.000016s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000210s : 1.75% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.11% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.11% optimize.opt_a.merge_send_recv : 0.000013s : 0.11% optimize.opt_a.auto_parallel : 0.000013s : 0.11% optimize.opt_a.parallel : 0.000024s : 0.20% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000007s : 0.06% optimize.opt_a.allreduce_fusion : 0.000013s : 0.11% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.02% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.13% optimize.opt_a.virtual_dataset : 0.000013s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.10% optimize.opt_a.virtual_output : 0.000012s : 0.10% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.20% optimize.opt_a.a_after_grad : 0.000021s : 0.17% optimize.opt_a.renormalize : 0.000709s : 5.90% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.20% optimize.opt_a.cse : 0.000044s : 0.36% optimize.opt_a.a_3 : 0.000333s : 2.77% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000045s : 0.38% optimize.convert_after_rewriter : 0.000010s : 0.08% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000623s : 5.18% optimize.opt_b.b_1 : 0.000171s : 1.42% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000020s : 0.17% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000033s : 0.27% optimize.loop_unroll : 0.000488s : 4.06% optimize.opt_after_cconv.c_1 : 0.000033s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.15% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.15% optimize.tuple_transform.d_1 : 0.000045s : 0.37% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000051s : 0.43% optimize.cse_after_recomputation.cse : 0.000012s : 0.10% optimize.environ_conv : 0.000009s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000015s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000022s : 0.18% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000019s : 0.16% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000546s : 4.55% validate : 0.000041s : 0.34% Time group info: ------[substitution.] 0.000183 28 1.13% : 0.000002s : 2: substitution.elim_not_effective 0.81% : 0.000001s : 2: substitution.fold_const_symbol 3.54% : 0.000006s : 4: substitution.graph_param_transform 78.73% : 0.000144s : 4: substitution.inline 2.16% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.83% : 0.000005s : 4: substitution.remove_not_recompute_node 2.77% : 0.000005s : 4: substitution.replace_old_param 8.02% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006077 2 88.40% : 0.005372s : 1: type_inference.infer 11.60% : 0.000705s : 1: type_inference.specialize ------[replace.] 0.000060 8 61.11% : 0.000037s : 4: replace.inline 38.89% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000154 8 91.70% : 0.000141s : 4: match.inline 8.30% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000212 1278 1.04% : 0.000002s : 13: predicate.accumulaten_eliminater 0.75% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 1.18% : 0.000003s : 13: predicate.addn_zero_filter 0.80% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.13% : 0.000005s : 21: predicate.arithmetic_simplify 0.96% : 0.000002s : 13: predicate.cast_eliminate 1.31% : 0.000003s : 8: predicate.check_bprop_eliminate 0.53% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.54% : 0.000001s : 8: predicate.depend_value_elim 0.99% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.12% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.12% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.59% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.11% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.19% : 0.000003s : 17: predicate.environ_get_depend_swap 1.74% : 0.000004s : 25: predicate.environ_get_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.38% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.53% : 0.000005s : 21: predicate.float_depend_g_call 0.48% : 0.000001s : 8: predicate.float_environ_get_switch 1.03% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.60% : 0.000001s : 8: predicate.get_grad_eliminate 0.17% : 0.000000s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 5.95% : 0.000013s : 58: predicate.inline 0.74% : 0.000002s : 8: predicate.inline_without_move 0.33% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.86% : 0.000002s : 8: predicate.less_batch_normalization 1.88% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.51% : 0.000005s : 38: predicate.load_eliminater 1.16% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.28% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.61% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 8: predicate.merge_addn 0.57% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.71% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 13: predicate.minmaximum_grad 1.01% : 0.000002s : 4: predicate.mutable_eliminate 0.47% : 0.000001s : 4: predicate.opt_reshape 0.34% : 0.000001s : 4: predicate.parallel_virtual_node 1.78% : 0.000004s : 21: predicate.partial_defer_inline 1.62% : 0.000003s : 21: predicate.partial_eliminate 1.10% : 0.000002s : 13: predicate.print_const_string_wrapper 0.57% : 0.000001s : 8: predicate.reduce_all_const_elim 1.11% : 0.000002s : 13: predicate.reduce_eliminate 2.44% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.60% : 0.000001s : 8: predicate.remove_not_recompute_node 1.40% : 0.000003s : 25: predicate.replace_applicator 0.39% : 0.000001s : 8: predicate.replace_old_param 0.32% : 0.000001s : 4: predicate.reset_defer_inline 0.87% : 0.000002s : 13: predicate.reshape_eliminate 0.65% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.71% : 0.000001s : 8: predicate.same_eliminate 0.41% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.73% : 0.000002s : 8: predicate.shard_identity_eliminate 0.70% : 0.000001s : 8: predicate.special_op_eliminate 0.65% : 0.000001s : 8: predicate.specialize_transform 0.93% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.94% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.48% : 0.000003s : 21: predicate.switch_defer_inline 2.30% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.95% : 0.000011s : 67: predicate.switch_simplify 1.03% : 0.000002s : 13: predicate.tile_eliminate 0.83% : 0.000002s : 13: predicate.transpose_eliminate 1.50% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.29% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.38% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.64% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.30% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.04% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 4: predicate.value_based_eliminate 0.56% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.48% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000582 11 55.34% : 0.000322s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.66% : 0.000260s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028552 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.77% : 0.003362s : 1: add_attr 11.72% : 0.003345s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.19% : 0.000055s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000073s : 1: auto_monad 0.09% : 0.000026s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.94% : 0.000553s : 1: bootstrap 0.13% : 0.000036s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.06% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.11% : 0.000031s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000019s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000031s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.73% : 0.000495s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.21% : 0.000630s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 4.90% : 0.001399s : 78: opt.transform.opt_a 0.11% : 0.000031s : 1: opt.transform.opt_after_cconv 0.09% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.37% : 0.000105s : 28: opt.transform.opt_b 0.18% : 0.000050s : 2: opt.transform.opt_trans_graph 0.13% : 0.000038s : 4: opt.transform.symbol_engine_opt 11.68% : 0.003334s : 1: opt_a 0.48% : 0.000136s : 1: opt_after_cconv 1.96% : 0.000558s : 1: opt_after_jit_grad 0.98% : 0.000280s : 1: opt_b 21.49% : 0.006137s : 1: optimize 0.08% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.16% : 0.000045s : 1: pre_auto_parallel 0.12% : 0.000034s : 1: py_interpret_to_execute 0.07% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000021s : 1: remove_dup_value 1.30% : 0.000371s : 1: renormalize.infer 1.16% : 0.000330s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000049s : 1: rewriter_after_opt_a 0.32% : 0.000093s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000101s : 1: symbol_engine_optimizer 0.34% : 0.000096s : 1: tuple_transform 21.64% : 0.006178s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:07.945.48 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0171655, [21] [bootstrap]: 0.00049835 [type_inference]: 0.00696837 [event_method]: 2.198e-05 [auto_monad]: 7.113e-05 [graph_reusing]: 6.36998e-06 [inline]: 2.77002e-06 [add_attr]: 0.00377671, [1] [add_attr_with_inline]: 0.00376604, [1] [Cycle 1]: 6.633e-05, [2] [tag_attr]: 2.18e-05 [meta_addattr_fg_expand]: 6.09001e-06 [parallel-infer-symbol]: 3.70998e-06 [pre_auto_parallel]: 3.949e-05 [insert-virtual-dataset]: 2.98998e-06 [parallel-infer-symbol-second]: 9.40025e-07 [dataset_repeat_opt]: 2.29001e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.00503987, [53] [py_interpret_to_execute]: 2.878e-05 [rewriter_before_opt_a]: 8.944e-05 [opt_a]: 0.00287407, [2] [Cycle 1]: 0.00218389, [45] [expand_dump_flag]: 3.24001e-06 [switch_simplify]: 4.548e-05 [loop_unroll]: 3.05e-05 [a_1]: 0.00065806 [with_stream_mark]: 1.935e-05 [recompute_prepare]: 9.71e-06 [updatestate_depend_eliminate]: 3.68e-06 [updatestate_assign_eliminate]: 3.26001e-06 [updatestate_loads_eliminate]: 3.25998e-06 [parameter_eliminate]: 1.79998e-06 [a_2]: 8.544e-05 [accelerated_algorithm]: 8.08999e-06 [shard]: 1.76998e-06 [meta_shard_fg_expand]: 1.92001e-06 [shard_inline]: 7.44002e-06 [merge_send_recv]: 9.12999e-06 [auto_parallel]: 6.84999e-06 [parallel]: 2.063e-05 [flash_sp]: 8.61997e-06 [merge_comm]: 3.78999e-06 [allreduce_fusion]: 3.43e-06 [matmul_add_comm_reduction]: 8.81002e-06 [allreduce_slice_to_reducescatter]: 9.99979e-07 [virtual_shard_identity]: 8.68001e-06 [virtual_dataset]: 6.93e-06 [get_grad_eliminate_]: 6.12999e-06 [virtual_output]: 6.21e-06 [merge_forward]: 3.88001e-06 [cell_reuse_recompute_pass]: 1.19e-06 [offload_activation]: 9.51e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.345e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.108e-05 [set_forward_comm_id_for_comm_node_pass]: 3.46001e-06 [meta_fg_expand]: 3.09001e-06 [flash_sp_send_recv_attached]: 2.99999e-06 [receive_attached]: 2.58e-06 [after_resolve]: 1.159e-05 [a_after_grad]: 1.053e-05 [renormalize]: 0.00075936 [add_forward_monad_depend]: 6.04001e-06 [auto_monad_grad]: 2.67001e-06 [auto_monad_eliminator]: 1.594e-05 [cse]: 3.167e-05 [a_3]: 4.958e-05 [Cycle 2]: 0.00067925, [45] [expand_dump_flag]: 2.03002e-06 [switch_simplify]: 8.03999e-06 [loop_unroll]: 6.23e-06 [a_1]: 0.00013268 [with_stream_mark]: 1.363e-05 [recompute_prepare]: 6.81999e-06 [updatestate_depend_eliminate]: 3.20002e-06 [updatestate_assign_eliminate]: 2.63e-06 [updatestate_loads_eliminate]: 2.69999e-06 [parameter_eliminate]: 1.22999e-06 [a_2]: 7.442e-05 [accelerated_algorithm]: 6.67002e-06 [shard]: 1.82001e-06 [meta_shard_fg_expand]: 1.49e-06 [shard_inline]: 6.31998e-06 [merge_send_recv]: 5.57999e-06 [auto_parallel]: 6.44999e-06 [parallel]: 5.46998e-06 [flash_sp]: 3.79002e-06 [merge_comm]: 3.55e-06 [allreduce_fusion]: 3.57002e-06 [matmul_add_comm_reduction]: 6.54999e-06 [allreduce_slice_to_reducescatter]: 5.79981e-07 [virtual_shard_identity]: 1.012e-05 [virtual_dataset]: 6.01998e-06 [get_grad_eliminate_]: 5.54e-06 [virtual_output]: 6.03998e-06 [merge_forward]: 3.53e-06 [cell_reuse_recompute_pass]: 1.74e-06 [offload_activation]: 7.28e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.448e-05 [merge_recompute_call_nodes]: 1.07998e-06 [before_grad]: 9.81e-06 [set_forward_comm_id_for_comm_node_pass]: 3.19001e-06 [meta_fg_expand]: 2.43002e-06 [flash_sp_send_recv_attached]: 8.80013e-07 [receive_attached]: 1.07998e-06 [after_resolve]: 1.085e-05 [a_after_grad]: 1.094e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.50999e-06 [auto_monad_grad]: 1.12999e-06 [auto_monad_eliminator]: 7.5e-06 [cse]: 1.472e-05 [a_3]: 3.823e-05 [py_interpret_to_execute_after_opt_a]: 1.123e-05 [slice_cell_reuse_recomputed_activation]: 2.54001e-06 [rewriter_after_opt_a]: 3.538e-05 [convert_after_rewriter]: 7.51001e-06 [order_py_execute_after_rewriter]: 5.04e-06 [mutable_eliminate]: 0.00059566 [opt_b]: 0.0002057, [1] [Cycle 1]: 0.00019868, [7] [b_1]: 0.0001245 [b_2]: 7.85e-06 [updatestate_depend_eliminate]: 5.89e-06 [updatestate_assign_eliminate]: 2.49999e-06 [updatestate_loads_eliminate]: 2.55002e-06 [renormalize]: 5.60016e-07 [cse]: 1.871e-05 [optimize_parallel_all_gather_comm]: 1.744e-05 [overlap_param_gather]: 2.07001e-06 [cconv]: 2.758e-05 [loop_unroll]: 0.00045568 [opt_after_cconv]: 0.00010387, [1] [Cycle 1]: 9.802e-05, [7] [c_1]: 3.105e-05 [parameter_eliminate]: 3.51999e-06 [updatestate_depend_eliminate]: 5.64e-06 [updatestate_assign_eliminate]: 2.41e-06 [updatestate_loads_eliminate]: 2.35002e-06 [cse]: 1.823e-05 [renormalize]: 4.2998e-07 [remove_dup_value]: 1.48e-05 [tuple_transform]: 7.718e-05, [1] [Cycle 1]: 7.253e-05, [4] [d_1]: 4.521e-05 [none_parameter_eliminate]: 1.72001e-06 [renormalize]: 4.10015e-07 [switch_simplify]: 6.76e-06 [partial_unused_args_eliminate]: 1.70001e-06 [add_recomputation]: 5.12e-05 [cse_after_recomputation]: 2.086e-05, [1] [Cycle 1]: 1.63e-05, [1] [cse]: 1.112e-05 [environ_conv]: 5.15999e-06 [swap_dp_allreduce_reducescatter]: 4.97999e-06 [bias_add_comm_swap]: 2.53998e-06 [label_micro_interleaved_index]: 4.72e-06 [label_fine_grained_interleaved_index]: 3.01001e-06 [merge_cast_opt]: 1.47001e-06 [slice_recompute_activation]: 2.60002e-06 [micro_interleaved_order_control]: 2.21e-06 [assign_add_opt]: 1.24e-06 [ForceFp32Comm]: 1.30999e-06 [remove_cast_before_assign_add]: 1.02e-06 [full_micro_interleaved_order_control]: 2.29001e-06 [reorder_send_recv_between_fp_bp]: 2.55002e-06 [comm_op_add_attrs]: 9.99979e-07 [add_comm_op_reuse_tag]: 9.29984e-07 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 9.70002e-07 [overlap_opt_shard_in_pipeline]: 1.20999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.81e-06 [control_data_broadcast_order]: 1.316e-05 [grouped_pairwise_exchange_alltoall]: 1.67999e-06 [offloading_packed_experts]: 3.85998e-06 [overlap_recompute_and_grad_model_parallel]: 4.95001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.52001e-06 [overlap_recompute_comm]: 2.12999e-06 [overlap_grad_ring_attention]: 4.07e-06 [overlap_grad_flash_sp]: 2.047e-05 [begin_end_overlap_inline]: 5.79981e-07 [split_matmul_comm_elemetwise]: 2.46998e-06 [split_layernorm_comm]: 2.00002e-06 [handle_group_info]: 1.04e-06 [symbol_engine_optimizer]: 7.822e-05, [1] [Cycle 1]: 7.37e-05, [6] [build]: 3.21999e-06 [elim_shapecalc]: 1.135e-05 [elim_not_effective]: 1.304e-05 [opt_reshape]: 6.94001e-06 [fold_const_symbol]: 1.02e-05 [renormalize]: 2.3999e-07 [detach_backward]: 2.08002e-06 [pipeline_parallel_scheduler]: 1.57001e-06 [auto_monad_reorder]: 1.637e-05 [get_jit_bprop_graph]: 1.78002e-06 [rewriter_after_jit_bprop_graph]: 4.59998e-06 [opt_after_jit_grad]: 0.00049248 [validate]: 4.122e-05 Sums bootstrap : 0.000498s : 4.03% type_inference : 0.006968s : 56.34% event_method : 0.000022s : 0.18% auto_monad : 0.000071s : 0.58% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000039s : 0.32% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.23% optimize.rewriter_before_opt_a : 0.000089s : 0.72% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000054s : 0.43% optimize.opt_a.loop_unroll : 0.000037s : 0.30% optimize.opt_a.a_1 : 0.000791s : 6.39% optimize.opt_a.with_stream_mark : 0.000033s : 0.27% optimize.opt_a.recompute_prepare : 0.000017s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000160s : 1.29% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.12% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.11% optimize.opt_a.merge_send_recv : 0.000015s : 0.12% optimize.opt_a.auto_parallel : 0.000013s : 0.11% optimize.opt_a.parallel : 0.000026s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000007s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.12% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.15% optimize.opt_a.virtual_dataset : 0.000013s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.09% optimize.opt_a.virtual_output : 0.000012s : 0.10% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.23% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.17% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.05% optimize.opt_a.meta_fg_expand : 0.000006s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000022s : 0.18% optimize.opt_a.a_after_grad : 0.000021s : 0.17% optimize.opt_a.renormalize : 0.000759s : 6.14% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.19% optimize.opt_a.cse : 0.000046s : 0.38% optimize.opt_a.a_3 : 0.000088s : 0.71% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.02% optimize.rewriter_after_opt_a : 0.000035s : 0.29% optimize.convert_after_rewriter : 0.000008s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.04% optimize.mutable_eliminate : 0.000596s : 4.82% optimize.opt_b.b_1 : 0.000124s : 1.01% optimize.opt_b.b_2 : 0.000008s : 0.06% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000019s : 0.15% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.14% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000028s : 0.22% optimize.loop_unroll : 0.000456s : 3.68% optimize.opt_after_cconv.c_1 : 0.000031s : 0.25% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.15% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.12% optimize.tuple_transform.d_1 : 0.000045s : 0.37% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.05% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000051s : 0.41% optimize.cse_after_recomputation.cse : 0.000011s : 0.09% optimize.environ_conv : 0.000005s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.04% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000003s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000013s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.03% optimize.overlap_grad_flash_sp : 0.000020s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000016s : 0.13% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000492s : 3.98% validate : 0.000041s : 0.33% Time group info: ------[substitution.] 0.000184 28 0.97% : 0.000002s : 2: substitution.elim_not_effective 0.69% : 0.000001s : 2: substitution.fold_const_symbol 3.33% : 0.000006s : 4: substitution.graph_param_transform 79.46% : 0.000146s : 4: substitution.inline 1.93% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.00% : 0.000006s : 4: substitution.remove_not_recompute_node 2.43% : 0.000004s : 4: substitution.replace_old_param 8.19% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006886 2 87.38% : 0.006017s : 1: type_inference.infer 12.62% : 0.000869s : 1: type_inference.specialize ------[replace.] 0.000061 8 63.51% : 0.000039s : 4: replace.inline 36.49% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000157 8 91.63% : 0.000144s : 4: match.inline 8.37% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000234 1278 0.83% : 0.000002s : 13: predicate.accumulaten_eliminater 0.74% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.45% : 0.000001s : 8: predicate.addn_check_dump 0.81% : 0.000002s : 13: predicate.addn_zero_filter 0.69% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.84% : 0.000004s : 21: predicate.arithmetic_simplify 0.91% : 0.000002s : 13: predicate.cast_eliminate 0.62% : 0.000001s : 8: predicate.check_bprop_eliminate 0.47% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.77% : 0.000002s : 8: predicate.depend_value_elim 0.85% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.93% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.73% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.84% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.41% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.00% : 0.000002s : 17: predicate.environ_add_const_eliminate 0.94% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_depend_swap 1.47% : 0.000003s : 25: predicate.environ_get_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.32% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.10% : 0.000005s : 21: predicate.float_depend_g_call 0.44% : 0.000001s : 8: predicate.float_environ_get_switch 0.68% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.53% : 0.000001s : 8: predicate.get_grad_eliminate 0.25% : 0.000001s : 4: predicate.graph_param_transform 0.54% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 5.58% : 0.000013s : 58: predicate.inline 0.85% : 0.000002s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.68% : 0.000002s : 8: predicate.less_batch_normalization 1.51% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.13% : 0.000005s : 38: predicate.load_eliminater 0.89% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.15% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.41% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.58% : 0.000001s : 8: predicate.merge_addn 0.49% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 8: predicate.mini_step_allgather_replace 10.99% : 0.000026s : 13: predicate.minmaximum_grad 1.24% : 0.000003s : 4: predicate.mutable_eliminate 0.33% : 0.000001s : 4: predicate.opt_reshape 0.29% : 0.000001s : 4: predicate.parallel_virtual_node 1.55% : 0.000004s : 21: predicate.partial_defer_inline 1.51% : 0.000004s : 21: predicate.partial_eliminate 0.78% : 0.000002s : 13: predicate.print_const_string_wrapper 0.54% : 0.000001s : 8: predicate.reduce_all_const_elim 1.01% : 0.000002s : 13: predicate.reduce_eliminate 2.22% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.53% : 0.000001s : 8: predicate.remove_not_recompute_node 1.19% : 0.000003s : 25: predicate.replace_applicator 0.46% : 0.000001s : 8: predicate.replace_old_param 0.33% : 0.000001s : 4: predicate.reset_defer_inline 0.83% : 0.000002s : 13: predicate.reshape_eliminate 0.57% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 4: predicate.row_tensor_eliminate 0.71% : 0.000002s : 8: predicate.same_eliminate 0.37% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.70% : 0.000002s : 8: predicate.shard_identity_eliminate 0.63% : 0.000001s : 8: predicate.special_op_eliminate 0.68% : 0.000002s : 8: predicate.specialize_transform 0.74% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.33% : 0.000003s : 21: predicate.switch_defer_inline 2.08% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.71% : 0.000011s : 67: predicate.switch_simplify 0.89% : 0.000002s : 13: predicate.tile_eliminate 0.78% : 0.000002s : 13: predicate.transpose_eliminate 1.51% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.24% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.04% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.39% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.00% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.57% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.16% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.66% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.27% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.53% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.48% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000655 11 52.99% : 0.000347s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.01% : 0.000308s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028110 192 0.01% : 0.000004s : 1: ForceFp32Comm 13.46% : 0.003783s : 1: add_attr 13.41% : 0.003770s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.20% : 0.000055s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.27% : 0.000077s : 1: auto_monad 0.07% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.88% : 0.000529s : 1: bootstrap 0.11% : 0.000031s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.08% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.10% : 0.000029s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.65% : 0.000464s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.15% : 0.000604s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 4.37% : 0.001229s : 78: opt.transform.opt_a 0.11% : 0.000030s : 1: opt.transform.opt_after_cconv 0.09% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.36% : 0.000101s : 28: opt.transform.opt_b 0.18% : 0.000049s : 2: opt.transform.opt_trans_graph 0.13% : 0.000037s : 4: opt.transform.symbol_engine_opt 10.24% : 0.002877s : 1: opt_a 0.38% : 0.000107s : 1: opt_after_cconv 1.78% : 0.000502s : 1: opt_after_jit_grad 0.74% : 0.000209s : 1: opt_b 17.95% : 0.005045s : 1: optimize 0.07% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000024s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.06% : 0.000016s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.16% : 0.000044s : 1: pre_auto_parallel 0.12% : 0.000033s : 1: py_interpret_to_execute 0.05% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.45% : 0.000406s : 1: renormalize.infer 1.22% : 0.000344s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000039s : 1: rewriter_after_opt_a 0.34% : 0.000095s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000006s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.29% : 0.000081s : 1: symbol_engine_optimizer 0.28% : 0.000080s : 1: tuple_transform 24.88% : 0.006994s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:07.447.767 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:07.448.074 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0181044, [21] [bootstrap]: 0.00048402 [type_inference]: 0.00638275 [event_method]: 2.216e-05 [auto_monad]: 6.838e-05 [graph_reusing]: 6.56e-06 [inline]: 2.56998e-06 [add_attr]: 0.00359963, [1] [add_attr_with_inline]: 0.00358866, [1] [Cycle 1]: 8.89e-05, [2] [tag_attr]: 2.304e-05 [meta_addattr_fg_expand]: 6.92002e-06 [parallel-infer-symbol]: 3.49001e-06 [pre_auto_parallel]: 4.049e-05 [insert-virtual-dataset]: 2.46e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.18002e-06 [pipeline_split]: 1.99e-06 [optimize]: 0.00618383, [53] [py_interpret_to_execute]: 3.467e-05 [rewriter_before_opt_a]: 9.925e-05 [opt_a]: 0.00363253, [2] [Cycle 1]: 0.00260971, [45] [expand_dump_flag]: 3.16001e-06 [switch_simplify]: 4.492e-05 [loop_unroll]: 3.435e-05 [a_1]: 0.00076302 [with_stream_mark]: 1.912e-05 [recompute_prepare]: 9.72001e-06 [updatestate_depend_eliminate]: 4.95001e-06 [updatestate_assign_eliminate]: 4.03999e-06 [updatestate_loads_eliminate]: 3.50998e-06 [parameter_eliminate]: 1.97001e-06 [a_2]: 0.0001299 [accelerated_algorithm]: 9.67001e-06 [shard]: 2.22999e-06 [meta_shard_fg_expand]: 1.86998e-06 [shard_inline]: 8.25999e-06 [merge_send_recv]: 1.003e-05 [auto_parallel]: 8.74998e-06 [parallel]: 1.918e-05 [flash_sp]: 9.70002e-06 [merge_comm]: 4.77e-06 [allreduce_fusion]: 4.32e-06 [matmul_add_comm_reduction]: 1.052e-05 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 9.29998e-06 [virtual_dataset]: 8.07e-06 [get_grad_eliminate_]: 7.38e-06 [virtual_output]: 7.66001e-06 [merge_forward]: 4.70001e-06 [cell_reuse_recompute_pass]: 1.19998e-06 [offload_activation]: 1.107e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.891e-05 [merge_recompute_call_nodes]: 1.55999e-06 [before_grad]: 1.376e-05 [set_forward_comm_id_for_comm_node_pass]: 4.97e-06 [meta_fg_expand]: 3.31001e-06 [flash_sp_send_recv_attached]: 3.50998e-06 [receive_attached]: 2.43e-06 [after_resolve]: 1.366e-05 [a_after_grad]: 1.247e-05 [renormalize]: 0.00084913 [add_forward_monad_depend]: 6.56e-06 [auto_monad_grad]: 2.83998e-06 [auto_monad_eliminator]: 1.807e-05 [cse]: 3.567e-05 [a_3]: 7.221e-05 [Cycle 2]: 0.0010089, [45] [expand_dump_flag]: 1.41998e-06 [switch_simplify]: 8.59e-06 [loop_unroll]: 7.31999e-06 [a_1]: 0.00020229 [with_stream_mark]: 1.537e-05 [recompute_prepare]: 8.45001e-06 [updatestate_depend_eliminate]: 4.50999e-06 [updatestate_assign_eliminate]: 3.3e-06 [updatestate_loads_eliminate]: 3.06001e-06 [parameter_eliminate]: 1.26997e-06 [a_2]: 0.00012839 [accelerated_algorithm]: 7.52002e-06 [shard]: 1.57999e-06 [meta_shard_fg_expand]: 1.67999e-06 [shard_inline]: 8.13001e-06 [merge_send_recv]: 6.84999e-06 [auto_parallel]: 6.91001e-06 [parallel]: 5.39e-06 [flash_sp]: 3.5e-06 [merge_comm]: 4.35999e-06 [allreduce_fusion]: 4.1e-06 [matmul_add_comm_reduction]: 7.68001e-06 [allreduce_slice_to_reducescatter]: 1.10001e-06 [virtual_shard_identity]: 9.14e-06 [virtual_dataset]: 8.13001e-06 [get_grad_eliminate_]: 7.83001e-06 [virtual_output]: 8.1e-06 [merge_forward]: 4.47e-06 [cell_reuse_recompute_pass]: 2.15002e-06 [offload_activation]: 9.24998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.926e-05 [merge_recompute_call_nodes]: 8.00006e-07 [before_grad]: 1.172e-05 [set_forward_comm_id_for_comm_node_pass]: 4.2e-06 [meta_fg_expand]: 2.89999e-06 [flash_sp_send_recv_attached]: 1.17e-06 [receive_attached]: 1.45999e-06 [after_resolve]: 1.325e-05 [a_after_grad]: 1.35e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.63998e-06 [auto_monad_grad]: 1.10999e-06 [auto_monad_eliminator]: 9.89001e-06 [cse]: 1.912e-05 [a_3]: 5.952e-05 [py_interpret_to_execute_after_opt_a]: 1.465e-05 [slice_cell_reuse_recomputed_activation]: 4.92e-06 [rewriter_after_opt_a]: 4.813e-05 [convert_after_rewriter]: 1.111e-05 [order_py_execute_after_rewriter]: 8.76002e-06 [mutable_eliminate]: 0.00061112 [opt_b]: 0.00031593, [1] [Cycle 1]: 0.00030607, [7] [b_1]: 0.00020007 [b_2]: 1.064e-05 [updatestate_depend_eliminate]: 6.54999e-06 [updatestate_assign_eliminate]: 3.13998e-06 [updatestate_loads_eliminate]: 3.46999e-06 [renormalize]: 5.49975e-07 [cse]: 2.277e-05 [optimize_parallel_all_gather_comm]: 2.059e-05 [overlap_param_gather]: 4.82998e-06 [cconv]: 3.058e-05 [loop_unroll]: 0.00047677 [opt_after_cconv]: 0.00014153, [1] [Cycle 1]: 0.00013231, [7] [c_1]: 3.769e-05 [parameter_eliminate]: 2.49001e-06 [updatestate_depend_eliminate]: 6.01998e-06 [updatestate_assign_eliminate]: 3.43e-06 [updatestate_loads_eliminate]: 3.12002e-06 [cse]: 2.305e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.973e-05 [tuple_transform]: 0.00010212, [1] [Cycle 1]: 9.474e-05, [4] [d_1]: 5.405e-05 [none_parameter_eliminate]: 1.59998e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 8.35999e-06 [partial_unused_args_eliminate]: 4.99e-06 [add_recomputation]: 6.067e-05 [cse_after_recomputation]: 3.083e-05, [1] [Cycle 1]: 2.4e-05, [1] [cse]: 1.422e-05 [environ_conv]: 9.67001e-06 [swap_dp_allreduce_reducescatter]: 8.32998e-06 [bias_add_comm_swap]: 5.25999e-06 [label_micro_interleaved_index]: 7.38e-06 [label_fine_grained_interleaved_index]: 5.82001e-06 [merge_cast_opt]: 4.2e-06 [slice_recompute_activation]: 4.94e-06 [micro_interleaved_order_control]: 4.92e-06 [assign_add_opt]: 3.79002e-06 [ForceFp32Comm]: 3.23e-06 [remove_cast_before_assign_add]: 3.84002e-06 [full_micro_interleaved_order_control]: 4.72998e-06 [reorder_send_recv_between_fp_bp]: 5.17999e-06 [comm_op_add_attrs]: 3.81999e-06 [add_comm_op_reuse_tag]: 3.57997e-06 [interleave_split_concat_branches]: 3.68e-06 [interleave_parallel_branches]: 3.58999e-06 [overlap_opt_shard_in_pipeline]: 3.53999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.38001e-06 [control_data_broadcast_order]: 1.742e-05 [grouped_pairwise_exchange_alltoall]: 4.02998e-06 [offloading_packed_experts]: 7e-06 [overlap_recompute_and_grad_model_parallel]: 8.07e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.60003e-06 [overlap_recompute_allgather_and_fa_grad]: 3.66001e-06 [overlap_recompute_comm]: 5.27001e-06 [overlap_grad_ring_attention]: 7.35998e-06 [overlap_grad_flash_sp]: 2.415e-05 [begin_end_overlap_inline]: 2.99999e-06 [split_matmul_comm_elemetwise]: 4.55001e-06 [split_layernorm_comm]: 4.30999e-06 [handle_group_info]: 3.6e-06 [symbol_engine_optimizer]: 0.00010375, [1] [Cycle 1]: 9.652e-05, [6] [build]: 3.21001e-06 [elim_shapecalc]: 1.105e-05 [elim_not_effective]: 1.546e-05 [opt_reshape]: 8.50999e-06 [fold_const_symbol]: 1.278e-05 [renormalize]: 2.30008e-07 [detach_backward]: 3.5e-06 [pipeline_parallel_scheduler]: 1.64e-06 [auto_monad_reorder]: 2.17e-05 [get_jit_bprop_graph]: 1.67001e-06 [rewriter_after_jit_bprop_graph]: 4.42e-06 [opt_after_jit_grad]: 0.00062766 [validate]: 4.583e-05 Sums bootstrap : 0.000484s : 3.81% type_inference : 0.006383s : 50.19% event_method : 0.000022s : 0.17% auto_monad : 0.000068s : 0.54% graph_reusing : 0.000007s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000040s : 0.32% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000035s : 0.27% optimize.rewriter_before_opt_a : 0.000099s : 0.78% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000054s : 0.42% optimize.opt_a.loop_unroll : 0.000042s : 0.33% optimize.opt_a.a_1 : 0.000965s : 7.59% optimize.opt_a.with_stream_mark : 0.000034s : 0.27% optimize.opt_a.recompute_prepare : 0.000018s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000258s : 2.03% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.14% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.13% optimize.opt_a.merge_send_recv : 0.000017s : 0.13% optimize.opt_a.auto_parallel : 0.000016s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.19% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.07% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.14% optimize.opt_a.virtual_dataset : 0.000016s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000016s : 0.12% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.21% optimize.opt_a.a_after_grad : 0.000026s : 0.20% optimize.opt_a.renormalize : 0.000849s : 6.68% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.22% optimize.opt_a.cse : 0.000055s : 0.43% optimize.opt_a.a_3 : 0.000132s : 1.04% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000048s : 0.38% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000611s : 4.81% optimize.opt_b.b_1 : 0.000200s : 1.57% optimize.opt_b.b_2 : 0.000011s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000023s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.16% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000031s : 0.24% optimize.loop_unroll : 0.000477s : 3.75% optimize.opt_after_cconv.c_1 : 0.000038s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000023s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.16% optimize.tuple_transform.d_1 : 0.000054s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000061s : 0.48% optimize.cse_after_recomputation.cse : 0.000014s : 0.11% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000017s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000024s : 0.19% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.17% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000628s : 4.94% validate : 0.000046s : 0.36% Time group info: ------[substitution.] 0.000228 38 10.59% : 0.000024s : 3: substitution.cast_eliminate 1.09% : 0.000002s : 3: substitution.elim_not_effective 0.86% : 0.000002s : 3: substitution.fold_const_symbol 3.11% : 0.000007s : 5: substitution.graph_param_transform 70.14% : 0.000160s : 4: substitution.inline 1.85% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.32% : 0.000008s : 6: substitution.remove_not_recompute_node 2.46% : 0.000006s : 4: substitution.replace_old_param 6.58% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006320 2 87.49% : 0.005530s : 1: type_inference.infer 12.51% : 0.000790s : 1: type_inference.specialize ------[replace.] 0.000065 8 61.06% : 0.000040s : 4: replace.inline 38.94% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000170 8 92.43% : 0.000157s : 4: match.inline 7.57% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000263 1596 0.99% : 0.000003s : 17: predicate.accumulaten_eliminater 0.78% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.58% : 0.000002s : 10: predicate.addn_check_dump 0.98% : 0.000003s : 17: predicate.addn_zero_filter 0.93% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.12% : 0.000006s : 27: predicate.arithmetic_simplify 1.02% : 0.000003s : 17: predicate.cast_eliminate 0.57% : 0.000001s : 10: predicate.check_bprop_eliminate 0.65% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.63% : 0.000002s : 10: predicate.depend_value_elim 0.97% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.03% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.98% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.02% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.35% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 22: predicate.environ_get_depend_swap 1.68% : 0.000004s : 32: predicate.environ_get_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.37% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.36% : 0.000006s : 25: predicate.float_depend_g_call 0.55% : 0.000001s : 10: predicate.float_environ_get_switch 0.84% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.64% : 0.000002s : 10: predicate.get_grad_eliminate 0.28% : 0.000001s : 5: predicate.graph_param_transform 0.59% : 0.000002s : 10: predicate.incorporate_call 0.54% : 0.000001s : 10: predicate.incorporate_call_switch 6.18% : 0.000016s : 72: predicate.inline 0.91% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.75% : 0.000002s : 10: predicate.less_batch_normalization 1.76% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.58% : 0.000007s : 48: predicate.load_eliminater 0.85% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.11% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.60% : 0.000002s : 10: predicate.merge_addn 0.56% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 17: predicate.minmaximum_grad 1.14% : 0.000003s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.33% : 0.000001s : 5: predicate.parallel_virtual_node 1.77% : 0.000005s : 25: predicate.partial_defer_inline 1.67% : 0.000004s : 26: predicate.partial_eliminate 0.93% : 0.000002s : 17: predicate.print_const_string_wrapper 0.61% : 0.000002s : 10: predicate.reduce_all_const_elim 1.28% : 0.000003s : 17: predicate.reduce_eliminate 2.59% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 10: predicate.remove_not_recompute_node 1.26% : 0.000003s : 31: predicate.replace_applicator 0.51% : 0.000001s : 10: predicate.replace_old_param 0.29% : 0.000001s : 5: predicate.reset_defer_inline 0.95% : 0.000002s : 17: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 5: predicate.row_tensor_eliminate 0.84% : 0.000002s : 10: predicate.same_eliminate 0.43% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.85% : 0.000002s : 10: predicate.shard_identity_eliminate 0.71% : 0.000002s : 10: predicate.special_op_eliminate 0.71% : 0.000002s : 10: predicate.specialize_transform 0.79% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.48% : 0.000004s : 25: predicate.switch_defer_inline 2.17% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.78% : 0.000013s : 76: predicate.switch_simplify 0.93% : 0.000002s : 17: predicate.tile_eliminate 0.98% : 0.000003s : 17: predicate.transpose_eliminate 1.56% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.62% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.25% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.61% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.76% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.40% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.16% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 5: predicate.value_based_eliminate 0.70% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.61% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000607 11 52.85% : 0.000321s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.15% : 0.000286s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030437 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.86% : 0.003611s : 1: add_attr 11.80% : 0.003593s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000064s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.25% : 0.000078s : 1: auto_monad 0.10% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.74% : 0.000530s : 1: bootstrap 0.11% : 0.000034s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.11% : 0.000034s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000018s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.11% : 0.000033s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.03% : 0.000011s : 1: label_micro_interleaved_index 1.59% : 0.000483s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 2.03% : 0.000618s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.05% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 4.94% : 0.001504s : 78: opt.transform.opt_a 0.12% : 0.000036s : 1: opt.transform.opt_after_cconv 0.10% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.45% : 0.000137s : 28: opt.transform.opt_b 0.20% : 0.000060s : 2: opt.transform.opt_trans_graph 0.14% : 0.000044s : 4: opt.transform.symbol_engine_opt 11.95% : 0.003636s : 1: opt_a 0.48% : 0.000145s : 1: opt_after_cconv 2.10% : 0.000639s : 1: opt_after_jit_grad 1.05% : 0.000319s : 1: opt_b 21.41% : 0.006517s : 1: optimize 0.08% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.09% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000011s : 1: overlap_grad_ring_attention 0.02% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000008s : 1: pipeline_split 0.16% : 0.000048s : 1: pre_auto_parallel 0.13% : 0.000039s : 1: py_interpret_to_execute 0.06% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.08% : 0.000023s : 1: remove_dup_value 1.59% : 0.000483s : 1: renormalize.infer 1.17% : 0.000357s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000052s : 1: rewriter_after_opt_a 0.34% : 0.000104s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000107s : 1: symbol_engine_optimizer 0.35% : 0.000105s : 1: tuple_transform 21.13% : 0.006431s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:07.755.273 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0194652, [21] [bootstrap]: 0.00044255 [type_inference]: 0.00678908 [event_method]: 2.467e-05 [auto_monad]: 7.964e-05 [graph_reusing]: 7.16999e-06 [inline]: 3.61999e-06 [add_attr]: 0.00433611, [1] [add_attr_with_inline]: 0.00432156, [1] [Cycle 1]: 8.761e-05, [2] [tag_attr]: 2.744e-05 [meta_addattr_fg_expand]: 7.13e-06 [parallel-infer-symbol]: 4e-06 [pre_auto_parallel]: 4.708e-05 [insert-virtual-dataset]: 2.93e-06 [parallel-infer-symbol-second]: 8.99978e-07 [dataset_repeat_opt]: 2.79999e-06 [pipeline_split]: 1.77999e-06 [optimize]: 0.00682245, [53] [py_interpret_to_execute]: 3.511e-05 [rewriter_before_opt_a]: 0.00013692 [opt_a]: 0.00388188, [2] [Cycle 1]: 0.00292828, [45] [expand_dump_flag]: 3.56001e-06 [switch_simplify]: 4.842e-05 [loop_unroll]: 3.295e-05 [a_1]: 0.00087031 [with_stream_mark]: 2.39e-05 [recompute_prepare]: 1.194e-05 [updatestate_depend_eliminate]: 5.72999e-06 [updatestate_assign_eliminate]: 4.03999e-06 [updatestate_loads_eliminate]: 4.32e-06 [parameter_eliminate]: 2.06e-06 [a_2]: 0.00011307 [accelerated_algorithm]: 9.43002e-06 [shard]: 2.31e-06 [meta_shard_fg_expand]: 3.08998e-06 [shard_inline]: 8.69998e-06 [merge_send_recv]: 1.207e-05 [auto_parallel]: 1.071e-05 [parallel]: 2.223e-05 [flash_sp]: 1.151e-05 [merge_comm]: 5.61e-06 [allreduce_fusion]: 4.77e-06 [matmul_add_comm_reduction]: 1.256e-05 [allreduce_slice_to_reducescatter]: 8.80013e-07 [virtual_shard_identity]: 1.252e-05 [virtual_dataset]: 8.72998e-06 [get_grad_eliminate_]: 8.55001e-06 [virtual_output]: 8.67e-06 [merge_forward]: 5.49998e-06 [cell_reuse_recompute_pass]: 1.75001e-06 [offload_activation]: 1.198e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.854e-05 [merge_recompute_call_nodes]: 1.78002e-06 [before_grad]: 1.488e-05 [set_forward_comm_id_for_comm_node_pass]: 5.32999e-06 [meta_fg_expand]: 3.91999e-06 [flash_sp_send_recv_attached]: 2.58998e-06 [receive_attached]: 3.04001e-06 [after_resolve]: 1.447e-05 [a_after_grad]: 1.356e-05 [renormalize]: 0.00110721 [add_forward_monad_depend]: 8.87999e-06 [auto_monad_grad]: 2.76999e-06 [auto_monad_eliminator]: 2.373e-05 [cse]: 4.207e-05 [a_3]: 7.527e-05 [Cycle 2]: 0.00093979, [45] [expand_dump_flag]: 2.87002e-06 [switch_simplify]: 1.186e-05 [loop_unroll]: 8.70999e-06 [a_1]: 0.0002156 [with_stream_mark]: 2.11e-05 [recompute_prepare]: 9.46e-06 [updatestate_depend_eliminate]: 5.49e-06 [updatestate_assign_eliminate]: 3.44001e-06 [updatestate_loads_eliminate]: 3.45e-06 [parameter_eliminate]: 2.00002e-06 [a_2]: 0.00010223 [accelerated_algorithm]: 9.58002e-06 [shard]: 2.81999e-06 [meta_shard_fg_expand]: 2.65997e-06 [shard_inline]: 8.90999e-06 [merge_send_recv]: 9.37999e-06 [auto_parallel]: 9.40001e-06 [parallel]: 8.50001e-06 [flash_sp]: 4.48999e-06 [merge_comm]: 4.92999e-06 [allreduce_fusion]: 4.35e-06 [matmul_add_comm_reduction]: 8.77e-06 [allreduce_slice_to_reducescatter]: 8.50006e-07 [virtual_shard_identity]: 1.055e-05 [virtual_dataset]: 7.65e-06 [get_grad_eliminate_]: 8.74003e-06 [virtual_output]: 7.73001e-06 [merge_forward]: 4.77998e-06 [cell_reuse_recompute_pass]: 3.18998e-06 [offload_activation]: 1.218e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.701e-05 [merge_recompute_call_nodes]: 9.39996e-07 [before_grad]: 1.398e-05 [set_forward_comm_id_for_comm_node_pass]: 5.30999e-06 [meta_fg_expand]: 3.3e-06 [flash_sp_send_recv_attached]: 2.06e-06 [receive_attached]: 2.26e-06 [after_resolve]: 1.418e-05 [a_after_grad]: 1.352e-05 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 2.46e-06 [auto_monad_grad]: 2.21998e-06 [auto_monad_eliminator]: 1.418e-05 [cse]: 2.824e-05 [a_3]: 5.354e-05 [py_interpret_to_execute_after_opt_a]: 3.264e-05 [slice_cell_reuse_recomputed_activation]: 2.69001e-06 [rewriter_after_opt_a]: 5.803e-05 [convert_after_rewriter]: 9.17999e-06 [order_py_execute_after_rewriter]: 6.29001e-06 [mutable_eliminate]: 0.00085367 [opt_b]: 0.00029467, [1] [Cycle 1]: 0.000285, [7] [b_1]: 0.00017435 [b_2]: 1.244e-05 [updatestate_depend_eliminate]: 1.042e-05 [updatestate_assign_eliminate]: 4e-06 [updatestate_loads_eliminate]: 3.63e-06 [renormalize]: 8.39995e-07 [cse]: 3.747e-05 [optimize_parallel_all_gather_comm]: 2.328e-05 [overlap_param_gather]: 2.29001e-06 [cconv]: 3.291e-05 [loop_unroll]: 0.00062799 [opt_after_cconv]: 0.00013474, [1] [Cycle 1]: 0.00012762, [7] [c_1]: 4.022e-05 [parameter_eliminate]: 4.87e-06 [updatestate_depend_eliminate]: 8.03001e-06 [updatestate_assign_eliminate]: 3.63e-06 [updatestate_loads_eliminate]: 3.13e-06 [cse]: 3.004e-05 [renormalize]: 5.10016e-07 [remove_dup_value]: 1.787e-05 [tuple_transform]: 0.00010119, [1] [Cycle 1]: 9.581e-05, [4] [d_1]: 6.277e-05 [none_parameter_eliminate]: 2.40002e-06 [renormalize]: 2.80008e-07 [switch_simplify]: 9.14998e-06 [partial_unused_args_eliminate]: 2.34001e-06 [add_recomputation]: 6.832e-05 [cse_after_recomputation]: 2.848e-05, [1] [Cycle 1]: 2.313e-05, [1] [cse]: 1.754e-05 [environ_conv]: 8.12998e-06 [swap_dp_allreduce_reducescatter]: 5.92999e-06 [bias_add_comm_swap]: 2.98e-06 [label_micro_interleaved_index]: 6.68e-06 [label_fine_grained_interleaved_index]: 3.45e-06 [merge_cast_opt]: 1.92999e-06 [slice_recompute_activation]: 2.22999e-06 [micro_interleaved_order_control]: 2.44999e-06 [assign_add_opt]: 1.42999e-06 [ForceFp32Comm]: 1.17999e-06 [remove_cast_before_assign_add]: 1.16002e-06 [full_micro_interleaved_order_control]: 2.73e-06 [reorder_send_recv_between_fp_bp]: 2.76999e-06 [comm_op_add_attrs]: 1.07998e-06 [add_comm_op_reuse_tag]: 1.04e-06 [interleave_split_concat_branches]: 1.32999e-06 [interleave_parallel_branches]: 1.19e-06 [overlap_opt_shard_in_pipeline]: 1.32999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.32999e-06 [control_data_broadcast_order]: 1.764e-05 [grouped_pairwise_exchange_alltoall]: 1.99e-06 [offloading_packed_experts]: 4.90001e-06 [overlap_recompute_and_grad_model_parallel]: 5.55001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.26002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.49e-06 [overlap_recompute_comm]: 2.59999e-06 [overlap_grad_ring_attention]: 4.79e-06 [overlap_grad_flash_sp]: 2.437e-05 [begin_end_overlap_inline]: 6.00005e-07 [split_matmul_comm_elemetwise]: 2.66999e-06 [split_layernorm_comm]: 1.77999e-06 [handle_group_info]: 1.20001e-06 [symbol_engine_optimizer]: 9.626e-05, [1] [Cycle 1]: 9.143e-05, [6] [build]: 5.13002e-06 [elim_shapecalc]: 1.327e-05 [elim_not_effective]: 1.847e-05 [opt_reshape]: 9.56e-06 [fold_const_symbol]: 1.366e-05 [renormalize]: 3.39991e-07 [detach_backward]: 2.46e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 2.151e-05 [get_jit_bprop_graph]: 2.21e-06 [rewriter_after_jit_bprop_graph]: 5.62001e-06 [opt_after_jit_grad]: 0.00062761 [validate]: 5.33e-05 Sums bootstrap : 0.000443s : 3.17% type_inference : 0.006789s : 48.56% event_method : 0.000025s : 0.18% auto_monad : 0.000080s : 0.57% graph_reusing : 0.000007s : 0.05% inline : 0.000004s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000047s : 0.34% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000003s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000035s : 0.25% optimize.rewriter_before_opt_a : 0.000137s : 0.98% optimize.opt_a.expand_dump_flag : 0.000006s : 0.05% optimize.opt_a.switch_simplify : 0.000060s : 0.43% optimize.opt_a.loop_unroll : 0.000042s : 0.30% optimize.opt_a.a_1 : 0.001086s : 7.77% optimize.opt_a.with_stream_mark : 0.000045s : 0.32% optimize.opt_a.recompute_prepare : 0.000021s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000215s : 1.54% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.14% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.04% optimize.opt_a.shard_inline : 0.000018s : 0.13% optimize.opt_a.merge_send_recv : 0.000021s : 0.15% optimize.opt_a.auto_parallel : 0.000020s : 0.14% optimize.opt_a.parallel : 0.000031s : 0.22% optimize.opt_a.flash_sp : 0.000016s : 0.11% optimize.opt_a.merge_comm : 0.000011s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.17% optimize.opt_a.virtual_dataset : 0.000016s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.12% optimize.opt_a.virtual_output : 0.000016s : 0.12% optimize.opt_a.merge_forward : 0.000010s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.04% optimize.opt_a.offload_activation : 0.000024s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000029s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.03% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000029s : 0.20% optimize.opt_a.a_after_grad : 0.000027s : 0.19% optimize.opt_a.renormalize : 0.001107s : 7.92% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.08% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000038s : 0.27% optimize.opt_a.cse : 0.000070s : 0.50% optimize.opt_a.a_3 : 0.000129s : 0.92% optimize.py_interpret_to_execute_after_opt_a : 0.000033s : 0.23% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.02% optimize.rewriter_after_opt_a : 0.000058s : 0.42% optimize.convert_after_rewriter : 0.000009s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.04% optimize.mutable_eliminate : 0.000854s : 6.11% optimize.opt_b.b_1 : 0.000174s : 1.25% optimize.opt_b.b_2 : 0.000012s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000037s : 0.27% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000033s : 0.24% optimize.loop_unroll : 0.000628s : 4.49% optimize.opt_after_cconv.c_1 : 0.000040s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000030s : 0.21% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.13% optimize.tuple_transform.d_1 : 0.000063s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000068s : 0.49% optimize.cse_after_recomputation.cse : 0.000018s : 0.13% optimize.environ_conv : 0.000008s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.04% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000018s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.03% optimize.overlap_grad_flash_sp : 0.000024s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.15% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.04% opt_after_jit_grad : 0.000628s : 4.49% validate : 0.000053s : 0.38% Time group info: ------[substitution.] 0.000265 38 11.76% : 0.000031s : 3: substitution.cast_eliminate 1.03% : 0.000003s : 3: substitution.elim_not_effective 0.62% : 0.000002s : 3: substitution.fold_const_symbol 2.70% : 0.000007s : 5: substitution.graph_param_transform 69.89% : 0.000185s : 4: substitution.inline 2.36% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.60% : 0.000007s : 6: substitution.remove_not_recompute_node 2.34% : 0.000006s : 4: substitution.replace_old_param 6.69% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006700 2 85.36% : 0.005719s : 1: type_inference.infer 14.64% : 0.000981s : 1: type_inference.specialize ------[replace.] 0.000074 8 60.05% : 0.000045s : 4: replace.inline 39.95% : 0.000030s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000198 8 92.18% : 0.000182s : 4: match.inline 7.82% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000310 1596 1.00% : 0.000003s : 17: predicate.accumulaten_eliminater 0.65% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.55% : 0.000002s : 10: predicate.addn_check_dump 0.99% : 0.000003s : 17: predicate.addn_zero_filter 0.89% : 0.000003s : 17: predicate.adjust_all_reduce_mul_add 2.25% : 0.000007s : 27: predicate.arithmetic_simplify 1.10% : 0.000003s : 17: predicate.cast_eliminate 0.59% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000002s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.63% : 0.000002s : 10: predicate.depend_value_elim 0.98% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.18% : 0.000004s : 17: predicate.dict_get_item_eliminator 0.90% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.80% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.18% : 0.000001s : 5: predicate.elim_not_effective 0.43% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000004s : 22: predicate.environ_add_const_eliminate 1.16% : 0.000004s : 22: predicate.environ_get_add_eliminate 1.07% : 0.000003s : 22: predicate.environ_get_depend_swap 1.77% : 0.000005s : 32: predicate.environ_get_eliminate 1.15% : 0.000004s : 22: predicate.environ_get_set_eliminate 1.31% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.18% : 0.000007s : 25: predicate.float_depend_g_call 0.57% : 0.000002s : 10: predicate.float_environ_get_switch 0.80% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000001s : 5: predicate.fold_const_symbol 0.72% : 0.000002s : 10: predicate.get_grad_eliminate 0.25% : 0.000001s : 5: predicate.graph_param_transform 0.52% : 0.000002s : 10: predicate.incorporate_call 0.46% : 0.000001s : 10: predicate.incorporate_call_switch 6.13% : 0.000019s : 72: predicate.inline 0.65% : 0.000002s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.99% : 0.000003s : 10: predicate.less_batch_normalization 1.82% : 0.000006s : 31: predicate.list_to_tuple_eliminator_ 2.86% : 0.000009s : 48: predicate.load_eliminater 1.15% : 0.000004s : 5: predicate.loop_unroll_after_grad 1.82% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.83% : 0.000006s : 27: predicate.make_slice_get_slice_eliminator 0.58% : 0.000002s : 10: predicate.merge_addn 0.58% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.61% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.93% : 0.000003s : 17: predicate.minmaximum_grad 1.78% : 0.000006s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.50% : 0.000002s : 5: predicate.parallel_virtual_node 1.72% : 0.000005s : 25: predicate.partial_defer_inline 1.52% : 0.000005s : 26: predicate.partial_eliminate 0.90% : 0.000003s : 17: predicate.print_const_string_wrapper 0.55% : 0.000002s : 10: predicate.reduce_all_const_elim 1.23% : 0.000004s : 17: predicate.reduce_eliminate 2.73% : 0.000008s : 48: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000001s : 10: predicate.remove_not_recompute_node 1.19% : 0.000004s : 31: predicate.replace_applicator 0.44% : 0.000001s : 10: predicate.replace_old_param 0.34% : 0.000001s : 5: predicate.reset_defer_inline 1.08% : 0.000003s : 17: predicate.reshape_eliminate 0.52% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 5: predicate.row_tensor_eliminate 0.87% : 0.000003s : 10: predicate.same_eliminate 0.43% : 0.000001s : 10: predicate.set_cell_output_no_recompute 1.07% : 0.000003s : 10: predicate.shard_identity_eliminate 0.66% : 0.000002s : 10: predicate.special_op_eliminate 0.69% : 0.000002s : 10: predicate.specialize_transform 0.90% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.42% : 0.000004s : 25: predicate.switch_defer_inline 2.00% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.41% : 0.000014s : 76: predicate.switch_simplify 0.95% : 0.000003s : 17: predicate.tile_eliminate 1.01% : 0.000003s : 17: predicate.transpose_eliminate 1.73% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000005s : 27: predicate.tuple_list_get_item_depend_reorder 3.31% : 0.000010s : 41: predicate.tuple_list_get_item_eliminator 1.51% : 0.000005s : 27: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.81% : 0.000006s : 31: predicate.tuple_to_list_eliminator_ 2.40% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.05% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 5: predicate.value_based_eliminate 0.58% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000693 11 48.70% : 0.000337s : 5: func_graph_cloner_run.FuncGraphClonerGraph 51.30% : 0.000356s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.033625 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.92% : 0.004344s : 1: add_attr 12.87% : 0.004326s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000073s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.26% : 0.000086s : 1: auto_monad 0.08% : 0.000026s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.41% : 0.000475s : 1: bootstrap 0.11% : 0.000037s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000021s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.09% : 0.000032s : 1: cse_after_recomputation 0.02% : 0.000006s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000012s : 1: environ_conv 0.10% : 0.000034s : 1: event_method 0.02% : 0.000006s : 1: full_micro_interleaved_order_control 0.02% : 0.000006s : 1: get_jit_bprop_graph 0.03% : 0.000011s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000007s : 1: inline 0.02% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000007s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.90% : 0.000638s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 2.58% : 0.000869s : 1: mutable_eliminate 0.02% : 0.000008s : 1: offloading_packed_experts 0.06% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.09% : 0.000030s : 1: opt.transform.mutable_eliminate 4.96% : 0.001669s : 78: opt.transform.opt_a 0.11% : 0.000039s : 1: opt.transform.opt_after_cconv 0.10% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.44% : 0.000149s : 28: opt.transform.opt_b 0.21% : 0.000069s : 2: opt.transform.opt_trans_graph 0.15% : 0.000051s : 4: opt.transform.symbol_engine_opt 11.56% : 0.003887s : 1: opt_a 0.41% : 0.000139s : 1: opt_after_cconv 1.90% : 0.000639s : 1: opt_after_jit_grad 0.89% : 0.000300s : 1: opt_b 20.31% : 0.006829s : 1: optimize 0.08% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000010s : 1: order_py_execute_after_rewriter 0.08% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.15% : 0.000052s : 1: pre_auto_parallel 0.12% : 0.000040s : 1: py_interpret_to_execute 0.11% : 0.000037s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.06% : 0.000021s : 1: remove_dup_value 1.83% : 0.000616s : 1: renormalize.infer 1.42% : 0.000476s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000063s : 1: rewriter_after_opt_a 0.42% : 0.000142s : 1: rewriter_before_opt_a 0.02% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000006s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.29% : 0.000099s : 1: symbol_engine_optimizer 0.31% : 0.000104s : 1: tuple_transform 20.28% : 0.006819s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:08.875.08 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:08.878.02 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0208541, [21] [bootstrap]: 0.00049344 [type_inference]: 0.00721145 [event_method]: 2.437e-05 [auto_monad]: 7.268e-05 [graph_reusing]: 6.53998e-06 [inline]: 3.28e-06 [add_attr]: 0.0040879, [1] [add_attr_with_inline]: 0.00407415, [1] [Cycle 1]: 0.00010278, [2] [tag_attr]: 2.507e-05 [meta_addattr_fg_expand]: 6.47001e-06 [parallel-infer-symbol]: 3.56999e-06 [pre_auto_parallel]: 4.399e-05 [insert-virtual-dataset]: 2.61e-06 [parallel-infer-symbol-second]: 9.50007e-07 [dataset_repeat_opt]: 2.01e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.00729414, [53] [py_interpret_to_execute]: 3.799e-05 [rewriter_before_opt_a]: 0.00010759 [opt_a]: 0.00414482, [2] [Cycle 1]: 0.00305892, [45] [expand_dump_flag]: 3.45998e-06 [switch_simplify]: 4.302e-05 [loop_unroll]: 3.306e-05 [a_1]: 0.00073963 [with_stream_mark]: 2.008e-05 [recompute_prepare]: 1.219e-05 [updatestate_depend_eliminate]: 5.07e-06 [updatestate_assign_eliminate]: 4.80001e-06 [updatestate_loads_eliminate]: 3.70998e-06 [parameter_eliminate]: 2.32001e-06 [a_2]: 0.00013526 [accelerated_algorithm]: 1.061e-05 [shard]: 1.96e-06 [meta_shard_fg_expand]: 2.59001e-06 [shard_inline]: 8.47e-06 [merge_send_recv]: 9.98998e-06 [auto_parallel]: 1.006e-05 [parallel]: 2.08e-05 [flash_sp]: 1.166e-05 [merge_comm]: 4.77e-06 [allreduce_fusion]: 4.32e-06 [matmul_add_comm_reduction]: 1.1e-05 [allreduce_slice_to_reducescatter]: 5.69999e-07 [virtual_shard_identity]: 1.171e-05 [virtual_dataset]: 9.00001e-06 [get_grad_eliminate_]: 7.83999e-06 [virtual_output]: 8.65001e-06 [merge_forward]: 4.63001e-06 [cell_reuse_recompute_pass]: 1.99999e-06 [offload_activation]: 1.211e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.084e-05 [merge_recompute_call_nodes]: 2.04e-06 [before_grad]: 1.432e-05 [set_forward_comm_id_for_comm_node_pass]: 5.38002e-06 [meta_fg_expand]: 3.93001e-06 [flash_sp_send_recv_attached]: 3.57997e-06 [receive_attached]: 1.98002e-06 [after_resolve]: 1.645e-05 [a_after_grad]: 1.309e-05 [renormalize]: 0.00121073 [add_forward_monad_depend]: 1.08e-05 [auto_monad_grad]: 3.13e-06 [auto_monad_eliminator]: 2.406e-05 [cse]: 4.113e-05 [a_3]: 8.676e-05 [Cycle 2]: 0.00106692, [45] [expand_dump_flag]: 2.16e-06 [switch_simplify]: 1.167e-05 [loop_unroll]: 8.25e-06 [a_1]: 0.00019709 [with_stream_mark]: 1.982e-05 [recompute_prepare]: 8.91997e-06 [updatestate_depend_eliminate]: 4.58999e-06 [updatestate_assign_eliminate]: 3.65003e-06 [updatestate_loads_eliminate]: 3.47002e-06 [parameter_eliminate]: 1.59e-06 [a_2]: 0.00012412 [accelerated_algorithm]: 8.42e-06 [shard]: 2.82002e-06 [meta_shard_fg_expand]: 2.74999e-06 [shard_inline]: 1.336e-05 [merge_send_recv]: 8.72998e-06 [auto_parallel]: 9.81e-06 [parallel]: 8.42e-06 [flash_sp]: 4.20999e-06 [merge_comm]: 4.94e-06 [allreduce_fusion]: 4.30999e-06 [matmul_add_comm_reduction]: 9.77001e-06 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 9.61e-06 [virtual_dataset]: 8.72e-06 [get_grad_eliminate_]: 7.73999e-06 [virtual_output]: 7.46999e-06 [merge_forward]: 5.25999e-06 [cell_reuse_recompute_pass]: 3.93999e-06 [offload_activation]: 1.098e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.031e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.466e-05 [set_forward_comm_id_for_comm_node_pass]: 4.82e-06 [meta_fg_expand]: 3.68999e-06 [flash_sp_send_recv_attached]: 1.61998e-06 [receive_attached]: 1.89e-06 [after_resolve]: 1.448e-05 [a_after_grad]: 1.249e-05 [renormalize]: 1.59984e-07 [add_forward_monad_depend]: 1.88002e-06 [auto_monad_grad]: 1.27e-06 [auto_monad_eliminator]: 1.327e-05 [cse]: 2.622e-05 [a_3]: 6.195e-05 [py_interpret_to_execute_after_opt_a]: 2.417e-05 [slice_cell_reuse_recomputed_activation]: 5.22999e-06 [rewriter_after_opt_a]: 5.708e-05 [convert_after_rewriter]: 1.234e-05 [order_py_execute_after_rewriter]: 9.88998e-06 [mutable_eliminate]: 0.00083747 [opt_b]: 0.00035168, [1] [Cycle 1]: 0.00033904, [7] [b_1]: 0.00021435 [b_2]: 1.077e-05 [updatestate_depend_eliminate]: 1e-05 [updatestate_assign_eliminate]: 3.75e-06 [updatestate_loads_eliminate]: 3.64002e-06 [renormalize]: 8.2e-07 [cse]: 3.623e-05 [optimize_parallel_all_gather_comm]: 2.764e-05 [overlap_param_gather]: 6.26e-06 [cconv]: 4.131e-05 [loop_unroll]: 0.00064809 [opt_after_cconv]: 0.00016755, [1] [Cycle 1]: 0.00015643, [7] [c_1]: 4.178e-05 [parameter_eliminate]: 6.98e-06 [updatestate_depend_eliminate]: 9.39e-06 [updatestate_assign_eliminate]: 4.14002e-06 [updatestate_loads_eliminate]: 3.41001e-06 [cse]: 3.252e-05 [renormalize]: 8.39995e-07 [remove_dup_value]: 2.373e-05 [tuple_transform]: 0.00011864, [1] [Cycle 1]: 0.00011084, [4] [d_1]: 6.532e-05 [none_parameter_eliminate]: 1.62999e-06 [renormalize]: 2.89991e-07 [switch_simplify]: 9.82999e-06 [partial_unused_args_eliminate]: 5.36998e-06 [add_recomputation]: 6.949e-05 [cse_after_recomputation]: 3.547e-05, [1] [Cycle 1]: 2.797e-05, [1] [cse]: 1.725e-05 [environ_conv]: 1.081e-05 [swap_dp_allreduce_reducescatter]: 9.30001e-06 [bias_add_comm_swap]: 6.07001e-06 [label_micro_interleaved_index]: 9.56998e-06 [label_fine_grained_interleaved_index]: 6.02001e-06 [merge_cast_opt]: 4.45999e-06 [slice_recompute_activation]: 4.79e-06 [micro_interleaved_order_control]: 5.22e-06 [assign_add_opt]: 3.8e-06 [ForceFp32Comm]: 3.29001e-06 [remove_cast_before_assign_add]: 3.45e-06 [full_micro_interleaved_order_control]: 5.15999e-06 [reorder_send_recv_between_fp_bp]: 6.02999e-06 [comm_op_add_attrs]: 3.55e-06 [add_comm_op_reuse_tag]: 3.34001e-06 [interleave_split_concat_branches]: 3.56999e-06 [interleave_parallel_branches]: 3.58999e-06 [overlap_opt_shard_in_pipeline]: 3.66999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.45e-06 [control_data_broadcast_order]: 1.871e-05 [grouped_pairwise_exchange_alltoall]: 3.82998e-06 [offloading_packed_experts]: 7.35e-06 [overlap_recompute_and_grad_model_parallel]: 8.37998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.63e-06 [overlap_recompute_allgather_and_fa_grad]: 3.91999e-06 [overlap_recompute_comm]: 4.60999e-06 [overlap_grad_ring_attention]: 6.98e-06 [overlap_grad_flash_sp]: 2.951e-05 [begin_end_overlap_inline]: 3.33e-06 [split_matmul_comm_elemetwise]: 5.59e-06 [split_layernorm_comm]: 4.58001e-06 [handle_group_info]: 3.4e-06 [symbol_engine_optimizer]: 0.00012346, [1] [Cycle 1]: 0.00011487, [6] [build]: 4.08001e-06 [elim_shapecalc]: 1.457e-05 [elim_not_effective]: 1.868e-05 [opt_reshape]: 9.94001e-06 [fold_const_symbol]: 1.349e-05 [renormalize]: 5.10016e-07 [detach_backward]: 6.42001e-06 [pipeline_parallel_scheduler]: 2.36e-06 [auto_monad_reorder]: 2.772e-05 [get_jit_bprop_graph]: 2.56998e-06 [rewriter_after_jit_bprop_graph]: 6.57002e-06 [opt_after_jit_grad]: 0.00079054 [validate]: 5.805e-05 Sums bootstrap : 0.000493s : 3.34% type_inference : 0.007211s : 48.87% event_method : 0.000024s : 0.17% auto_monad : 0.000073s : 0.49% graph_reusing : 0.000007s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000044s : 0.30% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000038s : 0.26% optimize.rewriter_before_opt_a : 0.000108s : 0.73% optimize.opt_a.expand_dump_flag : 0.000006s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.37% optimize.opt_a.loop_unroll : 0.000041s : 0.28% optimize.opt_a.a_1 : 0.000937s : 6.35% optimize.opt_a.with_stream_mark : 0.000040s : 0.27% optimize.opt_a.recompute_prepare : 0.000021s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000259s : 1.76% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.13% optimize.opt_a.shard : 0.000005s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000022s : 0.15% optimize.opt_a.merge_send_recv : 0.000019s : 0.13% optimize.opt_a.auto_parallel : 0.000020s : 0.13% optimize.opt_a.parallel : 0.000029s : 0.20% optimize.opt_a.flash_sp : 0.000016s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.07% optimize.opt_a.allreduce_fusion : 0.000009s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.14% optimize.opt_a.virtual_dataset : 0.000018s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.11% optimize.opt_a.virtual_output : 0.000016s : 0.11% optimize.opt_a.merge_forward : 0.000010s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.04% optimize.opt_a.offload_activation : 0.000023s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000041s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.02% optimize.opt_a.before_grad : 0.000029s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.07% optimize.opt_a.meta_fg_expand : 0.000008s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000031s : 0.21% optimize.opt_a.a_after_grad : 0.000026s : 0.17% optimize.opt_a.renormalize : 0.001211s : 8.21% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.09% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000037s : 0.25% optimize.opt_a.cse : 0.000067s : 0.46% optimize.opt_a.a_3 : 0.000149s : 1.01% optimize.py_interpret_to_execute_after_opt_a : 0.000024s : 0.16% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000057s : 0.39% optimize.convert_after_rewriter : 0.000012s : 0.08% optimize.order_py_execute_after_rewriter : 0.000010s : 0.07% optimize.mutable_eliminate : 0.000837s : 5.68% optimize.opt_b.b_1 : 0.000214s : 1.45% optimize.opt_b.b_2 : 0.000011s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000036s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.19% optimize.overlap_param_gather : 0.000006s : 0.04% optimize.cconv : 0.000041s : 0.28% optimize.loop_unroll : 0.000648s : 4.39% optimize.opt_after_cconv.c_1 : 0.000042s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000033s : 0.22% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000024s : 0.16% optimize.tuple_transform.d_1 : 0.000065s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000069s : 0.47% optimize.cse_after_recomputation.cse : 0.000017s : 0.12% optimize.environ_conv : 0.000011s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.06% optimize.bias_add_comm_swap : 0.000006s : 0.04% optimize.label_micro_interleaved_index : 0.000010s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000003s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.02% optimize.interleave_parallel_branches : 0.000004s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000019s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.03% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000030s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000006s : 0.04% optimize.split_layernorm_comm : 0.000005s : 0.03% optimize.handle_group_info : 0.000003s : 0.02% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000006s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000028s : 0.19% get_jit_bprop_graph : 0.000003s : 0.02% rewriter_after_jit_bprop_graph : 0.000007s : 0.04% opt_after_jit_grad : 0.000791s : 5.36% validate : 0.000058s : 0.39% Time group info: ------[substitution.] 0.000261 38 12.64% : 0.000033s : 3: substitution.cast_eliminate 1.19% : 0.000003s : 3: substitution.elim_not_effective 0.61% : 0.000002s : 3: substitution.fold_const_symbol 2.84% : 0.000007s : 5: substitution.graph_param_transform 69.42% : 0.000181s : 4: substitution.inline 2.19% : 0.000006s : 6: substitution.j_node_and_user_rematch 3.13% : 0.000008s : 6: substitution.remove_not_recompute_node 2.96% : 0.000008s : 4: substitution.replace_old_param 5.02% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007137 2 87.41% : 0.006239s : 1: type_inference.infer 12.59% : 0.000899s : 1: type_inference.specialize ------[replace.] 0.000069 8 63.37% : 0.000044s : 4: replace.inline 36.63% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000189 8 94.11% : 0.000178s : 4: match.inline 5.89% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000273 1504 0.89% : 0.000002s : 15: predicate.accumulaten_eliminater 1.34% : 0.000004s : 5: predicate.ad_related_special_op_eliminate 0.60% : 0.000002s : 10: predicate.addn_check_dump 0.83% : 0.000002s : 15: predicate.addn_zero_filter 0.78% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.11% : 0.000006s : 25: predicate.arithmetic_simplify 0.96% : 0.000003s : 15: predicate.cast_eliminate 0.67% : 0.000002s : 10: predicate.check_bprop_eliminate 0.58% : 0.000002s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.76% : 0.000002s : 10: predicate.depend_value_elim 0.85% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.11% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.07% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 5: predicate.elim_not_effective 0.38% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.07% : 0.000003s : 20: predicate.environ_get_depend_swap 1.75% : 0.000005s : 30: predicate.environ_get_eliminate 1.44% : 0.000004s : 20: predicate.environ_get_set_eliminate 1.26% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.09% : 0.000006s : 23: predicate.float_depend_g_call 0.57% : 0.000002s : 10: predicate.float_environ_get_switch 0.86% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 5: predicate.fold_const_symbol 0.73% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.60% : 0.000002s : 10: predicate.incorporate_call 0.49% : 0.000001s : 10: predicate.incorporate_call_switch 5.96% : 0.000016s : 68: predicate.inline 0.89% : 0.000002s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.14% : 0.000003s : 10: predicate.less_batch_normalization 1.81% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.61% : 0.000007s : 44: predicate.load_eliminater 1.02% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.97% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.74% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 0.64% : 0.000002s : 10: predicate.merge_addn 0.72% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.65% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.93% : 0.000003s : 15: predicate.minmaximum_grad 1.31% : 0.000004s : 5: predicate.mutable_eliminate 0.45% : 0.000001s : 5: predicate.opt_reshape 0.43% : 0.000001s : 5: predicate.parallel_virtual_node 1.58% : 0.000004s : 23: predicate.partial_defer_inline 1.42% : 0.000004s : 24: predicate.partial_eliminate 0.83% : 0.000002s : 15: predicate.print_const_string_wrapper 0.66% : 0.000002s : 10: predicate.reduce_all_const_elim 1.13% : 0.000003s : 15: predicate.reduce_eliminate 2.47% : 0.000007s : 44: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 10: predicate.remove_not_recompute_node 1.22% : 0.000003s : 29: predicate.replace_applicator 0.41% : 0.000001s : 10: predicate.replace_old_param 0.29% : 0.000001s : 5: predicate.reset_defer_inline 1.09% : 0.000003s : 15: predicate.reshape_eliminate 0.62% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 5: predicate.row_tensor_eliminate 0.75% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.88% : 0.000002s : 10: predicate.shard_identity_eliminate 0.83% : 0.000002s : 10: predicate.special_op_eliminate 0.67% : 0.000002s : 10: predicate.specialize_transform 1.12% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.33% : 0.000004s : 23: predicate.switch_defer_inline 1.96% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.61% : 0.000013s : 74: predicate.switch_simplify 0.88% : 0.000002s : 15: predicate.tile_eliminate 0.86% : 0.000002s : 15: predicate.transpose_eliminate 1.72% : 0.000005s : 25: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.55% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 2.00% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.33% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 2.97% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 5: predicate.value_based_eliminate 0.70% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 10: predicate.virtual_output_eliminate 0.30% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000726 11 50.54% : 0.000367s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.46% : 0.000359s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.035181 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.66% : 0.004101s : 1: add_attr 11.59% : 0.004079s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000074s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.24% : 0.000086s : 1: auto_monad 0.10% : 0.000036s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.54% : 0.000541s : 1: bootstrap 0.13% : 0.000045s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.06% : 0.000022s : 1: control_data_broadcast_order 0.04% : 0.000015s : 1: convert_after_rewriter 0.11% : 0.000039s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.08% : 0.000030s : 1: detach_backward 0.04% : 0.000014s : 1: environ_conv 0.10% : 0.000036s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000014s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000010s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.03% : 0.000012s : 1: label_micro_interleaved_index 1.86% : 0.000656s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 2.41% : 0.000848s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.07% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000026s : 1: opt.transform.mutable_eliminate 4.28% : 0.001507s : 78: opt.transform.opt_a 0.11% : 0.000040s : 1: opt.transform.opt_after_cconv 0.12% : 0.000044s : 1: opt.transform.opt_after_jit_grad 0.41% : 0.000146s : 28: opt.transform.opt_b 0.21% : 0.000072s : 2: opt.transform.opt_trans_graph 0.15% : 0.000052s : 4: opt.transform.symbol_engine_opt 11.79% : 0.004149s : 1: opt_a 0.49% : 0.000172s : 1: opt_after_cconv 2.29% : 0.000806s : 1: opt_after_jit_grad 1.01% : 0.000356s : 1: opt_b 21.89% : 0.007700s : 1: optimize 0.09% : 0.000031s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.09% : 0.000033s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000007s : 1: overlap_recompute_comm 0.03% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000009s : 1: partial_unused_args_eliminate 0.03% : 0.000011s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000051s : 1: pre_auto_parallel 0.12% : 0.000042s : 1: py_interpret_to_execute 0.08% : 0.000029s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.08% : 0.000027s : 1: remove_dup_value 2.07% : 0.000727s : 1: renormalize.infer 1.34% : 0.000471s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000062s : 1: rewriter_after_opt_a 0.32% : 0.000112s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.03% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000126s : 1: symbol_engine_optimizer 0.35% : 0.000122s : 1: tuple_transform 20.67% : 0.007271s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:08.302.500 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0458311, [21] [bootstrap]: 0.00045121 [type_inference]: 0.00613348 [event_method]: 2.3e-05 [auto_monad]: 7.036e-05 [graph_reusing]: 5.71998e-06 [inline]: 2.79001e-06 [add_attr]: 0.00497071, [1] [add_attr_with_inline]: 0.00495711, [1] [Cycle 1]: 7.477e-05, [2] [tag_attr]: 2.537e-05 [meta_addattr_fg_expand]: 5.99999e-06 [parallel-infer-symbol]: 3.9e-06 [pre_auto_parallel]: 4.267e-05 [insert-virtual-dataset]: 2.98003e-06 [parallel-infer-symbol-second]: 9.79984e-07 [dataset_repeat_opt]: 1.82001e-06 [pipeline_split]: 1.76003e-06 [optimize]: 0.0333307, [53] [py_interpret_to_execute]: 3.131e-05 [rewriter_before_opt_a]: 9.511e-05 [opt_a]: 0.0307723, [2] [Cycle 1]: 0.0298478, [45] [expand_dump_flag]: 2.79001e-06 [switch_simplify]: 4.444e-05 [loop_unroll]: 3.161e-05 [a_1]: 0.00071285 [with_stream_mark]: 2.212e-05 [recompute_prepare]: 1.319e-05 [updatestate_depend_eliminate]: 4.40999e-06 [updatestate_assign_eliminate]: 4.42e-06 [updatestate_loads_eliminate]: 3.48e-06 [parameter_eliminate]: 1.78002e-06 [a_2]: 0.00010486 [accelerated_algorithm]: 9.15001e-06 [shard]: 1.96998e-06 [meta_shard_fg_expand]: 2.44001e-06 [shard_inline]: 8.25999e-06 [merge_send_recv]: 9.37001e-06 [auto_parallel]: 8.90001e-06 [parallel]: 2.119e-05 [flash_sp]: 1.023e-05 [merge_comm]: 4.79e-06 [allreduce_fusion]: 4.53001e-06 [matmul_add_comm_reduction]: 1.227e-05 [allreduce_slice_to_reducescatter]: 8.09989e-07 [virtual_shard_identity]: 1.1e-05 [virtual_dataset]: 8.22e-06 [get_grad_eliminate_]: 7.66001e-06 [virtual_output]: 8.08999e-06 [merge_forward]: 4.89998e-06 [cell_reuse_recompute_pass]: 1.66e-06 [offload_activation]: 1.08e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.863e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.421e-05 [set_forward_comm_id_for_comm_node_pass]: 4.63999e-06 [meta_fg_expand]: 3.38999e-06 [flash_sp_send_recv_attached]: 2.84001e-06 [receive_attached]: 2.32999e-06 [after_resolve]: 1.425e-05 [a_after_grad]: 1.292e-05 [renormalize]: 0.0282244 [add_forward_monad_depend]: 1.148e-05 [auto_monad_grad]: 3.04001e-06 [auto_monad_eliminator]: 2.9e-05 [cse]: 4.264e-05 [a_3]: 7.58e-05 [Cycle 2]: 0.0009122, [45] [expand_dump_flag]: 2.04e-06 [switch_simplify]: 1.132e-05 [loop_unroll]: 7.86001e-06 [a_1]: 0.00019854 [with_stream_mark]: 2.171e-05 [recompute_prepare]: 9.10999e-06 [updatestate_depend_eliminate]: 5.88002e-06 [updatestate_assign_eliminate]: 3.74002e-06 [updatestate_loads_eliminate]: 3.88001e-06 [parameter_eliminate]: 1.99e-06 [a_2]: 9.521e-05 [accelerated_algorithm]: 9.62001e-06 [shard]: 2.64999e-06 [meta_shard_fg_expand]: 2.90002e-06 [shard_inline]: 8.38001e-06 [merge_send_recv]: 1.142e-05 [auto_parallel]: 1.069e-05 [parallel]: 9.72001e-06 [flash_sp]: 4.23999e-06 [merge_comm]: 5.10999e-06 [allreduce_fusion]: 7.09001e-06 [matmul_add_comm_reduction]: 1.138e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 1.113e-05 [virtual_dataset]: 7.71999e-06 [get_grad_eliminate_]: 7.32002e-06 [virtual_output]: 7.13998e-06 [merge_forward]: 5.64998e-06 [cell_reuse_recompute_pass]: 3.34001e-06 [offload_activation]: 1.184e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.708e-05 [merge_recompute_call_nodes]: 1.68002e-06 [before_grad]: 2.558e-05 [set_forward_comm_id_for_comm_node_pass]: 6.56999e-06 [meta_fg_expand]: 3.34001e-06 [flash_sp_send_recv_attached]: 1.94e-06 [receive_attached]: 2.73998e-06 [after_resolve]: 1.663e-05 [a_after_grad]: 1.29e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.56998e-06 [auto_monad_grad]: 2.17001e-06 [auto_monad_eliminator]: 1.216e-05 [cse]: 2.259e-05 [a_3]: 4.881e-05 [py_interpret_to_execute_after_opt_a]: 2.11e-05 [slice_cell_reuse_recomputed_activation]: 1.85001e-06 [rewriter_after_opt_a]: 4.869e-05 [convert_after_rewriter]: 8.53001e-06 [order_py_execute_after_rewriter]: 6.46e-06 [mutable_eliminate]: 0.00075327 [opt_b]: 0.00026864, [1] [Cycle 1]: 0.00026084, [7] [b_1]: 0.00016139 [b_2]: 1.083e-05 [updatestate_depend_eliminate]: 1.104e-05 [updatestate_assign_eliminate]: 3.33e-06 [updatestate_loads_eliminate]: 3.29001e-06 [renormalize]: 8.39995e-07 [cse]: 3.073e-05 [optimize_parallel_all_gather_comm]: 2.164e-05 [overlap_param_gather]: 2.02999e-06 [cconv]: 3.359e-05 [loop_unroll]: 0.00048291 [opt_after_cconv]: 0.0001275, [1] [Cycle 1]: 0.00012111, [7] [c_1]: 3.917e-05 [parameter_eliminate]: 4.57e-06 [updatestate_depend_eliminate]: 8.07e-06 [updatestate_assign_eliminate]: 3.36999e-06 [updatestate_loads_eliminate]: 3.04999e-06 [cse]: 2.638e-05 [renormalize]: 2.69996e-07 [remove_dup_value]: 1.734e-05 [tuple_transform]: 9.225e-05, [1] [Cycle 1]: 8.763e-05, [4] [d_1]: 5.687e-05 [none_parameter_eliminate]: 2.71999e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 8.79998e-06 [partial_unused_args_eliminate]: 2.19001e-06 [add_recomputation]: 6.477e-05 [cse_after_recomputation]: 2.76e-05, [1] [Cycle 1]: 2.228e-05, [1] [cse]: 1.607e-05 [environ_conv]: 6.66e-06 [swap_dp_allreduce_reducescatter]: 6.36e-06 [bias_add_comm_swap]: 3.19001e-06 [label_micro_interleaved_index]: 5.06002e-06 [label_fine_grained_interleaved_index]: 2.83998e-06 [merge_cast_opt]: 1.60999e-06 [slice_recompute_activation]: 2.07999e-06 [micro_interleaved_order_control]: 2.61999e-06 [assign_add_opt]: 1.17e-06 [ForceFp32Comm]: 8.60018e-07 [remove_cast_before_assign_add]: 1.02e-06 [full_micro_interleaved_order_control]: 2.71e-06 [reorder_send_recv_between_fp_bp]: 2.93998e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 1.24e-06 [interleave_split_concat_branches]: 1.19998e-06 [interleave_parallel_branches]: 1.09998e-06 [overlap_opt_shard_in_pipeline]: 1.29e-06 [overlap_opt_shard_grad_in_pipeline]: 2.01e-06 [control_data_broadcast_order]: 1.643e-05 [grouped_pairwise_exchange_alltoall]: 2.02001e-06 [offloading_packed_experts]: 4.87e-06 [overlap_recompute_and_grad_model_parallel]: 5.97001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.56002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.26998e-06 [overlap_grad_ring_attention]: 5.10001e-06 [overlap_grad_flash_sp]: 2.602e-05 [begin_end_overlap_inline]: 5.60016e-07 [split_matmul_comm_elemetwise]: 2.36e-06 [split_layernorm_comm]: 1.66998e-06 [handle_group_info]: 9.39996e-07 [symbol_engine_optimizer]: 9.767e-05, [1] [Cycle 1]: 9.206e-05, [6] [build]: 5.15999e-06 [elim_shapecalc]: 1.584e-05 [elim_not_effective]: 1.629e-05 [opt_reshape]: 8.73001e-06 [fold_const_symbol]: 1.286e-05 [renormalize]: 3.4002e-07 [detach_backward]: 2.22001e-06 [pipeline_parallel_scheduler]: 1.54e-06 [auto_monad_reorder]: 2.228e-05 [get_jit_bprop_graph]: 2.24001e-06 [rewriter_after_jit_bprop_graph]: 5.64e-06 [opt_after_jit_grad]: 0.00050732 [validate]: 4.996e-05 Sums bootstrap : 0.000451s : 1.14% type_inference : 0.006133s : 15.43% event_method : 0.000023s : 0.06% auto_monad : 0.000070s : 0.18% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000043s : 0.11% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000031s : 0.08% optimize.rewriter_before_opt_a : 0.000095s : 0.24% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000056s : 0.14% optimize.opt_a.loop_unroll : 0.000039s : 0.10% optimize.opt_a.a_1 : 0.000911s : 2.29% optimize.opt_a.with_stream_mark : 0.000044s : 0.11% optimize.opt_a.recompute_prepare : 0.000022s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000200s : 0.50% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.05% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000017s : 0.04% optimize.opt_a.merge_send_recv : 0.000021s : 0.05% optimize.opt_a.auto_parallel : 0.000020s : 0.05% optimize.opt_a.parallel : 0.000031s : 0.08% optimize.opt_a.flash_sp : 0.000014s : 0.04% optimize.opt_a.merge_comm : 0.000010s : 0.02% optimize.opt_a.allreduce_fusion : 0.000012s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.06% optimize.opt_a.virtual_dataset : 0.000016s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000015s : 0.04% optimize.opt_a.merge_forward : 0.000011s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000040s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000031s : 0.08% optimize.opt_a.a_after_grad : 0.000026s : 0.06% optimize.opt_a.renormalize : 0.028224s : 71.00% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.04% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000041s : 0.10% optimize.opt_a.cse : 0.000065s : 0.16% optimize.opt_a.a_3 : 0.000125s : 0.31% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000049s : 0.12% optimize.convert_after_rewriter : 0.000009s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000753s : 1.89% optimize.opt_b.b_1 : 0.000161s : 0.41% optimize.opt_b.b_2 : 0.000011s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000031s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.05% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000034s : 0.08% optimize.loop_unroll : 0.000483s : 1.21% optimize.opt_after_cconv.c_1 : 0.000039s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000026s : 0.07% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.04% optimize.tuple_transform.d_1 : 0.000057s : 0.14% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000065s : 0.16% optimize.cse_after_recomputation.cse : 0.000016s : 0.04% optimize.environ_conv : 0.000007s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000026s : 0.07% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.06% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000507s : 1.28% validate : 0.000050s : 0.13% Time group info: ------[substitution.] 0.000242 38 13.41% : 0.000032s : 3: substitution.cast_eliminate 1.07% : 0.000003s : 3: substitution.elim_not_effective 0.82% : 0.000002s : 3: substitution.fold_const_symbol 3.14% : 0.000008s : 5: substitution.graph_param_transform 68.71% : 0.000166s : 4: substitution.inline 2.39% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.83% : 0.000007s : 6: substitution.remove_not_recompute_node 2.56% : 0.000006s : 4: substitution.replace_old_param 5.07% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006044 2 86.96% : 0.005256s : 1: type_inference.infer 13.04% : 0.000788s : 1: type_inference.specialize ------[replace.] 0.000067 8 60.85% : 0.000041s : 4: replace.inline 39.15% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000173 8 93.99% : 0.000162s : 4: match.inline 6.01% : 0.000010s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000261 1504 0.97% : 0.000003s : 15: predicate.accumulaten_eliminater 0.78% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.59% : 0.000002s : 10: predicate.addn_check_dump 0.92% : 0.000002s : 15: predicate.addn_zero_filter 0.85% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.05% : 0.000005s : 25: predicate.arithmetic_simplify 1.55% : 0.000004s : 15: predicate.cast_eliminate 0.61% : 0.000002s : 10: predicate.check_bprop_eliminate 0.58% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.60% : 0.000002s : 10: predicate.depend_value_elim 0.90% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.00% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.80% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.07% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 5: predicate.elim_not_effective 0.54% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.07% : 0.000003s : 20: predicate.environ_get_add_eliminate 0.99% : 0.000003s : 20: predicate.environ_get_depend_swap 1.58% : 0.000004s : 30: predicate.environ_get_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.25% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.39% : 0.000006s : 23: predicate.float_depend_g_call 0.62% : 0.000002s : 10: predicate.float_environ_get_switch 0.84% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.70% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.61% : 0.000002s : 10: predicate.incorporate_call 0.53% : 0.000001s : 10: predicate.incorporate_call_switch 6.15% : 0.000016s : 68: predicate.inline 1.16% : 0.000003s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.04% : 0.000003s : 10: predicate.less_batch_normalization 1.71% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.38% : 0.000006s : 44: predicate.load_eliminater 1.27% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.02% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.71% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.58% : 0.000002s : 10: predicate.merge_addn 0.57% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.67% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 15: predicate.minmaximum_grad 1.09% : 0.000003s : 5: predicate.mutable_eliminate 0.41% : 0.000001s : 5: predicate.opt_reshape 0.38% : 0.000001s : 5: predicate.parallel_virtual_node 1.84% : 0.000005s : 23: predicate.partial_defer_inline 1.52% : 0.000004s : 24: predicate.partial_eliminate 0.87% : 0.000002s : 15: predicate.print_const_string_wrapper 0.62% : 0.000002s : 10: predicate.reduce_all_const_elim 1.17% : 0.000003s : 15: predicate.reduce_eliminate 2.40% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 10: predicate.remove_not_recompute_node 1.41% : 0.000004s : 29: predicate.replace_applicator 0.66% : 0.000002s : 10: predicate.replace_old_param 0.36% : 0.000001s : 5: predicate.reset_defer_inline 0.95% : 0.000002s : 15: predicate.reshape_eliminate 0.60% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.56% : 0.000001s : 5: predicate.row_tensor_eliminate 0.82% : 0.000002s : 10: predicate.same_eliminate 0.50% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 10: predicate.shard_identity_eliminate 0.69% : 0.000002s : 10: predicate.special_op_eliminate 0.72% : 0.000002s : 10: predicate.specialize_transform 1.04% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.35% : 0.000004s : 23: predicate.switch_defer_inline 1.89% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.01% : 0.000013s : 74: predicate.switch_simplify 0.91% : 0.000002s : 15: predicate.tile_eliminate 0.95% : 0.000002s : 15: predicate.transpose_eliminate 1.69% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.44% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.53% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.19% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.61% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.22% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.17% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 5: predicate.value_based_eliminate 0.80% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000668 11 47.67% : 0.000318s : 5: func_graph_cloner_run.FuncGraphClonerGraph 52.33% : 0.000350s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.114042 192 0.00% : 0.000004s : 1: ForceFp32Comm 4.36% : 0.004978s : 1: add_attr 4.35% : 0.004962s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.06% : 0.000070s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.07% : 0.000077s : 1: auto_monad 0.02% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.42% : 0.000483s : 1: bootstrap 0.03% : 0.000038s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000020s : 1: control_data_broadcast_order 0.01% : 0.000012s : 1: convert_after_rewriter 0.03% : 0.000031s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.03% : 0.000032s : 1: event_method 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.43% : 0.000492s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.67% : 0.000764s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.02% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000023s : 1: opt.transform.mutable_eliminate 1.30% : 0.001482s : 78: opt.transform.opt_a 0.03% : 0.000037s : 1: opt.transform.opt_after_cconv 0.03% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.12% : 0.000139s : 28: opt.transform.opt_b 0.06% : 0.000063s : 2: opt.transform.opt_trans_graph 0.04% : 0.000049s : 4: opt.transform.symbol_engine_opt 26.99% : 0.030776s : 1: opt_a 0.12% : 0.000132s : 1: opt_after_cconv 0.45% : 0.000518s : 1: opt_after_jit_grad 0.24% : 0.000273s : 1: opt_b 29.23% : 0.033337s : 1: optimize 0.02% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000010s : 1: order_py_execute_after_rewriter 0.03% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.04% : 0.000047s : 1: pre_auto_parallel 0.03% : 0.000036s : 1: py_interpret_to_execute 0.02% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000021s : 1: remove_dup_value 24.27% : 0.027673s : 1: renormalize.infer 0.47% : 0.000534s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000053s : 1: rewriter_after_opt_a 0.09% : 0.000101s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000100s : 1: symbol_engine_optimizer 0.08% : 0.000095s : 1: tuple_transform 5.42% : 0.006181s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:08.871.000 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:08.871.265 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0473386, [21] [bootstrap]: 0.00056418 [type_inference]: 0.0340646 [event_method]: 2.406e-05 [auto_monad]: 7.468e-05 [graph_reusing]: 6.07001e-06 [inline]: 3.20002e-06 [add_attr]: 0.00366657, [1] [add_attr_with_inline]: 0.00365495, [1] [Cycle 1]: 8.664e-05, [2] [tag_attr]: 2.262e-05 [meta_addattr_fg_expand]: 6.23e-06 [parallel-infer-symbol]: 3.28e-06 [pre_auto_parallel]: 4.018e-05 [insert-virtual-dataset]: 2.22999e-06 [parallel-infer-symbol-second]: 1.03001e-06 [dataset_repeat_opt]: 3.43e-06 [pipeline_split]: 1.92001e-06 [optimize]: 0.0063432, [53] [py_interpret_to_execute]: 3.968e-05 [rewriter_before_opt_a]: 0.00010007 [opt_a]: 0.00370702, [2] [Cycle 1]: 0.00268149, [45] [expand_dump_flag]: 4.42998e-06 [switch_simplify]: 4.508e-05 [loop_unroll]: 3.187e-05 [a_1]: 0.00071664 [with_stream_mark]: 2.3e-05 [recompute_prepare]: 1.287e-05 [updatestate_depend_eliminate]: 4.85999e-06 [updatestate_assign_eliminate]: 4.1e-06 [updatestate_loads_eliminate]: 3.7e-06 [parameter_eliminate]: 2.22001e-06 [a_2]: 0.00013412 [accelerated_algorithm]: 9.24e-06 [shard]: 2.31e-06 [meta_shard_fg_expand]: 2.47001e-06 [shard_inline]: 8.25999e-06 [merge_send_recv]: 8.92e-06 [auto_parallel]: 8.75999e-06 [parallel]: 1.974e-05 [flash_sp]: 1.025e-05 [merge_comm]: 4.87e-06 [allreduce_fusion]: 4.58001e-06 [matmul_add_comm_reduction]: 1.071e-05 [allreduce_slice_to_reducescatter]: 6.79982e-07 [virtual_shard_identity]: 1.044e-05 [virtual_dataset]: 8.16002e-06 [get_grad_eliminate_]: 7.8e-06 [virtual_output]: 7.92998e-06 [merge_forward]: 5.46998e-06 [cell_reuse_recompute_pass]: 1.84998e-06 [offload_activation]: 1.206e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.789e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.413e-05 [set_forward_comm_id_for_comm_node_pass]: 5.54e-06 [meta_fg_expand]: 3.83001e-06 [flash_sp_send_recv_attached]: 3.17002e-06 [receive_attached]: 2.34999e-06 [after_resolve]: 1.427e-05 [a_after_grad]: 1.216e-05 [renormalize]: 0.00089961 [add_forward_monad_depend]: 7.55998e-06 [auto_monad_grad]: 2.74001e-06 [auto_monad_eliminator]: 1.996e-05 [cse]: 4.51e-05 [a_3]: 7.511e-05 [Cycle 2]: 0.00100933, [45] [expand_dump_flag]: 1.84e-06 [switch_simplify]: 1.019e-05 [loop_unroll]: 7.92003e-06 [a_1]: 0.00018279 [with_stream_mark]: 1.645e-05 [recompute_prepare]: 1.06e-05 [updatestate_depend_eliminate]: 4.25999e-06 [updatestate_assign_eliminate]: 3.24001e-06 [updatestate_loads_eliminate]: 3.14999e-06 [parameter_eliminate]: 1.72001e-06 [a_2]: 0.00012585 [accelerated_algorithm]: 9.22001e-06 [shard]: 2.09e-06 [meta_shard_fg_expand]: 2.19999e-06 [shard_inline]: 7.85998e-06 [merge_send_recv]: 7.51999e-06 [auto_parallel]: 7.73001e-06 [parallel]: 5.72001e-06 [flash_sp]: 3.5e-06 [merge_comm]: 4.63001e-06 [allreduce_fusion]: 4.11001e-06 [matmul_add_comm_reduction]: 9.27001e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 9.58002e-06 [virtual_dataset]: 7.58001e-06 [get_grad_eliminate_]: 7.02002e-06 [virtual_output]: 7.31001e-06 [merge_forward]: 4.57998e-06 [cell_reuse_recompute_pass]: 2.06e-06 [offload_activation]: 9.40001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.896e-05 [merge_recompute_call_nodes]: 7.59988e-07 [before_grad]: 1.315e-05 [set_forward_comm_id_for_comm_node_pass]: 5.28002e-06 [meta_fg_expand]: 3.07002e-06 [flash_sp_send_recv_attached]: 1.20999e-06 [receive_attached]: 1.74e-06 [after_resolve]: 1.406e-05 [a_after_grad]: 1.238e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 2.07999e-06 [auto_monad_grad]: 2.40002e-06 [auto_monad_eliminator]: 1.099e-05 [cse]: 2.176e-05 [a_3]: 6.072e-05 [py_interpret_to_execute_after_opt_a]: 1.696e-05 [slice_cell_reuse_recomputed_activation]: 4.94e-06 [rewriter_after_opt_a]: 4.731e-05 [convert_after_rewriter]: 1.218e-05 [order_py_execute_after_rewriter]: 8.48999e-06 [mutable_eliminate]: 0.00063559 [opt_b]: 0.00032119, [1] [Cycle 1]: 0.00031093, [7] [b_1]: 0.00020206 [b_2]: 9.67001e-06 [updatestate_depend_eliminate]: 8.73001e-06 [updatestate_assign_eliminate]: 3.51001e-06 [updatestate_loads_eliminate]: 3.07002e-06 [renormalize]: 6.09987e-07 [cse]: 2.561e-05 [optimize_parallel_all_gather_comm]: 2.199e-05 [overlap_param_gather]: 5.62001e-06 [cconv]: 3.517e-05 [loop_unroll]: 0.00047249 [opt_after_cconv]: 0.00015131, [1] [Cycle 1]: 0.00014132, [7] [c_1]: 3.853e-05 [parameter_eliminate]: 3.36001e-06 [updatestate_depend_eliminate]: 7.87998e-06 [updatestate_assign_eliminate]: 3.36001e-06 [updatestate_loads_eliminate]: 3.17002e-06 [cse]: 2.579e-05 [renormalize]: 4.89992e-07 [remove_dup_value]: 1.888e-05 [tuple_transform]: 0.00010234, [1] [Cycle 1]: 9.492e-05, [4] [d_1]: 5.332e-05 [none_parameter_eliminate]: 2.11e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.69998e-06 [partial_unused_args_eliminate]: 4.67e-06 [add_recomputation]: 6.372e-05 [cse_after_recomputation]: 3.402e-05, [1] [Cycle 1]: 2.682e-05, [1] [cse]: 1.676e-05 [environ_conv]: 1.032e-05 [swap_dp_allreduce_reducescatter]: 9.46e-06 [bias_add_comm_swap]: 5.54e-06 [label_micro_interleaved_index]: 8.18999e-06 [label_fine_grained_interleaved_index]: 5.32001e-06 [merge_cast_opt]: 3.76001e-06 [slice_recompute_activation]: 4.55001e-06 [micro_interleaved_order_control]: 5.69999e-06 [assign_add_opt]: 3.77002e-06 [ForceFp32Comm]: 3.48e-06 [remove_cast_before_assign_add]: 3.33998e-06 [full_micro_interleaved_order_control]: 4.79002e-06 [reorder_send_recv_between_fp_bp]: 5.57001e-06 [comm_op_add_attrs]: 3.4e-06 [add_comm_op_reuse_tag]: 3.2e-06 [interleave_split_concat_branches]: 3.79002e-06 [interleave_parallel_branches]: 3.90998e-06 [overlap_opt_shard_in_pipeline]: 3.55998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.67e-06 [control_data_broadcast_order]: 2.109e-05 [grouped_pairwise_exchange_alltoall]: 3.95e-06 [offloading_packed_experts]: 7.39002e-06 [overlap_recompute_and_grad_model_parallel]: 7.63001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.76001e-06 [overlap_recompute_allgather_and_fa_grad]: 4.17e-06 [overlap_recompute_comm]: 5.20999e-06 [overlap_grad_ring_attention]: 7.8e-06 [overlap_grad_flash_sp]: 2.676e-05 [begin_end_overlap_inline]: 3.09001e-06 [split_matmul_comm_elemetwise]: 4.51002e-06 [split_layernorm_comm]: 4.1e-06 [handle_group_info]: 3.58999e-06 [symbol_engine_optimizer]: 0.00011625, [1] [Cycle 1]: 0.00010782, [6] [build]: 3.64002e-06 [elim_shapecalc]: 1.402e-05 [elim_not_effective]: 1.762e-05 [opt_reshape]: 8.75999e-06 [fold_const_symbol]: 1.259e-05 [renormalize]: 2.3999e-07 [detach_backward]: 5.96e-06 [pipeline_parallel_scheduler]: 2.17999e-06 [auto_monad_reorder]: 2.92e-05 [get_jit_bprop_graph]: 2.15002e-06 [rewriter_after_jit_bprop_graph]: 6.31e-06 [opt_after_jit_grad]: 0.00060732 [validate]: 5.178e-05 Sums bootstrap : 0.000564s : 1.39% type_inference : 0.034065s : 83.91% event_method : 0.000024s : 0.06% auto_monad : 0.000075s : 0.18% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000040s : 0.10% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000040s : 0.10% optimize.rewriter_before_opt_a : 0.000100s : 0.25% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000055s : 0.14% optimize.opt_a.loop_unroll : 0.000040s : 0.10% optimize.opt_a.a_1 : 0.000899s : 2.22% optimize.opt_a.with_stream_mark : 0.000039s : 0.10% optimize.opt_a.recompute_prepare : 0.000023s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000260s : 0.64% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.05% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.04% optimize.opt_a.merge_send_recv : 0.000016s : 0.04% optimize.opt_a.auto_parallel : 0.000016s : 0.04% optimize.opt_a.parallel : 0.000025s : 0.06% optimize.opt_a.flash_sp : 0.000014s : 0.03% optimize.opt_a.merge_comm : 0.000010s : 0.02% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.05% optimize.opt_a.virtual_dataset : 0.000016s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000015s : 0.04% optimize.opt_a.merge_forward : 0.000010s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000047s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000028s : 0.07% optimize.opt_a.a_after_grad : 0.000025s : 0.06% optimize.opt_a.renormalize : 0.000900s : 2.22% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.02% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.08% optimize.opt_a.cse : 0.000067s : 0.16% optimize.opt_a.a_3 : 0.000136s : 0.33% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000047s : 0.12% optimize.convert_after_rewriter : 0.000012s : 0.03% optimize.order_py_execute_after_rewriter : 0.000008s : 0.02% optimize.mutable_eliminate : 0.000636s : 1.57% optimize.opt_b.b_1 : 0.000202s : 0.50% optimize.opt_b.b_2 : 0.000010s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000026s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.05% optimize.overlap_param_gather : 0.000006s : 0.01% optimize.cconv : 0.000035s : 0.09% optimize.loop_unroll : 0.000472s : 1.16% optimize.opt_after_cconv.c_1 : 0.000039s : 0.09% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000026s : 0.06% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.05% optimize.tuple_transform.d_1 : 0.000053s : 0.13% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.02% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000064s : 0.16% optimize.cse_after_recomputation.cse : 0.000017s : 0.04% optimize.environ_conv : 0.000010s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000006s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000021s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000008s : 0.02% optimize.overlap_grad_flash_sp : 0.000027s : 0.07% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000029s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000607s : 1.50% validate : 0.000052s : 0.13% Time group info: ------[substitution.] 0.000243 38 11.50% : 0.000028s : 3: substitution.cast_eliminate 0.97% : 0.000002s : 3: substitution.elim_not_effective 0.68% : 0.000002s : 3: substitution.fold_const_symbol 2.88% : 0.000007s : 5: substitution.graph_param_transform 68.43% : 0.000166s : 4: substitution.inline 2.25% : 0.000005s : 6: substitution.j_node_and_user_rematch 5.39% : 0.000013s : 6: substitution.remove_not_recompute_node 2.50% : 0.000006s : 4: substitution.replace_old_param 5.40% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.033985 2 96.96% : 0.032951s : 1: type_inference.infer 3.04% : 0.001034s : 1: type_inference.specialize ------[replace.] 0.000066 8 62.41% : 0.000041s : 4: replace.inline 37.59% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000174 8 93.49% : 0.000163s : 4: match.inline 6.51% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000249 1504 0.95% : 0.000002s : 15: predicate.accumulaten_eliminater 0.82% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 10: predicate.addn_check_dump 0.85% : 0.000002s : 15: predicate.addn_zero_filter 0.78% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.28% : 0.000006s : 25: predicate.arithmetic_simplify 1.04% : 0.000003s : 15: predicate.cast_eliminate 0.61% : 0.000002s : 10: predicate.check_bprop_eliminate 0.67% : 0.000002s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.63% : 0.000002s : 10: predicate.depend_value_elim 0.86% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.96% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.81% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.20% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.37% : 0.000001s : 5: predicate.elim_not_effective 0.45% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.04% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.06% : 0.000003s : 20: predicate.environ_get_depend_swap 1.64% : 0.000004s : 30: predicate.environ_get_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.31% : 0.000006s : 23: predicate.float_depend_g_call 0.55% : 0.000001s : 10: predicate.float_environ_get_switch 0.96% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.60% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000000s : 5: predicate.graph_param_transform 0.70% : 0.000002s : 10: predicate.incorporate_call 0.55% : 0.000001s : 10: predicate.incorporate_call_switch 6.64% : 0.000017s : 68: predicate.inline 1.18% : 0.000003s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.93% : 0.000002s : 10: predicate.less_batch_normalization 1.76% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.37% : 0.000006s : 44: predicate.load_eliminater 1.10% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.17% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.76% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.67% : 0.000002s : 10: predicate.merge_addn 0.59% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 15: predicate.minmaximum_grad 1.20% : 0.000003s : 5: predicate.mutable_eliminate 0.38% : 0.000001s : 5: predicate.opt_reshape 0.39% : 0.000001s : 5: predicate.parallel_virtual_node 1.64% : 0.000004s : 23: predicate.partial_defer_inline 1.57% : 0.000004s : 24: predicate.partial_eliminate 0.87% : 0.000002s : 15: predicate.print_const_string_wrapper 0.61% : 0.000002s : 10: predicate.reduce_all_const_elim 1.14% : 0.000003s : 15: predicate.reduce_eliminate 2.41% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 10: predicate.remove_not_recompute_node 1.29% : 0.000003s : 29: predicate.replace_applicator 0.58% : 0.000001s : 10: predicate.replace_old_param 0.35% : 0.000001s : 5: predicate.reset_defer_inline 0.92% : 0.000002s : 15: predicate.reshape_eliminate 0.77% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 5: predicate.row_tensor_eliminate 0.84% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 10: predicate.shard_identity_eliminate 0.71% : 0.000002s : 10: predicate.special_op_eliminate 0.74% : 0.000002s : 10: predicate.specialize_transform 0.97% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.83% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.37% : 0.000003s : 23: predicate.switch_defer_inline 1.95% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.89% : 0.000012s : 74: predicate.switch_simplify 0.87% : 0.000002s : 15: predicate.tile_eliminate 0.86% : 0.000002s : 15: predicate.transpose_eliminate 1.53% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.75% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.39% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.44% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.75% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.31% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.17% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.64% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 10: predicate.virtual_output_eliminate 0.30% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.51% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000687 11 46.28% : 0.000318s : 5: func_graph_cloner_run.FuncGraphClonerGraph 53.72% : 0.000369s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.059905 192 0.01% : 0.000006s : 1: ForceFp32Comm 6.14% : 0.003677s : 1: add_attr 6.11% : 0.003659s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.11% : 0.000068s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.14% : 0.000086s : 1: auto_monad 0.06% : 0.000038s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 1.02% : 0.000612s : 1: bootstrap 0.06% : 0.000039s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.04% : 0.000024s : 1: control_data_broadcast_order 0.03% : 0.000016s : 1: convert_after_rewriter 0.06% : 0.000037s : 1: cse_after_recomputation 0.03% : 0.000016s : 1: dataset_repeat_opt 0.05% : 0.000028s : 1: detach_backward 0.02% : 0.000014s : 1: environ_conv 0.06% : 0.000037s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.02% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000007s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.02% : 0.000012s : 1: label_micro_interleaved_index 0.80% : 0.000479s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000009s : 1: micro_interleaved_order_control 1.07% : 0.000642s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.03% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000021s : 1: opt.transform.mutable_eliminate 2.43% : 0.001457s : 78: opt.transform.opt_a 0.06% : 0.000037s : 1: opt.transform.opt_after_cconv 0.06% : 0.000037s : 1: opt.transform.opt_after_jit_grad 0.23% : 0.000138s : 28: opt.transform.opt_b 0.10% : 0.000060s : 2: opt.transform.opt_trans_graph 0.08% : 0.000049s : 4: opt.transform.symbol_engine_opt 6.19% : 0.003711s : 1: opt_a 0.26% : 0.000155s : 1: opt_after_cconv 1.04% : 0.000620s : 1: opt_after_jit_grad 0.54% : 0.000325s : 1: opt_b 12.89% : 0.007721s : 1: optimize 0.04% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000011s : 1: order_py_execute_after_rewriter 0.05% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.26% : 0.000158s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000011s : 1: pipeline_parallel_scheduler 0.01% : 0.000008s : 1: pipeline_split 0.08% : 0.000048s : 1: pre_auto_parallel 0.07% : 0.000044s : 1: py_interpret_to_execute 0.03% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.04% : 0.000022s : 1: remove_dup_value 0.84% : 0.000505s : 1: renormalize.infer 0.64% : 0.000385s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000051s : 1: rewriter_after_opt_a 0.17% : 0.000105s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.20% : 0.000119s : 1: symbol_engine_optimizer 0.18% : 0.000105s : 1: tuple_transform 56.96% : 0.034124s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:09.406.293 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0296761, [21] [bootstrap]: 0.00047007 [type_inference]: 0.00630233 [event_method]: 2.181e-05 [auto_monad]: 6.672e-05 [graph_reusing]: 5.26002e-06 [inline]: 3.14999e-06 [add_attr]: 0.0158011, [1] [add_attr_with_inline]: 0.0157875, [1] [Cycle 1]: 7.684e-05, [2] [tag_attr]: 2.498e-05 [meta_addattr_fg_expand]: 5.93002e-06 [parallel-infer-symbol]: 4.54002e-06 [pre_auto_parallel]: 4.244e-05 [insert-virtual-dataset]: 2.31e-06 [parallel-infer-symbol-second]: 1.24e-06 [dataset_repeat_opt]: 2.11e-06 [pipeline_split]: 2.07001e-06 [optimize]: 0.00615966, [53] [py_interpret_to_execute]: 3.059e-05 [rewriter_before_opt_a]: 9.795e-05 [opt_a]: 0.00351745, [2] [Cycle 1]: 0.00261035, [45] [expand_dump_flag]: 4.52e-06 [switch_simplify]: 4.762e-05 [loop_unroll]: 3.153e-05 [a_1]: 0.0007526 [with_stream_mark]: 2.278e-05 [recompute_prepare]: 1.423e-05 [updatestate_depend_eliminate]: 4.48001e-06 [updatestate_assign_eliminate]: 3.75e-06 [updatestate_loads_eliminate]: 3.57002e-06 [parameter_eliminate]: 1.86998e-06 [a_2]: 0.00010533 [accelerated_algorithm]: 9.67999e-06 [shard]: 2.14e-06 [meta_shard_fg_expand]: 2.71999e-06 [shard_inline]: 8.02003e-06 [merge_send_recv]: 9.84999e-06 [auto_parallel]: 9.14e-06 [parallel]: 2.025e-05 [flash_sp]: 1.148e-05 [merge_comm]: 5.54998e-06 [allreduce_fusion]: 4.49998e-06 [matmul_add_comm_reduction]: 1.128e-05 [allreduce_slice_to_reducescatter]: 8.39995e-07 [virtual_shard_identity]: 1.269e-05 [virtual_dataset]: 8.28001e-06 [get_grad_eliminate_]: 7.88001e-06 [virtual_output]: 9.24e-06 [merge_forward]: 6.15002e-06 [cell_reuse_recompute_pass]: 2.22001e-06 [offload_activation]: 1.246e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.018e-05 [merge_recompute_call_nodes]: 1.93002e-06 [before_grad]: 1.315e-05 [set_forward_comm_id_for_comm_node_pass]: 5.79e-06 [meta_fg_expand]: 3.47002e-06 [flash_sp_send_recv_attached]: 3.02002e-06 [receive_attached]: 2.17999e-06 [after_resolve]: 1.609e-05 [a_after_grad]: 1.475e-05 [renormalize]: 0.00095039 [add_forward_monad_depend]: 8.50001e-06 [auto_monad_grad]: 2.46e-06 [auto_monad_eliminator]: 2.266e-05 [cse]: 3.946e-05 [a_3]: 6.943e-05 [Cycle 2]: 0.00089384, [45] [expand_dump_flag]: 2.59999e-06 [switch_simplify]: 1.078e-05 [loop_unroll]: 9.39e-06 [a_1]: 0.00018888 [with_stream_mark]: 1.725e-05 [recompute_prepare]: 9.29998e-06 [updatestate_depend_eliminate]: 5.10001e-06 [updatestate_assign_eliminate]: 3.71999e-06 [updatestate_loads_eliminate]: 3.48999e-06 [parameter_eliminate]: 1.86e-06 [a_2]: 9.885e-05 [accelerated_algorithm]: 9.76e-06 [shard]: 2.64001e-06 [meta_shard_fg_expand]: 2.66999e-06 [shard_inline]: 7.8e-06 [merge_send_recv]: 1.39e-05 [auto_parallel]: 1.023e-05 [parallel]: 8.59e-06 [flash_sp]: 4.74e-06 [merge_comm]: 5.22999e-06 [allreduce_fusion]: 4.69998e-06 [matmul_add_comm_reduction]: 9.22999e-06 [allreduce_slice_to_reducescatter]: 5.19998e-07 [virtual_shard_identity]: 1.29e-05 [virtual_dataset]: 7.87e-06 [get_grad_eliminate_]: 7.35e-06 [virtual_output]: 7.4e-06 [merge_forward]: 4.68999e-06 [cell_reuse_recompute_pass]: 2.22999e-06 [offload_activation]: 1.03e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.094e-05 [merge_recompute_call_nodes]: 1.67001e-06 [before_grad]: 1.355e-05 [set_forward_comm_id_for_comm_node_pass]: 5.71998e-06 [meta_fg_expand]: 3.46999e-06 [flash_sp_send_recv_attached]: 1.64e-06 [receive_attached]: 1.99999e-06 [after_resolve]: 1.527e-05 [a_after_grad]: 1.191e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.10002e-06 [auto_monad_grad]: 1.60999e-06 [auto_monad_eliminator]: 1.433e-05 [cse]: 2.498e-05 [a_3]: 4.885e-05 [py_interpret_to_execute_after_opt_a]: 1.733e-05 [slice_cell_reuse_recomputed_activation]: 2.78003e-06 [rewriter_after_opt_a]: 4.892e-05 [convert_after_rewriter]: 9.92001e-06 [order_py_execute_after_rewriter]: 6.42001e-06 [mutable_eliminate]: 0.00068724 [opt_b]: 0.0002651, [1] [Cycle 1]: 0.0002574, [7] [b_1]: 0.00015952 [b_2]: 1.058e-05 [updatestate_depend_eliminate]: 1.061e-05 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 3.81999e-06 [renormalize]: 3.60014e-07 [cse]: 3.042e-05 [optimize_parallel_all_gather_comm]: 2.147e-05 [overlap_param_gather]: 2.51998e-06 [cconv]: 3.614e-05 [loop_unroll]: 0.00061704 [opt_after_cconv]: 0.0001301, [1] [Cycle 1]: 0.00012363, [7] [c_1]: 3.821e-05 [parameter_eliminate]: 4.70999e-06 [updatestate_depend_eliminate]: 8.69e-06 [updatestate_assign_eliminate]: 3.73999e-06 [updatestate_loads_eliminate]: 3.23998e-06 [cse]: 2.775e-05 [renormalize]: 6.80011e-07 [remove_dup_value]: 1.704e-05 [tuple_transform]: 9.417e-05, [1] [Cycle 1]: 8.884e-05, [4] [d_1]: 5.706e-05 [none_parameter_eliminate]: 2.06998e-06 [renormalize]: 3.09985e-07 [switch_simplify]: 9.27001e-06 [partial_unused_args_eliminate]: 2.06e-06 [add_recomputation]: 6.391e-05 [cse_after_recomputation]: 2.863e-05, [1] [Cycle 1]: 2.254e-05, [1] [cse]: 1.625e-05 [environ_conv]: 6.66999e-06 [swap_dp_allreduce_reducescatter]: 6.84999e-06 [bias_add_comm_swap]: 2.61999e-06 [label_micro_interleaved_index]: 5.14e-06 [label_fine_grained_interleaved_index]: 3.28e-06 [merge_cast_opt]: 1.29e-06 [slice_recompute_activation]: 2.49999e-06 [micro_interleaved_order_control]: 2.65997e-06 [assign_add_opt]: 1.91e-06 [ForceFp32Comm]: 1.09998e-06 [remove_cast_before_assign_add]: 9.89996e-07 [full_micro_interleaved_order_control]: 2.32001e-06 [reorder_send_recv_between_fp_bp]: 2.64999e-06 [comm_op_add_attrs]: 9.70002e-07 [add_comm_op_reuse_tag]: 1.51998e-06 [interleave_split_concat_branches]: 1.18001e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.26002e-06 [overlap_opt_shard_grad_in_pipeline]: 2.27999e-06 [control_data_broadcast_order]: 1.725e-05 [grouped_pairwise_exchange_alltoall]: 1.79e-06 [offloading_packed_experts]: 4.70999e-06 [overlap_recompute_and_grad_model_parallel]: 5.34e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.64e-06 [overlap_recompute_comm]: 2.51998e-06 [overlap_grad_ring_attention]: 4.77998e-06 [overlap_grad_flash_sp]: 2.486e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.12001e-06 [split_layernorm_comm]: 1.98002e-06 [handle_group_info]: 8.90024e-07 [symbol_engine_optimizer]: 9.785e-05, [1] [Cycle 1]: 9.311e-05, [6] [build]: 4.14002e-06 [elim_shapecalc]: 1.611e-05 [elim_not_effective]: 1.724e-05 [opt_reshape]: 8.70001e-06 [fold_const_symbol]: 1.21e-05 [renormalize]: 2.59985e-07 [detach_backward]: 2.04999e-06 [pipeline_parallel_scheduler]: 1.92999e-06 [auto_monad_reorder]: 2.288e-05 [get_jit_bprop_graph]: 2.34001e-06 [rewriter_after_jit_bprop_graph]: 6.04999e-06 [opt_after_jit_grad]: 0.00053779 [validate]: 4.855e-05 Sums bootstrap : 0.000470s : 3.68% type_inference : 0.006302s : 49.33% event_method : 0.000022s : 0.17% auto_monad : 0.000067s : 0.52% graph_reusing : 0.000005s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000005s : 0.04% pre_auto_parallel : 0.000042s : 0.33% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000031s : 0.24% optimize.rewriter_before_opt_a : 0.000098s : 0.77% optimize.opt_a.expand_dump_flag : 0.000007s : 0.06% optimize.opt_a.switch_simplify : 0.000058s : 0.46% optimize.opt_a.loop_unroll : 0.000041s : 0.32% optimize.opt_a.a_1 : 0.000941s : 7.37% optimize.opt_a.with_stream_mark : 0.000040s : 0.31% optimize.opt_a.recompute_prepare : 0.000024s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000204s : 1.60% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.15% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.12% optimize.opt_a.merge_send_recv : 0.000024s : 0.19% optimize.opt_a.auto_parallel : 0.000019s : 0.15% optimize.opt_a.parallel : 0.000029s : 0.23% optimize.opt_a.flash_sp : 0.000016s : 0.13% optimize.opt_a.merge_comm : 0.000011s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000026s : 0.20% optimize.opt_a.virtual_dataset : 0.000016s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000017s : 0.13% optimize.opt_a.merge_forward : 0.000011s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000023s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000041s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.03% optimize.opt_a.before_grad : 0.000027s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000031s : 0.25% optimize.opt_a.a_after_grad : 0.000027s : 0.21% optimize.opt_a.renormalize : 0.000950s : 7.44% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000037s : 0.29% optimize.opt_a.cse : 0.000064s : 0.50% optimize.opt_a.a_3 : 0.000118s : 0.93% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.02% optimize.rewriter_after_opt_a : 0.000049s : 0.38% optimize.convert_after_rewriter : 0.000010s : 0.08% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000687s : 5.38% optimize.opt_b.b_1 : 0.000160s : 1.25% optimize.opt_b.b_2 : 0.000011s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000030s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.17% optimize.overlap_param_gather : 0.000003s : 0.02% optimize.cconv : 0.000036s : 0.28% optimize.loop_unroll : 0.000617s : 4.83% optimize.opt_after_cconv.c_1 : 0.000038s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000028s : 0.22% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000017s : 0.13% optimize.tuple_transform.d_1 : 0.000057s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000064s : 0.50% optimize.cse_after_recomputation.cse : 0.000016s : 0.13% optimize.environ_conv : 0.000007s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000002s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000017s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000025s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.13% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000538s : 4.21% validate : 0.000049s : 0.38% Time group info: ------[substitution.] 0.000262 38 16.85% : 0.000044s : 3: substitution.cast_eliminate 1.02% : 0.000003s : 3: substitution.elim_not_effective 0.64% : 0.000002s : 3: substitution.fold_const_symbol 2.84% : 0.000007s : 5: substitution.graph_param_transform 65.99% : 0.000173s : 4: substitution.inline 1.78% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.85% : 0.000007s : 6: substitution.remove_not_recompute_node 2.50% : 0.000007s : 4: substitution.replace_old_param 5.53% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006233 2 87.63% : 0.005462s : 1: type_inference.infer 12.37% : 0.000771s : 1: type_inference.specialize ------[replace.] 0.000072 8 62.25% : 0.000045s : 4: replace.inline 37.75% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000182 8 93.24% : 0.000169s : 4: match.inline 6.76% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000259 1504 0.86% : 0.000002s : 15: predicate.accumulaten_eliminater 0.81% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 10: predicate.addn_check_dump 0.87% : 0.000002s : 15: predicate.addn_zero_filter 0.82% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.00% : 0.000005s : 25: predicate.arithmetic_simplify 1.08% : 0.000003s : 15: predicate.cast_eliminate 0.69% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.68% : 0.000002s : 10: predicate.depend_value_elim 0.95% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.05% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.01% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.57% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.05% : 0.000003s : 20: predicate.environ_get_depend_swap 1.64% : 0.000004s : 30: predicate.environ_get_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.47% : 0.000004s : 23: predicate.exchange_switch_depend_value 2.22% : 0.000006s : 23: predicate.float_depend_g_call 0.51% : 0.000001s : 10: predicate.float_environ_get_switch 0.80% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 5: predicate.fold_const_symbol 0.66% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.60% : 0.000002s : 10: predicate.incorporate_call 0.51% : 0.000001s : 10: predicate.incorporate_call_switch 6.38% : 0.000017s : 68: predicate.inline 1.12% : 0.000003s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.20% : 0.000003s : 10: predicate.less_batch_normalization 1.73% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.39% : 0.000006s : 44: predicate.load_eliminater 1.24% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.05% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.50% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.64% : 0.000002s : 10: predicate.merge_addn 0.60% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.64% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 15: predicate.minmaximum_grad 1.25% : 0.000003s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.39% : 0.000001s : 5: predicate.parallel_virtual_node 1.80% : 0.000005s : 23: predicate.partial_defer_inline 1.54% : 0.000004s : 24: predicate.partial_eliminate 0.87% : 0.000002s : 15: predicate.print_const_string_wrapper 0.61% : 0.000002s : 10: predicate.reduce_all_const_elim 1.16% : 0.000003s : 15: predicate.reduce_eliminate 2.38% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 10: predicate.remove_not_recompute_node 1.26% : 0.000003s : 29: predicate.replace_applicator 0.63% : 0.000002s : 10: predicate.replace_old_param 0.31% : 0.000001s : 5: predicate.reset_defer_inline 1.05% : 0.000003s : 15: predicate.reshape_eliminate 0.61% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 5: predicate.row_tensor_eliminate 0.89% : 0.000002s : 10: predicate.same_eliminate 0.52% : 0.000001s : 10: predicate.set_cell_output_no_recompute 1.15% : 0.000003s : 10: predicate.shard_identity_eliminate 0.65% : 0.000002s : 10: predicate.special_op_eliminate 0.74% : 0.000002s : 10: predicate.specialize_transform 1.23% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 1.06% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.40% : 0.000004s : 23: predicate.switch_defer_inline 1.97% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.24% : 0.000014s : 74: predicate.switch_simplify 0.87% : 0.000002s : 15: predicate.tile_eliminate 0.94% : 0.000002s : 15: predicate.transpose_eliminate 1.40% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.49% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.37% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.37% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.40% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.08% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.62% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.34% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.12% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 5: predicate.value_based_eliminate 0.63% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.69% : 0.000002s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000614 11 54.32% : 0.000334s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.68% : 0.000281s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.054314 192 0.01% : 0.000004s : 1: ForceFp32Comm 29.10% : 0.015807s : 1: add_attr 29.08% : 0.015792s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.13% : 0.000069s : 1: add_recomputation 0.01% : 0.000005s : 1: assign_add_opt 0.13% : 0.000073s : 1: auto_monad 0.05% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.92% : 0.000502s : 1: bootstrap 0.07% : 0.000040s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000021s : 1: control_data_broadcast_order 0.03% : 0.000014s : 1: convert_after_rewriter 0.06% : 0.000032s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000010s : 1: environ_conv 0.05% : 0.000029s : 1: event_method 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.02% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 1.16% : 0.000628s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.29% : 0.000699s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.04% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000022s : 1: opt.transform.mutable_eliminate 2.79% : 0.001517s : 78: opt.transform.opt_a 0.07% : 0.000037s : 1: opt.transform.opt_after_cconv 0.06% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.25% : 0.000137s : 28: opt.transform.opt_b 0.12% : 0.000064s : 2: opt.transform.opt_trans_graph 0.09% : 0.000050s : 4: opt.transform.symbol_engine_opt 6.48% : 0.003521s : 1: opt_a 0.25% : 0.000134s : 1: opt_after_cconv 1.01% : 0.000549s : 1: opt_after_jit_grad 0.50% : 0.000269s : 1: opt_b 11.35% : 0.006165s : 1: optimize 0.05% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000011s : 1: order_py_execute_after_rewriter 0.05% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000009s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.09% : 0.000047s : 1: pre_auto_parallel 0.06% : 0.000035s : 1: py_interpret_to_execute 0.04% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000021s : 1: remove_dup_value 1.00% : 0.000544s : 1: renormalize.infer 0.73% : 0.000395s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000055s : 1: rewriter_after_opt_a 0.19% : 0.000102s : 1: rewriter_before_opt_a 0.01% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.19% : 0.000101s : 1: symbol_engine_optimizer 0.18% : 0.000097s : 1: tuple_transform 11.65% : 0.006326s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:09.891.427 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:09.891.717 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0186676, [21] [bootstrap]: 0.00044704 [type_inference]: 0.00653222 [event_method]: 2.113e-05 [auto_monad]: 7.036e-05 [graph_reusing]: 6.86001e-06 [inline]: 3.01001e-06 [add_attr]: 0.00348342, [1] [add_attr_with_inline]: 0.00347065, [1] [Cycle 1]: 9.274e-05, [2] [tag_attr]: 2.228e-05 [meta_addattr_fg_expand]: 6.21998e-06 [parallel-infer-symbol]: 4.12e-06 [pre_auto_parallel]: 4.001e-05 [insert-virtual-dataset]: 2.39001e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 2.35002e-06 [pipeline_split]: 1.61002e-06 [optimize]: 0.00675083, [53] [py_interpret_to_execute]: 3.292e-05 [rewriter_before_opt_a]: 9.671e-05 [opt_a]: 0.00378658, [2] [Cycle 1]: 0.00274712, [45] [expand_dump_flag]: 3.45998e-06 [switch_simplify]: 4.398e-05 [loop_unroll]: 3.136e-05 [a_1]: 0.0007374 [with_stream_mark]: 2.429e-05 [recompute_prepare]: 1.598e-05 [updatestate_depend_eliminate]: 5.71e-06 [updatestate_assign_eliminate]: 3.98999e-06 [updatestate_loads_eliminate]: 4.20999e-06 [parameter_eliminate]: 2.77002e-06 [a_2]: 0.00014376 [accelerated_algorithm]: 9.79999e-06 [shard]: 2.61999e-06 [meta_shard_fg_expand]: 2.68998e-06 [shard_inline]: 8.45001e-06 [merge_send_recv]: 1.093e-05 [auto_parallel]: 1e-05 [parallel]: 1.913e-05 [flash_sp]: 1.118e-05 [merge_comm]: 4.69998e-06 [allreduce_fusion]: 5.19e-06 [matmul_add_comm_reduction]: 1.197e-05 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 1.225e-05 [virtual_dataset]: 8.42998e-06 [get_grad_eliminate_]: 8.08999e-06 [virtual_output]: 8.22e-06 [merge_forward]: 5.37001e-06 [cell_reuse_recompute_pass]: 1.73002e-06 [offload_activation]: 1.179e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.164e-05 [merge_recompute_call_nodes]: 2.16998e-06 [before_grad]: 1.42e-05 [set_forward_comm_id_for_comm_node_pass]: 5.03002e-06 [meta_fg_expand]: 3.55e-06 [flash_sp_send_recv_attached]: 3.04001e-06 [receive_attached]: 2.03002e-06 [after_resolve]: 1.496e-05 [a_after_grad]: 1.365e-05 [renormalize]: 0.00091556 [add_forward_monad_depend]: 7.40998e-06 [auto_monad_grad]: 2.68998e-06 [auto_monad_eliminator]: 2.132e-05 [cse]: 3.645e-05 [a_3]: 7.87e-05 [Cycle 2]: 0.00102258, [45] [expand_dump_flag]: 2.01998e-06 [switch_simplify]: 1.01e-05 [loop_unroll]: 7.68001e-06 [a_1]: 0.00018155 [with_stream_mark]: 1.511e-05 [recompute_prepare]: 9.17999e-06 [updatestate_depend_eliminate]: 3.95e-06 [updatestate_assign_eliminate]: 3.65e-06 [updatestate_loads_eliminate]: 3.48e-06 [parameter_eliminate]: 1.32999e-06 [a_2]: 0.00012333 [accelerated_algorithm]: 8.1e-06 [shard]: 1.82999e-06 [meta_shard_fg_expand]: 2.07001e-06 [shard_inline]: 1.208e-05 [merge_send_recv]: 9.02e-06 [auto_parallel]: 8.13001e-06 [parallel]: 7.41001e-06 [flash_sp]: 3.91999e-06 [merge_comm]: 4.46002e-06 [allreduce_fusion]: 4.15e-06 [matmul_add_comm_reduction]: 1.008e-05 [allreduce_slice_to_reducescatter]: 5.60016e-07 [virtual_shard_identity]: 1.131e-05 [virtual_dataset]: 7.89002e-06 [get_grad_eliminate_]: 6.96001e-06 [virtual_output]: 6.89999e-06 [merge_forward]: 4.75999e-06 [cell_reuse_recompute_pass]: 2.06998e-06 [offload_activation]: 9.88998e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.069e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 1.302e-05 [set_forward_comm_id_for_comm_node_pass]: 5.67999e-06 [meta_fg_expand]: 3.16001e-06 [flash_sp_send_recv_attached]: 1.86e-06 [receive_attached]: 1.71e-06 [after_resolve]: 1.446e-05 [a_after_grad]: 1.135e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 2.76e-06 [auto_monad_grad]: 2.17999e-06 [auto_monad_eliminator]: 1.294e-05 [cse]: 2.321e-05 [a_3]: 6.482e-05 [py_interpret_to_execute_after_opt_a]: 1.837e-05 [slice_cell_reuse_recomputed_activation]: 4.70999e-06 [rewriter_after_opt_a]: 5.336e-05 [convert_after_rewriter]: 1.128e-05 [order_py_execute_after_rewriter]: 8.85001e-06 [mutable_eliminate]: 0.00090312 [opt_b]: 0.00033299, [1] [Cycle 1]: 0.00032028, [7] [b_1]: 0.00020449 [b_2]: 1.038e-05 [updatestate_depend_eliminate]: 1.003e-05 [updatestate_assign_eliminate]: 3.48999e-06 [updatestate_loads_eliminate]: 3.06001e-06 [renormalize]: 7.00005e-07 [cse]: 2.912e-05 [optimize_parallel_all_gather_comm]: 2.347e-05 [overlap_param_gather]: 5.10999e-06 [cconv]: 3.835e-05 [loop_unroll]: 0.00050613 [opt_after_cconv]: 0.0001495, [1] [Cycle 1]: 0.00013979, [7] [c_1]: 3.832e-05 [parameter_eliminate]: 4.39002e-06 [updatestate_depend_eliminate]: 7.77998e-06 [updatestate_assign_eliminate]: 3.47002e-06 [updatestate_loads_eliminate]: 3.01001e-06 [cse]: 2.608e-05 [renormalize]: 4.2998e-07 [remove_dup_value]: 1.877e-05 [tuple_transform]: 0.00010884, [1] [Cycle 1]: 0.00010091, [4] [d_1]: 5.626e-05 [none_parameter_eliminate]: 2.23002e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 9.33997e-06 [partial_unused_args_eliminate]: 5.04998e-06 [add_recomputation]: 6.707e-05 [cse_after_recomputation]: 3.328e-05, [1] [Cycle 1]: 2.581e-05, [1] [cse]: 1.563e-05 [environ_conv]: 1.04e-05 [swap_dp_allreduce_reducescatter]: 8.61002e-06 [bias_add_comm_swap]: 5.82999e-06 [label_micro_interleaved_index]: 7.33999e-06 [label_fine_grained_interleaved_index]: 5.48002e-06 [merge_cast_opt]: 3.76001e-06 [slice_recompute_activation]: 4.70001e-06 [micro_interleaved_order_control]: 5.02e-06 [assign_add_opt]: 4.22998e-06 [ForceFp32Comm]: 3.31999e-06 [remove_cast_before_assign_add]: 3.28998e-06 [full_micro_interleaved_order_control]: 4.43999e-06 [reorder_send_recv_between_fp_bp]: 5.19e-06 [comm_op_add_attrs]: 3.25002e-06 [add_comm_op_reuse_tag]: 3.58e-06 [interleave_split_concat_branches]: 3.73001e-06 [interleave_parallel_branches]: 3.8e-06 [overlap_opt_shard_in_pipeline]: 3.48e-06 [overlap_opt_shard_grad_in_pipeline]: 4.06001e-06 [control_data_broadcast_order]: 2.019e-05 [grouped_pairwise_exchange_alltoall]: 4.74e-06 [offloading_packed_experts]: 7.77998e-06 [overlap_recompute_and_grad_model_parallel]: 8.08001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.83999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.7e-06 [overlap_recompute_comm]: 5.05001e-06 [overlap_grad_ring_attention]: 7.16999e-06 [overlap_grad_flash_sp]: 2.762e-05 [begin_end_overlap_inline]: 3.16001e-06 [split_matmul_comm_elemetwise]: 4.92999e-06 [split_layernorm_comm]: 3.93999e-06 [handle_group_info]: 3.83001e-06 [symbol_engine_optimizer]: 0.00011737, [1] [Cycle 1]: 0.00010828, [6] [build]: 4.07e-06 [elim_shapecalc]: 1.482e-05 [elim_not_effective]: 1.677e-05 [opt_reshape]: 8.77e-06 [fold_const_symbol]: 1.352e-05 [renormalize]: 2.10013e-07 [detach_backward]: 5.44e-06 [pipeline_parallel_scheduler]: 2.64999e-06 [auto_monad_reorder]: 2.879e-05 [get_jit_bprop_graph]: 2.14e-06 [rewriter_after_jit_bprop_graph]: 6.04999e-06 [opt_after_jit_grad]: 0.00057158 [validate]: 4.894e-05 Sums bootstrap : 0.000447s : 3.37% type_inference : 0.006532s : 49.19% event_method : 0.000021s : 0.16% auto_monad : 0.000070s : 0.53% graph_reusing : 0.000007s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000040s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.25% optimize.rewriter_before_opt_a : 0.000097s : 0.73% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000054s : 0.41% optimize.opt_a.loop_unroll : 0.000039s : 0.29% optimize.opt_a.a_1 : 0.000919s : 6.92% optimize.opt_a.with_stream_mark : 0.000039s : 0.30% optimize.opt_a.recompute_prepare : 0.000025s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000267s : 2.01% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000021s : 0.15% optimize.opt_a.merge_send_recv : 0.000020s : 0.15% optimize.opt_a.auto_parallel : 0.000018s : 0.14% optimize.opt_a.parallel : 0.000027s : 0.20% optimize.opt_a.flash_sp : 0.000015s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.07% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.18% optimize.opt_a.virtual_dataset : 0.000016s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.11% optimize.opt_a.virtual_output : 0.000015s : 0.11% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000022s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000042s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.03% optimize.opt_a.before_grad : 0.000027s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.22% optimize.opt_a.a_after_grad : 0.000025s : 0.19% optimize.opt_a.renormalize : 0.000916s : 6.90% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.08% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.26% optimize.opt_a.cse : 0.000060s : 0.45% optimize.opt_a.a_3 : 0.000144s : 1.08% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000053s : 0.40% optimize.convert_after_rewriter : 0.000011s : 0.08% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000903s : 6.80% optimize.opt_b.b_1 : 0.000204s : 1.54% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000029s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000038s : 0.29% optimize.loop_unroll : 0.000506s : 3.81% optimize.opt_after_cconv.c_1 : 0.000038s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000026s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.14% optimize.tuple_transform.d_1 : 0.000056s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000067s : 0.51% optimize.cse_after_recomputation.cse : 0.000016s : 0.12% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.06% optimize.bias_add_comm_swap : 0.000006s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.02% optimize.full_micro_interleaved_order_control : 0.000004s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.02% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000020s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.04% optimize.offloading_packed_experts : 0.000008s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000028s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.04% pipeline_parallel_scheduler : 0.000003s : 0.02% auto_monad_reorder : 0.000029s : 0.22% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000572s : 4.30% validate : 0.000049s : 0.37% Time group info: ------[substitution.] 0.000257 38 11.48% : 0.000029s : 3: substitution.cast_eliminate 0.88% : 0.000002s : 3: substitution.elim_not_effective 0.84% : 0.000002s : 3: substitution.fold_const_symbol 2.84% : 0.000007s : 5: substitution.graph_param_transform 71.55% : 0.000184s : 4: substitution.inline 2.09% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.65% : 0.000007s : 6: substitution.remove_not_recompute_node 2.35% : 0.000006s : 4: substitution.replace_old_param 5.33% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006470 2 87.69% : 0.005673s : 1: type_inference.infer 12.31% : 0.000796s : 1: type_inference.specialize ------[replace.] 0.000070 8 61.95% : 0.000043s : 4: replace.inline 38.05% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000191 8 93.77% : 0.000179s : 4: match.inline 6.23% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000455 1504 0.47% : 0.000002s : 15: predicate.accumulaten_eliminater 0.53% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.39% : 0.000002s : 10: predicate.addn_check_dump 0.51% : 0.000002s : 15: predicate.addn_zero_filter 0.42% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.15% : 0.000005s : 25: predicate.arithmetic_simplify 0.72% : 0.000003s : 15: predicate.cast_eliminate 0.36% : 0.000002s : 10: predicate.check_bprop_eliminate 0.33% : 0.000002s : 10: predicate.compare_switch_simplify 0.11% : 0.000000s : 5: predicate.const_output_eliminate 0.37% : 0.000002s : 10: predicate.depend_value_elim 0.49% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.53% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.44% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.66% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.14% : 0.000001s : 5: predicate.elim_not_effective 0.36% : 0.000002s : 5: predicate.elim_shapecalc_of_broadcastargs 0.60% : 0.000003s : 20: predicate.environ_add_const_eliminate 0.60% : 0.000003s : 20: predicate.environ_get_add_eliminate 0.58% : 0.000003s : 20: predicate.environ_get_depend_swap 0.98% : 0.000004s : 30: predicate.environ_get_eliminate 0.61% : 0.000003s : 20: predicate.environ_get_set_eliminate 0.72% : 0.000003s : 23: predicate.exchange_switch_depend_value 1.40% : 0.000006s : 23: predicate.float_depend_g_call 0.33% : 0.000002s : 10: predicate.float_environ_get_switch 0.45% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.11% : 0.000000s : 5: predicate.fold_const_symbol 0.41% : 0.000002s : 10: predicate.get_grad_eliminate 0.11% : 0.000000s : 5: predicate.graph_param_transform 0.34% : 0.000002s : 10: predicate.incorporate_call 0.29% : 0.000001s : 10: predicate.incorporate_call_switch 3.75% : 0.000017s : 68: predicate.inline 0.53% : 0.000002s : 10: predicate.inline_without_move 0.18% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.48% : 0.000002s : 10: predicate.less_batch_normalization 0.96% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 1.31% : 0.000006s : 44: predicate.load_eliminater 0.54% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.13% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.08% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 0.35% : 0.000002s : 10: predicate.merge_addn 0.33% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.32% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.42% : 0.000002s : 15: predicate.minmaximum_grad 44.18% : 0.000201s : 5: predicate.mutable_eliminate 0.20% : 0.000001s : 5: predicate.opt_reshape 0.20% : 0.000001s : 5: predicate.parallel_virtual_node 1.03% : 0.000005s : 23: predicate.partial_defer_inline 0.86% : 0.000004s : 24: predicate.partial_eliminate 0.48% : 0.000002s : 15: predicate.print_const_string_wrapper 0.31% : 0.000001s : 10: predicate.reduce_all_const_elim 0.58% : 0.000003s : 15: predicate.reduce_eliminate 1.31% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.30% : 0.000001s : 10: predicate.remove_not_recompute_node 0.79% : 0.000004s : 29: predicate.replace_applicator 0.33% : 0.000002s : 10: predicate.replace_old_param 0.22% : 0.000001s : 5: predicate.reset_defer_inline 0.51% : 0.000002s : 15: predicate.reshape_eliminate 0.40% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.20% : 0.000001s : 5: predicate.row_tensor_eliminate 0.56% : 0.000003s : 10: predicate.same_eliminate 0.33% : 0.000002s : 10: predicate.set_cell_output_no_recompute 0.61% : 0.000003s : 10: predicate.shard_identity_eliminate 0.38% : 0.000002s : 10: predicate.special_op_eliminate 0.54% : 0.000002s : 10: predicate.specialize_transform 0.55% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.48% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.19% : 0.000001s : 5: predicate.switch_call_monad_eliminater 0.76% : 0.000003s : 23: predicate.switch_defer_inline 1.11% : 0.000005s : 33: predicate.switch_layer_defer_inline 2.74% : 0.000012s : 74: predicate.switch_simplify 0.48% : 0.000002s : 15: predicate.tile_eliminate 0.45% : 0.000002s : 15: predicate.transpose_eliminate 0.86% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 0.82% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 0.86% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 1.98% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 0.83% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 1.14% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 0.95% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 1.25% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 1.79% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.26% : 0.000001s : 5: predicate.value_based_eliminate 0.44% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.36% : 0.000002s : 10: predicate.virtual_output_eliminate 0.16% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.28% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000629 11 53.17% : 0.000334s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.83% : 0.000294s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.031720 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.02% : 0.003495s : 1: add_attr 10.96% : 0.003475s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000071s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.26% : 0.000081s : 1: auto_monad 0.12% : 0.000037s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.55% : 0.000491s : 1: bootstrap 0.13% : 0.000042s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000024s : 1: control_data_broadcast_order 0.05% : 0.000015s : 1: convert_after_rewriter 0.12% : 0.000037s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000026s : 1: detach_backward 0.04% : 0.000014s : 1: environ_conv 0.10% : 0.000032s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.03% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.62% : 0.000514s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 2.87% : 0.000911s : 1: mutable_eliminate 0.03% : 0.000011s : 1: offloading_packed_experts 0.07% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.71% : 0.000225s : 1: opt.transform.mutable_eliminate 4.70% : 0.001492s : 78: opt.transform.opt_a 0.12% : 0.000037s : 1: opt.transform.opt_after_cconv 0.12% : 0.000037s : 1: opt.transform.opt_after_jit_grad 0.44% : 0.000141s : 28: opt.transform.opt_b 0.20% : 0.000063s : 2: opt.transform.opt_trans_graph 0.16% : 0.000050s : 4: opt.transform.symbol_engine_opt 11.95% : 0.003790s : 1: opt_a 0.48% : 0.000153s : 1: opt_after_cconv 1.84% : 0.000585s : 1: opt_after_jit_grad 1.06% : 0.000338s : 1: opt_b 22.42% : 0.007112s : 1: optimize 0.09% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000031s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000048s : 1: pre_auto_parallel 0.12% : 0.000037s : 1: py_interpret_to_execute 0.07% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000022s : 1: remove_dup_value 1.65% : 0.000522s : 1: renormalize.infer 1.20% : 0.000380s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000058s : 1: rewriter_after_opt_a 0.32% : 0.000101s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000120s : 1: symbol_engine_optimizer 0.35% : 0.000112s : 1: tuple_transform 20.74% : 0.006580s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:10.510.331 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0591936, [21] [bootstrap]: 0.00047786 [type_inference]: 0.0188304 [event_method]: 2.419e-05 [auto_monad]: 7.167e-05 [graph_reusing]: 5.92999e-06 [inline]: 3.45e-06 [add_attr]: 0.0039177, [1] [add_attr_with_inline]: 0.00390428, [1] [Cycle 1]: 7.341e-05, [2] [tag_attr]: 2.342e-05 [meta_addattr_fg_expand]: 5.97999e-06 [parallel-infer-symbol]: 3.73001e-06 [pre_auto_parallel]: 4.018e-05 [insert-virtual-dataset]: 2.53e-06 [parallel-infer-symbol-second]: 1.25001e-06 [dataset_repeat_opt]: 2.10002e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.0350175, [53] [py_interpret_to_execute]: 3.045e-05 [rewriter_before_opt_a]: 9.666e-05 [opt_a]: 0.03238, [2] [Cycle 1]: 0.0314119, [45] [expand_dump_flag]: 2.89001e-06 [switch_simplify]: 4.425e-05 [loop_unroll]: 3.087e-05 [a_1]: 0.00071966 [with_stream_mark]: 2.241e-05 [recompute_prepare]: 1.468e-05 [updatestate_depend_eliminate]: 5.23002e-06 [updatestate_assign_eliminate]: 4.48001e-06 [updatestate_loads_eliminate]: 4.18001e-06 [parameter_eliminate]: 2.05002e-06 [a_2]: 0.00010498 [accelerated_algorithm]: 1.043e-05 [shard]: 2.29999e-06 [meta_shard_fg_expand]: 2.32999e-06 [shard_inline]: 8.3e-06 [merge_send_recv]: 1.045e-05 [auto_parallel]: 9.44e-06 [parallel]: 2.023e-05 [flash_sp]: 1.16e-05 [merge_comm]: 4.92999e-06 [allreduce_fusion]: 4.18001e-06 [matmul_add_comm_reduction]: 1.139e-05 [allreduce_slice_to_reducescatter]: 6.29982e-07 [virtual_shard_identity]: 1.046e-05 [virtual_dataset]: 8.57e-06 [get_grad_eliminate_]: 7.31999e-06 [virtual_output]: 8.2e-06 [merge_forward]: 4.60001e-06 [cell_reuse_recompute_pass]: 1.59e-06 [offload_activation]: 1.164e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.96e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.388e-05 [set_forward_comm_id_for_comm_node_pass]: 5.36998e-06 [meta_fg_expand]: 3.63e-06 [flash_sp_send_recv_attached]: 3.21001e-06 [receive_attached]: 2.32999e-06 [after_resolve]: 1.564e-05 [a_after_grad]: 1.286e-05 [renormalize]: 0.0297514 [add_forward_monad_depend]: 1.262e-05 [auto_monad_grad]: 2.74001e-06 [auto_monad_eliminator]: 2.921e-05 [cse]: 4.383e-05 [a_3]: 7.75e-05 [Cycle 2]: 0.00095418, [45] [expand_dump_flag]: 2.09999e-06 [switch_simplify]: 1.125e-05 [loop_unroll]: 8.3e-06 [a_1]: 0.0001997 [with_stream_mark]: 2.475e-05 [recompute_prepare]: 1.052e-05 [updatestate_depend_eliminate]: 5.04e-06 [updatestate_assign_eliminate]: 3.74002e-06 [updatestate_loads_eliminate]: 3.73001e-06 [parameter_eliminate]: 2.06e-06 [a_2]: 9.852e-05 [accelerated_algorithm]: 9.41e-06 [shard]: 2.64001e-06 [meta_shard_fg_expand]: 3.13e-06 [shard_inline]: 9.07999e-06 [merge_send_recv]: 1.158e-05 [auto_parallel]: 1.342e-05 [parallel]: 9.47999e-06 [flash_sp]: 4.12003e-06 [merge_comm]: 4.38001e-06 [allreduce_fusion]: 6.59001e-06 [matmul_add_comm_reduction]: 1.17e-05 [allreduce_slice_to_reducescatter]: 1.17e-06 [virtual_shard_identity]: 1.354e-05 [virtual_dataset]: 8.02998e-06 [get_grad_eliminate_]: 8.37998e-06 [virtual_output]: 7.15e-06 [merge_forward]: 5.36998e-06 [cell_reuse_recompute_pass]: 3.45e-06 [offload_activation]: 1.29e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.924e-05 [merge_recompute_call_nodes]: 1.62001e-06 [before_grad]: 1.34e-05 [set_forward_comm_id_for_comm_node_pass]: 6.24999e-06 [meta_fg_expand]: 3.57002e-06 [flash_sp_send_recv_attached]: 1.99999e-06 [receive_attached]: 2.17001e-06 [after_resolve]: 1.698e-05 [a_after_grad]: 1.211e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 3.88001e-06 [auto_monad_grad]: 2.44999e-06 [auto_monad_eliminator]: 1.467e-05 [cse]: 2.903e-05 [a_3]: 5e-05 [py_interpret_to_execute_after_opt_a]: 2.096e-05 [slice_cell_reuse_recomputed_activation]: 2.37001e-06 [rewriter_after_opt_a]: 5.749e-05 [convert_after_rewriter]: 9.10999e-06 [order_py_execute_after_rewriter]: 6.89999e-06 [mutable_eliminate]: 0.00077603 [opt_b]: 0.00026804, [1] [Cycle 1]: 0.00026032, [7] [b_1]: 0.0001608 [b_2]: 9.90002e-06 [updatestate_depend_eliminate]: 9.48002e-06 [updatestate_assign_eliminate]: 4.20999e-06 [updatestate_loads_eliminate]: 3.49001e-06 [renormalize]: 2.69996e-07 [cse]: 3.398e-05 [optimize_parallel_all_gather_comm]: 2.148e-05 [overlap_param_gather]: 2.17999e-06 [cconv]: 3.356e-05 [loop_unroll]: 0.00048197 [opt_after_cconv]: 0.00012864, [1] [Cycle 1]: 0.00012173, [7] [c_1]: 3.817e-05 [parameter_eliminate]: 5.17e-06 [updatestate_depend_eliminate]: 7.60998e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 2.81e-06 [cse]: 2.787e-05 [renormalize]: 1.11002e-06 [remove_dup_value]: 1.65e-05 [tuple_transform]: 9.095e-05, [1] [Cycle 1]: 8.571e-05, [4] [d_1]: 5.556e-05 [none_parameter_eliminate]: 2.32001e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 8.07998e-06 [partial_unused_args_eliminate]: 1.84998e-06 [add_recomputation]: 6.647e-05 [cse_after_recomputation]: 2.875e-05, [1] [Cycle 1]: 2.306e-05, [1] [cse]: 1.698e-05 [environ_conv]: 7.97e-06 [swap_dp_allreduce_reducescatter]: 6.31e-06 [bias_add_comm_swap]: 3.34001e-06 [label_micro_interleaved_index]: 4.62e-06 [label_fine_grained_interleaved_index]: 2.98998e-06 [merge_cast_opt]: 1.53002e-06 [slice_recompute_activation]: 2.09e-06 [micro_interleaved_order_control]: 2.41e-06 [assign_add_opt]: 1.96998e-06 [ForceFp32Comm]: 1.32e-06 [remove_cast_before_assign_add]: 1.05999e-06 [full_micro_interleaved_order_control]: 2.65002e-06 [reorder_send_recv_between_fp_bp]: 3.26001e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 1.20001e-06 [interleave_split_concat_branches]: 1.19998e-06 [interleave_parallel_branches]: 1.10999e-06 [overlap_opt_shard_in_pipeline]: 1.49e-06 [overlap_opt_shard_grad_in_pipeline]: 2.22999e-06 [control_data_broadcast_order]: 1.866e-05 [grouped_pairwise_exchange_alltoall]: 1.96e-06 [offloading_packed_experts]: 4.53001e-06 [overlap_recompute_and_grad_model_parallel]: 6.38998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.59e-06 [overlap_recompute_allgather_and_fa_grad]: 1.62999e-06 [overlap_recompute_comm]: 2.83e-06 [overlap_grad_ring_attention]: 5.04e-06 [overlap_grad_flash_sp]: 2.806e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.22999e-06 [split_layernorm_comm]: 1.62001e-06 [handle_group_info]: 1.30999e-06 [symbol_engine_optimizer]: 0.00010348, [1] [Cycle 1]: 9.778e-05, [6] [build]: 4.40999e-06 [elim_shapecalc]: 1.594e-05 [elim_not_effective]: 1.988e-05 [opt_reshape]: 8.80001e-06 [fold_const_symbol]: 1.397e-05 [renormalize]: 2.3999e-07 [detach_backward]: 2.26e-06 [pipeline_parallel_scheduler]: 1.94999e-06 [auto_monad_reorder]: 2.309e-05 [get_jit_bprop_graph]: 1.89e-06 [rewriter_after_jit_bprop_graph]: 5.84e-06 [opt_after_jit_grad]: 0.0005297 [validate]: 5.061e-05 Sums bootstrap : 0.000478s : 0.88% type_inference : 0.018830s : 34.80% event_method : 0.000024s : 0.04% auto_monad : 0.000072s : 0.13% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000040s : 0.07% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000030s : 0.06% optimize.rewriter_before_opt_a : 0.000097s : 0.18% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000055s : 0.10% optimize.opt_a.loop_unroll : 0.000039s : 0.07% optimize.opt_a.a_1 : 0.000919s : 1.70% optimize.opt_a.with_stream_mark : 0.000047s : 0.09% optimize.opt_a.recompute_prepare : 0.000025s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000204s : 0.38% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.04% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000017s : 0.03% optimize.opt_a.merge_send_recv : 0.000022s : 0.04% optimize.opt_a.auto_parallel : 0.000023s : 0.04% optimize.opt_a.parallel : 0.000030s : 0.05% optimize.opt_a.flash_sp : 0.000016s : 0.03% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000011s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.04% optimize.opt_a.virtual_dataset : 0.000017s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.03% optimize.opt_a.virtual_output : 0.000015s : 0.03% optimize.opt_a.merge_forward : 0.000010s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000025s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.02% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000033s : 0.06% optimize.opt_a.a_after_grad : 0.000025s : 0.05% optimize.opt_a.renormalize : 0.029751s : 54.99% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.03% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000044s : 0.08% optimize.opt_a.cse : 0.000073s : 0.13% optimize.opt_a.a_3 : 0.000128s : 0.24% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000057s : 0.11% optimize.convert_after_rewriter : 0.000009s : 0.02% optimize.order_py_execute_after_rewriter : 0.000007s : 0.01% optimize.mutable_eliminate : 0.000776s : 1.43% optimize.opt_b.b_1 : 0.000161s : 0.30% optimize.opt_b.b_2 : 0.000010s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000034s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000034s : 0.06% optimize.loop_unroll : 0.000482s : 0.89% optimize.opt_after_cconv.c_1 : 0.000038s : 0.07% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000028s : 0.05% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.03% optimize.tuple_transform.d_1 : 0.000056s : 0.10% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000066s : 0.12% optimize.cse_after_recomputation.cse : 0.000017s : 0.03% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000019s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000028s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000023s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000530s : 0.98% validate : 0.000051s : 0.09% Time group info: ------[substitution.] 0.000250 38 13.83% : 0.000035s : 3: substitution.cast_eliminate 0.88% : 0.000002s : 3: substitution.elim_not_effective 0.85% : 0.000002s : 3: substitution.fold_const_symbol 2.67% : 0.000007s : 5: substitution.graph_param_transform 68.62% : 0.000172s : 4: substitution.inline 2.07% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.92% : 0.000007s : 6: substitution.remove_not_recompute_node 2.88% : 0.000007s : 4: substitution.replace_old_param 5.28% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.018745 2 94.80% : 0.017770s : 1: type_inference.infer 5.20% : 0.000976s : 1: type_inference.specialize ------[replace.] 0.000067 8 61.90% : 0.000042s : 4: replace.inline 38.10% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000180 8 93.77% : 0.000169s : 4: match.inline 6.23% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000260 1504 0.93% : 0.000002s : 15: predicate.accumulaten_eliminater 0.88% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.60% : 0.000002s : 10: predicate.addn_check_dump 0.90% : 0.000002s : 15: predicate.addn_zero_filter 0.79% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.26% : 0.000006s : 25: predicate.arithmetic_simplify 1.16% : 0.000003s : 15: predicate.cast_eliminate 0.68% : 0.000002s : 10: predicate.check_bprop_eliminate 0.58% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.76% : 0.000002s : 10: predicate.depend_value_elim 0.87% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.05% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.30% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 5: predicate.elim_not_effective 0.45% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.01% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.06% : 0.000003s : 20: predicate.environ_get_depend_swap 1.60% : 0.000004s : 30: predicate.environ_get_eliminate 1.04% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.39% : 0.000004s : 23: predicate.exchange_switch_depend_value 2.35% : 0.000006s : 23: predicate.float_depend_g_call 0.55% : 0.000001s : 10: predicate.float_environ_get_switch 0.78% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 5: predicate.fold_const_symbol 0.68% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.61% : 0.000002s : 10: predicate.incorporate_call 0.52% : 0.000001s : 10: predicate.incorporate_call_switch 6.44% : 0.000017s : 68: predicate.inline 1.07% : 0.000003s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.81% : 0.000002s : 10: predicate.less_batch_normalization 1.88% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.31% : 0.000006s : 44: predicate.load_eliminater 1.07% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.07% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 10: predicate.merge_addn 0.56% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 15: predicate.minmaximum_grad 1.28% : 0.000003s : 5: predicate.mutable_eliminate 0.38% : 0.000001s : 5: predicate.opt_reshape 0.50% : 0.000001s : 5: predicate.parallel_virtual_node 1.58% : 0.000004s : 23: predicate.partial_defer_inline 1.50% : 0.000004s : 24: predicate.partial_eliminate 0.89% : 0.000002s : 15: predicate.print_const_string_wrapper 0.67% : 0.000002s : 10: predicate.reduce_all_const_elim 1.30% : 0.000003s : 15: predicate.reduce_eliminate 2.39% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 10: predicate.remove_not_recompute_node 1.40% : 0.000004s : 29: predicate.replace_applicator 0.63% : 0.000002s : 10: predicate.replace_old_param 0.38% : 0.000001s : 5: predicate.reset_defer_inline 1.17% : 0.000003s : 15: predicate.reshape_eliminate 0.60% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 5: predicate.row_tensor_eliminate 0.85% : 0.000002s : 10: predicate.same_eliminate 0.44% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.65% : 0.000002s : 10: predicate.shard_identity_eliminate 0.65% : 0.000002s : 10: predicate.special_op_eliminate 0.69% : 0.000002s : 10: predicate.specialize_transform 1.08% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.35% : 0.000004s : 23: predicate.switch_defer_inline 1.91% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.72% : 0.000012s : 74: predicate.switch_simplify 0.88% : 0.000002s : 15: predicate.tile_eliminate 0.88% : 0.000002s : 15: predicate.transpose_eliminate 1.67% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.53% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.89% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.30% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 2.98% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 5: predicate.value_based_eliminate 0.80% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 10: predicate.virtual_output_eliminate 0.33% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000795 11 47.07% : 0.000374s : 5: func_graph_cloner_run.FuncGraphClonerGraph 52.93% : 0.000421s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.129571 192 0.00% : 0.000004s : 1: ForceFp32Comm 3.03% : 0.003924s : 1: add_attr 3.02% : 0.003909s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.06% : 0.000072s : 1: add_recomputation 0.00% : 0.000006s : 1: assign_add_opt 0.06% : 0.000079s : 1: auto_monad 0.02% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.39% : 0.000511s : 1: bootstrap 0.03% : 0.000037s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000023s : 1: control_data_broadcast_order 0.01% : 0.000013s : 1: convert_after_rewriter 0.03% : 0.000033s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.02% : 0.000032s : 1: event_method 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.38% : 0.000492s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.02% : 0.000020s : 1: micro_interleaved_order_control 0.61% : 0.000788s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.02% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000026s : 1: opt.transform.mutable_eliminate 1.15% : 0.001487s : 78: opt.transform.opt_a 0.03% : 0.000037s : 1: opt.transform.opt_after_cconv 0.03% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.11% : 0.000137s : 28: opt.transform.opt_b 0.05% : 0.000062s : 2: opt.transform.opt_trans_graph 0.04% : 0.000054s : 4: opt.transform.symbol_engine_opt 24.99% : 0.032383s : 1: opt_a 0.10% : 0.000133s : 1: opt_after_cconv 0.42% : 0.000542s : 1: opt_after_jit_grad 0.21% : 0.000272s : 1: opt_b 27.03% : 0.035023s : 1: optimize 0.02% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000011s : 1: order_py_execute_after_rewriter 0.02% : 0.000032s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.03% : 0.000045s : 1: pre_auto_parallel 0.03% : 0.000035s : 1: py_interpret_to_execute 0.02% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000020s : 1: remove_dup_value 22.52% : 0.029182s : 1: renormalize.infer 0.42% : 0.000549s : 1: renormalize.specialize 0.01% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000066s : 1: rewriter_after_opt_a 0.08% : 0.000101s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.08% : 0.000106s : 1: symbol_engine_optimizer 0.07% : 0.000094s : 1: tuple_transform 14.55% : 0.018858s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:11.594.48 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:11.597.35 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0182186, [21] [bootstrap]: 0.00051988 [type_inference]: 0.0063286 [event_method]: 2.322e-05 [auto_monad]: 6.675e-05 [graph_reusing]: 6.06998e-06 [inline]: 3.03998e-06 [add_attr]: 0.00363831, [1] [add_attr_with_inline]: 0.00362699, [1] [Cycle 1]: 9.494e-05, [2] [tag_attr]: 2.293e-05 [meta_addattr_fg_expand]: 6.07001e-06 [parallel-infer-symbol]: 3.65e-06 [pre_auto_parallel]: 4.126e-05 [insert-virtual-dataset]: 2.22999e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 1.98002e-06 [pipeline_split]: 1.79e-06 [optimize]: 0.00629438, [53] [py_interpret_to_execute]: 3.402e-05 [rewriter_before_opt_a]: 9.501e-05 [opt_a]: 0.00356579, [2] [Cycle 1]: 0.00261785, [45] [expand_dump_flag]: 3.19001e-06 [switch_simplify]: 4.499e-05 [loop_unroll]: 3.064e-05 [a_1]: 0.00067452 [with_stream_mark]: 2.382e-05 [recompute_prepare]: 1.277e-05 [updatestate_depend_eliminate]: 4.70999e-06 [updatestate_assign_eliminate]: 3.88999e-06 [updatestate_loads_eliminate]: 3.36001e-06 [parameter_eliminate]: 2.22999e-06 [a_2]: 0.00011461 [accelerated_algorithm]: 7.45e-06 [shard]: 2.02999e-06 [meta_shard_fg_expand]: 1.92999e-06 [shard_inline]: 6.78e-06 [merge_send_recv]: 9.04998e-06 [auto_parallel]: 9.22999e-06 [parallel]: 2.022e-05 [flash_sp]: 1.214e-05 [merge_comm]: 4.43999e-06 [allreduce_fusion]: 3.52002e-06 [matmul_add_comm_reduction]: 1.225e-05 [allreduce_slice_to_reducescatter]: 7.60017e-07 [virtual_shard_identity]: 1.189e-05 [virtual_dataset]: 7.46001e-06 [get_grad_eliminate_]: 1.064e-05 [virtual_output]: 9.14998e-06 [merge_forward]: 6.74001e-06 [cell_reuse_recompute_pass]: 2.59001e-06 [offload_activation]: 1.339e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.362e-05 [merge_recompute_call_nodes]: 1.57999e-06 [before_grad]: 1.281e-05 [set_forward_comm_id_for_comm_node_pass]: 5.41998e-06 [meta_fg_expand]: 3.81001e-06 [flash_sp_send_recv_attached]: 3.29001e-06 [receive_attached]: 2.31e-06 [after_resolve]: 1.535e-05 [a_after_grad]: 1.235e-05 [renormalize]: 0.00083082 [add_forward_monad_depend]: 8.79e-06 [auto_monad_grad]: 2.89001e-06 [auto_monad_eliminator]: 2.059e-05 [cse]: 3.282e-05 [a_3]: 6.649e-05 [Cycle 2]: 0.00093117, [45] [expand_dump_flag]: 2.43998e-06 [switch_simplify]: 8.66997e-06 [loop_unroll]: 6.46999e-06 [a_1]: 0.00012974 [with_stream_mark]: 1.88e-05 [recompute_prepare]: 7.98999e-06 [updatestate_depend_eliminate]: 4.27e-06 [updatestate_assign_eliminate]: 3.27002e-06 [updatestate_loads_eliminate]: 2.58e-06 [parameter_eliminate]: 2.26e-06 [a_2]: 0.00010362 [accelerated_algorithm]: 7.45e-06 [shard]: 1.98997e-06 [meta_shard_fg_expand]: 2.11998e-06 [shard_inline]: 6.53e-06 [merge_send_recv]: 7.88001e-06 [auto_parallel]: 8.69003e-06 [parallel]: 8.13999e-06 [flash_sp]: 3.86999e-06 [merge_comm]: 3.56001e-06 [allreduce_fusion]: 3.51999e-06 [matmul_add_comm_reduction]: 1.065e-05 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 1.051e-05 [virtual_dataset]: 6.28e-06 [get_grad_eliminate_]: 6.06998e-06 [virtual_output]: 6.23e-06 [merge_forward]: 4.99e-06 [cell_reuse_recompute_pass]: 2.86e-06 [offload_activation]: 1.023e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.149e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 1.213e-05 [set_forward_comm_id_for_comm_node_pass]: 4.65999e-06 [meta_fg_expand]: 2.43e-06 [flash_sp_send_recv_attached]: 1.67001e-06 [receive_attached]: 2.39001e-06 [after_resolve]: 1.32e-05 [a_after_grad]: 9.54e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 3.03998e-06 [auto_monad_grad]: 1.87999e-06 [auto_monad_eliminator]: 1.126e-05 [cse]: 2.066e-05 [a_3]: 5.148e-05 [py_interpret_to_execute_after_opt_a]: 1.786e-05 [slice_cell_reuse_recomputed_activation]: 5.40999e-06 [rewriter_after_opt_a]: 4.801e-05 [convert_after_rewriter]: 1.111e-05 [order_py_execute_after_rewriter]: 8.57998e-06 [mutable_eliminate]: 0.00064582 [opt_b]: 0.00029097, [1] [Cycle 1]: 0.00027973, [7] [b_1]: 0.00016862 [b_2]: 1.016e-05 [updatestate_depend_eliminate]: 1.017e-05 [updatestate_assign_eliminate]: 3.04999e-06 [updatestate_loads_eliminate]: 3.09999e-06 [renormalize]: 7.39994e-07 [cse]: 2.497e-05 [optimize_parallel_all_gather_comm]: 2.262e-05 [overlap_param_gather]: 5.41998e-06 [cconv]: 4.147e-05 [loop_unroll]: 0.00051272 [opt_after_cconv]: 0.00013922, [1] [Cycle 1]: 0.00013006, [7] [c_1]: 3.172e-05 [parameter_eliminate]: 6.02999e-06 [updatestate_depend_eliminate]: 6.48e-06 [updatestate_assign_eliminate]: 2.71999e-06 [updatestate_loads_eliminate]: 2.61e-06 [cse]: 2.11e-05 [renormalize]: 1.01002e-06 [remove_dup_value]: 1.752e-05 [tuple_transform]: 9.548e-05, [1] [Cycle 1]: 8.794e-05, [4] [d_1]: 4.814e-05 [none_parameter_eliminate]: 1.67999e-06 [renormalize]: 3.80009e-07 [switch_simplify]: 6.85998e-06 [partial_unused_args_eliminate]: 4.19002e-06 [add_recomputation]: 5.859e-05 [cse_after_recomputation]: 2.956e-05, [1] [Cycle 1]: 2.172e-05, [1] [cse]: 1.237e-05 [environ_conv]: 9.29e-06 [swap_dp_allreduce_reducescatter]: 7.60998e-06 [bias_add_comm_swap]: 1.206e-05 [label_micro_interleaved_index]: 8.38999e-06 [label_fine_grained_interleaved_index]: 5.86998e-06 [merge_cast_opt]: 3.66001e-06 [slice_recompute_activation]: 4.69998e-06 [micro_interleaved_order_control]: 5.39e-06 [assign_add_opt]: 3.77002e-06 [ForceFp32Comm]: 3.63999e-06 [remove_cast_before_assign_add]: 3.54002e-06 [full_micro_interleaved_order_control]: 4.55001e-06 [reorder_send_recv_between_fp_bp]: 5.56e-06 [comm_op_add_attrs]: 3.68e-06 [add_comm_op_reuse_tag]: 3.21001e-06 [interleave_split_concat_branches]: 3.48e-06 [interleave_parallel_branches]: 3.5e-06 [overlap_opt_shard_in_pipeline]: 3.7e-06 [overlap_opt_shard_grad_in_pipeline]: 4.17998e-06 [control_data_broadcast_order]: 2.097e-05 [grouped_pairwise_exchange_alltoall]: 4.12e-06 [offloading_packed_experts]: 6.74001e-06 [overlap_recompute_and_grad_model_parallel]: 7e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.88001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.58e-06 [overlap_recompute_comm]: 4.89e-06 [overlap_grad_ring_attention]: 7.45e-06 [overlap_grad_flash_sp]: 2.597e-05 [begin_end_overlap_inline]: 3.04999e-06 [split_matmul_comm_elemetwise]: 4.60001e-06 [split_layernorm_comm]: 3.95998e-06 [handle_group_info]: 3.63e-06 [symbol_engine_optimizer]: 0.00011299, [1] [Cycle 1]: 0.00010555, [6] [build]: 4.40999e-06 [elim_shapecalc]: 1.531e-05 [elim_not_effective]: 1.563e-05 [opt_reshape]: 7.28e-06 [fold_const_symbol]: 1.073e-05 [renormalize]: 7.09988e-07 [detach_backward]: 3.95e-06 [pipeline_parallel_scheduler]: 1.98002e-06 [auto_monad_reorder]: 2.285e-05 [get_jit_bprop_graph]: 1.79e-06 [rewriter_after_jit_bprop_graph]: 6.19999e-06 [opt_after_jit_grad]: 0.00057802 [validate]: 4.29e-05 Sums bootstrap : 0.000520s : 4.16% type_inference : 0.006329s : 50.62% event_method : 0.000023s : 0.19% auto_monad : 0.000067s : 0.53% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000041s : 0.33% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000034s : 0.27% optimize.rewriter_before_opt_a : 0.000095s : 0.76% optimize.opt_a.expand_dump_flag : 0.000006s : 0.05% optimize.opt_a.switch_simplify : 0.000054s : 0.43% optimize.opt_a.loop_unroll : 0.000037s : 0.30% optimize.opt_a.a_1 : 0.000804s : 6.43% optimize.opt_a.with_stream_mark : 0.000043s : 0.34% optimize.opt_a.recompute_prepare : 0.000021s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000218s : 1.75% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.12% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.11% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000018s : 0.14% optimize.opt_a.parallel : 0.000028s : 0.23% optimize.opt_a.flash_sp : 0.000016s : 0.13% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.18% optimize.opt_a.virtual_dataset : 0.000014s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.12% optimize.opt_a.merge_forward : 0.000012s : 0.09% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.04% optimize.opt_a.offload_activation : 0.000024s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000045s : 0.36% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000029s : 0.23% optimize.opt_a.a_after_grad : 0.000022s : 0.18% optimize.opt_a.renormalize : 0.000831s : 6.65% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.09% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.25% optimize.opt_a.cse : 0.000053s : 0.43% optimize.opt_a.a_3 : 0.000118s : 0.94% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000048s : 0.38% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000646s : 5.17% optimize.opt_b.b_1 : 0.000169s : 1.35% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000025s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000041s : 0.33% optimize.loop_unroll : 0.000513s : 4.10% optimize.opt_after_cconv.c_1 : 0.000032s : 0.25% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000021s : 0.17% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.14% optimize.tuple_transform.d_1 : 0.000048s : 0.39% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.05% optimize.partial_unused_args_eliminate : 0.000004s : 0.03% optimize.add_recomputation : 0.000059s : 0.47% optimize.cse_after_recomputation.cse : 0.000012s : 0.10% optimize.environ_conv : 0.000009s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.06% optimize.bias_add_comm_swap : 0.000012s : 0.10% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000021s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000026s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.01% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.18% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000578s : 4.62% validate : 0.000043s : 0.34% Time group info: ------[substitution.] 0.000214 28 0.86% : 0.000002s : 2: substitution.elim_not_effective 0.63% : 0.000001s : 2: substitution.fold_const_symbol 3.36% : 0.000007s : 4: substitution.graph_param_transform 80.04% : 0.000171s : 4: substitution.inline 2.42% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.72% : 0.000006s : 4: substitution.remove_not_recompute_node 3.15% : 0.000007s : 4: substitution.replace_old_param 6.82% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006260 2 87.32% : 0.005466s : 1: type_inference.infer 12.68% : 0.000794s : 1: type_inference.specialize ------[replace.] 0.000066 8 64.81% : 0.000043s : 4: replace.inline 35.19% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000180 8 93.01% : 0.000168s : 4: match.inline 6.99% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000216 1278 0.83% : 0.000002s : 13: predicate.accumulaten_eliminater 0.88% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.60% : 0.000001s : 8: predicate.addn_check_dump 0.84% : 0.000002s : 13: predicate.addn_zero_filter 0.73% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.92% : 0.000004s : 21: predicate.arithmetic_simplify 1.11% : 0.000002s : 13: predicate.cast_eliminate 0.56% : 0.000001s : 8: predicate.check_bprop_eliminate 0.54% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.71% : 0.000002s : 8: predicate.depend_value_elim 0.87% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.12% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.81% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.93% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.38% : 0.000001s : 4: predicate.elim_not_effective 0.42% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_depend_swap 1.82% : 0.000004s : 25: predicate.environ_get_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.48% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.41% : 0.000005s : 21: predicate.float_depend_g_call 0.55% : 0.000001s : 8: predicate.float_environ_get_switch 0.77% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.71% : 0.000002s : 8: predicate.get_grad_eliminate 0.17% : 0.000000s : 4: predicate.graph_param_transform 0.59% : 0.000001s : 8: predicate.incorporate_call 0.49% : 0.000001s : 8: predicate.incorporate_call_switch 6.42% : 0.000014s : 58: predicate.inline 1.24% : 0.000003s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.86% : 0.000002s : 8: predicate.less_batch_normalization 1.71% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.37% : 0.000005s : 38: predicate.load_eliminater 1.12% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.27% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.61% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 8: predicate.merge_addn 0.52% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.67% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 13: predicate.minmaximum_grad 1.56% : 0.000003s : 4: predicate.mutable_eliminate 0.31% : 0.000001s : 4: predicate.opt_reshape 0.38% : 0.000001s : 4: predicate.parallel_virtual_node 1.86% : 0.000004s : 21: predicate.partial_defer_inline 1.62% : 0.000004s : 21: predicate.partial_eliminate 0.84% : 0.000002s : 13: predicate.print_const_string_wrapper 0.57% : 0.000001s : 8: predicate.reduce_all_const_elim 1.07% : 0.000002s : 13: predicate.reduce_eliminate 2.49% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.75% : 0.000002s : 8: predicate.remove_not_recompute_node 1.40% : 0.000003s : 25: predicate.replace_applicator 0.38% : 0.000001s : 8: predicate.replace_old_param 0.45% : 0.000001s : 4: predicate.reset_defer_inline 0.88% : 0.000002s : 13: predicate.reshape_eliminate 0.64% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.57% : 0.000001s : 4: predicate.row_tensor_eliminate 0.85% : 0.000002s : 8: predicate.same_eliminate 0.64% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.97% : 0.000002s : 8: predicate.shard_identity_eliminate 0.68% : 0.000001s : 8: predicate.special_op_eliminate 0.70% : 0.000002s : 8: predicate.specialize_transform 1.10% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.47% : 0.000003s : 21: predicate.switch_defer_inline 2.05% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.09% : 0.000011s : 67: predicate.switch_simplify 0.85% : 0.000002s : 13: predicate.tile_eliminate 0.89% : 0.000002s : 13: predicate.transpose_eliminate 1.50% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.43% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.30% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.48% : 0.000008s : 33: predicate.tuple_list_get_item_eliminator 1.38% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.09% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.72% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.34% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.03% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 4: predicate.value_based_eliminate 0.57% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.70% : 0.000002s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000649 11 55.48% : 0.000360s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.52% : 0.000289s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030405 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.00% : 0.003649s : 1: add_attr 11.94% : 0.003631s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000063s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000076s : 1: auto_monad 0.10% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.06% : 0.000018s : 1: bias_add_comm_swap 1.90% : 0.000579s : 1: bootstrap 0.15% : 0.000046s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000025s : 1: control_data_broadcast_order 0.05% : 0.000016s : 1: convert_after_rewriter 0.11% : 0.000033s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000024s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000035s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.71% : 0.000521s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.15% : 0.000655s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000022s : 1: opt.transform.mutable_eliminate 4.21% : 0.001279s : 78: opt.transform.opt_a 0.10% : 0.000030s : 1: opt.transform.opt_after_cconv 0.09% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.35% : 0.000105s : 28: opt.transform.opt_b 0.17% : 0.000053s : 2: opt.transform.opt_trans_graph 0.14% : 0.000044s : 4: opt.transform.symbol_engine_opt 11.74% : 0.003569s : 1: opt_a 0.47% : 0.000143s : 1: opt_after_cconv 1.94% : 0.000591s : 1: opt_after_jit_grad 0.97% : 0.000295s : 1: opt_b 21.82% : 0.006633s : 1: optimize 0.09% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000030s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.16% : 0.000049s : 1: pre_auto_parallel 0.13% : 0.000038s : 1: py_interpret_to_execute 0.07% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000021s : 1: remove_dup_value 1.50% : 0.000456s : 1: renormalize.infer 1.20% : 0.000365s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000053s : 1: rewriter_after_opt_a 0.33% : 0.000100s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.34% : 0.000104s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000116s : 1: symbol_engine_optimizer 0.32% : 0.000098s : 1: tuple_transform 20.98% : 0.006378s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:11.566.309 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0159006, [21] [bootstrap]: 0.00041961 [type_inference]: 0.00579141 [event_method]: 2.163e-05 [auto_monad]: 7.061e-05 [graph_reusing]: 6.12001e-06 [inline]: 2.21998e-06 [add_attr]: 0.00345975, [1] [add_attr_with_inline]: 0.00344823, [1] [Cycle 1]: 7.107e-05, [2] [tag_attr]: 2.125e-05 [meta_addattr_fg_expand]: 6.21e-06 [parallel-infer-symbol]: 3.76001e-06 [pre_auto_parallel]: 3.773e-05 [insert-virtual-dataset]: 2.47001e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 1.99999e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00526357, [53] [py_interpret_to_execute]: 2.963e-05 [rewriter_before_opt_a]: 8.367e-05 [opt_a]: 0.0030033, [2] [Cycle 1]: 0.00225427, [45] [expand_dump_flag]: 3.33998e-06 [switch_simplify]: 4.46e-05 [loop_unroll]: 2.988e-05 [a_1]: 0.00065021 [with_stream_mark]: 2.278e-05 [recompute_prepare]: 1.497e-05 [updatestate_depend_eliminate]: 3.86999e-06 [updatestate_assign_eliminate]: 3.14001e-06 [updatestate_loads_eliminate]: 3.41999e-06 [parameter_eliminate]: 2.07999e-06 [a_2]: 8.373e-05 [accelerated_algorithm]: 8.48999e-06 [shard]: 2.36e-06 [meta_shard_fg_expand]: 2.07001e-06 [shard_inline]: 7.03998e-06 [merge_send_recv]: 9.17001e-06 [auto_parallel]: 7.98999e-06 [parallel]: 2.02e-05 [flash_sp]: 1.044e-05 [merge_comm]: 3.95e-06 [allreduce_fusion]: 3.53999e-06 [matmul_add_comm_reduction]: 1.171e-05 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 1.104e-05 [virtual_dataset]: 7.16001e-06 [get_grad_eliminate_]: 6.61e-06 [virtual_output]: 6.12001e-06 [merge_forward]: 4.18001e-06 [cell_reuse_recompute_pass]: 1.55001e-06 [offload_activation]: 1.017e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.688e-05 [merge_recompute_call_nodes]: 1.48002e-06 [before_grad]: 1.057e-05 [set_forward_comm_id_for_comm_node_pass]: 4.3e-06 [meta_fg_expand]: 2.99999e-06 [flash_sp_send_recv_attached]: 4.31002e-06 [receive_attached]: 2.78e-06 [after_resolve]: 1.407e-05 [a_after_grad]: 1.097e-05 [renormalize]: 0.00078263 [add_forward_monad_depend]: 7.66001e-06 [auto_monad_grad]: 2.54001e-06 [auto_monad_eliminator]: 1.971e-05 [cse]: 3.231e-05 [a_3]: 5.976e-05 [Cycle 2]: 0.00073642, [45] [expand_dump_flag]: 2.12999e-06 [switch_simplify]: 9.07001e-06 [loop_unroll]: 6.79999e-06 [a_1]: 0.00013866 [with_stream_mark]: 1.693e-05 [recompute_prepare]: 6.43e-06 [updatestate_depend_eliminate]: 3.75998e-06 [updatestate_assign_eliminate]: 2.92002e-06 [updatestate_loads_eliminate]: 2.97002e-06 [parameter_eliminate]: 2.44999e-06 [a_2]: 7.367e-05 [accelerated_algorithm]: 7.06001e-06 [shard]: 2.38002e-06 [meta_shard_fg_expand]: 2.53e-06 [shard_inline]: 7.51001e-06 [merge_send_recv]: 7.14001e-06 [auto_parallel]: 7.66001e-06 [parallel]: 7.18998e-06 [flash_sp]: 4.12e-06 [merge_comm]: 3.9e-06 [allreduce_fusion]: 3.82002e-06 [matmul_add_comm_reduction]: 9.69e-06 [allreduce_slice_to_reducescatter]: 5.40022e-07 [virtual_shard_identity]: 9.05999e-06 [virtual_dataset]: 9.24e-06 [get_grad_eliminate_]: 5.69999e-06 [virtual_output]: 5.64998e-06 [merge_forward]: 4.23999e-06 [cell_reuse_recompute_pass]: 2.32001e-06 [offload_activation]: 9.10001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.71e-05 [merge_recompute_call_nodes]: 1.27e-06 [before_grad]: 1.107e-05 [set_forward_comm_id_for_comm_node_pass]: 4.35e-06 [meta_fg_expand]: 2.10002e-06 [flash_sp_send_recv_attached]: 1.64e-06 [receive_attached]: 1.93997e-06 [after_resolve]: 1.343e-05 [a_after_grad]: 1.05e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 2.56e-06 [auto_monad_grad]: 1.45001e-06 [auto_monad_eliminator]: 1.04e-05 [cse]: 1.695e-05 [a_3]: 3.878e-05 [py_interpret_to_execute_after_opt_a]: 1.312e-05 [slice_cell_reuse_recomputed_activation]: 2.14999e-06 [rewriter_after_opt_a]: 4.143e-05 [convert_after_rewriter]: 7.87003e-06 [order_py_execute_after_rewriter]: 5.74999e-06 [mutable_eliminate]: 0.00061525 [opt_b]: 0.00021775, [1] [Cycle 1]: 0.00021018, [7] [b_1]: 0.00012531 [b_2]: 8.42998e-06 [updatestate_depend_eliminate]: 8.99e-06 [updatestate_assign_eliminate]: 2.68998e-06 [updatestate_loads_eliminate]: 2.76e-06 [renormalize]: 7.00005e-07 [cse]: 2.398e-05 [optimize_parallel_all_gather_comm]: 1.992e-05 [overlap_param_gather]: 2.64001e-06 [cconv]: 3.196e-05 [loop_unroll]: 0.00046284 [opt_after_cconv]: 0.00011649, [1] [Cycle 1]: 0.00010959, [7] [c_1]: 3.468e-05 [parameter_eliminate]: 6.19001e-06 [updatestate_depend_eliminate]: 6.96999e-06 [updatestate_assign_eliminate]: 2.74999e-06 [updatestate_loads_eliminate]: 2.17999e-06 [cse]: 2.02e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 1.384e-05 [tuple_transform]: 8.725e-05, [1] [Cycle 1]: 8.243e-05, [4] [d_1]: 5.185e-05 [none_parameter_eliminate]: 2.07999e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 7.82e-06 [partial_unused_args_eliminate]: 1.87999e-06 [add_recomputation]: 5.664e-05 [cse_after_recomputation]: 2.208e-05, [1] [Cycle 1]: 1.731e-05, [1] [cse]: 1.158e-05 [environ_conv]: 5.74999e-06 [swap_dp_allreduce_reducescatter]: 5.50001e-06 [bias_add_comm_swap]: 3.3e-06 [label_micro_interleaved_index]: 4.84e-06 [label_fine_grained_interleaved_index]: 3.09999e-06 [merge_cast_opt]: 1.59998e-06 [slice_recompute_activation]: 2.39999e-06 [micro_interleaved_order_control]: 2.34001e-06 [assign_add_opt]: 1.14e-06 [ForceFp32Comm]: 8.00006e-07 [remove_cast_before_assign_add]: 1.27999e-06 [full_micro_interleaved_order_control]: 2.44999e-06 [reorder_send_recv_between_fp_bp]: 3.23e-06 [comm_op_add_attrs]: 1.05001e-06 [add_comm_op_reuse_tag]: 1.34e-06 [interleave_split_concat_branches]: 1.12999e-06 [interleave_parallel_branches]: 1.11002e-06 [overlap_opt_shard_in_pipeline]: 1.40001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.30002e-06 [control_data_broadcast_order]: 1.438e-05 [grouped_pairwise_exchange_alltoall]: 1.77001e-06 [offloading_packed_experts]: 4.55999e-06 [overlap_recompute_and_grad_model_parallel]: 5.34e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40001e-06 [overlap_recompute_comm]: 2.59001e-06 [overlap_grad_ring_attention]: 4.08001e-06 [overlap_grad_flash_sp]: 2.257e-05 [begin_end_overlap_inline]: 4.99975e-07 [split_matmul_comm_elemetwise]: 2.07999e-06 [split_layernorm_comm]: 1.63002e-06 [handle_group_info]: 9.09989e-07 [symbol_engine_optimizer]: 8.538e-05, [1] [Cycle 1]: 8.054e-05, [6] [build]: 3.33e-06 [elim_shapecalc]: 1.377e-05 [elim_not_effective]: 1.409e-05 [opt_reshape]: 7.04001e-06 [fold_const_symbol]: 9.97999e-06 [renormalize]: 1.59984e-07 [detach_backward]: 2.83e-06 [pipeline_parallel_scheduler]: 2.26e-06 [auto_monad_reorder]: 2.345e-05 [get_jit_bprop_graph]: 2.00002e-06 [rewriter_after_jit_bprop_graph]: 5.97001e-06 [opt_after_jit_grad]: 0.0005329 [validate]: 4.389e-05 Sums bootstrap : 0.000420s : 3.70% type_inference : 0.005791s : 51.05% event_method : 0.000022s : 0.19% auto_monad : 0.000071s : 0.62% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000038s : 0.33% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.26% optimize.rewriter_before_opt_a : 0.000084s : 0.74% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000054s : 0.47% optimize.opt_a.loop_unroll : 0.000037s : 0.32% optimize.opt_a.a_1 : 0.000789s : 6.95% optimize.opt_a.with_stream_mark : 0.000040s : 0.35% optimize.opt_a.recompute_prepare : 0.000021s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000005s : 0.04% optimize.opt_a.a_2 : 0.000157s : 1.39% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.14% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.13% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000016s : 0.14% optimize.opt_a.parallel : 0.000027s : 0.24% optimize.opt_a.flash_sp : 0.000015s : 0.13% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.19% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.18% optimize.opt_a.virtual_dataset : 0.000016s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.10% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.05% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000028s : 0.24% optimize.opt_a.a_after_grad : 0.000021s : 0.19% optimize.opt_a.renormalize : 0.000783s : 6.90% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.09% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.27% optimize.opt_a.cse : 0.000049s : 0.43% optimize.opt_a.a_3 : 0.000099s : 0.87% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000041s : 0.37% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000615s : 5.42% optimize.opt_b.b_1 : 0.000125s : 1.10% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000024s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.18% optimize.overlap_param_gather : 0.000003s : 0.02% optimize.cconv : 0.000032s : 0.28% optimize.loop_unroll : 0.000463s : 4.08% optimize.opt_after_cconv.c_1 : 0.000035s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000020s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.12% optimize.tuple_transform.d_1 : 0.000052s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000057s : 0.50% optimize.cse_after_recomputation.cse : 0.000012s : 0.10% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000023s : 0.20% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.21% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000533s : 4.70% validate : 0.000044s : 0.39% Time group info: ------[substitution.] 0.000201 28 0.87% : 0.000002s : 2: substitution.elim_not_effective 0.70% : 0.000001s : 2: substitution.fold_const_symbol 3.53% : 0.000007s : 4: substitution.graph_param_transform 77.57% : 0.000156s : 4: substitution.inline 2.46% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.65% : 0.000005s : 4: substitution.remove_not_recompute_node 3.56% : 0.000007s : 4: substitution.replace_old_param 8.66% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005719 2 87.15% : 0.004984s : 1: type_inference.infer 12.85% : 0.000735s : 1: type_inference.specialize ------[replace.] 0.000064 8 62.58% : 0.000040s : 4: replace.inline 37.42% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000168 8 90.80% : 0.000153s : 4: match.inline 9.20% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000216 1278 0.84% : 0.000002s : 13: predicate.accumulaten_eliminater 0.77% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.44% : 0.000001s : 8: predicate.addn_check_dump 0.89% : 0.000002s : 13: predicate.addn_zero_filter 0.81% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.93% : 0.000004s : 21: predicate.arithmetic_simplify 0.88% : 0.000002s : 13: predicate.cast_eliminate 0.56% : 0.000001s : 8: predicate.check_bprop_eliminate 0.47% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.57% : 0.000001s : 8: predicate.depend_value_elim 0.85% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.94% : 0.000002s : 13: predicate.dict_get_item_eliminator 1.13% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.03% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 4: predicate.elim_not_effective 0.45% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.14% : 0.000002s : 17: predicate.environ_get_depend_swap 1.58% : 0.000003s : 25: predicate.environ_get_eliminate 1.00% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.39% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.50% : 0.000005s : 21: predicate.float_depend_g_call 0.46% : 0.000001s : 8: predicate.float_environ_get_switch 0.74% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.59% : 0.000001s : 8: predicate.get_grad_eliminate 0.28% : 0.000001s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.92% : 0.000015s : 58: predicate.inline 1.33% : 0.000003s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.74% : 0.000002s : 8: predicate.less_batch_normalization 1.78% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.37% : 0.000005s : 38: predicate.load_eliminater 1.35% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.21% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.72% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.49% : 0.000001s : 8: predicate.merge_addn 0.54% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 13: predicate.minmaximum_grad 1.42% : 0.000003s : 4: predicate.mutable_eliminate 0.47% : 0.000001s : 4: predicate.opt_reshape 0.43% : 0.000001s : 4: predicate.parallel_virtual_node 1.80% : 0.000004s : 21: predicate.partial_defer_inline 1.62% : 0.000004s : 21: predicate.partial_eliminate 0.99% : 0.000002s : 13: predicate.print_const_string_wrapper 0.61% : 0.000001s : 8: predicate.reduce_all_const_elim 1.05% : 0.000002s : 13: predicate.reduce_eliminate 2.38% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.69% : 0.000001s : 8: predicate.remove_not_recompute_node 1.54% : 0.000003s : 25: predicate.replace_applicator 0.54% : 0.000001s : 8: predicate.replace_old_param 0.44% : 0.000001s : 4: predicate.reset_defer_inline 0.91% : 0.000002s : 13: predicate.reshape_eliminate 0.66% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.53% : 0.000001s : 4: predicate.row_tensor_eliminate 0.75% : 0.000002s : 8: predicate.same_eliminate 0.60% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 8: predicate.shard_identity_eliminate 0.74% : 0.000002s : 8: predicate.special_op_eliminate 0.67% : 0.000001s : 8: predicate.specialize_transform 1.11% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.95% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.49% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.48% : 0.000003s : 21: predicate.switch_defer_inline 2.03% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.15% : 0.000011s : 67: predicate.switch_simplify 0.80% : 0.000002s : 13: predicate.tile_eliminate 0.96% : 0.000002s : 13: predicate.transpose_eliminate 1.52% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.31% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.32% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.33% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.61% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.33% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.00% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 4: predicate.value_based_eliminate 0.70% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000582 11 53.84% : 0.000313s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.16% : 0.000269s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026813 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.92% : 0.003465s : 1: add_attr 12.88% : 0.003452s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000060s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.29% : 0.000077s : 1: auto_monad 0.10% : 0.000028s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.67% : 0.000447s : 1: bootstrap 0.13% : 0.000036s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.09% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.16% : 0.000044s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.11% : 0.000028s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.76% : 0.000473s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.33% : 0.000626s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000023s : 1: opt.transform.mutable_eliminate 4.66% : 0.001249s : 78: opt.transform.opt_a 0.12% : 0.000033s : 1: opt.transform.opt_after_cconv 0.11% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.38% : 0.000102s : 28: opt.transform.opt_b 0.21% : 0.000057s : 2: opt.transform.opt_trans_graph 0.15% : 0.000041s : 4: opt.transform.symbol_engine_opt 11.21% : 0.003007s : 1: opt_a 0.45% : 0.000121s : 1: opt_after_cconv 2.03% : 0.000544s : 1: opt_after_jit_grad 0.83% : 0.000222s : 1: opt_b 19.65% : 0.005268s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.16% : 0.000042s : 1: pre_auto_parallel 0.13% : 0.000034s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000018s : 1: remove_dup_value 1.61% : 0.000431s : 1: renormalize.infer 1.28% : 0.000342s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000047s : 1: rewriter_after_opt_a 0.33% : 0.000088s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000006s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000088s : 1: symbol_engine_optimizer 0.34% : 0.000090s : 1: tuple_transform 21.67% : 0.005810s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:11.888.849 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:11.889.119 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0192531, [21] [bootstrap]: 0.00044898 [type_inference]: 0.00840352 [event_method]: 1.945e-05 [auto_monad]: 6.098e-05 [graph_reusing]: 6.46999e-06 [inline]: 2.63e-06 [add_attr]: 0.0034012, [1] [add_attr_with_inline]: 0.00339108, [1] [Cycle 1]: 7.9e-05, [2] [tag_attr]: 2.048e-05 [meta_addattr_fg_expand]: 5.97001e-06 [parallel-infer-symbol]: 3.26999e-06 [pre_auto_parallel]: 3.755e-05 [insert-virtual-dataset]: 2.32999e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 1.74e-06 [pipeline_split]: 1.86998e-06 [optimize]: 0.0054299, [53] [py_interpret_to_execute]: 3.202e-05 [rewriter_before_opt_a]: 8.914e-05 [opt_a]: 0.00308291, [2] [Cycle 1]: 0.00224288, [45] [expand_dump_flag]: 3.13e-06 [switch_simplify]: 4.273e-05 [loop_unroll]: 3.013e-05 [a_1]: 0.00065452 [with_stream_mark]: 1.72e-05 [recompute_prepare]: 8.90999e-06 [updatestate_depend_eliminate]: 3.7e-06 [updatestate_assign_eliminate]: 3.36999e-06 [updatestate_loads_eliminate]: 2.98e-06 [parameter_eliminate]: 1.82999e-06 [a_2]: 0.0001086 [accelerated_algorithm]: 7.3e-06 [shard]: 2.14e-06 [meta_shard_fg_expand]: 1.82001e-06 [shard_inline]: 6.69999e-06 [merge_send_recv]: 8.22e-06 [auto_parallel]: 6.37001e-06 [parallel]: 2.045e-05 [flash_sp]: 9.42001e-06 [merge_comm]: 4.35e-06 [allreduce_fusion]: 3.58999e-06 [matmul_add_comm_reduction]: 9.69e-06 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 8.23999e-06 [virtual_dataset]: 7.00998e-06 [get_grad_eliminate_]: 6.41e-06 [virtual_output]: 6.88e-06 [merge_forward]: 3.85e-06 [cell_reuse_recompute_pass]: 1.23002e-06 [offload_activation]: 9.91998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.461e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 1.013e-05 [set_forward_comm_id_for_comm_node_pass]: 3.53e-06 [meta_fg_expand]: 3.73001e-06 [flash_sp_send_recv_attached]: 2.37999e-06 [receive_attached]: 2.59999e-06 [after_resolve]: 1.22e-05 [a_after_grad]: 1.008e-05 [renormalize]: 0.00068776 [add_forward_monad_depend]: 5.54e-06 [auto_monad_grad]: 2.58998e-06 [auto_monad_eliminator]: 1.451e-05 [cse]: 2.949e-05 [a_3]: 6.198e-05 [Cycle 2]: 0.00082575, [45] [expand_dump_flag]: 1.08001e-06 [switch_simplify]: 7.43e-06 [loop_unroll]: 6.17001e-06 [a_1]: 0.00012988 [with_stream_mark]: 1.138e-05 [recompute_prepare]: 6.56e-06 [updatestate_depend_eliminate]: 3.73001e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.33998e-06 [parameter_eliminate]: 1.49e-06 [a_2]: 9.894e-05 [accelerated_algorithm]: 6.44999e-06 [shard]: 1.49e-06 [meta_shard_fg_expand]: 1.67001e-06 [shard_inline]: 6.38e-06 [merge_send_recv]: 4.80999e-06 [auto_parallel]: 5.35001e-06 [parallel]: 5.71e-06 [flash_sp]: 3.43e-06 [merge_comm]: 3.21001e-06 [allreduce_fusion]: 3.05002e-06 [matmul_add_comm_reduction]: 6.53e-06 [allreduce_slice_to_reducescatter]: 3.4002e-07 [virtual_shard_identity]: 6.86999e-06 [virtual_dataset]: 6.50997e-06 [get_grad_eliminate_]: 5.77999e-06 [virtual_output]: 5.56998e-06 [merge_forward]: 2.83e-06 [cell_reuse_recompute_pass]: 1.77001e-06 [offload_activation]: 7.63999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.434e-05 [merge_recompute_call_nodes]: 1.04e-06 [before_grad]: 9.12001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.53e-06 [meta_fg_expand]: 2.07999e-06 [flash_sp_send_recv_attached]: 9.29984e-07 [receive_attached]: 1.20001e-06 [after_resolve]: 1.11e-05 [a_after_grad]: 9.22001e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.19e-06 [auto_monad_grad]: 1.59e-06 [auto_monad_eliminator]: 8.57998e-06 [cse]: 1.441e-05 [a_3]: 4.913e-05 [py_interpret_to_execute_after_opt_a]: 1.415e-05 [slice_cell_reuse_recomputed_activation]: 5.39e-06 [rewriter_after_opt_a]: 4.039e-05 [convert_after_rewriter]: 1.055e-05 [order_py_execute_after_rewriter]: 7.82e-06 [mutable_eliminate]: 0.00058179 [opt_b]: 0.00026979, [1] [Cycle 1]: 0.00025992, [7] [b_1]: 0.00016744 [b_2]: 7.57002e-06 [updatestate_depend_eliminate]: 5.62001e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.37001e-06 [renormalize]: 9.29984e-07 [cse]: 1.833e-05 [optimize_parallel_all_gather_comm]: 1.977e-05 [overlap_param_gather]: 4.67e-06 [cconv]: 3.039e-05 [loop_unroll]: 0.00044463 [opt_after_cconv]: 0.00012511, [1] [Cycle 1]: 0.00011615, [7] [c_1]: 3.044e-05 [parameter_eliminate]: 3.06001e-06 [updatestate_depend_eliminate]: 5.13002e-06 [updatestate_assign_eliminate]: 2.61999e-06 [updatestate_loads_eliminate]: 2.36e-06 [cse]: 1.702e-05 [renormalize]: 3.39991e-07 [remove_dup_value]: 1.66e-05 [tuple_transform]: 9.142e-05, [1] [Cycle 1]: 8.335e-05, [4] [d_1]: 4.444e-05 [none_parameter_eliminate]: 2.04999e-06 [renormalize]: 2.60014e-07 [switch_simplify]: 6.92002e-06 [partial_unused_args_eliminate]: 4.34002e-06 [add_recomputation]: 5.157e-05 [cse_after_recomputation]: 2.654e-05, [1] [Cycle 1]: 1.951e-05, [1] [cse]: 1.065e-05 [environ_conv]: 7.8e-06 [swap_dp_allreduce_reducescatter]: 7.5e-06 [bias_add_comm_swap]: 5.16002e-06 [label_micro_interleaved_index]: 6.82002e-06 [label_fine_grained_interleaved_index]: 5.19e-06 [merge_cast_opt]: 3.66999e-06 [slice_recompute_activation]: 4.28999e-06 [micro_interleaved_order_control]: 4.70001e-06 [assign_add_opt]: 3.54002e-06 [ForceFp32Comm]: 3.22002e-06 [remove_cast_before_assign_add]: 3.30003e-06 [full_micro_interleaved_order_control]: 4.55999e-06 [reorder_send_recv_between_fp_bp]: 5.46e-06 [comm_op_add_attrs]: 3.28998e-06 [add_comm_op_reuse_tag]: 3.20998e-06 [interleave_split_concat_branches]: 3.46999e-06 [interleave_parallel_branches]: 3.38999e-06 [overlap_opt_shard_in_pipeline]: 3.63e-06 [overlap_opt_shard_grad_in_pipeline]: 4.45999e-06 [control_data_broadcast_order]: 1.599e-05 [grouped_pairwise_exchange_alltoall]: 4.12998e-06 [offloading_packed_experts]: 6.24001e-06 [overlap_recompute_and_grad_model_parallel]: 6.64001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.45e-06 [overlap_recompute_allgather_and_fa_grad]: 3.58999e-06 [overlap_recompute_comm]: 5.27001e-06 [overlap_grad_ring_attention]: 6.53998e-06 [overlap_grad_flash_sp]: 2.312e-05 [begin_end_overlap_inline]: 2.79999e-06 [split_matmul_comm_elemetwise]: 4.55001e-06 [split_layernorm_comm]: 3.90998e-06 [handle_group_info]: 3.36001e-06 [symbol_engine_optimizer]: 9.482e-05, [1] [Cycle 1]: 8.823e-05, [6] [build]: 2.84999e-06 [elim_shapecalc]: 9.53002e-06 [elim_not_effective]: 1.333e-05 [opt_reshape]: 7.21001e-06 [fold_const_symbol]: 1.027e-05 [renormalize]: 2.60014e-07 [detach_backward]: 3.6e-06 [pipeline_parallel_scheduler]: 2.64999e-06 [auto_monad_reorder]: 1.954e-05 [get_jit_bprop_graph]: 1.64e-06 [rewriter_after_jit_bprop_graph]: 4.38001e-06 [opt_after_jit_grad]: 0.00049468 [validate]: 3.977e-05 Sums bootstrap : 0.000449s : 3.25% type_inference : 0.008404s : 60.75% event_method : 0.000019s : 0.14% auto_monad : 0.000061s : 0.44% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000038s : 0.27% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000032s : 0.23% optimize.rewriter_before_opt_a : 0.000089s : 0.64% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000050s : 0.36% optimize.opt_a.loop_unroll : 0.000036s : 0.26% optimize.opt_a.a_1 : 0.000784s : 5.67% optimize.opt_a.with_stream_mark : 0.000029s : 0.21% optimize.opt_a.recompute_prepare : 0.000015s : 0.11% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000208s : 1.50% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.10% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.09% optimize.opt_a.merge_send_recv : 0.000013s : 0.09% optimize.opt_a.auto_parallel : 0.000012s : 0.08% optimize.opt_a.parallel : 0.000026s : 0.19% optimize.opt_a.flash_sp : 0.000013s : 0.09% optimize.opt_a.merge_comm : 0.000008s : 0.05% optimize.opt_a.allreduce_fusion : 0.000007s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.12% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.11% optimize.opt_a.virtual_dataset : 0.000014s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.09% optimize.opt_a.virtual_output : 0.000012s : 0.09% optimize.opt_a.merge_forward : 0.000007s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.13% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.21% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000019s : 0.14% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.05% optimize.opt_a.meta_fg_expand : 0.000006s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000023s : 0.17% optimize.opt_a.a_after_grad : 0.000019s : 0.14% optimize.opt_a.renormalize : 0.000688s : 4.97% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.05% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.17% optimize.opt_a.cse : 0.000044s : 0.32% optimize.opt_a.a_3 : 0.000111s : 0.80% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000040s : 0.29% optimize.convert_after_rewriter : 0.000011s : 0.08% optimize.order_py_execute_after_rewriter : 0.000008s : 0.06% optimize.mutable_eliminate : 0.000582s : 4.21% optimize.opt_b.b_1 : 0.000167s : 1.21% optimize.opt_b.b_2 : 0.000008s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000018s : 0.13% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.14% optimize.overlap_param_gather : 0.000005s : 0.03% optimize.cconv : 0.000030s : 0.22% optimize.loop_unroll : 0.000445s : 3.21% optimize.opt_after_cconv.c_1 : 0.000030s : 0.22% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.12% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.12% optimize.tuple_transform.d_1 : 0.000044s : 0.32% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.05% optimize.partial_unused_args_eliminate : 0.000004s : 0.03% optimize.add_recomputation : 0.000052s : 0.37% optimize.cse_after_recomputation.cse : 0.000011s : 0.08% optimize.environ_conv : 0.000008s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.05% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.03% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000003s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000016s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000006s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000023s : 0.17% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.03% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.02% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.07% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.10% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.05% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.07% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000003s : 0.02% auto_monad_reorder : 0.000020s : 0.14% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000495s : 3.58% validate : 0.000040s : 0.29% Time group info: ------[substitution.] 0.000189 28 1.15% : 0.000002s : 2: substitution.elim_not_effective 0.68% : 0.000001s : 2: substitution.fold_const_symbol 3.10% : 0.000006s : 4: substitution.graph_param_transform 79.51% : 0.000150s : 4: substitution.inline 2.00% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.71% : 0.000005s : 4: substitution.remove_not_recompute_node 2.61% : 0.000005s : 4: substitution.replace_old_param 8.24% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.008345 2 91.52% : 0.007637s : 1: type_inference.infer 8.48% : 0.000708s : 1: type_inference.specialize ------[replace.] 0.000061 8 62.34% : 0.000038s : 4: replace.inline 37.66% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000161 8 91.53% : 0.000148s : 4: match.inline 8.47% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000209 1278 0.93% : 0.000002s : 13: predicate.accumulaten_eliminater 0.69% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 0.94% : 0.000002s : 13: predicate.addn_zero_filter 0.84% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.10% : 0.000004s : 21: predicate.arithmetic_simplify 0.90% : 0.000002s : 13: predicate.cast_eliminate 0.57% : 0.000001s : 8: predicate.check_bprop_eliminate 0.51% : 0.000001s : 8: predicate.compare_switch_simplify 0.20% : 0.000000s : 4: predicate.const_output_eliminate 0.58% : 0.000001s : 8: predicate.depend_value_elim 0.90% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.02% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.33% : 0.000001s : 4: predicate.elim_not_effective 0.38% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_depend_swap 1.84% : 0.000004s : 25: predicate.environ_get_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.42% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.57% : 0.000005s : 21: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.74% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.63% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.47% : 0.000001s : 8: predicate.incorporate_call_switch 6.71% : 0.000014s : 58: predicate.inline 0.84% : 0.000002s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.69% : 0.000001s : 8: predicate.less_batch_normalization 1.94% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.47% : 0.000005s : 38: predicate.load_eliminater 0.84% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.25% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.62% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.88% : 0.000002s : 13: predicate.minmaximum_grad 1.00% : 0.000002s : 4: predicate.mutable_eliminate 0.38% : 0.000001s : 4: predicate.opt_reshape 0.39% : 0.000001s : 4: predicate.parallel_virtual_node 1.77% : 0.000004s : 21: predicate.partial_defer_inline 1.67% : 0.000003s : 21: predicate.partial_eliminate 0.94% : 0.000002s : 13: predicate.print_const_string_wrapper 0.61% : 0.000001s : 8: predicate.reduce_all_const_elim 1.20% : 0.000003s : 13: predicate.reduce_eliminate 2.70% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 8: predicate.remove_not_recompute_node 1.36% : 0.000003s : 25: predicate.replace_applicator 0.66% : 0.000001s : 8: predicate.replace_old_param 0.27% : 0.000001s : 4: predicate.reset_defer_inline 1.00% : 0.000002s : 13: predicate.reshape_eliminate 0.71% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 4: predicate.row_tensor_eliminate 0.87% : 0.000002s : 8: predicate.same_eliminate 0.55% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.70% : 0.000001s : 8: predicate.shard_identity_eliminate 0.74% : 0.000002s : 8: predicate.special_op_eliminate 0.78% : 0.000002s : 8: predicate.specialize_transform 0.89% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.53% : 0.000003s : 21: predicate.switch_defer_inline 2.04% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.04% : 0.000011s : 67: predicate.switch_simplify 0.85% : 0.000002s : 13: predicate.tile_eliminate 0.94% : 0.000002s : 13: predicate.transpose_eliminate 1.52% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.51% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.79% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.40% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.13% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 0.79% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.58% : 0.000001s : 8: predicate.virtual_output_eliminate 0.34% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000582 11 56.87% : 0.000331s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.13% : 0.000251s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030113 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.33% : 0.003411s : 1: add_attr 11.27% : 0.003395s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.18% : 0.000055s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.23% : 0.000069s : 1: auto_monad 0.09% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.65% : 0.000496s : 1: bootstrap 0.11% : 0.000034s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.06% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.10% : 0.000030s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000020s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.10% : 0.000029s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.50% : 0.000451s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 1.95% : 0.000588s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.05% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000015s : 1: opt.transform.mutable_eliminate 4.01% : 0.001207s : 78: opt.transform.opt_a 0.10% : 0.000029s : 1: opt.transform.opt_after_cconv 0.08% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.34% : 0.000102s : 28: opt.transform.opt_b 0.16% : 0.000049s : 2: opt.transform.opt_trans_graph 0.12% : 0.000037s : 4: opt.transform.symbol_engine_opt 10.25% : 0.003086s : 1: opt_a 0.43% : 0.000129s : 1: opt_after_cconv 1.68% : 0.000505s : 1: opt_after_jit_grad 0.91% : 0.000273s : 1: opt_b 20.06% : 0.006040s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000008s : 1: pipeline_split 0.15% : 0.000046s : 1: pre_auto_parallel 0.12% : 0.000036s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.24% : 0.000374s : 1: renormalize.infer 1.01% : 0.000305s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000044s : 1: rewriter_after_opt_a 0.31% : 0.000093s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.03% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000098s : 1: symbol_engine_optimizer 0.31% : 0.000094s : 1: tuple_transform 28.05% : 0.008447s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:12.202.308 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0292585, [21] [bootstrap]: 0.00045451 [type_inference]: 0.0188965 [event_method]: 2.244e-05 [auto_monad]: 7.023e-05 [graph_reusing]: 6.25002e-06 [inline]: 2.73e-06 [add_attr]: 0.00381426, [1] [add_attr_with_inline]: 0.00380282, [1] [Cycle 1]: 7.033e-05, [2] [tag_attr]: 2.034e-05 [meta_addattr_fg_expand]: 6.02001e-06 [parallel-infer-symbol]: 3.67998e-06 [pre_auto_parallel]: 3.779e-05 [insert-virtual-dataset]: 2.58e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 1.89999e-06 [pipeline_split]: 1.67999e-06 [optimize]: 0.00518103, [53] [py_interpret_to_execute]: 2.949e-05 [rewriter_before_opt_a]: 8.409e-05 [opt_a]: 0.00294582, [2] [Cycle 1]: 0.00218464, [45] [expand_dump_flag]: 2.96001e-06 [switch_simplify]: 4.469e-05 [loop_unroll]: 3.096e-05 [a_1]: 0.00064834 [with_stream_mark]: 2.314e-05 [recompute_prepare]: 1.212e-05 [updatestate_depend_eliminate]: 4.47e-06 [updatestate_assign_eliminate]: 2.99999e-06 [updatestate_loads_eliminate]: 2.89999e-06 [parameter_eliminate]: 1.76003e-06 [a_2]: 8.21e-05 [accelerated_algorithm]: 8.05999e-06 [shard]: 1.84e-06 [meta_shard_fg_expand]: 1.96998e-06 [shard_inline]: 6.32001e-06 [merge_send_recv]: 9.14e-06 [auto_parallel]: 7.15e-06 [parallel]: 2.098e-05 [flash_sp]: 9.44e-06 [merge_comm]: 3.97e-06 [allreduce_fusion]: 3.3e-06 [matmul_add_comm_reduction]: 1.028e-05 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 1.016e-05 [virtual_dataset]: 6.87002e-06 [get_grad_eliminate_]: 6.23998e-06 [virtual_output]: 6.16e-06 [merge_forward]: 4.90001e-06 [cell_reuse_recompute_pass]: 1.57999e-06 [offload_activation]: 1.015e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.433e-05 [merge_recompute_call_nodes]: 1.66998e-06 [before_grad]: 1.093e-05 [set_forward_comm_id_for_comm_node_pass]: 4.87e-06 [meta_fg_expand]: 2.76999e-06 [flash_sp_send_recv_attached]: 2.84001e-06 [receive_attached]: 2.69001e-06 [after_resolve]: 1.386e-05 [a_after_grad]: 1.036e-05 [renormalize]: 0.00076157 [add_forward_monad_depend]: 6.65998e-06 [auto_monad_grad]: 2.39001e-06 [auto_monad_eliminator]: 1.719e-05 [cse]: 3.109e-05 [a_3]: 5.172e-05 [Cycle 2]: 0.00074782, [45] [expand_dump_flag]: 1.49e-06 [switch_simplify]: 8.55001e-06 [loop_unroll]: 6.47001e-06 [a_1]: 0.00012982 [with_stream_mark]: 1.549e-05 [recompute_prepare]: 7.00002e-06 [updatestate_depend_eliminate]: 3.73001e-06 [updatestate_assign_eliminate]: 2.61999e-06 [updatestate_loads_eliminate]: 2.66e-06 [parameter_eliminate]: 1.96e-06 [a_2]: 7.248e-05 [accelerated_algorithm]: 6.66e-06 [shard]: 1.92001e-06 [meta_shard_fg_expand]: 1.40001e-06 [shard_inline]: 5.92999e-06 [merge_send_recv]: 6.25002e-06 [auto_parallel]: 7.3e-06 [parallel]: 6.66999e-06 [flash_sp]: 3.46001e-06 [merge_comm]: 4.61002e-06 [allreduce_fusion]: 3.10998e-06 [matmul_add_comm_reduction]: 8.45001e-06 [allreduce_slice_to_reducescatter]: 4.50003e-07 [virtual_shard_identity]: 8.50001e-06 [virtual_dataset]: 8.07e-06 [get_grad_eliminate_]: 6.64001e-06 [virtual_output]: 5.96e-06 [merge_forward]: 3.83999e-06 [cell_reuse_recompute_pass]: 1.96e-06 [offload_activation]: 9.17001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.595e-05 [merge_recompute_call_nodes]: 9.89996e-07 [before_grad]: 1.114e-05 [set_forward_comm_id_for_comm_node_pass]: 3.77002e-06 [meta_fg_expand]: 2.22001e-06 [flash_sp_send_recv_attached]: 1.22e-06 [receive_attached]: 2.21998e-06 [after_resolve]: 1.221e-05 [a_after_grad]: 9.92999e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 2.52001e-06 [auto_monad_grad]: 1.39998e-06 [auto_monad_eliminator]: 1.08e-05 [cse]: 1.775e-05 [a_3]: 4.128e-05 [py_interpret_to_execute_after_opt_a]: 1.265e-05 [slice_cell_reuse_recomputed_activation]: 2.17999e-06 [rewriter_after_opt_a]: 3.837e-05 [convert_after_rewriter]: 7.09001e-06 [order_py_execute_after_rewriter]: 5.54e-06 [mutable_eliminate]: 0.00058867 [opt_b]: 0.0002271, [1] [Cycle 1]: 0.00021918, [7] [b_1]: 0.00013159 [b_2]: 9.57001e-06 [updatestate_depend_eliminate]: 9.00999e-06 [updatestate_assign_eliminate]: 2.98e-06 [updatestate_loads_eliminate]: 2.37001e-06 [renormalize]: 7.50006e-07 [cse]: 2.368e-05 [optimize_parallel_all_gather_comm]: 1.867e-05 [overlap_param_gather]: 1.94999e-06 [cconv]: 3.147e-05 [loop_unroll]: 0.00047063 [opt_after_cconv]: 0.00011417, [1] [Cycle 1]: 0.0001074, [7] [c_1]: 3.404e-05 [parameter_eliminate]: 4.41002e-06 [updatestate_depend_eliminate]: 7.98001e-06 [updatestate_assign_eliminate]: 2.70002e-06 [updatestate_loads_eliminate]: 2.29999e-06 [cse]: 2.053e-05 [renormalize]: 5.10016e-07 [remove_dup_value]: 1.548e-05 [tuple_transform]: 7.948e-05, [1] [Cycle 1]: 7.44e-05, [4] [d_1]: 4.579e-05 [none_parameter_eliminate]: 2.02001e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 7.25e-06 [partial_unused_args_eliminate]: 1.89e-06 [add_recomputation]: 5.448e-05 [cse_after_recomputation]: 2.229e-05, [1] [Cycle 1]: 1.79e-05, [1] [cse]: 1.22e-05 [environ_conv]: 5.55001e-06 [swap_dp_allreduce_reducescatter]: 5.40001e-06 [bias_add_comm_swap]: 2.53998e-06 [label_micro_interleaved_index]: 4.97e-06 [label_fine_grained_interleaved_index]: 2.69999e-06 [merge_cast_opt]: 1.69e-06 [slice_recompute_activation]: 2.44999e-06 [micro_interleaved_order_control]: 2.57001e-06 [assign_add_opt]: 1.47001e-06 [ForceFp32Comm]: 1.28002e-06 [remove_cast_before_assign_add]: 1.04e-06 [full_micro_interleaved_order_control]: 2.14e-06 [reorder_send_recv_between_fp_bp]: 2.97002e-06 [comm_op_add_attrs]: 1.50999e-06 [add_comm_op_reuse_tag]: 1.12e-06 [interleave_split_concat_branches]: 1.20001e-06 [interleave_parallel_branches]: 1.14998e-06 [overlap_opt_shard_in_pipeline]: 1.34e-06 [overlap_opt_shard_grad_in_pipeline]: 2.29001e-06 [control_data_broadcast_order]: 1.451e-05 [grouped_pairwise_exchange_alltoall]: 1.57001e-06 [offloading_packed_experts]: 3.72002e-06 [overlap_recompute_and_grad_model_parallel]: 4.80001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.36002e-06 [overlap_recompute_comm]: 2.63e-06 [overlap_grad_ring_attention]: 4.23001e-06 [overlap_grad_flash_sp]: 2.268e-05 [begin_end_overlap_inline]: 5.90022e-07 [split_matmul_comm_elemetwise]: 2.11e-06 [split_layernorm_comm]: 1.70001e-06 [handle_group_info]: 9.79984e-07 [symbol_engine_optimizer]: 8.887e-05, [1] [Cycle 1]: 8.337e-05, [6] [build]: 3.65e-06 [elim_shapecalc]: 1.41e-05 [elim_not_effective]: 1.398e-05 [opt_reshape]: 7.65e-06 [fold_const_symbol]: 1.029e-05 [renormalize]: 6.09987e-07 [detach_backward]: 2.21e-06 [pipeline_parallel_scheduler]: 1.54e-06 [auto_monad_reorder]: 1.91e-05 [get_jit_bprop_graph]: 2.09999e-06 [rewriter_after_jit_bprop_graph]: 5.17e-06 [opt_after_jit_grad]: 0.00051746 [validate]: 4.352e-05 Sums bootstrap : 0.000455s : 1.87% type_inference : 0.018897s : 77.55% event_method : 0.000022s : 0.09% auto_monad : 0.000070s : 0.29% graph_reusing : 0.000006s : 0.03% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000038s : 0.16% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.12% optimize.rewriter_before_opt_a : 0.000084s : 0.35% optimize.opt_a.expand_dump_flag : 0.000004s : 0.02% optimize.opt_a.switch_simplify : 0.000053s : 0.22% optimize.opt_a.loop_unroll : 0.000037s : 0.15% optimize.opt_a.a_1 : 0.000778s : 3.19% optimize.opt_a.with_stream_mark : 0.000039s : 0.16% optimize.opt_a.recompute_prepare : 0.000019s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000155s : 0.63% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.06% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000012s : 0.05% optimize.opt_a.merge_send_recv : 0.000015s : 0.06% optimize.opt_a.auto_parallel : 0.000014s : 0.06% optimize.opt_a.parallel : 0.000028s : 0.11% optimize.opt_a.flash_sp : 0.000013s : 0.05% optimize.opt_a.merge_comm : 0.000009s : 0.04% optimize.opt_a.allreduce_fusion : 0.000006s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.08% optimize.opt_a.virtual_dataset : 0.000015s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.05% optimize.opt_a.virtual_output : 0.000012s : 0.05% optimize.opt_a.merge_forward : 0.000009s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000022s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.04% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000026s : 0.11% optimize.opt_a.a_after_grad : 0.000020s : 0.08% optimize.opt_a.renormalize : 0.000762s : 3.13% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.11% optimize.opt_a.cse : 0.000049s : 0.20% optimize.opt_a.a_3 : 0.000093s : 0.38% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000038s : 0.16% optimize.convert_after_rewriter : 0.000007s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000589s : 2.42% optimize.opt_b.b_1 : 0.000132s : 0.54% optimize.opt_b.b_2 : 0.000010s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.08% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000031s : 0.13% optimize.loop_unroll : 0.000471s : 1.93% optimize.opt_after_cconv.c_1 : 0.000034s : 0.14% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000021s : 0.08% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.06% optimize.tuple_transform.d_1 : 0.000046s : 0.19% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000054s : 0.22% optimize.cse_after_recomputation.cse : 0.000012s : 0.05% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000002s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000015s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000023s : 0.09% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.08% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000517s : 2.12% validate : 0.000044s : 0.18% Time group info: ------[substitution.] 0.000198 28 0.95% : 0.000002s : 2: substitution.elim_not_effective 0.76% : 0.000001s : 2: substitution.fold_const_symbol 3.05% : 0.000006s : 4: substitution.graph_param_transform 79.88% : 0.000158s : 4: substitution.inline 2.11% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.85% : 0.000006s : 4: substitution.remove_not_recompute_node 2.69% : 0.000005s : 4: substitution.replace_old_param 7.71% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.018809 2 94.82% : 0.017836s : 1: type_inference.infer 5.18% : 0.000973s : 1: type_inference.specialize ------[replace.] 0.000065 8 63.06% : 0.000041s : 4: replace.inline 36.94% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000168 8 92.09% : 0.000155s : 4: match.inline 7.91% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000212 1278 0.90% : 0.000002s : 13: predicate.accumulaten_eliminater 1.13% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 8: predicate.addn_check_dump 0.84% : 0.000002s : 13: predicate.addn_zero_filter 0.76% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.97% : 0.000004s : 21: predicate.arithmetic_simplify 1.04% : 0.000002s : 13: predicate.cast_eliminate 0.61% : 0.000001s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000000s : 4: predicate.const_output_eliminate 0.56% : 0.000001s : 8: predicate.depend_value_elim 0.94% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.01% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.82% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.18% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.47% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.01% : 0.000002s : 17: predicate.environ_get_depend_swap 1.61% : 0.000003s : 25: predicate.environ_get_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.38% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.64% : 0.000006s : 21: predicate.float_depend_g_call 0.47% : 0.000001s : 8: predicate.float_environ_get_switch 0.77% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.60% : 0.000001s : 8: predicate.get_grad_eliminate 0.29% : 0.000001s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 6.72% : 0.000014s : 58: predicate.inline 0.96% : 0.000002s : 8: predicate.inline_without_move 0.33% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.90% : 0.000002s : 8: predicate.less_batch_normalization 1.76% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.39% : 0.000005s : 38: predicate.load_eliminater 1.20% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.29% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.64% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.64% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.76% : 0.000002s : 13: predicate.minmaximum_grad 1.28% : 0.000003s : 4: predicate.mutable_eliminate 0.32% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 1.80% : 0.000004s : 21: predicate.partial_defer_inline 1.58% : 0.000003s : 21: predicate.partial_eliminate 1.03% : 0.000002s : 13: predicate.print_const_string_wrapper 0.55% : 0.000001s : 8: predicate.reduce_all_const_elim 1.10% : 0.000002s : 13: predicate.reduce_eliminate 2.44% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 8: predicate.remove_not_recompute_node 1.39% : 0.000003s : 25: predicate.replace_applicator 0.95% : 0.000002s : 8: predicate.replace_old_param 0.47% : 0.000001s : 4: predicate.reset_defer_inline 0.91% : 0.000002s : 13: predicate.reshape_eliminate 0.55% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 4: predicate.row_tensor_eliminate 0.66% : 0.000001s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.02% : 0.000002s : 8: predicate.shard_identity_eliminate 0.66% : 0.000001s : 8: predicate.special_op_eliminate 0.66% : 0.000001s : 8: predicate.specialize_transform 1.14% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.66% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.47% : 0.000003s : 21: predicate.switch_defer_inline 1.99% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.23% : 0.000011s : 67: predicate.switch_simplify 0.82% : 0.000002s : 13: predicate.tile_eliminate 0.99% : 0.000002s : 13: predicate.transpose_eliminate 1.40% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.50% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.25% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.42% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.09% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.80% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.48% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.95% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 4: predicate.value_based_eliminate 0.83% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.53% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.64% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000668 11 46.38% : 0.000310s : 5: func_graph_cloner_run.FuncGraphClonerGraph 53.62% : 0.000358s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.040396 192 0.01% : 0.000004s : 1: ForceFp32Comm 9.46% : 0.003820s : 1: add_attr 9.42% : 0.003807s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.15% : 0.000059s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.19% : 0.000076s : 1: auto_monad 0.06% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.20% : 0.000485s : 1: bootstrap 0.09% : 0.000036s : 1: cconv 0.01% : 0.000005s : 1: comm_op_add_attrs 0.05% : 0.000018s : 1: control_data_broadcast_order 0.03% : 0.000010s : 1: convert_after_rewriter 0.06% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.07% : 0.000030s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000011s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 1.19% : 0.000481s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.49% : 0.000600s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.04% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000022s : 1: opt.transform.mutable_eliminate 3.03% : 0.001224s : 78: opt.transform.opt_a 0.08% : 0.000032s : 1: opt.transform.opt_after_cconv 0.07% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.27% : 0.000108s : 28: opt.transform.opt_b 0.13% : 0.000051s : 2: opt.transform.opt_trans_graph 0.10% : 0.000042s : 4: opt.transform.symbol_engine_opt 7.30% : 0.002950s : 1: opt_a 0.29% : 0.000118s : 1: opt_after_cconv 1.31% : 0.000529s : 1: opt_after_jit_grad 0.57% : 0.000231s : 1: opt_b 12.84% : 0.005185s : 1: optimize 0.06% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.07% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.10% : 0.000042s : 1: pre_auto_parallel 0.08% : 0.000034s : 1: py_interpret_to_execute 0.04% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000019s : 1: remove_dup_value 1.03% : 0.000416s : 1: renormalize.infer 0.83% : 0.000335s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000043s : 1: rewriter_after_opt_a 0.22% : 0.000089s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.23% : 0.000091s : 1: symbol_engine_optimizer 0.20% : 0.000082s : 1: tuple_transform 46.85% : 0.018924s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:12.835.320 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:12.835.582 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0184683, [21] [bootstrap]: 0.00044084 [type_inference]: 0.00630078 [event_method]: 2.311e-05 [auto_monad]: 6.947e-05 [graph_reusing]: 7.36999e-06 [inline]: 2.48998e-06 [add_attr]: 0.00370549, [1] [add_attr_with_inline]: 0.00369429, [1] [Cycle 1]: 9.326e-05, [2] [tag_attr]: 2.191e-05 [meta_addattr_fg_expand]: 6.86999e-06 [parallel-infer-symbol]: 3.28998e-06 [pre_auto_parallel]: 3.982e-05 [insert-virtual-dataset]: 2.28002e-06 [parallel-infer-symbol-second]: 9.79984e-07 [dataset_repeat_opt]: 2.27001e-06 [pipeline_split]: 1.69e-06 [optimize]: 0.00662293, [53] [py_interpret_to_execute]: 3.518e-05 [rewriter_before_opt_a]: 9.468e-05 [opt_a]: 0.00377177, [2] [Cycle 1]: 0.00269945, [45] [expand_dump_flag]: 3.41999e-06 [switch_simplify]: 4.451e-05 [loop_unroll]: 3.154e-05 [a_1]: 0.00077167 [with_stream_mark]: 2.455e-05 [recompute_prepare]: 1.454e-05 [updatestate_depend_eliminate]: 5.59e-06 [updatestate_assign_eliminate]: 4.02002e-06 [updatestate_loads_eliminate]: 4.18999e-06 [parameter_eliminate]: 2.23998e-06 [a_2]: 0.0001403 [accelerated_algorithm]: 9.72001e-06 [shard]: 2.11e-06 [meta_shard_fg_expand]: 2.40002e-06 [shard_inline]: 8e-06 [merge_send_recv]: 1.04e-05 [auto_parallel]: 8.62e-06 [parallel]: 1.928e-05 [flash_sp]: 1.049e-05 [merge_comm]: 5.79999e-06 [allreduce_fusion]: 4.21001e-06 [matmul_add_comm_reduction]: 1.34e-05 [allreduce_slice_to_reducescatter]: 9.39996e-07 [virtual_shard_identity]: 1.291e-05 [virtual_dataset]: 8.79e-06 [get_grad_eliminate_]: 7.87003e-06 [virtual_output]: 7.73999e-06 [merge_forward]: 5.18002e-06 [cell_reuse_recompute_pass]: 1.67001e-06 [offload_activation]: 1.194e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.295e-05 [merge_recompute_call_nodes]: 2.21e-06 [before_grad]: 1.532e-05 [set_forward_comm_id_for_comm_node_pass]: 5.88002e-06 [meta_fg_expand]: 3.75e-06 [flash_sp_send_recv_attached]: 3.08e-06 [receive_attached]: 2.98e-06 [after_resolve]: 1.561e-05 [a_after_grad]: 1.34e-05 [renormalize]: 0.00083321 [add_forward_monad_depend]: 7.98999e-06 [auto_monad_grad]: 2.49999e-06 [auto_monad_eliminator]: 2.14e-05 [cse]: 3.773e-05 [a_3]: 7.687e-05 [Cycle 2]: 0.00105608, [45] [expand_dump_flag]: 1.87001e-06 [switch_simplify]: 1.011e-05 [loop_unroll]: 7.8e-06 [a_1]: 0.00018077 [with_stream_mark]: 1.828e-05 [recompute_prepare]: 9.57001e-06 [updatestate_depend_eliminate]: 4.52e-06 [updatestate_assign_eliminate]: 3.86001e-06 [updatestate_loads_eliminate]: 3.65003e-06 [parameter_eliminate]: 1.86e-06 [a_2]: 0.00012478 [accelerated_algorithm]: 9.33002e-06 [shard]: 2.21e-06 [meta_shard_fg_expand]: 2.24001e-06 [shard_inline]: 1.011e-05 [merge_send_recv]: 8.85999e-06 [auto_parallel]: 8.68001e-06 [parallel]: 7.4e-06 [flash_sp]: 4.43001e-06 [merge_comm]: 5.09998e-06 [allreduce_fusion]: 4.80001e-06 [matmul_add_comm_reduction]: 1.07e-05 [allreduce_slice_to_reducescatter]: 5.60016e-07 [virtual_shard_identity]: 1.188e-05 [virtual_dataset]: 8.11002e-06 [get_grad_eliminate_]: 7.46999e-06 [virtual_output]: 7.18998e-06 [merge_forward]: 4.75999e-06 [cell_reuse_recompute_pass]: 2.29001e-06 [offload_activation]: 1.089e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.201e-05 [merge_recompute_call_nodes]: 1.35999e-06 [before_grad]: 1.476e-05 [set_forward_comm_id_for_comm_node_pass]: 6.07999e-06 [meta_fg_expand]: 2.98e-06 [flash_sp_send_recv_attached]: 1.14998e-06 [receive_attached]: 1.84e-06 [after_resolve]: 1.377e-05 [a_after_grad]: 1.182e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 3.4e-06 [auto_monad_grad]: 1.80001e-06 [auto_monad_eliminator]: 1.437e-05 [cse]: 2.352e-05 [a_3]: 6.37e-05 [py_interpret_to_execute_after_opt_a]: 1.972e-05 [slice_cell_reuse_recomputed_activation]: 5.30999e-06 [rewriter_after_opt_a]: 5.103e-05 [convert_after_rewriter]: 1.083e-05 [order_py_execute_after_rewriter]: 9.38002e-06 [mutable_eliminate]: 0.0007836 [opt_b]: 0.0003464, [1] [Cycle 1]: 0.00033635, [7] [b_1]: 0.00021782 [b_2]: 1.105e-05 [updatestate_depend_eliminate]: 1.077e-05 [updatestate_assign_eliminate]: 3.27002e-06 [updatestate_loads_eliminate]: 3.2e-06 [renormalize]: 8.2e-07 [cse]: 2.98e-05 [optimize_parallel_all_gather_comm]: 2.388e-05 [overlap_param_gather]: 5.40001e-06 [cconv]: 3.718e-05 [loop_unroll]: 0.00048105 [opt_after_cconv]: 0.00015221, [1] [Cycle 1]: 0.00014302, [7] [c_1]: 3.87e-05 [parameter_eliminate]: 4.45999e-06 [updatestate_depend_eliminate]: 8.33999e-06 [updatestate_assign_eliminate]: 3.61999e-06 [updatestate_loads_eliminate]: 2.88998e-06 [cse]: 2.781e-05 [renormalize]: 4.7998e-07 [remove_dup_value]: 1.951e-05 [tuple_transform]: 0.00010678, [1] [Cycle 1]: 9.902e-05, [4] [d_1]: 5.417e-05 [none_parameter_eliminate]: 2.04e-06 [renormalize]: 2.70025e-07 [switch_simplify]: 9.54e-06 [partial_unused_args_eliminate]: 4.97e-06 [add_recomputation]: 6.804e-05 [cse_after_recomputation]: 3.643e-05, [1] [Cycle 1]: 2.803e-05, [1] [cse]: 1.625e-05 [environ_conv]: 1.083e-05 [swap_dp_allreduce_reducescatter]: 9.42001e-06 [bias_add_comm_swap]: 5.77001e-06 [label_micro_interleaved_index]: 7.38e-06 [label_fine_grained_interleaved_index]: 5.27001e-06 [merge_cast_opt]: 3.78001e-06 [slice_recompute_activation]: 4.94e-06 [micro_interleaved_order_control]: 5.17e-06 [assign_add_opt]: 3.94002e-06 [ForceFp32Comm]: 3.2e-06 [remove_cast_before_assign_add]: 4.15e-06 [full_micro_interleaved_order_control]: 4.87998e-06 [reorder_send_recv_between_fp_bp]: 5.34e-06 [comm_op_add_attrs]: 3.65e-06 [add_comm_op_reuse_tag]: 3.5e-06 [interleave_split_concat_branches]: 3.59002e-06 [interleave_parallel_branches]: 3.65998e-06 [overlap_opt_shard_in_pipeline]: 4.03001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.94003e-06 [control_data_broadcast_order]: 2.308e-05 [grouped_pairwise_exchange_alltoall]: 4.54002e-06 [offloading_packed_experts]: 8.72e-06 [overlap_recompute_and_grad_model_parallel]: 9.54999e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.23001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.78001e-06 [overlap_recompute_comm]: 4.75999e-06 [overlap_grad_ring_attention]: 8.13001e-06 [overlap_grad_flash_sp]: 2.96e-05 [begin_end_overlap_inline]: 3.17002e-06 [split_matmul_comm_elemetwise]: 5.03002e-06 [split_layernorm_comm]: 4.38999e-06 [handle_group_info]: 3.33e-06 [symbol_engine_optimizer]: 0.00011665, [1] [Cycle 1]: 0.00010851, [6] [build]: 4.23999e-06 [elim_shapecalc]: 1.632e-05 [elim_not_effective]: 1.662e-05 [opt_reshape]: 8.94998e-06 [fold_const_symbol]: 1.285e-05 [renormalize]: 2.30008e-07 [detach_backward]: 3.78001e-06 [pipeline_parallel_scheduler]: 2.29001e-06 [auto_monad_reorder]: 2.711e-05 [get_jit_bprop_graph]: 2.26e-06 [rewriter_after_jit_bprop_graph]: 5.76e-06 [opt_after_jit_grad]: 0.00053441 [validate]: 4.565e-05 Sums bootstrap : 0.000441s : 3.43% type_inference : 0.006301s : 49.04% event_method : 0.000023s : 0.18% auto_monad : 0.000069s : 0.54% graph_reusing : 0.000007s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000040s : 0.31% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000035s : 0.27% optimize.rewriter_before_opt_a : 0.000095s : 0.74% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.43% optimize.opt_a.loop_unroll : 0.000039s : 0.31% optimize.opt_a.a_1 : 0.000952s : 7.41% optimize.opt_a.with_stream_mark : 0.000043s : 0.33% optimize.opt_a.recompute_prepare : 0.000024s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000265s : 2.06% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.15% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000018s : 0.14% optimize.opt_a.merge_send_recv : 0.000019s : 0.15% optimize.opt_a.auto_parallel : 0.000017s : 0.13% optimize.opt_a.parallel : 0.000027s : 0.21% optimize.opt_a.flash_sp : 0.000015s : 0.12% optimize.opt_a.merge_comm : 0.000011s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.19% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000025s : 0.19% optimize.opt_a.virtual_dataset : 0.000017s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.12% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000023s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000045s : 0.35% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.03% optimize.opt_a.before_grad : 0.000030s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000029s : 0.23% optimize.opt_a.a_after_grad : 0.000025s : 0.20% optimize.opt_a.renormalize : 0.000833s : 6.49% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.09% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000036s : 0.28% optimize.opt_a.cse : 0.000061s : 0.48% optimize.opt_a.a_3 : 0.000141s : 1.09% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000051s : 0.40% optimize.convert_after_rewriter : 0.000011s : 0.08% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000784s : 6.10% optimize.opt_b.b_1 : 0.000218s : 1.70% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000030s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000037s : 0.29% optimize.loop_unroll : 0.000481s : 3.74% optimize.opt_after_cconv.c_1 : 0.000039s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000028s : 0.22% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.15% optimize.tuple_transform.d_1 : 0.000054s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000068s : 0.53% optimize.cse_after_recomputation.cse : 0.000016s : 0.13% optimize.environ_conv : 0.000011s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000023s : 0.18% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.04% optimize.offloading_packed_experts : 0.000009s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000010s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.06% optimize.overlap_grad_flash_sp : 0.000030s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.13% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000027s : 0.21% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.04% opt_after_jit_grad : 0.000534s : 4.16% validate : 0.000046s : 0.36% Time group info: ------[substitution.] 0.000244 38 12.24% : 0.000030s : 3: substitution.cast_eliminate 0.90% : 0.000002s : 3: substitution.elim_not_effective 0.73% : 0.000002s : 3: substitution.fold_const_symbol 2.65% : 0.000006s : 5: substitution.graph_param_transform 70.04% : 0.000171s : 4: substitution.inline 2.35% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.93% : 0.000007s : 6: substitution.remove_not_recompute_node 2.64% : 0.000006s : 4: substitution.replace_old_param 5.52% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006237 2 88.28% : 0.005506s : 1: type_inference.infer 11.72% : 0.000731s : 1: type_inference.specialize ------[replace.] 0.000089 8 70.47% : 0.000063s : 4: replace.inline 29.53% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000179 8 93.70% : 0.000168s : 4: match.inline 6.30% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000260 1504 0.89% : 0.000002s : 15: predicate.accumulaten_eliminater 0.74% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 10: predicate.addn_check_dump 0.81% : 0.000002s : 15: predicate.addn_zero_filter 0.80% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.04% : 0.000005s : 25: predicate.arithmetic_simplify 1.11% : 0.000003s : 15: predicate.cast_eliminate 0.76% : 0.000002s : 10: predicate.check_bprop_eliminate 0.60% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.65% : 0.000002s : 10: predicate.depend_value_elim 0.90% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.98% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.18% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 5: predicate.elim_not_effective 0.61% : 0.000002s : 5: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.05% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.07% : 0.000003s : 20: predicate.environ_get_depend_swap 1.91% : 0.000005s : 30: predicate.environ_get_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.33% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.42% : 0.000006s : 23: predicate.float_depend_g_call 0.59% : 0.000002s : 10: predicate.float_environ_get_switch 0.85% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.67% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.61% : 0.000002s : 10: predicate.incorporate_call 0.53% : 0.000001s : 10: predicate.incorporate_call_switch 6.60% : 0.000017s : 68: predicate.inline 1.02% : 0.000003s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.83% : 0.000002s : 10: predicate.less_batch_normalization 1.81% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.33% : 0.000006s : 44: predicate.load_eliminater 1.15% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.03% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.69% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.62% : 0.000002s : 10: predicate.merge_addn 0.55% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 15: predicate.minmaximum_grad 1.19% : 0.000003s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.36% : 0.000001s : 5: predicate.parallel_virtual_node 1.60% : 0.000004s : 23: predicate.partial_defer_inline 1.55% : 0.000004s : 24: predicate.partial_eliminate 0.88% : 0.000002s : 15: predicate.print_const_string_wrapper 0.74% : 0.000002s : 10: predicate.reduce_all_const_elim 1.05% : 0.000003s : 15: predicate.reduce_eliminate 2.39% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.61% : 0.000002s : 10: predicate.remove_not_recompute_node 1.46% : 0.000004s : 29: predicate.replace_applicator 0.39% : 0.000001s : 10: predicate.replace_old_param 0.33% : 0.000001s : 5: predicate.reset_defer_inline 0.93% : 0.000002s : 15: predicate.reshape_eliminate 0.58% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 5: predicate.row_tensor_eliminate 0.73% : 0.000002s : 10: predicate.same_eliminate 0.47% : 0.000001s : 10: predicate.set_cell_output_no_recompute 1.14% : 0.000003s : 10: predicate.shard_identity_eliminate 0.63% : 0.000002s : 10: predicate.special_op_eliminate 0.85% : 0.000002s : 10: predicate.specialize_transform 0.90% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.44% : 0.000004s : 23: predicate.switch_defer_inline 1.99% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.20% : 0.000014s : 74: predicate.switch_simplify 0.84% : 0.000002s : 15: predicate.tile_eliminate 0.90% : 0.000002s : 15: predicate.transpose_eliminate 1.58% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.31% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.51% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.17% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.69% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.37% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.42% : 0.000009s : 54: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 5: predicate.value_based_eliminate 0.70% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.58% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.48% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000589 11 54.82% : 0.000323s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.18% : 0.000266s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.031375 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.85% : 0.003717s : 1: add_attr 11.79% : 0.003698s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000073s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.25% : 0.000079s : 1: auto_monad 0.11% : 0.000035s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.55% : 0.000487s : 1: bootstrap 0.13% : 0.000041s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.09% : 0.000027s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000040s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000022s : 1: detach_backward 0.04% : 0.000014s : 1: environ_conv 0.11% : 0.000034s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000014s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.56% : 0.000488s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 2.52% : 0.000792s : 1: mutable_eliminate 0.04% : 0.000012s : 1: offloading_packed_experts 0.06% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000024s : 1: opt.transform.mutable_eliminate 4.86% : 0.001525s : 78: opt.transform.opt_a 0.12% : 0.000037s : 1: opt.transform.opt_after_cconv 0.10% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.47% : 0.000149s : 28: opt.transform.opt_b 0.19% : 0.000061s : 2: opt.transform.opt_trans_graph 0.16% : 0.000050s : 4: opt.transform.symbol_engine_opt 12.03% : 0.003775s : 1: opt_a 0.50% : 0.000156s : 1: opt_after_cconv 1.74% : 0.000547s : 1: opt_after_jit_grad 1.12% : 0.000350s : 1: opt_b 22.26% : 0.006983s : 1: optimize 0.09% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.11% : 0.000034s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000011s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000013s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000047s : 1: pre_auto_parallel 0.13% : 0.000040s : 1: py_interpret_to_execute 0.08% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000008s : 1: remove_cast_before_assign_add 0.07% : 0.000023s : 1: remove_dup_value 1.47% : 0.000462s : 1: renormalize.infer 1.15% : 0.000360s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000056s : 1: rewriter_after_opt_a 0.32% : 0.000099s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000119s : 1: symbol_engine_optimizer 0.35% : 0.000110s : 1: tuple_transform 20.22% : 0.006344s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:13.378.320 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0296324, [21] [bootstrap]: 0.00044829 [type_inference]: 0.00640931 [event_method]: 1.943e-05 [auto_monad]: 6.55e-05 [graph_reusing]: 6.22001e-06 [inline]: 2.74001e-06 [add_attr]: 0.00355604, [1] [add_attr_with_inline]: 0.00354255, [1] [Cycle 1]: 6.887e-05, [2] [tag_attr]: 2.247e-05 [meta_addattr_fg_expand]: 5.96e-06 [parallel-infer-symbol]: 4.08001e-06 [pre_auto_parallel]: 3.959e-05 [insert-virtual-dataset]: 2.48e-06 [parallel-infer-symbol-second]: 8.30012e-07 [dataset_repeat_opt]: 2.53998e-06 [pipeline_split]: 1.74e-06 [optimize]: 0.0183104, [53] [py_interpret_to_execute]: 3.045e-05 [rewriter_before_opt_a]: 9.305e-05 [opt_a]: 0.00330308, [2] [Cycle 1]: 0.00246849, [45] [expand_dump_flag]: 3.05002e-06 [switch_simplify]: 4.437e-05 [loop_unroll]: 3.099e-05 [a_1]: 0.00072914 [with_stream_mark]: 2.148e-05 [recompute_prepare]: 1.404e-05 [updatestate_depend_eliminate]: 5.15001e-06 [updatestate_assign_eliminate]: 3.81999e-06 [updatestate_loads_eliminate]: 3.50998e-06 [parameter_eliminate]: 2.04999e-06 [a_2]: 0.00010309 [accelerated_algorithm]: 8.83001e-06 [shard]: 2.21998e-06 [meta_shard_fg_expand]: 2.21e-06 [shard_inline]: 7.85e-06 [merge_send_recv]: 1.053e-05 [auto_parallel]: 8.57e-06 [parallel]: 2.019e-05 [flash_sp]: 1.057e-05 [merge_comm]: 5.00999e-06 [allreduce_fusion]: 4.30999e-06 [matmul_add_comm_reduction]: 1.199e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 1.387e-05 [virtual_dataset]: 8.15e-06 [get_grad_eliminate_]: 8.15e-06 [virtual_output]: 7.61999e-06 [merge_forward]: 4.85999e-06 [cell_reuse_recompute_pass]: 2.02001e-06 [offload_activation]: 1.248e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.928e-05 [merge_recompute_call_nodes]: 1.44998e-06 [before_grad]: 1.396e-05 [set_forward_comm_id_for_comm_node_pass]: 4.75999e-06 [meta_fg_expand]: 3.54002e-06 [flash_sp_send_recv_attached]: 2.71e-06 [receive_attached]: 2.43e-06 [after_resolve]: 1.487e-05 [a_after_grad]: 1.315e-05 [renormalize]: 0.00088052 [add_forward_monad_depend]: 8.04002e-06 [auto_monad_grad]: 2.81999e-06 [auto_monad_eliminator]: 2.155e-05 [cse]: 3.149e-05 [a_3]: 6.514e-05 [Cycle 2]: 0.00082187, [45] [expand_dump_flag]: 1.89999e-06 [switch_simplify]: 1.084e-05 [loop_unroll]: 7.66999e-06 [a_1]: 0.00018021 [with_stream_mark]: 1.884e-05 [recompute_prepare]: 8.64e-06 [updatestate_depend_eliminate]: 4.05e-06 [updatestate_assign_eliminate]: 3.7e-06 [updatestate_loads_eliminate]: 2.98998e-06 [parameter_eliminate]: 1.57001e-06 [a_2]: 9.427e-05 [accelerated_algorithm]: 8.45999e-06 [shard]: 1.57001e-06 [meta_shard_fg_expand]: 2.11e-06 [shard_inline]: 7.65e-06 [merge_send_recv]: 8.73001e-06 [auto_parallel]: 7.3e-06 [parallel]: 7.12002e-06 [flash_sp]: 4.48001e-06 [merge_comm]: 4.49998e-06 [allreduce_fusion]: 3.98001e-06 [matmul_add_comm_reduction]: 9.96e-06 [allreduce_slice_to_reducescatter]: 7.09988e-07 [virtual_shard_identity]: 1.011e-05 [virtual_dataset]: 7.36001e-06 [get_grad_eliminate_]: 7.29001e-06 [virtual_output]: 7.66999e-06 [merge_forward]: 3.99002e-06 [cell_reuse_recompute_pass]: 2.53e-06 [offload_activation]: 9.37001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.676e-05 [merge_recompute_call_nodes]: 1.24e-06 [before_grad]: 1.217e-05 [set_forward_comm_id_for_comm_node_pass]: 5.00999e-06 [meta_fg_expand]: 3.09999e-06 [flash_sp_send_recv_attached]: 1.36002e-06 [receive_attached]: 1.85001e-06 [after_resolve]: 1.35e-05 [a_after_grad]: 1.133e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.91e-06 [auto_monad_grad]: 1.82001e-06 [auto_monad_eliminator]: 1.092e-05 [cse]: 2.001e-05 [a_3]: 4.751e-05 [py_interpret_to_execute_after_opt_a]: 1.464e-05 [slice_cell_reuse_recomputed_activation]: 1.92999e-06 [rewriter_after_opt_a]: 4.135e-05 [convert_after_rewriter]: 8.38999e-06 [order_py_execute_after_rewriter]: 5.89999e-06 [mutable_eliminate]: 0.0131264 [opt_b]: 0.00030572, [1] [Cycle 1]: 0.0002955, [7] [b_1]: 0.000172 [b_2]: 1.171e-05 [updatestate_depend_eliminate]: 1.504e-05 [updatestate_assign_eliminate]: 3.95998e-06 [updatestate_loads_eliminate]: 3.53e-06 [renormalize]: 7.59988e-07 [cse]: 4.88e-05 [optimize_parallel_all_gather_comm]: 2.694e-05 [overlap_param_gather]: 2.47001e-06 [cconv]: 3.918e-05 [loop_unroll]: 0.00048781 [opt_after_cconv]: 0.00012983, [1] [Cycle 1]: 0.00012352, [7] [c_1]: 3.974e-05 [parameter_eliminate]: 5.77999e-06 [updatestate_depend_eliminate]: 8.12998e-06 [updatestate_assign_eliminate]: 3.26001e-06 [updatestate_loads_eliminate]: 2.86e-06 [cse]: 2.633e-05 [renormalize]: 6.69999e-07 [remove_dup_value]: 1.573e-05 [tuple_transform]: 9.579e-05, [1] [Cycle 1]: 9.097e-05, [4] [d_1]: 6.088e-05 [none_parameter_eliminate]: 2.39001e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 8.32e-06 [partial_unused_args_eliminate]: 1.96998e-06 [add_recomputation]: 7.018e-05 [cse_after_recomputation]: 2.887e-05, [1] [Cycle 1]: 2.332e-05, [1] [cse]: 1.76e-05 [environ_conv]: 1.693e-05 [swap_dp_allreduce_reducescatter]: 6.84001e-06 [bias_add_comm_swap]: 3.71001e-06 [label_micro_interleaved_index]: 4.48001e-06 [label_fine_grained_interleaved_index]: 3.07002e-06 [merge_cast_opt]: 1.32e-06 [slice_recompute_activation]: 2.12999e-06 [micro_interleaved_order_control]: 2.67001e-06 [assign_add_opt]: 1.32e-06 [ForceFp32Comm]: 1.02e-06 [remove_cast_before_assign_add]: 1.10999e-06 [full_micro_interleaved_order_control]: 2.26e-06 [reorder_send_recv_between_fp_bp]: 2.97002e-06 [comm_op_add_attrs]: 1.32e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.47001e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.27e-06 [overlap_opt_shard_grad_in_pipeline]: 2.17001e-06 [control_data_broadcast_order]: 1.687e-05 [grouped_pairwise_exchange_alltoall]: 1.55999e-06 [offloading_packed_experts]: 5.51e-06 [overlap_recompute_and_grad_model_parallel]: 5.53997e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.52001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30001e-06 [overlap_recompute_comm]: 2.12001e-06 [overlap_grad_ring_attention]: 5.12999e-06 [overlap_grad_flash_sp]: 2.638e-05 [begin_end_overlap_inline]: 7.7e-07 [split_matmul_comm_elemetwise]: 2.09e-06 [split_layernorm_comm]: 2.07001e-06 [handle_group_info]: 9.99979e-07 [symbol_engine_optimizer]: 0.00010043, [1] [Cycle 1]: 9.52e-05, [6] [build]: 4.28999e-06 [elim_shapecalc]: 1.637e-05 [elim_not_effective]: 1.768e-05 [opt_reshape]: 8.55001e-06 [fold_const_symbol]: 1.313e-05 [renormalize]: 2.29978e-07 [detach_backward]: 2.16998e-06 [pipeline_parallel_scheduler]: 1.89e-06 [auto_monad_reorder]: 2.419e-05 [get_jit_bprop_graph]: 2.12999e-06 [rewriter_after_jit_bprop_graph]: 6.26e-06 [opt_after_jit_grad]: 0.00051328 [validate]: 5.266e-05 Sums bootstrap : 0.000448s : 1.79% type_inference : 0.006409s : 25.62% event_method : 0.000019s : 0.08% auto_monad : 0.000066s : 0.26% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.09% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000040s : 0.16% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.12% optimize.rewriter_before_opt_a : 0.000093s : 0.37% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000055s : 0.22% optimize.opt_a.loop_unroll : 0.000039s : 0.15% optimize.opt_a.a_1 : 0.000909s : 3.63% optimize.opt_a.with_stream_mark : 0.000040s : 0.16% optimize.opt_a.recompute_prepare : 0.000023s : 0.09% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000197s : 0.79% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.07% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000016s : 0.06% optimize.opt_a.merge_send_recv : 0.000019s : 0.08% optimize.opt_a.auto_parallel : 0.000016s : 0.06% optimize.opt_a.parallel : 0.000027s : 0.11% optimize.opt_a.flash_sp : 0.000015s : 0.06% optimize.opt_a.merge_comm : 0.000010s : 0.04% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.10% optimize.opt_a.virtual_dataset : 0.000016s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.06% optimize.opt_a.virtual_output : 0.000015s : 0.06% optimize.opt_a.merge_forward : 0.000009s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000022s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.14% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.04% optimize.opt_a.meta_fg_expand : 0.000007s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000028s : 0.11% optimize.opt_a.a_after_grad : 0.000024s : 0.10% optimize.opt_a.renormalize : 0.000881s : 3.52% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.04% optimize.opt_a.auto_monad_grad : 0.000005s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.13% optimize.opt_a.cse : 0.000052s : 0.21% optimize.opt_a.a_3 : 0.000113s : 0.45% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000041s : 0.17% optimize.convert_after_rewriter : 0.000008s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.013126s : 52.47% optimize.opt_b.b_1 : 0.000172s : 0.69% optimize.opt_b.b_2 : 0.000012s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000015s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000049s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000027s : 0.11% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000039s : 0.16% optimize.loop_unroll : 0.000488s : 1.95% optimize.opt_after_cconv.c_1 : 0.000040s : 0.16% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000026s : 0.11% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.06% optimize.tuple_transform.d_1 : 0.000061s : 0.24% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000070s : 0.28% optimize.cse_after_recomputation.cse : 0.000018s : 0.07% optimize.environ_conv : 0.000017s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.03% optimize.bias_add_comm_swap : 0.000004s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000006s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000026s : 0.11% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.07% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.10% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.03% opt_after_jit_grad : 0.000513s : 2.05% validate : 0.000053s : 0.21% Time group info: ------[substitution.] 0.000244 38 11.85% : 0.000029s : 3: substitution.cast_eliminate 1.12% : 0.000003s : 3: substitution.elim_not_effective 0.68% : 0.000002s : 3: substitution.fold_const_symbol 3.22% : 0.000008s : 5: substitution.graph_param_transform 71.06% : 0.000173s : 4: substitution.inline 1.99% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.60% : 0.000006s : 6: substitution.remove_not_recompute_node 2.25% : 0.000006s : 4: substitution.replace_old_param 5.21% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006338 2 86.83% : 0.005503s : 1: type_inference.infer 13.17% : 0.000834s : 1: type_inference.specialize ------[replace.] 0.000068 8 60.73% : 0.000042s : 4: replace.inline 39.27% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000181 8 94.10% : 0.000170s : 4: match.inline 5.90% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000254 1504 0.80% : 0.000002s : 15: predicate.accumulaten_eliminater 0.71% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 10: predicate.addn_check_dump 0.92% : 0.000002s : 15: predicate.addn_zero_filter 0.83% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.96% : 0.000005s : 25: predicate.arithmetic_simplify 0.94% : 0.000002s : 15: predicate.cast_eliminate 0.59% : 0.000001s : 10: predicate.check_bprop_eliminate 0.55% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.62% : 0.000002s : 10: predicate.depend_value_elim 0.87% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.95% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.96% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 5: predicate.elim_not_effective 0.38% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.21% : 0.000003s : 20: predicate.environ_get_depend_swap 1.76% : 0.000004s : 30: predicate.environ_get_eliminate 1.07% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.27% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.52% : 0.000006s : 23: predicate.float_depend_g_call 0.56% : 0.000001s : 10: predicate.float_environ_get_switch 0.83% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.66% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000000s : 5: predicate.graph_param_transform 0.63% : 0.000002s : 10: predicate.incorporate_call 0.55% : 0.000001s : 10: predicate.incorporate_call_switch 6.60% : 0.000017s : 68: predicate.inline 1.01% : 0.000003s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.89% : 0.000002s : 10: predicate.less_batch_normalization 1.73% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.38% : 0.000006s : 44: predicate.load_eliminater 1.00% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.04% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.67% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 10: predicate.merge_addn 0.59% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.60% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 15: predicate.minmaximum_grad 2.82% : 0.000007s : 5: predicate.mutable_eliminate 0.38% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.67% : 0.000004s : 23: predicate.partial_defer_inline 1.52% : 0.000004s : 24: predicate.partial_eliminate 0.79% : 0.000002s : 15: predicate.print_const_string_wrapper 0.63% : 0.000002s : 10: predicate.reduce_all_const_elim 1.19% : 0.000003s : 15: predicate.reduce_eliminate 2.45% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 10: predicate.remove_not_recompute_node 1.31% : 0.000003s : 29: predicate.replace_applicator 0.59% : 0.000002s : 10: predicate.replace_old_param 0.47% : 0.000001s : 5: predicate.reset_defer_inline 0.87% : 0.000002s : 15: predicate.reshape_eliminate 0.62% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 5: predicate.row_tensor_eliminate 0.67% : 0.000002s : 10: predicate.same_eliminate 0.58% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.71% : 0.000002s : 10: predicate.shard_identity_eliminate 0.69% : 0.000002s : 10: predicate.special_op_eliminate 0.73% : 0.000002s : 10: predicate.specialize_transform 1.05% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.38% : 0.000003s : 23: predicate.switch_defer_inline 1.83% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.52% : 0.000011s : 74: predicate.switch_simplify 0.78% : 0.000002s : 15: predicate.tile_eliminate 0.91% : 0.000002s : 15: predicate.transpose_eliminate 1.52% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.49% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.68% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.29% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.68% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.26% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.14% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.66% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.63% : 0.000002s : 10: predicate.virtual_output_eliminate 0.31% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.55% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000597 11 50.74% : 0.000303s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.26% : 0.000294s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.054070 192 0.01% : 0.000004s : 1: ForceFp32Comm 6.59% : 0.003562s : 1: add_attr 6.56% : 0.003547s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.14% : 0.000075s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.13% : 0.000070s : 1: auto_monad 0.05% : 0.000028s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.88% : 0.000478s : 1: bootstrap 0.08% : 0.000043s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000021s : 1: control_data_broadcast_order 0.02% : 0.000012s : 1: convert_after_rewriter 0.06% : 0.000033s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.04% : 0.000021s : 1: environ_conv 0.05% : 0.000026s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.92% : 0.000497s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 24.31% : 0.013144s : 1: mutable_eliminate 0.02% : 0.000009s : 1: offloading_packed_experts 0.04% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000040s : 1: opt.transform.mutable_eliminate 2.69% : 0.001457s : 78: opt.transform.opt_a 0.07% : 0.000038s : 1: opt.transform.opt_after_cconv 0.06% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.27% : 0.000145s : 28: opt.transform.opt_b 0.12% : 0.000067s : 2: opt.transform.opt_trans_graph 0.10% : 0.000051s : 4: opt.transform.symbol_engine_opt 6.12% : 0.003307s : 1: opt_a 0.25% : 0.000134s : 1: opt_after_cconv 0.97% : 0.000524s : 1: opt_after_jit_grad 0.57% : 0.000311s : 1: opt_b 33.88% : 0.018317s : 1: optimize 0.06% : 0.000031s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.06% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000044s : 1: pre_auto_parallel 0.06% : 0.000035s : 1: py_interpret_to_execute 0.03% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000005s : 1: remove_cast_before_assign_add 0.04% : 0.000019s : 1: remove_dup_value 0.89% : 0.000481s : 1: renormalize.infer 0.72% : 0.000389s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000045s : 1: rewriter_after_opt_a 0.18% : 0.000097s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.19% : 0.000103s : 1: symbol_engine_optimizer 0.18% : 0.000099s : 1: tuple_transform 11.89% : 0.006429s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:13.955.399 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:13.955.680 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0406069, [21] [bootstrap]: 0.00047109 [type_inference]: 0.00620018 [event_method]: 2.2e-05 [auto_monad]: 6.991e-05 [graph_reusing]: 5.52999e-06 [inline]: 2.47001e-06 [add_attr]: 0.00317403, [1] [add_attr_with_inline]: 0.00316395, [1] [Cycle 1]: 7.837e-05, [2] [tag_attr]: 2.073e-05 [meta_addattr_fg_expand]: 6.57002e-06 [parallel-infer-symbol]: 3.04999e-06 [pre_auto_parallel]: 3.783e-05 [insert-virtual-dataset]: 2.34999e-06 [parallel-infer-symbol-second]: 7.99977e-07 [dataset_repeat_opt]: 2.27999e-06 [pipeline_split]: 1.61998e-06 [optimize]: 0.0293325, [53] [py_interpret_to_execute]: 3.846e-05 [rewriter_before_opt_a]: 9.47e-05 [opt_a]: 0.0262585, [2] [Cycle 1]: 0.0250333, [45] [expand_dump_flag]: 2.98e-06 [switch_simplify]: 4.54e-05 [loop_unroll]: 3.38e-05 [a_1]: 0.0007698 [with_stream_mark]: 2.26e-05 [recompute_prepare]: 1.438e-05 [updatestate_depend_eliminate]: 6.46999e-06 [updatestate_assign_eliminate]: 5.37999e-06 [updatestate_loads_eliminate]: 4.62998e-06 [parameter_eliminate]: 2.29999e-06 [a_2]: 0.0001543 [accelerated_algorithm]: 1.05e-05 [shard]: 1.94999e-06 [meta_shard_fg_expand]: 2.61999e-06 [shard_inline]: 9.77999e-06 [merge_send_recv]: 1.105e-05 [auto_parallel]: 9.17999e-06 [parallel]: 1.972e-05 [flash_sp]: 1.023e-05 [merge_comm]: 5.92999e-06 [allreduce_fusion]: 5.20999e-06 [matmul_add_comm_reduction]: 1.222e-05 [allreduce_slice_to_reducescatter]: 9.39996e-07 [virtual_shard_identity]: 1.236e-05 [virtual_dataset]: 9.34e-06 [get_grad_eliminate_]: 9.17001e-06 [virtual_output]: 9.49e-06 [merge_forward]: 5.79999e-06 [cell_reuse_recompute_pass]: 2.04999e-06 [offload_activation]: 1.194e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.27e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.607e-05 [set_forward_comm_id_for_comm_node_pass]: 6.04001e-06 [meta_fg_expand]: 4.09002e-06 [flash_sp_send_recv_attached]: 3.61999e-06 [receive_attached]: 2.63e-06 [after_resolve]: 1.54e-05 [a_after_grad]: 1.489e-05 [renormalize]: 0.0230499 [add_forward_monad_depend]: 1.216e-05 [auto_monad_grad]: 2.71e-06 [auto_monad_eliminator]: 3.382e-05 [cse]: 4.992e-05 [a_3]: 9.975e-05 [Cycle 2]: 0.00120628, [45] [expand_dump_flag]: 2.98e-06 [switch_simplify]: 1.295e-05 [loop_unroll]: 9.41e-06 [a_1]: 0.00023625 [with_stream_mark]: 2.523e-05 [recompute_prepare]: 1.252e-05 [updatestate_depend_eliminate]: 5.96998e-06 [updatestate_assign_eliminate]: 4.63999e-06 [updatestate_loads_eliminate]: 7.71999e-06 [parameter_eliminate]: 2.14e-06 [a_2]: 0.00014196 [accelerated_algorithm]: 1.158e-05 [shard]: 3.35998e-06 [meta_shard_fg_expand]: 2.94001e-06 [shard_inline]: 9.25001e-06 [merge_send_recv]: 1.153e-05 [auto_parallel]: 1.289e-05 [parallel]: 1.094e-05 [flash_sp]: 5.09003e-06 [merge_comm]: 5.86e-06 [allreduce_fusion]: 5.97999e-06 [matmul_add_comm_reduction]: 1.368e-05 [allreduce_slice_to_reducescatter]: 1.10999e-06 [virtual_shard_identity]: 1.261e-05 [virtual_dataset]: 9.58002e-06 [get_grad_eliminate_]: 9.24e-06 [virtual_output]: 8.44002e-06 [merge_forward]: 6.40002e-06 [cell_reuse_recompute_pass]: 3.26001e-06 [offload_activation]: 1.424e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.398e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 1.656e-05 [set_forward_comm_id_for_comm_node_pass]: 6.46999e-06 [meta_fg_expand]: 3.95e-06 [flash_sp_send_recv_attached]: 2.16e-06 [receive_attached]: 2.77002e-06 [after_resolve]: 1.702e-05 [a_after_grad]: 1.464e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.53e-06 [auto_monad_grad]: 2.07999e-06 [auto_monad_eliminator]: 1.369e-05 [cse]: 3.04e-05 [a_3]: 7.137e-05 [py_interpret_to_execute_after_opt_a]: 2.655e-05 [slice_cell_reuse_recomputed_activation]: 5.50001e-06 [rewriter_after_opt_a]: 6.198e-05 [convert_after_rewriter]: 1.277e-05 [order_py_execute_after_rewriter]: 9.46e-06 [mutable_eliminate]: 0.00077028 [opt_b]: 0.00037899, [1] [Cycle 1]: 0.00036595, [7] [b_1]: 0.00023202 [b_2]: 1.247e-05 [updatestate_depend_eliminate]: 1.32e-05 [updatestate_assign_eliminate]: 4.17e-06 [updatestate_loads_eliminate]: 4.00998e-06 [renormalize]: 8.2e-07 [cse]: 3.774e-05 [optimize_parallel_all_gather_comm]: 2.554e-05 [overlap_param_gather]: 4.98001e-06 [cconv]: 3.599e-05 [loop_unroll]: 0.00052192 [opt_after_cconv]: 0.00020717, [1] [Cycle 1]: 0.00019654, [7] [c_1]: 4.503e-05 [parameter_eliminate]: 4.41002e-06 [updatestate_depend_eliminate]: 8.75001e-06 [updatestate_assign_eliminate]: 3.98001e-06 [updatestate_loads_eliminate]: 3.43999e-06 [cse]: 6.733e-05 [renormalize]: 5.69999e-07 [remove_dup_value]: 5.681e-05 [tuple_transform]: 0.00012368, [1] [Cycle 1]: 0.00011529, [4] [d_1]: 6.651e-05 [none_parameter_eliminate]: 2.84001e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 1.158e-05 [partial_unused_args_eliminate]: 4.47998e-06 [add_recomputation]: 7.157e-05 [cse_after_recomputation]: 5.503e-05, [1] [Cycle 1]: 4.699e-05, [1] [cse]: 2.036e-05 [environ_conv]: 1.028e-05 [swap_dp_allreduce_reducescatter]: 9.97999e-06 [bias_add_comm_swap]: 5.99999e-06 [label_micro_interleaved_index]: 6.99001e-06 [label_fine_grained_interleaved_index]: 5.25999e-06 [merge_cast_opt]: 3.58e-06 [slice_recompute_activation]: 4.63001e-06 [micro_interleaved_order_control]: 5.12999e-06 [assign_add_opt]: 3.71999e-06 [ForceFp32Comm]: 3.51001e-06 [remove_cast_before_assign_add]: 3.36001e-06 [full_micro_interleaved_order_control]: 4.53999e-06 [reorder_send_recv_between_fp_bp]: 5.15999e-06 [comm_op_add_attrs]: 3.78001e-06 [add_comm_op_reuse_tag]: 3.91999e-06 [interleave_split_concat_branches]: 3.89002e-06 [interleave_parallel_branches]: 3.29001e-06 [overlap_opt_shard_in_pipeline]: 3.46999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.79e-06 [control_data_broadcast_order]: 2.344e-05 [grouped_pairwise_exchange_alltoall]: 4.17e-06 [offloading_packed_experts]: 8.47e-06 [overlap_recompute_and_grad_model_parallel]: 8.07e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.31002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.85998e-06 [overlap_recompute_comm]: 5.37999e-06 [overlap_grad_ring_attention]: 8.12e-06 [overlap_grad_flash_sp]: 3.006e-05 [begin_end_overlap_inline]: 3.11999e-06 [split_matmul_comm_elemetwise]: 4.65001e-06 [split_layernorm_comm]: 4e-06 [handle_group_info]: 3.51999e-06 [symbol_engine_optimizer]: 0.00012717, [1] [Cycle 1]: 0.00011859, [6] [build]: 4.50001e-06 [elim_shapecalc]: 1.81e-05 [elim_not_effective]: 1.863e-05 [opt_reshape]: 1.04e-05 [fold_const_symbol]: 1.499e-05 [renormalize]: 4.49974e-07 [detach_backward]: 5.23002e-06 [pipeline_parallel_scheduler]: 2.20002e-06 [auto_monad_reorder]: 2.771e-05 [get_jit_bprop_graph]: 2.22999e-06 [rewriter_after_jit_bprop_graph]: 6.68e-06 [opt_after_jit_grad]: 0.00055451 [validate]: 4.98e-05 Sums bootstrap : 0.000471s : 1.33% type_inference : 0.006200s : 17.50% event_method : 0.000022s : 0.06% auto_monad : 0.000070s : 0.20% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000038s : 0.11% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000038s : 0.11% optimize.rewriter_before_opt_a : 0.000095s : 0.27% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000058s : 0.16% optimize.opt_a.loop_unroll : 0.000043s : 0.12% optimize.opt_a.a_1 : 0.001006s : 2.84% optimize.opt_a.with_stream_mark : 0.000048s : 0.13% optimize.opt_a.recompute_prepare : 0.000027s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000012s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000010s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000012s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000296s : 0.84% optimize.opt_a.accelerated_algorithm : 0.000022s : 0.06% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.02% optimize.opt_a.shard_inline : 0.000019s : 0.05% optimize.opt_a.merge_send_recv : 0.000023s : 0.06% optimize.opt_a.auto_parallel : 0.000022s : 0.06% optimize.opt_a.parallel : 0.000031s : 0.09% optimize.opt_a.flash_sp : 0.000015s : 0.04% optimize.opt_a.merge_comm : 0.000012s : 0.03% optimize.opt_a.allreduce_fusion : 0.000011s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000025s : 0.07% optimize.opt_a.virtual_dataset : 0.000019s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.05% optimize.opt_a.virtual_output : 0.000018s : 0.05% optimize.opt_a.merge_forward : 0.000012s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000026s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000047s : 0.13% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000033s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000013s : 0.04% optimize.opt_a.meta_fg_expand : 0.000008s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000032s : 0.09% optimize.opt_a.a_after_grad : 0.000030s : 0.08% optimize.opt_a.renormalize : 0.023050s : 65.05% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.04% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000048s : 0.13% optimize.opt_a.cse : 0.000080s : 0.23% optimize.opt_a.a_3 : 0.000171s : 0.48% optimize.py_interpret_to_execute_after_opt_a : 0.000027s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.02% optimize.rewriter_after_opt_a : 0.000062s : 0.17% optimize.convert_after_rewriter : 0.000013s : 0.04% optimize.order_py_execute_after_rewriter : 0.000009s : 0.03% optimize.mutable_eliminate : 0.000770s : 2.17% optimize.opt_b.b_1 : 0.000232s : 0.65% optimize.opt_b.b_2 : 0.000012s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000013s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000038s : 0.11% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.07% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000036s : 0.10% optimize.loop_unroll : 0.000522s : 1.47% optimize.opt_after_cconv.c_1 : 0.000045s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000067s : 0.19% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000057s : 0.16% optimize.tuple_transform.d_1 : 0.000067s : 0.19% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000012s : 0.03% optimize.partial_unused_args_eliminate : 0.000004s : 0.01% optimize.add_recomputation : 0.000072s : 0.20% optimize.cse_after_recomputation.cse : 0.000020s : 0.06% optimize.environ_conv : 0.000010s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.03% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000023s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000008s : 0.02% optimize.overlap_grad_flash_sp : 0.000030s : 0.08% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000018s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000028s : 0.08% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.02% opt_after_jit_grad : 0.000555s : 1.56% validate : 0.000050s : 0.14% Time group info: ------[substitution.] 0.000246 48 16.65% : 0.000041s : 6: substitution.cast_eliminate 1.06% : 0.000003s : 4: substitution.elim_not_effective 0.89% : 0.000002s : 4: substitution.fold_const_symbol 3.30% : 0.000008s : 6: substitution.graph_param_transform 63.84% : 0.000157s : 4: substitution.inline 2.81% : 0.000007s : 8: substitution.j_node_and_user_rematch 3.65% : 0.000009s : 8: substitution.remove_not_recompute_node 2.66% : 0.000007s : 4: substitution.replace_old_param 5.15% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006144 2 87.69% : 0.005388s : 1: type_inference.infer 12.31% : 0.000756s : 1: type_inference.specialize ------[replace.] 0.000073 8 59.83% : 0.000044s : 4: replace.inline 40.17% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000165 8 93.42% : 0.000154s : 4: match.inline 6.58% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000304 1730 0.94% : 0.000003s : 17: predicate.accumulaten_eliminater 0.72% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.59% : 0.000002s : 12: predicate.addn_check_dump 0.89% : 0.000003s : 17: predicate.addn_zero_filter 0.77% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.41% : 0.000007s : 29: predicate.arithmetic_simplify 1.23% : 0.000004s : 17: predicate.cast_eliminate 0.79% : 0.000002s : 12: predicate.check_bprop_eliminate 0.71% : 0.000002s : 12: predicate.compare_switch_simplify 0.17% : 0.000001s : 6: predicate.const_output_eliminate 0.62% : 0.000002s : 12: predicate.depend_value_elim 0.98% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.07% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.83% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.00% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.17% : 0.000001s : 6: predicate.elim_not_effective 0.54% : 0.000002s : 6: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000004s : 23: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.19% : 0.000004s : 23: predicate.environ_get_depend_swap 1.81% : 0.000005s : 35: predicate.environ_get_eliminate 1.17% : 0.000004s : 23: predicate.environ_get_set_eliminate 1.21% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.10% : 0.000006s : 25: predicate.float_depend_g_call 0.57% : 0.000002s : 12: predicate.float_environ_get_switch 0.92% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 6: predicate.fold_const_symbol 0.66% : 0.000002s : 12: predicate.get_grad_eliminate 0.19% : 0.000001s : 6: predicate.graph_param_transform 0.63% : 0.000002s : 12: predicate.incorporate_call 0.51% : 0.000002s : 12: predicate.incorporate_call_switch 6.55% : 0.000020s : 78: predicate.inline 1.10% : 0.000003s : 12: predicate.inline_without_move 0.34% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.78% : 0.000002s : 12: predicate.less_batch_normalization 1.90% : 0.000006s : 33: predicate.list_to_tuple_eliminator_ 2.44% : 0.000007s : 50: predicate.load_eliminater 0.87% : 0.000003s : 6: predicate.loop_unroll_after_grad 1.91% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.65% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.61% : 0.000002s : 12: predicate.merge_addn 0.60% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.63% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 17: predicate.minmaximum_grad 1.27% : 0.000004s : 6: predicate.mutable_eliminate 0.35% : 0.000001s : 6: predicate.opt_reshape 0.48% : 0.000001s : 6: predicate.parallel_virtual_node 1.58% : 0.000005s : 25: predicate.partial_defer_inline 1.52% : 0.000005s : 27: predicate.partial_eliminate 0.84% : 0.000003s : 17: predicate.print_const_string_wrapper 0.64% : 0.000002s : 12: predicate.reduce_all_const_elim 1.19% : 0.000004s : 17: predicate.reduce_eliminate 2.50% : 0.000008s : 50: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 12: predicate.remove_not_recompute_node 1.33% : 0.000004s : 33: predicate.replace_applicator 0.54% : 0.000002s : 12: predicate.replace_old_param 0.43% : 0.000001s : 6: predicate.reset_defer_inline 1.01% : 0.000003s : 17: predicate.reshape_eliminate 0.70% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 6: predicate.row_tensor_eliminate 0.94% : 0.000003s : 12: predicate.same_eliminate 0.40% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.90% : 0.000003s : 12: predicate.shard_identity_eliminate 0.70% : 0.000002s : 12: predicate.special_op_eliminate 0.93% : 0.000003s : 12: predicate.specialize_transform 1.14% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000003s : 12: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.35% : 0.000004s : 25: predicate.switch_defer_inline 1.93% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.55% : 0.000014s : 81: predicate.switch_simplify 0.94% : 0.000003s : 17: predicate.tile_eliminate 0.86% : 0.000003s : 17: predicate.transpose_eliminate 1.49% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000005s : 29: predicate.tuple_list_get_item_depend_reorder 3.13% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.46% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.65% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.32% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.01% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 6: predicate.value_based_eliminate 0.72% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.78% : 0.000002s : 12: predicate.virtual_output_eliminate 0.30% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.52% : 0.000002s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000647 11 48.90% : 0.000316s : 5: func_graph_cloner_run.FuncGraphClonerGraph 51.10% : 0.000331s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.098099 192 0.01% : 0.000006s : 1: ForceFp32Comm 3.24% : 0.003183s : 1: add_attr 3.23% : 0.003168s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.08% : 0.000076s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.08% : 0.000078s : 1: auto_monad 0.04% : 0.000036s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.53% : 0.000519s : 1: bootstrap 0.04% : 0.000040s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.03% : 0.000027s : 1: control_data_broadcast_order 0.02% : 0.000016s : 1: convert_after_rewriter 0.06% : 0.000059s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000027s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.03% : 0.000032s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000012s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.54% : 0.000529s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.79% : 0.000779s : 1: mutable_eliminate 0.01% : 0.000012s : 1: offloading_packed_experts 0.02% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000028s : 1: opt.transform.mutable_eliminate 1.70% : 0.001671s : 78: opt.transform.opt_a 0.04% : 0.000043s : 1: opt.transform.opt_after_cconv 0.04% : 0.000037s : 1: opt.transform.opt_after_jit_grad 0.17% : 0.000169s : 28: opt.transform.opt_b 0.08% : 0.000076s : 2: opt.transform.opt_trans_graph 0.06% : 0.000058s : 4: opt.transform.symbol_engine_opt 26.77% : 0.026263s : 1: opt_a 0.22% : 0.000211s : 1: opt_after_cconv 0.58% : 0.000566s : 1: opt_after_jit_grad 0.39% : 0.000383s : 1: opt_b 30.28% : 0.029703s : 1: optimize 0.03% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000013s : 1: order_py_execute_after_rewriter 0.03% : 0.000033s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000011s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.05% : 0.000046s : 1: pre_auto_parallel 0.04% : 0.000042s : 1: py_interpret_to_execute 0.03% : 0.000030s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.06% : 0.000061s : 1: remove_dup_value 22.91% : 0.022470s : 1: renormalize.infer 0.57% : 0.000560s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000066s : 1: rewriter_after_opt_a 0.10% : 0.000099s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.000130s : 1: symbol_engine_optimizer 0.13% : 0.000127s : 1: tuple_transform 6.36% : 0.006240s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:14.546.363 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0375517, [21] [bootstrap]: 0.00039513 [type_inference]: 0.0265346 [event_method]: 2.158e-05 [auto_monad]: 6.889e-05 [graph_reusing]: 5.92999e-06 [inline]: 2.61e-06 [add_attr]: 0.00368205, [1] [add_attr_with_inline]: 0.00366995, [1] [Cycle 1]: 7.313e-05, [2] [tag_attr]: 2.182e-05 [meta_addattr_fg_expand]: 6.38e-06 [parallel-infer-symbol]: 3.73001e-06 [pre_auto_parallel]: 4.086e-05 [insert-virtual-dataset]: 2.69001e-06 [parallel-infer-symbol-second]: 8.89995e-07 [dataset_repeat_opt]: 2.14999e-06 [pipeline_split]: 1.57999e-06 [optimize]: 0.00603275, [53] [py_interpret_to_execute]: 3.263e-05 [rewriter_before_opt_a]: 0.00010086 [opt_a]: 0.00353478, [2] [Cycle 1]: 0.00258587, [45] [expand_dump_flag]: 3.4e-06 [switch_simplify]: 4.725e-05 [loop_unroll]: 3.23e-05 [a_1]: 0.00075504 [with_stream_mark]: 2.298e-05 [recompute_prepare]: 1.533e-05 [updatestate_depend_eliminate]: 5.61e-06 [updatestate_assign_eliminate]: 4.76002e-06 [updatestate_loads_eliminate]: 4.09002e-06 [parameter_eliminate]: 2.23002e-06 [a_2]: 0.00012253 [accelerated_algorithm]: 1.038e-05 [shard]: 2.37999e-06 [meta_shard_fg_expand]: 2.90002e-06 [shard_inline]: 9.37999e-06 [merge_send_recv]: 1.117e-05 [auto_parallel]: 8.95001e-06 [parallel]: 1.963e-05 [flash_sp]: 1.052e-05 [merge_comm]: 5.98998e-06 [allreduce_fusion]: 4.74998e-06 [matmul_add_comm_reduction]: 1.187e-05 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 1.321e-05 [virtual_dataset]: 8.84e-06 [get_grad_eliminate_]: 8.37e-06 [virtual_output]: 9.15999e-06 [merge_forward]: 5.54e-06 [cell_reuse_recompute_pass]: 1.57001e-06 [offload_activation]: 1.307e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.054e-05 [merge_recompute_call_nodes]: 1.91998e-06 [before_grad]: 1.657e-05 [set_forward_comm_id_for_comm_node_pass]: 5.59998e-06 [meta_fg_expand]: 4.12e-06 [flash_sp_send_recv_attached]: 2.83e-06 [receive_attached]: 2.52001e-06 [after_resolve]: 1.453e-05 [a_after_grad]: 1.436e-05 [renormalize]: 0.00090981 [add_forward_monad_depend]: 6.02001e-06 [auto_monad_grad]: 3.00002e-06 [auto_monad_eliminator]: 2.194e-05 [cse]: 4.544e-05 [a_3]: 6.921e-05 [Cycle 2]: 0.00093564, [45] [expand_dump_flag]: 1.60999e-06 [switch_simplify]: 1.032e-05 [loop_unroll]: 8.75001e-06 [a_1]: 0.0002137 [with_stream_mark]: 1.869e-05 [recompute_prepare]: 9.19998e-06 [updatestate_depend_eliminate]: 5.44e-06 [updatestate_assign_eliminate]: 4.28999e-06 [updatestate_loads_eliminate]: 3.83001e-06 [parameter_eliminate]: 1.74e-06 [a_2]: 0.00011479 [accelerated_algorithm]: 9.41e-06 [shard]: 1.49e-06 [meta_shard_fg_expand]: 2.67001e-06 [shard_inline]: 9.34e-06 [merge_send_recv]: 9.47999e-06 [auto_parallel]: 8.18001e-06 [parallel]: 7.85998e-06 [flash_sp]: 5.04e-06 [merge_comm]: 5.30999e-06 [allreduce_fusion]: 4.57998e-06 [matmul_add_comm_reduction]: 1.079e-05 [allreduce_slice_to_reducescatter]: 5.40022e-07 [virtual_shard_identity]: 1.004e-05 [virtual_dataset]: 8.26002e-06 [get_grad_eliminate_]: 8.16002e-06 [virtual_output]: 8.76002e-06 [merge_forward]: 5.29e-06 [cell_reuse_recompute_pass]: 2.21e-06 [offload_activation]: 1.107e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.961e-05 [merge_recompute_call_nodes]: 1.07e-06 [before_grad]: 1.469e-05 [set_forward_comm_id_for_comm_node_pass]: 6.06e-06 [meta_fg_expand]: 3.33e-06 [flash_sp_send_recv_attached]: 1.19998e-06 [receive_attached]: 2.09999e-06 [after_resolve]: 1.391e-05 [a_after_grad]: 1.33e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.55997e-06 [auto_monad_grad]: 1.86e-06 [auto_monad_eliminator]: 1.372e-05 [cse]: 2.831e-05 [a_3]: 5.472e-05 [py_interpret_to_execute_after_opt_a]: 1.495e-05 [slice_cell_reuse_recomputed_activation]: 1.94e-06 [rewriter_after_opt_a]: 4.948e-05 [convert_after_rewriter]: 8.75999e-06 [order_py_execute_after_rewriter]: 6.29999e-06 [mutable_eliminate]: 0.00059898 [opt_b]: 0.00028922, [1] [Cycle 1]: 0.00028217, [7] [b_1]: 0.0001836 [b_2]: 1.091e-05 [updatestate_depend_eliminate]: 7.76001e-06 [updatestate_assign_eliminate]: 3.84002e-06 [updatestate_loads_eliminate]: 3.53e-06 [renormalize]: 7.50006e-07 [cse]: 3.443e-05 [optimize_parallel_all_gather_comm]: 1.923e-05 [overlap_param_gather]: 1.91e-06 [cconv]: 2.827e-05 [loop_unroll]: 0.00048623 [opt_after_cconv]: 0.00014692, [1] [Cycle 1]: 0.00013989, [7] [c_1]: 4.368e-05 [parameter_eliminate]: 3.53999e-06 [updatestate_depend_eliminate]: 9.81998e-06 [updatestate_assign_eliminate]: 4.15e-06 [updatestate_loads_eliminate]: 3.80998e-06 [cse]: 3.566e-05 [renormalize]: 1.14e-06 [remove_dup_value]: 4.66e-05 [tuple_transform]: 0.00010037, [1] [Cycle 1]: 9.556e-05, [4] [d_1]: 6.405e-05 [none_parameter_eliminate]: 2.16e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 9.62001e-06 [partial_unused_args_eliminate]: 1.87001e-06 [add_recomputation]: 6.688e-05 [cse_after_recomputation]: 2.769e-05, [1] [Cycle 1]: 2.27e-05, [1] [cse]: 1.706e-05 [environ_conv]: 6.44999e-06 [swap_dp_allreduce_reducescatter]: 6.43e-06 [bias_add_comm_swap]: 2.74999e-06 [label_micro_interleaved_index]: 4.85001e-06 [label_fine_grained_interleaved_index]: 2.60002e-06 [merge_cast_opt]: 1.28002e-06 [slice_recompute_activation]: 1.99999e-06 [micro_interleaved_order_control]: 2.73e-06 [assign_add_opt]: 1.15999e-06 [ForceFp32Comm]: 1.27999e-06 [remove_cast_before_assign_add]: 1.02e-06 [full_micro_interleaved_order_control]: 2.04e-06 [reorder_send_recv_between_fp_bp]: 3.01999e-06 [comm_op_add_attrs]: 9.70002e-07 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.25999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.22999e-06 [control_data_broadcast_order]: 1.785e-05 [grouped_pairwise_exchange_alltoall]: 1.67001e-06 [offloading_packed_experts]: 4.75999e-06 [overlap_recompute_and_grad_model_parallel]: 5.61e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40999e-06 [overlap_recompute_comm]: 2.22999e-06 [overlap_grad_ring_attention]: 5.09998e-06 [overlap_grad_flash_sp]: 2.528e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 2.29001e-06 [split_layernorm_comm]: 2.12999e-06 [handle_group_info]: 9.30013e-07 [symbol_engine_optimizer]: 0.00012246, [1] [Cycle 1]: 0.00011733, [6] [build]: 3.53999e-06 [elim_shapecalc]: 3.434e-05 [elim_not_effective]: 2.025e-05 [opt_reshape]: 9.56e-06 [fold_const_symbol]: 1.532e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.11998e-06 [pipeline_parallel_scheduler]: 1.66e-06 [auto_monad_reorder]: 2.402e-05 [get_jit_bprop_graph]: 2.00002e-06 [rewriter_after_jit_bprop_graph]: 5.99e-06 [opt_after_jit_grad]: 0.00051736 [validate]: 4.612e-05 Sums bootstrap : 0.000395s : 1.20% type_inference : 0.026535s : 80.84% event_method : 0.000022s : 0.07% auto_monad : 0.000069s : 0.21% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000041s : 0.12% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.10% optimize.rewriter_before_opt_a : 0.000101s : 0.31% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000058s : 0.18% optimize.opt_a.loop_unroll : 0.000041s : 0.13% optimize.opt_a.a_1 : 0.000969s : 2.95% optimize.opt_a.with_stream_mark : 0.000042s : 0.13% optimize.opt_a.recompute_prepare : 0.000025s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000237s : 0.72% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.06% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.02% optimize.opt_a.shard_inline : 0.000019s : 0.06% optimize.opt_a.merge_send_recv : 0.000021s : 0.06% optimize.opt_a.auto_parallel : 0.000017s : 0.05% optimize.opt_a.parallel : 0.000027s : 0.08% optimize.opt_a.flash_sp : 0.000016s : 0.05% optimize.opt_a.merge_comm : 0.000011s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.07% optimize.opt_a.virtual_dataset : 0.000017s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.05% optimize.opt_a.virtual_output : 0.000018s : 0.05% optimize.opt_a.merge_forward : 0.000011s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000024s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000040s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000031s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.04% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000028s : 0.09% optimize.opt_a.a_after_grad : 0.000028s : 0.08% optimize.opt_a.renormalize : 0.000910s : 2.77% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.03% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000036s : 0.11% optimize.opt_a.cse : 0.000074s : 0.22% optimize.opt_a.a_3 : 0.000124s : 0.38% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000049s : 0.15% optimize.convert_after_rewriter : 0.000009s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000599s : 1.82% optimize.opt_b.b_1 : 0.000184s : 0.56% optimize.opt_b.b_2 : 0.000011s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000034s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000028s : 0.09% optimize.loop_unroll : 0.000486s : 1.48% optimize.opt_after_cconv.c_1 : 0.000044s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000036s : 0.11% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000047s : 0.14% optimize.tuple_transform.d_1 : 0.000064s : 0.20% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000067s : 0.20% optimize.cse_after_recomputation.cse : 0.000017s : 0.05% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000018s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000025s : 0.08% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000034s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000517s : 1.58% validate : 0.000046s : 0.14% Time group info: ------[substitution.] 0.000245 48 14.26% : 0.000035s : 6: substitution.cast_eliminate 1.48% : 0.000004s : 4: substitution.elim_not_effective 1.04% : 0.000003s : 4: substitution.fold_const_symbol 3.23% : 0.000008s : 6: substitution.graph_param_transform 66.92% : 0.000164s : 4: substitution.inline 2.29% : 0.000006s : 8: substitution.j_node_and_user_rematch 3.30% : 0.000008s : 8: substitution.remove_not_recompute_node 2.15% : 0.000005s : 4: substitution.replace_old_param 5.32% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.026466 2 97.01% : 0.025676s : 1: type_inference.infer 2.99% : 0.000790s : 1: type_inference.specialize ------[replace.] 0.000067 8 60.87% : 0.000041s : 4: replace.inline 39.13% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000172 8 93.60% : 0.000161s : 4: match.inline 6.40% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000277 1730 0.89% : 0.000002s : 17: predicate.accumulaten_eliminater 0.79% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.59% : 0.000002s : 12: predicate.addn_check_dump 0.87% : 0.000002s : 17: predicate.addn_zero_filter 0.79% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.18% : 0.000006s : 29: predicate.arithmetic_simplify 1.06% : 0.000003s : 17: predicate.cast_eliminate 0.62% : 0.000002s : 12: predicate.check_bprop_eliminate 0.62% : 0.000002s : 12: predicate.compare_switch_simplify 0.21% : 0.000001s : 6: predicate.const_output_eliminate 0.67% : 0.000002s : 12: predicate.depend_value_elim 0.90% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.97% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.92% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 6: predicate.elim_not_effective 0.45% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.10% : 0.000003s : 23: predicate.environ_get_depend_swap 1.87% : 0.000005s : 35: predicate.environ_get_eliminate 1.21% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.30% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.28% : 0.000006s : 25: predicate.float_depend_g_call 0.68% : 0.000002s : 12: predicate.float_environ_get_switch 0.87% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.22% : 0.000001s : 6: predicate.fold_const_symbol 0.76% : 0.000002s : 12: predicate.get_grad_eliminate 0.23% : 0.000001s : 6: predicate.graph_param_transform 0.69% : 0.000002s : 12: predicate.incorporate_call 0.59% : 0.000002s : 12: predicate.incorporate_call_switch 6.82% : 0.000019s : 78: predicate.inline 0.87% : 0.000002s : 12: predicate.inline_without_move 0.36% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.96% : 0.000003s : 12: predicate.less_batch_normalization 1.74% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.42% : 0.000007s : 50: predicate.load_eliminater 0.99% : 0.000003s : 6: predicate.loop_unroll_after_grad 2.02% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.71% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.67% : 0.000002s : 12: predicate.merge_addn 0.61% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.60% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 17: predicate.minmaximum_grad 0.91% : 0.000003s : 6: predicate.mutable_eliminate 0.41% : 0.000001s : 6: predicate.opt_reshape 0.41% : 0.000001s : 6: predicate.parallel_virtual_node 1.62% : 0.000004s : 25: predicate.partial_defer_inline 1.61% : 0.000004s : 27: predicate.partial_eliminate 0.86% : 0.000002s : 17: predicate.print_const_string_wrapper 0.62% : 0.000002s : 12: predicate.reduce_all_const_elim 1.16% : 0.000003s : 17: predicate.reduce_eliminate 2.53% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 12: predicate.remove_not_recompute_node 1.32% : 0.000004s : 33: predicate.replace_applicator 0.52% : 0.000001s : 12: predicate.replace_old_param 0.25% : 0.000001s : 6: predicate.reset_defer_inline 0.89% : 0.000002s : 17: predicate.reshape_eliminate 0.80% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 6: predicate.row_tensor_eliminate 0.80% : 0.000002s : 12: predicate.same_eliminate 0.51% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.88% : 0.000002s : 12: predicate.shard_identity_eliminate 0.83% : 0.000002s : 12: predicate.special_op_eliminate 0.83% : 0.000002s : 12: predicate.specialize_transform 0.84% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.32% : 0.000004s : 25: predicate.switch_defer_inline 1.99% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.67% : 0.000013s : 81: predicate.switch_simplify 0.84% : 0.000002s : 17: predicate.tile_eliminate 0.83% : 0.000002s : 17: predicate.transpose_eliminate 1.53% : 0.000004s : 29: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000004s : 29: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.33% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.51% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000006s : 41: predicate.tuple_list_set_item_eliminator 1.69% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.41% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.06% : 0.000008s : 62: predicate.updatestate_useless_node_eliminater 0.48% : 0.000001s : 6: predicate.value_based_eliminate 0.68% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.68% : 0.000002s : 12: predicate.virtual_output_eliminate 0.34% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000590 11 52.44% : 0.000309s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.56% : 0.000281s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.050052 192 0.01% : 0.000004s : 1: ForceFp32Comm 7.37% : 0.003687s : 1: add_attr 7.34% : 0.003674s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.14% : 0.000071s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.15% : 0.000074s : 1: auto_monad 0.06% : 0.000028s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.84% : 0.000419s : 1: bootstrap 0.06% : 0.000032s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000021s : 1: control_data_broadcast_order 0.02% : 0.000012s : 1: convert_after_rewriter 0.06% : 0.000031s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000010s : 1: environ_conv 0.06% : 0.000028s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000009s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 0.99% : 0.000497s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 1.21% : 0.000607s : 1: mutable_eliminate 0.02% : 0.000008s : 1: offloading_packed_experts 0.04% : 0.000022s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000019s : 1: opt.transform.mutable_eliminate 3.19% : 0.001594s : 78: opt.transform.opt_a 0.08% : 0.000042s : 1: opt.transform.opt_after_cconv 0.07% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.33% : 0.000163s : 28: opt.transform.opt_b 0.14% : 0.000071s : 2: opt.transform.opt_trans_graph 0.15% : 0.000075s : 4: opt.transform.symbol_engine_opt 7.07% : 0.003538s : 1: opt_a 0.30% : 0.000151s : 1: opt_after_cconv 1.06% : 0.000529s : 1: opt_after_jit_grad 0.59% : 0.000293s : 1: opt_b 12.06% : 0.006037s : 1: optimize 0.05% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000010s : 1: order_py_execute_after_rewriter 0.06% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.09% : 0.000045s : 1: pre_auto_parallel 0.07% : 0.000037s : 1: py_interpret_to_execute 0.04% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.10% : 0.000051s : 1: remove_dup_value 1.05% : 0.000528s : 1: renormalize.infer 0.74% : 0.000372s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000054s : 1: rewriter_after_opt_a 0.21% : 0.000106s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.25% : 0.000125s : 1: symbol_engine_optimizer 0.21% : 0.000104s : 1: tuple_transform 53.05% : 0.026555s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:15.693.30 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:15.696.37 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0386478, [21] [bootstrap]: 0.00047027 [type_inference]: 0.00654138 [event_method]: 2.212e-05 [auto_monad]: 6.756e-05 [graph_reusing]: 5.77001e-06 [inline]: 2.22001e-06 [add_attr]: 0.0033277, [1] [add_attr_with_inline]: 0.0033172, [1] [Cycle 1]: 8.966e-05, [2] [tag_attr]: 2.194e-05 [meta_addattr_fg_expand]: 6.28e-06 [parallel-infer-symbol]: 3.21001e-06 [pre_auto_parallel]: 3.879e-05 [insert-virtual-dataset]: 2.60002e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 2.19001e-06 [pipeline_split]: 1.56002e-06 [optimize]: 0.00631558, [53] [py_interpret_to_execute]: 3.216e-05 [rewriter_before_opt_a]: 9.376e-05 [opt_a]: 0.00372207, [2] [Cycle 1]: 0.00269247, [45] [expand_dump_flag]: 3.05002e-06 [switch_simplify]: 4.487e-05 [loop_unroll]: 3.297e-05 [a_1]: 0.00076839 [with_stream_mark]: 2.183e-05 [recompute_prepare]: 1.324e-05 [updatestate_depend_eliminate]: 5.91e-06 [updatestate_assign_eliminate]: 4.09002e-06 [updatestate_loads_eliminate]: 3.93001e-06 [parameter_eliminate]: 2.28002e-06 [a_2]: 0.0001306 [accelerated_algorithm]: 9.44e-06 [shard]: 2.25002e-06 [meta_shard_fg_expand]: 2.79001e-06 [shard_inline]: 8.15e-06 [merge_send_recv]: 9.89001e-06 [auto_parallel]: 9.47999e-06 [parallel]: 1.975e-05 [flash_sp]: 1.038e-05 [merge_comm]: 4.77998e-06 [allreduce_fusion]: 5.17e-06 [matmul_add_comm_reduction]: 1.137e-05 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 1.21e-05 [virtual_dataset]: 8.69003e-06 [get_grad_eliminate_]: 8.2e-06 [virtual_output]: 7.6e-06 [merge_forward]: 5.54e-06 [cell_reuse_recompute_pass]: 1.76e-06 [offload_activation]: 1.151e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.063e-05 [merge_recompute_call_nodes]: 1.50001e-06 [before_grad]: 1.379e-05 [set_forward_comm_id_for_comm_node_pass]: 5.02e-06 [meta_fg_expand]: 3.46001e-06 [flash_sp_send_recv_attached]: 3.83001e-06 [receive_attached]: 2.41998e-06 [after_resolve]: 1.425e-05 [a_after_grad]: 1.28e-05 [renormalize]: 0.00085433 [add_forward_monad_depend]: 7.41999e-06 [auto_monad_grad]: 2.68998e-06 [auto_monad_eliminator]: 2.096e-05 [cse]: 3.705e-05 [a_3]: 7.662e-05 [Cycle 2]: 0.00101223, [45] [expand_dump_flag]: 2.61e-06 [switch_simplify]: 1.036e-05 [loop_unroll]: 7.34002e-06 [a_1]: 0.00017988 [with_stream_mark]: 1.868e-05 [recompute_prepare]: 9.05999e-06 [updatestate_depend_eliminate]: 4.40999e-06 [updatestate_assign_eliminate]: 3.69002e-06 [updatestate_loads_eliminate]: 3.09999e-06 [parameter_eliminate]: 2.01003e-06 [a_2]: 0.00011998 [accelerated_algorithm]: 8.81002e-06 [shard]: 1.85001e-06 [meta_shard_fg_expand]: 2.24001e-06 [shard_inline]: 7.80998e-06 [merge_send_recv]: 7.91001e-06 [auto_parallel]: 8.51002e-06 [parallel]: 6.86999e-06 [flash_sp]: 3.38e-06 [merge_comm]: 5.06002e-06 [allreduce_fusion]: 4e-06 [matmul_add_comm_reduction]: 1.025e-05 [allreduce_slice_to_reducescatter]: 4.90021e-07 [virtual_shard_identity]: 1.019e-05 [virtual_dataset]: 7.75e-06 [get_grad_eliminate_]: 7.56999e-06 [virtual_output]: 7.16999e-06 [merge_forward]: 4.84e-06 [cell_reuse_recompute_pass]: 2.21998e-06 [offload_activation]: 9.54e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.011e-05 [merge_recompute_call_nodes]: 1.15999e-06 [before_grad]: 1.317e-05 [set_forward_comm_id_for_comm_node_pass]: 5.81e-06 [meta_fg_expand]: 3.56999e-06 [flash_sp_send_recv_attached]: 1.29e-06 [receive_attached]: 2.37999e-06 [after_resolve]: 1.239e-05 [a_after_grad]: 1.218e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.31e-06 [auto_monad_grad]: 2.12001e-06 [auto_monad_eliminator]: 1.284e-05 [cse]: 2.394e-05 [a_3]: 6.069e-05 [py_interpret_to_execute_after_opt_a]: 1.866e-05 [slice_cell_reuse_recomputed_activation]: 5.27001e-06 [rewriter_after_opt_a]: 5.237e-05 [convert_after_rewriter]: 1.176e-05 [order_py_execute_after_rewriter]: 8.89003e-06 [mutable_eliminate]: 0.00060385 [opt_b]: 0.00032152, [1] [Cycle 1]: 0.00031102, [7] [b_1]: 0.0001991 [b_2]: 9.91e-06 [updatestate_depend_eliminate]: 9.04e-06 [updatestate_assign_eliminate]: 3.46001e-06 [updatestate_loads_eliminate]: 3.05998e-06 [renormalize]: 6.89994e-07 [cse]: 2.706e-05 [optimize_parallel_all_gather_comm]: 2.283e-05 [overlap_param_gather]: 5.13002e-06 [cconv]: 3.707e-05 [loop_unroll]: 0.00046249 [opt_after_cconv]: 0.0001484, [1] [Cycle 1]: 0.00013951, [7] [c_1]: 3.819e-05 [parameter_eliminate]: 4.37003e-06 [updatestate_depend_eliminate]: 8.17e-06 [updatestate_assign_eliminate]: 3.46001e-06 [updatestate_loads_eliminate]: 2.95002e-06 [cse]: 2.513e-05 [renormalize]: 3.60014e-07 [remove_dup_value]: 1.971e-05 [tuple_transform]: 0.00010274, [1] [Cycle 1]: 9.474e-05, [4] [d_1]: 5.328e-05 [none_parameter_eliminate]: 1.72001e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 8.57998e-06 [partial_unused_args_eliminate]: 4.48999e-06 [add_recomputation]: 6.384e-05 [cse_after_recomputation]: 3.373e-05, [1] [Cycle 1]: 2.637e-05, [1] [cse]: 1.602e-05 [environ_conv]: 1.079e-05 [swap_dp_allreduce_reducescatter]: 9.29e-06 [bias_add_comm_swap]: 5.24003e-06 [label_micro_interleaved_index]: 7.15003e-06 [label_fine_grained_interleaved_index]: 6.21998e-06 [merge_cast_opt]: 4.34002e-06 [slice_recompute_activation]: 5.05001e-06 [micro_interleaved_order_control]: 5.35999e-06 [assign_add_opt]: 3.75e-06 [ForceFp32Comm]: 3.55998e-06 [remove_cast_before_assign_add]: 3.38e-06 [full_micro_interleaved_order_control]: 4.73001e-06 [reorder_send_recv_between_fp_bp]: 5.48002e-06 [comm_op_add_attrs]: 3.31999e-06 [add_comm_op_reuse_tag]: 3.31001e-06 [interleave_split_concat_branches]: 3.45998e-06 [interleave_parallel_branches]: 3.53999e-06 [overlap_opt_shard_in_pipeline]: 3.61999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.68999e-06 [control_data_broadcast_order]: 2.081e-05 [grouped_pairwise_exchange_alltoall]: 3.98999e-06 [offloading_packed_experts]: 7.27002e-06 [overlap_recompute_and_grad_model_parallel]: 8.11002e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.19002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.80998e-06 [overlap_recompute_comm]: 5.46e-06 [overlap_grad_ring_attention]: 7.88999e-06 [overlap_grad_flash_sp]: 2.985e-05 [begin_end_overlap_inline]: 2.91999e-06 [split_matmul_comm_elemetwise]: 4.53001e-06 [split_layernorm_comm]: 4.2e-06 [handle_group_info]: 3.23998e-06 [symbol_engine_optimizer]: 0.00011401, [1] [Cycle 1]: 0.00010611, [6] [build]: 4.25e-06 [elim_shapecalc]: 1.382e-05 [elim_not_effective]: 1.627e-05 [opt_reshape]: 8.64e-06 [fold_const_symbol]: 1.27e-05 [renormalize]: 2.30008e-07 [detach_backward]: 1.156e-05 [pipeline_parallel_scheduler]: 2.46998e-06 [auto_monad_reorder]: 4.409e-05 [get_jit_bprop_graph]: 2.56e-06 [rewriter_after_jit_bprop_graph]: 1.22e-05 [opt_after_jit_grad]: 0.00085335 [validate]: 5.834e-05 Sums bootstrap : 0.000470s : 3.56% type_inference : 0.006541s : 49.57% event_method : 0.000022s : 0.17% auto_monad : 0.000068s : 0.51% graph_reusing : 0.000006s : 0.04% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000039s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000032s : 0.24% optimize.rewriter_before_opt_a : 0.000094s : 0.71% optimize.opt_a.expand_dump_flag : 0.000006s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.42% optimize.opt_a.loop_unroll : 0.000040s : 0.31% optimize.opt_a.a_1 : 0.000948s : 7.19% optimize.opt_a.with_stream_mark : 0.000041s : 0.31% optimize.opt_a.recompute_prepare : 0.000022s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000251s : 1.90% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.14% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.12% optimize.opt_a.merge_send_recv : 0.000018s : 0.13% optimize.opt_a.auto_parallel : 0.000018s : 0.14% optimize.opt_a.parallel : 0.000027s : 0.20% optimize.opt_a.flash_sp : 0.000014s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.07% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.17% optimize.opt_a.virtual_dataset : 0.000016s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.11% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000041s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000027s : 0.20% optimize.opt_a.a_after_grad : 0.000025s : 0.19% optimize.opt_a.renormalize : 0.000854s : 6.47% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.26% optimize.opt_a.cse : 0.000061s : 0.46% optimize.opt_a.a_3 : 0.000137s : 1.04% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000052s : 0.40% optimize.convert_after_rewriter : 0.000012s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000604s : 4.58% optimize.opt_b.b_1 : 0.000199s : 1.51% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000027s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000037s : 0.28% optimize.loop_unroll : 0.000462s : 3.50% optimize.opt_after_cconv.c_1 : 0.000038s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000025s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.15% optimize.tuple_transform.d_1 : 0.000053s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.03% optimize.add_recomputation : 0.000064s : 0.48% optimize.cse_after_recomputation.cse : 0.000016s : 0.12% optimize.environ_conv : 0.000011s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000021s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.06% optimize.overlap_grad_flash_sp : 0.000030s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.03% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.02% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000012s : 0.09% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000044s : 0.33% get_jit_bprop_graph : 0.000003s : 0.02% rewriter_after_jit_bprop_graph : 0.000012s : 0.09% opt_after_jit_grad : 0.000853s : 6.47% validate : 0.000058s : 0.44% Time group info: ------[substitution.] 0.000229 38 11.82% : 0.000027s : 3: substitution.cast_eliminate 0.94% : 0.000002s : 3: substitution.elim_not_effective 0.71% : 0.000002s : 3: substitution.fold_const_symbol 3.07% : 0.000007s : 5: substitution.graph_param_transform 68.84% : 0.000157s : 4: substitution.inline 2.34% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.00% : 0.000007s : 6: substitution.remove_not_recompute_node 2.35% : 0.000005s : 4: substitution.replace_old_param 6.92% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006476 2 88.10% : 0.005705s : 1: type_inference.infer 11.90% : 0.000771s : 1: type_inference.specialize ------[replace.] 0.000068 8 60.52% : 0.000041s : 4: replace.inline 39.48% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000168 8 92.06% : 0.000154s : 4: match.inline 7.94% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000264 1596 0.91% : 0.000002s : 17: predicate.accumulaten_eliminater 1.30% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 10: predicate.addn_check_dump 1.02% : 0.000003s : 17: predicate.addn_zero_filter 0.89% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.97% : 0.000005s : 27: predicate.arithmetic_simplify 1.06% : 0.000003s : 17: predicate.cast_eliminate 0.59% : 0.000002s : 10: predicate.check_bprop_eliminate 0.54% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.59% : 0.000002s : 10: predicate.depend_value_elim 0.94% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.06% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.34% : 0.000004s : 10: predicate.dumpgradient_eliminate 0.30% : 0.000001s : 5: predicate.elim_not_effective 0.37% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_depend_swap 1.86% : 0.000005s : 32: predicate.environ_get_eliminate 1.09% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.35% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.57% : 0.000007s : 25: predicate.float_depend_g_call 0.53% : 0.000001s : 10: predicate.float_environ_get_switch 0.79% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.69% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.64% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.68% : 0.000018s : 72: predicate.inline 1.05% : 0.000003s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.81% : 0.000002s : 10: predicate.less_batch_normalization 1.73% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.46% : 0.000006s : 48: predicate.load_eliminater 1.04% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.98% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.75% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 10: predicate.merge_addn 0.54% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 17: predicate.minmaximum_grad 0.94% : 0.000002s : 5: predicate.mutable_eliminate 0.32% : 0.000001s : 5: predicate.opt_reshape 0.31% : 0.000001s : 5: predicate.parallel_virtual_node 1.62% : 0.000004s : 25: predicate.partial_defer_inline 1.66% : 0.000004s : 26: predicate.partial_eliminate 0.89% : 0.000002s : 17: predicate.print_const_string_wrapper 0.56% : 0.000001s : 10: predicate.reduce_all_const_elim 1.22% : 0.000003s : 17: predicate.reduce_eliminate 2.56% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 10: predicate.remove_not_recompute_node 1.39% : 0.000004s : 31: predicate.replace_applicator 0.55% : 0.000001s : 10: predicate.replace_old_param 0.41% : 0.000001s : 5: predicate.reset_defer_inline 0.95% : 0.000003s : 17: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 5: predicate.row_tensor_eliminate 0.74% : 0.000002s : 10: predicate.same_eliminate 0.41% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.92% : 0.000002s : 10: predicate.shard_identity_eliminate 0.72% : 0.000002s : 10: predicate.special_op_eliminate 0.76% : 0.000002s : 10: predicate.specialize_transform 0.91% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.46% : 0.000004s : 25: predicate.switch_defer_inline 2.00% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.56% : 0.000012s : 76: predicate.switch_simplify 0.93% : 0.000002s : 17: predicate.tile_eliminate 0.85% : 0.000002s : 17: predicate.transpose_eliminate 1.47% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.43% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.54% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.14% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.67% : 0.000004s : 31: predicate.tuple_to_list_eliminator_ 2.40% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.28% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 5: predicate.value_based_eliminate 0.62% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.56% : 0.000001s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000613 11 55.64% : 0.000341s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.36% : 0.000272s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.050828 192 0.01% : 0.000006s : 1: ForceFp32Comm 6.57% : 0.003338s : 1: add_attr 6.53% : 0.003321s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.13% : 0.000067s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.15% : 0.000078s : 1: auto_monad 0.10% : 0.000052s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.03% : 0.000522s : 1: bootstrap 0.08% : 0.000041s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.05% : 0.000025s : 1: control_data_broadcast_order 0.03% : 0.000015s : 1: convert_after_rewriter 0.07% : 0.000037s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.12% : 0.000063s : 1: detach_backward 0.03% : 0.000014s : 1: environ_conv 0.07% : 0.000034s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.02% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000012s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000010s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 0.92% : 0.000470s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 1.20% : 0.000611s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.04% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000019s : 1: opt.transform.mutable_eliminate 2.94% : 0.001493s : 78: opt.transform.opt_a 0.07% : 0.000036s : 1: opt.transform.opt_after_cconv 0.09% : 0.000045s : 1: opt.transform.opt_after_jit_grad 0.27% : 0.000137s : 28: opt.transform.opt_b 0.12% : 0.000060s : 2: opt.transform.opt_trans_graph 0.09% : 0.000047s : 4: opt.transform.symbol_engine_opt 7.33% : 0.003725s : 1: opt_a 0.30% : 0.000152s : 1: opt_after_cconv 1.71% : 0.000869s : 1: opt_after_jit_grad 0.64% : 0.000326s : 1: opt_b 52.83% : 0.026854s : 1: optimize 0.05% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.07% : 0.000034s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000012s : 1: overlap_grad_ring_attention 0.02% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000009s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.02% : 0.000013s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.09% : 0.000047s : 1: pre_auto_parallel 0.07% : 0.000037s : 1: py_interpret_to_execute 0.04% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.05% : 0.000023s : 1: remove_dup_value 0.88% : 0.000449s : 1: renormalize.infer 0.77% : 0.000393s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000019s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000056s : 1: rewriter_after_opt_a 0.19% : 0.000099s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.23% : 0.000117s : 1: symbol_engine_optimizer 0.21% : 0.000106s : 1: tuple_transform 12.96% : 0.006586s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:15.665.942 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0166594, [21] [bootstrap]: 0.00050427 [type_inference]: 0.00583067 [event_method]: 2.009e-05 [auto_monad]: 6.417e-05 [graph_reusing]: 6.84999e-06 [inline]: 2.84999e-06 [add_attr]: 0.00356633, [1] [add_attr_with_inline]: 0.00355543, [1] [Cycle 1]: 6.909e-05, [2] [tag_attr]: 2.069e-05 [meta_addattr_fg_expand]: 6.41e-06 [parallel-infer-symbol]: 3.68e-06 [pre_auto_parallel]: 3.538e-05 [insert-virtual-dataset]: 2.24001e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.29999e-06 [pipeline_split]: 1.97001e-06 [optimize]: 0.00579091, [53] [py_interpret_to_execute]: 2.882e-05 [rewriter_before_opt_a]: 8.749e-05 [opt_a]: 0.00343587, [2] [Cycle 1]: 0.00257057, [45] [expand_dump_flag]: 2.90002e-06 [switch_simplify]: 4.492e-05 [loop_unroll]: 3.117e-05 [a_1]: 0.00076595 [with_stream_mark]: 2.15e-05 [recompute_prepare]: 1.228e-05 [updatestate_depend_eliminate]: 5.04e-06 [updatestate_assign_eliminate]: 3.95e-06 [updatestate_loads_eliminate]: 4.31002e-06 [parameter_eliminate]: 2.54999e-06 [a_2]: 0.00010318 [accelerated_algorithm]: 8.76002e-06 [shard]: 2.54999e-06 [meta_shard_fg_expand]: 2.11e-06 [shard_inline]: 8.43999e-06 [merge_send_recv]: 1.09e-05 [auto_parallel]: 8.42e-06 [parallel]: 1.901e-05 [flash_sp]: 9.84001e-06 [merge_comm]: 4.94e-06 [allreduce_fusion]: 5.01997e-06 [matmul_add_comm_reduction]: 1.08e-05 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 1.124e-05 [virtual_dataset]: 8.08999e-06 [get_grad_eliminate_]: 7.4e-06 [virtual_output]: 7.26999e-06 [merge_forward]: 4.33999e-06 [cell_reuse_recompute_pass]: 1.96e-06 [offload_activation]: 1.231e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.806e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.391e-05 [set_forward_comm_id_for_comm_node_pass]: 5.21002e-06 [meta_fg_expand]: 3.36001e-06 [flash_sp_send_recv_attached]: 3.52002e-06 [receive_attached]: 2.84999e-06 [after_resolve]: 1.519e-05 [a_after_grad]: 1.301e-05 [renormalize]: 0.00089458 [add_forward_monad_depend]: 5.312e-05 [auto_monad_grad]: 3.55998e-06 [auto_monad_eliminator]: 2.416e-05 [cse]: 3.912e-05 [a_3]: 6.415e-05 [Cycle 2]: 0.00085288, [45] [expand_dump_flag]: 2.55002e-06 [switch_simplify]: 1.05e-05 [loop_unroll]: 8.33001e-06 [a_1]: 0.0001987 [with_stream_mark]: 1.665e-05 [recompute_prepare]: 8.02998e-06 [updatestate_depend_eliminate]: 4.88001e-06 [updatestate_assign_eliminate]: 3.11999e-06 [updatestate_loads_eliminate]: 3.11999e-06 [parameter_eliminate]: 1.60001e-06 [a_2]: 9.801e-05 [accelerated_algorithm]: 9.04e-06 [shard]: 2.79999e-06 [meta_shard_fg_expand]: 2.43e-06 [shard_inline]: 9.04e-06 [merge_send_recv]: 8.03001e-06 [auto_parallel]: 7.18998e-06 [parallel]: 1.026e-05 [flash_sp]: 3.86001e-06 [merge_comm]: 4.17998e-06 [allreduce_fusion]: 4e-06 [matmul_add_comm_reduction]: 9.07999e-06 [allreduce_slice_to_reducescatter]: 3.89991e-07 [virtual_shard_identity]: 9.07001e-06 [virtual_dataset]: 7.06001e-06 [get_grad_eliminate_]: 7.02002e-06 [virtual_output]: 7.8e-06 [merge_forward]: 3.7e-06 [cell_reuse_recompute_pass]: 2.06e-06 [offload_activation]: 9.07001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.794e-05 [merge_recompute_call_nodes]: 1.19e-06 [before_grad]: 1.285e-05 [set_forward_comm_id_for_comm_node_pass]: 4.42e-06 [meta_fg_expand]: 2.88e-06 [flash_sp_send_recv_attached]: 1.10999e-06 [receive_attached]: 1.74998e-06 [after_resolve]: 1.385e-05 [a_after_grad]: 1.157e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.32999e-06 [auto_monad_grad]: 2.01998e-06 [auto_monad_eliminator]: 1.158e-05 [cse]: 2.273e-05 [a_3]: 4.78e-05 [py_interpret_to_execute_after_opt_a]: 1.554e-05 [slice_cell_reuse_recomputed_activation]: 2.71e-06 [rewriter_after_opt_a]: 4.764e-05 [convert_after_rewriter]: 8.24998e-06 [order_py_execute_after_rewriter]: 6.88e-06 [mutable_eliminate]: 0.00060885 [opt_b]: 0.00025893, [1] [Cycle 1]: 0.00025157, [7] [b_1]: 0.00016203 [b_2]: 9.76998e-06 [updatestate_depend_eliminate]: 8.77e-06 [updatestate_assign_eliminate]: 3.29001e-06 [updatestate_loads_eliminate]: 2.93998e-06 [renormalize]: 7.89994e-07 [cse]: 2.651e-05 [optimize_parallel_all_gather_comm]: 1.948e-05 [overlap_param_gather]: 2.07999e-06 [cconv]: 2.927e-05 [loop_unroll]: 0.00046676 [opt_after_cconv]: 0.00012781, [1] [Cycle 1]: 0.00012114, [7] [c_1]: 3.895e-05 [parameter_eliminate]: 3.7e-06 [updatestate_depend_eliminate]: 7.92e-06 [updatestate_assign_eliminate]: 3.26001e-06 [updatestate_loads_eliminate]: 3.25e-06 [cse]: 2.709e-05 [renormalize]: 5.10016e-07 [remove_dup_value]: 1.607e-05 [tuple_transform]: 8.829e-05, [1] [Cycle 1]: 8.363e-05, [4] [d_1]: 5.346e-05 [none_parameter_eliminate]: 2.09e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 8.65001e-06 [partial_unused_args_eliminate]: 1.99e-06 [add_recomputation]: 6.287e-05 [cse_after_recomputation]: 2.704e-05, [1] [Cycle 1]: 2.234e-05, [1] [cse]: 1.632e-05 [environ_conv]: 6.36e-06 [swap_dp_allreduce_reducescatter]: 6.30002e-06 [bias_add_comm_swap]: 2.66e-06 [label_micro_interleaved_index]: 4.80001e-06 [label_fine_grained_interleaved_index]: 2.70002e-06 [merge_cast_opt]: 1.25999e-06 [slice_recompute_activation]: 2.09e-06 [micro_interleaved_order_control]: 2.32999e-06 [assign_add_opt]: 1.27e-06 [ForceFp32Comm]: 1.08001e-06 [remove_cast_before_assign_add]: 1.00001e-06 [full_micro_interleaved_order_control]: 2.80002e-06 [reorder_send_recv_between_fp_bp]: 3.03998e-06 [comm_op_add_attrs]: 1.41998e-06 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.09003e-06 [overlap_opt_shard_in_pipeline]: 1.19e-06 [overlap_opt_shard_grad_in_pipeline]: 1.81003e-06 [control_data_broadcast_order]: 1.857e-05 [grouped_pairwise_exchange_alltoall]: 1.71e-06 [offloading_packed_experts]: 4.49998e-06 [overlap_recompute_and_grad_model_parallel]: 5.71e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.32e-06 [overlap_recompute_allgather_and_fa_grad]: 1.62999e-06 [overlap_recompute_comm]: 2.51e-06 [overlap_grad_ring_attention]: 5.37999e-06 [overlap_grad_flash_sp]: 2.454e-05 [begin_end_overlap_inline]: 5.10016e-07 [split_matmul_comm_elemetwise]: 2.17999e-06 [split_layernorm_comm]: 2.12999e-06 [handle_group_info]: 1.10001e-06 [symbol_engine_optimizer]: 9.565e-05, [1] [Cycle 1]: 9.008e-05, [6] [build]: 4.12e-06 [elim_shapecalc]: 1.443e-05 [elim_not_effective]: 1.752e-05 [opt_reshape]: 8.07e-06 [fold_const_symbol]: 1.237e-05 [renormalize]: 2.60014e-07 [detach_backward]: 2.19001e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 2.406e-05 [get_jit_bprop_graph]: 2.15002e-06 [rewriter_after_jit_bprop_graph]: 4.70001e-06 [opt_after_jit_grad]: 0.0005867 [validate]: 4.794e-05 Sums bootstrap : 0.000504s : 4.18% type_inference : 0.005831s : 48.38% event_method : 0.000020s : 0.17% auto_monad : 0.000064s : 0.53% graph_reusing : 0.000007s : 0.06% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000035s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000029s : 0.24% optimize.rewriter_before_opt_a : 0.000087s : 0.73% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000055s : 0.46% optimize.opt_a.loop_unroll : 0.000040s : 0.33% optimize.opt_a.a_1 : 0.000965s : 8.00% optimize.opt_a.with_stream_mark : 0.000038s : 0.32% optimize.opt_a.recompute_prepare : 0.000020s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000201s : 1.67% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.15% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000017s : 0.15% optimize.opt_a.merge_send_recv : 0.000019s : 0.16% optimize.opt_a.auto_parallel : 0.000016s : 0.13% optimize.opt_a.parallel : 0.000029s : 0.24% optimize.opt_a.flash_sp : 0.000014s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.17% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000029s : 0.24% optimize.opt_a.a_after_grad : 0.000025s : 0.20% optimize.opt_a.renormalize : 0.000895s : 7.42% optimize.opt_a.add_forward_monad_depend : 0.000055s : 0.46% optimize.opt_a.auto_monad_grad : 0.000006s : 0.05% optimize.opt_a.auto_monad_eliminator : 0.000036s : 0.30% optimize.opt_a.cse : 0.000062s : 0.51% optimize.opt_a.a_3 : 0.000112s : 0.93% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.02% optimize.rewriter_after_opt_a : 0.000048s : 0.40% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000007s : 0.06% optimize.mutable_eliminate : 0.000609s : 5.05% optimize.opt_b.b_1 : 0.000162s : 1.34% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000027s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000029s : 0.24% optimize.loop_unroll : 0.000467s : 3.87% optimize.opt_after_cconv.c_1 : 0.000039s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000027s : 0.22% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.13% optimize.tuple_transform.d_1 : 0.000053s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000063s : 0.52% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000019s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000025s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000587s : 4.87% validate : 0.000048s : 0.40% Time group info: ------[substitution.] 0.000225 38 11.93% : 0.000027s : 3: substitution.cast_eliminate 1.03% : 0.000002s : 3: substitution.elim_not_effective 0.82% : 0.000002s : 3: substitution.fold_const_symbol 2.90% : 0.000006s : 5: substitution.graph_param_transform 68.10% : 0.000153s : 4: substitution.inline 2.23% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.79% : 0.000009s : 6: substitution.remove_not_recompute_node 2.32% : 0.000005s : 4: substitution.replace_old_param 6.89% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005766 2 87.16% : 0.005025s : 1: type_inference.infer 12.84% : 0.000740s : 1: type_inference.specialize ------[replace.] 0.000067 8 56.45% : 0.000038s : 4: replace.inline 43.55% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000163 8 91.89% : 0.000150s : 4: match.inline 8.11% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000269 1596 0.90% : 0.000002s : 17: predicate.accumulaten_eliminater 0.71% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 10: predicate.addn_check_dump 0.94% : 0.000003s : 17: predicate.addn_zero_filter 0.85% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.21% : 0.000006s : 27: predicate.arithmetic_simplify 1.18% : 0.000003s : 17: predicate.cast_eliminate 0.58% : 0.000002s : 10: predicate.check_bprop_eliminate 0.61% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.56% : 0.000001s : 10: predicate.depend_value_elim 0.97% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.03% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.95% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.99% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.37% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.31% : 0.000004s : 22: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_depend_swap 1.73% : 0.000005s : 32: predicate.environ_get_eliminate 1.20% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.34% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.34% : 0.000006s : 25: predicate.float_depend_g_call 0.52% : 0.000001s : 10: predicate.float_environ_get_switch 0.83% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.61% : 0.000002s : 10: predicate.get_grad_eliminate 0.18% : 0.000000s : 5: predicate.graph_param_transform 0.56% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.96% : 0.000019s : 72: predicate.inline 0.90% : 0.000002s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.06% : 0.000003s : 10: predicate.less_batch_normalization 1.74% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.47% : 0.000007s : 48: predicate.load_eliminater 0.95% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.01% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.67% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 10: predicate.merge_addn 0.62% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.62% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 17: predicate.minmaximum_grad 1.07% : 0.000003s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.35% : 0.000001s : 5: predicate.parallel_virtual_node 1.64% : 0.000004s : 25: predicate.partial_defer_inline 1.64% : 0.000004s : 26: predicate.partial_eliminate 0.88% : 0.000002s : 17: predicate.print_const_string_wrapper 0.58% : 0.000002s : 10: predicate.reduce_all_const_elim 1.20% : 0.000003s : 17: predicate.reduce_eliminate 2.52% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 10: predicate.remove_not_recompute_node 1.45% : 0.000004s : 31: predicate.replace_applicator 0.70% : 0.000002s : 10: predicate.replace_old_param 0.33% : 0.000001s : 5: predicate.reset_defer_inline 0.95% : 0.000003s : 17: predicate.reshape_eliminate 0.58% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 5: predicate.row_tensor_eliminate 0.81% : 0.000002s : 10: predicate.same_eliminate 0.55% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 10: predicate.shard_identity_eliminate 0.69% : 0.000002s : 10: predicate.special_op_eliminate 0.70% : 0.000002s : 10: predicate.specialize_transform 1.05% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.48% : 0.000004s : 25: predicate.switch_defer_inline 1.95% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.74% : 0.000013s : 76: predicate.switch_simplify 0.92% : 0.000002s : 17: predicate.tile_eliminate 1.03% : 0.000003s : 17: predicate.transpose_eliminate 1.76% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.21% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.52% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.15% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.73% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.45% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.10% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 5: predicate.value_based_eliminate 0.61% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000549 11 52.74% : 0.000290s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.26% : 0.000260s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028633 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.47% : 0.003572s : 1: add_attr 12.43% : 0.003559s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000067s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.25% : 0.000071s : 1: auto_monad 0.10% : 0.000028s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.87% : 0.000536s : 1: bootstrap 0.11% : 0.000033s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000023s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.11% : 0.000030s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000010s : 1: environ_conv 0.09% : 0.000026s : 1: event_method 0.02% : 0.000006s : 1: full_micro_interleaved_order_control 0.02% : 0.000006s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.66% : 0.000476s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.16% : 0.000618s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000020s : 1: opt.transform.mutable_eliminate 5.28% : 0.001512s : 78: opt.transform.opt_a 0.13% : 0.000037s : 1: opt.transform.opt_after_cconv 0.12% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.49% : 0.000139s : 28: opt.transform.opt_b 0.21% : 0.000060s : 2: opt.transform.opt_trans_graph 0.17% : 0.000048s : 4: opt.transform.symbol_engine_opt 12.01% : 0.003439s : 1: opt_a 0.46% : 0.000133s : 1: opt_after_cconv 2.08% : 0.000597s : 1: opt_after_jit_grad 0.92% : 0.000263s : 1: opt_b 20.24% : 0.005796s : 1: optimize 0.08% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000010s : 1: order_py_execute_after_rewriter 0.10% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.14% : 0.000039s : 1: pre_auto_parallel 0.11% : 0.000033s : 1: py_interpret_to_execute 0.07% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.59% : 0.000456s : 1: renormalize.infer 1.49% : 0.000427s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000052s : 1: rewriter_after_opt_a 0.32% : 0.000092s : 1: rewriter_before_opt_a 0.02% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000099s : 1: symbol_engine_optimizer 0.32% : 0.000091s : 1: tuple_transform 20.42% : 0.005848s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:16.203.525 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:16.203.802 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0423872, [21] [bootstrap]: 0.00043671 [type_inference]: 0.00626073 [event_method]: 2.068e-05 [auto_monad]: 6.821e-05 [graph_reusing]: 6.63e-06 [inline]: 2.53e-06 [add_attr]: 0.00330591, [1] [add_attr_with_inline]: 0.00329511, [1] [Cycle 1]: 9.169e-05, [2] [tag_attr]: 2.224e-05 [meta_addattr_fg_expand]: 6.08998e-06 [parallel-infer-symbol]: 3.83001e-06 [pre_auto_parallel]: 4.005e-05 [insert-virtual-dataset]: 2.29001e-06 [parallel-infer-symbol-second]: 1.15001e-06 [dataset_repeat_opt]: 2.08998e-06 [pipeline_split]: 2.26e-06 [optimize]: 0.0309368, [53] [py_interpret_to_execute]: 3.433e-05 [rewriter_before_opt_a]: 9.338e-05 [opt_a]: 0.00370829, [2] [Cycle 1]: 0.00266402, [45] [expand_dump_flag]: 2.99999e-06 [switch_simplify]: 4.42e-05 [loop_unroll]: 3.196e-05 [a_1]: 0.00078233 [with_stream_mark]: 2.378e-05 [recompute_prepare]: 1.324e-05 [updatestate_depend_eliminate]: 5.45001e-06 [updatestate_assign_eliminate]: 4.08999e-06 [updatestate_loads_eliminate]: 3.8e-06 [parameter_eliminate]: 1.98002e-06 [a_2]: 0.0001316 [accelerated_algorithm]: 9.09998e-06 [shard]: 2.13998e-06 [meta_shard_fg_expand]: 2.26e-06 [shard_inline]: 8.73001e-06 [merge_send_recv]: 1.023e-05 [auto_parallel]: 9.19e-06 [parallel]: 1.946e-05 [flash_sp]: 1.049e-05 [merge_comm]: 5.51998e-06 [allreduce_fusion]: 4.31002e-06 [matmul_add_comm_reduction]: 1.15e-05 [allreduce_slice_to_reducescatter]: 8.79983e-07 [virtual_shard_identity]: 1.063e-05 [virtual_dataset]: 8.37e-06 [get_grad_eliminate_]: 7.56999e-06 [virtual_output]: 7.75e-06 [merge_forward]: 5.50001e-06 [cell_reuse_recompute_pass]: 1.76998e-06 [offload_activation]: 1.248e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.191e-05 [merge_recompute_call_nodes]: 1.83997e-06 [before_grad]: 1.355e-05 [set_forward_comm_id_for_comm_node_pass]: 6.17001e-06 [meta_fg_expand]: 3.36999e-06 [flash_sp_send_recv_attached]: 3.71001e-06 [receive_attached]: 2.25002e-06 [after_resolve]: 1.512e-05 [a_after_grad]: 1.248e-05 [renormalize]: 0.0008128 [add_forward_monad_depend]: 7.10998e-06 [auto_monad_grad]: 2.31998e-06 [auto_monad_eliminator]: 2.166e-05 [cse]: 3.838e-05 [a_3]: 7.824e-05 [Cycle 2]: 0.00102587, [45] [expand_dump_flag]: 1.84998e-06 [switch_simplify]: 1.052e-05 [loop_unroll]: 7.63001e-06 [a_1]: 0.0001808 [with_stream_mark]: 1.871e-05 [recompute_prepare]: 8.50001e-06 [updatestate_depend_eliminate]: 4.45999e-06 [updatestate_assign_eliminate]: 4.23001e-06 [updatestate_loads_eliminate]: 3.54002e-06 [parameter_eliminate]: 1.51002e-06 [a_2]: 0.00012308 [accelerated_algorithm]: 9.44e-06 [shard]: 2.41e-06 [meta_shard_fg_expand]: 2.48e-06 [shard_inline]: 7.58001e-06 [merge_send_recv]: 7.9e-06 [auto_parallel]: 8.94e-06 [parallel]: 7.04001e-06 [flash_sp]: 3.91999e-06 [merge_comm]: 4.33001e-06 [allreduce_fusion]: 4.90999e-06 [matmul_add_comm_reduction]: 1.083e-05 [allreduce_slice_to_reducescatter]: 9.00007e-07 [virtual_shard_identity]: 1.136e-05 [virtual_dataset]: 7.48e-06 [get_grad_eliminate_]: 6.98e-06 [virtual_output]: 6.98e-06 [merge_forward]: 5.52999e-06 [cell_reuse_recompute_pass]: 2.26e-06 [offload_activation]: 1.032e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.005e-05 [merge_recompute_call_nodes]: 1.15001e-06 [before_grad]: 1.325e-05 [set_forward_comm_id_for_comm_node_pass]: 5.17999e-06 [meta_fg_expand]: 3.03e-06 [flash_sp_send_recv_attached]: 1.45001e-06 [receive_attached]: 2.11e-06 [after_resolve]: 1.328e-05 [a_after_grad]: 1.2e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.59999e-06 [auto_monad_grad]: 1.59e-06 [auto_monad_eliminator]: 1.222e-05 [cse]: 2.265e-05 [a_3]: 6.198e-05 [py_interpret_to_execute_after_opt_a]: 1.949e-05 [slice_cell_reuse_recomputed_activation]: 5.18002e-06 [rewriter_after_opt_a]: 5.209e-05 [convert_after_rewriter]: 1.284e-05 [order_py_execute_after_rewriter]: 8.77e-06 [mutable_eliminate]: 0.00061586 [opt_b]: 0.00032919, [1] [Cycle 1]: 0.00031819, [7] [b_1]: 0.00019974 [b_2]: 1.016e-05 [updatestate_depend_eliminate]: 1.115e-05 [updatestate_assign_eliminate]: 3.63e-06 [updatestate_loads_eliminate]: 3.14999e-06 [renormalize]: 5.89993e-07 [cse]: 2.953e-05 [optimize_parallel_all_gather_comm]: 2.488e-05 [overlap_param_gather]: 5.20001e-06 [cconv]: 0.0241389 [loop_unroll]: 0.0008135 [opt_after_cconv]: 0.00019662, [1] [Cycle 1]: 0.00018418, [7] [c_1]: 4.329e-05 [parameter_eliminate]: 7.26001e-06 [updatestate_depend_eliminate]: 1.411e-05 [updatestate_assign_eliminate]: 4.1e-06 [updatestate_loads_eliminate]: 3.95e-06 [cse]: 4.865e-05 [renormalize]: 4.50003e-07 [remove_dup_value]: 2.162e-05 [tuple_transform]: 0.00011681, [1] [Cycle 1]: 0.00010873, [4] [d_1]: 6.359e-05 [none_parameter_eliminate]: 2.43002e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 9.68997e-06 [partial_unused_args_eliminate]: 4.94998e-06 [add_recomputation]: 7.62e-05 [cse_after_recomputation]: 3.883e-05, [1] [Cycle 1]: 2.994e-05, [1] [cse]: 1.929e-05 [environ_conv]: 1.154e-05 [swap_dp_allreduce_reducescatter]: 1.022e-05 [bias_add_comm_swap]: 6.76e-06 [label_micro_interleaved_index]: 1.15e-05 [label_fine_grained_interleaved_index]: 5.38002e-06 [merge_cast_opt]: 4.12e-06 [slice_recompute_activation]: 5.04e-06 [micro_interleaved_order_control]: 5.12e-06 [assign_add_opt]: 3.93999e-06 [ForceFp32Comm]: 3.35e-06 [remove_cast_before_assign_add]: 3.48999e-06 [full_micro_interleaved_order_control]: 4.80999e-06 [reorder_send_recv_between_fp_bp]: 5.07999e-06 [comm_op_add_attrs]: 3.64002e-06 [add_comm_op_reuse_tag]: 3.88001e-06 [interleave_split_concat_branches]: 3.68999e-06 [interleave_parallel_branches]: 3.65998e-06 [overlap_opt_shard_in_pipeline]: 4.03001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.52e-06 [control_data_broadcast_order]: 2.229e-05 [grouped_pairwise_exchange_alltoall]: 4.44002e-06 [offloading_packed_experts]: 7.58001e-06 [overlap_recompute_and_grad_model_parallel]: 8.53001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.9e-06 [overlap_recompute_allgather_and_fa_grad]: 4.09002e-06 [overlap_recompute_comm]: 6.37001e-06 [overlap_grad_ring_attention]: 8.3e-06 [overlap_grad_flash_sp]: 3.048e-05 [begin_end_overlap_inline]: 3.29001e-06 [split_matmul_comm_elemetwise]: 4.85001e-06 [split_layernorm_comm]: 4.97999e-06 [handle_group_info]: 3.98001e-06 [symbol_engine_optimizer]: 0.00013082, [1] [Cycle 1]: 0.00012108, [6] [build]: 5.14998e-06 [elim_shapecalc]: 1.574e-05 [elim_not_effective]: 1.803e-05 [opt_reshape]: 1.108e-05 [fold_const_symbol]: 1.46e-05 [renormalize]: 2.30008e-07 [detach_backward]: 4.73001e-06 [pipeline_parallel_scheduler]: 2.53e-06 [auto_monad_reorder]: 2.619e-05 [get_jit_bprop_graph]: 2.24999e-06 [rewriter_after_jit_bprop_graph]: 6.74001e-06 [opt_after_jit_grad]: 0.00056351 [validate]: 5.041e-05 Sums bootstrap : 0.000437s : 1.18% type_inference : 0.006261s : 16.87% event_method : 0.000021s : 0.06% auto_monad : 0.000068s : 0.18% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000040s : 0.11% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000034s : 0.09% optimize.rewriter_before_opt_a : 0.000093s : 0.25% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000055s : 0.15% optimize.opt_a.loop_unroll : 0.000040s : 0.11% optimize.opt_a.a_1 : 0.000963s : 2.60% optimize.opt_a.with_stream_mark : 0.000042s : 0.11% optimize.opt_a.recompute_prepare : 0.000022s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000255s : 0.69% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.05% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.04% optimize.opt_a.merge_send_recv : 0.000018s : 0.05% optimize.opt_a.auto_parallel : 0.000018s : 0.05% optimize.opt_a.parallel : 0.000027s : 0.07% optimize.opt_a.flash_sp : 0.000014s : 0.04% optimize.opt_a.merge_comm : 0.000010s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.06% optimize.opt_a.virtual_dataset : 0.000016s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000015s : 0.04% optimize.opt_a.merge_forward : 0.000011s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000042s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.03% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000028s : 0.08% optimize.opt_a.a_after_grad : 0.000024s : 0.07% optimize.opt_a.renormalize : 0.000813s : 2.19% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.09% optimize.opt_a.cse : 0.000061s : 0.16% optimize.opt_a.a_3 : 0.000140s : 0.38% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000052s : 0.14% optimize.convert_after_rewriter : 0.000013s : 0.03% optimize.order_py_execute_after_rewriter : 0.000009s : 0.02% optimize.mutable_eliminate : 0.000616s : 1.66% optimize.opt_b.b_1 : 0.000200s : 0.54% optimize.opt_b.b_2 : 0.000010s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.07% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.024139s : 65.05% optimize.loop_unroll : 0.000813s : 2.19% optimize.opt_after_cconv.c_1 : 0.000043s : 0.12% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000014s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000049s : 0.13% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000022s : 0.06% optimize.tuple_transform.d_1 : 0.000064s : 0.17% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000076s : 0.21% optimize.cse_after_recomputation.cse : 0.000019s : 0.05% optimize.environ_conv : 0.000012s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.03% optimize.bias_add_comm_swap : 0.000007s : 0.02% optimize.label_micro_interleaved_index : 0.000012s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000022s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000006s : 0.02% optimize.overlap_grad_ring_attention : 0.000008s : 0.02% optimize.overlap_grad_flash_sp : 0.000030s : 0.08% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000005s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000003s : 0.01% auto_monad_reorder : 0.000026s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.02% opt_after_jit_grad : 0.000564s : 1.52% validate : 0.000050s : 0.14% Time group info: ------[substitution.] 0.000232 38 11.94% : 0.000028s : 3: substitution.cast_eliminate 0.95% : 0.000002s : 3: substitution.elim_not_effective 0.90% : 0.000002s : 3: substitution.fold_const_symbol 3.16% : 0.000007s : 5: substitution.graph_param_transform 67.86% : 0.000157s : 4: substitution.inline 2.47% : 0.000006s : 6: substitution.j_node_and_user_rematch 3.17% : 0.000007s : 6: substitution.remove_not_recompute_node 2.70% : 0.000006s : 4: substitution.replace_old_param 6.86% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006201 2 87.86% : 0.005448s : 1: type_inference.infer 12.14% : 0.000753s : 1: type_inference.specialize ------[replace.] 0.000070 8 57.79% : 0.000040s : 4: replace.inline 42.21% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000168 8 91.65% : 0.000154s : 4: match.inline 8.35% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000270 1596 0.87% : 0.000002s : 17: predicate.accumulaten_eliminater 0.80% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.59% : 0.000002s : 10: predicate.addn_check_dump 0.87% : 0.000002s : 17: predicate.addn_zero_filter 0.80% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.00% : 0.000005s : 27: predicate.arithmetic_simplify 1.07% : 0.000003s : 17: predicate.cast_eliminate 0.56% : 0.000002s : 10: predicate.check_bprop_eliminate 0.55% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.59% : 0.000002s : 10: predicate.depend_value_elim 0.95% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.00% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.96% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 5: predicate.elim_not_effective 0.54% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.07% : 0.000003s : 22: predicate.environ_get_depend_swap 1.89% : 0.000005s : 32: predicate.environ_get_eliminate 1.11% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.34% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.49% : 0.000007s : 25: predicate.float_depend_g_call 0.50% : 0.000001s : 10: predicate.float_environ_get_switch 0.78% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.74% : 0.000002s : 10: predicate.get_grad_eliminate 0.24% : 0.000001s : 5: predicate.graph_param_transform 0.57% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 7.09% : 0.000019s : 72: predicate.inline 0.88% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.70% : 0.000002s : 10: predicate.less_batch_normalization 1.77% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.49% : 0.000007s : 48: predicate.load_eliminater 1.51% : 0.000004s : 5: predicate.loop_unroll_after_grad 1.89% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.85% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 10: predicate.merge_addn 0.58% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.69% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 17: predicate.minmaximum_grad 1.35% : 0.000004s : 5: predicate.mutable_eliminate 0.55% : 0.000001s : 5: predicate.opt_reshape 0.34% : 0.000001s : 5: predicate.parallel_virtual_node 1.55% : 0.000004s : 25: predicate.partial_defer_inline 1.58% : 0.000004s : 26: predicate.partial_eliminate 0.98% : 0.000003s : 17: predicate.print_const_string_wrapper 0.55% : 0.000001s : 10: predicate.reduce_all_const_elim 1.09% : 0.000003s : 17: predicate.reduce_eliminate 2.57% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.61% : 0.000002s : 10: predicate.remove_not_recompute_node 1.29% : 0.000003s : 31: predicate.replace_applicator 0.39% : 0.000001s : 10: predicate.replace_old_param 0.34% : 0.000001s : 5: predicate.reset_defer_inline 0.90% : 0.000002s : 17: predicate.reshape_eliminate 0.57% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 5: predicate.row_tensor_eliminate 0.82% : 0.000002s : 10: predicate.same_eliminate 0.50% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.93% : 0.000003s : 10: predicate.shard_identity_eliminate 0.66% : 0.000002s : 10: predicate.special_op_eliminate 0.69% : 0.000002s : 10: predicate.specialize_transform 1.11% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.38% : 0.000004s : 25: predicate.switch_defer_inline 1.93% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.59% : 0.000012s : 76: predicate.switch_simplify 0.90% : 0.000002s : 17: predicate.tile_eliminate 0.88% : 0.000002s : 17: predicate.transpose_eliminate 1.60% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.71% : 0.000005s : 27: predicate.tuple_list_get_set_item_eliminator 2.35% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.67% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.40% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.08% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.30% : 0.000001s : 5: predicate.value_based_eliminate 0.60% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.54% : 0.000001s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.024645 11 98.90% : 0.024374s : 5: func_graph_cloner_run.FuncGraphClonerGraph 1.10% : 0.000270s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.079184 192 0.01% : 0.000006s : 1: ForceFp32Comm 4.19% : 0.003317s : 1: add_attr 4.17% : 0.003299s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.10% : 0.000081s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.10% : 0.000079s : 1: auto_monad 0.04% : 0.000035s : 1: auto_monad_reorder 0.01% : 0.000007s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.61% : 0.000482s : 1: bootstrap 30.52% : 0.024165s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.03% : 0.000026s : 1: control_data_broadcast_order 0.02% : 0.000017s : 1: convert_after_rewriter 0.05% : 0.000042s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000023s : 1: detach_backward 0.02% : 0.000014s : 1: environ_conv 0.04% : 0.000032s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000014s : 1: label_micro_interleaved_index 1.04% : 0.000825s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.79% : 0.000624s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.04% : 0.000030s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000024s : 1: opt.transform.mutable_eliminate 1.91% : 0.001515s : 78: opt.transform.opt_a 0.05% : 0.000041s : 1: opt.transform.opt_after_cconv 0.04% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.17% : 0.000136s : 28: opt.transform.opt_b 0.09% : 0.000070s : 2: opt.transform.opt_trans_graph 0.07% : 0.000056s : 4: opt.transform.symbol_engine_opt 4.69% : 0.003712s : 1: opt_a 0.25% : 0.000200s : 1: opt_after_cconv 0.73% : 0.000575s : 1: opt_after_jit_grad 0.42% : 0.000334s : 1: opt_b 39.55% : 0.031315s : 1: optimize 0.04% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.04% : 0.000034s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000013s : 1: overlap_grad_ring_attention 0.01% : 0.000009s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000010s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000011s : 1: pipeline_parallel_scheduler 0.01% : 0.000008s : 1: pipeline_split 0.06% : 0.000048s : 1: pre_auto_parallel 0.05% : 0.000038s : 1: py_interpret_to_execute 0.03% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.03% : 0.000025s : 1: remove_dup_value 0.56% : 0.000443s : 1: renormalize.infer 0.45% : 0.000359s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000056s : 1: rewriter_after_opt_a 0.12% : 0.000097s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.02% : 0.000014s : 1: swap_dp_allreduce_reducescatter 0.17% : 0.000134s : 1: symbol_engine_optimizer 0.15% : 0.000120s : 1: tuple_transform 7.96% : 0.006303s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:16.750.333 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0330941, [21] [bootstrap]: 0.00045206 [type_inference]: 0.00625052 [event_method]: 3.299e-05 [auto_monad]: 6.472e-05 [graph_reusing]: 5.82999e-06 [inline]: 2.71e-06 [add_attr]: 0.00317232, [1] [add_attr_with_inline]: 0.00316148, [1] [Cycle 1]: 6.502e-05, [2] [tag_attr]: 2.128e-05 [meta_addattr_fg_expand]: 5.94e-06 [parallel-infer-symbol]: 3.63e-06 [pre_auto_parallel]: 3.769e-05 [insert-virtual-dataset]: 2.71999e-06 [parallel-infer-symbol-second]: 7.79983e-07 [dataset_repeat_opt]: 1.75001e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.0221824, [53] [py_interpret_to_execute]: 2.838e-05 [rewriter_before_opt_a]: 9.4e-05 [opt_a]: 0.00330399, [2] [Cycle 1]: 0.00248314, [45] [expand_dump_flag]: 3.38e-06 [switch_simplify]: 4.599e-05 [loop_unroll]: 3.178e-05 [a_1]: 0.00079882 [with_stream_mark]: 2.093e-05 [recompute_prepare]: 1.25e-05 [updatestate_depend_eliminate]: 5.22e-06 [updatestate_assign_eliminate]: 4.22998e-06 [updatestate_loads_eliminate]: 3.63999e-06 [parameter_eliminate]: 2.09e-06 [a_2]: 0.00010315 [accelerated_algorithm]: 9.39e-06 [shard]: 1.96e-06 [meta_shard_fg_expand]: 2.30002e-06 [shard_inline]: 8.26002e-06 [merge_send_recv]: 1.089e-05 [auto_parallel]: 8.07998e-06 [parallel]: 1.974e-05 [flash_sp]: 9.56998e-06 [merge_comm]: 4.97999e-06 [allreduce_fusion]: 4.78001e-06 [matmul_add_comm_reduction]: 1.136e-05 [allreduce_slice_to_reducescatter]: 7.60017e-07 [virtual_shard_identity]: 9.36998e-06 [virtual_dataset]: 8.25999e-06 [get_grad_eliminate_]: 7.48999e-06 [virtual_output]: 7.61001e-06 [merge_forward]: 4.76002e-06 [cell_reuse_recompute_pass]: 1.72001e-06 [offload_activation]: 1.241e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.756e-05 [merge_recompute_call_nodes]: 1.60999e-06 [before_grad]: 1.283e-05 [set_forward_comm_id_for_comm_node_pass]: 5.40999e-06 [meta_fg_expand]: 3.54002e-06 [flash_sp_send_recv_attached]: 3.85e-06 [receive_attached]: 2.21998e-06 [after_resolve]: 1.438e-05 [a_after_grad]: 1.276e-05 [renormalize]: 0.00084028 [add_forward_monad_depend]: 6.25002e-06 [auto_monad_grad]: 2.67001e-06 [auto_monad_eliminator]: 1.975e-05 [cse]: 3.938e-05 [a_3]: 5.832e-05 [Cycle 2]: 0.00080941, [45] [expand_dump_flag]: 1.62999e-06 [switch_simplify]: 9.69e-06 [loop_unroll]: 7.47002e-06 [a_1]: 0.00017911 [with_stream_mark]: 1.618e-05 [recompute_prepare]: 8.27e-06 [updatestate_depend_eliminate]: 4.00998e-06 [updatestate_assign_eliminate]: 3.35e-06 [updatestate_loads_eliminate]: 2.92002e-06 [parameter_eliminate]: 1.79e-06 [a_2]: 9.172e-05 [accelerated_algorithm]: 8.47e-06 [shard]: 1.61998e-06 [meta_shard_fg_expand]: 2.22999e-06 [shard_inline]: 8.07e-06 [merge_send_recv]: 7.21999e-06 [auto_parallel]: 7.78001e-06 [parallel]: 5.49e-06 [flash_sp]: 3.86001e-06 [merge_comm]: 4.15e-06 [allreduce_fusion]: 3.93001e-06 [matmul_add_comm_reduction]: 1.119e-05 [allreduce_slice_to_reducescatter]: 3.89991e-07 [virtual_shard_identity]: 1.045e-05 [virtual_dataset]: 8.01001e-06 [get_grad_eliminate_]: 7.11001e-06 [virtual_output]: 7.1e-06 [merge_forward]: 3.99997e-06 [cell_reuse_recompute_pass]: 1.67999e-06 [offload_activation]: 9.01002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.632e-05 [merge_recompute_call_nodes]: 7.30011e-07 [before_grad]: 1.186e-05 [set_forward_comm_id_for_comm_node_pass]: 4.45e-06 [meta_fg_expand]: 2.89999e-06 [flash_sp_send_recv_attached]: 1.09e-06 [receive_attached]: 1.60999e-06 [after_resolve]: 1.377e-05 [a_after_grad]: 1.17e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.91e-06 [auto_monad_grad]: 1.57001e-06 [auto_monad_eliminator]: 1.233e-05 [cse]: 2.043e-05 [a_3]: 4.536e-05 [py_interpret_to_execute_after_opt_a]: 1.363e-05 [slice_cell_reuse_recomputed_activation]: 2.56e-06 [rewriter_after_opt_a]: 4.457e-05 [convert_after_rewriter]: 8.27e-06 [order_py_execute_after_rewriter]: 5.74e-06 [mutable_eliminate]: 0.00061335 [opt_b]: 0.00025704, [1] [Cycle 1]: 0.00024867, [7] [b_1]: 0.00015555 [b_2]: 1.002e-05 [updatestate_depend_eliminate]: 9.54e-06 [updatestate_assign_eliminate]: 3.76999e-06 [updatestate_loads_eliminate]: 3.27002e-06 [renormalize]: 8.50006e-07 [cse]: 2.674e-05 [optimize_parallel_all_gather_comm]: 1.93e-05 [overlap_param_gather]: 1.92001e-06 [cconv]: 3.01e-05 [loop_unroll]: 0.0168457 [opt_after_cconv]: 0.00018219, [1] [Cycle 1]: 0.00017132, [7] [c_1]: 4.492e-05 [parameter_eliminate]: 7.28999e-06 [updatestate_depend_eliminate]: 1.599e-05 [updatestate_assign_eliminate]: 4.27998e-06 [updatestate_loads_eliminate]: 3.91001e-06 [cse]: 5.316e-05 [renormalize]: 1.19e-06 [remove_dup_value]: 1.941e-05 [tuple_transform]: 0.0001057, [1] [Cycle 1]: 0.00010025, [4] [d_1]: 6.692e-05 [none_parameter_eliminate]: 2.13998e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 9.15999e-06 [partial_unused_args_eliminate]: 2.44999e-06 [add_recomputation]: 7.359e-05 [cse_after_recomputation]: 3.12e-05, [1] [Cycle 1]: 2.569e-05, [1] [cse]: 1.872e-05 [environ_conv]: 8.75001e-06 [swap_dp_allreduce_reducescatter]: 6.22001e-06 [bias_add_comm_swap]: 3.58e-06 [label_micro_interleaved_index]: 7.45e-06 [label_fine_grained_interleaved_index]: 2.94001e-06 [merge_cast_opt]: 1.48002e-06 [slice_recompute_activation]: 2.19999e-06 [micro_interleaved_order_control]: 2.91e-06 [assign_add_opt]: 1.54e-06 [ForceFp32Comm]: 1.17e-06 [remove_cast_before_assign_add]: 1.33002e-06 [full_micro_interleaved_order_control]: 2.62001e-06 [reorder_send_recv_between_fp_bp]: 2.82002e-06 [comm_op_add_attrs]: 1.14e-06 [add_comm_op_reuse_tag]: 1.21002e-06 [interleave_split_concat_branches]: 1.19003e-06 [interleave_parallel_branches]: 1.10001e-06 [overlap_opt_shard_in_pipeline]: 1.49998e-06 [overlap_opt_shard_grad_in_pipeline]: 1.89999e-06 [control_data_broadcast_order]: 2.03e-05 [grouped_pairwise_exchange_alltoall]: 1.61002e-06 [offloading_packed_experts]: 4.80999e-06 [overlap_recompute_and_grad_model_parallel]: 5.73002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30001e-06 [overlap_recompute_comm]: 2.56e-06 [overlap_grad_ring_attention]: 4.74e-06 [overlap_grad_flash_sp]: 2.753e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.65002e-06 [split_layernorm_comm]: 1.73997e-06 [handle_group_info]: 1.24e-06 [symbol_engine_optimizer]: 0.00010305, [1] [Cycle 1]: 9.764e-05, [6] [build]: 4.87e-06 [elim_shapecalc]: 1.77e-05 [elim_not_effective]: 1.742e-05 [opt_reshape]: 9.59e-06 [fold_const_symbol]: 1.27e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.67001e-06 [pipeline_parallel_scheduler]: 1.94e-06 [auto_monad_reorder]: 2.428e-05 [get_jit_bprop_graph]: 2.76e-06 [rewriter_after_jit_bprop_graph]: 7.76001e-06 [opt_after_jit_grad]: 0.00062382 [validate]: 5.453e-05 Sums bootstrap : 0.000452s : 1.57% type_inference : 0.006251s : 21.68% event_method : 0.000033s : 0.11% auto_monad : 0.000065s : 0.22% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000038s : 0.13% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.10% optimize.rewriter_before_opt_a : 0.000094s : 0.33% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000056s : 0.19% optimize.opt_a.loop_unroll : 0.000039s : 0.14% optimize.opt_a.a_1 : 0.000978s : 3.39% optimize.opt_a.with_stream_mark : 0.000037s : 0.13% optimize.opt_a.recompute_prepare : 0.000021s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000195s : 0.68% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.06% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000016s : 0.06% optimize.opt_a.merge_send_recv : 0.000018s : 0.06% optimize.opt_a.auto_parallel : 0.000016s : 0.06% optimize.opt_a.parallel : 0.000025s : 0.09% optimize.opt_a.flash_sp : 0.000013s : 0.05% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.07% optimize.opt_a.virtual_dataset : 0.000016s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.05% optimize.opt_a.virtual_output : 0.000015s : 0.05% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.03% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000028s : 0.10% optimize.opt_a.a_after_grad : 0.000024s : 0.08% optimize.opt_a.renormalize : 0.000840s : 2.91% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.11% optimize.opt_a.cse : 0.000060s : 0.21% optimize.opt_a.a_3 : 0.000104s : 0.36% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000045s : 0.15% optimize.convert_after_rewriter : 0.000008s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000613s : 2.13% optimize.opt_b.b_1 : 0.000156s : 0.54% optimize.opt_b.b_2 : 0.000010s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000030s : 0.10% optimize.loop_unroll : 0.016846s : 58.42% optimize.opt_after_cconv.c_1 : 0.000045s : 0.16% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000016s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000053s : 0.18% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.07% optimize.tuple_transform.d_1 : 0.000067s : 0.23% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000074s : 0.26% optimize.cse_after_recomputation.cse : 0.000019s : 0.06% optimize.environ_conv : 0.000009s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000004s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000020s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000028s : 0.10% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000018s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.08% get_jit_bprop_graph : 0.000003s : 0.01% rewriter_after_jit_bprop_graph : 0.000008s : 0.03% opt_after_jit_grad : 0.000624s : 2.16% validate : 0.000055s : 0.19% Time group info: ------[substitution.] 0.000225 38 11.00% : 0.000025s : 3: substitution.cast_eliminate 1.28% : 0.000003s : 3: substitution.elim_not_effective 0.80% : 0.000002s : 3: substitution.fold_const_symbol 3.70% : 0.000008s : 5: substitution.graph_param_transform 69.45% : 0.000156s : 4: substitution.inline 1.86% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.56% : 0.000006s : 6: substitution.remove_not_recompute_node 2.41% : 0.000005s : 4: substitution.replace_old_param 6.95% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006178 2 87.83% : 0.005426s : 1: type_inference.infer 12.17% : 0.000752s : 1: type_inference.specialize ------[replace.] 0.000069 8 56.29% : 0.000039s : 4: replace.inline 43.71% : 0.000030s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000166 8 91.85% : 0.000152s : 4: match.inline 8.15% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000273 1596 0.93% : 0.000003s : 17: predicate.accumulaten_eliminater 0.85% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 10: predicate.addn_check_dump 0.93% : 0.000003s : 17: predicate.addn_zero_filter 0.83% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.97% : 0.000005s : 27: predicate.arithmetic_simplify 1.08% : 0.000003s : 17: predicate.cast_eliminate 0.61% : 0.000002s : 10: predicate.check_bprop_eliminate 0.54% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.63% : 0.000002s : 10: predicate.depend_value_elim 0.94% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.02% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.02% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 0.53% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.07% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.07% : 0.000003s : 22: predicate.environ_get_depend_swap 1.78% : 0.000005s : 32: predicate.environ_get_eliminate 1.11% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.31% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.24% : 0.000006s : 25: predicate.float_depend_g_call 0.49% : 0.000001s : 10: predicate.float_environ_get_switch 0.74% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.58% : 0.000002s : 10: predicate.get_grad_eliminate 0.25% : 0.000001s : 5: predicate.graph_param_transform 0.58% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.63% : 0.000018s : 72: predicate.inline 0.88% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.75% : 0.000002s : 10: predicate.less_batch_normalization 2.02% : 0.000006s : 31: predicate.list_to_tuple_eliminator_ 2.49% : 0.000007s : 48: predicate.load_eliminater 3.15% : 0.000009s : 5: predicate.loop_unroll_after_grad 1.93% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.80% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 10: predicate.merge_addn 0.55% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.56% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 17: predicate.minmaximum_grad 1.05% : 0.000003s : 5: predicate.mutable_eliminate 0.41% : 0.000001s : 5: predicate.opt_reshape 0.30% : 0.000001s : 5: predicate.parallel_virtual_node 1.63% : 0.000004s : 25: predicate.partial_defer_inline 1.60% : 0.000004s : 26: predicate.partial_eliminate 0.89% : 0.000002s : 17: predicate.print_const_string_wrapper 0.55% : 0.000002s : 10: predicate.reduce_all_const_elim 1.16% : 0.000003s : 17: predicate.reduce_eliminate 2.46% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 10: predicate.remove_not_recompute_node 1.23% : 0.000003s : 31: predicate.replace_applicator 0.47% : 0.000001s : 10: predicate.replace_old_param 0.33% : 0.000001s : 5: predicate.reset_defer_inline 0.99% : 0.000003s : 17: predicate.reshape_eliminate 0.56% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 5: predicate.row_tensor_eliminate 0.66% : 0.000002s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.71% : 0.000002s : 10: predicate.shard_identity_eliminate 0.72% : 0.000002s : 10: predicate.special_op_eliminate 0.74% : 0.000002s : 10: predicate.specialize_transform 1.00% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.41% : 0.000004s : 25: predicate.switch_defer_inline 1.89% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.79% : 0.000013s : 76: predicate.switch_simplify 0.86% : 0.000002s : 17: predicate.tile_eliminate 0.86% : 0.000002s : 17: predicate.transpose_eliminate 1.69% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.35% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.49% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.70% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.32% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.03% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 5: predicate.value_based_eliminate 0.61% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.57% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000573 11 53.51% : 0.000307s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.49% : 0.000267s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.061042 192 0.01% : 0.000004s : 1: ForceFp32Comm 5.20% : 0.003177s : 1: add_attr 5.19% : 0.003165s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.13% : 0.000079s : 1: add_recomputation 0.01% : 0.000005s : 1: assign_add_opt 0.12% : 0.000071s : 1: auto_monad 0.05% : 0.000029s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.79% : 0.000483s : 1: bootstrap 0.06% : 0.000034s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000024s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.06% : 0.000034s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000012s : 1: environ_conv 0.07% : 0.000041s : 1: event_method 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000005s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000011s : 1: label_micro_interleaved_index 27.64% : 0.016874s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 1.02% : 0.000622s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000043s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000022s : 1: opt.transform.mutable_eliminate 2.47% : 0.001510s : 78: opt.transform.opt_a 0.07% : 0.000043s : 1: opt.transform.opt_after_cconv 0.06% : 0.000037s : 1: opt.transform.opt_after_jit_grad 0.22% : 0.000133s : 28: opt.transform.opt_b 0.12% : 0.000074s : 2: opt.transform.opt_trans_graph 0.09% : 0.000053s : 4: opt.transform.symbol_engine_opt 5.42% : 0.003307s : 1: opt_a 0.31% : 0.000188s : 1: opt_after_cconv 1.04% : 0.000635s : 1: opt_after_jit_grad 0.43% : 0.000261s : 1: opt_b 36.35% : 0.022189s : 1: optimize 0.04% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.05% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000006s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000042s : 1: pre_auto_parallel 0.05% : 0.000033s : 1: py_interpret_to_execute 0.03% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.04% : 0.000023s : 1: remove_dup_value 0.78% : 0.000477s : 1: renormalize.infer 0.58% : 0.000353s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000049s : 1: rewriter_after_opt_a 0.16% : 0.000098s : 1: rewriter_before_opt_a 0.01% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.17% : 0.000106s : 1: symbol_engine_optimizer 0.18% : 0.000109s : 1: tuple_transform 10.27% : 0.006270s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:17.327.379 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:17.327.660 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0166819, [21] [bootstrap]: 0.00045668 [type_inference]: 0.00610047 [event_method]: 2.017e-05 [auto_monad]: 6.429e-05 [graph_reusing]: 5.88002e-06 [inline]: 2.51998e-06 [add_attr]: 0.00315943, [1] [add_attr_with_inline]: 0.00314946, [1] [Cycle 1]: 8.463e-05, [2] [tag_attr]: 2.025e-05 [meta_addattr_fg_expand]: 5.97999e-06 [parallel-infer-symbol]: 3.41999e-06 [pre_auto_parallel]: 3.431e-05 [insert-virtual-dataset]: 2.47001e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 1.95001e-06 [pipeline_split]: 2.02001e-06 [optimize]: 0.00560887, [53] [py_interpret_to_execute]: 2.987e-05 [rewriter_before_opt_a]: 8.649e-05 [opt_a]: 0.00316086, [2] [Cycle 1]: 0.00227623, [45] [expand_dump_flag]: 3.31001e-06 [switch_simplify]: 4.272e-05 [loop_unroll]: 3.092e-05 [a_1]: 0.00062421 [with_stream_mark]: 2.097e-05 [recompute_prepare]: 1.033e-05 [updatestate_depend_eliminate]: 4.18001e-06 [updatestate_assign_eliminate]: 3.96001e-06 [updatestate_loads_eliminate]: 3.43e-06 [parameter_eliminate]: 2.34001e-06 [a_2]: 0.00010986 [accelerated_algorithm]: 8.57e-06 [shard]: 1.87001e-06 [meta_shard_fg_expand]: 2.18998e-06 [shard_inline]: 6.51e-06 [merge_send_recv]: 8.99e-06 [auto_parallel]: 7.48e-06 [parallel]: 1.97e-05 [flash_sp]: 9.31998e-06 [merge_comm]: 3.98999e-06 [allreduce_fusion]: 3.45e-06 [matmul_add_comm_reduction]: 1.027e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 1.006e-05 [virtual_dataset]: 6.88998e-06 [get_grad_eliminate_]: 6.78e-06 [virtual_output]: 7.10002e-06 [merge_forward]: 4.84998e-06 [cell_reuse_recompute_pass]: 1.56002e-06 [offload_activation]: 1.089e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.565e-05 [merge_recompute_call_nodes]: 1.65001e-06 [before_grad]: 1.171e-05 [set_forward_comm_id_for_comm_node_pass]: 4.27e-06 [meta_fg_expand]: 2.76e-06 [flash_sp_send_recv_attached]: 3.17002e-06 [receive_attached]: 2.36e-06 [after_resolve]: 1.304e-05 [a_after_grad]: 1.048e-05 [renormalize]: 0.00068509 [add_forward_monad_depend]: 6.28998e-06 [auto_monad_grad]: 2.68998e-06 [auto_monad_eliminator]: 1.655e-05 [cse]: 2.899e-05 [a_3]: 6.386e-05 [Cycle 2]: 0.00086924, [45] [expand_dump_flag]: 1.89e-06 [switch_simplify]: 8.61002e-06 [loop_unroll]: 6.61e-06 [a_1]: 0.00012968 [with_stream_mark]: 1.312e-05 [recompute_prepare]: 6.81999e-06 [updatestate_depend_eliminate]: 3.93001e-06 [updatestate_assign_eliminate]: 2.56998e-06 [updatestate_loads_eliminate]: 2.22001e-06 [parameter_eliminate]: 1.54998e-06 [a_2]: 0.00010018 [accelerated_algorithm]: 7.38e-06 [shard]: 1.84998e-06 [meta_shard_fg_expand]: 1.67999e-06 [shard_inline]: 7.43e-06 [merge_send_recv]: 6.19001e-06 [auto_parallel]: 5.60001e-06 [parallel]: 6.14001e-06 [flash_sp]: 3.60998e-06 [merge_comm]: 3.8e-06 [allreduce_fusion]: 3.25e-06 [matmul_add_comm_reduction]: 7.83999e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 9.40001e-06 [virtual_dataset]: 6.11998e-06 [get_grad_eliminate_]: 5.94999e-06 [virtual_output]: 5.87999e-06 [merge_forward]: 3.48e-06 [cell_reuse_recompute_pass]: 2.01e-06 [offload_activation]: 7.35e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.839e-05 [merge_recompute_call_nodes]: 8.39995e-07 [before_grad]: 1.072e-05 [set_forward_comm_id_for_comm_node_pass]: 4.12e-06 [meta_fg_expand]: 2.14999e-06 [flash_sp_send_recv_attached]: 1.19e-06 [receive_attached]: 1.54e-06 [after_resolve]: 1.114e-05 [a_after_grad]: 1.1e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.29001e-06 [auto_monad_grad]: 1.79e-06 [auto_monad_eliminator]: 9.96e-06 [cse]: 1.528e-05 [a_3]: 5.07e-05 [py_interpret_to_execute_after_opt_a]: 1.463e-05 [slice_cell_reuse_recomputed_activation]: 5.04998e-06 [rewriter_after_opt_a]: 4.261e-05 [convert_after_rewriter]: 9.79999e-06 [order_py_execute_after_rewriter]: 7.66999e-06 [mutable_eliminate]: 0.00057599 [opt_b]: 0.00029687, [1] [Cycle 1]: 0.00028678, [7] [b_1]: 0.00016712 [b_2]: 8.60999e-06 [updatestate_depend_eliminate]: 2.136e-05 [updatestate_assign_eliminate]: 3.09999e-06 [updatestate_loads_eliminate]: 2.68998e-06 [renormalize]: 8.29983e-07 [cse]: 2.147e-05 [optimize_parallel_all_gather_comm]: 2.063e-05 [overlap_param_gather]: 5.40001e-06 [cconv]: 3.375e-05 [loop_unroll]: 0.00046299 [opt_after_cconv]: 0.00013432, [1] [Cycle 1]: 0.000125, [7] [c_1]: 3.149e-05 [parameter_eliminate]: 3.78001e-06 [updatestate_depend_eliminate]: 8.09002e-06 [updatestate_assign_eliminate]: 2.83e-06 [updatestate_loads_eliminate]: 2.56e-06 [cse]: 1.955e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 1.667e-05 [tuple_transform]: 9.686e-05, [1] [Cycle 1]: 8.985e-05, [4] [d_1]: 4.703e-05 [none_parameter_eliminate]: 1.74e-06 [renormalize]: 2.20025e-07 [switch_simplify]: 7.68999e-06 [partial_unused_args_eliminate]: 4.86002e-06 [add_recomputation]: 5.474e-05 [cse_after_recomputation]: 2.878e-05, [1] [Cycle 1]: 2.122e-05, [1] [cse]: 1.203e-05 [environ_conv]: 8.65001e-06 [swap_dp_allreduce_reducescatter]: 8.64998e-06 [bias_add_comm_swap]: 5.20999e-06 [label_micro_interleaved_index]: 7.26001e-06 [label_fine_grained_interleaved_index]: 5.68002e-06 [merge_cast_opt]: 3.88001e-06 [slice_recompute_activation]: 4.68001e-06 [micro_interleaved_order_control]: 4.95001e-06 [assign_add_opt]: 3.52002e-06 [ForceFp32Comm]: 3.16999e-06 [remove_cast_before_assign_add]: 3.35003e-06 [full_micro_interleaved_order_control]: 4.52998e-06 [reorder_send_recv_between_fp_bp]: 5.30001e-06 [comm_op_add_attrs]: 3.69002e-06 [add_comm_op_reuse_tag]: 3.35e-06 [interleave_split_concat_branches]: 3.51001e-06 [interleave_parallel_branches]: 3.75e-06 [overlap_opt_shard_in_pipeline]: 3.83001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.12e-06 [control_data_broadcast_order]: 1.8e-05 [grouped_pairwise_exchange_alltoall]: 4.37e-06 [offloading_packed_experts]: 6.93e-06 [overlap_recompute_and_grad_model_parallel]: 8.07e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.20999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.7e-06 [overlap_recompute_comm]: 5.25001e-06 [overlap_grad_ring_attention]: 7.3e-06 [overlap_grad_flash_sp]: 2.455e-05 [begin_end_overlap_inline]: 3.43999e-06 [split_matmul_comm_elemetwise]: 4.72e-06 [split_layernorm_comm]: 4.17e-06 [handle_group_info]: 3.25998e-06 [symbol_engine_optimizer]: 0.00010744, [1] [Cycle 1]: 9.881e-05, [6] [build]: 3.57002e-06 [elim_shapecalc]: 1.362e-05 [elim_not_effective]: 1.486e-05 [opt_reshape]: 6.94999e-06 [fold_const_symbol]: 1.15e-05 [renormalize]: 1.79978e-07 [detach_backward]: 4.55001e-06 [pipeline_parallel_scheduler]: 2.16e-06 [auto_monad_reorder]: 2.104e-05 [get_jit_bprop_graph]: 2.02999e-06 [rewriter_after_jit_bprop_graph]: 5.39e-06 [opt_after_jit_grad]: 0.00053004 [validate]: 3.987e-05 Sums bootstrap : 0.000457s : 3.91% type_inference : 0.006100s : 52.28% event_method : 0.000020s : 0.17% auto_monad : 0.000064s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000030s : 0.26% optimize.rewriter_before_opt_a : 0.000086s : 0.74% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000051s : 0.44% optimize.opt_a.loop_unroll : 0.000038s : 0.32% optimize.opt_a.a_1 : 0.000754s : 6.46% optimize.opt_a.with_stream_mark : 0.000034s : 0.29% optimize.opt_a.recompute_prepare : 0.000017s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000210s : 1.80% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.14% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.12% optimize.opt_a.merge_send_recv : 0.000015s : 0.13% optimize.opt_a.auto_parallel : 0.000013s : 0.11% optimize.opt_a.parallel : 0.000026s : 0.22% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.17% optimize.opt_a.virtual_dataset : 0.000013s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.11% optimize.opt_a.virtual_output : 0.000013s : 0.11% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.21% optimize.opt_a.a_after_grad : 0.000021s : 0.18% optimize.opt_a.renormalize : 0.000685s : 5.87% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.23% optimize.opt_a.cse : 0.000044s : 0.38% optimize.opt_a.a_3 : 0.000115s : 0.98% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000043s : 0.37% optimize.convert_after_rewriter : 0.000010s : 0.08% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000576s : 4.94% optimize.opt_b.b_1 : 0.000167s : 1.43% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000021s : 0.18% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000021s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000034s : 0.29% optimize.loop_unroll : 0.000463s : 3.97% optimize.opt_after_cconv.c_1 : 0.000031s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000020s : 0.17% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.14% optimize.tuple_transform.d_1 : 0.000047s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000055s : 0.47% optimize.cse_after_recomputation.cse : 0.000012s : 0.10% optimize.environ_conv : 0.000009s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000025s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000021s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000530s : 4.54% validate : 0.000040s : 0.34% Time group info: ------[substitution.] 0.000183 28 1.11% : 0.000002s : 2: substitution.elim_not_effective 0.74% : 0.000001s : 2: substitution.fold_const_symbol 3.45% : 0.000006s : 4: substitution.graph_param_transform 78.44% : 0.000144s : 4: substitution.inline 2.16% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.65% : 0.000005s : 4: substitution.remove_not_recompute_node 3.16% : 0.000006s : 4: substitution.replace_old_param 8.29% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006044 2 87.79% : 0.005306s : 1: type_inference.infer 12.21% : 0.000738s : 1: type_inference.specialize ------[replace.] 0.000063 8 63.37% : 0.000040s : 4: replace.inline 36.63% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000154 8 91.49% : 0.000141s : 4: match.inline 8.51% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000210 1278 0.86% : 0.000002s : 13: predicate.accumulaten_eliminater 0.91% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 0.94% : 0.000002s : 13: predicate.addn_zero_filter 0.77% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.00% : 0.000004s : 21: predicate.arithmetic_simplify 0.93% : 0.000002s : 13: predicate.cast_eliminate 0.59% : 0.000001s : 8: predicate.check_bprop_eliminate 0.61% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 0.85% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.01% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.82% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.24% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 4: predicate.elim_not_effective 0.50% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_depend_swap 1.89% : 0.000004s : 25: predicate.environ_get_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.39% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.41% : 0.000005s : 21: predicate.float_depend_g_call 0.51% : 0.000001s : 8: predicate.float_environ_get_switch 0.76% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.74% : 0.000002s : 8: predicate.get_grad_eliminate 0.29% : 0.000001s : 4: predicate.graph_param_transform 0.59% : 0.000001s : 8: predicate.incorporate_call 0.47% : 0.000001s : 8: predicate.incorporate_call_switch 6.70% : 0.000014s : 58: predicate.inline 1.37% : 0.000003s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.94% : 0.000002s : 8: predicate.less_batch_normalization 1.73% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.35% : 0.000005s : 38: predicate.load_eliminater 1.03% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.36% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.52% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 8: predicate.merge_addn 0.52% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 13: predicate.minmaximum_grad 1.12% : 0.000002s : 4: predicate.mutable_eliminate 0.32% : 0.000001s : 4: predicate.opt_reshape 0.38% : 0.000001s : 4: predicate.parallel_virtual_node 1.83% : 0.000004s : 21: predicate.partial_defer_inline 1.60% : 0.000003s : 21: predicate.partial_eliminate 0.82% : 0.000002s : 13: predicate.print_const_string_wrapper 0.54% : 0.000001s : 8: predicate.reduce_all_const_elim 1.06% : 0.000002s : 13: predicate.reduce_eliminate 2.37% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.53% : 0.000001s : 8: predicate.remove_not_recompute_node 1.41% : 0.000003s : 25: predicate.replace_applicator 0.51% : 0.000001s : 8: predicate.replace_old_param 0.40% : 0.000001s : 4: predicate.reset_defer_inline 0.94% : 0.000002s : 13: predicate.reshape_eliminate 0.85% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 4: predicate.row_tensor_eliminate 0.88% : 0.000002s : 8: predicate.same_eliminate 0.69% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.29% : 0.000003s : 8: predicate.shard_identity_eliminate 0.71% : 0.000001s : 8: predicate.special_op_eliminate 0.66% : 0.000001s : 8: predicate.specialize_transform 1.08% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.92% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.48% : 0.000003s : 21: predicate.switch_defer_inline 2.02% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.92% : 0.000010s : 67: predicate.switch_simplify 0.95% : 0.000002s : 13: predicate.tile_eliminate 0.85% : 0.000002s : 13: predicate.transpose_eliminate 1.52% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.44% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.19% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.54% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.04% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.85% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.32% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.98% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.30% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 8: predicate.virtual_output_eliminate 0.23% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000553 11 52.34% : 0.000289s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.66% : 0.000263s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027491 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.53% : 0.003169s : 1: add_attr 11.47% : 0.003153s : 1: add_attr_with_inline 0.03% : 0.000007s : 1: add_comm_op_reuse_tag 0.21% : 0.000059s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.27% : 0.000074s : 1: auto_monad 0.10% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.84% : 0.000505s : 1: bootstrap 0.13% : 0.000037s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.12% : 0.000032s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.08% : 0.000022s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000030s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000013s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000007s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.71% : 0.000470s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.12% : 0.000583s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000020s : 1: opt.transform.mutable_eliminate 4.36% : 0.001200s : 78: opt.transform.opt_a 0.11% : 0.000030s : 1: opt.transform.opt_after_cconv 0.11% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.38% : 0.000103s : 28: opt.transform.opt_b 0.19% : 0.000053s : 2: opt.transform.opt_trans_graph 0.16% : 0.000043s : 4: opt.transform.symbol_engine_opt 11.51% : 0.003164s : 1: opt_a 0.50% : 0.000138s : 1: opt_after_cconv 1.97% : 0.000542s : 1: opt_after_jit_grad 1.10% : 0.000301s : 1: opt_b 21.70% : 0.005965s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000011s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000042s : 1: pre_auto_parallel 0.12% : 0.000034s : 1: py_interpret_to_execute 0.07% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.28% : 0.000353s : 1: renormalize.infer 1.17% : 0.000323s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000046s : 1: rewriter_after_opt_a 0.33% : 0.000090s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000110s : 1: symbol_engine_optimizer 0.36% : 0.000100s : 1: tuple_transform 22.33% : 0.006139s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:17.870.306 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0311249, [21] [bootstrap]: 0.00041633 [type_inference]: 0.00549207 [event_method]: 1.732e-05 [auto_monad]: 5.987e-05 [graph_reusing]: 5.79e-06 [inline]: 2.26e-06 [add_attr]: 0.00304549, [1] [add_attr_with_inline]: 0.0030379, [1] [Cycle 1]: 5.73e-05, [2] [tag_attr]: 1.876e-05 [meta_addattr_fg_expand]: 5.78997e-06 [parallel-infer-symbol]: 2.96001e-06 [pre_auto_parallel]: 3.062e-05 [insert-virtual-dataset]: 2.22999e-06 [parallel-infer-symbol-second]: 8.89995e-07 [dataset_repeat_opt]: 2.00002e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.0213662, [53] [py_interpret_to_execute]: 2.519e-05 [rewriter_before_opt_a]: 7.992e-05 [opt_a]: 0.0190358, [2] [Cycle 1]: 0.0183494, [45] [expand_dump_flag]: 3.19001e-06 [switch_simplify]: 4.269e-05 [loop_unroll]: 2.944e-05 [a_1]: 0.00060768 [with_stream_mark]: 1.491e-05 [recompute_prepare]: 9.25999e-06 [updatestate_depend_eliminate]: 3.81999e-06 [updatestate_assign_eliminate]: 3.46001e-06 [updatestate_loads_eliminate]: 2.75002e-06 [parameter_eliminate]: 1.89e-06 [a_2]: 0.0162009 [accelerated_algorithm]: 1.17e-05 [shard]: 4.37e-06 [meta_shard_fg_expand]: 5.12e-06 [shard_inline]: 7.34002e-06 [merge_send_recv]: 1.835e-05 [auto_parallel]: 1.517e-05 [parallel]: 2.23e-05 [flash_sp]: 1.368e-05 [merge_comm]: 4.35999e-06 [allreduce_fusion]: 3.41001e-06 [matmul_add_comm_reduction]: 1.331e-05 [allreduce_slice_to_reducescatter]: 1.04e-06 [virtual_shard_identity]: 9.10001e-06 [virtual_dataset]: 7.6e-06 [get_grad_eliminate_]: 6.44999e-06 [virtual_output]: 6.84999e-06 [merge_forward]: 4.27e-06 [cell_reuse_recompute_pass]: 3.01001e-06 [offload_activation]: 1.032e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.23e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.151e-05 [set_forward_comm_id_for_comm_node_pass]: 3.93999e-06 [meta_fg_expand]: 3.37002e-06 [flash_sp_send_recv_attached]: 3.06001e-06 [receive_attached]: 2.61e-06 [after_resolve]: 1.392e-05 [a_after_grad]: 1.065e-05 [renormalize]: 0.00078794 [add_forward_monad_depend]: 7.57002e-06 [auto_monad_grad]: 2.61e-06 [auto_monad_eliminator]: 1.657e-05 [cse]: 3.118e-05 [a_3]: 4.979e-05 [Cycle 2]: 0.00067372, [45] [expand_dump_flag]: 2.44001e-06 [switch_simplify]: 7.78999e-06 [loop_unroll]: 7.03998e-06 [a_1]: 0.00013803 [with_stream_mark]: 1.493e-05 [recompute_prepare]: 6.96999e-06 [updatestate_depend_eliminate]: 3.77998e-06 [updatestate_assign_eliminate]: 3.01001e-06 [updatestate_loads_eliminate]: 3.11999e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 7.253e-05 [accelerated_algorithm]: 6.51e-06 [shard]: 1.65001e-06 [meta_shard_fg_expand]: 1.47999e-06 [shard_inline]: 6.14999e-06 [merge_send_recv]: 4.84e-06 [auto_parallel]: 5.81e-06 [parallel]: 5.67001e-06 [flash_sp]: 3.85e-06 [merge_comm]: 3.23998e-06 [allreduce_fusion]: 3.23e-06 [matmul_add_comm_reduction]: 5.91998e-06 [allreduce_slice_to_reducescatter]: 5.00004e-07 [virtual_shard_identity]: 7.08e-06 [virtual_dataset]: 6.24999e-06 [get_grad_eliminate_]: 5.62999e-06 [virtual_output]: 6.33e-06 [merge_forward]: 2.98e-06 [cell_reuse_recompute_pass]: 1.57999e-06 [offload_activation]: 7.03e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.273e-05 [merge_recompute_call_nodes]: 1.00999e-06 [before_grad]: 1.019e-05 [set_forward_comm_id_for_comm_node_pass]: 3.73999e-06 [meta_fg_expand]: 2.21e-06 [flash_sp_send_recv_attached]: 1.07e-06 [receive_attached]: 1.30999e-06 [after_resolve]: 1.061e-05 [a_after_grad]: 9.77999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.35999e-06 [auto_monad_grad]: 1.10999e-06 [auto_monad_eliminator]: 7.34002e-06 [cse]: 1.382e-05 [a_3]: 3.568e-05 [py_interpret_to_execute_after_opt_a]: 1.234e-05 [slice_cell_reuse_recomputed_activation]: 2.47001e-06 [rewriter_after_opt_a]: 3.784e-05 [convert_after_rewriter]: 7e-06 [order_py_execute_after_rewriter]: 5.23002e-06 [mutable_eliminate]: 0.00071375 [opt_b]: 0.00020557, [1] [Cycle 1]: 0.0001985, [7] [b_1]: 0.00012423 [b_2]: 8.47e-06 [updatestate_depend_eliminate]: 6.16998e-06 [updatestate_assign_eliminate]: 2.46998e-06 [updatestate_loads_eliminate]: 2.76e-06 [renormalize]: 4.19997e-07 [cse]: 1.734e-05 [optimize_parallel_all_gather_comm]: 1.633e-05 [overlap_param_gather]: 2.10002e-06 [cconv]: 2.54e-05 [loop_unroll]: 0.00043979 [opt_after_cconv]: 9.976e-05, [1] [Cycle 1]: 9.398e-05, [7] [c_1]: 3.101e-05 [parameter_eliminate]: 2.54001e-06 [updatestate_depend_eliminate]: 4.70001e-06 [updatestate_assign_eliminate]: 2.44001e-06 [updatestate_loads_eliminate]: 2.24999e-06 [cse]: 1.634e-05 [renormalize]: 3.19997e-07 [remove_dup_value]: 1.248e-05 [tuple_transform]: 7.607e-05, [1] [Cycle 1]: 7.143e-05, [4] [d_1]: 4.384e-05 [none_parameter_eliminate]: 1.71998e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 6.76999e-06 [partial_unused_args_eliminate]: 1.84e-06 [add_recomputation]: 6.975e-05 [cse_after_recomputation]: 2.228e-05, [1] [Cycle 1]: 1.716e-05, [1] [cse]: 1.158e-05 [environ_conv]: 5.06002e-06 [swap_dp_allreduce_reducescatter]: 7.36001e-06 [bias_add_comm_swap]: 3.09999e-06 [label_micro_interleaved_index]: 4.63999e-06 [label_fine_grained_interleaved_index]: 2.70002e-06 [merge_cast_opt]: 1.42999e-06 [slice_recompute_activation]: 2.31e-06 [micro_interleaved_order_control]: 2.40002e-06 [assign_add_opt]: 1.22e-06 [ForceFp32Comm]: 1.14e-06 [remove_cast_before_assign_add]: 1.30001e-06 [full_micro_interleaved_order_control]: 2.13002e-06 [reorder_send_recv_between_fp_bp]: 2.96001e-06 [comm_op_add_attrs]: 1.11002e-06 [add_comm_op_reuse_tag]: 9.5999e-07 [interleave_split_concat_branches]: 1.23002e-06 [interleave_parallel_branches]: 1.15001e-06 [overlap_opt_shard_in_pipeline]: 1.54e-06 [overlap_opt_shard_grad_in_pipeline]: 2.25002e-06 [control_data_broadcast_order]: 1.374e-05 [grouped_pairwise_exchange_alltoall]: 1.74e-06 [offloading_packed_experts]: 4.22e-06 [overlap_recompute_and_grad_model_parallel]: 4.45e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.77001e-06 [overlap_recompute_comm]: 2.79001e-06 [overlap_grad_ring_attention]: 4.02998e-06 [overlap_grad_flash_sp]: 1.996e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.19001e-06 [split_layernorm_comm]: 1.86003e-06 [handle_group_info]: 9.10019e-07 [symbol_engine_optimizer]: 8.138e-05, [1] [Cycle 1]: 7.649e-05, [6] [build]: 3.86001e-06 [elim_shapecalc]: 1.095e-05 [elim_not_effective]: 1.382e-05 [opt_reshape]: 7.06999e-06 [fold_const_symbol]: 1.084e-05 [renormalize]: 2.60014e-07 [detach_backward]: 2.38998e-06 [pipeline_parallel_scheduler]: 1.77001e-06 [auto_monad_reorder]: 1.718e-05 [get_jit_bprop_graph]: 1.94e-06 [rewriter_after_jit_bprop_graph]: 3.78001e-06 [opt_after_jit_grad]: 0.00046893 [validate]: 3.936e-05 Sums bootstrap : 0.000416s : 1.54% type_inference : 0.005492s : 20.34% event_method : 0.000017s : 0.06% auto_monad : 0.000060s : 0.22% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000031s : 0.11% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000025s : 0.09% optimize.rewriter_before_opt_a : 0.000080s : 0.30% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000050s : 0.19% optimize.opt_a.loop_unroll : 0.000036s : 0.14% optimize.opt_a.a_1 : 0.000746s : 2.76% optimize.opt_a.with_stream_mark : 0.000030s : 0.11% optimize.opt_a.recompute_prepare : 0.000016s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.016273s : 60.28% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.07% optimize.opt_a.shard : 0.000006s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000007s : 0.02% optimize.opt_a.shard_inline : 0.000013s : 0.05% optimize.opt_a.merge_send_recv : 0.000023s : 0.09% optimize.opt_a.auto_parallel : 0.000021s : 0.08% optimize.opt_a.parallel : 0.000028s : 0.10% optimize.opt_a.flash_sp : 0.000018s : 0.06% optimize.opt_a.merge_comm : 0.000008s : 0.03% optimize.opt_a.allreduce_fusion : 0.000007s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.06% optimize.opt_a.virtual_dataset : 0.000014s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.04% optimize.opt_a.virtual_output : 0.000013s : 0.05% optimize.opt_a.merge_forward : 0.000007s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.13% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000022s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.03% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.09% optimize.opt_a.a_after_grad : 0.000020s : 0.08% optimize.opt_a.renormalize : 0.000788s : 2.92% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.09% optimize.opt_a.cse : 0.000045s : 0.17% optimize.opt_a.a_3 : 0.000085s : 0.32% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000038s : 0.14% optimize.convert_after_rewriter : 0.000007s : 0.03% optimize.order_py_execute_after_rewriter : 0.000005s : 0.02% optimize.mutable_eliminate : 0.000714s : 2.64% optimize.opt_b.b_1 : 0.000124s : 0.46% optimize.opt_b.b_2 : 0.000008s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000017s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000025s : 0.09% optimize.loop_unroll : 0.000440s : 1.63% optimize.opt_after_cconv.c_1 : 0.000031s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000016s : 0.06% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000012s : 0.05% optimize.tuple_transform.d_1 : 0.000044s : 0.16% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000070s : 0.26% optimize.cse_after_recomputation.cse : 0.000012s : 0.04% optimize.environ_conv : 0.000005s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.03% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000020s : 0.07% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000017s : 0.06% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000469s : 1.74% validate : 0.000039s : 0.15% Time group info: ------[substitution.] 0.000174 28 1.18% : 0.000002s : 2: substitution.elim_not_effective 0.73% : 0.000001s : 2: substitution.fold_const_symbol 3.32% : 0.000006s : 4: substitution.graph_param_transform 77.34% : 0.000135s : 4: substitution.inline 2.62% : 0.000005s : 4: substitution.j_node_and_user_rematch 3.02% : 0.000005s : 4: substitution.remove_not_recompute_node 3.51% : 0.000006s : 4: substitution.replace_old_param 8.27% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005438 2 88.12% : 0.004792s : 1: type_inference.infer 11.88% : 0.000646s : 1: type_inference.specialize ------[replace.] 0.000057 8 62.38% : 0.000036s : 4: replace.inline 37.62% : 0.000021s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000144 8 91.42% : 0.000132s : 4: match.inline 8.58% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000223 1278 0.89% : 0.000002s : 13: predicate.accumulaten_eliminater 0.60% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 8: predicate.addn_check_dump 0.90% : 0.000002s : 13: predicate.addn_zero_filter 0.77% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.54% : 0.000006s : 21: predicate.arithmetic_simplify 0.95% : 0.000002s : 13: predicate.cast_eliminate 0.55% : 0.000001s : 8: predicate.check_bprop_eliminate 0.60% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.64% : 0.000001s : 8: predicate.depend_value_elim 0.88% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.07% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.97% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.94% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.31% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.06% : 0.000002s : 17: predicate.environ_add_const_eliminate 0.99% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_depend_swap 1.67% : 0.000004s : 25: predicate.environ_get_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.47% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.34% : 0.000005s : 21: predicate.float_depend_g_call 0.69% : 0.000002s : 8: predicate.float_environ_get_switch 0.87% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.62% : 0.000001s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.74% : 0.000002s : 8: predicate.incorporate_call 0.47% : 0.000001s : 8: predicate.incorporate_call_switch 5.97% : 0.000013s : 58: predicate.inline 0.88% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.86% : 0.000002s : 8: predicate.less_batch_normalization 1.72% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.30% : 0.000005s : 38: predicate.load_eliminater 0.80% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.33% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.68% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.70% : 0.000002s : 8: predicate.merge_addn 0.66% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 13: predicate.minmaximum_grad 0.78% : 0.000002s : 4: predicate.mutable_eliminate 0.32% : 0.000001s : 4: predicate.opt_reshape 0.33% : 0.000001s : 4: predicate.parallel_virtual_node 1.64% : 0.000004s : 21: predicate.partial_defer_inline 1.52% : 0.000003s : 21: predicate.partial_eliminate 0.90% : 0.000002s : 13: predicate.print_const_string_wrapper 0.72% : 0.000002s : 8: predicate.reduce_all_const_elim 1.34% : 0.000003s : 13: predicate.reduce_eliminate 2.34% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.53% : 0.000001s : 8: predicate.remove_not_recompute_node 1.22% : 0.000003s : 25: predicate.replace_applicator 0.48% : 0.000001s : 8: predicate.replace_old_param 0.24% : 0.000001s : 4: predicate.reset_defer_inline 1.00% : 0.000002s : 13: predicate.reshape_eliminate 0.56% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.54% : 0.000001s : 4: predicate.row_tensor_eliminate 0.68% : 0.000002s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 8: predicate.shard_identity_eliminate 0.66% : 0.000001s : 8: predicate.special_op_eliminate 1.46% : 0.000003s : 8: predicate.specialize_transform 0.72% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.41% : 0.000003s : 21: predicate.switch_defer_inline 1.89% : 0.000004s : 29: predicate.switch_layer_defer_inline 7.31% : 0.000016s : 67: predicate.switch_simplify 0.80% : 0.000002s : 13: predicate.tile_eliminate 0.88% : 0.000002s : 13: predicate.transpose_eliminate 1.67% : 0.000004s : 21: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.53% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.49% : 0.000006s : 29: predicate.tuple_list_set_item_eliminator 1.66% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.18% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.93% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.55% : 0.000001s : 4: predicate.value_based_eliminate 0.61% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.65% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000532 11 51.73% : 0.000275s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.27% : 0.000257s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.057685 192 0.01% : 0.000004s : 1: ForceFp32Comm 5.29% : 0.003050s : 1: add_attr 5.27% : 0.003042s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.13% : 0.000074s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.11% : 0.000065s : 1: auto_monad 0.04% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.77% : 0.000444s : 1: bootstrap 0.05% : 0.000029s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000017s : 1: control_data_broadcast_order 0.02% : 0.000010s : 1: convert_after_rewriter 0.04% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.12% : 0.000069s : 1: environ_conv 0.04% : 0.000023s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000009s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.78% : 0.000448s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 1.25% : 0.000723s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000015s : 1: opt.transform.mutable_eliminate 2.11% : 0.001218s : 78: opt.transform.opt_a 0.05% : 0.000029s : 1: opt.transform.opt_after_cconv 0.04% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.18% : 0.000101s : 28: opt.transform.opt_b 0.08% : 0.000049s : 2: opt.transform.opt_trans_graph 0.07% : 0.000039s : 4: opt.transform.symbol_engine_opt 33.01% : 0.019039s : 1: opt_a 0.18% : 0.000103s : 1: opt_after_cconv 0.83% : 0.000478s : 1: opt_after_jit_grad 0.36% : 0.000209s : 1: opt_b 37.05% : 0.021372s : 1: optimize 0.03% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.04% : 0.000023s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.06% : 0.000035s : 1: pre_auto_parallel 0.05% : 0.000029s : 1: py_interpret_to_execute 0.03% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000016s : 1: remove_dup_value 0.70% : 0.000406s : 1: renormalize.infer 0.64% : 0.000370s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000042s : 1: rewriter_after_opt_a 0.15% : 0.000084s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000084s : 1: symbol_engine_optimizer 0.14% : 0.000079s : 1: tuple_transform 9.55% : 0.005507s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:18.343.658 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:18.343.920 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0385171, [21] [bootstrap]: 0.00042436 [type_inference]: 0.0177336 [event_method]: 1.948e-05 [auto_monad]: 6.29e-05 [graph_reusing]: 6.17999e-06 [inline]: 2.27999e-06 [add_attr]: 0.00314312, [1] [add_attr_with_inline]: 0.00313278, [1] [Cycle 1]: 7.58e-05, [2] [tag_attr]: 1.97e-05 [meta_addattr_fg_expand]: 6.42001e-06 [parallel-infer-symbol]: 3.93999e-06 [pre_auto_parallel]: 5.675e-05 [insert-virtual-dataset]: 2.63003e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.04999e-06 [pipeline_split]: 1.55999e-06 [optimize]: 0.0157809, [53] [py_interpret_to_execute]: 3.158e-05 [rewriter_before_opt_a]: 9.234e-05 [opt_a]: 0.0130959, [2] [Cycle 1]: 0.0119316, [45] [expand_dump_flag]: 3.09999e-06 [switch_simplify]: 4.441e-05 [loop_unroll]: 3.198e-05 [a_1]: 0.00074986 [with_stream_mark]: 1.801e-05 [recompute_prepare]: 1.07e-05 [updatestate_depend_eliminate]: 4.89998e-06 [updatestate_assign_eliminate]: 4.74e-06 [updatestate_loads_eliminate]: 4.08001e-06 [parameter_eliminate]: 1.94e-06 [a_2]: 0.00012925 [accelerated_algorithm]: 8.59e-06 [shard]: 2.29999e-06 [meta_shard_fg_expand]: 2.06e-06 [shard_inline]: 7.93999e-06 [merge_send_recv]: 9.92001e-06 [auto_parallel]: 7.30998e-06 [parallel]: 1.867e-05 [flash_sp]: 8.23999e-06 [merge_comm]: 4.85999e-06 [allreduce_fusion]: 4.25e-06 [matmul_add_comm_reduction]: 1.138e-05 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 9.51e-06 [virtual_dataset]: 8.45001e-06 [get_grad_eliminate_]: 7.88001e-06 [virtual_output]: 8.00999e-06 [merge_forward]: 4.48999e-06 [cell_reuse_recompute_pass]: 1.35999e-06 [offload_activation]: 1.138e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.862e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.236e-05 [set_forward_comm_id_for_comm_node_pass]: 4.75999e-06 [meta_fg_expand]: 3.56001e-06 [flash_sp_send_recv_attached]: 2.53e-06 [receive_attached]: 2.06e-06 [after_resolve]: 1.23e-05 [a_after_grad]: 1.181e-05 [renormalize]: 0.00080866 [add_forward_monad_depend]: 5.89999e-06 [auto_monad_grad]: 2.26e-06 [auto_monad_eliminator]: 1.788e-05 [cse]: 3.686e-05 [a_3]: 0.00943266 [Cycle 2]: 0.0011448, [45] [expand_dump_flag]: 3.96001e-06 [switch_simplify]: 1.529e-05 [loop_unroll]: 8.91997e-06 [a_1]: 0.00021128 [with_stream_mark]: 3.51e-05 [recompute_prepare]: 8.87e-06 [updatestate_depend_eliminate]: 5.30999e-06 [updatestate_assign_eliminate]: 3.49001e-06 [updatestate_loads_eliminate]: 3.73001e-06 [parameter_eliminate]: 2.66999e-06 [a_2]: 0.00013482 [accelerated_algorithm]: 8.40999e-06 [shard]: 2.39001e-06 [meta_shard_fg_expand]: 3.09999e-06 [shard_inline]: 7.42002e-06 [merge_send_recv]: 1.192e-05 [auto_parallel]: 1.063e-05 [parallel]: 1.097e-05 [flash_sp]: 4.95001e-06 [merge_comm]: 4.21001e-06 [allreduce_fusion]: 4.12e-06 [matmul_add_comm_reduction]: 1.421e-05 [allreduce_slice_to_reducescatter]: 9.29984e-07 [virtual_shard_identity]: 9.34998e-06 [virtual_dataset]: 7.75e-06 [get_grad_eliminate_]: 7.38999e-06 [virtual_output]: 7.25e-06 [merge_forward]: 4.65999e-06 [cell_reuse_recompute_pass]: 3.26999e-06 [offload_activation]: 1.089e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.926e-05 [merge_recompute_call_nodes]: 1.71e-06 [before_grad]: 1.314e-05 [set_forward_comm_id_for_comm_node_pass]: 4.55001e-06 [meta_fg_expand]: 3.78999e-06 [flash_sp_send_recv_attached]: 2.02999e-06 [receive_attached]: 2.52001e-06 [after_resolve]: 1.515e-05 [a_after_grad]: 1.269e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.63998e-06 [auto_monad_grad]: 2.89001e-06 [auto_monad_eliminator]: 1.788e-05 [cse]: 4.084e-05 [a_3]: 6.028e-05 [py_interpret_to_execute_after_opt_a]: 2.35e-05 [slice_cell_reuse_recomputed_activation]: 4.88001e-06 [rewriter_after_opt_a]: 5.212e-05 [convert_after_rewriter]: 1.133e-05 [order_py_execute_after_rewriter]: 8.81002e-06 [mutable_eliminate]: 0.00073616 [opt_b]: 0.00031532, [1] [Cycle 1]: 0.00030399, [7] [b_1]: 0.00019951 [b_2]: 9.92999e-06 [updatestate_depend_eliminate]: 6.29001e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 2.99001e-06 [renormalize]: 1.05999e-06 [cse]: 2.42e-05 [optimize_parallel_all_gather_comm]: 2.134e-05 [overlap_param_gather]: 5.10999e-06 [cconv]: 3.863e-05 [loop_unroll]: 0.00046718 [opt_after_cconv]: 0.00014157, [1] [Cycle 1]: 0.00013234, [7] [c_1]: 3.868e-05 [parameter_eliminate]: 2.96001e-06 [updatestate_depend_eliminate]: 5.77999e-06 [updatestate_assign_eliminate]: 3.09001e-06 [updatestate_loads_eliminate]: 3.04001e-06 [cse]: 2.273e-05 [renormalize]: 4.50003e-07 [remove_dup_value]: 2.001e-05 [tuple_transform]: 0.00010373, [1] [Cycle 1]: 9.593e-05, [4] [d_1]: 5.347e-05 [none_parameter_eliminate]: 1.81e-06 [renormalize]: 4.09986e-07 [switch_simplify]: 9.27999e-06 [partial_unused_args_eliminate]: 4.57e-06 [add_recomputation]: 6.286e-05 [cse_after_recomputation]: 3.198e-05, [1] [Cycle 1]: 2.464e-05, [1] [cse]: 1.538e-05 [environ_conv]: 1.007e-05 [swap_dp_allreduce_reducescatter]: 9.00001e-06 [bias_add_comm_swap]: 5.97001e-06 [label_micro_interleaved_index]: 7.3e-06 [label_fine_grained_interleaved_index]: 5.44e-06 [merge_cast_opt]: 3.71001e-06 [slice_recompute_activation]: 4.28001e-06 [micro_interleaved_order_control]: 4.90999e-06 [assign_add_opt]: 3.72998e-06 [ForceFp32Comm]: 3.26001e-06 [remove_cast_before_assign_add]: 3.81001e-06 [full_micro_interleaved_order_control]: 4.59998e-06 [reorder_send_recv_between_fp_bp]: 5.19998e-06 [comm_op_add_attrs]: 3.81999e-06 [add_comm_op_reuse_tag]: 3.67002e-06 [interleave_split_concat_branches]: 3.80998e-06 [interleave_parallel_branches]: 3.73999e-06 [overlap_opt_shard_in_pipeline]: 3.58e-06 [overlap_opt_shard_grad_in_pipeline]: 4.67998e-06 [control_data_broadcast_order]: 1.771e-05 [grouped_pairwise_exchange_alltoall]: 3.86999e-06 [offloading_packed_experts]: 7.46999e-06 [overlap_recompute_and_grad_model_parallel]: 7.29001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.60998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.86999e-06 [overlap_recompute_comm]: 5.09998e-06 [overlap_grad_ring_attention]: 7.27002e-06 [overlap_grad_flash_sp]: 2.626e-05 [begin_end_overlap_inline]: 3.04001e-06 [split_matmul_comm_elemetwise]: 4.37e-06 [split_layernorm_comm]: 4.27e-06 [handle_group_info]: 3.14999e-06 [symbol_engine_optimizer]: 0.00010561, [1] [Cycle 1]: 9.823e-05, [6] [build]: 3.91999e-06 [elim_shapecalc]: 1.157e-05 [elim_not_effective]: 1.512e-05 [opt_reshape]: 8.74e-06 [fold_const_symbol]: 1.24e-05 [renormalize]: 2.19996e-07 [detach_backward]: 4.62e-06 [pipeline_parallel_scheduler]: 1.96e-06 [auto_monad_reorder]: 2.503e-05 [get_jit_bprop_graph]: 2.35002e-06 [rewriter_after_jit_bprop_graph]: 5.46e-06 [opt_after_jit_grad]: 0.00051034 [validate]: 4.791e-05 Sums bootstrap : 0.000424s : 1.27% type_inference : 0.017734s : 53.02% event_method : 0.000019s : 0.06% auto_monad : 0.000063s : 0.19% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000057s : 0.17% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000032s : 0.09% optimize.rewriter_before_opt_a : 0.000092s : 0.28% optimize.opt_a.expand_dump_flag : 0.000007s : 0.02% optimize.opt_a.switch_simplify : 0.000060s : 0.18% optimize.opt_a.loop_unroll : 0.000041s : 0.12% optimize.opt_a.a_1 : 0.000961s : 2.87% optimize.opt_a.with_stream_mark : 0.000053s : 0.16% optimize.opt_a.recompute_prepare : 0.000020s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.02% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000264s : 0.79% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.05% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000015s : 0.05% optimize.opt_a.merge_send_recv : 0.000022s : 0.07% optimize.opt_a.auto_parallel : 0.000018s : 0.05% optimize.opt_a.parallel : 0.000030s : 0.09% optimize.opt_a.flash_sp : 0.000013s : 0.04% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.06% optimize.opt_a.virtual_dataset : 0.000016s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.05% optimize.opt_a.virtual_output : 0.000015s : 0.05% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000022s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000027s : 0.08% optimize.opt_a.a_after_grad : 0.000025s : 0.07% optimize.opt_a.renormalize : 0.000809s : 2.42% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.03% optimize.opt_a.auto_monad_grad : 0.000005s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000036s : 0.11% optimize.opt_a.cse : 0.000078s : 0.23% optimize.opt_a.a_3 : 0.009493s : 28.38% optimize.py_interpret_to_execute_after_opt_a : 0.000024s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000052s : 0.16% optimize.convert_after_rewriter : 0.000011s : 0.03% optimize.order_py_execute_after_rewriter : 0.000009s : 0.03% optimize.mutable_eliminate : 0.000736s : 2.20% optimize.opt_b.b_1 : 0.000200s : 0.60% optimize.opt_b.b_2 : 0.000010s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.06% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000039s : 0.12% optimize.loop_unroll : 0.000467s : 1.40% optimize.opt_after_cconv.c_1 : 0.000039s : 0.12% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000023s : 0.07% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.06% optimize.tuple_transform.d_1 : 0.000053s : 0.16% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000063s : 0.19% optimize.cse_after_recomputation.cse : 0.000015s : 0.05% optimize.environ_conv : 0.000010s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000004s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000018s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000026s : 0.08% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000025s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000510s : 1.53% validate : 0.000048s : 0.14% Time group info: ------[substitution.] 0.000229 38 14.86% : 0.000034s : 3: substitution.cast_eliminate 0.94% : 0.000002s : 3: substitution.elim_not_effective 0.77% : 0.000002s : 3: substitution.fold_const_symbol 3.05% : 0.000007s : 5: substitution.graph_param_transform 64.35% : 0.000148s : 4: substitution.inline 2.33% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.44% : 0.000008s : 6: substitution.remove_not_recompute_node 3.21% : 0.000007s : 4: substitution.replace_old_param 7.04% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.017676 2 95.35% : 0.016854s : 1: type_inference.infer 4.65% : 0.000822s : 1: type_inference.specialize ------[replace.] 0.000065 8 60.36% : 0.000039s : 4: replace.inline 39.64% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000159 8 91.05% : 0.000145s : 4: match.inline 8.95% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000277 1596 0.94% : 0.000003s : 17: predicate.accumulaten_eliminater 0.67% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000002s : 10: predicate.addn_check_dump 0.97% : 0.000003s : 17: predicate.addn_zero_filter 0.83% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.05% : 0.000006s : 27: predicate.arithmetic_simplify 1.12% : 0.000003s : 17: predicate.cast_eliminate 0.68% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.61% : 0.000002s : 10: predicate.depend_value_elim 0.95% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.14% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.91% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.94% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 5: predicate.elim_not_effective 0.34% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.10% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.17% : 0.000003s : 22: predicate.environ_get_depend_swap 1.69% : 0.000005s : 32: predicate.environ_get_eliminate 1.17% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.39% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.51% : 0.000007s : 25: predicate.float_depend_g_call 0.53% : 0.000001s : 10: predicate.float_environ_get_switch 0.88% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.92% : 0.000003s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.55% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 5.84% : 0.000016s : 72: predicate.inline 0.77% : 0.000002s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.81% : 0.000002s : 10: predicate.less_batch_normalization 1.81% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.63% : 0.000007s : 48: predicate.load_eliminater 0.69% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.96% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.67% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.56% : 0.000002s : 10: predicate.merge_addn 2.05% : 0.000006s : 10: predicate.micro_step_allgather_replace 0.62% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 17: predicate.minmaximum_grad 1.06% : 0.000003s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.33% : 0.000001s : 5: predicate.parallel_virtual_node 1.72% : 0.000005s : 25: predicate.partial_defer_inline 1.58% : 0.000004s : 26: predicate.partial_eliminate 1.01% : 0.000003s : 17: predicate.print_const_string_wrapper 0.63% : 0.000002s : 10: predicate.reduce_all_const_elim 1.31% : 0.000004s : 17: predicate.reduce_eliminate 2.43% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 10: predicate.remove_not_recompute_node 1.21% : 0.000003s : 31: predicate.replace_applicator 0.52% : 0.000001s : 10: predicate.replace_old_param 0.23% : 0.000001s : 5: predicate.reset_defer_inline 1.00% : 0.000003s : 17: predicate.reshape_eliminate 0.65% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 5: predicate.row_tensor_eliminate 0.74% : 0.000002s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 10: predicate.shard_identity_eliminate 0.73% : 0.000002s : 10: predicate.special_op_eliminate 0.68% : 0.000002s : 10: predicate.specialize_transform 1.40% : 0.000004s : 10: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.47% : 0.000004s : 25: predicate.switch_defer_inline 2.02% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.56% : 0.000013s : 76: predicate.switch_simplify 0.97% : 0.000003s : 17: predicate.tile_eliminate 0.89% : 0.000002s : 17: predicate.transpose_eliminate 1.65% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.55% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.81% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.48% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.07% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 5: predicate.value_based_eliminate 0.64% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.56% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000657 11 52.17% : 0.000343s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.83% : 0.000314s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.069278 192 0.01% : 0.000006s : 1: ForceFp32Comm 4.55% : 0.003153s : 1: add_attr 4.53% : 0.003137s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.10% : 0.000067s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.10% : 0.000072s : 1: auto_monad 0.05% : 0.000033s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.67% : 0.000467s : 1: bootstrap 0.06% : 0.000042s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.03% : 0.000021s : 1: control_data_broadcast_order 0.02% : 0.000014s : 1: convert_after_rewriter 0.05% : 0.000035s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000022s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.04% : 0.000029s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000012s : 1: graph_reusing 0.01% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.68% : 0.000473s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.07% : 0.000743s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000018s : 1: opt.transform.mutable_eliminate 15.64% : 0.010832s : 78: opt.transform.opt_a 0.05% : 0.000037s : 1: opt.transform.opt_after_cconv 0.05% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000136s : 28: opt.transform.opt_b 0.09% : 0.000060s : 2: opt.transform.opt_trans_graph 0.06% : 0.000044s : 4: opt.transform.symbol_engine_opt 18.91% : 0.013100s : 1: opt_a 0.21% : 0.000145s : 1: opt_after_cconv 0.75% : 0.000521s : 1: opt_after_jit_grad 0.46% : 0.000319s : 1: opt_b 23.39% : 0.016207s : 1: optimize 0.04% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.04% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.09% : 0.000065s : 1: pre_auto_parallel 0.05% : 0.000035s : 1: py_interpret_to_execute 0.04% : 0.000027s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.03% : 0.000023s : 1: remove_dup_value 0.66% : 0.000457s : 1: renormalize.infer 0.49% : 0.000342s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000056s : 1: rewriter_after_opt_a 0.14% : 0.000096s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.16% : 0.000109s : 1: symbol_engine_optimizer 0.15% : 0.000107s : 1: tuple_transform 25.66% : 0.017774s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:18.838.295 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0388322, [21] [bootstrap]: 0.00042443 [type_inference]: 0.0198981 [event_method]: 1.899e-05 [auto_monad]: 6.647e-05 [graph_reusing]: 5.59e-06 [inline]: 2.53003e-06 [add_attr]: 0.00320855, [1] [add_attr_with_inline]: 0.00319883, [1] [Cycle 1]: 5.771e-05, [2] [tag_attr]: 2.045e-05 [meta_addattr_fg_expand]: 5.96e-06 [parallel-infer-symbol]: 3.21001e-06 [pre_auto_parallel]: 3.395e-05 [insert-virtual-dataset]: 2.24001e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 1.80001e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.0144488, [53] [py_interpret_to_execute]: 2.54e-05 [rewriter_before_opt_a]: 8.461e-05 [opt_a]: 0.012112, [2] [Cycle 1]: 0.0112693, [45] [expand_dump_flag]: 3.18998e-06 [switch_simplify]: 4.318e-05 [loop_unroll]: 3.167e-05 [a_1]: 0.00073015 [with_stream_mark]: 1.513e-05 [recompute_prepare]: 9.52001e-06 [updatestate_depend_eliminate]: 4.54002e-06 [updatestate_assign_eliminate]: 3.88001e-06 [updatestate_loads_eliminate]: 3.62002e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 0.00010084 [accelerated_algorithm]: 8.69e-06 [shard]: 1.58002e-06 [meta_shard_fg_expand]: 2.09e-06 [shard_inline]: 7.59002e-06 [merge_send_recv]: 9.66e-06 [auto_parallel]: 7.08e-06 [parallel]: 1.86e-05 [flash_sp]: 7.71001e-06 [merge_comm]: 4.77998e-06 [allreduce_fusion]: 4.03999e-06 [matmul_add_comm_reduction]: 1.052e-05 [allreduce_slice_to_reducescatter]: 7.30011e-07 [virtual_shard_identity]: 8.90001e-06 [virtual_dataset]: 7.72998e-06 [get_grad_eliminate_]: 7.63001e-06 [virtual_output]: 7.94997e-06 [merge_forward]: 4.63999e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 1.113e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.576e-05 [merge_recompute_call_nodes]: 1.42999e-06 [before_grad]: 1.329e-05 [set_forward_comm_id_for_comm_node_pass]: 4.45e-06 [meta_fg_expand]: 3.56001e-06 [flash_sp_send_recv_attached]: 2.59999e-06 [receive_attached]: 1.97001e-06 [after_resolve]: 1.264e-05 [a_after_grad]: 1.186e-05 [renormalize]: 0.00969405 [add_forward_monad_depend]: 1.09e-05 [auto_monad_grad]: 2.89999e-06 [auto_monad_eliminator]: 2.481e-05 [cse]: 4.127e-05 [a_3]: 7.412e-05 [Cycle 2]: 0.00083001, [45] [expand_dump_flag]: 2.32001e-06 [switch_simplify]: 1.061e-05 [loop_unroll]: 7.77e-06 [a_1]: 0.00018967 [with_stream_mark]: 2.012e-05 [recompute_prepare]: 7.74002e-06 [updatestate_depend_eliminate]: 4.63001e-06 [updatestate_assign_eliminate]: 3.5e-06 [updatestate_loads_eliminate]: 3.66001e-06 [parameter_eliminate]: 2.52001e-06 [a_2]: 9.327e-05 [accelerated_algorithm]: 8.25999e-06 [shard]: 2.51998e-06 [meta_shard_fg_expand]: 2.31998e-06 [shard_inline]: 7.38999e-06 [merge_send_recv]: 9.12001e-06 [auto_parallel]: 9.76e-06 [parallel]: 9.32999e-06 [flash_sp]: 4.21001e-06 [merge_comm]: 4.52998e-06 [allreduce_fusion]: 4.59998e-06 [matmul_add_comm_reduction]: 1.111e-05 [allreduce_slice_to_reducescatter]: 1.09e-06 [virtual_shard_identity]: 8.99003e-06 [virtual_dataset]: 7.44002e-06 [get_grad_eliminate_]: 7.21001e-06 [virtual_output]: 7.18e-06 [merge_forward]: 4.47998e-06 [cell_reuse_recompute_pass]: 3.45e-06 [offload_activation]: 1.156e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.599e-05 [merge_recompute_call_nodes]: 1.53002e-06 [before_grad]: 1.443e-05 [set_forward_comm_id_for_comm_node_pass]: 4.53999e-06 [meta_fg_expand]: 3.04999e-06 [flash_sp_send_recv_attached]: 1.75001e-06 [receive_attached]: 2.09e-06 [after_resolve]: 1.346e-05 [a_after_grad]: 1.22e-05 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 1.31998e-06 [auto_monad_grad]: 1.06002e-06 [auto_monad_eliminator]: 8.03001e-06 [cse]: 1.963e-05 [a_3]: 4.511e-05 [py_interpret_to_execute_after_opt_a]: 1.952e-05 [slice_cell_reuse_recomputed_activation]: 1.83002e-06 [rewriter_after_opt_a]: 4.382e-05 [convert_after_rewriter]: 7.49002e-06 [order_py_execute_after_rewriter]: 5.58002e-06 [mutable_eliminate]: 0.00070168 [opt_b]: 0.00024718, [1] [Cycle 1]: 0.00024003, [7] [b_1]: 0.00015486 [b_2]: 9.74999e-06 [updatestate_depend_eliminate]: 6.98998e-06 [updatestate_assign_eliminate]: 3.04001e-06 [updatestate_loads_eliminate]: 3.03e-06 [renormalize]: 8.39995e-07 [cse]: 2.506e-05 [optimize_parallel_all_gather_comm]: 1.859e-05 [overlap_param_gather]: 1.77001e-06 [cconv]: 2.918e-05 [loop_unroll]: 0.00042714 [opt_after_cconv]: 0.00011853, [1] [Cycle 1]: 0.00011282, [7] [c_1]: 3.846e-05 [parameter_eliminate]: 4.05e-06 [updatestate_depend_eliminate]: 6.44999e-06 [updatestate_assign_eliminate]: 3.31999e-06 [updatestate_loads_eliminate]: 2.88e-06 [cse]: 2.287e-05 [renormalize]: 7.00005e-07 [remove_dup_value]: 1.561e-05 [tuple_transform]: 8.667e-05, [1] [Cycle 1]: 8.19e-05, [4] [d_1]: 5.294e-05 [none_parameter_eliminate]: 1.76e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 8.54002e-06 [partial_unused_args_eliminate]: 1.84e-06 [add_recomputation]: 5.838e-05 [cse_after_recomputation]: 2.621e-05, [1] [Cycle 1]: 2.081e-05, [1] [cse]: 1.506e-05 [environ_conv]: 6.56e-06 [swap_dp_allreduce_reducescatter]: 5.94999e-06 [bias_add_comm_swap]: 3.05002e-06 [label_micro_interleaved_index]: 4.15e-06 [label_fine_grained_interleaved_index]: 2.49001e-06 [merge_cast_opt]: 1.57001e-06 [slice_recompute_activation]: 1.94e-06 [micro_interleaved_order_control]: 2.53e-06 [assign_add_opt]: 1.21997e-06 [ForceFp32Comm]: 1.12e-06 [remove_cast_before_assign_add]: 1.02e-06 [full_micro_interleaved_order_control]: 2.17999e-06 [reorder_send_recv_between_fp_bp]: 2.89999e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 9.5999e-07 [interleave_split_concat_branches]: 1.24e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.14e-06 [overlap_opt_shard_grad_in_pipeline]: 1.71e-06 [control_data_broadcast_order]: 1.533e-05 [grouped_pairwise_exchange_alltoall]: 1.84998e-06 [offloading_packed_experts]: 4.13999e-06 [overlap_recompute_and_grad_model_parallel]: 5.51e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.49998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34998e-06 [overlap_recompute_comm]: 2.14e-06 [overlap_grad_ring_attention]: 4.42998e-06 [overlap_grad_flash_sp]: 2.323e-05 [begin_end_overlap_inline]: 8.30012e-07 [split_matmul_comm_elemetwise]: 2.02999e-06 [split_layernorm_comm]: 1.58002e-06 [handle_group_info]: 9.20001e-07 [symbol_engine_optimizer]: 8.581e-05, [1] [Cycle 1]: 8.144e-05, [6] [build]: 3.79002e-06 [elim_shapecalc]: 1.182e-05 [elim_not_effective]: 1.556e-05 [opt_reshape]: 8.46002e-06 [fold_const_symbol]: 1.244e-05 [renormalize]: 1.8999e-07 [detach_backward]: 2.21e-06 [pipeline_parallel_scheduler]: 1.66e-06 [auto_monad_reorder]: 1.989e-05 [get_jit_bprop_graph]: 1.91998e-06 [rewriter_after_jit_bprop_graph]: 4.02e-06 [opt_after_jit_grad]: 0.00048885 [validate]: 4.416e-05 Sums bootstrap : 0.000424s : 1.23% type_inference : 0.019898s : 57.47% event_method : 0.000019s : 0.05% auto_monad : 0.000066s : 0.19% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000034s : 0.10% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000025s : 0.07% optimize.rewriter_before_opt_a : 0.000085s : 0.24% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000054s : 0.16% optimize.opt_a.loop_unroll : 0.000039s : 0.11% optimize.opt_a.a_1 : 0.000920s : 2.66% optimize.opt_a.with_stream_mark : 0.000035s : 0.10% optimize.opt_a.recompute_prepare : 0.000017s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000194s : 0.56% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.05% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000015s : 0.04% optimize.opt_a.merge_send_recv : 0.000019s : 0.05% optimize.opt_a.auto_parallel : 0.000017s : 0.05% optimize.opt_a.parallel : 0.000028s : 0.08% optimize.opt_a.flash_sp : 0.000012s : 0.03% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.05% optimize.opt_a.virtual_dataset : 0.000015s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000015s : 0.04% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000028s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000026s : 0.08% optimize.opt_a.a_after_grad : 0.000024s : 0.07% optimize.opt_a.renormalize : 0.009694s : 28.00% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.09% optimize.opt_a.cse : 0.000061s : 0.18% optimize.opt_a.a_3 : 0.000119s : 0.34% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000044s : 0.13% optimize.convert_after_rewriter : 0.000007s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000702s : 2.03% optimize.opt_b.b_1 : 0.000155s : 0.45% optimize.opt_b.b_2 : 0.000010s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000025s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.05% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000029s : 0.08% optimize.loop_unroll : 0.000427s : 1.23% optimize.opt_after_cconv.c_1 : 0.000038s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000023s : 0.07% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.05% optimize.tuple_transform.d_1 : 0.000053s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000058s : 0.17% optimize.cse_after_recomputation.cse : 0.000015s : 0.04% optimize.environ_conv : 0.000007s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000023s : 0.07% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.06% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000489s : 1.41% validate : 0.000044s : 0.13% Time group info: ------[substitution.] 0.000211 38 12.51% : 0.000026s : 3: substitution.cast_eliminate 1.09% : 0.000002s : 3: substitution.elim_not_effective 0.79% : 0.000002s : 3: substitution.fold_const_symbol 3.07% : 0.000006s : 5: substitution.graph_param_transform 65.88% : 0.000139s : 4: substitution.inline 3.08% : 0.000007s : 6: substitution.j_node_and_user_rematch 3.39% : 0.000007s : 6: substitution.remove_not_recompute_node 2.90% : 0.000006s : 4: substitution.replace_old_param 7.28% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.019833 2 96.41% : 0.019121s : 1: type_inference.infer 3.59% : 0.000712s : 1: type_inference.specialize ------[replace.] 0.000062 8 59.67% : 0.000037s : 4: replace.inline 40.33% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000150 8 91.07% : 0.000136s : 4: match.inline 8.93% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000260 1596 1.06% : 0.000003s : 17: predicate.accumulaten_eliminater 0.73% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.60% : 0.000002s : 10: predicate.addn_check_dump 1.01% : 0.000003s : 17: predicate.addn_zero_filter 0.88% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.01% : 0.000005s : 27: predicate.arithmetic_simplify 1.11% : 0.000003s : 17: predicate.cast_eliminate 0.58% : 0.000002s : 10: predicate.check_bprop_eliminate 0.58% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.58% : 0.000002s : 10: predicate.depend_value_elim 0.98% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.07% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.95% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.83% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 5: predicate.elim_not_effective 0.34% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_depend_swap 1.74% : 0.000005s : 32: predicate.environ_get_eliminate 1.15% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.47% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.29% : 0.000006s : 25: predicate.float_depend_g_call 0.52% : 0.000001s : 10: predicate.float_environ_get_switch 0.79% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.85% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.61% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.24% : 0.000016s : 72: predicate.inline 0.75% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.99% : 0.000003s : 10: predicate.less_batch_normalization 1.87% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.60% : 0.000007s : 48: predicate.load_eliminater 0.79% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.98% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.76% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.59% : 0.000002s : 10: predicate.merge_addn 0.57% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.93% : 0.000002s : 17: predicate.minmaximum_grad 0.89% : 0.000002s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.35% : 0.000001s : 5: predicate.parallel_virtual_node 1.69% : 0.000004s : 25: predicate.partial_defer_inline 1.67% : 0.000004s : 26: predicate.partial_eliminate 1.03% : 0.000003s : 17: predicate.print_const_string_wrapper 0.74% : 0.000002s : 10: predicate.reduce_all_const_elim 1.26% : 0.000003s : 17: predicate.reduce_eliminate 2.68% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 10: predicate.remove_not_recompute_node 1.48% : 0.000004s : 31: predicate.replace_applicator 0.53% : 0.000001s : 10: predicate.replace_old_param 0.32% : 0.000001s : 5: predicate.reset_defer_inline 1.11% : 0.000003s : 17: predicate.reshape_eliminate 0.61% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.81% : 0.000002s : 10: predicate.same_eliminate 0.41% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.77% : 0.000002s : 10: predicate.shard_identity_eliminate 0.65% : 0.000002s : 10: predicate.special_op_eliminate 0.67% : 0.000002s : 10: predicate.specialize_transform 1.11% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.55% : 0.000004s : 25: predicate.switch_defer_inline 1.97% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.51% : 0.000012s : 76: predicate.switch_simplify 0.96% : 0.000003s : 17: predicate.tile_eliminate 1.00% : 0.000003s : 17: predicate.transpose_eliminate 1.52% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.17% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.58% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.72% : 0.000004s : 31: predicate.tuple_to_list_eliminator_ 2.43% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.24% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.75% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000617 11 49.40% : 0.000305s : 5: func_graph_cloner_run.FuncGraphClonerGraph 50.60% : 0.000312s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.067820 192 0.01% : 0.000004s : 1: ForceFp32Comm 4.74% : 0.003214s : 1: add_attr 4.72% : 0.003203s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000063s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.11% : 0.000073s : 1: auto_monad 0.04% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.67% : 0.000452s : 1: bootstrap 0.05% : 0.000033s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000019s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.04% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.04% : 0.000026s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.64% : 0.000436s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.05% : 0.000711s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000018s : 1: opt.transform.mutable_eliminate 2.13% : 0.001447s : 78: opt.transform.opt_a 0.05% : 0.000036s : 1: opt.transform.opt_after_cconv 0.04% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.19% : 0.000132s : 28: opt.transform.opt_b 0.09% : 0.000059s : 2: opt.transform.opt_trans_graph 0.07% : 0.000045s : 4: opt.transform.symbol_engine_opt 17.86% : 0.012115s : 1: opt_a 0.18% : 0.000122s : 1: opt_after_cconv 0.73% : 0.000497s : 1: opt_after_jit_grad 0.37% : 0.000251s : 1: opt_b 21.31% : 0.014454s : 1: optimize 0.03% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.04% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.06% : 0.000038s : 1: pre_auto_parallel 0.04% : 0.000030s : 1: py_interpret_to_execute 0.03% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000019s : 1: remove_dup_value 13.53% : 0.009178s : 1: renormalize.infer 0.73% : 0.000498s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000048s : 1: rewriter_after_opt_a 0.13% : 0.000088s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.000088s : 1: symbol_engine_optimizer 0.13% : 0.000089s : 1: tuple_transform 29.37% : 0.019916s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:19.311.786 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:19.312.044 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0333853, [21] [bootstrap]: 0.00050536 [type_inference]: 0.00592001 [event_method]: 1.975e-05 [auto_monad]: 6.653e-05 [graph_reusing]: 6.25002e-06 [inline]: 2.67001e-06 [add_attr]: 0.0195509, [1] [add_attr_with_inline]: 0.0195396, [1] [Cycle 1]: 8.621e-05, [2] [tag_attr]: 2.231e-05 [meta_addattr_fg_expand]: 6.12001e-06 [parallel-infer-symbol]: 3.71999e-06 [pre_auto_parallel]: 3.951e-05 [insert-virtual-dataset]: 2.28002e-06 [parallel-infer-symbol-second]: 7.00005e-07 [dataset_repeat_opt]: 1.77001e-06 [pipeline_split]: 1.80001e-06 [optimize]: 0.00608552, [53] [py_interpret_to_execute]: 2.971e-05 [rewriter_before_opt_a]: 9.442e-05 [opt_a]: 0.00361632, [2] [Cycle 1]: 0.00266404, [45] [expand_dump_flag]: 2.84999e-06 [switch_simplify]: 4.382e-05 [loop_unroll]: 3.116e-05 [a_1]: 0.00081952 [with_stream_mark]: 1.97e-05 [recompute_prepare]: 1.086e-05 [updatestate_depend_eliminate]: 4.87998e-06 [updatestate_assign_eliminate]: 4.06001e-06 [updatestate_loads_eliminate]: 4.18001e-06 [parameter_eliminate]: 1.82001e-06 [a_2]: 0.00012957 [accelerated_algorithm]: 8.65001e-06 [shard]: 2.69001e-06 [meta_shard_fg_expand]: 2.27999e-06 [shard_inline]: 7.92e-06 [merge_send_recv]: 9.56003e-06 [auto_parallel]: 7.36001e-06 [parallel]: 1.958e-05 [flash_sp]: 9.76998e-06 [merge_comm]: 4.55001e-06 [allreduce_fusion]: 4.12e-06 [matmul_add_comm_reduction]: 1.017e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 1.024e-05 [virtual_dataset]: 7.96001e-06 [get_grad_eliminate_]: 7.58001e-06 [virtual_output]: 7.65e-06 [merge_forward]: 4.29997e-06 [cell_reuse_recompute_pass]: 1.50999e-06 [offload_activation]: 1.158e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.841e-05 [merge_recompute_call_nodes]: 1.71e-06 [before_grad]: 1.314e-05 [set_forward_comm_id_for_comm_node_pass]: 5.12e-06 [meta_fg_expand]: 3.72002e-06 [flash_sp_send_recv_attached]: 2.51e-06 [receive_attached]: 2.26e-06 [after_resolve]: 1.292e-05 [a_after_grad]: 1.306e-05 [renormalize]: 0.00085472 [add_forward_monad_depend]: 6.49999e-06 [auto_monad_grad]: 2.65002e-06 [auto_monad_eliminator]: 1.797e-05 [cse]: 3.564e-05 [a_3]: 7.239e-05 [Cycle 2]: 0.00093808, [45] [expand_dump_flag]: 1.22e-06 [switch_simplify]: 9.36e-06 [loop_unroll]: 7.56999e-06 [a_1]: 0.00017563 [with_stream_mark]: 1.375e-05 [recompute_prepare]: 8.35001e-06 [updatestate_depend_eliminate]: 3.92002e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 2.82002e-06 [parameter_eliminate]: 1.34e-06 [a_2]: 0.00011793 [accelerated_algorithm]: 7.46001e-06 [shard]: 1.72999e-06 [meta_shard_fg_expand]: 1.72001e-06 [shard_inline]: 1.048e-05 [merge_send_recv]: 6.73e-06 [auto_parallel]: 7.12002e-06 [parallel]: 6.31e-06 [flash_sp]: 3.50998e-06 [merge_comm]: 4.27e-06 [allreduce_fusion]: 4.55001e-06 [matmul_add_comm_reduction]: 7.68001e-06 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 7.93999e-06 [virtual_dataset]: 7.31001e-06 [get_grad_eliminate_]: 6.89001e-06 [virtual_output]: 7.11001e-06 [merge_forward]: 3.99002e-06 [cell_reuse_recompute_pass]: 1.71e-06 [offload_activation]: 8.73001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.703e-05 [merge_recompute_call_nodes]: 8.69972e-07 [before_grad]: 1.238e-05 [set_forward_comm_id_for_comm_node_pass]: 4.54002e-06 [meta_fg_expand]: 3.03e-06 [flash_sp_send_recv_attached]: 9.89996e-07 [receive_attached]: 1.72001e-06 [after_resolve]: 1.185e-05 [a_after_grad]: 1.127e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.29e-06 [auto_monad_grad]: 1.05001e-06 [auto_monad_eliminator]: 8.82e-06 [cse]: 1.881e-05 [a_3]: 5.893e-05 [py_interpret_to_execute_after_opt_a]: 1.628e-05 [slice_cell_reuse_recomputed_activation]: 4.97e-06 [rewriter_after_opt_a]: 4.536e-05 [convert_after_rewriter]: 1.093e-05 [order_py_execute_after_rewriter]: 8.67e-06 [mutable_eliminate]: 0.00057749 [opt_b]: 0.00030256, [1] [Cycle 1]: 0.00029305, [7] [b_1]: 0.00019529 [b_2]: 9.91e-06 [updatestate_depend_eliminate]: 5.91998e-06 [updatestate_assign_eliminate]: 2.98e-06 [updatestate_loads_eliminate]: 2.86e-06 [renormalize]: 2.00002e-07 [cse]: 2.086e-05 [optimize_parallel_all_gather_comm]: 2.008e-05 [overlap_param_gather]: 5.39998e-06 [cconv]: 4.946e-05 [loop_unroll]: 0.00045433 [opt_after_cconv]: 0.00013695, [1] [Cycle 1]: 0.00012847, [7] [c_1]: 3.65e-05 [parameter_eliminate]: 2.97002e-06 [updatestate_depend_eliminate]: 5.93002e-06 [updatestate_assign_eliminate]: 3.27002e-06 [updatestate_loads_eliminate]: 2.93e-06 [cse]: 2.123e-05 [renormalize]: 2.89991e-07 [remove_dup_value]: 1.673e-05 [tuple_transform]: 0.00010151, [1] [Cycle 1]: 9.428e-05, [4] [d_1]: 5.349e-05 [none_parameter_eliminate]: 1.69e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 8.43999e-06 [partial_unused_args_eliminate]: 4.28999e-06 [add_recomputation]: 5.773e-05 [cse_after_recomputation]: 3.133e-05, [1] [Cycle 1]: 2.445e-05, [1] [cse]: 1.475e-05 [environ_conv]: 9.62001e-06 [swap_dp_allreduce_reducescatter]: 8.62998e-06 [bias_add_comm_swap]: 5.22999e-06 [label_micro_interleaved_index]: 6.57002e-06 [label_fine_grained_interleaved_index]: 5.46e-06 [merge_cast_opt]: 3.73001e-06 [slice_recompute_activation]: 4.3e-06 [micro_interleaved_order_control]: 4.89998e-06 [assign_add_opt]: 3.7e-06 [ForceFp32Comm]: 3.2e-06 [remove_cast_before_assign_add]: 3.37002e-06 [full_micro_interleaved_order_control]: 4.65999e-06 [reorder_send_recv_between_fp_bp]: 5.25999e-06 [comm_op_add_attrs]: 3.36999e-06 [add_comm_op_reuse_tag]: 3.28e-06 [interleave_split_concat_branches]: 3.68e-06 [interleave_parallel_branches]: 3.38999e-06 [overlap_opt_shard_in_pipeline]: 3.55e-06 [overlap_opt_shard_grad_in_pipeline]: 3.97998e-06 [control_data_broadcast_order]: 1.77e-05 [grouped_pairwise_exchange_alltoall]: 3.75998e-06 [offloading_packed_experts]: 6.91999e-06 [overlap_recompute_and_grad_model_parallel]: 7.58001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.65003e-06 [overlap_recompute_allgather_and_fa_grad]: 4.02002e-06 [overlap_recompute_comm]: 4.75999e-06 [overlap_grad_ring_attention]: 7.48e-06 [overlap_grad_flash_sp]: 2.477e-05 [begin_end_overlap_inline]: 2.89999e-06 [split_matmul_comm_elemetwise]: 4.52998e-06 [split_layernorm_comm]: 4.12e-06 [handle_group_info]: 3.6e-06 [symbol_engine_optimizer]: 0.00010373, [1] [Cycle 1]: 9.729e-05, [6] [build]: 3.21999e-06 [elim_shapecalc]: 1.111e-05 [elim_not_effective]: 1.56e-05 [opt_reshape]: 8.84e-06 [fold_const_symbol]: 1.247e-05 [renormalize]: 2.00002e-07 [detach_backward]: 3.66001e-06 [pipeline_parallel_scheduler]: 1.67999e-06 [auto_monad_reorder]: 2.317e-05 [get_jit_bprop_graph]: 1.59e-06 [rewriter_after_jit_bprop_graph]: 5.41998e-06 [opt_after_jit_grad]: 0.00050341 [validate]: 4.159e-05 Sums bootstrap : 0.000505s : 4.18% type_inference : 0.005920s : 49.01% event_method : 0.000020s : 0.16% auto_monad : 0.000067s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000040s : 0.33% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.25% optimize.rewriter_before_opt_a : 0.000094s : 0.78% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000053s : 0.44% optimize.opt_a.loop_unroll : 0.000039s : 0.32% optimize.opt_a.a_1 : 0.000995s : 8.24% optimize.opt_a.with_stream_mark : 0.000033s : 0.28% optimize.opt_a.recompute_prepare : 0.000019s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000247s : 2.05% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.13% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000018s : 0.15% optimize.opt_a.merge_send_recv : 0.000016s : 0.13% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000026s : 0.21% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.07% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.15% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.12% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.21% optimize.opt_a.a_after_grad : 0.000024s : 0.20% optimize.opt_a.renormalize : 0.000855s : 7.08% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.22% optimize.opt_a.cse : 0.000054s : 0.45% optimize.opt_a.a_3 : 0.000131s : 1.09% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000045s : 0.38% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000577s : 4.78% optimize.opt_b.b_1 : 0.000195s : 1.62% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000021s : 0.17% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000049s : 0.41% optimize.loop_unroll : 0.000454s : 3.76% optimize.opt_after_cconv.c_1 : 0.000036s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000021s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.14% optimize.tuple_transform.d_1 : 0.000053s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000058s : 0.48% optimize.cse_after_recomputation.cse : 0.000015s : 0.12% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000018s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000025s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000023s : 0.19% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000503s : 4.17% validate : 0.000042s : 0.34% Time group info: ------[substitution.] 0.000217 38 10.46% : 0.000023s : 3: substitution.cast_eliminate 1.03% : 0.000002s : 3: substitution.elim_not_effective 0.86% : 0.000002s : 3: substitution.fold_const_symbol 3.14% : 0.000007s : 5: substitution.graph_param_transform 69.74% : 0.000151s : 4: substitution.inline 2.31% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.98% : 0.000006s : 6: substitution.remove_not_recompute_node 2.27% : 0.000005s : 4: substitution.replace_old_param 7.21% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005865 2 87.82% : 0.005151s : 1: type_inference.infer 12.18% : 0.000714s : 1: type_inference.specialize ------[replace.] 0.000064 8 59.93% : 0.000039s : 4: replace.inline 40.07% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000162 8 91.68% : 0.000148s : 4: match.inline 8.32% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000252 1596 0.94% : 0.000002s : 17: predicate.accumulaten_eliminater 0.66% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 10: predicate.addn_check_dump 0.99% : 0.000003s : 17: predicate.addn_zero_filter 0.86% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.10% : 0.000005s : 27: predicate.arithmetic_simplify 1.05% : 0.000003s : 17: predicate.cast_eliminate 0.61% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000001s : 5: predicate.const_output_eliminate 0.64% : 0.000002s : 10: predicate.depend_value_elim 0.96% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.14% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.86% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 5: predicate.elim_not_effective 0.40% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.15% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 22: predicate.environ_get_depend_swap 1.81% : 0.000005s : 32: predicate.environ_get_eliminate 1.17% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.39% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.38% : 0.000006s : 25: predicate.float_depend_g_call 0.53% : 0.000001s : 10: predicate.float_environ_get_switch 0.82% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.61% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.62% : 0.000002s : 10: predicate.incorporate_call 0.51% : 0.000001s : 10: predicate.incorporate_call_switch 6.59% : 0.000017s : 72: predicate.inline 0.83% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.72% : 0.000002s : 10: predicate.less_batch_normalization 1.96% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.63% : 0.000007s : 48: predicate.load_eliminater 0.79% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.00% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.70% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 10: predicate.merge_addn 0.54% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 17: predicate.minmaximum_grad 0.85% : 0.000002s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.35% : 0.000001s : 5: predicate.parallel_virtual_node 1.66% : 0.000004s : 25: predicate.partial_defer_inline 1.71% : 0.000004s : 26: predicate.partial_eliminate 1.07% : 0.000003s : 17: predicate.print_const_string_wrapper 0.56% : 0.000001s : 10: predicate.reduce_all_const_elim 1.24% : 0.000003s : 17: predicate.reduce_eliminate 2.62% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 10: predicate.remove_not_recompute_node 1.36% : 0.000003s : 31: predicate.replace_applicator 0.50% : 0.000001s : 10: predicate.replace_old_param 0.25% : 0.000001s : 5: predicate.reset_defer_inline 0.97% : 0.000002s : 17: predicate.reshape_eliminate 0.59% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 5: predicate.row_tensor_eliminate 0.85% : 0.000002s : 10: predicate.same_eliminate 0.44% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 10: predicate.shard_identity_eliminate 0.71% : 0.000002s : 10: predicate.special_op_eliminate 0.80% : 0.000002s : 10: predicate.specialize_transform 0.90% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.53% : 0.000004s : 25: predicate.switch_defer_inline 2.07% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.84% : 0.000012s : 76: predicate.switch_simplify 0.91% : 0.000002s : 17: predicate.tile_eliminate 0.96% : 0.000002s : 17: predicate.transpose_eliminate 1.53% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.60% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.38% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.81% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.50% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.23% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 5: predicate.value_based_eliminate 0.61% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.62% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000535 11 52.24% : 0.000279s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.76% : 0.000255s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.061589 192 0.01% : 0.000006s : 1: ForceFp32Comm 31.76% : 0.019562s : 1: add_attr 31.73% : 0.019544s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.10% : 0.000061s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.12% : 0.000077s : 1: auto_monad 0.05% : 0.000030s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.89% : 0.000547s : 1: bootstrap 0.09% : 0.000053s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.03% : 0.000021s : 1: control_data_broadcast_order 0.02% : 0.000014s : 1: convert_after_rewriter 0.06% : 0.000035s : 1: cse_after_recomputation 0.01% : 0.000007s : 1: dataset_repeat_opt 0.03% : 0.000018s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.05% : 0.000030s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000009s : 1: label_micro_interleaved_index 0.75% : 0.000460s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.95% : 0.000584s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.03% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000017s : 1: opt.transform.mutable_eliminate 2.47% : 0.001518s : 78: opt.transform.opt_a 0.06% : 0.000035s : 1: opt.transform.opt_after_cconv 0.05% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.22% : 0.000133s : 28: opt.transform.opt_b 0.10% : 0.000060s : 2: opt.transform.opt_trans_graph 0.07% : 0.000044s : 4: opt.transform.symbol_engine_opt 5.88% : 0.003620s : 1: opt_a 0.23% : 0.000140s : 1: opt_after_cconv 0.84% : 0.000514s : 1: opt_after_jit_grad 0.50% : 0.000306s : 1: opt_b 10.45% : 0.006438s : 1: optimize 0.04% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.05% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.08% : 0.000047s : 1: pre_auto_parallel 0.05% : 0.000033s : 1: py_interpret_to_execute 0.03% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000020s : 1: remove_dup_value 0.78% : 0.000478s : 1: renormalize.infer 0.60% : 0.000367s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000049s : 1: rewriter_after_opt_a 0.16% : 0.000098s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.17% : 0.000107s : 1: symbol_engine_optimizer 0.17% : 0.000104s : 1: tuple_transform 9.67% : 0.005958s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:19.811.482 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0150675, [21] [bootstrap]: 0.00042147 [type_inference]: 0.0055939 [event_method]: 1.804e-05 [auto_monad]: 6.304e-05 [graph_reusing]: 6.23998e-06 [inline]: 2.85002e-06 [add_attr]: 0.00310159, [1] [add_attr_with_inline]: 0.00309297, [1] [Cycle 1]: 5.631e-05, [2] [tag_attr]: 1.888e-05 [meta_addattr_fg_expand]: 6.13002e-06 [parallel-infer-symbol]: 3.13e-06 [pre_auto_parallel]: 3.332e-05 [insert-virtual-dataset]: 2.33002e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 2.06e-06 [pipeline_split]: 1.58002e-06 [optimize]: 0.00513334, [53] [py_interpret_to_execute]: 2.589e-05 [rewriter_before_opt_a]: 8.427e-05 [opt_a]: 0.00303555, [2] [Cycle 1]: 0.00227241, [45] [expand_dump_flag]: 3.08998e-06 [switch_simplify]: 4.366e-05 [loop_unroll]: 3.139e-05 [a_1]: 0.00073464 [with_stream_mark]: 1.603e-05 [recompute_prepare]: 1.002e-05 [updatestate_depend_eliminate]: 4.43001e-06 [updatestate_assign_eliminate]: 4.54998e-06 [updatestate_loads_eliminate]: 3.99002e-06 [parameter_eliminate]: 1.87999e-06 [a_2]: 0.00010051 [accelerated_algorithm]: 8.54e-06 [shard]: 1.69e-06 [meta_shard_fg_expand]: 1.89e-06 [shard_inline]: 7.65e-06 [merge_send_recv]: 9.87001e-06 [auto_parallel]: 7.13998e-06 [parallel]: 1.838e-05 [flash_sp]: 8.98002e-06 [merge_comm]: 4.94e-06 [allreduce_fusion]: 4.18999e-06 [matmul_add_comm_reduction]: 1.01e-05 [allreduce_slice_to_reducescatter]: 9.00007e-07 [virtual_shard_identity]: 8.94e-06 [virtual_dataset]: 7.77e-06 [get_grad_eliminate_]: 7.35e-06 [virtual_output]: 8.33999e-06 [merge_forward]: 4.58999e-06 [cell_reuse_recompute_pass]: 1.28002e-06 [offload_activation]: 1.18e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.47e-05 [merge_recompute_call_nodes]: 1.60001e-06 [before_grad]: 1.204e-05 [set_forward_comm_id_for_comm_node_pass]: 4.50001e-06 [meta_fg_expand]: 3.24001e-06 [flash_sp_send_recv_attached]: 2.78e-06 [receive_attached]: 2.94999e-06 [after_resolve]: 1.32e-05 [a_after_grad]: 1.173e-05 [renormalize]: 0.00075591 [add_forward_monad_depend]: 6.43998e-06 [auto_monad_grad]: 2.09e-06 [auto_monad_eliminator]: 1.737e-05 [cse]: 3.594e-05 [a_3]: 5.718e-05 [Cycle 2]: 0.0007536, [45] [expand_dump_flag]: 1.02e-06 [switch_simplify]: 8.96998e-06 [loop_unroll]: 7.53999e-06 [a_1]: 0.0001752 [with_stream_mark]: 1.218e-05 [recompute_prepare]: 7.53999e-06 [updatestate_depend_eliminate]: 3.83001e-06 [updatestate_assign_eliminate]: 2.92002e-06 [updatestate_loads_eliminate]: 2.85998e-06 [parameter_eliminate]: 1.14e-06 [a_2]: 9.074e-05 [accelerated_algorithm]: 7.43e-06 [shard]: 1.22e-06 [meta_shard_fg_expand]: 1.52001e-06 [shard_inline]: 7.47002e-06 [merge_send_recv]: 6.46e-06 [auto_parallel]: 6.66e-06 [parallel]: 4.72e-06 [flash_sp]: 3.49001e-06 [merge_comm]: 4.25e-06 [allreduce_fusion]: 3.48e-06 [matmul_add_comm_reduction]: 7.31999e-06 [allreduce_slice_to_reducescatter]: 3.60014e-07 [virtual_shard_identity]: 8.25999e-06 [virtual_dataset]: 7.08998e-06 [get_grad_eliminate_]: 7.12997e-06 [virtual_output]: 7.31999e-06 [merge_forward]: 3.16999e-06 [cell_reuse_recompute_pass]: 1.59e-06 [offload_activation]: 7.73999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.441e-05 [merge_recompute_call_nodes]: 8.89995e-07 [before_grad]: 1.106e-05 [set_forward_comm_id_for_comm_node_pass]: 3.98999e-06 [meta_fg_expand]: 2.83998e-06 [flash_sp_send_recv_attached]: 1.08001e-06 [receive_attached]: 1.24998e-06 [after_resolve]: 1.088e-05 [a_after_grad]: 1.197e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.24e-06 [auto_monad_grad]: 1.02e-06 [auto_monad_eliminator]: 8.70999e-06 [cse]: 1.763e-05 [a_3]: 4.505e-05 [py_interpret_to_execute_after_opt_a]: 9.76e-06 [slice_cell_reuse_recomputed_activation]: 1.97001e-06 [rewriter_after_opt_a]: 3.847e-05 [convert_after_rewriter]: 7.63001e-06 [order_py_execute_after_rewriter]: 5.87999e-06 [mutable_eliminate]: 0.00052248 [opt_b]: 0.00023926, [1] [Cycle 1]: 0.00023335, [7] [b_1]: 0.00015258 [b_2]: 9.47999e-06 [updatestate_depend_eliminate]: 5.89999e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 2.93e-06 [renormalize]: 4.69998e-07 [cse]: 2.307e-05 [optimize_parallel_all_gather_comm]: 1.795e-05 [overlap_param_gather]: 2.19999e-06 [cconv]: 2.382e-05 [loop_unroll]: 0.00042674 [opt_after_cconv]: 0.00011487, [1] [Cycle 1]: 0.0001092, [7] [c_1]: 3.694e-05 [parameter_eliminate]: 2.51e-06 [updatestate_depend_eliminate]: 5.57001e-06 [updatestate_assign_eliminate]: 3.23e-06 [updatestate_loads_eliminate]: 2.95998e-06 [cse]: 2.288e-05 [renormalize]: 7.00005e-07 [remove_dup_value]: 1.466e-05 [tuple_transform]: 8.468e-05, [1] [Cycle 1]: 7.997e-05, [4] [d_1]: 5.209e-05 [none_parameter_eliminate]: 1.79998e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 7.85e-06 [partial_unused_args_eliminate]: 2.26e-06 [add_recomputation]: 5.35e-05 [cse_after_recomputation]: 2.401e-05, [1] [Cycle 1]: 1.954e-05, [1] [cse]: 1.421e-05 [environ_conv]: 6.53998e-06 [swap_dp_allreduce_reducescatter]: 5.73002e-06 [bias_add_comm_swap]: 2.46e-06 [label_micro_interleaved_index]: 4.63999e-06 [label_fine_grained_interleaved_index]: 3.13e-06 [merge_cast_opt]: 1.52999e-06 [slice_recompute_activation]: 1.96e-06 [micro_interleaved_order_control]: 2.21e-06 [assign_add_opt]: 1.17e-06 [ForceFp32Comm]: 1.09e-06 [remove_cast_before_assign_add]: 9.79984e-07 [full_micro_interleaved_order_control]: 2.07999e-06 [reorder_send_recv_between_fp_bp]: 2.64001e-06 [comm_op_add_attrs]: 1.01002e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.05999e-06 [interleave_parallel_branches]: 1.01002e-06 [overlap_opt_shard_in_pipeline]: 1.15001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.21e-06 [control_data_broadcast_order]: 1.422e-05 [grouped_pairwise_exchange_alltoall]: 1.67001e-06 [offloading_packed_experts]: 4.21001e-06 [overlap_recompute_and_grad_model_parallel]: 5.07e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.53e-06 [overlap_grad_ring_attention]: 4.37e-06 [overlap_grad_flash_sp]: 2.011e-05 [begin_end_overlap_inline]: 4.80009e-07 [split_matmul_comm_elemetwise]: 2.09e-06 [split_layernorm_comm]: 1.56998e-06 [handle_group_info]: 1.05001e-06 [symbol_engine_optimizer]: 8.184e-05, [1] [Cycle 1]: 7.782e-05, [6] [build]: 3.12002e-06 [elim_shapecalc]: 1.202e-05 [elim_not_effective]: 1.472e-05 [opt_reshape]: 8.27998e-06 [fold_const_symbol]: 1.182e-05 [renormalize]: 1.69995e-07 [detach_backward]: 1.93997e-06 [pipeline_parallel_scheduler]: 1.53002e-06 [auto_monad_reorder]: 1.933e-05 [get_jit_bprop_graph]: 1.43002e-06 [rewriter_after_jit_bprop_graph]: 3.97e-06 [opt_after_jit_grad]: 0.00047025 [validate]: 4.127e-05 Sums bootstrap : 0.000421s : 3.82% type_inference : 0.005594s : 50.73% event_method : 0.000018s : 0.16% auto_monad : 0.000063s : 0.57% graph_reusing : 0.000006s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000026s : 0.23% optimize.rewriter_before_opt_a : 0.000084s : 0.76% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.48% optimize.opt_a.loop_unroll : 0.000039s : 0.35% optimize.opt_a.a_1 : 0.000910s : 8.25% optimize.opt_a.with_stream_mark : 0.000028s : 0.26% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000191s : 1.73% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.14% optimize.opt_a.merge_send_recv : 0.000016s : 0.15% optimize.opt_a.auto_parallel : 0.000014s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.16% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000016s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000023s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000024s : 0.22% optimize.opt_a.a_after_grad : 0.000024s : 0.21% optimize.opt_a.renormalize : 0.000756s : 6.86% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.24% optimize.opt_a.cse : 0.000054s : 0.49% optimize.opt_a.a_3 : 0.000102s : 0.93% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000038s : 0.35% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000522s : 4.74% optimize.opt_b.b_1 : 0.000153s : 1.38% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.22% optimize.loop_unroll : 0.000427s : 3.87% optimize.opt_after_cconv.c_1 : 0.000037s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.21% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000015s : 0.13% optimize.tuple_transform.d_1 : 0.000052s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000053s : 0.49% optimize.cse_after_recomputation.cse : 0.000014s : 0.13% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000020s : 0.18% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000470s : 4.26% validate : 0.000041s : 0.37% Time group info: ------[substitution.] 0.000202 38 10.55% : 0.000021s : 3: substitution.cast_eliminate 1.21% : 0.000002s : 3: substitution.elim_not_effective 0.82% : 0.000002s : 3: substitution.fold_const_symbol 3.23% : 0.000007s : 5: substitution.graph_param_transform 69.70% : 0.000141s : 4: substitution.inline 1.86% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.98% : 0.000006s : 6: substitution.remove_not_recompute_node 2.28% : 0.000005s : 4: substitution.replace_old_param 7.38% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005537 2 87.65% : 0.004853s : 1: type_inference.infer 12.35% : 0.000684s : 1: type_inference.specialize ------[replace.] 0.000064 8 61.55% : 0.000039s : 4: replace.inline 38.45% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000152 8 91.41% : 0.000138s : 4: match.inline 8.59% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000253 1596 0.96% : 0.000002s : 17: predicate.accumulaten_eliminater 0.66% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 10: predicate.addn_check_dump 1.00% : 0.000003s : 17: predicate.addn_zero_filter 0.86% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.07% : 0.000005s : 27: predicate.arithmetic_simplify 1.18% : 0.000003s : 17: predicate.cast_eliminate 0.54% : 0.000001s : 10: predicate.check_bprop_eliminate 0.60% : 0.000002s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.60% : 0.000002s : 10: predicate.depend_value_elim 0.99% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.08% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 5: predicate.elim_not_effective 0.54% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.18% : 0.000003s : 22: predicate.environ_get_depend_swap 1.72% : 0.000004s : 32: predicate.environ_get_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.50% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.34% : 0.000006s : 25: predicate.float_depend_g_call 0.53% : 0.000001s : 10: predicate.float_environ_get_switch 0.90% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.62% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000000s : 5: predicate.graph_param_transform 0.61% : 0.000002s : 10: predicate.incorporate_call 0.51% : 0.000001s : 10: predicate.incorporate_call_switch 6.18% : 0.000016s : 72: predicate.inline 0.77% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 10: predicate.less_batch_normalization 1.88% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.60% : 0.000007s : 48: predicate.load_eliminater 0.83% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.08% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.70% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 10: predicate.merge_addn 0.60% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.62% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.95% : 0.000002s : 17: predicate.minmaximum_grad 0.87% : 0.000002s : 5: predicate.mutable_eliminate 0.32% : 0.000001s : 5: predicate.opt_reshape 0.36% : 0.000001s : 5: predicate.parallel_virtual_node 1.69% : 0.000004s : 25: predicate.partial_defer_inline 1.71% : 0.000004s : 26: predicate.partial_eliminate 0.95% : 0.000002s : 17: predicate.print_const_string_wrapper 0.59% : 0.000001s : 10: predicate.reduce_all_const_elim 1.18% : 0.000003s : 17: predicate.reduce_eliminate 2.54% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 10: predicate.remove_not_recompute_node 1.41% : 0.000004s : 31: predicate.replace_applicator 0.43% : 0.000001s : 10: predicate.replace_old_param 0.26% : 0.000001s : 5: predicate.reset_defer_inline 1.16% : 0.000003s : 17: predicate.reshape_eliminate 0.63% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.72% : 0.000002s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.82% : 0.000002s : 10: predicate.shard_identity_eliminate 0.74% : 0.000002s : 10: predicate.special_op_eliminate 0.79% : 0.000002s : 10: predicate.specialize_transform 0.95% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.67% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.53% : 0.000004s : 25: predicate.switch_defer_inline 2.00% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.77% : 0.000012s : 76: predicate.switch_simplify 0.94% : 0.000002s : 17: predicate.tile_eliminate 0.95% : 0.000002s : 17: predicate.transpose_eliminate 1.59% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.71% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.58% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.47% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.67% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.41% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.79% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.54% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.21% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 5: predicate.value_based_eliminate 0.61% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.60% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000502 11 52.51% : 0.000264s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.49% : 0.000238s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025676 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.10% : 0.003107s : 1: add_attr 12.06% : 0.003097s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000057s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.27% : 0.000069s : 1: auto_monad 0.09% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.75% : 0.000448s : 1: bootstrap 0.11% : 0.000027s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000017s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.10% : 0.000027s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.09% : 0.000024s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.69% : 0.000435s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.07% : 0.000530s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 5.51% : 0.001416s : 78: opt.transform.opt_a 0.14% : 0.000036s : 1: opt.transform.opt_after_cconv 0.11% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.51% : 0.000131s : 28: opt.transform.opt_b 0.23% : 0.000058s : 2: opt.transform.opt_trans_graph 0.17% : 0.000043s : 4: opt.transform.symbol_engine_opt 11.83% : 0.003038s : 1: opt_a 0.46% : 0.000118s : 1: opt_after_cconv 1.87% : 0.000480s : 1: opt_after_jit_grad 0.95% : 0.000243s : 1: opt_b 20.01% : 0.005138s : 1: optimize 0.08% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000037s : 1: pre_auto_parallel 0.12% : 0.000030s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000018s : 1: remove_dup_value 1.67% : 0.000429s : 1: renormalize.infer 1.24% : 0.000318s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000042s : 1: rewriter_after_opt_a 0.35% : 0.000089s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000085s : 1: symbol_engine_optimizer 0.34% : 0.000088s : 1: tuple_transform 21.85% : 0.005609s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:20.247.340 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:20.247.617 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0330411, [21] [bootstrap]: 0.0004616 [type_inference]: 0.00578396 [event_method]: 1.961e-05 [auto_monad]: 6.81e-05 [graph_reusing]: 5.92999e-06 [inline]: 2.27999e-06 [add_attr]: 0.00309582, [1] [add_attr_with_inline]: 0.00308791, [1] [Cycle 1]: 7.208e-05, [2] [tag_attr]: 2.018e-05 [meta_addattr_fg_expand]: 7.19001e-06 [parallel-infer-symbol]: 3.36001e-06 [pre_auto_parallel]: 3.306e-05 [insert-virtual-dataset]: 2.57001e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 2.02999e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.00592973, [53] [py_interpret_to_execute]: 3.118e-05 [rewriter_before_opt_a]: 9.12e-05 [opt_a]: 0.00345269, [2] [Cycle 1]: 0.00236964, [45] [expand_dump_flag]: 2.78e-06 [switch_simplify]: 4.53e-05 [loop_unroll]: 3.28e-05 [a_1]: 0.00069928 [with_stream_mark]: 1.681e-05 [recompute_prepare]: 1.106e-05 [updatestate_depend_eliminate]: 5.06997e-06 [updatestate_assign_eliminate]: 4.58999e-06 [updatestate_loads_eliminate]: 4.03001e-06 [parameter_eliminate]: 1.87999e-06 [a_2]: 0.00014941 [accelerated_algorithm]: 9.49999e-06 [shard]: 2.06998e-06 [meta_shard_fg_expand]: 2.19999e-06 [shard_inline]: 9.51e-06 [merge_send_recv]: 1.011e-05 [auto_parallel]: 7.83001e-06 [parallel]: 1.864e-05 [flash_sp]: 8.80999e-06 [merge_comm]: 5.81e-06 [allreduce_fusion]: 5.48002e-06 [matmul_add_comm_reduction]: 1.048e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 1.044e-05 [virtual_dataset]: 9.02e-06 [get_grad_eliminate_]: 8.57e-06 [virtual_output]: 9.30001e-06 [merge_forward]: 5.35001e-06 [cell_reuse_recompute_pass]: 1.27e-06 [offload_activation]: 1.168e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.005e-05 [merge_recompute_call_nodes]: 1.64998e-06 [before_grad]: 1.518e-05 [set_forward_comm_id_for_comm_node_pass]: 5.23002e-06 [meta_fg_expand]: 3.92998e-06 [flash_sp_send_recv_attached]: 2.59001e-06 [receive_attached]: 3.03998e-06 [after_resolve]: 1.398e-05 [a_after_grad]: 1.394e-05 [renormalize]: 0.00064355 [add_forward_monad_depend]: 5.19e-06 [auto_monad_grad]: 1.96e-06 [auto_monad_eliminator]: 1.824e-05 [cse]: 4.278e-05 [a_3]: 8.041e-05 [Cycle 2]: 0.00106941, [45] [expand_dump_flag]: 1.04e-06 [switch_simplify]: 1.038e-05 [loop_unroll]: 8.70001e-06 [a_1]: 0.00024277 [with_stream_mark]: 1.293e-05 [recompute_prepare]: 9.21002e-06 [updatestate_depend_eliminate]: 5.35999e-06 [updatestate_assign_eliminate]: 3.73001e-06 [updatestate_loads_eliminate]: 3.62002e-06 [parameter_eliminate]: 1.24e-06 [a_2]: 0.00013893 [accelerated_algorithm]: 8.84e-06 [shard]: 1.20999e-06 [meta_shard_fg_expand]: 1.82001e-06 [shard_inline]: 8.72e-06 [merge_send_recv]: 6.97002e-06 [auto_parallel]: 7.1e-06 [parallel]: 4.45e-06 [flash_sp]: 3.70998e-06 [merge_comm]: 5.67999e-06 [allreduce_fusion]: 5.15999e-06 [matmul_add_comm_reduction]: 7.68999e-06 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 9.56e-06 [virtual_dataset]: 9.05001e-06 [get_grad_eliminate_]: 8.60999e-06 [virtual_output]: 8.50999e-06 [merge_forward]: 4.59998e-06 [cell_reuse_recompute_pass]: 1.31002e-06 [offload_activation]: 8.97e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.87e-05 [merge_recompute_call_nodes]: 9.00007e-07 [before_grad]: 1.368e-05 [set_forward_comm_id_for_comm_node_pass]: 5.02999e-06 [meta_fg_expand]: 3.16999e-06 [flash_sp_send_recv_attached]: 9.5999e-07 [receive_attached]: 1.24998e-06 [after_resolve]: 1.289e-05 [a_after_grad]: 1.308e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.45999e-06 [auto_monad_grad]: 8.50006e-07 [auto_monad_eliminator]: 9.93998e-06 [cse]: 2.312e-05 [a_3]: 6.874e-05 [py_interpret_to_execute_after_opt_a]: 1.312e-05 [slice_cell_reuse_recomputed_activation]: 4.65001e-06 [rewriter_after_opt_a]: 4.974e-05 [convert_after_rewriter]: 1.13e-05 [order_py_execute_after_rewriter]: 9.54e-06 [mutable_eliminate]: 0.00048683 [opt_b]: 0.00035006, [1] [Cycle 1]: 0.00034016, [7] [b_1]: 0.00022973 [b_2]: 1.073e-05 [updatestate_depend_eliminate]: 6.86001e-06 [updatestate_assign_eliminate]: 3.6e-06 [updatestate_loads_eliminate]: 3.56001e-06 [renormalize]: 4.19997e-07 [cse]: 2.762e-05 [optimize_parallel_all_gather_comm]: 2.266e-05 [overlap_param_gather]: 5.37001e-06 [cconv]: 2.966e-05 [loop_unroll]: 0.00044318 [opt_after_cconv]: 0.00015325, [1] [Cycle 1]: 0.00014481, [7] [c_1]: 4.418e-05 [parameter_eliminate]: 2.53e-06 [updatestate_depend_eliminate]: 7.03e-06 [updatestate_assign_eliminate]: 4.05e-06 [updatestate_loads_eliminate]: 3.64002e-06 [cse]: 2.595e-05 [renormalize]: 3.7998e-07 [remove_dup_value]: 4.015e-05 [tuple_transform]: 0.00011332, [1] [Cycle 1]: 0.00010593, [4] [d_1]: 6.14e-05 [none_parameter_eliminate]: 1.72999e-06 [renormalize]: 4.00003e-07 [switch_simplify]: 1.009e-05 [partial_unused_args_eliminate]: 4.65999e-06 [add_recomputation]: 6.57e-05 [cse_after_recomputation]: 3.466e-05, [1] [Cycle 1]: 2.736e-05, [1] [cse]: 1.792e-05 [environ_conv]: 8.87999e-06 [swap_dp_allreduce_reducescatter]: 9.79e-06 [bias_add_comm_swap]: 5.61e-06 [label_micro_interleaved_index]: 6.80998e-06 [label_fine_grained_interleaved_index]: 5.13002e-06 [merge_cast_opt]: 3.69002e-06 [slice_recompute_activation]: 4.43999e-06 [micro_interleaved_order_control]: 5.09998e-06 [assign_add_opt]: 3.95e-06 [ForceFp32Comm]: 3.35e-06 [remove_cast_before_assign_add]: 3.35998e-06 [full_micro_interleaved_order_control]: 4.47e-06 [reorder_send_recv_between_fp_bp]: 4.95999e-06 [comm_op_add_attrs]: 3.53e-06 [add_comm_op_reuse_tag]: 3.31999e-06 [interleave_split_concat_branches]: 3.79002e-06 [interleave_parallel_branches]: 3.35e-06 [overlap_opt_shard_in_pipeline]: 4.08001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.28999e-06 [control_data_broadcast_order]: 1.973e-05 [grouped_pairwise_exchange_alltoall]: 3.91999e-06 [offloading_packed_experts]: 7.66001e-06 [overlap_recompute_and_grad_model_parallel]: 8.47e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.91999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.8e-06 [overlap_recompute_comm]: 5.21998e-06 [overlap_grad_ring_attention]: 8.2e-06 [overlap_grad_flash_sp]: 2.619e-05 [begin_end_overlap_inline]: 3.19001e-06 [split_matmul_comm_elemetwise]: 4.79e-06 [split_layernorm_comm]: 4.27e-06 [handle_group_info]: 3.36001e-06 [symbol_engine_optimizer]: 0.00011147, [1] [Cycle 1]: 0.00010497, [6] [build]: 3.46001e-06 [elim_shapecalc]: 1.281e-05 [elim_not_effective]: 1.757e-05 [opt_reshape]: 1.009e-05 [fold_const_symbol]: 1.487e-05 [renormalize]: 2.30008e-07 [detach_backward]: 3.14999e-06 [pipeline_parallel_scheduler]: 1.98997e-06 [auto_monad_reorder]: 2.323e-05 [get_jit_bprop_graph]: 1.25001e-06 [rewriter_after_jit_bprop_graph]: 4.17e-06 [opt_after_jit_grad]: 0.0169333 [validate]: 6.44e-05 Sums bootstrap : 0.000462s : 1.64% type_inference : 0.005784s : 20.52% event_method : 0.000020s : 0.07% auto_monad : 0.000068s : 0.24% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.03% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000033s : 0.12% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.11% optimize.rewriter_before_opt_a : 0.000091s : 0.32% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000056s : 0.20% optimize.opt_a.loop_unroll : 0.000042s : 0.15% optimize.opt_a.a_1 : 0.000942s : 3.34% optimize.opt_a.with_stream_mark : 0.000030s : 0.11% optimize.opt_a.recompute_prepare : 0.000020s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000288s : 1.02% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.07% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000018s : 0.06% optimize.opt_a.merge_send_recv : 0.000017s : 0.06% optimize.opt_a.auto_parallel : 0.000015s : 0.05% optimize.opt_a.parallel : 0.000023s : 0.08% optimize.opt_a.flash_sp : 0.000013s : 0.04% optimize.opt_a.merge_comm : 0.000011s : 0.04% optimize.opt_a.allreduce_fusion : 0.000011s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.07% optimize.opt_a.virtual_dataset : 0.000018s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.06% optimize.opt_a.virtual_output : 0.000018s : 0.06% optimize.opt_a.merge_forward : 0.000010s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.14% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000029s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.04% optimize.opt_a.meta_fg_expand : 0.000007s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000027s : 0.10% optimize.opt_a.a_after_grad : 0.000027s : 0.10% optimize.opt_a.renormalize : 0.000644s : 2.28% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.10% optimize.opt_a.cse : 0.000066s : 0.23% optimize.opt_a.a_3 : 0.000149s : 0.53% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000050s : 0.18% optimize.convert_after_rewriter : 0.000011s : 0.04% optimize.order_py_execute_after_rewriter : 0.000010s : 0.03% optimize.mutable_eliminate : 0.000487s : 1.73% optimize.opt_b.b_1 : 0.000230s : 0.82% optimize.opt_b.b_2 : 0.000011s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000028s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.08% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000030s : 0.11% optimize.loop_unroll : 0.000443s : 1.57% optimize.opt_after_cconv.c_1 : 0.000044s : 0.16% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000026s : 0.09% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000040s : 0.14% optimize.tuple_transform.d_1 : 0.000061s : 0.22% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.04% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000066s : 0.23% optimize.cse_after_recomputation.cse : 0.000018s : 0.06% optimize.environ_conv : 0.000009s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.03% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000004s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000004s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000020s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000008s : 0.03% optimize.overlap_grad_flash_sp : 0.000026s : 0.09% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000023s : 0.08% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.016933s : 60.08% validate : 0.000064s : 0.23% Time group info: ------[substitution.] 0.000209 48 15.46% : 0.000032s : 6: substitution.cast_eliminate 1.20% : 0.000003s : 4: substitution.elim_not_effective 0.99% : 0.000002s : 4: substitution.fold_const_symbol 3.50% : 0.000007s : 6: substitution.graph_param_transform 64.87% : 0.000135s : 4: substitution.inline 2.35% : 0.000005s : 8: substitution.j_node_and_user_rematch 3.56% : 0.000007s : 8: substitution.remove_not_recompute_node 2.15% : 0.000004s : 4: substitution.replace_old_param 5.93% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005735 2 87.94% : 0.005044s : 1: type_inference.infer 12.06% : 0.000692s : 1: type_inference.specialize ------[replace.] 0.000059 8 61.82% : 0.000037s : 4: replace.inline 38.18% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000143 8 92.69% : 0.000133s : 4: match.inline 7.31% : 0.000010s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000279 1730 0.89% : 0.000002s : 17: predicate.accumulaten_eliminater 1.95% : 0.000005s : 6: predicate.ad_related_special_op_eliminate 0.61% : 0.000002s : 12: predicate.addn_check_dump 0.84% : 0.000002s : 17: predicate.addn_zero_filter 0.81% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.12% : 0.000006s : 29: predicate.arithmetic_simplify 1.00% : 0.000003s : 17: predicate.cast_eliminate 0.66% : 0.000002s : 12: predicate.check_bprop_eliminate 0.62% : 0.000002s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.75% : 0.000002s : 12: predicate.depend_value_elim 0.97% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.02% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.79% : 0.000005s : 12: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 6: predicate.elim_not_effective 0.45% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.27% : 0.000004s : 23: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.08% : 0.000003s : 23: predicate.environ_get_depend_swap 1.76% : 0.000005s : 35: predicate.environ_get_eliminate 1.10% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.26% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.10% : 0.000006s : 25: predicate.float_depend_g_call 0.63% : 0.000002s : 12: predicate.float_environ_get_switch 0.91% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 6: predicate.fold_const_symbol 0.71% : 0.000002s : 12: predicate.get_grad_eliminate 0.25% : 0.000001s : 6: predicate.graph_param_transform 0.67% : 0.000002s : 12: predicate.incorporate_call 0.58% : 0.000002s : 12: predicate.incorporate_call_switch 6.09% : 0.000017s : 78: predicate.inline 0.83% : 0.000002s : 12: predicate.inline_without_move 0.35% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.82% : 0.000002s : 12: predicate.less_batch_normalization 1.78% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.47% : 0.000007s : 50: predicate.load_eliminater 0.77% : 0.000002s : 6: predicate.loop_unroll_after_grad 1.92% : 0.000005s : 38: predicate.loop_unroll_before_grad 1.71% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.66% : 0.000002s : 12: predicate.merge_addn 0.62% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.63% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 17: predicate.minmaximum_grad 0.73% : 0.000002s : 6: predicate.mutable_eliminate 0.39% : 0.000001s : 6: predicate.opt_reshape 0.37% : 0.000001s : 6: predicate.parallel_virtual_node 1.49% : 0.000004s : 25: predicate.partial_defer_inline 1.62% : 0.000005s : 27: predicate.partial_eliminate 0.87% : 0.000002s : 17: predicate.print_const_string_wrapper 0.63% : 0.000002s : 12: predicate.reduce_all_const_elim 1.05% : 0.000003s : 17: predicate.reduce_eliminate 2.45% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 12: predicate.remove_not_recompute_node 1.27% : 0.000004s : 33: predicate.replace_applicator 0.50% : 0.000001s : 12: predicate.replace_old_param 0.25% : 0.000001s : 6: predicate.reset_defer_inline 0.91% : 0.000003s : 17: predicate.reshape_eliminate 0.66% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 6: predicate.row_tensor_eliminate 0.73% : 0.000002s : 12: predicate.same_eliminate 0.46% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.85% : 0.000002s : 12: predicate.shard_identity_eliminate 0.79% : 0.000002s : 12: predicate.special_op_eliminate 0.77% : 0.000002s : 12: predicate.specialize_transform 0.93% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.38% : 0.000004s : 25: predicate.switch_defer_inline 2.02% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.65% : 0.000013s : 81: predicate.switch_simplify 0.85% : 0.000002s : 17: predicate.tile_eliminate 0.88% : 0.000002s : 17: predicate.transpose_eliminate 1.56% : 0.000004s : 29: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000004s : 29: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.25% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.55% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.34% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.73% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.41% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.20% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 6: predicate.value_based_eliminate 0.68% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.69% : 0.000002s : 12: predicate.virtual_output_eliminate 0.34% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000512 11 55.79% : 0.000286s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.21% : 0.000226s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.044536 192 0.01% : 0.000006s : 1: ForceFp32Comm 6.97% : 0.003105s : 1: add_attr 6.94% : 0.003092s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.16% : 0.000069s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.17% : 0.000077s : 1: auto_monad 0.07% : 0.000031s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.14% : 0.000507s : 1: bootstrap 0.07% : 0.000033s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.05% : 0.000023s : 1: control_data_broadcast_order 0.03% : 0.000014s : 1: convert_after_rewriter 0.08% : 0.000038s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000018s : 1: detach_backward 0.03% : 0.000012s : 1: environ_conv 0.07% : 0.000030s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000007s : 1: get_jit_bprop_graph 0.03% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 1.01% : 0.000449s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 1.11% : 0.000493s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.04% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000017s : 1: opt.transform.mutable_eliminate 3.50% : 0.001557s : 78: opt.transform.opt_a 0.10% : 0.000043s : 1: opt.transform.opt_after_cconv 0.14% : 0.000060s : 1: opt.transform.opt_after_jit_grad 0.37% : 0.000166s : 28: opt.transform.opt_b 0.16% : 0.000069s : 2: opt.transform.opt_trans_graph 0.12% : 0.000052s : 4: opt.transform.symbol_engine_opt 7.76% : 0.003456s : 1: opt_a 0.35% : 0.000157s : 1: opt_after_cconv 38.07% : 0.016954s : 1: opt_after_jit_grad 0.79% : 0.000354s : 1: opt_b 14.10% : 0.006278s : 1: optimize 0.06% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000012s : 1: order_py_execute_after_rewriter 0.07% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000011s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.09% : 0.000040s : 1: pre_auto_parallel 0.08% : 0.000035s : 1: py_interpret_to_execute 0.04% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.10% : 0.000044s : 1: remove_dup_value 0.78% : 0.000346s : 1: renormalize.infer 0.65% : 0.000291s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.12% : 0.000053s : 1: rewriter_after_opt_a 0.21% : 0.000095s : 1: rewriter_before_opt_a 0.02% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.03% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.26% : 0.000114s : 1: symbol_engine_optimizer 0.26% : 0.000116s : 1: tuple_transform 13.06% : 0.005818s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:20.801.888 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0154999, [21] [bootstrap]: 0.0004286 [type_inference]: 0.00564698 [event_method]: 1.9e-05 [auto_monad]: 6.758e-05 [graph_reusing]: 5.92001e-06 [inline]: 2.11998e-06 [add_attr]: 0.00306971, [1] [add_attr_with_inline]: 0.00306125, [1] [Cycle 1]: 5.997e-05, [2] [tag_attr]: 2.036e-05 [meta_addattr_fg_expand]: 6.33e-06 [parallel-infer-symbol]: 3.09001e-06 [pre_auto_parallel]: 3.452e-05 [insert-virtual-dataset]: 2.40002e-06 [parallel-infer-symbol-second]: 7.90023e-07 [dataset_repeat_opt]: 1.96e-06 [pipeline_split]: 1.65001e-06 [optimize]: 0.00550875, [53] [py_interpret_to_execute]: 2.802e-05 [rewriter_before_opt_a]: 8.857e-05 [opt_a]: 0.00319332, [2] [Cycle 1]: 0.00230125, [45] [expand_dump_flag]: 3.12002e-06 [switch_simplify]: 4.454e-05 [loop_unroll]: 3.231e-05 [a_1]: 0.00071036 [with_stream_mark]: 1.64e-05 [recompute_prepare]: 1.196e-05 [updatestate_depend_eliminate]: 5.20001e-06 [updatestate_assign_eliminate]: 4.17998e-06 [updatestate_loads_eliminate]: 3.93001e-06 [parameter_eliminate]: 1.82001e-06 [a_2]: 0.00012008 [accelerated_algorithm]: 9.39e-06 [shard]: 2.02001e-06 [meta_shard_fg_expand]: 2.34001e-06 [shard_inline]: 8.77e-06 [merge_send_recv]: 1.018e-05 [auto_parallel]: 7.68001e-06 [parallel]: 1.802e-05 [flash_sp]: 8.72e-06 [merge_comm]: 5.35999e-06 [allreduce_fusion]: 5.05001e-06 [matmul_add_comm_reduction]: 1.12e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 1.1e-05 [virtual_dataset]: 8.92999e-06 [get_grad_eliminate_]: 8.82e-06 [virtual_output]: 9.16998e-06 [merge_forward]: 5.10001e-06 [cell_reuse_recompute_pass]: 1.25001e-06 [offload_activation]: 1.185e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.852e-05 [merge_recompute_call_nodes]: 1.54998e-06 [before_grad]: 1.52e-05 [set_forward_comm_id_for_comm_node_pass]: 5.30999e-06 [meta_fg_expand]: 3.91001e-06 [flash_sp_send_recv_attached]: 2.48e-06 [receive_attached]: 1.96998e-06 [after_resolve]: 1.422e-05 [a_after_grad]: 1.346e-05 [renormalize]: 0.00073936 [add_forward_monad_depend]: 6.34999e-06 [auto_monad_grad]: 2.93e-06 [auto_monad_eliminator]: 1.876e-05 [cse]: 4.439e-05 [a_3]: 6.645e-05 [Cycle 2]: 0.00088209, [45] [expand_dump_flag]: 1.29e-06 [switch_simplify]: 1.017e-05 [loop_unroll]: 8.84003e-06 [a_1]: 0.00022594 [with_stream_mark]: 1.45e-05 [recompute_prepare]: 9.01998e-06 [updatestate_depend_eliminate]: 4.95999e-06 [updatestate_assign_eliminate]: 3.62002e-06 [updatestate_loads_eliminate]: 3.80998e-06 [parameter_eliminate]: 1.33002e-06 [a_2]: 0.00010976 [accelerated_algorithm]: 8.89998e-06 [shard]: 1.51998e-06 [meta_shard_fg_expand]: 2.16e-06 [shard_inline]: 8.48001e-06 [merge_send_recv]: 7.23999e-06 [auto_parallel]: 7.5e-06 [parallel]: 5.86998e-06 [flash_sp]: 3.68e-06 [merge_comm]: 4.80999e-06 [allreduce_fusion]: 5.20001e-06 [matmul_add_comm_reduction]: 8.82e-06 [allreduce_slice_to_reducescatter]: 5.19998e-07 [virtual_shard_identity]: 9.41e-06 [virtual_dataset]: 8.64e-06 [get_grad_eliminate_]: 8.09002e-06 [virtual_output]: 8.1e-06 [merge_forward]: 4.28001e-06 [cell_reuse_recompute_pass]: 1.57999e-06 [offload_activation]: 9.39e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.761e-05 [merge_recompute_call_nodes]: 1.22e-06 [before_grad]: 1.387e-05 [set_forward_comm_id_for_comm_node_pass]: 5.32999e-06 [meta_fg_expand]: 3.33e-06 [flash_sp_send_recv_attached]: 1.02998e-06 [receive_attached]: 1.05001e-06 [after_resolve]: 1.293e-05 [a_after_grad]: 1.276e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.23002e-06 [auto_monad_grad]: 1.16002e-06 [auto_monad_eliminator]: 9.76e-06 [cse]: 2.409e-05 [a_3]: 5.424e-05 [py_interpret_to_execute_after_opt_a]: 1.233e-05 [slice_cell_reuse_recomputed_activation]: 1.97001e-06 [rewriter_after_opt_a]: 4.685e-05 [convert_after_rewriter]: 8.14002e-06 [order_py_execute_after_rewriter]: 6.14999e-06 [mutable_eliminate]: 0.00054617 [opt_b]: 0.00028567, [1] [Cycle 1]: 0.0002795, [7] [b_1]: 0.00018253 [b_2]: 1.108e-05 [updatestate_depend_eliminate]: 8.66002e-06 [updatestate_assign_eliminate]: 3.81999e-06 [updatestate_loads_eliminate]: 3.93999e-06 [renormalize]: 1.00999e-06 [cse]: 3.147e-05 [optimize_parallel_all_gather_comm]: 1.97e-05 [overlap_param_gather]: 2.49999e-06 [cconv]: 3.058e-05 [loop_unroll]: 0.00044282 [opt_after_cconv]: 0.00013278, [1] [Cycle 1]: 0.00012696, [7] [c_1]: 4.248e-05 [parameter_eliminate]: 4.15999e-06 [updatestate_depend_eliminate]: 7.66001e-06 [updatestate_assign_eliminate]: 3.78999e-06 [updatestate_loads_eliminate]: 3.45e-06 [cse]: 2.812e-05 [renormalize]: 1.04e-06 [remove_dup_value]: 4.321e-05 [tuple_transform]: 9.805e-05, [1] [Cycle 1]: 9.252e-05, [4] [d_1]: 6.194e-05 [none_parameter_eliminate]: 1.79998e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 9.37999e-06 [partial_unused_args_eliminate]: 2.08002e-06 [add_recomputation]: 6.59e-05 [cse_after_recomputation]: 2.808e-05, [1] [Cycle 1]: 2.336e-05, [1] [cse]: 1.743e-05 [environ_conv]: 6.58e-06 [swap_dp_allreduce_reducescatter]: 6.96001e-06 [bias_add_comm_swap]: 2.66e-06 [label_micro_interleaved_index]: 4.44998e-06 [label_fine_grained_interleaved_index]: 3.13e-06 [merge_cast_opt]: 1.37e-06 [slice_recompute_activation]: 1.94e-06 [micro_interleaved_order_control]: 3.16999e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 1.31002e-06 [remove_cast_before_assign_add]: 1.32999e-06 [full_micro_interleaved_order_control]: 2.42001e-06 [reorder_send_recv_between_fp_bp]: 2.66e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.05999e-06 [interleave_parallel_branches]: 1.08001e-06 [overlap_opt_shard_in_pipeline]: 1.22e-06 [overlap_opt_shard_grad_in_pipeline]: 1.79e-06 [control_data_broadcast_order]: 1.74e-05 [grouped_pairwise_exchange_alltoall]: 1.77999e-06 [offloading_packed_experts]: 4.67e-06 [overlap_recompute_and_grad_model_parallel]: 5.57999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.25999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.57999e-06 [overlap_recompute_comm]: 2.31e-06 [overlap_grad_ring_attention]: 5.08002e-06 [overlap_grad_flash_sp]: 2.54e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 2.24999e-06 [split_layernorm_comm]: 1.67999e-06 [handle_group_info]: 9.20001e-07 [symbol_engine_optimizer]: 9.138e-05, [1] [Cycle 1]: 8.695e-05, [6] [build]: 3.7e-06 [elim_shapecalc]: 1.368e-05 [elim_not_effective]: 1.731e-05 [opt_reshape]: 9.96e-06 [fold_const_symbol]: 1.406e-05 [renormalize]: 2.60014e-07 [detach_backward]: 2.26e-06 [pipeline_parallel_scheduler]: 1.81e-06 [auto_monad_reorder]: 2.076e-05 [get_jit_bprop_graph]: 1.86003e-06 [rewriter_after_jit_bprop_graph]: 4.18001e-06 [opt_after_jit_grad]: 0.00048496 [validate]: 4.573e-05 Sums bootstrap : 0.000429s : 3.74% type_inference : 0.005647s : 49.24% event_method : 0.000019s : 0.17% auto_monad : 0.000068s : 0.59% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000035s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.24% optimize.rewriter_before_opt_a : 0.000089s : 0.77% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.48% optimize.opt_a.loop_unroll : 0.000041s : 0.36% optimize.opt_a.a_1 : 0.000936s : 8.16% optimize.opt_a.with_stream_mark : 0.000031s : 0.27% optimize.opt_a.recompute_prepare : 0.000021s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000230s : 2.00% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.16% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000017s : 0.15% optimize.opt_a.merge_send_recv : 0.000017s : 0.15% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.09% optimize.opt_a.allreduce_fusion : 0.000010s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.18% optimize.opt_a.virtual_dataset : 0.000018s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.15% optimize.opt_a.virtual_output : 0.000017s : 0.15% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000021s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000029s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.24% optimize.opt_a.a_after_grad : 0.000026s : 0.23% optimize.opt_a.renormalize : 0.000739s : 6.45% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.25% optimize.opt_a.cse : 0.000068s : 0.60% optimize.opt_a.a_3 : 0.000121s : 1.05% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000047s : 0.41% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000546s : 4.76% optimize.opt_b.b_1 : 0.000183s : 1.59% optimize.opt_b.b_2 : 0.000011s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000031s : 0.27% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000031s : 0.27% optimize.loop_unroll : 0.000443s : 3.86% optimize.opt_after_cconv.c_1 : 0.000042s : 0.37% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000028s : 0.25% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000043s : 0.38% optimize.tuple_transform.d_1 : 0.000062s : 0.54% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000066s : 0.57% optimize.cse_after_recomputation.cse : 0.000017s : 0.15% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.03% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000017s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000025s : 0.22% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.09% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000021s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000485s : 4.23% validate : 0.000046s : 0.40% Time group info: ------[substitution.] 0.000220 48 14.27% : 0.000031s : 6: substitution.cast_eliminate 1.13% : 0.000002s : 4: substitution.elim_not_effective 0.93% : 0.000002s : 4: substitution.fold_const_symbol 3.54% : 0.000008s : 6: substitution.graph_param_transform 66.18% : 0.000146s : 4: substitution.inline 2.51% : 0.000006s : 8: substitution.j_node_and_user_rematch 3.63% : 0.000008s : 8: substitution.remove_not_recompute_node 2.17% : 0.000005s : 4: substitution.replace_old_param 5.63% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005590 2 87.50% : 0.004891s : 1: type_inference.infer 12.50% : 0.000699s : 1: type_inference.specialize ------[replace.] 0.000060 8 62.91% : 0.000038s : 4: replace.inline 37.09% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000153 8 93.12% : 0.000143s : 4: match.inline 6.88% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000271 1730 0.85% : 0.000002s : 17: predicate.accumulaten_eliminater 0.97% : 0.000003s : 6: predicate.ad_related_special_op_eliminate 0.60% : 0.000002s : 12: predicate.addn_check_dump 0.89% : 0.000002s : 17: predicate.addn_zero_filter 0.84% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.98% : 0.000005s : 29: predicate.arithmetic_simplify 1.20% : 0.000003s : 17: predicate.cast_eliminate 0.65% : 0.000002s : 12: predicate.check_bprop_eliminate 0.62% : 0.000002s : 12: predicate.compare_switch_simplify 0.21% : 0.000001s : 6: predicate.const_output_eliminate 0.65% : 0.000002s : 12: predicate.depend_value_elim 0.98% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.06% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 6: predicate.elim_not_effective 0.44% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 23: predicate.environ_get_depend_swap 1.88% : 0.000005s : 35: predicate.environ_get_eliminate 1.11% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.28% : 0.000003s : 25: predicate.exchange_switch_depend_value 2.03% : 0.000006s : 25: predicate.float_depend_g_call 0.64% : 0.000002s : 12: predicate.float_environ_get_switch 0.88% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.72% : 0.000002s : 12: predicate.get_grad_eliminate 0.25% : 0.000001s : 6: predicate.graph_param_transform 0.69% : 0.000002s : 12: predicate.incorporate_call 0.60% : 0.000002s : 12: predicate.incorporate_call_switch 6.30% : 0.000017s : 78: predicate.inline 0.86% : 0.000002s : 12: predicate.inline_without_move 0.36% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 12: predicate.less_batch_normalization 1.89% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.48% : 0.000007s : 50: predicate.load_eliminater 0.91% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.05% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 29: predicate.make_slice_get_slice_eliminator 0.66% : 0.000002s : 12: predicate.merge_addn 0.63% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.65% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 17: predicate.minmaximum_grad 0.95% : 0.000003s : 6: predicate.mutable_eliminate 0.35% : 0.000001s : 6: predicate.opt_reshape 0.41% : 0.000001s : 6: predicate.parallel_virtual_node 1.65% : 0.000004s : 25: predicate.partial_defer_inline 1.64% : 0.000004s : 27: predicate.partial_eliminate 0.86% : 0.000002s : 17: predicate.print_const_string_wrapper 0.60% : 0.000002s : 12: predicate.reduce_all_const_elim 1.12% : 0.000003s : 17: predicate.reduce_eliminate 2.48% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 12: predicate.remove_not_recompute_node 1.35% : 0.000004s : 33: predicate.replace_applicator 0.43% : 0.000001s : 12: predicate.replace_old_param 0.27% : 0.000001s : 6: predicate.reset_defer_inline 0.96% : 0.000003s : 17: predicate.reshape_eliminate 0.69% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 6: predicate.row_tensor_eliminate 0.82% : 0.000002s : 12: predicate.same_eliminate 0.46% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.79% : 0.000002s : 12: predicate.shard_identity_eliminate 0.89% : 0.000002s : 12: predicate.special_op_eliminate 0.82% : 0.000002s : 12: predicate.specialize_transform 0.81% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.34% : 0.000004s : 25: predicate.switch_defer_inline 1.93% : 0.000005s : 37: predicate.switch_layer_defer_inline 4.63% : 0.000013s : 81: predicate.switch_simplify 0.89% : 0.000002s : 17: predicate.tile_eliminate 0.90% : 0.000002s : 17: predicate.transpose_eliminate 1.65% : 0.000004s : 29: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000004s : 29: predicate.tuple_list_get_item_const_eliminator 1.63% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.11% : 0.000008s : 45: predicate.tuple_list_get_item_eliminator 1.56% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.39% : 0.000006s : 41: predicate.tuple_list_set_item_eliminator 1.76% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.43% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.23% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 6: predicate.value_based_eliminate 0.76% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.84% : 0.000002s : 12: predicate.virtual_output_eliminate 0.32% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000512 11 53.73% : 0.000275s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.27% : 0.000237s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026619 192 0.02% : 0.000004s : 1: ForceFp32Comm 11.55% : 0.003075s : 1: add_attr 11.51% : 0.003065s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.26% : 0.000070s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.27% : 0.000073s : 1: auto_monad 0.09% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.72% : 0.000457s : 1: bootstrap 0.13% : 0.000034s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000021s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.12% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.09% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.69% : 0.000451s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 2.08% : 0.000554s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000020s : 1: opt.transform.mutable_eliminate 5.78% : 0.001537s : 78: opt.transform.opt_a 0.15% : 0.000041s : 1: opt.transform.opt_after_cconv 0.13% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.61% : 0.000163s : 28: opt.transform.opt_b 0.26% : 0.000069s : 2: opt.transform.opt_trans_graph 0.19% : 0.000051s : 4: opt.transform.symbol_engine_opt 12.01% : 0.003196s : 1: opt_a 0.51% : 0.000136s : 1: opt_after_cconv 1.86% : 0.000494s : 1: opt_after_jit_grad 1.09% : 0.000289s : 1: opt_b 20.71% : 0.005514s : 1: optimize 0.09% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.11% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000039s : 1: pre_auto_parallel 0.12% : 0.000032s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.18% : 0.000048s : 1: remove_dup_value 1.51% : 0.000402s : 1: renormalize.infer 1.24% : 0.000330s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000051s : 1: rewriter_after_opt_a 0.35% : 0.000093s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000094s : 1: symbol_engine_optimizer 0.38% : 0.000101s : 1: tuple_transform 21.27% : 0.005661s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:21.295.424 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:21.295.687 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0457295, [21] [bootstrap]: 0.00042263 [type_inference]: 0.0226287 [event_method]: 1.939e-05 [auto_monad]: 6.555e-05 [graph_reusing]: 5.89e-06 [inline]: 2.42001e-06 [add_attr]: 0.00320206, [1] [add_attr_with_inline]: 0.00319306, [1] [Cycle 1]: 0.00010649, [2] [tag_attr]: 1.981e-05 [meta_addattr_fg_expand]: 5.61e-06 [parallel-infer-symbol]: 2.96001e-06 [pre_auto_parallel]: 3.255e-05 [insert-virtual-dataset]: 2.29001e-06 [parallel-infer-symbol-second]: 7.79983e-07 [dataset_repeat_opt]: 1.96e-06 [pipeline_split]: 1.52999e-06 [optimize]: 0.0180689, [53] [py_interpret_to_execute]: 2.951e-05 [rewriter_before_opt_a]: 8.754e-05 [opt_a]: 0.00320007, [2] [Cycle 1]: 0.00223777, [45] [expand_dump_flag]: 2.82002e-06 [switch_simplify]: 4.179e-05 [loop_unroll]: 3.13e-05 [a_1]: 0.00066035 [with_stream_mark]: 1.399e-05 [recompute_prepare]: 9.67001e-06 [updatestate_depend_eliminate]: 4.47998e-06 [updatestate_assign_eliminate]: 4.05e-06 [updatestate_loads_eliminate]: 4.13999e-06 [parameter_eliminate]: 1.87999e-06 [a_2]: 0.00012932 [accelerated_algorithm]: 8.18001e-06 [shard]: 2.39999e-06 [meta_shard_fg_expand]: 2.26998e-06 [shard_inline]: 8.58001e-06 [merge_send_recv]: 9.42999e-06 [auto_parallel]: 7.33999e-06 [parallel]: 1.838e-05 [flash_sp]: 8.07e-06 [merge_comm]: 4.69998e-06 [allreduce_fusion]: 4.25999e-06 [matmul_add_comm_reduction]: 1.023e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 9.42001e-06 [virtual_dataset]: 8.09002e-06 [get_grad_eliminate_]: 7.66001e-06 [virtual_output]: 7.69002e-06 [merge_forward]: 4.33999e-06 [cell_reuse_recompute_pass]: 1.27999e-06 [offload_activation]: 9.92999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.707e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.272e-05 [set_forward_comm_id_for_comm_node_pass]: 4.77e-06 [meta_fg_expand]: 3.31999e-06 [flash_sp_send_recv_attached]: 2.31e-06 [receive_attached]: 1.92999e-06 [after_resolve]: 1.259e-05 [a_after_grad]: 1.194e-05 [renormalize]: 0.00062068 [add_forward_monad_depend]: 5.52001e-06 [auto_monad_grad]: 2.26998e-06 [auto_monad_eliminator]: 1.681e-05 [cse]: 3.505e-05 [a_3]: 7.234e-05 [Cycle 2]: 0.00094997, [45] [expand_dump_flag]: 9.89996e-07 [switch_simplify]: 8.82e-06 [loop_unroll]: 7.60998e-06 [a_1]: 0.00018509 [with_stream_mark]: 1.185e-05 [recompute_prepare]: 7.98999e-06 [updatestate_depend_eliminate]: 3.75e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 2.96001e-06 [parameter_eliminate]: 1.00001e-06 [a_2]: 0.00012227 [accelerated_algorithm]: 8.38001e-06 [shard]: 1.12e-06 [meta_shard_fg_expand]: 1.73002e-06 [shard_inline]: 8.45999e-06 [merge_send_recv]: 6.19001e-06 [auto_parallel]: 7.3e-06 [parallel]: 4.72e-06 [flash_sp]: 3.86999e-06 [merge_comm]: 4.58001e-06 [allreduce_fusion]: 5.02e-06 [matmul_add_comm_reduction]: 6.80998e-06 [allreduce_slice_to_reducescatter]: 4.50003e-07 [virtual_shard_identity]: 8.59e-06 [virtual_dataset]: 7.25e-06 [get_grad_eliminate_]: 7.01999e-06 [virtual_output]: 6.99001e-06 [merge_forward]: 3.16999e-06 [cell_reuse_recompute_pass]: 1.55999e-06 [offload_activation]: 7.71999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.683e-05 [merge_recompute_call_nodes]: 7.2e-07 [before_grad]: 1.206e-05 [set_forward_comm_id_for_comm_node_pass]: 4.37e-06 [meta_fg_expand]: 2.59999e-06 [flash_sp_send_recv_attached]: 7.79983e-07 [receive_attached]: 1.02998e-06 [after_resolve]: 1.128e-05 [a_after_grad]: 1.106e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.30001e-06 [auto_monad_grad]: 8.50006e-07 [auto_monad_eliminator]: 8.57e-06 [cse]: 1.766e-05 [a_3]: 5.836e-05 [py_interpret_to_execute_after_opt_a]: 1.162e-05 [slice_cell_reuse_recomputed_activation]: 4.43001e-06 [rewriter_after_opt_a]: 4.108e-05 [convert_after_rewriter]: 1.035e-05 [order_py_execute_after_rewriter]: 8.91002e-06 [mutable_eliminate]: 0.0126069 [opt_b]: 0.0003705, [1] [Cycle 1]: 0.00035738, [7] [b_1]: 0.00021522 [b_2]: 1.153e-05 [updatestate_depend_eliminate]: 1.393e-05 [updatestate_assign_eliminate]: 3.86999e-06 [updatestate_loads_eliminate]: 3.68e-06 [renormalize]: 1.49998e-06 [cse]: 4.643e-05 [optimize_parallel_all_gather_comm]: 3.069e-05 [overlap_param_gather]: 5.57001e-06 [cconv]: 4.083e-05 [loop_unroll]: 0.0006958 [opt_after_cconv]: 0.00015342, [1] [Cycle 1]: 0.00014322, [7] [c_1]: 3.931e-05 [parameter_eliminate]: 6.20002e-06 [updatestate_depend_eliminate]: 7.98001e-06 [updatestate_assign_eliminate]: 3.28998e-06 [updatestate_loads_eliminate]: 3.04001e-06 [cse]: 2.59e-05 [renormalize]: 4.59986e-07 [remove_dup_value]: 1.931e-05 [tuple_transform]: 0.00011023, [1] [Cycle 1]: 0.0001027, [4] [d_1]: 5.97e-05 [none_parameter_eliminate]: 2.31998e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 9.19998e-06 [partial_unused_args_eliminate]: 4.52e-06 [add_recomputation]: 6.705e-05 [cse_after_recomputation]: 3.178e-05, [1] [Cycle 1]: 2.459e-05, [1] [cse]: 1.524e-05 [environ_conv]: 1.063e-05 [swap_dp_allreduce_reducescatter]: 9.10999e-06 [bias_add_comm_swap]: 6.37001e-06 [label_micro_interleaved_index]: 8.13001e-06 [label_fine_grained_interleaved_index]: 5.24e-06 [merge_cast_opt]: 3.98001e-06 [slice_recompute_activation]: 4.47e-06 [micro_interleaved_order_control]: 5.09998e-06 [assign_add_opt]: 3.68e-06 [ForceFp32Comm]: 4.12e-06 [remove_cast_before_assign_add]: 4.02998e-06 [full_micro_interleaved_order_control]: 4.53999e-06 [reorder_send_recv_between_fp_bp]: 5.51e-06 [comm_op_add_attrs]: 3.81999e-06 [add_comm_op_reuse_tag]: 3.71001e-06 [interleave_split_concat_branches]: 3.66999e-06 [interleave_parallel_branches]: 3.85e-06 [overlap_opt_shard_in_pipeline]: 3.66999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.16001e-06 [control_data_broadcast_order]: 2.053e-05 [grouped_pairwise_exchange_alltoall]: 4.02002e-06 [offloading_packed_experts]: 7.3e-06 [overlap_recompute_and_grad_model_parallel]: 7.46999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.56999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.95998e-06 [overlap_recompute_comm]: 5.19e-06 [overlap_grad_ring_attention]: 7.13998e-06 [overlap_grad_flash_sp]: 2.818e-05 [begin_end_overlap_inline]: 2.86e-06 [split_matmul_comm_elemetwise]: 4.75001e-06 [split_layernorm_comm]: 3.85998e-06 [handle_group_info]: 3.30998e-06 [symbol_engine_optimizer]: 0.00011417, [1] [Cycle 1]: 0.00010652, [6] [build]: 3.88001e-06 [elim_shapecalc]: 1.447e-05 [elim_not_effective]: 1.646e-05 [opt_reshape]: 9.20001e-06 [fold_const_symbol]: 1.275e-05 [renormalize]: 4.30009e-07 [detach_backward]: 4.63999e-06 [pipeline_parallel_scheduler]: 2.12001e-06 [auto_monad_reorder]: 2.64e-05 [get_jit_bprop_graph]: 2.16e-06 [rewriter_after_jit_bprop_graph]: 5.89999e-06 [opt_after_jit_grad]: 0.00056717 [validate]: 4.828e-05 Sums bootstrap : 0.000423s : 1.04% type_inference : 0.022629s : 55.57% event_method : 0.000019s : 0.05% auto_monad : 0.000066s : 0.16% graph_reusing : 0.000006s : 0.01% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000033s : 0.08% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000030s : 0.07% optimize.rewriter_before_opt_a : 0.000088s : 0.21% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000051s : 0.12% optimize.opt_a.loop_unroll : 0.000039s : 0.10% optimize.opt_a.a_1 : 0.000845s : 2.08% optimize.opt_a.with_stream_mark : 0.000026s : 0.06% optimize.opt_a.recompute_prepare : 0.000018s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000252s : 0.62% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.04% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000017s : 0.04% optimize.opt_a.merge_send_recv : 0.000016s : 0.04% optimize.opt_a.auto_parallel : 0.000015s : 0.04% optimize.opt_a.parallel : 0.000023s : 0.06% optimize.opt_a.flash_sp : 0.000012s : 0.03% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.04% optimize.opt_a.virtual_dataset : 0.000015s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000015s : 0.04% optimize.opt_a.merge_forward : 0.000008s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000018s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000024s : 0.06% optimize.opt_a.a_after_grad : 0.000023s : 0.06% optimize.opt_a.renormalize : 0.000621s : 1.52% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.06% optimize.opt_a.cse : 0.000053s : 0.13% optimize.opt_a.a_3 : 0.000131s : 0.32% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.01% optimize.rewriter_after_opt_a : 0.000041s : 0.10% optimize.convert_after_rewriter : 0.000010s : 0.03% optimize.order_py_execute_after_rewriter : 0.000009s : 0.02% optimize.mutable_eliminate : 0.012607s : 30.96% optimize.opt_b.b_1 : 0.000215s : 0.53% optimize.opt_b.b_2 : 0.000012s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000014s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000046s : 0.11% optimize.optimize_parallel_all_gather_comm : 0.000031s : 0.08% optimize.overlap_param_gather : 0.000006s : 0.01% optimize.cconv : 0.000041s : 0.10% optimize.loop_unroll : 0.000696s : 1.71% optimize.opt_after_cconv.c_1 : 0.000039s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000026s : 0.06% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.05% optimize.tuple_transform.d_1 : 0.000060s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.02% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000067s : 0.16% optimize.cse_after_recomputation.cse : 0.000015s : 0.04% optimize.environ_conv : 0.000011s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000008s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000004s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000021s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000028s : 0.07% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000026s : 0.06% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000567s : 1.39% validate : 0.000048s : 0.12% Time group info: ------[substitution.] 0.000199 38 10.84% : 0.000022s : 3: substitution.cast_eliminate 1.10% : 0.000002s : 3: substitution.elim_not_effective 0.79% : 0.000002s : 3: substitution.fold_const_symbol 3.60% : 0.000007s : 5: substitution.graph_param_transform 69.88% : 0.000139s : 4: substitution.inline 2.22% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.28% : 0.000007s : 6: substitution.remove_not_recompute_node 2.16% : 0.000004s : 4: substitution.replace_old_param 6.13% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.022575 2 96.74% : 0.021839s : 1: type_inference.infer 3.26% : 0.000737s : 1: type_inference.specialize ------[replace.] 0.000060 8 61.68% : 0.000037s : 4: replace.inline 38.32% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000146 8 92.91% : 0.000136s : 4: match.inline 7.09% : 0.000010s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000252 1504 0.83% : 0.000002s : 15: predicate.accumulaten_eliminater 0.73% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 10: predicate.addn_check_dump 0.88% : 0.000002s : 15: predicate.addn_zero_filter 0.77% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.07% : 0.000005s : 25: predicate.arithmetic_simplify 0.93% : 0.000002s : 15: predicate.cast_eliminate 0.59% : 0.000001s : 10: predicate.check_bprop_eliminate 0.57% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.62% : 0.000002s : 10: predicate.depend_value_elim 0.90% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.98% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 0.47% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 20: predicate.environ_get_depend_swap 1.77% : 0.000004s : 30: predicate.environ_get_eliminate 1.05% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.29% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.08% : 0.000005s : 23: predicate.float_depend_g_call 0.55% : 0.000001s : 10: predicate.float_environ_get_switch 0.87% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.67% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.62% : 0.000002s : 10: predicate.incorporate_call 0.52% : 0.000001s : 10: predicate.incorporate_call_switch 6.49% : 0.000016s : 68: predicate.inline 0.79% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.81% : 0.000002s : 10: predicate.less_batch_normalization 1.79% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.49% : 0.000006s : 44: predicate.load_eliminater 1.01% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.05% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.78% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.60% : 0.000002s : 10: predicate.merge_addn 0.60% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 15: predicate.minmaximum_grad 2.78% : 0.000007s : 5: predicate.mutable_eliminate 0.39% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.71% : 0.000004s : 23: predicate.partial_defer_inline 1.54% : 0.000004s : 24: predicate.partial_eliminate 0.83% : 0.000002s : 15: predicate.print_const_string_wrapper 0.62% : 0.000002s : 10: predicate.reduce_all_const_elim 1.04% : 0.000003s : 15: predicate.reduce_eliminate 2.51% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 10: predicate.remove_not_recompute_node 1.27% : 0.000003s : 29: predicate.replace_applicator 0.48% : 0.000001s : 10: predicate.replace_old_param 0.50% : 0.000001s : 5: predicate.reset_defer_inline 0.94% : 0.000002s : 15: predicate.reshape_eliminate 0.60% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.53% : 0.000001s : 5: predicate.row_tensor_eliminate 0.71% : 0.000002s : 10: predicate.same_eliminate 0.44% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 10: predicate.shard_identity_eliminate 0.85% : 0.000002s : 10: predicate.special_op_eliminate 0.73% : 0.000002s : 10: predicate.specialize_transform 1.05% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.43% : 0.000004s : 23: predicate.switch_defer_inline 1.96% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.65% : 0.000012s : 74: predicate.switch_simplify 0.81% : 0.000002s : 15: predicate.tile_eliminate 0.87% : 0.000002s : 15: predicate.transpose_eliminate 1.56% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.66% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.48% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.24% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.82% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.38% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.21% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 5: predicate.value_based_eliminate 0.66% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.63% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.50% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000598 11 60.37% : 0.000361s : 5: func_graph_cloner_run.FuncGraphClonerGraph 39.63% : 0.000237s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.069234 192 0.01% : 0.000007s : 1: ForceFp32Comm 4.64% : 0.003212s : 1: add_attr 4.62% : 0.003197s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.10% : 0.000071s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.11% : 0.000074s : 1: auto_monad 0.05% : 0.000034s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.67% : 0.000466s : 1: bootstrap 0.06% : 0.000044s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.04% : 0.000024s : 1: control_data_broadcast_order 0.02% : 0.000013s : 1: convert_after_rewriter 0.05% : 0.000035s : 1: cse_after_recomputation 0.01% : 0.000007s : 1: dataset_repeat_opt 0.03% : 0.000022s : 1: detach_backward 0.02% : 0.000014s : 1: environ_conv 0.04% : 0.000030s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000012s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000011s : 1: label_micro_interleaved_index 1.01% : 0.000702s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 18.23% : 0.012619s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.03% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000041s : 1: opt.transform.mutable_eliminate 1.97% : 0.001364s : 78: opt.transform.opt_a 0.05% : 0.000038s : 1: opt.transform.opt_after_cconv 0.05% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.21% : 0.000147s : 28: opt.transform.opt_b 0.10% : 0.000066s : 2: opt.transform.opt_trans_graph 0.07% : 0.000048s : 4: opt.transform.symbol_engine_opt 4.63% : 0.003203s : 1: opt_a 0.23% : 0.000157s : 1: opt_after_cconv 0.83% : 0.000577s : 1: opt_after_jit_grad 0.54% : 0.000375s : 1: opt_b 26.62% : 0.018432s : 1: optimize 0.05% : 0.000034s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.05% : 0.000032s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000009s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.06% : 0.000040s : 1: pre_auto_parallel 0.05% : 0.000033s : 1: py_interpret_to_execute 0.02% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.03% : 0.000023s : 1: remove_dup_value 0.48% : 0.000334s : 1: renormalize.infer 0.40% : 0.000280s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000044s : 1: rewriter_after_opt_a 0.13% : 0.000091s : 1: rewriter_before_opt_a 0.01% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.17% : 0.000117s : 1: symbol_engine_optimizer 0.16% : 0.000113s : 1: tuple_transform 32.75% : 0.022673s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:21.822.511 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0297088, [21] [bootstrap]: 0.00043981 [type_inference]: 0.00596219 [event_method]: 1.973e-05 [auto_monad]: 6.654e-05 [graph_reusing]: 6.39001e-06 [inline]: 2.54001e-06 [add_attr]: 0.00322229, [1] [add_attr_with_inline]: 0.00321283, [1] [Cycle 1]: 6.005e-05, [2] [tag_attr]: 1.95e-05 [meta_addattr_fg_expand]: 6.17001e-06 [parallel-infer-symbol]: 3.78999e-06 [pre_auto_parallel]: 3.548e-05 [insert-virtual-dataset]: 2.53e-06 [parallel-infer-symbol-second]: 8.99978e-07 [dataset_repeat_opt]: 2.07999e-06 [pipeline_split]: 2.16e-06 [optimize]: 0.0192455, [53] [py_interpret_to_execute]: 2.617e-05 [rewriter_before_opt_a]: 8.405e-05 [opt_a]: 0.0169336, [2] [Cycle 1]: 0.016096, [45] [expand_dump_flag]: 3.02002e-06 [switch_simplify]: 4.428e-05 [loop_unroll]: 3.11e-05 [a_1]: 0.00067103 [with_stream_mark]: 1.576e-05 [recompute_prepare]: 9.92999e-06 [updatestate_depend_eliminate]: 4.63999e-06 [updatestate_assign_eliminate]: 3.8e-06 [updatestate_loads_eliminate]: 3.92002e-06 [parameter_eliminate]: 1.77001e-06 [a_2]: 0.00010295 [accelerated_algorithm]: 8.36002e-06 [shard]: 2.02001e-06 [meta_shard_fg_expand]: 2.09e-06 [shard_inline]: 8.02e-06 [merge_send_recv]: 9.82999e-06 [auto_parallel]: 7.11001e-06 [parallel]: 1.929e-05 [flash_sp]: 9.12001e-06 [merge_comm]: 4.77998e-06 [allreduce_fusion]: 4.18001e-06 [matmul_add_comm_reduction]: 1.064e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 9.37999e-06 [virtual_dataset]: 7.90998e-06 [get_grad_eliminate_]: 7.68999e-06 [virtual_output]: 7.73001e-06 [merge_forward]: 4.89998e-06 [cell_reuse_recompute_pass]: 1.15999e-06 [offload_activation]: 1.128e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.563e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.277e-05 [set_forward_comm_id_for_comm_node_pass]: 4.43999e-06 [meta_fg_expand]: 3.28e-06 [flash_sp_send_recv_attached]: 2.80002e-06 [receive_attached]: 2.59001e-06 [after_resolve]: 1.409e-05 [a_after_grad]: 1.194e-05 [renormalize]: 0.0145765 [add_forward_monad_depend]: 1.063e-05 [auto_monad_grad]: 2.41e-06 [auto_monad_eliminator]: 2.521e-05 [cse]: 4.077e-05 [a_3]: 7.431e-05 [Cycle 2]: 0.00082488, [45] [expand_dump_flag]: 2.27001e-06 [switch_simplify]: 1.088e-05 [loop_unroll]: 8.05e-06 [a_1]: 0.00019284 [with_stream_mark]: 1.922e-05 [recompute_prepare]: 7.97998e-06 [updatestate_depend_eliminate]: 4.82e-06 [updatestate_assign_eliminate]: 3.41999e-06 [updatestate_loads_eliminate]: 3.58e-06 [parameter_eliminate]: 1.84998e-06 [a_2]: 9.395e-05 [accelerated_algorithm]: 8.59e-06 [shard]: 2.86e-06 [meta_shard_fg_expand]: 2.18998e-06 [shard_inline]: 7.55003e-06 [merge_send_recv]: 9.47999e-06 [auto_parallel]: 9.27999e-06 [parallel]: 8.57e-06 [flash_sp]: 4.19002e-06 [merge_comm]: 4.3e-06 [allreduce_fusion]: 4.39998e-06 [matmul_add_comm_reduction]: 9.55001e-06 [allreduce_slice_to_reducescatter]: 7.99977e-07 [virtual_shard_identity]: 8.43999e-06 [virtual_dataset]: 7.77998e-06 [get_grad_eliminate_]: 7.18e-06 [virtual_output]: 7.55998e-06 [merge_forward]: 4.47998e-06 [cell_reuse_recompute_pass]: 3.11999e-06 [offload_activation]: 1.164e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.519e-05 [merge_recompute_call_nodes]: 1.35999e-06 [before_grad]: 1.336e-05 [set_forward_comm_id_for_comm_node_pass]: 4.71002e-06 [meta_fg_expand]: 3.76999e-06 [flash_sp_send_recv_attached]: 1.49e-06 [receive_attached]: 2.26e-06 [after_resolve]: 1.393e-05 [a_after_grad]: 1.186e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.22e-06 [auto_monad_grad]: 1.02e-06 [auto_monad_eliminator]: 8.70999e-06 [cse]: 1.66e-05 [a_3]: 4.521e-05 [py_interpret_to_execute_after_opt_a]: 1.977e-05 [slice_cell_reuse_recomputed_activation]: 2.41e-06 [rewriter_after_opt_a]: 4.291e-05 [convert_after_rewriter]: 7.95e-06 [order_py_execute_after_rewriter]: 5.86e-06 [mutable_eliminate]: 0.00070776 [opt_b]: 0.00024126, [1] [Cycle 1]: 0.0002344, [7] [b_1]: 0.00015546 [b_2]: 9.56998e-06 [updatestate_depend_eliminate]: 5.34e-06 [updatestate_assign_eliminate]: 3.04999e-06 [updatestate_loads_eliminate]: 3.06001e-06 [renormalize]: 7.80012e-07 [cse]: 2.132e-05 [optimize_parallel_all_gather_comm]: 1.72e-05 [overlap_param_gather]: 2.11e-06 [cconv]: 2.425e-05 [loop_unroll]: 0.00042297 [opt_after_cconv]: 0.00011316, [1] [Cycle 1]: 0.00010728, [7] [c_1]: 3.673e-05 [parameter_eliminate]: 2.66e-06 [updatestate_depend_eliminate]: 5.92999e-06 [updatestate_assign_eliminate]: 3.00002e-06 [updatestate_loads_eliminate]: 2.76e-06 [cse]: 2.145e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.499e-05 [tuple_transform]: 8.55e-05, [1] [Cycle 1]: 8.082e-05, [4] [d_1]: 5.152e-05 [none_parameter_eliminate]: 1.54e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 8.50999e-06 [partial_unused_args_eliminate]: 2.12999e-06 [add_recomputation]: 5.497e-05 [cse_after_recomputation]: 2.473e-05, [1] [Cycle 1]: 2.012e-05, [1] [cse]: 1.465e-05 [environ_conv]: 6.53e-06 [swap_dp_allreduce_reducescatter]: 5.56e-06 [bias_add_comm_swap]: 2.70002e-06 [label_micro_interleaved_index]: 4.22e-06 [label_fine_grained_interleaved_index]: 2.66e-06 [merge_cast_opt]: 1.24e-06 [slice_recompute_activation]: 2.01998e-06 [micro_interleaved_order_control]: 2.29001e-06 [assign_add_opt]: 1.22999e-06 [ForceFp32Comm]: 1.12999e-06 [remove_cast_before_assign_add]: 1.29e-06 [full_micro_interleaved_order_control]: 2.35997e-06 [reorder_send_recv_between_fp_bp]: 3.01999e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 1.12999e-06 [interleave_split_concat_branches]: 1.25999e-06 [interleave_parallel_branches]: 1.09e-06 [overlap_opt_shard_in_pipeline]: 1.64e-06 [overlap_opt_shard_grad_in_pipeline]: 1.80001e-06 [control_data_broadcast_order]: 1.413e-05 [grouped_pairwise_exchange_alltoall]: 1.42e-06 [offloading_packed_experts]: 4.08999e-06 [overlap_recompute_and_grad_model_parallel]: 5.29e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.50001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.27999e-06 [overlap_grad_ring_attention]: 4.26001e-06 [overlap_grad_flash_sp]: 2.356e-05 [begin_end_overlap_inline]: 6.60017e-07 [split_matmul_comm_elemetwise]: 2.68e-06 [split_layernorm_comm]: 1.65001e-06 [handle_group_info]: 9.09989e-07 [symbol_engine_optimizer]: 8.41e-05, [1] [Cycle 1]: 7.964e-05, [6] [build]: 4.53999e-06 [elim_shapecalc]: 1.141e-05 [elim_not_effective]: 1.497e-05 [opt_reshape]: 8.3e-06 [fold_const_symbol]: 1.189e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.17999e-06 [pipeline_parallel_scheduler]: 1.47999e-06 [auto_monad_reorder]: 1.972e-05 [get_jit_bprop_graph]: 1.76e-06 [rewriter_after_jit_bprop_graph]: 3.86001e-06 [opt_after_jit_grad]: 0.00047103 [validate]: 4.42e-05 Sums bootstrap : 0.000440s : 1.73% type_inference : 0.005962s : 23.39% event_method : 0.000020s : 0.08% auto_monad : 0.000067s : 0.26% graph_reusing : 0.000006s : 0.03% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000035s : 0.14% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000026s : 0.10% optimize.rewriter_before_opt_a : 0.000084s : 0.33% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000055s : 0.22% optimize.opt_a.loop_unroll : 0.000039s : 0.15% optimize.opt_a.a_1 : 0.000864s : 3.39% optimize.opt_a.with_stream_mark : 0.000035s : 0.14% optimize.opt_a.recompute_prepare : 0.000018s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000197s : 0.77% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.07% optimize.opt_a.shard : 0.000005s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000016s : 0.06% optimize.opt_a.merge_send_recv : 0.000019s : 0.08% optimize.opt_a.auto_parallel : 0.000016s : 0.06% optimize.opt_a.parallel : 0.000028s : 0.11% optimize.opt_a.flash_sp : 0.000013s : 0.05% optimize.opt_a.merge_comm : 0.000009s : 0.04% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.07% optimize.opt_a.virtual_dataset : 0.000016s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.06% optimize.opt_a.virtual_output : 0.000015s : 0.06% optimize.opt_a.merge_forward : 0.000009s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.02% optimize.opt_a.offload_activation : 0.000023s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.04% optimize.opt_a.meta_fg_expand : 0.000007s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000028s : 0.11% optimize.opt_a.a_after_grad : 0.000024s : 0.09% optimize.opt_a.renormalize : 0.014577s : 57.17% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.05% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.13% optimize.opt_a.cse : 0.000057s : 0.23% optimize.opt_a.a_3 : 0.000120s : 0.47% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000043s : 0.17% optimize.convert_after_rewriter : 0.000008s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000708s : 2.78% optimize.opt_b.b_1 : 0.000155s : 0.61% optimize.opt_b.b_2 : 0.000010s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000024s : 0.10% optimize.loop_unroll : 0.000423s : 1.66% optimize.opt_after_cconv.c_1 : 0.000037s : 0.14% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000021s : 0.08% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.06% optimize.tuple_transform.d_1 : 0.000052s : 0.20% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000055s : 0.22% optimize.cse_after_recomputation.cse : 0.000015s : 0.06% optimize.environ_conv : 0.000007s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000024s : 0.09% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000020s : 0.08% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000471s : 1.85% validate : 0.000044s : 0.17% Time group info: ------[substitution.] 0.000218 38 14.00% : 0.000030s : 3: substitution.cast_eliminate 1.13% : 0.000002s : 3: substitution.elim_not_effective 0.78% : 0.000002s : 3: substitution.fold_const_symbol 2.94% : 0.000006s : 5: substitution.graph_param_transform 66.69% : 0.000145s : 4: substitution.inline 2.72% : 0.000006s : 6: substitution.j_node_and_user_rematch 3.19% : 0.000007s : 6: substitution.remove_not_recompute_node 2.84% : 0.000006s : 4: substitution.replace_old_param 5.72% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005897 2 87.64% : 0.005168s : 1: type_inference.infer 12.36% : 0.000729s : 1: type_inference.specialize ------[replace.] 0.000061 8 61.92% : 0.000037s : 4: replace.inline 38.08% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000153 8 93.11% : 0.000142s : 4: match.inline 6.89% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000244 1504 0.95% : 0.000002s : 15: predicate.accumulaten_eliminater 0.65% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.59% : 0.000001s : 10: predicate.addn_check_dump 1.02% : 0.000003s : 15: predicate.addn_zero_filter 0.84% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.09% : 0.000005s : 25: predicate.arithmetic_simplify 1.18% : 0.000003s : 15: predicate.cast_eliminate 0.62% : 0.000002s : 10: predicate.check_bprop_eliminate 0.58% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.63% : 0.000002s : 10: predicate.depend_value_elim 0.93% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.21% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.00% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 5: predicate.elim_not_effective 0.36% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 20: predicate.environ_get_depend_swap 1.74% : 0.000004s : 30: predicate.environ_get_eliminate 1.17% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.42% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.35% : 0.000006s : 23: predicate.float_depend_g_call 0.59% : 0.000001s : 10: predicate.float_environ_get_switch 0.86% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.77% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000000s : 5: predicate.graph_param_transform 0.61% : 0.000001s : 10: predicate.incorporate_call 0.54% : 0.000001s : 10: predicate.incorporate_call_switch 6.03% : 0.000015s : 68: predicate.inline 0.84% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.98% : 0.000002s : 10: predicate.less_batch_normalization 1.72% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.42% : 0.000006s : 44: predicate.load_eliminater 0.74% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.16% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.79% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.62% : 0.000002s : 10: predicate.merge_addn 0.58% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.62% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 15: predicate.minmaximum_grad 0.82% : 0.000002s : 5: predicate.mutable_eliminate 0.46% : 0.000001s : 5: predicate.opt_reshape 0.32% : 0.000001s : 5: predicate.parallel_virtual_node 1.89% : 0.000005s : 23: predicate.partial_defer_inline 1.60% : 0.000004s : 24: predicate.partial_eliminate 0.96% : 0.000002s : 15: predicate.print_const_string_wrapper 0.59% : 0.000001s : 10: predicate.reduce_all_const_elim 1.27% : 0.000003s : 15: predicate.reduce_eliminate 2.53% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 10: predicate.remove_not_recompute_node 1.39% : 0.000003s : 29: predicate.replace_applicator 0.57% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 0.97% : 0.000002s : 15: predicate.reshape_eliminate 0.61% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 5: predicate.row_tensor_eliminate 0.80% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.77% : 0.000002s : 10: predicate.shard_identity_eliminate 0.74% : 0.000002s : 10: predicate.special_op_eliminate 0.74% : 0.000002s : 10: predicate.specialize_transform 0.99% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.93% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.45% : 0.000004s : 23: predicate.switch_defer_inline 2.00% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.94% : 0.000012s : 74: predicate.switch_simplify 0.90% : 0.000002s : 15: predicate.tile_eliminate 1.08% : 0.000003s : 15: predicate.transpose_eliminate 1.53% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.30% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.50% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.77% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.36% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.08% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 5: predicate.value_based_eliminate 0.73% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.71% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000604 11 46.60% : 0.000281s : 5: func_graph_cloner_run.FuncGraphClonerGraph 53.40% : 0.000323s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.068334 192 0.01% : 0.000004s : 1: ForceFp32Comm 4.72% : 0.003228s : 1: add_attr 4.71% : 0.003216s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000059s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.10% : 0.000072s : 1: auto_monad 0.03% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.68% : 0.000467s : 1: bootstrap 0.04% : 0.000028s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000017s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.04% : 0.000028s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.04% : 0.000026s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.63% : 0.000431s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.05% : 0.000717s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000016s : 1: opt.transform.mutable_eliminate 2.04% : 0.001397s : 78: opt.transform.opt_a 0.05% : 0.000035s : 1: opt.transform.opt_after_cconv 0.04% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000134s : 28: opt.transform.opt_b 0.08% : 0.000058s : 2: opt.transform.opt_trans_graph 0.06% : 0.000043s : 4: opt.transform.symbol_engine_opt 24.79% : 0.016937s : 1: opt_a 0.17% : 0.000117s : 1: opt_after_cconv 0.70% : 0.000480s : 1: opt_after_jit_grad 0.36% : 0.000245s : 1: opt_b 28.17% : 0.019251s : 1: optimize 0.03% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.06% : 0.000040s : 1: pre_auto_parallel 0.04% : 0.000030s : 1: py_interpret_to_execute 0.03% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000018s : 1: remove_dup_value 20.55% : 0.014040s : 1: renormalize.infer 0.76% : 0.000518s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000047s : 1: rewriter_after_opt_a 0.13% : 0.000088s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.000087s : 1: symbol_engine_optimizer 0.13% : 0.000088s : 1: tuple_transform 8.75% : 0.005982s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:22.335.003 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:22.335.258 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0322014, [21] [bootstrap]: 0.0165088 [type_inference]: 0.00578242 [event_method]: 1.806e-05 [auto_monad]: 6.104e-05 [graph_reusing]: 6.59999e-06 [inline]: 2.29001e-06 [add_attr]: 0.00325062, [1] [add_attr_with_inline]: 0.00324111, [1] [Cycle 1]: 7.405e-05, [2] [tag_attr]: 1.987e-05 [meta_addattr_fg_expand]: 5.91e-06 [parallel-infer-symbol]: 3.39001e-06 [pre_auto_parallel]: 3.332e-05 [insert-virtual-dataset]: 2.31e-06 [parallel-infer-symbol-second]: 7.10017e-07 [dataset_repeat_opt]: 1.82001e-06 [pipeline_split]: 1.59998e-06 [optimize]: 0.00534774, [53] [py_interpret_to_execute]: 2.889e-05 [rewriter_before_opt_a]: 8.577e-05 [opt_a]: 0.00305453, [2] [Cycle 1]: 0.00216738, [45] [expand_dump_flag]: 2.89999e-06 [switch_simplify]: 4.567e-05 [loop_unroll]: 3.185e-05 [a_1]: 0.00064556 [with_stream_mark]: 1.601e-05 [recompute_prepare]: 9.66998e-06 [updatestate_depend_eliminate]: 4.02e-06 [updatestate_assign_eliminate]: 3.28998e-06 [updatestate_loads_eliminate]: 3.47002e-06 [parameter_eliminate]: 1.87001e-06 [a_2]: 0.00011068 [accelerated_algorithm]: 7.51001e-06 [shard]: 1.92999e-06 [meta_shard_fg_expand]: 1.93002e-06 [shard_inline]: 7.26999e-06 [merge_send_recv]: 8.42998e-06 [auto_parallel]: 6.41998e-06 [parallel]: 1.875e-05 [flash_sp]: 8.82999e-06 [merge_comm]: 4.40999e-06 [allreduce_fusion]: 4.02e-06 [matmul_add_comm_reduction]: 9.96e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 8.60999e-06 [virtual_dataset]: 7.42998e-06 [get_grad_eliminate_]: 6.58e-06 [virtual_output]: 6.89999e-06 [merge_forward]: 3.77002e-06 [cell_reuse_recompute_pass]: 1.11002e-06 [offload_activation]: 1.02e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.429e-05 [merge_recompute_call_nodes]: 1.69e-06 [before_grad]: 1.07e-05 [set_forward_comm_id_for_comm_node_pass]: 3.86999e-06 [meta_fg_expand]: 2.93998e-06 [flash_sp_send_recv_attached]: 2.83e-06 [receive_attached]: 2.29001e-06 [after_resolve]: 1.237e-05 [a_after_grad]: 1.02e-05 [renormalize]: 0.0006043 [add_forward_monad_depend]: 5.00999e-06 [auto_monad_grad]: 2.63e-06 [auto_monad_eliminator]: 1.554e-05 [cse]: 2.807e-05 [a_3]: 6.042e-05 [Cycle 2]: 0.00087383, [45] [expand_dump_flag]: 1.09e-06 [switch_simplify]: 7.83999e-06 [loop_unroll]: 6.64001e-06 [a_1]: 0.00013095 [with_stream_mark]: 1.082e-05 [recompute_prepare]: 3.011e-05 [updatestate_depend_eliminate]: 3.56999e-06 [updatestate_assign_eliminate]: 2.41e-06 [updatestate_loads_eliminate]: 2.12999e-06 [parameter_eliminate]: 1.40999e-06 [a_2]: 0.00010765 [accelerated_algorithm]: 6.64001e-06 [shard]: 1.27999e-06 [meta_shard_fg_expand]: 1.32e-06 [shard_inline]: 6.34001e-06 [merge_send_recv]: 4.91997e-06 [auto_parallel]: 6.19999e-06 [parallel]: 4.84998e-06 [flash_sp]: 3.48999e-06 [merge_comm]: 3.83999e-06 [allreduce_fusion]: 3.65998e-06 [matmul_add_comm_reduction]: 6.59999e-06 [allreduce_slice_to_reducescatter]: 4.59986e-07 [virtual_shard_identity]: 8.25999e-06 [virtual_dataset]: 7.03e-06 [get_grad_eliminate_]: 6.36e-06 [virtual_output]: 6.21998e-06 [merge_forward]: 2.98e-06 [cell_reuse_recompute_pass]: 1.69e-06 [offload_activation]: 6.59001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.521e-05 [merge_recompute_call_nodes]: 6.89994e-07 [before_grad]: 1.06e-05 [set_forward_comm_id_for_comm_node_pass]: 3.28998e-06 [meta_fg_expand]: 2.11998e-06 [flash_sp_send_recv_attached]: 9.70002e-07 [receive_attached]: 1.12999e-06 [after_resolve]: 1.32e-05 [a_after_grad]: 9.71998e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.44e-06 [auto_monad_grad]: 1.10999e-06 [auto_monad_eliminator]: 7.69002e-06 [cse]: 1.38e-05 [a_3]: 4.903e-05 [py_interpret_to_execute_after_opt_a]: 1.224e-05 [slice_cell_reuse_recomputed_activation]: 5.20999e-06 [rewriter_after_opt_a]: 3.995e-05 [convert_after_rewriter]: 1.004e-05 [order_py_execute_after_rewriter]: 8.07e-06 [mutable_eliminate]: 0.00051241 [opt_b]: 0.00026885, [1] [Cycle 1]: 0.0002593, [7] [b_1]: 0.00016685 [b_2]: 8.12998e-06 [updatestate_depend_eliminate]: 5.77999e-06 [updatestate_assign_eliminate]: 2.65002e-06 [updatestate_loads_eliminate]: 2.36e-06 [renormalize]: 3.00002e-07 [cse]: 1.752e-05 [optimize_parallel_all_gather_comm]: 2.048e-05 [overlap_param_gather]: 5.14e-06 [cconv]: 2.676e-05 [loop_unroll]: 0.00046666 [opt_after_cconv]: 0.00012868, [1] [Cycle 1]: 0.00011999, [7] [c_1]: 3.386e-05 [parameter_eliminate]: 2.86e-06 [updatestate_depend_eliminate]: 5.64e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.42001e-06 [cse]: 1.742e-05 [renormalize]: 2.69996e-07 [remove_dup_value]: 1.653e-05 [tuple_transform]: 9.049e-05, [1] [Cycle 1]: 8.372e-05, [4] [d_1]: 4.452e-05 [none_parameter_eliminate]: 1.71e-06 [renormalize]: 1.79978e-07 [switch_simplify]: 7.27002e-06 [partial_unused_args_eliminate]: 5.40001e-06 [add_recomputation]: 4.999e-05 [cse_after_recomputation]: 2.611e-05, [1] [Cycle 1]: 1.941e-05, [1] [cse]: 1.06e-05 [environ_conv]: 8.00999e-06 [swap_dp_allreduce_reducescatter]: 7.71001e-06 [bias_add_comm_swap]: 5.66998e-06 [label_micro_interleaved_index]: 6.93e-06 [label_fine_grained_interleaved_index]: 5.47001e-06 [merge_cast_opt]: 3.88001e-06 [slice_recompute_activation]: 4.73001e-06 [micro_interleaved_order_control]: 4.49002e-06 [assign_add_opt]: 3.56001e-06 [ForceFp32Comm]: 3.26999e-06 [remove_cast_before_assign_add]: 3.8e-06 [full_micro_interleaved_order_control]: 4.43999e-06 [reorder_send_recv_between_fp_bp]: 5.14e-06 [comm_op_add_attrs]: 3.53e-06 [add_comm_op_reuse_tag]: 3.25e-06 [interleave_split_concat_branches]: 3.66001e-06 [interleave_parallel_branches]: 4.02e-06 [overlap_opt_shard_in_pipeline]: 3.75998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.2e-06 [control_data_broadcast_order]: 1.511e-05 [grouped_pairwise_exchange_alltoall]: 4.17e-06 [offloading_packed_experts]: 6.69999e-06 [overlap_recompute_and_grad_model_parallel]: 7.48999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.53999e-06 [overlap_recompute_allgather_and_fa_grad]: 4.03001e-06 [overlap_recompute_comm]: 4.89e-06 [overlap_grad_ring_attention]: 6.43003e-06 [overlap_grad_flash_sp]: 2.307e-05 [begin_end_overlap_inline]: 3.11001e-06 [split_matmul_comm_elemetwise]: 4.68999e-06 [split_layernorm_comm]: 4.30999e-06 [handle_group_info]: 3.86001e-06 [symbol_engine_optimizer]: 9.444e-05, [1] [Cycle 1]: 8.772e-05, [6] [build]: 3.16001e-06 [elim_shapecalc]: 9.64e-06 [elim_not_effective]: 1.241e-05 [opt_reshape]: 7.25998e-06 [fold_const_symbol]: 1.004e-05 [renormalize]: 2.80008e-07 [detach_backward]: 3.83999e-06 [pipeline_parallel_scheduler]: 1.91998e-06 [auto_monad_reorder]: 2.018e-05 [get_jit_bprop_graph]: 1.56002e-06 [rewriter_after_jit_bprop_graph]: 4.55001e-06 [opt_after_jit_grad]: 0.00050902 [validate]: 3.932e-05 Sums bootstrap : 0.016509s : 60.74% type_inference : 0.005782s : 21.28% event_method : 0.000018s : 0.07% auto_monad : 0.000061s : 0.22% graph_reusing : 0.000007s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000033s : 0.12% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.11% optimize.rewriter_before_opt_a : 0.000086s : 0.32% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000054s : 0.20% optimize.opt_a.loop_unroll : 0.000038s : 0.14% optimize.opt_a.a_1 : 0.000777s : 2.86% optimize.opt_a.with_stream_mark : 0.000027s : 0.10% optimize.opt_a.recompute_prepare : 0.000040s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000218s : 0.80% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.05% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.05% optimize.opt_a.merge_send_recv : 0.000013s : 0.05% optimize.opt_a.auto_parallel : 0.000013s : 0.05% optimize.opt_a.parallel : 0.000024s : 0.09% optimize.opt_a.flash_sp : 0.000012s : 0.05% optimize.opt_a.merge_comm : 0.000008s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.06% optimize.opt_a.virtual_dataset : 0.000014s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.05% optimize.opt_a.virtual_output : 0.000013s : 0.05% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000017s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000021s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.03% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000026s : 0.09% optimize.opt_a.a_after_grad : 0.000020s : 0.07% optimize.opt_a.renormalize : 0.000604s : 2.22% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.09% optimize.opt_a.cse : 0.000042s : 0.15% optimize.opt_a.a_3 : 0.000109s : 0.40% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.15% optimize.convert_after_rewriter : 0.000010s : 0.04% optimize.order_py_execute_after_rewriter : 0.000008s : 0.03% optimize.mutable_eliminate : 0.000512s : 1.89% optimize.opt_b.b_1 : 0.000167s : 0.61% optimize.opt_b.b_2 : 0.000008s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.08% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000027s : 0.10% optimize.loop_unroll : 0.000467s : 1.72% optimize.opt_after_cconv.c_1 : 0.000034s : 0.12% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000017s : 0.06% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.06% optimize.tuple_transform.d_1 : 0.000045s : 0.16% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000050s : 0.18% optimize.cse_after_recomputation.cse : 0.000011s : 0.04% optimize.environ_conv : 0.000008s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.03% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000004s : 0.02% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000004s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000006s : 0.02% optimize.overlap_grad_flash_sp : 0.000023s : 0.08% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000509s : 1.87% validate : 0.000039s : 0.14% Time group info: ------[substitution.] 0.000180 28 1.03% : 0.000002s : 2: substitution.elim_not_effective 0.74% : 0.000001s : 2: substitution.fold_const_symbol 3.30% : 0.000006s : 4: substitution.graph_param_transform 79.23% : 0.000143s : 4: substitution.inline 1.79% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.61% : 0.000005s : 4: substitution.remove_not_recompute_node 2.68% : 0.000005s : 4: substitution.replace_old_param 8.62% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005730 2 87.96% : 0.005040s : 1: type_inference.infer 12.04% : 0.000690s : 1: type_inference.specialize ------[replace.] 0.000062 8 61.28% : 0.000038s : 4: replace.inline 38.72% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000153 8 91.17% : 0.000140s : 4: match.inline 8.83% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000213 1278 0.97% : 0.000002s : 13: predicate.accumulaten_eliminater 0.66% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 0.90% : 0.000002s : 13: predicate.addn_zero_filter 0.83% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.02% : 0.000004s : 21: predicate.arithmetic_simplify 0.92% : 0.000002s : 13: predicate.cast_eliminate 0.60% : 0.000001s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.61% : 0.000001s : 8: predicate.depend_value_elim 1.11% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.03% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.16% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.40% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_depend_swap 1.83% : 0.000004s : 25: predicate.environ_get_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.51% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.43% : 0.000005s : 21: predicate.float_depend_g_call 0.48% : 0.000001s : 8: predicate.float_environ_get_switch 0.75% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.69% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.56% : 0.000001s : 8: predicate.incorporate_call 0.50% : 0.000001s : 8: predicate.incorporate_call_switch 6.24% : 0.000013s : 58: predicate.inline 0.78% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 8: predicate.less_batch_normalization 1.87% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.62% : 0.000006s : 38: predicate.load_eliminater 0.92% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.25% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.60% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 8: predicate.merge_addn 0.65% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 13: predicate.minmaximum_grad 1.03% : 0.000002s : 4: predicate.mutable_eliminate 0.37% : 0.000001s : 4: predicate.opt_reshape 0.46% : 0.000001s : 4: predicate.parallel_virtual_node 1.87% : 0.000004s : 21: predicate.partial_defer_inline 1.70% : 0.000004s : 21: predicate.partial_eliminate 1.01% : 0.000002s : 13: predicate.print_const_string_wrapper 0.59% : 0.000001s : 8: predicate.reduce_all_const_elim 1.44% : 0.000003s : 13: predicate.reduce_eliminate 2.70% : 0.000006s : 38: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 8: predicate.remove_not_recompute_node 1.32% : 0.000003s : 25: predicate.replace_applicator 0.49% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 0.94% : 0.000002s : 13: predicate.reshape_eliminate 0.61% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 4: predicate.row_tensor_eliminate 0.77% : 0.000002s : 8: predicate.same_eliminate 0.39% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.85% : 0.000002s : 8: predicate.shard_identity_eliminate 0.66% : 0.000001s : 8: predicate.special_op_eliminate 0.69% : 0.000001s : 8: predicate.specialize_transform 0.71% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.53% : 0.000003s : 21: predicate.switch_defer_inline 2.17% : 0.000005s : 29: predicate.switch_layer_defer_inline 5.37% : 0.000011s : 67: predicate.switch_simplify 0.84% : 0.000002s : 13: predicate.tile_eliminate 0.94% : 0.000002s : 13: predicate.transpose_eliminate 1.55% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.13% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.83% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.55% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.04% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.67% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.75% : 0.000002s : 8: predicate.virtual_output_eliminate 0.29% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.56% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000518 11 54.40% : 0.000282s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.60% : 0.000236s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.042771 192 0.01% : 0.000006s : 1: ForceFp32Comm 7.62% : 0.003260s : 1: add_attr 7.59% : 0.003245s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.13% : 0.000054s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.16% : 0.000070s : 1: auto_monad 0.07% : 0.000028s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000009s : 1: bias_add_comm_swap 38.73% : 0.016567s : 1: bootstrap 0.07% : 0.000030s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.04% : 0.000018s : 1: control_data_broadcast_order 0.03% : 0.000013s : 1: convert_after_rewriter 0.07% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.05% : 0.000020s : 1: detach_backward 0.03% : 0.000011s : 1: environ_conv 0.07% : 0.000028s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.03% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000007s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 1.11% : 0.000473s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 1.21% : 0.000519s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.03% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000015s : 1: opt.transform.mutable_eliminate 2.86% : 0.001224s : 78: opt.transform.opt_a 0.08% : 0.000032s : 1: opt.transform.opt_after_cconv 0.06% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.24% : 0.000103s : 28: opt.transform.opt_b 0.12% : 0.000049s : 2: opt.transform.opt_trans_graph 0.08% : 0.000036s : 4: opt.transform.symbol_engine_opt 7.15% : 0.003058s : 1: opt_a 0.31% : 0.000132s : 1: opt_after_cconv 1.21% : 0.000519s : 1: opt_after_jit_grad 0.64% : 0.000273s : 1: opt_b 13.29% : 0.005684s : 1: optimize 0.06% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000011s : 1: order_py_execute_after_rewriter 0.06% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.09% : 0.000040s : 1: pre_auto_parallel 0.08% : 0.000033s : 1: py_interpret_to_execute 0.04% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.05% : 0.000020s : 1: remove_dup_value 0.72% : 0.000307s : 1: renormalize.infer 0.68% : 0.000290s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000044s : 1: rewriter_after_opt_a 0.21% : 0.000089s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.02% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.23% : 0.000097s : 1: symbol_engine_optimizer 0.22% : 0.000093s : 1: tuple_transform 13.62% : 0.005825s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:22.898.365 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0152322, [21] [bootstrap]: 0.00041973 [type_inference]: 0.00566031 [event_method]: 1.743e-05 [auto_monad]: 6.233e-05 [graph_reusing]: 6.11e-06 [inline]: 2.53e-06 [add_attr]: 0.00316296, [1] [add_attr_with_inline]: 0.00315371, [1] [Cycle 1]: 5.559e-05, [2] [tag_attr]: 1.884e-05 [meta_addattr_fg_expand]: 6.03998e-06 [parallel-infer-symbol]: 3.33e-06 [pre_auto_parallel]: 3.117e-05 [insert-virtual-dataset]: 2.39001e-06 [parallel-infer-symbol-second]: 7.99977e-07 [dataset_repeat_opt]: 1.97001e-06 [pipeline_split]: 1.71e-06 [optimize]: 0.00513805, [53] [py_interpret_to_execute]: 2.462e-05 [rewriter_before_opt_a]: 8.105e-05 [opt_a]: 0.00276093, [2] [Cycle 1]: 0.00205082, [45] [expand_dump_flag]: 3.06001e-06 [switch_simplify]: 4.412e-05 [loop_unroll]: 3.043e-05 [a_1]: 0.00065516 [with_stream_mark]: 1.713e-05 [recompute_prepare]: 9.52999e-06 [updatestate_depend_eliminate]: 4.04002e-06 [updatestate_assign_eliminate]: 3.65e-06 [updatestate_loads_eliminate]: 3.37002e-06 [parameter_eliminate]: 2.02999e-06 [a_2]: 8.802e-05 [accelerated_algorithm]: 7.27002e-06 [shard]: 1.70001e-06 [meta_shard_fg_expand]: 2.65002e-06 [shard_inline]: 6.79999e-06 [merge_send_recv]: 9.20001e-06 [auto_parallel]: 6.54001e-06 [parallel]: 1.847e-05 [flash_sp]: 8.18001e-06 [merge_comm]: 4.22998e-06 [allreduce_fusion]: 4.04002e-06 [matmul_add_comm_reduction]: 9.31002e-06 [allreduce_slice_to_reducescatter]: 6.40022e-07 [virtual_shard_identity]: 8.95999e-06 [virtual_dataset]: 7.13998e-06 [get_grad_eliminate_]: 6.88998e-06 [virtual_output]: 6.73e-06 [merge_forward]: 4.03001e-06 [cell_reuse_recompute_pass]: 1.60999e-06 [offload_activation]: 1.072e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.388e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.118e-05 [set_forward_comm_id_for_comm_node_pass]: 3.85e-06 [meta_fg_expand]: 3.01001e-06 [flash_sp_send_recv_attached]: 2.40997e-06 [receive_attached]: 2.19999e-06 [after_resolve]: 1.282e-05 [a_after_grad]: 1.075e-05 [renormalize]: 0.00064215 [add_forward_monad_depend]: 5.61998e-06 [auto_monad_grad]: 2.35997e-06 [auto_monad_eliminator]: 1.584e-05 [cse]: 3.001e-05 [a_3]: 5.231e-05 [Cycle 2]: 0.0007, [45] [expand_dump_flag]: 1.74998e-06 [switch_simplify]: 8.62998e-06 [loop_unroll]: 7.06001e-06 [a_1]: 0.00014618 [with_stream_mark]: 1.236e-05 [recompute_prepare]: 6.88e-06 [updatestate_depend_eliminate]: 2.91e-06 [updatestate_assign_eliminate]: 2.39001e-06 [updatestate_loads_eliminate]: 2.74999e-06 [parameter_eliminate]: 1.42e-06 [a_2]: 7.814e-05 [accelerated_algorithm]: 6.92002e-06 [shard]: 1.35999e-06 [meta_shard_fg_expand]: 1.40999e-06 [shard_inline]: 6.66e-06 [merge_send_recv]: 5.67001e-06 [auto_parallel]: 5.91e-06 [parallel]: 4.99e-06 [flash_sp]: 3.53999e-06 [merge_comm]: 3.38999e-06 [allreduce_fusion]: 3.07002e-06 [matmul_add_comm_reduction]: 6.61999e-06 [allreduce_slice_to_reducescatter]: 5.19998e-07 [virtual_shard_identity]: 7.28e-06 [virtual_dataset]: 6.47001e-06 [get_grad_eliminate_]: 6.02001e-06 [virtual_output]: 5.97999e-06 [merge_forward]: 2.93e-06 [cell_reuse_recompute_pass]: 2.00002e-06 [offload_activation]: 6.73e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.322e-05 [merge_recompute_call_nodes]: 7.59988e-07 [before_grad]: 1.202e-05 [set_forward_comm_id_for_comm_node_pass]: 3.18e-06 [meta_fg_expand]: 1.91003e-06 [flash_sp_send_recv_attached]: 1.06002e-06 [receive_attached]: 1.40001e-06 [after_resolve]: 1.177e-05 [a_after_grad]: 9.67999e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.35001e-06 [auto_monad_grad]: 1.14e-06 [auto_monad_eliminator]: 7.78999e-06 [cse]: 1.38e-05 [a_3]: 3.945e-05 [py_interpret_to_execute_after_opt_a]: 9.04998e-06 [slice_cell_reuse_recomputed_activation]: 2.33998e-06 [rewriter_after_opt_a]: 3.794e-05 [convert_after_rewriter]: 8.11002e-06 [order_py_execute_after_rewriter]: 5.54998e-06 [mutable_eliminate]: 0.00054586 [opt_b]: 0.00021974, [1] [Cycle 1]: 0.00021237, [7] [b_1]: 0.00013492 [b_2]: 8.69998e-06 [updatestate_depend_eliminate]: 5.17e-06 [updatestate_assign_eliminate]: 2.66e-06 [updatestate_loads_eliminate]: 2.61e-06 [renormalize]: 4.19997e-07 [cse]: 1.972e-05 [optimize_parallel_all_gather_comm]: 1.654e-05 [overlap_param_gather]: 2.37001e-06 [cconv]: 2.479e-05 [loop_unroll]: 0.00046629 [opt_after_cconv]: 0.00011092, [1] [Cycle 1]: 0.00010473, [7] [c_1]: 3.541e-05 [parameter_eliminate]: 3.08998e-06 [updatestate_depend_eliminate]: 5.35999e-06 [updatestate_assign_eliminate]: 2.69001e-06 [updatestate_loads_eliminate]: 2.51e-06 [cse]: 1.878e-05 [renormalize]: 3.00002e-07 [remove_dup_value]: 1.319e-05 [tuple_transform]: 9.816e-05, [1] [Cycle 1]: 9.305e-05, [4] [d_1]: 4.769e-05 [none_parameter_eliminate]: 1.67999e-06 [renormalize]: 1.19995e-07 [switch_simplify]: 8.27e-06 [partial_unused_args_eliminate]: 2.26e-06 [add_recomputation]: 4.807e-05 [cse_after_recomputation]: 2.151e-05, [1] [Cycle 1]: 1.68e-05, [1] [cse]: 1.124e-05 [environ_conv]: 5.53997e-06 [swap_dp_allreduce_reducescatter]: 5.25001e-06 [bias_add_comm_swap]: 2.60002e-06 [label_micro_interleaved_index]: 4.95001e-06 [label_fine_grained_interleaved_index]: 2.93e-06 [merge_cast_opt]: 1.83002e-06 [slice_recompute_activation]: 2.48002e-06 [micro_interleaved_order_control]: 2.26998e-06 [assign_add_opt]: 1.15999e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 1.07e-06 [full_micro_interleaved_order_control]: 2.50002e-06 [reorder_send_recv_between_fp_bp]: 2.86999e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 1.01002e-06 [interleave_split_concat_branches]: 3.11001e-06 [interleave_parallel_branches]: 1.35999e-06 [overlap_opt_shard_in_pipeline]: 1.45999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.23998e-06 [control_data_broadcast_order]: 1.576e-05 [grouped_pairwise_exchange_alltoall]: 1.86998e-06 [offloading_packed_experts]: 4.08001e-06 [overlap_recompute_and_grad_model_parallel]: 5.19e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.29998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40999e-06 [overlap_recompute_comm]: 2.26e-06 [overlap_grad_ring_attention]: 4.31002e-06 [overlap_grad_flash_sp]: 2.123e-05 [begin_end_overlap_inline]: 6.60017e-07 [split_matmul_comm_elemetwise]: 2.27001e-06 [split_layernorm_comm]: 2.34001e-06 [handle_group_info]: 9.80013e-07 [symbol_engine_optimizer]: 9.138e-05, [1] [Cycle 1]: 8.59e-05, [6] [build]: 3.91999e-06 [elim_shapecalc]: 1.353e-05 [elim_not_effective]: 1.583e-05 [opt_reshape]: 7.8e-06 [fold_const_symbol]: 1.111e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.39999e-06 [pipeline_parallel_scheduler]: 1.77001e-06 [auto_monad_reorder]: 1.815e-05 [get_jit_bprop_graph]: 2.05002e-06 [rewriter_after_jit_bprop_graph]: 4.78001e-06 [opt_after_jit_grad]: 0.00050567 [validate]: 3.907e-05 Sums bootstrap : 0.000420s : 3.87% type_inference : 0.005660s : 52.20% event_method : 0.000017s : 0.16% auto_monad : 0.000062s : 0.57% graph_reusing : 0.000006s : 0.06% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000031s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000025s : 0.23% optimize.rewriter_before_opt_a : 0.000081s : 0.75% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.49% optimize.opt_a.loop_unroll : 0.000037s : 0.35% optimize.opt_a.a_1 : 0.000801s : 7.39% optimize.opt_a.with_stream_mark : 0.000029s : 0.27% optimize.opt_a.recompute_prepare : 0.000016s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000166s : 1.53% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.12% optimize.opt_a.merge_send_recv : 0.000015s : 0.14% optimize.opt_a.auto_parallel : 0.000012s : 0.11% optimize.opt_a.parallel : 0.000023s : 0.22% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.15% optimize.opt_a.virtual_dataset : 0.000014s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.12% optimize.opt_a.virtual_output : 0.000013s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000023s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.23% optimize.opt_a.a_after_grad : 0.000020s : 0.19% optimize.opt_a.renormalize : 0.000642s : 5.92% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.22% optimize.opt_a.cse : 0.000044s : 0.40% optimize.opt_a.a_3 : 0.000092s : 0.85% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000038s : 0.35% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000546s : 5.03% optimize.opt_b.b_1 : 0.000135s : 1.24% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000025s : 0.23% optimize.loop_unroll : 0.000466s : 4.30% optimize.opt_after_cconv.c_1 : 0.000035s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.17% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.12% optimize.tuple_transform.d_1 : 0.000048s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000048s : 0.44% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.02% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000506s : 4.66% validate : 0.000039s : 0.36% Time group info: ------[substitution.] 0.000197 28 1.21% : 0.000002s : 2: substitution.elim_not_effective 0.71% : 0.000001s : 2: substitution.fold_const_symbol 2.95% : 0.000006s : 4: substitution.graph_param_transform 80.42% : 0.000158s : 4: substitution.inline 1.95% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.61% : 0.000005s : 4: substitution.remove_not_recompute_node 2.40% : 0.000005s : 4: substitution.replace_old_param 7.74% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005601 2 87.72% : 0.004913s : 1: type_inference.infer 12.28% : 0.000688s : 1: type_inference.specialize ------[replace.] 0.000062 8 62.31% : 0.000039s : 4: replace.inline 37.69% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000169 8 92.18% : 0.000156s : 4: match.inline 7.82% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000218 1278 0.89% : 0.000002s : 13: predicate.accumulaten_eliminater 0.66% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 0.88% : 0.000002s : 13: predicate.addn_zero_filter 0.89% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.06% : 0.000004s : 21: predicate.arithmetic_simplify 1.07% : 0.000002s : 13: predicate.cast_eliminate 0.60% : 0.000001s : 8: predicate.check_bprop_eliminate 0.61% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000000s : 4: predicate.const_output_eliminate 0.65% : 0.000001s : 8: predicate.depend_value_elim 0.99% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.99% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.92% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 4: predicate.elim_not_effective 0.45% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_depend_swap 1.65% : 0.000004s : 25: predicate.environ_get_eliminate 1.16% : 0.000003s : 17: predicate.environ_get_set_eliminate 1.45% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.47% : 0.000005s : 21: predicate.float_depend_g_call 0.47% : 0.000001s : 8: predicate.float_environ_get_switch 0.87% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.71% : 0.000002s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.53% : 0.000001s : 8: predicate.incorporate_call_switch 6.57% : 0.000014s : 58: predicate.inline 0.72% : 0.000002s : 8: predicate.inline_without_move 0.33% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.91% : 0.000002s : 8: predicate.less_batch_normalization 1.85% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.47% : 0.000005s : 38: predicate.load_eliminater 1.01% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.28% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.69% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.57% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 13: predicate.minmaximum_grad 1.05% : 0.000002s : 4: predicate.mutable_eliminate 0.39% : 0.000001s : 4: predicate.opt_reshape 0.34% : 0.000001s : 4: predicate.parallel_virtual_node 1.87% : 0.000004s : 21: predicate.partial_defer_inline 1.64% : 0.000004s : 21: predicate.partial_eliminate 1.02% : 0.000002s : 13: predicate.print_const_string_wrapper 0.57% : 0.000001s : 8: predicate.reduce_all_const_elim 1.18% : 0.000003s : 13: predicate.reduce_eliminate 2.45% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 8: predicate.remove_not_recompute_node 1.36% : 0.000003s : 25: predicate.replace_applicator 0.53% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 0.86% : 0.000002s : 13: predicate.reshape_eliminate 0.68% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.77% : 0.000002s : 8: predicate.same_eliminate 0.41% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.87% : 0.000002s : 8: predicate.shard_identity_eliminate 0.71% : 0.000002s : 8: predicate.special_op_eliminate 0.67% : 0.000001s : 8: predicate.specialize_transform 0.79% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.48% : 0.000003s : 21: predicate.switch_defer_inline 2.11% : 0.000005s : 29: predicate.switch_layer_defer_inline 5.13% : 0.000011s : 67: predicate.switch_simplify 1.07% : 0.000002s : 13: predicate.tile_eliminate 0.96% : 0.000002s : 13: predicate.transpose_eliminate 1.43% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.39% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.49% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.49% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.73% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.45% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.09% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.60% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.65% : 0.000001s : 8: predicate.virtual_output_eliminate 0.33% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.51% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000506 11 53.55% : 0.000271s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.45% : 0.000235s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025585 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.38% : 0.003168s : 1: add_attr 12.34% : 0.003158s : 1: add_attr_with_inline 0.79% : 0.000202s : 1: add_comm_op_reuse_tag 0.20% : 0.000052s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000068s : 1: auto_monad 0.09% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.74% : 0.000444s : 1: bootstrap 0.11% : 0.000028s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.10% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.09% : 0.000024s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000005s : 1: interleave_parallel_branches 0.04% : 0.000011s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.85% : 0.000474s : 1: loop_unroll 0.02% : 0.000005s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.17% : 0.000555s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 4.88% : 0.001248s : 78: opt.transform.opt_a 0.13% : 0.000034s : 1: opt.transform.opt_after_cconv 0.10% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.43% : 0.000109s : 28: opt.transform.opt_b 0.21% : 0.000054s : 2: opt.transform.opt_trans_graph 0.17% : 0.000044s : 4: opt.transform.symbol_engine_opt 10.80% : 0.002764s : 1: opt_a 0.45% : 0.000115s : 1: opt_after_cconv 2.01% : 0.000515s : 1: opt_after_jit_grad 0.87% : 0.000224s : 1: opt_b 20.10% : 0.005143s : 1: optimize 0.08% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000006s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.14% : 0.000035s : 1: pre_auto_parallel 0.11% : 0.000029s : 1: py_interpret_to_execute 0.05% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000017s : 1: remove_dup_value 1.30% : 0.000334s : 1: renormalize.infer 1.17% : 0.000300s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000042s : 1: rewriter_after_opt_a 0.33% : 0.000085s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000006s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.37% : 0.000094s : 1: symbol_engine_optimizer 0.40% : 0.000101s : 1: tuple_transform 22.19% : 0.005677s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:23.343.434 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:23.343.700 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0169325, [21] [bootstrap]: 0.00043481 [type_inference]: 0.00590427 [event_method]: 1.964e-05 [auto_monad]: 6.481e-05 [graph_reusing]: 6.16e-06 [inline]: 2.02999e-06 [add_attr]: 0.00324151, [1] [add_attr_with_inline]: 0.00323261, [1] [Cycle 1]: 7.095e-05, [2] [tag_attr]: 1.967e-05 [meta_addattr_fg_expand]: 6.10002e-06 [parallel-infer-symbol]: 3.4e-06 [pre_auto_parallel]: 3.349e-05 [insert-virtual-dataset]: 2.29999e-06 [parallel-infer-symbol-second]: 8.80013e-07 [dataset_repeat_opt]: 2.19001e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00598456, [53] [py_interpret_to_execute]: 3.082e-05 [rewriter_before_opt_a]: 9.414e-05 [opt_a]: 0.00341596, [2] [Cycle 1]: 0.0024713, [45] [expand_dump_flag]: 3.3e-06 [switch_simplify]: 4.355e-05 [loop_unroll]: 3.101e-05 [a_1]: 0.00075376 [with_stream_mark]: 1.717e-05 [recompute_prepare]: 1.102e-05 [updatestate_depend_eliminate]: 4.72e-06 [updatestate_assign_eliminate]: 4.23001e-06 [updatestate_loads_eliminate]: 3.43e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 0.0001324 [accelerated_algorithm]: 8.45001e-06 [shard]: 2.43e-06 [meta_shard_fg_expand]: 2.11998e-06 [shard_inline]: 7.70998e-06 [merge_send_recv]: 9.24998e-06 [auto_parallel]: 8.07e-06 [parallel]: 1.825e-05 [flash_sp]: 9.17001e-06 [merge_comm]: 4.79e-06 [allreduce_fusion]: 4.3e-06 [matmul_add_comm_reduction]: 9.99001e-06 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 9.97001e-06 [virtual_dataset]: 7.95e-06 [get_grad_eliminate_]: 7.83999e-06 [virtual_output]: 8e-06 [merge_forward]: 4.27e-06 [cell_reuse_recompute_pass]: 1.17e-06 [offload_activation]: 1.088e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.768e-05 [merge_recompute_call_nodes]: 1.79e-06 [before_grad]: 1.345e-05 [set_forward_comm_id_for_comm_node_pass]: 4.58999e-06 [meta_fg_expand]: 3.93001e-06 [flash_sp_send_recv_attached]: 2.95998e-06 [receive_attached]: 2.47001e-06 [after_resolve]: 1.272e-05 [a_after_grad]: 1.178e-05 [renormalize]: 0.00072895 [add_forward_monad_depend]: 6.25997e-06 [auto_monad_grad]: 2.56e-06 [auto_monad_eliminator]: 1.804e-05 [cse]: 3.65e-05 [a_3]: 7.272e-05 [Cycle 2]: 0.00093113, [45] [expand_dump_flag]: 1.34e-06 [switch_simplify]: 9.34998e-06 [loop_unroll]: 7.36001e-06 [a_1]: 0.00017461 [with_stream_mark]: 1.319e-05 [recompute_prepare]: 7.42002e-06 [updatestate_depend_eliminate]: 4.02998e-06 [updatestate_assign_eliminate]: 3.13998e-06 [updatestate_loads_eliminate]: 2.89999e-06 [parameter_eliminate]: 1.29e-06 [a_2]: 0.00011871 [accelerated_algorithm]: 7.74002e-06 [shard]: 1.45999e-06 [meta_shard_fg_expand]: 1.76e-06 [shard_inline]: 7.8e-06 [merge_send_recv]: 7.56001e-06 [auto_parallel]: 7.25e-06 [parallel]: 5.76e-06 [flash_sp]: 3.38e-06 [merge_comm]: 3.87998e-06 [allreduce_fusion]: 3.98001e-06 [matmul_add_comm_reduction]: 7.92e-06 [allreduce_slice_to_reducescatter]: 3.39991e-07 [virtual_shard_identity]: 8.64e-06 [virtual_dataset]: 8.10999e-06 [get_grad_eliminate_]: 6.98e-06 [virtual_output]: 7.2e-06 [merge_forward]: 3.38999e-06 [cell_reuse_recompute_pass]: 1.87999e-06 [offload_activation]: 8.66002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.647e-05 [merge_recompute_call_nodes]: 8.89995e-07 [before_grad]: 1.165e-05 [set_forward_comm_id_for_comm_node_pass]: 4.66002e-06 [meta_fg_expand]: 2.86999e-06 [flash_sp_send_recv_attached]: 8.59989e-07 [receive_attached]: 1.05001e-06 [after_resolve]: 1.189e-05 [a_after_grad]: 1.083e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.75001e-06 [auto_monad_grad]: 1.59e-06 [auto_monad_eliminator]: 1.039e-05 [cse]: 1.952e-05 [a_3]: 5.865e-05 [py_interpret_to_execute_after_opt_a]: 1.583e-05 [slice_cell_reuse_recomputed_activation]: 5.53002e-06 [rewriter_after_opt_a]: 4.513e-05 [convert_after_rewriter]: 3.953e-05 [order_py_execute_after_rewriter]: 9.27001e-06 [mutable_eliminate]: 0.00062216 [opt_b]: 0.00031373, [1] [Cycle 1]: 0.0003048, [7] [b_1]: 0.00019876 [b_2]: 9.98998e-06 [updatestate_depend_eliminate]: 7.28999e-06 [updatestate_assign_eliminate]: 3.26001e-06 [updatestate_loads_eliminate]: 3.15998e-06 [renormalize]: 7.09988e-07 [cse]: 2.55e-05 [optimize_parallel_all_gather_comm]: 2.257e-05 [overlap_param_gather]: 4.58999e-06 [cconv]: 3.046e-05 [loop_unroll]: 0.00046587 [opt_after_cconv]: 0.00014413, [1] [Cycle 1]: 0.00013496, [7] [c_1]: 3.788e-05 [parameter_eliminate]: 3.41999e-06 [updatestate_depend_eliminate]: 6.91999e-06 [updatestate_assign_eliminate]: 3.40003e-06 [updatestate_loads_eliminate]: 2.96999e-06 [cse]: 2.306e-05 [renormalize]: 4.89992e-07 [remove_dup_value]: 1.901e-05 [tuple_transform]: 0.00010322, [1] [Cycle 1]: 9.55e-05, [4] [d_1]: 5.411e-05 [none_parameter_eliminate]: 1.73002e-06 [renormalize]: 2.29978e-07 [switch_simplify]: 8.42e-06 [partial_unused_args_eliminate]: 4.89003e-06 [add_recomputation]: 5.984e-05 [cse_after_recomputation]: 3.237e-05, [1] [Cycle 1]: 2.51e-05, [1] [cse]: 1.567e-05 [environ_conv]: 9.74999e-06 [swap_dp_allreduce_reducescatter]: 8.84e-06 [bias_add_comm_swap]: 5.16998e-06 [label_micro_interleaved_index]: 7.28e-06 [label_fine_grained_interleaved_index]: 5.13002e-06 [merge_cast_opt]: 3.59002e-06 [slice_recompute_activation]: 4.87e-06 [micro_interleaved_order_control]: 5.12e-06 [assign_add_opt]: 3.73999e-06 [ForceFp32Comm]: 3.16999e-06 [remove_cast_before_assign_add]: 3.97998e-06 [full_micro_interleaved_order_control]: 4.73001e-06 [reorder_send_recv_between_fp_bp]: 5.29998e-06 [comm_op_add_attrs]: 3.42002e-06 [add_comm_op_reuse_tag]: 3.26001e-06 [interleave_split_concat_branches]: 3.37002e-06 [interleave_parallel_branches]: 3.48e-06 [overlap_opt_shard_in_pipeline]: 4.23999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.17e-06 [control_data_broadcast_order]: 1.761e-05 [grouped_pairwise_exchange_alltoall]: 3.84002e-06 [offloading_packed_experts]: 7.2e-06 [overlap_recompute_and_grad_model_parallel]: 7.82e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.76999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.88001e-06 [overlap_recompute_comm]: 4.83001e-06 [overlap_grad_ring_attention]: 7.06999e-06 [overlap_grad_flash_sp]: 2.316e-05 [begin_end_overlap_inline]: 2.83998e-06 [split_matmul_comm_elemetwise]: 4.72998e-06 [split_layernorm_comm]: 4e-06 [handle_group_info]: 3.2e-06 [symbol_engine_optimizer]: 0.00010595, [1] [Cycle 1]: 9.928e-05, [6] [build]: 3.47002e-06 [elim_shapecalc]: 1.171e-05 [elim_not_effective]: 1.61e-05 [opt_reshape]: 9.08002e-06 [fold_const_symbol]: 1.249e-05 [renormalize]: 2.3999e-07 [detach_backward]: 4.32e-06 [pipeline_parallel_scheduler]: 1.72001e-06 [auto_monad_reorder]: 2.387e-05 [get_jit_bprop_graph]: 2.15002e-06 [rewriter_after_jit_bprop_graph]: 4.62e-06 [opt_after_jit_grad]: 0.00052594 [validate]: 4.457e-05 Sums bootstrap : 0.000435s : 3.65% type_inference : 0.005904s : 49.60% event_method : 0.000020s : 0.16% auto_monad : 0.000065s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.26% optimize.rewriter_before_opt_a : 0.000094s : 0.79% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.44% optimize.opt_a.loop_unroll : 0.000038s : 0.32% optimize.opt_a.a_1 : 0.000928s : 7.80% optimize.opt_a.with_stream_mark : 0.000030s : 0.26% optimize.opt_a.recompute_prepare : 0.000018s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000251s : 2.11% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.14% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.13% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.20% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.07% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.16% optimize.opt_a.virtual_dataset : 0.000016s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.21% optimize.opt_a.a_after_grad : 0.000023s : 0.19% optimize.opt_a.renormalize : 0.000729s : 6.12% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.24% optimize.opt_a.cse : 0.000056s : 0.47% optimize.opt_a.a_3 : 0.000131s : 1.10% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.05% optimize.rewriter_after_opt_a : 0.000045s : 0.38% optimize.convert_after_rewriter : 0.000040s : 0.33% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000622s : 5.23% optimize.opt_b.b_1 : 0.000199s : 1.67% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000025s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000030s : 0.26% optimize.loop_unroll : 0.000466s : 3.91% optimize.opt_after_cconv.c_1 : 0.000038s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000023s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.16% optimize.tuple_transform.d_1 : 0.000054s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000060s : 0.50% optimize.cse_after_recomputation.cse : 0.000016s : 0.13% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000023s : 0.19% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000526s : 4.42% validate : 0.000045s : 0.37% Time group info: ------[substitution.] 0.000209 38 10.70% : 0.000022s : 3: substitution.cast_eliminate 1.17% : 0.000002s : 3: substitution.elim_not_effective 0.86% : 0.000002s : 3: substitution.fold_const_symbol 3.35% : 0.000007s : 5: substitution.graph_param_transform 68.69% : 0.000144s : 4: substitution.inline 2.20% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.07% : 0.000006s : 6: substitution.remove_not_recompute_node 2.56% : 0.000005s : 4: substitution.replace_old_param 7.40% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005848 2 88.23% : 0.005160s : 1: type_inference.infer 11.77% : 0.000688s : 1: type_inference.specialize ------[replace.] 0.000066 8 58.55% : 0.000039s : 4: replace.inline 41.45% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000154 8 91.35% : 0.000141s : 4: match.inline 8.65% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000257 1596 0.96% : 0.000002s : 17: predicate.accumulaten_eliminater 0.72% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 10: predicate.addn_check_dump 1.09% : 0.000003s : 17: predicate.addn_zero_filter 0.87% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.00% : 0.000005s : 27: predicate.arithmetic_simplify 1.05% : 0.000003s : 17: predicate.cast_eliminate 0.66% : 0.000002s : 10: predicate.check_bprop_eliminate 0.62% : 0.000002s : 10: predicate.compare_switch_simplify 0.20% : 0.000001s : 5: predicate.const_output_eliminate 0.60% : 0.000002s : 10: predicate.depend_value_elim 1.00% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.11% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.89% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 0.39% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.28% : 0.000003s : 22: predicate.environ_get_depend_swap 1.85% : 0.000005s : 32: predicate.environ_get_eliminate 1.18% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.39% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.16% : 0.000006s : 25: predicate.float_depend_g_call 0.54% : 0.000001s : 10: predicate.float_environ_get_switch 0.82% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 5: predicate.fold_const_symbol 0.60% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.60% : 0.000002s : 10: predicate.incorporate_call 0.54% : 0.000001s : 10: predicate.incorporate_call_switch 6.05% : 0.000016s : 72: predicate.inline 0.76% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.90% : 0.000002s : 10: predicate.less_batch_normalization 1.78% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.61% : 0.000007s : 48: predicate.load_eliminater 0.88% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.07% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.70% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 10: predicate.merge_addn 0.54% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.61% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 17: predicate.minmaximum_grad 1.08% : 0.000003s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.39% : 0.000001s : 5: predicate.parallel_virtual_node 1.87% : 0.000005s : 25: predicate.partial_defer_inline 1.70% : 0.000004s : 26: predicate.partial_eliminate 0.95% : 0.000002s : 17: predicate.print_const_string_wrapper 0.60% : 0.000002s : 10: predicate.reduce_all_const_elim 1.24% : 0.000003s : 17: predicate.reduce_eliminate 2.61% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 10: predicate.remove_not_recompute_node 1.38% : 0.000004s : 31: predicate.replace_applicator 0.40% : 0.000001s : 10: predicate.replace_old_param 0.32% : 0.000001s : 5: predicate.reset_defer_inline 1.13% : 0.000003s : 17: predicate.reshape_eliminate 0.60% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 5: predicate.row_tensor_eliminate 0.68% : 0.000002s : 10: predicate.same_eliminate 0.55% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 10: predicate.shard_identity_eliminate 0.67% : 0.000002s : 10: predicate.special_op_eliminate 0.69% : 0.000002s : 10: predicate.specialize_transform 0.82% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.67% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.49% : 0.000004s : 25: predicate.switch_defer_inline 2.04% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.90% : 0.000013s : 76: predicate.switch_simplify 0.93% : 0.000002s : 17: predicate.tile_eliminate 0.96% : 0.000002s : 17: predicate.transpose_eliminate 1.72% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.60% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.86% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.55% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.27% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.46% : 0.000001s : 5: predicate.value_based_eliminate 0.60% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000532 11 55.67% : 0.000296s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.33% : 0.000236s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028538 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.39% : 0.003250s : 1: add_attr 11.34% : 0.003236s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000064s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000074s : 1: auto_monad 0.11% : 0.000032s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.67% : 0.000477s : 1: bootstrap 0.12% : 0.000034s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000021s : 1: control_data_broadcast_order 0.15% : 0.000043s : 1: convert_after_rewriter 0.12% : 0.000036s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000022s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.10% : 0.000030s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.65% : 0.000472s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.21% : 0.000630s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 5.07% : 0.001447s : 78: opt.transform.opt_a 0.13% : 0.000036s : 1: opt.transform.opt_after_cconv 0.11% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.47% : 0.000134s : 28: opt.transform.opt_b 0.21% : 0.000060s : 2: opt.transform.opt_trans_graph 0.16% : 0.000045s : 4: opt.transform.symbol_engine_opt 11.98% : 0.003420s : 1: opt_a 0.52% : 0.000148s : 1: opt_after_cconv 1.88% : 0.000537s : 1: opt_after_jit_grad 1.11% : 0.000317s : 1: opt_b 22.28% : 0.006358s : 1: optimize 0.09% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.09% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000042s : 1: pre_auto_parallel 0.12% : 0.000035s : 1: py_interpret_to_execute 0.07% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.43% : 0.000407s : 1: renormalize.infer 1.10% : 0.000314s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000049s : 1: rewriter_after_opt_a 0.34% : 0.000098s : 1: rewriter_before_opt_a 0.03% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000109s : 1: symbol_engine_optimizer 0.37% : 0.000106s : 1: tuple_transform 20.83% : 0.005946s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:23.670.343 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0152371, [21] [bootstrap]: 0.0004127 [type_inference]: 0.00597266 [event_method]: 1.926e-05 [auto_monad]: 6.479e-05 [graph_reusing]: 6.11998e-06 [inline]: 2.56e-06 [add_attr]: 0.00306575, [1] [add_attr_with_inline]: 0.00305727, [1] [Cycle 1]: 5.759e-05, [2] [tag_attr]: 1.91e-05 [meta_addattr_fg_expand]: 6.44999e-06 [parallel-infer-symbol]: 3.55e-06 [pre_auto_parallel]: 3.279e-05 [insert-virtual-dataset]: 2.33002e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 2.12001e-06 [pipeline_split]: 1.57999e-06 [optimize]: 0.00498871, [53] [py_interpret_to_execute]: 2.675e-05 [rewriter_before_opt_a]: 8.299e-05 [opt_a]: 0.00291313, [2] [Cycle 1]: 0.00214957, [45] [expand_dump_flag]: 2.93e-06 [switch_simplify]: 4.314e-05 [loop_unroll]: 3.078e-05 [a_1]: 0.0007282 [with_stream_mark]: 1.531e-05 [recompute_prepare]: 1.002e-05 [updatestate_depend_eliminate]: 4.60999e-06 [updatestate_assign_eliminate]: 4.05e-06 [updatestate_loads_eliminate]: 3.70003e-06 [parameter_eliminate]: 1.77999e-06 [a_2]: 9.956e-05 [accelerated_algorithm]: 8.45001e-06 [shard]: 2.01e-06 [meta_shard_fg_expand]: 1.93002e-06 [shard_inline]: 7.88999e-06 [merge_send_recv]: 9.29e-06 [auto_parallel]: 6.46e-06 [parallel]: 1.835e-05 [flash_sp]: 7.65998e-06 [merge_comm]: 4.68999e-06 [allreduce_fusion]: 4.25999e-06 [matmul_add_comm_reduction]: 1.075e-05 [allreduce_slice_to_reducescatter]: 6.20028e-07 [virtual_shard_identity]: 9.22999e-06 [virtual_dataset]: 7.83001e-06 [get_grad_eliminate_]: 7.52002e-06 [virtual_output]: 7.3e-06 [merge_forward]: 4.08999e-06 [cell_reuse_recompute_pass]: 1.15001e-06 [offload_activation]: 1.088e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.541e-05 [merge_recompute_call_nodes]: 1.76e-06 [before_grad]: 1.266e-05 [set_forward_comm_id_for_comm_node_pass]: 4.38999e-06 [meta_fg_expand]: 3.4e-06 [flash_sp_send_recv_attached]: 2.86999e-06 [receive_attached]: 2.26e-06 [after_resolve]: 1.209e-05 [a_after_grad]: 1.182e-05 [renormalize]: 0.0006502 [add_forward_monad_depend]: 5.35999e-06 [auto_monad_grad]: 2.41e-06 [auto_monad_eliminator]: 1.673e-05 [cse]: 3.749e-05 [a_3]: 5.622e-05 [Cycle 2]: 0.00075393, [45] [expand_dump_flag]: 1.37999e-06 [switch_simplify]: 8.54e-06 [loop_unroll]: 7.35e-06 [a_1]: 0.00017413 [with_stream_mark]: 1.23e-05 [recompute_prepare]: 7.45e-06 [updatestate_depend_eliminate]: 3.46999e-06 [updatestate_assign_eliminate]: 2.96001e-06 [updatestate_loads_eliminate]: 2.73e-06 [parameter_eliminate]: 1.29e-06 [a_2]: 9.067e-05 [accelerated_algorithm]: 7.41001e-06 [shard]: 1.09e-06 [meta_shard_fg_expand]: 1.55001e-06 [shard_inline]: 7.18998e-06 [merge_send_recv]: 6.21e-06 [auto_parallel]: 6.40002e-06 [parallel]: 4.63001e-06 [flash_sp]: 6.84001e-06 [merge_comm]: 4.3e-06 [allreduce_fusion]: 3.92002e-06 [matmul_add_comm_reduction]: 6.51e-06 [allreduce_slice_to_reducescatter]: 5.00004e-07 [virtual_shard_identity]: 8.58001e-06 [virtual_dataset]: 6.98998e-06 [get_grad_eliminate_]: 6.88e-06 [virtual_output]: 6.87002e-06 [merge_forward]: 3.09001e-06 [cell_reuse_recompute_pass]: 1.45999e-06 [offload_activation]: 8.08001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.682e-05 [merge_recompute_call_nodes]: 1.07998e-06 [before_grad]: 1.135e-05 [set_forward_comm_id_for_comm_node_pass]: 3.95e-06 [meta_fg_expand]: 2.61999e-06 [flash_sp_send_recv_attached]: 1.29998e-06 [receive_attached]: 1.39003e-06 [after_resolve]: 1.241e-05 [a_after_grad]: 1.137e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.44998e-06 [auto_monad_grad]: 1.05999e-06 [auto_monad_eliminator]: 8.40001e-06 [cse]: 1.854e-05 [a_3]: 4.468e-05 [py_interpret_to_execute_after_opt_a]: 1.042e-05 [slice_cell_reuse_recomputed_activation]: 1.91e-06 [rewriter_after_opt_a]: 3.934e-05 [convert_after_rewriter]: 7.78001e-06 [order_py_execute_after_rewriter]: 5.51002e-06 [mutable_eliminate]: 0.00049593 [opt_b]: 0.00024305, [1] [Cycle 1]: 0.00023651, [7] [b_1]: 0.00015535 [b_2]: 9.20001e-06 [updatestate_depend_eliminate]: 7.5e-06 [updatestate_assign_eliminate]: 2.99001e-06 [updatestate_loads_eliminate]: 3.10002e-06 [renormalize]: 5.69999e-07 [cse]: 2.282e-05 [optimize_parallel_all_gather_comm]: 1.78e-05 [overlap_param_gather]: 1.87999e-06 [cconv]: 2.495e-05 [loop_unroll]: 0.00041635 [opt_after_cconv]: 0.00011448, [1] [Cycle 1]: 0.00010892, [7] [c_1]: 3.749e-05 [parameter_eliminate]: 2.94999e-06 [updatestate_depend_eliminate]: 5.89e-06 [updatestate_assign_eliminate]: 3.07002e-06 [updatestate_loads_eliminate]: 2.81e-06 [cse]: 2.212e-05 [renormalize]: 4.50003e-07 [remove_dup_value]: 1.593e-05 [tuple_transform]: 8.573e-05, [1] [Cycle 1]: 8.142e-05, [4] [d_1]: 5.285e-05 [none_parameter_eliminate]: 1.84e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 8.13001e-06 [partial_unused_args_eliminate]: 1.73002e-06 [add_recomputation]: 5.784e-05 [cse_after_recomputation]: 2.585e-05, [1] [Cycle 1]: 2.137e-05, [1] [cse]: 1.574e-05 [environ_conv]: 5.79e-06 [swap_dp_allreduce_reducescatter]: 6.08002e-06 [bias_add_comm_swap]: 2.78003e-06 [label_micro_interleaved_index]: 4.27e-06 [label_fine_grained_interleaved_index]: 2.61e-06 [merge_cast_opt]: 1.39e-06 [slice_recompute_activation]: 2.16e-06 [micro_interleaved_order_control]: 2.12999e-06 [assign_add_opt]: 1.42e-06 [ForceFp32Comm]: 8.60018e-07 [remove_cast_before_assign_add]: 1.00001e-06 [full_micro_interleaved_order_control]: 2.11e-06 [reorder_send_recv_between_fp_bp]: 2.61e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.11002e-06 [interleave_parallel_branches]: 1.12999e-06 [overlap_opt_shard_in_pipeline]: 1.44e-06 [overlap_opt_shard_grad_in_pipeline]: 1.71e-06 [control_data_broadcast_order]: 1.466e-05 [grouped_pairwise_exchange_alltoall]: 1.54e-06 [offloading_packed_experts]: 4.55001e-06 [overlap_recompute_and_grad_model_parallel]: 4.95999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.72001e-06 [overlap_recompute_comm]: 2.07001e-06 [overlap_grad_ring_attention]: 4.73001e-06 [overlap_grad_flash_sp]: 2.086e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 1.99999e-06 [split_layernorm_comm]: 1.65001e-06 [handle_group_info]: 9.29984e-07 [symbol_engine_optimizer]: 8.069e-05, [1] [Cycle 1]: 7.661e-05, [6] [build]: 3.42002e-06 [elim_shapecalc]: 1.11e-05 [elim_not_effective]: 1.459e-05 [opt_reshape]: 8.1e-06 [fold_const_symbol]: 1.186e-05 [renormalize]: 2.50002e-07 [detach_backward]: 1.68997e-06 [pipeline_parallel_scheduler]: 1.47999e-06 [auto_monad_reorder]: 1.911e-05 [get_jit_bprop_graph]: 1.17e-06 [rewriter_after_jit_bprop_graph]: 3.61999e-06 [opt_after_jit_grad]: 0.00045106 [validate]: 3.996e-05 Sums bootstrap : 0.000413s : 3.67% type_inference : 0.005973s : 53.15% event_method : 0.000019s : 0.17% auto_monad : 0.000065s : 0.58% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000033s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000027s : 0.24% optimize.rewriter_before_opt_a : 0.000083s : 0.74% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000052s : 0.46% optimize.opt_a.loop_unroll : 0.000038s : 0.34% optimize.opt_a.a_1 : 0.000902s : 8.03% optimize.opt_a.with_stream_mark : 0.000028s : 0.25% optimize.opt_a.recompute_prepare : 0.000017s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000190s : 1.69% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.13% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.11% optimize.opt_a.parallel : 0.000023s : 0.20% optimize.opt_a.flash_sp : 0.000014s : 0.13% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.16% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000024s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.07% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.22% optimize.opt_a.a_after_grad : 0.000023s : 0.21% optimize.opt_a.renormalize : 0.000650s : 5.79% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.22% optimize.opt_a.cse : 0.000056s : 0.50% optimize.opt_a.a_3 : 0.000101s : 0.90% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000039s : 0.35% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000496s : 4.41% optimize.opt_b.b_1 : 0.000155s : 1.38% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000025s : 0.22% optimize.loop_unroll : 0.000416s : 3.71% optimize.opt_after_cconv.c_1 : 0.000037s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.14% optimize.tuple_transform.d_1 : 0.000053s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000058s : 0.51% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.02% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000019s : 0.17% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000451s : 4.01% validate : 0.000040s : 0.36% Time group info: ------[substitution.] 0.000201 38 10.86% : 0.000022s : 3: substitution.cast_eliminate 1.08% : 0.000002s : 3: substitution.elim_not_effective 0.80% : 0.000002s : 3: substitution.fold_const_symbol 3.17% : 0.000006s : 5: substitution.graph_param_transform 68.10% : 0.000137s : 4: substitution.inline 2.25% : 0.000005s : 6: substitution.j_node_and_user_rematch 4.26% : 0.000009s : 6: substitution.remove_not_recompute_node 2.23% : 0.000004s : 4: substitution.replace_old_param 7.25% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005909 2 88.37% : 0.005222s : 1: type_inference.infer 11.63% : 0.000687s : 1: type_inference.specialize ------[replace.] 0.000062 8 60.15% : 0.000037s : 4: replace.inline 39.85% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000147 8 91.41% : 0.000135s : 4: match.inline 8.59% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000249 1596 0.98% : 0.000002s : 17: predicate.accumulaten_eliminater 0.64% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 10: predicate.addn_check_dump 0.98% : 0.000002s : 17: predicate.addn_zero_filter 0.87% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.97% : 0.000005s : 27: predicate.arithmetic_simplify 1.08% : 0.000003s : 17: predicate.cast_eliminate 0.56% : 0.000001s : 10: predicate.check_bprop_eliminate 0.59% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.58% : 0.000001s : 10: predicate.depend_value_elim 1.00% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.08% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.94% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.92% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 5: predicate.elim_not_effective 0.39% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.16% : 0.000003s : 22: predicate.environ_get_depend_swap 1.73% : 0.000004s : 32: predicate.environ_get_eliminate 1.15% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.40% : 0.000003s : 25: predicate.exchange_switch_depend_value 2.34% : 0.000006s : 25: predicate.float_depend_g_call 0.53% : 0.000001s : 10: predicate.float_environ_get_switch 0.81% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.73% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.62% : 0.000002s : 10: predicate.incorporate_call 0.54% : 0.000001s : 10: predicate.incorporate_call_switch 6.31% : 0.000016s : 72: predicate.inline 0.79% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.80% : 0.000002s : 10: predicate.less_batch_normalization 1.79% : 0.000004s : 31: predicate.list_to_tuple_eliminator_ 2.62% : 0.000007s : 48: predicate.load_eliminater 0.74% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.07% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.57% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 10: predicate.merge_addn 0.59% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 17: predicate.minmaximum_grad 0.91% : 0.000002s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.38% : 0.000001s : 5: predicate.parallel_virtual_node 1.80% : 0.000004s : 25: predicate.partial_defer_inline 1.72% : 0.000004s : 26: predicate.partial_eliminate 0.92% : 0.000002s : 17: predicate.print_const_string_wrapper 0.59% : 0.000001s : 10: predicate.reduce_all_const_elim 1.20% : 0.000003s : 17: predicate.reduce_eliminate 2.60% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 10: predicate.remove_not_recompute_node 1.43% : 0.000004s : 31: predicate.replace_applicator 0.50% : 0.000001s : 10: predicate.replace_old_param 0.28% : 0.000001s : 5: predicate.reset_defer_inline 1.11% : 0.000003s : 17: predicate.reshape_eliminate 0.66% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 5: predicate.row_tensor_eliminate 0.80% : 0.000002s : 10: predicate.same_eliminate 0.47% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.88% : 0.000002s : 10: predicate.shard_identity_eliminate 0.70% : 0.000002s : 10: predicate.special_op_eliminate 0.80% : 0.000002s : 10: predicate.specialize_transform 0.90% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.70% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.51% : 0.000004s : 25: predicate.switch_defer_inline 2.02% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.66% : 0.000012s : 76: predicate.switch_simplify 0.93% : 0.000002s : 17: predicate.tile_eliminate 0.94% : 0.000002s : 17: predicate.transpose_eliminate 1.60% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.93% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.21% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.65% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.34% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.76% : 0.000004s : 31: predicate.tuple_to_list_eliminator_ 2.53% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.31% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 5: predicate.value_based_eliminate 0.59% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000513 11 55.19% : 0.000283s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.81% : 0.000230s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025557 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.01% : 0.003070s : 1: add_attr 11.98% : 0.003061s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000062s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.28% : 0.000071s : 1: auto_monad 0.09% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.71% : 0.000437s : 1: bootstrap 0.11% : 0.000028s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.10% : 0.000026s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.66% : 0.000424s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.97% : 0.000504s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 5.52% : 0.001410s : 78: opt.transform.opt_a 0.14% : 0.000036s : 1: opt.transform.opt_after_cconv 0.11% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.52% : 0.000133s : 28: opt.transform.opt_b 0.23% : 0.000059s : 2: opt.transform.opt_trans_graph 0.17% : 0.000042s : 4: opt.transform.symbol_engine_opt 11.41% : 0.002916s : 1: opt_a 0.46% : 0.000118s : 1: opt_after_cconv 1.80% : 0.000460s : 1: opt_after_jit_grad 0.96% : 0.000247s : 1: opt_b 19.54% : 0.004993s : 1: optimize 0.08% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000024s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000037s : 1: pre_auto_parallel 0.12% : 0.000031s : 1: py_interpret_to_execute 0.05% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000019s : 1: remove_dup_value 1.39% : 0.000354s : 1: renormalize.infer 1.13% : 0.000289s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000044s : 1: rewriter_after_opt_a 0.34% : 0.000087s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000083s : 1: symbol_engine_optimizer 0.35% : 0.000089s : 1: tuple_transform 23.44% : 0.005989s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:23.975.454 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:23.975.722 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0168227, [21] [bootstrap]: 0.00045128 [type_inference]: 0.00600022 [event_method]: 1.971e-05 [auto_monad]: 6.422e-05 [graph_reusing]: 5.75001e-06 [inline]: 2.27999e-06 [add_attr]: 0.00312366, [1] [add_attr_with_inline]: 0.00311429, [1] [Cycle 1]: 7.501e-05, [2] [tag_attr]: 2.022e-05 [meta_addattr_fg_expand]: 6.15002e-06 [parallel-infer-symbol]: 3.80998e-06 [pre_auto_parallel]: 3.643e-05 [insert-virtual-dataset]: 2.59001e-06 [parallel-infer-symbol-second]: 7.80012e-07 [dataset_repeat_opt]: 2.12001e-06 [pipeline_split]: 1.59998e-06 [optimize]: 0.00585671, [53] [py_interpret_to_execute]: 3.13e-05 [rewriter_before_opt_a]: 8.859e-05 [opt_a]: 0.00338805, [2] [Cycle 1]: 0.00239048, [45] [expand_dump_flag]: 3.25998e-06 [switch_simplify]: 4.358e-05 [loop_unroll]: 3.152e-05 [a_1]: 0.0006784 [with_stream_mark]: 1.748e-05 [recompute_prepare]: 1.012e-05 [updatestate_depend_eliminate]: 5.12999e-06 [updatestate_assign_eliminate]: 4.28999e-06 [updatestate_loads_eliminate]: 3.86001e-06 [parameter_eliminate]: 1.81e-06 [a_2]: 0.00013186 [accelerated_algorithm]: 9.10001e-06 [shard]: 1.79e-06 [meta_shard_fg_expand]: 2.04e-06 [shard_inline]: 8.54002e-06 [merge_send_recv]: 1.062e-05 [auto_parallel]: 7.92003e-06 [parallel]: 1.997e-05 [flash_sp]: 8.37e-06 [merge_comm]: 4.77e-06 [allreduce_fusion]: 4.67e-06 [matmul_add_comm_reduction]: 1.073e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 9.20999e-06 [virtual_dataset]: 8.08999e-06 [get_grad_eliminate_]: 7.77002e-06 [virtual_output]: 7.72998e-06 [merge_forward]: 5.32001e-06 [cell_reuse_recompute_pass]: 1.62001e-06 [offload_activation]: 1.162e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.78e-05 [merge_recompute_call_nodes]: 1.76998e-06 [before_grad]: 1.319e-05 [set_forward_comm_id_for_comm_node_pass]: 5.03002e-06 [meta_fg_expand]: 3.6e-06 [flash_sp_send_recv_attached]: 3.03e-06 [receive_attached]: 2.49001e-06 [after_resolve]: 1.39e-05 [a_after_grad]: 1.255e-05 [renormalize]: 0.00071602 [add_forward_monad_depend]: 6.07001e-06 [auto_monad_grad]: 2.32999e-06 [auto_monad_eliminator]: 1.756e-05 [cse]: 3.528e-05 [a_3]: 7.289e-05 [Cycle 2]: 0.00098228, [45] [expand_dump_flag]: 1.44e-06 [switch_simplify]: 1.13e-05 [loop_unroll]: 7.71999e-06 [a_1]: 0.00018116 [with_stream_mark]: 1.283e-05 [recompute_prepare]: 7.87e-06 [updatestate_depend_eliminate]: 4.25999e-06 [updatestate_assign_eliminate]: 3.13998e-06 [updatestate_loads_eliminate]: 3.16999e-06 [parameter_eliminate]: 1.28002e-06 [a_2]: 0.00012025 [accelerated_algorithm]: 7.43999e-06 [shard]: 1.30999e-06 [meta_shard_fg_expand]: 1.50999e-06 [shard_inline]: 7.53999e-06 [merge_send_recv]: 7.53e-06 [auto_parallel]: 6.39999e-06 [parallel]: 5.11997e-06 [flash_sp]: 4.02e-06 [merge_comm]: 4.23999e-06 [allreduce_fusion]: 4.08999e-06 [matmul_add_comm_reduction]: 7.4e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 8.36002e-06 [virtual_dataset]: 7.2e-06 [get_grad_eliminate_]: 6.84001e-06 [virtual_output]: 7.66999e-06 [merge_forward]: 3.9e-06 [cell_reuse_recompute_pass]: 1.58002e-06 [offload_activation]: 8.43999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.65e-05 [merge_recompute_call_nodes]: 8.99978e-07 [before_grad]: 1.163e-05 [set_forward_comm_id_for_comm_node_pass]: 4.1e-06 [meta_fg_expand]: 2.58e-06 [flash_sp_send_recv_attached]: 1.05001e-06 [receive_attached]: 1.55001e-06 [after_resolve]: 1.314e-05 [a_after_grad]: 1.248e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.37999e-06 [auto_monad_grad]: 1.07998e-06 [auto_monad_eliminator]: 9.57999e-06 [cse]: 1.953e-05 [a_3]: 5.766e-05 [py_interpret_to_execute_after_opt_a]: 1.444e-05 [slice_cell_reuse_recomputed_activation]: 5.42001e-06 [rewriter_after_opt_a]: 4.484e-05 [convert_after_rewriter]: 1.068e-05 [order_py_execute_after_rewriter]: 9.24998e-06 [mutable_eliminate]: 0.00054257 [opt_b]: 0.00030717, [1] [Cycle 1]: 0.00029733, [7] [b_1]: 0.00019673 [b_2]: 9.08002e-06 [updatestate_depend_eliminate]: 6.10002e-06 [updatestate_assign_eliminate]: 3.06999e-06 [updatestate_loads_eliminate]: 3.26001e-06 [renormalize]: 5.50004e-07 [cse]: 2.329e-05 [optimize_parallel_all_gather_comm]: 2.142e-05 [overlap_param_gather]: 5.23002e-06 [cconv]: 2.84e-05 [loop_unroll]: 0.00046263 [opt_after_cconv]: 0.00014868, [1] [Cycle 1]: 0.0001397, [7] [c_1]: 4.102e-05 [parameter_eliminate]: 2.94999e-06 [updatestate_depend_eliminate]: 6.36e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 3.28998e-06 [cse]: 2.319e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 1.878e-05 [tuple_transform]: 0.00010875, [1] [Cycle 1]: 0.00010145, [4] [d_1]: 5.763e-05 [none_parameter_eliminate]: 1.94999e-06 [renormalize]: 2.60014e-07 [switch_simplify]: 9.05999e-06 [partial_unused_args_eliminate]: 4.55001e-06 [add_recomputation]: 6.102e-05 [cse_after_recomputation]: 3.344e-05, [1] [Cycle 1]: 2.581e-05, [1] [cse]: 1.608e-05 [environ_conv]: 1.017e-05 [swap_dp_allreduce_reducescatter]: 9.04e-06 [bias_add_comm_swap]: 5.25001e-06 [label_micro_interleaved_index]: 7.41001e-06 [label_fine_grained_interleaved_index]: 5.39e-06 [merge_cast_opt]: 3.81001e-06 [slice_recompute_activation]: 4.87e-06 [micro_interleaved_order_control]: 4.63999e-06 [assign_add_opt]: 3.76999e-06 [ForceFp32Comm]: 3.16999e-06 [remove_cast_before_assign_add]: 3.6e-06 [full_micro_interleaved_order_control]: 4.42e-06 [reorder_send_recv_between_fp_bp]: 5.15999e-06 [comm_op_add_attrs]: 4.00998e-06 [add_comm_op_reuse_tag]: 3.41001e-06 [interleave_split_concat_branches]: 3.76999e-06 [interleave_parallel_branches]: 3.68999e-06 [overlap_opt_shard_in_pipeline]: 3.76999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.65001e-06 [control_data_broadcast_order]: 1.893e-05 [grouped_pairwise_exchange_alltoall]: 4.28001e-06 [offloading_packed_experts]: 7.85e-06 [overlap_recompute_and_grad_model_parallel]: 8.28999e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.35e-06 [overlap_recompute_allgather_and_fa_grad]: 3.99997e-06 [overlap_recompute_comm]: 5.39e-06 [overlap_grad_ring_attention]: 7.77e-06 [overlap_grad_flash_sp]: 2.596e-05 [begin_end_overlap_inline]: 3.12002e-06 [split_matmul_comm_elemetwise]: 4.77998e-06 [split_layernorm_comm]: 4.22e-06 [handle_group_info]: 3.81001e-06 [symbol_engine_optimizer]: 0.00011058, [1] [Cycle 1]: 0.00010324, [6] [build]: 3.55e-06 [elim_shapecalc]: 1.265e-05 [elim_not_effective]: 1.626e-05 [opt_reshape]: 8.82e-06 [fold_const_symbol]: 1.284e-05 [renormalize]: 2.40019e-07 [detach_backward]: 3.26999e-06 [pipeline_parallel_scheduler]: 1.55999e-06 [auto_monad_reorder]: 2.191e-05 [get_jit_bprop_graph]: 1.71e-06 [rewriter_after_jit_bprop_graph]: 4.74e-06 [opt_after_jit_grad]: 0.00051496 [validate]: 4.337e-05 Sums bootstrap : 0.000451s : 3.82% type_inference : 0.006000s : 50.76% event_method : 0.000020s : 0.17% auto_monad : 0.000064s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000036s : 0.31% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.26% optimize.rewriter_before_opt_a : 0.000089s : 0.75% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.46% optimize.opt_a.loop_unroll : 0.000039s : 0.33% optimize.opt_a.a_1 : 0.000860s : 7.27% optimize.opt_a.with_stream_mark : 0.000030s : 0.26% optimize.opt_a.recompute_prepare : 0.000018s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000252s : 2.13% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.14% optimize.opt_a.merge_send_recv : 0.000018s : 0.15% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.15% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.23% optimize.opt_a.a_after_grad : 0.000025s : 0.21% optimize.opt_a.renormalize : 0.000716s : 6.06% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.23% optimize.opt_a.cse : 0.000055s : 0.46% optimize.opt_a.a_3 : 0.000131s : 1.10% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000045s : 0.38% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000543s : 4.59% optimize.opt_b.b_1 : 0.000197s : 1.66% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000023s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000028s : 0.24% optimize.loop_unroll : 0.000463s : 3.91% optimize.opt_after_cconv.c_1 : 0.000041s : 0.35% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.16% optimize.tuple_transform.d_1 : 0.000058s : 0.49% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000061s : 0.52% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000010s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000019s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000008s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000008s : 0.07% optimize.overlap_grad_flash_sp : 0.000026s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.19% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000515s : 4.36% validate : 0.000043s : 0.37% Time group info: ------[substitution.] 0.000213 38 12.93% : 0.000028s : 3: substitution.cast_eliminate 1.08% : 0.000002s : 3: substitution.elim_not_effective 0.79% : 0.000002s : 3: substitution.fold_const_symbol 3.44% : 0.000007s : 5: substitution.graph_param_transform 68.01% : 0.000145s : 4: substitution.inline 2.20% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.95% : 0.000006s : 6: substitution.remove_not_recompute_node 2.51% : 0.000005s : 4: substitution.replace_old_param 6.08% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005945 2 88.27% : 0.005248s : 1: type_inference.infer 11.73% : 0.000697s : 1: type_inference.specialize ------[replace.] 0.000060 8 61.98% : 0.000037s : 4: replace.inline 38.02% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000154 8 92.76% : 0.000142s : 4: match.inline 7.24% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000241 1504 0.86% : 0.000002s : 15: predicate.accumulaten_eliminater 0.78% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.58% : 0.000001s : 10: predicate.addn_check_dump 0.84% : 0.000002s : 15: predicate.addn_zero_filter 0.80% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.92% : 0.000005s : 25: predicate.arithmetic_simplify 1.08% : 0.000003s : 15: predicate.cast_eliminate 0.64% : 0.000002s : 10: predicate.check_bprop_eliminate 0.62% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.84% : 0.000002s : 10: predicate.depend_value_elim 0.95% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.99% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.93% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.45% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 20: predicate.environ_get_depend_swap 1.81% : 0.000004s : 30: predicate.environ_get_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.37% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.28% : 0.000006s : 23: predicate.float_depend_g_call 0.59% : 0.000001s : 10: predicate.float_environ_get_switch 0.88% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.21% : 0.000001s : 5: predicate.fold_const_symbol 0.71% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.64% : 0.000002s : 10: predicate.incorporate_call 0.56% : 0.000001s : 10: predicate.incorporate_call_switch 6.57% : 0.000016s : 68: predicate.inline 0.87% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.76% : 0.000002s : 10: predicate.less_batch_normalization 1.86% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.44% : 0.000006s : 44: predicate.load_eliminater 0.93% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.11% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.71% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.62% : 0.000001s : 10: predicate.merge_addn 0.62% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.65% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 15: predicate.minmaximum_grad 0.96% : 0.000002s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.39% : 0.000001s : 5: predicate.parallel_virtual_node 1.81% : 0.000004s : 23: predicate.partial_defer_inline 1.65% : 0.000004s : 24: predicate.partial_eliminate 0.95% : 0.000002s : 15: predicate.print_const_string_wrapper 0.66% : 0.000002s : 10: predicate.reduce_all_const_elim 1.11% : 0.000003s : 15: predicate.reduce_eliminate 2.51% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 10: predicate.remove_not_recompute_node 1.32% : 0.000003s : 29: predicate.replace_applicator 0.60% : 0.000001s : 10: predicate.replace_old_param 0.25% : 0.000001s : 5: predicate.reset_defer_inline 0.91% : 0.000002s : 15: predicate.reshape_eliminate 0.65% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 5: predicate.row_tensor_eliminate 0.76% : 0.000002s : 10: predicate.same_eliminate 0.43% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 10: predicate.shard_identity_eliminate 0.66% : 0.000002s : 10: predicate.special_op_eliminate 0.78% : 0.000002s : 10: predicate.specialize_transform 0.92% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.43% : 0.000003s : 23: predicate.switch_defer_inline 1.96% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.94% : 0.000012s : 74: predicate.switch_simplify 0.85% : 0.000002s : 15: predicate.tile_eliminate 0.88% : 0.000002s : 15: predicate.transpose_eliminate 1.52% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.60% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.51% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.83% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.43% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.32% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 5: predicate.value_based_eliminate 0.68% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000556 11 55.66% : 0.000309s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.34% : 0.000246s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028111 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.14% : 0.003133s : 1: add_attr 11.09% : 0.003118s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000065s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000073s : 1: auto_monad 0.11% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.91% : 0.000537s : 1: bootstrap 0.11% : 0.000031s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000022s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000037s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000017s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.11% : 0.000030s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.67% : 0.000468s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.95% : 0.000548s : 1: mutable_eliminate 0.04% : 0.000011s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 4.93% : 0.001385s : 78: opt.transform.opt_a 0.14% : 0.000040s : 1: opt.transform.opt_after_cconv 0.10% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.48% : 0.000134s : 28: opt.transform.opt_b 0.23% : 0.000064s : 2: opt.transform.opt_trans_graph 0.17% : 0.000047s : 4: opt.transform.symbol_engine_opt 12.06% : 0.003391s : 1: opt_a 0.54% : 0.000152s : 1: opt_after_cconv 1.87% : 0.000526s : 1: opt_after_jit_grad 1.10% : 0.000311s : 1: opt_b 22.15% : 0.006228s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000029s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000011s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.16% : 0.000044s : 1: pre_auto_parallel 0.12% : 0.000035s : 1: py_interpret_to_execute 0.06% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.38% : 0.000387s : 1: renormalize.infer 1.14% : 0.000321s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000048s : 1: rewriter_after_opt_a 0.33% : 0.000092s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000114s : 1: symbol_engine_optimizer 0.40% : 0.000112s : 1: tuple_transform 21.49% : 0.006040s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:24.398.303 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0152936, [21] [bootstrap]: 0.00043297 [type_inference]: 0.00576373 [event_method]: 1.783e-05 [auto_monad]: 6.417e-05 [graph_reusing]: 6.41e-06 [inline]: 2.40002e-06 [add_attr]: 0.00310493, [1] [add_attr_with_inline]: 0.00309566, [1] [Cycle 1]: 6.164e-05, [2] [tag_attr]: 1.993e-05 [meta_addattr_fg_expand]: 6.26998e-06 [parallel-infer-symbol]: 3.35e-06 [pre_auto_parallel]: 3.499e-05 [insert-virtual-dataset]: 2.61999e-06 [parallel-infer-symbol-second]: 9.80013e-07 [dataset_repeat_opt]: 1.69e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.00518428, [53] [py_interpret_to_execute]: 2.767e-05 [rewriter_before_opt_a]: 8.934e-05 [opt_a]: 0.00303914, [2] [Cycle 1]: 0.00227129, [45] [expand_dump_flag]: 3.3e-06 [switch_simplify]: 4.734e-05 [loop_unroll]: 3.523e-05 [a_1]: 0.00072223 [with_stream_mark]: 1.726e-05 [recompute_prepare]: 1.154e-05 [updatestate_depend_eliminate]: 4.88001e-06 [updatestate_assign_eliminate]: 4.47e-06 [updatestate_loads_eliminate]: 3.75998e-06 [parameter_eliminate]: 1.91e-06 [a_2]: 0.00010499 [accelerated_algorithm]: 1.027e-05 [shard]: 1.95001e-06 [meta_shard_fg_expand]: 2.07999e-06 [shard_inline]: 8.80999e-06 [merge_send_recv]: 9.96e-06 [auto_parallel]: 7.3e-06 [parallel]: 1.95e-05 [flash_sp]: 8.31002e-06 [merge_comm]: 4.71002e-06 [allreduce_fusion]: 4.63001e-06 [matmul_add_comm_reduction]: 1.088e-05 [allreduce_slice_to_reducescatter]: 6.40022e-07 [virtual_shard_identity]: 9.51e-06 [virtual_dataset]: 8.12998e-06 [get_grad_eliminate_]: 7.53e-06 [virtual_output]: 7.38e-06 [merge_forward]: 4.61002e-06 [cell_reuse_recompute_pass]: 1.10001e-06 [offload_activation]: 1.12e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.677e-05 [merge_recompute_call_nodes]: 1.67001e-06 [before_grad]: 1.467e-05 [set_forward_comm_id_for_comm_node_pass]: 4.90001e-06 [meta_fg_expand]: 4.02e-06 [flash_sp_send_recv_attached]: 2.94999e-06 [receive_attached]: 2.22999e-06 [after_resolve]: 1.426e-05 [a_after_grad]: 1.386e-05 [renormalize]: 0.00072202 [add_forward_monad_depend]: 5.64e-06 [auto_monad_grad]: 2.32001e-06 [auto_monad_eliminator]: 1.782e-05 [cse]: 3.656e-05 [a_3]: 5.828e-05 [Cycle 2]: 0.00075813, [45] [expand_dump_flag]: 1.13001e-06 [switch_simplify]: 9.07999e-06 [loop_unroll]: 8.12e-06 [a_1]: 0.0001755 [with_stream_mark]: 1.258e-05 [recompute_prepare]: 8.14002e-06 [updatestate_depend_eliminate]: 3.66001e-06 [updatestate_assign_eliminate]: 2.96001e-06 [updatestate_loads_eliminate]: 2.91999e-06 [parameter_eliminate]: 1.30999e-06 [a_2]: 9.307e-05 [accelerated_algorithm]: 7.9e-06 [shard]: 1.15999e-06 [meta_shard_fg_expand]: 2.17001e-06 [shard_inline]: 7.35998e-06 [merge_send_recv]: 5.82001e-06 [auto_parallel]: 5.96e-06 [parallel]: 5.34998e-06 [flash_sp]: 3.18e-06 [merge_comm]: 3.97e-06 [allreduce_fusion]: 3.71999e-06 [matmul_add_comm_reduction]: 7.4e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 8.10999e-06 [virtual_dataset]: 7.25e-06 [get_grad_eliminate_]: 7.01999e-06 [virtual_output]: 6.70998e-06 [merge_forward]: 3.30003e-06 [cell_reuse_recompute_pass]: 1.77001e-06 [offload_activation]: 7.98999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.383e-05 [merge_recompute_call_nodes]: 8.80013e-07 [before_grad]: 1.145e-05 [set_forward_comm_id_for_comm_node_pass]: 3.85e-06 [meta_fg_expand]: 2.73e-06 [flash_sp_send_recv_attached]: 9.39996e-07 [receive_attached]: 1.29e-06 [after_resolve]: 1.22e-05 [a_after_grad]: 1.173e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.34998e-06 [auto_monad_grad]: 9.5999e-07 [auto_monad_eliminator]: 7.95e-06 [cse]: 1.876e-05 [a_3]: 4.595e-05 [py_interpret_to_execute_after_opt_a]: 1.16e-05 [slice_cell_reuse_recomputed_activation]: 1.84e-06 [rewriter_after_opt_a]: 3.936e-05 [convert_after_rewriter]: 8.48001e-06 [order_py_execute_after_rewriter]: 6.31998e-06 [mutable_eliminate]: 0.00049736 [opt_b]: 0.00026191, [1] [Cycle 1]: 0.00025563, [7] [b_1]: 0.00016859 [b_2]: 1.052e-05 [updatestate_depend_eliminate]: 6.46999e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 3.46999e-06 [renormalize]: 5.3001e-07 [cse]: 2.418e-05 [optimize_parallel_all_gather_comm]: 1.836e-05 [overlap_param_gather]: 2.30002e-06 [cconv]: 2.553e-05 [loop_unroll]: 0.00044843 [opt_after_cconv]: 0.00011413, [1] [Cycle 1]: 0.00010838, [7] [c_1]: 3.78e-05 [parameter_eliminate]: 2.63e-06 [updatestate_depend_eliminate]: 5.97999e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 2.91999e-06 [cse]: 2.128e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.662e-05 [tuple_transform]: 8.552e-05, [1] [Cycle 1]: 8.104e-05, [4] [d_1]: 5.162e-05 [none_parameter_eliminate]: 1.96e-06 [renormalize]: 2.29978e-07 [switch_simplify]: 8.44998e-06 [partial_unused_args_eliminate]: 1.81e-06 [add_recomputation]: 5.601e-05 [cse_after_recomputation]: 2.497e-05, [1] [Cycle 1]: 2.052e-05, [1] [cse]: 1.501e-05 [environ_conv]: 5.76e-06 [swap_dp_allreduce_reducescatter]: 5.76e-06 [bias_add_comm_swap]: 2.77002e-06 [label_micro_interleaved_index]: 4.70001e-06 [label_fine_grained_interleaved_index]: 2.66999e-06 [merge_cast_opt]: 1.64e-06 [slice_recompute_activation]: 2.19999e-06 [micro_interleaved_order_control]: 2.94001e-06 [assign_add_opt]: 1.14998e-06 [ForceFp32Comm]: 9.5999e-07 [remove_cast_before_assign_add]: 1.25001e-06 [full_micro_interleaved_order_control]: 2.66e-06 [reorder_send_recv_between_fp_bp]: 3.01999e-06 [comm_op_add_attrs]: 1.02998e-06 [add_comm_op_reuse_tag]: 1.05999e-06 [interleave_split_concat_branches]: 1.27e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.15001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.81998e-06 [control_data_broadcast_order]: 1.418e-05 [grouped_pairwise_exchange_alltoall]: 1.60001e-06 [offloading_packed_experts]: 4.32e-06 [overlap_recompute_and_grad_model_parallel]: 5.29e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.18001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.36998e-06 [overlap_recompute_comm]: 2.22001e-06 [overlap_grad_ring_attention]: 4.60999e-06 [overlap_grad_flash_sp]: 2.188e-05 [begin_end_overlap_inline]: 7.00005e-07 [split_matmul_comm_elemetwise]: 2.19001e-06 [split_layernorm_comm]: 1.64998e-06 [handle_group_info]: 9.5999e-07 [symbol_engine_optimizer]: 8.143e-05, [1] [Cycle 1]: 7.731e-05, [6] [build]: 3.48e-06 [elim_shapecalc]: 1.062e-05 [elim_not_effective]: 1.494e-05 [opt_reshape]: 7.98001e-06 [fold_const_symbol]: 1.204e-05 [renormalize]: 2.80008e-07 [detach_backward]: 1.93002e-06 [pipeline_parallel_scheduler]: 1.57001e-06 [auto_monad_reorder]: 1.968e-05 [get_jit_bprop_graph]: 1.79998e-06 [rewriter_after_jit_bprop_graph]: 3.65e-06 [opt_after_jit_grad]: 0.00045595 [validate]: 4.108e-05 Sums bootstrap : 0.000433s : 3.86% type_inference : 0.005764s : 51.36% event_method : 0.000018s : 0.16% auto_monad : 0.000064s : 0.57% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000035s : 0.31% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.25% optimize.rewriter_before_opt_a : 0.000089s : 0.80% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000056s : 0.50% optimize.opt_a.loop_unroll : 0.000043s : 0.39% optimize.opt_a.a_1 : 0.000898s : 8.00% optimize.opt_a.with_stream_mark : 0.000030s : 0.27% optimize.opt_a.recompute_prepare : 0.000020s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000198s : 1.76% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.16% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.14% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.22% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.16% optimize.opt_a.virtual_dataset : 0.000015s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.13% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.24% optimize.opt_a.a_after_grad : 0.000026s : 0.23% optimize.opt_a.renormalize : 0.000722s : 6.43% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.23% optimize.opt_a.cse : 0.000055s : 0.49% optimize.opt_a.a_3 : 0.000104s : 0.93% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000039s : 0.35% optimize.convert_after_rewriter : 0.000008s : 0.08% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000497s : 4.43% optimize.opt_b.b_1 : 0.000169s : 1.50% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000026s : 0.23% optimize.loop_unroll : 0.000448s : 4.00% optimize.opt_after_cconv.c_1 : 0.000038s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000021s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.15% optimize.tuple_transform.d_1 : 0.000052s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000056s : 0.50% optimize.cse_after_recomputation.cse : 0.000015s : 0.13% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.03% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000022s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000456s : 4.06% validate : 0.000041s : 0.37% Time group info: ------[substitution.] 0.000217 38 10.89% : 0.000024s : 3: substitution.cast_eliminate 1.04% : 0.000002s : 3: substitution.elim_not_effective 0.76% : 0.000002s : 3: substitution.fold_const_symbol 3.14% : 0.000007s : 5: substitution.graph_param_transform 70.91% : 0.000154s : 4: substitution.inline 2.24% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.82% : 0.000006s : 6: substitution.remove_not_recompute_node 2.12% : 0.000005s : 4: substitution.replace_old_param 6.08% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005708 2 88.08% : 0.005027s : 1: type_inference.infer 11.92% : 0.000681s : 1: type_inference.specialize ------[replace.] 0.000063 8 63.37% : 0.000040s : 4: replace.inline 36.63% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000163 8 93.00% : 0.000151s : 4: match.inline 7.00% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000246 1504 0.94% : 0.000002s : 15: predicate.accumulaten_eliminater 0.70% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 10: predicate.addn_check_dump 0.98% : 0.000002s : 15: predicate.addn_zero_filter 0.83% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.03% : 0.000005s : 25: predicate.arithmetic_simplify 1.11% : 0.000003s : 15: predicate.cast_eliminate 0.56% : 0.000001s : 10: predicate.check_bprop_eliminate 0.60% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000001s : 5: predicate.const_output_eliminate 0.61% : 0.000002s : 10: predicate.depend_value_elim 0.94% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.17% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.28% : 0.000001s : 5: predicate.elim_not_effective 0.35% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.17% : 0.000003s : 20: predicate.environ_get_depend_swap 1.86% : 0.000005s : 30: predicate.environ_get_eliminate 1.23% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.41% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.36% : 0.000006s : 23: predicate.float_depend_g_call 0.57% : 0.000001s : 10: predicate.float_environ_get_switch 0.82% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 5: predicate.fold_const_symbol 0.62% : 0.000002s : 10: predicate.get_grad_eliminate 0.24% : 0.000001s : 5: predicate.graph_param_transform 0.67% : 0.000002s : 10: predicate.incorporate_call 0.57% : 0.000001s : 10: predicate.incorporate_call_switch 6.37% : 0.000016s : 68: predicate.inline 0.90% : 0.000002s : 10: predicate.inline_without_move 0.36% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.91% : 0.000002s : 10: predicate.less_batch_normalization 1.81% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.58% : 0.000006s : 44: predicate.load_eliminater 0.88% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.32% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.61% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.60% : 0.000001s : 10: predicate.merge_addn 0.62% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 15: predicate.minmaximum_grad 0.85% : 0.000002s : 5: predicate.mutable_eliminate 0.35% : 0.000001s : 5: predicate.opt_reshape 0.45% : 0.000001s : 5: predicate.parallel_virtual_node 1.72% : 0.000004s : 23: predicate.partial_defer_inline 1.71% : 0.000004s : 24: predicate.partial_eliminate 0.91% : 0.000002s : 15: predicate.print_const_string_wrapper 0.65% : 0.000002s : 10: predicate.reduce_all_const_elim 1.21% : 0.000003s : 15: predicate.reduce_eliminate 2.56% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 10: predicate.remove_not_recompute_node 1.36% : 0.000003s : 29: predicate.replace_applicator 0.45% : 0.000001s : 10: predicate.replace_old_param 0.30% : 0.000001s : 5: predicate.reset_defer_inline 0.97% : 0.000002s : 15: predicate.reshape_eliminate 0.69% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 5: predicate.row_tensor_eliminate 0.68% : 0.000002s : 10: predicate.same_eliminate 0.41% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 10: predicate.shard_identity_eliminate 0.71% : 0.000002s : 10: predicate.special_op_eliminate 0.73% : 0.000002s : 10: predicate.specialize_transform 0.86% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.57% : 0.000004s : 23: predicate.switch_defer_inline 2.04% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.90% : 0.000012s : 74: predicate.switch_simplify 0.87% : 0.000002s : 15: predicate.tile_eliminate 0.91% : 0.000002s : 15: predicate.transpose_eliminate 1.50% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.13% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.47% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.25% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.79% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.42% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.19% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.57% : 0.000001s : 5: predicate.value_based_eliminate 0.63% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000499 11 53.78% : 0.000268s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.22% : 0.000230s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025946 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.99% : 0.003110s : 1: add_attr 11.95% : 0.003100s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000060s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.27% : 0.000069s : 1: auto_monad 0.09% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.78% : 0.000462s : 1: bootstrap 0.11% : 0.000029s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000017s : 1: control_data_broadcast_order 0.05% : 0.000012s : 1: convert_after_rewriter 0.11% : 0.000028s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.09% : 0.000024s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.76% : 0.000457s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 1.95% : 0.000505s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 5.51% : 0.001430s : 78: opt.transform.opt_a 0.14% : 0.000036s : 1: opt.transform.opt_after_cconv 0.11% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.56% : 0.000145s : 28: opt.transform.opt_b 0.22% : 0.000058s : 2: opt.transform.opt_trans_graph 0.16% : 0.000042s : 4: opt.transform.symbol_engine_opt 11.72% : 0.003042s : 1: opt_a 0.45% : 0.000118s : 1: opt_after_cconv 1.79% : 0.000465s : 1: opt_after_jit_grad 1.02% : 0.000265s : 1: opt_b 20.00% : 0.005189s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000010s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000039s : 1: pre_auto_parallel 0.12% : 0.000032s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000020s : 1: remove_dup_value 1.48% : 0.000384s : 1: renormalize.infer 1.27% : 0.000330s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000044s : 1: rewriter_after_opt_a 0.36% : 0.000094s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000084s : 1: symbol_engine_optimizer 0.34% : 0.000088s : 1: tuple_transform 22.27% : 0.005778s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:24.903.003 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:24.903.275 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0164311, [21] [bootstrap]: 0.00043351 [type_inference]: 0.00564592 [event_method]: 1.865e-05 [auto_monad]: 6.533e-05 [graph_reusing]: 6.14001e-06 [inline]: 1.96998e-06 [add_attr]: 0.00308383, [1] [add_attr_with_inline]: 0.0030756, [1] [Cycle 1]: 7.188e-05, [2] [tag_attr]: 1.874e-05 [meta_addattr_fg_expand]: 6.55002e-06 [parallel-infer-symbol]: 3.31001e-06 [pre_auto_parallel]: 3.341e-05 [insert-virtual-dataset]: 2.21e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 1.99e-06 [pipeline_split]: 1.76e-06 [optimize]: 0.00552492, [53] [py_interpret_to_execute]: 3.04e-05 [rewriter_before_opt_a]: 8.949e-05 [opt_a]: 0.00318943, [2] [Cycle 1]: 0.00227493, [45] [expand_dump_flag]: 2.98998e-06 [switch_simplify]: 4.335e-05 [loop_unroll]: 3.106e-05 [a_1]: 0.00066697 [with_stream_mark]: 1.522e-05 [recompute_prepare]: 1.09e-05 [updatestate_depend_eliminate]: 4.97e-06 [updatestate_assign_eliminate]: 4.25e-06 [updatestate_loads_eliminate]: 3.75e-06 [parameter_eliminate]: 2.37999e-06 [a_2]: 0.00012929 [accelerated_algorithm]: 8.07e-06 [shard]: 1.87001e-06 [meta_shard_fg_expand]: 1.96e-06 [shard_inline]: 7.82998e-06 [merge_send_recv]: 9.03002e-06 [auto_parallel]: 7.26001e-06 [parallel]: 1.944e-05 [flash_sp]: 8.02e-06 [merge_comm]: 4.61002e-06 [allreduce_fusion]: 4.52e-06 [matmul_add_comm_reduction]: 1.115e-05 [allreduce_slice_to_reducescatter]: 6.29982e-07 [virtual_shard_identity]: 9.34e-06 [virtual_dataset]: 8.05e-06 [get_grad_eliminate_]: 7.9e-06 [virtual_output]: 7.97e-06 [merge_forward]: 4.32e-06 [cell_reuse_recompute_pass]: 1.11002e-06 [offload_activation]: 1.065e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.73e-05 [merge_recompute_call_nodes]: 1.69998e-06 [before_grad]: 1.274e-05 [set_forward_comm_id_for_comm_node_pass]: 4.50999e-06 [meta_fg_expand]: 3.35e-06 [flash_sp_send_recv_attached]: 2.29999e-06 [receive_attached]: 2.63e-06 [after_resolve]: 1.311e-05 [a_after_grad]: 1.174e-05 [renormalize]: 0.00062258 [add_forward_monad_depend]: 4.92999e-06 [auto_monad_grad]: 2.51e-06 [auto_monad_eliminator]: 1.665e-05 [cse]: 3.514e-05 [a_3]: 7.093e-05 [Cycle 2]: 0.00090194, [45] [expand_dump_flag]: 1.03001e-06 [switch_simplify]: 8.45999e-06 [loop_unroll]: 7.56001e-06 [a_1]: 0.00017137 [with_stream_mark]: 1.129e-05 [recompute_prepare]: 7.77e-06 [updatestate_depend_eliminate]: 3.95e-06 [updatestate_assign_eliminate]: 3.01001e-06 [updatestate_loads_eliminate]: 2.59999e-06 [parameter_eliminate]: 1.12e-06 [a_2]: 0.00011889 [accelerated_algorithm]: 7.36999e-06 [shard]: 1.07e-06 [meta_shard_fg_expand]: 1.49e-06 [shard_inline]: 7.73999e-06 [merge_send_recv]: 5.39e-06 [auto_parallel]: 5.98002e-06 [parallel]: 4.45e-06 [flash_sp]: 3.46001e-06 [merge_comm]: 3.91999e-06 [allreduce_fusion]: 3.6e-06 [matmul_add_comm_reduction]: 6.46e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 8.24002e-06 [virtual_dataset]: 7.4e-06 [get_grad_eliminate_]: 7.11999e-06 [virtual_output]: 7.58999e-06 [merge_forward]: 3.08e-06 [cell_reuse_recompute_pass]: 1.33002e-06 [offload_activation]: 7.33e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.655e-05 [merge_recompute_call_nodes]: 8.90024e-07 [before_grad]: 1.142e-05 [set_forward_comm_id_for_comm_node_pass]: 4.40999e-06 [meta_fg_expand]: 2.48002e-06 [flash_sp_send_recv_attached]: 8.09989e-07 [receive_attached]: 1.01002e-06 [after_resolve]: 1.165e-05 [a_after_grad]: 1.11e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.59e-06 [auto_monad_grad]: 9.39996e-07 [auto_monad_eliminator]: 8.47998e-06 [cse]: 1.786e-05 [a_3]: 5.894e-05 [py_interpret_to_execute_after_opt_a]: 1.31e-05 [slice_cell_reuse_recomputed_activation]: 4.63999e-06 [rewriter_after_opt_a]: 4.298e-05 [convert_after_rewriter]: 1.026e-05 [order_py_execute_after_rewriter]: 8.18001e-06 [mutable_eliminate]: 0.00049268 [opt_b]: 0.00030265, [1] [Cycle 1]: 0.0002935, [7] [b_1]: 0.0001941 [b_2]: 9.20001e-06 [updatestate_depend_eliminate]: 6.16e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 3.08998e-06 [renormalize]: 2.69996e-07 [cse]: 2.218e-05 [optimize_parallel_all_gather_comm]: 2.181e-05 [overlap_param_gather]: 5.14e-06 [cconv]: 2.772e-05 [loop_unroll]: 0.00043063 [opt_after_cconv]: 0.00013729, [1] [Cycle 1]: 0.00012891, [7] [c_1]: 3.693e-05 [parameter_eliminate]: 2.44001e-06 [updatestate_depend_eliminate]: 5.91e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 2.84001e-06 [cse]: 2.147e-05 [renormalize]: 3.59985e-07 [remove_dup_value]: 1.873e-05 [tuple_transform]: 0.0001133, [1] [Cycle 1]: 0.00010632, [4] [d_1]: 6.411e-05 [none_parameter_eliminate]: 2.11003e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 8.57e-06 [partial_unused_args_eliminate]: 4.80999e-06 [add_recomputation]: 5.821e-05 [cse_after_recomputation]: 3.08e-05, [1] [Cycle 1]: 2.406e-05, [1] [cse]: 1.51e-05 [environ_conv]: 9.37999e-06 [swap_dp_allreduce_reducescatter]: 9.04998e-06 [bias_add_comm_swap]: 4.87998e-06 [label_micro_interleaved_index]: 7.24001e-06 [label_fine_grained_interleaved_index]: 5.36002e-06 [merge_cast_opt]: 3.58e-06 [slice_recompute_activation]: 4.55001e-06 [micro_interleaved_order_control]: 4.68999e-06 [assign_add_opt]: 3.68e-06 [ForceFp32Comm]: 3.61999e-06 [remove_cast_before_assign_add]: 3.41999e-06 [full_micro_interleaved_order_control]: 4.58999e-06 [reorder_send_recv_between_fp_bp]: 5.29e-06 [comm_op_add_attrs]: 3.36001e-06 [add_comm_op_reuse_tag]: 3.26999e-06 [interleave_split_concat_branches]: 3.50998e-06 [interleave_parallel_branches]: 3.9e-06 [overlap_opt_shard_in_pipeline]: 4.05e-06 [overlap_opt_shard_grad_in_pipeline]: 4.63001e-06 [control_data_broadcast_order]: 1.716e-05 [grouped_pairwise_exchange_alltoall]: 3.9e-06 [offloading_packed_experts]: 7.23999e-06 [overlap_recompute_and_grad_model_parallel]: 7.56999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.71001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.73001e-06 [overlap_recompute_comm]: 4.82e-06 [overlap_grad_ring_attention]: 6.99001e-06 [overlap_grad_flash_sp]: 2.4e-05 [begin_end_overlap_inline]: 2.96999e-06 [split_matmul_comm_elemetwise]: 4.85999e-06 [split_layernorm_comm]: 4.03999e-06 [handle_group_info]: 3.48e-06 [symbol_engine_optimizer]: 0.0001006, [1] [Cycle 1]: 9.369e-05, [6] [build]: 2.90002e-06 [elim_shapecalc]: 1.088e-05 [elim_not_effective]: 1.461e-05 [opt_reshape]: 8.23001e-06 [fold_const_symbol]: 1.227e-05 [renormalize]: 2.30008e-07 [detach_backward]: 3.94002e-06 [pipeline_parallel_scheduler]: 1.91e-06 [auto_monad_reorder]: 2.341e-05 [get_jit_bprop_graph]: 1.30999e-06 [rewriter_after_jit_bprop_graph]: 4.00998e-06 [opt_after_jit_grad]: 0.00048083 [validate]: 3.962e-05 Sums bootstrap : 0.000434s : 3.89% type_inference : 0.005646s : 50.68% event_method : 0.000019s : 0.17% auto_monad : 0.000065s : 0.59% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000030s : 0.27% optimize.rewriter_before_opt_a : 0.000089s : 0.80% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000052s : 0.47% optimize.opt_a.loop_unroll : 0.000039s : 0.35% optimize.opt_a.a_1 : 0.000838s : 7.52% optimize.opt_a.with_stream_mark : 0.000027s : 0.24% optimize.opt_a.recompute_prepare : 0.000019s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000248s : 2.23% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.14% optimize.opt_a.merge_send_recv : 0.000014s : 0.13% optimize.opt_a.auto_parallel : 0.000013s : 0.12% optimize.opt_a.parallel : 0.000024s : 0.21% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.16% optimize.opt_a.virtual_dataset : 0.000015s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.13% optimize.opt_a.virtual_output : 0.000016s : 0.14% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.22% optimize.opt_a.a_after_grad : 0.000023s : 0.21% optimize.opt_a.renormalize : 0.000623s : 5.59% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.23% optimize.opt_a.cse : 0.000053s : 0.48% optimize.opt_a.a_3 : 0.000130s : 1.17% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000043s : 0.39% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000493s : 4.42% optimize.opt_b.b_1 : 0.000194s : 1.74% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000028s : 0.25% optimize.loop_unroll : 0.000431s : 3.87% optimize.opt_after_cconv.c_1 : 0.000037s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000021s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.17% optimize.tuple_transform.d_1 : 0.000064s : 0.58% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000058s : 0.52% optimize.cse_after_recomputation.cse : 0.000015s : 0.14% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000024s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.21% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000481s : 4.32% validate : 0.000040s : 0.36% Time group info: ------[substitution.] 0.000202 38 11.41% : 0.000023s : 3: substitution.cast_eliminate 1.08% : 0.000002s : 3: substitution.elim_not_effective 0.82% : 0.000002s : 3: substitution.fold_const_symbol 3.32% : 0.000007s : 5: substitution.graph_param_transform 69.45% : 0.000140s : 4: substitution.inline 2.03% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.26% : 0.000007s : 6: substitution.remove_not_recompute_node 2.56% : 0.000005s : 4: substitution.replace_old_param 6.06% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005598 2 87.90% : 0.004921s : 1: type_inference.infer 12.10% : 0.000677s : 1: type_inference.specialize ------[replace.] 0.000061 8 62.70% : 0.000038s : 4: replace.inline 37.30% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000148 8 92.99% : 0.000137s : 4: match.inline 7.01% : 0.000010s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000235 1504 0.87% : 0.000002s : 15: predicate.accumulaten_eliminater 0.80% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.62% : 0.000001s : 10: predicate.addn_check_dump 0.87% : 0.000002s : 15: predicate.addn_zero_filter 0.82% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.11% : 0.000005s : 25: predicate.arithmetic_simplify 0.98% : 0.000002s : 15: predicate.cast_eliminate 0.71% : 0.000002s : 10: predicate.check_bprop_eliminate 0.59% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.64% : 0.000001s : 10: predicate.depend_value_elim 0.91% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.00% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.02% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 5: predicate.elim_not_effective 0.41% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 20: predicate.environ_get_depend_swap 1.74% : 0.000004s : 30: predicate.environ_get_eliminate 1.16% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.38% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.18% : 0.000005s : 23: predicate.float_depend_g_call 0.59% : 0.000001s : 10: predicate.float_environ_get_switch 0.90% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.77% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.65% : 0.000002s : 10: predicate.incorporate_call 0.58% : 0.000001s : 10: predicate.incorporate_call_switch 6.40% : 0.000015s : 68: predicate.inline 0.83% : 0.000002s : 10: predicate.inline_without_move 0.36% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 10: predicate.less_batch_normalization 1.84% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.55% : 0.000006s : 44: predicate.load_eliminater 0.85% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.21% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.69% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.64% : 0.000001s : 10: predicate.merge_addn 0.65% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.63% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 15: predicate.minmaximum_grad 0.88% : 0.000002s : 5: predicate.mutable_eliminate 0.40% : 0.000001s : 5: predicate.opt_reshape 0.38% : 0.000001s : 5: predicate.parallel_virtual_node 1.56% : 0.000004s : 23: predicate.partial_defer_inline 1.67% : 0.000004s : 24: predicate.partial_eliminate 0.88% : 0.000002s : 15: predicate.print_const_string_wrapper 0.62% : 0.000001s : 10: predicate.reduce_all_const_elim 1.11% : 0.000003s : 15: predicate.reduce_eliminate 2.57% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 10: predicate.remove_not_recompute_node 1.41% : 0.000003s : 29: predicate.replace_applicator 0.50% : 0.000001s : 10: predicate.replace_old_param 0.28% : 0.000001s : 5: predicate.reset_defer_inline 0.95% : 0.000002s : 15: predicate.reshape_eliminate 0.66% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 5: predicate.row_tensor_eliminate 0.82% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 10: predicate.shard_identity_eliminate 0.75% : 0.000002s : 10: predicate.special_op_eliminate 0.77% : 0.000002s : 10: predicate.specialize_transform 0.89% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.46% : 0.000003s : 23: predicate.switch_defer_inline 2.03% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.12% : 0.000012s : 74: predicate.switch_simplify 0.86% : 0.000002s : 15: predicate.tile_eliminate 1.07% : 0.000003s : 15: predicate.transpose_eliminate 1.57% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.11% : 0.000007s : 39: predicate.tuple_list_get_item_eliminator 1.53% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.34% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.75% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.51% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.22% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 5: predicate.value_based_eliminate 0.68% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 10: predicate.virtual_output_eliminate 0.31% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000510 11 55.51% : 0.000283s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.49% : 0.000227s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027233 192 0.02% : 0.000007s : 1: ForceFp32Comm 11.36% : 0.003093s : 1: add_attr 11.31% : 0.003079s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000062s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000074s : 1: auto_monad 0.11% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.75% : 0.000477s : 1: bootstrap 0.11% : 0.000031s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.12% : 0.000034s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.08% : 0.000021s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.10% : 0.000028s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000007s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.60% : 0.000436s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.83% : 0.000499s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.05% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 4.98% : 0.001356s : 78: opt.transform.opt_a 0.13% : 0.000036s : 1: opt.transform.opt_after_cconv 0.11% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.49% : 0.000133s : 28: opt.transform.opt_b 0.26% : 0.000070s : 2: opt.transform.opt_trans_graph 0.16% : 0.000043s : 4: opt.transform.symbol_engine_opt 11.72% : 0.003193s : 1: opt_a 0.52% : 0.000141s : 1: opt_after_cconv 1.80% : 0.000491s : 1: opt_after_jit_grad 1.12% : 0.000306s : 1: opt_b 23.28% : 0.006340s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000041s : 1: pre_auto_parallel 0.13% : 0.000034s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.25% : 0.000340s : 1: renormalize.infer 1.01% : 0.000274s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000047s : 1: rewriter_after_opt_a 0.34% : 0.000093s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000104s : 1: symbol_engine_optimizer 0.43% : 0.000116s : 1: tuple_transform 20.86% : 0.005681s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:25.230.263 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0155061, [21] [bootstrap]: 0.00044559 [type_inference]: 0.00601682 [event_method]: 2.037e-05 [auto_monad]: 6.608e-05 [graph_reusing]: 6.71e-06 [inline]: 3.21001e-06 [add_attr]: 0.00307109, [1] [add_attr_with_inline]: 0.00306191, [1] [Cycle 1]: 6.225e-05, [2] [tag_attr]: 1.966e-05 [meta_addattr_fg_expand]: 5.87999e-06 [parallel-infer-symbol]: 3.39001e-06 [pre_auto_parallel]: 3.217e-05 [insert-virtual-dataset]: 2.34001e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 2.27999e-06 [pipeline_split]: 1.77999e-06 [optimize]: 0.00510308, [53] [py_interpret_to_execute]: 2.469e-05 [rewriter_before_opt_a]: 8.435e-05 [opt_a]: 0.00295496, [2] [Cycle 1]: 0.00218113, [45] [expand_dump_flag]: 2.94999e-06 [switch_simplify]: 4.485e-05 [loop_unroll]: 3.092e-05 [a_1]: 0.00068122 [with_stream_mark]: 1.686e-05 [recompute_prepare]: 1.311e-05 [updatestate_depend_eliminate]: 5.15999e-06 [updatestate_assign_eliminate]: 3.83001e-06 [updatestate_loads_eliminate]: 3.50998e-06 [parameter_eliminate]: 2.22001e-06 [a_2]: 0.00010272 [accelerated_algorithm]: 9.29e-06 [shard]: 1.84e-06 [meta_shard_fg_expand]: 2.12001e-06 [shard_inline]: 8.08001e-06 [merge_send_recv]: 1.076e-05 [auto_parallel]: 7.30998e-06 [parallel]: 1.851e-05 [flash_sp]: 8.47e-06 [merge_comm]: 4.45e-06 [allreduce_fusion]: 4.35e-06 [matmul_add_comm_reduction]: 1.121e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 1.057e-05 [virtual_dataset]: 8.02998e-06 [get_grad_eliminate_]: 7.77e-06 [virtual_output]: 8.08999e-06 [merge_forward]: 4.72998e-06 [cell_reuse_recompute_pass]: 1.27e-06 [offload_activation]: 1.116e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.606e-05 [merge_recompute_call_nodes]: 1.43002e-06 [before_grad]: 1.298e-05 [set_forward_comm_id_for_comm_node_pass]: 4.33001e-06 [meta_fg_expand]: 3.17002e-06 [flash_sp_send_recv_attached]: 2.51e-06 [receive_attached]: 2.68e-06 [after_resolve]: 1.348e-05 [a_after_grad]: 1.205e-05 [renormalize]: 0.00069853 [add_forward_monad_depend]: 5.81003e-06 [auto_monad_grad]: 2.07999e-06 [auto_monad_eliminator]: 1.744e-05 [cse]: 3.719e-05 [a_3]: 5.808e-05 [Cycle 2]: 0.00076418, [45] [expand_dump_flag]: 1.23002e-06 [switch_simplify]: 8.94e-06 [loop_unroll]: 7.51001e-06 [a_1]: 0.00017461 [with_stream_mark]: 1.338e-05 [recompute_prepare]: 7.80998e-06 [updatestate_depend_eliminate]: 3.73999e-06 [updatestate_assign_eliminate]: 3.23e-06 [updatestate_loads_eliminate]: 3.03e-06 [parameter_eliminate]: 1.25001e-06 [a_2]: 9.31e-05 [accelerated_algorithm]: 7.75e-06 [shard]: 1.29e-06 [meta_shard_fg_expand]: 1.95001e-06 [shard_inline]: 7.34002e-06 [merge_send_recv]: 6.12999e-06 [auto_parallel]: 9.09e-06 [parallel]: 5.05001e-06 [flash_sp]: 4.05e-06 [merge_comm]: 4.28001e-06 [allreduce_fusion]: 3.82002e-06 [matmul_add_comm_reduction]: 6.88e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 8.12998e-06 [virtual_dataset]: 7.21999e-06 [get_grad_eliminate_]: 7.4e-06 [virtual_output]: 7e-06 [merge_forward]: 3.14001e-06 [cell_reuse_recompute_pass]: 1.71998e-06 [offload_activation]: 9.56e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.434e-05 [merge_recompute_call_nodes]: 1.34998e-06 [before_grad]: 1.21e-05 [set_forward_comm_id_for_comm_node_pass]: 4.35e-06 [meta_fg_expand]: 2.70002e-06 [flash_sp_send_recv_attached]: 9.39996e-07 [receive_attached]: 1.22999e-06 [after_resolve]: 1.258e-05 [a_after_grad]: 1.137e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.14998e-06 [auto_monad_grad]: 1.17e-06 [auto_monad_eliminator]: 8.69e-06 [cse]: 1.807e-05 [a_3]: 4.502e-05 [py_interpret_to_execute_after_opt_a]: 1.067e-05 [slice_cell_reuse_recomputed_activation]: 2.38998e-06 [rewriter_after_opt_a]: 3.997e-05 [convert_after_rewriter]: 7.7e-06 [order_py_execute_after_rewriter]: 5.93002e-06 [mutable_eliminate]: 0.00049549 [opt_b]: 0.00024801, [1] [Cycle 1]: 0.00024128, [7] [b_1]: 0.00015553 [b_2]: 9.55001e-06 [updatestate_depend_eliminate]: 7.13998e-06 [updatestate_assign_eliminate]: 2.96999e-06 [updatestate_loads_eliminate]: 3.08e-06 [renormalize]: 8.09989e-07 [cse]: 2.514e-05 [optimize_parallel_all_gather_comm]: 1.795e-05 [overlap_param_gather]: 2.33002e-06 [cconv]: 2.66e-05 [loop_unroll]: 0.00043439 [opt_after_cconv]: 0.00011478, [1] [Cycle 1]: 0.00010873, [7] [c_1]: 3.692e-05 [parameter_eliminate]: 3.57997e-06 [updatestate_depend_eliminate]: 5.94999e-06 [updatestate_assign_eliminate]: 3.05002e-06 [updatestate_loads_eliminate]: 2.83e-06 [cse]: 2.217e-05 [renormalize]: 3.49974e-07 [remove_dup_value]: 1.428e-05 [tuple_transform]: 8.861e-05, [1] [Cycle 1]: 8.38e-05, [4] [d_1]: 5.328e-05 [none_parameter_eliminate]: 1.63002e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 9.43002e-06 [partial_unused_args_eliminate]: 2.29999e-06 [add_recomputation]: 5.975e-05 [cse_after_recomputation]: 2.666e-05, [1] [Cycle 1]: 2.22e-05, [1] [cse]: 1.56e-05 [environ_conv]: 6.92002e-06 [swap_dp_allreduce_reducescatter]: 6.57002e-06 [bias_add_comm_swap]: 2.66e-06 [label_micro_interleaved_index]: 4.82998e-06 [label_fine_grained_interleaved_index]: 3.14999e-06 [merge_cast_opt]: 1.22999e-06 [slice_recompute_activation]: 2.07999e-06 [micro_interleaved_order_control]: 2.60997e-06 [assign_add_opt]: 1.43002e-06 [ForceFp32Comm]: 8.59989e-07 [remove_cast_before_assign_add]: 1.04e-06 [full_micro_interleaved_order_control]: 2.17999e-06 [reorder_send_recv_between_fp_bp]: 2.58e-06 [comm_op_add_attrs]: 1.04998e-06 [add_comm_op_reuse_tag]: 9.30013e-07 [interleave_split_concat_branches]: 1.24e-06 [interleave_parallel_branches]: 1.08001e-06 [overlap_opt_shard_in_pipeline]: 1.52999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.44999e-06 [control_data_broadcast_order]: 1.562e-05 [grouped_pairwise_exchange_alltoall]: 1.69e-06 [offloading_packed_experts]: 4.62e-06 [overlap_recompute_and_grad_model_parallel]: 5.29e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.27999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39003e-06 [overlap_recompute_comm]: 2.07001e-06 [overlap_grad_ring_attention]: 4.72e-06 [overlap_grad_flash_sp]: 2.307e-05 [begin_end_overlap_inline]: 6.00005e-07 [split_matmul_comm_elemetwise]: 2.06e-06 [split_layernorm_comm]: 1.70001e-06 [handle_group_info]: 8.89995e-07 [symbol_engine_optimizer]: 8.808e-05, [1] [Cycle 1]: 8.365e-05, [6] [build]: 3.53e-06 [elim_shapecalc]: 1.244e-05 [elim_not_effective]: 1.638e-05 [opt_reshape]: 8.52e-06 [fold_const_symbol]: 1.291e-05 [renormalize]: 2.59985e-07 [detach_backward]: 1.86e-06 [pipeline_parallel_scheduler]: 1.66e-06 [auto_monad_reorder]: 2.014e-05 [get_jit_bprop_graph]: 1.90001e-06 [rewriter_after_jit_bprop_graph]: 4.58001e-06 [opt_after_jit_grad]: 0.00048196 [validate]: 4.268e-05 Sums bootstrap : 0.000446s : 3.90% type_inference : 0.006017s : 52.67% event_method : 0.000020s : 0.18% auto_monad : 0.000066s : 0.58% graph_reusing : 0.000007s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000032s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000025s : 0.22% optimize.rewriter_before_opt_a : 0.000084s : 0.74% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000054s : 0.47% optimize.opt_a.loop_unroll : 0.000038s : 0.34% optimize.opt_a.a_1 : 0.000856s : 7.49% optimize.opt_a.with_stream_mark : 0.000030s : 0.26% optimize.opt_a.recompute_prepare : 0.000021s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000196s : 1.71% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.13% optimize.opt_a.merge_send_recv : 0.000017s : 0.15% optimize.opt_a.auto_parallel : 0.000016s : 0.14% optimize.opt_a.parallel : 0.000024s : 0.21% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.16% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.23% optimize.opt_a.a_after_grad : 0.000023s : 0.21% optimize.opt_a.renormalize : 0.000699s : 6.12% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.23% optimize.opt_a.cse : 0.000055s : 0.48% optimize.opt_a.a_3 : 0.000103s : 0.90% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.35% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000495s : 4.34% optimize.opt_b.b_1 : 0.000156s : 1.36% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000025s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000027s : 0.23% optimize.loop_unroll : 0.000434s : 3.80% optimize.opt_after_cconv.c_1 : 0.000037s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000022s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.12% optimize.tuple_transform.d_1 : 0.000053s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000060s : 0.52% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000023s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000482s : 4.22% validate : 0.000043s : 0.37% Time group info: ------[substitution.] 0.000202 38 12.04% : 0.000024s : 3: substitution.cast_eliminate 1.09% : 0.000002s : 3: substitution.elim_not_effective 0.81% : 0.000002s : 3: substitution.fold_const_symbol 3.16% : 0.000006s : 5: substitution.graph_param_transform 69.23% : 0.000140s : 4: substitution.inline 2.32% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.83% : 0.000006s : 6: substitution.remove_not_recompute_node 2.31% : 0.000005s : 4: substitution.replace_old_param 6.21% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005954 2 88.11% : 0.005246s : 1: type_inference.infer 11.89% : 0.000708s : 1: type_inference.specialize ------[replace.] 0.000059 8 62.38% : 0.000037s : 4: replace.inline 37.62% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000148 8 92.77% : 0.000137s : 4: match.inline 7.23% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000238 1504 0.90% : 0.000002s : 15: predicate.accumulaten_eliminater 0.76% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.57% : 0.000001s : 10: predicate.addn_check_dump 0.97% : 0.000002s : 15: predicate.addn_zero_filter 0.79% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.90% : 0.000005s : 25: predicate.arithmetic_simplify 1.19% : 0.000003s : 15: predicate.cast_eliminate 0.60% : 0.000001s : 10: predicate.check_bprop_eliminate 0.63% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.63% : 0.000001s : 10: predicate.depend_value_elim 0.94% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.93% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.96% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 5: predicate.elim_not_effective 0.41% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 20: predicate.environ_get_depend_swap 1.85% : 0.000004s : 30: predicate.environ_get_eliminate 1.13% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.40% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.34% : 0.000006s : 23: predicate.float_depend_g_call 0.57% : 0.000001s : 10: predicate.float_environ_get_switch 0.85% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.73% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.66% : 0.000002s : 10: predicate.incorporate_call 0.55% : 0.000001s : 10: predicate.incorporate_call_switch 6.34% : 0.000015s : 68: predicate.inline 0.92% : 0.000002s : 10: predicate.inline_without_move 0.34% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.97% : 0.000002s : 10: predicate.less_batch_normalization 1.75% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.50% : 0.000006s : 44: predicate.load_eliminater 0.85% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.17% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.71% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.61% : 0.000001s : 10: predicate.merge_addn 0.59% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 15: predicate.minmaximum_grad 0.95% : 0.000002s : 5: predicate.mutable_eliminate 0.47% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.69% : 0.000004s : 23: predicate.partial_defer_inline 1.64% : 0.000004s : 24: predicate.partial_eliminate 0.87% : 0.000002s : 15: predicate.print_const_string_wrapper 0.60% : 0.000001s : 10: predicate.reduce_all_const_elim 1.07% : 0.000003s : 15: predicate.reduce_eliminate 2.56% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 10: predicate.remove_not_recompute_node 1.37% : 0.000003s : 29: predicate.replace_applicator 0.59% : 0.000001s : 10: predicate.replace_old_param 0.37% : 0.000001s : 5: predicate.reset_defer_inline 0.92% : 0.000002s : 15: predicate.reshape_eliminate 0.60% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 5: predicate.row_tensor_eliminate 0.72% : 0.000002s : 10: predicate.same_eliminate 0.47% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 10: predicate.shard_identity_eliminate 0.72% : 0.000002s : 10: predicate.special_op_eliminate 0.84% : 0.000002s : 10: predicate.specialize_transform 0.89% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.47% : 0.000004s : 23: predicate.switch_defer_inline 1.99% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.85% : 0.000012s : 74: predicate.switch_simplify 0.86% : 0.000002s : 15: predicate.tile_eliminate 0.91% : 0.000002s : 15: predicate.transpose_eliminate 1.58% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.59% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.30% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.45% : 0.000003s : 25: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.80% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.53% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.25% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 5: predicate.value_based_eliminate 0.66% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.73% : 0.000002s : 10: predicate.virtual_output_eliminate 0.33% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000588 11 59.11% : 0.000348s : 5: func_graph_cloner_run.FuncGraphClonerGraph 40.89% : 0.000241s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025960 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.85% : 0.003076s : 1: add_attr 11.81% : 0.003066s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.25% : 0.000064s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.36% : 0.000094s : 1: auto_monad 0.09% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.82% : 0.000473s : 1: bootstrap 0.12% : 0.000030s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000030s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.10% : 0.000027s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.71% : 0.000443s : 1: loop_unroll 0.08% : 0.000020s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 1.95% : 0.000506s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 5.30% : 0.001377s : 78: opt.transform.opt_a 0.14% : 0.000036s : 1: opt.transform.opt_after_cconv 0.11% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.51% : 0.000133s : 28: opt.transform.opt_b 0.23% : 0.000061s : 2: opt.transform.opt_trans_graph 0.18% : 0.000046s : 4: opt.transform.symbol_engine_opt 11.39% : 0.002958s : 1: opt_a 0.46% : 0.000118s : 1: opt_after_cconv 1.89% : 0.000490s : 1: opt_after_jit_grad 0.97% : 0.000252s : 1: opt_b 19.67% : 0.005108s : 1: optimize 0.08% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.14% : 0.000036s : 1: pre_auto_parallel 0.11% : 0.000029s : 1: py_interpret_to_execute 0.05% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000018s : 1: remove_dup_value 1.50% : 0.000390s : 1: renormalize.infer 1.16% : 0.000300s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000044s : 1: rewriter_after_opt_a 0.34% : 0.000089s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000091s : 1: symbol_engine_optimizer 0.35% : 0.000092s : 1: tuple_transform 23.25% : 0.006035s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:25.575.213 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:25.575.476 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0166934, [21] [bootstrap]: 0.00043671 [type_inference]: 0.00603573 [event_method]: 2.016e-05 [auto_monad]: 6.768e-05 [graph_reusing]: 5.76998e-06 [inline]: 2.41e-06 [add_attr]: 0.00317986, [1] [add_attr_with_inline]: 0.00316959, [1] [Cycle 1]: 7.46e-05, [2] [tag_attr]: 1.951e-05 [meta_addattr_fg_expand]: 5.96e-06 [parallel-infer-symbol]: 3.4e-06 [pre_auto_parallel]: 3.348e-05 [insert-virtual-dataset]: 2.39999e-06 [parallel-infer-symbol-second]: 7.40023e-07 [dataset_repeat_opt]: 1.89e-06 [pipeline_split]: 1.97001e-06 [optimize]: 0.00561112, [53] [py_interpret_to_execute]: 2.944e-05 [rewriter_before_opt_a]: 8.84e-05 [opt_a]: 0.00329374, [2] [Cycle 1]: 0.0023357, [45] [expand_dump_flag]: 2.99999e-06 [switch_simplify]: 4.411e-05 [loop_unroll]: 3.092e-05 [a_1]: 0.00066393 [with_stream_mark]: 1.496e-05 [recompute_prepare]: 1.02e-05 [updatestate_depend_eliminate]: 4.64998e-06 [updatestate_assign_eliminate]: 4.52e-06 [updatestate_loads_eliminate]: 4.1e-06 [parameter_eliminate]: 2.12001e-06 [a_2]: 0.00013794 [accelerated_algorithm]: 8.90001e-06 [shard]: 1.94e-06 [meta_shard_fg_expand]: 2.19999e-06 [shard_inline]: 7.98999e-06 [merge_send_recv]: 9.02e-06 [auto_parallel]: 7.38e-06 [parallel]: 1.859e-05 [flash_sp]: 8.37e-06 [merge_comm]: 5.32999e-06 [allreduce_fusion]: 4.61002e-06 [matmul_add_comm_reduction]: 1.138e-05 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 9.52001e-06 [virtual_dataset]: 8.18001e-06 [get_grad_eliminate_]: 7.48999e-06 [virtual_output]: 7.8e-06 [merge_forward]: 4.40999e-06 [cell_reuse_recompute_pass]: 1.21002e-06 [offload_activation]: 1.052e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.663e-05 [merge_recompute_call_nodes]: 1.50001e-06 [before_grad]: 1.294e-05 [set_forward_comm_id_for_comm_node_pass]: 4.82e-06 [meta_fg_expand]: 3.9e-06 [flash_sp_send_recv_attached]: 2.69999e-06 [receive_attached]: 2.30002e-06 [after_resolve]: 1.241e-05 [a_after_grad]: 1.165e-05 [renormalize]: 0.00069917 [add_forward_monad_depend]: 5.62001e-06 [auto_monad_grad]: 2.24001e-06 [auto_monad_eliminator]: 1.634e-05 [cse]: 3.556e-05 [a_3]: 7.027e-05 [Cycle 2]: 0.00093362, [45] [expand_dump_flag]: 1.09998e-06 [switch_simplify]: 9.67999e-06 [loop_unroll]: 7.75e-06 [a_1]: 0.00017378 [with_stream_mark]: 1.234e-05 [recompute_prepare]: 7.91001e-06 [updatestate_depend_eliminate]: 3.95998e-06 [updatestate_assign_eliminate]: 3.16001e-06 [updatestate_loads_eliminate]: 2.89999e-06 [parameter_eliminate]: 1.16002e-06 [a_2]: 0.00012026 [accelerated_algorithm]: 7.57002e-06 [shard]: 1.33002e-06 [meta_shard_fg_expand]: 1.57999e-06 [shard_inline]: 8.35999e-06 [merge_send_recv]: 5.65001e-06 [auto_parallel]: 6.36e-06 [parallel]: 5.43002e-06 [flash_sp]: 3.26999e-06 [merge_comm]: 4.08001e-06 [allreduce_fusion]: 4.15e-06 [matmul_add_comm_reduction]: 6.33e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 1.023e-05 [virtual_dataset]: 7.77e-06 [get_grad_eliminate_]: 7.78999e-06 [virtual_output]: 7.46001e-06 [merge_forward]: 4.25999e-06 [cell_reuse_recompute_pass]: 1.20999e-06 [offload_activation]: 7.83001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.632e-05 [merge_recompute_call_nodes]: 7.00005e-07 [before_grad]: 1.174e-05 [set_forward_comm_id_for_comm_node_pass]: 4.58001e-06 [meta_fg_expand]: 2.94001e-06 [flash_sp_send_recv_attached]: 7.80012e-07 [receive_attached]: 1.14e-06 [after_resolve]: 1.157e-05 [a_after_grad]: 1.111e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.24e-06 [auto_monad_grad]: 9.29984e-07 [auto_monad_eliminator]: 8.60001e-06 [cse]: 1.762e-05 [a_3]: 5.901e-05 [py_interpret_to_execute_after_opt_a]: 1.361e-05 [slice_cell_reuse_recomputed_activation]: 4.57e-06 [rewriter_after_opt_a]: 4.597e-05 [convert_after_rewriter]: 1.086e-05 [order_py_execute_after_rewriter]: 8.81997e-06 [mutable_eliminate]: 0.0004926 [opt_b]: 0.00030381, [1] [Cycle 1]: 0.00029458, [7] [b_1]: 0.00019541 [b_2]: 9.32999e-06 [updatestate_depend_eliminate]: 5.92999e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 2.91e-06 [renormalize]: 5.09986e-07 [cse]: 2.196e-05 [optimize_parallel_all_gather_comm]: 2.095e-05 [overlap_param_gather]: 4.85999e-06 [cconv]: 2.99e-05 [loop_unroll]: 0.00042469 [opt_after_cconv]: 0.00013596, [1] [Cycle 1]: 0.00012756, [7] [c_1]: 3.654e-05 [parameter_eliminate]: 2.41998e-06 [updatestate_depend_eliminate]: 5.83002e-06 [updatestate_assign_eliminate]: 3.29001e-06 [updatestate_loads_eliminate]: 2.91999e-06 [cse]: 2.093e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.851e-05 [tuple_transform]: 9.684e-05, [1] [Cycle 1]: 8.982e-05, [4] [d_1]: 5.053e-05 [none_parameter_eliminate]: 1.59e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 8.02e-06 [partial_unused_args_eliminate]: 4.97999e-06 [add_recomputation]: 5.877e-05 [cse_after_recomputation]: 3.031e-05, [1] [Cycle 1]: 2.332e-05, [1] [cse]: 1.442e-05 [environ_conv]: 8.95001e-06 [swap_dp_allreduce_reducescatter]: 8.90001e-06 [bias_add_comm_swap]: 5.07e-06 [label_micro_interleaved_index]: 6.88e-06 [label_fine_grained_interleaved_index]: 4.97999e-06 [merge_cast_opt]: 3.65e-06 [slice_recompute_activation]: 4.60999e-06 [micro_interleaved_order_control]: 4.50001e-06 [assign_add_opt]: 3.61999e-06 [ForceFp32Comm]: 3.41001e-06 [remove_cast_before_assign_add]: 3.32997e-06 [full_micro_interleaved_order_control]: 4.52e-06 [reorder_send_recv_between_fp_bp]: 5.26998e-06 [comm_op_add_attrs]: 3.74002e-06 [add_comm_op_reuse_tag]: 3.16999e-06 [interleave_split_concat_branches]: 3.58999e-06 [interleave_parallel_branches]: 3.39001e-06 [overlap_opt_shard_in_pipeline]: 3.97e-06 [overlap_opt_shard_grad_in_pipeline]: 4.15e-06 [control_data_broadcast_order]: 1.729e-05 [grouped_pairwise_exchange_alltoall]: 3.75e-06 [offloading_packed_experts]: 6.76e-06 [overlap_recompute_and_grad_model_parallel]: 7.81001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.5e-06 [overlap_recompute_allgather_and_fa_grad]: 3.61999e-06 [overlap_recompute_comm]: 5.09e-06 [overlap_grad_ring_attention]: 7.1e-06 [overlap_grad_flash_sp]: 2.373e-05 [begin_end_overlap_inline]: 3.46001e-06 [split_matmul_comm_elemetwise]: 4.33999e-06 [split_layernorm_comm]: 4.35999e-06 [handle_group_info]: 3.09999e-06 [symbol_engine_optimizer]: 0.0001041, [1] [Cycle 1]: 9.698e-05, [6] [build]: 3.29001e-06 [elim_shapecalc]: 1.134e-05 [elim_not_effective]: 1.511e-05 [opt_reshape]: 8.47e-06 [fold_const_symbol]: 1.25e-05 [renormalize]: 2.10013e-07 [detach_backward]: 3.09001e-06 [pipeline_parallel_scheduler]: 1.79998e-06 [auto_monad_reorder]: 2.344e-05 [get_jit_bprop_graph]: 1.58002e-06 [rewriter_after_jit_bprop_graph]: 4.31002e-06 [opt_after_jit_grad]: 0.00057941 [validate]: 4.213e-05 Sums bootstrap : 0.000437s : 3.73% type_inference : 0.006036s : 51.50% event_method : 0.000020s : 0.17% auto_monad : 0.000068s : 0.58% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000029s : 0.25% optimize.rewriter_before_opt_a : 0.000088s : 0.75% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000054s : 0.46% optimize.opt_a.loop_unroll : 0.000039s : 0.33% optimize.opt_a.a_1 : 0.000838s : 7.15% optimize.opt_a.with_stream_mark : 0.000027s : 0.23% optimize.opt_a.recompute_prepare : 0.000018s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000258s : 2.20% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.14% optimize.opt_a.merge_send_recv : 0.000015s : 0.13% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000024s : 0.20% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.17% optimize.opt_a.virtual_dataset : 0.000016s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.20% optimize.opt_a.a_after_grad : 0.000023s : 0.19% optimize.opt_a.renormalize : 0.000699s : 5.97% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.21% optimize.opt_a.cse : 0.000053s : 0.45% optimize.opt_a.a_3 : 0.000129s : 1.10% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000046s : 0.39% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000493s : 4.20% optimize.opt_b.b_1 : 0.000195s : 1.67% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000030s : 0.26% optimize.loop_unroll : 0.000425s : 3.62% optimize.opt_after_cconv.c_1 : 0.000037s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000021s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.16% optimize.tuple_transform.d_1 : 0.000051s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000059s : 0.50% optimize.cse_after_recomputation.cse : 0.000014s : 0.12% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000024s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.20% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000579s : 4.94% validate : 0.000042s : 0.36% Time group info: ------[substitution.] 0.000200 38 12.05% : 0.000024s : 3: substitution.cast_eliminate 1.15% : 0.000002s : 3: substitution.elim_not_effective 0.89% : 0.000002s : 3: substitution.fold_const_symbol 3.00% : 0.000006s : 5: substitution.graph_param_transform 68.83% : 0.000138s : 4: substitution.inline 2.23% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.01% : 0.000006s : 6: substitution.remove_not_recompute_node 2.51% : 0.000005s : 4: substitution.replace_old_param 6.35% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005983 2 88.36% : 0.005286s : 1: type_inference.infer 11.64% : 0.000697s : 1: type_inference.specialize ------[replace.] 0.000062 8 61.45% : 0.000038s : 4: replace.inline 38.55% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000146 8 92.53% : 0.000135s : 4: match.inline 7.47% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000236 1504 0.87% : 0.000002s : 15: predicate.accumulaten_eliminater 0.79% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.66% : 0.000002s : 10: predicate.addn_check_dump 0.85% : 0.000002s : 15: predicate.addn_zero_filter 0.82% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.10% : 0.000005s : 25: predicate.arithmetic_simplify 0.98% : 0.000002s : 15: predicate.cast_eliminate 0.63% : 0.000001s : 10: predicate.check_bprop_eliminate 0.67% : 0.000002s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.80% : 0.000002s : 10: predicate.depend_value_elim 0.95% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.09% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.39% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 20: predicate.environ_get_depend_swap 1.73% : 0.000004s : 30: predicate.environ_get_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.36% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.41% : 0.000006s : 23: predicate.float_depend_g_call 0.61% : 0.000001s : 10: predicate.float_environ_get_switch 0.91% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 5: predicate.fold_const_symbol 0.67% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.66% : 0.000002s : 10: predicate.incorporate_call 0.58% : 0.000001s : 10: predicate.incorporate_call_switch 6.44% : 0.000015s : 68: predicate.inline 0.84% : 0.000002s : 10: predicate.inline_without_move 0.35% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.80% : 0.000002s : 10: predicate.less_batch_normalization 1.81% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.46% : 0.000006s : 44: predicate.load_eliminater 0.91% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.17% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.66% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.69% : 0.000002s : 10: predicate.merge_addn 0.63% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.65% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 15: predicate.minmaximum_grad 0.84% : 0.000002s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.40% : 0.000001s : 5: predicate.parallel_virtual_node 1.72% : 0.000004s : 23: predicate.partial_defer_inline 1.66% : 0.000004s : 24: predicate.partial_eliminate 0.86% : 0.000002s : 15: predicate.print_const_string_wrapper 0.65% : 0.000002s : 10: predicate.reduce_all_const_elim 1.20% : 0.000003s : 15: predicate.reduce_eliminate 2.48% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 10: predicate.remove_not_recompute_node 1.29% : 0.000003s : 29: predicate.replace_applicator 0.44% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 0.94% : 0.000002s : 15: predicate.reshape_eliminate 0.68% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 5: predicate.row_tensor_eliminate 0.82% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 10: predicate.shard_identity_eliminate 0.75% : 0.000002s : 10: predicate.special_op_eliminate 0.85% : 0.000002s : 10: predicate.specialize_transform 0.88% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.43% : 0.000003s : 23: predicate.switch_defer_inline 2.09% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.05% : 0.000012s : 74: predicate.switch_simplify 0.82% : 0.000002s : 15: predicate.tile_eliminate 0.92% : 0.000002s : 15: predicate.transpose_eliminate 1.52% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.22% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.59% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.70% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.46% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.15% : 0.000007s : 54: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 5: predicate.value_based_eliminate 0.66% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 10: predicate.virtual_output_eliminate 0.36% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000551 11 56.15% : 0.000309s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.85% : 0.000242s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027840 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.45% : 0.003188s : 1: add_attr 11.40% : 0.003173s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000062s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.28% : 0.000077s : 1: auto_monad 0.11% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.85% : 0.000514s : 1: bootstrap 0.12% : 0.000033s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000033s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000018s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000031s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.55% : 0.000430s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.79% : 0.000499s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 4.91% : 0.001367s : 78: opt.transform.opt_a 0.13% : 0.000035s : 1: opt.transform.opt_after_cconv 0.44% : 0.000124s : 1: opt.transform.opt_after_jit_grad 0.48% : 0.000133s : 28: opt.transform.opt_b 0.20% : 0.000056s : 2: opt.transform.opt_trans_graph 0.16% : 0.000044s : 4: opt.transform.symbol_engine_opt 11.84% : 0.003297s : 1: opt_a 0.50% : 0.000139s : 1: opt_after_cconv 2.12% : 0.000591s : 1: opt_after_jit_grad 1.10% : 0.000307s : 1: opt_b 21.43% : 0.005965s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000041s : 1: pre_auto_parallel 0.12% : 0.000033s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.37% : 0.000381s : 1: renormalize.infer 1.11% : 0.000310s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000050s : 1: rewriter_after_opt_a 0.33% : 0.000092s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000107s : 1: symbol_engine_optimizer 0.36% : 0.000100s : 1: tuple_transform 21.83% : 0.006077s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:25.859.134 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0154979, [21] [bootstrap]: 0.00042499 [type_inference]: 0.00564466 [event_method]: 1.803e-05 [auto_monad]: 6.453e-05 [graph_reusing]: 5.49e-06 [inline]: 2.14e-06 [add_attr]: 0.0033129, [1] [add_attr_with_inline]: 0.00330303, [1] [Cycle 1]: 6.776e-05, [2] [tag_attr]: 2.191e-05 [meta_addattr_fg_expand]: 5.77999e-06 [parallel-infer-symbol]: 3.29001e-06 [pre_auto_parallel]: 3.598e-05 [insert-virtual-dataset]: 2.34999e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.12999e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00525385, [53] [py_interpret_to_execute]: 2.739e-05 [rewriter_before_opt_a]: 9.416e-05 [opt_a]: 0.00305881, [2] [Cycle 1]: 0.00227071, [45] [expand_dump_flag]: 3.26001e-06 [switch_simplify]: 4.468e-05 [loop_unroll]: 3.136e-05 [a_1]: 0.00069605 [with_stream_mark]: 1.545e-05 [recompute_prepare]: 9.77001e-06 [updatestate_depend_eliminate]: 4.85001e-06 [updatestate_assign_eliminate]: 3.98999e-06 [updatestate_loads_eliminate]: 3.46999e-06 [parameter_eliminate]: 2.01998e-06 [a_2]: 0.00010155 [accelerated_algorithm]: 8.40001e-06 [shard]: 1.79e-06 [meta_shard_fg_expand]: 2.15002e-06 [shard_inline]: 7.83999e-06 [merge_send_recv]: 1.011e-05 [auto_parallel]: 7.45e-06 [parallel]: 1.983e-05 [flash_sp]: 7.83001e-06 [merge_comm]: 5.20001e-06 [allreduce_fusion]: 4.4e-06 [matmul_add_comm_reduction]: 1.052e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 9.20001e-06 [virtual_dataset]: 7.80998e-06 [get_grad_eliminate_]: 7.29001e-06 [virtual_output]: 7.53e-06 [merge_forward]: 4.48001e-06 [cell_reuse_recompute_pass]: 1.86998e-06 [offload_activation]: 1.128e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.564e-05 [merge_recompute_call_nodes]: 1.81e-06 [before_grad]: 1.313e-05 [set_forward_comm_id_for_comm_node_pass]: 4.23999e-06 [meta_fg_expand]: 3.35998e-06 [flash_sp_send_recv_attached]: 2.61e-06 [receive_attached]: 2.96001e-06 [after_resolve]: 1.239e-05 [a_after_grad]: 1.153e-05 [renormalize]: 0.00078191 [add_forward_monad_depend]: 5.69e-06 [auto_monad_grad]: 3.03e-06 [auto_monad_eliminator]: 1.759e-05 [cse]: 3.681e-05 [a_3]: 5.806e-05 [Cycle 2]: 0.00077715, [45] [expand_dump_flag]: 1.69998e-06 [switch_simplify]: 9.32999e-06 [loop_unroll]: 7.31999e-06 [a_1]: 0.00017846 [with_stream_mark]: 1.397e-05 [recompute_prepare]: 7.62002e-06 [updatestate_depend_eliminate]: 4.53001e-06 [updatestate_assign_eliminate]: 3.46001e-06 [updatestate_loads_eliminate]: 2.95002e-06 [parameter_eliminate]: 1.19e-06 [a_2]: 9.296e-05 [accelerated_algorithm]: 8.08999e-06 [shard]: 1.44998e-06 [meta_shard_fg_expand]: 1.99999e-06 [shard_inline]: 7.64002e-06 [merge_send_recv]: 6.25002e-06 [auto_parallel]: 6.32001e-06 [parallel]: 5.45001e-06 [flash_sp]: 3.83001e-06 [merge_comm]: 4.61002e-06 [allreduce_fusion]: 3.93001e-06 [matmul_add_comm_reduction]: 7.66001e-06 [allreduce_slice_to_reducescatter]: 4.80009e-07 [virtual_shard_identity]: 1.02e-05 [virtual_dataset]: 7.29001e-06 [get_grad_eliminate_]: 6.89001e-06 [virtual_output]: 7.56999e-06 [merge_forward]: 4.4e-06 [cell_reuse_recompute_pass]: 1.62001e-06 [offload_activation]: 8.60999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.464e-05 [merge_recompute_call_nodes]: 1.24998e-06 [before_grad]: 1.279e-05 [set_forward_comm_id_for_comm_node_pass]: 4.72998e-06 [meta_fg_expand]: 2.68e-06 [flash_sp_send_recv_attached]: 9.10019e-07 [receive_attached]: 1.58002e-06 [after_resolve]: 1.204e-05 [a_after_grad]: 1.113e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.10999e-06 [auto_monad_grad]: 9.00007e-07 [auto_monad_eliminator]: 9.57999e-06 [cse]: 1.925e-05 [a_3]: 4.505e-05 [py_interpret_to_execute_after_opt_a]: 1.312e-05 [slice_cell_reuse_recomputed_activation]: 1.89e-06 [rewriter_after_opt_a]: 4.049e-05 [convert_after_rewriter]: 7.57998e-06 [order_py_execute_after_rewriter]: 6.33e-06 [mutable_eliminate]: 0.00052941 [opt_b]: 0.00024881, [1] [Cycle 1]: 0.00024247, [7] [b_1]: 0.00015582 [b_2]: 9.20999e-06 [updatestate_depend_eliminate]: 8.94998e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 3.16001e-06 [renormalize]: 6.19999e-07 [cse]: 2.603e-05 [optimize_parallel_all_gather_comm]: 1.925e-05 [overlap_param_gather]: 2.17999e-06 [cconv]: 2.82e-05 [loop_unroll]: 0.0004357 [opt_after_cconv]: 0.00011723, [1] [Cycle 1]: 0.00011111, [7] [c_1]: 3.778e-05 [parameter_eliminate]: 3.46999e-06 [updatestate_depend_eliminate]: 6.34001e-06 [updatestate_assign_eliminate]: 3.12002e-06 [updatestate_loads_eliminate]: 2.94001e-06 [cse]: 2.255e-05 [renormalize]: 4.80009e-07 [remove_dup_value]: 1.582e-05 [tuple_transform]: 8.731e-05, [1] [Cycle 1]: 8.217e-05, [4] [d_1]: 5.339e-05 [none_parameter_eliminate]: 1.59998e-06 [renormalize]: 2.80008e-07 [switch_simplify]: 8.35001e-06 [partial_unused_args_eliminate]: 1.71e-06 [add_recomputation]: 7.091e-05 [cse_after_recomputation]: 2.749e-05, [1] [Cycle 1]: 2.222e-05, [1] [cse]: 1.652e-05 [environ_conv]: 7.11001e-06 [swap_dp_allreduce_reducescatter]: 6.34999e-06 [bias_add_comm_swap]: 3.42002e-06 [label_micro_interleaved_index]: 5.16998e-06 [label_fine_grained_interleaved_index]: 3.04001e-06 [merge_cast_opt]: 1.23002e-06 [slice_recompute_activation]: 2.31e-06 [micro_interleaved_order_control]: 2.48e-06 [assign_add_opt]: 1.25001e-06 [ForceFp32Comm]: 8.00006e-07 [remove_cast_before_assign_add]: 1.05001e-06 [full_micro_interleaved_order_control]: 2.89001e-06 [reorder_send_recv_between_fp_bp]: 3.16001e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 1.14998e-06 [interleave_split_concat_branches]: 1.43002e-06 [interleave_parallel_branches]: 1.12e-06 [overlap_opt_shard_in_pipeline]: 1.10999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.11e-06 [control_data_broadcast_order]: 1.592e-05 [grouped_pairwise_exchange_alltoall]: 1.53002e-06 [offloading_packed_experts]: 4.35e-06 [overlap_recompute_and_grad_model_parallel]: 5.80002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.43002e-06 [overlap_recompute_comm]: 2.32999e-06 [overlap_grad_ring_attention]: 4.57e-06 [overlap_grad_flash_sp]: 2.35e-05 [begin_end_overlap_inline]: 4.99975e-07 [split_matmul_comm_elemetwise]: 2.10002e-06 [split_layernorm_comm]: 1.91998e-06 [handle_group_info]: 1.04e-06 [symbol_engine_optimizer]: 8.595e-05, [1] [Cycle 1]: 8.148e-05, [6] [build]: 3.88999e-06 [elim_shapecalc]: 1.239e-05 [elim_not_effective]: 1.536e-05 [opt_reshape]: 8.3e-06 [fold_const_symbol]: 1.205e-05 [renormalize]: 1.8999e-07 [detach_backward]: 2.04e-06 [pipeline_parallel_scheduler]: 1.49e-06 [auto_monad_reorder]: 2.031e-05 [get_jit_bprop_graph]: 1.96e-06 [rewriter_after_jit_bprop_graph]: 4.58001e-06 [opt_after_jit_grad]: 0.00050105 [validate]: 4.43e-05 Sums bootstrap : 0.000425s : 3.79% type_inference : 0.005645s : 50.30% event_method : 0.000018s : 0.16% auto_monad : 0.000065s : 0.58% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000036s : 0.32% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000027s : 0.24% optimize.rewriter_before_opt_a : 0.000094s : 0.84% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000054s : 0.48% optimize.opt_a.loop_unroll : 0.000039s : 0.34% optimize.opt_a.a_1 : 0.000875s : 7.79% optimize.opt_a.with_stream_mark : 0.000029s : 0.26% optimize.opt_a.recompute_prepare : 0.000017s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000195s : 1.73% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.14% optimize.opt_a.merge_send_recv : 0.000016s : 0.15% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.23% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.17% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000026s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000024s : 0.22% optimize.opt_a.a_after_grad : 0.000023s : 0.20% optimize.opt_a.renormalize : 0.000782s : 6.97% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.24% optimize.opt_a.cse : 0.000056s : 0.50% optimize.opt_a.a_3 : 0.000103s : 0.92% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.36% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000529s : 4.72% optimize.opt_b.b_1 : 0.000156s : 1.39% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000026s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000028s : 0.25% optimize.loop_unroll : 0.000436s : 3.88% optimize.opt_after_cconv.c_1 : 0.000038s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.14% optimize.tuple_transform.d_1 : 0.000053s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000071s : 0.63% optimize.cse_after_recomputation.cse : 0.000017s : 0.15% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000024s : 0.21% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000020s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000501s : 4.46% validate : 0.000044s : 0.39% Time group info: ------[substitution.] 0.000213 38 12.14% : 0.000026s : 3: substitution.cast_eliminate 1.07% : 0.000002s : 3: substitution.elim_not_effective 0.75% : 0.000002s : 3: substitution.fold_const_symbol 3.30% : 0.000007s : 5: substitution.graph_param_transform 69.00% : 0.000147s : 4: substitution.inline 2.73% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.94% : 0.000006s : 6: substitution.remove_not_recompute_node 2.18% : 0.000005s : 4: substitution.replace_old_param 5.91% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005587 2 87.86% : 0.004908s : 1: type_inference.infer 12.14% : 0.000678s : 1: type_inference.specialize ------[replace.] 0.000062 8 63.59% : 0.000040s : 4: replace.inline 36.41% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000155 8 93.07% : 0.000144s : 4: match.inline 6.93% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000258 1504 0.85% : 0.000002s : 15: predicate.accumulaten_eliminater 0.78% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 10: predicate.addn_check_dump 0.78% : 0.000002s : 15: predicate.addn_zero_filter 0.75% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.84% : 0.000005s : 25: predicate.arithmetic_simplify 0.96% : 0.000002s : 15: predicate.cast_eliminate 0.56% : 0.000001s : 10: predicate.check_bprop_eliminate 0.54% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.63% : 0.000002s : 10: predicate.depend_value_elim 0.81% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.99% : 0.000003s : 15: predicate.dict_get_item_eliminator 8.41% : 0.000022s : 15: predicate.dict_set_item_eliminator 1.07% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.33% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.04% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.06% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_depend_swap 1.59% : 0.000004s : 30: predicate.environ_get_eliminate 1.03% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.29% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.11% : 0.000005s : 23: predicate.float_depend_g_call 0.52% : 0.000001s : 10: predicate.float_environ_get_switch 0.79% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.64% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.57% : 0.000001s : 10: predicate.incorporate_call 0.52% : 0.000001s : 10: predicate.incorporate_call_switch 5.69% : 0.000015s : 68: predicate.inline 0.71% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.77% : 0.000002s : 10: predicate.less_batch_normalization 1.58% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.32% : 0.000006s : 44: predicate.load_eliminater 0.81% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.96% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.58% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.59% : 0.000002s : 10: predicate.merge_addn 0.57% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.74% : 0.000002s : 15: predicate.minmaximum_grad 1.21% : 0.000003s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.33% : 0.000001s : 5: predicate.parallel_virtual_node 1.55% : 0.000004s : 23: predicate.partial_defer_inline 1.48% : 0.000004s : 24: predicate.partial_eliminate 0.78% : 0.000002s : 15: predicate.print_const_string_wrapper 0.55% : 0.000001s : 10: predicate.reduce_all_const_elim 1.10% : 0.000003s : 15: predicate.reduce_eliminate 2.38% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 10: predicate.remove_not_recompute_node 1.25% : 0.000003s : 29: predicate.replace_applicator 0.53% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 0.86% : 0.000002s : 15: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 5: predicate.row_tensor_eliminate 0.76% : 0.000002s : 10: predicate.same_eliminate 0.41% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 10: predicate.shard_identity_eliminate 0.67% : 0.000002s : 10: predicate.special_op_eliminate 0.70% : 0.000002s : 10: predicate.specialize_transform 0.77% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.32% : 0.000003s : 23: predicate.switch_defer_inline 1.85% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.66% : 0.000012s : 74: predicate.switch_simplify 0.83% : 0.000002s : 15: predicate.tile_eliminate 0.85% : 0.000002s : 15: predicate.transpose_eliminate 1.50% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.43% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.10% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.63% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.27% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 2.99% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 5: predicate.value_based_eliminate 0.61% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.61% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000533 11 54.08% : 0.000288s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.92% : 0.000245s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026439 192 0.01% : 0.000003s : 1: ForceFp32Comm 12.55% : 0.003318s : 1: add_attr 12.51% : 0.003307s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.29% : 0.000076s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000069s : 1: auto_monad 0.09% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.72% : 0.000454s : 1: bootstrap 0.12% : 0.000032s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.12% : 0.000030s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.09% : 0.000024s : 1: event_method 0.02% : 0.000006s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.68% : 0.000444s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.04% : 0.000539s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 5.26% : 0.001390s : 78: opt.transform.opt_a 0.14% : 0.000036s : 1: opt.transform.opt_after_cconv 0.12% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.51% : 0.000134s : 28: opt.transform.opt_b 0.22% : 0.000059s : 2: opt.transform.opt_trans_graph 0.16% : 0.000043s : 4: opt.transform.symbol_engine_opt 11.58% : 0.003062s : 1: opt_a 0.46% : 0.000121s : 1: opt_after_cconv 1.93% : 0.000510s : 1: opt_after_jit_grad 0.95% : 0.000252s : 1: opt_b 19.89% : 0.005259s : 1: optimize 0.09% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000040s : 1: pre_auto_parallel 0.12% : 0.000031s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.63% : 0.000430s : 1: renormalize.infer 1.30% : 0.000344s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000044s : 1: rewriter_after_opt_a 0.37% : 0.000098s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000089s : 1: symbol_engine_optimizer 0.34% : 0.000090s : 1: tuple_transform 21.41% : 0.005660s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:26.155.165 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:26.155.427 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0162065, [21] [bootstrap]: 0.00045265 [type_inference]: 0.00581242 [event_method]: 1.908e-05 [auto_monad]: 6.178e-05 [graph_reusing]: 6.47001e-06 [inline]: 1.91e-06 [add_attr]: 0.00304478, [1] [add_attr_with_inline]: 0.00303644, [1] [Cycle 1]: 7.128e-05, [2] [tag_attr]: 1.931e-05 [meta_addattr_fg_expand]: 5.76998e-06 [parallel-infer-symbol]: 3.46999e-06 [pre_auto_parallel]: 3.21e-05 [insert-virtual-dataset]: 2.88e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 1.92001e-06 [pipeline_split]: 1.70001e-06 [optimize]: 0.0052247, [53] [py_interpret_to_execute]: 2.746e-05 [rewriter_before_opt_a]: 8.286e-05 [opt_a]: 0.00293767, [2] [Cycle 1]: 0.00209223, [45] [expand_dump_flag]: 2.96999e-06 [switch_simplify]: 4.291e-05 [loop_unroll]: 2.964e-05 [a_1]: 0.00063703 [with_stream_mark]: 1.54e-05 [recompute_prepare]: 8.57e-06 [updatestate_depend_eliminate]: 3.71001e-06 [updatestate_assign_eliminate]: 3.55e-06 [updatestate_loads_eliminate]: 3.01001e-06 [parameter_eliminate]: 1.89e-06 [a_2]: 0.00010996 [accelerated_algorithm]: 7.69002e-06 [shard]: 1.72001e-06 [meta_shard_fg_expand]: 1.65001e-06 [shard_inline]: 6.79999e-06 [merge_send_recv]: 8.17e-06 [auto_parallel]: 6.47001e-06 [parallel]: 1.854e-05 [flash_sp]: 7.77e-06 [merge_comm]: 4.36002e-06 [allreduce_fusion]: 3.93999e-06 [matmul_add_comm_reduction]: 9.37001e-06 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 8.14997e-06 [virtual_dataset]: 6.61999e-06 [get_grad_eliminate_]: 6.23e-06 [virtual_output]: 6.61999e-06 [merge_forward]: 3.50003e-06 [cell_reuse_recompute_pass]: 1.02998e-06 [offload_activation]: 1.005e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.427e-05 [merge_recompute_call_nodes]: 1.52999e-06 [before_grad]: 1.067e-05 [set_forward_comm_id_for_comm_node_pass]: 3.71001e-06 [meta_fg_expand]: 2.93e-06 [flash_sp_send_recv_attached]: 2.60997e-06 [receive_attached]: 2.39001e-06 [after_resolve]: 1.174e-05 [a_after_grad]: 1.016e-05 [renormalize]: 0.00056499 [add_forward_monad_depend]: 5.47001e-06 [auto_monad_grad]: 2.54999e-06 [auto_monad_eliminator]: 1.477e-05 [cse]: 2.775e-05 [a_3]: 6.159e-05 [Cycle 2]: 0.00083261, [45] [expand_dump_flag]: 1.35999e-06 [switch_simplify]: 7.68999e-06 [loop_unroll]: 6.34001e-06 [a_1]: 0.00012916 [with_stream_mark]: 1.204e-05 [recompute_prepare]: 7.06001e-06 [updatestate_depend_eliminate]: 2.96001e-06 [updatestate_assign_eliminate]: 2.49999e-06 [updatestate_loads_eliminate]: 2.27001e-06 [parameter_eliminate]: 1.22e-06 [a_2]: 9.923e-05 [accelerated_algorithm]: 6.49999e-06 [shard]: 1.32e-06 [meta_shard_fg_expand]: 1.52999e-06 [shard_inline]: 6.24001e-06 [merge_send_recv]: 4.83001e-06 [auto_parallel]: 5.52999e-06 [parallel]: 4.79e-06 [flash_sp]: 3.9e-06 [merge_comm]: 3.13e-06 [allreduce_fusion]: 3.01001e-06 [matmul_add_comm_reduction]: 8.43001e-06 [allreduce_slice_to_reducescatter]: 4.40021e-07 [virtual_shard_identity]: 8.49998e-06 [virtual_dataset]: 6.16e-06 [get_grad_eliminate_]: 5.80002e-06 [virtual_output]: 6.48e-06 [merge_forward]: 3.28998e-06 [cell_reuse_recompute_pass]: 1.53002e-06 [offload_activation]: 6.78e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.608e-05 [merge_recompute_call_nodes]: 1.13001e-06 [before_grad]: 9.41e-06 [set_forward_comm_id_for_comm_node_pass]: 3.75e-06 [meta_fg_expand]: 2.24999e-06 [flash_sp_send_recv_attached]: 1.09003e-06 [receive_attached]: 1.27999e-06 [after_resolve]: 1.139e-05 [a_after_grad]: 9.97001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.44e-06 [auto_monad_grad]: 1.54e-06 [auto_monad_eliminator]: 8.75001e-06 [cse]: 1.503e-05 [a_3]: 4.997e-05 [py_interpret_to_execute_after_opt_a]: 1.313e-05 [slice_cell_reuse_recomputed_activation]: 4.55999e-06 [rewriter_after_opt_a]: 4.054e-05 [convert_after_rewriter]: 9.76998e-06 [order_py_execute_after_rewriter]: 7.86001e-06 [mutable_eliminate]: 0.00050174 [opt_b]: 0.00027173, [1] [Cycle 1]: 0.00026084, [7] [b_1]: 0.00016295 [b_2]: 8.1e-06 [updatestate_depend_eliminate]: 7.08998e-06 [updatestate_assign_eliminate]: 2.73e-06 [updatestate_loads_eliminate]: 2.74999e-06 [renormalize]: 5.8001e-07 [cse]: 1.91e-05 [optimize_parallel_all_gather_comm]: 2.021e-05 [overlap_param_gather]: 5.31002e-06 [cconv]: 2.978e-05 [loop_unroll]: 0.00045958 [opt_after_cconv]: 0.00012588, [1] [Cycle 1]: 0.00011671, [7] [c_1]: 3.107e-05 [parameter_eliminate]: 3.16999e-06 [updatestate_depend_eliminate]: 5.37001e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.31e-06 [cse]: 1.742e-05 [renormalize]: 4.80009e-07 [remove_dup_value]: 1.547e-05 [tuple_transform]: 9.441e-05, [1] [Cycle 1]: 8.748e-05, [4] [d_1]: 4.59e-05 [none_parameter_eliminate]: 1.78002e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 7.37002e-06 [partial_unused_args_eliminate]: 4.80001e-06 [add_recomputation]: 5.235e-05 [cse_after_recomputation]: 2.7e-05, [1] [Cycle 1]: 2.015e-05, [1] [cse]: 1.12e-05 [environ_conv]: 8.46002e-06 [swap_dp_allreduce_reducescatter]: 8.45001e-06 [bias_add_comm_swap]: 5.92999e-06 [label_micro_interleaved_index]: 6.69999e-06 [label_fine_grained_interleaved_index]: 5.32001e-06 [merge_cast_opt]: 3.81999e-06 [slice_recompute_activation]: 4.4e-06 [micro_interleaved_order_control]: 4.94e-06 [assign_add_opt]: 3.67998e-06 [ForceFp32Comm]: 3.16999e-06 [remove_cast_before_assign_add]: 3.55e-06 [full_micro_interleaved_order_control]: 5.42999e-06 [reorder_send_recv_between_fp_bp]: 5.66e-06 [comm_op_add_attrs]: 3.97998e-06 [add_comm_op_reuse_tag]: 3.48e-06 [interleave_split_concat_branches]: 3.86001e-06 [interleave_parallel_branches]: 3.88999e-06 [overlap_opt_shard_in_pipeline]: 3.69002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.45e-06 [control_data_broadcast_order]: 1.585e-05 [grouped_pairwise_exchange_alltoall]: 4.22e-06 [offloading_packed_experts]: 7.28999e-06 [overlap_recompute_and_grad_model_parallel]: 7.28e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.73999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.61999e-06 [overlap_recompute_comm]: 4.67e-06 [overlap_grad_ring_attention]: 6.41998e-06 [overlap_grad_flash_sp]: 2.201e-05 [begin_end_overlap_inline]: 3.36001e-06 [split_matmul_comm_elemetwise]: 4.86002e-06 [split_layernorm_comm]: 4.28999e-06 [handle_group_info]: 3.35e-06 [symbol_engine_optimizer]: 9.539e-05, [1] [Cycle 1]: 8.882e-05, [6] [build]: 3.53999e-06 [elim_shapecalc]: 1.066e-05 [elim_not_effective]: 1.273e-05 [opt_reshape]: 7.1e-06 [fold_const_symbol]: 1.023e-05 [renormalize]: 2.19996e-07 [detach_backward]: 3.63999e-06 [pipeline_parallel_scheduler]: 1.89e-06 [auto_monad_reorder]: 1.807e-05 [get_jit_bprop_graph]: 1.74e-06 [rewriter_after_jit_bprop_graph]: 4.32998e-06 [opt_after_jit_grad]: 0.00048165 [validate]: 3.904e-05 Sums bootstrap : 0.000453s : 4.11% type_inference : 0.005812s : 52.76% event_method : 0.000019s : 0.17% auto_monad : 0.000062s : 0.56% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000032s : 0.29% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000027s : 0.25% optimize.rewriter_before_opt_a : 0.000083s : 0.75% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000051s : 0.46% optimize.opt_a.loop_unroll : 0.000036s : 0.33% optimize.opt_a.a_1 : 0.000766s : 6.95% optimize.opt_a.with_stream_mark : 0.000027s : 0.25% optimize.opt_a.recompute_prepare : 0.000016s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000209s : 1.90% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.12% optimize.opt_a.merge_send_recv : 0.000013s : 0.12% optimize.opt_a.auto_parallel : 0.000012s : 0.11% optimize.opt_a.parallel : 0.000023s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.15% optimize.opt_a.virtual_dataset : 0.000013s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000013s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000023s : 0.21% optimize.opt_a.a_after_grad : 0.000020s : 0.18% optimize.opt_a.renormalize : 0.000565s : 5.13% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.21% optimize.opt_a.cse : 0.000043s : 0.39% optimize.opt_a.a_3 : 0.000112s : 1.01% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000041s : 0.37% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000502s : 4.55% optimize.opt_b.b_1 : 0.000163s : 1.48% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000019s : 0.17% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000030s : 0.27% optimize.loop_unroll : 0.000460s : 4.17% optimize.opt_after_cconv.c_1 : 0.000031s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.16% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.14% optimize.tuple_transform.d_1 : 0.000046s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000052s : 0.48% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000008s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000016s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000006s : 0.06% optimize.overlap_grad_flash_sp : 0.000022s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.16% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000482s : 4.37% validate : 0.000039s : 0.35% Time group info: ------[substitution.] 0.000199 28 0.97% : 0.000002s : 2: substitution.elim_not_effective 0.71% : 0.000001s : 2: substitution.fold_const_symbol 2.99% : 0.000006s : 4: substitution.graph_param_transform 80.50% : 0.000160s : 4: substitution.inline 1.72% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.51% : 0.000005s : 4: substitution.remove_not_recompute_node 2.82% : 0.000006s : 4: substitution.replace_old_param 7.78% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005761 2 87.81% : 0.005059s : 1: type_inference.infer 12.19% : 0.000702s : 1: type_inference.specialize ------[replace.] 0.000062 8 63.83% : 0.000039s : 4: replace.inline 36.17% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000171 8 92.08% : 0.000157s : 4: match.inline 7.92% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000204 1278 0.90% : 0.000002s : 13: predicate.accumulaten_eliminater 0.67% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 8: predicate.addn_check_dump 0.90% : 0.000002s : 13: predicate.addn_zero_filter 1.03% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.07% : 0.000004s : 21: predicate.arithmetic_simplify 0.90% : 0.000002s : 13: predicate.cast_eliminate 0.58% : 0.000001s : 8: predicate.check_bprop_eliminate 0.53% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.65% : 0.000001s : 8: predicate.depend_value_elim 0.97% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.07% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.98% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.36% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.11% : 0.000002s : 17: predicate.environ_get_depend_swap 1.69% : 0.000003s : 25: predicate.environ_get_eliminate 1.11% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.43% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.44% : 0.000005s : 21: predicate.float_depend_g_call 0.60% : 0.000001s : 8: predicate.float_environ_get_switch 0.79% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.65% : 0.000001s : 8: predicate.get_grad_eliminate 0.23% : 0.000000s : 4: predicate.graph_param_transform 0.60% : 0.000001s : 8: predicate.incorporate_call 0.50% : 0.000001s : 8: predicate.incorporate_call_switch 6.16% : 0.000013s : 58: predicate.inline 0.73% : 0.000001s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 8: predicate.less_batch_normalization 1.77% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.55% : 0.000005s : 38: predicate.load_eliminater 1.13% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.32% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.60% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.56% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.64% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 13: predicate.minmaximum_grad 1.23% : 0.000003s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.34% : 0.000001s : 4: predicate.parallel_virtual_node 1.75% : 0.000004s : 21: predicate.partial_defer_inline 1.67% : 0.000003s : 21: predicate.partial_eliminate 0.99% : 0.000002s : 13: predicate.print_const_string_wrapper 0.59% : 0.000001s : 8: predicate.reduce_all_const_elim 1.09% : 0.000002s : 13: predicate.reduce_eliminate 2.45% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 8: predicate.remove_not_recompute_node 1.40% : 0.000003s : 25: predicate.replace_applicator 0.59% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 0.88% : 0.000002s : 13: predicate.reshape_eliminate 0.61% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 4: predicate.row_tensor_eliminate 0.85% : 0.000002s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 8: predicate.shard_identity_eliminate 0.74% : 0.000002s : 8: predicate.special_op_eliminate 0.70% : 0.000001s : 8: predicate.specialize_transform 1.10% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.92% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.52% : 0.000003s : 21: predicate.switch_defer_inline 2.07% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.37% : 0.000011s : 67: predicate.switch_simplify 0.82% : 0.000002s : 13: predicate.tile_eliminate 0.94% : 0.000002s : 13: predicate.transpose_eliminate 1.59% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.37% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.41% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.76% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.42% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.21% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 4: predicate.value_based_eliminate 0.61% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000523 11 56.45% : 0.000295s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.55% : 0.000228s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026374 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.58% : 0.003054s : 1: add_attr 11.53% : 0.003040s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.21% : 0.000056s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000071s : 1: auto_monad 0.09% : 0.000025s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.89% : 0.000498s : 1: bootstrap 0.13% : 0.000033s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.11% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000018s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.11% : 0.000029s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000013s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000007s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000007s : 1: interleave_parallel_branches 0.03% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.77% : 0.000466s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.93% : 0.000508s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.05% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 4.52% : 0.001193s : 78: opt.transform.opt_a 0.11% : 0.000030s : 1: opt.transform.opt_after_cconv 0.09% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.38% : 0.000100s : 28: opt.transform.opt_b 0.19% : 0.000051s : 2: opt.transform.opt_trans_graph 0.14% : 0.000037s : 4: opt.transform.symbol_engine_opt 11.15% : 0.002941s : 1: opt_a 0.49% : 0.000129s : 1: opt_after_cconv 1.87% : 0.000492s : 1: opt_after_jit_grad 1.04% : 0.000275s : 1: opt_b 22.64% : 0.005972s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000039s : 1: pre_auto_parallel 0.12% : 0.000031s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.10% : 0.000290s : 1: renormalize.infer 1.01% : 0.000267s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000044s : 1: rewriter_after_opt_a 0.33% : 0.000087s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.37% : 0.000098s : 1: symbol_engine_optimizer 0.37% : 0.000097s : 1: tuple_transform 22.18% : 0.005850s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:26.466.124 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0154009, [21] [bootstrap]: 0.0004233 [type_inference]: 0.00584223 [event_method]: 1.75e-05 [auto_monad]: 6.008e-05 [graph_reusing]: 5.67001e-06 [inline]: 2.37001e-06 [add_attr]: 0.00325694, [1] [add_attr_with_inline]: 0.00324786, [1] [Cycle 1]: 6.341e-05, [2] [tag_attr]: 1.974e-05 [meta_addattr_fg_expand]: 5.72999e-06 [parallel-infer-symbol]: 3.33e-06 [pre_auto_parallel]: 3.416e-05 [insert-virtual-dataset]: 1.554e-05 [parallel-infer-symbol-second]: 8.79983e-07 [dataset_repeat_opt]: 2.55002e-06 [pipeline_split]: 1.64998e-06 [optimize]: 0.00500554, [53] [py_interpret_to_execute]: 2.819e-05 [rewriter_before_opt_a]: 8.279e-05 [opt_a]: 0.00278891, [2] [Cycle 1]: 0.00211061, [45] [expand_dump_flag]: 3.40998e-06 [switch_simplify]: 4.47e-05 [loop_unroll]: 2.999e-05 [a_1]: 0.00063483 [with_stream_mark]: 1.871e-05 [recompute_prepare]: 1.225e-05 [updatestate_depend_eliminate]: 4.50001e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 2.94001e-06 [parameter_eliminate]: 2.34999e-06 [a_2]: 8.309e-05 [accelerated_algorithm]: 7.38e-06 [shard]: 2.98e-06 [meta_shard_fg_expand]: 1.81e-06 [shard_inline]: 6.67002e-06 [merge_send_recv]: 9.36998e-06 [auto_parallel]: 6.89999e-06 [parallel]: 1.981e-05 [flash_sp]: 8.53001e-06 [merge_comm]: 4.42998e-06 [allreduce_fusion]: 3.58e-06 [matmul_add_comm_reduction]: 9.55001e-06 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 9.56e-06 [virtual_dataset]: 6.71e-06 [get_grad_eliminate_]: 6.48998e-06 [virtual_output]: 6.38998e-06 [merge_forward]: 4.00998e-06 [cell_reuse_recompute_pass]: 1.17999e-06 [offload_activation]: 1.034e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.493e-05 [merge_recompute_call_nodes]: 1.70001e-06 [before_grad]: 1.08e-05 [set_forward_comm_id_for_comm_node_pass]: 4.15999e-06 [meta_fg_expand]: 3.12002e-06 [flash_sp_send_recv_attached]: 2.48e-06 [receive_attached]: 2.66999e-06 [after_resolve]: 1.269e-05 [a_after_grad]: 1.074e-05 [renormalize]: 0.00070097 [add_forward_monad_depend]: 6.93e-06 [auto_monad_grad]: 2.79001e-06 [auto_monad_eliminator]: 1.628e-05 [cse]: 3.15e-05 [a_3]: 5.103e-05 [Cycle 2]: 0.00066738, [45] [expand_dump_flag]: 1.65001e-06 [switch_simplify]: 8.46002e-06 [loop_unroll]: 6.44001e-06 [a_1]: 0.0001309 [with_stream_mark]: 1.452e-05 [recompute_prepare]: 6.71e-06 [updatestate_depend_eliminate]: 4.18001e-06 [updatestate_assign_eliminate]: 2.24001e-06 [updatestate_loads_eliminate]: 2.57001e-06 [parameter_eliminate]: 1.09e-06 [a_2]: 7.162e-05 [accelerated_algorithm]: 6.28002e-06 [shard]: 1.79e-06 [meta_shard_fg_expand]: 1.50999e-06 [shard_inline]: 6.24001e-06 [merge_send_recv]: 5.30001e-06 [auto_parallel]: 6.19999e-06 [parallel]: 5.69e-06 [flash_sp]: 3.75e-06 [merge_comm]: 3.48999e-06 [allreduce_fusion]: 3.42002e-06 [matmul_add_comm_reduction]: 6.32001e-06 [allreduce_slice_to_reducescatter]: 6.49976e-07 [virtual_shard_identity]: 7.01001e-06 [virtual_dataset]: 6.44999e-06 [get_grad_eliminate_]: 5.77001e-06 [virtual_output]: 5.61e-06 [merge_forward]: 3.25e-06 [cell_reuse_recompute_pass]: 1.91e-06 [offload_activation]: 7.35e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.329e-05 [merge_recompute_call_nodes]: 1.07e-06 [before_grad]: 9.96998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.51999e-06 [meta_fg_expand]: 2.41e-06 [flash_sp_send_recv_attached]: 1.01002e-06 [receive_attached]: 1.44998e-06 [after_resolve]: 1.261e-05 [a_after_grad]: 1.004e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.45001e-06 [auto_monad_grad]: 1.31998e-06 [auto_monad_eliminator]: 7.29001e-06 [cse]: 1.453e-05 [a_3]: 3.566e-05 [py_interpret_to_execute_after_opt_a]: 1.141e-05 [slice_cell_reuse_recomputed_activation]: 1.97999e-06 [rewriter_after_opt_a]: 3.715e-05 [convert_after_rewriter]: 7.02002e-06 [order_py_execute_after_rewriter]: 5.56e-06 [mutable_eliminate]: 0.00060658 [opt_b]: 0.00021372, [1] [Cycle 1]: 0.00020711, [7] [b_1]: 0.00013184 [b_2]: 7.95e-06 [updatestate_depend_eliminate]: 6.88998e-06 [updatestate_assign_eliminate]: 2.28998e-06 [updatestate_loads_eliminate]: 2.41e-06 [renormalize]: 7.39994e-07 [cse]: 1.928e-05 [optimize_parallel_all_gather_comm]: 1.722e-05 [overlap_param_gather]: 1.93002e-06 [cconv]: 2.698e-05 [loop_unroll]: 0.00050707 [opt_after_cconv]: 0.00010447, [1] [Cycle 1]: 9.877e-05, [7] [c_1]: 3.226e-05 [parameter_eliminate]: 2.89001e-06 [updatestate_depend_eliminate]: 5.34e-06 [updatestate_assign_eliminate]: 2.57001e-06 [updatestate_loads_eliminate]: 2.21e-06 [cse]: 1.858e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.428e-05 [tuple_transform]: 7.648e-05, [1] [Cycle 1]: 7.186e-05, [4] [d_1]: 4.541e-05 [none_parameter_eliminate]: 1.54998e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 6.52001e-06 [partial_unused_args_eliminate]: 1.95001e-06 [add_recomputation]: 5.262e-05 [cse_after_recomputation]: 2.092e-05, [1] [Cycle 1]: 1.629e-05, [1] [cse]: 1.084e-05 [environ_conv]: 5.40999e-06 [swap_dp_allreduce_reducescatter]: 5.20999e-06 [bias_add_comm_swap]: 2.37999e-06 [label_micro_interleaved_index]: 4.71002e-06 [label_fine_grained_interleaved_index]: 2.88e-06 [merge_cast_opt]: 1.25999e-06 [slice_recompute_activation]: 2.27001e-06 [micro_interleaved_order_control]: 2.29001e-06 [assign_add_opt]: 1.17e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 1.07998e-06 [full_micro_interleaved_order_control]: 2.45002e-06 [reorder_send_recv_between_fp_bp]: 2.71e-06 [comm_op_add_attrs]: 1.07e-06 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.48002e-06 [interleave_parallel_branches]: 1.10999e-06 [overlap_opt_shard_in_pipeline]: 1.27e-06 [overlap_opt_shard_grad_in_pipeline]: 2.51e-06 [control_data_broadcast_order]: 1.305e-05 [grouped_pairwise_exchange_alltoall]: 1.55001e-06 [offloading_packed_experts]: 3.6e-06 [overlap_recompute_and_grad_model_parallel]: 4.65999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.16002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30999e-06 [overlap_recompute_comm]: 2.44999e-06 [overlap_grad_ring_attention]: 4.38001e-06 [overlap_grad_flash_sp]: 1.949e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.70997e-06 [split_layernorm_comm]: 1.57001e-06 [handle_group_info]: 1.03001e-06 [symbol_engine_optimizer]: 7.644e-05, [1] [Cycle 1]: 7.224e-05, [6] [build]: 3.47997e-06 [elim_shapecalc]: 9.84999e-06 [elim_not_effective]: 1.332e-05 [opt_reshape]: 7.21999e-06 [fold_const_symbol]: 1.001e-05 [renormalize]: 2.69996e-07 [detach_backward]: 1.78002e-06 [pipeline_parallel_scheduler]: 1.57999e-06 [auto_monad_reorder]: 1.716e-05 [get_jit_bprop_graph]: 1.72999e-06 [rewriter_after_jit_bprop_graph]: 4.48001e-06 [opt_after_jit_grad]: 0.00050833 [validate]: 4.237e-05 Sums bootstrap : 0.000423s : 3.80% type_inference : 0.005842s : 52.38% event_method : 0.000018s : 0.16% auto_monad : 0.000060s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.31% insert-virtual-dataset : 0.000016s : 0.14% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000003s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.25% optimize.rewriter_before_opt_a : 0.000083s : 0.74% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000053s : 0.48% optimize.opt_a.loop_unroll : 0.000036s : 0.33% optimize.opt_a.a_1 : 0.000766s : 6.87% optimize.opt_a.with_stream_mark : 0.000033s : 0.30% optimize.opt_a.recompute_prepare : 0.000019s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000155s : 1.39% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.12% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.12% optimize.opt_a.merge_send_recv : 0.000015s : 0.13% optimize.opt_a.auto_parallel : 0.000013s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.23% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.15% optimize.opt_a.virtual_dataset : 0.000013s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.07% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000025s : 0.23% optimize.opt_a.a_after_grad : 0.000021s : 0.19% optimize.opt_a.renormalize : 0.000701s : 6.29% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.21% optimize.opt_a.cse : 0.000046s : 0.41% optimize.opt_a.a_3 : 0.000087s : 0.78% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000037s : 0.33% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000607s : 5.44% optimize.opt_b.b_1 : 0.000132s : 1.18% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000019s : 0.17% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000027s : 0.24% optimize.loop_unroll : 0.000507s : 4.55% optimize.opt_after_cconv.c_1 : 0.000032s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.17% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.13% optimize.tuple_transform.d_1 : 0.000045s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000053s : 0.47% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000019s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000017s : 0.15% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000508s : 4.56% validate : 0.000042s : 0.38% Time group info: ------[substitution.] 0.000184 28 1.25% : 0.000002s : 2: substitution.elim_not_effective 0.87% : 0.000002s : 2: substitution.fold_const_symbol 3.27% : 0.000006s : 4: substitution.graph_param_transform 78.76% : 0.000145s : 4: substitution.inline 2.17% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.66% : 0.000005s : 4: substitution.remove_not_recompute_node 2.52% : 0.000005s : 4: substitution.replace_old_param 8.50% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005781 2 86.95% : 0.005027s : 1: type_inference.infer 13.05% : 0.000755s : 1: type_inference.specialize ------[replace.] 0.000062 8 63.30% : 0.000039s : 4: replace.inline 36.70% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000156 8 91.15% : 0.000142s : 4: match.inline 8.85% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000208 1278 0.85% : 0.000002s : 13: predicate.accumulaten_eliminater 0.96% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 0.93% : 0.000002s : 13: predicate.addn_zero_filter 0.84% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.24% : 0.000005s : 21: predicate.arithmetic_simplify 1.12% : 0.000002s : 13: predicate.cast_eliminate 0.57% : 0.000001s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 0.91% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.03% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.26% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.44% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_depend_swap 1.79% : 0.000004s : 25: predicate.environ_get_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.48% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.58% : 0.000005s : 21: predicate.float_depend_g_call 0.60% : 0.000001s : 8: predicate.float_environ_get_switch 0.79% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.62% : 0.000001s : 8: predicate.get_grad_eliminate 0.24% : 0.000001s : 4: predicate.graph_param_transform 0.58% : 0.000001s : 8: predicate.incorporate_call 0.49% : 0.000001s : 8: predicate.incorporate_call_switch 6.35% : 0.000013s : 58: predicate.inline 0.92% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 8: predicate.less_batch_normalization 1.80% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.44% : 0.000005s : 38: predicate.load_eliminater 1.06% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.34% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.59% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 13: predicate.minmaximum_grad 1.16% : 0.000002s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.43% : 0.000001s : 4: predicate.parallel_virtual_node 2.06% : 0.000004s : 21: predicate.partial_defer_inline 1.73% : 0.000004s : 21: predicate.partial_eliminate 0.91% : 0.000002s : 13: predicate.print_const_string_wrapper 0.56% : 0.000001s : 8: predicate.reduce_all_const_elim 1.23% : 0.000003s : 13: predicate.reduce_eliminate 2.45% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 8: predicate.remove_not_recompute_node 1.39% : 0.000003s : 25: predicate.replace_applicator 0.45% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 0.95% : 0.000002s : 13: predicate.reshape_eliminate 0.65% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 4: predicate.row_tensor_eliminate 0.66% : 0.000001s : 8: predicate.same_eliminate 0.49% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.70% : 0.000001s : 8: predicate.shard_identity_eliminate 0.68% : 0.000001s : 8: predicate.special_op_eliminate 0.65% : 0.000001s : 8: predicate.specialize_transform 0.92% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.50% : 0.000003s : 21: predicate.switch_defer_inline 2.06% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.22% : 0.000011s : 67: predicate.switch_simplify 0.90% : 0.000002s : 13: predicate.tile_eliminate 0.90% : 0.000002s : 13: predicate.transpose_eliminate 1.63% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.47% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.46% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.56% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.13% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.69% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.43% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.02% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 4: predicate.value_based_eliminate 0.63% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.32% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000531 11 52.52% : 0.000279s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.48% : 0.000252s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025700 192 0.01% : 0.000003s : 1: ForceFp32Comm 12.69% : 0.003263s : 1: add_attr 12.65% : 0.003251s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000057s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.25% : 0.000065s : 1: auto_monad 0.08% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.75% : 0.000451s : 1: bootstrap 0.12% : 0.000030s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.09% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.09% : 0.000024s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.08% : 0.000020s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 2.01% : 0.000516s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.39% : 0.000615s : 1: mutable_eliminate 0.03% : 0.000006s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 4.66% : 0.001197s : 78: opt.transform.opt_a 0.12% : 0.000031s : 1: opt.transform.opt_after_cconv 0.11% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.39% : 0.000101s : 28: opt.transform.opt_b 0.19% : 0.000049s : 2: opt.transform.opt_trans_graph 0.14% : 0.000037s : 4: opt.transform.symbol_engine_opt 10.86% : 0.002792s : 1: opt_a 0.42% : 0.000108s : 1: opt_after_cconv 2.02% : 0.000518s : 1: opt_after_jit_grad 0.85% : 0.000217s : 1: opt_b 19.50% : 0.005010s : 1: optimize 0.08% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000039s : 1: pre_auto_parallel 0.13% : 0.000033s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000018s : 1: remove_dup_value 1.39% : 0.000358s : 1: renormalize.infer 1.29% : 0.000332s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000041s : 1: rewriter_after_opt_a 0.34% : 0.000088s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000006s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.31% : 0.000079s : 1: symbol_engine_optimizer 0.31% : 0.000079s : 1: tuple_transform 22.79% : 0.005858s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:26.755.343 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:26.755.605 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0174037, [21] [bootstrap]: 0.00041859 [type_inference]: 0.00735615 [event_method]: 1.83e-05 [auto_monad]: 6.09e-05 [graph_reusing]: 5.92999e-06 [inline]: 2.01e-06 [add_attr]: 0.00307157, [1] [add_attr_with_inline]: 0.0030634, [1] [Cycle 1]: 6.539e-05, [2] [tag_attr]: 1.77e-05 [meta_addattr_fg_expand]: 5.87999e-06 [parallel-infer-symbol]: 2.94001e-06 [pre_auto_parallel]: 3.132e-05 [insert-virtual-dataset]: 2.32001e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 1.82001e-06 [pipeline_split]: 1.48002e-06 [optimize]: 0.00532315, [53] [py_interpret_to_execute]: 2.799e-05 [rewriter_before_opt_a]: 8.218e-05 [opt_a]: 0.00296592, [2] [Cycle 1]: 0.00210884, [45] [expand_dump_flag]: 3.32002e-06 [switch_simplify]: 4.183e-05 [loop_unroll]: 3.036e-05 [a_1]: 0.00060239 [with_stream_mark]: 1.473e-05 [recompute_prepare]: 8.26002e-06 [updatestate_depend_eliminate]: 4.01001e-06 [updatestate_assign_eliminate]: 3.41999e-06 [updatestate_loads_eliminate]: 2.89001e-06 [parameter_eliminate]: 2.04999e-06 [a_2]: 0.00010793 [accelerated_algorithm]: 6.71999e-06 [shard]: 2.17999e-06 [meta_shard_fg_expand]: 2.29001e-06 [shard_inline]: 6.59999e-06 [merge_send_recv]: 8.40001e-06 [auto_parallel]: 6.21998e-06 [parallel]: 1.828e-05 [flash_sp]: 8.49998e-06 [merge_comm]: 3.88001e-06 [allreduce_fusion]: 3.76999e-06 [matmul_add_comm_reduction]: 9.89001e-06 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 8.37e-06 [virtual_dataset]: 7.01001e-06 [get_grad_eliminate_]: 6.66e-06 [virtual_output]: 6.45002e-06 [merge_forward]: 3.73999e-06 [cell_reuse_recompute_pass]: 1.30001e-06 [offload_activation]: 1.024e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.397e-05 [merge_recompute_call_nodes]: 1.76e-06 [before_grad]: 1.009e-05 [set_forward_comm_id_for_comm_node_pass]: 3.58e-06 [meta_fg_expand]: 3.14999e-06 [flash_sp_send_recv_attached]: 2.56998e-06 [receive_attached]: 2.81999e-06 [after_resolve]: 1.183e-05 [a_after_grad]: 9.97999e-06 [renormalize]: 0.00056238 [add_forward_monad_depend]: 4.94e-06 [auto_monad_grad]: 2.29001e-06 [auto_monad_eliminator]: 1.534e-05 [cse]: 7.641e-05 [a_3]: 6.403e-05 [Cycle 2]: 0.00084348, [45] [expand_dump_flag]: 1.17999e-06 [switch_simplify]: 7.66999e-06 [loop_unroll]: 6.61e-06 [a_1]: 0.00014523 [with_stream_mark]: 1.45e-05 [recompute_prepare]: 6.88998e-06 [updatestate_depend_eliminate]: 3.11999e-06 [updatestate_assign_eliminate]: 2.37999e-06 [updatestate_loads_eliminate]: 2.01e-06 [parameter_eliminate]: 1.29998e-06 [a_2]: 9.892e-05 [accelerated_algorithm]: 6.29001e-06 [shard]: 1.09e-06 [meta_shard_fg_expand]: 1.32999e-06 [shard_inline]: 6.14001e-06 [merge_send_recv]: 5.92999e-06 [auto_parallel]: 5.67001e-06 [parallel]: 5.05001e-06 [flash_sp]: 3.26001e-06 [merge_comm]: 3.31999e-06 [allreduce_fusion]: 3.39001e-06 [matmul_add_comm_reduction]: 6.61e-06 [allreduce_slice_to_reducescatter]: 4.69998e-07 [virtual_shard_identity]: 6.42001e-06 [virtual_dataset]: 1.01e-05 [get_grad_eliminate_]: 6.11e-06 [virtual_output]: 5.70001e-06 [merge_forward]: 2.86e-06 [cell_reuse_recompute_pass]: 1.97999e-06 [offload_activation]: 8.05999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.467e-05 [merge_recompute_call_nodes]: 9.70002e-07 [before_grad]: 9.69e-06 [set_forward_comm_id_for_comm_node_pass]: 3.78001e-06 [meta_fg_expand]: 2.19999e-06 [flash_sp_send_recv_attached]: 9.09989e-07 [receive_attached]: 1.43002e-06 [after_resolve]: 1.297e-05 [a_after_grad]: 9.67001e-06 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 1.59998e-06 [auto_monad_grad]: 1.26002e-06 [auto_monad_eliminator]: 8.38001e-06 [cse]: 1.45e-05 [a_3]: 4.962e-05 [py_interpret_to_execute_after_opt_a]: 1.268e-05 [slice_cell_reuse_recomputed_activation]: 4.89e-06 [rewriter_after_opt_a]: 3.845e-05 [convert_after_rewriter]: 9.77999e-06 [order_py_execute_after_rewriter]: 7.97998e-06 [mutable_eliminate]: 0.00059435 [opt_b]: 0.00026879, [1] [Cycle 1]: 0.00025926, [7] [b_1]: 0.00016899 [b_2]: 8.40001e-06 [updatestate_depend_eliminate]: 5.27001e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.35002e-06 [renormalize]: 6.89994e-07 [cse]: 1.635e-05 [optimize_parallel_all_gather_comm]: 1.987e-05 [overlap_param_gather]: 5.92999e-06 [cconv]: 2.806e-05 [loop_unroll]: 0.00045617 [opt_after_cconv]: 0.00013027, [1] [Cycle 1]: 0.00012127, [7] [c_1]: 3.102e-05 [parameter_eliminate]: 2.76e-06 [updatestate_depend_eliminate]: 5.39e-06 [updatestate_assign_eliminate]: 2.54999e-06 [updatestate_loads_eliminate]: 2.37999e-06 [cse]: 1.84e-05 [renormalize]: 5.8001e-07 [remove_dup_value]: 1.705e-05 [tuple_transform]: 8.977e-05, [1] [Cycle 1]: 8.296e-05, [4] [d_1]: 4.399e-05 [none_parameter_eliminate]: 1.59e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 6.87002e-06 [partial_unused_args_eliminate]: 4.33001e-06 [add_recomputation]: 5.007e-05 [cse_after_recomputation]: 2.674e-05, [1] [Cycle 1]: 1.976e-05, [1] [cse]: 1.068e-05 [environ_conv]: 8.13999e-06 [swap_dp_allreduce_reducescatter]: 7.48999e-06 [bias_add_comm_swap]: 5.04e-06 [label_micro_interleaved_index]: 6.66999e-06 [label_fine_grained_interleaved_index]: 5.55001e-06 [merge_cast_opt]: 3.95e-06 [slice_recompute_activation]: 4.47e-06 [micro_interleaved_order_control]: 4.67998e-06 [assign_add_opt]: 3.62002e-06 [ForceFp32Comm]: 3.3e-06 [remove_cast_before_assign_add]: 3.55e-06 [full_micro_interleaved_order_control]: 4.84998e-06 [reorder_send_recv_between_fp_bp]: 5.14e-06 [comm_op_add_attrs]: 3.59002e-06 [add_comm_op_reuse_tag]: 3.26999e-06 [interleave_split_concat_branches]: 3.37002e-06 [interleave_parallel_branches]: 3.49001e-06 [overlap_opt_shard_in_pipeline]: 3.38e-06 [overlap_opt_shard_grad_in_pipeline]: 4.47003e-06 [control_data_broadcast_order]: 1.437e-05 [grouped_pairwise_exchange_alltoall]: 4.03999e-06 [offloading_packed_experts]: 5.91e-06 [overlap_recompute_and_grad_model_parallel]: 7.3e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.59002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.73999e-06 [overlap_recompute_comm]: 4.63999e-06 [overlap_grad_ring_attention]: 7.03e-06 [overlap_grad_flash_sp]: 2.216e-05 [begin_end_overlap_inline]: 2.86999e-06 [split_matmul_comm_elemetwise]: 4.43999e-06 [split_layernorm_comm]: 4.22e-06 [handle_group_info]: 3.45e-06 [symbol_engine_optimizer]: 9.83e-05, [1] [Cycle 1]: 9.159e-05, [6] [build]: 3.06001e-06 [elim_shapecalc]: 9.57999e-06 [elim_not_effective]: 1.297e-05 [opt_reshape]: 7.95998e-06 [fold_const_symbol]: 1.153e-05 [renormalize]: 1.8999e-07 [detach_backward]: 2.98e-06 [pipeline_parallel_scheduler]: 1.62001e-06 [auto_monad_reorder]: 1.862e-05 [get_jit_bprop_graph]: 1.27e-06 [rewriter_after_jit_bprop_graph]: 3.9e-06 [opt_after_jit_grad]: 0.00051732 [validate]: 3.819e-05 Sums bootstrap : 0.000419s : 3.31% type_inference : 0.007356s : 58.10% event_method : 0.000018s : 0.14% auto_monad : 0.000061s : 0.48% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000031s : 0.25% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000001s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.22% optimize.rewriter_before_opt_a : 0.000082s : 0.65% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000049s : 0.39% optimize.opt_a.loop_unroll : 0.000037s : 0.29% optimize.opt_a.a_1 : 0.000748s : 5.91% optimize.opt_a.with_stream_mark : 0.000029s : 0.23% optimize.opt_a.recompute_prepare : 0.000015s : 0.12% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000207s : 1.63% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.10% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.10% optimize.opt_a.merge_send_recv : 0.000014s : 0.11% optimize.opt_a.auto_parallel : 0.000012s : 0.09% optimize.opt_a.parallel : 0.000023s : 0.18% optimize.opt_a.flash_sp : 0.000012s : 0.09% optimize.opt_a.merge_comm : 0.000007s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.12% optimize.opt_a.virtual_dataset : 0.000017s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.10% optimize.opt_a.virtual_output : 0.000012s : 0.10% optimize.opt_a.merge_forward : 0.000007s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.23% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.16% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.20% optimize.opt_a.a_after_grad : 0.000020s : 0.16% optimize.opt_a.renormalize : 0.000562s : 4.44% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.05% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.19% optimize.opt_a.cse : 0.000091s : 0.72% optimize.opt_a.a_3 : 0.000114s : 0.90% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000038s : 0.30% optimize.convert_after_rewriter : 0.000010s : 0.08% optimize.order_py_execute_after_rewriter : 0.000008s : 0.06% optimize.mutable_eliminate : 0.000594s : 4.69% optimize.opt_b.b_1 : 0.000169s : 1.33% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000016s : 0.13% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.16% optimize.overlap_param_gather : 0.000006s : 0.05% optimize.cconv : 0.000028s : 0.22% optimize.loop_unroll : 0.000456s : 3.60% optimize.opt_after_cconv.c_1 : 0.000031s : 0.25% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.15% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.13% optimize.tuple_transform.d_1 : 0.000044s : 0.35% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.05% optimize.partial_unused_args_eliminate : 0.000004s : 0.03% optimize.add_recomputation : 0.000050s : 0.40% optimize.cse_after_recomputation.cse : 0.000011s : 0.08% optimize.environ_conv : 0.000008s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.06% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000014s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000006s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000022s : 0.18% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.10% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.15% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000517s : 4.09% validate : 0.000038s : 0.30% Time group info: ------[substitution.] 0.000169 28 1.10% : 0.000002s : 2: substitution.elim_not_effective 0.90% : 0.000002s : 2: substitution.fold_const_symbol 3.63% : 0.000006s : 4: substitution.graph_param_transform 76.41% : 0.000129s : 4: substitution.inline 2.14% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.67% : 0.000005s : 4: substitution.remove_not_recompute_node 4.12% : 0.000007s : 4: substitution.replace_old_param 9.02% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007309 2 91.15% : 0.006662s : 1: type_inference.infer 8.85% : 0.000647s : 1: type_inference.specialize ------[replace.] 0.000057 8 62.52% : 0.000036s : 4: replace.inline 37.48% : 0.000021s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000140 8 90.51% : 0.000127s : 4: match.inline 9.49% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000203 1278 0.98% : 0.000002s : 13: predicate.accumulaten_eliminater 0.73% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 8: predicate.addn_check_dump 0.95% : 0.000002s : 13: predicate.addn_zero_filter 0.83% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.98% : 0.000004s : 21: predicate.arithmetic_simplify 0.97% : 0.000002s : 13: predicate.cast_eliminate 0.65% : 0.000001s : 8: predicate.check_bprop_eliminate 0.51% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.65% : 0.000001s : 8: predicate.depend_value_elim 0.90% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.03% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.92% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 4: predicate.elim_not_effective 0.38% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.21% : 0.000002s : 17: predicate.environ_get_depend_swap 1.71% : 0.000003s : 25: predicate.environ_get_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.42% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.60% : 0.000005s : 21: predicate.float_depend_g_call 0.54% : 0.000001s : 8: predicate.float_environ_get_switch 0.83% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 4: predicate.fold_const_symbol 0.74% : 0.000002s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.60% : 0.000001s : 8: predicate.incorporate_call 0.49% : 0.000001s : 8: predicate.incorporate_call_switch 6.25% : 0.000013s : 58: predicate.inline 0.72% : 0.000001s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.77% : 0.000002s : 8: predicate.less_batch_normalization 1.86% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.50% : 0.000005s : 38: predicate.load_eliminater 0.90% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.41% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.63% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.58% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.62% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 13: predicate.minmaximum_grad 1.12% : 0.000002s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.43% : 0.000001s : 4: predicate.parallel_virtual_node 2.01% : 0.000004s : 21: predicate.partial_defer_inline 1.66% : 0.000003s : 21: predicate.partial_eliminate 0.86% : 0.000002s : 13: predicate.print_const_string_wrapper 0.71% : 0.000001s : 8: predicate.reduce_all_const_elim 1.20% : 0.000002s : 13: predicate.reduce_eliminate 2.61% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 8: predicate.remove_not_recompute_node 1.35% : 0.000003s : 25: predicate.replace_applicator 0.52% : 0.000001s : 8: predicate.replace_old_param 0.27% : 0.000001s : 4: predicate.reset_defer_inline 0.97% : 0.000002s : 13: predicate.reshape_eliminate 0.74% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 4: predicate.row_tensor_eliminate 0.88% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 8: predicate.shard_identity_eliminate 0.72% : 0.000001s : 8: predicate.special_op_eliminate 0.66% : 0.000001s : 8: predicate.specialize_transform 0.88% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.51% : 0.000003s : 21: predicate.switch_defer_inline 2.11% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.04% : 0.000010s : 67: predicate.switch_simplify 0.95% : 0.000002s : 13: predicate.tile_eliminate 0.87% : 0.000002s : 13: predicate.transpose_eliminate 1.50% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.31% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.13% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.70% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.45% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.25% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.76% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000486 11 56.61% : 0.000275s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.39% : 0.000211s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027683 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.13% : 0.003080s : 1: add_attr 11.08% : 0.003067s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.19% : 0.000054s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000069s : 1: auto_monad 0.09% : 0.000026s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.66% : 0.000461s : 1: bootstrap 0.11% : 0.000031s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.06% : 0.000017s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.11% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000016s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.10% : 0.000028s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.67% : 0.000462s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.17% : 0.000600s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.05% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000015s : 1: opt.transform.mutable_eliminate 4.25% : 0.001176s : 78: opt.transform.opt_a 0.11% : 0.000030s : 1: opt.transform.opt_after_cconv 0.09% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.38% : 0.000105s : 28: opt.transform.opt_b 0.18% : 0.000049s : 2: opt.transform.opt_trans_graph 0.14% : 0.000038s : 4: opt.transform.symbol_engine_opt 10.73% : 0.002969s : 1: opt_a 0.48% : 0.000134s : 1: opt_after_cconv 1.91% : 0.000527s : 1: opt_after_jit_grad 0.98% : 0.000272s : 1: opt_b 20.28% : 0.005615s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000009s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000039s : 1: pre_auto_parallel 0.11% : 0.000032s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.04% : 0.000288s : 1: renormalize.infer 0.97% : 0.000268s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000042s : 1: rewriter_after_opt_a 0.31% : 0.000086s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.37% : 0.000101s : 1: symbol_engine_optimizer 0.33% : 0.000093s : 1: tuple_transform 26.70% : 0.007391s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:27.918.31 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0142245, [21] [bootstrap]: 0.00042208 [type_inference]: 0.00532998 [event_method]: 1.732e-05 [auto_monad]: 6.39e-05 [graph_reusing]: 5.84e-06 [inline]: 2.08002e-06 [add_attr]: 0.00311826, [1] [add_attr_with_inline]: 0.00310961, [1] [Cycle 1]: 5.612e-05, [2] [tag_attr]: 1.915e-05 [meta_addattr_fg_expand]: 5.78002e-06 [parallel-infer-symbol]: 3.81999e-06 [pre_auto_parallel]: 3.237e-05 [insert-virtual-dataset]: 2.27999e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 2.06998e-06 [pipeline_split]: 1.60001e-06 [optimize]: 0.00454756, [53] [py_interpret_to_execute]: 2.481e-05 [rewriter_before_opt_a]: 7.814e-05 [opt_a]: 0.00255749, [2] [Cycle 1]: 0.00189027, [45] [expand_dump_flag]: 3.35e-06 [switch_simplify]: 4.206e-05 [loop_unroll]: 2.971e-05 [a_1]: 0.00061471 [with_stream_mark]: 1.436e-05 [recompute_prepare]: 8.33999e-06 [updatestate_depend_eliminate]: 3.91999e-06 [updatestate_assign_eliminate]: 3.54002e-06 [updatestate_loads_eliminate]: 2.83998e-06 [parameter_eliminate]: 1.76e-06 [a_2]: 8.095e-05 [accelerated_algorithm]: 6.96001e-06 [shard]: 2.37001e-06 [meta_shard_fg_expand]: 1.69e-06 [shard_inline]: 6.28e-06 [merge_send_recv]: 8.28999e-06 [auto_parallel]: 5.67001e-06 [parallel]: 1.943e-05 [flash_sp]: 7.40998e-06 [merge_comm]: 3.95998e-06 [allreduce_fusion]: 3.74002e-06 [matmul_add_comm_reduction]: 1.002e-05 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 8.13999e-06 [virtual_dataset]: 6.63998e-06 [get_grad_eliminate_]: 6.17999e-06 [virtual_output]: 6.36e-06 [merge_forward]: 4.03001e-06 [cell_reuse_recompute_pass]: 1.05999e-06 [offload_activation]: 9.49e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.265e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.005e-05 [set_forward_comm_id_for_comm_node_pass]: 3.66001e-06 [meta_fg_expand]: 2.68e-06 [flash_sp_send_recv_attached]: 2.86999e-06 [receive_attached]: 2.01e-06 [after_resolve]: 1.136e-05 [a_after_grad]: 1.02e-05 [renormalize]: 0.00055842 [add_forward_monad_depend]: 5.29e-06 [auto_monad_grad]: 1.063e-05 [auto_monad_eliminator]: 1.695e-05 [cse]: 2.965e-05 [a_3]: 4.743e-05 [Cycle 2]: 0.00065777, [45] [expand_dump_flag]: 1.17e-06 [switch_simplify]: 7.57002e-06 [loop_unroll]: 6.14001e-06 [a_1]: 0.00012894 [with_stream_mark]: 1.141e-05 [recompute_prepare]: 6.09999e-06 [updatestate_depend_eliminate]: 2.88e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.63e-06 [parameter_eliminate]: 1.25999e-06 [a_2]: 7.127e-05 [accelerated_algorithm]: 6.10002e-06 [shard]: 1.06002e-06 [meta_shard_fg_expand]: 1.27999e-06 [shard_inline]: 6.12999e-06 [merge_send_recv]: 5.49e-06 [auto_parallel]: 5.92999e-06 [parallel]: 4.43999e-06 [flash_sp]: 3.48e-06 [merge_comm]: 3.31001e-06 [allreduce_fusion]: 3.23998e-06 [matmul_add_comm_reduction]: 5.82999e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 7.19001e-06 [virtual_dataset]: 6.89999e-06 [get_grad_eliminate_]: 7.13e-06 [virtual_output]: 6.06e-06 [merge_forward]: 2.89001e-06 [cell_reuse_recompute_pass]: 1.50001e-06 [offload_activation]: 7.2e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.351e-05 [merge_recompute_call_nodes]: 9.49978e-07 [before_grad]: 1.031e-05 [set_forward_comm_id_for_comm_node_pass]: 3.83001e-06 [meta_fg_expand]: 2.04e-06 [flash_sp_send_recv_attached]: 9.39996e-07 [receive_attached]: 1.29e-06 [after_resolve]: 1.117e-05 [a_after_grad]: 9.57001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.55001e-06 [auto_monad_grad]: 1.25999e-06 [auto_monad_eliminator]: 6.76e-06 [cse]: 1.37e-05 [a_3]: 3.689e-05 [py_interpret_to_execute_after_opt_a]: 8.44998e-06 [slice_cell_reuse_recomputed_activation]: 1.87999e-06 [rewriter_after_opt_a]: 3.44e-05 [convert_after_rewriter]: 6.86001e-06 [order_py_execute_after_rewriter]: 5.05001e-06 [mutable_eliminate]: 0.00050275 [opt_b]: 0.00020957, [1] [Cycle 1]: 0.00020246, [7] [b_1]: 0.00013234 [b_2]: 7.93001e-06 [updatestate_depend_eliminate]: 5.66e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.44999e-06 [renormalize]: 5.60016e-07 [cse]: 1.698e-05 [optimize_parallel_all_gather_comm]: 1.654e-05 [overlap_param_gather]: 1.96003e-06 [cconv]: 2.396e-05 [loop_unroll]: 0.0004146 [opt_after_cconv]: 0.00010413, [1] [Cycle 1]: 9.795e-05, [7] [c_1]: 3.108e-05 [parameter_eliminate]: 2.41998e-06 [updatestate_depend_eliminate]: 5.20001e-06 [updatestate_assign_eliminate]: 2.94001e-06 [updatestate_loads_eliminate]: 2.44001e-06 [cse]: 1.807e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 1.318e-05 [tuple_transform]: 7.683e-05, [1] [Cycle 1]: 7.226e-05, [4] [d_1]: 4.446e-05 [none_parameter_eliminate]: 1.57001e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 7.18e-06 [partial_unused_args_eliminate]: 2.12001e-06 [add_recomputation]: 4.738e-05 [cse_after_recomputation]: 2.286e-05, [1] [Cycle 1]: 1.799e-05, [1] [cse]: 1.195e-05 [environ_conv]: 5.40001e-06 [swap_dp_allreduce_reducescatter]: 5.75001e-06 [bias_add_comm_swap]: 2.60002e-06 [label_micro_interleaved_index]: 4.38001e-06 [label_fine_grained_interleaved_index]: 2.92002e-06 [merge_cast_opt]: 1.43002e-06 [slice_recompute_activation]: 2.12001e-06 [micro_interleaved_order_control]: 2.02001e-06 [assign_add_opt]: 1.17e-06 [ForceFp32Comm]: 9.39996e-07 [remove_cast_before_assign_add]: 1.07998e-06 [full_micro_interleaved_order_control]: 1.97001e-06 [reorder_send_recv_between_fp_bp]: 2.91e-06 [comm_op_add_attrs]: 9.49978e-07 [add_comm_op_reuse_tag]: 9.80013e-07 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.17e-06 [overlap_opt_shard_in_pipeline]: 1.59998e-06 [overlap_opt_shard_grad_in_pipeline]: 1.72001e-06 [control_data_broadcast_order]: 1.26e-05 [grouped_pairwise_exchange_alltoall]: 2.01e-06 [offloading_packed_experts]: 3.71999e-06 [overlap_recompute_and_grad_model_parallel]: 4.27998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.18001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30999e-06 [overlap_recompute_comm]: 2.68e-06 [overlap_grad_ring_attention]: 4.50001e-06 [overlap_grad_flash_sp]: 1.898e-05 [begin_end_overlap_inline]: 8.09989e-07 [split_matmul_comm_elemetwise]: 2.06e-06 [split_layernorm_comm]: 1.62999e-06 [handle_group_info]: 1.14e-06 [symbol_engine_optimizer]: 7.664e-05, [1] [Cycle 1]: 7.231e-05, [6] [build]: 3.09999e-06 [elim_shapecalc]: 9.75002e-06 [elim_not_effective]: 1.248e-05 [opt_reshape]: 7.16999e-06 [fold_const_symbol]: 1.145e-05 [renormalize]: 2.89991e-07 [detach_backward]: 1.72999e-06 [pipeline_parallel_scheduler]: 1.55999e-06 [auto_monad_reorder]: 1.737e-05 [get_jit_bprop_graph]: 1.25001e-06 [rewriter_after_jit_bprop_graph]: 4.15e-06 [opt_after_jit_grad]: 0.00046824 [validate]: 3.69e-05 Sums bootstrap : 0.000422s : 4.16% type_inference : 0.005330s : 52.50% event_method : 0.000017s : 0.17% auto_monad : 0.000064s : 0.63% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000032s : 0.32% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000025s : 0.24% optimize.rewriter_before_opt_a : 0.000078s : 0.77% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000050s : 0.49% optimize.opt_a.loop_unroll : 0.000036s : 0.35% optimize.opt_a.a_1 : 0.000744s : 7.32% optimize.opt_a.with_stream_mark : 0.000026s : 0.25% optimize.opt_a.recompute_prepare : 0.000014s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000152s : 1.50% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.12% optimize.opt_a.merge_send_recv : 0.000014s : 0.14% optimize.opt_a.auto_parallel : 0.000012s : 0.11% optimize.opt_a.parallel : 0.000024s : 0.24% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.15% optimize.opt_a.virtual_dataset : 0.000014s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.13% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000023s : 0.22% optimize.opt_a.a_after_grad : 0.000020s : 0.19% optimize.opt_a.renormalize : 0.000558s : 5.50% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000012s : 0.12% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.23% optimize.opt_a.cse : 0.000043s : 0.43% optimize.opt_a.a_3 : 0.000084s : 0.83% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000034s : 0.34% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000503s : 4.95% optimize.opt_b.b_1 : 0.000132s : 1.30% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000017s : 0.17% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.24% optimize.loop_unroll : 0.000415s : 4.08% optimize.opt_after_cconv.c_1 : 0.000031s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.13% optimize.tuple_transform.d_1 : 0.000044s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000047s : 0.47% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.03% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000019s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000017s : 0.17% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000468s : 4.61% validate : 0.000037s : 0.36% Time group info: ------[substitution.] 0.000171 28 1.06% : 0.000002s : 2: substitution.elim_not_effective 0.83% : 0.000001s : 2: substitution.fold_const_symbol 3.33% : 0.000006s : 4: substitution.graph_param_transform 78.70% : 0.000134s : 4: substitution.inline 1.81% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.89% : 0.000005s : 4: substitution.remove_not_recompute_node 2.57% : 0.000004s : 4: substitution.replace_old_param 8.80% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005273 2 87.66% : 0.004623s : 1: type_inference.infer 12.34% : 0.000651s : 1: type_inference.specialize ------[replace.] 0.000060 8 62.07% : 0.000037s : 4: replace.inline 37.93% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000145 8 90.94% : 0.000132s : 4: match.inline 9.06% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000204 1278 0.92% : 0.000002s : 13: predicate.accumulaten_eliminater 0.68% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 0.92% : 0.000002s : 13: predicate.addn_zero_filter 0.88% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.01% : 0.000004s : 21: predicate.arithmetic_simplify 0.92% : 0.000002s : 13: predicate.cast_eliminate 0.65% : 0.000001s : 8: predicate.check_bprop_eliminate 0.54% : 0.000001s : 8: predicate.compare_switch_simplify 0.20% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 0.97% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.10% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.10% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.40% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_depend_swap 1.65% : 0.000003s : 25: predicate.environ_get_eliminate 1.12% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.43% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.44% : 0.000005s : 21: predicate.float_depend_g_call 0.52% : 0.000001s : 8: predicate.float_environ_get_switch 0.76% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.77% : 0.000002s : 8: predicate.get_grad_eliminate 0.22% : 0.000000s : 4: predicate.graph_param_transform 0.60% : 0.000001s : 8: predicate.incorporate_call 0.47% : 0.000001s : 8: predicate.incorporate_call_switch 6.15% : 0.000013s : 58: predicate.inline 0.74% : 0.000002s : 8: predicate.inline_without_move 0.36% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.90% : 0.000002s : 8: predicate.less_batch_normalization 1.85% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.52% : 0.000005s : 38: predicate.load_eliminater 0.82% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.44% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.65% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.58% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 13: predicate.minmaximum_grad 1.10% : 0.000002s : 4: predicate.mutable_eliminate 0.45% : 0.000001s : 4: predicate.opt_reshape 0.39% : 0.000001s : 4: predicate.parallel_virtual_node 1.70% : 0.000003s : 21: predicate.partial_defer_inline 1.65% : 0.000003s : 21: predicate.partial_eliminate 0.88% : 0.000002s : 13: predicate.print_const_string_wrapper 0.55% : 0.000001s : 8: predicate.reduce_all_const_elim 1.10% : 0.000002s : 13: predicate.reduce_eliminate 2.46% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 8: predicate.remove_not_recompute_node 1.41% : 0.000003s : 25: predicate.replace_applicator 0.55% : 0.000001s : 8: predicate.replace_old_param 0.33% : 0.000001s : 4: predicate.reset_defer_inline 0.89% : 0.000002s : 13: predicate.reshape_eliminate 0.66% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 4: predicate.row_tensor_eliminate 0.85% : 0.000002s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.86% : 0.000002s : 8: predicate.shard_identity_eliminate 0.70% : 0.000001s : 8: predicate.special_op_eliminate 0.72% : 0.000001s : 8: predicate.specialize_transform 1.04% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.59% : 0.000003s : 21: predicate.switch_defer_inline 2.06% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.17% : 0.000011s : 67: predicate.switch_simplify 0.86% : 0.000002s : 13: predicate.tile_eliminate 0.92% : 0.000002s : 13: predicate.transpose_eliminate 1.57% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.48% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.50% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.24% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.79% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.53% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.15% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 4: predicate.value_based_eliminate 0.70% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 8: predicate.virtual_output_eliminate 0.29% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.54% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000476 11 54.87% : 0.000261s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.13% : 0.000215s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023764 192 0.02% : 0.000004s : 1: ForceFp32Comm 13.14% : 0.003124s : 1: add_attr 13.10% : 0.003113s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000051s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.29% : 0.000069s : 1: auto_monad 0.09% : 0.000021s : 1: auto_monad_reorder 0.02% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.89% : 0.000449s : 1: bootstrap 0.12% : 0.000028s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000026s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.10% : 0.000023s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.78% : 0.000423s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.15% : 0.000511s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 4.89% : 0.001162s : 78: opt.transform.opt_a 0.12% : 0.000030s : 1: opt.transform.opt_after_cconv 0.10% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.45% : 0.000107s : 28: opt.transform.opt_b 0.21% : 0.000049s : 2: opt.transform.opt_trans_graph 0.16% : 0.000037s : 4: opt.transform.symbol_engine_opt 10.77% : 0.002560s : 1: opt_a 0.45% : 0.000108s : 1: opt_after_cconv 2.01% : 0.000477s : 1: opt_after_jit_grad 0.90% : 0.000213s : 1: opt_b 19.16% : 0.004552s : 1: optimize 0.08% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000022s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000037s : 1: pre_auto_parallel 0.12% : 0.000029s : 1: py_interpret_to_execute 0.05% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000017s : 1: remove_dup_value 1.23% : 0.000293s : 1: renormalize.infer 1.09% : 0.000258s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000038s : 1: rewriter_after_opt_a 0.35% : 0.000082s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000006s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000079s : 1: symbol_engine_optimizer 0.34% : 0.000080s : 1: tuple_transform 22.50% : 0.005346s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:27.387.232 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:27.387.514 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0167407, [21] [bootstrap]: 0.00045038 [type_inference]: 0.00583746 [event_method]: 2.072e-05 [auto_monad]: 6.413e-05 [graph_reusing]: 5.41998e-06 [inline]: 2.26e-06 [add_attr]: 0.003186, [1] [add_attr_with_inline]: 0.00317664, [1] [Cycle 1]: 7.866e-05, [2] [tag_attr]: 2.007e-05 [meta_addattr_fg_expand]: 6.36998e-06 [parallel-infer-symbol]: 3.38e-06 [pre_auto_parallel]: 3.566e-05 [insert-virtual-dataset]: 2.42001e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 1.79e-06 [pipeline_split]: 1.86e-06 [optimize]: 0.00593674, [53] [py_interpret_to_execute]: 3.04e-05 [rewriter_before_opt_a]: 9.27e-05 [opt_a]: 0.00345606, [2] [Cycle 1]: 0.00249476, [45] [expand_dump_flag]: 3.28e-06 [switch_simplify]: 4.332e-05 [loop_unroll]: 3.211e-05 [a_1]: 0.00067319 [with_stream_mark]: 2.012e-05 [recompute_prepare]: 1.155e-05 [updatestate_depend_eliminate]: 4.92999e-06 [updatestate_assign_eliminate]: 4.34002e-06 [updatestate_loads_eliminate]: 3.68999e-06 [parameter_eliminate]: 2.06998e-06 [a_2]: 0.00012982 [accelerated_algorithm]: 8.76002e-06 [shard]: 2.34999e-06 [meta_shard_fg_expand]: 2.21e-06 [shard_inline]: 8.15999e-06 [merge_send_recv]: 9.69e-06 [auto_parallel]: 7.15998e-06 [parallel]: 4.338e-05 [flash_sp]: 9.67001e-06 [merge_comm]: 5.47999e-06 [allreduce_fusion]: 4.4e-06 [matmul_add_comm_reduction]: 1.162e-05 [allreduce_slice_to_reducescatter]: 9.70002e-07 [virtual_shard_identity]: 1.018e-05 [virtual_dataset]: 8.27e-06 [get_grad_eliminate_]: 7.98999e-06 [virtual_output]: 8.64e-06 [merge_forward]: 4.89998e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 1.165e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.899e-05 [merge_recompute_call_nodes]: 1.52999e-06 [before_grad]: 1.421e-05 [set_forward_comm_id_for_comm_node_pass]: 4.61002e-06 [meta_fg_expand]: 3.51999e-06 [flash_sp_send_recv_attached]: 2.61e-06 [receive_attached]: 2.62001e-06 [after_resolve]: 1.345e-05 [a_after_grad]: 1.353e-05 [renormalize]: 0.00078281 [add_forward_monad_depend]: 6.71e-06 [auto_monad_grad]: 2.78998e-06 [auto_monad_eliminator]: 1.96e-05 [cse]: 3.706e-05 [a_3]: 7.95e-05 [Cycle 2]: 0.00094694, [45] [expand_dump_flag]: 1.30999e-06 [switch_simplify]: 9.31e-06 [loop_unroll]: 7.56999e-06 [a_1]: 0.00017815 [with_stream_mark]: 1.379e-05 [recompute_prepare]: 8.22998e-06 [updatestate_depend_eliminate]: 4.39998e-06 [updatestate_assign_eliminate]: 3.45e-06 [updatestate_loads_eliminate]: 2.96001e-06 [parameter_eliminate]: 1.32999e-06 [a_2]: 0.00012106 [accelerated_algorithm]: 7.63999e-06 [shard]: 1.32999e-06 [meta_shard_fg_expand]: 1.89999e-06 [shard_inline]: 7.98999e-06 [merge_send_recv]: 7.37002e-06 [auto_parallel]: 7.45e-06 [parallel]: 5.09003e-06 [flash_sp]: 3.54002e-06 [merge_comm]: 4.15999e-06 [allreduce_fusion]: 4.37998e-06 [matmul_add_comm_reduction]: 8.94e-06 [allreduce_slice_to_reducescatter]: 5.49975e-07 [virtual_shard_identity]: 8.52998e-06 [virtual_dataset]: 7.35e-06 [get_grad_eliminate_]: 7.71999e-06 [virtual_output]: 7.05e-06 [merge_forward]: 3.48e-06 [cell_reuse_recompute_pass]: 1.84e-06 [offload_activation]: 8.30999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.692e-05 [merge_recompute_call_nodes]: 1.09e-06 [before_grad]: 1.211e-05 [set_forward_comm_id_for_comm_node_pass]: 4.25e-06 [meta_fg_expand]: 2.84999e-06 [flash_sp_send_recv_attached]: 1.10001e-06 [receive_attached]: 1.57001e-06 [after_resolve]: 1.152e-05 [a_after_grad]: 1.164e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.09003e-06 [auto_monad_grad]: 9.49978e-07 [auto_monad_eliminator]: 8.95001e-06 [cse]: 1.879e-05 [a_3]: 6.591e-05 [py_interpret_to_execute_after_opt_a]: 1.393e-05 [slice_cell_reuse_recomputed_activation]: 4.79998e-06 [rewriter_after_opt_a]: 4.357e-05 [convert_after_rewriter]: 1.104e-05 [order_py_execute_after_rewriter]: 8.55999e-06 [mutable_eliminate]: 0.00057673 [opt_b]: 0.00030936, [1] [Cycle 1]: 0.00029944, [7] [b_1]: 0.0001967 [b_2]: 9.57001e-06 [updatestate_depend_eliminate]: 6.09999e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 3.06999e-06 [renormalize]: 3.30008e-07 [cse]: 2.536e-05 [optimize_parallel_all_gather_comm]: 2.091e-05 [overlap_param_gather]: 4.74e-06 [cconv]: 3.01e-05 [loop_unroll]: 0.0004599 [opt_after_cconv]: 0.00015465, [1] [Cycle 1]: 0.00013188, [7] [c_1]: 3.767e-05 [parameter_eliminate]: 3.01001e-06 [updatestate_depend_eliminate]: 5.98002e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 2.98e-06 [cse]: 2.291e-05 [renormalize]: 4.59986e-07 [remove_dup_value]: 1.896e-05 [tuple_transform]: 0.00010337, [1] [Cycle 1]: 9.569e-05, [4] [d_1]: 5.415e-05 [none_parameter_eliminate]: 1.92999e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 8.43999e-06 [partial_unused_args_eliminate]: 4.67e-06 [add_recomputation]: 5.87e-05 [cse_after_recomputation]: 3.268e-05, [1] [Cycle 1]: 2.524e-05, [1] [cse]: 1.617e-05 [environ_conv]: 9.48997e-06 [swap_dp_allreduce_reducescatter]: 8.94e-06 [bias_add_comm_swap]: 5.70001e-06 [label_micro_interleaved_index]: 6.63e-06 [label_fine_grained_interleaved_index]: 5.55001e-06 [merge_cast_opt]: 3.74002e-06 [slice_recompute_activation]: 4.4e-06 [micro_interleaved_order_control]: 4.53999e-06 [assign_add_opt]: 4.03001e-06 [ForceFp32Comm]: 3.45e-06 [remove_cast_before_assign_add]: 3.76999e-06 [full_micro_interleaved_order_control]: 4.42e-06 [reorder_send_recv_between_fp_bp]: 5.24003e-06 [comm_op_add_attrs]: 3.4e-06 [add_comm_op_reuse_tag]: 3.25e-06 [interleave_split_concat_branches]: 3.6e-06 [interleave_parallel_branches]: 3.56999e-06 [overlap_opt_shard_in_pipeline]: 3.53e-06 [overlap_opt_shard_grad_in_pipeline]: 4.57e-06 [control_data_broadcast_order]: 1.735e-05 [grouped_pairwise_exchange_alltoall]: 4.27e-06 [offloading_packed_experts]: 7.00002e-06 [overlap_recompute_and_grad_model_parallel]: 7.77002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.65e-06 [overlap_recompute_allgather_and_fa_grad]: 3.81001e-06 [overlap_recompute_comm]: 4.75001e-06 [overlap_grad_ring_attention]: 7.33e-06 [overlap_grad_flash_sp]: 2.407e-05 [begin_end_overlap_inline]: 2.89999e-06 [split_matmul_comm_elemetwise]: 4.62e-06 [split_layernorm_comm]: 4.68001e-06 [handle_group_info]: 3.25e-06 [symbol_engine_optimizer]: 0.00010496, [1] [Cycle 1]: 9.785e-05, [6] [build]: 3.7e-06 [elim_shapecalc]: 1.095e-05 [elim_not_effective]: 1.585e-05 [opt_reshape]: 8.63001e-06 [fold_const_symbol]: 1.202e-05 [renormalize]: 2.60014e-07 [detach_backward]: 3.65998e-06 [pipeline_parallel_scheduler]: 1.96998e-06 [auto_monad_reorder]: 2.378e-05 [get_jit_bprop_graph]: 1.60999e-06 [rewriter_after_jit_bprop_graph]: 4.42e-06 [opt_after_jit_grad]: 0.00051927 [validate]: 4.274e-05 Sums bootstrap : 0.000450s : 3.82% type_inference : 0.005837s : 49.54% event_method : 0.000021s : 0.18% auto_monad : 0.000064s : 0.54% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000036s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000030s : 0.26% optimize.rewriter_before_opt_a : 0.000093s : 0.79% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.45% optimize.opt_a.loop_unroll : 0.000040s : 0.34% optimize.opt_a.a_1 : 0.000851s : 7.22% optimize.opt_a.with_stream_mark : 0.000034s : 0.29% optimize.opt_a.recompute_prepare : 0.000020s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000251s : 2.13% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.14% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.14% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000048s : 0.41% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.16% optimize.opt_a.virtual_dataset : 0.000016s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.13% optimize.opt_a.virtual_output : 0.000016s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000025s : 0.21% optimize.opt_a.a_after_grad : 0.000025s : 0.21% optimize.opt_a.renormalize : 0.000783s : 6.64% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.24% optimize.opt_a.cse : 0.000056s : 0.47% optimize.opt_a.a_3 : 0.000145s : 1.23% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000044s : 0.37% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000577s : 4.89% optimize.opt_b.b_1 : 0.000197s : 1.67% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000025s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000030s : 0.26% optimize.loop_unroll : 0.000460s : 3.90% optimize.opt_after_cconv.c_1 : 0.000038s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.16% optimize.tuple_transform.d_1 : 0.000054s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000059s : 0.50% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000024s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000005s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000024s : 0.20% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000519s : 4.41% validate : 0.000043s : 0.36% Time group info: ------[substitution.] 0.000205 38 12.36% : 0.000025s : 3: substitution.cast_eliminate 1.09% : 0.000002s : 3: substitution.elim_not_effective 0.77% : 0.000002s : 3: substitution.fold_const_symbol 3.45% : 0.000007s : 5: substitution.graph_param_transform 68.88% : 0.000141s : 4: substitution.inline 2.44% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.86% : 0.000006s : 6: substitution.remove_not_recompute_node 2.11% : 0.000004s : 4: substitution.replace_old_param 6.03% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005784 2 87.48% : 0.005060s : 1: type_inference.infer 12.52% : 0.000724s : 1: type_inference.specialize ------[replace.] 0.000062 8 61.96% : 0.000038s : 4: replace.inline 38.04% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000149 8 92.91% : 0.000138s : 4: match.inline 7.09% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000243 1504 0.85% : 0.000002s : 15: predicate.accumulaten_eliminater 0.73% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.58% : 0.000001s : 10: predicate.addn_check_dump 0.94% : 0.000002s : 15: predicate.addn_zero_filter 0.82% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.06% : 0.000005s : 25: predicate.arithmetic_simplify 1.25% : 0.000003s : 15: predicate.cast_eliminate 0.64% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.63% : 0.000002s : 10: predicate.depend_value_elim 1.00% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.06% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.94% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.42% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_depend_swap 1.74% : 0.000004s : 30: predicate.environ_get_eliminate 1.05% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.33% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.27% : 0.000006s : 23: predicate.float_depend_g_call 0.56% : 0.000001s : 10: predicate.float_environ_get_switch 0.84% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.69% : 0.000002s : 10: predicate.get_grad_eliminate 0.26% : 0.000001s : 5: predicate.graph_param_transform 0.66% : 0.000002s : 10: predicate.incorporate_call 0.54% : 0.000001s : 10: predicate.incorporate_call_switch 6.23% : 0.000015s : 68: predicate.inline 0.87% : 0.000002s : 10: predicate.inline_without_move 0.35% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.87% : 0.000002s : 10: predicate.less_batch_normalization 1.80% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.45% : 0.000006s : 44: predicate.load_eliminater 0.81% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.19% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.59% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.62% : 0.000001s : 10: predicate.merge_addn 0.62% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.67% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 15: predicate.minmaximum_grad 0.91% : 0.000002s : 5: predicate.mutable_eliminate 0.39% : 0.000001s : 5: predicate.opt_reshape 0.36% : 0.000001s : 5: predicate.parallel_virtual_node 1.70% : 0.000004s : 23: predicate.partial_defer_inline 1.64% : 0.000004s : 24: predicate.partial_eliminate 0.85% : 0.000002s : 15: predicate.print_const_string_wrapper 0.61% : 0.000001s : 10: predicate.reduce_all_const_elim 1.18% : 0.000003s : 15: predicate.reduce_eliminate 2.66% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.56% : 0.000001s : 10: predicate.remove_not_recompute_node 1.50% : 0.000004s : 29: predicate.replace_applicator 0.59% : 0.000001s : 10: predicate.replace_old_param 0.26% : 0.000001s : 5: predicate.reset_defer_inline 1.13% : 0.000003s : 15: predicate.reshape_eliminate 0.68% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 5: predicate.row_tensor_eliminate 1.04% : 0.000003s : 10: predicate.same_eliminate 0.56% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.86% : 0.000002s : 10: predicate.shard_identity_eliminate 0.68% : 0.000002s : 10: predicate.special_op_eliminate 0.82% : 0.000002s : 10: predicate.specialize_transform 0.91% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.41% : 0.000003s : 23: predicate.switch_defer_inline 1.99% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.77% : 0.000012s : 74: predicate.switch_simplify 0.80% : 0.000002s : 15: predicate.tile_eliminate 0.93% : 0.000002s : 15: predicate.transpose_eliminate 1.49% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.25% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.41% : 0.000003s : 25: predicate.tuple_list_get_set_item_eliminator 2.27% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.77% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.46% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.26% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 5: predicate.value_based_eliminate 0.68% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 10: predicate.virtual_output_eliminate 0.31% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000552 11 54.82% : 0.000303s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.18% : 0.000250s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028233 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.32% : 0.003195s : 1: add_attr 11.26% : 0.003180s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000062s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.26% : 0.000074s : 1: auto_monad 0.11% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.76% : 0.000498s : 1: bootstrap 0.12% : 0.000033s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000036s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000019s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000031s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.65% : 0.000466s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.06% : 0.000582s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 4.91% : 0.001388s : 78: opt.transform.opt_a 0.13% : 0.000036s : 1: opt.transform.opt_after_cconv 0.11% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.48% : 0.000135s : 28: opt.transform.opt_b 0.21% : 0.000060s : 2: opt.transform.opt_trans_graph 0.16% : 0.000044s : 4: opt.transform.symbol_engine_opt 12.25% : 0.003459s : 1: opt_a 0.56% : 0.000158s : 1: opt_after_cconv 1.88% : 0.000529s : 1: opt_after_jit_grad 1.11% : 0.000313s : 1: opt_b 22.25% : 0.006282s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000043s : 1: pre_auto_parallel 0.12% : 0.000034s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.61% : 0.000454s : 1: renormalize.infer 1.13% : 0.000318s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000047s : 1: rewriter_after_opt_a 0.34% : 0.000096s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000108s : 1: symbol_engine_optimizer 0.38% : 0.000106s : 1: tuple_transform 20.82% : 0.005878s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:27.666.344 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0149786, [21] [bootstrap]: 0.00043717 [type_inference]: 0.00560562 [event_method]: 1.797e-05 [auto_monad]: 6.238e-05 [graph_reusing]: 5.57001e-06 [inline]: 1.89e-06 [add_attr]: 0.00299567, [1] [add_attr_with_inline]: 0.00298718, [1] [Cycle 1]: 5.002e-05, [2] [tag_attr]: 1.76e-05 [meta_addattr_fg_expand]: 6.09001e-06 [parallel-infer-symbol]: 3.14001e-06 [pre_auto_parallel]: 3.091e-05 [insert-virtual-dataset]: 2.60002e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 1.88002e-06 [pipeline_split]: 1.69e-06 [optimize]: 0.00509846, [53] [py_interpret_to_execute]: 2.519e-05 [rewriter_before_opt_a]: 8.277e-05 [opt_a]: 0.00294873, [2] [Cycle 1]: 0.00214165, [45] [expand_dump_flag]: 3.03e-06 [switch_simplify]: 4.357e-05 [loop_unroll]: 3.136e-05 [a_1]: 0.00065712 [with_stream_mark]: 1.47e-05 [recompute_prepare]: 9.98998e-06 [updatestate_depend_eliminate]: 4.43001e-06 [updatestate_assign_eliminate]: 4.47e-06 [updatestate_loads_eliminate]: 3.76999e-06 [parameter_eliminate]: 1.91998e-06 [a_2]: 0.00010114 [accelerated_algorithm]: 8.33999e-06 [shard]: 2.04999e-06 [meta_shard_fg_expand]: 1.91e-06 [shard_inline]: 8.35999e-06 [merge_send_recv]: 9.94001e-06 [auto_parallel]: 7.23e-06 [parallel]: 1.827e-05 [flash_sp]: 7.59002e-06 [merge_comm]: 5.09998e-06 [allreduce_fusion]: 4.20999e-06 [matmul_add_comm_reduction]: 1.045e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 9.22001e-06 [virtual_dataset]: 8.01001e-06 [get_grad_eliminate_]: 7.26001e-06 [virtual_output]: 7.45e-06 [merge_forward]: 4.83001e-06 [cell_reuse_recompute_pass]: 1.07e-06 [offload_activation]: 1.39e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.971e-05 [merge_recompute_call_nodes]: 7.20998e-06 [before_grad]: 1.427e-05 [set_forward_comm_id_for_comm_node_pass]: 5.97999e-06 [meta_fg_expand]: 3.74002e-06 [flash_sp_send_recv_attached]: 3.18e-06 [receive_attached]: 2.29999e-06 [after_resolve]: 1.367e-05 [a_after_grad]: 1.239e-05 [renormalize]: 0.00059823 [add_forward_monad_depend]: 5.44e-06 [auto_monad_grad]: 2.31e-06 [auto_monad_eliminator]: 1.746e-05 [cse]: 3.667e-05 [a_3]: 5.802e-05 [Cycle 2]: 0.00079643, [45] [expand_dump_flag]: 1.21002e-06 [switch_simplify]: 9.75002e-06 [loop_unroll]: 8.47e-06 [a_1]: 0.00018199 [with_stream_mark]: 1.177e-05 [recompute_prepare]: 8.33001e-06 [updatestate_depend_eliminate]: 3.93001e-06 [updatestate_assign_eliminate]: 3.08998e-06 [updatestate_loads_eliminate]: 3.03e-06 [parameter_eliminate]: 1.39998e-06 [a_2]: 0.00010034 [accelerated_algorithm]: 7.85e-06 [shard]: 1.16002e-06 [meta_shard_fg_expand]: 1.79e-06 [shard_inline]: 7.98999e-06 [merge_send_recv]: 6.28e-06 [auto_parallel]: 6.59999e-06 [parallel]: 5.12e-06 [flash_sp]: 3.41001e-06 [merge_comm]: 4.89e-06 [allreduce_fusion]: 4.03999e-06 [matmul_add_comm_reduction]: 6.96999e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 8.60001e-06 [virtual_dataset]: 9.93002e-06 [get_grad_eliminate_]: 7.2e-06 [virtual_output]: 7.15e-06 [merge_forward]: 3.74002e-06 [cell_reuse_recompute_pass]: 1.63002e-06 [offload_activation]: 8.13001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.573e-05 [merge_recompute_call_nodes]: 7.89994e-07 [before_grad]: 1.226e-05 [set_forward_comm_id_for_comm_node_pass]: 4.42998e-06 [meta_fg_expand]: 2.86999e-06 [flash_sp_send_recv_attached]: 9.80013e-07 [receive_attached]: 1.05001e-06 [after_resolve]: 1.208e-05 [a_after_grad]: 1.224e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.77001e-06 [auto_monad_grad]: 1.16997e-06 [auto_monad_eliminator]: 8.57998e-06 [cse]: 1.932e-05 [a_3]: 4.916e-05 [py_interpret_to_execute_after_opt_a]: 1.003e-05 [slice_cell_reuse_recomputed_activation]: 2.46998e-06 [rewriter_after_opt_a]: 4.156e-05 [convert_after_rewriter]: 8.07003e-06 [order_py_execute_after_rewriter]: 6.03998e-06 [mutable_eliminate]: 0.00049815 [opt_b]: 0.00025541, [1] [Cycle 1]: 0.0002487, [7] [b_1]: 0.00016747 [b_2]: 9.61e-06 [updatestate_depend_eliminate]: 6.11998e-06 [updatestate_assign_eliminate]: 3.15002e-06 [updatestate_loads_eliminate]: 3.18e-06 [renormalize]: 4.30009e-07 [cse]: 2.173e-05 [optimize_parallel_all_gather_comm]: 1.78e-05 [overlap_param_gather]: 1.91003e-06 [cconv]: 2.335e-05 [loop_unroll]: 0.0004432 [opt_after_cconv]: 0.0001209, [1] [Cycle 1]: 0.00011516, [7] [c_1]: 4.023e-05 [parameter_eliminate]: 2.66e-06 [updatestate_depend_eliminate]: 6.24001e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 3.15998e-06 [cse]: 2.248e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 1.5e-05 [tuple_transform]: 9.152e-05, [1] [Cycle 1]: 8.68e-05, [4] [d_1]: 5.53e-05 [none_parameter_eliminate]: 1.92999e-06 [renormalize]: 2.80008e-07 [switch_simplify]: 9.47001e-06 [partial_unused_args_eliminate]: 1.89999e-06 [add_recomputation]: 5.711e-05 [cse_after_recomputation]: 2.637e-05, [1] [Cycle 1]: 2.171e-05, [1] [cse]: 1.58e-05 [environ_conv]: 6.21e-06 [swap_dp_allreduce_reducescatter]: 6.09001e-06 [bias_add_comm_swap]: 2.32001e-06 [label_micro_interleaved_index]: 4.06001e-06 [label_fine_grained_interleaved_index]: 2.81999e-06 [merge_cast_opt]: 1.33002e-06 [slice_recompute_activation]: 2.07999e-06 [micro_interleaved_order_control]: 2.66999e-06 [assign_add_opt]: 1.46002e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 1.00001e-06 [full_micro_interleaved_order_control]: 2.11e-06 [reorder_send_recv_between_fp_bp]: 2.79999e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.80013e-07 [interleave_split_concat_branches]: 1.19e-06 [interleave_parallel_branches]: 1.09e-06 [overlap_opt_shard_in_pipeline]: 1.39e-06 [overlap_opt_shard_grad_in_pipeline]: 2.09e-06 [control_data_broadcast_order]: 1.57e-05 [grouped_pairwise_exchange_alltoall]: 1.59e-06 [offloading_packed_experts]: 4.18001e-06 [overlap_recompute_and_grad_model_parallel]: 5.39e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.24e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30999e-06 [overlap_recompute_comm]: 2.48998e-06 [overlap_grad_ring_attention]: 4.60001e-06 [overlap_grad_flash_sp]: 2.118e-05 [begin_end_overlap_inline]: 7.89994e-07 [split_matmul_comm_elemetwise]: 2.34001e-06 [split_layernorm_comm]: 1.67001e-06 [handle_group_info]: 9.49978e-07 [symbol_engine_optimizer]: 8.642e-05, [1] [Cycle 1]: 8.207e-05, [6] [build]: 3.23998e-06 [elim_shapecalc]: 1.125e-05 [elim_not_effective]: 1.598e-05 [opt_reshape]: 8.75001e-06 [fold_const_symbol]: 1.311e-05 [renormalize]: 2.09984e-07 [detach_backward]: 1.91e-06 [pipeline_parallel_scheduler]: 1.69e-06 [auto_monad_reorder]: 1.98e-05 [get_jit_bprop_graph]: 1.27999e-06 [rewriter_after_jit_bprop_graph]: 3.8e-06 [opt_after_jit_grad]: 0.00049963 [validate]: 4.129e-05 Sums bootstrap : 0.000437s : 4.00% type_inference : 0.005606s : 51.32% event_method : 0.000018s : 0.16% auto_monad : 0.000062s : 0.57% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000031s : 0.28% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000025s : 0.23% optimize.rewriter_before_opt_a : 0.000083s : 0.76% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.49% optimize.opt_a.loop_unroll : 0.000040s : 0.36% optimize.opt_a.a_1 : 0.000839s : 7.68% optimize.opt_a.with_stream_mark : 0.000026s : 0.24% optimize.opt_a.recompute_prepare : 0.000018s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000201s : 1.84% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.15% optimize.opt_a.merge_send_recv : 0.000016s : 0.15% optimize.opt_a.auto_parallel : 0.000014s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.21% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.16% optimize.opt_a.virtual_dataset : 0.000018s : 0.16% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000022s : 0.20% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000008s : 0.07% optimize.opt_a.before_grad : 0.000027s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.10% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.24% optimize.opt_a.a_after_grad : 0.000025s : 0.23% optimize.opt_a.renormalize : 0.000598s : 5.48% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.24% optimize.opt_a.cse : 0.000056s : 0.51% optimize.opt_a.a_3 : 0.000107s : 0.98% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000042s : 0.38% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000498s : 4.56% optimize.opt_b.b_1 : 0.000167s : 1.53% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000023s : 0.21% optimize.loop_unroll : 0.000443s : 4.06% optimize.opt_after_cconv.c_1 : 0.000040s : 0.37% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.14% optimize.tuple_transform.d_1 : 0.000055s : 0.51% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.09% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000057s : 0.52% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000500s : 4.57% validate : 0.000041s : 0.38% Time group info: ------[substitution.] 0.000195 38 11.55% : 0.000023s : 3: substitution.cast_eliminate 1.21% : 0.000002s : 3: substitution.elim_not_effective 0.93% : 0.000002s : 3: substitution.fold_const_symbol 3.46% : 0.000007s : 5: substitution.graph_param_transform 67.55% : 0.000132s : 4: substitution.inline 2.18% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.78% : 0.000007s : 6: substitution.remove_not_recompute_node 2.40% : 0.000005s : 4: substitution.replace_old_param 6.94% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005551 2 88.03% : 0.004886s : 1: type_inference.infer 11.97% : 0.000664s : 1: type_inference.specialize ------[replace.] 0.000059 8 61.36% : 0.000036s : 4: replace.inline 38.64% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000141 8 91.63% : 0.000129s : 4: match.inline 8.37% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000243 1504 0.93% : 0.000002s : 15: predicate.accumulaten_eliminater 0.74% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.58% : 0.000001s : 10: predicate.addn_check_dump 0.91% : 0.000002s : 15: predicate.addn_zero_filter 0.87% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.97% : 0.000005s : 25: predicate.arithmetic_simplify 0.99% : 0.000002s : 15: predicate.cast_eliminate 0.59% : 0.000001s : 10: predicate.check_bprop_eliminate 0.61% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.64% : 0.000002s : 10: predicate.depend_value_elim 1.04% : 0.000003s : 15: predicate.dict_get_item_const_eliminator 0.99% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.98% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 5: predicate.elim_not_effective 0.42% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 20: predicate.environ_get_depend_swap 1.73% : 0.000004s : 30: predicate.environ_get_eliminate 1.14% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.36% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.47% : 0.000006s : 23: predicate.float_depend_g_call 0.60% : 0.000001s : 10: predicate.float_environ_get_switch 0.91% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.68% : 0.000002s : 10: predicate.get_grad_eliminate 0.24% : 0.000001s : 5: predicate.graph_param_transform 0.63% : 0.000002s : 10: predicate.incorporate_call 0.54% : 0.000001s : 10: predicate.incorporate_call_switch 6.18% : 0.000015s : 68: predicate.inline 0.86% : 0.000002s : 10: predicate.inline_without_move 0.37% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.86% : 0.000002s : 10: predicate.less_batch_normalization 1.80% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.48% : 0.000006s : 44: predicate.load_eliminater 1.05% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.17% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.76% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.62% : 0.000002s : 10: predicate.merge_addn 0.60% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 15: predicate.minmaximum_grad 0.89% : 0.000002s : 5: predicate.mutable_eliminate 0.40% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.70% : 0.000004s : 23: predicate.partial_defer_inline 1.61% : 0.000004s : 24: predicate.partial_eliminate 0.83% : 0.000002s : 15: predicate.print_const_string_wrapper 0.67% : 0.000002s : 10: predicate.reduce_all_const_elim 1.13% : 0.000003s : 15: predicate.reduce_eliminate 2.52% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 10: predicate.remove_not_recompute_node 1.38% : 0.000003s : 29: predicate.replace_applicator 0.44% : 0.000001s : 10: predicate.replace_old_param 0.28% : 0.000001s : 5: predicate.reset_defer_inline 0.97% : 0.000002s : 15: predicate.reshape_eliminate 0.62% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 5: predicate.row_tensor_eliminate 0.82% : 0.000002s : 10: predicate.same_eliminate 0.44% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 10: predicate.shard_identity_eliminate 0.75% : 0.000002s : 10: predicate.special_op_eliminate 0.79% : 0.000002s : 10: predicate.specialize_transform 0.84% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.46% : 0.000004s : 23: predicate.switch_defer_inline 2.02% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.90% : 0.000012s : 74: predicate.switch_simplify 0.87% : 0.000002s : 15: predicate.tile_eliminate 0.87% : 0.000002s : 15: predicate.transpose_eliminate 1.54% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.69% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.76% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.29% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.77% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.47% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.21% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 5: predicate.value_based_eliminate 0.75% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 10: predicate.virtual_output_eliminate 0.34% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000553 11 61.34% : 0.000339s : 5: func_graph_cloner_run.FuncGraphClonerGraph 38.66% : 0.000214s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025269 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.87% : 0.003000s : 1: add_attr 11.84% : 0.002991s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000062s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000068s : 1: auto_monad 0.10% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.84% : 0.000464s : 1: bootstrap 0.11% : 0.000027s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000011s : 1: convert_after_rewriter 0.12% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.09% : 0.000024s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.79% : 0.000451s : 1: loop_unroll 0.02% : 0.000005s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 2.00% : 0.000507s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 5.44% : 0.001374s : 78: opt.transform.opt_a 0.15% : 0.000039s : 1: opt.transform.opt_after_cconv 0.12% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.56% : 0.000142s : 28: opt.transform.opt_b 0.25% : 0.000062s : 2: opt.transform.opt_trans_graph 0.18% : 0.000045s : 4: opt.transform.symbol_engine_opt 11.68% : 0.002952s : 1: opt_a 0.49% : 0.000125s : 1: opt_after_cconv 2.01% : 0.000509s : 1: opt_after_jit_grad 1.02% : 0.000259s : 1: opt_b 20.19% : 0.005103s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000035s : 1: pre_auto_parallel 0.12% : 0.000029s : 1: py_interpret_to_execute 0.05% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.25% : 0.000316s : 1: renormalize.infer 1.08% : 0.000274s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000046s : 1: rewriter_after_opt_a 0.34% : 0.000087s : 1: rewriter_before_opt_a 0.02% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000089s : 1: symbol_engine_optimizer 0.37% : 0.000095s : 1: tuple_transform 22.24% : 0.005620s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:28.108.032 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:28.108.324 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.016377, [21] [bootstrap]: 0.00044927 [type_inference]: 0.00569461 [event_method]: 1.948e-05 [auto_monad]: 6.509e-05 [graph_reusing]: 6.68e-06 [inline]: 2.12999e-06 [add_attr]: 0.00304407, [1] [add_attr_with_inline]: 0.00303378, [1] [Cycle 1]: 7.111e-05, [2] [tag_attr]: 1.966e-05 [meta_addattr_fg_expand]: 6.29001e-06 [parallel-infer-symbol]: 2.95002e-06 [pre_auto_parallel]: 3.214e-05 [insert-virtual-dataset]: 2.25002e-06 [parallel-infer-symbol-second]: 1.00001e-06 [dataset_repeat_opt]: 1.94999e-06 [pipeline_split]: 1.53002e-06 [optimize]: 0.00591781, [53] [py_interpret_to_execute]: 3.319e-05 [rewriter_before_opt_a]: 0.0001108 [opt_a]: 0.00344303, [2] [Cycle 1]: 0.00241372, [45] [expand_dump_flag]: 3.57002e-06 [switch_simplify]: 4.486e-05 [loop_unroll]: 3.361e-05 [a_1]: 0.00069652 [with_stream_mark]: 1.537e-05 [recompute_prepare]: 1.204e-05 [updatestate_depend_eliminate]: 5.44e-06 [updatestate_assign_eliminate]: 4.58001e-06 [updatestate_loads_eliminate]: 4.84003e-06 [parameter_eliminate]: 2.21e-06 [a_2]: 0.00015121 [accelerated_algorithm]: 9.44998e-06 [shard]: 1.79e-06 [meta_shard_fg_expand]: 2.21e-06 [shard_inline]: 9.17001e-06 [merge_send_recv]: 9.96e-06 [auto_parallel]: 7.7e-06 [parallel]: 2.325e-05 [flash_sp]: 8.13001e-06 [merge_comm]: 5.89999e-06 [allreduce_fusion]: 5.15999e-06 [matmul_add_comm_reduction]: 1.181e-05 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 1.084e-05 [virtual_dataset]: 9.33002e-06 [get_grad_eliminate_]: 8.80001e-06 [virtual_output]: 8.85001e-06 [merge_forward]: 4.98001e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 1.169e-05 [cell_reuse_handle_not_recompute_node_pass]: 2e-05 [merge_recompute_call_nodes]: 1.72001e-06 [before_grad]: 1.502e-05 [set_forward_comm_id_for_comm_node_pass]: 5.14998e-06 [meta_fg_expand]: 4.15e-06 [flash_sp_send_recv_attached]: 3.03e-06 [receive_attached]: 1.94999e-06 [after_resolve]: 1.445e-05 [a_after_grad]: 1.395e-05 [renormalize]: 0.00067232 [add_forward_monad_depend]: 5.67001e-06 [auto_monad_grad]: 2.19001e-06 [auto_monad_eliminator]: 1.905e-05 [cse]: 4.251e-05 [a_3]: 7.698e-05 [Cycle 2]: 0.00101702, [45] [expand_dump_flag]: 1.11002e-06 [switch_simplify]: 9.84001e-06 [loop_unroll]: 8.73001e-06 [a_1]: 0.00020906 [with_stream_mark]: 1.179e-05 [recompute_prepare]: 9.22999e-06 [updatestate_depend_eliminate]: 4.47e-06 [updatestate_assign_eliminate]: 3.66999e-06 [updatestate_loads_eliminate]: 3.65e-06 [parameter_eliminate]: 1.09e-06 [a_2]: 0.00013752 [accelerated_algorithm]: 8.57e-06 [shard]: 1.39998e-06 [meta_shard_fg_expand]: 2.04e-06 [shard_inline]: 8.40999e-06 [merge_send_recv]: 6.33e-06 [auto_parallel]: 6.77002e-06 [parallel]: 4.45999e-06 [flash_sp]: 3.3e-06 [merge_comm]: 4.89998e-06 [allreduce_fusion]: 4.64998e-06 [matmul_add_comm_reduction]: 7.47002e-06 [allreduce_slice_to_reducescatter]: 3.39991e-07 [virtual_shard_identity]: 9.51998e-06 [virtual_dataset]: 8.38001e-06 [get_grad_eliminate_]: 8.07e-06 [virtual_output]: 8.19002e-06 [merge_forward]: 3.82002e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 8.70001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.912e-05 [merge_recompute_call_nodes]: 1.27e-06 [before_grad]: 1.494e-05 [set_forward_comm_id_for_comm_node_pass]: 5.14e-06 [meta_fg_expand]: 3.18998e-06 [flash_sp_send_recv_attached]: 8.30012e-07 [receive_attached]: 1.03001e-06 [after_resolve]: 1.271e-05 [a_after_grad]: 1.336e-05 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 1.45001e-06 [auto_monad_grad]: 1.22e-06 [auto_monad_eliminator]: 9.74999e-06 [cse]: 2.241e-05 [a_3]: 6.633e-05 [py_interpret_to_execute_after_opt_a]: 1.303e-05 [slice_cell_reuse_recomputed_activation]: 5.15001e-06 [rewriter_after_opt_a]: 4.716e-05 [convert_after_rewriter]: 1.124e-05 [order_py_execute_after_rewriter]: 8.97999e-06 [mutable_eliminate]: 0.00048476 [opt_b]: 0.00034624, [1] [Cycle 1]: 0.00033665, [7] [b_1]: 0.00023076 [b_2]: 1.064e-05 [updatestate_depend_eliminate]: 6.73e-06 [updatestate_assign_eliminate]: 3.65998e-06 [updatestate_loads_eliminate]: 3.66001e-06 [renormalize]: 5.09986e-07 [cse]: 2.572e-05 [optimize_parallel_all_gather_comm]: 2.15e-05 [overlap_param_gather]: 7.69002e-06 [cconv]: 2.772e-05 [loop_unroll]: 0.0004374 [opt_after_cconv]: 0.00015412, [1] [Cycle 1]: 0.00014559, [7] [c_1]: 4.405e-05 [parameter_eliminate]: 2.76999e-06 [updatestate_depend_eliminate]: 7.23e-06 [updatestate_assign_eliminate]: 4.03999e-06 [updatestate_loads_eliminate]: 3.60003e-06 [cse]: 2.571e-05 [renormalize]: 3.09985e-07 [remove_dup_value]: 3.921e-05 [tuple_transform]: 0.00011048, [1] [Cycle 1]: 0.00010325, [4] [d_1]: 6.03e-05 [none_parameter_eliminate]: 1.91998e-06 [renormalize]: 6.50005e-07 [switch_simplify]: 9.94999e-06 [partial_unused_args_eliminate]: 4.67998e-06 [add_recomputation]: 6.457e-05 [cse_after_recomputation]: 3.401e-05, [1] [Cycle 1]: 2.685e-05, [1] [cse]: 1.727e-05 [environ_conv]: 9.72001e-06 [swap_dp_allreduce_reducescatter]: 1.024e-05 [bias_add_comm_swap]: 5.08002e-06 [label_micro_interleaved_index]: 6.83e-06 [label_fine_grained_interleaved_index]: 5.24e-06 [merge_cast_opt]: 3.73001e-06 [slice_recompute_activation]: 4.33001e-06 [micro_interleaved_order_control]: 4.99e-06 [assign_add_opt]: 3.66001e-06 [ForceFp32Comm]: 3.12002e-06 [remove_cast_before_assign_add]: 3.48e-06 [full_micro_interleaved_order_control]: 4.52e-06 [reorder_send_recv_between_fp_bp]: 5.20001e-06 [comm_op_add_attrs]: 3.38e-06 [add_comm_op_reuse_tag]: 3.22002e-06 [interleave_split_concat_branches]: 3.7e-06 [interleave_parallel_branches]: 3.36001e-06 [overlap_opt_shard_in_pipeline]: 6.48e-06 [overlap_opt_shard_grad_in_pipeline]: 4.14002e-06 [control_data_broadcast_order]: 1.869e-05 [grouped_pairwise_exchange_alltoall]: 4.35e-06 [offloading_packed_experts]: 7.96001e-06 [overlap_recompute_and_grad_model_parallel]: 8.22998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.80998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.63e-06 [overlap_recompute_comm]: 4.76002e-06 [overlap_grad_ring_attention]: 7.15e-06 [overlap_grad_flash_sp]: 2.665e-05 [begin_end_overlap_inline]: 3.25e-06 [split_matmul_comm_elemetwise]: 4.35999e-06 [split_layernorm_comm]: 4.32e-06 [handle_group_info]: 3.30003e-06 [symbol_engine_optimizer]: 0.00010912, [1] [Cycle 1]: 0.00010247, [6] [build]: 2.97002e-06 [elim_shapecalc]: 1.209e-05 [elim_not_effective]: 1.757e-05 [opt_reshape]: 9.42999e-06 [fold_const_symbol]: 1.465e-05 [renormalize]: 2.60014e-07 [detach_backward]: 3.4e-06 [pipeline_parallel_scheduler]: 1.62999e-06 [auto_monad_reorder]: 2.429e-05 [get_jit_bprop_graph]: 1.71e-06 [rewriter_after_jit_bprop_graph]: 4.72e-06 [opt_after_jit_grad]: 0.00049087 [validate]: 4.217e-05 Sums bootstrap : 0.000449s : 3.87% type_inference : 0.005695s : 49.07% event_method : 0.000019s : 0.17% auto_monad : 0.000065s : 0.56% graph_reusing : 0.000007s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000032s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.29% optimize.rewriter_before_opt_a : 0.000111s : 0.95% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.47% optimize.opt_a.loop_unroll : 0.000042s : 0.36% optimize.opt_a.a_1 : 0.000906s : 7.80% optimize.opt_a.with_stream_mark : 0.000027s : 0.23% optimize.opt_a.recompute_prepare : 0.000021s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000289s : 2.49% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.16% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000018s : 0.15% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000028s : 0.24% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000011s : 0.09% optimize.opt_a.allreduce_fusion : 0.000010s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.18% optimize.opt_a.virtual_dataset : 0.000018s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.15% optimize.opt_a.virtual_output : 0.000017s : 0.15% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.34% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000030s : 0.26% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.23% optimize.opt_a.a_after_grad : 0.000027s : 0.24% optimize.opt_a.renormalize : 0.000672s : 5.79% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.25% optimize.opt_a.cse : 0.000065s : 0.56% optimize.opt_a.a_3 : 0.000143s : 1.23% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000047s : 0.41% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000485s : 4.18% optimize.opt_b.b_1 : 0.000231s : 1.99% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000026s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.19% optimize.overlap_param_gather : 0.000008s : 0.07% optimize.cconv : 0.000028s : 0.24% optimize.loop_unroll : 0.000437s : 3.77% optimize.opt_after_cconv.c_1 : 0.000044s : 0.38% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000026s : 0.22% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000039s : 0.34% optimize.tuple_transform.d_1 : 0.000060s : 0.52% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000001s : 0.01% optimize.tuple_transform.switch_simplify : 0.000010s : 0.09% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000065s : 0.56% optimize.cse_after_recomputation.cse : 0.000017s : 0.15% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.09% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000006s : 0.06% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000019s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000008s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000027s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.21% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000491s : 4.23% validate : 0.000042s : 0.36% Time group info: ------[substitution.] 0.000209 48 14.03% : 0.000029s : 6: substitution.cast_eliminate 1.32% : 0.000003s : 4: substitution.elim_not_effective 0.94% : 0.000002s : 4: substitution.fold_const_symbol 3.52% : 0.000007s : 6: substitution.graph_param_transform 65.75% : 0.000137s : 4: substitution.inline 2.52% : 0.000005s : 8: substitution.j_node_and_user_rematch 3.66% : 0.000008s : 8: substitution.remove_not_recompute_node 2.22% : 0.000005s : 4: substitution.replace_old_param 6.05% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005647 2 86.00% : 0.004857s : 1: type_inference.infer 14.00% : 0.000791s : 1: type_inference.specialize ------[replace.] 0.000059 8 62.25% : 0.000036s : 4: replace.inline 37.75% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000145 8 92.54% : 0.000135s : 4: match.inline 7.46% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000271 1730 0.87% : 0.000002s : 17: predicate.accumulaten_eliminater 0.70% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.61% : 0.000002s : 12: predicate.addn_check_dump 0.86% : 0.000002s : 17: predicate.addn_zero_filter 0.79% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.03% : 0.000006s : 29: predicate.arithmetic_simplify 1.04% : 0.000003s : 17: predicate.cast_eliminate 0.66% : 0.000002s : 12: predicate.check_bprop_eliminate 0.64% : 0.000002s : 12: predicate.compare_switch_simplify 0.22% : 0.000001s : 6: predicate.const_output_eliminate 0.67% : 0.000002s : 12: predicate.depend_value_elim 0.91% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.05% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.92% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.94% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 6: predicate.elim_not_effective 0.42% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.31% : 0.000004s : 23: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 23: predicate.environ_get_depend_swap 1.91% : 0.000005s : 35: predicate.environ_get_eliminate 1.13% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.31% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.24% : 0.000006s : 25: predicate.float_depend_g_call 0.61% : 0.000002s : 12: predicate.float_environ_get_switch 0.94% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.21% : 0.000001s : 6: predicate.fold_const_symbol 0.75% : 0.000002s : 12: predicate.get_grad_eliminate 0.23% : 0.000001s : 6: predicate.graph_param_transform 0.70% : 0.000002s : 12: predicate.incorporate_call 0.60% : 0.000002s : 12: predicate.incorporate_call_switch 6.30% : 0.000017s : 78: predicate.inline 0.87% : 0.000002s : 12: predicate.inline_without_move 0.35% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.80% : 0.000002s : 12: predicate.less_batch_normalization 1.74% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.53% : 0.000007s : 50: predicate.load_eliminater 0.77% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.03% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.56% : 0.000004s : 29: predicate.make_slice_get_slice_eliminator 0.66% : 0.000002s : 12: predicate.merge_addn 0.63% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.65% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 17: predicate.minmaximum_grad 0.85% : 0.000002s : 6: predicate.mutable_eliminate 0.40% : 0.000001s : 6: predicate.opt_reshape 0.54% : 0.000001s : 6: predicate.parallel_virtual_node 1.56% : 0.000004s : 25: predicate.partial_defer_inline 1.63% : 0.000004s : 27: predicate.partial_eliminate 0.85% : 0.000002s : 17: predicate.print_const_string_wrapper 0.74% : 0.000002s : 12: predicate.reduce_all_const_elim 1.08% : 0.000003s : 17: predicate.reduce_eliminate 2.54% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 12: predicate.remove_not_recompute_node 1.32% : 0.000004s : 33: predicate.replace_applicator 0.50% : 0.000001s : 12: predicate.replace_old_param 0.27% : 0.000001s : 6: predicate.reset_defer_inline 0.94% : 0.000003s : 17: predicate.reshape_eliminate 0.66% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 6: predicate.row_tensor_eliminate 0.88% : 0.000002s : 12: predicate.same_eliminate 0.45% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 12: predicate.shard_identity_eliminate 0.73% : 0.000002s : 12: predicate.special_op_eliminate 0.81% : 0.000002s : 12: predicate.specialize_transform 0.81% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.43% : 0.000004s : 25: predicate.switch_defer_inline 1.99% : 0.000005s : 37: predicate.switch_layer_defer_inline 4.81% : 0.000013s : 81: predicate.switch_simplify 0.85% : 0.000002s : 17: predicate.tile_eliminate 0.98% : 0.000003s : 17: predicate.transpose_eliminate 1.64% : 0.000004s : 29: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000004s : 29: predicate.tuple_list_get_item_const_eliminator 1.59% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.06% : 0.000008s : 45: predicate.tuple_list_get_item_eliminator 1.58% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.33% : 0.000006s : 41: predicate.tuple_list_set_item_eliminator 1.82% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.44% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.20% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.55% : 0.000001s : 6: predicate.value_based_eliminate 0.71% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 12: predicate.virtual_output_eliminate 0.33% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000502 11 55.22% : 0.000277s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.78% : 0.000225s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027788 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.99% : 0.003053s : 1: add_attr 10.93% : 0.003037s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.25% : 0.000068s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000074s : 1: auto_monad 0.12% : 0.000032s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.78% : 0.000496s : 1: bootstrap 0.11% : 0.000031s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000022s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000037s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000019s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.11% : 0.000030s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000009s : 1: get_jit_bprop_graph 0.05% : 0.000013s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.59% : 0.000443s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.76% : 0.000490s : 1: mutable_eliminate 0.04% : 0.000011s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 5.45% : 0.001514s : 78: opt.transform.opt_a 0.15% : 0.000043s : 1: opt.transform.opt_after_cconv 0.12% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.60% : 0.000168s : 28: opt.transform.opt_b 0.24% : 0.000068s : 2: opt.transform.opt_trans_graph 0.18% : 0.000050s : 4: opt.transform.symbol_engine_opt 12.40% : 0.003446s : 1: opt_a 0.57% : 0.000158s : 1: opt_after_cconv 1.80% : 0.000501s : 1: opt_after_jit_grad 1.26% : 0.000350s : 1: opt_b 22.47% : 0.006244s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000030s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000009s : 1: overlap_opt_shard_in_pipeline 0.04% : 0.000011s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000040s : 1: pre_auto_parallel 0.13% : 0.000037s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.15% : 0.000043s : 1: remove_dup_value 1.19% : 0.000331s : 1: renormalize.infer 1.20% : 0.000333s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000051s : 1: rewriter_after_opt_a 0.41% : 0.000115s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000112s : 1: symbol_engine_optimizer 0.41% : 0.000113s : 1: tuple_transform 20.61% : 0.005728s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:28.454.283 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0149168, [21] [bootstrap]: 0.00042938 [type_inference]: 0.00546837 [event_method]: 1.876e-05 [auto_monad]: 6.499e-05 [graph_reusing]: 6.01998e-06 [inline]: 1.69998e-06 [add_attr]: 0.00299932, [1] [add_attr_with_inline]: 0.00299081, [1] [Cycle 1]: 5.591e-05, [2] [tag_attr]: 1.962e-05 [meta_addattr_fg_expand]: 6.20002e-06 [parallel-infer-symbol]: 2.96001e-06 [pre_auto_parallel]: 3.097e-05 [insert-virtual-dataset]: 2.81e-06 [parallel-infer-symbol-second]: 7.40023e-07 [dataset_repeat_opt]: 1.94e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00520606, [53] [py_interpret_to_execute]: 2.786e-05 [rewriter_before_opt_a]: 8.583e-05 [opt_a]: 0.00306871, [2] [Cycle 1]: 0.002215, [45] [expand_dump_flag]: 2.93e-06 [switch_simplify]: 4.513e-05 [loop_unroll]: 3.235e-05 [a_1]: 0.00070381 [with_stream_mark]: 1.495e-05 [recompute_prepare]: 1.082e-05 [updatestate_depend_eliminate]: 5.44e-06 [updatestate_assign_eliminate]: 4.45999e-06 [updatestate_loads_eliminate]: 4.13999e-06 [parameter_eliminate]: 2.31e-06 [a_2]: 0.0001185 [accelerated_algorithm]: 9.17001e-06 [shard]: 1.69998e-06 [meta_shard_fg_expand]: 2.24999e-06 [shard_inline]: 9.22001e-06 [merge_send_recv]: 9.57001e-06 [auto_parallel]: 7.77e-06 [parallel]: 1.921e-05 [flash_sp]: 8.3e-06 [merge_comm]: 5.39998e-06 [allreduce_fusion]: 5.14e-06 [matmul_add_comm_reduction]: 1.175e-05 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 1.015e-05 [virtual_dataset]: 9.02e-06 [get_grad_eliminate_]: 8.70001e-06 [virtual_output]: 4.988e-05 [merge_forward]: 5.94e-06 [cell_reuse_recompute_pass]: 1.47001e-06 [offload_activation]: 1.21e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.88e-05 [merge_recompute_call_nodes]: 1.60001e-06 [before_grad]: 1.547e-05 [set_forward_comm_id_for_comm_node_pass]: 6.64001e-06 [meta_fg_expand]: 4.85999e-06 [flash_sp_send_recv_attached]: 3.41999e-06 [receive_attached]: 2.34001e-06 [after_resolve]: 1.467e-05 [a_after_grad]: 1.416e-05 [renormalize]: 0.00062446 [add_forward_monad_depend]: 5.25999e-06 [auto_monad_grad]: 2.34999e-06 [auto_monad_eliminator]: 1.747e-05 [cse]: 4.194e-05 [a_3]: 6.292e-05 [Cycle 2]: 0.00084389, [45] [expand_dump_flag]: 1.13001e-06 [switch_simplify]: 9.99999e-06 [loop_unroll]: 8.50001e-06 [a_1]: 0.00021007 [with_stream_mark]: 1.208e-05 [recompute_prepare]: 8.85001e-06 [updatestate_depend_eliminate]: 4.23999e-06 [updatestate_assign_eliminate]: 3.53999e-06 [updatestate_loads_eliminate]: 3.34001e-06 [parameter_eliminate]: 1.14e-06 [a_2]: 0.00011028 [accelerated_algorithm]: 8.55999e-06 [shard]: 1.10001e-06 [meta_shard_fg_expand]: 1.77001e-06 [shard_inline]: 9.04e-06 [merge_send_recv]: 6.58e-06 [auto_parallel]: 6.88e-06 [parallel]: 5.01002e-06 [flash_sp]: 3.20998e-06 [merge_comm]: 7.23e-06 [allreduce_fusion]: 4.47998e-06 [matmul_add_comm_reduction]: 7.77998e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 9.46998e-06 [virtual_dataset]: 8.83001e-06 [get_grad_eliminate_]: 8.08999e-06 [virtual_output]: 8.00999e-06 [merge_forward]: 4.02998e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 8.42e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.593e-05 [merge_recompute_call_nodes]: 7.39994e-07 [before_grad]: 1.381e-05 [set_forward_comm_id_for_comm_node_pass]: 5.03002e-06 [meta_fg_expand]: 3.16999e-06 [flash_sp_send_recv_attached]: 8.2e-07 [receive_attached]: 1.15999e-06 [after_resolve]: 1.247e-05 [a_after_grad]: 1.288e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.37e-06 [auto_monad_grad]: 9.20001e-07 [auto_monad_eliminator]: 9.51e-06 [cse]: 2.143e-05 [a_3]: 5.343e-05 [py_interpret_to_execute_after_opt_a]: 1.007e-05 [slice_cell_reuse_recomputed_activation]: 1.85001e-06 [rewriter_after_opt_a]: 4.422e-05 [convert_after_rewriter]: 8.48001e-06 [order_py_execute_after_rewriter]: 6.01e-06 [mutable_eliminate]: 0.00046267 [opt_b]: 0.00027391, [1] [Cycle 1]: 0.00026801, [7] [b_1]: 0.00018231 [b_2]: 1.064e-05 [updatestate_depend_eliminate]: 6.59001e-06 [updatestate_assign_eliminate]: 3.55e-06 [updatestate_loads_eliminate]: 3.5e-06 [renormalize]: 3.50003e-07 [cse]: 2.606e-05 [optimize_parallel_all_gather_comm]: 1.861e-05 [overlap_param_gather]: 1.89e-06 [cconv]: 2.248e-05 [loop_unroll]: 0.00041327 [opt_after_cconv]: 0.00012603, [1] [Cycle 1]: 0.00012043, [7] [c_1]: 4.404e-05 [parameter_eliminate]: 2.29001e-06 [updatestate_depend_eliminate]: 6.46e-06 [updatestate_assign_eliminate]: 3.73999e-06 [updatestate_loads_eliminate]: 3.48e-06 [cse]: 2.597e-05 [renormalize]: 3.19997e-07 [remove_dup_value]: 3.499e-05 [tuple_transform]: 9.643e-05, [1] [Cycle 1]: 9.175e-05, [4] [d_1]: 6.11e-05 [none_parameter_eliminate]: 1.74e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 9.39e-06 [partial_unused_args_eliminate]: 2.21e-06 [add_recomputation]: 5.851e-05 [cse_after_recomputation]: 2.657e-05, [1] [Cycle 1]: 2.193e-05, [1] [cse]: 1.64e-05 [environ_conv]: 6.21998e-06 [swap_dp_allreduce_reducescatter]: 6.53998e-06 [bias_add_comm_swap]: 2.84999e-06 [label_micro_interleaved_index]: 3.88001e-06 [label_fine_grained_interleaved_index]: 3.01001e-06 [merge_cast_opt]: 1.34e-06 [slice_recompute_activation]: 1.87001e-06 [micro_interleaved_order_control]: 2.53e-06 [assign_add_opt]: 1.18001e-06 [ForceFp32Comm]: 7.99977e-07 [remove_cast_before_assign_add]: 9.99979e-07 [full_micro_interleaved_order_control]: 2.02999e-06 [reorder_send_recv_between_fp_bp]: 2.71e-06 [comm_op_add_attrs]: 1.55999e-06 [add_comm_op_reuse_tag]: 1.11002e-06 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.02998e-06 [overlap_opt_shard_in_pipeline]: 1.18001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.97999e-06 [control_data_broadcast_order]: 1.649e-05 [grouped_pairwise_exchange_alltoall]: 1.63002e-06 [offloading_packed_experts]: 4.45999e-06 [overlap_recompute_and_grad_model_parallel]: 5.64e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.25999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32e-06 [overlap_recompute_comm]: 2.34999e-06 [overlap_grad_ring_attention]: 5.07e-06 [overlap_grad_flash_sp]: 2.247e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.17999e-06 [split_layernorm_comm]: 1.79e-06 [handle_group_info]: 9.09989e-07 [symbol_engine_optimizer]: 8.909e-05, [1] [Cycle 1]: 8.479e-05, [6] [build]: 2.77002e-06 [elim_shapecalc]: 1.302e-05 [elim_not_effective]: 1.755e-05 [opt_reshape]: 9.27001e-06 [fold_const_symbol]: 1.412e-05 [renormalize]: 2.40019e-07 [detach_backward]: 1.72001e-06 [pipeline_parallel_scheduler]: 1.45001e-06 [auto_monad_reorder]: 1.984e-05 [get_jit_bprop_graph]: 1.26002e-06 [rewriter_after_jit_bprop_graph]: 3.41001e-06 [opt_after_jit_grad]: 0.00047384 [validate]: 4.048e-05 Sums bootstrap : 0.000429s : 3.91% type_inference : 0.005468s : 49.82% event_method : 0.000019s : 0.17% auto_monad : 0.000065s : 0.59% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000031s : 0.28% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.25% optimize.rewriter_before_opt_a : 0.000086s : 0.78% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.50% optimize.opt_a.loop_unroll : 0.000041s : 0.37% optimize.opt_a.a_1 : 0.000914s : 8.33% optimize.opt_a.with_stream_mark : 0.000027s : 0.25% optimize.opt_a.recompute_prepare : 0.000020s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000229s : 2.08% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.16% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000018s : 0.17% optimize.opt_a.merge_send_recv : 0.000016s : 0.15% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.22% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000013s : 0.12% optimize.opt_a.allreduce_fusion : 0.000010s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.18% optimize.opt_a.virtual_dataset : 0.000018s : 0.16% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.15% optimize.opt_a.virtual_output : 0.000058s : 0.53% optimize.opt_a.merge_forward : 0.000010s : 0.09% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000029s : 0.27% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.11% optimize.opt_a.meta_fg_expand : 0.000008s : 0.07% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.25% optimize.opt_a.a_after_grad : 0.000027s : 0.25% optimize.opt_a.renormalize : 0.000625s : 5.69% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.25% optimize.opt_a.cse : 0.000063s : 0.58% optimize.opt_a.a_3 : 0.000116s : 1.06% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000044s : 0.40% optimize.convert_after_rewriter : 0.000008s : 0.08% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000463s : 4.21% optimize.opt_b.b_1 : 0.000182s : 1.66% optimize.opt_b.b_2 : 0.000011s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000026s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000022s : 0.20% optimize.loop_unroll : 0.000413s : 3.76% optimize.opt_after_cconv.c_1 : 0.000044s : 0.40% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000026s : 0.24% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000035s : 0.32% optimize.tuple_transform.d_1 : 0.000061s : 0.56% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.09% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000059s : 0.53% optimize.cse_after_recomputation.cse : 0.000016s : 0.15% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000002s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000022s : 0.20% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.16% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000020s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.03% opt_after_jit_grad : 0.000474s : 4.32% validate : 0.000040s : 0.37% Time group info: ------[substitution.] 0.000207 48 13.45% : 0.000028s : 6: substitution.cast_eliminate 1.21% : 0.000003s : 4: substitution.elim_not_effective 0.97% : 0.000002s : 4: substitution.fold_const_symbol 3.46% : 0.000007s : 6: substitution.graph_param_transform 66.29% : 0.000137s : 4: substitution.inline 2.42% : 0.000005s : 8: substitution.j_node_and_user_rematch 3.45% : 0.000007s : 8: substitution.remove_not_recompute_node 2.45% : 0.000005s : 4: substitution.replace_old_param 6.30% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005414 2 87.21% : 0.004721s : 1: type_inference.infer 12.79% : 0.000692s : 1: type_inference.specialize ------[replace.] 0.000060 8 61.35% : 0.000037s : 4: replace.inline 38.65% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000146 8 92.34% : 0.000135s : 4: match.inline 7.66% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000272 1730 1.00% : 0.000003s : 17: predicate.accumulaten_eliminater 0.79% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.60% : 0.000002s : 12: predicate.addn_check_dump 0.92% : 0.000002s : 17: predicate.addn_zero_filter 0.84% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.99% : 0.000005s : 29: predicate.arithmetic_simplify 1.03% : 0.000003s : 17: predicate.cast_eliminate 0.64% : 0.000002s : 12: predicate.check_bprop_eliminate 0.66% : 0.000002s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.63% : 0.000002s : 12: predicate.depend_value_elim 0.92% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.03% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.95% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 6: predicate.elim_not_effective 0.43% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 23: predicate.environ_get_depend_swap 1.79% : 0.000005s : 35: predicate.environ_get_eliminate 1.12% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.33% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.13% : 0.000006s : 25: predicate.float_depend_g_call 0.59% : 0.000002s : 12: predicate.float_environ_get_switch 0.87% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.71% : 0.000002s : 12: predicate.get_grad_eliminate 0.24% : 0.000001s : 6: predicate.graph_param_transform 0.75% : 0.000002s : 12: predicate.incorporate_call 0.58% : 0.000002s : 12: predicate.incorporate_call_switch 6.24% : 0.000017s : 78: predicate.inline 0.97% : 0.000003s : 12: predicate.inline_without_move 0.36% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 12: predicate.less_batch_normalization 1.92% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.45% : 0.000007s : 50: predicate.load_eliminater 0.74% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.03% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.69% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.66% : 0.000002s : 12: predicate.merge_addn 0.69% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.63% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 17: predicate.minmaximum_grad 0.82% : 0.000002s : 6: predicate.mutable_eliminate 0.36% : 0.000001s : 6: predicate.opt_reshape 0.41% : 0.000001s : 6: predicate.parallel_virtual_node 1.56% : 0.000004s : 25: predicate.partial_defer_inline 1.67% : 0.000005s : 27: predicate.partial_eliminate 0.86% : 0.000002s : 17: predicate.print_const_string_wrapper 0.65% : 0.000002s : 12: predicate.reduce_all_const_elim 1.25% : 0.000003s : 17: predicate.reduce_eliminate 2.55% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 12: predicate.remove_not_recompute_node 1.31% : 0.000004s : 33: predicate.replace_applicator 0.54% : 0.000001s : 12: predicate.replace_old_param 0.28% : 0.000001s : 6: predicate.reset_defer_inline 0.99% : 0.000003s : 17: predicate.reshape_eliminate 0.68% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 6: predicate.row_tensor_eliminate 0.81% : 0.000002s : 12: predicate.same_eliminate 0.41% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 12: predicate.shard_identity_eliminate 0.72% : 0.000002s : 12: predicate.special_op_eliminate 0.83% : 0.000002s : 12: predicate.specialize_transform 0.88% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.38% : 0.000004s : 25: predicate.switch_defer_inline 1.96% : 0.000005s : 37: predicate.switch_layer_defer_inline 4.83% : 0.000013s : 81: predicate.switch_simplify 0.86% : 0.000002s : 17: predicate.tile_eliminate 0.87% : 0.000002s : 17: predicate.transpose_eliminate 1.57% : 0.000004s : 29: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.30% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.56% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.42% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.86% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.47% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.22% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 6: predicate.value_based_eliminate 0.74% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.79% : 0.000002s : 12: predicate.virtual_output_eliminate 0.33% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000499 11 56.04% : 0.000280s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.96% : 0.000219s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025562 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.75% : 0.003004s : 1: add_attr 11.71% : 0.002994s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000062s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000070s : 1: auto_monad 0.09% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.78% : 0.000456s : 1: bootstrap 0.10% : 0.000026s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000012s : 1: convert_after_rewriter 0.12% : 0.000030s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.65% : 0.000421s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.84% : 0.000470s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 6.08% : 0.001553s : 78: opt.transform.opt_a 0.17% : 0.000043s : 1: opt.transform.opt_after_cconv 0.12% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.63% : 0.000161s : 28: opt.transform.opt_b 0.27% : 0.000068s : 2: opt.transform.opt_trans_graph 0.20% : 0.000050s : 4: opt.transform.symbol_engine_opt 12.02% : 0.003072s : 1: opt_a 0.51% : 0.000130s : 1: opt_after_cconv 1.89% : 0.000483s : 1: opt_after_jit_grad 1.08% : 0.000277s : 1: opt_b 20.38% : 0.005210s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000035s : 1: pre_auto_parallel 0.12% : 0.000032s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.15% : 0.000039s : 1: remove_dup_value 1.30% : 0.000331s : 1: renormalize.infer 1.11% : 0.000285s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000048s : 1: rewriter_after_opt_a 0.35% : 0.000090s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000092s : 1: symbol_engine_optimizer 0.39% : 0.000099s : 1: tuple_transform 21.45% : 0.005482s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:28.779.150 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:28.779.406 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.016629, [21] [bootstrap]: 0.00046063 [type_inference]: 0.00593364 [event_method]: 1.948e-05 [auto_monad]: 6.122e-05 [graph_reusing]: 5.92999e-06 [inline]: 2.31998e-06 [add_attr]: 0.00311778, [1] [add_attr_with_inline]: 0.00310902, [1] [Cycle 1]: 7.811e-05, [2] [tag_attr]: 2.009e-05 [meta_addattr_fg_expand]: 7.01001e-06 [parallel-infer-symbol]: 3.38999e-06 [pre_auto_parallel]: 3.342e-05 [insert-virtual-dataset]: 2.26e-06 [parallel-infer-symbol-second]: 8.50006e-07 [dataset_repeat_opt]: 2.34999e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.00579377, [53] [py_interpret_to_execute]: 2.861e-05 [rewriter_before_opt_a]: 8.687e-05 [opt_a]: 0.00339882, [2] [Cycle 1]: 0.00243754, [45] [expand_dump_flag]: 3.17002e-06 [switch_simplify]: 4.299e-05 [loop_unroll]: 3.105e-05 [a_1]: 0.00074305 [with_stream_mark]: 3.126e-05 [recompute_prepare]: 1.137e-05 [updatestate_depend_eliminate]: 5.84e-06 [updatestate_assign_eliminate]: 4.35e-06 [updatestate_loads_eliminate]: 4.08001e-06 [parameter_eliminate]: 2.12001e-06 [a_2]: 0.00013072 [accelerated_algorithm]: 8.76002e-06 [shard]: 1.94e-06 [meta_shard_fg_expand]: 2.09999e-06 [shard_inline]: 8.13001e-06 [merge_send_recv]: 1.001e-05 [auto_parallel]: 7.7e-06 [parallel]: 1.92e-05 [flash_sp]: 8.84003e-06 [merge_comm]: 4.99e-06 [allreduce_fusion]: 4.34997e-06 [matmul_add_comm_reduction]: 1.087e-05 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 1.008e-05 [virtual_dataset]: 9.27999e-06 [get_grad_eliminate_]: 8.60999e-06 [virtual_output]: 8.60999e-06 [merge_forward]: 4.79998e-06 [cell_reuse_recompute_pass]: 1.70001e-06 [offload_activation]: 1.099e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.858e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.325e-05 [set_forward_comm_id_for_comm_node_pass]: 4.47e-06 [meta_fg_expand]: 3.31999e-06 [flash_sp_send_recv_attached]: 2.61999e-06 [receive_attached]: 2.58e-06 [after_resolve]: 1.292e-05 [a_after_grad]: 1.334e-05 [renormalize]: 0.00067534 [add_forward_monad_depend]: 5.66998e-06 [auto_monad_grad]: 2.27001e-06 [auto_monad_eliminator]: 1.693e-05 [cse]: 3.49e-05 [a_3]: 7.236e-05 [Cycle 2]: 0.00094811, [45] [expand_dump_flag]: 1.12e-06 [switch_simplify]: 1.02e-05 [loop_unroll]: 7.73001e-06 [a_1]: 0.00017955 [with_stream_mark]: 1.266e-05 [recompute_prepare]: 7.85e-06 [updatestate_depend_eliminate]: 3.8e-06 [updatestate_assign_eliminate]: 3.23998e-06 [updatestate_loads_eliminate]: 3.03e-06 [parameter_eliminate]: 1.42e-06 [a_2]: 0.00012152 [accelerated_algorithm]: 7.87e-06 [shard]: 1.40001e-06 [meta_shard_fg_expand]: 1.79e-06 [shard_inline]: 7.43e-06 [merge_send_recv]: 6.74001e-06 [auto_parallel]: 6.89999e-06 [parallel]: 5.76998e-06 [flash_sp]: 4e-06 [merge_comm]: 4.23999e-06 [allreduce_fusion]: 4.19002e-06 [matmul_add_comm_reduction]: 6.81001e-06 [allreduce_slice_to_reducescatter]: 2.9002e-07 [virtual_shard_identity]: 8.12e-06 [virtual_dataset]: 7.25998e-06 [get_grad_eliminate_]: 7.22002e-06 [virtual_output]: 7.41001e-06 [merge_forward]: 3.61001e-06 [cell_reuse_recompute_pass]: 1.87001e-06 [offload_activation]: 7.77e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.665e-05 [merge_recompute_call_nodes]: 1.15999e-06 [before_grad]: 1.22e-05 [set_forward_comm_id_for_comm_node_pass]: 4.11001e-06 [meta_fg_expand]: 2.51998e-06 [flash_sp_send_recv_attached]: 1.37e-06 [receive_attached]: 1.29998e-06 [after_resolve]: 1.296e-05 [a_after_grad]: 1.149e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.30001e-06 [auto_monad_grad]: 9.49978e-07 [auto_monad_eliminator]: 9.36998e-06 [cse]: 1.95e-05 [a_3]: 5.761e-05 [py_interpret_to_execute_after_opt_a]: 1.405e-05 [slice_cell_reuse_recomputed_activation]: 4.48001e-06 [rewriter_after_opt_a]: 4.373e-05 [convert_after_rewriter]: 1.104e-05 [order_py_execute_after_rewriter]: 8.47e-06 [mutable_eliminate]: 0.00050174 [opt_b]: 0.00030851, [1] [Cycle 1]: 0.00029907, [7] [b_1]: 0.00019768 [b_2]: 9.39e-06 [updatestate_depend_eliminate]: 6.96001e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 3.09001e-06 [renormalize]: 4.50003e-07 [cse]: 2.336e-05 [optimize_parallel_all_gather_comm]: 2.154e-05 [overlap_param_gather]: 4.94e-06 [cconv]: 2.756e-05 [loop_unroll]: 0.00042887 [opt_after_cconv]: 0.00016577, [1] [Cycle 1]: 0.00015688, [7] [c_1]: 3.802e-05 [parameter_eliminate]: 2.51998e-06 [updatestate_depend_eliminate]: 6.26e-06 [updatestate_assign_eliminate]: 3.35e-06 [updatestate_loads_eliminate]: 3.14001e-06 [cse]: 2.3e-05 [renormalize]: 3.19997e-07 [remove_dup_value]: 1.963e-05 [tuple_transform]: 0.00010206, [1] [Cycle 1]: 9.446e-05, [4] [d_1]: 5.281e-05 [none_parameter_eliminate]: 1.89e-06 [renormalize]: 2.20025e-07 [switch_simplify]: 8.52e-06 [partial_unused_args_eliminate]: 4.80999e-06 [add_recomputation]: 6.106e-05 [cse_after_recomputation]: 3.193e-05, [1] [Cycle 1]: 2.481e-05, [1] [cse]: 1.551e-05 [environ_conv]: 9.77001e-06 [swap_dp_allreduce_reducescatter]: 9.31e-06 [bias_add_comm_swap]: 5.96998e-06 [label_micro_interleaved_index]: 7.45e-06 [label_fine_grained_interleaved_index]: 5.56002e-06 [merge_cast_opt]: 3.91999e-06 [slice_recompute_activation]: 5.18002e-06 [micro_interleaved_order_control]: 5.12e-06 [assign_add_opt]: 3.66999e-06 [ForceFp32Comm]: 3.16999e-06 [remove_cast_before_assign_add]: 3.66001e-06 [full_micro_interleaved_order_control]: 4.66002e-06 [reorder_send_recv_between_fp_bp]: 5.81998e-06 [comm_op_add_attrs]: 3.6e-06 [add_comm_op_reuse_tag]: 3.78001e-06 [interleave_split_concat_branches]: 3.6e-06 [interleave_parallel_branches]: 3.68999e-06 [overlap_opt_shard_in_pipeline]: 3.50998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.52e-06 [control_data_broadcast_order]: 1.831e-05 [grouped_pairwise_exchange_alltoall]: 3.93001e-06 [offloading_packed_experts]: 7.08e-06 [overlap_recompute_and_grad_model_parallel]: 8.38999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.80998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.66001e-06 [overlap_recompute_comm]: 4.94e-06 [overlap_grad_ring_attention]: 7.16999e-06 [overlap_grad_flash_sp]: 2.503e-05 [begin_end_overlap_inline]: 3.28e-06 [split_matmul_comm_elemetwise]: 5.39e-06 [split_layernorm_comm]: 4.03001e-06 [handle_group_info]: 3.46001e-06 [symbol_engine_optimizer]: 0.00011027, [1] [Cycle 1]: 0.00010234, [6] [build]: 3.81001e-06 [elim_shapecalc]: 1.337e-05 [elim_not_effective]: 1.599e-05 [opt_reshape]: 8.89e-06 [fold_const_symbol]: 1.226e-05 [renormalize]: 3.10014e-07 [detach_backward]: 3.21001e-06 [pipeline_parallel_scheduler]: 1.91998e-06 [auto_monad_reorder]: 2.276e-05 [get_jit_bprop_graph]: 1.36998e-06 [rewriter_after_jit_bprop_graph]: 4.56002e-06 [opt_after_jit_grad]: 0.00051199 [validate]: 4.212e-05 Sums bootstrap : 0.000461s : 3.94% type_inference : 0.005934s : 50.73% event_method : 0.000019s : 0.17% auto_monad : 0.000061s : 0.52% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.24% optimize.rewriter_before_opt_a : 0.000087s : 0.74% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.45% optimize.opt_a.loop_unroll : 0.000039s : 0.33% optimize.opt_a.a_1 : 0.000923s : 7.89% optimize.opt_a.with_stream_mark : 0.000044s : 0.38% optimize.opt_a.recompute_prepare : 0.000019s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000252s : 2.16% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.13% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.21% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.16% optimize.opt_a.virtual_dataset : 0.000017s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.14% optimize.opt_a.virtual_output : 0.000016s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.22% optimize.opt_a.a_after_grad : 0.000025s : 0.21% optimize.opt_a.renormalize : 0.000675s : 5.78% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.22% optimize.opt_a.cse : 0.000054s : 0.47% optimize.opt_a.a_3 : 0.000130s : 1.11% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.04% optimize.rewriter_after_opt_a : 0.000044s : 0.37% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000502s : 4.29% optimize.opt_b.b_1 : 0.000198s : 1.69% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000028s : 0.24% optimize.loop_unroll : 0.000429s : 3.67% optimize.opt_after_cconv.c_1 : 0.000038s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.17% optimize.tuple_transform.d_1 : 0.000053s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000061s : 0.52% optimize.cse_after_recomputation.cse : 0.000016s : 0.13% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000025s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.19% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000512s : 4.38% validate : 0.000042s : 0.36% Time group info: ------[substitution.] 0.000206 38 11.39% : 0.000023s : 3: substitution.cast_eliminate 1.14% : 0.000002s : 3: substitution.elim_not_effective 0.80% : 0.000002s : 3: substitution.fold_const_symbol 3.53% : 0.000007s : 5: substitution.graph_param_transform 67.67% : 0.000139s : 4: substitution.inline 2.31% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.94% : 0.000006s : 6: substitution.remove_not_recompute_node 2.53% : 0.000005s : 4: substitution.replace_old_param 7.69% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005881 2 88.24% : 0.005190s : 1: type_inference.infer 11.76% : 0.000692s : 1: type_inference.specialize ------[replace.] 0.000062 8 58.60% : 0.000036s : 4: replace.inline 41.40% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000150 8 90.85% : 0.000137s : 4: match.inline 9.15% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000259 1596 0.98% : 0.000003s : 17: predicate.accumulaten_eliminater 0.65% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 10: predicate.addn_check_dump 1.06% : 0.000003s : 17: predicate.addn_zero_filter 0.92% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.24% : 0.000006s : 27: predicate.arithmetic_simplify 1.04% : 0.000003s : 17: predicate.cast_eliminate 0.60% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.58% : 0.000001s : 10: predicate.depend_value_elim 1.04% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.05% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.96% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.02% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 0.35% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.24% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.18% : 0.000003s : 22: predicate.environ_get_depend_swap 1.70% : 0.000004s : 32: predicate.environ_get_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.44% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.36% : 0.000006s : 25: predicate.float_depend_g_call 0.56% : 0.000001s : 10: predicate.float_environ_get_switch 0.81% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.63% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.62% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.15% : 0.000016s : 72: predicate.inline 0.74% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.70% : 0.000002s : 10: predicate.less_batch_normalization 1.81% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.65% : 0.000007s : 48: predicate.load_eliminater 0.73% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.92% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.66% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.61% : 0.000002s : 10: predicate.merge_addn 0.59% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 17: predicate.minmaximum_grad 0.87% : 0.000002s : 5: predicate.mutable_eliminate 0.38% : 0.000001s : 5: predicate.opt_reshape 0.32% : 0.000001s : 5: predicate.parallel_virtual_node 1.73% : 0.000004s : 25: predicate.partial_defer_inline 1.74% : 0.000004s : 26: predicate.partial_eliminate 1.02% : 0.000003s : 17: predicate.print_const_string_wrapper 0.57% : 0.000001s : 10: predicate.reduce_all_const_elim 1.20% : 0.000003s : 17: predicate.reduce_eliminate 2.53% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 10: predicate.remove_not_recompute_node 1.37% : 0.000004s : 31: predicate.replace_applicator 0.48% : 0.000001s : 10: predicate.replace_old_param 0.26% : 0.000001s : 5: predicate.reset_defer_inline 1.06% : 0.000003s : 17: predicate.reshape_eliminate 0.55% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 5: predicate.row_tensor_eliminate 0.84% : 0.000002s : 10: predicate.same_eliminate 0.41% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 10: predicate.shard_identity_eliminate 0.69% : 0.000002s : 10: predicate.special_op_eliminate 0.79% : 0.000002s : 10: predicate.specialize_transform 0.94% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.94% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.55% : 0.000004s : 25: predicate.switch_defer_inline 2.11% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.79% : 0.000012s : 76: predicate.switch_simplify 0.92% : 0.000002s : 17: predicate.tile_eliminate 0.94% : 0.000002s : 17: predicate.transpose_eliminate 1.60% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.37% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.47% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.36% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.79% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.60% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.17% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.68% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.70% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.62% : 0.000002s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000538 11 56.35% : 0.000303s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.65% : 0.000235s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027870 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.22% : 0.003126s : 1: add_attr 11.17% : 0.003113s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000065s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000071s : 1: auto_monad 0.11% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.83% : 0.000510s : 1: bootstrap 0.11% : 0.000031s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000022s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000019s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.11% : 0.000030s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.56% : 0.000435s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.82% : 0.000508s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 5.21% : 0.001451s : 78: opt.transform.opt_a 0.13% : 0.000037s : 1: opt.transform.opt_after_cconv 0.11% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.48% : 0.000134s : 28: opt.transform.opt_b 0.21% : 0.000059s : 2: opt.transform.opt_trans_graph 0.17% : 0.000047s : 4: opt.transform.symbol_engine_opt 12.21% : 0.003402s : 1: opt_a 0.61% : 0.000170s : 1: opt_after_cconv 1.87% : 0.000522s : 1: opt_after_jit_grad 1.12% : 0.000312s : 1: opt_b 22.05% : 0.006144s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000041s : 1: pre_auto_parallel 0.12% : 0.000032s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.08% : 0.000023s : 1: remove_dup_value 1.32% : 0.000368s : 1: renormalize.infer 1.07% : 0.000299s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000047s : 1: rewriter_after_opt_a 0.32% : 0.000090s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.41% : 0.000113s : 1: symbol_engine_optimizer 0.38% : 0.000105s : 1: tuple_transform 21.43% : 0.005972s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:29.100.855 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.014526, [21] [bootstrap]: 0.00040553 [type_inference]: 0.00553684 [event_method]: 1.852e-05 [auto_monad]: 6.541e-05 [graph_reusing]: 6.71e-06 [inline]: 2.26998e-06 [add_attr]: 0.00304324, [1] [add_attr_with_inline]: 0.00303544, [1] [Cycle 1]: 5.275e-05, [2] [tag_attr]: 1.866e-05 [meta_addattr_fg_expand]: 6.32001e-06 [parallel-infer-symbol]: 3.13998e-06 [pre_auto_parallel]: 3.124e-05 [insert-virtual-dataset]: 2.61e-06 [parallel-infer-symbol-second]: 8.89995e-07 [dataset_repeat_opt]: 1.97999e-06 [pipeline_split]: 1.65001e-06 [optimize]: 0.00474775, [53] [py_interpret_to_execute]: 2.373e-05 [rewriter_before_opt_a]: 7.83e-05 [opt_a]: 0.00276351, [2] [Cycle 1]: 0.00201025, [45] [expand_dump_flag]: 2.85002e-06 [switch_simplify]: 4.029e-05 [loop_unroll]: 3.082e-05 [a_1]: 0.00069056 [with_stream_mark]: 1.366e-05 [recompute_prepare]: 9.83998e-06 [updatestate_depend_eliminate]: 4.3e-06 [updatestate_assign_eliminate]: 3.72002e-06 [updatestate_loads_eliminate]: 3.41001e-06 [parameter_eliminate]: 1.79998e-06 [a_2]: 0.00010292 [accelerated_algorithm]: 8.42998e-06 [shard]: 1.91e-06 [meta_shard_fg_expand]: 2.20002e-06 [shard_inline]: 7.72002e-06 [merge_send_recv]: 9.06002e-06 [auto_parallel]: 6.58e-06 [parallel]: 1.729e-05 [flash_sp]: 7.53e-06 [merge_comm]: 4.55999e-06 [allreduce_fusion]: 4.47e-06 [matmul_add_comm_reduction]: 1.112e-05 [allreduce_slice_to_reducescatter]: 6.29982e-07 [virtual_shard_identity]: 9.07001e-06 [virtual_dataset]: 8.02e-06 [get_grad_eliminate_]: 7.36001e-06 [virtual_output]: 7.5e-06 [merge_forward]: 4.31002e-06 [cell_reuse_recompute_pass]: 1.09003e-06 [offload_activation]: 1.009e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.467e-05 [merge_recompute_call_nodes]: 1.96e-06 [before_grad]: 1.257e-05 [set_forward_comm_id_for_comm_node_pass]: 4.18999e-06 [meta_fg_expand]: 3.89002e-06 [flash_sp_send_recv_attached]: 2.44001e-06 [receive_attached]: 2.16e-06 [after_resolve]: 1.231e-05 [a_after_grad]: 1.136e-05 [renormalize]: 0.00055074 [add_forward_monad_depend]: 4.89e-06 [auto_monad_grad]: 1.81e-06 [auto_monad_eliminator]: 1.531e-05 [cse]: 4.444e-05 [a_3]: 5.563e-05 [Cycle 2]: 0.00074336, [45] [expand_dump_flag]: 1.10001e-06 [switch_simplify]: 8.65001e-06 [loop_unroll]: 7.16999e-06 [a_1]: 0.00017227 [with_stream_mark]: 1.15e-05 [recompute_prepare]: 7.51999e-06 [updatestate_depend_eliminate]: 3.67998e-06 [updatestate_assign_eliminate]: 2.84999e-06 [updatestate_loads_eliminate]: 3.03998e-06 [parameter_eliminate]: 1.03001e-06 [a_2]: 9.09e-05 [accelerated_algorithm]: 7.47998e-06 [shard]: 1.12999e-06 [meta_shard_fg_expand]: 1.58002e-06 [shard_inline]: 7.26999e-06 [merge_send_recv]: 5.75001e-06 [auto_parallel]: 5.89e-06 [parallel]: 4.50001e-06 [flash_sp]: 3.4e-06 [merge_comm]: 4.02002e-06 [allreduce_fusion]: 3.56999e-06 [matmul_add_comm_reduction]: 6.80998e-06 [allreduce_slice_to_reducescatter]: 3.4002e-07 [virtual_shard_identity]: 8.15e-06 [virtual_dataset]: 7.13998e-06 [get_grad_eliminate_]: 6.97002e-06 [virtual_output]: 6.83998e-06 [merge_forward]: 2.94999e-06 [cell_reuse_recompute_pass]: 1.40001e-06 [offload_activation]: 7.10002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.391e-05 [merge_recompute_call_nodes]: 6.89994e-07 [before_grad]: 1.206e-05 [set_forward_comm_id_for_comm_node_pass]: 4.50001e-06 [meta_fg_expand]: 2.66999e-06 [flash_sp_send_recv_attached]: 9.50007e-07 [receive_attached]: 1.02e-06 [after_resolve]: 1.121e-05 [a_after_grad]: 1.115e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.22999e-06 [auto_monad_grad]: 9.39996e-07 [auto_monad_eliminator]: 8.32e-06 [cse]: 1.718e-05 [a_3]: 4.422e-05 [py_interpret_to_execute_after_opt_a]: 8.60999e-06 [slice_cell_reuse_recomputed_activation]: 2.19999e-06 [rewriter_after_opt_a]: 3.784e-05 [convert_after_rewriter]: 7.66999e-06 [order_py_execute_after_rewriter]: 5.74e-06 [mutable_eliminate]: 0.00044426 [opt_b]: 0.00023332, [1] [Cycle 1]: 0.00022748, [7] [b_1]: 0.00015058 [b_2]: 9.36002e-06 [updatestate_depend_eliminate]: 5.47999e-06 [updatestate_assign_eliminate]: 2.91999e-06 [updatestate_loads_eliminate]: 2.84999e-06 [renormalize]: 8.10018e-07 [cse]: 2.11e-05 [optimize_parallel_all_gather_comm]: 1.685e-05 [overlap_param_gather]: 1.93002e-06 [cconv]: 2.347e-05 [loop_unroll]: 0.00040954 [opt_after_cconv]: 0.0001121, [1] [Cycle 1]: 0.00010657, [7] [c_1]: 3.69e-05 [parameter_eliminate]: 2.51e-06 [updatestate_depend_eliminate]: 5.56e-06 [updatestate_assign_eliminate]: 2.91e-06 [updatestate_loads_eliminate]: 2.89999e-06 [cse]: 2.145e-05 [renormalize]: 4.60015e-07 [remove_dup_value]: 1.683e-05 [tuple_transform]: 8.42e-05, [1] [Cycle 1]: 7.977e-05, [4] [d_1]: 5.091e-05 [none_parameter_eliminate]: 1.87001e-06 [renormalize]: 2.60014e-07 [switch_simplify]: 7.93001e-06 [partial_unused_args_eliminate]: 1.88997e-06 [add_recomputation]: 5.548e-05 [cse_after_recomputation]: 2.379e-05, [1] [Cycle 1]: 1.94e-05, [1] [cse]: 1.418e-05 [environ_conv]: 5.97999e-06 [swap_dp_allreduce_reducescatter]: 5.99999e-06 [bias_add_comm_swap]: 2.57001e-06 [label_micro_interleaved_index]: 4.02e-06 [label_fine_grained_interleaved_index]: 2.74999e-06 [merge_cast_opt]: 1.22e-06 [slice_recompute_activation]: 2.12999e-06 [micro_interleaved_order_control]: 2.39999e-06 [assign_add_opt]: 1.09e-06 [ForceFp32Comm]: 7.89994e-07 [remove_cast_before_assign_add]: 9.69972e-07 [full_micro_interleaved_order_control]: 2.05002e-06 [reorder_send_recv_between_fp_bp]: 2.67001e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.12e-06 [overlap_opt_shard_grad_in_pipeline]: 2.15002e-06 [control_data_broadcast_order]: 1.374e-05 [grouped_pairwise_exchange_alltoall]: 1.99e-06 [offloading_packed_experts]: 4.02e-06 [overlap_recompute_and_grad_model_parallel]: 4.85001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30999e-06 [overlap_recompute_comm]: 2.37999e-06 [overlap_grad_ring_attention]: 4.48001e-06 [overlap_grad_flash_sp]: 1.992e-05 [begin_end_overlap_inline]: 5.59987e-07 [split_matmul_comm_elemetwise]: 2.00002e-06 [split_layernorm_comm]: 1.67999e-06 [handle_group_info]: 9.79984e-07 [symbol_engine_optimizer]: 8.289e-05, [1] [Cycle 1]: 7.86e-05, [6] [build]: 3.07002e-06 [elim_shapecalc]: 1.109e-05 [elim_not_effective]: 1.536e-05 [opt_reshape]: 8.27e-06 [fold_const_symbol]: 1.24e-05 [renormalize]: 2.3999e-07 [detach_backward]: 1.67001e-06 [pipeline_parallel_scheduler]: 1.50999e-06 [auto_monad_reorder]: 1.942e-05 [get_jit_bprop_graph]: 1.02e-06 [rewriter_after_jit_bprop_graph]: 3.56001e-06 [opt_after_jit_grad]: 0.00045327 [validate]: 3.772e-05 Sums bootstrap : 0.000406s : 3.84% type_inference : 0.005537s : 52.48% event_method : 0.000019s : 0.18% auto_monad : 0.000065s : 0.62% graph_reusing : 0.000007s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000031s : 0.30% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000024s : 0.22% optimize.rewriter_before_opt_a : 0.000078s : 0.74% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000049s : 0.46% optimize.opt_a.loop_unroll : 0.000038s : 0.36% optimize.opt_a.a_1 : 0.000863s : 8.18% optimize.opt_a.with_stream_mark : 0.000025s : 0.24% optimize.opt_a.recompute_prepare : 0.000017s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000194s : 1.84% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.14% optimize.opt_a.merge_send_recv : 0.000015s : 0.14% optimize.opt_a.auto_parallel : 0.000012s : 0.12% optimize.opt_a.parallel : 0.000022s : 0.21% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.16% optimize.opt_a.virtual_dataset : 0.000015s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.14% optimize.opt_a.virtual_output : 0.000014s : 0.14% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000025s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.22% optimize.opt_a.a_after_grad : 0.000023s : 0.21% optimize.opt_a.renormalize : 0.000551s : 5.22% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.22% optimize.opt_a.cse : 0.000062s : 0.58% optimize.opt_a.a_3 : 0.000100s : 0.95% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000038s : 0.36% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000444s : 4.21% optimize.opt_b.b_1 : 0.000151s : 1.43% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000021s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000023s : 0.22% optimize.loop_unroll : 0.000410s : 3.88% optimize.opt_after_cconv.c_1 : 0.000037s : 0.35% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000021s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.16% optimize.tuple_transform.d_1 : 0.000051s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000055s : 0.53% optimize.cse_after_recomputation.cse : 0.000014s : 0.13% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000020s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000453s : 4.30% validate : 0.000038s : 0.36% Time group info: ------[substitution.] 0.000171 38 11.00% : 0.000019s : 3: substitution.cast_eliminate 1.28% : 0.000002s : 3: substitution.elim_not_effective 1.08% : 0.000002s : 3: substitution.fold_const_symbol 3.90% : 0.000007s : 5: substitution.graph_param_transform 65.70% : 0.000112s : 4: substitution.inline 2.53% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.38% : 0.000006s : 6: substitution.remove_not_recompute_node 2.48% : 0.000004s : 4: substitution.replace_old_param 8.67% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005484 2 87.58% : 0.004803s : 1: type_inference.infer 12.42% : 0.000681s : 1: type_inference.specialize ------[replace.] 0.000058 8 58.29% : 0.000034s : 4: replace.inline 41.71% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000122 8 89.57% : 0.000110s : 4: match.inline 10.43% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000246 1596 1.01% : 0.000002s : 17: predicate.accumulaten_eliminater 0.74% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 10: predicate.addn_check_dump 1.00% : 0.000002s : 17: predicate.addn_zero_filter 0.92% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.07% : 0.000005s : 27: predicate.arithmetic_simplify 1.14% : 0.000003s : 17: predicate.cast_eliminate 0.58% : 0.000001s : 10: predicate.check_bprop_eliminate 0.61% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.59% : 0.000001s : 10: predicate.depend_value_elim 1.05% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.09% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.96% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 0.36% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.21% : 0.000003s : 22: predicate.environ_get_depend_swap 1.77% : 0.000004s : 32: predicate.environ_get_eliminate 1.17% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.44% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.31% : 0.000006s : 25: predicate.float_depend_g_call 0.59% : 0.000001s : 10: predicate.float_environ_get_switch 0.81% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.62% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.63% : 0.000002s : 10: predicate.incorporate_call 0.53% : 0.000001s : 10: predicate.incorporate_call_switch 6.23% : 0.000015s : 72: predicate.inline 0.74% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 10: predicate.less_batch_normalization 1.81% : 0.000004s : 31: predicate.list_to_tuple_eliminator_ 2.71% : 0.000007s : 48: predicate.load_eliminater 0.77% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.10% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.71% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 10: predicate.merge_addn 0.58% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.90% : 0.000002s : 17: predicate.minmaximum_grad 0.87% : 0.000002s : 5: predicate.mutable_eliminate 0.35% : 0.000001s : 5: predicate.opt_reshape 0.35% : 0.000001s : 5: predicate.parallel_virtual_node 1.78% : 0.000004s : 25: predicate.partial_defer_inline 1.75% : 0.000004s : 26: predicate.partial_eliminate 1.11% : 0.000003s : 17: predicate.print_const_string_wrapper 0.55% : 0.000001s : 10: predicate.reduce_all_const_elim 1.23% : 0.000003s : 17: predicate.reduce_eliminate 2.67% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 10: predicate.remove_not_recompute_node 1.38% : 0.000003s : 31: predicate.replace_applicator 0.43% : 0.000001s : 10: predicate.replace_old_param 0.25% : 0.000001s : 5: predicate.reset_defer_inline 1.00% : 0.000002s : 17: predicate.reshape_eliminate 0.57% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 5: predicate.row_tensor_eliminate 0.77% : 0.000002s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 10: predicate.shard_identity_eliminate 0.68% : 0.000002s : 10: predicate.special_op_eliminate 0.73% : 0.000002s : 10: predicate.specialize_transform 0.72% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.73% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.53% : 0.000004s : 25: predicate.switch_defer_inline 2.05% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.78% : 0.000012s : 76: predicate.switch_simplify 0.92% : 0.000002s : 17: predicate.tile_eliminate 0.94% : 0.000002s : 17: predicate.transpose_eliminate 1.59% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.73% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.57% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.05% : 0.000007s : 41: predicate.tuple_list_get_item_eliminator 1.72% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.82% : 0.000004s : 31: predicate.tuple_to_list_eliminator_ 2.65% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.26% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 5: predicate.value_based_eliminate 0.63% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.57% : 0.000001s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000457 11 53.12% : 0.000242s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.88% : 0.000214s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024435 192 0.01% : 0.000003s : 1: ForceFp32Comm 12.47% : 0.003047s : 1: add_attr 12.44% : 0.003039s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000059s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.29% : 0.000072s : 1: auto_monad 0.10% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.76% : 0.000431s : 1: bootstrap 0.11% : 0.000027s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000017s : 1: control_data_broadcast_order 0.05% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000027s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000011s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.71% : 0.000417s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.85% : 0.000452s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 5.57% : 0.001362s : 78: opt.transform.opt_a 0.15% : 0.000036s : 1: opt.transform.opt_after_cconv 0.11% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.53% : 0.000130s : 28: opt.transform.opt_b 0.23% : 0.000057s : 2: opt.transform.opt_trans_graph 0.18% : 0.000043s : 4: opt.transform.symbol_engine_opt 11.32% : 0.002766s : 1: opt_a 0.47% : 0.000115s : 1: opt_after_cconv 1.89% : 0.000462s : 1: opt_after_jit_grad 0.97% : 0.000237s : 1: opt_b 19.45% : 0.004752s : 1: optimize 0.08% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000035s : 1: pre_auto_parallel 0.12% : 0.000029s : 1: py_interpret_to_execute 0.05% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000020s : 1: remove_dup_value 1.21% : 0.000296s : 1: renormalize.infer 1.01% : 0.000248s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000042s : 1: rewriter_after_opt_a 0.34% : 0.000082s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000086s : 1: symbol_engine_optimizer 0.36% : 0.000087s : 1: tuple_transform 22.72% : 0.005552s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:29.312.325 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:29.312.591 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0165093, [21] [bootstrap]: 0.00042428 [type_inference]: 0.00570736 [event_method]: 1.887e-05 [auto_monad]: 6.812e-05 [graph_reusing]: 6.61999e-06 [inline]: 2.51e-06 [add_attr]: 0.00333455, [1] [add_attr_with_inline]: 0.00332535, [1] [Cycle 1]: 7.096e-05, [2] [tag_attr]: 1.949e-05 [meta_addattr_fg_expand]: 6.24999e-06 [parallel-infer-symbol]: 3.23e-06 [pre_auto_parallel]: 3.393e-05 [insert-virtual-dataset]: 2.84001e-06 [parallel-infer-symbol-second]: 8.09989e-07 [dataset_repeat_opt]: 1.94999e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00570378, [53] [py_interpret_to_execute]: 2.966e-05 [rewriter_before_opt_a]: 8.898e-05 [opt_a]: 0.00331245, [2] [Cycle 1]: 0.00232484, [45] [expand_dump_flag]: 2.83e-06 [switch_simplify]: 4.015e-05 [loop_unroll]: 3.075e-05 [a_1]: 0.00070812 [with_stream_mark]: 1.557e-05 [recompute_prepare]: 9.84001e-06 [updatestate_depend_eliminate]: 4.94998e-06 [updatestate_assign_eliminate]: 4.1e-06 [updatestate_loads_eliminate]: 3.59002e-06 [parameter_eliminate]: 1.89e-06 [a_2]: 0.00012825 [accelerated_algorithm]: 8.52e-06 [shard]: 2.17999e-06 [meta_shard_fg_expand]: 2.32999e-06 [shard_inline]: 7.75e-06 [merge_send_recv]: 9.61998e-06 [auto_parallel]: 6.88e-06 [parallel]: 1.857e-05 [flash_sp]: 8.17998e-06 [merge_comm]: 4.87e-06 [allreduce_fusion]: 4.43001e-06 [matmul_add_comm_reduction]: 1.084e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 8.80999e-06 [virtual_dataset]: 8e-06 [get_grad_eliminate_]: 7.98999e-06 [virtual_output]: 7.6e-06 [merge_forward]: 4.30999e-06 [cell_reuse_recompute_pass]: 1.25001e-06 [offload_activation]: 1.128e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.65e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.302e-05 [set_forward_comm_id_for_comm_node_pass]: 4.36002e-06 [meta_fg_expand]: 3.59002e-06 [flash_sp_send_recv_attached]: 2.56e-06 [receive_attached]: 2.27001e-06 [after_resolve]: 1.183e-05 [a_after_grad]: 1.183e-05 [renormalize]: 0.00066695 [add_forward_monad_depend]: 5.28002e-06 [auto_monad_grad]: 2.22001e-06 [auto_monad_eliminator]: 1.679e-05 [cse]: 3.525e-05 [a_3]: 6.989e-05 [Cycle 2]: 0.00097483, [45] [expand_dump_flag]: 1.04003e-06 [switch_simplify]: 8.82e-06 [loop_unroll]: 7.32002e-06 [a_1]: 0.0002168 [with_stream_mark]: 1.238e-05 [recompute_prepare]: 7.98001e-06 [updatestate_depend_eliminate]: 4.15e-06 [updatestate_assign_eliminate]: 2.97002e-06 [updatestate_loads_eliminate]: 2.79001e-06 [parameter_eliminate]: 1.04998e-06 [a_2]: 0.00011835 [accelerated_algorithm]: 7.61999e-06 [shard]: 1.40001e-06 [meta_shard_fg_expand]: 1.59e-06 [shard_inline]: 7.68001e-06 [merge_send_recv]: 6.38e-06 [auto_parallel]: 6.11e-06 [parallel]: 4.54002e-06 [flash_sp]: 3.51001e-06 [merge_comm]: 4.12e-06 [allreduce_fusion]: 6.61e-06 [matmul_add_comm_reduction]: 6.74999e-06 [allreduce_slice_to_reducescatter]: 3.9002e-07 [virtual_shard_identity]: 8.72998e-06 [virtual_dataset]: 7.28999e-06 [get_grad_eliminate_]: 7.06001e-06 [virtual_output]: 6.91999e-06 [merge_forward]: 3.41001e-06 [cell_reuse_recompute_pass]: 1.60001e-06 [offload_activation]: 7.09001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.614e-05 [merge_recompute_call_nodes]: 8.79983e-07 [before_grad]: 1.172e-05 [set_forward_comm_id_for_comm_node_pass]: 4.23999e-06 [meta_fg_expand]: 2.63003e-06 [flash_sp_send_recv_attached]: 9.79984e-07 [receive_attached]: 1.12e-06 [after_resolve]: 1.302e-05 [a_after_grad]: 1.185e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.42999e-06 [auto_monad_grad]: 1.00999e-06 [auto_monad_eliminator]: 8.99e-06 [cse]: 1.813e-05 [a_3]: 5.818e-05 [py_interpret_to_execute_after_opt_a]: 1.309e-05 [slice_cell_reuse_recomputed_activation]: 5.04998e-06 [rewriter_after_opt_a]: 4.456e-05 [convert_after_rewriter]: 1.091e-05 [order_py_execute_after_rewriter]: 8.90001e-06 [mutable_eliminate]: 0.00052848 [opt_b]: 0.00030463, [1] [Cycle 1]: 0.00029577, [7] [b_1]: 0.00019498 [b_2]: 9.22001e-06 [updatestate_depend_eliminate]: 6.44001e-06 [updatestate_assign_eliminate]: 3.23998e-06 [updatestate_loads_eliminate]: 2.97002e-06 [renormalize]: 4.09986e-07 [cse]: 2.215e-05 [optimize_parallel_all_gather_comm]: 2.145e-05 [overlap_param_gather]: 5.08002e-06 [cconv]: 2.657e-05 [loop_unroll]: 0.00045405 [opt_after_cconv]: 0.00014173, [1] [Cycle 1]: 0.00013319, [7] [c_1]: 3.739e-05 [parameter_eliminate]: 2.93e-06 [updatestate_depend_eliminate]: 6.36998e-06 [updatestate_assign_eliminate]: 3.47002e-06 [updatestate_loads_eliminate]: 3.2e-06 [cse]: 2.271e-05 [renormalize]: 1.14998e-06 [remove_dup_value]: 1.798e-05 [tuple_transform]: 0.00010122, [1] [Cycle 1]: 9.43e-05, [4] [d_1]: 5.228e-05 [none_parameter_eliminate]: 1.65001e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 8.97999e-06 [partial_unused_args_eliminate]: 4.46002e-06 [add_recomputation]: 5.895e-05 [cse_after_recomputation]: 3.083e-05, [1] [Cycle 1]: 2.409e-05, [1] [cse]: 1.522e-05 [environ_conv]: 9.12999e-06 [swap_dp_allreduce_reducescatter]: 8.21002e-06 [bias_add_comm_swap]: 4.82e-06 [label_micro_interleaved_index]: 7.44002e-06 [label_fine_grained_interleaved_index]: 5.07e-06 [merge_cast_opt]: 3.8e-06 [slice_recompute_activation]: 4.91002e-06 [micro_interleaved_order_control]: 5.25001e-06 [assign_add_opt]: 3.56999e-06 [ForceFp32Comm]: 3.29001e-06 [remove_cast_before_assign_add]: 3.30003e-06 [full_micro_interleaved_order_control]: 4.85001e-06 [reorder_send_recv_between_fp_bp]: 5.12e-06 [comm_op_add_attrs]: 3.36999e-06 [add_comm_op_reuse_tag]: 3.59002e-06 [interleave_split_concat_branches]: 3.48e-06 [interleave_parallel_branches]: 3.61001e-06 [overlap_opt_shard_in_pipeline]: 3.81999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.44002e-06 [control_data_broadcast_order]: 1.751e-05 [grouped_pairwise_exchange_alltoall]: 4.30999e-06 [offloading_packed_experts]: 6.71999e-06 [overlap_recompute_and_grad_model_parallel]: 7.31999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.5e-06 [overlap_recompute_allgather_and_fa_grad]: 3.89002e-06 [overlap_recompute_comm]: 4.65001e-06 [overlap_grad_ring_attention]: 6.90002e-06 [overlap_grad_flash_sp]: 2.416e-05 [begin_end_overlap_inline]: 2.90002e-06 [split_matmul_comm_elemetwise]: 4.63999e-06 [split_layernorm_comm]: 4.03001e-06 [handle_group_info]: 3.21999e-06 [symbol_engine_optimizer]: 0.00010403, [1] [Cycle 1]: 9.753e-05, [6] [build]: 3.43e-06 [elim_shapecalc]: 1.16e-05 [elim_not_effective]: 1.555e-05 [opt_reshape]: 8.48999e-06 [fold_const_symbol]: 1.239e-05 [renormalize]: 2.00002e-07 [detach_backward]: 3.41001e-06 [pipeline_parallel_scheduler]: 1.86e-06 [auto_monad_reorder]: 2.293e-05 [get_jit_bprop_graph]: 1.21002e-06 [rewriter_after_jit_bprop_graph]: 4.71002e-06 [opt_after_jit_grad]: 0.00053907 [validate]: 4.16e-05 Sums bootstrap : 0.000424s : 3.71% type_inference : 0.005707s : 49.88% event_method : 0.000019s : 0.16% auto_monad : 0.000068s : 0.60% graph_reusing : 0.000007s : 0.06% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.30% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.26% optimize.rewriter_before_opt_a : 0.000089s : 0.78% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000049s : 0.43% optimize.opt_a.loop_unroll : 0.000038s : 0.33% optimize.opt_a.a_1 : 0.000925s : 8.08% optimize.opt_a.with_stream_mark : 0.000028s : 0.24% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000247s : 2.16% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.14% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.13% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.11% optimize.opt_a.parallel : 0.000023s : 0.20% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000011s : 0.10% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.15% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.22% optimize.opt_a.a_after_grad : 0.000024s : 0.21% optimize.opt_a.renormalize : 0.000667s : 5.83% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.23% optimize.opt_a.cse : 0.000053s : 0.47% optimize.opt_a.a_3 : 0.000128s : 1.12% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000045s : 0.39% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000528s : 4.62% optimize.opt_b.b_1 : 0.000195s : 1.70% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000027s : 0.23% optimize.loop_unroll : 0.000454s : 3.97% optimize.opt_after_cconv.c_1 : 0.000037s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.20% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.16% optimize.tuple_transform.d_1 : 0.000052s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000059s : 0.52% optimize.cse_after_recomputation.cse : 0.000015s : 0.13% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000024s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.20% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000539s : 4.71% validate : 0.000042s : 0.36% Time group info: ------[substitution.] 0.000180 38 11.85% : 0.000021s : 3: substitution.cast_eliminate 1.17% : 0.000002s : 3: substitution.elim_not_effective 0.90% : 0.000002s : 3: substitution.fold_const_symbol 3.70% : 0.000007s : 5: substitution.graph_param_transform 67.46% : 0.000122s : 4: substitution.inline 2.52% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.27% : 0.000006s : 6: substitution.remove_not_recompute_node 2.54% : 0.000005s : 4: substitution.replace_old_param 6.60% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005656 2 87.24% : 0.004934s : 1: type_inference.infer 12.76% : 0.000722s : 1: type_inference.specialize ------[replace.] 0.000062 8 60.63% : 0.000038s : 4: replace.inline 39.37% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000129 8 92.40% : 0.000119s : 4: match.inline 7.60% : 0.000010s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000252 1596 0.96% : 0.000002s : 17: predicate.accumulaten_eliminater 0.90% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 10: predicate.addn_check_dump 0.94% : 0.000002s : 17: predicate.addn_zero_filter 0.94% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.22% : 0.000006s : 27: predicate.arithmetic_simplify 1.10% : 0.000003s : 17: predicate.cast_eliminate 0.61% : 0.000002s : 10: predicate.check_bprop_eliminate 0.62% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.61% : 0.000002s : 10: predicate.depend_value_elim 1.07% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.98% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 5: predicate.elim_not_effective 0.47% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.24% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 22: predicate.environ_get_depend_swap 1.80% : 0.000005s : 32: predicate.environ_get_eliminate 1.15% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.44% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.18% : 0.000005s : 25: predicate.float_depend_g_call 0.55% : 0.000001s : 10: predicate.float_environ_get_switch 0.78% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.63% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.62% : 0.000002s : 10: predicate.incorporate_call 0.52% : 0.000001s : 10: predicate.incorporate_call_switch 6.08% : 0.000015s : 72: predicate.inline 0.77% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.82% : 0.000002s : 10: predicate.less_batch_normalization 1.88% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.56% : 0.000006s : 48: predicate.load_eliminater 0.87% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.00% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 10: predicate.merge_addn 0.64% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.60% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 17: predicate.minmaximum_grad 0.86% : 0.000002s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.33% : 0.000001s : 5: predicate.parallel_virtual_node 1.71% : 0.000004s : 25: predicate.partial_defer_inline 1.71% : 0.000004s : 26: predicate.partial_eliminate 0.92% : 0.000002s : 17: predicate.print_const_string_wrapper 0.57% : 0.000001s : 10: predicate.reduce_all_const_elim 1.46% : 0.000004s : 17: predicate.reduce_eliminate 2.62% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 10: predicate.remove_not_recompute_node 1.31% : 0.000003s : 31: predicate.replace_applicator 0.39% : 0.000001s : 10: predicate.replace_old_param 0.25% : 0.000001s : 5: predicate.reset_defer_inline 0.95% : 0.000002s : 17: predicate.reshape_eliminate 0.67% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 5: predicate.row_tensor_eliminate 0.73% : 0.000002s : 10: predicate.same_eliminate 0.42% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.71% : 0.000002s : 10: predicate.shard_identity_eliminate 0.79% : 0.000002s : 10: predicate.special_op_eliminate 0.68% : 0.000002s : 10: predicate.specialize_transform 0.81% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.54% : 0.000004s : 25: predicate.switch_defer_inline 2.04% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.83% : 0.000012s : 76: predicate.switch_simplify 1.01% : 0.000003s : 17: predicate.tile_eliminate 0.94% : 0.000002s : 17: predicate.transpose_eliminate 1.65% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.78% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.86% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.51% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.22% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 5: predicate.value_based_eliminate 0.67% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.61% : 0.000002s : 10: predicate.virtual_output_eliminate 0.33% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000510 11 54.00% : 0.000275s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.00% : 0.000235s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027882 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.99% : 0.003344s : 1: add_attr 11.94% : 0.003329s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000062s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.28% : 0.000079s : 1: auto_monad 0.11% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.71% : 0.000476s : 1: bootstrap 0.11% : 0.000030s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000034s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000019s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.10% : 0.000029s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.04% : 0.000010s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.65% : 0.000460s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.92% : 0.000535s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 5.14% : 0.001434s : 78: opt.transform.opt_a 0.13% : 0.000036s : 1: opt.transform.opt_after_cconv 0.21% : 0.000059s : 1: opt.transform.opt_after_jit_grad 0.47% : 0.000132s : 28: opt.transform.opt_b 0.21% : 0.000059s : 2: opt.transform.opt_trans_graph 0.16% : 0.000044s : 4: opt.transform.symbol_engine_opt 11.89% : 0.003316s : 1: opt_a 0.52% : 0.000145s : 1: opt_after_cconv 1.97% : 0.000550s : 1: opt_after_jit_grad 1.11% : 0.000308s : 1: opt_b 21.64% : 0.006035s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000042s : 1: pre_auto_parallel 0.12% : 0.000033s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.08% : 0.000021s : 1: remove_dup_value 1.30% : 0.000362s : 1: renormalize.infer 1.07% : 0.000298s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000048s : 1: rewriter_after_opt_a 0.33% : 0.000092s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000107s : 1: symbol_engine_optimizer 0.37% : 0.000104s : 1: tuple_transform 20.60% : 0.005745s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:29.601.666 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0147834, [21] [bootstrap]: 0.00042204 [type_inference]: 0.00540751 [event_method]: 1.725e-05 [auto_monad]: 6.237e-05 [graph_reusing]: 5.99e-06 [inline]: 2.22999e-06 [add_attr]: 0.00314763, [1] [add_attr_with_inline]: 0.00313903, [1] [Cycle 1]: 4.968e-05, [2] [tag_attr]: 1.893e-05 [meta_addattr_fg_expand]: 6.34001e-06 [parallel-infer-symbol]: 2.46e-06 [pre_auto_parallel]: 3.092e-05 [insert-virtual-dataset]: 2.69999e-06 [parallel-infer-symbol-second]: 9.5999e-07 [dataset_repeat_opt]: 2.12001e-06 [pipeline_split]: 1.78002e-06 [optimize]: 0.00502258, [53] [py_interpret_to_execute]: 2.42e-05 [rewriter_before_opt_a]: 8.096e-05 [opt_a]: 0.00287331, [2] [Cycle 1]: 0.00210356, [45] [expand_dump_flag]: 3.25e-06 [switch_simplify]: 4.044e-05 [loop_unroll]: 3.125e-05 [a_1]: 0.00072573 [with_stream_mark]: 1.385e-05 [recompute_prepare]: 9.44e-06 [updatestate_depend_eliminate]: 4.4e-06 [updatestate_assign_eliminate]: 4.08999e-06 [updatestate_loads_eliminate]: 3.47002e-06 [parameter_eliminate]: 2.54001e-06 [a_2]: 0.00010269 [accelerated_algorithm]: 8.27e-06 [shard]: 1.81e-06 [meta_shard_fg_expand]: 2.16e-06 [shard_inline]: 7.65e-06 [merge_send_recv]: 9.61998e-06 [auto_parallel]: 6.84001e-06 [parallel]: 1.81e-05 [flash_sp]: 8.69e-06 [merge_comm]: 4.57e-06 [allreduce_fusion]: 4.28001e-06 [matmul_add_comm_reduction]: 1.045e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 9.59e-06 [virtual_dataset]: 8.19002e-06 [get_grad_eliminate_]: 7.34002e-06 [virtual_output]: 7.60998e-06 [merge_forward]: 4.61002e-06 [cell_reuse_recompute_pass]: 1.14998e-06 [offload_activation]: 1.069e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.473e-05 [merge_recompute_call_nodes]: 1.94999e-06 [before_grad]: 1.263e-05 [set_forward_comm_id_for_comm_node_pass]: 4.45999e-06 [meta_fg_expand]: 3.36001e-06 [flash_sp_send_recv_attached]: 2.79001e-06 [receive_attached]: 2.56e-06 [after_resolve]: 1.217e-05 [a_after_grad]: 1.179e-05 [renormalize]: 0.00060793 [add_forward_monad_depend]: 5.36998e-06 [auto_monad_grad]: 2.53e-06 [auto_monad_eliminator]: 1.681e-05 [cse]: 3.616e-05 [a_3]: 5.635e-05 [Cycle 2]: 0.00075992, [45] [expand_dump_flag]: 1.01002e-06 [switch_simplify]: 8.65999e-06 [loop_unroll]: 7.51001e-06 [a_1]: 0.00018364 [with_stream_mark]: 1.246e-05 [recompute_prepare]: 7.63001e-06 [updatestate_depend_eliminate]: 3.44001e-06 [updatestate_assign_eliminate]: 2.96999e-06 [updatestate_loads_eliminate]: 2.91999e-06 [parameter_eliminate]: 1.03001e-06 [a_2]: 9.172e-05 [accelerated_algorithm]: 7.43e-06 [shard]: 1.10999e-06 [meta_shard_fg_expand]: 1.47999e-06 [shard_inline]: 1.06e-05 [merge_send_recv]: 5.76998e-06 [auto_parallel]: 6.32001e-06 [parallel]: 4.85001e-06 [flash_sp]: 3.81001e-06 [merge_comm]: 3.98999e-06 [allreduce_fusion]: 3.43999e-06 [matmul_add_comm_reduction]: 6.07001e-06 [allreduce_slice_to_reducescatter]: 3.39991e-07 [virtual_shard_identity]: 7.98999e-06 [virtual_dataset]: 7.37997e-06 [get_grad_eliminate_]: 6.78003e-06 [virtual_output]: 6.68e-06 [merge_forward]: 3.36999e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 7.14001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.408e-05 [merge_recompute_call_nodes]: 6.69999e-07 [before_grad]: 1.335e-05 [set_forward_comm_id_for_comm_node_pass]: 4.22e-06 [meta_fg_expand]: 2.58998e-06 [flash_sp_send_recv_attached]: 9.00007e-07 [receive_attached]: 9.5999e-07 [after_resolve]: 1.183e-05 [a_after_grad]: 1.083e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.25999e-06 [auto_monad_grad]: 1.15001e-06 [auto_monad_eliminator]: 8.75001e-06 [cse]: 1.796e-05 [a_3]: 4.469e-05 [py_interpret_to_execute_after_opt_a]: 1.009e-05 [slice_cell_reuse_recomputed_activation]: 2.14e-06 [rewriter_after_opt_a]: 4.049e-05 [convert_after_rewriter]: 7.41001e-06 [order_py_execute_after_rewriter]: 5.69999e-06 [mutable_eliminate]: 0.0005077 [opt_b]: 0.00023697, [1] [Cycle 1]: 0.00023121, [7] [b_1]: 0.00015253 [b_2]: 1e-05 [updatestate_depend_eliminate]: 5.56998e-06 [updatestate_assign_eliminate]: 2.89001e-06 [updatestate_loads_eliminate]: 2.81e-06 [renormalize]: 3.69997e-07 [cse]: 2.206e-05 [optimize_parallel_all_gather_comm]: 1.641e-05 [overlap_param_gather]: 2.18002e-06 [cconv]: 2.366e-05 [loop_unroll]: 0.0004368 [opt_after_cconv]: 0.00011567, [1] [Cycle 1]: 0.00010998, [7] [c_1]: 3.782e-05 [parameter_eliminate]: 2.67001e-06 [updatestate_depend_eliminate]: 6.07999e-06 [updatestate_assign_eliminate]: 3.06001e-06 [updatestate_loads_eliminate]: 2.86e-06 [cse]: 2.279e-05 [renormalize]: 4.89992e-07 [remove_dup_value]: 1.501e-05 [tuple_transform]: 8.522e-05, [1] [Cycle 1]: 8.09e-05, [4] [d_1]: 5.192e-05 [none_parameter_eliminate]: 2.25002e-06 [renormalize]: 2.60014e-07 [switch_simplify]: 8.02998e-06 [partial_unused_args_eliminate]: 2.04e-06 [add_recomputation]: 5.375e-05 [cse_after_recomputation]: 2.47e-05, [1] [Cycle 1]: 2.011e-05, [1] [cse]: 1.476e-05 [environ_conv]: 6.81001e-06 [swap_dp_allreduce_reducescatter]: 6.30002e-06 [bias_add_comm_swap]: 3.18e-06 [label_micro_interleaved_index]: 4.35e-06 [label_fine_grained_interleaved_index]: 3.25998e-06 [merge_cast_opt]: 1.27e-06 [slice_recompute_activation]: 2.54001e-06 [micro_interleaved_order_control]: 2.62001e-06 [assign_add_opt]: 1.28002e-06 [ForceFp32Comm]: 7.89994e-07 [remove_cast_before_assign_add]: 1.12e-06 [full_micro_interleaved_order_control]: 1.92999e-06 [reorder_send_recv_between_fp_bp]: 2.86e-06 [comm_op_add_attrs]: 1.12999e-06 [add_comm_op_reuse_tag]: 1.06002e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.15999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.91998e-06 [control_data_broadcast_order]: 1.537e-05 [grouped_pairwise_exchange_alltoall]: 1.76e-06 [offloading_packed_experts]: 4.46002e-06 [overlap_recompute_and_grad_model_parallel]: 4.97999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.07e-06 [overlap_recompute_allgather_and_fa_grad]: 1.42e-06 [overlap_recompute_comm]: 2.17001e-06 [overlap_grad_ring_attention]: 4.85999e-06 [overlap_grad_flash_sp]: 2.05e-05 [begin_end_overlap_inline]: 5.90022e-07 [split_matmul_comm_elemetwise]: 2.13998e-06 [split_layernorm_comm]: 1.57999e-06 [handle_group_info]: 9.79984e-07 [symbol_engine_optimizer]: 8.06e-05, [1] [Cycle 1]: 7.65e-05, [6] [build]: 2.68e-06 [elim_shapecalc]: 1.084e-05 [elim_not_effective]: 1.505e-05 [opt_reshape]: 8.23001e-06 [fold_const_symbol]: 1.194e-05 [renormalize]: 2.79979e-07 [detach_backward]: 2.03002e-06 [pipeline_parallel_scheduler]: 1.60999e-06 [auto_monad_reorder]: 1.985e-05 [get_jit_bprop_graph]: 1.03001e-06 [rewriter_after_jit_bprop_graph]: 3.31001e-06 [opt_after_jit_grad]: 0.00046573 [validate]: 3.339e-05 Sums bootstrap : 0.000422s : 3.96% type_inference : 0.005408s : 50.72% event_method : 0.000017s : 0.16% auto_monad : 0.000062s : 0.58% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000002s : 0.02% pre_auto_parallel : 0.000031s : 0.29% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000024s : 0.23% optimize.rewriter_before_opt_a : 0.000081s : 0.76% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000049s : 0.46% optimize.opt_a.loop_unroll : 0.000039s : 0.36% optimize.opt_a.a_1 : 0.000909s : 8.53% optimize.opt_a.with_stream_mark : 0.000026s : 0.25% optimize.opt_a.recompute_prepare : 0.000017s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000194s : 1.82% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000018s : 0.17% optimize.opt_a.merge_send_recv : 0.000015s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.12% optimize.opt_a.parallel : 0.000023s : 0.22% optimize.opt_a.flash_sp : 0.000013s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.16% optimize.opt_a.virtual_dataset : 0.000016s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.23% optimize.opt_a.a_after_grad : 0.000023s : 0.21% optimize.opt_a.renormalize : 0.000608s : 5.70% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.24% optimize.opt_a.cse : 0.000054s : 0.51% optimize.opt_a.a_3 : 0.000101s : 0.95% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.38% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000508s : 4.76% optimize.opt_b.b_1 : 0.000153s : 1.43% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.22% optimize.loop_unroll : 0.000437s : 4.10% optimize.opt_after_cconv.c_1 : 0.000038s : 0.35% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.14% optimize.tuple_transform.d_1 : 0.000052s : 0.49% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000054s : 0.50% optimize.cse_after_recomputation.cse : 0.000015s : 0.14% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000003s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000021s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.19% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.03% opt_after_jit_grad : 0.000466s : 4.37% validate : 0.000033s : 0.31% Time group info: ------[substitution.] 0.000195 38 9.76% : 0.000019s : 3: substitution.cast_eliminate 1.06% : 0.000002s : 3: substitution.elim_not_effective 0.83% : 0.000002s : 3: substitution.fold_const_symbol 3.48% : 0.000007s : 5: substitution.graph_param_transform 69.89% : 0.000137s : 4: substitution.inline 2.06% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.00% : 0.000006s : 6: substitution.remove_not_recompute_node 2.17% : 0.000004s : 4: substitution.replace_old_param 7.74% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005355 2 87.63% : 0.004693s : 1: type_inference.infer 12.37% : 0.000662s : 1: type_inference.specialize ------[replace.] 0.000061 8 60.17% : 0.000037s : 4: replace.inline 39.83% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000147 8 91.06% : 0.000134s : 4: match.inline 8.94% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000251 1596 0.97% : 0.000002s : 17: predicate.accumulaten_eliminater 0.63% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.61% : 0.000002s : 10: predicate.addn_check_dump 0.94% : 0.000002s : 17: predicate.addn_zero_filter 0.88% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.03% : 0.000005s : 27: predicate.arithmetic_simplify 1.09% : 0.000003s : 17: predicate.cast_eliminate 0.54% : 0.000001s : 10: predicate.check_bprop_eliminate 0.63% : 0.000002s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.60% : 0.000002s : 10: predicate.depend_value_elim 0.99% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.09% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.99% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 5: predicate.elim_not_effective 0.36% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.23% : 0.000003s : 22: predicate.environ_get_depend_swap 1.84% : 0.000005s : 32: predicate.environ_get_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.47% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.18% : 0.000005s : 25: predicate.float_depend_g_call 0.56% : 0.000001s : 10: predicate.float_environ_get_switch 0.80% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.63% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.63% : 0.000002s : 10: predicate.incorporate_call 0.51% : 0.000001s : 10: predicate.incorporate_call_switch 6.11% : 0.000015s : 72: predicate.inline 0.74% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 10: predicate.less_batch_normalization 1.82% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.68% : 0.000007s : 48: predicate.load_eliminater 0.90% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.04% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.70% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.63% : 0.000002s : 10: predicate.merge_addn 0.58% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 17: predicate.minmaximum_grad 0.77% : 0.000002s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.39% : 0.000001s : 5: predicate.parallel_virtual_node 1.74% : 0.000004s : 25: predicate.partial_defer_inline 1.70% : 0.000004s : 26: predicate.partial_eliminate 0.92% : 0.000002s : 17: predicate.print_const_string_wrapper 0.73% : 0.000002s : 10: predicate.reduce_all_const_elim 1.27% : 0.000003s : 17: predicate.reduce_eliminate 2.57% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 10: predicate.remove_not_recompute_node 1.35% : 0.000003s : 31: predicate.replace_applicator 0.47% : 0.000001s : 10: predicate.replace_old_param 0.28% : 0.000001s : 5: predicate.reset_defer_inline 1.00% : 0.000003s : 17: predicate.reshape_eliminate 0.64% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 5: predicate.row_tensor_eliminate 0.71% : 0.000002s : 10: predicate.same_eliminate 0.40% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.98% : 0.000002s : 10: predicate.shard_identity_eliminate 0.69% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 0.89% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.57% : 0.000004s : 25: predicate.switch_defer_inline 2.11% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.64% : 0.000012s : 76: predicate.switch_simplify 0.93% : 0.000002s : 17: predicate.tile_eliminate 0.99% : 0.000002s : 17: predicate.transpose_eliminate 1.59% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.79% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.54% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.37% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.56% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.82% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.57% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.29% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 5: predicate.value_based_eliminate 0.67% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.61% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.34% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000481 11 55.04% : 0.000265s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.96% : 0.000216s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025193 192 0.01% : 0.000003s : 1: ForceFp32Comm 12.51% : 0.003152s : 1: add_attr 12.47% : 0.003143s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000058s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000068s : 1: auto_monad 0.09% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.76% : 0.000444s : 1: bootstrap 0.33% : 0.000083s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000028s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.09% : 0.000023s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.77% : 0.000445s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.05% : 0.000515s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 5.63% : 0.001418s : 78: opt.transform.opt_a 0.14% : 0.000036s : 1: opt.transform.opt_after_cconv 0.11% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.52% : 0.000131s : 28: opt.transform.opt_b 0.23% : 0.000058s : 2: opt.transform.opt_trans_graph 0.17% : 0.000042s : 4: opt.transform.symbol_engine_opt 11.42% : 0.002876s : 1: opt_a 0.47% : 0.000119s : 1: opt_after_cconv 1.88% : 0.000474s : 1: opt_after_jit_grad 0.95% : 0.000240s : 1: opt_b 19.95% : 0.005026s : 1: optimize 0.08% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000024s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000006s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.14% : 0.000035s : 1: pre_auto_parallel 0.11% : 0.000028s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000018s : 1: remove_dup_value 1.28% : 0.000322s : 1: renormalize.infer 1.11% : 0.000278s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000044s : 1: rewriter_after_opt_a 0.34% : 0.000085s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000083s : 1: symbol_engine_optimizer 0.35% : 0.000088s : 1: tuple_transform 21.51% : 0.005420s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:29.959.046 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:29.959.304 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0149558, [21] [bootstrap]: 0.00041185 [type_inference]: 0.00530594 [event_method]: 1.801e-05 [auto_monad]: 6.254e-05 [graph_reusing]: 5.81e-06 [inline]: 1.81e-06 [add_attr]: 0.00301767, [1] [add_attr_with_inline]: 0.00300852, [1] [Cycle 1]: 6.487e-05, [2] [tag_attr]: 1.819e-05 [meta_addattr_fg_expand]: 5.96e-06 [parallel-infer-symbol]: 2.57001e-06 [pre_auto_parallel]: 3.161e-05 [insert-virtual-dataset]: 2.71e-06 [parallel-infer-symbol-second]: 8.09989e-07 [dataset_repeat_opt]: 2.34001e-06 [pipeline_split]: 1.52999e-06 [optimize]: 0.0050655, [53] [py_interpret_to_execute]: 2.59e-05 [rewriter_before_opt_a]: 8.437e-05 [opt_a]: 0.00291757, [2] [Cycle 1]: 0.00197231, [45] [expand_dump_flag]: 3.03e-06 [switch_simplify]: 4.19e-05 [loop_unroll]: 3.036e-05 [a_1]: 0.00059398 [with_stream_mark]: 1.351e-05 [recompute_prepare]: 8.48999e-06 [updatestate_depend_eliminate]: 3.96001e-06 [updatestate_assign_eliminate]: 3.56001e-06 [updatestate_loads_eliminate]: 3.15002e-06 [parameter_eliminate]: 2.01e-06 [a_2]: 0.00011672 [accelerated_algorithm]: 7.31999e-06 [shard]: 1.91003e-06 [meta_shard_fg_expand]: 1.99e-06 [shard_inline]: 6.95002e-06 [merge_send_recv]: 8.92e-06 [auto_parallel]: 7.4e-06 [parallel]: 1.93e-05 [flash_sp]: 8e-06 [merge_comm]: 3.78999e-06 [allreduce_fusion]: 3.47002e-06 [matmul_add_comm_reduction]: 9.11002e-06 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 7.98001e-06 [virtual_dataset]: 6.86999e-06 [get_grad_eliminate_]: 6.32001e-06 [virtual_output]: 6.75002e-06 [merge_forward]: 3.77002e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 9.56998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.476e-05 [merge_recompute_call_nodes]: 1.77999e-06 [before_grad]: 1.004e-05 [set_forward_comm_id_for_comm_node_pass]: 3.66999e-06 [meta_fg_expand]: 2.84999e-06 [flash_sp_send_recv_attached]: 2.54999e-06 [receive_attached]: 2.34001e-06 [after_resolve]: 1.164e-05 [a_after_grad]: 9.82999e-06 [renormalize]: 0.0004872 [add_forward_monad_depend]: 5.72001e-06 [auto_monad_grad]: 1.79998e-06 [auto_monad_eliminator]: 1.393e-05 [cse]: 1.903e-05 [a_3]: 6.309e-05 [Cycle 2]: 0.00093237, [45] [expand_dump_flag]: 1.04998e-06 [switch_simplify]: 8.2e-06 [loop_unroll]: 6.74001e-06 [a_1]: 0.00013463 [with_stream_mark]: 9.81e-06 [recompute_prepare]: 6.62002e-06 [updatestate_depend_eliminate]: 3.18e-06 [updatestate_assign_eliminate]: 2.67001e-06 [updatestate_loads_eliminate]: 2.38002e-06 [parameter_eliminate]: 1.00001e-06 [a_2]: 0.00010321 [accelerated_algorithm]: 6.53e-06 [shard]: 1.44e-06 [meta_shard_fg_expand]: 1.20001e-06 [shard_inline]: 6.18998e-06 [merge_send_recv]: 4.92e-06 [auto_parallel]: 5.62001e-06 [parallel]: 1.069e-05 [flash_sp]: 3.78999e-06 [merge_comm]: 3.43e-06 [allreduce_fusion]: 3.18998e-06 [matmul_add_comm_reduction]: 5.29e-06 [allreduce_slice_to_reducescatter]: 4.89992e-07 [virtual_shard_identity]: 6.79999e-06 [virtual_dataset]: 6.57002e-06 [get_grad_eliminate_]: 6.13002e-06 [virtual_output]: 6.07001e-06 [merge_forward]: 2.48e-06 [cell_reuse_recompute_pass]: 1.52001e-06 [offload_activation]: 6.04001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.575e-05 [merge_recompute_call_nodes]: 1.08001e-06 [before_grad]: 1.601e-05 [set_forward_comm_id_for_comm_node_pass]: 5.15999e-06 [meta_fg_expand]: 2.39001e-06 [flash_sp_send_recv_attached]: 9.79984e-07 [receive_attached]: 1.25999e-06 [after_resolve]: 1.24e-05 [a_after_grad]: 1.116e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.99e-06 [auto_monad_grad]: 1.09e-06 [auto_monad_eliminator]: 8.58001e-06 [cse]: 1.419e-05 [a_3]: 5.282e-05 [py_interpret_to_execute_after_opt_a]: 1.206e-05 [slice_cell_reuse_recomputed_activation]: 4.83001e-06 [rewriter_after_opt_a]: 3.592e-05 [convert_after_rewriter]: 1.038e-05 [order_py_execute_after_rewriter]: 8.51002e-06 [mutable_eliminate]: 0.00046531 [opt_b]: 0.00026175, [1] [Cycle 1]: 0.0002533, [7] [b_1]: 0.0001648 [b_2]: 8.05e-06 [updatestate_depend_eliminate]: 4.95999e-06 [updatestate_assign_eliminate]: 2.53003e-06 [updatestate_loads_eliminate]: 2.49001e-06 [renormalize]: 3.19997e-07 [cse]: 1.552e-05 [optimize_parallel_all_gather_comm]: 1.897e-05 [overlap_param_gather]: 5.56998e-06 [cconv]: 2.689e-05 [loop_unroll]: 0.00041844 [opt_after_cconv]: 0.00012211, [1] [Cycle 1]: 0.00011369, [7] [c_1]: 3.063e-05 [parameter_eliminate]: 2.45002e-06 [updatestate_depend_eliminate]: 4.87e-06 [updatestate_assign_eliminate]: 2.56e-06 [updatestate_loads_eliminate]: 2.27001e-06 [cse]: 1.573e-05 [renormalize]: 8.50006e-07 [remove_dup_value]: 1.607e-05 [tuple_transform]: 8.864e-05, [1] [Cycle 1]: 8.169e-05, [4] [d_1]: 4.317e-05 [none_parameter_eliminate]: 1.62001e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 7.13e-06 [partial_unused_args_eliminate]: 4.74e-06 [add_recomputation]: 4.466e-05 [cse_after_recomputation]: 2.645e-05, [1] [Cycle 1]: 1.961e-05, [1] [cse]: 1.062e-05 [environ_conv]: 7.65e-06 [swap_dp_allreduce_reducescatter]: 7.55e-06 [bias_add_comm_swap]: 4.87e-06 [label_micro_interleaved_index]: 6.41e-06 [label_fine_grained_interleaved_index]: 4.97999e-06 [merge_cast_opt]: 3.6e-06 [slice_recompute_activation]: 4.13001e-06 [micro_interleaved_order_control]: 4.55999e-06 [assign_add_opt]: 3.43999e-06 [ForceFp32Comm]: 3.09001e-06 [remove_cast_before_assign_add]: 3.26001e-06 [full_micro_interleaved_order_control]: 4.3e-06 [reorder_send_recv_between_fp_bp]: 5.11002e-06 [comm_op_add_attrs]: 3.2e-06 [add_comm_op_reuse_tag]: 3.23e-06 [interleave_split_concat_branches]: 3.43999e-06 [interleave_parallel_branches]: 3.30998e-06 [overlap_opt_shard_in_pipeline]: 3.35e-06 [overlap_opt_shard_grad_in_pipeline]: 4.32e-06 [control_data_broadcast_order]: 1.456e-05 [grouped_pairwise_exchange_alltoall]: 3.83999e-06 [offloading_packed_experts]: 5.99e-06 [overlap_recompute_and_grad_model_parallel]: 7.41001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.54002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.58e-06 [overlap_recompute_comm]: 4.13001e-06 [overlap_grad_ring_attention]: 6.64001e-06 [overlap_grad_flash_sp]: 1.892e-05 [begin_end_overlap_inline]: 3.03e-06 [split_matmul_comm_elemetwise]: 4.55001e-06 [split_layernorm_comm]: 4.06001e-06 [handle_group_info]: 3.63999e-06 [symbol_engine_optimizer]: 9.274e-05, [1] [Cycle 1]: 8.58e-05, [6] [build]: 2.61e-06 [elim_shapecalc]: 9.29e-06 [elim_not_effective]: 1.276e-05 [opt_reshape]: 7.15998e-06 [fold_const_symbol]: 9.83998e-06 [renormalize]: 2.50002e-07 [detach_backward]: 3.23e-06 [pipeline_parallel_scheduler]: 2.04e-06 [auto_monad_reorder]: 1.726e-05 [get_jit_bprop_graph]: 1.09003e-06 [rewriter_after_jit_bprop_graph]: 3.61001e-06 [opt_after_jit_grad]: 0.00046386 [validate]: 3.273e-05 Sums bootstrap : 0.000412s : 4.03% type_inference : 0.005306s : 51.95% event_method : 0.000018s : 0.18% auto_monad : 0.000063s : 0.61% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000032s : 0.31% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000026s : 0.25% optimize.rewriter_before_opt_a : 0.000084s : 0.83% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000050s : 0.49% optimize.opt_a.loop_unroll : 0.000037s : 0.36% optimize.opt_a.a_1 : 0.000729s : 7.13% optimize.opt_a.with_stream_mark : 0.000023s : 0.23% optimize.opt_a.recompute_prepare : 0.000015s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000220s : 2.15% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000030s : 0.29% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.14% optimize.opt_a.virtual_dataset : 0.000013s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000013s : 0.13% optimize.opt_a.merge_forward : 0.000006s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000026s : 0.26% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000024s : 0.24% optimize.opt_a.a_after_grad : 0.000021s : 0.21% optimize.opt_a.renormalize : 0.000487s : 4.77% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.22% optimize.opt_a.cse : 0.000033s : 0.33% optimize.opt_a.a_3 : 0.000116s : 1.13% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000036s : 0.35% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000465s : 4.56% optimize.opt_b.b_1 : 0.000165s : 1.61% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000016s : 0.15% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.19% optimize.overlap_param_gather : 0.000006s : 0.05% optimize.cconv : 0.000027s : 0.26% optimize.loop_unroll : 0.000418s : 4.10% optimize.opt_after_cconv.c_1 : 0.000031s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000016s : 0.15% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000016s : 0.16% optimize.tuple_transform.d_1 : 0.000043s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000045s : 0.44% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000008s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000006s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000003s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000015s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000006s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000004s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000019s : 0.19% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000017s : 0.17% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000464s : 4.54% validate : 0.000033s : 0.32% Time group info: ------[substitution.] 0.000157 28 1.12% : 0.000002s : 2: substitution.elim_not_effective 0.78% : 0.000001s : 2: substitution.fold_const_symbol 3.62% : 0.000006s : 4: substitution.graph_param_transform 75.10% : 0.000118s : 4: substitution.inline 2.44% : 0.000004s : 4: substitution.j_node_and_user_rematch 4.04% : 0.000006s : 4: substitution.remove_not_recompute_node 3.17% : 0.000005s : 4: substitution.replace_old_param 9.73% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005262 2 87.79% : 0.004620s : 1: type_inference.infer 12.21% : 0.000642s : 1: type_inference.specialize ------[replace.] 0.000055 8 59.12% : 0.000032s : 4: replace.inline 40.88% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000128 8 89.62% : 0.000115s : 4: match.inline 10.38% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000205 1278 0.92% : 0.000002s : 13: predicate.accumulaten_eliminater 0.67% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 8: predicate.addn_check_dump 0.93% : 0.000002s : 13: predicate.addn_zero_filter 0.81% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.04% : 0.000004s : 21: predicate.arithmetic_simplify 0.93% : 0.000002s : 13: predicate.cast_eliminate 0.67% : 0.000001s : 8: predicate.check_bprop_eliminate 0.56% : 0.000001s : 8: predicate.compare_switch_simplify 0.20% : 0.000000s : 4: predicate.const_output_eliminate 0.68% : 0.000001s : 8: predicate.depend_value_elim 0.99% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.02% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.03% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.39% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.17% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.14% : 0.000002s : 17: predicate.environ_get_depend_swap 1.74% : 0.000004s : 25: predicate.environ_get_eliminate 1.17% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.45% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.64% : 0.000005s : 21: predicate.float_depend_g_call 0.60% : 0.000001s : 8: predicate.float_environ_get_switch 0.83% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.67% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.61% : 0.000001s : 8: predicate.incorporate_call 0.50% : 0.000001s : 8: predicate.incorporate_call_switch 6.21% : 0.000013s : 58: predicate.inline 0.83% : 0.000002s : 8: predicate.inline_without_move 0.39% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.80% : 0.000002s : 8: predicate.less_batch_normalization 1.85% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.49% : 0.000005s : 38: predicate.load_eliminater 0.84% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.43% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.49% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.58% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.74% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 13: predicate.minmaximum_grad 0.98% : 0.000002s : 4: predicate.mutable_eliminate 0.32% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 1.82% : 0.000004s : 21: predicate.partial_defer_inline 1.67% : 0.000003s : 21: predicate.partial_eliminate 0.91% : 0.000002s : 13: predicate.print_const_string_wrapper 0.63% : 0.000001s : 8: predicate.reduce_all_const_elim 1.20% : 0.000002s : 13: predicate.reduce_eliminate 2.56% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 8: predicate.remove_not_recompute_node 1.39% : 0.000003s : 25: predicate.replace_applicator 0.45% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 0.92% : 0.000002s : 13: predicate.reshape_eliminate 0.56% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 4: predicate.row_tensor_eliminate 0.93% : 0.000002s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.73% : 0.000001s : 8: predicate.shard_identity_eliminate 0.71% : 0.000001s : 8: predicate.special_op_eliminate 0.75% : 0.000002s : 8: predicate.specialize_transform 0.87% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.60% : 0.000003s : 21: predicate.switch_defer_inline 2.10% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.30% : 0.000011s : 67: predicate.switch_simplify 0.86% : 0.000002s : 13: predicate.tile_eliminate 0.96% : 0.000002s : 13: predicate.transpose_eliminate 1.49% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.28% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.44% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.25% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.72% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.41% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.25% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.30% : 0.000001s : 4: predicate.value_based_eliminate 0.67% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000479 11 54.21% : 0.000260s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.79% : 0.000219s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024846 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.18% : 0.003026s : 1: add_attr 12.12% : 0.003012s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.19% : 0.000048s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.29% : 0.000071s : 1: auto_monad 0.10% : 0.000025s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.82% : 0.000451s : 1: bootstrap 0.12% : 0.000030s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000017s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000017s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.11% : 0.000027s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.03% : 0.000007s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000009s : 1: label_micro_interleaved_index 1.71% : 0.000424s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.90% : 0.000471s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.05% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000013s : 1: opt.transform.mutable_eliminate 4.71% : 0.001171s : 78: opt.transform.opt_a 0.12% : 0.000029s : 1: opt.transform.opt_after_cconv 0.10% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.41% : 0.000102s : 28: opt.transform.opt_b 0.19% : 0.000048s : 2: opt.transform.opt_trans_graph 0.14% : 0.000036s : 4: opt.transform.symbol_engine_opt 11.76% : 0.002921s : 1: opt_a 0.51% : 0.000126s : 1: opt_after_cconv 1.91% : 0.000474s : 1: opt_after_jit_grad 1.07% : 0.000265s : 1: opt_b 21.51% : 0.005343s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000022s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.16% : 0.000039s : 1: pre_auto_parallel 0.12% : 0.000030s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000019s : 1: remove_dup_value 0.90% : 0.000223s : 1: renormalize.infer 1.03% : 0.000257s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000039s : 1: rewriter_after_opt_a 0.35% : 0.000088s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000096s : 1: symbol_engine_optimizer 0.37% : 0.000092s : 1: tuple_transform 21.47% : 0.005335s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:30.441.822 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0279849, [21] [bootstrap]: 0.00042892 [type_inference]: 0.0187 [event_method]: 2.234e-05 [auto_monad]: 6.624e-05 [graph_reusing]: 6.04001e-06 [inline]: 2.55997e-06 [add_attr]: 0.0033676, [1] [add_attr_with_inline]: 0.00335908, [1] [Cycle 1]: 6.199e-05, [2] [tag_attr]: 1.929e-05 [meta_addattr_fg_expand]: 6.14001e-06 [parallel-infer-symbol]: 3.36999e-06 [pre_auto_parallel]: 3.267e-05 [insert-virtual-dataset]: 2.67001e-06 [parallel-infer-symbol-second]: 7.79983e-07 [dataset_repeat_opt]: 1.99e-06 [pipeline_split]: 2.09999e-06 [optimize]: 0.00465811, [53] [py_interpret_to_execute]: 2.517e-05 [rewriter_before_opt_a]: 7.924e-05 [opt_a]: 0.00262777, [2] [Cycle 1]: 0.00196922, [45] [expand_dump_flag]: 3.13e-06 [switch_simplify]: 4.324e-05 [loop_unroll]: 3.01e-05 [a_1]: 0.00061016 [with_stream_mark]: 1.507e-05 [recompute_prepare]: 9.13002e-06 [updatestate_depend_eliminate]: 3.67002e-06 [updatestate_assign_eliminate]: 3.35e-06 [updatestate_loads_eliminate]: 3.23e-06 [parameter_eliminate]: 2.06e-06 [a_2]: 8.059e-05 [accelerated_algorithm]: 7.23999e-06 [shard]: 2.02001e-06 [meta_shard_fg_expand]: 2.01e-06 [shard_inline]: 7.08998e-06 [merge_send_recv]: 8.84998e-06 [auto_parallel]: 6.11e-06 [parallel]: 1.774e-05 [flash_sp]: 7.58001e-06 [merge_comm]: 4.03001e-06 [allreduce_fusion]: 3.91999e-06 [matmul_add_comm_reduction]: 9.91998e-06 [allreduce_slice_to_reducescatter]: 1.01002e-06 [virtual_shard_identity]: 8.01001e-06 [virtual_dataset]: 6.56e-06 [get_grad_eliminate_]: 6.32001e-06 [virtual_output]: 6.39001e-06 [merge_forward]: 3.54002e-06 [cell_reuse_recompute_pass]: 1.12e-06 [offload_activation]: 1.001e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.257e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.076e-05 [set_forward_comm_id_for_comm_node_pass]: 3.50998e-06 [meta_fg_expand]: 2.84001e-06 [flash_sp_send_recv_attached]: 2.64001e-06 [receive_attached]: 3.28e-06 [after_resolve]: 1.165e-05 [a_after_grad]: 9.52999e-06 [renormalize]: 0.00064532 [add_forward_monad_depend]: 5.25999e-06 [auto_monad_grad]: 2.19001e-06 [auto_monad_eliminator]: 1.487e-05 [cse]: 2.917e-05 [a_3]: 4.707e-05 [Cycle 2]: 0.00064798, [45] [expand_dump_flag]: 1.02e-06 [switch_simplify]: 7.31001e-06 [loop_unroll]: 6.31998e-06 [a_1]: 0.00013111 [with_stream_mark]: 1.188e-05 [recompute_prepare]: 7.20998e-06 [updatestate_depend_eliminate]: 3.04999e-06 [updatestate_assign_eliminate]: 2.17999e-06 [updatestate_loads_eliminate]: 2.39999e-06 [parameter_eliminate]: 1.19e-06 [a_2]: 7.122e-05 [accelerated_algorithm]: 6.02999e-06 [shard]: 1.05999e-06 [meta_shard_fg_expand]: 1.23002e-06 [shard_inline]: 5.99e-06 [merge_send_recv]: 5.89999e-06 [auto_parallel]: 5.27001e-06 [parallel]: 4.71002e-06 [flash_sp]: 3.50003e-06 [merge_comm]: 3.4e-06 [allreduce_fusion]: 3.15998e-06 [matmul_add_comm_reduction]: 5.27999e-06 [allreduce_slice_to_reducescatter]: 4.59986e-07 [virtual_shard_identity]: 6.93e-06 [virtual_dataset]: 6.11e-06 [get_grad_eliminate_]: 5.76998e-06 [virtual_output]: 5.53002e-06 [merge_forward]: 3.29001e-06 [cell_reuse_recompute_pass]: 1.48002e-06 [offload_activation]: 6.51e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.321e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 1.057e-05 [set_forward_comm_id_for_comm_node_pass]: 3.3e-06 [meta_fg_expand]: 2.06e-06 [flash_sp_send_recv_attached]: 9.70002e-07 [receive_attached]: 1.26002e-06 [after_resolve]: 1.223e-05 [a_after_grad]: 9.05001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.09e-06 [auto_monad_grad]: 1.09e-06 [auto_monad_eliminator]: 7.38e-06 [cse]: 1.393e-05 [a_3]: 3.716e-05 [py_interpret_to_execute_after_opt_a]: 1.018e-05 [slice_cell_reuse_recomputed_activation]: 2.24001e-06 [rewriter_after_opt_a]: 3.539e-05 [convert_after_rewriter]: 7.12002e-06 [order_py_execute_after_rewriter]: 5.12e-06 [mutable_eliminate]: 0.00051491 [opt_b]: 0.00022864, [1] [Cycle 1]: 0.00022237, [7] [b_1]: 0.00014819 [b_2]: 8.20999e-06 [updatestate_depend_eliminate]: 6.04999e-06 [updatestate_assign_eliminate]: 2.31998e-06 [updatestate_loads_eliminate]: 2.20002e-06 [renormalize]: 4.50003e-07 [cse]: 1.778e-05 [optimize_parallel_all_gather_comm]: 1.629e-05 [overlap_param_gather]: 1.84e-06 [cconv]: 2.731e-05 [loop_unroll]: 0.00042512 [opt_after_cconv]: 0.00010237, [1] [Cycle 1]: 9.688e-05, [7] [c_1]: 3.097e-05 [parameter_eliminate]: 3.03e-06 [updatestate_depend_eliminate]: 6.16e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.24001e-06 [cse]: 1.771e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.233e-05 [tuple_transform]: 7.412e-05, [1] [Cycle 1]: 6.984e-05, [4] [d_1]: 4.318e-05 [none_parameter_eliminate]: 1.81e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 6.58e-06 [partial_unused_args_eliminate]: 1.81998e-06 [add_recomputation]: 4.734e-05 [cse_after_recomputation]: 2.256e-05, [1] [Cycle 1]: 1.777e-05, [1] [cse]: 1.199e-05 [environ_conv]: 5.20999e-06 [swap_dp_allreduce_reducescatter]: 5.64e-06 [bias_add_comm_swap]: 2.84999e-06 [label_micro_interleaved_index]: 4.14002e-06 [label_fine_grained_interleaved_index]: 2.84999e-06 [merge_cast_opt]: 1.27e-06 [slice_recompute_activation]: 2.05002e-06 [micro_interleaved_order_control]: 2.36e-06 [assign_add_opt]: 1.19998e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 1.05999e-06 [full_micro_interleaved_order_control]: 2.24999e-06 [reorder_send_recv_between_fp_bp]: 2.69001e-06 [comm_op_add_attrs]: 9.99979e-07 [add_comm_op_reuse_tag]: 9.29984e-07 [interleave_split_concat_branches]: 1.27999e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.20001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.27001e-06 [control_data_broadcast_order]: 1.265e-05 [grouped_pairwise_exchange_alltoall]: 1.78002e-06 [offloading_packed_experts]: 3.51001e-06 [overlap_recompute_and_grad_model_parallel]: 5.19e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.69001e-06 [overlap_grad_ring_attention]: 4.09002e-06 [overlap_grad_flash_sp]: 1.873e-05 [begin_end_overlap_inline]: 4.69998e-07 [split_matmul_comm_elemetwise]: 2.21e-06 [split_layernorm_comm]: 1.55999e-06 [handle_group_info]: 1.05001e-06 [symbol_engine_optimizer]: 7.634e-05, [1] [Cycle 1]: 7.226e-05, [6] [build]: 3.31999e-06 [elim_shapecalc]: 1.042e-05 [elim_not_effective]: 1.265e-05 [opt_reshape]: 6.96001e-06 [fold_const_symbol]: 1.018e-05 [renormalize]: 1.60013e-07 [detach_backward]: 2.33998e-06 [pipeline_parallel_scheduler]: 1.86e-06 [auto_monad_reorder]: 1.691e-05 [get_jit_bprop_graph]: 1.55999e-06 [rewriter_after_jit_bprop_graph]: 4.43001e-06 [opt_after_jit_grad]: 0.0004704 [validate]: 3.968e-05 Sums bootstrap : 0.000429s : 1.81% type_inference : 0.018700s : 79.02% event_method : 0.000022s : 0.09% auto_monad : 0.000066s : 0.28% graph_reusing : 0.000006s : 0.03% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000033s : 0.14% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000025s : 0.11% optimize.rewriter_before_opt_a : 0.000079s : 0.33% optimize.opt_a.expand_dump_flag : 0.000004s : 0.02% optimize.opt_a.switch_simplify : 0.000051s : 0.21% optimize.opt_a.loop_unroll : 0.000036s : 0.15% optimize.opt_a.a_1 : 0.000741s : 3.13% optimize.opt_a.with_stream_mark : 0.000027s : 0.11% optimize.opt_a.recompute_prepare : 0.000016s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000152s : 0.64% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.06% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.06% optimize.opt_a.merge_send_recv : 0.000015s : 0.06% optimize.opt_a.auto_parallel : 0.000011s : 0.05% optimize.opt_a.parallel : 0.000022s : 0.09% optimize.opt_a.flash_sp : 0.000011s : 0.05% optimize.opt_a.merge_comm : 0.000007s : 0.03% optimize.opt_a.allreduce_fusion : 0.000007s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.06% optimize.opt_a.virtual_dataset : 0.000013s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.05% optimize.opt_a.virtual_output : 0.000012s : 0.05% optimize.opt_a.merge_forward : 0.000007s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000017s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000021s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.03% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000024s : 0.10% optimize.opt_a.a_after_grad : 0.000019s : 0.08% optimize.opt_a.renormalize : 0.000645s : 2.73% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.03% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.09% optimize.opt_a.cse : 0.000043s : 0.18% optimize.opt_a.a_3 : 0.000084s : 0.36% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000035s : 0.15% optimize.convert_after_rewriter : 0.000007s : 0.03% optimize.order_py_execute_after_rewriter : 0.000005s : 0.02% optimize.mutable_eliminate : 0.000515s : 2.18% optimize.opt_b.b_1 : 0.000148s : 0.63% optimize.opt_b.b_2 : 0.000008s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000027s : 0.12% optimize.loop_unroll : 0.000425s : 1.80% optimize.opt_after_cconv.c_1 : 0.000031s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000018s : 0.07% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000012s : 0.05% optimize.tuple_transform.d_1 : 0.000043s : 0.18% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000047s : 0.20% optimize.cse_after_recomputation.cse : 0.000012s : 0.05% optimize.environ_conv : 0.000005s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000013s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000019s : 0.08% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000017s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000470s : 1.99% validate : 0.000040s : 0.17% Time group info: ------[substitution.] 0.000175 28 1.02% : 0.000002s : 2: substitution.elim_not_effective 0.99% : 0.000002s : 2: substitution.fold_const_symbol 3.37% : 0.000006s : 4: substitution.graph_param_transform 78.75% : 0.000138s : 4: substitution.inline 2.60% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.68% : 0.000005s : 4: substitution.remove_not_recompute_node 2.43% : 0.000004s : 4: substitution.replace_old_param 8.16% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.018629 2 95.47% : 0.017785s : 1: type_inference.infer 4.53% : 0.000844s : 1: type_inference.specialize ------[replace.] 0.000060 8 61.86% : 0.000037s : 4: replace.inline 38.14% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000148 8 91.66% : 0.000135s : 4: match.inline 8.34% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000197 1278 0.91% : 0.000002s : 13: predicate.accumulaten_eliminater 0.75% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 0.97% : 0.000002s : 13: predicate.addn_zero_filter 0.84% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.96% : 0.000004s : 21: predicate.arithmetic_simplify 0.98% : 0.000002s : 13: predicate.cast_eliminate 0.60% : 0.000001s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.20% : 0.000000s : 4: predicate.const_output_eliminate 0.57% : 0.000001s : 8: predicate.depend_value_elim 0.94% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.00% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.05% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.38% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.11% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.16% : 0.000002s : 17: predicate.environ_get_depend_swap 1.76% : 0.000003s : 25: predicate.environ_get_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.49% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.48% : 0.000005s : 21: predicate.float_depend_g_call 0.52% : 0.000001s : 8: predicate.float_environ_get_switch 0.77% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.61% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.50% : 0.000001s : 8: predicate.incorporate_call_switch 6.26% : 0.000012s : 58: predicate.inline 0.71% : 0.000001s : 8: predicate.inline_without_move 0.33% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.80% : 0.000002s : 8: predicate.less_batch_normalization 1.82% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.67% : 0.000005s : 38: predicate.load_eliminater 0.83% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.44% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.61% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.62% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.65% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 13: predicate.minmaximum_grad 0.96% : 0.000002s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.34% : 0.000001s : 4: predicate.parallel_virtual_node 1.80% : 0.000004s : 21: predicate.partial_defer_inline 1.72% : 0.000003s : 21: predicate.partial_eliminate 0.97% : 0.000002s : 13: predicate.print_const_string_wrapper 0.64% : 0.000001s : 8: predicate.reduce_all_const_elim 1.17% : 0.000002s : 13: predicate.reduce_eliminate 2.53% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 8: predicate.remove_not_recompute_node 1.42% : 0.000003s : 25: predicate.replace_applicator 0.56% : 0.000001s : 8: predicate.replace_old_param 0.30% : 0.000001s : 4: predicate.reset_defer_inline 0.97% : 0.000002s : 13: predicate.reshape_eliminate 0.63% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.68% : 0.000001s : 8: predicate.same_eliminate 0.47% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.75% : 0.000001s : 8: predicate.shard_identity_eliminate 0.70% : 0.000001s : 8: predicate.special_op_eliminate 0.74% : 0.000001s : 8: predicate.specialize_transform 0.91% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.83% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.55% : 0.000003s : 21: predicate.switch_defer_inline 2.14% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.28% : 0.000010s : 67: predicate.switch_simplify 0.89% : 0.000002s : 13: predicate.tile_eliminate 0.95% : 0.000002s : 13: predicate.transpose_eliminate 1.56% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.30% : 0.000006s : 33: predicate.tuple_list_get_item_eliminator 1.58% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.34% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.74% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.48% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.15% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 4: predicate.value_based_eliminate 0.63% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 8: predicate.virtual_output_eliminate 0.30% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.56% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000563 11 47.12% : 0.000265s : 5: func_graph_cloner_run.FuncGraphClonerGraph 52.88% : 0.000298s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.037956 192 0.01% : 0.000004s : 1: ForceFp32Comm 8.89% : 0.003373s : 1: add_attr 8.86% : 0.003363s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.14% : 0.000051s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.19% : 0.000071s : 1: auto_monad 0.05% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.20% : 0.000455s : 1: bootstrap 0.08% : 0.000031s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000016s : 1: control_data_broadcast_order 0.03% : 0.000010s : 1: convert_after_rewriter 0.07% : 0.000026s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000008s : 1: environ_conv 0.08% : 0.000029s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.14% : 0.000433s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.38% : 0.000523s : 1: mutable_eliminate 0.02% : 0.000006s : 1: offloading_packed_experts 0.03% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000015s : 1: opt.transform.mutable_eliminate 3.06% : 0.001160s : 78: opt.transform.opt_a 0.08% : 0.000030s : 1: opt.transform.opt_after_cconv 0.07% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.27% : 0.000101s : 28: opt.transform.opt_b 0.13% : 0.000048s : 2: opt.transform.opt_trans_graph 0.10% : 0.000037s : 4: opt.transform.symbol_engine_opt 6.93% : 0.002631s : 1: opt_a 0.28% : 0.000106s : 1: opt_after_cconv 1.26% : 0.000479s : 1: opt_after_jit_grad 0.61% : 0.000232s : 1: opt_b 12.28% : 0.004663s : 1: optimize 0.05% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000008s : 1: order_py_execute_after_rewriter 0.06% : 0.000022s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.10% : 0.000037s : 1: pre_auto_parallel 0.08% : 0.000029s : 1: py_interpret_to_execute 0.04% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000016s : 1: remove_dup_value 0.94% : 0.000356s : 1: renormalize.infer 0.74% : 0.000280s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000039s : 1: rewriter_after_opt_a 0.22% : 0.000083s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.21% : 0.000079s : 1: symbol_engine_optimizer 0.20% : 0.000077s : 1: tuple_transform 49.33% : 0.018724s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:30.854.826 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:30.855.092 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0381805, [21] [bootstrap]: 0.0208125 [type_inference]: 0.00581466 [event_method]: 2.093e-05 [auto_monad]: 6.851e-05 [graph_reusing]: 6.17999e-06 [inline]: 2.21998e-06 [add_attr]: 0.00338259, [1] [add_attr_with_inline]: 0.00337162, [1] [Cycle 1]: 8.468e-05, [2] [tag_attr]: 2.064e-05 [meta_addattr_fg_expand]: 6.29999e-06 [parallel-infer-symbol]: 3.26001e-06 [pre_auto_parallel]: 3.763e-05 [insert-virtual-dataset]: 2.64001e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 2.12001e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.00673716, [53] [py_interpret_to_execute]: 3.281e-05 [rewriter_before_opt_a]: 9.535e-05 [opt_a]: 0.00378168, [2] [Cycle 1]: 0.00272805, [45] [expand_dump_flag]: 2.83e-06 [switch_simplify]: 4.503e-05 [loop_unroll]: 3.151e-05 [a_1]: 0.00077264 [with_stream_mark]: 2.204e-05 [recompute_prepare]: 1.331e-05 [updatestate_depend_eliminate]: 5.94e-06 [updatestate_assign_eliminate]: 3.98001e-06 [updatestate_loads_eliminate]: 3.94997e-06 [parameter_eliminate]: 2.92002e-06 [a_2]: 0.00013188 [accelerated_algorithm]: 9.12001e-06 [shard]: 2.18002e-06 [meta_shard_fg_expand]: 2.16e-06 [shard_inline]: 8.59e-06 [merge_send_recv]: 1.172e-05 [auto_parallel]: 9.14e-06 [parallel]: 2.13e-05 [flash_sp]: 9.98998e-06 [merge_comm]: 5.81e-06 [allreduce_fusion]: 4.56002e-06 [matmul_add_comm_reduction]: 1.071e-05 [allreduce_slice_to_reducescatter]: 6.79982e-07 [virtual_shard_identity]: 1.324e-05 [virtual_dataset]: 9.40001e-06 [get_grad_eliminate_]: 8.18999e-06 [virtual_output]: 8.25e-06 [merge_forward]: 5.46e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 1.233e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.323e-05 [merge_recompute_call_nodes]: 1.65001e-06 [before_grad]: 1.486e-05 [set_forward_comm_id_for_comm_node_pass]: 6.68998e-06 [meta_fg_expand]: 3.86999e-06 [flash_sp_send_recv_attached]: 2.86e-06 [receive_attached]: 2.01e-06 [after_resolve]: 3.088e-05 [a_after_grad]: 1.389e-05 [renormalize]: 0.0008293 [add_forward_monad_depend]: 8.15999e-06 [auto_monad_grad]: 2.65997e-06 [auto_monad_eliminator]: 2.098e-05 [cse]: 3.706e-05 [a_3]: 7.693e-05 [Cycle 2]: 0.00103629, [45] [expand_dump_flag]: 1.90001e-06 [switch_simplify]: 1.049e-05 [loop_unroll]: 7.42998e-06 [a_1]: 0.0001818 [with_stream_mark]: 1.741e-05 [recompute_prepare]: 8.90001e-06 [updatestate_depend_eliminate]: 5.14e-06 [updatestate_assign_eliminate]: 3.61001e-06 [updatestate_loads_eliminate]: 3.16001e-06 [parameter_eliminate]: 2.82002e-06 [a_2]: 0.00011977 [accelerated_algorithm]: 9.49999e-06 [shard]: 2.12999e-06 [meta_shard_fg_expand]: 2.29999e-06 [shard_inline]: 7.40998e-06 [merge_send_recv]: 8.05999e-06 [auto_parallel]: 8.03001e-06 [parallel]: 6.67002e-06 [flash_sp]: 3.55003e-06 [merge_comm]: 5.11002e-06 [allreduce_fusion]: 4.47e-06 [matmul_add_comm_reduction]: 9.47001e-06 [allreduce_slice_to_reducescatter]: 5.19998e-07 [virtual_shard_identity]: 1.037e-05 [virtual_dataset]: 7.92e-06 [get_grad_eliminate_]: 7.41999e-06 [virtual_output]: 7.11999e-06 [merge_forward]: 4.77998e-06 [cell_reuse_recompute_pass]: 2.04999e-06 [offload_activation]: 1.046e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.107e-05 [merge_recompute_call_nodes]: 1.40999e-06 [before_grad]: 1.547e-05 [set_forward_comm_id_for_comm_node_pass]: 4.70001e-06 [meta_fg_expand]: 3.10002e-06 [flash_sp_send_recv_attached]: 1.92999e-06 [receive_attached]: 1.64e-06 [after_resolve]: 1.56e-05 [a_after_grad]: 1.235e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 3.6e-06 [auto_monad_grad]: 1.50999e-06 [auto_monad_eliminator]: 1.283e-05 [cse]: 2.306e-05 [a_3]: 6.151e-05 [py_interpret_to_execute_after_opt_a]: 1.891e-05 [slice_cell_reuse_recomputed_activation]: 5.04e-06 [rewriter_after_opt_a]: 5.241e-05 [convert_after_rewriter]: 1.128e-05 [order_py_execute_after_rewriter]: 9.12001e-06 [mutable_eliminate]: 0.00063264 [opt_b]: 0.00033243, [1] [Cycle 1]: 0.0003216, [7] [b_1]: 0.00019884 [b_2]: 1.041e-05 [updatestate_depend_eliminate]: 1.024e-05 [updatestate_assign_eliminate]: 3.71999e-06 [updatestate_loads_eliminate]: 3.57002e-06 [renormalize]: 9.30013e-07 [cse]: 2.99e-05 [optimize_parallel_all_gather_comm]: 2.419e-05 [overlap_param_gather]: 5.40001e-06 [cconv]: 3.707e-05 [loop_unroll]: 0.00048706 [opt_after_cconv]: 0.00016906, [1] [Cycle 1]: 0.00015852, [7] [c_1]: 3.798e-05 [parameter_eliminate]: 5.76998e-06 [updatestate_depend_eliminate]: 8.33999e-06 [updatestate_assign_eliminate]: 3.30003e-06 [updatestate_loads_eliminate]: 3.36001e-06 [cse]: 2.741e-05 [renormalize]: 5.19998e-07 [remove_dup_value]: 1.914e-05 [tuple_transform]: 0.00011488, [1] [Cycle 1]: 0.00010682, [4] [d_1]: 5.847e-05 [none_parameter_eliminate]: 2.19999e-06 [renormalize]: 3.19997e-07 [switch_simplify]: 9.96e-06 [partial_unused_args_eliminate]: 4.78001e-06 [add_recomputation]: 7.043e-05 [cse_after_recomputation]: 4.077e-05, [1] [Cycle 1]: 3.256e-05, [1] [cse]: 1.939e-05 [environ_conv]: 1.059e-05 [swap_dp_allreduce_reducescatter]: 1.03e-05 [bias_add_comm_swap]: 5.24e-06 [label_micro_interleaved_index]: 7.43e-06 [label_fine_grained_interleaved_index]: 5.22e-06 [merge_cast_opt]: 3.76001e-06 [slice_recompute_activation]: 4.45999e-06 [micro_interleaved_order_control]: 5.02e-06 [assign_add_opt]: 3.5e-06 [ForceFp32Comm]: 3.4e-06 [remove_cast_before_assign_add]: 3.30998e-06 [full_micro_interleaved_order_control]: 5.00999e-06 [reorder_send_recv_between_fp_bp]: 5.47001e-06 [comm_op_add_attrs]: 3.48999e-06 [add_comm_op_reuse_tag]: 3.26999e-06 [interleave_split_concat_branches]: 3.56001e-06 [interleave_parallel_branches]: 3.4e-06 [overlap_opt_shard_in_pipeline]: 4.03999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.33001e-06 [control_data_broadcast_order]: 2.098e-05 [grouped_pairwise_exchange_alltoall]: 4.37998e-06 [offloading_packed_experts]: 7.36999e-06 [overlap_recompute_and_grad_model_parallel]: 9.12999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.86999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.64002e-06 [overlap_recompute_comm]: 5.09998e-06 [overlap_grad_ring_attention]: 7.05e-06 [overlap_grad_flash_sp]: 2.786e-05 [begin_end_overlap_inline]: 3.16001e-06 [split_matmul_comm_elemetwise]: 4.87e-06 [split_layernorm_comm]: 4.2e-06 [handle_group_info]: 3.80998e-06 [symbol_engine_optimizer]: 0.00035427, [1] [Cycle 1]: 0.00034513, [6] [build]: 4.84e-06 [elim_shapecalc]: 1.578e-05 [elim_not_effective]: 1.701e-05 [opt_reshape]: 1.835e-05 [fold_const_symbol]: 1.773e-05 [renormalize]: 2.19996e-07 [detach_backward]: 4.63999e-06 [pipeline_parallel_scheduler]: 1.86e-06 [auto_monad_reorder]: 2.638e-05 [get_jit_bprop_graph]: 1.82999e-06 [rewriter_after_jit_bprop_graph]: 6.47001e-06 [opt_after_jit_grad]: 0.00056584 [validate]: 4.9e-05 Sums bootstrap : 0.020812s : 63.85% type_inference : 0.005815s : 17.84% event_method : 0.000021s : 0.06% auto_monad : 0.000069s : 0.21% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000038s : 0.12% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.10% optimize.rewriter_before_opt_a : 0.000095s : 0.29% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000056s : 0.17% optimize.opt_a.loop_unroll : 0.000039s : 0.12% optimize.opt_a.a_1 : 0.000954s : 2.93% optimize.opt_a.with_stream_mark : 0.000039s : 0.12% optimize.opt_a.recompute_prepare : 0.000022s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000006s : 0.02% optimize.opt_a.a_2 : 0.000252s : 0.77% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.06% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.05% optimize.opt_a.merge_send_recv : 0.000020s : 0.06% optimize.opt_a.auto_parallel : 0.000017s : 0.05% optimize.opt_a.parallel : 0.000028s : 0.09% optimize.opt_a.flash_sp : 0.000014s : 0.04% optimize.opt_a.merge_comm : 0.000011s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.07% optimize.opt_a.virtual_dataset : 0.000017s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.05% optimize.opt_a.virtual_output : 0.000015s : 0.05% optimize.opt_a.merge_forward : 0.000010s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000044s : 0.14% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000030s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000046s : 0.14% optimize.opt_a.a_after_grad : 0.000026s : 0.08% optimize.opt_a.renormalize : 0.000829s : 2.54% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.10% optimize.opt_a.cse : 0.000060s : 0.18% optimize.opt_a.a_3 : 0.000138s : 0.42% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000052s : 0.16% optimize.convert_after_rewriter : 0.000011s : 0.03% optimize.order_py_execute_after_rewriter : 0.000009s : 0.03% optimize.mutable_eliminate : 0.000633s : 1.94% optimize.opt_b.b_1 : 0.000199s : 0.61% optimize.opt_b.b_2 : 0.000010s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.07% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000037s : 0.11% optimize.loop_unroll : 0.000487s : 1.49% optimize.opt_after_cconv.c_1 : 0.000038s : 0.12% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000027s : 0.08% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.06% optimize.tuple_transform.d_1 : 0.000058s : 0.18% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000070s : 0.22% optimize.cse_after_recomputation.cse : 0.000019s : 0.06% optimize.environ_conv : 0.000011s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.03% optimize.bias_add_comm_swap : 0.000005s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000004s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000003s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000021s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000028s : 0.09% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000018s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000018s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000026s : 0.08% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000566s : 1.74% validate : 0.000049s : 0.15% Time group info: ------[substitution.] 0.000247 38 11.42% : 0.000028s : 3: substitution.cast_eliminate 0.82% : 0.000002s : 3: substitution.elim_not_effective 0.81% : 0.000002s : 3: substitution.fold_const_symbol 2.91% : 0.000007s : 5: substitution.graph_param_transform 64.22% : 0.000158s : 4: substitution.inline 2.29% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.67% : 0.000007s : 6: substitution.remove_not_recompute_node 8.08% : 0.000020s : 4: substitution.replace_old_param 6.78% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005759 2 87.43% : 0.005035s : 1: type_inference.infer 12.57% : 0.000724s : 1: type_inference.specialize ------[replace.] 0.000069 8 57.23% : 0.000040s : 4: replace.inline 42.77% : 0.000030s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000171 8 91.37% : 0.000156s : 4: match.inline 8.63% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000264 1596 0.94% : 0.000002s : 17: predicate.accumulaten_eliminater 1.01% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.63% : 0.000002s : 10: predicate.addn_check_dump 1.04% : 0.000003s : 17: predicate.addn_zero_filter 0.84% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.96% : 0.000005s : 27: predicate.arithmetic_simplify 0.96% : 0.000003s : 17: predicate.cast_eliminate 0.64% : 0.000002s : 10: predicate.check_bprop_eliminate 0.64% : 0.000002s : 10: predicate.compare_switch_simplify 0.25% : 0.000001s : 5: predicate.const_output_eliminate 0.64% : 0.000002s : 10: predicate.depend_value_elim 0.96% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.98% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.06% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.29% : 0.000001s : 5: predicate.elim_not_effective 0.48% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.08% : 0.000003s : 22: predicate.environ_get_depend_swap 1.65% : 0.000004s : 32: predicate.environ_get_eliminate 1.07% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.36% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.57% : 0.000007s : 25: predicate.float_depend_g_call 0.52% : 0.000001s : 10: predicate.float_environ_get_switch 0.81% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 5: predicate.fold_const_symbol 0.62% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.58% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.42% : 0.000017s : 72: predicate.inline 1.06% : 0.000003s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 10: predicate.less_batch_normalization 1.80% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.47% : 0.000007s : 48: predicate.load_eliminater 0.85% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.89% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 10: predicate.merge_addn 0.61% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 17: predicate.minmaximum_grad 1.15% : 0.000003s : 5: predicate.mutable_eliminate 0.72% : 0.000002s : 5: predicate.opt_reshape 0.32% : 0.000001s : 5: predicate.parallel_virtual_node 1.58% : 0.000004s : 25: predicate.partial_defer_inline 1.64% : 0.000004s : 26: predicate.partial_eliminate 0.88% : 0.000002s : 17: predicate.print_const_string_wrapper 0.55% : 0.000001s : 10: predicate.reduce_all_const_elim 1.12% : 0.000003s : 17: predicate.reduce_eliminate 2.49% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 10: predicate.remove_not_recompute_node 1.37% : 0.000004s : 31: predicate.replace_applicator 0.60% : 0.000002s : 10: predicate.replace_old_param 0.32% : 0.000001s : 5: predicate.reset_defer_inline 0.92% : 0.000002s : 17: predicate.reshape_eliminate 0.61% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 5: predicate.row_tensor_eliminate 0.95% : 0.000003s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 10: predicate.shard_identity_eliminate 0.61% : 0.000002s : 10: predicate.special_op_eliminate 0.78% : 0.000002s : 10: predicate.specialize_transform 0.98% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.41% : 0.000004s : 25: predicate.switch_defer_inline 1.94% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.79% : 0.000013s : 76: predicate.switch_simplify 0.89% : 0.000002s : 17: predicate.tile_eliminate 0.92% : 0.000002s : 17: predicate.transpose_eliminate 1.51% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.69% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.56% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.51% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.68% : 0.000004s : 31: predicate.tuple_to_list_eliminator_ 2.37% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.15% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 5: predicate.value_based_eliminate 0.84% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.58% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.33% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000559 11 52.16% : 0.000292s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.84% : 0.000268s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.050877 192 0.01% : 0.000006s : 1: ForceFp32Comm 6.67% : 0.003392s : 1: add_attr 6.64% : 0.003376s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.15% : 0.000076s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.15% : 0.000079s : 1: auto_monad 0.07% : 0.000034s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 41.01% : 0.020865s : 1: bootstrap 0.08% : 0.000041s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.05% : 0.000025s : 1: control_data_broadcast_order 0.03% : 0.000015s : 1: convert_after_rewriter 0.09% : 0.000044s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.05% : 0.000023s : 1: detach_backward 0.03% : 0.000014s : 1: environ_conv 0.06% : 0.000032s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000011s : 1: label_micro_interleaved_index 0.97% : 0.000495s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.26% : 0.000640s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.04% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000022s : 1: opt.transform.mutable_eliminate 3.00% : 0.001528s : 78: opt.transform.opt_a 0.07% : 0.000036s : 1: opt.transform.opt_after_cconv 0.07% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.26% : 0.000135s : 28: opt.transform.opt_b 0.13% : 0.000066s : 2: opt.transform.opt_trans_graph 0.12% : 0.000062s : 4: opt.transform.symbol_engine_opt 7.44% : 0.003785s : 1: opt_a 0.34% : 0.000173s : 1: opt_after_cconv 1.14% : 0.000578s : 1: opt_after_jit_grad 0.66% : 0.000337s : 1: opt_b 13.96% : 0.007102s : 1: optimize 0.06% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.06% : 0.000032s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.09% : 0.000045s : 1: pre_auto_parallel 0.07% : 0.000037s : 1: py_interpret_to_execute 0.05% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.04% : 0.000023s : 1: remove_dup_value 0.90% : 0.000459s : 1: renormalize.infer 0.71% : 0.000360s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000058s : 1: rewriter_after_opt_a 0.20% : 0.000100s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.03% : 0.000014s : 1: swap_dp_allreduce_reducescatter 0.70% : 0.000357s : 1: symbol_engine_optimizer 0.23% : 0.000118s : 1: tuple_transform 11.51% : 0.005854s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:31.243.097 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0169753, [21] [bootstrap]: 0.00042208 [type_inference]: 0.0066844 [event_method]: 1.864e-05 [auto_monad]: 6.44e-05 [graph_reusing]: 5.71e-06 [inline]: 2.36e-06 [add_attr]: 0.0030342, [1] [add_attr_with_inline]: 0.00302582, [1] [Cycle 1]: 5.922e-05, [2] [tag_attr]: 2.011e-05 [meta_addattr_fg_expand]: 6.14001e-06 [parallel-infer-symbol]: 3.68e-06 [pre_auto_parallel]: 3.326e-05 [insert-virtual-dataset]: 2.71e-06 [parallel-infer-symbol-second]: 1.07e-06 [dataset_repeat_opt]: 1.92001e-06 [pipeline_split]: 1.99999e-06 [optimize]: 0.00600957, [53] [py_interpret_to_execute]: 2.473e-05 [rewriter_before_opt_a]: 8.795e-05 [opt_a]: 0.00384071, [2] [Cycle 1]: 0.00216033, [45] [expand_dump_flag]: 2.73e-06 [switch_simplify]: 4.817e-05 [loop_unroll]: 4.908e-05 [a_1]: 0.00073232 [with_stream_mark]: 1.625e-05 [recompute_prepare]: 1.106e-05 [updatestate_depend_eliminate]: 4.77e-06 [updatestate_assign_eliminate]: 3.94002e-06 [updatestate_loads_eliminate]: 3.88999e-06 [parameter_eliminate]: 1.87001e-06 [a_2]: 0.00010218 [accelerated_algorithm]: 8.03999e-06 [shard]: 1.79998e-06 [meta_shard_fg_expand]: 1.91e-06 [shard_inline]: 7.86001e-06 [merge_send_recv]: 9.21002e-06 [auto_parallel]: 7.46001e-06 [parallel]: 1.901e-05 [flash_sp]: 9.06998e-06 [merge_comm]: 4.98001e-06 [allreduce_fusion]: 4.25999e-06 [matmul_add_comm_reduction]: 1.023e-05 [allreduce_slice_to_reducescatter]: 6.79982e-07 [virtual_shard_identity]: 8.74e-06 [virtual_dataset]: 7.74002e-06 [get_grad_eliminate_]: 8.04997e-06 [virtual_output]: 7.58001e-06 [merge_forward]: 4.68999e-06 [cell_reuse_recompute_pass]: 1.27e-06 [offload_activation]: 1.129e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.529e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.336e-05 [set_forward_comm_id_for_comm_node_pass]: 4.18001e-06 [meta_fg_expand]: 3.28e-06 [flash_sp_send_recv_attached]: 2.78e-06 [receive_attached]: 2.34999e-06 [after_resolve]: 1.307e-05 [a_after_grad]: 1.154e-05 [renormalize]: 0.0006199 [add_forward_monad_depend]: 5.71e-06 [auto_monad_grad]: 2.73e-06 [auto_monad_eliminator]: 1.586e-05 [cse]: 3.61e-05 [a_3]: 5.613e-05 [Cycle 2]: 0.00166934, [45] [expand_dump_flag]: 2.05002e-06 [switch_simplify]: 9.61e-06 [loop_unroll]: 7.51001e-06 [a_1]: 0.00017368 [with_stream_mark]: 1.281e-05 [recompute_prepare]: 7.87e-06 [updatestate_depend_eliminate]: 3.56001e-06 [updatestate_assign_eliminate]: 2.83e-06 [updatestate_loads_eliminate]: 2.94001e-06 [parameter_eliminate]: 1.20001e-06 [a_2]: 9.126e-05 [accelerated_algorithm]: 7.57002e-06 [shard]: 1.33002e-06 [meta_shard_fg_expand]: 1.47001e-06 [shard_inline]: 7.18998e-06 [merge_send_recv]: 6.46e-06 [auto_parallel]: 0.00086471 [parallel]: 5.84e-06 [flash_sp]: 4.18001e-06 [merge_comm]: 8.37e-06 [allreduce_fusion]: 4.32e-06 [matmul_add_comm_reduction]: 8.59e-06 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 1.466e-05 [virtual_dataset]: 7.95998e-06 [get_grad_eliminate_]: 7.88999e-06 [virtual_output]: 7.07002e-06 [merge_forward]: 3.81999e-06 [cell_reuse_recompute_pass]: 1.59e-06 [offload_activation]: 9.02999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.792e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 1.196e-05 [set_forward_comm_id_for_comm_node_pass]: 4.46002e-06 [meta_fg_expand]: 2.97002e-06 [flash_sp_send_recv_attached]: 1.02e-06 [receive_attached]: 1.57999e-06 [after_resolve]: 1.253e-05 [a_after_grad]: 1.265e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 3.46999e-06 [auto_monad_grad]: 1.89e-06 [auto_monad_eliminator]: 1.166e-05 [cse]: 2.444e-05 [a_3]: 4.692e-05 [py_interpret_to_execute_after_opt_a]: 1.042e-05 [slice_cell_reuse_recomputed_activation]: 2.09999e-06 [rewriter_after_opt_a]: 4.029e-05 [convert_after_rewriter]: 7.21001e-06 [order_py_execute_after_rewriter]: 5.77001e-06 [mutable_eliminate]: 0.00052327 [opt_b]: 0.0002456, [1] [Cycle 1]: 0.00023771, [7] [b_1]: 0.00015553 [b_2]: 9.24e-06 [updatestate_depend_eliminate]: 6.64999e-06 [updatestate_assign_eliminate]: 3.20998e-06 [updatestate_loads_eliminate]: 3.16999e-06 [renormalize]: 3.4002e-07 [cse]: 2.421e-05 [optimize_parallel_all_gather_comm]: 1.802e-05 [overlap_param_gather]: 1.72999e-06 [cconv]: 2.642e-05 [loop_unroll]: 0.00044152 [opt_after_cconv]: 0.00013497, [1] [Cycle 1]: 0.00012847, [7] [c_1]: 5.057e-05 [parameter_eliminate]: 3.28998e-06 [updatestate_depend_eliminate]: 6.43e-06 [updatestate_assign_eliminate]: 3.31001e-06 [updatestate_loads_eliminate]: 3.22002e-06 [cse]: 2.443e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 1.562e-05 [tuple_transform]: 9.07e-05, [1] [Cycle 1]: 8.576e-05, [4] [d_1]: 5.655e-05 [none_parameter_eliminate]: 1.54e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 8.20999e-06 [partial_unused_args_eliminate]: 2.01e-06 [add_recomputation]: 5.639e-05 [cse_after_recomputation]: 2.516e-05, [1] [Cycle 1]: 2.037e-05, [1] [cse]: 1.485e-05 [environ_conv]: 6.21e-06 [swap_dp_allreduce_reducescatter]: 6.63e-06 [bias_add_comm_swap]: 2.52001e-06 [label_micro_interleaved_index]: 4.43001e-06 [label_fine_grained_interleaved_index]: 2.54999e-06 [merge_cast_opt]: 1.27e-06 [slice_recompute_activation]: 2.00002e-06 [micro_interleaved_order_control]: 2.12001e-06 [assign_add_opt]: 1.26002e-06 [ForceFp32Comm]: 7.80012e-07 [remove_cast_before_assign_add]: 9.69972e-07 [full_micro_interleaved_order_control]: 2.21e-06 [reorder_send_recv_between_fp_bp]: 2.63e-06 [comm_op_add_attrs]: 1.25999e-06 [add_comm_op_reuse_tag]: 9.79984e-07 [interleave_split_concat_branches]: 1.08001e-06 [interleave_parallel_branches]: 1.32e-06 [overlap_opt_shard_in_pipeline]: 1.11002e-06 [overlap_opt_shard_grad_in_pipeline]: 2.01e-06 [control_data_broadcast_order]: 1.5e-05 [grouped_pairwise_exchange_alltoall]: 1.49998e-06 [offloading_packed_experts]: 4.94e-06 [overlap_recompute_and_grad_model_parallel]: 5.02999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.07999e-06 [overlap_grad_ring_attention]: 4.63001e-06 [overlap_grad_flash_sp]: 2.159e-05 [begin_end_overlap_inline]: 5.59987e-07 [split_matmul_comm_elemetwise]: 2.41e-06 [split_layernorm_comm]: 1.62001e-06 [handle_group_info]: 9.49978e-07 [symbol_engine_optimizer]: 8.304e-05, [1] [Cycle 1]: 7.87e-05, [6] [build]: 3.46001e-06 [elim_shapecalc]: 1.115e-05 [elim_not_effective]: 1.512e-05 [opt_reshape]: 8.41002e-06 [fold_const_symbol]: 1.186e-05 [renormalize]: 2.30008e-07 [detach_backward]: 2.04e-06 [pipeline_parallel_scheduler]: 1.45001e-06 [auto_monad_reorder]: 1.999e-05 [get_jit_bprop_graph]: 1.44e-06 [rewriter_after_jit_bprop_graph]: 4.13001e-06 [opt_after_jit_grad]: 0.00047158 [validate]: 4.193e-05 Sums bootstrap : 0.000422s : 3.26% type_inference : 0.006684s : 51.57% event_method : 0.000019s : 0.14% auto_monad : 0.000064s : 0.50% graph_reusing : 0.000006s : 0.04% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000033s : 0.26% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000025s : 0.19% optimize.rewriter_before_opt_a : 0.000088s : 0.68% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000058s : 0.45% optimize.opt_a.loop_unroll : 0.000057s : 0.44% optimize.opt_a.a_1 : 0.000906s : 6.99% optimize.opt_a.with_stream_mark : 0.000029s : 0.22% optimize.opt_a.recompute_prepare : 0.000019s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000193s : 1.49% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.12% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.12% optimize.opt_a.merge_send_recv : 0.000016s : 0.12% optimize.opt_a.auto_parallel : 0.000872s : 6.73% optimize.opt_a.parallel : 0.000025s : 0.19% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000013s : 0.10% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.18% optimize.opt_a.virtual_dataset : 0.000016s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.11% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.20% optimize.opt_a.a_after_grad : 0.000024s : 0.19% optimize.opt_a.renormalize : 0.000620s : 4.78% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.21% optimize.opt_a.cse : 0.000061s : 0.47% optimize.opt_a.a_3 : 0.000103s : 0.79% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.31% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.04% optimize.mutable_eliminate : 0.000523s : 4.04% optimize.opt_b.b_1 : 0.000156s : 1.20% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.14% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000026s : 0.20% optimize.loop_unroll : 0.000442s : 3.41% optimize.opt_after_cconv.c_1 : 0.000051s : 0.39% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000024s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.12% optimize.tuple_transform.d_1 : 0.000057s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000056s : 0.44% optimize.cse_after_recomputation.cse : 0.000015s : 0.11% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000022s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000020s : 0.15% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000472s : 3.64% validate : 0.000042s : 0.32% Time group info: ------[substitution.] 0.000201 38 10.98% : 0.000022s : 3: substitution.cast_eliminate 1.08% : 0.000002s : 3: substitution.elim_not_effective 0.78% : 0.000002s : 3: substitution.fold_const_symbol 3.44% : 0.000007s : 5: substitution.graph_param_transform 68.70% : 0.000138s : 4: substitution.inline 2.12% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.11% : 0.000006s : 6: substitution.remove_not_recompute_node 2.55% : 0.000005s : 4: substitution.replace_old_param 7.24% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006624 2 72.82% : 0.004824s : 1: type_inference.infer 27.18% : 0.001800s : 1: type_inference.specialize ------[replace.] 0.000063 8 60.80% : 0.000038s : 4: replace.inline 39.20% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000148 8 91.50% : 0.000136s : 4: match.inline 8.50% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000257 1596 0.98% : 0.000003s : 17: predicate.accumulaten_eliminater 0.73% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 10: predicate.addn_check_dump 0.91% : 0.000002s : 17: predicate.addn_zero_filter 0.85% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.03% : 0.000005s : 27: predicate.arithmetic_simplify 1.08% : 0.000003s : 17: predicate.cast_eliminate 0.63% : 0.000002s : 10: predicate.check_bprop_eliminate 0.64% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.65% : 0.000002s : 10: predicate.depend_value_elim 0.96% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.03% : 0.000003s : 17: predicate.dict_get_item_eliminator 1.01% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.82% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.33% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.17% : 0.000003s : 22: predicate.environ_get_depend_swap 1.71% : 0.000004s : 32: predicate.environ_get_eliminate 1.10% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.42% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.26% : 0.000006s : 25: predicate.float_depend_g_call 0.49% : 0.000001s : 10: predicate.float_environ_get_switch 0.77% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.70% : 0.000002s : 10: predicate.get_grad_eliminate 0.23% : 0.000001s : 5: predicate.graph_param_transform 0.60% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.66% : 0.000017s : 72: predicate.inline 0.97% : 0.000002s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.77% : 0.000002s : 10: predicate.less_batch_normalization 1.72% : 0.000004s : 31: predicate.list_to_tuple_eliminator_ 2.58% : 0.000007s : 48: predicate.load_eliminater 0.99% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.11% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.70% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.59% : 0.000002s : 10: predicate.merge_addn 0.55% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 17: predicate.minmaximum_grad 0.90% : 0.000002s : 5: predicate.mutable_eliminate 0.46% : 0.000001s : 5: predicate.opt_reshape 0.34% : 0.000001s : 5: predicate.parallel_virtual_node 1.61% : 0.000004s : 25: predicate.partial_defer_inline 1.72% : 0.000004s : 26: predicate.partial_eliminate 1.02% : 0.000003s : 17: predicate.print_const_string_wrapper 0.66% : 0.000002s : 10: predicate.reduce_all_const_elim 1.11% : 0.000003s : 17: predicate.reduce_eliminate 2.53% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 10: predicate.remove_not_recompute_node 1.31% : 0.000003s : 31: predicate.replace_applicator 0.57% : 0.000001s : 10: predicate.replace_old_param 0.24% : 0.000001s : 5: predicate.reset_defer_inline 1.04% : 0.000003s : 17: predicate.reshape_eliminate 0.66% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 5: predicate.row_tensor_eliminate 0.86% : 0.000002s : 10: predicate.same_eliminate 0.42% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.91% : 0.000002s : 10: predicate.shard_identity_eliminate 0.67% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 1.00% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.50% : 0.000004s : 25: predicate.switch_defer_inline 2.06% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.85% : 0.000012s : 76: predicate.switch_simplify 0.89% : 0.000002s : 17: predicate.tile_eliminate 1.04% : 0.000003s : 17: predicate.transpose_eliminate 1.62% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.55% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.27% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.79% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.47% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.20% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 5: predicate.value_based_eliminate 0.67% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.63% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000501 11 55.53% : 0.000278s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.47% : 0.000223s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028311 192 0.01% : 0.000003s : 1: ForceFp32Comm 10.73% : 0.003039s : 1: add_attr 10.70% : 0.003029s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.21% : 0.000060s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.25% : 0.000071s : 1: auto_monad 0.08% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.59% : 0.000451s : 1: bootstrap 0.11% : 0.000030s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.10% : 0.000028s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.09% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.59% : 0.000451s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.88% : 0.000532s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 5.13% : 0.001452s : 78: opt.transform.opt_a 0.17% : 0.000049s : 1: opt.transform.opt_after_cconv 0.10% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.47% : 0.000132s : 28: opt.transform.opt_b 0.22% : 0.000062s : 2: opt.transform.opt_trans_graph 0.15% : 0.000043s : 4: opt.transform.symbol_engine_opt 13.58% : 0.003844s : 1: opt_a 0.49% : 0.000139s : 1: opt_after_cconv 1.70% : 0.000481s : 1: opt_after_jit_grad 0.88% : 0.000249s : 1: opt_b 21.24% : 0.006014s : 1: optimize 0.08% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.13% : 0.000037s : 1: pre_auto_parallel 0.10% : 0.000029s : 1: py_interpret_to_execute 0.05% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.17% : 0.000332s : 1: renormalize.infer 0.99% : 0.000280s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000044s : 1: rewriter_after_opt_a 0.33% : 0.000093s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.30% : 0.000086s : 1: symbol_engine_optimizer 0.33% : 0.000094s : 1: tuple_transform 23.67% : 0.006700s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:31.455.950 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:31.456.217 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0171928, [21] [bootstrap]: 0.00045532 [type_inference]: 0.00592398 [event_method]: 2.018e-05 [auto_monad]: 6.578e-05 [graph_reusing]: 6.33e-06 [inline]: 2.22999e-06 [add_attr]: 0.00324037, [1] [add_attr_with_inline]: 0.0032306, [1] [Cycle 1]: 7.954e-05, [2] [tag_attr]: 2.04e-05 [meta_addattr_fg_expand]: 6.57002e-06 [parallel-infer-symbol]: 3.58e-06 [pre_auto_parallel]: 3.649e-05 [insert-virtual-dataset]: 2.99999e-06 [parallel-infer-symbol-second]: 9.39996e-07 [dataset_repeat_opt]: 2.34001e-06 [pipeline_split]: 1.74e-06 [optimize]: 0.00620874, [53] [py_interpret_to_execute]: 3.152e-05 [rewriter_before_opt_a]: 9.15e-05 [opt_a]: 0.00357696, [2] [Cycle 1]: 0.00254948, [45] [expand_dump_flag]: 2.74001e-06 [switch_simplify]: 4.416e-05 [loop_unroll]: 3.151e-05 [a_1]: 0.00074936 [with_stream_mark]: 1.867e-05 [recompute_prepare]: 1.343e-05 [updatestate_depend_eliminate]: 4.67e-06 [updatestate_assign_eliminate]: 4.1e-06 [updatestate_loads_eliminate]: 3.53999e-06 [parameter_eliminate]: 1.99999e-06 [a_2]: 0.00013771 [accelerated_algorithm]: 8.59002e-06 [shard]: 1.84e-06 [meta_shard_fg_expand]: 2.43002e-06 [shard_inline]: 7.97003e-06 [merge_send_recv]: 9.96e-06 [auto_parallel]: 8.42998e-06 [parallel]: 1.842e-05 [flash_sp]: 9.17999e-06 [merge_comm]: 5.00001e-06 [allreduce_fusion]: 4.2e-06 [matmul_add_comm_reduction]: 1.029e-05 [allreduce_slice_to_reducescatter]: 8.00006e-07 [virtual_shard_identity]: 1.164e-05 [virtual_dataset]: 8.74e-06 [get_grad_eliminate_]: 7.45e-06 [virtual_output]: 7.80998e-06 [merge_forward]: 5.40001e-06 [cell_reuse_recompute_pass]: 1.29998e-06 [offload_activation]: 1.204e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.157e-05 [merge_recompute_call_nodes]: 1.51998e-06 [before_grad]: 1.411e-05 [set_forward_comm_id_for_comm_node_pass]: 5.52999e-06 [meta_fg_expand]: 3.43e-06 [flash_sp_send_recv_attached]: 2.32999e-06 [receive_attached]: 2.19001e-06 [after_resolve]: 1.358e-05 [a_after_grad]: 1.252e-05 [renormalize]: 0.00076217 [add_forward_monad_depend]: 6.79999e-06 [auto_monad_grad]: 2.83998e-06 [auto_monad_eliminator]: 1.852e-05 [cse]: 3.49e-05 [a_3]: 7.375e-05 [Cycle 2]: 0.00101296, [45] [expand_dump_flag]: 1.07998e-06 [switch_simplify]: 9.10999e-06 [loop_unroll]: 7.36001e-06 [a_1]: 0.00017615 [with_stream_mark]: 1.618e-05 [recompute_prepare]: 9.05999e-06 [updatestate_depend_eliminate]: 4.13999e-06 [updatestate_assign_eliminate]: 3.09001e-06 [updatestate_loads_eliminate]: 3.3e-06 [parameter_eliminate]: 1.58002e-06 [a_2]: 0.00012048 [accelerated_algorithm]: 8.97e-06 [shard]: 1.23002e-06 [meta_shard_fg_expand]: 2.22999e-06 [shard_inline]: 8.20999e-06 [merge_send_recv]: 7.7e-06 [auto_parallel]: 7.26999e-06 [parallel]: 5.80002e-06 [flash_sp]: 4.08001e-06 [merge_comm]: 4.33001e-06 [allreduce_fusion]: 3.95e-06 [matmul_add_comm_reduction]: 8.67e-06 [allreduce_slice_to_reducescatter]: 4.50003e-07 [virtual_shard_identity]: 1.078e-05 [virtual_dataset]: 7.20003e-06 [get_grad_eliminate_]: 7.18e-06 [virtual_output]: 7.50998e-06 [merge_forward]: 4.47e-06 [cell_reuse_recompute_pass]: 1.87001e-06 [offload_activation]: 9.61e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.194e-05 [merge_recompute_call_nodes]: 1.51998e-06 [before_grad]: 1.483e-05 [set_forward_comm_id_for_comm_node_pass]: 5.72001e-06 [meta_fg_expand]: 3.78999e-06 [flash_sp_send_recv_attached]: 1.42e-06 [receive_attached]: 1.25001e-06 [after_resolve]: 1.505e-05 [a_after_grad]: 1.237e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.56e-06 [auto_monad_grad]: 1.84e-06 [auto_monad_eliminator]: 1.21e-05 [cse]: 2.369e-05 [a_3]: 6.019e-05 [py_interpret_to_execute_after_opt_a]: 1.563e-05 [slice_cell_reuse_recomputed_activation]: 5.62999e-06 [rewriter_after_opt_a]: 4.833e-05 [convert_after_rewriter]: 1.242e-05 [order_py_execute_after_rewriter]: 8.90999e-06 [mutable_eliminate]: 0.00058452 [opt_b]: 0.00032516, [1] [Cycle 1]: 0.00031412, [7] [b_1]: 0.00019909 [b_2]: 1.011e-05 [updatestate_depend_eliminate]: 8.05e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 3.76001e-06 [renormalize]: 7.59988e-07 [cse]: 2.835e-05 [optimize_parallel_all_gather_comm]: 2.433e-05 [overlap_param_gather]: 5.47999e-06 [cconv]: 3.456e-05 [loop_unroll]: 0.00048403 [opt_after_cconv]: 0.00016031, [1] [Cycle 1]: 0.00015022, [7] [c_1]: 4.03e-05 [parameter_eliminate]: 3.87998e-06 [updatestate_depend_eliminate]: 8.17998e-06 [updatestate_assign_eliminate]: 3.83999e-06 [updatestate_loads_eliminate]: 3.21999e-06 [cse]: 2.975e-05 [renormalize]: 6.60017e-07 [remove_dup_value]: 1.974e-05 [tuple_transform]: 0.00011154, [1] [Cycle 1]: 0.0001037, [4] [d_1]: 5.788e-05 [none_parameter_eliminate]: 1.44998e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 9.82001e-06 [partial_unused_args_eliminate]: 4.57e-06 [add_recomputation]: 6.587e-05 [cse_after_recomputation]: 3.554e-05, [1] [Cycle 1]: 2.76e-05, [1] [cse]: 1.753e-05 [environ_conv]: 1.082e-05 [swap_dp_allreduce_reducescatter]: 9.14e-06 [bias_add_comm_swap]: 5.39998e-06 [label_micro_interleaved_index]: 8.08999e-06 [label_fine_grained_interleaved_index]: 5.46e-06 [merge_cast_opt]: 3.76001e-06 [slice_recompute_activation]: 4.32998e-06 [micro_interleaved_order_control]: 5.09998e-06 [assign_add_opt]: 3.62998e-06 [ForceFp32Comm]: 3.26001e-06 [remove_cast_before_assign_add]: 3.7e-06 [full_micro_interleaved_order_control]: 4.82998e-06 [reorder_send_recv_between_fp_bp]: 5.43002e-06 [comm_op_add_attrs]: 3.96001e-06 [add_comm_op_reuse_tag]: 3.14001e-06 [interleave_split_concat_branches]: 3.49001e-06 [interleave_parallel_branches]: 3.66999e-06 [overlap_opt_shard_in_pipeline]: 4.08001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.33999e-06 [control_data_broadcast_order]: 3.605e-05 [grouped_pairwise_exchange_alltoall]: 5.02e-06 [offloading_packed_experts]: 8.84e-06 [overlap_recompute_and_grad_model_parallel]: 8.84e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.06001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.95e-06 [overlap_recompute_comm]: 5.10001e-06 [overlap_grad_ring_attention]: 7.6e-06 [overlap_grad_flash_sp]: 2.752e-05 [begin_end_overlap_inline]: 3.12002e-06 [split_matmul_comm_elemetwise]: 4.95999e-06 [split_layernorm_comm]: 3.88001e-06 [handle_group_info]: 3.42002e-06 [symbol_engine_optimizer]: 0.00011505, [1] [Cycle 1]: 0.00010696, [6] [build]: 3.95e-06 [elim_shapecalc]: 1.574e-05 [elim_not_effective]: 1.748e-05 [opt_reshape]: 8.39002e-06 [fold_const_symbol]: 1.23e-05 [renormalize]: 4.19997e-07 [detach_backward]: 3.56001e-06 [pipeline_parallel_scheduler]: 1.95001e-06 [auto_monad_reorder]: 2.443e-05 [get_jit_bprop_graph]: 1.99999e-06 [rewriter_after_jit_bprop_graph]: 5.51e-06 [opt_after_jit_grad]: 0.00053056 [validate]: 4.307e-05 Sums bootstrap : 0.000455s : 3.77% type_inference : 0.005924s : 49.06% event_method : 0.000020s : 0.17% auto_monad : 0.000066s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000036s : 0.30% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000032s : 0.26% optimize.rewriter_before_opt_a : 0.000091s : 0.76% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000053s : 0.44% optimize.opt_a.loop_unroll : 0.000039s : 0.32% optimize.opt_a.a_1 : 0.000926s : 7.67% optimize.opt_a.with_stream_mark : 0.000035s : 0.29% optimize.opt_a.recompute_prepare : 0.000022s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000258s : 2.14% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.13% optimize.opt_a.merge_send_recv : 0.000018s : 0.15% optimize.opt_a.auto_parallel : 0.000016s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.20% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.19% optimize.opt_a.virtual_dataset : 0.000016s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000022s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000044s : 0.36% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000029s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.24% optimize.opt_a.a_after_grad : 0.000025s : 0.21% optimize.opt_a.renormalize : 0.000762s : 6.31% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.08% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.25% optimize.opt_a.cse : 0.000059s : 0.49% optimize.opt_a.a_3 : 0.000134s : 1.11% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.05% optimize.rewriter_after_opt_a : 0.000048s : 0.40% optimize.convert_after_rewriter : 0.000012s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000585s : 4.84% optimize.opt_b.b_1 : 0.000199s : 1.65% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000028s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000035s : 0.29% optimize.loop_unroll : 0.000484s : 4.01% optimize.opt_after_cconv.c_1 : 0.000040s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000030s : 0.25% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000020s : 0.16% optimize.tuple_transform.d_1 : 0.000058s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000066s : 0.55% optimize.cse_after_recomputation.cse : 0.000018s : 0.15% optimize.environ_conv : 0.000011s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000036s : 0.30% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.04% optimize.offloading_packed_experts : 0.000009s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.06% optimize.overlap_grad_flash_sp : 0.000028s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.13% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000024s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000531s : 4.39% validate : 0.000043s : 0.36% Time group info: ------[substitution.] 0.000214 38 10.17% : 0.000022s : 3: substitution.cast_eliminate 1.04% : 0.000002s : 3: substitution.elim_not_effective 0.86% : 0.000002s : 3: substitution.fold_const_symbol 3.42% : 0.000007s : 5: substitution.graph_param_transform 68.86% : 0.000147s : 4: substitution.inline 2.46% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.53% : 0.000008s : 6: substitution.remove_not_recompute_node 2.55% : 0.000005s : 4: substitution.replace_old_param 7.12% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005868 2 87.65% : 0.005144s : 1: type_inference.infer 12.35% : 0.000725s : 1: type_inference.specialize ------[replace.] 0.000066 8 57.67% : 0.000038s : 4: replace.inline 42.33% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000158 8 91.66% : 0.000144s : 4: match.inline 8.34% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000259 1596 0.97% : 0.000003s : 17: predicate.accumulaten_eliminater 0.73% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 10: predicate.addn_check_dump 1.03% : 0.000003s : 17: predicate.addn_zero_filter 0.83% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.97% : 0.000005s : 27: predicate.arithmetic_simplify 1.04% : 0.000003s : 17: predicate.cast_eliminate 0.58% : 0.000001s : 10: predicate.check_bprop_eliminate 0.61% : 0.000002s : 10: predicate.compare_switch_simplify 0.19% : 0.000001s : 5: predicate.const_output_eliminate 0.71% : 0.000002s : 10: predicate.depend_value_elim 0.96% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.01% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.30% : 0.000001s : 5: predicate.elim_not_effective 0.34% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.17% : 0.000003s : 22: predicate.environ_get_depend_swap 1.77% : 0.000005s : 32: predicate.environ_get_eliminate 1.16% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.36% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.39% : 0.000006s : 25: predicate.float_depend_g_call 0.75% : 0.000002s : 10: predicate.float_environ_get_switch 0.89% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.61% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.67% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.53% : 0.000017s : 72: predicate.inline 0.82% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.74% : 0.000002s : 10: predicate.less_batch_normalization 1.85% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.60% : 0.000007s : 48: predicate.load_eliminater 0.98% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.96% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.59% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 10: predicate.merge_addn 0.59% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 17: predicate.minmaximum_grad 1.10% : 0.000003s : 5: predicate.mutable_eliminate 0.41% : 0.000001s : 5: predicate.opt_reshape 0.45% : 0.000001s : 5: predicate.parallel_virtual_node 1.82% : 0.000005s : 25: predicate.partial_defer_inline 1.71% : 0.000004s : 26: predicate.partial_eliminate 0.92% : 0.000002s : 17: predicate.print_const_string_wrapper 0.62% : 0.000002s : 10: predicate.reduce_all_const_elim 1.12% : 0.000003s : 17: predicate.reduce_eliminate 2.53% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 10: predicate.remove_not_recompute_node 1.37% : 0.000004s : 31: predicate.replace_applicator 0.51% : 0.000001s : 10: predicate.replace_old_param 0.31% : 0.000001s : 5: predicate.reset_defer_inline 0.97% : 0.000003s : 17: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 5: predicate.row_tensor_eliminate 0.79% : 0.000002s : 10: predicate.same_eliminate 0.56% : 0.000001s : 10: predicate.set_cell_output_no_recompute 1.00% : 0.000003s : 10: predicate.shard_identity_eliminate 0.68% : 0.000002s : 10: predicate.special_op_eliminate 0.73% : 0.000002s : 10: predicate.specialize_transform 0.83% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.49% : 0.000004s : 25: predicate.switch_defer_inline 2.03% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.79% : 0.000012s : 76: predicate.switch_simplify 0.88% : 0.000002s : 17: predicate.tile_eliminate 0.93% : 0.000002s : 17: predicate.transpose_eliminate 1.58% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.22% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.53% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.78% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.48% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.19% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 5: predicate.value_based_eliminate 0.59% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.55% : 0.000001s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000557 11 55.35% : 0.000308s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.65% : 0.000249s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.029098 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.17% : 0.003250s : 1: add_attr 11.12% : 0.003235s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.24% : 0.000071s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.26% : 0.000077s : 1: auto_monad 0.11% : 0.000033s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.73% : 0.000504s : 1: bootstrap 0.13% : 0.000038s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.14% : 0.000041s : 1: control_data_broadcast_order 0.05% : 0.000016s : 1: convert_after_rewriter 0.13% : 0.000039s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000022s : 1: detach_backward 0.05% : 0.000014s : 1: environ_conv 0.10% : 0.000030s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.03% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000010s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.69% : 0.000491s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.03% : 0.000592s : 1: mutable_eliminate 0.04% : 0.000012s : 1: offloading_packed_experts 0.07% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000021s : 1: opt.transform.mutable_eliminate 5.07% : 0.001475s : 78: opt.transform.opt_a 0.13% : 0.000039s : 1: opt.transform.opt_after_cconv 0.11% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.46% : 0.000135s : 28: opt.transform.opt_b 0.22% : 0.000065s : 2: opt.transform.opt_trans_graph 0.17% : 0.000049s : 4: opt.transform.symbol_engine_opt 12.30% : 0.003580s : 1: opt_a 0.56% : 0.000164s : 1: opt_after_cconv 1.86% : 0.000542s : 1: opt_after_jit_grad 1.13% : 0.000329s : 1: opt_b 22.54% : 0.006559s : 1: optimize 0.10% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000031s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000011s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000011s : 1: parallel-infer-symbol 0.03% : 0.000008s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000008s : 1: pipeline_split 0.16% : 0.000045s : 1: pre_auto_parallel 0.12% : 0.000036s : 1: py_interpret_to_execute 0.07% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000007s : 1: remove_cast_before_assign_add 0.08% : 0.000023s : 1: remove_dup_value 1.51% : 0.000439s : 1: renormalize.infer 1.08% : 0.000314s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000053s : 1: rewriter_after_opt_a 0.33% : 0.000096s : 1: rewriter_before_opt_a 0.03% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.41% : 0.000118s : 1: symbol_engine_optimizer 0.39% : 0.000114s : 1: tuple_transform 20.49% : 0.005962s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:31.714.481 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0149305, [21] [bootstrap]: 0.00043003 [type_inference]: 0.00547171 [event_method]: 1.848e-05 [auto_monad]: 6.295e-05 [graph_reusing]: 5.79e-06 [inline]: 1.79e-06 [add_attr]: 0.00305661, [1] [add_attr_with_inline]: 0.00304752, [1] [Cycle 1]: 5.851e-05, [2] [tag_attr]: 2.04e-05 [meta_addattr_fg_expand]: 6.01998e-06 [parallel-infer-symbol]: 3.4e-06 [pre_auto_parallel]: 3.266e-05 [insert-virtual-dataset]: 2.59001e-06 [parallel-infer-symbol-second]: 7.80012e-07 [dataset_repeat_opt]: 2.46e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.00518048, [53] [py_interpret_to_execute]: 2.597e-05 [rewriter_before_opt_a]: 8.541e-05 [opt_a]: 0.00306024, [2] [Cycle 1]: 0.00229556, [45] [expand_dump_flag]: 3.19001e-06 [switch_simplify]: 4.458e-05 [loop_unroll]: 3.257e-05 [a_1]: 0.0007591 [with_stream_mark]: 1.852e-05 [recompute_prepare]: 1.304e-05 [updatestate_depend_eliminate]: 5.40999e-06 [updatestate_assign_eliminate]: 4.33999e-06 [updatestate_loads_eliminate]: 4.25e-06 [parameter_eliminate]: 2.60002e-06 [a_2]: 0.00010305 [accelerated_algorithm]: 9.06998e-06 [shard]: 2.69999e-06 [meta_shard_fg_expand]: 2.04e-06 [shard_inline]: 8.03001e-06 [merge_send_recv]: 9.74e-06 [auto_parallel]: 8.1e-06 [parallel]: 1.981e-05 [flash_sp]: 9.92999e-06 [merge_comm]: 5.28002e-06 [allreduce_fusion]: 4.59998e-06 [matmul_add_comm_reduction]: 1.125e-05 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 1.099e-05 [virtual_dataset]: 8.88002e-06 [get_grad_eliminate_]: 8.20999e-06 [virtual_output]: 8.52998e-06 [merge_forward]: 5.76e-06 [cell_reuse_recompute_pass]: 1.31998e-06 [offload_activation]: 1.118e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.895e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.404e-05 [set_forward_comm_id_for_comm_node_pass]: 5.57001e-06 [meta_fg_expand]: 3.7e-06 [flash_sp_send_recv_attached]: 2.79001e-06 [receive_attached]: 2.01998e-06 [after_resolve]: 1.37e-05 [a_after_grad]: 1.273e-05 [renormalize]: 0.00068968 [add_forward_monad_depend]: 5.80002e-06 [auto_monad_grad]: 2.52001e-06 [auto_monad_eliminator]: 1.563e-05 [cse]: 3.527e-05 [a_3]: 5.855e-05 [Cycle 2]: 0.00075516, [45] [expand_dump_flag]: 1.27e-06 [switch_simplify]: 9.37001e-06 [loop_unroll]: 7.53e-06 [a_1]: 0.000176 [with_stream_mark]: 1.321e-05 [recompute_prepare]: 7.63999e-06 [updatestate_depend_eliminate]: 3.77002e-06 [updatestate_assign_eliminate]: 3.01001e-06 [updatestate_loads_eliminate]: 3.06001e-06 [parameter_eliminate]: 1.27e-06 [a_2]: 9.248e-05 [accelerated_algorithm]: 7.58001e-06 [shard]: 1.39998e-06 [meta_shard_fg_expand]: 1.67999e-06 [shard_inline]: 7.27002e-06 [merge_send_recv]: 8.77999e-06 [auto_parallel]: 6.67002e-06 [parallel]: 5.25001e-06 [flash_sp]: 3.39001e-06 [merge_comm]: 4.48999e-06 [allreduce_fusion]: 3.83001e-06 [matmul_add_comm_reduction]: 6.71e-06 [allreduce_slice_to_reducescatter]: 5.8001e-07 [virtual_shard_identity]: 8.12e-06 [virtual_dataset]: 7.51001e-06 [get_grad_eliminate_]: 6.86001e-06 [virtual_output]: 6.86001e-06 [merge_forward]: 3.43e-06 [cell_reuse_recompute_pass]: 1.19e-06 [offload_activation]: 7.16999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.41e-05 [merge_recompute_call_nodes]: 9.70002e-07 [before_grad]: 1.131e-05 [set_forward_comm_id_for_comm_node_pass]: 6.24999e-06 [meta_fg_expand]: 2.84999e-06 [flash_sp_send_recv_attached]: 9.29984e-07 [receive_attached]: 9.5999e-07 [after_resolve]: 1.226e-05 [a_after_grad]: 1.112e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.22e-06 [auto_monad_grad]: 8.2e-07 [auto_monad_eliminator]: 8.37e-06 [cse]: 1.726e-05 [a_3]: 4.468e-05 [py_interpret_to_execute_after_opt_a]: 1.044e-05 [slice_cell_reuse_recomputed_activation]: 2.02001e-06 [rewriter_after_opt_a]: 3.85e-05 [convert_after_rewriter]: 7.71999e-06 [order_py_execute_after_rewriter]: 6.08002e-06 [mutable_eliminate]: 0.00047503 [opt_b]: 0.00031962, [1] [Cycle 1]: 0.00030099, [7] [b_1]: 0.00021534 [b_2]: 1.05e-05 [updatestate_depend_eliminate]: 6.09001e-06 [updatestate_assign_eliminate]: 3.11999e-06 [updatestate_loads_eliminate]: 2.94999e-06 [renormalize]: 4.00003e-07 [cse]: 2.236e-05 [optimize_parallel_all_gather_comm]: 1.757e-05 [overlap_param_gather]: 1.84998e-06 [cconv]: 2.322e-05 [loop_unroll]: 0.00042107 [opt_after_cconv]: 0.0001126, [1] [Cycle 1]: 0.00010724, [7] [c_1]: 3.659e-05 [parameter_eliminate]: 2.53998e-06 [updatestate_depend_eliminate]: 5.77001e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 2.98e-06 [cse]: 2.104e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.499e-05 [tuple_transform]: 8.363e-05, [1] [Cycle 1]: 7.949e-05, [4] [d_1]: 5.146e-05 [none_parameter_eliminate]: 1.58002e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 8e-06 [partial_unused_args_eliminate]: 2.04e-06 [add_recomputation]: 5.362e-05 [cse_after_recomputation]: 2.405e-05, [1] [Cycle 1]: 1.962e-05, [1] [cse]: 1.448e-05 [environ_conv]: 6.41998e-06 [swap_dp_allreduce_reducescatter]: 5.71e-06 [bias_add_comm_swap]: 2.42001e-06 [label_micro_interleaved_index]: 4.25999e-06 [label_fine_grained_interleaved_index]: 3.08e-06 [merge_cast_opt]: 1.25001e-06 [slice_recompute_activation]: 2.12999e-06 [micro_interleaved_order_control]: 2.36e-06 [assign_add_opt]: 1.18001e-06 [ForceFp32Comm]: 1.12e-06 [remove_cast_before_assign_add]: 1.10001e-06 [full_micro_interleaved_order_control]: 2.02001e-06 [reorder_send_recv_between_fp_bp]: 2.59001e-06 [comm_op_add_attrs]: 1.09998e-06 [add_comm_op_reuse_tag]: 1.08001e-06 [interleave_split_concat_branches]: 1.08001e-06 [interleave_parallel_branches]: 1.00001e-06 [overlap_opt_shard_in_pipeline]: 1.19e-06 [overlap_opt_shard_grad_in_pipeline]: 1.68002e-06 [control_data_broadcast_order]: 1.365e-05 [grouped_pairwise_exchange_alltoall]: 1.50999e-06 [offloading_packed_experts]: 4.13999e-06 [overlap_recompute_and_grad_model_parallel]: 5.44998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.35001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.29998e-06 [overlap_recompute_comm]: 2.61e-06 [overlap_grad_ring_attention]: 4.63001e-06 [overlap_grad_flash_sp]: 1.995e-05 [begin_end_overlap_inline]: 5.60016e-07 [split_matmul_comm_elemetwise]: 2.31e-06 [split_layernorm_comm]: 1.85001e-06 [handle_group_info]: 1.32e-06 [symbol_engine_optimizer]: 7.978e-05, [1] [Cycle 1]: 7.582e-05, [6] [build]: 2.67001e-06 [elim_shapecalc]: 1.086e-05 [elim_not_effective]: 1.487e-05 [opt_reshape]: 8.11002e-06 [fold_const_symbol]: 1.211e-05 [renormalize]: 2.3999e-07 [detach_backward]: 1.81e-06 [pipeline_parallel_scheduler]: 1.49e-06 [auto_monad_reorder]: 1.905e-05 [get_jit_bprop_graph]: 1.39e-06 [rewriter_after_jit_bprop_graph]: 3.38999e-06 [opt_after_jit_grad]: 0.00045132 [validate]: 4.113e-05 Sums bootstrap : 0.000430s : 3.95% type_inference : 0.005472s : 50.21% event_method : 0.000018s : 0.17% auto_monad : 0.000063s : 0.58% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.30% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000026s : 0.24% optimize.rewriter_before_opt_a : 0.000085s : 0.78% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000054s : 0.50% optimize.opt_a.loop_unroll : 0.000040s : 0.37% optimize.opt_a.a_1 : 0.000935s : 8.58% optimize.opt_a.with_stream_mark : 0.000032s : 0.29% optimize.opt_a.recompute_prepare : 0.000021s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000196s : 1.79% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.15% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.14% optimize.opt_a.merge_send_recv : 0.000019s : 0.17% optimize.opt_a.auto_parallel : 0.000015s : 0.14% optimize.opt_a.parallel : 0.000025s : 0.23% optimize.opt_a.flash_sp : 0.000013s : 0.12% optimize.opt_a.merge_comm : 0.000010s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.18% optimize.opt_a.virtual_dataset : 0.000016s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.14% optimize.opt_a.virtual_output : 0.000015s : 0.14% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.11% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.24% optimize.opt_a.a_after_grad : 0.000024s : 0.22% optimize.opt_a.renormalize : 0.000690s : 6.33% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.22% optimize.opt_a.cse : 0.000053s : 0.48% optimize.opt_a.a_3 : 0.000103s : 0.95% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000039s : 0.35% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000475s : 4.36% optimize.opt_b.b_1 : 0.000215s : 1.98% optimize.opt_b.b_2 : 0.000010s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000023s : 0.21% optimize.loop_unroll : 0.000421s : 3.86% optimize.opt_after_cconv.c_1 : 0.000037s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000021s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.14% optimize.tuple_transform.d_1 : 0.000051s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000054s : 0.49% optimize.cse_after_recomputation.cse : 0.000014s : 0.13% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000020s : 0.18% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000019s : 0.17% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.03% opt_after_jit_grad : 0.000451s : 4.14% validate : 0.000041s : 0.38% Time group info: ------[substitution.] 0.000209 38 10.37% : 0.000022s : 3: substitution.cast_eliminate 1.06% : 0.000002s : 3: substitution.elim_not_effective 0.76% : 0.000002s : 3: substitution.fold_const_symbol 3.01% : 0.000006s : 5: substitution.graph_param_transform 69.69% : 0.000146s : 4: substitution.inline 2.12% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.02% : 0.000006s : 6: substitution.remove_not_recompute_node 2.09% : 0.000004s : 4: substitution.replace_old_param 7.89% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005417 2 87.26% : 0.004727s : 1: type_inference.infer 12.74% : 0.000690s : 1: type_inference.specialize ------[replace.] 0.000065 8 57.86% : 0.000038s : 4: replace.inline 42.14% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000157 8 91.02% : 0.000143s : 4: match.inline 8.98% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000307 1596 0.89% : 0.000003s : 17: predicate.accumulaten_eliminater 0.59% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.44% : 0.000001s : 10: predicate.addn_check_dump 0.78% : 0.000002s : 17: predicate.addn_zero_filter 0.73% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.62% : 0.000005s : 27: predicate.arithmetic_simplify 0.95% : 0.000003s : 17: predicate.cast_eliminate 0.46% : 0.000001s : 10: predicate.check_bprop_eliminate 0.50% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000001s : 5: predicate.const_output_eliminate 0.50% : 0.000002s : 10: predicate.depend_value_elim 0.81% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 0.88% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.77% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.75% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.18% : 0.000001s : 5: predicate.elim_not_effective 0.30% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 0.97% : 0.000003s : 22: predicate.environ_add_const_eliminate 0.95% : 0.000003s : 22: predicate.environ_get_add_eliminate 0.97% : 0.000003s : 22: predicate.environ_get_depend_swap 1.43% : 0.000004s : 32: predicate.environ_get_eliminate 0.96% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.17% : 0.000004s : 25: predicate.exchange_switch_depend_value 1.92% : 0.000006s : 25: predicate.float_depend_g_call 0.43% : 0.000001s : 10: predicate.float_environ_get_switch 0.75% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.53% : 0.000002s : 10: predicate.get_grad_eliminate 0.16% : 0.000000s : 5: predicate.graph_param_transform 0.51% : 0.000002s : 10: predicate.incorporate_call 0.44% : 0.000001s : 10: predicate.incorporate_call_switch 5.15% : 0.000016s : 72: predicate.inline 0.68% : 0.000002s : 10: predicate.inline_without_move 0.27% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.67% : 0.000002s : 10: predicate.less_batch_normalization 1.54% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.23% : 0.000007s : 48: predicate.load_eliminater 0.65% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.68% : 0.000005s : 36: predicate.loop_unroll_before_grad 18.42% : 0.000056s : 27: predicate.make_slice_get_slice_eliminator 0.51% : 0.000002s : 10: predicate.merge_addn 0.50% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.47% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.74% : 0.000002s : 17: predicate.minmaximum_grad 0.68% : 0.000002s : 5: predicate.mutable_eliminate 0.30% : 0.000001s : 5: predicate.opt_reshape 0.30% : 0.000001s : 5: predicate.parallel_virtual_node 1.44% : 0.000004s : 25: predicate.partial_defer_inline 1.42% : 0.000004s : 26: predicate.partial_eliminate 0.81% : 0.000002s : 17: predicate.print_const_string_wrapper 0.51% : 0.000002s : 10: predicate.reduce_all_const_elim 1.03% : 0.000003s : 17: predicate.reduce_eliminate 2.14% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000002s : 10: predicate.remove_not_recompute_node 1.13% : 0.000003s : 31: predicate.replace_applicator 0.41% : 0.000001s : 10: predicate.replace_old_param 0.22% : 0.000001s : 5: predicate.reset_defer_inline 0.87% : 0.000003s : 17: predicate.reshape_eliminate 0.50% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.31% : 0.000001s : 5: predicate.row_tensor_eliminate 0.66% : 0.000002s : 10: predicate.same_eliminate 0.35% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.68% : 0.000002s : 10: predicate.shard_identity_eliminate 0.56% : 0.000002s : 10: predicate.special_op_eliminate 0.59% : 0.000002s : 10: predicate.specialize_transform 0.65% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.63% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.29% : 0.000004s : 25: predicate.switch_defer_inline 1.69% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.00% : 0.000012s : 76: predicate.switch_simplify 0.80% : 0.000002s : 17: predicate.tile_eliminate 0.81% : 0.000002s : 17: predicate.transpose_eliminate 1.28% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.39% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.29% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 2.87% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.30% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 1.87% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.45% : 0.000004s : 31: predicate.tuple_to_list_eliminator_ 2.13% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 2.66% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.24% : 0.000001s : 5: predicate.value_based_eliminate 0.51% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.51% : 0.000002s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.54% : 0.000002s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000500 11 53.09% : 0.000266s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.91% : 0.000235s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025577 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.97% : 0.003062s : 1: add_attr 11.93% : 0.003051s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000057s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.27% : 0.000069s : 1: auto_monad 0.09% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.78% : 0.000456s : 1: bootstrap 0.10% : 0.000027s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000017s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.10% : 0.000027s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.09% : 0.000024s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.68% : 0.000429s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.89% : 0.000483s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 5.71% : 0.001459s : 78: opt.transform.opt_a 0.14% : 0.000035s : 1: opt.transform.opt_after_cconv 0.11% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.75% : 0.000191s : 28: opt.transform.opt_b 0.22% : 0.000057s : 2: opt.transform.opt_trans_graph 0.17% : 0.000043s : 4: opt.transform.symbol_engine_opt 11.98% : 0.003063s : 1: opt_a 0.45% : 0.000116s : 1: opt_after_cconv 1.80% : 0.000460s : 1: opt_after_jit_grad 1.26% : 0.000323s : 1: opt_b 20.27% : 0.005185s : 1: optimize 0.08% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000037s : 1: pre_auto_parallel 0.12% : 0.000030s : 1: py_interpret_to_execute 0.05% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000018s : 1: remove_dup_value 1.49% : 0.000382s : 1: renormalize.infer 1.17% : 0.000299s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000042s : 1: rewriter_after_opt_a 0.35% : 0.000090s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000082s : 1: symbol_engine_optimizer 0.34% : 0.000087s : 1: tuple_transform 21.45% : 0.005485s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:32.103.408 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:32.103.692 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0164394, [21] [bootstrap]: 0.00039581 [type_inference]: 0.00550963 [event_method]: 2.01e-05 [auto_monad]: 6.711e-05 [graph_reusing]: 5.59e-06 [inline]: 1.84e-06 [add_attr]: 0.00316618, [1] [add_attr_with_inline]: 0.00315728, [1] [Cycle 1]: 8.111e-05, [2] [tag_attr]: 2.088e-05 [meta_addattr_fg_expand]: 6.28998e-06 [parallel-infer-symbol]: 3.14999e-06 [pre_auto_parallel]: 3.433e-05 [insert-virtual-dataset]: 2.63e-06 [parallel-infer-symbol-second]: 8.09989e-07 [dataset_repeat_opt]: 1.71e-06 [pipeline_split]: 1.63002e-06 [optimize]: 0.00602571, [53] [py_interpret_to_execute]: 3.468e-05 [rewriter_before_opt_a]: 9.451e-05 [opt_a]: 0.00353841, [2] [Cycle 1]: 0.00249023, [45] [expand_dump_flag]: 2.76999e-06 [switch_simplify]: 4.474e-05 [loop_unroll]: 3.273e-05 [a_1]: 0.0007218 [with_stream_mark]: 1.692e-05 [recompute_prepare]: 1.271e-05 [updatestate_depend_eliminate]: 5.51e-06 [updatestate_assign_eliminate]: 5.32001e-06 [updatestate_loads_eliminate]: 4.31002e-06 [parameter_eliminate]: 1.91e-06 [a_2]: 0.00014826 [accelerated_algorithm]: 1.003e-05 [shard]: 2.07001e-06 [meta_shard_fg_expand]: 2.27999e-06 [shard_inline]: 9.12001e-06 [merge_send_recv]: 1.046e-05 [auto_parallel]: 7.56999e-06 [parallel]: 1.737e-05 [flash_sp]: 8.81002e-06 [merge_comm]: 5.40999e-06 [allreduce_fusion]: 5.22e-06 [matmul_add_comm_reduction]: 1.147e-05 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 1.051e-05 [virtual_dataset]: 9.30001e-06 [get_grad_eliminate_]: 8.80999e-06 [virtual_output]: 8.89003e-06 [merge_forward]: 5.19998e-06 [cell_reuse_recompute_pass]: 1.22999e-06 [offload_activation]: 1.152e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.995e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.759e-05 [set_forward_comm_id_for_comm_node_pass]: 5.79e-06 [meta_fg_expand]: 4.18001e-06 [flash_sp_send_recv_attached]: 2.53e-06 [receive_attached]: 2.32001e-06 [after_resolve]: 1.346e-05 [a_after_grad]: 1.369e-05 [renormalize]: 0.00072269 [add_forward_monad_depend]: 5.51e-06 [auto_monad_grad]: 2.96001e-06 [auto_monad_eliminator]: 1.884e-05 [cse]: 4.227e-05 [a_3]: 8.183e-05 [Cycle 2]: 0.0010351, [45] [expand_dump_flag]: 8.39995e-07 [switch_simplify]: 9.96e-06 [loop_unroll]: 8.38001e-06 [a_1]: 0.00021096 [with_stream_mark]: 1.275e-05 [recompute_prepare]: 8.80001e-06 [updatestate_depend_eliminate]: 4.55999e-06 [updatestate_assign_eliminate]: 3.74002e-06 [updatestate_loads_eliminate]: 3.87998e-06 [parameter_eliminate]: 1.38002e-06 [a_2]: 0.00013855 [accelerated_algorithm]: 8.95001e-06 [shard]: 1.19e-06 [meta_shard_fg_expand]: 2.11e-06 [shard_inline]: 8.95999e-06 [merge_send_recv]: 7.83001e-06 [auto_parallel]: 7.63001e-06 [parallel]: 4.84998e-06 [flash_sp]: 3.68999e-06 [merge_comm]: 5.69999e-06 [allreduce_fusion]: 4.55999e-06 [matmul_add_comm_reduction]: 8.97e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 9.99001e-06 [virtual_dataset]: 1.021e-05 [get_grad_eliminate_]: 9.17999e-06 [virtual_output]: 8.32e-06 [merge_forward]: 4.65999e-06 [cell_reuse_recompute_pass]: 1.84e-06 [offload_activation]: 9.23002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.931e-05 [merge_recompute_call_nodes]: 1.07e-06 [before_grad]: 1.428e-05 [set_forward_comm_id_for_comm_node_pass]: 5.11002e-06 [meta_fg_expand]: 3.28e-06 [flash_sp_send_recv_attached]: 7.40023e-07 [receive_attached]: 9.39996e-07 [after_resolve]: 1.298e-05 [a_after_grad]: 1.334e-05 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 1.44e-06 [auto_monad_grad]: 1.30999e-06 [auto_monad_eliminator]: 1.013e-05 [cse]: 2.462e-05 [a_3]: 6.662e-05 [py_interpret_to_execute_after_opt_a]: 1.324e-05 [slice_cell_reuse_recomputed_activation]: 4.65999e-06 [rewriter_after_opt_a]: 4.854e-05 [convert_after_rewriter]: 1.126e-05 [order_py_execute_after_rewriter]: 9.46998e-06 [mutable_eliminate]: 0.00049172 [opt_b]: 0.00035194, [1] [Cycle 1]: 0.00034232, [7] [b_1]: 0.00023345 [b_2]: 1.034e-05 [updatestate_depend_eliminate]: 6.58998e-06 [updatestate_assign_eliminate]: 3.85e-06 [updatestate_loads_eliminate]: 3.86999e-06 [renormalize]: 3.09985e-07 [cse]: 2.761e-05 [optimize_parallel_all_gather_comm]: 2.269e-05 [overlap_param_gather]: 5.05999e-06 [cconv]: 2.98e-05 [loop_unroll]: 0.00043512 [opt_after_cconv]: 0.00015284, [1] [Cycle 1]: 0.00014444, [7] [c_1]: 4.369e-05 [parameter_eliminate]: 3.03e-06 [updatestate_depend_eliminate]: 6.61e-06 [updatestate_assign_eliminate]: 3.78001e-06 [updatestate_loads_eliminate]: 3.76999e-06 [cse]: 2.76e-05 [renormalize]: 3.70026e-07 [remove_dup_value]: 4.013e-05 [tuple_transform]: 0.00011018, [1] [Cycle 1]: 0.00010287, [4] [d_1]: 6.047e-05 [none_parameter_eliminate]: 1.87999e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 9.54999e-06 [partial_unused_args_eliminate]: 5.02e-06 [add_recomputation]: 6.437e-05 [cse_after_recomputation]: 3.332e-05, [1] [Cycle 1]: 2.605e-05, [1] [cse]: 1.714e-05 [environ_conv]: 9.12999e-06 [swap_dp_allreduce_reducescatter]: 9.19e-06 [bias_add_comm_swap]: 5.51002e-06 [label_micro_interleaved_index]: 7.23e-06 [label_fine_grained_interleaved_index]: 6.79999e-06 [merge_cast_opt]: 4.17003e-06 [slice_recompute_activation]: 4.87998e-06 [micro_interleaved_order_control]: 5.25001e-06 [assign_add_opt]: 3.98999e-06 [ForceFp32Comm]: 3.41999e-06 [remove_cast_before_assign_add]: 3.39001e-06 [full_micro_interleaved_order_control]: 4.48001e-06 [reorder_send_recv_between_fp_bp]: 5.27001e-06 [comm_op_add_attrs]: 3.37002e-06 [add_comm_op_reuse_tag]: 3.3e-06 [interleave_split_concat_branches]: 3.81999e-06 [interleave_parallel_branches]: 3.45e-06 [overlap_opt_shard_in_pipeline]: 3.53e-06 [overlap_opt_shard_grad_in_pipeline]: 4.49002e-06 [control_data_broadcast_order]: 1.937e-05 [grouped_pairwise_exchange_alltoall]: 3.83001e-06 [offloading_packed_experts]: 7.54002e-06 [overlap_recompute_and_grad_model_parallel]: 7.99002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.58e-06 [overlap_recompute_allgather_and_fa_grad]: 3.66001e-06 [overlap_recompute_comm]: 4.85001e-06 [overlap_grad_ring_attention]: 7.2e-06 [overlap_grad_flash_sp]: 2.565e-05 [begin_end_overlap_inline]: 2.81e-06 [split_matmul_comm_elemetwise]: 4.75001e-06 [split_layernorm_comm]: 4.1e-06 [handle_group_info]: 3.71999e-06 [symbol_engine_optimizer]: 0.00012277, [1] [Cycle 1]: 0.0001158, [6] [build]: 2.86999e-06 [elim_shapecalc]: 1.314e-05 [elim_not_effective]: 1.738e-05 [opt_reshape]: 1.892e-05 [fold_const_symbol]: 1.476e-05 [renormalize]: 2.49973e-07 [detach_backward]: 3.12002e-06 [pipeline_parallel_scheduler]: 1.77001e-06 [auto_monad_reorder]: 2.373e-05 [get_jit_bprop_graph]: 1.65001e-06 [rewriter_after_jit_bprop_graph]: 4.50001e-06 [opt_after_jit_grad]: 0.00052149 [validate]: 4.309e-05 Sums bootstrap : 0.000396s : 3.44% type_inference : 0.005510s : 47.90% event_method : 0.000020s : 0.17% auto_monad : 0.000067s : 0.58% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.30% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000035s : 0.30% optimize.rewriter_before_opt_a : 0.000095s : 0.82% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000055s : 0.48% optimize.opt_a.loop_unroll : 0.000041s : 0.36% optimize.opt_a.a_1 : 0.000933s : 8.11% optimize.opt_a.with_stream_mark : 0.000030s : 0.26% optimize.opt_a.recompute_prepare : 0.000022s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000287s : 2.49% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.17% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000018s : 0.16% optimize.opt_a.merge_send_recv : 0.000018s : 0.16% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000022s : 0.19% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000011s : 0.10% optimize.opt_a.allreduce_fusion : 0.000010s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.18% optimize.opt_a.virtual_dataset : 0.000020s : 0.17% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.16% optimize.opt_a.virtual_output : 0.000017s : 0.15% optimize.opt_a.merge_forward : 0.000010s : 0.09% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.34% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000032s : 0.28% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.23% optimize.opt_a.a_after_grad : 0.000027s : 0.24% optimize.opt_a.renormalize : 0.000723s : 6.28% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.25% optimize.opt_a.cse : 0.000067s : 0.58% optimize.opt_a.a_3 : 0.000148s : 1.29% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000049s : 0.42% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000492s : 4.28% optimize.opt_b.b_1 : 0.000233s : 2.03% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000028s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000030s : 0.26% optimize.loop_unroll : 0.000435s : 3.78% optimize.opt_after_cconv.c_1 : 0.000044s : 0.38% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000028s : 0.24% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000040s : 0.35% optimize.tuple_transform.d_1 : 0.000060s : 0.53% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000064s : 0.56% optimize.cse_after_recomputation.cse : 0.000017s : 0.15% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000007s : 0.06% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000019s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000026s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000019s : 0.16% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000024s : 0.21% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000521s : 4.53% validate : 0.000043s : 0.37% Time group info: ------[substitution.] 0.000222 48 14.58% : 0.000032s : 6: substitution.cast_eliminate 1.14% : 0.000003s : 4: substitution.elim_not_effective 0.90% : 0.000002s : 4: substitution.fold_const_symbol 3.20% : 0.000007s : 6: substitution.graph_param_transform 66.81% : 0.000148s : 4: substitution.inline 2.81% : 0.000006s : 8: substitution.j_node_and_user_rematch 3.25% : 0.000007s : 8: substitution.remove_not_recompute_node 1.94% : 0.000004s : 4: substitution.replace_old_param 5.37% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005456 2 86.70% : 0.004730s : 1: type_inference.infer 13.30% : 0.000726s : 1: type_inference.specialize ------[replace.] 0.000064 8 62.70% : 0.000040s : 4: replace.inline 37.30% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000156 8 93.52% : 0.000146s : 4: match.inline 6.48% : 0.000010s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000275 1730 1.06% : 0.000003s : 17: predicate.accumulaten_eliminater 0.74% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.62% : 0.000002s : 12: predicate.addn_check_dump 0.92% : 0.000003s : 17: predicate.addn_zero_filter 0.80% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.04% : 0.000006s : 29: predicate.arithmetic_simplify 1.14% : 0.000003s : 17: predicate.cast_eliminate 0.90% : 0.000002s : 12: predicate.check_bprop_eliminate 0.61% : 0.000002s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.65% : 0.000002s : 12: predicate.depend_value_elim 0.95% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.95% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.94% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 6: predicate.elim_not_effective 0.44% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 23: predicate.environ_get_depend_swap 1.89% : 0.000005s : 35: predicate.environ_get_eliminate 1.12% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.32% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.16% : 0.000006s : 25: predicate.float_depend_g_call 0.61% : 0.000002s : 12: predicate.float_environ_get_switch 0.92% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 6: predicate.fold_const_symbol 0.69% : 0.000002s : 12: predicate.get_grad_eliminate 0.25% : 0.000001s : 6: predicate.graph_param_transform 0.79% : 0.000002s : 12: predicate.incorporate_call 0.61% : 0.000002s : 12: predicate.incorporate_call_switch 6.16% : 0.000017s : 78: predicate.inline 0.96% : 0.000003s : 12: predicate.inline_without_move 0.37% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 12: predicate.less_batch_normalization 1.80% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.52% : 0.000007s : 50: predicate.load_eliminater 0.94% : 0.000003s : 6: predicate.loop_unroll_after_grad 1.97% : 0.000005s : 38: predicate.loop_unroll_before_grad 1.64% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.64% : 0.000002s : 12: predicate.merge_addn 0.61% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.62% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 17: predicate.minmaximum_grad 0.84% : 0.000002s : 6: predicate.mutable_eliminate 0.40% : 0.000001s : 6: predicate.opt_reshape 0.37% : 0.000001s : 6: predicate.parallel_virtual_node 1.52% : 0.000004s : 25: predicate.partial_defer_inline 1.62% : 0.000004s : 27: predicate.partial_eliminate 0.88% : 0.000002s : 17: predicate.print_const_string_wrapper 0.64% : 0.000002s : 12: predicate.reduce_all_const_elim 1.14% : 0.000003s : 17: predicate.reduce_eliminate 2.51% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 12: predicate.remove_not_recompute_node 1.43% : 0.000004s : 33: predicate.replace_applicator 0.57% : 0.000002s : 12: predicate.replace_old_param 0.34% : 0.000001s : 6: predicate.reset_defer_inline 0.99% : 0.000003s : 17: predicate.reshape_eliminate 0.70% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 6: predicate.row_tensor_eliminate 0.80% : 0.000002s : 12: predicate.same_eliminate 0.51% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.89% : 0.000002s : 12: predicate.shard_identity_eliminate 0.78% : 0.000002s : 12: predicate.special_op_eliminate 0.83% : 0.000002s : 12: predicate.specialize_transform 0.90% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.43% : 0.000004s : 25: predicate.switch_defer_inline 2.03% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.76% : 0.000013s : 81: predicate.switch_simplify 0.85% : 0.000002s : 17: predicate.tile_eliminate 0.85% : 0.000002s : 17: predicate.transpose_eliminate 1.70% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.03% : 0.000008s : 45: predicate.tuple_list_get_item_eliminator 1.53% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.27% : 0.000006s : 41: predicate.tuple_list_set_item_eliminator 1.75% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.45% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.18% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 6: predicate.value_based_eliminate 0.68% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.69% : 0.000002s : 12: predicate.virtual_output_eliminate 0.30% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000502 11 50.16% : 0.000252s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.84% : 0.000250s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028175 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.27% : 0.003176s : 1: add_attr 11.22% : 0.003161s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.24% : 0.000068s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.27% : 0.000076s : 1: auto_monad 0.11% : 0.000032s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.56% : 0.000440s : 1: bootstrap 0.12% : 0.000033s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000022s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000037s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000019s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000031s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000007s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.04% : 0.000010s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.57% : 0.000441s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.77% : 0.000498s : 1: mutable_eliminate 0.04% : 0.000011s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 5.49% : 0.001548s : 78: opt.transform.opt_a 0.15% : 0.000042s : 1: opt.transform.opt_after_cconv 0.12% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.60% : 0.000168s : 28: opt.transform.opt_b 0.24% : 0.000068s : 2: opt.transform.opt_trans_graph 0.21% : 0.000060s : 4: opt.transform.symbol_engine_opt 12.57% : 0.003542s : 1: opt_a 0.55% : 0.000156s : 1: opt_after_cconv 1.89% : 0.000532s : 1: opt_after_jit_grad 1.26% : 0.000355s : 1: opt_b 22.66% : 0.006385s : 1: optimize 0.09% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000009s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000042s : 1: pre_auto_parallel 0.14% : 0.000039s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.16% : 0.000044s : 1: remove_dup_value 1.40% : 0.000395s : 1: renormalize.infer 1.14% : 0.000320s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000053s : 1: rewriter_after_opt_a 0.35% : 0.000099s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.45% : 0.000126s : 1: symbol_engine_optimizer 0.40% : 0.000113s : 1: tuple_transform 19.69% : 0.005548s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:32.521.880 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0149942, [21] [bootstrap]: 0.00042029 [type_inference]: 0.00562897 [event_method]: 1.914e-05 [auto_monad]: 6.56e-05 [graph_reusing]: 5.14e-06 [inline]: 1.79998e-06 [add_attr]: 0.00298601, [1] [add_attr_with_inline]: 0.00297768, [1] [Cycle 1]: 5.162e-05, [2] [tag_attr]: 1.856e-05 [meta_addattr_fg_expand]: 5.92001e-06 [parallel-infer-symbol]: 3.55998e-06 [pre_auto_parallel]: 3.328e-05 [insert-virtual-dataset]: 2.39001e-06 [parallel-infer-symbol-second]: 8.89995e-07 [dataset_repeat_opt]: 1.89e-06 [pipeline_split]: 1.81003e-06 [optimize]: 0.0051621, [53] [py_interpret_to_execute]: 2.719e-05 [rewriter_before_opt_a]: 8.624e-05 [opt_a]: 0.00299966, [2] [Cycle 1]: 0.00214554, [45] [expand_dump_flag]: 2.91999e-06 [switch_simplify]: 4.504e-05 [loop_unroll]: 3.182e-05 [a_1]: 0.00068941 [with_stream_mark]: 1.551e-05 [recompute_prepare]: 1.082e-05 [updatestate_depend_eliminate]: 5.34e-06 [updatestate_assign_eliminate]: 4.17e-06 [updatestate_loads_eliminate]: 4.1e-06 [parameter_eliminate]: 2.17999e-06 [a_2]: 0.00011805 [accelerated_algorithm]: 9.25001e-06 [shard]: 1.74998e-06 [meta_shard_fg_expand]: 2.26e-06 [shard_inline]: 9.09e-06 [merge_send_recv]: 1.009e-05 [auto_parallel]: 7.56999e-06 [parallel]: 1.778e-05 [flash_sp]: 7.71999e-06 [merge_comm]: 5.10001e-06 [allreduce_fusion]: 4.87998e-06 [matmul_add_comm_reduction]: 1.063e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 1.022e-05 [virtual_dataset]: 9.00999e-06 [get_grad_eliminate_]: 8.77999e-06 [virtual_output]: 8.60999e-06 [merge_forward]: 4.95999e-06 [cell_reuse_recompute_pass]: 1.15999e-06 [offload_activation]: 1.092e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.699e-05 [merge_recompute_call_nodes]: 1.73997e-06 [before_grad]: 1.517e-05 [set_forward_comm_id_for_comm_node_pass]: 4.95999e-06 [meta_fg_expand]: 3.81001e-06 [flash_sp_send_recv_attached]: 2.29001e-06 [receive_attached]: 2.24001e-06 [after_resolve]: 1.332e-05 [a_after_grad]: 1.364e-05 [renormalize]: 0.00063326 [add_forward_monad_depend]: 5.19e-06 [auto_monad_grad]: 2.49001e-06 [auto_monad_eliminator]: 1.695e-05 [cse]: 4.263e-05 [a_3]: 6.465e-05 [Cycle 2]: 0.00084501, [45] [expand_dump_flag]: 8.60018e-07 [switch_simplify]: 1.002e-05 [loop_unroll]: 8.60001e-06 [a_1]: 0.00020786 [with_stream_mark]: 1.284e-05 [recompute_prepare]: 9.54e-06 [updatestate_depend_eliminate]: 4.32e-06 [updatestate_assign_eliminate]: 3.60998e-06 [updatestate_loads_eliminate]: 3.5e-06 [parameter_eliminate]: 1.12999e-06 [a_2]: 0.00010916 [accelerated_algorithm]: 8.74e-06 [shard]: 1.09e-06 [meta_shard_fg_expand]: 1.72001e-06 [shard_inline]: 1.091e-05 [merge_send_recv]: 6.69001e-06 [auto_parallel]: 6.71e-06 [parallel]: 4.65001e-06 [flash_sp]: 3.11999e-06 [merge_comm]: 4.63001e-06 [allreduce_fusion]: 4.12e-06 [matmul_add_comm_reduction]: 7.15e-06 [allreduce_slice_to_reducescatter]: 3.39991e-07 [virtual_shard_identity]: 9.56e-06 [virtual_dataset]: 8.64e-06 [get_grad_eliminate_]: 8e-06 [virtual_output]: 8.60001e-06 [merge_forward]: 3.9e-06 [cell_reuse_recompute_pass]: 1.62001e-06 [offload_activation]: 8.91997e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.679e-05 [merge_recompute_call_nodes]: 7.99977e-07 [before_grad]: 1.415e-05 [set_forward_comm_id_for_comm_node_pass]: 4.74e-06 [meta_fg_expand]: 3.53e-06 [flash_sp_send_recv_attached]: 8.29983e-07 [receive_attached]: 9.09989e-07 [after_resolve]: 1.23e-05 [a_after_grad]: 1.314e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.17999e-06 [auto_monad_grad]: 9.20001e-07 [auto_monad_eliminator]: 9.59999e-06 [cse]: 2.232e-05 [a_3]: 5.442e-05 [py_interpret_to_execute_after_opt_a]: 1.026e-05 [slice_cell_reuse_recomputed_activation]: 2.17999e-06 [rewriter_after_opt_a]: 4.378e-05 [convert_after_rewriter]: 8.60001e-06 [order_py_execute_after_rewriter]: 6.57002e-06 [mutable_eliminate]: 0.00047502 [opt_b]: 0.00027439, [1] [Cycle 1]: 0.00026812, [7] [b_1]: 0.00018061 [b_2]: 1.098e-05 [updatestate_depend_eliminate]: 6.56999e-06 [updatestate_assign_eliminate]: 3.73001e-06 [updatestate_loads_eliminate]: 3.76999e-06 [renormalize]: 6.99976e-07 [cse]: 2.577e-05 [optimize_parallel_all_gather_comm]: 1.865e-05 [overlap_param_gather]: 1.85001e-06 [cconv]: 2.423e-05 [loop_unroll]: 0.00041744 [opt_after_cconv]: 0.00012716, [1] [Cycle 1]: 0.00012148, [7] [c_1]: 4.281e-05 [parameter_eliminate]: 2.83e-06 [updatestate_depend_eliminate]: 6.93e-06 [updatestate_assign_eliminate]: 3.61001e-06 [updatestate_loads_eliminate]: 3.58e-06 [cse]: 2.634e-05 [renormalize]: 3.60014e-07 [remove_dup_value]: 3.627e-05 [tuple_transform]: 9.601e-05, [1] [Cycle 1]: 9.148e-05, [4] [d_1]: 6.051e-05 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 9.91e-06 [partial_unused_args_eliminate]: 1.71998e-06 [add_recomputation]: 5.849e-05 [cse_after_recomputation]: 2.683e-05, [1] [Cycle 1]: 2.23e-05, [1] [cse]: 1.673e-05 [environ_conv]: 6.29001e-06 [swap_dp_allreduce_reducescatter]: 6.88998e-06 [bias_add_comm_swap]: 2.36e-06 [label_micro_interleaved_index]: 4.78001e-06 [label_fine_grained_interleaved_index]: 2.71e-06 [merge_cast_opt]: 1.42e-06 [slice_recompute_activation]: 2.39999e-06 [micro_interleaved_order_control]: 2.00002e-06 [assign_add_opt]: 1.55001e-06 [ForceFp32Comm]: 7.39994e-07 [remove_cast_before_assign_add]: 1.02e-06 [full_micro_interleaved_order_control]: 2.15002e-06 [reorder_send_recv_between_fp_bp]: 2.59999e-06 [comm_op_add_attrs]: 1.03001e-06 [add_comm_op_reuse_tag]: 9.5999e-07 [interleave_split_concat_branches]: 1.09998e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.12e-06 [overlap_opt_shard_grad_in_pipeline]: 2.04999e-06 [control_data_broadcast_order]: 1.553e-05 [grouped_pairwise_exchange_alltoall]: 1.52999e-06 [offloading_packed_experts]: 4.75001e-06 [overlap_recompute_and_grad_model_parallel]: 5.65001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.16e-06 [overlap_grad_ring_attention]: 4.68999e-06 [overlap_grad_flash_sp]: 2.383e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 2.27999e-06 [split_layernorm_comm]: 1.83997e-06 [handle_group_info]: 9.89996e-07 [symbol_engine_optimizer]: 9.461e-05, [1] [Cycle 1]: 8.981e-05, [6] [build]: 3.97e-06 [elim_shapecalc]: 1.273e-05 [elim_not_effective]: 1.879e-05 [opt_reshape]: 1.025e-05 [fold_const_symbol]: 1.422e-05 [renormalize]: 2.00002e-07 [detach_backward]: 1.97999e-06 [pipeline_parallel_scheduler]: 1.49e-06 [auto_monad_reorder]: 2.023e-05 [get_jit_bprop_graph]: 1.14e-06 [rewriter_after_jit_bprop_graph]: 3.77998e-06 [opt_after_jit_grad]: 0.00045611 [validate]: 3.961e-05 Sums bootstrap : 0.000420s : 3.79% type_inference : 0.005629s : 50.82% event_method : 0.000019s : 0.17% auto_monad : 0.000066s : 0.59% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000033s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000027s : 0.25% optimize.rewriter_before_opt_a : 0.000086s : 0.78% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000055s : 0.50% optimize.opt_a.loop_unroll : 0.000040s : 0.36% optimize.opt_a.a_1 : 0.000897s : 8.10% optimize.opt_a.with_stream_mark : 0.000028s : 0.26% optimize.opt_a.recompute_prepare : 0.000020s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000227s : 2.05% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.16% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000020s : 0.18% optimize.opt_a.merge_send_recv : 0.000017s : 0.15% optimize.opt_a.auto_parallel : 0.000014s : 0.13% optimize.opt_a.parallel : 0.000022s : 0.20% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.18% optimize.opt_a.virtual_dataset : 0.000018s : 0.16% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.15% optimize.opt_a.virtual_output : 0.000017s : 0.16% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000029s : 0.26% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.07% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.23% optimize.opt_a.a_after_grad : 0.000027s : 0.24% optimize.opt_a.renormalize : 0.000633s : 5.72% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.24% optimize.opt_a.cse : 0.000065s : 0.59% optimize.opt_a.a_3 : 0.000119s : 1.08% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000044s : 0.40% optimize.convert_after_rewriter : 0.000009s : 0.08% optimize.order_py_execute_after_rewriter : 0.000007s : 0.06% optimize.mutable_eliminate : 0.000475s : 4.29% optimize.opt_b.b_1 : 0.000181s : 1.63% optimize.opt_b.b_2 : 0.000011s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000026s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.22% optimize.loop_unroll : 0.000417s : 3.77% optimize.opt_after_cconv.c_1 : 0.000043s : 0.39% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000026s : 0.24% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000036s : 0.33% optimize.tuple_transform.d_1 : 0.000061s : 0.55% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.09% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000058s : 0.53% optimize.cse_after_recomputation.cse : 0.000017s : 0.15% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.06% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000024s : 0.22% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.17% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.09% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000020s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000456s : 4.12% validate : 0.000040s : 0.36% Time group info: ------[substitution.] 0.000204 48 15.45% : 0.000032s : 6: substitution.cast_eliminate 1.23% : 0.000003s : 4: substitution.elim_not_effective 0.91% : 0.000002s : 4: substitution.fold_const_symbol 3.60% : 0.000007s : 6: substitution.graph_param_transform 64.55% : 0.000132s : 4: substitution.inline 2.48% : 0.000005s : 8: substitution.j_node_and_user_rematch 3.70% : 0.000008s : 8: substitution.remove_not_recompute_node 1.94% : 0.000004s : 4: substitution.replace_old_param 6.14% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005576 2 87.82% : 0.004897s : 1: type_inference.infer 12.18% : 0.000679s : 1: type_inference.specialize ------[replace.] 0.000058 8 60.62% : 0.000035s : 4: replace.inline 39.38% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000140 8 92.32% : 0.000129s : 4: match.inline 7.68% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000269 1730 0.91% : 0.000002s : 17: predicate.accumulaten_eliminater 0.72% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.59% : 0.000002s : 12: predicate.addn_check_dump 0.90% : 0.000002s : 17: predicate.addn_zero_filter 0.88% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.02% : 0.000005s : 29: predicate.arithmetic_simplify 1.06% : 0.000003s : 17: predicate.cast_eliminate 0.68% : 0.000002s : 12: predicate.check_bprop_eliminate 0.60% : 0.000002s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.68% : 0.000002s : 12: predicate.depend_value_elim 0.96% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.00% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 6: predicate.elim_not_effective 0.44% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 23: predicate.environ_get_depend_swap 1.82% : 0.000005s : 35: predicate.environ_get_eliminate 1.13% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.32% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.19% : 0.000006s : 25: predicate.float_depend_g_call 0.58% : 0.000002s : 12: predicate.float_environ_get_switch 0.87% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.22% : 0.000001s : 6: predicate.fold_const_symbol 0.71% : 0.000002s : 12: predicate.get_grad_eliminate 0.26% : 0.000001s : 6: predicate.graph_param_transform 0.68% : 0.000002s : 12: predicate.incorporate_call 0.62% : 0.000002s : 12: predicate.incorporate_call_switch 6.26% : 0.000017s : 78: predicate.inline 0.91% : 0.000002s : 12: predicate.inline_without_move 0.36% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 12: predicate.less_batch_normalization 1.81% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.57% : 0.000007s : 50: predicate.load_eliminater 0.90% : 0.000002s : 6: predicate.loop_unroll_after_grad 1.97% : 0.000005s : 38: predicate.loop_unroll_before_grad 1.68% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.65% : 0.000002s : 12: predicate.merge_addn 0.62% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.60% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 17: predicate.minmaximum_grad 0.93% : 0.000003s : 6: predicate.mutable_eliminate 0.43% : 0.000001s : 6: predicate.opt_reshape 0.43% : 0.000001s : 6: predicate.parallel_virtual_node 1.58% : 0.000004s : 25: predicate.partial_defer_inline 1.68% : 0.000005s : 27: predicate.partial_eliminate 0.89% : 0.000002s : 17: predicate.print_const_string_wrapper 0.62% : 0.000002s : 12: predicate.reduce_all_const_elim 1.18% : 0.000003s : 17: predicate.reduce_eliminate 2.54% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 12: predicate.remove_not_recompute_node 1.32% : 0.000004s : 33: predicate.replace_applicator 0.55% : 0.000001s : 12: predicate.replace_old_param 0.27% : 0.000001s : 6: predicate.reset_defer_inline 0.99% : 0.000003s : 17: predicate.reshape_eliminate 0.64% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 6: predicate.row_tensor_eliminate 0.75% : 0.000002s : 12: predicate.same_eliminate 0.49% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 12: predicate.shard_identity_eliminate 0.85% : 0.000002s : 12: predicate.special_op_eliminate 0.83% : 0.000002s : 12: predicate.specialize_transform 0.84% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.40% : 0.000004s : 25: predicate.switch_defer_inline 2.10% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.78% : 0.000013s : 81: predicate.switch_simplify 0.91% : 0.000002s : 17: predicate.tile_eliminate 0.88% : 0.000002s : 17: predicate.transpose_eliminate 1.56% : 0.000004s : 29: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000004s : 29: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.08% : 0.000008s : 45: predicate.tuple_list_get_item_eliminator 1.53% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.48% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.83% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.51% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.22% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 6: predicate.value_based_eliminate 0.69% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.69% : 0.000002s : 12: predicate.virtual_output_eliminate 0.30% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000493 11 55.37% : 0.000273s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.63% : 0.000220s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025537 192 0.01% : 0.000003s : 1: ForceFp32Comm 11.71% : 0.002990s : 1: add_attr 11.67% : 0.002981s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000062s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.28% : 0.000071s : 1: auto_monad 0.09% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.75% : 0.000446s : 1: bootstrap 0.11% : 0.000028s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000012s : 1: convert_after_rewriter 0.12% : 0.000030s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.66% : 0.000425s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.89% : 0.000483s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 5.86% : 0.001496s : 78: opt.transform.opt_a 0.16% : 0.000041s : 1: opt.transform.opt_after_cconv 0.12% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.63% : 0.000161s : 28: opt.transform.opt_b 0.27% : 0.000068s : 2: opt.transform.opt_trans_graph 0.20% : 0.000052s : 4: opt.transform.symbol_engine_opt 11.76% : 0.003003s : 1: opt_a 0.51% : 0.000131s : 1: opt_after_cconv 1.82% : 0.000465s : 1: opt_after_jit_grad 1.09% : 0.000278s : 1: opt_b 20.23% : 0.005166s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000010s : 1: order_py_execute_after_rewriter 0.11% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000037s : 1: pre_auto_parallel 0.12% : 0.000031s : 1: py_interpret_to_execute 0.05% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.16% : 0.000040s : 1: remove_dup_value 1.35% : 0.000344s : 1: renormalize.infer 1.11% : 0.000282s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000048s : 1: rewriter_after_opt_a 0.35% : 0.000090s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000097s : 1: symbol_engine_optimizer 0.39% : 0.000099s : 1: tuple_transform 22.10% : 0.005643s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:32.819.280 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:32.819.545 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0164255, [21] [bootstrap]: 0.0004394 [type_inference]: 0.00578732 [event_method]: 1.957e-05 [auto_monad]: 6.476e-05 [graph_reusing]: 6.11998e-06 [inline]: 2.37999e-06 [add_attr]: 0.00312697, [1] [add_attr_with_inline]: 0.00311815, [1] [Cycle 1]: 8.252e-05, [2] [tag_attr]: 1.876e-05 [meta_addattr_fg_expand]: 6.06998e-06 [parallel-infer-symbol]: 3.26001e-06 [pre_auto_parallel]: 3.297e-05 [insert-virtual-dataset]: 2.41998e-06 [parallel-infer-symbol-second]: 6.59988e-07 [dataset_repeat_opt]: 1.92001e-06 [pipeline_split]: 1.72999e-06 [optimize]: 0.00572788, [53] [py_interpret_to_execute]: 2.894e-05 [rewriter_before_opt_a]: 8.817e-05 [opt_a]: 0.00324955, [2] [Cycle 1]: 0.00228534, [45] [expand_dump_flag]: 3.01999e-06 [switch_simplify]: 4.243e-05 [loop_unroll]: 3.035e-05 [a_1]: 0.00066253 [with_stream_mark]: 1.639e-05 [recompute_prepare]: 1.051e-05 [updatestate_depend_eliminate]: 4.82e-06 [updatestate_assign_eliminate]: 3.88999e-06 [updatestate_loads_eliminate]: 3.61999e-06 [parameter_eliminate]: 2.53e-06 [a_2]: 0.000131 [accelerated_algorithm]: 8.43999e-06 [shard]: 1.67999e-06 [meta_shard_fg_expand]: 2.26e-06 [shard_inline]: 8.13999e-06 [merge_send_recv]: 9.72001e-06 [auto_parallel]: 6.79001e-06 [parallel]: 1.749e-05 [flash_sp]: 9.00999e-06 [merge_comm]: 4.62998e-06 [allreduce_fusion]: 4.16001e-06 [matmul_add_comm_reduction]: 1.005e-05 [allreduce_slice_to_reducescatter]: 8.59989e-07 [virtual_shard_identity]: 9.89001e-06 [virtual_dataset]: 8.07e-06 [get_grad_eliminate_]: 7.42002e-06 [virtual_output]: 7.62998e-06 [merge_forward]: 4.50999e-06 [cell_reuse_recompute_pass]: 1.22e-06 [offload_activation]: 1.051e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.725e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.374e-05 [set_forward_comm_id_for_comm_node_pass]: 4.53001e-06 [meta_fg_expand]: 3.51999e-06 [flash_sp_send_recv_attached]: 2.54001e-06 [receive_attached]: 2.16e-06 [after_resolve]: 1.292e-05 [a_after_grad]: 1.192e-05 [renormalize]: 0.00064924 [add_forward_monad_depend]: 5.84e-06 [auto_monad_grad]: 2.43002e-06 [auto_monad_eliminator]: 1.75e-05 [cse]: 3.296e-05 [a_3]: 7.37e-05 [Cycle 2]: 0.00095053, [45] [expand_dump_flag]: 1.04e-06 [switch_simplify]: 9.41e-06 [loop_unroll]: 7.39002e-06 [a_1]: 0.0001746 [with_stream_mark]: 1.292e-05 [recompute_prepare]: 7.77e-06 [updatestate_depend_eliminate]: 4e-06 [updatestate_assign_eliminate]: 3.46001e-06 [updatestate_loads_eliminate]: 3.24001e-06 [parameter_eliminate]: 1.40999e-06 [a_2]: 0.00011923 [accelerated_algorithm]: 7.57998e-06 [shard]: 1.47001e-06 [meta_shard_fg_expand]: 1.79998e-06 [shard_inline]: 7.57998e-06 [merge_send_recv]: 6.77002e-06 [auto_parallel]: 7.43999e-06 [parallel]: 6.01e-06 [flash_sp]: 3.60003e-06 [merge_comm]: 7.11001e-06 [allreduce_fusion]: 4.18001e-06 [matmul_add_comm_reduction]: 7.09001e-06 [allreduce_slice_to_reducescatter]: 5.39992e-07 [virtual_shard_identity]: 9.04e-06 [virtual_dataset]: 7.87e-06 [get_grad_eliminate_]: 7.1e-06 [virtual_output]: 6.81001e-06 [merge_forward]: 3.68999e-06 [cell_reuse_recompute_pass]: 1.40999e-06 [offload_activation]: 8.69e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.634e-05 [merge_recompute_call_nodes]: 1.08001e-06 [before_grad]: 1.172e-05 [set_forward_comm_id_for_comm_node_pass]: 6.63003e-06 [meta_fg_expand]: 3.01001e-06 [flash_sp_send_recv_attached]: 8.50006e-07 [receive_attached]: 1.14e-06 [after_resolve]: 1.299e-05 [a_after_grad]: 1.177e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.07999e-06 [auto_monad_grad]: 1.04e-06 [auto_monad_eliminator]: 1.007e-05 [cse]: 1.968e-05 [a_3]: 6.198e-05 [py_interpret_to_execute_after_opt_a]: 1.48e-05 [slice_cell_reuse_recomputed_activation]: 4.89003e-06 [rewriter_after_opt_a]: 4.635e-05 [convert_after_rewriter]: 1.099e-05 [order_py_execute_after_rewriter]: 9.30001e-06 [mutable_eliminate]: 0.00055267 [opt_b]: 0.00031173, [1] [Cycle 1]: 0.00030224, [7] [b_1]: 0.00019782 [b_2]: 9.30001e-06 [updatestate_depend_eliminate]: 7.36001e-06 [updatestate_assign_eliminate]: 3.45e-06 [updatestate_loads_eliminate]: 3.47002e-06 [renormalize]: 4.80009e-07 [cse]: 2.446e-05 [optimize_parallel_all_gather_comm]: 2.268e-05 [overlap_param_gather]: 4.75999e-06 [cconv]: 3.069e-05 [loop_unroll]: 0.00044892 [opt_after_cconv]: 0.00014352, [1] [Cycle 1]: 0.00013447, [7] [c_1]: 3.682e-05 [parameter_eliminate]: 3.78999e-06 [updatestate_depend_eliminate]: 7.05e-06 [updatestate_assign_eliminate]: 3.28998e-06 [updatestate_loads_eliminate]: 2.86e-06 [cse]: 2.402e-05 [renormalize]: 5.39992e-07 [remove_dup_value]: 1.81e-05 [tuple_transform]: 0.0001023, [1] [Cycle 1]: 9.494e-05, [4] [d_1]: 5.394e-05 [none_parameter_eliminate]: 1.61998e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 8.53001e-06 [partial_unused_args_eliminate]: 4.33001e-06 [add_recomputation]: 6.239e-05 [cse_after_recomputation]: 3.157e-05, [1] [Cycle 1]: 2.425e-05, [1] [cse]: 1.527e-05 [environ_conv]: 9.52001e-06 [swap_dp_allreduce_reducescatter]: 8.69e-06 [bias_add_comm_swap]: 5.15001e-06 [label_micro_interleaved_index]: 7.18e-06 [label_fine_grained_interleaved_index]: 5.20001e-06 [merge_cast_opt]: 3.62998e-06 [slice_recompute_activation]: 4.83001e-06 [micro_interleaved_order_control]: 4.62e-06 [assign_add_opt]: 3.61999e-06 [ForceFp32Comm]: 3.33998e-06 [remove_cast_before_assign_add]: 3.38e-06 [full_micro_interleaved_order_control]: 2.479e-05 [reorder_send_recv_between_fp_bp]: 8.22e-06 [comm_op_add_attrs]: 4.29997e-06 [add_comm_op_reuse_tag]: 3.71999e-06 [interleave_split_concat_branches]: 3.58e-06 [interleave_parallel_branches]: 3.36001e-06 [overlap_opt_shard_in_pipeline]: 3.43e-06 [overlap_opt_shard_grad_in_pipeline]: 4.22e-06 [control_data_broadcast_order]: 1.972e-05 [grouped_pairwise_exchange_alltoall]: 3.85e-06 [offloading_packed_experts]: 7.3e-06 [overlap_recompute_and_grad_model_parallel]: 8.40999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.6e-06 [overlap_recompute_allgather_and_fa_grad]: 3.7e-06 [overlap_recompute_comm]: 5.22e-06 [overlap_grad_ring_attention]: 7.11001e-06 [overlap_grad_flash_sp]: 2.611e-05 [begin_end_overlap_inline]: 2.81999e-06 [split_matmul_comm_elemetwise]: 4.52e-06 [split_layernorm_comm]: 4.04002e-06 [handle_group_info]: 3.41001e-06 [symbol_engine_optimizer]: 0.00010829, [1] [Cycle 1]: 0.00010123, [6] [build]: 2.91e-06 [elim_shapecalc]: 1.393e-05 [elim_not_effective]: 1.625e-05 [opt_reshape]: 8.74e-06 [fold_const_symbol]: 1.208e-05 [renormalize]: 2.00002e-07 [detach_backward]: 3.80998e-06 [pipeline_parallel_scheduler]: 1.70001e-06 [auto_monad_reorder]: 2.165e-05 [get_jit_bprop_graph]: 1.66e-06 [rewriter_after_jit_bprop_graph]: 4.15e-06 [opt_after_jit_grad]: 0.00050938 [validate]: 4.416e-05 Sums bootstrap : 0.000439s : 3.82% type_inference : 0.005787s : 50.32% event_method : 0.000020s : 0.17% auto_monad : 0.000065s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000029s : 0.25% optimize.rewriter_before_opt_a : 0.000088s : 0.77% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000052s : 0.45% optimize.opt_a.loop_unroll : 0.000038s : 0.33% optimize.opt_a.a_1 : 0.000837s : 7.28% optimize.opt_a.with_stream_mark : 0.000029s : 0.25% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000250s : 2.18% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.14% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000024s : 0.20% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000012s : 0.10% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.16% optimize.opt_a.virtual_dataset : 0.000016s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.13% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.10% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.23% optimize.opt_a.a_after_grad : 0.000024s : 0.21% optimize.opt_a.renormalize : 0.000649s : 5.65% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.24% optimize.opt_a.cse : 0.000053s : 0.46% optimize.opt_a.a_3 : 0.000136s : 1.18% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000046s : 0.40% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000553s : 4.81% optimize.opt_b.b_1 : 0.000198s : 1.72% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000031s : 0.27% optimize.loop_unroll : 0.000449s : 3.90% optimize.opt_after_cconv.c_1 : 0.000037s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000024s : 0.21% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.16% optimize.tuple_transform.d_1 : 0.000054s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000062s : 0.54% optimize.cse_after_recomputation.cse : 0.000015s : 0.13% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000025s : 0.22% optimize.reorder_send_recv_between_fp_bp : 0.000008s : 0.07% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000020s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000026s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.19% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000509s : 4.43% validate : 0.000044s : 0.38% Time group info: ------[substitution.] 0.000201 38 12.47% : 0.000025s : 3: substitution.cast_eliminate 1.11% : 0.000002s : 3: substitution.elim_not_effective 0.79% : 0.000002s : 3: substitution.fold_const_symbol 3.39% : 0.000007s : 5: substitution.graph_param_transform 68.44% : 0.000138s : 4: substitution.inline 2.22% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.89% : 0.000006s : 6: substitution.remove_not_recompute_node 2.37% : 0.000005s : 4: substitution.replace_old_param 6.31% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005736 2 87.53% : 0.005021s : 1: type_inference.infer 12.47% : 0.000715s : 1: type_inference.specialize ------[replace.] 0.000059 8 62.64% : 0.000037s : 4: replace.inline 37.36% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000146 8 92.55% : 0.000135s : 4: match.inline 7.45% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000238 1504 0.89% : 0.000002s : 15: predicate.accumulaten_eliminater 0.79% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.59% : 0.000001s : 10: predicate.addn_check_dump 0.87% : 0.000002s : 15: predicate.addn_zero_filter 0.88% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.06% : 0.000005s : 25: predicate.arithmetic_simplify 1.06% : 0.000003s : 15: predicate.cast_eliminate 0.69% : 0.000002s : 10: predicate.check_bprop_eliminate 0.68% : 0.000002s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.64% : 0.000002s : 10: predicate.depend_value_elim 0.92% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.97% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.07% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 5: predicate.elim_not_effective 0.41% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_depend_swap 1.74% : 0.000004s : 30: predicate.environ_get_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.39% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.20% : 0.000005s : 23: predicate.float_depend_g_call 0.58% : 0.000001s : 10: predicate.float_environ_get_switch 0.83% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.77% : 0.000002s : 10: predicate.get_grad_eliminate 0.23% : 0.000001s : 5: predicate.graph_param_transform 0.72% : 0.000002s : 10: predicate.incorporate_call 0.56% : 0.000001s : 10: predicate.incorporate_call_switch 6.29% : 0.000015s : 68: predicate.inline 0.79% : 0.000002s : 10: predicate.inline_without_move 0.36% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.82% : 0.000002s : 10: predicate.less_batch_normalization 1.80% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.47% : 0.000006s : 44: predicate.load_eliminater 1.08% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.12% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.60% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.63% : 0.000001s : 10: predicate.merge_addn 0.60% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.74% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.96% : 0.000002s : 15: predicate.minmaximum_grad 0.98% : 0.000002s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.64% : 0.000004s : 23: predicate.partial_defer_inline 1.64% : 0.000004s : 24: predicate.partial_eliminate 0.86% : 0.000002s : 15: predicate.print_const_string_wrapper 0.63% : 0.000002s : 10: predicate.reduce_all_const_elim 1.16% : 0.000003s : 15: predicate.reduce_eliminate 2.48% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 10: predicate.remove_not_recompute_node 1.40% : 0.000003s : 29: predicate.replace_applicator 0.49% : 0.000001s : 10: predicate.replace_old_param 0.24% : 0.000001s : 5: predicate.reset_defer_inline 1.04% : 0.000002s : 15: predicate.reshape_eliminate 0.79% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 5: predicate.row_tensor_eliminate 0.77% : 0.000002s : 10: predicate.same_eliminate 0.53% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.90% : 0.000002s : 10: predicate.shard_identity_eliminate 0.76% : 0.000002s : 10: predicate.special_op_eliminate 0.78% : 0.000002s : 10: predicate.specialize_transform 0.86% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.43% : 0.000003s : 23: predicate.switch_defer_inline 1.98% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.82% : 0.000011s : 74: predicate.switch_simplify 0.89% : 0.000002s : 15: predicate.tile_eliminate 1.01% : 0.000002s : 15: predicate.transpose_eliminate 1.53% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.32% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.53% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.35% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.71% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.42% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.16% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 5: predicate.value_based_eliminate 0.65% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 10: predicate.virtual_output_eliminate 0.31% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000552 11 54.45% : 0.000301s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.55% : 0.000252s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027495 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.41% : 0.003136s : 1: add_attr 11.35% : 0.003122s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.24% : 0.000066s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000075s : 1: auto_monad 0.10% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000005s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.76% : 0.000483s : 1: bootstrap 0.12% : 0.000034s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000023s : 1: control_data_broadcast_order 0.05% : 0.000015s : 1: convert_after_rewriter 0.13% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000018s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.11% : 0.000029s : 1: event_method 0.11% : 0.000030s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000013s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.66% : 0.000456s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.03% : 0.000559s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.07% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 4.95% : 0.001361s : 78: opt.transform.opt_a 0.13% : 0.000035s : 1: opt.transform.opt_after_cconv 0.11% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.49% : 0.000134s : 28: opt.transform.opt_b 0.22% : 0.000060s : 2: opt.transform.opt_trans_graph 0.17% : 0.000047s : 4: opt.transform.symbol_engine_opt 11.83% : 0.003253s : 1: opt_a 0.54% : 0.000147s : 1: opt_after_cconv 1.89% : 0.000520s : 1: opt_after_jit_grad 1.15% : 0.000315s : 1: opt_b 22.20% : 0.006104s : 1: optimize 0.09% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000040s : 1: pre_auto_parallel 0.12% : 0.000033s : 1: py_interpret_to_execute 0.07% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.25% : 0.000343s : 1: renormalize.infer 1.08% : 0.000298s : 1: renormalize.specialize 0.04% : 0.000012s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000050s : 1: rewriter_after_opt_a 0.34% : 0.000092s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000111s : 1: symbol_engine_optimizer 0.38% : 0.000105s : 1: tuple_transform 21.19% : 0.005827s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:33.202.003 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0581455, [21] [bootstrap]: 0.0003996 [type_inference]: 0.0341833 [event_method]: 1.898e-05 [auto_monad]: 6.263e-05 [graph_reusing]: 5.42001e-06 [inline]: 2.39999e-06 [add_attr]: 0.00303374, [1] [add_attr_with_inline]: 0.00302402, [1] [Cycle 1]: 5.781e-05, [2] [tag_attr]: 1.909e-05 [meta_addattr_fg_expand]: 6.25002e-06 [parallel-infer-symbol]: 3.44001e-06 [pre_auto_parallel]: 3.485e-05 [insert-virtual-dataset]: 3.08998e-06 [parallel-infer-symbol-second]: 9.39996e-07 [dataset_repeat_opt]: 2.14999e-06 [pipeline_split]: 1.71e-06 [optimize]: 0.0196562, [53] [py_interpret_to_execute]: 2.674e-05 [rewriter_before_opt_a]: 8.357e-05 [opt_a]: 0.00289232, [2] [Cycle 1]: 0.00209351, [45] [expand_dump_flag]: 3.01999e-06 [switch_simplify]: 4.391e-05 [loop_unroll]: 3.17e-05 [a_1]: 0.00067117 [with_stream_mark]: 1.674e-05 [recompute_prepare]: 1.085e-05 [updatestate_depend_eliminate]: 4.74e-06 [updatestate_assign_eliminate]: 3.97998e-06 [updatestate_loads_eliminate]: 3.73001e-06 [parameter_eliminate]: 1.86e-06 [a_2]: 0.00010538 [accelerated_algorithm]: 8.70999e-06 [shard]: 1.70001e-06 [meta_shard_fg_expand]: 2.44999e-06 [shard_inline]: 9.42001e-06 [merge_send_recv]: 1.03e-05 [auto_parallel]: 8.18001e-06 [parallel]: 1.852e-05 [flash_sp]: 8.47e-06 [merge_comm]: 5.07999e-06 [allreduce_fusion]: 4.45e-06 [matmul_add_comm_reduction]: 1.06e-05 [allreduce_slice_to_reducescatter]: 9.70002e-07 [virtual_shard_identity]: 1.1e-05 [virtual_dataset]: 9.34e-06 [get_grad_eliminate_]: 9.24e-06 [virtual_output]: 7.77e-06 [merge_forward]: 4.53999e-06 [cell_reuse_recompute_pass]: 1.25001e-06 [offload_activation]: 1.073e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.547e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.364e-05 [set_forward_comm_id_for_comm_node_pass]: 4.56002e-06 [meta_fg_expand]: 3.52997e-06 [flash_sp_send_recv_attached]: 2.67001e-06 [receive_attached]: 2.58998e-06 [after_resolve]: 1.347e-05 [a_after_grad]: 1.218e-05 [renormalize]: 0.00061502 [add_forward_monad_depend]: 5.42001e-06 [auto_monad_grad]: 2.14e-06 [auto_monad_eliminator]: 1.713e-05 [cse]: 3.52e-05 [a_3]: 5.757e-05 [Cycle 2]: 0.00078907, [45] [expand_dump_flag]: 1.04e-06 [switch_simplify]: 9.00001e-06 [loop_unroll]: 7.48999e-06 [a_1]: 0.00018042 [with_stream_mark]: 1.202e-05 [recompute_prepare]: 8.02e-06 [updatestate_depend_eliminate]: 3.88001e-06 [updatestate_assign_eliminate]: 2.93e-06 [updatestate_loads_eliminate]: 2.94999e-06 [parameter_eliminate]: 1.02e-06 [a_2]: 9.349e-05 [accelerated_algorithm]: 7.69002e-06 [shard]: 1.26002e-06 [meta_shard_fg_expand]: 1.49e-06 [shard_inline]: 7.59002e-06 [merge_send_recv]: 6.06e-06 [auto_parallel]: 5.95002e-06 [parallel]: 4.83001e-06 [flash_sp]: 3.21001e-06 [merge_comm]: 4.13001e-06 [allreduce_fusion]: 3.63e-06 [matmul_add_comm_reduction]: 6.96999e-06 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 9.45001e-06 [virtual_dataset]: 8.90001e-06 [get_grad_eliminate_]: 7.82e-06 [virtual_output]: 8.04997e-06 [merge_forward]: 4.11001e-06 [cell_reuse_recompute_pass]: 1.79998e-06 [offload_activation]: 8.32e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.608e-05 [merge_recompute_call_nodes]: 9.30013e-07 [before_grad]: 1.494e-05 [set_forward_comm_id_for_comm_node_pass]: 4.42e-06 [meta_fg_expand]: 2.77002e-06 [flash_sp_send_recv_attached]: 1.12e-06 [receive_attached]: 9.70002e-07 [after_resolve]: 1.243e-05 [a_after_grad]: 1.139e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.80001e-06 [auto_monad_grad]: 8.30012e-07 [auto_monad_eliminator]: 9.91e-06 [cse]: 1.843e-05 [a_3]: 5.013e-05 [py_interpret_to_execute_after_opt_a]: 1.086e-05 [slice_cell_reuse_recomputed_activation]: 1.96998e-06 [rewriter_after_opt_a]: 4.342e-05 [convert_after_rewriter]: 8.75001e-06 [order_py_execute_after_rewriter]: 5.79e-06 [mutable_eliminate]: 0.0149973 [opt_b]: 0.00028718, [1] [Cycle 1]: 0.00027902, [7] [b_1]: 0.00017077 [b_2]: 1.146e-05 [updatestate_depend_eliminate]: 8.64998e-06 [updatestate_assign_eliminate]: 3.75e-06 [updatestate_loads_eliminate]: 3.97e-06 [renormalize]: 7.89994e-07 [cse]: 4.137e-05 [optimize_parallel_all_gather_comm]: 2.077e-05 [overlap_param_gather]: 2.01e-06 [cconv]: 2.729e-05 [loop_unroll]: 0.00046055 [opt_after_cconv]: 0.00012099, [1] [Cycle 1]: 0.00011476, [7] [c_1]: 3.806e-05 [parameter_eliminate]: 4e-06 [updatestate_depend_eliminate]: 6.01e-06 [updatestate_assign_eliminate]: 3.25998e-06 [updatestate_loads_eliminate]: 2.91e-06 [cse]: 2.519e-05 [renormalize]: 7.39994e-07 [remove_dup_value]: 1.599e-05 [tuple_transform]: 0.00011098, [1] [Cycle 1]: 0.00010594, [4] [d_1]: 5.576e-05 [none_parameter_eliminate]: 1.54998e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 2.818e-05 [partial_unused_args_eliminate]: 2.21e-06 [add_recomputation]: 6.05e-05 [cse_after_recomputation]: 2.812e-05, [1] [Cycle 1]: 2.301e-05, [1] [cse]: 1.726e-05 [environ_conv]: 7.05998e-06 [swap_dp_allreduce_reducescatter]: 6.21e-06 [bias_add_comm_swap]: 3.03e-06 [label_micro_interleaved_index]: 4.68999e-06 [label_fine_grained_interleaved_index]: 2.83e-06 [merge_cast_opt]: 1.39e-06 [slice_recompute_activation]: 2.19999e-06 [micro_interleaved_order_control]: 2.66e-06 [assign_add_opt]: 1.62999e-06 [ForceFp32Comm]: 8.89995e-07 [remove_cast_before_assign_add]: 1.14e-06 [full_micro_interleaved_order_control]: 2.39999e-06 [reorder_send_recv_between_fp_bp]: 2.65002e-06 [comm_op_add_attrs]: 1.05001e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.10999e-06 [interleave_parallel_branches]: 1.32e-06 [overlap_opt_shard_in_pipeline]: 1.45999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.76e-06 [control_data_broadcast_order]: 1.54e-05 [grouped_pairwise_exchange_alltoall]: 2.04999e-06 [offloading_packed_experts]: 4.76002e-06 [overlap_recompute_and_grad_model_parallel]: 6.49001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.25999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.48002e-06 [overlap_recompute_comm]: 2.42001e-06 [overlap_grad_ring_attention]: 4.86002e-06 [overlap_grad_flash_sp]: 2.24e-05 [begin_end_overlap_inline]: 6.89994e-07 [split_matmul_comm_elemetwise]: 2.00002e-06 [split_layernorm_comm]: 1.97999e-06 [handle_group_info]: 1.27999e-06 [symbol_engine_optimizer]: 9.424e-05, [1] [Cycle 1]: 8.959e-05, [6] [build]: 3.68e-06 [elim_shapecalc]: 1.399e-05 [elim_not_effective]: 1.7e-05 [opt_reshape]: 9.34998e-06 [fold_const_symbol]: 1.431e-05 [renormalize]: 2.89991e-07 [detach_backward]: 2.47001e-06 [pipeline_parallel_scheduler]: 2.01e-06 [auto_monad_reorder]: 1.975e-05 [get_jit_bprop_graph]: 2.18998e-06 [rewriter_after_jit_bprop_graph]: 4.92999e-06 [opt_after_jit_grad]: 0.00050944 [validate]: 4.302e-05 Sums bootstrap : 0.000400s : 0.74% type_inference : 0.034183s : 63.18% event_method : 0.000019s : 0.04% auto_monad : 0.000063s : 0.12% graph_reusing : 0.000005s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000035s : 0.06% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000027s : 0.05% optimize.rewriter_before_opt_a : 0.000084s : 0.15% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000053s : 0.10% optimize.opt_a.loop_unroll : 0.000039s : 0.07% optimize.opt_a.a_1 : 0.000852s : 1.57% optimize.opt_a.with_stream_mark : 0.000029s : 0.05% optimize.opt_a.recompute_prepare : 0.000019s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000199s : 0.37% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.03% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000017s : 0.03% optimize.opt_a.merge_send_recv : 0.000016s : 0.03% optimize.opt_a.auto_parallel : 0.000014s : 0.03% optimize.opt_a.parallel : 0.000023s : 0.04% optimize.opt_a.flash_sp : 0.000012s : 0.02% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.04% optimize.opt_a.virtual_dataset : 0.000018s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.03% optimize.opt_a.virtual_output : 0.000016s : 0.03% optimize.opt_a.merge_forward : 0.000009s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000029s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000026s : 0.05% optimize.opt_a.a_after_grad : 0.000024s : 0.04% optimize.opt_a.renormalize : 0.000615s : 1.14% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.05% optimize.opt_a.cse : 0.000054s : 0.10% optimize.opt_a.a_3 : 0.000108s : 0.20% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000043s : 0.08% optimize.convert_after_rewriter : 0.000009s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.01% optimize.mutable_eliminate : 0.014997s : 27.72% optimize.opt_b.b_1 : 0.000171s : 0.32% optimize.opt_b.b_2 : 0.000011s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000041s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000027s : 0.05% optimize.loop_unroll : 0.000461s : 0.85% optimize.opt_after_cconv.c_1 : 0.000038s : 0.07% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000025s : 0.05% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.03% optimize.tuple_transform.d_1 : 0.000056s : 0.10% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000028s : 0.05% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000060s : 0.11% optimize.cse_after_recomputation.cse : 0.000017s : 0.03% optimize.environ_conv : 0.000007s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000022s : 0.04% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000509s : 0.94% validate : 0.000043s : 0.08% Time group info: ------[substitution.] 0.000203 38 11.45% : 0.000023s : 3: substitution.cast_eliminate 1.13% : 0.000002s : 3: substitution.elim_not_effective 0.90% : 0.000002s : 3: substitution.fold_const_symbol 3.58% : 0.000007s : 5: substitution.graph_param_transform 68.35% : 0.000139s : 4: substitution.inline 2.62% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.05% : 0.000006s : 6: substitution.remove_not_recompute_node 2.41% : 0.000005s : 4: substitution.replace_old_param 6.50% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.034123 2 98.01% : 0.033444s : 1: type_inference.infer 1.99% : 0.000679s : 1: type_inference.specialize ------[replace.] 0.000061 8 62.98% : 0.000039s : 4: replace.inline 37.02% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000148 8 92.30% : 0.000136s : 4: match.inline 7.70% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000243 1504 0.85% : 0.000002s : 15: predicate.accumulaten_eliminater 0.82% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.61% : 0.000001s : 10: predicate.addn_check_dump 0.93% : 0.000002s : 15: predicate.addn_zero_filter 0.78% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.86% : 0.000005s : 25: predicate.arithmetic_simplify 1.02% : 0.000002s : 15: predicate.cast_eliminate 0.62% : 0.000002s : 10: predicate.check_bprop_eliminate 0.60% : 0.000001s : 10: predicate.compare_switch_simplify 0.21% : 0.000000s : 5: predicate.const_output_eliminate 0.64% : 0.000002s : 10: predicate.depend_value_elim 0.99% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.95% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.87% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.38% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.16% : 0.000003s : 20: predicate.environ_get_depend_swap 1.84% : 0.000004s : 30: predicate.environ_get_eliminate 1.14% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.39% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.30% : 0.000006s : 23: predicate.float_depend_g_call 0.54% : 0.000001s : 10: predicate.float_environ_get_switch 0.86% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.69% : 0.000002s : 10: predicate.get_grad_eliminate 0.29% : 0.000001s : 5: predicate.graph_param_transform 0.65% : 0.000002s : 10: predicate.incorporate_call 0.54% : 0.000001s : 10: predicate.incorporate_call_switch 6.23% : 0.000015s : 68: predicate.inline 0.82% : 0.000002s : 10: predicate.inline_without_move 0.36% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.72% : 0.000002s : 10: predicate.less_batch_normalization 1.79% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.65% : 0.000006s : 44: predicate.load_eliminater 0.87% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.15% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.68% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.63% : 0.000002s : 10: predicate.merge_addn 0.63% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.66% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 15: predicate.minmaximum_grad 1.48% : 0.000004s : 5: predicate.mutable_eliminate 0.46% : 0.000001s : 5: predicate.opt_reshape 0.35% : 0.000001s : 5: predicate.parallel_virtual_node 1.66% : 0.000004s : 23: predicate.partial_defer_inline 1.63% : 0.000004s : 24: predicate.partial_eliminate 0.86% : 0.000002s : 15: predicate.print_const_string_wrapper 0.62% : 0.000002s : 10: predicate.reduce_all_const_elim 1.22% : 0.000003s : 15: predicate.reduce_eliminate 2.44% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 10: predicate.remove_not_recompute_node 1.34% : 0.000003s : 29: predicate.replace_applicator 0.51% : 0.000001s : 10: predicate.replace_old_param 0.32% : 0.000001s : 5: predicate.reset_defer_inline 0.93% : 0.000002s : 15: predicate.reshape_eliminate 0.66% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 5: predicate.row_tensor_eliminate 0.81% : 0.000002s : 10: predicate.same_eliminate 0.47% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 10: predicate.shard_identity_eliminate 0.71% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 0.84% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.43% : 0.000003s : 23: predicate.switch_defer_inline 2.00% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.90% : 0.000012s : 74: predicate.switch_simplify 0.89% : 0.000002s : 15: predicate.tile_eliminate 0.93% : 0.000002s : 15: predicate.transpose_eliminate 1.49% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.69% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.47% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.50% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.37% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.80% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.45% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.18% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.73% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 10: predicate.virtual_output_eliminate 0.30% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000507 11 55.95% : 0.000284s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.05% : 0.000224s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.083090 192 0.00% : 0.000004s : 1: ForceFp32Comm 3.66% : 0.003039s : 1: add_attr 3.64% : 0.003028s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.08% : 0.000065s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.08% : 0.000069s : 1: auto_monad 0.03% : 0.000024s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.51% : 0.000428s : 1: bootstrap 0.04% : 0.000031s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000019s : 1: control_data_broadcast_order 0.01% : 0.000012s : 1: convert_after_rewriter 0.04% : 0.000031s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.03% : 0.000025s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.56% : 0.000469s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 18.07% : 0.015012s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000027s : 1: opt.transform.mutable_eliminate 1.68% : 0.001392s : 78: opt.transform.opt_a 0.04% : 0.000036s : 1: opt.transform.opt_after_cconv 0.04% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.17% : 0.000145s : 28: opt.transform.opt_b 0.10% : 0.000082s : 2: opt.transform.opt_trans_graph 0.06% : 0.000050s : 4: opt.transform.symbol_engine_opt 3.48% : 0.002896s : 1: opt_a 0.15% : 0.000125s : 1: opt_after_cconv 0.62% : 0.000519s : 1: opt_after_jit_grad 0.35% : 0.000292s : 1: opt_b 23.66% : 0.019662s : 1: optimize 0.03% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.03% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.05% : 0.000039s : 1: pre_auto_parallel 0.04% : 0.000031s : 1: py_interpret_to_execute 0.02% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000005s : 1: remove_cast_before_assign_add 0.02% : 0.000020s : 1: remove_dup_value 0.40% : 0.000332s : 1: renormalize.infer 0.33% : 0.000275s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000048s : 1: rewriter_after_opt_a 0.11% : 0.000088s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.12% : 0.000097s : 1: symbol_engine_optimizer 0.14% : 0.000114s : 1: tuple_transform 41.16% : 0.034201s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:33.607.229 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:33.607.481 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0191161, [21] [bootstrap]: 0.00042901 [type_inference]: 0.00858018 [event_method]: 2.049e-05 [auto_monad]: 6.453e-05 [graph_reusing]: 5.52999e-06 [inline]: 2.12001e-06 [add_attr]: 0.00336682, [1] [add_attr_with_inline]: 0.00335648, [1] [Cycle 1]: 7.824e-05, [2] [tag_attr]: 1.943e-05 [meta_addattr_fg_expand]: 5.84e-06 [parallel-infer-symbol]: 3.32002e-06 [pre_auto_parallel]: 3.552e-05 [insert-virtual-dataset]: 2.36e-06 [parallel-infer-symbol-second]: 7.79983e-07 [dataset_repeat_opt]: 2.18002e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00550454, [53] [py_interpret_to_execute]: 3.082e-05 [rewriter_before_opt_a]: 8.501e-05 [opt_a]: 0.00318196, [2] [Cycle 1]: 0.00231385, [45] [expand_dump_flag]: 2.64999e-06 [switch_simplify]: 4.251e-05 [loop_unroll]: 2.975e-05 [a_1]: 0.00065112 [with_stream_mark]: 1.687e-05 [recompute_prepare]: 1e-05 [updatestate_depend_eliminate]: 3.85998e-06 [updatestate_assign_eliminate]: 3.81001e-06 [updatestate_loads_eliminate]: 3.18e-06 [parameter_eliminate]: 2.34999e-06 [a_2]: 0.00011552 [accelerated_algorithm]: 7.85e-06 [shard]: 1.91e-06 [meta_shard_fg_expand]: 2.02001e-06 [shard_inline]: 6.81001e-06 [merge_send_recv]: 8.22e-06 [auto_parallel]: 7.18e-06 [parallel]: 2.023e-05 [flash_sp]: 9.52001e-06 [merge_comm]: 3.98001e-06 [allreduce_fusion]: 3.55e-06 [matmul_add_comm_reduction]: 9.84001e-06 [allreduce_slice_to_reducescatter]: 8.30012e-07 [virtual_shard_identity]: 9.32001e-06 [virtual_dataset]: 6.68e-06 [get_grad_eliminate_]: 7.70998e-06 [virtual_output]: 7.32997e-06 [merge_forward]: 4.52998e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 1.072e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.45e-05 [merge_recompute_call_nodes]: 1.92001e-06 [before_grad]: 1.145e-05 [set_forward_comm_id_for_comm_node_pass]: 4.08999e-06 [meta_fg_expand]: 2.96001e-06 [flash_sp_send_recv_attached]: 2.39001e-06 [receive_attached]: 2.17999e-06 [after_resolve]: 1.294e-05 [a_after_grad]: 1.154e-05 [renormalize]: 0.00067262 [add_forward_monad_depend]: 5.71e-06 [auto_monad_grad]: 2.76e-06 [auto_monad_eliminator]: 1.443e-05 [cse]: 2.457e-05 [a_3]: 6.268e-05 [Cycle 2]: 0.00085398, [45] [expand_dump_flag]: 1.30001e-06 [switch_simplify]: 8.27e-06 [loop_unroll]: 6.31998e-06 [a_1]: 0.00012944 [with_stream_mark]: 1.301e-05 [recompute_prepare]: 8.06001e-06 [updatestate_depend_eliminate]: 3.26999e-06 [updatestate_assign_eliminate]: 3.25002e-06 [updatestate_loads_eliminate]: 2.49999e-06 [parameter_eliminate]: 1.42999e-06 [a_2]: 0.00010069 [accelerated_algorithm]: 6.29999e-06 [shard]: 1.29e-06 [meta_shard_fg_expand]: 1.52001e-06 [shard_inline]: 6.55002e-06 [merge_send_recv]: 5.15999e-06 [auto_parallel]: 5.91998e-06 [parallel]: 5.39998e-06 [flash_sp]: 3.9e-06 [merge_comm]: 3.60998e-06 [allreduce_fusion]: 3.22002e-06 [matmul_add_comm_reduction]: 6.11e-06 [allreduce_slice_to_reducescatter]: 5.8001e-07 [virtual_shard_identity]: 7.13e-06 [virtual_dataset]: 6.03002e-06 [get_grad_eliminate_]: 5.71e-06 [virtual_output]: 6.39001e-06 [merge_forward]: 3.18e-06 [cell_reuse_recompute_pass]: 1.50001e-06 [offload_activation]: 7.35e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.738e-05 [merge_recompute_call_nodes]: 1.17e-06 [before_grad]: 1.104e-05 [set_forward_comm_id_for_comm_node_pass]: 4.13999e-06 [meta_fg_expand]: 2.22999e-06 [flash_sp_send_recv_attached]: 9.99979e-07 [receive_attached]: 9.89996e-07 [after_resolve]: 1.147e-05 [a_after_grad]: 1.009e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.50001e-06 [auto_monad_grad]: 1.30001e-06 [auto_monad_eliminator]: 8.65999e-06 [cse]: 1.477e-05 [a_3]: 4.921e-05 [py_interpret_to_execute_after_opt_a]: 1.356e-05 [slice_cell_reuse_recomputed_activation]: 5.30001e-06 [rewriter_after_opt_a]: 3.873e-05 [convert_after_rewriter]: 1.019e-05 [order_py_execute_after_rewriter]: 7.96001e-06 [mutable_eliminate]: 0.00055893 [opt_b]: 0.00027269, [1] [Cycle 1]: 0.00026147, [7] [b_1]: 0.00016639 [b_2]: 8e-06 [updatestate_depend_eliminate]: 5.63002e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.49001e-06 [renormalize]: 7.2e-07 [cse]: 1.963e-05 [optimize_parallel_all_gather_comm]: 2.228e-05 [overlap_param_gather]: 5.47999e-06 [cconv]: 2.969e-05 [loop_unroll]: 0.00045376 [opt_after_cconv]: 0.00012411, [1] [Cycle 1]: 0.00011517, [7] [c_1]: 3.087e-05 [parameter_eliminate]: 2.91e-06 [updatestate_depend_eliminate]: 5.13002e-06 [updatestate_assign_eliminate]: 2.58e-06 [updatestate_loads_eliminate]: 2.33998e-06 [cse]: 1.647e-05 [renormalize]: 4.39992e-07 [remove_dup_value]: 1.598e-05 [tuple_transform]: 8.845e-05, [1] [Cycle 1]: 8.174e-05, [4] [d_1]: 4.308e-05 [none_parameter_eliminate]: 1.57001e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 7.38e-06 [partial_unused_args_eliminate]: 4.55999e-06 [add_recomputation]: 4.578e-05 [cse_after_recomputation]: 2.603e-05, [1] [Cycle 1]: 1.938e-05, [1] [cse]: 1.048e-05 [environ_conv]: 8.57e-06 [swap_dp_allreduce_reducescatter]: 7.68999e-06 [bias_add_comm_swap]: 5.04e-06 [label_micro_interleaved_index]: 6.76e-06 [label_fine_grained_interleaved_index]: 5.07e-06 [merge_cast_opt]: 3.89002e-06 [slice_recompute_activation]: 4.95001e-06 [micro_interleaved_order_control]: 4.47e-06 [assign_add_opt]: 3.66999e-06 [ForceFp32Comm]: 3.16999e-06 [remove_cast_before_assign_add]: 3.24001e-06 [full_micro_interleaved_order_control]: 4.45e-06 [reorder_send_recv_between_fp_bp]: 5.23002e-06 [comm_op_add_attrs]: 3.72998e-06 [add_comm_op_reuse_tag]: 3.31999e-06 [interleave_split_concat_branches]: 3.44001e-06 [interleave_parallel_branches]: 3.47997e-06 [overlap_opt_shard_in_pipeline]: 4.05998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.17003e-06 [control_data_broadcast_order]: 1.511e-05 [grouped_pairwise_exchange_alltoall]: 3.78001e-06 [offloading_packed_experts]: 6.34001e-06 [overlap_recompute_and_grad_model_parallel]: 7.43e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.81001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.78999e-06 [overlap_recompute_comm]: 4.74e-06 [overlap_grad_ring_attention]: 6.49001e-06 [overlap_grad_flash_sp]: 2.058e-05 [begin_end_overlap_inline]: 2.89999e-06 [split_matmul_comm_elemetwise]: 4.75999e-06 [split_layernorm_comm]: 4.02998e-06 [handle_group_info]: 3.66001e-06 [symbol_engine_optimizer]: 9.45e-05, [1] [Cycle 1]: 8.789e-05, [6] [build]: 2.88998e-06 [elim_shapecalc]: 9.25001e-06 [elim_not_effective]: 1.268e-05 [opt_reshape]: 7.14001e-06 [fold_const_symbol]: 1.001e-05 [renormalize]: 2.80008e-07 [detach_backward]: 3.44001e-06 [pipeline_parallel_scheduler]: 2.09e-06 [auto_monad_reorder]: 1.85e-05 [get_jit_bprop_graph]: 1.62001e-06 [rewriter_after_jit_bprop_graph]: 4.2e-06 [opt_after_jit_grad]: 0.00047088 [validate]: 3.563e-05 Sums bootstrap : 0.000429s : 3.08% type_inference : 0.008580s : 61.50% event_method : 0.000020s : 0.15% auto_monad : 0.000065s : 0.46% graph_reusing : 0.000006s : 0.04% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000036s : 0.25% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.22% optimize.rewriter_before_opt_a : 0.000085s : 0.61% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000051s : 0.36% optimize.opt_a.loop_unroll : 0.000036s : 0.26% optimize.opt_a.a_1 : 0.000781s : 5.59% optimize.opt_a.with_stream_mark : 0.000030s : 0.21% optimize.opt_a.recompute_prepare : 0.000018s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.04% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000216s : 1.55% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.10% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.10% optimize.opt_a.merge_send_recv : 0.000013s : 0.10% optimize.opt_a.auto_parallel : 0.000013s : 0.09% optimize.opt_a.parallel : 0.000026s : 0.18% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.05% optimize.opt_a.allreduce_fusion : 0.000007s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.11% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.12% optimize.opt_a.virtual_dataset : 0.000013s : 0.09% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.10% optimize.opt_a.virtual_output : 0.000014s : 0.10% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.13% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000042s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.16% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.02% optimize.opt_a.after_resolve : 0.000024s : 0.17% optimize.opt_a.a_after_grad : 0.000022s : 0.16% optimize.opt_a.renormalize : 0.000673s : 4.82% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.05% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.17% optimize.opt_a.cse : 0.000039s : 0.28% optimize.opt_a.a_3 : 0.000112s : 0.80% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000039s : 0.28% optimize.convert_after_rewriter : 0.000010s : 0.07% optimize.order_py_execute_after_rewriter : 0.000008s : 0.06% optimize.mutable_eliminate : 0.000559s : 4.01% optimize.opt_b.b_1 : 0.000166s : 1.19% optimize.opt_b.b_2 : 0.000008s : 0.06% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000020s : 0.14% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.16% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000030s : 0.21% optimize.loop_unroll : 0.000454s : 3.25% optimize.opt_after_cconv.c_1 : 0.000031s : 0.22% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000016s : 0.12% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.11% optimize.tuple_transform.d_1 : 0.000043s : 0.31% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.05% optimize.partial_unused_args_eliminate : 0.000005s : 0.03% optimize.add_recomputation : 0.000046s : 0.33% optimize.cse_after_recomputation.cse : 0.000010s : 0.08% optimize.environ_conv : 0.000009s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.06% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000004s : 0.03% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000003s : 0.02% optimize.full_micro_interleaved_order_control : 0.000004s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000003s : 0.02% optimize.interleave_parallel_branches : 0.000003s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000015s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000006s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.03% optimize.overlap_grad_ring_attention : 0.000006s : 0.05% optimize.overlap_grad_flash_sp : 0.000021s : 0.15% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.03% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.07% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.09% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.05% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.07% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.13% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000471s : 3.38% validate : 0.000036s : 0.26% Time group info: ------[substitution.] 0.000188 28 0.96% : 0.000002s : 2: substitution.elim_not_effective 0.72% : 0.000001s : 2: substitution.fold_const_symbol 3.23% : 0.000006s : 4: substitution.graph_param_transform 78.20% : 0.000147s : 4: substitution.inline 2.24% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.23% : 0.000006s : 4: substitution.remove_not_recompute_node 3.06% : 0.000006s : 4: substitution.replace_old_param 8.35% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.008525 2 54.10% : 0.004612s : 1: type_inference.infer 45.90% : 0.003913s : 1: type_inference.specialize ------[replace.] 0.000063 8 61.42% : 0.000039s : 4: replace.inline 38.58% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000157 8 91.42% : 0.000144s : 4: match.inline 8.58% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000208 1278 0.93% : 0.000002s : 13: predicate.accumulaten_eliminater 0.66% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 8: predicate.addn_check_dump 0.92% : 0.000002s : 13: predicate.addn_zero_filter 0.88% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.07% : 0.000004s : 21: predicate.arithmetic_simplify 1.02% : 0.000002s : 13: predicate.cast_eliminate 0.58% : 0.000001s : 8: predicate.check_bprop_eliminate 0.53% : 0.000001s : 8: predicate.compare_switch_simplify 0.20% : 0.000000s : 4: predicate.const_output_eliminate 0.58% : 0.000001s : 8: predicate.depend_value_elim 0.91% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.32% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.22% : 0.000003s : 17: predicate.environ_get_add_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_depend_swap 1.82% : 0.000004s : 25: predicate.environ_get_eliminate 1.17% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.46% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.50% : 0.000005s : 21: predicate.float_depend_g_call 0.58% : 0.000001s : 8: predicate.float_environ_get_switch 0.85% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.79% : 0.000002s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.58% : 0.000001s : 8: predicate.incorporate_call 0.49% : 0.000001s : 8: predicate.incorporate_call_switch 6.46% : 0.000013s : 58: predicate.inline 0.88% : 0.000002s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.73% : 0.000002s : 8: predicate.less_batch_normalization 1.93% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.46% : 0.000005s : 38: predicate.load_eliminater 0.89% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.26% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.69% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.61% : 0.000001s : 8: predicate.merge_addn 0.58% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.65% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 13: predicate.minmaximum_grad 0.92% : 0.000002s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.38% : 0.000001s : 4: predicate.parallel_virtual_node 1.77% : 0.000004s : 21: predicate.partial_defer_inline 1.62% : 0.000003s : 21: predicate.partial_eliminate 0.92% : 0.000002s : 13: predicate.print_const_string_wrapper 0.65% : 0.000001s : 8: predicate.reduce_all_const_elim 1.10% : 0.000002s : 13: predicate.reduce_eliminate 2.52% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 8: predicate.remove_not_recompute_node 1.26% : 0.000003s : 25: predicate.replace_applicator 0.37% : 0.000001s : 8: predicate.replace_old_param 0.36% : 0.000001s : 4: predicate.reset_defer_inline 0.89% : 0.000002s : 13: predicate.reshape_eliminate 0.71% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 4: predicate.row_tensor_eliminate 0.82% : 0.000002s : 8: predicate.same_eliminate 0.48% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.09% : 0.000002s : 8: predicate.shard_identity_eliminate 0.70% : 0.000001s : 8: predicate.special_op_eliminate 0.89% : 0.000002s : 8: predicate.specialize_transform 0.77% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.53% : 0.000003s : 21: predicate.switch_defer_inline 2.51% : 0.000005s : 29: predicate.switch_layer_defer_inline 5.48% : 0.000011s : 67: predicate.switch_simplify 0.92% : 0.000002s : 13: predicate.tile_eliminate 0.87% : 0.000002s : 13: predicate.transpose_eliminate 1.44% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.50% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.25% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.37% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.09% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.67% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.36% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.11% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 4: predicate.value_based_eliminate 0.63% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.71% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000551 11 48.15% : 0.000265s : 5: func_graph_cloner_run.FuncGraphClonerGraph 51.85% : 0.000286s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030031 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.24% : 0.003377s : 1: add_attr 11.19% : 0.003361s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.16% : 0.000049s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.24% : 0.000074s : 1: auto_monad 0.09% : 0.000026s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.56% : 0.000469s : 1: bootstrap 0.11% : 0.000033s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.06% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.10% : 0.000029s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000017s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.10% : 0.000031s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.53% : 0.000460s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 1.89% : 0.000566s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.05% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000016s : 1: opt.transform.mutable_eliminate 4.09% : 0.001230s : 78: opt.transform.opt_a 0.10% : 0.000030s : 1: opt.transform.opt_after_cconv 0.08% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.34% : 0.000102s : 28: opt.transform.opt_b 0.16% : 0.000048s : 2: opt.transform.opt_trans_graph 0.12% : 0.000036s : 4: opt.transform.symbol_engine_opt 10.61% : 0.003185s : 1: opt_a 0.42% : 0.000128s : 1: opt_after_cconv 1.60% : 0.000481s : 1: opt_after_jit_grad 0.92% : 0.000277s : 1: opt_b 19.38% : 0.005819s : 1: optimize 0.09% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.08% : 0.000024s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000043s : 1: pre_auto_parallel 0.12% : 0.000035s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.06% : 0.000019s : 1: remove_dup_value 1.18% : 0.000355s : 1: renormalize.infer 1.03% : 0.000309s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000043s : 1: rewriter_after_opt_a 0.30% : 0.000089s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000097s : 1: symbol_engine_optimizer 0.30% : 0.000091s : 1: tuple_transform 28.72% : 0.008625s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:34.118.062 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0340693, [21] [bootstrap]: 0.00044158 [type_inference]: 0.0252201 [event_method]: 1.877e-05 [auto_monad]: 6.378e-05 [graph_reusing]: 6.00002e-06 [inline]: 2.26e-06 [add_attr]: 0.00318363, [1] [add_attr_with_inline]: 0.00317526, [1] [Cycle 1]: 5.656e-05, [2] [tag_attr]: 1.865e-05 [meta_addattr_fg_expand]: 5.71e-06 [parallel-infer-symbol]: 3.11001e-06 [pre_auto_parallel]: 3.197e-05 [insert-virtual-dataset]: 2.71e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 1.89999e-06 [pipeline_split]: 1.92999e-06 [optimize]: 0.00443524, [53] [py_interpret_to_execute]: 2.577e-05 [rewriter_before_opt_a]: 7.917e-05 [opt_a]: 0.00249974, [2] [Cycle 1]: 0.00185118, [45] [expand_dump_flag]: 2.81999e-06 [switch_simplify]: 3.852e-05 [loop_unroll]: 3.066e-05 [a_1]: 0.00060538 [with_stream_mark]: 1.607e-05 [recompute_prepare]: 8.45001e-06 [updatestate_depend_eliminate]: 3.70998e-06 [updatestate_assign_eliminate]: 3.46999e-06 [updatestate_loads_eliminate]: 3.15002e-06 [parameter_eliminate]: 2.36998e-06 [a_2]: 8.152e-05 [accelerated_algorithm]: 6.86999e-06 [shard]: 1.79998e-06 [meta_shard_fg_expand]: 1.82001e-06 [shard_inline]: 6.64999e-06 [merge_send_recv]: 7.85e-06 [auto_parallel]: 5.80002e-06 [parallel]: 1.856e-05 [flash_sp]: 7e-06 [merge_comm]: 3.90998e-06 [allreduce_fusion]: 3.43e-06 [matmul_add_comm_reduction]: 9.20001e-06 [allreduce_slice_to_reducescatter]: 7.80012e-07 [virtual_shard_identity]: 8.04997e-06 [virtual_dataset]: 6.79001e-06 [get_grad_eliminate_]: 6.38e-06 [virtual_output]: 6.46999e-06 [merge_forward]: 4.03001e-06 [cell_reuse_recompute_pass]: 1.09998e-06 [offload_activation]: 9.67001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.187e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.077e-05 [set_forward_comm_id_for_comm_node_pass]: 3.67002e-06 [meta_fg_expand]: 2.69999e-06 [flash_sp_send_recv_attached]: 2.67001e-06 [receive_attached]: 2.22001e-06 [after_resolve]: 1.118e-05 [a_after_grad]: 1.004e-05 [renormalize]: 0.00054794 [add_forward_monad_depend]: 5.09e-06 [auto_monad_grad]: 2.56e-06 [auto_monad_eliminator]: 1.371e-05 [cse]: 2.792e-05 [a_3]: 4.569e-05 [Cycle 2]: 0.00063944, [45] [expand_dump_flag]: 1.05999e-06 [switch_simplify]: 7.71999e-06 [loop_unroll]: 6.14001e-06 [a_1]: 0.00013076 [with_stream_mark]: 1.084e-05 [recompute_prepare]: 6.61e-06 [updatestate_depend_eliminate]: 2.79999e-06 [updatestate_assign_eliminate]: 2.31998e-06 [updatestate_loads_eliminate]: 2.48e-06 [parameter_eliminate]: 1.28002e-06 [a_2]: 7.239e-05 [accelerated_algorithm]: 6.36e-06 [shard]: 1.15001e-06 [meta_shard_fg_expand]: 1.28002e-06 [shard_inline]: 6.12999e-06 [merge_send_recv]: 4.17e-06 [auto_parallel]: 5.05001e-06 [parallel]: 4.35999e-06 [flash_sp]: 3.68e-06 [merge_comm]: 3.27997e-06 [allreduce_fusion]: 3.04001e-06 [matmul_add_comm_reduction]: 5.04998e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 6.39999e-06 [virtual_dataset]: 5.97999e-06 [get_grad_eliminate_]: 5.74e-06 [virtual_output]: 5.79e-06 [merge_forward]: 2.68e-06 [cell_reuse_recompute_pass]: 1.54e-06 [offload_activation]: 6.31998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.203e-05 [merge_recompute_call_nodes]: 9.70002e-07 [before_grad]: 8.95999e-06 [set_forward_comm_id_for_comm_node_pass]: 2.98998e-06 [meta_fg_expand]: 1.99999e-06 [flash_sp_send_recv_attached]: 8.39995e-07 [receive_attached]: 9.50007e-07 [after_resolve]: 1.014e-05 [a_after_grad]: 8.95999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.49998e-06 [auto_monad_grad]: 9.09989e-07 [auto_monad_eliminator]: 7.46001e-06 [cse]: 1.318e-05 [a_3]: 3.661e-05 [py_interpret_to_execute_after_opt_a]: 7.92003e-06 [slice_cell_reuse_recomputed_activation]: 1.84998e-06 [rewriter_after_opt_a]: 3.359e-05 [convert_after_rewriter]: 7.07002e-06 [order_py_execute_after_rewriter]: 5.33002e-06 [mutable_eliminate]: 0.00048538 [opt_b]: 0.00019801, [1] [Cycle 1]: 0.00019206, [7] [b_1]: 0.00012217 [b_2]: 7.63001e-06 [updatestate_depend_eliminate]: 4.52998e-06 [updatestate_assign_eliminate]: 2.26998e-06 [updatestate_loads_eliminate]: 2.39001e-06 [renormalize]: 3.30008e-07 [cse]: 1.725e-05 [optimize_parallel_all_gather_comm]: 1.586e-05 [overlap_param_gather]: 1.82999e-06 [cconv]: 2.332e-05 [loop_unroll]: 0.00041432 [opt_after_cconv]: 9.995e-05, [1] [Cycle 1]: 9.433e-05, [7] [c_1]: 3.068e-05 [parameter_eliminate]: 3.75e-06 [updatestate_depend_eliminate]: 5.02999e-06 [updatestate_assign_eliminate]: 2.31998e-06 [updatestate_loads_eliminate]: 2.18998e-06 [cse]: 1.625e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 1.314e-05 [tuple_transform]: 7.397e-05, [1] [Cycle 1]: 6.946e-05, [4] [d_1]: 4.303e-05 [none_parameter_eliminate]: 1.51998e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 6.51e-06 [partial_unused_args_eliminate]: 1.79998e-06 [add_recomputation]: 4.548e-05 [cse_after_recomputation]: 2.012e-05, [1] [Cycle 1]: 1.581e-05, [1] [cse]: 1.032e-05 [environ_conv]: 5.17e-06 [swap_dp_allreduce_reducescatter]: 5.35999e-06 [bias_add_comm_swap]: 2.44001e-06 [label_micro_interleaved_index]: 4.72e-06 [label_fine_grained_interleaved_index]: 2.53e-06 [merge_cast_opt]: 1.59e-06 [slice_recompute_activation]: 2.01998e-06 [micro_interleaved_order_control]: 2.41e-06 [assign_add_opt]: 1.20999e-06 [ForceFp32Comm]: 9.09989e-07 [remove_cast_before_assign_add]: 1.11002e-06 [full_micro_interleaved_order_control]: 2.14e-06 [reorder_send_recv_between_fp_bp]: 2.96001e-06 [comm_op_add_attrs]: 1.19e-06 [add_comm_op_reuse_tag]: 1.02998e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.06002e-06 [overlap_opt_shard_in_pipeline]: 1.17999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.89e-06 [control_data_broadcast_order]: 1.203e-05 [grouped_pairwise_exchange_alltoall]: 1.52001e-06 [offloading_packed_experts]: 3.63999e-06 [overlap_recompute_and_grad_model_parallel]: 4.66002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40999e-06 [overlap_recompute_comm]: 2.21998e-06 [overlap_grad_ring_attention]: 3.83999e-06 [overlap_grad_flash_sp]: 1.702e-05 [begin_end_overlap_inline]: 5.49975e-07 [split_matmul_comm_elemetwise]: 2.51e-06 [split_layernorm_comm]: 1.55001e-06 [handle_group_info]: 9.70002e-07 [symbol_engine_optimizer]: 7.401e-05, [1] [Cycle 1]: 6.963e-05, [6] [build]: 2.86999e-06 [elim_shapecalc]: 9.48002e-06 [elim_not_effective]: 1.236e-05 [opt_reshape]: 6.63e-06 [fold_const_symbol]: 1.018e-05 [renormalize]: 2.40019e-07 [detach_backward]: 1.84e-06 [pipeline_parallel_scheduler]: 1.54e-06 [auto_monad_reorder]: 1.659e-05 [get_jit_bprop_graph]: 1.71e-06 [rewriter_after_jit_bprop_graph]: 3.73001e-06 [opt_after_jit_grad]: 0.00044825 [validate]: 3.568e-05 Sums bootstrap : 0.000442s : 1.47% type_inference : 0.025220s : 84.21% event_method : 0.000019s : 0.06% auto_monad : 0.000064s : 0.21% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000032s : 0.11% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000026s : 0.09% optimize.rewriter_before_opt_a : 0.000079s : 0.26% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000046s : 0.15% optimize.opt_a.loop_unroll : 0.000037s : 0.12% optimize.opt_a.a_1 : 0.000736s : 2.46% optimize.opt_a.with_stream_mark : 0.000027s : 0.09% optimize.opt_a.recompute_prepare : 0.000015s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000154s : 0.51% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.04% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.04% optimize.opt_a.merge_send_recv : 0.000012s : 0.04% optimize.opt_a.auto_parallel : 0.000011s : 0.04% optimize.opt_a.parallel : 0.000023s : 0.08% optimize.opt_a.flash_sp : 0.000011s : 0.04% optimize.opt_a.merge_comm : 0.000007s : 0.02% optimize.opt_a.allreduce_fusion : 0.000006s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.05% optimize.opt_a.virtual_dataset : 0.000013s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.04% optimize.opt_a.virtual_output : 0.000012s : 0.04% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000016s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000020s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000021s : 0.07% optimize.opt_a.a_after_grad : 0.000019s : 0.06% optimize.opt_a.renormalize : 0.000548s : 1.83% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.07% optimize.opt_a.cse : 0.000041s : 0.14% optimize.opt_a.a_3 : 0.000082s : 0.27% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000034s : 0.11% optimize.convert_after_rewriter : 0.000007s : 0.02% optimize.order_py_execute_after_rewriter : 0.000005s : 0.02% optimize.mutable_eliminate : 0.000485s : 1.62% optimize.opt_b.b_1 : 0.000122s : 0.41% optimize.opt_b.b_2 : 0.000008s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000017s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.05% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000023s : 0.08% optimize.loop_unroll : 0.000414s : 1.38% optimize.opt_after_cconv.c_1 : 0.000031s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000016s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.04% optimize.tuple_transform.d_1 : 0.000043s : 0.14% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000045s : 0.15% optimize.cse_after_recomputation.cse : 0.000010s : 0.03% optimize.environ_conv : 0.000005s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.02% optimize.bias_add_comm_swap : 0.000002s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000012s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000017s : 0.06% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000017s : 0.06% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000448s : 1.50% validate : 0.000036s : 0.12% Time group info: ------[substitution.] 0.000164 28 1.13% : 0.000002s : 2: substitution.elim_not_effective 0.95% : 0.000002s : 2: substitution.fold_const_symbol 3.53% : 0.000006s : 4: substitution.graph_param_transform 77.81% : 0.000128s : 4: substitution.inline 1.97% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.98% : 0.000005s : 4: substitution.remove_not_recompute_node 2.45% : 0.000004s : 4: substitution.replace_old_param 9.19% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.025158 2 97.26% : 0.024468s : 1: type_inference.infer 2.74% : 0.000690s : 1: type_inference.specialize ------[replace.] 0.000058 8 61.27% : 0.000035s : 4: replace.inline 38.73% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000138 8 90.47% : 0.000125s : 4: match.inline 9.53% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000196 1278 0.93% : 0.000002s : 13: predicate.accumulaten_eliminater 0.78% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 0.99% : 0.000002s : 13: predicate.addn_zero_filter 0.86% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.00% : 0.000004s : 21: predicate.arithmetic_simplify 0.95% : 0.000002s : 13: predicate.cast_eliminate 0.61% : 0.000001s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.60% : 0.000001s : 8: predicate.depend_value_elim 0.94% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.06% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.97% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.93% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 4: predicate.elim_not_effective 0.45% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.11% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_depend_swap 1.74% : 0.000003s : 25: predicate.environ_get_eliminate 1.16% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.55% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.26% : 0.000004s : 21: predicate.float_depend_g_call 0.62% : 0.000001s : 8: predicate.float_environ_get_switch 0.81% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.69% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.62% : 0.000001s : 8: predicate.incorporate_call 0.54% : 0.000001s : 8: predicate.incorporate_call_switch 6.24% : 0.000012s : 58: predicate.inline 0.79% : 0.000002s : 8: predicate.inline_without_move 0.35% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.86% : 0.000002s : 8: predicate.less_batch_normalization 1.87% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.66% : 0.000005s : 38: predicate.load_eliminater 0.83% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.49% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.62% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 8: predicate.merge_addn 0.61% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 13: predicate.minmaximum_grad 0.90% : 0.000002s : 4: predicate.mutable_eliminate 0.37% : 0.000001s : 4: predicate.opt_reshape 0.44% : 0.000001s : 4: predicate.parallel_virtual_node 1.82% : 0.000004s : 21: predicate.partial_defer_inline 1.69% : 0.000003s : 21: predicate.partial_eliminate 0.93% : 0.000002s : 13: predicate.print_const_string_wrapper 0.58% : 0.000001s : 8: predicate.reduce_all_const_elim 1.14% : 0.000002s : 13: predicate.reduce_eliminate 2.52% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 8: predicate.remove_not_recompute_node 1.41% : 0.000003s : 25: predicate.replace_applicator 0.54% : 0.000001s : 8: predicate.replace_old_param 0.27% : 0.000001s : 4: predicate.reset_defer_inline 1.08% : 0.000002s : 13: predicate.reshape_eliminate 0.66% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 4: predicate.row_tensor_eliminate 0.70% : 0.000001s : 8: predicate.same_eliminate 0.47% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.73% : 0.000001s : 8: predicate.shard_identity_eliminate 0.68% : 0.000001s : 8: predicate.special_op_eliminate 0.84% : 0.000002s : 8: predicate.specialize_transform 0.81% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.83% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.58% : 0.000003s : 21: predicate.switch_defer_inline 2.14% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.02% : 0.000010s : 67: predicate.switch_simplify 0.95% : 0.000002s : 13: predicate.tile_eliminate 0.98% : 0.000002s : 13: predicate.transpose_eliminate 1.62% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.10% : 0.000006s : 33: predicate.tuple_list_get_item_eliminator 1.53% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.68% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.46% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.08% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 4: predicate.value_based_eliminate 0.68% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.66% : 0.000001s : 8: predicate.virtual_output_eliminate 0.30% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000537 11 56.56% : 0.000304s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.44% : 0.000233s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.043522 192 0.01% : 0.000004s : 1: ForceFp32Comm 7.33% : 0.003188s : 1: add_attr 7.30% : 0.003179s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.11% : 0.000049s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.16% : 0.000069s : 1: auto_monad 0.05% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 1.08% : 0.000470s : 1: bootstrap 0.06% : 0.000027s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000015s : 1: control_data_broadcast_order 0.02% : 0.000010s : 1: convert_after_rewriter 0.05% : 0.000023s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000008s : 1: environ_conv 0.06% : 0.000025s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 0.97% : 0.000422s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.13% : 0.000493s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000014s : 1: opt.transform.mutable_eliminate 2.63% : 0.001145s : 78: opt.transform.opt_a 0.07% : 0.000029s : 1: opt.transform.opt_after_cconv 0.05% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.23% : 0.000099s : 28: opt.transform.opt_b 0.11% : 0.000048s : 2: opt.transform.opt_trans_graph 0.08% : 0.000035s : 4: opt.transform.symbol_engine_opt 5.75% : 0.002503s : 1: opt_a 0.24% : 0.000103s : 1: opt_after_cconv 1.05% : 0.000456s : 1: opt_after_jit_grad 0.46% : 0.000201s : 1: opt_b 10.20% : 0.004439s : 1: optimize 0.04% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000008s : 1: order_py_execute_after_rewriter 0.05% : 0.000020s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000036s : 1: pre_auto_parallel 0.07% : 0.000030s : 1: py_interpret_to_execute 0.03% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000017s : 1: remove_dup_value 0.64% : 0.000280s : 1: renormalize.infer 0.60% : 0.000261s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000038s : 1: rewriter_after_opt_a 0.19% : 0.000083s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.18% : 0.000077s : 1: symbol_engine_optimizer 0.18% : 0.000077s : 1: tuple_transform 57.99% : 0.025237s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:34.574.974 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:34.575.247 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0607517, [21] [bootstrap]: 0.00039488 [type_inference]: 0.0498252 [event_method]: 1.956e-05 [auto_monad]: 6.274e-05 [graph_reusing]: 6.70002e-06 [inline]: 2.28998e-06 [add_attr]: 0.00323143, [1] [add_attr_with_inline]: 0.00322183, [1] [Cycle 1]: 7.469e-05, [2] [tag_attr]: 1.95e-05 [meta_addattr_fg_expand]: 6.04999e-06 [parallel-infer-symbol]: 3.38e-06 [pre_auto_parallel]: 3.32e-05 [insert-virtual-dataset]: 2.27001e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 2.26e-06 [pipeline_split]: 1.54998e-06 [optimize]: 0.00598194, [53] [py_interpret_to_execute]: 3.478e-05 [rewriter_before_opt_a]: 9.099e-05 [opt_a]: 0.00354926, [2] [Cycle 1]: 0.0025974, [45] [expand_dump_flag]: 3.11001e-06 [switch_simplify]: 4.293e-05 [loop_unroll]: 3.134e-05 [a_1]: 0.00076488 [with_stream_mark]: 1.787e-05 [recompute_prepare]: 1.141e-05 [updatestate_depend_eliminate]: 4.45e-06 [updatestate_assign_eliminate]: 3.91999e-06 [updatestate_loads_eliminate]: 3.83999e-06 [parameter_eliminate]: 1.96998e-06 [a_2]: 0.00014198 [accelerated_algorithm]: 8.84998e-06 [shard]: 2.36e-06 [meta_shard_fg_expand]: 2.01e-06 [shard_inline]: 7.65e-06 [merge_send_recv]: 0.00011855 [auto_parallel]: 1.027e-05 [parallel]: 1.859e-05 [flash_sp]: 1.007e-05 [merge_comm]: 5.11002e-06 [allreduce_fusion]: 4.42e-06 [matmul_add_comm_reduction]: 1.027e-05 [allreduce_slice_to_reducescatter]: 1.07998e-06 [virtual_shard_identity]: 1.226e-05 [virtual_dataset]: 1.315e-05 [get_grad_eliminate_]: 8.47e-06 [virtual_output]: 7.83999e-06 [merge_forward]: 5.24e-06 [cell_reuse_recompute_pass]: 1.47999e-06 [offload_activation]: 1.159e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.058e-05 [merge_recompute_call_nodes]: 1.66e-06 [before_grad]: 1.281e-05 [set_forward_comm_id_for_comm_node_pass]: 5.35001e-06 [meta_fg_expand]: 3.63e-06 [flash_sp_send_recv_attached]: 2.53e-06 [receive_attached]: 2.22999e-06 [after_resolve]: 1.333e-05 [a_after_grad]: 1.251e-05 [renormalize]: 0.00070015 [add_forward_monad_depend]: 5.72999e-06 [auto_monad_grad]: 2.32999e-06 [auto_monad_eliminator]: 1.728e-05 [cse]: 3.648e-05 [a_3]: 7.147e-05 [Cycle 2]: 0.00093868, [45] [expand_dump_flag]: 1.27e-06 [switch_simplify]: 9.05001e-06 [loop_unroll]: 7.55e-06 [a_1]: 0.0001781 [with_stream_mark]: 1.307e-05 [recompute_prepare]: 7.73999e-06 [updatestate_depend_eliminate]: 4.42998e-06 [updatestate_assign_eliminate]: 2.89999e-06 [updatestate_loads_eliminate]: 2.98998e-06 [parameter_eliminate]: 1.22999e-06 [a_2]: 0.0001192 [accelerated_algorithm]: 7.65e-06 [shard]: 1.30999e-06 [meta_shard_fg_expand]: 1.64e-06 [shard_inline]: 7.56999e-06 [merge_send_recv]: 6.22001e-06 [auto_parallel]: 6.94001e-06 [parallel]: 4.74998e-06 [flash_sp]: 3.51001e-06 [merge_comm]: 4.51002e-06 [allreduce_fusion]: 3.64002e-06 [matmul_add_comm_reduction]: 6.79999e-06 [allreduce_slice_to_reducescatter]: 3.19997e-07 [virtual_shard_identity]: 8.46002e-06 [virtual_dataset]: 7.82998e-06 [get_grad_eliminate_]: 7.14001e-06 [virtual_output]: 7.2e-06 [merge_forward]: 3.97e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 8.58001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.714e-05 [merge_recompute_call_nodes]: 1.07e-06 [before_grad]: 1.103e-05 [set_forward_comm_id_for_comm_node_pass]: 4.42e-06 [meta_fg_expand]: 3.06001e-06 [flash_sp_send_recv_attached]: 1.17e-06 [receive_attached]: 1.00001e-06 [after_resolve]: 1.245e-05 [a_after_grad]: 1.102e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.21e-06 [auto_monad_grad]: 1.27e-06 [auto_monad_eliminator]: 9.32001e-06 [cse]: 2.015e-05 [a_3]: 5.866e-05 [py_interpret_to_execute_after_opt_a]: 1.503e-05 [slice_cell_reuse_recomputed_activation]: 4.93001e-06 [rewriter_after_opt_a]: 4.9e-05 [convert_after_rewriter]: 1.121e-05 [order_py_execute_after_rewriter]: 9.03002e-06 [mutable_eliminate]: 0.00053067 [opt_b]: 0.00031056, [1] [Cycle 1]: 0.00030105, [7] [b_1]: 0.00019608 [b_2]: 9.70002e-06 [updatestate_depend_eliminate]: 7.34002e-06 [updatestate_assign_eliminate]: 3.06001e-06 [updatestate_loads_eliminate]: 3.13e-06 [renormalize]: 5.49975e-07 [cse]: 2.328e-05 [optimize_parallel_all_gather_comm]: 2.18e-05 [overlap_param_gather]: 4.85999e-06 [cconv]: 2.937e-05 [loop_unroll]: 0.0004429 [opt_after_cconv]: 0.00015732, [1] [Cycle 1]: 0.00013325, [7] [c_1]: 3.776e-05 [parameter_eliminate]: 3.10002e-06 [updatestate_depend_eliminate]: 7.18998e-06 [updatestate_assign_eliminate]: 3.22997e-06 [updatestate_loads_eliminate]: 3.03998e-06 [cse]: 2.18e-05 [renormalize]: 6.29982e-07 [remove_dup_value]: 1.86e-05 [tuple_transform]: 0.00010251, [1] [Cycle 1]: 9.542e-05, [4] [d_1]: 5.47e-05 [none_parameter_eliminate]: 1.63002e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 8.08001e-06 [partial_unused_args_eliminate]: 4.37e-06 [add_recomputation]: 6.053e-05 [cse_after_recomputation]: 3.095e-05, [1] [Cycle 1]: 2.416e-05, [1] [cse]: 1.495e-05 [environ_conv]: 9.47001e-06 [swap_dp_allreduce_reducescatter]: 8.87999e-06 [bias_add_comm_swap]: 5.05001e-06 [label_micro_interleaved_index]: 6.83e-06 [label_fine_grained_interleaved_index]: 5.69999e-06 [merge_cast_opt]: 3.63999e-06 [slice_recompute_activation]: 4.35e-06 [micro_interleaved_order_control]: 4.87998e-06 [assign_add_opt]: 4.02e-06 [ForceFp32Comm]: 3.18e-06 [remove_cast_before_assign_add]: 3.47002e-06 [full_micro_interleaved_order_control]: 4.70999e-06 [reorder_send_recv_between_fp_bp]: 5.79e-06 [comm_op_add_attrs]: 3.39001e-06 [add_comm_op_reuse_tag]: 3.23e-06 [interleave_split_concat_branches]: 3.58e-06 [interleave_parallel_branches]: 3.45e-06 [overlap_opt_shard_in_pipeline]: 3.41001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.48999e-06 [control_data_broadcast_order]: 1.779e-05 [grouped_pairwise_exchange_alltoall]: 4.15999e-06 [offloading_packed_experts]: 6.83998e-06 [overlap_recompute_and_grad_model_parallel]: 7.73001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.95998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.68e-06 [overlap_recompute_comm]: 4.53999e-06 [overlap_grad_ring_attention]: 7.08e-06 [overlap_grad_flash_sp]: 2.565e-05 [begin_end_overlap_inline]: 3.26001e-06 [split_matmul_comm_elemetwise]: 5.07e-06 [split_layernorm_comm]: 3.98999e-06 [handle_group_info]: 3.41999e-06 [symbol_engine_optimizer]: 0.00010321, [1] [Cycle 1]: 9.61e-05, [6] [build]: 3.23e-06 [elim_shapecalc]: 1.179e-05 [elim_not_effective]: 1.536e-05 [opt_reshape]: 7.95e-06 [fold_const_symbol]: 1.181e-05 [renormalize]: 2.3999e-07 [detach_backward]: 4.04002e-06 [pipeline_parallel_scheduler]: 2.30002e-06 [auto_monad_reorder]: 2.403e-05 [get_jit_bprop_graph]: 1.59998e-06 [rewriter_after_jit_bprop_graph]: 4.00998e-06 [opt_after_jit_grad]: 0.00049801 [validate]: 4.208e-05 Sums bootstrap : 0.000395s : 0.71% type_inference : 0.049825s : 89.42% event_method : 0.000020s : 0.04% auto_monad : 0.000063s : 0.11% graph_reusing : 0.000007s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000033s : 0.06% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000035s : 0.06% optimize.rewriter_before_opt_a : 0.000091s : 0.16% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000052s : 0.09% optimize.opt_a.loop_unroll : 0.000039s : 0.07% optimize.opt_a.a_1 : 0.000943s : 1.69% optimize.opt_a.with_stream_mark : 0.000031s : 0.06% optimize.opt_a.recompute_prepare : 0.000019s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000261s : 0.47% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.03% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000015s : 0.03% optimize.opt_a.merge_send_recv : 0.000125s : 0.22% optimize.opt_a.auto_parallel : 0.000017s : 0.03% optimize.opt_a.parallel : 0.000023s : 0.04% optimize.opt_a.flash_sp : 0.000014s : 0.02% optimize.opt_a.merge_comm : 0.000010s : 0.02% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.04% optimize.opt_a.virtual_dataset : 0.000021s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.03% optimize.opt_a.virtual_output : 0.000015s : 0.03% optimize.opt_a.merge_forward : 0.000009s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000020s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000024s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.02% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000026s : 0.05% optimize.opt_a.a_after_grad : 0.000024s : 0.04% optimize.opt_a.renormalize : 0.000700s : 1.26% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.05% optimize.opt_a.cse : 0.000057s : 0.10% optimize.opt_a.a_3 : 0.000130s : 0.23% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000049s : 0.09% optimize.convert_after_rewriter : 0.000011s : 0.02% optimize.order_py_execute_after_rewriter : 0.000009s : 0.02% optimize.mutable_eliminate : 0.000531s : 0.95% optimize.opt_b.b_1 : 0.000196s : 0.35% optimize.opt_b.b_2 : 0.000010s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000023s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.04% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000029s : 0.05% optimize.loop_unroll : 0.000443s : 0.79% optimize.opt_after_cconv.c_1 : 0.000038s : 0.07% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000022s : 0.04% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.03% optimize.tuple_transform.d_1 : 0.000055s : 0.10% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000004s : 0.01% optimize.add_recomputation : 0.000061s : 0.11% optimize.cse_after_recomputation.cse : 0.000015s : 0.03% optimize.environ_conv : 0.000009s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.02% optimize.bias_add_comm_swap : 0.000005s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000004s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000018s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000026s : 0.05% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000024s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000498s : 0.89% validate : 0.000042s : 0.08% Time group info: ------[substitution.] 0.000209 38 10.66% : 0.000022s : 3: substitution.cast_eliminate 1.04% : 0.000002s : 3: substitution.elim_not_effective 0.82% : 0.000002s : 3: substitution.fold_const_symbol 3.24% : 0.000007s : 5: substitution.graph_param_transform 69.87% : 0.000146s : 4: substitution.inline 1.87% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.86% : 0.000006s : 6: substitution.remove_not_recompute_node 2.47% : 0.000005s : 4: substitution.replace_old_param 7.18% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.049778 2 98.59% : 0.049074s : 1: type_inference.infer 1.41% : 0.000703s : 1: type_inference.specialize ------[replace.] 0.000067 8 54.72% : 0.000037s : 4: replace.inline 45.28% : 0.000030s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000156 8 91.67% : 0.000143s : 4: match.inline 8.33% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000256 1596 0.92% : 0.000002s : 17: predicate.accumulaten_eliminater 0.82% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 10: predicate.addn_check_dump 0.91% : 0.000002s : 17: predicate.addn_zero_filter 0.91% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.29% : 0.000006s : 27: predicate.arithmetic_simplify 1.11% : 0.000003s : 17: predicate.cast_eliminate 0.57% : 0.000001s : 10: predicate.check_bprop_eliminate 0.57% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.62% : 0.000002s : 10: predicate.depend_value_elim 1.00% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.02% : 0.000003s : 17: predicate.dict_get_item_eliminator 1.01% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.84% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 5: predicate.elim_not_effective 0.36% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_depend_swap 1.83% : 0.000005s : 32: predicate.environ_get_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.40% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.24% : 0.000006s : 25: predicate.float_depend_g_call 0.55% : 0.000001s : 10: predicate.float_environ_get_switch 0.82% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.81% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.65% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.24% : 0.000016s : 72: predicate.inline 0.77% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 10: predicate.less_batch_normalization 1.84% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.63% : 0.000007s : 48: predicate.load_eliminater 0.88% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.96% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.60% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.59% : 0.000002s : 10: predicate.merge_addn 0.55% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.62% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.90% : 0.000002s : 17: predicate.minmaximum_grad 0.75% : 0.000002s : 5: predicate.mutable_eliminate 0.32% : 0.000001s : 5: predicate.opt_reshape 0.38% : 0.000001s : 5: predicate.parallel_virtual_node 1.87% : 0.000005s : 25: predicate.partial_defer_inline 1.74% : 0.000004s : 26: predicate.partial_eliminate 1.00% : 0.000003s : 17: predicate.print_const_string_wrapper 0.57% : 0.000001s : 10: predicate.reduce_all_const_elim 1.29% : 0.000003s : 17: predicate.reduce_eliminate 2.56% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 10: predicate.remove_not_recompute_node 1.37% : 0.000003s : 31: predicate.replace_applicator 0.41% : 0.000001s : 10: predicate.replace_old_param 0.24% : 0.000001s : 5: predicate.reset_defer_inline 1.03% : 0.000003s : 17: predicate.reshape_eliminate 0.65% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 5: predicate.row_tensor_eliminate 0.71% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.65% : 0.000002s : 10: predicate.shard_identity_eliminate 0.75% : 0.000002s : 10: predicate.special_op_eliminate 0.73% : 0.000002s : 10: predicate.specialize_transform 0.94% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.49% : 0.000004s : 25: predicate.switch_defer_inline 2.08% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.62% : 0.000012s : 76: predicate.switch_simplify 0.96% : 0.000002s : 17: predicate.tile_eliminate 1.00% : 0.000003s : 17: predicate.transpose_eliminate 1.67% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.57% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.39% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.61% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.20% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.77% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.52% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.27% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.62% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.60% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000528 11 53.66% : 0.000283s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.34% : 0.000244s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.072337 192 0.01% : 0.000006s : 1: ForceFp32Comm 4.48% : 0.003242s : 1: add_attr 4.46% : 0.003226s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.09% : 0.000064s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.10% : 0.000072s : 1: auto_monad 0.04% : 0.000032s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.60% : 0.000433s : 1: bootstrap 0.05% : 0.000033s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.03% : 0.000021s : 1: control_data_broadcast_order 0.02% : 0.000014s : 1: convert_after_rewriter 0.05% : 0.000034s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000019s : 1: detach_backward 0.02% : 0.000012s : 1: environ_conv 0.04% : 0.000029s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000008s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.62% : 0.000449s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 0.74% : 0.000537s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000016s : 1: opt.transform.mutable_eliminate 2.03% : 0.001472s : 78: opt.transform.opt_a 0.05% : 0.000036s : 1: opt.transform.opt_after_cconv 0.04% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.18% : 0.000133s : 28: opt.transform.opt_b 0.08% : 0.000061s : 2: opt.transform.opt_trans_graph 0.06% : 0.000043s : 4: opt.transform.symbol_engine_opt 4.91% : 0.003553s : 1: opt_a 0.22% : 0.000161s : 1: opt_after_cconv 0.70% : 0.000509s : 1: opt_after_jit_grad 0.43% : 0.000314s : 1: opt_b 8.78% : 0.006350s : 1: optimize 0.03% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.04% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.06% : 0.000041s : 1: pre_auto_parallel 0.05% : 0.000039s : 1: py_interpret_to_execute 0.03% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000022s : 1: remove_dup_value 0.53% : 0.000380s : 1: renormalize.infer 0.43% : 0.000312s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000053s : 1: rewriter_after_opt_a 0.13% : 0.000095s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000106s : 1: symbol_engine_optimizer 0.15% : 0.000105s : 1: tuple_transform 68.93% : 0.049863s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:34.987.838 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0276015, [21] [bootstrap]: 0.00040954 [type_inference]: 0.00567008 [event_method]: 1.871e-05 [auto_monad]: 6.284e-05 [graph_reusing]: 5.67999e-06 [inline]: 2.61e-06 [add_attr]: 0.00302896, [1] [add_attr_with_inline]: 0.00302049, [1] [Cycle 1]: 5.327e-05, [2] [tag_attr]: 1.918e-05 [meta_addattr_fg_expand]: 5.98002e-06 [parallel-infer-symbol]: 3.46001e-06 [pre_auto_parallel]: 3.342e-05 [insert-virtual-dataset]: 2.56998e-06 [parallel-infer-symbol-second]: 8.70001e-07 [dataset_repeat_opt]: 1.97999e-06 [pipeline_split]: 1.66998e-06 [optimize]: 0.0176135, [53] [py_interpret_to_execute]: 2.531e-05 [rewriter_before_opt_a]: 8.187e-05 [opt_a]: 0.0152644, [2] [Cycle 1]: 0.0144497, [45] [expand_dump_flag]: 2.81e-06 [switch_simplify]: 4.181e-05 [loop_unroll]: 3.09e-05 [a_1]: 0.00072436 [with_stream_mark]: 1.606e-05 [recompute_prepare]: 9.41e-06 [updatestate_depend_eliminate]: 4.25e-06 [updatestate_assign_eliminate]: 3.75998e-06 [updatestate_loads_eliminate]: 3.66999e-06 [parameter_eliminate]: 1.87001e-06 [a_2]: 9.842e-05 [accelerated_algorithm]: 8.35001e-06 [shard]: 2.16e-06 [meta_shard_fg_expand]: 1.92999e-06 [shard_inline]: 7.96001e-06 [merge_send_recv]: 9.96998e-06 [auto_parallel]: 7.61001e-06 [parallel]: 1.829e-05 [flash_sp]: 9.22001e-06 [merge_comm]: 5.00999e-06 [allreduce_fusion]: 4.55001e-06 [matmul_add_comm_reduction]: 1.039e-05 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 9.96e-06 [virtual_dataset]: 8.32998e-06 [get_grad_eliminate_]: 7.48e-06 [virtual_output]: 7.39002e-06 [merge_forward]: 4.13999e-06 [cell_reuse_recompute_pass]: 1.35999e-06 [offload_activation]: 1.188e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.604e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.416e-05 [set_forward_comm_id_for_comm_node_pass]: 4.47e-06 [meta_fg_expand]: 3.37997e-06 [flash_sp_send_recv_attached]: 2.69001e-06 [receive_attached]: 2.18002e-06 [after_resolve]: 1.282e-05 [a_after_grad]: 1.173e-05 [renormalize]: 0.0129085 [add_forward_monad_depend]: 1.051e-05 [auto_monad_grad]: 2.48e-06 [auto_monad_eliminator]: 2.167e-05 [cse]: 3.716e-05 [a_3]: 6.442e-05 [Cycle 2]: 0.00080334, [45] [expand_dump_flag]: 2.22999e-06 [switch_simplify]: 1.039e-05 [loop_unroll]: 7.88001e-06 [a_1]: 0.00018519 [with_stream_mark]: 1.76e-05 [recompute_prepare]: 7.82e-06 [updatestate_depend_eliminate]: 4.63999e-06 [updatestate_assign_eliminate]: 3.33e-06 [updatestate_loads_eliminate]: 3.5e-06 [parameter_eliminate]: 1.55999e-06 [a_2]: 9.561e-05 [accelerated_algorithm]: 7.71999e-06 [shard]: 2.21998e-06 [meta_shard_fg_expand]: 2.19001e-06 [shard_inline]: 7.41001e-06 [merge_send_recv]: 8.02e-06 [auto_parallel]: 9.25001e-06 [parallel]: 8.49002e-06 [flash_sp]: 3.86001e-06 [merge_comm]: 4.21001e-06 [allreduce_fusion]: 4.02998e-06 [matmul_add_comm_reduction]: 1.02e-05 [allreduce_slice_to_reducescatter]: 8.69972e-07 [virtual_shard_identity]: 8.2e-06 [virtual_dataset]: 7.26999e-06 [get_grad_eliminate_]: 7.27002e-06 [virtual_output]: 7.1e-06 [merge_forward]: 4.12e-06 [cell_reuse_recompute_pass]: 2.40002e-06 [offload_activation]: 1.088e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.519e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.256e-05 [set_forward_comm_id_for_comm_node_pass]: 4.63001e-06 [meta_fg_expand]: 3.14999e-06 [flash_sp_send_recv_attached]: 1.35001e-06 [receive_attached]: 2.04e-06 [after_resolve]: 1.226e-05 [a_after_grad]: 1.109e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.24e-06 [auto_monad_grad]: 9.10019e-07 [auto_monad_eliminator]: 8.14002e-06 [cse]: 1.806e-05 [a_3]: 4.525e-05 [py_interpret_to_execute_after_opt_a]: 1.352e-05 [slice_cell_reuse_recomputed_activation]: 1.94e-06 [rewriter_after_opt_a]: 4.148e-05 [convert_after_rewriter]: 7.55e-06 [order_py_execute_after_rewriter]: 5.79999e-06 [mutable_eliminate]: 0.0006797 [opt_b]: 0.00024169, [1] [Cycle 1]: 0.00023552, [7] [b_1]: 0.00015509 [b_2]: 9.98002e-06 [updatestate_depend_eliminate]: 6.16e-06 [updatestate_assign_eliminate]: 2.86e-06 [updatestate_loads_eliminate]: 2.80002e-06 [renormalize]: 5.69999e-07 [cse]: 2.351e-05 [optimize_parallel_all_gather_comm]: 1.722e-05 [overlap_param_gather]: 2.11e-06 [cconv]: 2.627e-05 [loop_unroll]: 0.00045364 [opt_after_cconv]: 0.00012511, [1] [Cycle 1]: 0.00011923, [7] [c_1]: 3.754e-05 [parameter_eliminate]: 4.10998e-06 [updatestate_depend_eliminate]: 7.12002e-06 [updatestate_assign_eliminate]: 3.43999e-06 [updatestate_loads_eliminate]: 2.96999e-06 [cse]: 2.785e-05 [renormalize]: 5.60016e-07 [remove_dup_value]: 1.586e-05 [tuple_transform]: 8.846e-05, [1] [Cycle 1]: 8.381e-05, [4] [d_1]: 5.424e-05 [none_parameter_eliminate]: 1.69998e-06 [renormalize]: 2.29978e-07 [switch_simplify]: 8.18001e-06 [partial_unused_args_eliminate]: 1.87001e-06 [add_recomputation]: 5.944e-05 [cse_after_recomputation]: 2.884e-05, [1] [Cycle 1]: 2.364e-05, [1] [cse]: 1.594e-05 [environ_conv]: 7.13e-06 [swap_dp_allreduce_reducescatter]: 6.09001e-06 [bias_add_comm_swap]: 2.54999e-06 [label_micro_interleaved_index]: 5.02999e-06 [label_fine_grained_interleaved_index]: 2.68e-06 [merge_cast_opt]: 1.42e-06 [slice_recompute_activation]: 2.34001e-06 [micro_interleaved_order_control]: 1.96e-06 [assign_add_opt]: 1.55001e-06 [ForceFp32Comm]: 1.29998e-06 [remove_cast_before_assign_add]: 1.02e-06 [full_micro_interleaved_order_control]: 1.99999e-06 [reorder_send_recv_between_fp_bp]: 2.58998e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 1.17e-06 [interleave_split_concat_branches]: 1.05999e-06 [interleave_parallel_branches]: 1.06002e-06 [overlap_opt_shard_in_pipeline]: 1.29998e-06 [overlap_opt_shard_grad_in_pipeline]: 1.72999e-06 [control_data_broadcast_order]: 1.556e-05 [grouped_pairwise_exchange_alltoall]: 1.59e-06 [offloading_packed_experts]: 4.52998e-06 [overlap_recompute_and_grad_model_parallel]: 5.52999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.47001e-06 [overlap_recompute_comm]: 2.26998e-06 [overlap_grad_ring_attention]: 4.74002e-06 [overlap_grad_flash_sp]: 2.499e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.41998e-06 [split_layernorm_comm]: 1.85001e-06 [handle_group_info]: 1.02e-06 [symbol_engine_optimizer]: 9.345e-05, [1] [Cycle 1]: 8.906e-05, [6] [build]: 3.71001e-06 [elim_shapecalc]: 1.397e-05 [elim_not_effective]: 1.689e-05 [opt_reshape]: 8.38999e-06 [fold_const_symbol]: 1.319e-05 [renormalize]: 2.50002e-07 [detach_backward]: 2.47001e-06 [pipeline_parallel_scheduler]: 1.86003e-06 [auto_monad_reorder]: 3.868e-05 [get_jit_bprop_graph]: 2.47001e-06 [rewriter_after_jit_bprop_graph]: 5.28002e-06 [opt_after_jit_grad]: 0.00050611 [validate]: 4.555e-05 Sums bootstrap : 0.000410s : 1.74% type_inference : 0.005670s : 24.04% event_method : 0.000019s : 0.08% auto_monad : 0.000063s : 0.27% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000033s : 0.14% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000025s : 0.11% optimize.rewriter_before_opt_a : 0.000082s : 0.35% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000052s : 0.22% optimize.opt_a.loop_unroll : 0.000039s : 0.16% optimize.opt_a.a_1 : 0.000910s : 3.86% optimize.opt_a.with_stream_mark : 0.000034s : 0.14% optimize.opt_a.recompute_prepare : 0.000017s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000194s : 0.82% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.07% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000015s : 0.07% optimize.opt_a.merge_send_recv : 0.000018s : 0.08% optimize.opt_a.auto_parallel : 0.000017s : 0.07% optimize.opt_a.parallel : 0.000027s : 0.11% optimize.opt_a.flash_sp : 0.000013s : 0.06% optimize.opt_a.merge_comm : 0.000009s : 0.04% optimize.opt_a.allreduce_fusion : 0.000009s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.08% optimize.opt_a.virtual_dataset : 0.000016s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.06% optimize.opt_a.virtual_output : 0.000014s : 0.06% optimize.opt_a.merge_forward : 0.000008s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.02% optimize.opt_a.offload_activation : 0.000023s : 0.10% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.13% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.11% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.04% optimize.opt_a.meta_fg_expand : 0.000007s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000025s : 0.11% optimize.opt_a.a_after_grad : 0.000023s : 0.10% optimize.opt_a.renormalize : 0.012909s : 54.72% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.05% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.13% optimize.opt_a.cse : 0.000055s : 0.23% optimize.opt_a.a_3 : 0.000110s : 0.46% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000041s : 0.18% optimize.convert_after_rewriter : 0.000008s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000680s : 2.88% optimize.opt_b.b_1 : 0.000155s : 0.66% optimize.opt_b.b_2 : 0.000010s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000026s : 0.11% optimize.loop_unroll : 0.000454s : 1.92% optimize.opt_after_cconv.c_1 : 0.000038s : 0.16% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000028s : 0.12% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.07% optimize.tuple_transform.d_1 : 0.000054s : 0.23% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000059s : 0.25% optimize.cse_after_recomputation.cse : 0.000016s : 0.07% optimize.environ_conv : 0.000007s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.03% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000025s : 0.11% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.06% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000039s : 0.16% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000506s : 2.15% validate : 0.000046s : 0.19% Time group info: ------[substitution.] 0.000204 38 12.23% : 0.000025s : 3: substitution.cast_eliminate 1.25% : 0.000003s : 3: substitution.elim_not_effective 0.87% : 0.000002s : 3: substitution.fold_const_symbol 3.46% : 0.000007s : 5: substitution.graph_param_transform 66.84% : 0.000136s : 4: substitution.inline 2.70% : 0.000006s : 6: substitution.j_node_and_user_rematch 3.35% : 0.000007s : 6: substitution.remove_not_recompute_node 2.39% : 0.000005s : 4: substitution.replace_old_param 6.92% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005612 2 86.63% : 0.004862s : 1: type_inference.infer 13.37% : 0.000750s : 1: type_inference.specialize ------[replace.] 0.000061 8 59.07% : 0.000036s : 4: replace.inline 40.93% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000146 8 91.64% : 0.000134s : 4: match.inline 8.36% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000259 1596 1.02% : 0.000003s : 17: predicate.accumulaten_eliminater 0.87% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 10: predicate.addn_check_dump 1.02% : 0.000003s : 17: predicate.addn_zero_filter 0.92% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.98% : 0.000005s : 27: predicate.arithmetic_simplify 1.04% : 0.000003s : 17: predicate.cast_eliminate 0.62% : 0.000002s : 10: predicate.check_bprop_eliminate 0.67% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.57% : 0.000001s : 10: predicate.depend_value_elim 0.99% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.14% : 0.000003s : 17: predicate.dict_get_item_eliminator 1.02% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.87% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 5: predicate.elim_not_effective 0.41% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_depend_swap 1.70% : 0.000004s : 32: predicate.environ_get_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.41% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.35% : 0.000006s : 25: predicate.float_depend_g_call 0.52% : 0.000001s : 10: predicate.float_environ_get_switch 0.79% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.62% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000000s : 5: predicate.graph_param_transform 0.58% : 0.000001s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.00% : 0.000016s : 72: predicate.inline 0.70% : 0.000002s : 10: predicate.inline_without_move 0.34% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.77% : 0.000002s : 10: predicate.less_batch_normalization 1.81% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.57% : 0.000007s : 48: predicate.load_eliminater 1.20% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.99% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.68% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 10: predicate.merge_addn 0.73% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.88% : 0.000002s : 17: predicate.minmaximum_grad 0.97% : 0.000003s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.40% : 0.000001s : 5: predicate.parallel_virtual_node 1.66% : 0.000004s : 25: predicate.partial_defer_inline 1.69% : 0.000004s : 26: predicate.partial_eliminate 0.95% : 0.000002s : 17: predicate.print_const_string_wrapper 0.55% : 0.000001s : 10: predicate.reduce_all_const_elim 1.20% : 0.000003s : 17: predicate.reduce_eliminate 2.73% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 10: predicate.remove_not_recompute_node 1.50% : 0.000004s : 31: predicate.replace_applicator 0.56% : 0.000001s : 10: predicate.replace_old_param 0.24% : 0.000001s : 5: predicate.reset_defer_inline 1.01% : 0.000003s : 17: predicate.reshape_eliminate 0.56% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 5: predicate.row_tensor_eliminate 0.75% : 0.000002s : 10: predicate.same_eliminate 0.43% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.91% : 0.000002s : 10: predicate.shard_identity_eliminate 0.70% : 0.000002s : 10: predicate.special_op_eliminate 0.73% : 0.000002s : 10: predicate.specialize_transform 0.98% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.58% : 0.000004s : 25: predicate.switch_defer_inline 2.00% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.53% : 0.000012s : 76: predicate.switch_simplify 0.91% : 0.000002s : 17: predicate.tile_eliminate 0.97% : 0.000003s : 17: predicate.transpose_eliminate 1.63% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.75% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.61% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.29% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.80% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.51% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.16% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 5: predicate.value_based_eliminate 0.66% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.59% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000559 11 50.51% : 0.000282s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.49% : 0.000276s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.062790 192 0.01% : 0.000004s : 1: ForceFp32Comm 4.83% : 0.003034s : 1: add_attr 4.82% : 0.003024s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.10% : 0.000064s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.11% : 0.000069s : 1: auto_monad 0.07% : 0.000044s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.69% : 0.000434s : 1: bootstrap 0.05% : 0.000030s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000019s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.05% : 0.000032s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.04% : 0.000024s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.74% : 0.000463s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.10% : 0.000688s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.03% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000017s : 1: opt.transform.mutable_eliminate 2.28% : 0.001429s : 78: opt.transform.opt_a 0.06% : 0.000036s : 1: opt.transform.opt_after_cconv 0.05% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.21% : 0.000132s : 28: opt.transform.opt_b 0.09% : 0.000059s : 2: opt.transform.opt_trans_graph 0.08% : 0.000048s : 4: opt.transform.symbol_engine_opt 24.32% : 0.015268s : 1: opt_a 0.21% : 0.000129s : 1: opt_after_cconv 0.82% : 0.000515s : 1: opt_after_jit_grad 0.39% : 0.000245s : 1: opt_b 28.06% : 0.017619s : 1: optimize 0.03% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.05% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.06% : 0.000037s : 1: pre_auto_parallel 0.05% : 0.000029s : 1: py_interpret_to_execute 0.03% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000019s : 1: remove_dup_value 19.87% : 0.012479s : 1: renormalize.infer 0.67% : 0.000420s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000046s : 1: rewriter_after_opt_a 0.14% : 0.000086s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000097s : 1: symbol_engine_optimizer 0.15% : 0.000091s : 1: tuple_transform 9.05% : 0.005685s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:35.371.242 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:35.371.513 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0281979, [21] [bootstrap]: 0.00039519 [type_inference]: 0.00547803 [event_method]: 1.832e-05 [auto_monad]: 6.068e-05 [graph_reusing]: 5.88002e-06 [inline]: 2.28998e-06 [add_attr]: 0.00297657, [1] [add_attr_with_inline]: 0.00296909, [1] [Cycle 1]: 6.28e-05, [2] [tag_attr]: 1.827e-05 [meta_addattr_fg_expand]: 5.91e-06 [parallel-infer-symbol]: 3.04999e-06 [pre_auto_parallel]: 3.119e-05 [insert-virtual-dataset]: 2.68e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 1.84e-06 [pipeline_split]: 1.97001e-06 [optimize]: 0.0179932, [53] [py_interpret_to_execute]: 2.633e-05 [rewriter_before_opt_a]: 8.471e-05 [opt_a]: 0.0155135, [2] [Cycle 1]: 0.0145363, [45] [expand_dump_flag]: 2.76999e-06 [switch_simplify]: 4.26e-05 [loop_unroll]: 3.148e-05 [a_1]: 0.00065095 [with_stream_mark]: 1.41e-05 [recompute_prepare]: 9.76e-06 [updatestate_depend_eliminate]: 4.68999e-06 [updatestate_assign_eliminate]: 4.13001e-06 [updatestate_loads_eliminate]: 3.74002e-06 [parameter_eliminate]: 1.86e-06 [a_2]: 0.0001298 [accelerated_algorithm]: 8.56002e-06 [shard]: 1.67999e-06 [meta_shard_fg_expand]: 1.94e-06 [shard_inline]: 7.82998e-06 [merge_send_recv]: 8.90999e-06 [auto_parallel]: 6.56999e-06 [parallel]: 1.835e-05 [flash_sp]: 7.88999e-06 [merge_comm]: 4.82e-06 [allreduce_fusion]: 4.34997e-06 [matmul_add_comm_reduction]: 9.86e-06 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 9.09e-06 [virtual_dataset]: 8.02e-06 [get_grad_eliminate_]: 7.48e-06 [virtual_output]: 7.9e-06 [merge_forward]: 4.32e-06 [cell_reuse_recompute_pass]: 1.18001e-06 [offload_activation]: 1.021e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.703e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.382e-05 [set_forward_comm_id_for_comm_node_pass]: 4.65999e-06 [meta_fg_expand]: 3.50998e-06 [flash_sp_send_recv_attached]: 3.06999e-06 [receive_attached]: 3.67998e-06 [after_resolve]: 2.97e-05 [a_after_grad]: 1.561e-05 [renormalize]: 0.00069233 [add_forward_monad_depend]: 6.11e-06 [auto_monad_grad]: 2.57001e-06 [auto_monad_eliminator]: 1.708e-05 [cse]: 3.505e-05 [a_3]: 7.208e-05 [Cycle 2]: 0.00096273, [45] [expand_dump_flag]: 1.38002e-06 [switch_simplify]: 9.24998e-06 [loop_unroll]: 7.60998e-06 [a_1]: 0.00017713 [with_stream_mark]: 1.409e-05 [recompute_prepare]: 8.08999e-06 [updatestate_depend_eliminate]: 3.93999e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 3.56001e-06 [parameter_eliminate]: 1.19998e-06 [a_2]: 0.00012189 [accelerated_algorithm]: 7.87e-06 [shard]: 1.63002e-06 [meta_shard_fg_expand]: 2.00002e-06 [shard_inline]: 7.76001e-06 [merge_send_recv]: 7.45e-06 [auto_parallel]: 8e-06 [parallel]: 6.16998e-06 [flash_sp]: 4.78001e-06 [merge_comm]: 4.48999e-06 [allreduce_fusion]: 4.28999e-06 [matmul_add_comm_reduction]: 8.44002e-06 [allreduce_slice_to_reducescatter]: 7.29982e-07 [virtual_shard_identity]: 9.38997e-06 [virtual_dataset]: 7.92e-06 [get_grad_eliminate_]: 7.29001e-06 [virtual_output]: 7.45e-06 [merge_forward]: 3.8e-06 [cell_reuse_recompute_pass]: 1.89999e-06 [offload_activation]: 9.34e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.705e-05 [merge_recompute_call_nodes]: 1.52999e-06 [before_grad]: 1.198e-05 [set_forward_comm_id_for_comm_node_pass]: 4.4e-06 [meta_fg_expand]: 3.3e-06 [flash_sp_send_recv_attached]: 1.24e-06 [receive_attached]: 9.70002e-07 [after_resolve]: 1.292e-05 [a_after_grad]: 1.2e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.72001e-06 [auto_monad_grad]: 9.30013e-07 [auto_monad_eliminator]: 8.95999e-06 [cse]: 1.764e-05 [a_3]: 5.949e-05 [py_interpret_to_execute_after_opt_a]: 1.475e-05 [slice_cell_reuse_recomputed_activation]: 4.86002e-06 [rewriter_after_opt_a]: 4.395e-05 [convert_after_rewriter]: 1.061e-05 [order_py_execute_after_rewriter]: 8.63001e-06 [mutable_eliminate]: 0.00056564 [opt_b]: 0.0003162, [1] [Cycle 1]: 0.00030651, [7] [b_1]: 0.00020178 [b_2]: 1.1e-05 [updatestate_depend_eliminate]: 6.24001e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 3.09001e-06 [renormalize]: 4.09986e-07 [cse]: 2.166e-05 [optimize_parallel_all_gather_comm]: 2.187e-05 [overlap_param_gather]: 4.97e-06 [cconv]: 2.844e-05 [loop_unroll]: 0.00046084 [opt_after_cconv]: 0.0001373, [1] [Cycle 1]: 0.00012889, [7] [c_1]: 3.682e-05 [parameter_eliminate]: 2.74001e-06 [updatestate_depend_eliminate]: 5.82999e-06 [updatestate_assign_eliminate]: 3.19001e-06 [updatestate_loads_eliminate]: 2.96001e-06 [cse]: 2.173e-05 [renormalize]: 4.10015e-07 [remove_dup_value]: 1.844e-05 [tuple_transform]: 0.00011763, [1] [Cycle 1]: 0.00010997, [4] [d_1]: 5.222e-05 [none_parameter_eliminate]: 1.67999e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 9.10001e-06 [partial_unused_args_eliminate]: 5.00001e-06 [add_recomputation]: 5.781e-05 [cse_after_recomputation]: 3.48e-05, [1] [Cycle 1]: 2.7e-05, [1] [cse]: 1.716e-05 [environ_conv]: 9.37999e-06 [swap_dp_allreduce_reducescatter]: 9.53002e-06 [bias_add_comm_swap]: 5.67001e-06 [label_micro_interleaved_index]: 7.46001e-06 [label_fine_grained_interleaved_index]: 5.51998e-06 [merge_cast_opt]: 4.02e-06 [slice_recompute_activation]: 4.57e-06 [micro_interleaved_order_control]: 4.63999e-06 [assign_add_opt]: 4.03001e-06 [ForceFp32Comm]: 3.48e-06 [remove_cast_before_assign_add]: 3.53999e-06 [full_micro_interleaved_order_control]: 4.58001e-06 [reorder_send_recv_between_fp_bp]: 5.76998e-06 [comm_op_add_attrs]: 3.58e-06 [add_comm_op_reuse_tag]: 3.57997e-06 [interleave_split_concat_branches]: 3.8e-06 [interleave_parallel_branches]: 3.55e-06 [overlap_opt_shard_in_pipeline]: 3.95e-06 [overlap_opt_shard_grad_in_pipeline]: 4.67998e-06 [control_data_broadcast_order]: 1.921e-05 [grouped_pairwise_exchange_alltoall]: 4.03999e-06 [offloading_packed_experts]: 7.47998e-06 [overlap_recompute_and_grad_model_parallel]: 7.50998e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.07998e-06 [overlap_recompute_allgather_and_fa_grad]: 4.13999e-06 [overlap_recompute_comm]: 4.68001e-06 [overlap_grad_ring_attention]: 7.54002e-06 [overlap_grad_flash_sp]: 2.576e-05 [begin_end_overlap_inline]: 3.03e-06 [split_matmul_comm_elemetwise]: 4.77e-06 [split_layernorm_comm]: 4.22e-06 [handle_group_info]: 3.45998e-06 [symbol_engine_optimizer]: 0.00010882, [1] [Cycle 1]: 0.0001012, [6] [build]: 3.48999e-06 [elim_shapecalc]: 1.283e-05 [elim_not_effective]: 1.591e-05 [opt_reshape]: 8.77999e-06 [fold_const_symbol]: 1.24e-05 [renormalize]: 2.10013e-07 [detach_backward]: 3.51001e-06 [pipeline_parallel_scheduler]: 1.92999e-06 [auto_monad_reorder]: 2.302e-05 [get_jit_bprop_graph]: 1.35001e-06 [rewriter_after_jit_bprop_graph]: 3.71999e-06 [opt_after_jit_grad]: 0.00051401 [validate]: 4.135e-05 Sums bootstrap : 0.000395s : 3.53% type_inference : 0.005478s : 48.99% event_method : 0.000018s : 0.16% auto_monad : 0.000061s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000031s : 0.28% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000026s : 0.24% optimize.rewriter_before_opt_a : 0.000085s : 0.76% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000052s : 0.46% optimize.opt_a.loop_unroll : 0.000039s : 0.35% optimize.opt_a.a_1 : 0.000828s : 7.41% optimize.opt_a.with_stream_mark : 0.000028s : 0.25% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000252s : 2.25% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.14% optimize.opt_a.merge_send_recv : 0.000016s : 0.15% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000025s : 0.22% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.17% optimize.opt_a.virtual_dataset : 0.000016s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000026s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000043s : 0.38% optimize.opt_a.a_after_grad : 0.000028s : 0.25% optimize.opt_a.renormalize : 0.000692s : 6.19% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.23% optimize.opt_a.cse : 0.000053s : 0.47% optimize.opt_a.a_3 : 0.000132s : 1.18% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000044s : 0.39% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000566s : 5.06% optimize.opt_b.b_1 : 0.000202s : 1.80% optimize.opt_b.b_2 : 0.000011s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000028s : 0.25% optimize.loop_unroll : 0.000461s : 4.12% optimize.opt_after_cconv.c_1 : 0.000037s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.16% optimize.tuple_transform.d_1 : 0.000052s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000058s : 0.52% optimize.cse_after_recomputation.cse : 0.000017s : 0.15% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.09% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000019s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.07% optimize.overlap_grad_flash_sp : 0.000026s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.21% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000514s : 4.60% validate : 0.000041s : 0.37% Time group info: ------[substitution.] 0.000190 38 12.48% : 0.000024s : 3: substitution.cast_eliminate 1.13% : 0.000002s : 3: substitution.elim_not_effective 0.88% : 0.000002s : 3: substitution.fold_const_symbol 3.61% : 0.000007s : 5: substitution.graph_param_transform 67.14% : 0.000128s : 4: substitution.inline 2.33% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.36% : 0.000006s : 6: substitution.remove_not_recompute_node 2.67% : 0.000005s : 4: substitution.replace_old_param 6.41% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005435 2 87.81% : 0.004772s : 1: type_inference.infer 12.19% : 0.000662s : 1: type_inference.specialize ------[replace.] 0.000058 8 62.15% : 0.000036s : 4: replace.inline 37.85% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000135 8 92.40% : 0.000125s : 4: match.inline 7.60% : 0.000010s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000242 1504 0.85% : 0.000002s : 15: predicate.accumulaten_eliminater 0.73% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.66% : 0.000002s : 10: predicate.addn_check_dump 0.91% : 0.000002s : 15: predicate.addn_zero_filter 0.79% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.90% : 0.000005s : 25: predicate.arithmetic_simplify 0.98% : 0.000002s : 15: predicate.cast_eliminate 0.66% : 0.000002s : 10: predicate.check_bprop_eliminate 0.59% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.64% : 0.000002s : 10: predicate.depend_value_elim 0.92% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.94% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.54% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.06% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 20: predicate.environ_get_depend_swap 1.87% : 0.000005s : 30: predicate.environ_get_eliminate 1.19% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.39% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.29% : 0.000006s : 23: predicate.float_depend_g_call 0.71% : 0.000002s : 10: predicate.float_environ_get_switch 0.86% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.72% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000000s : 5: predicate.graph_param_transform 0.64% : 0.000002s : 10: predicate.incorporate_call 0.55% : 0.000001s : 10: predicate.incorporate_call_switch 6.16% : 0.000015s : 68: predicate.inline 1.27% : 0.000003s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.91% : 0.000002s : 10: predicate.less_batch_normalization 1.94% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.43% : 0.000006s : 44: predicate.load_eliminater 0.75% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.17% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.67% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.61% : 0.000001s : 10: predicate.merge_addn 0.61% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.64% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 15: predicate.minmaximum_grad 0.81% : 0.000002s : 5: predicate.mutable_eliminate 0.40% : 0.000001s : 5: predicate.opt_reshape 0.44% : 0.000001s : 5: predicate.parallel_virtual_node 1.63% : 0.000004s : 23: predicate.partial_defer_inline 1.60% : 0.000004s : 24: predicate.partial_eliminate 0.87% : 0.000002s : 15: predicate.print_const_string_wrapper 0.63% : 0.000002s : 10: predicate.reduce_all_const_elim 1.20% : 0.000003s : 15: predicate.reduce_eliminate 2.40% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 10: predicate.remove_not_recompute_node 1.31% : 0.000003s : 29: predicate.replace_applicator 0.74% : 0.000002s : 10: predicate.replace_old_param 0.26% : 0.000001s : 5: predicate.reset_defer_inline 0.91% : 0.000002s : 15: predicate.reshape_eliminate 0.72% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.69% : 0.000002s : 5: predicate.row_tensor_eliminate 0.73% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.87% : 0.000002s : 10: predicate.shard_identity_eliminate 0.73% : 0.000002s : 10: predicate.special_op_eliminate 0.78% : 0.000002s : 10: predicate.specialize_transform 0.94% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 1.08% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.44% : 0.000004s : 23: predicate.switch_defer_inline 1.93% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.80% : 0.000012s : 74: predicate.switch_simplify 0.85% : 0.000002s : 15: predicate.tile_eliminate 0.86% : 0.000002s : 15: predicate.transpose_eliminate 1.56% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.50% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.42% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.72% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.42% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.11% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 5: predicate.value_based_eliminate 0.64% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 10: predicate.virtual_output_eliminate 0.35% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000474 11 52.49% : 0.000249s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.51% : 0.000225s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.051441 192 0.01% : 0.000006s : 1: ForceFp32Comm 5.80% : 0.002985s : 1: add_attr 5.78% : 0.002973s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.12% : 0.000062s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.14% : 0.000070s : 1: auto_monad 0.06% : 0.000030s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000009s : 1: bias_add_comm_swap 0.85% : 0.000437s : 1: bootstrap 0.06% : 0.000032s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.04% : 0.000022s : 1: control_data_broadcast_order 0.03% : 0.000014s : 1: convert_after_rewriter 0.07% : 0.000038s : 1: cse_after_recomputation 0.01% : 0.000007s : 1: dataset_repeat_opt 0.04% : 0.000018s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.05% : 0.000027s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000007s : 1: get_jit_bprop_graph 0.02% : 0.000012s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 0.91% : 0.000466s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 1.11% : 0.000571s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.03% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000016s : 1: opt.transform.mutable_eliminate 2.67% : 0.001371s : 78: opt.transform.opt_a 0.07% : 0.000035s : 1: opt.transform.opt_after_cconv 0.06% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.27% : 0.000139s : 28: opt.transform.opt_b 0.11% : 0.000059s : 2: opt.transform.opt_trans_graph 0.09% : 0.000046s : 4: opt.transform.symbol_engine_opt 30.16% : 0.015517s : 1: opt_a 0.27% : 0.000141s : 1: opt_after_cconv 1.02% : 0.000526s : 1: opt_after_jit_grad 0.62% : 0.000320s : 1: opt_b 35.78% : 0.018403s : 1: optimize 0.05% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.06% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.07% : 0.000038s : 1: pre_auto_parallel 0.06% : 0.000030s : 1: py_interpret_to_execute 0.03% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.04% : 0.000022s : 1: remove_dup_value 0.74% : 0.000379s : 1: renormalize.infer 0.59% : 0.000305s : 1: renormalize.specialize 0.02% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000047s : 1: rewriter_after_opt_a 0.17% : 0.000088s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.22% : 0.000112s : 1: symbol_engine_optimizer 0.23% : 0.000121s : 1: tuple_transform 10.71% : 0.005507s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:35.790.164 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0403288, [21] [bootstrap]: 0.00045895 [type_inference]: 0.00593512 [event_method]: 1.974e-05 [auto_monad]: 6.391e-05 [graph_reusing]: 6.49999e-06 [inline]: 2.64001e-06 [add_attr]: 0.0275445, [1] [add_attr_with_inline]: 0.0275334, [1] [Cycle 1]: 7.447e-05, [2] [tag_attr]: 2.284e-05 [meta_addattr_fg_expand]: 6.07001e-06 [parallel-infer-symbol]: 3.96001e-06 [pre_auto_parallel]: 4.12e-05 [insert-virtual-dataset]: 2.45002e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 1.87999e-06 [pipeline_split]: 2.04999e-06 [optimize]: 0.00553047, [53] [py_interpret_to_execute]: 2.776e-05 [rewriter_before_opt_a]: 9.184e-05 [opt_a]: 0.00319591, [2] [Cycle 1]: 0.00237741, [45] [expand_dump_flag]: 3.01001e-06 [switch_simplify]: 4.349e-05 [loop_unroll]: 3.124e-05 [a_1]: 0.0006941 [with_stream_mark]: 1.937e-05 [recompute_prepare]: 1.087e-05 [updatestate_depend_eliminate]: 4.69998e-06 [updatestate_assign_eliminate]: 3.62002e-06 [updatestate_loads_eliminate]: 3.84002e-06 [parameter_eliminate]: 2.03997e-06 [a_2]: 0.00011554 [accelerated_algorithm]: 9.31e-06 [shard]: 1.93002e-06 [meta_shard_fg_expand]: 2.04e-06 [shard_inline]: 7.95998e-06 [merge_send_recv]: 1.028e-05 [auto_parallel]: 8.14002e-06 [parallel]: 1.995e-05 [flash_sp]: 8.97e-06 [merge_comm]: 5.07e-06 [allreduce_fusion]: 4.61002e-06 [matmul_add_comm_reduction]: 1.141e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 1.064e-05 [virtual_dataset]: 8.20999e-06 [get_grad_eliminate_]: 7.37002e-06 [virtual_output]: 7.68001e-06 [merge_forward]: 4.58001e-06 [cell_reuse_recompute_pass]: 1.14998e-06 [offload_activation]: 1.157e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.712e-05 [merge_recompute_call_nodes]: 1.43002e-06 [before_grad]: 1.373e-05 [set_forward_comm_id_for_comm_node_pass]: 4.68999e-06 [meta_fg_expand]: 3.38e-06 [flash_sp_send_recv_attached]: 3.03998e-06 [receive_attached]: 2.16e-06 [after_resolve]: 1.479e-05 [a_after_grad]: 1.22e-05 [renormalize]: 0.0008435 [add_forward_monad_depend]: 6.13998e-06 [auto_monad_grad]: 2.29999e-06 [auto_monad_eliminator]: 1.813e-05 [cse]: 3.85e-05 [a_3]: 6.142e-05 [Cycle 2]: 0.00080772, [45] [expand_dump_flag]: 1.45001e-06 [switch_simplify]: 9.39998e-06 [loop_unroll]: 7.58001e-06 [a_1]: 0.00018036 [with_stream_mark]: 1.631e-05 [recompute_prepare]: 7.97998e-06 [updatestate_depend_eliminate]: 3.75e-06 [updatestate_assign_eliminate]: 3.03e-06 [updatestate_loads_eliminate]: 2.96001e-06 [parameter_eliminate]: 1.34e-06 [a_2]: 9.908e-05 [accelerated_algorithm]: 7.59002e-06 [shard]: 1.82999e-06 [meta_shard_fg_expand]: 1.76e-06 [shard_inline]: 7.5e-06 [merge_send_recv]: 6.64001e-06 [auto_parallel]: 6.52001e-06 [parallel]: 5.59e-06 [flash_sp]: 3.78999e-06 [merge_comm]: 6.84999e-06 [allreduce_fusion]: 4.51002e-06 [matmul_add_comm_reduction]: 7.14001e-06 [allreduce_slice_to_reducescatter]: 5.10016e-07 [virtual_shard_identity]: 1.017e-05 [virtual_dataset]: 7.69002e-06 [get_grad_eliminate_]: 6.79001e-06 [virtual_output]: 6.93e-06 [merge_forward]: 4.27998e-06 [cell_reuse_recompute_pass]: 1.43002e-06 [offload_activation]: 8.53001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.569e-05 [merge_recompute_call_nodes]: 1.02e-06 [before_grad]: 1.208e-05 [set_forward_comm_id_for_comm_node_pass]: 4.21001e-06 [meta_fg_expand]: 2.95002e-06 [flash_sp_send_recv_attached]: 1.24998e-06 [receive_attached]: 9.50007e-07 [after_resolve]: 1.448e-05 [a_after_grad]: 1.211e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.46e-06 [auto_monad_grad]: 1.50999e-06 [auto_monad_eliminator]: 1.077e-05 [cse]: 2.159e-05 [a_3]: 4.582e-05 [py_interpret_to_execute_after_opt_a]: 1.387e-05 [slice_cell_reuse_recomputed_activation]: 2.01e-06 [rewriter_after_opt_a]: 4.342e-05 [convert_after_rewriter]: 8.14997e-06 [order_py_execute_after_rewriter]: 5.79999e-06 [mutable_eliminate]: 0.00057648 [opt_b]: 0.00024889, [1] [Cycle 1]: 0.00024223, [7] [b_1]: 0.00015519 [b_2]: 9.57001e-06 [updatestate_depend_eliminate]: 7.8e-06 [updatestate_assign_eliminate]: 3.01999e-06 [updatestate_loads_eliminate]: 3.28e-06 [renormalize]: 6.00005e-07 [cse]: 2.602e-05 [optimize_parallel_all_gather_comm]: 1.894e-05 [overlap_param_gather]: 2.27001e-06 [cconv]: 2.895e-05 [loop_unroll]: 0.00044046 [opt_after_cconv]: 0.0001194, [1] [Cycle 1]: 0.00011339, [7] [c_1]: 3.722e-05 [parameter_eliminate]: 3.35e-06 [updatestate_depend_eliminate]: 7.08e-06 [updatestate_assign_eliminate]: 3.01999e-06 [updatestate_loads_eliminate]: 3.16001e-06 [cse]: 2.447e-05 [renormalize]: 6.59988e-07 [remove_dup_value]: 1.528e-05 [tuple_transform]: 8.563e-05, [1] [Cycle 1]: 8.115e-05, [4] [d_1]: 5.229e-05 [none_parameter_eliminate]: 1.74e-06 [renormalize]: 2.60014e-07 [switch_simplify]: 8.52e-06 [partial_unused_args_eliminate]: 1.92001e-06 [add_recomputation]: 5.789e-05 [cse_after_recomputation]: 2.891e-05, [1] [Cycle 1]: 2.431e-05, [1] [cse]: 1.808e-05 [environ_conv]: 6.63998e-06 [swap_dp_allreduce_reducescatter]: 6.03998e-06 [bias_add_comm_swap]: 2.94999e-06 [label_micro_interleaved_index]: 4.57e-06 [label_fine_grained_interleaved_index]: 2.55002e-06 [merge_cast_opt]: 1.30999e-06 [slice_recompute_activation]: 1.92001e-06 [micro_interleaved_order_control]: 3.76999e-06 [assign_add_opt]: 1.42e-06 [ForceFp32Comm]: 1.08001e-06 [remove_cast_before_assign_add]: 9.80013e-07 [full_micro_interleaved_order_control]: 2.03002e-06 [reorder_send_recv_between_fp_bp]: 2.83998e-06 [comm_op_add_attrs]: 9.79984e-07 [add_comm_op_reuse_tag]: 9.30013e-07 [interleave_split_concat_branches]: 1.07998e-06 [interleave_parallel_branches]: 1.09998e-06 [overlap_opt_shard_in_pipeline]: 1.72999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.94999e-06 [control_data_broadcast_order]: 1.584e-05 [grouped_pairwise_exchange_alltoall]: 1.97999e-06 [offloading_packed_experts]: 4.23001e-06 [overlap_recompute_and_grad_model_parallel]: 5.16002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.11e-06 [overlap_grad_ring_attention]: 4.4e-06 [overlap_grad_flash_sp]: 2.334e-05 [begin_end_overlap_inline]: 5.8001e-07 [split_matmul_comm_elemetwise]: 2.17001e-06 [split_layernorm_comm]: 1.60001e-06 [handle_group_info]: 1.29e-06 [symbol_engine_optimizer]: 9.003e-05, [1] [Cycle 1]: 8.528e-05, [6] [build]: 2.93e-06 [elim_shapecalc]: 1.499e-05 [elim_not_effective]: 1.656e-05 [opt_reshape]: 8.00999e-06 [fold_const_symbol]: 1.207e-05 [renormalize]: 2.50002e-07 [detach_backward]: 2.06e-06 [pipeline_parallel_scheduler]: 1.70001e-06 [auto_monad_reorder]: 2.069e-05 [get_jit_bprop_graph]: 1.83002e-06 [rewriter_after_jit_bprop_graph]: 3.78999e-06 [opt_after_jit_grad]: 0.00047863 [validate]: 4.605e-05 Sums bootstrap : 0.000459s : 3.93% type_inference : 0.005935s : 50.77% event_method : 0.000020s : 0.17% auto_monad : 0.000064s : 0.55% graph_reusing : 0.000006s : 0.06% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000041s : 0.35% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000028s : 0.24% optimize.rewriter_before_opt_a : 0.000092s : 0.79% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.45% optimize.opt_a.loop_unroll : 0.000039s : 0.33% optimize.opt_a.a_1 : 0.000874s : 7.48% optimize.opt_a.with_stream_mark : 0.000036s : 0.31% optimize.opt_a.recompute_prepare : 0.000019s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000215s : 1.84% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.14% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.13% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000026s : 0.22% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000012s : 0.10% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.18% optimize.opt_a.virtual_dataset : 0.000016s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.12% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.25% optimize.opt_a.a_after_grad : 0.000024s : 0.21% optimize.opt_a.renormalize : 0.000844s : 7.22% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.25% optimize.opt_a.cse : 0.000060s : 0.51% optimize.opt_a.a_3 : 0.000107s : 0.92% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000043s : 0.37% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000576s : 4.93% optimize.opt_b.b_1 : 0.000155s : 1.33% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000026s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000029s : 0.25% optimize.loop_unroll : 0.000440s : 3.77% optimize.opt_after_cconv.c_1 : 0.000037s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.21% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000015s : 0.13% optimize.tuple_transform.d_1 : 0.000052s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000058s : 0.50% optimize.cse_after_recomputation.cse : 0.000018s : 0.15% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000004s : 0.03% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000023s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.13% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000479s : 4.09% validate : 0.000046s : 0.39% Time group info: ------[substitution.] 0.000228 38 11.42% : 0.000026s : 3: substitution.cast_eliminate 1.12% : 0.000003s : 3: substitution.elim_not_effective 0.71% : 0.000002s : 3: substitution.fold_const_symbol 2.85% : 0.000006s : 5: substitution.graph_param_transform 70.18% : 0.000160s : 4: substitution.inline 2.24% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.56% : 0.000006s : 6: substitution.remove_not_recompute_node 3.31% : 0.000008s : 4: substitution.replace_old_param 5.61% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005872 2 88.02% : 0.005169s : 1: type_inference.infer 11.98% : 0.000703s : 1: type_inference.specialize ------[replace.] 0.000063 8 63.72% : 0.000040s : 4: replace.inline 36.28% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000168 8 93.48% : 0.000157s : 4: match.inline 6.52% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000242 1504 0.86% : 0.000002s : 15: predicate.accumulaten_eliminater 0.84% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.62% : 0.000001s : 10: predicate.addn_check_dump 0.85% : 0.000002s : 15: predicate.addn_zero_filter 0.79% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.19% : 0.000005s : 25: predicate.arithmetic_simplify 1.05% : 0.000003s : 15: predicate.cast_eliminate 0.61% : 0.000001s : 10: predicate.check_bprop_eliminate 0.60% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.63% : 0.000002s : 10: predicate.depend_value_elim 1.03% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.99% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.46% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_depend_swap 1.76% : 0.000004s : 30: predicate.environ_get_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.35% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.33% : 0.000006s : 23: predicate.float_depend_g_call 0.62% : 0.000002s : 10: predicate.float_environ_get_switch 0.86% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.64% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.68% : 0.000002s : 10: predicate.incorporate_call 0.58% : 0.000001s : 10: predicate.incorporate_call_switch 6.46% : 0.000016s : 68: predicate.inline 0.79% : 0.000002s : 10: predicate.inline_without_move 0.34% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.96% : 0.000002s : 10: predicate.less_batch_normalization 1.74% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.43% : 0.000006s : 44: predicate.load_eliminater 0.77% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.06% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.56% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.60% : 0.000001s : 10: predicate.merge_addn 0.60% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.66% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 15: predicate.minmaximum_grad 1.06% : 0.000003s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.48% : 0.000001s : 5: predicate.parallel_virtual_node 1.84% : 0.000004s : 23: predicate.partial_defer_inline 1.69% : 0.000004s : 24: predicate.partial_eliminate 0.93% : 0.000002s : 15: predicate.print_const_string_wrapper 0.78% : 0.000002s : 10: predicate.reduce_all_const_elim 1.17% : 0.000003s : 15: predicate.reduce_eliminate 2.52% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 10: predicate.remove_not_recompute_node 1.42% : 0.000003s : 29: predicate.replace_applicator 0.49% : 0.000001s : 10: predicate.replace_old_param 0.24% : 0.000001s : 5: predicate.reset_defer_inline 0.98% : 0.000002s : 15: predicate.reshape_eliminate 0.59% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 5: predicate.row_tensor_eliminate 0.74% : 0.000002s : 10: predicate.same_eliminate 0.60% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 10: predicate.shard_identity_eliminate 0.68% : 0.000002s : 10: predicate.special_op_eliminate 0.82% : 0.000002s : 10: predicate.specialize_transform 0.82% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.41% : 0.000003s : 23: predicate.switch_defer_inline 1.99% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.02% : 0.000012s : 74: predicate.switch_simplify 0.83% : 0.000002s : 15: predicate.tile_eliminate 0.90% : 0.000002s : 15: predicate.transpose_eliminate 1.58% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.46% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.64% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.66% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.42% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.10% : 0.000007s : 54: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 5: predicate.value_based_eliminate 0.70% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.71% : 0.000002s : 10: predicate.virtual_output_eliminate 0.30% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000570 11 55.10% : 0.000314s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.90% : 0.000256s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.075863 192 0.01% : 0.000004s : 1: ForceFp32Comm 36.32% : 0.027551s : 1: add_attr 36.30% : 0.027538s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.08% : 0.000062s : 1: add_recomputation 0.01% : 0.000005s : 1: assign_add_opt 0.09% : 0.000071s : 1: auto_monad 0.03% : 0.000025s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.65% : 0.000490s : 1: bootstrap 0.04% : 0.000033s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000019s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.06% : 0.000046s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.04% : 0.000027s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.59% : 0.000450s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000010s : 1: micro_interleaved_order_control 0.77% : 0.000586s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000019s : 1: opt.transform.mutable_eliminate 1.87% : 0.001418s : 78: opt.transform.opt_a 0.05% : 0.000036s : 1: opt.transform.opt_after_cconv 0.04% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.17% : 0.000132s : 28: opt.transform.opt_b 0.08% : 0.000058s : 2: opt.transform.opt_trans_graph 0.06% : 0.000046s : 4: opt.transform.symbol_engine_opt 4.22% : 0.003199s : 1: opt_a 0.16% : 0.000123s : 1: opt_after_cconv 0.64% : 0.000488s : 1: opt_after_jit_grad 0.33% : 0.000253s : 1: opt_b 7.30% : 0.005536s : 1: optimize 0.03% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.06% : 0.000045s : 1: pre_auto_parallel 0.04% : 0.000032s : 1: py_interpret_to_execute 0.02% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000019s : 1: remove_dup_value 0.63% : 0.000478s : 1: renormalize.infer 0.47% : 0.000357s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000048s : 1: rewriter_after_opt_a 0.13% : 0.000097s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.10% : 0.000077s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.12% : 0.000093s : 1: symbol_engine_optimizer 0.12% : 0.000089s : 1: tuple_transform 7.85% : 0.005953s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:36.227.662 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:36.227.950 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0159203, [21] [bootstrap]: 0.00046141 [type_inference]: 0.00562576 [event_method]: 1.839e-05 [auto_monad]: 6.235e-05 [graph_reusing]: 5.47001e-06 [inline]: 2.22999e-06 [add_attr]: 0.00310621, [1] [add_attr_with_inline]: 0.00309786, [1] [Cycle 1]: 6.829e-05, [2] [tag_attr]: 1.87e-05 [meta_addattr_fg_expand]: 6.33e-06 [parallel-infer-symbol]: 3.22002e-06 [pre_auto_parallel]: 3.33e-05 [insert-virtual-dataset]: 2.26e-06 [parallel-infer-symbol-second]: 7.60017e-07 [dataset_repeat_opt]: 1.99e-06 [pipeline_split]: 1.57999e-06 [optimize]: 0.00547426, [53] [py_interpret_to_execute]: 2.81e-05 [rewriter_before_opt_a]: 8.629e-05 [opt_a]: 0.00316163, [2] [Cycle 1]: 0.00219274, [45] [expand_dump_flag]: 3.18e-06 [switch_simplify]: 4.243e-05 [loop_unroll]: 3.104e-05 [a_1]: 0.00065802 [with_stream_mark]: 1.554e-05 [recompute_prepare]: 1.072e-05 [updatestate_depend_eliminate]: 4.52e-06 [updatestate_assign_eliminate]: 4.01001e-06 [updatestate_loads_eliminate]: 3.53e-06 [parameter_eliminate]: 2.29999e-06 [a_2]: 0.00012949 [accelerated_algorithm]: 8.3e-06 [shard]: 1.70001e-06 [meta_shard_fg_expand]: 2.19999e-06 [shard_inline]: 7.96001e-06 [merge_send_recv]: 9.42001e-06 [auto_parallel]: 6.78e-06 [parallel]: 1.784e-05 [flash_sp]: 7.93001e-06 [merge_comm]: 4.83001e-06 [allreduce_fusion]: 4.17e-06 [matmul_add_comm_reduction]: 9.64999e-06 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 9.17001e-06 [virtual_dataset]: 8e-06 [get_grad_eliminate_]: 7.55e-06 [virtual_output]: 8.02e-06 [merge_forward]: 4.45e-06 [cell_reuse_recompute_pass]: 1.25999e-06 [offload_activation]: 1.028e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.683e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.294e-05 [set_forward_comm_id_for_comm_node_pass]: 4.70001e-06 [meta_fg_expand]: 3.21001e-06 [flash_sp_send_recv_attached]: 2.53e-06 [receive_attached]: 1.90001e-06 [after_resolve]: 1.197e-05 [a_after_grad]: 1.213e-05 [renormalize]: 0.00058551 [add_forward_monad_depend]: 5.03002e-06 [auto_monad_grad]: 2.29001e-06 [auto_monad_eliminator]: 1.594e-05 [cse]: 3.435e-05 [a_3]: 7.054e-05 [Cycle 2]: 0.0009557, [45] [expand_dump_flag]: 1.01002e-06 [switch_simplify]: 9.26998e-06 [loop_unroll]: 7.71001e-06 [a_1]: 0.00017573 [with_stream_mark]: 1.149e-05 [recompute_prepare]: 8.42e-06 [updatestate_depend_eliminate]: 4.03001e-06 [updatestate_assign_eliminate]: 3.33e-06 [updatestate_loads_eliminate]: 2.88e-06 [parameter_eliminate]: 1.22999e-06 [a_2]: 0.00013221 [accelerated_algorithm]: 8.21002e-06 [shard]: 1.30999e-06 [meta_shard_fg_expand]: 1.91e-06 [shard_inline]: 8.77e-06 [merge_send_recv]: 6.61999e-06 [auto_parallel]: 6.43998e-06 [parallel]: 5.02999e-06 [flash_sp]: 3.4e-06 [merge_comm]: 3.93001e-06 [allreduce_fusion]: 3.86001e-06 [matmul_add_comm_reduction]: 6.81001e-06 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 9.99999e-06 [virtual_dataset]: 7.21999e-06 [get_grad_eliminate_]: 7.52002e-06 [virtual_output]: 7.56001e-06 [merge_forward]: 3.6e-06 [cell_reuse_recompute_pass]: 1.60001e-06 [offload_activation]: 8.28999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.668e-05 [merge_recompute_call_nodes]: 9.29984e-07 [before_grad]: 1.238e-05 [set_forward_comm_id_for_comm_node_pass]: 4.53001e-06 [meta_fg_expand]: 2.68e-06 [flash_sp_send_recv_attached]: 1.09e-06 [receive_attached]: 1.05999e-06 [after_resolve]: 1.183e-05 [a_after_grad]: 1.147e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.49003e-06 [auto_monad_grad]: 1.10001e-06 [auto_monad_eliminator]: 9.44e-06 [cse]: 1.919e-05 [a_3]: 5.877e-05 [py_interpret_to_execute_after_opt_a]: 1.385e-05 [slice_cell_reuse_recomputed_activation]: 4.67998e-06 [rewriter_after_opt_a]: 4.352e-05 [convert_after_rewriter]: 1.138e-05 [order_py_execute_after_rewriter]: 9.10001e-06 [mutable_eliminate]: 0.00048137 [opt_b]: 0.00030782, [1] [Cycle 1]: 0.00029894, [7] [b_1]: 0.0001978 [b_2]: 9.04e-06 [updatestate_depend_eliminate]: 6.72002e-06 [updatestate_assign_eliminate]: 3.16001e-06 [updatestate_loads_eliminate]: 3.24001e-06 [renormalize]: 4.19997e-07 [cse]: 2.306e-05 [optimize_parallel_all_gather_comm]: 2.099e-05 [overlap_param_gather]: 4.82e-06 [cconv]: 2.968e-05 [loop_unroll]: 0.00042765 [opt_after_cconv]: 0.00013848, [1] [Cycle 1]: 0.00013013, [7] [c_1]: 3.692e-05 [parameter_eliminate]: 3.58e-06 [updatestate_depend_eliminate]: 5.94e-06 [updatestate_assign_eliminate]: 3.18998e-06 [updatestate_loads_eliminate]: 2.94001e-06 [cse]: 2.206e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.76e-05 [tuple_transform]: 0.00010137, [1] [Cycle 1]: 9.353e-05, [4] [d_1]: 5.3e-05 [none_parameter_eliminate]: 1.89e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 8.35999e-06 [partial_unused_args_eliminate]: 4.73001e-06 [add_recomputation]: 5.755e-05 [cse_after_recomputation]: 3.058e-05, [1] [Cycle 1]: 2.383e-05, [1] [cse]: 1.475e-05 [environ_conv]: 9.29e-06 [swap_dp_allreduce_reducescatter]: 8.73001e-06 [bias_add_comm_swap]: 5.52001e-06 [label_micro_interleaved_index]: 6.70002e-06 [label_fine_grained_interleaved_index]: 5.49e-06 [merge_cast_opt]: 3.83001e-06 [slice_recompute_activation]: 4.37e-06 [micro_interleaved_order_control]: 5.08002e-06 [assign_add_opt]: 3.65e-06 [ForceFp32Comm]: 3.26001e-06 [remove_cast_before_assign_add]: 3.31999e-06 [full_micro_interleaved_order_control]: 4.63999e-06 [reorder_send_recv_between_fp_bp]: 5.49e-06 [comm_op_add_attrs]: 3.4e-06 [add_comm_op_reuse_tag]: 3.23e-06 [interleave_split_concat_branches]: 3.66999e-06 [interleave_parallel_branches]: 3.43e-06 [overlap_opt_shard_in_pipeline]: 3.45e-06 [overlap_opt_shard_grad_in_pipeline]: 4.38001e-06 [control_data_broadcast_order]: 1.732e-05 [grouped_pairwise_exchange_alltoall]: 4.10998e-06 [offloading_packed_experts]: 6.57002e-06 [overlap_recompute_and_grad_model_parallel]: 8.08999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.53999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.69002e-06 [overlap_recompute_comm]: 4.58999e-06 [overlap_grad_ring_attention]: 7.06001e-06 [overlap_grad_flash_sp]: 2.338e-05 [begin_end_overlap_inline]: 3.21999e-06 [split_matmul_comm_elemetwise]: 4.68999e-06 [split_layernorm_comm]: 4.2e-06 [handle_group_info]: 3.18e-06 [symbol_engine_optimizer]: 0.00010132, [1] [Cycle 1]: 9.48e-05, [6] [build]: 2.74999e-06 [elim_shapecalc]: 1.11e-05 [elim_not_effective]: 1.508e-05 [opt_reshape]: 8.42e-06 [fold_const_symbol]: 1.278e-05 [renormalize]: 2.69996e-07 [detach_backward]: 3.43e-06 [pipeline_parallel_scheduler]: 1.68002e-06 [auto_monad_reorder]: 2.244e-05 [get_jit_bprop_graph]: 1.69998e-06 [rewriter_after_jit_bprop_graph]: 3.91999e-06 [opt_after_jit_grad]: 0.00048945 [validate]: 3.973e-05 Sums bootstrap : 0.000461s : 4.15% type_inference : 0.005626s : 50.65% event_method : 0.000018s : 0.17% auto_monad : 0.000062s : 0.56% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.25% optimize.rewriter_before_opt_a : 0.000086s : 0.78% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000052s : 0.47% optimize.opt_a.loop_unroll : 0.000039s : 0.35% optimize.opt_a.a_1 : 0.000834s : 7.51% optimize.opt_a.with_stream_mark : 0.000027s : 0.24% optimize.opt_a.recompute_prepare : 0.000019s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000262s : 2.36% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000017s : 0.15% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.12% optimize.opt_a.parallel : 0.000023s : 0.21% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.17% optimize.opt_a.virtual_dataset : 0.000015s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.14% optimize.opt_a.virtual_output : 0.000016s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.21% optimize.opt_a.a_after_grad : 0.000024s : 0.21% optimize.opt_a.renormalize : 0.000586s : 5.27% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.23% optimize.opt_a.cse : 0.000054s : 0.48% optimize.opt_a.a_3 : 0.000129s : 1.16% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000044s : 0.39% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000481s : 4.33% optimize.opt_b.b_1 : 0.000198s : 1.78% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000030s : 0.27% optimize.loop_unroll : 0.000428s : 3.85% optimize.opt_after_cconv.c_1 : 0.000037s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.16% optimize.tuple_transform.d_1 : 0.000053s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000058s : 0.52% optimize.cse_after_recomputation.cse : 0.000015s : 0.13% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000023s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000022s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000489s : 4.41% validate : 0.000040s : 0.36% Time group info: ------[substitution.] 0.000195 38 11.10% : 0.000022s : 3: substitution.cast_eliminate 1.14% : 0.000002s : 3: substitution.elim_not_effective 1.00% : 0.000002s : 3: substitution.fold_const_symbol 3.43% : 0.000007s : 5: substitution.graph_param_transform 69.80% : 0.000136s : 4: substitution.inline 2.31% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.00% : 0.000006s : 6: substitution.remove_not_recompute_node 2.11% : 0.000004s : 4: substitution.replace_old_param 6.10% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005577 2 87.75% : 0.004894s : 1: type_inference.infer 12.25% : 0.000683s : 1: type_inference.specialize ------[replace.] 0.000059 8 61.45% : 0.000036s : 4: replace.inline 38.55% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000144 8 92.97% : 0.000133s : 4: match.inline 7.03% : 0.000010s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000235 1504 0.87% : 0.000002s : 15: predicate.accumulaten_eliminater 0.75% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.57% : 0.000001s : 10: predicate.addn_check_dump 0.93% : 0.000002s : 15: predicate.addn_zero_filter 0.82% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.95% : 0.000005s : 25: predicate.arithmetic_simplify 1.15% : 0.000003s : 15: predicate.cast_eliminate 0.65% : 0.000002s : 10: predicate.check_bprop_eliminate 0.62% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.69% : 0.000002s : 10: predicate.depend_value_elim 0.94% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.99% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.92% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 5: predicate.elim_not_effective 0.37% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.15% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 20: predicate.environ_get_depend_swap 1.80% : 0.000004s : 30: predicate.environ_get_eliminate 1.13% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.39% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.34% : 0.000006s : 23: predicate.float_depend_g_call 0.65% : 0.000002s : 10: predicate.float_environ_get_switch 0.92% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 5: predicate.fold_const_symbol 0.68% : 0.000002s : 10: predicate.get_grad_eliminate 0.23% : 0.000001s : 5: predicate.graph_param_transform 0.68% : 0.000002s : 10: predicate.incorporate_call 0.57% : 0.000001s : 10: predicate.incorporate_call_switch 6.06% : 0.000014s : 68: predicate.inline 0.87% : 0.000002s : 10: predicate.inline_without_move 0.37% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.82% : 0.000002s : 10: predicate.less_batch_normalization 1.94% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.62% : 0.000006s : 44: predicate.load_eliminater 0.86% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.23% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.61% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.63% : 0.000001s : 10: predicate.merge_addn 0.62% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.61% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 15: predicate.minmaximum_grad 0.89% : 0.000002s : 5: predicate.mutable_eliminate 0.40% : 0.000001s : 5: predicate.opt_reshape 0.46% : 0.000001s : 5: predicate.parallel_virtual_node 1.63% : 0.000004s : 23: predicate.partial_defer_inline 1.68% : 0.000004s : 24: predicate.partial_eliminate 0.88% : 0.000002s : 15: predicate.print_const_string_wrapper 0.63% : 0.000001s : 10: predicate.reduce_all_const_elim 1.29% : 0.000003s : 15: predicate.reduce_eliminate 2.51% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 10: predicate.remove_not_recompute_node 1.35% : 0.000003s : 29: predicate.replace_applicator 0.43% : 0.000001s : 10: predicate.replace_old_param 0.33% : 0.000001s : 5: predicate.reset_defer_inline 1.05% : 0.000002s : 15: predicate.reshape_eliminate 0.66% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 5: predicate.row_tensor_eliminate 0.92% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.79% : 0.000002s : 10: predicate.shard_identity_eliminate 0.71% : 0.000002s : 10: predicate.special_op_eliminate 0.82% : 0.000002s : 10: predicate.specialize_transform 0.84% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.45% : 0.000003s : 23: predicate.switch_defer_inline 2.06% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.86% : 0.000011s : 74: predicate.switch_simplify 0.90% : 0.000002s : 15: predicate.tile_eliminate 0.91% : 0.000002s : 15: predicate.transpose_eliminate 1.58% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000007s : 39: predicate.tuple_list_get_item_eliminator 1.65% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.73% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.49% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.22% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 5: predicate.value_based_eliminate 0.67% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.70% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000508 11 55.93% : 0.000284s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.07% : 0.000224s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026659 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.68% : 0.003114s : 1: add_attr 11.64% : 0.003102s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000061s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000073s : 1: auto_monad 0.11% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.88% : 0.000502s : 1: bootstrap 0.12% : 0.000033s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000034s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000018s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.11% : 0.000028s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.63% : 0.000433s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.83% : 0.000487s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 5.13% : 0.001366s : 78: opt.transform.opt_a 0.13% : 0.000035s : 1: opt.transform.opt_after_cconv 0.11% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.50% : 0.000135s : 28: opt.transform.opt_b 0.22% : 0.000059s : 2: opt.transform.opt_trans_graph 0.16% : 0.000043s : 4: opt.transform.symbol_engine_opt 11.87% : 0.003165s : 1: opt_a 0.53% : 0.000142s : 1: opt_after_cconv 1.87% : 0.000500s : 1: opt_after_jit_grad 1.17% : 0.000311s : 1: opt_b 21.76% : 0.005802s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000041s : 1: pre_auto_parallel 0.12% : 0.000032s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000021s : 1: remove_dup_value 1.17% : 0.000313s : 1: renormalize.infer 0.99% : 0.000265s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000047s : 1: rewriter_after_opt_a 0.34% : 0.000090s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000104s : 1: symbol_engine_optimizer 0.39% : 0.000104s : 1: tuple_transform 21.22% : 0.005657s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:36.663.334 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0146374, [21] [bootstrap]: 0.00041066 [type_inference]: 0.00551164 [event_method]: 1.828e-05 [auto_monad]: 6.257e-05 [graph_reusing]: 5.96e-06 [inline]: 2.21e-06 [add_attr]: 0.00299134, [1] [add_attr_with_inline]: 0.00298268, [1] [Cycle 1]: 5.344e-05, [2] [tag_attr]: 1.846e-05 [meta_addattr_fg_expand]: 6.31e-06 [parallel-infer-symbol]: 3.04999e-06 [pre_auto_parallel]: 3.282e-05 [insert-virtual-dataset]: 2.54001e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 2.41998e-06 [pipeline_split]: 1.64998e-06 [optimize]: 0.00491202, [53] [py_interpret_to_execute]: 2.422e-05 [rewriter_before_opt_a]: 8.3e-05 [opt_a]: 0.00284277, [2] [Cycle 1]: 0.00205461, [45] [expand_dump_flag]: 3.57002e-06 [switch_simplify]: 4.347e-05 [loop_unroll]: 3.114e-05 [a_1]: 0.00066891 [with_stream_mark]: 1.43e-05 [recompute_prepare]: 1.004e-05 [updatestate_depend_eliminate]: 4.75001e-06 [updatestate_assign_eliminate]: 3.76999e-06 [updatestate_loads_eliminate]: 3.73001e-06 [parameter_eliminate]: 1.87999e-06 [a_2]: 0.00010205 [accelerated_algorithm]: 8.97999e-06 [shard]: 2.51e-06 [meta_shard_fg_expand]: 1.91e-06 [shard_inline]: 8.37e-06 [merge_send_recv]: 9.37001e-06 [auto_parallel]: 6.99001e-06 [parallel]: 1.851e-05 [flash_sp]: 7.93999e-06 [merge_comm]: 4.77e-06 [allreduce_fusion]: 4.4e-06 [matmul_add_comm_reduction]: 1.007e-05 [allreduce_slice_to_reducescatter]: 9.29984e-07 [virtual_shard_identity]: 9.87999e-06 [virtual_dataset]: 9.03002e-06 [get_grad_eliminate_]: 8.28001e-06 [virtual_output]: 8.79998e-06 [merge_forward]: 4.42e-06 [cell_reuse_recompute_pass]: 1.42999e-06 [offload_activation]: 1.072e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.643e-05 [merge_recompute_call_nodes]: 1.67999e-06 [before_grad]: 1.406e-05 [set_forward_comm_id_for_comm_node_pass]: 4.55001e-06 [meta_fg_expand]: 3.34001e-06 [flash_sp_send_recv_attached]: 3.30003e-06 [receive_attached]: 2.09e-06 [after_resolve]: 1.406e-05 [a_after_grad]: 1.266e-05 [renormalize]: 0.00059071 [add_forward_monad_depend]: 5.54e-06 [auto_monad_grad]: 1.81e-06 [auto_monad_eliminator]: 1.655e-05 [cse]: 3.61e-05 [a_3]: 5.759e-05 [Cycle 2]: 0.00077879, [45] [expand_dump_flag]: 1.50999e-06 [switch_simplify]: 9.81998e-06 [loop_unroll]: 8.27998e-06 [a_1]: 0.0001756 [with_stream_mark]: 1.171e-05 [recompute_prepare]: 7.7e-06 [updatestate_depend_eliminate]: 3.52002e-06 [updatestate_assign_eliminate]: 2.93e-06 [updatestate_loads_eliminate]: 2.80002e-06 [parameter_eliminate]: 1.05001e-06 [a_2]: 9.304e-05 [accelerated_algorithm]: 8.25e-06 [shard]: 1.19e-06 [meta_shard_fg_expand]: 1.59e-06 [shard_inline]: 7.42002e-06 [merge_send_recv]: 6.22001e-06 [auto_parallel]: 6.46e-06 [parallel]: 4.37e-06 [flash_sp]: 3.53999e-06 [merge_comm]: 6.96001e-06 [allreduce_fusion]: 3.56001e-06 [matmul_add_comm_reduction]: 6.53e-06 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 8.45999e-06 [virtual_dataset]: 7.61999e-06 [get_grad_eliminate_]: 7.08e-06 [virtual_output]: 6.88e-06 [merge_forward]: 3.08e-06 [cell_reuse_recompute_pass]: 1.40001e-06 [offload_activation]: 6.95002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.455e-05 [merge_recompute_call_nodes]: 7.09988e-07 [before_grad]: 1.141e-05 [set_forward_comm_id_for_comm_node_pass]: 4.23001e-06 [meta_fg_expand]: 2.61999e-06 [flash_sp_send_recv_attached]: 9.10019e-07 [receive_attached]: 1.07e-06 [after_resolve]: 1.376e-05 [a_after_grad]: 1.089e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.36998e-06 [auto_monad_grad]: 1.12e-06 [auto_monad_eliminator]: 8.69e-06 [cse]: 1.745e-05 [a_3]: 4.562e-05 [py_interpret_to_execute_after_opt_a]: 9.51e-06 [slice_cell_reuse_recomputed_activation]: 1.92001e-06 [rewriter_after_opt_a]: 3.919e-05 [convert_after_rewriter]: 7.93001e-06 [order_py_execute_after_rewriter]: 5.74e-06 [mutable_eliminate]: 0.00048057 [opt_b]: 0.00024667, [1] [Cycle 1]: 0.00024076, [7] [b_1]: 0.00016122 [b_2]: 9.55001e-06 [updatestate_depend_eliminate]: 5.72999e-06 [updatestate_assign_eliminate]: 2.86999e-06 [updatestate_loads_eliminate]: 2.79999e-06 [renormalize]: 6.19999e-07 [cse]: 2.293e-05 [optimize_parallel_all_gather_comm]: 1.783e-05 [overlap_param_gather]: 2.39999e-06 [cconv]: 2.413e-05 [loop_unroll]: 0.00043109 [opt_after_cconv]: 0.00011326, [1] [Cycle 1]: 0.00010795, [7] [c_1]: 3.729e-05 [parameter_eliminate]: 2.38998e-06 [updatestate_depend_eliminate]: 5.73997e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 2.89001e-06 [cse]: 2.191e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.529e-05 [tuple_transform]: 8.6e-05, [1] [Cycle 1]: 8.146e-05, [4] [d_1]: 5.23e-05 [none_parameter_eliminate]: 2.02001e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.42e-06 [partial_unused_args_eliminate]: 2.21998e-06 [add_recomputation]: 5.196e-05 [cse_after_recomputation]: 2.491e-05, [1] [Cycle 1]: 2.032e-05, [1] [cse]: 1.477e-05 [environ_conv]: 6.29001e-06 [swap_dp_allreduce_reducescatter]: 5.92001e-06 [bias_add_comm_swap]: 2.55002e-06 [label_micro_interleaved_index]: 4.18001e-06 [label_fine_grained_interleaved_index]: 2.59999e-06 [merge_cast_opt]: 1.35001e-06 [slice_recompute_activation]: 1.87001e-06 [micro_interleaved_order_control]: 2.38002e-06 [assign_add_opt]: 1.15999e-06 [ForceFp32Comm]: 1.11002e-06 [remove_cast_before_assign_add]: 1.27e-06 [full_micro_interleaved_order_control]: 2.11998e-06 [reorder_send_recv_between_fp_bp]: 2.88e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 9.09989e-07 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.07998e-06 [overlap_opt_shard_in_pipeline]: 1.12e-06 [overlap_opt_shard_grad_in_pipeline]: 2.18998e-06 [control_data_broadcast_order]: 1.462e-05 [grouped_pairwise_exchange_alltoall]: 1.93002e-06 [offloading_packed_experts]: 4.35999e-06 [overlap_recompute_and_grad_model_parallel]: 5.30001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.25001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.31002e-06 [overlap_recompute_comm]: 2.21e-06 [overlap_grad_ring_attention]: 4.83001e-06 [overlap_grad_flash_sp]: 1.928e-05 [begin_end_overlap_inline]: 4.80009e-07 [split_matmul_comm_elemetwise]: 2.02999e-06 [split_layernorm_comm]: 1.55999e-06 [handle_group_info]: 1.02e-06 [symbol_engine_optimizer]: 8.256e-05, [1] [Cycle 1]: 7.831e-05, [6] [build]: 2.98998e-06 [elim_shapecalc]: 1.072e-05 [elim_not_effective]: 1.508e-05 [opt_reshape]: 8.60999e-06 [fold_const_symbol]: 1.307e-05 [renormalize]: 2.50002e-07 [detach_backward]: 1.73002e-06 [pipeline_parallel_scheduler]: 1.89999e-06 [auto_monad_reorder]: 2.04e-05 [get_jit_bprop_graph]: 1.29e-06 [rewriter_after_jit_bprop_graph]: 3.73001e-06 [opt_after_jit_grad]: 0.00046808 [validate]: 3.943e-05 Sums bootstrap : 0.000411s : 3.85% type_inference : 0.005512s : 51.61% event_method : 0.000018s : 0.17% auto_monad : 0.000063s : 0.59% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.31% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000024s : 0.23% optimize.rewriter_before_opt_a : 0.000083s : 0.78% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000053s : 0.50% optimize.opt_a.loop_unroll : 0.000039s : 0.37% optimize.opt_a.a_1 : 0.000845s : 7.91% optimize.opt_a.with_stream_mark : 0.000026s : 0.24% optimize.opt_a.recompute_prepare : 0.000018s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000195s : 1.83% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.16% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.15% optimize.opt_a.merge_send_recv : 0.000016s : 0.15% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.21% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000012s : 0.11% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.17% optimize.opt_a.virtual_dataset : 0.000017s : 0.16% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.14% optimize.opt_a.virtual_output : 0.000016s : 0.15% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000028s : 0.26% optimize.opt_a.a_after_grad : 0.000024s : 0.22% optimize.opt_a.renormalize : 0.000591s : 5.53% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.24% optimize.opt_a.cse : 0.000054s : 0.50% optimize.opt_a.a_3 : 0.000103s : 0.97% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000039s : 0.37% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000481s : 4.50% optimize.opt_b.b_1 : 0.000161s : 1.51% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.23% optimize.loop_unroll : 0.000431s : 4.04% optimize.opt_after_cconv.c_1 : 0.000037s : 0.35% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.14% optimize.tuple_transform.d_1 : 0.000052s : 0.49% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000052s : 0.49% optimize.cse_after_recomputation.cse : 0.000015s : 0.14% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000019s : 0.18% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.19% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000468s : 4.38% validate : 0.000039s : 0.37% Time group info: ------[substitution.] 0.000203 38 11.24% : 0.000023s : 3: substitution.cast_eliminate 1.03% : 0.000002s : 3: substitution.elim_not_effective 0.88% : 0.000002s : 3: substitution.fold_const_symbol 3.32% : 0.000007s : 5: substitution.graph_param_transform 65.95% : 0.000134s : 4: substitution.inline 2.12% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.09% : 0.000006s : 6: substitution.remove_not_recompute_node 3.38% : 0.000007s : 4: substitution.replace_old_param 8.99% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005456 2 87.47% : 0.004772s : 1: type_inference.infer 12.53% : 0.000684s : 1: type_inference.specialize ------[replace.] 0.000064 8 64.40% : 0.000041s : 4: replace.inline 35.60% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000147 8 88.89% : 0.000131s : 4: match.inline 11.11% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000236 1504 0.92% : 0.000002s : 15: predicate.accumulaten_eliminater 0.74% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.58% : 0.000001s : 10: predicate.addn_check_dump 0.87% : 0.000002s : 15: predicate.addn_zero_filter 0.87% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.07% : 0.000005s : 25: predicate.arithmetic_simplify 1.03% : 0.000002s : 15: predicate.cast_eliminate 0.68% : 0.000002s : 10: predicate.check_bprop_eliminate 0.62% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.72% : 0.000002s : 10: predicate.depend_value_elim 0.93% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.01% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.93% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.40% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.17% : 0.000003s : 20: predicate.environ_get_depend_swap 1.70% : 0.000004s : 30: predicate.environ_get_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.42% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.43% : 0.000006s : 23: predicate.float_depend_g_call 0.60% : 0.000001s : 10: predicate.float_environ_get_switch 0.85% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.22% : 0.000001s : 5: predicate.fold_const_symbol 0.72% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.64% : 0.000002s : 10: predicate.incorporate_call 0.56% : 0.000001s : 10: predicate.incorporate_call_switch 6.26% : 0.000015s : 68: predicate.inline 0.80% : 0.000002s : 10: predicate.inline_without_move 0.34% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.93% : 0.000002s : 10: predicate.less_batch_normalization 1.77% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.61% : 0.000006s : 44: predicate.load_eliminater 0.97% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.20% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.58% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.67% : 0.000002s : 10: predicate.merge_addn 0.58% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 15: predicate.minmaximum_grad 0.89% : 0.000002s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.38% : 0.000001s : 5: predicate.parallel_virtual_node 1.62% : 0.000004s : 23: predicate.partial_defer_inline 1.65% : 0.000004s : 24: predicate.partial_eliminate 0.89% : 0.000002s : 15: predicate.print_const_string_wrapper 0.61% : 0.000001s : 10: predicate.reduce_all_const_elim 1.17% : 0.000003s : 15: predicate.reduce_eliminate 2.51% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 10: predicate.remove_not_recompute_node 1.34% : 0.000003s : 29: predicate.replace_applicator 0.53% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 0.95% : 0.000002s : 15: predicate.reshape_eliminate 0.61% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 5: predicate.row_tensor_eliminate 0.78% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 10: predicate.shard_identity_eliminate 0.86% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 0.89% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.44% : 0.000003s : 23: predicate.switch_defer_inline 2.04% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.96% : 0.000012s : 74: predicate.switch_simplify 0.87% : 0.000002s : 15: predicate.tile_eliminate 0.90% : 0.000002s : 15: predicate.transpose_eliminate 1.56% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.17% : 0.000007s : 39: predicate.tuple_list_get_item_eliminator 1.52% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.39% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.71% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.49% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.24% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 5: predicate.value_based_eliminate 0.69% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.70% : 0.000002s : 10: predicate.virtual_output_eliminate 0.31% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000493 11 53.93% : 0.000266s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.07% : 0.000227s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024716 192 0.02% : 0.000004s : 1: ForceFp32Comm 12.12% : 0.002996s : 1: add_attr 12.08% : 0.002986s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000056s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.28% : 0.000068s : 1: auto_monad 0.10% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.77% : 0.000438s : 1: bootstrap 0.11% : 0.000028s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000028s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.10% : 0.000024s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.77% : 0.000439s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.97% : 0.000488s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 5.54% : 0.001370s : 78: opt.transform.opt_a 0.15% : 0.000036s : 1: opt.transform.opt_after_cconv 0.12% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.56% : 0.000138s : 28: opt.transform.opt_b 0.24% : 0.000059s : 2: opt.transform.opt_trans_graph 0.18% : 0.000044s : 4: opt.transform.symbol_engine_opt 11.51% : 0.002846s : 1: opt_a 0.47% : 0.000117s : 1: opt_after_cconv 1.93% : 0.000477s : 1: opt_after_jit_grad 1.01% : 0.000250s : 1: opt_b 19.89% : 0.004916s : 1: optimize 0.09% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000037s : 1: pre_auto_parallel 0.11% : 0.000028s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000005s : 1: remove_cast_before_assign_add 0.08% : 0.000019s : 1: remove_dup_value 1.24% : 0.000307s : 1: renormalize.infer 1.12% : 0.000276s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000043s : 1: rewriter_after_opt_a 0.35% : 0.000087s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000085s : 1: symbol_engine_optimizer 0.36% : 0.000089s : 1: tuple_transform 22.35% : 0.005525s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:37.537. [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:37.802. [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0159016, [21] [bootstrap]: 0.00043614 [type_inference]: 0.00548031 [event_method]: 1.875e-05 [auto_monad]: 6.42e-05 [graph_reusing]: 5.62999e-06 [inline]: 2.01998e-06 [add_attr]: 0.00302125, [1] [add_attr_with_inline]: 0.00301276, [1] [Cycle 1]: 6.673e-05, [2] [tag_attr]: 1.894e-05 [meta_addattr_fg_expand]: 6.06e-06 [parallel-infer-symbol]: 2.84999e-06 [pre_auto_parallel]: 3.193e-05 [insert-virtual-dataset]: 2.39999e-06 [parallel-infer-symbol-second]: 7.80012e-07 [dataset_repeat_opt]: 1.94999e-06 [pipeline_split]: 1.47999e-06 [optimize]: 0.00571868, [53] [py_interpret_to_execute]: 2.846e-05 [rewriter_before_opt_a]: 0.00012347 [opt_a]: 0.00322633, [2] [Cycle 1]: 0.00226347, [45] [expand_dump_flag]: 3.06999e-06 [switch_simplify]: 4.411e-05 [loop_unroll]: 3.082e-05 [a_1]: 0.00068155 [with_stream_mark]: 1.516e-05 [recompute_prepare]: 9.77001e-06 [updatestate_depend_eliminate]: 4.97999e-06 [updatestate_assign_eliminate]: 4.13999e-06 [updatestate_loads_eliminate]: 3.43e-06 [parameter_eliminate]: 1.95001e-06 [a_2]: 0.00013394 [accelerated_algorithm]: 8.35999e-06 [shard]: 1.74e-06 [meta_shard_fg_expand]: 1.97001e-06 [shard_inline]: 8.43999e-06 [merge_send_recv]: 9.43002e-06 [auto_parallel]: 7.2e-06 [parallel]: 1.924e-05 [flash_sp]: 8.01001e-06 [merge_comm]: 4.47e-06 [allreduce_fusion]: 4.37998e-06 [matmul_add_comm_reduction]: 1.062e-05 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 9.97001e-06 [virtual_dataset]: 8.07e-06 [get_grad_eliminate_]: 7.43e-06 [virtual_output]: 8.13001e-06 [merge_forward]: 4.80001e-06 [cell_reuse_recompute_pass]: 1.54e-06 [offload_activation]: 1.084e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.767e-05 [merge_recompute_call_nodes]: 1.67001e-06 [before_grad]: 1.302e-05 [set_forward_comm_id_for_comm_node_pass]: 4.98001e-06 [meta_fg_expand]: 3.68e-06 [flash_sp_send_recv_attached]: 2.79999e-06 [receive_attached]: 2.63e-06 [after_resolve]: 1.281e-05 [a_after_grad]: 1.233e-05 [renormalize]: 0.00060077 [add_forward_monad_depend]: 5.43002e-06 [auto_monad_grad]: 1.75001e-06 [auto_monad_eliminator]: 1.793e-05 [cse]: 3.436e-05 [a_3]: 7.571e-05 [Cycle 2]: 0.00094926, [45] [expand_dump_flag]: 1.08001e-06 [switch_simplify]: 1.013e-05 [loop_unroll]: 8.03999e-06 [a_1]: 0.00018558 [with_stream_mark]: 1.168e-05 [recompute_prepare]: 8.54e-06 [updatestate_depend_eliminate]: 3.59002e-06 [updatestate_assign_eliminate]: 2.89999e-06 [updatestate_loads_eliminate]: 2.81e-06 [parameter_eliminate]: 1.02998e-06 [a_2]: 0.00011948 [accelerated_algorithm]: 7.63999e-06 [shard]: 1.25001e-06 [meta_shard_fg_expand]: 1.54998e-06 [shard_inline]: 7.56999e-06 [merge_send_recv]: 5.92999e-06 [auto_parallel]: 6.81999e-06 [parallel]: 4.2e-06 [flash_sp]: 3.37002e-06 [merge_comm]: 4.60999e-06 [allreduce_fusion]: 3.85e-06 [matmul_add_comm_reduction]: 6.05002e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 8.46002e-06 [virtual_dataset]: 7.63001e-06 [get_grad_eliminate_]: 7.6e-06 [virtual_output]: 6.99001e-06 [merge_forward]: 3.45e-06 [cell_reuse_recompute_pass]: 1.52001e-06 [offload_activation]: 7.65e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.606e-05 [merge_recompute_call_nodes]: 7.79983e-07 [before_grad]: 1.193e-05 [set_forward_comm_id_for_comm_node_pass]: 4.45e-06 [meta_fg_expand]: 2.73998e-06 [flash_sp_send_recv_attached]: 8.49977e-07 [receive_attached]: 9.99979e-07 [after_resolve]: 1.273e-05 [a_after_grad]: 1.222e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.24003e-06 [auto_monad_grad]: 1.20999e-06 [auto_monad_eliminator]: 8.57e-06 [cse]: 1.73e-05 [a_3]: 6.284e-05 [py_interpret_to_execute_after_opt_a]: 1.427e-05 [slice_cell_reuse_recomputed_activation]: 5.12999e-06 [rewriter_after_opt_a]: 4.307e-05 [convert_after_rewriter]: 1.046e-05 [order_py_execute_after_rewriter]: 8.76002e-06 [mutable_eliminate]: 0.00049067 [opt_b]: 0.00040351, [1] [Cycle 1]: 0.00039371, [7] [b_1]: 0.00028527 [b_2]: 1.088e-05 [updatestate_depend_eliminate]: 6.64001e-06 [updatestate_assign_eliminate]: 3.09001e-06 [updatestate_loads_eliminate]: 3.08998e-06 [renormalize]: 3.00002e-07 [cse]: 2.279e-05 [optimize_parallel_all_gather_comm]: 2.168e-05 [overlap_param_gather]: 5.18002e-06 [cconv]: 2.688e-05 [loop_unroll]: 0.00046557 [opt_after_cconv]: 0.00013791, [1] [Cycle 1]: 0.00012939, [7] [c_1]: 3.77e-05 [parameter_eliminate]: 2.61e-06 [updatestate_depend_eliminate]: 5.82999e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 2.91999e-06 [cse]: 2.156e-05 [renormalize]: 6.60017e-07 [remove_dup_value]: 1.804e-05 [tuple_transform]: 9.847e-05, [1] [Cycle 1]: 9.157e-05, [4] [d_1]: 5.121e-05 [none_parameter_eliminate]: 2.17999e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 8.55999e-06 [partial_unused_args_eliminate]: 4.57e-06 [add_recomputation]: 5.952e-05 [cse_after_recomputation]: 3.237e-05, [1] [Cycle 1]: 2.505e-05, [1] [cse]: 1.545e-05 [environ_conv]: 9.04e-06 [swap_dp_allreduce_reducescatter]: 9.56998e-06 [bias_add_comm_swap]: 4.89e-06 [label_micro_interleaved_index]: 6.34999e-06 [label_fine_grained_interleaved_index]: 5.63002e-06 [merge_cast_opt]: 3.66001e-06 [slice_recompute_activation]: 5.05999e-06 [micro_interleaved_order_control]: 5.07e-06 [assign_add_opt]: 3.55e-06 [ForceFp32Comm]: 3.13e-06 [remove_cast_before_assign_add]: 3.24001e-06 [full_micro_interleaved_order_control]: 4.42003e-06 [reorder_send_recv_between_fp_bp]: 5.22999e-06 [comm_op_add_attrs]: 3.48e-06 [add_comm_op_reuse_tag]: 3.33e-06 [interleave_split_concat_branches]: 3.53999e-06 [interleave_parallel_branches]: 3.33998e-06 [overlap_opt_shard_in_pipeline]: 3.68e-06 [overlap_opt_shard_grad_in_pipeline]: 4.22e-06 [control_data_broadcast_order]: 1.728e-05 [grouped_pairwise_exchange_alltoall]: 3.82002e-06 [offloading_packed_experts]: 7.53e-06 [overlap_recompute_and_grad_model_parallel]: 7.67002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.60998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.91001e-06 [overlap_recompute_comm]: 4.75999e-06 [overlap_grad_ring_attention]: 7.15003e-06 [overlap_grad_flash_sp]: 2.228e-05 [begin_end_overlap_inline]: 3.03e-06 [split_matmul_comm_elemetwise]: 4.60001e-06 [split_layernorm_comm]: 3.86999e-06 [handle_group_info]: 3.22002e-06 [symbol_engine_optimizer]: 0.00010363, [1] [Cycle 1]: 9.679e-05, [6] [build]: 3.22002e-06 [elim_shapecalc]: 1.088e-05 [elim_not_effective]: 1.564e-05 [opt_reshape]: 8.57e-06 [fold_const_symbol]: 1.31e-05 [renormalize]: 2.3999e-07 [detach_backward]: 3.83001e-06 [pipeline_parallel_scheduler]: 1.84998e-06 [auto_monad_reorder]: 2.286e-05 [get_jit_bprop_graph]: 1.30001e-06 [rewriter_after_jit_bprop_graph]: 4.13999e-06 [opt_after_jit_grad]: 0.00047669 [validate]: 3.961e-05 Sums bootstrap : 0.000436s : 3.91% type_inference : 0.005480s : 49.15% event_method : 0.000019s : 0.17% auto_monad : 0.000064s : 0.58% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000032s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000001s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.26% optimize.rewriter_before_opt_a : 0.000123s : 1.11% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000054s : 0.49% optimize.opt_a.loop_unroll : 0.000039s : 0.35% optimize.opt_a.a_1 : 0.000867s : 7.78% optimize.opt_a.with_stream_mark : 0.000027s : 0.24% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000253s : 2.27% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.14% optimize.opt_a.merge_send_recv : 0.000015s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.21% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.17% optimize.opt_a.virtual_dataset : 0.000016s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.23% optimize.opt_a.a_after_grad : 0.000025s : 0.22% optimize.opt_a.renormalize : 0.000601s : 5.39% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.24% optimize.opt_a.cse : 0.000052s : 0.46% optimize.opt_a.a_3 : 0.000139s : 1.24% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000043s : 0.39% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000491s : 4.40% optimize.opt_b.b_1 : 0.000285s : 2.56% optimize.opt_b.b_2 : 0.000011s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000027s : 0.24% optimize.loop_unroll : 0.000466s : 4.18% optimize.opt_after_cconv.c_1 : 0.000038s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.19% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.16% optimize.tuple_transform.d_1 : 0.000051s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000060s : 0.53% optimize.cse_after_recomputation.cse : 0.000015s : 0.14% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.09% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000006s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000022s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.21% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000477s : 4.28% validate : 0.000040s : 0.36% Time group info: ------[substitution.] 0.000198 38 11.10% : 0.000022s : 3: substitution.cast_eliminate 1.07% : 0.000002s : 3: substitution.elim_not_effective 0.89% : 0.000002s : 3: substitution.fold_const_symbol 3.25% : 0.000006s : 5: substitution.graph_param_transform 69.88% : 0.000139s : 4: substitution.inline 2.19% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.02% : 0.000006s : 6: substitution.remove_not_recompute_node 2.23% : 0.000004s : 4: substitution.replace_old_param 6.36% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005433 2 87.06% : 0.004730s : 1: type_inference.infer 12.94% : 0.000703s : 1: type_inference.specialize ------[replace.] 0.000061 8 62.03% : 0.000038s : 4: replace.inline 37.97% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000147 8 92.70% : 0.000136s : 4: match.inline 7.30% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000248 1504 1.03% : 0.000003s : 15: predicate.accumulaten_eliminater 0.76% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.59% : 0.000001s : 10: predicate.addn_check_dump 0.93% : 0.000002s : 15: predicate.addn_zero_filter 0.89% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.13% : 0.000005s : 25: predicate.arithmetic_simplify 1.08% : 0.000003s : 15: predicate.cast_eliminate 0.68% : 0.000002s : 10: predicate.check_bprop_eliminate 0.62% : 0.000002s : 10: predicate.compare_switch_simplify 0.20% : 0.000001s : 5: predicate.const_output_eliminate 0.68% : 0.000002s : 10: predicate.depend_value_elim 0.95% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.11% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.39% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.07% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_depend_swap 1.76% : 0.000004s : 30: predicate.environ_get_eliminate 1.12% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.39% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.22% : 0.000006s : 23: predicate.float_depend_g_call 0.53% : 0.000001s : 10: predicate.float_environ_get_switch 1.04% : 0.000003s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.64% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.62% : 0.000002s : 10: predicate.incorporate_call 0.54% : 0.000001s : 10: predicate.incorporate_call_switch 6.34% : 0.000016s : 68: predicate.inline 0.84% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.76% : 0.000002s : 10: predicate.less_batch_normalization 1.83% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.58% : 0.000006s : 44: predicate.load_eliminater 0.88% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.12% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.60% : 0.000001s : 10: predicate.merge_addn 0.62% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.67% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 15: predicate.minmaximum_grad 0.82% : 0.000002s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.43% : 0.000001s : 5: predicate.parallel_virtual_node 1.68% : 0.000004s : 23: predicate.partial_defer_inline 1.68% : 0.000004s : 24: predicate.partial_eliminate 0.90% : 0.000002s : 15: predicate.print_const_string_wrapper 0.68% : 0.000002s : 10: predicate.reduce_all_const_elim 1.17% : 0.000003s : 15: predicate.reduce_eliminate 2.55% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 10: predicate.remove_not_recompute_node 1.36% : 0.000003s : 29: predicate.replace_applicator 0.50% : 0.000001s : 10: predicate.replace_old_param 0.24% : 0.000001s : 5: predicate.reset_defer_inline 0.98% : 0.000002s : 15: predicate.reshape_eliminate 0.70% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 5: predicate.row_tensor_eliminate 0.79% : 0.000002s : 10: predicate.same_eliminate 0.43% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.86% : 0.000002s : 10: predicate.shard_identity_eliminate 0.73% : 0.000002s : 10: predicate.special_op_eliminate 0.91% : 0.000002s : 10: predicate.specialize_transform 0.93% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.46% : 0.000004s : 23: predicate.switch_defer_inline 2.06% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.97% : 0.000012s : 74: predicate.switch_simplify 0.83% : 0.000002s : 15: predicate.tile_eliminate 0.96% : 0.000002s : 15: predicate.transpose_eliminate 1.49% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.07% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.51% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.38% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.67% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.50% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.22% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 5: predicate.value_based_eliminate 0.64% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.63% : 0.000002s : 10: predicate.virtual_output_eliminate 0.31% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000483 11 54.43% : 0.000263s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.57% : 0.000220s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026852 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.28% : 0.003030s : 1: add_attr 11.23% : 0.003016s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.24% : 0.000063s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000074s : 1: auto_monad 0.11% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.78% : 0.000478s : 1: bootstrap 0.11% : 0.000030s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000019s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000028s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.76% : 0.000471s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.85% : 0.000496s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 5.21% : 0.001399s : 78: opt.transform.opt_a 0.14% : 0.000036s : 1: opt.transform.opt_after_cconv 0.11% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.52% : 0.000140s : 28: opt.transform.opt_b 0.21% : 0.000057s : 2: opt.transform.opt_trans_graph 0.16% : 0.000044s : 4: opt.transform.symbol_engine_opt 12.03% : 0.003230s : 1: opt_a 0.53% : 0.000142s : 1: opt_after_cconv 1.81% : 0.000486s : 1: opt_after_jit_grad 1.52% : 0.000407s : 1: opt_b 22.53% : 0.006049s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.09% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000009s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000039s : 1: pre_auto_parallel 0.12% : 0.000032s : 1: py_interpret_to_execute 0.07% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000021s : 1: remove_dup_value 1.18% : 0.000316s : 1: renormalize.infer 1.03% : 0.000276s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000046s : 1: rewriter_after_opt_a 0.47% : 0.000127s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000006s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000106s : 1: symbol_engine_optimizer 0.38% : 0.000101s : 1: tuple_transform 20.53% : 0.005514s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:37.358.345 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0174512, [21] [bootstrap]: 0.00046925 [type_inference]: 0.00655294 [event_method]: 2.054e-05 [auto_monad]: 6.878e-05 [graph_reusing]: 5.65001e-06 [inline]: 2.74999e-06 [add_attr]: 0.00361713, [1] [add_attr_with_inline]: 0.00360383, [1] [Cycle 1]: 7.352e-05, [2] [tag_attr]: 2.401e-05 [meta_addattr_fg_expand]: 6.73e-06 [parallel-infer-symbol]: 3.97e-06 [pre_auto_parallel]: 3.852e-05 [insert-virtual-dataset]: 2.40002e-06 [parallel-infer-symbol-second]: 1.36002e-06 [dataset_repeat_opt]: 2.45002e-06 [pipeline_split]: 2.07001e-06 [optimize]: 0.00580341, [53] [py_interpret_to_execute]: 2.825e-05 [rewriter_before_opt_a]: 0.00010133 [opt_a]: 0.00318501, [2] [Cycle 1]: 0.00235456, [45] [expand_dump_flag]: 3.58e-06 [switch_simplify]: 4.662e-05 [loop_unroll]: 3.512e-05 [a_1]: 0.00073667 [with_stream_mark]: 1.827e-05 [recompute_prepare]: 1.041e-05 [updatestate_depend_eliminate]: 4.90001e-06 [updatestate_assign_eliminate]: 4.48001e-06 [updatestate_loads_eliminate]: 3.63e-06 [parameter_eliminate]: 2.37001e-06 [a_2]: 0.00010995 [accelerated_algorithm]: 8.54e-06 [shard]: 2.02001e-06 [meta_shard_fg_expand]: 2.22999e-06 [shard_inline]: 8.07e-06 [merge_send_recv]: 9.20999e-06 [auto_parallel]: 7.98001e-06 [parallel]: 2.083e-05 [flash_sp]: 9.13002e-06 [merge_comm]: 5.10999e-06 [allreduce_fusion]: 4.97999e-06 [matmul_add_comm_reduction]: 1.165e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 9.77999e-06 [virtual_dataset]: 8.74998e-06 [get_grad_eliminate_]: 8.12998e-06 [virtual_output]: 8.36002e-06 [merge_forward]: 4.4e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 1.141e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.587e-05 [merge_recompute_call_nodes]: 1.62001e-06 [before_grad]: 1.384e-05 [set_forward_comm_id_for_comm_node_pass]: 4.55999e-06 [meta_fg_expand]: 3.93001e-06 [flash_sp_send_recv_attached]: 2.56e-06 [receive_attached]: 2.65002e-06 [after_resolve]: 1.324e-05 [a_after_grad]: 1.208e-05 [renormalize]: 0.00077557 [add_forward_monad_depend]: 7.35998e-06 [auto_monad_grad]: 2.76e-06 [auto_monad_eliminator]: 1.833e-05 [cse]: 3.603e-05 [a_3]: 6.493e-05 [Cycle 2]: 0.00081759, [45] [expand_dump_flag]: 1.84998e-06 [switch_simplify]: 9.37001e-06 [loop_unroll]: 8.25e-06 [a_1]: 0.00018723 [with_stream_mark]: 1.529e-05 [recompute_prepare]: 7.88999e-06 [updatestate_depend_eliminate]: 3.99002e-06 [updatestate_assign_eliminate]: 3.26001e-06 [updatestate_loads_eliminate]: 2.79001e-06 [parameter_eliminate]: 1.64998e-06 [a_2]: 9.431e-05 [accelerated_algorithm]: 8.52e-06 [shard]: 1.55999e-06 [meta_shard_fg_expand]: 1.99999e-06 [shard_inline]: 7.46999e-06 [merge_send_recv]: 7.61999e-06 [auto_parallel]: 8.19002e-06 [parallel]: 6.01e-06 [flash_sp]: 3.95e-06 [merge_comm]: 4.43999e-06 [allreduce_fusion]: 4.19002e-06 [matmul_add_comm_reduction]: 7.5e-06 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 8.80001e-06 [virtual_dataset]: 7.3e-06 [get_grad_eliminate_]: 7.33e-06 [virtual_output]: 7.83999e-06 [merge_forward]: 4.23001e-06 [cell_reuse_recompute_pass]: 2.17001e-06 [offload_activation]: 9.35001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.674e-05 [merge_recompute_call_nodes]: 1.08001e-06 [before_grad]: 1.251e-05 [set_forward_comm_id_for_comm_node_pass]: 4.18001e-06 [meta_fg_expand]: 3.16001e-06 [flash_sp_send_recv_attached]: 1.52001e-06 [receive_attached]: 1.50001e-06 [after_resolve]: 1.282e-05 [a_after_grad]: 1.351e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.66e-06 [auto_monad_grad]: 1.32e-06 [auto_monad_eliminator]: 1.008e-05 [cse]: 2.002e-05 [a_3]: 4.818e-05 [py_interpret_to_execute_after_opt_a]: 1.346e-05 [slice_cell_reuse_recomputed_activation]: 1.96e-06 [rewriter_after_opt_a]: 4.204e-05 [convert_after_rewriter]: 7.7e-06 [order_py_execute_after_rewriter]: 6.14999e-06 [mutable_eliminate]: 0.00077239 [opt_b]: 0.00027239, [1] [Cycle 1]: 0.00026346, [7] [b_1]: 0.0001705 [b_2]: 1.016e-05 [updatestate_depend_eliminate]: 6.89999e-06 [updatestate_assign_eliminate]: 3.12997e-06 [updatestate_loads_eliminate]: 3.04999e-06 [renormalize]: 1.11002e-06 [cse]: 2.91e-05 [optimize_parallel_all_gather_comm]: 2.262e-05 [overlap_param_gather]: 2.47001e-06 [cconv]: 2.81e-05 [loop_unroll]: 0.00055542 [opt_after_cconv]: 0.00012976, [1] [Cycle 1]: 0.00012378, [7] [c_1]: 3.914e-05 [parameter_eliminate]: 4.99e-06 [updatestate_depend_eliminate]: 8.06001e-06 [updatestate_assign_eliminate]: 3.44001e-06 [updatestate_loads_eliminate]: 3.17002e-06 [cse]: 2.779e-05 [renormalize]: 7.99977e-07 [remove_dup_value]: 1.612e-05 [tuple_transform]: 9.013e-05, [1] [Cycle 1]: 8.537e-05, [4] [d_1]: 5.504e-05 [none_parameter_eliminate]: 1.82001e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 8.24002e-06 [partial_unused_args_eliminate]: 2.10002e-06 [add_recomputation]: 5.936e-05 [cse_after_recomputation]: 2.751e-05, [1] [Cycle 1]: 2.274e-05, [1] [cse]: 1.7e-05 [environ_conv]: 7.08e-06 [swap_dp_allreduce_reducescatter]: 6.53003e-06 [bias_add_comm_swap]: 2.57001e-06 [label_micro_interleaved_index]: 4.90001e-06 [label_fine_grained_interleaved_index]: 2.54999e-06 [merge_cast_opt]: 1.54e-06 [slice_recompute_activation]: 2.64001e-06 [micro_interleaved_order_control]: 2.39001e-06 [assign_add_opt]: 1.44e-06 [ForceFp32Comm]: 9.39996e-07 [remove_cast_before_assign_add]: 1.13001e-06 [full_micro_interleaved_order_control]: 2.19999e-06 [reorder_send_recv_between_fp_bp]: 2.94001e-06 [comm_op_add_attrs]: 1.15999e-06 [add_comm_op_reuse_tag]: 9.60019e-07 [interleave_split_concat_branches]: 1.44e-06 [interleave_parallel_branches]: 1.13001e-06 [overlap_opt_shard_in_pipeline]: 1.34998e-06 [overlap_opt_shard_grad_in_pipeline]: 1.86998e-06 [control_data_broadcast_order]: 1.587e-05 [grouped_pairwise_exchange_alltoall]: 1.66e-06 [offloading_packed_experts]: 5.05001e-06 [overlap_recompute_and_grad_model_parallel]: 5.22999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.24e-06 [overlap_recompute_allgather_and_fa_grad]: 1.54e-06 [overlap_recompute_comm]: 2.64001e-06 [overlap_grad_ring_attention]: 4.75999e-06 [overlap_grad_flash_sp]: 2.245e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.09e-06 [split_layernorm_comm]: 1.84998e-06 [handle_group_info]: 1.01002e-06 [symbol_engine_optimizer]: 9.097e-05, [1] [Cycle 1]: 8.616e-05, [6] [build]: 3.18e-06 [elim_shapecalc]: 1.391e-05 [elim_not_effective]: 1.732e-05 [opt_reshape]: 8.89e-06 [fold_const_symbol]: 1.28e-05 [renormalize]: 1.8999e-07 [detach_backward]: 2.14e-06 [pipeline_parallel_scheduler]: 1.50999e-06 [auto_monad_reorder]: 2.131e-05 [get_jit_bprop_graph]: 1.80001e-06 [rewriter_after_jit_bprop_graph]: 4.01001e-06 [opt_after_jit_grad]: 0.000597 [validate]: 4.484e-05 Sums bootstrap : 0.000469s : 3.67% type_inference : 0.006553s : 51.26% event_method : 0.000021s : 0.16% auto_monad : 0.000069s : 0.54% graph_reusing : 0.000006s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000039s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000028s : 0.22% optimize.rewriter_before_opt_a : 0.000101s : 0.79% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000056s : 0.44% optimize.opt_a.loop_unroll : 0.000043s : 0.34% optimize.opt_a.a_1 : 0.000924s : 7.23% optimize.opt_a.with_stream_mark : 0.000034s : 0.26% optimize.opt_a.recompute_prepare : 0.000018s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000204s : 1.60% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.12% optimize.opt_a.merge_send_recv : 0.000017s : 0.13% optimize.opt_a.auto_parallel : 0.000016s : 0.13% optimize.opt_a.parallel : 0.000027s : 0.21% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.07% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.15% optimize.opt_a.virtual_dataset : 0.000016s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000016s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.20% optimize.opt_a.a_after_grad : 0.000026s : 0.20% optimize.opt_a.renormalize : 0.000776s : 6.07% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.22% optimize.opt_a.cse : 0.000056s : 0.44% optimize.opt_a.a_3 : 0.000113s : 0.88% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000042s : 0.33% optimize.convert_after_rewriter : 0.000008s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000772s : 6.04% optimize.opt_b.b_1 : 0.000171s : 1.33% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000029s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000028s : 0.22% optimize.loop_unroll : 0.000555s : 4.34% optimize.opt_after_cconv.c_1 : 0.000039s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000028s : 0.22% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000016s : 0.13% optimize.tuple_transform.d_1 : 0.000055s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000059s : 0.46% optimize.cse_after_recomputation.cse : 0.000017s : 0.13% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000003s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000022s : 0.18% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.17% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000597s : 4.67% validate : 0.000045s : 0.35% Time group info: ------[substitution.] 0.000238 38 12.93% : 0.000031s : 3: substitution.cast_eliminate 1.27% : 0.000003s : 3: substitution.elim_not_effective 0.71% : 0.000002s : 3: substitution.fold_const_symbol 2.90% : 0.000007s : 5: substitution.graph_param_transform 69.43% : 0.000165s : 4: substitution.inline 2.07% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.92% : 0.000007s : 6: substitution.remove_not_recompute_node 2.19% : 0.000005s : 4: substitution.replace_old_param 5.58% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006488 2 86.86% : 0.005635s : 1: type_inference.infer 13.14% : 0.000852s : 1: type_inference.specialize ------[replace.] 0.000066 8 61.21% : 0.000040s : 4: replace.inline 38.79% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000174 8 93.51% : 0.000162s : 4: match.inline 6.49% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000266 1504 0.96% : 0.000003s : 15: predicate.accumulaten_eliminater 0.95% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.62% : 0.000002s : 10: predicate.addn_check_dump 1.06% : 0.000003s : 15: predicate.addn_zero_filter 0.79% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.12% : 0.000006s : 25: predicate.arithmetic_simplify 1.22% : 0.000003s : 15: predicate.cast_eliminate 0.71% : 0.000002s : 10: predicate.check_bprop_eliminate 0.56% : 0.000002s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.71% : 0.000002s : 10: predicate.depend_value_elim 0.90% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.06% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.93% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 5: predicate.elim_not_effective 0.47% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.31% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.36% : 0.000004s : 20: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 20: predicate.environ_get_depend_swap 1.77% : 0.000005s : 30: predicate.environ_get_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.39% : 0.000004s : 23: predicate.exchange_switch_depend_value 2.23% : 0.000006s : 23: predicate.float_depend_g_call 0.60% : 0.000002s : 10: predicate.float_environ_get_switch 0.90% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.77% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.58% : 0.000002s : 10: predicate.incorporate_call 0.51% : 0.000001s : 10: predicate.incorporate_call_switch 5.95% : 0.000016s : 68: predicate.inline 0.75% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 10: predicate.less_batch_normalization 1.83% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.67% : 0.000007s : 44: predicate.load_eliminater 0.94% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.21% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.62% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.63% : 0.000002s : 10: predicate.merge_addn 0.54% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 15: predicate.minmaximum_grad 1.26% : 0.000003s : 5: predicate.mutable_eliminate 0.41% : 0.000001s : 5: predicate.opt_reshape 0.51% : 0.000001s : 5: predicate.parallel_virtual_node 1.65% : 0.000004s : 23: predicate.partial_defer_inline 1.56% : 0.000004s : 24: predicate.partial_eliminate 0.96% : 0.000003s : 15: predicate.print_const_string_wrapper 0.68% : 0.000002s : 10: predicate.reduce_all_const_elim 1.16% : 0.000003s : 15: predicate.reduce_eliminate 2.64% : 0.000007s : 44: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 10: predicate.remove_not_recompute_node 1.34% : 0.000004s : 29: predicate.replace_applicator 0.49% : 0.000001s : 10: predicate.replace_old_param 0.26% : 0.000001s : 5: predicate.reset_defer_inline 1.04% : 0.000003s : 15: predicate.reshape_eliminate 0.61% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 5: predicate.row_tensor_eliminate 0.74% : 0.000002s : 10: predicate.same_eliminate 0.38% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 10: predicate.shard_identity_eliminate 0.71% : 0.000002s : 10: predicate.special_op_eliminate 0.71% : 0.000002s : 10: predicate.specialize_transform 0.98% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.83% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.37% : 0.000004s : 23: predicate.switch_defer_inline 1.90% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.73% : 0.000013s : 74: predicate.switch_simplify 1.03% : 0.000003s : 15: predicate.tile_eliminate 0.86% : 0.000002s : 15: predicate.transpose_eliminate 1.47% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.32% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.39% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.29% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.81% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.24% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.03% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.50% : 0.000001s : 5: predicate.value_based_eliminate 0.67% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.63% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000578 11 54.68% : 0.000316s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.32% : 0.000262s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.029335 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.35% : 0.003624s : 1: add_attr 12.30% : 0.003608s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000064s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.26% : 0.000076s : 1: auto_monad 0.09% : 0.000026s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.71% : 0.000500s : 1: bootstrap 0.11% : 0.000032s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.10% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000006s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.09% : 0.000026s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000005s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.92% : 0.000564s : 1: loop_unroll 0.02% : 0.000005s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.67% : 0.000784s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.06% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000023s : 1: opt.transform.mutable_eliminate 5.01% : 0.001468s : 78: opt.transform.opt_a 0.13% : 0.000038s : 1: opt.transform.opt_after_cconv 0.11% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.49% : 0.000144s : 28: opt.transform.opt_b 0.21% : 0.000061s : 2: opt.transform.opt_trans_graph 0.16% : 0.000048s : 4: opt.transform.symbol_engine_opt 10.87% : 0.003189s : 1: opt_a 0.46% : 0.000134s : 1: opt_after_cconv 2.07% : 0.000608s : 1: opt_after_jit_grad 0.94% : 0.000276s : 1: opt_b 19.80% : 0.005809s : 1: optimize 0.09% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.09% : 0.000027s : 1: parallel-infer-symbol 0.02% : 0.000005s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000043s : 1: pre_auto_parallel 0.11% : 0.000033s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000005s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.42% : 0.000416s : 1: renormalize.infer 1.20% : 0.000351s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000047s : 1: rewriter_after_opt_a 0.36% : 0.000106s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000094s : 1: symbol_engine_optimizer 0.32% : 0.000093s : 1: tuple_transform 22.41% : 0.006574s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:37.661.374 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:37.661.724 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0168457, [21] [bootstrap]: 0.00045071 [type_inference]: 0.00611566 [event_method]: 2.153e-05 [auto_monad]: 6.514e-05 [graph_reusing]: 6.82002e-06 [inline]: 1.88002e-06 [add_attr]: 0.0032941, [1] [add_attr_with_inline]: 0.0032851, [1] [Cycle 1]: 7.081e-05, [2] [tag_attr]: 2.004e-05 [meta_addattr_fg_expand]: 6.02999e-06 [parallel-infer-symbol]: 3.35e-06 [pre_auto_parallel]: 3.698e-05 [insert-virtual-dataset]: 2.99999e-06 [parallel-infer-symbol-second]: 9.29984e-07 [dataset_repeat_opt]: 2.01e-06 [pipeline_split]: 1.99e-06 [optimize]: 0.00564249, [53] [py_interpret_to_execute]: 2.882e-05 [rewriter_before_opt_a]: 8.648e-05 [opt_a]: 0.00309146, [2] [Cycle 1]: 0.00222166, [45] [expand_dump_flag]: 3.4e-06 [switch_simplify]: 4.094e-05 [loop_unroll]: 3.055e-05 [a_1]: 0.00063335 [with_stream_mark]: 1.606e-05 [recompute_prepare]: 1.139e-05 [updatestate_depend_eliminate]: 4.48001e-06 [updatestate_assign_eliminate]: 3.31001e-06 [updatestate_loads_eliminate]: 3.08e-06 [parameter_eliminate]: 1.91e-06 [a_2]: 0.00011168 [accelerated_algorithm]: 7.53999e-06 [shard]: 2.07999e-06 [meta_shard_fg_expand]: 1.93002e-06 [shard_inline]: 6.94999e-06 [merge_send_recv]: 9.20999e-06 [auto_parallel]: 7.55998e-06 [parallel]: 1.718e-05 [flash_sp]: 9.14e-06 [merge_comm]: 4.11001e-06 [allreduce_fusion]: 3.63e-06 [matmul_add_comm_reduction]: 9.73998e-06 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 8.87e-06 [virtual_dataset]: 7.08998e-06 [get_grad_eliminate_]: 6.98998e-06 [virtual_output]: 6.94001e-06 [merge_forward]: 4.55001e-06 [cell_reuse_recompute_pass]: 1.32999e-06 [offload_activation]: 1.051e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.554e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.075e-05 [set_forward_comm_id_for_comm_node_pass]: 4.05e-06 [meta_fg_expand]: 2.99001e-06 [flash_sp_send_recv_attached]: 2.95998e-06 [receive_attached]: 2.91999e-06 [after_resolve]: 1.341e-05 [a_after_grad]: 1.067e-05 [renormalize]: 0.00066609 [add_forward_monad_depend]: 4.90999e-06 [auto_monad_grad]: 2.76e-06 [auto_monad_eliminator]: 1.703e-05 [cse]: 2.45e-05 [a_3]: 6.612e-05 [Cycle 2]: 0.00085528, [45] [expand_dump_flag]: 2.01998e-06 [switch_simplify]: 8.70999e-06 [loop_unroll]: 6.34999e-06 [a_1]: 0.00014153 [with_stream_mark]: 1.245e-05 [recompute_prepare]: 6.82002e-06 [updatestate_depend_eliminate]: 3.03e-06 [updatestate_assign_eliminate]: 2.56998e-06 [updatestate_loads_eliminate]: 2.43002e-06 [parameter_eliminate]: 1.25999e-06 [a_2]: 9.826e-05 [accelerated_algorithm]: 6.54999e-06 [shard]: 1.15001e-06 [meta_shard_fg_expand]: 1.44e-06 [shard_inline]: 6.07001e-06 [merge_send_recv]: 9.14e-06 [auto_parallel]: 6.96001e-06 [parallel]: 5.37001e-06 [flash_sp]: 3.93999e-06 [merge_comm]: 3.93999e-06 [allreduce_fusion]: 3.25998e-06 [matmul_add_comm_reduction]: 5.82001e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 7.36001e-06 [virtual_dataset]: 6.81001e-06 [get_grad_eliminate_]: 6.56e-06 [virtual_output]: 6.01998e-06 [merge_forward]: 3.54002e-06 [cell_reuse_recompute_pass]: 1.44998e-06 [offload_activation]: 8.40001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.598e-05 [merge_recompute_call_nodes]: 9.39996e-07 [before_grad]: 1.044e-05 [set_forward_comm_id_for_comm_node_pass]: 4.14997e-06 [meta_fg_expand]: 2.53e-06 [flash_sp_send_recv_attached]: 1.17e-06 [receive_attached]: 1.53002e-06 [after_resolve]: 1.089e-05 [a_after_grad]: 1.012e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.37999e-06 [auto_monad_grad]: 1.30001e-06 [auto_monad_eliminator]: 8.02e-06 [cse]: 1.46e-05 [a_3]: 5.017e-05 [py_interpret_to_execute_after_opt_a]: 1.399e-05 [slice_cell_reuse_recomputed_activation]: 4.68999e-06 [rewriter_after_opt_a]: 3.799e-05 [convert_after_rewriter]: 1.034e-05 [order_py_execute_after_rewriter]: 8.08001e-06 [mutable_eliminate]: 0.00072136 [opt_b]: 0.00027368, [1] [Cycle 1]: 0.00026374, [7] [b_1]: 0.00016863 [b_2]: 8.35001e-06 [updatestate_depend_eliminate]: 6.79999e-06 [updatestate_assign_eliminate]: 2.39999e-06 [updatestate_loads_eliminate]: 2.43e-06 [renormalize]: 6.39993e-07 [cse]: 1.898e-05 [optimize_parallel_all_gather_comm]: 2.205e-05 [overlap_param_gather]: 5.05999e-06 [cconv]: 2.715e-05 [loop_unroll]: 0.00048568 [opt_after_cconv]: 0.00013121, [1] [Cycle 1]: 0.00012144, [7] [c_1]: 3.18e-05 [parameter_eliminate]: 4.04002e-06 [updatestate_depend_eliminate]: 6.05002e-06 [updatestate_assign_eliminate]: 2.57001e-06 [updatestate_loads_eliminate]: 2.48e-06 [cse]: 1.808e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 1.682e-05 [tuple_transform]: 9.537e-05, [1] [Cycle 1]: 8.779e-05, [4] [d_1]: 4.735e-05 [none_parameter_eliminate]: 1.76e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 7.16001e-06 [partial_unused_args_eliminate]: 4.50001e-06 [add_recomputation]: 5.224e-05 [cse_after_recomputation]: 2.84e-05, [1] [Cycle 1]: 2.089e-05, [1] [cse]: 1.099e-05 [environ_conv]: 8.77999e-06 [swap_dp_allreduce_reducescatter]: 7.61001e-06 [bias_add_comm_swap]: 5.07999e-06 [label_micro_interleaved_index]: 7.2e-06 [label_fine_grained_interleaved_index]: 5.29e-06 [merge_cast_opt]: 3.81999e-06 [slice_recompute_activation]: 4.46002e-06 [micro_interleaved_order_control]: 4.97999e-06 [assign_add_opt]: 3.9e-06 [ForceFp32Comm]: 3.38e-06 [remove_cast_before_assign_add]: 3.36001e-06 [full_micro_interleaved_order_control]: 4.56002e-06 [reorder_send_recv_between_fp_bp]: 5.14e-06 [comm_op_add_attrs]: 3.26001e-06 [add_comm_op_reuse_tag]: 3.26999e-06 [interleave_split_concat_branches]: 3.54002e-06 [interleave_parallel_branches]: 3.58e-06 [overlap_opt_shard_in_pipeline]: 3.73001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.45e-06 [control_data_broadcast_order]: 1.521e-05 [grouped_pairwise_exchange_alltoall]: 4.23999e-06 [offloading_packed_experts]: 6.85998e-06 [overlap_recompute_and_grad_model_parallel]: 7.46999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.63999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.75e-06 [overlap_recompute_comm]: 4.80001e-06 [overlap_grad_ring_attention]: 6.93e-06 [overlap_grad_flash_sp]: 2.3e-05 [begin_end_overlap_inline]: 3.08e-06 [split_matmul_comm_elemetwise]: 5.19998e-06 [split_layernorm_comm]: 4.33999e-06 [handle_group_info]: 3.61001e-06 [symbol_engine_optimizer]: 0.0001008, [1] [Cycle 1]: 9.391e-05, [6] [build]: 3.46999e-06 [elim_shapecalc]: 1.093e-05 [elim_not_effective]: 1.359e-05 [opt_reshape]: 7.22997e-06 [fold_const_symbol]: 1.131e-05 [renormalize]: 3.30008e-07 [detach_backward]: 4.08999e-06 [pipeline_parallel_scheduler]: 1.74e-06 [auto_monad_reorder]: 2.047e-05 [get_jit_bprop_graph]: 1.35001e-06 [rewriter_after_jit_bprop_graph]: 4.53001e-06 [opt_after_jit_grad]: 0.00053774 [validate]: 3.923e-05 Sums bootstrap : 0.000451s : 3.82% type_inference : 0.006116s : 51.89% event_method : 0.000022s : 0.18% auto_monad : 0.000065s : 0.55% graph_reusing : 0.000007s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000037s : 0.31% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000029s : 0.24% optimize.rewriter_before_opt_a : 0.000086s : 0.73% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000050s : 0.42% optimize.opt_a.loop_unroll : 0.000037s : 0.31% optimize.opt_a.a_1 : 0.000775s : 6.57% optimize.opt_a.with_stream_mark : 0.000029s : 0.24% optimize.opt_a.recompute_prepare : 0.000018s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000210s : 1.78% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.12% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.11% optimize.opt_a.merge_send_recv : 0.000018s : 0.16% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000023s : 0.19% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.14% optimize.opt_a.virtual_dataset : 0.000014s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.11% optimize.opt_a.virtual_output : 0.000013s : 0.11% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.07% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000024s : 0.21% optimize.opt_a.a_after_grad : 0.000021s : 0.18% optimize.opt_a.renormalize : 0.000666s : 5.65% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.21% optimize.opt_a.cse : 0.000039s : 0.33% optimize.opt_a.a_3 : 0.000116s : 0.99% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000038s : 0.32% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000721s : 6.12% optimize.opt_b.b_1 : 0.000169s : 1.43% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000019s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000027s : 0.23% optimize.loop_unroll : 0.000486s : 4.12% optimize.opt_after_cconv.c_1 : 0.000032s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.15% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.14% optimize.tuple_transform.d_1 : 0.000047s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000052s : 0.44% optimize.cse_after_recomputation.cse : 0.000011s : 0.09% optimize.environ_conv : 0.000009s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.06% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000015s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000023s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.17% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000538s : 4.56% validate : 0.000039s : 0.33% Time group info: ------[substitution.] 0.000184 28 0.97% : 0.000002s : 2: substitution.elim_not_effective 0.93% : 0.000002s : 2: substitution.fold_const_symbol 3.25% : 0.000006s : 4: substitution.graph_param_transform 79.25% : 0.000146s : 4: substitution.inline 1.97% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.73% : 0.000005s : 4: substitution.remove_not_recompute_node 2.76% : 0.000005s : 4: substitution.replace_old_param 8.15% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006060 2 87.87% : 0.005325s : 1: type_inference.infer 12.13% : 0.000735s : 1: type_inference.specialize ------[replace.] 0.000061 8 61.35% : 0.000037s : 4: replace.inline 38.65% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000156 8 91.68% : 0.000143s : 4: match.inline 8.32% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000213 1278 0.95% : 0.000002s : 13: predicate.accumulaten_eliminater 0.79% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 8: predicate.addn_check_dump 1.02% : 0.000002s : 13: predicate.addn_zero_filter 0.80% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.24% : 0.000005s : 21: predicate.arithmetic_simplify 0.93% : 0.000002s : 13: predicate.cast_eliminate 0.77% : 0.000002s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.69% : 0.000001s : 8: predicate.depend_value_elim 0.92% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.05% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_depend_swap 1.64% : 0.000004s : 25: predicate.environ_get_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.49% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.43% : 0.000005s : 21: predicate.float_depend_g_call 0.56% : 0.000001s : 8: predicate.float_environ_get_switch 0.91% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.74% : 0.000002s : 8: predicate.get_grad_eliminate 0.28% : 0.000001s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.47% : 0.000001s : 8: predicate.incorporate_call_switch 5.95% : 0.000013s : 58: predicate.inline 0.74% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 8: predicate.less_batch_normalization 1.87% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.48% : 0.000005s : 38: predicate.load_eliminater 0.90% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.41% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.75% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 8: predicate.merge_addn 0.67% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.64% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 13: predicate.minmaximum_grad 0.88% : 0.000002s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.32% : 0.000001s : 4: predicate.parallel_virtual_node 1.87% : 0.000004s : 21: predicate.partial_defer_inline 1.63% : 0.000003s : 21: predicate.partial_eliminate 0.90% : 0.000002s : 13: predicate.print_const_string_wrapper 0.91% : 0.000002s : 8: predicate.reduce_all_const_elim 1.37% : 0.000003s : 13: predicate.reduce_eliminate 2.47% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.59% : 0.000001s : 8: predicate.remove_not_recompute_node 1.42% : 0.000003s : 25: predicate.replace_applicator 0.53% : 0.000001s : 8: predicate.replace_old_param 0.27% : 0.000001s : 4: predicate.reset_defer_inline 0.88% : 0.000002s : 13: predicate.reshape_eliminate 0.69% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 4: predicate.row_tensor_eliminate 0.79% : 0.000002s : 8: predicate.same_eliminate 0.43% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.70% : 0.000001s : 8: predicate.shard_identity_eliminate 0.67% : 0.000001s : 8: predicate.special_op_eliminate 0.64% : 0.000001s : 8: predicate.specialize_transform 0.92% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.51% : 0.000003s : 21: predicate.switch_defer_inline 2.11% : 0.000005s : 29: predicate.switch_layer_defer_inline 5.22% : 0.000011s : 67: predicate.switch_simplify 1.04% : 0.000002s : 13: predicate.tile_eliminate 0.90% : 0.000002s : 13: predicate.transpose_eliminate 1.49% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.44% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.76% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.39% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.06% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 4: predicate.value_based_eliminate 0.63% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.67% : 0.000001s : 8: predicate.virtual_output_eliminate 0.30% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000561 11 55.51% : 0.000312s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.49% : 0.000250s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027805 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.88% : 0.003304s : 1: add_attr 11.83% : 0.003289s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.20% : 0.000056s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000074s : 1: auto_monad 0.10% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.77% : 0.000493s : 1: bootstrap 0.11% : 0.000030s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.06% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.11% : 0.000031s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000020s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.12% : 0.000032s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000010s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.77% : 0.000492s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.62% : 0.000729s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.05% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 4.36% : 0.001213s : 78: opt.transform.opt_a 0.11% : 0.000030s : 1: opt.transform.opt_after_cconv 0.10% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.37% : 0.000104s : 28: opt.transform.opt_b 0.19% : 0.000052s : 2: opt.transform.opt_trans_graph 0.14% : 0.000038s : 4: opt.transform.symbol_engine_opt 11.13% : 0.003095s : 1: opt_a 0.48% : 0.000135s : 1: opt_after_cconv 1.97% : 0.000548s : 1: opt_after_jit_grad 1.00% : 0.000277s : 1: opt_b 21.50% : 0.005978s : 1: optimize 0.09% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000008s : 1: pipeline_split 0.16% : 0.000045s : 1: pre_auto_parallel 0.12% : 0.000032s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.25% : 0.000347s : 1: renormalize.infer 1.11% : 0.000310s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000042s : 1: rewriter_after_opt_a 0.32% : 0.000090s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.37% : 0.000104s : 1: symbol_engine_optimizer 0.35% : 0.000098s : 1: tuple_transform 22.15% : 0.006159s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:37.905.990 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.015641, [21] [bootstrap]: 0.00039671 [type_inference]: 0.00606989 [event_method]: 1.964e-05 [auto_monad]: 6.328e-05 [graph_reusing]: 5.57001e-06 [inline]: 2.59999e-06 [add_attr]: 0.00332941, [1] [add_attr_with_inline]: 0.0033193, [1] [Cycle 1]: 5.637e-05, [2] [tag_attr]: 2.12e-05 [meta_addattr_fg_expand]: 5.47001e-06 [parallel-infer-symbol]: 3.51999e-06 [pre_auto_parallel]: 3.218e-05 [insert-virtual-dataset]: 2.56e-06 [parallel-infer-symbol-second]: 9.09989e-07 [dataset_repeat_opt]: 2.22999e-06 [pipeline_split]: 1.74e-06 [optimize]: 0.00497956, [53] [py_interpret_to_execute]: 2.574e-05 [rewriter_before_opt_a]: 8.302e-05 [opt_a]: 0.00278724, [2] [Cycle 1]: 0.00208045, [45] [expand_dump_flag]: 2.98e-06 [switch_simplify]: 3.931e-05 [loop_unroll]: 3.097e-05 [a_1]: 0.00062589 [with_stream_mark]: 1.802e-05 [recompute_prepare]: 1.025e-05 [updatestate_depend_eliminate]: 3.51999e-06 [updatestate_assign_eliminate]: 3.02002e-06 [updatestate_loads_eliminate]: 3.31001e-06 [parameter_eliminate]: 1.81998e-06 [a_2]: 8.089e-05 [accelerated_algorithm]: 7.68999e-06 [shard]: 1.55999e-06 [meta_shard_fg_expand]: 1.91998e-06 [shard_inline]: 6.67002e-06 [merge_send_recv]: 9.44998e-06 [auto_parallel]: 7.56001e-06 [parallel]: 1.998e-05 [flash_sp]: 9.66998e-06 [merge_comm]: 3.94002e-06 [allreduce_fusion]: 3.47002e-06 [matmul_add_comm_reduction]: 8.10999e-06 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 9.55001e-06 [virtual_dataset]: 6.59999e-06 [get_grad_eliminate_]: 6.31e-06 [virtual_output]: 6.77002e-06 [merge_forward]: 3.76999e-06 [cell_reuse_recompute_pass]: 1.27e-06 [offload_activation]: 9.82001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.449e-05 [merge_recompute_call_nodes]: 1.40001e-06 [before_grad]: 9.88998e-06 [set_forward_comm_id_for_comm_node_pass]: 4.05e-06 [meta_fg_expand]: 2.69999e-06 [flash_sp_send_recv_attached]: 2.59001e-06 [receive_attached]: 2.44999e-06 [after_resolve]: 1.177e-05 [a_after_grad]: 9.91998e-06 [renormalize]: 0.00071952 [add_forward_monad_depend]: 6.12999e-06 [auto_monad_grad]: 2.93e-06 [auto_monad_eliminator]: 1.498e-05 [cse]: 2.384e-05 [a_3]: 5.614e-05 [Cycle 2]: 0.00069599, [45] [expand_dump_flag]: 1.67001e-06 [switch_simplify]: 8.07e-06 [loop_unroll]: 6.69999e-06 [a_1]: 0.0001408 [with_stream_mark]: 1.166e-05 [recompute_prepare]: 6.45997e-06 [updatestate_depend_eliminate]: 3.30003e-06 [updatestate_assign_eliminate]: 2.61e-06 [updatestate_loads_eliminate]: 2.95998e-06 [parameter_eliminate]: 1.44998e-06 [a_2]: 7.561e-05 [accelerated_algorithm]: 7.40998e-06 [shard]: 1.04998e-06 [meta_shard_fg_expand]: 1.89999e-06 [shard_inline]: 6.57002e-06 [merge_send_recv]: 6.16998e-06 [auto_parallel]: 6.88998e-06 [parallel]: 5.96998e-06 [flash_sp]: 3.41999e-06 [merge_comm]: 3.28998e-06 [allreduce_fusion]: 3.45998e-06 [matmul_add_comm_reduction]: 6.24999e-06 [allreduce_slice_to_reducescatter]: 3.00002e-07 [virtual_shard_identity]: 6.68e-06 [virtual_dataset]: 6.06998e-06 [get_grad_eliminate_]: 6.34999e-06 [virtual_output]: 6.18998e-06 [merge_forward]: 3.30998e-06 [cell_reuse_recompute_pass]: 1.86e-06 [offload_activation]: 8.38001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.439e-05 [merge_recompute_call_nodes]: 9.99979e-07 [before_grad]: 1.047e-05 [set_forward_comm_id_for_comm_node_pass]: 3.96001e-06 [meta_fg_expand]: 2.41998e-06 [flash_sp_send_recv_attached]: 1.12e-06 [receive_attached]: 1.48002e-06 [after_resolve]: 1.229e-05 [a_after_grad]: 1.03e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.49998e-06 [auto_monad_grad]: 1.49e-06 [auto_monad_eliminator]: 7.25e-06 [cse]: 1.389e-05 [a_3]: 3.829e-05 [py_interpret_to_execute_after_opt_a]: 1.121e-05 [slice_cell_reuse_recomputed_activation]: 2.69001e-06 [rewriter_after_opt_a]: 3.4e-05 [convert_after_rewriter]: 7.12002e-06 [order_py_execute_after_rewriter]: 5.54e-06 [mutable_eliminate]: 0.00060734 [opt_b]: 0.00022171, [1] [Cycle 1]: 0.00021346, [7] [b_1]: 0.00013606 [b_2]: 8.63001e-06 [updatestate_depend_eliminate]: 5.87999e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.44999e-06 [renormalize]: 6.19999e-07 [cse]: 2.066e-05 [optimize_parallel_all_gather_comm]: 1.761e-05 [overlap_param_gather]: 2.39999e-06 [cconv]: 2.508e-05 [loop_unroll]: 0.00046938 [opt_after_cconv]: 0.00010705, [1] [Cycle 1]: 0.00010034, [7] [c_1]: 3.237e-05 [parameter_eliminate]: 4.02002e-06 [updatestate_depend_eliminate]: 5.71e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.32999e-06 [cse]: 1.731e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 1.368e-05 [tuple_transform]: 7.851e-05, [1] [Cycle 1]: 7.379e-05, [4] [d_1]: 4.473e-05 [none_parameter_eliminate]: 1.38002e-06 [renormalize]: 1.79978e-07 [switch_simplify]: 7.51999e-06 [partial_unused_args_eliminate]: 2.49001e-06 [add_recomputation]: 4.63e-05 [cse_after_recomputation]: 2.237e-05, [1] [Cycle 1]: 1.667e-05, [1] [cse]: 1.098e-05 [environ_conv]: 5.37001e-06 [swap_dp_allreduce_reducescatter]: 5.10999e-06 [bias_add_comm_swap]: 2.79999e-06 [label_micro_interleaved_index]: 3.79002e-06 [label_fine_grained_interleaved_index]: 2.66999e-06 [merge_cast_opt]: 1.29e-06 [slice_recompute_activation]: 2.19001e-06 [micro_interleaved_order_control]: 2.11e-06 [assign_add_opt]: 1.25001e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 7.79983e-07 [full_micro_interleaved_order_control]: 2.10002e-06 [reorder_send_recv_between_fp_bp]: 2.79001e-06 [comm_op_add_attrs]: 1.37e-06 [add_comm_op_reuse_tag]: 1.08001e-06 [interleave_split_concat_branches]: 1.31998e-06 [interleave_parallel_branches]: 1.08001e-06 [overlap_opt_shard_in_pipeline]: 1.23002e-06 [overlap_opt_shard_grad_in_pipeline]: 1.83997e-06 [control_data_broadcast_order]: 1.224e-05 [grouped_pairwise_exchange_alltoall]: 1.58002e-06 [offloading_packed_experts]: 4.03999e-06 [overlap_recompute_and_grad_model_parallel]: 5.09e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40999e-06 [overlap_recompute_comm]: 2.32001e-06 [overlap_grad_ring_attention]: 4.35999e-06 [overlap_grad_flash_sp]: 1.971e-05 [begin_end_overlap_inline]: 5.10016e-07 [split_matmul_comm_elemetwise]: 2.26e-06 [split_layernorm_comm]: 1.91e-06 [handle_group_info]: 1.05999e-06 [symbol_engine_optimizer]: 8.152e-05, [1] [Cycle 1]: 7.711e-05, [6] [build]: 3.09999e-06 [elim_shapecalc]: 1.183e-05 [elim_not_effective]: 1.363e-05 [opt_reshape]: 7.29001e-06 [fold_const_symbol]: 1.079e-05 [renormalize]: 4.39992e-07 [detach_backward]: 2.32001e-06 [pipeline_parallel_scheduler]: 1.60999e-06 [auto_monad_reorder]: 1.543e-05 [get_jit_bprop_graph]: 1.86e-06 [rewriter_after_jit_bprop_graph]: 3.64002e-06 [opt_after_jit_grad]: 0.00052329 [validate]: 3.765e-05 Sums bootstrap : 0.000397s : 3.50% type_inference : 0.006070s : 53.63% event_method : 0.000020s : 0.17% auto_monad : 0.000063s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000005s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000032s : 0.28% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000026s : 0.23% optimize.rewriter_before_opt_a : 0.000083s : 0.73% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000047s : 0.42% optimize.opt_a.loop_unroll : 0.000038s : 0.33% optimize.opt_a.a_1 : 0.000767s : 6.77% optimize.opt_a.with_stream_mark : 0.000030s : 0.26% optimize.opt_a.recompute_prepare : 0.000017s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000156s : 1.38% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.13% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.12% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.13% optimize.opt_a.parallel : 0.000026s : 0.23% optimize.opt_a.flash_sp : 0.000013s : 0.12% optimize.opt_a.merge_comm : 0.000007s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.14% optimize.opt_a.virtual_dataset : 0.000013s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.11% optimize.opt_a.virtual_output : 0.000013s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.21% optimize.opt_a.a_after_grad : 0.000020s : 0.18% optimize.opt_a.renormalize : 0.000720s : 6.36% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.20% optimize.opt_a.cse : 0.000038s : 0.33% optimize.opt_a.a_3 : 0.000094s : 0.83% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.02% optimize.rewriter_after_opt_a : 0.000034s : 0.30% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000607s : 5.37% optimize.opt_b.b_1 : 0.000136s : 1.20% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000021s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000025s : 0.22% optimize.loop_unroll : 0.000469s : 4.15% optimize.opt_after_cconv.c_1 : 0.000032s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.15% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.12% optimize.tuple_transform.d_1 : 0.000045s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000046s : 0.41% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000020s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000015s : 0.14% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000523s : 4.62% validate : 0.000038s : 0.33% Time group info: ------[substitution.] 0.000186 28 0.96% : 0.000002s : 2: substitution.elim_not_effective 0.71% : 0.000001s : 2: substitution.fold_const_symbol 3.42% : 0.000006s : 4: substitution.graph_param_transform 79.37% : 0.000148s : 4: substitution.inline 1.83% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.89% : 0.000005s : 4: substitution.remove_not_recompute_node 2.59% : 0.000005s : 4: substitution.replace_old_param 8.23% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006008 2 88.40% : 0.005311s : 1: type_inference.infer 11.60% : 0.000697s : 1: type_inference.specialize ------[replace.] 0.000062 8 62.27% : 0.000039s : 4: replace.inline 37.73% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000158 8 91.59% : 0.000145s : 4: match.inline 8.41% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000211 1278 0.85% : 0.000002s : 13: predicate.accumulaten_eliminater 0.69% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 8: predicate.addn_check_dump 0.89% : 0.000002s : 13: predicate.addn_zero_filter 0.85% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.13% : 0.000004s : 21: predicate.arithmetic_simplify 0.90% : 0.000002s : 13: predicate.cast_eliminate 0.65% : 0.000001s : 8: predicate.check_bprop_eliminate 0.51% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 0.87% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.96% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.02% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.31% : 0.000001s : 4: predicate.elim_not_effective 0.50% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.16% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_depend_swap 1.81% : 0.000004s : 25: predicate.environ_get_eliminate 1.13% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.45% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.22% : 0.000005s : 21: predicate.float_depend_g_call 0.64% : 0.000001s : 8: predicate.float_environ_get_switch 0.75% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.25% : 0.000001s : 4: predicate.fold_const_symbol 0.68% : 0.000001s : 8: predicate.get_grad_eliminate 0.26% : 0.000001s : 4: predicate.graph_param_transform 0.61% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.08% : 0.000013s : 58: predicate.inline 0.70% : 0.000001s : 8: predicate.inline_without_move 0.33% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.10% : 0.000002s : 8: predicate.less_batch_normalization 1.81% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.45% : 0.000005s : 38: predicate.load_eliminater 0.86% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.38% : 0.000005s : 34: predicate.loop_unroll_before_grad 2.03% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 8: predicate.merge_addn 0.58% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 13: predicate.minmaximum_grad 0.98% : 0.000002s : 4: predicate.mutable_eliminate 0.38% : 0.000001s : 4: predicate.opt_reshape 0.39% : 0.000001s : 4: predicate.parallel_virtual_node 1.77% : 0.000004s : 21: predicate.partial_defer_inline 1.62% : 0.000003s : 21: predicate.partial_eliminate 0.92% : 0.000002s : 13: predicate.print_const_string_wrapper 0.60% : 0.000001s : 8: predicate.reduce_all_const_elim 1.14% : 0.000002s : 13: predicate.reduce_eliminate 2.54% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 8: predicate.remove_not_recompute_node 1.44% : 0.000003s : 25: predicate.replace_applicator 0.54% : 0.000001s : 8: predicate.replace_old_param 0.37% : 0.000001s : 4: predicate.reset_defer_inline 0.93% : 0.000002s : 13: predicate.reshape_eliminate 0.62% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 4: predicate.row_tensor_eliminate 0.84% : 0.000002s : 8: predicate.same_eliminate 0.61% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.88% : 0.000002s : 8: predicate.shard_identity_eliminate 0.88% : 0.000002s : 8: predicate.special_op_eliminate 0.72% : 0.000002s : 8: predicate.specialize_transform 0.94% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.46% : 0.000003s : 21: predicate.switch_defer_inline 2.13% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.89% : 0.000010s : 67: predicate.switch_simplify 0.94% : 0.000002s : 13: predicate.tile_eliminate 0.95% : 0.000002s : 13: predicate.transpose_eliminate 1.63% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.07% : 0.000006s : 33: predicate.tuple_list_get_item_eliminator 1.54% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.27% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.86% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.37% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.98% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 4: predicate.value_based_eliminate 0.59% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.73% : 0.000002s : 8: predicate.virtual_output_eliminate 0.30% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000551 11 53.84% : 0.000297s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.16% : 0.000254s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026034 192 0.01% : 0.000003s : 1: ForceFp32Comm 12.81% : 0.003335s : 1: add_attr 12.76% : 0.003323s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.19% : 0.000050s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000068s : 1: auto_monad 0.07% : 0.000019s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.63% : 0.000424s : 1: bootstrap 0.11% : 0.000028s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000015s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.10% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.83% : 0.000478s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.38% : 0.000618s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 4.62% : 0.001202s : 78: opt.transform.opt_a 0.12% : 0.000031s : 1: opt.transform.opt_after_cconv 0.11% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.42% : 0.000109s : 28: opt.transform.opt_b 0.19% : 0.000050s : 2: opt.transform.opt_trans_graph 0.15% : 0.000039s : 4: opt.transform.symbol_engine_opt 10.72% : 0.002792s : 1: opt_a 0.42% : 0.000111s : 1: opt_after_cconv 2.04% : 0.000532s : 1: opt_after_jit_grad 0.87% : 0.000225s : 1: opt_b 19.15% : 0.004985s : 1: optimize 0.08% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000006s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000036s : 1: pre_auto_parallel 0.12% : 0.000030s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000003s : 1: remove_cast_before_assign_add 0.07% : 0.000017s : 1: remove_dup_value 1.47% : 0.000383s : 1: renormalize.infer 1.26% : 0.000327s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000038s : 1: rewriter_after_opt_a 0.33% : 0.000087s : 1: rewriter_before_opt_a 0.02% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000006s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000084s : 1: symbol_engine_optimizer 0.31% : 0.000082s : 1: tuple_transform 23.38% : 0.006086s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:38.125.324 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:38.125.620 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0180873, [21] [bootstrap]: 0.00037222 [type_inference]: 0.0076136 [event_method]: 1.756e-05 [auto_monad]: 6.089e-05 [graph_reusing]: 6.25002e-06 [inline]: 2.37001e-06 [add_attr]: 0.00319499, [1] [add_attr_with_inline]: 0.00318581, [1] [Cycle 1]: 7.461e-05, [2] [tag_attr]: 1.941e-05 [meta_addattr_fg_expand]: 5.78002e-06 [parallel-infer-symbol]: 3.35e-06 [pre_auto_parallel]: 3.346e-05 [insert-virtual-dataset]: 2.76e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.82001e-06 [pipeline_split]: 1.63002e-06 [optimize]: 0.00559045, [53] [py_interpret_to_execute]: 2.962e-05 [rewriter_before_opt_a]: 8.309e-05 [opt_a]: 0.00308032, [2] [Cycle 1]: 0.00219623, [45] [expand_dump_flag]: 3.7e-06 [switch_simplify]: 4.112e-05 [loop_unroll]: 3.09e-05 [a_1]: 0.00061704 [with_stream_mark]: 1.694e-05 [recompute_prepare]: 8.92999e-06 [updatestate_depend_eliminate]: 4.17e-06 [updatestate_assign_eliminate]: 3.81999e-06 [updatestate_loads_eliminate]: 2.97002e-06 [parameter_eliminate]: 1.99e-06 [a_2]: 0.00014108 [accelerated_algorithm]: 7.63999e-06 [shard]: 1.68002e-06 [meta_shard_fg_expand]: 1.96e-06 [shard_inline]: 6.98e-06 [merge_send_recv]: 9.46e-06 [auto_parallel]: 7.01001e-06 [parallel]: 1.996e-05 [flash_sp]: 9.74999e-06 [merge_comm]: 4.2e-06 [allreduce_fusion]: 3.98001e-06 [matmul_add_comm_reduction]: 1.056e-05 [allreduce_slice_to_reducescatter]: 6.49976e-07 [virtual_shard_identity]: 9.24e-06 [virtual_dataset]: 7.33999e-06 [get_grad_eliminate_]: 6.59001e-06 [virtual_output]: 6.59999e-06 [merge_forward]: 3.66001e-06 [cell_reuse_recompute_pass]: 1.25999e-06 [offload_activation]: 1.084e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.521e-05 [merge_recompute_call_nodes]: 1.71e-06 [before_grad]: 1.073e-05 [set_forward_comm_id_for_comm_node_pass]: 4.10998e-06 [meta_fg_expand]: 3.37002e-06 [flash_sp_send_recv_attached]: 2.91999e-06 [receive_attached]: 2.34999e-06 [after_resolve]: 1.358e-05 [a_after_grad]: 1.031e-05 [renormalize]: 0.00061731 [add_forward_monad_depend]: 6.00002e-06 [auto_monad_grad]: 2.81e-06 [auto_monad_eliminator]: 1.562e-05 [cse]: 2.594e-05 [a_3]: 6.436e-05 [Cycle 2]: 0.00086903, [45] [expand_dump_flag]: 1.77999e-06 [switch_simplify]: 8.50999e-06 [loop_unroll]: 6.14999e-06 [a_1]: 0.00013259 [with_stream_mark]: 1.361e-05 [recompute_prepare]: 6.57002e-06 [updatestate_depend_eliminate]: 2.97002e-06 [updatestate_assign_eliminate]: 2.89999e-06 [updatestate_loads_eliminate]: 2.44001e-06 [parameter_eliminate]: 1.97999e-06 [a_2]: 0.00010004 [accelerated_algorithm]: 6.59999e-06 [shard]: 1.35999e-06 [meta_shard_fg_expand]: 2.00002e-06 [shard_inline]: 6.42001e-06 [merge_send_recv]: 7.45e-06 [auto_parallel]: 8.33999e-06 [parallel]: 6.23e-06 [flash_sp]: 3.43e-06 [merge_comm]: 6.37001e-06 [allreduce_fusion]: 3.56999e-06 [matmul_add_comm_reduction]: 7.28e-06 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 8.05999e-06 [virtual_dataset]: 7.49002e-06 [get_grad_eliminate_]: 5.72999e-06 [virtual_output]: 5.56e-06 [merge_forward]: 3.48e-06 [cell_reuse_recompute_pass]: 2.24999e-06 [offload_activation]: 8.74e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.79e-05 [merge_recompute_call_nodes]: 1.09e-06 [before_grad]: 1.059e-05 [set_forward_comm_id_for_comm_node_pass]: 6.05002e-06 [meta_fg_expand]: 2.29001e-06 [flash_sp_send_recv_attached]: 1.35999e-06 [receive_attached]: 1.68002e-06 [after_resolve]: 1.241e-05 [a_after_grad]: 1.128e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.54999e-06 [auto_monad_grad]: 2.07999e-06 [auto_monad_eliminator]: 9.72999e-06 [cse]: 1.648e-05 [a_3]: 5.039e-05 [py_interpret_to_execute_after_opt_a]: 1.465e-05 [slice_cell_reuse_recomputed_activation]: 5.34998e-06 [rewriter_after_opt_a]: 3.889e-05 [convert_after_rewriter]: 1.031e-05 [order_py_execute_after_rewriter]: 8.30999e-06 [mutable_eliminate]: 0.00065969 [opt_b]: 0.00027848, [1] [Cycle 1]: 0.00026782, [7] [b_1]: 0.00017024 [b_2]: 8.89998e-06 [updatestate_depend_eliminate]: 5.64998e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 3.21001e-06 [renormalize]: 6.50005e-07 [cse]: 1.831e-05 [optimize_parallel_all_gather_comm]: 2.092e-05 [overlap_param_gather]: 5.27001e-06 [cconv]: 3.41e-05 [loop_unroll]: 0.00047401 [opt_after_cconv]: 0.00013254, [1] [Cycle 1]: 0.00012334, [7] [c_1]: 3.225e-05 [parameter_eliminate]: 3.65e-06 [updatestate_depend_eliminate]: 5.84999e-06 [updatestate_assign_eliminate]: 2.71999e-06 [updatestate_loads_eliminate]: 2.88e-06 [cse]: 1.793e-05 [renormalize]: 6.29982e-07 [remove_dup_value]: 1.712e-05 [tuple_transform]: 0.00010836, [1] [Cycle 1]: 0.00010122, [4] [d_1]: 5.918e-05 [none_parameter_eliminate]: 1.75001e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 7.74002e-06 [partial_unused_args_eliminate]: 4.48001e-06 [add_recomputation]: 5.291e-05 [cse_after_recomputation]: 2.891e-05, [1] [Cycle 1]: 2.153e-05, [1] [cse]: 1.193e-05 [environ_conv]: 8.19998e-06 [swap_dp_allreduce_reducescatter]: 8.20999e-06 [bias_add_comm_swap]: 4.90001e-06 [label_micro_interleaved_index]: 7.75998e-06 [label_fine_grained_interleaved_index]: 5.59e-06 [merge_cast_opt]: 3.83001e-06 [slice_recompute_activation]: 4.47998e-06 [micro_interleaved_order_control]: 4.41002e-06 [assign_add_opt]: 3.61001e-06 [ForceFp32Comm]: 3.2e-06 [remove_cast_before_assign_add]: 3.40998e-06 [full_micro_interleaved_order_control]: 4.70001e-06 [reorder_send_recv_between_fp_bp]: 5.69e-06 [comm_op_add_attrs]: 3.56001e-06 [add_comm_op_reuse_tag]: 3.45003e-06 [interleave_split_concat_branches]: 3.81999e-06 [interleave_parallel_branches]: 3.36999e-06 [overlap_opt_shard_in_pipeline]: 3.24001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.13001e-06 [control_data_broadcast_order]: 1.538e-05 [grouped_pairwise_exchange_alltoall]: 4.26001e-06 [offloading_packed_experts]: 6.95002e-06 [overlap_recompute_and_grad_model_parallel]: 7.99002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.58999e-06 [overlap_recompute_allgather_and_fa_grad]: 4.1e-06 [overlap_recompute_comm]: 5.51e-06 [overlap_grad_ring_attention]: 6.62002e-06 [overlap_grad_flash_sp]: 2.22e-05 [begin_end_overlap_inline]: 2.95002e-06 [split_matmul_comm_elemetwise]: 4.69002e-06 [split_layernorm_comm]: 4.02998e-06 [handle_group_info]: 3.57002e-06 [symbol_engine_optimizer]: 0.00010204, [1] [Cycle 1]: 9.492e-05, [6] [build]: 3.58e-06 [elim_shapecalc]: 1.15e-05 [elim_not_effective]: 1.356e-05 [opt_reshape]: 7.36999e-06 [fold_const_symbol]: 1.095e-05 [renormalize]: 5.89993e-07 [detach_backward]: 3.5e-06 [pipeline_parallel_scheduler]: 1.76e-06 [auto_monad_reorder]: 1.793e-05 [get_jit_bprop_graph]: 1.89e-06 [rewriter_after_jit_bprop_graph]: 4.32e-06 [opt_after_jit_grad]: 0.00055393 [validate]: 3.786e-05 Sums bootstrap : 0.000372s : 2.83% type_inference : 0.007614s : 57.98% event_method : 0.000018s : 0.13% auto_monad : 0.000061s : 0.46% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.25% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.23% optimize.rewriter_before_opt_a : 0.000083s : 0.63% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000050s : 0.38% optimize.opt_a.loop_unroll : 0.000037s : 0.28% optimize.opt_a.a_1 : 0.000750s : 5.71% optimize.opt_a.with_stream_mark : 0.000031s : 0.23% optimize.opt_a.recompute_prepare : 0.000016s : 0.12% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000241s : 1.84% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.11% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.10% optimize.opt_a.merge_send_recv : 0.000017s : 0.13% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000026s : 0.20% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000011s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.13% optimize.opt_a.virtual_dataset : 0.000015s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.09% optimize.opt_a.virtual_output : 0.000012s : 0.09% optimize.opt_a.merge_forward : 0.000007s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.16% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.20% optimize.opt_a.a_after_grad : 0.000022s : 0.16% optimize.opt_a.renormalize : 0.000617s : 4.70% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.19% optimize.opt_a.cse : 0.000042s : 0.32% optimize.opt_a.a_3 : 0.000115s : 0.87% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000039s : 0.30% optimize.convert_after_rewriter : 0.000010s : 0.08% optimize.order_py_execute_after_rewriter : 0.000008s : 0.06% optimize.mutable_eliminate : 0.000660s : 5.02% optimize.opt_b.b_1 : 0.000170s : 1.30% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000018s : 0.14% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.16% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000034s : 0.26% optimize.loop_unroll : 0.000474s : 3.61% optimize.opt_after_cconv.c_1 : 0.000032s : 0.25% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.14% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.13% optimize.tuple_transform.d_1 : 0.000059s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.06% optimize.partial_unused_args_eliminate : 0.000004s : 0.03% optimize.add_recomputation : 0.000053s : 0.40% optimize.cse_after_recomputation.cse : 0.000012s : 0.09% optimize.environ_conv : 0.000008s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.06% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000008s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.03% optimize.micro_interleaved_order_control : 0.000004s : 0.03% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000015s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000006s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000022s : 0.17% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.10% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.14% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000554s : 4.22% validate : 0.000038s : 0.29% Time group info: ------[substitution.] 0.000180 28 1.36% : 0.000002s : 2: substitution.elim_not_effective 0.92% : 0.000002s : 2: substitution.fold_const_symbol 3.60% : 0.000006s : 4: substitution.graph_param_transform 77.47% : 0.000140s : 4: substitution.inline 2.19% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.80% : 0.000005s : 4: substitution.remove_not_recompute_node 3.24% : 0.000006s : 4: substitution.replace_old_param 8.41% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007567 2 91.02% : 0.006887s : 1: type_inference.infer 8.98% : 0.000680s : 1: type_inference.specialize ------[replace.] 0.000060 8 61.93% : 0.000037s : 4: replace.inline 38.07% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000150 8 91.22% : 0.000137s : 4: match.inline 8.78% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000208 1278 0.92% : 0.000002s : 13: predicate.accumulaten_eliminater 0.92% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 8: predicate.addn_check_dump 0.92% : 0.000002s : 13: predicate.addn_zero_filter 0.78% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.20% : 0.000005s : 21: predicate.arithmetic_simplify 0.88% : 0.000002s : 13: predicate.cast_eliminate 0.73% : 0.000002s : 8: predicate.check_bprop_eliminate 0.55% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.73% : 0.000002s : 8: predicate.depend_value_elim 0.92% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.30% : 0.000001s : 4: predicate.elim_not_effective 0.43% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.22% : 0.000003s : 17: predicate.environ_get_depend_swap 1.73% : 0.000004s : 25: predicate.environ_get_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.39% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.41% : 0.000005s : 21: predicate.float_depend_g_call 0.56% : 0.000001s : 8: predicate.float_environ_get_switch 0.73% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.64% : 0.000001s : 8: predicate.get_grad_eliminate 0.31% : 0.000001s : 4: predicate.graph_param_transform 0.59% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 6.51% : 0.000014s : 58: predicate.inline 0.79% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.86% : 0.000002s : 8: predicate.less_batch_normalization 1.88% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.50% : 0.000005s : 38: predicate.load_eliminater 0.90% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.34% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.67% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 8: predicate.merge_addn 0.62% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 13: predicate.minmaximum_grad 0.90% : 0.000002s : 4: predicate.mutable_eliminate 0.41% : 0.000001s : 4: predicate.opt_reshape 0.55% : 0.000001s : 4: predicate.parallel_virtual_node 1.72% : 0.000004s : 21: predicate.partial_defer_inline 1.65% : 0.000003s : 21: predicate.partial_eliminate 0.95% : 0.000002s : 13: predicate.print_const_string_wrapper 0.62% : 0.000001s : 8: predicate.reduce_all_const_elim 1.17% : 0.000002s : 13: predicate.reduce_eliminate 2.49% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.57% : 0.000001s : 8: predicate.remove_not_recompute_node 1.39% : 0.000003s : 25: predicate.replace_applicator 0.68% : 0.000001s : 8: predicate.replace_old_param 0.37% : 0.000001s : 4: predicate.reset_defer_inline 0.93% : 0.000002s : 13: predicate.reshape_eliminate 0.69% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 4: predicate.row_tensor_eliminate 0.78% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.85% : 0.000002s : 8: predicate.shard_identity_eliminate 0.69% : 0.000001s : 8: predicate.special_op_eliminate 0.68% : 0.000001s : 8: predicate.specialize_transform 0.93% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.66% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.47% : 0.000003s : 21: predicate.switch_defer_inline 2.08% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.83% : 0.000010s : 67: predicate.switch_simplify 0.91% : 0.000002s : 13: predicate.tile_eliminate 0.90% : 0.000002s : 13: predicate.transpose_eliminate 1.58% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.38% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.40% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.65% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.53% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.13% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 4: predicate.value_based_eliminate 0.69% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.64% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000589 11 62.56% : 0.000369s : 5: func_graph_cloner_run.FuncGraphClonerGraph 37.44% : 0.000221s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028852 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.10% : 0.003204s : 1: add_attr 11.06% : 0.003190s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.20% : 0.000057s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.24% : 0.000069s : 1: auto_monad 0.09% : 0.000026s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.42% : 0.000410s : 1: bootstrap 0.13% : 0.000037s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.06% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.11% : 0.000032s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000021s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.09% : 0.000027s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.67% : 0.000482s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 2.31% : 0.000668s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.05% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000020s : 1: opt.transform.mutable_eliminate 4.12% : 0.001190s : 78: opt.transform.opt_a 0.11% : 0.000031s : 1: opt.transform.opt_after_cconv 0.10% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.37% : 0.000106s : 28: opt.transform.opt_b 0.22% : 0.000065s : 2: opt.transform.opt_trans_graph 0.14% : 0.000039s : 4: opt.transform.symbol_engine_opt 10.69% : 0.003084s : 1: opt_a 0.47% : 0.000136s : 1: opt_after_cconv 1.95% : 0.000564s : 1: opt_after_jit_grad 0.98% : 0.000282s : 1: opt_b 20.51% : 0.005919s : 1: optimize 0.08% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000041s : 1: pre_auto_parallel 0.12% : 0.000033s : 1: py_interpret_to_execute 0.06% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.09% : 0.000315s : 1: renormalize.infer 1.02% : 0.000293s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000043s : 1: rewriter_after_opt_a 0.30% : 0.000087s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000105s : 1: symbol_engine_optimizer 0.39% : 0.000111s : 1: tuple_transform 26.52% : 0.007651s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:38.350.927 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0148222, [21] [bootstrap]: 0.00038744 [type_inference]: 0.00565485 [event_method]: 1.923e-05 [auto_monad]: 6.363e-05 [graph_reusing]: 5.62999e-06 [inline]: 2.64001e-06 [add_attr]: 0.00325862, [1] [add_attr_with_inline]: 0.00324885, [1] [Cycle 1]: 6.037e-05, [2] [tag_attr]: 1.925e-05 [meta_addattr_fg_expand]: 6.59999e-06 [parallel-infer-symbol]: 4.25e-06 [pre_auto_parallel]: 3.244e-05 [insert-virtual-dataset]: 2.37999e-06 [parallel-infer-symbol-second]: 8.49977e-07 [dataset_repeat_opt]: 2.31e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.00469607, [53] [py_interpret_to_execute]: 2.443e-05 [rewriter_before_opt_a]: 8.137e-05 [opt_a]: 0.0025919, [2] [Cycle 1]: 0.00191987, [45] [expand_dump_flag]: 3.24001e-06 [switch_simplify]: 4.25e-05 [loop_unroll]: 3.034e-05 [a_1]: 0.000617 [with_stream_mark]: 1.505e-05 [recompute_prepare]: 9.44e-06 [updatestate_depend_eliminate]: 3.63999e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 2.67001e-06 [parameter_eliminate]: 1.89999e-06 [a_2]: 8.202e-05 [accelerated_algorithm]: 7.4e-06 [shard]: 1.76e-06 [meta_shard_fg_expand]: 1.76e-06 [shard_inline]: 6.83998e-06 [merge_send_recv]: 9.88002e-06 [auto_parallel]: 6.02001e-06 [parallel]: 1.84e-05 [flash_sp]: 8.52e-06 [merge_comm]: 3.66999e-06 [allreduce_fusion]: 3.43e-06 [matmul_add_comm_reduction]: 8.38999e-06 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 8.24002e-06 [virtual_dataset]: 7.66999e-06 [get_grad_eliminate_]: 6.02001e-06 [virtual_output]: 6.34001e-06 [merge_forward]: 4.07998e-06 [cell_reuse_recompute_pass]: 1.19998e-06 [offload_activation]: 9.62001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.207e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 1.039e-05 [set_forward_comm_id_for_comm_node_pass]: 3.41999e-06 [meta_fg_expand]: 2.75997e-06 [flash_sp_send_recv_attached]: 2.56998e-06 [receive_attached]: 2.66e-06 [after_resolve]: 1.154e-05 [a_after_grad]: 9.95002e-06 [renormalize]: 0.0005876 [add_forward_monad_depend]: 5.79999e-06 [auto_monad_grad]: 2.88e-06 [auto_monad_eliminator]: 1.53e-05 [cse]: 2.732e-05 [a_3]: 5.091e-05 [Cycle 2]: 0.00066192, [45] [expand_dump_flag]: 1.94e-06 [switch_simplify]: 7.59002e-06 [loop_unroll]: 6.42001e-06 [a_1]: 0.00013016 [with_stream_mark]: 1.17e-05 [recompute_prepare]: 6.96001e-06 [updatestate_depend_eliminate]: 2.94999e-06 [updatestate_assign_eliminate]: 2.60002e-06 [updatestate_loads_eliminate]: 2.93998e-06 [parameter_eliminate]: 1.35999e-06 [a_2]: 7.235e-05 [accelerated_algorithm]: 6.19001e-06 [shard]: 1.24e-06 [meta_shard_fg_expand]: 1.66e-06 [shard_inline]: 5.94e-06 [merge_send_recv]: 5.89e-06 [auto_parallel]: 6.59001e-06 [parallel]: 6.66999e-06 [flash_sp]: 3.35e-06 [merge_comm]: 3.46001e-06 [allreduce_fusion]: 3.36999e-06 [matmul_add_comm_reduction]: 6.11998e-06 [allreduce_slice_to_reducescatter]: 3.19997e-07 [virtual_shard_identity]: 6.64999e-06 [virtual_dataset]: 5.79e-06 [get_grad_eliminate_]: 5.47001e-06 [virtual_output]: 6.21998e-06 [merge_forward]: 2.88e-06 [cell_reuse_recompute_pass]: 1.59e-06 [offload_activation]: 7.26999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.407e-05 [merge_recompute_call_nodes]: 1.13001e-06 [before_grad]: 9.39998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.2e-06 [meta_fg_expand]: 2.02999e-06 [flash_sp_send_recv_attached]: 9.90025e-07 [receive_attached]: 1.29998e-06 [after_resolve]: 1.084e-05 [a_after_grad]: 9.08002e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.55999e-06 [auto_monad_grad]: 1.10999e-06 [auto_monad_eliminator]: 7.53999e-06 [cse]: 1.394e-05 [a_3]: 3.873e-05 [py_interpret_to_execute_after_opt_a]: 1.021e-05 [slice_cell_reuse_recomputed_activation]: 2.16e-06 [rewriter_after_opt_a]: 3.714e-05 [convert_after_rewriter]: 7.55998e-06 [order_py_execute_after_rewriter]: 5.47999e-06 [mutable_eliminate]: 0.00057986 [opt_b]: 0.00020322, [1] [Cycle 1]: 0.00019643, [7] [b_1]: 0.00012331 [b_2]: 8.08999e-06 [updatestate_depend_eliminate]: 5.33002e-06 [updatestate_assign_eliminate]: 2.28998e-06 [updatestate_loads_eliminate]: 2.69001e-06 [renormalize]: 5.8001e-07 [cse]: 1.856e-05 [optimize_parallel_all_gather_comm]: 1.588e-05 [overlap_param_gather]: 1.84998e-06 [cconv]: 2.519e-05 [loop_unroll]: 0.00044562 [opt_after_cconv]: 0.00010241, [1] [Cycle 1]: 9.672e-05, [7] [c_1]: 3.18e-05 [parameter_eliminate]: 3.17002e-06 [updatestate_depend_eliminate]: 5.21998e-06 [updatestate_assign_eliminate]: 2.59999e-06 [updatestate_loads_eliminate]: 2.36e-06 [cse]: 1.726e-05 [renormalize]: 3.20026e-07 [remove_dup_value]: 1.352e-05 [tuple_transform]: 8.055e-05, [1] [Cycle 1]: 7.595e-05, [4] [d_1]: 4.911e-05 [none_parameter_eliminate]: 1.54998e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 6.81001e-06 [partial_unused_args_eliminate]: 2.41e-06 [add_recomputation]: 4.644e-05 [cse_after_recomputation]: 2.063e-05, [1] [Cycle 1]: 1.605e-05, [1] [cse]: 1.064e-05 [environ_conv]: 4.74998e-06 [swap_dp_allreduce_reducescatter]: 5.17e-06 [bias_add_comm_swap]: 2.94999e-06 [label_micro_interleaved_index]: 4.88001e-06 [label_fine_grained_interleaved_index]: 3.22002e-06 [merge_cast_opt]: 1.32999e-06 [slice_recompute_activation]: 2.31e-06 [micro_interleaved_order_control]: 2.66e-06 [assign_add_opt]: 1.23002e-06 [ForceFp32Comm]: 8.50006e-07 [remove_cast_before_assign_add]: 1.02e-06 [full_micro_interleaved_order_control]: 2.06e-06 [reorder_send_recv_between_fp_bp]: 2.82002e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.20999e-06 [interleave_parallel_branches]: 1.35999e-06 [overlap_opt_shard_in_pipeline]: 1.27999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.79e-06 [control_data_broadcast_order]: 1.23e-05 [grouped_pairwise_exchange_alltoall]: 1.57001e-06 [offloading_packed_experts]: 4.22003e-06 [overlap_recompute_and_grad_model_parallel]: 5.01002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.24e-06 [overlap_recompute_allgather_and_fa_grad]: 1.50001e-06 [overlap_recompute_comm]: 2.71e-06 [overlap_grad_ring_attention]: 4.1e-06 [overlap_grad_flash_sp]: 1.809e-05 [begin_end_overlap_inline]: 4.7998e-07 [split_matmul_comm_elemetwise]: 2.17999e-06 [split_layernorm_comm]: 2.19001e-06 [handle_group_info]: 1.27e-06 [symbol_engine_optimizer]: 7.633e-05, [1] [Cycle 1]: 7.175e-05, [6] [build]: 3.5e-06 [elim_shapecalc]: 1.012e-05 [elim_not_effective]: 1.334e-05 [opt_reshape]: 6.66999e-06 [fold_const_symbol]: 9.77999e-06 [renormalize]: 2.79979e-07 [detach_backward]: 1.99e-06 [pipeline_parallel_scheduler]: 1.39e-06 [auto_monad_reorder]: 1.747e-05 [get_jit_bprop_graph]: 1.76e-06 [rewriter_after_jit_bprop_graph]: 3.66999e-06 [opt_after_jit_grad]: 0.00048807 [validate]: 3.975e-05 Sums bootstrap : 0.000387s : 3.65% type_inference : 0.005655s : 53.26% event_method : 0.000019s : 0.18% auto_monad : 0.000064s : 0.60% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000032s : 0.31% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000024s : 0.23% optimize.rewriter_before_opt_a : 0.000081s : 0.77% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000050s : 0.47% optimize.opt_a.loop_unroll : 0.000037s : 0.35% optimize.opt_a.a_1 : 0.000747s : 7.04% optimize.opt_a.with_stream_mark : 0.000027s : 0.25% optimize.opt_a.recompute_prepare : 0.000016s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000154s : 1.45% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.12% optimize.opt_a.merge_send_recv : 0.000016s : 0.15% optimize.opt_a.auto_parallel : 0.000013s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.24% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.14% optimize.opt_a.virtual_dataset : 0.000013s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.11% optimize.opt_a.virtual_output : 0.000013s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000022s : 0.21% optimize.opt_a.a_after_grad : 0.000019s : 0.18% optimize.opt_a.renormalize : 0.000588s : 5.53% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.22% optimize.opt_a.cse : 0.000041s : 0.39% optimize.opt_a.a_3 : 0.000090s : 0.84% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000037s : 0.35% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000580s : 5.46% optimize.opt_b.b_1 : 0.000123s : 1.16% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000019s : 0.17% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000025s : 0.24% optimize.loop_unroll : 0.000446s : 4.20% optimize.opt_after_cconv.c_1 : 0.000032s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.16% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.13% optimize.tuple_transform.d_1 : 0.000049s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000046s : 0.44% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000005s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.03% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.03% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000018s : 0.17% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000017s : 0.16% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000488s : 4.60% validate : 0.000040s : 0.37% Time group info: ------[substitution.] 0.000180 28 1.15% : 0.000002s : 2: substitution.elim_not_effective 0.72% : 0.000001s : 2: substitution.fold_const_symbol 3.82% : 0.000007s : 4: substitution.graph_param_transform 78.59% : 0.000141s : 4: substitution.inline 1.90% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.75% : 0.000005s : 4: substitution.remove_not_recompute_node 2.58% : 0.000005s : 4: substitution.replace_old_param 8.50% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005596 2 87.10% : 0.004874s : 1: type_inference.infer 12.90% : 0.000722s : 1: type_inference.specialize ------[replace.] 0.000060 8 63.89% : 0.000038s : 4: replace.inline 36.11% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000152 8 91.24% : 0.000139s : 4: match.inline 8.76% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000203 1278 0.86% : 0.000002s : 13: predicate.accumulaten_eliminater 0.69% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 0.95% : 0.000002s : 13: predicate.addn_zero_filter 0.86% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.11% : 0.000004s : 21: predicate.arithmetic_simplify 0.92% : 0.000002s : 13: predicate.cast_eliminate 0.59% : 0.000001s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.20% : 0.000000s : 4: predicate.const_output_eliminate 0.60% : 0.000001s : 8: predicate.depend_value_elim 0.92% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.06% : 0.000002s : 13: predicate.dict_get_item_eliminator 1.02% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.07% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.59% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_depend_swap 1.71% : 0.000003s : 25: predicate.environ_get_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.47% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.36% : 0.000005s : 21: predicate.float_depend_g_call 0.52% : 0.000001s : 8: predicate.float_environ_get_switch 0.75% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.64% : 0.000001s : 8: predicate.get_grad_eliminate 0.29% : 0.000001s : 4: predicate.graph_param_transform 0.60% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 6.25% : 0.000013s : 58: predicate.inline 0.69% : 0.000001s : 8: predicate.inline_without_move 0.33% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.89% : 0.000002s : 8: predicate.less_batch_normalization 1.97% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.55% : 0.000005s : 38: predicate.load_eliminater 1.05% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.42% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.70% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 8: predicate.merge_addn 0.56% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.81% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 13: predicate.minmaximum_grad 0.81% : 0.000002s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.40% : 0.000001s : 4: predicate.parallel_virtual_node 1.82% : 0.000004s : 21: predicate.partial_defer_inline 1.68% : 0.000003s : 21: predicate.partial_eliminate 0.88% : 0.000002s : 13: predicate.print_const_string_wrapper 0.55% : 0.000001s : 8: predicate.reduce_all_const_elim 1.22% : 0.000002s : 13: predicate.reduce_eliminate 2.50% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 8: predicate.remove_not_recompute_node 1.42% : 0.000003s : 25: predicate.replace_applicator 0.52% : 0.000001s : 8: predicate.replace_old_param 0.27% : 0.000001s : 4: predicate.reset_defer_inline 0.91% : 0.000002s : 13: predicate.reshape_eliminate 0.74% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.73% : 0.000001s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 8: predicate.shard_identity_eliminate 0.66% : 0.000001s : 8: predicate.special_op_eliminate 0.73% : 0.000001s : 8: predicate.specialize_transform 0.84% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.58% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.57% : 0.000003s : 21: predicate.switch_defer_inline 2.04% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.18% : 0.000011s : 67: predicate.switch_simplify 0.94% : 0.000002s : 13: predicate.tile_eliminate 0.95% : 0.000002s : 13: predicate.transpose_eliminate 1.65% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.25% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.42% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.24% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.78% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.37% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.18% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 4: predicate.value_based_eliminate 0.65% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.67% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000502 11 52.28% : 0.000263s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.72% : 0.000240s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024682 192 0.01% : 0.000004s : 1: ForceFp32Comm 13.22% : 0.003263s : 1: add_attr 13.18% : 0.003253s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.20% : 0.000050s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.28% : 0.000068s : 1: auto_monad 0.09% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.65% : 0.000407s : 1: bootstrap 0.12% : 0.000029s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000015s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.10% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.84% : 0.000453s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 2.38% : 0.000588s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 4.74% : 0.001169s : 78: opt.transform.opt_a 0.12% : 0.000030s : 1: opt.transform.opt_after_cconv 0.10% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000099s : 28: opt.transform.opt_b 0.22% : 0.000054s : 2: opt.transform.opt_trans_graph 0.15% : 0.000036s : 4: opt.transform.symbol_engine_opt 10.52% : 0.002596s : 1: opt_a 0.43% : 0.000106s : 1: opt_after_cconv 2.01% : 0.000497s : 1: opt_after_jit_grad 0.84% : 0.000207s : 1: opt_b 19.05% : 0.004701s : 1: optimize 0.08% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000021s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000037s : 1: pre_auto_parallel 0.12% : 0.000028s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000017s : 1: remove_dup_value 1.20% : 0.000297s : 1: renormalize.infer 1.14% : 0.000282s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000042s : 1: rewriter_after_opt_a 0.35% : 0.000086s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000006s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000079s : 1: symbol_engine_optimizer 0.34% : 0.000083s : 1: tuple_transform 22.97% : 0.005670s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:38.607.107 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:38.607.447 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0163907, [21] [bootstrap]: 0.00038694 [type_inference]: 0.00555413 [event_method]: 1.912e-05 [auto_monad]: 6.407e-05 [graph_reusing]: 6.61e-06 [inline]: 2.31e-06 [add_attr]: 0.00324356, [1] [add_attr_with_inline]: 0.00323444, [1] [Cycle 1]: 7.081e-05, [2] [tag_attr]: 1.871e-05 [meta_addattr_fg_expand]: 6.21998e-06 [parallel-infer-symbol]: 3.55998e-06 [pre_auto_parallel]: 3.294e-05 [insert-virtual-dataset]: 2.60002e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 1.91998e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.00583271, [53] [py_interpret_to_execute]: 2.752e-05 [rewriter_before_opt_a]: 8.677e-05 [opt_a]: 0.00335438, [2] [Cycle 1]: 0.00238513, [45] [expand_dump_flag]: 3.42997e-06 [switch_simplify]: 4.406e-05 [loop_unroll]: 3.103e-05 [a_1]: 0.00066958 [with_stream_mark]: 1.685e-05 [recompute_prepare]: 1.193e-05 [updatestate_depend_eliminate]: 4.85001e-06 [updatestate_assign_eliminate]: 4.08999e-06 [updatestate_loads_eliminate]: 3.8e-06 [parameter_eliminate]: 1.96e-06 [a_2]: 0.00013923 [accelerated_algorithm]: 9.34e-06 [shard]: 1.84e-06 [meta_shard_fg_expand]: 1.92999e-06 [shard_inline]: 7.92998e-06 [merge_send_recv]: 9.37001e-06 [auto_parallel]: 7.49002e-06 [parallel]: 2.077e-05 [flash_sp]: 9.82999e-06 [merge_comm]: 4.80999e-06 [allreduce_fusion]: 4.4e-06 [matmul_add_comm_reduction]: 1.108e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 9.67001e-06 [virtual_dataset]: 8.18001e-06 [get_grad_eliminate_]: 7.6e-06 [virtual_output]: 8.33001e-06 [merge_forward]: 4.40999e-06 [cell_reuse_recompute_pass]: 1.09e-06 [offload_activation]: 1.091e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.778e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.297e-05 [set_forward_comm_id_for_comm_node_pass]: 5.50001e-06 [meta_fg_expand]: 3.51999e-06 [flash_sp_send_recv_attached]: 2.79001e-06 [receive_attached]: 2.36e-06 [after_resolve]: 1.35e-05 [a_after_grad]: 1.223e-05 [renormalize]: 0.00069591 [add_forward_monad_depend]: 6.67002e-06 [auto_monad_grad]: 2.53e-06 [auto_monad_eliminator]: 1.908e-05 [cse]: 3.263e-05 [a_3]: 7.312e-05 [Cycle 2]: 0.00095459, [45] [expand_dump_flag]: 1.50001e-06 [switch_simplify]: 8.91002e-06 [loop_unroll]: 7.78001e-06 [a_1]: 0.00017674 [with_stream_mark]: 1.279e-05 [recompute_prepare]: 7.98999e-06 [updatestate_depend_eliminate]: 4.31002e-06 [updatestate_assign_eliminate]: 3.19001e-06 [updatestate_loads_eliminate]: 3.09001e-06 [parameter_eliminate]: 1.99999e-06 [a_2]: 0.00012303 [accelerated_algorithm]: 7.75e-06 [shard]: 1.24e-06 [meta_shard_fg_expand]: 2.17001e-06 [shard_inline]: 7.58001e-06 [merge_send_recv]: 7.85e-06 [auto_parallel]: 7.35e-06 [parallel]: 5.30001e-06 [flash_sp]: 3.25998e-06 [merge_comm]: 5.16002e-06 [allreduce_fusion]: 4.16001e-06 [matmul_add_comm_reduction]: 7.93999e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 8.41002e-06 [virtual_dataset]: 7.95998e-06 [get_grad_eliminate_]: 7.11001e-06 [virtual_output]: 6.91999e-06 [merge_forward]: 3.97002e-06 [cell_reuse_recompute_pass]: 1.71e-06 [offload_activation]: 9.24e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.789e-05 [merge_recompute_call_nodes]: 1.09e-06 [before_grad]: 1.288e-05 [set_forward_comm_id_for_comm_node_pass]: 4.75999e-06 [meta_fg_expand]: 3.18998e-06 [flash_sp_send_recv_attached]: 1.00001e-06 [receive_attached]: 1.34e-06 [after_resolve]: 1.27e-05 [a_after_grad]: 1.154e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.35999e-06 [auto_monad_grad]: 1.37999e-06 [auto_monad_eliminator]: 9.39e-06 [cse]: 2.025e-05 [a_3]: 5.986e-05 [py_interpret_to_execute_after_opt_a]: 1.615e-05 [slice_cell_reuse_recomputed_activation]: 4.77998e-06 [rewriter_after_opt_a]: 4.713e-05 [convert_after_rewriter]: 1.114e-05 [order_py_execute_after_rewriter]: 9.69e-06 [mutable_eliminate]: 0.00059214 [opt_b]: 0.00030873, [1] [Cycle 1]: 0.00029891, [7] [b_1]: 0.00019773 [b_2]: 9.39e-06 [updatestate_depend_eliminate]: 5.94e-06 [updatestate_assign_eliminate]: 3.06001e-06 [updatestate_loads_eliminate]: 3.22002e-06 [renormalize]: 6.00005e-07 [cse]: 2.21e-05 [optimize_parallel_all_gather_comm]: 2.124e-05 [overlap_param_gather]: 4.94e-06 [cconv]: 2.772e-05 [loop_unroll]: 0.00045585 [opt_after_cconv]: 0.00014078, [1] [Cycle 1]: 0.00013164, [7] [c_1]: 3.716e-05 [parameter_eliminate]: 2.68e-06 [updatestate_depend_eliminate]: 6.26998e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 2.98998e-06 [cse]: 2.33e-05 [renormalize]: 3.7998e-07 [remove_dup_value]: 1.777e-05 [tuple_transform]: 0.00010197, [1] [Cycle 1]: 9.457e-05, [4] [d_1]: 5.264e-05 [none_parameter_eliminate]: 2.00002e-06 [renormalize]: 4.69998e-07 [switch_simplify]: 8.25999e-06 [partial_unused_args_eliminate]: 4.43999e-06 [add_recomputation]: 5.917e-05 [cse_after_recomputation]: 3.289e-05, [1] [Cycle 1]: 2.523e-05, [1] [cse]: 1.531e-05 [environ_conv]: 8.99e-06 [swap_dp_allreduce_reducescatter]: 8.87e-06 [bias_add_comm_swap]: 5.29e-06 [label_micro_interleaved_index]: 6.53e-06 [label_fine_grained_interleaved_index]: 5.51e-06 [merge_cast_opt]: 3.71999e-06 [slice_recompute_activation]: 4.47e-06 [micro_interleaved_order_control]: 4.80001e-06 [assign_add_opt]: 3.89002e-06 [ForceFp32Comm]: 3.81999e-06 [remove_cast_before_assign_add]: 3.29001e-06 [full_micro_interleaved_order_control]: 4.72998e-06 [reorder_send_recv_between_fp_bp]: 5.82001e-06 [comm_op_add_attrs]: 3.65998e-06 [add_comm_op_reuse_tag]: 3.33e-06 [interleave_split_concat_branches]: 3.43e-06 [interleave_parallel_branches]: 3.31999e-06 [overlap_opt_shard_in_pipeline]: 4.15999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.72998e-06 [control_data_broadcast_order]: 1.855e-05 [grouped_pairwise_exchange_alltoall]: 3.91001e-06 [offloading_packed_experts]: 6.91001e-06 [overlap_recompute_and_grad_model_parallel]: 7.55e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.08001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.68e-06 [overlap_recompute_comm]: 5.10001e-06 [overlap_grad_ring_attention]: 7.41999e-06 [overlap_grad_flash_sp]: 2.62e-05 [begin_end_overlap_inline]: 2.96999e-06 [split_matmul_comm_elemetwise]: 4.80001e-06 [split_layernorm_comm]: 4.25e-06 [handle_group_info]: 3.51001e-06 [symbol_engine_optimizer]: 0.00010652, [1] [Cycle 1]: 9.92e-05, [6] [build]: 3.16001e-06 [elim_shapecalc]: 1.166e-05 [elim_not_effective]: 1.578e-05 [opt_reshape]: 8.66002e-06 [fold_const_symbol]: 1.264e-05 [renormalize]: 1.80007e-07 [detach_backward]: 3.63e-06 [pipeline_parallel_scheduler]: 2.03002e-06 [auto_monad_reorder]: 2.305e-05 [get_jit_bprop_graph]: 1.32e-06 [rewriter_after_jit_bprop_graph]: 4.63001e-06 [opt_after_jit_grad]: 0.00055454 [validate]: 4.945e-05 Sums bootstrap : 0.000387s : 3.41% type_inference : 0.005554s : 48.92% event_method : 0.000019s : 0.17% auto_monad : 0.000064s : 0.56% graph_reusing : 0.000007s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000033s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.24% optimize.rewriter_before_opt_a : 0.000087s : 0.76% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.47% optimize.opt_a.loop_unroll : 0.000039s : 0.34% optimize.opt_a.a_1 : 0.000846s : 7.45% optimize.opt_a.with_stream_mark : 0.000030s : 0.26% optimize.opt_a.recompute_prepare : 0.000020s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000262s : 2.31% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.14% optimize.opt_a.merge_send_recv : 0.000017s : 0.15% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000026s : 0.23% optimize.opt_a.flash_sp : 0.000013s : 0.12% optimize.opt_a.merge_comm : 0.000010s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.16% optimize.opt_a.virtual_dataset : 0.000016s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.23% optimize.opt_a.a_after_grad : 0.000024s : 0.21% optimize.opt_a.renormalize : 0.000696s : 6.13% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.25% optimize.opt_a.cse : 0.000053s : 0.47% optimize.opt_a.a_3 : 0.000133s : 1.17% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000047s : 0.42% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000010s : 0.09% optimize.mutable_eliminate : 0.000592s : 5.21% optimize.opt_b.b_1 : 0.000198s : 1.74% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000022s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000028s : 0.24% optimize.loop_unroll : 0.000456s : 4.01% optimize.opt_after_cconv.c_1 : 0.000037s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.16% optimize.tuple_transform.d_1 : 0.000053s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000059s : 0.52% optimize.cse_after_recomputation.cse : 0.000015s : 0.13% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000019s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000026s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.20% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000555s : 4.88% validate : 0.000049s : 0.44% Time group info: ------[substitution.] 0.000207 38 11.83% : 0.000024s : 3: substitution.cast_eliminate 1.35% : 0.000003s : 3: substitution.elim_not_effective 0.82% : 0.000002s : 3: substitution.fold_const_symbol 3.29% : 0.000007s : 5: substitution.graph_param_transform 68.76% : 0.000142s : 4: substitution.inline 2.30% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.98% : 0.000006s : 6: substitution.remove_not_recompute_node 2.65% : 0.000005s : 4: substitution.replace_old_param 6.03% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005499 2 87.46% : 0.004809s : 1: type_inference.infer 12.54% : 0.000690s : 1: type_inference.specialize ------[replace.] 0.000062 8 62.10% : 0.000039s : 4: replace.inline 37.90% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000150 8 92.87% : 0.000139s : 4: match.inline 7.13% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000239 1504 0.90% : 0.000002s : 15: predicate.accumulaten_eliminater 0.76% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.58% : 0.000001s : 10: predicate.addn_check_dump 0.92% : 0.000002s : 15: predicate.addn_zero_filter 0.84% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.03% : 0.000005s : 25: predicate.arithmetic_simplify 1.03% : 0.000002s : 15: predicate.cast_eliminate 0.66% : 0.000002s : 10: predicate.check_bprop_eliminate 0.62% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.65% : 0.000002s : 10: predicate.depend_value_elim 0.92% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.01% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.94% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.88% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.40% : 0.000001s : 5: predicate.elim_not_effective 0.41% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.15% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_depend_swap 1.93% : 0.000005s : 30: predicate.environ_get_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.33% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.17% : 0.000005s : 23: predicate.float_depend_g_call 0.62% : 0.000001s : 10: predicate.float_environ_get_switch 0.85% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.21% : 0.000001s : 5: predicate.fold_const_symbol 0.70% : 0.000002s : 10: predicate.get_grad_eliminate 0.29% : 0.000001s : 5: predicate.graph_param_transform 0.70% : 0.000002s : 10: predicate.incorporate_call 0.56% : 0.000001s : 10: predicate.incorporate_call_switch 6.31% : 0.000015s : 68: predicate.inline 0.87% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.75% : 0.000002s : 10: predicate.less_batch_normalization 1.88% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.62% : 0.000006s : 44: predicate.load_eliminater 0.78% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.12% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.73% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.65% : 0.000002s : 10: predicate.merge_addn 0.65% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.62% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 15: predicate.minmaximum_grad 0.89% : 0.000002s : 5: predicate.mutable_eliminate 0.38% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.63% : 0.000004s : 23: predicate.partial_defer_inline 1.65% : 0.000004s : 24: predicate.partial_eliminate 1.06% : 0.000003s : 15: predicate.print_const_string_wrapper 0.58% : 0.000001s : 10: predicate.reduce_all_const_elim 1.13% : 0.000003s : 15: predicate.reduce_eliminate 2.47% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.58% : 0.000001s : 10: predicate.remove_not_recompute_node 1.38% : 0.000003s : 29: predicate.replace_applicator 0.45% : 0.000001s : 10: predicate.replace_old_param 0.26% : 0.000001s : 5: predicate.reset_defer_inline 0.98% : 0.000002s : 15: predicate.reshape_eliminate 0.65% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 5: predicate.row_tensor_eliminate 0.73% : 0.000002s : 10: predicate.same_eliminate 0.60% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.87% : 0.000002s : 10: predicate.shard_identity_eliminate 0.66% : 0.000002s : 10: predicate.special_op_eliminate 0.79% : 0.000002s : 10: predicate.specialize_transform 0.89% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.43% : 0.000003s : 23: predicate.switch_defer_inline 2.04% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.88% : 0.000012s : 74: predicate.switch_simplify 0.87% : 0.000002s : 15: predicate.tile_eliminate 0.89% : 0.000002s : 15: predicate.transpose_eliminate 1.60% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.73% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.50% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.19% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.80% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.43% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.16% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 5: predicate.value_based_eliminate 0.66% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.66% : 0.000002s : 10: predicate.virtual_output_eliminate 0.34% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000469 11 52.37% : 0.000246s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.63% : 0.000223s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027731 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.73% : 0.003253s : 1: add_attr 11.68% : 0.003238s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000063s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000074s : 1: auto_monad 0.11% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.52% : 0.000423s : 1: bootstrap 0.11% : 0.000031s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000036s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000019s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000029s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.05% : 0.000013s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.67% : 0.000462s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.16% : 0.000598s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 4.96% : 0.001376s : 78: opt.transform.opt_a 0.13% : 0.000036s : 1: opt.transform.opt_after_cconv 0.11% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.49% : 0.000136s : 28: opt.transform.opt_b 0.21% : 0.000059s : 2: opt.transform.opt_trans_graph 0.16% : 0.000045s : 4: opt.transform.symbol_engine_opt 12.11% : 0.003358s : 1: opt_a 0.52% : 0.000144s : 1: opt_after_cconv 2.04% : 0.000566s : 1: opt_after_jit_grad 1.13% : 0.000312s : 1: opt_b 22.33% : 0.006192s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000013s : 1: order_py_execute_after_rewriter 0.11% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000040s : 1: pre_auto_parallel 0.11% : 0.000031s : 1: py_interpret_to_execute 0.07% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000021s : 1: remove_dup_value 1.36% : 0.000377s : 1: renormalize.infer 1.12% : 0.000310s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000051s : 1: rewriter_after_opt_a 0.33% : 0.000090s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000109s : 1: symbol_engine_optimizer 0.38% : 0.000105s : 1: tuple_transform 20.17% : 0.005594s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:38.853.688 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.015408, [21] [bootstrap]: 0.00039025 [type_inference]: 0.00602903 [event_method]: 1.896e-05 [auto_monad]: 6.137e-05 [graph_reusing]: 5.79e-06 [inline]: 2.37001e-06 [add_attr]: 0.0030668, [1] [add_attr_with_inline]: 0.00305801, [1] [Cycle 1]: 5.287e-05, [2] [tag_attr]: 1.91e-05 [meta_addattr_fg_expand]: 5.92001e-06 [parallel-infer-symbol]: 3.59002e-06 [pre_auto_parallel]: 3.028e-05 [insert-virtual-dataset]: 2.88998e-06 [parallel-infer-symbol-second]: 8.29983e-07 [dataset_repeat_opt]: 2.24999e-06 [pipeline_split]: 1.66002e-06 [optimize]: 0.00509251, [53] [py_interpret_to_execute]: 2.161e-05 [rewriter_before_opt_a]: 7.684e-05 [opt_a]: 0.00292908, [2] [Cycle 1]: 0.00216862, [45] [expand_dump_flag]: 1.99e-06 [switch_simplify]: 4.435e-05 [loop_unroll]: 3.102e-05 [a_1]: 0.00068812 [with_stream_mark]: 1.768e-05 [recompute_prepare]: 1.167e-05 [updatestate_depend_eliminate]: 4.89e-06 [updatestate_assign_eliminate]: 3.95e-06 [updatestate_loads_eliminate]: 3.46999e-06 [parameter_eliminate]: 2.06e-06 [a_2]: 0.00010563 [accelerated_algorithm]: 9.20001e-06 [shard]: 1.92001e-06 [meta_shard_fg_expand]: 2.16e-06 [shard_inline]: 8.59e-06 [merge_send_recv]: 9.46e-06 [auto_parallel]: 7.89002e-06 [parallel]: 1.941e-05 [flash_sp]: 8.62e-06 [merge_comm]: 4.84e-06 [allreduce_fusion]: 4.46002e-06 [matmul_add_comm_reduction]: 1.052e-05 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 9.55001e-06 [virtual_dataset]: 8.55001e-06 [get_grad_eliminate_]: 7.65e-06 [virtual_output]: 7.78001e-06 [merge_forward]: 4.27e-06 [cell_reuse_recompute_pass]: 1.36002e-06 [offload_activation]: 1.081e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.73e-05 [merge_recompute_call_nodes]: 1.62999e-06 [before_grad]: 1.435e-05 [set_forward_comm_id_for_comm_node_pass]: 4.50001e-06 [meta_fg_expand]: 3.51999e-06 [flash_sp_send_recv_attached]: 2.94999e-06 [receive_attached]: 2.17999e-06 [after_resolve]: 1.391e-05 [a_after_grad]: 1.221e-05 [renormalize]: 0.00068169 [add_forward_monad_depend]: 4.32998e-06 [auto_monad_grad]: 2.46998e-06 [auto_monad_eliminator]: 1.567e-05 [cse]: 3.544e-05 [a_3]: 5.56e-05 [Cycle 2]: 0.00075004, [45] [expand_dump_flag]: 1.35999e-06 [switch_simplify]: 8.59998e-06 [loop_unroll]: 7.58001e-06 [a_1]: 0.00017226 [with_stream_mark]: 1.024e-05 [recompute_prepare]: 7.66999e-06 [updatestate_depend_eliminate]: 3.46999e-06 [updatestate_assign_eliminate]: 2.94001e-06 [updatestate_loads_eliminate]: 2.80002e-06 [parameter_eliminate]: 7.99977e-07 [a_2]: 9.211e-05 [accelerated_algorithm]: 7.77998e-06 [shard]: 1.00999e-06 [meta_shard_fg_expand]: 1.59998e-06 [shard_inline]: 7.37002e-06 [merge_send_recv]: 5.82001e-06 [auto_parallel]: 6.65002e-06 [parallel]: 4.33999e-06 [flash_sp]: 6.58e-06 [merge_comm]: 4.1e-06 [allreduce_fusion]: 3.6e-06 [matmul_add_comm_reduction]: 6.51999e-06 [allreduce_slice_to_reducescatter]: 5.3001e-07 [virtual_shard_identity]: 8.06001e-06 [virtual_dataset]: 7.31999e-06 [get_grad_eliminate_]: 6.86999e-06 [virtual_output]: 7e-06 [merge_forward]: 3.20002e-06 [cell_reuse_recompute_pass]: 1.30001e-06 [offload_activation]: 7.41999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.421e-05 [merge_recompute_call_nodes]: 1.09e-06 [before_grad]: 1.109e-05 [set_forward_comm_id_for_comm_node_pass]: 4e-06 [meta_fg_expand]: 2.68998e-06 [flash_sp_send_recv_attached]: 9.10019e-07 [receive_attached]: 1.35999e-06 [after_resolve]: 1.229e-05 [a_after_grad]: 1.12e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.40999e-06 [auto_monad_grad]: 9.09989e-07 [auto_monad_eliminator]: 8.40001e-06 [cse]: 1.983e-05 [a_3]: 4.561e-05 [py_interpret_to_execute_after_opt_a]: 1.042e-05 [slice_cell_reuse_recomputed_activation]: 1.99e-06 [rewriter_after_opt_a]: 3.925e-05 [convert_after_rewriter]: 7.27002e-06 [order_py_execute_after_rewriter]: 5.66e-06 [mutable_eliminate]: 0.00053752 [opt_b]: 0.00024064, [1] [Cycle 1]: 0.00023374, [7] [b_1]: 0.00015458 [b_2]: 9.37999e-06 [updatestate_depend_eliminate]: 5.67001e-06 [updatestate_assign_eliminate]: 3.04999e-06 [updatestate_loads_eliminate]: 2.93e-06 [renormalize]: 3.60014e-07 [cse]: 2.328e-05 [optimize_parallel_all_gather_comm]: 1.746e-05 [overlap_param_gather]: 2.00002e-06 [cconv]: 2.359e-05 [loop_unroll]: 0.00044204 [opt_after_cconv]: 0.0001087, [1] [Cycle 1]: 0.00010309, [7] [c_1]: 3.621e-05 [parameter_eliminate]: 1.79e-06 [updatestate_depend_eliminate]: 4.65999e-06 [updatestate_assign_eliminate]: 3.05002e-06 [updatestate_loads_eliminate]: 2.72001e-06 [cse]: 2.107e-05 [renormalize]: 5.19998e-07 [remove_dup_value]: 1.435e-05 [tuple_transform]: 8.404e-05, [1] [Cycle 1]: 7.929e-05, [4] [d_1]: 5.111e-05 [none_parameter_eliminate]: 1.62999e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 8.12998e-06 [partial_unused_args_eliminate]: 1.74e-06 [add_recomputation]: 5.653e-05 [cse_after_recomputation]: 2.63e-05, [1] [Cycle 1]: 2.181e-05, [1] [cse]: 1.602e-05 [environ_conv]: 6.10002e-06 [swap_dp_allreduce_reducescatter]: 6.01e-06 [bias_add_comm_swap]: 2.46e-06 [label_micro_interleaved_index]: 4.72998e-06 [label_fine_grained_interleaved_index]: 2.83e-06 [merge_cast_opt]: 1.28002e-06 [slice_recompute_activation]: 2.09999e-06 [micro_interleaved_order_control]: 2.34999e-06 [assign_add_opt]: 1.64e-06 [ForceFp32Comm]: 8.40024e-07 [remove_cast_before_assign_add]: 1.03001e-06 [full_micro_interleaved_order_control]: 2.39999e-06 [reorder_send_recv_between_fp_bp]: 2.94999e-06 [comm_op_add_attrs]: 1.30999e-06 [add_comm_op_reuse_tag]: 1.15999e-06 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.04998e-06 [overlap_opt_shard_in_pipeline]: 1.86e-06 [overlap_opt_shard_grad_in_pipeline]: 1.87999e-06 [control_data_broadcast_order]: 1.455e-05 [grouped_pairwise_exchange_alltoall]: 1.72999e-06 [offloading_packed_experts]: 4.65001e-06 [overlap_recompute_and_grad_model_parallel]: 5.74999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22e-06 [overlap_recompute_allgather_and_fa_grad]: 1.47001e-06 [overlap_recompute_comm]: 2.62001e-06 [overlap_grad_ring_attention]: 4.42e-06 [overlap_grad_flash_sp]: 2.125e-05 [begin_end_overlap_inline]: 5.79981e-07 [split_matmul_comm_elemetwise]: 2.39001e-06 [split_layernorm_comm]: 1.80001e-06 [handle_group_info]: 1.24e-06 [symbol_engine_optimizer]: 8.552e-05, [1] [Cycle 1]: 8.126e-05, [6] [build]: 3.56999e-06 [elim_shapecalc]: 1.223e-05 [elim_not_effective]: 1.55e-05 [opt_reshape]: 8e-06 [fold_const_symbol]: 1.239e-05 [renormalize]: 1.90019e-07 [detach_backward]: 2.16998e-06 [pipeline_parallel_scheduler]: 1.56002e-06 [auto_monad_reorder]: 2.179e-05 [get_jit_bprop_graph]: 1.62001e-06 [rewriter_after_jit_bprop_graph]: 3.53e-06 [opt_after_jit_grad]: 0.0004844 [validate]: 3.965e-05 Sums bootstrap : 0.000390s : 3.44% type_inference : 0.006029s : 53.09% event_method : 0.000019s : 0.17% auto_monad : 0.000061s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000030s : 0.27% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000022s : 0.19% optimize.rewriter_before_opt_a : 0.000077s : 0.68% optimize.opt_a.expand_dump_flag : 0.000003s : 0.03% optimize.opt_a.switch_simplify : 0.000053s : 0.47% optimize.opt_a.loop_unroll : 0.000039s : 0.34% optimize.opt_a.a_1 : 0.000860s : 7.58% optimize.opt_a.with_stream_mark : 0.000028s : 0.25% optimize.opt_a.recompute_prepare : 0.000019s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000198s : 1.74% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.14% optimize.opt_a.merge_send_recv : 0.000015s : 0.13% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.21% optimize.opt_a.flash_sp : 0.000015s : 0.13% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.16% optimize.opt_a.virtual_dataset : 0.000016s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.23% optimize.opt_a.a_after_grad : 0.000023s : 0.21% optimize.opt_a.renormalize : 0.000682s : 6.00% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.05% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.21% optimize.opt_a.cse : 0.000055s : 0.49% optimize.opt_a.a_3 : 0.000101s : 0.89% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000039s : 0.35% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000538s : 4.73% optimize.opt_b.b_1 : 0.000155s : 1.36% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.21% optimize.loop_unroll : 0.000442s : 3.89% optimize.opt_after_cconv.c_1 : 0.000036s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000021s : 0.19% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000014s : 0.13% optimize.tuple_transform.d_1 : 0.000051s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000057s : 0.50% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.19% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000484s : 4.27% validate : 0.000040s : 0.35% Time group info: ------[substitution.] 0.000200 38 10.42% : 0.000021s : 3: substitution.cast_eliminate 1.13% : 0.000002s : 3: substitution.elim_not_effective 0.93% : 0.000002s : 3: substitution.fold_const_symbol 3.32% : 0.000007s : 5: substitution.graph_param_transform 69.08% : 0.000138s : 4: substitution.inline 2.10% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.51% : 0.000007s : 6: substitution.remove_not_recompute_node 2.63% : 0.000005s : 4: substitution.replace_old_param 6.87% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005974 2 88.51% : 0.005288s : 1: type_inference.infer 11.49% : 0.000686s : 1: type_inference.specialize ------[replace.] 0.000060 8 58.51% : 0.000035s : 4: replace.inline 41.49% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000148 8 91.94% : 0.000136s : 4: match.inline 8.06% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000238 1504 0.93% : 0.000002s : 15: predicate.accumulaten_eliminater 0.69% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.67% : 0.000002s : 10: predicate.addn_check_dump 0.91% : 0.000002s : 15: predicate.addn_zero_filter 0.87% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.25% : 0.000005s : 25: predicate.arithmetic_simplify 1.09% : 0.000003s : 15: predicate.cast_eliminate 0.59% : 0.000001s : 10: predicate.check_bprop_eliminate 0.74% : 0.000002s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.73% : 0.000002s : 10: predicate.depend_value_elim 0.92% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.12% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.94% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.92% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 5: predicate.elim_not_effective 0.39% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_depend_swap 1.73% : 0.000004s : 30: predicate.environ_get_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.37% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.17% : 0.000005s : 23: predicate.float_depend_g_call 0.58% : 0.000001s : 10: predicate.float_environ_get_switch 0.88% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.69% : 0.000002s : 10: predicate.get_grad_eliminate 0.34% : 0.000001s : 5: predicate.graph_param_transform 0.66% : 0.000002s : 10: predicate.incorporate_call 0.56% : 0.000001s : 10: predicate.incorporate_call_switch 6.03% : 0.000014s : 68: predicate.inline 0.80% : 0.000002s : 10: predicate.inline_without_move 0.36% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.89% : 0.000002s : 10: predicate.less_batch_normalization 1.81% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.60% : 0.000006s : 44: predicate.load_eliminater 0.77% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.31% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.78% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.64% : 0.000002s : 10: predicate.merge_addn 0.63% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.62% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.88% : 0.000002s : 15: predicate.minmaximum_grad 0.80% : 0.000002s : 5: predicate.mutable_eliminate 0.41% : 0.000001s : 5: predicate.opt_reshape 0.36% : 0.000001s : 5: predicate.parallel_virtual_node 1.76% : 0.000004s : 23: predicate.partial_defer_inline 1.64% : 0.000004s : 24: predicate.partial_eliminate 0.93% : 0.000002s : 15: predicate.print_const_string_wrapper 0.63% : 0.000001s : 10: predicate.reduce_all_const_elim 1.17% : 0.000003s : 15: predicate.reduce_eliminate 2.47% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 10: predicate.remove_not_recompute_node 1.31% : 0.000003s : 29: predicate.replace_applicator 0.51% : 0.000001s : 10: predicate.replace_old_param 0.24% : 0.000001s : 5: predicate.reset_defer_inline 0.94% : 0.000002s : 15: predicate.reshape_eliminate 0.65% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 5: predicate.row_tensor_eliminate 0.71% : 0.000002s : 10: predicate.same_eliminate 0.43% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.87% : 0.000002s : 10: predicate.shard_identity_eliminate 0.74% : 0.000002s : 10: predicate.special_op_eliminate 0.79% : 0.000002s : 10: predicate.specialize_transform 0.76% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.46% : 0.000003s : 23: predicate.switch_defer_inline 2.03% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.02% : 0.000012s : 74: predicate.switch_simplify 0.88% : 0.000002s : 15: predicate.tile_eliminate 0.88% : 0.000002s : 15: predicate.transpose_eliminate 1.55% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.60% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.78% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.45% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.21% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 5: predicate.value_based_eliminate 0.76% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.73% : 0.000002s : 10: predicate.virtual_output_eliminate 0.30% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000479 11 52.53% : 0.000252s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.47% : 0.000227s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025800 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.90% : 0.003071s : 1: add_attr 11.87% : 0.003062s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000061s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000067s : 1: auto_monad 0.10% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.61% : 0.000417s : 1: bootstrap 0.10% : 0.000027s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.74% : 0.000450s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.12% : 0.000547s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.05% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 5.35% : 0.001381s : 78: opt.transform.opt_a 0.14% : 0.000035s : 1: opt.transform.opt_after_cconv 0.11% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.51% : 0.000132s : 28: opt.transform.opt_b 0.22% : 0.000057s : 2: opt.transform.opt_trans_graph 0.17% : 0.000044s : 4: opt.transform.symbol_engine_opt 11.37% : 0.002933s : 1: opt_a 0.43% : 0.000112s : 1: opt_after_cconv 1.91% : 0.000493s : 1: opt_after_jit_grad 0.95% : 0.000244s : 1: opt_b 19.76% : 0.005098s : 1: optimize 0.08% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000034s : 1: pre_auto_parallel 0.10% : 0.000026s : 1: py_interpret_to_execute 0.05% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000005s : 1: remove_cast_before_assign_add 0.07% : 0.000018s : 1: remove_dup_value 1.44% : 0.000371s : 1: renormalize.infer 1.18% : 0.000304s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000043s : 1: rewriter_after_opt_a 0.31% : 0.000081s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000088s : 1: symbol_engine_optimizer 0.34% : 0.000087s : 1: tuple_transform 23.43% : 0.006045s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:39.544.19 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:39.546.93 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0175546, [21] [bootstrap]: 0.00040792 [type_inference]: 0.00575369 [event_method]: 1.907e-05 [auto_monad]: 6.569e-05 [graph_reusing]: 5.49e-06 [inline]: 1.91998e-06 [add_attr]: 0.00309968, [1] [add_attr_with_inline]: 0.00309057, [1] [Cycle 1]: 6.618e-05, [2] [tag_attr]: 1.992e-05 [meta_addattr_fg_expand]: 5.86e-06 [parallel-infer-symbol]: 3.36001e-06 [pre_auto_parallel]: 3.301e-05 [insert-virtual-dataset]: 2.60002e-06 [parallel-infer-symbol-second]: 8.80013e-07 [dataset_repeat_opt]: 1.77999e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.00686955, [53] [py_interpret_to_execute]: 3.378e-05 [rewriter_before_opt_a]: 9.278e-05 [opt_a]: 0.00361527, [2] [Cycle 1]: 0.00247532, [45] [expand_dump_flag]: 3.33998e-06 [switch_simplify]: 3.991e-05 [loop_unroll]: 3.252e-05 [a_1]: 0.00071037 [with_stream_mark]: 1.475e-05 [recompute_prepare]: 1.119e-05 [updatestate_depend_eliminate]: 5.07e-06 [updatestate_assign_eliminate]: 4.29002e-06 [updatestate_loads_eliminate]: 4.05e-06 [parameter_eliminate]: 1.91e-06 [a_2]: 0.00014736 [accelerated_algorithm]: 9.88002e-06 [shard]: 1.89e-06 [meta_shard_fg_expand]: 2.19999e-06 [shard_inline]: 9.27001e-06 [merge_send_recv]: 1.095e-05 [auto_parallel]: 7.41999e-06 [parallel]: 1.873e-05 [flash_sp]: 9.97001e-06 [merge_comm]: 5.84999e-06 [allreduce_fusion]: 5.00999e-06 [matmul_add_comm_reduction]: 9.81003e-06 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 1.079e-05 [virtual_dataset]: 1.25e-05 [get_grad_eliminate_]: 9.05999e-06 [virtual_output]: 8.72e-06 [merge_forward]: 5.14998e-06 [cell_reuse_recompute_pass]: 1.28002e-06 [offload_activation]: 1.096e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.934e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 1.531e-05 [set_forward_comm_id_for_comm_node_pass]: 5.53002e-06 [meta_fg_expand]: 4.18999e-06 [flash_sp_send_recv_attached]: 2.75002e-06 [receive_attached]: 3.09999e-06 [after_resolve]: 1.415e-05 [a_after_grad]: 1.413e-05 [renormalize]: 0.0007113 [add_forward_monad_depend]: 5.64998e-06 [auto_monad_grad]: 2.71e-06 [auto_monad_eliminator]: 1.86e-05 [cse]: 5.358e-05 [a_3]: 8.619e-05 [Cycle 2]: 0.0011243, [45] [expand_dump_flag]: 2.05002e-06 [switch_simplify]: 1.114e-05 [loop_unroll]: 9.54999e-06 [a_1]: 0.00023541 [with_stream_mark]: 1.432e-05 [recompute_prepare]: 9.66e-06 [updatestate_depend_eliminate]: 4.95999e-06 [updatestate_assign_eliminate]: 4.07998e-06 [updatestate_loads_eliminate]: 3.88001e-06 [parameter_eliminate]: 1.63002e-06 [a_2]: 0.00014291 [accelerated_algorithm]: 1.031e-05 [shard]: 1.52999e-06 [meta_shard_fg_expand]: 2.81e-06 [shard_inline]: 9.79e-06 [merge_send_recv]: 8.75999e-06 [auto_parallel]: 8.99998e-06 [parallel]: 6.02999e-06 [flash_sp]: 3.85998e-06 [merge_comm]: 5.27001e-06 [allreduce_fusion]: 7.48999e-06 [matmul_add_comm_reduction]: 9.48997e-06 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 1.141e-05 [virtual_dataset]: 9.56003e-06 [get_grad_eliminate_]: 9.56e-06 [virtual_output]: 9.52999e-06 [merge_forward]: 5.08002e-06 [cell_reuse_recompute_pass]: 2.13998e-06 [offload_activation]: 9.99999e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.053e-05 [merge_recompute_call_nodes]: 1.19e-06 [before_grad]: 1.533e-05 [set_forward_comm_id_for_comm_node_pass]: 5.18002e-06 [meta_fg_expand]: 3.98999e-06 [flash_sp_send_recv_attached]: 1.19998e-06 [receive_attached]: 1.49998e-06 [after_resolve]: 1.393e-05 [a_after_grad]: 1.494e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.35999e-06 [auto_monad_grad]: 1.09e-06 [auto_monad_eliminator]: 1.061e-05 [cse]: 2.545e-05 [a_3]: 7.33e-05 [py_interpret_to_execute_after_opt_a]: 1.65e-05 [slice_cell_reuse_recomputed_activation]: 5.15001e-06 [rewriter_after_opt_a]: 5.07e-05 [convert_after_rewriter]: 1.247e-05 [order_py_execute_after_rewriter]: 9.97999e-06 [mutable_eliminate]: 0.00101793 [opt_b]: 0.00038782, [1] [Cycle 1]: 0.00037693, [7] [b_1]: 0.00025273 [b_2]: 1.261e-05 [updatestate_depend_eliminate]: 7.96001e-06 [updatestate_assign_eliminate]: 3.93999e-06 [updatestate_loads_eliminate]: 4.02e-06 [renormalize]: 9.50007e-07 [cse]: 3.25e-05 [optimize_parallel_all_gather_comm]: 2.482e-05 [overlap_param_gather]: 5.30001e-06 [cconv]: 3.082e-05 [loop_unroll]: 0.00051758 [opt_after_cconv]: 0.00016613, [1] [Cycle 1]: 0.00015651, [7] [c_1]: 4.814e-05 [parameter_eliminate]: 3.23e-06 [updatestate_depend_eliminate]: 7.62002e-06 [updatestate_assign_eliminate]: 4.02e-06 [updatestate_loads_eliminate]: 3.93999e-06 [cse]: 2.865e-05 [renormalize]: 8.59989e-07 [remove_dup_value]: 4.75e-05 [tuple_transform]: 0.00012413, [1] [Cycle 1]: 0.00011582, [4] [d_1]: 6.93e-05 [none_parameter_eliminate]: 1.88997e-06 [renormalize]: 3.19997e-07 [switch_simplify]: 1.063e-05 [partial_unused_args_eliminate]: 4.63001e-06 [add_recomputation]: 6.487e-05 [cse_after_recomputation]: 3.698e-05, [1] [Cycle 1]: 2.928e-05, [1] [cse]: 1.897e-05 [environ_conv]: 1.105e-05 [swap_dp_allreduce_reducescatter]: 9.94999e-06 [bias_add_comm_swap]: 5.77001e-06 [label_micro_interleaved_index]: 8.40001e-06 [label_fine_grained_interleaved_index]: 5.59e-06 [merge_cast_opt]: 4.84998e-06 [slice_recompute_activation]: 4.68999e-06 [micro_interleaved_order_control]: 4.87e-06 [assign_add_opt]: 3.85998e-06 [ForceFp32Comm]: 3.53e-06 [remove_cast_before_assign_add]: 5.34e-06 [full_micro_interleaved_order_control]: 6.31e-06 [reorder_send_recv_between_fp_bp]: 6.12999e-06 [comm_op_add_attrs]: 4.09002e-06 [add_comm_op_reuse_tag]: 3.48999e-06 [interleave_split_concat_branches]: 3.8e-06 [interleave_parallel_branches]: 4e-06 [overlap_opt_shard_in_pipeline]: 4.05998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.83001e-06 [control_data_broadcast_order]: 2.164e-05 [grouped_pairwise_exchange_alltoall]: 4.28001e-06 [offloading_packed_experts]: 7.66001e-06 [overlap_recompute_and_grad_model_parallel]: 8.71002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.98001e-06 [overlap_recompute_allgather_and_fa_grad]: 4.3e-06 [overlap_recompute_comm]: 5.10001e-06 [overlap_grad_ring_attention]: 8.2e-06 [overlap_grad_flash_sp]: 2.955e-05 [begin_end_overlap_inline]: 3.45998e-06 [split_matmul_comm_elemetwise]: 5.38002e-06 [split_layernorm_comm]: 4.72e-06 [handle_group_info]: 3.61999e-06 [symbol_engine_optimizer]: 0.00012505, [1] [Cycle 1]: 0.00011742, [6] [build]: 3.88001e-06 [elim_shapecalc]: 1.438e-05 [elim_not_effective]: 1.961e-05 [opt_reshape]: 1.2e-05 [fold_const_symbol]: 1.64e-05 [renormalize]: 2.00002e-07 [detach_backward]: 3.98999e-06 [pipeline_parallel_scheduler]: 1.82999e-06 [auto_monad_reorder]: 2.635e-05 [get_jit_bprop_graph]: 1.73002e-06 [rewriter_after_jit_bprop_graph]: 5.10999e-06 [opt_after_jit_grad]: 0.00059984 [validate]: 4.791e-05 Sums bootstrap : 0.000408s : 3.24% type_inference : 0.005754s : 45.73% event_method : 0.000019s : 0.15% auto_monad : 0.000066s : 0.52% graph_reusing : 0.000005s : 0.04% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.26% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000034s : 0.27% optimize.rewriter_before_opt_a : 0.000093s : 0.74% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000051s : 0.41% optimize.opt_a.loop_unroll : 0.000042s : 0.33% optimize.opt_a.a_1 : 0.000946s : 7.52% optimize.opt_a.with_stream_mark : 0.000029s : 0.23% optimize.opt_a.recompute_prepare : 0.000021s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000290s : 2.31% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.16% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000019s : 0.15% optimize.opt_a.merge_send_recv : 0.000020s : 0.16% optimize.opt_a.auto_parallel : 0.000016s : 0.13% optimize.opt_a.parallel : 0.000025s : 0.20% optimize.opt_a.flash_sp : 0.000014s : 0.11% optimize.opt_a.merge_comm : 0.000011s : 0.09% optimize.opt_a.allreduce_fusion : 0.000012s : 0.10% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.18% optimize.opt_a.virtual_dataset : 0.000022s : 0.18% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.15% optimize.opt_a.virtual_output : 0.000018s : 0.15% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000040s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000031s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000008s : 0.07% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000028s : 0.22% optimize.opt_a.a_after_grad : 0.000029s : 0.23% optimize.opt_a.renormalize : 0.000711s : 5.65% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.23% optimize.opt_a.cse : 0.000079s : 0.63% optimize.opt_a.a_3 : 0.000159s : 1.27% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000051s : 0.40% optimize.convert_after_rewriter : 0.000012s : 0.10% optimize.order_py_execute_after_rewriter : 0.000010s : 0.08% optimize.mutable_eliminate : 0.001018s : 8.09% optimize.opt_b.b_1 : 0.000253s : 2.01% optimize.opt_b.b_2 : 0.000013s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000033s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000031s : 0.24% optimize.loop_unroll : 0.000518s : 4.11% optimize.opt_after_cconv.c_1 : 0.000048s : 0.38% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000029s : 0.23% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000047s : 0.38% optimize.tuple_transform.d_1 : 0.000069s : 0.55% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000065s : 0.52% optimize.cse_after_recomputation.cse : 0.000019s : 0.15% optimize.environ_conv : 0.000011s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.08% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.04% optimize.merge_cast_opt : 0.000005s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000005s : 0.04% optimize.full_micro_interleaved_order_control : 0.000006s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000022s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.07% optimize.overlap_grad_flash_sp : 0.000030s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000005s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.16% optimize.symbol_engine_optimizer.opt_reshape : 0.000012s : 0.10% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000026s : 0.21% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000600s : 4.77% validate : 0.000048s : 0.38% Time group info: ------[substitution.] 0.000222 48 15.96% : 0.000035s : 6: substitution.cast_eliminate 1.25% : 0.000003s : 4: substitution.elim_not_effective 0.90% : 0.000002s : 4: substitution.fold_const_symbol 3.65% : 0.000008s : 6: substitution.graph_param_transform 60.76% : 0.000135s : 4: substitution.inline 2.56% : 0.000006s : 8: substitution.j_node_and_user_rematch 3.60% : 0.000008s : 8: substitution.remove_not_recompute_node 2.03% : 0.000005s : 4: substitution.replace_old_param 9.29% : 0.000021s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005704 2 88.59% : 0.005054s : 1: type_inference.infer 11.41% : 0.000651s : 1: type_inference.specialize ------[replace.] 0.000061 8 61.52% : 0.000038s : 4: replace.inline 38.48% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000151 8 87.64% : 0.000132s : 4: match.inline 12.36% : 0.000019s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000292 1730 0.90% : 0.000003s : 17: predicate.accumulaten_eliminater 0.92% : 0.000003s : 6: predicate.ad_related_special_op_eliminate 0.58% : 0.000002s : 12: predicate.addn_check_dump 1.03% : 0.000003s : 17: predicate.addn_zero_filter 0.78% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.09% : 0.000006s : 29: predicate.arithmetic_simplify 1.05% : 0.000003s : 17: predicate.cast_eliminate 0.78% : 0.000002s : 12: predicate.check_bprop_eliminate 0.61% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.74% : 0.000002s : 12: predicate.depend_value_elim 0.93% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.12% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.90% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.33% : 0.000001s : 6: predicate.elim_not_effective 0.49% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 23: predicate.environ_get_depend_swap 1.79% : 0.000005s : 35: predicate.environ_get_eliminate 1.17% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.38% : 0.000004s : 25: predicate.exchange_switch_depend_value 1.94% : 0.000006s : 25: predicate.float_depend_g_call 0.56% : 0.000002s : 12: predicate.float_environ_get_switch 0.86% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.78% : 0.000002s : 12: predicate.get_grad_eliminate 0.27% : 0.000001s : 6: predicate.graph_param_transform 0.65% : 0.000002s : 12: predicate.incorporate_call 0.57% : 0.000002s : 12: predicate.incorporate_call_switch 5.81% : 0.000017s : 78: predicate.inline 0.92% : 0.000003s : 12: predicate.inline_without_move 0.35% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.89% : 0.000003s : 12: predicate.less_batch_normalization 1.93% : 0.000006s : 33: predicate.list_to_tuple_eliminator_ 2.59% : 0.000008s : 50: predicate.load_eliminater 0.84% : 0.000002s : 6: predicate.loop_unroll_after_grad 1.94% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.77% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.67% : 0.000002s : 12: predicate.merge_addn 0.63% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.70% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 17: predicate.minmaximum_grad 0.83% : 0.000002s : 6: predicate.mutable_eliminate 0.46% : 0.000001s : 6: predicate.opt_reshape 0.47% : 0.000001s : 6: predicate.parallel_virtual_node 1.48% : 0.000004s : 25: predicate.partial_defer_inline 1.58% : 0.000005s : 27: predicate.partial_eliminate 0.84% : 0.000002s : 17: predicate.print_const_string_wrapper 0.79% : 0.000002s : 12: predicate.reduce_all_const_elim 1.18% : 0.000003s : 17: predicate.reduce_eliminate 2.49% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 12: predicate.remove_not_recompute_node 1.30% : 0.000004s : 33: predicate.replace_applicator 0.41% : 0.000001s : 12: predicate.replace_old_param 0.27% : 0.000001s : 6: predicate.reset_defer_inline 0.92% : 0.000003s : 17: predicate.reshape_eliminate 0.68% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.51% : 0.000002s : 6: predicate.row_tensor_eliminate 0.95% : 0.000003s : 12: predicate.same_eliminate 0.45% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 12: predicate.shard_identity_eliminate 0.82% : 0.000002s : 12: predicate.special_op_eliminate 0.80% : 0.000002s : 12: predicate.specialize_transform 0.90% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000003s : 12: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.28% : 0.000004s : 25: predicate.switch_defer_inline 2.05% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.62% : 0.000013s : 81: predicate.switch_simplify 0.84% : 0.000002s : 17: predicate.tile_eliminate 0.88% : 0.000003s : 17: predicate.transpose_eliminate 1.62% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.71% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.63% : 0.000005s : 29: predicate.tuple_list_get_item_depend_reorder 2.93% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.65% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 2.41% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.79% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.41% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.08% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 6: predicate.value_based_eliminate 0.73% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.66% : 0.000002s : 12: predicate.virtual_output_eliminate 0.35% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.62% : 0.000002s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000476 11 54.51% : 0.000259s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.49% : 0.000216s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030107 192 0.12% : 0.000036s : 1: ForceFp32Comm 10.33% : 0.003110s : 1: add_attr 10.28% : 0.003094s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000069s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.25% : 0.000075s : 1: auto_monad 0.11% : 0.000034s : 1: auto_monad_reorder 0.02% : 0.000007s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.48% : 0.000447s : 1: bootstrap 0.11% : 0.000034s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000025s : 1: control_data_broadcast_order 0.05% : 0.000016s : 1: convert_after_rewriter 0.13% : 0.000040s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000021s : 1: detach_backward 0.05% : 0.000014s : 1: environ_conv 0.09% : 0.000029s : 1: event_method 0.03% : 0.000010s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.03% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000007s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.04% : 0.000012s : 1: label_micro_interleaved_index 1.74% : 0.000524s : 1: loop_unroll 0.03% : 0.000008s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 3.41% : 0.001026s : 1: mutable_eliminate 0.04% : 0.000011s : 1: offloading_packed_experts 0.07% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000019s : 1: opt.transform.mutable_eliminate 5.24% : 0.001577s : 78: opt.transform.opt_a 0.15% : 0.000046s : 1: opt.transform.opt_after_cconv 0.13% : 0.000040s : 1: opt.transform.opt_after_jit_grad 0.60% : 0.000182s : 28: opt.transform.opt_b 0.26% : 0.000078s : 2: opt.transform.opt_trans_graph 0.19% : 0.000058s : 4: opt.transform.symbol_engine_opt 12.02% : 0.003619s : 1: opt_a 0.56% : 0.000170s : 1: opt_after_cconv 2.04% : 0.000613s : 1: opt_after_jit_grad 1.30% : 0.000391s : 1: opt_b 24.01% : 0.007228s : 1: optimize 0.09% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.11% : 0.000033s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000011s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000011s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.13% : 0.000040s : 1: pre_auto_parallel 0.13% : 0.000038s : 1: py_interpret_to_execute 0.07% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000010s : 1: remove_cast_before_assign_add 0.17% : 0.000052s : 1: remove_dup_value 1.21% : 0.000364s : 1: renormalize.infer 1.13% : 0.000340s : 1: renormalize.specialize 0.03% : 0.000010s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000054s : 1: rewriter_after_opt_a 0.32% : 0.000097s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000008s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.43% : 0.000128s : 1: symbol_engine_optimizer 0.42% : 0.000127s : 1: tuple_transform 19.22% : 0.005787s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:39.292.880 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0159694, [21] [bootstrap]: 0.00039592 [type_inference]: 0.005806 [event_method]: 1.904e-05 [auto_monad]: 6.562e-05 [graph_reusing]: 4.91002e-06 [inline]: 1.97999e-06 [add_attr]: 0.00318888, [1] [add_attr_with_inline]: 0.00317904, [1] [Cycle 1]: 5.647e-05, [2] [tag_attr]: 2.122e-05 [meta_addattr_fg_expand]: 6.26e-06 [parallel-infer-symbol]: 3.43999e-06 [pre_auto_parallel]: 3.315e-05 [insert-virtual-dataset]: 2.26998e-06 [parallel-infer-symbol-second]: 8.50006e-07 [dataset_repeat_opt]: 1.92001e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.00572837, [53] [py_interpret_to_execute]: 2.817e-05 [rewriter_before_opt_a]: 8.955e-05 [opt_a]: 0.00323932, [2] [Cycle 1]: 0.00232738, [45] [expand_dump_flag]: 3.31001e-06 [switch_simplify]: 4.382e-05 [loop_unroll]: 3.483e-05 [a_1]: 0.00072401 [with_stream_mark]: 1.777e-05 [recompute_prepare]: 1.212e-05 [updatestate_depend_eliminate]: 5.42001e-06 [updatestate_assign_eliminate]: 4.15999e-06 [updatestate_loads_eliminate]: 3.72002e-06 [parameter_eliminate]: 1.72999e-06 [a_2]: 0.00011997 [accelerated_algorithm]: 9.54e-06 [shard]: 1.86e-06 [meta_shard_fg_expand]: 2.41e-06 [shard_inline]: 9.42001e-06 [merge_send_recv]: 1.086e-05 [auto_parallel]: 8.82999e-06 [parallel]: 1.919e-05 [flash_sp]: 9.44e-06 [merge_comm]: 5.24998e-06 [allreduce_fusion]: 5.38002e-06 [matmul_add_comm_reduction]: 1.007e-05 [allreduce_slice_to_reducescatter]: 9.09989e-07 [virtual_shard_identity]: 1.264e-05 [virtual_dataset]: 9.51e-06 [get_grad_eliminate_]: 8.82999e-06 [virtual_output]: 8.91002e-06 [merge_forward]: 4.85999e-06 [cell_reuse_recompute_pass]: 1.64e-06 [offload_activation]: 1.171e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.916e-05 [merge_recompute_call_nodes]: 1.52999e-06 [before_grad]: 1.537e-05 [set_forward_comm_id_for_comm_node_pass]: 5.24e-06 [meta_fg_expand]: 3.86001e-06 [flash_sp_send_recv_attached]: 2.71e-06 [receive_attached]: 2.39001e-06 [after_resolve]: 1.397e-05 [a_after_grad]: 1.412e-05 [renormalize]: 0.00074032 [add_forward_monad_depend]: 6.69999e-06 [auto_monad_grad]: 2.20002e-06 [auto_monad_eliminator]: 1.802e-05 [cse]: 3.861e-05 [a_3]: 6.914e-05 [Cycle 2]: 0.00090115, [45] [expand_dump_flag]: 1.98002e-06 [switch_simplify]: 1.075e-05 [loop_unroll]: 8.50999e-06 [a_1]: 0.00021338 [with_stream_mark]: 1.506e-05 [recompute_prepare]: 9.19998e-06 [updatestate_depend_eliminate]: 4.85999e-06 [updatestate_assign_eliminate]: 3.55e-06 [updatestate_loads_eliminate]: 3.63e-06 [parameter_eliminate]: 1.74998e-06 [a_2]: 0.00011152 [accelerated_algorithm]: 9.97999e-06 [shard]: 1.86e-06 [meta_shard_fg_expand]: 1.96e-06 [shard_inline]: 8.54998e-06 [merge_send_recv]: 7.68001e-06 [auto_parallel]: 1.031e-05 [parallel]: 6.69999e-06 [flash_sp]: 4.26001e-06 [merge_comm]: 4.84e-06 [allreduce_fusion]: 4.58999e-06 [matmul_add_comm_reduction]: 9.07001e-06 [allreduce_slice_to_reducescatter]: 7.39994e-07 [virtual_shard_identity]: 1.117e-05 [virtual_dataset]: 8.75999e-06 [get_grad_eliminate_]: 8.65999e-06 [virtual_output]: 8.57e-06 [merge_forward]: 4.94e-06 [cell_reuse_recompute_pass]: 1.64998e-06 [offload_activation]: 1.232e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.94e-05 [merge_recompute_call_nodes]: 1.24e-06 [before_grad]: 1.53e-05 [set_forward_comm_id_for_comm_node_pass]: 5.19003e-06 [meta_fg_expand]: 3.43e-06 [flash_sp_send_recv_attached]: 1.20001e-06 [receive_attached]: 1.62999e-06 [after_resolve]: 1.322e-05 [a_after_grad]: 1.366e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 2.89999e-06 [auto_monad_grad]: 1.53002e-06 [auto_monad_eliminator]: 1.173e-05 [cse]: 2.677e-05 [a_3]: 5.751e-05 [py_interpret_to_execute_after_opt_a]: 1.348e-05 [slice_cell_reuse_recomputed_activation]: 1.92999e-06 [rewriter_after_opt_a]: 4.808e-05 [convert_after_rewriter]: 8.47e-06 [order_py_execute_after_rewriter]: 6.10002e-06 [mutable_eliminate]: 0.00061586 [opt_b]: 0.0002972, [1] [Cycle 1]: 0.0002896, [7] [b_1]: 0.00019307 [b_2]: 1.116e-05 [updatestate_depend_eliminate]: 8.85001e-06 [updatestate_assign_eliminate]: 4e-06 [updatestate_loads_eliminate]: 4.28999e-06 [renormalize]: 7.79983e-07 [cse]: 3.067e-05 [optimize_parallel_all_gather_comm]: 1.984e-05 [overlap_param_gather]: 2.16e-06 [cconv]: 3.037e-05 [loop_unroll]: 0.00049416 [opt_after_cconv]: 0.00014169, [1] [Cycle 1]: 0.0001344, [7] [c_1]: 4.612e-05 [parameter_eliminate]: 4.95001e-06 [updatestate_depend_eliminate]: 7.30998e-06 [updatestate_assign_eliminate]: 4.04997e-06 [updatestate_loads_eliminate]: 3.58999e-06 [cse]: 3.062e-05 [renormalize]: 8.50006e-07 [remove_dup_value]: 4.54e-05 [tuple_transform]: 0.0001038, [1] [Cycle 1]: 9.9e-05, [4] [d_1]: 6.707e-05 [none_parameter_eliminate]: 1.69e-06 [renormalize]: 2.89991e-07 [switch_simplify]: 9.76e-06 [partial_unused_args_eliminate]: 1.67001e-06 [add_recomputation]: 6.303e-05 [cse_after_recomputation]: 2.961e-05, [1] [Cycle 1]: 2.479e-05, [1] [cse]: 1.821e-05 [environ_conv]: 7.13998e-06 [swap_dp_allreduce_reducescatter]: 6.78998e-06 [bias_add_comm_swap]: 2.75997e-06 [label_micro_interleaved_index]: 4.37e-06 [label_fine_grained_interleaved_index]: 2.73e-06 [merge_cast_opt]: 1.34e-06 [slice_recompute_activation]: 2.09999e-06 [micro_interleaved_order_control]: 2.32999e-06 [assign_add_opt]: 1.39998e-06 [ForceFp32Comm]: 8.50006e-07 [remove_cast_before_assign_add]: 1.32999e-06 [full_micro_interleaved_order_control]: 2.16998e-06 [reorder_send_recv_between_fp_bp]: 3.48999e-06 [comm_op_add_attrs]: 1.46998e-06 [add_comm_op_reuse_tag]: 9.80013e-07 [interleave_split_concat_branches]: 1.50999e-06 [interleave_parallel_branches]: 1.25001e-06 [overlap_opt_shard_in_pipeline]: 1.21002e-06 [overlap_opt_shard_grad_in_pipeline]: 2.26e-06 [control_data_broadcast_order]: 1.815e-05 [grouped_pairwise_exchange_alltoall]: 1.62001e-06 [offloading_packed_experts]: 4.90001e-06 [overlap_recompute_and_grad_model_parallel]: 6.01e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.64001e-06 [overlap_grad_ring_attention]: 5.18002e-06 [overlap_grad_flash_sp]: 2.426e-05 [begin_end_overlap_inline]: 5.49975e-07 [split_matmul_comm_elemetwise]: 2.43002e-06 [split_layernorm_comm]: 1.79e-06 [handle_group_info]: 9.39996e-07 [symbol_engine_optimizer]: 0.00010139, [1] [Cycle 1]: 9.639e-05, [6] [build]: 3.78001e-06 [elim_shapecalc]: 1.639e-05 [elim_not_effective]: 1.812e-05 [opt_reshape]: 1.075e-05 [fold_const_symbol]: 1.641e-05 [renormalize]: 2.30008e-07 [detach_backward]: 2.36998e-06 [pipeline_parallel_scheduler]: 1.40001e-06 [auto_monad_reorder]: 2.242e-05 [get_jit_bprop_graph]: 1.62999e-06 [rewriter_after_jit_bprop_graph]: 3.58e-06 [opt_after_jit_grad]: 0.00050092 [validate]: 4.268e-05 Sums bootstrap : 0.000396s : 3.36% type_inference : 0.005806s : 49.20% event_method : 0.000019s : 0.16% auto_monad : 0.000066s : 0.56% graph_reusing : 0.000005s : 0.04% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.24% optimize.rewriter_before_opt_a : 0.000090s : 0.76% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.46% optimize.opt_a.loop_unroll : 0.000043s : 0.37% optimize.opt_a.a_1 : 0.000937s : 7.94% optimize.opt_a.with_stream_mark : 0.000033s : 0.28% optimize.opt_a.recompute_prepare : 0.000021s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000231s : 1.96% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.17% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000018s : 0.15% optimize.opt_a.merge_send_recv : 0.000019s : 0.16% optimize.opt_a.auto_parallel : 0.000019s : 0.16% optimize.opt_a.parallel : 0.000026s : 0.22% optimize.opt_a.flash_sp : 0.000014s : 0.12% optimize.opt_a.merge_comm : 0.000010s : 0.09% optimize.opt_a.allreduce_fusion : 0.000010s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.20% optimize.opt_a.virtual_dataset : 0.000018s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.15% optimize.opt_a.virtual_output : 0.000017s : 0.15% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000024s : 0.20% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000031s : 0.26% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.23% optimize.opt_a.a_after_grad : 0.000028s : 0.24% optimize.opt_a.renormalize : 0.000740s : 6.27% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.25% optimize.opt_a.cse : 0.000065s : 0.55% optimize.opt_a.a_3 : 0.000127s : 1.07% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000048s : 0.41% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000616s : 5.22% optimize.opt_b.b_1 : 0.000193s : 1.64% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.04% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000031s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000030s : 0.26% optimize.loop_unroll : 0.000494s : 4.19% optimize.opt_after_cconv.c_1 : 0.000046s : 0.39% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000031s : 0.26% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000045s : 0.38% optimize.tuple_transform.d_1 : 0.000067s : 0.57% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000063s : 0.53% optimize.cse_after_recomputation.cse : 0.000018s : 0.15% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000002s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000018s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000024s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.14% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.09% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.14% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000022s : 0.19% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000501s : 4.25% validate : 0.000043s : 0.36% Time group info: ------[substitution.] 0.000224 48 14.13% : 0.000032s : 6: substitution.cast_eliminate 0.97% : 0.000002s : 4: substitution.elim_not_effective 0.98% : 0.000002s : 4: substitution.fold_const_symbol 3.77% : 0.000008s : 6: substitution.graph_param_transform 66.17% : 0.000148s : 4: substitution.inline 2.68% : 0.000006s : 8: substitution.j_node_and_user_rematch 3.70% : 0.000008s : 8: substitution.remove_not_recompute_node 2.26% : 0.000005s : 4: substitution.replace_old_param 5.33% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005753 2 87.80% : 0.005051s : 1: type_inference.infer 12.20% : 0.000702s : 1: type_inference.specialize ------[replace.] 0.000069 8 63.34% : 0.000044s : 4: replace.inline 36.66% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000156 8 93.51% : 0.000146s : 4: match.inline 6.49% : 0.000010s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000283 1730 0.86% : 0.000002s : 17: predicate.accumulaten_eliminater 0.69% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.68% : 0.000002s : 12: predicate.addn_check_dump 0.86% : 0.000002s : 17: predicate.addn_zero_filter 0.77% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.07% : 0.000006s : 29: predicate.arithmetic_simplify 1.11% : 0.000003s : 17: predicate.cast_eliminate 0.64% : 0.000002s : 12: predicate.check_bprop_eliminate 0.58% : 0.000002s : 12: predicate.compare_switch_simplify 0.22% : 0.000001s : 6: predicate.const_output_eliminate 0.64% : 0.000002s : 12: predicate.depend_value_elim 0.88% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 0.97% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.81% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.99% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.29% : 0.000001s : 6: predicate.elim_not_effective 0.56% : 0.000002s : 6: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 23: predicate.environ_get_depend_swap 1.64% : 0.000005s : 35: predicate.environ_get_eliminate 1.12% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.26% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.09% : 0.000006s : 25: predicate.float_depend_g_call 0.64% : 0.000002s : 12: predicate.float_environ_get_switch 0.87% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 6: predicate.fold_const_symbol 0.71% : 0.000002s : 12: predicate.get_grad_eliminate 0.31% : 0.000001s : 6: predicate.graph_param_transform 0.68% : 0.000002s : 12: predicate.incorporate_call 0.59% : 0.000002s : 12: predicate.incorporate_call_switch 6.28% : 0.000018s : 78: predicate.inline 0.88% : 0.000002s : 12: predicate.inline_without_move 0.32% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.83% : 0.000002s : 12: predicate.less_batch_normalization 1.72% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.45% : 0.000007s : 50: predicate.load_eliminater 1.08% : 0.000003s : 6: predicate.loop_unroll_after_grad 2.05% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.71% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.64% : 0.000002s : 12: predicate.merge_addn 0.59% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.65% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 17: predicate.minmaximum_grad 1.24% : 0.000004s : 6: predicate.mutable_eliminate 0.44% : 0.000001s : 6: predicate.opt_reshape 0.44% : 0.000001s : 6: predicate.parallel_virtual_node 1.60% : 0.000005s : 25: predicate.partial_defer_inline 1.61% : 0.000005s : 27: predicate.partial_eliminate 0.84% : 0.000002s : 17: predicate.print_const_string_wrapper 0.64% : 0.000002s : 12: predicate.reduce_all_const_elim 1.10% : 0.000003s : 17: predicate.reduce_eliminate 2.41% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 12: predicate.remove_not_recompute_node 1.41% : 0.000004s : 33: predicate.replace_applicator 0.45% : 0.000001s : 12: predicate.replace_old_param 0.35% : 0.000001s : 6: predicate.reset_defer_inline 0.86% : 0.000002s : 17: predicate.reshape_eliminate 0.66% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.56% : 0.000002s : 6: predicate.row_tensor_eliminate 0.79% : 0.000002s : 12: predicate.same_eliminate 0.59% : 0.000002s : 12: predicate.set_cell_output_no_recompute 0.86% : 0.000002s : 12: predicate.shard_identity_eliminate 0.84% : 0.000002s : 12: predicate.special_op_eliminate 0.76% : 0.000002s : 12: predicate.specialize_transform 1.04% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 1.00% : 0.000003s : 12: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.35% : 0.000004s : 25: predicate.switch_defer_inline 1.93% : 0.000005s : 37: predicate.switch_layer_defer_inline 4.62% : 0.000013s : 81: predicate.switch_simplify 0.86% : 0.000002s : 17: predicate.tile_eliminate 0.91% : 0.000003s : 17: predicate.transpose_eliminate 1.60% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.60% : 0.000005s : 29: predicate.tuple_list_get_item_depend_reorder 3.06% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.45% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.71% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.44% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.17% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 6: predicate.value_based_eliminate 0.82% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.85% : 0.000002s : 12: predicate.virtual_output_eliminate 0.33% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000497 11 51.79% : 0.000258s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.21% : 0.000240s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027471 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.63% : 0.003194s : 1: add_attr 11.59% : 0.003183s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.25% : 0.000067s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000070s : 1: auto_monad 0.10% : 0.000026s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.53% : 0.000420s : 1: bootstrap 0.12% : 0.000034s : 1: cconv 0.02% : 0.000005s : 1: comm_op_add_attrs 0.08% : 0.000022s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.12% : 0.000033s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.09% : 0.000024s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000008s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000005s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.83% : 0.000502s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.28% : 0.000626s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.08% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000023s : 1: opt.transform.mutable_eliminate 5.66% : 0.001556s : 78: opt.transform.opt_a 0.16% : 0.000045s : 1: opt.transform.opt_after_cconv 0.12% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.62% : 0.000170s : 28: opt.transform.opt_b 0.27% : 0.000074s : 2: opt.transform.opt_trans_graph 0.21% : 0.000058s : 4: opt.transform.symbol_engine_opt 11.80% : 0.003242s : 1: opt_a 0.53% : 0.000145s : 1: opt_after_cconv 1.85% : 0.000509s : 1: opt_after_jit_grad 1.10% : 0.000301s : 1: opt_b 20.87% : 0.005734s : 1: optimize 0.09% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000037s : 1: pre_auto_parallel 0.12% : 0.000032s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.18% : 0.000050s : 1: remove_dup_value 1.47% : 0.000403s : 1: renormalize.infer 1.20% : 0.000329s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000052s : 1: rewriter_after_opt_a 0.34% : 0.000094s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000104s : 1: symbol_engine_optimizer 0.39% : 0.000107s : 1: tuple_transform 21.19% : 0.005821s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:39.518.094 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:39.518.378 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0196087, [21] [bootstrap]: 0.00044429 [type_inference]: 0.00687176 [event_method]: 2.275e-05 [auto_monad]: 6.693e-05 [graph_reusing]: 6.07999e-06 [inline]: 2.36e-06 [add_attr]: 0.00397128, [1] [add_attr_with_inline]: 0.00395925, [1] [Cycle 1]: 8.508e-05, [2] [tag_attr]: 2.486e-05 [meta_addattr_fg_expand]: 5.86998e-06 [parallel-infer-symbol]: 3.29001e-06 [pre_auto_parallel]: 3.822e-05 [insert-virtual-dataset]: 2.34001e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 1.92999e-06 [pipeline_split]: 1.79998e-06 [optimize]: 0.00666826, [53] [py_interpret_to_execute]: 3.008e-05 [rewriter_before_opt_a]: 9.76e-05 [opt_a]: 0.00388695, [2] [Cycle 1]: 0.00281317, [45] [expand_dump_flag]: 3.04001e-06 [switch_simplify]: 3.957e-05 [loop_unroll]: 3.208e-05 [a_1]: 0.00078523 [with_stream_mark]: 2.264e-05 [recompute_prepare]: 1.151e-05 [updatestate_depend_eliminate]: 5.52001e-06 [updatestate_assign_eliminate]: 3.98001e-06 [updatestate_loads_eliminate]: 4.42e-06 [parameter_eliminate]: 2.47001e-06 [a_2]: 0.00013097 [accelerated_algorithm]: 9.34e-06 [shard]: 2.66e-06 [meta_shard_fg_expand]: 2.38002e-06 [shard_inline]: 9.07999e-06 [merge_send_recv]: 1.068e-05 [auto_parallel]: 1.117e-05 [parallel]: 2.149e-05 [flash_sp]: 1.249e-05 [merge_comm]: 4.91002e-06 [allreduce_fusion]: 4.36002e-06 [matmul_add_comm_reduction]: 1.317e-05 [allreduce_slice_to_reducescatter]: 9.49978e-07 [virtual_shard_identity]: 1.025e-05 [virtual_dataset]: 8.35001e-06 [get_grad_eliminate_]: 7.82e-06 [virtual_output]: 8.12998e-06 [merge_forward]: 5.00001e-06 [cell_reuse_recompute_pass]: 2.31e-06 [offload_activation]: 1.309e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.443e-05 [merge_recompute_call_nodes]: 1.82999e-06 [before_grad]: 1.943e-05 [set_forward_comm_id_for_comm_node_pass]: 5.67001e-06 [meta_fg_expand]: 3.45e-06 [flash_sp_send_recv_attached]: 3.5e-06 [receive_attached]: 2.81e-06 [after_resolve]: 1.388e-05 [a_after_grad]: 1.304e-05 [renormalize]: 0.00092852 [add_forward_monad_depend]: 7.44002e-06 [auto_monad_grad]: 2.73e-06 [auto_monad_eliminator]: 1.837e-05 [cse]: 2.72e-05 [a_3]: 7.489e-05 [Cycle 2]: 0.00105828, [45] [expand_dump_flag]: 2.01003e-06 [switch_simplify]: 8.99998e-06 [loop_unroll]: 8.08999e-06 [a_1]: 0.00018468 [with_stream_mark]: 1.751e-05 [recompute_prepare]: 7.9e-06 [updatestate_depend_eliminate]: 3.78001e-06 [updatestate_assign_eliminate]: 3.67002e-06 [updatestate_loads_eliminate]: 3.91999e-06 [parameter_eliminate]: 2.31e-06 [a_2]: 0.0001204 [accelerated_algorithm]: 8.34998e-06 [shard]: 1.73002e-06 [meta_shard_fg_expand]: 2.64999e-06 [shard_inline]: 7.5e-06 [merge_send_recv]: 9.97999e-06 [auto_parallel]: 1.077e-05 [parallel]: 9.07999e-06 [flash_sp]: 5.12999e-06 [merge_comm]: 4.55001e-06 [allreduce_fusion]: 4.70999e-06 [matmul_add_comm_reduction]: 9.92001e-06 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 1.222e-05 [virtual_dataset]: 7.56999e-06 [get_grad_eliminate_]: 7.03998e-06 [virtual_output]: 6.99001e-06 [merge_forward]: 5.24e-06 [cell_reuse_recompute_pass]: 2.07001e-06 [offload_activation]: 1.845e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.901e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.332e-05 [set_forward_comm_id_for_comm_node_pass]: 4.80001e-06 [meta_fg_expand]: 3.55998e-06 [flash_sp_send_recv_attached]: 1.83002e-06 [receive_attached]: 2.56998e-06 [after_resolve]: 1.308e-05 [a_after_grad]: 1.149e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.19999e-06 [auto_monad_grad]: 1.89e-06 [auto_monad_eliminator]: 1.136e-05 [cse]: 2.261e-05 [a_3]: 5.976e-05 [py_interpret_to_execute_after_opt_a]: 1.684e-05 [slice_cell_reuse_recomputed_activation]: 4.75001e-06 [rewriter_after_opt_a]: 4.753e-05 [convert_after_rewriter]: 1.15e-05 [order_py_execute_after_rewriter]: 8.96002e-06 [mutable_eliminate]: 0.00071819 [opt_b]: 0.00032458, [1] [Cycle 1]: 0.00031288, [7] [b_1]: 0.00020316 [b_2]: 1.002e-05 [updatestate_depend_eliminate]: 6.63998e-06 [updatestate_assign_eliminate]: 3.47997e-06 [updatestate_loads_eliminate]: 2.92002e-06 [renormalize]: 8.2e-07 [cse]: 2.78e-05 [optimize_parallel_all_gather_comm]: 2.23e-05 [overlap_param_gather]: 4.93001e-06 [cconv]: 3.556e-05 [loop_unroll]: 0.00051581 [opt_after_cconv]: 0.00016254, [1] [Cycle 1]: 0.00015263, [7] [c_1]: 4.205e-05 [parameter_eliminate]: 5.30999e-06 [updatestate_depend_eliminate]: 7.78001e-06 [updatestate_assign_eliminate]: 3.35e-06 [updatestate_loads_eliminate]: 3.53999e-06 [cse]: 2.806e-05 [renormalize]: 3.19997e-07 [remove_dup_value]: 2.168e-05 [tuple_transform]: 0.00011285, [1] [Cycle 1]: 0.00010544, [4] [d_1]: 6.024e-05 [none_parameter_eliminate]: 1.96998e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 9.52999e-06 [partial_unused_args_eliminate]: 5.26002e-06 [add_recomputation]: 6.559e-05 [cse_after_recomputation]: 3.363e-05, [1] [Cycle 1]: 2.603e-05, [1] [cse]: 1.568e-05 [environ_conv]: 1.055e-05 [swap_dp_allreduce_reducescatter]: 9.27001e-06 [bias_add_comm_swap]: 5.79999e-06 [label_micro_interleaved_index]: 8.54e-06 [label_fine_grained_interleaved_index]: 5.49e-06 [merge_cast_opt]: 3.71999e-06 [slice_recompute_activation]: 4.81002e-06 [micro_interleaved_order_control]: 5.54998e-06 [assign_add_opt]: 4.18999e-06 [ForceFp32Comm]: 3.44001e-06 [remove_cast_before_assign_add]: 3.51001e-06 [full_micro_interleaved_order_control]: 4.86002e-06 [reorder_send_recv_between_fp_bp]: 5.25001e-06 [comm_op_add_attrs]: 3.66999e-06 [add_comm_op_reuse_tag]: 3.35003e-06 [interleave_split_concat_branches]: 3.60003e-06 [interleave_parallel_branches]: 3.75e-06 [overlap_opt_shard_in_pipeline]: 3.55998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.29002e-06 [control_data_broadcast_order]: 2.058e-05 [grouped_pairwise_exchange_alltoall]: 4.2e-06 [offloading_packed_experts]: 8.03999e-06 [overlap_recompute_and_grad_model_parallel]: 8.47e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.68e-06 [overlap_recompute_allgather_and_fa_grad]: 4.17998e-06 [overlap_recompute_comm]: 5.37999e-06 [overlap_grad_ring_attention]: 7.7e-06 [overlap_grad_flash_sp]: 2.706e-05 [begin_end_overlap_inline]: 2.98e-06 [split_matmul_comm_elemetwise]: 4.57998e-06 [split_layernorm_comm]: 4.02002e-06 [handle_group_info]: 3.47997e-06 [symbol_engine_optimizer]: 0.0001179, [1] [Cycle 1]: 0.00010928, [6] [build]: 4.38999e-06 [elim_shapecalc]: 1.259e-05 [elim_not_effective]: 1.833e-05 [opt_reshape]: 9.57999e-06 [fold_const_symbol]: 1.372e-05 [renormalize]: 4.50003e-07 [detach_backward]: 5.77001e-06 [pipeline_parallel_scheduler]: 2.03997e-06 [auto_monad_reorder]: 2.631e-05 [get_jit_bprop_graph]: 2.06998e-06 [rewriter_after_jit_bprop_graph]: 6.94999e-06 [opt_after_jit_grad]: 0.00075142 [validate]: 5.044e-05 Sums bootstrap : 0.000444s : 3.25% type_inference : 0.006872s : 50.26% event_method : 0.000023s : 0.17% auto_monad : 0.000067s : 0.49% graph_reusing : 0.000006s : 0.04% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000038s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.22% optimize.rewriter_before_opt_a : 0.000098s : 0.71% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000049s : 0.36% optimize.opt_a.loop_unroll : 0.000040s : 0.29% optimize.opt_a.a_1 : 0.000970s : 7.09% optimize.opt_a.with_stream_mark : 0.000040s : 0.29% optimize.opt_a.recompute_prepare : 0.000019s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000005s : 0.03% optimize.opt_a.a_2 : 0.000251s : 1.84% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000017s : 0.12% optimize.opt_a.merge_send_recv : 0.000021s : 0.15% optimize.opt_a.auto_parallel : 0.000022s : 0.16% optimize.opt_a.parallel : 0.000031s : 0.22% optimize.opt_a.flash_sp : 0.000018s : 0.13% optimize.opt_a.merge_comm : 0.000009s : 0.07% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.16% optimize.opt_a.virtual_dataset : 0.000016s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.11% optimize.opt_a.virtual_output : 0.000015s : 0.11% optimize.opt_a.merge_forward : 0.000010s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000032s : 0.23% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000063s : 0.46% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000033s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000027s : 0.20% optimize.opt_a.a_after_grad : 0.000025s : 0.18% optimize.opt_a.renormalize : 0.000929s : 6.79% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.22% optimize.opt_a.cse : 0.000050s : 0.36% optimize.opt_a.a_3 : 0.000135s : 0.98% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.03% optimize.rewriter_after_opt_a : 0.000048s : 0.35% optimize.convert_after_rewriter : 0.000012s : 0.08% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000718s : 5.25% optimize.opt_b.b_1 : 0.000203s : 1.49% optimize.opt_b.b_2 : 0.000010s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000028s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.16% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000036s : 0.26% optimize.loop_unroll : 0.000516s : 3.77% optimize.opt_after_cconv.c_1 : 0.000042s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000028s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000022s : 0.16% optimize.tuple_transform.d_1 : 0.000060s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000066s : 0.48% optimize.cse_after_recomputation.cse : 0.000016s : 0.11% optimize.environ_conv : 0.000011s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.04% optimize.label_micro_interleaved_index : 0.000009s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000006s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000021s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.06% optimize.overlap_grad_flash_sp : 0.000027s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.03% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000026s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000007s : 0.05% opt_after_jit_grad : 0.000751s : 5.50% validate : 0.000050s : 0.37% Time group info: ------[substitution.] 0.000238 38 11.83% : 0.000028s : 3: substitution.cast_eliminate 1.13% : 0.000003s : 3: substitution.elim_not_effective 0.75% : 0.000002s : 3: substitution.fold_const_symbol 3.15% : 0.000008s : 5: substitution.graph_param_transform 67.76% : 0.000161s : 4: substitution.inline 2.30% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.53% : 0.000008s : 6: substitution.remove_not_recompute_node 2.32% : 0.000006s : 4: substitution.replace_old_param 7.22% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006813 2 88.26% : 0.006013s : 1: type_inference.infer 11.74% : 0.000800s : 1: type_inference.specialize ------[replace.] 0.000068 8 58.36% : 0.000040s : 4: replace.inline 41.64% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000174 8 91.17% : 0.000159s : 4: match.inline 8.83% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000279 1596 0.87% : 0.000002s : 17: predicate.accumulaten_eliminater 0.93% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.74% : 0.000002s : 10: predicate.addn_check_dump 0.97% : 0.000003s : 17: predicate.addn_zero_filter 0.88% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.17% : 0.000006s : 27: predicate.arithmetic_simplify 1.17% : 0.000003s : 17: predicate.cast_eliminate 0.57% : 0.000002s : 10: predicate.check_bprop_eliminate 0.56% : 0.000002s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.71% : 0.000002s : 10: predicate.depend_value_elim 0.91% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.03% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.81% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 5: predicate.elim_not_effective 0.37% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.06% : 0.000003s : 22: predicate.environ_get_depend_swap 1.63% : 0.000005s : 32: predicate.environ_get_eliminate 1.08% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.35% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.20% : 0.000006s : 25: predicate.float_depend_g_call 0.48% : 0.000001s : 10: predicate.float_environ_get_switch 0.79% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 5: predicate.fold_const_symbol 0.54% : 0.000001s : 10: predicate.get_grad_eliminate 0.25% : 0.000001s : 5: predicate.graph_param_transform 0.58% : 0.000002s : 10: predicate.incorporate_call 0.45% : 0.000001s : 10: predicate.incorporate_call_switch 5.93% : 0.000017s : 72: predicate.inline 0.79% : 0.000002s : 10: predicate.inline_without_move 2.03% : 0.000006s : 10: predicate.j_node_and_user_rematch 0.92% : 0.000003s : 10: predicate.less_batch_normalization 1.96% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.69% : 0.000008s : 48: predicate.load_eliminater 1.16% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.98% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.60% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.65% : 0.000002s : 10: predicate.merge_addn 0.68% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.56% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.91% : 0.000003s : 17: predicate.minmaximum_grad 0.83% : 0.000002s : 5: predicate.mutable_eliminate 0.40% : 0.000001s : 5: predicate.opt_reshape 0.34% : 0.000001s : 5: predicate.parallel_virtual_node 1.97% : 0.000006s : 25: predicate.partial_defer_inline 1.58% : 0.000004s : 26: predicate.partial_eliminate 0.94% : 0.000003s : 17: predicate.print_const_string_wrapper 0.67% : 0.000002s : 10: predicate.reduce_all_const_elim 1.13% : 0.000003s : 17: predicate.reduce_eliminate 2.45% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 10: predicate.remove_not_recompute_node 1.25% : 0.000004s : 31: predicate.replace_applicator 0.47% : 0.000001s : 10: predicate.replace_old_param 0.25% : 0.000001s : 5: predicate.reset_defer_inline 0.94% : 0.000003s : 17: predicate.reshape_eliminate 0.56% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 5: predicate.row_tensor_eliminate 0.73% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.77% : 0.000002s : 10: predicate.shard_identity_eliminate 0.80% : 0.000002s : 10: predicate.special_op_eliminate 0.64% : 0.000002s : 10: predicate.specialize_transform 0.77% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.57% : 0.000004s : 25: predicate.switch_defer_inline 2.04% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.41% : 0.000012s : 76: predicate.switch_simplify 1.03% : 0.000003s : 17: predicate.tile_eliminate 0.94% : 0.000003s : 17: predicate.transpose_eliminate 1.62% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.67% : 0.000005s : 27: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.54% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.33% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.85% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.38% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.04% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 5: predicate.value_based_eliminate 0.57% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.56% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000626 11 51.79% : 0.000324s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.21% : 0.000302s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.032926 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.10% : 0.003984s : 1: add_attr 12.04% : 0.003963s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000070s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.24% : 0.000078s : 1: auto_monad 0.11% : 0.000035s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.47% : 0.000483s : 1: bootstrap 0.12% : 0.000039s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000024s : 1: control_data_broadcast_order 0.04% : 0.000014s : 1: convert_after_rewriter 0.11% : 0.000037s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000026s : 1: detach_backward 0.04% : 0.000014s : 1: environ_conv 0.10% : 0.000033s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000011s : 1: label_micro_interleaved_index 1.59% : 0.000523s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 2.20% : 0.000725s : 1: mutable_eliminate 0.03% : 0.000011s : 1: offloading_packed_experts 0.06% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000021s : 1: opt.transform.mutable_eliminate 4.66% : 0.001535s : 78: opt.transform.opt_a 0.12% : 0.000040s : 1: opt.transform.opt_after_cconv 0.11% : 0.000037s : 1: opt.transform.opt_after_jit_grad 0.41% : 0.000136s : 28: opt.transform.opt_b 0.20% : 0.000067s : 2: opt.transform.opt_trans_graph 0.15% : 0.000050s : 4: opt.transform.symbol_engine_opt 11.82% : 0.003891s : 1: opt_a 0.51% : 0.000167s : 1: opt_after_cconv 2.32% : 0.000765s : 1: opt_after_jit_grad 1.00% : 0.000328s : 1: opt_b 21.42% : 0.007054s : 1: optimize 0.08% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.09% : 0.000030s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000011s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000009s : 1: partial_unused_args_eliminate 0.03% : 0.000011s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000046s : 1: pre_auto_parallel 0.11% : 0.000035s : 1: py_interpret_to_execute 0.06% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000025s : 1: remove_dup_value 1.59% : 0.000523s : 1: renormalize.infer 1.20% : 0.000396s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000051s : 1: rewriter_after_opt_a 0.31% : 0.000101s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.37% : 0.000121s : 1: symbol_engine_optimizer 0.35% : 0.000116s : 1: tuple_transform 21.03% : 0.006923s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:39.780.623 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0167062, [21] [bootstrap]: 0.00039111 [type_inference]: 0.00624806 [event_method]: 2.065e-05 [auto_monad]: 6.799e-05 [graph_reusing]: 6.16e-06 [inline]: 3.23e-06 [add_attr]: 0.00360655, [1] [add_attr_with_inline]: 0.00359537, [1] [Cycle 1]: 6.658e-05, [2] [tag_attr]: 2.171e-05 [meta_addattr_fg_expand]: 6.34999e-06 [parallel-infer-symbol]: 3.81001e-06 [pre_auto_parallel]: 3.659e-05 [insert-virtual-dataset]: 2.95998e-06 [parallel-infer-symbol-second]: 9.19972e-07 [dataset_repeat_opt]: 2.18002e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.00556091, [53] [py_interpret_to_execute]: 2.575e-05 [rewriter_before_opt_a]: 8.845e-05 [opt_a]: 0.0031309, [2] [Cycle 1]: 0.00228487, [45] [expand_dump_flag]: 2.73e-06 [switch_simplify]: 4.427e-05 [loop_unroll]: 3.215e-05 [a_1]: 0.000741 [with_stream_mark]: 1.496e-05 [recompute_prepare]: 1.121e-05 [updatestate_depend_eliminate]: 4.45e-06 [updatestate_assign_eliminate]: 3.72998e-06 [updatestate_loads_eliminate]: 3.88001e-06 [parameter_eliminate]: 2.07999e-06 [a_2]: 0.00010296 [accelerated_algorithm]: 8.20999e-06 [shard]: 1.98002e-06 [meta_shard_fg_expand]: 2.07999e-06 [shard_inline]: 7.66001e-06 [merge_send_recv]: 9.29998e-06 [auto_parallel]: 7.87998e-06 [parallel]: 1.886e-05 [flash_sp]: 1.018e-05 [merge_comm]: 4.78001e-06 [allreduce_fusion]: 4.24002e-06 [matmul_add_comm_reduction]: 1.075e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 1.008e-05 [virtual_dataset]: 8.03999e-06 [get_grad_eliminate_]: 7.61001e-06 [virtual_output]: 7.78999e-06 [merge_forward]: 4.65001e-06 [cell_reuse_recompute_pass]: 1.15001e-06 [offload_activation]: 1.109e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.479e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.323e-05 [set_forward_comm_id_for_comm_node_pass]: 4.32e-06 [meta_fg_expand]: 3.64002e-06 [flash_sp_send_recv_attached]: 2.63e-06 [receive_attached]: 2.04999e-06 [after_resolve]: 1.316e-05 [a_after_grad]: 1.221e-05 [renormalize]: 0.00075358 [add_forward_monad_depend]: 4.53999e-06 [auto_monad_grad]: 2.62001e-06 [auto_monad_eliminator]: 1.6e-05 [cse]: 3.742e-05 [a_3]: 5.697e-05 [Cycle 2]: 0.00083475, [45] [expand_dump_flag]: 1.95001e-06 [switch_simplify]: 1.009e-05 [loop_unroll]: 8.31002e-06 [a_1]: 0.00019524 [with_stream_mark]: 1.31e-05 [recompute_prepare]: 9.20001e-06 [updatestate_depend_eliminate]: 4.15e-06 [updatestate_assign_eliminate]: 3.02002e-06 [updatestate_loads_eliminate]: 2.92002e-06 [parameter_eliminate]: 1.00999e-06 [a_2]: 9.75e-05 [accelerated_algorithm]: 8.32e-06 [shard]: 1.43002e-06 [meta_shard_fg_expand]: 1.79e-06 [shard_inline]: 8.30999e-06 [merge_send_recv]: 7.23999e-06 [auto_parallel]: 8.24002e-06 [parallel]: 7.3e-06 [flash_sp]: 4.3e-06 [merge_comm]: 4.43001e-06 [allreduce_fusion]: 4.11001e-06 [matmul_add_comm_reduction]: 8.70001e-06 [allreduce_slice_to_reducescatter]: 7.59988e-07 [virtual_shard_identity]: 9.24e-06 [virtual_dataset]: 7.78001e-06 [get_grad_eliminate_]: 7.41999e-06 [virtual_output]: 7.02002e-06 [merge_forward]: 3.08e-06 [cell_reuse_recompute_pass]: 2.43998e-06 [offload_activation]: 9.69999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.604e-05 [merge_recompute_call_nodes]: 1.14e-06 [before_grad]: 1.295e-05 [set_forward_comm_id_for_comm_node_pass]: 4.87e-06 [meta_fg_expand]: 3.46999e-06 [flash_sp_send_recv_attached]: 1.15999e-06 [receive_attached]: 1.81003e-06 [after_resolve]: 1.369e-05 [a_after_grad]: 1.144e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.42e-06 [auto_monad_grad]: 1.12e-06 [auto_monad_eliminator]: 9.20001e-06 [cse]: 2.094e-05 [a_3]: 5.012e-05 [py_interpret_to_execute_after_opt_a]: 1.453e-05 [slice_cell_reuse_recomputed_activation]: 1.97001e-06 [rewriter_after_opt_a]: 4.228e-05 [convert_after_rewriter]: 7.54002e-06 [order_py_execute_after_rewriter]: 5.86998e-06 [mutable_eliminate]: 0.0006966 [opt_b]: 0.00025392, [1] [Cycle 1]: 0.00024609, [7] [b_1]: 0.0001577 [b_2]: 1.074e-05 [updatestate_depend_eliminate]: 7.05e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 3.11999e-06 [renormalize]: 5.39992e-07 [cse]: 2.665e-05 [optimize_parallel_all_gather_comm]: 1.874e-05 [overlap_param_gather]: 1.93002e-06 [cconv]: 3.004e-05 [loop_unroll]: 0.00049921 [opt_after_cconv]: 0.00012164, [1] [Cycle 1]: 0.00011456, [7] [c_1]: 3.871e-05 [parameter_eliminate]: 3.16001e-06 [updatestate_depend_eliminate]: 6.41e-06 [updatestate_assign_eliminate]: 3.06001e-06 [updatestate_loads_eliminate]: 2.99999e-06 [cse]: 2.538e-05 [renormalize]: 2.19996e-07 [remove_dup_value]: 1.673e-05 [tuple_transform]: 8.873e-05, [1] [Cycle 1]: 8.393e-05, [4] [d_1]: 5.471e-05 [none_parameter_eliminate]: 1.96e-06 [renormalize]: 2.29978e-07 [switch_simplify]: 8.05999e-06 [partial_unused_args_eliminate]: 1.81e-06 [add_recomputation]: 5.85e-05 [cse_after_recomputation]: 2.565e-05, [1] [Cycle 1]: 2.073e-05, [1] [cse]: 1.528e-05 [environ_conv]: 6.02999e-06 [swap_dp_allreduce_reducescatter]: 6.59001e-06 [bias_add_comm_swap]: 2.41998e-06 [label_micro_interleaved_index]: 4.98001e-06 [label_fine_grained_interleaved_index]: 2.68e-06 [merge_cast_opt]: 1.54998e-06 [slice_recompute_activation]: 2.14e-06 [micro_interleaved_order_control]: 2.78e-06 [assign_add_opt]: 1.19998e-06 [ForceFp32Comm]: 1.42999e-06 [remove_cast_before_assign_add]: 1.44e-06 [full_micro_interleaved_order_control]: 1.99e-06 [reorder_send_recv_between_fp_bp]: 2.66e-06 [comm_op_add_attrs]: 1.10001e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.24998e-06 [interleave_parallel_branches]: 1.41998e-06 [overlap_opt_shard_in_pipeline]: 1.19998e-06 [overlap_opt_shard_grad_in_pipeline]: 1.79e-06 [control_data_broadcast_order]: 1.592e-05 [grouped_pairwise_exchange_alltoall]: 2.43e-06 [offloading_packed_experts]: 4.61002e-06 [overlap_recompute_and_grad_model_parallel]: 5.40999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.27e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.22999e-06 [overlap_grad_ring_attention]: 4.75999e-06 [overlap_grad_flash_sp]: 2.367e-05 [begin_end_overlap_inline]: 6.69999e-07 [split_matmul_comm_elemetwise]: 2.34001e-06 [split_layernorm_comm]: 1.83002e-06 [handle_group_info]: 1.39998e-06 [symbol_engine_optimizer]: 8.763e-05, [1] [Cycle 1]: 8.227e-05, [6] [build]: 3.35998e-06 [elim_shapecalc]: 1.143e-05 [elim_not_effective]: 1.681e-05 [opt_reshape]: 9.22001e-06 [fold_const_symbol]: 1.296e-05 [renormalize]: 2.09984e-07 [detach_backward]: 1.96e-06 [pipeline_parallel_scheduler]: 1.49e-06 [auto_monad_reorder]: 2.046e-05 [get_jit_bprop_graph]: 2.11e-06 [rewriter_after_jit_bprop_graph]: 3.93001e-06 [opt_after_jit_grad]: 0.00052787 [validate]: 4.266e-05 Sums bootstrap : 0.000391s : 3.23% type_inference : 0.006248s : 51.61% event_method : 0.000021s : 0.17% auto_monad : 0.000068s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000037s : 0.30% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000026s : 0.21% optimize.rewriter_before_opt_a : 0.000088s : 0.73% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000054s : 0.45% optimize.opt_a.loop_unroll : 0.000040s : 0.33% optimize.opt_a.a_1 : 0.000936s : 7.73% optimize.opt_a.with_stream_mark : 0.000028s : 0.23% optimize.opt_a.recompute_prepare : 0.000020s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000200s : 1.66% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.13% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000016s : 0.13% optimize.opt_a.parallel : 0.000026s : 0.22% optimize.opt_a.flash_sp : 0.000014s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.16% optimize.opt_a.virtual_dataset : 0.000016s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.12% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.22% optimize.opt_a.a_after_grad : 0.000024s : 0.20% optimize.opt_a.renormalize : 0.000754s : 6.22% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.05% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.21% optimize.opt_a.cse : 0.000058s : 0.48% optimize.opt_a.a_3 : 0.000107s : 0.88% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000042s : 0.35% optimize.convert_after_rewriter : 0.000008s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000697s : 5.75% optimize.opt_b.b_1 : 0.000158s : 1.30% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000030s : 0.25% optimize.loop_unroll : 0.000499s : 4.12% optimize.opt_after_cconv.c_1 : 0.000039s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000025s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.14% optimize.tuple_transform.d_1 : 0.000055s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000059s : 0.48% optimize.cse_after_recomputation.cse : 0.000015s : 0.13% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000024s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000020s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000528s : 4.36% validate : 0.000043s : 0.35% Time group info: ------[substitution.] 0.000209 38 11.99% : 0.000025s : 3: substitution.cast_eliminate 1.34% : 0.000003s : 3: substitution.elim_not_effective 0.79% : 0.000002s : 3: substitution.fold_const_symbol 3.43% : 0.000007s : 5: substitution.graph_param_transform 66.63% : 0.000139s : 4: substitution.inline 2.42% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.11% : 0.000006s : 6: substitution.remove_not_recompute_node 2.47% : 0.000005s : 4: substitution.replace_old_param 7.82% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006187 2 88.53% : 0.005477s : 1: type_inference.infer 11.47% : 0.000710s : 1: type_inference.specialize ------[replace.] 0.000063 8 58.67% : 0.000037s : 4: replace.inline 41.33% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000151 8 90.48% : 0.000137s : 4: match.inline 9.52% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000268 1596 0.93% : 0.000002s : 17: predicate.accumulaten_eliminater 0.63% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.61% : 0.000002s : 10: predicate.addn_check_dump 0.97% : 0.000003s : 17: predicate.addn_zero_filter 0.84% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.01% : 0.000005s : 27: predicate.arithmetic_simplify 1.10% : 0.000003s : 17: predicate.cast_eliminate 0.63% : 0.000002s : 10: predicate.check_bprop_eliminate 0.56% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.60% : 0.000002s : 10: predicate.depend_value_elim 1.00% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.08% : 0.000003s : 17: predicate.dict_get_item_eliminator 1.01% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 5: predicate.elim_not_effective 0.39% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.36% : 0.000004s : 22: predicate.environ_get_depend_swap 1.77% : 0.000005s : 32: predicate.environ_get_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.36% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.17% : 0.000006s : 25: predicate.float_depend_g_call 0.58% : 0.000002s : 10: predicate.float_environ_get_switch 0.82% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.82% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000002s : 10: predicate.incorporate_call 0.49% : 0.000001s : 10: predicate.incorporate_call_switch 6.06% : 0.000016s : 72: predicate.inline 0.74% : 0.000002s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.86% : 0.000002s : 10: predicate.less_batch_normalization 1.82% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.65% : 0.000007s : 48: predicate.load_eliminater 0.85% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.09% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.84% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.63% : 0.000002s : 10: predicate.merge_addn 0.58% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.95% : 0.000003s : 17: predicate.minmaximum_grad 0.86% : 0.000002s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.34% : 0.000001s : 5: predicate.parallel_virtual_node 1.58% : 0.000004s : 25: predicate.partial_defer_inline 1.62% : 0.000004s : 26: predicate.partial_eliminate 0.95% : 0.000003s : 17: predicate.print_const_string_wrapper 0.79% : 0.000002s : 10: predicate.reduce_all_const_elim 1.19% : 0.000003s : 17: predicate.reduce_eliminate 2.50% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 10: predicate.remove_not_recompute_node 1.25% : 0.000003s : 31: predicate.replace_applicator 0.51% : 0.000001s : 10: predicate.replace_old_param 0.32% : 0.000001s : 5: predicate.reset_defer_inline 0.96% : 0.000003s : 17: predicate.reshape_eliminate 0.76% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.54% : 0.000001s : 5: predicate.row_tensor_eliminate 0.88% : 0.000002s : 10: predicate.same_eliminate 0.41% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 10: predicate.shard_identity_eliminate 0.67% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 1.00% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.62% : 0.000004s : 25: predicate.switch_defer_inline 2.06% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.73% : 0.000013s : 76: predicate.switch_simplify 0.91% : 0.000002s : 17: predicate.tile_eliminate 0.98% : 0.000003s : 17: predicate.transpose_eliminate 1.57% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.49% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.63% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.24% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.71% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.51% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.16% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 5: predicate.value_based_eliminate 0.70% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.62% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000517 11 52.65% : 0.000272s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.35% : 0.000245s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028302 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.76% : 0.003612s : 1: add_attr 12.72% : 0.003600s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000063s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.26% : 0.000074s : 1: auto_monad 0.09% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.47% : 0.000416s : 1: bootstrap 0.12% : 0.000033s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.10% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.10% : 0.000027s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.80% : 0.000508s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.50% : 0.000707s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.06% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 5.19% : 0.001468s : 78: opt.transform.opt_a 0.13% : 0.000037s : 1: opt.transform.opt_after_cconv 0.10% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.48% : 0.000136s : 28: opt.transform.opt_b 0.21% : 0.000060s : 2: opt.transform.opt_trans_graph 0.16% : 0.000046s : 4: opt.transform.symbol_engine_opt 11.08% : 0.003135s : 1: opt_a 0.44% : 0.000125s : 1: opt_after_cconv 1.90% : 0.000536s : 1: opt_after_jit_grad 0.91% : 0.000257s : 1: opt_b 19.67% : 0.005566s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000041s : 1: pre_auto_parallel 0.11% : 0.000030s : 1: py_interpret_to_execute 0.06% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000005s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.45% : 0.000410s : 1: renormalize.infer 1.19% : 0.000336s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000046s : 1: rewriter_after_opt_a 0.33% : 0.000093s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000091s : 1: symbol_engine_optimizer 0.32% : 0.000092s : 1: tuple_transform 22.15% : 0.006270s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:40.266.82 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:40.269.53 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0163024, [21] [bootstrap]: 0.00037037 [type_inference]: 0.00547798 [event_method]: 1.798e-05 [auto_monad]: 5.005e-05 [graph_reusing]: 4.63001e-06 [inline]: 1.97999e-06 [add_attr]: 0.00292764, [1] [add_attr_with_inline]: 0.0029201, [1] [Cycle 1]: 5.989e-05, [2] [tag_attr]: 1.547e-05 [meta_addattr_fg_expand]: 5.02e-06 [parallel-infer-symbol]: 1.94e-06 [pre_auto_parallel]: 2.687e-05 [insert-virtual-dataset]: 1.31002e-06 [parallel-infer-symbol-second]: 7.49977e-07 [dataset_repeat_opt]: 1.30999e-06 [pipeline_split]: 1.39e-06 [optimize]: 0.00580203, [53] [py_interpret_to_execute]: 2.558e-05 [rewriter_before_opt_a]: 8.166e-05 [opt_a]: 0.003188, [2] [Cycle 1]: 0.00225017, [45] [expand_dump_flag]: 2.10002e-06 [switch_simplify]: 3.749e-05 [loop_unroll]: 3.101e-05 [a_1]: 0.0007173 [with_stream_mark]: 1.133e-05 [recompute_prepare]: 1.007e-05 [updatestate_depend_eliminate]: 4.1e-06 [updatestate_assign_eliminate]: 3.3e-06 [updatestate_loads_eliminate]: 3.11001e-06 [parameter_eliminate]: 1.34e-06 [a_2]: 0.00012697 [accelerated_algorithm]: 8.99998e-06 [shard]: 1.76e-06 [meta_shard_fg_expand]: 2.06e-06 [shard_inline]: 7.82998e-06 [merge_send_recv]: 9.53002e-06 [auto_parallel]: 7.54002e-06 [parallel]: 1.773e-05 [flash_sp]: 7.92998e-06 [merge_comm]: 4.80001e-06 [allreduce_fusion]: 4.23999e-06 [matmul_add_comm_reduction]: 9.32001e-06 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 9.32001e-06 [virtual_dataset]: 7.8e-06 [get_grad_eliminate_]: 7.62998e-06 [virtual_output]: 7.93999e-06 [merge_forward]: 4.4e-06 [cell_reuse_recompute_pass]: 1.64998e-06 [offload_activation]: 1.054e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.722e-05 [merge_recompute_call_nodes]: 1.62999e-06 [before_grad]: 1.232e-05 [set_forward_comm_id_for_comm_node_pass]: 4.4e-06 [meta_fg_expand]: 3.28e-06 [flash_sp_send_recv_attached]: 2.60002e-06 [receive_attached]: 2.64999e-06 [after_resolve]: 1.224e-05 [a_after_grad]: 1.134e-05 [renormalize]: 0.00061084 [add_forward_monad_depend]: 4.53001e-06 [auto_monad_grad]: 2.14e-06 [auto_monad_eliminator]: 1.514e-05 [cse]: 2.278e-05 [a_3]: 6.99e-05 [Cycle 2]: 0.00092482, [45] [expand_dump_flag]: 1.09998e-06 [switch_simplify]: 8.90999e-06 [loop_unroll]: 7.56001e-06 [a_1]: 0.00017405 [with_stream_mark]: 1.108e-05 [recompute_prepare]: 7.95e-06 [updatestate_depend_eliminate]: 3.59002e-06 [updatestate_assign_eliminate]: 2.93e-06 [updatestate_loads_eliminate]: 3.06001e-06 [parameter_eliminate]: 1.11997e-06 [a_2]: 0.0001222 [accelerated_algorithm]: 7.56999e-06 [shard]: 1.12e-06 [meta_shard_fg_expand]: 1.60999e-06 [shard_inline]: 7.41001e-06 [merge_send_recv]: 5.72001e-06 [auto_parallel]: 6.95998e-06 [parallel]: 4.27998e-06 [flash_sp]: 3.61001e-06 [merge_comm]: 4.12e-06 [allreduce_fusion]: 3.83001e-06 [matmul_add_comm_reduction]: 6.30002e-06 [allreduce_slice_to_reducescatter]: 4.50003e-07 [virtual_shard_identity]: 8.23001e-06 [virtual_dataset]: 7.16999e-06 [get_grad_eliminate_]: 9.32999e-06 [virtual_output]: 7.19001e-06 [merge_forward]: 3.63e-06 [cell_reuse_recompute_pass]: 1.62001e-06 [offload_activation]: 7.88999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.635e-05 [merge_recompute_call_nodes]: 9.00007e-07 [before_grad]: 1.165e-05 [set_forward_comm_id_for_comm_node_pass]: 4.32e-06 [meta_fg_expand]: 2.65997e-06 [flash_sp_send_recv_attached]: 1.25001e-06 [receive_attached]: 1.39e-06 [after_resolve]: 1.157e-05 [a_after_grad]: 1.099e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.30001e-06 [auto_monad_grad]: 1.02998e-06 [auto_monad_eliminator]: 8.60001e-06 [cse]: 1.669e-05 [a_3]: 5.817e-05 [py_interpret_to_execute_after_opt_a]: 1.252e-05 [slice_cell_reuse_recomputed_activation]: 5.48002e-06 [rewriter_after_opt_a]: 4.091e-05 [convert_after_rewriter]: 1.072e-05 [order_py_execute_after_rewriter]: 8.68001e-06 [mutable_eliminate]: 0.0004914 [opt_b]: 0.00030361, [1] [Cycle 1]: 0.0002943, [7] [b_1]: 0.00019478 [b_2]: 9.52999e-06 [updatestate_depend_eliminate]: 5.87001e-06 [updatestate_assign_eliminate]: 3.3e-06 [updatestate_loads_eliminate]: 3.01999e-06 [renormalize]: 5.19998e-07 [cse]: 2.225e-05 [optimize_parallel_all_gather_comm]: 2.068e-05 [overlap_param_gather]: 5.07999e-06 [cconv]: 2.729e-05 [loop_unroll]: 0.00046848 [opt_after_cconv]: 0.00013847, [1] [Cycle 1]: 0.00012999, [7] [c_1]: 3.722e-05 [parameter_eliminate]: 2.88e-06 [updatestate_depend_eliminate]: 5.85002e-06 [updatestate_assign_eliminate]: 3.45003e-06 [updatestate_loads_eliminate]: 2.88998e-06 [cse]: 2.131e-05 [renormalize]: 4.50003e-07 [remove_dup_value]: 1.767e-05 [tuple_transform]: 0.00010077, [1] [Cycle 1]: 9.346e-05, [4] [d_1]: 5.2e-05 [none_parameter_eliminate]: 1.50999e-06 [renormalize]: 2.60014e-07 [switch_simplify]: 8.33999e-06 [partial_unused_args_eliminate]: 4.47e-06 [add_recomputation]: 5.505e-05 [cse_after_recomputation]: 3.017e-05, [1] [Cycle 1]: 2.32e-05, [1] [cse]: 1.393e-05 [environ_conv]: 8.89e-06 [swap_dp_allreduce_reducescatter]: 0.00027233 [bias_add_comm_swap]: 5.54e-06 [label_micro_interleaved_index]: 6.94001e-06 [label_fine_grained_interleaved_index]: 4.99e-06 [merge_cast_opt]: 3.63e-06 [slice_recompute_activation]: 4.33999e-06 [micro_interleaved_order_control]: 4.77e-06 [assign_add_opt]: 3.58e-06 [ForceFp32Comm]: 3.72998e-06 [remove_cast_before_assign_add]: 3.33e-06 [full_micro_interleaved_order_control]: 5.05001e-06 [reorder_send_recv_between_fp_bp]: 5.36002e-06 [comm_op_add_attrs]: 3.75e-06 [add_comm_op_reuse_tag]: 3.48e-06 [interleave_split_concat_branches]: 3.53999e-06 [interleave_parallel_branches]: 3.69002e-06 [overlap_opt_shard_in_pipeline]: 3.83001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.28999e-06 [control_data_broadcast_order]: 1.892e-05 [grouped_pairwise_exchange_alltoall]: 4.52e-06 [offloading_packed_experts]: 7.35e-06 [overlap_recompute_and_grad_model_parallel]: 7.81001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.56001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.6e-06 [overlap_recompute_comm]: 5.00999e-06 [overlap_grad_ring_attention]: 6.94999e-06 [overlap_grad_flash_sp]: 2.379e-05 [begin_end_overlap_inline]: 3.21001e-06 [split_matmul_comm_elemetwise]: 4.64002e-06 [split_layernorm_comm]: 4.18001e-06 [handle_group_info]: 3.11999e-06 [symbol_engine_optimizer]: 0.0001054, [1] [Cycle 1]: 9.852e-05, [6] [build]: 3.33e-06 [elim_shapecalc]: 1.157e-05 [elim_not_effective]: 1.572e-05 [opt_reshape]: 8.40999e-06 [fold_const_symbol]: 1.283e-05 [renormalize]: 3.30008e-07 [detach_backward]: 3.3e-06 [pipeline_parallel_scheduler]: 1.77001e-06 [auto_monad_reorder]: 2.264e-05 [get_jit_bprop_graph]: 1.44998e-06 [rewriter_after_jit_bprop_graph]: 3.11999e-06 [opt_after_jit_grad]: 0.00050294 [validate]: 4.18e-05 Sums bootstrap : 0.000370s : 3.31% type_inference : 0.005478s : 48.98% event_method : 0.000018s : 0.16% auto_monad : 0.000050s : 0.45% graph_reusing : 0.000005s : 0.04% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000005s : 0.04% parallel-infer-symbol : 0.000002s : 0.02% pre_auto_parallel : 0.000027s : 0.24% insert-virtual-dataset : 0.000001s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000001s : 0.01% pipeline_split : 0.000001s : 0.01% optimize.py_interpret_to_execute : 0.000026s : 0.23% optimize.rewriter_before_opt_a : 0.000082s : 0.73% optimize.opt_a.expand_dump_flag : 0.000003s : 0.03% optimize.opt_a.switch_simplify : 0.000046s : 0.41% optimize.opt_a.loop_unroll : 0.000039s : 0.34% optimize.opt_a.a_1 : 0.000891s : 7.97% optimize.opt_a.with_stream_mark : 0.000022s : 0.20% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000002s : 0.02% optimize.opt_a.a_2 : 0.000249s : 2.23% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.14% optimize.opt_a.merge_send_recv : 0.000015s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.13% optimize.opt_a.parallel : 0.000022s : 0.20% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.16% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.15% optimize.opt_a.virtual_output : 0.000015s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000024s : 0.21% optimize.opt_a.a_after_grad : 0.000022s : 0.20% optimize.opt_a.renormalize : 0.000611s : 5.46% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.05% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.21% optimize.opt_a.cse : 0.000039s : 0.35% optimize.opt_a.a_3 : 0.000128s : 1.15% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000041s : 0.37% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000491s : 4.39% optimize.opt_b.b_1 : 0.000195s : 1.74% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000027s : 0.24% optimize.loop_unroll : 0.000468s : 4.19% optimize.opt_after_cconv.c_1 : 0.000037s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000021s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.16% optimize.tuple_transform.d_1 : 0.000052s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000055s : 0.49% optimize.cse_after_recomputation.cse : 0.000014s : 0.12% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000272s : 2.43% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000019s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000024s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.20% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.03% opt_after_jit_grad : 0.000503s : 4.50% validate : 0.000042s : 0.37% Time group info: ------[substitution.] 0.000171 38 10.40% : 0.000018s : 3: substitution.cast_eliminate 1.41% : 0.000002s : 3: substitution.elim_not_effective 1.06% : 0.000002s : 3: substitution.fold_const_symbol 3.87% : 0.000007s : 5: substitution.graph_param_transform 66.37% : 0.000114s : 4: substitution.inline 2.40% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.77% : 0.000006s : 6: substitution.remove_not_recompute_node 2.54% : 0.000004s : 4: substitution.replace_old_param 8.18% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005435 2 87.36% : 0.004749s : 1: type_inference.infer 12.64% : 0.000687s : 1: type_inference.specialize ------[replace.] 0.000079 8 41.80% : 0.000033s : 4: replace.inline 58.20% : 0.000046s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000123 8 90.27% : 0.000111s : 4: match.inline 9.73% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000254 1596 0.96% : 0.000002s : 17: predicate.accumulaten_eliminater 0.73% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 10: predicate.addn_check_dump 0.97% : 0.000002s : 17: predicate.addn_zero_filter 0.86% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.09% : 0.000005s : 27: predicate.arithmetic_simplify 1.03% : 0.000003s : 17: predicate.cast_eliminate 0.63% : 0.000002s : 10: predicate.check_bprop_eliminate 0.60% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.65% : 0.000002s : 10: predicate.depend_value_elim 1.02% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.06% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.92% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.97% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 5: predicate.elim_not_effective 0.37% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_depend_swap 1.80% : 0.000005s : 32: predicate.environ_get_eliminate 1.17% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.38% : 0.000003s : 25: predicate.exchange_switch_depend_value 2.20% : 0.000006s : 25: predicate.float_depend_g_call 0.56% : 0.000001s : 10: predicate.float_environ_get_switch 0.86% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.69% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000000s : 5: predicate.graph_param_transform 0.59% : 0.000001s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.01% : 0.000015s : 72: predicate.inline 0.75% : 0.000002s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.90% : 0.000002s : 10: predicate.less_batch_normalization 2.05% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.63% : 0.000007s : 48: predicate.load_eliminater 0.98% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.97% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.83% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.66% : 0.000002s : 10: predicate.merge_addn 0.60% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.91% : 0.000002s : 17: predicate.minmaximum_grad 0.79% : 0.000002s : 5: predicate.mutable_eliminate 0.32% : 0.000001s : 5: predicate.opt_reshape 0.38% : 0.000001s : 5: predicate.parallel_virtual_node 1.71% : 0.000004s : 25: predicate.partial_defer_inline 1.73% : 0.000004s : 26: predicate.partial_eliminate 0.97% : 0.000002s : 17: predicate.print_const_string_wrapper 0.64% : 0.000002s : 10: predicate.reduce_all_const_elim 1.21% : 0.000003s : 17: predicate.reduce_eliminate 2.63% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 10: predicate.remove_not_recompute_node 1.34% : 0.000003s : 31: predicate.replace_applicator 0.49% : 0.000001s : 10: predicate.replace_old_param 0.24% : 0.000001s : 5: predicate.reset_defer_inline 1.01% : 0.000003s : 17: predicate.reshape_eliminate 0.61% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.82% : 0.000002s : 10: predicate.same_eliminate 0.42% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.97% : 0.000002s : 10: predicate.shard_identity_eliminate 0.65% : 0.000002s : 10: predicate.special_op_eliminate 0.74% : 0.000002s : 10: predicate.specialize_transform 0.85% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.66% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.49% : 0.000004s : 25: predicate.switch_defer_inline 2.07% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.76% : 0.000012s : 76: predicate.switch_simplify 1.10% : 0.000003s : 17: predicate.tile_eliminate 0.95% : 0.000002s : 17: predicate.transpose_eliminate 1.54% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.75% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.10% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.55% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.77% : 0.000004s : 31: predicate.tuple_to_list_eliminator_ 2.57% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.33% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 5: predicate.value_based_eliminate 0.65% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.61% : 0.000002s : 10: predicate.virtual_output_eliminate 0.31% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000473 11 53.61% : 0.000254s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.39% : 0.000220s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027253 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.78% : 0.002937s : 1: add_attr 10.73% : 0.002923s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000059s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.22% : 0.000060s : 1: auto_monad 0.11% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.50% : 0.000409s : 1: bootstrap 0.11% : 0.000031s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000022s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000033s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000018s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.10% : 0.000027s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000011s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.74% : 0.000475s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.82% : 0.000497s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 5.16% : 0.001405s : 78: opt.transform.opt_a 0.13% : 0.000036s : 1: opt.transform.opt_after_cconv 0.11% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.49% : 0.000132s : 28: opt.transform.opt_b 0.21% : 0.000058s : 2: opt.transform.opt_trans_graph 0.17% : 0.000045s : 4: opt.transform.symbol_engine_opt 11.71% : 0.003191s : 1: opt_a 0.52% : 0.000142s : 1: opt_after_cconv 1.89% : 0.000514s : 1: opt_after_jit_grad 1.13% : 0.000307s : 1: opt_b 24.27% : 0.006613s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000034s : 1: pre_auto_parallel 0.11% : 0.000029s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.08% : 0.000021s : 1: remove_dup_value 1.15% : 0.000314s : 1: renormalize.infer 1.06% : 0.000290s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000044s : 1: rewriter_after_opt_a 0.31% : 0.000085s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 1.01% : 0.000276s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000108s : 1: symbol_engine_optimizer 0.38% : 0.000104s : 1: tuple_transform 20.21% : 0.005509s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:40.259.085 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.015866, [21] [bootstrap]: 0.00037813 [type_inference]: 0.00603001 [event_method]: 1.78e-05 [auto_monad]: 6.336e-05 [graph_reusing]: 5.40999e-06 [inline]: 2.14e-06 [add_attr]: 0.00327137, [1] [add_attr_with_inline]: 0.0032609, [1] [Cycle 1]: 5.875e-05, [2] [tag_attr]: 2.223e-05 [meta_addattr_fg_expand]: 5.94e-06 [parallel-infer-symbol]: 3.31999e-06 [pre_auto_parallel]: 3.509e-05 [insert-virtual-dataset]: 2.34999e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 1.94e-06 [pipeline_split]: 1.71002e-06 [optimize]: 0.00537908, [53] [py_interpret_to_execute]: 2.592e-05 [rewriter_before_opt_a]: 8.889e-05 [opt_a]: 0.00318004, [2] [Cycle 1]: 0.00235234, [45] [expand_dump_flag]: 3.02002e-06 [switch_simplify]: 4.053e-05 [loop_unroll]: 3.138e-05 [a_1]: 0.0007516 [with_stream_mark]: 1.635e-05 [recompute_prepare]: 1.09e-05 [updatestate_depend_eliminate]: 4.09997e-06 [updatestate_assign_eliminate]: 3.86999e-06 [updatestate_loads_eliminate]: 3.69002e-06 [parameter_eliminate]: 1.77999e-06 [a_2]: 0.00010052 [accelerated_algorithm]: 9.02e-06 [shard]: 1.69998e-06 [meta_shard_fg_expand]: 2.48e-06 [shard_inline]: 7.77998e-06 [merge_send_recv]: 1.024e-05 [auto_parallel]: 7.25e-06 [parallel]: 1.996e-05 [flash_sp]: 9.48002e-06 [merge_comm]: 4.76002e-06 [allreduce_fusion]: 4.30999e-06 [matmul_add_comm_reduction]: 1.003e-05 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 9.97001e-06 [virtual_dataset]: 7.74002e-06 [get_grad_eliminate_]: 7.68001e-06 [virtual_output]: 7.9e-06 [merge_forward]: 4.37998e-06 [cell_reuse_recompute_pass]: 1.89e-06 [offload_activation]: 1.187e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.527e-05 [merge_recompute_call_nodes]: 1.55001e-06 [before_grad]: 1.296e-05 [set_forward_comm_id_for_comm_node_pass]: 4.35e-06 [meta_fg_expand]: 3.45e-06 [flash_sp_send_recv_attached]: 2.66e-06 [receive_attached]: 2.22999e-06 [after_resolve]: 1.328e-05 [a_after_grad]: 1.19e-05 [renormalize]: 0.00080765 [add_forward_monad_depend]: 6.63e-06 [auto_monad_grad]: 2.60997e-06 [auto_monad_eliminator]: 1.612e-05 [cse]: 3.251e-05 [a_3]: 6.035e-05 [Cycle 2]: 0.00081731, [45] [expand_dump_flag]: 1.96e-06 [switch_simplify]: 9.37999e-06 [loop_unroll]: 7.38e-06 [a_1]: 0.00017854 [with_stream_mark]: 1.364e-05 [recompute_prepare]: 7.77998e-06 [updatestate_depend_eliminate]: 3.73001e-06 [updatestate_assign_eliminate]: 3.63e-06 [updatestate_loads_eliminate]: 3.19001e-06 [parameter_eliminate]: 1.37e-06 [a_2]: 9.257e-05 [accelerated_algorithm]: 7.5e-06 [shard]: 1.34e-06 [meta_shard_fg_expand]: 2.31e-06 [shard_inline]: 7.38e-06 [merge_send_recv]: 7.06001e-06 [auto_parallel]: 6.74001e-06 [parallel]: 6.93998e-06 [flash_sp]: 3.81001e-06 [merge_comm]: 4.89e-06 [allreduce_fusion]: 3.77998e-06 [matmul_add_comm_reduction]: 8.05e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 8.50001e-06 [virtual_dataset]: 7.23e-06 [get_grad_eliminate_]: 7.49002e-06 [virtual_output]: 7.07002e-06 [merge_forward]: 3.71999e-06 [cell_reuse_recompute_pass]: 2.01003e-06 [offload_activation]: 9.62999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.496e-05 [merge_recompute_call_nodes]: 1.17e-06 [before_grad]: 1.158e-05 [set_forward_comm_id_for_comm_node_pass]: 4.67e-06 [meta_fg_expand]: 2.87002e-06 [flash_sp_send_recv_attached]: 1.72999e-06 [receive_attached]: 1.71998e-06 [after_resolve]: 1.257e-05 [a_after_grad]: 1.148e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.07999e-06 [auto_monad_grad]: 1.17e-06 [auto_monad_eliminator]: 8.72e-06 [cse]: 1.991e-05 [a_3]: 4.542e-05 [py_interpret_to_execute_after_opt_a]: 1.349e-05 [slice_cell_reuse_recomputed_activation]: 1.92001e-06 [rewriter_after_opt_a]: 3.987e-05 [convert_after_rewriter]: 8.17998e-06 [order_py_execute_after_rewriter]: 6.17999e-06 [mutable_eliminate]: 0.00061065 [opt_b]: 0.00023887, [1] [Cycle 1]: 0.00023256, [7] [b_1]: 0.000153 [b_2]: 9.22001e-06 [updatestate_depend_eliminate]: 6.36e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 2.97002e-06 [renormalize]: 5.50004e-07 [cse]: 2.167e-05 [optimize_parallel_all_gather_comm]: 1.792e-05 [overlap_param_gather]: 1.97001e-06 [cconv]: 2.577e-05 [loop_unroll]: 0.00042762 [opt_after_cconv]: 0.00011418, [1] [Cycle 1]: 0.00010828, [7] [c_1]: 3.769e-05 [parameter_eliminate]: 3.19001e-06 [updatestate_depend_eliminate]: 5.86e-06 [updatestate_assign_eliminate]: 2.99999e-06 [updatestate_loads_eliminate]: 2.74001e-06 [cse]: 2.134e-05 [renormalize]: 4.2998e-07 [remove_dup_value]: 1.407e-05 [tuple_transform]: 8.347e-05, [1] [Cycle 1]: 7.889e-05, [4] [d_1]: 5.105e-05 [none_parameter_eliminate]: 1.68997e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 8.15e-06 [partial_unused_args_eliminate]: 1.89e-06 [add_recomputation]: 5.145e-05 [cse_after_recomputation]: 2.579e-05, [1] [Cycle 1]: 2.098e-05, [1] [cse]: 1.52e-05 [environ_conv]: 5.92999e-06 [swap_dp_allreduce_reducescatter]: 5.72999e-06 [bias_add_comm_swap]: 2.44999e-06 [label_micro_interleaved_index]: 3.97002e-06 [label_fine_grained_interleaved_index]: 2.64999e-06 [merge_cast_opt]: 1.31002e-06 [slice_recompute_activation]: 1.90001e-06 [micro_interleaved_order_control]: 2.73e-06 [assign_add_opt]: 1.16002e-06 [ForceFp32Comm]: 1.25001e-06 [remove_cast_before_assign_add]: 1.44998e-06 [full_micro_interleaved_order_control]: 2.04999e-06 [reorder_send_recv_between_fp_bp]: 2.62001e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.10999e-06 [interleave_parallel_branches]: 1.02998e-06 [overlap_opt_shard_in_pipeline]: 1.19e-06 [overlap_opt_shard_grad_in_pipeline]: 2.01998e-06 [control_data_broadcast_order]: 1.509e-05 [grouped_pairwise_exchange_alltoall]: 1.71e-06 [offloading_packed_experts]: 4.26001e-06 [overlap_recompute_and_grad_model_parallel]: 5.37001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40999e-06 [overlap_recompute_comm]: 2.22001e-06 [overlap_grad_ring_attention]: 4.63999e-06 [overlap_grad_flash_sp]: 2.13e-05 [begin_end_overlap_inline]: 5.90022e-07 [split_matmul_comm_elemetwise]: 2.04e-06 [split_layernorm_comm]: 1.62999e-06 [handle_group_info]: 1.21002e-06 [symbol_engine_optimizer]: 8.264e-05, [1] [Cycle 1]: 7.839e-05, [6] [build]: 3.04999e-06 [elim_shapecalc]: 1.109e-05 [elim_not_effective]: 1.503e-05 [opt_reshape]: 8.42e-06 [fold_const_symbol]: 1.236e-05 [renormalize]: 1.80007e-07 [detach_backward]: 2.17999e-06 [pipeline_parallel_scheduler]: 1.81e-06 [auto_monad_reorder]: 2.006e-05 [get_jit_bprop_graph]: 1.76e-06 [rewriter_after_jit_bprop_graph]: 4.00998e-06 [opt_after_jit_grad]: 0.0004635 [validate]: 3.862e-05 Sums bootstrap : 0.000378s : 3.26% type_inference : 0.006030s : 51.94% event_method : 0.000018s : 0.15% auto_monad : 0.000063s : 0.55% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000035s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000026s : 0.22% optimize.rewriter_before_opt_a : 0.000089s : 0.77% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000050s : 0.43% optimize.opt_a.loop_unroll : 0.000039s : 0.33% optimize.opt_a.a_1 : 0.000930s : 8.01% optimize.opt_a.with_stream_mark : 0.000030s : 0.26% optimize.opt_a.recompute_prepare : 0.000019s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000193s : 1.66% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.13% optimize.opt_a.merge_send_recv : 0.000017s : 0.15% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000027s : 0.23% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.16% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.22% optimize.opt_a.a_after_grad : 0.000023s : 0.20% optimize.opt_a.renormalize : 0.000808s : 6.96% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.21% optimize.opt_a.cse : 0.000052s : 0.45% optimize.opt_a.a_3 : 0.000106s : 0.91% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.34% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000611s : 5.26% optimize.opt_b.b_1 : 0.000153s : 1.32% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000026s : 0.22% optimize.loop_unroll : 0.000428s : 3.68% optimize.opt_after_cconv.c_1 : 0.000038s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000021s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.12% optimize.tuple_transform.d_1 : 0.000051s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000051s : 0.44% optimize.cse_after_recomputation.cse : 0.000015s : 0.13% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.18% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000464s : 3.99% validate : 0.000039s : 0.33% Time group info: ------[substitution.] 0.000218 38 9.41% : 0.000020s : 3: substitution.cast_eliminate 1.09% : 0.000002s : 3: substitution.elim_not_effective 0.85% : 0.000002s : 3: substitution.fold_const_symbol 2.92% : 0.000006s : 5: substitution.graph_param_transform 71.51% : 0.000156s : 4: substitution.inline 2.16% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.68% : 0.000006s : 6: substitution.remove_not_recompute_node 2.45% : 0.000005s : 4: substitution.replace_old_param 6.93% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005973 2 88.35% : 0.005277s : 1: type_inference.infer 11.65% : 0.000696s : 1: type_inference.specialize ------[replace.] 0.000064 8 60.34% : 0.000039s : 4: replace.inline 39.66% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000166 8 92.14% : 0.000153s : 4: match.inline 7.86% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000255 1596 0.97% : 0.000002s : 17: predicate.accumulaten_eliminater 0.68% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 10: predicate.addn_check_dump 0.99% : 0.000003s : 17: predicate.addn_zero_filter 0.87% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.29% : 0.000006s : 27: predicate.arithmetic_simplify 1.25% : 0.000003s : 17: predicate.cast_eliminate 0.66% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.66% : 0.000002s : 10: predicate.depend_value_elim 0.99% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.08% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.95% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 0.32% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.10% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 22: predicate.environ_get_depend_swap 1.75% : 0.000004s : 32: predicate.environ_get_eliminate 1.19% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.41% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.30% : 0.000006s : 25: predicate.float_depend_g_call 0.51% : 0.000001s : 10: predicate.float_environ_get_switch 0.82% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.70% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.62% : 0.000002s : 10: predicate.incorporate_call 0.49% : 0.000001s : 10: predicate.incorporate_call_switch 6.24% : 0.000016s : 72: predicate.inline 0.77% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.78% : 0.000002s : 10: predicate.less_batch_normalization 1.86% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.56% : 0.000007s : 48: predicate.load_eliminater 0.81% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.98% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.76% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.60% : 0.000002s : 10: predicate.merge_addn 0.56% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.88% : 0.000002s : 17: predicate.minmaximum_grad 0.85% : 0.000002s : 5: predicate.mutable_eliminate 0.41% : 0.000001s : 5: predicate.opt_reshape 0.47% : 0.000001s : 5: predicate.parallel_virtual_node 1.74% : 0.000004s : 25: predicate.partial_defer_inline 1.67% : 0.000004s : 26: predicate.partial_eliminate 1.00% : 0.000003s : 17: predicate.print_const_string_wrapper 0.54% : 0.000001s : 10: predicate.reduce_all_const_elim 1.25% : 0.000003s : 17: predicate.reduce_eliminate 2.60% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 10: predicate.remove_not_recompute_node 1.48% : 0.000004s : 31: predicate.replace_applicator 0.41% : 0.000001s : 10: predicate.replace_old_param 0.34% : 0.000001s : 5: predicate.reset_defer_inline 0.98% : 0.000002s : 17: predicate.reshape_eliminate 0.56% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 5: predicate.row_tensor_eliminate 0.74% : 0.000002s : 10: predicate.same_eliminate 0.42% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 10: predicate.shard_identity_eliminate 0.72% : 0.000002s : 10: predicate.special_op_eliminate 0.71% : 0.000002s : 10: predicate.specialize_transform 0.85% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.50% : 0.000004s : 25: predicate.switch_defer_inline 2.09% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.89% : 0.000012s : 76: predicate.switch_simplify 1.10% : 0.000003s : 17: predicate.tile_eliminate 0.98% : 0.000003s : 17: predicate.transpose_eliminate 1.52% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.74% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.59% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.06% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.59% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.27% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.79% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.52% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.17% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 5: predicate.value_based_eliminate 0.68% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.71% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000537 11 52.30% : 0.000281s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.70% : 0.000256s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026970 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.15% : 0.003276s : 1: add_attr 12.11% : 0.003265s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.21% : 0.000055s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.26% : 0.000070s : 1: auto_monad 0.09% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.49% : 0.000403s : 1: bootstrap 0.11% : 0.000029s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.09% : 0.000024s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.61% : 0.000435s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.29% : 0.000619s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 5.36% : 0.001444s : 78: opt.transform.opt_a 0.13% : 0.000036s : 1: opt.transform.opt_after_cconv 0.10% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.49% : 0.000131s : 28: opt.transform.opt_b 0.21% : 0.000057s : 2: opt.transform.opt_trans_graph 0.16% : 0.000043s : 4: opt.transform.symbol_engine_opt 11.80% : 0.003183s : 1: opt_a 0.44% : 0.000118s : 1: opt_after_cconv 1.75% : 0.000472s : 1: opt_after_jit_grad 0.90% : 0.000242s : 1: opt_b 19.96% : 0.005384s : 1: optimize 0.08% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000024s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000039s : 1: pre_auto_parallel 0.11% : 0.000030s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000018s : 1: remove_dup_value 1.67% : 0.000452s : 1: renormalize.infer 1.29% : 0.000347s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000044s : 1: rewriter_after_opt_a 0.34% : 0.000093s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000085s : 1: symbol_engine_optimizer 0.32% : 0.000086s : 1: tuple_transform 22.42% : 0.006045s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:40.496.755 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:40.497.014 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0154772, [21] [bootstrap]: 0.00037247 [type_inference]: 0.00545116 [event_method]: 1.801e-05 [auto_monad]: 6.14e-05 [graph_reusing]: 5.48002e-06 [inline]: 2.52001e-06 [add_attr]: 0.00299796, [1] [add_attr_with_inline]: 0.0029898, [1] [Cycle 1]: 6.614e-05, [2] [tag_attr]: 1.875e-05 [meta_addattr_fg_expand]: 5.77999e-06 [parallel-infer-symbol]: 2.99999e-06 [pre_auto_parallel]: 3.152e-05 [insert-virtual-dataset]: 2.28998e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 1.77999e-06 [pipeline_split]: 1.54e-06 [optimize]: 0.00539368, [53] [py_interpret_to_execute]: 2.725e-05 [rewriter_before_opt_a]: 8.203e-05 [opt_a]: 0.00308612, [2] [Cycle 1]: 0.00224966, [45] [expand_dump_flag]: 3.26999e-06 [switch_simplify]: 4.427e-05 [loop_unroll]: 3.031e-05 [a_1]: 0.00062573 [with_stream_mark]: 1.681e-05 [recompute_prepare]: 1.207e-05 [updatestate_depend_eliminate]: 4.12003e-06 [updatestate_assign_eliminate]: 3.68e-06 [updatestate_loads_eliminate]: 3.53e-06 [parameter_eliminate]: 2.56e-06 [a_2]: 0.00011877 [accelerated_algorithm]: 8.00999e-06 [shard]: 2.04e-06 [meta_shard_fg_expand]: 2.04e-06 [shard_inline]: 7.55998e-06 [merge_send_recv]: 8.94e-06 [auto_parallel]: 7.35e-06 [parallel]: 1.891e-05 [flash_sp]: 8.85001e-06 [merge_comm]: 4.07e-06 [allreduce_fusion]: 3.48999e-06 [matmul_add_comm_reduction]: 9.81998e-06 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 9.42001e-06 [virtual_dataset]: 6.91001e-06 [get_grad_eliminate_]: 6.48e-06 [virtual_output]: 6.50002e-06 [merge_forward]: 3.93999e-06 [cell_reuse_recompute_pass]: 1.36002e-06 [offload_activation]: 1.044e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.613e-05 [merge_recompute_call_nodes]: 1.75001e-06 [before_grad]: 1.177e-05 [set_forward_comm_id_for_comm_node_pass]: 4.43999e-06 [meta_fg_expand]: 2.94999e-06 [flash_sp_send_recv_attached]: 2.61999e-06 [receive_attached]: 2.76e-06 [after_resolve]: 1.201e-05 [a_after_grad]: 1.036e-05 [renormalize]: 0.00068455 [add_forward_monad_depend]: 5.59e-06 [auto_monad_grad]: 2.40002e-06 [auto_monad_eliminator]: 1.619e-05 [cse]: 2.786e-05 [a_3]: 6.371e-05 [Cycle 2]: 0.00082288, [45] [expand_dump_flag]: 1.55001e-06 [switch_simplify]: 7.44002e-06 [loop_unroll]: 6.17999e-06 [a_1]: 0.00012912 [with_stream_mark]: 1.089e-05 [recompute_prepare]: 7.05e-06 [updatestate_depend_eliminate]: 3.11999e-06 [updatestate_assign_eliminate]: 2.54001e-06 [updatestate_loads_eliminate]: 2.04999e-06 [parameter_eliminate]: 1.24998e-06 [a_2]: 0.00010122 [accelerated_algorithm]: 6.31e-06 [shard]: 1.15999e-06 [meta_shard_fg_expand]: 1.45001e-06 [shard_inline]: 6.17999e-06 [merge_send_recv]: 4.39002e-06 [auto_parallel]: 9.08002e-06 [parallel]: 5.24998e-06 [flash_sp]: 3.73999e-06 [merge_comm]: 3.28e-06 [allreduce_fusion]: 3.21001e-06 [matmul_add_comm_reduction]: 6.24999e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 7.11001e-06 [virtual_dataset]: 6.11e-06 [get_grad_eliminate_]: 7.58999e-06 [virtual_output]: 5.82999e-06 [merge_forward]: 3.16001e-06 [cell_reuse_recompute_pass]: 1.39998e-06 [offload_activation]: 6.78e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.578e-05 [merge_recompute_call_nodes]: 9.09989e-07 [before_grad]: 1.06e-05 [set_forward_comm_id_for_comm_node_pass]: 3.66999e-06 [meta_fg_expand]: 2.26e-06 [flash_sp_send_recv_attached]: 9.00007e-07 [receive_attached]: 1.07998e-06 [after_resolve]: 1.051e-05 [a_after_grad]: 9.49e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.60999e-06 [auto_monad_grad]: 1.23002e-06 [auto_monad_eliminator]: 7.03998e-06 [cse]: 1.308e-05 [a_3]: 5.035e-05 [py_interpret_to_execute_after_opt_a]: 1.264e-05 [slice_cell_reuse_recomputed_activation]: 4.90999e-06 [rewriter_after_opt_a]: 3.946e-05 [convert_after_rewriter]: 1.081e-05 [order_py_execute_after_rewriter]: 8.23999e-06 [mutable_eliminate]: 0.00058141 [opt_b]: 0.0002669, [1] [Cycle 1]: 0.00025728, [7] [b_1]: 0.00016653 [b_2]: 7.97e-06 [updatestate_depend_eliminate]: 5.77999e-06 [updatestate_assign_eliminate]: 2.41998e-06 [updatestate_loads_eliminate]: 2.34999e-06 [renormalize]: 6.10016e-07 [cse]: 1.632e-05 [optimize_parallel_all_gather_comm]: 1.959e-05 [overlap_param_gather]: 4.48999e-06 [cconv]: 2.749e-05 [loop_unroll]: 0.00043507 [opt_after_cconv]: 0.00012382, [1] [Cycle 1]: 0.00011543, [7] [c_1]: 3.081e-05 [parameter_eliminate]: 2.36998e-06 [updatestate_depend_eliminate]: 4.93001e-06 [updatestate_assign_eliminate]: 2.76999e-06 [updatestate_loads_eliminate]: 2.29001e-06 [cse]: 1.705e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 1.599e-05 [tuple_transform]: 8.92e-05, [1] [Cycle 1]: 8.223e-05, [4] [d_1]: 4.281e-05 [none_parameter_eliminate]: 1.52001e-06 [renormalize]: 2.29978e-07 [switch_simplify]: 7.3e-06 [partial_unused_args_eliminate]: 4.25e-06 [add_recomputation]: 4.777e-05 [cse_after_recomputation]: 2.656e-05, [1] [Cycle 1]: 1.991e-05, [1] [cse]: 1.114e-05 [environ_conv]: 8.04002e-06 [swap_dp_allreduce_reducescatter]: 8.3e-06 [bias_add_comm_swap]: 5.02e-06 [label_micro_interleaved_index]: 6.96999e-06 [label_fine_grained_interleaved_index]: 5.20001e-06 [merge_cast_opt]: 3.83001e-06 [slice_recompute_activation]: 4.43999e-06 [micro_interleaved_order_control]: 4.45e-06 [assign_add_opt]: 3.87998e-06 [ForceFp32Comm]: 3.23e-06 [remove_cast_before_assign_add]: 3.33998e-06 [full_micro_interleaved_order_control]: 4.38999e-06 [reorder_send_recv_between_fp_bp]: 5.33002e-06 [comm_op_add_attrs]: 3.40998e-06 [add_comm_op_reuse_tag]: 3.62002e-06 [interleave_split_concat_branches]: 3.73001e-06 [interleave_parallel_branches]: 3.31999e-06 [overlap_opt_shard_in_pipeline]: 3.4e-06 [overlap_opt_shard_grad_in_pipeline]: 4.07e-06 [control_data_broadcast_order]: 1.48e-05 [grouped_pairwise_exchange_alltoall]: 4.13999e-06 [offloading_packed_experts]: 6.17999e-06 [overlap_recompute_and_grad_model_parallel]: 7.14001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.6e-06 [overlap_recompute_allgather_and_fa_grad]: 3.78999e-06 [overlap_recompute_comm]: 4.85999e-06 [overlap_grad_ring_attention]: 6.60002e-06 [overlap_grad_flash_sp]: 2.027e-05 [begin_end_overlap_inline]: 3.22002e-06 [split_matmul_comm_elemetwise]: 4.85001e-06 [split_layernorm_comm]: 4.31002e-06 [handle_group_info]: 3.3e-06 [symbol_engine_optimizer]: 9.528e-05, [1] [Cycle 1]: 8.852e-05, [6] [build]: 3.03e-06 [elim_shapecalc]: 1.01e-05 [elim_not_effective]: 1.301e-05 [opt_reshape]: 6.78e-06 [fold_const_symbol]: 1.057e-05 [renormalize]: 3.00002e-07 [detach_backward]: 3.41999e-06 [pipeline_parallel_scheduler]: 2.72001e-06 [auto_monad_reorder]: 1.868e-05 [get_jit_bprop_graph]: 1.39e-06 [rewriter_after_jit_bprop_graph]: 3.74002e-06 [opt_after_jit_grad]: 0.00051944 [validate]: 3.565e-05 Sums bootstrap : 0.000372s : 3.46% type_inference : 0.005451s : 50.60% event_method : 0.000018s : 0.17% auto_monad : 0.000061s : 0.57% graph_reusing : 0.000005s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000032s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000027s : 0.25% optimize.rewriter_before_opt_a : 0.000082s : 0.76% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000052s : 0.48% optimize.opt_a.loop_unroll : 0.000036s : 0.34% optimize.opt_a.a_1 : 0.000755s : 7.01% optimize.opt_a.with_stream_mark : 0.000028s : 0.26% optimize.opt_a.recompute_prepare : 0.000019s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000220s : 2.04% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.13% optimize.opt_a.merge_send_recv : 0.000013s : 0.12% optimize.opt_a.auto_parallel : 0.000016s : 0.15% optimize.opt_a.parallel : 0.000024s : 0.22% optimize.opt_a.flash_sp : 0.000013s : 0.12% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.15% optimize.opt_a.virtual_dataset : 0.000013s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000023s : 0.21% optimize.opt_a.a_after_grad : 0.000020s : 0.18% optimize.opt_a.renormalize : 0.000685s : 6.35% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.22% optimize.opt_a.cse : 0.000041s : 0.38% optimize.opt_a.a_3 : 0.000114s : 1.06% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000039s : 0.37% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000581s : 5.40% optimize.opt_b.b_1 : 0.000167s : 1.55% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000016s : 0.15% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.18% optimize.overlap_param_gather : 0.000004s : 0.04% optimize.cconv : 0.000027s : 0.26% optimize.loop_unroll : 0.000435s : 4.04% optimize.opt_after_cconv.c_1 : 0.000031s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.16% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.15% optimize.tuple_transform.d_1 : 0.000043s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000048s : 0.44% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000008s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000004s : 0.04% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000015s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000006s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000020s : 0.19% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000003s : 0.03% auto_monad_reorder : 0.000019s : 0.17% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000519s : 4.82% validate : 0.000036s : 0.33% Time group info: ------[substitution.] 0.000178 28 1.20% : 0.000002s : 2: substitution.elim_not_effective 0.96% : 0.000002s : 2: substitution.fold_const_symbol 3.37% : 0.000006s : 4: substitution.graph_param_transform 77.92% : 0.000139s : 4: substitution.inline 2.36% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.76% : 0.000005s : 4: substitution.remove_not_recompute_node 2.34% : 0.000004s : 4: substitution.replace_old_param 9.08% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005405 2 87.92% : 0.004752s : 1: type_inference.infer 12.08% : 0.000653s : 1: type_inference.specialize ------[replace.] 0.000059 8 61.50% : 0.000036s : 4: replace.inline 38.50% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000151 8 90.54% : 0.000136s : 4: match.inline 9.46% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000208 1278 0.84% : 0.000002s : 13: predicate.accumulaten_eliminater 0.77% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 0.83% : 0.000002s : 13: predicate.addn_zero_filter 0.80% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.01% : 0.000004s : 21: predicate.arithmetic_simplify 0.92% : 0.000002s : 13: predicate.cast_eliminate 0.67% : 0.000001s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.67% : 0.000001s : 8: predicate.depend_value_elim 0.94% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.12% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.95% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_depend_swap 2.01% : 0.000004s : 25: predicate.environ_get_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.43% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.74% : 0.000006s : 21: predicate.float_depend_g_call 0.57% : 0.000001s : 8: predicate.float_environ_get_switch 0.88% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.64% : 0.000001s : 8: predicate.get_grad_eliminate 0.22% : 0.000000s : 4: predicate.graph_param_transform 0.60% : 0.000001s : 8: predicate.incorporate_call 0.50% : 0.000001s : 8: predicate.incorporate_call_switch 6.21% : 0.000013s : 58: predicate.inline 0.79% : 0.000002s : 8: predicate.inline_without_move 0.43% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.89% : 0.000002s : 8: predicate.less_batch_normalization 1.87% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.45% : 0.000005s : 38: predicate.load_eliminater 0.73% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.33% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.56% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 8: predicate.merge_addn 0.61% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 13: predicate.minmaximum_grad 0.85% : 0.000002s : 4: predicate.mutable_eliminate 0.32% : 0.000001s : 4: predicate.opt_reshape 0.34% : 0.000001s : 4: predicate.parallel_virtual_node 1.81% : 0.000004s : 21: predicate.partial_defer_inline 1.67% : 0.000003s : 21: predicate.partial_eliminate 0.85% : 0.000002s : 13: predicate.print_const_string_wrapper 0.59% : 0.000001s : 8: predicate.reduce_all_const_elim 1.12% : 0.000002s : 13: predicate.reduce_eliminate 2.50% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.70% : 0.000001s : 8: predicate.remove_not_recompute_node 1.34% : 0.000003s : 25: predicate.replace_applicator 0.55% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 0.93% : 0.000002s : 13: predicate.reshape_eliminate 0.62% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 4: predicate.row_tensor_eliminate 0.77% : 0.000002s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.95% : 0.000002s : 8: predicate.shard_identity_eliminate 0.79% : 0.000002s : 8: predicate.special_op_eliminate 0.72% : 0.000002s : 8: predicate.specialize_transform 0.88% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.52% : 0.000003s : 21: predicate.switch_defer_inline 2.12% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.49% : 0.000011s : 67: predicate.switch_simplify 0.89% : 0.000002s : 13: predicate.tile_eliminate 0.97% : 0.000002s : 13: predicate.transpose_eliminate 1.62% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.69% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.42% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.45% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.11% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.77% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.44% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.07% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 4: predicate.value_based_eliminate 0.66% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 8: predicate.virtual_output_eliminate 0.34% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000537 11 50.28% : 0.000270s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.72% : 0.000267s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025895 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.61% : 0.003006s : 1: add_attr 11.56% : 0.002993s : 1: add_attr_with_inline 0.03% : 0.000007s : 1: add_comm_op_reuse_tag 0.20% : 0.000051s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000070s : 1: auto_monad 0.10% : 0.000026s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.58% : 0.000408s : 1: bootstrap 0.12% : 0.000031s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.11% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000018s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.11% : 0.000028s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.70% : 0.000441s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.27% : 0.000588s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.05% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000014s : 1: opt.transform.mutable_eliminate 4.63% : 0.001198s : 78: opt.transform.opt_a 0.11% : 0.000029s : 1: opt.transform.opt_after_cconv 0.11% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000103s : 28: opt.transform.opt_b 0.18% : 0.000048s : 2: opt.transform.opt_trans_graph 0.14% : 0.000037s : 4: opt.transform.symbol_engine_opt 11.93% : 0.003089s : 1: opt_a 0.49% : 0.000127s : 1: opt_after_cconv 2.05% : 0.000531s : 1: opt_after_jit_grad 1.04% : 0.000270s : 1: opt_b 22.05% : 0.005710s : 1: optimize 0.09% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000039s : 1: pre_auto_parallel 0.12% : 0.000031s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.31% : 0.000339s : 1: renormalize.infer 1.30% : 0.000337s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000043s : 1: rewriter_after_opt_a 0.33% : 0.000086s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000098s : 1: symbol_engine_optimizer 0.36% : 0.000092s : 1: tuple_transform 21.18% : 0.005484s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:40.770.305 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0149519, [21] [bootstrap]: 0.00039446 [type_inference]: 0.00574212 [event_method]: 1.835e-05 [auto_monad]: 6.203e-05 [graph_reusing]: 5.80002e-06 [inline]: 2.26e-06 [add_attr]: 0.00328281, [1] [add_attr_with_inline]: 0.00327272, [1] [Cycle 1]: 6.499e-05, [2] [tag_attr]: 2.005e-05 [meta_addattr_fg_expand]: 6.19999e-06 [parallel-infer-symbol]: 3.58e-06 [pre_auto_parallel]: 3.227e-05 [insert-virtual-dataset]: 2.47001e-06 [parallel-infer-symbol-second]: 8.89995e-07 [dataset_repeat_opt]: 2.01998e-06 [pipeline_split]: 1.74998e-06 [optimize]: 0.00470335, [53] [py_interpret_to_execute]: 2.402e-05 [rewriter_before_opt_a]: 8.32e-05 [opt_a]: 0.00267409, [2] [Cycle 1]: 0.00201749, [45] [expand_dump_flag]: 3.46999e-06 [switch_simplify]: 4.64e-05 [loop_unroll]: 3.109e-05 [a_1]: 0.00065119 [with_stream_mark]: 1.583e-05 [recompute_prepare]: 9.69e-06 [updatestate_depend_eliminate]: 4.05e-06 [updatestate_assign_eliminate]: 3.51001e-06 [updatestate_loads_eliminate]: 3.45e-06 [parameter_eliminate]: 2.00002e-06 [a_2]: 8.648e-05 [accelerated_algorithm]: 8.40001e-06 [shard]: 1.94e-06 [meta_shard_fg_expand]: 1.89e-06 [shard_inline]: 7.41001e-06 [merge_send_recv]: 9.00999e-06 [auto_parallel]: 6.53e-06 [parallel]: 1.818e-05 [flash_sp]: 8.59e-06 [merge_comm]: 4.87e-06 [allreduce_fusion]: 3.86999e-06 [matmul_add_comm_reduction]: 9.42001e-06 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 8.62e-06 [virtual_dataset]: 7.21001e-06 [get_grad_eliminate_]: 6.53e-06 [virtual_output]: 6.61e-06 [merge_forward]: 3.56999e-06 [cell_reuse_recompute_pass]: 1.50999e-06 [offload_activation]: 9.89001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.264e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.058e-05 [set_forward_comm_id_for_comm_node_pass]: 3.82002e-06 [meta_fg_expand]: 3.29001e-06 [flash_sp_send_recv_attached]: 2.97002e-06 [receive_attached]: 2.32999e-06 [after_resolve]: 1.342e-05 [a_after_grad]: 1.07e-05 [renormalize]: 0.00059148 [add_forward_monad_depend]: 5.59e-06 [auto_monad_grad]: 2.26e-06 [auto_monad_eliminator]: 1.538e-05 [cse]: 3.22e-05 [a_3]: 4.925e-05 [Cycle 2]: 0.00064675, [45] [expand_dump_flag]: 1.29e-06 [switch_simplify]: 7.5e-06 [loop_unroll]: 6.12999e-06 [a_1]: 0.00013022 [with_stream_mark]: 1.21e-05 [recompute_prepare]: 6.24999e-06 [updatestate_depend_eliminate]: 2.66999e-06 [updatestate_assign_eliminate]: 2.31e-06 [updatestate_loads_eliminate]: 2.31e-06 [parameter_eliminate]: 1.04e-06 [a_2]: 7.146e-05 [accelerated_algorithm]: 6.24001e-06 [shard]: 1.50999e-06 [meta_shard_fg_expand]: 1.42e-06 [shard_inline]: 6.01e-06 [merge_send_recv]: 5.09e-06 [auto_parallel]: 5.24998e-06 [parallel]: 4.82e-06 [flash_sp]: 3.88999e-06 [merge_comm]: 3.32002e-06 [allreduce_fusion]: 3.04001e-06 [matmul_add_comm_reduction]: 8.47998e-06 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 7e-06 [virtual_dataset]: 5.74999e-06 [get_grad_eliminate_]: 5.56998e-06 [virtual_output]: 5.67999e-06 [merge_forward]: 3.08e-06 [cell_reuse_recompute_pass]: 1.44998e-06 [offload_activation]: 6.84001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.364e-05 [merge_recompute_call_nodes]: 9.79984e-07 [before_grad]: 9.14e-06 [set_forward_comm_id_for_comm_node_pass]: 3.2e-06 [meta_fg_expand]: 2.15002e-06 [flash_sp_send_recv_attached]: 1.16997e-06 [receive_attached]: 1.15001e-06 [after_resolve]: 1.02e-05 [a_after_grad]: 9.04e-06 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.45001e-06 [auto_monad_grad]: 1.29e-06 [auto_monad_eliminator]: 7.26001e-06 [cse]: 1.376e-05 [a_3]: 3.75e-05 [py_interpret_to_execute_after_opt_a]: 9.73002e-06 [slice_cell_reuse_recomputed_activation]: 1.79e-06 [rewriter_after_opt_a]: 3.398e-05 [convert_after_rewriter]: 7.14001e-06 [order_py_execute_after_rewriter]: 5.39998e-06 [mutable_eliminate]: 0.00053523 [opt_b]: 0.00020106, [1] [Cycle 1]: 0.00019479, [7] [b_1]: 0.00012313 [b_2]: 8.25999e-06 [updatestate_depend_eliminate]: 5.40999e-06 [updatestate_assign_eliminate]: 2.32001e-06 [updatestate_loads_eliminate]: 2.24999e-06 [renormalize]: 5.69999e-07 [cse]: 1.75e-05 [optimize_parallel_all_gather_comm]: 1.645e-05 [overlap_param_gather]: 2.61e-06 [cconv]: 2.535e-05 [loop_unroll]: 0.00043494 [opt_after_cconv]: 0.00010019, [1] [Cycle 1]: 9.464e-05, [7] [c_1]: 3.133e-05 [parameter_eliminate]: 2.46e-06 [updatestate_depend_eliminate]: 5.07999e-06 [updatestate_assign_eliminate]: 2.49001e-06 [updatestate_loads_eliminate]: 2.25002e-06 [cse]: 1.682e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.324e-05 [tuple_transform]: 7.426e-05, [1] [Cycle 1]: 6.989e-05, [4] [d_1]: 4.281e-05 [none_parameter_eliminate]: 1.69998e-06 [renormalize]: 2.40019e-07 [switch_simplify]: 6.89001e-06 [partial_unused_args_eliminate]: 1.69998e-06 [add_recomputation]: 4.862e-05 [cse_after_recomputation]: 2.129e-05, [1] [Cycle 1]: 1.655e-05, [1] [cse]: 1.096e-05 [environ_conv]: 5.67001e-06 [swap_dp_allreduce_reducescatter]: 4.84e-06 [bias_add_comm_swap]: 2.51998e-06 [label_micro_interleaved_index]: 4.53001e-06 [label_fine_grained_interleaved_index]: 2.99999e-06 [merge_cast_opt]: 1.32999e-06 [slice_recompute_activation]: 1.99e-06 [micro_interleaved_order_control]: 2.24999e-06 [assign_add_opt]: 1.34e-06 [ForceFp32Comm]: 7.79983e-07 [remove_cast_before_assign_add]: 1.10999e-06 [full_micro_interleaved_order_control]: 2.22001e-06 [reorder_send_recv_between_fp_bp]: 2.65002e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.01997e-06 [overlap_opt_shard_in_pipeline]: 1.14e-06 [overlap_opt_shard_grad_in_pipeline]: 2.24001e-06 [control_data_broadcast_order]: 1.173e-05 [grouped_pairwise_exchange_alltoall]: 1.47001e-06 [offloading_packed_experts]: 3.67998e-06 [overlap_recompute_and_grad_model_parallel]: 4.74998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.39998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.54999e-06 [overlap_grad_ring_attention]: 4.25e-06 [overlap_grad_flash_sp]: 1.985e-05 [begin_end_overlap_inline]: 9.20001e-07 [split_matmul_comm_elemetwise]: 2.29001e-06 [split_layernorm_comm]: 1.57999e-06 [handle_group_info]: 9.20001e-07 [symbol_engine_optimizer]: 7.576e-05, [1] [Cycle 1]: 7.164e-05, [6] [build]: 2.84999e-06 [elim_shapecalc]: 1.025e-05 [elim_not_effective]: 1.301e-05 [opt_reshape]: 7.14001e-06 [fold_const_symbol]: 1.029e-05 [renormalize]: 2.89991e-07 [detach_backward]: 2.16e-06 [pipeline_parallel_scheduler]: 1.76998e-06 [auto_monad_reorder]: 1.589e-05 [get_jit_bprop_graph]: 1.59998e-06 [rewriter_after_jit_bprop_graph]: 3.66999e-06 [opt_after_jit_grad]: 0.00048406 [validate]: 3.837e-05 Sums bootstrap : 0.000394s : 3.69% type_inference : 0.005742s : 53.71% event_method : 0.000018s : 0.17% auto_monad : 0.000062s : 0.58% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000032s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000024s : 0.22% optimize.rewriter_before_opt_a : 0.000083s : 0.78% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000054s : 0.50% optimize.opt_a.loop_unroll : 0.000037s : 0.35% optimize.opt_a.a_1 : 0.000781s : 7.31% optimize.opt_a.with_stream_mark : 0.000028s : 0.26% optimize.opt_a.recompute_prepare : 0.000016s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000158s : 1.48% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.13% optimize.opt_a.auto_parallel : 0.000012s : 0.11% optimize.opt_a.parallel : 0.000023s : 0.22% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.15% optimize.opt_a.virtual_dataset : 0.000013s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.22% optimize.opt_a.a_after_grad : 0.000020s : 0.18% optimize.opt_a.renormalize : 0.000592s : 5.53% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.21% optimize.opt_a.cse : 0.000046s : 0.43% optimize.opt_a.a_3 : 0.000087s : 0.81% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000034s : 0.32% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000535s : 5.01% optimize.opt_b.b_1 : 0.000123s : 1.15% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000017s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.15% optimize.overlap_param_gather : 0.000003s : 0.02% optimize.cconv : 0.000025s : 0.24% optimize.loop_unroll : 0.000435s : 4.07% optimize.opt_after_cconv.c_1 : 0.000031s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.16% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.12% optimize.tuple_transform.d_1 : 0.000043s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000049s : 0.45% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000020s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000016s : 0.15% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000484s : 4.53% validate : 0.000038s : 0.36% Time group info: ------[substitution.] 0.000178 28 1.06% : 0.000002s : 2: substitution.elim_not_effective 0.79% : 0.000001s : 2: substitution.fold_const_symbol 3.26% : 0.000006s : 4: substitution.graph_param_transform 79.34% : 0.000141s : 4: substitution.inline 1.88% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.60% : 0.000005s : 4: substitution.remove_not_recompute_node 2.59% : 0.000005s : 4: substitution.replace_old_param 8.48% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005684 2 88.10% : 0.005008s : 1: type_inference.infer 11.90% : 0.000676s : 1: type_inference.specialize ------[replace.] 0.000063 8 61.72% : 0.000039s : 4: replace.inline 38.28% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000151 8 91.40% : 0.000138s : 4: match.inline 8.60% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000211 1278 1.01% : 0.000002s : 13: predicate.accumulaten_eliminater 0.81% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 8: predicate.addn_check_dump 1.06% : 0.000002s : 13: predicate.addn_zero_filter 0.89% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.09% : 0.000004s : 21: predicate.arithmetic_simplify 1.01% : 0.000002s : 13: predicate.cast_eliminate 0.58% : 0.000001s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 1.00% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.97% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_depend_swap 1.71% : 0.000004s : 25: predicate.environ_get_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.52% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.43% : 0.000005s : 21: predicate.float_depend_g_call 0.55% : 0.000001s : 8: predicate.float_environ_get_switch 0.89% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.62% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.61% : 0.000001s : 8: predicate.incorporate_call 0.50% : 0.000001s : 8: predicate.incorporate_call_switch 6.05% : 0.000013s : 58: predicate.inline 0.70% : 0.000001s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 8: predicate.less_batch_normalization 1.85% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.74% : 0.000006s : 38: predicate.load_eliminater 0.86% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.27% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.53% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.53% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 13: predicate.minmaximum_grad 1.00% : 0.000002s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.37% : 0.000001s : 4: predicate.parallel_virtual_node 1.90% : 0.000004s : 21: predicate.partial_defer_inline 1.60% : 0.000003s : 21: predicate.partial_eliminate 0.91% : 0.000002s : 13: predicate.print_const_string_wrapper 0.59% : 0.000001s : 8: predicate.reduce_all_const_elim 1.42% : 0.000003s : 13: predicate.reduce_eliminate 2.43% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 8: predicate.remove_not_recompute_node 1.38% : 0.000003s : 25: predicate.replace_applicator 0.53% : 0.000001s : 8: predicate.replace_old_param 0.26% : 0.000001s : 4: predicate.reset_defer_inline 1.02% : 0.000002s : 13: predicate.reshape_eliminate 0.57% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 4: predicate.row_tensor_eliminate 0.83% : 0.000002s : 8: predicate.same_eliminate 0.46% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.85% : 0.000002s : 8: predicate.shard_identity_eliminate 0.65% : 0.000001s : 8: predicate.special_op_eliminate 0.69% : 0.000001s : 8: predicate.specialize_transform 0.74% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.61% : 0.000003s : 21: predicate.switch_defer_inline 2.17% : 0.000005s : 29: predicate.switch_layer_defer_inline 5.48% : 0.000012s : 67: predicate.switch_simplify 0.99% : 0.000002s : 13: predicate.tile_eliminate 0.97% : 0.000002s : 13: predicate.transpose_eliminate 1.45% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.77% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.19% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.47% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.75% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.42% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.24% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.59% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.31% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000490 11 54.48% : 0.000267s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.52% : 0.000223s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024882 192 0.01% : 0.000003s : 1: ForceFp32Comm 13.21% : 0.003288s : 1: add_attr 13.17% : 0.003277s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.21% : 0.000053s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000067s : 1: auto_monad 0.08% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.70% : 0.000422s : 1: bootstrap 0.12% : 0.000029s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000015s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.10% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.78% : 0.000443s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.18% : 0.000543s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 4.87% : 0.001211s : 78: opt.transform.opt_a 0.12% : 0.000030s : 1: opt.transform.opt_after_cconv 0.11% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000100s : 28: opt.transform.opt_b 0.19% : 0.000048s : 2: opt.transform.opt_trans_graph 0.15% : 0.000037s : 4: opt.transform.symbol_engine_opt 10.76% : 0.002677s : 1: opt_a 0.42% : 0.000104s : 1: opt_after_cconv 1.98% : 0.000493s : 1: opt_after_jit_grad 0.82% : 0.000204s : 1: opt_b 18.92% : 0.004708s : 1: optimize 0.08% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000036s : 1: pre_auto_parallel 0.11% : 0.000028s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000017s : 1: remove_dup_value 1.19% : 0.000297s : 1: renormalize.infer 1.15% : 0.000286s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000038s : 1: rewriter_after_opt_a 0.35% : 0.000088s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000079s : 1: symbol_engine_optimizer 0.31% : 0.000077s : 1: tuple_transform 23.15% : 0.005760s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:40.985.827 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:40.986.125 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0162193, [21] [bootstrap]: 0.00036229 [type_inference]: 0.00556481 [event_method]: 1.904e-05 [auto_monad]: 6.198e-05 [graph_reusing]: 6.23002e-06 [inline]: 1.97999e-06 [add_attr]: 0.00311741, [1] [add_attr_with_inline]: 0.00310866, [1] [Cycle 1]: 8.624e-05, [2] [tag_attr]: 1.865e-05 [meta_addattr_fg_expand]: 5.92001e-06 [parallel-infer-symbol]: 3.10998e-06 [pre_auto_parallel]: 3.419e-05 [insert-virtual-dataset]: 2.73e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 2.06e-06 [pipeline_split]: 1.88002e-06 [optimize]: 0.00588111, [53] [py_interpret_to_execute]: 2.985e-05 [rewriter_before_opt_a]: 9.469e-05 [opt_a]: 0.00337467, [2] [Cycle 1]: 0.0024006, [45] [expand_dump_flag]: 3.17002e-06 [switch_simplify]: 4.679e-05 [loop_unroll]: 3.436e-05 [a_1]: 0.00076621 [with_stream_mark]: 1.435e-05 [recompute_prepare]: 1.014e-05 [updatestate_depend_eliminate]: 4.57e-06 [updatestate_assign_eliminate]: 3.83999e-06 [updatestate_loads_eliminate]: 3.81999e-06 [parameter_eliminate]: 1.96e-06 [a_2]: 0.00013198 [accelerated_algorithm]: 9.34998e-06 [shard]: 2.00002e-06 [meta_shard_fg_expand]: 2.22001e-06 [shard_inline]: 7.59002e-06 [merge_send_recv]: 1.007e-05 [auto_parallel]: 8.01001e-06 [parallel]: 1.87e-05 [flash_sp]: 8.72e-06 [merge_comm]: 4.95001e-06 [allreduce_fusion]: 4.62998e-06 [matmul_add_comm_reduction]: 1.089e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 9.93002e-06 [virtual_dataset]: 7.99002e-06 [get_grad_eliminate_]: 7.5e-06 [virtual_output]: 7.89002e-06 [merge_forward]: 4.30999e-06 [cell_reuse_recompute_pass]: 1.55001e-06 [offload_activation]: 1.113e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.645e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.272e-05 [set_forward_comm_id_for_comm_node_pass]: 4.38001e-06 [meta_fg_expand]: 3.48999e-06 [flash_sp_send_recv_attached]: 2.72001e-06 [receive_attached]: 2.29001e-06 [after_resolve]: 1.29e-05 [a_after_grad]: 1.2e-05 [renormalize]: 0.00065257 [add_forward_monad_depend]: 5.40999e-06 [auto_monad_grad]: 2.34999e-06 [auto_monad_eliminator]: 1.842e-05 [cse]: 3.194e-05 [a_3]: 7.381e-05 [Cycle 2]: 0.00096004, [45] [expand_dump_flag]: 1.20999e-06 [switch_simplify]: 9.57999e-06 [loop_unroll]: 7.56001e-06 [a_1]: 0.00017663 [with_stream_mark]: 1.26e-05 [recompute_prepare]: 8.72e-06 [updatestate_depend_eliminate]: 4.03999e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 3.11999e-06 [parameter_eliminate]: 1.17e-06 [a_2]: 0.00011844 [accelerated_algorithm]: 9.00001e-06 [shard]: 1.17999e-06 [meta_shard_fg_expand]: 1.71e-06 [shard_inline]: 7.21001e-06 [merge_send_recv]: 6.39001e-06 [auto_parallel]: 6.64999e-06 [parallel]: 7.77998e-06 [flash_sp]: 4.02e-06 [merge_comm]: 4.4e-06 [allreduce_fusion]: 4.4e-06 [matmul_add_comm_reduction]: 7.25e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 8.76997e-06 [virtual_dataset]: 7.56999e-06 [get_grad_eliminate_]: 6.83e-06 [virtual_output]: 1.049e-05 [merge_forward]: 4.28999e-06 [cell_reuse_recompute_pass]: 1.47999e-06 [offload_activation]: 9.57001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.85e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 1.246e-05 [set_forward_comm_id_for_comm_node_pass]: 5.02999e-06 [meta_fg_expand]: 3.06999e-06 [flash_sp_send_recv_attached]: 1.67999e-06 [receive_attached]: 1.10999e-06 [after_resolve]: 1.235e-05 [a_after_grad]: 1.117e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.88002e-06 [auto_monad_grad]: 1.70001e-06 [auto_monad_eliminator]: 1.122e-05 [cse]: 2.037e-05 [a_3]: 5.814e-05 [py_interpret_to_execute_after_opt_a]: 1.557e-05 [slice_cell_reuse_recomputed_activation]: 5.15999e-06 [rewriter_after_opt_a]: 4.654e-05 [convert_after_rewriter]: 1.118e-05 [order_py_execute_after_rewriter]: 9.39e-06 [mutable_eliminate]: 0.00061513 [opt_b]: 0.00031303, [1] [Cycle 1]: 0.0003031, [7] [b_1]: 0.00019872 [b_2]: 9.86998e-06 [updatestate_depend_eliminate]: 7.17002e-06 [updatestate_assign_eliminate]: 2.92002e-06 [updatestate_loads_eliminate]: 3.14001e-06 [renormalize]: 6.30011e-07 [cse]: 2.503e-05 [optimize_parallel_all_gather_comm]: 2.131e-05 [overlap_param_gather]: 5.19e-06 [cconv]: 2.973e-05 [loop_unroll]: 0.00045463 [opt_after_cconv]: 0.00013779, [1] [Cycle 1]: 0.00012863, [7] [c_1]: 3.771e-05 [parameter_eliminate]: 2.01998e-06 [updatestate_depend_eliminate]: 4.94e-06 [updatestate_assign_eliminate]: 3.18998e-06 [updatestate_loads_eliminate]: 3.11001e-06 [cse]: 2.124e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 1.882e-05 [tuple_transform]: 0.00010103, [1] [Cycle 1]: 9.394e-05, [4] [d_1]: 5.245e-05 [none_parameter_eliminate]: 1.61998e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 8.55999e-06 [partial_unused_args_eliminate]: 4.80001e-06 [add_recomputation]: 5.95e-05 [cse_after_recomputation]: 3.146e-05, [1] [Cycle 1]: 2.45e-05, [1] [cse]: 1.529e-05 [environ_conv]: 9.46e-06 [swap_dp_allreduce_reducescatter]: 8.3e-06 [bias_add_comm_swap]: 4.79e-06 [label_micro_interleaved_index]: 7.18998e-06 [label_fine_grained_interleaved_index]: 5.10999e-06 [merge_cast_opt]: 3.70998e-06 [slice_recompute_activation]: 4.89998e-06 [micro_interleaved_order_control]: 4.55001e-06 [assign_add_opt]: 3.86001e-06 [ForceFp32Comm]: 3.31999e-06 [remove_cast_before_assign_add]: 3.68999e-06 [full_micro_interleaved_order_control]: 4.68001e-06 [reorder_send_recv_between_fp_bp]: 5.43002e-06 [comm_op_add_attrs]: 3.33e-06 [add_comm_op_reuse_tag]: 3.26001e-06 [interleave_split_concat_branches]: 3.45003e-06 [interleave_parallel_branches]: 3.36999e-06 [overlap_opt_shard_in_pipeline]: 3.36001e-06 [overlap_opt_shard_grad_in_pipeline]: 3.96001e-06 [control_data_broadcast_order]: 1.686e-05 [grouped_pairwise_exchange_alltoall]: 4.05998e-06 [offloading_packed_experts]: 6.79999e-06 [overlap_recompute_and_grad_model_parallel]: 8.3e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.63999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.69002e-06 [overlap_recompute_comm]: 4.57998e-06 [overlap_grad_ring_attention]: 6.99001e-06 [overlap_grad_flash_sp]: 2.476e-05 [begin_end_overlap_inline]: 2.90998e-06 [split_matmul_comm_elemetwise]: 4.71002e-06 [split_layernorm_comm]: 4.07e-06 [handle_group_info]: 3.79002e-06 [symbol_engine_optimizer]: 0.00010516, [1] [Cycle 1]: 9.871e-05, [6] [build]: 3.31999e-06 [elim_shapecalc]: 1.196e-05 [elim_not_effective]: 1.567e-05 [opt_reshape]: 8.47e-06 [fold_const_symbol]: 1.278e-05 [renormalize]: 2.19996e-07 [detach_backward]: 3.66001e-06 [pipeline_parallel_scheduler]: 1.76998e-06 [auto_monad_reorder]: 2.219e-05 [get_jit_bprop_graph]: 1.39e-06 [rewriter_after_jit_bprop_graph]: 3.61999e-06 [opt_after_jit_grad]: 0.00049007 [validate]: 4.023e-05 Sums bootstrap : 0.000362s : 3.20% type_inference : 0.005565s : 49.10% event_method : 0.000019s : 0.17% auto_monad : 0.000062s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.30% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000030s : 0.26% optimize.rewriter_before_opt_a : 0.000095s : 0.84% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000056s : 0.50% optimize.opt_a.loop_unroll : 0.000042s : 0.37% optimize.opt_a.a_1 : 0.000943s : 8.32% optimize.opt_a.with_stream_mark : 0.000027s : 0.24% optimize.opt_a.recompute_prepare : 0.000019s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000250s : 2.21% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.16% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.13% optimize.opt_a.merge_send_recv : 0.000016s : 0.15% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000026s : 0.23% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.17% optimize.opt_a.virtual_dataset : 0.000016s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000018s : 0.16% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.22% optimize.opt_a.a_after_grad : 0.000023s : 0.20% optimize.opt_a.renormalize : 0.000653s : 5.76% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.26% optimize.opt_a.cse : 0.000052s : 0.46% optimize.opt_a.a_3 : 0.000132s : 1.16% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000047s : 0.41% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000615s : 5.43% optimize.opt_b.b_1 : 0.000199s : 1.75% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000025s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000030s : 0.26% optimize.loop_unroll : 0.000455s : 4.01% optimize.opt_after_cconv.c_1 : 0.000038s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000021s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.17% optimize.tuple_transform.d_1 : 0.000052s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000060s : 0.53% optimize.cse_after_recomputation.cse : 0.000015s : 0.13% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000017s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000025s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000022s : 0.20% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000490s : 4.32% validate : 0.000040s : 0.35% Time group info: ------[substitution.] 0.000200 38 10.85% : 0.000022s : 3: substitution.cast_eliminate 1.24% : 0.000002s : 3: substitution.elim_not_effective 0.82% : 0.000002s : 3: substitution.fold_const_symbol 3.43% : 0.000007s : 5: substitution.graph_param_transform 67.42% : 0.000135s : 4: substitution.inline 2.31% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.36% : 0.000007s : 6: substitution.remove_not_recompute_node 2.63% : 0.000005s : 4: substitution.replace_old_param 7.95% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005514 2 87.27% : 0.004812s : 1: type_inference.infer 12.73% : 0.000702s : 1: type_inference.specialize ------[replace.] 0.000062 8 57.49% : 0.000035s : 4: replace.inline 42.51% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000146 8 90.50% : 0.000132s : 4: match.inline 9.50% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000267 1596 0.98% : 0.000003s : 17: predicate.accumulaten_eliminater 0.65% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 10: predicate.addn_check_dump 0.99% : 0.000003s : 17: predicate.addn_zero_filter 0.86% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.13% : 0.000006s : 27: predicate.arithmetic_simplify 1.03% : 0.000003s : 17: predicate.cast_eliminate 0.58% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.69% : 0.000002s : 10: predicate.depend_value_elim 1.05% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.11% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.85% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 5: predicate.elim_not_effective 0.37% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.23% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.21% : 0.000003s : 22: predicate.environ_get_depend_swap 1.77% : 0.000005s : 32: predicate.environ_get_eliminate 1.15% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.49% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.20% : 0.000006s : 25: predicate.float_depend_g_call 0.53% : 0.000001s : 10: predicate.float_environ_get_switch 0.79% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.67% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.58% : 0.000002s : 10: predicate.incorporate_call 0.59% : 0.000002s : 10: predicate.incorporate_call_switch 6.06% : 0.000016s : 72: predicate.inline 0.73% : 0.000002s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 10: predicate.less_batch_normalization 1.86% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.58% : 0.000007s : 48: predicate.load_eliminater 0.69% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.06% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.67% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.57% : 0.000002s : 10: predicate.merge_addn 0.56% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.65% : 0.000002s : 10: predicate.mini_step_allgather_replace 1.01% : 0.000003s : 17: predicate.minmaximum_grad 0.94% : 0.000002s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.31% : 0.000001s : 5: predicate.parallel_virtual_node 1.63% : 0.000004s : 25: predicate.partial_defer_inline 1.66% : 0.000004s : 26: predicate.partial_eliminate 1.01% : 0.000003s : 17: predicate.print_const_string_wrapper 0.60% : 0.000002s : 10: predicate.reduce_all_const_elim 1.31% : 0.000003s : 17: predicate.reduce_eliminate 2.59% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 10: predicate.remove_not_recompute_node 1.28% : 0.000003s : 31: predicate.replace_applicator 0.48% : 0.000001s : 10: predicate.replace_old_param 0.23% : 0.000001s : 5: predicate.reset_defer_inline 1.11% : 0.000003s : 17: predicate.reshape_eliminate 0.65% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.85% : 0.000002s : 10: predicate.same_eliminate 0.43% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.66% : 0.000002s : 10: predicate.shard_identity_eliminate 0.65% : 0.000002s : 10: predicate.special_op_eliminate 0.67% : 0.000002s : 10: predicate.specialize_transform 0.84% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.58% : 0.000004s : 25: predicate.switch_defer_inline 2.12% : 0.000006s : 35: predicate.switch_layer_defer_inline 5.04% : 0.000013s : 76: predicate.switch_simplify 0.98% : 0.000003s : 17: predicate.tile_eliminate 1.07% : 0.000003s : 17: predicate.transpose_eliminate 1.60% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.70% : 0.000005s : 27: predicate.tuple_list_get_set_item_eliminator 2.39% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.78% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.48% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.22% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 5: predicate.value_based_eliminate 0.75% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.71% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000473 11 51.71% : 0.000244s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.29% : 0.000228s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027551 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.35% : 0.003126s : 1: add_attr 11.30% : 0.003112s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000063s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.26% : 0.000072s : 1: auto_monad 0.11% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.45% : 0.000399s : 1: bootstrap 0.12% : 0.000033s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000017s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000029s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.67% : 0.000460s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.26% : 0.000622s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 5.35% : 0.001475s : 78: opt.transform.opt_a 0.13% : 0.000036s : 1: opt.transform.opt_after_cconv 0.10% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.49% : 0.000135s : 28: opt.transform.opt_b 0.21% : 0.000059s : 2: opt.transform.opt_trans_graph 0.17% : 0.000046s : 4: opt.transform.symbol_engine_opt 12.26% : 0.003378s : 1: opt_a 0.51% : 0.000141s : 1: opt_after_cconv 1.81% : 0.000500s : 1: opt_after_jit_grad 1.15% : 0.000316s : 1: opt_b 22.66% : 0.006244s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000008s : 1: pipeline_split 0.15% : 0.000041s : 1: pre_auto_parallel 0.12% : 0.000034s : 1: py_interpret_to_execute 0.07% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.22% : 0.000337s : 1: renormalize.infer 1.11% : 0.000306s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000050s : 1: rewriter_after_opt_a 0.36% : 0.000099s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000108s : 1: symbol_engine_optimizer 0.38% : 0.000104s : 1: tuple_transform 20.34% : 0.005604s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:41.202.673 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0173951, [21] [bootstrap]: 0.00041524 [type_inference]: 0.00671619 [event_method]: 2.14e-05 [auto_monad]: 6.877e-05 [graph_reusing]: 6.12001e-06 [inline]: 3.30003e-06 [add_attr]: 0.00351882, [1] [add_attr_with_inline]: 0.00350813, [1] [Cycle 1]: 6.451e-05, [2] [tag_attr]: 2.145e-05 [meta_addattr_fg_expand]: 5.85002e-06 [parallel-infer-symbol]: 3.53e-06 [pre_auto_parallel]: 3.566e-05 [insert-virtual-dataset]: 2.47001e-06 [parallel-infer-symbol-second]: 9.50007e-07 [dataset_repeat_opt]: 2.19001e-06 [pipeline_split]: 1.81e-06 [optimize]: 0.00584717, [53] [py_interpret_to_execute]: 2.643e-05 [rewriter_before_opt_a]: 9.134e-05 [opt_a]: 0.00334638, [2] [Cycle 1]: 0.00247399, [45] [expand_dump_flag]: 2.98e-06 [switch_simplify]: 4.457e-05 [loop_unroll]: 3.145e-05 [a_1]: 0.00077354 [with_stream_mark]: 1.832e-05 [recompute_prepare]: 1.296e-05 [updatestate_depend_eliminate]: 4.94e-06 [updatestate_assign_eliminate]: 3.91999e-06 [updatestate_loads_eliminate]: 3.94002e-06 [parameter_eliminate]: 1.94e-06 [a_2]: 0.0001048 [accelerated_algorithm]: 9.70002e-06 [shard]: 1.78002e-06 [meta_shard_fg_expand]: 2.26998e-06 [shard_inline]: 8.70999e-06 [merge_send_recv]: 1.094e-05 [auto_parallel]: 8.49998e-06 [parallel]: 2.079e-05 [flash_sp]: 1.153e-05 [merge_comm]: 4.70001e-06 [allreduce_fusion]: 4.72998e-06 [matmul_add_comm_reduction]: 1.142e-05 [allreduce_slice_to_reducescatter]: 8.50006e-07 [virtual_shard_identity]: 1.301e-05 [virtual_dataset]: 8.99e-06 [get_grad_eliminate_]: 8.02998e-06 [virtual_output]: 8.26002e-06 [merge_forward]: 4.35999e-06 [cell_reuse_recompute_pass]: 1.64e-06 [offload_activation]: 1.235e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.698e-05 [merge_recompute_call_nodes]: 1.74e-06 [before_grad]: 1.412e-05 [set_forward_comm_id_for_comm_node_pass]: 4.79e-06 [meta_fg_expand]: 3.27002e-06 [flash_sp_send_recv_attached]: 2.51e-06 [receive_attached]: 2.11998e-06 [after_resolve]: 1.466e-05 [a_after_grad]: 1.214e-05 [renormalize]: 0.00083251 [add_forward_monad_depend]: 8.1e-06 [auto_monad_grad]: 3.20998e-06 [auto_monad_eliminator]: 2.202e-05 [cse]: 3.843e-05 [a_3]: 6.539e-05 [Cycle 2]: 0.00086016, [45] [expand_dump_flag]: 1.71e-06 [switch_simplify]: 1.052e-05 [loop_unroll]: 7.58001e-06 [a_1]: 0.00020899 [with_stream_mark]: 1.772e-05 [recompute_prepare]: 8.70999e-06 [updatestate_depend_eliminate]: 4.15e-06 [updatestate_assign_eliminate]: 3.83999e-06 [updatestate_loads_eliminate]: 4.15e-06 [parameter_eliminate]: 1.91e-06 [a_2]: 9.566e-05 [accelerated_algorithm]: 7.95e-06 [shard]: 2.14999e-06 [meta_shard_fg_expand]: 2.41998e-06 [shard_inline]: 7.53e-06 [merge_send_recv]: 9.50001e-06 [auto_parallel]: 8.91002e-06 [parallel]: 7.11999e-06 [flash_sp]: 4.48999e-06 [merge_comm]: 4.60999e-06 [allreduce_fusion]: 4.35e-06 [matmul_add_comm_reduction]: 9.77001e-06 [allreduce_slice_to_reducescatter]: 5.3001e-07 [virtual_shard_identity]: 8.79e-06 [virtual_dataset]: 7.4e-06 [get_grad_eliminate_]: 8.03999e-06 [virtual_output]: 7.18e-06 [merge_forward]: 4.14002e-06 [cell_reuse_recompute_pass]: 2.36e-06 [offload_activation]: 1.195e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.581e-05 [merge_recompute_call_nodes]: 1.37e-06 [before_grad]: 1.303e-05 [set_forward_comm_id_for_comm_node_pass]: 5.35999e-06 [meta_fg_expand]: 3.02002e-06 [flash_sp_send_recv_attached]: 1.39e-06 [receive_attached]: 1.94999e-06 [after_resolve]: 1.375e-05 [a_after_grad]: 1.166e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.77001e-06 [auto_monad_grad]: 1.15001e-06 [auto_monad_eliminator]: 1.049e-05 [cse]: 2.554e-05 [a_3]: 4.63e-05 [py_interpret_to_execute_after_opt_a]: 1.647e-05 [slice_cell_reuse_recomputed_activation]: 2.64001e-06 [rewriter_after_opt_a]: 4.66e-05 [convert_after_rewriter]: 7.69002e-06 [order_py_execute_after_rewriter]: 5.67001e-06 [mutable_eliminate]: 0.00075019 [opt_b]: 0.00025969, [1] [Cycle 1]: 0.00025244, [7] [b_1]: 0.00015724 [b_2]: 1.113e-05 [updatestate_depend_eliminate]: 8.42e-06 [updatestate_assign_eliminate]: 3.26001e-06 [updatestate_loads_eliminate]: 3.30998e-06 [renormalize]: 8.00006e-07 [cse]: 3.231e-05 [optimize_parallel_all_gather_comm]: 2.012e-05 [overlap_param_gather]: 2.13002e-06 [cconv]: 3.2e-05 [loop_unroll]: 0.00048165 [opt_after_cconv]: 0.00012489, [1] [Cycle 1]: 0.00011807, [7] [c_1]: 3.893e-05 [parameter_eliminate]: 4.25999e-06 [updatestate_depend_eliminate]: 7.13e-06 [updatestate_assign_eliminate]: 3.15998e-06 [updatestate_loads_eliminate]: 3.11999e-06 [cse]: 2.656e-05 [renormalize]: 6.69999e-07 [remove_dup_value]: 1.593e-05 [tuple_transform]: 9.094e-05, [1] [Cycle 1]: 8.608e-05, [4] [d_1]: 5.544e-05 [none_parameter_eliminate]: 1.86e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 9.12001e-06 [partial_unused_args_eliminate]: 1.91e-06 [add_recomputation]: 5.987e-05 [cse_after_recomputation]: 2.892e-05, [1] [Cycle 1]: 2.379e-05, [1] [cse]: 1.776e-05 [environ_conv]: 6.87002e-06 [swap_dp_allreduce_reducescatter]: 6.94001e-06 [bias_add_comm_swap]: 2.68e-06 [label_micro_interleaved_index]: 5.05001e-06 [label_fine_grained_interleaved_index]: 2.66e-06 [merge_cast_opt]: 1.64e-06 [slice_recompute_activation]: 2.06e-06 [micro_interleaved_order_control]: 2.53e-06 [assign_add_opt]: 1.35999e-06 [ForceFp32Comm]: 1.27999e-06 [remove_cast_before_assign_add]: 1.07998e-06 [full_micro_interleaved_order_control]: 2.33002e-06 [reorder_send_recv_between_fp_bp]: 3.01001e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 1.37e-06 [interleave_split_concat_branches]: 1.35999e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.40999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.44999e-06 [control_data_broadcast_order]: 1.562e-05 [grouped_pairwise_exchange_alltoall]: 1.82001e-06 [offloading_packed_experts]: 4.90001e-06 [overlap_recompute_and_grad_model_parallel]: 5.92001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.39e-06 [overlap_recompute_allgather_and_fa_grad]: 1.72999e-06 [overlap_recompute_comm]: 2.43e-06 [overlap_grad_ring_attention]: 4.94e-06 [overlap_grad_flash_sp]: 2.438e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 2.50002e-06 [split_layernorm_comm]: 2.04e-06 [handle_group_info]: 9.89996e-07 [symbol_engine_optimizer]: 9.048e-05, [1] [Cycle 1]: 8.569e-05, [6] [build]: 4.33999e-06 [elim_shapecalc]: 1.471e-05 [elim_not_effective]: 1.571e-05 [opt_reshape]: 9.23002e-06 [fold_const_symbol]: 1.217e-05 [renormalize]: 1.90019e-07 [detach_backward]: 2.20002e-06 [pipeline_parallel_scheduler]: 1.77001e-06 [auto_monad_reorder]: 2.255e-05 [get_jit_bprop_graph]: 1.45001e-06 [rewriter_after_jit_bprop_graph]: 5.46e-06 [opt_after_jit_grad]: 0.00051049 [validate]: 4.774e-05 Sums bootstrap : 0.000415s : 3.23% type_inference : 0.006716s : 52.26% event_method : 0.000021s : 0.17% auto_monad : 0.000069s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000036s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000026s : 0.21% optimize.rewriter_before_opt_a : 0.000091s : 0.71% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.43% optimize.opt_a.loop_unroll : 0.000039s : 0.30% optimize.opt_a.a_1 : 0.000983s : 7.65% optimize.opt_a.with_stream_mark : 0.000036s : 0.28% optimize.opt_a.recompute_prepare : 0.000022s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000200s : 1.56% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.14% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.13% optimize.opt_a.merge_send_recv : 0.000020s : 0.16% optimize.opt_a.auto_parallel : 0.000017s : 0.14% optimize.opt_a.parallel : 0.000028s : 0.22% optimize.opt_a.flash_sp : 0.000016s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.07% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.17% optimize.opt_a.virtual_dataset : 0.000016s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.12% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000024s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000028s : 0.22% optimize.opt_a.a_after_grad : 0.000024s : 0.19% optimize.opt_a.renormalize : 0.000833s : 6.48% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.25% optimize.opt_a.cse : 0.000064s : 0.50% optimize.opt_a.a_3 : 0.000112s : 0.87% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.02% optimize.rewriter_after_opt_a : 0.000047s : 0.36% optimize.convert_after_rewriter : 0.000008s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.04% optimize.mutable_eliminate : 0.000750s : 5.84% optimize.opt_b.b_1 : 0.000157s : 1.22% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000032s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000032s : 0.25% optimize.loop_unroll : 0.000482s : 3.75% optimize.opt_after_cconv.c_1 : 0.000039s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000027s : 0.21% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000016s : 0.12% optimize.tuple_transform.d_1 : 0.000055s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000060s : 0.47% optimize.cse_after_recomputation.cse : 0.000018s : 0.14% optimize.environ_conv : 0.000007s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000024s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000023s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000510s : 3.97% validate : 0.000048s : 0.37% Time group info: ------[substitution.] 0.000234 38 11.62% : 0.000027s : 3: substitution.cast_eliminate 0.88% : 0.000002s : 3: substitution.elim_not_effective 0.71% : 0.000002s : 3: substitution.fold_const_symbol 3.14% : 0.000007s : 5: substitution.graph_param_transform 68.92% : 0.000161s : 4: substitution.inline 2.28% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.61% : 0.000006s : 6: substitution.remove_not_recompute_node 2.64% : 0.000006s : 4: substitution.replace_old_param 7.20% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006647 2 88.08% : 0.005855s : 1: type_inference.infer 11.92% : 0.000792s : 1: type_inference.specialize ------[replace.] 0.000067 8 58.56% : 0.000039s : 4: replace.inline 41.44% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000173 8 91.59% : 0.000159s : 4: match.inline 8.41% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000290 1596 0.82% : 0.000002s : 17: predicate.accumulaten_eliminater 0.68% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 10: predicate.addn_check_dump 9.04% : 0.000026s : 17: predicate.addn_zero_filter 0.92% : 0.000003s : 17: predicate.adjust_all_reduce_mul_add 1.96% : 0.000006s : 27: predicate.arithmetic_simplify 0.86% : 0.000002s : 17: predicate.cast_eliminate 0.53% : 0.000002s : 10: predicate.check_bprop_eliminate 0.50% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.56% : 0.000002s : 10: predicate.depend_value_elim 1.08% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 0.91% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.80% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.82% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 5: predicate.elim_not_effective 0.52% : 0.000002s : 5: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000003s : 22: predicate.environ_add_const_eliminate 0.99% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.01% : 0.000003s : 22: predicate.environ_get_depend_swap 1.66% : 0.000005s : 32: predicate.environ_get_eliminate 1.00% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.27% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.14% : 0.000006s : 25: predicate.float_depend_g_call 0.51% : 0.000001s : 10: predicate.float_environ_get_switch 0.77% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 5: predicate.fold_const_symbol 0.74% : 0.000002s : 10: predicate.get_grad_eliminate 0.16% : 0.000000s : 5: predicate.graph_param_transform 0.55% : 0.000002s : 10: predicate.incorporate_call 0.45% : 0.000001s : 10: predicate.incorporate_call_switch 5.55% : 0.000016s : 72: predicate.inline 0.69% : 0.000002s : 10: predicate.inline_without_move 0.27% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.80% : 0.000002s : 10: predicate.less_batch_normalization 1.61% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.26% : 0.000007s : 48: predicate.load_eliminater 0.99% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.72% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.55% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 10: predicate.merge_addn 0.51% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.88% : 0.000003s : 17: predicate.minmaximum_grad 0.86% : 0.000003s : 5: predicate.mutable_eliminate 0.41% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.66% : 0.000005s : 25: predicate.partial_defer_inline 1.54% : 0.000004s : 26: predicate.partial_eliminate 0.94% : 0.000003s : 17: predicate.print_const_string_wrapper 0.50% : 0.000001s : 10: predicate.reduce_all_const_elim 1.11% : 0.000003s : 17: predicate.reduce_eliminate 2.33% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.32% : 0.000001s : 10: predicate.remove_not_recompute_node 1.27% : 0.000004s : 31: predicate.replace_applicator 0.74% : 0.000002s : 10: predicate.replace_old_param 0.29% : 0.000001s : 5: predicate.reset_defer_inline 0.91% : 0.000003s : 17: predicate.reshape_eliminate 0.61% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 5: predicate.row_tensor_eliminate 0.81% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.70% : 0.000002s : 10: predicate.shard_identity_eliminate 0.70% : 0.000002s : 10: predicate.special_op_eliminate 0.64% : 0.000002s : 10: predicate.specialize_transform 0.95% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.91% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.35% : 0.000004s : 25: predicate.switch_defer_inline 1.81% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.45% : 0.000013s : 76: predicate.switch_simplify 0.87% : 0.000003s : 17: predicate.tile_eliminate 0.83% : 0.000002s : 17: predicate.transpose_eliminate 1.39% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.32% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.32% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.01% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.60% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.29% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 2.91% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 5: predicate.value_based_eliminate 0.68% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.31% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000608 11 55.38% : 0.000337s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.62% : 0.000272s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.029328 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.02% : 0.003524s : 1: add_attr 11.98% : 0.003512s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000064s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000075s : 1: auto_monad 0.09% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.51% : 0.000444s : 1: bootstrap 0.12% : 0.000036s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000032s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000010s : 1: environ_conv 0.10% : 0.000028s : 1: event_method 0.02% : 0.000006s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.67% : 0.000491s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.59% : 0.000758s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000021s : 1: opt.transform.mutable_eliminate 5.20% : 0.001525s : 78: opt.transform.opt_a 0.13% : 0.000037s : 1: opt.transform.opt_after_cconv 0.10% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.46% : 0.000134s : 28: opt.transform.opt_b 0.21% : 0.000062s : 2: opt.transform.opt_trans_graph 0.16% : 0.000048s : 4: opt.transform.symbol_engine_opt 11.42% : 0.003350s : 1: opt_a 0.44% : 0.000128s : 1: opt_after_cconv 1.77% : 0.000520s : 1: opt_after_jit_grad 0.90% : 0.000263s : 1: opt_b 19.96% : 0.005853s : 1: optimize 0.08% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.14% : 0.000040s : 1: pre_auto_parallel 0.10% : 0.000030s : 1: py_interpret_to_execute 0.07% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.62% : 0.000475s : 1: renormalize.infer 1.19% : 0.000348s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000051s : 1: rewriter_after_opt_a 0.33% : 0.000095s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000093s : 1: symbol_engine_optimizer 0.32% : 0.000094s : 1: tuple_transform 22.97% : 0.006738s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:41.427.110 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:41.427.389 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0157776, [21] [bootstrap]: 0.00035791 [type_inference]: 0.00555521 [event_method]: 1.954e-05 [auto_monad]: 5.53e-05 [graph_reusing]: 4.70001e-06 [inline]: 2.44999e-06 [add_attr]: 0.0030255, [1] [add_attr_with_inline]: 0.00301719, [1] [Cycle 1]: 6.274e-05, [2] [tag_attr]: 1.845e-05 [meta_addattr_fg_expand]: 5.89e-06 [parallel-infer-symbol]: 3.08998e-06 [pre_auto_parallel]: 3.25e-05 [insert-virtual-dataset]: 2.54999e-06 [parallel-infer-symbol-second]: 6.90023e-07 [dataset_repeat_opt]: 1.95001e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.00556711, [53] [py_interpret_to_execute]: 2.745e-05 [rewriter_before_opt_a]: 8.427e-05 [opt_a]: 0.00313128, [2] [Cycle 1]: 0.00219663, [45] [expand_dump_flag]: 2.91e-06 [switch_simplify]: 3.575e-05 [loop_unroll]: 3.073e-05 [a_1]: 0.0006953 [with_stream_mark]: 1.101e-05 [recompute_prepare]: 9.92001e-06 [updatestate_depend_eliminate]: 3.81999e-06 [updatestate_assign_eliminate]: 3.17002e-06 [updatestate_loads_eliminate]: 2.95998e-06 [parameter_eliminate]: 1.27e-06 [a_2]: 0.00012953 [accelerated_algorithm]: 9.91e-06 [shard]: 1.81998e-06 [meta_shard_fg_expand]: 1.82999e-06 [shard_inline]: 7.65998e-06 [merge_send_recv]: 9.31e-06 [auto_parallel]: 6.76e-06 [parallel]: 1.845e-05 [flash_sp]: 8.14002e-06 [merge_comm]: 4.90001e-06 [allreduce_fusion]: 4.3e-06 [matmul_add_comm_reduction]: 1.096e-05 [allreduce_slice_to_reducescatter]: 8.79983e-07 [virtual_shard_identity]: 9.24e-06 [virtual_dataset]: 8.17998e-06 [get_grad_eliminate_]: 7.65e-06 [virtual_output]: 8.02e-06 [merge_forward]: 3.36001e-06 [cell_reuse_recompute_pass]: 9.89996e-07 [offload_activation]: 1.165e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.645e-05 [merge_recompute_call_nodes]: 1.79998e-06 [before_grad]: 1.21e-05 [set_forward_comm_id_for_comm_node_pass]: 4.48999e-06 [meta_fg_expand]: 2.92002e-06 [flash_sp_send_recv_attached]: 2.91999e-06 [receive_attached]: 2.93e-06 [after_resolve]: 1.172e-05 [a_after_grad]: 1.196e-05 [renormalize]: 0.00058594 [add_forward_monad_depend]: 4.47998e-06 [auto_monad_grad]: 1.60001e-06 [auto_monad_eliminator]: 1.264e-05 [cse]: 2.031e-05 [a_3]: 6.716e-05 [Cycle 2]: 0.0009224, [45] [expand_dump_flag]: 1.00001e-06 [switch_simplify]: 8.49998e-06 [loop_unroll]: 7.21999e-06 [a_1]: 0.00017263 [with_stream_mark]: 1.083e-05 [recompute_prepare]: 8.12e-06 [updatestate_depend_eliminate]: 3.61001e-06 [updatestate_assign_eliminate]: 2.99001e-06 [updatestate_loads_eliminate]: 3.11001e-06 [parameter_eliminate]: 1.05999e-06 [a_2]: 0.00011861 [accelerated_algorithm]: 7.71999e-06 [shard]: 9.79984e-07 [meta_shard_fg_expand]: 1.85001e-06 [shard_inline]: 7.21999e-06 [merge_send_recv]: 6.43e-06 [auto_parallel]: 6.59999e-06 [parallel]: 4.28999e-06 [flash_sp]: 3.51999e-06 [merge_comm]: 4.48999e-06 [allreduce_fusion]: 4.28999e-06 [matmul_add_comm_reduction]: 6.94001e-06 [allreduce_slice_to_reducescatter]: 3.89991e-07 [virtual_shard_identity]: 8.32e-06 [virtual_dataset]: 7.11001e-06 [get_grad_eliminate_]: 6.93998e-06 [virtual_output]: 6.95002e-06 [merge_forward]: 3.28998e-06 [cell_reuse_recompute_pass]: 1.61998e-06 [offload_activation]: 7.4e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.673e-05 [merge_recompute_call_nodes]: 9.49978e-07 [before_grad]: 1.231e-05 [set_forward_comm_id_for_comm_node_pass]: 4.89998e-06 [meta_fg_expand]: 2.76e-06 [flash_sp_send_recv_attached]: 9.80013e-07 [receive_attached]: 9.89996e-07 [after_resolve]: 1.188e-05 [a_after_grad]: 1.206e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.02998e-06 [auto_monad_grad]: 1.00999e-06 [auto_monad_eliminator]: 8.50999e-06 [cse]: 1.747e-05 [a_3]: 5.893e-05 [py_interpret_to_execute_after_opt_a]: 1.283e-05 [slice_cell_reuse_recomputed_activation]: 4.99e-06 [rewriter_after_opt_a]: 3.942e-05 [convert_after_rewriter]: 9.96998e-06 [order_py_execute_after_rewriter]: 8.33999e-06 [mutable_eliminate]: 0.00053032 [opt_b]: 0.00031595, [1] [Cycle 1]: 0.00030624, [7] [b_1]: 0.00019946 [b_2]: 1.031e-05 [updatestate_depend_eliminate]: 5.91e-06 [updatestate_assign_eliminate]: 3.63999e-06 [updatestate_loads_eliminate]: 3.48999e-06 [renormalize]: 7.79983e-07 [cse]: 2.506e-05 [optimize_parallel_all_gather_comm]: 2.261e-05 [overlap_param_gather]: 5.02999e-06 [cconv]: 2.675e-05 [loop_unroll]: 0.00047228 [opt_after_cconv]: 0.00014906, [1] [Cycle 1]: 0.00013882, [7] [c_1]: 3.804e-05 [parameter_eliminate]: 3.54002e-06 [updatestate_depend_eliminate]: 6.22001e-06 [updatestate_assign_eliminate]: 3.46001e-06 [updatestate_loads_eliminate]: 3.29001e-06 [cse]: 2.611e-05 [renormalize]: 6.00005e-07 [remove_dup_value]: 1.656e-05 [tuple_transform]: 0.00010331, [1] [Cycle 1]: 9.607e-05, [4] [d_1]: 5.455e-05 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 8.49998e-06 [partial_unused_args_eliminate]: 4.47e-06 [add_recomputation]: 5.387e-05 [cse_after_recomputation]: 3.208e-05, [1] [Cycle 1]: 2.507e-05, [1] [cse]: 1.551e-05 [environ_conv]: 9.51e-06 [swap_dp_allreduce_reducescatter]: 8.89e-06 [bias_add_comm_swap]: 4.98001e-06 [label_micro_interleaved_index]: 7.2e-06 [label_fine_grained_interleaved_index]: 5.72999e-06 [merge_cast_opt]: 4.19002e-06 [slice_recompute_activation]: 4.77e-06 [micro_interleaved_order_control]: 5.10001e-06 [assign_add_opt]: 4.33999e-06 [ForceFp32Comm]: 3.68e-06 [remove_cast_before_assign_add]: 3.76999e-06 [full_micro_interleaved_order_control]: 5.15001e-06 [reorder_send_recv_between_fp_bp]: 5.49998e-06 [comm_op_add_attrs]: 3.41999e-06 [add_comm_op_reuse_tag]: 3.62998e-06 [interleave_split_concat_branches]: 3.55e-06 [interleave_parallel_branches]: 3.32002e-06 [overlap_opt_shard_in_pipeline]: 3.61001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.03999e-06 [control_data_broadcast_order]: 1.792e-05 [grouped_pairwise_exchange_alltoall]: 4.37e-06 [offloading_packed_experts]: 7.11001e-06 [overlap_recompute_and_grad_model_parallel]: 7.68999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.56001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.7e-06 [overlap_recompute_comm]: 5.17e-06 [overlap_grad_ring_attention]: 7.26999e-06 [overlap_grad_flash_sp]: 2.507e-05 [begin_end_overlap_inline]: 2.92002e-06 [split_matmul_comm_elemetwise]: 5.09998e-06 [split_layernorm_comm]: 4.06001e-06 [handle_group_info]: 3.56999e-06 [symbol_engine_optimizer]: 0.0001046, [1] [Cycle 1]: 9.802e-05, [6] [build]: 2.94001e-06 [elim_shapecalc]: 1.202e-05 [elim_not_effective]: 1.526e-05 [opt_reshape]: 8.76002e-06 [fold_const_symbol]: 1.242e-05 [renormalize]: 1.8999e-07 [detach_backward]: 3.65e-06 [pipeline_parallel_scheduler]: 1.82999e-06 [auto_monad_reorder]: 2.17e-05 [get_jit_bprop_graph]: 1.66e-06 [rewriter_after_jit_bprop_graph]: 4.63999e-06 [opt_after_jit_grad]: 0.00050806 [validate]: 3.903e-05 Sums bootstrap : 0.000358s : 3.25% type_inference : 0.005555s : 50.41% event_method : 0.000020s : 0.18% auto_monad : 0.000055s : 0.50% graph_reusing : 0.000005s : 0.04% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000032s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000027s : 0.25% optimize.rewriter_before_opt_a : 0.000084s : 0.76% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000044s : 0.40% optimize.opt_a.loop_unroll : 0.000038s : 0.34% optimize.opt_a.a_1 : 0.000868s : 7.88% optimize.opt_a.with_stream_mark : 0.000022s : 0.20% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000002s : 0.02% optimize.opt_a.a_2 : 0.000248s : 2.25% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.16% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.14% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.12% optimize.opt_a.parallel : 0.000023s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.16% optimize.opt_a.virtual_dataset : 0.000015s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.14% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000024s : 0.21% optimize.opt_a.a_after_grad : 0.000024s : 0.22% optimize.opt_a.renormalize : 0.000586s : 5.32% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.05% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.19% optimize.opt_a.cse : 0.000038s : 0.34% optimize.opt_a.a_3 : 0.000126s : 1.14% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000039s : 0.36% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000530s : 4.81% optimize.opt_b.b_1 : 0.000199s : 1.81% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000025s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000027s : 0.24% optimize.loop_unroll : 0.000472s : 4.29% optimize.opt_after_cconv.c_1 : 0.000038s : 0.35% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000026s : 0.24% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000017s : 0.15% optimize.tuple_transform.d_1 : 0.000055s : 0.50% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000054s : 0.49% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000010s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000025s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000022s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000508s : 4.61% validate : 0.000039s : 0.35% Time group info: ------[substitution.] 0.000171 38 12.43% : 0.000021s : 3: substitution.cast_eliminate 1.39% : 0.000002s : 3: substitution.elim_not_effective 0.93% : 0.000002s : 3: substitution.fold_const_symbol 3.70% : 0.000006s : 5: substitution.graph_param_transform 66.86% : 0.000115s : 4: substitution.inline 2.18% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.39% : 0.000006s : 6: substitution.remove_not_recompute_node 2.33% : 0.000004s : 4: substitution.replace_old_param 6.78% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005509 2 87.21% : 0.004804s : 1: type_inference.infer 12.79% : 0.000704s : 1: type_inference.specialize ------[replace.] 0.000059 8 56.90% : 0.000034s : 4: replace.inline 43.10% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000122 8 92.10% : 0.000112s : 4: match.inline 7.90% : 0.000010s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000250 1596 0.94% : 0.000002s : 17: predicate.accumulaten_eliminater 0.66% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.63% : 0.000002s : 10: predicate.addn_check_dump 0.97% : 0.000002s : 17: predicate.addn_zero_filter 0.87% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.96% : 0.000005s : 27: predicate.arithmetic_simplify 1.04% : 0.000003s : 17: predicate.cast_eliminate 0.62% : 0.000002s : 10: predicate.check_bprop_eliminate 0.63% : 0.000002s : 10: predicate.compare_switch_simplify 0.20% : 0.000001s : 5: predicate.const_output_eliminate 0.66% : 0.000002s : 10: predicate.depend_value_elim 0.98% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.13% : 0.000003s : 17: predicate.dict_get_item_eliminator 1.00% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.78% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 0.34% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.16% : 0.000003s : 22: predicate.environ_get_depend_swap 1.88% : 0.000005s : 32: predicate.environ_get_eliminate 1.18% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.44% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.06% : 0.000005s : 25: predicate.float_depend_g_call 0.56% : 0.000001s : 10: predicate.float_environ_get_switch 0.84% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.64% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000000s : 5: predicate.graph_param_transform 0.60% : 0.000001s : 10: predicate.incorporate_call 0.54% : 0.000001s : 10: predicate.incorporate_call_switch 6.09% : 0.000015s : 72: predicate.inline 1.07% : 0.000003s : 10: predicate.inline_without_move 0.34% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 10: predicate.less_batch_normalization 1.88% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.64% : 0.000007s : 48: predicate.load_eliminater 0.94% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.06% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.86% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.61% : 0.000002s : 10: predicate.merge_addn 0.57% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.88% : 0.000002s : 17: predicate.minmaximum_grad 0.83% : 0.000002s : 5: predicate.mutable_eliminate 0.35% : 0.000001s : 5: predicate.opt_reshape 0.35% : 0.000001s : 5: predicate.parallel_virtual_node 1.75% : 0.000004s : 25: predicate.partial_defer_inline 1.73% : 0.000004s : 26: predicate.partial_eliminate 0.93% : 0.000002s : 17: predicate.print_const_string_wrapper 0.70% : 0.000002s : 10: predicate.reduce_all_const_elim 1.17% : 0.000003s : 17: predicate.reduce_eliminate 2.57% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000001s : 10: predicate.remove_not_recompute_node 1.33% : 0.000003s : 31: predicate.replace_applicator 0.43% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 0.97% : 0.000002s : 17: predicate.reshape_eliminate 0.62% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 5: predicate.row_tensor_eliminate 0.65% : 0.000002s : 10: predicate.same_eliminate 0.42% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 10: predicate.shard_identity_eliminate 0.77% : 0.000002s : 10: predicate.special_op_eliminate 0.74% : 0.000002s : 10: predicate.specialize_transform 0.78% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.92% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.52% : 0.000004s : 25: predicate.switch_defer_inline 2.08% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.67% : 0.000012s : 76: predicate.switch_simplify 0.91% : 0.000002s : 17: predicate.tile_eliminate 0.92% : 0.000002s : 17: predicate.transpose_eliminate 1.63% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.72% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.06% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.58% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.35% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.85% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.54% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.30% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 5: predicate.value_based_eliminate 0.63% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.82% : 0.000002s : 10: predicate.virtual_output_eliminate 0.30% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000472 11 50.23% : 0.000237s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.77% : 0.000235s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026541 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.43% : 0.003035s : 1: add_attr 11.38% : 0.003021s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000058s : 1: add_recomputation 0.03% : 0.000007s : 1: assign_add_opt 0.25% : 0.000066s : 1: auto_monad 0.11% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.49% : 0.000394s : 1: bootstrap 0.11% : 0.000030s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000018s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.12% : 0.000031s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000011s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.81% : 0.000480s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.03% : 0.000538s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.07% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 5.17% : 0.001373s : 78: opt.transform.opt_a 0.14% : 0.000036s : 1: opt.transform.opt_after_cconv 0.11% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.51% : 0.000135s : 28: opt.transform.opt_b 0.23% : 0.000060s : 2: opt.transform.opt_trans_graph 0.17% : 0.000045s : 4: opt.transform.symbol_engine_opt 11.81% : 0.003135s : 1: opt_a 0.58% : 0.000153s : 1: opt_after_cconv 1.95% : 0.000518s : 1: opt_after_jit_grad 1.21% : 0.000320s : 1: opt_b 22.22% : 0.005896s : 1: optimize 0.10% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.11% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000040s : 1: pre_auto_parallel 0.12% : 0.000031s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000007s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.17% : 0.000311s : 1: renormalize.infer 1.01% : 0.000269s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000044s : 1: rewriter_after_opt_a 0.33% : 0.000088s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.05% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000107s : 1: symbol_engine_optimizer 0.40% : 0.000106s : 1: tuple_transform 21.08% : 0.005594s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:41.634.696 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0162846, [21] [bootstrap]: 0.00040714 [type_inference]: 0.00580824 [event_method]: 2.043e-05 [auto_monad]: 6.445e-05 [graph_reusing]: 6.68998e-06 [inline]: 3.24001e-06 [add_attr]: 0.00340012, [1] [add_attr_with_inline]: 0.00338942, [1] [Cycle 1]: 6.659e-05, [2] [tag_attr]: 2.352e-05 [meta_addattr_fg_expand]: 6.54999e-06 [parallel-infer-symbol]: 3.25e-06 [pre_auto_parallel]: 4.09e-05 [insert-virtual-dataset]: 2.95998e-06 [parallel-infer-symbol-second]: 7.00005e-07 [dataset_repeat_opt]: 1.91998e-06 [pipeline_split]: 1.92999e-06 [optimize]: 0.00566671, [53] [py_interpret_to_execute]: 2.55e-05 [rewriter_before_opt_a]: 8.936e-05 [opt_a]: 0.00332966, [2] [Cycle 1]: 0.00247754, [45] [expand_dump_flag]: 2.52001e-06 [switch_simplify]: 4.468e-05 [loop_unroll]: 3.165e-05 [a_1]: 0.00077614 [with_stream_mark]: 2.075e-05 [recompute_prepare]: 1.204e-05 [updatestate_depend_eliminate]: 5.20999e-06 [updatestate_assign_eliminate]: 3.86999e-06 [updatestate_loads_eliminate]: 3.63999e-06 [parameter_eliminate]: 1.99e-06 [a_2]: 0.00010162 [accelerated_algorithm]: 9.31e-06 [shard]: 2.09e-06 [meta_shard_fg_expand]: 2.78998e-06 [shard_inline]: 9.12001e-06 [merge_send_recv]: 9.81e-06 [auto_parallel]: 9.55001e-06 [parallel]: 1.994e-05 [flash_sp]: 1.018e-05 [merge_comm]: 5.06002e-06 [allreduce_fusion]: 4.50999e-06 [matmul_add_comm_reduction]: 1.08e-05 [allreduce_slice_to_reducescatter]: 9.80013e-07 [virtual_shard_identity]: 1.076e-05 [virtual_dataset]: 7.7e-06 [get_grad_eliminate_]: 8.03001e-06 [virtual_output]: 7.8e-06 [merge_forward]: 4.38001e-06 [cell_reuse_recompute_pass]: 1.70001e-06 [offload_activation]: 1.146e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.654e-05 [merge_recompute_call_nodes]: 1.84e-06 [before_grad]: 1.363e-05 [set_forward_comm_id_for_comm_node_pass]: 4.51002e-06 [meta_fg_expand]: 3.28998e-06 [flash_sp_send_recv_attached]: 2.81e-06 [receive_attached]: 3.04999e-06 [after_resolve]: 1.382e-05 [a_after_grad]: 1.206e-05 [renormalize]: 0.00085944 [add_forward_monad_depend]: 7.36999e-06 [auto_monad_grad]: 2.93e-06 [auto_monad_eliminator]: 2.002e-05 [cse]: 3.706e-05 [a_3]: 6.495e-05 [Cycle 2]: 0.00083936, [45] [expand_dump_flag]: 1.84e-06 [switch_simplify]: 1.015e-05 [loop_unroll]: 7.76001e-06 [a_1]: 0.00019477 [with_stream_mark]: 1.638e-05 [recompute_prepare]: 7.90998e-06 [updatestate_depend_eliminate]: 3.7e-06 [updatestate_assign_eliminate]: 3.26001e-06 [updatestate_loads_eliminate]: 3.13e-06 [parameter_eliminate]: 1.87001e-06 [a_2]: 9.29e-05 [accelerated_algorithm]: 8.27e-06 [shard]: 1.55999e-06 [meta_shard_fg_expand]: 2.17999e-06 [shard_inline]: 7.97003e-06 [merge_send_recv]: 8.42e-06 [auto_parallel]: 8.47998e-06 [parallel]: 8.28999e-06 [flash_sp]: 4.03001e-06 [merge_comm]: 4.18001e-06 [allreduce_fusion]: 3.97e-06 [matmul_add_comm_reduction]: 8.45999e-06 [allreduce_slice_to_reducescatter]: 7.29982e-07 [virtual_shard_identity]: 8.86002e-06 [virtual_dataset]: 7.9e-06 [get_grad_eliminate_]: 7.14001e-06 [virtual_output]: 7.66001e-06 [merge_forward]: 4.27e-06 [cell_reuse_recompute_pass]: 2.51e-06 [offload_activation]: 1.13e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.656e-05 [merge_recompute_call_nodes]: 1.40999e-06 [before_grad]: 1.493e-05 [set_forward_comm_id_for_comm_node_pass]: 5.79999e-06 [meta_fg_expand]: 3.03e-06 [flash_sp_send_recv_attached]: 1.66e-06 [receive_attached]: 1.92999e-06 [after_resolve]: 1.455e-05 [a_after_grad]: 1.225e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.22001e-06 [auto_monad_grad]: 2.04e-06 [auto_monad_eliminator]: 1.125e-05 [cse]: 2.327e-05 [a_3]: 4.724e-05 [py_interpret_to_execute_after_opt_a]: 1.497e-05 [slice_cell_reuse_recomputed_activation]: 2.59999e-06 [rewriter_after_opt_a]: 4.725e-05 [convert_after_rewriter]: 7.66001e-06 [order_py_execute_after_rewriter]: 6.01e-06 [mutable_eliminate]: 0.00062928 [opt_b]: 0.00025139, [1] [Cycle 1]: 0.00024471, [7] [b_1]: 0.00015561 [b_2]: 9.61003e-06 [updatestate_depend_eliminate]: 8.10999e-06 [updatestate_assign_eliminate]: 3.25998e-06 [updatestate_loads_eliminate]: 3.09001e-06 [renormalize]: 1.00001e-06 [cse]: 2.687e-05 [optimize_parallel_all_gather_comm]: 1.974e-05 [overlap_param_gather]: 1.94e-06 [cconv]: 3.006e-05 [loop_unroll]: 0.00046469 [opt_after_cconv]: 0.00011743, [1] [Cycle 1]: 0.00011083, [7] [c_1]: 3.744e-05 [parameter_eliminate]: 2.95002e-06 [updatestate_depend_eliminate]: 6.23e-06 [updatestate_assign_eliminate]: 3.06001e-06 [updatestate_loads_eliminate]: 2.79999e-06 [cse]: 2.265e-05 [renormalize]: 4.90021e-07 [remove_dup_value]: 1.449e-05 [tuple_transform]: 8.698e-05, [1] [Cycle 1]: 8.239e-05, [4] [d_1]: 5.394e-05 [none_parameter_eliminate]: 1.70001e-06 [renormalize]: 1.39989e-07 [switch_simplify]: 8.00999e-06 [partial_unused_args_eliminate]: 1.70001e-06 [add_recomputation]: 5.908e-05 [cse_after_recomputation]: 2.548e-05, [1] [Cycle 1]: 2.04e-05, [1] [cse]: 1.473e-05 [environ_conv]: 6.70002e-06 [swap_dp_allreduce_reducescatter]: 6.16e-06 [bias_add_comm_swap]: 2.93e-06 [label_micro_interleaved_index]: 5.27001e-06 [label_fine_grained_interleaved_index]: 3.11001e-06 [merge_cast_opt]: 1.84e-06 [slice_recompute_activation]: 2.52001e-06 [micro_interleaved_order_control]: 2.72001e-06 [assign_add_opt]: 1.34e-06 [ForceFp32Comm]: 1.27e-06 [remove_cast_before_assign_add]: 9.80013e-07 [full_micro_interleaved_order_control]: 2.34001e-06 [reorder_send_recv_between_fp_bp]: 3.06001e-06 [comm_op_add_attrs]: 1.09e-06 [add_comm_op_reuse_tag]: 1.25001e-06 [interleave_split_concat_branches]: 1.39998e-06 [interleave_parallel_branches]: 1.09003e-06 [overlap_opt_shard_in_pipeline]: 1.15999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.87999e-06 [control_data_broadcast_order]: 1.707e-05 [grouped_pairwise_exchange_alltoall]: 1.73002e-06 [offloading_packed_experts]: 4.55999e-06 [overlap_recompute_and_grad_model_parallel]: 5.77001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.36002e-06 [overlap_recompute_comm]: 2.27999e-06 [overlap_grad_ring_attention]: 4.89998e-06 [overlap_grad_flash_sp]: 2.41e-05 [begin_end_overlap_inline]: 6.89994e-07 [split_matmul_comm_elemetwise]: 2.31998e-06 [split_layernorm_comm]: 1.67999e-06 [handle_group_info]: 1.24e-06 [symbol_engine_optimizer]: 9.501e-05, [1] [Cycle 1]: 8.944e-05, [6] [build]: 4.02e-06 [elim_shapecalc]: 1.226e-05 [elim_not_effective]: 1.748e-05 [opt_reshape]: 8.98002e-06 [fold_const_symbol]: 1.421e-05 [renormalize]: 5.29981e-07 [detach_backward]: 2.41e-06 [pipeline_parallel_scheduler]: 1.72001e-06 [auto_monad_reorder]: 2.437e-05 [get_jit_bprop_graph]: 2.36e-06 [rewriter_after_jit_bprop_graph]: 5.35999e-06 [opt_after_jit_grad]: 0.00060649 [validate]: 5.086e-05 Sums bootstrap : 0.000407s : 3.43% type_inference : 0.005808s : 48.94% event_method : 0.000020s : 0.17% auto_monad : 0.000064s : 0.54% graph_reusing : 0.000007s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000041s : 0.34% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000025s : 0.21% optimize.rewriter_before_opt_a : 0.000089s : 0.75% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.46% optimize.opt_a.loop_unroll : 0.000039s : 0.33% optimize.opt_a.a_1 : 0.000971s : 8.18% optimize.opt_a.with_stream_mark : 0.000037s : 0.31% optimize.opt_a.recompute_prepare : 0.000020s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000195s : 1.64% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.15% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000017s : 0.14% optimize.opt_a.merge_send_recv : 0.000018s : 0.15% optimize.opt_a.auto_parallel : 0.000018s : 0.15% optimize.opt_a.parallel : 0.000028s : 0.24% optimize.opt_a.flash_sp : 0.000014s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.17% optimize.opt_a.virtual_dataset : 0.000016s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.04% optimize.opt_a.offload_activation : 0.000023s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000029s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000028s : 0.24% optimize.opt_a.a_after_grad : 0.000024s : 0.20% optimize.opt_a.renormalize : 0.000860s : 7.24% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.08% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.26% optimize.opt_a.cse : 0.000060s : 0.51% optimize.opt_a.a_3 : 0.000112s : 0.95% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.02% optimize.rewriter_after_opt_a : 0.000047s : 0.40% optimize.convert_after_rewriter : 0.000008s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000629s : 5.30% optimize.opt_b.b_1 : 0.000156s : 1.31% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000027s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000030s : 0.25% optimize.loop_unroll : 0.000465s : 3.92% optimize.opt_after_cconv.c_1 : 0.000037s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000023s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.12% optimize.tuple_transform.d_1 : 0.000054s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000059s : 0.50% optimize.cse_after_recomputation.cse : 0.000015s : 0.12% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.02% optimize.slice_recompute_activation : 0.000003s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000017s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000024s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.21% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000606s : 5.11% validate : 0.000051s : 0.43% Time group info: ------[substitution.] 0.000232 38 10.81% : 0.000025s : 3: substitution.cast_eliminate 0.95% : 0.000002s : 3: substitution.elim_not_effective 0.79% : 0.000002s : 3: substitution.fold_const_symbol 2.98% : 0.000007s : 5: substitution.graph_param_transform 70.41% : 0.000163s : 4: substitution.inline 2.23% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.83% : 0.000007s : 6: substitution.remove_not_recompute_node 2.41% : 0.000006s : 4: substitution.replace_old_param 6.59% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005746 2 87.65% : 0.005037s : 1: type_inference.infer 12.35% : 0.000710s : 1: type_inference.specialize ------[replace.] 0.000070 8 59.10% : 0.000041s : 4: replace.inline 40.90% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000173 8 92.54% : 0.000160s : 4: match.inline 7.46% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000263 1596 0.98% : 0.000003s : 17: predicate.accumulaten_eliminater 0.77% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 10: predicate.addn_check_dump 1.11% : 0.000003s : 17: predicate.addn_zero_filter 0.85% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.09% : 0.000006s : 27: predicate.arithmetic_simplify 1.02% : 0.000003s : 17: predicate.cast_eliminate 0.61% : 0.000002s : 10: predicate.check_bprop_eliminate 0.56% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000001s : 5: predicate.const_output_eliminate 0.56% : 0.000001s : 10: predicate.depend_value_elim 0.98% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.10% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.37% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.27% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.22% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_depend_swap 1.70% : 0.000004s : 32: predicate.environ_get_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.40% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.50% : 0.000007s : 25: predicate.float_depend_g_call 0.54% : 0.000001s : 10: predicate.float_environ_get_switch 0.87% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.63% : 0.000002s : 10: predicate.get_grad_eliminate 0.23% : 0.000001s : 5: predicate.graph_param_transform 0.60% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.41% : 0.000017s : 72: predicate.inline 0.76% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.80% : 0.000002s : 10: predicate.less_batch_normalization 1.85% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.53% : 0.000007s : 48: predicate.load_eliminater 0.85% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.05% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.61% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 10: predicate.merge_addn 0.55% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.62% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 17: predicate.minmaximum_grad 0.87% : 0.000002s : 5: predicate.mutable_eliminate 0.44% : 0.000001s : 5: predicate.opt_reshape 0.38% : 0.000001s : 5: predicate.parallel_virtual_node 1.68% : 0.000004s : 25: predicate.partial_defer_inline 1.67% : 0.000004s : 26: predicate.partial_eliminate 1.07% : 0.000003s : 17: predicate.print_const_string_wrapper 0.53% : 0.000001s : 10: predicate.reduce_all_const_elim 1.32% : 0.000003s : 17: predicate.reduce_eliminate 2.57% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 10: predicate.remove_not_recompute_node 1.48% : 0.000004s : 31: predicate.replace_applicator 0.49% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 0.98% : 0.000003s : 17: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 5: predicate.row_tensor_eliminate 0.79% : 0.000002s : 10: predicate.same_eliminate 0.35% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.70% : 0.000002s : 10: predicate.shard_identity_eliminate 0.72% : 0.000002s : 10: predicate.special_op_eliminate 0.68% : 0.000002s : 10: predicate.specialize_transform 0.92% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.47% : 0.000004s : 25: predicate.switch_defer_inline 2.03% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.97% : 0.000013s : 76: predicate.switch_simplify 0.97% : 0.000003s : 17: predicate.tile_eliminate 1.03% : 0.000003s : 17: predicate.transpose_eliminate 1.55% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.30% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.59% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.47% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.64% : 0.000004s : 31: predicate.tuple_to_list_eliminator_ 2.42% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.17% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 5: predicate.value_based_eliminate 0.62% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000536 11 50.22% : 0.000269s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.78% : 0.000267s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027914 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.20% : 0.003407s : 1: add_attr 12.16% : 0.003393s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000064s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000071s : 1: auto_monad 0.10% : 0.000029s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.56% : 0.000436s : 1: bootstrap 0.12% : 0.000034s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000021s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.10% : 0.000028s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.10% : 0.000027s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.69% : 0.000473s : 1: loop_unroll 0.02% : 0.000005s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 2.29% : 0.000638s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 5.40% : 0.001509s : 78: opt.transform.opt_a 0.13% : 0.000036s : 1: opt.transform.opt_after_cconv 0.13% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.47% : 0.000132s : 28: opt.transform.opt_b 0.21% : 0.000060s : 2: opt.transform.opt_trans_graph 0.18% : 0.000049s : 4: opt.transform.symbol_engine_opt 11.94% : 0.003333s : 1: opt_a 0.44% : 0.000122s : 1: opt_after_cconv 2.22% : 0.000618s : 1: opt_after_jit_grad 0.91% : 0.000255s : 1: opt_b 20.32% : 0.005672s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.16% : 0.000045s : 1: pre_auto_parallel 0.11% : 0.000030s : 1: py_interpret_to_execute 0.07% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000018s : 1: remove_dup_value 1.74% : 0.000487s : 1: renormalize.infer 1.30% : 0.000362s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000052s : 1: rewriter_after_opt_a 0.34% : 0.000095s : 1: rewriter_before_opt_a 0.02% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000098s : 1: symbol_engine_optimizer 0.32% : 0.000090s : 1: tuple_transform 20.87% : 0.005826s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:41.871.698 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:41.872.021 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0172413, [21] [bootstrap]: 0.00039859 [type_inference]: 0.00589095 [event_method]: 1.933e-05 [auto_monad]: 6.594e-05 [graph_reusing]: 5.62999e-06 [inline]: 1.90001e-06 [add_attr]: 0.00341019, [1] [add_attr_with_inline]: 0.00340133, [1] [Cycle 1]: 7.142e-05, [2] [tag_attr]: 2.145e-05 [meta_addattr_fg_expand]: 6.32001e-06 [parallel-infer-symbol]: 3.46999e-06 [pre_auto_parallel]: 3.355e-05 [insert-virtual-dataset]: 2.63e-06 [parallel-infer-symbol-second]: 6.80011e-07 [dataset_repeat_opt]: 2.37001e-06 [pipeline_split]: 1.54e-06 [optimize]: 0.00617356, [53] [py_interpret_to_execute]: 2.926e-05 [rewriter_before_opt_a]: 9.061e-05 [opt_a]: 0.00361024, [2] [Cycle 1]: 0.00255758, [45] [expand_dump_flag]: 1.85001e-06 [switch_simplify]: 3.968e-05 [loop_unroll]: 3.225e-05 [a_1]: 0.00071918 [with_stream_mark]: 1.599e-05 [recompute_prepare]: 1.17e-05 [updatestate_depend_eliminate]: 5.17e-06 [updatestate_assign_eliminate]: 4.27e-06 [updatestate_loads_eliminate]: 3.95e-06 [parameter_eliminate]: 2.34001e-06 [a_2]: 0.0001455 [accelerated_algorithm]: 9.61998e-06 [shard]: 1.67999e-06 [meta_shard_fg_expand]: 2.63998e-06 [shard_inline]: 8.89e-06 [merge_send_recv]: 7.75998e-06 [auto_parallel]: 9.14e-06 [parallel]: 1.769e-05 [flash_sp]: 9.10001e-06 [merge_comm]: 5.35001e-06 [allreduce_fusion]: 4.69998e-06 [matmul_add_comm_reduction]: 9.87001e-06 [allreduce_slice_to_reducescatter]: 4.2998e-07 [virtual_shard_identity]: 1.035e-05 [virtual_dataset]: 8.97e-06 [get_grad_eliminate_]: 9.32001e-06 [virtual_output]: 9.19998e-06 [merge_forward]: 5.14e-06 [cell_reuse_recompute_pass]: 1.07e-06 [offload_activation]: 1.016e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.928e-05 [merge_recompute_call_nodes]: 1.15999e-06 [before_grad]: 1.641e-05 [set_forward_comm_id_for_comm_node_pass]: 4.99998e-06 [meta_fg_expand]: 3.93999e-06 [flash_sp_send_recv_attached]: 2.83e-06 [receive_attached]: 2.36e-06 [after_resolve]: 1.361e-05 [a_after_grad]: 1.387e-05 [renormalize]: 0.00083886 [add_forward_monad_depend]: 5.29e-06 [auto_monad_grad]: 2.22001e-06 [auto_monad_eliminator]: 1.669e-05 [cse]: 2.813e-05 [a_3]: 8.062e-05 [Cycle 2]: 0.00103887, [45] [expand_dump_flag]: 1.36998e-06 [switch_simplify]: 9.81e-06 [loop_unroll]: 8.70999e-06 [a_1]: 0.00021267 [with_stream_mark]: 1.384e-05 [recompute_prepare]: 8.79e-06 [updatestate_depend_eliminate]: 4.67e-06 [updatestate_assign_eliminate]: 3.71999e-06 [updatestate_loads_eliminate]: 3.7e-06 [parameter_eliminate]: 1.00999e-06 [a_2]: 0.00013739 [accelerated_algorithm]: 8.80001e-06 [shard]: 1.38002e-06 [meta_shard_fg_expand]: 2.29001e-06 [shard_inline]: 8.94998e-06 [merge_send_recv]: 7.56001e-06 [auto_parallel]: 8.17e-06 [parallel]: 5.69e-06 [flash_sp]: 3.11001e-06 [merge_comm]: 5.04998e-06 [allreduce_fusion]: 4.85999e-06 [matmul_add_comm_reduction]: 8.94003e-06 [allreduce_slice_to_reducescatter]: 3.19997e-07 [virtual_shard_identity]: 9.20999e-06 [virtual_dataset]: 8.67998e-06 [get_grad_eliminate_]: 9.02e-06 [virtual_output]: 8.36002e-06 [merge_forward]: 4.50999e-06 [cell_reuse_recompute_pass]: 1.87001e-06 [offload_activation]: 9.79e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.851e-05 [merge_recompute_call_nodes]: 9.10019e-07 [before_grad]: 1.46e-05 [set_forward_comm_id_for_comm_node_pass]: 4.87e-06 [meta_fg_expand]: 3.38e-06 [flash_sp_send_recv_attached]: 1.10001e-06 [receive_attached]: 1.69998e-06 [after_resolve]: 1.319e-05 [a_after_grad]: 1.299e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.34e-06 [auto_monad_grad]: 1.90001e-06 [auto_monad_eliminator]: 1.057e-05 [cse]: 2.225e-05 [a_3]: 6.791e-05 [py_interpret_to_execute_after_opt_a]: 1.605e-05 [slice_cell_reuse_recomputed_activation]: 4.39002e-06 [rewriter_after_opt_a]: 5.012e-05 [convert_after_rewriter]: 1.173e-05 [order_py_execute_after_rewriter]: 9.28002e-06 [mutable_eliminate]: 0.00058661 [opt_b]: 0.00034941, [1] [Cycle 1]: 0.00033999, [7] [b_1]: 0.00022822 [b_2]: 1.089e-05 [updatestate_depend_eliminate]: 7.51999e-06 [updatestate_assign_eliminate]: 3.85e-06 [updatestate_loads_eliminate]: 3.89002e-06 [renormalize]: 4.39992e-07 [cse]: 2.942e-05 [optimize_parallel_all_gather_comm]: 2.089e-05 [overlap_param_gather]: 5.04e-06 [cconv]: 2.513e-05 [loop_unroll]: 0.0004724 [opt_after_cconv]: 0.00015296, [1] [Cycle 1]: 0.00014427, [7] [c_1]: 4.303e-05 [parameter_eliminate]: 2.59001e-06 [updatestate_depend_eliminate]: 7.44002e-06 [updatestate_assign_eliminate]: 3.83999e-06 [updatestate_loads_eliminate]: 3.78001e-06 [cse]: 2.746e-05 [renormalize]: 3.7998e-07 [remove_dup_value]: 3.515e-05 [tuple_transform]: 0.00010949, [1] [Cycle 1]: 0.00010198, [4] [d_1]: 5.942e-05 [none_parameter_eliminate]: 1.88002e-06 [renormalize]: 2.29978e-07 [switch_simplify]: 1.004e-05 [partial_unused_args_eliminate]: 4.55999e-06 [add_recomputation]: 5.495e-05 [cse_after_recomputation]: 3.419e-05, [1] [Cycle 1]: 2.732e-05, [1] [cse]: 1.788e-05 [environ_conv]: 8.56002e-06 [swap_dp_allreduce_reducescatter]: 9.03002e-06 [bias_add_comm_swap]: 4.53001e-06 [label_micro_interleaved_index]: 6.34001e-06 [label_fine_grained_interleaved_index]: 4.52e-06 [merge_cast_opt]: 3.37002e-06 [slice_recompute_activation]: 4.07e-06 [micro_interleaved_order_control]: 4.48999e-06 [assign_add_opt]: 3.28998e-06 [ForceFp32Comm]: 3.48999e-06 [remove_cast_before_assign_add]: 3.09001e-06 [full_micro_interleaved_order_control]: 4.61002e-06 [reorder_send_recv_between_fp_bp]: 4.67e-06 [comm_op_add_attrs]: 3.32002e-06 [add_comm_op_reuse_tag]: 3.06999e-06 [interleave_split_concat_branches]: 3.5e-06 [interleave_parallel_branches]: 3.39001e-06 [overlap_opt_shard_in_pipeline]: 3.2e-06 [overlap_opt_shard_grad_in_pipeline]: 3.56001e-06 [control_data_broadcast_order]: 1.805e-05 [grouped_pairwise_exchange_alltoall]: 3.48999e-06 [offloading_packed_experts]: 7.13e-06 [overlap_recompute_and_grad_model_parallel]: 7.68001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.75e-06 [overlap_recompute_allgather_and_fa_grad]: 3.51999e-06 [overlap_recompute_comm]: 5.05001e-06 [overlap_grad_ring_attention]: 6.94001e-06 [overlap_grad_flash_sp]: 2.323e-05 [begin_end_overlap_inline]: 2.83e-06 [split_matmul_comm_elemetwise]: 3.58999e-06 [split_layernorm_comm]: 3.78999e-06 [handle_group_info]: 3.3e-06 [symbol_engine_optimizer]: 0.00011194, [1] [Cycle 1]: 0.00010469, [6] [build]: 3.61999e-06 [elim_shapecalc]: 1.328e-05 [elim_not_effective]: 1.689e-05 [opt_reshape]: 9.89999e-06 [fold_const_symbol]: 1.44e-05 [renormalize]: 2.10013e-07 [detach_backward]: 3.23998e-06 [pipeline_parallel_scheduler]: 1.46002e-06 [auto_monad_reorder]: 1.921e-05 [get_jit_bprop_graph]: 1.21002e-06 [rewriter_after_jit_bprop_graph]: 4.33001e-06 [opt_after_jit_grad]: 0.00054174 [validate]: 4.003e-05 Sums bootstrap : 0.000399s : 3.31% type_inference : 0.005891s : 48.89% event_method : 0.000019s : 0.16% auto_monad : 0.000066s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.28% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.24% optimize.rewriter_before_opt_a : 0.000091s : 0.75% optimize.opt_a.expand_dump_flag : 0.000003s : 0.03% optimize.opt_a.switch_simplify : 0.000049s : 0.41% optimize.opt_a.loop_unroll : 0.000041s : 0.34% optimize.opt_a.a_1 : 0.000932s : 7.73% optimize.opt_a.with_stream_mark : 0.000030s : 0.25% optimize.opt_a.recompute_prepare : 0.000020s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000283s : 2.35% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000018s : 0.15% optimize.opt_a.merge_send_recv : 0.000015s : 0.13% optimize.opt_a.auto_parallel : 0.000017s : 0.14% optimize.opt_a.parallel : 0.000023s : 0.19% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.09% optimize.opt_a.allreduce_fusion : 0.000010s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.16% optimize.opt_a.virtual_dataset : 0.000018s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.15% optimize.opt_a.virtual_output : 0.000018s : 0.15% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000031s : 0.26% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.22% optimize.opt_a.a_after_grad : 0.000027s : 0.22% optimize.opt_a.renormalize : 0.000839s : 6.96% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.23% optimize.opt_a.cse : 0.000050s : 0.42% optimize.opt_a.a_3 : 0.000149s : 1.23% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.04% optimize.rewriter_after_opt_a : 0.000050s : 0.42% optimize.convert_after_rewriter : 0.000012s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000587s : 4.87% optimize.opt_b.b_1 : 0.000228s : 1.89% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000029s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000025s : 0.21% optimize.loop_unroll : 0.000472s : 3.92% optimize.opt_after_cconv.c_1 : 0.000043s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000027s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000035s : 0.29% optimize.tuple_transform.d_1 : 0.000059s : 0.49% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000055s : 0.46% optimize.cse_after_recomputation.cse : 0.000018s : 0.15% optimize.environ_conv : 0.000009s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000006s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000003s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.03% optimize.micro_interleaved_order_control : 0.000004s : 0.04% optimize.assign_add_opt : 0.000003s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000018s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000003s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000023s : 0.19% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.03% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000019s : 0.16% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000542s : 4.50% validate : 0.000040s : 0.33% Time group info: ------[substitution.] 0.000223 48 14.05% : 0.000031s : 6: substitution.cast_eliminate 1.14% : 0.000003s : 4: substitution.elim_not_effective 1.02% : 0.000002s : 4: substitution.fold_const_symbol 2.96% : 0.000007s : 6: substitution.graph_param_transform 66.96% : 0.000149s : 4: substitution.inline 2.76% : 0.000006s : 8: substitution.j_node_and_user_rematch 3.21% : 0.000007s : 8: substitution.remove_not_recompute_node 2.06% : 0.000005s : 4: substitution.replace_old_param 5.84% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005845 2 88.17% : 0.005153s : 1: type_inference.infer 11.83% : 0.000691s : 1: type_inference.specialize ------[replace.] 0.000063 8 62.79% : 0.000040s : 4: replace.inline 37.21% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000157 8 92.94% : 0.000146s : 4: match.inline 7.06% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000275 1730 0.91% : 0.000002s : 17: predicate.accumulaten_eliminater 0.71% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.59% : 0.000002s : 12: predicate.addn_check_dump 0.89% : 0.000002s : 17: predicate.addn_zero_filter 0.87% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.22% : 0.000006s : 29: predicate.arithmetic_simplify 1.09% : 0.000003s : 17: predicate.cast_eliminate 0.76% : 0.000002s : 12: predicate.check_bprop_eliminate 0.62% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.67% : 0.000002s : 12: predicate.depend_value_elim 0.92% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.01% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.96% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 6: predicate.elim_not_effective 0.43% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 23: predicate.environ_get_depend_swap 1.82% : 0.000005s : 35: predicate.environ_get_eliminate 1.09% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.30% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.05% : 0.000006s : 25: predicate.float_depend_g_call 0.59% : 0.000002s : 12: predicate.float_environ_get_switch 0.93% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.92% : 0.000003s : 12: predicate.get_grad_eliminate 0.22% : 0.000001s : 6: predicate.graph_param_transform 0.67% : 0.000002s : 12: predicate.incorporate_call 0.60% : 0.000002s : 12: predicate.incorporate_call_switch 6.22% : 0.000017s : 78: predicate.inline 0.90% : 0.000002s : 12: predicate.inline_without_move 0.35% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.87% : 0.000002s : 12: predicate.less_batch_normalization 1.84% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.55% : 0.000007s : 50: predicate.load_eliminater 0.76% : 0.000002s : 6: predicate.loop_unroll_after_grad 1.98% : 0.000005s : 38: predicate.loop_unroll_before_grad 1.65% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.67% : 0.000002s : 12: predicate.merge_addn 0.75% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.64% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 17: predicate.minmaximum_grad 0.96% : 0.000003s : 6: predicate.mutable_eliminate 0.44% : 0.000001s : 6: predicate.opt_reshape 0.52% : 0.000001s : 6: predicate.parallel_virtual_node 1.57% : 0.000004s : 25: predicate.partial_defer_inline 1.59% : 0.000004s : 27: predicate.partial_eliminate 0.86% : 0.000002s : 17: predicate.print_const_string_wrapper 0.66% : 0.000002s : 12: predicate.reduce_all_const_elim 1.10% : 0.000003s : 17: predicate.reduce_eliminate 2.52% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 12: predicate.remove_not_recompute_node 1.33% : 0.000004s : 33: predicate.replace_applicator 0.45% : 0.000001s : 12: predicate.replace_old_param 0.28% : 0.000001s : 6: predicate.reset_defer_inline 0.96% : 0.000003s : 17: predicate.reshape_eliminate 0.78% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 6: predicate.row_tensor_eliminate 0.91% : 0.000003s : 12: predicate.same_eliminate 0.44% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 12: predicate.shard_identity_eliminate 0.80% : 0.000002s : 12: predicate.special_op_eliminate 0.86% : 0.000002s : 12: predicate.specialize_transform 0.87% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.38% : 0.000004s : 25: predicate.switch_defer_inline 2.00% : 0.000005s : 37: predicate.switch_layer_defer_inline 4.55% : 0.000012s : 81: predicate.switch_simplify 0.88% : 0.000002s : 17: predicate.tile_eliminate 1.07% : 0.000003s : 17: predicate.transpose_eliminate 1.76% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 2.89% : 0.000008s : 45: predicate.tuple_list_get_item_eliminator 1.57% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.41% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.73% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.44% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.21% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 6: predicate.value_based_eliminate 0.68% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.66% : 0.000002s : 12: predicate.virtual_output_eliminate 0.34% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.51% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000531 11 52.87% : 0.000281s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.13% : 0.000250s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.029460 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.61% : 0.003420s : 1: add_attr 11.56% : 0.003405s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.20% : 0.000059s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000075s : 1: auto_monad 0.09% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000007s : 1: bias_add_comm_swap 1.54% : 0.000454s : 1: bootstrap 0.10% : 0.000028s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000015s : 1: convert_after_rewriter 0.13% : 0.000037s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000019s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.10% : 0.000029s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000007s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000007s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.62% : 0.000479s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 2.01% : 0.000593s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 5.21% : 0.001534s : 78: opt.transform.opt_a 0.14% : 0.000041s : 1: opt.transform.opt_after_cconv 0.12% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.56% : 0.000166s : 28: opt.transform.opt_b 0.23% : 0.000067s : 2: opt.transform.opt_trans_graph 0.17% : 0.000051s : 4: opt.transform.symbol_engine_opt 12.27% : 0.003613s : 1: opt_a 0.53% : 0.000156s : 1: opt_after_cconv 1.88% : 0.000553s : 1: opt_after_jit_grad 1.20% : 0.000353s : 1: opt_b 22.19% : 0.006539s : 1: optimize 0.08% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.09% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000041s : 1: pre_auto_parallel 0.11% : 0.000033s : 1: py_interpret_to_execute 0.07% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.13% : 0.000039s : 1: remove_dup_value 1.59% : 0.000470s : 1: renormalize.infer 1.22% : 0.000361s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000053s : 1: rewriter_after_opt_a 0.32% : 0.000094s : 1: rewriter_before_opt_a 0.02% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000006s : 1: split_layernorm_comm 0.02% : 0.000006s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000115s : 1: symbol_engine_optimizer 0.38% : 0.000112s : 1: tuple_transform 20.11% : 0.005925s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:42.989.59 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.016994, [21] [bootstrap]: 0.00042803 [type_inference]: 0.00625414 [event_method]: 2.265e-05 [auto_monad]: 7.267e-05 [graph_reusing]: 5.66e-06 [inline]: 3.11001e-06 [add_attr]: 0.00365882, [1] [add_attr_with_inline]: 0.00364581, [1] [Cycle 1]: 7.367e-05, [2] [tag_attr]: 2.424e-05 [meta_addattr_fg_expand]: 6.26e-06 [parallel-infer-symbol]: 3.97998e-06 [pre_auto_parallel]: 4.299e-05 [insert-virtual-dataset]: 3.08e-06 [parallel-infer-symbol-second]: 8.70001e-07 [dataset_repeat_opt]: 1.92999e-06 [pipeline_split]: 1.82001e-06 [optimize]: 0.00572692, [53] [py_interpret_to_execute]: 2.967e-05 [rewriter_before_opt_a]: 9.472e-05 [opt_a]: 0.00332713, [2] [Cycle 1]: 0.00242135, [45] [expand_dump_flag]: 2.81e-06 [switch_simplify]: 4.469e-05 [loop_unroll]: 3.223e-05 [a_1]: 0.00073805 [with_stream_mark]: 1.808e-05 [recompute_prepare]: 1.118e-05 [updatestate_depend_eliminate]: 5.00999e-06 [updatestate_assign_eliminate]: 4.52e-06 [updatestate_loads_eliminate]: 4.43001e-06 [parameter_eliminate]: 2.18998e-06 [a_2]: 0.00012109 [accelerated_algorithm]: 9.47999e-06 [shard]: 1.94e-06 [meta_shard_fg_expand]: 2.51e-06 [shard_inline]: 9.41e-06 [merge_send_recv]: 1.117e-05 [auto_parallel]: 8.93002e-06 [parallel]: 1.973e-05 [flash_sp]: 8.95001e-06 [merge_comm]: 5.27999e-06 [allreduce_fusion]: 4.93001e-06 [matmul_add_comm_reduction]: 1.177e-05 [allreduce_slice_to_reducescatter]: 8.29983e-07 [virtual_shard_identity]: 1.092e-05 [virtual_dataset]: 9.76998e-06 [get_grad_eliminate_]: 9.07001e-06 [virtual_output]: 9.31e-06 [merge_forward]: 4.27e-06 [cell_reuse_recompute_pass]: 1.04e-06 [offload_activation]: 1.166e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.81e-05 [merge_recompute_call_nodes]: 2.23002e-06 [before_grad]: 1.542e-05 [set_forward_comm_id_for_comm_node_pass]: 5.25999e-06 [meta_fg_expand]: 3.81001e-06 [flash_sp_send_recv_attached]: 2.48998e-06 [receive_attached]: 1.99e-06 [after_resolve]: 1.431e-05 [a_after_grad]: 1.425e-05 [renormalize]: 0.00082097 [add_forward_monad_depend]: 5.54998e-06 [auto_monad_grad]: 2.36e-06 [auto_monad_eliminator]: 1.963e-05 [cse]: 4.494e-05 [a_3]: 6.746e-05 [Cycle 2]: 0.00089444, [45] [expand_dump_flag]: 1.94999e-06 [switch_simplify]: 1.015e-05 [loop_unroll]: 8.67998e-06 [a_1]: 0.00021312 [with_stream_mark]: 1.259e-05 [recompute_prepare]: 8.97999e-06 [updatestate_depend_eliminate]: 4.49998e-06 [updatestate_assign_eliminate]: 3.51999e-06 [updatestate_loads_eliminate]: 3.59002e-06 [parameter_eliminate]: 1.01002e-06 [a_2]: 0.00011319 [accelerated_algorithm]: 9.13002e-06 [shard]: 1.27999e-06 [meta_shard_fg_expand]: 1.91e-06 [shard_inline]: 8.68001e-06 [merge_send_recv]: 7.98001e-06 [auto_parallel]: 7.53999e-06 [parallel]: 6.20002e-06 [flash_sp]: 3.33e-06 [merge_comm]: 4.98001e-06 [allreduce_fusion]: 4.65999e-06 [matmul_add_comm_reduction]: 9.99001e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 9.41e-06 [virtual_dataset]: 8.43001e-06 [get_grad_eliminate_]: 8.97e-06 [virtual_output]: 8.35999e-06 [merge_forward]: 4.37e-06 [cell_reuse_recompute_pass]: 1.34998e-06 [offload_activation]: 1.023e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.559e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.395e-05 [set_forward_comm_id_for_comm_node_pass]: 5.23002e-06 [meta_fg_expand]: 3.16999e-06 [flash_sp_send_recv_attached]: 1.00999e-06 [receive_attached]: 1.84998e-06 [after_resolve]: 1.297e-05 [a_after_grad]: 1.301e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.20999e-06 [auto_monad_grad]: 1.27999e-06 [auto_monad_eliminator]: 1.007e-05 [cse]: 2.406e-05 [a_3]: 8.139e-05 [py_interpret_to_execute_after_opt_a]: 1.182e-05 [slice_cell_reuse_recomputed_activation]: 2.63e-06 [rewriter_after_opt_a]: 3.981e-05 [convert_after_rewriter]: 7.35e-06 [order_py_execute_after_rewriter]: 6.11e-06 [mutable_eliminate]: 0.00060121 [opt_b]: 0.00029024, [1] [Cycle 1]: 0.00028333, [7] [b_1]: 0.00018616 [b_2]: 1.112e-05 [updatestate_depend_eliminate]: 8.52e-06 [updatestate_assign_eliminate]: 3.62002e-06 [updatestate_loads_eliminate]: 3.59002e-06 [renormalize]: 7.29982e-07 [cse]: 3.303e-05 [optimize_parallel_all_gather_comm]: 2.022e-05 [overlap_param_gather]: 1.99e-06 [cconv]: 1.842e-05 [loop_unroll]: 0.00048206 [opt_after_cconv]: 0.00013201, [1] [Cycle 1]: 0.0001258, [7] [c_1]: 4.459e-05 [parameter_eliminate]: 2.98998e-06 [updatestate_depend_eliminate]: 6.83e-06 [updatestate_assign_eliminate]: 3.62002e-06 [updatestate_loads_eliminate]: 3.8e-06 [cse]: 2.897e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 4.424e-05 [tuple_transform]: 9.813e-05, [1] [Cycle 1]: 9.29e-05, [4] [d_1]: 6.216e-05 [none_parameter_eliminate]: 1.32e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 9.91e-06 [partial_unused_args_eliminate]: 1.45001e-06 [add_recomputation]: 5.026e-05 [cse_after_recomputation]: 2.866e-05, [1] [Cycle 1]: 2.354e-05, [1] [cse]: 1.784e-05 [environ_conv]: 5.94e-06 [swap_dp_allreduce_reducescatter]: 7.61001e-06 [bias_add_comm_swap]: 3.06999e-06 [label_micro_interleaved_index]: 5.25001e-06 [label_fine_grained_interleaved_index]: 2.88e-06 [merge_cast_opt]: 1.42e-06 [slice_recompute_activation]: 2.09e-06 [micro_interleaved_order_control]: 2.29001e-06 [assign_add_opt]: 1.23002e-06 [ForceFp32Comm]: 8.59989e-07 [remove_cast_before_assign_add]: 1.16002e-06 [full_micro_interleaved_order_control]: 2.07001e-06 [reorder_send_recv_between_fp_bp]: 2.71e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.22e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.24e-06 [overlap_opt_shard_grad_in_pipeline]: 1.77999e-06 [control_data_broadcast_order]: 1.72e-05 [grouped_pairwise_exchange_alltoall]: 2.34001e-06 [offloading_packed_experts]: 5.16998e-06 [overlap_recompute_and_grad_model_parallel]: 6.07999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.29998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.07999e-06 [overlap_grad_ring_attention]: 5.87999e-06 [overlap_grad_flash_sp]: 2.626e-05 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 2.38998e-06 [split_layernorm_comm]: 1.73002e-06 [handle_group_info]: 1.15999e-06 [symbol_engine_optimizer]: 9.261e-05, [1] [Cycle 1]: 8.798e-05, [6] [build]: 2.78003e-06 [elim_shapecalc]: 1.301e-05 [elim_not_effective]: 1.77e-05 [opt_reshape]: 1.007e-05 [fold_const_symbol]: 1.477e-05 [renormalize]: 2.20025e-07 [detach_backward]: 2.48002e-06 [pipeline_parallel_scheduler]: 1.74e-06 [auto_monad_reorder]: 1.631e-05 [get_jit_bprop_graph]: 1.64e-06 [rewriter_after_jit_bprop_graph]: 4.18001e-06 [opt_after_jit_grad]: 0.00053527 [validate]: 3.864e-05 Sums bootstrap : 0.000428s : 3.47% type_inference : 0.006254s : 50.64% event_method : 0.000023s : 0.18% auto_monad : 0.000073s : 0.59% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000043s : 0.35% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.24% optimize.rewriter_before_opt_a : 0.000095s : 0.77% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.44% optimize.opt_a.loop_unroll : 0.000041s : 0.33% optimize.opt_a.a_1 : 0.000951s : 7.70% optimize.opt_a.with_stream_mark : 0.000031s : 0.25% optimize.opt_a.recompute_prepare : 0.000020s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000234s : 1.90% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000018s : 0.15% optimize.opt_a.merge_send_recv : 0.000019s : 0.16% optimize.opt_a.auto_parallel : 0.000016s : 0.13% optimize.opt_a.parallel : 0.000026s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000010s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.16% optimize.opt_a.virtual_dataset : 0.000018s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.15% optimize.opt_a.virtual_output : 0.000018s : 0.14% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.02% optimize.opt_a.offload_activation : 0.000022s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.03% optimize.opt_a.before_grad : 0.000029s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.22% optimize.opt_a.a_after_grad : 0.000027s : 0.22% optimize.opt_a.renormalize : 0.000821s : 6.65% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.05% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.24% optimize.opt_a.cse : 0.000069s : 0.56% optimize.opt_a.a_3 : 0.000149s : 1.21% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.32% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000601s : 4.87% optimize.opt_b.b_1 : 0.000186s : 1.51% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000033s : 0.27% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000018s : 0.15% optimize.loop_unroll : 0.000482s : 3.90% optimize.opt_after_cconv.c_1 : 0.000045s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000029s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000044s : 0.36% optimize.tuple_transform.d_1 : 0.000062s : 0.50% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.08% optimize.partial_unused_args_eliminate : 0.000001s : 0.01% optimize.add_recomputation : 0.000050s : 0.41% optimize.cse_after_recomputation.cse : 0.000018s : 0.14% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000006s : 0.05% optimize.overlap_grad_flash_sp : 0.000026s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000016s : 0.13% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000535s : 4.33% validate : 0.000039s : 0.31% Time group info: ------[substitution.] 0.000233 48 15.03% : 0.000035s : 6: substitution.cast_eliminate 0.94% : 0.000002s : 4: substitution.elim_not_effective 0.78% : 0.000002s : 4: substitution.fold_const_symbol 2.63% : 0.000006s : 6: substitution.graph_param_transform 68.01% : 0.000158s : 4: substitution.inline 2.10% : 0.000005s : 8: substitution.j_node_and_user_rematch 2.91% : 0.000007s : 8: substitution.remove_not_recompute_node 1.82% : 0.000004s : 4: substitution.replace_old_param 5.79% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006187 2 87.87% : 0.005436s : 1: type_inference.infer 12.13% : 0.000751s : 1: type_inference.specialize ------[replace.] 0.000062 8 61.84% : 0.000039s : 4: replace.inline 38.16% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000168 8 93.00% : 0.000156s : 4: match.inline 7.00% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000279 1730 0.84% : 0.000002s : 17: predicate.accumulaten_eliminater 0.66% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.56% : 0.000002s : 12: predicate.addn_check_dump 0.88% : 0.000002s : 17: predicate.addn_zero_filter 0.90% : 0.000003s : 17: predicate.adjust_all_reduce_mul_add 2.07% : 0.000006s : 29: predicate.arithmetic_simplify 1.09% : 0.000003s : 17: predicate.cast_eliminate 0.85% : 0.000002s : 12: predicate.check_bprop_eliminate 0.61% : 0.000002s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.81% : 0.000002s : 12: predicate.depend_value_elim 0.97% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.02% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.93% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.94% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 6: predicate.elim_not_effective 0.51% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.25% : 0.000003s : 23: predicate.environ_get_depend_swap 1.74% : 0.000005s : 35: predicate.environ_get_eliminate 1.25% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.30% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.03% : 0.000006s : 25: predicate.float_depend_g_call 0.56% : 0.000002s : 12: predicate.float_environ_get_switch 0.89% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 6: predicate.fold_const_symbol 0.74% : 0.000002s : 12: predicate.get_grad_eliminate 0.23% : 0.000001s : 6: predicate.graph_param_transform 0.68% : 0.000002s : 12: predicate.incorporate_call 0.60% : 0.000002s : 12: predicate.incorporate_call_switch 6.03% : 0.000017s : 78: predicate.inline 0.83% : 0.000002s : 12: predicate.inline_without_move 0.34% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.87% : 0.000002s : 12: predicate.less_batch_normalization 1.81% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.47% : 0.000007s : 50: predicate.load_eliminater 0.85% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.01% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.62% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.66% : 0.000002s : 12: predicate.merge_addn 0.60% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.64% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 17: predicate.minmaximum_grad 0.80% : 0.000002s : 6: predicate.mutable_eliminate 0.44% : 0.000001s : 6: predicate.opt_reshape 0.42% : 0.000001s : 6: predicate.parallel_virtual_node 1.61% : 0.000004s : 25: predicate.partial_defer_inline 1.60% : 0.000004s : 27: predicate.partial_eliminate 0.90% : 0.000003s : 17: predicate.print_const_string_wrapper 0.62% : 0.000002s : 12: predicate.reduce_all_const_elim 1.14% : 0.000003s : 17: predicate.reduce_eliminate 2.61% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 12: predicate.remove_not_recompute_node 1.29% : 0.000004s : 33: predicate.replace_applicator 0.43% : 0.000001s : 12: predicate.replace_old_param 0.26% : 0.000001s : 6: predicate.reset_defer_inline 0.87% : 0.000002s : 17: predicate.reshape_eliminate 0.70% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 6: predicate.row_tensor_eliminate 0.89% : 0.000002s : 12: predicate.same_eliminate 0.43% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 12: predicate.shard_identity_eliminate 0.78% : 0.000002s : 12: predicate.special_op_eliminate 0.81% : 0.000002s : 12: predicate.specialize_transform 0.89% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.37% : 0.000004s : 25: predicate.switch_defer_inline 2.01% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.58% : 0.000013s : 81: predicate.switch_simplify 0.93% : 0.000003s : 17: predicate.tile_eliminate 1.01% : 0.000003s : 17: predicate.transpose_eliminate 1.71% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.60% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.07% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.51% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.46% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.82% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.37% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.39% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 6: predicate.value_based_eliminate 0.94% : 0.000003s : 12: predicate.virtual_dataset_eliminate 0.71% : 0.000002s : 12: predicate.virtual_output_eliminate 0.29% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000549 11 55.61% : 0.000305s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.39% : 0.000244s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.029047 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.62% : 0.003666s : 1: add_attr 12.57% : 0.003650s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.19% : 0.000054s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.27% : 0.000078s : 1: auto_monad 0.07% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.58% : 0.000460s : 1: bootstrap 0.08% : 0.000022s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000032s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.10% : 0.000029s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.69% : 0.000491s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.10% : 0.000610s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.06% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000019s : 1: opt.transform.mutable_eliminate 5.46% : 0.001586s : 78: opt.transform.opt_a 0.15% : 0.000043s : 1: opt.transform.opt_after_cconv 0.11% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.56% : 0.000164s : 28: opt.transform.opt_b 0.24% : 0.000070s : 2: opt.transform.opt_trans_graph 0.18% : 0.000051s : 4: opt.transform.symbol_engine_opt 11.47% : 0.003331s : 1: opt_a 0.47% : 0.000136s : 1: opt_after_cconv 1.87% : 0.000544s : 1: opt_after_jit_grad 1.01% : 0.000294s : 1: opt_b 19.73% : 0.005732s : 1: optimize 0.08% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.16% : 0.000047s : 1: pre_auto_parallel 0.12% : 0.000034s : 1: py_interpret_to_execute 0.05% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.17% : 0.000049s : 1: remove_dup_value 1.61% : 0.000468s : 1: renormalize.infer 1.19% : 0.000346s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000044s : 1: rewriter_after_opt_a 0.34% : 0.000099s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000096s : 1: symbol_engine_optimizer 0.35% : 0.000101s : 1: tuple_transform 21.62% : 0.006279s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:42.322.381 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:42.322.673 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0200526, [21] [bootstrap]: 0.00042568 [type_inference]: 0.00763111 [event_method]: 2.256e-05 [auto_monad]: 6.815e-05 [graph_reusing]: 6.44999e-06 [inline]: 2.32999e-06 [add_attr]: 0.00349389, [1] [add_attr_with_inline]: 0.003483, [1] [Cycle 1]: 7.093e-05, [2] [tag_attr]: 2.107e-05 [meta_addattr_fg_expand]: 5.33002e-06 [parallel-infer-symbol]: 3.46001e-06 [pre_auto_parallel]: 3.558e-05 [insert-virtual-dataset]: 2.54999e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 1.72999e-06 [pipeline_split]: 1.76e-06 [optimize]: 0.00685015, [53] [py_interpret_to_execute]: 2.518e-05 [rewriter_before_opt_a]: 9.161e-05 [opt_a]: 0.00369177, [2] [Cycle 1]: 0.00262173, [45] [expand_dump_flag]: 3.00002e-06 [switch_simplify]: 3.782e-05 [loop_unroll]: 3.113e-05 [a_1]: 0.00069292 [with_stream_mark]: 1.687e-05 [recompute_prepare]: 1.117e-05 [updatestate_depend_eliminate]: 5.15001e-06 [updatestate_assign_eliminate]: 4.03001e-06 [updatestate_loads_eliminate]: 3.06001e-06 [parameter_eliminate]: 1.42e-06 [a_2]: 0.00013032 [accelerated_algorithm]: 9.03002e-06 [shard]: 2.37999e-06 [meta_shard_fg_expand]: 2.64999e-06 [shard_inline]: 8.04997e-06 [merge_send_recv]: 1.128e-05 [auto_parallel]: 7.87e-06 [parallel]: 1.991e-05 [flash_sp]: 1e-05 [merge_comm]: 4.89e-06 [allreduce_fusion]: 4.2e-06 [matmul_add_comm_reduction]: 1.097e-05 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 9.99999e-06 [virtual_dataset]: 8.33001e-06 [get_grad_eliminate_]: 7.75998e-06 [virtual_output]: 8.55999e-06 [merge_forward]: 4.22e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 1.086e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.747e-05 [merge_recompute_call_nodes]: 1.90001e-06 [before_grad]: 1.489e-05 [set_forward_comm_id_for_comm_node_pass]: 4.63999e-06 [meta_fg_expand]: 3.58999e-06 [flash_sp_send_recv_attached]: 3.47997e-06 [receive_attached]: 2.09e-06 [after_resolve]: 1.366e-05 [a_after_grad]: 1.283e-05 [renormalize]: 0.00090768 [add_forward_monad_depend]: 7.11001e-06 [auto_monad_grad]: 2.74001e-06 [auto_monad_eliminator]: 1.904e-05 [cse]: 3.461e-05 [a_3]: 8.172e-05 [Cycle 2]: 0.0010538, [45] [expand_dump_flag]: 2.27999e-06 [switch_simplify]: 9.89001e-06 [loop_unroll]: 8.94e-06 [a_1]: 0.00019274 [with_stream_mark]: 1.765e-05 [recompute_prepare]: 8.22e-06 [updatestate_depend_eliminate]: 4.3e-06 [updatestate_assign_eliminate]: 4.23001e-06 [updatestate_loads_eliminate]: 3.52002e-06 [parameter_eliminate]: 1.87001e-06 [a_2]: 0.00012303 [accelerated_algorithm]: 8.38001e-06 [shard]: 2.01e-06 [meta_shard_fg_expand]: 2.40002e-06 [shard_inline]: 7.91001e-06 [merge_send_recv]: 1.002e-05 [auto_parallel]: 9.67999e-06 [parallel]: 1.934e-05 [flash_sp]: 3.65e-06 [merge_comm]: 4.68001e-06 [allreduce_fusion]: 4.73001e-06 [matmul_add_comm_reduction]: 9.86998e-06 [allreduce_slice_to_reducescatter]: 3.39991e-07 [virtual_shard_identity]: 1.062e-05 [virtual_dataset]: 8.17998e-06 [get_grad_eliminate_]: 7.9e-06 [virtual_output]: 1.102e-05 [merge_forward]: 4.51002e-06 [cell_reuse_recompute_pass]: 3.18e-06 [offload_activation]: 1.03e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.078e-05 [merge_recompute_call_nodes]: 1.20999e-06 [before_grad]: 1.417e-05 [set_forward_comm_id_for_comm_node_pass]: 5.17e-06 [meta_fg_expand]: 3.68999e-06 [flash_sp_send_recv_attached]: 1.66e-06 [receive_attached]: 2.37001e-06 [after_resolve]: 1.397e-05 [a_after_grad]: 1.235e-05 [renormalize]: 1.60013e-07 [add_forward_monad_depend]: 1.97999e-06 [auto_monad_grad]: 1.47001e-06 [auto_monad_eliminator]: 1.132e-05 [cse]: 2.373e-05 [a_3]: 6.111e-05 [py_interpret_to_execute_after_opt_a]: 2.002e-05 [slice_cell_reuse_recomputed_activation]: 5.64e-06 [rewriter_after_opt_a]: 4.809e-05 [convert_after_rewriter]: 1.136e-05 [order_py_execute_after_rewriter]: 9.13002e-06 [mutable_eliminate]: 0.00080794 [opt_b]: 0.00036332, [1] [Cycle 1]: 0.00035107, [7] [b_1]: 0.00021907 [b_2]: 1.11e-05 [updatestate_depend_eliminate]: 8.47e-06 [updatestate_assign_eliminate]: 3.74002e-06 [updatestate_loads_eliminate]: 3.76999e-06 [renormalize]: 1.05001e-06 [cse]: 4.012e-05 [optimize_parallel_all_gather_comm]: 2.644e-05 [overlap_param_gather]: 6.86001e-06 [cconv]: 3.88e-05 [loop_unroll]: 0.0006605 [opt_after_cconv]: 0.00016908, [1] [Cycle 1]: 0.0001577, [7] [c_1]: 4.287e-05 [parameter_eliminate]: 5.22999e-06 [updatestate_depend_eliminate]: 7.06001e-06 [updatestate_assign_eliminate]: 4.12998e-06 [updatestate_loads_eliminate]: 3.36001e-06 [cse]: 3.192e-05 [renormalize]: 1.22e-06 [remove_dup_value]: 2e-05 [tuple_transform]: 0.00015806, [1] [Cycle 1]: 0.00015049, [4] [d_1]: 6.162e-05 [none_parameter_eliminate]: 1.48002e-06 [renormalize]: 8.80013e-07 [switch_simplify]: 9.69e-06 [partial_unused_args_eliminate]: 5.51998e-06 [add_recomputation]: 6.914e-05 [cse_after_recomputation]: 3.754e-05, [1] [Cycle 1]: 2.99e-05, [1] [cse]: 1.882e-05 [environ_conv]: 1.166e-05 [swap_dp_allreduce_reducescatter]: 9.04e-06 [bias_add_comm_swap]: 6.14001e-06 [label_micro_interleaved_index]: 9.14998e-06 [label_fine_grained_interleaved_index]: 5.66e-06 [merge_cast_opt]: 4.22998e-06 [slice_recompute_activation]: 5.22999e-06 [micro_interleaved_order_control]: 5.10999e-06 [assign_add_opt]: 4.54998e-06 [ForceFp32Comm]: 3.95e-06 [remove_cast_before_assign_add]: 3.9e-06 [full_micro_interleaved_order_control]: 5.20001e-06 [reorder_send_recv_between_fp_bp]: 5.37999e-06 [comm_op_add_attrs]: 3.86001e-06 [add_comm_op_reuse_tag]: 3.48e-06 [interleave_split_concat_branches]: 3.86001e-06 [interleave_parallel_branches]: 3.71999e-06 [overlap_opt_shard_in_pipeline]: 3.93001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.76002e-06 [control_data_broadcast_order]: 2.088e-05 [grouped_pairwise_exchange_alltoall]: 4.58999e-06 [offloading_packed_experts]: 8.34002e-06 [overlap_recompute_and_grad_model_parallel]: 9.05001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.87998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.95e-06 [overlap_recompute_comm]: 5.85002e-06 [overlap_grad_ring_attention]: 8.43999e-06 [overlap_grad_flash_sp]: 2.92e-05 [begin_end_overlap_inline]: 3.3e-06 [split_matmul_comm_elemetwise]: 4.99e-06 [split_layernorm_comm]: 4.68999e-06 [handle_group_info]: 3.71001e-06 [symbol_engine_optimizer]: 0.00012054, [1] [Cycle 1]: 0.00011297, [6] [build]: 3.86999e-06 [elim_shapecalc]: 1.45e-05 [elim_not_effective]: 1.842e-05 [opt_reshape]: 9.85002e-06 [fold_const_symbol]: 1.459e-05 [renormalize]: 3.00002e-07 [detach_backward]: 4.55001e-06 [pipeline_parallel_scheduler]: 1.95001e-06 [auto_monad_reorder]: 2.402e-05 [get_jit_bprop_graph]: 1.64e-06 [rewriter_after_jit_bprop_graph]: 5.89e-06 [opt_after_jit_grad]: 0.00075211 [validate]: 4.97e-05 Sums bootstrap : 0.000426s : 2.92% type_inference : 0.007631s : 52.40% event_method : 0.000023s : 0.15% auto_monad : 0.000068s : 0.47% graph_reusing : 0.000006s : 0.04% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000005s : 0.04% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000036s : 0.24% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000025s : 0.17% optimize.rewriter_before_opt_a : 0.000092s : 0.63% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000048s : 0.33% optimize.opt_a.loop_unroll : 0.000040s : 0.28% optimize.opt_a.a_1 : 0.000886s : 6.08% optimize.opt_a.with_stream_mark : 0.000035s : 0.24% optimize.opt_a.recompute_prepare : 0.000019s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000253s : 1.74% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.12% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.11% optimize.opt_a.merge_send_recv : 0.000021s : 0.15% optimize.opt_a.auto_parallel : 0.000018s : 0.12% optimize.opt_a.parallel : 0.000039s : 0.27% optimize.opt_a.flash_sp : 0.000014s : 0.09% optimize.opt_a.merge_comm : 0.000010s : 0.07% optimize.opt_a.allreduce_fusion : 0.000009s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.14% optimize.opt_a.virtual_dataset : 0.000017s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.11% optimize.opt_a.virtual_output : 0.000020s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000029s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.07% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000028s : 0.19% optimize.opt_a.a_after_grad : 0.000025s : 0.17% optimize.opt_a.renormalize : 0.000908s : 6.23% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.21% optimize.opt_a.cse : 0.000058s : 0.40% optimize.opt_a.a_3 : 0.000143s : 0.98% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.04% optimize.rewriter_after_opt_a : 0.000048s : 0.33% optimize.convert_after_rewriter : 0.000011s : 0.08% optimize.order_py_execute_after_rewriter : 0.000009s : 0.06% optimize.mutable_eliminate : 0.000808s : 5.55% optimize.opt_b.b_1 : 0.000219s : 1.50% optimize.opt_b.b_2 : 0.000011s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000040s : 0.28% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.18% optimize.overlap_param_gather : 0.000007s : 0.05% optimize.cconv : 0.000039s : 0.27% optimize.loop_unroll : 0.000660s : 4.54% optimize.opt_after_cconv.c_1 : 0.000043s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000032s : 0.22% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000020s : 0.14% optimize.tuple_transform.d_1 : 0.000062s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.01% optimize.tuple_transform.renormalize : 0.000001s : 0.01% optimize.tuple_transform.switch_simplify : 0.000010s : 0.07% optimize.partial_unused_args_eliminate : 0.000006s : 0.04% optimize.add_recomputation : 0.000069s : 0.47% optimize.cse_after_recomputation.cse : 0.000019s : 0.13% optimize.environ_conv : 0.000012s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.06% optimize.bias_add_comm_swap : 0.000006s : 0.04% optimize.label_micro_interleaved_index : 0.000009s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000005s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.03% optimize.control_data_broadcast_order : 0.000021s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000006s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.06% optimize.overlap_grad_flash_sp : 0.000029s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.03% optimize.split_layernorm_comm : 0.000005s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.16% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.04% opt_after_jit_grad : 0.000752s : 5.16% validate : 0.000050s : 0.34% Time group info: ------[substitution.] 0.000227 38 13.10% : 0.000030s : 3: substitution.cast_eliminate 1.07% : 0.000002s : 3: substitution.elim_not_effective 0.92% : 0.000002s : 3: substitution.fold_const_symbol 3.34% : 0.000008s : 5: substitution.graph_param_transform 67.10% : 0.000152s : 4: substitution.inline 2.66% : 0.000006s : 6: substitution.j_node_and_user_rematch 3.52% : 0.000008s : 6: substitution.remove_not_recompute_node 2.49% : 0.000006s : 4: substitution.replace_old_param 5.79% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007572 2 89.52% : 0.006778s : 1: type_inference.infer 10.48% : 0.000793s : 1: type_inference.specialize ------[replace.] 0.000065 8 61.57% : 0.000040s : 4: replace.inline 38.43% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000161 8 93.00% : 0.000150s : 4: match.inline 7.00% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000265 1504 0.93% : 0.000002s : 15: predicate.accumulaten_eliminater 1.11% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.58% : 0.000002s : 10: predicate.addn_check_dump 0.85% : 0.000002s : 15: predicate.addn_zero_filter 0.79% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.09% : 0.000006s : 25: predicate.arithmetic_simplify 1.05% : 0.000003s : 15: predicate.cast_eliminate 0.62% : 0.000002s : 10: predicate.check_bprop_eliminate 0.56% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.70% : 0.000002s : 10: predicate.depend_value_elim 0.94% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.08% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.06% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.28% : 0.000001s : 5: predicate.elim_not_effective 0.42% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.03% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 20: predicate.environ_get_depend_swap 1.77% : 0.000005s : 30: predicate.environ_get_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.32% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.13% : 0.000006s : 23: predicate.float_depend_g_call 0.53% : 0.000001s : 10: predicate.float_environ_get_switch 0.86% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 5: predicate.fold_const_symbol 0.70% : 0.000002s : 10: predicate.get_grad_eliminate 0.26% : 0.000001s : 5: predicate.graph_param_transform 0.59% : 0.000002s : 10: predicate.incorporate_call 0.51% : 0.000001s : 10: predicate.incorporate_call_switch 5.63% : 0.000015s : 68: predicate.inline 0.78% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.93% : 0.000002s : 10: predicate.less_batch_normalization 1.87% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.43% : 0.000006s : 44: predicate.load_eliminater 1.04% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.06% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.82% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 0.58% : 0.000002s : 10: predicate.merge_addn 0.64% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.68% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.90% : 0.000002s : 15: predicate.minmaximum_grad 1.20% : 0.000003s : 5: predicate.mutable_eliminate 0.46% : 0.000001s : 5: predicate.opt_reshape 0.40% : 0.000001s : 5: predicate.parallel_virtual_node 1.55% : 0.000004s : 23: predicate.partial_defer_inline 1.51% : 0.000004s : 24: predicate.partial_eliminate 0.92% : 0.000002s : 15: predicate.print_const_string_wrapper 0.67% : 0.000002s : 10: predicate.reduce_all_const_elim 1.18% : 0.000003s : 15: predicate.reduce_eliminate 2.33% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 10: predicate.remove_not_recompute_node 1.24% : 0.000003s : 29: predicate.replace_applicator 0.48% : 0.000001s : 10: predicate.replace_old_param 0.31% : 0.000001s : 5: predicate.reset_defer_inline 0.99% : 0.000003s : 15: predicate.reshape_eliminate 0.70% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 5: predicate.row_tensor_eliminate 0.81% : 0.000002s : 10: predicate.same_eliminate 0.40% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 10: predicate.shard_identity_eliminate 0.78% : 0.000002s : 10: predicate.special_op_eliminate 0.71% : 0.000002s : 10: predicate.specialize_transform 1.10% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 1.06% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.39% : 0.000004s : 23: predicate.switch_defer_inline 1.91% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.51% : 0.000012s : 74: predicate.switch_simplify 0.84% : 0.000002s : 15: predicate.tile_eliminate 0.92% : 0.000002s : 15: predicate.transpose_eliminate 1.58% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.84% : 0.000005s : 25: predicate.tuple_list_get_item_const_eliminator 1.73% : 0.000005s : 25: predicate.tuple_list_get_item_depend_reorder 3.06% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 2.06% : 0.000005s : 25: predicate.tuple_list_get_set_item_eliminator 2.42% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.69% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.33% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.02% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.56% : 0.000001s : 5: predicate.value_based_eliminate 0.63% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.74% : 0.000002s : 10: predicate.virtual_output_eliminate 0.31% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.56% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000674 11 52.01% : 0.000351s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.99% : 0.000324s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.032965 192 0.02% : 0.000007s : 1: ForceFp32Comm 10.63% : 0.003505s : 1: add_attr 10.58% : 0.003487s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000073s : 1: add_recomputation 0.02% : 0.000008s : 1: assign_add_opt 0.24% : 0.000078s : 1: auto_monad 0.10% : 0.000032s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.43% : 0.000472s : 1: bootstrap 0.13% : 0.000042s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000025s : 1: control_data_broadcast_order 0.05% : 0.000015s : 1: convert_after_rewriter 0.12% : 0.000041s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000021s : 1: detach_backward 0.04% : 0.000015s : 1: environ_conv 0.10% : 0.000034s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.04% : 0.000012s : 1: label_micro_interleaved_index 2.03% : 0.000670s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.48% : 0.000819s : 1: mutable_eliminate 0.03% : 0.000011s : 1: offloading_packed_experts 0.06% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000025s : 1: opt.transform.mutable_eliminate 4.35% : 0.001433s : 78: opt.transform.opt_a 0.12% : 0.000040s : 1: opt.transform.opt_after_cconv 0.11% : 0.000037s : 1: opt.transform.opt_after_jit_grad 0.45% : 0.000149s : 28: opt.transform.opt_b 0.21% : 0.000069s : 2: opt.transform.opt_trans_graph 0.16% : 0.000053s : 4: opt.transform.symbol_engine_opt 11.21% : 0.003697s : 1: opt_a 0.52% : 0.000173s : 1: opt_after_cconv 2.32% : 0.000764s : 1: opt_after_jit_grad 1.12% : 0.000368s : 1: opt_b 21.98% : 0.007244s : 1: optimize 0.09% : 0.000030s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000033s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000012s : 1: overlap_grad_ring_attention 0.03% : 0.000009s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000010s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000009s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000008s : 1: pipeline_split 0.13% : 0.000043s : 1: pre_auto_parallel 0.09% : 0.000029s : 1: py_interpret_to_execute 0.07% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000008s : 1: remove_cast_before_assign_add 0.07% : 0.000024s : 1: remove_dup_value 1.38% : 0.000454s : 1: renormalize.infer 1.35% : 0.000445s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000053s : 1: rewriter_after_opt_a 0.29% : 0.000096s : 1: rewriter_before_opt_a 0.03% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000124s : 1: symbol_engine_optimizer 0.49% : 0.000161s : 1: tuple_transform 23.30% : 0.007680s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:42.539.163 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0188975, [21] [bootstrap]: 0.00040402 [type_inference]: 0.00701884 [event_method]: 2.13e-05 [auto_monad]: 7.347e-05 [graph_reusing]: 6.61e-06 [inline]: 3.98001e-06 [add_attr]: 0.00387073, [1] [add_attr_with_inline]: 0.0038572, [1] [Cycle 1]: 6.891e-05, [2] [tag_attr]: 2.324e-05 [meta_addattr_fg_expand]: 5.99e-06 [parallel-infer-symbol]: 4.08999e-06 [pre_auto_parallel]: 4.141e-05 [insert-virtual-dataset]: 2.54999e-06 [parallel-infer-symbol-second]: 8.49977e-07 [dataset_repeat_opt]: 1.77999e-06 [pipeline_split]: 2.11e-06 [optimize]: 0.0064688, [53] [py_interpret_to_execute]: 2.973e-05 [rewriter_before_opt_a]: 9.687e-05 [opt_a]: 0.00347156, [2] [Cycle 1]: 0.00258003, [45] [expand_dump_flag]: 3.83999e-06 [switch_simplify]: 4.528e-05 [loop_unroll]: 3.168e-05 [a_1]: 0.00070526 [with_stream_mark]: 2.416e-05 [recompute_prepare]: 1.469e-05 [updatestate_depend_eliminate]: 5.87001e-06 [updatestate_assign_eliminate]: 4.38999e-06 [updatestate_loads_eliminate]: 3.81001e-06 [parameter_eliminate]: 2.36e-06 [a_2]: 0.00010516 [accelerated_algorithm]: 8.67e-06 [shard]: 2.34999e-06 [meta_shard_fg_expand]: 2.42001e-06 [shard_inline]: 7.8e-06 [merge_send_recv]: 1.04e-05 [auto_parallel]: 9.87999e-06 [parallel]: 2.185e-05 [flash_sp]: 1.238e-05 [merge_comm]: 4.84e-06 [allreduce_fusion]: 4.30999e-06 [matmul_add_comm_reduction]: 1.226e-05 [allreduce_slice_to_reducescatter]: 9.29984e-07 [virtual_shard_identity]: 1.217e-05 [virtual_dataset]: 8.80999e-06 [get_grad_eliminate_]: 8.28001e-06 [virtual_output]: 7.88001e-06 [merge_forward]: 4.95001e-06 [cell_reuse_recompute_pass]: 1.91003e-06 [offload_activation]: 1.237e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.974e-05 [merge_recompute_call_nodes]: 2.01998e-06 [before_grad]: 1.424e-05 [set_forward_comm_id_for_comm_node_pass]: 4.92999e-06 [meta_fg_expand]: 3.38999e-06 [flash_sp_send_recv_attached]: 3.76001e-06 [receive_attached]: 2.11e-06 [after_resolve]: 1.547e-05 [a_after_grad]: 1.377e-05 [renormalize]: 0.00097539 [add_forward_monad_depend]: 9.03002e-06 [auto_monad_grad]: 3.08e-06 [auto_monad_eliminator]: 2.381e-05 [cse]: 3.865e-05 [a_3]: 7.009e-05 [Cycle 2]: 0.00087777, [45] [expand_dump_flag]: 2.27001e-06 [switch_simplify]: 1.092e-05 [loop_unroll]: 7.8e-06 [a_1]: 0.00019668 [with_stream_mark]: 2.114e-05 [recompute_prepare]: 9.74e-06 [updatestate_depend_eliminate]: 5.97001e-06 [updatestate_assign_eliminate]: 3.45e-06 [updatestate_loads_eliminate]: 3.25e-06 [parameter_eliminate]: 1.65001e-06 [a_2]: 9.531e-05 [accelerated_algorithm]: 8.21002e-06 [shard]: 2.41998e-06 [meta_shard_fg_expand]: 2.62001e-06 [shard_inline]: 7.41999e-06 [merge_send_recv]: 1.041e-05 [auto_parallel]: 1.073e-05 [parallel]: 8.93002e-06 [flash_sp]: 4e-06 [merge_comm]: 5.88998e-06 [allreduce_fusion]: 4.42e-06 [matmul_add_comm_reduction]: 1.005e-05 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 9.44998e-06 [virtual_dataset]: 7.43999e-06 [get_grad_eliminate_]: 7.22002e-06 [virtual_output]: 7.13998e-06 [merge_forward]: 4.50001e-06 [cell_reuse_recompute_pass]: 3.36999e-06 [offload_activation]: 1.151e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.739e-05 [merge_recompute_call_nodes]: 1.12e-06 [before_grad]: 1.347e-05 [set_forward_comm_id_for_comm_node_pass]: 4.48999e-06 [meta_fg_expand]: 3.38e-06 [flash_sp_send_recv_attached]: 1.86e-06 [receive_attached]: 2.27999e-06 [after_resolve]: 1.617e-05 [a_after_grad]: 1.269e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.87999e-06 [auto_monad_grad]: 2.21e-06 [auto_monad_eliminator]: 1.179e-05 [cse]: 2.428e-05 [a_3]: 4.918e-05 [py_interpret_to_execute_after_opt_a]: 1.758e-05 [slice_cell_reuse_recomputed_activation]: 2.23998e-06 [rewriter_after_opt_a]: 4.505e-05 [convert_after_rewriter]: 7.39002e-06 [order_py_execute_after_rewriter]: 6.69001e-06 [mutable_eliminate]: 0.00076716 [opt_b]: 0.00027695, [1] [Cycle 1]: 0.00026727, [7] [b_1]: 0.00016825 [b_2]: 1.048e-05 [updatestate_depend_eliminate]: 8.41002e-06 [updatestate_assign_eliminate]: 3.88999e-06 [updatestate_loads_eliminate]: 3.28e-06 [renormalize]: 4.30009e-07 [cse]: 3.173e-05 [optimize_parallel_all_gather_comm]: 2.175e-05 [overlap_param_gather]: 2.17999e-06 [cconv]: 3.527e-05 [loop_unroll]: 0.00056306 [opt_after_cconv]: 0.00013505, [1] [Cycle 1]: 0.000128, [7] [c_1]: 3.983e-05 [parameter_eliminate]: 5.59e-06 [updatestate_depend_eliminate]: 9.10999e-06 [updatestate_assign_eliminate]: 3.51999e-06 [updatestate_loads_eliminate]: 3.06999e-06 [cse]: 2.92e-05 [renormalize]: 6.39993e-07 [remove_dup_value]: 1.651e-05 [tuple_transform]: 9.514e-05, [1] [Cycle 1]: 8.99e-05, [4] [d_1]: 5.82e-05 [none_parameter_eliminate]: 1.94e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 8.54e-06 [partial_unused_args_eliminate]: 2.26e-06 [add_recomputation]: 6.423e-05 [cse_after_recomputation]: 2.714e-05, [1] [Cycle 1]: 2.212e-05, [1] [cse]: 1.615e-05 [environ_conv]: 7.18e-06 [swap_dp_allreduce_reducescatter]: 5.89999e-06 [bias_add_comm_swap]: 3.7e-06 [label_micro_interleaved_index]: 5.49e-06 [label_fine_grained_interleaved_index]: 3.04001e-06 [merge_cast_opt]: 2.04e-06 [slice_recompute_activation]: 2.11e-06 [micro_interleaved_order_control]: 2.27999e-06 [assign_add_opt]: 1.25999e-06 [ForceFp32Comm]: 8.89995e-07 [remove_cast_before_assign_add]: 1.62001e-06 [full_micro_interleaved_order_control]: 2.29001e-06 [reorder_send_recv_between_fp_bp]: 2.88998e-06 [comm_op_add_attrs]: 1.06002e-06 [add_comm_op_reuse_tag]: 1.05001e-06 [interleave_split_concat_branches]: 1.24998e-06 [interleave_parallel_branches]: 1.20001e-06 [overlap_opt_shard_in_pipeline]: 1.33002e-06 [overlap_opt_shard_grad_in_pipeline]: 1.82001e-06 [control_data_broadcast_order]: 1.689e-05 [grouped_pairwise_exchange_alltoall]: 1.64e-06 [offloading_packed_experts]: 4.77e-06 [overlap_recompute_and_grad_model_parallel]: 5.86e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.16002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.49e-06 [overlap_recompute_comm]: 2.19001e-06 [overlap_grad_ring_attention]: 4.80001e-06 [overlap_grad_flash_sp]: 2.54e-05 [begin_end_overlap_inline]: 6.89994e-07 [split_matmul_comm_elemetwise]: 0.00029522 [split_layernorm_comm]: 2.34001e-06 [handle_group_info]: 1.39e-06 [symbol_engine_optimizer]: 0.000117, [1] [Cycle 1]: 0.00011044, [6] [build]: 6.88998e-06 [elim_shapecalc]: 2.403e-05 [elim_not_effective]: 1.938e-05 [opt_reshape]: 9.46e-06 [fold_const_symbol]: 1.302e-05 [renormalize]: 6.10016e-07 [detach_backward]: 2.58e-06 [pipeline_parallel_scheduler]: 1.66002e-06 [auto_monad_reorder]: 2.934e-05 [get_jit_bprop_graph]: 2.07001e-06 [rewriter_after_jit_bprop_graph]: 6.96999e-06 [opt_after_jit_grad]: 0.00071249 [validate]: 5.348e-05 Sums bootstrap : 0.000404s : 2.90% type_inference : 0.007019s : 50.38% event_method : 0.000021s : 0.15% auto_monad : 0.000073s : 0.53% graph_reusing : 0.000007s : 0.05% inline : 0.000004s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000041s : 0.30% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000030s : 0.21% optimize.rewriter_before_opt_a : 0.000097s : 0.70% optimize.opt_a.expand_dump_flag : 0.000006s : 0.04% optimize.opt_a.switch_simplify : 0.000056s : 0.40% optimize.opt_a.loop_unroll : 0.000039s : 0.28% optimize.opt_a.a_1 : 0.000902s : 6.47% optimize.opt_a.with_stream_mark : 0.000045s : 0.33% optimize.opt_a.recompute_prepare : 0.000024s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000012s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000200s : 1.44% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.12% optimize.opt_a.shard : 0.000005s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.11% optimize.opt_a.merge_send_recv : 0.000021s : 0.15% optimize.opt_a.auto_parallel : 0.000021s : 0.15% optimize.opt_a.parallel : 0.000031s : 0.22% optimize.opt_a.flash_sp : 0.000016s : 0.12% optimize.opt_a.merge_comm : 0.000011s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.16% optimize.opt_a.virtual_dataset : 0.000016s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.11% optimize.opt_a.virtual_output : 0.000015s : 0.11% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.04% optimize.opt_a.offload_activation : 0.000024s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000028s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000032s : 0.23% optimize.opt_a.a_after_grad : 0.000026s : 0.19% optimize.opt_a.renormalize : 0.000975s : 7.00% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.08% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000036s : 0.26% optimize.opt_a.cse : 0.000063s : 0.45% optimize.opt_a.a_3 : 0.000119s : 0.86% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000045s : 0.32% optimize.convert_after_rewriter : 0.000007s : 0.05% optimize.order_py_execute_after_rewriter : 0.000007s : 0.05% optimize.mutable_eliminate : 0.000767s : 5.51% optimize.opt_b.b_1 : 0.000168s : 1.21% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000032s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000035s : 0.25% optimize.loop_unroll : 0.000563s : 4.04% optimize.opt_after_cconv.c_1 : 0.000040s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000029s : 0.21% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.12% optimize.tuple_transform.d_1 : 0.000058s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000064s : 0.46% optimize.cse_after_recomputation.cse : 0.000016s : 0.12% optimize.environ_conv : 0.000007s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.04% optimize.bias_add_comm_swap : 0.000004s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000002s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.03% optimize.overlap_grad_flash_sp : 0.000025s : 0.18% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000295s : 2.12% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000007s : 0.05% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000024s : 0.17% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000029s : 0.21% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.05% opt_after_jit_grad : 0.000712s : 5.11% validate : 0.000053s : 0.38% Time group info: ------[substitution.] 0.000232 38 13.70% : 0.000032s : 3: substitution.cast_eliminate 1.03% : 0.000002s : 3: substitution.elim_not_effective 0.75% : 0.000002s : 3: substitution.fold_const_symbol 3.40% : 0.000008s : 5: substitution.graph_param_transform 66.74% : 0.000155s : 4: substitution.inline 2.06% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.25% : 0.000008s : 6: substitution.remove_not_recompute_node 3.11% : 0.000007s : 4: substitution.replace_old_param 5.96% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006943 2 87.03% : 0.006042s : 1: type_inference.infer 12.97% : 0.000901s : 1: type_inference.specialize ------[replace.] 0.000064 8 59.72% : 0.000038s : 4: replace.inline 40.28% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000164 8 92.68% : 0.000152s : 4: match.inline 7.32% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000271 1504 0.80% : 0.000002s : 15: predicate.accumulaten_eliminater 0.89% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 10: predicate.addn_check_dump 0.96% : 0.000003s : 15: predicate.addn_zero_filter 0.74% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.43% : 0.000007s : 25: predicate.arithmetic_simplify 1.08% : 0.000003s : 15: predicate.cast_eliminate 0.62% : 0.000002s : 10: predicate.check_bprop_eliminate 0.53% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.72% : 0.000002s : 10: predicate.depend_value_elim 0.96% : 0.000003s : 15: predicate.dict_get_item_const_eliminator 0.86% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 1.16% : 0.000003s : 5: predicate.elim_shapecalc_of_broadcastargs 1.30% : 0.000004s : 20: predicate.environ_add_const_eliminate 0.97% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.18% : 0.000003s : 20: predicate.environ_get_depend_swap 1.60% : 0.000004s : 30: predicate.environ_get_eliminate 1.06% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.40% : 0.000004s : 23: predicate.exchange_switch_depend_value 4.30% : 0.000012s : 23: predicate.float_depend_g_call 0.50% : 0.000001s : 10: predicate.float_environ_get_switch 0.81% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.68% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.59% : 0.000002s : 10: predicate.incorporate_call 0.49% : 0.000001s : 10: predicate.incorporate_call_switch 6.01% : 0.000016s : 68: predicate.inline 0.93% : 0.000003s : 10: predicate.inline_without_move 0.28% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.83% : 0.000002s : 10: predicate.less_batch_normalization 2.03% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.23% : 0.000006s : 44: predicate.load_eliminater 0.92% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.07% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.63% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.65% : 0.000002s : 10: predicate.merge_addn 0.66% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 15: predicate.minmaximum_grad 0.99% : 0.000003s : 5: predicate.mutable_eliminate 0.41% : 0.000001s : 5: predicate.opt_reshape 0.54% : 0.000001s : 5: predicate.parallel_virtual_node 1.60% : 0.000004s : 23: predicate.partial_defer_inline 1.46% : 0.000004s : 24: predicate.partial_eliminate 0.88% : 0.000002s : 15: predicate.print_const_string_wrapper 0.68% : 0.000002s : 10: predicate.reduce_all_const_elim 1.04% : 0.000003s : 15: predicate.reduce_eliminate 2.28% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 10: predicate.remove_not_recompute_node 1.45% : 0.000004s : 29: predicate.replace_applicator 0.48% : 0.000001s : 10: predicate.replace_old_param 0.41% : 0.000001s : 5: predicate.reset_defer_inline 0.98% : 0.000003s : 15: predicate.reshape_eliminate 0.64% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 5: predicate.row_tensor_eliminate 0.95% : 0.000003s : 10: predicate.same_eliminate 0.59% : 0.000002s : 10: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 10: predicate.shard_identity_eliminate 0.75% : 0.000002s : 10: predicate.special_op_eliminate 0.67% : 0.000002s : 10: predicate.specialize_transform 1.09% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.35% : 0.000004s : 23: predicate.switch_defer_inline 1.85% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.48% : 0.000012s : 74: predicate.switch_simplify 0.90% : 0.000002s : 15: predicate.tile_eliminate 0.93% : 0.000003s : 15: predicate.transpose_eliminate 1.48% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.48% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.29% : 0.000009s : 39: predicate.tuple_list_get_item_eliminator 1.58% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.71% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.23% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 2.80% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.69% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000642 11 49.45% : 0.000318s : 5: func_graph_cloner_run.FuncGraphClonerGraph 50.55% : 0.000325s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.031896 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.16% : 0.003877s : 1: add_attr 12.11% : 0.003862s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000069s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.25% : 0.000081s : 1: auto_monad 0.11% : 0.000034s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000007s : 1: bias_add_comm_swap 1.33% : 0.000425s : 1: bootstrap 0.12% : 0.000039s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000021s : 1: control_data_broadcast_order 0.03% : 0.000011s : 1: convert_after_rewriter 0.09% : 0.000030s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000011s : 1: environ_conv 0.09% : 0.000028s : 1: event_method 0.02% : 0.000006s : 1: full_micro_interleaved_order_control 0.02% : 0.000006s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000005s : 1: handle_group_info 0.02% : 0.000007s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.80% : 0.000573s : 1: loop_unroll 0.02% : 0.000005s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.44% : 0.000778s : 1: mutable_eliminate 0.02% : 0.000008s : 1: offloading_packed_experts 0.06% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000023s : 1: opt.transform.mutable_eliminate 4.59% : 0.001464s : 78: opt.transform.opt_a 0.12% : 0.000038s : 1: opt.transform.opt_after_cconv 0.11% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.44% : 0.000142s : 28: opt.transform.opt_b 0.20% : 0.000065s : 2: opt.transform.opt_trans_graph 0.19% : 0.000061s : 4: opt.transform.symbol_engine_opt 10.90% : 0.003475s : 1: opt_a 0.44% : 0.000139s : 1: opt_after_cconv 2.27% : 0.000724s : 1: opt_after_jit_grad 0.88% : 0.000281s : 1: opt_b 20.30% : 0.006475s : 1: optimize 0.08% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000010s : 1: order_py_execute_after_rewriter 0.09% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000009s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.14% : 0.000046s : 1: pre_auto_parallel 0.11% : 0.000034s : 1: py_interpret_to_execute 0.07% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000005s : 1: remove_cast_before_assign_add 0.06% : 0.000020s : 1: remove_dup_value 1.68% : 0.000536s : 1: renormalize.infer 1.34% : 0.000428s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000050s : 1: rewriter_after_opt_a 0.32% : 0.000101s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.96% : 0.000308s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000120s : 1: symbol_engine_optimizer 0.31% : 0.000098s : 1: tuple_transform 22.09% : 0.007046s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:42.774.976 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:42.775.272 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0198453, [21] [bootstrap]: 0.0004913 [type_inference]: 0.00709205 [event_method]: 2.049e-05 [auto_monad]: 6.557e-05 [graph_reusing]: 5.81998e-06 [inline]: 2.84001e-06 [add_attr]: 0.00350577, [1] [add_attr_with_inline]: 0.00349475, [1] [Cycle 1]: 8.275e-05, [2] [tag_attr]: 2.147e-05 [meta_addattr_fg_expand]: 6.46e-06 [parallel-infer-symbol]: 3.01999e-06 [pre_auto_parallel]: 3.571e-05 [insert-virtual-dataset]: 2.59001e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 2.27999e-06 [pipeline_split]: 2.11003e-06 [optimize]: 0.00715482, [53] [py_interpret_to_execute]: 3.149e-05 [rewriter_before_opt_a]: 9.02e-05 [opt_a]: 0.00324809, [2] [Cycle 1]: 0.00237314, [45] [expand_dump_flag]: 2.94999e-06 [switch_simplify]: 4.25e-05 [loop_unroll]: 2.98e-05 [a_1]: 0.00066424 [with_stream_mark]: 1.989e-05 [recompute_prepare]: 9.21998e-06 [updatestate_depend_eliminate]: 4.13999e-06 [updatestate_assign_eliminate]: 3.41001e-06 [updatestate_loads_eliminate]: 3.09001e-06 [parameter_eliminate]: 1.92001e-06 [a_2]: 0.00011153 [accelerated_algorithm]: 6.93998e-06 [shard]: 2.27999e-06 [meta_shard_fg_expand]: 2.05002e-06 [shard_inline]: 6.71e-06 [merge_send_recv]: 1.002e-05 [auto_parallel]: 7.12002e-06 [parallel]: 1.986e-05 [flash_sp]: 9.28002e-06 [merge_comm]: 3.81999e-06 [allreduce_fusion]: 3.73001e-06 [matmul_add_comm_reduction]: 1.04e-05 [allreduce_slice_to_reducescatter]: 8.50006e-07 [virtual_shard_identity]: 8.57e-06 [virtual_dataset]: 7.41001e-06 [get_grad_eliminate_]: 6.56e-06 [virtual_output]: 6.68e-06 [merge_forward]: 4e-06 [cell_reuse_recompute_pass]: 1.45999e-06 [offload_activation]: 1.134e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.614e-05 [merge_recompute_call_nodes]: 2.12001e-06 [before_grad]: 1.158e-05 [set_forward_comm_id_for_comm_node_pass]: 4.22e-06 [meta_fg_expand]: 3.66999e-06 [flash_sp_send_recv_attached]: 2.82002e-06 [receive_attached]: 2.34999e-06 [after_resolve]: 1.234e-05 [a_after_grad]: 1.064e-05 [renormalize]: 0.000768 [add_forward_monad_depend]: 6.71e-06 [auto_monad_grad]: 2.71e-06 [auto_monad_eliminator]: 1.835e-05 [cse]: 2.95e-05 [a_3]: 6.648e-05 [Cycle 2]: 0.00085878, [45] [expand_dump_flag]: 2.19001e-06 [switch_simplify]: 7.88999e-06 [loop_unroll]: 6.35997e-06 [a_1]: 0.00013326 [with_stream_mark]: 1.301e-05 [recompute_prepare]: 6.88e-06 [updatestate_depend_eliminate]: 3.28998e-06 [updatestate_assign_eliminate]: 2.94001e-06 [updatestate_loads_eliminate]: 2.12999e-06 [parameter_eliminate]: 1.54e-06 [a_2]: 9.977e-05 [accelerated_algorithm]: 6.72002e-06 [shard]: 1.68002e-06 [meta_shard_fg_expand]: 1.52999e-06 [shard_inline]: 6.25002e-06 [merge_send_recv]: 6.56e-06 [auto_parallel]: 7.06001e-06 [parallel]: 6.72002e-06 [flash_sp]: 4.38001e-06 [merge_comm]: 3.49001e-06 [allreduce_fusion]: 3.08e-06 [matmul_add_comm_reduction]: 6.61999e-06 [allreduce_slice_to_reducescatter]: 4.80009e-07 [virtual_shard_identity]: 7.99002e-06 [virtual_dataset]: 6.19001e-06 [get_grad_eliminate_]: 5.89e-06 [virtual_output]: 5.89e-06 [merge_forward]: 5.37999e-06 [cell_reuse_recompute_pass]: 2.31e-06 [offload_activation]: 9.02e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.609e-05 [merge_recompute_call_nodes]: 9.50007e-07 [before_grad]: 1.046e-05 [set_forward_comm_id_for_comm_node_pass]: 3.56001e-06 [meta_fg_expand]: 2.27001e-06 [flash_sp_send_recv_attached]: 9.50007e-07 [receive_attached]: 1.94999e-06 [after_resolve]: 1.263e-05 [a_after_grad]: 9.73998e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.72999e-06 [auto_monad_grad]: 2.00002e-06 [auto_monad_eliminator]: 9.39e-06 [cse]: 1.639e-05 [a_3]: 5.032e-05 [py_interpret_to_execute_after_opt_a]: 1.752e-05 [slice_cell_reuse_recomputed_activation]: 5.14e-06 [rewriter_after_opt_a]: 4.454e-05 [convert_after_rewriter]: 1.115e-05 [order_py_execute_after_rewriter]: 8.27e-06 [mutable_eliminate]: 0.00072881 [opt_b]: 0.00028982, [1] [Cycle 1]: 0.00027814, [7] [b_1]: 0.00017047 [b_2]: 9.39998e-06 [updatestate_depend_eliminate]: 8.47e-06 [updatestate_assign_eliminate]: 2.91e-06 [updatestate_loads_eliminate]: 2.73e-06 [renormalize]: 1.04003e-06 [cse]: 2.295e-05 [optimize_parallel_all_gather_comm]: 2.464e-05 [overlap_param_gather]: 5.52999e-06 [cconv]: 3.065e-05 [loop_unroll]: 0.0005215 [opt_after_cconv]: 0.00132357, [1] [Cycle 1]: 0.00018562, [7] [c_1]: 4.916e-05 [parameter_eliminate]: 5.94999e-06 [updatestate_depend_eliminate]: 1.081e-05 [updatestate_assign_eliminate]: 3.17002e-06 [updatestate_loads_eliminate]: 3.04999e-06 [cse]: 3.325e-05 [renormalize]: 5.8001e-07 [remove_dup_value]: 1.88e-05 [tuple_transform]: 0.00010985, [1] [Cycle 1]: 0.00010002, [4] [d_1]: 5.427e-05 [none_parameter_eliminate]: 2.21e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 8.45001e-06 [partial_unused_args_eliminate]: 5.03002e-06 [add_recomputation]: 6.4e-05 [cse_after_recomputation]: 3.203e-05, [1] [Cycle 1]: 2.393e-05, [1] [cse]: 1.346e-05 [environ_conv]: 8.93002e-06 [swap_dp_allreduce_reducescatter]: 8.51002e-06 [bias_add_comm_swap]: 6.11998e-06 [label_micro_interleaved_index]: 9.52001e-06 [label_fine_grained_interleaved_index]: 5.64998e-06 [merge_cast_opt]: 4.09997e-06 [slice_recompute_activation]: 5.14e-06 [micro_interleaved_order_control]: 4.94e-06 [assign_add_opt]: 3.93001e-06 [ForceFp32Comm]: 3.91999e-06 [remove_cast_before_assign_add]: 3.51999e-06 [full_micro_interleaved_order_control]: 5.39e-06 [reorder_send_recv_between_fp_bp]: 5.15001e-06 [comm_op_add_attrs]: 3.78001e-06 [add_comm_op_reuse_tag]: 3.51001e-06 [interleave_split_concat_branches]: 3.66999e-06 [interleave_parallel_branches]: 4.15999e-06 [overlap_opt_shard_in_pipeline]: 4.28001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.58999e-06 [control_data_broadcast_order]: 1.958e-05 [grouped_pairwise_exchange_alltoall]: 4.79e-06 [offloading_packed_experts]: 7.00998e-06 [overlap_recompute_and_grad_model_parallel]: 7.45003e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.56999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.8e-06 [overlap_recompute_comm]: 5.64e-06 [overlap_grad_ring_attention]: 6.96001e-06 [overlap_grad_flash_sp]: 2.588e-05 [begin_end_overlap_inline]: 3.03e-06 [split_matmul_comm_elemetwise]: 5.35999e-06 [split_layernorm_comm]: 4.13999e-06 [handle_group_info]: 3.51999e-06 [symbol_engine_optimizer]: 0.00011137, [1] [Cycle 1]: 0.00010364, [6] [build]: 4.03001e-06 [elim_shapecalc]: 1.341e-05 [elim_not_effective]: 1.445e-05 [opt_reshape]: 8.77999e-06 [fold_const_symbol]: 1.082e-05 [renormalize]: 2.20025e-07 [detach_backward]: 4.12e-06 [pipeline_parallel_scheduler]: 1.79e-06 [auto_monad_reorder]: 2.107e-05 [get_jit_bprop_graph]: 2.14999e-06 [rewriter_after_jit_bprop_graph]: 5.20001e-06 [opt_after_jit_grad]: 0.00075405 [validate]: 4.158e-05 Sums bootstrap : 0.000491s : 3.68% type_inference : 0.007092s : 53.17% event_method : 0.000020s : 0.15% auto_monad : 0.000066s : 0.49% graph_reusing : 0.000006s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000036s : 0.27% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000031s : 0.24% optimize.rewriter_before_opt_a : 0.000090s : 0.68% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000050s : 0.38% optimize.opt_a.loop_unroll : 0.000036s : 0.27% optimize.opt_a.a_1 : 0.000798s : 5.98% optimize.opt_a.with_stream_mark : 0.000033s : 0.25% optimize.opt_a.recompute_prepare : 0.000016s : 0.12% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000211s : 1.58% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.10% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.10% optimize.opt_a.merge_send_recv : 0.000017s : 0.12% optimize.opt_a.auto_parallel : 0.000014s : 0.11% optimize.opt_a.parallel : 0.000027s : 0.20% optimize.opt_a.flash_sp : 0.000014s : 0.10% optimize.opt_a.merge_comm : 0.000007s : 0.05% optimize.opt_a.allreduce_fusion : 0.000007s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.12% optimize.opt_a.virtual_dataset : 0.000014s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.09% optimize.opt_a.virtual_output : 0.000013s : 0.09% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.24% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.17% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.19% optimize.opt_a.a_after_grad : 0.000020s : 0.15% optimize.opt_a.renormalize : 0.000768s : 5.76% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.06% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.21% optimize.opt_a.cse : 0.000046s : 0.34% optimize.opt_a.a_3 : 0.000117s : 0.88% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000045s : 0.33% optimize.convert_after_rewriter : 0.000011s : 0.08% optimize.order_py_execute_after_rewriter : 0.000008s : 0.06% optimize.mutable_eliminate : 0.000729s : 5.46% optimize.opt_b.b_1 : 0.000170s : 1.28% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.17% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.18% optimize.overlap_param_gather : 0.000006s : 0.04% optimize.cconv : 0.000031s : 0.23% optimize.loop_unroll : 0.000522s : 3.91% optimize.opt_after_cconv.c_1 : 0.000049s : 0.37% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000011s : 0.08% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000033s : 0.25% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.14% optimize.tuple_transform.d_1 : 0.000054s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000064s : 0.48% optimize.cse_after_recomputation.cse : 0.000013s : 0.10% optimize.environ_conv : 0.000009s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.06% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000010s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.03% optimize.control_data_broadcast_order : 0.000020s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000006s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000026s : 0.19% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.16% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000754s : 5.65% validate : 0.000042s : 0.31% Time group info: ------[substitution.] 0.000193 28 1.20% : 0.000002s : 2: substitution.elim_not_effective 0.68% : 0.000001s : 2: substitution.fold_const_symbol 3.64% : 0.000007s : 4: substitution.graph_param_transform 78.42% : 0.000152s : 4: substitution.inline 2.41% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.59% : 0.000005s : 4: substitution.remove_not_recompute_node 2.94% : 0.000006s : 4: substitution.replace_old_param 8.12% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007034 2 88.91% : 0.006254s : 1: type_inference.infer 11.09% : 0.000780s : 1: type_inference.specialize ------[replace.] 0.000066 8 60.77% : 0.000040s : 4: replace.inline 39.23% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000163 8 91.49% : 0.000149s : 4: match.inline 8.51% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000223 1278 0.90% : 0.000002s : 13: predicate.accumulaten_eliminater 0.92% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 8: predicate.addn_check_dump 0.84% : 0.000002s : 13: predicate.addn_zero_filter 0.83% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.06% : 0.000005s : 21: predicate.arithmetic_simplify 1.19% : 0.000003s : 13: predicate.cast_eliminate 0.59% : 0.000001s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.61% : 0.000001s : 8: predicate.depend_value_elim 0.85% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.97% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.79% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.82% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 4: predicate.elim_not_effective 0.78% : 0.000002s : 4: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.02% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.98% : 0.000002s : 17: predicate.environ_get_depend_swap 1.56% : 0.000003s : 25: predicate.environ_get_eliminate 0.94% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.30% : 0.000005s : 21: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.81% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.61% : 0.000001s : 8: predicate.get_grad_eliminate 0.28% : 0.000001s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.56% : 0.000015s : 58: predicate.inline 0.76% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.83% : 0.000002s : 8: predicate.less_batch_normalization 2.07% : 0.000005s : 25: predicate.list_to_tuple_eliminator_ 2.62% : 0.000006s : 38: predicate.load_eliminater 0.89% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.19% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.79% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.61% : 0.000001s : 8: predicate.merge_addn 0.61% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.73% : 0.000002s : 13: predicate.minmaximum_grad 1.13% : 0.000003s : 4: predicate.mutable_eliminate 0.39% : 0.000001s : 4: predicate.opt_reshape 0.33% : 0.000001s : 4: predicate.parallel_virtual_node 1.99% : 0.000004s : 21: predicate.partial_defer_inline 1.49% : 0.000003s : 21: predicate.partial_eliminate 0.84% : 0.000002s : 13: predicate.print_const_string_wrapper 0.55% : 0.000001s : 8: predicate.reduce_all_const_elim 1.31% : 0.000003s : 13: predicate.reduce_eliminate 2.36% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.55% : 0.000001s : 8: predicate.remove_not_recompute_node 1.82% : 0.000004s : 25: predicate.replace_applicator 0.49% : 0.000001s : 8: predicate.replace_old_param 0.36% : 0.000001s : 4: predicate.reset_defer_inline 1.05% : 0.000002s : 13: predicate.reshape_eliminate 0.62% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.54% : 0.000001s : 4: predicate.row_tensor_eliminate 0.72% : 0.000002s : 8: predicate.same_eliminate 0.43% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 8: predicate.shard_identity_eliminate 0.70% : 0.000002s : 8: predicate.special_op_eliminate 0.90% : 0.000002s : 8: predicate.specialize_transform 0.96% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.41% : 0.000003s : 21: predicate.switch_defer_inline 2.07% : 0.000005s : 29: predicate.switch_layer_defer_inline 4.97% : 0.000011s : 67: predicate.switch_simplify 0.88% : 0.000002s : 13: predicate.tile_eliminate 0.89% : 0.000002s : 13: predicate.transpose_eliminate 1.55% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.32% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.59% : 0.000004s : 21: predicate.tuple_list_get_set_item_eliminator 2.34% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.78% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.31% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.30% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.48% : 0.000001s : 4: predicate.value_based_eliminate 0.69% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.23% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000625 11 54.56% : 0.000341s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.44% : 0.000284s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.032678 192 0.02% : 0.000007s : 1: ForceFp32Comm 10.76% : 0.003516s : 1: add_attr 10.71% : 0.003499s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.21% : 0.000068s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.23% : 0.000075s : 1: auto_monad 0.09% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000007s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.65% : 0.000538s : 1: bootstrap 0.10% : 0.000034s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000023s : 1: control_data_broadcast_order 0.04% : 0.000014s : 1: convert_after_rewriter 0.11% : 0.000035s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000021s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.10% : 0.000032s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000007s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.04% : 0.000012s : 1: label_micro_interleaved_index 1.62% : 0.000529s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 2.25% : 0.000736s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.05% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000019s : 1: opt.transform.mutable_eliminate 3.78% : 0.001234s : 78: opt.transform.opt_a 0.14% : 0.000044s : 1: opt.transform.opt_after_cconv 0.09% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.32% : 0.000105s : 28: opt.transform.opt_b 0.18% : 0.000060s : 2: opt.transform.opt_trans_graph 0.13% : 0.000044s : 4: opt.transform.symbol_engine_opt 9.95% : 0.003252s : 1: opt_a 4.06% : 0.001328s : 1: opt_after_cconv 2.35% : 0.000767s : 1: opt_after_jit_grad 0.90% : 0.000294s : 1: opt_b 23.00% : 0.007515s : 1: optimize 0.09% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000009s : 1: pipeline_split 0.13% : 0.000043s : 1: pre_auto_parallel 0.11% : 0.000035s : 1: py_interpret_to_execute 0.07% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000022s : 1: remove_dup_value 1.21% : 0.000395s : 1: renormalize.infer 1.11% : 0.000363s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000048s : 1: rewriter_after_opt_a 0.29% : 0.000094s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000009s : 1: slice_recompute_activation 0.03% : 0.000009s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.03% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000114s : 1: symbol_engine_optimizer 0.35% : 0.000113s : 1: tuple_transform 21.85% : 0.007140s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:43.209.783 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0154913, [21] [bootstrap]: 0.00047 [type_inference]: 0.00639985 [event_method]: 1.865e-05 [auto_monad]: 6.301e-05 [graph_reusing]: 6.88e-06 [inline]: 2.30002e-06 [add_attr]: 0.00323318, [1] [add_attr_with_inline]: 0.00322385, [1] [Cycle 1]: 6.043e-05, [2] [tag_attr]: 1.942e-05 [meta_addattr_fg_expand]: 5.60001e-06 [parallel-infer-symbol]: 3.43999e-06 [pre_auto_parallel]: 3.139e-05 [insert-virtual-dataset]: 2.39001e-06 [parallel-infer-symbol-second]: 9.5999e-07 [dataset_repeat_opt]: 1.74e-06 [pipeline_split]: 1.84e-06 [optimize]: 0.00457045, [53] [py_interpret_to_execute]: 2.576e-05 [rewriter_before_opt_a]: 7.99e-05 [opt_a]: 0.00256811, [2] [Cycle 1]: 0.00190439, [45] [expand_dump_flag]: 2.93998e-06 [switch_simplify]: 4.2e-05 [loop_unroll]: 2.957e-05 [a_1]: 0.00061554 [with_stream_mark]: 1.57e-05 [recompute_prepare]: 9.00001e-06 [updatestate_depend_eliminate]: 3.88001e-06 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 3.02002e-06 [parameter_eliminate]: 1.82001e-06 [a_2]: 8.188e-05 [accelerated_algorithm]: 7.21999e-06 [shard]: 1.94e-06 [meta_shard_fg_expand]: 1.74e-06 [shard_inline]: 6.39001e-06 [merge_send_recv]: 8.17e-06 [auto_parallel]: 5.91e-06 [parallel]: 1.867e-05 [flash_sp]: 7.38999e-06 [merge_comm]: 3.90998e-06 [allreduce_fusion]: 3.54002e-06 [matmul_add_comm_reduction]: 9.34e-06 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 7.83001e-06 [virtual_dataset]: 6.44999e-06 [get_grad_eliminate_]: 6.17999e-06 [virtual_output]: 6.65002e-06 [merge_forward]: 4.15e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 1.008e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.213e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 1.044e-05 [set_forward_comm_id_for_comm_node_pass]: 3.63999e-06 [meta_fg_expand]: 2.83e-06 [flash_sp_send_recv_attached]: 2.58e-06 [receive_attached]: 2.44001e-06 [after_resolve]: 1.184e-05 [a_after_grad]: 9.80002e-06 [renormalize]: 0.00057967 [add_forward_monad_depend]: 4.99e-06 [auto_monad_grad]: 2.53e-06 [auto_monad_eliminator]: 1.435e-05 [cse]: 2.833e-05 [a_3]: 4.864e-05 [Cycle 2]: 0.0006533, [45] [expand_dump_flag]: 1.12999e-06 [switch_simplify]: 7.64002e-06 [loop_unroll]: 6.14001e-06 [a_1]: 0.0001293 [with_stream_mark]: 1.198e-05 [recompute_prepare]: 6.23e-06 [updatestate_depend_eliminate]: 2.84999e-06 [updatestate_assign_eliminate]: 2.29001e-06 [updatestate_loads_eliminate]: 2.54999e-06 [parameter_eliminate]: 1.18001e-06 [a_2]: 7.229e-05 [accelerated_algorithm]: 6.20002e-06 [shard]: 1.25999e-06 [meta_shard_fg_expand]: 1.30999e-06 [shard_inline]: 6.06998e-06 [merge_send_recv]: 5.55001e-06 [auto_parallel]: 5.54e-06 [parallel]: 4.70001e-06 [flash_sp]: 3.62998e-06 [merge_comm]: 3.46999e-06 [allreduce_fusion]: 3.31999e-06 [matmul_add_comm_reduction]: 6.38e-06 [allreduce_slice_to_reducescatter]: 3.19997e-07 [virtual_shard_identity]: 7.34002e-06 [virtual_dataset]: 5.87001e-06 [get_grad_eliminate_]: 5.73997e-06 [virtual_output]: 5.81998e-06 [merge_forward]: 2.61e-06 [cell_reuse_recompute_pass]: 1.67999e-06 [offload_activation]: 6.71e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.339e-05 [merge_recompute_call_nodes]: 8.2e-07 [before_grad]: 9.54999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.3e-06 [meta_fg_expand]: 2.36e-06 [flash_sp_send_recv_attached]: 8.89995e-07 [receive_attached]: 1.40999e-06 [after_resolve]: 1.143e-05 [a_after_grad]: 9.71e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.74e-06 [auto_monad_grad]: 1.02e-06 [auto_monad_eliminator]: 7.16999e-06 [cse]: 1.393e-05 [a_3]: 3.596e-05 [py_interpret_to_execute_after_opt_a]: 9.27999e-06 [slice_cell_reuse_recomputed_activation]: 1.93002e-06 [rewriter_after_opt_a]: 3.374e-05 [convert_after_rewriter]: 6.56e-06 [order_py_execute_after_rewriter]: 5.20999e-06 [mutable_eliminate]: 0.00051243 [opt_b]: 0.00021571, [1] [Cycle 1]: 0.00020959, [7] [b_1]: 0.00013588 [b_2]: 8.11002e-06 [updatestate_depend_eliminate]: 6.11e-06 [updatestate_assign_eliminate]: 2.31e-06 [updatestate_loads_eliminate]: 2.32999e-06 [renormalize]: 3.4002e-07 [cse]: 1.758e-05 [optimize_parallel_all_gather_comm]: 1.635e-05 [overlap_param_gather]: 2.01003e-06 [cconv]: 2.448e-05 [loop_unroll]: 0.0004262 [opt_after_cconv]: 0.00010006, [1] [Cycle 1]: 9.421e-05, [7] [c_1]: 3.013e-05 [parameter_eliminate]: 2.89999e-06 [updatestate_depend_eliminate]: 5.42001e-06 [updatestate_assign_eliminate]: 2.37999e-06 [updatestate_loads_eliminate]: 2.37001e-06 [cse]: 1.687e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.335e-05 [tuple_transform]: 7.552e-05, [1] [Cycle 1]: 7.105e-05, [4] [d_1]: 4.432e-05 [none_parameter_eliminate]: 1.55001e-06 [renormalize]: 2.60014e-07 [switch_simplify]: 7e-06 [partial_unused_args_eliminate]: 1.71998e-06 [add_recomputation]: 4.644e-05 [cse_after_recomputation]: 2.097e-05, [1] [Cycle 1]: 1.621e-05, [1] [cse]: 1.066e-05 [environ_conv]: 4.82e-06 [swap_dp_allreduce_reducescatter]: 5.13002e-06 [bias_add_comm_swap]: 2.41e-06 [label_micro_interleaved_index]: 4.43001e-06 [label_fine_grained_interleaved_index]: 2.54001e-06 [merge_cast_opt]: 1.26002e-06 [slice_recompute_activation]: 2.01998e-06 [micro_interleaved_order_control]: 2.37999e-06 [assign_add_opt]: 1.14e-06 [ForceFp32Comm]: 1.19e-06 [remove_cast_before_assign_add]: 1.27e-06 [full_micro_interleaved_order_control]: 2.02001e-06 [reorder_send_recv_between_fp_bp]: 3.00002e-06 [comm_op_add_attrs]: 1.05001e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.12e-06 [overlap_opt_shard_in_pipeline]: 1.31002e-06 [overlap_opt_shard_grad_in_pipeline]: 2.02999e-06 [control_data_broadcast_order]: 1.27e-05 [grouped_pairwise_exchange_alltoall]: 2.00002e-06 [offloading_packed_experts]: 3.61001e-06 [overlap_recompute_and_grad_model_parallel]: 4.38001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.23002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34998e-06 [overlap_recompute_comm]: 2.24999e-06 [overlap_grad_ring_attention]: 4.12998e-06 [overlap_grad_flash_sp]: 1.874e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.51e-06 [split_layernorm_comm]: 1.59998e-06 [handle_group_info]: 1.08001e-06 [symbol_engine_optimizer]: 7.347e-05, [1] [Cycle 1]: 6.884e-05, [6] [build]: 2.76999e-06 [elim_shapecalc]: 9.33002e-06 [elim_not_effective]: 1.212e-05 [opt_reshape]: 6.83e-06 [fold_const_symbol]: 1.009e-05 [renormalize]: 1.80007e-07 [detach_backward]: 1.94e-06 [pipeline_parallel_scheduler]: 1.79998e-06 [auto_monad_reorder]: 1.687e-05 [get_jit_bprop_graph]: 1.64998e-06 [rewriter_after_jit_bprop_graph]: 4.12e-06 [opt_after_jit_grad]: 0.00046413 [validate]: 3.792e-05 Sums bootstrap : 0.000470s : 4.16% type_inference : 0.006400s : 56.64% event_method : 0.000019s : 0.17% auto_monad : 0.000063s : 0.56% graph_reusing : 0.000007s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000031s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000026s : 0.23% optimize.rewriter_before_opt_a : 0.000080s : 0.71% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000050s : 0.44% optimize.opt_a.loop_unroll : 0.000036s : 0.32% optimize.opt_a.a_1 : 0.000745s : 6.59% optimize.opt_a.with_stream_mark : 0.000028s : 0.24% optimize.opt_a.recompute_prepare : 0.000015s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000154s : 1.36% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.12% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.11% optimize.opt_a.merge_send_recv : 0.000014s : 0.12% optimize.opt_a.auto_parallel : 0.000011s : 0.10% optimize.opt_a.parallel : 0.000023s : 0.21% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.13% optimize.opt_a.virtual_dataset : 0.000012s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.23% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000023s : 0.21% optimize.opt_a.a_after_grad : 0.000020s : 0.17% optimize.opt_a.renormalize : 0.000580s : 5.13% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.19% optimize.opt_a.cse : 0.000042s : 0.37% optimize.opt_a.a_3 : 0.000085s : 0.75% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000034s : 0.30% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000512s : 4.54% optimize.opt_b.b_1 : 0.000136s : 1.20% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.14% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.22% optimize.loop_unroll : 0.000426s : 3.77% optimize.opt_after_cconv.c_1 : 0.000030s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.15% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.12% optimize.tuple_transform.d_1 : 0.000044s : 0.39% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000046s : 0.41% optimize.cse_after_recomputation.cse : 0.000011s : 0.09% optimize.environ_conv : 0.000005s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000019s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000017s : 0.15% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000464s : 4.11% validate : 0.000038s : 0.34% Time group info: ------[substitution.] 0.000177 28 0.96% : 0.000002s : 2: substitution.elim_not_effective 0.76% : 0.000001s : 2: substitution.fold_const_symbol 3.42% : 0.000006s : 4: substitution.graph_param_transform 78.89% : 0.000140s : 4: substitution.inline 2.23% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.92% : 0.000005s : 4: substitution.remove_not_recompute_node 2.47% : 0.000004s : 4: substitution.replace_old_param 8.35% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006335 2 88.85% : 0.005629s : 1: type_inference.infer 11.15% : 0.000706s : 1: type_inference.specialize ------[replace.] 0.000060 8 62.09% : 0.000037s : 4: replace.inline 37.91% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000150 8 91.46% : 0.000137s : 4: match.inline 8.54% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000200 1278 1.00% : 0.000002s : 13: predicate.accumulaten_eliminater 0.77% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 0.92% : 0.000002s : 13: predicate.addn_zero_filter 0.81% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.02% : 0.000004s : 21: predicate.arithmetic_simplify 0.92% : 0.000002s : 13: predicate.cast_eliminate 0.66% : 0.000001s : 8: predicate.check_bprop_eliminate 0.54% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000000s : 4: predicate.const_output_eliminate 0.63% : 0.000001s : 8: predicate.depend_value_elim 0.96% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.97% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.34% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_depend_swap 1.67% : 0.000003s : 25: predicate.environ_get_eliminate 1.16% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.51% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.47% : 0.000005s : 21: predicate.float_depend_g_call 0.57% : 0.000001s : 8: predicate.float_environ_get_switch 0.78% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.64% : 0.000001s : 8: predicate.get_grad_eliminate 0.23% : 0.000000s : 4: predicate.graph_param_transform 0.59% : 0.000001s : 8: predicate.incorporate_call 0.49% : 0.000001s : 8: predicate.incorporate_call_switch 6.31% : 0.000013s : 58: predicate.inline 0.77% : 0.000002s : 8: predicate.inline_without_move 0.35% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.87% : 0.000002s : 8: predicate.less_batch_normalization 1.96% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.57% : 0.000005s : 38: predicate.load_eliminater 0.86% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.40% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.57% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 8: predicate.merge_addn 0.63% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 13: predicate.minmaximum_grad 0.94% : 0.000002s : 4: predicate.mutable_eliminate 0.37% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 1.77% : 0.000004s : 21: predicate.partial_defer_inline 1.69% : 0.000003s : 21: predicate.partial_eliminate 0.92% : 0.000002s : 13: predicate.print_const_string_wrapper 0.56% : 0.000001s : 8: predicate.reduce_all_const_elim 1.27% : 0.000003s : 13: predicate.reduce_eliminate 2.49% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 8: predicate.remove_not_recompute_node 1.41% : 0.000003s : 25: predicate.replace_applicator 0.66% : 0.000001s : 8: predicate.replace_old_param 0.29% : 0.000001s : 4: predicate.reset_defer_inline 0.90% : 0.000002s : 13: predicate.reshape_eliminate 0.66% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.74% : 0.000001s : 8: predicate.same_eliminate 0.48% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 8: predicate.shard_identity_eliminate 0.71% : 0.000001s : 8: predicate.special_op_eliminate 0.70% : 0.000001s : 8: predicate.specialize_transform 0.79% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.62% : 0.000003s : 21: predicate.switch_defer_inline 2.17% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.25% : 0.000010s : 67: predicate.switch_simplify 0.96% : 0.000002s : 13: predicate.tile_eliminate 0.86% : 0.000002s : 13: predicate.transpose_eliminate 1.49% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.52% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.55% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.27% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.70% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.46% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.22% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 4: predicate.value_based_eliminate 0.61% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.64% : 0.000001s : 8: predicate.virtual_output_eliminate 0.31% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000536 11 58.49% : 0.000313s : 5: func_graph_cloner_run.FuncGraphClonerGraph 41.51% : 0.000223s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025186 192 0.02% : 0.000004s : 1: ForceFp32Comm 12.86% : 0.003238s : 1: add_attr 12.82% : 0.003228s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.20% : 0.000050s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.27% : 0.000069s : 1: auto_monad 0.08% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.99% : 0.000501s : 1: bootstrap 0.11% : 0.000028s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.09% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.72% : 0.000434s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.07% : 0.000521s : 1: mutable_eliminate 0.03% : 0.000006s : 1: offloading_packed_experts 0.05% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 4.61% : 0.001161s : 78: opt.transform.opt_a 0.11% : 0.000029s : 1: opt.transform.opt_after_cconv 0.10% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.45% : 0.000113s : 28: opt.transform.opt_b 0.19% : 0.000049s : 2: opt.transform.opt_trans_graph 0.14% : 0.000035s : 4: opt.transform.symbol_engine_opt 10.21% : 0.002571s : 1: opt_a 0.41% : 0.000104s : 1: opt_after_cconv 1.88% : 0.000473s : 1: opt_after_jit_grad 0.87% : 0.000219s : 1: opt_b 18.17% : 0.004576s : 1: optimize 0.08% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000022s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.14% : 0.000035s : 1: pre_auto_parallel 0.12% : 0.000030s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000017s : 1: remove_dup_value 1.22% : 0.000308s : 1: renormalize.infer 1.05% : 0.000264s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000037s : 1: rewriter_after_opt_a 0.33% : 0.000084s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.30% : 0.000076s : 1: symbol_engine_optimizer 0.31% : 0.000078s : 1: tuple_transform 25.48% : 0.006419s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:43.595.461 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:43.595.738 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0336625, [21] [bootstrap]: 0.00039589 [type_inference]: 0.0224373 [event_method]: 2.022e-05 [auto_monad]: 6.157e-05 [graph_reusing]: 6.59001e-06 [inline]: 2.48998e-06 [add_attr]: 0.00329003, [1] [add_attr_with_inline]: 0.00327964, [1] [Cycle 1]: 7.718e-05, [2] [tag_attr]: 2.202e-05 [meta_addattr_fg_expand]: 6.22001e-06 [parallel-infer-symbol]: 3.71001e-06 [pre_auto_parallel]: 3.566e-05 [insert-virtual-dataset]: 2.74999e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 1.87001e-06 [pipeline_split]: 1.72001e-06 [optimize]: 0.00611989, [53] [py_interpret_to_execute]: 3.147e-05 [rewriter_before_opt_a]: 9.555e-05 [opt_a]: 0.00356581, [2] [Cycle 1]: 0.00254756, [45] [expand_dump_flag]: 3.31001e-06 [switch_simplify]: 4.873e-05 [loop_unroll]: 3.482e-05 [a_1]: 0.00079773 [with_stream_mark]: 1.721e-05 [recompute_prepare]: 1.051e-05 [updatestate_depend_eliminate]: 4.97e-06 [updatestate_assign_eliminate]: 4.2e-06 [updatestate_loads_eliminate]: 3.75e-06 [parameter_eliminate]: 1.87999e-06 [a_2]: 0.00014105 [accelerated_algorithm]: 9.49999e-06 [shard]: 1.81e-06 [meta_shard_fg_expand]: 2.37999e-06 [shard_inline]: 8.22998e-06 [merge_send_recv]: 9.89999e-06 [auto_parallel]: 7.21001e-06 [parallel]: 1.966e-05 [flash_sp]: 8.48999e-06 [merge_comm]: 5.27999e-06 [allreduce_fusion]: 4.94e-06 [matmul_add_comm_reduction]: 1.087e-05 [allreduce_slice_to_reducescatter]: 6.79982e-07 [virtual_shard_identity]: 9.91e-06 [virtual_dataset]: 8.72e-06 [get_grad_eliminate_]: 7.27002e-06 [virtual_output]: 8.23001e-06 [merge_forward]: 4.92e-06 [cell_reuse_recompute_pass]: 1.64e-06 [offload_activation]: 1.215e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.895e-05 [merge_recompute_call_nodes]: 2.14e-06 [before_grad]: 1.431e-05 [set_forward_comm_id_for_comm_node_pass]: 4.65001e-06 [meta_fg_expand]: 4.18001e-06 [flash_sp_send_recv_attached]: 2.68e-06 [receive_attached]: 2.34001e-06 [after_resolve]: 1.444e-05 [a_after_grad]: 1.341e-05 [renormalize]: 0.00072262 [add_forward_monad_depend]: 5.32999e-06 [auto_monad_grad]: 2.36998e-06 [auto_monad_eliminator]: 1.804e-05 [cse]: 3.678e-05 [a_3]: 7.255e-05 [Cycle 2]: 0.00100437, [45] [expand_dump_flag]: 1.57999e-06 [switch_simplify]: 9.42001e-06 [loop_unroll]: 7.97003e-06 [a_1]: 0.00018563 [with_stream_mark]: 1.199e-05 [recompute_prepare]: 7.83999e-06 [updatestate_depend_eliminate]: 4.69998e-06 [updatestate_assign_eliminate]: 3.36999e-06 [updatestate_loads_eliminate]: 2.89999e-06 [parameter_eliminate]: 1.00001e-06 [a_2]: 0.00015818 [accelerated_algorithm]: 8.60001e-06 [shard]: 1.47001e-06 [meta_shard_fg_expand]: 1.87001e-06 [shard_inline]: 8.50999e-06 [merge_send_recv]: 6.07999e-06 [auto_parallel]: 7.06999e-06 [parallel]: 5.44998e-06 [flash_sp]: 3.4e-06 [merge_comm]: 4.41002e-06 [allreduce_fusion]: 4.18999e-06 [matmul_add_comm_reduction]: 7.73999e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 9.69e-06 [virtual_dataset]: 7.97e-06 [get_grad_eliminate_]: 7.21001e-06 [virtual_output]: 6.86001e-06 [merge_forward]: 3.55e-06 [cell_reuse_recompute_pass]: 1.45999e-06 [offload_activation]: 7.85998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.701e-05 [merge_recompute_call_nodes]: 8.99978e-07 [before_grad]: 1.232e-05 [set_forward_comm_id_for_comm_node_pass]: 4.02998e-06 [meta_fg_expand]: 2.71999e-06 [flash_sp_send_recv_attached]: 8.2e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 1.144e-05 [a_after_grad]: 1.164e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.82001e-06 [auto_monad_grad]: 1.24e-06 [auto_monad_eliminator]: 9.59999e-06 [cse]: 2.283e-05 [a_3]: 6.113e-05 [py_interpret_to_execute_after_opt_a]: 1.465e-05 [slice_cell_reuse_recomputed_activation]: 4.98001e-06 [rewriter_after_opt_a]: 4.555e-05 [convert_after_rewriter]: 1.073e-05 [order_py_execute_after_rewriter]: 1.017e-05 [mutable_eliminate]: 0.00055324 [opt_b]: 0.00033272, [1] [Cycle 1]: 0.0003231, [7] [b_1]: 0.00021024 [b_2]: 9.47999e-06 [updatestate_depend_eliminate]: 8.65001e-06 [updatestate_assign_eliminate]: 3.36999e-06 [updatestate_loads_eliminate]: 3.41999e-06 [renormalize]: 4.50003e-07 [cse]: 2.845e-05 [optimize_parallel_all_gather_comm]: 2.177e-05 [overlap_param_gather]: 5.29998e-06 [cconv]: 3.437e-05 [loop_unroll]: 0.00047564 [opt_after_cconv]: 0.00015312, [1] [Cycle 1]: 0.00014409, [7] [c_1]: 4.164e-05 [parameter_eliminate]: 4.71002e-06 [updatestate_depend_eliminate]: 6.86001e-06 [updatestate_assign_eliminate]: 3.36001e-06 [updatestate_loads_eliminate]: 3.55e-06 [cse]: 2.407e-05 [renormalize]: 3.59985e-07 [remove_dup_value]: 1.897e-05 [tuple_transform]: 0.00010938, [1] [Cycle 1]: 0.00010095, [4] [d_1]: 5.861e-05 [none_parameter_eliminate]: 1.79e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 9.41e-06 [partial_unused_args_eliminate]: 4.79998e-06 [add_recomputation]: 6.285e-05 [cse_after_recomputation]: 3.574e-05, [1] [Cycle 1]: 2.79e-05, [1] [cse]: 1.768e-05 [environ_conv]: 9.67001e-06 [swap_dp_allreduce_reducescatter]: 9.08002e-06 [bias_add_comm_swap]: 5.61003e-06 [label_micro_interleaved_index]: 7.82e-06 [label_fine_grained_interleaved_index]: 5.83002e-06 [merge_cast_opt]: 4.41002e-06 [slice_recompute_activation]: 4.85001e-06 [micro_interleaved_order_control]: 4.60001e-06 [assign_add_opt]: 3.87998e-06 [ForceFp32Comm]: 3.83999e-06 [remove_cast_before_assign_add]: 3.95998e-06 [full_micro_interleaved_order_control]: 5.07e-06 [reorder_send_recv_between_fp_bp]: 5.54e-06 [comm_op_add_attrs]: 3.91001e-06 [add_comm_op_reuse_tag]: 3.48e-06 [interleave_split_concat_branches]: 3.71999e-06 [interleave_parallel_branches]: 3.5e-06 [overlap_opt_shard_in_pipeline]: 3.82002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.67e-06 [control_data_broadcast_order]: 2.466e-05 [grouped_pairwise_exchange_alltoall]: 3.97e-06 [offloading_packed_experts]: 7.23999e-06 [overlap_recompute_and_grad_model_parallel]: 7.67998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.93001e-06 [overlap_recompute_allgather_and_fa_grad]: 4.15e-06 [overlap_recompute_comm]: 4.95999e-06 [overlap_grad_ring_attention]: 7.37997e-06 [overlap_grad_flash_sp]: 2.588e-05 [begin_end_overlap_inline]: 2.93e-06 [split_matmul_comm_elemetwise]: 4.98001e-06 [split_layernorm_comm]: 4.70999e-06 [handle_group_info]: 3.51999e-06 [symbol_engine_optimizer]: 0.00010871, [1] [Cycle 1]: 0.00010174, [6] [build]: 3.72002e-06 [elim_shapecalc]: 1.317e-05 [elim_not_effective]: 1.61e-05 [opt_reshape]: 9.10001e-06 [fold_const_symbol]: 1.307e-05 [renormalize]: 2.19996e-07 [detach_backward]: 4.53001e-06 [pipeline_parallel_scheduler]: 1.76998e-06 [auto_monad_reorder]: 2.478e-05 [get_jit_bprop_graph]: 1.69e-06 [rewriter_after_jit_bprop_graph]: 5.24e-06 [opt_after_jit_grad]: 0.00056981 [validate]: 4.925e-05 Sums bootstrap : 0.000396s : 1.39% type_inference : 0.022437s : 78.61% event_method : 0.000020s : 0.07% auto_monad : 0.000062s : 0.22% graph_reusing : 0.000007s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000036s : 0.12% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.11% optimize.rewriter_before_opt_a : 0.000096s : 0.33% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000058s : 0.20% optimize.opt_a.loop_unroll : 0.000043s : 0.15% optimize.opt_a.a_1 : 0.000983s : 3.45% optimize.opt_a.with_stream_mark : 0.000029s : 0.10% optimize.opt_a.recompute_prepare : 0.000018s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000299s : 1.05% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.06% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000017s : 0.06% optimize.opt_a.merge_send_recv : 0.000016s : 0.06% optimize.opt_a.auto_parallel : 0.000014s : 0.05% optimize.opt_a.parallel : 0.000025s : 0.09% optimize.opt_a.flash_sp : 0.000012s : 0.04% optimize.opt_a.merge_comm : 0.000010s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.07% optimize.opt_a.virtual_dataset : 0.000017s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.05% optimize.opt_a.virtual_output : 0.000015s : 0.05% optimize.opt_a.merge_forward : 0.000008s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000020s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.13% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000026s : 0.09% optimize.opt_a.a_after_grad : 0.000025s : 0.09% optimize.opt_a.renormalize : 0.000723s : 2.53% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.10% optimize.opt_a.cse : 0.000060s : 0.21% optimize.opt_a.a_3 : 0.000134s : 0.47% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000046s : 0.16% optimize.convert_after_rewriter : 0.000011s : 0.04% optimize.order_py_execute_after_rewriter : 0.000010s : 0.04% optimize.mutable_eliminate : 0.000553s : 1.94% optimize.opt_b.b_1 : 0.000210s : 0.74% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000028s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.08% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000034s : 0.12% optimize.loop_unroll : 0.000476s : 1.67% optimize.opt_after_cconv.c_1 : 0.000042s : 0.15% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000024s : 0.08% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.07% optimize.tuple_transform.d_1 : 0.000059s : 0.21% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000063s : 0.22% optimize.cse_after_recomputation.cse : 0.000018s : 0.06% optimize.environ_conv : 0.000010s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000008s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.02% optimize.control_data_broadcast_order : 0.000025s : 0.09% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.03% optimize.overlap_grad_flash_sp : 0.000026s : 0.09% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000005s : 0.02% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000025s : 0.09% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000570s : 2.00% validate : 0.000049s : 0.17% Time group info: ------[substitution.] 0.000212 38 10.39% : 0.000022s : 3: substitution.cast_eliminate 1.06% : 0.000002s : 3: substitution.elim_not_effective 0.97% : 0.000002s : 3: substitution.fold_const_symbol 3.35% : 0.000007s : 5: substitution.graph_param_transform 69.27% : 0.000147s : 4: substitution.inline 2.21% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.99% : 0.000006s : 6: substitution.remove_not_recompute_node 2.26% : 0.000005s : 4: substitution.replace_old_param 7.51% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.022382 2 96.83% : 0.021673s : 1: type_inference.infer 3.17% : 0.000710s : 1: type_inference.specialize ------[replace.] 0.000068 8 61.24% : 0.000041s : 4: replace.inline 38.76% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000158 8 91.26% : 0.000144s : 4: match.inline 8.74% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000275 1596 0.97% : 0.000003s : 17: predicate.accumulaten_eliminater 0.77% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.57% : 0.000002s : 10: predicate.addn_check_dump 1.00% : 0.000003s : 17: predicate.addn_zero_filter 0.86% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.08% : 0.000006s : 27: predicate.arithmetic_simplify 1.10% : 0.000003s : 17: predicate.cast_eliminate 0.57% : 0.000002s : 10: predicate.check_bprop_eliminate 0.58% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.62% : 0.000002s : 10: predicate.depend_value_elim 1.00% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.11% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.95% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.97% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 5: predicate.elim_not_effective 0.38% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.34% : 0.000004s : 22: predicate.environ_get_depend_swap 1.89% : 0.000005s : 32: predicate.environ_get_eliminate 1.17% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.43% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.29% : 0.000006s : 25: predicate.float_depend_g_call 0.53% : 0.000001s : 10: predicate.float_environ_get_switch 0.87% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.58% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.63% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.28% : 0.000017s : 72: predicate.inline 0.76% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.89% : 0.000002s : 10: predicate.less_batch_normalization 1.76% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.66% : 0.000007s : 48: predicate.load_eliminater 0.92% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.01% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.63% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 10: predicate.merge_addn 0.55% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.51% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.91% : 0.000002s : 17: predicate.minmaximum_grad 1.07% : 0.000003s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.35% : 0.000001s : 5: predicate.parallel_virtual_node 1.81% : 0.000005s : 25: predicate.partial_defer_inline 1.65% : 0.000005s : 26: predicate.partial_eliminate 0.93% : 0.000003s : 17: predicate.print_const_string_wrapper 0.59% : 0.000002s : 10: predicate.reduce_all_const_elim 1.18% : 0.000003s : 17: predicate.reduce_eliminate 2.53% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 10: predicate.remove_not_recompute_node 1.30% : 0.000004s : 31: predicate.replace_applicator 0.44% : 0.000001s : 10: predicate.replace_old_param 0.32% : 0.000001s : 5: predicate.reset_defer_inline 0.98% : 0.000003s : 17: predicate.reshape_eliminate 0.60% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 5: predicate.row_tensor_eliminate 0.82% : 0.000002s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 10: predicate.shard_identity_eliminate 0.80% : 0.000002s : 10: predicate.special_op_eliminate 0.88% : 0.000002s : 10: predicate.specialize_transform 0.76% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.56% : 0.000004s : 25: predicate.switch_defer_inline 2.08% : 0.000006s : 35: predicate.switch_layer_defer_inline 5.00% : 0.000014s : 76: predicate.switch_simplify 0.90% : 0.000002s : 17: predicate.tile_eliminate 1.02% : 0.000003s : 17: predicate.transpose_eliminate 1.51% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.62% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.59% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.15% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.72% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.50% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.21% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 5: predicate.value_based_eliminate 0.66% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.61% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000563 11 56.27% : 0.000317s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.73% : 0.000246s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.045561 192 0.02% : 0.000007s : 1: ForceFp32Comm 7.24% : 0.003300s : 1: add_attr 7.21% : 0.003283s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.15% : 0.000067s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.16% : 0.000072s : 1: auto_monad 0.07% : 0.000033s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 0.96% : 0.000436s : 1: bootstrap 0.08% : 0.000038s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.06% : 0.000028s : 1: control_data_broadcast_order 0.03% : 0.000014s : 1: convert_after_rewriter 0.09% : 0.000039s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.05% : 0.000023s : 1: detach_backward 0.03% : 0.000013s : 1: environ_conv 0.07% : 0.000031s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.03% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000010s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000009s : 1: label_fine_grained_interleaved_index 0.02% : 0.000011s : 1: label_micro_interleaved_index 1.06% : 0.000482s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 1.23% : 0.000561s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.04% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000019s : 1: opt.transform.mutable_eliminate 3.37% : 0.001535s : 78: opt.transform.opt_a 0.09% : 0.000040s : 1: opt.transform.opt_after_cconv 0.10% : 0.000047s : 1: opt.transform.opt_after_jit_grad 0.31% : 0.000142s : 28: opt.transform.opt_b 0.14% : 0.000065s : 2: opt.transform.opt_trans_graph 0.10% : 0.000047s : 4: opt.transform.symbol_engine_opt 7.83% : 0.003569s : 1: opt_a 0.34% : 0.000157s : 1: opt_after_cconv 1.27% : 0.000580s : 1: opt_after_jit_grad 0.74% : 0.000336s : 1: opt_b 14.25% : 0.006494s : 1: optimize 0.06% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000013s : 1: order_py_execute_after_rewriter 0.06% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000008s : 1: pipeline_split 0.10% : 0.000044s : 1: pre_auto_parallel 0.08% : 0.000036s : 1: py_interpret_to_execute 0.04% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.05% : 0.000022s : 1: remove_dup_value 0.88% : 0.000403s : 1: renormalize.infer 0.68% : 0.000311s : 1: renormalize.specialize 0.02% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000049s : 1: rewriter_after_opt_a 0.22% : 0.000100s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.03% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.25% : 0.000112s : 1: symbol_engine_optimizer 0.25% : 0.000113s : 1: tuple_transform 49.33% : 0.022475s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:43.990.203 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0324301, [21] [bootstrap]: 0.00041499 [type_inference]: 0.0224803 [event_method]: 2.28e-05 [auto_monad]: 6.846e-05 [graph_reusing]: 5.99e-06 [inline]: 3.3e-06 [add_attr]: 0.00342011, [1] [add_attr_with_inline]: 0.00341011, [1] [Cycle 1]: 5.837e-05, [2] [tag_attr]: 1.992e-05 [meta_addattr_fg_expand]: 6.12999e-06 [parallel-infer-symbol]: 3.26001e-06 [pre_auto_parallel]: 3.442e-05 [insert-virtual-dataset]: 2.64001e-06 [parallel-infer-symbol-second]: 8.80013e-07 [dataset_repeat_opt]: 1.84998e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00525953, [53] [py_interpret_to_execute]: 2.691e-05 [rewriter_before_opt_a]: 8.941e-05 [opt_a]: 0.003107, [2] [Cycle 1]: 0.00228583, [45] [expand_dump_flag]: 3.26001e-06 [switch_simplify]: 4.485e-05 [loop_unroll]: 3.288e-05 [a_1]: 0.00077209 [with_stream_mark]: 1.59e-05 [recompute_prepare]: 9.89999e-06 [updatestate_depend_eliminate]: 5.15001e-06 [updatestate_assign_eliminate]: 3.91001e-06 [updatestate_loads_eliminate]: 3.6e-06 [parameter_eliminate]: 2.21e-06 [a_2]: 0.00010435 [accelerated_algorithm]: 9.20999e-06 [shard]: 1.78002e-06 [meta_shard_fg_expand]: 2.17999e-06 [shard_inline]: 8.03999e-06 [merge_send_recv]: 1.042e-05 [auto_parallel]: 7.11001e-06 [parallel]: 2.039e-05 [flash_sp]: 9.06998e-06 [merge_comm]: 4.88001e-06 [allreduce_fusion]: 4.31002e-06 [matmul_add_comm_reduction]: 1.073e-05 [allreduce_slice_to_reducescatter]: 7.40023e-07 [virtual_shard_identity]: 9.36e-06 [virtual_dataset]: 7.9e-06 [get_grad_eliminate_]: 7.8e-06 [virtual_output]: 8.48999e-06 [merge_forward]: 4.58001e-06 [cell_reuse_recompute_pass]: 1.17e-06 [offload_activation]: 1.14e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.6e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.357e-05 [set_forward_comm_id_for_comm_node_pass]: 4.47e-06 [meta_fg_expand]: 3.53999e-06 [flash_sp_send_recv_attached]: 2.69001e-06 [receive_attached]: 2.32001e-06 [after_resolve]: 1.306e-05 [a_after_grad]: 1.193e-05 [renormalize]: 0.00070478 [add_forward_monad_depend]: 5.17999e-06 [auto_monad_grad]: 1.74998e-06 [auto_monad_eliminator]: 1.813e-05 [cse]: 3.54e-05 [a_3]: 5.933e-05 [Cycle 2]: 0.00081078, [45] [expand_dump_flag]: 1.07e-06 [switch_simplify]: 9.86e-06 [loop_unroll]: 8.3e-06 [a_1]: 0.00020439 [with_stream_mark]: 1.309e-05 [recompute_prepare]: 7.79002e-06 [updatestate_depend_eliminate]: 4.15e-06 [updatestate_assign_eliminate]: 3.13998e-06 [updatestate_loads_eliminate]: 2.93e-06 [parameter_eliminate]: 1.29e-06 [a_2]: 9.447e-05 [accelerated_algorithm]: 7.84002e-06 [shard]: 1.49e-06 [meta_shard_fg_expand]: 2.03002e-06 [shard_inline]: 7.7e-06 [merge_send_recv]: 6.21e-06 [auto_parallel]: 6.49001e-06 [parallel]: 4.62e-06 [flash_sp]: 3.38e-06 [merge_comm]: 4.12e-06 [allreduce_fusion]: 4.03999e-06 [matmul_add_comm_reduction]: 7.25e-06 [allreduce_slice_to_reducescatter]: 3.70026e-07 [virtual_shard_identity]: 9.22001e-06 [virtual_dataset]: 7.41999e-06 [get_grad_eliminate_]: 7.16001e-06 [virtual_output]: 7.14001e-06 [merge_forward]: 4.49002e-06 [cell_reuse_recompute_pass]: 1.57001e-06 [offload_activation]: 7.31001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.45e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 1.252e-05 [set_forward_comm_id_for_comm_node_pass]: 4.05998e-06 [meta_fg_expand]: 2.61999e-06 [flash_sp_send_recv_attached]: 8.89995e-07 [receive_attached]: 1.04e-06 [after_resolve]: 1.28e-05 [a_after_grad]: 1.166e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.55999e-06 [auto_monad_grad]: 1.00001e-06 [auto_monad_eliminator]: 8.74e-06 [cse]: 2.033e-05 [a_3]: 4.701e-05 [py_interpret_to_execute_after_opt_a]: 1.08e-05 [slice_cell_reuse_recomputed_activation]: 2.27999e-06 [rewriter_after_opt_a]: 4.051e-05 [convert_after_rewriter]: 8.57998e-06 [order_py_execute_after_rewriter]: 5.95002e-06 [mutable_eliminate]: 0.00051405 [opt_b]: 0.00024855, [1] [Cycle 1]: 0.00024254, [7] [b_1]: 0.00016066 [b_2]: 9.37999e-06 [updatestate_depend_eliminate]: 6.12999e-06 [updatestate_assign_eliminate]: 2.94001e-06 [updatestate_loads_eliminate]: 2.94001e-06 [renormalize]: 7.29982e-07 [cse]: 2.306e-05 [optimize_parallel_all_gather_comm]: 1.847e-05 [overlap_param_gather]: 2.11e-06 [cconv]: 2.684e-05 [loop_unroll]: 0.00044512 [opt_after_cconv]: 0.0001167, [1] [Cycle 1]: 0.00011138, [7] [c_1]: 3.869e-05 [parameter_eliminate]: 2.89001e-06 [updatestate_depend_eliminate]: 5.99999e-06 [updatestate_assign_eliminate]: 3.19001e-06 [updatestate_loads_eliminate]: 2.86999e-06 [cse]: 2.239e-05 [renormalize]: 2.80008e-07 [remove_dup_value]: 1.497e-05 [tuple_transform]: 8.712e-05, [1] [Cycle 1]: 8.262e-05, [4] [d_1]: 5.179e-05 [none_parameter_eliminate]: 2.26998e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 8.84998e-06 [partial_unused_args_eliminate]: 1.91998e-06 [add_recomputation]: 5.766e-05 [cse_after_recomputation]: 2.63e-05, [1] [Cycle 1]: 2.093e-05, [1] [cse]: 1.53e-05 [environ_conv]: 6.20002e-06 [swap_dp_allreduce_reducescatter]: 6.33002e-06 [bias_add_comm_swap]: 2.73998e-06 [label_micro_interleaved_index]: 4.27e-06 [label_fine_grained_interleaved_index]: 2.72001e-06 [merge_cast_opt]: 1.39e-06 [slice_recompute_activation]: 2.01e-06 [micro_interleaved_order_control]: 2.38998e-06 [assign_add_opt]: 1.11002e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 1.00001e-06 [full_micro_interleaved_order_control]: 2.26e-06 [reorder_send_recv_between_fp_bp]: 2.71999e-06 [comm_op_add_attrs]: 1.07e-06 [add_comm_op_reuse_tag]: 9.20001e-07 [interleave_split_concat_branches]: 1.18001e-06 [interleave_parallel_branches]: 1.30999e-06 [overlap_opt_shard_in_pipeline]: 1.24e-06 [overlap_opt_shard_grad_in_pipeline]: 1.66e-06 [control_data_broadcast_order]: 1.528e-05 [grouped_pairwise_exchange_alltoall]: 1.79e-06 [offloading_packed_experts]: 4.42e-06 [overlap_recompute_and_grad_model_parallel]: 5.20001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.44998e-06 [overlap_recompute_comm]: 2.51e-06 [overlap_grad_ring_attention]: 4.72998e-06 [overlap_grad_flash_sp]: 2.116e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.32999e-06 [split_layernorm_comm]: 1.79e-06 [handle_group_info]: 1.25999e-06 [symbol_engine_optimizer]: 8.318e-05, [1] [Cycle 1]: 7.858e-05, [6] [build]: 3.36001e-06 [elim_shapecalc]: 1.071e-05 [elim_not_effective]: 1.481e-05 [opt_reshape]: 8.40999e-06 [fold_const_symbol]: 1.251e-05 [renormalize]: 2.00002e-07 [detach_backward]: 1.94999e-06 [pipeline_parallel_scheduler]: 1.48002e-06 [auto_monad_reorder]: 2.046e-05 [get_jit_bprop_graph]: 1.37e-06 [rewriter_after_jit_bprop_graph]: 5.17999e-06 [opt_after_jit_grad]: 0.00047673 [validate]: 4.396e-05 Sums bootstrap : 0.000415s : 1.48% type_inference : 0.022480s : 80.22% event_method : 0.000023s : 0.08% auto_monad : 0.000068s : 0.24% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000034s : 0.12% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000027s : 0.10% optimize.rewriter_before_opt_a : 0.000089s : 0.32% optimize.opt_a.expand_dump_flag : 0.000004s : 0.02% optimize.opt_a.switch_simplify : 0.000055s : 0.20% optimize.opt_a.loop_unroll : 0.000041s : 0.15% optimize.opt_a.a_1 : 0.000976s : 3.48% optimize.opt_a.with_stream_mark : 0.000029s : 0.10% optimize.opt_a.recompute_prepare : 0.000018s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000199s : 0.71% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.06% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000016s : 0.06% optimize.opt_a.merge_send_recv : 0.000017s : 0.06% optimize.opt_a.auto_parallel : 0.000014s : 0.05% optimize.opt_a.parallel : 0.000025s : 0.09% optimize.opt_a.flash_sp : 0.000012s : 0.04% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.07% optimize.opt_a.virtual_dataset : 0.000015s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.05% optimize.opt_a.virtual_output : 0.000016s : 0.06% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000026s : 0.09% optimize.opt_a.a_after_grad : 0.000024s : 0.08% optimize.opt_a.renormalize : 0.000705s : 2.52% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.10% optimize.opt_a.cse : 0.000056s : 0.20% optimize.opt_a.a_3 : 0.000106s : 0.38% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000041s : 0.14% optimize.convert_after_rewriter : 0.000009s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000514s : 1.83% optimize.opt_b.b_1 : 0.000161s : 0.57% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000023s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000027s : 0.10% optimize.loop_unroll : 0.000445s : 1.59% optimize.opt_after_cconv.c_1 : 0.000039s : 0.14% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000022s : 0.08% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.05% optimize.tuple_transform.d_1 : 0.000052s : 0.18% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000058s : 0.21% optimize.cse_after_recomputation.cse : 0.000015s : 0.05% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000015s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000021s : 0.08% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000020s : 0.07% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000477s : 1.70% validate : 0.000044s : 0.16% Time group info: ------[substitution.] 0.000213 38 10.32% : 0.000022s : 3: substitution.cast_eliminate 0.98% : 0.000002s : 3: substitution.elim_not_effective 0.93% : 0.000002s : 3: substitution.fold_const_symbol 2.99% : 0.000006s : 5: substitution.graph_param_transform 70.08% : 0.000149s : 4: substitution.inline 2.35% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.83% : 0.000006s : 6: substitution.remove_not_recompute_node 2.05% : 0.000004s : 4: substitution.replace_old_param 7.47% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.022404 2 96.18% : 0.021549s : 1: type_inference.infer 3.82% : 0.000855s : 1: type_inference.specialize ------[replace.] 0.000065 8 60.67% : 0.000039s : 4: replace.inline 39.33% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000160 8 91.43% : 0.000147s : 4: match.inline 8.57% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000264 1596 0.97% : 0.000003s : 17: predicate.accumulaten_eliminater 0.73% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 10: predicate.addn_check_dump 1.11% : 0.000003s : 17: predicate.addn_zero_filter 0.94% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.26% : 0.000006s : 27: predicate.arithmetic_simplify 1.14% : 0.000003s : 17: predicate.cast_eliminate 0.59% : 0.000002s : 10: predicate.check_bprop_eliminate 0.56% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.60% : 0.000002s : 10: predicate.depend_value_elim 0.94% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.31% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.86% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 5: predicate.elim_not_effective 0.34% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.26% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_depend_swap 1.73% : 0.000005s : 32: predicate.environ_get_eliminate 1.24% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.49% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.36% : 0.000006s : 25: predicate.float_depend_g_call 0.53% : 0.000001s : 10: predicate.float_environ_get_switch 0.83% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.74% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000000s : 5: predicate.graph_param_transform 0.59% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.17% : 0.000016s : 72: predicate.inline 0.79% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.86% : 0.000002s : 10: predicate.less_batch_normalization 1.77% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.58% : 0.000007s : 48: predicate.load_eliminater 0.79% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.08% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.81% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 10: predicate.merge_addn 0.59% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.92% : 0.000002s : 17: predicate.minmaximum_grad 0.78% : 0.000002s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.33% : 0.000001s : 5: predicate.parallel_virtual_node 1.85% : 0.000005s : 25: predicate.partial_defer_inline 1.69% : 0.000004s : 26: predicate.partial_eliminate 0.95% : 0.000003s : 17: predicate.print_const_string_wrapper 0.60% : 0.000002s : 10: predicate.reduce_all_const_elim 1.28% : 0.000003s : 17: predicate.reduce_eliminate 2.57% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 10: predicate.remove_not_recompute_node 1.30% : 0.000003s : 31: predicate.replace_applicator 0.51% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 0.95% : 0.000003s : 17: predicate.reshape_eliminate 0.56% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.75% : 0.000002s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 10: predicate.shard_identity_eliminate 0.65% : 0.000002s : 10: predicate.special_op_eliminate 0.67% : 0.000002s : 10: predicate.specialize_transform 0.82% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.55% : 0.000004s : 25: predicate.switch_defer_inline 2.03% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.76% : 0.000013s : 76: predicate.switch_simplify 1.08% : 0.000003s : 17: predicate.tile_eliminate 0.97% : 0.000003s : 17: predicate.transpose_eliminate 1.48% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.69% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.63% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.25% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.55% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.70% : 0.000004s : 31: predicate.tuple_to_list_eliminator_ 2.51% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.24% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 5: predicate.value_based_eliminate 0.65% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.61% : 0.000002s : 10: predicate.virtual_output_eliminate 0.30% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.50% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000578 11 50.24% : 0.000291s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.76% : 0.000288s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.043524 192 0.01% : 0.000004s : 1: ForceFp32Comm 7.87% : 0.003425s : 1: add_attr 7.84% : 0.003414s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.14% : 0.000062s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.17% : 0.000074s : 1: auto_monad 0.06% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.02% : 0.000443s : 1: bootstrap 0.07% : 0.000031s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000018s : 1: control_data_broadcast_order 0.03% : 0.000012s : 1: convert_after_rewriter 0.07% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.07% : 0.000029s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000007s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.04% : 0.000453s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.20% : 0.000522s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.04% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000017s : 1: opt.transform.mutable_eliminate 3.46% : 0.001505s : 78: opt.transform.opt_a 0.09% : 0.000037s : 1: opt.transform.opt_after_cconv 0.07% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.32% : 0.000138s : 28: opt.transform.opt_b 0.13% : 0.000058s : 2: opt.transform.opt_trans_graph 0.10% : 0.000043s : 4: opt.transform.symbol_engine_opt 7.15% : 0.003110s : 1: opt_a 0.28% : 0.000120s : 1: opt_after_cconv 1.12% : 0.000486s : 1: opt_after_jit_grad 0.58% : 0.000252s : 1: opt_b 12.09% : 0.005264s : 1: optimize 0.05% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.06% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.09% : 0.000039s : 1: pre_auto_parallel 0.07% : 0.000031s : 1: py_interpret_to_execute 0.03% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000019s : 1: remove_dup_value 0.90% : 0.000391s : 1: renormalize.infer 0.70% : 0.000305s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000044s : 1: rewriter_after_opt_a 0.21% : 0.000093s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.20% : 0.000086s : 1: symbol_engine_optimizer 0.21% : 0.000090s : 1: tuple_transform 51.71% : 0.022504s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:44.398.949 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:44.399.221 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0170926, [21] [bootstrap]: 0.00045957 [type_inference]: 0.00633483 [event_method]: 1.953e-05 [auto_monad]: 6.549e-05 [graph_reusing]: 6.97002e-06 [inline]: 2.29999e-06 [add_attr]: 0.00320058, [1] [add_attr_with_inline]: 0.00319113, [1] [Cycle 1]: 7.479e-05, [2] [tag_attr]: 1.961e-05 [meta_addattr_fg_expand]: 6.53998e-06 [parallel-infer-symbol]: 2.90002e-06 [pre_auto_parallel]: 3.674e-05 [insert-virtual-dataset]: 2.22999e-06 [parallel-infer-symbol-second]: 7.00005e-07 [dataset_repeat_opt]: 1.77001e-06 [pipeline_split]: 1.72001e-06 [optimize]: 0.00578925, [53] [py_interpret_to_execute]: 2.918e-05 [rewriter_before_opt_a]: 0.00013462 [opt_a]: 0.00331263, [2] [Cycle 1]: 0.00233963, [45] [expand_dump_flag]: 3.88999e-06 [switch_simplify]: 4.484e-05 [loop_unroll]: 3.138e-05 [a_1]: 0.00067655 [with_stream_mark]: 1.654e-05 [recompute_prepare]: 1.1e-05 [updatestate_depend_eliminate]: 4.88001e-06 [updatestate_assign_eliminate]: 4.08001e-06 [updatestate_loads_eliminate]: 3.80998e-06 [parameter_eliminate]: 2.22001e-06 [a_2]: 0.00013382 [accelerated_algorithm]: 8.97999e-06 [shard]: 2.37001e-06 [meta_shard_fg_expand]: 1.97999e-06 [shard_inline]: 8.1e-06 [merge_send_recv]: 9.79999e-06 [auto_parallel]: 6.66e-06 [parallel]: 1.916e-05 [flash_sp]: 8.60999e-06 [merge_comm]: 4.85999e-06 [allreduce_fusion]: 4.76002e-06 [matmul_add_comm_reduction]: 1.126e-05 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 9.62999e-06 [virtual_dataset]: 7.98999e-06 [get_grad_eliminate_]: 7.68999e-06 [virtual_output]: 7.85e-06 [merge_forward]: 4.45999e-06 [cell_reuse_recompute_pass]: 1.19e-06 [offload_activation]: 1.098e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.776e-05 [merge_recompute_call_nodes]: 1.73002e-06 [before_grad]: 1.305e-05 [set_forward_comm_id_for_comm_node_pass]: 4.57998e-06 [meta_fg_expand]: 3.43e-06 [flash_sp_send_recv_attached]: 2.44001e-06 [receive_attached]: 2.84999e-06 [after_resolve]: 1.243e-05 [a_after_grad]: 1.256e-05 [renormalize]: 0.00067801 [add_forward_monad_depend]: 6.38e-06 [auto_monad_grad]: 1.97999e-06 [auto_monad_eliminator]: 1.749e-05 [cse]: 3.412e-05 [a_3]: 7.235e-05 [Cycle 2]: 0.00095927, [45] [expand_dump_flag]: 1.15001e-06 [switch_simplify]: 9.37001e-06 [loop_unroll]: 7.31999e-06 [a_1]: 0.00017571 [with_stream_mark]: 1.279e-05 [recompute_prepare]: 8.19002e-06 [updatestate_depend_eliminate]: 3.85e-06 [updatestate_assign_eliminate]: 3.09001e-06 [updatestate_loads_eliminate]: 2.99999e-06 [parameter_eliminate]: 1.20999e-06 [a_2]: 0.00012058 [accelerated_algorithm]: 7.87e-06 [shard]: 1.70001e-06 [meta_shard_fg_expand]: 2.17001e-06 [shard_inline]: 7.78999e-06 [merge_send_recv]: 9.52999e-06 [auto_parallel]: 6.36e-06 [parallel]: 5.49e-06 [flash_sp]: 3.54002e-06 [merge_comm]: 5.02e-06 [allreduce_fusion]: 4.21001e-06 [matmul_add_comm_reduction]: 7.65998e-06 [allreduce_slice_to_reducescatter]: 3.89991e-07 [virtual_shard_identity]: 8.84e-06 [virtual_dataset]: 1.086e-05 [get_grad_eliminate_]: 7.75e-06 [virtual_output]: 7.2e-06 [merge_forward]: 3.73999e-06 [cell_reuse_recompute_pass]: 1.55999e-06 [offload_activation]: 8.35999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.664e-05 [merge_recompute_call_nodes]: 1.05999e-06 [before_grad]: 1.258e-05 [set_forward_comm_id_for_comm_node_pass]: 4.70999e-06 [meta_fg_expand]: 3.29001e-06 [flash_sp_send_recv_attached]: 8.89995e-07 [receive_attached]: 1.27e-06 [after_resolve]: 1.128e-05 [a_after_grad]: 1.126e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.68002e-06 [auto_monad_grad]: 8.89995e-07 [auto_monad_eliminator]: 9.71e-06 [cse]: 1.918e-05 [a_3]: 5.875e-05 [py_interpret_to_execute_after_opt_a]: 1.478e-05 [slice_cell_reuse_recomputed_activation]: 5.25999e-06 [rewriter_after_opt_a]: 4.545e-05 [convert_after_rewriter]: 1.08e-05 [order_py_execute_after_rewriter]: 8.67e-06 [mutable_eliminate]: 0.00053941 [opt_b]: 0.00031262, [1] [Cycle 1]: 0.0003028, [7] [b_1]: 0.00019879 [b_2]: 9.59e-06 [updatestate_depend_eliminate]: 6.42001e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 3.43999e-06 [renormalize]: 4.70027e-07 [cse]: 2.341e-05 [optimize_parallel_all_gather_comm]: 2.507e-05 [overlap_param_gather]: 5.08002e-06 [cconv]: 2.942e-05 [loop_unroll]: 0.00045409 [opt_after_cconv]: 0.00013995, [1] [Cycle 1]: 0.00013146, [7] [c_1]: 3.82e-05 [parameter_eliminate]: 2.76e-06 [updatestate_depend_eliminate]: 6.86999e-06 [updatestate_assign_eliminate]: 3.11999e-06 [updatestate_loads_eliminate]: 2.77002e-06 [cse]: 2.237e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.896e-05 [tuple_transform]: 0.00010329, [1] [Cycle 1]: 9.519e-05, [4] [d_1]: 5.408e-05 [none_parameter_eliminate]: 1.50999e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 8.57e-06 [partial_unused_args_eliminate]: 4.59998e-06 [add_recomputation]: 5.854e-05 [cse_after_recomputation]: 3.123e-05, [1] [Cycle 1]: 2.448e-05, [1] [cse]: 1.534e-05 [environ_conv]: 9.32001e-06 [swap_dp_allreduce_reducescatter]: 8.84e-06 [bias_add_comm_swap]: 4.72e-06 [label_micro_interleaved_index]: 7.83999e-06 [label_fine_grained_interleaved_index]: 5.35999e-06 [merge_cast_opt]: 4.23001e-06 [slice_recompute_activation]: 4.63001e-06 [micro_interleaved_order_control]: 4.92e-06 [assign_add_opt]: 3.56999e-06 [ForceFp32Comm]: 3.18e-06 [remove_cast_before_assign_add]: 3.7e-06 [full_micro_interleaved_order_control]: 4.84e-06 [reorder_send_recv_between_fp_bp]: 5.30001e-06 [comm_op_add_attrs]: 3.56999e-06 [add_comm_op_reuse_tag]: 3.66001e-06 [interleave_split_concat_branches]: 3.73001e-06 [interleave_parallel_branches]: 3.61999e-06 [overlap_opt_shard_in_pipeline]: 3.65e-06 [overlap_opt_shard_grad_in_pipeline]: 4.21001e-06 [control_data_broadcast_order]: 1.83e-05 [grouped_pairwise_exchange_alltoall]: 4.27e-06 [offloading_packed_experts]: 7.24001e-06 [overlap_recompute_and_grad_model_parallel]: 7.51001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.55998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.75e-06 [overlap_recompute_comm]: 4.95999e-06 [overlap_grad_ring_attention]: 7.21999e-06 [overlap_grad_flash_sp]: 2.439e-05 [begin_end_overlap_inline]: 3.06999e-06 [split_matmul_comm_elemetwise]: 4.33001e-06 [split_layernorm_comm]: 4.03999e-06 [handle_group_info]: 3.21001e-06 [symbol_engine_optimizer]: 0.00010448, [1] [Cycle 1]: 9.788e-05, [6] [build]: 3.34001e-06 [elim_shapecalc]: 1.157e-05 [elim_not_effective]: 1.518e-05 [opt_reshape]: 8.80001e-06 [fold_const_symbol]: 1.202e-05 [renormalize]: 1.99972e-07 [detach_backward]: 3.25e-06 [pipeline_parallel_scheduler]: 1.79e-06 [auto_monad_reorder]: 2.248e-05 [get_jit_bprop_graph]: 1.71002e-06 [rewriter_after_jit_bprop_graph]: 4.38001e-06 [opt_after_jit_grad]: 0.00050796 [validate]: 4.163e-05 Sums bootstrap : 0.000460s : 3.79% type_inference : 0.006335s : 52.22% event_method : 0.000020s : 0.16% auto_monad : 0.000065s : 0.54% graph_reusing : 0.000007s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000037s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.24% optimize.rewriter_before_opt_a : 0.000135s : 1.11% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000054s : 0.45% optimize.opt_a.loop_unroll : 0.000039s : 0.32% optimize.opt_a.a_1 : 0.000852s : 7.03% optimize.opt_a.with_stream_mark : 0.000029s : 0.24% optimize.opt_a.recompute_prepare : 0.000019s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000254s : 2.10% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.14% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.13% optimize.opt_a.merge_send_recv : 0.000019s : 0.16% optimize.opt_a.auto_parallel : 0.000013s : 0.11% optimize.opt_a.parallel : 0.000025s : 0.20% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.15% optimize.opt_a.virtual_dataset : 0.000019s : 0.16% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.12% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.20% optimize.opt_a.a_after_grad : 0.000024s : 0.20% optimize.opt_a.renormalize : 0.000678s : 5.59% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.22% optimize.opt_a.cse : 0.000053s : 0.44% optimize.opt_a.a_3 : 0.000131s : 1.08% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000045s : 0.37% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000539s : 4.45% optimize.opt_b.b_1 : 0.000199s : 1.64% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000029s : 0.24% optimize.loop_unroll : 0.000454s : 3.74% optimize.opt_after_cconv.c_1 : 0.000038s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000022s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.16% optimize.tuple_transform.d_1 : 0.000054s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000059s : 0.48% optimize.cse_after_recomputation.cse : 0.000015s : 0.13% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000008s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000018s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000024s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.19% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000508s : 4.19% validate : 0.000042s : 0.34% Time group info: ------[substitution.] 0.000210 38 11.69% : 0.000025s : 3: substitution.cast_eliminate 1.04% : 0.000002s : 3: substitution.elim_not_effective 0.83% : 0.000002s : 3: substitution.fold_const_symbol 3.16% : 0.000007s : 5: substitution.graph_param_transform 69.80% : 0.000146s : 4: substitution.inline 2.40% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.04% : 0.000006s : 6: substitution.remove_not_recompute_node 1.89% : 0.000004s : 4: substitution.replace_old_param 6.14% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006278 2 87.37% : 0.005485s : 1: type_inference.infer 12.63% : 0.000793s : 1: type_inference.specialize ------[replace.] 0.000061 8 61.86% : 0.000038s : 4: replace.inline 38.14% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000155 8 92.88% : 0.000144s : 4: match.inline 7.12% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000241 1504 0.86% : 0.000002s : 15: predicate.accumulaten_eliminater 0.79% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.59% : 0.000001s : 10: predicate.addn_check_dump 0.90% : 0.000002s : 15: predicate.addn_zero_filter 0.80% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.97% : 0.000005s : 25: predicate.arithmetic_simplify 0.99% : 0.000002s : 15: predicate.cast_eliminate 0.66% : 0.000002s : 10: predicate.check_bprop_eliminate 0.64% : 0.000002s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.64% : 0.000002s : 10: predicate.depend_value_elim 0.92% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.95% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.46% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.05% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 20: predicate.environ_get_depend_swap 1.78% : 0.000004s : 30: predicate.environ_get_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.37% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.23% : 0.000005s : 23: predicate.float_depend_g_call 0.59% : 0.000001s : 10: predicate.float_environ_get_switch 0.86% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.78% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.62% : 0.000002s : 10: predicate.incorporate_call 0.54% : 0.000001s : 10: predicate.incorporate_call_switch 6.28% : 0.000015s : 68: predicate.inline 1.04% : 0.000003s : 10: predicate.inline_without_move 0.42% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.93% : 0.000002s : 10: predicate.less_batch_normalization 1.93% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.44% : 0.000006s : 44: predicate.load_eliminater 0.92% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.10% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.67% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.63% : 0.000002s : 10: predicate.merge_addn 0.61% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 15: predicate.minmaximum_grad 1.13% : 0.000003s : 5: predicate.mutable_eliminate 0.42% : 0.000001s : 5: predicate.opt_reshape 0.38% : 0.000001s : 5: predicate.parallel_virtual_node 1.75% : 0.000004s : 23: predicate.partial_defer_inline 1.66% : 0.000004s : 24: predicate.partial_eliminate 0.95% : 0.000002s : 15: predicate.print_const_string_wrapper 0.66% : 0.000002s : 10: predicate.reduce_all_const_elim 1.17% : 0.000003s : 15: predicate.reduce_eliminate 2.49% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 10: predicate.remove_not_recompute_node 1.35% : 0.000003s : 29: predicate.replace_applicator 0.52% : 0.000001s : 10: predicate.replace_old_param 0.24% : 0.000001s : 5: predicate.reset_defer_inline 0.91% : 0.000002s : 15: predicate.reshape_eliminate 0.59% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 5: predicate.row_tensor_eliminate 0.85% : 0.000002s : 10: predicate.same_eliminate 0.52% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.85% : 0.000002s : 10: predicate.shard_identity_eliminate 0.72% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 0.86% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.43% : 0.000003s : 23: predicate.switch_defer_inline 1.95% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.82% : 0.000012s : 74: predicate.switch_simplify 0.91% : 0.000002s : 15: predicate.tile_eliminate 0.91% : 0.000002s : 15: predicate.transpose_eliminate 1.47% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.25% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.46% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.33% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.86% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.54% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.24% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.70% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.69% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000586 11 51.64% : 0.000303s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.36% : 0.000284s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028350 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.32% : 0.003210s : 1: add_attr 11.27% : 0.003195s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000062s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000076s : 1: auto_monad 0.10% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000007s : 1: bias_add_comm_swap 1.78% : 0.000506s : 1: bootstrap 0.11% : 0.000033s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000034s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000018s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000030s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000014s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.62% : 0.000460s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.92% : 0.000545s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 4.87% : 0.001381s : 78: opt.transform.opt_a 0.13% : 0.000036s : 1: opt.transform.opt_after_cconv 0.11% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.48% : 0.000136s : 28: opt.transform.opt_b 0.21% : 0.000061s : 2: opt.transform.opt_trans_graph 0.16% : 0.000044s : 4: opt.transform.symbol_engine_opt 11.70% : 0.003316s : 1: opt_a 0.51% : 0.000143s : 1: opt_after_cconv 1.83% : 0.000519s : 1: opt_after_jit_grad 1.12% : 0.000316s : 1: opt_b 21.58% : 0.006117s : 1: optimize 0.10% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.16% : 0.000044s : 1: pre_auto_parallel 0.12% : 0.000033s : 1: py_interpret_to_execute 0.06% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.27% : 0.000359s : 1: renormalize.infer 1.10% : 0.000312s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000050s : 1: rewriter_after_opt_a 0.49% : 0.000140s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000108s : 1: symbol_engine_optimizer 0.37% : 0.000106s : 1: tuple_transform 22.50% : 0.006378s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:44.837.990 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0306045, [21] [bootstrap]: 0.00037622 [type_inference]: 0.00546956 [event_method]: 1.798e-05 [auto_monad]: 6.474e-05 [graph_reusing]: 6.14999e-06 [inline]: 2.66999e-06 [add_attr]: 0.0030551, [1] [add_attr_with_inline]: 0.00304608, [1] [Cycle 1]: 5.481e-05, [2] [tag_attr]: 1.804e-05 [meta_addattr_fg_expand]: 6.21998e-06 [parallel-infer-symbol]: 3.35998e-06 [pre_auto_parallel]: 3.134e-05 [insert-virtual-dataset]: 2.29001e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 1.81003e-06 [pipeline_split]: 1.61998e-06 [optimize]: 0.0208773, [53] [py_interpret_to_execute]: 2.53e-05 [rewriter_before_opt_a]: 8.283e-05 [opt_a]: 0.0185093, [2] [Cycle 1]: 0.0176266, [45] [expand_dump_flag]: 2.86e-06 [switch_simplify]: 4.488e-05 [loop_unroll]: 3.114e-05 [a_1]: 0.00065856 [with_stream_mark]: 1.591e-05 [recompute_prepare]: 1.058e-05 [updatestate_depend_eliminate]: 4.27e-06 [updatestate_assign_eliminate]: 3.68e-06 [updatestate_loads_eliminate]: 3.38999e-06 [parameter_eliminate]: 2.06e-06 [a_2]: 0.00010187 [accelerated_algorithm]: 1.744e-05 [shard]: 1.97001e-06 [meta_shard_fg_expand]: 2.11e-06 [shard_inline]: 9.20999e-06 [merge_send_recv]: 1.064e-05 [auto_parallel]: 7.6e-06 [parallel]: 1.927e-05 [flash_sp]: 8.94e-06 [merge_comm]: 4.99e-06 [allreduce_fusion]: 4.60001e-06 [matmul_add_comm_reduction]: 1.046e-05 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 1.07e-05 [virtual_dataset]: 8.80001e-06 [get_grad_eliminate_]: 7.56001e-06 [virtual_output]: 7.55e-06 [merge_forward]: 4.32e-06 [cell_reuse_recompute_pass]: 1.07998e-06 [offload_activation]: 1.135e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.607e-05 [merge_recompute_call_nodes]: 1.74e-06 [before_grad]: 1.294e-05 [set_forward_comm_id_for_comm_node_pass]: 5.01002e-06 [meta_fg_expand]: 3.39001e-06 [flash_sp_send_recv_attached]: 2.54999e-06 [receive_attached]: 2.27001e-06 [after_resolve]: 1.303e-05 [a_after_grad]: 1.231e-05 [renormalize]: 0.0161094 [add_forward_monad_depend]: 8.16002e-06 [auto_monad_grad]: 2.64999e-06 [auto_monad_eliminator]: 2.231e-05 [cse]: 3.989e-05 [a_3]: 7.288e-05 [Cycle 2]: 0.00086834, [45] [expand_dump_flag]: 1.99e-06 [switch_simplify]: 1.065e-05 [loop_unroll]: 7.82998e-06 [a_1]: 0.00019655 [with_stream_mark]: 1.863e-05 [recompute_prepare]: 8.92e-06 [updatestate_depend_eliminate]: 4.43999e-06 [updatestate_assign_eliminate]: 3.48e-06 [updatestate_loads_eliminate]: 3.6e-06 [parameter_eliminate]: 1.96e-06 [a_2]: 9.969e-05 [accelerated_algorithm]: 8.48999e-06 [shard]: 2.51998e-06 [meta_shard_fg_expand]: 2.36e-06 [shard_inline]: 8.28999e-06 [merge_send_recv]: 1.048e-05 [auto_parallel]: 1.053e-05 [parallel]: 9.31998e-06 [flash_sp]: 3.88001e-06 [merge_comm]: 4.73001e-06 [allreduce_fusion]: 4.2e-06 [matmul_add_comm_reduction]: 1.04e-05 [allreduce_slice_to_reducescatter]: 9.20001e-07 [virtual_shard_identity]: 8.85001e-06 [virtual_dataset]: 7.95998e-06 [get_grad_eliminate_]: 7.48e-06 [virtual_output]: 7.25e-06 [merge_forward]: 5.04e-06 [cell_reuse_recompute_pass]: 3.46001e-06 [offload_activation]: 9.91e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.646e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.343e-05 [set_forward_comm_id_for_comm_node_pass]: 4.38999e-06 [meta_fg_expand]: 3.34001e-06 [flash_sp_send_recv_attached]: 1.64998e-06 [receive_attached]: 2.27999e-06 [after_resolve]: 1.573e-05 [a_after_grad]: 1.229e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.97999e-06 [auto_monad_grad]: 1.49e-06 [auto_monad_eliminator]: 9.88998e-06 [cse]: 2.242e-05 [a_3]: 4.649e-05 [py_interpret_to_execute_after_opt_a]: 1.435e-05 [slice_cell_reuse_recomputed_activation]: 2.71e-06 [rewriter_after_opt_a]: 4.246e-05 [convert_after_rewriter]: 7.88001e-06 [order_py_execute_after_rewriter]: 6.23e-06 [mutable_eliminate]: 0.0007135 [opt_b]: 0.00025211, [1] [Cycle 1]: 0.00024507, [7] [b_1]: 0.00015626 [b_2]: 1.016e-05 [updatestate_depend_eliminate]: 7.22002e-06 [updatestate_assign_eliminate]: 3.09999e-06 [updatestate_loads_eliminate]: 3.36999e-06 [renormalize]: 6.49976e-07 [cse]: 2.741e-05 [optimize_parallel_all_gather_comm]: 1.936e-05 [overlap_param_gather]: 2.04e-06 [cconv]: 2.785e-05 [loop_unroll]: 0.00045142 [opt_after_cconv]: 0.00011884, [1] [Cycle 1]: 0.00011242, [7] [c_1]: 3.747e-05 [parameter_eliminate]: 3.09999e-06 [updatestate_depend_eliminate]: 5.87001e-06 [updatestate_assign_eliminate]: 3.56001e-06 [updatestate_loads_eliminate]: 3.01999e-06 [cse]: 2.447e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.579e-05 [tuple_transform]: 8.568e-05, [1] [Cycle 1]: 8.125e-05, [4] [d_1]: 5.212e-05 [none_parameter_eliminate]: 1.91e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.12e-06 [partial_unused_args_eliminate]: 1.87999e-06 [add_recomputation]: 5.65e-05 [cse_after_recomputation]: 2.477e-05, [1] [Cycle 1]: 2.012e-05, [1] [cse]: 1.482e-05 [environ_conv]: 6.51e-06 [swap_dp_allreduce_reducescatter]: 5.86e-06 [bias_add_comm_swap]: 2.44999e-06 [label_micro_interleaved_index]: 5.40001e-06 [label_fine_grained_interleaved_index]: 2.54999e-06 [merge_cast_opt]: 1.37999e-06 [slice_recompute_activation]: 2.16e-06 [micro_interleaved_order_control]: 2.56e-06 [assign_add_opt]: 1.37e-06 [ForceFp32Comm]: 1.03001e-06 [remove_cast_before_assign_add]: 1.41002e-06 [full_micro_interleaved_order_control]: 2.21e-06 [reorder_send_recv_between_fp_bp]: 2.81e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 9.20001e-07 [interleave_split_concat_branches]: 1.11997e-06 [interleave_parallel_branches]: 1.00999e-06 [overlap_opt_shard_in_pipeline]: 1.17999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.69e-06 [control_data_broadcast_order]: 1.487e-05 [grouped_pairwise_exchange_alltoall]: 1.64998e-06 [offloading_packed_experts]: 4.37e-06 [overlap_recompute_and_grad_model_parallel]: 5.27999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.28002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32999e-06 [overlap_recompute_comm]: 2.36e-06 [overlap_grad_ring_attention]: 4.80999e-06 [overlap_grad_flash_sp]: 2.514e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.09e-06 [split_layernorm_comm]: 1.55999e-06 [handle_group_info]: 1.39e-06 [symbol_engine_optimizer]: 8.258e-05, [1] [Cycle 1]: 7.832e-05, [6] [build]: 3.5e-06 [elim_shapecalc]: 1.146e-05 [elim_not_effective]: 1.503e-05 [opt_reshape]: 8.05e-06 [fold_const_symbol]: 1.211e-05 [renormalize]: 1.80007e-07 [detach_backward]: 2.49999e-06 [pipeline_parallel_scheduler]: 1.48002e-06 [auto_monad_reorder]: 1.95e-05 [get_jit_bprop_graph]: 2.01e-06 [rewriter_after_jit_bprop_graph]: 3.48e-06 [opt_after_jit_grad]: 0.000484 [validate]: 3.867e-05 Sums bootstrap : 0.000376s : 1.42% type_inference : 0.005470s : 20.61% event_method : 0.000018s : 0.07% auto_monad : 0.000065s : 0.24% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000031s : 0.12% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000025s : 0.10% optimize.rewriter_before_opt_a : 0.000083s : 0.31% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000056s : 0.21% optimize.opt_a.loop_unroll : 0.000039s : 0.15% optimize.opt_a.a_1 : 0.000855s : 3.22% optimize.opt_a.with_stream_mark : 0.000035s : 0.13% optimize.opt_a.recompute_prepare : 0.000020s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000202s : 0.76% optimize.opt_a.accelerated_algorithm : 0.000026s : 0.10% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000017s : 0.07% optimize.opt_a.merge_send_recv : 0.000021s : 0.08% optimize.opt_a.auto_parallel : 0.000018s : 0.07% optimize.opt_a.parallel : 0.000029s : 0.11% optimize.opt_a.flash_sp : 0.000013s : 0.05% optimize.opt_a.merge_comm : 0.000010s : 0.04% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.07% optimize.opt_a.virtual_dataset : 0.000017s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.06% optimize.opt_a.virtual_output : 0.000015s : 0.06% optimize.opt_a.merge_forward : 0.000009s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000021s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.04% optimize.opt_a.meta_fg_expand : 0.000007s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000029s : 0.11% optimize.opt_a.a_after_grad : 0.000025s : 0.09% optimize.opt_a.renormalize : 0.016110s : 60.70% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.12% optimize.opt_a.cse : 0.000062s : 0.23% optimize.opt_a.a_3 : 0.000119s : 0.45% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000042s : 0.16% optimize.convert_after_rewriter : 0.000008s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000713s : 2.69% optimize.opt_b.b_1 : 0.000156s : 0.59% optimize.opt_b.b_2 : 0.000010s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000028s : 0.10% optimize.loop_unroll : 0.000451s : 1.70% optimize.opt_after_cconv.c_1 : 0.000037s : 0.14% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000024s : 0.09% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.06% optimize.tuple_transform.d_1 : 0.000052s : 0.20% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000056s : 0.21% optimize.cse_after_recomputation.cse : 0.000015s : 0.06% optimize.environ_conv : 0.000007s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000002s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000015s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000025s : 0.09% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000019s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.01% opt_after_jit_grad : 0.000484s : 1.82% validate : 0.000039s : 0.15% Time group info: ------[substitution.] 0.000203 38 13.52% : 0.000027s : 3: substitution.cast_eliminate 1.16% : 0.000002s : 3: substitution.elim_not_effective 0.79% : 0.000002s : 3: substitution.fold_const_symbol 3.21% : 0.000007s : 5: substitution.graph_param_transform 66.88% : 0.000136s : 4: substitution.inline 2.40% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.19% : 0.000006s : 6: substitution.remove_not_recompute_node 2.72% : 0.000006s : 4: substitution.replace_old_param 6.11% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005416 2 87.52% : 0.004740s : 1: type_inference.infer 12.48% : 0.000676s : 1: type_inference.specialize ------[replace.] 0.000062 8 62.18% : 0.000039s : 4: replace.inline 37.82% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000144 8 92.57% : 0.000133s : 4: match.inline 7.43% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000247 1504 0.96% : 0.000002s : 15: predicate.accumulaten_eliminater 0.74% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.64% : 0.000002s : 10: predicate.addn_check_dump 0.89% : 0.000002s : 15: predicate.addn_zero_filter 0.78% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.00% : 0.000005s : 25: predicate.arithmetic_simplify 1.09% : 0.000003s : 15: predicate.cast_eliminate 0.60% : 0.000001s : 10: predicate.check_bprop_eliminate 0.58% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.71% : 0.000002s : 10: predicate.depend_value_elim 0.97% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.05% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.94% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.88% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.33% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_depend_swap 1.74% : 0.000004s : 30: predicate.environ_get_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.35% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.39% : 0.000006s : 23: predicate.float_depend_g_call 0.58% : 0.000001s : 10: predicate.float_environ_get_switch 0.86% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.71% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.62% : 0.000002s : 10: predicate.incorporate_call 0.57% : 0.000001s : 10: predicate.incorporate_call_switch 6.37% : 0.000016s : 68: predicate.inline 0.88% : 0.000002s : 10: predicate.inline_without_move 0.34% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.89% : 0.000002s : 10: predicate.less_batch_normalization 2.02% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.40% : 0.000006s : 44: predicate.load_eliminater 0.87% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.11% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.76% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.59% : 0.000001s : 10: predicate.merge_addn 0.63% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.79% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 15: predicate.minmaximum_grad 0.95% : 0.000002s : 5: predicate.mutable_eliminate 0.39% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.59% : 0.000004s : 23: predicate.partial_defer_inline 1.63% : 0.000004s : 24: predicate.partial_eliminate 0.95% : 0.000002s : 15: predicate.print_const_string_wrapper 0.64% : 0.000002s : 10: predicate.reduce_all_const_elim 1.11% : 0.000003s : 15: predicate.reduce_eliminate 2.45% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 10: predicate.remove_not_recompute_node 1.37% : 0.000003s : 29: predicate.replace_applicator 0.49% : 0.000001s : 10: predicate.replace_old_param 0.34% : 0.000001s : 5: predicate.reset_defer_inline 1.10% : 0.000003s : 15: predicate.reshape_eliminate 0.65% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 5: predicate.row_tensor_eliminate 0.92% : 0.000002s : 10: predicate.same_eliminate 0.42% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.96% : 0.000002s : 10: predicate.shard_identity_eliminate 0.72% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 1.07% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.38% : 0.000003s : 23: predicate.switch_defer_inline 1.98% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.78% : 0.000012s : 74: predicate.switch_simplify 0.84% : 0.000002s : 15: predicate.tile_eliminate 0.92% : 0.000002s : 15: predicate.transpose_eliminate 1.41% : 0.000003s : 25: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.36% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.49% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.77% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.35% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.12% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 5: predicate.value_based_eliminate 0.74% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.69% : 0.000002s : 10: predicate.virtual_output_eliminate 0.30% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000541 11 49.90% : 0.000270s : 5: func_graph_cloner_run.FuncGraphClonerGraph 50.10% : 0.000271s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.072265 192 0.01% : 0.000004s : 1: ForceFp32Comm 4.23% : 0.003060s : 1: add_attr 4.22% : 0.003050s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.08% : 0.000061s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.10% : 0.000070s : 1: auto_monad 0.03% : 0.000023s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.56% : 0.000404s : 1: bootstrap 0.04% : 0.000031s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000018s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.04% : 0.000028s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.03% : 0.000024s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.64% : 0.000461s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.00% : 0.000724s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000022s : 1: opt.transform.mutable_eliminate 1.95% : 0.001409s : 78: opt.transform.opt_a 0.05% : 0.000036s : 1: opt.transform.opt_after_cconv 0.04% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.18% : 0.000134s : 28: opt.transform.opt_b 0.08% : 0.000058s : 2: opt.transform.opt_trans_graph 0.06% : 0.000043s : 4: opt.transform.symbol_engine_opt 25.62% : 0.018513s : 1: opt_a 0.17% : 0.000122s : 1: opt_after_cconv 0.68% : 0.000492s : 1: opt_after_jit_grad 0.35% : 0.000256s : 1: opt_b 28.90% : 0.020883s : 1: optimize 0.03% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.05% : 0.000036s : 1: pre_auto_parallel 0.04% : 0.000029s : 1: py_interpret_to_execute 0.02% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000019s : 1: remove_dup_value 21.71% : 0.015688s : 1: renormalize.infer 0.56% : 0.000408s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000048s : 1: rewriter_after_opt_a 0.12% : 0.000087s : 1: rewriter_before_opt_a 0.01% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.12% : 0.000085s : 1: symbol_engine_optimizer 0.12% : 0.000089s : 1: tuple_transform 7.59% : 0.005484s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:45.226.991 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:45.227.256 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0218148, [21] [bootstrap]: 0.00050382 [type_inference]: 0.00760953 [event_method]: 2.459e-05 [auto_monad]: 7.428e-05 [graph_reusing]: 6.23002e-06 [inline]: 3.45998e-06 [add_attr]: 0.00425403, [1] [add_attr_with_inline]: 0.00423948, [1] [Cycle 1]: 0.0001182, [2] [tag_attr]: 2.973e-05 [meta_addattr_fg_expand]: 7.40998e-06 [parallel-infer-symbol]: 4.38999e-06 [pre_auto_parallel]: 4.596e-05 [insert-virtual-dataset]: 2.66e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 2.34999e-06 [pipeline_split]: 1.81e-06 [optimize]: 0.00768532, [53] [py_interpret_to_execute]: 4.302e-05 [rewriter_before_opt_a]: 0.00011362 [opt_a]: 0.00435817, [2] [Cycle 1]: 0.00318212, [45] [expand_dump_flag]: 3.53999e-06 [switch_simplify]: 4.956e-05 [loop_unroll]: 3.613e-05 [a_1]: 0.00085297 [with_stream_mark]: 3.216e-05 [recompute_prepare]: 2.009e-05 [updatestate_depend_eliminate]: 5.74e-06 [updatestate_assign_eliminate]: 4.35e-06 [updatestate_loads_eliminate]: 3.91999e-06 [parameter_eliminate]: 3.43e-06 [a_2]: 0.0001404 [accelerated_algorithm]: 1.231e-05 [shard]: 3.54002e-06 [meta_shard_fg_expand]: 2.54999e-06 [shard_inline]: 9.20999e-06 [merge_send_recv]: 1.078e-05 [auto_parallel]: 1.175e-05 [parallel]: 2.538e-05 [flash_sp]: 1.401e-05 [merge_comm]: 5.89e-06 [allreduce_fusion]: 4.92e-06 [matmul_add_comm_reduction]: 1.367e-05 [allreduce_slice_to_reducescatter]: 7.59988e-07 [virtual_shard_identity]: 1.437e-05 [virtual_dataset]: 9.25999e-06 [get_grad_eliminate_]: 9.57999e-06 [virtual_output]: 8.70001e-06 [merge_forward]: 6.39001e-06 [cell_reuse_recompute_pass]: 2.49001e-06 [offload_activation]: 1.261e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.047e-05 [merge_recompute_call_nodes]: 1.66e-06 [before_grad]: 1.74e-05 [set_forward_comm_id_for_comm_node_pass]: 7.66001e-06 [meta_fg_expand]: 4.03001e-06 [flash_sp_send_recv_attached]: 4.67e-06 [receive_attached]: 2.83e-06 [after_resolve]: 2.021e-05 [a_after_grad]: 1.39e-05 [renormalize]: 0.0010779 [add_forward_monad_depend]: 9.97001e-06 [auto_monad_grad]: 3.16001e-06 [auto_monad_eliminator]: 2.551e-05 [cse]: 4.353e-05 [a_3]: 8.756e-05 [Cycle 2]: 0.00115466, [45] [expand_dump_flag]: 3.13998e-06 [switch_simplify]: 1.11e-05 [loop_unroll]: 8.15999e-06 [a_1]: 0.00019332 [with_stream_mark]: 2.584e-05 [recompute_prepare]: 1.047e-05 [updatestate_depend_eliminate]: 5.66e-06 [updatestate_assign_eliminate]: 4.02002e-06 [updatestate_loads_eliminate]: 3.31001e-06 [parameter_eliminate]: 2.42001e-06 [a_2]: 0.0001244 [accelerated_algorithm]: 9.00001e-06 [shard]: 3.22002e-06 [meta_shard_fg_expand]: 2.82002e-06 [shard_inline]: 8.78001e-06 [merge_send_recv]: 1.038e-05 [auto_parallel]: 1.044e-05 [parallel]: 9.20001e-06 [flash_sp]: 4.28999e-06 [merge_comm]: 5.92999e-06 [allreduce_fusion]: 4.57e-06 [matmul_add_comm_reduction]: 1.266e-05 [allreduce_slice_to_reducescatter]: 1.09e-06 [virtual_shard_identity]: 1.569e-05 [virtual_dataset]: 9.12999e-06 [get_grad_eliminate_]: 8.67e-06 [virtual_output]: 8.33999e-06 [merge_forward]: 6.02001e-06 [cell_reuse_recompute_pass]: 2.83998e-06 [offload_activation]: 1.363e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.42e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.49e-05 [set_forward_comm_id_for_comm_node_pass]: 8.09002e-06 [meta_fg_expand]: 3.63999e-06 [flash_sp_send_recv_attached]: 1.86e-06 [receive_attached]: 2.76e-06 [after_resolve]: 1.469e-05 [a_after_grad]: 1.481e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 5.22e-06 [auto_monad_grad]: 2.54001e-06 [auto_monad_eliminator]: 1.674e-05 [cse]: 3.116e-05 [a_3]: 6.56e-05 [py_interpret_to_execute_after_opt_a]: 2.911e-05 [slice_cell_reuse_recomputed_activation]: 4.85001e-06 [rewriter_after_opt_a]: 6.644e-05 [convert_after_rewriter]: 1.285e-05 [order_py_execute_after_rewriter]: 9.27999e-06 [mutable_eliminate]: 0.00089285 [opt_b]: 0.00037755, [1] [Cycle 1]: 0.00036358, [7] [b_1]: 0.00021461 [b_2]: 1.271e-05 [updatestate_depend_eliminate]: 1.3e-05 [updatestate_assign_eliminate]: 4.03999e-06 [updatestate_loads_eliminate]: 4.19002e-06 [renormalize]: 1.24e-06 [cse]: 4.449e-05 [optimize_parallel_all_gather_comm]: 3.177e-05 [overlap_param_gather]: 5.29998e-06 [cconv]: 4.666e-05 [loop_unroll]: 0.0006393 [opt_after_cconv]: 0.00016896, [1] [Cycle 1]: 0.0001579, [7] [c_1]: 4.224e-05 [parameter_eliminate]: 6.28e-06 [updatestate_depend_eliminate]: 1.052e-05 [updatestate_assign_eliminate]: 3.75e-06 [updatestate_loads_eliminate]: 3.25002e-06 [cse]: 3.217e-05 [renormalize]: 5.00004e-07 [remove_dup_value]: 2.108e-05 [tuple_transform]: 0.00011351, [1] [Cycle 1]: 0.00010519, [4] [d_1]: 6.041e-05 [none_parameter_eliminate]: 1.86e-06 [renormalize]: 4.80009e-07 [switch_simplify]: 8.94e-06 [partial_unused_args_eliminate]: 4.80999e-06 [add_recomputation]: 7.29e-05 [cse_after_recomputation]: 3.895e-05, [1] [Cycle 1]: 3.076e-05, [1] [cse]: 1.955e-05 [environ_conv]: 1.116e-05 [swap_dp_allreduce_reducescatter]: 1.048e-05 [bias_add_comm_swap]: 6.19001e-06 [label_micro_interleaved_index]: 8.89e-06 [label_fine_grained_interleaved_index]: 5.76e-06 [merge_cast_opt]: 3.88999e-06 [slice_recompute_activation]: 4.69998e-06 [micro_interleaved_order_control]: 4.75001e-06 [assign_add_opt]: 3.86001e-06 [ForceFp32Comm]: 3.70998e-06 [remove_cast_before_assign_add]: 3.35e-06 [full_micro_interleaved_order_control]: 5.35001e-06 [reorder_send_recv_between_fp_bp]: 5.62001e-06 [comm_op_add_attrs]: 3.45003e-06 [add_comm_op_reuse_tag]: 3.42997e-06 [interleave_split_concat_branches]: 3.85998e-06 [interleave_parallel_branches]: 3.45e-06 [overlap_opt_shard_in_pipeline]: 4.33999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.45e-06 [control_data_broadcast_order]: 2.398e-05 [grouped_pairwise_exchange_alltoall]: 4.14997e-06 [offloading_packed_experts]: 8.11002e-06 [overlap_recompute_and_grad_model_parallel]: 8.03999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.6e-06 [overlap_recompute_allgather_and_fa_grad]: 4.41002e-06 [overlap_recompute_comm]: 5.27999e-06 [overlap_grad_ring_attention]: 8.27e-06 [overlap_grad_flash_sp]: 3.113e-05 [begin_end_overlap_inline]: 3.45e-06 [split_matmul_comm_elemetwise]: 4.95999e-06 [split_layernorm_comm]: 3.98001e-06 [handle_group_info]: 3.53999e-06 [symbol_engine_optimizer]: 0.00013239, [1] [Cycle 1]: 0.00012292, [6] [build]: 5.77999e-06 [elim_shapecalc]: 1.625e-05 [elim_not_effective]: 1.984e-05 [opt_reshape]: 1.094e-05 [fold_const_symbol]: 1.332e-05 [renormalize]: 2.29978e-07 [detach_backward]: 6.14999e-06 [pipeline_parallel_scheduler]: 2.08002e-06 [auto_monad_reorder]: 3.228e-05 [get_jit_bprop_graph]: 2.49999e-06 [rewriter_after_jit_bprop_graph]: 7.88999e-06 [opt_after_jit_grad]: 0.00074111 [validate]: 5.873e-05 Sums bootstrap : 0.000504s : 3.28% type_inference : 0.007610s : 49.56% event_method : 0.000025s : 0.16% auto_monad : 0.000074s : 0.48% graph_reusing : 0.000006s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000030s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000046s : 0.30% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000043s : 0.28% optimize.rewriter_before_opt_a : 0.000114s : 0.74% optimize.opt_a.expand_dump_flag : 0.000007s : 0.04% optimize.opt_a.switch_simplify : 0.000061s : 0.40% optimize.opt_a.loop_unroll : 0.000044s : 0.29% optimize.opt_a.a_1 : 0.001046s : 6.81% optimize.opt_a.with_stream_mark : 0.000058s : 0.38% optimize.opt_a.recompute_prepare : 0.000031s : 0.20% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000006s : 0.04% optimize.opt_a.a_2 : 0.000265s : 1.72% optimize.opt_a.accelerated_algorithm : 0.000021s : 0.14% optimize.opt_a.shard : 0.000007s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.03% optimize.opt_a.shard_inline : 0.000018s : 0.12% optimize.opt_a.merge_send_recv : 0.000021s : 0.14% optimize.opt_a.auto_parallel : 0.000022s : 0.14% optimize.opt_a.parallel : 0.000035s : 0.23% optimize.opt_a.flash_sp : 0.000018s : 0.12% optimize.opt_a.merge_comm : 0.000012s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000030s : 0.20% optimize.opt_a.virtual_dataset : 0.000018s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.12% optimize.opt_a.virtual_output : 0.000017s : 0.11% optimize.opt_a.merge_forward : 0.000012s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.03% optimize.opt_a.offload_activation : 0.000026s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000055s : 0.36% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000032s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000016s : 0.10% optimize.opt_a.meta_fg_expand : 0.000008s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.04% optimize.opt_a.receive_attached : 0.000006s : 0.04% optimize.opt_a.after_resolve : 0.000035s : 0.23% optimize.opt_a.a_after_grad : 0.000029s : 0.19% optimize.opt_a.renormalize : 0.001078s : 7.02% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.10% optimize.opt_a.auto_monad_grad : 0.000006s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000042s : 0.28% optimize.opt_a.cse : 0.000075s : 0.49% optimize.opt_a.a_3 : 0.000153s : 1.00% optimize.py_interpret_to_execute_after_opt_a : 0.000029s : 0.19% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.03% optimize.rewriter_after_opt_a : 0.000066s : 0.43% optimize.convert_after_rewriter : 0.000013s : 0.08% optimize.order_py_execute_after_rewriter : 0.000009s : 0.06% optimize.mutable_eliminate : 0.000893s : 5.82% optimize.opt_b.b_1 : 0.000215s : 1.40% optimize.opt_b.b_2 : 0.000013s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000013s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000044s : 0.29% optimize.optimize_parallel_all_gather_comm : 0.000032s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.03% optimize.cconv : 0.000047s : 0.30% optimize.loop_unroll : 0.000639s : 4.16% optimize.opt_after_cconv.c_1 : 0.000042s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000011s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000032s : 0.21% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000021s : 0.14% optimize.tuple_transform.d_1 : 0.000060s : 0.39% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.03% optimize.add_recomputation : 0.000073s : 0.47% optimize.cse_after_recomputation.cse : 0.000020s : 0.13% optimize.environ_conv : 0.000011s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.04% optimize.label_micro_interleaved_index : 0.000009s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.03% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.02% optimize.remove_cast_before_assign_add : 0.000003s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000024s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.03% optimize.overlap_grad_ring_attention : 0.000008s : 0.05% optimize.overlap_grad_flash_sp : 0.000031s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.03% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.02% optimize.symbol_engine_optimizer.build : 0.000006s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000006s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000032s : 0.21% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000008s : 0.05% opt_after_jit_grad : 0.000741s : 4.83% validate : 0.000059s : 0.38% Time group info: ------[substitution.] 0.000336 38 11.23% : 0.000038s : 3: substitution.cast_eliminate 0.74% : 0.000002s : 3: substitution.elim_not_effective 0.52% : 0.000002s : 3: substitution.fold_const_symbol 2.29% : 0.000008s : 5: substitution.graph_param_transform 74.73% : 0.000251s : 4: substitution.inline 1.90% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.27% : 0.000008s : 6: substitution.remove_not_recompute_node 2.30% : 0.000008s : 4: substitution.replace_old_param 4.02% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007533 2 88.18% : 0.006643s : 1: type_inference.infer 11.82% : 0.000891s : 1: type_inference.specialize ------[replace.] 0.000080 8 62.02% : 0.000049s : 4: replace.inline 37.98% : 0.000030s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000259 8 95.48% : 0.000248s : 4: match.inline 4.52% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000286 1504 0.74% : 0.000002s : 15: predicate.accumulaten_eliminater 0.99% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 10: predicate.addn_check_dump 0.78% : 0.000002s : 15: predicate.addn_zero_filter 0.68% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.41% : 0.000007s : 25: predicate.arithmetic_simplify 1.26% : 0.000004s : 15: predicate.cast_eliminate 0.57% : 0.000002s : 10: predicate.check_bprop_eliminate 0.58% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.80% : 0.000002s : 10: predicate.depend_value_elim 0.86% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.88% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.93% : 0.000003s : 15: predicate.dict_set_item_eliminator 1.18% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 5: predicate.elim_not_effective 0.51% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 20: predicate.environ_add_const_eliminate 0.91% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.05% : 0.000003s : 20: predicate.environ_get_depend_swap 1.56% : 0.000004s : 30: predicate.environ_get_eliminate 0.98% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.32% : 0.000004s : 23: predicate.exchange_switch_depend_value 2.25% : 0.000006s : 23: predicate.float_depend_g_call 0.58% : 0.000002s : 10: predicate.float_environ_get_switch 1.02% : 0.000003s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.78% : 0.000002s : 10: predicate.get_grad_eliminate 0.24% : 0.000001s : 5: predicate.graph_param_transform 0.55% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.20% : 0.000018s : 68: predicate.inline 1.18% : 0.000003s : 10: predicate.inline_without_move 0.38% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.80% : 0.000002s : 10: predicate.less_batch_normalization 1.57% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.51% : 0.000007s : 44: predicate.load_eliminater 1.37% : 0.000004s : 5: predicate.loop_unroll_after_grad 2.07% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.65% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 0.64% : 0.000002s : 10: predicate.merge_addn 0.52% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.53% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.75% : 0.000002s : 15: predicate.minmaximum_grad 1.87% : 0.000005s : 5: predicate.mutable_eliminate 0.63% : 0.000002s : 5: predicate.opt_reshape 0.52% : 0.000001s : 5: predicate.parallel_virtual_node 1.89% : 0.000005s : 23: predicate.partial_defer_inline 1.39% : 0.000004s : 24: predicate.partial_eliminate 0.79% : 0.000002s : 15: predicate.print_const_string_wrapper 0.61% : 0.000002s : 10: predicate.reduce_all_const_elim 0.99% : 0.000003s : 15: predicate.reduce_eliminate 2.16% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.59% : 0.000002s : 10: predicate.remove_not_recompute_node 1.26% : 0.000004s : 29: predicate.replace_applicator 0.57% : 0.000002s : 10: predicate.replace_old_param 0.56% : 0.000002s : 5: predicate.reset_defer_inline 0.92% : 0.000003s : 15: predicate.reshape_eliminate 0.73% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 5: predicate.row_tensor_eliminate 0.94% : 0.000003s : 10: predicate.same_eliminate 0.51% : 0.000001s : 10: predicate.set_cell_output_no_recompute 1.03% : 0.000003s : 10: predicate.shard_identity_eliminate 0.63% : 0.000002s : 10: predicate.special_op_eliminate 0.83% : 0.000002s : 10: predicate.specialize_transform 1.03% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 1.24% : 0.000004s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.29% : 0.000004s : 23: predicate.switch_defer_inline 1.76% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.66% : 0.000013s : 74: predicate.switch_simplify 0.79% : 0.000002s : 15: predicate.tile_eliminate 0.84% : 0.000002s : 15: predicate.transpose_eliminate 1.33% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.39% : 0.000010s : 39: predicate.tuple_list_get_item_eliminator 1.40% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.33% : 0.000007s : 35: predicate.tuple_list_set_item_eliminator 1.45% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.12% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.12% : 0.000009s : 54: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 5: predicate.value_based_eliminate 0.75% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.86% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.56% : 0.000002s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000707 11 53.33% : 0.000377s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.67% : 0.000330s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.036719 192 0.02% : 0.000007s : 1: ForceFp32Comm 11.62% : 0.004268s : 1: add_attr 11.56% : 0.004244s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000078s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.24% : 0.000087s : 1: auto_monad 0.11% : 0.000041s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000009s : 1: bias_add_comm_swap 1.52% : 0.000557s : 1: bootstrap 0.14% : 0.000050s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000028s : 1: control_data_broadcast_order 0.05% : 0.000017s : 1: convert_after_rewriter 0.12% : 0.000043s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.09% : 0.000034s : 1: detach_backward 0.04% : 0.000014s : 1: environ_conv 0.10% : 0.000036s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.02% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.02% : 0.000009s : 1: inline 0.03% : 0.000010s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.02% : 0.000009s : 1: label_fine_grained_interleaved_index 0.03% : 0.000012s : 1: label_micro_interleaved_index 1.77% : 0.000650s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 2.46% : 0.000905s : 1: mutable_eliminate 0.03% : 0.000011s : 1: offloading_packed_experts 0.07% : 0.000026s : 1: opt.transform.loop_unroll_optimizer 0.09% : 0.000032s : 1: opt.transform.mutable_eliminate 4.54% : 0.001667s : 78: opt.transform.opt_a 0.11% : 0.000040s : 1: opt.transform.opt_after_cconv 0.11% : 0.000040s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000147s : 28: opt.transform.opt_b 0.18% : 0.000067s : 2: opt.transform.opt_trans_graph 0.15% : 0.000055s : 4: opt.transform.symbol_engine_opt 11.98% : 0.004400s : 1: opt_a 0.47% : 0.000173s : 1: opt_after_cconv 2.06% : 0.000756s : 1: opt_after_jit_grad 1.04% : 0.000382s : 1: opt_b 22.10% : 0.008114s : 1: optimize 0.10% : 0.000036s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.10% : 0.000035s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000012s : 1: overlap_grad_ring_attention 0.02% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000012s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000055s : 1: pre_auto_parallel 0.13% : 0.000048s : 1: py_interpret_to_execute 0.09% : 0.000033s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.07% : 0.000024s : 1: remove_dup_value 1.65% : 0.000606s : 1: renormalize.infer 1.24% : 0.000457s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000071s : 1: rewriter_after_opt_a 0.32% : 0.000119s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000014s : 1: swap_dp_allreduce_reducescatter 0.37% : 0.000135s : 1: symbol_engine_optimizer 0.32% : 0.000116s : 1: tuple_transform 20.88% : 0.007668s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:45.483.039 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.015749, [21] [bootstrap]: 0.00043046 [type_inference]: 0.00583805 [event_method]: 1.924e-05 [auto_monad]: 6.301e-05 [graph_reusing]: 5.91e-06 [inline]: 2.59999e-06 [add_attr]: 0.00330609, [1] [add_attr_with_inline]: 0.00329641, [1] [Cycle 1]: 6.747e-05, [2] [tag_attr]: 2.148e-05 [meta_addattr_fg_expand]: 5.87001e-06 [parallel-infer-symbol]: 3.41999e-06 [pre_auto_parallel]: 3.627e-05 [insert-virtual-dataset]: 2.49999e-06 [parallel-infer-symbol-second]: 1.17e-06 [dataset_repeat_opt]: 1.97999e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00533517, [53] [py_interpret_to_execute]: 2.691e-05 [rewriter_before_opt_a]: 8.632e-05 [opt_a]: 0.00310688, [2] [Cycle 1]: 0.00229436, [45] [expand_dump_flag]: 3.54002e-06 [switch_simplify]: 4.451e-05 [loop_unroll]: 3.126e-05 [a_1]: 0.00067743 [with_stream_mark]: 1.702e-05 [recompute_prepare]: 9.92999e-06 [updatestate_depend_eliminate]: 4.67e-06 [updatestate_assign_eliminate]: 4.14002e-06 [updatestate_loads_eliminate]: 3.96001e-06 [parameter_eliminate]: 1.85001e-06 [a_2]: 0.00010373 [accelerated_algorithm]: 8.80999e-06 [shard]: 2.32999e-06 [meta_shard_fg_expand]: 1.99999e-06 [shard_inline]: 7.92e-06 [merge_send_recv]: 9.35001e-06 [auto_parallel]: 7.08e-06 [parallel]: 1.912e-05 [flash_sp]: 8.85999e-06 [merge_comm]: 4.60001e-06 [allreduce_fusion]: 4.57e-06 [matmul_add_comm_reduction]: 1.02e-05 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 1.062e-05 [virtual_dataset]: 8.15999e-06 [get_grad_eliminate_]: 7.6e-06 [virtual_output]: 7.82e-06 [merge_forward]: 5.05001e-06 [cell_reuse_recompute_pass]: 1.35999e-06 [offload_activation]: 1.17e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.546e-05 [merge_recompute_call_nodes]: 1.76e-06 [before_grad]: 1.282e-05 [set_forward_comm_id_for_comm_node_pass]: 4.27e-06 [meta_fg_expand]: 3.38999e-06 [flash_sp_send_recv_attached]: 2.48998e-06 [receive_attached]: 2.40002e-06 [after_resolve]: 1.216e-05 [a_after_grad]: 1.194e-05 [renormalize]: 0.00080958 [add_forward_monad_depend]: 6.36e-06 [auto_monad_grad]: 2.37999e-06 [auto_monad_eliminator]: 1.857e-05 [cse]: 3.886e-05 [a_3]: 6.022e-05 [Cycle 2]: 0.00080159, [45] [expand_dump_flag]: 1.45999e-06 [switch_simplify]: 9.76e-06 [loop_unroll]: 7.35e-06 [a_1]: 0.00017751 [with_stream_mark]: 1.381e-05 [recompute_prepare]: 8.18999e-06 [updatestate_depend_eliminate]: 3.86999e-06 [updatestate_assign_eliminate]: 2.88998e-06 [updatestate_loads_eliminate]: 2.76e-06 [parameter_eliminate]: 1.05999e-06 [a_2]: 9.275e-05 [accelerated_algorithm]: 8.38999e-06 [shard]: 1.45001e-06 [meta_shard_fg_expand]: 1.90001e-06 [shard_inline]: 7.66999e-06 [merge_send_recv]: 9.72001e-06 [auto_parallel]: 7.61001e-06 [parallel]: 5.91998e-06 [flash_sp]: 3.28998e-06 [merge_comm]: 5.08002e-06 [allreduce_fusion]: 4.53999e-06 [matmul_add_comm_reduction]: 6.41e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 8.60001e-06 [virtual_dataset]: 7.91001e-06 [get_grad_eliminate_]: 7.04001e-06 [virtual_output]: 7.08e-06 [merge_forward]: 3.99997e-06 [cell_reuse_recompute_pass]: 1.87999e-06 [offload_activation]: 8.18001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.546e-05 [merge_recompute_call_nodes]: 1.05001e-06 [before_grad]: 1.271e-05 [set_forward_comm_id_for_comm_node_pass]: 4.47998e-06 [meta_fg_expand]: 3.05002e-06 [flash_sp_send_recv_attached]: 1.06002e-06 [receive_attached]: 1.34998e-06 [after_resolve]: 1.209e-05 [a_after_grad]: 1.139e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.54998e-06 [auto_monad_grad]: 1.24e-06 [auto_monad_eliminator]: 8.94998e-06 [cse]: 1.921e-05 [a_3]: 4.668e-05 [py_interpret_to_execute_after_opt_a]: 1.345e-05 [slice_cell_reuse_recomputed_activation]: 2.16e-06 [rewriter_after_opt_a]: 4.227e-05 [convert_after_rewriter]: 7.45e-06 [order_py_execute_after_rewriter]: 6.06998e-06 [mutable_eliminate]: 0.00060017 [opt_b]: 0.00024518, [1] [Cycle 1]: 0.00023838, [7] [b_1]: 0.00015657 [b_2]: 9.79e-06 [updatestate_depend_eliminate]: 6.34999e-06 [updatestate_assign_eliminate]: 3.05998e-06 [updatestate_loads_eliminate]: 2.84001e-06 [renormalize]: 5.79981e-07 [cse]: 2.255e-05 [optimize_parallel_all_gather_comm]: 1.716e-05 [overlap_param_gather]: 1.85001e-06 [cconv]: 2.599e-05 [loop_unroll]: 0.0004355 [opt_after_cconv]: 0.0001165, [1] [Cycle 1]: 0.00011089, [7] [c_1]: 3.805e-05 [parameter_eliminate]: 3.35e-06 [updatestate_depend_eliminate]: 5.76e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 2.86e-06 [cse]: 2.287e-05 [renormalize]: 2.69996e-07 [remove_dup_value]: 1.532e-05 [tuple_transform]: 8.644e-05, [1] [Cycle 1]: 8.198e-05, [4] [d_1]: 5.301e-05 [none_parameter_eliminate]: 1.70001e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 8.48001e-06 [partial_unused_args_eliminate]: 1.76e-06 [add_recomputation]: 5.684e-05 [cse_after_recomputation]: 2.557e-05, [1] [Cycle 1]: 2.079e-05, [1] [cse]: 1.548e-05 [environ_conv]: 6.41998e-06 [swap_dp_allreduce_reducescatter]: 5.76e-06 [bias_add_comm_swap]: 3.4e-06 [label_micro_interleaved_index]: 4.19002e-06 [label_fine_grained_interleaved_index]: 2.66999e-06 [merge_cast_opt]: 1.24998e-06 [slice_recompute_activation]: 2.37999e-06 [micro_interleaved_order_control]: 2.14e-06 [assign_add_opt]: 1.50999e-06 [ForceFp32Comm]: 8.00006e-07 [remove_cast_before_assign_add]: 9.70002e-07 [full_micro_interleaved_order_control]: 2.53998e-06 [reorder_send_recv_between_fp_bp]: 2.96001e-06 [comm_op_add_attrs]: 1.32999e-06 [add_comm_op_reuse_tag]: 1.04e-06 [interleave_split_concat_branches]: 1.16002e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.24e-06 [overlap_opt_shard_grad_in_pipeline]: 1.83002e-06 [control_data_broadcast_order]: 1.471e-05 [grouped_pairwise_exchange_alltoall]: 1.53002e-06 [offloading_packed_experts]: 4.30999e-06 [overlap_recompute_and_grad_model_parallel]: 5.24e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.64e-06 [overlap_recompute_allgather_and_fa_grad]: 1.64998e-06 [overlap_recompute_comm]: 2.83998e-06 [overlap_grad_ring_attention]: 4.65999e-06 [overlap_grad_flash_sp]: 2.358e-05 [begin_end_overlap_inline]: 5.70028e-07 [split_matmul_comm_elemetwise]: 2.27001e-06 [split_layernorm_comm]: 1.62001e-06 [handle_group_info]: 1.34998e-06 [symbol_engine_optimizer]: 8.475e-05, [1] [Cycle 1]: 8.045e-05, [6] [build]: 3.23998e-06 [elim_shapecalc]: 1.159e-05 [elim_not_effective]: 1.574e-05 [opt_reshape]: 8.54998e-06 [fold_const_symbol]: 1.247e-05 [renormalize]: 2.3999e-07 [detach_backward]: 1.99e-06 [pipeline_parallel_scheduler]: 1.68002e-06 [auto_monad_reorder]: 2.044e-05 [get_jit_bprop_graph]: 1.82999e-06 [rewriter_after_jit_bprop_graph]: 4.43001e-06 [opt_after_jit_grad]: 0.00047088 [validate]: 4.415e-05 Sums bootstrap : 0.000430s : 3.76% type_inference : 0.005838s : 50.99% event_method : 0.000019s : 0.17% auto_monad : 0.000063s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000036s : 0.32% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000027s : 0.24% optimize.rewriter_before_opt_a : 0.000086s : 0.75% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000054s : 0.47% optimize.opt_a.loop_unroll : 0.000039s : 0.34% optimize.opt_a.a_1 : 0.000855s : 7.47% optimize.opt_a.with_stream_mark : 0.000031s : 0.27% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000196s : 1.72% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.15% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.14% optimize.opt_a.merge_send_recv : 0.000019s : 0.17% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000025s : 0.22% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.17% optimize.opt_a.virtual_dataset : 0.000016s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.21% optimize.opt_a.a_after_grad : 0.000023s : 0.20% optimize.opt_a.renormalize : 0.000810s : 7.07% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.24% optimize.opt_a.cse : 0.000058s : 0.51% optimize.opt_a.a_3 : 0.000107s : 0.93% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000042s : 0.37% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000600s : 5.24% optimize.opt_b.b_1 : 0.000157s : 1.37% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000026s : 0.23% optimize.loop_unroll : 0.000435s : 3.80% optimize.opt_after_cconv.c_1 : 0.000038s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000023s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.13% optimize.tuple_transform.d_1 : 0.000053s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000057s : 0.50% optimize.cse_after_recomputation.cse : 0.000015s : 0.14% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000024s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000471s : 4.11% validate : 0.000044s : 0.39% Time group info: ------[substitution.] 0.000211 38 11.86% : 0.000025s : 3: substitution.cast_eliminate 1.10% : 0.000002s : 3: substitution.elim_not_effective 0.85% : 0.000002s : 3: substitution.fold_const_symbol 3.33% : 0.000007s : 5: substitution.graph_param_transform 69.41% : 0.000146s : 4: substitution.inline 2.32% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.98% : 0.000006s : 6: substitution.remove_not_recompute_node 2.13% : 0.000004s : 4: substitution.replace_old_param 6.02% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005780 2 87.99% : 0.005086s : 1: type_inference.infer 12.01% : 0.000694s : 1: type_inference.specialize ------[replace.] 0.000061 8 63.01% : 0.000038s : 4: replace.inline 36.99% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000155 8 92.92% : 0.000144s : 4: match.inline 7.08% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000239 1504 0.99% : 0.000002s : 15: predicate.accumulaten_eliminater 0.73% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 10: predicate.addn_check_dump 0.94% : 0.000002s : 15: predicate.addn_zero_filter 0.83% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.19% : 0.000005s : 25: predicate.arithmetic_simplify 1.14% : 0.000003s : 15: predicate.cast_eliminate 0.64% : 0.000002s : 10: predicate.check_bprop_eliminate 0.64% : 0.000002s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.66% : 0.000002s : 10: predicate.depend_value_elim 0.97% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.05% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.94% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 5: predicate.elim_not_effective 0.37% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.19% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_depend_swap 1.73% : 0.000004s : 30: predicate.environ_get_eliminate 1.13% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.37% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.21% : 0.000005s : 23: predicate.float_depend_g_call 0.59% : 0.000001s : 10: predicate.float_environ_get_switch 0.83% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.66% : 0.000002s : 10: predicate.get_grad_eliminate 0.35% : 0.000001s : 5: predicate.graph_param_transform 0.66% : 0.000002s : 10: predicate.incorporate_call 0.55% : 0.000001s : 10: predicate.incorporate_call_switch 6.25% : 0.000015s : 68: predicate.inline 0.94% : 0.000002s : 10: predicate.inline_without_move 0.36% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.87% : 0.000002s : 10: predicate.less_batch_normalization 1.90% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.47% : 0.000006s : 44: predicate.load_eliminater 0.85% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.13% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.71% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.58% : 0.000001s : 10: predicate.merge_addn 0.62% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.76% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 15: predicate.minmaximum_grad 0.83% : 0.000002s : 5: predicate.mutable_eliminate 0.39% : 0.000001s : 5: predicate.opt_reshape 0.36% : 0.000001s : 5: predicate.parallel_virtual_node 1.67% : 0.000004s : 23: predicate.partial_defer_inline 1.64% : 0.000004s : 24: predicate.partial_eliminate 0.88% : 0.000002s : 15: predicate.print_const_string_wrapper 0.63% : 0.000002s : 10: predicate.reduce_all_const_elim 1.13% : 0.000003s : 15: predicate.reduce_eliminate 2.61% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 10: predicate.remove_not_recompute_node 1.36% : 0.000003s : 29: predicate.replace_applicator 0.41% : 0.000001s : 10: predicate.replace_old_param 0.28% : 0.000001s : 5: predicate.reset_defer_inline 0.94% : 0.000002s : 15: predicate.reshape_eliminate 0.62% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 5: predicate.row_tensor_eliminate 0.77% : 0.000002s : 10: predicate.same_eliminate 0.46% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.95% : 0.000002s : 10: predicate.shard_identity_eliminate 0.72% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 0.94% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.46% : 0.000004s : 23: predicate.switch_defer_inline 1.98% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.82% : 0.000012s : 74: predicate.switch_simplify 0.95% : 0.000002s : 15: predicate.tile_eliminate 0.89% : 0.000002s : 15: predicate.transpose_eliminate 1.52% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.64% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.27% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.72% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.51% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.22% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.67% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.73% : 0.000002s : 10: predicate.virtual_output_eliminate 0.31% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000544 11 52.89% : 0.000288s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.11% : 0.000256s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026778 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.37% : 0.003312s : 1: add_attr 12.33% : 0.003301s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000061s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000070s : 1: auto_monad 0.09% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.72% : 0.000460s : 1: bootstrap 0.11% : 0.000030s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.10% : 0.000026s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.66% : 0.000443s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.27% : 0.000608s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 5.15% : 0.001379s : 78: opt.transform.opt_a 0.14% : 0.000036s : 1: opt.transform.opt_after_cconv 0.10% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.50% : 0.000134s : 28: opt.transform.opt_b 0.22% : 0.000059s : 2: opt.transform.opt_trans_graph 0.17% : 0.000044s : 4: opt.transform.symbol_engine_opt 11.62% : 0.003110s : 1: opt_a 0.45% : 0.000120s : 1: opt_after_cconv 1.79% : 0.000479s : 1: opt_after_jit_grad 0.93% : 0.000249s : 1: opt_b 19.94% : 0.005341s : 1: optimize 0.08% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000040s : 1: pre_auto_parallel 0.12% : 0.000031s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.67% : 0.000448s : 1: renormalize.infer 1.32% : 0.000353s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000046s : 1: rewriter_after_opt_a 0.34% : 0.000091s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000088s : 1: symbol_engine_optimizer 0.33% : 0.000089s : 1: tuple_transform 21.87% : 0.005857s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:45.886.525 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:45.886.817 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0389372, [21] [bootstrap]: 0.00043543 [type_inference]: 0.0063962 [event_method]: 2.214e-05 [auto_monad]: 6.759e-05 [graph_reusing]: 2.138e-05 [inline]: 3.09999e-06 [add_attr]: 0.00323544, [1] [add_attr_with_inline]: 0.0032251, [1] [Cycle 1]: 8.793e-05, [2] [tag_attr]: 2.122e-05 [meta_addattr_fg_expand]: 6.71999e-06 [parallel-infer-symbol]: 3.24001e-06 [pre_auto_parallel]: 3.788e-05 [insert-virtual-dataset]: 2.37001e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 2.15002e-06 [pipeline_split]: 1.52001e-06 [optimize]: 0.0210831, [53] [py_interpret_to_execute]: 3.161e-05 [rewriter_before_opt_a]: 8.971e-05 [opt_a]: 0.0183354, [2] [Cycle 1]: 0.0172356, [45] [expand_dump_flag]: 3.13e-06 [switch_simplify]: 4.452e-05 [loop_unroll]: 3.128e-05 [a_1]: 0.0007102 [with_stream_mark]: 1.864e-05 [recompute_prepare]: 9.96e-06 [updatestate_depend_eliminate]: 5.14998e-06 [updatestate_assign_eliminate]: 3.94002e-06 [updatestate_loads_eliminate]: 3.4e-06 [parameter_eliminate]: 1.91e-06 [a_2]: 0.00013268 [accelerated_algorithm]: 9.05001e-06 [shard]: 1.98997e-06 [meta_shard_fg_expand]: 2.11e-06 [shard_inline]: 8.27e-06 [merge_send_recv]: 1.003e-05 [auto_parallel]: 8.25999e-06 [parallel]: 1.965e-05 [flash_sp]: 9.72999e-06 [merge_comm]: 6.09999e-06 [allreduce_fusion]: 4.89e-06 [matmul_add_comm_reduction]: 1.023e-05 [allreduce_slice_to_reducescatter]: 7.99977e-07 [virtual_shard_identity]: 1.062e-05 [virtual_dataset]: 8.11002e-06 [get_grad_eliminate_]: 7.68001e-06 [virtual_output]: 8.38999e-06 [merge_forward]: 4.68999e-06 [cell_reuse_recompute_pass]: 1.20999e-06 [offload_activation]: 1.268e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.042e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.455e-05 [set_forward_comm_id_for_comm_node_pass]: 5.22999e-06 [meta_fg_expand]: 3.34001e-06 [flash_sp_send_recv_attached]: 2.86999e-06 [receive_attached]: 2.34001e-06 [after_resolve]: 1.535e-05 [a_after_grad]: 1.338e-05 [renormalize]: 0.0154514 [add_forward_monad_depend]: 7.91001e-06 [auto_monad_grad]: 2.71999e-06 [auto_monad_eliminator]: 1.969e-05 [cse]: 3.963e-05 [a_3]: 9.622e-05 [Cycle 2]: 0.00108138, [45] [expand_dump_flag]: 2.19999e-06 [switch_simplify]: 1.128e-05 [loop_unroll]: 8.83001e-06 [a_1]: 0.00019532 [with_stream_mark]: 2.163e-05 [recompute_prepare]: 9.87999e-06 [updatestate_depend_eliminate]: 4.63999e-06 [updatestate_assign_eliminate]: 3.89002e-06 [updatestate_loads_eliminate]: 3.55e-06 [parameter_eliminate]: 1.81e-06 [a_2]: 0.00012956 [accelerated_algorithm]: 8.11002e-06 [shard]: 2.54001e-06 [meta_shard_fg_expand]: 2.76999e-06 [shard_inline]: 1.096e-05 [merge_send_recv]: 9.61998e-06 [auto_parallel]: 9.98002e-06 [parallel]: 8.85999e-06 [flash_sp]: 4.25e-06 [merge_comm]: 4.47e-06 [allreduce_fusion]: 4.05e-06 [matmul_add_comm_reduction]: 1.028e-05 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 1.193e-05 [virtual_dataset]: 8.15e-06 [get_grad_eliminate_]: 7.74002e-06 [virtual_output]: 8.15999e-06 [merge_forward]: 6.96001e-06 [cell_reuse_recompute_pass]: 3.66999e-06 [offload_activation]: 1.159e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.082e-05 [merge_recompute_call_nodes]: 1.99e-06 [before_grad]: 1.359e-05 [set_forward_comm_id_for_comm_node_pass]: 4.70001e-06 [meta_fg_expand]: 3.21999e-06 [flash_sp_send_recv_attached]: 1.74998e-06 [receive_attached]: 2.46e-06 [after_resolve]: 1.484e-05 [a_after_grad]: 1.265e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 2.32999e-06 [auto_monad_grad]: 1.69e-06 [auto_monad_eliminator]: 1.363e-05 [cse]: 2.343e-05 [a_3]: 6.329e-05 [py_interpret_to_execute_after_opt_a]: 2.173e-05 [slice_cell_reuse_recomputed_activation]: 5.27001e-06 [rewriter_after_opt_a]: 5.057e-05 [convert_after_rewriter]: 1.18e-05 [order_py_execute_after_rewriter]: 9.03002e-06 [mutable_eliminate]: 0.0007223 [opt_b]: 0.00034729, [1] [Cycle 1]: 0.00033566, [7] [b_1]: 0.00021879 [b_2]: 9.74e-06 [updatestate_depend_eliminate]: 9.10999e-06 [updatestate_assign_eliminate]: 3.82002e-06 [updatestate_loads_eliminate]: 3.53999e-06 [renormalize]: 8.2e-07 [cse]: 2.916e-05 [optimize_parallel_all_gather_comm]: 2.295e-05 [overlap_param_gather]: 5.03002e-06 [cconv]: 3.597e-05 [loop_unroll]: 0.0004689 [opt_after_cconv]: 0.0001463, [1] [Cycle 1]: 0.000137, [7] [c_1]: 3.815e-05 [parameter_eliminate]: 4.94e-06 [updatestate_depend_eliminate]: 6.94001e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 2.90002e-06 [cse]: 2.404e-05 [renormalize]: 3.19997e-07 [remove_dup_value]: 2.027e-05 [tuple_transform]: 0.00010492, [1] [Cycle 1]: 9.711e-05, [4] [d_1]: 5.52e-05 [none_parameter_eliminate]: 1.45001e-06 [renormalize]: 3.80009e-07 [switch_simplify]: 9.09e-06 [partial_unused_args_eliminate]: 4.37e-06 [add_recomputation]: 6.37e-05 [cse_after_recomputation]: 4.683e-05, [1] [Cycle 1]: 2.519e-05, [1] [cse]: 1.541e-05 [environ_conv]: 9.95002e-06 [swap_dp_allreduce_reducescatter]: 9.51e-06 [bias_add_comm_swap]: 5.56e-06 [label_micro_interleaved_index]: 7.16999e-06 [label_fine_grained_interleaved_index]: 5.12e-06 [merge_cast_opt]: 3.72002e-06 [slice_recompute_activation]: 4.35999e-06 [micro_interleaved_order_control]: 5.02e-06 [assign_add_opt]: 3.83001e-06 [ForceFp32Comm]: 3.26001e-06 [remove_cast_before_assign_add]: 3.37002e-06 [full_micro_interleaved_order_control]: 4.48999e-06 [reorder_send_recv_between_fp_bp]: 5.17e-06 [comm_op_add_attrs]: 3.71999e-06 [add_comm_op_reuse_tag]: 3.37997e-06 [interleave_split_concat_branches]: 3.76999e-06 [interleave_parallel_branches]: 3.83001e-06 [overlap_opt_shard_in_pipeline]: 5.04e-06 [overlap_opt_shard_grad_in_pipeline]: 4.18001e-06 [control_data_broadcast_order]: 2.046e-05 [grouped_pairwise_exchange_alltoall]: 4.17e-06 [offloading_packed_experts]: 8.17998e-06 [overlap_recompute_and_grad_model_parallel]: 8.23001e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.22e-06 [overlap_recompute_allgather_and_fa_grad]: 3.94002e-06 [overlap_recompute_comm]: 5.19e-06 [overlap_grad_ring_attention]: 7.77e-06 [overlap_grad_flash_sp]: 2.933e-05 [begin_end_overlap_inline]: 3.55998e-06 [split_matmul_comm_elemetwise]: 4.82e-06 [split_layernorm_comm]: 4.41002e-06 [handle_group_info]: 3.6e-06 [symbol_engine_optimizer]: 0.00011269, [1] [Cycle 1]: 0.00010442, [6] [build]: 4.55001e-06 [elim_shapecalc]: 1.364e-05 [elim_not_effective]: 1.752e-05 [opt_reshape]: 8.57e-06 [fold_const_symbol]: 1.248e-05 [renormalize]: 2.10013e-07 [detach_backward]: 3.85e-06 [pipeline_parallel_scheduler]: 2.19001e-06 [auto_monad_reorder]: 2.206e-05 [get_jit_bprop_graph]: 2.78e-06 [rewriter_after_jit_bprop_graph]: 5.94e-06 [opt_after_jit_grad]: 0.0005522 [validate]: 4.643e-05 Sums bootstrap : 0.000435s : 1.59% type_inference : 0.006396s : 23.32% event_method : 0.000022s : 0.08% auto_monad : 0.000068s : 0.25% graph_reusing : 0.000021s : 0.08% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000038s : 0.14% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000032s : 0.12% optimize.rewriter_before_opt_a : 0.000090s : 0.33% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000056s : 0.20% optimize.opt_a.loop_unroll : 0.000040s : 0.15% optimize.opt_a.a_1 : 0.000906s : 3.30% optimize.opt_a.with_stream_mark : 0.000040s : 0.15% optimize.opt_a.recompute_prepare : 0.000020s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000262s : 0.96% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.06% optimize.opt_a.shard : 0.000005s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000019s : 0.07% optimize.opt_a.merge_send_recv : 0.000020s : 0.07% optimize.opt_a.auto_parallel : 0.000018s : 0.07% optimize.opt_a.parallel : 0.000029s : 0.10% optimize.opt_a.flash_sp : 0.000014s : 0.05% optimize.opt_a.merge_comm : 0.000011s : 0.04% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.08% optimize.opt_a.virtual_dataset : 0.000016s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.06% optimize.opt_a.virtual_output : 0.000017s : 0.06% optimize.opt_a.merge_forward : 0.000012s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000024s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000041s : 0.15% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.01% optimize.opt_a.before_grad : 0.000028s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.04% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000030s : 0.11% optimize.opt_a.a_after_grad : 0.000026s : 0.09% optimize.opt_a.renormalize : 0.015451s : 56.33% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.12% optimize.opt_a.cse : 0.000063s : 0.23% optimize.opt_a.a_3 : 0.000160s : 0.58% optimize.py_interpret_to_execute_after_opt_a : 0.000022s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000051s : 0.18% optimize.convert_after_rewriter : 0.000012s : 0.04% optimize.order_py_execute_after_rewriter : 0.000009s : 0.03% optimize.mutable_eliminate : 0.000722s : 2.63% optimize.opt_b.b_1 : 0.000219s : 0.80% optimize.opt_b.b_2 : 0.000010s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000029s : 0.11% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.08% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000036s : 0.13% optimize.loop_unroll : 0.000469s : 1.71% optimize.opt_after_cconv.c_1 : 0.000038s : 0.14% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000024s : 0.09% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.07% optimize.tuple_transform.d_1 : 0.000055s : 0.20% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000004s : 0.02% optimize.add_recomputation : 0.000064s : 0.23% optimize.cse_after_recomputation.cse : 0.000015s : 0.06% optimize.environ_conv : 0.000010s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.03% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000004s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000004s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000005s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000020s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000008s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000008s : 0.03% optimize.overlap_grad_flash_sp : 0.000029s : 0.11% optimize.begin_end_overlap_inline : 0.000004s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.08% get_jit_bprop_graph : 0.000003s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000552s : 2.01% validate : 0.000046s : 0.17% Time group info: ------[substitution.] 0.000226 38 13.10% : 0.000030s : 3: substitution.cast_eliminate 1.35% : 0.000003s : 3: substitution.elim_not_effective 0.75% : 0.000002s : 3: substitution.fold_const_symbol 3.58% : 0.000008s : 5: substitution.graph_param_transform 67.05% : 0.000151s : 4: substitution.inline 2.57% : 0.000006s : 6: substitution.j_node_and_user_rematch 3.13% : 0.000007s : 6: substitution.remove_not_recompute_node 2.87% : 0.000006s : 4: substitution.replace_old_param 5.59% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006337 2 88.12% : 0.005584s : 1: type_inference.infer 11.88% : 0.000753s : 1: type_inference.specialize ------[replace.] 0.000063 8 61.68% : 0.000039s : 4: replace.inline 38.32% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000160 8 93.24% : 0.000149s : 4: match.inline 6.76% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000282 1504 0.82% : 0.000002s : 15: predicate.accumulaten_eliminater 0.75% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 10: predicate.addn_check_dump 0.84% : 0.000002s : 15: predicate.addn_zero_filter 0.70% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.05% : 0.000006s : 25: predicate.arithmetic_simplify 0.85% : 0.000002s : 15: predicate.cast_eliminate 0.70% : 0.000002s : 10: predicate.check_bprop_eliminate 0.55% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.57% : 0.000002s : 10: predicate.depend_value_elim 0.80% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.90% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.79% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.93% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 5: predicate.elim_not_effective 0.29% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.33% : 0.000004s : 20: predicate.environ_add_const_eliminate 1.03% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.04% : 0.000003s : 20: predicate.environ_get_depend_swap 1.56% : 0.000004s : 30: predicate.environ_get_eliminate 0.97% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.19% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.05% : 0.000006s : 23: predicate.float_depend_g_call 0.56% : 0.000002s : 10: predicate.float_environ_get_switch 0.81% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.66% : 0.000002s : 10: predicate.get_grad_eliminate 0.27% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 5.53% : 0.000016s : 68: predicate.inline 0.92% : 0.000003s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.74% : 0.000002s : 10: predicate.less_batch_normalization 1.71% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.22% : 0.000006s : 44: predicate.load_eliminater 0.91% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.94% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.67% : 0.000005s : 25: predicate.make_slice_get_slice_eliminator 0.55% : 0.000002s : 10: predicate.merge_addn 0.60% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 10: predicate.mini_step_allgather_replace 8.70% : 0.000025s : 15: predicate.minmaximum_grad 1.43% : 0.000004s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.36% : 0.000001s : 5: predicate.parallel_virtual_node 1.53% : 0.000004s : 23: predicate.partial_defer_inline 1.41% : 0.000004s : 24: predicate.partial_eliminate 0.76% : 0.000002s : 15: predicate.print_const_string_wrapper 0.55% : 0.000002s : 10: predicate.reduce_all_const_elim 1.03% : 0.000003s : 15: predicate.reduce_eliminate 2.14% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 10: predicate.remove_not_recompute_node 1.24% : 0.000003s : 29: predicate.replace_applicator 0.46% : 0.000001s : 10: predicate.replace_old_param 0.30% : 0.000001s : 5: predicate.reset_defer_inline 0.85% : 0.000002s : 15: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 5: predicate.row_tensor_eliminate 0.85% : 0.000002s : 10: predicate.same_eliminate 0.42% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 10: predicate.shard_identity_eliminate 0.66% : 0.000002s : 10: predicate.special_op_eliminate 0.67% : 0.000002s : 10: predicate.specialize_transform 1.36% : 0.000004s : 10: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.30% : 0.000004s : 23: predicate.switch_defer_inline 1.81% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.43% : 0.000013s : 74: predicate.switch_simplify 0.75% : 0.000002s : 15: predicate.tile_eliminate 0.90% : 0.000003s : 15: predicate.transpose_eliminate 1.42% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.41% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 2.85% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.43% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.53% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.11% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 2.96% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 5: predicate.value_based_eliminate 0.59% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.62% : 0.000002s : 10: predicate.virtual_output_eliminate 0.24% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000631 11 54.72% : 0.000345s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.28% : 0.000286s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.080399 192 0.01% : 0.000006s : 1: ForceFp32Comm 4.04% : 0.003245s : 1: add_attr 4.02% : 0.003229s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.08% : 0.000067s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.10% : 0.000077s : 1: auto_monad 7.97% : 0.006409s : 1: auto_monad_reorder 0.01% : 0.000007s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.60% : 0.000481s : 1: bootstrap 0.05% : 0.000039s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.03% : 0.000024s : 1: control_data_broadcast_order 0.02% : 0.000015s : 1: convert_after_rewriter 0.06% : 0.000050s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000020s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.04% : 0.000032s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000013s : 1: get_jit_bprop_graph 0.04% : 0.000029s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.59% : 0.000476s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.91% : 0.000730s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000024s : 1: opt.transform.mutable_eliminate 1.84% : 0.001478s : 78: opt.transform.opt_a 0.05% : 0.000037s : 1: opt.transform.opt_after_cconv 0.04% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.18% : 0.000147s : 28: opt.transform.opt_b 0.08% : 0.000062s : 2: opt.transform.opt_trans_graph 0.06% : 0.000047s : 4: opt.transform.symbol_engine_opt 22.81% : 0.018340s : 1: opt_a 0.19% : 0.000150s : 1: opt_after_cconv 0.70% : 0.000562s : 1: opt_after_jit_grad 0.44% : 0.000351s : 1: opt_b 26.65% : 0.021430s : 1: optimize 0.03% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.04% : 0.000033s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000008s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.06% : 0.000045s : 1: pre_auto_parallel 0.04% : 0.000035s : 1: py_interpret_to_execute 0.03% : 0.000026s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000024s : 1: remove_dup_value 18.67% : 0.015013s : 1: renormalize.infer 0.53% : 0.000425s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000055s : 1: rewriter_after_opt_a 0.12% : 0.000094s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.02% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.14% : 0.000116s : 1: symbol_engine_optimizer 0.13% : 0.000108s : 1: tuple_transform 8.01% : 0.006438s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:46.346.372 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0150673, [21] [bootstrap]: 0.00046537 [type_inference]: 0.00586541 [event_method]: 1.907e-05 [auto_monad]: 6.37e-05 [graph_reusing]: 5.62999e-06 [inline]: 2.44999e-06 [add_attr]: 0.00305177, [1] [add_attr_with_inline]: 0.00304201, [1] [Cycle 1]: 5.961e-05, [2] [tag_attr]: 1.988e-05 [meta_addattr_fg_expand]: 6.26e-06 [parallel-infer-symbol]: 3.88001e-06 [pre_auto_parallel]: 3.474e-05 [insert-virtual-dataset]: 2.50002e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 2.36998e-06 [pipeline_split]: 2.13002e-06 [optimize]: 0.00487826, [53] [py_interpret_to_execute]: 2.717e-05 [rewriter_before_opt_a]: 8.391e-05 [opt_a]: 0.0028492, [2] [Cycle 1]: 0.00209044, [45] [expand_dump_flag]: 2.81999e-06 [switch_simplify]: 4.709e-05 [loop_unroll]: 3.574e-05 [a_1]: 0.00067987 [with_stream_mark]: 1.725e-05 [recompute_prepare]: 1.183e-05 [updatestate_depend_eliminate]: 4.85999e-06 [updatestate_assign_eliminate]: 4.17998e-06 [updatestate_loads_eliminate]: 3.58999e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 0.00010106 [accelerated_algorithm]: 8.75001e-06 [shard]: 1.66e-06 [meta_shard_fg_expand]: 2.12999e-06 [shard_inline]: 7.82e-06 [merge_send_recv]: 9.89999e-06 [auto_parallel]: 6.87002e-06 [parallel]: 1.832e-05 [flash_sp]: 8.43001e-06 [merge_comm]: 4.82e-06 [allreduce_fusion]: 4.21001e-06 [matmul_add_comm_reduction]: 9.72001e-06 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 9.14e-06 [virtual_dataset]: 7.99997e-06 [get_grad_eliminate_]: 7.88999e-06 [virtual_output]: 7.71999e-06 [merge_forward]: 4.41002e-06 [cell_reuse_recompute_pass]: 1.17999e-06 [offload_activation]: 1.156e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.567e-05 [merge_recompute_call_nodes]: 1.37e-06 [before_grad]: 1.292e-05 [set_forward_comm_id_for_comm_node_pass]: 4.82998e-06 [meta_fg_expand]: 3.25998e-06 [flash_sp_send_recv_attached]: 3.06999e-06 [receive_attached]: 2.68e-06 [after_resolve]: 1.301e-05 [a_after_grad]: 1.197e-05 [renormalize]: 0.00059621 [add_forward_monad_depend]: 5.29e-06 [auto_monad_grad]: 2.02999e-06 [auto_monad_eliminator]: 1.594e-05 [cse]: 3.579e-05 [a_3]: 6.927e-05 [Cycle 2]: 0.00074939, [45] [expand_dump_flag]: 1.29998e-06 [switch_simplify]: 9.02999e-06 [loop_unroll]: 7.40998e-06 [a_1]: 0.00017325 [with_stream_mark]: 1.124e-05 [recompute_prepare]: 8.10999e-06 [updatestate_depend_eliminate]: 3.66001e-06 [updatestate_assign_eliminate]: 2.94999e-06 [updatestate_loads_eliminate]: 2.93e-06 [parameter_eliminate]: 9.89996e-07 [a_2]: 9.182e-05 [accelerated_algorithm]: 7.36999e-06 [shard]: 1.38002e-06 [meta_shard_fg_expand]: 1.79e-06 [shard_inline]: 7.4e-06 [merge_send_recv]: 6.17001e-06 [auto_parallel]: 5.84e-06 [parallel]: 4.3e-06 [flash_sp]: 3.23e-06 [merge_comm]: 3.96001e-06 [allreduce_fusion]: 6.64999e-06 [matmul_add_comm_reduction]: 6.01998e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 7.95998e-06 [virtual_dataset]: 7.15998e-06 [get_grad_eliminate_]: 7.06001e-06 [virtual_output]: 6.68003e-06 [merge_forward]: 3.08e-06 [cell_reuse_recompute_pass]: 1.25999e-06 [offload_activation]: 7.11001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.351e-05 [merge_recompute_call_nodes]: 9.00007e-07 [before_grad]: 1.13e-05 [set_forward_comm_id_for_comm_node_pass]: 4.28001e-06 [meta_fg_expand]: 2.73e-06 [flash_sp_send_recv_attached]: 8.50006e-07 [receive_attached]: 1.07e-06 [after_resolve]: 1.216e-05 [a_after_grad]: 1.127e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.25001e-06 [auto_monad_grad]: 9.89996e-07 [auto_monad_eliminator]: 8.47e-06 [cse]: 1.823e-05 [a_3]: 4.521e-05 [py_interpret_to_execute_after_opt_a]: 1.069e-05 [slice_cell_reuse_recomputed_activation]: 1.89e-06 [rewriter_after_opt_a]: 3.997e-05 [convert_after_rewriter]: 7.51999e-06 [order_py_execute_after_rewriter]: 5.88998e-06 [mutable_eliminate]: 0.00047062 [opt_b]: 0.00023569, [1] [Cycle 1]: 0.00023007, [7] [b_1]: 0.00015344 [b_2]: 9.22001e-06 [updatestate_depend_eliminate]: 5.62999e-06 [updatestate_assign_eliminate]: 2.83998e-06 [updatestate_loads_eliminate]: 2.83998e-06 [renormalize]: 3.19997e-07 [cse]: 2.116e-05 [optimize_parallel_all_gather_comm]: 1.684e-05 [overlap_param_gather]: 1.82001e-06 [cconv]: 2.341e-05 [loop_unroll]: 0.00040906 [opt_after_cconv]: 0.00011348, [1] [Cycle 1]: 0.00010803, [7] [c_1]: 3.655e-05 [parameter_eliminate]: 2.28998e-06 [updatestate_depend_eliminate]: 5.77001e-06 [updatestate_assign_eliminate]: 2.98998e-06 [updatestate_loads_eliminate]: 2.98e-06 [cse]: 2.139e-05 [renormalize]: 4.89992e-07 [remove_dup_value]: 1.36e-05 [tuple_transform]: 8.381e-05, [1] [Cycle 1]: 7.952e-05, [4] [d_1]: 5.092e-05 [none_parameter_eliminate]: 1.92001e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 7.85998e-06 [partial_unused_args_eliminate]: 1.66002e-06 [add_recomputation]: 5.526e-05 [cse_after_recomputation]: 2.714e-05, [1] [Cycle 1]: 2.208e-05, [1] [cse]: 1.602e-05 [environ_conv]: 6.17001e-06 [swap_dp_allreduce_reducescatter]: 6.17001e-06 [bias_add_comm_swap]: 2.61e-06 [label_micro_interleaved_index]: 4.22e-06 [label_fine_grained_interleaved_index]: 3.03e-06 [merge_cast_opt]: 1.29003e-06 [slice_recompute_activation]: 2.14e-06 [micro_interleaved_order_control]: 2.56e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 7.50006e-07 [remove_cast_before_assign_add]: 9.20001e-07 [full_micro_interleaved_order_control]: 2.19999e-06 [reorder_send_recv_between_fp_bp]: 2.54001e-06 [comm_op_add_attrs]: 1.03001e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.09998e-06 [interleave_parallel_branches]: 1.18001e-06 [overlap_opt_shard_in_pipeline]: 1.17999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.66e-06 [control_data_broadcast_order]: 1.411e-05 [grouped_pairwise_exchange_alltoall]: 1.42e-06 [offloading_packed_experts]: 4.26001e-06 [overlap_recompute_and_grad_model_parallel]: 4.80999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12e-06 [overlap_recompute_allgather_and_fa_grad]: 1.29e-06 [overlap_recompute_comm]: 2.43e-06 [overlap_grad_ring_attention]: 4.45999e-06 [overlap_grad_flash_sp]: 2.125e-05 [begin_end_overlap_inline]: 5.60016e-07 [split_matmul_comm_elemetwise]: 2.52001e-06 [split_layernorm_comm]: 1.66e-06 [handle_group_info]: 9.70002e-07 [symbol_engine_optimizer]: 8.061e-05, [1] [Cycle 1]: 7.624e-05, [6] [build]: 3.09001e-06 [elim_shapecalc]: 1.08e-05 [elim_not_effective]: 1.478e-05 [opt_reshape]: 8.07e-06 [fold_const_symbol]: 1.215e-05 [renormalize]: 2.10013e-07 [detach_backward]: 1.84998e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 1.977e-05 [get_jit_bprop_graph]: 1.32999e-06 [rewriter_after_jit_bprop_graph]: 3.22002e-06 [opt_after_jit_grad]: 0.00045234 [validate]: 3.936e-05 Sums bootstrap : 0.000465s : 4.21% type_inference : 0.005865s : 53.05% event_method : 0.000019s : 0.17% auto_monad : 0.000064s : 0.58% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000035s : 0.31% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000027s : 0.25% optimize.rewriter_before_opt_a : 0.000084s : 0.76% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000056s : 0.51% optimize.opt_a.loop_unroll : 0.000043s : 0.39% optimize.opt_a.a_1 : 0.000853s : 7.72% optimize.opt_a.with_stream_mark : 0.000028s : 0.26% optimize.opt_a.recompute_prepare : 0.000020s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000193s : 1.74% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.14% optimize.opt_a.merge_send_recv : 0.000016s : 0.15% optimize.opt_a.auto_parallel : 0.000013s : 0.11% optimize.opt_a.parallel : 0.000023s : 0.20% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000011s : 0.10% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.15% optimize.opt_a.virtual_dataset : 0.000015s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.14% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.23% optimize.opt_a.a_after_grad : 0.000023s : 0.21% optimize.opt_a.renormalize : 0.000596s : 5.39% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.22% optimize.opt_a.cse : 0.000054s : 0.49% optimize.opt_a.a_3 : 0.000114s : 1.04% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.36% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000471s : 4.26% optimize.opt_b.b_1 : 0.000153s : 1.39% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000021s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000023s : 0.21% optimize.loop_unroll : 0.000409s : 3.70% optimize.opt_after_cconv.c_1 : 0.000037s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000021s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.12% optimize.tuple_transform.d_1 : 0.000051s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000055s : 0.50% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.03% opt_after_jit_grad : 0.000452s : 4.09% validate : 0.000039s : 0.36% Time group info: ------[substitution.] 0.000202 38 11.12% : 0.000022s : 3: substitution.cast_eliminate 1.05% : 0.000002s : 3: substitution.elim_not_effective 0.85% : 0.000002s : 3: substitution.fold_const_symbol 3.32% : 0.000007s : 5: substitution.graph_param_transform 70.27% : 0.000142s : 4: substitution.inline 2.15% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.85% : 0.000006s : 6: substitution.remove_not_recompute_node 2.14% : 0.000004s : 4: substitution.replace_old_param 6.24% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005807 2 88.18% : 0.005121s : 1: type_inference.infer 11.82% : 0.000686s : 1: type_inference.specialize ------[replace.] 0.000063 8 60.03% : 0.000038s : 4: replace.inline 39.97% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000150 8 92.82% : 0.000139s : 4: match.inline 7.18% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000234 1504 0.90% : 0.000002s : 15: predicate.accumulaten_eliminater 0.63% : 0.000001s : 5: predicate.ad_related_special_op_eliminate 0.58% : 0.000001s : 10: predicate.addn_check_dump 0.98% : 0.000002s : 15: predicate.addn_zero_filter 0.84% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.15% : 0.000005s : 25: predicate.arithmetic_simplify 1.00% : 0.000002s : 15: predicate.cast_eliminate 0.64% : 0.000001s : 10: predicate.check_bprop_eliminate 0.60% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.62% : 0.000001s : 10: predicate.depend_value_elim 0.94% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.97% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.92% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 5: predicate.elim_not_effective 0.46% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_depend_swap 1.79% : 0.000004s : 30: predicate.environ_get_eliminate 1.14% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.43% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.26% : 0.000005s : 23: predicate.float_depend_g_call 0.58% : 0.000001s : 10: predicate.float_environ_get_switch 0.88% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.71% : 0.000002s : 10: predicate.get_grad_eliminate 0.23% : 0.000001s : 5: predicate.graph_param_transform 0.67% : 0.000002s : 10: predicate.incorporate_call 0.57% : 0.000001s : 10: predicate.incorporate_call_switch 6.26% : 0.000015s : 68: predicate.inline 0.82% : 0.000002s : 10: predicate.inline_without_move 0.34% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.97% : 0.000002s : 10: predicate.less_batch_normalization 1.79% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.54% : 0.000006s : 44: predicate.load_eliminater 0.83% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.39% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.62% : 0.000001s : 10: predicate.merge_addn 0.57% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 15: predicate.minmaximum_grad 0.75% : 0.000002s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.85% : 0.000004s : 23: predicate.partial_defer_inline 1.66% : 0.000004s : 24: predicate.partial_eliminate 0.93% : 0.000002s : 15: predicate.print_const_string_wrapper 0.63% : 0.000001s : 10: predicate.reduce_all_const_elim 1.19% : 0.000003s : 15: predicate.reduce_eliminate 2.50% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 10: predicate.remove_not_recompute_node 1.40% : 0.000003s : 29: predicate.replace_applicator 0.64% : 0.000001s : 10: predicate.replace_old_param 0.29% : 0.000001s : 5: predicate.reset_defer_inline 0.92% : 0.000002s : 15: predicate.reshape_eliminate 0.61% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 5: predicate.row_tensor_eliminate 0.75% : 0.000002s : 10: predicate.same_eliminate 0.46% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 10: predicate.shard_identity_eliminate 0.76% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 0.86% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.46% : 0.000003s : 23: predicate.switch_defer_inline 2.08% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.05% : 0.000012s : 74: predicate.switch_simplify 0.93% : 0.000002s : 15: predicate.tile_eliminate 0.90% : 0.000002s : 15: predicate.transpose_eliminate 1.51% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.41% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.49% : 0.000003s : 25: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.72% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.48% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.24% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 5: predicate.value_based_eliminate 0.67% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 10: predicate.virtual_output_eliminate 0.30% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000516 11 56.36% : 0.000291s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.64% : 0.000225s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025176 192 0.01% : 0.000003s : 1: ForceFp32Comm 12.14% : 0.003057s : 1: add_attr 12.10% : 0.003047s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000059s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.28% : 0.000070s : 1: auto_monad 0.09% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.97% : 0.000495s : 1: bootstrap 0.11% : 0.000027s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000017s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.12% : 0.000030s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.10% : 0.000026s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.66% : 0.000417s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.90% : 0.000479s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 5.50% : 0.001385s : 78: opt.transform.opt_a 0.14% : 0.000035s : 1: opt.transform.opt_after_cconv 0.11% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.52% : 0.000132s : 28: opt.transform.opt_b 0.22% : 0.000057s : 2: opt.transform.opt_trans_graph 0.17% : 0.000042s : 4: opt.transform.symbol_engine_opt 11.33% : 0.002852s : 1: opt_a 0.46% : 0.000117s : 1: opt_after_cconv 1.83% : 0.000461s : 1: opt_after_jit_grad 0.95% : 0.000239s : 1: opt_b 19.39% : 0.004883s : 1: optimize 0.08% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.16% : 0.000039s : 1: pre_auto_parallel 0.13% : 0.000032s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000017s : 1: remove_dup_value 1.26% : 0.000317s : 1: renormalize.infer 1.08% : 0.000272s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000044s : 1: rewriter_after_opt_a 0.35% : 0.000089s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000083s : 1: symbol_engine_optimizer 0.34% : 0.000087s : 1: tuple_transform 23.36% : 0.005882s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:46.775.007 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:46.775.277 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0152103, [21] [bootstrap]: 0.00041712 [type_inference]: 0.00544707 [event_method]: 1.759e-05 [auto_monad]: 6.256e-05 [graph_reusing]: 5.89e-06 [inline]: 1.89999e-06 [add_attr]: 0.00305805, [1] [add_attr_with_inline]: 0.00304857, [1] [Cycle 1]: 7.062e-05, [2] [tag_attr]: 1.855e-05 [meta_addattr_fg_expand]: 5.63002e-06 [parallel-infer-symbol]: 3.23e-06 [pre_auto_parallel]: 3.083e-05 [insert-virtual-dataset]: 2.46e-06 [parallel-infer-symbol-second]: 7.00005e-07 [dataset_repeat_opt]: 1.91003e-06 [pipeline_split]: 1.83997e-06 [optimize]: 0.00505853, [53] [py_interpret_to_execute]: 2.655e-05 [rewriter_before_opt_a]: 8.199e-05 [opt_a]: 0.00282149, [2] [Cycle 1]: 0.00198834, [45] [expand_dump_flag]: 2.99999e-06 [switch_simplify]: 4.183e-05 [loop_unroll]: 3.009e-05 [a_1]: 0.00060959 [with_stream_mark]: 1.473e-05 [recompute_prepare]: 8.50999e-06 [updatestate_depend_eliminate]: 3.85998e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 3.06999e-06 [parameter_eliminate]: 2.21998e-06 [a_2]: 0.00011319 [accelerated_algorithm]: 7.65e-06 [shard]: 1.92999e-06 [meta_shard_fg_expand]: 1.84e-06 [shard_inline]: 6.59999e-06 [merge_send_recv]: 8.18001e-06 [auto_parallel]: 5.87999e-06 [parallel]: 1.911e-05 [flash_sp]: 7.46999e-06 [merge_comm]: 4.02998e-06 [allreduce_fusion]: 3.7e-06 [matmul_add_comm_reduction]: 9.32001e-06 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 7.95e-06 [virtual_dataset]: 6.83e-06 [get_grad_eliminate_]: 6.19001e-06 [virtual_output]: 6.55002e-06 [merge_forward]: 3.92998e-06 [cell_reuse_recompute_pass]: 1.38002e-06 [offload_activation]: 9.19e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.445e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 1.014e-05 [set_forward_comm_id_for_comm_node_pass]: 3.65e-06 [meta_fg_expand]: 2.89999e-06 [flash_sp_send_recv_attached]: 2.34999e-06 [receive_attached]: 2.54001e-06 [after_resolve]: 1.148e-05 [a_after_grad]: 1.01e-05 [renormalize]: 0.00049485 [add_forward_monad_depend]: 5.56e-06 [auto_monad_grad]: 2.39999e-06 [auto_monad_eliminator]: 1.451e-05 [cse]: 2.597e-05 [a_3]: 6.043e-05 [Cycle 2]: 0.000821, [45] [expand_dump_flag]: 1.23002e-06 [switch_simplify]: 7.53e-06 [loop_unroll]: 6.25002e-06 [a_1]: 0.00012928 [with_stream_mark]: 1.082e-05 [recompute_prepare]: 6.59001e-06 [updatestate_depend_eliminate]: 2.74999e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 2.27999e-06 [parameter_eliminate]: 1.04e-06 [a_2]: 0.0001094 [accelerated_algorithm]: 6.09001e-06 [shard]: 1.23002e-06 [meta_shard_fg_expand]: 1.27e-06 [shard_inline]: 6.14001e-06 [merge_send_recv]: 4.51002e-06 [auto_parallel]: 5.49e-06 [parallel]: 4.02998e-06 [flash_sp]: 3.31999e-06 [merge_comm]: 3.69002e-06 [allreduce_fusion]: 3.62998e-06 [matmul_add_comm_reduction]: 5.57001e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 6.89999e-06 [virtual_dataset]: 5.87999e-06 [get_grad_eliminate_]: 5.79e-06 [virtual_output]: 5.59e-06 [merge_forward]: 2.78e-06 [cell_reuse_recompute_pass]: 1.72001e-06 [offload_activation]: 6.66999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.479e-05 [merge_recompute_call_nodes]: 9.09989e-07 [before_grad]: 9.14e-06 [set_forward_comm_id_for_comm_node_pass]: 3.46999e-06 [meta_fg_expand]: 2.19999e-06 [flash_sp_send_recv_attached]: 1.17e-06 [receive_attached]: 1.04e-06 [after_resolve]: 1.071e-05 [a_after_grad]: 9.40001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.37e-06 [auto_monad_grad]: 1.15001e-06 [auto_monad_eliminator]: 6.76999e-06 [cse]: 1.314e-05 [a_3]: 4.889e-05 [py_interpret_to_execute_after_opt_a]: 1.064e-05 [slice_cell_reuse_recomputed_activation]: 5.19998e-06 [rewriter_after_opt_a]: 5.029e-05 [convert_after_rewriter]: 1.087e-05 [order_py_execute_after_rewriter]: 8.2e-06 [mutable_eliminate]: 0.00049273 [opt_b]: 0.00026738, [1] [Cycle 1]: 0.0002575, [7] [b_1]: 0.00016699 [b_2]: 8.11002e-06 [updatestate_depend_eliminate]: 5.34e-06 [updatestate_assign_eliminate]: 2.59999e-06 [updatestate_loads_eliminate]: 2.37999e-06 [renormalize]: 5.10016e-07 [cse]: 1.655e-05 [optimize_parallel_all_gather_comm]: 1.97e-05 [overlap_param_gather]: 4.96997e-06 [cconv]: 2.669e-05 [loop_unroll]: 0.00043528 [opt_after_cconv]: 0.00012627, [1] [Cycle 1]: 0.00011794, [7] [c_1]: 3.059e-05 [parameter_eliminate]: 2.77002e-06 [updatestate_depend_eliminate]: 5.39e-06 [updatestate_assign_eliminate]: 2.66e-06 [updatestate_loads_eliminate]: 2.58003e-06 [cse]: 1.685e-05 [renormalize]: 2.69996e-07 [remove_dup_value]: 1.699e-05 [tuple_transform]: 8.922e-05, [1] [Cycle 1]: 8.233e-05, [4] [d_1]: 4.36e-05 [none_parameter_eliminate]: 1.60999e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 6.92002e-06 [partial_unused_args_eliminate]: 4.54998e-06 [add_recomputation]: 4.786e-05 [cse_after_recomputation]: 2.622e-05, [1] [Cycle 1]: 1.952e-05, [1] [cse]: 1.063e-05 [environ_conv]: 8.81997e-06 [swap_dp_allreduce_reducescatter]: 7.6e-06 [bias_add_comm_swap]: 4.79e-06 [label_micro_interleaved_index]: 6.95002e-06 [label_fine_grained_interleaved_index]: 4.89e-06 [merge_cast_opt]: 3.94002e-06 [slice_recompute_activation]: 4.33999e-06 [micro_interleaved_order_control]: 4.78001e-06 [assign_add_opt]: 3.7e-06 [ForceFp32Comm]: 3.58999e-06 [remove_cast_before_assign_add]: 3.45e-06 [full_micro_interleaved_order_control]: 4.55001e-06 [reorder_send_recv_between_fp_bp]: 5.12e-06 [comm_op_add_attrs]: 3.85e-06 [add_comm_op_reuse_tag]: 3.43e-06 [interleave_split_concat_branches]: 3.56001e-06 [interleave_parallel_branches]: 3.59002e-06 [overlap_opt_shard_in_pipeline]: 3.78999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.72e-06 [control_data_broadcast_order]: 1.51e-05 [grouped_pairwise_exchange_alltoall]: 3.85998e-06 [offloading_packed_experts]: 6.64001e-06 [overlap_recompute_and_grad_model_parallel]: 7.23999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.80998e-06 [overlap_recompute_allgather_and_fa_grad]: 4.15999e-06 [overlap_recompute_comm]: 5.06997e-06 [overlap_grad_ring_attention]: 7.28e-06 [overlap_grad_flash_sp]: 2.103e-05 [begin_end_overlap_inline]: 3.05998e-06 [split_matmul_comm_elemetwise]: 4.59998e-06 [split_layernorm_comm]: 4.42e-06 [handle_group_info]: 3.61001e-06 [symbol_engine_optimizer]: 9.561e-05, [1] [Cycle 1]: 8.78e-05, [6] [build]: 2.88e-06 [elim_shapecalc]: 9.26002e-06 [elim_not_effective]: 1.279e-05 [opt_reshape]: 6.98e-06 [fold_const_symbol]: 1.011e-05 [renormalize]: 1.8999e-07 [detach_backward]: 3.14001e-06 [pipeline_parallel_scheduler]: 1.75001e-06 [auto_monad_reorder]: 1.801e-05 [get_jit_bprop_graph]: 1.39998e-06 [rewriter_after_jit_bprop_graph]: 3.91999e-06 [opt_after_jit_grad]: 0.00048412 [validate]: 3.67e-05 Sums bootstrap : 0.000417s : 3.99% type_inference : 0.005447s : 52.10% event_method : 0.000018s : 0.17% auto_monad : 0.000063s : 0.60% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000031s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000027s : 0.25% optimize.rewriter_before_opt_a : 0.000082s : 0.78% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000049s : 0.47% optimize.opt_a.loop_unroll : 0.000036s : 0.35% optimize.opt_a.a_1 : 0.000739s : 7.07% optimize.opt_a.with_stream_mark : 0.000026s : 0.24% optimize.opt_a.recompute_prepare : 0.000015s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000223s : 2.13% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.12% optimize.opt_a.merge_send_recv : 0.000013s : 0.12% optimize.opt_a.auto_parallel : 0.000011s : 0.11% optimize.opt_a.parallel : 0.000023s : 0.22% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.14% optimize.opt_a.virtual_dataset : 0.000013s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000019s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000022s : 0.21% optimize.opt_a.a_after_grad : 0.000020s : 0.19% optimize.opt_a.renormalize : 0.000495s : 4.73% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.20% optimize.opt_a.cse : 0.000039s : 0.37% optimize.opt_a.a_3 : 0.000109s : 1.05% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000050s : 0.48% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000493s : 4.71% optimize.opt_b.b_1 : 0.000167s : 1.60% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000017s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000027s : 0.26% optimize.loop_unroll : 0.000435s : 4.16% optimize.opt_after_cconv.c_1 : 0.000031s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.16% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.16% optimize.tuple_transform.d_1 : 0.000044s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000048s : 0.46% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.05% optimize.control_data_broadcast_order : 0.000015s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000021s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.17% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000484s : 4.63% validate : 0.000037s : 0.35% Time group info: ------[substitution.] 0.000170 28 1.08% : 0.000002s : 2: substitution.elim_not_effective 0.76% : 0.000001s : 2: substitution.fold_const_symbol 3.46% : 0.000006s : 4: substitution.graph_param_transform 78.31% : 0.000133s : 4: substitution.inline 1.77% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.95% : 0.000005s : 4: substitution.remove_not_recompute_node 2.82% : 0.000005s : 4: substitution.replace_old_param 8.85% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005399 2 87.80% : 0.004740s : 1: type_inference.infer 12.20% : 0.000659s : 1: type_inference.specialize ------[replace.] 0.000059 8 62.75% : 0.000037s : 4: replace.inline 37.25% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000143 8 90.91% : 0.000130s : 4: match.inline 9.09% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000203 1278 0.88% : 0.000002s : 13: predicate.accumulaten_eliminater 0.67% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.57% : 0.000001s : 8: predicate.addn_check_dump 0.95% : 0.000002s : 13: predicate.addn_zero_filter 0.80% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.98% : 0.000004s : 21: predicate.arithmetic_simplify 0.93% : 0.000002s : 13: predicate.cast_eliminate 0.65% : 0.000001s : 8: predicate.check_bprop_eliminate 0.57% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.69% : 0.000001s : 8: predicate.depend_value_elim 0.95% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.01% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.06% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.32% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.15% : 0.000002s : 17: predicate.environ_get_depend_swap 1.83% : 0.000004s : 25: predicate.environ_get_eliminate 1.14% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.45% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.64% : 0.000005s : 21: predicate.float_depend_g_call 0.58% : 0.000001s : 8: predicate.float_environ_get_switch 0.78% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.63% : 0.000001s : 8: predicate.get_grad_eliminate 0.23% : 0.000000s : 4: predicate.graph_param_transform 0.66% : 0.000001s : 8: predicate.incorporate_call 0.54% : 0.000001s : 8: predicate.incorporate_call_switch 6.35% : 0.000013s : 58: predicate.inline 0.72% : 0.000001s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.82% : 0.000002s : 8: predicate.less_batch_normalization 1.92% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.61% : 0.000005s : 38: predicate.load_eliminater 0.95% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.40% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.70% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.58% : 0.000001s : 8: predicate.merge_addn 0.57% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 13: predicate.minmaximum_grad 0.88% : 0.000002s : 4: predicate.mutable_eliminate 0.33% : 0.000001s : 4: predicate.opt_reshape 0.50% : 0.000001s : 4: predicate.parallel_virtual_node 1.80% : 0.000004s : 21: predicate.partial_defer_inline 1.62% : 0.000003s : 21: predicate.partial_eliminate 0.92% : 0.000002s : 13: predicate.print_const_string_wrapper 0.67% : 0.000001s : 8: predicate.reduce_all_const_elim 1.08% : 0.000002s : 13: predicate.reduce_eliminate 2.54% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 8: predicate.remove_not_recompute_node 1.37% : 0.000003s : 25: predicate.replace_applicator 0.47% : 0.000001s : 8: predicate.replace_old_param 0.26% : 0.000001s : 4: predicate.reset_defer_inline 0.94% : 0.000002s : 13: predicate.reshape_eliminate 0.64% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.72% : 0.000001s : 8: predicate.same_eliminate 0.43% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.79% : 0.000002s : 8: predicate.shard_identity_eliminate 0.73% : 0.000001s : 8: predicate.special_op_eliminate 0.73% : 0.000001s : 8: predicate.specialize_transform 0.91% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.51% : 0.000003s : 21: predicate.switch_defer_inline 2.09% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.28% : 0.000011s : 67: predicate.switch_simplify 0.90% : 0.000002s : 13: predicate.tile_eliminate 0.92% : 0.000002s : 13: predicate.transpose_eliminate 1.52% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.30% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.45% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.34% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.90% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.45% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.19% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000440 11 52.45% : 0.000231s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.55% : 0.000209s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025131 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.20% : 0.003067s : 1: add_attr 12.15% : 0.003053s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000052s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.28% : 0.000071s : 1: auto_monad 0.10% : 0.000026s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000007s : 1: bias_add_comm_swap 1.83% : 0.000460s : 1: bootstrap 0.12% : 0.000030s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.06% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000029s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000017s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.11% : 0.000027s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.03% : 0.000007s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.76% : 0.000441s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.98% : 0.000499s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000014s : 1: opt.transform.mutable_eliminate 4.64% : 0.001165s : 78: opt.transform.opt_a 0.12% : 0.000029s : 1: opt.transform.opt_after_cconv 0.10% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.41% : 0.000104s : 28: opt.transform.opt_b 0.19% : 0.000049s : 2: opt.transform.opt_trans_graph 0.14% : 0.000036s : 4: opt.transform.symbol_engine_opt 11.24% : 0.002825s : 1: opt_a 0.52% : 0.000130s : 1: opt_after_cconv 1.97% : 0.000495s : 1: opt_after_jit_grad 1.08% : 0.000271s : 1: opt_b 21.38% : 0.005374s : 1: optimize 0.09% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000024s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000038s : 1: pre_auto_parallel 0.12% : 0.000030s : 1: py_interpret_to_execute 0.05% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000020s : 1: remove_dup_value 0.95% : 0.000240s : 1: renormalize.infer 0.99% : 0.000248s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.22% : 0.000056s : 1: rewriter_after_opt_a 0.34% : 0.000086s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000099s : 1: symbol_engine_optimizer 0.37% : 0.000092s : 1: tuple_transform 21.82% : 0.005484s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:47.174.332 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.014377, [21] [bootstrap]: 0.00043766 [type_inference]: 0.00563997 [event_method]: 1.827e-05 [auto_monad]: 6.188e-05 [graph_reusing]: 6.18002e-06 [inline]: 2.32001e-06 [add_attr]: 0.00303833, [1] [add_attr_with_inline]: 0.00303053, [1] [Cycle 1]: 5.306e-05, [2] [tag_attr]: 1.772e-05 [meta_addattr_fg_expand]: 5.72001e-06 [parallel-infer-symbol]: 3.35e-06 [pre_auto_parallel]: 3.161e-05 [insert-virtual-dataset]: 2.39001e-06 [parallel-infer-symbol-second]: 9.30013e-07 [dataset_repeat_opt]: 1.96998e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.00445582, [53] [py_interpret_to_execute]: 2.428e-05 [rewriter_before_opt_a]: 7.833e-05 [opt_a]: 0.00251169, [2] [Cycle 1]: 0.00183742, [45] [expand_dump_flag]: 2.79999e-06 [switch_simplify]: 4.295e-05 [loop_unroll]: 3.042e-05 [a_1]: 0.00060786 [with_stream_mark]: 1.503e-05 [recompute_prepare]: 9.25001e-06 [updatestate_depend_eliminate]: 3.51999e-06 [updatestate_assign_eliminate]: 3.04001e-06 [updatestate_loads_eliminate]: 2.76e-06 [parameter_eliminate]: 1.79e-06 [a_2]: 8.127e-05 [accelerated_algorithm]: 7.13e-06 [shard]: 1.96e-06 [meta_shard_fg_expand]: 1.77001e-06 [shard_inline]: 6.59999e-06 [merge_send_recv]: 8.02e-06 [auto_parallel]: 6.21e-06 [parallel]: 1.896e-05 [flash_sp]: 7.56001e-06 [merge_comm]: 4.13001e-06 [allreduce_fusion]: 3.37002e-06 [matmul_add_comm_reduction]: 9.57999e-06 [allreduce_slice_to_reducescatter]: 6.29982e-07 [virtual_shard_identity]: 7.41999e-06 [virtual_dataset]: 6.61e-06 [get_grad_eliminate_]: 6.11e-06 [virtual_output]: 6.39001e-06 [merge_forward]: 4.12e-06 [cell_reuse_recompute_pass]: 1.09003e-06 [offload_activation]: 9.37999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.195e-05 [merge_recompute_call_nodes]: 1.63002e-06 [before_grad]: 9.90002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.66999e-06 [meta_fg_expand]: 2.86999e-06 [flash_sp_send_recv_attached]: 2.76999e-06 [receive_attached]: 2.74001e-06 [after_resolve]: 1.165e-05 [a_after_grad]: 1.038e-05 [renormalize]: 0.00053163 [add_forward_monad_depend]: 5.71003e-06 [auto_monad_grad]: 2.26e-06 [auto_monad_eliminator]: 1.423e-05 [cse]: 2.653e-05 [a_3]: 4.547e-05 [Cycle 2]: 0.00066416, [45] [expand_dump_flag]: 1.02e-06 [switch_simplify]: 7.65e-06 [loop_unroll]: 6.23e-06 [a_1]: 0.00012935 [with_stream_mark]: 1.16e-05 [recompute_prepare]: 6.94999e-06 [updatestate_depend_eliminate]: 2.81e-06 [updatestate_assign_eliminate]: 2.24001e-06 [updatestate_loads_eliminate]: 2.64001e-06 [parameter_eliminate]: 1.17e-06 [a_2]: 7.269e-05 [accelerated_algorithm]: 6.26e-06 [shard]: 1.64e-06 [meta_shard_fg_expand]: 1.66998e-06 [shard_inline]: 6.07999e-06 [merge_send_recv]: 4.75999e-06 [auto_parallel]: 5.30999e-06 [parallel]: 4.1e-06 [flash_sp]: 3.33e-06 [merge_comm]: 3.21999e-06 [allreduce_fusion]: 3.2e-06 [matmul_add_comm_reduction]: 1.105e-05 [allreduce_slice_to_reducescatter]: 3.29979e-07 [virtual_shard_identity]: 6.59001e-06 [virtual_dataset]: 6.01e-06 [get_grad_eliminate_]: 5.74e-06 [virtual_output]: 5.62001e-06 [merge_forward]: 2.64001e-06 [cell_reuse_recompute_pass]: 1.35999e-06 [offload_activation]: 6.28002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.32e-05 [merge_recompute_call_nodes]: 8.80013e-07 [before_grad]: 9.19998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.38999e-06 [meta_fg_expand]: 1.96e-06 [flash_sp_send_recv_attached]: 9.20001e-07 [receive_attached]: 1.05001e-06 [after_resolve]: 1.097e-05 [a_after_grad]: 9.84999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.16002e-06 [auto_monad_grad]: 1.44e-06 [auto_monad_eliminator]: 7.67998e-06 [cse]: 1.418e-05 [a_3]: 3.579e-05 [py_interpret_to_execute_after_opt_a]: 8.70001e-06 [slice_cell_reuse_recomputed_activation]: 1.86e-06 [rewriter_after_opt_a]: 3.755e-05 [convert_after_rewriter]: 6.63998e-06 [order_py_execute_after_rewriter]: 5.40001e-06 [mutable_eliminate]: 0.00047457 [opt_b]: 0.00020079, [1] [Cycle 1]: 0.00019449, [7] [b_1]: 0.00012114 [b_2]: 8.40999e-06 [updatestate_depend_eliminate]: 6.11e-06 [updatestate_assign_eliminate]: 2.74999e-06 [updatestate_loads_eliminate]: 2.53998e-06 [renormalize]: 7.10017e-07 [cse]: 1.693e-05 [optimize_parallel_all_gather_comm]: 1.641e-05 [overlap_param_gather]: 2.58e-06 [cconv]: 2.436e-05 [loop_unroll]: 0.00041516 [opt_after_cconv]: 0.00010052, [1] [Cycle 1]: 9.491e-05, [7] [c_1]: 3.092e-05 [parameter_eliminate]: 3.85998e-06 [updatestate_depend_eliminate]: 5.67001e-06 [updatestate_assign_eliminate]: 2.41e-06 [updatestate_loads_eliminate]: 2.19001e-06 [cse]: 1.596e-05 [renormalize]: 3.19997e-07 [remove_dup_value]: 1.332e-05 [tuple_transform]: 7.535e-05, [1] [Cycle 1]: 7.08e-05, [4] [d_1]: 4.348e-05 [none_parameter_eliminate]: 1.57001e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.83e-06 [partial_unused_args_eliminate]: 2.42001e-06 [add_recomputation]: 4.632e-05 [cse_after_recomputation]: 1.986e-05, [1] [Cycle 1]: 1.541e-05, [1] [cse]: 9.89001e-06 [environ_conv]: 5.72001e-06 [swap_dp_allreduce_reducescatter]: 5.20001e-06 [bias_add_comm_swap]: 2.50002e-06 [label_micro_interleaved_index]: 4.05998e-06 [label_fine_grained_interleaved_index]: 3.16999e-06 [merge_cast_opt]: 1.62001e-06 [slice_recompute_activation]: 2.14999e-06 [micro_interleaved_order_control]: 2.37001e-06 [assign_add_opt]: 1.58002e-06 [ForceFp32Comm]: 9.50007e-07 [remove_cast_before_assign_add]: 7.09988e-07 [full_micro_interleaved_order_control]: 2.09999e-06 [reorder_send_recv_between_fp_bp]: 2.95998e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.08001e-06 [interleave_parallel_branches]: 1.02998e-06 [overlap_opt_shard_in_pipeline]: 1.14998e-06 [overlap_opt_shard_grad_in_pipeline]: 1.92999e-06 [control_data_broadcast_order]: 1.261e-05 [grouped_pairwise_exchange_alltoall]: 1.32e-06 [offloading_packed_experts]: 4.23999e-06 [overlap_recompute_and_grad_model_parallel]: 5.27001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.34998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.22e-06 [overlap_recompute_comm]: 2.41e-06 [overlap_grad_ring_attention]: 4.34997e-06 [overlap_grad_flash_sp]: 1.952e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.12999e-06 [split_layernorm_comm]: 1.89e-06 [handle_group_info]: 1.09e-06 [symbol_engine_optimizer]: 7.564e-05, [1] [Cycle 1]: 7.144e-05, [6] [build]: 2.63998e-06 [elim_shapecalc]: 9.96e-06 [elim_not_effective]: 1.302e-05 [opt_reshape]: 6.75002e-06 [fold_const_symbol]: 1.004e-05 [renormalize]: 2.50002e-07 [detach_backward]: 2.04e-06 [pipeline_parallel_scheduler]: 1.50001e-06 [auto_monad_reorder]: 1.629e-05 [get_jit_bprop_graph]: 1.60001e-06 [rewriter_after_jit_bprop_graph]: 4.28999e-06 [opt_after_jit_grad]: 0.00046396 [validate]: 3.938e-05 Sums bootstrap : 0.000438s : 4.21% type_inference : 0.005640s : 54.31% event_method : 0.000018s : 0.18% auto_monad : 0.000062s : 0.60% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000032s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000024s : 0.23% optimize.rewriter_before_opt_a : 0.000078s : 0.75% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000051s : 0.49% optimize.opt_a.loop_unroll : 0.000037s : 0.35% optimize.opt_a.a_1 : 0.000737s : 7.10% optimize.opt_a.with_stream_mark : 0.000027s : 0.26% optimize.opt_a.recompute_prepare : 0.000016s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000154s : 1.48% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.12% optimize.opt_a.merge_send_recv : 0.000013s : 0.12% optimize.opt_a.auto_parallel : 0.000012s : 0.11% optimize.opt_a.parallel : 0.000023s : 0.22% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.20% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.13% optimize.opt_a.virtual_dataset : 0.000013s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.02% optimize.opt_a.offload_activation : 0.000016s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.24% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000019s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000023s : 0.22% optimize.opt_a.a_after_grad : 0.000020s : 0.19% optimize.opt_a.renormalize : 0.000532s : 5.12% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.21% optimize.opt_a.cse : 0.000041s : 0.39% optimize.opt_a.a_3 : 0.000081s : 0.78% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000038s : 0.36% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000475s : 4.57% optimize.opt_b.b_1 : 0.000121s : 1.17% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000017s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.16% optimize.overlap_param_gather : 0.000003s : 0.02% optimize.cconv : 0.000024s : 0.23% optimize.loop_unroll : 0.000415s : 4.00% optimize.opt_after_cconv.c_1 : 0.000031s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000016s : 0.15% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.13% optimize.tuple_transform.d_1 : 0.000043s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000046s : 0.45% optimize.cse_after_recomputation.cse : 0.000010s : 0.10% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.02% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.02% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000020s : 0.19% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000016s : 0.16% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000464s : 4.47% validate : 0.000039s : 0.38% Time group info: ------[substitution.] 0.000170 28 1.07% : 0.000002s : 2: substitution.elim_not_effective 0.83% : 0.000001s : 2: substitution.fold_const_symbol 3.49% : 0.000006s : 4: substitution.graph_param_transform 78.68% : 0.000134s : 4: substitution.inline 1.85% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.61% : 0.000004s : 4: substitution.remove_not_recompute_node 2.67% : 0.000005s : 4: substitution.replace_old_param 8.81% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005582 2 88.07% : 0.004916s : 1: type_inference.infer 11.93% : 0.000666s : 1: type_inference.specialize ------[replace.] 0.000060 8 63.25% : 0.000038s : 4: replace.inline 36.75% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000144 8 90.94% : 0.000131s : 4: match.inline 9.06% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000199 1278 0.97% : 0.000002s : 13: predicate.accumulaten_eliminater 0.71% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 8: predicate.addn_check_dump 0.83% : 0.000002s : 13: predicate.addn_zero_filter 0.82% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.00% : 0.000004s : 21: predicate.arithmetic_simplify 0.96% : 0.000002s : 13: predicate.cast_eliminate 0.58% : 0.000001s : 8: predicate.check_bprop_eliminate 0.55% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.56% : 0.000001s : 8: predicate.depend_value_elim 0.98% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.03% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.08% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.14% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.17% : 0.000002s : 17: predicate.environ_get_depend_swap 1.72% : 0.000003s : 25: predicate.environ_get_eliminate 1.13% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.44% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.57% : 0.000005s : 21: predicate.float_depend_g_call 0.53% : 0.000001s : 8: predicate.float_environ_get_switch 0.76% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.64% : 0.000001s : 8: predicate.get_grad_eliminate 0.23% : 0.000000s : 4: predicate.graph_param_transform 0.63% : 0.000001s : 8: predicate.incorporate_call 0.50% : 0.000001s : 8: predicate.incorporate_call_switch 6.43% : 0.000013s : 58: predicate.inline 0.94% : 0.000002s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.86% : 0.000002s : 8: predicate.less_batch_normalization 1.78% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.54% : 0.000005s : 38: predicate.load_eliminater 0.81% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.46% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.58% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 8: predicate.merge_addn 0.63% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 13: predicate.minmaximum_grad 0.98% : 0.000002s : 4: predicate.mutable_eliminate 0.34% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 1.77% : 0.000004s : 21: predicate.partial_defer_inline 1.70% : 0.000003s : 21: predicate.partial_eliminate 1.01% : 0.000002s : 13: predicate.print_const_string_wrapper 0.57% : 0.000001s : 8: predicate.reduce_all_const_elim 1.11% : 0.000002s : 13: predicate.reduce_eliminate 2.54% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 8: predicate.remove_not_recompute_node 1.45% : 0.000003s : 25: predicate.replace_applicator 0.48% : 0.000001s : 8: predicate.replace_old_param 0.27% : 0.000001s : 4: predicate.reset_defer_inline 0.96% : 0.000002s : 13: predicate.reshape_eliminate 0.57% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 4: predicate.row_tensor_eliminate 0.72% : 0.000001s : 8: predicate.same_eliminate 0.57% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.72% : 0.000001s : 8: predicate.shard_identity_eliminate 0.88% : 0.000002s : 8: predicate.special_op_eliminate 0.76% : 0.000002s : 8: predicate.specialize_transform 0.74% : 0.000001s : 8: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.56% : 0.000003s : 21: predicate.switch_defer_inline 2.20% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.32% : 0.000011s : 67: predicate.switch_simplify 0.84% : 0.000002s : 13: predicate.tile_eliminate 0.91% : 0.000002s : 13: predicate.transpose_eliminate 1.57% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.33% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.53% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.70% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.47% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.17% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 4: predicate.value_based_eliminate 0.61% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000494 11 55.66% : 0.000275s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.34% : 0.000219s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023695 192 0.02% : 0.000004s : 1: ForceFp32Comm 12.84% : 0.003043s : 1: add_attr 12.80% : 0.003034s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.21% : 0.000050s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.28% : 0.000067s : 1: auto_monad 0.08% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.96% : 0.000465s : 1: bootstrap 0.12% : 0.000028s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.10% : 0.000023s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.79% : 0.000423s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.04% : 0.000483s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 4.86% : 0.001152s : 78: opt.transform.opt_a 0.12% : 0.000029s : 1: opt.transform.opt_after_cconv 0.10% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.42% : 0.000099s : 28: opt.transform.opt_b 0.20% : 0.000048s : 2: opt.transform.opt_trans_graph 0.15% : 0.000036s : 4: opt.transform.symbol_engine_opt 10.61% : 0.002515s : 1: opt_a 0.44% : 0.000104s : 1: opt_after_cconv 1.99% : 0.000473s : 1: opt_after_jit_grad 0.86% : 0.000204s : 1: opt_b 18.82% : 0.004461s : 1: optimize 0.08% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000036s : 1: pre_auto_parallel 0.12% : 0.000028s : 1: py_interpret_to_execute 0.05% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000003s : 1: remove_cast_before_assign_add 0.07% : 0.000017s : 1: remove_dup_value 1.14% : 0.000270s : 1: renormalize.infer 1.08% : 0.000255s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000041s : 1: rewriter_after_opt_a 0.35% : 0.000082s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000078s : 1: symbol_engine_optimizer 0.33% : 0.000078s : 1: tuple_transform 23.87% : 0.005657s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:47.395.195 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:47.395.452 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0178862, [21] [bootstrap]: 0.00043417 [type_inference]: 0.00785628 [event_method]: 1.968e-05 [auto_monad]: 6.055e-05 [graph_reusing]: 5.84e-06 [inline]: 1.96e-06 [add_attr]: 0.00309047, [1] [add_attr_with_inline]: 0.00308121, [1] [Cycle 1]: 7.438e-05, [2] [tag_attr]: 1.857e-05 [meta_addattr_fg_expand]: 5.75001e-06 [parallel-infer-symbol]: 2.91999e-06 [pre_auto_parallel]: 3.466e-05 [insert-virtual-dataset]: 2.87002e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 1.83002e-06 [pipeline_split]: 1.55999e-06 [optimize]: 0.00525905, [53] [py_interpret_to_execute]: 3.172e-05 [rewriter_before_opt_a]: 8.53e-05 [opt_a]: 0.00300626, [2] [Cycle 1]: 0.00218729, [45] [expand_dump_flag]: 2.98e-06 [switch_simplify]: 4.3e-05 [loop_unroll]: 3.026e-05 [a_1]: 0.00065528 [with_stream_mark]: 1.793e-05 [recompute_prepare]: 9.31e-06 [updatestate_depend_eliminate]: 4.63999e-06 [updatestate_assign_eliminate]: 3.46001e-06 [updatestate_loads_eliminate]: 2.78998e-06 [parameter_eliminate]: 2.24001e-06 [a_2]: 0.00010899 [accelerated_algorithm]: 7.45e-06 [shard]: 1.71998e-06 [meta_shard_fg_expand]: 1.86e-06 [shard_inline]: 6.73e-06 [merge_send_recv]: 8.13001e-06 [auto_parallel]: 5.96e-06 [parallel]: 1.803e-05 [flash_sp]: 8.79e-06 [merge_comm]: 4.30999e-06 [allreduce_fusion]: 3.55e-06 [matmul_add_comm_reduction]: 9.21998e-06 [allreduce_slice_to_reducescatter]: 7.30011e-07 [virtual_shard_identity]: 8.85999e-06 [virtual_dataset]: 6.76e-06 [get_grad_eliminate_]: 6.34001e-06 [virtual_output]: 6.53e-06 [merge_forward]: 4.16001e-06 [cell_reuse_recompute_pass]: 1.27999e-06 [offload_activation]: 1.041e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.602e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.057e-05 [set_forward_comm_id_for_comm_node_pass]: 4.36002e-06 [meta_fg_expand]: 3.08e-06 [flash_sp_send_recv_attached]: 2.91e-06 [receive_attached]: 2.93e-06 [after_resolve]: 1.215e-05 [a_after_grad]: 1.012e-05 [renormalize]: 0.00062587 [add_forward_monad_depend]: 5.25001e-06 [auto_monad_grad]: 2.44999e-06 [auto_monad_eliminator]: 1.473e-05 [cse]: 2.62e-05 [a_3]: 6.057e-05 [Cycle 2]: 0.00080439, [45] [expand_dump_flag]: 1.19998e-06 [switch_simplify]: 7.39002e-06 [loop_unroll]: 6.27001e-06 [a_1]: 0.00012933 [with_stream_mark]: 1.163e-05 [recompute_prepare]: 6.04001e-06 [updatestate_depend_eliminate]: 3.01001e-06 [updatestate_assign_eliminate]: 2.27999e-06 [updatestate_loads_eliminate]: 2.44001e-06 [parameter_eliminate]: 1.00001e-06 [a_2]: 9.859e-05 [accelerated_algorithm]: 6.41998e-06 [shard]: 1.55001e-06 [meta_shard_fg_expand]: 1.52001e-06 [shard_inline]: 6.53e-06 [merge_send_recv]: 5.25001e-06 [auto_parallel]: 5.63002e-06 [parallel]: 5.94e-06 [flash_sp]: 3.28998e-06 [merge_comm]: 3.28e-06 [allreduce_fusion]: 3.8e-06 [matmul_add_comm_reduction]: 6.11e-06 [allreduce_slice_to_reducescatter]: 5.8001e-07 [virtual_shard_identity]: 6.91001e-06 [virtual_dataset]: 6.02001e-06 [get_grad_eliminate_]: 5.84e-06 [virtual_output]: 5.71e-06 [merge_forward]: 3.16001e-06 [cell_reuse_recompute_pass]: 1.25999e-06 [offload_activation]: 6.61e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.479e-05 [merge_recompute_call_nodes]: 8.59989e-07 [before_grad]: 9.81e-06 [set_forward_comm_id_for_comm_node_pass]: 3.76999e-06 [meta_fg_expand]: 2.31998e-06 [flash_sp_send_recv_attached]: 1.18001e-06 [receive_attached]: 1.13001e-06 [after_resolve]: 1.06e-05 [a_after_grad]: 9.54e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.20001e-06 [auto_monad_grad]: 1.11002e-06 [auto_monad_eliminator]: 6.67002e-06 [cse]: 1.216e-05 [a_3]: 5.33e-05 [py_interpret_to_execute_after_opt_a]: 1.363e-05 [slice_cell_reuse_recomputed_activation]: 4.97e-06 [rewriter_after_opt_a]: 3.802e-05 [convert_after_rewriter]: 9.51998e-06 [order_py_execute_after_rewriter]: 7.85e-06 [mutable_eliminate]: 0.00051573 [opt_b]: 0.00026501, [1] [Cycle 1]: 0.00025584, [7] [b_1]: 0.00016606 [b_2]: 7.64002e-06 [updatestate_depend_eliminate]: 4.79002e-06 [updatestate_assign_eliminate]: 2.41e-06 [updatestate_loads_eliminate]: 2.37001e-06 [renormalize]: 5.39992e-07 [cse]: 1.615e-05 [optimize_parallel_all_gather_comm]: 1.917e-05 [overlap_param_gather]: 5.04998e-06 [cconv]: 2.752e-05 [loop_unroll]: 0.0004324 [opt_after_cconv]: 0.00012351, [1] [Cycle 1]: 0.00011486, [7] [c_1]: 3.147e-05 [parameter_eliminate]: 2.56e-06 [updatestate_depend_eliminate]: 4.70001e-06 [updatestate_assign_eliminate]: 2.69001e-06 [updatestate_loads_eliminate]: 2.31e-06 [cse]: 1.556e-05 [renormalize]: 5.60016e-07 [remove_dup_value]: 1.636e-05 [tuple_transform]: 8.91e-05, [1] [Cycle 1]: 8.145e-05, [4] [d_1]: 4.318e-05 [none_parameter_eliminate]: 1.52999e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 7.07002e-06 [partial_unused_args_eliminate]: 4.38999e-06 [add_recomputation]: 4.697e-05 [cse_after_recomputation]: 3.977e-05, [1] [Cycle 1]: 3.26e-05, [1] [cse]: 2.256e-05 [environ_conv]: 8.16002e-06 [swap_dp_allreduce_reducescatter]: 7.86001e-06 [bias_add_comm_swap]: 5.40001e-06 [label_micro_interleaved_index]: 6.76999e-06 [label_fine_grained_interleaved_index]: 5.07999e-06 [merge_cast_opt]: 3.77002e-06 [slice_recompute_activation]: 4.48001e-06 [micro_interleaved_order_control]: 4.72e-06 [assign_add_opt]: 3.73001e-06 [ForceFp32Comm]: 3.26999e-06 [remove_cast_before_assign_add]: 3.46999e-06 [full_micro_interleaved_order_control]: 4.72998e-06 [reorder_send_recv_between_fp_bp]: 5.42999e-06 [comm_op_add_attrs]: 3.33998e-06 [add_comm_op_reuse_tag]: 3.21001e-06 [interleave_split_concat_branches]: 3.64002e-06 [interleave_parallel_branches]: 3.36999e-06 [overlap_opt_shard_in_pipeline]: 3.53999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.18001e-06 [control_data_broadcast_order]: 1.457e-05 [grouped_pairwise_exchange_alltoall]: 3.71999e-06 [offloading_packed_experts]: 5.87001e-06 [overlap_recompute_and_grad_model_parallel]: 7.19001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.6e-06 [overlap_recompute_allgather_and_fa_grad]: 3.5e-06 [overlap_recompute_comm]: 5.32999e-06 [overlap_grad_ring_attention]: 6.76999e-06 [overlap_grad_flash_sp]: 2.111e-05 [begin_end_overlap_inline]: 3.43e-06 [split_matmul_comm_elemetwise]: 4.27e-06 [split_layernorm_comm]: 4.11001e-06 [handle_group_info]: 3.38999e-06 [symbol_engine_optimizer]: 9.556e-05, [1] [Cycle 1]: 8.918e-05, [6] [build]: 3.31999e-06 [elim_shapecalc]: 9.51e-06 [elim_not_effective]: 1.353e-05 [opt_reshape]: 7.11999e-06 [fold_const_symbol]: 1.036e-05 [renormalize]: 2.19996e-07 [detach_backward]: 3.13e-06 [pipeline_parallel_scheduler]: 2.24999e-06 [auto_monad_reorder]: 2.038e-05 [get_jit_bprop_graph]: 1.86998e-06 [rewriter_after_jit_bprop_graph]: 4.90999e-06 [opt_after_jit_grad]: 0.00046737 [validate]: 3.625e-05 Sums bootstrap : 0.000434s : 3.32% type_inference : 0.007856s : 60.07% event_method : 0.000020s : 0.15% auto_monad : 0.000061s : 0.46% graph_reusing : 0.000006s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000035s : 0.27% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000032s : 0.24% optimize.rewriter_before_opt_a : 0.000085s : 0.65% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000050s : 0.39% optimize.opt_a.loop_unroll : 0.000037s : 0.28% optimize.opt_a.a_1 : 0.000785s : 6.00% optimize.opt_a.with_stream_mark : 0.000030s : 0.23% optimize.opt_a.recompute_prepare : 0.000015s : 0.12% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000208s : 1.59% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.11% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.10% optimize.opt_a.merge_send_recv : 0.000013s : 0.10% optimize.opt_a.auto_parallel : 0.000012s : 0.09% optimize.opt_a.parallel : 0.000024s : 0.18% optimize.opt_a.flash_sp : 0.000012s : 0.09% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.12% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.12% optimize.opt_a.virtual_dataset : 0.000013s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.09% optimize.opt_a.virtual_output : 0.000012s : 0.09% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.13% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.24% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.16% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000023s : 0.17% optimize.opt_a.a_after_grad : 0.000020s : 0.15% optimize.opt_a.renormalize : 0.000626s : 4.79% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.05% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.16% optimize.opt_a.cse : 0.000038s : 0.29% optimize.opt_a.a_3 : 0.000114s : 0.87% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000038s : 0.29% optimize.convert_after_rewriter : 0.000010s : 0.07% optimize.order_py_execute_after_rewriter : 0.000008s : 0.06% optimize.mutable_eliminate : 0.000516s : 3.94% optimize.opt_b.b_1 : 0.000166s : 1.27% optimize.opt_b.b_2 : 0.000008s : 0.06% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000016s : 0.12% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.15% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000028s : 0.21% optimize.loop_unroll : 0.000432s : 3.31% optimize.opt_after_cconv.c_1 : 0.000031s : 0.24% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000016s : 0.12% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.13% optimize.tuple_transform.d_1 : 0.000043s : 0.33% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.05% optimize.partial_unused_args_eliminate : 0.000004s : 0.03% optimize.add_recomputation : 0.000047s : 0.36% optimize.cse_after_recomputation.cse : 0.000023s : 0.17% optimize.environ_conv : 0.000008s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.06% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000015s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000006s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000003s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000021s : 0.16% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.03% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.07% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.10% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.05% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.16% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000467s : 3.57% validate : 0.000036s : 0.28% Time group info: ------[substitution.] 0.000178 28 1.00% : 0.000002s : 2: substitution.elim_not_effective 0.73% : 0.000001s : 2: substitution.fold_const_symbol 3.09% : 0.000005s : 4: substitution.graph_param_transform 79.71% : 0.000142s : 4: substitution.inline 2.00% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.50% : 0.000004s : 4: substitution.remove_not_recompute_node 2.54% : 0.000005s : 4: substitution.replace_old_param 8.44% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.007804 2 91.51% : 0.007141s : 1: type_inference.infer 8.49% : 0.000663s : 1: type_inference.specialize ------[replace.] 0.000060 8 62.66% : 0.000038s : 4: replace.inline 37.34% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000153 8 91.50% : 0.000140s : 4: match.inline 8.50% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000200 1278 0.94% : 0.000002s : 13: predicate.accumulaten_eliminater 0.76% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 0.85% : 0.000002s : 13: predicate.addn_zero_filter 0.82% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.15% : 0.000004s : 21: predicate.arithmetic_simplify 0.96% : 0.000002s : 13: predicate.cast_eliminate 0.67% : 0.000001s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000000s : 4: predicate.const_output_eliminate 0.57% : 0.000001s : 8: predicate.depend_value_elim 0.96% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.05% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.11% : 0.000002s : 17: predicate.environ_get_depend_swap 1.76% : 0.000004s : 25: predicate.environ_get_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.48% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.48% : 0.000005s : 21: predicate.float_depend_g_call 0.54% : 0.000001s : 8: predicate.float_environ_get_switch 0.76% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.64% : 0.000001s : 8: predicate.get_grad_eliminate 0.23% : 0.000000s : 4: predicate.graph_param_transform 0.65% : 0.000001s : 8: predicate.incorporate_call 0.50% : 0.000001s : 8: predicate.incorporate_call_switch 6.26% : 0.000012s : 58: predicate.inline 0.96% : 0.000002s : 8: predicate.inline_without_move 0.37% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.80% : 0.000002s : 8: predicate.less_batch_normalization 1.79% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.60% : 0.000005s : 38: predicate.load_eliminater 0.83% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.41% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.63% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.58% : 0.000001s : 8: predicate.merge_addn 0.57% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 13: predicate.minmaximum_grad 0.89% : 0.000002s : 4: predicate.mutable_eliminate 0.38% : 0.000001s : 4: predicate.opt_reshape 0.39% : 0.000001s : 4: predicate.parallel_virtual_node 1.79% : 0.000004s : 21: predicate.partial_defer_inline 1.74% : 0.000003s : 21: predicate.partial_eliminate 0.98% : 0.000002s : 13: predicate.print_const_string_wrapper 0.62% : 0.000001s : 8: predicate.reduce_all_const_elim 1.13% : 0.000002s : 13: predicate.reduce_eliminate 2.54% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.53% : 0.000001s : 8: predicate.remove_not_recompute_node 1.48% : 0.000003s : 25: predicate.replace_applicator 0.43% : 0.000001s : 8: predicate.replace_old_param 0.29% : 0.000001s : 4: predicate.reset_defer_inline 0.93% : 0.000002s : 13: predicate.reshape_eliminate 0.62% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 4: predicate.row_tensor_eliminate 0.81% : 0.000002s : 8: predicate.same_eliminate 0.41% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.86% : 0.000002s : 8: predicate.shard_identity_eliminate 0.68% : 0.000001s : 8: predicate.special_op_eliminate 0.70% : 0.000001s : 8: predicate.specialize_transform 0.80% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.55% : 0.000003s : 21: predicate.switch_defer_inline 2.16% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.26% : 0.000011s : 67: predicate.switch_simplify 0.97% : 0.000002s : 13: predicate.tile_eliminate 0.95% : 0.000002s : 13: predicate.transpose_eliminate 1.51% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.28% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.40% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.20% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.85% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.53% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.19% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 4: predicate.value_based_eliminate 0.64% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000526 11 55.12% : 0.000290s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.88% : 0.000236s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028217 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.98% : 0.003100s : 1: add_attr 10.93% : 0.003085s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.18% : 0.000050s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000070s : 1: auto_monad 0.10% : 0.000028s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.69% : 0.000478s : 1: bootstrap 0.11% : 0.000031s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.06% : 0.000017s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.15% : 0.000043s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000017s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.11% : 0.000031s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000007s : 1: inline 0.03% : 0.000010s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.55% : 0.000438s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.85% : 0.000522s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.05% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000014s : 1: opt.transform.mutable_eliminate 4.30% : 0.001214s : 78: opt.transform.opt_a 0.11% : 0.000030s : 1: opt.transform.opt_after_cconv 0.08% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.36% : 0.000101s : 28: opt.transform.opt_b 0.17% : 0.000048s : 2: opt.transform.opt_trans_graph 0.13% : 0.000037s : 4: opt.transform.symbol_engine_opt 10.67% : 0.003010s : 1: opt_a 0.45% : 0.000127s : 1: opt_after_cconv 1.69% : 0.000478s : 1: opt_after_jit_grad 0.95% : 0.000268s : 1: opt_b 19.82% : 0.005594s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000024s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000042s : 1: pre_auto_parallel 0.13% : 0.000036s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.19% : 0.000335s : 1: renormalize.infer 1.00% : 0.000282s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000041s : 1: rewriter_after_opt_a 0.32% : 0.000089s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000098s : 1: symbol_engine_optimizer 0.33% : 0.000092s : 1: tuple_transform 27.98% : 0.007896s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:47.618.465 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0163967, [21] [bootstrap]: 0.00046909 [type_inference]: 0.00573158 [event_method]: 1.873e-05 [auto_monad]: 6.599e-05 [graph_reusing]: 5.72999e-06 [inline]: 2.29999e-06 [add_attr]: 0.00426128, [1] [add_attr_with_inline]: 0.00425052, [1] [Cycle 1]: 7.666e-05, [2] [tag_attr]: 2.253e-05 [meta_addattr_fg_expand]: 5.78002e-06 [parallel-infer-symbol]: 3.6e-06 [pre_auto_parallel]: 3.752e-05 [insert-virtual-dataset]: 2.35002e-06 [parallel-infer-symbol-second]: 9.70002e-07 [dataset_repeat_opt]: 2.29001e-06 [pipeline_split]: 1.77001e-06 [optimize]: 0.00511844, [53] [py_interpret_to_execute]: 2.913e-05 [rewriter_before_opt_a]: 8.853e-05 [opt_a]: 0.00302168, [2] [Cycle 1]: 0.00234045, [45] [expand_dump_flag]: 3.21999e-06 [switch_simplify]: 4.568e-05 [loop_unroll]: 3.063e-05 [a_1]: 0.00067561 [with_stream_mark]: 2.152e-05 [recompute_prepare]: 1.143e-05 [updatestate_depend_eliminate]: 4.74002e-06 [updatestate_assign_eliminate]: 3.74002e-06 [updatestate_loads_eliminate]: 2.76e-06 [parameter_eliminate]: 2.04999e-06 [a_2]: 8.276e-05 [accelerated_algorithm]: 7.07002e-06 [shard]: 2.37001e-06 [meta_shard_fg_expand]: 1.80001e-06 [shard_inline]: 6.53e-06 [merge_send_recv]: 9.64999e-06 [auto_parallel]: 7.22002e-06 [parallel]: 1.92e-05 [flash_sp]: 9.62999e-06 [merge_comm]: 3.88999e-06 [allreduce_fusion]: 3.73001e-06 [matmul_add_comm_reduction]: 9.00001e-06 [allreduce_slice_to_reducescatter]: 7.79983e-07 [virtual_shard_identity]: 8.55001e-06 [virtual_dataset]: 7.58999e-06 [get_grad_eliminate_]: 6.68e-06 [virtual_output]: 6.46999e-06 [merge_forward]: 5.39e-06 [cell_reuse_recompute_pass]: 1.55001e-06 [offload_activation]: 1.091e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.495e-05 [merge_recompute_call_nodes]: 2.07999e-06 [before_grad]: 1.15e-05 [set_forward_comm_id_for_comm_node_pass]: 5.05001e-06 [meta_fg_expand]: 3.18e-06 [flash_sp_send_recv_attached]: 3.06999e-06 [receive_attached]: 2.56e-06 [after_resolve]: 1.297e-05 [a_after_grad]: 1.049e-05 [renormalize]: 0.00087656 [add_forward_monad_depend]: 6.98998e-06 [auto_monad_grad]: 2.74999e-06 [auto_monad_eliminator]: 1.96e-05 [cse]: 2.984e-05 [a_3]: 5.301e-05 [Cycle 2]: 0.00066999, [45] [expand_dump_flag]: 2.46998e-06 [switch_simplify]: 8.39998e-06 [loop_unroll]: 6.25002e-06 [a_1]: 0.00013056 [with_stream_mark]: 1.495e-05 [recompute_prepare]: 6.28e-06 [updatestate_depend_eliminate]: 3.27002e-06 [updatestate_assign_eliminate]: 2.40002e-06 [updatestate_loads_eliminate]: 2.71e-06 [parameter_eliminate]: 1.54998e-06 [a_2]: 7.249e-05 [accelerated_algorithm]: 6.58998e-06 [shard]: 2.69999e-06 [meta_shard_fg_expand]: 1.76e-06 [shard_inline]: 6.21e-06 [merge_send_recv]: 5.91e-06 [auto_parallel]: 6.28e-06 [parallel]: 6.46e-06 [flash_sp]: 6.58e-06 [merge_comm]: 3.26999e-06 [allreduce_fusion]: 3.11999e-06 [matmul_add_comm_reduction]: 6.98e-06 [allreduce_slice_to_reducescatter]: 5.19998e-07 [virtual_shard_identity]: 6.84001e-06 [virtual_dataset]: 5.91998e-06 [get_grad_eliminate_]: 5.66998e-06 [virtual_output]: 5.47001e-06 [merge_forward]: 3.41001e-06 [cell_reuse_recompute_pass]: 1.99e-06 [offload_activation]: 8.27e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.614e-05 [merge_recompute_call_nodes]: 8.89995e-07 [before_grad]: 9.42001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.74002e-06 [meta_fg_expand]: 2.49999e-06 [flash_sp_send_recv_attached]: 9.29984e-07 [receive_attached]: 1.69e-06 [after_resolve]: 1.154e-05 [a_after_grad]: 9.13002e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.30999e-06 [auto_monad_grad]: 1.27e-06 [auto_monad_eliminator]: 8.44002e-06 [cse]: 1.335e-05 [a_3]: 3.555e-05 [py_interpret_to_execute_after_opt_a]: 1.257e-05 [slice_cell_reuse_recomputed_activation]: 2.86e-06 [rewriter_after_opt_a]: 3.775e-05 [convert_after_rewriter]: 6.78998e-06 [order_py_execute_after_rewriter]: 5.27001e-06 [mutable_eliminate]: 0.00058091 [opt_b]: 0.00020182, [1] [Cycle 1]: 0.00019452, [7] [b_1]: 0.0001225 [b_2]: 8.12998e-06 [updatestate_depend_eliminate]: 5.71998e-06 [updatestate_assign_eliminate]: 2.37999e-06 [updatestate_loads_eliminate]: 2.27999e-06 [renormalize]: 6.60017e-07 [cse]: 1.762e-05 [optimize_parallel_all_gather_comm]: 1.589e-05 [overlap_param_gather]: 1.97999e-06 [cconv]: 2.86e-05 [loop_unroll]: 0.00041498 [opt_after_cconv]: 9.792e-05, [1] [Cycle 1]: 9.238e-05, [7] [c_1]: 3.011e-05 [parameter_eliminate]: 2.60002e-06 [updatestate_depend_eliminate]: 5.31002e-06 [updatestate_assign_eliminate]: 2.39001e-06 [updatestate_loads_eliminate]: 2.24001e-06 [cse]: 1.629e-05 [renormalize]: 3.30008e-07 [remove_dup_value]: 1.287e-05 [tuple_transform]: 7.446e-05, [1] [Cycle 1]: 7.017e-05, [4] [d_1]: 4.327e-05 [none_parameter_eliminate]: 1.55999e-06 [renormalize]: 2.70025e-07 [switch_simplify]: 6.79999e-06 [partial_unused_args_eliminate]: 1.84998e-06 [add_recomputation]: 6.891e-05 [cse_after_recomputation]: 2.102e-05, [1] [Cycle 1]: 1.657e-05, [1] [cse]: 1.123e-05 [environ_conv]: 5.40999e-06 [swap_dp_allreduce_reducescatter]: 5.09998e-06 [bias_add_comm_swap]: 2.64001e-06 [label_micro_interleaved_index]: 4.13001e-06 [label_fine_grained_interleaved_index]: 2.79001e-06 [merge_cast_opt]: 1.52999e-06 [slice_recompute_activation]: 2.32001e-06 [micro_interleaved_order_control]: 2.44001e-06 [assign_add_opt]: 1.14e-06 [ForceFp32Comm]: 8.00006e-07 [remove_cast_before_assign_add]: 9.69972e-07 [full_micro_interleaved_order_control]: 2.04e-06 [reorder_send_recv_between_fp_bp]: 2.91999e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 1.15001e-06 [interleave_split_concat_branches]: 1.14998e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.25001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.86e-06 [control_data_broadcast_order]: 1.205e-05 [grouped_pairwise_exchange_alltoall]: 1.40001e-06 [offloading_packed_experts]: 4.08001e-06 [overlap_recompute_and_grad_model_parallel]: 5.49998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.40001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.19e-06 [overlap_recompute_comm]: 2.40002e-06 [overlap_grad_ring_attention]: 4.82e-06 [overlap_grad_flash_sp]: 1.9e-05 [begin_end_overlap_inline]: 5.40022e-07 [split_matmul_comm_elemetwise]: 2.05002e-06 [split_layernorm_comm]: 2.28998e-06 [handle_group_info]: 1.10001e-06 [symbol_engine_optimizer]: 7.364e-05, [1] [Cycle 1]: 6.96e-05, [6] [build]: 3.41001e-06 [elim_shapecalc]: 9.68002e-06 [elim_not_effective]: 1.272e-05 [opt_reshape]: 6.46e-06 [fold_const_symbol]: 9.84999e-06 [renormalize]: 2.10013e-07 [detach_backward]: 2.27001e-06 [pipeline_parallel_scheduler]: 1.49e-06 [auto_monad_reorder]: 1.667e-05 [get_jit_bprop_graph]: 2.01e-06 [rewriter_after_jit_bprop_graph]: 4.36002e-06 [opt_after_jit_grad]: 0.00045048 [validate]: 4.034e-05 Sums bootstrap : 0.000469s : 4.21% type_inference : 0.005732s : 51.41% event_method : 0.000019s : 0.17% auto_monad : 0.000066s : 0.59% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000038s : 0.34% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000029s : 0.26% optimize.rewriter_before_opt_a : 0.000089s : 0.79% optimize.opt_a.expand_dump_flag : 0.000006s : 0.05% optimize.opt_a.switch_simplify : 0.000054s : 0.49% optimize.opt_a.loop_unroll : 0.000037s : 0.33% optimize.opt_a.a_1 : 0.000806s : 7.23% optimize.opt_a.with_stream_mark : 0.000036s : 0.33% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000155s : 1.39% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.12% optimize.opt_a.shard : 0.000005s : 0.05% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.11% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000026s : 0.23% optimize.opt_a.flash_sp : 0.000016s : 0.15% optimize.opt_a.merge_comm : 0.000007s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.14% optimize.opt_a.virtual_dataset : 0.000014s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000021s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000025s : 0.22% optimize.opt_a.a_after_grad : 0.000020s : 0.18% optimize.opt_a.renormalize : 0.000877s : 7.86% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.25% optimize.opt_a.cse : 0.000043s : 0.39% optimize.opt_a.a_3 : 0.000089s : 0.79% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.03% optimize.rewriter_after_opt_a : 0.000038s : 0.34% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000581s : 5.21% optimize.opt_b.b_1 : 0.000123s : 1.10% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000018s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.14% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000029s : 0.26% optimize.loop_unroll : 0.000415s : 3.72% optimize.opt_after_cconv.c_1 : 0.000030s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000016s : 0.15% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.12% optimize.tuple_transform.d_1 : 0.000043s : 0.39% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000069s : 0.62% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000019s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000017s : 0.15% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000450s : 4.04% validate : 0.000040s : 0.36% Time group info: ------[substitution.] 0.000200 28 0.90% : 0.000002s : 2: substitution.elim_not_effective 0.65% : 0.000001s : 2: substitution.fold_const_symbol 3.05% : 0.000006s : 4: substitution.graph_param_transform 78.94% : 0.000158s : 4: substitution.inline 2.45% : 0.000005s : 4: substitution.j_node_and_user_rematch 3.43% : 0.000007s : 4: substitution.remove_not_recompute_node 2.70% : 0.000005s : 4: substitution.replace_old_param 7.87% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005669 2 87.91% : 0.004984s : 1: type_inference.infer 12.09% : 0.000685s : 1: type_inference.specialize ------[replace.] 0.000069 8 59.40% : 0.000041s : 4: replace.inline 40.60% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000169 8 91.80% : 0.000155s : 4: match.inline 8.20% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000203 1278 0.90% : 0.000002s : 13: predicate.accumulaten_eliminater 0.75% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 0.89% : 0.000002s : 13: predicate.addn_zero_filter 0.81% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.13% : 0.000004s : 21: predicate.arithmetic_simplify 0.91% : 0.000002s : 13: predicate.cast_eliminate 0.61% : 0.000001s : 8: predicate.check_bprop_eliminate 0.51% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000000s : 4: predicate.const_output_eliminate 0.63% : 0.000001s : 8: predicate.depend_value_elim 0.93% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.07% : 0.000002s : 13: predicate.dict_get_item_eliminator 1.06% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.00% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 4: predicate.elim_not_effective 0.39% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_depend_swap 1.65% : 0.000003s : 25: predicate.environ_get_eliminate 1.13% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.48% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.57% : 0.000005s : 21: predicate.float_depend_g_call 0.59% : 0.000001s : 8: predicate.float_environ_get_switch 0.79% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.67% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.61% : 0.000001s : 8: predicate.incorporate_call 0.49% : 0.000001s : 8: predicate.incorporate_call_switch 6.51% : 0.000013s : 58: predicate.inline 0.70% : 0.000001s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 8: predicate.less_batch_normalization 1.74% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.47% : 0.000005s : 38: predicate.load_eliminater 0.81% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.37% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.67% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.48% : 0.000001s : 8: predicate.merge_addn 0.64% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 13: predicate.minmaximum_grad 1.12% : 0.000002s : 4: predicate.mutable_eliminate 0.31% : 0.000001s : 4: predicate.opt_reshape 0.31% : 0.000001s : 4: predicate.parallel_virtual_node 1.83% : 0.000004s : 21: predicate.partial_defer_inline 1.65% : 0.000003s : 21: predicate.partial_eliminate 0.92% : 0.000002s : 13: predicate.print_const_string_wrapper 0.59% : 0.000001s : 8: predicate.reduce_all_const_elim 1.30% : 0.000003s : 13: predicate.reduce_eliminate 2.51% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 8: predicate.remove_not_recompute_node 1.60% : 0.000003s : 25: predicate.replace_applicator 0.62% : 0.000001s : 8: predicate.replace_old_param 0.29% : 0.000001s : 4: predicate.reset_defer_inline 0.93% : 0.000002s : 13: predicate.reshape_eliminate 0.62% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.82% : 0.000002s : 8: predicate.same_eliminate 0.54% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.72% : 0.000001s : 8: predicate.shard_identity_eliminate 0.63% : 0.000001s : 8: predicate.special_op_eliminate 0.70% : 0.000001s : 8: predicate.specialize_transform 0.85% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.93% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.57% : 0.000003s : 21: predicate.switch_defer_inline 2.01% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.49% : 0.000011s : 67: predicate.switch_simplify 0.88% : 0.000002s : 13: predicate.tile_eliminate 0.90% : 0.000002s : 13: predicate.transpose_eliminate 1.47% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.13% : 0.000006s : 33: predicate.tuple_list_get_item_eliminator 1.41% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.11% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.75% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.44% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.40% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 4: predicate.value_based_eliminate 0.79% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000554 11 53.52% : 0.000297s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.48% : 0.000258s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028023 192 0.01% : 0.000004s : 1: ForceFp32Comm 15.23% : 0.004267s : 1: add_attr 15.18% : 0.004255s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.26% : 0.000073s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.26% : 0.000072s : 1: auto_monad 0.07% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.79% : 0.000501s : 1: bootstrap 0.11% : 0.000032s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.05% : 0.000015s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.09% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.09% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.51% : 0.000423s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.10% : 0.000590s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 4.43% : 0.001241s : 78: opt.transform.opt_a 0.10% : 0.000029s : 1: opt.transform.opt_after_cconv 0.09% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.35% : 0.000099s : 28: opt.transform.opt_b 0.17% : 0.000048s : 2: opt.transform.opt_trans_graph 0.13% : 0.000035s : 4: opt.transform.symbol_engine_opt 10.79% : 0.003025s : 1: opt_a 0.36% : 0.000102s : 1: opt_after_cconv 1.64% : 0.000459s : 1: opt_after_jit_grad 0.73% : 0.000206s : 1: opt_b 18.28% : 0.005123s : 1: optimize 0.07% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.08% : 0.000022s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000042s : 1: pre_auto_parallel 0.12% : 0.000034s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000016s : 1: remove_dup_value 1.80% : 0.000505s : 1: renormalize.infer 1.28% : 0.000360s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000042s : 1: rewriter_after_opt_a 0.33% : 0.000093s : 1: rewriter_before_opt_a 0.02% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.27% : 0.000076s : 1: symbol_engine_optimizer 0.28% : 0.000078s : 1: tuple_transform 20.52% : 0.005751s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:47.943.016 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:47.943.323 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0169872, [21] [bootstrap]: 0.00046052 [type_inference]: 0.00602431 [event_method]: 2.02e-05 [auto_monad]: 6.533e-05 [graph_reusing]: 6.17999e-06 [inline]: 2.30002e-06 [add_attr]: 0.00318148, [1] [add_attr_with_inline]: 0.00317188, [1] [Cycle 1]: 7.713e-05, [2] [tag_attr]: 1.992e-05 [meta_addattr_fg_expand]: 6.86001e-06 [parallel-infer-symbol]: 3.16001e-06 [pre_auto_parallel]: 3.574e-05 [insert-virtual-dataset]: 2.56e-06 [parallel-infer-symbol-second]: 8.30012e-07 [dataset_repeat_opt]: 1.91e-06 [pipeline_split]: 1.62999e-06 [optimize]: 0.00596996, [53] [py_interpret_to_execute]: 3.043e-05 [rewriter_before_opt_a]: 9.084e-05 [opt_a]: 0.00344591, [2] [Cycle 1]: 0.00246584, [45] [expand_dump_flag]: 3.36999e-06 [switch_simplify]: 4.385e-05 [loop_unroll]: 3.114e-05 [a_1]: 0.00067785 [with_stream_mark]: 1.796e-05 [recompute_prepare]: 1.265e-05 [updatestate_depend_eliminate]: 5.24e-06 [updatestate_assign_eliminate]: 3.88001e-06 [updatestate_loads_eliminate]: 3.88999e-06 [parameter_eliminate]: 2.26e-06 [a_2]: 0.0001322 [accelerated_algorithm]: 8.99e-06 [shard]: 1.96e-06 [meta_shard_fg_expand]: 2.12001e-06 [shard_inline]: 7.83999e-06 [merge_send_recv]: 9.97001e-06 [auto_parallel]: 7.71001e-06 [parallel]: 1.808e-05 [flash_sp]: 9.01998e-06 [merge_comm]: 4.67e-06 [allreduce_fusion]: 4.70999e-06 [matmul_add_comm_reduction]: 1.14e-05 [allreduce_slice_to_reducescatter]: 5.79981e-07 [virtual_shard_identity]: 1.045e-05 [virtual_dataset]: 8.33999e-06 [get_grad_eliminate_]: 7.73999e-06 [virtual_output]: 7.98999e-06 [merge_forward]: 4.53999e-06 [cell_reuse_recompute_pass]: 1.40999e-06 [offload_activation]: 1.147e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.873e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.338e-05 [set_forward_comm_id_for_comm_node_pass]: 5.04998e-06 [meta_fg_expand]: 3.51999e-06 [flash_sp_send_recv_attached]: 2.86999e-06 [receive_attached]: 2.24001e-06 [after_resolve]: 1.424e-05 [a_after_grad]: 1.21e-05 [renormalize]: 0.00077754 [add_forward_monad_depend]: 6.99001e-06 [auto_monad_grad]: 2.35002e-06 [auto_monad_eliminator]: 1.814e-05 [cse]: 3.606e-05 [a_3]: 7.487e-05 [Cycle 2]: 0.00096499, [45] [expand_dump_flag]: 1.54998e-06 [switch_simplify]: 9.14e-06 [loop_unroll]: 7.4e-06 [a_1]: 0.0001766 [with_stream_mark]: 1.517e-05 [recompute_prepare]: 7.82998e-06 [updatestate_depend_eliminate]: 3.81001e-06 [updatestate_assign_eliminate]: 3.51001e-06 [updatestate_loads_eliminate]: 3.32997e-06 [parameter_eliminate]: 1.89e-06 [a_2]: 0.00011951 [accelerated_algorithm]: 7.98001e-06 [shard]: 1.19e-06 [meta_shard_fg_expand]: 1.81e-06 [shard_inline]: 7.55e-06 [merge_send_recv]: 7.45998e-06 [auto_parallel]: 6.36e-06 [parallel]: 5.78002e-06 [flash_sp]: 3.76999e-06 [merge_comm]: 4.82998e-06 [allreduce_fusion]: 4.32e-06 [matmul_add_comm_reduction]: 7.82e-06 [allreduce_slice_to_reducescatter]: 2.80008e-07 [virtual_shard_identity]: 1.167e-05 [virtual_dataset]: 7.38999e-06 [get_grad_eliminate_]: 7.17997e-06 [virtual_output]: 7.07002e-06 [merge_forward]: 4.72998e-06 [cell_reuse_recompute_pass]: 2.15002e-06 [offload_activation]: 8.69998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.696e-05 [merge_recompute_call_nodes]: 1.07e-06 [before_grad]: 1.236e-05 [set_forward_comm_id_for_comm_node_pass]: 4.72998e-06 [meta_fg_expand]: 2.78e-06 [flash_sp_send_recv_attached]: 1.11002e-06 [receive_attached]: 1.09e-06 [after_resolve]: 1.383e-05 [a_after_grad]: 1.122e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.76998e-06 [auto_monad_grad]: 1.45001e-06 [auto_monad_eliminator]: 1.124e-05 [cse]: 2.016e-05 [a_3]: 6.102e-05 [py_interpret_to_execute_after_opt_a]: 1.608e-05 [slice_cell_reuse_recomputed_activation]: 5.60001e-06 [rewriter_after_opt_a]: 6.717e-05 [convert_after_rewriter]: 1.142e-05 [order_py_execute_after_rewriter]: 9.55001e-06 [mutable_eliminate]: 0.00057577 [opt_b]: 0.00031624, [1] [Cycle 1]: 0.00030621, [7] [b_1]: 0.00019734 [b_2]: 1.021e-05 [updatestate_depend_eliminate]: 7.3e-06 [updatestate_assign_eliminate]: 3.41999e-06 [updatestate_loads_eliminate]: 3.28e-06 [renormalize]: 8.2e-07 [cse]: 2.654e-05 [optimize_parallel_all_gather_comm]: 2.261e-05 [overlap_param_gather]: 5.10001e-06 [cconv]: 3.327e-05 [loop_unroll]: 0.0004542 [opt_after_cconv]: 0.00014634, [1] [Cycle 1]: 0.00013706, [7] [c_1]: 3.814e-05 [parameter_eliminate]: 3.83999e-06 [updatestate_depend_eliminate]: 6.40002e-06 [updatestate_assign_eliminate]: 3.82002e-06 [updatestate_loads_eliminate]: 3.09001e-06 [cse]: 2.505e-05 [renormalize]: 9.5999e-07 [remove_dup_value]: 1.837e-05 [tuple_transform]: 0.00010173, [1] [Cycle 1]: 9.376e-05, [4] [d_1]: 5.227e-05 [none_parameter_eliminate]: 1.66e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 8.44998e-06 [partial_unused_args_eliminate]: 4.4e-06 [add_recomputation]: 6.083e-05 [cse_after_recomputation]: 3.384e-05, [1] [Cycle 1]: 2.633e-05, [1] [cse]: 1.673e-05 [environ_conv]: 9.69e-06 [swap_dp_allreduce_reducescatter]: 9.37001e-06 [bias_add_comm_swap]: 5.35001e-06 [label_micro_interleaved_index]: 7.22002e-06 [label_fine_grained_interleaved_index]: 5.67001e-06 [merge_cast_opt]: 3.94002e-06 [slice_recompute_activation]: 4.49002e-06 [micro_interleaved_order_control]: 4.65001e-06 [assign_add_opt]: 3.57002e-06 [ForceFp32Comm]: 3.18998e-06 [remove_cast_before_assign_add]: 3.56001e-06 [full_micro_interleaved_order_control]: 4.50999e-06 [reorder_send_recv_between_fp_bp]: 5.04e-06 [comm_op_add_attrs]: 3.61001e-06 [add_comm_op_reuse_tag]: 3.41001e-06 [interleave_split_concat_branches]: 3.59002e-06 [interleave_parallel_branches]: 3.91999e-06 [overlap_opt_shard_in_pipeline]: 3.45e-06 [overlap_opt_shard_grad_in_pipeline]: 4.43999e-06 [control_data_broadcast_order]: 1.844e-05 [grouped_pairwise_exchange_alltoall]: 3.88001e-06 [offloading_packed_experts]: 7.95998e-06 [overlap_recompute_and_grad_model_parallel]: 8.59e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.74002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.80998e-06 [overlap_recompute_comm]: 5.49e-06 [overlap_grad_ring_attention]: 7.23999e-06 [overlap_grad_flash_sp]: 2.72e-05 [begin_end_overlap_inline]: 3.08e-06 [split_matmul_comm_elemetwise]: 4.92e-06 [split_layernorm_comm]: 4.72998e-06 [handle_group_info]: 3.81999e-06 [symbol_engine_optimizer]: 0.00010754, [1] [Cycle 1]: 0.00010111, [6] [build]: 3.37997e-06 [elim_shapecalc]: 1.188e-05 [elim_not_effective]: 1.634e-05 [opt_reshape]: 8.38001e-06 [fold_const_symbol]: 1.338e-05 [renormalize]: 4.39992e-07 [detach_backward]: 3.4e-06 [pipeline_parallel_scheduler]: 1.76998e-06 [auto_monad_reorder]: 2.268e-05 [get_jit_bprop_graph]: 1.94e-06 [rewriter_after_jit_bprop_graph]: 5.17999e-06 [opt_after_jit_grad]: 0.00051174 [validate]: 4.431e-05 Sums bootstrap : 0.000461s : 3.84% type_inference : 0.006024s : 50.25% event_method : 0.000020s : 0.17% auto_monad : 0.000065s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000036s : 0.30% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.25% optimize.rewriter_before_opt_a : 0.000091s : 0.76% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.44% optimize.opt_a.loop_unroll : 0.000039s : 0.32% optimize.opt_a.a_1 : 0.000854s : 7.13% optimize.opt_a.with_stream_mark : 0.000033s : 0.28% optimize.opt_a.recompute_prepare : 0.000020s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000252s : 2.10% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.13% optimize.opt_a.merge_send_recv : 0.000017s : 0.15% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000024s : 0.20% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.18% optimize.opt_a.virtual_dataset : 0.000016s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000028s : 0.23% optimize.opt_a.a_after_grad : 0.000023s : 0.19% optimize.opt_a.renormalize : 0.000778s : 6.49% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.25% optimize.opt_a.cse : 0.000056s : 0.47% optimize.opt_a.a_3 : 0.000136s : 1.13% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.05% optimize.rewriter_after_opt_a : 0.000067s : 0.56% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000010s : 0.08% optimize.mutable_eliminate : 0.000576s : 4.80% optimize.opt_b.b_1 : 0.000197s : 1.65% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000027s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000033s : 0.28% optimize.loop_unroll : 0.000454s : 3.79% optimize.opt_after_cconv.c_1 : 0.000038s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000025s : 0.21% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.15% optimize.tuple_transform.d_1 : 0.000052s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000061s : 0.51% optimize.cse_after_recomputation.cse : 0.000017s : 0.14% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000027s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000005s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000023s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000512s : 4.27% validate : 0.000044s : 0.37% Time group info: ------[substitution.] 0.000211 38 11.85% : 0.000025s : 3: substitution.cast_eliminate 1.19% : 0.000002s : 3: substitution.elim_not_effective 0.84% : 0.000002s : 3: substitution.fold_const_symbol 3.24% : 0.000007s : 5: substitution.graph_param_transform 68.93% : 0.000145s : 4: substitution.inline 2.30% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.94% : 0.000006s : 6: substitution.remove_not_recompute_node 2.85% : 0.000006s : 4: substitution.replace_old_param 5.86% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005968 2 87.16% : 0.005202s : 1: type_inference.infer 12.84% : 0.000766s : 1: type_inference.specialize ------[replace.] 0.000062 8 62.45% : 0.000039s : 4: replace.inline 37.55% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000153 8 93.13% : 0.000143s : 4: match.inline 6.87% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000239 1504 0.90% : 0.000002s : 15: predicate.accumulaten_eliminater 0.88% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.57% : 0.000001s : 10: predicate.addn_check_dump 0.86% : 0.000002s : 15: predicate.addn_zero_filter 0.80% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.86% : 0.000004s : 25: predicate.arithmetic_simplify 1.05% : 0.000003s : 15: predicate.cast_eliminate 0.65% : 0.000002s : 10: predicate.check_bprop_eliminate 0.61% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.70% : 0.000002s : 10: predicate.depend_value_elim 0.94% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.02% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.93% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.42% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_depend_swap 1.82% : 0.000004s : 30: predicate.environ_get_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.35% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.15% : 0.000005s : 23: predicate.float_depend_g_call 0.59% : 0.000001s : 10: predicate.float_environ_get_switch 0.85% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.71% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000000s : 5: predicate.graph_param_transform 0.67% : 0.000002s : 10: predicate.incorporate_call 0.56% : 0.000001s : 10: predicate.incorporate_call_switch 6.19% : 0.000015s : 68: predicate.inline 0.95% : 0.000002s : 10: predicate.inline_without_move 0.36% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.83% : 0.000002s : 10: predicate.less_batch_normalization 1.74% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.59% : 0.000006s : 44: predicate.load_eliminater 0.88% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.19% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.59% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.64% : 0.000002s : 10: predicate.merge_addn 0.59% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.67% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 15: predicate.minmaximum_grad 0.92% : 0.000002s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.39% : 0.000001s : 5: predicate.parallel_virtual_node 1.80% : 0.000004s : 23: predicate.partial_defer_inline 1.66% : 0.000004s : 24: predicate.partial_eliminate 0.99% : 0.000002s : 15: predicate.print_const_string_wrapper 0.66% : 0.000002s : 10: predicate.reduce_all_const_elim 1.13% : 0.000003s : 15: predicate.reduce_eliminate 2.53% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 10: predicate.remove_not_recompute_node 1.43% : 0.000003s : 29: predicate.replace_applicator 0.67% : 0.000002s : 10: predicate.replace_old_param 0.36% : 0.000001s : 5: predicate.reset_defer_inline 0.97% : 0.000002s : 15: predicate.reshape_eliminate 0.74% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 5: predicate.row_tensor_eliminate 0.83% : 0.000002s : 10: predicate.same_eliminate 0.52% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.82% : 0.000002s : 10: predicate.shard_identity_eliminate 0.71% : 0.000002s : 10: predicate.special_op_eliminate 0.87% : 0.000002s : 10: predicate.specialize_transform 0.95% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.40% : 0.000003s : 23: predicate.switch_defer_inline 1.96% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.86% : 0.000012s : 74: predicate.switch_simplify 0.86% : 0.000002s : 15: predicate.tile_eliminate 0.93% : 0.000002s : 15: predicate.transpose_eliminate 1.55% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.54% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.44% : 0.000003s : 25: predicate.tuple_list_get_set_item_eliminator 2.36% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.74% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.47% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.16% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.52% : 0.000001s : 5: predicate.value_based_eliminate 0.68% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.68% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000571 11 54.98% : 0.000314s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.02% : 0.000257s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028509 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.19% : 0.003191s : 1: add_attr 11.14% : 0.003176s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000065s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000076s : 1: auto_monad 0.10% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.78% : 0.000507s : 1: bootstrap 0.13% : 0.000037s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000015s : 1: convert_after_rewriter 0.13% : 0.000037s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000020s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.11% : 0.000030s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.62% : 0.000461s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.04% : 0.000582s : 1: mutable_eliminate 0.04% : 0.000011s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 4.87% : 0.001389s : 78: opt.transform.opt_a 0.13% : 0.000037s : 1: opt.transform.opt_after_cconv 0.11% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.47% : 0.000135s : 28: opt.transform.opt_b 0.21% : 0.000059s : 2: opt.transform.opt_trans_graph 0.16% : 0.000046s : 4: opt.transform.symbol_engine_opt 12.10% : 0.003449s : 1: opt_a 0.53% : 0.000150s : 1: opt_after_cconv 1.83% : 0.000523s : 1: opt_after_jit_grad 1.12% : 0.000320s : 1: opt_b 22.22% : 0.006334s : 1: optimize 0.09% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.11% : 0.000030s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000044s : 1: pre_auto_parallel 0.12% : 0.000034s : 1: py_interpret_to_execute 0.07% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.54% : 0.000440s : 1: renormalize.infer 1.15% : 0.000329s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.25% : 0.000072s : 1: rewriter_after_opt_a 0.33% : 0.000095s : 1: rewriter_before_opt_a 0.03% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000008s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000110s : 1: symbol_engine_optimizer 0.37% : 0.000105s : 1: tuple_transform 21.27% : 0.006064s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:48.405.866 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0314591, [21] [bootstrap]: 0.00046959 [type_inference]: 0.00575026 [event_method]: 1.954e-05 [auto_monad]: 6.185e-05 [graph_reusing]: 5.82001e-06 [inline]: 1.97999e-06 [add_attr]: 0.0194124, [1] [add_attr_with_inline]: 0.0194014, [1] [Cycle 1]: 7.212e-05, [2] [tag_attr]: 2.18e-05 [meta_addattr_fg_expand]: 5.89e-06 [parallel-infer-symbol]: 3.83001e-06 [pre_auto_parallel]: 3.609e-05 [insert-virtual-dataset]: 2.90998e-06 [parallel-infer-symbol-second]: 8.29983e-07 [dataset_repeat_opt]: 2.05002e-06 [pipeline_split]: 1.67999e-06 [optimize]: 0.00502057, [53] [py_interpret_to_execute]: 2.503e-05 [rewriter_before_opt_a]: 8.876e-05 [opt_a]: 0.00296223, [2] [Cycle 1]: 0.00219098, [45] [expand_dump_flag]: 2.89999e-06 [switch_simplify]: 4.413e-05 [loop_unroll]: 3.119e-05 [a_1]: 0.00066909 [with_stream_mark]: 1.532e-05 [recompute_prepare]: 1.004e-05 [updatestate_depend_eliminate]: 4.70001e-06 [updatestate_assign_eliminate]: 3.86001e-06 [updatestate_loads_eliminate]: 3.46999e-06 [parameter_eliminate]: 2.26998e-06 [a_2]: 0.00010215 [accelerated_algorithm]: 8.05999e-06 [shard]: 1.73997e-06 [meta_shard_fg_expand]: 1.92999e-06 [shard_inline]: 7.93001e-06 [merge_send_recv]: 9.10999e-06 [auto_parallel]: 6.59001e-06 [parallel]: 1.818e-05 [flash_sp]: 8.28999e-06 [merge_comm]: 4.50999e-06 [allreduce_fusion]: 4.28001e-06 [matmul_add_comm_reduction]: 1.014e-05 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 8.94e-06 [virtual_dataset]: 8.16002e-06 [get_grad_eliminate_]: 7.85998e-06 [virtual_output]: 7.68999e-06 [merge_forward]: 4.48001e-06 [cell_reuse_recompute_pass]: 1.15999e-06 [offload_activation]: 1.058e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.476e-05 [merge_recompute_call_nodes]: 1.40001e-06 [before_grad]: 1.295e-05 [set_forward_comm_id_for_comm_node_pass]: 4.35e-06 [meta_fg_expand]: 3.22002e-06 [flash_sp_send_recv_attached]: 2.64001e-06 [receive_attached]: 2.51998e-06 [after_resolve]: 1.285e-05 [a_after_grad]: 1.189e-05 [renormalize]: 0.00074212 [add_forward_monad_depend]: 5.37001e-06 [auto_monad_grad]: 2.43e-06 [auto_monad_eliminator]: 1.645e-05 [cse]: 3.794e-05 [a_3]: 5.836e-05 [Cycle 2]: 0.00076079, [45] [expand_dump_flag]: 1.31998e-06 [switch_simplify]: 9.12999e-06 [loop_unroll]: 7.93001e-06 [a_1]: 0.00017682 [with_stream_mark]: 1.252e-05 [recompute_prepare]: 8.25999e-06 [updatestate_depend_eliminate]: 3.8e-06 [updatestate_assign_eliminate]: 2.76999e-06 [updatestate_loads_eliminate]: 2.76e-06 [parameter_eliminate]: 1.19998e-06 [a_2]: 9.341e-05 [accelerated_algorithm]: 7.31001e-06 [shard]: 1.25999e-06 [meta_shard_fg_expand]: 1.53002e-06 [shard_inline]: 7.48e-06 [merge_send_recv]: 6.04001e-06 [auto_parallel]: 6.53e-06 [parallel]: 5.44e-06 [flash_sp]: 3.87002e-06 [merge_comm]: 4.2e-06 [allreduce_fusion]: 4.38001e-06 [matmul_add_comm_reduction]: 7.65e-06 [allreduce_slice_to_reducescatter]: 4.10015e-07 [virtual_shard_identity]: 8.23001e-06 [virtual_dataset]: 7.43e-06 [get_grad_eliminate_]: 7.19001e-06 [virtual_output]: 6.89999e-06 [merge_forward]: 3.2e-06 [cell_reuse_recompute_pass]: 1.28002e-06 [offload_activation]: 7.88999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.32e-05 [merge_recompute_call_nodes]: 1.04e-06 [before_grad]: 1.21e-05 [set_forward_comm_id_for_comm_node_pass]: 4.55999e-06 [meta_fg_expand]: 2.87002e-06 [flash_sp_send_recv_attached]: 8.59989e-07 [receive_attached]: 1.36002e-06 [after_resolve]: 1.162e-05 [a_after_grad]: 1.094e-05 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 1.34e-06 [auto_monad_grad]: 1.38002e-06 [auto_monad_eliminator]: 9.24e-06 [cse]: 1.672e-05 [a_3]: 4.503e-05 [py_interpret_to_execute_after_opt_a]: 1.03e-05 [slice_cell_reuse_recomputed_activation]: 2.36e-06 [rewriter_after_opt_a]: 3.827e-05 [convert_after_rewriter]: 7.23e-06 [order_py_execute_after_rewriter]: 5.43002e-06 [mutable_eliminate]: 0.00048775 [opt_b]: 0.00024181, [1] [Cycle 1]: 0.00023574, [7] [b_1]: 0.00015557 [b_2]: 9.19e-06 [updatestate_depend_eliminate]: 6.26e-06 [updatestate_assign_eliminate]: 3.07002e-06 [updatestate_loads_eliminate]: 2.85998e-06 [renormalize]: 9.10019e-07 [cse]: 2.161e-05 [optimize_parallel_all_gather_comm]: 1.786e-05 [overlap_param_gather]: 2.27999e-06 [cconv]: 2.443e-05 [loop_unroll]: 0.0004133 [opt_after_cconv]: 0.00011201, [1] [Cycle 1]: 0.00010639, [7] [c_1]: 3.683e-05 [parameter_eliminate]: 2.36e-06 [updatestate_depend_eliminate]: 5.94e-06 [updatestate_assign_eliminate]: 3.06999e-06 [updatestate_loads_eliminate]: 3.12002e-06 [cse]: 2.058e-05 [renormalize]: 5.19998e-07 [remove_dup_value]: 1.439e-05 [tuple_transform]: 8.451e-05, [1] [Cycle 1]: 8.022e-05, [4] [d_1]: 5.183e-05 [none_parameter_eliminate]: 1.70001e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 7.98999e-06 [partial_unused_args_eliminate]: 2.06e-06 [add_recomputation]: 5.602e-05 [cse_after_recomputation]: 2.455e-05, [1] [Cycle 1]: 1.955e-05, [1] [cse]: 1.436e-05 [environ_conv]: 5.61998e-06 [swap_dp_allreduce_reducescatter]: 5.56e-06 [bias_add_comm_swap]: 3.10998e-06 [label_micro_interleaved_index]: 4.2e-06 [label_fine_grained_interleaved_index]: 2.81999e-06 [merge_cast_opt]: 1.35999e-06 [slice_recompute_activation]: 1.97999e-06 [micro_interleaved_order_control]: 2.17999e-06 [assign_add_opt]: 1.24998e-06 [ForceFp32Comm]: 7.80012e-07 [remove_cast_before_assign_add]: 1.05999e-06 [full_micro_interleaved_order_control]: 2.49999e-06 [reorder_send_recv_between_fp_bp]: 3.08e-06 [comm_op_add_attrs]: 1.02998e-06 [add_comm_op_reuse_tag]: 1.40999e-06 [interleave_split_concat_branches]: 1.23002e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.15001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.27001e-06 [control_data_broadcast_order]: 1.399e-05 [grouped_pairwise_exchange_alltoall]: 1.79e-06 [offloading_packed_experts]: 4.2e-06 [overlap_recompute_and_grad_model_parallel]: 5.71e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.48e-06 [overlap_grad_ring_attention]: 4.65999e-06 [overlap_grad_flash_sp]: 2.112e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.84999e-06 [split_layernorm_comm]: 1.66002e-06 [handle_group_info]: 1.01997e-06 [symbol_engine_optimizer]: 8.075e-05, [1] [Cycle 1]: 7.663e-05, [6] [build]: 3.01999e-06 [elim_shapecalc]: 1.103e-05 [elim_not_effective]: 1.475e-05 [opt_reshape]: 7.97e-06 [fold_const_symbol]: 1.206e-05 [renormalize]: 2.00002e-07 [detach_backward]: 1.86e-06 [pipeline_parallel_scheduler]: 1.71e-06 [auto_monad_reorder]: 1.976e-05 [get_jit_bprop_graph]: 1.82999e-06 [rewriter_after_jit_bprop_graph]: 3.45998e-06 [opt_after_jit_grad]: 0.00044878 [validate]: 4.144e-05 Sums bootstrap : 0.000470s : 4.23% type_inference : 0.005750s : 51.80% event_method : 0.000020s : 0.18% auto_monad : 0.000062s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000036s : 0.33% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000025s : 0.23% optimize.rewriter_before_opt_a : 0.000089s : 0.80% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.48% optimize.opt_a.loop_unroll : 0.000039s : 0.35% optimize.opt_a.a_1 : 0.000846s : 7.62% optimize.opt_a.with_stream_mark : 0.000028s : 0.25% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000196s : 1.76% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.14% optimize.opt_a.merge_send_recv : 0.000015s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.12% optimize.opt_a.parallel : 0.000024s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.15% optimize.opt_a.virtual_dataset : 0.000016s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.14% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.22% optimize.opt_a.a_after_grad : 0.000023s : 0.21% optimize.opt_a.renormalize : 0.000742s : 6.69% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.23% optimize.opt_a.cse : 0.000055s : 0.49% optimize.opt_a.a_3 : 0.000103s : 0.93% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000038s : 0.34% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000488s : 4.39% optimize.opt_b.b_1 : 0.000156s : 1.40% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000022s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.22% optimize.loop_unroll : 0.000413s : 3.72% optimize.opt_after_cconv.c_1 : 0.000037s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000021s : 0.19% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000014s : 0.13% optimize.tuple_transform.d_1 : 0.000052s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000056s : 0.50% optimize.cse_after_recomputation.cse : 0.000014s : 0.13% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.03% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000003s : 0.03% opt_after_jit_grad : 0.000449s : 4.04% validate : 0.000041s : 0.37% Time group info: ------[substitution.] 0.000206 38 11.79% : 0.000024s : 3: substitution.cast_eliminate 1.05% : 0.000002s : 3: substitution.elim_not_effective 0.80% : 0.000002s : 3: substitution.fold_const_symbol 3.34% : 0.000007s : 5: substitution.graph_param_transform 69.40% : 0.000143s : 4: substitution.inline 2.32% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.90% : 0.000006s : 6: substitution.remove_not_recompute_node 2.23% : 0.000005s : 4: substitution.replace_old_param 6.17% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005693 2 87.96% : 0.005008s : 1: type_inference.infer 12.04% : 0.000685s : 1: type_inference.specialize ------[replace.] 0.000061 8 62.88% : 0.000039s : 4: replace.inline 37.12% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000151 8 92.82% : 0.000141s : 4: match.inline 7.18% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000232 1504 0.89% : 0.000002s : 15: predicate.accumulaten_eliminater 0.65% : 0.000001s : 5: predicate.ad_related_special_op_eliminate 0.63% : 0.000001s : 10: predicate.addn_check_dump 0.89% : 0.000002s : 15: predicate.addn_zero_filter 0.83% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.14% : 0.000005s : 25: predicate.arithmetic_simplify 1.03% : 0.000002s : 15: predicate.cast_eliminate 0.65% : 0.000002s : 10: predicate.check_bprop_eliminate 0.60% : 0.000001s : 10: predicate.compare_switch_simplify 0.21% : 0.000000s : 5: predicate.const_output_eliminate 0.66% : 0.000002s : 10: predicate.depend_value_elim 0.99% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.98% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 5: predicate.elim_not_effective 0.38% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.16% : 0.000003s : 20: predicate.environ_get_depend_swap 1.73% : 0.000004s : 30: predicate.environ_get_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.42% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.34% : 0.000005s : 23: predicate.float_depend_g_call 0.59% : 0.000001s : 10: predicate.float_environ_get_switch 0.86% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.63% : 0.000001s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.68% : 0.000002s : 10: predicate.incorporate_call 0.55% : 0.000001s : 10: predicate.incorporate_call_switch 6.21% : 0.000014s : 68: predicate.inline 0.85% : 0.000002s : 10: predicate.inline_without_move 0.35% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.81% : 0.000002s : 10: predicate.less_batch_normalization 1.81% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.53% : 0.000006s : 44: predicate.load_eliminater 0.86% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.21% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.64% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.64% : 0.000001s : 10: predicate.merge_addn 0.59% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 10: predicate.mini_step_allgather_replace 1.05% : 0.000002s : 15: predicate.minmaximum_grad 0.89% : 0.000002s : 5: predicate.mutable_eliminate 0.38% : 0.000001s : 5: predicate.opt_reshape 0.34% : 0.000001s : 5: predicate.parallel_virtual_node 1.69% : 0.000004s : 23: predicate.partial_defer_inline 1.67% : 0.000004s : 24: predicate.partial_eliminate 0.90% : 0.000002s : 15: predicate.print_const_string_wrapper 0.62% : 0.000001s : 10: predicate.reduce_all_const_elim 1.20% : 0.000003s : 15: predicate.reduce_eliminate 2.56% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 10: predicate.remove_not_recompute_node 1.40% : 0.000003s : 29: predicate.replace_applicator 0.53% : 0.000001s : 10: predicate.replace_old_param 0.26% : 0.000001s : 5: predicate.reset_defer_inline 0.95% : 0.000002s : 15: predicate.reshape_eliminate 0.67% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 5: predicate.row_tensor_eliminate 0.72% : 0.000002s : 10: predicate.same_eliminate 0.71% : 0.000002s : 10: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 10: predicate.shard_identity_eliminate 0.74% : 0.000002s : 10: predicate.special_op_eliminate 0.77% : 0.000002s : 10: predicate.specialize_transform 0.88% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.45% : 0.000003s : 23: predicate.switch_defer_inline 2.00% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.04% : 0.000012s : 74: predicate.switch_simplify 0.92% : 0.000002s : 15: predicate.tile_eliminate 0.98% : 0.000002s : 15: predicate.transpose_eliminate 1.55% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.11% : 0.000007s : 39: predicate.tuple_list_get_item_eliminator 1.46% : 0.000003s : 25: predicate.tuple_list_get_set_item_eliminator 2.39% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.72% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.46% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.23% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.74% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.68% : 0.000002s : 10: predicate.virtual_output_eliminate 0.34% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000530 11 52.82% : 0.000280s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.18% : 0.000250s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.058189 192 0.01% : 0.000003s : 1: ForceFp32Comm 33.37% : 0.019418s : 1: add_attr 33.35% : 0.019406s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.10% : 0.000060s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.12% : 0.000067s : 1: auto_monad 0.04% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.86% : 0.000500s : 1: bootstrap 0.05% : 0.000028s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000017s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.05% : 0.000028s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.05% : 0.000026s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.72% : 0.000421s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.85% : 0.000496s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000017s : 1: opt.transform.mutable_eliminate 2.33% : 0.001358s : 78: opt.transform.opt_a 0.06% : 0.000036s : 1: opt.transform.opt_after_cconv 0.05% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.23% : 0.000134s : 28: opt.transform.opt_b 0.10% : 0.000058s : 2: opt.transform.opt_trans_graph 0.07% : 0.000042s : 4: opt.transform.symbol_engine_opt 5.10% : 0.002965s : 1: opt_a 0.20% : 0.000115s : 1: opt_after_cconv 0.79% : 0.000457s : 1: opt_after_jit_grad 0.42% : 0.000245s : 1: opt_b 8.64% : 0.005026s : 1: optimize 0.04% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.04% : 0.000024s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.07% : 0.000040s : 1: pre_auto_parallel 0.05% : 0.000029s : 1: py_interpret_to_execute 0.02% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000018s : 1: remove_dup_value 0.68% : 0.000395s : 1: renormalize.infer 0.58% : 0.000339s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000042s : 1: rewriter_after_opt_a 0.16% : 0.000093s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.14% : 0.000083s : 1: symbol_engine_optimizer 0.15% : 0.000087s : 1: tuple_transform 9.91% : 0.005766s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:48.867.347 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:48.867.608 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0333089, [21] [bootstrap]: 0.00046443 [type_inference]: 0.00578178 [event_method]: 1.882e-05 [auto_monad]: 6.626e-05 [graph_reusing]: 6.26e-06 [inline]: 2.21998e-06 [add_attr]: 0.0192018, [1] [add_attr_with_inline]: 0.0191872, [1] [Cycle 1]: 0.00010651, [2] [tag_attr]: 2.716e-05 [meta_addattr_fg_expand]: 6.34001e-06 [parallel-infer-symbol]: 4.04997e-06 [pre_auto_parallel]: 4.675e-05 [insert-virtual-dataset]: 2.39999e-06 [parallel-infer-symbol-second]: 8.09989e-07 [dataset_repeat_opt]: 1.97999e-06 [pipeline_split]: 1.82001e-06 [optimize]: 0.00655652, [53] [py_interpret_to_execute]: 4.344e-05 [rewriter_before_opt_a]: 0.0001088 [opt_a]: 0.00388916, [2] [Cycle 1]: 0.00277992, [45] [expand_dump_flag]: 3.01999e-06 [switch_simplify]: 4.595e-05 [loop_unroll]: 3.256e-05 [a_1]: 0.00077276 [with_stream_mark]: 2.363e-05 [recompute_prepare]: 1.399e-05 [updatestate_depend_eliminate]: 5.54e-06 [updatestate_assign_eliminate]: 4.49002e-06 [updatestate_loads_eliminate]: 4.13999e-06 [parameter_eliminate]: 2.14e-06 [a_2]: 0.00014988 [accelerated_algorithm]: 9.72999e-06 [shard]: 2.37001e-06 [meta_shard_fg_expand]: 2.34001e-06 [shard_inline]: 9.39998e-06 [merge_send_recv]: 1.094e-05 [auto_parallel]: 9.31998e-06 [parallel]: 1.923e-05 [flash_sp]: 1.041e-05 [merge_comm]: 5.68002e-06 [allreduce_fusion]: 4.80999e-06 [matmul_add_comm_reduction]: 1.119e-05 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 1.261e-05 [virtual_dataset]: 1.024e-05 [get_grad_eliminate_]: 9.09e-06 [virtual_output]: 1.093e-05 [merge_forward]: 5.96998e-06 [cell_reuse_recompute_pass]: 1.59e-06 [offload_activation]: 1.222e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.045e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.578e-05 [set_forward_comm_id_for_comm_node_pass]: 5.07e-06 [meta_fg_expand]: 4.22003e-06 [flash_sp_send_recv_attached]: 2.88e-06 [receive_attached]: 2.11e-06 [after_resolve]: 1.531e-05 [a_after_grad]: 1.495e-05 [renormalize]: 0.00091686 [add_forward_monad_depend]: 5.20001e-06 [auto_monad_grad]: 3.26001e-06 [auto_monad_eliminator]: 2.005e-05 [cse]: 4.552e-05 [a_3]: 8.132e-05 [Cycle 2]: 0.00109331, [45] [expand_dump_flag]: 1.34998e-06 [switch_simplify]: 1.047e-05 [loop_unroll]: 8.89998e-06 [a_1]: 0.00021239 [with_stream_mark]: 1.346e-05 [recompute_prepare]: 8.74e-06 [updatestate_depend_eliminate]: 5.14003e-06 [updatestate_assign_eliminate]: 3.8e-06 [updatestate_loads_eliminate]: 3.60998e-06 [parameter_eliminate]: 1.20999e-06 [a_2]: 0.00015848 [accelerated_algorithm]: 9.57999e-06 [shard]: 1.69e-06 [meta_shard_fg_expand]: 2.03002e-06 [shard_inline]: 9.59999e-06 [merge_send_recv]: 9.02e-06 [auto_parallel]: 8.22e-06 [parallel]: 5.05999e-06 [flash_sp]: 3.65998e-06 [merge_comm]: 5.45001e-06 [allreduce_fusion]: 4.77e-06 [matmul_add_comm_reduction]: 1.009e-05 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 1.04e-05 [virtual_dataset]: 9.29e-06 [get_grad_eliminate_]: 8.65999e-06 [virtual_output]: 8.94998e-06 [merge_forward]: 4.71002e-06 [cell_reuse_recompute_pass]: 1.99e-06 [offload_activation]: 9.97999e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.164e-05 [merge_recompute_call_nodes]: 7.2e-07 [before_grad]: 1.561e-05 [set_forward_comm_id_for_comm_node_pass]: 5.30001e-06 [meta_fg_expand]: 3.11999e-06 [flash_sp_send_recv_attached]: 7.50006e-07 [receive_attached]: 1.42e-06 [after_resolve]: 1.28e-05 [a_after_grad]: 1.31e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.36002e-06 [auto_monad_grad]: 1.26002e-06 [auto_monad_eliminator]: 1.172e-05 [cse]: 2.441e-05 [a_3]: 6.825e-05 [py_interpret_to_execute_after_opt_a]: 1.543e-05 [slice_cell_reuse_recomputed_activation]: 4.48999e-06 [rewriter_after_opt_a]: 5.121e-05 [convert_after_rewriter]: 1.143e-05 [order_py_execute_after_rewriter]: 9.02e-06 [mutable_eliminate]: 0.00065937 [opt_b]: 0.00036081, [1] [Cycle 1]: 0.0003501, [7] [b_1]: 0.00023956 [b_2]: 1.069e-05 [updatestate_depend_eliminate]: 7.55e-06 [updatestate_assign_eliminate]: 3.95e-06 [updatestate_loads_eliminate]: 3.62002e-06 [renormalize]: 5.19998e-07 [cse]: 2.714e-05 [optimize_parallel_all_gather_comm]: 2.119e-05 [overlap_param_gather]: 4.92999e-06 [cconv]: 3.041e-05 [loop_unroll]: 0.00043176 [opt_after_cconv]: 0.00015002, [1] [Cycle 1]: 0.00014159, [7] [c_1]: 4.238e-05 [parameter_eliminate]: 2.57001e-06 [updatestate_depend_eliminate]: 6.66e-06 [updatestate_assign_eliminate]: 3.8e-06 [updatestate_loads_eliminate]: 3.98999e-06 [cse]: 2.536e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 4.523e-05 [tuple_transform]: 0.00011042, [1] [Cycle 1]: 0.00010335, [4] [d_1]: 6.114e-05 [none_parameter_eliminate]: 1.74e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 9.92999e-06 [partial_unused_args_eliminate]: 4.48999e-06 [add_recomputation]: 6.227e-05 [cse_after_recomputation]: 3.355e-05, [1] [Cycle 1]: 2.678e-05, [1] [cse]: 1.751e-05 [environ_conv]: 9.50001e-06 [swap_dp_allreduce_reducescatter]: 9.64999e-06 [bias_add_comm_swap]: 4.97999e-06 [label_micro_interleaved_index]: 6.58e-06 [label_fine_grained_interleaved_index]: 5.03002e-06 [merge_cast_opt]: 3.74002e-06 [slice_recompute_activation]: 4.74998e-06 [micro_interleaved_order_control]: 4.80001e-06 [assign_add_opt]: 3.61999e-06 [ForceFp32Comm]: 3.46001e-06 [remove_cast_before_assign_add]: 3.36999e-06 [full_micro_interleaved_order_control]: 4.38999e-06 [reorder_send_recv_between_fp_bp]: 5.04003e-06 [comm_op_add_attrs]: 3.40998e-06 [add_comm_op_reuse_tag]: 3.23e-06 [interleave_split_concat_branches]: 3.41999e-06 [interleave_parallel_branches]: 3.30998e-06 [overlap_opt_shard_in_pipeline]: 3.36999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.42003e-06 [control_data_broadcast_order]: 1.916e-05 [grouped_pairwise_exchange_alltoall]: 4.13001e-06 [offloading_packed_experts]: 7.16999e-06 [overlap_recompute_and_grad_model_parallel]: 7.98001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.55e-06 [overlap_recompute_allgather_and_fa_grad]: 3.98999e-06 [overlap_recompute_comm]: 4.73001e-06 [overlap_grad_ring_attention]: 7.71999e-06 [overlap_grad_flash_sp]: 2.815e-05 [begin_end_overlap_inline]: 3.01001e-06 [split_matmul_comm_elemetwise]: 4.35e-06 [split_layernorm_comm]: 4.34997e-06 [handle_group_info]: 3.2e-06 [symbol_engine_optimizer]: 0.00010912, [1] [Cycle 1]: 0.00010256, [6] [build]: 3.53999e-06 [elim_shapecalc]: 1.19e-05 [elim_not_effective]: 1.73e-05 [opt_reshape]: 9.64e-06 [fold_const_symbol]: 1.434e-05 [renormalize]: 2.00002e-07 [detach_backward]: 3.28e-06 [pipeline_parallel_scheduler]: 1.80001e-06 [auto_monad_reorder]: 2.246e-05 [get_jit_bprop_graph]: 1.77001e-06 [rewriter_after_jit_bprop_graph]: 4.28999e-06 [opt_after_jit_grad]: 0.00049139 [validate]: 4.638e-05 Sums bootstrap : 0.000464s : 3.77% type_inference : 0.005782s : 46.90% event_method : 0.000019s : 0.15% auto_monad : 0.000066s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.22% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000047s : 0.38% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000043s : 0.35% optimize.rewriter_before_opt_a : 0.000109s : 0.88% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000056s : 0.46% optimize.opt_a.loop_unroll : 0.000041s : 0.34% optimize.opt_a.a_1 : 0.000985s : 7.99% optimize.opt_a.with_stream_mark : 0.000037s : 0.30% optimize.opt_a.recompute_prepare : 0.000023s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000308s : 2.50% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.16% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000019s : 0.15% optimize.opt_a.merge_send_recv : 0.000020s : 0.16% optimize.opt_a.auto_parallel : 0.000018s : 0.14% optimize.opt_a.parallel : 0.000024s : 0.20% optimize.opt_a.flash_sp : 0.000014s : 0.11% optimize.opt_a.merge_comm : 0.000011s : 0.09% optimize.opt_a.allreduce_fusion : 0.000010s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.19% optimize.opt_a.virtual_dataset : 0.000020s : 0.16% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.14% optimize.opt_a.virtual_output : 0.000020s : 0.16% optimize.opt_a.merge_forward : 0.000011s : 0.09% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000022s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000042s : 0.34% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000031s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000028s : 0.23% optimize.opt_a.a_after_grad : 0.000028s : 0.23% optimize.opt_a.renormalize : 0.000917s : 7.44% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.05% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.26% optimize.opt_a.cse : 0.000070s : 0.57% optimize.opt_a.a_3 : 0.000150s : 1.21% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.04% optimize.rewriter_after_opt_a : 0.000051s : 0.42% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000659s : 5.35% optimize.opt_b.b_1 : 0.000240s : 1.94% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000030s : 0.25% optimize.loop_unroll : 0.000432s : 3.50% optimize.opt_after_cconv.c_1 : 0.000042s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000025s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000045s : 0.37% optimize.tuple_transform.d_1 : 0.000061s : 0.50% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000062s : 0.51% optimize.cse_after_recomputation.cse : 0.000018s : 0.14% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000019s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.06% optimize.overlap_grad_flash_sp : 0.000028s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.18% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000491s : 3.99% validate : 0.000046s : 0.38% Time group info: ------[substitution.] 0.000261 48 13.45% : 0.000035s : 6: substitution.cast_eliminate 0.94% : 0.000002s : 4: substitution.elim_not_effective 0.74% : 0.000002s : 4: substitution.fold_const_symbol 3.05% : 0.000008s : 6: substitution.graph_param_transform 69.09% : 0.000180s : 4: substitution.inline 2.28% : 0.000006s : 8: substitution.j_node_and_user_rematch 3.08% : 0.000008s : 8: substitution.remove_not_recompute_node 2.15% : 0.000006s : 4: substitution.replace_old_param 5.22% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005734 2 87.73% : 0.005031s : 1: type_inference.infer 12.27% : 0.000704s : 1: type_inference.specialize ------[replace.] 0.000068 8 61.48% : 0.000042s : 4: replace.inline 38.52% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000189 8 93.76% : 0.000177s : 4: match.inline 6.24% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000284 1730 0.99% : 0.000003s : 17: predicate.accumulaten_eliminater 0.66% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.76% : 0.000002s : 12: predicate.addn_check_dump 0.88% : 0.000002s : 17: predicate.addn_zero_filter 0.84% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.06% : 0.000006s : 29: predicate.arithmetic_simplify 1.13% : 0.000003s : 17: predicate.cast_eliminate 0.81% : 0.000002s : 12: predicate.check_bprop_eliminate 0.61% : 0.000002s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.75% : 0.000002s : 12: predicate.depend_value_elim 0.99% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.18% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.89% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.91% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 6: predicate.elim_not_effective 0.37% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.27% : 0.000004s : 23: predicate.environ_get_add_eliminate 1.06% : 0.000003s : 23: predicate.environ_get_depend_swap 1.87% : 0.000005s : 35: predicate.environ_get_eliminate 1.10% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.31% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.14% : 0.000006s : 25: predicate.float_depend_g_call 0.72% : 0.000002s : 12: predicate.float_environ_get_switch 0.96% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 6: predicate.fold_const_symbol 0.71% : 0.000002s : 12: predicate.get_grad_eliminate 0.21% : 0.000001s : 6: predicate.graph_param_transform 0.70% : 0.000002s : 12: predicate.incorporate_call 0.58% : 0.000002s : 12: predicate.incorporate_call_switch 6.28% : 0.000018s : 78: predicate.inline 0.93% : 0.000003s : 12: predicate.inline_without_move 0.34% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.80% : 0.000002s : 12: predicate.less_batch_normalization 1.76% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.53% : 0.000007s : 50: predicate.load_eliminater 0.65% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.02% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.65% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.67% : 0.000002s : 12: predicate.merge_addn 0.65% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.66% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 17: predicate.minmaximum_grad 0.69% : 0.000002s : 6: predicate.mutable_eliminate 0.38% : 0.000001s : 6: predicate.opt_reshape 0.50% : 0.000001s : 6: predicate.parallel_virtual_node 1.60% : 0.000005s : 25: predicate.partial_defer_inline 1.58% : 0.000004s : 27: predicate.partial_eliminate 0.87% : 0.000002s : 17: predicate.print_const_string_wrapper 0.64% : 0.000002s : 12: predicate.reduce_all_const_elim 1.13% : 0.000003s : 17: predicate.reduce_eliminate 2.41% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 12: predicate.remove_not_recompute_node 1.34% : 0.000004s : 33: predicate.replace_applicator 0.48% : 0.000001s : 12: predicate.replace_old_param 0.27% : 0.000001s : 6: predicate.reset_defer_inline 0.96% : 0.000003s : 17: predicate.reshape_eliminate 0.69% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 6: predicate.row_tensor_eliminate 0.86% : 0.000002s : 12: predicate.same_eliminate 0.40% : 0.000001s : 12: predicate.set_cell_output_no_recompute 1.03% : 0.000003s : 12: predicate.shard_identity_eliminate 0.81% : 0.000002s : 12: predicate.special_op_eliminate 0.80% : 0.000002s : 12: predicate.specialize_transform 0.99% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 0.95% : 0.000003s : 12: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.29% : 0.000004s : 25: predicate.switch_defer_inline 1.98% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.64% : 0.000013s : 81: predicate.switch_simplify 0.86% : 0.000002s : 17: predicate.tile_eliminate 0.89% : 0.000003s : 17: predicate.transpose_eliminate 1.65% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000004s : 29: predicate.tuple_list_get_item_const_eliminator 1.64% : 0.000005s : 29: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.66% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 2.27% : 0.000006s : 41: predicate.tuple_list_set_item_eliminator 1.75% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.36% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.05% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 6: predicate.value_based_eliminate 0.75% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.72% : 0.000002s : 12: predicate.virtual_output_eliminate 0.33% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000551 11 52.12% : 0.000287s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.88% : 0.000264s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.061858 192 0.01% : 0.000006s : 1: ForceFp32Comm 31.06% : 0.019215s : 1: add_attr 31.02% : 0.019191s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.11% : 0.000066s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.12% : 0.000075s : 1: auto_monad 0.05% : 0.000030s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.83% : 0.000513s : 1: bootstrap 0.05% : 0.000034s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.04% : 0.000022s : 1: control_data_broadcast_order 0.02% : 0.000014s : 1: convert_after_rewriter 0.06% : 0.000037s : 1: cse_after_recomputation 0.01% : 0.000007s : 1: dataset_repeat_opt 0.03% : 0.000017s : 1: detach_backward 0.02% : 0.000012s : 1: environ_conv 0.05% : 0.000029s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 0.71% : 0.000437s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 1.08% : 0.000666s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.03% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000018s : 1: opt.transform.mutable_eliminate 2.62% : 0.001618s : 78: opt.transform.opt_a 0.07% : 0.000041s : 1: opt.transform.opt_after_cconv 0.06% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.28% : 0.000173s : 28: opt.transform.opt_b 0.11% : 0.000069s : 2: opt.transform.opt_trans_graph 0.08% : 0.000050s : 4: opt.transform.symbol_engine_opt 6.29% : 0.003893s : 1: opt_a 0.25% : 0.000153s : 1: opt_after_cconv 0.81% : 0.000503s : 1: opt_after_jit_grad 0.59% : 0.000364s : 1: opt_b 11.14% : 0.006889s : 1: optimize 0.04% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.05% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000007s : 1: overlap_recompute_comm 0.02% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.09% : 0.000054s : 1: pre_auto_parallel 0.08% : 0.000047s : 1: py_interpret_to_execute 0.03% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000049s : 1: remove_dup_value 0.85% : 0.000523s : 1: renormalize.infer 0.62% : 0.000384s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000055s : 1: rewriter_after_opt_a 0.18% : 0.000113s : 1: rewriter_before_opt_a 0.01% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.18% : 0.000112s : 1: symbol_engine_optimizer 0.18% : 0.000113s : 1: tuple_transform 9.40% : 0.005818s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:49.293.867 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0320329, [21] [bootstrap]: 0.00043114 [type_inference]: 0.00577098 [event_method]: 1.856e-05 [auto_monad]: 6.876e-05 [graph_reusing]: 6.07001e-06 [inline]: 2.78998e-06 [add_attr]: 0.00300466, [1] [add_attr_with_inline]: 0.00299684, [1] [Cycle 1]: 5.823e-05, [2] [tag_attr]: 2.023e-05 [meta_addattr_fg_expand]: 5.88002e-06 [parallel-infer-symbol]: 3.04001e-06 [pre_auto_parallel]: 3.173e-05 [insert-virtual-dataset]: 0.0160223 [parallel-infer-symbol-second]: 4.71997e-06 [dataset_repeat_opt]: 3.33e-06 [pipeline_split]: 2.40002e-06 [optimize]: 0.00593719, [53] [py_interpret_to_execute]: 5.059e-05 [rewriter_before_opt_a]: 0.00011389 [opt_a]: 0.00348113, [2] [Cycle 1]: 0.00260951, [45] [expand_dump_flag]: 6.27001e-06 [switch_simplify]: 4.987e-05 [loop_unroll]: 3.337e-05 [a_1]: 0.00078083 [with_stream_mark]: 2.557e-05 [recompute_prepare]: 1.163e-05 [updatestate_depend_eliminate]: 6.06998e-06 [updatestate_assign_eliminate]: 4.08999e-06 [updatestate_loads_eliminate]: 4.13001e-06 [parameter_eliminate]: 2.56998e-06 [a_2]: 0.00012261 [accelerated_algorithm]: 1.015e-05 [shard]: 2.36e-06 [meta_shard_fg_expand]: 2.41998e-06 [shard_inline]: 9.29e-06 [merge_send_recv]: 1.194e-05 [auto_parallel]: 1.074e-05 [parallel]: 1.959e-05 [flash_sp]: 1.219e-05 [merge_comm]: 5.72999e-06 [allreduce_fusion]: 4.86002e-06 [matmul_add_comm_reduction]: 1.22e-05 [allreduce_slice_to_reducescatter]: 8.39995e-07 [virtual_shard_identity]: 1.148e-05 [virtual_dataset]: 1.008e-05 [get_grad_eliminate_]: 9.23002e-06 [virtual_output]: 9.52001e-06 [merge_forward]: 5.34e-06 [cell_reuse_recompute_pass]: 1.14e-06 [offload_activation]: 1.247e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.778e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.576e-05 [set_forward_comm_id_for_comm_node_pass]: 5.67001e-06 [meta_fg_expand]: 4.31002e-06 [flash_sp_send_recv_attached]: 2.91999e-06 [receive_attached]: 2.16e-06 [after_resolve]: 1.452e-05 [a_after_grad]: 1.41e-05 [renormalize]: 0.00091602 [add_forward_monad_depend]: 5.40999e-06 [auto_monad_grad]: 2.88998e-06 [auto_monad_eliminator]: 1.989e-05 [cse]: 4.454e-05 [a_3]: 6.769e-05 [Cycle 2]: 0.0008566, [45] [expand_dump_flag]: 1.02998e-06 [switch_simplify]: 1.043e-05 [loop_unroll]: 8.37998e-06 [a_1]: 0.00021256 [with_stream_mark]: 1.276e-05 [recompute_prepare]: 9.23002e-06 [updatestate_depend_eliminate]: 4.62e-06 [updatestate_assign_eliminate]: 3.48999e-06 [updatestate_loads_eliminate]: 3.63e-06 [parameter_eliminate]: 1.04998e-06 [a_2]: 0.00011338 [accelerated_algorithm]: 8.99e-06 [shard]: 1.40999e-06 [meta_shard_fg_expand]: 1.74e-06 [shard_inline]: 8.84998e-06 [merge_send_recv]: 6.82002e-06 [auto_parallel]: 6.93998e-06 [parallel]: 4.45e-06 [flash_sp]: 4.37e-06 [merge_comm]: 5.17e-06 [allreduce_fusion]: 4.64002e-06 [matmul_add_comm_reduction]: 8.18999e-06 [allreduce_slice_to_reducescatter]: 4.19997e-07 [virtual_shard_identity]: 9.29998e-06 [virtual_dataset]: 9.75002e-06 [get_grad_eliminate_]: 7.95e-06 [virtual_output]: 8.23001e-06 [merge_forward]: 4.32e-06 [cell_reuse_recompute_pass]: 1.38002e-06 [offload_activation]: 9.82001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.624e-05 [merge_recompute_call_nodes]: 8.09989e-07 [before_grad]: 1.372e-05 [set_forward_comm_id_for_comm_node_pass]: 4.95001e-06 [meta_fg_expand]: 3.17002e-06 [flash_sp_send_recv_attached]: 8.39995e-07 [receive_attached]: 1.30001e-06 [after_resolve]: 1.299e-05 [a_after_grad]: 1.295e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.41998e-06 [auto_monad_grad]: 1.21002e-06 [auto_monad_eliminator]: 9.80002e-06 [cse]: 2.211e-05 [a_3]: 5.337e-05 [py_interpret_to_execute_after_opt_a]: 1.15e-05 [slice_cell_reuse_recomputed_activation]: 2.11998e-06 [rewriter_after_opt_a]: 4.608e-05 [convert_after_rewriter]: 8.47998e-06 [order_py_execute_after_rewriter]: 6.56e-06 [mutable_eliminate]: 0.00069223 [opt_b]: 0.00027775, [1] [Cycle 1]: 0.00027166, [7] [b_1]: 0.00018316 [b_2]: 1.089e-05 [updatestate_depend_eliminate]: 6.62002e-06 [updatestate_assign_eliminate]: 3.6e-06 [updatestate_loads_eliminate]: 3.70998e-06 [renormalize]: 4.50003e-07 [cse]: 2.743e-05 [optimize_parallel_all_gather_comm]: 1.798e-05 [overlap_param_gather]: 2.07001e-06 [cconv]: 2.589e-05 [loop_unroll]: 0.00042317 [opt_after_cconv]: 0.00012568, [1] [Cycle 1]: 0.00011978, [7] [c_1]: 4.257e-05 [parameter_eliminate]: 2.74999e-06 [updatestate_depend_eliminate]: 6.31e-06 [updatestate_assign_eliminate]: 3.68999e-06 [updatestate_loads_eliminate]: 3.56001e-06 [cse]: 2.622e-05 [renormalize]: 3.59985e-07 [remove_dup_value]: 4.034e-05 [tuple_transform]: 9.517e-05, [1] [Cycle 1]: 9.074e-05, [4] [d_1]: 6.026e-05 [none_parameter_eliminate]: 1.59e-06 [renormalize]: 2.9002e-07 [switch_simplify]: 9.56e-06 [partial_unused_args_eliminate]: 1.78002e-06 [add_recomputation]: 6.153e-05 [cse_after_recomputation]: 2.714e-05, [1] [Cycle 1]: 2.22e-05, [1] [cse]: 1.66e-05 [environ_conv]: 7.38e-06 [swap_dp_allreduce_reducescatter]: 6.87002e-06 [bias_add_comm_swap]: 2.61e-06 [label_micro_interleaved_index]: 4.60001e-06 [label_fine_grained_interleaved_index]: 2.61999e-06 [merge_cast_opt]: 1.54e-06 [slice_recompute_activation]: 2.03997e-06 [micro_interleaved_order_control]: 2.29001e-06 [assign_add_opt]: 1.25001e-06 [ForceFp32Comm]: 1.14998e-06 [remove_cast_before_assign_add]: 9.70002e-07 [full_micro_interleaved_order_control]: 2.12001e-06 [reorder_send_recv_between_fp_bp]: 2.88e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 1.02998e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.29e-06 [overlap_opt_shard_in_pipeline]: 1.35999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.79998e-06 [control_data_broadcast_order]: 1.526e-05 [grouped_pairwise_exchange_alltoall]: 1.91e-06 [offloading_packed_experts]: 4.99e-06 [overlap_recompute_and_grad_model_parallel]: 6.36998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12e-06 [overlap_recompute_allgather_and_fa_grad]: 1.44e-06 [overlap_recompute_comm]: 2.61e-06 [overlap_grad_ring_attention]: 5.15999e-06 [overlap_grad_flash_sp]: 2.388e-05 [begin_end_overlap_inline]: 8.59989e-07 [split_matmul_comm_elemetwise]: 2.32001e-06 [split_layernorm_comm]: 1.83997e-06 [handle_group_info]: 1.25001e-06 [symbol_engine_optimizer]: 9.099e-05, [1] [Cycle 1]: 8.64e-05, [6] [build]: 3.72002e-06 [elim_shapecalc]: 1.275e-05 [elim_not_effective]: 1.758e-05 [opt_reshape]: 9.25999e-06 [fold_const_symbol]: 1.483e-05 [renormalize]: 2.19996e-07 [detach_backward]: 1.86e-06 [pipeline_parallel_scheduler]: 1.79998e-06 [auto_monad_reorder]: 2.127e-05 [get_jit_bprop_graph]: 1.72001e-06 [rewriter_after_jit_bprop_graph]: 3.7e-06 [opt_after_jit_grad]: 0.00046001 [validate]: 4.289e-05 Sums bootstrap : 0.000431s : 1.54% type_inference : 0.005771s : 20.61% event_method : 0.000019s : 0.07% auto_monad : 0.000069s : 0.25% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000032s : 0.11% insert-virtual-dataset : 0.016022s : 57.21% parallel-infer-symbol-second : 0.000005s : 0.02% dataset_repeat_opt : 0.000003s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000051s : 0.18% optimize.rewriter_before_opt_a : 0.000114s : 0.41% optimize.opt_a.expand_dump_flag : 0.000007s : 0.03% optimize.opt_a.switch_simplify : 0.000060s : 0.22% optimize.opt_a.loop_unroll : 0.000042s : 0.15% optimize.opt_a.a_1 : 0.000993s : 3.55% optimize.opt_a.with_stream_mark : 0.000038s : 0.14% optimize.opt_a.recompute_prepare : 0.000021s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000236s : 0.84% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.07% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000018s : 0.06% optimize.opt_a.merge_send_recv : 0.000019s : 0.07% optimize.opt_a.auto_parallel : 0.000018s : 0.06% optimize.opt_a.parallel : 0.000024s : 0.09% optimize.opt_a.flash_sp : 0.000017s : 0.06% optimize.opt_a.merge_comm : 0.000011s : 0.04% optimize.opt_a.allreduce_fusion : 0.000010s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.07% optimize.opt_a.virtual_dataset : 0.000020s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.06% optimize.opt_a.virtual_output : 0.000018s : 0.06% optimize.opt_a.merge_forward : 0.000010s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000022s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000029s : 0.11% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.04% optimize.opt_a.meta_fg_expand : 0.000007s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000028s : 0.10% optimize.opt_a.a_after_grad : 0.000027s : 0.10% optimize.opt_a.renormalize : 0.000916s : 3.27% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.11% optimize.opt_a.cse : 0.000067s : 0.24% optimize.opt_a.a_3 : 0.000121s : 0.43% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000046s : 0.16% optimize.convert_after_rewriter : 0.000008s : 0.03% optimize.order_py_execute_after_rewriter : 0.000007s : 0.02% optimize.mutable_eliminate : 0.000692s : 2.47% optimize.opt_b.b_1 : 0.000183s : 0.65% optimize.opt_b.b_2 : 0.000011s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000027s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000026s : 0.09% optimize.loop_unroll : 0.000423s : 1.51% optimize.opt_after_cconv.c_1 : 0.000043s : 0.15% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000026s : 0.09% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000040s : 0.14% optimize.tuple_transform.d_1 : 0.000060s : 0.22% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000062s : 0.22% optimize.cse_after_recomputation.cse : 0.000017s : 0.06% optimize.environ_conv : 0.000007s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000015s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000024s : 0.09% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.08% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000460s : 1.64% validate : 0.000043s : 0.15% Time group info: ------[substitution.] 0.000267 48 13.96% : 0.000037s : 6: substitution.cast_eliminate 0.91% : 0.000002s : 4: substitution.elim_not_effective 0.75% : 0.000002s : 4: substitution.fold_const_symbol 2.80% : 0.000007s : 6: substitution.graph_param_transform 69.82% : 0.000186s : 4: substitution.inline 2.01% : 0.000005s : 8: substitution.j_node_and_user_rematch 2.77% : 0.000007s : 8: substitution.remove_not_recompute_node 1.77% : 0.000005s : 4: substitution.replace_old_param 5.21% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005712 2 87.76% : 0.005013s : 1: type_inference.infer 12.24% : 0.000699s : 1: type_inference.specialize ------[replace.] 0.000067 8 64.81% : 0.000043s : 4: replace.inline 35.19% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000195 8 93.80% : 0.000183s : 4: match.inline 6.20% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000280 1730 1.06% : 0.000003s : 17: predicate.accumulaten_eliminater 0.66% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.64% : 0.000002s : 12: predicate.addn_check_dump 0.98% : 0.000003s : 17: predicate.addn_zero_filter 0.82% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.91% : 0.000005s : 29: predicate.arithmetic_simplify 1.13% : 0.000003s : 17: predicate.cast_eliminate 0.67% : 0.000002s : 12: predicate.check_bprop_eliminate 0.74% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.70% : 0.000002s : 12: predicate.depend_value_elim 0.94% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.03% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.89% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 6: predicate.elim_not_effective 0.45% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.38% : 0.000004s : 23: predicate.environ_add_const_eliminate 1.18% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 23: predicate.environ_get_depend_swap 1.87% : 0.000005s : 35: predicate.environ_get_eliminate 1.18% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.29% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.16% : 0.000006s : 25: predicate.float_depend_g_call 0.63% : 0.000002s : 12: predicate.float_environ_get_switch 0.87% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 6: predicate.fold_const_symbol 0.77% : 0.000002s : 12: predicate.get_grad_eliminate 0.27% : 0.000001s : 6: predicate.graph_param_transform 0.67% : 0.000002s : 12: predicate.incorporate_call 0.57% : 0.000002s : 12: predicate.incorporate_call_switch 6.14% : 0.000017s : 78: predicate.inline 0.85% : 0.000002s : 12: predicate.inline_without_move 0.35% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.95% : 0.000003s : 12: predicate.less_batch_normalization 1.75% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.48% : 0.000007s : 50: predicate.load_eliminater 0.72% : 0.000002s : 6: predicate.loop_unroll_after_grad 1.99% : 0.000006s : 38: predicate.loop_unroll_before_grad 1.67% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.69% : 0.000002s : 12: predicate.merge_addn 0.66% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.75% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 17: predicate.minmaximum_grad 0.95% : 0.000003s : 6: predicate.mutable_eliminate 0.39% : 0.000001s : 6: predicate.opt_reshape 0.37% : 0.000001s : 6: predicate.parallel_virtual_node 1.52% : 0.000004s : 25: predicate.partial_defer_inline 1.57% : 0.000004s : 27: predicate.partial_eliminate 0.92% : 0.000003s : 17: predicate.print_const_string_wrapper 0.67% : 0.000002s : 12: predicate.reduce_all_const_elim 1.27% : 0.000004s : 17: predicate.reduce_eliminate 2.46% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 12: predicate.remove_not_recompute_node 1.28% : 0.000004s : 33: predicate.replace_applicator 0.47% : 0.000001s : 12: predicate.replace_old_param 0.26% : 0.000001s : 6: predicate.reset_defer_inline 0.95% : 0.000003s : 17: predicate.reshape_eliminate 0.69% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 6: predicate.row_tensor_eliminate 0.77% : 0.000002s : 12: predicate.same_eliminate 0.45% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 12: predicate.shard_identity_eliminate 0.78% : 0.000002s : 12: predicate.special_op_eliminate 0.78% : 0.000002s : 12: predicate.specialize_transform 0.83% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.37% : 0.000004s : 25: predicate.switch_defer_inline 2.04% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.88% : 0.000014s : 81: predicate.switch_simplify 0.90% : 0.000003s : 17: predicate.tile_eliminate 0.93% : 0.000003s : 17: predicate.transpose_eliminate 1.54% : 0.000004s : 29: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000004s : 29: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.62% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 2.45% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.67% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.41% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.17% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 6: predicate.value_based_eliminate 0.69% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.82% : 0.000002s : 12: predicate.virtual_output_eliminate 0.36% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.57% : 0.000002s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000545 11 53.40% : 0.000291s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.60% : 0.000254s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.043747 192 0.01% : 0.000004s : 1: ForceFp32Comm 6.88% : 0.003009s : 1: add_attr 6.86% : 0.003000s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.15% : 0.000065s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.17% : 0.000074s : 1: auto_monad 0.06% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.06% : 0.000463s : 1: bootstrap 0.07% : 0.000029s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000018s : 1: control_data_broadcast_order 0.03% : 0.000012s : 1: convert_after_rewriter 0.07% : 0.000030s : 1: cse_after_recomputation 0.01% : 0.000007s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.06% : 0.000026s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000009s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 36.70% : 0.016054s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000005s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 0.98% : 0.000431s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.60% : 0.000701s : 1: mutable_eliminate 0.02% : 0.000008s : 1: offloading_packed_experts 0.04% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000019s : 1: opt.transform.mutable_eliminate 3.67% : 0.001608s : 78: opt.transform.opt_a 0.09% : 0.000041s : 1: opt.transform.opt_after_cconv 0.07% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.37% : 0.000162s : 28: opt.transform.opt_b 0.15% : 0.000068s : 2: opt.transform.opt_trans_graph 0.12% : 0.000051s : 4: opt.transform.symbol_engine_opt 7.96% : 0.003484s : 1: opt_a 0.29% : 0.000129s : 1: opt_after_cconv 1.07% : 0.000469s : 1: opt_after_jit_grad 0.64% : 0.000281s : 1: opt_b 13.58% : 0.005942s : 1: optimize 0.05% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000010s : 1: order_py_execute_after_rewriter 0.06% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000011s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000036s : 1: pre_auto_parallel 0.13% : 0.000056s : 1: py_interpret_to_execute 0.03% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.10% : 0.000045s : 1: remove_dup_value 1.20% : 0.000525s : 1: renormalize.infer 0.87% : 0.000382s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000050s : 1: rewriter_after_opt_a 0.27% : 0.000118s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.21% : 0.000094s : 1: symbol_engine_optimizer 0.22% : 0.000098s : 1: tuple_transform 13.23% : 0.005788s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:49.699.036 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:49.699.313 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0412492, [21] [bootstrap]: 0.00046506 [type_inference]: 0.00582013 [event_method]: 1.979e-05 [auto_monad]: 6.556e-05 [graph_reusing]: 6.46e-06 [inline]: 2.37001e-06 [add_attr]: 0.00314514, [1] [add_attr_with_inline]: 0.00313495, [1] [Cycle 1]: 7.697e-05, [2] [tag_attr]: 1.994e-05 [meta_addattr_fg_expand]: 6.21e-06 [parallel-infer-symbol]: 3.21999e-06 [pre_auto_parallel]: 3.67e-05 [insert-virtual-dataset]: 2.84999e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 2.15002e-06 [pipeline_split]: 1.54998e-06 [optimize]: 0.0304849, [53] [py_interpret_to_execute]: 3.039e-05 [rewriter_before_opt_a]: 8.911e-05 [opt_a]: 0.00352303, [2] [Cycle 1]: 0.00252548, [45] [expand_dump_flag]: 3.04001e-06 [switch_simplify]: 4.328e-05 [loop_unroll]: 3.123e-05 [a_1]: 0.00076055 [with_stream_mark]: 2.285e-05 [recompute_prepare]: 1.186e-05 [updatestate_depend_eliminate]: 5.15999e-06 [updatestate_assign_eliminate]: 4.27998e-06 [updatestate_loads_eliminate]: 3.64002e-06 [parameter_eliminate]: 2.00002e-06 [a_2]: 0.00012945 [accelerated_algorithm]: 8.85001e-06 [shard]: 2.26e-06 [meta_shard_fg_expand]: 2.23002e-06 [shard_inline]: 8.07e-06 [merge_send_recv]: 9.66e-06 [auto_parallel]: 9.14e-06 [parallel]: 1.881e-05 [flash_sp]: 1.03e-05 [merge_comm]: 4.52003e-06 [allreduce_fusion]: 4.22998e-06 [matmul_add_comm_reduction]: 1.17e-05 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 1.123e-05 [virtual_dataset]: 8.23999e-06 [get_grad_eliminate_]: 7.63001e-06 [virtual_output]: 8.50001e-06 [merge_forward]: 4.50001e-06 [cell_reuse_recompute_pass]: 1.47999e-06 [offload_activation]: 1.176e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.956e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.334e-05 [set_forward_comm_id_for_comm_node_pass]: 5.42001e-06 [meta_fg_expand]: 3.28998e-06 [flash_sp_send_recv_attached]: 3.21999e-06 [receive_attached]: 2.42001e-06 [after_resolve]: 1.492e-05 [a_after_grad]: 1.235e-05 [renormalize]: 0.0007449 [add_forward_monad_depend]: 6.69999e-06 [auto_monad_grad]: 2.81e-06 [auto_monad_eliminator]: 1.859e-05 [cse]: 3.513e-05 [a_3]: 7.465e-05 [Cycle 2]: 0.00098203, [45] [expand_dump_flag]: 1.87001e-06 [switch_simplify]: 9.17001e-06 [loop_unroll]: 7.21001e-06 [a_1]: 0.00017838 [with_stream_mark]: 1.576e-05 [recompute_prepare]: 8.64e-06 [updatestate_depend_eliminate]: 4.15e-06 [updatestate_assign_eliminate]: 3.53999e-06 [updatestate_loads_eliminate]: 3.03e-06 [parameter_eliminate]: 2.41e-06 [a_2]: 0.00012098 [accelerated_algorithm]: 9.41e-06 [shard]: 1.35999e-06 [meta_shard_fg_expand]: 2.14e-06 [shard_inline]: 7.85e-06 [merge_send_recv]: 8.60001e-06 [auto_parallel]: 7.76001e-06 [parallel]: 6.87002e-06 [flash_sp]: 3.76001e-06 [merge_comm]: 4.17003e-06 [allreduce_fusion]: 4.20999e-06 [matmul_add_comm_reduction]: 1.013e-05 [allreduce_slice_to_reducescatter]: 3.10014e-07 [virtual_shard_identity]: 1.028e-05 [virtual_dataset]: 7.42998e-06 [get_grad_eliminate_]: 6.89999e-06 [virtual_output]: 7.52002e-06 [merge_forward]: 4.36002e-06 [cell_reuse_recompute_pass]: 2.60002e-06 [offload_activation]: 9.58997e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.931e-05 [merge_recompute_call_nodes]: 1.03001e-06 [before_grad]: 1.241e-05 [set_forward_comm_id_for_comm_node_pass]: 4.94e-06 [meta_fg_expand]: 2.96001e-06 [flash_sp_send_recv_attached]: 1.32e-06 [receive_attached]: 1.70001e-06 [after_resolve]: 1.267e-05 [a_after_grad]: 1.103e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.75002e-06 [auto_monad_grad]: 1.52001e-06 [auto_monad_eliminator]: 1.139e-05 [cse]: 2.141e-05 [a_3]: 5.865e-05 [py_interpret_to_execute_after_opt_a]: 1.707e-05 [slice_cell_reuse_recomputed_activation]: 4.74e-06 [rewriter_after_opt_a]: 5.577e-05 [convert_after_rewriter]: 1.16e-05 [order_py_execute_after_rewriter]: 9.12001e-06 [mutable_eliminate]: 0.0246952 [opt_b]: 0.00036873, [1] [Cycle 1]: 0.00035455, [7] [b_1]: 0.00021623 [b_2]: 1.004e-05 [updatestate_depend_eliminate]: 1.408e-05 [updatestate_assign_eliminate]: 4.08001e-06 [updatestate_loads_eliminate]: 3.85e-06 [renormalize]: 1.15999e-06 [cse]: 4.733e-05 [optimize_parallel_all_gather_comm]: 3.53e-05 [overlap_param_gather]: 5.07e-06 [cconv]: 4.347e-05 [loop_unroll]: 0.00065132 [opt_after_cconv]: 0.00014779, [1] [Cycle 1]: 0.00013857, [7] [c_1]: 3.912e-05 [parameter_eliminate]: 6.03002e-06 [updatestate_depend_eliminate]: 7.1e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 2.89999e-06 [cse]: 2.285e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 2.322e-05 [tuple_transform]: 0.00011925, [1] [Cycle 1]: 0.00011031, [4] [d_1]: 6.483e-05 [none_parameter_eliminate]: 1.82001e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 9.82001e-06 [partial_unused_args_eliminate]: 5.27999e-06 [add_recomputation]: 7.193e-05 [cse_after_recomputation]: 3.213e-05, [1] [Cycle 1]: 2.483e-05, [1] [cse]: 1.546e-05 [environ_conv]: 9.86e-06 [swap_dp_allreduce_reducescatter]: 8.36002e-06 [bias_add_comm_swap]: 5.52999e-06 [label_micro_interleaved_index]: 7.1e-06 [label_fine_grained_interleaved_index]: 5.77999e-06 [merge_cast_opt]: 3.68e-06 [slice_recompute_activation]: 4.57998e-06 [micro_interleaved_order_control]: 5.01002e-06 [assign_add_opt]: 3.83999e-06 [ForceFp32Comm]: 3.36001e-06 [remove_cast_before_assign_add]: 4.07998e-06 [full_micro_interleaved_order_control]: 4.89e-06 [reorder_send_recv_between_fp_bp]: 5.05999e-06 [comm_op_add_attrs]: 3.36999e-06 [add_comm_op_reuse_tag]: 3.16999e-06 [interleave_split_concat_branches]: 3.48e-06 [interleave_parallel_branches]: 3.88999e-06 [overlap_opt_shard_in_pipeline]: 4.20999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.41002e-06 [control_data_broadcast_order]: 1.881e-05 [grouped_pairwise_exchange_alltoall]: 4.70001e-06 [offloading_packed_experts]: 8.19002e-06 [overlap_recompute_and_grad_model_parallel]: 7.88001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.78001e-06 [overlap_recompute_allgather_and_fa_grad]: 4.12003e-06 [overlap_recompute_comm]: 4.82e-06 [overlap_grad_ring_attention]: 7.94002e-06 [overlap_grad_flash_sp]: 3.133e-05 [begin_end_overlap_inline]: 3.03e-06 [split_matmul_comm_elemetwise]: 5.30999e-06 [split_layernorm_comm]: 4.17e-06 [handle_group_info]: 3.61999e-06 [symbol_engine_optimizer]: 0.00011081, [1] [Cycle 1]: 0.00010397, [6] [build]: 4.70999e-06 [elim_shapecalc]: 1.274e-05 [elim_not_effective]: 1.613e-05 [opt_reshape]: 8.70999e-06 [fold_const_symbol]: 1.266e-05 [renormalize]: 2.59985e-07 [detach_backward]: 3.41999e-06 [pipeline_parallel_scheduler]: 2.27999e-06 [auto_monad_reorder]: 2.224e-05 [get_jit_bprop_graph]: 2.20002e-06 [rewriter_after_jit_bprop_graph]: 5.86e-06 [opt_after_jit_grad]: 0.00049465 [validate]: 4.658e-05 Sums bootstrap : 0.000465s : 1.28% type_inference : 0.005820s : 16.05% event_method : 0.000020s : 0.05% auto_monad : 0.000066s : 0.18% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000037s : 0.10% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000030s : 0.08% optimize.rewriter_before_opt_a : 0.000089s : 0.25% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000052s : 0.14% optimize.opt_a.loop_unroll : 0.000038s : 0.11% optimize.opt_a.a_1 : 0.000939s : 2.59% optimize.opt_a.with_stream_mark : 0.000039s : 0.11% optimize.opt_a.recompute_prepare : 0.000021s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000250s : 0.69% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.05% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.04% optimize.opt_a.merge_send_recv : 0.000018s : 0.05% optimize.opt_a.auto_parallel : 0.000017s : 0.05% optimize.opt_a.parallel : 0.000026s : 0.07% optimize.opt_a.flash_sp : 0.000014s : 0.04% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000008s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.06% optimize.opt_a.virtual_dataset : 0.000016s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000016s : 0.04% optimize.opt_a.merge_forward : 0.000009s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.03% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000028s : 0.08% optimize.opt_a.a_after_grad : 0.000023s : 0.06% optimize.opt_a.renormalize : 0.000745s : 2.05% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.08% optimize.opt_a.cse : 0.000057s : 0.16% optimize.opt_a.a_3 : 0.000133s : 0.37% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000056s : 0.15% optimize.convert_after_rewriter : 0.000012s : 0.03% optimize.order_py_execute_after_rewriter : 0.000009s : 0.03% optimize.mutable_eliminate : 0.024695s : 68.11% optimize.opt_b.b_1 : 0.000216s : 0.60% optimize.opt_b.b_2 : 0.000010s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000014s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000047s : 0.13% optimize.optimize_parallel_all_gather_comm : 0.000035s : 0.10% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000043s : 0.12% optimize.loop_unroll : 0.000651s : 1.80% optimize.opt_after_cconv.c_1 : 0.000039s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000023s : 0.06% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000023s : 0.06% optimize.tuple_transform.d_1 : 0.000065s : 0.18% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000072s : 0.20% optimize.cse_after_recomputation.cse : 0.000015s : 0.04% optimize.environ_conv : 0.000010s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000003s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000019s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000008s : 0.02% optimize.overlap_grad_flash_sp : 0.000031s : 0.09% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.06% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000495s : 1.36% validate : 0.000047s : 0.13% Time group info: ------[substitution.] 0.000214 38 11.59% : 0.000025s : 3: substitution.cast_eliminate 1.17% : 0.000003s : 3: substitution.elim_not_effective 0.75% : 0.000002s : 3: substitution.fold_const_symbol 3.43% : 0.000007s : 5: substitution.graph_param_transform 68.00% : 0.000145s : 4: substitution.inline 2.26% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.11% : 0.000007s : 6: substitution.remove_not_recompute_node 2.63% : 0.000006s : 4: substitution.replace_old_param 7.06% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005768 2 87.73% : 0.005060s : 1: type_inference.infer 12.27% : 0.000708s : 1: type_inference.specialize ------[replace.] 0.000062 8 58.82% : 0.000037s : 4: replace.inline 41.18% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000156 8 91.59% : 0.000143s : 4: match.inline 8.41% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000282 1596 0.92% : 0.000003s : 17: predicate.accumulaten_eliminater 0.71% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 10: predicate.addn_check_dump 0.85% : 0.000002s : 17: predicate.addn_zero_filter 0.75% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.82% : 0.000005s : 27: predicate.arithmetic_simplify 0.96% : 0.000003s : 17: predicate.cast_eliminate 0.53% : 0.000001s : 10: predicate.check_bprop_eliminate 0.51% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.56% : 0.000002s : 10: predicate.depend_value_elim 0.96% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.07% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.80% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 5: predicate.elim_not_effective 0.43% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.40% : 0.000004s : 22: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.09% : 0.000003s : 22: predicate.environ_get_depend_swap 1.68% : 0.000005s : 32: predicate.environ_get_eliminate 1.17% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.26% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.04% : 0.000006s : 25: predicate.float_depend_g_call 0.47% : 0.000001s : 10: predicate.float_environ_get_switch 0.79% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 5: predicate.fold_const_symbol 0.59% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.22% : 0.000018s : 72: predicate.inline 0.68% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.77% : 0.000002s : 10: predicate.less_batch_normalization 1.75% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.48% : 0.000007s : 48: predicate.load_eliminater 0.68% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.78% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.91% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.60% : 0.000002s : 10: predicate.merge_addn 0.50% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.54% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 17: predicate.minmaximum_grad 2.17% : 0.000006s : 5: predicate.mutable_eliminate 0.32% : 0.000001s : 5: predicate.opt_reshape 0.40% : 0.000001s : 5: predicate.parallel_virtual_node 5.64% : 0.000016s : 25: predicate.partial_defer_inline 1.53% : 0.000004s : 26: predicate.partial_eliminate 0.95% : 0.000003s : 17: predicate.print_const_string_wrapper 0.53% : 0.000002s : 10: predicate.reduce_all_const_elim 1.11% : 0.000003s : 17: predicate.reduce_eliminate 2.44% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 10: predicate.remove_not_recompute_node 1.28% : 0.000004s : 31: predicate.replace_applicator 0.57% : 0.000002s : 10: predicate.replace_old_param 0.37% : 0.000001s : 5: predicate.reset_defer_inline 0.97% : 0.000003s : 17: predicate.reshape_eliminate 0.68% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 5: predicate.row_tensor_eliminate 0.65% : 0.000002s : 10: predicate.same_eliminate 0.38% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.89% : 0.000003s : 10: predicate.shard_identity_eliminate 0.76% : 0.000002s : 10: predicate.special_op_eliminate 0.64% : 0.000002s : 10: predicate.specialize_transform 0.73% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.60% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.49% : 0.000004s : 25: predicate.switch_defer_inline 1.92% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.45% : 0.000013s : 76: predicate.switch_simplify 0.83% : 0.000002s : 17: predicate.tile_eliminate 0.87% : 0.000002s : 17: predicate.transpose_eliminate 1.55% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.64% : 0.000005s : 27: predicate.tuple_list_get_item_depend_reorder 3.43% : 0.000010s : 41: predicate.tuple_list_get_item_eliminator 1.51% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.12% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.66% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.30% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.10% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.50% : 0.000001s : 5: predicate.value_based_eliminate 0.64% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.56% : 0.000002s : 10: predicate.virtual_output_eliminate 0.24% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000537 11 54.14% : 0.000291s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.86% : 0.000246s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.077338 192 0.01% : 0.000006s : 1: ForceFp32Comm 4.08% : 0.003154s : 1: add_attr 4.06% : 0.003138s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.10% : 0.000076s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.10% : 0.000076s : 1: auto_monad 0.04% : 0.000030s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.66% : 0.000508s : 1: bootstrap 0.06% : 0.000047s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.03% : 0.000022s : 1: control_data_broadcast_order 0.02% : 0.000015s : 1: convert_after_rewriter 0.05% : 0.000036s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000019s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.04% : 0.000030s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000014s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.85% : 0.000657s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 31.95% : 0.024709s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000042s : 1: opt.transform.mutable_eliminate 1.90% : 0.001473s : 78: opt.transform.opt_a 0.05% : 0.000037s : 1: opt.transform.opt_after_cconv 0.04% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.19% : 0.000144s : 28: opt.transform.opt_b 0.09% : 0.000072s : 2: opt.transform.opt_trans_graph 0.06% : 0.000047s : 4: opt.transform.symbol_engine_opt 4.56% : 0.003526s : 1: opt_a 0.20% : 0.000152s : 1: opt_after_cconv 0.65% : 0.000505s : 1: opt_after_jit_grad 0.48% : 0.000374s : 1: opt_b 39.90% : 0.030858s : 1: optimize 0.05% : 0.000039s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.04% : 0.000034s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.06% : 0.000044s : 1: pre_auto_parallel 0.04% : 0.000034s : 1: py_interpret_to_execute 0.03% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.03% : 0.000027s : 1: remove_dup_value 0.53% : 0.000413s : 1: renormalize.infer 0.42% : 0.000323s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000060s : 1: rewriter_after_opt_a 0.12% : 0.000093s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000114s : 1: symbol_engine_optimizer 0.16% : 0.000122s : 1: tuple_transform 7.57% : 0.005857s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:50.117.917 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0456371, [21] [bootstrap]: 0.00045364 [type_inference]: 0.0345792 [event_method]: 2.141e-05 [auto_monad]: 6.736e-05 [graph_reusing]: 6.11e-06 [inline]: 2.99001e-06 [add_attr]: 0.00386602, [1] [add_attr_with_inline]: 0.00385294, [1] [Cycle 1]: 7.773e-05, [2] [tag_attr]: 2.274e-05 [meta_addattr_fg_expand]: 6.26998e-06 [parallel-infer-symbol]: 3.65e-06 [pre_auto_parallel]: 4.094e-05 [insert-virtual-dataset]: 2.49001e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 2.35002e-06 [pipeline_split]: 1.89999e-06 [optimize]: 0.00581358, [53] [py_interpret_to_execute]: 2.913e-05 [rewriter_before_opt_a]: 9.6e-05 [opt_a]: 0.0033543, [2] [Cycle 1]: 0.00252216, [45] [expand_dump_flag]: 3.39001e-06 [switch_simplify]: 4.621e-05 [loop_unroll]: 3.196e-05 [a_1]: 0.00079755 [with_stream_mark]: 2.166e-05 [recompute_prepare]: 1.441e-05 [updatestate_depend_eliminate]: 4.68999e-06 [updatestate_assign_eliminate]: 3.65e-06 [updatestate_loads_eliminate]: 3.48999e-06 [parameter_eliminate]: 2.20002e-06 [a_2]: 0.00010703 [accelerated_algorithm]: 8.63001e-06 [shard]: 2.19001e-06 [meta_shard_fg_expand]: 2.30002e-06 [shard_inline]: 8.32998e-06 [merge_send_recv]: 1.017e-05 [auto_parallel]: 7.87e-06 [parallel]: 2.066e-05 [flash_sp]: 9.69e-06 [merge_comm]: 5.87001e-06 [allreduce_fusion]: 4.68001e-06 [matmul_add_comm_reduction]: 1.11e-05 [allreduce_slice_to_reducescatter]: 5.90022e-07 [virtual_shard_identity]: 1.091e-05 [virtual_dataset]: 8.47e-06 [get_grad_eliminate_]: 8.09002e-06 [virtual_output]: 7.76001e-06 [merge_forward]: 4.51002e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 1.109e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.726e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.428e-05 [set_forward_comm_id_for_comm_node_pass]: 5.27999e-06 [meta_fg_expand]: 3.41999e-06 [flash_sp_send_recv_attached]: 2.40002e-06 [receive_attached]: 2.04999e-06 [after_resolve]: 1.347e-05 [a_after_grad]: 1.339e-05 [renormalize]: 0.00087016 [add_forward_monad_depend]: 6.66e-06 [auto_monad_grad]: 2.51e-06 [auto_monad_eliminator]: 1.927e-05 [cse]: 3.671e-05 [a_3]: 6.083e-05 [Cycle 2]: 0.00082029, [45] [expand_dump_flag]: 1.97999e-06 [switch_simplify]: 9.95002e-06 [loop_unroll]: 8.35001e-06 [a_1]: 0.00018442 [with_stream_mark]: 1.548e-05 [recompute_prepare]: 8.55001e-06 [updatestate_depend_eliminate]: 3.70998e-06 [updatestate_assign_eliminate]: 3.12002e-06 [updatestate_loads_eliminate]: 3.11999e-06 [parameter_eliminate]: 1.39998e-06 [a_2]: 9.171e-05 [accelerated_algorithm]: 7.90998e-06 [shard]: 1.76e-06 [meta_shard_fg_expand]: 1.84998e-06 [shard_inline]: 7.76001e-06 [merge_send_recv]: 7.8e-06 [auto_parallel]: 7.62002e-06 [parallel]: 6.26e-06 [flash_sp]: 6.79999e-06 [merge_comm]: 4.80001e-06 [allreduce_fusion]: 4.25999e-06 [matmul_add_comm_reduction]: 8.17e-06 [allreduce_slice_to_reducescatter]: 3.29979e-07 [virtual_shard_identity]: 1.002e-05 [virtual_dataset]: 7.36999e-06 [get_grad_eliminate_]: 6.74999e-06 [virtual_output]: 6.97002e-06 [merge_forward]: 4.41002e-06 [cell_reuse_recompute_pass]: 1.62001e-06 [offload_activation]: 9.89001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.833e-05 [merge_recompute_call_nodes]: 1.40001e-06 [before_grad]: 1.337e-05 [set_forward_comm_id_for_comm_node_pass]: 4.46002e-06 [meta_fg_expand]: 2.91e-06 [flash_sp_send_recv_attached]: 9.20001e-07 [receive_attached]: 1.54998e-06 [after_resolve]: 1.317e-05 [a_after_grad]: 1.193e-05 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 2.78e-06 [auto_monad_grad]: 1.49998e-06 [auto_monad_eliminator]: 1.069e-05 [cse]: 2.249e-05 [a_3]: 4.56e-05 [py_interpret_to_execute_after_opt_a]: 1.53e-05 [slice_cell_reuse_recomputed_activation]: 2.36e-06 [rewriter_after_opt_a]: 4.498e-05 [convert_after_rewriter]: 8.48001e-06 [order_py_execute_after_rewriter]: 7.01999e-06 [mutable_eliminate]: 0.00060408 [opt_b]: 0.00025261, [1] [Cycle 1]: 0.00024572, [7] [b_1]: 0.00015491 [b_2]: 9.40001e-06 [updatestate_depend_eliminate]: 7.82e-06 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 3.11999e-06 [renormalize]: 7.2e-07 [cse]: 2.921e-05 [optimize_parallel_all_gather_comm]: 2.062e-05 [overlap_param_gather]: 1.89e-06 [cconv]: 2.988e-05 [loop_unroll]: 0.00055859 [opt_after_cconv]: 0.00013086, [1] [Cycle 1]: 0.0001243, [7] [c_1]: 4.237e-05 [parameter_eliminate]: 3.85e-06 [updatestate_depend_eliminate]: 8.13999e-06 [updatestate_assign_eliminate]: 3.40998e-06 [updatestate_loads_eliminate]: 3.21001e-06 [cse]: 2.543e-05 [renormalize]: 5.69999e-07 [remove_dup_value]: 1.662e-05 [tuple_transform]: 9.858e-05, [1] [Cycle 1]: 9.364e-05, [4] [d_1]: 6.114e-05 [none_parameter_eliminate]: 2.12999e-06 [renormalize]: 3.00002e-07 [switch_simplify]: 9.01002e-06 [partial_unused_args_eliminate]: 1.98002e-06 [add_recomputation]: 6.26e-05 [cse_after_recomputation]: 2.513e-05, [1] [Cycle 1]: 2.068e-05, [1] [cse]: 1.506e-05 [environ_conv]: 7.20998e-06 [swap_dp_allreduce_reducescatter]: 6.44999e-06 [bias_add_comm_swap]: 3.06999e-06 [label_micro_interleaved_index]: 4.69002e-06 [label_fine_grained_interleaved_index]: 3.5e-06 [merge_cast_opt]: 1.31998e-06 [slice_recompute_activation]: 2.14e-06 [micro_interleaved_order_control]: 2.31e-06 [assign_add_opt]: 1.49e-06 [ForceFp32Comm]: 8.80013e-07 [remove_cast_before_assign_add]: 1.16002e-06 [full_micro_interleaved_order_control]: 2.61e-06 [reorder_send_recv_between_fp_bp]: 2.56e-06 [comm_op_add_attrs]: 1.12999e-06 [add_comm_op_reuse_tag]: 9.79984e-07 [interleave_split_concat_branches]: 1.44e-06 [interleave_parallel_branches]: 1.18001e-06 [overlap_opt_shard_in_pipeline]: 1.20999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.85002e-06 [control_data_broadcast_order]: 1.661e-05 [grouped_pairwise_exchange_alltoall]: 1.81e-06 [offloading_packed_experts]: 4.50001e-06 [overlap_recompute_and_grad_model_parallel]: 6.11e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.47999e-06 [overlap_recompute_comm]: 2.46e-06 [overlap_grad_ring_attention]: 4.95001e-06 [overlap_grad_flash_sp]: 2.647e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.25002e-06 [split_layernorm_comm]: 1.73002e-06 [handle_group_info]: 1.14998e-06 [symbol_engine_optimizer]: 9.22e-05, [1] [Cycle 1]: 8.737e-05, [6] [build]: 4.15e-06 [elim_shapecalc]: 1.301e-05 [elim_not_effective]: 1.673e-05 [opt_reshape]: 9.46e-06 [fold_const_symbol]: 1.328e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.41e-06 [pipeline_parallel_scheduler]: 1.82001e-06 [auto_monad_reorder]: 2.067e-05 [get_jit_bprop_graph]: 2.09999e-06 [rewriter_after_jit_bprop_graph]: 5.04e-06 [opt_after_jit_grad]: 0.00052204 [validate]: 4.784e-05 Sums bootstrap : 0.000454s : 1.11% type_inference : 0.034579s : 84.92% event_method : 0.000021s : 0.05% auto_monad : 0.000067s : 0.17% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000041s : 0.10% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000029s : 0.07% optimize.rewriter_before_opt_a : 0.000096s : 0.24% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000056s : 0.14% optimize.opt_a.loop_unroll : 0.000040s : 0.10% optimize.opt_a.a_1 : 0.000982s : 2.41% optimize.opt_a.with_stream_mark : 0.000037s : 0.09% optimize.opt_a.recompute_prepare : 0.000023s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000199s : 0.49% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.04% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.04% optimize.opt_a.merge_send_recv : 0.000018s : 0.04% optimize.opt_a.auto_parallel : 0.000015s : 0.04% optimize.opt_a.parallel : 0.000027s : 0.07% optimize.opt_a.flash_sp : 0.000016s : 0.04% optimize.opt_a.merge_comm : 0.000011s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.05% optimize.opt_a.virtual_dataset : 0.000016s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000015s : 0.04% optimize.opt_a.merge_forward : 0.000009s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000028s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000027s : 0.07% optimize.opt_a.a_after_grad : 0.000025s : 0.06% optimize.opt_a.renormalize : 0.000870s : 2.14% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.07% optimize.opt_a.cse : 0.000059s : 0.15% optimize.opt_a.a_3 : 0.000106s : 0.26% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000045s : 0.11% optimize.convert_after_rewriter : 0.000008s : 0.02% optimize.order_py_execute_after_rewriter : 0.000007s : 0.02% optimize.mutable_eliminate : 0.000604s : 1.48% optimize.opt_b.b_1 : 0.000155s : 0.38% optimize.opt_b.b_2 : 0.000009s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000029s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.05% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000030s : 0.07% optimize.loop_unroll : 0.000559s : 1.37% optimize.opt_after_cconv.c_1 : 0.000042s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000025s : 0.06% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.04% optimize.tuple_transform.d_1 : 0.000061s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000063s : 0.15% optimize.cse_after_recomputation.cse : 0.000015s : 0.04% optimize.environ_conv : 0.000007s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000026s : 0.07% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.05% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000522s : 1.28% validate : 0.000048s : 0.12% Time group info: ------[substitution.] 0.000240 38 11.32% : 0.000027s : 3: substitution.cast_eliminate 0.97% : 0.000002s : 3: substitution.elim_not_effective 0.74% : 0.000002s : 3: substitution.fold_const_symbol 3.11% : 0.000007s : 5: substitution.graph_param_transform 69.43% : 0.000167s : 4: substitution.inline 2.47% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.65% : 0.000006s : 6: substitution.remove_not_recompute_node 2.20% : 0.000005s : 4: substitution.replace_old_param 7.11% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.034503 2 97.63% : 0.033684s : 1: type_inference.infer 2.37% : 0.000819s : 1: type_inference.specialize ------[replace.] 0.000068 8 58.45% : 0.000040s : 4: replace.inline 41.55% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000179 8 91.55% : 0.000164s : 4: match.inline 8.45% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000263 1596 0.93% : 0.000002s : 17: predicate.accumulaten_eliminater 0.93% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 10: predicate.addn_check_dump 0.94% : 0.000002s : 17: predicate.addn_zero_filter 0.87% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.96% : 0.000005s : 27: predicate.arithmetic_simplify 1.22% : 0.000003s : 17: predicate.cast_eliminate 0.60% : 0.000002s : 10: predicate.check_bprop_eliminate 0.56% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000001s : 5: predicate.const_output_eliminate 0.56% : 0.000001s : 10: predicate.depend_value_elim 0.95% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.01% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.92% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.92% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.46% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.10% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.10% : 0.000003s : 22: predicate.environ_get_depend_swap 1.80% : 0.000005s : 32: predicate.environ_get_eliminate 1.11% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.38% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.29% : 0.000006s : 25: predicate.float_depend_g_call 0.50% : 0.000001s : 10: predicate.float_environ_get_switch 0.80% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.58% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.59% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.34% : 0.000017s : 72: predicate.inline 0.95% : 0.000003s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.73% : 0.000002s : 10: predicate.less_batch_normalization 1.83% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.56% : 0.000007s : 48: predicate.load_eliminater 0.89% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.03% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.72% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.65% : 0.000002s : 10: predicate.merge_addn 0.52% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 17: predicate.minmaximum_grad 0.95% : 0.000003s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.34% : 0.000001s : 5: predicate.parallel_virtual_node 1.72% : 0.000005s : 25: predicate.partial_defer_inline 1.71% : 0.000004s : 26: predicate.partial_eliminate 0.93% : 0.000002s : 17: predicate.print_const_string_wrapper 0.58% : 0.000002s : 10: predicate.reduce_all_const_elim 1.32% : 0.000003s : 17: predicate.reduce_eliminate 2.73% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 10: predicate.remove_not_recompute_node 1.40% : 0.000004s : 31: predicate.replace_applicator 0.48% : 0.000001s : 10: predicate.replace_old_param 0.31% : 0.000001s : 5: predicate.reset_defer_inline 1.01% : 0.000003s : 17: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 5: predicate.row_tensor_eliminate 0.73% : 0.000002s : 10: predicate.same_eliminate 0.61% : 0.000002s : 10: predicate.set_cell_output_no_recompute 0.87% : 0.000002s : 10: predicate.shard_identity_eliminate 0.77% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 0.73% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.48% : 0.000004s : 25: predicate.switch_defer_inline 2.02% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.70% : 0.000012s : 76: predicate.switch_simplify 0.91% : 0.000002s : 17: predicate.tile_eliminate 0.91% : 0.000002s : 17: predicate.transpose_eliminate 1.71% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.13% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.57% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.89% : 0.000008s : 37: predicate.tuple_list_set_item_eliminator 1.75% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.52% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.20% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 5: predicate.value_based_eliminate 0.61% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.61% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000622 11 52.58% : 0.000327s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.42% : 0.000295s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.057916 192 0.01% : 0.000004s : 1: ForceFp32Comm 6.69% : 0.003873s : 1: add_attr 6.66% : 0.003857s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.12% : 0.000067s : 1: add_recomputation 0.01% : 0.000005s : 1: assign_add_opt 0.13% : 0.000075s : 1: auto_monad 0.04% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.83% : 0.000483s : 1: bootstrap 0.06% : 0.000033s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000020s : 1: control_data_broadcast_order 0.02% : 0.000012s : 1: convert_after_rewriter 0.05% : 0.000028s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.05% : 0.000028s : 1: event_method 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000005s : 1: interleave_split_concat_branches 0.01% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.98% : 0.000569s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.06% : 0.000614s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.03% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000019s : 1: opt.transform.mutable_eliminate 2.63% : 0.001520s : 78: opt.transform.opt_a 0.07% : 0.000041s : 1: opt.transform.opt_after_cconv 0.06% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.23% : 0.000133s : 28: opt.transform.opt_b 0.12% : 0.000068s : 2: opt.transform.opt_trans_graph 0.08% : 0.000048s : 4: opt.transform.symbol_engine_opt 5.80% : 0.003357s : 1: opt_a 0.23% : 0.000135s : 1: opt_after_cconv 0.92% : 0.000533s : 1: opt_after_jit_grad 0.44% : 0.000256s : 1: opt_b 10.05% : 0.005819s : 1: optimize 0.04% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000010s : 1: order_py_execute_after_rewriter 0.05% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000045s : 1: pre_auto_parallel 0.06% : 0.000033s : 1: py_interpret_to_execute 0.03% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000020s : 1: remove_dup_value 0.85% : 0.000490s : 1: renormalize.infer 0.64% : 0.000371s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000050s : 1: rewriter_after_opt_a 0.17% : 0.000101s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.16% : 0.000095s : 1: symbol_engine_optimizer 0.18% : 0.000102s : 1: tuple_transform 59.75% : 0.034605s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:50.539.011 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:50.539.279 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0325133, [21] [bootstrap]: 0.00043837 [type_inference]: 0.00583421 [event_method]: 1.829e-05 [auto_monad]: 6.203e-05 [graph_reusing]: 5.96998e-06 [inline]: 2.11e-06 [add_attr]: 0.00302802, [1] [add_attr_with_inline]: 0.00301978, [1] [Cycle 1]: 7.1e-05, [2] [tag_attr]: 1.935e-05 [meta_addattr_fg_expand]: 6.66999e-06 [parallel-infer-symbol]: 2.91e-06 [pre_auto_parallel]: 3.259e-05 [insert-virtual-dataset]: 2.39001e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 2.27001e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.00570761, [53] [py_interpret_to_execute]: 3.05e-05 [rewriter_before_opt_a]: 8.763e-05 [opt_a]: 0.0033706, [2] [Cycle 1]: 0.00241701, [45] [expand_dump_flag]: 2.91e-06 [switch_simplify]: 4.362e-05 [loop_unroll]: 3.141e-05 [a_1]: 0.00073874 [with_stream_mark]: 1.64e-05 [recompute_prepare]: 1.24e-05 [updatestate_depend_eliminate]: 4.72e-06 [updatestate_assign_eliminate]: 4.35e-06 [updatestate_loads_eliminate]: 4.02e-06 [parameter_eliminate]: 2.60002e-06 [a_2]: 0.00012878 [accelerated_algorithm]: 8.92999e-06 [shard]: 2.46e-06 [meta_shard_fg_expand]: 1.94999e-06 [shard_inline]: 7.47998e-06 [merge_send_recv]: 9.61e-06 [auto_parallel]: 7.36999e-06 [parallel]: 1.824e-05 [flash_sp]: 9.92001e-06 [merge_comm]: 4.80999e-06 [allreduce_fusion]: 4.1e-06 [matmul_add_comm_reduction]: 1.039e-05 [allreduce_slice_to_reducescatter]: 9.40025e-07 [virtual_shard_identity]: 9.55001e-06 [virtual_dataset]: 8.07e-06 [get_grad_eliminate_]: 7.46999e-06 [virtual_output]: 8.3e-06 [merge_forward]: 4.47e-06 [cell_reuse_recompute_pass]: 1.52001e-06 [offload_activation]: 1.069e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.767e-05 [merge_recompute_call_nodes]: 1.95001e-06 [before_grad]: 1.352e-05 [set_forward_comm_id_for_comm_node_pass]: 5.05001e-06 [meta_fg_expand]: 3.61999e-06 [flash_sp_send_recv_attached]: 2.42001e-06 [receive_attached]: 2.24999e-06 [after_resolve]: 1.308e-05 [a_after_grad]: 1.248e-05 [renormalize]: 0.00068642 [add_forward_monad_depend]: 6.29999e-06 [auto_monad_grad]: 2.43e-06 [auto_monad_eliminator]: 1.748e-05 [cse]: 3.515e-05 [a_3]: 7.284e-05 [Cycle 2]: 0.00093984, [45] [expand_dump_flag]: 1.28002e-06 [switch_simplify]: 8.82e-06 [loop_unroll]: 7.33e-06 [a_1]: 0.00017618 [with_stream_mark]: 1.285e-05 [recompute_prepare]: 7.61999e-06 [updatestate_depend_eliminate]: 3.9e-06 [updatestate_assign_eliminate]: 3.09001e-06 [updatestate_loads_eliminate]: 3.01001e-06 [parameter_eliminate]: 1.41002e-06 [a_2]: 0.00011878 [accelerated_algorithm]: 7.79002e-06 [shard]: 1.49e-06 [meta_shard_fg_expand]: 1.66998e-06 [shard_inline]: 7.29001e-06 [merge_send_recv]: 5.89999e-06 [auto_parallel]: 6.61e-06 [parallel]: 7.86001e-06 [flash_sp]: 4.03001e-06 [merge_comm]: 4.28999e-06 [allreduce_fusion]: 3.86999e-06 [matmul_add_comm_reduction]: 7.16001e-06 [allreduce_slice_to_reducescatter]: 5.00004e-07 [virtual_shard_identity]: 8.48999e-06 [virtual_dataset]: 7.71999e-06 [get_grad_eliminate_]: 6.94001e-06 [virtual_output]: 7.19001e-06 [merge_forward]: 3.43999e-06 [cell_reuse_recompute_pass]: 1.42999e-06 [offload_activation]: 8.13001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.828e-05 [merge_recompute_call_nodes]: 1.04998e-06 [before_grad]: 1.21e-05 [set_forward_comm_id_for_comm_node_pass]: 4.89998e-06 [meta_fg_expand]: 3.03e-06 [flash_sp_send_recv_attached]: 1.24e-06 [receive_attached]: 1.19e-06 [after_resolve]: 1.267e-05 [a_after_grad]: 1.145e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.52001e-06 [auto_monad_grad]: 1.09e-06 [auto_monad_eliminator]: 9.66e-06 [cse]: 1.882e-05 [a_3]: 5.915e-05 [py_interpret_to_execute_after_opt_a]: 1.42e-05 [slice_cell_reuse_recomputed_activation]: 4.53999e-06 [rewriter_after_opt_a]: 4.276e-05 [convert_after_rewriter]: 1.049e-05 [order_py_execute_after_rewriter]: 8.63001e-06 [mutable_eliminate]: 0.00050931 [opt_b]: 0.00030245, [1] [Cycle 1]: 0.00029345, [7] [b_1]: 0.00019496 [b_2]: 9.10001e-06 [updatestate_depend_eliminate]: 5.93002e-06 [updatestate_assign_eliminate]: 3.12002e-06 [updatestate_loads_eliminate]: 3.03e-06 [renormalize]: 2.50002e-07 [cse]: 2.253e-05 [optimize_parallel_all_gather_comm]: 1.978e-05 [overlap_param_gather]: 4.93001e-06 [cconv]: 2.773e-05 [loop_unroll]: 0.00042469 [opt_after_cconv]: 0.00013465, [1] [Cycle 1]: 0.00012615, [7] [c_1]: 3.713e-05 [parameter_eliminate]: 2.54999e-06 [updatestate_depend_eliminate]: 5.53997e-06 [updatestate_assign_eliminate]: 3.01999e-06 [updatestate_loads_eliminate]: 2.88e-06 [cse]: 2.077e-05 [renormalize]: 2.69996e-07 [remove_dup_value]: 1.821e-05 [tuple_transform]: 0.00010866, [1] [Cycle 1]: 0.00010061, [4] [d_1]: 5.558e-05 [none_parameter_eliminate]: 1.96e-06 [renormalize]: 1.39989e-07 [switch_simplify]: 9.07001e-06 [partial_unused_args_eliminate]: 4.64002e-06 [add_recomputation]: 5.702e-05 [cse_after_recomputation]: 3.095e-05, [1] [Cycle 1]: 2.422e-05, [1] [cse]: 1.514e-05 [environ_conv]: 9.99999e-06 [swap_dp_allreduce_reducescatter]: 8.79e-06 [bias_add_comm_swap]: 4.72e-06 [label_micro_interleaved_index]: 6.66e-06 [label_fine_grained_interleaved_index]: 5.51e-06 [merge_cast_opt]: 3.68e-06 [slice_recompute_activation]: 4.60999e-06 [micro_interleaved_order_control]: 5.12e-06 [assign_add_opt]: 3.55e-06 [ForceFp32Comm]: 3.39001e-06 [remove_cast_before_assign_add]: 3.31001e-06 [full_micro_interleaved_order_control]: 4.41002e-06 [reorder_send_recv_between_fp_bp]: 5.77999e-06 [comm_op_add_attrs]: 3.35998e-06 [add_comm_op_reuse_tag]: 3.61001e-06 [interleave_split_concat_branches]: 3.4e-06 [interleave_parallel_branches]: 3.68e-06 [overlap_opt_shard_in_pipeline]: 3.79002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.68999e-06 [control_data_broadcast_order]: 1.743e-05 [grouped_pairwise_exchange_alltoall]: 4.15e-06 [offloading_packed_experts]: 7.01999e-06 [overlap_recompute_and_grad_model_parallel]: 7.56001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.46001e-06 [overlap_recompute_allgather_and_fa_grad]: 4.09997e-06 [overlap_recompute_comm]: 5.07999e-06 [overlap_grad_ring_attention]: 7.35e-06 [overlap_grad_flash_sp]: 2.427e-05 [begin_end_overlap_inline]: 2.94001e-06 [split_matmul_comm_elemetwise]: 4.50001e-06 [split_layernorm_comm]: 4.94e-06 [handle_group_info]: 3.33998e-06 [symbol_engine_optimizer]: 0.00010155, [1] [Cycle 1]: 9.49e-05, [6] [build]: 3.26999e-06 [elim_shapecalc]: 1.094e-05 [elim_not_effective]: 1.525e-05 [opt_reshape]: 8.13999e-06 [fold_const_symbol]: 1.218e-05 [renormalize]: 2.29978e-07 [detach_backward]: 3.63e-06 [pipeline_parallel_scheduler]: 1.64998e-06 [auto_monad_reorder]: 0.0159454 [get_jit_bprop_graph]: 2.62001e-06 [rewriter_after_jit_bprop_graph]: 8.51002e-06 [opt_after_jit_grad]: 0.00072847 [validate]: 5.184e-05 Sums bootstrap : 0.000438s : 1.58% type_inference : 0.005834s : 21.07% event_method : 0.000018s : 0.07% auto_monad : 0.000062s : 0.22% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000033s : 0.12% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.11% optimize.rewriter_before_opt_a : 0.000088s : 0.32% optimize.opt_a.expand_dump_flag : 0.000004s : 0.02% optimize.opt_a.switch_simplify : 0.000052s : 0.19% optimize.opt_a.loop_unroll : 0.000039s : 0.14% optimize.opt_a.a_1 : 0.000915s : 3.30% optimize.opt_a.with_stream_mark : 0.000029s : 0.11% optimize.opt_a.recompute_prepare : 0.000020s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000248s : 0.89% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.06% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000015s : 0.05% optimize.opt_a.merge_send_recv : 0.000016s : 0.06% optimize.opt_a.auto_parallel : 0.000014s : 0.05% optimize.opt_a.parallel : 0.000026s : 0.09% optimize.opt_a.flash_sp : 0.000014s : 0.05% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.07% optimize.opt_a.virtual_dataset : 0.000016s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.05% optimize.opt_a.virtual_output : 0.000015s : 0.06% optimize.opt_a.merge_forward : 0.000008s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.13% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.04% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000026s : 0.09% optimize.opt_a.a_after_grad : 0.000024s : 0.09% optimize.opt_a.renormalize : 0.000686s : 2.48% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.10% optimize.opt_a.cse : 0.000054s : 0.19% optimize.opt_a.a_3 : 0.000132s : 0.48% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000043s : 0.15% optimize.convert_after_rewriter : 0.000010s : 0.04% optimize.order_py_execute_after_rewriter : 0.000009s : 0.03% optimize.mutable_eliminate : 0.000509s : 1.84% optimize.opt_b.b_1 : 0.000195s : 0.70% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.07% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000028s : 0.10% optimize.loop_unroll : 0.000425s : 1.53% optimize.opt_after_cconv.c_1 : 0.000037s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000021s : 0.08% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.07% optimize.tuple_transform.d_1 : 0.000056s : 0.20% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000057s : 0.21% optimize.cse_after_recomputation.cse : 0.000015s : 0.05% optimize.environ_conv : 0.000010s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000005s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000004s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.02% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000003s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.02% optimize.control_data_broadcast_order : 0.000017s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.03% optimize.overlap_grad_flash_sp : 0.000024s : 0.09% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000005s : 0.02% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.015945s : 57.59% get_jit_bprop_graph : 0.000003s : 0.01% rewriter_after_jit_bprop_graph : 0.000009s : 0.03% opt_after_jit_grad : 0.000728s : 2.63% validate : 0.000052s : 0.19% Time group info: ------[substitution.] 0.000206 38 10.20% : 0.000021s : 3: substitution.cast_eliminate 1.08% : 0.000002s : 3: substitution.elim_not_effective 0.83% : 0.000002s : 3: substitution.fold_const_symbol 3.45% : 0.000007s : 5: substitution.graph_param_transform 68.11% : 0.000140s : 4: substitution.inline 2.35% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.98% : 0.000008s : 6: substitution.remove_not_recompute_node 2.25% : 0.000005s : 4: substitution.replace_old_param 7.75% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005788 2 87.26% : 0.005050s : 1: type_inference.infer 12.74% : 0.000738s : 1: type_inference.specialize ------[replace.] 0.000064 8 59.30% : 0.000038s : 4: replace.inline 40.70% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000152 8 90.80% : 0.000138s : 4: match.inline 9.20% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000254 1596 0.91% : 0.000002s : 17: predicate.accumulaten_eliminater 0.95% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 10: predicate.addn_check_dump 1.03% : 0.000003s : 17: predicate.addn_zero_filter 0.89% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.98% : 0.000005s : 27: predicate.arithmetic_simplify 1.11% : 0.000003s : 17: predicate.cast_eliminate 0.62% : 0.000002s : 10: predicate.check_bprop_eliminate 0.59% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.59% : 0.000001s : 10: predicate.depend_value_elim 0.98% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.06% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.00% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.35% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.15% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.22% : 0.000003s : 22: predicate.environ_get_depend_swap 1.85% : 0.000005s : 32: predicate.environ_get_eliminate 1.20% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.39% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.25% : 0.000006s : 25: predicate.float_depend_g_call 0.52% : 0.000001s : 10: predicate.float_environ_get_switch 0.80% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.61% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.58% : 0.000001s : 10: predicate.incorporate_call 0.51% : 0.000001s : 10: predicate.incorporate_call_switch 6.18% : 0.000016s : 72: predicate.inline 0.90% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.76% : 0.000002s : 10: predicate.less_batch_normalization 1.90% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.66% : 0.000007s : 48: predicate.load_eliminater 0.78% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.12% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.68% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.58% : 0.000001s : 10: predicate.merge_addn 0.61% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.88% : 0.000002s : 17: predicate.minmaximum_grad 0.82% : 0.000002s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.47% : 0.000001s : 5: predicate.parallel_virtual_node 1.70% : 0.000004s : 25: predicate.partial_defer_inline 1.72% : 0.000004s : 26: predicate.partial_eliminate 0.96% : 0.000002s : 17: predicate.print_const_string_wrapper 0.61% : 0.000002s : 10: predicate.reduce_all_const_elim 1.12% : 0.000003s : 17: predicate.reduce_eliminate 2.57% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 10: predicate.remove_not_recompute_node 1.35% : 0.000003s : 31: predicate.replace_applicator 0.49% : 0.000001s : 10: predicate.replace_old_param 0.25% : 0.000001s : 5: predicate.reset_defer_inline 1.07% : 0.000003s : 17: predicate.reshape_eliminate 0.63% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 5: predicate.row_tensor_eliminate 0.76% : 0.000002s : 10: predicate.same_eliminate 0.43% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 10: predicate.shard_identity_eliminate 0.90% : 0.000002s : 10: predicate.special_op_eliminate 0.76% : 0.000002s : 10: predicate.specialize_transform 0.87% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.53% : 0.000004s : 25: predicate.switch_defer_inline 2.08% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.62% : 0.000012s : 76: predicate.switch_simplify 0.94% : 0.000002s : 17: predicate.tile_eliminate 0.94% : 0.000002s : 17: predicate.transpose_eliminate 1.64% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.71% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.63% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.17% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.79% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.48% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.16% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 5: predicate.value_based_eliminate 0.72% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.62% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.66% : 0.000002s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000552 11 51.20% : 0.000283s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.80% : 0.000269s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.043568 192 0.01% : 0.000006s : 1: ForceFp32Comm 6.97% : 0.003036s : 1: add_attr 6.94% : 0.003023s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.14% : 0.000060s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.16% : 0.000071s : 1: auto_monad 36.68% : 0.015980s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000007s : 1: bias_add_comm_swap 1.10% : 0.000481s : 1: bootstrap 0.07% : 0.000031s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.05% : 0.000020s : 1: control_data_broadcast_order 0.03% : 0.000014s : 1: convert_after_rewriter 0.08% : 0.000034s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000018s : 1: detach_backward 0.03% : 0.000013s : 1: environ_conv 0.06% : 0.000028s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000013s : 1: get_jit_bprop_graph 0.03% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000009s : 1: label_micro_interleaved_index 0.99% : 0.000430s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 1.18% : 0.000515s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.04% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000016s : 1: opt.transform.mutable_eliminate 3.30% : 0.001438s : 78: opt.transform.opt_a 0.08% : 0.000036s : 1: opt.transform.opt_after_cconv 0.09% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.30% : 0.000133s : 28: opt.transform.opt_b 0.14% : 0.000062s : 2: opt.transform.opt_trans_graph 0.10% : 0.000043s : 4: opt.transform.symbol_engine_opt 7.74% : 0.003374s : 1: opt_a 0.32% : 0.000138s : 1: opt_after_cconv 1.70% : 0.000739s : 1: opt_after_jit_grad 0.70% : 0.000306s : 1: opt_b 13.93% : 0.006068s : 1: optimize 0.05% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000012s : 1: order_py_execute_after_rewriter 0.06% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000009s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.09% : 0.000040s : 1: pre_auto_parallel 0.08% : 0.000034s : 1: py_interpret_to_execute 0.04% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.05% : 0.000022s : 1: remove_dup_value 0.88% : 0.000384s : 1: renormalize.infer 0.67% : 0.000294s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000015s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000046s : 1: rewriter_after_opt_a 0.21% : 0.000092s : 1: rewriter_before_opt_a 0.02% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000008s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.03% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.24% : 0.000104s : 1: symbol_engine_optimizer 0.26% : 0.000112s : 1: tuple_transform 13.46% : 0.005866s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:51.138.14 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0152152, [21] [bootstrap]: 0.00042104 [type_inference]: 0.00588578 [event_method]: 1.897e-05 [auto_monad]: 6.708e-05 [graph_reusing]: 6.39999e-06 [inline]: 2.09e-06 [add_attr]: 0.00311357, [1] [add_attr_with_inline]: 0.00310386, [1] [Cycle 1]: 5.728e-05, [2] [tag_attr]: 1.918e-05 [meta_addattr_fg_expand]: 6.53998e-06 [parallel-infer-symbol]: 3.14001e-06 [pre_auto_parallel]: 3.396e-05 [insert-virtual-dataset]: 2.86e-06 [parallel-infer-symbol-second]: 8.60018e-07 [dataset_repeat_opt]: 2.31998e-06 [pipeline_split]: 1.63002e-06 [optimize]: 0.00499096, [53] [py_interpret_to_execute]: 2.807e-05 [rewriter_before_opt_a]: 8.376e-05 [opt_a]: 0.00294066, [2] [Cycle 1]: 0.00218988, [45] [expand_dump_flag]: 2.74999e-06 [switch_simplify]: 4.391e-05 [loop_unroll]: 3.084e-05 [a_1]: 0.00073178 [with_stream_mark]: 1.684e-05 [recompute_prepare]: 1.019e-05 [updatestate_depend_eliminate]: 4.45999e-06 [updatestate_assign_eliminate]: 4.02e-06 [updatestate_loads_eliminate]: 3.73001e-06 [parameter_eliminate]: 1.86e-06 [a_2]: 0.00010017 [accelerated_algorithm]: 8.28001e-06 [shard]: 1.86e-06 [meta_shard_fg_expand]: 1.99e-06 [shard_inline]: 9.02e-06 [merge_send_recv]: 9.26998e-06 [auto_parallel]: 7e-06 [parallel]: 1.915e-05 [flash_sp]: 8.45999e-06 [merge_comm]: 4.65999e-06 [allreduce_fusion]: 4.18999e-06 [matmul_add_comm_reduction]: 1.083e-05 [allreduce_slice_to_reducescatter]: 7.09988e-07 [virtual_shard_identity]: 9.05001e-06 [virtual_dataset]: 7.93001e-06 [get_grad_eliminate_]: 7.34002e-06 [virtual_output]: 7.66999e-06 [merge_forward]: 4.48999e-06 [cell_reuse_recompute_pass]: 1.35001e-06 [offload_activation]: 1.168e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.533e-05 [merge_recompute_call_nodes]: 1.43002e-06 [before_grad]: 1.268e-05 [set_forward_comm_id_for_comm_node_pass]: 4.37e-06 [meta_fg_expand]: 3.16999e-06 [flash_sp_send_recv_attached]: 2.35002e-06 [receive_attached]: 2.34001e-06 [after_resolve]: 1.238e-05 [a_after_grad]: 1.194e-05 [renormalize]: 0.00068703 [add_forward_monad_depend]: 5.29e-06 [auto_monad_grad]: 2.65997e-06 [auto_monad_eliminator]: 1.645e-05 [cse]: 3.206e-05 [a_3]: 5.588e-05 [Cycle 2]: 0.00074084, [45] [expand_dump_flag]: 1.07e-06 [switch_simplify]: 8.49002e-06 [loop_unroll]: 7.63999e-06 [a_1]: 0.00017245 [with_stream_mark]: 1.214e-05 [recompute_prepare]: 7.51999e-06 [updatestate_depend_eliminate]: 3.75998e-06 [updatestate_assign_eliminate]: 2.82002e-06 [updatestate_loads_eliminate]: 2.86999e-06 [parameter_eliminate]: 1.01002e-06 [a_2]: 9.156e-05 [accelerated_algorithm]: 7.69002e-06 [shard]: 1.24998e-06 [meta_shard_fg_expand]: 1.61002e-06 [shard_inline]: 7.31999e-06 [merge_send_recv]: 6.19001e-06 [auto_parallel]: 6.21998e-06 [parallel]: 4.35e-06 [flash_sp]: 3.31999e-06 [merge_comm]: 6.78e-06 [allreduce_fusion]: 3.65e-06 [matmul_add_comm_reduction]: 5.92999e-06 [allreduce_slice_to_reducescatter]: 3.39991e-07 [virtual_shard_identity]: 7.9e-06 [virtual_dataset]: 7.26999e-06 [get_grad_eliminate_]: 6.76999e-06 [virtual_output]: 6.75002e-06 [merge_forward]: 3.25998e-06 [cell_reuse_recompute_pass]: 1.55999e-06 [offload_activation]: 7.00998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.318e-05 [merge_recompute_call_nodes]: 7.39994e-07 [before_grad]: 1.088e-05 [set_forward_comm_id_for_comm_node_pass]: 4.03999e-06 [meta_fg_expand]: 2.72001e-06 [flash_sp_send_recv_attached]: 7.89994e-07 [receive_attached]: 9.89996e-07 [after_resolve]: 1.274e-05 [a_after_grad]: 1.166e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.26002e-06 [auto_monad_grad]: 1.02e-06 [auto_monad_eliminator]: 8.28001e-06 [cse]: 1.675e-05 [a_3]: 4.45e-05 [py_interpret_to_execute_after_opt_a]: 9.70002e-06 [slice_cell_reuse_recomputed_activation]: 2.14999e-06 [rewriter_after_opt_a]: 3.95e-05 [convert_after_rewriter]: 8e-06 [order_py_execute_after_rewriter]: 6.04999e-06 [mutable_eliminate]: 0.00048683 [opt_b]: 0.0002467, [1] [Cycle 1]: 0.00024066, [7] [b_1]: 0.00016186 [b_2]: 9.63002e-06 [updatestate_depend_eliminate]: 5.57001e-06 [updatestate_assign_eliminate]: 2.89999e-06 [updatestate_loads_eliminate]: 2.94001e-06 [renormalize]: 5.50004e-07 [cse]: 2.126e-05 [optimize_parallel_all_gather_comm]: 1.686e-05 [overlap_param_gather]: 1.79e-06 [cconv]: 2.51e-05 [loop_unroll]: 0.0004105 [opt_after_cconv]: 0.0001126, [1] [Cycle 1]: 0.00010689, [7] [c_1]: 3.71e-05 [parameter_eliminate]: 2.94001e-06 [updatestate_depend_eliminate]: 5.92001e-06 [updatestate_assign_eliminate]: 3.00002e-06 [updatestate_loads_eliminate]: 2.94001e-06 [cse]: 2.042e-05 [renormalize]: 4.39992e-07 [remove_dup_value]: 1.343e-05 [tuple_transform]: 8.385e-05, [1] [Cycle 1]: 7.949e-05, [4] [d_1]: 5.08e-05 [none_parameter_eliminate]: 2.01e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 8.25e-06 [partial_unused_args_eliminate]: 1.74e-06 [add_recomputation]: 5.255e-05 [cse_after_recomputation]: 2.425e-05, [1] [Cycle 1]: 1.987e-05, [1] [cse]: 1.431e-05 [environ_conv]: 6.29001e-06 [swap_dp_allreduce_reducescatter]: 5.99999e-06 [bias_add_comm_swap]: 2.94999e-06 [label_micro_interleaved_index]: 4e-06 [label_fine_grained_interleaved_index]: 2.89999e-06 [merge_cast_opt]: 1.30001e-06 [slice_recompute_activation]: 1.97999e-06 [micro_interleaved_order_control]: 2.17999e-06 [assign_add_opt]: 1.42999e-06 [ForceFp32Comm]: 1.10001e-06 [remove_cast_before_assign_add]: 9.49978e-07 [full_micro_interleaved_order_control]: 2.27001e-06 [reorder_send_recv_between_fp_bp]: 2.83e-06 [comm_op_add_attrs]: 9.70002e-07 [add_comm_op_reuse_tag]: 9.99979e-07 [interleave_split_concat_branches]: 1.22e-06 [interleave_parallel_branches]: 9.89996e-07 [overlap_opt_shard_in_pipeline]: 1.44003e-06 [overlap_opt_shard_grad_in_pipeline]: 1.96e-06 [control_data_broadcast_order]: 1.457e-05 [grouped_pairwise_exchange_alltoall]: 1.49e-06 [offloading_packed_experts]: 4.12998e-06 [overlap_recompute_and_grad_model_parallel]: 5.05999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.41002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.33002e-06 [overlap_recompute_comm]: 2.07001e-06 [overlap_grad_ring_attention]: 4.49002e-06 [overlap_grad_flash_sp]: 2.053e-05 [begin_end_overlap_inline]: 5.79981e-07 [split_matmul_comm_elemetwise]: 2.07001e-06 [split_layernorm_comm]: 1.60999e-06 [handle_group_info]: 1.15999e-06 [symbol_engine_optimizer]: 8.089e-05, [1] [Cycle 1]: 7.658e-05, [6] [build]: 3.20998e-06 [elim_shapecalc]: 1.06e-05 [elim_not_effective]: 1.447e-05 [opt_reshape]: 8.24998e-06 [fold_const_symbol]: 1.183e-05 [renormalize]: 2.50002e-07 [detach_backward]: 1.84998e-06 [pipeline_parallel_scheduler]: 1.54e-06 [auto_monad_reorder]: 1.901e-05 [get_jit_bprop_graph]: 1.98002e-06 [rewriter_after_jit_bprop_graph]: 3.71999e-06 [opt_after_jit_grad]: 0.00044853 [validate]: 4.026e-05 Sums bootstrap : 0.000421s : 3.77% type_inference : 0.005886s : 52.70% event_method : 0.000019s : 0.17% auto_monad : 0.000067s : 0.60% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.30% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.25% optimize.rewriter_before_opt_a : 0.000084s : 0.75% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000052s : 0.47% optimize.opt_a.loop_unroll : 0.000038s : 0.34% optimize.opt_a.a_1 : 0.000904s : 8.10% optimize.opt_a.with_stream_mark : 0.000029s : 0.26% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000192s : 1.72% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.15% optimize.opt_a.merge_send_recv : 0.000015s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.12% optimize.opt_a.parallel : 0.000024s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000011s : 0.10% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.15% optimize.opt_a.virtual_dataset : 0.000015s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.22% optimize.opt_a.a_after_grad : 0.000024s : 0.21% optimize.opt_a.renormalize : 0.000687s : 6.15% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.22% optimize.opt_a.cse : 0.000049s : 0.44% optimize.opt_a.a_3 : 0.000100s : 0.90% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.35% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000487s : 4.36% optimize.opt_b.b_1 : 0.000162s : 1.45% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000025s : 0.22% optimize.loop_unroll : 0.000411s : 3.68% optimize.opt_after_cconv.c_1 : 0.000037s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000020s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.12% optimize.tuple_transform.d_1 : 0.000051s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000053s : 0.47% optimize.cse_after_recomputation.cse : 0.000014s : 0.13% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.18% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000449s : 4.02% validate : 0.000040s : 0.36% Time group info: ------[substitution.] 0.000202 38 9.07% : 0.000018s : 3: substitution.cast_eliminate 1.10% : 0.000002s : 3: substitution.elim_not_effective 0.78% : 0.000002s : 3: substitution.fold_const_symbol 3.11% : 0.000006s : 5: substitution.graph_param_transform 70.78% : 0.000143s : 4: substitution.inline 2.02% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.79% : 0.000006s : 6: substitution.remove_not_recompute_node 2.98% : 0.000006s : 4: substitution.replace_old_param 7.38% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005826 2 87.62% : 0.005104s : 1: type_inference.infer 12.38% : 0.000721s : 1: type_inference.specialize ------[replace.] 0.000064 8 61.42% : 0.000039s : 4: replace.inline 38.58% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000153 8 91.63% : 0.000140s : 4: match.inline 8.37% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000249 1596 0.99% : 0.000002s : 17: predicate.accumulaten_eliminater 0.65% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 10: predicate.addn_check_dump 0.93% : 0.000002s : 17: predicate.addn_zero_filter 0.88% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.02% : 0.000005s : 27: predicate.arithmetic_simplify 1.06% : 0.000003s : 17: predicate.cast_eliminate 0.57% : 0.000001s : 10: predicate.check_bprop_eliminate 0.60% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.61% : 0.000002s : 10: predicate.depend_value_elim 1.00% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.06% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.92% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.87% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.36% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.35% : 0.000003s : 22: predicate.environ_get_depend_swap 1.69% : 0.000004s : 32: predicate.environ_get_eliminate 1.16% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.42% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.39% : 0.000006s : 25: predicate.float_depend_g_call 0.53% : 0.000001s : 10: predicate.float_environ_get_switch 0.79% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.63% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000000s : 5: predicate.graph_param_transform 0.63% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.55% : 0.000016s : 72: predicate.inline 0.84% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 10: predicate.less_batch_normalization 1.85% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.82% : 0.000007s : 48: predicate.load_eliminater 0.85% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.08% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.71% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 10: predicate.merge_addn 0.60% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.88% : 0.000002s : 17: predicate.minmaximum_grad 0.89% : 0.000002s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.29% : 0.000001s : 5: predicate.parallel_virtual_node 1.68% : 0.000004s : 25: predicate.partial_defer_inline 1.77% : 0.000004s : 26: predicate.partial_eliminate 0.95% : 0.000002s : 17: predicate.print_const_string_wrapper 0.59% : 0.000001s : 10: predicate.reduce_all_const_elim 1.14% : 0.000003s : 17: predicate.reduce_eliminate 2.61% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 10: predicate.remove_not_recompute_node 1.44% : 0.000004s : 31: predicate.replace_applicator 0.51% : 0.000001s : 10: predicate.replace_old_param 0.26% : 0.000001s : 5: predicate.reset_defer_inline 1.00% : 0.000002s : 17: predicate.reshape_eliminate 0.61% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 5: predicate.row_tensor_eliminate 0.70% : 0.000002s : 10: predicate.same_eliminate 0.42% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.70% : 0.000002s : 10: predicate.shard_identity_eliminate 0.66% : 0.000002s : 10: predicate.special_op_eliminate 0.73% : 0.000002s : 10: predicate.specialize_transform 0.79% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.51% : 0.000004s : 25: predicate.switch_defer_inline 2.07% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.79% : 0.000012s : 76: predicate.switch_simplify 0.97% : 0.000002s : 17: predicate.tile_eliminate 0.98% : 0.000002s : 17: predicate.transpose_eliminate 1.63% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.57% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.31% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.54% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.74% : 0.000004s : 31: predicate.tuple_to_list_eliminator_ 2.58% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.27% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 5: predicate.value_based_eliminate 0.63% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.63% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.48% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000539 11 53.47% : 0.000288s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.53% : 0.000251s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025618 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.17% : 0.003118s : 1: add_attr 12.13% : 0.003108s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000057s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.29% : 0.000073s : 1: auto_monad 0.09% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.75% : 0.000450s : 1: bootstrap 0.11% : 0.000028s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000027s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.10% : 0.000026s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.63% : 0.000418s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.93% : 0.000495s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 5.51% : 0.001411s : 78: opt.transform.opt_a 0.14% : 0.000036s : 1: opt.transform.opt_after_cconv 0.11% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.51% : 0.000131s : 28: opt.transform.opt_b 0.22% : 0.000057s : 2: opt.transform.opt_trans_graph 0.16% : 0.000042s : 4: opt.transform.symbol_engine_opt 11.49% : 0.002944s : 1: opt_a 0.45% : 0.000116s : 1: opt_after_cconv 1.78% : 0.000457s : 1: opt_after_jit_grad 0.98% : 0.000250s : 1: opt_b 19.50% : 0.004995s : 1: optimize 0.08% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000024s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000004s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000038s : 1: pre_auto_parallel 0.13% : 0.000032s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000017s : 1: remove_dup_value 1.46% : 0.000373s : 1: renormalize.infer 1.20% : 0.000307s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000044s : 1: rewriter_after_opt_a 0.34% : 0.000088s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000084s : 1: symbol_engine_optimizer 0.34% : 0.000087s : 1: tuple_transform 23.04% : 0.005902s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:51.314.336 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:51.314.611 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0154408, [21] [bootstrap]: 0.00047282 [type_inference]: 0.00572112 [event_method]: 1.82e-05 [auto_monad]: 6.102e-05 [graph_reusing]: 6.40002e-06 [inline]: 2.01998e-06 [add_attr]: 0.00302397, [1] [add_attr_with_inline]: 0.00301568, [1] [Cycle 1]: 6.772e-05, [2] [tag_attr]: 1.769e-05 [meta_addattr_fg_expand]: 5.45001e-06 [parallel-infer-symbol]: 3.08e-06 [pre_auto_parallel]: 3.157e-05 [insert-virtual-dataset]: 2.96001e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 1.75001e-06 [pipeline_split]: 1.56002e-06 [optimize]: 0.00499338, [53] [py_interpret_to_execute]: 2.75e-05 [rewriter_before_opt_a]: 8.636e-05 [opt_a]: 0.00282164, [2] [Cycle 1]: 0.00197801, [45] [expand_dump_flag]: 2.86999e-06 [switch_simplify]: 4.126e-05 [loop_unroll]: 3.006e-05 [a_1]: 0.00061173 [with_stream_mark]: 1.456e-05 [recompute_prepare]: 8.15e-06 [updatestate_depend_eliminate]: 3.65e-06 [updatestate_assign_eliminate]: 3.09999e-06 [updatestate_loads_eliminate]: 3.08e-06 [parameter_eliminate]: 2.44001e-06 [a_2]: 0.00011371 [accelerated_algorithm]: 7.19001e-06 [shard]: 1.91998e-06 [meta_shard_fg_expand]: 1.71998e-06 [shard_inline]: 6.53998e-06 [merge_send_recv]: 8.02e-06 [auto_parallel]: 5.76e-06 [parallel]: 1.767e-05 [flash_sp]: 7.93001e-06 [merge_comm]: 4.25999e-06 [allreduce_fusion]: 3.43e-06 [matmul_add_comm_reduction]: 9.12999e-06 [allreduce_slice_to_reducescatter]: 5.79981e-07 [virtual_shard_identity]: 8.07e-06 [virtual_dataset]: 7.33999e-06 [get_grad_eliminate_]: 6.23998e-06 [virtual_output]: 6.36e-06 [merge_forward]: 3.56001e-06 [cell_reuse_recompute_pass]: 1.25999e-06 [offload_activation]: 9.56e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.388e-05 [merge_recompute_call_nodes]: 1.38002e-06 [before_grad]: 1.026e-05 [set_forward_comm_id_for_comm_node_pass]: 3.44001e-06 [meta_fg_expand]: 2.84999e-06 [flash_sp_send_recv_attached]: 2.43998e-06 [receive_attached]: 2.24999e-06 [after_resolve]: 1.188e-05 [a_after_grad]: 9.86998e-06 [renormalize]: 0.00049462 [add_forward_monad_depend]: 4.90999e-06 [auto_monad_grad]: 1.83002e-06 [auto_monad_eliminator]: 1.326e-05 [cse]: 2.622e-05 [a_3]: 5.879e-05 [Cycle 2]: 0.00083071, [45] [expand_dump_flag]: 9.30013e-07 [switch_simplify]: 7.3e-06 [loop_unroll]: 6.02999e-06 [a_1]: 0.00012787 [with_stream_mark]: 1.013e-05 [recompute_prepare]: 6.17001e-06 [updatestate_depend_eliminate]: 2.79999e-06 [updatestate_assign_eliminate]: 2.39001e-06 [updatestate_loads_eliminate]: 2.20002e-06 [parameter_eliminate]: 1.00001e-06 [a_2]: 0.00010142 [accelerated_algorithm]: 6.26e-06 [shard]: 1.05999e-06 [meta_shard_fg_expand]: 1.24e-06 [shard_inline]: 6.21e-06 [merge_send_recv]: 4.41002e-06 [auto_parallel]: 5.07e-06 [parallel]: 2.513e-05 [flash_sp]: 5.19998e-06 [merge_comm]: 3.88999e-06 [allreduce_fusion]: 3.43e-06 [matmul_add_comm_reduction]: 5.15001e-06 [allreduce_slice_to_reducescatter]: 3.30008e-07 [virtual_shard_identity]: 7.13e-06 [virtual_dataset]: 6.35002e-06 [get_grad_eliminate_]: 5.77001e-06 [virtual_output]: 9.57999e-06 [merge_forward]: 2.45002e-06 [cell_reuse_recompute_pass]: 1.27e-06 [offload_activation]: 5.86e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.529e-05 [merge_recompute_call_nodes]: 7.10017e-07 [before_grad]: 9.06998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.27002e-06 [meta_fg_expand]: 1.94999e-06 [flash_sp_send_recv_attached]: 8.89995e-07 [receive_attached]: 9.20001e-07 [after_resolve]: 1.027e-05 [a_after_grad]: 8.79e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.35001e-06 [auto_monad_grad]: 1.20001e-06 [auto_monad_eliminator]: 7.10998e-06 [cse]: 1.34e-05 [a_3]: 5.179e-05 [py_interpret_to_execute_after_opt_a]: 1.066e-05 [slice_cell_reuse_recomputed_activation]: 4.87e-06 [rewriter_after_opt_a]: 3.631e-05 [convert_after_rewriter]: 9.96e-06 [order_py_execute_after_rewriter]: 7.98999e-06 [mutable_eliminate]: 0.00047072 [opt_b]: 0.00026502, [1] [Cycle 1]: 0.00025589, [7] [b_1]: 0.0001659 [b_2]: 8.26002e-06 [updatestate_depend_eliminate]: 5.05999e-06 [updatestate_assign_eliminate]: 2.41998e-06 [updatestate_loads_eliminate]: 2.36998e-06 [renormalize]: 3.7998e-07 [cse]: 1.606e-05 [optimize_parallel_all_gather_comm]: 1.934e-05 [overlap_param_gather]: 4.79e-06 [cconv]: 2.781e-05 [loop_unroll]: 0.0004278 [opt_after_cconv]: 0.0001215, [1] [Cycle 1]: 0.00011307, [7] [c_1]: 3.102e-05 [parameter_eliminate]: 2.31e-06 [updatestate_depend_eliminate]: 5.09e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.44999e-06 [cse]: 1.548e-05 [renormalize]: 3.19997e-07 [remove_dup_value]: 1.524e-05 [tuple_transform]: 8.731e-05, [1] [Cycle 1]: 8.046e-05, [4] [d_1]: 4.241e-05 [none_parameter_eliminate]: 1.50999e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 6.89999e-06 [partial_unused_args_eliminate]: 4.48001e-06 [add_recomputation]: 4.951e-05 [cse_after_recomputation]: 2.644e-05, [1] [Cycle 1]: 1.948e-05, [1] [cse]: 1.072e-05 [environ_conv]: 7.55e-06 [swap_dp_allreduce_reducescatter]: 7.7e-06 [bias_add_comm_swap]: 4.99e-06 [label_micro_interleaved_index]: 6.59999e-06 [label_fine_grained_interleaved_index]: 5.02999e-06 [merge_cast_opt]: 3.61999e-06 [slice_recompute_activation]: 4.46002e-06 [micro_interleaved_order_control]: 4.48001e-06 [assign_add_opt]: 3.56001e-06 [ForceFp32Comm]: 3.11999e-06 [remove_cast_before_assign_add]: 3.23e-06 [full_micro_interleaved_order_control]: 4.47e-06 [reorder_send_recv_between_fp_bp]: 5.54998e-06 [comm_op_add_attrs]: 3.26001e-06 [add_comm_op_reuse_tag]: 3.11999e-06 [interleave_split_concat_branches]: 3.39001e-06 [interleave_parallel_branches]: 3.75e-06 [overlap_opt_shard_in_pipeline]: 3.51999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.33999e-06 [control_data_broadcast_order]: 1.445e-05 [grouped_pairwise_exchange_alltoall]: 3.98001e-06 [offloading_packed_experts]: 6.17999e-06 [overlap_recompute_and_grad_model_parallel]: 6.81999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.8e-06 [overlap_recompute_allgather_and_fa_grad]: 3.70998e-06 [overlap_recompute_comm]: 4.38001e-06 [overlap_grad_ring_attention]: 6.53e-06 [overlap_grad_flash_sp]: 2.13e-05 [begin_end_overlap_inline]: 3.01999e-06 [split_matmul_comm_elemetwise]: 4.68001e-06 [split_layernorm_comm]: 3.99002e-06 [handle_group_info]: 3.55e-06 [symbol_engine_optimizer]: 9.285e-05, [1] [Cycle 1]: 8.582e-05, [6] [build]: 2.64999e-06 [elim_shapecalc]: 8.99e-06 [elim_not_effective]: 1.272e-05 [opt_reshape]: 6.92002e-06 [fold_const_symbol]: 9.96e-06 [renormalize]: 2.59985e-07 [detach_backward]: 3.21001e-06 [pipeline_parallel_scheduler]: 1.99999e-06 [auto_monad_reorder]: 1.827e-05 [get_jit_bprop_graph]: 1.33002e-06 [rewriter_after_jit_bprop_graph]: 4.71002e-06 [opt_after_jit_grad]: 0.00047688 [validate]: 3.431e-05 Sums bootstrap : 0.000473s : 4.40% type_inference : 0.005721s : 53.30% event_method : 0.000018s : 0.17% auto_monad : 0.000061s : 0.57% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000005s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000032s : 0.29% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.26% optimize.rewriter_before_opt_a : 0.000086s : 0.80% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000049s : 0.45% optimize.opt_a.loop_unroll : 0.000036s : 0.34% optimize.opt_a.a_1 : 0.000740s : 6.89% optimize.opt_a.with_stream_mark : 0.000025s : 0.23% optimize.opt_a.recompute_prepare : 0.000014s : 0.13% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000215s : 2.00% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.12% optimize.opt_a.merge_send_recv : 0.000012s : 0.12% optimize.opt_a.auto_parallel : 0.000011s : 0.10% optimize.opt_a.parallel : 0.000043s : 0.40% optimize.opt_a.flash_sp : 0.000013s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.14% optimize.opt_a.virtual_dataset : 0.000014s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000016s : 0.15% optimize.opt_a.merge_forward : 0.000006s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000015s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000019s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000022s : 0.21% optimize.opt_a.a_after_grad : 0.000019s : 0.17% optimize.opt_a.renormalize : 0.000495s : 4.61% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000020s : 0.19% optimize.opt_a.cse : 0.000040s : 0.37% optimize.opt_a.a_3 : 0.000111s : 1.03% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000036s : 0.34% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000471s : 4.39% optimize.opt_b.b_1 : 0.000166s : 1.55% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000016s : 0.15% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000028s : 0.26% optimize.loop_unroll : 0.000428s : 3.99% optimize.opt_after_cconv.c_1 : 0.000031s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000015s : 0.14% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.14% optimize.tuple_transform.d_1 : 0.000042s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000050s : 0.46% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000008s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000004s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000014s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000006s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000004s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000021s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.17% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000477s : 4.44% validate : 0.000034s : 0.32% Time group info: ------[substitution.] 0.000168 28 1.05% : 0.000002s : 2: substitution.elim_not_effective 0.80% : 0.000001s : 2: substitution.fold_const_symbol 3.21% : 0.000005s : 4: substitution.graph_param_transform 78.29% : 0.000132s : 4: substitution.inline 1.91% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.12% : 0.000005s : 4: substitution.remove_not_recompute_node 2.83% : 0.000005s : 4: substitution.replace_old_param 8.78% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005671 2 88.52% : 0.005020s : 1: type_inference.infer 11.48% : 0.000651s : 1: type_inference.specialize ------[replace.] 0.000060 8 63.47% : 0.000038s : 4: replace.inline 36.53% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000142 8 90.88% : 0.000129s : 4: match.inline 9.12% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000202 1278 0.94% : 0.000002s : 13: predicate.accumulaten_eliminater 0.67% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 8: predicate.addn_check_dump 0.90% : 0.000002s : 13: predicate.addn_zero_filter 0.86% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.95% : 0.000004s : 21: predicate.arithmetic_simplify 0.91% : 0.000002s : 13: predicate.cast_eliminate 0.62% : 0.000001s : 8: predicate.check_bprop_eliminate 0.56% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.81% : 0.000002s : 8: predicate.depend_value_elim 0.99% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.01% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.35% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.35% : 0.000003s : 17: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_depend_swap 1.79% : 0.000004s : 25: predicate.environ_get_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.46% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.47% : 0.000005s : 21: predicate.float_depend_g_call 0.61% : 0.000001s : 8: predicate.float_environ_get_switch 0.79% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.63% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.59% : 0.000001s : 8: predicate.incorporate_call 0.51% : 0.000001s : 8: predicate.incorporate_call_switch 6.34% : 0.000013s : 58: predicate.inline 0.77% : 0.000002s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 8: predicate.less_batch_normalization 1.97% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.58% : 0.000005s : 38: predicate.load_eliminater 0.91% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.34% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.59% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.63% : 0.000001s : 8: predicate.merge_addn 0.60% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 13: predicate.minmaximum_grad 0.83% : 0.000002s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.33% : 0.000001s : 4: predicate.parallel_virtual_node 1.84% : 0.000004s : 21: predicate.partial_defer_inline 1.73% : 0.000003s : 21: predicate.partial_eliminate 0.86% : 0.000002s : 13: predicate.print_const_string_wrapper 0.60% : 0.000001s : 8: predicate.reduce_all_const_elim 1.10% : 0.000002s : 13: predicate.reduce_eliminate 2.53% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 8: predicate.remove_not_recompute_node 1.40% : 0.000003s : 25: predicate.replace_applicator 0.51% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 0.94% : 0.000002s : 13: predicate.reshape_eliminate 0.62% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.80% : 0.000002s : 8: predicate.same_eliminate 0.47% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.92% : 0.000002s : 8: predicate.shard_identity_eliminate 0.65% : 0.000001s : 8: predicate.special_op_eliminate 0.78% : 0.000002s : 8: predicate.specialize_transform 0.78% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.55% : 0.000003s : 21: predicate.switch_defer_inline 2.20% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.09% : 0.000010s : 67: predicate.switch_simplify 0.97% : 0.000002s : 13: predicate.tile_eliminate 0.95% : 0.000002s : 13: predicate.transpose_eliminate 1.52% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.32% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.44% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.70% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.50% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.16% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.71% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.66% : 0.000001s : 8: predicate.virtual_output_eliminate 0.48% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000504 11 58.28% : 0.000293s : 5: func_graph_cloner_run.FuncGraphClonerGraph 41.72% : 0.000210s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025264 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.00% : 0.003032s : 1: add_attr 11.95% : 0.003019s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000053s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.28% : 0.000070s : 1: auto_monad 0.10% : 0.000026s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 2.05% : 0.000518s : 1: bootstrap 0.12% : 0.000031s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000017s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.12% : 0.000029s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000017s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.11% : 0.000028s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.05% : 0.000013s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000007s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000009s : 1: label_micro_interleaved_index 1.71% : 0.000433s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.89% : 0.000477s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.05% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 4.61% : 0.001164s : 78: opt.transform.opt_a 0.12% : 0.000030s : 1: opt.transform.opt_after_cconv 0.10% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.41% : 0.000103s : 28: opt.transform.opt_b 0.19% : 0.000047s : 2: opt.transform.opt_trans_graph 0.14% : 0.000035s : 4: opt.transform.symbol_engine_opt 11.18% : 0.002825s : 1: opt_a 0.49% : 0.000125s : 1: opt_after_cconv 1.93% : 0.000487s : 1: opt_after_jit_grad 1.06% : 0.000268s : 1: opt_b 21.05% : 0.005317s : 1: optimize 0.09% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000024s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000039s : 1: pre_auto_parallel 0.12% : 0.000031s : 1: py_interpret_to_execute 0.05% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000018s : 1: remove_dup_value 0.96% : 0.000243s : 1: renormalize.infer 0.97% : 0.000245s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000040s : 1: rewriter_after_opt_a 0.36% : 0.000090s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000096s : 1: symbol_engine_optimizer 0.36% : 0.000090s : 1: tuple_transform 22.79% : 0.005757s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:51.730.404 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0149806, [21] [bootstrap]: 0.00043849 [type_inference]: 0.00566577 [event_method]: 1.866e-05 [auto_monad]: 6.3e-05 [graph_reusing]: 5.85002e-06 [inline]: 1.95001e-06 [add_attr]: 0.00319414, [1] [add_attr_with_inline]: 0.0031851, [1] [Cycle 1]: 6.233e-05, [2] [tag_attr]: 1.933e-05 [meta_addattr_fg_expand]: 5.86e-06 [parallel-infer-symbol]: 3.35e-06 [pre_auto_parallel]: 3.499e-05 [insert-virtual-dataset]: 2.64001e-06 [parallel-infer-symbol-second]: 8.30012e-07 [dataset_repeat_opt]: 1.82001e-06 [pipeline_split]: 1.79e-06 [optimize]: 0.00483196, [53] [py_interpret_to_execute]: 2.51e-05 [rewriter_before_opt_a]: 8.258e-05 [opt_a]: 0.00272359, [2] [Cycle 1]: 0.00202369, [45] [expand_dump_flag]: 3.09001e-06 [switch_simplify]: 4.631e-05 [loop_unroll]: 3.43e-05 [a_1]: 0.00062182 [with_stream_mark]: 1.659e-05 [recompute_prepare]: 1.034e-05 [updatestate_depend_eliminate]: 4.07e-06 [updatestate_assign_eliminate]: 3.50998e-06 [updatestate_loads_eliminate]: 2.74999e-06 [parameter_eliminate]: 1.94e-06 [a_2]: 8.081e-05 [accelerated_algorithm]: 7.41999e-06 [shard]: 1.89e-06 [meta_shard_fg_expand]: 1.82001e-06 [shard_inline]: 6.46e-06 [merge_send_recv]: 8.26002e-06 [auto_parallel]: 6.78998e-06 [parallel]: 1.894e-05 [flash_sp]: 8.13999e-06 [merge_comm]: 4.72998e-06 [allreduce_fusion]: 3.50998e-06 [matmul_add_comm_reduction]: 9.51e-06 [allreduce_slice_to_reducescatter]: 8.89995e-07 [virtual_shard_identity]: 9.02e-06 [virtual_dataset]: 7.2e-06 [get_grad_eliminate_]: 6.04001e-06 [virtual_output]: 6.64001e-06 [merge_forward]: 3.50998e-06 [cell_reuse_recompute_pass]: 1.15999e-06 [offload_activation]: 1.041e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.343e-05 [merge_recompute_call_nodes]: 1.59e-06 [before_grad]: 1.04e-05 [set_forward_comm_id_for_comm_node_pass]: 3.60998e-06 [meta_fg_expand]: 3.04001e-06 [flash_sp_send_recv_attached]: 2.44001e-06 [receive_attached]: 2.35002e-06 [after_resolve]: 1.299e-05 [a_after_grad]: 1.037e-05 [renormalize]: 0.00065388 [add_forward_monad_depend]: 5.58002e-06 [auto_monad_grad]: 2.16e-06 [auto_monad_eliminator]: 1.493e-05 [cse]: 2.831e-05 [a_3]: 5.282e-05 [Cycle 2]: 0.00068897, [45] [expand_dump_flag]: 1.09e-06 [switch_simplify]: 8.37e-06 [loop_unroll]: 6.43e-06 [a_1]: 0.00014716 [with_stream_mark]: 1.251e-05 [recompute_prepare]: 6.63998e-06 [updatestate_depend_eliminate]: 2.83998e-06 [updatestate_assign_eliminate]: 2.40002e-06 [updatestate_loads_eliminate]: 2.56e-06 [parameter_eliminate]: 1.02998e-06 [a_2]: 7.253e-05 [accelerated_algorithm]: 6.56e-06 [shard]: 1.64e-06 [meta_shard_fg_expand]: 1.32e-06 [shard_inline]: 6.10002e-06 [merge_send_recv]: 5.54e-06 [auto_parallel]: 5.66e-06 [parallel]: 5.67001e-06 [flash_sp]: 3.66999e-06 [merge_comm]: 3.33e-06 [allreduce_fusion]: 3.13e-06 [matmul_add_comm_reduction]: 5.82999e-06 [allreduce_slice_to_reducescatter]: 5.3001e-07 [virtual_shard_identity]: 1.065e-05 [virtual_dataset]: 6.16998e-06 [get_grad_eliminate_]: 5.76e-06 [virtual_output]: 5.59e-06 [merge_forward]: 3.51999e-06 [cell_reuse_recompute_pass]: 1.87999e-06 [offload_activation]: 7.88999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.565e-05 [merge_recompute_call_nodes]: 1.09e-06 [before_grad]: 9.84001e-06 [set_forward_comm_id_for_comm_node_pass]: 4.03999e-06 [meta_fg_expand]: 2.41e-06 [flash_sp_send_recv_attached]: 9.70002e-07 [receive_attached]: 1.39e-06 [after_resolve]: 1.11e-05 [a_after_grad]: 1.044e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.69e-06 [auto_monad_grad]: 1.00001e-06 [auto_monad_eliminator]: 8.69e-06 [cse]: 1.506e-05 [a_3]: 3.603e-05 [py_interpret_to_execute_after_opt_a]: 1.002e-05 [slice_cell_reuse_recomputed_activation]: 1.96e-06 [rewriter_after_opt_a]: 3.757e-05 [convert_after_rewriter]: 7.36001e-06 [order_py_execute_after_rewriter]: 5.61e-06 [mutable_eliminate]: 0.00053859 [opt_b]: 0.00021124, [1] [Cycle 1]: 0.00020471, [7] [b_1]: 0.00012519 [b_2]: 7.88001e-06 [updatestate_depend_eliminate]: 7.6e-06 [updatestate_assign_eliminate]: 2.56e-06 [updatestate_loads_eliminate]: 2.66999e-06 [renormalize]: 7.2e-07 [cse]: 2.144e-05 [optimize_parallel_all_gather_comm]: 1.829e-05 [overlap_param_gather]: 2.02999e-06 [cconv]: 2.944e-05 [loop_unroll]: 0.0004574 [opt_after_cconv]: 0.0001059, [1] [Cycle 1]: 0.00010009, [7] [c_1]: 3.238e-05 [parameter_eliminate]: 4.58001e-06 [updatestate_depend_eliminate]: 5.71e-06 [updatestate_assign_eliminate]: 2.34999e-06 [updatestate_loads_eliminate]: 2.28002e-06 [cse]: 1.858e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 1.389e-05 [tuple_transform]: 7.606e-05, [1] [Cycle 1]: 7.164e-05, [4] [d_1]: 4.485e-05 [none_parameter_eliminate]: 1.84e-06 [renormalize]: 2.49973e-07 [switch_simplify]: 6.41998e-06 [partial_unused_args_eliminate]: 2.29001e-06 [add_recomputation]: 5.055e-05 [cse_after_recomputation]: 2.129e-05, [1] [Cycle 1]: 1.664e-05, [1] [cse]: 1.103e-05 [environ_conv]: 5.54998e-06 [swap_dp_allreduce_reducescatter]: 5.30999e-06 [bias_add_comm_swap]: 2.78e-06 [label_micro_interleaved_index]: 4.60001e-06 [label_fine_grained_interleaved_index]: 2.98998e-06 [merge_cast_opt]: 1.60001e-06 [slice_recompute_activation]: 2.21e-06 [micro_interleaved_order_control]: 2.13002e-06 [assign_add_opt]: 1.21997e-06 [ForceFp32Comm]: 1.14e-06 [remove_cast_before_assign_add]: 1.02998e-06 [full_micro_interleaved_order_control]: 2.09e-06 [reorder_send_recv_between_fp_bp]: 2.83998e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.28002e-06 [interleave_parallel_branches]: 1.08001e-06 [overlap_opt_shard_in_pipeline]: 1.47999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.33998e-06 [control_data_broadcast_order]: 1.333e-05 [grouped_pairwise_exchange_alltoall]: 2.36998e-06 [offloading_packed_experts]: 3.98999e-06 [overlap_recompute_and_grad_model_parallel]: 4.74998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.09998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.41e-06 [overlap_grad_ring_attention]: 4.63999e-06 [overlap_grad_flash_sp]: 2.226e-05 [begin_end_overlap_inline]: 9.00007e-07 [split_matmul_comm_elemetwise]: 2.39999e-06 [split_layernorm_comm]: 1.69998e-06 [handle_group_info]: 1.17e-06 [symbol_engine_optimizer]: 8.222e-05, [1] [Cycle 1]: 7.785e-05, [6] [build]: 3.41001e-06 [elim_shapecalc]: 1.23e-05 [elim_not_effective]: 1.397e-05 [opt_reshape]: 6.86999e-06 [fold_const_symbol]: 9.82001e-06 [renormalize]: 2.80008e-07 [detach_backward]: 2.01e-06 [pipeline_parallel_scheduler]: 1.47999e-06 [auto_monad_reorder]: 1.579e-05 [get_jit_bprop_graph]: 1.77999e-06 [rewriter_after_jit_bprop_graph]: 4.48999e-06 [opt_after_jit_grad]: 0.00050068 [validate]: 4.172e-05 Sums bootstrap : 0.000438s : 4.06% type_inference : 0.005666s : 52.43% event_method : 0.000019s : 0.17% auto_monad : 0.000063s : 0.58% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000035s : 0.32% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000025s : 0.23% optimize.rewriter_before_opt_a : 0.000083s : 0.76% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.51% optimize.opt_a.loop_unroll : 0.000041s : 0.38% optimize.opt_a.a_1 : 0.000769s : 7.12% optimize.opt_a.with_stream_mark : 0.000029s : 0.27% optimize.opt_a.recompute_prepare : 0.000017s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000153s : 1.42% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.12% optimize.opt_a.merge_send_recv : 0.000014s : 0.13% optimize.opt_a.auto_parallel : 0.000012s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.23% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.18% optimize.opt_a.virtual_dataset : 0.000013s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.22% optimize.opt_a.a_after_grad : 0.000021s : 0.19% optimize.opt_a.renormalize : 0.000654s : 6.05% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.22% optimize.opt_a.cse : 0.000043s : 0.40% optimize.opt_a.a_3 : 0.000089s : 0.82% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000038s : 0.35% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000539s : 4.98% optimize.opt_b.b_1 : 0.000125s : 1.16% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000021s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000029s : 0.27% optimize.loop_unroll : 0.000457s : 4.23% optimize.opt_after_cconv.c_1 : 0.000032s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.17% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.13% optimize.tuple_transform.d_1 : 0.000045s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000051s : 0.47% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000022s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000016s : 0.15% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000501s : 4.63% validate : 0.000042s : 0.39% Time group info: ------[substitution.] 0.000180 28 1.19% : 0.000002s : 2: substitution.elim_not_effective 0.78% : 0.000001s : 2: substitution.fold_const_symbol 3.36% : 0.000006s : 4: substitution.graph_param_transform 78.37% : 0.000141s : 4: substitution.inline 1.85% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.84% : 0.000005s : 4: substitution.remove_not_recompute_node 2.87% : 0.000005s : 4: substitution.replace_old_param 8.74% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005604 2 87.55% : 0.004906s : 1: type_inference.infer 12.45% : 0.000698s : 1: type_inference.specialize ------[replace.] 0.000061 8 61.90% : 0.000038s : 4: replace.inline 38.10% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000152 8 90.99% : 0.000138s : 4: match.inline 9.01% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000223 1278 0.88% : 0.000002s : 13: predicate.accumulaten_eliminater 0.79% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 8: predicate.addn_check_dump 0.79% : 0.000002s : 13: predicate.addn_zero_filter 0.76% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 1.77% : 0.000004s : 21: predicate.arithmetic_simplify 0.79% : 0.000002s : 13: predicate.cast_eliminate 0.56% : 0.000001s : 8: predicate.check_bprop_eliminate 0.47% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.52% : 0.000001s : 8: predicate.depend_value_elim 0.83% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.92% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.77% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.46% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 17: predicate.environ_get_add_eliminate 0.97% : 0.000002s : 17: predicate.environ_get_depend_swap 1.60% : 0.000004s : 25: predicate.environ_get_eliminate 0.98% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 21: predicate.exchange_switch_depend_value 9.78% : 0.000022s : 21: predicate.float_depend_g_call 0.45% : 0.000001s : 8: predicate.float_environ_get_switch 0.73% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.60% : 0.000001s : 8: predicate.get_grad_eliminate 0.26% : 0.000001s : 4: predicate.graph_param_transform 0.54% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 6.19% : 0.000014s : 58: predicate.inline 0.72% : 0.000002s : 8: predicate.inline_without_move 0.33% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.90% : 0.000002s : 8: predicate.less_batch_normalization 1.73% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.38% : 0.000005s : 38: predicate.load_eliminater 1.06% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.25% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.54% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.46% : 0.000001s : 8: predicate.merge_addn 0.56% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.63% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.71% : 0.000002s : 13: predicate.minmaximum_grad 1.02% : 0.000002s : 4: predicate.mutable_eliminate 0.28% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 1.63% : 0.000004s : 21: predicate.partial_defer_inline 1.53% : 0.000003s : 21: predicate.partial_eliminate 0.81% : 0.000002s : 13: predicate.print_const_string_wrapper 0.51% : 0.000001s : 8: predicate.reduce_all_const_elim 1.01% : 0.000002s : 13: predicate.reduce_eliminate 2.29% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 8: predicate.remove_not_recompute_node 1.27% : 0.000003s : 25: predicate.replace_applicator 0.48% : 0.000001s : 8: predicate.replace_old_param 0.36% : 0.000001s : 4: predicate.reset_defer_inline 0.82% : 0.000002s : 13: predicate.reshape_eliminate 0.73% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 4: predicate.row_tensor_eliminate 0.75% : 0.000002s : 8: predicate.same_eliminate 0.43% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.86% : 0.000002s : 8: predicate.shard_identity_eliminate 0.64% : 0.000001s : 8: predicate.special_op_eliminate 0.62% : 0.000001s : 8: predicate.specialize_transform 0.80% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.59% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.40% : 0.000003s : 21: predicate.switch_defer_inline 1.90% : 0.000004s : 29: predicate.switch_layer_defer_inline 4.93% : 0.000011s : 67: predicate.switch_simplify 0.79% : 0.000002s : 13: predicate.tile_eliminate 0.84% : 0.000002s : 13: predicate.transpose_eliminate 1.40% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.25% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.11% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.36% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.06% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.54% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.30% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 2.81% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.66% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 8: predicate.virtual_output_eliminate 0.29% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000521 11 53.40% : 0.000278s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.60% : 0.000243s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025020 192 0.02% : 0.000004s : 1: ForceFp32Comm 12.79% : 0.003199s : 1: add_attr 12.75% : 0.003189s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000055s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000068s : 1: auto_monad 0.08% : 0.000019s : 1: auto_monad_reorder 0.02% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.86% : 0.000464s : 1: bootstrap 0.13% : 0.000033s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000017s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.10% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.87% : 0.000467s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.19% : 0.000547s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 4.83% : 0.001209s : 78: opt.transform.opt_a 0.12% : 0.000031s : 1: opt.transform.opt_after_cconv 0.10% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000101s : 28: opt.transform.opt_b 0.20% : 0.000049s : 2: opt.transform.opt_trans_graph 0.15% : 0.000039s : 4: opt.transform.symbol_engine_opt 10.90% : 0.002727s : 1: opt_a 0.44% : 0.000110s : 1: opt_after_cconv 2.04% : 0.000510s : 1: opt_after_jit_grad 0.86% : 0.000216s : 1: opt_b 19.33% : 0.004837s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.16% : 0.000039s : 1: pre_auto_parallel 0.12% : 0.000029s : 1: py_interpret_to_execute 0.05% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000017s : 1: remove_dup_value 1.38% : 0.000347s : 1: renormalize.infer 1.19% : 0.000299s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000042s : 1: rewriter_after_opt_a 0.35% : 0.000087s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000085s : 1: symbol_engine_optimizer 0.32% : 0.000079s : 1: tuple_transform 22.72% : 0.005684s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:52.115.161 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:52.115.424 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0286756, [21] [bootstrap]: 0.00044498 [type_inference]: 0.00574627 [event_method]: 1.933e-05 [auto_monad]: 6.782e-05 [graph_reusing]: 6.81999e-06 [inline]: 2.04e-06 [add_attr]: 0.00305031, [1] [add_attr_with_inline]: 0.00304204, [1] [Cycle 1]: 7.484e-05, [2] [tag_attr]: 1.88e-05 [meta_addattr_fg_expand]: 6.43998e-06 [parallel-infer-symbol]: 3.44001e-06 [pre_auto_parallel]: 3.323e-05 [insert-virtual-dataset]: 2.56998e-06 [parallel-infer-symbol-second]: 7.49977e-07 [dataset_repeat_opt]: 2.21998e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.0181232, [53] [py_interpret_to_execute]: 2.987e-05 [rewriter_before_opt_a]: 9.079e-05 [opt_a]: 0.00329048, [2] [Cycle 1]: 0.00234719, [45] [expand_dump_flag]: 3.34001e-06 [switch_simplify]: 6.049e-05 [loop_unroll]: 3.222e-05 [a_1]: 0.00073147 [with_stream_mark]: 1.652e-05 [recompute_prepare]: 1.024e-05 [updatestate_depend_eliminate]: 4.36002e-06 [updatestate_assign_eliminate]: 4.08999e-06 [updatestate_loads_eliminate]: 3.55e-06 [parameter_eliminate]: 2.12001e-06 [a_2]: 0.00012958 [accelerated_algorithm]: 8.08001e-06 [shard]: 2.33998e-06 [meta_shard_fg_expand]: 2.14e-06 [shard_inline]: 7.8e-06 [merge_send_recv]: 9.63002e-06 [auto_parallel]: 7.28e-06 [parallel]: 1.867e-05 [flash_sp]: 8.33001e-06 [merge_comm]: 4.82e-06 [allreduce_fusion]: 4.13001e-06 [matmul_add_comm_reduction]: 1.063e-05 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 9.10999e-06 [virtual_dataset]: 7.83001e-06 [get_grad_eliminate_]: 7.63001e-06 [virtual_output]: 7.62998e-06 [merge_forward]: 4.75999e-06 [cell_reuse_recompute_pass]: 1.17999e-06 [offload_activation]: 1.062e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.697e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.27e-05 [set_forward_comm_id_for_comm_node_pass]: 4.43999e-06 [meta_fg_expand]: 3.42997e-06 [flash_sp_send_recv_attached]: 2.66e-06 [receive_attached]: 2.53e-06 [after_resolve]: 1.332e-05 [a_after_grad]: 1.165e-05 [renormalize]: 0.00063245 [add_forward_monad_depend]: 5.12e-06 [auto_monad_grad]: 2.44999e-06 [auto_monad_eliminator]: 1.679e-05 [cse]: 3.386e-05 [a_3]: 7.177e-05 [Cycle 2]: 0.00093007, [45] [expand_dump_flag]: 1.25999e-06 [switch_simplify]: 9.34e-06 [loop_unroll]: 7.6e-06 [a_1]: 0.00017538 [with_stream_mark]: 1.25e-05 [recompute_prepare]: 8.26002e-06 [updatestate_depend_eliminate]: 3.99002e-06 [updatestate_assign_eliminate]: 2.84999e-06 [updatestate_loads_eliminate]: 3.07002e-06 [parameter_eliminate]: 1.23002e-06 [a_2]: 0.00012247 [accelerated_algorithm]: 7.48999e-06 [shard]: 1.82001e-06 [meta_shard_fg_expand]: 1.89e-06 [shard_inline]: 7.41999e-06 [merge_send_recv]: 6.71e-06 [auto_parallel]: 6.74001e-06 [parallel]: 4.53999e-06 [flash_sp]: 3.88001e-06 [merge_comm]: 4.4e-06 [allreduce_fusion]: 3.86999e-06 [matmul_add_comm_reduction]: 6.98e-06 [allreduce_slice_to_reducescatter]: 5.19998e-07 [virtual_shard_identity]: 8.38999e-06 [virtual_dataset]: 9.26002e-06 [get_grad_eliminate_]: 6.87002e-06 [virtual_output]: 7.15e-06 [merge_forward]: 3.71999e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 7.90998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.64e-05 [merge_recompute_call_nodes]: 9.10019e-07 [before_grad]: 1.154e-05 [set_forward_comm_id_for_comm_node_pass]: 4.60999e-06 [meta_fg_expand]: 2.79001e-06 [flash_sp_send_recv_attached]: 8.89995e-07 [receive_attached]: 9.89996e-07 [after_resolve]: 1.141e-05 [a_after_grad]: 1.082e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.11e-06 [auto_monad_grad]: 1.01997e-06 [auto_monad_eliminator]: 9.19e-06 [cse]: 1.838e-05 [a_3]: 5.849e-05 [py_interpret_to_execute_after_opt_a]: 1.307e-05 [slice_cell_reuse_recomputed_activation]: 4.92e-06 [rewriter_after_opt_a]: 4.542e-05 [convert_after_rewriter]: 1.034e-05 [order_py_execute_after_rewriter]: 8.29998e-06 [mutable_eliminate]: 0.0004944 [opt_b]: 0.0124935, [1] [Cycle 1]: 0.0124803, [7] [b_1]: 0.0123081 [b_2]: 1.647e-05 [updatestate_depend_eliminate]: 1.257e-05 [updatestate_assign_eliminate]: 4.12e-06 [updatestate_loads_eliminate]: 4.05e-06 [renormalize]: 8.50006e-07 [cse]: 4.307e-05 [optimize_parallel_all_gather_comm]: 2.812e-05 [overlap_param_gather]: 5.85002e-06 [cconv]: 3.918e-05 [loop_unroll]: 0.00068628 [opt_after_cconv]: 0.00014663, [1] [Cycle 1]: 0.00013658, [7] [c_1]: 3.942e-05 [parameter_eliminate]: 4.46002e-06 [updatestate_depend_eliminate]: 6.07999e-06 [updatestate_assign_eliminate]: 3.29001e-06 [updatestate_loads_eliminate]: 3.16001e-06 [cse]: 2.381e-05 [renormalize]: 8.39995e-07 [remove_dup_value]: 1.871e-05 [tuple_transform]: 0.00010426, [1] [Cycle 1]: 9.695e-05, [4] [d_1]: 5.613e-05 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 8.60001e-06 [partial_unused_args_eliminate]: 4.97999e-06 [add_recomputation]: 6.266e-05 [cse_after_recomputation]: 3.233e-05, [1] [Cycle 1]: 2.522e-05, [1] [cse]: 1.602e-05 [environ_conv]: 1.002e-05 [swap_dp_allreduce_reducescatter]: 8.79e-06 [bias_add_comm_swap]: 5.64998e-06 [label_micro_interleaved_index]: 7.57998e-06 [label_fine_grained_interleaved_index]: 5.54998e-06 [merge_cast_opt]: 3.94002e-06 [slice_recompute_activation]: 4.66002e-06 [micro_interleaved_order_control]: 4.75001e-06 [assign_add_opt]: 4.29002e-06 [ForceFp32Comm]: 3.43e-06 [remove_cast_before_assign_add]: 3.68999e-06 [full_micro_interleaved_order_control]: 4.60999e-06 [reorder_send_recv_between_fp_bp]: 5.20001e-06 [comm_op_add_attrs]: 3.59002e-06 [add_comm_op_reuse_tag]: 3.35e-06 [interleave_split_concat_branches]: 3.51999e-06 [interleave_parallel_branches]: 3.38e-06 [overlap_opt_shard_in_pipeline]: 3.65e-06 [overlap_opt_shard_grad_in_pipeline]: 4.70001e-06 [control_data_broadcast_order]: 1.777e-05 [grouped_pairwise_exchange_alltoall]: 3.83001e-06 [offloading_packed_experts]: 7.4e-06 [overlap_recompute_and_grad_model_parallel]: 8.80001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.59002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.56001e-06 [overlap_recompute_comm]: 5.64998e-06 [overlap_grad_ring_attention]: 7.36001e-06 [overlap_grad_flash_sp]: 2.681e-05 [begin_end_overlap_inline]: 2.98998e-06 [split_matmul_comm_elemetwise]: 4.54002e-06 [split_layernorm_comm]: 4.08001e-06 [handle_group_info]: 3.31999e-06 [symbol_engine_optimizer]: 0.00010474, [1] [Cycle 1]: 9.774e-05, [6] [build]: 3.82002e-06 [elim_shapecalc]: 1.148e-05 [elim_not_effective]: 1.466e-05 [opt_reshape]: 8.48001e-06 [fold_const_symbol]: 1.318e-05 [renormalize]: 2.29978e-07 [detach_backward]: 3.53e-06 [pipeline_parallel_scheduler]: 1.97999e-06 [auto_monad_reorder]: 2.213e-05 [get_jit_bprop_graph]: 2.09e-06 [rewriter_after_jit_bprop_graph]: 5.14e-06 [opt_after_jit_grad]: 0.00049405 [validate]: 4.576e-05 Sums bootstrap : 0.000445s : 1.87% type_inference : 0.005746s : 24.11% event_method : 0.000019s : 0.08% auto_monad : 0.000068s : 0.28% graph_reusing : 0.000007s : 0.03% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000033s : 0.14% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.13% optimize.rewriter_before_opt_a : 0.000091s : 0.38% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000070s : 0.29% optimize.opt_a.loop_unroll : 0.000040s : 0.17% optimize.opt_a.a_1 : 0.000907s : 3.80% optimize.opt_a.with_stream_mark : 0.000029s : 0.12% optimize.opt_a.recompute_prepare : 0.000019s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000252s : 1.06% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.07% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000015s : 0.06% optimize.opt_a.merge_send_recv : 0.000016s : 0.07% optimize.opt_a.auto_parallel : 0.000014s : 0.06% optimize.opt_a.parallel : 0.000023s : 0.10% optimize.opt_a.flash_sp : 0.000012s : 0.05% optimize.opt_a.merge_comm : 0.000009s : 0.04% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.07% optimize.opt_a.virtual_dataset : 0.000017s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.06% optimize.opt_a.virtual_output : 0.000015s : 0.06% optimize.opt_a.merge_forward : 0.000008s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.14% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000024s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.04% optimize.opt_a.meta_fg_expand : 0.000006s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.10% optimize.opt_a.a_after_grad : 0.000022s : 0.09% optimize.opt_a.renormalize : 0.000633s : 2.65% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.03% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.11% optimize.opt_a.cse : 0.000052s : 0.22% optimize.opt_a.a_3 : 0.000130s : 0.55% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000045s : 0.19% optimize.convert_after_rewriter : 0.000010s : 0.04% optimize.order_py_execute_after_rewriter : 0.000008s : 0.03% optimize.mutable_eliminate : 0.000494s : 2.07% optimize.opt_b.b_1 : 0.012308s : 51.64% optimize.opt_b.b_2 : 0.000016s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000013s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000043s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.12% optimize.overlap_param_gather : 0.000006s : 0.02% optimize.cconv : 0.000039s : 0.16% optimize.loop_unroll : 0.000686s : 2.88% optimize.opt_after_cconv.c_1 : 0.000039s : 0.17% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000024s : 0.10% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.08% optimize.tuple_transform.d_1 : 0.000056s : 0.24% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.04% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000063s : 0.26% optimize.cse_after_recomputation.cse : 0.000016s : 0.07% optimize.environ_conv : 0.000010s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.04% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000008s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.02% optimize.control_data_broadcast_order : 0.000018s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000007s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000006s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.03% optimize.overlap_grad_flash_sp : 0.000027s : 0.11% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.06% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.09% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000494s : 2.07% validate : 0.000046s : 0.19% Time group info: ------[substitution.] 0.000203 38 10.24% : 0.000021s : 3: substitution.cast_eliminate 1.04% : 0.000002s : 3: substitution.elim_not_effective 0.77% : 0.000002s : 3: substitution.fold_const_symbol 3.39% : 0.000007s : 5: substitution.graph_param_transform 69.95% : 0.000142s : 4: substitution.inline 2.15% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.94% : 0.000006s : 6: substitution.remove_not_recompute_node 2.06% : 0.000004s : 4: substitution.replace_old_param 7.47% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005696 2 87.11% : 0.004962s : 1: type_inference.infer 12.89% : 0.000734s : 1: type_inference.specialize ------[replace.] 0.000063 8 59.89% : 0.000038s : 4: replace.inline 40.11% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000153 8 91.33% : 0.000140s : 4: match.inline 8.67% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000260 1596 0.90% : 0.000002s : 17: predicate.accumulaten_eliminater 0.77% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 10: predicate.addn_check_dump 0.93% : 0.000002s : 17: predicate.addn_zero_filter 0.87% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.11% : 0.000005s : 27: predicate.arithmetic_simplify 0.98% : 0.000003s : 17: predicate.cast_eliminate 0.59% : 0.000002s : 10: predicate.check_bprop_eliminate 0.71% : 0.000002s : 10: predicate.compare_switch_simplify 0.25% : 0.000001s : 5: predicate.const_output_eliminate 0.60% : 0.000002s : 10: predicate.depend_value_elim 0.97% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.03% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.99% : 0.000005s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 5: predicate.elim_not_effective 0.38% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.19% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_depend_swap 1.86% : 0.000005s : 32: predicate.environ_get_eliminate 1.15% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.36% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.19% : 0.000006s : 25: predicate.float_depend_g_call 0.54% : 0.000001s : 10: predicate.float_environ_get_switch 0.78% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.73% : 0.000002s : 10: predicate.get_grad_eliminate 0.26% : 0.000001s : 5: predicate.graph_param_transform 0.62% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.27% : 0.000016s : 72: predicate.inline 0.73% : 0.000002s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.80% : 0.000002s : 10: predicate.less_batch_normalization 1.85% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.54% : 0.000007s : 48: predicate.load_eliminater 0.81% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.29% : 0.000006s : 36: predicate.loop_unroll_before_grad 1.75% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 10: predicate.merge_addn 0.54% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.62% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 17: predicate.minmaximum_grad 0.79% : 0.000002s : 5: predicate.mutable_eliminate 0.35% : 0.000001s : 5: predicate.opt_reshape 0.47% : 0.000001s : 5: predicate.parallel_virtual_node 1.73% : 0.000004s : 25: predicate.partial_defer_inline 1.70% : 0.000004s : 26: predicate.partial_eliminate 0.92% : 0.000002s : 17: predicate.print_const_string_wrapper 0.58% : 0.000002s : 10: predicate.reduce_all_const_elim 1.14% : 0.000003s : 17: predicate.reduce_eliminate 2.48% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 10: predicate.remove_not_recompute_node 1.30% : 0.000003s : 31: predicate.replace_applicator 0.50% : 0.000001s : 10: predicate.replace_old_param 0.25% : 0.000001s : 5: predicate.reset_defer_inline 0.94% : 0.000002s : 17: predicate.reshape_eliminate 0.62% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.52% : 0.000001s : 5: predicate.row_tensor_eliminate 0.68% : 0.000002s : 10: predicate.same_eliminate 0.42% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.71% : 0.000002s : 10: predicate.shard_identity_eliminate 0.73% : 0.000002s : 10: predicate.special_op_eliminate 0.86% : 0.000002s : 10: predicate.specialize_transform 0.82% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.65% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.47% : 0.000004s : 25: predicate.switch_defer_inline 2.02% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.60% : 0.000012s : 76: predicate.switch_simplify 0.93% : 0.000002s : 17: predicate.tile_eliminate 0.95% : 0.000002s : 17: predicate.transpose_eliminate 1.61% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.31% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.56% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.29% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.69% : 0.000004s : 31: predicate.tuple_to_list_eliminator_ 2.46% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.14% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.75% : 0.000002s : 5: predicate.value_based_eliminate 0.58% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.62% : 0.000002s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000532 11 52.67% : 0.000280s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.33% : 0.000252s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.052157 192 0.01% : 0.000006s : 1: ForceFp32Comm 5.86% : 0.003059s : 1: add_attr 5.84% : 0.003046s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.13% : 0.000067s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.15% : 0.000079s : 1: auto_monad 0.06% : 0.000030s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 0.94% : 0.000488s : 1: bootstrap 0.08% : 0.000042s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.04% : 0.000021s : 1: control_data_broadcast_order 0.03% : 0.000013s : 1: convert_after_rewriter 0.07% : 0.000035s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000018s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.06% : 0.000029s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.03% : 0.000013s : 1: graph_reusing 0.01% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 1.33% : 0.000692s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 0.96% : 0.000501s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.03% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000017s : 1: opt.transform.mutable_eliminate 2.77% : 0.001443s : 78: opt.transform.opt_a 0.07% : 0.000038s : 1: opt.transform.opt_after_cconv 0.06% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.31% : 0.000164s : 28: opt.transform.opt_b 0.12% : 0.000062s : 2: opt.transform.opt_trans_graph 0.08% : 0.000044s : 4: opt.transform.symbol_engine_opt 6.32% : 0.003294s : 1: opt_a 0.29% : 0.000150s : 1: opt_after_cconv 0.97% : 0.000504s : 1: opt_after_jit_grad 23.96% : 0.012498s : 1: opt_b 35.43% : 0.018480s : 1: optimize 0.06% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000011s : 1: order_py_execute_after_rewriter 0.06% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.08% : 0.000041s : 1: pre_auto_parallel 0.06% : 0.000033s : 1: py_interpret_to_execute 0.03% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.04% : 0.000022s : 1: remove_dup_value 0.65% : 0.000340s : 1: renormalize.infer 0.54% : 0.000284s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000050s : 1: rewriter_after_opt_a 0.18% : 0.000094s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.21% : 0.000108s : 1: symbol_engine_optimizer 0.21% : 0.000107s : 1: tuple_transform 11.08% : 0.005780s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:52.530.483 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0309785, [21] [bootstrap]: 0.00037843 [type_inference]: 0.00529174 [event_method]: 1.734e-05 [auto_monad]: 6.389e-05 [graph_reusing]: 5.99e-06 [inline]: 1.87999e-06 [add_attr]: 0.00299557, [1] [add_attr_with_inline]: 0.00298731, [1] [Cycle 1]: 5.236e-05, [2] [tag_attr]: 1.778e-05 [meta_addattr_fg_expand]: 6.29001e-06 [parallel-infer-symbol]: 3.20998e-06 [pre_auto_parallel]: 3.093e-05 [insert-virtual-dataset]: 2.37999e-06 [parallel-infer-symbol-second]: 8.30012e-07 [dataset_repeat_opt]: 2.09999e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.0215211, [53] [py_interpret_to_execute]: 2.584e-05 [rewriter_before_opt_a]: 8.299e-05 [opt_a]: 0.0191809, [2] [Cycle 1]: 0.0184009, [45] [expand_dump_flag]: 3.08e-06 [switch_simplify]: 4.43e-05 [loop_unroll]: 5.74e-05 [a_1]: 0.00082198 [with_stream_mark]: 1.809e-05 [recompute_prepare]: 9.59e-06 [updatestate_depend_eliminate]: 4.53999e-06 [updatestate_assign_eliminate]: 3.58999e-06 [updatestate_loads_eliminate]: 3.8e-06 [parameter_eliminate]: 2.15002e-06 [a_2]: 0.0001052 [accelerated_algorithm]: 1.001e-05 [shard]: 2.04999e-06 [meta_shard_fg_expand]: 3.28e-06 [shard_inline]: 8.22e-06 [merge_send_recv]: 9.89001e-06 [auto_parallel]: 1.144e-05 [parallel]: 1.788e-05 [flash_sp]: 1.051e-05 [merge_comm]: 4.80999e-06 [allreduce_fusion]: 4.23001e-06 [matmul_add_comm_reduction]: 1.07e-05 [allreduce_slice_to_reducescatter]: 9.00007e-07 [virtual_shard_identity]: 9.29998e-06 [virtual_dataset]: 8.33999e-06 [get_grad_eliminate_]: 8.40001e-06 [virtual_output]: 8.32e-06 [merge_forward]: 4.49998e-06 [cell_reuse_recompute_pass]: 1.19e-06 [offload_activation]: 1.11e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.607e-05 [merge_recompute_call_nodes]: 1.57999e-06 [before_grad]: 1.334e-05 [set_forward_comm_id_for_comm_node_pass]: 4.45999e-06 [meta_fg_expand]: 3.36999e-06 [flash_sp_send_recv_attached]: 2.54999e-06 [receive_attached]: 2.48e-06 [after_resolve]: 1.335e-05 [a_after_grad]: 1.221e-05 [renormalize]: 0.0007626 [add_forward_monad_depend]: 5.72999e-06 [auto_monad_grad]: 2.96001e-06 [auto_monad_eliminator]: 1.752e-05 [cse]: 3.688e-05 [a_3]: 5.945e-05 [Cycle 2]: 0.00076698, [45] [expand_dump_flag]: 2.48e-06 [switch_simplify]: 8.64e-06 [loop_unroll]: 7.25e-06 [a_1]: 0.00018146 [with_stream_mark]: 1.17e-05 [recompute_prepare]: 8.22e-06 [updatestate_depend_eliminate]: 3.72002e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 3.09001e-06 [parameter_eliminate]: 1.08001e-06 [a_2]: 9.353e-05 [accelerated_algorithm]: 7.78999e-06 [shard]: 1.30999e-06 [meta_shard_fg_expand]: 1.47001e-06 [shard_inline]: 1.017e-05 [merge_send_recv]: 6.01e-06 [auto_parallel]: 6.34999e-06 [parallel]: 4.74e-06 [flash_sp]: 3.49001e-06 [merge_comm]: 3.98999e-06 [allreduce_fusion]: 4.22e-06 [matmul_add_comm_reduction]: 6.49001e-06 [allreduce_slice_to_reducescatter]: 3.39991e-07 [virtual_shard_identity]: 8.46002e-06 [virtual_dataset]: 7.05e-06 [get_grad_eliminate_]: 7.06999e-06 [virtual_output]: 6.94999e-06 [merge_forward]: 3.37002e-06 [cell_reuse_recompute_pass]: 1.12e-06 [offload_activation]: 7.48e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.488e-05 [merge_recompute_call_nodes]: 6.89994e-07 [before_grad]: 1.187e-05 [set_forward_comm_id_for_comm_node_pass]: 3.97e-06 [meta_fg_expand]: 3.09999e-06 [flash_sp_send_recv_attached]: 9.20001e-07 [receive_attached]: 1.18001e-06 [after_resolve]: 1.142e-05 [a_after_grad]: 1.069e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.39e-06 [auto_monad_grad]: 1.10999e-06 [auto_monad_eliminator]: 8.55001e-06 [cse]: 1.735e-05 [a_3]: 4.548e-05 [py_interpret_to_execute_after_opt_a]: 1.237e-05 [slice_cell_reuse_recomputed_activation]: 2.29999e-06 [rewriter_after_opt_a]: 4.082e-05 [convert_after_rewriter]: 7.47002e-06 [order_py_execute_after_rewriter]: 5.59998e-06 [mutable_eliminate]: 0.00065577 [opt_b]: 0.00032254, [1] [Cycle 1]: 0.00031562, [7] [b_1]: 0.00017449 [b_2]: 9.72001e-06 [updatestate_depend_eliminate]: 7.08e-06 [updatestate_assign_eliminate]: 3.36999e-06 [updatestate_loads_eliminate]: 3.04001e-06 [renormalize]: 7.50006e-07 [cse]: 2.203e-05 [optimize_parallel_all_gather_comm]: 1.784e-05 [overlap_param_gather]: 1.87001e-06 [cconv]: 2.418e-05 [loop_unroll]: 0.00042382 [opt_after_cconv]: 0.00011373, [1] [Cycle 1]: 0.00010799, [7] [c_1]: 3.749e-05 [parameter_eliminate]: 2.59001e-06 [updatestate_depend_eliminate]: 5.72999e-06 [updatestate_assign_eliminate]: 2.99999e-06 [updatestate_loads_eliminate]: 3.05998e-06 [cse]: 2.084e-05 [renormalize]: 5.00004e-07 [remove_dup_value]: 1.811e-05 [tuple_transform]: 9.106e-05, [1] [Cycle 1]: 8.622e-05, [4] [d_1]: 5.675e-05 [none_parameter_eliminate]: 1.44e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 8.08999e-06 [partial_unused_args_eliminate]: 1.80001e-06 [add_recomputation]: 5.438e-05 [cse_after_recomputation]: 2.597e-05, [1] [Cycle 1]: 2.02e-05, [1] [cse]: 1.446e-05 [environ_conv]: 6.89999e-06 [swap_dp_allreduce_reducescatter]: 6.70002e-06 [bias_add_comm_swap]: 2.41e-06 [label_micro_interleaved_index]: 4.15e-06 [label_fine_grained_interleaved_index]: 2.93e-06 [merge_cast_opt]: 1.32999e-06 [slice_recompute_activation]: 1.89e-06 [micro_interleaved_order_control]: 2.07001e-06 [assign_add_opt]: 1.13001e-06 [ForceFp32Comm]: 7.89994e-07 [remove_cast_before_assign_add]: 9.70002e-07 [full_micro_interleaved_order_control]: 2.16e-06 [reorder_send_recv_between_fp_bp]: 2.54999e-06 [comm_op_add_attrs]: 1.32e-06 [add_comm_op_reuse_tag]: 1.25999e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.06002e-06 [overlap_opt_shard_in_pipeline]: 1.07e-06 [overlap_opt_shard_grad_in_pipeline]: 1.78002e-06 [control_data_broadcast_order]: 1.388e-05 [grouped_pairwise_exchange_alltoall]: 1.66998e-06 [offloading_packed_experts]: 4.4e-06 [overlap_recompute_and_grad_model_parallel]: 5.05001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.53003e-06 [overlap_grad_ring_attention]: 4.36002e-06 [overlap_grad_flash_sp]: 2.126e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 2.16e-06 [split_layernorm_comm]: 1.64e-06 [handle_group_info]: 9.39996e-07 [symbol_engine_optimizer]: 8.187e-05, [1] [Cycle 1]: 7.78e-05, [6] [build]: 3.23e-06 [elim_shapecalc]: 1.068e-05 [elim_not_effective]: 1.516e-05 [opt_reshape]: 8.41002e-06 [fold_const_symbol]: 1.246e-05 [renormalize]: 1.50001e-07 [detach_backward]: 2.41998e-06 [pipeline_parallel_scheduler]: 1.55999e-06 [auto_monad_reorder]: 1.91e-05 [get_jit_bprop_graph]: 1.72999e-06 [rewriter_after_jit_bprop_graph]: 3.4e-06 [opt_after_jit_grad]: 0.00044933 [validate]: 3.879e-05 Sums bootstrap : 0.000378s : 3.45% type_inference : 0.005292s : 48.22% event_method : 0.000017s : 0.16% auto_monad : 0.000064s : 0.58% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000031s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000026s : 0.24% optimize.rewriter_before_opt_a : 0.000083s : 0.76% optimize.opt_a.expand_dump_flag : 0.000006s : 0.05% optimize.opt_a.switch_simplify : 0.000053s : 0.48% optimize.opt_a.loop_unroll : 0.000065s : 0.59% optimize.opt_a.a_1 : 0.001003s : 9.14% optimize.opt_a.with_stream_mark : 0.000030s : 0.27% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000199s : 1.81% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.16% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000018s : 0.17% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000018s : 0.16% optimize.opt_a.parallel : 0.000023s : 0.21% optimize.opt_a.flash_sp : 0.000014s : 0.13% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.16% optimize.opt_a.virtual_dataset : 0.000015s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.14% optimize.opt_a.virtual_output : 0.000015s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.23% optimize.opt_a.a_after_grad : 0.000023s : 0.21% optimize.opt_a.renormalize : 0.000763s : 6.95% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.24% optimize.opt_a.cse : 0.000054s : 0.49% optimize.opt_a.a_3 : 0.000105s : 0.96% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000041s : 0.37% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000656s : 5.98% optimize.opt_b.b_1 : 0.000174s : 1.59% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000022s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.22% optimize.loop_unroll : 0.000424s : 3.86% optimize.opt_after_cconv.c_1 : 0.000037s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000021s : 0.19% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.17% optimize.tuple_transform.d_1 : 0.000057s : 0.52% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000054s : 0.50% optimize.cse_after_recomputation.cse : 0.000014s : 0.13% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.06% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000003s : 0.03% opt_after_jit_grad : 0.000449s : 4.09% validate : 0.000039s : 0.35% Time group info: ------[substitution.] 0.000232 38 9.03% : 0.000021s : 3: substitution.cast_eliminate 0.92% : 0.000002s : 3: substitution.elim_not_effective 0.67% : 0.000002s : 3: substitution.fold_const_symbol 3.16% : 0.000007s : 5: substitution.graph_param_transform 72.06% : 0.000167s : 4: substitution.inline 1.88% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.83% : 0.000007s : 6: substitution.remove_not_recompute_node 1.96% : 0.000005s : 4: substitution.replace_old_param 7.50% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005237 2 86.69% : 0.004540s : 1: type_inference.infer 13.31% : 0.000697s : 1: type_inference.specialize ------[replace.] 0.000069 8 61.65% : 0.000043s : 4: replace.inline 38.35% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000180 8 91.49% : 0.000164s : 4: match.inline 8.51% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000272 1596 1.02% : 0.000003s : 17: predicate.accumulaten_eliminater 0.60% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000002s : 10: predicate.addn_check_dump 1.05% : 0.000003s : 17: predicate.addn_zero_filter 0.88% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.19% : 0.000006s : 27: predicate.arithmetic_simplify 1.11% : 0.000003s : 17: predicate.cast_eliminate 0.61% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.62% : 0.000002s : 10: predicate.depend_value_elim 1.10% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.12% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.94% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.84% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 5: predicate.elim_not_effective 0.33% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_depend_swap 1.80% : 0.000005s : 32: predicate.environ_get_eliminate 1.25% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.42% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.41% : 0.000007s : 25: predicate.float_depend_g_call 0.54% : 0.000001s : 10: predicate.float_environ_get_switch 0.73% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.69% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.55% : 0.000001s : 10: predicate.incorporate_call 0.47% : 0.000001s : 10: predicate.incorporate_call_switch 5.92% : 0.000016s : 72: predicate.inline 0.66% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.92% : 0.000003s : 10: predicate.less_batch_normalization 1.86% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.61% : 0.000007s : 48: predicate.load_eliminater 0.69% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.98% : 0.000008s : 36: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.58% : 0.000002s : 10: predicate.merge_addn 0.54% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.63% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.92% : 0.000002s : 17: predicate.minmaximum_grad 0.79% : 0.000002s : 5: predicate.mutable_eliminate 0.31% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.77% : 0.000005s : 25: predicate.partial_defer_inline 1.65% : 0.000004s : 26: predicate.partial_eliminate 0.96% : 0.000003s : 17: predicate.print_const_string_wrapper 0.70% : 0.000002s : 10: predicate.reduce_all_const_elim 1.25% : 0.000003s : 17: predicate.reduce_eliminate 2.72% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 10: predicate.remove_not_recompute_node 1.30% : 0.000004s : 31: predicate.replace_applicator 0.44% : 0.000001s : 10: predicate.replace_old_param 0.25% : 0.000001s : 5: predicate.reset_defer_inline 1.06% : 0.000003s : 17: predicate.reshape_eliminate 0.65% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 5: predicate.row_tensor_eliminate 0.72% : 0.000002s : 10: predicate.same_eliminate 0.38% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.73% : 0.000002s : 10: predicate.shard_identity_eliminate 0.61% : 0.000002s : 10: predicate.special_op_eliminate 0.69% : 0.000002s : 10: predicate.specialize_transform 0.77% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.51% : 0.000004s : 25: predicate.switch_defer_inline 2.07% : 0.000006s : 35: predicate.switch_layer_defer_inline 4.59% : 0.000012s : 76: predicate.switch_simplify 0.93% : 0.000003s : 17: predicate.tile_eliminate 1.13% : 0.000003s : 17: predicate.transpose_eliminate 1.73% : 0.000005s : 27: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.38% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.69% : 0.000005s : 27: predicate.tuple_list_get_set_item_eliminator 2.39% : 0.000007s : 37: predicate.tuple_list_set_item_eliminator 1.82% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.48% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.12% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 5: predicate.value_based_eliminate 0.56% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.59% : 0.000002s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000465 11 51.64% : 0.000240s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.36% : 0.000225s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.058036 192 0.01% : 0.000003s : 1: ForceFp32Comm 5.17% : 0.003000s : 1: add_attr 5.15% : 0.002991s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.10% : 0.000059s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.12% : 0.000070s : 1: auto_monad 0.04% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.70% : 0.000406s : 1: bootstrap 0.05% : 0.000027s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000017s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.05% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000010s : 1: environ_conv 0.04% : 0.000024s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.74% : 0.000431s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.14% : 0.000663s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000015s : 1: opt.transform.mutable_eliminate 2.67% : 0.001550s : 78: opt.transform.opt_a 0.06% : 0.000036s : 1: opt.transform.opt_after_cconv 0.05% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.26% : 0.000152s : 28: opt.transform.opt_b 0.11% : 0.000062s : 2: opt.transform.opt_trans_graph 0.07% : 0.000043s : 4: opt.transform.symbol_engine_opt 33.06% : 0.019184s : 1: opt_a 0.20% : 0.000117s : 1: opt_after_cconv 0.79% : 0.000457s : 1: opt_after_jit_grad 0.56% : 0.000326s : 1: opt_b 37.09% : 0.021526s : 1: optimize 0.04% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.06% : 0.000035s : 1: pre_auto_parallel 0.05% : 0.000030s : 1: py_interpret_to_execute 0.03% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000022s : 1: remove_dup_value 0.73% : 0.000424s : 1: renormalize.infer 0.57% : 0.000331s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000045s : 1: rewriter_after_opt_a 0.17% : 0.000098s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000085s : 1: symbol_engine_optimizer 0.16% : 0.000094s : 1: tuple_transform 9.14% : 0.005306s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:52.942.819 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:52.943.061 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0154992, [21] [bootstrap]: 0.00037882 [type_inference]: 0.00518353 [event_method]: 1.782e-05 [auto_monad]: 6.15e-05 [graph_reusing]: 5.84999e-06 [inline]: 2.14e-06 [add_attr]: 0.00298732, [1] [add_attr_with_inline]: 0.00297981, [1] [Cycle 1]: 6.364e-05, [2] [tag_attr]: 1.878e-05 [meta_addattr_fg_expand]: 5.99e-06 [parallel-infer-symbol]: 3.16001e-06 [pre_auto_parallel]: 3.179e-05 [insert-virtual-dataset]: 2.81999e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 1.87001e-06 [pipeline_split]: 1.79998e-06 [optimize]: 0.00567784, [53] [py_interpret_to_execute]: 2.771e-05 [rewriter_before_opt_a]: 8.54e-05 [opt_a]: 0.00326733, [2] [Cycle 1]: 0.00230556, [45] [expand_dump_flag]: 2.67001e-06 [switch_simplify]: 4.357e-05 [loop_unroll]: 3.237e-05 [a_1]: 0.00071429 [with_stream_mark]: 1.418e-05 [recompute_prepare]: 9.57999e-06 [updatestate_depend_eliminate]: 4.77e-06 [updatestate_assign_eliminate]: 1.653e-05 [updatestate_loads_eliminate]: 3.95e-06 [parameter_eliminate]: 2.17001e-06 [a_2]: 0.00013266 [accelerated_algorithm]: 9.05001e-06 [shard]: 2.86e-06 [meta_shard_fg_expand]: 1.97001e-06 [shard_inline]: 8.30999e-06 [merge_send_recv]: 9.66e-06 [auto_parallel]: 6.89999e-06 [parallel]: 1.796e-05 [flash_sp]: 8.65999e-06 [merge_comm]: 4.89e-06 [allreduce_fusion]: 4.66002e-06 [matmul_add_comm_reduction]: 1.056e-05 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 9.46e-06 [virtual_dataset]: 8.23001e-06 [get_grad_eliminate_]: 8.35999e-06 [virtual_output]: 8.34002e-06 [merge_forward]: 4.47e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 1.101e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.722e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.344e-05 [set_forward_comm_id_for_comm_node_pass]: 4.78001e-06 [meta_fg_expand]: 3.26999e-06 [flash_sp_send_recv_attached]: 2.41e-06 [receive_attached]: 2.68998e-06 [after_resolve]: 1.274e-05 [a_after_grad]: 1.271e-05 [renormalize]: 0.00060446 [add_forward_monad_depend]: 4.70001e-06 [auto_monad_grad]: 1.89e-06 [auto_monad_eliminator]: 1.721e-05 [cse]: 3.479e-05 [a_3]: 7.114e-05 [Cycle 2]: 0.00094855, [45] [expand_dump_flag]: 1.04e-06 [switch_simplify]: 8.75001e-06 [loop_unroll]: 7.45998e-06 [a_1]: 0.00017812 [with_stream_mark]: 1.161e-05 [recompute_prepare]: 8.92e-06 [updatestate_depend_eliminate]: 4.45999e-06 [updatestate_assign_eliminate]: 3.02002e-06 [updatestate_loads_eliminate]: 2.93e-06 [parameter_eliminate]: 1.05001e-06 [a_2]: 0.00011824 [accelerated_algorithm]: 8.05999e-06 [shard]: 1.14e-06 [meta_shard_fg_expand]: 1.79e-06 [shard_inline]: 7.82002e-06 [merge_send_recv]: 7.15e-06 [auto_parallel]: 6.63e-06 [parallel]: 4.75999e-06 [flash_sp]: 4.08999e-06 [merge_comm]: 7.71999e-06 [allreduce_fusion]: 4.47e-06 [matmul_add_comm_reduction]: 6.41e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 9.46e-06 [virtual_dataset]: 8.62e-06 [get_grad_eliminate_]: 7.03e-06 [virtual_output]: 7.17002e-06 [merge_forward]: 3.51999e-06 [cell_reuse_recompute_pass]: 1.34998e-06 [offload_activation]: 7.28e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.594e-05 [merge_recompute_call_nodes]: 8.89995e-07 [before_grad]: 1.166e-05 [set_forward_comm_id_for_comm_node_pass]: 6.31e-06 [meta_fg_expand]: 2.80002e-06 [flash_sp_send_recv_attached]: 9.5999e-07 [receive_attached]: 1.02e-06 [after_resolve]: 1.235e-05 [a_after_grad]: 1.259e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.22e-06 [auto_monad_grad]: 1.06002e-06 [auto_monad_eliminator]: 8.42998e-06 [cse]: 1.831e-05 [a_3]: 5.961e-05 [py_interpret_to_execute_after_opt_a]: 1.394e-05 [slice_cell_reuse_recomputed_activation]: 5.52999e-06 [rewriter_after_opt_a]: 4.393e-05 [convert_after_rewriter]: 1.096e-05 [order_py_execute_after_rewriter]: 8.63001e-06 [mutable_eliminate]: 0.00049528 [opt_b]: 0.00031267, [1] [Cycle 1]: 0.00030363, [7] [b_1]: 0.00020161 [b_2]: 9.24e-06 [updatestate_depend_eliminate]: 5.74e-06 [updatestate_assign_eliminate]: 3.48e-06 [updatestate_loads_eliminate]: 3.43999e-06 [renormalize]: 5.50004e-07 [cse]: 2.239e-05 [optimize_parallel_all_gather_comm]: 2.016e-05 [overlap_param_gather]: 5.62999e-06 [cconv]: 2.869e-05 [loop_unroll]: 0.00045096 [opt_after_cconv]: 0.0001515, [1] [Cycle 1]: 0.00014253, [7] [c_1]: 4.162e-05 [parameter_eliminate]: 2.88e-06 [updatestate_depend_eliminate]: 6.39001e-06 [updatestate_assign_eliminate]: 3.65998e-06 [updatestate_loads_eliminate]: 3.26001e-06 [cse]: 2.319e-05 [renormalize]: 5.3001e-07 [remove_dup_value]: 2.971e-05 [tuple_transform]: 0.00010594, [1] [Cycle 1]: 9.808e-05, [4] [d_1]: 5.467e-05 [none_parameter_eliminate]: 1.84e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 8.48999e-06 [partial_unused_args_eliminate]: 5.54e-06 [add_recomputation]: 6.047e-05 [cse_after_recomputation]: 3.369e-05, [1] [Cycle 1]: 2.602e-05, [1] [cse]: 1.623e-05 [environ_conv]: 9.39998e-06 [swap_dp_allreduce_reducescatter]: 8.99998e-06 [bias_add_comm_swap]: 5.27999e-06 [label_micro_interleaved_index]: 7.48999e-06 [label_fine_grained_interleaved_index]: 5.36002e-06 [merge_cast_opt]: 4.13999e-06 [slice_recompute_activation]: 4.47998e-06 [micro_interleaved_order_control]: 4.62998e-06 [assign_add_opt]: 3.56001e-06 [ForceFp32Comm]: 3.48e-06 [remove_cast_before_assign_add]: 4.02998e-06 [full_micro_interleaved_order_control]: 4.82998e-06 [reorder_send_recv_between_fp_bp]: 5.26002e-06 [comm_op_add_attrs]: 3.62998e-06 [add_comm_op_reuse_tag]: 3.33e-06 [interleave_split_concat_branches]: 3.43999e-06 [interleave_parallel_branches]: 3.5e-06 [overlap_opt_shard_in_pipeline]: 3.66999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.17e-06 [control_data_broadcast_order]: 1.701e-05 [grouped_pairwise_exchange_alltoall]: 4.2e-06 [offloading_packed_experts]: 7.53999e-06 [overlap_recompute_and_grad_model_parallel]: 7.68001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.68e-06 [overlap_recompute_allgather_and_fa_grad]: 4.05e-06 [overlap_recompute_comm]: 5.10999e-06 [overlap_grad_ring_attention]: 6.89999e-06 [overlap_grad_flash_sp]: 2.435e-05 [begin_end_overlap_inline]: 2.96001e-06 [split_matmul_comm_elemetwise]: 4.78001e-06 [split_layernorm_comm]: 4.52003e-06 [handle_group_info]: 3.66999e-06 [symbol_engine_optimizer]: 0.00010853, [1] [Cycle 1]: 0.00010211, [6] [build]: 2.84999e-06 [elim_shapecalc]: 1.124e-05 [elim_not_effective]: 1.696e-05 [opt_reshape]: 9.49999e-06 [fold_const_symbol]: 1.34e-05 [renormalize]: 2.30008e-07 [detach_backward]: 4.13999e-06 [pipeline_parallel_scheduler]: 1.82999e-06 [auto_monad_reorder]: 2.331e-05 [get_jit_bprop_graph]: 1.27e-06 [rewriter_after_jit_bprop_graph]: 4.07e-06 [opt_after_jit_grad]: 0.00050847 [validate]: 3.89e-05 Sums bootstrap : 0.000379s : 3.52% type_inference : 0.005184s : 48.14% event_method : 0.000018s : 0.17% auto_monad : 0.000062s : 0.57% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000032s : 0.30% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000028s : 0.26% optimize.rewriter_before_opt_a : 0.000085s : 0.79% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000052s : 0.49% optimize.opt_a.loop_unroll : 0.000040s : 0.37% optimize.opt_a.a_1 : 0.000892s : 8.29% optimize.opt_a.with_stream_mark : 0.000026s : 0.24% optimize.opt_a.recompute_prepare : 0.000018s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000020s : 0.18% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000251s : 2.33% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.16% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.15% optimize.opt_a.merge_send_recv : 0.000017s : 0.16% optimize.opt_a.auto_parallel : 0.000014s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.21% optimize.opt_a.flash_sp : 0.000013s : 0.12% optimize.opt_a.merge_comm : 0.000013s : 0.12% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.18% optimize.opt_a.virtual_dataset : 0.000017s : 0.16% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.14% optimize.opt_a.virtual_output : 0.000016s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.10% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.23% optimize.opt_a.a_after_grad : 0.000025s : 0.23% optimize.opt_a.renormalize : 0.000605s : 5.61% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.05% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.24% optimize.opt_a.cse : 0.000053s : 0.49% optimize.opt_a.a_3 : 0.000131s : 1.21% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.05% optimize.rewriter_after_opt_a : 0.000044s : 0.41% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000495s : 4.60% optimize.opt_b.b_1 : 0.000202s : 1.87% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000022s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.19% optimize.overlap_param_gather : 0.000006s : 0.05% optimize.cconv : 0.000029s : 0.27% optimize.loop_unroll : 0.000451s : 4.19% optimize.opt_after_cconv.c_1 : 0.000042s : 0.39% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.22% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000030s : 0.28% optimize.tuple_transform.d_1 : 0.000055s : 0.51% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000006s : 0.05% optimize.add_recomputation : 0.000060s : 0.56% optimize.cse_after_recomputation.cse : 0.000016s : 0.15% optimize.environ_conv : 0.000009s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000008s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000024s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000005s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.16% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.09% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.22% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000508s : 4.72% validate : 0.000039s : 0.36% Time group info: ------[substitution.] 0.000190 38 10.43% : 0.000020s : 3: substitution.cast_eliminate 1.25% : 0.000002s : 3: substitution.elim_not_effective 1.04% : 0.000002s : 3: substitution.fold_const_symbol 3.51% : 0.000007s : 5: substitution.graph_param_transform 68.16% : 0.000130s : 4: substitution.inline 2.32% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.08% : 0.000006s : 6: substitution.remove_not_recompute_node 2.30% : 0.000004s : 4: substitution.replace_old_param 7.91% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005138 2 87.10% : 0.004476s : 1: type_inference.infer 12.90% : 0.000663s : 1: type_inference.specialize ------[replace.] 0.000060 8 59.81% : 0.000036s : 4: replace.inline 40.19% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000140 8 90.67% : 0.000127s : 4: match.inline 9.33% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000255 1596 0.94% : 0.000002s : 17: predicate.accumulaten_eliminater 0.70% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 10: predicate.addn_check_dump 0.98% : 0.000003s : 17: predicate.addn_zero_filter 0.86% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.99% : 0.000005s : 27: predicate.arithmetic_simplify 1.12% : 0.000003s : 17: predicate.cast_eliminate 0.59% : 0.000001s : 10: predicate.check_bprop_eliminate 0.54% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.71% : 0.000002s : 10: predicate.depend_value_elim 0.99% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.06% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.94% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.85% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.38% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.21% : 0.000003s : 22: predicate.environ_get_depend_swap 1.77% : 0.000005s : 32: predicate.environ_get_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.39% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.27% : 0.000006s : 25: predicate.float_depend_g_call 0.57% : 0.000001s : 10: predicate.float_environ_get_switch 0.84% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 5: predicate.fold_const_symbol 0.62% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.60% : 0.000002s : 10: predicate.incorporate_call 0.52% : 0.000001s : 10: predicate.incorporate_call_switch 6.21% : 0.000016s : 72: predicate.inline 0.81% : 0.000002s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.81% : 0.000002s : 10: predicate.less_batch_normalization 1.91% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.73% : 0.000007s : 48: predicate.load_eliminater 0.65% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.01% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.70% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.59% : 0.000002s : 10: predicate.merge_addn 0.80% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.65% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.88% : 0.000002s : 17: predicate.minmaximum_grad 0.89% : 0.000002s : 5: predicate.mutable_eliminate 0.41% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.73% : 0.000004s : 25: predicate.partial_defer_inline 1.69% : 0.000004s : 26: predicate.partial_eliminate 0.98% : 0.000003s : 17: predicate.print_const_string_wrapper 0.61% : 0.000002s : 10: predicate.reduce_all_const_elim 1.31% : 0.000003s : 17: predicate.reduce_eliminate 2.67% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 10: predicate.remove_not_recompute_node 1.41% : 0.000004s : 31: predicate.replace_applicator 0.51% : 0.000001s : 10: predicate.replace_old_param 0.28% : 0.000001s : 5: predicate.reset_defer_inline 0.98% : 0.000002s : 17: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 5: predicate.row_tensor_eliminate 0.70% : 0.000002s : 10: predicate.same_eliminate 0.47% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.77% : 0.000002s : 10: predicate.shard_identity_eliminate 0.71% : 0.000002s : 10: predicate.special_op_eliminate 0.73% : 0.000002s : 10: predicate.specialize_transform 0.73% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.50% : 0.000004s : 25: predicate.switch_defer_inline 2.00% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.71% : 0.000012s : 76: predicate.switch_simplify 0.94% : 0.000002s : 17: predicate.tile_eliminate 1.01% : 0.000003s : 17: predicate.transpose_eliminate 1.67% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.72% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.28% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.66% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.27% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.78% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.54% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.27% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 5: predicate.value_based_eliminate 0.70% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.63% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000441 11 52.13% : 0.000230s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.87% : 0.000211s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026403 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.35% : 0.002995s : 1: add_attr 11.30% : 0.002983s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.24% : 0.000064s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000071s : 1: auto_monad 0.12% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.59% : 0.000420s : 1: bootstrap 0.12% : 0.000032s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.14% : 0.000037s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000019s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.10% : 0.000027s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.04% : 0.000010s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.73% : 0.000457s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.90% : 0.000501s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 5.37% : 0.001419s : 78: opt.transform.opt_a 0.15% : 0.000040s : 1: opt.transform.opt_after_cconv 0.11% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.52% : 0.000136s : 28: opt.transform.opt_b 0.23% : 0.000061s : 2: opt.transform.opt_trans_graph 0.18% : 0.000047s : 4: opt.transform.symbol_engine_opt 12.39% : 0.003271s : 1: opt_a 0.59% : 0.000155s : 1: opt_after_cconv 1.96% : 0.000518s : 1: opt_after_jit_grad 1.20% : 0.000316s : 1: opt_b 22.75% : 0.006007s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.03% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000009s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000009s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000039s : 1: pre_auto_parallel 0.12% : 0.000031s : 1: py_interpret_to_execute 0.07% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000007s : 1: remove_cast_before_assign_add 0.13% : 0.000033s : 1: remove_dup_value 1.29% : 0.000342s : 1: renormalize.infer 0.96% : 0.000255s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000047s : 1: rewriter_after_opt_a 0.34% : 0.000089s : 1: rewriter_before_opt_a 0.03% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.05% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000111s : 1: symbol_engine_optimizer 0.41% : 0.000109s : 1: tuple_transform 19.75% : 0.005214s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:53.286.457 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0176536, [21] [bootstrap]: 0.00047627 [type_inference]: 0.00662305 [event_method]: 2.153e-05 [auto_monad]: 6.823e-05 [graph_reusing]: 6.32001e-06 [inline]: 3.09001e-06 [add_attr]: 0.00366272, [1] [add_attr_with_inline]: 0.0036511, [1] [Cycle 1]: 7.251e-05, [2] [tag_attr]: 2.475e-05 [meta_addattr_fg_expand]: 6.96001e-06 [parallel-infer-symbol]: 3.65e-06 [pre_auto_parallel]: 4.067e-05 [insert-virtual-dataset]: 2.97002e-06 [parallel-infer-symbol-second]: 8.79983e-07 [dataset_repeat_opt]: 2.41e-06 [pipeline_split]: 1.71998e-06 [optimize]: 0.00591794, [53] [py_interpret_to_execute]: 3.096e-05 [rewriter_before_opt_a]: 9.559e-05 [opt_a]: 0.00335516, [2] [Cycle 1]: 0.0025354, [45] [expand_dump_flag]: 2.81e-06 [switch_simplify]: 4.414e-05 [loop_unroll]: 3.187e-05 [a_1]: 0.00079701 [with_stream_mark]: 1.862e-05 [recompute_prepare]: 1.045e-05 [updatestate_depend_eliminate]: 4.77998e-06 [updatestate_assign_eliminate]: 3.93001e-06 [updatestate_loads_eliminate]: 4.08001e-06 [parameter_eliminate]: 1.76e-06 [a_2]: 0.00010271 [accelerated_algorithm]: 9.20001e-06 [shard]: 2.27999e-06 [meta_shard_fg_expand]: 2.68e-06 [shard_inline]: 8.64e-06 [merge_send_recv]: 9.96e-06 [auto_parallel]: 9.00999e-06 [parallel]: 2.09e-05 [flash_sp]: 9.29e-06 [merge_comm]: 5.05999e-06 [allreduce_fusion]: 4.3e-06 [matmul_add_comm_reduction]: 1.091e-05 [allreduce_slice_to_reducescatter]: 1.01997e-06 [virtual_shard_identity]: 1.014e-05 [virtual_dataset]: 8.69e-06 [get_grad_eliminate_]: 8.27e-06 [virtual_output]: 8.32e-06 [merge_forward]: 4.60001e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 1.217e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.506e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.385e-05 [set_forward_comm_id_for_comm_node_pass]: 4.68999e-06 [meta_fg_expand]: 3.93001e-06 [flash_sp_send_recv_attached]: 3.18e-06 [receive_attached]: 2.29999e-06 [after_resolve]: 1.455e-05 [a_after_grad]: 1.356e-05 [renormalize]: 0.00088937 [add_forward_monad_depend]: 6.15002e-06 [auto_monad_grad]: 2.82002e-06 [auto_monad_eliminator]: 2.099e-05 [cse]: 3.751e-05 [a_3]: 6.357e-05 [Cycle 2]: 0.00080698, [45] [expand_dump_flag]: 1.75001e-06 [switch_simplify]: 9.29998e-06 [loop_unroll]: 7.43999e-06 [a_1]: 0.00018307 [with_stream_mark]: 1.521e-05 [recompute_prepare]: 8.03999e-06 [updatestate_depend_eliminate]: 3.86999e-06 [updatestate_assign_eliminate]: 3.26001e-06 [updatestate_loads_eliminate]: 3.16999e-06 [parameter_eliminate]: 1.15999e-06 [a_2]: 9.285e-05 [accelerated_algorithm]: 7.67998e-06 [shard]: 1.55001e-06 [meta_shard_fg_expand]: 1.88002e-06 [shard_inline]: 7.41001e-06 [merge_send_recv]: 7.55e-06 [auto_parallel]: 7.93999e-06 [parallel]: 6.21e-06 [flash_sp]: 6.96001e-06 [merge_comm]: 4.47e-06 [allreduce_fusion]: 4.23001e-06 [matmul_add_comm_reduction]: 8.89e-06 [allreduce_slice_to_reducescatter]: 4.69998e-07 [virtual_shard_identity]: 8.69998e-06 [virtual_dataset]: 7.48999e-06 [get_grad_eliminate_]: 7.15e-06 [virtual_output]: 6.89999e-06 [merge_forward]: 4.25e-06 [cell_reuse_recompute_pass]: 2.39001e-06 [offload_activation]: 8.72998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.755e-05 [merge_recompute_call_nodes]: 9.99979e-07 [before_grad]: 1.174e-05 [set_forward_comm_id_for_comm_node_pass]: 4.12e-06 [meta_fg_expand]: 3.11001e-06 [flash_sp_send_recv_attached]: 1.66998e-06 [receive_attached]: 2.64999e-06 [after_resolve]: 1.363e-05 [a_after_grad]: 1.127e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.13998e-06 [auto_monad_grad]: 1.52001e-06 [auto_monad_eliminator]: 1.069e-05 [cse]: 2.247e-05 [a_3]: 4.621e-05 [py_interpret_to_execute_after_opt_a]: 1.58e-05 [slice_cell_reuse_recomputed_activation]: 2.26e-06 [rewriter_after_opt_a]: 4.656e-05 [convert_after_rewriter]: 7.97e-06 [order_py_execute_after_rewriter]: 6.67002e-06 [mutable_eliminate]: 0.00069829 [opt_b]: 0.00026067, [1] [Cycle 1]: 0.00025237, [7] [b_1]: 0.00015769 [b_2]: 1.125e-05 [updatestate_depend_eliminate]: 8.90999e-06 [updatestate_assign_eliminate]: 3.38999e-06 [updatestate_loads_eliminate]: 3.35e-06 [renormalize]: 1.09e-06 [cse]: 2.943e-05 [optimize_parallel_all_gather_comm]: 2.196e-05 [overlap_param_gather]: 2.32001e-06 [cconv]: 3.516e-05 [loop_unroll]: 0.00055936 [opt_after_cconv]: 0.00013279, [1] [Cycle 1]: 0.00012442, [7] [c_1]: 4.019e-05 [parameter_eliminate]: 3.98001e-06 [updatestate_depend_eliminate]: 6.68998e-06 [updatestate_assign_eliminate]: 3.46001e-06 [updatestate_loads_eliminate]: 3.08998e-06 [cse]: 2.921e-05 [renormalize]: 5.49975e-07 [remove_dup_value]: 1.682e-05 [tuple_transform]: 9.725e-05, [1] [Cycle 1]: 9.223e-05, [4] [d_1]: 6.205e-05 [none_parameter_eliminate]: 1.84e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 8.1e-06 [partial_unused_args_eliminate]: 1.80001e-06 [add_recomputation]: 6.283e-05 [cse_after_recomputation]: 2.756e-05, [1] [Cycle 1]: 2.246e-05, [1] [cse]: 1.66e-05 [environ_conv]: 6.89999e-06 [swap_dp_allreduce_reducescatter]: 6.07001e-06 [bias_add_comm_swap]: 3.8e-06 [label_micro_interleaved_index]: 5.63002e-06 [label_fine_grained_interleaved_index]: 2.88e-06 [merge_cast_opt]: 1.66998e-06 [slice_recompute_activation]: 2.26998e-06 [micro_interleaved_order_control]: 2.55002e-06 [assign_add_opt]: 1.28002e-06 [ForceFp32Comm]: 8.50006e-07 [remove_cast_before_assign_add]: 1.15001e-06 [full_micro_interleaved_order_control]: 2.48e-06 [reorder_send_recv_between_fp_bp]: 3.01001e-06 [comm_op_add_attrs]: 1.02998e-06 [add_comm_op_reuse_tag]: 9.79984e-07 [interleave_split_concat_branches]: 1.10999e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.22999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.09e-06 [control_data_broadcast_order]: 1.58e-05 [grouped_pairwise_exchange_alltoall]: 1.59998e-06 [offloading_packed_experts]: 5.42999e-06 [overlap_recompute_and_grad_model_parallel]: 5.47999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.73998e-06 [overlap_grad_ring_attention]: 4.68999e-06 [overlap_grad_flash_sp]: 2.422e-05 [begin_end_overlap_inline]: 5.10016e-07 [split_matmul_comm_elemetwise]: 2.64001e-06 [split_layernorm_comm]: 1.91e-06 [handle_group_info]: 1.47001e-06 [symbol_engine_optimizer]: 8.705e-05, [1] [Cycle 1]: 8.231e-05, [6] [build]: 4.15999e-06 [elim_shapecalc]: 1.152e-05 [elim_not_effective]: 1.598e-05 [opt_reshape]: 8.25e-06 [fold_const_symbol]: 1.272e-05 [renormalize]: 4.19997e-07 [detach_backward]: 2.41998e-06 [pipeline_parallel_scheduler]: 1.57001e-06 [auto_monad_reorder]: 2.095e-05 [get_jit_bprop_graph]: 1.83997e-06 [rewriter_after_jit_bprop_graph]: 6.05002e-06 [opt_after_jit_grad]: 0.00056959 [validate]: 5.334e-05 Sums bootstrap : 0.000476s : 3.68% type_inference : 0.006623s : 51.12% event_method : 0.000022s : 0.17% auto_monad : 0.000068s : 0.53% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000041s : 0.31% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.24% optimize.rewriter_before_opt_a : 0.000096s : 0.74% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.41% optimize.opt_a.loop_unroll : 0.000039s : 0.30% optimize.opt_a.a_1 : 0.000980s : 7.56% optimize.opt_a.with_stream_mark : 0.000034s : 0.26% optimize.opt_a.recompute_prepare : 0.000018s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000196s : 1.51% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.12% optimize.opt_a.merge_send_recv : 0.000018s : 0.14% optimize.opt_a.auto_parallel : 0.000017s : 0.13% optimize.opt_a.parallel : 0.000027s : 0.21% optimize.opt_a.flash_sp : 0.000016s : 0.13% optimize.opt_a.merge_comm : 0.000010s : 0.07% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.15% optimize.opt_a.virtual_dataset : 0.000016s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.12% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000028s : 0.22% optimize.opt_a.a_after_grad : 0.000025s : 0.19% optimize.opt_a.renormalize : 0.000889s : 6.87% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.24% optimize.opt_a.cse : 0.000060s : 0.46% optimize.opt_a.a_3 : 0.000110s : 0.85% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000047s : 0.36% optimize.convert_after_rewriter : 0.000008s : 0.06% optimize.order_py_execute_after_rewriter : 0.000007s : 0.05% optimize.mutable_eliminate : 0.000698s : 5.39% optimize.opt_b.b_1 : 0.000158s : 1.22% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000029s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000035s : 0.27% optimize.loop_unroll : 0.000559s : 4.32% optimize.opt_after_cconv.c_1 : 0.000040s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000029s : 0.23% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.13% optimize.tuple_transform.d_1 : 0.000062s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000063s : 0.48% optimize.cse_after_recomputation.cse : 0.000017s : 0.13% optimize.environ_conv : 0.000007s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000004s : 0.03% optimize.label_micro_interleaved_index : 0.000006s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000024s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.16% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000570s : 4.40% validate : 0.000053s : 0.41% Time group info: ------[substitution.] 0.000233 38 10.24% : 0.000024s : 3: substitution.cast_eliminate 1.11% : 0.000003s : 3: substitution.elim_not_effective 0.96% : 0.000002s : 3: substitution.fold_const_symbol 3.14% : 0.000007s : 5: substitution.graph_param_transform 68.81% : 0.000160s : 4: substitution.inline 2.24% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.68% : 0.000009s : 6: substitution.remove_not_recompute_node 2.53% : 0.000006s : 4: substitution.replace_old_param 7.27% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.006556 2 88.25% : 0.005785s : 1: type_inference.infer 11.75% : 0.000771s : 1: type_inference.specialize ------[replace.] 0.000068 8 59.26% : 0.000040s : 4: replace.inline 40.74% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000173 8 91.37% : 0.000158s : 4: match.inline 8.63% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000271 1596 1.06% : 0.000003s : 17: predicate.accumulaten_eliminater 0.77% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000002s : 10: predicate.addn_check_dump 0.93% : 0.000003s : 17: predicate.addn_zero_filter 0.86% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.09% : 0.000006s : 27: predicate.arithmetic_simplify 1.02% : 0.000003s : 17: predicate.cast_eliminate 0.76% : 0.000002s : 10: predicate.check_bprop_eliminate 0.52% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.57% : 0.000002s : 10: predicate.depend_value_elim 1.02% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.05% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.98% : 0.000003s : 17: predicate.dict_set_item_eliminator 1.30% : 0.000004s : 10: predicate.dumpgradient_eliminate 0.18% : 0.000001s : 5: predicate.elim_not_effective 0.38% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.26% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 22: predicate.environ_get_depend_swap 1.77% : 0.000005s : 32: predicate.environ_get_eliminate 1.10% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.42% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.25% : 0.000006s : 25: predicate.float_depend_g_call 0.51% : 0.000001s : 10: predicate.float_environ_get_switch 0.85% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.64% : 0.000002s : 10: predicate.get_grad_eliminate 0.25% : 0.000001s : 5: predicate.graph_param_transform 0.55% : 0.000001s : 10: predicate.incorporate_call 0.47% : 0.000001s : 10: predicate.incorporate_call_switch 6.44% : 0.000017s : 72: predicate.inline 0.81% : 0.000002s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 10: predicate.less_batch_normalization 1.75% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.70% : 0.000007s : 48: predicate.load_eliminater 1.01% : 0.000003s : 5: predicate.loop_unroll_after_grad 1.90% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.68% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.68% : 0.000002s : 10: predicate.merge_addn 0.69% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.62% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 17: predicate.minmaximum_grad 1.01% : 0.000003s : 5: predicate.mutable_eliminate 0.39% : 0.000001s : 5: predicate.opt_reshape 0.38% : 0.000001s : 5: predicate.parallel_virtual_node 1.73% : 0.000005s : 25: predicate.partial_defer_inline 1.62% : 0.000004s : 26: predicate.partial_eliminate 1.03% : 0.000003s : 17: predicate.print_const_string_wrapper 0.52% : 0.000001s : 10: predicate.reduce_all_const_elim 1.30% : 0.000004s : 17: predicate.reduce_eliminate 2.46% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000001s : 10: predicate.remove_not_recompute_node 1.27% : 0.000003s : 31: predicate.replace_applicator 0.48% : 0.000001s : 10: predicate.replace_old_param 0.34% : 0.000001s : 5: predicate.reset_defer_inline 1.01% : 0.000003s : 17: predicate.reshape_eliminate 0.56% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.50% : 0.000001s : 5: predicate.row_tensor_eliminate 0.75% : 0.000002s : 10: predicate.same_eliminate 0.38% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 10: predicate.shard_identity_eliminate 0.65% : 0.000002s : 10: predicate.special_op_eliminate 0.68% : 0.000002s : 10: predicate.specialize_transform 0.98% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 1.01% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.40% : 0.000004s : 25: predicate.switch_defer_inline 2.01% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.39% : 0.000012s : 76: predicate.switch_simplify 0.95% : 0.000003s : 17: predicate.tile_eliminate 1.03% : 0.000003s : 17: predicate.transpose_eliminate 1.59% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.63% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.44% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.50% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.74% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.42% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.14% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.71% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.68% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000618 11 53.50% : 0.000330s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.50% : 0.000287s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.029837 192 0.01% : 0.000003s : 1: ForceFp32Comm 12.29% : 0.003668s : 1: add_attr 12.25% : 0.003656s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000067s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.25% : 0.000075s : 1: auto_monad 0.08% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000007s : 1: bias_add_comm_swap 1.70% : 0.000507s : 1: bootstrap 0.13% : 0.000039s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.10% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.09% : 0.000028s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000007s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.91% : 0.000571s : 1: loop_unroll 0.02% : 0.000005s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 2.38% : 0.000709s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000021s : 1: opt.transform.mutable_eliminate 5.07% : 0.001511s : 78: opt.transform.opt_a 0.13% : 0.000038s : 1: opt.transform.opt_after_cconv 0.11% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.45% : 0.000135s : 28: opt.transform.opt_b 0.23% : 0.000068s : 2: opt.transform.opt_trans_graph 0.15% : 0.000044s : 4: opt.transform.symbol_engine_opt 11.26% : 0.003358s : 1: opt_a 0.46% : 0.000138s : 1: opt_after_cconv 1.94% : 0.000580s : 1: opt_after_jit_grad 0.89% : 0.000265s : 1: opt_b 19.85% : 0.005923s : 1: optimize 0.09% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000010s : 1: order_py_execute_after_rewriter 0.09% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.15% : 0.000046s : 1: pre_auto_parallel 0.12% : 0.000035s : 1: py_interpret_to_execute 0.07% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000021s : 1: remove_dup_value 1.68% : 0.000500s : 1: renormalize.infer 1.27% : 0.000379s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000051s : 1: rewriter_after_opt_a 0.33% : 0.000100s : 1: rewriter_before_opt_a 0.02% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.30% : 0.000090s : 1: symbol_engine_optimizer 0.34% : 0.000100s : 1: tuple_transform 22.28% : 0.006646s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:53.593.846 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:53.594.124 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0335457, [21] [bootstrap]: 0.00042428 [type_inference]: 0.0223995 [event_method]: 2.109e-05 [auto_monad]: 7.183e-05 [graph_reusing]: 6.41e-06 [inline]: 2.69999e-06 [add_attr]: 0.00318647, [1] [add_attr_with_inline]: 0.00317739, [1] [Cycle 1]: 7.112e-05, [2] [tag_attr]: 2.037e-05 [meta_addattr_fg_expand]: 6.54999e-06 [parallel-infer-symbol]: 3.28e-06 [pre_auto_parallel]: 3.508e-05 [insert-virtual-dataset]: 2.48e-06 [parallel-infer-symbol-second]: 9.10019e-07 [dataset_repeat_opt]: 2.12999e-06 [pipeline_split]: 1.92001e-06 [optimize]: 0.00620767, [53] [py_interpret_to_execute]: 3.122e-05 [rewriter_before_opt_a]: 9.862e-05 [opt_a]: 0.00368572, [2] [Cycle 1]: 0.00253623, [45] [expand_dump_flag]: 3.01999e-06 [switch_simplify]: 4.425e-05 [loop_unroll]: 3.226e-05 [a_1]: 0.00072399 [with_stream_mark]: 1.687e-05 [recompute_prepare]: 1.287e-05 [updatestate_depend_eliminate]: 5.62001e-06 [updatestate_assign_eliminate]: 4.52e-06 [updatestate_loads_eliminate]: 4.15e-06 [parameter_eliminate]: 2.86999e-06 [a_2]: 0.00015321 [accelerated_algorithm]: 1.089e-05 [shard]: 2.55002e-06 [meta_shard_fg_expand]: 2.27999e-06 [shard_inline]: 1.052e-05 [merge_send_recv]: 1.203e-05 [auto_parallel]: 8.01001e-06 [parallel]: 2.041e-05 [flash_sp]: 9.78998e-06 [merge_comm]: 7.15998e-06 [allreduce_fusion]: 5.09e-06 [matmul_add_comm_reduction]: 1.234e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 1.311e-05 [virtual_dataset]: 9.64e-06 [get_grad_eliminate_]: 9.07001e-06 [virtual_output]: 8.92e-06 [merge_forward]: 5.50001e-06 [cell_reuse_recompute_pass]: 1.27999e-06 [offload_activation]: 1.155e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.253e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.583e-05 [set_forward_comm_id_for_comm_node_pass]: 6.24999e-06 [meta_fg_expand]: 4.47e-06 [flash_sp_send_recv_attached]: 2.57001e-06 [receive_attached]: 2.99999e-06 [after_resolve]: 1.496e-05 [a_after_grad]: 1.369e-05 [renormalize]: 0.0007192 [add_forward_monad_depend]: 6.69999e-06 [auto_monad_grad]: 2.16998e-06 [auto_monad_eliminator]: 2.027e-05 [cse]: 4.108e-05 [a_3]: 8.272e-05 [Cycle 2]: 0.00113456, [45] [expand_dump_flag]: 1.52001e-06 [switch_simplify]: 1.046e-05 [loop_unroll]: 8.62e-06 [a_1]: 0.00021702 [with_stream_mark]: 1.507e-05 [recompute_prepare]: 1.213e-05 [updatestate_depend_eliminate]: 4.77e-06 [updatestate_assign_eliminate]: 3.68999e-06 [updatestate_loads_eliminate]: 3.71001e-06 [parameter_eliminate]: 1.55001e-06 [a_2]: 0.00013939 [accelerated_algorithm]: 9.49e-06 [shard]: 1.85001e-06 [meta_shard_fg_expand]: 2.12999e-06 [shard_inline]: 1.1e-05 [merge_send_recv]: 8.57e-06 [auto_parallel]: 7.75e-06 [parallel]: 6.16e-06 [flash_sp]: 4.21001e-06 [merge_comm]: 5.39998e-06 [allreduce_fusion]: 4.85999e-06 [matmul_add_comm_reduction]: 9.14e-06 [allreduce_slice_to_reducescatter]: 5.20027e-07 [virtual_shard_identity]: 1.131e-05 [virtual_dataset]: 8.89003e-06 [get_grad_eliminate_]: 8.43001e-06 [virtual_output]: 8.44998e-06 [merge_forward]: 4.48999e-06 [cell_reuse_recompute_pass]: 2.06003e-06 [offload_activation]: 9.39e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.174e-05 [merge_recompute_call_nodes]: 1.25999e-06 [before_grad]: 1.503e-05 [set_forward_comm_id_for_comm_node_pass]: 5.44e-06 [meta_fg_expand]: 3.61999e-06 [flash_sp_send_recv_attached]: 1.15999e-06 [receive_attached]: 1.09998e-06 [after_resolve]: 1.62e-05 [a_after_grad]: 1.387e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 2.83e-06 [auto_monad_grad]: 1.51998e-06 [auto_monad_eliminator]: 1.315e-05 [cse]: 2.581e-05 [a_3]: 7.039e-05 [py_interpret_to_execute_after_opt_a]: 1.697e-05 [slice_cell_reuse_recomputed_activation]: 5.15001e-06 [rewriter_after_opt_a]: 5.149e-05 [convert_after_rewriter]: 1.311e-05 [order_py_execute_after_rewriter]: 9.79e-06 [mutable_eliminate]: 0.00051954 [opt_b]: 0.00035107, [1] [Cycle 1]: 0.00034149, [7] [b_1]: 0.00022562 [b_2]: 1.047e-05 [updatestate_depend_eliminate]: 7.93001e-06 [updatestate_assign_eliminate]: 3.73999e-06 [updatestate_loads_eliminate]: 3.77002e-06 [renormalize]: 7.2e-07 [cse]: 3.023e-05 [optimize_parallel_all_gather_comm]: 2.399e-05 [overlap_param_gather]: 5.03002e-06 [cconv]: 3.013e-05 [loop_unroll]: 0.00044069 [opt_after_cconv]: 0.00015293, [1] [Cycle 1]: 0.00014439, [7] [c_1]: 4.387e-05 [parameter_eliminate]: 3.06001e-06 [updatestate_depend_eliminate]: 6.72002e-06 [updatestate_assign_eliminate]: 3.68e-06 [updatestate_loads_eliminate]: 3.54002e-06 [cse]: 2.672e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 4.34e-05 [tuple_transform]: 0.00011068, [1] [Cycle 1]: 0.00010341, [4] [d_1]: 6.196e-05 [none_parameter_eliminate]: 1.71002e-06 [renormalize]: 1.79978e-07 [switch_simplify]: 9.29e-06 [partial_unused_args_eliminate]: 4.61002e-06 [add_recomputation]: 6.35e-05 [cse_after_recomputation]: 3.481e-05, [1] [Cycle 1]: 2.745e-05, [1] [cse]: 1.766e-05 [environ_conv]: 9.43002e-06 [swap_dp_allreduce_reducescatter]: 9.47999e-06 [bias_add_comm_swap]: 5.14e-06 [label_micro_interleaved_index]: 7.16001e-06 [label_fine_grained_interleaved_index]: 4.89e-06 [merge_cast_opt]: 3.88001e-06 [slice_recompute_activation]: 4.68001e-06 [micro_interleaved_order_control]: 4.53999e-06 [assign_add_opt]: 3.54002e-06 [ForceFp32Comm]: 3.35998e-06 [remove_cast_before_assign_add]: 3.37002e-06 [full_micro_interleaved_order_control]: 4.57e-06 [reorder_send_recv_between_fp_bp]: 5.64e-06 [comm_op_add_attrs]: 3.59002e-06 [add_comm_op_reuse_tag]: 3.26001e-06 [interleave_split_concat_branches]: 3.6e-06 [interleave_parallel_branches]: 3.36999e-06 [overlap_opt_shard_in_pipeline]: 3.48e-06 [overlap_opt_shard_grad_in_pipeline]: 4.08001e-06 [control_data_broadcast_order]: 1.999e-05 [grouped_pairwise_exchange_alltoall]: 3.83001e-06 [offloading_packed_experts]: 7.61999e-06 [overlap_recompute_and_grad_model_parallel]: 8.42e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.5e-06 [overlap_recompute_allgather_and_fa_grad]: 3.73001e-06 [overlap_recompute_comm]: 5.04003e-06 [overlap_grad_ring_attention]: 7.40003e-06 [overlap_grad_flash_sp]: 2.742e-05 [begin_end_overlap_inline]: 2.96001e-06 [split_matmul_comm_elemetwise]: 4.68999e-06 [split_layernorm_comm]: 4.03999e-06 [handle_group_info]: 3.14999e-06 [symbol_engine_optimizer]: 0.00010925, [1] [Cycle 1]: 0.00010253, [6] [build]: 3.16001e-06 [elim_shapecalc]: 1.208e-05 [elim_not_effective]: 1.753e-05 [opt_reshape]: 9.67001e-06 [fold_const_symbol]: 1.409e-05 [renormalize]: 2.00002e-07 [detach_backward]: 3.67002e-06 [pipeline_parallel_scheduler]: 2.02999e-06 [auto_monad_reorder]: 2.486e-05 [get_jit_bprop_graph]: 1.66e-06 [rewriter_after_jit_bprop_graph]: 4.94998e-06 [opt_after_jit_grad]: 0.00050364 [validate]: 4.113e-05 Sums bootstrap : 0.000424s : 1.49% type_inference : 0.022399s : 78.58% event_method : 0.000021s : 0.07% auto_monad : 0.000072s : 0.25% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000035s : 0.12% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.11% optimize.rewriter_before_opt_a : 0.000099s : 0.35% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000055s : 0.19% optimize.opt_a.loop_unroll : 0.000041s : 0.14% optimize.opt_a.a_1 : 0.000941s : 3.30% optimize.opt_a.with_stream_mark : 0.000032s : 0.11% optimize.opt_a.recompute_prepare : 0.000025s : 0.09% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000293s : 1.03% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.07% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000022s : 0.08% optimize.opt_a.merge_send_recv : 0.000021s : 0.07% optimize.opt_a.auto_parallel : 0.000016s : 0.06% optimize.opt_a.parallel : 0.000027s : 0.09% optimize.opt_a.flash_sp : 0.000014s : 0.05% optimize.opt_a.merge_comm : 0.000013s : 0.04% optimize.opt_a.allreduce_fusion : 0.000010s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.09% optimize.opt_a.virtual_dataset : 0.000019s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.06% optimize.opt_a.virtual_output : 0.000017s : 0.06% optimize.opt_a.merge_forward : 0.000010s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000044s : 0.16% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000031s : 0.11% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.04% optimize.opt_a.meta_fg_expand : 0.000008s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000031s : 0.11% optimize.opt_a.a_after_grad : 0.000028s : 0.10% optimize.opt_a.renormalize : 0.000719s : 2.52% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.12% optimize.opt_a.cse : 0.000067s : 0.23% optimize.opt_a.a_3 : 0.000153s : 0.54% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000051s : 0.18% optimize.convert_after_rewriter : 0.000013s : 0.05% optimize.order_py_execute_after_rewriter : 0.000010s : 0.03% optimize.mutable_eliminate : 0.000520s : 1.82% optimize.opt_b.b_1 : 0.000226s : 0.79% optimize.opt_b.b_2 : 0.000010s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.11% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.08% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000030s : 0.11% optimize.loop_unroll : 0.000441s : 1.55% optimize.opt_after_cconv.c_1 : 0.000044s : 0.15% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000027s : 0.09% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000043s : 0.15% optimize.tuple_transform.d_1 : 0.000062s : 0.22% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000063s : 0.22% optimize.cse_after_recomputation.cse : 0.000018s : 0.06% optimize.environ_conv : 0.000009s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000005s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000020s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.03% optimize.overlap_grad_flash_sp : 0.000027s : 0.10% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000025s : 0.09% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000504s : 1.77% validate : 0.000041s : 0.14% Time group info: ------[substitution.] 0.000219 48 14.24% : 0.000031s : 6: substitution.cast_eliminate 1.14% : 0.000002s : 4: substitution.elim_not_effective 0.93% : 0.000002s : 4: substitution.fold_const_symbol 3.53% : 0.000008s : 6: substitution.graph_param_transform 66.04% : 0.000144s : 4: substitution.inline 2.52% : 0.000006s : 8: substitution.j_node_and_user_rematch 3.36% : 0.000007s : 8: substitution.remove_not_recompute_node 2.37% : 0.000005s : 4: substitution.replace_old_param 5.86% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.022347 2 96.65% : 0.021598s : 1: type_inference.infer 3.35% : 0.000749s : 1: type_inference.specialize ------[replace.] 0.000062 8 60.87% : 0.000038s : 4: replace.inline 39.13% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000152 8 92.86% : 0.000141s : 4: match.inline 7.14% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000277 1730 0.88% : 0.000002s : 17: predicate.accumulaten_eliminater 0.66% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.60% : 0.000002s : 12: predicate.addn_check_dump 0.95% : 0.000003s : 17: predicate.addn_zero_filter 0.81% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.93% : 0.000005s : 29: predicate.arithmetic_simplify 1.07% : 0.000003s : 17: predicate.cast_eliminate 0.63% : 0.000002s : 12: predicate.check_bprop_eliminate 0.60% : 0.000002s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.65% : 0.000002s : 12: predicate.depend_value_elim 0.91% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.02% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.95% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.86% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 6: predicate.elim_not_effective 0.36% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 23: predicate.environ_get_depend_swap 1.76% : 0.000005s : 35: predicate.environ_get_eliminate 1.23% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.37% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.24% : 0.000006s : 25: predicate.float_depend_g_call 0.58% : 0.000002s : 12: predicate.float_environ_get_switch 0.91% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 6: predicate.fold_const_symbol 0.66% : 0.000002s : 12: predicate.get_grad_eliminate 0.23% : 0.000001s : 6: predicate.graph_param_transform 0.68% : 0.000002s : 12: predicate.incorporate_call 0.58% : 0.000002s : 12: predicate.incorporate_call_switch 6.31% : 0.000017s : 78: predicate.inline 0.96% : 0.000003s : 12: predicate.inline_without_move 0.35% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 12: predicate.less_batch_normalization 1.80% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.57% : 0.000007s : 50: predicate.load_eliminater 0.80% : 0.000002s : 6: predicate.loop_unroll_after_grad 1.94% : 0.000005s : 38: predicate.loop_unroll_before_grad 1.65% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.63% : 0.000002s : 12: predicate.merge_addn 0.64% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.63% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 17: predicate.minmaximum_grad 0.93% : 0.000003s : 6: predicate.mutable_eliminate 0.37% : 0.000001s : 6: predicate.opt_reshape 0.40% : 0.000001s : 6: predicate.parallel_virtual_node 1.55% : 0.000004s : 25: predicate.partial_defer_inline 1.67% : 0.000005s : 27: predicate.partial_eliminate 0.88% : 0.000002s : 17: predicate.print_const_string_wrapper 0.63% : 0.000002s : 12: predicate.reduce_all_const_elim 1.14% : 0.000003s : 17: predicate.reduce_eliminate 2.56% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 12: predicate.remove_not_recompute_node 1.33% : 0.000004s : 33: predicate.replace_applicator 0.52% : 0.000001s : 12: predicate.replace_old_param 0.28% : 0.000001s : 6: predicate.reset_defer_inline 0.96% : 0.000003s : 17: predicate.reshape_eliminate 0.66% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 6: predicate.row_tensor_eliminate 0.76% : 0.000002s : 12: predicate.same_eliminate 0.58% : 0.000002s : 12: predicate.set_cell_output_no_recompute 0.90% : 0.000002s : 12: predicate.shard_identity_eliminate 0.73% : 0.000002s : 12: predicate.special_op_eliminate 0.84% : 0.000002s : 12: predicate.specialize_transform 1.19% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.37% : 0.000004s : 25: predicate.switch_defer_inline 2.00% : 0.000006s : 37: predicate.switch_layer_defer_inline 4.68% : 0.000013s : 81: predicate.switch_simplify 0.84% : 0.000002s : 17: predicate.tile_eliminate 1.00% : 0.000003s : 17: predicate.transpose_eliminate 1.75% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000004s : 29: predicate.tuple_list_get_item_const_eliminator 1.54% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.15% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.54% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000006s : 41: predicate.tuple_list_set_item_eliminator 1.86% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.45% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.27% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 6: predicate.value_based_eliminate 0.76% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 12: predicate.virtual_output_eliminate 0.32% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.56% : 0.000002s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000570 11 55.20% : 0.000314s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.80% : 0.000255s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.045493 192 0.01% : 0.000006s : 1: ForceFp32Comm 7.03% : 0.003196s : 1: add_attr 6.99% : 0.003181s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.15% : 0.000068s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.18% : 0.000081s : 1: auto_monad 0.07% : 0.000033s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.02% : 0.000466s : 1: bootstrap 0.07% : 0.000033s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.05% : 0.000023s : 1: control_data_broadcast_order 0.04% : 0.000016s : 1: convert_after_rewriter 0.08% : 0.000038s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000020s : 1: detach_backward 0.03% : 0.000013s : 1: environ_conv 0.07% : 0.000032s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.03% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.02% : 0.000009s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 0.98% : 0.000446s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 1.16% : 0.000526s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.04% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000019s : 1: opt.transform.mutable_eliminate 3.46% : 0.001574s : 78: opt.transform.opt_a 0.09% : 0.000042s : 1: opt.transform.opt_after_cconv 0.07% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.36% : 0.000164s : 28: opt.transform.opt_b 0.15% : 0.000069s : 2: opt.transform.opt_trans_graph 0.11% : 0.000050s : 4: opt.transform.symbol_engine_opt 8.11% : 0.003690s : 1: opt_a 0.34% : 0.000157s : 1: opt_after_cconv 1.13% : 0.000514s : 1: opt_after_jit_grad 0.78% : 0.000355s : 1: opt_b 14.40% : 0.006550s : 1: optimize 0.06% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000013s : 1: order_py_execute_after_rewriter 0.07% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.09% : 0.000043s : 1: pre_auto_parallel 0.08% : 0.000035s : 1: py_interpret_to_execute 0.05% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.10% : 0.000047s : 1: remove_dup_value 0.87% : 0.000398s : 1: renormalize.infer 0.69% : 0.000312s : 1: renormalize.specialize 0.02% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.12% : 0.000055s : 1: rewriter_after_opt_a 0.23% : 0.000103s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.03% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.25% : 0.000112s : 1: symbol_engine_optimizer 0.25% : 0.000113s : 1: tuple_transform 49.34% : 0.022445s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:54.182.66 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0148337, [21] [bootstrap]: 0.00041438 [type_inference]: 0.00551298 [event_method]: 1.796e-05 [auto_monad]: 6.494e-05 [graph_reusing]: 6.04001e-06 [inline]: 2.14999e-06 [add_attr]: 0.00292641, [1] [add_attr_with_inline]: 0.00291844, [1] [Cycle 1]: 5.24e-05, [2] [tag_attr]: 1.907e-05 [meta_addattr_fg_expand]: 5.89999e-06 [parallel-infer-symbol]: 2.81e-06 [pre_auto_parallel]: 3.194e-05 [insert-virtual-dataset]: 2.597e-05 [parallel-infer-symbol-second]: 9.50007e-07 [dataset_repeat_opt]: 1.79e-06 [pipeline_split]: 2.29001e-06 [optimize]: 0.00515786, [53] [py_interpret_to_execute]: 2.728e-05 [rewriter_before_opt_a]: 8.466e-05 [opt_a]: 0.00298414, [2] [Cycle 1]: 0.00212786, [45] [expand_dump_flag]: 3.08e-06 [switch_simplify]: 4.438e-05 [loop_unroll]: 3.131e-05 [a_1]: 0.00069804 [with_stream_mark]: 1.519e-05 [recompute_prepare]: 1.078e-05 [updatestate_depend_eliminate]: 4.76002e-06 [updatestate_assign_eliminate]: 4.52998e-06 [updatestate_loads_eliminate]: 4.4e-06 [parameter_eliminate]: 1.98997e-06 [a_2]: 0.0001196 [accelerated_algorithm]: 9.54e-06 [shard]: 1.86998e-06 [meta_shard_fg_expand]: 2.16998e-06 [shard_inline]: 8.88002e-06 [merge_send_recv]: 1.064e-05 [auto_parallel]: 7.9e-06 [parallel]: 1.885e-05 [flash_sp]: 7.9e-06 [merge_comm]: 5.40999e-06 [allreduce_fusion]: 4.90001e-06 [matmul_add_comm_reduction]: 1.087e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 1.013e-05 [virtual_dataset]: 9.04e-06 [get_grad_eliminate_]: 8.92e-06 [virtual_output]: 9.06002e-06 [merge_forward]: 4.74e-06 [cell_reuse_recompute_pass]: 1.20001e-06 [offload_activation]: 1.185e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.692e-05 [merge_recompute_call_nodes]: 1.77999e-06 [before_grad]: 1.47e-05 [set_forward_comm_id_for_comm_node_pass]: 4.93001e-06 [meta_fg_expand]: 4.1e-06 [flash_sp_send_recv_attached]: 2.41998e-06 [receive_attached]: 2.24999e-06 [after_resolve]: 1.334e-05 [a_after_grad]: 1.364e-05 [renormalize]: 0.00060074 [add_forward_monad_depend]: 5.14e-06 [auto_monad_grad]: 1.69998e-06 [auto_monad_eliminator]: 1.865e-05 [cse]: 4.303e-05 [a_3]: 6.359e-05 [Cycle 2]: 0.00084687, [45] [expand_dump_flag]: 9.79984e-07 [switch_simplify]: 1.017e-05 [loop_unroll]: 8.68001e-06 [a_1]: 0.00020923 [with_stream_mark]: 1.272e-05 [recompute_prepare]: 8.85001e-06 [updatestate_depend_eliminate]: 4.3e-06 [updatestate_assign_eliminate]: 3.51999e-06 [updatestate_loads_eliminate]: 3.38999e-06 [parameter_eliminate]: 1.30001e-06 [a_2]: 0.00011039 [accelerated_algorithm]: 8.87999e-06 [shard]: 1.05999e-06 [meta_shard_fg_expand]: 1.86998e-06 [shard_inline]: 8.65999e-06 [merge_send_recv]: 6.14999e-06 [auto_parallel]: 9.67001e-06 [parallel]: 4.83001e-06 [flash_sp]: 3.15998e-06 [merge_comm]: 4.75999e-06 [allreduce_fusion]: 4.2e-06 [matmul_add_comm_reduction]: 7.70998e-06 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 9.47001e-06 [virtual_dataset]: 8.21002e-06 [get_grad_eliminate_]: 1.001e-05 [virtual_output]: 8.27e-06 [merge_forward]: 4.16001e-06 [cell_reuse_recompute_pass]: 1.57999e-06 [offload_activation]: 8.98002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.606e-05 [merge_recompute_call_nodes]: 6.69999e-07 [before_grad]: 1.333e-05 [set_forward_comm_id_for_comm_node_pass]: 4.99e-06 [meta_fg_expand]: 3.23e-06 [flash_sp_send_recv_attached]: 8.10018e-07 [receive_attached]: 1.19e-06 [after_resolve]: 1.25e-05 [a_after_grad]: 1.305e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.14e-06 [auto_monad_grad]: 1.10001e-06 [auto_monad_eliminator]: 9.96e-06 [cse]: 2.212e-05 [a_3]: 5.392e-05 [py_interpret_to_execute_after_opt_a]: 1.087e-05 [slice_cell_reuse_recomputed_activation]: 2.19999e-06 [rewriter_after_opt_a]: 4.33e-05 [convert_after_rewriter]: 8.76002e-06 [order_py_execute_after_rewriter]: 6.23998e-06 [mutable_eliminate]: 0.00045998 [opt_b]: 0.00027413, [1] [Cycle 1]: 0.0002676, [7] [b_1]: 0.00018171 [b_2]: 1.045e-05 [updatestate_depend_eliminate]: 6.37001e-06 [updatestate_assign_eliminate]: 4.01001e-06 [updatestate_loads_eliminate]: 3.66999e-06 [renormalize]: 4.10015e-07 [cse]: 2.601e-05 [optimize_parallel_all_gather_comm]: 4.243e-05 [overlap_param_gather]: 2.38998e-06 [cconv]: 2.449e-05 [loop_unroll]: 0.00041824 [opt_after_cconv]: 0.00013169, [1] [Cycle 1]: 0.00012621, [7] [c_1]: 4.357e-05 [parameter_eliminate]: 2.66999e-06 [updatestate_depend_eliminate]: 6.83e-06 [updatestate_assign_eliminate]: 3.88001e-06 [updatestate_loads_eliminate]: 3.71999e-06 [cse]: 2.797e-05 [renormalize]: 7.7e-07 [remove_dup_value]: 3.609e-05 [tuple_transform]: 9.598e-05, [1] [Cycle 1]: 9.102e-05, [4] [d_1]: 6.058e-05 [none_parameter_eliminate]: 1.79998e-06 [renormalize]: 2.29978e-07 [switch_simplify]: 9.19e-06 [partial_unused_args_eliminate]: 1.99999e-06 [add_recomputation]: 5.976e-05 [cse_after_recomputation]: 2.771e-05, [1] [Cycle 1]: 2.29e-05, [1] [cse]: 1.731e-05 [environ_conv]: 6.84999e-06 [swap_dp_allreduce_reducescatter]: 6.84999e-06 [bias_add_comm_swap]: 2.71999e-06 [label_micro_interleaved_index]: 4.42e-06 [label_fine_grained_interleaved_index]: 2.53e-06 [merge_cast_opt]: 1.17e-06 [slice_recompute_activation]: 2.10002e-06 [micro_interleaved_order_control]: 2.59999e-06 [assign_add_opt]: 1.20001e-06 [ForceFp32Comm]: 9.70002e-07 [remove_cast_before_assign_add]: 1.15999e-06 [full_micro_interleaved_order_control]: 1.99e-06 [reorder_send_recv_between_fp_bp]: 2.58998e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 1.10999e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.05999e-06 [overlap_opt_shard_in_pipeline]: 1.14e-06 [overlap_opt_shard_grad_in_pipeline]: 2.00002e-06 [control_data_broadcast_order]: 1.592e-05 [grouped_pairwise_exchange_alltoall]: 1.59e-06 [offloading_packed_experts]: 4.92e-06 [overlap_recompute_and_grad_model_parallel]: 5.70001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32999e-06 [overlap_recompute_comm]: 2.17999e-06 [overlap_grad_ring_attention]: 5.15001e-06 [overlap_grad_flash_sp]: 2.301e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.41998e-06 [split_layernorm_comm]: 1.76e-06 [handle_group_info]: 1.34e-06 [symbol_engine_optimizer]: 8.946e-05, [1] [Cycle 1]: 8.53e-05, [6] [build]: 3.68999e-06 [elim_shapecalc]: 1.248e-05 [elim_not_effective]: 1.728e-05 [opt_reshape]: 9.35001e-06 [fold_const_symbol]: 1.429e-05 [renormalize]: 2.80008e-07 [detach_backward]: 2.32001e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 1.997e-05 [get_jit_bprop_graph]: 1.04e-06 [rewriter_after_jit_bprop_graph]: 4.1e-06 [opt_after_jit_grad]: 0.00045837 [validate]: 3.992e-05 Sums bootstrap : 0.000414s : 3.78% type_inference : 0.005513s : 50.23% event_method : 0.000018s : 0.16% auto_monad : 0.000065s : 0.59% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000032s : 0.29% insert-virtual-dataset : 0.000026s : 0.24% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000027s : 0.25% optimize.rewriter_before_opt_a : 0.000085s : 0.77% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.50% optimize.opt_a.loop_unroll : 0.000040s : 0.36% optimize.opt_a.a_1 : 0.000907s : 8.27% optimize.opt_a.with_stream_mark : 0.000028s : 0.25% optimize.opt_a.recompute_prepare : 0.000020s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000230s : 2.10% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.17% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000018s : 0.16% optimize.opt_a.merge_send_recv : 0.000017s : 0.15% optimize.opt_a.auto_parallel : 0.000018s : 0.16% optimize.opt_a.parallel : 0.000024s : 0.22% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.18% optimize.opt_a.virtual_dataset : 0.000017s : 0.16% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.17% optimize.opt_a.virtual_output : 0.000017s : 0.16% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000028s : 0.26% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.07% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.24% optimize.opt_a.a_after_grad : 0.000027s : 0.24% optimize.opt_a.renormalize : 0.000601s : 5.47% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.26% optimize.opt_a.cse : 0.000065s : 0.59% optimize.opt_a.a_3 : 0.000118s : 1.07% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000043s : 0.39% optimize.convert_after_rewriter : 0.000009s : 0.08% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000460s : 4.19% optimize.opt_b.b_1 : 0.000182s : 1.66% optimize.opt_b.b_2 : 0.000010s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.04% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000026s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000042s : 0.39% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.22% optimize.loop_unroll : 0.000418s : 3.81% optimize.opt_after_cconv.c_1 : 0.000044s : 0.40% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000028s : 0.25% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000036s : 0.33% optimize.tuple_transform.d_1 : 0.000061s : 0.55% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000060s : 0.54% optimize.cse_after_recomputation.cse : 0.000017s : 0.16% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000023s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.16% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.09% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000458s : 4.18% validate : 0.000040s : 0.36% Time group info: ------[substitution.] 0.000203 48 14.30% : 0.000029s : 6: substitution.cast_eliminate 1.35% : 0.000003s : 4: substitution.elim_not_effective 0.97% : 0.000002s : 4: substitution.fold_const_symbol 3.47% : 0.000007s : 6: substitution.graph_param_transform 65.76% : 0.000134s : 4: substitution.inline 2.32% : 0.000005s : 8: substitution.j_node_and_user_rematch 3.58% : 0.000007s : 8: substitution.remove_not_recompute_node 2.10% : 0.000004s : 4: substitution.replace_old_param 6.15% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005459 2 87.50% : 0.004777s : 1: type_inference.infer 12.50% : 0.000683s : 1: type_inference.specialize ------[replace.] 0.000061 8 62.55% : 0.000038s : 4: replace.inline 37.45% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000142 8 92.42% : 0.000131s : 4: match.inline 7.58% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000269 1730 0.85% : 0.000002s : 17: predicate.accumulaten_eliminater 0.82% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.59% : 0.000002s : 12: predicate.addn_check_dump 0.93% : 0.000002s : 17: predicate.addn_zero_filter 0.83% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.00% : 0.000005s : 29: predicate.arithmetic_simplify 1.06% : 0.000003s : 17: predicate.cast_eliminate 0.66% : 0.000002s : 12: predicate.check_bprop_eliminate 0.59% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.65% : 0.000002s : 12: predicate.depend_value_elim 0.93% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.02% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.02% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 6: predicate.elim_not_effective 0.41% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 23: predicate.environ_get_depend_swap 1.82% : 0.000005s : 35: predicate.environ_get_eliminate 1.13% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.31% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.20% : 0.000006s : 25: predicate.float_depend_g_call 0.62% : 0.000002s : 12: predicate.float_environ_get_switch 0.88% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.77% : 0.000002s : 12: predicate.get_grad_eliminate 0.23% : 0.000001s : 6: predicate.graph_param_transform 0.70% : 0.000002s : 12: predicate.incorporate_call 0.59% : 0.000002s : 12: predicate.incorporate_call_switch 6.21% : 0.000017s : 78: predicate.inline 0.90% : 0.000002s : 12: predicate.inline_without_move 0.35% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.81% : 0.000002s : 12: predicate.less_batch_normalization 1.80% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.52% : 0.000007s : 50: predicate.load_eliminater 0.71% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.01% : 0.000005s : 38: predicate.loop_unroll_before_grad 1.66% : 0.000004s : 29: predicate.make_slice_get_slice_eliminator 0.69% : 0.000002s : 12: predicate.merge_addn 0.72% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.64% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 17: predicate.minmaximum_grad 0.81% : 0.000002s : 6: predicate.mutable_eliminate 0.36% : 0.000001s : 6: predicate.opt_reshape 0.38% : 0.000001s : 6: predicate.parallel_virtual_node 1.56% : 0.000004s : 25: predicate.partial_defer_inline 1.67% : 0.000004s : 27: predicate.partial_eliminate 0.86% : 0.000002s : 17: predicate.print_const_string_wrapper 0.79% : 0.000002s : 12: predicate.reduce_all_const_elim 1.10% : 0.000003s : 17: predicate.reduce_eliminate 2.51% : 0.000007s : 50: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 12: predicate.remove_not_recompute_node 1.35% : 0.000004s : 33: predicate.replace_applicator 0.43% : 0.000001s : 12: predicate.replace_old_param 0.26% : 0.000001s : 6: predicate.reset_defer_inline 0.90% : 0.000002s : 17: predicate.reshape_eliminate 0.68% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 6: predicate.row_tensor_eliminate 0.79% : 0.000002s : 12: predicate.same_eliminate 0.48% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 12: predicate.shard_identity_eliminate 1.01% : 0.000003s : 12: predicate.special_op_eliminate 0.84% : 0.000002s : 12: predicate.specialize_transform 0.83% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.50% : 0.000004s : 25: predicate.switch_defer_inline 2.03% : 0.000005s : 37: predicate.switch_layer_defer_inline 4.72% : 0.000013s : 81: predicate.switch_simplify 0.91% : 0.000002s : 17: predicate.tile_eliminate 0.92% : 0.000002s : 17: predicate.transpose_eliminate 1.55% : 0.000004s : 29: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000004s : 29: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.55% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.42% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.73% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.53% : 0.000007s : 50: predicate.updatestate_pure_node_eliminater 3.23% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 6: predicate.value_based_eliminate 0.71% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.83% : 0.000002s : 12: predicate.virtual_output_eliminate 0.32% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000479 11 55.04% : 0.000264s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.96% : 0.000216s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025284 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.59% : 0.002931s : 1: add_attr 11.56% : 0.002922s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.25% : 0.000064s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.28% : 0.000070s : 1: auto_monad 0.09% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.74% : 0.000440s : 1: bootstrap 0.11% : 0.000028s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000012s : 1: convert_after_rewriter 0.12% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.09% : 0.000024s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.12% : 0.000031s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000005s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.68% : 0.000426s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.85% : 0.000468s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 5.95% : 0.001503s : 78: opt.transform.opt_a 0.16% : 0.000042s : 1: opt.transform.opt_after_cconv 0.13% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.63% : 0.000160s : 28: opt.transform.opt_b 0.27% : 0.000068s : 2: opt.transform.opt_trans_graph 0.20% : 0.000050s : 4: opt.transform.symbol_engine_opt 11.81% : 0.002987s : 1: opt_a 0.53% : 0.000135s : 1: opt_after_cconv 1.84% : 0.000466s : 1: opt_after_jit_grad 1.10% : 0.000278s : 1: opt_b 20.42% : 0.005162s : 1: optimize 0.18% : 0.000046s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.14% : 0.000036s : 1: pre_auto_parallel 0.12% : 0.000031s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.16% : 0.000041s : 1: remove_dup_value 1.27% : 0.000321s : 1: renormalize.infer 1.08% : 0.000272s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000047s : 1: rewriter_after_opt_a 0.35% : 0.000089s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.37% : 0.000092s : 1: symbol_engine_optimizer 0.39% : 0.000099s : 1: tuple_transform 21.86% : 0.005526s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:54.395.640 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:54.395.904 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0167915, [21] [bootstrap]: 0.00044422 [type_inference]: 0.00592779 [event_method]: 1.958e-05 [auto_monad]: 6.615e-05 [graph_reusing]: 6.43e-06 [inline]: 2.35002e-06 [add_attr]: 0.00308693, [1] [add_attr_with_inline]: 0.00307734, [1] [Cycle 1]: 7.476e-05, [2] [tag_attr]: 1.981e-05 [meta_addattr_fg_expand]: 6.75002e-06 [parallel-infer-symbol]: 3.39001e-06 [pre_auto_parallel]: 3.331e-05 [insert-virtual-dataset]: 2.61e-06 [parallel-infer-symbol-second]: 8.80013e-07 [dataset_repeat_opt]: 1.96003e-06 [pipeline_split]: 1.54998e-06 [optimize]: 0.00557366, [53] [py_interpret_to_execute]: 3.036e-05 [rewriter_before_opt_a]: 9.253e-05 [opt_a]: 0.00322161, [2] [Cycle 1]: 0.00229113, [45] [expand_dump_flag]: 2.99001e-06 [switch_simplify]: 4.317e-05 [loop_unroll]: 3.197e-05 [a_1]: 0.00068403 [with_stream_mark]: 1.673e-05 [recompute_prepare]: 1.011e-05 [updatestate_depend_eliminate]: 5.23002e-06 [updatestate_assign_eliminate]: 4.23001e-06 [updatestate_loads_eliminate]: 3.98001e-06 [parameter_eliminate]: 1.97001e-06 [a_2]: 0.0001351 [accelerated_algorithm]: 9.04e-06 [shard]: 2.14999e-06 [meta_shard_fg_expand]: 1.99e-06 [shard_inline]: 8.38999e-06 [merge_send_recv]: 1.037e-05 [auto_parallel]: 7.2e-06 [parallel]: 1.846e-05 [flash_sp]: 8.43999e-06 [merge_comm]: 5.08002e-06 [allreduce_fusion]: 4.60001e-06 [matmul_add_comm_reduction]: 1.051e-05 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 1.124e-05 [virtual_dataset]: 8.2e-06 [get_grad_eliminate_]: 7.9e-06 [virtual_output]: 7.53999e-06 [merge_forward]: 4.75001e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 1.059e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.744e-05 [merge_recompute_call_nodes]: 1.37999e-06 [before_grad]: 1.297e-05 [set_forward_comm_id_for_comm_node_pass]: 4.63999e-06 [meta_fg_expand]: 3.36999e-06 [flash_sp_send_recv_attached]: 2.44001e-06 [receive_attached]: 1.97999e-06 [after_resolve]: 1.293e-05 [a_after_grad]: 1.2e-05 [renormalize]: 0.00062938 [add_forward_monad_depend]: 5.34998e-06 [auto_monad_grad]: 1.83002e-06 [auto_monad_eliminator]: 1.623e-05 [cse]: 3.374e-05 [a_3]: 6.922e-05 [Cycle 2]: 0.0009172, [45] [expand_dump_flag]: 1.00001e-06 [switch_simplify]: 9.52001e-06 [loop_unroll]: 7.56999e-06 [a_1]: 0.00017328 [with_stream_mark]: 1.168e-05 [recompute_prepare]: 7.88001e-06 [updatestate_depend_eliminate]: 3.78999e-06 [updatestate_assign_eliminate]: 3.12002e-06 [updatestate_loads_eliminate]: 3.14001e-06 [parameter_eliminate]: 9.80013e-07 [a_2]: 0.00011899 [accelerated_algorithm]: 7.76001e-06 [shard]: 1.25001e-06 [meta_shard_fg_expand]: 1.62001e-06 [shard_inline]: 7.53e-06 [merge_send_recv]: 6.13998e-06 [auto_parallel]: 6.06e-06 [parallel]: 4.74e-06 [flash_sp]: 3.60998e-06 [merge_comm]: 4.12e-06 [allreduce_fusion]: 3.83999e-06 [matmul_add_comm_reduction]: 6.11e-06 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 8.33001e-06 [virtual_dataset]: 7.21999e-06 [get_grad_eliminate_]: 6.89999e-06 [virtual_output]: 7.06001e-06 [merge_forward]: 3.54002e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 6.97002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.593e-05 [merge_recompute_call_nodes]: 7.00005e-07 [before_grad]: 1.147e-05 [set_forward_comm_id_for_comm_node_pass]: 4e-06 [meta_fg_expand]: 2.68e-06 [flash_sp_send_recv_attached]: 8.50006e-07 [receive_attached]: 1.02998e-06 [after_resolve]: 1.217e-05 [a_after_grad]: 1.14e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.30001e-06 [auto_monad_grad]: 9.70002e-07 [auto_monad_eliminator]: 8.27998e-06 [cse]: 1.714e-05 [a_3]: 5.895e-05 [py_interpret_to_execute_after_opt_a]: 1.23e-05 [slice_cell_reuse_recomputed_activation]: 4.80001e-06 [rewriter_after_opt_a]: 4.32e-05 [convert_after_rewriter]: 1.097e-05 [order_py_execute_after_rewriter]: 8.45999e-06 [mutable_eliminate]: 0.00047165 [opt_b]: 0.00030514, [1] [Cycle 1]: 0.00029627, [7] [b_1]: 0.0001978 [b_2]: 9.48997e-06 [updatestate_depend_eliminate]: 5.72001e-06 [updatestate_assign_eliminate]: 3.15998e-06 [updatestate_loads_eliminate]: 2.93e-06 [renormalize]: 4.00003e-07 [cse]: 2.099e-05 [optimize_parallel_all_gather_comm]: 2.041e-05 [overlap_param_gather]: 5.29e-06 [cconv]: 2.665e-05 [loop_unroll]: 0.00043153 [opt_after_cconv]: 0.00014601, [1] [Cycle 1]: 0.00013685, [7] [c_1]: 3.687e-05 [parameter_eliminate]: 3.45998e-06 [updatestate_depend_eliminate]: 7.52002e-06 [updatestate_assign_eliminate]: 3.63999e-06 [updatestate_loads_eliminate]: 3.02002e-06 [cse]: 2.513e-05 [renormalize]: 3.7998e-07 [remove_dup_value]: 1.868e-05 [tuple_transform]: 0.00010402, [1] [Cycle 1]: 9.664e-05, [4] [d_1]: 5.199e-05 [none_parameter_eliminate]: 1.99e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 9.62999e-06 [partial_unused_args_eliminate]: 4.49002e-06 [add_recomputation]: 6.05e-05 [cse_after_recomputation]: 3.507e-05, [1] [Cycle 1]: 2.779e-05, [1] [cse]: 1.721e-05 [environ_conv]: 9.37001e-06 [swap_dp_allreduce_reducescatter]: 9.10999e-06 [bias_add_comm_swap]: 5.47999e-06 [label_micro_interleaved_index]: 7.25e-06 [label_fine_grained_interleaved_index]: 5.44998e-06 [merge_cast_opt]: 4.28001e-06 [slice_recompute_activation]: 4.50999e-06 [micro_interleaved_order_control]: 5.46002e-06 [assign_add_opt]: 3.63e-06 [ForceFp32Comm]: 3.39001e-06 [remove_cast_before_assign_add]: 3.71999e-06 [full_micro_interleaved_order_control]: 4.95001e-06 [reorder_send_recv_between_fp_bp]: 5.57999e-06 [comm_op_add_attrs]: 4.00998e-06 [add_comm_op_reuse_tag]: 3.38e-06 [interleave_split_concat_branches]: 3.44001e-06 [interleave_parallel_branches]: 3.68999e-06 [overlap_opt_shard_in_pipeline]: 3.96001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.28001e-06 [control_data_broadcast_order]: 1.97e-05 [grouped_pairwise_exchange_alltoall]: 3.93001e-06 [offloading_packed_experts]: 7.28e-06 [overlap_recompute_and_grad_model_parallel]: 7.35998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.57002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.75e-06 [overlap_recompute_comm]: 5.45001e-06 [overlap_grad_ring_attention]: 6.71e-06 [overlap_grad_flash_sp]: 2.535e-05 [begin_end_overlap_inline]: 3.40998e-06 [split_matmul_comm_elemetwise]: 4.84e-06 [split_layernorm_comm]: 4.27e-06 [handle_group_info]: 3.41999e-06 [symbol_engine_optimizer]: 0.00011345, [1] [Cycle 1]: 0.00010552, [6] [build]: 3.68999e-06 [elim_shapecalc]: 1.427e-05 [elim_not_effective]: 1.607e-05 [opt_reshape]: 8.80001e-06 [fold_const_symbol]: 1.243e-05 [renormalize]: 3.50003e-07 [detach_backward]: 5.13002e-06 [pipeline_parallel_scheduler]: 2.14e-06 [auto_monad_reorder]: 2.421e-05 [get_jit_bprop_graph]: 1.58002e-06 [rewriter_after_jit_bprop_graph]: 5.52999e-06 [opt_after_jit_grad]: 0.00049631 [validate]: 4.157e-05 Sums bootstrap : 0.000444s : 3.87% type_inference : 0.005928s : 51.58% event_method : 0.000020s : 0.17% auto_monad : 0.000066s : 0.58% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.26% optimize.rewriter_before_opt_a : 0.000093s : 0.81% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000053s : 0.46% optimize.opt_a.loop_unroll : 0.000040s : 0.34% optimize.opt_a.a_1 : 0.000857s : 7.46% optimize.opt_a.with_stream_mark : 0.000028s : 0.25% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000254s : 2.21% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.14% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.12% optimize.opt_a.parallel : 0.000023s : 0.20% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.17% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.22% optimize.opt_a.a_after_grad : 0.000023s : 0.20% optimize.opt_a.renormalize : 0.000629s : 5.48% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.21% optimize.opt_a.cse : 0.000051s : 0.44% optimize.opt_a.a_3 : 0.000128s : 1.12% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000043s : 0.38% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000472s : 4.10% optimize.opt_b.b_1 : 0.000198s : 1.72% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000021s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000027s : 0.23% optimize.loop_unroll : 0.000432s : 3.76% optimize.opt_after_cconv.c_1 : 0.000037s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000025s : 0.22% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.16% optimize.tuple_transform.d_1 : 0.000052s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000061s : 0.53% optimize.cse_after_recomputation.cse : 0.000017s : 0.15% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000020s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000025s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000024s : 0.21% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000496s : 4.32% validate : 0.000042s : 0.36% Time group info: ------[substitution.] 0.000200 38 11.29% : 0.000023s : 3: substitution.cast_eliminate 1.13% : 0.000002s : 3: substitution.elim_not_effective 0.88% : 0.000002s : 3: substitution.fold_const_symbol 3.17% : 0.000006s : 5: substitution.graph_param_transform 69.50% : 0.000139s : 4: substitution.inline 2.14% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.99% : 0.000006s : 6: substitution.remove_not_recompute_node 2.34% : 0.000005s : 4: substitution.replace_old_param 6.54% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005877 2 88.07% : 0.005176s : 1: type_inference.infer 11.93% : 0.000701s : 1: type_inference.specialize ------[replace.] 0.000062 8 61.71% : 0.000038s : 4: replace.inline 38.29% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000148 8 92.37% : 0.000137s : 4: match.inline 7.63% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000240 1504 0.87% : 0.000002s : 15: predicate.accumulaten_eliminater 0.83% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.62% : 0.000001s : 10: predicate.addn_check_dump 0.87% : 0.000002s : 15: predicate.addn_zero_filter 0.87% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.10% : 0.000005s : 25: predicate.arithmetic_simplify 1.04% : 0.000002s : 15: predicate.cast_eliminate 0.63% : 0.000002s : 10: predicate.check_bprop_eliminate 0.63% : 0.000002s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.69% : 0.000002s : 10: predicate.depend_value_elim 0.96% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.00% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.93% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.57% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.16% : 0.000003s : 20: predicate.environ_get_depend_swap 1.76% : 0.000004s : 30: predicate.environ_get_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.44% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.40% : 0.000006s : 23: predicate.float_depend_g_call 0.57% : 0.000001s : 10: predicate.float_environ_get_switch 0.89% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 5: predicate.fold_const_symbol 0.67% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.69% : 0.000002s : 10: predicate.incorporate_call 0.58% : 0.000001s : 10: predicate.incorporate_call_switch 6.29% : 0.000015s : 68: predicate.inline 0.91% : 0.000002s : 10: predicate.inline_without_move 0.35% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.83% : 0.000002s : 10: predicate.less_batch_normalization 1.84% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.56% : 0.000006s : 44: predicate.load_eliminater 0.87% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.23% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.61% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.67% : 0.000002s : 10: predicate.merge_addn 0.65% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.62% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 15: predicate.minmaximum_grad 0.86% : 0.000002s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.69% : 0.000004s : 23: predicate.partial_defer_inline 1.73% : 0.000004s : 24: predicate.partial_eliminate 0.88% : 0.000002s : 15: predicate.print_const_string_wrapper 0.58% : 0.000001s : 10: predicate.reduce_all_const_elim 1.16% : 0.000003s : 15: predicate.reduce_eliminate 2.59% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 10: predicate.remove_not_recompute_node 1.38% : 0.000003s : 29: predicate.replace_applicator 0.63% : 0.000002s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 1.06% : 0.000003s : 15: predicate.reshape_eliminate 0.63% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 5: predicate.row_tensor_eliminate 0.70% : 0.000002s : 10: predicate.same_eliminate 0.42% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 10: predicate.shard_identity_eliminate 0.70% : 0.000002s : 10: predicate.special_op_eliminate 0.79% : 0.000002s : 10: predicate.specialize_transform 0.85% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.43% : 0.000003s : 23: predicate.switch_defer_inline 2.04% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.07% : 0.000012s : 74: predicate.switch_simplify 0.88% : 0.000002s : 15: predicate.tile_eliminate 0.89% : 0.000002s : 15: predicate.transpose_eliminate 1.54% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.55% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.80% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.42% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.17% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 5: predicate.value_based_eliminate 0.68% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.63% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000534 11 56.85% : 0.000303s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.15% : 0.000230s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027671 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.19% : 0.003097s : 1: add_attr 11.14% : 0.003082s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.24% : 0.000066s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.28% : 0.000077s : 1: auto_monad 0.11% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.78% : 0.000491s : 1: bootstrap 0.11% : 0.000030s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000023s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.14% : 0.000039s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000023s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.11% : 0.000030s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.05% : 0.000013s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.58% : 0.000438s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.72% : 0.000477s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 4.99% : 0.001382s : 78: opt.transform.opt_a 0.13% : 0.000036s : 1: opt.transform.opt_after_cconv 0.11% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.49% : 0.000135s : 28: opt.transform.opt_b 0.21% : 0.000059s : 2: opt.transform.opt_trans_graph 0.17% : 0.000047s : 4: opt.transform.symbol_engine_opt 11.65% : 0.003225s : 1: opt_a 0.54% : 0.000150s : 1: opt_after_cconv 1.83% : 0.000507s : 1: opt_after_jit_grad 1.11% : 0.000308s : 1: opt_b 23.01% : 0.006366s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.11% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000011s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000041s : 1: pre_auto_parallel 0.12% : 0.000034s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.25% : 0.000346s : 1: renormalize.infer 0.99% : 0.000275s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000047s : 1: rewriter_after_opt_a 0.35% : 0.000097s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000117s : 1: symbol_engine_optimizer 0.39% : 0.000107s : 1: tuple_transform 21.55% : 0.005963s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:54.813.853 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0145037, [21] [bootstrap]: 0.00041521 [type_inference]: 0.00551478 [event_method]: 1.815e-05 [auto_monad]: 6.017e-05 [graph_reusing]: 6.38e-06 [inline]: 2.53e-06 [add_attr]: 0.00293337, [1] [add_attr_with_inline]: 0.00292488, [1] [Cycle 1]: 5.094e-05, [2] [tag_attr]: 1.822e-05 [meta_addattr_fg_expand]: 5.99e-06 [parallel-infer-symbol]: 2.79999e-06 [pre_auto_parallel]: 3.047e-05 [insert-virtual-dataset]: 2.39999e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.34999e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.00485452, [53] [py_interpret_to_execute]: 2.426e-05 [rewriter_before_opt_a]: 8.315e-05 [opt_a]: 0.00279844, [2] [Cycle 1]: 0.00201158, [45] [expand_dump_flag]: 3.13998e-06 [switch_simplify]: 4.615e-05 [loop_unroll]: 3.213e-05 [a_1]: 0.00069244 [with_stream_mark]: 1.403e-05 [recompute_prepare]: 1.017e-05 [updatestate_depend_eliminate]: 4.70999e-06 [updatestate_assign_eliminate]: 3.93001e-06 [updatestate_loads_eliminate]: 3.58e-06 [parameter_eliminate]: 1.92999e-06 [a_2]: 0.00010464 [accelerated_algorithm]: 8.84998e-06 [shard]: 1.86e-06 [meta_shard_fg_expand]: 2.21998e-06 [shard_inline]: 8.35999e-06 [merge_send_recv]: 1.021e-05 [auto_parallel]: 6.93e-06 [parallel]: 1.795e-05 [flash_sp]: 7.75998e-06 [merge_comm]: 4.87e-06 [allreduce_fusion]: 4.13001e-06 [matmul_add_comm_reduction]: 9.71e-06 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 8.87e-06 [virtual_dataset]: 7.78999e-06 [get_grad_eliminate_]: 7.54002e-06 [virtual_output]: 7.29001e-06 [merge_forward]: 4.37e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 1.074e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.431e-05 [merge_recompute_call_nodes]: 1.42999e-06 [before_grad]: 1.3e-05 [set_forward_comm_id_for_comm_node_pass]: 4.48999e-06 [meta_fg_expand]: 3.25e-06 [flash_sp_send_recv_attached]: 2.31e-06 [receive_attached]: 2.39001e-06 [after_resolve]: 1.238e-05 [a_after_grad]: 1.156e-05 [renormalize]: 0.00054038 [add_forward_monad_depend]: 4.67998e-06 [auto_monad_grad]: 2.06e-06 [auto_monad_eliminator]: 1.617e-05 [cse]: 3.632e-05 [a_3]: 5.848e-05 [Cycle 2]: 0.0007772, [45] [expand_dump_flag]: 9.50007e-07 [switch_simplify]: 9.62999e-06 [loop_unroll]: 7.73001e-06 [a_1]: 0.00018406 [with_stream_mark]: 1.145e-05 [recompute_prepare]: 8.10999e-06 [updatestate_depend_eliminate]: 3.95e-06 [updatestate_assign_eliminate]: 3.21999e-06 [updatestate_loads_eliminate]: 2.99999e-06 [parameter_eliminate]: 1.04e-06 [a_2]: 9.252e-05 [accelerated_algorithm]: 7.46001e-06 [shard]: 1.10001e-06 [meta_shard_fg_expand]: 1.40001e-06 [shard_inline]: 7.38e-06 [merge_send_recv]: 6.06998e-06 [auto_parallel]: 7.2e-06 [parallel]: 4.38999e-06 [flash_sp]: 3.59002e-06 [merge_comm]: 4.73001e-06 [allreduce_fusion]: 4.49998e-06 [matmul_add_comm_reduction]: 6.17999e-06 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 8.77e-06 [virtual_dataset]: 8.00999e-06 [get_grad_eliminate_]: 7.83001e-06 [virtual_output]: 7.58999e-06 [merge_forward]: 3.43999e-06 [cell_reuse_recompute_pass]: 2.11e-06 [offload_activation]: 7.65e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.593e-05 [merge_recompute_call_nodes]: 7.2e-07 [before_grad]: 1.133e-05 [set_forward_comm_id_for_comm_node_pass]: 4.15e-06 [meta_fg_expand]: 2.74999e-06 [flash_sp_send_recv_attached]: 8.79983e-07 [receive_attached]: 1.05999e-06 [after_resolve]: 1.147e-05 [a_after_grad]: 1.092e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.30999e-06 [auto_monad_grad]: 1.02e-06 [auto_monad_eliminator]: 8.43999e-06 [cse]: 1.713e-05 [a_3]: 4.521e-05 [py_interpret_to_execute_after_opt_a]: 9.66003e-06 [slice_cell_reuse_recomputed_activation]: 2.29001e-06 [rewriter_after_opt_a]: 3.849e-05 [convert_after_rewriter]: 8.07e-06 [order_py_execute_after_rewriter]: 5.59e-06 [mutable_eliminate]: 0.00045677 [opt_b]: 0.00023795, [1] [Cycle 1]: 0.00023251, [7] [b_1]: 0.00015469 [b_2]: 9.35001e-06 [updatestate_depend_eliminate]: 5.73002e-06 [updatestate_assign_eliminate]: 2.98e-06 [updatestate_loads_eliminate]: 2.98e-06 [renormalize]: 3.69997e-07 [cse]: 2.11e-05 [optimize_parallel_all_gather_comm]: 1.741e-05 [overlap_param_gather]: 2.26e-06 [cconv]: 2.351e-05 [loop_unroll]: 0.00042582 [opt_after_cconv]: 0.00011408, [1] [Cycle 1]: 0.0001084, [7] [c_1]: 3.872e-05 [parameter_eliminate]: 2.59001e-06 [updatestate_depend_eliminate]: 5.71e-06 [updatestate_assign_eliminate]: 3.02002e-06 [updatestate_loads_eliminate]: 2.72001e-06 [cse]: 2.238e-05 [renormalize]: 3.19997e-07 [remove_dup_value]: 1.419e-05 [tuple_transform]: 8.42e-05, [1] [Cycle 1]: 7.987e-05, [4] [d_1]: 5.111e-05 [none_parameter_eliminate]: 1.62001e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.87e-06 [partial_unused_args_eliminate]: 1.72001e-06 [add_recomputation]: 5.414e-05 [cse_after_recomputation]: 2.45e-05, [1] [Cycle 1]: 1.97e-05, [1] [cse]: 1.435e-05 [environ_conv]: 6.01998e-06 [swap_dp_allreduce_reducescatter]: 5.54e-06 [bias_add_comm_swap]: 2.54999e-06 [label_micro_interleaved_index]: 4.03999e-06 [label_fine_grained_interleaved_index]: 2.65002e-06 [merge_cast_opt]: 1.32999e-06 [slice_recompute_activation]: 2.26e-06 [micro_interleaved_order_control]: 2.06e-06 [assign_add_opt]: 1.12e-06 [ForceFp32Comm]: 7.50006e-07 [remove_cast_before_assign_add]: 9.50007e-07 [full_micro_interleaved_order_control]: 2.02001e-06 [reorder_send_recv_between_fp_bp]: 2.73e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 9.20001e-07 [interleave_split_concat_branches]: 1.12999e-06 [interleave_parallel_branches]: 1.12999e-06 [overlap_opt_shard_in_pipeline]: 1.49e-06 [overlap_opt_shard_grad_in_pipeline]: 1.86998e-06 [control_data_broadcast_order]: 1.505e-05 [grouped_pairwise_exchange_alltoall]: 1.77999e-06 [offloading_packed_experts]: 4.17e-06 [overlap_recompute_and_grad_model_parallel]: 5.51e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.10999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.46e-06 [overlap_grad_ring_attention]: 4.33999e-06 [overlap_grad_flash_sp]: 2.111e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.27001e-06 [split_layernorm_comm]: 1.60999e-06 [handle_group_info]: 8.89995e-07 [symbol_engine_optimizer]: 7.95e-05, [1] [Cycle 1]: 7.538e-05, [6] [build]: 3.13e-06 [elim_shapecalc]: 1.047e-05 [elim_not_effective]: 1.471e-05 [opt_reshape]: 8.04002e-06 [fold_const_symbol]: 1.161e-05 [renormalize]: 1.8999e-07 [detach_backward]: 1.74998e-06 [pipeline_parallel_scheduler]: 1.48002e-06 [auto_monad_reorder]: 1.927e-05 [get_jit_bprop_graph]: 1.24e-06 [rewriter_after_jit_bprop_graph]: 3.41001e-06 [opt_after_jit_grad]: 0.00045245 [validate]: 3.934e-05 Sums bootstrap : 0.000415s : 3.92% type_inference : 0.005515s : 52.04% event_method : 0.000018s : 0.17% auto_monad : 0.000060s : 0.57% graph_reusing : 0.000006s : 0.06% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000030s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000024s : 0.23% optimize.rewriter_before_opt_a : 0.000083s : 0.78% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000056s : 0.53% optimize.opt_a.loop_unroll : 0.000040s : 0.38% optimize.opt_a.a_1 : 0.000877s : 8.27% optimize.opt_a.with_stream_mark : 0.000025s : 0.24% optimize.opt_a.recompute_prepare : 0.000018s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000197s : 1.86% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.15% optimize.opt_a.merge_send_recv : 0.000016s : 0.15% optimize.opt_a.auto_parallel : 0.000014s : 0.13% optimize.opt_a.parallel : 0.000022s : 0.21% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.17% optimize.opt_a.virtual_dataset : 0.000016s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.15% optimize.opt_a.virtual_output : 0.000015s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.23% optimize.opt_a.a_after_grad : 0.000022s : 0.21% optimize.opt_a.renormalize : 0.000540s : 5.10% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.23% optimize.opt_a.cse : 0.000053s : 0.50% optimize.opt_a.a_3 : 0.000104s : 0.98% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000038s : 0.36% optimize.convert_after_rewriter : 0.000008s : 0.08% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000457s : 4.31% optimize.opt_b.b_1 : 0.000155s : 1.46% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000021s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.22% optimize.loop_unroll : 0.000426s : 4.02% optimize.opt_after_cconv.c_1 : 0.000039s : 0.37% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.13% optimize.tuple_transform.d_1 : 0.000051s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000054s : 0.51% optimize.cse_after_recomputation.cse : 0.000014s : 0.14% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000019s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.03% opt_after_jit_grad : 0.000452s : 4.27% validate : 0.000039s : 0.37% Time group info: ------[substitution.] 0.000199 38 11.03% : 0.000022s : 3: substitution.cast_eliminate 1.08% : 0.000002s : 3: substitution.elim_not_effective 0.82% : 0.000002s : 3: substitution.fold_const_symbol 3.11% : 0.000006s : 5: substitution.graph_param_transform 69.94% : 0.000139s : 4: substitution.inline 2.10% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.18% : 0.000006s : 6: substitution.remove_not_recompute_node 2.15% : 0.000004s : 4: substitution.replace_old_param 6.58% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005462 2 87.76% : 0.004793s : 1: type_inference.infer 12.24% : 0.000669s : 1: type_inference.specialize ------[replace.] 0.000061 8 60.42% : 0.000037s : 4: replace.inline 39.58% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000147 8 92.49% : 0.000136s : 4: match.inline 7.51% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000246 1504 1.00% : 0.000002s : 15: predicate.accumulaten_eliminater 0.81% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.61% : 0.000001s : 10: predicate.addn_check_dump 0.92% : 0.000002s : 15: predicate.addn_zero_filter 0.88% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.06% : 0.000005s : 25: predicate.arithmetic_simplify 1.05% : 0.000003s : 15: predicate.cast_eliminate 0.59% : 0.000001s : 10: predicate.check_bprop_eliminate 0.63% : 0.000002s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.66% : 0.000002s : 10: predicate.depend_value_elim 1.00% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.10% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.96% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.02% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 5: predicate.elim_not_effective 0.38% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.16% : 0.000003s : 20: predicate.environ_get_depend_swap 1.83% : 0.000004s : 30: predicate.environ_get_eliminate 1.25% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.40% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.41% : 0.000006s : 23: predicate.float_depend_g_call 0.57% : 0.000001s : 10: predicate.float_environ_get_switch 0.83% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.80% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.61% : 0.000002s : 10: predicate.incorporate_call 0.51% : 0.000001s : 10: predicate.incorporate_call_switch 6.25% : 0.000015s : 68: predicate.inline 0.79% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.77% : 0.000002s : 10: predicate.less_batch_normalization 1.80% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.59% : 0.000006s : 44: predicate.load_eliminater 0.73% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.20% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.68% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.60% : 0.000001s : 10: predicate.merge_addn 0.57% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 15: predicate.minmaximum_grad 0.77% : 0.000002s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.36% : 0.000001s : 5: predicate.parallel_virtual_node 1.71% : 0.000004s : 23: predicate.partial_defer_inline 1.67% : 0.000004s : 24: predicate.partial_eliminate 0.90% : 0.000002s : 15: predicate.print_const_string_wrapper 0.60% : 0.000001s : 10: predicate.reduce_all_const_elim 1.21% : 0.000003s : 15: predicate.reduce_eliminate 2.70% : 0.000007s : 44: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 10: predicate.remove_not_recompute_node 1.34% : 0.000003s : 29: predicate.replace_applicator 0.42% : 0.000001s : 10: predicate.replace_old_param 0.26% : 0.000001s : 5: predicate.reset_defer_inline 1.10% : 0.000003s : 15: predicate.reshape_eliminate 0.66% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 5: predicate.row_tensor_eliminate 0.70% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.82% : 0.000002s : 10: predicate.shard_identity_eliminate 0.86% : 0.000002s : 10: predicate.special_op_eliminate 0.73% : 0.000002s : 10: predicate.specialize_transform 0.78% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.53% : 0.000004s : 23: predicate.switch_defer_inline 2.02% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.11% : 0.000013s : 74: predicate.switch_simplify 1.01% : 0.000002s : 15: predicate.tile_eliminate 1.01% : 0.000002s : 15: predicate.transpose_eliminate 1.53% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.04% : 0.000007s : 39: predicate.tuple_list_get_item_eliminator 1.57% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.78% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.48% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.09% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 5: predicate.value_based_eliminate 0.74% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 10: predicate.virtual_output_eliminate 0.35% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000479 11 55.94% : 0.000268s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.06% : 0.000211s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024440 192 0.01% : 0.000003s : 1: ForceFp32Comm 12.02% : 0.002937s : 1: add_attr 11.98% : 0.002928s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000058s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000066s : 1: auto_monad 0.09% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.81% : 0.000443s : 1: bootstrap 0.11% : 0.000027s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000027s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.10% : 0.000024s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000003s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.77% : 0.000433s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.90% : 0.000464s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 5.72% : 0.001398s : 78: opt.transform.opt_a 0.15% : 0.000037s : 1: opt.transform.opt_after_cconv 0.13% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.54% : 0.000133s : 28: opt.transform.opt_b 0.24% : 0.000058s : 2: opt.transform.opt_trans_graph 0.17% : 0.000041s : 4: opt.transform.symbol_engine_opt 11.46% : 0.002802s : 1: opt_a 0.48% : 0.000118s : 1: opt_after_cconv 1.89% : 0.000461s : 1: opt_after_jit_grad 0.99% : 0.000241s : 1: opt_b 19.88% : 0.004858s : 1: optimize 0.09% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000024s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000034s : 1: pre_auto_parallel 0.24% : 0.000058s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000018s : 1: remove_dup_value 1.17% : 0.000285s : 1: renormalize.infer 1.01% : 0.000248s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000042s : 1: rewriter_after_opt_a 0.36% : 0.000088s : 1: rewriter_before_opt_a 0.02% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000082s : 1: symbol_engine_optimizer 0.36% : 0.000087s : 1: tuple_transform 22.62% : 0.005529s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:55.211.015 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:55.211.282 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0148163, [21] [bootstrap]: 0.00041607 [type_inference]: 0.00535063 [event_method]: 1.747e-05 [auto_monad]: 5.988e-05 [graph_reusing]: 5.89e-06 [inline]: 1.99999e-06 [add_attr]: 0.00292039, [1] [add_attr_with_inline]: 0.00291272, [1] [Cycle 1]: 6.202e-05, [2] [tag_attr]: 1.782e-05 [meta_addattr_fg_expand]: 6.12999e-06 [parallel-infer-symbol]: 2.84999e-06 [pre_auto_parallel]: 2.966e-05 [insert-virtual-dataset]: 2.52001e-06 [parallel-infer-symbol-second]: 7.10017e-07 [dataset_repeat_opt]: 1.94e-06 [pipeline_split]: 1.91e-06 [optimize]: 0.00496127, [53] [py_interpret_to_execute]: 2.571e-05 [rewriter_before_opt_a]: 8.08e-05 [opt_a]: 0.00278869, [2] [Cycle 1]: 0.00191579, [45] [expand_dump_flag]: 3.13998e-06 [switch_simplify]: 3.968e-05 [loop_unroll]: 3.004e-05 [a_1]: 0.00059356 [with_stream_mark]: 1.335e-05 [recompute_prepare]: 8.32998e-06 [updatestate_depend_eliminate]: 3.66999e-06 [updatestate_assign_eliminate]: 3.38999e-06 [updatestate_loads_eliminate]: 3.01001e-06 [parameter_eliminate]: 2.09e-06 [a_2]: 0.00010893 [accelerated_algorithm]: 7.26999e-06 [shard]: 2.21998e-06 [meta_shard_fg_expand]: 1.72001e-06 [shard_inline]: 6.85998e-06 [merge_send_recv]: 8.79998e-06 [auto_parallel]: 6.03002e-06 [parallel]: 1.798e-05 [flash_sp]: 7.21001e-06 [merge_comm]: 3.88001e-06 [allreduce_fusion]: 3.61001e-06 [matmul_add_comm_reduction]: 9.01002e-06 [allreduce_slice_to_reducescatter]: 8.10018e-07 [virtual_shard_identity]: 7.65e-06 [virtual_dataset]: 6.81001e-06 [get_grad_eliminate_]: 6.39999e-06 [virtual_output]: 6.36e-06 [merge_forward]: 3.78001e-06 [cell_reuse_recompute_pass]: 1.54e-06 [offload_activation]: 9.84001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.412e-05 [merge_recompute_call_nodes]: 1.39998e-06 [before_grad]: 1.067e-05 [set_forward_comm_id_for_comm_node_pass]: 3.63999e-06 [meta_fg_expand]: 2.94001e-06 [flash_sp_send_recv_attached]: 2.39001e-06 [receive_attached]: 2.36e-06 [after_resolve]: 1.123e-05 [a_after_grad]: 9.92999e-06 [renormalize]: 0.00046343 [add_forward_monad_depend]: 5.10001e-06 [auto_monad_grad]: 1.86e-06 [auto_monad_eliminator]: 1.363e-05 [cse]: 1.656e-05 [a_3]: 5.907e-05 [Cycle 2]: 0.00086046, [45] [expand_dump_flag]: 1.03001e-06 [switch_simplify]: 7.35e-06 [loop_unroll]: 6.29999e-06 [a_1]: 0.00013345 [with_stream_mark]: 1.197e-05 [recompute_prepare]: 7.14001e-06 [updatestate_depend_eliminate]: 3.14001e-06 [updatestate_assign_eliminate]: 2.58998e-06 [updatestate_loads_eliminate]: 2.49001e-06 [parameter_eliminate]: 1.15001e-06 [a_2]: 0.00010827 [accelerated_algorithm]: 6.80998e-06 [shard]: 1.33002e-06 [meta_shard_fg_expand]: 1.40999e-06 [shard_inline]: 6.79999e-06 [merge_send_recv]: 5.29998e-06 [auto_parallel]: 5.30999e-06 [parallel]: 3.98001e-06 [flash_sp]: 3.52997e-06 [merge_comm]: 3.11999e-06 [allreduce_fusion]: 6.02001e-06 [matmul_add_comm_reduction]: 5.12999e-06 [allreduce_slice_to_reducescatter]: 3.19997e-07 [virtual_shard_identity]: 6.57002e-06 [virtual_dataset]: 6.41e-06 [get_grad_eliminate_]: 6.11e-06 [virtual_output]: 6.24999e-06 [merge_forward]: 2.89999e-06 [cell_reuse_recompute_pass]: 1.85001e-06 [offload_activation]: 6.38003e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.407e-05 [merge_recompute_call_nodes]: 8.50006e-07 [before_grad]: 9.92001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.66999e-06 [meta_fg_expand]: 2.06998e-06 [flash_sp_send_recv_attached]: 8.89995e-07 [receive_attached]: 1.05001e-06 [after_resolve]: 1.041e-05 [a_after_grad]: 9.57001e-06 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 1.37e-06 [auto_monad_grad]: 1.20999e-06 [auto_monad_eliminator]: 7.60998e-06 [cse]: 1.261e-05 [a_3]: 4.977e-05 [py_interpret_to_execute_after_opt_a]: 1.071e-05 [slice_cell_reuse_recomputed_activation]: 5.29998e-06 [rewriter_after_opt_a]: 3.449e-05 [convert_after_rewriter]: 9.64999e-06 [order_py_execute_after_rewriter]: 8.48999e-06 [mutable_eliminate]: 0.0004649 [opt_b]: 0.00027036, [1] [Cycle 1]: 0.0002613, [7] [b_1]: 0.0001656 [b_2]: 8.33001e-06 [updatestate_depend_eliminate]: 5.23002e-06 [updatestate_assign_eliminate]: 2.81e-06 [updatestate_loads_eliminate]: 2.68998e-06 [renormalize]: 4.39992e-07 [cse]: 1.753e-05 [optimize_parallel_all_gather_comm]: 1.87e-05 [overlap_param_gather]: 5.17e-06 [cconv]: 2.551e-05 [loop_unroll]: 0.00042829 [opt_after_cconv]: 0.00012364, [1] [Cycle 1]: 0.00011485, [7] [c_1]: 3.019e-05 [parameter_eliminate]: 2.45002e-06 [updatestate_depend_eliminate]: 5.25999e-06 [updatestate_assign_eliminate]: 2.56e-06 [updatestate_loads_eliminate]: 2.46e-06 [cse]: 1.66e-05 [renormalize]: 4.60015e-07 [remove_dup_value]: 1.66e-05 [tuple_transform]: 8.914e-05, [1] [Cycle 1]: 8.216e-05, [4] [d_1]: 4.298e-05 [none_parameter_eliminate]: 1.70001e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 7.37002e-06 [partial_unused_args_eliminate]: 4.80001e-06 [add_recomputation]: 4.671e-05 [cse_after_recomputation]: 2.662e-05, [1] [Cycle 1]: 1.947e-05, [1] [cse]: 1.03e-05 [environ_conv]: 7.53e-06 [swap_dp_allreduce_reducescatter]: 7.8e-06 [bias_add_comm_swap]: 4.67998e-06 [label_micro_interleaved_index]: 6.81001e-06 [label_fine_grained_interleaved_index]: 5.07e-06 [merge_cast_opt]: 4.30999e-06 [slice_recompute_activation]: 4.30999e-06 [micro_interleaved_order_control]: 4.85001e-06 [assign_add_opt]: 3.64002e-06 [ForceFp32Comm]: 3.14999e-06 [remove_cast_before_assign_add]: 3.79002e-06 [full_micro_interleaved_order_control]: 4.35e-06 [reorder_send_recv_between_fp_bp]: 5.10999e-06 [comm_op_add_attrs]: 3.75998e-06 [add_comm_op_reuse_tag]: 3.14999e-06 [interleave_split_concat_branches]: 3.61999e-06 [interleave_parallel_branches]: 3.78999e-06 [overlap_opt_shard_in_pipeline]: 3.90998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.35999e-06 [control_data_broadcast_order]: 1.482e-05 [grouped_pairwise_exchange_alltoall]: 4.33001e-06 [offloading_packed_experts]: 6.07999e-06 [overlap_recompute_and_grad_model_parallel]: 7.12002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.57002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.95e-06 [overlap_recompute_comm]: 4.66002e-06 [overlap_grad_ring_attention]: 6.46999e-06 [overlap_grad_flash_sp]: 1.815e-05 [begin_end_overlap_inline]: 3.22002e-06 [split_matmul_comm_elemetwise]: 4.65001e-06 [split_layernorm_comm]: 4.05e-06 [handle_group_info]: 3.37002e-06 [symbol_engine_optimizer]: 9.952e-05, [1] [Cycle 1]: 9.253e-05, [6] [build]: 2.79001e-06 [elim_shapecalc]: 1.016e-05 [elim_not_effective]: 1.374e-05 [opt_reshape]: 7.44002e-06 [fold_const_symbol]: 1.074e-05 [renormalize]: 2.9002e-07 [detach_backward]: 2.91999e-06 [pipeline_parallel_scheduler]: 1.91e-06 [auto_monad_reorder]: 1.751e-05 [get_jit_bprop_graph]: 9.50007e-07 [rewriter_after_jit_bprop_graph]: 3.73001e-06 [opt_after_jit_grad]: 0.00046959 [validate]: 3.454e-05 Sums bootstrap : 0.000416s : 4.07% type_inference : 0.005351s : 52.37% event_method : 0.000017s : 0.17% auto_monad : 0.000060s : 0.59% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000030s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000026s : 0.25% optimize.rewriter_before_opt_a : 0.000081s : 0.79% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000047s : 0.46% optimize.opt_a.loop_unroll : 0.000036s : 0.36% optimize.opt_a.a_1 : 0.000727s : 7.12% optimize.opt_a.with_stream_mark : 0.000025s : 0.25% optimize.opt_a.recompute_prepare : 0.000015s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000217s : 2.13% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.14% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.14% optimize.opt_a.auto_parallel : 0.000011s : 0.11% optimize.opt_a.parallel : 0.000022s : 0.21% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000010s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.14% optimize.opt_a.virtual_dataset : 0.000013s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.12% optimize.opt_a.virtual_output : 0.000013s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000022s : 0.21% optimize.opt_a.a_after_grad : 0.000020s : 0.19% optimize.opt_a.renormalize : 0.000464s : 4.54% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.21% optimize.opt_a.cse : 0.000029s : 0.29% optimize.opt_a.a_3 : 0.000109s : 1.07% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000034s : 0.34% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000465s : 4.55% optimize.opt_b.b_1 : 0.000166s : 1.62% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.17% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000026s : 0.25% optimize.loop_unroll : 0.000428s : 4.19% optimize.opt_after_cconv.c_1 : 0.000030s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.16% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.16% optimize.tuple_transform.d_1 : 0.000043s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000047s : 0.46% optimize.cse_after_recomputation.cse : 0.000010s : 0.10% optimize.environ_conv : 0.000008s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000015s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000006s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000006s : 0.06% optimize.overlap_grad_flash_sp : 0.000018s : 0.18% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.17% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000470s : 4.60% validate : 0.000035s : 0.34% Time group info: ------[substitution.] 0.000161 28 1.13% : 0.000002s : 2: substitution.elim_not_effective 0.93% : 0.000001s : 2: substitution.fold_const_symbol 3.54% : 0.000006s : 4: substitution.graph_param_transform 77.76% : 0.000125s : 4: substitution.inline 2.18% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.87% : 0.000005s : 4: substitution.remove_not_recompute_node 2.61% : 0.000004s : 4: substitution.replace_old_param 8.98% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005306 2 87.80% : 0.004659s : 1: type_inference.infer 12.20% : 0.000648s : 1: type_inference.specialize ------[replace.] 0.000057 8 62.70% : 0.000036s : 4: replace.inline 37.30% : 0.000021s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000135 8 90.70% : 0.000123s : 4: match.inline 9.30% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000202 1278 0.89% : 0.000002s : 13: predicate.accumulaten_eliminater 0.75% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 8: predicate.addn_check_dump 1.01% : 0.000002s : 13: predicate.addn_zero_filter 0.77% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.00% : 0.000004s : 21: predicate.arithmetic_simplify 0.96% : 0.000002s : 13: predicate.cast_eliminate 0.61% : 0.000001s : 8: predicate.check_bprop_eliminate 0.62% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.64% : 0.000001s : 8: predicate.depend_value_elim 0.96% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.02% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.44% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.11% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.12% : 0.000002s : 17: predicate.environ_get_depend_swap 1.71% : 0.000003s : 25: predicate.environ_get_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.51% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.52% : 0.000005s : 21: predicate.float_depend_g_call 0.59% : 0.000001s : 8: predicate.float_environ_get_switch 0.82% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.66% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.68% : 0.000001s : 8: predicate.incorporate_call 0.52% : 0.000001s : 8: predicate.incorporate_call_switch 6.22% : 0.000013s : 58: predicate.inline 0.79% : 0.000002s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 8: predicate.less_batch_normalization 1.80% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.47% : 0.000005s : 38: predicate.load_eliminater 0.92% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.34% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.64% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 13: predicate.minmaximum_grad 1.07% : 0.000002s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 1.79% : 0.000004s : 21: predicate.partial_defer_inline 1.68% : 0.000003s : 21: predicate.partial_eliminate 0.95% : 0.000002s : 13: predicate.print_const_string_wrapper 0.61% : 0.000001s : 8: predicate.reduce_all_const_elim 1.13% : 0.000002s : 13: predicate.reduce_eliminate 2.46% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 8: predicate.remove_not_recompute_node 1.42% : 0.000003s : 25: predicate.replace_applicator 0.51% : 0.000001s : 8: predicate.replace_old_param 0.27% : 0.000001s : 4: predicate.reset_defer_inline 0.92% : 0.000002s : 13: predicate.reshape_eliminate 0.60% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.74% : 0.000001s : 8: predicate.same_eliminate 0.47% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.74% : 0.000001s : 8: predicate.shard_identity_eliminate 0.68% : 0.000001s : 8: predicate.special_op_eliminate 0.72% : 0.000001s : 8: predicate.specialize_transform 0.78% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.55% : 0.000003s : 21: predicate.switch_defer_inline 2.06% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.15% : 0.000010s : 67: predicate.switch_simplify 0.84% : 0.000002s : 13: predicate.tile_eliminate 1.05% : 0.000002s : 13: predicate.transpose_eliminate 1.46% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.41% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.47% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.37% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.79% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.43% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.24% : 0.000007s : 46: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 4: predicate.value_based_eliminate 0.64% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.65% : 0.000001s : 8: predicate.virtual_output_eliminate 0.33% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000469 11 56.44% : 0.000265s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.56% : 0.000204s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024463 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.97% : 0.002928s : 1: add_attr 11.92% : 0.002916s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000050s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.28% : 0.000068s : 1: auto_monad 0.10% : 0.000025s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000007s : 1: bias_add_comm_swap 1.87% : 0.000458s : 1: bootstrap 0.12% : 0.000029s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.12% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000017s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.11% : 0.000027s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000007s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.77% : 0.000434s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.92% : 0.000471s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.05% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 4.70% : 0.001150s : 78: opt.transform.opt_a 0.12% : 0.000029s : 1: opt.transform.opt_after_cconv 0.11% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.41% : 0.000101s : 28: opt.transform.opt_b 0.20% : 0.000048s : 2: opt.transform.opt_trans_graph 0.16% : 0.000038s : 4: opt.transform.symbol_engine_opt 11.41% : 0.002792s : 1: opt_a 0.52% : 0.000127s : 1: opt_after_cconv 1.96% : 0.000480s : 1: opt_after_jit_grad 1.12% : 0.000274s : 1: opt_b 21.44% : 0.005246s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000021s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000037s : 1: pre_auto_parallel 0.12% : 0.000029s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000020s : 1: remove_dup_value 0.94% : 0.000230s : 1: renormalize.infer 0.93% : 0.000226s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000038s : 1: rewriter_after_opt_a 0.34% : 0.000084s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000103s : 1: symbol_engine_optimizer 0.38% : 0.000092s : 1: tuple_transform 21.99% : 0.005380s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:55.609.828 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0310181, [21] [bootstrap]: 0.00042242 [type_inference]: 0.022117 [event_method]: 1.889e-05 [auto_monad]: 6.345e-05 [graph_reusing]: 6.11998e-06 [inline]: 2.54001e-06 [add_attr]: 0.00318414, [1] [add_attr_with_inline]: 0.00317486, [1] [Cycle 1]: 6.333e-05, [2] [tag_attr]: 1.926e-05 [meta_addattr_fg_expand]: 6.16e-06 [parallel-infer-symbol]: 3.01999e-06 [pre_auto_parallel]: 3.255e-05 [insert-virtual-dataset]: 2.36e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 2.06998e-06 [pipeline_split]: 1.63002e-06 [optimize]: 0.00451783, [53] [py_interpret_to_execute]: 2.592e-05 [rewriter_before_opt_a]: 8.039e-05 [opt_a]: 0.00259143, [2] [Cycle 1]: 0.00192757, [45] [expand_dump_flag]: 3.14999e-06 [switch_simplify]: 4.38e-05 [loop_unroll]: 3.065e-05 [a_1]: 0.00061551 [with_stream_mark]: 1.631e-05 [recompute_prepare]: 1.001e-05 [updatestate_depend_eliminate]: 4.47998e-06 [updatestate_assign_eliminate]: 3.41001e-06 [updatestate_loads_eliminate]: 3.27002e-06 [parameter_eliminate]: 2.02999e-06 [a_2]: 8.155e-05 [accelerated_algorithm]: 6.88e-06 [shard]: 1.97001e-06 [meta_shard_fg_expand]: 2.16e-06 [shard_inline]: 6.53e-06 [merge_send_recv]: 7.61001e-06 [auto_parallel]: 6.35997e-06 [parallel]: 1.835e-05 [flash_sp]: 8.32e-06 [merge_comm]: 4.17e-06 [allreduce_fusion]: 3.71999e-06 [matmul_add_comm_reduction]: 9.87001e-06 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 7.77e-06 [virtual_dataset]: 6.84001e-06 [get_grad_eliminate_]: 6.68e-06 [virtual_output]: 6.27001e-06 [merge_forward]: 3.67002e-06 [cell_reuse_recompute_pass]: 1.15999e-06 [offload_activation]: 1.02e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.229e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.04e-05 [set_forward_comm_id_for_comm_node_pass]: 3.48e-06 [meta_fg_expand]: 2.98998e-06 [flash_sp_send_recv_attached]: 2.49001e-06 [receive_attached]: 2.62001e-06 [after_resolve]: 1.138e-05 [a_after_grad]: 9.87001e-06 [renormalize]: 0.00059465 [add_forward_monad_depend]: 5.49e-06 [auto_monad_grad]: 2.16998e-06 [auto_monad_eliminator]: 1.42e-05 [cse]: 2.656e-05 [a_3]: 4.515e-05 [Cycle 2]: 0.00065437, [45] [expand_dump_flag]: 9.20001e-07 [switch_simplify]: 7.48e-06 [loop_unroll]: 6.07001e-06 [a_1]: 0.00013718 [with_stream_mark]: 1.051e-05 [recompute_prepare]: 6.36e-06 [updatestate_depend_eliminate]: 2.98998e-06 [updatestate_assign_eliminate]: 2.16998e-06 [updatestate_loads_eliminate]: 2.30002e-06 [parameter_eliminate]: 9.5999e-07 [a_2]: 7.278e-05 [accelerated_algorithm]: 6.83e-06 [shard]: 1.48002e-06 [meta_shard_fg_expand]: 1.32e-06 [shard_inline]: 6.84001e-06 [merge_send_recv]: 4.79e-06 [auto_parallel]: 5.35001e-06 [parallel]: 4.48999e-06 [flash_sp]: 3.91999e-06 [merge_comm]: 3.2e-06 [allreduce_fusion]: 3.26001e-06 [matmul_add_comm_reduction]: 5.45001e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 6.71e-06 [virtual_dataset]: 5.97999e-06 [get_grad_eliminate_]: 5.72999e-06 [virtual_output]: 5.60001e-06 [merge_forward]: 2.59999e-06 [cell_reuse_recompute_pass]: 1.46002e-06 [offload_activation]: 6.27001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.237e-05 [merge_recompute_call_nodes]: 7.39994e-07 [before_grad]: 8.68001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.09001e-06 [meta_fg_expand]: 2.21998e-06 [flash_sp_send_recv_attached]: 8.89995e-07 [receive_attached]: 1.10001e-06 [after_resolve]: 1.154e-05 [a_after_grad]: 1.228e-05 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 1.14e-06 [auto_monad_grad]: 8.70001e-07 [auto_monad_eliminator]: 7.06999e-06 [cse]: 1.223e-05 [a_3]: 3.616e-05 [py_interpret_to_execute_after_opt_a]: 8.45999e-06 [slice_cell_reuse_recomputed_activation]: 2.01998e-06 [rewriter_after_opt_a]: 3.277e-05 [convert_after_rewriter]: 6.26e-06 [order_py_execute_after_rewriter]: 4.90999e-06 [mutable_eliminate]: 0.00047823 [opt_b]: 0.00020287, [1] [Cycle 1]: 0.00019681, [7] [b_1]: 0.00012725 [b_2]: 7.65e-06 [updatestate_depend_eliminate]: 5.07e-06 [updatestate_assign_eliminate]: 2.29999e-06 [updatestate_loads_eliminate]: 2.34999e-06 [renormalize]: 9.09989e-07 [cse]: 1.632e-05 [optimize_parallel_all_gather_comm]: 1.586e-05 [overlap_param_gather]: 2.11e-06 [cconv]: 2.244e-05 [loop_unroll]: 0.00041154 [opt_after_cconv]: 9.995e-05, [1] [Cycle 1]: 9.429e-05, [7] [c_1]: 3.057e-05 [parameter_eliminate]: 3.03998e-06 [updatestate_depend_eliminate]: 5.04998e-06 [updatestate_assign_eliminate]: 2.45002e-06 [updatestate_loads_eliminate]: 2.34999e-06 [cse]: 1.579e-05 [renormalize]: 3.19997e-07 [remove_dup_value]: 1.335e-05 [tuple_transform]: 7.324e-05, [1] [Cycle 1]: 6.897e-05, [4] [d_1]: 4.222e-05 [none_parameter_eliminate]: 1.59e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 6.78e-06 [partial_unused_args_eliminate]: 1.71998e-06 [add_recomputation]: 4.456e-05 [cse_after_recomputation]: 1.989e-05, [1] [Cycle 1]: 1.546e-05, [1] [cse]: 1.019e-05 [environ_conv]: 4.38999e-06 [swap_dp_allreduce_reducescatter]: 4.68999e-06 [bias_add_comm_swap]: 2.35002e-06 [label_micro_interleaved_index]: 4.22e-06 [label_fine_grained_interleaved_index]: 2.57001e-06 [merge_cast_opt]: 1.55001e-06 [slice_recompute_activation]: 2.11e-06 [micro_interleaved_order_control]: 2.37001e-06 [assign_add_opt]: 1.15999e-06 [ForceFp32Comm]: 1.25001e-06 [remove_cast_before_assign_add]: 1.00999e-06 [full_micro_interleaved_order_control]: 1.96998e-06 [reorder_send_recv_between_fp_bp]: 2.66e-06 [comm_op_add_attrs]: 9.60019e-07 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.11997e-06 [interleave_parallel_branches]: 1.00001e-06 [overlap_opt_shard_in_pipeline]: 1.16002e-06 [overlap_opt_shard_grad_in_pipeline]: 2.04999e-06 [control_data_broadcast_order]: 1.175e-05 [grouped_pairwise_exchange_alltoall]: 1.74e-06 [offloading_packed_experts]: 3.43999e-06 [overlap_recompute_and_grad_model_parallel]: 4.44002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.69998e-06 [overlap_recompute_comm]: 2.34001e-06 [overlap_grad_ring_attention]: 3.94002e-06 [overlap_grad_flash_sp]: 1.844e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.24999e-06 [split_layernorm_comm]: 1.59998e-06 [handle_group_info]: 1.20999e-06 [symbol_engine_optimizer]: 7.391e-05, [1] [Cycle 1]: 6.999e-05, [6] [build]: 2.81e-06 [elim_shapecalc]: 9.24e-06 [elim_not_effective]: 1.287e-05 [opt_reshape]: 6.81999e-06 [fold_const_symbol]: 1.013e-05 [renormalize]: 1.59984e-07 [detach_backward]: 1.99999e-06 [pipeline_parallel_scheduler]: 1.47001e-06 [auto_monad_reorder]: 1.558e-05 [get_jit_bprop_graph]: 1.40001e-06 [rewriter_after_jit_bprop_graph]: 3.27002e-06 [opt_after_jit_grad]: 0.00043658 [validate]: 3.409e-05 Sums bootstrap : 0.000422s : 1.57% type_inference : 0.022117s : 82.27% event_method : 0.000019s : 0.07% auto_monad : 0.000063s : 0.24% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000033s : 0.12% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000026s : 0.10% optimize.rewriter_before_opt_a : 0.000080s : 0.30% optimize.opt_a.expand_dump_flag : 0.000004s : 0.02% optimize.opt_a.switch_simplify : 0.000051s : 0.19% optimize.opt_a.loop_unroll : 0.000037s : 0.14% optimize.opt_a.a_1 : 0.000753s : 2.80% optimize.opt_a.with_stream_mark : 0.000027s : 0.10% optimize.opt_a.recompute_prepare : 0.000016s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000154s : 0.57% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.05% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.05% optimize.opt_a.merge_send_recv : 0.000012s : 0.05% optimize.opt_a.auto_parallel : 0.000012s : 0.04% optimize.opt_a.parallel : 0.000023s : 0.08% optimize.opt_a.flash_sp : 0.000012s : 0.05% optimize.opt_a.merge_comm : 0.000007s : 0.03% optimize.opt_a.allreduce_fusion : 0.000007s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.05% optimize.opt_a.virtual_dataset : 0.000013s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.05% optimize.opt_a.virtual_output : 0.000012s : 0.04% optimize.opt_a.merge_forward : 0.000006s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000016s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000019s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000023s : 0.09% optimize.opt_a.a_after_grad : 0.000022s : 0.08% optimize.opt_a.renormalize : 0.000595s : 2.21% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.08% optimize.opt_a.cse : 0.000039s : 0.14% optimize.opt_a.a_3 : 0.000081s : 0.30% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000033s : 0.12% optimize.convert_after_rewriter : 0.000006s : 0.02% optimize.order_py_execute_after_rewriter : 0.000005s : 0.02% optimize.mutable_eliminate : 0.000478s : 1.78% optimize.opt_b.b_1 : 0.000127s : 0.47% optimize.opt_b.b_2 : 0.000008s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000016s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000022s : 0.08% optimize.loop_unroll : 0.000412s : 1.53% optimize.opt_after_cconv.c_1 : 0.000031s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000016s : 0.06% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.05% optimize.tuple_transform.d_1 : 0.000042s : 0.16% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000045s : 0.17% optimize.cse_after_recomputation.cse : 0.000010s : 0.04% optimize.environ_conv : 0.000004s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.02% optimize.bias_add_comm_swap : 0.000002s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000012s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000003s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000018s : 0.07% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000016s : 0.06% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.01% opt_after_jit_grad : 0.000437s : 1.62% validate : 0.000034s : 0.13% Time group info: ------[substitution.] 0.000173 28 1.26% : 0.000002s : 2: substitution.elim_not_effective 0.81% : 0.000001s : 2: substitution.fold_const_symbol 3.37% : 0.000006s : 4: substitution.graph_param_transform 78.81% : 0.000136s : 4: substitution.inline 1.77% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.73% : 0.000005s : 4: substitution.remove_not_recompute_node 2.64% : 0.000005s : 4: substitution.replace_old_param 8.61% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.022051 2 96.74% : 0.021331s : 1: type_inference.infer 3.26% : 0.000720s : 1: type_inference.specialize ------[replace.] 0.000061 8 61.20% : 0.000037s : 4: replace.inline 38.80% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000147 8 91.15% : 0.000134s : 4: match.inline 8.85% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000202 1278 0.95% : 0.000002s : 13: predicate.accumulaten_eliminater 0.69% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 8: predicate.addn_check_dump 0.86% : 0.000002s : 13: predicate.addn_zero_filter 0.82% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.19% : 0.000004s : 21: predicate.arithmetic_simplify 1.12% : 0.000002s : 13: predicate.cast_eliminate 0.59% : 0.000001s : 8: predicate.check_bprop_eliminate 0.51% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.70% : 0.000001s : 8: predicate.depend_value_elim 0.97% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.23% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.35% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.12% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_depend_swap 1.85% : 0.000004s : 25: predicate.environ_get_eliminate 1.14% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.49% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.58% : 0.000005s : 21: predicate.float_depend_g_call 0.54% : 0.000001s : 8: predicate.float_environ_get_switch 0.75% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.63% : 0.000001s : 8: predicate.get_grad_eliminate 0.22% : 0.000000s : 4: predicate.graph_param_transform 0.59% : 0.000001s : 8: predicate.incorporate_call 0.47% : 0.000001s : 8: predicate.incorporate_call_switch 6.57% : 0.000013s : 58: predicate.inline 0.82% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.77% : 0.000002s : 8: predicate.less_batch_normalization 1.88% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.52% : 0.000005s : 38: predicate.load_eliminater 0.88% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.33% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.51% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 8: predicate.merge_addn 0.53% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.70% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 13: predicate.minmaximum_grad 0.91% : 0.000002s : 4: predicate.mutable_eliminate 0.34% : 0.000001s : 4: predicate.opt_reshape 0.48% : 0.000001s : 4: predicate.parallel_virtual_node 1.88% : 0.000004s : 21: predicate.partial_defer_inline 1.67% : 0.000003s : 21: predicate.partial_eliminate 0.87% : 0.000002s : 13: predicate.print_const_string_wrapper 0.57% : 0.000001s : 8: predicate.reduce_all_const_elim 1.26% : 0.000003s : 13: predicate.reduce_eliminate 2.51% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 8: predicate.remove_not_recompute_node 1.37% : 0.000003s : 25: predicate.replace_applicator 0.54% : 0.000001s : 8: predicate.replace_old_param 0.29% : 0.000001s : 4: predicate.reset_defer_inline 0.90% : 0.000002s : 13: predicate.reshape_eliminate 0.57% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 4: predicate.row_tensor_eliminate 0.69% : 0.000001s : 8: predicate.same_eliminate 0.49% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.77% : 0.000002s : 8: predicate.shard_identity_eliminate 0.75% : 0.000002s : 8: predicate.special_op_eliminate 0.73% : 0.000001s : 8: predicate.specialize_transform 0.82% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.61% : 0.000003s : 21: predicate.switch_defer_inline 2.13% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.10% : 0.000010s : 67: predicate.switch_simplify 0.97% : 0.000002s : 13: predicate.tile_eliminate 0.94% : 0.000002s : 13: predicate.transpose_eliminate 1.52% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.20% : 0.000006s : 33: predicate.tuple_list_get_item_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.15% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.70% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.46% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.18% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 4: predicate.value_based_eliminate 0.69% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 8: predicate.virtual_output_eliminate 0.29% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000546 11 56.04% : 0.000306s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.96% : 0.000240s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.040630 192 0.01% : 0.000004s : 1: ForceFp32Comm 7.85% : 0.003189s : 1: add_attr 7.82% : 0.003179s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.12% : 0.000048s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.17% : 0.000069s : 1: auto_monad 0.05% : 0.000019s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 1.11% : 0.000451s : 1: bootstrap 0.06% : 0.000026s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000015s : 1: control_data_broadcast_order 0.02% : 0.000009s : 1: convert_after_rewriter 0.06% : 0.000023s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000007s : 1: environ_conv 0.06% : 0.000026s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.03% : 0.000419s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.20% : 0.000486s : 1: mutable_eliminate 0.02% : 0.000006s : 1: offloading_packed_experts 0.03% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000014s : 1: opt.transform.mutable_eliminate 2.88% : 0.001171s : 78: opt.transform.opt_a 0.07% : 0.000029s : 1: opt.transform.opt_after_cconv 0.06% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.25% : 0.000103s : 28: opt.transform.opt_b 0.12% : 0.000047s : 2: opt.transform.opt_trans_graph 0.09% : 0.000036s : 4: opt.transform.symbol_engine_opt 6.38% : 0.002594s : 1: opt_a 0.25% : 0.000103s : 1: opt_after_cconv 1.10% : 0.000445s : 1: opt_after_jit_grad 0.51% : 0.000206s : 1: opt_b 11.13% : 0.004522s : 1: optimize 0.05% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000008s : 1: order_py_execute_after_rewriter 0.05% : 0.000022s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000004s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.09% : 0.000037s : 1: pre_auto_parallel 0.07% : 0.000030s : 1: py_interpret_to_execute 0.03% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000017s : 1: remove_dup_value 0.78% : 0.000315s : 1: renormalize.infer 0.67% : 0.000272s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000036s : 1: rewriter_after_opt_a 0.21% : 0.000085s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000007s : 1: swap_dp_allreduce_reducescatter 0.19% : 0.000077s : 1: symbol_engine_optimizer 0.19% : 0.000076s : 1: tuple_transform 54.48% : 0.022136s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:56.280.91 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:56.283.61 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0162913, [21] [bootstrap]: 0.00041167 [type_inference]: 0.00554424 [event_method]: 1.957e-05 [auto_monad]: 6.289e-05 [graph_reusing]: 6.19001e-06 [inline]: 2.25002e-06 [add_attr]: 0.00308563, [1] [add_attr_with_inline]: 0.00307725, [1] [Cycle 1]: 7.285e-05, [2] [tag_attr]: 1.947e-05 [meta_addattr_fg_expand]: 6.42001e-06 [parallel-infer-symbol]: 3.12002e-06 [pre_auto_parallel]: 3.34e-05 [insert-virtual-dataset]: 2.39999e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 2.39001e-06 [pipeline_split]: 1.54998e-06 [optimize]: 0.00597232, [53] [py_interpret_to_execute]: 3.083e-05 [rewriter_before_opt_a]: 8.757e-05 [opt_a]: 0.00350831, [2] [Cycle 1]: 0.0025425, [45] [expand_dump_flag]: 3.21999e-06 [switch_simplify]: 4.24e-05 [loop_unroll]: 3.121e-05 [a_1]: 0.00073341 [with_stream_mark]: 1.593e-05 [recompute_prepare]: 1.098e-05 [updatestate_depend_eliminate]: 5.30999e-06 [updatestate_assign_eliminate]: 4.07998e-06 [updatestate_loads_eliminate]: 3.92998e-06 [parameter_eliminate]: 2.14e-06 [a_2]: 0.00013093 [accelerated_algorithm]: 8.38001e-06 [shard]: 1.76003e-06 [meta_shard_fg_expand]: 1.98002e-06 [shard_inline]: 7.55e-06 [merge_send_recv]: 9.54e-06 [auto_parallel]: 7.51001e-06 [parallel]: 1.982e-05 [flash_sp]: 8.56002e-06 [merge_comm]: 4.84003e-06 [allreduce_fusion]: 4.08001e-06 [matmul_add_comm_reduction]: 1.082e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 8.74e-06 [virtual_dataset]: 8e-06 [get_grad_eliminate_]: 1.086e-05 [virtual_output]: 8.64e-06 [merge_forward]: 5.81e-06 [cell_reuse_recompute_pass]: 1.32999e-06 [offload_activation]: 1.201e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.899e-05 [merge_recompute_call_nodes]: 1.62001e-06 [before_grad]: 1.327e-05 [set_forward_comm_id_for_comm_node_pass]: 5.29998e-06 [meta_fg_expand]: 3.58e-06 [flash_sp_send_recv_attached]: 3.23e-06 [receive_attached]: 2.49999e-06 [after_resolve]: 1.333e-05 [a_after_grad]: 1.241e-05 [renormalize]: 0.00065292 [add_forward_monad_depend]: 6.35002e-06 [auto_monad_grad]: 2.55002e-06 [auto_monad_eliminator]: 1.772e-05 [cse]: 3.631e-05 [a_3]: 7.633e-05 [Cycle 2]: 0.00095196, [45] [expand_dump_flag]: 1.20001e-06 [switch_simplify]: 1.027e-05 [loop_unroll]: 8.14002e-06 [a_1]: 0.00018568 [with_stream_mark]: 1.435e-05 [recompute_prepare]: 8.60001e-06 [updatestate_depend_eliminate]: 4.16001e-06 [updatestate_assign_eliminate]: 2.88998e-06 [updatestate_loads_eliminate]: 2.98998e-06 [parameter_eliminate]: 1.14e-06 [a_2]: 0.00012272 [accelerated_algorithm]: 7.71001e-06 [shard]: 1.94999e-06 [meta_shard_fg_expand]: 1.94999e-06 [shard_inline]: 7.65e-06 [merge_send_recv]: 6.02001e-06 [auto_parallel]: 6.54999e-06 [parallel]: 5.89e-06 [flash_sp]: 4.25e-06 [merge_comm]: 4.08001e-06 [allreduce_fusion]: 4.52998e-06 [matmul_add_comm_reduction]: 7.22997e-06 [allreduce_slice_to_reducescatter]: 9.20001e-07 [virtual_shard_identity]: 8.48999e-06 [virtual_dataset]: 7.23e-06 [get_grad_eliminate_]: 6.83e-06 [virtual_output]: 6.96001e-06 [merge_forward]: 3.6e-06 [cell_reuse_recompute_pass]: 1.29998e-06 [offload_activation]: 8.01001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.605e-05 [merge_recompute_call_nodes]: 1.52999e-06 [before_grad]: 1.183e-05 [set_forward_comm_id_for_comm_node_pass]: 4.25e-06 [meta_fg_expand]: 2.70002e-06 [flash_sp_send_recv_attached]: 9.50007e-07 [receive_attached]: 1.05001e-06 [after_resolve]: 1.087e-05 [a_after_grad]: 1.162e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.99e-06 [auto_monad_grad]: 1.42e-06 [auto_monad_eliminator]: 8.93002e-06 [cse]: 2.094e-05 [a_3]: 6.019e-05 [py_interpret_to_execute_after_opt_a]: 1.34e-05 [slice_cell_reuse_recomputed_activation]: 5.09e-06 [rewriter_after_opt_a]: 4.628e-05 [convert_after_rewriter]: 1.079e-05 [order_py_execute_after_rewriter]: 9.37001e-06 [mutable_eliminate]: 0.00050444 [opt_b]: 0.00031864, [1] [Cycle 1]: 0.00030905, [7] [b_1]: 0.0002029 [b_2]: 9.62999e-06 [updatestate_depend_eliminate]: 6.81999e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 3.28e-06 [renormalize]: 5.89993e-07 [cse]: 2.437e-05 [optimize_parallel_all_gather_comm]: 2.169e-05 [overlap_param_gather]: 5.15001e-06 [cconv]: 2.87e-05 [loop_unroll]: 0.00045405 [opt_after_cconv]: 0.00015065, [1] [Cycle 1]: 0.00014129, [7] [c_1]: 4.18e-05 [parameter_eliminate]: 2.89001e-06 [updatestate_depend_eliminate]: 6.81999e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 2.99999e-06 [cse]: 2.297e-05 [renormalize]: 4.10015e-07 [remove_dup_value]: 1.815e-05 [tuple_transform]: 0.00010978, [1] [Cycle 1]: 0.00010204, [4] [d_1]: 5.789e-05 [none_parameter_eliminate]: 1.67001e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 9.49e-06 [partial_unused_args_eliminate]: 4.78001e-06 [add_recomputation]: 7.714e-05 [cse_after_recomputation]: 3.586e-05, [1] [Cycle 1]: 2.811e-05, [1] [cse]: 1.746e-05 [environ_conv]: 9.67001e-06 [swap_dp_allreduce_reducescatter]: 9.39998e-06 [bias_add_comm_swap]: 5.14e-06 [label_micro_interleaved_index]: 7.87e-06 [label_fine_grained_interleaved_index]: 5.22e-06 [merge_cast_opt]: 3.66001e-06 [slice_recompute_activation]: 4.62e-06 [micro_interleaved_order_control]: 4.99e-06 [assign_add_opt]: 3.76999e-06 [ForceFp32Comm]: 3.84002e-06 [remove_cast_before_assign_add]: 3.66999e-06 [full_micro_interleaved_order_control]: 4.74e-06 [reorder_send_recv_between_fp_bp]: 5.37001e-06 [comm_op_add_attrs]: 4.58999e-06 [add_comm_op_reuse_tag]: 3.57997e-06 [interleave_split_concat_branches]: 3.78999e-06 [interleave_parallel_branches]: 3.63e-06 [overlap_opt_shard_in_pipeline]: 4.12e-06 [overlap_opt_shard_grad_in_pipeline]: 4.66002e-06 [control_data_broadcast_order]: 1.881e-05 [grouped_pairwise_exchange_alltoall]: 4.62e-06 [offloading_packed_experts]: 7.64002e-06 [overlap_recompute_and_grad_model_parallel]: 8.26002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.9e-06 [overlap_recompute_allgather_and_fa_grad]: 3.72002e-06 [overlap_recompute_comm]: 5.29e-06 [overlap_grad_ring_attention]: 7.78001e-06 [overlap_grad_flash_sp]: 2.517e-05 [begin_end_overlap_inline]: 3.21999e-06 [split_matmul_comm_elemetwise]: 4.92999e-06 [split_layernorm_comm]: 4.70001e-06 [handle_group_info]: 3.51999e-06 [symbol_engine_optimizer]: 0.00010924, [1] [Cycle 1]: 0.00010196, [6] [build]: 3.08998e-06 [elim_shapecalc]: 1.282e-05 [elim_not_effective]: 1.662e-05 [opt_reshape]: 8.75999e-06 [fold_const_symbol]: 1.261e-05 [renormalize]: 2.69996e-07 [detach_backward]: 3.39001e-06 [pipeline_parallel_scheduler]: 1.89999e-06 [auto_monad_reorder]: 2.296e-05 [get_jit_bprop_graph]: 1.37999e-06 [rewriter_after_jit_bprop_graph]: 4.38001e-06 [opt_after_jit_grad]: 0.00048816 [validate]: 4.085e-05 Sums bootstrap : 0.000412s : 3.65% type_inference : 0.005544s : 49.17% event_method : 0.000020s : 0.17% auto_monad : 0.000063s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.27% optimize.rewriter_before_opt_a : 0.000088s : 0.78% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.47% optimize.opt_a.loop_unroll : 0.000039s : 0.35% optimize.opt_a.a_1 : 0.000919s : 8.15% optimize.opt_a.with_stream_mark : 0.000030s : 0.27% optimize.opt_a.recompute_prepare : 0.000020s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000254s : 2.25% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.14% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.13% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000026s : 0.23% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.15% optimize.opt_a.virtual_dataset : 0.000015s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.16% optimize.opt_a.virtual_output : 0.000016s : 0.14% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000025s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.21% optimize.opt_a.a_after_grad : 0.000024s : 0.21% optimize.opt_a.renormalize : 0.000653s : 5.79% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.24% optimize.opt_a.cse : 0.000057s : 0.51% optimize.opt_a.a_3 : 0.000137s : 1.21% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000046s : 0.41% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000504s : 4.47% optimize.opt_b.b_1 : 0.000203s : 1.80% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000024s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000029s : 0.25% optimize.loop_unroll : 0.000454s : 4.03% optimize.opt_after_cconv.c_1 : 0.000042s : 0.37% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.16% optimize.tuple_transform.d_1 : 0.000058s : 0.51% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000077s : 0.68% optimize.cse_after_recomputation.cse : 0.000017s : 0.15% optimize.environ_conv : 0.000010s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000005s : 0.04% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000019s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.04% optimize.offloading_packed_experts : 0.000008s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000008s : 0.07% optimize.overlap_grad_flash_sp : 0.000025s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000005s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.20% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000488s : 4.33% validate : 0.000041s : 0.36% Time group info: ------[substitution.] 0.000200 38 10.84% : 0.000022s : 3: substitution.cast_eliminate 1.16% : 0.000002s : 3: substitution.elim_not_effective 0.85% : 0.000002s : 3: substitution.fold_const_symbol 3.48% : 0.000007s : 5: substitution.graph_param_transform 68.72% : 0.000137s : 4: substitution.inline 2.18% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.09% : 0.000006s : 6: substitution.remove_not_recompute_node 2.15% : 0.000004s : 4: substitution.replace_old_param 7.54% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005497 2 87.29% : 0.004799s : 1: type_inference.infer 12.71% : 0.000699s : 1: type_inference.specialize ------[replace.] 0.000063 8 60.34% : 0.000038s : 4: replace.inline 39.66% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000148 8 91.14% : 0.000135s : 4: match.inline 8.86% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000259 1596 0.95% : 0.000002s : 17: predicate.accumulaten_eliminater 0.72% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.60% : 0.000002s : 10: predicate.addn_check_dump 0.94% : 0.000002s : 17: predicate.addn_zero_filter 0.88% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.07% : 0.000005s : 27: predicate.arithmetic_simplify 1.04% : 0.000003s : 17: predicate.cast_eliminate 0.63% : 0.000002s : 10: predicate.check_bprop_eliminate 0.58% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.61% : 0.000002s : 10: predicate.depend_value_elim 1.03% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.09% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.82% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.39% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.15% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.16% : 0.000003s : 22: predicate.environ_get_depend_swap 1.73% : 0.000004s : 32: predicate.environ_get_eliminate 1.11% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.49% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.28% : 0.000006s : 25: predicate.float_depend_g_call 0.54% : 0.000001s : 10: predicate.float_environ_get_switch 0.84% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.81% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.63% : 0.000002s : 10: predicate.incorporate_call 0.53% : 0.000001s : 10: predicate.incorporate_call_switch 6.50% : 0.000017s : 72: predicate.inline 0.79% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.78% : 0.000002s : 10: predicate.less_batch_normalization 1.84% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.60% : 0.000007s : 48: predicate.load_eliminater 0.84% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.01% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.72% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.59% : 0.000002s : 10: predicate.merge_addn 0.75% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.88% : 0.000002s : 17: predicate.minmaximum_grad 0.97% : 0.000003s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.34% : 0.000001s : 5: predicate.parallel_virtual_node 1.73% : 0.000004s : 25: predicate.partial_defer_inline 1.72% : 0.000004s : 26: predicate.partial_eliminate 0.91% : 0.000002s : 17: predicate.print_const_string_wrapper 0.58% : 0.000001s : 10: predicate.reduce_all_const_elim 1.24% : 0.000003s : 17: predicate.reduce_eliminate 2.57% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 10: predicate.remove_not_recompute_node 1.35% : 0.000003s : 31: predicate.replace_applicator 0.41% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 0.98% : 0.000003s : 17: predicate.reshape_eliminate 0.63% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.71% : 0.000002s : 10: predicate.same_eliminate 0.59% : 0.000002s : 10: predicate.set_cell_output_no_recompute 0.65% : 0.000002s : 10: predicate.shard_identity_eliminate 0.69% : 0.000002s : 10: predicate.special_op_eliminate 0.72% : 0.000002s : 10: predicate.specialize_transform 0.82% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.50% : 0.000004s : 25: predicate.switch_defer_inline 2.06% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.87% : 0.000013s : 76: predicate.switch_simplify 0.97% : 0.000003s : 17: predicate.tile_eliminate 1.01% : 0.000003s : 17: predicate.transpose_eliminate 1.59% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.19% : 0.000008s : 41: predicate.tuple_list_get_item_eliminator 1.64% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.80% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.57% : 0.000007s : 48: predicate.updatestate_pure_node_eliminater 3.29% : 0.000009s : 58: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 5: predicate.value_based_eliminate 0.58% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.61% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000509 11 54.35% : 0.000277s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.65% : 0.000232s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027669 192 0.03% : 0.000007s : 1: ForceFp32Comm 11.18% : 0.003094s : 1: add_attr 11.14% : 0.003081s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.29% : 0.000081s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000072s : 1: auto_monad 0.11% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.65% : 0.000455s : 1: bootstrap 0.12% : 0.000032s : 1: cconv 0.03% : 0.000008s : 1: comm_op_add_attrs 0.08% : 0.000022s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.14% : 0.000040s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000018s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.11% : 0.000030s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.03% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.67% : 0.000461s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.85% : 0.000511s : 1: mutable_eliminate 0.04% : 0.000011s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 5.23% : 0.001447s : 78: opt.transform.opt_a 0.14% : 0.000040s : 1: opt.transform.opt_after_cconv 0.11% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.50% : 0.000137s : 28: opt.transform.opt_b 0.23% : 0.000065s : 2: opt.transform.opt_trans_graph 0.17% : 0.000047s : 4: opt.transform.symbol_engine_opt 12.69% : 0.003512s : 1: opt_a 0.56% : 0.000154s : 1: opt_after_cconv 1.80% : 0.000499s : 1: opt_after_jit_grad 1.16% : 0.000322s : 1: opt_b 22.83% : 0.006318s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000011s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000008s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000009s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000041s : 1: pre_auto_parallel 0.13% : 0.000035s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.30% : 0.000361s : 1: renormalize.infer 1.03% : 0.000284s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000051s : 1: rewriter_after_opt_a 0.33% : 0.000091s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000008s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.41% : 0.000112s : 1: symbol_engine_optimizer 0.41% : 0.000113s : 1: tuple_transform 20.16% : 0.005577s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:56.477.806 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.025248, [21] [bootstrap]: 0.00036498 [type_inference]: 0.00531377 [event_method]: 1.82e-05 [auto_monad]: 6.287e-05 [graph_reusing]: 6.19001e-06 [inline]: 1.71e-06 [add_attr]: 0.00293565, [1] [add_attr_with_inline]: 0.00292772, [1] [Cycle 1]: 5.312e-05, [2] [tag_attr]: 1.723e-05 [meta_addattr_fg_expand]: 5.81003e-06 [parallel-infer-symbol]: 3.09001e-06 [pre_auto_parallel]: 3.118e-05 [insert-virtual-dataset]: 2.43002e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 1.84e-06 [pipeline_split]: 1.61002e-06 [optimize]: 0.0158297, [53] [py_interpret_to_execute]: 2.386e-05 [rewriter_before_opt_a]: 7.935e-05 [opt_a]: 0.0135935, [2] [Cycle 1]: 0.0127548, [45] [expand_dump_flag]: 2.91e-06 [switch_simplify]: 9.259e-05 [loop_unroll]: 3.254e-05 [a_1]: 0.00072242 [with_stream_mark]: 1.55e-05 [recompute_prepare]: 1.043e-05 [updatestate_depend_eliminate]: 4.32998e-06 [updatestate_assign_eliminate]: 3.65e-06 [updatestate_loads_eliminate]: 3.60998e-06 [parameter_eliminate]: 2.30002e-06 [a_2]: 9.991e-05 [accelerated_algorithm]: 8.70999e-06 [shard]: 1.68002e-06 [meta_shard_fg_expand]: 1.95001e-06 [shard_inline]: 7.78001e-06 [merge_send_recv]: 9.14e-06 [auto_parallel]: 7.73999e-06 [parallel]: 1.795e-05 [flash_sp]: 9.00001e-06 [merge_comm]: 4.70001e-06 [allreduce_fusion]: 4.23999e-06 [matmul_add_comm_reduction]: 1.085e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 1.011e-05 [virtual_dataset]: 8.05e-06 [get_grad_eliminate_]: 7.40998e-06 [virtual_output]: 7.48e-06 [merge_forward]: 4.31002e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 1.164e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.681e-05 [merge_recompute_call_nodes]: 1.72001e-06 [before_grad]: 1.409e-05 [set_forward_comm_id_for_comm_node_pass]: 4.60001e-06 [meta_fg_expand]: 3.54002e-06 [flash_sp_send_recv_attached]: 2.68003e-06 [receive_attached]: 2.32999e-06 [after_resolve]: 1.326e-05 [a_after_grad]: 1.138e-05 [renormalize]: 0.0111392 [add_forward_monad_depend]: 8.13999e-06 [auto_monad_grad]: 2.54999e-06 [auto_monad_eliminator]: 1.934e-05 [cse]: 3.928e-05 [a_3]: 7.036e-05 [Cycle 2]: 0.0008268, [45] [expand_dump_flag]: 2.45002e-06 [switch_simplify]: 1.065e-05 [loop_unroll]: 8.37e-06 [a_1]: 0.00019622 [with_stream_mark]: 1.762e-05 [recompute_prepare]: 8.25999e-06 [updatestate_depend_eliminate]: 4.71002e-06 [updatestate_assign_eliminate]: 3.48e-06 [updatestate_loads_eliminate]: 3.63e-06 [parameter_eliminate]: 2.29001e-06 [a_2]: 9.306e-05 [accelerated_algorithm]: 8.97999e-06 [shard]: 2.49001e-06 [meta_shard_fg_expand]: 2.43998e-06 [shard_inline]: 7.35e-06 [merge_send_recv]: 9.30001e-06 [auto_parallel]: 1.012e-05 [parallel]: 9.58002e-06 [flash_sp]: 4.58001e-06 [merge_comm]: 4.37e-06 [allreduce_fusion]: 5.85002e-06 [matmul_add_comm_reduction]: 1.001e-05 [allreduce_slice_to_reducescatter]: 8.79983e-07 [virtual_shard_identity]: 8.29002e-06 [virtual_dataset]: 7.46999e-06 [get_grad_eliminate_]: 7.81001e-06 [virtual_output]: 7.53e-06 [merge_forward]: 5.10999e-06 [cell_reuse_recompute_pass]: 2.82002e-06 [offload_activation]: 9.61e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.507e-05 [merge_recompute_call_nodes]: 1.57001e-06 [before_grad]: 1.227e-05 [set_forward_comm_id_for_comm_node_pass]: 4.33999e-06 [meta_fg_expand]: 3.22002e-06 [flash_sp_send_recv_attached]: 1.49e-06 [receive_attached]: 2.91e-06 [after_resolve]: 1.251e-05 [a_after_grad]: 1.116e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.47001e-06 [auto_monad_grad]: 9.20001e-07 [auto_monad_eliminator]: 8.15999e-06 [cse]: 1.815e-05 [a_3]: 4.542e-05 [py_interpret_to_execute_after_opt_a]: 1.338e-05 [slice_cell_reuse_recomputed_activation]: 2.02999e-06 [rewriter_after_opt_a]: 4.121e-05 [convert_after_rewriter]: 7.51999e-06 [order_py_execute_after_rewriter]: 5.96998e-06 [mutable_eliminate]: 0.00065964 [opt_b]: 0.00024329, [1] [Cycle 1]: 0.00023581, [7] [b_1]: 0.00015381 [b_2]: 9.92999e-06 [updatestate_depend_eliminate]: 6.09001e-06 [updatestate_assign_eliminate]: 3.03e-06 [updatestate_loads_eliminate]: 2.96999e-06 [renormalize]: 9.20001e-07 [cse]: 2.185e-05 [optimize_parallel_all_gather_comm]: 1.701e-05 [overlap_param_gather]: 2.51998e-06 [cconv]: 2.355e-05 [loop_unroll]: 0.0004134 [opt_after_cconv]: 0.00011474, [1] [Cycle 1]: 0.00010877, [7] [c_1]: 3.778e-05 [parameter_eliminate]: 2.51e-06 [updatestate_depend_eliminate]: 5.52001e-06 [updatestate_assign_eliminate]: 3.20002e-06 [updatestate_loads_eliminate]: 2.78003e-06 [cse]: 2.192e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 1.507e-05 [tuple_transform]: 8.539e-05, [1] [Cycle 1]: 8.08e-05, [4] [d_1]: 5.256e-05 [none_parameter_eliminate]: 1.59e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 7.95998e-06 [partial_unused_args_eliminate]: 2.34001e-06 [add_recomputation]: 5.523e-05 [cse_after_recomputation]: 2.53e-05, [1] [Cycle 1]: 2.05e-05, [1] [cse]: 1.43e-05 [environ_conv]: 5.86998e-06 [swap_dp_allreduce_reducescatter]: 5.59998e-06 [bias_add_comm_swap]: 2.31e-06 [label_micro_interleaved_index]: 3.91999e-06 [label_fine_grained_interleaved_index]: 2.69001e-06 [merge_cast_opt]: 1.27e-06 [slice_recompute_activation]: 1.96e-06 [micro_interleaved_order_control]: 2.29999e-06 [assign_add_opt]: 1.15001e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 1.17e-06 [full_micro_interleaved_order_control]: 2.29999e-06 [reorder_send_recv_between_fp_bp]: 2.74001e-06 [comm_op_add_attrs]: 1.10999e-06 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.02998e-06 [overlap_opt_shard_in_pipeline]: 1.65001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.94e-06 [control_data_broadcast_order]: 1.439e-05 [grouped_pairwise_exchange_alltoall]: 1.93002e-06 [offloading_packed_experts]: 4.49002e-06 [overlap_recompute_and_grad_model_parallel]: 4.85999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.11997e-06 [overlap_recompute_allgather_and_fa_grad]: 1.45001e-06 [overlap_recompute_comm]: 2.73e-06 [overlap_grad_ring_attention]: 4.75999e-06 [overlap_grad_flash_sp]: 2.218e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 1.97999e-06 [split_layernorm_comm]: 2.06e-06 [handle_group_info]: 1.02e-06 [symbol_engine_optimizer]: 8.359e-05, [1] [Cycle 1]: 7.904e-05, [6] [build]: 2.88e-06 [elim_shapecalc]: 1.118e-05 [elim_not_effective]: 1.55e-05 [opt_reshape]: 8.45001e-06 [fold_const_symbol]: 1.207e-05 [renormalize]: 2.50002e-07 [detach_backward]: 2.41e-06 [pipeline_parallel_scheduler]: 1.49e-06 [auto_monad_reorder]: 1.958e-05 [get_jit_bprop_graph]: 1.67999e-06 [rewriter_after_jit_bprop_graph]: 3.42997e-06 [opt_after_jit_grad]: 0.00046523 [validate]: 3.952e-05 Sums bootstrap : 0.000365s : 1.71% type_inference : 0.005314s : 24.92% event_method : 0.000018s : 0.09% auto_monad : 0.000063s : 0.29% graph_reusing : 0.000006s : 0.03% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000017s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000031s : 0.15% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000024s : 0.11% optimize.rewriter_before_opt_a : 0.000079s : 0.37% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000103s : 0.48% optimize.opt_a.loop_unroll : 0.000041s : 0.19% optimize.opt_a.a_1 : 0.000919s : 4.31% optimize.opt_a.with_stream_mark : 0.000033s : 0.16% optimize.opt_a.recompute_prepare : 0.000019s : 0.09% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000005s : 0.02% optimize.opt_a.a_2 : 0.000193s : 0.91% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.08% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000015s : 0.07% optimize.opt_a.merge_send_recv : 0.000018s : 0.09% optimize.opt_a.auto_parallel : 0.000018s : 0.08% optimize.opt_a.parallel : 0.000028s : 0.13% optimize.opt_a.flash_sp : 0.000014s : 0.06% optimize.opt_a.merge_comm : 0.000009s : 0.04% optimize.opt_a.allreduce_fusion : 0.000010s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.10% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.09% optimize.opt_a.virtual_dataset : 0.000016s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.07% optimize.opt_a.virtual_output : 0.000015s : 0.07% optimize.opt_a.merge_forward : 0.000009s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.02% optimize.opt_a.offload_activation : 0.000021s : 0.10% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.15% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.12% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.04% optimize.opt_a.meta_fg_expand : 0.000007s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000026s : 0.12% optimize.opt_a.a_after_grad : 0.000023s : 0.11% optimize.opt_a.renormalize : 0.011139s : 52.25% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.05% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.13% optimize.opt_a.cse : 0.000057s : 0.27% optimize.opt_a.a_3 : 0.000116s : 0.54% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000041s : 0.19% optimize.convert_after_rewriter : 0.000008s : 0.04% optimize.order_py_execute_after_rewriter : 0.000006s : 0.03% optimize.mutable_eliminate : 0.000660s : 3.09% optimize.opt_b.b_1 : 0.000154s : 0.72% optimize.opt_b.b_2 : 0.000010s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.08% optimize.overlap_param_gather : 0.000003s : 0.01% optimize.cconv : 0.000024s : 0.11% optimize.loop_unroll : 0.000413s : 1.94% optimize.opt_after_cconv.c_1 : 0.000038s : 0.18% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000022s : 0.10% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.07% optimize.tuple_transform.d_1 : 0.000053s : 0.25% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.04% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000055s : 0.26% optimize.cse_after_recomputation.cse : 0.000014s : 0.07% optimize.environ_conv : 0.000006s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.03% optimize.bias_add_comm_swap : 0.000002s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000022s : 0.10% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.06% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000020s : 0.09% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.02% opt_after_jit_grad : 0.000465s : 2.18% validate : 0.000040s : 0.19% Time group info: ------[substitution.] 0.000204 38 13.43% : 0.000027s : 3: substitution.cast_eliminate 1.17% : 0.000002s : 3: substitution.elim_not_effective 0.78% : 0.000002s : 3: substitution.fold_const_symbol 3.31% : 0.000007s : 5: substitution.graph_param_transform 65.37% : 0.000133s : 4: substitution.inline 2.38% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.24% : 0.000007s : 6: substitution.remove_not_recompute_node 2.79% : 0.000006s : 4: substitution.replace_old_param 7.54% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005260 2 86.59% : 0.004555s : 1: type_inference.infer 13.41% : 0.000706s : 1: type_inference.specialize ------[replace.] 0.000060 8 58.21% : 0.000035s : 4: replace.inline 41.79% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000145 8 90.69% : 0.000131s : 4: match.inline 9.31% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000262 1596 0.99% : 0.000003s : 17: predicate.accumulaten_eliminater 0.70% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 10: predicate.addn_check_dump 0.95% : 0.000003s : 17: predicate.addn_zero_filter 0.88% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.07% : 0.000005s : 27: predicate.arithmetic_simplify 1.03% : 0.000003s : 17: predicate.cast_eliminate 0.58% : 0.000002s : 10: predicate.check_bprop_eliminate 0.74% : 0.000002s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.58% : 0.000002s : 10: predicate.depend_value_elim 0.98% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.23% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.84% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.38% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.34% : 0.000004s : 22: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.25% : 0.000003s : 22: predicate.environ_get_depend_swap 1.86% : 0.000005s : 32: predicate.environ_get_eliminate 1.22% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.37% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.41% : 0.000006s : 25: predicate.float_depend_g_call 0.53% : 0.000001s : 10: predicate.float_environ_get_switch 0.83% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.66% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.60% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.04% : 0.000016s : 72: predicate.inline 0.77% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.04% : 0.000003s : 10: predicate.less_batch_normalization 1.89% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.46% : 0.000006s : 48: predicate.load_eliminater 0.73% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.01% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.73% : 0.000005s : 27: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 10: predicate.merge_addn 0.57% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.65% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 17: predicate.minmaximum_grad 0.82% : 0.000002s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.61% : 0.000004s : 25: predicate.partial_defer_inline 1.64% : 0.000004s : 26: predicate.partial_eliminate 0.94% : 0.000002s : 17: predicate.print_const_string_wrapper 0.69% : 0.000002s : 10: predicate.reduce_all_const_elim 1.36% : 0.000004s : 17: predicate.reduce_eliminate 2.60% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 10: predicate.remove_not_recompute_node 1.43% : 0.000004s : 31: predicate.replace_applicator 0.55% : 0.000001s : 10: predicate.replace_old_param 0.24% : 0.000001s : 5: predicate.reset_defer_inline 1.07% : 0.000003s : 17: predicate.reshape_eliminate 0.61% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 5: predicate.row_tensor_eliminate 0.83% : 0.000002s : 10: predicate.same_eliminate 0.40% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 10: predicate.shard_identity_eliminate 0.68% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 1.05% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.70% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.46% : 0.000004s : 25: predicate.switch_defer_inline 1.94% : 0.000005s : 35: predicate.switch_layer_defer_inline 4.79% : 0.000013s : 76: predicate.switch_simplify 0.93% : 0.000002s : 17: predicate.tile_eliminate 0.99% : 0.000003s : 17: predicate.transpose_eliminate 1.52% : 0.000004s : 27: predicate.tuple_list_convert_item_index_to_positive 1.74% : 0.000005s : 27: predicate.tuple_list_get_item_const_eliminator 1.54% : 0.000004s : 27: predicate.tuple_list_get_item_depend_reorder 3.25% : 0.000009s : 41: predicate.tuple_list_get_item_eliminator 1.56% : 0.000004s : 27: predicate.tuple_list_get_set_item_eliminator 2.37% : 0.000006s : 37: predicate.tuple_list_set_item_eliminator 1.77% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 2.44% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.20% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 5: predicate.value_based_eliminate 0.65% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.66% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000532 11 44.26% : 0.000235s : 5: func_graph_cloner_run.FuncGraphClonerGraph 55.74% : 0.000297s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.056849 192 0.01% : 0.000004s : 1: ForceFp32Comm 5.17% : 0.002940s : 1: add_attr 5.16% : 0.002931s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.10% : 0.000059s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.12% : 0.000070s : 1: auto_monad 0.04% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.69% : 0.000391s : 1: bootstrap 0.05% : 0.000027s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000017s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.05% : 0.000028s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.04% : 0.000025s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.74% : 0.000421s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.17% : 0.000668s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000017s : 1: opt.transform.mutable_eliminate 2.63% : 0.001494s : 78: opt.transform.opt_a 0.06% : 0.000037s : 1: opt.transform.opt_after_cconv 0.05% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.23% : 0.000132s : 28: opt.transform.opt_b 0.10% : 0.000058s : 2: opt.transform.opt_trans_graph 0.08% : 0.000044s : 4: opt.transform.symbol_engine_opt 23.92% : 0.013597s : 1: opt_a 0.21% : 0.000118s : 1: opt_after_cconv 0.83% : 0.000474s : 1: opt_after_jit_grad 0.43% : 0.000247s : 1: opt_b 27.85% : 0.015835s : 1: optimize 0.04% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.05% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.06% : 0.000035s : 1: pre_auto_parallel 0.05% : 0.000028s : 1: py_interpret_to_execute 0.03% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000018s : 1: remove_dup_value 18.83% : 0.010707s : 1: renormalize.infer 0.74% : 0.000418s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000045s : 1: rewriter_after_opt_a 0.15% : 0.000083s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000086s : 1: symbol_engine_optimizer 0.16% : 0.000088s : 1: tuple_transform 9.37% : 0.005327s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:56.887.140 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:56.887.414 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0163341, [21] [bootstrap]: 0.00043725 [type_inference]: 0.00577302 [event_method]: 2.035e-05 [auto_monad]: 6.42e-05 [graph_reusing]: 5.54e-06 [inline]: 2.34001e-06 [add_attr]: 0.003118, [1] [add_attr_with_inline]: 0.00310937, [1] [Cycle 1]: 7.16e-05, [2] [tag_attr]: 1.877e-05 [meta_addattr_fg_expand]: 6.33998e-06 [parallel-infer-symbol]: 3.13998e-06 [pre_auto_parallel]: 3.324e-05 [insert-virtual-dataset]: 2.42001e-06 [parallel-infer-symbol-second]: 8.09989e-07 [dataset_repeat_opt]: 1.93002e-06 [pipeline_split]: 1.87001e-06 [optimize]: 0.00577788, [53] [py_interpret_to_execute]: 3.011e-05 [rewriter_before_opt_a]: 8.704e-05 [opt_a]: 0.00346624, [2] [Cycle 1]: 0.00252462, [45] [expand_dump_flag]: 2.93998e-06 [switch_simplify]: 4.321e-05 [loop_unroll]: 3.132e-05 [a_1]: 0.00088599 [with_stream_mark]: 1.62e-05 [recompute_prepare]: 1.146e-05 [updatestate_depend_eliminate]: 4.70001e-06 [updatestate_assign_eliminate]: 3.98001e-06 [updatestate_loads_eliminate]: 3.76999e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 0.00013044 [accelerated_algorithm]: 8.09002e-06 [shard]: 2.26e-06 [meta_shard_fg_expand]: 2.05002e-06 [shard_inline]: 7.93001e-06 [merge_send_recv]: 9.34e-06 [auto_parallel]: 7.74002e-06 [parallel]: 1.746e-05 [flash_sp]: 8.00999e-06 [merge_comm]: 4.52998e-06 [allreduce_fusion]: 4.15e-06 [matmul_add_comm_reduction]: 9.44e-06 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 9.01002e-06 [virtual_dataset]: 8e-06 [get_grad_eliminate_]: 7.45e-06 [virtual_output]: 7.75998e-06 [merge_forward]: 4.30999e-06 [cell_reuse_recompute_pass]: 1.22e-06 [offload_activation]: 1.147e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.666e-05 [merge_recompute_call_nodes]: 1.55999e-06 [before_grad]: 1.298e-05 [set_forward_comm_id_for_comm_node_pass]: 4.48001e-06 [meta_fg_expand]: 3.47002e-06 [flash_sp_send_recv_attached]: 2.39999e-06 [receive_attached]: 1.97999e-06 [after_resolve]: 1.25e-05 [a_after_grad]: 1.167e-05 [renormalize]: 0.00067762 [add_forward_monad_depend]: 5.42999e-06 [auto_monad_grad]: 2.34001e-06 [auto_monad_eliminator]: 1.681e-05 [cse]: 3.398e-05 [a_3]: 6.966e-05 [Cycle 2]: 0.00092804, [45] [expand_dump_flag]: 1.16002e-06 [switch_simplify]: 9.41e-06 [loop_unroll]: 7.61999e-06 [a_1]: 0.00017693 [with_stream_mark]: 1.062e-05 [recompute_prepare]: 8.05e-06 [updatestate_depend_eliminate]: 3.68e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 2.93998e-06 [parameter_eliminate]: 1.04998e-06 [a_2]: 0.0001214 [accelerated_algorithm]: 7.8e-06 [shard]: 1.07e-06 [meta_shard_fg_expand]: 1.67999e-06 [shard_inline]: 7.57998e-06 [merge_send_recv]: 5.69e-06 [auto_parallel]: 6.09999e-06 [parallel]: 4.05e-06 [flash_sp]: 3.23998e-06 [merge_comm]: 4.44002e-06 [allreduce_fusion]: 3.73001e-06 [matmul_add_comm_reduction]: 6.79999e-06 [allreduce_slice_to_reducescatter]: 4.30009e-07 [virtual_shard_identity]: 8.42e-06 [virtual_dataset]: 7.45998e-06 [get_grad_eliminate_]: 7.00998e-06 [virtual_output]: 7.00002e-06 [merge_forward]: 3.19001e-06 [cell_reuse_recompute_pass]: 1.38002e-06 [offload_activation]: 7.21999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.601e-05 [merge_recompute_call_nodes]: 7.2e-07 [before_grad]: 1.172e-05 [set_forward_comm_id_for_comm_node_pass]: 4.44002e-06 [meta_fg_expand]: 2.79999e-06 [flash_sp_send_recv_attached]: 1.02998e-06 [receive_attached]: 1.04e-06 [after_resolve]: 1.188e-05 [a_after_grad]: 1.139e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.50999e-06 [auto_monad_grad]: 8.50006e-07 [auto_monad_eliminator]: 8.27e-06 [cse]: 1.789e-05 [a_3]: 5.911e-05 [py_interpret_to_execute_after_opt_a]: 1.314e-05 [slice_cell_reuse_recomputed_activation]: 4.78001e-06 [rewriter_after_opt_a]: 4.204e-05 [convert_after_rewriter]: 1.083e-05 [order_py_execute_after_rewriter]: 9.34e-06 [mutable_eliminate]: 0.00047296 [opt_b]: 0.00032015, [1] [Cycle 1]: 0.00031099, [7] [b_1]: 0.00021199 [b_2]: 9.28002e-06 [updatestate_depend_eliminate]: 5.64e-06 [updatestate_assign_eliminate]: 3.08998e-06 [updatestate_loads_eliminate]: 2.98998e-06 [renormalize]: 4.00003e-07 [cse]: 2.191e-05 [optimize_parallel_all_gather_comm]: 1.97e-05 [overlap_param_gather]: 4.94998e-06 [cconv]: 2.725e-05 [loop_unroll]: 0.00042539 [opt_after_cconv]: 0.00013832, [1] [Cycle 1]: 0.00012993, [7] [c_1]: 3.788e-05 [parameter_eliminate]: 2.41998e-06 [updatestate_depend_eliminate]: 5.67999e-06 [updatestate_assign_eliminate]: 3.21001e-06 [updatestate_loads_eliminate]: 2.78e-06 [cse]: 2.177e-05 [renormalize]: 4.89992e-07 [remove_dup_value]: 1.841e-05 [tuple_transform]: 0.00010495, [1] [Cycle 1]: 9.76e-05, [4] [d_1]: 5.619e-05 [none_parameter_eliminate]: 1.74998e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 8.74998e-06 [partial_unused_args_eliminate]: 4.27e-06 [add_recomputation]: 5.602e-05 [cse_after_recomputation]: 3.103e-05, [1] [Cycle 1]: 2.422e-05, [1] [cse]: 1.489e-05 [environ_conv]: 9.59999e-06 [swap_dp_allreduce_reducescatter]: 8.43999e-06 [bias_add_comm_swap]: 4.90001e-06 [label_micro_interleaved_index]: 6.57002e-06 [label_fine_grained_interleaved_index]: 5.19e-06 [merge_cast_opt]: 3.75e-06 [slice_recompute_activation]: 4.4e-06 [micro_interleaved_order_control]: 5.10999e-06 [assign_add_opt]: 3.54002e-06 [ForceFp32Comm]: 3.09999e-06 [remove_cast_before_assign_add]: 3.61001e-06 [full_micro_interleaved_order_control]: 4.54998e-06 [reorder_send_recv_between_fp_bp]: 5.31998e-06 [comm_op_add_attrs]: 3.68e-06 [add_comm_op_reuse_tag]: 3.25e-06 [interleave_split_concat_branches]: 3.48e-06 [interleave_parallel_branches]: 3.46001e-06 [overlap_opt_shard_in_pipeline]: 3.38999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.18999e-06 [control_data_broadcast_order]: 1.679e-05 [grouped_pairwise_exchange_alltoall]: 3.94002e-06 [offloading_packed_experts]: 7.53999e-06 [overlap_recompute_and_grad_model_parallel]: 7.45998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.41999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.88001e-06 [overlap_recompute_comm]: 4.88001e-06 [overlap_grad_ring_attention]: 6.99001e-06 [overlap_grad_flash_sp]: 2.374e-05 [begin_end_overlap_inline]: 2.91e-06 [split_matmul_comm_elemetwise]: 4.49002e-06 [split_layernorm_comm]: 4.17e-06 [handle_group_info]: 3.4e-06 [symbol_engine_optimizer]: 0.00010149, [1] [Cycle 1]: 9.524e-05, [6] [build]: 3.34001e-06 [elim_shapecalc]: 1.066e-05 [elim_not_effective]: 1.553e-05 [opt_reshape]: 8.57e-06 [fold_const_symbol]: 1.254e-05 [renormalize]: 2.70025e-07 [detach_backward]: 3.03e-06 [pipeline_parallel_scheduler]: 1.73002e-06 [auto_monad_reorder]: 2.122e-05 [get_jit_bprop_graph]: 1.43002e-06 [rewriter_after_jit_bprop_graph]: 4.41002e-06 [opt_after_jit_grad]: 0.00046967 [validate]: 3.811e-05 Sums bootstrap : 0.000437s : 3.80% type_inference : 0.005773s : 50.15% event_method : 0.000020s : 0.18% auto_monad : 0.000064s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000030s : 0.26% optimize.rewriter_before_opt_a : 0.000087s : 0.76% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.46% optimize.opt_a.loop_unroll : 0.000039s : 0.34% optimize.opt_a.a_1 : 0.001063s : 9.23% optimize.opt_a.with_stream_mark : 0.000027s : 0.23% optimize.opt_a.recompute_prepare : 0.000020s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000252s : 2.19% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.13% optimize.opt_a.merge_send_recv : 0.000015s : 0.13% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000022s : 0.19% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.15% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.21% optimize.opt_a.a_after_grad : 0.000023s : 0.20% optimize.opt_a.renormalize : 0.000678s : 5.89% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.22% optimize.opt_a.cse : 0.000052s : 0.45% optimize.opt_a.a_3 : 0.000129s : 1.12% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000042s : 0.37% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000473s : 4.11% optimize.opt_b.b_1 : 0.000212s : 1.84% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000027s : 0.24% optimize.loop_unroll : 0.000425s : 3.70% optimize.opt_after_cconv.c_1 : 0.000038s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000022s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.16% optimize.tuple_transform.d_1 : 0.000056s : 0.49% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000056s : 0.49% optimize.cse_after_recomputation.cse : 0.000015s : 0.13% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000024s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000021s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000470s : 4.08% validate : 0.000038s : 0.33% Time group info: ------[substitution.] 0.000246 38 9.37% : 0.000023s : 3: substitution.cast_eliminate 0.94% : 0.000002s : 3: substitution.elim_not_effective 0.67% : 0.000002s : 3: substitution.fold_const_symbol 2.69% : 0.000007s : 5: substitution.graph_param_transform 75.27% : 0.000185s : 4: substitution.inline 1.65% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.30% : 0.000006s : 6: substitution.remove_not_recompute_node 1.75% : 0.000004s : 4: substitution.replace_old_param 5.37% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005721 2 87.42% : 0.005001s : 1: type_inference.infer 12.58% : 0.000720s : 1: type_inference.specialize ------[replace.] 0.000061 8 62.30% : 0.000038s : 4: replace.inline 37.70% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000193 8 94.18% : 0.000182s : 4: match.inline 5.82% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000294 1504 0.75% : 0.000002s : 15: predicate.accumulaten_eliminater 0.62% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 10: predicate.addn_check_dump 19.52% : 0.000057s : 15: predicate.addn_zero_filter 0.72% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.65% : 0.000005s : 25: predicate.arithmetic_simplify 0.85% : 0.000003s : 15: predicate.cast_eliminate 0.51% : 0.000002s : 10: predicate.check_bprop_eliminate 0.49% : 0.000001s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.55% : 0.000002s : 10: predicate.depend_value_elim 0.77% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.85% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.70% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.73% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.16% : 0.000000s : 5: predicate.elim_not_effective 0.34% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 0.99% : 0.000003s : 20: predicate.environ_add_const_eliminate 0.97% : 0.000003s : 20: predicate.environ_get_add_eliminate 0.88% : 0.000003s : 20: predicate.environ_get_depend_swap 1.50% : 0.000004s : 30: predicate.environ_get_eliminate 0.89% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.12% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.21% : 0.000006s : 23: predicate.float_depend_g_call 0.46% : 0.000001s : 10: predicate.float_environ_get_switch 0.69% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 5: predicate.fold_const_symbol 0.53% : 0.000002s : 10: predicate.get_grad_eliminate 0.18% : 0.000001s : 5: predicate.graph_param_transform 0.52% : 0.000002s : 10: predicate.incorporate_call 0.44% : 0.000001s : 10: predicate.incorporate_call_switch 5.13% : 0.000015s : 68: predicate.inline 0.66% : 0.000002s : 10: predicate.inline_without_move 0.27% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.63% : 0.000002s : 10: predicate.less_batch_normalization 1.49% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.08% : 0.000006s : 44: predicate.load_eliminater 0.70% : 0.000002s : 5: predicate.loop_unroll_after_grad 1.72% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.33% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.49% : 0.000001s : 10: predicate.merge_addn 0.47% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.52% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.71% : 0.000002s : 15: predicate.minmaximum_grad 0.69% : 0.000002s : 5: predicate.mutable_eliminate 0.32% : 0.000001s : 5: predicate.opt_reshape 0.33% : 0.000001s : 5: predicate.parallel_virtual_node 1.42% : 0.000004s : 23: predicate.partial_defer_inline 1.33% : 0.000004s : 24: predicate.partial_eliminate 0.72% : 0.000002s : 15: predicate.print_const_string_wrapper 0.53% : 0.000002s : 10: predicate.reduce_all_const_elim 0.92% : 0.000003s : 15: predicate.reduce_eliminate 2.11% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.32% : 0.000001s : 10: predicate.remove_not_recompute_node 1.09% : 0.000003s : 29: predicate.replace_applicator 0.41% : 0.000001s : 10: predicate.replace_old_param 0.21% : 0.000001s : 5: predicate.reset_defer_inline 0.79% : 0.000002s : 15: predicate.reshape_eliminate 0.52% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.29% : 0.000001s : 5: predicate.row_tensor_eliminate 0.62% : 0.000002s : 10: predicate.same_eliminate 0.36% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.67% : 0.000002s : 10: predicate.shard_identity_eliminate 0.59% : 0.000002s : 10: predicate.special_op_eliminate 0.63% : 0.000002s : 10: predicate.specialize_transform 0.66% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.61% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.20% : 0.000004s : 23: predicate.switch_defer_inline 1.65% : 0.000005s : 33: predicate.switch_layer_defer_inline 3.98% : 0.000012s : 74: predicate.switch_simplify 0.72% : 0.000002s : 15: predicate.tile_eliminate 0.73% : 0.000002s : 15: predicate.transpose_eliminate 1.28% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.34% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.21% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 2.65% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.20% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 1.82% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.51% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 1.99% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 2.69% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.29% : 0.000001s : 5: predicate.value_based_eliminate 0.51% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.55% : 0.000002s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000527 11 54.06% : 0.000285s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.94% : 0.000242s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027714 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.28% : 0.003127s : 1: add_attr 11.23% : 0.003113s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000059s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000074s : 1: auto_monad 0.10% : 0.000028s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.74% : 0.000483s : 1: bootstrap 0.11% : 0.000030s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000034s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000016s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.11% : 0.000031s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.55% : 0.000431s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.73% : 0.000479s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 5.71% : 0.001582s : 78: opt.transform.opt_a 0.13% : 0.000036s : 1: opt.transform.opt_after_cconv 0.10% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.53% : 0.000147s : 28: opt.transform.opt_b 0.23% : 0.000063s : 2: opt.transform.opt_trans_graph 0.16% : 0.000044s : 4: opt.transform.symbol_engine_opt 12.52% : 0.003469s : 1: opt_a 0.51% : 0.000142s : 1: opt_after_cconv 1.73% : 0.000480s : 1: opt_after_jit_grad 1.17% : 0.000324s : 1: opt_b 21.96% : 0.006086s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000008s : 1: pipeline_split 0.15% : 0.000040s : 1: pre_auto_parallel 0.12% : 0.000034s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.35% : 0.000375s : 1: renormalize.infer 1.06% : 0.000295s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000045s : 1: rewriter_after_opt_a 0.33% : 0.000091s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000104s : 1: symbol_engine_optimizer 0.39% : 0.000108s : 1: tuple_transform 20.98% : 0.005813s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:57.317.859 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0314014, [21] [bootstrap]: 0.00039775 [type_inference]: 0.00546682 [event_method]: 1.9e-05 [auto_monad]: 6.181e-05 [graph_reusing]: 6.53e-06 [inline]: 2.03002e-06 [add_attr]: 0.0194575, [1] [add_attr_with_inline]: 0.0194454, [1] [Cycle 1]: 7.552e-05, [2] [tag_attr]: 2.332e-05 [meta_addattr_fg_expand]: 6.28e-06 [parallel-infer-symbol]: 3.72002e-06 [pre_auto_parallel]: 4.17e-05 [insert-virtual-dataset]: 2.96999e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 2.27001e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.00521431, [53] [py_interpret_to_execute]: 2.793e-05 [rewriter_before_opt_a]: 9.116e-05 [opt_a]: 0.00310143, [2] [Cycle 1]: 0.0023329, [45] [expand_dump_flag]: 3.23e-06 [switch_simplify]: 4.36e-05 [loop_unroll]: 3.096e-05 [a_1]: 0.00068465 [with_stream_mark]: 1.707e-05 [recompute_prepare]: 1.036e-05 [updatestate_depend_eliminate]: 4.92999e-06 [updatestate_assign_eliminate]: 3.77998e-06 [updatestate_loads_eliminate]: 3.56001e-06 [parameter_eliminate]: 1.76e-06 [a_2]: 0.00010775 [accelerated_algorithm]: 8.3e-06 [shard]: 1.87001e-06 [meta_shard_fg_expand]: 1.95001e-06 [shard_inline]: 8.12e-06 [merge_send_recv]: 9.25001e-06 [auto_parallel]: 7.12997e-06 [parallel]: 1.918e-05 [flash_sp]: 8.84998e-06 [merge_comm]: 4.75001e-06 [allreduce_fusion]: 4.19002e-06 [matmul_add_comm_reduction]: 1.053e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 9.45001e-06 [virtual_dataset]: 8e-06 [get_grad_eliminate_]: 7.82e-06 [virtual_output]: 7.81001e-06 [merge_forward]: 4.25999e-06 [cell_reuse_recompute_pass]: 1.45001e-06 [offload_activation]: 1.147e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.631e-05 [merge_recompute_call_nodes]: 1.56002e-06 [before_grad]: 1.313e-05 [set_forward_comm_id_for_comm_node_pass]: 4.53001e-06 [meta_fg_expand]: 3.21999e-06 [flash_sp_send_recv_attached]: 2.27001e-06 [receive_attached]: 2.31998e-06 [after_resolve]: 1.355e-05 [a_after_grad]: 1.191e-05 [renormalize]: 0.00084586 [add_forward_monad_depend]: 5.62001e-06 [auto_monad_grad]: 2.59001e-06 [auto_monad_eliminator]: 1.751e-05 [cse]: 3.614e-05 [a_3]: 6.176e-05 [Cycle 2]: 0.00075804, [45] [expand_dump_flag]: 1.28002e-06 [switch_simplify]: 9.04998e-06 [loop_unroll]: 7.35e-06 [a_1]: 0.00017543 [with_stream_mark]: 1.192e-05 [recompute_prepare]: 7.8e-06 [updatestate_depend_eliminate]: 3.78001e-06 [updatestate_assign_eliminate]: 3.02002e-06 [updatestate_loads_eliminate]: 2.93998e-06 [parameter_eliminate]: 1.06002e-06 [a_2]: 9.278e-05 [accelerated_algorithm]: 7.73999e-06 [shard]: 1.54e-06 [meta_shard_fg_expand]: 1.48002e-06 [shard_inline]: 7.4e-06 [merge_send_recv]: 5.92001e-06 [auto_parallel]: 8.61002e-06 [parallel]: 4.98001e-06 [flash_sp]: 3.56999e-06 [merge_comm]: 4.08999e-06 [allreduce_fusion]: 3.89002e-06 [matmul_add_comm_reduction]: 6.88e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 8.01001e-06 [virtual_dataset]: 7.15e-06 [get_grad_eliminate_]: 7.21001e-06 [virtual_output]: 6.85998e-06 [merge_forward]: 3.73001e-06 [cell_reuse_recompute_pass]: 1.59e-06 [offload_activation]: 9.66e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.458e-05 [merge_recompute_call_nodes]: 9.09989e-07 [before_grad]: 1.223e-05 [set_forward_comm_id_for_comm_node_pass]: 4.53001e-06 [meta_fg_expand]: 2.74001e-06 [flash_sp_send_recv_attached]: 1.34e-06 [receive_attached]: 9.50007e-07 [after_resolve]: 1.161e-05 [a_after_grad]: 1.12e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.77999e-06 [auto_monad_grad]: 1.13001e-06 [auto_monad_eliminator]: 8.38001e-06 [cse]: 1.818e-05 [a_3]: 4.573e-05 [py_interpret_to_execute_after_opt_a]: 1.134e-05 [slice_cell_reuse_recomputed_activation]: 2.07001e-06 [rewriter_after_opt_a]: 4.249e-05 [convert_after_rewriter]: 7.71001e-06 [order_py_execute_after_rewriter]: 6.04001e-06 [mutable_eliminate]: 0.00050564 [opt_b]: 0.00024154, [1] [Cycle 1]: 0.00023524, [7] [b_1]: 0.00015303 [b_2]: 8.90001e-06 [updatestate_depend_eliminate]: 7.21001e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 3.01001e-06 [renormalize]: 4.60015e-07 [cse]: 2.352e-05 [optimize_parallel_all_gather_comm]: 1.782e-05 [overlap_param_gather]: 1.87001e-06 [cconv]: 2.679e-05 [loop_unroll]: 0.00041172 [opt_after_cconv]: 0.00011951, [1] [Cycle 1]: 0.00011382, [7] [c_1]: 3.779e-05 [parameter_eliminate]: 3.43999e-06 [updatestate_depend_eliminate]: 6.46e-06 [updatestate_assign_eliminate]: 3.25002e-06 [updatestate_loads_eliminate]: 3.33998e-06 [cse]: 2.263e-05 [renormalize]: 4.40021e-07 [remove_dup_value]: 1.603e-05 [tuple_transform]: 9.271e-05, [1] [Cycle 1]: 8.818e-05, [4] [d_1]: 5.749e-05 [none_parameter_eliminate]: 1.82001e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 8.85001e-06 [partial_unused_args_eliminate]: 1.86e-06 [add_recomputation]: 5.902e-05 [cse_after_recomputation]: 2.553e-05, [1] [Cycle 1]: 2.106e-05, [1] [cse]: 1.551e-05 [environ_conv]: 6.04001e-06 [swap_dp_allreduce_reducescatter]: 5.97999e-06 [bias_add_comm_swap]: 3.30003e-06 [label_micro_interleaved_index]: 4.41002e-06 [label_fine_grained_interleaved_index]: 2.64001e-06 [merge_cast_opt]: 1.24998e-06 [slice_recompute_activation]: 2.34001e-06 [micro_interleaved_order_control]: 2.63998e-06 [assign_add_opt]: 1.24998e-06 [ForceFp32Comm]: 7.80012e-07 [remove_cast_before_assign_add]: 1.00001e-06 [full_micro_interleaved_order_control]: 2.17999e-06 [reorder_send_recv_between_fp_bp]: 2.62001e-06 [comm_op_add_attrs]: 9.5999e-07 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.14998e-06 [interleave_parallel_branches]: 1.22999e-06 [overlap_opt_shard_in_pipeline]: 1.07e-06 [overlap_opt_shard_grad_in_pipeline]: 1.79998e-06 [control_data_broadcast_order]: 1.559e-05 [grouped_pairwise_exchange_alltoall]: 1.58002e-06 [offloading_packed_experts]: 4.00998e-06 [overlap_recompute_and_grad_model_parallel]: 4.82998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.45999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.64e-06 [overlap_recompute_comm]: 2.21998e-06 [overlap_grad_ring_attention]: 4.42998e-06 [overlap_grad_flash_sp]: 2.137e-05 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 2.09e-06 [split_layernorm_comm]: 1.74e-06 [handle_group_info]: 1.30999e-06 [symbol_engine_optimizer]: 8.233e-05, [1] [Cycle 1]: 7.818e-05, [6] [build]: 2.98998e-06 [elim_shapecalc]: 1.144e-05 [elim_not_effective]: 1.516e-05 [opt_reshape]: 7.97e-06 [fold_const_symbol]: 1.23e-05 [renormalize]: 2.19996e-07 [detach_backward]: 1.66e-06 [pipeline_parallel_scheduler]: 1.76e-06 [auto_monad_reorder]: 2.008e-05 [get_jit_bprop_graph]: 1.84998e-06 [rewriter_after_jit_bprop_graph]: 5.04998e-06 [opt_after_jit_grad]: 0.00049896 [validate]: 4.284e-05 Sums bootstrap : 0.000398s : 3.62% type_inference : 0.005467s : 49.73% event_method : 0.000019s : 0.17% auto_monad : 0.000062s : 0.56% graph_reusing : 0.000007s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.21% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000042s : 0.38% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000028s : 0.25% optimize.rewriter_before_opt_a : 0.000091s : 0.83% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.48% optimize.opt_a.loop_unroll : 0.000038s : 0.35% optimize.opt_a.a_1 : 0.000860s : 7.82% optimize.opt_a.with_stream_mark : 0.000029s : 0.26% optimize.opt_a.recompute_prepare : 0.000018s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000201s : 1.82% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.14% optimize.opt_a.merge_send_recv : 0.000015s : 0.14% optimize.opt_a.auto_parallel : 0.000016s : 0.14% optimize.opt_a.parallel : 0.000024s : 0.22% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.16% optimize.opt_a.virtual_dataset : 0.000015s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.14% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.23% optimize.opt_a.a_after_grad : 0.000023s : 0.21% optimize.opt_a.renormalize : 0.000846s : 7.70% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.24% optimize.opt_a.cse : 0.000054s : 0.49% optimize.opt_a.a_3 : 0.000107s : 0.98% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000042s : 0.39% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000506s : 4.60% optimize.opt_b.b_1 : 0.000153s : 1.39% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000027s : 0.24% optimize.loop_unroll : 0.000412s : 3.75% optimize.opt_after_cconv.c_1 : 0.000038s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.15% optimize.tuple_transform.d_1 : 0.000057s : 0.52% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000059s : 0.54% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000499s : 4.54% validate : 0.000043s : 0.39% Time group info: ------[substitution.] 0.000218 38 11.18% : 0.000024s : 3: substitution.cast_eliminate 0.96% : 0.000002s : 3: substitution.elim_not_effective 0.85% : 0.000002s : 3: substitution.fold_const_symbol 3.37% : 0.000007s : 5: substitution.graph_param_transform 71.06% : 0.000155s : 4: substitution.inline 2.13% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.83% : 0.000006s : 6: substitution.remove_not_recompute_node 1.99% : 0.000004s : 4: substitution.replace_old_param 5.63% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005409 2 87.00% : 0.004706s : 1: type_inference.infer 13.00% : 0.000703s : 1: type_inference.specialize ------[replace.] 0.000062 8 62.52% : 0.000039s : 4: replace.inline 37.48% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000162 8 93.58% : 0.000152s : 4: match.inline 6.42% : 0.000010s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000234 1504 0.93% : 0.000002s : 15: predicate.accumulaten_eliminater 0.76% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.57% : 0.000001s : 10: predicate.addn_check_dump 0.92% : 0.000002s : 15: predicate.addn_zero_filter 0.81% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.04% : 0.000005s : 25: predicate.arithmetic_simplify 1.08% : 0.000003s : 15: predicate.cast_eliminate 0.67% : 0.000002s : 10: predicate.check_bprop_eliminate 0.60% : 0.000001s : 10: predicate.compare_switch_simplify 0.21% : 0.000001s : 5: predicate.const_output_eliminate 0.70% : 0.000002s : 10: predicate.depend_value_elim 0.93% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.00% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.04% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 5: predicate.elim_not_effective 0.40% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_depend_swap 1.74% : 0.000004s : 30: predicate.environ_get_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.39% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.27% : 0.000005s : 23: predicate.float_depend_g_call 0.62% : 0.000001s : 10: predicate.float_environ_get_switch 0.87% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.64% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.70% : 0.000002s : 10: predicate.incorporate_call 0.60% : 0.000001s : 10: predicate.incorporate_call_switch 6.37% : 0.000015s : 68: predicate.inline 1.06% : 0.000002s : 10: predicate.inline_without_move 0.35% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.76% : 0.000002s : 10: predicate.less_batch_normalization 1.83% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.50% : 0.000006s : 44: predicate.load_eliminater 0.87% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.18% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.66% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.62% : 0.000001s : 10: predicate.merge_addn 0.58% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.61% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 15: predicate.minmaximum_grad 1.13% : 0.000003s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.35% : 0.000001s : 5: predicate.parallel_virtual_node 1.63% : 0.000004s : 23: predicate.partial_defer_inline 1.71% : 0.000004s : 24: predicate.partial_eliminate 0.87% : 0.000002s : 15: predicate.print_const_string_wrapper 0.63% : 0.000001s : 10: predicate.reduce_all_const_elim 1.25% : 0.000003s : 15: predicate.reduce_eliminate 2.51% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 10: predicate.remove_not_recompute_node 1.37% : 0.000003s : 29: predicate.replace_applicator 0.53% : 0.000001s : 10: predicate.replace_old_param 0.26% : 0.000001s : 5: predicate.reset_defer_inline 0.90% : 0.000002s : 15: predicate.reshape_eliminate 0.65% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 5: predicate.row_tensor_eliminate 0.79% : 0.000002s : 10: predicate.same_eliminate 0.42% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 10: predicate.shard_identity_eliminate 0.76% : 0.000002s : 10: predicate.special_op_eliminate 0.75% : 0.000002s : 10: predicate.specialize_transform 0.86% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.44% : 0.000003s : 23: predicate.switch_defer_inline 2.06% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.91% : 0.000011s : 74: predicate.switch_simplify 0.88% : 0.000002s : 15: predicate.tile_eliminate 0.90% : 0.000002s : 15: predicate.transpose_eliminate 1.54% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.14% : 0.000007s : 39: predicate.tuple_list_get_item_eliminator 1.59% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.77% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.46% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.16% : 0.000007s : 54: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 5: predicate.value_based_eliminate 0.70% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.64% : 0.000001s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000509 11 49.08% : 0.000250s : 5: func_graph_cloner_run.FuncGraphClonerGraph 50.92% : 0.000259s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.058506 192 0.01% : 0.000003s : 1: ForceFp32Comm 33.27% : 0.019464s : 1: add_attr 33.24% : 0.019450s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.11% : 0.000063s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.12% : 0.000068s : 1: auto_monad 0.04% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.73% : 0.000426s : 1: bootstrap 0.05% : 0.000030s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000019s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.05% : 0.000028s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.04% : 0.000025s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.72% : 0.000419s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.88% : 0.000514s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000017s : 1: opt.transform.mutable_eliminate 2.36% : 0.001382s : 78: opt.transform.opt_a 0.06% : 0.000036s : 1: opt.transform.opt_after_cconv 0.05% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.22% : 0.000131s : 28: opt.transform.opt_b 0.11% : 0.000064s : 2: opt.transform.opt_trans_graph 0.07% : 0.000043s : 4: opt.transform.symbol_engine_opt 5.31% : 0.003104s : 1: opt_a 0.21% : 0.000123s : 1: opt_after_cconv 0.87% : 0.000508s : 1: opt_after_jit_grad 0.42% : 0.000245s : 1: opt_b 8.92% : 0.005219s : 1: optimize 0.04% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000046s : 1: pre_auto_parallel 0.05% : 0.000032s : 1: py_interpret_to_execute 0.03% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000020s : 1: remove_dup_value 0.84% : 0.000490s : 1: renormalize.infer 0.59% : 0.000348s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000047s : 1: rewriter_after_opt_a 0.16% : 0.000095s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000085s : 1: symbol_engine_optimizer 0.16% : 0.000096s : 1: tuple_transform 9.37% : 0.005483s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:57.731.200 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:57.731.460 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0155873, [21] [bootstrap]: 0.00040936 [type_inference]: 0.00547805 [event_method]: 1.89e-05 [auto_monad]: 6.015e-05 [graph_reusing]: 6.22001e-06 [inline]: 2.04e-06 [add_attr]: 0.00307656, [1] [add_attr_with_inline]: 0.003069, [1] [Cycle 1]: 6.21e-05, [2] [tag_attr]: 1.839e-05 [meta_addattr_fg_expand]: 5.21002e-06 [parallel-infer-symbol]: 2.93e-06 [pre_auto_parallel]: 2.894e-05 [insert-virtual-dataset]: 2.36e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 2.03002e-06 [pipeline_split]: 1.76e-06 [optimize]: 0.00542196, [53] [py_interpret_to_execute]: 2.79e-05 [rewriter_before_opt_a]: 8.969e-05 [opt_a]: 0.00311325, [2] [Cycle 1]: 0.00217904, [45] [expand_dump_flag]: 2.29001e-06 [switch_simplify]: 3.899e-05 [loop_unroll]: 3.217e-05 [a_1]: 0.00066186 [with_stream_mark]: 1.31e-05 [recompute_prepare]: 9.77999e-06 [updatestate_depend_eliminate]: 4.16001e-06 [updatestate_assign_eliminate]: 3.77998e-06 [updatestate_loads_eliminate]: 3.53999e-06 [parameter_eliminate]: 1.34e-06 [a_2]: 0.00013166 [accelerated_algorithm]: 9.17999e-06 [shard]: 1.52001e-06 [meta_shard_fg_expand]: 1.98002e-06 [shard_inline]: 8.38999e-06 [merge_send_recv]: 6.81999e-06 [auto_parallel]: 6.40997e-06 [parallel]: 1.456e-05 [flash_sp]: 7.92e-06 [merge_comm]: 5.00001e-06 [allreduce_fusion]: 3.94002e-06 [matmul_add_comm_reduction]: 7.6e-06 [allreduce_slice_to_reducescatter]: 4.7998e-07 [virtual_shard_identity]: 9.14e-06 [virtual_dataset]: 8.55999e-06 [get_grad_eliminate_]: 7.45e-06 [virtual_output]: 8.72e-06 [merge_forward]: 3.88001e-06 [cell_reuse_recompute_pass]: 1.16997e-06 [offload_activation]: 8.2e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.652e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 1.362e-05 [set_forward_comm_id_for_comm_node_pass]: 4.38999e-06 [meta_fg_expand]: 3.06001e-06 [flash_sp_send_recv_attached]: 2.14e-06 [receive_attached]: 1.35999e-06 [after_resolve]: 1.309e-05 [a_after_grad]: 1.278e-05 [renormalize]: 0.0005928 [add_forward_monad_depend]: 4.80001e-06 [auto_monad_grad]: 1.37999e-06 [auto_monad_eliminator]: 1.3e-05 [cse]: 2.256e-05 [a_3]: 6.899e-05 [Cycle 2]: 0.00092115, [45] [expand_dump_flag]: 1.49e-06 [switch_simplify]: 8.87e-06 [loop_unroll]: 7.46001e-06 [a_1]: 0.00017243 [with_stream_mark]: 1.184e-05 [recompute_prepare]: 7.75e-06 [updatestate_depend_eliminate]: 3.84002e-06 [updatestate_assign_eliminate]: 3.04001e-06 [updatestate_loads_eliminate]: 3.01001e-06 [parameter_eliminate]: 1.00999e-06 [a_2]: 0.00011943 [accelerated_algorithm]: 7.36999e-06 [shard]: 1.12e-06 [meta_shard_fg_expand]: 1.51998e-06 [shard_inline]: 7.61999e-06 [merge_send_recv]: 6.25002e-06 [auto_parallel]: 6.17001e-06 [parallel]: 4.12998e-06 [flash_sp]: 3.46001e-06 [merge_comm]: 4.33999e-06 [allreduce_fusion]: 3.85e-06 [matmul_add_comm_reduction]: 9.71998e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 8.35001e-06 [virtual_dataset]: 7.06001e-06 [get_grad_eliminate_]: 7.01001e-06 [virtual_output]: 6.89999e-06 [merge_forward]: 3.19001e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 6.96001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.601e-05 [merge_recompute_call_nodes]: 7.30011e-07 [before_grad]: 1.177e-05 [set_forward_comm_id_for_comm_node_pass]: 4.3e-06 [meta_fg_expand]: 2.81e-06 [flash_sp_send_recv_attached]: 8.2e-07 [receive_attached]: 1.12e-06 [after_resolve]: 1.299e-05 [a_after_grad]: 1.188e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.37999e-06 [auto_monad_grad]: 9.79984e-07 [auto_monad_eliminator]: 8.62998e-06 [cse]: 1.823e-05 [a_3]: 5.832e-05 [py_interpret_to_execute_after_opt_a]: 1.203e-05 [slice_cell_reuse_recomputed_activation]: 4.48999e-06 [rewriter_after_opt_a]: 4.255e-05 [convert_after_rewriter]: 1.067e-05 [order_py_execute_after_rewriter]: 8.54e-06 [mutable_eliminate]: 0.00048539 [opt_b]: 0.00030911, [1] [Cycle 1]: 0.00029996, [7] [b_1]: 0.00020152 [b_2]: 9.38002e-06 [updatestate_depend_eliminate]: 5.81998e-06 [updatestate_assign_eliminate]: 3.03998e-06 [updatestate_loads_eliminate]: 3.01999e-06 [renormalize]: 3.69997e-07 [cse]: 2.2e-05 [optimize_parallel_all_gather_comm]: 2.104e-05 [overlap_param_gather]: 4.79002e-06 [cconv]: 2.752e-05 [loop_unroll]: 0.00042577 [opt_after_cconv]: 0.00013698, [1] [Cycle 1]: 0.00012838, [7] [c_1]: 3.669e-05 [parameter_eliminate]: 2.64001e-06 [updatestate_depend_eliminate]: 5.82999e-06 [updatestate_assign_eliminate]: 3.21001e-06 [updatestate_loads_eliminate]: 2.94999e-06 [cse]: 2.144e-05 [renormalize]: 3.20026e-07 [remove_dup_value]: 1.819e-05 [tuple_transform]: 0.00010005, [1] [Cycle 1]: 9.265e-05, [4] [d_1]: 5.158e-05 [none_parameter_eliminate]: 1.68002e-06 [renormalize]: 2.40019e-07 [switch_simplify]: 8.36002e-06 [partial_unused_args_eliminate]: 4.27e-06 [add_recomputation]: 5.739e-05 [cse_after_recomputation]: 3.16e-05, [1] [Cycle 1]: 2.463e-05, [1] [cse]: 1.562e-05 [environ_conv]: 9.41998e-06 [swap_dp_allreduce_reducescatter]: 8.84e-06 [bias_add_comm_swap]: 5.00001e-06 [label_micro_interleaved_index]: 6.89999e-06 [label_fine_grained_interleaved_index]: 5.24998e-06 [merge_cast_opt]: 3.79002e-06 [slice_recompute_activation]: 4.45999e-06 [micro_interleaved_order_control]: 5.05001e-06 [assign_add_opt]: 3.53999e-06 [ForceFp32Comm]: 3.15998e-06 [remove_cast_before_assign_add]: 3.35e-06 [full_micro_interleaved_order_control]: 4.77e-06 [reorder_send_recv_between_fp_bp]: 5.44998e-06 [comm_op_add_attrs]: 3.33e-06 [add_comm_op_reuse_tag]: 3.2e-06 [interleave_split_concat_branches]: 3.54002e-06 [interleave_parallel_branches]: 3.38e-06 [overlap_opt_shard_in_pipeline]: 3.50998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.07e-06 [control_data_broadcast_order]: 1.668e-05 [grouped_pairwise_exchange_alltoall]: 3.81001e-06 [offloading_packed_experts]: 7.03e-06 [overlap_recompute_and_grad_model_parallel]: 7.28999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.71001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.75e-06 [overlap_recompute_comm]: 4.65001e-06 [overlap_grad_ring_attention]: 6.75998e-06 [overlap_grad_flash_sp]: 2.469e-05 [begin_end_overlap_inline]: 2.86e-06 [split_matmul_comm_elemetwise]: 4.50999e-06 [split_layernorm_comm]: 3.93999e-06 [handle_group_info]: 3.5e-06 [symbol_engine_optimizer]: 0.00010091, [1] [Cycle 1]: 9.377e-05, [6] [build]: 3.08998e-06 [elim_shapecalc]: 1.009e-05 [elim_not_effective]: 1.53e-05 [opt_reshape]: 8.25e-06 [fold_const_symbol]: 1.208e-05 [renormalize]: 2.10013e-07 [detach_backward]: 3.29001e-06 [pipeline_parallel_scheduler]: 1.74e-06 [auto_monad_reorder]: 2.122e-05 [get_jit_bprop_graph]: 1.75001e-06 [rewriter_after_jit_bprop_graph]: 3.66999e-06 [opt_after_jit_grad]: 0.00046898 [validate]: 3.912e-05 Sums bootstrap : 0.000409s : 3.78% type_inference : 0.005478s : 50.60% event_method : 0.000019s : 0.17% auto_monad : 0.000060s : 0.56% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000005s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000029s : 0.27% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000028s : 0.26% optimize.rewriter_before_opt_a : 0.000090s : 0.83% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000048s : 0.44% optimize.opt_a.loop_unroll : 0.000040s : 0.37% optimize.opt_a.a_1 : 0.000834s : 7.71% optimize.opt_a.with_stream_mark : 0.000025s : 0.23% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000002s : 0.02% optimize.opt_a.a_2 : 0.000251s : 2.32% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.15% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.15% optimize.opt_a.merge_send_recv : 0.000013s : 0.12% optimize.opt_a.auto_parallel : 0.000013s : 0.12% optimize.opt_a.parallel : 0.000019s : 0.17% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.16% optimize.opt_a.virtual_dataset : 0.000016s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000016s : 0.14% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.02% optimize.opt_a.offload_activation : 0.000015s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000002s : 0.02% optimize.opt_a.after_resolve : 0.000026s : 0.24% optimize.opt_a.a_after_grad : 0.000025s : 0.23% optimize.opt_a.renormalize : 0.000593s : 5.48% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000002s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.20% optimize.opt_a.cse : 0.000041s : 0.38% optimize.opt_a.a_3 : 0.000127s : 1.18% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.04% optimize.rewriter_after_opt_a : 0.000043s : 0.39% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000485s : 4.48% optimize.opt_b.b_1 : 0.000202s : 1.86% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000028s : 0.25% optimize.loop_unroll : 0.000426s : 3.93% optimize.opt_after_cconv.c_1 : 0.000037s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000021s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.17% optimize.tuple_transform.d_1 : 0.000052s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000057s : 0.53% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000009s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000025s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000021s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000469s : 4.33% validate : 0.000039s : 0.36% Time group info: ------[substitution.] 0.000186 38 11.37% : 0.000021s : 3: substitution.cast_eliminate 1.20% : 0.000002s : 3: substitution.elim_not_effective 0.88% : 0.000002s : 3: substitution.fold_const_symbol 3.64% : 0.000007s : 5: substitution.graph_param_transform 68.94% : 0.000128s : 4: substitution.inline 1.97% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.91% : 0.000005s : 6: substitution.remove_not_recompute_node 2.35% : 0.000004s : 4: substitution.replace_old_param 6.73% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005434 2 87.66% : 0.004763s : 1: type_inference.infer 12.34% : 0.000671s : 1: type_inference.specialize ------[replace.] 0.000058 8 60.43% : 0.000035s : 4: replace.inline 39.57% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000136 8 92.21% : 0.000125s : 4: match.inline 7.79% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000239 1504 0.86% : 0.000002s : 15: predicate.accumulaten_eliminater 0.69% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.64% : 0.000002s : 10: predicate.addn_check_dump 0.97% : 0.000002s : 15: predicate.addn_zero_filter 0.84% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.02% : 0.000005s : 25: predicate.arithmetic_simplify 0.98% : 0.000002s : 15: predicate.cast_eliminate 0.62% : 0.000001s : 10: predicate.check_bprop_eliminate 0.65% : 0.000002s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.70% : 0.000002s : 10: predicate.depend_value_elim 0.94% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.01% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.93% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 5: predicate.elim_not_effective 0.33% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_depend_swap 1.82% : 0.000004s : 30: predicate.environ_get_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.48% : 0.000004s : 23: predicate.exchange_switch_depend_value 2.14% : 0.000005s : 23: predicate.float_depend_g_call 0.62% : 0.000001s : 10: predicate.float_environ_get_switch 0.89% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.22% : 0.000001s : 5: predicate.fold_const_symbol 0.64% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.65% : 0.000002s : 10: predicate.incorporate_call 0.55% : 0.000001s : 10: predicate.incorporate_call_switch 6.29% : 0.000015s : 68: predicate.inline 0.84% : 0.000002s : 10: predicate.inline_without_move 0.35% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.93% : 0.000002s : 10: predicate.less_batch_normalization 1.84% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.53% : 0.000006s : 44: predicate.load_eliminater 0.89% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.22% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.59% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.63% : 0.000002s : 10: predicate.merge_addn 0.59% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.90% : 0.000002s : 15: predicate.minmaximum_grad 0.96% : 0.000002s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.38% : 0.000001s : 5: predicate.parallel_virtual_node 1.80% : 0.000004s : 23: predicate.partial_defer_inline 1.68% : 0.000004s : 24: predicate.partial_eliminate 0.97% : 0.000002s : 15: predicate.print_const_string_wrapper 0.69% : 0.000002s : 10: predicate.reduce_all_const_elim 1.17% : 0.000003s : 15: predicate.reduce_eliminate 2.48% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 10: predicate.remove_not_recompute_node 1.33% : 0.000003s : 29: predicate.replace_applicator 0.42% : 0.000001s : 10: predicate.replace_old_param 0.28% : 0.000001s : 5: predicate.reset_defer_inline 0.93% : 0.000002s : 15: predicate.reshape_eliminate 0.61% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.73% : 0.000002s : 10: predicate.same_eliminate 0.44% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.79% : 0.000002s : 10: predicate.shard_identity_eliminate 0.70% : 0.000002s : 10: predicate.special_op_eliminate 0.76% : 0.000002s : 10: predicate.specialize_transform 0.84% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.50% : 0.000004s : 23: predicate.switch_defer_inline 2.05% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.80% : 0.000011s : 74: predicate.switch_simplify 0.91% : 0.000002s : 15: predicate.tile_eliminate 0.92% : 0.000002s : 15: predicate.transpose_eliminate 1.64% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.60% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.78% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.27% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.79% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.50% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.24% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 5: predicate.value_based_eliminate 0.65% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000490 11 52.44% : 0.000257s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.56% : 0.000233s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026240 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.76% : 0.003085s : 1: add_attr 11.71% : 0.003072s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000061s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000069s : 1: auto_monad 0.11% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.71% : 0.000450s : 1: bootstrap 0.12% : 0.000031s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000017s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.11% : 0.000029s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.64% : 0.000431s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.87% : 0.000491s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 5.16% : 0.001353s : 78: opt.transform.opt_a 0.13% : 0.000035s : 1: opt.transform.opt_after_cconv 0.11% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.52% : 0.000137s : 28: opt.transform.opt_b 0.22% : 0.000058s : 2: opt.transform.opt_trans_graph 0.16% : 0.000043s : 4: opt.transform.symbol_engine_opt 11.88% : 0.003116s : 1: opt_a 0.54% : 0.000141s : 1: opt_after_cconv 1.82% : 0.000479s : 1: opt_after_jit_grad 1.19% : 0.000313s : 1: opt_b 21.84% : 0.005732s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.11% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000036s : 1: pre_auto_parallel 0.12% : 0.000032s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.20% : 0.000315s : 1: renormalize.infer 1.03% : 0.000270s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000046s : 1: rewriter_after_opt_a 0.36% : 0.000094s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000104s : 1: symbol_engine_optimizer 0.39% : 0.000103s : 1: tuple_transform 21.00% : 0.005510s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:58.282.267 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0317212, [21] [bootstrap]: 0.00042138 [type_inference]: 0.0222636 [event_method]: 2.044e-05 [auto_monad]: 6.454e-05 [graph_reusing]: 6.24999e-06 [inline]: 2.48e-06 [add_attr]: 0.00315, [1] [add_attr_with_inline]: 0.00314199, [1] [Cycle 1]: 5.99e-05, [2] [tag_attr]: 1.871e-05 [meta_addattr_fg_expand]: 6.11e-06 [parallel-infer-symbol]: 3.08998e-06 [pre_auto_parallel]: 3.28e-05 [insert-virtual-dataset]: 2.34001e-06 [parallel-infer-symbol-second]: 8.89995e-07 [dataset_repeat_opt]: 1.84998e-06 [pipeline_split]: 2.06e-06 [optimize]: 0.00504337, [53] [py_interpret_to_execute]: 2.531e-05 [rewriter_before_opt_a]: 8.23e-05 [opt_a]: 0.0029611, [2] [Cycle 1]: 0.0021763, [45] [expand_dump_flag]: 3.06999e-06 [switch_simplify]: 4.378e-05 [loop_unroll]: 3.069e-05 [a_1]: 0.00066667 [with_stream_mark]: 1.559e-05 [recompute_prepare]: 1.048e-05 [updatestate_depend_eliminate]: 4.55999e-06 [updatestate_assign_eliminate]: 3.84002e-06 [updatestate_loads_eliminate]: 3.86999e-06 [parameter_eliminate]: 1.75001e-06 [a_2]: 0.0001023 [accelerated_algorithm]: 8.57998e-06 [shard]: 2.11e-06 [meta_shard_fg_expand]: 1.99e-06 [shard_inline]: 7.92998e-06 [merge_send_recv]: 9.51e-06 [auto_parallel]: 6.83e-06 [parallel]: 1.797e-05 [flash_sp]: 8.38999e-06 [merge_comm]: 4.75001e-06 [allreduce_fusion]: 4.69998e-06 [matmul_add_comm_reduction]: 1.071e-05 [allreduce_slice_to_reducescatter]: 7.59988e-07 [virtual_shard_identity]: 1.074e-05 [virtual_dataset]: 7.81001e-06 [get_grad_eliminate_]: 7.6e-06 [virtual_output]: 7.39002e-06 [merge_forward]: 4.69002e-06 [cell_reuse_recompute_pass]: 1.41998e-06 [offload_activation]: 1.025e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.515e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 1.28e-05 [set_forward_comm_id_for_comm_node_pass]: 4.97e-06 [meta_fg_expand]: 3.80998e-06 [flash_sp_send_recv_attached]: 2.68e-06 [receive_attached]: 2.66999e-06 [after_resolve]: 1.231e-05 [a_after_grad]: 1.164e-05 [renormalize]: 0.00071904 [add_forward_monad_depend]: 5.51e-06 [auto_monad_grad]: 2.28998e-06 [auto_monad_eliminator]: 1.661e-05 [cse]: 3.546e-05 [a_3]: 6.007e-05 [Cycle 2]: 0.00077465, [45] [expand_dump_flag]: 1.35999e-06 [switch_simplify]: 9.53002e-06 [loop_unroll]: 7.58999e-06 [a_1]: 0.00017432 [with_stream_mark]: 1.304e-05 [recompute_prepare]: 8.27e-06 [updatestate_depend_eliminate]: 3.82002e-06 [updatestate_assign_eliminate]: 2.96001e-06 [updatestate_loads_eliminate]: 2.88e-06 [parameter_eliminate]: 1.16002e-06 [a_2]: 9.655e-05 [accelerated_algorithm]: 7.66999e-06 [shard]: 1.72001e-06 [meta_shard_fg_expand]: 2.19001e-06 [shard_inline]: 7.56001e-06 [merge_send_recv]: 6.51999e-06 [auto_parallel]: 5.99999e-06 [parallel]: 5.63002e-06 [flash_sp]: 3.70998e-06 [merge_comm]: 4.73001e-06 [allreduce_fusion]: 4.25e-06 [matmul_add_comm_reduction]: 6.96999e-06 [allreduce_slice_to_reducescatter]: 6.40022e-07 [virtual_shard_identity]: 8.35999e-06 [virtual_dataset]: 7.44002e-06 [get_grad_eliminate_]: 7.41999e-06 [virtual_output]: 6.87002e-06 [merge_forward]: 3.55998e-06 [cell_reuse_recompute_pass]: 1.30001e-06 [offload_activation]: 8.3e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.468e-05 [merge_recompute_call_nodes]: 1.10001e-06 [before_grad]: 1.162e-05 [set_forward_comm_id_for_comm_node_pass]: 4.23999e-06 [meta_fg_expand]: 2.76999e-06 [flash_sp_send_recv_attached]: 7.89994e-07 [receive_attached]: 1.34e-06 [after_resolve]: 1.1e-05 [a_after_grad]: 1.271e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.50999e-06 [auto_monad_grad]: 1.19e-06 [auto_monad_eliminator]: 8.96998e-06 [cse]: 2.036e-05 [a_3]: 4.638e-05 [py_interpret_to_execute_after_opt_a]: 1.14e-05 [slice_cell_reuse_recomputed_activation]: 1.91e-06 [rewriter_after_opt_a]: 4.101e-05 [convert_after_rewriter]: 8.35999e-06 [order_py_execute_after_rewriter]: 5.61e-06 [mutable_eliminate]: 0.00049328 [opt_b]: 0.00024144, [1] [Cycle 1]: 0.00023527, [7] [b_1]: 0.00015441 [b_2]: 9.20001e-06 [updatestate_depend_eliminate]: 6.79999e-06 [updatestate_assign_eliminate]: 2.99001e-06 [updatestate_loads_eliminate]: 3.18e-06 [renormalize]: 7.89994e-07 [cse]: 2.199e-05 [optimize_parallel_all_gather_comm]: 1.748e-05 [overlap_param_gather]: 2.23998e-06 [cconv]: 2.617e-05 [loop_unroll]: 0.00041808 [opt_after_cconv]: 0.00011531, [1] [Cycle 1]: 0.00010966, [7] [c_1]: 3.761e-05 [parameter_eliminate]: 2.94001e-06 [updatestate_depend_eliminate]: 6.01e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 2.88e-06 [cse]: 2.226e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.413e-05 [tuple_transform]: 8.569e-05, [1] [Cycle 1]: 8.146e-05, [4] [d_1]: 5.258e-05 [none_parameter_eliminate]: 2.00002e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 8.32998e-06 [partial_unused_args_eliminate]: 2.06998e-06 [add_recomputation]: 5.54e-05 [cse_after_recomputation]: 2.511e-05, [1] [Cycle 1]: 2.058e-05, [1] [cse]: 1.499e-05 [environ_conv]: 6.23e-06 [swap_dp_allreduce_reducescatter]: 5.89e-06 [bias_add_comm_swap]: 2.76e-06 [label_micro_interleaved_index]: 4.59002e-06 [label_fine_grained_interleaved_index]: 2.85002e-06 [merge_cast_opt]: 1.54e-06 [slice_recompute_activation]: 2.04999e-06 [micro_interleaved_order_control]: 2.26e-06 [assign_add_opt]: 1.43002e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 1.17e-06 [full_micro_interleaved_order_control]: 2.14e-06 [reorder_send_recv_between_fp_bp]: 3.23998e-06 [comm_op_add_attrs]: 1.07e-06 [add_comm_op_reuse_tag]: 9.90025e-07 [interleave_split_concat_branches]: 1.13001e-06 [interleave_parallel_branches]: 1.38002e-06 [overlap_opt_shard_in_pipeline]: 1.39e-06 [overlap_opt_shard_grad_in_pipeline]: 2.01e-06 [control_data_broadcast_order]: 1.574e-05 [grouped_pairwise_exchange_alltoall]: 2.07001e-06 [offloading_packed_experts]: 4.28999e-06 [overlap_recompute_and_grad_model_parallel]: 4.95001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.29e-06 [overlap_recompute_comm]: 2.30002e-06 [overlap_grad_ring_attention]: 4.65001e-06 [overlap_grad_flash_sp]: 2.182e-05 [begin_end_overlap_inline]: 5.59987e-07 [split_matmul_comm_elemetwise]: 2.26e-06 [split_layernorm_comm]: 1.55999e-06 [handle_group_info]: 8.79983e-07 [symbol_engine_optimizer]: 8.474e-05, [1] [Cycle 1]: 8.054e-05, [6] [build]: 3.18998e-06 [elim_shapecalc]: 1.18e-05 [elim_not_effective]: 1.533e-05 [opt_reshape]: 8.43001e-06 [fold_const_symbol]: 1.235e-05 [renormalize]: 2.3999e-07 [detach_backward]: 2.30002e-06 [pipeline_parallel_scheduler]: 1.96003e-06 [auto_monad_reorder]: 1.983e-05 [get_jit_bprop_graph]: 1.25001e-06 [rewriter_after_jit_bprop_graph]: 4.77998e-06 [opt_after_jit_grad]: 0.00048197 [validate]: 4.175e-05 Sums bootstrap : 0.000421s : 1.53% type_inference : 0.022264s : 80.65% event_method : 0.000020s : 0.07% auto_monad : 0.000065s : 0.23% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000033s : 0.12% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000025s : 0.09% optimize.rewriter_before_opt_a : 0.000082s : 0.30% optimize.opt_a.expand_dump_flag : 0.000004s : 0.02% optimize.opt_a.switch_simplify : 0.000053s : 0.19% optimize.opt_a.loop_unroll : 0.000038s : 0.14% optimize.opt_a.a_1 : 0.000841s : 3.05% optimize.opt_a.with_stream_mark : 0.000029s : 0.10% optimize.opt_a.recompute_prepare : 0.000019s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000199s : 0.72% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.06% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000015s : 0.06% optimize.opt_a.merge_send_recv : 0.000016s : 0.06% optimize.opt_a.auto_parallel : 0.000013s : 0.05% optimize.opt_a.parallel : 0.000024s : 0.09% optimize.opt_a.flash_sp : 0.000012s : 0.04% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.07% optimize.opt_a.virtual_dataset : 0.000015s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.05% optimize.opt_a.virtual_output : 0.000014s : 0.05% optimize.opt_a.merge_forward : 0.000008s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000024s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000023s : 0.08% optimize.opt_a.a_after_grad : 0.000024s : 0.09% optimize.opt_a.renormalize : 0.000719s : 2.60% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.03% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.09% optimize.opt_a.cse : 0.000056s : 0.20% optimize.opt_a.a_3 : 0.000106s : 0.39% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000041s : 0.15% optimize.convert_after_rewriter : 0.000008s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000493s : 1.79% optimize.opt_b.b_1 : 0.000154s : 0.56% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000026s : 0.09% optimize.loop_unroll : 0.000418s : 1.51% optimize.opt_after_cconv.c_1 : 0.000038s : 0.14% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000022s : 0.08% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.05% optimize.tuple_transform.d_1 : 0.000053s : 0.19% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000055s : 0.20% optimize.cse_after_recomputation.cse : 0.000015s : 0.05% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000022s : 0.08% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.07% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000482s : 1.75% validate : 0.000042s : 0.15% Time group info: ------[substitution.] 0.000201 38 11.53% : 0.000023s : 3: substitution.cast_eliminate 1.04% : 0.000002s : 3: substitution.elim_not_effective 0.87% : 0.000002s : 3: substitution.fold_const_symbol 3.10% : 0.000006s : 5: substitution.graph_param_transform 69.57% : 0.000140s : 4: substitution.inline 2.32% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.88% : 0.000006s : 6: substitution.remove_not_recompute_node 2.18% : 0.000004s : 4: substitution.replace_old_param 6.50% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.022199 2 96.70% : 0.021465s : 1: type_inference.infer 3.30% : 0.000734s : 1: type_inference.specialize ------[replace.] 0.000061 8 61.83% : 0.000037s : 4: replace.inline 38.17% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000148 8 92.43% : 0.000137s : 4: match.inline 7.57% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000235 1504 0.91% : 0.000002s : 15: predicate.accumulaten_eliminater 0.68% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.57% : 0.000001s : 10: predicate.addn_check_dump 0.88% : 0.000002s : 15: predicate.addn_zero_filter 0.81% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.93% : 0.000005s : 25: predicate.arithmetic_simplify 1.05% : 0.000002s : 15: predicate.cast_eliminate 0.60% : 0.000001s : 10: predicate.check_bprop_eliminate 0.59% : 0.000001s : 10: predicate.compare_switch_simplify 0.21% : 0.000001s : 5: predicate.const_output_eliminate 0.85% : 0.000002s : 10: predicate.depend_value_elim 0.98% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.97% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.03% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.38% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_depend_swap 1.68% : 0.000004s : 30: predicate.environ_get_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.38% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.23% : 0.000005s : 23: predicate.float_depend_g_call 0.56% : 0.000001s : 10: predicate.float_environ_get_switch 0.84% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.71% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.65% : 0.000002s : 10: predicate.incorporate_call 0.58% : 0.000001s : 10: predicate.incorporate_call_switch 6.41% : 0.000015s : 68: predicate.inline 0.88% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.86% : 0.000002s : 10: predicate.less_batch_normalization 1.73% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.59% : 0.000006s : 44: predicate.load_eliminater 0.88% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.14% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.67% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.62% : 0.000001s : 10: predicate.merge_addn 0.60% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.62% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 15: predicate.minmaximum_grad 0.99% : 0.000002s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.32% : 0.000001s : 5: predicate.parallel_virtual_node 1.65% : 0.000004s : 23: predicate.partial_defer_inline 1.65% : 0.000004s : 24: predicate.partial_eliminate 0.86% : 0.000002s : 15: predicate.print_const_string_wrapper 0.71% : 0.000002s : 10: predicate.reduce_all_const_elim 1.19% : 0.000003s : 15: predicate.reduce_eliminate 2.49% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 10: predicate.remove_not_recompute_node 1.36% : 0.000003s : 29: predicate.replace_applicator 0.50% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 0.93% : 0.000002s : 15: predicate.reshape_eliminate 0.65% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 5: predicate.row_tensor_eliminate 0.77% : 0.000002s : 10: predicate.same_eliminate 0.48% : 0.000001s : 10: predicate.set_cell_output_no_recompute 1.07% : 0.000003s : 10: predicate.shard_identity_eliminate 0.70% : 0.000002s : 10: predicate.special_op_eliminate 0.77% : 0.000002s : 10: predicate.specialize_transform 0.84% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.46% : 0.000003s : 23: predicate.switch_defer_inline 2.06% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.06% : 0.000012s : 74: predicate.switch_simplify 0.89% : 0.000002s : 15: predicate.tile_eliminate 0.90% : 0.000002s : 15: predicate.transpose_eliminate 1.49% : 0.000003s : 25: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.60% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.51% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.58% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.74% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.44% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.12% : 0.000007s : 54: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 5: predicate.value_based_eliminate 0.73% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 10: predicate.virtual_output_eliminate 0.37% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000559 11 56.04% : 0.000313s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.96% : 0.000246s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.042201 192 0.01% : 0.000003s : 1: ForceFp32Comm 7.48% : 0.003155s : 1: add_attr 7.45% : 0.003146s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.14% : 0.000060s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.17% : 0.000071s : 1: auto_monad 0.06% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.06% : 0.000449s : 1: bootstrap 0.07% : 0.000030s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.05% : 0.000019s : 1: control_data_broadcast_order 0.03% : 0.000012s : 1: convert_after_rewriter 0.07% : 0.000028s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000010s : 1: environ_conv 0.07% : 0.000028s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.01% : 0.000426s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.19% : 0.000502s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.04% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000017s : 1: opt.transform.mutable_eliminate 3.23% : 0.001361s : 78: opt.transform.opt_a 0.09% : 0.000036s : 1: opt.transform.opt_after_cconv 0.07% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.31% : 0.000132s : 28: opt.transform.opt_b 0.14% : 0.000059s : 2: opt.transform.opt_trans_graph 0.10% : 0.000044s : 4: opt.transform.symbol_engine_opt 7.02% : 0.002964s : 1: opt_a 0.28% : 0.000119s : 1: opt_after_cconv 1.16% : 0.000491s : 1: opt_after_jit_grad 0.58% : 0.000245s : 1: opt_b 11.96% : 0.005048s : 1: optimize 0.05% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.06% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.09% : 0.000037s : 1: pre_auto_parallel 0.07% : 0.000030s : 1: py_interpret_to_execute 0.04% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000018s : 1: remove_dup_value 1.01% : 0.000424s : 1: renormalize.infer 0.68% : 0.000286s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000045s : 1: rewriter_after_opt_a 0.20% : 0.000086s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.21% : 0.000088s : 1: symbol_engine_optimizer 0.21% : 0.000089s : 1: tuple_transform 52.80% : 0.022283s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:58.727.299 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:58.727.545 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0488125, [21] [bootstrap]: 0.0165437 [type_inference]: 0.00566732 [event_method]: 2.034e-05 [auto_monad]: 6.711e-05 [graph_reusing]: 6.88e-06 [inline]: 1.86e-06 [add_attr]: 0.00318846, [1] [add_attr_with_inline]: 0.00317865, [1] [Cycle 1]: 7.422e-05, [2] [tag_attr]: 1.922e-05 [meta_addattr_fg_expand]: 6.36e-06 [parallel-infer-symbol]: 3.01001e-06 [pre_auto_parallel]: 3.311e-05 [insert-virtual-dataset]: 2.22999e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.11e-06 [pipeline_split]: 1.82001e-06 [optimize]: 0.00579965, [53] [py_interpret_to_execute]: 3.126e-05 [rewriter_before_opt_a]: 8.841e-05 [opt_a]: 0.00337601, [2] [Cycle 1]: 0.00239292, [45] [expand_dump_flag]: 2.93e-06 [switch_simplify]: 4.321e-05 [loop_unroll]: 3.23e-05 [a_1]: 0.00068883 [with_stream_mark]: 1.791e-05 [recompute_prepare]: 1.217e-05 [updatestate_depend_eliminate]: 5.05001e-06 [updatestate_assign_eliminate]: 4.05998e-06 [updatestate_loads_eliminate]: 4.21001e-06 [parameter_eliminate]: 2.25002e-06 [a_2]: 0.00013179 [accelerated_algorithm]: 9.19e-06 [shard]: 2.35002e-06 [meta_shard_fg_expand]: 2.04e-06 [shard_inline]: 8.30999e-06 [merge_send_recv]: 1.086e-05 [auto_parallel]: 8.33999e-06 [parallel]: 1.899e-05 [flash_sp]: 9.21998e-06 [merge_comm]: 4.50999e-06 [allreduce_fusion]: 4.43999e-06 [matmul_add_comm_reduction]: 1.12e-05 [allreduce_slice_to_reducescatter]: 9.5999e-07 [virtual_shard_identity]: 1.015e-05 [virtual_dataset]: 8.60999e-06 [get_grad_eliminate_]: 7.90998e-06 [virtual_output]: 7.87e-06 [merge_forward]: 4.47e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 1.114e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.865e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.299e-05 [set_forward_comm_id_for_comm_node_pass]: 4.79e-06 [meta_fg_expand]: 3.43999e-06 [flash_sp_send_recv_attached]: 2.65002e-06 [receive_attached]: 2.56e-06 [after_resolve]: 1.396e-05 [a_after_grad]: 1.341e-05 [renormalize]: 0.00067411 [add_forward_monad_depend]: 6.24001e-06 [auto_monad_grad]: 1.99999e-06 [auto_monad_eliminator]: 1.728e-05 [cse]: 3.753e-05 [a_3]: 7.383e-05 [Cycle 2]: 0.00096922, [45] [expand_dump_flag]: 1.42e-06 [switch_simplify]: 9.17999e-06 [loop_unroll]: 7.7e-06 [a_1]: 0.00017691 [with_stream_mark]: 1.435e-05 [recompute_prepare]: 8.41002e-06 [updatestate_depend_eliminate]: 4.46002e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 2.94999e-06 [parameter_eliminate]: 1.05001e-06 [a_2]: 0.00012052 [accelerated_algorithm]: 7.67002e-06 [shard]: 1.86e-06 [meta_shard_fg_expand]: 1.64e-06 [shard_inline]: 8.03999e-06 [merge_send_recv]: 6.02001e-06 [auto_parallel]: 6.89999e-06 [parallel]: 5.56e-06 [flash_sp]: 3.56999e-06 [merge_comm]: 4.38001e-06 [allreduce_fusion]: 4.25e-06 [matmul_add_comm_reduction]: 7.66999e-06 [allreduce_slice_to_reducescatter]: 5.19998e-07 [virtual_shard_identity]: 8.97e-06 [virtual_dataset]: 7.48e-06 [get_grad_eliminate_]: 7.32997e-06 [virtual_output]: 7.55e-06 [merge_forward]: 3.95e-06 [cell_reuse_recompute_pass]: 1.67001e-06 [offload_activation]: 8.60001e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.03e-05 [merge_recompute_call_nodes]: 8.80013e-07 [before_grad]: 1.243e-05 [set_forward_comm_id_for_comm_node_pass]: 5.07e-06 [meta_fg_expand]: 3.04001e-06 [flash_sp_send_recv_attached]: 1.73002e-06 [receive_attached]: 1.00001e-06 [after_resolve]: 1.234e-05 [a_after_grad]: 1.122e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.47001e-06 [auto_monad_grad]: 1.30999e-06 [auto_monad_eliminator]: 1.087e-05 [cse]: 2.685e-05 [a_3]: 6.106e-05 [py_interpret_to_execute_after_opt_a]: 1.49e-05 [slice_cell_reuse_recomputed_activation]: 4.63999e-06 [rewriter_after_opt_a]: 4.822e-05 [convert_after_rewriter]: 1.151e-05 [order_py_execute_after_rewriter]: 8.62e-06 [mutable_eliminate]: 0.00051438 [opt_b]: 0.00031744, [1] [Cycle 1]: 0.00030848, [7] [b_1]: 0.00020127 [b_2]: 9.85002e-06 [updatestate_depend_eliminate]: 7.59002e-06 [updatestate_assign_eliminate]: 3.09001e-06 [updatestate_loads_eliminate]: 3.06999e-06 [renormalize]: 4.39992e-07 [cse]: 2.619e-05 [optimize_parallel_all_gather_comm]: 2.1e-05 [overlap_param_gather]: 5.18002e-06 [cconv]: 3.104e-05 [loop_unroll]: 0.00045461 [opt_after_cconv]: 0.00014522, [1] [Cycle 1]: 0.00013591, [7] [c_1]: 3.874e-05 [parameter_eliminate]: 3.16999e-06 [updatestate_depend_eliminate]: 6.64001e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 3.18e-06 [cse]: 2.312e-05 [renormalize]: 6.00005e-07 [remove_dup_value]: 1.827e-05 [tuple_transform]: 0.0001031, [1] [Cycle 1]: 9.59e-05, [4] [d_1]: 5.271e-05 [none_parameter_eliminate]: 2.07999e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 8.81002e-06 [partial_unused_args_eliminate]: 4.63001e-06 [add_recomputation]: 5.723e-05 [cse_after_recomputation]: 3.067e-05, [1] [Cycle 1]: 2.395e-05, [1] [cse]: 1.468e-05 [environ_conv]: 9.48002e-06 [swap_dp_allreduce_reducescatter]: 8.97e-06 [bias_add_comm_swap]: 5.19e-06 [label_micro_interleaved_index]: 7.78999e-06 [label_fine_grained_interleaved_index]: 5.22999e-06 [merge_cast_opt]: 3.59002e-06 [slice_recompute_activation]: 4.51002e-06 [micro_interleaved_order_control]: 4.53001e-06 [assign_add_opt]: 3.63999e-06 [ForceFp32Comm]: 3.16001e-06 [remove_cast_before_assign_add]: 3.31001e-06 [full_micro_interleaved_order_control]: 4.43999e-06 [reorder_send_recv_between_fp_bp]: 5.61998e-06 [comm_op_add_attrs]: 3.36001e-06 [add_comm_op_reuse_tag]: 3.19001e-06 [interleave_split_concat_branches]: 3.50998e-06 [interleave_parallel_branches]: 3.63e-06 [overlap_opt_shard_in_pipeline]: 3.55998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.35999e-06 [control_data_broadcast_order]: 1.904e-05 [grouped_pairwise_exchange_alltoall]: 3.84002e-06 [offloading_packed_experts]: 7.26999e-06 [overlap_recompute_and_grad_model_parallel]: 8.07e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.78001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.88001e-06 [overlap_recompute_comm]: 5.12e-06 [overlap_grad_ring_attention]: 7.01001e-06 [overlap_grad_flash_sp]: 2.474e-05 [begin_end_overlap_inline]: 3.33e-06 [split_matmul_comm_elemetwise]: 4.51002e-06 [split_layernorm_comm]: 4.10998e-06 [handle_group_info]: 3.2e-06 [symbol_engine_optimizer]: 0.00010827, [1] [Cycle 1]: 0.00010144, [6] [build]: 3.56999e-06 [elim_shapecalc]: 1.129e-05 [elim_not_effective]: 1.582e-05 [opt_reshape]: 8.82e-06 [fold_const_symbol]: 1.259e-05 [renormalize]: 2.29978e-07 [detach_backward]: 3.73999e-06 [pipeline_parallel_scheduler]: 1.79e-06 [auto_monad_reorder]: 2.406e-05 [get_jit_bprop_graph]: 1.93002e-06 [rewriter_after_jit_bprop_graph]: 4.63999e-06 [opt_after_jit_grad]: 0.0167437 [validate]: 4.967e-05 Sums bootstrap : 0.016544s : 37.80% type_inference : 0.005667s : 12.95% event_method : 0.000020s : 0.05% auto_monad : 0.000067s : 0.15% graph_reusing : 0.000007s : 0.02% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000033s : 0.08% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000031s : 0.07% optimize.rewriter_before_opt_a : 0.000088s : 0.20% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000052s : 0.12% optimize.opt_a.loop_unroll : 0.000040s : 0.09% optimize.opt_a.a_1 : 0.000866s : 1.98% optimize.opt_a.with_stream_mark : 0.000032s : 0.07% optimize.opt_a.recompute_prepare : 0.000021s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000252s : 0.58% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.04% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.04% optimize.opt_a.merge_send_recv : 0.000017s : 0.04% optimize.opt_a.auto_parallel : 0.000015s : 0.03% optimize.opt_a.parallel : 0.000025s : 0.06% optimize.opt_a.flash_sp : 0.000013s : 0.03% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.04% optimize.opt_a.virtual_dataset : 0.000016s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.03% optimize.opt_a.virtual_output : 0.000015s : 0.04% optimize.opt_a.merge_forward : 0.000008s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000020s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000026s : 0.06% optimize.opt_a.a_after_grad : 0.000025s : 0.06% optimize.opt_a.renormalize : 0.000674s : 1.54% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.06% optimize.opt_a.cse : 0.000064s : 0.15% optimize.opt_a.a_3 : 0.000135s : 0.31% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000048s : 0.11% optimize.convert_after_rewriter : 0.000012s : 0.03% optimize.order_py_execute_after_rewriter : 0.000009s : 0.02% optimize.mutable_eliminate : 0.000514s : 1.18% optimize.opt_b.b_1 : 0.000201s : 0.46% optimize.opt_b.b_2 : 0.000010s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000026s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.05% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000031s : 0.07% optimize.loop_unroll : 0.000455s : 1.04% optimize.opt_after_cconv.c_1 : 0.000039s : 0.09% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000023s : 0.05% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.04% optimize.tuple_transform.d_1 : 0.000053s : 0.12% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.02% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000057s : 0.13% optimize.cse_after_recomputation.cse : 0.000015s : 0.03% optimize.environ_conv : 0.000009s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.02% optimize.bias_add_comm_swap : 0.000005s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000004s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000019s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000025s : 0.06% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000024s : 0.05% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.016744s : 38.26% validate : 0.000050s : 0.11% Time group info: ------[substitution.] 0.000219 38 11.59% : 0.000025s : 3: substitution.cast_eliminate 1.07% : 0.000002s : 3: substitution.elim_not_effective 0.86% : 0.000002s : 3: substitution.fold_const_symbol 3.11% : 0.000007s : 5: substitution.graph_param_transform 64.48% : 0.000141s : 4: substitution.inline 2.17% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.21% : 0.000007s : 6: substitution.remove_not_recompute_node 2.24% : 0.000005s : 4: substitution.replace_old_param 11.28% : 0.000025s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005612 2 87.22% : 0.004894s : 1: type_inference.infer 12.78% : 0.000717s : 1: type_inference.specialize ------[replace.] 0.000062 8 63.00% : 0.000039s : 4: replace.inline 37.00% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000161 8 85.85% : 0.000138s : 4: match.inline 14.15% : 0.000023s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000243 1504 0.84% : 0.000002s : 15: predicate.accumulaten_eliminater 1.21% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.59% : 0.000001s : 10: predicate.addn_check_dump 0.87% : 0.000002s : 15: predicate.addn_zero_filter 0.79% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 1.92% : 0.000005s : 25: predicate.arithmetic_simplify 0.97% : 0.000002s : 15: predicate.cast_eliminate 0.64% : 0.000002s : 10: predicate.check_bprop_eliminate 0.57% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.67% : 0.000002s : 10: predicate.depend_value_elim 0.89% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.06% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.81% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.07% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.47% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.07% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_depend_swap 1.74% : 0.000004s : 30: predicate.environ_get_eliminate 1.07% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.37% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.31% : 0.000006s : 23: predicate.float_depend_g_call 0.56% : 0.000001s : 10: predicate.float_environ_get_switch 0.83% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.69% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.63% : 0.000002s : 10: predicate.incorporate_call 0.55% : 0.000001s : 10: predicate.incorporate_call_switch 6.40% : 0.000016s : 68: predicate.inline 0.88% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.83% : 0.000002s : 10: predicate.less_batch_normalization 1.85% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.59% : 0.000006s : 44: predicate.load_eliminater 0.92% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.18% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.57% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.60% : 0.000001s : 10: predicate.merge_addn 0.60% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 15: predicate.minmaximum_grad 0.94% : 0.000002s : 5: predicate.mutable_eliminate 0.38% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.66% : 0.000004s : 23: predicate.partial_defer_inline 1.61% : 0.000004s : 24: predicate.partial_eliminate 0.90% : 0.000002s : 15: predicate.print_const_string_wrapper 0.72% : 0.000002s : 10: predicate.reduce_all_const_elim 1.18% : 0.000003s : 15: predicate.reduce_eliminate 2.51% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.60% : 0.000001s : 10: predicate.remove_not_recompute_node 1.36% : 0.000003s : 29: predicate.replace_applicator 0.46% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 0.99% : 0.000002s : 15: predicate.reshape_eliminate 0.69% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 5: predicate.row_tensor_eliminate 0.71% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 10: predicate.shard_identity_eliminate 0.78% : 0.000002s : 10: predicate.special_op_eliminate 0.83% : 0.000002s : 10: predicate.specialize_transform 0.82% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.47% : 0.000004s : 23: predicate.switch_defer_inline 2.20% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.81% : 0.000012s : 74: predicate.switch_simplify 0.88% : 0.000002s : 15: predicate.tile_eliminate 1.00% : 0.000002s : 15: predicate.transpose_eliminate 1.50% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.39% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.64% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.76% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.47% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.13% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 5: predicate.value_based_eliminate 0.65% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.69% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000505 11 52.30% : 0.000264s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.70% : 0.000241s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.060083 192 0.01% : 0.000006s : 1: ForceFp32Comm 5.32% : 0.003197s : 1: add_attr 5.30% : 0.003183s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.10% : 0.000061s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.13% : 0.000079s : 1: auto_monad 0.05% : 0.000032s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 27.62% : 0.016595s : 1: bootstrap 0.06% : 0.000034s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.04% : 0.000022s : 1: control_data_broadcast_order 0.02% : 0.000015s : 1: convert_after_rewriter 0.06% : 0.000034s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000020s : 1: detach_backward 0.02% : 0.000012s : 1: environ_conv 0.05% : 0.000032s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000014s : 1: graph_reusing 0.01% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000007s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000011s : 1: label_micro_interleaved_index 0.77% : 0.000461s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 0.87% : 0.000521s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.03% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000018s : 1: opt.transform.mutable_eliminate 2.33% : 0.001400s : 78: opt.transform.opt_a 0.06% : 0.000037s : 1: opt.transform.opt_after_cconv 0.07% : 0.000039s : 1: opt.transform.opt_after_jit_grad 0.23% : 0.000137s : 28: opt.transform.opt_b 0.10% : 0.000059s : 2: opt.transform.opt_trans_graph 0.07% : 0.000045s : 4: opt.transform.symbol_engine_opt 5.62% : 0.003379s : 1: opt_a 0.25% : 0.000149s : 1: opt_after_cconv 27.89% : 0.016760s : 1: opt_after_jit_grad 0.53% : 0.000321s : 1: opt_b 10.29% : 0.006180s : 1: optimize 0.04% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.05% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.02% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.07% : 0.000041s : 1: pre_auto_parallel 0.06% : 0.000035s : 1: py_interpret_to_execute 0.03% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.04% : 0.000022s : 1: remove_dup_value 0.62% : 0.000375s : 1: renormalize.infer 0.48% : 0.000290s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000052s : 1: rewriter_after_opt_a 0.15% : 0.000093s : 1: rewriter_before_opt_a 0.01% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.19% : 0.000112s : 1: symbol_engine_optimizer 0.18% : 0.000106s : 1: tuple_transform 9.50% : 0.005706s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:59.126.300 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0152497, [21] [bootstrap]: 0.00043322 [type_inference]: 0.00584132 [event_method]: 1.916e-05 [auto_monad]: 6.51e-05 [graph_reusing]: 5.91e-06 [inline]: 2.16998e-06 [add_attr]: 0.00313298, [1] [add_attr_with_inline]: 0.00312418, [1] [Cycle 1]: 5.933e-05, [2] [tag_attr]: 1.939e-05 [meta_addattr_fg_expand]: 6.17001e-06 [parallel-infer-symbol]: 2.94999e-06 [pre_auto_parallel]: 3.407e-05 [insert-virtual-dataset]: 2.46998e-06 [parallel-infer-symbol-second]: 6.69999e-07 [dataset_repeat_opt]: 1.94e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00501659, [53] [py_interpret_to_execute]: 2.704e-05 [rewriter_before_opt_a]: 8.482e-05 [opt_a]: 0.00286999, [2] [Cycle 1]: 0.00209608, [45] [expand_dump_flag]: 3.46001e-06 [switch_simplify]: 4.385e-05 [loop_unroll]: 3.162e-05 [a_1]: 0.00066549 [with_stream_mark]: 1.629e-05 [recompute_prepare]: 1.102e-05 [updatestate_depend_eliminate]: 4.97999e-06 [updatestate_assign_eliminate]: 3.91999e-06 [updatestate_loads_eliminate]: 3.66001e-06 [parameter_eliminate]: 1.97999e-06 [a_2]: 0.00010177 [accelerated_algorithm]: 8.33999e-06 [shard]: 2.14e-06 [meta_shard_fg_expand]: 2.20002e-06 [shard_inline]: 8.14002e-06 [merge_send_recv]: 9.14e-06 [auto_parallel]: 7.08e-06 [parallel]: 1.888e-05 [flash_sp]: 8.72e-06 [merge_comm]: 4.74e-06 [allreduce_fusion]: 4.26001e-06 [matmul_add_comm_reduction]: 9.72999e-06 [allreduce_slice_to_reducescatter]: 7.10017e-07 [virtual_shard_identity]: 9.57999e-06 [virtual_dataset]: 7.85e-06 [get_grad_eliminate_]: 7.56001e-06 [virtual_output]: 7.82e-06 [merge_forward]: 4.33001e-06 [cell_reuse_recompute_pass]: 1.22e-06 [offload_activation]: 1.039e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.54e-05 [merge_recompute_call_nodes]: 1.97001e-06 [before_grad]: 1.3e-05 [set_forward_comm_id_for_comm_node_pass]: 4.32e-06 [meta_fg_expand]: 3.3e-06 [flash_sp_send_recv_attached]: 2.54999e-06 [receive_attached]: 2.32999e-06 [after_resolve]: 1.29e-05 [a_after_grad]: 1.201e-05 [renormalize]: 0.00064288 [add_forward_monad_depend]: 5.64998e-06 [auto_monad_grad]: 2.30002e-06 [auto_monad_eliminator]: 1.568e-05 [cse]: 3.416e-05 [a_3]: 5.736e-05 [Cycle 2]: 0.00076421, [45] [expand_dump_flag]: 1.00001e-06 [switch_simplify]: 9.24e-06 [loop_unroll]: 7.48e-06 [a_1]: 0.00017547 [with_stream_mark]: 1.194e-05 [recompute_prepare]: 7.62002e-06 [updatestate_depend_eliminate]: 3.95e-06 [updatestate_assign_eliminate]: 2.88998e-06 [updatestate_loads_eliminate]: 3.11999e-06 [parameter_eliminate]: 1.19e-06 [a_2]: 9.449e-05 [accelerated_algorithm]: 7.5e-06 [shard]: 1.50001e-06 [meta_shard_fg_expand]: 1.89e-06 [shard_inline]: 7.28999e-06 [merge_send_recv]: 6.21998e-06 [auto_parallel]: 6.27001e-06 [parallel]: 4.98001e-06 [flash_sp]: 3.47002e-06 [merge_comm]: 4.49002e-06 [allreduce_fusion]: 3.9e-06 [matmul_add_comm_reduction]: 6.31998e-06 [allreduce_slice_to_reducescatter]: 5.3001e-07 [virtual_shard_identity]: 8.17003e-06 [virtual_dataset]: 7.16999e-06 [get_grad_eliminate_]: 6.89999e-06 [virtual_output]: 6.81001e-06 [merge_forward]: 3.41999e-06 [cell_reuse_recompute_pass]: 1.48002e-06 [offload_activation]: 7.45998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.434e-05 [merge_recompute_call_nodes]: 9.39996e-07 [before_grad]: 1.393e-05 [set_forward_comm_id_for_comm_node_pass]: 4.84e-06 [meta_fg_expand]: 2.93998e-06 [flash_sp_send_recv_attached]: 1.34e-06 [receive_attached]: 9.99979e-07 [after_resolve]: 1.171e-05 [a_after_grad]: 1.13e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.60001e-06 [auto_monad_grad]: 9.5999e-07 [auto_monad_eliminator]: 9.25001e-06 [cse]: 1.831e-05 [a_3]: 4.51e-05 [py_interpret_to_execute_after_opt_a]: 9.52999e-06 [slice_cell_reuse_recomputed_activation]: 2.14e-06 [rewriter_after_opt_a]: 4.01e-05 [convert_after_rewriter]: 7.85e-06 [order_py_execute_after_rewriter]: 5.81998e-06 [mutable_eliminate]: 0.00047333 [opt_b]: 0.00025268, [1] [Cycle 1]: 0.00024671, [7] [b_1]: 0.00015415 [b_2]: 9.46998e-06 [updatestate_depend_eliminate]: 6.23e-06 [updatestate_assign_eliminate]: 3.16999e-06 [updatestate_loads_eliminate]: 3.01001e-06 [renormalize]: 6.59988e-07 [cse]: 2.387e-05 [optimize_parallel_all_gather_comm]: 1.829e-05 [overlap_param_gather]: 1.86e-06 [cconv]: 2.515e-05 [loop_unroll]: 0.00047486 [opt_after_cconv]: 0.0001233, [1] [Cycle 1]: 0.00011758, [7] [c_1]: 4.075e-05 [parameter_eliminate]: 2.91999e-06 [updatestate_depend_eliminate]: 6.51999e-06 [updatestate_assign_eliminate]: 3.46001e-06 [updatestate_loads_eliminate]: 3.30998e-06 [cse]: 2.366e-05 [renormalize]: 5.89993e-07 [remove_dup_value]: 1.517e-05 [tuple_transform]: 8.894e-05, [1] [Cycle 1]: 8.454e-05, [4] [d_1]: 5.532e-05 [none_parameter_eliminate]: 1.63002e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 8.13001e-06 [partial_unused_args_eliminate]: 2.10002e-06 [add_recomputation]: 5.584e-05 [cse_after_recomputation]: 2.634e-05, [1] [Cycle 1]: 2.147e-05, [1] [cse]: 1.581e-05 [environ_conv]: 6.13002e-06 [swap_dp_allreduce_reducescatter]: 6.07999e-06 [bias_add_comm_swap]: 2.73998e-06 [label_micro_interleaved_index]: 4.22e-06 [label_fine_grained_interleaved_index]: 3.03998e-06 [merge_cast_opt]: 1.37e-06 [slice_recompute_activation]: 2.26998e-06 [micro_interleaved_order_control]: 2.09e-06 [assign_add_opt]: 1.39998e-06 [ForceFp32Comm]: 7.89994e-07 [remove_cast_before_assign_add]: 1.05001e-06 [full_micro_interleaved_order_control]: 2.41998e-06 [reorder_send_recv_between_fp_bp]: 2.83998e-06 [comm_op_add_attrs]: 9.5999e-07 [add_comm_op_reuse_tag]: 9.29984e-07 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.39e-06 [overlap_opt_shard_in_pipeline]: 1.14998e-06 [overlap_opt_shard_grad_in_pipeline]: 2.00002e-06 [control_data_broadcast_order]: 1.547e-05 [grouped_pairwise_exchange_alltoall]: 1.46998e-06 [offloading_packed_experts]: 4.17e-06 [overlap_recompute_and_grad_model_parallel]: 5.29e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.36002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.38002e-06 [overlap_recompute_comm]: 2.26e-06 [overlap_grad_ring_attention]: 4.84e-06 [overlap_grad_flash_sp]: 2.104e-05 [begin_end_overlap_inline]: 5.89993e-07 [split_matmul_comm_elemetwise]: 2.16e-06 [split_layernorm_comm]: 1.64e-06 [handle_group_info]: 1.02e-06 [symbol_engine_optimizer]: 8.378e-05, [1] [Cycle 1]: 7.926e-05, [6] [build]: 3.75e-06 [elim_shapecalc]: 1.103e-05 [elim_not_effective]: 1.491e-05 [opt_reshape]: 8.64e-06 [fold_const_symbol]: 1.23e-05 [renormalize]: 2.40019e-07 [detach_backward]: 2.18002e-06 [pipeline_parallel_scheduler]: 1.53002e-06 [auto_monad_reorder]: 1.961e-05 [get_jit_bprop_graph]: 1.37e-06 [rewriter_after_jit_bprop_graph]: 3.97e-06 [opt_after_jit_grad]: 0.00047097 [validate]: 4e-05 Sums bootstrap : 0.000433s : 3.89% type_inference : 0.005841s : 52.44% event_method : 0.000019s : 0.17% auto_monad : 0.000065s : 0.58% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.31% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000027s : 0.24% optimize.rewriter_before_opt_a : 0.000085s : 0.76% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000053s : 0.48% optimize.opt_a.loop_unroll : 0.000039s : 0.35% optimize.opt_a.a_1 : 0.000841s : 7.55% optimize.opt_a.with_stream_mark : 0.000028s : 0.25% optimize.opt_a.recompute_prepare : 0.000019s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000196s : 1.76% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.14% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.14% optimize.opt_a.merge_send_recv : 0.000015s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.12% optimize.opt_a.parallel : 0.000024s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.16% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000027s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.22% optimize.opt_a.a_after_grad : 0.000023s : 0.21% optimize.opt_a.renormalize : 0.000643s : 5.77% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.22% optimize.opt_a.cse : 0.000052s : 0.47% optimize.opt_a.a_3 : 0.000102s : 0.92% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.36% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000473s : 4.25% optimize.opt_b.b_1 : 0.000154s : 1.38% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000024s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000025s : 0.23% optimize.loop_unroll : 0.000475s : 4.26% optimize.opt_after_cconv.c_1 : 0.000041s : 0.37% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.21% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000015s : 0.14% optimize.tuple_transform.d_1 : 0.000055s : 0.50% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000056s : 0.50% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000471s : 4.23% validate : 0.000040s : 0.36% Time group info: ------[substitution.] 0.000202 38 11.62% : 0.000023s : 3: substitution.cast_eliminate 1.00% : 0.000002s : 3: substitution.elim_not_effective 0.83% : 0.000002s : 3: substitution.fold_const_symbol 3.39% : 0.000007s : 5: substitution.graph_param_transform 69.46% : 0.000140s : 4: substitution.inline 2.39% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.86% : 0.000006s : 6: substitution.remove_not_recompute_node 2.21% : 0.000004s : 4: substitution.replace_old_param 6.24% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005781 2 87.65% : 0.005067s : 1: type_inference.infer 12.35% : 0.000714s : 1: type_inference.specialize ------[replace.] 0.000059 8 62.23% : 0.000037s : 4: replace.inline 37.77% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000148 8 92.75% : 0.000138s : 4: match.inline 7.25% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000236 1504 0.87% : 0.000002s : 15: predicate.accumulaten_eliminater 0.69% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 10: predicate.addn_check_dump 0.84% : 0.000002s : 15: predicate.addn_zero_filter 0.82% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.02% : 0.000005s : 25: predicate.arithmetic_simplify 1.05% : 0.000002s : 15: predicate.cast_eliminate 0.65% : 0.000002s : 10: predicate.check_bprop_eliminate 0.62% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.67% : 0.000002s : 10: predicate.depend_value_elim 0.92% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.03% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.94% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.40% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 20: predicate.environ_get_depend_swap 1.72% : 0.000004s : 30: predicate.environ_get_eliminate 1.12% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.37% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.33% : 0.000005s : 23: predicate.float_depend_g_call 0.58% : 0.000001s : 10: predicate.float_environ_get_switch 0.83% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.70% : 0.000002s : 10: predicate.get_grad_eliminate 0.25% : 0.000001s : 5: predicate.graph_param_transform 0.68% : 0.000002s : 10: predicate.incorporate_call 0.56% : 0.000001s : 10: predicate.incorporate_call_switch 6.55% : 0.000015s : 68: predicate.inline 1.08% : 0.000003s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.90% : 0.000002s : 10: predicate.less_batch_normalization 1.79% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.52% : 0.000006s : 44: predicate.load_eliminater 0.86% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.15% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.67% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.63% : 0.000001s : 10: predicate.merge_addn 0.57% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 15: predicate.minmaximum_grad 0.77% : 0.000002s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.35% : 0.000001s : 5: predicate.parallel_virtual_node 1.69% : 0.000004s : 23: predicate.partial_defer_inline 1.65% : 0.000004s : 24: predicate.partial_eliminate 0.96% : 0.000002s : 15: predicate.print_const_string_wrapper 0.71% : 0.000002s : 10: predicate.reduce_all_const_elim 1.28% : 0.000003s : 15: predicate.reduce_eliminate 2.55% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 10: predicate.remove_not_recompute_node 1.38% : 0.000003s : 29: predicate.replace_applicator 0.54% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 0.90% : 0.000002s : 15: predicate.reshape_eliminate 0.71% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 5: predicate.row_tensor_eliminate 0.74% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.73% : 0.000002s : 10: predicate.shard_identity_eliminate 0.74% : 0.000002s : 10: predicate.special_op_eliminate 0.77% : 0.000002s : 10: predicate.specialize_transform 0.82% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.44% : 0.000003s : 23: predicate.switch_defer_inline 2.03% : 0.000005s : 33: predicate.switch_layer_defer_inline 4.88% : 0.000012s : 74: predicate.switch_simplify 0.85% : 0.000002s : 15: predicate.tile_eliminate 0.96% : 0.000002s : 15: predicate.transpose_eliminate 1.53% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.33% : 0.000008s : 39: predicate.tuple_list_get_item_eliminator 1.61% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.52% : 0.000006s : 35: predicate.tuple_list_set_item_eliminator 1.76% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.55% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.38% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 5: predicate.value_based_eliminate 0.64% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 10: predicate.virtual_output_eliminate 0.31% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.34% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000532 11 54.81% : 0.000291s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.19% : 0.000240s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025609 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.25% : 0.003138s : 1: add_attr 12.21% : 0.003128s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000060s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.28% : 0.000072s : 1: auto_monad 0.09% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.81% : 0.000463s : 1: bootstrap 0.11% : 0.000029s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.89% : 0.000484s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.88% : 0.000481s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 5.30% : 0.001357s : 78: opt.transform.opt_a 0.15% : 0.000039s : 1: opt.transform.opt_after_cconv 0.11% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.52% : 0.000132s : 28: opt.transform.opt_b 0.24% : 0.000061s : 2: opt.transform.opt_trans_graph 0.17% : 0.000043s : 4: opt.transform.symbol_engine_opt 11.22% : 0.002873s : 1: opt_a 0.50% : 0.000127s : 1: opt_after_cconv 1.87% : 0.000480s : 1: opt_after_jit_grad 1.00% : 0.000256s : 1: opt_b 19.60% : 0.005021s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000038s : 1: pre_auto_parallel 0.12% : 0.000031s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.38% : 0.000352s : 1: renormalize.infer 1.10% : 0.000282s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000044s : 1: rewriter_after_opt_a 0.35% : 0.000089s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000086s : 1: symbol_engine_optimizer 0.36% : 0.000092s : 1: tuple_transform 22.88% : 0.005859s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:59.518.909 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:59.519.163 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0314867, [21] [bootstrap]: 0.00040707 [type_inference]: 0.00537366 [event_method]: 1.696e-05 [auto_monad]: 4.538e-05 [graph_reusing]: 4.92e-06 [inline]: 1.55999e-06 [add_attr]: 0.0193975, [1] [add_attr_with_inline]: 0.0193878, [1] [Cycle 1]: 7.378e-05, [2] [tag_attr]: 2.01e-05 [meta_addattr_fg_expand]: 5.49e-06 [parallel-infer-symbol]: 3.3e-06 [pre_auto_parallel]: 2.947e-05 [insert-virtual-dataset]: 2.24001e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 1.99999e-06 [pipeline_split]: 1.84e-06 [optimize]: 0.00507896, [53] [py_interpret_to_execute]: 2.861e-05 [rewriter_before_opt_a]: 8.616e-05 [opt_a]: 0.0029764, [2] [Cycle 1]: 0.00214382, [45] [expand_dump_flag]: 2.07999e-06 [switch_simplify]: 3.658e-05 [loop_unroll]: 3.059e-05 [a_1]: 0.00062316 [with_stream_mark]: 1.382e-05 [recompute_prepare]: 8.45999e-06 [updatestate_depend_eliminate]: 3.50003e-06 [updatestate_assign_eliminate]: 2.64999e-06 [updatestate_loads_eliminate]: 2.71e-06 [parameter_eliminate]: 1.42999e-06 [a_2]: 0.00010757 [accelerated_algorithm]: 6.96001e-06 [shard]: 1.50001e-06 [meta_shard_fg_expand]: 1.71998e-06 [shard_inline]: 6.73998e-06 [merge_send_recv]: 5.75001e-06 [auto_parallel]: 5.87999e-06 [parallel]: 1.552e-05 [flash_sp]: 2.927e-05 [merge_comm]: 4.57e-06 [allreduce_fusion]: 3.53e-06 [matmul_add_comm_reduction]: 6.00002e-06 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 8.22e-06 [virtual_dataset]: 7.03e-06 [get_grad_eliminate_]: 6.96001e-06 [virtual_output]: 6.79999e-06 [merge_forward]: 3.69002e-06 [cell_reuse_recompute_pass]: 1.17999e-06 [offload_activation]: 7.33999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.397e-05 [merge_recompute_call_nodes]: 9.30013e-07 [before_grad]: 1.071e-05 [set_forward_comm_id_for_comm_node_pass]: 3.5e-06 [meta_fg_expand]: 2.52001e-06 [flash_sp_send_recv_attached]: 1.96e-06 [receive_attached]: 1.13001e-06 [after_resolve]: 1.067e-05 [a_after_grad]: 1.037e-05 [renormalize]: 0.00065083 [add_forward_monad_depend]: 4.56002e-06 [auto_monad_grad]: 1.57999e-06 [auto_monad_eliminator]: 1.196e-05 [cse]: 1.643e-05 [a_3]: 6.253e-05 [Cycle 2]: 0.00082014, [45] [expand_dump_flag]: 1.00001e-06 [switch_simplify]: 7.56999e-06 [loop_unroll]: 6.88998e-06 [a_1]: 0.00013145 [with_stream_mark]: 9.84001e-06 [recompute_prepare]: 6.51999e-06 [updatestate_depend_eliminate]: 3.03e-06 [updatestate_assign_eliminate]: 2.43e-06 [updatestate_loads_eliminate]: 2.14e-06 [parameter_eliminate]: 1.08001e-06 [a_2]: 9.978e-05 [accelerated_algorithm]: 6.19001e-06 [shard]: 1.52999e-06 [meta_shard_fg_expand]: 1.30999e-06 [shard_inline]: 6.38e-06 [merge_send_recv]: 4.69998e-06 [auto_parallel]: 5.70001e-06 [parallel]: 7.3e-06 [flash_sp]: 3.14001e-06 [merge_comm]: 3.23e-06 [allreduce_fusion]: 3.09999e-06 [matmul_add_comm_reduction]: 5.40999e-06 [allreduce_slice_to_reducescatter]: 4.19997e-07 [virtual_shard_identity]: 6.96001e-06 [virtual_dataset]: 6.16e-06 [get_grad_eliminate_]: 6.01998e-06 [virtual_output]: 7.77e-06 [merge_forward]: 2.67001e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 6.54999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.542e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 1.034e-05 [set_forward_comm_id_for_comm_node_pass]: 3.6e-06 [meta_fg_expand]: 2.19001e-06 [flash_sp_send_recv_attached]: 7.39994e-07 [receive_attached]: 1.02e-06 [after_resolve]: 1.02e-05 [a_after_grad]: 9.72999e-06 [renormalize]: 1.19995e-07 [add_forward_monad_depend]: 1.62999e-06 [auto_monad_grad]: 9.29984e-07 [auto_monad_eliminator]: 7.16001e-06 [cse]: 1.275e-05 [a_3]: 5.05e-05 [py_interpret_to_execute_after_opt_a]: 1.142e-05 [slice_cell_reuse_recomputed_activation]: 4.17998e-06 [rewriter_after_opt_a]: 3.741e-05 [convert_after_rewriter]: 1.003e-05 [order_py_execute_after_rewriter]: 7.41001e-06 [mutable_eliminate]: 0.0004653 [opt_b]: 0.00026408, [1] [Cycle 1]: 0.00025547, [7] [b_1]: 0.00016551 [b_2]: 8.15e-06 [updatestate_depend_eliminate]: 5.59998e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 2.30002e-06 [renormalize]: 4.30009e-07 [cse]: 1.57e-05 [optimize_parallel_all_gather_comm]: 1.492e-05 [overlap_param_gather]: 4.47e-06 [cconv]: 2.049e-05 [loop_unroll]: 0.00042476 [opt_after_cconv]: 0.00012347, [1] [Cycle 1]: 0.00011549, [7] [c_1]: 3.223e-05 [parameter_eliminate]: 2.43e-06 [updatestate_depend_eliminate]: 5.04e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.33002e-06 [cse]: 1.61e-05 [renormalize]: 4.40021e-07 [remove_dup_value]: 9.51998e-06 [tuple_transform]: 8.733e-05, [1] [Cycle 1]: 8.077e-05, [4] [d_1]: 4.215e-05 [none_parameter_eliminate]: 1.18001e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 6.98e-06 [partial_unused_args_eliminate]: 3.9e-06 [add_recomputation]: 3.469e-05 [cse_after_recomputation]: 2.641e-05, [1] [Cycle 1]: 1.975e-05, [1] [cse]: 1.07e-05 [environ_conv]: 6.51e-06 [swap_dp_allreduce_reducescatter]: 6.83998e-06 [bias_add_comm_swap]: 3.73999e-06 [label_micro_interleaved_index]: 5.82001e-06 [label_fine_grained_interleaved_index]: 4.3e-06 [merge_cast_opt]: 3.01999e-06 [slice_recompute_activation]: 3.52002e-06 [micro_interleaved_order_control]: 3.66999e-06 [assign_add_opt]: 3.09999e-06 [ForceFp32Comm]: 2.79999e-06 [remove_cast_before_assign_add]: 2.78003e-06 [full_micro_interleaved_order_control]: 3.45e-06 [reorder_send_recv_between_fp_bp]: 3.83999e-06 [comm_op_add_attrs]: 2.91e-06 [add_comm_op_reuse_tag]: 2.73e-06 [interleave_split_concat_branches]: 3.08998e-06 [interleave_parallel_branches]: 3.22002e-06 [overlap_opt_shard_in_pipeline]: 3.27002e-06 [overlap_opt_shard_grad_in_pipeline]: 3.33998e-06 [control_data_broadcast_order]: 1.226e-05 [grouped_pairwise_exchange_alltoall]: 2.94001e-06 [offloading_packed_experts]: 5.60001e-06 [overlap_recompute_and_grad_model_parallel]: 5.92999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.2e-06 [overlap_recompute_allgather_and_fa_grad]: 3.04999e-06 [overlap_recompute_comm]: 3.93001e-06 [overlap_grad_ring_attention]: 5.65001e-06 [overlap_grad_flash_sp]: 1.532e-05 [begin_end_overlap_inline]: 2.86999e-06 [split_matmul_comm_elemetwise]: 3.31999e-06 [split_layernorm_comm]: 3.18998e-06 [handle_group_info]: 2.98e-06 [symbol_engine_optimizer]: 9.168e-05, [1] [Cycle 1]: 8.5e-05, [6] [build]: 1.82999e-06 [elim_shapecalc]: 9.56e-06 [elim_not_effective]: 1.247e-05 [opt_reshape]: 6.93e-06 [fold_const_symbol]: 9.77999e-06 [renormalize]: 1.90019e-07 [detach_backward]: 2.64001e-06 [pipeline_parallel_scheduler]: 1.55001e-06 [auto_monad_reorder]: 1.346e-05 [get_jit_bprop_graph]: 1.49998e-06 [rewriter_after_jit_bprop_graph]: 4.66002e-06 [opt_after_jit_grad]: 0.00046851 [validate]: 2.844e-05 Sums bootstrap : 0.000407s : 3.93% type_inference : 0.005374s : 51.90% event_method : 0.000017s : 0.16% auto_monad : 0.000045s : 0.44% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000005s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000029s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000029s : 0.28% optimize.rewriter_before_opt_a : 0.000086s : 0.83% optimize.opt_a.expand_dump_flag : 0.000003s : 0.03% optimize.opt_a.switch_simplify : 0.000044s : 0.43% optimize.opt_a.loop_unroll : 0.000037s : 0.36% optimize.opt_a.a_1 : 0.000755s : 7.29% optimize.opt_a.with_stream_mark : 0.000024s : 0.23% optimize.opt_a.recompute_prepare : 0.000015s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000207s : 2.00% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000010s : 0.10% optimize.opt_a.auto_parallel : 0.000012s : 0.11% optimize.opt_a.parallel : 0.000023s : 0.22% optimize.opt_a.flash_sp : 0.000032s : 0.31% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000011s : 0.11% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.15% optimize.opt_a.virtual_dataset : 0.000013s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.14% optimize.opt_a.merge_forward : 0.000006s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.02% optimize.opt_a.offload_activation : 0.000014s : 0.13% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000002s : 0.02% optimize.opt_a.after_resolve : 0.000021s : 0.20% optimize.opt_a.a_after_grad : 0.000020s : 0.19% optimize.opt_a.renormalize : 0.000651s : 6.29% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000019s : 0.18% optimize.opt_a.cse : 0.000029s : 0.28% optimize.opt_a.a_3 : 0.000113s : 1.09% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.04% optimize.rewriter_after_opt_a : 0.000037s : 0.36% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000007s : 0.07% optimize.mutable_eliminate : 0.000465s : 4.49% optimize.opt_b.b_1 : 0.000166s : 1.60% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000016s : 0.15% optimize.optimize_parallel_all_gather_comm : 0.000015s : 0.14% optimize.overlap_param_gather : 0.000004s : 0.04% optimize.cconv : 0.000020s : 0.20% optimize.loop_unroll : 0.000425s : 4.10% optimize.opt_after_cconv.c_1 : 0.000032s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000016s : 0.16% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000010s : 0.09% optimize.tuple_transform.d_1 : 0.000042s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000035s : 0.34% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.07% optimize.bias_add_comm_swap : 0.000004s : 0.04% optimize.label_micro_interleaved_index : 0.000006s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000004s : 0.04% optimize.merge_cast_opt : 0.000003s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.03% optimize.micro_interleaved_order_control : 0.000004s : 0.04% optimize.assign_add_opt : 0.000003s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000003s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000004s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.03% optimize.control_data_broadcast_order : 0.000012s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000003s : 0.03% optimize.offloading_packed_experts : 0.000006s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000003s : 0.03% optimize.overlap_recompute_comm : 0.000004s : 0.04% optimize.overlap_grad_ring_attention : 0.000006s : 0.05% optimize.overlap_grad_flash_sp : 0.000015s : 0.15% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.03% optimize.split_layernorm_comm : 0.000003s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000002s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000013s : 0.13% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000469s : 4.52% validate : 0.000028s : 0.27% Time group info: ------[substitution.] 0.000174 28 0.86% : 0.000001s : 2: substitution.elim_not_effective 0.63% : 0.000001s : 2: substitution.fold_const_symbol 2.32% : 0.000004s : 4: substitution.graph_param_transform 81.35% : 0.000141s : 4: substitution.inline 2.12% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.63% : 0.000005s : 4: substitution.remove_not_recompute_node 2.25% : 0.000004s : 4: substitution.replace_old_param 7.85% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005332 2 88.02% : 0.004693s : 1: type_inference.infer 11.98% : 0.000639s : 1: type_inference.specialize ------[replace.] 0.000064 8 63.19% : 0.000040s : 4: replace.inline 36.81% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000151 8 92.19% : 0.000139s : 4: match.inline 7.81% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000201 1278 0.90% : 0.000002s : 13: predicate.accumulaten_eliminater 0.73% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.58% : 0.000001s : 8: predicate.addn_check_dump 0.95% : 0.000002s : 13: predicate.addn_zero_filter 0.80% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.13% : 0.000004s : 21: predicate.arithmetic_simplify 0.95% : 0.000002s : 13: predicate.cast_eliminate 0.72% : 0.000001s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.20% : 0.000000s : 4: predicate.const_output_eliminate 0.67% : 0.000001s : 8: predicate.depend_value_elim 0.95% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.06% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.94% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.29% : 0.000001s : 4: predicate.elim_not_effective 0.40% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.17% : 0.000002s : 17: predicate.environ_get_depend_swap 1.79% : 0.000004s : 25: predicate.environ_get_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.48% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.37% : 0.000005s : 21: predicate.float_depend_g_call 0.54% : 0.000001s : 8: predicate.float_environ_get_switch 0.78% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.68% : 0.000001s : 8: predicate.get_grad_eliminate 0.22% : 0.000000s : 4: predicate.graph_param_transform 0.58% : 0.000001s : 8: predicate.incorporate_call 0.51% : 0.000001s : 8: predicate.incorporate_call_switch 6.30% : 0.000013s : 58: predicate.inline 0.94% : 0.000002s : 8: predicate.inline_without_move 0.36% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.81% : 0.000002s : 8: predicate.less_batch_normalization 1.85% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.53% : 0.000005s : 38: predicate.load_eliminater 0.80% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.72% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.57% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 8: predicate.merge_addn 0.64% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 13: predicate.minmaximum_grad 0.88% : 0.000002s : 4: predicate.mutable_eliminate 0.37% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 1.81% : 0.000004s : 21: predicate.partial_defer_inline 1.71% : 0.000003s : 21: predicate.partial_eliminate 0.90% : 0.000002s : 13: predicate.print_const_string_wrapper 0.61% : 0.000001s : 8: predicate.reduce_all_const_elim 1.14% : 0.000002s : 13: predicate.reduce_eliminate 2.68% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 8: predicate.remove_not_recompute_node 1.41% : 0.000003s : 25: predicate.replace_applicator 0.44% : 0.000001s : 8: predicate.replace_old_param 0.30% : 0.000001s : 4: predicate.reset_defer_inline 0.91% : 0.000002s : 13: predicate.reshape_eliminate 0.65% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 4: predicate.row_tensor_eliminate 0.71% : 0.000001s : 8: predicate.same_eliminate 0.45% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 8: predicate.shard_identity_eliminate 0.65% : 0.000001s : 8: predicate.special_op_eliminate 0.74% : 0.000001s : 8: predicate.specialize_transform 0.86% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.70% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.59% : 0.000003s : 21: predicate.switch_defer_inline 2.26% : 0.000005s : 29: predicate.switch_layer_defer_inline 5.05% : 0.000010s : 67: predicate.switch_simplify 0.87% : 0.000002s : 13: predicate.tile_eliminate 1.02% : 0.000002s : 13: predicate.transpose_eliminate 1.48% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000006s : 33: predicate.tuple_list_get_item_eliminator 1.43% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.15% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.85% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.50% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.16% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.55% : 0.000001s : 4: predicate.value_based_eliminate 0.61% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.66% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000501 11 54.03% : 0.000271s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.97% : 0.000230s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.057938 192 0.01% : 0.000005s : 1: ForceFp32Comm 33.50% : 0.019409s : 1: add_attr 33.47% : 0.019391s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.07% : 0.000038s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.09% : 0.000054s : 1: auto_monad 0.04% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.78% : 0.000450s : 1: bootstrap 0.04% : 0.000023s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.03% : 0.000015s : 1: control_data_broadcast_order 0.02% : 0.000013s : 1: convert_after_rewriter 0.05% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000018s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.05% : 0.000026s : 1: event_method 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000007s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000009s : 1: label_micro_interleaved_index 0.74% : 0.000430s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 0.81% : 0.000471s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.02% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000014s : 1: opt.transform.mutable_eliminate 2.03% : 0.001177s : 78: opt.transform.opt_a 0.05% : 0.000031s : 1: opt.transform.opt_after_cconv 0.04% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.18% : 0.000103s : 28: opt.transform.opt_b 0.08% : 0.000047s : 2: opt.transform.opt_trans_graph 0.06% : 0.000035s : 4: opt.transform.symbol_engine_opt 5.14% : 0.002979s : 1: opt_a 0.22% : 0.000127s : 1: opt_after_cconv 0.83% : 0.000479s : 1: opt_after_jit_grad 0.46% : 0.000267s : 1: opt_b 9.39% : 0.005442s : 1: optimize 0.03% : 0.000018s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000010s : 1: order_py_execute_after_rewriter 0.03% : 0.000018s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000007s : 1: overlap_recompute_comm 0.02% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000008s : 1: pipeline_split 0.06% : 0.000037s : 1: pre_auto_parallel 0.06% : 0.000032s : 1: py_interpret_to_execute 0.02% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000005s : 1: remove_cast_before_assign_add 0.02% : 0.000013s : 1: remove_dup_value 0.60% : 0.000349s : 1: renormalize.infer 0.51% : 0.000294s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000041s : 1: rewriter_after_opt_a 0.16% : 0.000090s : 1: rewriter_before_opt_a 0.01% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000006s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.02% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.16% : 0.000094s : 1: symbol_engine_optimizer 0.16% : 0.000090s : 1: tuple_transform 9.33% : 0.005404s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:45:59.933.847 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0304469, [21] [bootstrap]: 0.00041908 [type_inference]: 0.00541746 [event_method]: 1.753e-05 [auto_monad]: 6.066e-05 [graph_reusing]: 5.89e-06 [inline]: 2.37999e-06 [add_attr]: 0.00300902, [1] [add_attr_with_inline]: 0.00299919, [1] [Cycle 1]: 5.275e-05, [2] [tag_attr]: 1.833e-05 [meta_addattr_fg_expand]: 5.91e-06 [parallel-infer-symbol]: 3.3e-06 [pre_auto_parallel]: 3.236e-05 [insert-virtual-dataset]: 2.31e-06 [parallel-infer-symbol-second]: 8.60018e-07 [dataset_repeat_opt]: 2.02001e-06 [pipeline_split]: 1.79e-06 [optimize]: 0.020798, [53] [py_interpret_to_execute]: 2.587e-05 [rewriter_before_opt_a]: 7.997e-05 [opt_a]: 0.00248335, [2] [Cycle 1]: 0.00183828, [45] [expand_dump_flag]: 3.07002e-06 [switch_simplify]: 4.288e-05 [loop_unroll]: 3.007e-05 [a_1]: 0.00060515 [with_stream_mark]: 1.475e-05 [recompute_prepare]: 8.01001e-06 [updatestate_depend_eliminate]: 3.63e-06 [updatestate_assign_eliminate]: 3.21999e-06 [updatestate_loads_eliminate]: 2.94999e-06 [parameter_eliminate]: 2.09999e-06 [a_2]: 8.135e-05 [accelerated_algorithm]: 7.05e-06 [shard]: 1.59998e-06 [meta_shard_fg_expand]: 1.95001e-06 [shard_inline]: 6.54999e-06 [merge_send_recv]: 8.85001e-06 [auto_parallel]: 5.44998e-06 [parallel]: 1.815e-05 [flash_sp]: 7.37002e-06 [merge_comm]: 3.90998e-06 [allreduce_fusion]: 3.40998e-06 [matmul_add_comm_reduction]: 9.04e-06 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 7.96001e-06 [virtual_dataset]: 6.49999e-06 [get_grad_eliminate_]: 6.09001e-06 [virtual_output]: 6.39001e-06 [merge_forward]: 4.28999e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 9.67001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.259e-05 [merge_recompute_call_nodes]: 1.39998e-06 [before_grad]: 9.92001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.75998e-06 [meta_fg_expand]: 2.86999e-06 [flash_sp_send_recv_attached]: 3.01001e-06 [receive_attached]: 2.84001e-06 [after_resolve]: 1.187e-05 [a_after_grad]: 9.97001e-06 [renormalize]: 0.00053264 [add_forward_monad_depend]: 4.99998e-06 [auto_monad_grad]: 2.36e-06 [auto_monad_eliminator]: 1.571e-05 [cse]: 2.781e-05 [a_3]: 4.758e-05 [Cycle 2]: 0.00063546, [45] [expand_dump_flag]: 1.24e-06 [switch_simplify]: 7.50998e-06 [loop_unroll]: 6.25002e-06 [a_1]: 0.00012897 [with_stream_mark]: 1.047e-05 [recompute_prepare]: 6.19001e-06 [updatestate_depend_eliminate]: 2.89999e-06 [updatestate_assign_eliminate]: 2.22001e-06 [updatestate_loads_eliminate]: 2.32999e-06 [parameter_eliminate]: 9.89996e-07 [a_2]: 7.144e-05 [accelerated_algorithm]: 6.07999e-06 [shard]: 1.15001e-06 [meta_shard_fg_expand]: 1.40999e-06 [shard_inline]: 6.09001e-06 [merge_send_recv]: 4.62998e-06 [auto_parallel]: 5.14e-06 [parallel]: 4.47e-06 [flash_sp]: 4.02e-06 [merge_comm]: 3.41001e-06 [allreduce_fusion]: 2.85002e-06 [matmul_add_comm_reduction]: 5.11002e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 6.70998e-06 [virtual_dataset]: 5.71e-06 [get_grad_eliminate_]: 6.30002e-06 [virtual_output]: 5.51e-06 [merge_forward]: 2.93998e-06 [cell_reuse_recompute_pass]: 1.44998e-06 [offload_activation]: 7.96001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.272e-05 [merge_recompute_call_nodes]: 8.70001e-07 [before_grad]: 9.07001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.04001e-06 [meta_fg_expand]: 1.99e-06 [flash_sp_send_recv_attached]: 1.45001e-06 [receive_attached]: 1.13001e-06 [after_resolve]: 1.126e-05 [a_after_grad]: 9.47001e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.06002e-06 [auto_monad_grad]: 1.28002e-06 [auto_monad_eliminator]: 6.91999e-06 [cse]: 1.193e-05 [a_3]: 3.526e-05 [py_interpret_to_execute_after_opt_a]: 8.52e-06 [slice_cell_reuse_recomputed_activation]: 2.00002e-06 [rewriter_after_opt_a]: 3.345e-05 [convert_after_rewriter]: 2.242e-05 [order_py_execute_after_rewriter]: 6.04001e-06 [mutable_eliminate]: 0.00070742 [opt_b]: 0.00022888, [1] [Cycle 1]: 0.00022074, [7] [b_1]: 0.00013197 [b_2]: 8.72e-06 [updatestate_depend_eliminate]: 6.27001e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 2.96001e-06 [renormalize]: 8.2e-07 [cse]: 3.114e-05 [optimize_parallel_all_gather_comm]: 1.962e-05 [overlap_param_gather]: 2.11e-06 [cconv]: 3.336e-05 [loop_unroll]: 0.00042946 [opt_after_cconv]: 0.00010353, [1] [Cycle 1]: 9.724e-05, [7] [c_1]: 3.146e-05 [parameter_eliminate]: 3.30003e-06 [updatestate_depend_eliminate]: 4.75001e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.75997e-06 [cse]: 1.811e-05 [renormalize]: 8.00006e-07 [remove_dup_value]: 1.387e-05 [tuple_transform]: 7.946e-05, [1] [Cycle 1]: 7.478e-05, [4] [d_1]: 4.688e-05 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 7.2e-06 [partial_unused_args_eliminate]: 1.70001e-06 [add_recomputation]: 5.265e-05 [cse_after_recomputation]: 2.27e-05, [1] [Cycle 1]: 1.75e-05, [1] [cse]: 1.175e-05 [environ_conv]: 5.10999e-06 [swap_dp_allreduce_reducescatter]: 5.22e-06 [bias_add_comm_swap]: 2.81999e-06 [label_micro_interleaved_index]: 4.75001e-06 [label_fine_grained_interleaved_index]: 2.68998e-06 [merge_cast_opt]: 1.24e-06 [slice_recompute_activation]: 2.20002e-06 [micro_interleaved_order_control]: 2.38998e-06 [assign_add_opt]: 1.14998e-06 [ForceFp32Comm]: 8.50006e-07 [remove_cast_before_assign_add]: 1.02998e-06 [full_micro_interleaved_order_control]: 2.36998e-06 [reorder_send_recv_between_fp_bp]: 2.46e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.40001e-06 [interleave_parallel_branches]: 1.08001e-06 [overlap_opt_shard_in_pipeline]: 1.52001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.79998e-06 [control_data_broadcast_order]: 1.188e-05 [grouped_pairwise_exchange_alltoall]: 1.77001e-06 [offloading_packed_experts]: 3.91999e-06 [overlap_recompute_and_grad_model_parallel]: 4.36002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.53002e-06 [overlap_recompute_comm]: 2.44999e-06 [overlap_grad_ring_attention]: 4.31002e-06 [overlap_grad_flash_sp]: 2.142e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.32999e-06 [split_layernorm_comm]: 1.74998e-06 [handle_group_info]: 1.22e-06 [symbol_engine_optimizer]: 7.799e-05, [1] [Cycle 1]: 7.361e-05, [6] [build]: 3.21001e-06 [elim_shapecalc]: 1.039e-05 [elim_not_effective]: 1.362e-05 [opt_reshape]: 7.07997e-06 [fold_const_symbol]: 1.059e-05 [renormalize]: 1.59984e-07 [detach_backward]: 2.26e-06 [pipeline_parallel_scheduler]: 1.50999e-06 [auto_monad_reorder]: 1.55e-05 [get_jit_bprop_graph]: 1.71002e-06 [rewriter_after_jit_bprop_graph]: 3.41999e-06 [opt_after_jit_grad]: 0.00046462 [validate]: 3.903e-05 Sums bootstrap : 0.000419s : 4.01% type_inference : 0.005417s : 51.84% event_method : 0.000018s : 0.17% auto_monad : 0.000061s : 0.58% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000032s : 0.31% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000026s : 0.25% optimize.rewriter_before_opt_a : 0.000080s : 0.77% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000050s : 0.48% optimize.opt_a.loop_unroll : 0.000036s : 0.35% optimize.opt_a.a_1 : 0.000734s : 7.02% optimize.opt_a.with_stream_mark : 0.000025s : 0.24% optimize.opt_a.recompute_prepare : 0.000014s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000153s : 1.46% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.12% optimize.opt_a.merge_send_recv : 0.000013s : 0.13% optimize.opt_a.auto_parallel : 0.000011s : 0.10% optimize.opt_a.parallel : 0.000023s : 0.22% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000006s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.14% optimize.opt_a.virtual_dataset : 0.000012s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.24% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000019s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000023s : 0.22% optimize.opt_a.a_after_grad : 0.000019s : 0.19% optimize.opt_a.renormalize : 0.000533s : 5.10% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.22% optimize.opt_a.cse : 0.000040s : 0.38% optimize.opt_a.a_3 : 0.000083s : 0.79% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000033s : 0.32% optimize.convert_after_rewriter : 0.000022s : 0.21% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000707s : 6.77% optimize.opt_b.b_1 : 0.000132s : 1.26% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000031s : 0.30% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.19% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000033s : 0.32% optimize.loop_unroll : 0.000429s : 4.11% optimize.opt_after_cconv.c_1 : 0.000031s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.17% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000014s : 0.13% optimize.tuple_transform.d_1 : 0.000047s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000053s : 0.50% optimize.cse_after_recomputation.cse : 0.000012s : 0.11% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000016s : 0.15% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000003s : 0.03% opt_after_jit_grad : 0.000465s : 4.45% validate : 0.000039s : 0.37% Time group info: ------[substitution.] 0.000166 28 1.14% : 0.000002s : 2: substitution.elim_not_effective 0.91% : 0.000002s : 2: substitution.fold_const_symbol 4.03% : 0.000007s : 4: substitution.graph_param_transform 77.49% : 0.000129s : 4: substitution.inline 1.89% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.95% : 0.000005s : 4: substitution.remove_not_recompute_node 2.77% : 0.000005s : 4: substitution.replace_old_param 8.82% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.005364 2 87.86% : 0.004713s : 1: type_inference.infer 12.14% : 0.000651s : 1: type_inference.specialize ------[replace.] 0.000059 8 63.09% : 0.000037s : 4: replace.inline 36.91% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000139 8 90.88% : 0.000126s : 4: match.inline 9.12% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000202 1278 0.88% : 0.000002s : 13: predicate.accumulaten_eliminater 0.78% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 0.83% : 0.000002s : 13: predicate.addn_zero_filter 0.81% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.11% : 0.000004s : 21: predicate.arithmetic_simplify 0.86% : 0.000002s : 13: predicate.cast_eliminate 0.57% : 0.000001s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.57% : 0.000001s : 8: predicate.depend_value_elim 0.93% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.99% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.92% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.29% : 0.000001s : 4: predicate.elim_not_effective 0.47% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.12% : 0.000002s : 17: predicate.environ_get_depend_swap 1.66% : 0.000003s : 25: predicate.environ_get_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.49% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.43% : 0.000005s : 21: predicate.float_depend_g_call 0.55% : 0.000001s : 8: predicate.float_environ_get_switch 0.91% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.64% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.58% : 0.000001s : 8: predicate.incorporate_call 0.47% : 0.000001s : 8: predicate.incorporate_call_switch 6.11% : 0.000012s : 58: predicate.inline 0.73% : 0.000001s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 8: predicate.less_batch_normalization 1.80% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.45% : 0.000005s : 38: predicate.load_eliminater 0.99% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.32% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.84% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 8: predicate.merge_addn 0.56% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 13: predicate.minmaximum_grad 1.07% : 0.000002s : 4: predicate.mutable_eliminate 0.33% : 0.000001s : 4: predicate.opt_reshape 0.45% : 0.000001s : 4: predicate.parallel_virtual_node 1.78% : 0.000004s : 21: predicate.partial_defer_inline 1.62% : 0.000003s : 21: predicate.partial_eliminate 0.88% : 0.000002s : 13: predicate.print_const_string_wrapper 0.55% : 0.000001s : 8: predicate.reduce_all_const_elim 1.06% : 0.000002s : 13: predicate.reduce_eliminate 2.50% : 0.000005s : 38: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000001s : 8: predicate.remove_not_recompute_node 1.33% : 0.000003s : 25: predicate.replace_applicator 0.47% : 0.000001s : 8: predicate.replace_old_param 0.37% : 0.000001s : 4: predicate.reset_defer_inline 0.96% : 0.000002s : 13: predicate.reshape_eliminate 0.85% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.53% : 0.000001s : 4: predicate.row_tensor_eliminate 0.72% : 0.000001s : 8: predicate.same_eliminate 0.43% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 8: predicate.shard_identity_eliminate 0.72% : 0.000001s : 8: predicate.special_op_eliminate 0.64% : 0.000001s : 8: predicate.specialize_transform 0.80% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.54% : 0.000003s : 21: predicate.switch_defer_inline 2.13% : 0.000004s : 29: predicate.switch_layer_defer_inline 5.14% : 0.000010s : 67: predicate.switch_simplify 0.91% : 0.000002s : 13: predicate.tile_eliminate 0.91% : 0.000002s : 13: predicate.transpose_eliminate 1.50% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.67% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.47% : 0.000007s : 33: predicate.tuple_list_get_item_eliminator 1.65% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.35% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.79% : 0.000004s : 25: predicate.tuple_to_list_eliminator_ 2.49% : 0.000005s : 38: predicate.updatestate_pure_node_eliminater 3.07% : 0.000006s : 46: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 4: predicate.value_based_eliminate 0.67% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.65% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000493 11 55.47% : 0.000274s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.53% : 0.000220s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.056083 192 0.01% : 0.000004s : 1: ForceFp32Comm 5.37% : 0.003014s : 1: add_attr 5.36% : 0.003003s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.10% : 0.000056s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.12% : 0.000066s : 1: auto_monad 0.03% : 0.000019s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.80% : 0.000448s : 1: bootstrap 0.07% : 0.000037s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000015s : 1: control_data_broadcast_order 0.07% : 0.000039s : 1: convert_after_rewriter 0.05% : 0.000026s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000008s : 1: environ_conv 0.04% : 0.000023s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.78% : 0.000437s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.28% : 0.000717s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000020s : 1: opt.transform.mutable_eliminate 2.05% : 0.001147s : 78: opt.transform.opt_a 0.05% : 0.000030s : 1: opt.transform.opt_after_cconv 0.05% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.19% : 0.000105s : 28: opt.transform.opt_b 0.09% : 0.000052s : 2: opt.transform.opt_trans_graph 0.07% : 0.000038s : 4: opt.transform.symbol_engine_opt 4.43% : 0.002486s : 1: opt_a 0.19% : 0.000107s : 1: opt_after_cconv 0.84% : 0.000473s : 1: opt_after_jit_grad 0.41% : 0.000232s : 1: opt_b 37.09% : 0.020804s : 1: optimize 0.04% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000010s : 1: order_py_execute_after_rewriter 0.04% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000036s : 1: pre_auto_parallel 0.05% : 0.000030s : 1: py_interpret_to_execute 0.02% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000017s : 1: remove_dup_value 0.48% : 0.000269s : 1: renormalize.infer 0.46% : 0.000256s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 28.63% : 0.016058s : 1: rewriter_after_opt_a 0.15% : 0.000084s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.14% : 0.000081s : 1: symbol_engine_optimizer 0.15% : 0.000082s : 1: tuple_transform 9.69% : 0.005432s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:00.363.887 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:00.364.149 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0618641, [21] [bootstrap]: 0.00043526 [type_inference]: 0.0365162 [event_method]: 4.035e-05 [auto_monad]: 9.752e-05 [graph_reusing]: 6.84999e-06 [inline]: 2.34001e-06 [add_attr]: 0.00304985, [1] [add_attr_with_inline]: 0.00304139, [1] [Cycle 1]: 8.076e-05, [2] [tag_attr]: 2.805e-05 [meta_addattr_fg_expand]: 8.07e-06 [parallel-infer-symbol]: 3.31999e-06 [pre_auto_parallel]: 4.119e-05 [insert-virtual-dataset]: 2.51998e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 1.91e-06 [pipeline_split]: 1.72001e-06 [optimize]: 0.0204875, [53] [py_interpret_to_execute]: 3.336e-05 [rewriter_before_opt_a]: 0.0001171 [opt_a]: 0.0178528, [2] [Cycle 1]: 0.016938, [45] [expand_dump_flag]: 3.46001e-06 [switch_simplify]: 0.00012514 [loop_unroll]: 4.177e-05 [a_1]: 0.00083432 [with_stream_mark]: 1.685e-05 [recompute_prepare]: 9.53002e-06 [updatestate_depend_eliminate]: 4.12e-06 [updatestate_assign_eliminate]: 3.48e-06 [updatestate_loads_eliminate]: 3.13998e-06 [parameter_eliminate]: 1.90001e-06 [a_2]: 0.00011305 [accelerated_algorithm]: 7.11999e-06 [shard]: 1.94999e-06 [meta_shard_fg_expand]: 2.09e-06 [shard_inline]: 6.65002e-06 [merge_send_recv]: 8.68001e-06 [auto_parallel]: 6.04001e-06 [parallel]: 1.831e-05 [flash_sp]: 8.35999e-06 [merge_comm]: 4.22e-06 [allreduce_fusion]: 3.40003e-06 [matmul_add_comm_reduction]: 1.008e-05 [allreduce_slice_to_reducescatter]: 6.79982e-07 [virtual_shard_identity]: 8.18001e-06 [virtual_dataset]: 6.63e-06 [get_grad_eliminate_]: 6.72002e-06 [virtual_output]: 6.39001e-06 [merge_forward]: 4.25e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 9.31e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.454e-05 [merge_recompute_call_nodes]: 1.42999e-06 [before_grad]: 1.073e-05 [set_forward_comm_id_for_comm_node_pass]: 3.88001e-06 [meta_fg_expand]: 3.11001e-06 [flash_sp_send_recv_attached]: 2.59001e-06 [receive_attached]: 2.13998e-06 [after_resolve]: 1.217e-05 [a_after_grad]: 1.017e-05 [renormalize]: 0.0150173 [add_forward_monad_depend]: 1.135e-05 [auto_monad_grad]: 2.94001e-06 [auto_monad_eliminator]: 2.318e-05 [cse]: 3.105e-05 [a_3]: 7.629e-05 [Cycle 2]: 0.00089814, [45] [expand_dump_flag]: 2.22001e-06 [switch_simplify]: 9.36e-06 [loop_unroll]: 6.48998e-06 [a_1]: 0.00014296 [with_stream_mark]: 1.903e-05 [recompute_prepare]: 6.65002e-06 [updatestate_depend_eliminate]: 4.08001e-06 [updatestate_assign_eliminate]: 3.04999e-06 [updatestate_loads_eliminate]: 3.36001e-06 [parameter_eliminate]: 2.26998e-06 [a_2]: 0.00010368 [accelerated_algorithm]: 6.63e-06 [shard]: 2.46e-06 [meta_shard_fg_expand]: 2.81999e-06 [shard_inline]: 6.93e-06 [merge_send_recv]: 8.91002e-06 [auto_parallel]: 9.02e-06 [parallel]: 9.69999e-06 [flash_sp]: 4.23999e-06 [merge_comm]: 3.81001e-06 [allreduce_fusion]: 3.25e-06 [matmul_add_comm_reduction]: 9.92001e-06 [allreduce_slice_to_reducescatter]: 9.30013e-07 [virtual_shard_identity]: 8.17e-06 [virtual_dataset]: 7.43999e-06 [get_grad_eliminate_]: 6.17001e-06 [virtual_output]: 6.09999e-06 [merge_forward]: 3.86001e-06 [cell_reuse_recompute_pass]: 2.78e-06 [offload_activation]: 1.036e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.898e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.093e-05 [set_forward_comm_id_for_comm_node_pass]: 3.83999e-06 [meta_fg_expand]: 2.39001e-06 [flash_sp_send_recv_attached]: 1.94999e-06 [receive_attached]: 2.21e-06 [after_resolve]: 1.326e-05 [a_after_grad]: 1.052e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.22e-06 [auto_monad_grad]: 1.10001e-06 [auto_monad_eliminator]: 7.21001e-06 [cse]: 1.417e-05 [a_3]: 5.234e-05 [py_interpret_to_execute_after_opt_a]: 2.071e-05 [slice_cell_reuse_recomputed_activation]: 5.12e-06 [rewriter_after_opt_a]: 4.316e-05 [convert_after_rewriter]: 1.029e-05 [order_py_execute_after_rewriter]: 8.45999e-06 [mutable_eliminate]: 0.00073597 [opt_b]: 0.0002976, [1] [Cycle 1]: 0.0002865, [7] [b_1]: 0.00017829 [b_2]: 8.17998e-06 [updatestate_depend_eliminate]: 6.48e-06 [updatestate_assign_eliminate]: 2.76999e-06 [updatestate_loads_eliminate]: 2.26998e-06 [renormalize]: 6.09987e-07 [cse]: 3.024e-05 [optimize_parallel_all_gather_comm]: 2.063e-05 [overlap_param_gather]: 5.14998e-06 [cconv]: 3.119e-05 [loop_unroll]: 0.00045602 [opt_after_cconv]: 0.00013263, [1] [Cycle 1]: 0.00012344, [7] [c_1]: 3.234e-05 [parameter_eliminate]: 3.83001e-06 [updatestate_depend_eliminate]: 5.41998e-06 [updatestate_assign_eliminate]: 2.78e-06 [updatestate_loads_eliminate]: 2.68e-06 [cse]: 1.716e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.599e-05 [tuple_transform]: 9.705e-05, [1] [Cycle 1]: 8.923e-05, [4] [d_1]: 4.593e-05 [none_parameter_eliminate]: 1.74e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 8.23001e-06 [partial_unused_args_eliminate]: 5.34e-06 [add_recomputation]: 4.953e-05 [cse_after_recomputation]: 2.741e-05, [1] [Cycle 1]: 2.042e-05, [1] [cse]: 1.11e-05 [environ_conv]: 8.03001e-06 [swap_dp_allreduce_reducescatter]: 8.14002e-06 [bias_add_comm_swap]: 5.07999e-06 [label_micro_interleaved_index]: 6.80998e-06 [label_fine_grained_interleaved_index]: 5.67999e-06 [merge_cast_opt]: 3.81001e-06 [slice_recompute_activation]: 4.29002e-06 [micro_interleaved_order_control]: 4.87998e-06 [assign_add_opt]: 3.61001e-06 [ForceFp32Comm]: 3.16999e-06 [remove_cast_before_assign_add]: 3.8e-06 [full_micro_interleaved_order_control]: 4.57998e-06 [reorder_send_recv_between_fp_bp]: 5.41002e-06 [comm_op_add_attrs]: 2.37e-05 [add_comm_op_reuse_tag]: 3.35e-06 [interleave_split_concat_branches]: 3.83999e-06 [interleave_parallel_branches]: 3.4e-06 [overlap_opt_shard_in_pipeline]: 3.53e-06 [overlap_opt_shard_grad_in_pipeline]: 3.97002e-06 [control_data_broadcast_order]: 1.553e-05 [grouped_pairwise_exchange_alltoall]: 4e-06 [offloading_packed_experts]: 6.38998e-06 [overlap_recompute_and_grad_model_parallel]: 7.27002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.95998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.83001e-06 [overlap_recompute_comm]: 5.22e-06 [overlap_grad_ring_attention]: 6.76e-06 [overlap_grad_flash_sp]: 2.412e-05 [begin_end_overlap_inline]: 3.08e-06 [split_matmul_comm_elemetwise]: 4.60001e-06 [split_layernorm_comm]: 4.44998e-06 [handle_group_info]: 3.45e-06 [symbol_engine_optimizer]: 0.00010069, [1] [Cycle 1]: 9.364e-05, [6] [build]: 3.5e-06 [elim_shapecalc]: 1.073e-05 [elim_not_effective]: 1.366e-05 [opt_reshape]: 8.13001e-06 [fold_const_symbol]: 1.058e-05 [renormalize]: 2.3999e-07 [detach_backward]: 4.31002e-06 [pipeline_parallel_scheduler]: 2.01998e-06 [auto_monad_reorder]: 1.931e-05 [get_jit_bprop_graph]: 1.89999e-06 [rewriter_after_jit_bprop_graph]: 4.37e-06 [opt_after_jit_grad]: 0.00050422 [validate]: 3.979e-05 Sums bootstrap : 0.000435s : 0.76% type_inference : 0.036516s : 64.08% event_method : 0.000040s : 0.07% auto_monad : 0.000098s : 0.17% graph_reusing : 0.000007s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000041s : 0.07% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.06% optimize.rewriter_before_opt_a : 0.000117s : 0.21% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000135s : 0.24% optimize.opt_a.loop_unroll : 0.000048s : 0.08% optimize.opt_a.a_1 : 0.000977s : 1.71% optimize.opt_a.with_stream_mark : 0.000036s : 0.06% optimize.opt_a.recompute_prepare : 0.000016s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000217s : 0.38% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.02% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.02% optimize.opt_a.merge_send_recv : 0.000018s : 0.03% optimize.opt_a.auto_parallel : 0.000015s : 0.03% optimize.opt_a.parallel : 0.000028s : 0.05% optimize.opt_a.flash_sp : 0.000013s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.03% optimize.opt_a.virtual_dataset : 0.000014s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.02% optimize.opt_a.virtual_output : 0.000012s : 0.02% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000020s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000022s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.04% optimize.opt_a.a_after_grad : 0.000021s : 0.04% optimize.opt_a.renormalize : 0.015017s : 26.35% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.05% optimize.opt_a.cse : 0.000045s : 0.08% optimize.opt_a.a_3 : 0.000129s : 0.23% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000043s : 0.08% optimize.convert_after_rewriter : 0.000010s : 0.02% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.000736s : 1.29% optimize.opt_b.b_1 : 0.000178s : 0.31% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.04% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000031s : 0.05% optimize.loop_unroll : 0.000456s : 0.80% optimize.opt_after_cconv.c_1 : 0.000032s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000017s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.03% optimize.tuple_transform.d_1 : 0.000046s : 0.08% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000050s : 0.09% optimize.cse_after_recomputation.cse : 0.000011s : 0.02% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000005s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000004s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000024s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000006s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000024s : 0.04% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000019s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000504s : 0.88% validate : 0.000040s : 0.07% Time group info: ------[substitution.] 0.000219 34 0.83% : 0.000002s : 2: substitution.elim_not_effective 0.69% : 0.000002s : 2: substitution.fold_const_symbol 2.70% : 0.000006s : 4: substitution.graph_param_transform 76.70% : 0.000168s : 8: substitution.inline 1.91% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.76% : 0.000006s : 4: substitution.remove_not_recompute_node 2.88% : 0.000006s : 4: substitution.replace_old_param 4.68% : 0.000010s : 2: substitution.switch_simplify 6.85% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.036454 2 96.42% : 0.035148s : 1: type_inference.infer 3.58% : 0.001305s : 1: type_inference.specialize ------[replace.] 0.000117 14 42.87% : 0.000050s : 8: replace.inline 38.33% : 0.000045s : 2: replace.switch_simplify 18.80% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000186 14 88.23% : 0.000164s : 8: match.inline 4.72% : 0.000009s : 2: match.switch_simplify 7.05% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000256 1520 1.00% : 0.000003s : 17: predicate.accumulaten_eliminater 0.62% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.44% : 0.000001s : 8: predicate.addn_check_dump 0.96% : 0.000002s : 17: predicate.addn_zero_filter 0.95% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.12% : 0.000005s : 25: predicate.arithmetic_simplify 1.09% : 0.000003s : 17: predicate.cast_eliminate 0.66% : 0.000002s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.53% : 0.000001s : 8: predicate.depend_value_elim 0.98% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.09% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.99% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.79% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.16% : 0.000000s : 4: predicate.elim_not_effective 0.30% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 21: predicate.environ_get_depend_swap 1.72% : 0.000004s : 29: predicate.environ_get_eliminate 1.13% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.67% : 0.000004s : 29: predicate.exchange_switch_depend_value 2.76% : 0.000007s : 29: predicate.float_depend_g_call 0.55% : 0.000001s : 8: predicate.float_environ_get_switch 0.66% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.58% : 0.000001s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.48% : 0.000001s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 6.36% : 0.000016s : 70: predicate.inline 0.66% : 0.000002s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.70% : 0.000002s : 8: predicate.less_batch_normalization 1.93% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.53% : 0.000006s : 46: predicate.load_eliminater 0.65% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.84% : 0.000007s : 47: predicate.loop_unroll_before_grad 1.60% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.47% : 0.000001s : 8: predicate.merge_addn 0.57% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.51% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 17: predicate.minmaximum_grad 0.85% : 0.000002s : 4: predicate.mutable_eliminate 0.43% : 0.000001s : 4: predicate.opt_reshape 0.31% : 0.000001s : 4: predicate.parallel_virtual_node 2.04% : 0.000005s : 29: predicate.partial_defer_inline 1.58% : 0.000004s : 25: predicate.partial_eliminate 1.01% : 0.000003s : 17: predicate.print_const_string_wrapper 0.47% : 0.000001s : 8: predicate.reduce_all_const_elim 1.21% : 0.000003s : 17: predicate.reduce_eliminate 2.52% : 0.000006s : 46: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 8: predicate.remove_not_recompute_node 1.57% : 0.000004s : 29: predicate.replace_applicator 0.47% : 0.000001s : 8: predicate.replace_old_param 0.21% : 0.000001s : 4: predicate.reset_defer_inline 1.11% : 0.000003s : 17: predicate.reshape_eliminate 0.61% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 4: predicate.row_tensor_eliminate 0.64% : 0.000002s : 8: predicate.same_eliminate 0.34% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 8: predicate.shard_identity_eliminate 0.59% : 0.000002s : 8: predicate.special_op_eliminate 0.62% : 0.000002s : 8: predicate.specialize_transform 0.88% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.26% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.88% : 0.000005s : 29: predicate.switch_defer_inline 2.20% : 0.000006s : 37: predicate.switch_layer_defer_inline 5.79% : 0.000015s : 92: predicate.switch_simplify 0.98% : 0.000003s : 17: predicate.tile_eliminate 1.03% : 0.000003s : 17: predicate.transpose_eliminate 1.63% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 2.85% : 0.000007s : 37: predicate.tuple_list_get_item_eliminator 1.54% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000006s : 33: predicate.tuple_list_set_item_eliminator 1.89% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.52% : 0.000006s : 46: predicate.updatestate_pure_node_eliminater 2.95% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 4: predicate.value_based_eliminate 0.57% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000002s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001049 16 54.11% : 0.000568s : 6: func_graph_cloner_run.FuncGraphClonerGraph 45.89% : 0.000481s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.102091 192 0.01% : 0.000006s : 1: ForceFp32Comm 3.00% : 0.003059s : 1: add_attr 2.98% : 0.003045s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.05% : 0.000053s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.11% : 0.000107s : 1: auto_monad 0.03% : 0.000028s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.47% : 0.000478s : 1: bootstrap 0.03% : 0.000035s : 1: cconv 0.03% : 0.000027s : 1: comm_op_add_attrs 0.02% : 0.000018s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.03% : 0.000031s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000021s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.05% : 0.000052s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000008s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.45% : 0.000462s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.73% : 0.000743s : 1: mutable_eliminate 0.01% : 0.000009s : 1: offloading_packed_experts 0.01% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000015s : 1: opt.transform.mutable_eliminate 1.49% : 0.001524s : 78: opt.transform.opt_a 0.03% : 0.000031s : 1: opt.transform.opt_after_cconv 0.03% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.11% : 0.000112s : 28: opt.transform.opt_b 0.05% : 0.000052s : 2: opt.transform.opt_trans_graph 0.04% : 0.000039s : 4: opt.transform.symbol_engine_opt 17.49% : 0.017857s : 1: opt_a 0.13% : 0.000136s : 1: opt_after_cconv 0.50% : 0.000514s : 1: opt_after_jit_grad 0.30% : 0.000302s : 1: opt_b 20.41% : 0.020836s : 1: optimize 0.02% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.03% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.05% : 0.000049s : 1: pre_auto_parallel 0.04% : 0.000038s : 1: py_interpret_to_execute 0.02% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.02% : 0.000019s : 1: remove_dup_value 14.09% : 0.014389s : 1: renormalize.infer 0.60% : 0.000614s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000047s : 1: rewriter_after_opt_a 0.12% : 0.000121s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.10% : 0.000104s : 1: symbol_engine_optimizer 0.10% : 0.000100s : 1: tuple_transform 35.81% : 0.036556s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:00.809.909 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0346077, [21] [bootstrap]: 0.00042476 [type_inference]: 0.0250801 [event_method]: 4.391e-05 [auto_monad]: 9.547e-05 [graph_reusing]: 7.2e-06 [inline]: 1.94999e-06 [add_attr]: 0.00305739, [1] [add_attr_with_inline]: 0.00304939, [1] [Cycle 1]: 6.435e-05, [2] [tag_attr]: 2.56e-05 [meta_addattr_fg_expand]: 7.82998e-06 [parallel-infer-symbol]: 3.76999e-06 [pre_auto_parallel]: 3.989e-05 [insert-virtual-dataset]: 2.41e-06 [parallel-infer-symbol-second]: 1.02e-06 [dataset_repeat_opt]: 1.79e-06 [pipeline_split]: 2.11998e-06 [optimize]: 0.00516641, [53] [py_interpret_to_execute]: 2.933e-05 [rewriter_before_opt_a]: 0.00011154 [opt_a]: 0.00307229, [2] [Cycle 1]: 0.00241676, [45] [expand_dump_flag]: 3.43e-06 [switch_simplify]: 0.00012238 [loop_unroll]: 4.148e-05 [a_1]: 0.00083893 [with_stream_mark]: 1.599e-05 [recompute_prepare]: 8.48001e-06 [updatestate_depend_eliminate]: 4.13001e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 3.02002e-06 [parameter_eliminate]: 1.92999e-06 [a_2]: 8.35e-05 [accelerated_algorithm]: 7.15e-06 [shard]: 1.76998e-06 [meta_shard_fg_expand]: 1.94e-06 [shard_inline]: 6.66999e-06 [merge_send_recv]: 8.60999e-06 [auto_parallel]: 5.66e-06 [parallel]: 1.845e-05 [flash_sp]: 7.38999e-06 [merge_comm]: 3.86001e-06 [allreduce_fusion]: 3.56999e-06 [matmul_add_comm_reduction]: 9.74e-06 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 8.03001e-06 [virtual_dataset]: 6.76e-06 [get_grad_eliminate_]: 6.29999e-06 [virtual_output]: 6.28e-06 [merge_forward]: 4.07e-06 [cell_reuse_recompute_pass]: 1.12e-06 [offload_activation]: 9.62001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.202e-05 [merge_recompute_call_nodes]: 1.57001e-06 [before_grad]: 1.052e-05 [set_forward_comm_id_for_comm_node_pass]: 3.91001e-06 [meta_fg_expand]: 3.31001e-06 [flash_sp_send_recv_attached]: 2.54001e-06 [receive_attached]: 1.97999e-06 [after_resolve]: 1.165e-05 [a_after_grad]: 1.039e-05 [renormalize]: 0.0007738 [add_forward_monad_depend]: 5.24e-06 [auto_monad_grad]: 2.46998e-06 [auto_monad_eliminator]: 1.546e-05 [cse]: 2.793e-05 [a_3]: 4.679e-05 [Cycle 2]: 0.00064559, [45] [expand_dump_flag]: 1.23002e-06 [switch_simplify]: 7.76001e-06 [loop_unroll]: 6.51999e-06 [a_1]: 0.00013322 [with_stream_mark]: 1.114e-05 [recompute_prepare]: 6.61e-06 [updatestate_depend_eliminate]: 3.13e-06 [updatestate_assign_eliminate]: 2.37001e-06 [updatestate_loads_eliminate]: 2.76e-06 [parameter_eliminate]: 1.09998e-06 [a_2]: 7.31e-05 [accelerated_algorithm]: 6.04001e-06 [shard]: 1.12e-06 [meta_shard_fg_expand]: 1.25999e-06 [shard_inline]: 6.53e-06 [merge_send_recv]: 5.59998e-06 [auto_parallel]: 5.59e-06 [parallel]: 4.77998e-06 [flash_sp]: 3.33e-06 [merge_comm]: 3.09001e-06 [allreduce_fusion]: 2.86999e-06 [matmul_add_comm_reduction]: 5.71e-06 [allreduce_slice_to_reducescatter]: 4.19997e-07 [virtual_shard_identity]: 6.48e-06 [virtual_dataset]: 5.83002e-06 [get_grad_eliminate_]: 5.85002e-06 [virtual_output]: 5.76e-06 [merge_forward]: 2.63e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 6.73e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.305e-05 [merge_recompute_call_nodes]: 7.80012e-07 [before_grad]: 9.14e-06 [set_forward_comm_id_for_comm_node_pass]: 3.37002e-06 [meta_fg_expand]: 2.14e-06 [flash_sp_send_recv_attached]: 7.99977e-07 [receive_attached]: 1.08001e-06 [after_resolve]: 1.031e-05 [a_after_grad]: 9.10999e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.35001e-06 [auto_monad_grad]: 8.30012e-07 [auto_monad_eliminator]: 7.05e-06 [cse]: 1.409e-05 [a_3]: 4.007e-05 [py_interpret_to_execute_after_opt_a]: 9.51e-06 [slice_cell_reuse_recomputed_activation]: 2.34999e-06 [rewriter_after_opt_a]: 3.717e-05 [convert_after_rewriter]: 7.08e-06 [order_py_execute_after_rewriter]: 5.59e-06 [mutable_eliminate]: 0.00049869 [opt_b]: 0.00021614, [1] [Cycle 1]: 0.00020975, [7] [b_1]: 0.00013324 [b_2]: 9.07001e-06 [updatestate_depend_eliminate]: 5.40001e-06 [updatestate_assign_eliminate]: 2.69001e-06 [updatestate_loads_eliminate]: 2.60002e-06 [renormalize]: 5.39992e-07 [cse]: 1.767e-05 [optimize_parallel_all_gather_comm]: 1.685e-05 [overlap_param_gather]: 2.24001e-06 [cconv]: 3.747e-05 [loop_unroll]: 0.00046866 [opt_after_cconv]: 0.00010316, [1] [Cycle 1]: 9.769e-05, [7] [c_1]: 3.309e-05 [parameter_eliminate]: 2.63e-06 [updatestate_depend_eliminate]: 5.51002e-06 [updatestate_assign_eliminate]: 2.56e-06 [updatestate_loads_eliminate]: 2.24999e-06 [cse]: 1.677e-05 [renormalize]: 4.60015e-07 [remove_dup_value]: 1.249e-05 [tuple_transform]: 7.533e-05, [1] [Cycle 1]: 7.075e-05, [4] [d_1]: 4.369e-05 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 6.73e-06 [partial_unused_args_eliminate]: 1.72001e-06 [add_recomputation]: 4.573e-05 [cse_after_recomputation]: 2.108e-05, [1] [Cycle 1]: 1.651e-05, [1] [cse]: 1.107e-05 [environ_conv]: 4.53001e-06 [swap_dp_allreduce_reducescatter]: 5.52999e-06 [bias_add_comm_swap]: 2.46e-06 [label_micro_interleaved_index]: 4.53001e-06 [label_fine_grained_interleaved_index]: 2.99001e-06 [merge_cast_opt]: 1.38002e-06 [slice_recompute_activation]: 2.07999e-06 [micro_interleaved_order_control]: 2.36998e-06 [assign_add_opt]: 1.14e-06 [ForceFp32Comm]: 8.00006e-07 [remove_cast_before_assign_add]: 1.03001e-06 [full_micro_interleaved_order_control]: 2.59999e-06 [reorder_send_recv_between_fp_bp]: 3.06999e-06 [comm_op_add_attrs]: 1.07e-06 [add_comm_op_reuse_tag]: 1.34e-06 [interleave_split_concat_branches]: 1.14998e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.20001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.06003e-06 [control_data_broadcast_order]: 1.263e-05 [grouped_pairwise_exchange_alltoall]: 1.45999e-06 [offloading_packed_experts]: 3.6e-06 [overlap_recompute_and_grad_model_parallel]: 4.65999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.20002e-06 [overlap_grad_ring_attention]: 3.70998e-06 [overlap_grad_flash_sp]: 1.856e-05 [begin_end_overlap_inline]: 7.30011e-07 [split_matmul_comm_elemetwise]: 2.37001e-06 [split_layernorm_comm]: 2.19999e-06 [handle_group_info]: 1.04e-06 [symbol_engine_optimizer]: 7.405e-05, [1] [Cycle 1]: 7e-05, [6] [build]: 2.64001e-06 [elim_shapecalc]: 9.94001e-06 [elim_not_effective]: 1.291e-05 [opt_reshape]: 6.86001e-06 [fold_const_symbol]: 9.95002e-06 [renormalize]: 4.39992e-07 [detach_backward]: 2.23998e-06 [pipeline_parallel_scheduler]: 1.64e-06 [auto_monad_reorder]: 1.658e-05 [get_jit_bprop_graph]: 1.17999e-06 [rewriter_after_jit_bprop_graph]: 4.50999e-06 [opt_after_jit_grad]: 0.00046096 [validate]: 3.655e-05 Sums bootstrap : 0.000425s : 1.39% type_inference : 0.025080s : 81.95% event_method : 0.000044s : 0.14% auto_monad : 0.000095s : 0.31% graph_reusing : 0.000007s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.03% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000040s : 0.13% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.10% optimize.rewriter_before_opt_a : 0.000112s : 0.36% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000130s : 0.43% optimize.opt_a.loop_unroll : 0.000048s : 0.16% optimize.opt_a.a_1 : 0.000972s : 3.18% optimize.opt_a.with_stream_mark : 0.000027s : 0.09% optimize.opt_a.recompute_prepare : 0.000015s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000157s : 0.51% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.04% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.04% optimize.opt_a.merge_send_recv : 0.000014s : 0.05% optimize.opt_a.auto_parallel : 0.000011s : 0.04% optimize.opt_a.parallel : 0.000023s : 0.08% optimize.opt_a.flash_sp : 0.000011s : 0.04% optimize.opt_a.merge_comm : 0.000007s : 0.02% optimize.opt_a.allreduce_fusion : 0.000006s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.05% optimize.opt_a.virtual_dataset : 0.000013s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.04% optimize.opt_a.virtual_output : 0.000012s : 0.04% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000016s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000020s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000022s : 0.07% optimize.opt_a.a_after_grad : 0.000020s : 0.06% optimize.opt_a.renormalize : 0.000774s : 2.53% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.07% optimize.opt_a.cse : 0.000042s : 0.14% optimize.opt_a.a_3 : 0.000087s : 0.28% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000037s : 0.12% optimize.convert_after_rewriter : 0.000007s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000499s : 1.63% optimize.opt_b.b_1 : 0.000133s : 0.44% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000018s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000037s : 0.12% optimize.loop_unroll : 0.000469s : 1.53% optimize.opt_after_cconv.c_1 : 0.000033s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000017s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000012s : 0.04% optimize.tuple_transform.d_1 : 0.000044s : 0.14% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000046s : 0.15% optimize.cse_after_recomputation.cse : 0.000011s : 0.04% optimize.environ_conv : 0.000005s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000002s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000013s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000019s : 0.06% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000017s : 0.05% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000461s : 1.51% validate : 0.000037s : 0.12% Time group info: ------[substitution.] 0.000221 34 0.93% : 0.000002s : 2: substitution.elim_not_effective 0.58% : 0.000001s : 2: substitution.fold_const_symbol 2.61% : 0.000006s : 4: substitution.graph_param_transform 79.19% : 0.000175s : 8: substitution.inline 1.46% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.16% : 0.000005s : 4: substitution.remove_not_recompute_node 1.87% : 0.000004s : 4: substitution.replace_old_param 4.27% : 0.000009s : 2: substitution.switch_simplify 6.92% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.025009 2 94.77% : 0.023700s : 1: type_inference.infer 5.23% : 0.001309s : 1: type_inference.specialize ------[replace.] 0.000121 14 43.69% : 0.000053s : 8: replace.inline 37.08% : 0.000045s : 2: replace.switch_simplify 19.23% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000192 14 88.86% : 0.000170s : 8: match.inline 4.19% : 0.000008s : 2: match.switch_simplify 6.95% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000242 1520 0.99% : 0.000002s : 17: predicate.accumulaten_eliminater 0.61% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.41% : 0.000001s : 8: predicate.addn_check_dump 0.98% : 0.000002s : 17: predicate.addn_zero_filter 0.92% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.02% : 0.000005s : 25: predicate.arithmetic_simplify 1.03% : 0.000003s : 17: predicate.cast_eliminate 0.50% : 0.000001s : 8: predicate.check_bprop_eliminate 0.42% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.47% : 0.000001s : 8: predicate.depend_value_elim 1.10% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.19% : 0.000003s : 17: predicate.dict_get_item_eliminator 1.10% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.92% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.31% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.22% : 0.000003s : 21: predicate.environ_get_depend_swap 1.61% : 0.000004s : 29: predicate.environ_get_eliminate 1.10% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.82% : 0.000004s : 29: predicate.exchange_switch_depend_value 2.83% : 0.000007s : 29: predicate.float_depend_g_call 0.46% : 0.000001s : 8: predicate.float_environ_get_switch 0.67% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.57% : 0.000001s : 8: predicate.get_grad_eliminate 0.17% : 0.000000s : 4: predicate.graph_param_transform 0.48% : 0.000001s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 6.37% : 0.000015s : 70: predicate.inline 0.65% : 0.000002s : 8: predicate.inline_without_move 0.27% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.66% : 0.000002s : 8: predicate.less_batch_normalization 1.83% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.75% : 0.000007s : 46: predicate.load_eliminater 0.93% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.81% : 0.000007s : 47: predicate.loop_unroll_before_grad 1.61% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.47% : 0.000001s : 8: predicate.merge_addn 0.49% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.47% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.94% : 0.000002s : 17: predicate.minmaximum_grad 0.79% : 0.000002s : 4: predicate.mutable_eliminate 0.28% : 0.000001s : 4: predicate.opt_reshape 0.32% : 0.000001s : 4: predicate.parallel_virtual_node 2.09% : 0.000005s : 29: predicate.partial_defer_inline 1.78% : 0.000004s : 25: predicate.partial_eliminate 0.99% : 0.000002s : 17: predicate.print_const_string_wrapper 0.45% : 0.000001s : 8: predicate.reduce_all_const_elim 1.26% : 0.000003s : 17: predicate.reduce_eliminate 2.62% : 0.000006s : 46: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 8: predicate.remove_not_recompute_node 1.37% : 0.000003s : 29: predicate.replace_applicator 0.52% : 0.000001s : 8: predicate.replace_old_param 0.23% : 0.000001s : 4: predicate.reset_defer_inline 1.00% : 0.000002s : 17: predicate.reshape_eliminate 0.52% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 4: predicate.row_tensor_eliminate 0.61% : 0.000001s : 8: predicate.same_eliminate 0.40% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.57% : 0.000001s : 8: predicate.shard_identity_eliminate 0.58% : 0.000001s : 8: predicate.special_op_eliminate 0.59% : 0.000001s : 8: predicate.specialize_transform 0.73% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.63% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.89% : 0.000005s : 29: predicate.switch_defer_inline 2.36% : 0.000006s : 37: predicate.switch_layer_defer_inline 6.09% : 0.000015s : 92: predicate.switch_simplify 0.97% : 0.000002s : 17: predicate.tile_eliminate 1.01% : 0.000002s : 17: predicate.transpose_eliminate 1.49% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.08% : 0.000007s : 37: predicate.tuple_list_get_item_eliminator 1.47% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.12% : 0.000005s : 33: predicate.tuple_list_set_item_eliminator 1.77% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.55% : 0.000006s : 46: predicate.updatestate_pure_node_eliminater 3.21% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.30% : 0.000001s : 4: predicate.value_based_eliminate 0.52% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.50% : 0.000001s : 8: predicate.virtual_output_eliminate 0.23% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000917 16 57.20% : 0.000524s : 6: func_graph_cloner_run.FuncGraphClonerGraph 42.80% : 0.000392s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.045241 192 0.01% : 0.000004s : 1: ForceFp32Comm 6.77% : 0.003062s : 1: add_attr 6.75% : 0.003053s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.11% : 0.000050s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.23% : 0.000102s : 1: auto_monad 0.05% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 1.00% : 0.000452s : 1: bootstrap 0.09% : 0.000042s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000016s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.05% : 0.000024s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000008s : 1: environ_conv 0.12% : 0.000052s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.05% : 0.000477s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.12% : 0.000508s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000014s : 1: opt.transform.mutable_eliminate 3.28% : 0.001483s : 78: opt.transform.opt_a 0.07% : 0.000031s : 1: opt.transform.opt_after_cconv 0.06% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.24% : 0.000108s : 28: opt.transform.opt_b 0.11% : 0.000048s : 2: opt.transform.opt_trans_graph 0.08% : 0.000036s : 4: opt.transform.symbol_engine_opt 6.80% : 0.003075s : 1: opt_a 0.24% : 0.000107s : 1: opt_after_cconv 1.04% : 0.000470s : 1: opt_after_jit_grad 0.49% : 0.000220s : 1: opt_b 11.43% : 0.005171s : 1: optimize 0.05% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.05% : 0.000022s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.10% : 0.000044s : 1: pre_auto_parallel 0.07% : 0.000034s : 1: py_interpret_to_execute 0.03% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000016s : 1: remove_dup_value 0.87% : 0.000395s : 1: renormalize.infer 0.82% : 0.000370s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000042s : 1: rewriter_after_opt_a 0.26% : 0.000116s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.17% : 0.000077s : 1: symbol_engine_optimizer 0.17% : 0.000078s : 1: tuple_transform 55.48% : 0.025099s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:01.255.321 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:01.255.590 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0382091, [21] [bootstrap]: 0.00044918 [type_inference]: 0.0130242 [event_method]: 7.544e-05 [auto_monad]: 9.95e-05 [graph_reusing]: 7.72998e-06 [inline]: 2.32001e-06 [add_attr]: 0.00318707, [1] [add_attr_with_inline]: 0.00317871, [1] [Cycle 1]: 8.243e-05, [2] [tag_attr]: 2.802e-05 [meta_addattr_fg_expand]: 8.29998e-06 [parallel-infer-symbol]: 3.18e-06 [pre_auto_parallel]: 4.113e-05 [insert-virtual-dataset]: 2.36e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 1.96e-06 [pipeline_split]: 1.90001e-06 [optimize]: 0.0198032, [53] [py_interpret_to_execute]: 3.484e-05 [rewriter_before_opt_a]: 0.00011842 [opt_a]: 0.0170578, [2] [Cycle 1]: 0.0160134, [45] [expand_dump_flag]: 3.53999e-06 [switch_simplify]: 0.0001229 [loop_unroll]: 4.222e-05 [a_1]: 0.00091134 [with_stream_mark]: 1.667e-05 [recompute_prepare]: 1.045e-05 [updatestate_depend_eliminate]: 4.62998e-06 [updatestate_assign_eliminate]: 4.17998e-06 [updatestate_loads_eliminate]: 3.44001e-06 [parameter_eliminate]: 2.13002e-06 [a_2]: 0.00013729 [accelerated_algorithm]: 9.60001e-06 [shard]: 1.84e-06 [meta_shard_fg_expand]: 2.68e-06 [shard_inline]: 7.83999e-06 [merge_send_recv]: 9.72999e-06 [auto_parallel]: 7.85998e-06 [parallel]: 1.92e-05 [flash_sp]: 8.62e-06 [merge_comm]: 4.86002e-06 [allreduce_fusion]: 4.3e-06 [matmul_add_comm_reduction]: 1.031e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 1.012e-05 [virtual_dataset]: 8.31002e-06 [get_grad_eliminate_]: 7.8e-06 [virtual_output]: 7.78001e-06 [merge_forward]: 4.86997e-06 [cell_reuse_recompute_pass]: 1.45001e-06 [offload_activation]: 1.014e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.796e-05 [merge_recompute_call_nodes]: 1.61998e-06 [before_grad]: 1.408e-05 [set_forward_comm_id_for_comm_node_pass]: 4.75001e-06 [meta_fg_expand]: 3.68999e-06 [flash_sp_send_recv_attached]: 2.76999e-06 [receive_attached]: 2.29001e-06 [after_resolve]: 1.256e-05 [a_after_grad]: 1.303e-05 [renormalize]: 0.0139387 [add_forward_monad_depend]: 1.119e-05 [auto_monad_grad]: 3.19001e-06 [auto_monad_eliminator]: 2.825e-05 [cse]: 3.894e-05 [a_3]: 8.956e-05 [Cycle 2]: 0.00102675, [45] [expand_dump_flag]: 2.69001e-06 [switch_simplify]: 1.113e-05 [loop_unroll]: 8.01001e-06 [a_1]: 0.00019407 [with_stream_mark]: 2.065e-05 [recompute_prepare]: 8.2e-06 [updatestate_depend_eliminate]: 5.17e-06 [updatestate_assign_eliminate]: 4.34997e-06 [updatestate_loads_eliminate]: 3.36999e-06 [parameter_eliminate]: 2.34999e-06 [a_2]: 0.00012314 [accelerated_algorithm]: 8.02e-06 [shard]: 2.43998e-06 [meta_shard_fg_expand]: 2.32001e-06 [shard_inline]: 8.38999e-06 [merge_send_recv]: 9.64999e-06 [auto_parallel]: 1.033e-05 [parallel]: 1.015e-05 [flash_sp]: 3.88001e-06 [merge_comm]: 4.70999e-06 [allreduce_fusion]: 4.53999e-06 [matmul_add_comm_reduction]: 1.144e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 9.82999e-06 [virtual_dataset]: 7.98001e-06 [get_grad_eliminate_]: 7.71001e-06 [virtual_output]: 8.08999e-06 [merge_forward]: 4.58001e-06 [cell_reuse_recompute_pass]: 3.51999e-06 [offload_activation]: 1.04e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.842e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.377e-05 [set_forward_comm_id_for_comm_node_pass]: 4.94e-06 [meta_fg_expand]: 4.05e-06 [flash_sp_send_recv_attached]: 2.09999e-06 [receive_attached]: 2.78e-06 [after_resolve]: 1.534e-05 [a_after_grad]: 1.321e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 2.82002e-06 [auto_monad_grad]: 1.44003e-06 [auto_monad_eliminator]: 1.053e-05 [cse]: 2.026e-05 [a_3]: 5.983e-05 [py_interpret_to_execute_after_opt_a]: 2.059e-05 [slice_cell_reuse_recomputed_activation]: 5.07e-06 [rewriter_after_opt_a]: 4.945e-05 [convert_after_rewriter]: 1.108e-05 [order_py_execute_after_rewriter]: 8.59002e-06 [mutable_eliminate]: 0.00073109 [opt_b]: 0.00033439, [1] [Cycle 1]: 0.00032354, [7] [b_1]: 0.00020281 [b_2]: 9.09e-06 [updatestate_depend_eliminate]: 8.12e-06 [updatestate_assign_eliminate]: 3.16999e-06 [updatestate_loads_eliminate]: 3.31001e-06 [renormalize]: 6.59988e-07 [cse]: 2.589e-05 [optimize_parallel_all_gather_comm]: 2.106e-05 [overlap_param_gather]: 4.57e-06 [cconv]: 3.182e-05 [loop_unroll]: 0.00045952 [opt_after_cconv]: 0.00014435, [1] [Cycle 1]: 0.00013597, [7] [c_1]: 3.907e-05 [parameter_eliminate]: 3.06001e-06 [updatestate_depend_eliminate]: 6.78e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 3.08e-06 [cse]: 2.262e-05 [renormalize]: 7.00005e-07 [remove_dup_value]: 1.918e-05 [tuple_transform]: 0.00013235, [1] [Cycle 1]: 0.00012432, [4] [d_1]: 8.128e-05 [none_parameter_eliminate]: 1.79e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 9.40001e-06 [partial_unused_args_eliminate]: 4.56002e-06 [add_recomputation]: 6.362e-05 [cse_after_recomputation]: 3.347e-05, [1] [Cycle 1]: 2.577e-05, [1] [cse]: 1.656e-05 [environ_conv]: 9.02999e-06 [swap_dp_allreduce_reducescatter]: 8.92e-06 [bias_add_comm_swap]: 4.75001e-06 [label_micro_interleaved_index]: 7.13998e-06 [label_fine_grained_interleaved_index]: 5.29998e-06 [merge_cast_opt]: 3.93999e-06 [slice_recompute_activation]: 4.36002e-06 [micro_interleaved_order_control]: 4.70999e-06 [assign_add_opt]: 3.61001e-06 [ForceFp32Comm]: 3.23e-06 [remove_cast_before_assign_add]: 3.38e-06 [full_micro_interleaved_order_control]: 4.53999e-06 [reorder_send_recv_between_fp_bp]: 5.48002e-06 [comm_op_add_attrs]: 3.65e-06 [add_comm_op_reuse_tag]: 3.2e-06 [interleave_split_concat_branches]: 3.43e-06 [interleave_parallel_branches]: 3.40003e-06 [overlap_opt_shard_in_pipeline]: 3.71001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.52e-06 [control_data_broadcast_order]: 1.739e-05 [grouped_pairwise_exchange_alltoall]: 3.92998e-06 [offloading_packed_experts]: 6.70998e-06 [overlap_recompute_and_grad_model_parallel]: 7.88001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.77998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.9e-06 [overlap_recompute_comm]: 5.19998e-06 [overlap_grad_ring_attention]: 7.01999e-06 [overlap_grad_flash_sp]: 2.635e-05 [begin_end_overlap_inline]: 3.03998e-06 [split_matmul_comm_elemetwise]: 4.68001e-06 [split_layernorm_comm]: 4.53001e-06 [handle_group_info]: 3.71999e-06 [symbol_engine_optimizer]: 0.00010917, [1] [Cycle 1]: 0.00010199, [6] [build]: 4.1e-06 [elim_shapecalc]: 1.218e-05 [elim_not_effective]: 1.569e-05 [opt_reshape]: 9.07001e-06 [fold_const_symbol]: 1.291e-05 [renormalize]: 2.00002e-07 [detach_backward]: 4.89e-06 [pipeline_parallel_scheduler]: 2.11e-06 [auto_monad_reorder]: 2.62e-05 [get_jit_bprop_graph]: 2.45002e-06 [rewriter_after_jit_bprop_graph]: 4.97e-06 [opt_after_jit_grad]: 0.00051441 [validate]: 4.719e-05 Sums bootstrap : 0.000449s : 1.37% type_inference : 0.013024s : 39.61% event_method : 0.000075s : 0.23% auto_monad : 0.000099s : 0.30% graph_reusing : 0.000008s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.09% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.03% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000041s : 0.13% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000035s : 0.11% optimize.rewriter_before_opt_a : 0.000118s : 0.36% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000134s : 0.41% optimize.opt_a.loop_unroll : 0.000050s : 0.15% optimize.opt_a.a_1 : 0.001105s : 3.36% optimize.opt_a.with_stream_mark : 0.000037s : 0.11% optimize.opt_a.recompute_prepare : 0.000019s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000260s : 0.79% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.05% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000016s : 0.05% optimize.opt_a.merge_send_recv : 0.000019s : 0.06% optimize.opt_a.auto_parallel : 0.000018s : 0.06% optimize.opt_a.parallel : 0.000029s : 0.09% optimize.opt_a.flash_sp : 0.000013s : 0.04% optimize.opt_a.merge_comm : 0.000010s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.06% optimize.opt_a.virtual_dataset : 0.000016s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.05% optimize.opt_a.virtual_output : 0.000016s : 0.05% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000021s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000028s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.03% optimize.opt_a.meta_fg_expand : 0.000008s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000028s : 0.08% optimize.opt_a.a_after_grad : 0.000026s : 0.08% optimize.opt_a.renormalize : 0.013939s : 42.40% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.04% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000039s : 0.12% optimize.opt_a.cse : 0.000059s : 0.18% optimize.opt_a.a_3 : 0.000149s : 0.45% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000049s : 0.15% optimize.convert_after_rewriter : 0.000011s : 0.03% optimize.order_py_execute_after_rewriter : 0.000009s : 0.03% optimize.mutable_eliminate : 0.000731s : 2.22% optimize.opt_b.b_1 : 0.000203s : 0.62% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000026s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.06% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000032s : 0.10% optimize.loop_unroll : 0.000460s : 1.40% optimize.opt_after_cconv.c_1 : 0.000039s : 0.12% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000023s : 0.07% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.06% optimize.tuple_transform.d_1 : 0.000081s : 0.25% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000064s : 0.19% optimize.cse_after_recomputation.cse : 0.000017s : 0.05% optimize.environ_conv : 0.000009s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000005s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000004s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000003s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000026s : 0.08% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000005s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000026s : 0.08% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000514s : 1.56% validate : 0.000047s : 0.14% Time group info: ------[substitution.] 0.000259 44 11.88% : 0.000031s : 3: substitution.cast_eliminate 0.84% : 0.000002s : 3: substitution.elim_not_effective 0.69% : 0.000002s : 3: substitution.fold_const_symbol 2.95% : 0.000008s : 5: substitution.graph_param_transform 67.75% : 0.000176s : 8: substitution.inline 2.22% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.47% : 0.000006s : 6: substitution.remove_not_recompute_node 2.39% : 0.000006s : 4: substitution.replace_old_param 3.82% : 0.000010s : 2: substitution.switch_simplify 4.99% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.012963 2 89.88% : 0.011650s : 1: type_inference.infer 10.12% : 0.001312s : 1: type_inference.specialize ------[replace.] 0.000121 14 45.14% : 0.000054s : 8: replace.inline 36.00% : 0.000043s : 2: replace.switch_simplify 18.86% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000191 14 89.70% : 0.000171s : 8: match.inline 4.49% : 0.000009s : 2: match.switch_simplify 5.81% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000296 1746 1.02% : 0.000003s : 19: predicate.accumulaten_eliminater 0.68% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 10: predicate.addn_check_dump 1.11% : 0.000003s : 19: predicate.addn_zero_filter 0.89% : 0.000003s : 19: predicate.adjust_all_reduce_mul_add 2.11% : 0.000006s : 29: predicate.arithmetic_simplify 1.09% : 0.000003s : 19: predicate.cast_eliminate 0.60% : 0.000002s : 10: predicate.check_bprop_eliminate 0.49% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.53% : 0.000002s : 10: predicate.depend_value_elim 0.99% : 0.000003s : 19: predicate.dict_get_item_const_eliminator 1.10% : 0.000003s : 19: predicate.dict_get_item_eliminator 0.94% : 0.000003s : 19: predicate.dict_set_item_eliminator 0.77% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 5: predicate.elim_not_effective 0.33% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.31% : 0.000004s : 24: predicate.environ_add_const_eliminate 1.19% : 0.000004s : 24: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 24: predicate.environ_get_depend_swap 1.70% : 0.000005s : 34: predicate.environ_get_eliminate 1.19% : 0.000004s : 24: predicate.environ_get_set_eliminate 1.61% : 0.000005s : 31: predicate.exchange_switch_depend_value 2.65% : 0.000008s : 31: predicate.float_depend_g_call 0.55% : 0.000002s : 10: predicate.float_environ_get_switch 0.73% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 5: predicate.fold_const_symbol 0.60% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000002s : 10: predicate.incorporate_call 0.47% : 0.000001s : 10: predicate.incorporate_call_switch 6.32% : 0.000019s : 80: predicate.inline 0.76% : 0.000002s : 10: predicate.inline_without_move 0.27% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.92% : 0.000003s : 10: predicate.less_batch_normalization 1.84% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.47% : 0.000007s : 52: predicate.load_eliminater 0.93% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.45% : 0.000007s : 49: predicate.loop_unroll_before_grad 1.58% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.56% : 0.000002s : 10: predicate.merge_addn 0.53% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.89% : 0.000003s : 19: predicate.minmaximum_grad 0.94% : 0.000003s : 5: predicate.mutable_eliminate 0.32% : 0.000001s : 5: predicate.opt_reshape 0.30% : 0.000001s : 5: predicate.parallel_virtual_node 1.87% : 0.000006s : 31: predicate.partial_defer_inline 1.57% : 0.000005s : 28: predicate.partial_eliminate 1.09% : 0.000003s : 19: predicate.print_const_string_wrapper 0.47% : 0.000001s : 10: predicate.reduce_all_const_elim 1.27% : 0.000004s : 19: predicate.reduce_eliminate 2.52% : 0.000007s : 52: predicate.redundant_stop_gradient_eliminater 0.31% : 0.000001s : 10: predicate.remove_not_recompute_node 1.50% : 0.000004s : 33: predicate.replace_applicator 0.54% : 0.000002s : 10: predicate.replace_old_param 0.22% : 0.000001s : 5: predicate.reset_defer_inline 0.98% : 0.000003s : 19: predicate.reshape_eliminate 0.55% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 5: predicate.row_tensor_eliminate 0.82% : 0.000002s : 10: predicate.same_eliminate 0.38% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.65% : 0.000002s : 10: predicate.shard_identity_eliminate 0.58% : 0.000002s : 10: predicate.special_op_eliminate 0.66% : 0.000002s : 10: predicate.specialize_transform 0.91% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.72% : 0.000005s : 31: predicate.switch_defer_inline 2.14% : 0.000006s : 41: predicate.switch_layer_defer_inline 5.43% : 0.000016s : 99: predicate.switch_simplify 0.95% : 0.000003s : 19: predicate.tile_eliminate 1.03% : 0.000003s : 19: predicate.transpose_eliminate 1.61% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 2.97% : 0.000009s : 43: predicate.tuple_list_get_item_eliminator 1.43% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.37% : 0.000007s : 39: predicate.tuple_list_set_item_eliminator 1.72% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.50% : 0.000007s : 52: predicate.updatestate_pure_node_eliminater 3.18% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.52% : 0.000002s : 5: predicate.value_based_eliminate 0.70% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.69% : 0.000002s : 10: predicate.virtual_output_eliminate 0.23% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001039 16 52.03% : 0.000540s : 6: func_graph_cloner_run.FuncGraphClonerGraph 47.97% : 0.000498s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.077107 192 0.01% : 0.000006s : 1: ForceFp32Comm 4.15% : 0.003196s : 1: add_attr 4.13% : 0.003183s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.09% : 0.000067s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.14% : 0.000110s : 1: auto_monad 0.04% : 0.000034s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.64% : 0.000493s : 1: bootstrap 0.05% : 0.000035s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.03% : 0.000020s : 1: control_data_broadcast_order 0.02% : 0.000014s : 1: convert_after_rewriter 0.05% : 0.000037s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000028s : 1: detach_backward 0.02% : 0.000012s : 1: environ_conv 0.12% : 0.000089s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000015s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.60% : 0.000466s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.96% : 0.000738s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000019s : 1: opt.transform.mutable_eliminate 2.27% : 0.001746s : 78: opt.transform.opt_a 0.05% : 0.000038s : 1: opt.transform.opt_after_cconv 0.04% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.18% : 0.000137s : 28: opt.transform.opt_b 0.11% : 0.000088s : 2: opt.transform.opt_trans_graph 0.06% : 0.000046s : 4: opt.transform.symbol_engine_opt 22.13% : 0.017062s : 1: opt_a 0.19% : 0.000148s : 1: opt_after_cconv 0.68% : 0.000526s : 1: opt_after_jit_grad 0.44% : 0.000338s : 1: opt_b 26.52% : 0.020445s : 1: optimize 0.03% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.04% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.06% : 0.000049s : 1: pre_auto_parallel 0.05% : 0.000039s : 1: py_interpret_to_execute 0.03% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000022s : 1: remove_dup_value 17.20% : 0.013261s : 1: renormalize.infer 0.86% : 0.000661s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000054s : 1: rewriter_after_opt_a 0.16% : 0.000122s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000112s : 1: symbol_engine_optimizer 0.18% : 0.000135s : 1: tuple_transform 16.94% : 0.013063s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:01.718.422 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0374289, [21] [bootstrap]: 0.00044219 [type_inference]: 0.0135651 [event_method]: 7.855e-05 [auto_monad]: 0.00010134 [graph_reusing]: 7.78001e-06 [inline]: 2.86999e-06 [add_attr]: 0.00326749, [1] [add_attr_with_inline]: 0.00325805, [1] [Cycle 1]: 7.447e-05, [2] [tag_attr]: 3.092e-05 [meta_addattr_fg_expand]: 8.70999e-06 [parallel-infer-symbol]: 3.55e-06 [pre_auto_parallel]: 4.593e-05 [insert-virtual-dataset]: 2.36998e-06 [parallel-infer-symbol-second]: 8.89995e-07 [dataset_repeat_opt]: 2.04e-06 [pipeline_split]: 2.04e-06 [optimize]: 0.0191673, [53] [py_interpret_to_execute]: 3.324e-05 [rewriter_before_opt_a]: 0.0001176 [opt_a]: 0.0167771, [2] [Cycle 1]: 0.015945, [45] [expand_dump_flag]: 3.48e-06 [switch_simplify]: 0.00012953 [loop_unroll]: 4.342e-05 [a_1]: 0.00092855 [with_stream_mark]: 1.908e-05 [recompute_prepare]: 1.121e-05 [updatestate_depend_eliminate]: 5.07999e-06 [updatestate_assign_eliminate]: 3.83999e-06 [updatestate_loads_eliminate]: 3.68e-06 [parameter_eliminate]: 2.29999e-06 [a_2]: 0.000107 [accelerated_algorithm]: 8.47e-06 [shard]: 1.71e-06 [meta_shard_fg_expand]: 2.30002e-06 [shard_inline]: 7.7e-06 [merge_send_recv]: 1.039e-05 [auto_parallel]: 8.07e-06 [parallel]: 2.139e-05 [flash_sp]: 8.67e-06 [merge_comm]: 4.82998e-06 [allreduce_fusion]: 4.49002e-06 [matmul_add_comm_reduction]: 1.132e-05 [allreduce_slice_to_reducescatter]: 7.59988e-07 [virtual_shard_identity]: 9.92999e-06 [virtual_dataset]: 8.05e-06 [get_grad_eliminate_]: 7.45e-06 [virtual_output]: 7.97e-06 [merge_forward]: 4.42998e-06 [cell_reuse_recompute_pass]: 1.78002e-06 [offload_activation]: 1.125e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.57e-05 [merge_recompute_call_nodes]: 1.38002e-06 [before_grad]: 1.264e-05 [set_forward_comm_id_for_comm_node_pass]: 4.31002e-06 [meta_fg_expand]: 4.27998e-06 [flash_sp_send_recv_attached]: 2.72001e-06 [receive_attached]: 2.17001e-06 [after_resolve]: 1.407e-05 [a_after_grad]: 1.3e-05 [renormalize]: 0.0140444 [add_forward_monad_depend]: 1.021e-05 [auto_monad_grad]: 2.53998e-06 [auto_monad_eliminator]: 2.514e-05 [cse]: 4.108e-05 [a_3]: 7.176e-05 [Cycle 2]: 0.00081947, [45] [expand_dump_flag]: 2.54999e-06 [switch_simplify]: 1.08e-05 [loop_unroll]: 7.87e-06 [a_1]: 0.00019439 [with_stream_mark]: 1.875e-05 [recompute_prepare]: 7.82e-06 [updatestate_depend_eliminate]: 5.54998e-06 [updatestate_assign_eliminate]: 3.66001e-06 [updatestate_loads_eliminate]: 3.23998e-06 [parameter_eliminate]: 1.51002e-06 [a_2]: 9.585e-05 [accelerated_algorithm]: 7.93001e-06 [shard]: 2.77002e-06 [meta_shard_fg_expand]: 2.27999e-06 [shard_inline]: 7.75998e-06 [merge_send_recv]: 9.34998e-06 [auto_parallel]: 9.75002e-06 [parallel]: 8.92e-06 [flash_sp]: 4.18999e-06 [merge_comm]: 4.33999e-06 [allreduce_fusion]: 4.18001e-06 [matmul_add_comm_reduction]: 1.049e-05 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 8.92999e-06 [virtual_dataset]: 7.82e-06 [get_grad_eliminate_]: 7.41999e-06 [virtual_output]: 7.1e-06 [merge_forward]: 5.36998e-06 [cell_reuse_recompute_pass]: 3.11999e-06 [offload_activation]: 1.035e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.485e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.209e-05 [set_forward_comm_id_for_comm_node_pass]: 4.73001e-06 [meta_fg_expand]: 3.36999e-06 [flash_sp_send_recv_attached]: 1.64e-06 [receive_attached]: 2.85002e-06 [after_resolve]: 1.265e-05 [a_after_grad]: 1.145e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.27999e-06 [auto_monad_grad]: 1.04e-06 [auto_monad_eliminator]: 8.80001e-06 [cse]: 1.828e-05 [a_3]: 4.614e-05 [py_interpret_to_execute_after_opt_a]: 1.917e-05 [slice_cell_reuse_recomputed_activation]: 1.77001e-06 [rewriter_after_opt_a]: 4.309e-05 [convert_after_rewriter]: 7.46001e-06 [order_py_execute_after_rewriter]: 5.96e-06 [mutable_eliminate]: 0.00070078 [opt_b]: 0.00024798, [1] [Cycle 1]: 0.00024097, [7] [b_1]: 0.00015846 [b_2]: 9.36e-06 [updatestate_depend_eliminate]: 6.78e-06 [updatestate_assign_eliminate]: 3.12002e-06 [updatestate_loads_eliminate]: 3.04001e-06 [renormalize]: 1.05999e-06 [cse]: 2.25e-05 [optimize_parallel_all_gather_comm]: 1.835e-05 [overlap_param_gather]: 2.09999e-06 [cconv]: 2.655e-05 [loop_unroll]: 0.00043643 [opt_after_cconv]: 0.00013392, [1] [Cycle 1]: 0.00012798, [7] [c_1]: 3.805e-05 [parameter_eliminate]: 2.67001e-06 [updatestate_depend_eliminate]: 5.63997e-06 [updatestate_assign_eliminate]: 1.732e-05 [updatestate_loads_eliminate]: 3.26001e-06 [cse]: 2.384e-05 [renormalize]: 3.4002e-07 [remove_dup_value]: 1.433e-05 [tuple_transform]: 8.748e-05, [1] [Cycle 1]: 8.267e-05, [4] [d_1]: 5.398e-05 [none_parameter_eliminate]: 1.99999e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.05e-06 [partial_unused_args_eliminate]: 1.76e-06 [add_recomputation]: 5.658e-05 [cse_after_recomputation]: 2.608e-05, [1] [Cycle 1]: 2.133e-05, [1] [cse]: 1.592e-05 [environ_conv]: 6.38003e-06 [swap_dp_allreduce_reducescatter]: 5.65001e-06 [bias_add_comm_swap]: 2.78e-06 [label_micro_interleaved_index]: 4.02e-06 [label_fine_grained_interleaved_index]: 2.64001e-06 [merge_cast_opt]: 1.24998e-06 [slice_recompute_activation]: 2.08002e-06 [micro_interleaved_order_control]: 2.63e-06 [assign_add_opt]: 1.20999e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 1.34998e-06 [full_micro_interleaved_order_control]: 2.54001e-06 [reorder_send_recv_between_fp_bp]: 2.91e-06 [comm_op_add_attrs]: 1.01002e-06 [add_comm_op_reuse_tag]: 9.5999e-07 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.10001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.94999e-06 [control_data_broadcast_order]: 1.433e-05 [grouped_pairwise_exchange_alltoall]: 1.50999e-06 [offloading_packed_experts]: 4.44998e-06 [overlap_recompute_and_grad_model_parallel]: 5.12e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.35002e-06 [overlap_grad_ring_attention]: 4.3e-06 [overlap_grad_flash_sp]: 2.402e-05 [begin_end_overlap_inline]: 8.2e-07 [split_matmul_comm_elemetwise]: 1.99e-06 [split_layernorm_comm]: 1.91e-06 [handle_group_info]: 9.09989e-07 [symbol_engine_optimizer]: 8.225e-05, [1] [Cycle 1]: 7.807e-05, [6] [build]: 3.71999e-06 [elim_shapecalc]: 1.109e-05 [elim_not_effective]: 1.526e-05 [opt_reshape]: 8.37998e-06 [fold_const_symbol]: 1.223e-05 [renormalize]: 2.09984e-07 [detach_backward]: 2.42001e-06 [pipeline_parallel_scheduler]: 1.55999e-06 [auto_monad_reorder]: 2.192e-05 [get_jit_bprop_graph]: 2.09e-06 [rewriter_after_jit_bprop_graph]: 4.32e-06 [opt_after_jit_grad]: 0.00048632 [validate]: 4.426e-05 Sums bootstrap : 0.000442s : 1.33% type_inference : 0.013565s : 40.91% event_method : 0.000079s : 0.24% auto_monad : 0.000101s : 0.31% graph_reusing : 0.000008s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000031s : 0.09% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.03% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000046s : 0.14% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.10% optimize.rewriter_before_opt_a : 0.000118s : 0.35% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000140s : 0.42% optimize.opt_a.loop_unroll : 0.000051s : 0.15% optimize.opt_a.a_1 : 0.001123s : 3.39% optimize.opt_a.with_stream_mark : 0.000038s : 0.11% optimize.opt_a.recompute_prepare : 0.000019s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000203s : 0.61% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.05% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000015s : 0.05% optimize.opt_a.merge_send_recv : 0.000020s : 0.06% optimize.opt_a.auto_parallel : 0.000018s : 0.05% optimize.opt_a.parallel : 0.000030s : 0.09% optimize.opt_a.flash_sp : 0.000013s : 0.04% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.06% optimize.opt_a.virtual_dataset : 0.000016s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000015s : 0.05% optimize.opt_a.merge_forward : 0.000010s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000022s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000008s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000027s : 0.08% optimize.opt_a.a_after_grad : 0.000024s : 0.07% optimize.opt_a.renormalize : 0.014044s : 42.36% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.10% optimize.opt_a.cse : 0.000059s : 0.18% optimize.opt_a.a_3 : 0.000118s : 0.36% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000043s : 0.13% optimize.convert_after_rewriter : 0.000007s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000701s : 2.11% optimize.opt_b.b_1 : 0.000158s : 0.48% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000027s : 0.08% optimize.loop_unroll : 0.000436s : 1.32% optimize.opt_after_cconv.c_1 : 0.000038s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000017s : 0.05% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000024s : 0.07% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.04% optimize.tuple_transform.d_1 : 0.000054s : 0.16% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000057s : 0.17% optimize.cse_after_recomputation.cse : 0.000016s : 0.05% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000024s : 0.07% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000486s : 1.47% validate : 0.000044s : 0.13% Time group info: ------[substitution.] 0.000269 44 11.77% : 0.000032s : 3: substitution.cast_eliminate 0.96% : 0.000003s : 3: substitution.elim_not_effective 0.67% : 0.000002s : 3: substitution.fold_const_symbol 2.69% : 0.000007s : 5: substitution.graph_param_transform 68.57% : 0.000184s : 8: substitution.inline 1.84% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.40% : 0.000006s : 6: substitution.remove_not_recompute_node 1.94% : 0.000005s : 4: substitution.replace_old_param 3.93% : 0.000011s : 2: substitution.switch_simplify 5.21% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.013494 2 89.88% : 0.012129s : 1: type_inference.infer 10.12% : 0.001365s : 1: type_inference.specialize ------[replace.] 0.000128 14 43.21% : 0.000055s : 8: replace.inline 37.72% : 0.000048s : 2: replace.switch_simplify 19.07% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000201 14 89.44% : 0.000180s : 8: match.inline 4.54% : 0.000009s : 2: match.switch_simplify 6.02% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000289 1746 1.03% : 0.000003s : 19: predicate.accumulaten_eliminater 0.60% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000002s : 10: predicate.addn_check_dump 1.15% : 0.000003s : 19: predicate.addn_zero_filter 0.90% : 0.000003s : 19: predicate.adjust_all_reduce_mul_add 2.10% : 0.000006s : 29: predicate.arithmetic_simplify 1.23% : 0.000004s : 19: predicate.cast_eliminate 0.47% : 0.000001s : 10: predicate.check_bprop_eliminate 0.52% : 0.000002s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.55% : 0.000002s : 10: predicate.depend_value_elim 1.06% : 0.000003s : 19: predicate.dict_get_item_const_eliminator 1.21% : 0.000003s : 19: predicate.dict_get_item_eliminator 0.98% : 0.000003s : 19: predicate.dict_set_item_eliminator 0.86% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.14% : 0.000000s : 5: predicate.elim_not_effective 0.36% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.28% : 0.000004s : 24: predicate.environ_add_const_eliminate 1.31% : 0.000004s : 24: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 24: predicate.environ_get_depend_swap 1.70% : 0.000005s : 34: predicate.environ_get_eliminate 1.13% : 0.000003s : 24: predicate.environ_get_set_eliminate 1.61% : 0.000005s : 31: predicate.exchange_switch_depend_value 2.59% : 0.000007s : 31: predicate.float_depend_g_call 0.49% : 0.000001s : 10: predicate.float_environ_get_switch 0.71% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 5: predicate.fold_const_symbol 0.59% : 0.000002s : 10: predicate.get_grad_eliminate 0.17% : 0.000001s : 5: predicate.graph_param_transform 0.52% : 0.000001s : 10: predicate.incorporate_call 0.47% : 0.000001s : 10: predicate.incorporate_call_switch 6.23% : 0.000018s : 80: predicate.inline 0.69% : 0.000002s : 10: predicate.inline_without_move 0.27% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.68% : 0.000002s : 10: predicate.less_batch_normalization 1.78% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.69% : 0.000008s : 52: predicate.load_eliminater 0.75% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.51% : 0.000007s : 49: predicate.loop_unroll_before_grad 1.70% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.57% : 0.000002s : 10: predicate.merge_addn 0.49% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000003s : 19: predicate.minmaximum_grad 0.77% : 0.000002s : 5: predicate.mutable_eliminate 0.35% : 0.000001s : 5: predicate.opt_reshape 0.36% : 0.000001s : 5: predicate.parallel_virtual_node 2.02% : 0.000006s : 31: predicate.partial_defer_inline 1.67% : 0.000005s : 28: predicate.partial_eliminate 0.95% : 0.000003s : 19: predicate.print_const_string_wrapper 0.54% : 0.000002s : 10: predicate.reduce_all_const_elim 1.29% : 0.000004s : 19: predicate.reduce_eliminate 2.55% : 0.000007s : 52: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000001s : 10: predicate.remove_not_recompute_node 1.38% : 0.000004s : 33: predicate.replace_applicator 0.44% : 0.000001s : 10: predicate.replace_old_param 0.22% : 0.000001s : 5: predicate.reset_defer_inline 1.07% : 0.000003s : 19: predicate.reshape_eliminate 0.67% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 5: predicate.row_tensor_eliminate 0.78% : 0.000002s : 10: predicate.same_eliminate 0.37% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.61% : 0.000002s : 10: predicate.shard_identity_eliminate 0.61% : 0.000002s : 10: predicate.special_op_eliminate 0.68% : 0.000002s : 10: predicate.specialize_transform 0.95% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.65% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.81% : 0.000005s : 31: predicate.switch_defer_inline 2.17% : 0.000006s : 41: predicate.switch_layer_defer_inline 5.58% : 0.000016s : 99: predicate.switch_simplify 1.04% : 0.000003s : 19: predicate.tile_eliminate 1.00% : 0.000003s : 19: predicate.transpose_eliminate 1.57% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000009s : 43: predicate.tuple_list_get_item_eliminator 1.47% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000007s : 39: predicate.tuple_list_set_item_eliminator 1.72% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.44% : 0.000007s : 52: predicate.updatestate_pure_node_eliminater 3.06% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 5: predicate.value_based_eliminate 0.62% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.53% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.32% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001040 16 53.31% : 0.000554s : 6: func_graph_cloner_run.FuncGraphClonerGraph 46.69% : 0.000485s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.075855 192 0.00% : 0.000004s : 1: ForceFp32Comm 4.31% : 0.003273s : 1: add_attr 4.30% : 0.003262s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.08% : 0.000061s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.14% : 0.000108s : 1: auto_monad 0.03% : 0.000026s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.62% : 0.000471s : 1: bootstrap 0.04% : 0.000030s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000017s : 1: control_data_broadcast_order 0.01% : 0.000011s : 1: convert_after_rewriter 0.04% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000007s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.12% : 0.000088s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.59% : 0.000445s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.93% : 0.000709s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000016s : 1: opt.transform.mutable_eliminate 2.32% : 0.001756s : 78: opt.transform.opt_a 0.05% : 0.000037s : 1: opt.transform.opt_after_cconv 0.04% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.18% : 0.000136s : 28: opt.transform.opt_b 0.08% : 0.000059s : 2: opt.transform.opt_trans_graph 0.06% : 0.000043s : 4: opt.transform.symbol_engine_opt 22.12% : 0.016781s : 1: opt_a 0.18% : 0.000137s : 1: opt_after_cconv 0.65% : 0.000496s : 1: opt_after_jit_grad 0.33% : 0.000251s : 1: opt_b 25.29% : 0.019185s : 1: optimize 0.03% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000050s : 1: pre_auto_parallel 0.05% : 0.000038s : 1: py_interpret_to_execute 0.03% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000005s : 1: remove_cast_before_assign_add 0.02% : 0.000018s : 1: remove_dup_value 17.63% : 0.013371s : 1: renormalize.infer 0.87% : 0.000657s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000047s : 1: rewriter_after_opt_a 0.16% : 0.000122s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.11% : 0.000085s : 1: symbol_engine_optimizer 0.12% : 0.000090s : 1: tuple_transform 17.91% : 0.013586s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:02.111.381 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:02.111.640 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0244923, [21] [bootstrap]: 0.00044173 [type_inference]: 0.0128647 [event_method]: 7.558e-05 [auto_monad]: 0.00010094 [graph_reusing]: 8.70001e-06 [inline]: 1.96e-06 [add_attr]: 0.00305474, [1] [add_attr_with_inline]: 0.00304596, [1] [Cycle 1]: 8.523e-05, [2] [tag_attr]: 2.982e-05 [meta_addattr_fg_expand]: 8.47e-06 [parallel-infer-symbol]: 3.18998e-06 [pre_auto_parallel]: 4.357e-05 [insert-virtual-dataset]: 2.34001e-06 [parallel-infer-symbol-second]: 7.10017e-07 [dataset_repeat_opt]: 1.97999e-06 [pipeline_split]: 1.77001e-06 [optimize]: 0.00661975, [53] [py_interpret_to_execute]: 3.509e-05 [rewriter_before_opt_a]: 0.00012264 [opt_a]: 0.00403431, [2] [Cycle 1]: 0.00295656, [45] [expand_dump_flag]: 3.97e-06 [switch_simplify]: 0.00012091 [loop_unroll]: 4.505e-05 [a_1]: 0.00092136 [with_stream_mark]: 1.645e-05 [recompute_prepare]: 1.174e-05 [updatestate_depend_eliminate]: 5.86e-06 [updatestate_assign_eliminate]: 4.60001e-06 [updatestate_loads_eliminate]: 4.08001e-06 [parameter_eliminate]: 2.01e-06 [a_2]: 0.0001499 [accelerated_algorithm]: 9.89001e-06 [shard]: 2.14e-06 [meta_shard_fg_expand]: 2.48e-06 [shard_inline]: 9.07999e-06 [merge_send_recv]: 1e-05 [auto_parallel]: 7.35998e-06 [parallel]: 1.93e-05 [flash_sp]: 8.85999e-06 [merge_comm]: 5.42001e-06 [allreduce_fusion]: 4.85999e-06 [matmul_add_comm_reduction]: 1.094e-05 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 1.054e-05 [virtual_dataset]: 1.006e-05 [get_grad_eliminate_]: 8.82999e-06 [virtual_output]: 8.97999e-06 [merge_forward]: 4.98001e-06 [cell_reuse_recompute_pass]: 1.29998e-06 [offload_activation]: 1.201e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.935e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 1.495e-05 [set_forward_comm_id_for_comm_node_pass]: 5.29e-06 [meta_fg_expand]: 4.45e-06 [flash_sp_send_recv_attached]: 2.44001e-06 [receive_attached]: 2.17001e-06 [after_resolve]: 1.364e-05 [a_after_grad]: 1.393e-05 [renormalize]: 0.00090509 [add_forward_monad_depend]: 5.45001e-06 [auto_monad_grad]: 2.46e-06 [auto_monad_eliminator]: 1.876e-05 [cse]: 4.101e-05 [a_3]: 8.294e-05 [Cycle 2]: 0.00106521, [45] [expand_dump_flag]: 1.09e-06 [switch_simplify]: 1.089e-05 [loop_unroll]: 8.97e-06 [a_1]: 0.00022033 [with_stream_mark]: 1.285e-05 [recompute_prepare]: 9.25999e-06 [updatestate_depend_eliminate]: 4.58999e-06 [updatestate_assign_eliminate]: 3.95e-06 [updatestate_loads_eliminate]: 3.63999e-06 [parameter_eliminate]: 1.18001e-06 [a_2]: 0.00017103 [accelerated_algorithm]: 9.32001e-06 [shard]: 1.18001e-06 [meta_shard_fg_expand]: 2.02001e-06 [shard_inline]: 9.34e-06 [merge_send_recv]: 6.29999e-06 [auto_parallel]: 6.89001e-06 [parallel]: 5.27999e-06 [flash_sp]: 3.78001e-06 [merge_comm]: 4.97e-06 [allreduce_fusion]: 4.85001e-06 [matmul_add_comm_reduction]: 7.85e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 9.69e-06 [virtual_dataset]: 8.77e-06 [get_grad_eliminate_]: 8.13999e-06 [virtual_output]: 8.03001e-06 [merge_forward]: 3.93001e-06 [cell_reuse_recompute_pass]: 1.51998e-06 [offload_activation]: 8.58001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.979e-05 [merge_recompute_call_nodes]: 7.50006e-07 [before_grad]: 1.377e-05 [set_forward_comm_id_for_comm_node_pass]: 5.56e-06 [meta_fg_expand]: 3.2e-06 [flash_sp_send_recv_attached]: 9.90025e-07 [receive_attached]: 1.17999e-06 [after_resolve]: 1.297e-05 [a_after_grad]: 1.314e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.31998e-06 [auto_monad_grad]: 1.05001e-06 [auto_monad_eliminator]: 1.023e-05 [cse]: 2.332e-05 [a_3]: 6.761e-05 [py_interpret_to_execute_after_opt_a]: 1.495e-05 [slice_cell_reuse_recomputed_activation]: 4.60001e-06 [rewriter_after_opt_a]: 4.71e-05 [convert_after_rewriter]: 1.152e-05 [order_py_execute_after_rewriter]: 9.71e-06 [mutable_eliminate]: 0.00052486 [opt_b]: 0.00034554, [1] [Cycle 1]: 0.00033709, [7] [b_1]: 0.00022824 [b_2]: 1.091e-05 [updatestate_depend_eliminate]: 6.74999e-06 [updatestate_assign_eliminate]: 3.89002e-06 [updatestate_loads_eliminate]: 3.82998e-06 [renormalize]: 5.90022e-07 [cse]: 2.634e-05 [optimize_parallel_all_gather_comm]: 2.27e-05 [overlap_param_gather]: 5.46e-06 [cconv]: 3.146e-05 [loop_unroll]: 0.00046295 [opt_after_cconv]: 0.00015705, [1] [Cycle 1]: 0.0001485, [7] [c_1]: 4.571e-05 [parameter_eliminate]: 3.71999e-06 [updatestate_depend_eliminate]: 6.70998e-06 [updatestate_assign_eliminate]: 3.81999e-06 [updatestate_loads_eliminate]: 3.75e-06 [cse]: 2.725e-05 [renormalize]: 5.3001e-07 [remove_dup_value]: 4.193e-05 [tuple_transform]: 0.00011315, [1] [Cycle 1]: 0.00010557, [4] [d_1]: 6.249e-05 [none_parameter_eliminate]: 1.67001e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 9.88002e-06 [partial_unused_args_eliminate]: 4.47998e-06 [add_recomputation]: 6.233e-05 [cse_after_recomputation]: 3.358e-05, [1] [Cycle 1]: 2.636e-05, [1] [cse]: 1.73e-05 [environ_conv]: 1.032e-05 [swap_dp_allreduce_reducescatter]: 9.57999e-06 [bias_add_comm_swap]: 5.49e-06 [label_micro_interleaved_index]: 7.43999e-06 [label_fine_grained_interleaved_index]: 5.57999e-06 [merge_cast_opt]: 3.65e-06 [slice_recompute_activation]: 4.49002e-06 [micro_interleaved_order_control]: 4.88001e-06 [assign_add_opt]: 3.59002e-06 [ForceFp32Comm]: 3.24001e-06 [remove_cast_before_assign_add]: 3.56999e-06 [full_micro_interleaved_order_control]: 4.55001e-06 [reorder_send_recv_between_fp_bp]: 5.61003e-06 [comm_op_add_attrs]: 3.89002e-06 [add_comm_op_reuse_tag]: 3.40998e-06 [interleave_split_concat_branches]: 3.51999e-06 [interleave_parallel_branches]: 3.46999e-06 [overlap_opt_shard_in_pipeline]: 3.48e-06 [overlap_opt_shard_grad_in_pipeline]: 4e-06 [control_data_broadcast_order]: 1.911e-05 [grouped_pairwise_exchange_alltoall]: 4.02e-06 [offloading_packed_experts]: 7.53e-06 [overlap_recompute_and_grad_model_parallel]: 7.63001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.66999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.67998e-06 [overlap_recompute_comm]: 4.53001e-06 [overlap_grad_ring_attention]: 7.18e-06 [overlap_grad_flash_sp]: 2.809e-05 [begin_end_overlap_inline]: 2.94999e-06 [split_matmul_comm_elemetwise]: 4.3e-06 [split_layernorm_comm]: 3.98999e-06 [handle_group_info]: 3.36999e-06 [symbol_engine_optimizer]: 0.00011371, [1] [Cycle 1]: 0.00010651, [6] [build]: 3.48999e-06 [elim_shapecalc]: 1.333e-05 [elim_not_effective]: 1.809e-05 [opt_reshape]: 1.008e-05 [fold_const_symbol]: 1.506e-05 [renormalize]: 2.60014e-07 [detach_backward]: 3.86999e-06 [pipeline_parallel_scheduler]: 1.83002e-06 [auto_monad_reorder]: 2.497e-05 [get_jit_bprop_graph]: 1.52999e-06 [rewriter_after_jit_bprop_graph]: 5.15999e-06 [opt_after_jit_grad]: 0.00053569 [validate]: 4.513e-05 Sums bootstrap : 0.000442s : 2.25% type_inference : 0.012865s : 65.56% event_method : 0.000076s : 0.39% auto_monad : 0.000101s : 0.51% graph_reusing : 0.000009s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000030s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.04% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000044s : 0.22% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000035s : 0.18% optimize.rewriter_before_opt_a : 0.000123s : 0.62% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000132s : 0.67% optimize.opt_a.loop_unroll : 0.000054s : 0.28% optimize.opt_a.a_1 : 0.001142s : 5.82% optimize.opt_a.with_stream_mark : 0.000029s : 0.15% optimize.opt_a.recompute_prepare : 0.000021s : 0.11% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000321s : 1.64% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.10% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000018s : 0.09% optimize.opt_a.merge_send_recv : 0.000016s : 0.08% optimize.opt_a.auto_parallel : 0.000014s : 0.07% optimize.opt_a.parallel : 0.000025s : 0.13% optimize.opt_a.flash_sp : 0.000013s : 0.06% optimize.opt_a.merge_comm : 0.000010s : 0.05% optimize.opt_a.allreduce_fusion : 0.000010s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.10% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.10% optimize.opt_a.virtual_dataset : 0.000019s : 0.10% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.09% optimize.opt_a.virtual_output : 0.000017s : 0.09% optimize.opt_a.merge_forward : 0.000009s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.10% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.20% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000029s : 0.15% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.06% optimize.opt_a.meta_fg_expand : 0.000008s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.02% optimize.opt_a.after_resolve : 0.000027s : 0.14% optimize.opt_a.a_after_grad : 0.000027s : 0.14% optimize.opt_a.renormalize : 0.000905s : 4.61% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.15% optimize.opt_a.cse : 0.000064s : 0.33% optimize.opt_a.a_3 : 0.000151s : 0.77% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000047s : 0.24% optimize.convert_after_rewriter : 0.000012s : 0.06% optimize.order_py_execute_after_rewriter : 0.000010s : 0.05% optimize.mutable_eliminate : 0.000525s : 2.67% optimize.opt_b.b_1 : 0.000228s : 1.16% optimize.opt_b.b_2 : 0.000011s : 0.06% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000026s : 0.13% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.12% optimize.overlap_param_gather : 0.000005s : 0.03% optimize.cconv : 0.000031s : 0.16% optimize.loop_unroll : 0.000463s : 2.36% optimize.opt_after_cconv.c_1 : 0.000046s : 0.23% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.cse : 0.000027s : 0.14% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000042s : 0.21% optimize.tuple_transform.d_1 : 0.000062s : 0.32% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.05% optimize.partial_unused_args_eliminate : 0.000004s : 0.02% optimize.add_recomputation : 0.000062s : 0.32% optimize.cse_after_recomputation.cse : 0.000017s : 0.09% optimize.environ_conv : 0.000010s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.05% optimize.bias_add_comm_swap : 0.000005s : 0.03% optimize.label_micro_interleaved_index : 0.000007s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.03% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000004s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000004s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.03% optimize.comm_op_add_attrs : 0.000004s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.02% optimize.interleave_parallel_branches : 0.000003s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000019s : 0.10% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000008s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.04% optimize.overlap_grad_flash_sp : 0.000028s : 0.14% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.02% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.07% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.09% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.05% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000025s : 0.13% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.03% opt_after_jit_grad : 0.000536s : 2.73% validate : 0.000045s : 0.23% Time group info: ------[substitution.] 0.000255 54 11.15% : 0.000028s : 6: substitution.cast_eliminate 1.12% : 0.000003s : 4: substitution.elim_not_effective 0.94% : 0.000002s : 4: substitution.fold_const_symbol 2.95% : 0.000008s : 6: substitution.graph_param_transform 68.12% : 0.000174s : 8: substitution.inline 1.88% : 0.000005s : 8: substitution.j_node_and_user_rematch 2.83% : 0.000007s : 8: substitution.remove_not_recompute_node 1.85% : 0.000005s : 4: substitution.replace_old_param 4.10% : 0.000010s : 2: substitution.switch_simplify 5.05% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.012807 2 89.23% : 0.011428s : 1: type_inference.infer 10.77% : 0.001379s : 1: type_inference.specialize ------[replace.] 0.000116 14 43.35% : 0.000050s : 8: replace.inline 36.91% : 0.000043s : 2: replace.switch_simplify 19.74% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000189 14 89.52% : 0.000169s : 8: match.inline 4.73% : 0.000009s : 2: match.switch_simplify 5.75% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000312 1972 0.96% : 0.000003s : 21: predicate.accumulaten_eliminater 0.65% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.54% : 0.000002s : 12: predicate.addn_check_dump 0.96% : 0.000003s : 21: predicate.addn_zero_filter 0.87% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.01% : 0.000006s : 33: predicate.arithmetic_simplify 1.10% : 0.000003s : 21: predicate.cast_eliminate 0.60% : 0.000002s : 12: predicate.check_bprop_eliminate 0.55% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.65% : 0.000002s : 12: predicate.depend_value_elim 1.05% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.07% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.94% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.88% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.17% : 0.000001s : 6: predicate.elim_not_effective 0.35% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.17% : 0.000004s : 27: predicate.environ_get_add_eliminate 1.19% : 0.000004s : 27: predicate.environ_get_depend_swap 1.75% : 0.000005s : 39: predicate.environ_get_eliminate 1.15% : 0.000004s : 27: predicate.environ_get_set_eliminate 1.52% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.59% : 0.000008s : 33: predicate.float_depend_g_call 0.52% : 0.000002s : 12: predicate.float_environ_get_switch 0.82% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 6: predicate.fold_const_symbol 0.62% : 0.000002s : 12: predicate.get_grad_eliminate 0.22% : 0.000001s : 6: predicate.graph_param_transform 0.64% : 0.000002s : 12: predicate.incorporate_call 0.53% : 0.000002s : 12: predicate.incorporate_call_switch 6.57% : 0.000020s : 90: predicate.inline 0.75% : 0.000002s : 12: predicate.inline_without_move 0.30% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.73% : 0.000002s : 12: predicate.less_batch_normalization 1.73% : 0.000005s : 37: predicate.list_to_tuple_eliminator_ 2.61% : 0.000008s : 58: predicate.load_eliminater 0.86% : 0.000003s : 6: predicate.loop_unroll_after_grad 2.36% : 0.000007s : 51: predicate.loop_unroll_before_grad 1.70% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.64% : 0.000002s : 12: predicate.merge_addn 0.64% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.69% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.93% : 0.000003s : 21: predicate.minmaximum_grad 0.79% : 0.000002s : 6: predicate.mutable_eliminate 0.34% : 0.000001s : 6: predicate.opt_reshape 0.35% : 0.000001s : 6: predicate.parallel_virtual_node 1.81% : 0.000006s : 33: predicate.partial_defer_inline 1.67% : 0.000005s : 31: predicate.partial_eliminate 0.98% : 0.000003s : 21: predicate.print_const_string_wrapper 0.55% : 0.000002s : 12: predicate.reduce_all_const_elim 1.18% : 0.000004s : 21: predicate.reduce_eliminate 2.53% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 12: predicate.remove_not_recompute_node 1.34% : 0.000004s : 37: predicate.replace_applicator 0.37% : 0.000001s : 12: predicate.replace_old_param 0.22% : 0.000001s : 6: predicate.reset_defer_inline 0.99% : 0.000003s : 21: predicate.reshape_eliminate 0.59% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 6: predicate.row_tensor_eliminate 0.70% : 0.000002s : 12: predicate.same_eliminate 0.40% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.67% : 0.000002s : 12: predicate.shard_identity_eliminate 0.63% : 0.000002s : 12: predicate.special_op_eliminate 0.70% : 0.000002s : 12: predicate.specialize_transform 0.74% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.65% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.64% : 0.000005s : 33: predicate.switch_defer_inline 2.17% : 0.000007s : 45: predicate.switch_layer_defer_inline 5.52% : 0.000017s : 106: predicate.switch_simplify 0.95% : 0.000003s : 21: predicate.tile_eliminate 1.01% : 0.000003s : 21: predicate.transpose_eliminate 1.60% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.60% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 3.12% : 0.000010s : 49: predicate.tuple_list_get_item_eliminator 1.55% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.67% : 0.000005s : 37: predicate.tuple_to_list_eliminator_ 2.48% : 0.000008s : 58: predicate.updatestate_pure_node_eliminater 3.15% : 0.000010s : 70: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 6: predicate.value_based_eliminate 0.70% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.58% : 0.000002s : 12: predicate.virtual_output_eliminate 0.30% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000928 16 55.90% : 0.000519s : 6: func_graph_cloner_run.FuncGraphClonerGraph 44.10% : 0.000409s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.037200 192 0.02% : 0.000006s : 1: ForceFp32Comm 8.23% : 0.003063s : 1: add_attr 8.20% : 0.003050s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.18% : 0.000066s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.30% : 0.000112s : 1: auto_monad 0.09% : 0.000033s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.31% : 0.000486s : 1: bootstrap 0.09% : 0.000035s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.06% : 0.000023s : 1: control_data_broadcast_order 0.04% : 0.000015s : 1: convert_after_rewriter 0.10% : 0.000037s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000023s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.24% : 0.000089s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.02% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000015s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.02% : 0.000007s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.26% : 0.000469s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 1.43% : 0.000531s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.05% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000019s : 1: opt.transform.mutable_eliminate 5.05% : 0.001878s : 78: opt.transform.opt_a 0.12% : 0.000044s : 1: opt.transform.opt_after_cconv 0.09% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.45% : 0.000166s : 28: opt.transform.opt_b 0.19% : 0.000070s : 2: opt.transform.opt_trans_graph 0.14% : 0.000053s : 4: opt.transform.symbol_engine_opt 10.85% : 0.004038s : 1: opt_a 0.43% : 0.000161s : 1: opt_after_cconv 1.47% : 0.000547s : 1: opt_after_jit_grad 0.94% : 0.000349s : 1: opt_b 18.86% : 0.007017s : 1: optimize 0.07% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000013s : 1: order_py_execute_after_rewriter 0.09% : 0.000032s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000007s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000008s : 1: pipeline_split 0.14% : 0.000051s : 1: pre_auto_parallel 0.11% : 0.000039s : 1: py_interpret_to_execute 0.05% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.12% : 0.000046s : 1: remove_dup_value 1.28% : 0.000476s : 1: renormalize.infer 1.13% : 0.000421s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000051s : 1: rewriter_after_opt_a 0.34% : 0.000126s : 1: rewriter_before_opt_a 0.02% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.03% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.31% : 0.000117s : 1: symbol_engine_optimizer 0.31% : 0.000116s : 1: tuple_transform 34.68% : 0.012901s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:02.434.353 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0236971, [21] [bootstrap]: 0.00045823 [type_inference]: 0.012896 [event_method]: 8.368e-05 [auto_monad]: 9.937e-05 [graph_reusing]: 7.88999e-06 [inline]: 2.33998e-06 [add_attr]: 0.00310668, [1] [add_attr_with_inline]: 0.00309724, [1] [Cycle 1]: 7.231e-05, [2] [tag_attr]: 2.95e-05 [meta_addattr_fg_expand]: 8.52e-06 [parallel-infer-symbol]: 3.21999e-06 [pre_auto_parallel]: 4.352e-05 [insert-virtual-dataset]: 2.98e-06 [parallel-infer-symbol-second]: 8.70001e-07 [dataset_repeat_opt]: 1.93002e-06 [pipeline_split]: 1.69e-06 [optimize]: 0.00625717, [53] [py_interpret_to_execute]: 3.18e-05 [rewriter_before_opt_a]: 0.00011854 [opt_a]: 0.0038518, [2] [Cycle 1]: 0.00294932, [45] [expand_dump_flag]: 4.30999e-06 [switch_simplify]: 0.00012673 [loop_unroll]: 4.67e-05 [a_1]: 0.00098498 [with_stream_mark]: 1.745e-05 [recompute_prepare]: 1.177e-05 [updatestate_depend_eliminate]: 5.44e-06 [updatestate_assign_eliminate]: 4.46002e-06 [updatestate_loads_eliminate]: 4.48001e-06 [parameter_eliminate]: 2.01e-06 [a_2]: 0.0001242 [accelerated_algorithm]: 1.025e-05 [shard]: 2.26e-06 [meta_shard_fg_expand]: 2.41e-06 [shard_inline]: 9.50001e-06 [merge_send_recv]: 1.053e-05 [auto_parallel]: 8.22e-06 [parallel]: 1.938e-05 [flash_sp]: 9.32999e-06 [merge_comm]: 5.61003e-06 [allreduce_fusion]: 5.00999e-06 [matmul_add_comm_reduction]: 1.196e-05 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 1.11e-05 [virtual_dataset]: 9.42001e-06 [get_grad_eliminate_]: 9.12001e-06 [virtual_output]: 9.59e-06 [merge_forward]: 5.29e-06 [cell_reuse_recompute_pass]: 1.18001e-06 [offload_activation]: 1.192e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.007e-05 [merge_recompute_call_nodes]: 1.82999e-06 [before_grad]: 1.515e-05 [set_forward_comm_id_for_comm_node_pass]: 5.07e-06 [meta_fg_expand]: 4.92e-06 [flash_sp_send_recv_attached]: 2.53003e-06 [receive_attached]: 2.43e-06 [after_resolve]: 1.463e-05 [a_after_grad]: 1.444e-05 [renormalize]: 0.00098722 [add_forward_monad_depend]: 5.74999e-06 [auto_monad_grad]: 1.99999e-06 [auto_monad_eliminator]: 1.904e-05 [cse]: 4.433e-05 [a_3]: 6.859e-05 [Cycle 2]: 0.00089198, [45] [expand_dump_flag]: 1.50999e-06 [switch_simplify]: 1.069e-05 [loop_unroll]: 9.27001e-06 [a_1]: 0.0002191 [with_stream_mark]: 1.318e-05 [recompute_prepare]: 9.20999e-06 [updatestate_depend_eliminate]: 4.50001e-06 [updatestate_assign_eliminate]: 3.61001e-06 [updatestate_loads_eliminate]: 3.75e-06 [parameter_eliminate]: 1.05999e-06 [a_2]: 0.00011327 [accelerated_algorithm]: 9.49999e-06 [shard]: 1.50999e-06 [meta_shard_fg_expand]: 1.96e-06 [shard_inline]: 9.27001e-06 [merge_send_recv]: 7.46001e-06 [auto_parallel]: 7.68001e-06 [parallel]: 5.69e-06 [flash_sp]: 4.28001e-06 [merge_comm]: 5.05999e-06 [allreduce_fusion]: 4.52e-06 [matmul_add_comm_reduction]: 8.05e-06 [allreduce_slice_to_reducescatter]: 5.50004e-07 [virtual_shard_identity]: 9.49999e-06 [virtual_dataset]: 8.49998e-06 [get_grad_eliminate_]: 8.56002e-06 [virtual_output]: 8.28001e-06 [merge_forward]: 4.17e-06 [cell_reuse_recompute_pass]: 1.59e-06 [offload_activation]: 7.89002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.744e-05 [merge_recompute_call_nodes]: 7.30011e-07 [before_grad]: 1.356e-05 [set_forward_comm_id_for_comm_node_pass]: 5.55001e-06 [meta_fg_expand]: 3.36999e-06 [flash_sp_send_recv_attached]: 7.89994e-07 [receive_attached]: 1.08001e-06 [after_resolve]: 1.244e-05 [a_after_grad]: 1.324e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.30001e-06 [auto_monad_grad]: 1.12999e-06 [auto_monad_eliminator]: 9.52999e-06 [cse]: 2.283e-05 [a_3]: 5.726e-05 [py_interpret_to_execute_after_opt_a]: 1.146e-05 [slice_cell_reuse_recomputed_activation]: 2.04e-06 [rewriter_after_opt_a]: 4.42e-05 [convert_after_rewriter]: 8.29998e-06 [order_py_execute_after_rewriter]: 6.25002e-06 [mutable_eliminate]: 0.00055611 [opt_b]: 0.00028095, [1] [Cycle 1]: 0.00027411, [7] [b_1]: 0.0001873 [b_2]: 1.019e-05 [updatestate_depend_eliminate]: 6.81999e-06 [updatestate_assign_eliminate]: 3.8e-06 [updatestate_loads_eliminate]: 4.02e-06 [renormalize]: 3.09985e-07 [cse]: 2.626e-05 [optimize_parallel_all_gather_comm]: 1.974e-05 [overlap_param_gather]: 2.19999e-06 [cconv]: 2.363e-05 [loop_unroll]: 0.00043745 [opt_after_cconv]: 0.00014902, [1] [Cycle 1]: 0.00014308, [7] [c_1]: 6.139e-05 [parameter_eliminate]: 2.51998e-06 [updatestate_depend_eliminate]: 7.38e-06 [updatestate_assign_eliminate]: 4.3e-06 [updatestate_loads_eliminate]: 3.85e-06 [cse]: 2.705e-05 [renormalize]: 3.20026e-07 [remove_dup_value]: 8.04e-05 [tuple_transform]: 0.0001063, [1] [Cycle 1]: 0.00010047, [4] [d_1]: 6.821e-05 [none_parameter_eliminate]: 2.12999e-06 [renormalize]: 2.80008e-07 [switch_simplify]: 9.76998e-06 [partial_unused_args_eliminate]: 1.76e-06 [add_recomputation]: 6.313e-05 [cse_after_recomputation]: 3.141e-05, [1] [Cycle 1]: 2.629e-05, [1] [cse]: 2.018e-05 [environ_conv]: 6.44001e-06 [swap_dp_allreduce_reducescatter]: 7.07002e-06 [bias_add_comm_swap]: 2.68e-06 [label_micro_interleaved_index]: 4.70999e-06 [label_fine_grained_interleaved_index]: 2.86e-06 [merge_cast_opt]: 1.24e-06 [slice_recompute_activation]: 2.09999e-06 [micro_interleaved_order_control]: 2.56e-06 [assign_add_opt]: 1.17999e-06 [ForceFp32Comm]: 8.39995e-07 [remove_cast_before_assign_add]: 1.03001e-06 [full_micro_interleaved_order_control]: 2.32001e-06 [reorder_send_recv_between_fp_bp]: 2.84999e-06 [comm_op_add_attrs]: 1.17999e-06 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.20001e-06 [interleave_parallel_branches]: 1.00001e-06 [overlap_opt_shard_in_pipeline]: 1.15999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.10002e-06 [control_data_broadcast_order]: 1.59e-05 [grouped_pairwise_exchange_alltoall]: 1.52001e-06 [offloading_packed_experts]: 5.34e-06 [overlap_recompute_and_grad_model_parallel]: 5.50001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.55997e-06 [overlap_grad_ring_attention]: 4.62e-06 [overlap_grad_flash_sp]: 2.435e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 2.36e-06 [split_layernorm_comm]: 1.79e-06 [handle_group_info]: 9.40025e-07 [symbol_engine_optimizer]: 9.372e-05, [1] [Cycle 1]: 8.936e-05, [6] [build]: 3.5e-06 [elim_shapecalc]: 1.346e-05 [elim_not_effective]: 1.834e-05 [opt_reshape]: 9.86e-06 [fold_const_symbol]: 1.494e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.02999e-06 [pipeline_parallel_scheduler]: 1.66002e-06 [auto_monad_reorder]: 2.086e-05 [get_jit_bprop_graph]: 1.47999e-06 [rewriter_after_jit_bprop_graph]: 4e-06 [opt_after_jit_grad]: 0.00049838 [validate]: 4.317e-05 Sums bootstrap : 0.000458s : 2.34% type_inference : 0.012896s : 65.81% event_method : 0.000084s : 0.43% auto_monad : 0.000099s : 0.51% graph_reusing : 0.000008s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000030s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.04% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000044s : 0.22% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000032s : 0.16% optimize.rewriter_before_opt_a : 0.000119s : 0.60% optimize.opt_a.expand_dump_flag : 0.000006s : 0.03% optimize.opt_a.switch_simplify : 0.000137s : 0.70% optimize.opt_a.loop_unroll : 0.000056s : 0.29% optimize.opt_a.a_1 : 0.001204s : 6.14% optimize.opt_a.with_stream_mark : 0.000031s : 0.16% optimize.opt_a.recompute_prepare : 0.000021s : 0.11% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000237s : 1.21% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.10% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000019s : 0.10% optimize.opt_a.merge_send_recv : 0.000018s : 0.09% optimize.opt_a.auto_parallel : 0.000016s : 0.08% optimize.opt_a.parallel : 0.000025s : 0.13% optimize.opt_a.flash_sp : 0.000014s : 0.07% optimize.opt_a.merge_comm : 0.000011s : 0.05% optimize.opt_a.allreduce_fusion : 0.000010s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.10% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.11% optimize.opt_a.virtual_dataset : 0.000018s : 0.09% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.09% optimize.opt_a.virtual_output : 0.000018s : 0.09% optimize.opt_a.merge_forward : 0.000009s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000020s : 0.10% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.19% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000029s : 0.15% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.05% optimize.opt_a.meta_fg_expand : 0.000008s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000027s : 0.14% optimize.opt_a.a_after_grad : 0.000028s : 0.14% optimize.opt_a.renormalize : 0.000987s : 5.04% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.04% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.15% optimize.opt_a.cse : 0.000067s : 0.34% optimize.opt_a.a_3 : 0.000126s : 0.64% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000044s : 0.23% optimize.convert_after_rewriter : 0.000008s : 0.04% optimize.order_py_execute_after_rewriter : 0.000006s : 0.03% optimize.mutable_eliminate : 0.000556s : 2.84% optimize.opt_b.b_1 : 0.000187s : 0.96% optimize.opt_b.b_2 : 0.000010s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000026s : 0.13% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.10% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000024s : 0.12% optimize.loop_unroll : 0.000437s : 2.23% optimize.opt_after_cconv.c_1 : 0.000061s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.cse : 0.000027s : 0.14% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000080s : 0.41% optimize.tuple_transform.d_1 : 0.000068s : 0.35% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.05% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000063s : 0.32% optimize.cse_after_recomputation.cse : 0.000020s : 0.10% optimize.environ_conv : 0.000006s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.04% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000024s : 0.12% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.07% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.09% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.05% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.11% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000498s : 2.54% validate : 0.000043s : 0.22% Time group info: ------[substitution.] 0.000270 54 12.01% : 0.000032s : 6: substitution.cast_eliminate 0.99% : 0.000003s : 4: substitution.elim_not_effective 0.84% : 0.000002s : 4: substitution.fold_const_symbol 3.30% : 0.000009s : 6: substitution.graph_param_transform 67.46% : 0.000182s : 8: substitution.inline 1.73% : 0.000005s : 8: substitution.j_node_and_user_rematch 2.97% : 0.000008s : 8: substitution.remove_not_recompute_node 1.74% : 0.000005s : 4: substitution.replace_old_param 3.75% : 0.000010s : 2: substitution.switch_simplify 5.20% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.012829 2 89.19% : 0.011443s : 1: type_inference.infer 10.81% : 0.001387s : 1: type_inference.specialize ------[replace.] 0.000122 14 42.12% : 0.000051s : 8: replace.inline 36.88% : 0.000045s : 2: replace.switch_simplify 21.00% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000197 14 89.70% : 0.000177s : 8: match.inline 4.34% : 0.000009s : 2: match.switch_simplify 5.96% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000317 1972 0.97% : 0.000003s : 21: predicate.accumulaten_eliminater 0.67% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.50% : 0.000002s : 12: predicate.addn_check_dump 0.95% : 0.000003s : 21: predicate.addn_zero_filter 0.90% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.10% : 0.000007s : 33: predicate.arithmetic_simplify 1.13% : 0.000004s : 21: predicate.cast_eliminate 0.63% : 0.000002s : 12: predicate.check_bprop_eliminate 0.57% : 0.000002s : 12: predicate.compare_switch_simplify 0.18% : 0.000001s : 6: predicate.const_output_eliminate 0.61% : 0.000002s : 12: predicate.depend_value_elim 1.05% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.10% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.99% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.80% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 6: predicate.elim_not_effective 0.36% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.19% : 0.000004s : 27: predicate.environ_get_add_eliminate 1.21% : 0.000004s : 27: predicate.environ_get_depend_swap 1.73% : 0.000005s : 39: predicate.environ_get_eliminate 1.13% : 0.000004s : 27: predicate.environ_get_set_eliminate 1.52% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.44% : 0.000008s : 33: predicate.float_depend_g_call 0.54% : 0.000002s : 12: predicate.float_environ_get_switch 0.75% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.59% : 0.000002s : 12: predicate.get_grad_eliminate 0.20% : 0.000001s : 6: predicate.graph_param_transform 0.61% : 0.000002s : 12: predicate.incorporate_call 0.52% : 0.000002s : 12: predicate.incorporate_call_switch 6.41% : 0.000020s : 90: predicate.inline 0.77% : 0.000002s : 12: predicate.inline_without_move 0.33% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.75% : 0.000002s : 12: predicate.less_batch_normalization 1.71% : 0.000005s : 37: predicate.list_to_tuple_eliminator_ 2.54% : 0.000008s : 58: predicate.load_eliminater 0.69% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.38% : 0.000008s : 51: predicate.loop_unroll_before_grad 1.71% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.55% : 0.000002s : 12: predicate.merge_addn 0.67% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.55% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.87% : 0.000003s : 21: predicate.minmaximum_grad 0.77% : 0.000002s : 6: predicate.mutable_eliminate 0.33% : 0.000001s : 6: predicate.opt_reshape 0.31% : 0.000001s : 6: predicate.parallel_virtual_node 1.83% : 0.000006s : 33: predicate.partial_defer_inline 1.70% : 0.000005s : 31: predicate.partial_eliminate 0.92% : 0.000003s : 21: predicate.print_const_string_wrapper 0.53% : 0.000002s : 12: predicate.reduce_all_const_elim 1.19% : 0.000004s : 21: predicate.reduce_eliminate 2.54% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 12: predicate.remove_not_recompute_node 1.30% : 0.000004s : 37: predicate.replace_applicator 0.51% : 0.000002s : 12: predicate.replace_old_param 0.22% : 0.000001s : 6: predicate.reset_defer_inline 1.00% : 0.000003s : 21: predicate.reshape_eliminate 0.58% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 6: predicate.row_tensor_eliminate 0.74% : 0.000002s : 12: predicate.same_eliminate 0.41% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.65% : 0.000002s : 12: predicate.shard_identity_eliminate 0.70% : 0.000002s : 12: predicate.special_op_eliminate 0.84% : 0.000003s : 12: predicate.specialize_transform 0.82% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.62% : 0.000005s : 33: predicate.switch_defer_inline 2.14% : 0.000007s : 45: predicate.switch_layer_defer_inline 5.48% : 0.000017s : 106: predicate.switch_simplify 1.00% : 0.000003s : 21: predicate.tile_eliminate 0.98% : 0.000003s : 21: predicate.transpose_eliminate 1.55% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.69% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 3.21% : 0.000010s : 49: predicate.tuple_list_get_item_eliminator 1.58% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.74% : 0.000006s : 37: predicate.tuple_to_list_eliminator_ 2.50% : 0.000008s : 58: predicate.updatestate_pure_node_eliminater 3.08% : 0.000010s : 70: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 6: predicate.value_based_eliminate 0.58% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 12: predicate.virtual_output_eliminate 0.27% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000986 16 53.15% : 0.000524s : 6: func_graph_cloner_run.FuncGraphClonerGraph 46.85% : 0.000462s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.036255 192 0.01% : 0.000004s : 1: ForceFp32Comm 8.58% : 0.003111s : 1: add_attr 8.55% : 0.003101s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.19% : 0.000067s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.29% : 0.000107s : 1: auto_monad 0.07% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.34% : 0.000487s : 1: bootstrap 0.08% : 0.000027s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.05% : 0.000019s : 1: control_data_broadcast_order 0.03% : 0.000012s : 1: convert_after_rewriter 0.09% : 0.000034s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.03% : 0.000010s : 1: environ_conv 0.26% : 0.000094s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000012s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 1.23% : 0.000445s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.56% : 0.000565s : 1: mutable_eliminate 0.02% : 0.000008s : 1: offloading_packed_experts 0.05% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000019s : 1: opt.transform.mutable_eliminate 5.30% : 0.001920s : 78: opt.transform.opt_a 0.17% : 0.000060s : 1: opt.transform.opt_after_cconv 0.10% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.45% : 0.000165s : 28: opt.transform.opt_b 0.21% : 0.000075s : 2: opt.transform.opt_trans_graph 0.15% : 0.000053s : 4: opt.transform.symbol_engine_opt 10.63% : 0.003855s : 1: opt_a 0.42% : 0.000153s : 1: opt_after_cconv 1.40% : 0.000507s : 1: opt_after_jit_grad 0.78% : 0.000285s : 1: opt_b 17.27% : 0.006262s : 1: optimize 0.06% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.08% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.13% : 0.000048s : 1: pre_auto_parallel 0.10% : 0.000036s : 1: py_interpret_to_execute 0.04% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.24% : 0.000088s : 1: remove_dup_value 1.47% : 0.000532s : 1: renormalize.infer 1.23% : 0.000447s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.13% : 0.000048s : 1: rewriter_after_opt_a 0.34% : 0.000123s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.27% : 0.000096s : 1: symbol_engine_optimizer 0.30% : 0.000109s : 1: tuple_transform 35.62% : 0.012914s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:02.755.336 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:02.755.603 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.028262, [21] [bootstrap]: 0.00045312 [type_inference]: 0.0149774 [event_method]: 9.717e-05 [auto_monad]: 0.00010333 [graph_reusing]: 7.5e-06 [inline]: 2.69999e-06 [add_attr]: 0.00394312, [1] [add_attr_with_inline]: 0.00393084, [1] [Cycle 1]: 0.00011513, [2] [tag_attr]: 3.477e-05 [meta_addattr_fg_expand]: 8.54e-06 [parallel-infer-symbol]: 3.81999e-06 [pre_auto_parallel]: 5.318e-05 [insert-virtual-dataset]: 2.49001e-06 [parallel-infer-symbol-second]: 8.70001e-07 [dataset_repeat_opt]: 2.06998e-06 [pipeline_split]: 1.88002e-06 [optimize]: 0.00727942, [53] [py_interpret_to_execute]: 4.304e-05 [rewriter_before_opt_a]: 0.00013948 [opt_a]: 0.0043802, [2] [Cycle 1]: 0.00339814, [45] [expand_dump_flag]: 4.30999e-06 [switch_simplify]: 0.00013894 [loop_unroll]: 4.276e-05 [a_1]: 0.00102869 [with_stream_mark]: 2.102e-05 [recompute_prepare]: 1.213e-05 [updatestate_depend_eliminate]: 5.11997e-06 [updatestate_assign_eliminate]: 4.11001e-06 [updatestate_loads_eliminate]: 3.95e-06 [parameter_eliminate]: 1.95001e-06 [a_2]: 0.00013463 [accelerated_algorithm]: 9.14998e-06 [shard]: 2.26998e-06 [meta_shard_fg_expand]: 2.64999e-06 [shard_inline]: 8.04997e-06 [merge_send_recv]: 1.099e-05 [auto_parallel]: 8.94e-06 [parallel]: 1.912e-05 [flash_sp]: 9.49e-06 [merge_comm]: 5.14e-06 [allreduce_fusion]: 4.51002e-06 [matmul_add_comm_reduction]: 1.102e-05 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 1.039e-05 [virtual_dataset]: 8.13999e-06 [get_grad_eliminate_]: 7.63999e-06 [virtual_output]: 8.05e-06 [merge_forward]: 4.98001e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 1.223e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.974e-05 [merge_recompute_call_nodes]: 1.60999e-06 [before_grad]: 1.353e-05 [set_forward_comm_id_for_comm_node_pass]: 5.15999e-06 [meta_fg_expand]: 4.33001e-06 [flash_sp_send_recv_attached]: 2.94001e-06 [receive_attached]: 2.83e-06 [after_resolve]: 1.373e-05 [a_after_grad]: 1.299e-05 [renormalize]: 0.00121137 [add_forward_monad_depend]: 7.03e-06 [auto_monad_grad]: 3.01001e-06 [auto_monad_eliminator]: 2.242e-05 [cse]: 3.701e-05 [a_3]: 7.687e-05 [Cycle 2]: 0.00096566, [45] [expand_dump_flag]: 1.92999e-06 [switch_simplify]: 1.012e-05 [loop_unroll]: 8.37e-06 [a_1]: 0.00018864 [with_stream_mark]: 1.426e-05 [recompute_prepare]: 7.75998e-06 [updatestate_depend_eliminate]: 4.98001e-06 [updatestate_assign_eliminate]: 3.21999e-06 [updatestate_loads_eliminate]: 3.02002e-06 [parameter_eliminate]: 1.10001e-06 [a_2]: 0.0001222 [accelerated_algorithm]: 8.32e-06 [shard]: 1.39e-06 [meta_shard_fg_expand]: 2.34999e-06 [shard_inline]: 8.42e-06 [merge_send_recv]: 7.26999e-06 [auto_parallel]: 7.28999e-06 [parallel]: 6.19999e-06 [flash_sp]: 3.86999e-06 [merge_comm]: 4.25e-06 [allreduce_fusion]: 4.73001e-06 [matmul_add_comm_reduction]: 7.73999e-06 [allreduce_slice_to_reducescatter]: 5.00004e-07 [virtual_shard_identity]: 8.50999e-06 [virtual_dataset]: 7.31001e-06 [get_grad_eliminate_]: 7.26999e-06 [virtual_output]: 7.26999e-06 [merge_forward]: 3.86999e-06 [cell_reuse_recompute_pass]: 2.27999e-06 [offload_activation]: 8.80999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.737e-05 [merge_recompute_call_nodes]: 7.79983e-07 [before_grad]: 1.282e-05 [set_forward_comm_id_for_comm_node_pass]: 4.94e-06 [meta_fg_expand]: 3.34001e-06 [flash_sp_send_recv_attached]: 1.02998e-06 [receive_attached]: 1.30999e-06 [after_resolve]: 1.228e-05 [a_after_grad]: 1.172e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.24998e-06 [auto_monad_grad]: 1.69998e-06 [auto_monad_eliminator]: 9.94001e-06 [cse]: 2.043e-05 [a_3]: 5.939e-05 [py_interpret_to_execute_after_opt_a]: 1.853e-05 [slice_cell_reuse_recomputed_activation]: 4.64002e-06 [rewriter_after_opt_a]: 4.831e-05 [convert_after_rewriter]: 1.086e-05 [order_py_execute_after_rewriter]: 9.68997e-06 [mutable_eliminate]: 0.00070831 [opt_b]: 0.00033943, [1] [Cycle 1]: 0.00032774, [7] [b_1]: 0.0002154 [b_2]: 1.041e-05 [updatestate_depend_eliminate]: 7.01001e-06 [updatestate_assign_eliminate]: 3.44001e-06 [updatestate_loads_eliminate]: 2.91e-06 [renormalize]: 6.39993e-07 [cse]: 2.607e-05 [optimize_parallel_all_gather_comm]: 2.481e-05 [overlap_param_gather]: 5.61e-06 [cconv]: 3.326e-05 [loop_unroll]: 0.0005203 [opt_after_cconv]: 0.00015048, [1] [Cycle 1]: 0.00014064, [7] [c_1]: 4.121e-05 [parameter_eliminate]: 3.97e-06 [updatestate_depend_eliminate]: 6.59001e-06 [updatestate_assign_eliminate]: 3.29001e-06 [updatestate_loads_eliminate]: 2.89999e-06 [cse]: 2.287e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.934e-05 [tuple_transform]: 0.00014508, [1] [Cycle 1]: 0.00013735, [4] [d_1]: 9.012e-05 [none_parameter_eliminate]: 2.09999e-06 [renormalize]: 3.19997e-07 [switch_simplify]: 1.038e-05 [partial_unused_args_eliminate]: 4.92e-06 [add_recomputation]: 6.345e-05 [cse_after_recomputation]: 3.595e-05, [1] [Cycle 1]: 2.789e-05, [1] [cse]: 1.785e-05 [environ_conv]: 1.025e-05 [swap_dp_allreduce_reducescatter]: 8.94e-06 [bias_add_comm_swap]: 5.56e-06 [label_micro_interleaved_index]: 8.17e-06 [label_fine_grained_interleaved_index]: 5.47999e-06 [merge_cast_opt]: 4.17998e-06 [slice_recompute_activation]: 5.05999e-06 [micro_interleaved_order_control]: 5.25999e-06 [assign_add_opt]: 4.02e-06 [ForceFp32Comm]: 4.08001e-06 [remove_cast_before_assign_add]: 3.98001e-06 [full_micro_interleaved_order_control]: 5.27999e-06 [reorder_send_recv_between_fp_bp]: 5.43002e-06 [comm_op_add_attrs]: 3.75e-06 [add_comm_op_reuse_tag]: 3.91001e-06 [interleave_split_concat_branches]: 3.95e-06 [interleave_parallel_branches]: 3.86001e-06 [overlap_opt_shard_in_pipeline]: 4.38999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.49002e-06 [control_data_broadcast_order]: 1.786e-05 [grouped_pairwise_exchange_alltoall]: 4.05e-06 [offloading_packed_experts]: 7.43999e-06 [overlap_recompute_and_grad_model_parallel]: 7.98999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.93999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.92998e-06 [overlap_recompute_comm]: 5.80002e-06 [overlap_grad_ring_attention]: 7.53999e-06 [overlap_grad_flash_sp]: 2.612e-05 [begin_end_overlap_inline]: 3.46999e-06 [split_matmul_comm_elemetwise]: 4.85001e-06 [split_layernorm_comm]: 4.2e-06 [handle_group_info]: 3.58e-06 [symbol_engine_optimizer]: 0.00011366, [1] [Cycle 1]: 0.00010645, [6] [build]: 3.5e-06 [elim_shapecalc]: 1.273e-05 [elim_not_effective]: 1.769e-05 [opt_reshape]: 9.93998e-06 [fold_const_symbol]: 1.346e-05 [renormalize]: 1.8999e-07 [detach_backward]: 4.25e-06 [pipeline_parallel_scheduler]: 2.27999e-06 [auto_monad_reorder]: 2.552e-05 [get_jit_bprop_graph]: 1.99999e-06 [rewriter_after_jit_bprop_graph]: 4.98001e-06 [opt_after_jit_grad]: 0.00054671 [validate]: 4.49e-05 Sums bootstrap : 0.000453s : 2.03% type_inference : 0.014977s : 66.96% event_method : 0.000097s : 0.43% auto_monad : 0.000103s : 0.46% graph_reusing : 0.000007s : 0.03% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000035s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.04% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000053s : 0.24% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000043s : 0.19% optimize.rewriter_before_opt_a : 0.000139s : 0.62% optimize.opt_a.expand_dump_flag : 0.000006s : 0.03% optimize.opt_a.switch_simplify : 0.000149s : 0.67% optimize.opt_a.loop_unroll : 0.000051s : 0.23% optimize.opt_a.a_1 : 0.001217s : 5.44% optimize.opt_a.with_stream_mark : 0.000035s : 0.16% optimize.opt_a.recompute_prepare : 0.000020s : 0.09% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000257s : 1.15% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.08% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000016s : 0.07% optimize.opt_a.merge_send_recv : 0.000018s : 0.08% optimize.opt_a.auto_parallel : 0.000016s : 0.07% optimize.opt_a.parallel : 0.000025s : 0.11% optimize.opt_a.flash_sp : 0.000013s : 0.06% optimize.opt_a.merge_comm : 0.000009s : 0.04% optimize.opt_a.allreduce_fusion : 0.000009s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.08% optimize.opt_a.virtual_dataset : 0.000015s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.07% optimize.opt_a.virtual_output : 0.000015s : 0.07% optimize.opt_a.merge_forward : 0.000009s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.02% optimize.opt_a.offload_activation : 0.000021s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.17% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.12% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.05% optimize.opt_a.meta_fg_expand : 0.000008s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000026s : 0.12% optimize.opt_a.a_after_grad : 0.000025s : 0.11% optimize.opt_a.renormalize : 0.001211s : 5.42% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.04% optimize.opt_a.auto_monad_grad : 0.000005s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.14% optimize.opt_a.cse : 0.000057s : 0.26% optimize.opt_a.a_3 : 0.000136s : 0.61% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000048s : 0.22% optimize.convert_after_rewriter : 0.000011s : 0.05% optimize.order_py_execute_after_rewriter : 0.000010s : 0.04% optimize.mutable_eliminate : 0.000708s : 3.17% optimize.opt_b.b_1 : 0.000215s : 0.96% optimize.opt_b.b_2 : 0.000010s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000026s : 0.12% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.11% optimize.overlap_param_gather : 0.000006s : 0.03% optimize.cconv : 0.000033s : 0.15% optimize.loop_unroll : 0.000520s : 2.33% optimize.opt_after_cconv.c_1 : 0.000041s : 0.18% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000023s : 0.10% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.09% optimize.tuple_transform.d_1 : 0.000090s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.05% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000063s : 0.28% optimize.cse_after_recomputation.cse : 0.000018s : 0.08% optimize.environ_conv : 0.000010s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.04% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000008s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000004s : 0.02% optimize.remove_cast_before_assign_add : 0.000004s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.02% optimize.add_comm_op_reuse_tag : 0.000004s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.02% optimize.interleave_parallel_branches : 0.000004s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000018s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000007s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000006s : 0.03% optimize.overlap_grad_ring_attention : 0.000008s : 0.03% optimize.overlap_grad_flash_sp : 0.000026s : 0.12% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000004s : 0.02% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.08% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.06% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000026s : 0.11% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000547s : 2.44% validate : 0.000045s : 0.20% Time group info: ------[substitution.] 0.000289 44 8.71% : 0.000025s : 3: substitution.cast_eliminate 0.95% : 0.000003s : 3: substitution.elim_not_effective 0.65% : 0.000002s : 3: substitution.fold_const_symbol 2.93% : 0.000008s : 5: substitution.graph_param_transform 71.26% : 0.000206s : 8: substitution.inline 1.83% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.32% : 0.000007s : 6: substitution.remove_not_recompute_node 1.75% : 0.000005s : 4: substitution.replace_old_param 3.66% : 0.000011s : 2: substitution.switch_simplify 5.95% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.014908 2 89.64% : 0.013363s : 1: type_inference.infer 10.36% : 0.001545s : 1: type_inference.specialize ------[replace.] 0.000137 14 39.79% : 0.000055s : 8: replace.inline 40.62% : 0.000056s : 2: replace.switch_simplify 19.59% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000225 14 89.39% : 0.000201s : 8: match.inline 4.03% : 0.000009s : 2: match.switch_simplify 6.58% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000305 1838 1.03% : 0.000003s : 21: predicate.accumulaten_eliminater 0.67% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000002s : 10: predicate.addn_check_dump 1.05% : 0.000003s : 21: predicate.addn_zero_filter 0.95% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.22% : 0.000007s : 31: predicate.arithmetic_simplify 1.04% : 0.000003s : 21: predicate.cast_eliminate 0.52% : 0.000002s : 10: predicate.check_bprop_eliminate 0.49% : 0.000001s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.56% : 0.000002s : 10: predicate.depend_value_elim 1.11% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.11% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.95% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.78% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 5: predicate.elim_not_effective 0.40% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000004s : 26: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.18% : 0.000004s : 26: predicate.environ_get_depend_swap 1.70% : 0.000005s : 36: predicate.environ_get_eliminate 1.21% : 0.000004s : 26: predicate.environ_get_set_eliminate 1.59% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.49% : 0.000008s : 33: predicate.float_depend_g_call 0.45% : 0.000001s : 10: predicate.float_environ_get_switch 0.80% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.58% : 0.000002s : 10: predicate.get_grad_eliminate 0.17% : 0.000001s : 5: predicate.graph_param_transform 0.50% : 0.000002s : 10: predicate.incorporate_call 0.41% : 0.000001s : 10: predicate.incorporate_call_switch 6.33% : 0.000019s : 84: predicate.inline 0.64% : 0.000002s : 10: predicate.inline_without_move 0.27% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.74% : 0.000002s : 10: predicate.less_batch_normalization 1.80% : 0.000005s : 35: predicate.list_to_tuple_eliminator_ 2.54% : 0.000008s : 56: predicate.load_eliminater 0.85% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.37% : 0.000007s : 49: predicate.loop_unroll_before_grad 1.63% : 0.000005s : 31: predicate.make_slice_get_slice_eliminator 0.52% : 0.000002s : 10: predicate.merge_addn 0.53% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.49% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.94% : 0.000003s : 21: predicate.minmaximum_grad 0.81% : 0.000002s : 5: predicate.mutable_eliminate 0.35% : 0.000001s : 5: predicate.opt_reshape 0.28% : 0.000001s : 5: predicate.parallel_virtual_node 2.14% : 0.000007s : 33: predicate.partial_defer_inline 1.64% : 0.000005s : 30: predicate.partial_eliminate 1.07% : 0.000003s : 21: predicate.print_const_string_wrapper 0.78% : 0.000002s : 10: predicate.reduce_all_const_elim 1.46% : 0.000004s : 21: predicate.reduce_eliminate 2.65% : 0.000008s : 56: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 10: predicate.remove_not_recompute_node 1.32% : 0.000004s : 35: predicate.replace_applicator 0.43% : 0.000001s : 10: predicate.replace_old_param 0.24% : 0.000001s : 5: predicate.reset_defer_inline 1.06% : 0.000003s : 21: predicate.reshape_eliminate 0.50% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.60% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.61% : 0.000002s : 10: predicate.shard_identity_eliminate 0.65% : 0.000002s : 10: predicate.special_op_eliminate 0.64% : 0.000002s : 10: predicate.specialize_transform 0.75% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.76% : 0.000005s : 33: predicate.switch_defer_inline 2.23% : 0.000007s : 43: predicate.switch_layer_defer_inline 5.35% : 0.000016s : 101: predicate.switch_simplify 1.02% : 0.000003s : 21: predicate.tile_eliminate 1.06% : 0.000003s : 21: predicate.transpose_eliminate 1.49% : 0.000005s : 31: predicate.tuple_list_convert_item_index_to_positive 1.81% : 0.000006s : 31: predicate.tuple_list_get_item_const_eliminator 1.59% : 0.000005s : 31: predicate.tuple_list_get_item_depend_reorder 3.15% : 0.000010s : 45: predicate.tuple_list_get_item_eliminator 1.61% : 0.000005s : 31: predicate.tuple_list_get_set_item_eliminator 2.45% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.63% : 0.000005s : 35: predicate.tuple_to_list_eliminator_ 2.47% : 0.000008s : 56: predicate.updatestate_pure_node_eliminater 3.14% : 0.000010s : 66: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 5: predicate.value_based_eliminate 0.57% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.49% : 0.000001s : 10: predicate.virtual_output_eliminate 0.23% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001100 16 53.48% : 0.000588s : 6: func_graph_cloner_run.FuncGraphClonerGraph 46.52% : 0.000511s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.042803 192 0.02% : 0.000007s : 1: ForceFp32Comm 9.24% : 0.003955s : 1: add_attr 9.19% : 0.003936s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.16% : 0.000067s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.27% : 0.000115s : 1: auto_monad 0.08% : 0.000033s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000009s : 1: bias_add_comm_swap 1.16% : 0.000497s : 1: bootstrap 0.08% : 0.000036s : 1: cconv 0.02% : 0.000008s : 1: comm_op_add_attrs 0.05% : 0.000021s : 1: control_data_broadcast_order 0.03% : 0.000014s : 1: convert_after_rewriter 0.09% : 0.000039s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000024s : 1: detach_backward 0.03% : 0.000013s : 1: environ_conv 0.26% : 0.000113s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.02% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000015s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.02% : 0.000009s : 1: inline 0.02% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000011s : 1: label_micro_interleaved_index 1.23% : 0.000527s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 1.74% : 0.000743s : 1: mutable_eliminate 0.03% : 0.000011s : 1: offloading_packed_experts 0.04% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000019s : 1: opt.transform.mutable_eliminate 4.35% : 0.001862s : 78: opt.transform.opt_a 0.09% : 0.000039s : 1: opt.transform.opt_after_cconv 0.08% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.34% : 0.000146s : 28: opt.transform.opt_b 0.23% : 0.000098s : 2: opt.transform.opt_trans_graph 0.12% : 0.000050s : 4: opt.transform.symbol_engine_opt 10.24% : 0.004384s : 1: opt_a 0.36% : 0.000154s : 1: opt_after_cconv 1.30% : 0.000558s : 1: opt_after_jit_grad 0.80% : 0.000343s : 1: opt_b 18.00% : 0.007705s : 1: optimize 0.07% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000013s : 1: order_py_execute_after_rewriter 0.07% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000011s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000009s : 1: overlap_recompute_comm 0.03% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000061s : 1: pre_auto_parallel 0.11% : 0.000047s : 1: py_interpret_to_execute 0.05% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.05% : 0.000023s : 1: remove_dup_value 1.47% : 0.000629s : 1: renormalize.infer 1.34% : 0.000572s : 1: renormalize.specialize 0.02% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.12% : 0.000052s : 1: rewriter_after_opt_a 0.33% : 0.000143s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.03% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.27% : 0.000117s : 1: symbol_engine_optimizer 0.35% : 0.000148s : 1: tuple_transform 35.10% : 0.015022s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:02.992.091 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0251757, [21] [bootstrap]: 0.00045768 [type_inference]: 0.0138241 [event_method]: 8.454e-05 [auto_monad]: 0.00010156 [graph_reusing]: 7.38e-06 [inline]: 2.41e-06 [add_attr]: 0.00336937, [1] [add_attr_with_inline]: 0.00335833, [1] [Cycle 1]: 7.863e-05, [2] [tag_attr]: 3.066e-05 [meta_addattr_fg_expand]: 8.16002e-06 [parallel-infer-symbol]: 3.15998e-06 [pre_auto_parallel]: 4.555e-05 [insert-virtual-dataset]: 2.48998e-06 [parallel-infer-symbol-second]: 1.12e-06 [dataset_repeat_opt]: 1.92001e-06 [pipeline_split]: 1.80001e-06 [optimize]: 0.00651319, [53] [py_interpret_to_execute]: 3.307e-05 [rewriter_before_opt_a]: 0.00011995 [opt_a]: 0.00392925, [2] [Cycle 1]: 0.00309984, [45] [expand_dump_flag]: 4.2e-06 [switch_simplify]: 0.0001357 [loop_unroll]: 4.239e-05 [a_1]: 0.00103662 [with_stream_mark]: 2.172e-05 [recompute_prepare]: 1.459e-05 [updatestate_depend_eliminate]: 5.62999e-06 [updatestate_assign_eliminate]: 3.8e-06 [updatestate_loads_eliminate]: 3.95998e-06 [parameter_eliminate]: 2.11998e-06 [a_2]: 0.00010802 [accelerated_algorithm]: 9.39e-06 [shard]: 2.36e-06 [meta_shard_fg_expand]: 2.36e-06 [shard_inline]: 8.03999e-06 [merge_send_recv]: 1.021e-05 [auto_parallel]: 7.97e-06 [parallel]: 2.012e-05 [flash_sp]: 9.51e-06 [merge_comm]: 5.25999e-06 [allreduce_fusion]: 4.08999e-06 [matmul_add_comm_reduction]: 1.126e-05 [allreduce_slice_to_reducescatter]: 8.09989e-07 [virtual_shard_identity]: 1.086e-05 [virtual_dataset]: 8.18999e-06 [get_grad_eliminate_]: 7.91001e-06 [virtual_output]: 7.97e-06 [merge_forward]: 5.00001e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 1.156e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.858e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.39e-05 [set_forward_comm_id_for_comm_node_pass]: 6.02999e-06 [meta_fg_expand]: 3.58e-06 [flash_sp_send_recv_attached]: 3.18e-06 [receive_attached]: 2.07999e-06 [after_resolve]: 1.598e-05 [a_after_grad]: 1.296e-05 [renormalize]: 0.00107438 [add_forward_monad_depend]: 7.75998e-06 [auto_monad_grad]: 2.41998e-06 [auto_monad_eliminator]: 2.143e-05 [cse]: 3.768e-05 [a_3]: 6.395e-05 [Cycle 2]: 0.00081729, [45] [expand_dump_flag]: 1.50999e-06 [switch_simplify]: 9.89999e-06 [loop_unroll]: 7.82998e-06 [a_1]: 0.00019182 [with_stream_mark]: 1.545e-05 [recompute_prepare]: 8.65001e-06 [updatestate_depend_eliminate]: 4.67e-06 [updatestate_assign_eliminate]: 2.91999e-06 [updatestate_loads_eliminate]: 2.76e-06 [parameter_eliminate]: 2.12001e-06 [a_2]: 9.419e-05 [accelerated_algorithm]: 9.27001e-06 [shard]: 1.94e-06 [meta_shard_fg_expand]: 1.86e-06 [shard_inline]: 8.02e-06 [merge_send_recv]: 7.63999e-06 [auto_parallel]: 7.18e-06 [parallel]: 6.56e-06 [flash_sp]: 3.43e-06 [merge_comm]: 4.36002e-06 [allreduce_fusion]: 4.22e-06 [matmul_add_comm_reduction]: 8.69003e-06 [allreduce_slice_to_reducescatter]: 5.79981e-07 [virtual_shard_identity]: 1.027e-05 [virtual_dataset]: 7.46001e-06 [get_grad_eliminate_]: 7.15e-06 [virtual_output]: 6.84999e-06 [merge_forward]: 3.88001e-06 [cell_reuse_recompute_pass]: 1.50999e-06 [offload_activation]: 9.25999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.619e-05 [merge_recompute_call_nodes]: 1.52999e-06 [before_grad]: 1.209e-05 [set_forward_comm_id_for_comm_node_pass]: 4.63001e-06 [meta_fg_expand]: 3.16001e-06 [flash_sp_send_recv_attached]: 8.50006e-07 [receive_attached]: 1.38002e-06 [after_resolve]: 1.155e-05 [a_after_grad]: 1.112e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 2.52001e-06 [auto_monad_grad]: 1.57999e-06 [auto_monad_eliminator]: 1.065e-05 [cse]: 2.204e-05 [a_3]: 4.719e-05 [py_interpret_to_execute_after_opt_a]: 1.399e-05 [slice_cell_reuse_recomputed_activation]: 2.12999e-06 [rewriter_after_opt_a]: 4.902e-05 [convert_after_rewriter]: 8.16002e-06 [order_py_execute_after_rewriter]: 6.27001e-06 [mutable_eliminate]: 0.00061873 [opt_b]: 0.00042584, [1] [Cycle 1]: 0.00041777, [7] [b_1]: 0.00030721 [b_2]: 1.39e-05 [updatestate_depend_eliminate]: 8.90001e-06 [updatestate_assign_eliminate]: 3.32002e-06 [updatestate_loads_eliminate]: 3.86999e-06 [renormalize]: 5.29981e-07 [cse]: 2.966e-05 [optimize_parallel_all_gather_comm]: 2.067e-05 [overlap_param_gather]: 1.98002e-06 [cconv]: 2.837e-05 [loop_unroll]: 0.00047515 [opt_after_cconv]: 0.00012435, [1] [Cycle 1]: 0.00011749, [7] [c_1]: 3.869e-05 [parameter_eliminate]: 3.38e-06 [updatestate_depend_eliminate]: 7.33e-06 [updatestate_assign_eliminate]: 3.29001e-06 [updatestate_loads_eliminate]: 3.01001e-06 [cse]: 2.612e-05 [renormalize]: 5.8001e-07 [remove_dup_value]: 1.504e-05 [tuple_transform]: 0.0001068, [1] [Cycle 1]: 0.00010132, [4] [d_1]: 5.461e-05 [none_parameter_eliminate]: 1.70001e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 9.80002e-06 [partial_unused_args_eliminate]: 2.14999e-06 [add_recomputation]: 6.55e-05 [cse_after_recomputation]: 2.83e-05, [1] [Cycle 1]: 2.367e-05, [1] [cse]: 1.63e-05 [environ_conv]: 6.46999e-06 [swap_dp_allreduce_reducescatter]: 6.21e-06 [bias_add_comm_swap]: 3.43999e-06 [label_micro_interleaved_index]: 4.55001e-06 [label_fine_grained_interleaved_index]: 3.03003e-06 [merge_cast_opt]: 1.24e-06 [slice_recompute_activation]: 2.27999e-06 [micro_interleaved_order_control]: 2.29001e-06 [assign_add_opt]: 1.29e-06 [ForceFp32Comm]: 1.02e-06 [remove_cast_before_assign_add]: 1.07e-06 [full_micro_interleaved_order_control]: 2.39999e-06 [reorder_send_recv_between_fp_bp]: 2.36e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 1.01997e-06 [interleave_split_concat_branches]: 1.39e-06 [interleave_parallel_branches]: 1.04998e-06 [overlap_opt_shard_in_pipeline]: 1.07998e-06 [overlap_opt_shard_grad_in_pipeline]: 1.97001e-06 [control_data_broadcast_order]: 1.619e-05 [grouped_pairwise_exchange_alltoall]: 1.92999e-06 [offloading_packed_experts]: 4.29002e-06 [overlap_recompute_and_grad_model_parallel]: 5.02e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.16997e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30999e-06 [overlap_recompute_comm]: 2.26998e-06 [overlap_grad_ring_attention]: 5.12999e-06 [overlap_grad_flash_sp]: 2.39e-05 [begin_end_overlap_inline]: 5.60016e-07 [split_matmul_comm_elemetwise]: 2.37001e-06 [split_layernorm_comm]: 1.89999e-06 [handle_group_info]: 1.29e-06 [symbol_engine_optimizer]: 9.262e-05, [1] [Cycle 1]: 8.702e-05, [6] [build]: 4.15e-06 [elim_shapecalc]: 1.467e-05 [elim_not_effective]: 1.585e-05 [opt_reshape]: 8.43999e-06 [fold_const_symbol]: 1.257e-05 [renormalize]: 4.09986e-07 [detach_backward]: 2.61e-06 [pipeline_parallel_scheduler]: 1.87999e-06 [auto_monad_reorder]: 2.286e-05 [get_jit_bprop_graph]: 1.92999e-06 [rewriter_after_jit_bprop_graph]: 4.25e-06 [opt_after_jit_grad]: 0.00050791 [validate]: 4.998e-05 Sums bootstrap : 0.000458s : 2.21% type_inference : 0.013824s : 66.68% event_method : 0.000085s : 0.41% auto_monad : 0.000102s : 0.49% graph_reusing : 0.000007s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000031s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.04% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000046s : 0.22% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.16% optimize.rewriter_before_opt_a : 0.000120s : 0.58% optimize.opt_a.expand_dump_flag : 0.000006s : 0.03% optimize.opt_a.switch_simplify : 0.000146s : 0.70% optimize.opt_a.loop_unroll : 0.000050s : 0.24% optimize.opt_a.a_1 : 0.001228s : 5.93% optimize.opt_a.with_stream_mark : 0.000037s : 0.18% optimize.opt_a.recompute_prepare : 0.000023s : 0.11% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000202s : 0.98% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.09% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000016s : 0.08% optimize.opt_a.merge_send_recv : 0.000018s : 0.09% optimize.opt_a.auto_parallel : 0.000015s : 0.07% optimize.opt_a.parallel : 0.000027s : 0.13% optimize.opt_a.flash_sp : 0.000013s : 0.06% optimize.opt_a.merge_comm : 0.000010s : 0.05% optimize.opt_a.allreduce_fusion : 0.000008s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.10% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.10% optimize.opt_a.virtual_dataset : 0.000016s : 0.08% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.07% optimize.opt_a.virtual_output : 0.000015s : 0.07% optimize.opt_a.merge_forward : 0.000009s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.10% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.17% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.13% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.05% optimize.opt_a.meta_fg_expand : 0.000007s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.02% optimize.opt_a.after_resolve : 0.000028s : 0.13% optimize.opt_a.a_after_grad : 0.000024s : 0.12% optimize.opt_a.renormalize : 0.001074s : 5.18% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.05% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.15% optimize.opt_a.cse : 0.000060s : 0.29% optimize.opt_a.a_3 : 0.000111s : 0.54% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000049s : 0.24% optimize.convert_after_rewriter : 0.000008s : 0.04% optimize.order_py_execute_after_rewriter : 0.000006s : 0.03% optimize.mutable_eliminate : 0.000619s : 2.98% optimize.opt_b.b_1 : 0.000307s : 1.48% optimize.opt_b.b_2 : 0.000014s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.14% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.10% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000028s : 0.14% optimize.loop_unroll : 0.000475s : 2.29% optimize.opt_after_cconv.c_1 : 0.000039s : 0.19% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000026s : 0.13% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.07% optimize.tuple_transform.d_1 : 0.000055s : 0.26% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.05% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000066s : 0.32% optimize.cse_after_recomputation.cse : 0.000016s : 0.08% optimize.environ_conv : 0.000006s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.03% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000024s : 0.12% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.07% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.08% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.06% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000023s : 0.11% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000508s : 2.45% validate : 0.000050s : 0.24% Time group info: ------[substitution.] 0.000283 44 9.51% : 0.000027s : 3: substitution.cast_eliminate 0.77% : 0.000002s : 3: substitution.elim_not_effective 0.60% : 0.000002s : 3: substitution.fold_const_symbol 2.38% : 0.000007s : 5: substitution.graph_param_transform 71.50% : 0.000202s : 8: substitution.inline 1.76% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.13% : 0.000006s : 6: substitution.remove_not_recompute_node 1.69% : 0.000005s : 4: substitution.replace_old_param 3.60% : 0.000010s : 2: substitution.switch_simplify 6.03% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.013746 2 89.81% : 0.012345s : 1: type_inference.infer 10.19% : 0.001401s : 1: type_inference.specialize ------[replace.] 0.000142 14 44.40% : 0.000063s : 8: replace.inline 35.26% : 0.000050s : 2: replace.switch_simplify 20.34% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000221 14 89.18% : 0.000197s : 8: match.inline 4.08% : 0.000009s : 2: match.switch_simplify 6.74% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000299 1838 0.96% : 0.000003s : 21: predicate.accumulaten_eliminater 0.73% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.44% : 0.000001s : 10: predicate.addn_check_dump 0.97% : 0.000003s : 21: predicate.addn_zero_filter 0.90% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.12% : 0.000006s : 31: predicate.arithmetic_simplify 1.10% : 0.000003s : 21: predicate.cast_eliminate 0.50% : 0.000001s : 10: predicate.check_bprop_eliminate 0.47% : 0.000001s : 10: predicate.compare_switch_simplify 0.27% : 0.000001s : 5: predicate.const_output_eliminate 0.49% : 0.000001s : 10: predicate.depend_value_elim 1.06% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.08% : 0.000003s : 21: predicate.dict_get_item_eliminator 1.00% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.86% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.18% : 0.000001s : 5: predicate.elim_not_effective 0.31% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000004s : 26: predicate.environ_add_const_eliminate 1.22% : 0.000004s : 26: predicate.environ_get_add_eliminate 1.23% : 0.000004s : 26: predicate.environ_get_depend_swap 1.70% : 0.000005s : 36: predicate.environ_get_eliminate 1.17% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.57% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.45% : 0.000007s : 33: predicate.float_depend_g_call 0.44% : 0.000001s : 10: predicate.float_environ_get_switch 0.71% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.61% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.51% : 0.000002s : 10: predicate.incorporate_call 0.43% : 0.000001s : 10: predicate.incorporate_call_switch 6.40% : 0.000019s : 84: predicate.inline 0.79% : 0.000002s : 10: predicate.inline_without_move 0.28% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 10: predicate.less_batch_normalization 1.75% : 0.000005s : 35: predicate.list_to_tuple_eliminator_ 3.13% : 0.000009s : 56: predicate.load_eliminater 0.76% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.37% : 0.000007s : 49: predicate.loop_unroll_before_grad 1.59% : 0.000005s : 31: predicate.make_slice_get_slice_eliminator 0.47% : 0.000001s : 10: predicate.merge_addn 0.49% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.46% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.90% : 0.000003s : 21: predicate.minmaximum_grad 0.90% : 0.000003s : 5: predicate.mutable_eliminate 0.28% : 0.000001s : 5: predicate.opt_reshape 0.31% : 0.000001s : 5: predicate.parallel_virtual_node 1.97% : 0.000006s : 33: predicate.partial_defer_inline 1.67% : 0.000005s : 30: predicate.partial_eliminate 1.02% : 0.000003s : 21: predicate.print_const_string_wrapper 0.51% : 0.000002s : 10: predicate.reduce_all_const_elim 1.34% : 0.000004s : 21: predicate.reduce_eliminate 2.60% : 0.000008s : 56: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 10: predicate.remove_not_recompute_node 1.29% : 0.000004s : 35: predicate.replace_applicator 0.50% : 0.000001s : 10: predicate.replace_old_param 0.26% : 0.000001s : 5: predicate.reset_defer_inline 1.10% : 0.000003s : 21: predicate.reshape_eliminate 0.50% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.54% : 0.000002s : 5: predicate.row_tensor_eliminate 0.68% : 0.000002s : 10: predicate.same_eliminate 0.43% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 10: predicate.shard_identity_eliminate 0.59% : 0.000002s : 10: predicate.special_op_eliminate 0.72% : 0.000002s : 10: predicate.specialize_transform 1.06% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.59% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.72% : 0.000005s : 33: predicate.switch_defer_inline 2.10% : 0.000006s : 43: predicate.switch_layer_defer_inline 5.59% : 0.000017s : 101: predicate.switch_simplify 0.97% : 0.000003s : 21: predicate.tile_eliminate 1.14% : 0.000003s : 21: predicate.transpose_eliminate 1.58% : 0.000005s : 31: predicate.tuple_list_convert_item_index_to_positive 1.69% : 0.000005s : 31: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000005s : 31: predicate.tuple_list_get_item_depend_reorder 3.36% : 0.000010s : 45: predicate.tuple_list_get_item_eliminator 1.49% : 0.000004s : 31: predicate.tuple_list_get_set_item_eliminator 2.25% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.64% : 0.000005s : 35: predicate.tuple_to_list_eliminator_ 2.50% : 0.000007s : 56: predicate.updatestate_pure_node_eliminater 3.02% : 0.000009s : 66: predicate.updatestate_useless_node_eliminater 0.29% : 0.000001s : 5: predicate.value_based_eliminate 0.49% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.51% : 0.000002s : 10: predicate.virtual_output_eliminate 0.24% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001021 16 56.15% : 0.000573s : 6: func_graph_cloner_run.FuncGraphClonerGraph 43.85% : 0.000448s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.038351 192 0.01% : 0.000004s : 1: ForceFp32Comm 8.80% : 0.003374s : 1: add_attr 8.77% : 0.003362s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.18% : 0.000071s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.28% : 0.000109s : 1: auto_monad 0.07% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.27% : 0.000486s : 1: bootstrap 0.08% : 0.000032s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.05% : 0.000020s : 1: control_data_broadcast_order 0.03% : 0.000013s : 1: convert_after_rewriter 0.08% : 0.000031s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000010s : 1: environ_conv 0.25% : 0.000095s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000011s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 1.26% : 0.000485s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.64% : 0.000629s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000021s : 1: opt.transform.mutable_eliminate 4.87% : 0.001869s : 78: opt.transform.opt_a 0.10% : 0.000037s : 1: opt.transform.opt_after_cconv 0.09% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.73% : 0.000281s : 28: opt.transform.opt_b 0.16% : 0.000061s : 2: opt.transform.opt_trans_graph 0.12% : 0.000047s : 4: opt.transform.symbol_engine_opt 10.25% : 0.003932s : 1: opt_a 0.33% : 0.000128s : 1: opt_after_cconv 1.35% : 0.000518s : 1: opt_after_jit_grad 1.12% : 0.000430s : 1: opt_b 17.00% : 0.006520s : 1: optimize 0.07% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.08% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000006s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.13% : 0.000050s : 1: pre_auto_parallel 0.10% : 0.000038s : 1: py_interpret_to_execute 0.05% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000019s : 1: remove_dup_value 1.49% : 0.000570s : 1: renormalize.infer 1.29% : 0.000493s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000054s : 1: rewriter_after_opt_a 0.33% : 0.000125s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.25% : 0.000095s : 1: symbol_engine_optimizer 0.29% : 0.000110s : 1: tuple_transform 36.10% : 0.013845s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:03.233.891 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:03.234.177 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0434241, [21] [bootstrap]: 0.00044476 [type_inference]: 0.0149558 [event_method]: 5.139e-05 [auto_monad]: 0.00010331 [graph_reusing]: 8.42998e-06 [inline]: 2.64001e-06 [add_attr]: 0.00407132, [1] [add_attr_with_inline]: 0.00405841, [1] [Cycle 1]: 0.00011881, [2] [tag_attr]: 3.711e-05 [meta_addattr_fg_expand]: 8.29998e-06 [parallel-infer-symbol]: 3.77998e-06 [pre_auto_parallel]: 5.461e-05 [insert-virtual-dataset]: 2.46e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.04e-06 [pipeline_split]: 1.61998e-06 [optimize]: 0.0225161, [53] [py_interpret_to_execute]: 4.392e-05 [rewriter_before_opt_a]: 0.00013528 [opt_a]: 0.0199593, [2] [Cycle 1]: 0.0190899, [45] [expand_dump_flag]: 4.94e-06 [switch_simplify]: 0.00014048 [loop_unroll]: 4.176e-05 [a_1]: 0.00091265 [with_stream_mark]: 2.198e-05 [recompute_prepare]: 1.549e-05 [updatestate_depend_eliminate]: 4.47e-06 [updatestate_assign_eliminate]: 3.39001e-06 [updatestate_loads_eliminate]: 2.69999e-06 [parameter_eliminate]: 2.34001e-06 [a_2]: 0.00011665 [accelerated_algorithm]: 8.22e-06 [shard]: 2.68e-06 [meta_shard_fg_expand]: 2.49001e-06 [shard_inline]: 7.71999e-06 [merge_send_recv]: 1.009e-05 [auto_parallel]: 8.43001e-06 [parallel]: 3.893e-05 [flash_sp]: 1.021e-05 [merge_comm]: 4.77e-06 [allreduce_fusion]: 3.74002e-06 [matmul_add_comm_reduction]: 1.791e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 2.628e-05 [virtual_dataset]: 8.74e-06 [get_grad_eliminate_]: 7.67002e-06 [virtual_output]: 7.23999e-06 [merge_forward]: 6.48003e-06 [cell_reuse_recompute_pass]: 3.7e-06 [offload_activation]: 1.101e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.381e-05 [merge_recompute_call_nodes]: 2.21e-06 [before_grad]: 1.164e-05 [set_forward_comm_id_for_comm_node_pass]: 8.27998e-06 [meta_fg_expand]: 5.89999e-06 [flash_sp_send_recv_attached]: 6.76e-06 [receive_attached]: 2.58998e-06 [after_resolve]: 1.634e-05 [a_after_grad]: 1.139e-05 [renormalize]: 0.00099816 [add_forward_monad_depend]: 6.93e-06 [auto_monad_grad]: 2.48002e-06 [auto_monad_eliminator]: 1.636e-05 [cse]: 2.74e-05 [a_3]: 6.396e-05 [Cycle 2]: 0.00085159, [45] [expand_dump_flag]: 1.93002e-06 [switch_simplify]: 8.65001e-06 [loop_unroll]: 6.88e-06 [a_1]: 0.0001429 [with_stream_mark]: 1.5e-05 [recompute_prepare]: 6.71e-06 [updatestate_depend_eliminate]: 3.33e-06 [updatestate_assign_eliminate]: 3.31999e-06 [updatestate_loads_eliminate]: 3.26999e-06 [parameter_eliminate]: 1.77001e-06 [a_2]: 0.00010384 [accelerated_algorithm]: 7.31999e-06 [shard]: 2.19999e-06 [meta_shard_fg_expand]: 1.87001e-06 [shard_inline]: 6.46e-06 [merge_send_recv]: 5.71e-06 [auto_parallel]: 8.52e-06 [parallel]: 8.74e-06 [flash_sp]: 4.06001e-06 [merge_comm]: 3.35e-06 [allreduce_fusion]: 3.41001e-06 [matmul_add_comm_reduction]: 7.26999e-06 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 7.4e-06 [virtual_dataset]: 6.44999e-06 [get_grad_eliminate_]: 6.10002e-06 [virtual_output]: 5.81e-06 [merge_forward]: 2.59999e-06 [cell_reuse_recompute_pass]: 1.77001e-06 [offload_activation]: 6.54001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.469e-05 [merge_recompute_call_nodes]: 1.12e-06 [before_grad]: 9.74999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.59002e-06 [meta_fg_expand]: 2.16e-06 [flash_sp_send_recv_attached]: 8.50006e-07 [receive_attached]: 9.70002e-07 [after_resolve]: 1.083e-05 [a_after_grad]: 9.44998e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.77999e-06 [auto_monad_grad]: 1.19998e-06 [auto_monad_eliminator]: 7.28e-06 [cse]: 1.322e-05 [a_3]: 5.107e-05 [py_interpret_to_execute_after_opt_a]: 1.378e-05 [slice_cell_reuse_recomputed_activation]: 5.32001e-06 [rewriter_after_opt_a]: 3.842e-05 [convert_after_rewriter]: 9.76e-06 [order_py_execute_after_rewriter]: 8.2e-06 [mutable_eliminate]: 0.00068565 [opt_b]: 0.0002741, [1] [Cycle 1]: 0.00026439, [7] [b_1]: 0.00017249 [b_2]: 8.02e-06 [updatestate_depend_eliminate]: 5.39e-06 [updatestate_assign_eliminate]: 2.47001e-06 [updatestate_loads_eliminate]: 2.32001e-06 [renormalize]: 5.89993e-07 [cse]: 1.759e-05 [optimize_parallel_all_gather_comm]: 1.795e-05 [overlap_param_gather]: 4.91002e-06 [cconv]: 2.963e-05 [loop_unroll]: 0.00044159 [opt_after_cconv]: 0.0001474, [1] [Cycle 1]: 0.0001383, [7] [c_1]: 3.266e-05 [parameter_eliminate]: 2.63998e-06 [updatestate_depend_eliminate]: 5.19e-06 [updatestate_assign_eliminate]: 2.48998e-06 [updatestate_loads_eliminate]: 2.66e-06 [cse]: 1.689e-05 [renormalize]: 2.09984e-07 [remove_dup_value]: 1.679e-05 [tuple_transform]: 9.58e-05, [1] [Cycle 1]: 8.785e-05, [4] [d_1]: 4.825e-05 [none_parameter_eliminate]: 1.94e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 7.46001e-06 [partial_unused_args_eliminate]: 4.3e-06 [add_recomputation]: 6.05e-05 [cse_after_recomputation]: 2.956e-05, [1] [Cycle 1]: 2.21e-05, [1] [cse]: 1.271e-05 [environ_conv]: 8.50001e-06 [swap_dp_allreduce_reducescatter]: 8.08999e-06 [bias_add_comm_swap]: 5.53002e-06 [label_micro_interleaved_index]: 6.43e-06 [label_fine_grained_interleaved_index]: 5.10001e-06 [merge_cast_opt]: 3.97e-06 [slice_recompute_activation]: 4.47e-06 [micro_interleaved_order_control]: 5.12e-06 [assign_add_opt]: 3.7e-06 [ForceFp32Comm]: 3.6e-06 [remove_cast_before_assign_add]: 3.63e-06 [full_micro_interleaved_order_control]: 5.25999e-06 [reorder_send_recv_between_fp_bp]: 5.49e-06 [comm_op_add_attrs]: 3.38e-06 [add_comm_op_reuse_tag]: 3.35e-06 [interleave_split_concat_branches]: 3.53e-06 [interleave_parallel_branches]: 3.4e-06 [overlap_opt_shard_in_pipeline]: 3.48999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.82e-06 [control_data_broadcast_order]: 1.51e-05 [grouped_pairwise_exchange_alltoall]: 4.07998e-06 [offloading_packed_experts]: 6.33e-06 [overlap_recompute_and_grad_model_parallel]: 6.83e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.55e-06 [overlap_recompute_allgather_and_fa_grad]: 3.67002e-06 [overlap_recompute_comm]: 4.79e-06 [overlap_grad_ring_attention]: 6.86999e-06 [overlap_grad_flash_sp]: 2.334e-05 [begin_end_overlap_inline]: 3.03998e-06 [split_matmul_comm_elemetwise]: 4.89998e-06 [split_layernorm_comm]: 4.06001e-06 [handle_group_info]: 3.27002e-06 [symbol_engine_optimizer]: 9.848e-05, [1] [Cycle 1]: 9.146e-05, [6] [build]: 2.73e-06 [elim_shapecalc]: 1.015e-05 [elim_not_effective]: 1.352e-05 [opt_reshape]: 7.45e-06 [fold_const_symbol]: 1.055e-05 [renormalize]: 2.60014e-07 [detach_backward]: 3.79002e-06 [pipeline_parallel_scheduler]: 1.84e-06 [auto_monad_reorder]: 1.928e-05 [get_jit_bprop_graph]: 1.78997e-06 [rewriter_after_jit_bprop_graph]: 4.94998e-06 [opt_after_jit_grad]: 0.00050379 [validate]: 3.445e-05 Sums bootstrap : 0.000445s : 2.07% type_inference : 0.014956s : 69.53% event_method : 0.000051s : 0.24% auto_monad : 0.000103s : 0.48% graph_reusing : 0.000008s : 0.04% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000037s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.04% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000055s : 0.25% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000044s : 0.20% optimize.rewriter_before_opt_a : 0.000135s : 0.63% optimize.opt_a.expand_dump_flag : 0.000007s : 0.03% optimize.opt_a.switch_simplify : 0.000149s : 0.69% optimize.opt_a.loop_unroll : 0.000049s : 0.23% optimize.opt_a.a_1 : 0.001056s : 4.91% optimize.opt_a.with_stream_mark : 0.000037s : 0.17% optimize.opt_a.recompute_prepare : 0.000022s : 0.10% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000220s : 1.03% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.07% optimize.opt_a.shard : 0.000005s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000014s : 0.07% optimize.opt_a.merge_send_recv : 0.000016s : 0.07% optimize.opt_a.auto_parallel : 0.000017s : 0.08% optimize.opt_a.parallel : 0.000048s : 0.22% optimize.opt_a.flash_sp : 0.000014s : 0.07% optimize.opt_a.merge_comm : 0.000008s : 0.04% optimize.opt_a.allreduce_fusion : 0.000007s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.12% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000034s : 0.16% optimize.opt_a.virtual_dataset : 0.000015s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.06% optimize.opt_a.virtual_output : 0.000013s : 0.06% optimize.opt_a.merge_forward : 0.000009s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.18% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.06% optimize.opt_a.meta_fg_expand : 0.000008s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000008s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000027s : 0.13% optimize.opt_a.a_after_grad : 0.000021s : 0.10% optimize.opt_a.renormalize : 0.000998s : 4.64% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.11% optimize.opt_a.cse : 0.000041s : 0.19% optimize.opt_a.a_3 : 0.000115s : 0.53% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000038s : 0.18% optimize.convert_after_rewriter : 0.000010s : 0.05% optimize.order_py_execute_after_rewriter : 0.000008s : 0.04% optimize.mutable_eliminate : 0.000686s : 3.19% optimize.opt_b.b_1 : 0.000172s : 0.80% optimize.opt_b.b_2 : 0.000008s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000018s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.08% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000030s : 0.14% optimize.loop_unroll : 0.000442s : 2.05% optimize.opt_after_cconv.c_1 : 0.000033s : 0.15% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000017s : 0.08% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.08% optimize.tuple_transform.d_1 : 0.000048s : 0.22% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000004s : 0.02% optimize.add_recomputation : 0.000060s : 0.28% optimize.cse_after_recomputation.cse : 0.000013s : 0.06% optimize.environ_conv : 0.000009s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.04% optimize.bias_add_comm_swap : 0.000006s : 0.03% optimize.label_micro_interleaved_index : 0.000006s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000004s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000004s : 0.02% optimize.remove_cast_before_assign_add : 0.000004s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.03% optimize.comm_op_add_attrs : 0.000003s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.02% optimize.interleave_parallel_branches : 0.000003s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000006s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.03% optimize.overlap_grad_flash_sp : 0.000023s : 0.11% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.02% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.09% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000504s : 2.34% validate : 0.000034s : 0.16% Time group info: ------[substitution.] 0.000266 34 0.78% : 0.000002s : 2: substitution.elim_not_effective 0.51% : 0.000001s : 2: substitution.fold_const_symbol 2.73% : 0.000007s : 4: substitution.graph_param_transform 79.07% : 0.000210s : 8: substitution.inline 1.42% : 0.000004s : 4: substitution.j_node_and_user_rematch 1.85% : 0.000005s : 4: substitution.remove_not_recompute_node 2.67% : 0.000007s : 4: substitution.replace_old_param 4.58% : 0.000012s : 2: substitution.switch_simplify 6.40% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.014878 2 89.17% : 0.013266s : 1: type_inference.infer 10.83% : 0.001612s : 1: type_inference.specialize ------[replace.] 0.000142 14 41.91% : 0.000060s : 8: replace.inline 39.55% : 0.000056s : 2: replace.switch_simplify 18.54% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000231 14 88.92% : 0.000205s : 8: match.inline 4.67% : 0.000011s : 2: match.switch_simplify 6.41% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000253 1520 0.92% : 0.000002s : 17: predicate.accumulaten_eliminater 0.64% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 1.19% : 0.000003s : 17: predicate.addn_zero_filter 1.00% : 0.000003s : 17: predicate.adjust_all_reduce_mul_add 2.23% : 0.000006s : 25: predicate.arithmetic_simplify 1.08% : 0.000003s : 17: predicate.cast_eliminate 0.59% : 0.000002s : 8: predicate.check_bprop_eliminate 0.45% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.55% : 0.000001s : 8: predicate.depend_value_elim 1.17% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.29% : 0.000003s : 17: predicate.dict_get_item_eliminator 1.08% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.77% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.16% : 0.000000s : 4: predicate.elim_not_effective 0.32% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.22% : 0.000003s : 21: predicate.environ_get_depend_swap 1.49% : 0.000004s : 29: predicate.environ_get_eliminate 1.12% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.67% : 0.000004s : 29: predicate.exchange_switch_depend_value 2.72% : 0.000007s : 29: predicate.float_depend_g_call 0.51% : 0.000001s : 8: predicate.float_environ_get_switch 0.65% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.57% : 0.000001s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.48% : 0.000001s : 8: predicate.incorporate_call 0.41% : 0.000001s : 8: predicate.incorporate_call_switch 6.19% : 0.000016s : 70: predicate.inline 0.64% : 0.000002s : 8: predicate.inline_without_move 0.26% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.67% : 0.000002s : 8: predicate.less_batch_normalization 1.77% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.42% : 0.000006s : 46: predicate.load_eliminater 0.74% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.74% : 0.000007s : 47: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.42% : 0.000001s : 8: predicate.merge_addn 0.48% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.51% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 17: predicate.minmaximum_grad 0.79% : 0.000002s : 4: predicate.mutable_eliminate 0.26% : 0.000001s : 4: predicate.opt_reshape 0.44% : 0.000001s : 4: predicate.parallel_virtual_node 2.36% : 0.000006s : 29: predicate.partial_defer_inline 1.63% : 0.000004s : 25: predicate.partial_eliminate 1.12% : 0.000003s : 17: predicate.print_const_string_wrapper 0.52% : 0.000001s : 8: predicate.reduce_all_const_elim 1.38% : 0.000003s : 17: predicate.reduce_eliminate 2.50% : 0.000006s : 46: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 8: predicate.remove_not_recompute_node 1.30% : 0.000003s : 29: predicate.replace_applicator 0.42% : 0.000001s : 8: predicate.replace_old_param 0.23% : 0.000001s : 4: predicate.reset_defer_inline 1.10% : 0.000003s : 17: predicate.reshape_eliminate 0.53% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 4: predicate.row_tensor_eliminate 0.58% : 0.000001s : 8: predicate.same_eliminate 0.60% : 0.000002s : 8: predicate.set_cell_output_no_recompute 1.27% : 0.000003s : 8: predicate.shard_identity_eliminate 0.61% : 0.000002s : 8: predicate.special_op_eliminate 0.77% : 0.000002s : 8: predicate.specialize_transform 0.83% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.63% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.26% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.95% : 0.000005s : 29: predicate.switch_defer_inline 2.16% : 0.000005s : 37: predicate.switch_layer_defer_inline 5.80% : 0.000015s : 92: predicate.switch_simplify 0.94% : 0.000002s : 17: predicate.tile_eliminate 0.98% : 0.000002s : 17: predicate.transpose_eliminate 1.51% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.02% : 0.000008s : 37: predicate.tuple_list_get_item_eliminator 1.37% : 0.000003s : 25: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000006s : 33: predicate.tuple_list_set_item_eliminator 1.69% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.37% : 0.000006s : 46: predicate.updatestate_pure_node_eliminater 2.96% : 0.000007s : 54: predicate.updatestate_useless_node_eliminater 0.29% : 0.000001s : 4: predicate.value_based_eliminate 0.58% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.56% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001041 16 55.14% : 0.000574s : 6: func_graph_cloner_run.FuncGraphClonerGraph 44.86% : 0.000467s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.072801 192 0.01% : 0.000006s : 1: ForceFp32Comm 5.61% : 0.004084s : 1: add_attr 5.58% : 0.004063s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.09% : 0.000064s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.16% : 0.000115s : 1: auto_monad 0.04% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.68% : 0.000499s : 1: bootstrap 0.05% : 0.000033s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.03% : 0.000018s : 1: control_data_broadcast_order 0.02% : 0.000013s : 1: convert_after_rewriter 0.04% : 0.000033s : 1: cse_after_recomputation 0.01% : 0.000007s : 1: dataset_repeat_opt 0.03% : 0.000021s : 1: detach_backward 0.02% : 0.000012s : 1: environ_conv 0.09% : 0.000065s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000015s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000008s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000009s : 1: label_micro_interleaved_index 0.61% : 0.000447s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.95% : 0.000692s : 1: mutable_eliminate 0.01% : 0.000009s : 1: offloading_packed_experts 0.02% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000016s : 1: opt.transform.mutable_eliminate 2.25% : 0.001640s : 78: opt.transform.opt_a 0.04% : 0.000031s : 1: opt.transform.opt_after_cconv 0.04% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.15% : 0.000107s : 28: opt.transform.opt_b 0.07% : 0.000053s : 2: opt.transform.opt_trans_graph 0.05% : 0.000038s : 4: opt.transform.symbol_engine_opt 27.42% : 0.019963s : 1: opt_a 0.21% : 0.000151s : 1: opt_after_cconv 0.71% : 0.000514s : 1: opt_after_jit_grad 0.38% : 0.000277s : 1: opt_b 31.41% : 0.022866s : 1: optimize 0.03% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000011s : 1: order_py_execute_after_rewriter 0.04% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000008s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.09% : 0.000062s : 1: pre_auto_parallel 0.07% : 0.000048s : 1: py_interpret_to_execute 0.02% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000020s : 1: remove_dup_value 0.72% : 0.000521s : 1: renormalize.infer 0.64% : 0.000468s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000042s : 1: rewriter_after_opt_a 0.19% : 0.000140s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.02% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.14% : 0.000101s : 1: symbol_engine_optimizer 0.14% : 0.000099s : 1: tuple_transform 20.62% : 0.015009s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:03.670.052 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0358144, [21] [bootstrap]: 0.00044992 [type_inference]: 0.0259591 [event_method]: 4.984e-05 [auto_monad]: 9.829e-05 [graph_reusing]: 7.23999e-06 [inline]: 2.24001e-06 [add_attr]: 0.00335683, [1] [add_attr_with_inline]: 0.0033481, [1] [Cycle 1]: 6.828e-05, [2] [tag_attr]: 2.732e-05 [meta_addattr_fg_expand]: 7.73999e-06 [parallel-infer-symbol]: 3.58e-06 [pre_auto_parallel]: 4.19e-05 [insert-virtual-dataset]: 2.78e-06 [parallel-infer-symbol-second]: 7.79983e-07 [dataset_repeat_opt]: 1.96e-06 [pipeline_split]: 1.97001e-06 [optimize]: 0.00517338, [53] [py_interpret_to_execute]: 3.185e-05 [rewriter_before_opt_a]: 0.0001102 [opt_a]: 0.00317627, [2] [Cycle 1]: 0.00249536, [45] [expand_dump_flag]: 3.75e-06 [switch_simplify]: 0.0001253 [loop_unroll]: 4.273e-05 [a_1]: 0.00083096 [with_stream_mark]: 1.77e-05 [recompute_prepare]: 1.08e-05 [updatestate_depend_eliminate]: 3.76001e-06 [updatestate_assign_eliminate]: 3.02002e-06 [updatestate_loads_eliminate]: 3.36001e-06 [parameter_eliminate]: 2.94001e-06 [a_2]: 8.269e-05 [accelerated_algorithm]: 7.2e-06 [shard]: 1.89e-06 [meta_shard_fg_expand]: 2.07001e-06 [shard_inline]: 6.58e-06 [merge_send_recv]: 8.25e-06 [auto_parallel]: 6.48e-06 [parallel]: 1.898e-05 [flash_sp]: 8.03001e-06 [merge_comm]: 3.90998e-06 [allreduce_fusion]: 3.45e-06 [matmul_add_comm_reduction]: 9.38002e-06 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 8.47e-06 [virtual_dataset]: 6.97997e-06 [get_grad_eliminate_]: 6.21e-06 [virtual_output]: 6.38e-06 [merge_forward]: 3.71999e-06 [cell_reuse_recompute_pass]: 1.19e-06 [offload_activation]: 9.96998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.247e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 1.107e-05 [set_forward_comm_id_for_comm_node_pass]: 3.45e-06 [meta_fg_expand]: 3.3e-06 [flash_sp_send_recv_attached]: 2.73998e-06 [receive_attached]: 2.39001e-06 [after_resolve]: 1.156e-05 [a_after_grad]: 1.002e-05 [renormalize]: 0.00082845 [add_forward_monad_depend]: 6.04001e-06 [auto_monad_grad]: 2.22001e-06 [auto_monad_eliminator]: 1.673e-05 [cse]: 2.926e-05 [a_3]: 4.869e-05 [Cycle 2]: 0.00067111, [45] [expand_dump_flag]: 1.00999e-06 [switch_simplify]: 7.98001e-06 [loop_unroll]: 6.06003e-06 [a_1]: 0.00013391 [with_stream_mark]: 1.185e-05 [recompute_prepare]: 6.49001e-06 [updatestate_depend_eliminate]: 3.27997e-06 [updatestate_assign_eliminate]: 2.30002e-06 [updatestate_loads_eliminate]: 2.86999e-06 [parameter_eliminate]: 1.12999e-06 [a_2]: 7.26e-05 [accelerated_algorithm]: 6.44001e-06 [shard]: 1.77001e-06 [meta_shard_fg_expand]: 1.34e-06 [shard_inline]: 6.26998e-06 [merge_send_recv]: 5.56e-06 [auto_parallel]: 5.34e-06 [parallel]: 5.85002e-06 [flash_sp]: 3.75e-06 [merge_comm]: 3.65e-06 [allreduce_fusion]: 3.3e-06 [matmul_add_comm_reduction]: 6.61999e-06 [allreduce_slice_to_reducescatter]: 5.50004e-07 [virtual_shard_identity]: 6.49001e-06 [virtual_dataset]: 5.81e-06 [get_grad_eliminate_]: 5.92001e-06 [virtual_output]: 5.86e-06 [merge_forward]: 2.90002e-06 [cell_reuse_recompute_pass]: 1.35001e-06 [offload_activation]: 7.05002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.318e-05 [merge_recompute_call_nodes]: 9.79984e-07 [before_grad]: 1.051e-05 [set_forward_comm_id_for_comm_node_pass]: 4.03001e-06 [meta_fg_expand]: 2.18998e-06 [flash_sp_send_recv_attached]: 1.09e-06 [receive_attached]: 9.80013e-07 [after_resolve]: 1.079e-05 [a_after_grad]: 9.57001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.79e-06 [auto_monad_grad]: 1.07998e-06 [auto_monad_eliminator]: 7.53e-06 [cse]: 1.438e-05 [a_3]: 3.803e-05 [py_interpret_to_execute_after_opt_a]: 1.035e-05 [slice_cell_reuse_recomputed_activation]: 2.41e-06 [rewriter_after_opt_a]: 3.597e-05 [convert_after_rewriter]: 6.98e-06 [order_py_execute_after_rewriter]: 5.35999e-06 [mutable_eliminate]: 0.00048305 [opt_b]: 0.00020059, [1] [Cycle 1]: 0.00019475, [7] [b_1]: 0.0001249 [b_2]: 8.48001e-06 [updatestate_depend_eliminate]: 5.17999e-06 [updatestate_assign_eliminate]: 2.33002e-06 [updatestate_loads_eliminate]: 2.21e-06 [renormalize]: 5.09986e-07 [cse]: 1.731e-05 [optimize_parallel_all_gather_comm]: 1.608e-05 [overlap_param_gather]: 2.70997e-06 [cconv]: 2.361e-05 [loop_unroll]: 0.00041177 [opt_after_cconv]: 9.922e-05, [1] [Cycle 1]: 9.366e-05, [7] [c_1]: 3.18e-05 [parameter_eliminate]: 2.70002e-06 [updatestate_depend_eliminate]: 4.90001e-06 [updatestate_assign_eliminate]: 2.34999e-06 [updatestate_loads_eliminate]: 2.22001e-06 [cse]: 1.598e-05 [renormalize]: 4.10015e-07 [remove_dup_value]: 1.265e-05 [tuple_transform]: 7.499e-05, [1] [Cycle 1]: 7.098e-05, [4] [d_1]: 4.342e-05 [none_parameter_eliminate]: 1.57001e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 6.73e-06 [partial_unused_args_eliminate]: 1.72999e-06 [add_recomputation]: 4.58e-05 [cse_after_recomputation]: 3.4e-05, [1] [Cycle 1]: 2.931e-05, [1] [cse]: 2.342e-05 [environ_conv]: 5.54e-06 [swap_dp_allreduce_reducescatter]: 5.09e-06 [bias_add_comm_swap]: 2.87002e-06 [label_micro_interleaved_index]: 4.23999e-06 [label_fine_grained_interleaved_index]: 2.69999e-06 [merge_cast_opt]: 1.46002e-06 [slice_recompute_activation]: 1.92999e-06 [micro_interleaved_order_control]: 2.27001e-06 [assign_add_opt]: 1.42e-06 [ForceFp32Comm]: 9.89996e-07 [remove_cast_before_assign_add]: 9.89996e-07 [full_micro_interleaved_order_control]: 1.91998e-06 [reorder_send_recv_between_fp_bp]: 2.66999e-06 [comm_op_add_attrs]: 9.90025e-07 [add_comm_op_reuse_tag]: 9.10019e-07 [interleave_split_concat_branches]: 1.11002e-06 [interleave_parallel_branches]: 1.09e-06 [overlap_opt_shard_in_pipeline]: 1.49998e-06 [overlap_opt_shard_grad_in_pipeline]: 2.59999e-06 [control_data_broadcast_order]: 1.242e-05 [grouped_pairwise_exchange_alltoall]: 1.68002e-06 [offloading_packed_experts]: 4.05e-06 [overlap_recompute_and_grad_model_parallel]: 4.51002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.18001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32e-06 [overlap_recompute_comm]: 2.39001e-06 [overlap_grad_ring_attention]: 3.86999e-06 [overlap_grad_flash_sp]: 1.881e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 1.94e-06 [split_layernorm_comm]: 1.49e-06 [handle_group_info]: 8.80013e-07 [symbol_engine_optimizer]: 7.721e-05, [1] [Cycle 1]: 7.291e-05, [6] [build]: 2.64001e-06 [elim_shapecalc]: 9.82001e-06 [elim_not_effective]: 1.374e-05 [opt_reshape]: 7.21001e-06 [fold_const_symbol]: 1.058e-05 [renormalize]: 2.30008e-07 [detach_backward]: 1.81e-06 [pipeline_parallel_scheduler]: 1.47999e-06 [auto_monad_reorder]: 1.63e-05 [get_jit_bprop_graph]: 1.19e-06 [rewriter_after_jit_bprop_graph]: 4.2e-06 [opt_after_jit_grad]: 0.00044942 [validate]: 3.533e-05 Sums bootstrap : 0.000450s : 1.43% type_inference : 0.025959s : 82.43% event_method : 0.000050s : 0.16% auto_monad : 0.000098s : 0.31% graph_reusing : 0.000007s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.09% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000042s : 0.13% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000032s : 0.10% optimize.rewriter_before_opt_a : 0.000110s : 0.35% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000133s : 0.42% optimize.opt_a.loop_unroll : 0.000049s : 0.15% optimize.opt_a.a_1 : 0.000965s : 3.06% optimize.opt_a.with_stream_mark : 0.000030s : 0.09% optimize.opt_a.recompute_prepare : 0.000017s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000155s : 0.49% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.04% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.04% optimize.opt_a.merge_send_recv : 0.000014s : 0.04% optimize.opt_a.auto_parallel : 0.000012s : 0.04% optimize.opt_a.parallel : 0.000025s : 0.08% optimize.opt_a.flash_sp : 0.000012s : 0.04% optimize.opt_a.merge_comm : 0.000008s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.05% optimize.opt_a.virtual_dataset : 0.000013s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.04% optimize.opt_a.virtual_output : 0.000012s : 0.04% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000017s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000022s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000022s : 0.07% optimize.opt_a.a_after_grad : 0.000020s : 0.06% optimize.opt_a.renormalize : 0.000829s : 2.63% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.08% optimize.opt_a.cse : 0.000044s : 0.14% optimize.opt_a.a_3 : 0.000087s : 0.28% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000036s : 0.11% optimize.convert_after_rewriter : 0.000007s : 0.02% optimize.order_py_execute_after_rewriter : 0.000005s : 0.02% optimize.mutable_eliminate : 0.000483s : 1.53% optimize.opt_b.b_1 : 0.000125s : 0.40% optimize.opt_b.b_2 : 0.000008s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000017s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.05% optimize.overlap_param_gather : 0.000003s : 0.01% optimize.cconv : 0.000024s : 0.07% optimize.loop_unroll : 0.000412s : 1.31% optimize.opt_after_cconv.c_1 : 0.000032s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000016s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.04% optimize.tuple_transform.d_1 : 0.000043s : 0.14% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000046s : 0.15% optimize.cse_after_recomputation.cse : 0.000023s : 0.07% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.01% optimize.control_data_broadcast_order : 0.000012s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000019s : 0.06% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000001s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000016s : 0.05% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000449s : 1.43% validate : 0.000035s : 0.11% Time group info: ------[substitution.] 0.000215 34 0.92% : 0.000002s : 2: substitution.elim_not_effective 0.80% : 0.000002s : 2: substitution.fold_const_symbol 2.66% : 0.000006s : 4: substitution.graph_param_transform 77.41% : 0.000167s : 8: substitution.inline 1.85% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.12% : 0.000005s : 4: substitution.remove_not_recompute_node 2.13% : 0.000005s : 4: substitution.replace_old_param 4.78% : 0.000010s : 2: substitution.switch_simplify 7.32% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.025886 2 94.60% : 0.024488s : 1: type_inference.infer 5.40% : 0.001398s : 1: type_inference.specialize ------[replace.] 0.000119 14 42.99% : 0.000051s : 8: replace.inline 38.58% : 0.000046s : 2: replace.switch_simplify 18.43% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000185 14 87.66% : 0.000162s : 8: match.inline 4.89% : 0.000009s : 2: match.switch_simplify 7.45% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000235 1520 1.04% : 0.000002s : 17: predicate.accumulaten_eliminater 0.56% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.43% : 0.000001s : 8: predicate.addn_check_dump 1.00% : 0.000002s : 17: predicate.addn_zero_filter 0.97% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.08% : 0.000005s : 25: predicate.arithmetic_simplify 1.02% : 0.000002s : 17: predicate.cast_eliminate 0.51% : 0.000001s : 8: predicate.check_bprop_eliminate 0.41% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.51% : 0.000001s : 8: predicate.depend_value_elim 1.13% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.14% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.99% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.77% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.31% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.28% : 0.000003s : 21: predicate.environ_get_depend_swap 1.70% : 0.000004s : 29: predicate.environ_get_eliminate 1.15% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.78% : 0.000004s : 29: predicate.exchange_switch_depend_value 2.81% : 0.000007s : 29: predicate.float_depend_g_call 0.45% : 0.000001s : 8: predicate.float_environ_get_switch 0.65% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.54% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.52% : 0.000001s : 8: predicate.incorporate_call 0.41% : 0.000001s : 8: predicate.incorporate_call_switch 6.61% : 0.000016s : 70: predicate.inline 0.65% : 0.000002s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.67% : 0.000002s : 8: predicate.less_batch_normalization 1.91% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.53% : 0.000006s : 46: predicate.load_eliminater 0.74% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.89% : 0.000007s : 47: predicate.loop_unroll_before_grad 1.60% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.44% : 0.000001s : 8: predicate.merge_addn 0.46% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.91% : 0.000002s : 17: predicate.minmaximum_grad 0.79% : 0.000002s : 4: predicate.mutable_eliminate 0.29% : 0.000001s : 4: predicate.opt_reshape 0.28% : 0.000001s : 4: predicate.parallel_virtual_node 2.11% : 0.000005s : 29: predicate.partial_defer_inline 1.70% : 0.000004s : 25: predicate.partial_eliminate 0.99% : 0.000002s : 17: predicate.print_const_string_wrapper 0.47% : 0.000001s : 8: predicate.reduce_all_const_elim 1.25% : 0.000003s : 17: predicate.reduce_eliminate 2.58% : 0.000006s : 46: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 8: predicate.remove_not_recompute_node 1.42% : 0.000003s : 29: predicate.replace_applicator 0.37% : 0.000001s : 8: predicate.replace_old_param 0.24% : 0.000001s : 4: predicate.reset_defer_inline 0.99% : 0.000002s : 17: predicate.reshape_eliminate 0.50% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 4: predicate.row_tensor_eliminate 0.62% : 0.000001s : 8: predicate.same_eliminate 0.47% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.71% : 0.000002s : 8: predicate.shard_identity_eliminate 0.63% : 0.000001s : 8: predicate.special_op_eliminate 0.68% : 0.000002s : 8: predicate.specialize_transform 0.68% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.62% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.84% : 0.000004s : 29: predicate.switch_defer_inline 2.30% : 0.000005s : 37: predicate.switch_layer_defer_inline 6.17% : 0.000015s : 92: predicate.switch_simplify 0.99% : 0.000002s : 17: predicate.tile_eliminate 1.03% : 0.000002s : 17: predicate.transpose_eliminate 1.48% : 0.000003s : 25: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000007s : 37: predicate.tuple_list_get_item_eliminator 1.49% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000005s : 33: predicate.tuple_list_set_item_eliminator 1.75% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.51% : 0.000006s : 46: predicate.updatestate_pure_node_eliminater 3.08% : 0.000007s : 54: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 4: predicate.value_based_eliminate 0.52% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.51% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000999 16 58.78% : 0.000587s : 6: func_graph_cloner_run.FuncGraphClonerGraph 41.22% : 0.000412s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.046805 192 0.01% : 0.000004s : 1: ForceFp32Comm 7.18% : 0.003362s : 1: add_attr 7.16% : 0.003352s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.11% : 0.000050s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.23% : 0.000105s : 1: auto_monad 0.04% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.02% : 0.000478s : 1: bootstrap 0.06% : 0.000027s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000016s : 1: control_data_broadcast_order 0.02% : 0.000010s : 1: convert_after_rewriter 0.08% : 0.000037s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.12% : 0.000057s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000005s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 0.90% : 0.000420s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.05% : 0.000491s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000015s : 1: opt.transform.mutable_eliminate 3.17% : 0.001484s : 78: opt.transform.opt_a 0.07% : 0.000031s : 1: opt.transform.opt_after_cconv 0.05% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.22% : 0.000103s : 28: opt.transform.opt_b 0.10% : 0.000048s : 2: opt.transform.opt_trans_graph 0.08% : 0.000038s : 4: opt.transform.symbol_engine_opt 6.79% : 0.003179s : 1: opt_a 0.22% : 0.000102s : 1: opt_after_cconv 0.98% : 0.000458s : 1: opt_after_jit_grad 0.44% : 0.000204s : 1: opt_b 11.06% : 0.005178s : 1: optimize 0.04% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000008s : 1: order_py_execute_after_rewriter 0.05% : 0.000022s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000004s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.10% : 0.000046s : 1: pre_auto_parallel 0.08% : 0.000036s : 1: py_interpret_to_execute 0.03% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000016s : 1: remove_dup_value 0.92% : 0.000429s : 1: renormalize.infer 0.83% : 0.000389s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000040s : 1: rewriter_after_opt_a 0.24% : 0.000114s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.17% : 0.000080s : 1: symbol_engine_optimizer 0.17% : 0.000078s : 1: tuple_transform 55.51% : 0.025980s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:04.130.956 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:04.131.218 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0453742, [21] [bootstrap]: 0.00049412 [type_inference]: 0.0154879 [event_method]: 9.991e-05 [auto_monad]: 0.00010407 [graph_reusing]: 7.78001e-06 [inline]: 2.64001e-06 [add_attr]: 0.00342738, [1] [add_attr_with_inline]: 0.00341658, [1] [Cycle 1]: 0.00010141, [2] [tag_attr]: 3.259e-05 [meta_addattr_fg_expand]: 8.95999e-06 [parallel-infer-symbol]: 3.43999e-06 [pre_auto_parallel]: 5.062e-05 [insert-virtual-dataset]: 1.463e-05 [parallel-infer-symbol-second]: 1.28002e-06 [dataset_repeat_opt]: 2.02999e-06 [pipeline_split]: 1.57999e-06 [optimize]: 0.00797714, [53] [py_interpret_to_execute]: 4.054e-05 [rewriter_before_opt_a]: 0.00012526 [opt_a]: 0.00468922, [2] [Cycle 1]: 0.00362946, [45] [expand_dump_flag]: 4.19002e-06 [switch_simplify]: 0.00013409 [loop_unroll]: 4.601e-05 [a_1]: 0.00102247 [with_stream_mark]: 2.181e-05 [recompute_prepare]: 1.397e-05 [updatestate_depend_eliminate]: 5.40001e-06 [updatestate_assign_eliminate]: 4.35e-06 [updatestate_loads_eliminate]: 3.83999e-06 [parameter_eliminate]: 2.32001e-06 [a_2]: 0.0001377 [accelerated_algorithm]: 1.173e-05 [shard]: 2.46e-06 [meta_shard_fg_expand]: 3.27002e-06 [shard_inline]: 8.73001e-06 [merge_send_recv]: 9.74999e-06 [auto_parallel]: 8.99e-06 [parallel]: 2.045e-05 [flash_sp]: 1.193e-05 [merge_comm]: 5.48002e-06 [allreduce_fusion]: 4.37e-06 [matmul_add_comm_reduction]: 1.296e-05 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 1.248e-05 [virtual_dataset]: 9.27001e-06 [get_grad_eliminate_]: 8.26002e-06 [virtual_output]: 8.67e-06 [merge_forward]: 5.04e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 1.207e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.067e-05 [merge_recompute_call_nodes]: 1.98002e-06 [before_grad]: 1.406e-05 [set_forward_comm_id_for_comm_node_pass]: 5.11997e-06 [meta_fg_expand]: 4.60999e-06 [flash_sp_send_recv_attached]: 2.51e-06 [receive_attached]: 2.17001e-06 [after_resolve]: 1.457e-05 [a_after_grad]: 1.365e-05 [renormalize]: 0.00138185 [add_forward_monad_depend]: 8.14002e-06 [auto_monad_grad]: 3.08998e-06 [auto_monad_eliminator]: 2.22e-05 [cse]: 3.822e-05 [a_3]: 8.537e-05 [Cycle 2]: 0.00104271, [45] [expand_dump_flag]: 2.26e-06 [switch_simplify]: 9.97999e-06 [loop_unroll]: 8.60001e-06 [a_1]: 0.00021599 [with_stream_mark]: 1.862e-05 [recompute_prepare]: 9.66998e-06 [updatestate_depend_eliminate]: 4.84e-06 [updatestate_assign_eliminate]: 3.16001e-06 [updatestate_loads_eliminate]: 3.58e-06 [parameter_eliminate]: 1.86003e-06 [a_2]: 0.00012511 [accelerated_algorithm]: 8.75999e-06 [shard]: 2.48e-06 [meta_shard_fg_expand]: 2.29999e-06 [shard_inline]: 8.14002e-06 [merge_send_recv]: 8.43001e-06 [auto_parallel]: 7.97e-06 [parallel]: 7.09001e-06 [flash_sp]: 3.26999e-06 [merge_comm]: 4.28001e-06 [allreduce_fusion]: 4.72e-06 [matmul_add_comm_reduction]: 9.39998e-06 [allreduce_slice_to_reducescatter]: 4.89992e-07 [virtual_shard_identity]: 1.008e-05 [virtual_dataset]: 8.23999e-06 [get_grad_eliminate_]: 7.35e-06 [virtual_output]: 7.28e-06 [merge_forward]: 4.37998e-06 [cell_reuse_recompute_pass]: 2.65997e-06 [offload_activation]: 9.99999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.87e-05 [merge_recompute_call_nodes]: 1.07998e-06 [before_grad]: 1.307e-05 [set_forward_comm_id_for_comm_node_pass]: 5.00999e-06 [meta_fg_expand]: 3.53e-06 [flash_sp_send_recv_attached]: 9.79984e-07 [receive_attached]: 1.64e-06 [after_resolve]: 1.443e-05 [a_after_grad]: 1.218e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.84999e-06 [auto_monad_grad]: 1.12999e-06 [auto_monad_eliminator]: 1.14e-05 [cse]: 2.675e-05 [a_3]: 6.31e-05 [py_interpret_to_execute_after_opt_a]: 1.986e-05 [slice_cell_reuse_recomputed_activation]: 5.36998e-06 [rewriter_after_opt_a]: 5.377e-05 [convert_after_rewriter]: 1.065e-05 [order_py_execute_after_rewriter]: 1.062e-05 [mutable_eliminate]: 0.00085788 [opt_b]: 0.00035693, [1] [Cycle 1]: 0.00034332, [7] [b_1]: 0.00021458 [b_2]: 1.24e-05 [updatestate_depend_eliminate]: 9.81e-06 [updatestate_assign_eliminate]: 3.46999e-06 [updatestate_loads_eliminate]: 3.35e-06 [renormalize]: 9.80013e-07 [cse]: 3.473e-05 [optimize_parallel_all_gather_comm]: 2.645e-05 [overlap_param_gather]: 5.64998e-06 [cconv]: 3.977e-05 [loop_unroll]: 0.0006908 [opt_after_cconv]: 0.00016752, [1] [Cycle 1]: 0.00015714, [7] [c_1]: 4.209e-05 [parameter_eliminate]: 5.35001e-06 [updatestate_depend_eliminate]: 8.85001e-06 [updatestate_assign_eliminate]: 3.37002e-06 [updatestate_loads_eliminate]: 3.38e-06 [cse]: 3.525e-05 [renormalize]: 9.49978e-07 [remove_dup_value]: 2.037e-05 [tuple_transform]: 0.00011741, [1] [Cycle 1]: 0.00010862, [4] [d_1]: 6.232e-05 [none_parameter_eliminate]: 1.84998e-06 [renormalize]: 4.19997e-07 [switch_simplify]: 8.99e-06 [partial_unused_args_eliminate]: 4.73001e-06 [add_recomputation]: 0.00011331 [cse_after_recomputation]: 4.077e-05, [1] [Cycle 1]: 3.275e-05, [1] [cse]: 2.131e-05 [environ_conv]: 1.088e-05 [swap_dp_allreduce_reducescatter]: 9.47001e-06 [bias_add_comm_swap]: 5.49e-06 [label_micro_interleaved_index]: 8.79e-06 [label_fine_grained_interleaved_index]: 5.35001e-06 [merge_cast_opt]: 4.15e-06 [slice_recompute_activation]: 4.46002e-06 [micro_interleaved_order_control]: 4.93001e-06 [assign_add_opt]: 4.13999e-06 [ForceFp32Comm]: 3.38999e-06 [remove_cast_before_assign_add]: 3.43e-06 [full_micro_interleaved_order_control]: 4.57998e-06 [reorder_send_recv_between_fp_bp]: 6.22001e-06 [comm_op_add_attrs]: 3.8e-06 [add_comm_op_reuse_tag]: 3.47002e-06 [interleave_split_concat_branches]: 3.60998e-06 [interleave_parallel_branches]: 3.54002e-06 [overlap_opt_shard_in_pipeline]: 3.9e-06 [overlap_opt_shard_grad_in_pipeline]: 4.52e-06 [control_data_broadcast_order]: 1.962e-05 [grouped_pairwise_exchange_alltoall]: 3.96001e-06 [offloading_packed_experts]: 7.75998e-06 [overlap_recompute_and_grad_model_parallel]: 8.14002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.66999e-06 [overlap_recompute_allgather_and_fa_grad]: 4.01001e-06 [overlap_recompute_comm]: 5.15999e-06 [overlap_grad_ring_attention]: 7.23e-06 [overlap_grad_flash_sp]: 2.888e-05 [begin_end_overlap_inline]: 3.25e-06 [split_matmul_comm_elemetwise]: 4.77e-06 [split_layernorm_comm]: 4.32e-06 [handle_group_info]: 3.36999e-06 [symbol_engine_optimizer]: 0.00013561, [1] [Cycle 1]: 0.00011224, [6] [build]: 4.2e-06 [elim_shapecalc]: 1.572e-05 [elim_not_effective]: 1.816e-05 [opt_reshape]: 9.29998e-06 [fold_const_symbol]: 1.454e-05 [renormalize]: 2.30008e-07 [detach_backward]: 9.25999e-06 [pipeline_parallel_scheduler]: 2.39999e-06 [auto_monad_reorder]: 3.28e-05 [get_jit_bprop_graph]: 2.27001e-06 [rewriter_after_jit_bprop_graph]: 9.14e-06 [opt_after_jit_grad]: 0.0008288 [validate]: 5.598e-05 Sums bootstrap : 0.000494s : 2.07% type_inference : 0.015488s : 64.85% event_method : 0.000100s : 0.42% auto_monad : 0.000104s : 0.44% graph_reusing : 0.000008s : 0.03% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000033s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.04% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000051s : 0.21% insert-virtual-dataset : 0.000015s : 0.06% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000041s : 0.17% optimize.rewriter_before_opt_a : 0.000125s : 0.52% optimize.opt_a.expand_dump_flag : 0.000006s : 0.03% optimize.opt_a.switch_simplify : 0.000144s : 0.60% optimize.opt_a.loop_unroll : 0.000055s : 0.23% optimize.opt_a.a_1 : 0.001238s : 5.19% optimize.opt_a.with_stream_mark : 0.000040s : 0.17% optimize.opt_a.recompute_prepare : 0.000024s : 0.10% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000263s : 1.10% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.09% optimize.opt_a.shard : 0.000005s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.02% optimize.opt_a.shard_inline : 0.000017s : 0.07% optimize.opt_a.merge_send_recv : 0.000018s : 0.08% optimize.opt_a.auto_parallel : 0.000017s : 0.07% optimize.opt_a.parallel : 0.000028s : 0.12% optimize.opt_a.flash_sp : 0.000015s : 0.06% optimize.opt_a.merge_comm : 0.000010s : 0.04% optimize.opt_a.allreduce_fusion : 0.000009s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.09% optimize.opt_a.virtual_dataset : 0.000018s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.07% optimize.opt_a.virtual_output : 0.000016s : 0.07% optimize.opt_a.merge_forward : 0.000009s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.02% optimize.opt_a.offload_activation : 0.000022s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.16% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.11% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.04% optimize.opt_a.meta_fg_expand : 0.000008s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000029s : 0.12% optimize.opt_a.a_after_grad : 0.000026s : 0.11% optimize.opt_a.renormalize : 0.001382s : 5.79% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.05% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.14% optimize.opt_a.cse : 0.000065s : 0.27% optimize.opt_a.a_3 : 0.000148s : 0.62% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000054s : 0.23% optimize.convert_after_rewriter : 0.000011s : 0.04% optimize.order_py_execute_after_rewriter : 0.000011s : 0.04% optimize.mutable_eliminate : 0.000858s : 3.59% optimize.opt_b.b_1 : 0.000215s : 0.90% optimize.opt_b.b_2 : 0.000012s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000035s : 0.15% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.11% optimize.overlap_param_gather : 0.000006s : 0.02% optimize.cconv : 0.000040s : 0.17% optimize.loop_unroll : 0.000691s : 2.89% optimize.opt_after_cconv.c_1 : 0.000042s : 0.18% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000035s : 0.15% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.09% optimize.tuple_transform.d_1 : 0.000062s : 0.26% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.04% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000113s : 0.47% optimize.cse_after_recomputation.cse : 0.000021s : 0.09% optimize.environ_conv : 0.000011s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.04% optimize.bias_add_comm_swap : 0.000005s : 0.02% optimize.label_micro_interleaved_index : 0.000009s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000004s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.03% optimize.comm_op_add_attrs : 0.000004s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.02% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.02% optimize.control_data_broadcast_order : 0.000020s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000008s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.03% optimize.overlap_grad_flash_sp : 0.000029s : 0.12% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.07% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.08% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.06% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000009s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000033s : 0.14% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000009s : 0.04% opt_after_jit_grad : 0.000829s : 3.47% validate : 0.000056s : 0.23% Time group info: ------[substitution.] 0.000286 44 9.40% : 0.000027s : 3: substitution.cast_eliminate 0.79% : 0.000002s : 3: substitution.elim_not_effective 0.83% : 0.000002s : 3: substitution.fold_const_symbol 2.68% : 0.000008s : 5: substitution.graph_param_transform 70.32% : 0.000201s : 8: substitution.inline 1.80% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.12% : 0.000006s : 6: substitution.remove_not_recompute_node 2.24% : 0.000006s : 4: substitution.replace_old_param 3.95% : 0.000011s : 2: substitution.switch_simplify 5.87% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.015380 2 90.00% : 0.013841s : 1: type_inference.infer 10.00% : 0.001539s : 1: type_inference.specialize ------[replace.] 0.000139 14 42.03% : 0.000059s : 8: replace.inline 36.93% : 0.000052s : 2: replace.switch_simplify 21.03% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000221 14 88.90% : 0.000197s : 8: match.inline 4.49% : 0.000010s : 2: match.switch_simplify 6.61% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000321 1838 0.98% : 0.000003s : 21: predicate.accumulaten_eliminater 0.79% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.48% : 0.000002s : 10: predicate.addn_check_dump 0.93% : 0.000003s : 21: predicate.addn_zero_filter 0.87% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.20% : 0.000007s : 31: predicate.arithmetic_simplify 1.03% : 0.000003s : 21: predicate.cast_eliminate 0.60% : 0.000002s : 10: predicate.check_bprop_eliminate 0.47% : 0.000002s : 10: predicate.compare_switch_simplify 0.16% : 0.000001s : 5: predicate.const_output_eliminate 0.58% : 0.000002s : 10: predicate.depend_value_elim 1.05% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.20% : 0.000004s : 21: predicate.dict_get_item_eliminator 1.03% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.82% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.52% : 0.000002s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000004s : 26: predicate.environ_add_const_eliminate 1.09% : 0.000004s : 26: predicate.environ_get_add_eliminate 1.10% : 0.000004s : 26: predicate.environ_get_depend_swap 1.62% : 0.000005s : 36: predicate.environ_get_eliminate 1.06% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.50% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.56% : 0.000008s : 33: predicate.float_depend_g_call 0.45% : 0.000001s : 10: predicate.float_environ_get_switch 0.68% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.13% : 0.000000s : 5: predicate.fold_const_symbol 0.69% : 0.000002s : 10: predicate.get_grad_eliminate 0.17% : 0.000001s : 5: predicate.graph_param_transform 0.52% : 0.000002s : 10: predicate.incorporate_call 0.43% : 0.000001s : 10: predicate.incorporate_call_switch 6.15% : 0.000020s : 84: predicate.inline 0.72% : 0.000002s : 10: predicate.inline_without_move 0.26% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.72% : 0.000002s : 10: predicate.less_batch_normalization 1.95% : 0.000006s : 35: predicate.list_to_tuple_eliminator_ 2.44% : 0.000008s : 56: predicate.load_eliminater 1.40% : 0.000004s : 5: predicate.loop_unroll_after_grad 2.61% : 0.000008s : 49: predicate.loop_unroll_before_grad 1.67% : 0.000005s : 31: predicate.make_slice_get_slice_eliminator 0.46% : 0.000001s : 10: predicate.merge_addn 0.50% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.53% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.93% : 0.000003s : 21: predicate.minmaximum_grad 1.16% : 0.000004s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.35% : 0.000001s : 5: predicate.parallel_virtual_node 2.08% : 0.000007s : 33: predicate.partial_defer_inline 1.55% : 0.000005s : 30: predicate.partial_eliminate 1.01% : 0.000003s : 21: predicate.print_const_string_wrapper 0.49% : 0.000002s : 10: predicate.reduce_all_const_elim 1.28% : 0.000004s : 21: predicate.reduce_eliminate 2.44% : 0.000008s : 56: predicate.redundant_stop_gradient_eliminater 0.29% : 0.000001s : 10: predicate.remove_not_recompute_node 1.30% : 0.000004s : 35: predicate.replace_applicator 0.40% : 0.000001s : 10: predicate.replace_old_param 0.45% : 0.000001s : 5: predicate.reset_defer_inline 1.03% : 0.000003s : 21: predicate.reshape_eliminate 0.57% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 5: predicate.row_tensor_eliminate 0.80% : 0.000003s : 10: predicate.same_eliminate 0.32% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.67% : 0.000002s : 10: predicate.shard_identity_eliminate 0.67% : 0.000002s : 10: predicate.special_op_eliminate 0.83% : 0.000003s : 10: predicate.specialize_transform 0.95% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000003s : 10: predicate.stack_unstack_eliminate 0.26% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.68% : 0.000005s : 33: predicate.switch_defer_inline 2.04% : 0.000007s : 43: predicate.switch_layer_defer_inline 5.19% : 0.000017s : 101: predicate.switch_simplify 0.93% : 0.000003s : 21: predicate.tile_eliminate 1.11% : 0.000004s : 21: predicate.transpose_eliminate 1.52% : 0.000005s : 31: predicate.tuple_list_convert_item_index_to_positive 1.71% : 0.000005s : 31: predicate.tuple_list_get_item_const_eliminator 1.57% : 0.000005s : 31: predicate.tuple_list_get_item_depend_reorder 3.22% : 0.000010s : 45: predicate.tuple_list_get_item_eliminator 1.56% : 0.000005s : 31: predicate.tuple_list_get_set_item_eliminator 2.46% : 0.000008s : 41: predicate.tuple_list_set_item_eliminator 1.54% : 0.000005s : 35: predicate.tuple_to_list_eliminator_ 2.32% : 0.000007s : 56: predicate.updatestate_pure_node_eliminater 2.99% : 0.000010s : 66: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 5: predicate.value_based_eliminate 0.71% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.52% : 0.000002s : 10: predicate.virtual_output_eliminate 0.21% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001191 16 53.14% : 0.000633s : 6: func_graph_cloner_run.FuncGraphClonerGraph 46.86% : 0.000558s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.060290 192 0.01% : 0.000006s : 1: ForceFp32Comm 5.70% : 0.003438s : 1: add_attr 5.67% : 0.003421s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.20% : 0.000119s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.19% : 0.000117s : 1: auto_monad 0.07% : 0.000041s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.91% : 0.000548s : 1: bootstrap 0.07% : 0.000043s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.04% : 0.000023s : 1: control_data_broadcast_order 0.02% : 0.000014s : 1: convert_after_rewriter 0.07% : 0.000045s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000050s : 1: detach_backward 0.02% : 0.000014s : 1: environ_conv 0.20% : 0.000118s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000015s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.04% : 0.000022s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000012s : 1: label_micro_interleaved_index 1.16% : 0.000699s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000009s : 1: micro_interleaved_order_control 1.44% : 0.000868s : 1: mutable_eliminate 0.02% : 0.000011s : 1: offloading_packed_experts 0.04% : 0.000026s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000027s : 1: opt.transform.mutable_eliminate 3.16% : 0.001904s : 78: opt.transform.opt_a 0.07% : 0.000041s : 1: opt.transform.opt_after_cconv 0.07% : 0.000041s : 1: opt.transform.opt_after_jit_grad 0.25% : 0.000148s : 28: opt.transform.opt_b 0.11% : 0.000068s : 2: opt.transform.opt_trans_graph 0.09% : 0.000054s : 4: opt.transform.symbol_engine_opt 7.78% : 0.004693s : 1: opt_a 0.28% : 0.000172s : 1: opt_after_cconv 1.40% : 0.000841s : 1: opt_after_jit_grad 0.60% : 0.000361s : 1: opt_b 40.50% : 0.024415s : 1: optimize 0.05% : 0.000031s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000014s : 1: order_py_execute_after_rewriter 0.05% : 0.000033s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000008s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000011s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.10% : 0.000059s : 1: pre_auto_parallel 0.07% : 0.000045s : 1: py_interpret_to_execute 0.04% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.04% : 0.000024s : 1: remove_dup_value 1.21% : 0.000727s : 1: renormalize.infer 1.07% : 0.000642s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000015s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000058s : 1: rewriter_after_opt_a 0.22% : 0.000130s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.23% : 0.000139s : 1: symbol_engine_optimizer 0.20% : 0.000120s : 1: tuple_transform 25.77% : 0.015536s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:04.557.372 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0554952, [21] [bootstrap]: 0.00045548 [type_inference]: 0.0331044 [event_method]: 7.916e-05 [auto_monad]: 0.00010119 [graph_reusing]: 8.38001e-06 [inline]: 2.26e-06 [add_attr]: 0.0152728, [1] [add_attr_with_inline]: 0.0152629, [1] [Cycle 1]: 6.517e-05, [2] [tag_attr]: 2.712e-05 [meta_addattr_fg_expand]: 7.78001e-06 [parallel-infer-symbol]: 2.74001e-06 [pre_auto_parallel]: 3.925e-05 [insert-virtual-dataset]: 1.43002e-06 [parallel-infer-symbol-second]: 7.60017e-07 [dataset_repeat_opt]: 1.40001e-06 [pipeline_split]: 1.12e-06 [optimize]: 0.00573041, [53] [py_interpret_to_execute]: 3.3e-05 [rewriter_before_opt_a]: 0.0001193 [opt_a]: 0.00358848, [2] [Cycle 1]: 0.00281892, [45] [expand_dump_flag]: 2.69001e-06 [switch_simplify]: 0.00010704 [loop_unroll]: 4.318e-05 [a_1]: 0.00095939 [with_stream_mark]: 1.392e-05 [recompute_prepare]: 1.042e-05 [updatestate_depend_eliminate]: 4.44002e-06 [updatestate_assign_eliminate]: 2.86e-06 [updatestate_loads_eliminate]: 2.78e-06 [parameter_eliminate]: 1.08001e-06 [a_2]: 0.00010371 [accelerated_algorithm]: 8.70001e-06 [shard]: 1.24e-06 [meta_shard_fg_expand]: 2.51e-06 [shard_inline]: 8.02e-06 [merge_send_recv]: 7.67002e-06 [auto_parallel]: 6.84001e-06 [parallel]: 1.21e-05 [flash_sp]: 6.93e-06 [merge_comm]: 4.39998e-06 [allreduce_fusion]: 4.13999e-06 [matmul_add_comm_reduction]: 8.53001e-06 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 9.61e-06 [virtual_dataset]: 8.03999e-06 [get_grad_eliminate_]: 8.10999e-06 [virtual_output]: 7.68001e-06 [merge_forward]: 3.63999e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 9.27999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.522e-05 [merge_recompute_call_nodes]: 1.04e-06 [before_grad]: 1.428e-05 [set_forward_comm_id_for_comm_node_pass]: 4.25999e-06 [meta_fg_expand]: 3.46001e-06 [flash_sp_send_recv_attached]: 2.72001e-06 [receive_attached]: 1.96998e-06 [after_resolve]: 1.391e-05 [a_after_grad]: 1.267e-05 [renormalize]: 0.00098876 [add_forward_monad_depend]: 5.47999e-06 [auto_monad_grad]: 2.51e-06 [auto_monad_eliminator]: 1.704e-05 [cse]: 3.574e-05 [a_3]: 7.782e-05 [Cycle 2]: 0.00075946, [45] [expand_dump_flag]: 1.34e-06 [switch_simplify]: 1.032e-05 [loop_unroll]: 7.61999e-06 [a_1]: 0.00017898 [with_stream_mark]: 1.183e-05 [recompute_prepare]: 7.87003e-06 [updatestate_depend_eliminate]: 4.12e-06 [updatestate_assign_eliminate]: 2.93e-06 [updatestate_loads_eliminate]: 2.69999e-06 [parameter_eliminate]: 1.02e-06 [a_2]: 9.494e-05 [accelerated_algorithm]: 7.87e-06 [shard]: 1.19e-06 [meta_shard_fg_expand]: 1.59998e-06 [shard_inline]: 8.23999e-06 [merge_send_recv]: 6.10002e-06 [auto_parallel]: 6.41998e-06 [parallel]: 4.68999e-06 [flash_sp]: 3.8e-06 [merge_comm]: 4.21001e-06 [allreduce_fusion]: 3.94002e-06 [matmul_add_comm_reduction]: 6.53e-06 [allreduce_slice_to_reducescatter]: 3.89991e-07 [virtual_shard_identity]: 8.27e-06 [virtual_dataset]: 7.38999e-06 [get_grad_eliminate_]: 6.63e-06 [virtual_output]: 6.75002e-06 [merge_forward]: 3.65998e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 8.08999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.456e-05 [merge_recompute_call_nodes]: 1.19998e-06 [before_grad]: 1.259e-05 [set_forward_comm_id_for_comm_node_pass]: 4.49998e-06 [meta_fg_expand]: 2.63003e-06 [flash_sp_send_recv_attached]: 9.30013e-07 [receive_attached]: 9.79984e-07 [after_resolve]: 1.113e-05 [a_after_grad]: 1.177e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.37e-06 [auto_monad_grad]: 1.34e-06 [auto_monad_eliminator]: 8.23999e-06 [cse]: 1.757e-05 [a_3]: 4.618e-05 [py_interpret_to_execute_after_opt_a]: 9.77999e-06 [slice_cell_reuse_recomputed_activation]: 2.45002e-06 [rewriter_after_opt_a]: 4.034e-05 [convert_after_rewriter]: 6.91001e-06 [order_py_execute_after_rewriter]: 5.82001e-06 [mutable_eliminate]: 0.00049625 [opt_b]: 0.00024139, [1] [Cycle 1]: 0.00023552, [7] [b_1]: 0.00015729 [b_2]: 9.74e-06 [updatestate_depend_eliminate]: 5.91e-06 [updatestate_assign_eliminate]: 2.99999e-06 [updatestate_loads_eliminate]: 2.77002e-06 [renormalize]: 6.00005e-07 [cse]: 2.187e-05 [optimize_parallel_all_gather_comm]: 1.747e-05 [overlap_param_gather]: 2.16e-06 [cconv]: 2.168e-05 [loop_unroll]: 0.00042656 [opt_after_cconv]: 0.00011505, [1] [Cycle 1]: 0.00010972, [7] [c_1]: 3.927e-05 [parameter_eliminate]: 2.85998e-06 [updatestate_depend_eliminate]: 5.91e-06 [updatestate_assign_eliminate]: 2.95002e-06 [updatestate_loads_eliminate]: 2.74999e-06 [cse]: 2.168e-05 [renormalize]: 4.40021e-07 [remove_dup_value]: 1.473e-05 [tuple_transform]: 8.665e-05, [1] [Cycle 1]: 8.246e-05, [4] [d_1]: 5.23e-05 [none_parameter_eliminate]: 1.78002e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 8.48001e-06 [partial_unused_args_eliminate]: 1.82001e-06 [add_recomputation]: 7.04e-05 [cse_after_recomputation]: 2.673e-05, [1] [Cycle 1]: 2.21e-05, [1] [cse]: 1.607e-05 [environ_conv]: 6.16e-06 [swap_dp_allreduce_reducescatter]: 6.14001e-06 [bias_add_comm_swap]: 2.92002e-06 [label_micro_interleaved_index]: 4.17998e-06 [label_fine_grained_interleaved_index]: 3.06001e-06 [merge_cast_opt]: 1.22999e-06 [slice_recompute_activation]: 2.21e-06 [micro_interleaved_order_control]: 2.11e-06 [assign_add_opt]: 1.42e-06 [ForceFp32Comm]: 1.19e-06 [remove_cast_before_assign_add]: 9.70002e-07 [full_micro_interleaved_order_control]: 1.99e-06 [reorder_send_recv_between_fp_bp]: 2.61e-06 [comm_op_add_attrs]: 9.70002e-07 [add_comm_op_reuse_tag]: 9.80013e-07 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.00999e-06 [overlap_opt_shard_in_pipeline]: 1.50999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.81998e-06 [control_data_broadcast_order]: 1.396e-05 [grouped_pairwise_exchange_alltoall]: 1.99e-06 [offloading_packed_experts]: 3.77002e-06 [overlap_recompute_and_grad_model_parallel]: 5.14e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.38002e-06 [overlap_grad_ring_attention]: 4.31002e-06 [overlap_grad_flash_sp]: 2.044e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.04e-06 [split_layernorm_comm]: 1.62999e-06 [handle_group_info]: 8.89995e-07 [symbol_engine_optimizer]: 8.429e-05, [1] [Cycle 1]: 7.979e-05, [6] [build]: 3.09001e-06 [elim_shapecalc]: 1.133e-05 [elim_not_effective]: 1.66e-05 [opt_reshape]: 8.43001e-06 [fold_const_symbol]: 1.239e-05 [renormalize]: 2.69996e-07 [detach_backward]: 2.14e-06 [pipeline_parallel_scheduler]: 1.53002e-06 [auto_monad_reorder]: 1.982e-05 [get_jit_bprop_graph]: 1.36002e-06 [rewriter_after_jit_bprop_graph]: 4.1e-06 [opt_after_jit_grad]: 0.00046437 [validate]: 4.141e-05 Sums bootstrap : 0.000455s : 1.16% type_inference : 0.033104s : 84.30% event_method : 0.000079s : 0.20% auto_monad : 0.000101s : 0.26% graph_reusing : 0.000008s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000039s : 0.10% insert-virtual-dataset : 0.000001s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000001s : 0.00% pipeline_split : 0.000001s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.08% optimize.rewriter_before_opt_a : 0.000119s : 0.30% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000117s : 0.30% optimize.opt_a.loop_unroll : 0.000051s : 0.13% optimize.opt_a.a_1 : 0.001138s : 2.90% optimize.opt_a.with_stream_mark : 0.000026s : 0.07% optimize.opt_a.recompute_prepare : 0.000018s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000002s : 0.01% optimize.opt_a.a_2 : 0.000199s : 0.51% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.04% optimize.opt_a.shard : 0.000002s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.04% optimize.opt_a.merge_send_recv : 0.000014s : 0.04% optimize.opt_a.auto_parallel : 0.000013s : 0.03% optimize.opt_a.parallel : 0.000017s : 0.04% optimize.opt_a.flash_sp : 0.000011s : 0.03% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000008s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.05% optimize.opt_a.virtual_dataset : 0.000015s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000014s : 0.04% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000017s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.06% optimize.opt_a.a_after_grad : 0.000024s : 0.06% optimize.opt_a.renormalize : 0.000989s : 2.52% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.06% optimize.opt_a.cse : 0.000053s : 0.14% optimize.opt_a.a_3 : 0.000124s : 0.32% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000040s : 0.10% optimize.convert_after_rewriter : 0.000007s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.01% optimize.mutable_eliminate : 0.000496s : 1.26% optimize.opt_b.b_1 : 0.000157s : 0.40% optimize.opt_b.b_2 : 0.000010s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000022s : 0.06% optimize.loop_unroll : 0.000427s : 1.09% optimize.opt_after_cconv.c_1 : 0.000039s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000022s : 0.06% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.04% optimize.tuple_transform.d_1 : 0.000052s : 0.13% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000070s : 0.18% optimize.cse_after_recomputation.cse : 0.000016s : 0.04% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000020s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.05% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000464s : 1.18% validate : 0.000041s : 0.11% Time group info: ------[substitution.] 0.000238 44 8.53% : 0.000020s : 3: substitution.cast_eliminate 0.98% : 0.000002s : 3: substitution.elim_not_effective 0.72% : 0.000002s : 3: substitution.fold_const_symbol 2.60% : 0.000006s : 5: substitution.graph_param_transform 72.08% : 0.000171s : 8: substitution.inline 2.22% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.34% : 0.000006s : 6: substitution.remove_not_recompute_node 1.79% : 0.000004s : 4: substitution.replace_old_param 3.28% : 0.000008s : 2: substitution.switch_simplify 5.45% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.033040 2 96.18% : 0.031779s : 1: type_inference.infer 3.82% : 0.001261s : 1: type_inference.specialize ------[replace.] 0.000111 14 45.79% : 0.000051s : 8: replace.inline 30.98% : 0.000034s : 2: replace.switch_simplify 23.23% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000184 14 90.60% : 0.000167s : 8: match.inline 3.52% : 0.000006s : 2: match.switch_simplify 5.88% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000290 1838 1.00% : 0.000003s : 21: predicate.accumulaten_eliminater 0.58% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.64% : 0.000002s : 10: predicate.addn_check_dump 1.02% : 0.000003s : 21: predicate.addn_zero_filter 0.97% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.13% : 0.000006s : 31: predicate.arithmetic_simplify 1.20% : 0.000003s : 21: predicate.cast_eliminate 0.51% : 0.000001s : 10: predicate.check_bprop_eliminate 0.51% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.50% : 0.000001s : 10: predicate.depend_value_elim 1.06% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.12% : 0.000003s : 21: predicate.dict_get_item_eliminator 1.01% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.72% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 5: predicate.elim_not_effective 0.27% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000003s : 26: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 26: predicate.environ_get_depend_swap 1.71% : 0.000005s : 36: predicate.environ_get_eliminate 1.17% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.65% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.46% : 0.000007s : 33: predicate.float_depend_g_call 0.59% : 0.000002s : 10: predicate.float_environ_get_switch 0.69% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.52% : 0.000002s : 10: predicate.get_grad_eliminate 0.18% : 0.000001s : 5: predicate.graph_param_transform 0.54% : 0.000002s : 10: predicate.incorporate_call 0.44% : 0.000001s : 10: predicate.incorporate_call_switch 6.72% : 0.000020s : 84: predicate.inline 0.68% : 0.000002s : 10: predicate.inline_without_move 0.27% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.70% : 0.000002s : 10: predicate.less_batch_normalization 1.83% : 0.000005s : 35: predicate.list_to_tuple_eliminator_ 2.61% : 0.000008s : 56: predicate.load_eliminater 0.80% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.51% : 0.000007s : 49: predicate.loop_unroll_before_grad 1.60% : 0.000005s : 31: predicate.make_slice_get_slice_eliminator 0.53% : 0.000002s : 10: predicate.merge_addn 0.51% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.47% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.97% : 0.000003s : 21: predicate.minmaximum_grad 0.84% : 0.000002s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.28% : 0.000001s : 5: predicate.parallel_virtual_node 1.97% : 0.000006s : 33: predicate.partial_defer_inline 1.73% : 0.000005s : 30: predicate.partial_eliminate 1.01% : 0.000003s : 21: predicate.print_const_string_wrapper 0.53% : 0.000002s : 10: predicate.reduce_all_const_elim 1.36% : 0.000004s : 21: predicate.reduce_eliminate 2.58% : 0.000008s : 56: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000001s : 10: predicate.remove_not_recompute_node 1.33% : 0.000004s : 35: predicate.replace_applicator 0.38% : 0.000001s : 10: predicate.replace_old_param 0.24% : 0.000001s : 5: predicate.reset_defer_inline 1.06% : 0.000003s : 21: predicate.reshape_eliminate 0.50% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 5: predicate.row_tensor_eliminate 0.61% : 0.000002s : 10: predicate.same_eliminate 0.37% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.60% : 0.000002s : 10: predicate.shard_identity_eliminate 0.55% : 0.000002s : 10: predicate.special_op_eliminate 0.63% : 0.000002s : 10: predicate.specialize_transform 0.68% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.64% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.75% : 0.000005s : 33: predicate.switch_defer_inline 2.21% : 0.000006s : 43: predicate.switch_layer_defer_inline 5.48% : 0.000016s : 101: predicate.switch_simplify 1.06% : 0.000003s : 21: predicate.tile_eliminate 1.26% : 0.000004s : 21: predicate.transpose_eliminate 1.64% : 0.000005s : 31: predicate.tuple_list_convert_item_index_to_positive 1.80% : 0.000005s : 31: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000005s : 31: predicate.tuple_list_get_item_depend_reorder 3.04% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.63% : 0.000005s : 31: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.69% : 0.000005s : 35: predicate.tuple_to_list_eliminator_ 2.57% : 0.000007s : 56: predicate.updatestate_pure_node_eliminater 3.18% : 0.000009s : 66: predicate.updatestate_useless_node_eliminater 0.29% : 0.000001s : 5: predicate.value_based_eliminate 0.57% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.48% : 0.000001s : 10: predicate.virtual_output_eliminate 0.23% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000930 16 58.11% : 0.000541s : 6: func_graph_cloner_run.FuncGraphClonerGraph 41.89% : 0.000390s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.079448 192 0.00% : 0.000004s : 1: ForceFp32Comm 19.23% : 0.015278s : 1: add_attr 19.22% : 0.015267s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000075s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.14% : 0.000109s : 1: auto_monad 0.03% : 0.000023s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.61% : 0.000484s : 1: bootstrap 0.03% : 0.000025s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000017s : 1: control_data_broadcast_order 0.01% : 0.000010s : 1: convert_after_rewriter 0.04% : 0.000030s : 1: cse_after_recomputation 0.01% : 0.000004s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.11% : 0.000089s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000004s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.55% : 0.000435s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.63% : 0.000504s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000017s : 1: opt.transform.mutable_eliminate 2.21% : 0.001756s : 78: opt.transform.opt_a 0.05% : 0.000038s : 1: opt.transform.opt_after_cconv 0.04% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.17% : 0.000135s : 28: opt.transform.opt_b 0.07% : 0.000059s : 2: opt.transform.opt_trans_graph 0.06% : 0.000045s : 4: opt.transform.symbol_engine_opt 4.52% : 0.003592s : 1: opt_a 0.15% : 0.000118s : 1: opt_after_cconv 0.60% : 0.000473s : 1: opt_after_jit_grad 0.31% : 0.000245s : 1: opt_b 7.22% : 0.005735s : 1: optimize 0.03% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.03% : 0.000024s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.05% : 0.000043s : 1: pre_auto_parallel 0.05% : 0.000037s : 1: py_interpret_to_execute 0.02% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000018s : 1: remove_dup_value 0.68% : 0.000544s : 1: renormalize.infer 0.55% : 0.000437s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000044s : 1: rewriter_after_opt_a 0.16% : 0.000123s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.11% : 0.000087s : 1: symbol_engine_optimizer 0.11% : 0.000089s : 1: tuple_transform 41.69% : 0.033123s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:04.983.430 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:04.983.697 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0368319, [21] [bootstrap]: 0.0124793 [type_inference]: 0.0131599 [event_method]: 7.753e-05 [auto_monad]: 0.00010221 [graph_reusing]: 7.48e-06 [inline]: 2.44001e-06 [add_attr]: 0.00312003, [1] [add_attr_with_inline]: 0.00311147, [1] [Cycle 1]: 8.145e-05, [2] [tag_attr]: 2.761e-05 [meta_addattr_fg_expand]: 8.60001e-06 [parallel-infer-symbol]: 3.28e-06 [pre_auto_parallel]: 4.209e-05 [insert-virtual-dataset]: 2.27001e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 1.74e-06 [pipeline_split]: 1.79e-06 [optimize]: 0.00659777, [53] [py_interpret_to_execute]: 3.617e-05 [rewriter_before_opt_a]: 0.00012119 [opt_a]: 0.00405037, [2] [Cycle 1]: 0.00299285, [45] [expand_dump_flag]: 3.91001e-06 [switch_simplify]: 0.00012651 [loop_unroll]: 4.437e-05 [a_1]: 0.00092474 [with_stream_mark]: 1.645e-05 [recompute_prepare]: 1.151e-05 [updatestate_depend_eliminate]: 5.39e-06 [updatestate_assign_eliminate]: 4.42998e-06 [updatestate_loads_eliminate]: 4.22003e-06 [parameter_eliminate]: 1.86998e-06 [a_2]: 0.0001492 [accelerated_algorithm]: 9.99001e-06 [shard]: 1.66e-06 [meta_shard_fg_expand]: 2.34001e-06 [shard_inline]: 9.20001e-06 [merge_send_recv]: 1.011e-05 [auto_parallel]: 8.32998e-06 [parallel]: 1.808e-05 [flash_sp]: 8.88002e-06 [merge_comm]: 6.14001e-06 [allreduce_fusion]: 5.12999e-06 [matmul_add_comm_reduction]: 1.126e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 1.044e-05 [virtual_dataset]: 9.17999e-06 [get_grad_eliminate_]: 9.09e-06 [virtual_output]: 9.20001e-06 [merge_forward]: 5.06997e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 1.158e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.031e-05 [merge_recompute_call_nodes]: 1.40001e-06 [before_grad]: 1.519e-05 [set_forward_comm_id_for_comm_node_pass]: 5.54998e-06 [meta_fg_expand]: 4.55001e-06 [flash_sp_send_recv_attached]: 2.89001e-06 [receive_attached]: 1.97999e-06 [after_resolve]: 1.435e-05 [a_after_grad]: 1.421e-05 [renormalize]: 0.00087684 [add_forward_monad_depend]: 5.61003e-06 [auto_monad_grad]: 1.99e-06 [auto_monad_eliminator]: 1.966e-05 [cse]: 4.123e-05 [a_3]: 0.00010745 [Cycle 2]: 0.00104457, [45] [expand_dump_flag]: 1.27e-06 [switch_simplify]: 1.241e-05 [loop_unroll]: 9.44998e-06 [a_1]: 0.00021859 [with_stream_mark]: 1.369e-05 [recompute_prepare]: 9.30001e-06 [updatestate_depend_eliminate]: 4.63001e-06 [updatestate_assign_eliminate]: 3.65e-06 [updatestate_loads_eliminate]: 3.73001e-06 [parameter_eliminate]: 1.03001e-06 [a_2]: 0.00014039 [accelerated_algorithm]: 9.00001e-06 [shard]: 1.49e-06 [meta_shard_fg_expand]: 2.01e-06 [shard_inline]: 9.12001e-06 [merge_send_recv]: 7.25e-06 [auto_parallel]: 7.29001e-06 [parallel]: 5.75001e-06 [flash_sp]: 3.55e-06 [merge_comm]: 4.97e-06 [allreduce_fusion]: 4.57e-06 [matmul_add_comm_reduction]: 8.56002e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 9.52001e-06 [virtual_dataset]: 8.73001e-06 [get_grad_eliminate_]: 8.28999e-06 [virtual_output]: 8.05e-06 [merge_forward]: 4.02e-06 [cell_reuse_recompute_pass]: 1.75001e-06 [offload_activation]: 8.23999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.939e-05 [merge_recompute_call_nodes]: 6.89994e-07 [before_grad]: 1.43e-05 [set_forward_comm_id_for_comm_node_pass]: 5.84e-06 [meta_fg_expand]: 3.66001e-06 [flash_sp_send_recv_attached]: 8.70001e-07 [receive_attached]: 1.04003e-06 [after_resolve]: 1.243e-05 [a_after_grad]: 1.334e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.77001e-06 [auto_monad_grad]: 1.08001e-06 [auto_monad_eliminator]: 9.92001e-06 [cse]: 2.338e-05 [a_3]: 6.796e-05 [py_interpret_to_execute_after_opt_a]: 1.472e-05 [slice_cell_reuse_recomputed_activation]: 4.82998e-06 [rewriter_after_opt_a]: 4.849e-05 [convert_after_rewriter]: 1.148e-05 [order_py_execute_after_rewriter]: 9.52999e-06 [mutable_eliminate]: 0.00051303 [opt_b]: 0.00034659, [1] [Cycle 1]: 0.00033788, [7] [b_1]: 0.00022885 [b_2]: 1.071e-05 [updatestate_depend_eliminate]: 6.66e-06 [updatestate_assign_eliminate]: 3.54002e-06 [updatestate_loads_eliminate]: 3.42002e-06 [renormalize]: 4.19997e-07 [cse]: 2.694e-05 [optimize_parallel_all_gather_comm]: 2.126e-05 [overlap_param_gather]: 4.81002e-06 [cconv]: 3.057e-05 [loop_unroll]: 0.00044801 [opt_after_cconv]: 0.00015171, [1] [Cycle 1]: 0.00014303, [7] [c_1]: 4.476e-05 [parameter_eliminate]: 3.08e-06 [updatestate_depend_eliminate]: 6.54001e-06 [updatestate_assign_eliminate]: 3.55e-06 [updatestate_loads_eliminate]: 4.07e-06 [cse]: 2.574e-05 [renormalize]: 2.9002e-07 [remove_dup_value]: 3.893e-05 [tuple_transform]: 0.0001123, [1] [Cycle 1]: 0.00010511, [4] [d_1]: 6.211e-05 [none_parameter_eliminate]: 2.02999e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 1.043e-05 [partial_unused_args_eliminate]: 4.82998e-06 [add_recomputation]: 6.24e-05 [cse_after_recomputation]: 3.315e-05, [1] [Cycle 1]: 2.637e-05, [1] [cse]: 1.698e-05 [environ_conv]: 9.61e-06 [swap_dp_allreduce_reducescatter]: 9.15999e-06 [bias_add_comm_swap]: 5.00999e-06 [label_micro_interleaved_index]: 7.63001e-06 [label_fine_grained_interleaved_index]: 5.65001e-06 [merge_cast_opt]: 3.7e-06 [slice_recompute_activation]: 4.57998e-06 [micro_interleaved_order_control]: 4.78001e-06 [assign_add_opt]: 3.51999e-06 [ForceFp32Comm]: 3.7e-06 [remove_cast_before_assign_add]: 3.43e-06 [full_micro_interleaved_order_control]: 4.63999e-06 [reorder_send_recv_between_fp_bp]: 4.94998e-06 [comm_op_add_attrs]: 3.4e-06 [add_comm_op_reuse_tag]: 3.53e-06 [interleave_split_concat_branches]: 3.5e-06 [interleave_parallel_branches]: 3.53e-06 [overlap_opt_shard_in_pipeline]: 3.61999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.14002e-06 [control_data_broadcast_order]: 1.895e-05 [grouped_pairwise_exchange_alltoall]: 3.91001e-06 [offloading_packed_experts]: 7.48e-06 [overlap_recompute_and_grad_model_parallel]: 8.21002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.58999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.68e-06 [overlap_recompute_comm]: 4.55001e-06 [overlap_grad_ring_attention]: 7.56999e-06 [overlap_grad_flash_sp]: 2.525e-05 [begin_end_overlap_inline]: 2.96999e-06 [split_matmul_comm_elemetwise]: 4.52e-06 [split_layernorm_comm]: 4.23999e-06 [handle_group_info]: 3.95e-06 [symbol_engine_optimizer]: 0.00011506, [1] [Cycle 1]: 0.0001077, [6] [build]: 3.03e-06 [elim_shapecalc]: 1.402e-05 [elim_not_effective]: 1.853e-05 [opt_reshape]: 1.018e-05 [fold_const_symbol]: 1.529e-05 [renormalize]: 2.3999e-07 [detach_backward]: 3.86999e-06 [pipeline_parallel_scheduler]: 1.77001e-06 [auto_monad_reorder]: 2.564e-05 [get_jit_bprop_graph]: 1.19e-06 [rewriter_after_jit_bprop_graph]: 5.04e-06 [opt_after_jit_grad]: 0.00051364 [validate]: 4.242e-05 Sums bootstrap : 0.012479s : 39.15% type_inference : 0.013160s : 41.29% event_method : 0.000078s : 0.24% auto_monad : 0.000102s : 0.32% graph_reusing : 0.000007s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.09% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.03% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000042s : 0.13% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000036s : 0.11% optimize.rewriter_before_opt_a : 0.000121s : 0.38% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000139s : 0.44% optimize.opt_a.loop_unroll : 0.000054s : 0.17% optimize.opt_a.a_1 : 0.001143s : 3.59% optimize.opt_a.with_stream_mark : 0.000030s : 0.09% optimize.opt_a.recompute_prepare : 0.000021s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000290s : 0.91% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.06% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000018s : 0.06% optimize.opt_a.merge_send_recv : 0.000017s : 0.05% optimize.opt_a.auto_parallel : 0.000016s : 0.05% optimize.opt_a.parallel : 0.000024s : 0.07% optimize.opt_a.flash_sp : 0.000012s : 0.04% optimize.opt_a.merge_comm : 0.000011s : 0.03% optimize.opt_a.allreduce_fusion : 0.000010s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.06% optimize.opt_a.virtual_dataset : 0.000018s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.05% optimize.opt_a.virtual_output : 0.000017s : 0.05% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000020s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000040s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000029s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.04% optimize.opt_a.meta_fg_expand : 0.000008s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000027s : 0.08% optimize.opt_a.a_after_grad : 0.000028s : 0.09% optimize.opt_a.renormalize : 0.000877s : 2.75% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.09% optimize.opt_a.cse : 0.000065s : 0.20% optimize.opt_a.a_3 : 0.000175s : 0.55% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000048s : 0.15% optimize.convert_after_rewriter : 0.000011s : 0.04% optimize.order_py_execute_after_rewriter : 0.000010s : 0.03% optimize.mutable_eliminate : 0.000513s : 1.61% optimize.opt_b.b_1 : 0.000229s : 0.72% optimize.opt_b.b_2 : 0.000011s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000027s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.07% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000031s : 0.10% optimize.loop_unroll : 0.000448s : 1.41% optimize.opt_after_cconv.c_1 : 0.000045s : 0.14% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000026s : 0.08% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000039s : 0.12% optimize.tuple_transform.d_1 : 0.000062s : 0.19% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000062s : 0.20% optimize.cse_after_recomputation.cse : 0.000017s : 0.05% optimize.environ_conv : 0.000010s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000005s : 0.02% optimize.label_micro_interleaved_index : 0.000008s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000003s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000019s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000008s : 0.02% optimize.overlap_grad_flash_sp : 0.000025s : 0.08% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000026s : 0.08% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000514s : 1.61% validate : 0.000042s : 0.13% Time group info: ------[substitution.] 0.000254 54 11.83% : 0.000030s : 6: substitution.cast_eliminate 1.14% : 0.000003s : 4: substitution.elim_not_effective 0.83% : 0.000002s : 4: substitution.fold_const_symbol 2.81% : 0.000007s : 6: substitution.graph_param_transform 67.65% : 0.000172s : 8: substitution.inline 2.02% : 0.000005s : 8: substitution.j_node_and_user_rematch 2.89% : 0.000007s : 8: substitution.remove_not_recompute_node 1.73% : 0.000004s : 4: substitution.replace_old_param 3.99% : 0.000010s : 2: substitution.switch_simplify 5.11% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.013098 2 89.97% : 0.011785s : 1: type_inference.infer 10.03% : 0.001313s : 1: type_inference.specialize ------[replace.] 0.000117 14 43.14% : 0.000051s : 8: replace.inline 37.54% : 0.000044s : 2: replace.switch_simplify 19.33% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000187 14 89.34% : 0.000167s : 8: match.inline 4.74% : 0.000009s : 2: match.switch_simplify 5.92% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000309 1972 0.94% : 0.000003s : 21: predicate.accumulaten_eliminater 0.64% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.52% : 0.000002s : 12: predicate.addn_check_dump 0.97% : 0.000003s : 21: predicate.addn_zero_filter 0.90% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.10% : 0.000006s : 33: predicate.arithmetic_simplify 1.05% : 0.000003s : 21: predicate.cast_eliminate 0.62% : 0.000002s : 12: predicate.check_bprop_eliminate 0.54% : 0.000002s : 12: predicate.compare_switch_simplify 0.17% : 0.000001s : 6: predicate.const_output_eliminate 0.59% : 0.000002s : 12: predicate.depend_value_elim 0.99% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.95% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.80% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 6: predicate.elim_not_effective 0.37% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 27: predicate.environ_get_add_eliminate 1.21% : 0.000004s : 27: predicate.environ_get_depend_swap 1.74% : 0.000005s : 39: predicate.environ_get_eliminate 1.18% : 0.000004s : 27: predicate.environ_get_set_eliminate 1.52% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.50% : 0.000008s : 33: predicate.float_depend_g_call 0.53% : 0.000002s : 12: predicate.float_environ_get_switch 0.83% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.62% : 0.000002s : 12: predicate.get_grad_eliminate 0.21% : 0.000001s : 6: predicate.graph_param_transform 0.60% : 0.000002s : 12: predicate.incorporate_call 0.53% : 0.000002s : 12: predicate.incorporate_call_switch 6.64% : 0.000021s : 90: predicate.inline 0.89% : 0.000003s : 12: predicate.inline_without_move 0.30% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 12: predicate.less_batch_normalization 1.73% : 0.000005s : 37: predicate.list_to_tuple_eliminator_ 2.53% : 0.000008s : 58: predicate.load_eliminater 0.80% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.39% : 0.000007s : 51: predicate.loop_unroll_before_grad 1.57% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.56% : 0.000002s : 12: predicate.merge_addn 0.53% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.56% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.87% : 0.000003s : 21: predicate.minmaximum_grad 0.73% : 0.000002s : 6: predicate.mutable_eliminate 0.36% : 0.000001s : 6: predicate.opt_reshape 0.33% : 0.000001s : 6: predicate.parallel_virtual_node 1.83% : 0.000006s : 33: predicate.partial_defer_inline 1.65% : 0.000005s : 31: predicate.partial_eliminate 0.93% : 0.000003s : 21: predicate.print_const_string_wrapper 0.61% : 0.000002s : 12: predicate.reduce_all_const_elim 1.23% : 0.000004s : 21: predicate.reduce_eliminate 2.51% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 12: predicate.remove_not_recompute_node 1.30% : 0.000004s : 37: predicate.replace_applicator 0.46% : 0.000001s : 12: predicate.replace_old_param 0.21% : 0.000001s : 6: predicate.reset_defer_inline 0.99% : 0.000003s : 21: predicate.reshape_eliminate 0.62% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 6: predicate.row_tensor_eliminate 0.75% : 0.000002s : 12: predicate.same_eliminate 0.39% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.71% : 0.000002s : 12: predicate.shard_identity_eliminate 0.68% : 0.000002s : 12: predicate.special_op_eliminate 0.74% : 0.000002s : 12: predicate.specialize_transform 0.74% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.63% : 0.000005s : 33: predicate.switch_defer_inline 2.14% : 0.000007s : 45: predicate.switch_layer_defer_inline 5.57% : 0.000017s : 106: predicate.switch_simplify 0.93% : 0.000003s : 21: predicate.tile_eliminate 1.05% : 0.000003s : 21: predicate.transpose_eliminate 1.55% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.57% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 3.25% : 0.000010s : 49: predicate.tuple_list_get_item_eliminator 1.61% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.35% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.71% : 0.000005s : 37: predicate.tuple_to_list_eliminator_ 2.47% : 0.000008s : 58: predicate.updatestate_pure_node_eliminater 3.20% : 0.000010s : 70: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 6: predicate.value_based_eliminate 0.66% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.59% : 0.000002s : 12: predicate.virtual_output_eliminate 0.30% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000951 16 56.63% : 0.000538s : 6: func_graph_cloner_run.FuncGraphClonerGraph 43.37% : 0.000412s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.049536 192 0.01% : 0.000007s : 1: ForceFp32Comm 6.32% : 0.003129s : 1: add_attr 6.29% : 0.003115s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.13% : 0.000066s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.23% : 0.000114s : 1: auto_monad 0.07% : 0.000034s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 25.28% : 0.012523s : 1: bootstrap 0.07% : 0.000034s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.04% : 0.000022s : 1: control_data_broadcast_order 0.03% : 0.000015s : 1: convert_after_rewriter 0.07% : 0.000036s : 1: cse_after_recomputation 0.01% : 0.000007s : 1: dataset_repeat_opt 0.05% : 0.000022s : 1: detach_backward 0.03% : 0.000013s : 1: environ_conv 0.19% : 0.000092s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000007s : 1: get_jit_bprop_graph 0.03% : 0.000014s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 0.92% : 0.000454s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 1.05% : 0.000520s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.04% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000019s : 1: opt.transform.mutable_eliminate 3.75% : 0.001856s : 78: opt.transform.opt_a 0.09% : 0.000043s : 1: opt.transform.opt_after_cconv 0.07% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.34% : 0.000167s : 28: opt.transform.opt_b 0.14% : 0.000070s : 2: opt.transform.opt_trans_graph 0.11% : 0.000054s : 4: opt.transform.symbol_engine_opt 8.18% : 0.004053s : 1: opt_a 0.31% : 0.000155s : 1: opt_after_cconv 1.06% : 0.000524s : 1: opt_after_jit_grad 0.71% : 0.000350s : 1: opt_b 14.09% : 0.006979s : 1: optimize 0.05% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000012s : 1: order_py_execute_after_rewriter 0.06% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000007s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000008s : 1: pipeline_split 0.10% : 0.000050s : 1: pre_auto_parallel 0.08% : 0.000040s : 1: py_interpret_to_execute 0.04% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.09% : 0.000043s : 1: remove_dup_value 0.93% : 0.000461s : 1: renormalize.infer 0.82% : 0.000407s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000052s : 1: rewriter_after_opt_a 0.25% : 0.000125s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.24% : 0.000118s : 1: symbol_engine_optimizer 0.23% : 0.000115s : 1: tuple_transform 26.64% : 0.013199s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:05.454.942 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0353825, [21] [bootstrap]: 0.00046779 [type_inference]: 0.0130798 [event_method]: 7.808e-05 [auto_monad]: 0.00010044 [graph_reusing]: 8.28001e-06 [inline]: 2.65997e-06 [add_attr]: 0.00311682, [1] [add_attr_with_inline]: 0.00310754, [1] [Cycle 1]: 7.092e-05, [2] [tag_attr]: 2.902e-05 [meta_addattr_fg_expand]: 8.38001e-06 [parallel-infer-symbol]: 3.16001e-06 [pre_auto_parallel]: 4.494e-05 [insert-virtual-dataset]: 2.50002e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.44999e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.0175224, [53] [py_interpret_to_execute]: 3.297e-05 [rewriter_before_opt_a]: 0.0001186 [opt_a]: 0.0143625, [2] [Cycle 1]: 0.0133664, [45] [expand_dump_flag]: 4.81997e-06 [switch_simplify]: 0.00012478 [loop_unroll]: 4.361e-05 [a_1]: 0.00097229 [with_stream_mark]: 1.908e-05 [recompute_prepare]: 1.544e-05 [updatestate_depend_eliminate]: 5.54998e-06 [updatestate_assign_eliminate]: 4.08999e-06 [updatestate_loads_eliminate]: 4.65001e-06 [parameter_eliminate]: 2.12999e-06 [a_2]: 0.00012113 [accelerated_algorithm]: 1.017e-05 [shard]: 1.83002e-06 [meta_shard_fg_expand]: 2.46e-06 [shard_inline]: 9.75002e-06 [merge_send_recv]: 1.088e-05 [auto_parallel]: 8.20999e-06 [parallel]: 1.913e-05 [flash_sp]: 9.10001e-06 [merge_comm]: 5.99999e-06 [allreduce_fusion]: 5.04e-06 [matmul_add_comm_reduction]: 1.21e-05 [allreduce_slice_to_reducescatter]: 8.39995e-07 [virtual_shard_identity]: 1.285e-05 [virtual_dataset]: 9.96e-06 [get_grad_eliminate_]: 9.64e-06 [virtual_output]: 9.19e-06 [merge_forward]: 5.14998e-06 [cell_reuse_recompute_pass]: 1.32999e-06 [offload_activation]: 1.196e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.032e-05 [merge_recompute_call_nodes]: 1.69e-06 [before_grad]: 1.606e-05 [set_forward_comm_id_for_comm_node_pass]: 5.29e-06 [meta_fg_expand]: 4.69002e-06 [flash_sp_send_recv_attached]: 3.25998e-06 [receive_attached]: 2.79001e-06 [after_resolve]: 1.477e-05 [a_after_grad]: 1.555e-05 [renormalize]: 0.0113206 [add_forward_monad_depend]: 1.186e-05 [auto_monad_grad]: 2.36e-06 [auto_monad_eliminator]: 3.034e-05 [cse]: 5.126e-05 [a_3]: 8.879e-05 [Cycle 2]: 0.00098224, [45] [expand_dump_flag]: 2.47001e-06 [switch_simplify]: 1.371e-05 [loop_unroll]: 9.49e-06 [a_1]: 0.0002421 [with_stream_mark]: 2.222e-05 [recompute_prepare]: 9.67001e-06 [updatestate_depend_eliminate]: 5.33002e-06 [updatestate_assign_eliminate]: 4.37e-06 [updatestate_loads_eliminate]: 4.43999e-06 [parameter_eliminate]: 2.06998e-06 [a_2]: 0.00011648 [accelerated_algorithm]: 9.88998e-06 [shard]: 2.86e-06 [meta_shard_fg_expand]: 3.18e-06 [shard_inline]: 9.53002e-06 [merge_send_recv]: 1.051e-05 [auto_parallel]: 1.121e-05 [parallel]: 9.56e-06 [flash_sp]: 4.40999e-06 [merge_comm]: 5.27001e-06 [allreduce_fusion]: 5.55001e-06 [matmul_add_comm_reduction]: 1.26e-05 [allreduce_slice_to_reducescatter]: 9.70002e-07 [virtual_shard_identity]: 1.072e-05 [virtual_dataset]: 9.47001e-06 [get_grad_eliminate_]: 9.49e-06 [virtual_output]: 8.74003e-06 [merge_forward]: 5.54e-06 [cell_reuse_recompute_pass]: 3.58e-06 [offload_activation]: 1.248e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.915e-05 [merge_recompute_call_nodes]: 1.76003e-06 [before_grad]: 1.586e-05 [set_forward_comm_id_for_comm_node_pass]: 5.82999e-06 [meta_fg_expand]: 4.37003e-06 [flash_sp_send_recv_attached]: 1.92999e-06 [receive_attached]: 3.02002e-06 [after_resolve]: 1.6e-05 [a_after_grad]: 1.574e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.87001e-06 [auto_monad_grad]: 1.79e-06 [auto_monad_eliminator]: 1.396e-05 [cse]: 2.908e-05 [a_3]: 5.787e-05 [py_interpret_to_execute_after_opt_a]: 2.123e-05 [slice_cell_reuse_recomputed_activation]: 1.90001e-06 [rewriter_after_opt_a]: 5.539e-05 [convert_after_rewriter]: 8.02998e-06 [order_py_execute_after_rewriter]: 7.43e-06 [mutable_eliminate]: 0.0008492 [opt_b]: 0.00032916, [1] [Cycle 1]: 0.00031993, [7] [b_1]: 0.00020528 [b_2]: 1.21e-05 [updatestate_depend_eliminate]: 1.154e-05 [updatestate_assign_eliminate]: 3.76999e-06 [updatestate_loads_eliminate]: 5.52999e-06 [renormalize]: 9.60019e-07 [cse]: 4.056e-05 [optimize_parallel_all_gather_comm]: 2.419e-05 [overlap_param_gather]: 2.37999e-06 [cconv]: 3.618e-05 [loop_unroll]: 0.00071867 [opt_after_cconv]: 0.00015485, [1] [Cycle 1]: 0.00014608, [7] [c_1]: 4.934e-05 [parameter_eliminate]: 4.45e-06 [updatestate_depend_eliminate]: 8.92e-06 [updatestate_assign_eliminate]: 3.63999e-06 [updatestate_loads_eliminate]: 3.86999e-06 [cse]: 3.739e-05 [renormalize]: 5.3001e-07 [remove_dup_value]: 5.136e-05 [tuple_transform]: 0.00010882, [1] [Cycle 1]: 0.00010412, [4] [d_1]: 7.01e-05 [none_parameter_eliminate]: 2.01998e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 1.033e-05 [partial_unused_args_eliminate]: 1.99e-06 [add_recomputation]: 7.196e-05 [cse_after_recomputation]: 3.087e-05, [1] [Cycle 1]: 2.546e-05, [1] [cse]: 2.003e-05 [environ_conv]: 7.58001e-06 [swap_dp_allreduce_reducescatter]: 7.21001e-06 [bias_add_comm_swap]: 3.04001e-06 [label_micro_interleaved_index]: 5.84e-06 [label_fine_grained_interleaved_index]: 2.89001e-06 [merge_cast_opt]: 1.65001e-06 [slice_recompute_activation]: 2.07999e-06 [micro_interleaved_order_control]: 2.51e-06 [assign_add_opt]: 1.56998e-06 [ForceFp32Comm]: 9.09989e-07 [remove_cast_before_assign_add]: 1.16002e-06 [full_micro_interleaved_order_control]: 2.76e-06 [reorder_send_recv_between_fp_bp]: 2.82002e-06 [comm_op_add_attrs]: 9.79984e-07 [add_comm_op_reuse_tag]: 1.04e-06 [interleave_split_concat_branches]: 1.12999e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.49e-06 [overlap_opt_shard_grad_in_pipeline]: 2.11998e-06 [control_data_broadcast_order]: 1.966e-05 [grouped_pairwise_exchange_alltoall]: 1.64998e-06 [offloading_packed_experts]: 5.05999e-06 [overlap_recompute_and_grad_model_parallel]: 5.84999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.24e-06 [overlap_recompute_allgather_and_fa_grad]: 1.46998e-06 [overlap_recompute_comm]: 2.74001e-06 [overlap_grad_ring_attention]: 4.97999e-06 [overlap_grad_flash_sp]: 2.822e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 2.73e-06 [split_layernorm_comm]: 1.91e-06 [handle_group_info]: 9.30013e-07 [symbol_engine_optimizer]: 0.00010788, [1] [Cycle 1]: 0.0001022, [6] [build]: 5.19e-06 [elim_shapecalc]: 1.733e-05 [elim_not_effective]: 2.097e-05 [opt_reshape]: 1.11e-05 [fold_const_symbol]: 1.578e-05 [renormalize]: 2.3999e-07 [detach_backward]: 2.59001e-06 [pipeline_parallel_scheduler]: 1.97999e-06 [auto_monad_reorder]: 2.351e-05 [get_jit_bprop_graph]: 1.87001e-06 [rewriter_after_jit_bprop_graph]: 7.18998e-06 [opt_after_jit_grad]: 0.00069087 [validate]: 5.557e-05 Sums bootstrap : 0.000468s : 1.50% type_inference : 0.013080s : 42.01% event_method : 0.000078s : 0.25% auto_monad : 0.000100s : 0.32% graph_reusing : 0.000008s : 0.03% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000029s : 0.09% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.03% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000045s : 0.14% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.11% optimize.rewriter_before_opt_a : 0.000119s : 0.38% optimize.opt_a.expand_dump_flag : 0.000007s : 0.02% optimize.opt_a.switch_simplify : 0.000138s : 0.44% optimize.opt_a.loop_unroll : 0.000053s : 0.17% optimize.opt_a.a_1 : 0.001214s : 3.90% optimize.opt_a.with_stream_mark : 0.000041s : 0.13% optimize.opt_a.recompute_prepare : 0.000025s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000238s : 0.76% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.06% optimize.opt_a.shard : 0.000005s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.02% optimize.opt_a.shard_inline : 0.000019s : 0.06% optimize.opt_a.merge_send_recv : 0.000021s : 0.07% optimize.opt_a.auto_parallel : 0.000019s : 0.06% optimize.opt_a.parallel : 0.000029s : 0.09% optimize.opt_a.flash_sp : 0.000014s : 0.04% optimize.opt_a.merge_comm : 0.000011s : 0.04% optimize.opt_a.allreduce_fusion : 0.000011s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.08% optimize.opt_a.virtual_dataset : 0.000019s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.06% optimize.opt_a.virtual_output : 0.000018s : 0.06% optimize.opt_a.merge_forward : 0.000011s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000024s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.13% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000032s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.04% optimize.opt_a.meta_fg_expand : 0.000009s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000006s : 0.02% optimize.opt_a.after_resolve : 0.000031s : 0.10% optimize.opt_a.a_after_grad : 0.000031s : 0.10% optimize.opt_a.renormalize : 0.011321s : 36.36% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000044s : 0.14% optimize.opt_a.cse : 0.000080s : 0.26% optimize.opt_a.a_3 : 0.000147s : 0.47% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000055s : 0.18% optimize.convert_after_rewriter : 0.000008s : 0.03% optimize.order_py_execute_after_rewriter : 0.000007s : 0.02% optimize.mutable_eliminate : 0.000849s : 2.73% optimize.opt_b.b_1 : 0.000205s : 0.66% optimize.opt_b.b_2 : 0.000012s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000041s : 0.13% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.08% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000036s : 0.12% optimize.loop_unroll : 0.000719s : 2.31% optimize.opt_after_cconv.c_1 : 0.000049s : 0.16% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000037s : 0.12% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000051s : 0.16% optimize.tuple_transform.d_1 : 0.000070s : 0.23% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000072s : 0.23% optimize.cse_after_recomputation.cse : 0.000020s : 0.06% optimize.environ_conv : 0.000008s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000006s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000020s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000028s : 0.09% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000021s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.08% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.02% opt_after_jit_grad : 0.000691s : 2.22% validate : 0.000056s : 0.18% Time group info: ------[substitution.] 0.000282 54 13.81% : 0.000039s : 6: substitution.cast_eliminate 1.13% : 0.000003s : 4: substitution.elim_not_effective 0.83% : 0.000002s : 4: substitution.fold_const_symbol 3.15% : 0.000009s : 6: substitution.graph_param_transform 65.45% : 0.000184s : 8: substitution.inline 2.33% : 0.000007s : 8: substitution.j_node_and_user_rematch 2.86% : 0.000008s : 8: substitution.remove_not_recompute_node 2.36% : 0.000007s : 4: substitution.replace_old_param 3.45% : 0.000010s : 2: substitution.switch_simplify 4.63% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.013012 2 89.87% : 0.011694s : 1: type_inference.infer 10.13% : 0.001318s : 1: type_inference.specialize ------[replace.] 0.000123 14 44.50% : 0.000055s : 8: replace.inline 35.76% : 0.000044s : 2: replace.switch_simplify 19.74% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000199 14 90.20% : 0.000180s : 8: match.inline 4.27% : 0.000008s : 2: match.switch_simplify 5.53% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000333 1972 1.02% : 0.000003s : 21: predicate.accumulaten_eliminater 0.89% : 0.000003s : 6: predicate.ad_related_special_op_eliminate 0.48% : 0.000002s : 12: predicate.addn_check_dump 0.90% : 0.000003s : 21: predicate.addn_zero_filter 0.81% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.11% : 0.000007s : 33: predicate.arithmetic_simplify 1.20% : 0.000004s : 21: predicate.cast_eliminate 0.73% : 0.000002s : 12: predicate.check_bprop_eliminate 0.47% : 0.000002s : 12: predicate.compare_switch_simplify 0.17% : 0.000001s : 6: predicate.const_output_eliminate 0.60% : 0.000002s : 12: predicate.depend_value_elim 0.96% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.88% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.83% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 6: predicate.elim_not_effective 0.36% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.08% : 0.000004s : 27: predicate.environ_get_add_eliminate 1.07% : 0.000004s : 27: predicate.environ_get_depend_swap 1.61% : 0.000005s : 39: predicate.environ_get_eliminate 1.22% : 0.000004s : 27: predicate.environ_get_set_eliminate 1.43% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.26% : 0.000008s : 33: predicate.float_depend_g_call 0.48% : 0.000002s : 12: predicate.float_environ_get_switch 0.84% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 6: predicate.fold_const_symbol 0.67% : 0.000002s : 12: predicate.get_grad_eliminate 0.19% : 0.000001s : 6: predicate.graph_param_transform 0.58% : 0.000002s : 12: predicate.incorporate_call 0.48% : 0.000002s : 12: predicate.incorporate_call_switch 6.30% : 0.000021s : 90: predicate.inline 0.80% : 0.000003s : 12: predicate.inline_without_move 0.28% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.73% : 0.000002s : 12: predicate.less_batch_normalization 2.03% : 0.000007s : 37: predicate.list_to_tuple_eliminator_ 2.54% : 0.000008s : 58: predicate.load_eliminater 1.41% : 0.000005s : 6: predicate.loop_unroll_after_grad 2.37% : 0.000008s : 51: predicate.loop_unroll_before_grad 1.54% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.56% : 0.000002s : 12: predicate.merge_addn 0.52% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.54% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.83% : 0.000003s : 21: predicate.minmaximum_grad 1.17% : 0.000004s : 6: predicate.mutable_eliminate 0.42% : 0.000001s : 6: predicate.opt_reshape 0.33% : 0.000001s : 6: predicate.parallel_virtual_node 1.74% : 0.000006s : 33: predicate.partial_defer_inline 1.56% : 0.000005s : 31: predicate.partial_eliminate 0.95% : 0.000003s : 21: predicate.print_const_string_wrapper 0.55% : 0.000002s : 12: predicate.reduce_all_const_elim 1.18% : 0.000004s : 21: predicate.reduce_eliminate 2.51% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 12: predicate.remove_not_recompute_node 1.31% : 0.000004s : 37: predicate.replace_applicator 0.47% : 0.000002s : 12: predicate.replace_old_param 0.27% : 0.000001s : 6: predicate.reset_defer_inline 1.07% : 0.000004s : 21: predicate.reshape_eliminate 0.60% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.48% : 0.000002s : 6: predicate.row_tensor_eliminate 0.78% : 0.000003s : 12: predicate.same_eliminate 0.51% : 0.000002s : 12: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 12: predicate.shard_identity_eliminate 0.67% : 0.000002s : 12: predicate.special_op_eliminate 0.72% : 0.000002s : 12: predicate.specialize_transform 1.05% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000003s : 12: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.51% : 0.000005s : 33: predicate.switch_defer_inline 1.98% : 0.000007s : 45: predicate.switch_layer_defer_inline 5.24% : 0.000017s : 106: predicate.switch_simplify 0.96% : 0.000003s : 21: predicate.tile_eliminate 1.01% : 0.000003s : 21: predicate.transpose_eliminate 1.55% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.81% : 0.000006s : 33: predicate.tuple_list_get_item_const_eliminator 1.60% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 2.94% : 0.000010s : 49: predicate.tuple_list_get_item_eliminator 1.46% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.49% : 0.000008s : 45: predicate.tuple_list_set_item_eliminator 1.81% : 0.000006s : 37: predicate.tuple_to_list_eliminator_ 2.33% : 0.000008s : 58: predicate.updatestate_pure_node_eliminater 3.02% : 0.000010s : 70: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 6: predicate.value_based_eliminate 0.62% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.55% : 0.000002s : 12: predicate.virtual_output_eliminate 0.27% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001072 16 51.25% : 0.000549s : 6: func_graph_cloner_run.FuncGraphClonerGraph 48.75% : 0.000523s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.069586 192 0.01% : 0.000005s : 1: ForceFp32Comm 4.49% : 0.003121s : 1: add_attr 4.47% : 0.003111s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.11% : 0.000076s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.15% : 0.000108s : 1: auto_monad 0.04% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.07% : 0.000047s : 1: bias_add_comm_swap 0.72% : 0.000498s : 1: bootstrap 0.06% : 0.000040s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000023s : 1: control_data_broadcast_order 0.02% : 0.000012s : 1: convert_after_rewriter 0.05% : 0.000035s : 1: cse_after_recomputation 0.01% : 0.000006s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.12% : 0.000087s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000012s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000009s : 1: label_micro_interleaved_index 1.05% : 0.000733s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 1.24% : 0.000866s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.04% : 0.000026s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000029s : 1: opt.transform.mutable_eliminate 2.81% : 0.001956s : 78: opt.transform.opt_a 0.07% : 0.000048s : 1: opt.transform.opt_after_cconv 0.06% : 0.000040s : 1: opt.transform.opt_after_jit_grad 0.26% : 0.000178s : 28: opt.transform.opt_b 0.11% : 0.000078s : 2: opt.transform.opt_trans_graph 0.09% : 0.000061s : 4: opt.transform.symbol_engine_opt 20.65% : 0.014366s : 1: opt_a 0.23% : 0.000159s : 1: opt_after_cconv 1.01% : 0.000704s : 1: opt_after_jit_grad 0.48% : 0.000333s : 1: opt_b 25.19% : 0.017529s : 1: optimize 0.04% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000010s : 1: order_py_execute_after_rewriter 0.05% : 0.000032s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000007s : 1: overlap_recompute_comm 0.01% : 0.000006s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000006s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.07% : 0.000050s : 1: pre_auto_parallel 0.05% : 0.000037s : 1: py_interpret_to_execute 0.04% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000056s : 1: remove_dup_value 15.17% : 0.010556s : 1: renormalize.infer 1.07% : 0.000746s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000060s : 1: rewriter_after_opt_a 0.18% : 0.000122s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.16% : 0.000111s : 1: symbol_engine_optimizer 0.16% : 0.000113s : 1: tuple_transform 18.82% : 0.013098s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:05.883.997 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:05.884.259 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.061888, [21] [bootstrap]: 0.00043787 [type_inference]: 0.0377915 [event_method]: 8.613e-05 [auto_monad]: 0.00010336 [graph_reusing]: 7.52998e-06 [inline]: 2.99001e-06 [add_attr]: 0.00319474, [1] [add_attr_with_inline]: 0.00318559, [1] [Cycle 1]: 8.559e-05, [2] [tag_attr]: 2.749e-05 [meta_addattr_fg_expand]: 8.18001e-06 [parallel-infer-symbol]: 3.06999e-06 [pre_auto_parallel]: 4.749e-05 [insert-virtual-dataset]: 2.48998e-06 [parallel-infer-symbol-second]: 8.89995e-07 [dataset_repeat_opt]: 2.04999e-06 [pipeline_split]: 1.74e-06 [optimize]: 0.0189191, [53] [py_interpret_to_execute]: 3.826e-05 [rewriter_before_opt_a]: 0.00012406 [opt_a]: 0.0161527, [2] [Cycle 1]: 0.00294145, [45] [expand_dump_flag]: 4e-06 [switch_simplify]: 0.00013018 [loop_unroll]: 4.454e-05 [a_1]: 0.00090135 [with_stream_mark]: 1.875e-05 [recompute_prepare]: 1.338e-05 [updatestate_depend_eliminate]: 4.77998e-06 [updatestate_assign_eliminate]: 4.23001e-06 [updatestate_loads_eliminate]: 3.4e-06 [parameter_eliminate]: 1.87001e-06 [a_2]: 0.00013376 [accelerated_algorithm]: 9.13002e-06 [shard]: 1.91e-06 [meta_shard_fg_expand]: 2.67001e-06 [shard_inline]: 7.90998e-06 [merge_send_recv]: 9.37001e-06 [auto_parallel]: 8.13999e-06 [parallel]: 1.919e-05 [flash_sp]: 9.32001e-06 [merge_comm]: 4.53001e-06 [allreduce_fusion]: 4.2e-06 [matmul_add_comm_reduction]: 1.149e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 9.98002e-06 [virtual_dataset]: 8.41002e-06 [get_grad_eliminate_]: 7.57998e-06 [virtual_output]: 7.8e-06 [merge_forward]: 4.32e-06 [cell_reuse_recompute_pass]: 1.27999e-06 [offload_activation]: 1.12e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.851e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.391e-05 [set_forward_comm_id_for_comm_node_pass]: 4.52e-06 [meta_fg_expand]: 3.73999e-06 [flash_sp_send_recv_attached]: 2.83e-06 [receive_attached]: 2.17999e-06 [after_resolve]: 1.363e-05 [a_after_grad]: 1.209e-05 [renormalize]: 0.00092301 [add_forward_monad_depend]: 6.56e-06 [auto_monad_grad]: 2.17001e-06 [auto_monad_eliminator]: 1.81e-05 [cse]: 3.648e-05 [a_3]: 7.575e-05 [Cycle 2]: 0.0131951, [45] [expand_dump_flag]: 1.07998e-06 [switch_simplify]: 1.029e-05 [loop_unroll]: 7.83999e-06 [a_1]: 0.0122624 [with_stream_mark]: 3.5e-05 [recompute_prepare]: 1.389e-05 [updatestate_depend_eliminate]: 6.12001e-06 [updatestate_assign_eliminate]: 3.93001e-06 [updatestate_loads_eliminate]: 3.71001e-06 [parameter_eliminate]: 2.49001e-06 [a_2]: 0.00013056 [accelerated_algorithm]: 8.53001e-06 [shard]: 2.79999e-06 [meta_shard_fg_expand]: 3.32002e-06 [shard_inline]: 8.79e-06 [merge_send_recv]: 1.137e-05 [auto_parallel]: 1.028e-05 [parallel]: 9.96e-06 [flash_sp]: 4.03999e-06 [merge_comm]: 4.63999e-06 [allreduce_fusion]: 4.25999e-06 [matmul_add_comm_reduction]: 1.289e-05 [allreduce_slice_to_reducescatter]: 1.17e-06 [virtual_shard_identity]: 9.26998e-06 [virtual_dataset]: 7.91001e-06 [get_grad_eliminate_]: 7.31001e-06 [virtual_output]: 7.2e-06 [merge_forward]: 5.25999e-06 [cell_reuse_recompute_pass]: 3.08998e-06 [offload_activation]: 1.173e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.788e-05 [merge_recompute_call_nodes]: 1.80001e-06 [before_grad]: 1.349e-05 [set_forward_comm_id_for_comm_node_pass]: 6.18998e-06 [meta_fg_expand]: 3.46001e-06 [flash_sp_send_recv_attached]: 1.87999e-06 [receive_attached]: 2.69001e-06 [after_resolve]: 1.517e-05 [a_after_grad]: 1.179e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 3.21001e-06 [auto_monad_grad]: 2.59999e-06 [auto_monad_eliminator]: 1.743e-05 [cse]: 3.846e-05 [a_3]: 6.233e-05 [py_interpret_to_execute_after_opt_a]: 2.305e-05 [slice_cell_reuse_recomputed_activation]: 4.70001e-06 [rewriter_after_opt_a]: 5.267e-05 [convert_after_rewriter]: 1.122e-05 [order_py_execute_after_rewriter]: 9.64999e-06 [mutable_eliminate]: 0.00075375 [opt_b]: 0.00031962, [1] [Cycle 1]: 0.00030912, [7] [b_1]: 0.00020419 [b_2]: 9.51e-06 [updatestate_depend_eliminate]: 6.86001e-06 [updatestate_assign_eliminate]: 3.21999e-06 [updatestate_loads_eliminate]: 4.09002e-06 [renormalize]: 5.59987e-07 [cse]: 2.441e-05 [optimize_parallel_all_gather_comm]: 2.339e-05 [overlap_param_gather]: 5.43002e-06 [cconv]: 3.727e-05 [loop_unroll]: 0.0004398 [opt_after_cconv]: 0.0001431, [1] [Cycle 1]: 0.0001332, [7] [c_1]: 3.943e-05 [parameter_eliminate]: 3.56999e-06 [updatestate_depend_eliminate]: 6.55002e-06 [updatestate_assign_eliminate]: 3.03e-06 [updatestate_loads_eliminate]: 2.85002e-06 [cse]: 2.2e-05 [renormalize]: 4.10015e-07 [remove_dup_value]: 1.954e-05 [tuple_transform]: 0.00010297, [1] [Cycle 1]: 9.581e-05, [4] [d_1]: 5.459e-05 [none_parameter_eliminate]: 1.76998e-06 [renormalize]: 2.20025e-07 [switch_simplify]: 8.69e-06 [partial_unused_args_eliminate]: 5.40001e-06 [add_recomputation]: 6.072e-05 [cse_after_recomputation]: 3.356e-05, [1] [Cycle 1]: 2.55e-05, [1] [cse]: 1.604e-05 [environ_conv]: 1.147e-05 [swap_dp_allreduce_reducescatter]: 8.62e-06 [bias_add_comm_swap]: 5.10001e-06 [label_micro_interleaved_index]: 7.03998e-06 [label_fine_grained_interleaved_index]: 4.92e-06 [merge_cast_opt]: 3.85e-06 [slice_recompute_activation]: 4.18001e-06 [micro_interleaved_order_control]: 4.90001e-06 [assign_add_opt]: 3.46999e-06 [ForceFp32Comm]: 3.21999e-06 [remove_cast_before_assign_add]: 3.76999e-06 [full_micro_interleaved_order_control]: 4.61002e-06 [reorder_send_recv_between_fp_bp]: 5.29e-06 [comm_op_add_attrs]: 3.27997e-06 [add_comm_op_reuse_tag]: 3.29001e-06 [interleave_split_concat_branches]: 4.4e-06 [interleave_parallel_branches]: 3.43e-06 [overlap_opt_shard_in_pipeline]: 3.93999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.07e-06 [control_data_broadcast_order]: 2e-05 [grouped_pairwise_exchange_alltoall]: 3.95e-06 [offloading_packed_experts]: 7.46001e-06 [overlap_recompute_and_grad_model_parallel]: 7.44002e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.37998e-06 [overlap_recompute_allgather_and_fa_grad]: 4.26001e-06 [overlap_recompute_comm]: 5.08002e-06 [overlap_grad_ring_attention]: 7.24001e-06 [overlap_grad_flash_sp]: 2.642e-05 [begin_end_overlap_inline]: 3.26999e-06 [split_matmul_comm_elemetwise]: 4.34997e-06 [split_layernorm_comm]: 4.18999e-06 [handle_group_info]: 3.45e-06 [symbol_engine_optimizer]: 0.00011157, [1] [Cycle 1]: 0.00010459, [6] [build]: 3.85998e-06 [elim_shapecalc]: 1.217e-05 [elim_not_effective]: 1.673e-05 [opt_reshape]: 9.15001e-06 [fold_const_symbol]: 1.332e-05 [renormalize]: 2.50002e-07 [detach_backward]: 3.71001e-06 [pipeline_parallel_scheduler]: 1.74e-06 [auto_monad_reorder]: 2.47e-05 [get_jit_bprop_graph]: 1.92999e-06 [rewriter_after_jit_bprop_graph]: 5.09e-06 [opt_after_jit_grad]: 0.000507 [validate]: 4.596e-05 Sums bootstrap : 0.000438s : 0.77% type_inference : 0.037792s : 66.62% event_method : 0.000086s : 0.15% auto_monad : 0.000103s : 0.18% graph_reusing : 0.000008s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000047s : 0.08% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000038s : 0.07% optimize.rewriter_before_opt_a : 0.000124s : 0.22% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000140s : 0.25% optimize.opt_a.loop_unroll : 0.000052s : 0.09% optimize.opt_a.a_1 : 0.013164s : 23.21% optimize.opt_a.with_stream_mark : 0.000054s : 0.09% optimize.opt_a.recompute_prepare : 0.000027s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000264s : 0.47% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.03% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.01% optimize.opt_a.shard_inline : 0.000017s : 0.03% optimize.opt_a.merge_send_recv : 0.000021s : 0.04% optimize.opt_a.auto_parallel : 0.000018s : 0.03% optimize.opt_a.parallel : 0.000029s : 0.05% optimize.opt_a.flash_sp : 0.000013s : 0.02% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.03% optimize.opt_a.virtual_dataset : 0.000016s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.03% optimize.opt_a.virtual_output : 0.000015s : 0.03% optimize.opt_a.merge_forward : 0.000010s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.02% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000029s : 0.05% optimize.opt_a.a_after_grad : 0.000024s : 0.04% optimize.opt_a.renormalize : 0.000923s : 1.63% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.02% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000036s : 0.06% optimize.opt_a.cse : 0.000075s : 0.13% optimize.opt_a.a_3 : 0.000138s : 0.24% optimize.py_interpret_to_execute_after_opt_a : 0.000023s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000053s : 0.09% optimize.convert_after_rewriter : 0.000011s : 0.02% optimize.order_py_execute_after_rewriter : 0.000010s : 0.02% optimize.mutable_eliminate : 0.000754s : 1.33% optimize.opt_b.b_1 : 0.000204s : 0.36% optimize.opt_b.b_2 : 0.000010s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.04% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000037s : 0.07% optimize.loop_unroll : 0.000440s : 0.78% optimize.opt_after_cconv.c_1 : 0.000039s : 0.07% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000022s : 0.04% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.03% optimize.tuple_transform.d_1 : 0.000055s : 0.10% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.02% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000061s : 0.11% optimize.cse_after_recomputation.cse : 0.000016s : 0.03% optimize.environ_conv : 0.000011s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.02% optimize.bias_add_comm_swap : 0.000005s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000004s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000003s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000020s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000026s : 0.05% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000025s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000507s : 0.89% validate : 0.000046s : 0.08% Time group info: ------[substitution.] 0.000272 44 14.04% : 0.000038s : 3: substitution.cast_eliminate 0.87% : 0.000002s : 3: substitution.elim_not_effective 0.77% : 0.000002s : 3: substitution.fold_const_symbol 2.55% : 0.000007s : 5: substitution.graph_param_transform 65.89% : 0.000179s : 8: substitution.inline 2.05% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.34% : 0.000006s : 6: substitution.remove_not_recompute_node 2.76% : 0.000008s : 4: substitution.replace_old_param 3.85% : 0.000010s : 2: substitution.switch_simplify 4.88% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.037723 2 96.29% : 0.036322s : 1: type_inference.infer 3.71% : 0.001401s : 1: type_inference.specialize ------[replace.] 0.000126 14 42.42% : 0.000053s : 8: replace.inline 38.09% : 0.000048s : 2: replace.switch_simplify 19.49% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000195 14 89.47% : 0.000175s : 8: match.inline 4.68% : 0.000009s : 2: match.switch_simplify 5.84% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000289 1746 0.98% : 0.000003s : 19: predicate.accumulaten_eliminater 0.57% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 10: predicate.addn_check_dump 1.00% : 0.000003s : 19: predicate.addn_zero_filter 0.90% : 0.000003s : 19: predicate.adjust_all_reduce_mul_add 2.22% : 0.000006s : 29: predicate.arithmetic_simplify 1.24% : 0.000004s : 19: predicate.cast_eliminate 0.54% : 0.000002s : 10: predicate.check_bprop_eliminate 0.52% : 0.000002s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.56% : 0.000002s : 10: predicate.depend_value_elim 0.95% : 0.000003s : 19: predicate.dict_get_item_const_eliminator 1.03% : 0.000003s : 19: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 19: predicate.dict_set_item_eliminator 0.86% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 5: predicate.elim_not_effective 0.34% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000004s : 24: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 24: predicate.environ_get_add_eliminate 2.37% : 0.000007s : 24: predicate.environ_get_depend_swap 1.69% : 0.000005s : 34: predicate.environ_get_eliminate 1.11% : 0.000003s : 24: predicate.environ_get_set_eliminate 1.54% : 0.000004s : 31: predicate.exchange_switch_depend_value 2.46% : 0.000007s : 31: predicate.float_depend_g_call 0.47% : 0.000001s : 10: predicate.float_environ_get_switch 0.73% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.60% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.54% : 0.000002s : 10: predicate.incorporate_call 0.45% : 0.000001s : 10: predicate.incorporate_call_switch 7.16% : 0.000021s : 80: predicate.inline 0.65% : 0.000002s : 10: predicate.inline_without_move 0.28% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.69% : 0.000002s : 10: predicate.less_batch_normalization 1.62% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.52% : 0.000007s : 52: predicate.load_eliminater 0.72% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.54% : 0.000007s : 49: predicate.loop_unroll_before_grad 1.62% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.54% : 0.000002s : 10: predicate.merge_addn 0.51% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.99% : 0.000003s : 19: predicate.minmaximum_grad 0.76% : 0.000002s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.40% : 0.000001s : 5: predicate.parallel_virtual_node 1.92% : 0.000006s : 31: predicate.partial_defer_inline 1.60% : 0.000005s : 28: predicate.partial_eliminate 0.96% : 0.000003s : 19: predicate.print_const_string_wrapper 0.55% : 0.000002s : 10: predicate.reduce_all_const_elim 1.53% : 0.000004s : 19: predicate.reduce_eliminate 2.42% : 0.000007s : 52: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 10: predicate.remove_not_recompute_node 1.32% : 0.000004s : 33: predicate.replace_applicator 0.45% : 0.000001s : 10: predicate.replace_old_param 0.29% : 0.000001s : 5: predicate.reset_defer_inline 0.92% : 0.000003s : 19: predicate.reshape_eliminate 0.54% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 5: predicate.row_tensor_eliminate 0.82% : 0.000002s : 10: predicate.same_eliminate 0.47% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.70% : 0.000002s : 10: predicate.shard_identity_eliminate 0.64% : 0.000002s : 10: predicate.special_op_eliminate 0.79% : 0.000002s : 10: predicate.specialize_transform 0.77% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.67% : 0.000005s : 31: predicate.switch_defer_inline 2.05% : 0.000006s : 41: predicate.switch_layer_defer_inline 5.82% : 0.000017s : 99: predicate.switch_simplify 0.91% : 0.000003s : 19: predicate.tile_eliminate 1.17% : 0.000003s : 19: predicate.transpose_eliminate 1.47% : 0.000004s : 29: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000004s : 29: predicate.tuple_list_get_item_const_eliminator 1.34% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 2.85% : 0.000008s : 43: predicate.tuple_list_get_item_eliminator 1.48% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000007s : 39: predicate.tuple_list_set_item_eliminator 1.64% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.38% : 0.000007s : 52: predicate.updatestate_pure_node_eliminater 2.97% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 5: predicate.value_based_eliminate 0.60% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.55% : 0.000002s : 10: predicate.virtual_output_eliminate 0.24% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.33% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001008 16 57.07% : 0.000575s : 6: func_graph_cloner_run.FuncGraphClonerGraph 42.93% : 0.000433s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.098914 192 0.01% : 0.000006s : 1: ForceFp32Comm 3.24% : 0.003204s : 1: add_attr 3.22% : 0.003190s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.07% : 0.000065s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.12% : 0.000115s : 1: auto_monad 0.03% : 0.000033s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.49% : 0.000482s : 1: bootstrap 0.04% : 0.000040s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.02% : 0.000023s : 1: control_data_broadcast_order 0.01% : 0.000014s : 1: convert_after_rewriter 0.04% : 0.000037s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000023s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.10% : 0.000100s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000014s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.45% : 0.000445s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.77% : 0.000760s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000020s : 1: opt.transform.mutable_eliminate 13.97% : 0.013814s : 78: opt.transform.opt_a 0.04% : 0.000038s : 1: opt.transform.opt_after_cconv 0.03% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.14% : 0.000141s : 28: opt.transform.opt_b 0.06% : 0.000060s : 2: opt.transform.opt_trans_graph 0.05% : 0.000048s : 4: opt.transform.symbol_engine_opt 16.33% : 0.016157s : 1: opt_a 0.15% : 0.000147s : 1: opt_after_cconv 0.52% : 0.000517s : 1: opt_after_jit_grad 0.33% : 0.000323s : 1: opt_b 19.57% : 0.019354s : 1: optimize 0.03% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000013s : 1: order_py_execute_after_rewriter 0.03% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.06% : 0.000056s : 1: pre_auto_parallel 0.04% : 0.000043s : 1: py_interpret_to_execute 0.03% : 0.000026s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.02% : 0.000023s : 1: remove_dup_value 0.49% : 0.000487s : 1: renormalize.infer 0.43% : 0.000427s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000057s : 1: rewriter_after_opt_a 0.13% : 0.000128s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.12% : 0.000115s : 1: symbol_engine_optimizer 0.11% : 0.000106s : 1: tuple_transform 38.25% : 0.037833s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:06.315.027 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0437883, [21] [bootstrap]: 0.00044328 [type_inference]: 0.0312047 [event_method]: 0.00013007 [auto_monad]: 0.00011671 [graph_reusing]: 7.34002e-06 [inline]: 2.53e-06 [add_attr]: 0.00410368, [1] [add_attr_with_inline]: 0.00409004, [1] [Cycle 1]: 9.308e-05, [2] [tag_attr]: 3.708e-05 [meta_addattr_fg_expand]: 8.58001e-06 [parallel-infer-symbol]: 4.13001e-06 [pre_auto_parallel]: 5.359e-05 [insert-virtual-dataset]: 3.24001e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 2.12001e-06 [pipeline_split]: 2.00002e-06 [optimize]: 0.00690294, [53] [py_interpret_to_execute]: 4.226e-05 [rewriter_before_opt_a]: 0.00013112 [opt_a]: 0.00416619, [2] [Cycle 1]: 0.00330145, [45] [expand_dump_flag]: 4.68999e-06 [switch_simplify]: 0.00014476 [loop_unroll]: 4.284e-05 [a_1]: 0.0009631 [with_stream_mark]: 2.266e-05 [recompute_prepare]: 1.329e-05 [updatestate_depend_eliminate]: 5.75001e-06 [updatestate_assign_eliminate]: 3.76001e-06 [updatestate_loads_eliminate]: 3.48999e-06 [parameter_eliminate]: 2.89001e-06 [a_2]: 0.00010848 [accelerated_algorithm]: 9.49e-06 [shard]: 2.36e-06 [meta_shard_fg_expand]: 2.91e-06 [shard_inline]: 8.72e-06 [merge_send_recv]: 1.101e-05 [auto_parallel]: 8.76002e-06 [parallel]: 2.011e-05 [flash_sp]: 9.57999e-06 [merge_comm]: 4.82e-06 [allreduce_fusion]: 4.35999e-06 [matmul_add_comm_reduction]: 1.145e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 1.088e-05 [virtual_dataset]: 7.96001e-06 [get_grad_eliminate_]: 7.83001e-06 [virtual_output]: 8.27e-06 [merge_forward]: 4.52e-06 [cell_reuse_recompute_pass]: 1.77001e-06 [offload_activation]: 1.161e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.825e-05 [merge_recompute_call_nodes]: 1.52999e-06 [before_grad]: 1.443e-05 [set_forward_comm_id_for_comm_node_pass]: 4.74e-06 [meta_fg_expand]: 4.32e-06 [flash_sp_send_recv_attached]: 3.03998e-06 [receive_attached]: 2.11998e-06 [after_resolve]: 1.52e-05 [a_after_grad]: 1.255e-05 [renormalize]: 0.00133063 [add_forward_monad_depend]: 8.07e-06 [auto_monad_grad]: 3.41999e-06 [auto_monad_eliminator]: 2.149e-05 [cse]: 3.934e-05 [a_3]: 6.879e-05 [Cycle 2]: 0.00084982, [45] [expand_dump_flag]: 1.97001e-06 [switch_simplify]: 1.164e-05 [loop_unroll]: 7.95e-06 [a_1]: 0.00019878 [with_stream_mark]: 1.81e-05 [recompute_prepare]: 8.40999e-06 [updatestate_depend_eliminate]: 5.15999e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 3.41999e-06 [parameter_eliminate]: 1.91e-06 [a_2]: 9.691e-05 [accelerated_algorithm]: 8.1e-06 [shard]: 1.71998e-06 [meta_shard_fg_expand]: 2.59999e-06 [shard_inline]: 7.8e-06 [merge_send_recv]: 8.29998e-06 [auto_parallel]: 9.41e-06 [parallel]: 8.69998e-06 [flash_sp]: 3.78001e-06 [merge_comm]: 4.54998e-06 [allreduce_fusion]: 4.03999e-06 [matmul_add_comm_reduction]: 1.044e-05 [allreduce_slice_to_reducescatter]: 4.30009e-07 [virtual_shard_identity]: 9.72001e-06 [virtual_dataset]: 7.26999e-06 [get_grad_eliminate_]: 7.87e-06 [virtual_output]: 7.16999e-06 [merge_forward]: 4.46002e-06 [cell_reuse_recompute_pass]: 2.79999e-06 [offload_activation]: 1.068e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.592e-05 [merge_recompute_call_nodes]: 1.24998e-06 [before_grad]: 1.274e-05 [set_forward_comm_id_for_comm_node_pass]: 5.13002e-06 [meta_fg_expand]: 3.21001e-06 [flash_sp_send_recv_attached]: 1.19e-06 [receive_attached]: 2.16e-06 [after_resolve]: 1.404e-05 [a_after_grad]: 1.103e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.69001e-06 [auto_monad_grad]: 2.36e-06 [auto_monad_eliminator]: 1.147e-05 [cse]: 2.555e-05 [a_3]: 4.83e-05 [py_interpret_to_execute_after_opt_a]: 1.947e-05 [slice_cell_reuse_recomputed_activation]: 2.53998e-06 [rewriter_after_opt_a]: 4.828e-05 [convert_after_rewriter]: 7.87e-06 [order_py_execute_after_rewriter]: 6.63003e-06 [mutable_eliminate]: 0.00078291 [opt_b]: 0.00027447, [1] [Cycle 1]: 0.00026626, [7] [b_1]: 0.00016637 [b_2]: 1.177e-05 [updatestate_depend_eliminate]: 9.62001e-06 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 3.8e-06 [renormalize]: 7.09988e-07 [cse]: 3.29e-05 [optimize_parallel_all_gather_comm]: 2.169e-05 [overlap_param_gather]: 2.54001e-06 [cconv]: 3.377e-05 [loop_unroll]: 0.00053181 [opt_after_cconv]: 0.00012593, [1] [Cycle 1]: 0.00011991, [7] [c_1]: 4.127e-05 [parameter_eliminate]: 4.21001e-06 [updatestate_depend_eliminate]: 6.89001e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 2.71999e-06 [cse]: 2.507e-05 [renormalize]: 6.59988e-07 [remove_dup_value]: 1.633e-05 [tuple_transform]: 0.00010373, [1] [Cycle 1]: 9.909e-05, [4] [d_1]: 5.792e-05 [none_parameter_eliminate]: 1.62999e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 1.951e-05 [partial_unused_args_eliminate]: 1.97001e-06 [add_recomputation]: 6.611e-05 [cse_after_recomputation]: 2.901e-05, [1] [Cycle 1]: 2.377e-05, [1] [cse]: 1.772e-05 [environ_conv]: 6.98998e-06 [swap_dp_allreduce_reducescatter]: 6.01e-06 [bias_add_comm_swap]: 2.99999e-06 [label_micro_interleaved_index]: 4.62e-06 [label_fine_grained_interleaved_index]: 2.63998e-06 [merge_cast_opt]: 1.44e-06 [slice_recompute_activation]: 2.32001e-06 [micro_interleaved_order_control]: 2.48e-06 [assign_add_opt]: 1.35001e-06 [ForceFp32Comm]: 8.50006e-07 [remove_cast_before_assign_add]: 1.07998e-06 [full_micro_interleaved_order_control]: 2.58e-06 [reorder_send_recv_between_fp_bp]: 2.82002e-06 [comm_op_add_attrs]: 1.02998e-06 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.22e-06 [interleave_parallel_branches]: 1.16997e-06 [overlap_opt_shard_in_pipeline]: 1.18001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.17001e-06 [control_data_broadcast_order]: 1.85e-05 [grouped_pairwise_exchange_alltoall]: 2.01e-06 [offloading_packed_experts]: 4.75999e-06 [overlap_recompute_and_grad_model_parallel]: 5.02e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.28998e-06 [overlap_grad_ring_attention]: 5.19e-06 [overlap_grad_flash_sp]: 2.442e-05 [begin_end_overlap_inline]: 7.30011e-07 [split_matmul_comm_elemetwise]: 2.66e-06 [split_layernorm_comm]: 2.12001e-06 [handle_group_info]: 1.10001e-06 [symbol_engine_optimizer]: 9.446e-05, [1] [Cycle 1]: 8.927e-05, [6] [build]: 4.17e-06 [elim_shapecalc]: 1.422e-05 [elim_not_effective]: 1.771e-05 [opt_reshape]: 9.05999e-06 [fold_const_symbol]: 1.32e-05 [renormalize]: 2.30008e-07 [detach_backward]: 2.75997e-06 [pipeline_parallel_scheduler]: 1.99e-06 [auto_monad_reorder]: 2.363e-05 [get_jit_bprop_graph]: 1.84e-06 [rewriter_after_jit_bprop_graph]: 5.17e-06 [opt_after_jit_grad]: 0.0005349 [validate]: 4.699e-05 Sums bootstrap : 0.000443s : 1.15% type_inference : 0.031205s : 80.88% event_method : 0.000130s : 0.34% auto_monad : 0.000117s : 0.30% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000037s : 0.10% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000054s : 0.14% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000042s : 0.11% optimize.rewriter_before_opt_a : 0.000131s : 0.34% optimize.opt_a.expand_dump_flag : 0.000007s : 0.02% optimize.opt_a.switch_simplify : 0.000156s : 0.41% optimize.opt_a.loop_unroll : 0.000051s : 0.13% optimize.opt_a.a_1 : 0.001162s : 3.01% optimize.opt_a.with_stream_mark : 0.000041s : 0.11% optimize.opt_a.recompute_prepare : 0.000022s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000205s : 0.53% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.05% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.01% optimize.opt_a.shard_inline : 0.000017s : 0.04% optimize.opt_a.merge_send_recv : 0.000019s : 0.05% optimize.opt_a.auto_parallel : 0.000018s : 0.05% optimize.opt_a.parallel : 0.000029s : 0.07% optimize.opt_a.flash_sp : 0.000013s : 0.03% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000008s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.05% optimize.opt_a.virtual_dataset : 0.000015s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.04% optimize.opt_a.virtual_output : 0.000015s : 0.04% optimize.opt_a.merge_forward : 0.000009s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000022s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.03% optimize.opt_a.meta_fg_expand : 0.000008s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000029s : 0.08% optimize.opt_a.a_after_grad : 0.000024s : 0.06% optimize.opt_a.renormalize : 0.001331s : 3.45% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.03% optimize.opt_a.auto_monad_grad : 0.000006s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.09% optimize.opt_a.cse : 0.000065s : 0.17% optimize.opt_a.a_3 : 0.000117s : 0.30% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000048s : 0.13% optimize.convert_after_rewriter : 0.000008s : 0.02% optimize.order_py_execute_after_rewriter : 0.000007s : 0.02% optimize.mutable_eliminate : 0.000783s : 2.03% optimize.opt_b.b_1 : 0.000166s : 0.43% optimize.opt_b.b_2 : 0.000012s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000033s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.06% optimize.overlap_param_gather : 0.000003s : 0.01% optimize.cconv : 0.000034s : 0.09% optimize.loop_unroll : 0.000532s : 1.38% optimize.opt_after_cconv.c_1 : 0.000041s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000025s : 0.06% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.04% optimize.tuple_transform.d_1 : 0.000058s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000020s : 0.05% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000066s : 0.17% optimize.cse_after_recomputation.cse : 0.000018s : 0.05% optimize.environ_conv : 0.000007s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000018s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000024s : 0.06% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.06% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000535s : 1.39% validate : 0.000047s : 0.12% Time group info: ------[substitution.] 0.000301 44 10.56% : 0.000032s : 3: substitution.cast_eliminate 0.85% : 0.000003s : 3: substitution.elim_not_effective 0.62% : 0.000002s : 3: substitution.fold_const_symbol 2.32% : 0.000007s : 5: substitution.graph_param_transform 70.55% : 0.000212s : 8: substitution.inline 1.86% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.15% : 0.000006s : 6: substitution.remove_not_recompute_node 2.29% : 0.000007s : 4: substitution.replace_old_param 4.02% : 0.000012s : 2: substitution.switch_simplify 4.79% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.031107 2 94.18% : 0.029298s : 1: type_inference.infer 5.82% : 0.001809s : 1: type_inference.specialize ------[replace.] 0.000141 14 41.01% : 0.000058s : 8: replace.inline 39.90% : 0.000056s : 2: replace.switch_simplify 19.10% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000230 14 89.98% : 0.000207s : 8: match.inline 4.67% : 0.000011s : 2: match.switch_simplify 5.35% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000293 1746 1.01% : 0.000003s : 19: predicate.accumulaten_eliminater 0.80% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 10: predicate.addn_check_dump 1.03% : 0.000003s : 19: predicate.addn_zero_filter 0.88% : 0.000003s : 19: predicate.adjust_all_reduce_mul_add 2.11% : 0.000006s : 29: predicate.arithmetic_simplify 1.12% : 0.000003s : 19: predicate.cast_eliminate 0.51% : 0.000001s : 10: predicate.check_bprop_eliminate 0.50% : 0.000001s : 10: predicate.compare_switch_simplify 0.15% : 0.000000s : 5: predicate.const_output_eliminate 0.51% : 0.000002s : 10: predicate.depend_value_elim 1.09% : 0.000003s : 19: predicate.dict_get_item_const_eliminator 1.15% : 0.000003s : 19: predicate.dict_get_item_eliminator 0.89% : 0.000003s : 19: predicate.dict_set_item_eliminator 0.76% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.49% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.29% : 0.000004s : 24: predicate.environ_add_const_eliminate 1.15% : 0.000003s : 24: predicate.environ_get_add_eliminate 1.24% : 0.000004s : 24: predicate.environ_get_depend_swap 1.64% : 0.000005s : 34: predicate.environ_get_eliminate 1.15% : 0.000003s : 24: predicate.environ_get_set_eliminate 1.58% : 0.000005s : 31: predicate.exchange_switch_depend_value 2.38% : 0.000007s : 31: predicate.float_depend_g_call 0.49% : 0.000001s : 10: predicate.float_environ_get_switch 0.75% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 5: predicate.fold_const_symbol 0.61% : 0.000002s : 10: predicate.get_grad_eliminate 0.18% : 0.000001s : 5: predicate.graph_param_transform 0.55% : 0.000002s : 10: predicate.incorporate_call 0.45% : 0.000001s : 10: predicate.incorporate_call_switch 6.67% : 0.000020s : 80: predicate.inline 0.68% : 0.000002s : 10: predicate.inline_without_move 0.28% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.63% : 0.000002s : 10: predicate.less_batch_normalization 1.84% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.40% : 0.000007s : 52: predicate.load_eliminater 0.88% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.50% : 0.000007s : 49: predicate.loop_unroll_before_grad 1.57% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.53% : 0.000002s : 10: predicate.merge_addn 0.56% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.49% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.89% : 0.000003s : 19: predicate.minmaximum_grad 1.15% : 0.000003s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.29% : 0.000001s : 5: predicate.parallel_virtual_node 2.01% : 0.000006s : 31: predicate.partial_defer_inline 1.54% : 0.000005s : 28: predicate.partial_eliminate 0.94% : 0.000003s : 19: predicate.print_const_string_wrapper 0.53% : 0.000002s : 10: predicate.reduce_all_const_elim 1.37% : 0.000004s : 19: predicate.reduce_eliminate 2.60% : 0.000008s : 52: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 10: predicate.remove_not_recompute_node 1.36% : 0.000004s : 33: predicate.replace_applicator 0.43% : 0.000001s : 10: predicate.replace_old_param 0.30% : 0.000001s : 5: predicate.reset_defer_inline 1.10% : 0.000003s : 19: predicate.reshape_eliminate 0.50% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 5: predicate.row_tensor_eliminate 0.68% : 0.000002s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.70% : 0.000002s : 10: predicate.shard_identity_eliminate 0.58% : 0.000002s : 10: predicate.special_op_eliminate 0.79% : 0.000002s : 10: predicate.specialize_transform 0.72% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.64% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.61% : 0.000005s : 31: predicate.switch_defer_inline 2.13% : 0.000006s : 41: predicate.switch_layer_defer_inline 5.73% : 0.000017s : 99: predicate.switch_simplify 1.06% : 0.000003s : 19: predicate.tile_eliminate 0.98% : 0.000003s : 19: predicate.transpose_eliminate 1.56% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.05% : 0.000009s : 43: predicate.tuple_list_get_item_eliminator 1.56% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 2.39% : 0.000007s : 39: predicate.tuple_list_set_item_eliminator 1.66% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.42% : 0.000007s : 52: predicate.updatestate_pure_node_eliminater 3.07% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.30% : 0.000001s : 5: predicate.value_based_eliminate 0.56% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.59% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001176 16 50.46% : 0.000593s : 6: func_graph_cloner_run.FuncGraphClonerGraph 49.54% : 0.000582s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.058142 192 0.01% : 0.000003s : 1: ForceFp32Comm 7.07% : 0.004110s : 1: add_attr 7.04% : 0.004094s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.12% : 0.000070s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.22% : 0.000127s : 1: auto_monad 0.05% : 0.000028s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.82% : 0.000474s : 1: bootstrap 0.06% : 0.000037s : 1: cconv 0.01% : 0.000005s : 1: comm_op_add_attrs 0.04% : 0.000022s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.06% : 0.000033s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000010s : 1: environ_conv 0.25% : 0.000143s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.93% : 0.000541s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.37% : 0.000794s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.03% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000024s : 1: opt.transform.mutable_eliminate 3.14% : 0.001823s : 78: opt.transform.opt_a 0.07% : 0.000040s : 1: opt.transform.opt_after_cconv 0.06% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.25% : 0.000144s : 28: opt.transform.opt_b 0.13% : 0.000075s : 2: opt.transform.opt_trans_graph 0.08% : 0.000049s : 4: opt.transform.symbol_engine_opt 7.17% : 0.004170s : 1: opt_a 0.22% : 0.000129s : 1: opt_after_cconv 0.94% : 0.000546s : 1: opt_after_jit_grad 0.48% : 0.000279s : 1: opt_b 11.88% : 0.006909s : 1: optimize 0.04% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000010s : 1: order_py_execute_after_rewriter 0.05% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.10% : 0.000059s : 1: pre_auto_parallel 0.08% : 0.000047s : 1: py_interpret_to_execute 0.04% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000021s : 1: remove_dup_value 1.24% : 0.000720s : 1: renormalize.infer 1.03% : 0.000599s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000052s : 1: rewriter_after_opt_a 0.23% : 0.000136s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.17% : 0.000097s : 1: symbol_engine_optimizer 0.19% : 0.000108s : 1: tuple_transform 53.72% : 0.031236s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:06.903.403 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:06.903.670 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0406551, [21] [bootstrap]: 0.00042724 [type_inference]: 0.0127025 [event_method]: 5.074e-05 [auto_monad]: 9.751e-05 [graph_reusing]: 7.34002e-06 [inline]: 2.61999e-06 [add_attr]: 0.00336805, [1] [add_attr_with_inline]: 0.00335699, [1] [Cycle 1]: 9.808e-05, [2] [tag_attr]: 3.122e-05 [meta_addattr_fg_expand]: 9.04e-06 [parallel-infer-symbol]: 3.17997e-06 [pre_auto_parallel]: 4.952e-05 [insert-virtual-dataset]: 2.79001e-06 [parallel-infer-symbol-second]: 8.50006e-07 [dataset_repeat_opt]: 2.26e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.0224087, [53] [py_interpret_to_execute]: 4.704e-05 [rewriter_before_opt_a]: 0.00013197 [opt_a]: 0.00380949, [2] [Cycle 1]: 0.00295453, [45] [expand_dump_flag]: 5.39998e-06 [switch_simplify]: 0.0001382 [loop_unroll]: 4.534e-05 [a_1]: 0.00089589 [with_stream_mark]: 1.8e-05 [recompute_prepare]: 9.97999e-06 [updatestate_depend_eliminate]: 4.25e-06 [updatestate_assign_eliminate]: 3.47997e-06 [updatestate_loads_eliminate]: 3.02002e-06 [parameter_eliminate]: 2.21998e-06 [a_2]: 0.00011469 [accelerated_algorithm]: 7.45e-06 [shard]: 1.87001e-06 [meta_shard_fg_expand]: 2.34001e-06 [shard_inline]: 6.56999e-06 [merge_send_recv]: 8.17e-06 [auto_parallel]: 7.82e-06 [parallel]: 1.872e-05 [flash_sp]: 8.55001e-06 [merge_comm]: 4.41002e-06 [allreduce_fusion]: 3.65e-06 [matmul_add_comm_reduction]: 9.63997e-06 [allreduce_slice_to_reducescatter]: 6.79982e-07 [virtual_shard_identity]: 8.40001e-06 [virtual_dataset]: 6.84999e-06 [get_grad_eliminate_]: 6.63998e-06 [virtual_output]: 6.59999e-06 [merge_forward]: 3.66999e-06 [cell_reuse_recompute_pass]: 1.94e-06 [offload_activation]: 1.126e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.502e-05 [merge_recompute_call_nodes]: 1.50001e-06 [before_grad]: 1.169e-05 [set_forward_comm_id_for_comm_node_pass]: 3.75998e-06 [meta_fg_expand]: 3.75e-06 [flash_sp_send_recv_attached]: 2.67001e-06 [receive_attached]: 2.54001e-06 [after_resolve]: 1.244e-05 [a_after_grad]: 1.007e-05 [renormalize]: 0.00099774 [add_forward_monad_depend]: 6.39999e-06 [auto_monad_grad]: 2.64999e-06 [auto_monad_eliminator]: 1.697e-05 [cse]: 2.774e-05 [a_3]: 6.6e-05 [Cycle 2]: 0.00084028, [45] [expand_dump_flag]: 1.32e-06 [switch_simplify]: 8.88002e-06 [loop_unroll]: 6.66e-06 [a_1]: 0.00013795 [with_stream_mark]: 1.261e-05 [recompute_prepare]: 6.41998e-06 [updatestate_depend_eliminate]: 2.82002e-06 [updatestate_assign_eliminate]: 2.36e-06 [updatestate_loads_eliminate]: 2.57001e-06 [parameter_eliminate]: 1.22999e-06 [a_2]: 0.00010354 [accelerated_algorithm]: 6.73e-06 [shard]: 1.64998e-06 [meta_shard_fg_expand]: 1.62999e-06 [shard_inline]: 6.78e-06 [merge_send_recv]: 5.24e-06 [auto_parallel]: 5.98998e-06 [parallel]: 5.53002e-06 [flash_sp]: 3.75e-06 [merge_comm]: 3.2e-06 [allreduce_fusion]: 3.35998e-06 [matmul_add_comm_reduction]: 7.29001e-06 [allreduce_slice_to_reducescatter]: 5.8001e-07 [virtual_shard_identity]: 7.26999e-06 [virtual_dataset]: 6.58e-06 [get_grad_eliminate_]: 6.24001e-06 [virtual_output]: 5.87999e-06 [merge_forward]: 2.86e-06 [cell_reuse_recompute_pass]: 1.45999e-06 [offload_activation]: 6.99001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.525e-05 [merge_recompute_call_nodes]: 9.79984e-07 [before_grad]: 1.025e-05 [set_forward_comm_id_for_comm_node_pass]: 3.76999e-06 [meta_fg_expand]: 2.34001e-06 [flash_sp_send_recv_attached]: 9.09989e-07 [receive_attached]: 1.10999e-06 [after_resolve]: 1.203e-05 [a_after_grad]: 9.71e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.47001e-06 [auto_monad_grad]: 1.17999e-06 [auto_monad_eliminator]: 6.96001e-06 [cse]: 1.353e-05 [a_3]: 5.166e-05 [py_interpret_to_execute_after_opt_a]: 1.34e-05 [slice_cell_reuse_recomputed_activation]: 5.12e-06 [rewriter_after_opt_a]: 4.073e-05 [convert_after_rewriter]: 1.004e-05 [order_py_execute_after_rewriter]: 8.33001e-06 [mutable_eliminate]: 0.00061988 [opt_b]: 0.00028064, [1] [Cycle 1]: 0.00027136, [7] [b_1]: 0.00017518 [b_2]: 8.23001e-06 [updatestate_depend_eliminate]: 6.30002e-06 [updatestate_assign_eliminate]: 2.47001e-06 [updatestate_loads_eliminate]: 2.37999e-06 [renormalize]: 6.39993e-07 [cse]: 1.873e-05 [optimize_parallel_all_gather_comm]: 1.974e-05 [overlap_param_gather]: 4.90999e-06 [cconv]: 3.007e-05 [loop_unroll]: 0.00045876 [opt_after_cconv]: 0.00012685, [1] [Cycle 1]: 0.00011831, [7] [c_1]: 3.341e-05 [parameter_eliminate]: 2.81999e-06 [updatestate_depend_eliminate]: 5.22999e-06 [updatestate_assign_eliminate]: 2.52001e-06 [updatestate_loads_eliminate]: 2.66999e-06 [cse]: 1.577e-05 [renormalize]: 4.40021e-07 [remove_dup_value]: 1.678e-05 [tuple_transform]: 9.602e-05, [1] [Cycle 1]: 8.807e-05, [4] [d_1]: 4.801e-05 [none_parameter_eliminate]: 1.54e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 7.25003e-06 [partial_unused_args_eliminate]: 4.41002e-06 [add_recomputation]: 5.011e-05 [cse_after_recomputation]: 2.681e-05, [1] [Cycle 1]: 1.99e-05, [1] [cse]: 1.06e-05 [environ_conv]: 7.54002e-06 [swap_dp_allreduce_reducescatter]: 7.30003e-06 [bias_add_comm_swap]: 4.62e-06 [label_micro_interleaved_index]: 7.26001e-06 [label_fine_grained_interleaved_index]: 6.01e-06 [merge_cast_opt]: 3.76999e-06 [slice_recompute_activation]: 4.52998e-06 [micro_interleaved_order_control]: 4.75001e-06 [assign_add_opt]: 3.38e-06 [ForceFp32Comm]: 2.041e-05 [remove_cast_before_assign_add]: 3.88999e-06 [full_micro_interleaved_order_control]: 5.12999e-06 [reorder_send_recv_between_fp_bp]: 5.15999e-06 [comm_op_add_attrs]: 3.47002e-06 [add_comm_op_reuse_tag]: 3.45e-06 [interleave_split_concat_branches]: 3.61999e-06 [interleave_parallel_branches]: 3.48e-06 [overlap_opt_shard_in_pipeline]: 3.46001e-06 [overlap_opt_shard_grad_in_pipeline]: 3.98001e-06 [control_data_broadcast_order]: 1.588e-05 [grouped_pairwise_exchange_alltoall]: 3.92998e-06 [offloading_packed_experts]: 6.11e-06 [overlap_recompute_and_grad_model_parallel]: 7.49002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.70998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.77998e-06 [overlap_recompute_comm]: 4.72e-06 [overlap_grad_ring_attention]: 3.751e-05 [overlap_grad_flash_sp]: 2.216e-05 [begin_end_overlap_inline]: 3.14001e-06 [split_matmul_comm_elemetwise]: 0.0159844 [split_layernorm_comm]: 5.67999e-06 [handle_group_info]: 4.03999e-06 [symbol_engine_optimizer]: 0.00015318, [1] [Cycle 1]: 0.00013988, [6] [build]: 7.51999e-06 [elim_shapecalc]: 2.843e-05 [elim_not_effective]: 2.373e-05 [opt_reshape]: 9.09e-06 [fold_const_symbol]: 1.209e-05 [renormalize]: 8.80013e-07 [detach_backward]: 3.66999e-06 [pipeline_parallel_scheduler]: 1.99999e-06 [auto_monad_reorder]: 2.608e-05 [get_jit_bprop_graph]: 1.86e-06 [rewriter_after_jit_bprop_graph]: 6.01e-06 [opt_after_jit_grad]: 0.00076677 [validate]: 4.33e-05 Sums bootstrap : 0.000427s : 1.21% type_inference : 0.012702s : 35.90% event_method : 0.000051s : 0.14% auto_monad : 0.000098s : 0.28% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000031s : 0.09% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.03% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000050s : 0.14% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000047s : 0.13% optimize.rewriter_before_opt_a : 0.000132s : 0.37% optimize.opt_a.expand_dump_flag : 0.000007s : 0.02% optimize.opt_a.switch_simplify : 0.000147s : 0.42% optimize.opt_a.loop_unroll : 0.000052s : 0.15% optimize.opt_a.a_1 : 0.001034s : 2.92% optimize.opt_a.with_stream_mark : 0.000031s : 0.09% optimize.opt_a.recompute_prepare : 0.000016s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000218s : 0.62% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.04% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.04% optimize.opt_a.merge_send_recv : 0.000013s : 0.04% optimize.opt_a.auto_parallel : 0.000014s : 0.04% optimize.opt_a.parallel : 0.000024s : 0.07% optimize.opt_a.flash_sp : 0.000012s : 0.03% optimize.opt_a.merge_comm : 0.000008s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.04% optimize.opt_a.virtual_dataset : 0.000013s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.04% optimize.opt_a.virtual_output : 0.000012s : 0.04% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000018s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000022s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000024s : 0.07% optimize.opt_a.a_after_grad : 0.000020s : 0.06% optimize.opt_a.renormalize : 0.000998s : 2.82% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.07% optimize.opt_a.cse : 0.000041s : 0.12% optimize.opt_a.a_3 : 0.000118s : 0.33% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000041s : 0.12% optimize.convert_after_rewriter : 0.000010s : 0.03% optimize.order_py_execute_after_rewriter : 0.000008s : 0.02% optimize.mutable_eliminate : 0.000620s : 1.75% optimize.opt_b.b_1 : 0.000175s : 0.50% optimize.opt_b.b_2 : 0.000008s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000019s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.06% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000030s : 0.08% optimize.loop_unroll : 0.000459s : 1.30% optimize.opt_after_cconv.c_1 : 0.000033s : 0.09% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000016s : 0.04% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.05% optimize.tuple_transform.d_1 : 0.000048s : 0.14% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000004s : 0.01% optimize.add_recomputation : 0.000050s : 0.14% optimize.cse_after_recomputation.cse : 0.000011s : 0.03% optimize.environ_conv : 0.000008s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.02% optimize.bias_add_comm_swap : 0.000005s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000003s : 0.01% optimize.ForceFp32Comm : 0.000020s : 0.06% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000006s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000038s : 0.11% optimize.overlap_grad_flash_sp : 0.000022s : 0.06% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.015984s : 45.17% optimize.split_layernorm_comm : 0.000006s : 0.02% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000008s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000028s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000024s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000026s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000767s : 2.17% validate : 0.000043s : 0.12% Time group info: ------[substitution.] 0.000241 34 1.16% : 0.000003s : 2: substitution.elim_not_effective 0.63% : 0.000002s : 2: substitution.fold_const_symbol 3.00% : 0.000007s : 4: substitution.graph_param_transform 77.12% : 0.000186s : 8: substitution.inline 1.53% : 0.000004s : 4: substitution.j_node_and_user_rematch 1.91% : 0.000005s : 4: substitution.remove_not_recompute_node 2.23% : 0.000005s : 4: substitution.replace_old_param 5.07% : 0.000012s : 2: substitution.switch_simplify 7.35% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.012640 2 89.56% : 0.011320s : 1: type_inference.infer 10.44% : 0.001320s : 1: type_inference.specialize ------[replace.] 0.000135 14 42.45% : 0.000057s : 8: replace.inline 39.40% : 0.000053s : 2: replace.switch_simplify 18.15% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000208 14 87.43% : 0.000182s : 8: match.inline 5.21% : 0.000011s : 2: match.switch_simplify 7.36% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000255 1520 0.95% : 0.000002s : 17: predicate.accumulaten_eliminater 0.77% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 8: predicate.addn_check_dump 1.07% : 0.000003s : 17: predicate.addn_zero_filter 0.87% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.15% : 0.000005s : 25: predicate.arithmetic_simplify 1.06% : 0.000003s : 17: predicate.cast_eliminate 0.53% : 0.000001s : 8: predicate.check_bprop_eliminate 0.43% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000002s : 8: predicate.depend_value_elim 1.00% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.07% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.98% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.85% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.37% : 0.000001s : 4: predicate.elim_not_effective 1.02% : 0.000003s : 4: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.25% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.16% : 0.000003s : 21: predicate.environ_get_depend_swap 1.66% : 0.000004s : 29: predicate.environ_get_eliminate 1.32% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.71% : 0.000004s : 29: predicate.exchange_switch_depend_value 2.80% : 0.000007s : 29: predicate.float_depend_g_call 0.42% : 0.000001s : 8: predicate.float_environ_get_switch 0.65% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.56% : 0.000001s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.47% : 0.000001s : 8: predicate.incorporate_call 0.40% : 0.000001s : 8: predicate.incorporate_call_switch 6.27% : 0.000016s : 70: predicate.inline 0.69% : 0.000002s : 8: predicate.inline_without_move 0.25% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.67% : 0.000002s : 8: predicate.less_batch_normalization 1.84% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.46% : 0.000006s : 46: predicate.load_eliminater 0.69% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.04% : 0.000008s : 47: predicate.loop_unroll_before_grad 1.59% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.44% : 0.000001s : 8: predicate.merge_addn 0.53% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.48% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 17: predicate.minmaximum_grad 0.89% : 0.000002s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.31% : 0.000001s : 4: predicate.parallel_virtual_node 2.19% : 0.000006s : 29: predicate.partial_defer_inline 1.59% : 0.000004s : 25: predicate.partial_eliminate 1.13% : 0.000003s : 17: predicate.print_const_string_wrapper 0.51% : 0.000001s : 8: predicate.reduce_all_const_elim 1.32% : 0.000003s : 17: predicate.reduce_eliminate 2.52% : 0.000006s : 46: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000001s : 8: predicate.remove_not_recompute_node 1.28% : 0.000003s : 29: predicate.replace_applicator 0.43% : 0.000001s : 8: predicate.replace_old_param 0.20% : 0.000000s : 4: predicate.reset_defer_inline 1.10% : 0.000003s : 17: predicate.reshape_eliminate 0.53% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.27% : 0.000001s : 4: predicate.row_tensor_eliminate 0.71% : 0.000002s : 8: predicate.same_eliminate 0.36% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.56% : 0.000001s : 8: predicate.shard_identity_eliminate 0.61% : 0.000002s : 8: predicate.special_op_eliminate 0.55% : 0.000001s : 8: predicate.specialize_transform 0.67% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.56% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.27% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.78% : 0.000005s : 29: predicate.switch_defer_inline 2.39% : 0.000006s : 37: predicate.switch_layer_defer_inline 5.94% : 0.000015s : 92: predicate.switch_simplify 1.05% : 0.000003s : 17: predicate.tile_eliminate 1.00% : 0.000003s : 17: predicate.transpose_eliminate 1.56% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 2.95% : 0.000008s : 37: predicate.tuple_list_get_item_eliminator 1.48% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.29% : 0.000006s : 33: predicate.tuple_list_set_item_eliminator 1.68% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.49% : 0.000006s : 46: predicate.updatestate_pure_node_eliminater 2.99% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.30% : 0.000001s : 4: predicate.value_based_eliminate 0.55% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.49% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.33% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000935 16 55.82% : 0.000522s : 6: func_graph_cloner_run.FuncGraphClonerGraph 44.18% : 0.000413s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.069182 192 0.03% : 0.000023s : 1: ForceFp32Comm 4.88% : 0.003379s : 1: add_attr 4.86% : 0.003361s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.08% : 0.000054s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.16% : 0.000108s : 1: auto_monad 0.05% : 0.000034s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.68% : 0.000470s : 1: bootstrap 0.05% : 0.000033s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.03% : 0.000019s : 1: control_data_broadcast_order 0.02% : 0.000013s : 1: convert_after_rewriter 0.04% : 0.000030s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000022s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.09% : 0.000062s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000014s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.67% : 0.000465s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 0.91% : 0.000627s : 1: mutable_eliminate 0.01% : 0.000009s : 1: offloading_packed_experts 0.02% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000016s : 1: opt.transform.mutable_eliminate 2.29% : 0.001588s : 78: opt.transform.opt_a 0.05% : 0.000032s : 1: opt.transform.opt_after_cconv 0.04% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.16% : 0.000109s : 28: opt.transform.opt_b 0.08% : 0.000053s : 2: opt.transform.opt_trans_graph 0.09% : 0.000065s : 4: opt.transform.symbol_engine_opt 5.51% : 0.003813s : 1: opt_a 0.19% : 0.000131s : 1: opt_after_cconv 1.12% : 0.000778s : 1: opt_after_jit_grad 0.41% : 0.000285s : 1: opt_b 33.00% : 0.022830s : 1: optimize 0.03% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000011s : 1: order_py_execute_after_rewriter 0.04% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.06% : 0.000042s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000008s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.08% : 0.000058s : 1: pre_auto_parallel 0.07% : 0.000051s : 1: py_interpret_to_execute 0.02% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.03% : 0.000020s : 1: remove_dup_value 0.76% : 0.000529s : 1: renormalize.infer 0.66% : 0.000459s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000045s : 1: rewriter_after_opt_a 0.20% : 0.000137s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000011s : 1: split_layernorm_comm 23.13% : 0.016004s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.23% : 0.000156s : 1: symbol_engine_optimizer 0.14% : 0.000099s : 1: tuple_transform 18.41% : 0.012740s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:07.458.760 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0226151, [21] [bootstrap]: 0.00042147 [type_inference]: 0.0130451 [event_method]: 4.814e-05 [auto_monad]: 9.409e-05 [graph_reusing]: 7.15e-06 [inline]: 2.08998e-06 [add_attr]: 0.00308255, [1] [add_attr_with_inline]: 0.00307357, [1] [Cycle 1]: 6.979e-05, [2] [tag_attr]: 2.753e-05 [meta_addattr_fg_expand]: 7.97998e-06 [parallel-infer-symbol]: 3.5e-06 [pre_auto_parallel]: 3.963e-05 [insert-virtual-dataset]: 3.15998e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.16e-06 [pipeline_split]: 2.01e-06 [optimize]: 0.00518368, [53] [py_interpret_to_execute]: 2.981e-05 [rewriter_before_opt_a]: 0.00010972 [opt_a]: 0.00313544, [2] [Cycle 1]: 0.002484, [45] [expand_dump_flag]: 4.17e-06 [switch_simplify]: 0.00012144 [loop_unroll]: 4.198e-05 [a_1]: 0.00088841 [with_stream_mark]: 1.601e-05 [recompute_prepare]: 9.46e-06 [updatestate_depend_eliminate]: 4.17e-06 [updatestate_assign_eliminate]: 3.10002e-06 [updatestate_loads_eliminate]: 3.03e-06 [parameter_eliminate]: 1.97999e-06 [a_2]: 8.268e-05 [accelerated_algorithm]: 8.18999e-06 [shard]: 2.01e-06 [meta_shard_fg_expand]: 2.14999e-06 [shard_inline]: 7.11001e-06 [merge_send_recv]: 8.68001e-06 [auto_parallel]: 6.41e-06 [parallel]: 1.808e-05 [flash_sp]: 8.22e-06 [merge_comm]: 3.91001e-06 [allreduce_fusion]: 3.75998e-06 [matmul_add_comm_reduction]: 9.32001e-06 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 9.23002e-06 [virtual_dataset]: 6.59001e-06 [get_grad_eliminate_]: 6.39001e-06 [virtual_output]: 6.51e-06 [merge_forward]: 3.94002e-06 [cell_reuse_recompute_pass]: 1.39998e-06 [offload_activation]: 9.87001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.289e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 1.054e-05 [set_forward_comm_id_for_comm_node_pass]: 3.56001e-06 [meta_fg_expand]: 3.23e-06 [flash_sp_send_recv_attached]: 2.56e-06 [receive_attached]: 2.58e-06 [after_resolve]: 1.175e-05 [a_after_grad]: 1.039e-05 [renormalize]: 0.0007747 [add_forward_monad_depend]: 5.52001e-06 [auto_monad_grad]: 2.13002e-06 [auto_monad_eliminator]: 1.529e-05 [cse]: 2.785e-05 [a_3]: 4.869e-05 [Cycle 2]: 0.00064153, [45] [expand_dump_flag]: 1.14e-06 [switch_simplify]: 7.77e-06 [loop_unroll]: 6.39001e-06 [a_1]: 0.00013448 [with_stream_mark]: 1.091e-05 [recompute_prepare]: 6.53e-06 [updatestate_depend_eliminate]: 3.17002e-06 [updatestate_assign_eliminate]: 2.29999e-06 [updatestate_loads_eliminate]: 2.66999e-06 [parameter_eliminate]: 1.00999e-06 [a_2]: 7.337e-05 [accelerated_algorithm]: 6.31998e-06 [shard]: 1.12999e-06 [meta_shard_fg_expand]: 1.34e-06 [shard_inline]: 6.55997e-06 [merge_send_recv]: 4.80001e-06 [auto_parallel]: 5.15999e-06 [parallel]: 5.29998e-06 [flash_sp]: 3.65998e-06 [merge_comm]: 2.99001e-06 [allreduce_fusion]: 3.25998e-06 [matmul_add_comm_reduction]: 6.19001e-06 [allreduce_slice_to_reducescatter]: 3.60014e-07 [virtual_shard_identity]: 6.76e-06 [virtual_dataset]: 5.86998e-06 [get_grad_eliminate_]: 5.86e-06 [virtual_output]: 5.62001e-06 [merge_forward]: 2.71999e-06 [cell_reuse_recompute_pass]: 1.56002e-06 [offload_activation]: 6.52001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.291e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 9.64e-06 [set_forward_comm_id_for_comm_node_pass]: 3.76001e-06 [meta_fg_expand]: 2.06e-06 [flash_sp_send_recv_attached]: 8.70001e-07 [receive_attached]: 9.20001e-07 [after_resolve]: 1.069e-05 [a_after_grad]: 9.74e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.18001e-06 [auto_monad_grad]: 9.70002e-07 [auto_monad_eliminator]: 6.68e-06 [cse]: 1.218e-05 [a_3]: 3.718e-05 [py_interpret_to_execute_after_opt_a]: 9.30001e-06 [slice_cell_reuse_recomputed_activation]: 2.11e-06 [rewriter_after_opt_a]: 3.362e-05 [convert_after_rewriter]: 6.59001e-06 [order_py_execute_after_rewriter]: 5.17999e-06 [mutable_eliminate]: 0.00049086 [opt_b]: 0.00020396, [1] [Cycle 1]: 0.00019779, [7] [b_1]: 0.00012672 [b_2]: 7.80998e-06 [updatestate_depend_eliminate]: 5.22e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 3.03e-06 [renormalize]: 4.39992e-07 [cse]: 1.686e-05 [optimize_parallel_all_gather_comm]: 1.683e-05 [overlap_param_gather]: 1.86998e-06 [cconv]: 2.278e-05 [loop_unroll]: 0.00045193 [opt_after_cconv]: 0.00010308, [1] [Cycle 1]: 9.71e-05, [7] [c_1]: 3.226e-05 [parameter_eliminate]: 3.01999e-06 [updatestate_depend_eliminate]: 5.44e-06 [updatestate_assign_eliminate]: 2.39001e-06 [updatestate_loads_eliminate]: 2.19001e-06 [cse]: 1.712e-05 [renormalize]: 4.7998e-07 [remove_dup_value]: 1.265e-05 [tuple_transform]: 7.584e-05, [1] [Cycle 1]: 7.143e-05, [4] [d_1]: 4.468e-05 [none_parameter_eliminate]: 1.69e-06 [renormalize]: 1.39989e-07 [switch_simplify]: 6.99001e-06 [partial_unused_args_eliminate]: 1.73997e-06 [add_recomputation]: 4.509e-05 [cse_after_recomputation]: 2.026e-05, [1] [Cycle 1]: 1.621e-05, [1] [cse]: 1.081e-05 [environ_conv]: 5.43002e-06 [swap_dp_allreduce_reducescatter]: 5.02e-06 [bias_add_comm_swap]: 2.79001e-06 [label_micro_interleaved_index]: 4.66002e-06 [label_fine_grained_interleaved_index]: 2.74999e-06 [merge_cast_opt]: 1.54e-06 [slice_recompute_activation]: 2.14999e-06 [micro_interleaved_order_control]: 2.39999e-06 [assign_add_opt]: 1.45001e-06 [ForceFp32Comm]: 1.07e-06 [remove_cast_before_assign_add]: 1.14998e-06 [full_micro_interleaved_order_control]: 1.98002e-06 [reorder_send_recv_between_fp_bp]: 2.71e-06 [comm_op_add_attrs]: 9.29984e-07 [add_comm_op_reuse_tag]: 9.20001e-07 [interleave_split_concat_branches]: 1.20001e-06 [interleave_parallel_branches]: 1.00999e-06 [overlap_opt_shard_in_pipeline]: 1.15999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.02999e-06 [control_data_broadcast_order]: 1.351e-05 [grouped_pairwise_exchange_alltoall]: 1.54998e-06 [offloading_packed_experts]: 3.71999e-06 [overlap_recompute_and_grad_model_parallel]: 4.72e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.47999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.25002e-06 [overlap_grad_ring_attention]: 4.06001e-06 [overlap_grad_flash_sp]: 1.903e-05 [begin_end_overlap_inline]: 6.09987e-07 [split_matmul_comm_elemetwise]: 2.22999e-06 [split_layernorm_comm]: 1.59998e-06 [handle_group_info]: 9.39996e-07 [symbol_engine_optimizer]: 7.668e-05, [1] [Cycle 1]: 7.219e-05, [6] [build]: 2.98998e-06 [elim_shapecalc]: 1.012e-05 [elim_not_effective]: 1.328e-05 [opt_reshape]: 7.08e-06 [fold_const_symbol]: 1.023e-05 [renormalize]: 2.19996e-07 [detach_backward]: 1.91e-06 [pipeline_parallel_scheduler]: 1.74e-06 [auto_monad_reorder]: 1.615e-05 [get_jit_bprop_graph]: 1.79998e-06 [rewriter_after_jit_bprop_graph]: 3.5e-06 [opt_after_jit_grad]: 0.00046483 [validate]: 3.637e-05 Sums bootstrap : 0.000421s : 2.27% type_inference : 0.013045s : 70.22% event_method : 0.000048s : 0.26% auto_monad : 0.000094s : 0.51% graph_reusing : 0.000007s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.04% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000040s : 0.21% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.16% optimize.rewriter_before_opt_a : 0.000110s : 0.59% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000129s : 0.70% optimize.opt_a.loop_unroll : 0.000048s : 0.26% optimize.opt_a.a_1 : 0.001023s : 5.51% optimize.opt_a.with_stream_mark : 0.000027s : 0.14% optimize.opt_a.recompute_prepare : 0.000016s : 0.09% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000156s : 0.84% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.08% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.02% optimize.opt_a.shard_inline : 0.000014s : 0.07% optimize.opt_a.merge_send_recv : 0.000013s : 0.07% optimize.opt_a.auto_parallel : 0.000012s : 0.06% optimize.opt_a.parallel : 0.000023s : 0.13% optimize.opt_a.flash_sp : 0.000012s : 0.06% optimize.opt_a.merge_comm : 0.000007s : 0.04% optimize.opt_a.allreduce_fusion : 0.000007s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.09% optimize.opt_a.virtual_dataset : 0.000012s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.07% optimize.opt_a.virtual_output : 0.000012s : 0.07% optimize.opt_a.merge_forward : 0.000007s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000016s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.14% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000020s : 0.11% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.04% optimize.opt_a.meta_fg_expand : 0.000005s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.02% optimize.opt_a.after_resolve : 0.000022s : 0.12% optimize.opt_a.a_after_grad : 0.000020s : 0.11% optimize.opt_a.renormalize : 0.000775s : 4.17% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.04% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.12% optimize.opt_a.cse : 0.000040s : 0.22% optimize.opt_a.a_3 : 0.000086s : 0.46% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000034s : 0.18% optimize.convert_after_rewriter : 0.000007s : 0.04% optimize.order_py_execute_after_rewriter : 0.000005s : 0.03% optimize.mutable_eliminate : 0.000491s : 2.64% optimize.opt_b.b_1 : 0.000127s : 0.68% optimize.opt_b.b_2 : 0.000008s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000017s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.09% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000023s : 0.12% optimize.loop_unroll : 0.000452s : 2.43% optimize.opt_after_cconv.c_1 : 0.000032s : 0.17% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000017s : 0.09% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.07% optimize.tuple_transform.d_1 : 0.000045s : 0.24% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.04% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000045s : 0.24% optimize.cse_after_recomputation.cse : 0.000011s : 0.06% optimize.environ_conv : 0.000005s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.03% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000019s : 0.10% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.06% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000016s : 0.09% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.02% opt_after_jit_grad : 0.000465s : 2.50% validate : 0.000036s : 0.20% Time group info: ------[substitution.] 0.000247 34 0.91% : 0.000002s : 2: substitution.elim_not_effective 0.54% : 0.000001s : 2: substitution.fold_const_symbol 2.45% : 0.000006s : 4: substitution.graph_param_transform 80.62% : 0.000199s : 8: substitution.inline 1.41% : 0.000003s : 4: substitution.j_node_and_user_rematch 1.80% : 0.000004s : 4: substitution.remove_not_recompute_node 1.82% : 0.000004s : 4: substitution.replace_old_param 3.89% : 0.000010s : 2: substitution.switch_simplify 6.56% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.012975 2 89.96% : 0.011672s : 1: type_inference.infer 10.04% : 0.001303s : 1: type_inference.specialize ------[replace.] 0.000119 14 46.10% : 0.000055s : 8: replace.inline 35.33% : 0.000042s : 2: replace.switch_simplify 18.57% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000217 14 89.62% : 0.000195s : 8: match.inline 3.86% : 0.000008s : 2: match.switch_simplify 6.51% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000241 1520 1.03% : 0.000002s : 17: predicate.accumulaten_eliminater 0.71% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.42% : 0.000001s : 8: predicate.addn_check_dump 1.05% : 0.000003s : 17: predicate.addn_zero_filter 0.91% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.07% : 0.000005s : 25: predicate.arithmetic_simplify 1.08% : 0.000003s : 17: predicate.cast_eliminate 0.51% : 0.000001s : 8: predicate.check_bprop_eliminate 0.40% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.49% : 0.000001s : 8: predicate.depend_value_elim 1.09% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.10% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.95% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.93% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 4: predicate.elim_not_effective 0.29% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.33% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.15% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 21: predicate.environ_get_depend_swap 1.73% : 0.000004s : 29: predicate.environ_get_eliminate 1.10% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.72% : 0.000004s : 29: predicate.exchange_switch_depend_value 2.90% : 0.000007s : 29: predicate.float_depend_g_call 0.42% : 0.000001s : 8: predicate.float_environ_get_switch 0.62% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 4: predicate.fold_const_symbol 0.53% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000001s : 4: predicate.graph_param_transform 0.49% : 0.000001s : 8: predicate.incorporate_call 0.39% : 0.000001s : 8: predicate.incorporate_call_switch 6.69% : 0.000016s : 70: predicate.inline 0.67% : 0.000002s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.81% : 0.000002s : 8: predicate.less_batch_normalization 1.87% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.52% : 0.000006s : 46: predicate.load_eliminater 0.79% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.75% : 0.000007s : 47: predicate.loop_unroll_before_grad 1.51% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.45% : 0.000001s : 8: predicate.merge_addn 0.45% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.46% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.98% : 0.000002s : 17: predicate.minmaximum_grad 0.75% : 0.000002s : 4: predicate.mutable_eliminate 0.32% : 0.000001s : 4: predicate.opt_reshape 0.27% : 0.000001s : 4: predicate.parallel_virtual_node 2.17% : 0.000005s : 29: predicate.partial_defer_inline 1.66% : 0.000004s : 25: predicate.partial_eliminate 0.92% : 0.000002s : 17: predicate.print_const_string_wrapper 0.45% : 0.000001s : 8: predicate.reduce_all_const_elim 1.45% : 0.000003s : 17: predicate.reduce_eliminate 2.50% : 0.000006s : 46: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 8: predicate.remove_not_recompute_node 1.38% : 0.000003s : 29: predicate.replace_applicator 0.45% : 0.000001s : 8: predicate.replace_old_param 0.32% : 0.000001s : 4: predicate.reset_defer_inline 1.11% : 0.000003s : 17: predicate.reshape_eliminate 0.49% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.30% : 0.000001s : 4: predicate.row_tensor_eliminate 0.65% : 0.000002s : 8: predicate.same_eliminate 0.37% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.62% : 0.000002s : 8: predicate.shard_identity_eliminate 0.55% : 0.000001s : 8: predicate.special_op_eliminate 0.60% : 0.000001s : 8: predicate.specialize_transform 0.66% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.63% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.86% : 0.000004s : 29: predicate.switch_defer_inline 2.30% : 0.000006s : 37: predicate.switch_layer_defer_inline 6.01% : 0.000014s : 92: predicate.switch_simplify 0.99% : 0.000002s : 17: predicate.tile_eliminate 1.18% : 0.000003s : 17: predicate.transpose_eliminate 1.55% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.54% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.22% : 0.000008s : 37: predicate.tuple_list_get_item_eliminator 1.59% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.14% : 0.000005s : 33: predicate.tuple_list_set_item_eliminator 1.68% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.52% : 0.000006s : 46: predicate.updatestate_pure_node_eliminater 3.01% : 0.000007s : 54: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 4: predicate.value_based_eliminate 0.49% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000912 16 58.47% : 0.000533s : 6: func_graph_cloner_run.FuncGraphClonerGraph 41.53% : 0.000379s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.033334 192 0.01% : 0.000004s : 1: ForceFp32Comm 9.26% : 0.003087s : 1: add_attr 9.23% : 0.003077s : 1: add_attr_with_inline 0.01% : 0.000003s : 1: add_comm_op_reuse_tag 0.15% : 0.000049s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.30% : 0.000100s : 1: auto_monad 0.06% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.35% : 0.000449s : 1: bootstrap 0.08% : 0.000026s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.05% : 0.000017s : 1: control_data_broadcast_order 0.03% : 0.000010s : 1: convert_after_rewriter 0.07% : 0.000023s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.17% : 0.000056s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000011s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000007s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.38% : 0.000460s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.50% : 0.000500s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.04% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000015s : 1: opt.transform.mutable_eliminate 4.61% : 0.001538s : 78: opt.transform.opt_a 0.09% : 0.000031s : 1: opt.transform.opt_after_cconv 0.08% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.31% : 0.000104s : 28: opt.transform.opt_b 0.15% : 0.000049s : 2: opt.transform.opt_trans_graph 0.11% : 0.000037s : 4: opt.transform.symbol_engine_opt 9.42% : 0.003139s : 1: opt_a 0.32% : 0.000106s : 1: opt_after_cconv 1.42% : 0.000474s : 1: opt_after_jit_grad 0.62% : 0.000207s : 1: opt_b 15.56% : 0.005188s : 1: optimize 0.06% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000008s : 1: order_py_execute_after_rewriter 0.07% : 0.000022s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.13% : 0.000044s : 1: pre_auto_parallel 0.10% : 0.000034s : 1: py_interpret_to_execute 0.04% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000005s : 1: remove_cast_before_assign_add 0.05% : 0.000016s : 1: remove_dup_value 1.14% : 0.000381s : 1: renormalize.infer 1.16% : 0.000385s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000038s : 1: rewriter_after_opt_a 0.34% : 0.000114s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.24% : 0.000079s : 1: symbol_engine_optimizer 0.24% : 0.000079s : 1: tuple_transform 39.19% : 0.013064s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:07.827.637 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:07.827.896 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0778453, [21] [bootstrap]: 0.00043197 [type_inference]: 0.041898 [event_method]: 4.963e-05 [auto_monad]: 0.00010037 [graph_reusing]: 7.75e-06 [inline]: 2.06e-06 [add_attr]: 0.00321426, [1] [add_attr_with_inline]: 0.00320304, [1] [Cycle 1]: 9.696e-05, [2] [tag_attr]: 2.868e-05 [meta_addattr_fg_expand]: 8.29998e-06 [parallel-infer-symbol]: 3.46001e-06 [pre_auto_parallel]: 4.542e-05 [insert-virtual-dataset]: 2.36e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 2.29999e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.0307396, [53] [py_interpret_to_execute]: 3.992e-05 [rewriter_before_opt_a]: 0.00011896 [opt_a]: 0.0037139, [2] [Cycle 1]: 0.00286443, [45] [expand_dump_flag]: 4.18001e-06 [switch_simplify]: 0.0001303 [loop_unroll]: 4.222e-05 [a_1]: 0.00087919 [with_stream_mark]: 2.008e-05 [recompute_prepare]: 1.149e-05 [updatestate_depend_eliminate]: 4.99e-06 [updatestate_assign_eliminate]: 3.85e-06 [updatestate_loads_eliminate]: 2.79001e-06 [parameter_eliminate]: 2.77002e-06 [a_2]: 0.00011306 [accelerated_algorithm]: 7.56001e-06 [shard]: 2.45002e-06 [meta_shard_fg_expand]: 2.41e-06 [shard_inline]: 7.00998e-06 [merge_send_recv]: 9.64e-06 [auto_parallel]: 7.97e-06 [parallel]: 1.905e-05 [flash_sp]: 9.09998e-06 [merge_comm]: 3.96001e-06 [allreduce_fusion]: 3.52002e-06 [matmul_add_comm_reduction]: 1.037e-05 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 1.037e-05 [virtual_dataset]: 7.23999e-06 [get_grad_eliminate_]: 6.85002e-06 [virtual_output]: 7.06999e-06 [merge_forward]: 4.05998e-06 [cell_reuse_recompute_pass]: 1.20999e-06 [offload_activation]: 1.047e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.555e-05 [merge_recompute_call_nodes]: 1.62999e-06 [before_grad]: 1.172e-05 [set_forward_comm_id_for_comm_node_pass]: 3.95e-06 [meta_fg_expand]: 3.14999e-06 [flash_sp_send_recv_attached]: 2.71e-06 [receive_attached]: 1.99999e-06 [after_resolve]: 1.263e-05 [a_after_grad]: 1.025e-05 [renormalize]: 0.00091519 [add_forward_monad_depend]: 7.28e-06 [auto_monad_grad]: 3.09001e-06 [auto_monad_eliminator]: 1.695e-05 [cse]: 2.911e-05 [a_3]: 6.397e-05 [Cycle 2]: 0.00083586, [45] [expand_dump_flag]: 1.53002e-06 [switch_simplify]: 8.43999e-06 [loop_unroll]: 6.41e-06 [a_1]: 0.00013571 [with_stream_mark]: 1.347e-05 [recompute_prepare]: 6.38998e-06 [updatestate_depend_eliminate]: 3.11001e-06 [updatestate_assign_eliminate]: 2.56998e-06 [updatestate_loads_eliminate]: 2.76999e-06 [parameter_eliminate]: 1.27e-06 [a_2]: 0.00010128 [accelerated_algorithm]: 7.05998e-06 [shard]: 1.57001e-06 [meta_shard_fg_expand]: 1.43002e-06 [shard_inline]: 6.58998e-06 [merge_send_recv]: 6.16998e-06 [auto_parallel]: 7.01001e-06 [parallel]: 5.46002e-06 [flash_sp]: 3.55e-06 [merge_comm]: 3.36999e-06 [allreduce_fusion]: 3.21001e-06 [matmul_add_comm_reduction]: 7.22002e-06 [allreduce_slice_to_reducescatter]: 5.39992e-07 [virtual_shard_identity]: 8.00999e-06 [virtual_dataset]: 6.34999e-06 [get_grad_eliminate_]: 5.95002e-06 [virtual_output]: 6.08998e-06 [merge_forward]: 2.99999e-06 [cell_reuse_recompute_pass]: 1.46002e-06 [offload_activation]: 7.03e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.685e-05 [merge_recompute_call_nodes]: 1.19e-06 [before_grad]: 9.78002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.7e-06 [meta_fg_expand]: 2.41e-06 [flash_sp_send_recv_attached]: 1.30001e-06 [receive_attached]: 1.03001e-06 [after_resolve]: 1.14e-05 [a_after_grad]: 9.57001e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.81e-06 [auto_monad_grad]: 1.42999e-06 [auto_monad_eliminator]: 7.77e-06 [cse]: 1.309e-05 [a_3]: 5.193e-05 [py_interpret_to_execute_after_opt_a]: 1.435e-05 [slice_cell_reuse_recomputed_activation]: 4.78001e-06 [rewriter_after_opt_a]: 4.21e-05 [convert_after_rewriter]: 9.91998e-06 [order_py_execute_after_rewriter]: 8.37998e-06 [mutable_eliminate]: 0.0246756 [opt_b]: 0.00035568, [1] [Cycle 1]: 0.00034118, [7] [b_1]: 0.0002085 [b_2]: 9.80002e-06 [updatestate_depend_eliminate]: 1.383e-05 [updatestate_assign_eliminate]: 3.3e-06 [updatestate_loads_eliminate]: 2.98e-06 [renormalize]: 1.19e-06 [cse]: 3.769e-05 [optimize_parallel_all_gather_comm]: 2.931e-05 [overlap_param_gather]: 4.85999e-06 [cconv]: 4.165e-05 [loop_unroll]: 0.00069729 [opt_after_cconv]: 0.00015276, [1] [Cycle 1]: 0.00014188, [7] [c_1]: 3.817e-05 [parameter_eliminate]: 6.84999e-06 [updatestate_depend_eliminate]: 7.15998e-06 [updatestate_assign_eliminate]: 2.63e-06 [updatestate_loads_eliminate]: 2.46e-06 [cse]: 2.127e-05 [renormalize]: 6.60017e-07 [remove_dup_value]: 1.912e-05 [tuple_transform]: 0.00010807, [1] [Cycle 1]: 9.919e-05, [4] [d_1]: 5.683e-05 [none_parameter_eliminate]: 1.67001e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 8.08999e-06 [partial_unused_args_eliminate]: 5.02e-06 [add_recomputation]: 6.004e-05 [cse_after_recomputation]: 3.005e-05, [1] [Cycle 1]: 2.288e-05, [1] [cse]: 1.344e-05 [environ_conv]: 9.24e-06 [swap_dp_allreduce_reducescatter]: 9.49e-06 [bias_add_comm_swap]: 5.78997e-06 [label_micro_interleaved_index]: 8.03999e-06 [label_fine_grained_interleaved_index]: 5.74e-06 [merge_cast_opt]: 3.80998e-06 [slice_recompute_activation]: 4.69998e-06 [micro_interleaved_order_control]: 5.14e-06 [assign_add_opt]: 3.56001e-06 [ForceFp32Comm]: 3.38e-06 [remove_cast_before_assign_add]: 3.45e-06 [full_micro_interleaved_order_control]: 5.09998e-06 [reorder_send_recv_between_fp_bp]: 6.81001e-06 [comm_op_add_attrs]: 4.317e-05 [add_comm_op_reuse_tag]: 3.93001e-06 [interleave_split_concat_branches]: 4.4e-06 [interleave_parallel_branches]: 3.58999e-06 [overlap_opt_shard_in_pipeline]: 3.96001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.47e-06 [control_data_broadcast_order]: 1.984e-05 [grouped_pairwise_exchange_alltoall]: 4.48999e-06 [offloading_packed_experts]: 6.62002e-06 [overlap_recompute_and_grad_model_parallel]: 8.02003e-06 [overlap_grad_matmul_and_grad_allreduce]: 4e-06 [overlap_recompute_allgather_and_fa_grad]: 3.90998e-06 [overlap_recompute_comm]: 5.76003e-06 [overlap_grad_ring_attention]: 7e-06 [overlap_grad_flash_sp]: 2.77e-05 [begin_end_overlap_inline]: 3.26999e-06 [split_matmul_comm_elemetwise]: 4.62e-06 [split_layernorm_comm]: 4.27003e-06 [handle_group_info]: 3.91001e-06 [symbol_engine_optimizer]: 0.00011569, [1] [Cycle 1]: 0.00010681, [6] [build]: 3.86001e-06 [elim_shapecalc]: 1.305e-05 [elim_not_effective]: 1.581e-05 [opt_reshape]: 8.55001e-06 [fold_const_symbol]: 1.277e-05 [renormalize]: 2.79979e-07 [detach_backward]: 4.79998e-06 [pipeline_parallel_scheduler]: 1.87001e-06 [auto_monad_reorder]: 2.115e-05 [get_jit_bprop_graph]: 2.19001e-06 [rewriter_after_jit_bprop_graph]: 7.04001e-06 [opt_after_jit_grad]: 0.0005612 [validate]: 4.611e-05 Sums bootstrap : 0.000432s : 0.59% type_inference : 0.041898s : 57.64% event_method : 0.000050s : 0.07% auto_monad : 0.000100s : 0.14% graph_reusing : 0.000008s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000029s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000045s : 0.06% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000040s : 0.05% optimize.rewriter_before_opt_a : 0.000119s : 0.16% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000139s : 0.19% optimize.opt_a.loop_unroll : 0.000049s : 0.07% optimize.opt_a.a_1 : 0.001015s : 1.40% optimize.opt_a.with_stream_mark : 0.000034s : 0.05% optimize.opt_a.recompute_prepare : 0.000018s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000214s : 0.29% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.02% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.02% optimize.opt_a.merge_send_recv : 0.000016s : 0.02% optimize.opt_a.auto_parallel : 0.000015s : 0.02% optimize.opt_a.parallel : 0.000025s : 0.03% optimize.opt_a.flash_sp : 0.000013s : 0.02% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.03% optimize.opt_a.virtual_dataset : 0.000014s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.02% optimize.opt_a.virtual_output : 0.000013s : 0.02% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000018s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.00% optimize.opt_a.after_resolve : 0.000024s : 0.03% optimize.opt_a.a_after_grad : 0.000020s : 0.03% optimize.opt_a.renormalize : 0.000915s : 1.26% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.03% optimize.opt_a.cse : 0.000042s : 0.06% optimize.opt_a.a_3 : 0.000116s : 0.16% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000042s : 0.06% optimize.convert_after_rewriter : 0.000010s : 0.01% optimize.order_py_execute_after_rewriter : 0.000008s : 0.01% optimize.mutable_eliminate : 0.024676s : 33.95% optimize.opt_b.b_1 : 0.000208s : 0.29% optimize.opt_b.b_2 : 0.000010s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000014s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000038s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000029s : 0.04% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000042s : 0.06% optimize.loop_unroll : 0.000697s : 0.96% optimize.opt_after_cconv.c_1 : 0.000038s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000021s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.03% optimize.tuple_transform.d_1 : 0.000057s : 0.08% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000060s : 0.08% optimize.cse_after_recomputation.cse : 0.000013s : 0.02% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.01% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000003s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000007s : 0.01% optimize.comm_op_add_attrs : 0.000043s : 0.06% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000020s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000006s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000028s : 0.04% optimize.begin_end_overlap_inline : 0.000003s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000561s : 0.77% validate : 0.000046s : 0.06% Time group info: ------[substitution.] 0.000230 34 0.96% : 0.000002s : 2: substitution.elim_not_effective 0.62% : 0.000001s : 2: substitution.fold_const_symbol 2.97% : 0.000007s : 4: substitution.graph_param_transform 78.05% : 0.000179s : 8: substitution.inline 1.90% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.02% : 0.000005s : 4: substitution.remove_not_recompute_node 2.02% : 0.000005s : 4: substitution.replace_old_param 4.55% : 0.000010s : 2: substitution.switch_simplify 6.92% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.041824 2 96.75% : 0.040465s : 1: type_inference.infer 3.25% : 0.001359s : 1: type_inference.specialize ------[replace.] 0.000127 14 44.15% : 0.000056s : 8: replace.inline 37.70% : 0.000048s : 2: replace.switch_simplify 18.16% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000198 14 88.23% : 0.000175s : 8: match.inline 4.67% : 0.000009s : 2: match.switch_simplify 7.10% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000260 1520 0.86% : 0.000002s : 17: predicate.accumulaten_eliminater 0.60% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.41% : 0.000001s : 8: predicate.addn_check_dump 0.93% : 0.000002s : 17: predicate.addn_zero_filter 0.81% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.44% : 0.000006s : 25: predicate.arithmetic_simplify 0.91% : 0.000002s : 17: predicate.cast_eliminate 0.50% : 0.000001s : 8: predicate.check_bprop_eliminate 0.41% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.46% : 0.000001s : 8: predicate.depend_value_elim 0.98% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.07% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.93% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.09% : 0.000003s : 21: predicate.environ_get_depend_swap 1.66% : 0.000004s : 29: predicate.environ_get_eliminate 1.11% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.64% : 0.000004s : 29: predicate.exchange_switch_depend_value 2.62% : 0.000007s : 29: predicate.float_depend_g_call 0.41% : 0.000001s : 8: predicate.float_environ_get_switch 0.69% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.57% : 0.000001s : 8: predicate.get_grad_eliminate 0.27% : 0.000001s : 4: predicate.graph_param_transform 0.46% : 0.000001s : 8: predicate.incorporate_call 0.40% : 0.000001s : 8: predicate.incorporate_call_switch 6.65% : 0.000017s : 70: predicate.inline 0.58% : 0.000002s : 8: predicate.inline_without_move 0.24% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.62% : 0.000002s : 8: predicate.less_batch_normalization 1.80% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.54% : 0.000007s : 46: predicate.load_eliminater 1.04% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.65% : 0.000007s : 47: predicate.loop_unroll_before_grad 1.67% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.43% : 0.000001s : 8: predicate.merge_addn 0.50% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.45% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 17: predicate.minmaximum_grad 2.45% : 0.000006s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.30% : 0.000001s : 4: predicate.parallel_virtual_node 2.00% : 0.000005s : 29: predicate.partial_defer_inline 1.56% : 0.000004s : 25: predicate.partial_eliminate 0.88% : 0.000002s : 17: predicate.print_const_string_wrapper 0.45% : 0.000001s : 8: predicate.reduce_all_const_elim 1.17% : 0.000003s : 17: predicate.reduce_eliminate 2.52% : 0.000007s : 46: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 8: predicate.remove_not_recompute_node 1.32% : 0.000003s : 29: predicate.replace_applicator 0.41% : 0.000001s : 8: predicate.replace_old_param 0.49% : 0.000001s : 4: predicate.reset_defer_inline 0.91% : 0.000002s : 17: predicate.reshape_eliminate 0.51% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 4: predicate.row_tensor_eliminate 0.58% : 0.000002s : 8: predicate.same_eliminate 0.32% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.73% : 0.000002s : 8: predicate.shard_identity_eliminate 0.71% : 0.000002s : 8: predicate.special_op_eliminate 0.65% : 0.000002s : 8: predicate.specialize_transform 0.75% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.55% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.73% : 0.000005s : 29: predicate.switch_defer_inline 2.06% : 0.000005s : 37: predicate.switch_layer_defer_inline 5.86% : 0.000015s : 92: predicate.switch_simplify 1.33% : 0.000003s : 17: predicate.tile_eliminate 0.96% : 0.000002s : 17: predicate.transpose_eliminate 1.55% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.73% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.54% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.51% : 0.000009s : 37: predicate.tuple_list_get_item_eliminator 1.44% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000006s : 33: predicate.tuple_list_set_item_eliminator 1.62% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.42% : 0.000006s : 46: predicate.updatestate_pure_node_eliminater 2.97% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.47% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.48% : 0.000001s : 8: predicate.virtual_output_eliminate 0.22% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001061 16 60.61% : 0.000643s : 6: func_graph_cloner_run.FuncGraphClonerGraph 39.39% : 0.000418s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.114476 192 0.01% : 0.000006s : 1: ForceFp32Comm 2.82% : 0.003224s : 1: add_attr 2.80% : 0.003208s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.06% : 0.000064s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.10% : 0.000111s : 1: auto_monad 0.03% : 0.000030s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.41% : 0.000474s : 1: bootstrap 0.04% : 0.000045s : 1: cconv 0.04% : 0.000047s : 1: comm_op_add_attrs 0.02% : 0.000023s : 1: control_data_broadcast_order 0.01% : 0.000013s : 1: convert_after_rewriter 0.03% : 0.000033s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000025s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.05% : 0.000063s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.01% : 0.000014s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000007s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 0.62% : 0.000705s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 21.57% : 0.024691s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000041s : 1: opt.transform.mutable_eliminate 1.36% : 0.001557s : 78: opt.transform.opt_a 0.03% : 0.000037s : 1: opt.transform.opt_after_cconv 0.03% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.12% : 0.000132s : 28: opt.transform.opt_b 0.05% : 0.000062s : 2: opt.transform.opt_trans_graph 0.04% : 0.000045s : 4: opt.transform.symbol_engine_opt 3.25% : 0.003717s : 1: opt_a 0.14% : 0.000157s : 1: opt_after_cconv 0.50% : 0.000573s : 1: opt_after_jit_grad 0.31% : 0.000360s : 1: opt_b 27.22% : 0.031157s : 1: optimize 0.03% : 0.000033s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000011s : 1: order_py_execute_after_rewriter 0.03% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000009s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.05% : 0.000054s : 1: pre_auto_parallel 0.04% : 0.000044s : 1: py_interpret_to_execute 0.02% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.02% : 0.000023s : 1: remove_dup_value 0.41% : 0.000473s : 1: renormalize.infer 0.38% : 0.000432s : 1: renormalize.specialize 0.01% : 0.000010s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000047s : 1: rewriter_after_opt_a 0.11% : 0.000124s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.10% : 0.000119s : 1: symbol_engine_optimizer 0.10% : 0.000111s : 1: tuple_transform 36.65% : 0.041955s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:08.334.390 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0488452, [21] [bootstrap]: 0.000465 [type_inference]: 0.0259588 [event_method]: 5.159e-05 [auto_monad]: 0.00010343 [graph_reusing]: 7.97998e-06 [inline]: 2.63e-06 [add_attr]: 0.00357319, [1] [add_attr_with_inline]: 0.00356175, [1] [Cycle 1]: 8.078e-05, [2] [tag_attr]: 2.98e-05 [meta_addattr_fg_expand]: 8.08001e-06 [parallel-infer-symbol]: 3.75e-06 [pre_auto_parallel]: 4.627e-05 [insert-virtual-dataset]: 3.26001e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 2.12999e-06 [pipeline_split]: 2.07999e-06 [optimize]: 0.00551959, [53] [py_interpret_to_execute]: 3.64e-05 [rewriter_before_opt_a]: 0.00011496 [opt_a]: 0.00341609, [2] [Cycle 1]: 0.00273937, [45] [expand_dump_flag]: 3.75e-06 [switch_simplify]: 0.00013342 [loop_unroll]: 4.102e-05 [a_1]: 0.00087973 [with_stream_mark]: 1.772e-05 [recompute_prepare]: 1.09e-05 [updatestate_depend_eliminate]: 4.63999e-06 [updatestate_assign_eliminate]: 3.06999e-06 [updatestate_loads_eliminate]: 2.67001e-06 [parameter_eliminate]: 1.91003e-06 [a_2]: 8.425e-05 [accelerated_algorithm]: 7.26999e-06 [shard]: 1.74e-06 [meta_shard_fg_expand]: 2.48e-06 [shard_inline]: 6.74999e-06 [merge_send_recv]: 8.83001e-06 [auto_parallel]: 7.41999e-06 [parallel]: 1.944e-05 [flash_sp]: 8.48001e-06 [merge_comm]: 4.46002e-06 [allreduce_fusion]: 3.32002e-06 [matmul_add_comm_reduction]: 1.027e-05 [allreduce_slice_to_reducescatter]: 7.39994e-07 [virtual_shard_identity]: 9.27999e-06 [virtual_dataset]: 7.11999e-06 [get_grad_eliminate_]: 6.17999e-06 [virtual_output]: 6.51999e-06 [merge_forward]: 3.56001e-06 [cell_reuse_recompute_pass]: 1.12999e-06 [offload_activation]: 9.61998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.316e-05 [merge_recompute_call_nodes]: 1.51998e-06 [before_grad]: 1.028e-05 [set_forward_comm_id_for_comm_node_pass]: 3.68999e-06 [meta_fg_expand]: 3.33e-06 [flash_sp_send_recv_attached]: 2.61e-06 [receive_attached]: 2.69001e-06 [after_resolve]: 1.269e-05 [a_after_grad]: 9.94999e-06 [renormalize]: 0.00100539 [add_forward_monad_depend]: 6.59999e-06 [auto_monad_grad]: 2.64999e-06 [auto_monad_eliminator]: 1.637e-05 [cse]: 2.87e-05 [a_3]: 5.047e-05 [Cycle 2]: 0.00066604, [45] [expand_dump_flag]: 9.5999e-07 [switch_simplify]: 8.50001e-06 [loop_unroll]: 6.49999e-06 [a_1]: 0.00013548 [with_stream_mark]: 1.192e-05 [recompute_prepare]: 6.84001e-06 [updatestate_depend_eliminate]: 3.19001e-06 [updatestate_assign_eliminate]: 2.62001e-06 [updatestate_loads_eliminate]: 2.98e-06 [parameter_eliminate]: 1.49e-06 [a_2]: 7.392e-05 [accelerated_algorithm]: 6.48003e-06 [shard]: 1.15001e-06 [meta_shard_fg_expand]: 1.66e-06 [shard_inline]: 6.78e-06 [merge_send_recv]: 5.82999e-06 [auto_parallel]: 5.49e-06 [parallel]: 5.89e-06 [flash_sp]: 3.71999e-06 [merge_comm]: 3.51999e-06 [allreduce_fusion]: 3.17002e-06 [matmul_add_comm_reduction]: 7.05998e-06 [allreduce_slice_to_reducescatter]: 4.20026e-07 [virtual_shard_identity]: 6.96999e-06 [virtual_dataset]: 6.08002e-06 [get_grad_eliminate_]: 6.13002e-06 [virtual_output]: 5.74e-06 [merge_forward]: 2.91999e-06 [cell_reuse_recompute_pass]: 1.99999e-06 [offload_activation]: 6.86001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.439e-05 [merge_recompute_call_nodes]: 1.27e-06 [before_grad]: 9.72999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.60998e-06 [meta_fg_expand]: 2.01e-06 [flash_sp_send_recv_attached]: 7.89994e-07 [receive_attached]: 1.28002e-06 [after_resolve]: 1.141e-05 [a_after_grad]: 9.19e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.30999e-06 [auto_monad_grad]: 9.30013e-07 [auto_monad_eliminator]: 8.51997e-06 [cse]: 1.456e-05 [a_3]: 4.085e-05 [py_interpret_to_execute_after_opt_a]: 1.135e-05 [slice_cell_reuse_recomputed_activation]: 2.40002e-06 [rewriter_after_opt_a]: 3.526e-05 [convert_after_rewriter]: 6.66e-06 [order_py_execute_after_rewriter]: 5.51998e-06 [mutable_eliminate]: 0.00051773 [opt_b]: 0.00020717, [1] [Cycle 1]: 0.00020014, [7] [b_1]: 0.00012689 [b_2]: 8.53001e-06 [updatestate_depend_eliminate]: 6.79001e-06 [updatestate_assign_eliminate]: 2.49999e-06 [updatestate_loads_eliminate]: 2.20002e-06 [renormalize]: 9.70002e-07 [cse]: 1.727e-05 [optimize_parallel_all_gather_comm]: 1.697e-05 [overlap_param_gather]: 2.22999e-06 [cconv]: 2.757e-05 [loop_unroll]: 0.00043837 [opt_after_cconv]: 0.00010394, [1] [Cycle 1]: 9.807e-05, [7] [c_1]: 3.291e-05 [parameter_eliminate]: 3.81999e-06 [updatestate_depend_eliminate]: 5.35999e-06 [updatestate_assign_eliminate]: 3.01001e-06 [updatestate_loads_eliminate]: 2.32001e-06 [cse]: 1.609e-05 [renormalize]: 4.60015e-07 [remove_dup_value]: 1.292e-05 [tuple_transform]: 8.394e-05, [1] [Cycle 1]: 7.834e-05, [4] [d_1]: 5.065e-05 [none_parameter_eliminate]: 1.57001e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 7.56001e-06 [partial_unused_args_eliminate]: 1.94999e-06 [add_recomputation]: 4.836e-05 [cse_after_recomputation]: 2.186e-05, [1] [Cycle 1]: 1.681e-05, [1] [cse]: 1.143e-05 [environ_conv]: 5.35999e-06 [swap_dp_allreduce_reducescatter]: 4.95999e-06 [bias_add_comm_swap]: 2.98e-06 [label_micro_interleaved_index]: 4.70999e-06 [label_fine_grained_interleaved_index]: 2.73e-06 [merge_cast_opt]: 1.41002e-06 [slice_recompute_activation]: 2.02001e-06 [micro_interleaved_order_control]: 2.33998e-06 [assign_add_opt]: 1.22999e-06 [ForceFp32Comm]: 8.09989e-07 [remove_cast_before_assign_add]: 9.79984e-07 [full_micro_interleaved_order_control]: 2.20002e-06 [reorder_send_recv_between_fp_bp]: 2.61999e-06 [comm_op_add_attrs]: 1.01002e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.43002e-06 [overlap_opt_shard_in_pipeline]: 1.32999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.09e-06 [control_data_broadcast_order]: 1.325e-05 [grouped_pairwise_exchange_alltoall]: 1.83002e-06 [offloading_packed_experts]: 3.79002e-06 [overlap_recompute_and_grad_model_parallel]: 4.17e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.37e-06 [overlap_recompute_allgather_and_fa_grad]: 1.36002e-06 [overlap_recompute_comm]: 2.34001e-06 [overlap_grad_ring_attention]: 3.77002e-06 [overlap_grad_flash_sp]: 1.926e-05 [begin_end_overlap_inline]: 5.8001e-07 [split_matmul_comm_elemetwise]: 2.71999e-06 [split_layernorm_comm]: 1.74e-06 [handle_group_info]: 1.04e-06 [symbol_engine_optimizer]: 7.55e-05, [1] [Cycle 1]: 7.117e-05, [6] [build]: 2.62001e-06 [elim_shapecalc]: 9.93002e-06 [elim_not_effective]: 1.362e-05 [opt_reshape]: 6.93e-06 [fold_const_symbol]: 1.073e-05 [renormalize]: 2.3999e-07 [detach_backward]: 2.04e-06 [pipeline_parallel_scheduler]: 1.54e-06 [auto_monad_reorder]: 1.676e-05 [get_jit_bprop_graph]: 1.72999e-06 [rewriter_after_jit_bprop_graph]: 4.25999e-06 [opt_after_jit_grad]: 0.0128373 [validate]: 5.74e-05 Sums bootstrap : 0.000465s : 1.05% type_inference : 0.025959s : 58.64% event_method : 0.000052s : 0.12% auto_monad : 0.000103s : 0.23% graph_reusing : 0.000008s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000030s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000046s : 0.10% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000036s : 0.08% optimize.rewriter_before_opt_a : 0.000115s : 0.26% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000142s : 0.32% optimize.opt_a.loop_unroll : 0.000048s : 0.11% optimize.opt_a.a_1 : 0.001015s : 2.29% optimize.opt_a.with_stream_mark : 0.000030s : 0.07% optimize.opt_a.recompute_prepare : 0.000018s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000158s : 0.36% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.03% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.03% optimize.opt_a.merge_send_recv : 0.000015s : 0.03% optimize.opt_a.auto_parallel : 0.000013s : 0.03% optimize.opt_a.parallel : 0.000025s : 0.06% optimize.opt_a.flash_sp : 0.000012s : 0.03% optimize.opt_a.merge_comm : 0.000008s : 0.02% optimize.opt_a.allreduce_fusion : 0.000006s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.04% optimize.opt_a.virtual_dataset : 0.000013s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.03% optimize.opt_a.virtual_output : 0.000012s : 0.03% optimize.opt_a.merge_forward : 0.000006s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000016s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000020s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000024s : 0.05% optimize.opt_a.a_after_grad : 0.000019s : 0.04% optimize.opt_a.renormalize : 0.001005s : 2.27% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.06% optimize.opt_a.cse : 0.000043s : 0.10% optimize.opt_a.a_3 : 0.000091s : 0.21% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000035s : 0.08% optimize.convert_after_rewriter : 0.000007s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.01% optimize.mutable_eliminate : 0.000518s : 1.17% optimize.opt_b.b_1 : 0.000127s : 0.29% optimize.opt_b.b_2 : 0.000009s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000017s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000028s : 0.06% optimize.loop_unroll : 0.000438s : 0.99% optimize.opt_after_cconv.c_1 : 0.000033s : 0.07% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000016s : 0.04% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.03% optimize.tuple_transform.d_1 : 0.000051s : 0.11% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000048s : 0.11% optimize.cse_after_recomputation.cse : 0.000011s : 0.03% optimize.environ_conv : 0.000005s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000019s : 0.04% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000017s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.012837s : 29.00% validate : 0.000057s : 0.13% Time group info: ------[substitution.] 0.000234 34 0.85% : 0.000002s : 2: substitution.elim_not_effective 0.55% : 0.000001s : 2: substitution.fold_const_symbol 2.58% : 0.000006s : 4: substitution.graph_param_transform 79.26% : 0.000185s : 8: substitution.inline 1.64% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.02% : 0.000005s : 4: substitution.remove_not_recompute_node 2.17% : 0.000005s : 4: substitution.replace_old_param 4.24% : 0.000010s : 2: substitution.switch_simplify 6.69% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.025873 2 93.73% : 0.024251s : 1: type_inference.infer 6.27% : 0.001622s : 1: type_inference.specialize ------[replace.] 0.000128 14 41.24% : 0.000053s : 8: replace.inline 40.67% : 0.000052s : 2: replace.switch_simplify 18.08% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000203 14 89.05% : 0.000181s : 8: match.inline 4.26% : 0.000009s : 2: match.switch_simplify 6.70% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000249 1520 0.98% : 0.000002s : 17: predicate.accumulaten_eliminater 1.64% : 0.000004s : 4: predicate.ad_related_special_op_eliminate 0.39% : 0.000001s : 8: predicate.addn_check_dump 1.01% : 0.000003s : 17: predicate.addn_zero_filter 0.91% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.00% : 0.000005s : 25: predicate.arithmetic_simplify 0.99% : 0.000002s : 17: predicate.cast_eliminate 0.51% : 0.000001s : 8: predicate.check_bprop_eliminate 0.40% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.44% : 0.000001s : 8: predicate.depend_value_elim 0.97% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.27% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.95% : 0.000002s : 17: predicate.dict_set_item_eliminator 1.97% : 0.000005s : 8: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 4: predicate.elim_not_effective 0.32% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.21% : 0.000003s : 21: predicate.environ_get_depend_swap 1.58% : 0.000004s : 29: predicate.environ_get_eliminate 1.15% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.75% : 0.000004s : 29: predicate.exchange_switch_depend_value 2.63% : 0.000007s : 29: predicate.float_depend_g_call 0.40% : 0.000001s : 8: predicate.float_environ_get_switch 0.73% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.51% : 0.000001s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.46% : 0.000001s : 8: predicate.incorporate_call 0.39% : 0.000001s : 8: predicate.incorporate_call_switch 6.67% : 0.000017s : 70: predicate.inline 0.61% : 0.000002s : 8: predicate.inline_without_move 0.27% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.63% : 0.000002s : 8: predicate.less_batch_normalization 1.87% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.48% : 0.000006s : 46: predicate.load_eliminater 0.71% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.72% : 0.000007s : 47: predicate.loop_unroll_before_grad 1.47% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.42% : 0.000001s : 8: predicate.merge_addn 0.45% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.48% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.90% : 0.000002s : 17: predicate.minmaximum_grad 0.74% : 0.000002s : 4: predicate.mutable_eliminate 0.28% : 0.000001s : 4: predicate.opt_reshape 0.26% : 0.000001s : 4: predicate.parallel_virtual_node 2.11% : 0.000005s : 29: predicate.partial_defer_inline 1.63% : 0.000004s : 25: predicate.partial_eliminate 0.96% : 0.000002s : 17: predicate.print_const_string_wrapper 0.47% : 0.000001s : 8: predicate.reduce_all_const_elim 1.25% : 0.000003s : 17: predicate.reduce_eliminate 2.43% : 0.000006s : 46: predicate.redundant_stop_gradient_eliminater 0.32% : 0.000001s : 8: predicate.remove_not_recompute_node 1.41% : 0.000004s : 29: predicate.replace_applicator 0.45% : 0.000001s : 8: predicate.replace_old_param 0.29% : 0.000001s : 4: predicate.reset_defer_inline 0.97% : 0.000002s : 17: predicate.reshape_eliminate 0.47% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.67% : 0.000002s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.69% : 0.000002s : 8: predicate.shard_identity_eliminate 0.76% : 0.000002s : 8: predicate.special_op_eliminate 0.54% : 0.000001s : 8: predicate.specialize_transform 0.64% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.55% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.80% : 0.000004s : 29: predicate.switch_defer_inline 2.24% : 0.000006s : 37: predicate.switch_layer_defer_inline 6.05% : 0.000015s : 92: predicate.switch_simplify 1.05% : 0.000003s : 17: predicate.tile_eliminate 1.00% : 0.000002s : 17: predicate.transpose_eliminate 1.49% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.11% : 0.000008s : 37: predicate.tuple_list_get_item_eliminator 1.46% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000006s : 33: predicate.tuple_list_set_item_eliminator 1.69% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.61% : 0.000007s : 46: predicate.updatestate_pure_node_eliminater 3.01% : 0.000007s : 54: predicate.updatestate_useless_node_eliminater 0.30% : 0.000001s : 4: predicate.value_based_eliminate 0.51% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.50% : 0.000001s : 8: predicate.virtual_output_eliminate 0.22% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001077 16 53.54% : 0.000576s : 6: func_graph_cloner_run.FuncGraphClonerGraph 46.46% : 0.000500s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.060634 192 0.01% : 0.000003s : 1: ForceFp32Comm 5.90% : 0.003579s : 1: add_attr 5.88% : 0.003566s : 1: add_attr_with_inline 0.01% : 0.000003s : 1: add_comm_op_reuse_tag 0.09% : 0.000052s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.18% : 0.000110s : 1: auto_monad 0.03% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.81% : 0.000492s : 1: bootstrap 0.05% : 0.000031s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000017s : 1: control_data_broadcast_order 0.02% : 0.000010s : 1: convert_after_rewriter 0.04% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000008s : 1: environ_conv 0.10% : 0.000061s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000012s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.74% : 0.000446s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.87% : 0.000527s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000015s : 1: opt.transform.mutable_eliminate 2.56% : 0.001550s : 78: opt.transform.opt_a 0.05% : 0.000032s : 1: opt.transform.opt_after_cconv 0.09% : 0.000053s : 1: opt.transform.opt_after_jit_grad 0.17% : 0.000103s : 28: opt.transform.opt_b 0.09% : 0.000055s : 2: opt.transform.opt_trans_graph 0.06% : 0.000038s : 4: opt.transform.symbol_engine_opt 5.64% : 0.003420s : 1: opt_a 0.18% : 0.000107s : 1: opt_after_cconv 21.21% : 0.012858s : 1: opt_after_jit_grad 0.35% : 0.000210s : 1: opt_b 9.11% : 0.005524s : 1: optimize 0.03% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000022s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000051s : 1: pre_auto_parallel 0.07% : 0.000041s : 1: py_interpret_to_execute 0.02% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000017s : 1: remove_dup_value 0.91% : 0.000553s : 1: renormalize.infer 0.73% : 0.000443s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000040s : 1: rewriter_after_opt_a 0.20% : 0.000119s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.000078s : 1: symbol_engine_optimizer 0.14% : 0.000087s : 1: tuple_transform 42.86% : 0.025987s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:08.895.062 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:08.895.340 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0495414, [21] [bootstrap]: 0.00048037 [type_inference]: 0.0255511 [event_method]: 9.383e-05 [auto_monad]: 0.00010513 [graph_reusing]: 8.05999e-06 [inline]: 2.67001e-06 [add_attr]: 0.00344661, [1] [add_attr_with_inline]: 0.00343816, [1] [Cycle 1]: 8.083e-05, [2] [tag_attr]: 2.786e-05 [meta_addattr_fg_expand]: 8.28001e-06 [parallel-infer-symbol]: 2.94999e-06 [pre_auto_parallel]: 4.271e-05 [insert-virtual-dataset]: 2.31e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.18998e-06 [pipeline_split]: 1.67999e-06 [optimize]: 0.00635858, [53] [py_interpret_to_execute]: 3.798e-05 [rewriter_before_opt_a]: 0.00011919 [opt_a]: 0.00393866, [2] [Cycle 1]: 0.00298453, [45] [expand_dump_flag]: 4.03999e-06 [switch_simplify]: 0.00012543 [loop_unroll]: 4.255e-05 [a_1]: 0.00089508 [with_stream_mark]: 1.775e-05 [recompute_prepare]: 1.128e-05 [updatestate_depend_eliminate]: 4.60001e-06 [updatestate_assign_eliminate]: 4.55999e-06 [updatestate_loads_eliminate]: 3.50003e-06 [parameter_eliminate]: 1.92001e-06 [a_2]: 0.00013256 [accelerated_algorithm]: 8.84003e-06 [shard]: 2.07001e-06 [meta_shard_fg_expand]: 2.53e-06 [shard_inline]: 8.12998e-06 [merge_send_recv]: 1.069e-05 [auto_parallel]: 8.24002e-06 [parallel]: 1.716e-05 [flash_sp]: 8.79998e-06 [merge_comm]: 4.81002e-06 [allreduce_fusion]: 4.20999e-06 [matmul_add_comm_reduction]: 9.86e-06 [allreduce_slice_to_reducescatter]: 8.99978e-07 [virtual_shard_identity]: 1.065e-05 [virtual_dataset]: 8.51002e-06 [get_grad_eliminate_]: 7.83001e-06 [virtual_output]: 7.61999e-06 [merge_forward]: 4.82998e-06 [cell_reuse_recompute_pass]: 1.27e-06 [offload_activation]: 1.178e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.793e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.46e-05 [set_forward_comm_id_for_comm_node_pass]: 5.03002e-06 [meta_fg_expand]: 4.19002e-06 [flash_sp_send_recv_attached]: 2.45002e-06 [receive_attached]: 2.31e-06 [after_resolve]: 1.391e-05 [a_after_grad]: 1.357e-05 [renormalize]: 0.00099519 [add_forward_monad_depend]: 5.59e-06 [auto_monad_grad]: 2.37001e-06 [auto_monad_eliminator]: 1.84e-05 [cse]: 3.573e-05 [a_3]: 7.456e-05 [Cycle 2]: 0.00094052, [45] [expand_dump_flag]: 1.19e-06 [switch_simplify]: 9.20001e-06 [loop_unroll]: 7.68001e-06 [a_1]: 0.00018092 [with_stream_mark]: 1.231e-05 [recompute_prepare]: 8.27e-06 [updatestate_depend_eliminate]: 4.23999e-06 [updatestate_assign_eliminate]: 3.42002e-06 [updatestate_loads_eliminate]: 2.90998e-06 [parameter_eliminate]: 1.12e-06 [a_2]: 0.00012258 [accelerated_algorithm]: 8.04002e-06 [shard]: 1.32999e-06 [meta_shard_fg_expand]: 1.67999e-06 [shard_inline]: 8.21002e-06 [merge_send_recv]: 6.48998e-06 [auto_parallel]: 6.29999e-06 [parallel]: 5.32001e-06 [flash_sp]: 3.62002e-06 [merge_comm]: 4.26001e-06 [allreduce_fusion]: 4.26001e-06 [matmul_add_comm_reduction]: 7.11001e-06 [allreduce_slice_to_reducescatter]: 5.3001e-07 [virtual_shard_identity]: 8.74e-06 [virtual_dataset]: 7.77002e-06 [get_grad_eliminate_]: 7.38999e-06 [virtual_output]: 7.28999e-06 [merge_forward]: 3.40998e-06 [cell_reuse_recompute_pass]: 1.91003e-06 [offload_activation]: 7.68001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.679e-05 [merge_recompute_call_nodes]: 9.39996e-07 [before_grad]: 1.243e-05 [set_forward_comm_id_for_comm_node_pass]: 4.68999e-06 [meta_fg_expand]: 2.68003e-06 [flash_sp_send_recv_attached]: 1.00001e-06 [receive_attached]: 1.04998e-06 [after_resolve]: 1.193e-05 [a_after_grad]: 1.154e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.47001e-06 [auto_monad_grad]: 8.99978e-07 [auto_monad_eliminator]: 8.82e-06 [cse]: 1.812e-05 [a_3]: 5.985e-05 [py_interpret_to_execute_after_opt_a]: 1.445e-05 [slice_cell_reuse_recomputed_activation]: 4.58001e-06 [rewriter_after_opt_a]: 4.408e-05 [convert_after_rewriter]: 9.96998e-06 [order_py_execute_after_rewriter]: 8.60001e-06 [mutable_eliminate]: 0.00050347 [opt_b]: 0.00031245, [1] [Cycle 1]: 0.00030371, [7] [b_1]: 0.00020313 [b_2]: 9.19e-06 [updatestate_depend_eliminate]: 6.71999e-06 [updatestate_assign_eliminate]: 3.09999e-06 [updatestate_loads_eliminate]: 3.08998e-06 [renormalize]: 4.50003e-07 [cse]: 2.169e-05 [optimize_parallel_all_gather_comm]: 2.115e-05 [overlap_param_gather]: 5.20999e-06 [cconv]: 2.669e-05 [loop_unroll]: 0.00043706 [opt_after_cconv]: 0.00014109, [1] [Cycle 1]: 0.00013199, [7] [c_1]: 3.918e-05 [parameter_eliminate]: 2.61e-06 [updatestate_depend_eliminate]: 6.31998e-06 [updatestate_assign_eliminate]: 3.17002e-06 [updatestate_loads_eliminate]: 2.78e-06 [cse]: 2.143e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 1.728e-05 [tuple_transform]: 0.00010231, [1] [Cycle 1]: 9.563e-05, [4] [d_1]: 5.396e-05 [none_parameter_eliminate]: 1.82999e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 8.33999e-06 [partial_unused_args_eliminate]: 4.30999e-06 [add_recomputation]: 5.756e-05 [cse_after_recomputation]: 3.125e-05, [1] [Cycle 1]: 2.441e-05, [1] [cse]: 1.518e-05 [environ_conv]: 9.22999e-06 [swap_dp_allreduce_reducescatter]: 8.74e-06 [bias_add_comm_swap]: 4.99e-06 [label_micro_interleaved_index]: 6.85002e-06 [label_fine_grained_interleaved_index]: 6.07999e-06 [merge_cast_opt]: 3.43e-06 [slice_recompute_activation]: 4.18999e-06 [micro_interleaved_order_control]: 5.30001e-06 [assign_add_opt]: 3.48999e-06 [ForceFp32Comm]: 3.13e-06 [remove_cast_before_assign_add]: 3.6e-06 [full_micro_interleaved_order_control]: 4.42e-06 [reorder_send_recv_between_fp_bp]: 5.36002e-06 [comm_op_add_attrs]: 3.24001e-06 [add_comm_op_reuse_tag]: 3.28e-06 [interleave_split_concat_branches]: 3.71999e-06 [interleave_parallel_branches]: 1.645e-05 [overlap_opt_shard_in_pipeline]: 4.28001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.25e-06 [control_data_broadcast_order]: 1.92e-05 [grouped_pairwise_exchange_alltoall]: 4.20999e-06 [offloading_packed_experts]: 7.38e-06 [overlap_recompute_and_grad_model_parallel]: 7.84002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.68e-06 [overlap_recompute_allgather_and_fa_grad]: 3.60998e-06 [overlap_recompute_comm]: 4.89e-06 [overlap_grad_ring_attention]: 7.28999e-06 [overlap_grad_flash_sp]: 2.423e-05 [begin_end_overlap_inline]: 2.93e-06 [split_matmul_comm_elemetwise]: 4.53999e-06 [split_layernorm_comm]: 4.2e-06 [handle_group_info]: 3.78001e-06 [symbol_engine_optimizer]: 0.00010761, [1] [Cycle 1]: 0.00010048, [6] [build]: 2.88998e-06 [elim_shapecalc]: 1.228e-05 [elim_not_effective]: 1.686e-05 [opt_reshape]: 8.84e-06 [fold_const_symbol]: 1.291e-05 [renormalize]: 2.50002e-07 [detach_backward]: 1.089e-05 [pipeline_parallel_scheduler]: 2.34001e-06 [auto_monad_reorder]: 4e-05 [get_jit_bprop_graph]: 2.46e-06 [rewriter_after_jit_bprop_graph]: 1.002e-05 [opt_after_jit_grad]: 0.00077932 [validate]: 5.728e-05 Sums bootstrap : 0.000480s : 1.48% type_inference : 0.025551s : 78.86% event_method : 0.000094s : 0.29% auto_monad : 0.000105s : 0.32% graph_reusing : 0.000008s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.09% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.03% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000043s : 0.13% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000038s : 0.12% optimize.rewriter_before_opt_a : 0.000119s : 0.37% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000135s : 0.42% optimize.opt_a.loop_unroll : 0.000050s : 0.16% optimize.opt_a.a_1 : 0.001076s : 3.32% optimize.opt_a.with_stream_mark : 0.000030s : 0.09% optimize.opt_a.recompute_prepare : 0.000020s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000255s : 0.79% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.05% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.05% optimize.opt_a.merge_send_recv : 0.000017s : 0.05% optimize.opt_a.auto_parallel : 0.000015s : 0.04% optimize.opt_a.parallel : 0.000022s : 0.07% optimize.opt_a.flash_sp : 0.000012s : 0.04% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.06% optimize.opt_a.virtual_dataset : 0.000016s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.05% optimize.opt_a.virtual_output : 0.000015s : 0.05% optimize.opt_a.merge_forward : 0.000008s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000026s : 0.08% optimize.opt_a.a_after_grad : 0.000025s : 0.08% optimize.opt_a.renormalize : 0.000995s : 3.07% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.08% optimize.opt_a.cse : 0.000054s : 0.17% optimize.opt_a.a_3 : 0.000134s : 0.41% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000044s : 0.14% optimize.convert_after_rewriter : 0.000010s : 0.03% optimize.order_py_execute_after_rewriter : 0.000009s : 0.03% optimize.mutable_eliminate : 0.000503s : 1.55% optimize.opt_b.b_1 : 0.000203s : 0.63% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.07% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000027s : 0.08% optimize.loop_unroll : 0.000437s : 1.35% optimize.opt_after_cconv.c_1 : 0.000039s : 0.12% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000021s : 0.07% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.05% optimize.tuple_transform.d_1 : 0.000054s : 0.17% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000004s : 0.01% optimize.add_recomputation : 0.000058s : 0.18% optimize.cse_after_recomputation.cse : 0.000015s : 0.05% optimize.environ_conv : 0.000009s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000005s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000003s : 0.01% optimize.slice_recompute_activation : 0.000004s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000003s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000004s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000016s : 0.05% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000019s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000024s : 0.07% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000011s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000040s : 0.12% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000010s : 0.03% opt_after_jit_grad : 0.000779s : 2.41% validate : 0.000057s : 0.18% Time group info: ------[substitution.] 0.000246 44 9.58% : 0.000024s : 3: substitution.cast_eliminate 1.08% : 0.000003s : 3: substitution.elim_not_effective 0.71% : 0.000002s : 3: substitution.fold_const_symbol 2.57% : 0.000006s : 5: substitution.graph_param_transform 70.40% : 0.000173s : 8: substitution.inline 2.17% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.57% : 0.000006s : 6: substitution.remove_not_recompute_node 1.83% : 0.000005s : 4: substitution.replace_old_param 3.99% : 0.000010s : 2: substitution.switch_simplify 5.10% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.025476 2 93.73% : 0.023880s : 1: type_inference.infer 6.27% : 0.001596s : 1: type_inference.specialize ------[replace.] 0.000123 14 43.08% : 0.000053s : 8: replace.inline 37.93% : 0.000047s : 2: replace.switch_simplify 18.99% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000188 14 89.91% : 0.000169s : 8: match.inline 4.55% : 0.000009s : 2: match.switch_simplify 5.54% : 0.000010s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000278 1746 0.93% : 0.000003s : 19: predicate.accumulaten_eliminater 0.75% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 10: predicate.addn_check_dump 0.96% : 0.000003s : 19: predicate.addn_zero_filter 0.89% : 0.000002s : 19: predicate.adjust_all_reduce_mul_add 2.01% : 0.000006s : 29: predicate.arithmetic_simplify 1.07% : 0.000003s : 19: predicate.cast_eliminate 0.54% : 0.000002s : 10: predicate.check_bprop_eliminate 0.50% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.58% : 0.000002s : 10: predicate.depend_value_elim 1.01% : 0.000003s : 19: predicate.dict_get_item_const_eliminator 1.15% : 0.000003s : 19: predicate.dict_get_item_eliminator 0.94% : 0.000003s : 19: predicate.dict_set_item_eliminator 0.95% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.39% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 24: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 24: predicate.environ_get_add_eliminate 1.16% : 0.000003s : 24: predicate.environ_get_depend_swap 1.80% : 0.000005s : 34: predicate.environ_get_eliminate 1.15% : 0.000003s : 24: predicate.environ_get_set_eliminate 1.65% : 0.000005s : 31: predicate.exchange_switch_depend_value 2.50% : 0.000007s : 31: predicate.float_depend_g_call 0.48% : 0.000001s : 10: predicate.float_environ_get_switch 0.75% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.62% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.57% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.32% : 0.000018s : 80: predicate.inline 0.70% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.73% : 0.000002s : 10: predicate.less_batch_normalization 1.74% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.58% : 0.000007s : 52: predicate.load_eliminater 0.73% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.60% : 0.000007s : 49: predicate.loop_unroll_before_grad 1.57% : 0.000004s : 29: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 10: predicate.merge_addn 0.54% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 19: predicate.minmaximum_grad 0.79% : 0.000002s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.30% : 0.000001s : 5: predicate.parallel_virtual_node 1.97% : 0.000005s : 31: predicate.partial_defer_inline 1.62% : 0.000005s : 28: predicate.partial_eliminate 1.03% : 0.000003s : 19: predicate.print_const_string_wrapper 0.67% : 0.000002s : 10: predicate.reduce_all_const_elim 1.24% : 0.000003s : 19: predicate.reduce_eliminate 2.69% : 0.000007s : 52: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 10: predicate.remove_not_recompute_node 1.38% : 0.000004s : 33: predicate.replace_applicator 0.37% : 0.000001s : 10: predicate.replace_old_param 0.24% : 0.000001s : 5: predicate.reset_defer_inline 0.99% : 0.000003s : 19: predicate.reshape_eliminate 0.56% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.30% : 0.000001s : 5: predicate.row_tensor_eliminate 0.62% : 0.000002s : 10: predicate.same_eliminate 0.41% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.68% : 0.000002s : 10: predicate.shard_identity_eliminate 0.67% : 0.000002s : 10: predicate.special_op_eliminate 0.84% : 0.000002s : 10: predicate.specialize_transform 0.70% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.86% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.70% : 0.000005s : 31: predicate.switch_defer_inline 2.21% : 0.000006s : 41: predicate.switch_layer_defer_inline 5.68% : 0.000016s : 99: predicate.switch_simplify 1.07% : 0.000003s : 19: predicate.tile_eliminate 0.97% : 0.000003s : 19: predicate.transpose_eliminate 1.50% : 0.000004s : 29: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000009s : 43: predicate.tuple_list_get_item_eliminator 1.59% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000006s : 39: predicate.tuple_list_set_item_eliminator 1.69% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.51% : 0.000007s : 52: predicate.updatestate_pure_node_eliminater 3.28% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 5: predicate.value_based_eliminate 0.55% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.54% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001020 16 52.39% : 0.000535s : 6: func_graph_cloner_run.FuncGraphClonerGraph 47.61% : 0.000486s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.062240 192 0.01% : 0.000006s : 1: ForceFp32Comm 5.55% : 0.003455s : 1: add_attr 5.53% : 0.003442s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.10% : 0.000061s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.19% : 0.000117s : 1: auto_monad 0.08% : 0.000048s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.85% : 0.000527s : 1: bootstrap 0.05% : 0.000030s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.04% : 0.000023s : 1: control_data_broadcast_order 0.02% : 0.000013s : 1: convert_after_rewriter 0.06% : 0.000034s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.09% : 0.000059s : 1: detach_backward 0.02% : 0.000012s : 1: environ_conv 0.18% : 0.000109s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000015s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000008s : 1: insert-virtual-dataset 0.03% : 0.000019s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 0.71% : 0.000443s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.82% : 0.000510s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.03% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000017s : 1: opt.transform.mutable_eliminate 2.74% : 0.001706s : 78: opt.transform.opt_a 0.06% : 0.000038s : 1: opt.transform.opt_after_cconv 0.06% : 0.000039s : 1: opt.transform.opt_after_jit_grad 0.22% : 0.000139s : 28: opt.transform.opt_b 0.10% : 0.000060s : 2: opt.transform.opt_trans_graph 0.08% : 0.000047s : 4: opt.transform.symbol_engine_opt 6.33% : 0.003942s : 1: opt_a 0.23% : 0.000145s : 1: opt_after_cconv 1.27% : 0.000791s : 1: opt_after_jit_grad 0.51% : 0.000316s : 1: opt_b 29.86% : 0.018587s : 1: optimize 0.04% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000011s : 1: order_py_execute_after_rewriter 0.04% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000009s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.02% : 0.000011s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.08% : 0.000050s : 1: pre_auto_parallel 0.07% : 0.000042s : 1: py_interpret_to_execute 0.03% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000021s : 1: remove_dup_value 0.90% : 0.000558s : 1: renormalize.infer 0.69% : 0.000428s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000016s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000048s : 1: rewriter_after_opt_a 0.20% : 0.000123s : 1: rewriter_before_opt_a 0.01% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.18% : 0.000111s : 1: symbol_engine_optimizer 0.17% : 0.000105s : 1: tuple_transform 41.13% : 0.025599s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:09.350.321 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0518198, [21] [bootstrap]: 0.00045329 [type_inference]: 0.0417352 [event_method]: 8.088e-05 [auto_monad]: 9.697e-05 [graph_reusing]: 6.76999e-06 [inline]: 1.92001e-06 [add_attr]: 0.00306676, [1] [add_attr_with_inline]: 0.00305743, [1] [Cycle 1]: 6.819e-05, [2] [tag_attr]: 2.741e-05 [meta_addattr_fg_expand]: 8.38001e-06 [parallel-infer-symbol]: 3.15002e-06 [pre_auto_parallel]: 4.52e-05 [insert-virtual-dataset]: 2.44001e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.01e-06 [pipeline_split]: 2.06e-06 [optimize]: 0.00561122, [53] [py_interpret_to_execute]: 3.321e-05 [rewriter_before_opt_a]: 0.00011483 [opt_a]: 0.00347954, [2] [Cycle 1]: 0.00270208, [45] [expand_dump_flag]: 4.42e-06 [switch_simplify]: 0.000127 [loop_unroll]: 4.205e-05 [a_1]: 0.0008805 [with_stream_mark]: 1.649e-05 [recompute_prepare]: 1.043e-05 [updatestate_depend_eliminate]: 4.43999e-06 [updatestate_assign_eliminate]: 3.95e-06 [updatestate_loads_eliminate]: 3.39001e-06 [parameter_eliminate]: 2.29001e-06 [a_2]: 0.00010571 [accelerated_algorithm]: 9.02999e-06 [shard]: 1.81e-06 [meta_shard_fg_expand]: 2.31e-06 [shard_inline]: 8.05e-06 [merge_send_recv]: 9.72999e-06 [auto_parallel]: 6.86001e-06 [parallel]: 1.823e-05 [flash_sp]: 8.74998e-06 [merge_comm]: 4.82e-06 [allreduce_fusion]: 4.48001e-06 [matmul_add_comm_reduction]: 1.039e-05 [allreduce_slice_to_reducescatter]: 9.30013e-07 [virtual_shard_identity]: 9.54e-06 [virtual_dataset]: 8.26002e-06 [get_grad_eliminate_]: 7.82e-06 [virtual_output]: 7.75998e-06 [merge_forward]: 4.57e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 1.017e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.699e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.316e-05 [set_forward_comm_id_for_comm_node_pass]: 4.50001e-06 [meta_fg_expand]: 3.58e-06 [flash_sp_send_recv_attached]: 2.66e-06 [receive_attached]: 2.12001e-06 [after_resolve]: 1.321e-05 [a_after_grad]: 1.226e-05 [renormalize]: 0.00091649 [add_forward_monad_depend]: 6.36998e-06 [auto_monad_grad]: 2.19001e-06 [auto_monad_eliminator]: 1.779e-05 [cse]: 3.668e-05 [a_3]: 6.107e-05 [Cycle 2]: 0.00076712, [45] [expand_dump_flag]: 1.42e-06 [switch_simplify]: 9.36e-06 [loop_unroll]: 7.63001e-06 [a_1]: 0.00018173 [with_stream_mark]: 1.306e-05 [recompute_prepare]: 7.93999e-06 [updatestate_depend_eliminate]: 4.14002e-06 [updatestate_assign_eliminate]: 3.33e-06 [updatestate_loads_eliminate]: 3.06999e-06 [parameter_eliminate]: 1.54e-06 [a_2]: 9.616e-05 [accelerated_algorithm]: 7.84002e-06 [shard]: 1.36998e-06 [meta_shard_fg_expand]: 1.60001e-06 [shard_inline]: 7.78001e-06 [merge_send_recv]: 6.12999e-06 [auto_parallel]: 6.41e-06 [parallel]: 5.28002e-06 [flash_sp]: 3.14001e-06 [merge_comm]: 4.17998e-06 [allreduce_fusion]: 3.69002e-06 [matmul_add_comm_reduction]: 7.78999e-06 [allreduce_slice_to_reducescatter]: 4.10015e-07 [virtual_shard_identity]: 8.57e-06 [virtual_dataset]: 7.43999e-06 [get_grad_eliminate_]: 7.50998e-06 [virtual_output]: 7.00998e-06 [merge_forward]: 3.35998e-06 [cell_reuse_recompute_pass]: 1.79e-06 [offload_activation]: 8e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.451e-05 [merge_recompute_call_nodes]: 1.07e-06 [before_grad]: 1.224e-05 [set_forward_comm_id_for_comm_node_pass]: 4.33999e-06 [meta_fg_expand]: 2.79999e-06 [flash_sp_send_recv_attached]: 1.04e-06 [receive_attached]: 1.31002e-06 [after_resolve]: 1.174e-05 [a_after_grad]: 1.132e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.42e-06 [auto_monad_grad]: 1.05001e-06 [auto_monad_eliminator]: 8.51997e-06 [cse]: 1.738e-05 [a_3]: 4.715e-05 [py_interpret_to_execute_after_opt_a]: 1.153e-05 [slice_cell_reuse_recomputed_activation]: 2.19999e-06 [rewriter_after_opt_a]: 4.002e-05 [convert_after_rewriter]: 7.10998e-06 [order_py_execute_after_rewriter]: 5.89e-06 [mutable_eliminate]: 0.00048816 [opt_b]: 0.00024277, [1] [Cycle 1]: 0.00023675, [7] [b_1]: 0.00015709 [b_2]: 9.46e-06 [updatestate_depend_eliminate]: 6.16e-06 [updatestate_assign_eliminate]: 2.98998e-06 [updatestate_loads_eliminate]: 3.00998e-06 [renormalize]: 3.60014e-07 [cse]: 2.205e-05 [optimize_parallel_all_gather_comm]: 1.737e-05 [overlap_param_gather]: 1.91e-06 [cconv]: 2.68e-05 [loop_unroll]: 0.0004207 [opt_after_cconv]: 0.00011457, [1] [Cycle 1]: 0.00010919, [7] [c_1]: 3.925e-05 [parameter_eliminate]: 2.39999e-06 [updatestate_depend_eliminate]: 5.35001e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 3.36001e-06 [cse]: 2.122e-05 [renormalize]: 2.89991e-07 [remove_dup_value]: 1.519e-05 [tuple_transform]: 8.639e-05, [1] [Cycle 1]: 8.128e-05, [4] [d_1]: 5.314e-05 [none_parameter_eliminate]: 1.71998e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 8.2e-06 [partial_unused_args_eliminate]: 1.80001e-06 [add_recomputation]: 5.698e-05 [cse_after_recomputation]: 2.636e-05, [1] [Cycle 1]: 2.15e-05, [1] [cse]: 1.556e-05 [environ_conv]: 5.96e-06 [swap_dp_allreduce_reducescatter]: 5.53002e-06 [bias_add_comm_swap]: 2.93998e-06 [label_micro_interleaved_index]: 4.15999e-06 [label_fine_grained_interleaved_index]: 2.90002e-06 [merge_cast_opt]: 1.33002e-06 [slice_recompute_activation]: 1.91998e-06 [micro_interleaved_order_control]: 2.31e-06 [assign_add_opt]: 1.22e-06 [ForceFp32Comm]: 8.09989e-07 [remove_cast_before_assign_add]: 1.19e-06 [full_micro_interleaved_order_control]: 2.07001e-06 [reorder_send_recv_between_fp_bp]: 3.04001e-06 [comm_op_add_attrs]: 1.13001e-06 [add_comm_op_reuse_tag]: 1.15001e-06 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.12999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.80001e-06 [control_data_broadcast_order]: 1.673e-05 [grouped_pairwise_exchange_alltoall]: 2.16e-06 [offloading_packed_experts]: 4.34002e-06 [overlap_recompute_and_grad_model_parallel]: 5.39998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.11997e-06 [overlap_recompute_allgather_and_fa_grad]: 1.31002e-06 [overlap_recompute_comm]: 2.17001e-06 [overlap_grad_ring_attention]: 4.48001e-06 [overlap_grad_flash_sp]: 2.031e-05 [begin_end_overlap_inline]: 5.29981e-07 [split_matmul_comm_elemetwise]: 2.46e-06 [split_layernorm_comm]: 1.81998e-06 [handle_group_info]: 8.70001e-07 [symbol_engine_optimizer]: 8.461e-05, [1] [Cycle 1]: 8.031e-05, [6] [build]: 2.98e-06 [elim_shapecalc]: 1.137e-05 [elim_not_effective]: 1.601e-05 [opt_reshape]: 8.43999e-06 [fold_const_symbol]: 1.257e-05 [renormalize]: 2.00002e-07 [detach_backward]: 1.92999e-06 [pipeline_parallel_scheduler]: 1.57999e-06 [auto_monad_reorder]: 1.994e-05 [get_jit_bprop_graph]: 1.41002e-06 [rewriter_after_jit_bprop_graph]: 3.64002e-06 [opt_after_jit_grad]: 0.00048116 [validate]: 4.443e-05 Sums bootstrap : 0.000453s : 0.95% type_inference : 0.041735s : 87.33% event_method : 0.000081s : 0.17% auto_monad : 0.000097s : 0.20% graph_reusing : 0.000007s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000045s : 0.09% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.07% optimize.rewriter_before_opt_a : 0.000115s : 0.24% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000136s : 0.29% optimize.opt_a.loop_unroll : 0.000050s : 0.10% optimize.opt_a.a_1 : 0.001062s : 2.22% optimize.opt_a.with_stream_mark : 0.000030s : 0.06% optimize.opt_a.recompute_prepare : 0.000018s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000202s : 0.42% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.04% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.03% optimize.opt_a.merge_send_recv : 0.000016s : 0.03% optimize.opt_a.auto_parallel : 0.000013s : 0.03% optimize.opt_a.parallel : 0.000024s : 0.05% optimize.opt_a.flash_sp : 0.000012s : 0.02% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000008s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.04% optimize.opt_a.virtual_dataset : 0.000016s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.03% optimize.opt_a.virtual_output : 0.000015s : 0.03% optimize.opt_a.merge_forward : 0.000008s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000018s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.05% optimize.opt_a.a_after_grad : 0.000024s : 0.05% optimize.opt_a.renormalize : 0.000917s : 1.92% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.06% optimize.opt_a.cse : 0.000054s : 0.11% optimize.opt_a.a_3 : 0.000108s : 0.23% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000040s : 0.08% optimize.convert_after_rewriter : 0.000007s : 0.01% optimize.order_py_execute_after_rewriter : 0.000006s : 0.01% optimize.mutable_eliminate : 0.000488s : 1.02% optimize.opt_b.b_1 : 0.000157s : 0.33% optimize.opt_b.b_2 : 0.000009s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000027s : 0.06% optimize.loop_unroll : 0.000421s : 0.88% optimize.opt_after_cconv.c_1 : 0.000039s : 0.08% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000021s : 0.04% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.03% optimize.tuple_transform.d_1 : 0.000053s : 0.11% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000057s : 0.12% optimize.cse_after_recomputation.cse : 0.000016s : 0.03% optimize.environ_conv : 0.000006s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000020s : 0.04% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.04% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000481s : 1.01% validate : 0.000044s : 0.09% Time group info: ------[substitution.] 0.000244 44 10.23% : 0.000025s : 3: substitution.cast_eliminate 0.91% : 0.000002s : 3: substitution.elim_not_effective 0.72% : 0.000002s : 3: substitution.fold_const_symbol 2.92% : 0.000007s : 5: substitution.graph_param_transform 69.91% : 0.000171s : 8: substitution.inline 1.79% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.37% : 0.000006s : 6: substitution.remove_not_recompute_node 1.92% : 0.000005s : 4: substitution.replace_old_param 3.83% : 0.000009s : 2: substitution.switch_simplify 5.38% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.041668 2 96.88% : 0.040367s : 1: type_inference.infer 3.12% : 0.001300s : 1: type_inference.specialize ------[replace.] 0.000119 14 42.45% : 0.000051s : 8: replace.inline 38.73% : 0.000046s : 2: replace.switch_simplify 18.82% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000185 14 89.67% : 0.000166s : 8: match.inline 4.33% : 0.000008s : 2: match.switch_simplify 6.00% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000273 1746 1.09% : 0.000003s : 19: predicate.accumulaten_eliminater 0.70% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 10: predicate.addn_check_dump 0.97% : 0.000003s : 19: predicate.addn_zero_filter 0.92% : 0.000003s : 19: predicate.adjust_all_reduce_mul_add 2.19% : 0.000006s : 29: predicate.arithmetic_simplify 1.17% : 0.000003s : 19: predicate.cast_eliminate 0.54% : 0.000001s : 10: predicate.check_bprop_eliminate 0.50% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000001s : 5: predicate.const_output_eliminate 0.51% : 0.000001s : 10: predicate.depend_value_elim 1.03% : 0.000003s : 19: predicate.dict_get_item_const_eliminator 1.12% : 0.000003s : 19: predicate.dict_get_item_eliminator 0.94% : 0.000003s : 19: predicate.dict_set_item_eliminator 0.79% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 5: predicate.elim_not_effective 0.32% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 24: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 24: predicate.environ_get_add_eliminate 1.18% : 0.000003s : 24: predicate.environ_get_depend_swap 1.71% : 0.000005s : 34: predicate.environ_get_eliminate 1.14% : 0.000003s : 24: predicate.environ_get_set_eliminate 1.61% : 0.000004s : 31: predicate.exchange_switch_depend_value 2.46% : 0.000007s : 31: predicate.float_depend_g_call 0.48% : 0.000001s : 10: predicate.float_environ_get_switch 0.74% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.69% : 0.000002s : 10: predicate.get_grad_eliminate 0.18% : 0.000001s : 5: predicate.graph_param_transform 0.60% : 0.000002s : 10: predicate.incorporate_call 0.49% : 0.000001s : 10: predicate.incorporate_call_switch 6.53% : 0.000018s : 80: predicate.inline 0.73% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.85% : 0.000002s : 10: predicate.less_batch_normalization 1.76% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.57% : 0.000007s : 52: predicate.load_eliminater 0.83% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.56% : 0.000007s : 49: predicate.loop_unroll_before_grad 1.62% : 0.000004s : 29: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 10: predicate.merge_addn 0.49% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.94% : 0.000003s : 19: predicate.minmaximum_grad 0.94% : 0.000003s : 5: predicate.mutable_eliminate 0.29% : 0.000001s : 5: predicate.opt_reshape 0.31% : 0.000001s : 5: predicate.parallel_virtual_node 1.92% : 0.000005s : 31: predicate.partial_defer_inline 1.65% : 0.000005s : 28: predicate.partial_eliminate 0.94% : 0.000003s : 19: predicate.print_const_string_wrapper 0.49% : 0.000001s : 10: predicate.reduce_all_const_elim 1.34% : 0.000004s : 19: predicate.reduce_eliminate 2.55% : 0.000007s : 52: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 10: predicate.remove_not_recompute_node 1.42% : 0.000004s : 33: predicate.replace_applicator 0.45% : 0.000001s : 10: predicate.replace_old_param 0.30% : 0.000001s : 5: predicate.reset_defer_inline 1.09% : 0.000003s : 19: predicate.reshape_eliminate 0.55% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.31% : 0.000001s : 5: predicate.row_tensor_eliminate 0.66% : 0.000002s : 10: predicate.same_eliminate 0.40% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.64% : 0.000002s : 10: predicate.shard_identity_eliminate 0.69% : 0.000002s : 10: predicate.special_op_eliminate 0.78% : 0.000002s : 10: predicate.specialize_transform 0.74% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.65% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.75% : 0.000005s : 31: predicate.switch_defer_inline 2.19% : 0.000006s : 41: predicate.switch_layer_defer_inline 5.79% : 0.000016s : 99: predicate.switch_simplify 1.00% : 0.000003s : 19: predicate.tile_eliminate 1.01% : 0.000003s : 19: predicate.transpose_eliminate 1.53% : 0.000004s : 29: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000004s : 29: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.00% : 0.000008s : 43: predicate.tuple_list_get_item_eliminator 1.48% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000006s : 39: predicate.tuple_list_set_item_eliminator 1.70% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.51% : 0.000007s : 52: predicate.updatestate_pure_node_eliminater 3.06% : 0.000008s : 62: predicate.updatestate_useless_node_eliminater 0.30% : 0.000001s : 5: predicate.value_based_eliminate 0.62% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.55% : 0.000002s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.34% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001038 16 61.29% : 0.000636s : 6: func_graph_cloner_run.FuncGraphClonerGraph 38.71% : 0.000402s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.063301 192 0.01% : 0.000004s : 1: ForceFp32Comm 4.85% : 0.003072s : 1: add_attr 4.84% : 0.003061s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.10% : 0.000061s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.16% : 0.000104s : 1: auto_monad 0.04% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.76% : 0.000480s : 1: bootstrap 0.05% : 0.000030s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000020s : 1: control_data_broadcast_order 0.02% : 0.000010s : 1: convert_after_rewriter 0.05% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.14% : 0.000090s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000003s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.68% : 0.000429s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.78% : 0.000496s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000017s : 1: opt.transform.mutable_eliminate 2.66% : 0.001684s : 78: opt.transform.opt_a 0.06% : 0.000038s : 1: opt.transform.opt_after_cconv 0.05% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.21% : 0.000136s : 28: opt.transform.opt_b 0.09% : 0.000059s : 2: opt.transform.opt_trans_graph 0.07% : 0.000045s : 4: opt.transform.symbol_engine_opt 5.50% : 0.003483s : 1: opt_a 0.19% : 0.000118s : 1: opt_after_cconv 0.78% : 0.000491s : 1: opt_after_jit_grad 0.39% : 0.000246s : 1: opt_b 8.87% : 0.005616s : 1: optimize 0.03% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000024s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000017s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000050s : 1: pre_auto_parallel 0.06% : 0.000037s : 1: py_interpret_to_execute 0.02% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000019s : 1: remove_dup_value 0.79% : 0.000497s : 1: renormalize.infer 0.65% : 0.000411s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000044s : 1: rewriter_after_opt_a 0.19% : 0.000119s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.14% : 0.000087s : 1: symbol_engine_optimizer 0.14% : 0.000089s : 1: tuple_transform 65.96% : 0.041752s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:09.779.522 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:09.779.815 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0448582, [21] [bootstrap]: 0.00045494 [type_inference]: 0.0310473 [event_method]: 0.00013947 [auto_monad]: 0.00011315 [graph_reusing]: 8.13999e-06 [inline]: 2.68e-06 [add_attr]: 0.00408445, [1] [add_attr_with_inline]: 0.00407093, [1] [Cycle 1]: 0.00011711, [2] [tag_attr]: 3.86e-05 [meta_addattr_fg_expand]: 9.10999e-06 [parallel-infer-symbol]: 3.98999e-06 [pre_auto_parallel]: 5.602e-05 [insert-virtual-dataset]: 2.39999e-06 [parallel-infer-symbol-second]: 1.11002e-06 [dataset_repeat_opt]: 1.91e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.00763537, [53] [py_interpret_to_execute]: 5.273e-05 [rewriter_before_opt_a]: 0.00014969 [opt_a]: 0.00475569, [2] [Cycle 1]: 0.00366402, [45] [expand_dump_flag]: 5.24e-06 [switch_simplify]: 0.00014471 [loop_unroll]: 4.431e-05 [a_1]: 0.0010159 [with_stream_mark]: 2.266e-05 [recompute_prepare]: 1.528e-05 [updatestate_depend_eliminate]: 5.99999e-06 [updatestate_assign_eliminate]: 4.88001e-06 [updatestate_loads_eliminate]: 4.92e-06 [parameter_eliminate]: 2.34999e-06 [a_2]: 0.00015394 [accelerated_algorithm]: 1.04e-05 [shard]: 2.69999e-06 [meta_shard_fg_expand]: 2.97002e-06 [shard_inline]: 9.17999e-06 [merge_send_recv]: 1.197e-05 [auto_parallel]: 1.044e-05 [parallel]: 2.046e-05 [flash_sp]: 1.083e-05 [merge_comm]: 6.23998e-06 [allreduce_fusion]: 8.59e-06 [matmul_add_comm_reduction]: 1.26e-05 [allreduce_slice_to_reducescatter]: 8.49977e-07 [virtual_shard_identity]: 1.61e-05 [virtual_dataset]: 1.09e-05 [get_grad_eliminate_]: 9.54e-06 [virtual_output]: 1.01e-05 [merge_forward]: 5.35999e-06 [cell_reuse_recompute_pass]: 1.89999e-06 [offload_activation]: 1.376e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.63e-05 [merge_recompute_call_nodes]: 1.55999e-06 [before_grad]: 1.682e-05 [set_forward_comm_id_for_comm_node_pass]: 5.37001e-06 [meta_fg_expand]: 5.08002e-06 [flash_sp_send_recv_attached]: 3.68999e-06 [receive_attached]: 2.49001e-06 [after_resolve]: 1.617e-05 [a_after_grad]: 1.572e-05 [renormalize]: 0.00125759 [add_forward_monad_depend]: 7.66999e-06 [auto_monad_grad]: 2.63e-06 [auto_monad_eliminator]: 2.259e-05 [cse]: 4.429e-05 [a_3]: 9.131e-05 [Cycle 2]: 0.00107482, [45] [expand_dump_flag]: 1.61002e-06 [switch_simplify]: 1.164e-05 [loop_unroll]: 9.57999e-06 [a_1]: 0.00023339 [with_stream_mark]: 1.605e-05 [recompute_prepare]: 1.008e-05 [updatestate_depend_eliminate]: 5.21002e-06 [updatestate_assign_eliminate]: 4.08999e-06 [updatestate_loads_eliminate]: 4.1e-06 [parameter_eliminate]: 1.45999e-06 [a_2]: 0.00014246 [accelerated_algorithm]: 9.30001e-06 [shard]: 1.72999e-06 [meta_shard_fg_expand]: 2.83003e-06 [shard_inline]: 9.07001e-06 [merge_send_recv]: 7.56999e-06 [auto_parallel]: 8.12003e-06 [parallel]: 6.16e-06 [flash_sp]: 3.91999e-06 [merge_comm]: 4.95999e-06 [allreduce_fusion]: 5.27001e-06 [matmul_add_comm_reduction]: 9.07999e-06 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 1.038e-05 [virtual_dataset]: 8.70001e-06 [get_grad_eliminate_]: 8.69998e-06 [virtual_output]: 8.58001e-06 [merge_forward]: 4.92e-06 [cell_reuse_recompute_pass]: 1.99999e-06 [offload_activation]: 9.37999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.924e-05 [merge_recompute_call_nodes]: 1.19e-06 [before_grad]: 1.476e-05 [set_forward_comm_id_for_comm_node_pass]: 5.49e-06 [meta_fg_expand]: 4.2e-06 [flash_sp_send_recv_attached]: 9.60019e-07 [receive_attached]: 1.04e-06 [after_resolve]: 1.419e-05 [a_after_grad]: 1.357e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.67999e-06 [auto_monad_grad]: 1.04e-06 [auto_monad_eliminator]: 1.028e-05 [cse]: 2.366e-05 [a_3]: 6.839e-05 [py_interpret_to_execute_after_opt_a]: 1.919e-05 [slice_cell_reuse_recomputed_activation]: 4.46002e-06 [rewriter_after_opt_a]: 5.254e-05 [convert_after_rewriter]: 1.164e-05 [order_py_execute_after_rewriter]: 9.87999e-06 [mutable_eliminate]: 0.00068474 [opt_b]: 0.00038155, [1] [Cycle 1]: 0.00037129, [7] [b_1]: 0.00025581 [b_2]: 1.054e-05 [updatestate_depend_eliminate]: 8.53001e-06 [updatestate_assign_eliminate]: 3.98999e-06 [updatestate_loads_eliminate]: 3.87002e-06 [renormalize]: 6.69999e-07 [cse]: 3.024e-05 [optimize_parallel_all_gather_comm]: 2.317e-05 [overlap_param_gather]: 5.57999e-06 [cconv]: 3.177e-05 [loop_unroll]: 0.00047331 [opt_after_cconv]: 0.00015945, [1] [Cycle 1]: 0.00015042, [7] [c_1]: 4.591e-05 [parameter_eliminate]: 3.56001e-06 [updatestate_depend_eliminate]: 7.91001e-06 [updatestate_assign_eliminate]: 4.07e-06 [updatestate_loads_eliminate]: 3.61999e-06 [cse]: 2.766e-05 [renormalize]: 6.60017e-07 [remove_dup_value]: 4.491e-05 [tuple_transform]: 0.00011451, [1] [Cycle 1]: 0.00010706, [4] [d_1]: 6.335e-05 [none_parameter_eliminate]: 1.80001e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 9.56998e-06 [partial_unused_args_eliminate]: 4.61002e-06 [add_recomputation]: 6.624e-05 [cse_after_recomputation]: 3.365e-05, [1] [Cycle 1]: 2.679e-05, [1] [cse]: 1.646e-05 [environ_conv]: 1.027e-05 [swap_dp_allreduce_reducescatter]: 9.47999e-06 [bias_add_comm_swap]: 5.52001e-06 [label_micro_interleaved_index]: 7.51999e-06 [label_fine_grained_interleaved_index]: 5.18002e-06 [merge_cast_opt]: 3.70998e-06 [slice_recompute_activation]: 4.74e-06 [micro_interleaved_order_control]: 4.97e-06 [assign_add_opt]: 3.46999e-06 [ForceFp32Comm]: 4.23001e-06 [remove_cast_before_assign_add]: 3.38e-06 [full_micro_interleaved_order_control]: 4.37003e-06 [reorder_send_recv_between_fp_bp]: 5.17e-06 [comm_op_add_attrs]: 4.01001e-06 [add_comm_op_reuse_tag]: 3.29001e-06 [interleave_split_concat_branches]: 3.62998e-06 [interleave_parallel_branches]: 3.42002e-06 [overlap_opt_shard_in_pipeline]: 3.41001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.62e-06 [control_data_broadcast_order]: 2.003e-05 [grouped_pairwise_exchange_alltoall]: 4.20999e-06 [offloading_packed_experts]: 7.51999e-06 [overlap_recompute_and_grad_model_parallel]: 8.38001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.71001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.76999e-06 [overlap_recompute_comm]: 4.91002e-06 [overlap_grad_ring_attention]: 8.01001e-06 [overlap_grad_flash_sp]: 2.82e-05 [begin_end_overlap_inline]: 2.93998e-06 [split_matmul_comm_elemetwise]: 5.02e-06 [split_layernorm_comm]: 4.15e-06 [handle_group_info]: 3.51001e-06 [symbol_engine_optimizer]: 0.00012015, [1] [Cycle 1]: 0.0001122, [6] [build]: 3.33998e-06 [elim_shapecalc]: 1.449e-05 [elim_not_effective]: 1.978e-05 [opt_reshape]: 9.96e-06 [fold_const_symbol]: 1.553e-05 [renormalize]: 2.50002e-07 [detach_backward]: 4.42998e-06 [pipeline_parallel_scheduler]: 1.82001e-06 [auto_monad_reorder]: 2.687e-05 [get_jit_bprop_graph]: 1.99e-06 [rewriter_after_jit_bprop_graph]: 5.35999e-06 [opt_after_jit_grad]: 0.00053245 [validate]: 4.514e-05 Sums bootstrap : 0.000455s : 1.17% type_inference : 0.031047s : 80.12% event_method : 0.000139s : 0.36% auto_monad : 0.000113s : 0.29% graph_reusing : 0.000008s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000039s : 0.10% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000056s : 0.14% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000053s : 0.14% optimize.rewriter_before_opt_a : 0.000150s : 0.39% optimize.opt_a.expand_dump_flag : 0.000007s : 0.02% optimize.opt_a.switch_simplify : 0.000156s : 0.40% optimize.opt_a.loop_unroll : 0.000054s : 0.14% optimize.opt_a.a_1 : 0.001249s : 3.22% optimize.opt_a.with_stream_mark : 0.000039s : 0.10% optimize.opt_a.recompute_prepare : 0.000025s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000296s : 0.76% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.05% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.01% optimize.opt_a.shard_inline : 0.000018s : 0.05% optimize.opt_a.merge_send_recv : 0.000020s : 0.05% optimize.opt_a.auto_parallel : 0.000019s : 0.05% optimize.opt_a.parallel : 0.000027s : 0.07% optimize.opt_a.flash_sp : 0.000015s : 0.04% optimize.opt_a.merge_comm : 0.000011s : 0.03% optimize.opt_a.allreduce_fusion : 0.000014s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000026s : 0.07% optimize.opt_a.virtual_dataset : 0.000020s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.05% optimize.opt_a.virtual_output : 0.000019s : 0.05% optimize.opt_a.merge_forward : 0.000010s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000046s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000032s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.03% optimize.opt_a.meta_fg_expand : 0.000009s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000030s : 0.08% optimize.opt_a.a_after_grad : 0.000029s : 0.08% optimize.opt_a.renormalize : 0.001258s : 3.25% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.08% optimize.opt_a.cse : 0.000068s : 0.18% optimize.opt_a.a_3 : 0.000160s : 0.41% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.01% optimize.rewriter_after_opt_a : 0.000053s : 0.14% optimize.convert_after_rewriter : 0.000012s : 0.03% optimize.order_py_execute_after_rewriter : 0.000010s : 0.03% optimize.mutable_eliminate : 0.000685s : 1.77% optimize.opt_b.b_1 : 0.000256s : 0.66% optimize.opt_b.b_2 : 0.000011s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.06% optimize.overlap_param_gather : 0.000006s : 0.01% optimize.cconv : 0.000032s : 0.08% optimize.loop_unroll : 0.000473s : 1.22% optimize.opt_after_cconv.c_1 : 0.000046s : 0.12% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000028s : 0.07% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000045s : 0.12% optimize.tuple_transform.d_1 : 0.000063s : 0.16% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.02% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000066s : 0.17% optimize.cse_after_recomputation.cse : 0.000016s : 0.04% optimize.environ_conv : 0.000010s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000003s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000004s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000020s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000008s : 0.02% optimize.overlap_grad_flash_sp : 0.000028s : 0.07% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000027s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000532s : 1.37% validate : 0.000045s : 0.12% Time group info: ------[substitution.] 0.000318 54 11.93% : 0.000038s : 6: substitution.cast_eliminate 0.92% : 0.000003s : 4: substitution.elim_not_effective 0.72% : 0.000002s : 4: substitution.fold_const_symbol 2.23% : 0.000007s : 6: substitution.graph_param_transform 69.19% : 0.000220s : 8: substitution.inline 1.93% : 0.000006s : 8: substitution.j_node_and_user_rematch 2.49% : 0.000008s : 8: substitution.remove_not_recompute_node 2.01% : 0.000006s : 4: substitution.replace_old_param 3.86% : 0.000012s : 2: substitution.switch_simplify 4.71% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.030962 2 94.31% : 0.029199s : 1: type_inference.infer 5.69% : 0.001763s : 1: type_inference.specialize ------[replace.] 0.000143 14 41.58% : 0.000059s : 8: replace.inline 39.84% : 0.000057s : 2: replace.switch_simplify 18.58% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000238 14 90.03% : 0.000215s : 8: match.inline 4.57% : 0.000011s : 2: match.switch_simplify 5.40% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000324 1972 0.95% : 0.000003s : 21: predicate.accumulaten_eliminater 0.63% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.52% : 0.000002s : 12: predicate.addn_check_dump 0.94% : 0.000003s : 21: predicate.addn_zero_filter 0.88% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.27% : 0.000007s : 33: predicate.arithmetic_simplify 1.10% : 0.000004s : 21: predicate.cast_eliminate 0.67% : 0.000002s : 12: predicate.check_bprop_eliminate 0.53% : 0.000002s : 12: predicate.compare_switch_simplify 0.17% : 0.000001s : 6: predicate.const_output_eliminate 0.63% : 0.000002s : 12: predicate.depend_value_elim 1.11% : 0.000004s : 21: predicate.dict_get_item_const_eliminator 1.16% : 0.000004s : 21: predicate.dict_get_item_eliminator 0.94% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.80% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.17% : 0.000001s : 6: predicate.elim_not_effective 0.33% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.24% : 0.000004s : 27: predicate.environ_get_add_eliminate 1.19% : 0.000004s : 27: predicate.environ_get_depend_swap 1.86% : 0.000006s : 39: predicate.environ_get_eliminate 1.25% : 0.000004s : 27: predicate.environ_get_set_eliminate 1.43% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.23% : 0.000007s : 33: predicate.float_depend_g_call 0.55% : 0.000002s : 12: predicate.float_environ_get_switch 0.85% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 6: predicate.fold_const_symbol 0.60% : 0.000002s : 12: predicate.get_grad_eliminate 0.19% : 0.000001s : 6: predicate.graph_param_transform 0.58% : 0.000002s : 12: predicate.incorporate_call 0.50% : 0.000002s : 12: predicate.incorporate_call_switch 6.36% : 0.000021s : 90: predicate.inline 0.88% : 0.000003s : 12: predicate.inline_without_move 0.29% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.72% : 0.000002s : 12: predicate.less_batch_normalization 1.74% : 0.000006s : 37: predicate.list_to_tuple_eliminator_ 2.58% : 0.000008s : 58: predicate.load_eliminater 0.91% : 0.000003s : 6: predicate.loop_unroll_after_grad 2.33% : 0.000008s : 51: predicate.loop_unroll_before_grad 1.82% : 0.000006s : 33: predicate.make_slice_get_slice_eliminator 0.56% : 0.000002s : 12: predicate.merge_addn 0.57% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.56% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.97% : 0.000003s : 21: predicate.minmaximum_grad 0.88% : 0.000003s : 6: predicate.mutable_eliminate 0.30% : 0.000001s : 6: predicate.opt_reshape 0.31% : 0.000001s : 6: predicate.parallel_virtual_node 1.99% : 0.000006s : 33: predicate.partial_defer_inline 1.55% : 0.000005s : 31: predicate.partial_eliminate 1.00% : 0.000003s : 21: predicate.print_const_string_wrapper 0.65% : 0.000002s : 12: predicate.reduce_all_const_elim 1.42% : 0.000005s : 21: predicate.reduce_eliminate 2.43% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 12: predicate.remove_not_recompute_node 1.33% : 0.000004s : 37: predicate.replace_applicator 0.37% : 0.000001s : 12: predicate.replace_old_param 0.27% : 0.000001s : 6: predicate.reset_defer_inline 1.01% : 0.000003s : 21: predicate.reshape_eliminate 0.62% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 6: predicate.row_tensor_eliminate 0.77% : 0.000002s : 12: predicate.same_eliminate 0.38% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.81% : 0.000003s : 12: predicate.shard_identity_eliminate 0.73% : 0.000002s : 12: predicate.special_op_eliminate 0.89% : 0.000003s : 12: predicate.specialize_transform 0.72% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.73% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.57% : 0.000005s : 33: predicate.switch_defer_inline 2.20% : 0.000007s : 45: predicate.switch_layer_defer_inline 5.43% : 0.000018s : 106: predicate.switch_simplify 0.97% : 0.000003s : 21: predicate.tile_eliminate 1.00% : 0.000003s : 21: predicate.transpose_eliminate 1.56% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 2.94% : 0.000010s : 49: predicate.tuple_list_get_item_eliminator 1.54% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000008s : 45: predicate.tuple_list_set_item_eliminator 1.67% : 0.000005s : 37: predicate.tuple_to_list_eliminator_ 2.43% : 0.000008s : 58: predicate.updatestate_pure_node_eliminater 3.09% : 0.000010s : 70: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 6: predicate.value_based_eliminate 0.61% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.60% : 0.000002s : 12: predicate.virtual_output_eliminate 0.25% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001122 16 50.93% : 0.000571s : 6: func_graph_cloner_run.FuncGraphClonerGraph 49.07% : 0.000550s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.060107 192 0.01% : 0.000007s : 1: ForceFp32Comm 6.81% : 0.004095s : 1: add_attr 6.78% : 0.004075s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.12% : 0.000070s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.21% : 0.000128s : 1: auto_monad 0.06% : 0.000035s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.83% : 0.000498s : 1: bootstrap 0.06% : 0.000035s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.04% : 0.000023s : 1: control_data_broadcast_order 0.02% : 0.000015s : 1: convert_after_rewriter 0.06% : 0.000037s : 1: cse_after_recomputation 0.01% : 0.000007s : 1: dataset_repeat_opt 0.04% : 0.000025s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.26% : 0.000157s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000015s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 0.80% : 0.000480s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.15% : 0.000691s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.03% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000020s : 1: opt.transform.mutable_eliminate 3.34% : 0.002010s : 78: opt.transform.opt_a 0.07% : 0.000045s : 1: opt.transform.opt_after_cconv 0.06% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.32% : 0.000191s : 28: opt.transform.opt_b 0.12% : 0.000071s : 2: opt.transform.opt_trans_graph 0.09% : 0.000056s : 4: opt.transform.symbol_engine_opt 7.92% : 0.004759s : 1: opt_a 0.27% : 0.000163s : 1: opt_after_cconv 0.90% : 0.000543s : 1: opt_after_jit_grad 0.64% : 0.000385s : 1: opt_b 13.36% : 0.008030s : 1: optimize 0.04% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000013s : 1: order_py_execute_after_rewriter 0.05% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000008s : 1: pipeline_parallel_scheduler 0.01% : 0.000008s : 1: pipeline_split 0.11% : 0.000065s : 1: pre_auto_parallel 0.10% : 0.000057s : 1: py_interpret_to_execute 0.04% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000049s : 1: remove_dup_value 1.13% : 0.000678s : 1: renormalize.infer 0.94% : 0.000567s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000056s : 1: rewriter_after_opt_a 0.26% : 0.000156s : 1: rewriter_before_opt_a 0.01% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.20% : 0.000123s : 1: symbol_engine_optimizer 0.20% : 0.000118s : 1: tuple_transform 51.74% : 0.031102s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:10.214.699 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0379238, [21] [bootstrap]: 0.00045354 [type_inference]: 0.0131445 [event_method]: 8.439e-05 [auto_monad]: 0.00010202 [graph_reusing]: 8.27e-06 [inline]: 2.36e-06 [add_attr]: 0.00303725, [1] [add_attr_with_inline]: 0.00302868, [1] [Cycle 1]: 6.543e-05, [2] [tag_attr]: 2.799e-05 [meta_addattr_fg_expand]: 8.38999e-06 [parallel-infer-symbol]: 3.07002e-06 [pre_auto_parallel]: 5.368e-05 [insert-virtual-dataset]: 2.26998e-06 [parallel-infer-symbol-second]: 1.10999e-06 [dataset_repeat_opt]: 1.88002e-06 [pipeline_split]: 1.52001e-06 [optimize]: 0.0203107, [53] [py_interpret_to_execute]: 7.728e-05 [rewriter_before_opt_a]: 0.00012134 [opt_a]: 0.0176694, [2] [Cycle 1]: 0.0167326, [45] [expand_dump_flag]: 3.93999e-06 [switch_simplify]: 0.00012914 [loop_unroll]: 4.401e-05 [a_1]: 0.00093102 [with_stream_mark]: 1.785e-05 [recompute_prepare]: 1.211e-05 [updatestate_depend_eliminate]: 5.18002e-06 [updatestate_assign_eliminate]: 4.39002e-06 [updatestate_loads_eliminate]: 4.13001e-06 [parameter_eliminate]: 2.09e-06 [a_2]: 0.00012271 [accelerated_algorithm]: 9.51e-06 [shard]: 2.26e-06 [meta_shard_fg_expand]: 2.66e-06 [shard_inline]: 8.94e-06 [merge_send_recv]: 1.028e-05 [auto_parallel]: 8.27e-06 [parallel]: 1.904e-05 [flash_sp]: 8.85001e-06 [merge_comm]: 5.37999e-06 [allreduce_fusion]: 5.10001e-06 [matmul_add_comm_reduction]: 1.125e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 1.149e-05 [virtual_dataset]: 9.22001e-06 [get_grad_eliminate_]: 8.63001e-06 [virtual_output]: 8.72998e-06 [merge_forward]: 5.61e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 1.19e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.858e-05 [merge_recompute_call_nodes]: 1.66002e-06 [before_grad]: 1.507e-05 [set_forward_comm_id_for_comm_node_pass]: 5.23002e-06 [meta_fg_expand]: 4.50999e-06 [flash_sp_send_recv_attached]: 3.10998e-06 [receive_attached]: 2.76e-06 [after_resolve]: 1.452e-05 [a_after_grad]: 1.373e-05 [renormalize]: 0.0147734 [add_forward_monad_depend]: 1.162e-05 [auto_monad_grad]: 2.63e-06 [auto_monad_eliminator]: 2.774e-05 [cse]: 4.765e-05 [a_3]: 8.353e-05 [Cycle 2]: 0.0009239, [45] [expand_dump_flag]: 2.18998e-06 [switch_simplify]: 1.191e-05 [loop_unroll]: 9.32999e-06 [a_1]: 0.00023716 [with_stream_mark]: 1.998e-05 [recompute_prepare]: 8.95001e-06 [updatestate_depend_eliminate]: 5.56e-06 [updatestate_assign_eliminate]: 4.32e-06 [updatestate_loads_eliminate]: 4.27e-06 [parameter_eliminate]: 1.97999e-06 [a_2]: 0.00011521 [accelerated_algorithm]: 9.19e-06 [shard]: 2.64999e-06 [meta_shard_fg_expand]: 3.03e-06 [shard_inline]: 9.07999e-06 [merge_send_recv]: 9.67999e-06 [auto_parallel]: 1.024e-05 [parallel]: 8.53001e-06 [flash_sp]: 4.12998e-06 [merge_comm]: 5.27001e-06 [allreduce_fusion]: 4.75999e-06 [matmul_add_comm_reduction]: 1.186e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 9.97001e-06 [virtual_dataset]: 8.92e-06 [get_grad_eliminate_]: 8.45999e-06 [virtual_output]: 8.18999e-06 [merge_forward]: 5.42001e-06 [cell_reuse_recompute_pass]: 2.91999e-06 [offload_activation]: 1.165e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.704e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 1.457e-05 [set_forward_comm_id_for_comm_node_pass]: 5.56002e-06 [meta_fg_expand]: 3.95e-06 [flash_sp_send_recv_attached]: 2.22999e-06 [receive_attached]: 2.46998e-06 [after_resolve]: 1.523e-05 [a_after_grad]: 1.326e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.32e-06 [auto_monad_grad]: 1.00001e-06 [auto_monad_eliminator]: 9.84001e-06 [cse]: 2.049e-05 [a_3]: 5.419e-05 [py_interpret_to_execute_after_opt_a]: 1.796e-05 [slice_cell_reuse_recomputed_activation]: 1.76998e-06 [rewriter_after_opt_a]: 4.934e-05 [convert_after_rewriter]: 8.10999e-06 [order_py_execute_after_rewriter]: 6.46e-06 [mutable_eliminate]: 0.00071517 [opt_b]: 0.00028437, [1] [Cycle 1]: 0.00027749, [7] [b_1]: 0.00018772 [b_2]: 1.007e-05 [updatestate_depend_eliminate]: 7.34002e-06 [updatestate_assign_eliminate]: 3.73999e-06 [updatestate_loads_eliminate]: 3.53999e-06 [renormalize]: 5.50004e-07 [cse]: 2.81e-05 [optimize_parallel_all_gather_comm]: 1.895e-05 [overlap_param_gather]: 2.06e-06 [cconv]: 2.689e-05 [loop_unroll]: 0.00042969 [opt_after_cconv]: 0.00012982, [1] [Cycle 1]: 0.00012376, [7] [c_1]: 4.534e-05 [parameter_eliminate]: 3.42002e-06 [updatestate_depend_eliminate]: 6.21e-06 [updatestate_assign_eliminate]: 4.38001e-06 [updatestate_loads_eliminate]: 3.3e-06 [cse]: 2.582e-05 [renormalize]: 4.10015e-07 [remove_dup_value]: 4.304e-05 [tuple_transform]: 0.00010104, [1] [Cycle 1]: 9.638e-05, [4] [d_1]: 6.597e-05 [none_parameter_eliminate]: 1.82999e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 9.42999e-06 [partial_unused_args_eliminate]: 1.79e-06 [add_recomputation]: 5.874e-05 [cse_after_recomputation]: 2.644e-05, [1] [Cycle 1]: 2.159e-05, [1] [cse]: 1.595e-05 [environ_conv]: 6.68e-06 [swap_dp_allreduce_reducescatter]: 6.41e-06 [bias_add_comm_swap]: 2.36e-06 [label_micro_interleaved_index]: 3.85e-06 [label_fine_grained_interleaved_index]: 2.55002e-06 [merge_cast_opt]: 1.227e-05 [slice_recompute_activation]: 2.04999e-06 [micro_interleaved_order_control]: 2.31998e-06 [assign_add_opt]: 1.52001e-06 [ForceFp32Comm]: 1.19e-06 [remove_cast_before_assign_add]: 1.02e-06 [full_micro_interleaved_order_control]: 2.01e-06 [reorder_send_recv_between_fp_bp]: 2.89999e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 1.04003e-06 [interleave_split_concat_branches]: 1.17999e-06 [interleave_parallel_branches]: 1.19998e-06 [overlap_opt_shard_in_pipeline]: 1.14e-06 [overlap_opt_shard_grad_in_pipeline]: 2.06e-06 [control_data_broadcast_order]: 2.844e-05 [grouped_pairwise_exchange_alltoall]: 1.56998e-06 [offloading_packed_experts]: 5.44e-06 [overlap_recompute_and_grad_model_parallel]: 5.52999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 2.16e-06 [overlap_recompute_comm]: 2.19001e-06 [overlap_grad_ring_attention]: 6.93998e-06 [overlap_grad_flash_sp]: 2.708e-05 [begin_end_overlap_inline]: 6.19999e-07 [split_matmul_comm_elemetwise]: 2.96001e-06 [split_layernorm_comm]: 1.60001e-06 [handle_group_info]: 8.89995e-07 [symbol_engine_optimizer]: 0.00010184, [1] [Cycle 1]: 9.699e-05, [6] [build]: 4.31002e-06 [elim_shapecalc]: 1.516e-05 [elim_not_effective]: 1.926e-05 [opt_reshape]: 1.035e-05 [fold_const_symbol]: 1.619e-05 [renormalize]: 2.59985e-07 [detach_backward]: 2.15002e-06 [pipeline_parallel_scheduler]: 1.50999e-06 [auto_monad_reorder]: 2.175e-05 [get_jit_bprop_graph]: 1.64998e-06 [rewriter_after_jit_bprop_graph]: 5.42999e-06 [opt_after_jit_grad]: 0.00047662 [validate]: 4.663e-05 Sums bootstrap : 0.000454s : 1.34% type_inference : 0.013144s : 38.86% event_method : 0.000084s : 0.25% auto_monad : 0.000102s : 0.30% graph_reusing : 0.000008s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000054s : 0.16% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000077s : 0.23% optimize.rewriter_before_opt_a : 0.000121s : 0.36% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000141s : 0.42% optimize.opt_a.loop_unroll : 0.000053s : 0.16% optimize.opt_a.a_1 : 0.001168s : 3.45% optimize.opt_a.with_stream_mark : 0.000038s : 0.11% optimize.opt_a.recompute_prepare : 0.000021s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000238s : 0.70% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.06% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.02% optimize.opt_a.shard_inline : 0.000018s : 0.05% optimize.opt_a.merge_send_recv : 0.000020s : 0.06% optimize.opt_a.auto_parallel : 0.000019s : 0.05% optimize.opt_a.parallel : 0.000028s : 0.08% optimize.opt_a.flash_sp : 0.000013s : 0.04% optimize.opt_a.merge_comm : 0.000011s : 0.03% optimize.opt_a.allreduce_fusion : 0.000010s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.06% optimize.opt_a.virtual_dataset : 0.000018s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.05% optimize.opt_a.virtual_output : 0.000017s : 0.05% optimize.opt_a.merge_forward : 0.000011s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000024s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000030s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.03% optimize.opt_a.meta_fg_expand : 0.000008s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000030s : 0.09% optimize.opt_a.a_after_grad : 0.000027s : 0.08% optimize.opt_a.renormalize : 0.014773s : 43.67% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000038s : 0.11% optimize.opt_a.cse : 0.000068s : 0.20% optimize.opt_a.a_3 : 0.000138s : 0.41% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000049s : 0.15% optimize.convert_after_rewriter : 0.000008s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000715s : 2.11% optimize.opt_b.b_1 : 0.000188s : 0.55% optimize.opt_b.b_2 : 0.000010s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000028s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000027s : 0.08% optimize.loop_unroll : 0.000430s : 1.27% optimize.opt_after_cconv.c_1 : 0.000045s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000026s : 0.08% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000043s : 0.13% optimize.tuple_transform.d_1 : 0.000066s : 0.20% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000059s : 0.17% optimize.cse_after_recomputation.cse : 0.000016s : 0.05% optimize.environ_conv : 0.000007s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000002s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000012s : 0.04% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000028s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000027s : 0.08% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.06% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000477s : 1.41% validate : 0.000047s : 0.14% Time group info: ------[substitution.] 0.000270 54 14.09% : 0.000038s : 6: substitution.cast_eliminate 0.96% : 0.000003s : 4: substitution.elim_not_effective 0.76% : 0.000002s : 4: substitution.fold_const_symbol 2.94% : 0.000008s : 6: substitution.graph_param_transform 65.52% : 0.000177s : 8: substitution.inline 2.12% : 0.000006s : 8: substitution.j_node_and_user_rematch 2.79% : 0.000008s : 8: substitution.remove_not_recompute_node 2.39% : 0.000006s : 4: substitution.replace_old_param 3.67% : 0.000010s : 2: substitution.switch_simplify 4.75% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.013077 2 89.89% : 0.011755s : 1: type_inference.infer 10.11% : 0.001322s : 1: type_inference.specialize ------[replace.] 0.000123 14 42.83% : 0.000053s : 8: replace.inline 38.71% : 0.000047s : 2: replace.switch_simplify 18.46% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000192 14 89.81% : 0.000172s : 8: match.inline 4.53% : 0.000009s : 2: match.switch_simplify 5.67% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000316 1972 1.00% : 0.000003s : 21: predicate.accumulaten_eliminater 0.65% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.55% : 0.000002s : 12: predicate.addn_check_dump 0.97% : 0.000003s : 21: predicate.addn_zero_filter 0.87% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.23% : 0.000007s : 33: predicate.arithmetic_simplify 1.10% : 0.000003s : 21: predicate.cast_eliminate 0.58% : 0.000002s : 12: predicate.check_bprop_eliminate 0.56% : 0.000002s : 12: predicate.compare_switch_simplify 0.17% : 0.000001s : 6: predicate.const_output_eliminate 0.60% : 0.000002s : 12: predicate.depend_value_elim 1.00% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.18% : 0.000004s : 21: predicate.dict_get_item_eliminator 1.03% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.80% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 6: predicate.elim_not_effective 0.35% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.16% : 0.000004s : 27: predicate.environ_get_add_eliminate 1.16% : 0.000004s : 27: predicate.environ_get_depend_swap 1.84% : 0.000006s : 39: predicate.environ_get_eliminate 1.16% : 0.000004s : 27: predicate.environ_get_set_eliminate 1.48% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.58% : 0.000008s : 33: predicate.float_depend_g_call 0.51% : 0.000002s : 12: predicate.float_environ_get_switch 0.78% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.61% : 0.000002s : 12: predicate.get_grad_eliminate 0.19% : 0.000001s : 6: predicate.graph_param_transform 0.60% : 0.000002s : 12: predicate.incorporate_call 0.52% : 0.000002s : 12: predicate.incorporate_call_switch 6.18% : 0.000019s : 90: predicate.inline 0.76% : 0.000002s : 12: predicate.inline_without_move 0.32% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.76% : 0.000002s : 12: predicate.less_batch_normalization 1.83% : 0.000006s : 37: predicate.list_to_tuple_eliminator_ 2.54% : 0.000008s : 58: predicate.load_eliminater 0.72% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.47% : 0.000008s : 51: predicate.loop_unroll_before_grad 1.73% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.63% : 0.000002s : 12: predicate.merge_addn 0.56% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.61% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.89% : 0.000003s : 21: predicate.minmaximum_grad 0.70% : 0.000002s : 6: predicate.mutable_eliminate 0.32% : 0.000001s : 6: predicate.opt_reshape 0.31% : 0.000001s : 6: predicate.parallel_virtual_node 1.91% : 0.000006s : 33: predicate.partial_defer_inline 1.58% : 0.000005s : 31: predicate.partial_eliminate 0.94% : 0.000003s : 21: predicate.print_const_string_wrapper 0.58% : 0.000002s : 12: predicate.reduce_all_const_elim 1.18% : 0.000004s : 21: predicate.reduce_eliminate 2.42% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 12: predicate.remove_not_recompute_node 1.36% : 0.000004s : 37: predicate.replace_applicator 0.48% : 0.000002s : 12: predicate.replace_old_param 0.22% : 0.000001s : 6: predicate.reset_defer_inline 1.01% : 0.000003s : 21: predicate.reshape_eliminate 0.57% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 6: predicate.row_tensor_eliminate 0.81% : 0.000003s : 12: predicate.same_eliminate 0.38% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.86% : 0.000003s : 12: predicate.shard_identity_eliminate 0.67% : 0.000002s : 12: predicate.special_op_eliminate 0.77% : 0.000002s : 12: predicate.specialize_transform 0.86% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.67% : 0.000005s : 33: predicate.switch_defer_inline 2.09% : 0.000007s : 45: predicate.switch_layer_defer_inline 5.54% : 0.000017s : 106: predicate.switch_simplify 0.90% : 0.000003s : 21: predicate.tile_eliminate 1.04% : 0.000003s : 21: predicate.transpose_eliminate 1.57% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 3.15% : 0.000010s : 49: predicate.tuple_list_get_item_eliminator 1.49% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.41% : 0.000008s : 45: predicate.tuple_list_set_item_eliminator 1.92% : 0.000006s : 37: predicate.tuple_to_list_eliminator_ 2.43% : 0.000008s : 58: predicate.updatestate_pure_node_eliminater 3.00% : 0.000009s : 70: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 6: predicate.value_based_eliminate 0.62% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.61% : 0.000002s : 12: predicate.virtual_output_eliminate 0.28% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001055 16 51.24% : 0.000541s : 6: func_graph_cloner_run.FuncGraphClonerGraph 48.76% : 0.000514s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.078184 192 0.00% : 0.000004s : 1: ForceFp32Comm 3.89% : 0.003042s : 1: add_attr 3.88% : 0.003032s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.08% : 0.000063s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.14% : 0.000110s : 1: auto_monad 0.03% : 0.000026s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.61% : 0.000480s : 1: bootstrap 0.04% : 0.000031s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000033s : 1: control_data_broadcast_order 0.01% : 0.000011s : 1: convert_after_rewriter 0.04% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.12% : 0.000094s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000012s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.56% : 0.000437s : 1: loop_unroll 0.02% : 0.000016s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.93% : 0.000724s : 1: mutable_eliminate 0.01% : 0.000009s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000017s : 1: opt.transform.mutable_eliminate 2.41% : 0.001886s : 78: opt.transform.opt_a 0.06% : 0.000044s : 1: opt.transform.opt_after_cconv 0.04% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.21% : 0.000165s : 28: opt.transform.opt_b 0.09% : 0.000073s : 2: opt.transform.opt_trans_graph 0.07% : 0.000056s : 4: opt.transform.symbol_engine_opt 22.60% : 0.017673s : 1: opt_a 0.17% : 0.000133s : 1: opt_after_cconv 0.62% : 0.000486s : 1: opt_after_jit_grad 0.37% : 0.000288s : 1: opt_b 25.99% : 0.020318s : 1: optimize 0.03% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000010s : 1: order_py_execute_after_rewriter 0.04% : 0.000031s : 1: overlap_grad_flash_sp 0.06% : 0.000048s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000005s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.08% : 0.000059s : 1: pre_auto_parallel 0.11% : 0.000085s : 1: py_interpret_to_execute 0.03% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000047s : 1: remove_dup_value 17.98% : 0.014055s : 1: renormalize.infer 0.90% : 0.000700s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000053s : 1: rewriter_after_opt_a 0.16% : 0.000126s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.000105s : 1: symbol_engine_optimizer 0.13% : 0.000104s : 1: tuple_transform 16.84% : 0.013163s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:10.650.850 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:10.651.113 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0257618, [21] [bootstrap]: 0.00044048 [type_inference]: 0.0128452 [event_method]: 8.581e-05 [auto_monad]: 9.9e-05 [graph_reusing]: 7.31999e-06 [inline]: 2.25002e-06 [add_attr]: 0.00318631, [1] [add_attr_with_inline]: 0.00317624, [1] [Cycle 1]: 9.785e-05, [2] [tag_attr]: 3.229e-05 [meta_addattr_fg_expand]: 8.89e-06 [parallel-infer-symbol]: 3.7e-06 [pre_auto_parallel]: 4.937e-05 [insert-virtual-dataset]: 2.32999e-06 [parallel-infer-symbol-second]: 7.00005e-07 [dataset_repeat_opt]: 1.97999e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.00756893, [53] [py_interpret_to_execute]: 4.409e-05 [rewriter_before_opt_a]: 0.00012734 [opt_a]: 0.00447566, [2] [Cycle 1]: 0.00347343, [45] [expand_dump_flag]: 5.39998e-06 [switch_simplify]: 0.00013399 [loop_unroll]: 4.309e-05 [a_1]: 0.00100154 [with_stream_mark]: 2.064e-05 [recompute_prepare]: 1.27e-05 [updatestate_depend_eliminate]: 5.15999e-06 [updatestate_assign_eliminate]: 4.30999e-06 [updatestate_loads_eliminate]: 3.28e-06 [parameter_eliminate]: 2.02999e-06 [a_2]: 0.00013625 [accelerated_algorithm]: 9.78998e-06 [shard]: 2.13002e-06 [meta_shard_fg_expand]: 2.63003e-06 [shard_inline]: 8.32003e-06 [merge_send_recv]: 1.015e-05 [auto_parallel]: 8.46002e-06 [parallel]: 1.915e-05 [flash_sp]: 9.96998e-06 [merge_comm]: 5.32999e-06 [allreduce_fusion]: 4.53999e-06 [matmul_add_comm_reduction]: 1.124e-05 [allreduce_slice_to_reducescatter]: 1.08001e-06 [virtual_shard_identity]: 1.057e-05 [virtual_dataset]: 9.21002e-06 [get_grad_eliminate_]: 8.67998e-06 [virtual_output]: 7.96001e-06 [merge_forward]: 4.4e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 1.107e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.057e-05 [merge_recompute_call_nodes]: 1.89e-06 [before_grad]: 1.373e-05 [set_forward_comm_id_for_comm_node_pass]: 4.82e-06 [meta_fg_expand]: 4.48999e-06 [flash_sp_send_recv_attached]: 2.99999e-06 [receive_attached]: 2.37999e-06 [after_resolve]: 1.347e-05 [a_after_grad]: 1.33e-05 [renormalize]: 0.0013043 [add_forward_monad_depend]: 6.74999e-06 [auto_monad_grad]: 2.46e-06 [auto_monad_eliminator]: 2.155e-05 [cse]: 3.756e-05 [a_3]: 8.065e-05 [Cycle 2]: 0.00098675, [45] [expand_dump_flag]: 1.85001e-06 [switch_simplify]: 1.034e-05 [loop_unroll]: 8.50001e-06 [a_1]: 0.00019257 [with_stream_mark]: 1.549e-05 [recompute_prepare]: 8.40999e-06 [updatestate_depend_eliminate]: 5.35999e-06 [updatestate_assign_eliminate]: 3.41001e-06 [updatestate_loads_eliminate]: 3.70998e-06 [parameter_eliminate]: 1.40999e-06 [a_2]: 0.00012477 [accelerated_algorithm]: 8.62e-06 [shard]: 2.08002e-06 [meta_shard_fg_expand]: 2.42001e-06 [shard_inline]: 8.52998e-06 [merge_send_recv]: 7.48e-06 [auto_parallel]: 7.80998e-06 [parallel]: 7.61999e-06 [flash_sp]: 3.68999e-06 [merge_comm]: 4.52998e-06 [allreduce_fusion]: 4.30999e-06 [matmul_add_comm_reduction]: 8.74e-06 [allreduce_slice_to_reducescatter]: 4.19997e-07 [virtual_shard_identity]: 9.52001e-06 [virtual_dataset]: 7.72002e-06 [get_grad_eliminate_]: 7.6e-06 [virtual_output]: 7.14001e-06 [merge_forward]: 3.99002e-06 [cell_reuse_recompute_pass]: 2.27999e-06 [offload_activation]: 9.64e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.735e-05 [merge_recompute_call_nodes]: 9.70002e-07 [before_grad]: 1.324e-05 [set_forward_comm_id_for_comm_node_pass]: 5.47001e-06 [meta_fg_expand]: 3.14001e-06 [flash_sp_send_recv_attached]: 1.04e-06 [receive_attached]: 1.22e-06 [after_resolve]: 1.252e-05 [a_after_grad]: 1.17e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.19e-06 [auto_monad_grad]: 1.52999e-06 [auto_monad_eliminator]: 1.023e-05 [cse]: 2.191e-05 [a_3]: 6.015e-05 [py_interpret_to_execute_after_opt_a]: 1.984e-05 [slice_cell_reuse_recomputed_activation]: 4.65001e-06 [rewriter_after_opt_a]: 5.161e-05 [convert_after_rewriter]: 1.092e-05 [order_py_execute_after_rewriter]: 9.12001e-06 [mutable_eliminate]: 0.00076449 [opt_b]: 0.00036257, [1] [Cycle 1]: 0.00034974, [7] [b_1]: 0.00022438 [b_2]: 1.243e-05 [updatestate_depend_eliminate]: 9.05001e-06 [updatestate_assign_eliminate]: 3.65e-06 [updatestate_loads_eliminate]: 3.41999e-06 [renormalize]: 8.29983e-07 [cse]: 3.187e-05 [optimize_parallel_all_gather_comm]: 2.249e-05 [overlap_param_gather]: 5.04e-06 [cconv]: 3.625e-05 [loop_unroll]: 0.00064675 [opt_after_cconv]: 0.00016349, [1] [Cycle 1]: 0.00015276, [7] [c_1]: 4.43e-05 [parameter_eliminate]: 4.37003e-06 [updatestate_depend_eliminate]: 8.21002e-06 [updatestate_assign_eliminate]: 3.26999e-06 [updatestate_loads_eliminate]: 3.50998e-06 [cse]: 3.131e-05 [renormalize]: 5.50004e-07 [remove_dup_value]: 1.997e-05 [tuple_transform]: 0.00010903, [1] [Cycle 1]: 0.00010054, [4] [d_1]: 5.809e-05 [none_parameter_eliminate]: 1.79e-06 [renormalize]: 2.49973e-07 [switch_simplify]: 8.72998e-06 [partial_unused_args_eliminate]: 4.43999e-06 [add_recomputation]: 6.389e-05 [cse_after_recomputation]: 3.283e-05, [1] [Cycle 1]: 2.553e-05, [1] [cse]: 1.589e-05 [environ_conv]: 1.155e-05 [swap_dp_allreduce_reducescatter]: 8.57e-06 [bias_add_comm_swap]: 5.20001e-06 [label_micro_interleaved_index]: 7.96001e-06 [label_fine_grained_interleaved_index]: 4.94e-06 [merge_cast_opt]: 4.27003e-06 [slice_recompute_activation]: 5.19e-06 [micro_interleaved_order_control]: 5.31998e-06 [assign_add_opt]: 3.57997e-06 [ForceFp32Comm]: 4.47e-06 [remove_cast_before_assign_add]: 3.61999e-06 [full_micro_interleaved_order_control]: 5.08002e-06 [reorder_send_recv_between_fp_bp]: 6.21e-06 [comm_op_add_attrs]: 4.73001e-06 [add_comm_op_reuse_tag]: 3.88999e-06 [interleave_split_concat_branches]: 4.39998e-06 [interleave_parallel_branches]: 3.97e-06 [overlap_opt_shard_in_pipeline]: 3.98999e-06 [overlap_opt_shard_grad_in_pipeline]: 5.05001e-06 [control_data_broadcast_order]: 2.364e-05 [grouped_pairwise_exchange_alltoall]: 3.97e-06 [offloading_packed_experts]: 7.21001e-06 [overlap_recompute_and_grad_model_parallel]: 7.76001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.98999e-06 [overlap_recompute_allgather_and_fa_grad]: 4.08001e-06 [overlap_recompute_comm]: 5.60001e-06 [overlap_grad_ring_attention]: 7.63001e-06 [overlap_grad_flash_sp]: 2.771e-05 [begin_end_overlap_inline]: 3.29001e-06 [split_matmul_comm_elemetwise]: 4.76002e-06 [split_layernorm_comm]: 4.23999e-06 [handle_group_info]: 3.75998e-06 [symbol_engine_optimizer]: 0.00011571, [1] [Cycle 1]: 0.00010835, [6] [build]: 4.03001e-06 [elim_shapecalc]: 1.406e-05 [elim_not_effective]: 1.899e-05 [opt_reshape]: 9.82001e-06 [fold_const_symbol]: 1.34e-05 [renormalize]: 2.59985e-07 [detach_backward]: 4.72e-06 [pipeline_parallel_scheduler]: 2.09e-06 [auto_monad_reorder]: 2.752e-05 [get_jit_bprop_graph]: 1.90001e-06 [rewriter_after_jit_bprop_graph]: 6.53e-06 [opt_after_jit_grad]: 0.00067401 [validate]: 5.018e-05 Sums bootstrap : 0.000440s : 2.14% type_inference : 0.012845s : 62.32% event_method : 0.000086s : 0.42% auto_monad : 0.000099s : 0.48% graph_reusing : 0.000007s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000032s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.04% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000049s : 0.24% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000044s : 0.21% optimize.rewriter_before_opt_a : 0.000127s : 0.62% optimize.opt_a.expand_dump_flag : 0.000007s : 0.04% optimize.opt_a.switch_simplify : 0.000144s : 0.70% optimize.opt_a.loop_unroll : 0.000052s : 0.25% optimize.opt_a.a_1 : 0.001194s : 5.79% optimize.opt_a.with_stream_mark : 0.000036s : 0.18% optimize.opt_a.recompute_prepare : 0.000021s : 0.10% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000261s : 1.27% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.09% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000017s : 0.08% optimize.opt_a.merge_send_recv : 0.000018s : 0.09% optimize.opt_a.auto_parallel : 0.000016s : 0.08% optimize.opt_a.parallel : 0.000027s : 0.13% optimize.opt_a.flash_sp : 0.000014s : 0.07% optimize.opt_a.merge_comm : 0.000010s : 0.05% optimize.opt_a.allreduce_fusion : 0.000009s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.10% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.10% optimize.opt_a.virtual_dataset : 0.000017s : 0.08% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.08% optimize.opt_a.virtual_output : 0.000015s : 0.07% optimize.opt_a.merge_forward : 0.000008s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.02% optimize.opt_a.offload_activation : 0.000021s : 0.10% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.18% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.13% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.05% optimize.opt_a.meta_fg_expand : 0.000008s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000026s : 0.13% optimize.opt_a.a_after_grad : 0.000025s : 0.12% optimize.opt_a.renormalize : 0.001304s : 6.33% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.15% optimize.opt_a.cse : 0.000059s : 0.29% optimize.opt_a.a_3 : 0.000141s : 0.68% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000052s : 0.25% optimize.convert_after_rewriter : 0.000011s : 0.05% optimize.order_py_execute_after_rewriter : 0.000009s : 0.04% optimize.mutable_eliminate : 0.000764s : 3.71% optimize.opt_b.b_1 : 0.000224s : 1.09% optimize.opt_b.b_2 : 0.000012s : 0.06% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000032s : 0.15% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.11% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000036s : 0.18% optimize.loop_unroll : 0.000647s : 3.14% optimize.opt_after_cconv.c_1 : 0.000044s : 0.21% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.cse : 0.000031s : 0.15% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.10% optimize.tuple_transform.d_1 : 0.000058s : 0.28% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.04% optimize.partial_unused_args_eliminate : 0.000004s : 0.02% optimize.add_recomputation : 0.000064s : 0.31% optimize.cse_after_recomputation.cse : 0.000016s : 0.08% optimize.environ_conv : 0.000012s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.04% optimize.bias_add_comm_swap : 0.000005s : 0.03% optimize.label_micro_interleaved_index : 0.000008s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.03% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000004s : 0.02% optimize.remove_cast_before_assign_add : 0.000004s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.03% optimize.comm_op_add_attrs : 0.000005s : 0.02% optimize.add_comm_op_reuse_tag : 0.000004s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.02% optimize.interleave_parallel_branches : 0.000004s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.02% optimize.control_data_broadcast_order : 0.000024s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000007s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000006s : 0.03% optimize.overlap_grad_ring_attention : 0.000008s : 0.04% optimize.overlap_grad_flash_sp : 0.000028s : 0.13% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000004s : 0.02% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.07% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.09% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.05% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.07% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000028s : 0.13% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.03% opt_after_jit_grad : 0.000674s : 3.27% validate : 0.000050s : 0.24% Time group info: ------[substitution.] 0.000277 44 9.36% : 0.000026s : 3: substitution.cast_eliminate 0.99% : 0.000003s : 3: substitution.elim_not_effective 0.68% : 0.000002s : 3: substitution.fold_const_symbol 2.54% : 0.000007s : 5: substitution.graph_param_transform 70.70% : 0.000196s : 8: substitution.inline 1.79% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.64% : 0.000007s : 6: substitution.remove_not_recompute_node 1.93% : 0.000005s : 4: substitution.replace_old_param 3.60% : 0.000010s : 2: substitution.switch_simplify 5.76% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.012787 2 90.20% : 0.011533s : 1: type_inference.infer 9.80% : 0.001254s : 1: type_inference.specialize ------[replace.] 0.000133 14 41.17% : 0.000055s : 8: replace.inline 38.15% : 0.000051s : 2: replace.switch_simplify 20.68% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000213 14 89.48% : 0.000191s : 8: match.inline 4.08% : 0.000009s : 2: match.switch_simplify 6.44% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000308 1838 1.06% : 0.000003s : 21: predicate.accumulaten_eliminater 0.93% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.50% : 0.000002s : 10: predicate.addn_check_dump 0.99% : 0.000003s : 21: predicate.addn_zero_filter 0.92% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.08% : 0.000006s : 31: predicate.arithmetic_simplify 1.07% : 0.000003s : 21: predicate.cast_eliminate 0.54% : 0.000002s : 10: predicate.check_bprop_eliminate 0.49% : 0.000002s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.50% : 0.000002s : 10: predicate.depend_value_elim 1.06% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.23% : 0.000004s : 21: predicate.dict_get_item_eliminator 0.96% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.89% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 0.33% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000004s : 26: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.14% : 0.000004s : 26: predicate.environ_get_depend_swap 1.80% : 0.000006s : 36: predicate.environ_get_eliminate 1.13% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.59% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.42% : 0.000007s : 33: predicate.float_depend_g_call 0.45% : 0.000001s : 10: predicate.float_environ_get_switch 0.74% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.13% : 0.000000s : 5: predicate.fold_const_symbol 0.61% : 0.000002s : 10: predicate.get_grad_eliminate 0.18% : 0.000001s : 5: predicate.graph_param_transform 0.53% : 0.000002s : 10: predicate.incorporate_call 0.42% : 0.000001s : 10: predicate.incorporate_call_switch 6.36% : 0.000020s : 84: predicate.inline 0.65% : 0.000002s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.85% : 0.000003s : 10: predicate.less_batch_normalization 1.76% : 0.000005s : 35: predicate.list_to_tuple_eliminator_ 2.55% : 0.000008s : 56: predicate.load_eliminater 0.99% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.47% : 0.000008s : 49: predicate.loop_unroll_before_grad 1.63% : 0.000005s : 31: predicate.make_slice_get_slice_eliminator 0.54% : 0.000002s : 10: predicate.merge_addn 0.49% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.48% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.95% : 0.000003s : 21: predicate.minmaximum_grad 1.07% : 0.000003s : 5: predicate.mutable_eliminate 0.31% : 0.000001s : 5: predicate.opt_reshape 0.34% : 0.000001s : 5: predicate.parallel_virtual_node 1.88% : 0.000006s : 33: predicate.partial_defer_inline 1.59% : 0.000005s : 30: predicate.partial_eliminate 1.05% : 0.000003s : 21: predicate.print_const_string_wrapper 0.48% : 0.000001s : 10: predicate.reduce_all_const_elim 1.36% : 0.000004s : 21: predicate.reduce_eliminate 2.67% : 0.000008s : 56: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 10: predicate.remove_not_recompute_node 1.32% : 0.000004s : 35: predicate.replace_applicator 0.41% : 0.000001s : 10: predicate.replace_old_param 0.21% : 0.000001s : 5: predicate.reset_defer_inline 1.10% : 0.000003s : 21: predicate.reshape_eliminate 0.53% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.67% : 0.000002s : 10: predicate.same_eliminate 0.44% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.68% : 0.000002s : 10: predicate.shard_identity_eliminate 0.65% : 0.000002s : 10: predicate.special_op_eliminate 0.58% : 0.000002s : 10: predicate.specialize_transform 0.87% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.65% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.66% : 0.000005s : 33: predicate.switch_defer_inline 2.10% : 0.000006s : 43: predicate.switch_layer_defer_inline 5.39% : 0.000017s : 101: predicate.switch_simplify 1.07% : 0.000003s : 21: predicate.tile_eliminate 0.98% : 0.000003s : 21: predicate.transpose_eliminate 1.60% : 0.000005s : 31: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000005s : 31: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000005s : 31: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000010s : 45: predicate.tuple_list_get_item_eliminator 1.60% : 0.000005s : 31: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.83% : 0.000006s : 35: predicate.tuple_to_list_eliminator_ 2.50% : 0.000008s : 56: predicate.updatestate_pure_node_eliminater 3.20% : 0.000010s : 66: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.60% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.52% : 0.000002s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000980 16 55.37% : 0.000543s : 6: func_graph_cloner_run.FuncGraphClonerGraph 44.63% : 0.000438s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.039862 192 0.02% : 0.000007s : 1: ForceFp32Comm 8.02% : 0.003197s : 1: add_attr 7.98% : 0.003180s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.17% : 0.000068s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.28% : 0.000110s : 1: auto_monad 0.09% : 0.000036s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.21% : 0.000484s : 1: bootstrap 0.10% : 0.000040s : 1: cconv 0.02% : 0.000010s : 1: comm_op_add_attrs 0.07% : 0.000027s : 1: control_data_broadcast_order 0.03% : 0.000014s : 1: convert_after_rewriter 0.09% : 0.000036s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000026s : 1: detach_backward 0.04% : 0.000015s : 1: environ_conv 0.25% : 0.000101s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000014s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000007s : 1: interleave_parallel_branches 0.02% : 0.000008s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000011s : 1: label_micro_interleaved_index 1.64% : 0.000655s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 1.94% : 0.000773s : 1: mutable_eliminate 0.03% : 0.000011s : 1: offloading_packed_experts 0.05% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000021s : 1: opt.transform.mutable_eliminate 4.63% : 0.001845s : 78: opt.transform.opt_a 0.11% : 0.000043s : 1: opt.transform.opt_after_cconv 0.10% : 0.000039s : 1: opt.transform.opt_after_jit_grad 0.39% : 0.000155s : 28: opt.transform.opt_b 0.16% : 0.000065s : 2: opt.transform.opt_trans_graph 0.13% : 0.000052s : 4: opt.transform.symbol_engine_opt 11.24% : 0.004479s : 1: opt_a 0.42% : 0.000167s : 1: opt_after_cconv 1.73% : 0.000688s : 1: opt_after_jit_grad 0.92% : 0.000367s : 1: opt_b 20.08% : 0.008004s : 1: optimize 0.07% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000012s : 1: order_py_execute_after_rewriter 0.08% : 0.000031s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000011s : 1: overlap_grad_ring_attention 0.02% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000009s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000057s : 1: pre_auto_parallel 0.12% : 0.000048s : 1: py_interpret_to_execute 0.06% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.06% : 0.000023s : 1: remove_dup_value 1.81% : 0.000723s : 1: renormalize.infer 1.43% : 0.000570s : 1: renormalize.specialize 0.02% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000055s : 1: rewriter_after_opt_a 0.33% : 0.000131s : 1: rewriter_before_opt_a 0.02% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.03% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.30% : 0.000119s : 1: symbol_engine_optimizer 0.28% : 0.000112s : 1: tuple_transform 32.31% : 0.012879s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:11.109.904 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.025196, [21] [bootstrap]: 0.00047126 [type_inference]: 0.0140275 [event_method]: 8.774e-05 [auto_monad]: 0.00010177 [graph_reusing]: 7.88999e-06 [inline]: 2.24001e-06 [add_attr]: 0.00336954, [1] [add_attr_with_inline]: 0.00335811, [1] [Cycle 1]: 7.826e-05, [2] [tag_attr]: 3.096e-05 [meta_addattr_fg_expand]: 8.11002e-06 [parallel-infer-symbol]: 3.24001e-06 [pre_auto_parallel]: 4.815e-05 [insert-virtual-dataset]: 2.71e-06 [parallel-infer-symbol-second]: 9.89996e-07 [dataset_repeat_opt]: 2.21e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.00632424, [53] [py_interpret_to_execute]: 3.764e-05 [rewriter_before_opt_a]: 0.00011794 [opt_a]: 0.00391111, [2] [Cycle 1]: 0.00307175, [45] [expand_dump_flag]: 4.07998e-06 [switch_simplify]: 0.0001326 [loop_unroll]: 4.337e-05 [a_1]: 0.00100403 [with_stream_mark]: 2.125e-05 [recompute_prepare]: 1.424e-05 [updatestate_depend_eliminate]: 5.43002e-06 [updatestate_assign_eliminate]: 3.64002e-06 [updatestate_loads_eliminate]: 3.36001e-06 [parameter_eliminate]: 2.21998e-06 [a_2]: 0.00010558 [accelerated_algorithm]: 8.92e-06 [shard]: 2.46e-06 [meta_shard_fg_expand]: 2.41998e-06 [shard_inline]: 8.46002e-06 [merge_send_recv]: 1.022e-05 [auto_parallel]: 7.65998e-06 [parallel]: 2.027e-05 [flash_sp]: 9.91998e-06 [merge_comm]: 5.65001e-06 [allreduce_fusion]: 4.46002e-06 [matmul_add_comm_reduction]: 1.149e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 1.059e-05 [virtual_dataset]: 7.82998e-06 [get_grad_eliminate_]: 7.58999e-06 [virtual_output]: 7.72002e-06 [merge_forward]: 5.15001e-06 [cell_reuse_recompute_pass]: 1.27e-06 [offload_activation]: 1.087e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.831e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.387e-05 [set_forward_comm_id_for_comm_node_pass]: 5.40001e-06 [meta_fg_expand]: 4.09002e-06 [flash_sp_send_recv_attached]: 3.05998e-06 [receive_attached]: 2.36e-06 [after_resolve]: 1.427e-05 [a_after_grad]: 1.246e-05 [renormalize]: 0.00109306 [add_forward_monad_depend]: 6.11e-06 [auto_monad_grad]: 2.93e-06 [auto_monad_eliminator]: 2.072e-05 [cse]: 3.725e-05 [a_3]: 6.255e-05 [Cycle 2]: 0.00082831, [45] [expand_dump_flag]: 1.31002e-06 [switch_simplify]: 1.044e-05 [loop_unroll]: 7.78001e-06 [a_1]: 0.00018519 [with_stream_mark]: 1.667e-05 [recompute_prepare]: 9.42001e-06 [updatestate_depend_eliminate]: 4.70001e-06 [updatestate_assign_eliminate]: 2.94999e-06 [updatestate_loads_eliminate]: 2.93e-06 [parameter_eliminate]: 1.87999e-06 [a_2]: 9.601e-05 [accelerated_algorithm]: 8.03999e-06 [shard]: 1.77001e-06 [meta_shard_fg_expand]: 1.91e-06 [shard_inline]: 8.27e-06 [merge_send_recv]: 7.04001e-06 [auto_parallel]: 6.56e-06 [parallel]: 6.10002e-06 [flash_sp]: 3.66999e-06 [merge_comm]: 4.60001e-06 [allreduce_fusion]: 4.24002e-06 [matmul_add_comm_reduction]: 7.83001e-06 [allreduce_slice_to_reducescatter]: 5.60016e-07 [virtual_shard_identity]: 1.086e-05 [virtual_dataset]: 7.6e-06 [get_grad_eliminate_]: 7.26999e-06 [virtual_output]: 6.79999e-06 [merge_forward]: 4.25e-06 [cell_reuse_recompute_pass]: 1.49003e-06 [offload_activation]: 8.58001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.68e-05 [merge_recompute_call_nodes]: 1.25999e-06 [before_grad]: 1.288e-05 [set_forward_comm_id_for_comm_node_pass]: 5.99e-06 [meta_fg_expand]: 2.79999e-06 [flash_sp_send_recv_attached]: 1.52001e-06 [receive_attached]: 1.40001e-06 [after_resolve]: 1.299e-05 [a_after_grad]: 1.16e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.66e-06 [auto_monad_grad]: 1.38002e-06 [auto_monad_eliminator]: 1.124e-05 [cse]: 2.169e-05 [a_3]: 4.797e-05 [py_interpret_to_execute_after_opt_a]: 1.199e-05 [slice_cell_reuse_recomputed_activation]: 2.05002e-06 [rewriter_after_opt_a]: 4.336e-05 [convert_after_rewriter]: 7.98999e-06 [order_py_execute_after_rewriter]: 6.01e-06 [mutable_eliminate]: 0.00058818 [opt_b]: 0.00025557, [1] [Cycle 1]: 0.00024829, [7] [b_1]: 0.00015922 [b_2]: 9.12001e-06 [updatestate_depend_eliminate]: 7.98001e-06 [updatestate_assign_eliminate]: 3.16001e-06 [updatestate_loads_eliminate]: 3.45e-06 [renormalize]: 6.00005e-07 [cse]: 2.623e-05 [optimize_parallel_all_gather_comm]: 1.978e-05 [overlap_param_gather]: 2.00002e-06 [cconv]: 2.86e-05 [loop_unroll]: 0.00045672 [opt_after_cconv]: 0.00012826, [1] [Cycle 1]: 0.00012236, [7] [c_1]: 3.966e-05 [parameter_eliminate]: 3.36999e-06 [updatestate_depend_eliminate]: 8.1e-06 [updatestate_assign_eliminate]: 3.55998e-06 [updatestate_loads_eliminate]: 3.23e-06 [cse]: 2.789e-05 [renormalize]: 6.89994e-07 [remove_dup_value]: 1.5e-05 [tuple_transform]: 8.983e-05, [1] [Cycle 1]: 8.576e-05, [4] [d_1]: 5.492e-05 [none_parameter_eliminate]: 1.67001e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 9.84999e-06 [partial_unused_args_eliminate]: 2.31e-06 [add_recomputation]: 5.96e-05 [cse_after_recomputation]: 2.949e-05, [1] [Cycle 1]: 2.488e-05, [1] [cse]: 1.774e-05 [environ_conv]: 6.41998e-06 [swap_dp_allreduce_reducescatter]: 6.68e-06 [bias_add_comm_swap]: 2.97002e-06 [label_micro_interleaved_index]: 4.53999e-06 [label_fine_grained_interleaved_index]: 2.71999e-06 [merge_cast_opt]: 1.26002e-06 [slice_recompute_activation]: 2.01e-06 [micro_interleaved_order_control]: 2.60002e-06 [assign_add_opt]: 1.16002e-06 [ForceFp32Comm]: 7.80012e-07 [remove_cast_before_assign_add]: 1.25001e-06 [full_micro_interleaved_order_control]: 2.43e-06 [reorder_send_recv_between_fp_bp]: 2.51e-06 [comm_op_add_attrs]: 1.26002e-06 [add_comm_op_reuse_tag]: 1.04e-06 [interleave_split_concat_branches]: 1.40001e-06 [interleave_parallel_branches]: 1.06997e-06 [overlap_opt_shard_in_pipeline]: 1.32e-06 [overlap_opt_shard_grad_in_pipeline]: 1.74998e-06 [control_data_broadcast_order]: 3.751e-05 [grouped_pairwise_exchange_alltoall]: 1.61002e-06 [offloading_packed_experts]: 5.23002e-06 [overlap_recompute_and_grad_model_parallel]: 5.75001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.53e-06 [overlap_grad_ring_attention]: 4.65999e-06 [overlap_grad_flash_sp]: 2.4e-05 [begin_end_overlap_inline]: 5.29981e-07 [split_matmul_comm_elemetwise]: 2.04999e-06 [split_layernorm_comm]: 1.91e-06 [handle_group_info]: 8.89995e-07 [symbol_engine_optimizer]: 0.00012679, [1] [Cycle 1]: 0.00012112, [6] [build]: 4.17e-06 [elim_shapecalc]: 1.452e-05 [elim_not_effective]: 2.068e-05 [opt_reshape]: 9.50001e-06 [fold_const_symbol]: 1.343e-05 [renormalize]: 2.80008e-07 [detach_backward]: 3.06999e-06 [pipeline_parallel_scheduler]: 1.81e-06 [auto_monad_reorder]: 2.231e-05 [get_jit_bprop_graph]: 1.57001e-06 [rewriter_after_jit_bprop_graph]: 5.67001e-06 [opt_after_jit_grad]: 0.00049934 [validate]: 4.633e-05 Sums bootstrap : 0.000471s : 2.27% type_inference : 0.014027s : 67.66% event_method : 0.000088s : 0.42% auto_monad : 0.000102s : 0.49% graph_reusing : 0.000008s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000031s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.04% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000048s : 0.23% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000038s : 0.18% optimize.rewriter_before_opt_a : 0.000118s : 0.57% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000143s : 0.69% optimize.opt_a.loop_unroll : 0.000051s : 0.25% optimize.opt_a.a_1 : 0.001189s : 5.74% optimize.opt_a.with_stream_mark : 0.000038s : 0.18% optimize.opt_a.recompute_prepare : 0.000024s : 0.11% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000202s : 0.97% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.08% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000017s : 0.08% optimize.opt_a.merge_send_recv : 0.000017s : 0.08% optimize.opt_a.auto_parallel : 0.000014s : 0.07% optimize.opt_a.parallel : 0.000026s : 0.13% optimize.opt_a.flash_sp : 0.000014s : 0.07% optimize.opt_a.merge_comm : 0.000010s : 0.05% optimize.opt_a.allreduce_fusion : 0.000009s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.10% optimize.opt_a.virtual_dataset : 0.000015s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.07% optimize.opt_a.virtual_output : 0.000015s : 0.07% optimize.opt_a.merge_forward : 0.000009s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.17% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.13% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.05% optimize.opt_a.meta_fg_expand : 0.000007s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000027s : 0.13% optimize.opt_a.a_after_grad : 0.000024s : 0.12% optimize.opt_a.renormalize : 0.001093s : 5.27% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.15% optimize.opt_a.cse : 0.000059s : 0.28% optimize.opt_a.a_3 : 0.000111s : 0.53% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000043s : 0.21% optimize.convert_after_rewriter : 0.000008s : 0.04% optimize.order_py_execute_after_rewriter : 0.000006s : 0.03% optimize.mutable_eliminate : 0.000588s : 2.84% optimize.opt_b.b_1 : 0.000159s : 0.77% optimize.opt_b.b_2 : 0.000009s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000026s : 0.13% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.10% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000029s : 0.14% optimize.loop_unroll : 0.000457s : 2.20% optimize.opt_after_cconv.c_1 : 0.000040s : 0.19% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000028s : 0.13% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.07% optimize.tuple_transform.d_1 : 0.000055s : 0.26% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.05% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000060s : 0.29% optimize.cse_after_recomputation.cse : 0.000018s : 0.09% optimize.environ_conv : 0.000006s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.03% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000038s : 0.18% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000024s : 0.12% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.07% optimize.symbol_engine_optimizer.elim_not_effective : 0.000021s : 0.10% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.05% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.06% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.11% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.03% opt_after_jit_grad : 0.000499s : 2.41% validate : 0.000046s : 0.22% Time group info: ------[substitution.] 0.000274 44 10.20% : 0.000028s : 3: substitution.cast_eliminate 0.98% : 0.000003s : 3: substitution.elim_not_effective 0.67% : 0.000002s : 3: substitution.fold_const_symbol 2.46% : 0.000007s : 5: substitution.graph_param_transform 69.89% : 0.000192s : 8: substitution.inline 1.88% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.17% : 0.000006s : 6: substitution.remove_not_recompute_node 1.89% : 0.000005s : 4: substitution.replace_old_param 3.63% : 0.000010s : 2: substitution.switch_simplify 6.22% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.013951 2 89.75% : 0.012521s : 1: type_inference.infer 10.25% : 0.001430s : 1: type_inference.specialize ------[replace.] 0.000137 14 43.50% : 0.000060s : 8: replace.inline 35.89% : 0.000049s : 2: replace.switch_simplify 20.61% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000210 14 88.70% : 0.000187s : 8: match.inline 4.17% : 0.000009s : 2: match.switch_simplify 7.13% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000297 1838 1.01% : 0.000003s : 21: predicate.accumulaten_eliminater 0.92% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.44% : 0.000001s : 10: predicate.addn_check_dump 1.08% : 0.000003s : 21: predicate.addn_zero_filter 0.99% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.23% : 0.000007s : 31: predicate.arithmetic_simplify 1.07% : 0.000003s : 21: predicate.cast_eliminate 0.51% : 0.000002s : 10: predicate.check_bprop_eliminate 0.48% : 0.000001s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.49% : 0.000001s : 10: predicate.depend_value_elim 1.05% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.13% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.99% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.73% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 5: predicate.elim_not_effective 0.34% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 26: predicate.environ_add_const_eliminate 1.18% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 26: predicate.environ_get_depend_swap 1.81% : 0.000005s : 36: predicate.environ_get_eliminate 1.12% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.56% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.57% : 0.000008s : 33: predicate.float_depend_g_call 0.44% : 0.000001s : 10: predicate.float_environ_get_switch 0.69% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 5: predicate.fold_const_symbol 0.57% : 0.000002s : 10: predicate.get_grad_eliminate 0.18% : 0.000001s : 5: predicate.graph_param_transform 0.53% : 0.000002s : 10: predicate.incorporate_call 0.41% : 0.000001s : 10: predicate.incorporate_call_switch 6.62% : 0.000020s : 84: predicate.inline 0.70% : 0.000002s : 10: predicate.inline_without_move 0.27% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.64% : 0.000002s : 10: predicate.less_batch_normalization 1.73% : 0.000005s : 35: predicate.list_to_tuple_eliminator_ 2.59% : 0.000008s : 56: predicate.load_eliminater 0.96% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.47% : 0.000007s : 49: predicate.loop_unroll_before_grad 1.67% : 0.000005s : 31: predicate.make_slice_get_slice_eliminator 0.48% : 0.000001s : 10: predicate.merge_addn 0.47% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.61% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.92% : 0.000003s : 21: predicate.minmaximum_grad 0.79% : 0.000002s : 5: predicate.mutable_eliminate 0.41% : 0.000001s : 5: predicate.opt_reshape 0.26% : 0.000001s : 5: predicate.parallel_virtual_node 1.99% : 0.000006s : 33: predicate.partial_defer_inline 1.64% : 0.000005s : 30: predicate.partial_eliminate 0.97% : 0.000003s : 21: predicate.print_const_string_wrapper 0.48% : 0.000001s : 10: predicate.reduce_all_const_elim 1.21% : 0.000004s : 21: predicate.reduce_eliminate 2.54% : 0.000008s : 56: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 10: predicate.remove_not_recompute_node 1.31% : 0.000004s : 35: predicate.replace_applicator 0.51% : 0.000002s : 10: predicate.replace_old_param 0.23% : 0.000001s : 5: predicate.reset_defer_inline 1.00% : 0.000003s : 21: predicate.reshape_eliminate 0.52% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.28% : 0.000001s : 5: predicate.row_tensor_eliminate 0.66% : 0.000002s : 10: predicate.same_eliminate 0.59% : 0.000002s : 10: predicate.set_cell_output_no_recompute 0.88% : 0.000003s : 10: predicate.shard_identity_eliminate 0.58% : 0.000002s : 10: predicate.special_op_eliminate 0.69% : 0.000002s : 10: predicate.specialize_transform 0.70% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.59% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.71% : 0.000005s : 33: predicate.switch_defer_inline 2.12% : 0.000006s : 43: predicate.switch_layer_defer_inline 5.92% : 0.000018s : 101: predicate.switch_simplify 1.02% : 0.000003s : 21: predicate.tile_eliminate 1.01% : 0.000003s : 21: predicate.transpose_eliminate 1.59% : 0.000005s : 31: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000005s : 31: predicate.tuple_list_get_item_const_eliminator 1.60% : 0.000005s : 31: predicate.tuple_list_get_item_depend_reorder 3.32% : 0.000010s : 45: predicate.tuple_list_get_item_eliminator 1.67% : 0.000005s : 31: predicate.tuple_list_get_set_item_eliminator 2.20% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.73% : 0.000005s : 35: predicate.tuple_to_list_eliminator_ 2.47% : 0.000007s : 56: predicate.updatestate_pure_node_eliminater 3.04% : 0.000009s : 66: predicate.updatestate_useless_node_eliminater 0.26% : 0.000001s : 5: predicate.value_based_eliminate 0.52% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.50% : 0.000001s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.31% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001049 16 56.85% : 0.000596s : 6: func_graph_cloner_run.FuncGraphClonerGraph 43.15% : 0.000453s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.038027 192 0.01% : 0.000004s : 1: ForceFp32Comm 8.88% : 0.003375s : 1: add_attr 8.84% : 0.003362s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.17% : 0.000064s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.29% : 0.000110s : 1: auto_monad 0.07% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.31% : 0.000498s : 1: bootstrap 0.08% : 0.000032s : 1: cconv 0.04% : 0.000017s : 1: comm_op_add_attrs 0.11% : 0.000043s : 1: control_data_broadcast_order 0.03% : 0.000012s : 1: convert_after_rewriter 0.09% : 0.000032s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000007s : 1: detach_backward 0.03% : 0.000010s : 1: environ_conv 0.26% : 0.000098s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000012s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 1.23% : 0.000466s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.57% : 0.000598s : 1: mutable_eliminate 0.02% : 0.000008s : 1: offloading_packed_experts 0.05% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000019s : 1: opt.transform.mutable_eliminate 4.81% : 0.001829s : 78: opt.transform.opt_a 0.10% : 0.000038s : 1: opt.transform.opt_after_cconv 0.09% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.35% : 0.000134s : 28: opt.transform.opt_b 0.16% : 0.000062s : 2: opt.transform.opt_trans_graph 0.14% : 0.000052s : 4: opt.transform.symbol_engine_opt 10.29% : 0.003914s : 1: opt_a 0.35% : 0.000133s : 1: opt_after_cconv 1.34% : 0.000510s : 1: opt_after_jit_grad 0.68% : 0.000259s : 1: opt_b 16.65% : 0.006330s : 1: optimize 0.06% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000010s : 1: order_py_execute_after_rewriter 0.07% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.14% : 0.000053s : 1: pre_auto_parallel 0.11% : 0.000043s : 1: py_interpret_to_execute 0.04% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000019s : 1: remove_dup_value 1.60% : 0.000609s : 1: renormalize.infer 1.25% : 0.000474s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.13% : 0.000049s : 1: rewriter_after_opt_a 0.33% : 0.000124s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000130s : 1: symbol_engine_optimizer 0.24% : 0.000093s : 1: tuple_transform 36.94% : 0.014048s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:11.383.462 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:11.383.747 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0557581, [21] [bootstrap]: 0.0285543 [type_inference]: 0.0144229 [event_method]: 5.951e-05 [auto_monad]: 0.00010985 [graph_reusing]: 8.45999e-06 [inline]: 2.66999e-06 [add_attr]: 0.00406469, [1] [add_attr_with_inline]: 0.00405078, [1] [Cycle 1]: 0.00011821, [2] [tag_attr]: 3.44e-05 [meta_addattr_fg_expand]: 8.43001e-06 [parallel-infer-symbol]: 3.56999e-06 [pre_auto_parallel]: 5.405e-05 [insert-virtual-dataset]: 3.04999e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 2.21e-06 [pipeline_split]: 1.62999e-06 [optimize]: 0.00715735, [53] [py_interpret_to_execute]: 4.94e-05 [rewriter_before_opt_a]: 0.0001324 [opt_a]: 0.00430358, [2] [Cycle 1]: 0.00336285, [45] [expand_dump_flag]: 5.66e-06 [switch_simplify]: 0.00013679 [loop_unroll]: 4.199e-05 [a_1]: 0.0009341 [with_stream_mark]: 2.625e-05 [recompute_prepare]: 1.255e-05 [updatestate_depend_eliminate]: 5.37999e-06 [updatestate_assign_eliminate]: 3.95998e-06 [updatestate_loads_eliminate]: 2.85998e-06 [parameter_eliminate]: 2.43e-06 [a_2]: 0.00011638 [accelerated_algorithm]: 8.53001e-06 [shard]: 1.97999e-06 [meta_shard_fg_expand]: 2.86e-06 [shard_inline]: 7.30998e-06 [merge_send_recv]: 8.82e-06 [auto_parallel]: 8.68001e-06 [parallel]: 2.238e-05 [flash_sp]: 1.088e-05 [merge_comm]: 4.08999e-06 [allreduce_fusion]: 3.46999e-06 [matmul_add_comm_reduction]: 1.128e-05 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 1.01e-05 [virtual_dataset]: 7.96001e-06 [get_grad_eliminate_]: 7.97e-06 [virtual_output]: 6.91001e-06 [merge_forward]: 4.25e-06 [cell_reuse_recompute_pass]: 1.40001e-06 [offload_activation]: 1.075e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.723e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 1.181e-05 [set_forward_comm_id_for_comm_node_pass]: 3.93001e-06 [meta_fg_expand]: 3.65e-06 [flash_sp_send_recv_attached]: 3.16999e-06 [receive_attached]: 2.78e-06 [after_resolve]: 1.38e-05 [a_after_grad]: 1.105e-05 [renormalize]: 0.00126539 [add_forward_monad_depend]: 8.62998e-06 [auto_monad_grad]: 2.94999e-06 [auto_monad_eliminator]: 2.261e-05 [cse]: 3.14e-05 [a_3]: 7.635e-05 [Cycle 2]: 0.00092242, [45] [expand_dump_flag]: 1.71e-06 [switch_simplify]: 9.02999e-06 [loop_unroll]: 6.93998e-06 [a_1]: 0.0001521 [with_stream_mark]: 1.755e-05 [recompute_prepare]: 6.59999e-06 [updatestate_depend_eliminate]: 3.65e-06 [updatestate_assign_eliminate]: 2.61e-06 [updatestate_loads_eliminate]: 3.35998e-06 [parameter_eliminate]: 1.60999e-06 [a_2]: 0.00010894 [accelerated_algorithm]: 7.2e-06 [shard]: 2.17999e-06 [meta_shard_fg_expand]: 2.09e-06 [shard_inline]: 6.81001e-06 [merge_send_recv]: 7.77e-06 [auto_parallel]: 8.22998e-06 [parallel]: 8.15e-06 [flash_sp]: 3.96001e-06 [merge_comm]: 4.05e-06 [allreduce_fusion]: 3.43e-06 [matmul_add_comm_reduction]: 9.93998e-06 [allreduce_slice_to_reducescatter]: 6.79982e-07 [virtual_shard_identity]: 8.57e-06 [virtual_dataset]: 6.59001e-06 [get_grad_eliminate_]: 6.71e-06 [virtual_output]: 6.33998e-06 [merge_forward]: 3.81999e-06 [cell_reuse_recompute_pass]: 2.48e-06 [offload_activation]: 9.66e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.943e-05 [merge_recompute_call_nodes]: 1.27999e-06 [before_grad]: 1.092e-05 [set_forward_comm_id_for_comm_node_pass]: 4.27003e-06 [meta_fg_expand]: 2.79999e-06 [flash_sp_send_recv_attached]: 1.20999e-06 [receive_attached]: 1.49e-06 [after_resolve]: 1.29e-05 [a_after_grad]: 1.016e-05 [renormalize]: 1.19995e-07 [add_forward_monad_depend]: 2.17999e-06 [auto_monad_grad]: 1.29998e-06 [auto_monad_eliminator]: 1.174e-05 [cse]: 1.905e-05 [a_3]: 5.306e-05 [py_interpret_to_execute_after_opt_a]: 1.888e-05 [slice_cell_reuse_recomputed_activation]: 5.11002e-06 [rewriter_after_opt_a]: 4.979e-05 [convert_after_rewriter]: 1.116e-05 [order_py_execute_after_rewriter]: 9.42001e-06 [mutable_eliminate]: 0.00082057 [opt_b]: 0.00029364, [1] [Cycle 1]: 0.00028267, [7] [b_1]: 0.00017846 [b_2]: 8.12e-06 [updatestate_depend_eliminate]: 7.61999e-06 [updatestate_assign_eliminate]: 2.68e-06 [updatestate_loads_eliminate]: 2.89999e-06 [renormalize]: 5.69999e-07 [cse]: 2.382e-05 [optimize_parallel_all_gather_comm]: 2.225e-05 [overlap_param_gather]: 5.49998e-06 [cconv]: 3.725e-05 [loop_unroll]: 0.00047572 [opt_after_cconv]: 0.00014166, [1] [Cycle 1]: 0.00013078, [7] [c_1]: 3.538e-05 [parameter_eliminate]: 4e-06 [updatestate_depend_eliminate]: 7.01001e-06 [updatestate_assign_eliminate]: 2.57001e-06 [updatestate_loads_eliminate]: 2.31e-06 [cse]: 1.999e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 1.731e-05 [tuple_transform]: 0.00010426, [1] [Cycle 1]: 9.659e-05, [4] [d_1]: 5.352e-05 [none_parameter_eliminate]: 1.91e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 8.40001e-06 [partial_unused_args_eliminate]: 4.57e-06 [add_recomputation]: 5.686e-05 [cse_after_recomputation]: 3.073e-05, [1] [Cycle 1]: 2.348e-05, [1] [cse]: 1.264e-05 [environ_conv]: 8.92999e-06 [swap_dp_allreduce_reducescatter]: 9.31002e-06 [bias_add_comm_swap]: 5.67001e-06 [label_micro_interleaved_index]: 8.37e-06 [label_fine_grained_interleaved_index]: 6.02999e-06 [merge_cast_opt]: 4.37e-06 [slice_recompute_activation]: 4.42e-06 [micro_interleaved_order_control]: 4.95999e-06 [assign_add_opt]: 4.30999e-06 [ForceFp32Comm]: 3.14001e-06 [remove_cast_before_assign_add]: 3.3e-06 [full_micro_interleaved_order_control]: 5.18002e-06 [reorder_send_recv_between_fp_bp]: 5.97001e-06 [comm_op_add_attrs]: 3.80003e-06 [add_comm_op_reuse_tag]: 4.05e-06 [interleave_split_concat_branches]: 4.07e-06 [interleave_parallel_branches]: 3.61999e-06 [overlap_opt_shard_in_pipeline]: 4.35999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.73001e-06 [control_data_broadcast_order]: 1.893e-05 [grouped_pairwise_exchange_alltoall]: 4.53999e-06 [offloading_packed_experts]: 6.67002e-06 [overlap_recompute_and_grad_model_parallel]: 7.83001e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.21001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.81999e-06 [overlap_recompute_comm]: 4.63999e-06 [overlap_grad_ring_attention]: 7.07002e-06 [overlap_grad_flash_sp]: 2.564e-05 [begin_end_overlap_inline]: 4.06001e-06 [split_matmul_comm_elemetwise]: 4.82998e-06 [split_layernorm_comm]: 4.62998e-06 [handle_group_info]: 3.68e-06 [symbol_engine_optimizer]: 0.0001095, [1] [Cycle 1]: 0.00010219, [6] [build]: 3.87002e-06 [elim_shapecalc]: 1.413e-05 [elim_not_effective]: 1.493e-05 [opt_reshape]: 7.4e-06 [fold_const_symbol]: 1.106e-05 [renormalize]: 1.8999e-07 [detach_backward]: 4.95001e-06 [pipeline_parallel_scheduler]: 2.15002e-06 [auto_monad_reorder]: 2.045e-05 [get_jit_bprop_graph]: 2.14e-06 [rewriter_after_jit_bprop_graph]: 5.51998e-06 [opt_after_jit_grad]: 0.00054048 [validate]: 4.182e-05 Sums bootstrap : 0.028554s : 57.45% type_inference : 0.014423s : 29.02% event_method : 0.000060s : 0.12% auto_monad : 0.000110s : 0.22% graph_reusing : 0.000008s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000034s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000054s : 0.11% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000049s : 0.10% optimize.rewriter_before_opt_a : 0.000132s : 0.27% optimize.opt_a.expand_dump_flag : 0.000007s : 0.01% optimize.opt_a.switch_simplify : 0.000146s : 0.29% optimize.opt_a.loop_unroll : 0.000049s : 0.10% optimize.opt_a.a_1 : 0.001086s : 2.19% optimize.opt_a.with_stream_mark : 0.000044s : 0.09% optimize.opt_a.recompute_prepare : 0.000019s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000225s : 0.45% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.03% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.03% optimize.opt_a.merge_send_recv : 0.000017s : 0.03% optimize.opt_a.auto_parallel : 0.000017s : 0.03% optimize.opt_a.parallel : 0.000031s : 0.06% optimize.opt_a.flash_sp : 0.000015s : 0.03% optimize.opt_a.merge_comm : 0.000008s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.04% optimize.opt_a.virtual_dataset : 0.000015s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.03% optimize.opt_a.virtual_output : 0.000013s : 0.03% optimize.opt_a.merge_forward : 0.000008s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000020s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000023s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000027s : 0.05% optimize.opt_a.a_after_grad : 0.000021s : 0.04% optimize.opt_a.renormalize : 0.001266s : 2.55% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.07% optimize.opt_a.cse : 0.000050s : 0.10% optimize.opt_a.a_3 : 0.000129s : 0.26% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000050s : 0.10% optimize.convert_after_rewriter : 0.000011s : 0.02% optimize.order_py_execute_after_rewriter : 0.000009s : 0.02% optimize.mutable_eliminate : 0.000821s : 1.65% optimize.opt_b.b_1 : 0.000178s : 0.36% optimize.opt_b.b_2 : 0.000008s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.04% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000037s : 0.07% optimize.loop_unroll : 0.000476s : 0.96% optimize.opt_after_cconv.c_1 : 0.000035s : 0.07% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.04% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.03% optimize.tuple_transform.d_1 : 0.000054s : 0.11% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000057s : 0.11% optimize.cse_after_recomputation.cse : 0.000013s : 0.03% optimize.environ_conv : 0.000009s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000004s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000019s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000026s : 0.05% optimize.begin_end_overlap_inline : 0.000004s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000005s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000540s : 1.09% validate : 0.000042s : 0.08% Time group info: ------[substitution.] 0.000268 34 0.84% : 0.000002s : 2: substitution.elim_not_effective 0.58% : 0.000002s : 2: substitution.fold_const_symbol 2.50% : 0.000007s : 4: substitution.graph_param_transform 79.35% : 0.000212s : 8: substitution.inline 1.71% : 0.000005s : 4: substitution.j_node_and_user_rematch 1.98% : 0.000005s : 4: substitution.remove_not_recompute_node 2.56% : 0.000007s : 4: substitution.replace_old_param 3.96% : 0.000011s : 2: substitution.switch_simplify 6.52% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.014334 2 88.38% : 0.012667s : 1: type_inference.infer 11.62% : 0.001666s : 1: type_inference.specialize ------[replace.] 0.000140 14 41.63% : 0.000058s : 8: replace.inline 39.61% : 0.000055s : 2: replace.switch_simplify 18.76% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000232 14 89.38% : 0.000208s : 8: match.inline 4.03% : 0.000009s : 2: match.switch_simplify 6.59% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000260 1520 1.18% : 0.000003s : 17: predicate.accumulaten_eliminater 0.71% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 8: predicate.addn_check_dump 0.92% : 0.000002s : 17: predicate.addn_zero_filter 0.86% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.28% : 0.000006s : 25: predicate.arithmetic_simplify 0.99% : 0.000003s : 17: predicate.cast_eliminate 0.53% : 0.000001s : 8: predicate.check_bprop_eliminate 0.40% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.54% : 0.000001s : 8: predicate.depend_value_elim 0.98% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.32% : 0.000003s : 17: predicate.dict_get_item_eliminator 1.08% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.74% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.15% : 0.000000s : 4: predicate.elim_not_effective 0.35% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.31% : 0.000003s : 21: predicate.environ_get_depend_swap 1.65% : 0.000004s : 29: predicate.environ_get_eliminate 1.06% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.60% : 0.000004s : 29: predicate.exchange_switch_depend_value 2.79% : 0.000007s : 29: predicate.float_depend_g_call 0.45% : 0.000001s : 8: predicate.float_environ_get_switch 0.71% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.13% : 0.000000s : 4: predicate.fold_const_symbol 0.61% : 0.000002s : 8: predicate.get_grad_eliminate 0.17% : 0.000000s : 4: predicate.graph_param_transform 0.50% : 0.000001s : 8: predicate.incorporate_call 0.38% : 0.000001s : 8: predicate.incorporate_call_switch 6.11% : 0.000016s : 70: predicate.inline 0.60% : 0.000002s : 8: predicate.inline_without_move 0.26% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 8: predicate.less_batch_normalization 1.75% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.49% : 0.000006s : 46: predicate.load_eliminater 0.89% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.59% : 0.000007s : 47: predicate.loop_unroll_before_grad 1.64% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.47% : 0.000001s : 8: predicate.merge_addn 0.57% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.46% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 17: predicate.minmaximum_grad 1.09% : 0.000003s : 4: predicate.mutable_eliminate 0.31% : 0.000001s : 4: predicate.opt_reshape 0.38% : 0.000001s : 4: predicate.parallel_virtual_node 2.14% : 0.000006s : 29: predicate.partial_defer_inline 1.65% : 0.000004s : 25: predicate.partial_eliminate 1.02% : 0.000003s : 17: predicate.print_const_string_wrapper 0.49% : 0.000001s : 8: predicate.reduce_all_const_elim 1.34% : 0.000003s : 17: predicate.reduce_eliminate 2.52% : 0.000007s : 46: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 8: predicate.remove_not_recompute_node 1.39% : 0.000004s : 29: predicate.replace_applicator 0.34% : 0.000001s : 8: predicate.replace_old_param 0.29% : 0.000001s : 4: predicate.reset_defer_inline 1.13% : 0.000003s : 17: predicate.reshape_eliminate 0.57% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 4: predicate.row_tensor_eliminate 0.93% : 0.000002s : 8: predicate.same_eliminate 0.37% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.88% : 0.000002s : 8: predicate.shard_identity_eliminate 0.52% : 0.000001s : 8: predicate.special_op_eliminate 0.57% : 0.000001s : 8: predicate.specialize_transform 0.89% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.25% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.68% : 0.000004s : 29: predicate.switch_defer_inline 2.24% : 0.000006s : 37: predicate.switch_layer_defer_inline 5.65% : 0.000015s : 92: predicate.switch_simplify 1.08% : 0.000003s : 17: predicate.tile_eliminate 1.05% : 0.000003s : 17: predicate.transpose_eliminate 1.59% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.64% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.13% : 0.000008s : 37: predicate.tuple_list_get_item_eliminator 1.67% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000006s : 33: predicate.tuple_list_set_item_eliminator 1.78% : 0.000005s : 29: predicate.tuple_to_list_eliminator_ 2.41% : 0.000006s : 46: predicate.updatestate_pure_node_eliminater 2.91% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 4: predicate.value_based_eliminate 0.57% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.57% : 0.000001s : 8: predicate.virtual_output_eliminate 0.18% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.30% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001113 16 53.16% : 0.000592s : 6: func_graph_cloner_run.FuncGraphClonerGraph 46.84% : 0.000521s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.070037 192 0.01% : 0.000006s : 1: ForceFp32Comm 5.82% : 0.004076s : 1: add_attr 5.79% : 0.004055s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.09% : 0.000061s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.17% : 0.000121s : 1: auto_monad 0.04% : 0.000028s : 1: auto_monad_reorder 0.01% : 0.000007s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 40.85% : 0.028612s : 1: bootstrap 0.06% : 0.000041s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.03% : 0.000023s : 1: control_data_broadcast_order 0.02% : 0.000014s : 1: convert_after_rewriter 0.05% : 0.000034s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000027s : 1: detach_backward 0.02% : 0.000012s : 1: environ_conv 0.10% : 0.000073s : 1: event_method 0.01% : 0.000009s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000015s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.02% : 0.000012s : 1: label_micro_interleaved_index 0.69% : 0.000483s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.18% : 0.000828s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000020s : 1: opt.transform.mutable_eliminate 2.37% : 0.001662s : 78: opt.transform.opt_a 0.05% : 0.000034s : 1: opt.transform.opt_after_cconv 0.04% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.16% : 0.000111s : 28: opt.transform.opt_b 0.08% : 0.000059s : 2: opt.transform.opt_trans_graph 0.06% : 0.000043s : 4: opt.transform.symbol_engine_opt 6.15% : 0.004308s : 1: opt_a 0.21% : 0.000146s : 1: opt_after_cconv 0.79% : 0.000552s : 1: opt_after_jit_grad 0.42% : 0.000297s : 1: opt_b 10.79% : 0.007555s : 1: optimize 0.04% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000013s : 1: order_py_execute_after_rewriter 0.04% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000010s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000007s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.09% : 0.000062s : 1: pre_auto_parallel 0.08% : 0.000053s : 1: py_interpret_to_execute 0.03% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000021s : 1: remove_dup_value 0.94% : 0.000656s : 1: renormalize.infer 0.85% : 0.000597s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000055s : 1: rewriter_after_opt_a 0.20% : 0.000137s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.16% : 0.000113s : 1: symbol_engine_optimizer 0.15% : 0.000107s : 1: tuple_transform 20.67% : 0.014479s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:11.926.280 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.025232, [21] [bootstrap]: 0.00047805 [type_inference]: 0.0150972 [event_method]: 4.975e-05 [auto_monad]: 9.703e-05 [graph_reusing]: 7.03e-06 [inline]: 2.83998e-06 [add_attr]: 0.00317396, [1] [add_attr_with_inline]: 0.00316546, [1] [Cycle 1]: 6.842e-05, [2] [tag_attr]: 2.813e-05 [meta_addattr_fg_expand]: 8e-06 [parallel-infer-symbol]: 3.19001e-06 [pre_auto_parallel]: 4.236e-05 [insert-virtual-dataset]: 2.61e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 1.99e-06 [pipeline_split]: 2.24001e-06 [optimize]: 0.00541499, [53] [py_interpret_to_execute]: 3.286e-05 [rewriter_before_opt_a]: 0.00011102 [opt_a]: 0.0032155, [2] [Cycle 1]: 0.00255965, [45] [expand_dump_flag]: 4.09002e-06 [switch_simplify]: 0.00012473 [loop_unroll]: 4.13e-05 [a_1]: 0.00083807 [with_stream_mark]: 1.754e-05 [recompute_prepare]: 9.62001e-06 [updatestate_depend_eliminate]: 3.75998e-06 [updatestate_assign_eliminate]: 3.57002e-06 [updatestate_loads_eliminate]: 2.86999e-06 [parameter_eliminate]: 2.04999e-06 [a_2]: 8.618e-05 [accelerated_algorithm]: 7.38999e-06 [shard]: 1.69998e-06 [meta_shard_fg_expand]: 2.02999e-06 [shard_inline]: 6.72002e-06 [merge_send_recv]: 8.85999e-06 [auto_parallel]: 6.44001e-06 [parallel]: 1.806e-05 [flash_sp]: 7.57002e-06 [merge_comm]: 3.69002e-06 [allreduce_fusion]: 3.23998e-06 [matmul_add_comm_reduction]: 9.07001e-06 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 9.01002e-06 [virtual_dataset]: 6.98998e-06 [get_grad_eliminate_]: 6.24999e-06 [virtual_output]: 6.36e-06 [merge_forward]: 3.93001e-06 [cell_reuse_recompute_pass]: 1.17e-06 [offload_activation]: 1.019e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.303e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 1.048e-05 [set_forward_comm_id_for_comm_node_pass]: 3.5e-06 [meta_fg_expand]: 3.13e-06 [flash_sp_send_recv_attached]: 2.48e-06 [receive_attached]: 2.34001e-06 [after_resolve]: 1.157e-05 [a_after_grad]: 1.024e-05 [renormalize]: 0.0008876 [add_forward_monad_depend]: 5.37999e-06 [auto_monad_grad]: 2.54001e-06 [auto_monad_eliminator]: 1.641e-05 [cse]: 3.028e-05 [a_3]: 5.012e-05 [Cycle 2]: 0.0006456, [45] [expand_dump_flag]: 1.30999e-06 [switch_simplify]: 8.23001e-06 [loop_unroll]: 6.38003e-06 [a_1]: 0.00013535 [with_stream_mark]: 1.175e-05 [recompute_prepare]: 6.30002e-06 [updatestate_depend_eliminate]: 3.03e-06 [updatestate_assign_eliminate]: 2.41998e-06 [updatestate_loads_eliminate]: 2.89999e-06 [parameter_eliminate]: 1.12e-06 [a_2]: 7.359e-05 [accelerated_algorithm]: 6.19001e-06 [shard]: 1.49e-06 [meta_shard_fg_expand]: 1.27999e-06 [shard_inline]: 6.31e-06 [merge_send_recv]: 4.99998e-06 [auto_parallel]: 5.25001e-06 [parallel]: 5.24e-06 [flash_sp]: 3.41001e-06 [merge_comm]: 3.18998e-06 [allreduce_fusion]: 2.96999e-06 [matmul_add_comm_reduction]: 6.44001e-06 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 6.62002e-06 [virtual_dataset]: 5.96e-06 [get_grad_eliminate_]: 5.84999e-06 [virtual_output]: 5.46998e-06 [merge_forward]: 2.51998e-06 [cell_reuse_recompute_pass]: 1.62999e-06 [offload_activation]: 7.03e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.251e-05 [merge_recompute_call_nodes]: 1.10001e-06 [before_grad]: 9.50001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.48e-06 [meta_fg_expand]: 2.00002e-06 [flash_sp_send_recv_attached]: 9.70002e-07 [receive_attached]: 9.89996e-07 [after_resolve]: 1.103e-05 [a_after_grad]: 9.33002e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.13001e-06 [auto_monad_grad]: 1.22999e-06 [auto_monad_eliminator]: 7.15998e-06 [cse]: 1.269e-05 [a_3]: 3.678e-05 [py_interpret_to_execute_after_opt_a]: 1.014e-05 [slice_cell_reuse_recomputed_activation]: 2.74999e-06 [rewriter_after_opt_a]: 3.543e-05 [convert_after_rewriter]: 6.49999e-06 [order_py_execute_after_rewriter]: 5.20999e-06 [mutable_eliminate]: 0.0005278 [opt_b]: 0.00021081, [1] [Cycle 1]: 0.00020374, [7] [b_1]: 0.00012894 [b_2]: 8.37998e-06 [updatestate_depend_eliminate]: 6.02999e-06 [updatestate_assign_eliminate]: 2.36e-06 [updatestate_loads_eliminate]: 2.46e-06 [renormalize]: 6.19999e-07 [cse]: 1.817e-05 [optimize_parallel_all_gather_comm]: 1.618e-05 [overlap_param_gather]: 1.99e-06 [cconv]: 2.749e-05 [loop_unroll]: 0.00047051 [opt_after_cconv]: 0.00011174, [1] [Cycle 1]: 0.00010534, [7] [c_1]: 3.373e-05 [parameter_eliminate]: 4.15e-06 [updatestate_depend_eliminate]: 5.52999e-06 [updatestate_assign_eliminate]: 3.68e-06 [updatestate_loads_eliminate]: 2.34999e-06 [cse]: 2.058e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 1.383e-05 [tuple_transform]: 8.35e-05, [1] [Cycle 1]: 7.761e-05, [4] [d_1]: 4.922e-05 [none_parameter_eliminate]: 1.91003e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 7.51001e-06 [partial_unused_args_eliminate]: 2.46998e-06 [add_recomputation]: 5.389e-05 [cse_after_recomputation]: 2.205e-05, [1] [Cycle 1]: 1.633e-05, [1] [cse]: 1.061e-05 [environ_conv]: 5.82001e-06 [swap_dp_allreduce_reducescatter]: 5.12999e-06 [bias_add_comm_swap]: 2.59999e-06 [label_micro_interleaved_index]: 6.46e-06 [label_fine_grained_interleaved_index]: 2.98e-06 [merge_cast_opt]: 1.55001e-06 [slice_recompute_activation]: 2.17999e-06 [micro_interleaved_order_control]: 2.97002e-06 [assign_add_opt]: 1.40001e-06 [ForceFp32Comm]: 7.99977e-07 [remove_cast_before_assign_add]: 1.15999e-06 [full_micro_interleaved_order_control]: 2.41e-06 [reorder_send_recv_between_fp_bp]: 2.72001e-06 [comm_op_add_attrs]: 1.33002e-06 [add_comm_op_reuse_tag]: 1.13001e-06 [interleave_split_concat_branches]: 1.59998e-06 [interleave_parallel_branches]: 1.10001e-06 [overlap_opt_shard_in_pipeline]: 1.34998e-06 [overlap_opt_shard_grad_in_pipeline]: 1.94e-06 [control_data_broadcast_order]: 1.568e-05 [grouped_pairwise_exchange_alltoall]: 1.96e-06 [offloading_packed_experts]: 4.25999e-06 [overlap_recompute_and_grad_model_parallel]: 4.90999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.67001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.32999e-06 [overlap_grad_ring_attention]: 4.58999e-06 [overlap_grad_flash_sp]: 2.172e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.26e-06 [split_layernorm_comm]: 1.96e-06 [handle_group_info]: 9.30013e-07 [symbol_engine_optimizer]: 8.495e-05, [1] [Cycle 1]: 8.026e-05, [6] [build]: 3.91999e-06 [elim_shapecalc]: 1.142e-05 [elim_not_effective]: 1.646e-05 [opt_reshape]: 8.27e-06 [fold_const_symbol]: 1.088e-05 [renormalize]: 4.10015e-07 [detach_backward]: 2.26e-06 [pipeline_parallel_scheduler]: 1.65001e-06 [auto_monad_reorder]: 1.753e-05 [get_jit_bprop_graph]: 2.14e-06 [rewriter_after_jit_bprop_graph]: 5.57001e-06 [opt_after_jit_grad]: 0.00062052 [validate]: 4.369e-05 Sums bootstrap : 0.000478s : 2.27% type_inference : 0.015097s : 71.72% event_method : 0.000050s : 0.24% auto_monad : 0.000097s : 0.46% graph_reusing : 0.000007s : 0.03% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.13% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.04% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000042s : 0.20% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.16% optimize.rewriter_before_opt_a : 0.000111s : 0.53% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000133s : 0.63% optimize.opt_a.loop_unroll : 0.000048s : 0.23% optimize.opt_a.a_1 : 0.000973s : 4.62% optimize.opt_a.with_stream_mark : 0.000029s : 0.14% optimize.opt_a.recompute_prepare : 0.000016s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000160s : 0.76% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.06% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.02% optimize.opt_a.shard_inline : 0.000013s : 0.06% optimize.opt_a.merge_send_recv : 0.000014s : 0.07% optimize.opt_a.auto_parallel : 0.000012s : 0.06% optimize.opt_a.parallel : 0.000023s : 0.11% optimize.opt_a.flash_sp : 0.000011s : 0.05% optimize.opt_a.merge_comm : 0.000007s : 0.03% optimize.opt_a.allreduce_fusion : 0.000006s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.07% optimize.opt_a.virtual_dataset : 0.000013s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.06% optimize.opt_a.virtual_output : 0.000012s : 0.06% optimize.opt_a.merge_forward : 0.000006s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000017s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000020s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.03% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.02% optimize.opt_a.after_resolve : 0.000023s : 0.11% optimize.opt_a.a_after_grad : 0.000020s : 0.09% optimize.opt_a.renormalize : 0.000888s : 4.22% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.11% optimize.opt_a.cse : 0.000043s : 0.20% optimize.opt_a.a_3 : 0.000087s : 0.41% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000035s : 0.17% optimize.convert_after_rewriter : 0.000006s : 0.03% optimize.order_py_execute_after_rewriter : 0.000005s : 0.02% optimize.mutable_eliminate : 0.000528s : 2.51% optimize.opt_b.b_1 : 0.000129s : 0.61% optimize.opt_b.b_2 : 0.000008s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000018s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.08% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000027s : 0.13% optimize.loop_unroll : 0.000471s : 2.24% optimize.opt_after_cconv.c_1 : 0.000034s : 0.16% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000021s : 0.10% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.07% optimize.tuple_transform.d_1 : 0.000049s : 0.23% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.04% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000054s : 0.26% optimize.cse_after_recomputation.cse : 0.000011s : 0.05% optimize.environ_conv : 0.000006s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000006s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000002s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000022s : 0.10% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.08% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.08% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.03% opt_after_jit_grad : 0.000621s : 2.95% validate : 0.000044s : 0.21% Time group info: ------[substitution.] 0.000221 34 1.04% : 0.000002s : 2: substitution.elim_not_effective 0.78% : 0.000002s : 2: substitution.fold_const_symbol 2.83% : 0.000006s : 4: substitution.graph_param_transform 77.89% : 0.000172s : 8: substitution.inline 1.67% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.24% : 0.000005s : 4: substitution.remove_not_recompute_node 2.00% : 0.000004s : 4: substitution.replace_old_param 4.03% : 0.000009s : 2: substitution.switch_simplify 7.53% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.015022 2 90.68% : 0.013622s : 1: type_inference.infer 9.32% : 0.001400s : 1: type_inference.specialize ------[replace.] 0.000122 14 43.54% : 0.000053s : 8: replace.inline 38.20% : 0.000047s : 2: replace.switch_simplify 18.26% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000189 14 88.39% : 0.000167s : 8: match.inline 4.00% : 0.000008s : 2: match.switch_simplify 7.60% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000239 1520 0.97% : 0.000002s : 17: predicate.accumulaten_eliminater 0.71% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.42% : 0.000001s : 8: predicate.addn_check_dump 0.94% : 0.000002s : 17: predicate.addn_zero_filter 0.88% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.01% : 0.000005s : 25: predicate.arithmetic_simplify 1.03% : 0.000002s : 17: predicate.cast_eliminate 0.50% : 0.000001s : 8: predicate.check_bprop_eliminate 0.45% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.69% : 0.000002s : 8: predicate.depend_value_elim 1.08% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.23% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.95% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.79% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 4: predicate.elim_not_effective 0.33% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 21: predicate.environ_get_depend_swap 1.63% : 0.000004s : 29: predicate.environ_get_eliminate 1.26% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.70% : 0.000004s : 29: predicate.exchange_switch_depend_value 2.72% : 0.000007s : 29: predicate.float_depend_g_call 0.41% : 0.000001s : 8: predicate.float_environ_get_switch 0.63% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 4: predicate.fold_const_symbol 0.54% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.53% : 0.000001s : 8: predicate.incorporate_call 0.41% : 0.000001s : 8: predicate.incorporate_call_switch 6.57% : 0.000016s : 70: predicate.inline 0.65% : 0.000002s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.70% : 0.000002s : 8: predicate.less_batch_normalization 1.78% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.53% : 0.000006s : 46: predicate.load_eliminater 0.87% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.82% : 0.000007s : 47: predicate.loop_unroll_before_grad 1.61% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.48% : 0.000001s : 8: predicate.merge_addn 0.49% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.48% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 17: predicate.minmaximum_grad 0.98% : 0.000002s : 4: predicate.mutable_eliminate 0.38% : 0.000001s : 4: predicate.opt_reshape 0.28% : 0.000001s : 4: predicate.parallel_virtual_node 2.10% : 0.000005s : 29: predicate.partial_defer_inline 1.64% : 0.000004s : 25: predicate.partial_eliminate 0.99% : 0.000002s : 17: predicate.print_const_string_wrapper 0.46% : 0.000001s : 8: predicate.reduce_all_const_elim 1.26% : 0.000003s : 17: predicate.reduce_eliminate 2.61% : 0.000006s : 46: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 8: predicate.remove_not_recompute_node 1.41% : 0.000003s : 29: predicate.replace_applicator 0.43% : 0.000001s : 8: predicate.replace_old_param 0.26% : 0.000001s : 4: predicate.reset_defer_inline 1.04% : 0.000002s : 17: predicate.reshape_eliminate 0.51% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.61% : 0.000001s : 8: predicate.same_eliminate 0.38% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.60% : 0.000001s : 8: predicate.shard_identity_eliminate 0.61% : 0.000001s : 8: predicate.special_op_eliminate 0.60% : 0.000001s : 8: predicate.specialize_transform 0.65% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.65% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.86% : 0.000004s : 29: predicate.switch_defer_inline 2.25% : 0.000005s : 37: predicate.switch_layer_defer_inline 6.37% : 0.000015s : 92: predicate.switch_simplify 0.98% : 0.000002s : 17: predicate.tile_eliminate 0.97% : 0.000002s : 17: predicate.transpose_eliminate 1.69% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.06% : 0.000007s : 37: predicate.tuple_list_get_item_eliminator 1.54% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000005s : 33: predicate.tuple_list_set_item_eliminator 1.70% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.46% : 0.000006s : 46: predicate.updatestate_pure_node_eliminater 3.06% : 0.000007s : 54: predicate.updatestate_useless_node_eliminater 0.28% : 0.000001s : 4: predicate.value_based_eliminate 0.51% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.51% : 0.000001s : 8: predicate.virtual_output_eliminate 0.22% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001088 16 61.01% : 0.000664s : 6: func_graph_cloner_run.FuncGraphClonerGraph 38.99% : 0.000424s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.036358 192 0.01% : 0.000003s : 1: ForceFp32Comm 8.74% : 0.003178s : 1: add_attr 8.72% : 0.003169s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.16% : 0.000058s : 1: add_recomputation 0.01% : 0.000005s : 1: assign_add_opt 0.28% : 0.000103s : 1: auto_monad 0.06% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.40% : 0.000509s : 1: bootstrap 0.08% : 0.000031s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.05% : 0.000019s : 1: control_data_broadcast_order 0.03% : 0.000010s : 1: convert_after_rewriter 0.07% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.16% : 0.000059s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000011s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000007s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.32% : 0.000479s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 1.48% : 0.000538s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.04% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000016s : 1: opt.transform.mutable_eliminate 4.11% : 0.001494s : 78: opt.transform.opt_a 0.09% : 0.000032s : 1: opt.transform.opt_after_cconv 0.07% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.29% : 0.000105s : 28: opt.transform.opt_b 0.15% : 0.000054s : 2: opt.transform.opt_trans_graph 0.12% : 0.000043s : 4: opt.transform.symbol_engine_opt 8.85% : 0.003219s : 1: opt_a 0.32% : 0.000115s : 1: opt_after_cconv 1.74% : 0.000631s : 1: opt_after_jit_grad 0.59% : 0.000214s : 1: opt_b 14.91% : 0.005421s : 1: optimize 0.05% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000008s : 1: order_py_execute_after_rewriter 0.07% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.09% : 0.000033s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.13% : 0.000047s : 1: pre_auto_parallel 0.10% : 0.000037s : 1: py_interpret_to_execute 0.04% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000017s : 1: remove_dup_value 1.29% : 0.000470s : 1: renormalize.infer 1.12% : 0.000409s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000039s : 1: rewriter_after_opt_a 0.32% : 0.000116s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.24% : 0.000088s : 1: symbol_engine_optimizer 0.24% : 0.000086s : 1: tuple_transform 41.58% : 0.015117s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:12.399.441 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:12.399.707 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0398335, [21] [bootstrap]: 0.00044476 [type_inference]: 0.0142771 [event_method]: 9.069e-05 [auto_monad]: 0.00010347 [graph_reusing]: 7.70998e-06 [inline]: 2.49999e-06 [add_attr]: 0.00336741, [1] [add_attr_with_inline]: 0.00335759, [1] [Cycle 1]: 9.451e-05, [2] [tag_attr]: 2.932e-05 [meta_addattr_fg_expand]: 8.44002e-06 [parallel-infer-symbol]: 3.21001e-06 [pre_auto_parallel]: 4.707e-05 [insert-virtual-dataset]: 2.86e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 1.89999e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.0201514, [53] [py_interpret_to_execute]: 3.968e-05 [rewriter_before_opt_a]: 0.00012642 [opt_a]: 0.0172364, [2] [Cycle 1]: 0.0160978, [45] [expand_dump_flag]: 3.97e-06 [switch_simplify]: 0.00013218 [loop_unroll]: 4.338e-05 [a_1]: 0.00101205 [with_stream_mark]: 2.285e-05 [recompute_prepare]: 1.444e-05 [updatestate_depend_eliminate]: 5.35999e-06 [updatestate_assign_eliminate]: 4.12e-06 [updatestate_loads_eliminate]: 3.7e-06 [parameter_eliminate]: 2.54001e-06 [a_2]: 0.00013462 [accelerated_algorithm]: 9.77999e-06 [shard]: 3.09999e-06 [meta_shard_fg_expand]: 2.56e-06 [shard_inline]: 9.04e-06 [merge_send_recv]: 1.055e-05 [auto_parallel]: 7.75998e-06 [parallel]: 1.902e-05 [flash_sp]: 9.69e-06 [merge_comm]: 5.25999e-06 [allreduce_fusion]: 4.29002e-06 [matmul_add_comm_reduction]: 1.159e-05 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 1.024e-05 [virtual_dataset]: 8.53001e-06 [get_grad_eliminate_]: 7.61999e-06 [virtual_output]: 7.92e-06 [merge_forward]: 4.57998e-06 [cell_reuse_recompute_pass]: 1.25001e-06 [offload_activation]: 1.207e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.051e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.474e-05 [set_forward_comm_id_for_comm_node_pass]: 4.86002e-06 [meta_fg_expand]: 3.83001e-06 [flash_sp_send_recv_attached]: 3.3e-06 [receive_attached]: 2.74001e-06 [after_resolve]: 1.567e-05 [a_after_grad]: 1.25e-05 [renormalize]: 0.0138362 [add_forward_monad_depend]: 1.239e-05 [auto_monad_grad]: 2.94999e-06 [auto_monad_eliminator]: 2.845e-05 [cse]: 4.156e-05 [a_3]: 9.096e-05 [Cycle 2]: 0.00111962, [45] [expand_dump_flag]: 2.37999e-06 [switch_simplify]: 1.14e-05 [loop_unroll]: 8.82e-06 [a_1]: 0.00020892 [with_stream_mark]: 2.411e-05 [recompute_prepare]: 1.019e-05 [updatestate_depend_eliminate]: 6.04001e-06 [updatestate_assign_eliminate]: 4.11001e-06 [updatestate_loads_eliminate]: 3.51999e-06 [parameter_eliminate]: 2.76e-06 [a_2]: 0.00012618 [accelerated_algorithm]: 9.34998e-06 [shard]: 2.53e-06 [meta_shard_fg_expand]: 2.48e-06 [shard_inline]: 9.19e-06 [merge_send_recv]: 1.152e-05 [auto_parallel]: 1.062e-05 [parallel]: 9.79e-06 [flash_sp]: 4.52998e-06 [merge_comm]: 5.10999e-06 [allreduce_fusion]: 4.45e-06 [matmul_add_comm_reduction]: 1.21e-05 [allreduce_slice_to_reducescatter]: 6.79982e-07 [virtual_shard_identity]: 1.173e-05 [virtual_dataset]: 7.83001e-06 [get_grad_eliminate_]: 7.53999e-06 [virtual_output]: 7.66999e-06 [merge_forward]: 5.06002e-06 [cell_reuse_recompute_pass]: 3.78001e-06 [offload_activation]: 1.161e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.225e-05 [merge_recompute_call_nodes]: 1.62001e-06 [before_grad]: 1.367e-05 [set_forward_comm_id_for_comm_node_pass]: 6.19001e-06 [meta_fg_expand]: 3.33e-06 [flash_sp_send_recv_attached]: 1.97999e-06 [receive_attached]: 2.07001e-06 [after_resolve]: 1.654e-05 [a_after_grad]: 1.313e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 3.82998e-06 [auto_monad_grad]: 2.11e-06 [auto_monad_eliminator]: 1.315e-05 [cse]: 2.614e-05 [a_3]: 6.335e-05 [py_interpret_to_execute_after_opt_a]: 2.434e-05 [slice_cell_reuse_recomputed_activation]: 4.85999e-06 [rewriter_after_opt_a]: 5.613e-05 [convert_after_rewriter]: 1.201e-05 [order_py_execute_after_rewriter]: 9.35001e-06 [mutable_eliminate]: 0.00077605 [opt_b]: 0.00033653, [1] [Cycle 1]: 0.00032521, [7] [b_1]: 0.00020334 [b_2]: 1.108e-05 [updatestate_depend_eliminate]: 1.057e-05 [updatestate_assign_eliminate]: 3.46001e-06 [updatestate_loads_eliminate]: 3.14999e-06 [renormalize]: 7.49977e-07 [cse]: 3.109e-05 [optimize_parallel_all_gather_comm]: 2.314e-05 [overlap_param_gather]: 5.55001e-06 [cconv]: 4.022e-05 [loop_unroll]: 0.00050623 [opt_after_cconv]: 0.00015399, [1] [Cycle 1]: 0.00014436, [7] [c_1]: 3.988e-05 [parameter_eliminate]: 5.02999e-06 [updatestate_depend_eliminate]: 7.53e-06 [updatestate_assign_eliminate]: 3.09001e-06 [updatestate_loads_eliminate]: 3.00002e-06 [cse]: 2.703e-05 [renormalize]: 8.2e-07 [remove_dup_value]: 1.991e-05 [tuple_transform]: 0.00010936, [1] [Cycle 1]: 0.00010173, [4] [d_1]: 5.718e-05 [none_parameter_eliminate]: 2.21e-06 [renormalize]: 2.49973e-07 [switch_simplify]: 8.96002e-06 [partial_unused_args_eliminate]: 4.38001e-06 [add_recomputation]: 6.557e-05 [cse_after_recomputation]: 3.716e-05, [1] [Cycle 1]: 2.936e-05, [1] [cse]: 1.644e-05 [environ_conv]: 9.61998e-06 [swap_dp_allreduce_reducescatter]: 9.15999e-06 [bias_add_comm_swap]: 5.07999e-06 [label_micro_interleaved_index]: 7.48e-06 [label_fine_grained_interleaved_index]: 6.01e-06 [merge_cast_opt]: 3.91999e-06 [slice_recompute_activation]: 4.18999e-06 [micro_interleaved_order_control]: 4.63001e-06 [assign_add_opt]: 3.2e-06 [ForceFp32Comm]: 3.58e-06 [remove_cast_before_assign_add]: 3.17002e-06 [full_micro_interleaved_order_control]: 4.45e-06 [reorder_send_recv_between_fp_bp]: 6.33998e-06 [comm_op_add_attrs]: 3.14001e-06 [add_comm_op_reuse_tag]: 3.06999e-06 [interleave_split_concat_branches]: 3.78001e-06 [interleave_parallel_branches]: 3.60998e-06 [overlap_opt_shard_in_pipeline]: 1.557e-05 [overlap_opt_shard_grad_in_pipeline]: 4.32998e-06 [control_data_broadcast_order]: 2.214e-05 [grouped_pairwise_exchange_alltoall]: 4.4e-06 [offloading_packed_experts]: 7.61001e-06 [overlap_recompute_and_grad_model_parallel]: 7.97998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.55e-06 [overlap_recompute_allgather_and_fa_grad]: 3.71001e-06 [overlap_recompute_comm]: 4.68001e-06 [overlap_grad_ring_attention]: 7.97e-06 [overlap_grad_flash_sp]: 2.977e-05 [begin_end_overlap_inline]: 3.6e-06 [split_matmul_comm_elemetwise]: 4.92e-06 [split_layernorm_comm]: 3.97998e-06 [handle_group_info]: 3.54002e-06 [symbol_engine_optimizer]: 0.00011929, [1] [Cycle 1]: 0.00011101, [6] [build]: 3.81999e-06 [elim_shapecalc]: 1.619e-05 [elim_not_effective]: 1.834e-05 [opt_reshape]: 8.80999e-06 [fold_const_symbol]: 1.248e-05 [renormalize]: 5.10016e-07 [detach_backward]: 4.43001e-06 [pipeline_parallel_scheduler]: 2.21e-06 [auto_monad_reorder]: 2.618e-05 [get_jit_bprop_graph]: 2.39001e-06 [rewriter_after_jit_bprop_graph]: 6.49999e-06 [opt_after_jit_grad]: 0.00054195 [validate]: 4.724e-05 Sums bootstrap : 0.000445s : 1.29% type_inference : 0.014277s : 41.47% event_method : 0.000091s : 0.26% auto_monad : 0.000103s : 0.30% graph_reusing : 0.000008s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000029s : 0.09% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000047s : 0.14% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000040s : 0.12% optimize.rewriter_before_opt_a : 0.000126s : 0.37% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000144s : 0.42% optimize.opt_a.loop_unroll : 0.000052s : 0.15% optimize.opt_a.a_1 : 0.001221s : 3.55% optimize.opt_a.with_stream_mark : 0.000047s : 0.14% optimize.opt_a.recompute_prepare : 0.000025s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000005s : 0.02% optimize.opt_a.a_2 : 0.000261s : 0.76% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.06% optimize.opt_a.shard : 0.000006s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000018s : 0.05% optimize.opt_a.merge_send_recv : 0.000022s : 0.06% optimize.opt_a.auto_parallel : 0.000018s : 0.05% optimize.opt_a.parallel : 0.000029s : 0.08% optimize.opt_a.flash_sp : 0.000014s : 0.04% optimize.opt_a.merge_comm : 0.000010s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.06% optimize.opt_a.virtual_dataset : 0.000016s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000016s : 0.05% optimize.opt_a.merge_forward : 0.000010s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000024s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000043s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000028s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000032s : 0.09% optimize.opt_a.a_after_grad : 0.000026s : 0.07% optimize.opt_a.renormalize : 0.013836s : 40.19% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.05% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000042s : 0.12% optimize.opt_a.cse : 0.000068s : 0.20% optimize.opt_a.a_3 : 0.000154s : 0.45% optimize.py_interpret_to_execute_after_opt_a : 0.000024s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000056s : 0.16% optimize.convert_after_rewriter : 0.000012s : 0.03% optimize.order_py_execute_after_rewriter : 0.000009s : 0.03% optimize.mutable_eliminate : 0.000776s : 2.25% optimize.opt_b.b_1 : 0.000203s : 0.59% optimize.opt_b.b_2 : 0.000011s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000031s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.07% optimize.overlap_param_gather : 0.000006s : 0.02% optimize.cconv : 0.000040s : 0.12% optimize.loop_unroll : 0.000506s : 1.47% optimize.opt_after_cconv.c_1 : 0.000040s : 0.12% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000027s : 0.08% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.06% optimize.tuple_transform.d_1 : 0.000057s : 0.17% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000004s : 0.01% optimize.add_recomputation : 0.000066s : 0.19% optimize.cse_after_recomputation.cse : 0.000016s : 0.05% optimize.environ_conv : 0.000010s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000005s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000004s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000003s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000004s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.02% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000016s : 0.05% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000022s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000008s : 0.02% optimize.overlap_grad_flash_sp : 0.000030s : 0.09% optimize.begin_end_overlap_inline : 0.000004s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000026s : 0.08% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000542s : 1.57% validate : 0.000047s : 0.14% Time group info: ------[substitution.] 0.000289 44 11.36% : 0.000033s : 3: substitution.cast_eliminate 0.77% : 0.000002s : 3: substitution.elim_not_effective 0.59% : 0.000002s : 3: substitution.fold_const_symbol 2.40% : 0.000007s : 5: substitution.graph_param_transform 68.25% : 0.000197s : 8: substitution.inline 2.31% : 0.000007s : 6: substitution.j_node_and_user_rematch 2.26% : 0.000007s : 6: substitution.remove_not_recompute_node 2.82% : 0.000008s : 4: substitution.replace_old_param 3.48% : 0.000010s : 2: substitution.switch_simplify 5.76% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.014203 2 89.28% : 0.012681s : 1: type_inference.infer 10.72% : 0.001523s : 1: type_inference.specialize ------[replace.] 0.000138 14 41.60% : 0.000057s : 8: replace.inline 36.86% : 0.000051s : 2: replace.switch_simplify 21.54% : 0.000030s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000215 14 89.21% : 0.000192s : 8: match.inline 4.09% : 0.000009s : 2: match.switch_simplify 6.70% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000307 1838 0.98% : 0.000003s : 21: predicate.accumulaten_eliminater 0.66% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 10: predicate.addn_check_dump 1.01% : 0.000003s : 21: predicate.addn_zero_filter 0.87% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.21% : 0.000007s : 31: predicate.arithmetic_simplify 1.21% : 0.000004s : 21: predicate.cast_eliminate 0.52% : 0.000002s : 10: predicate.check_bprop_eliminate 0.46% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000001s : 5: predicate.const_output_eliminate 0.52% : 0.000002s : 10: predicate.depend_value_elim 1.15% : 0.000004s : 21: predicate.dict_get_item_const_eliminator 1.26% : 0.000004s : 21: predicate.dict_get_item_eliminator 0.98% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.78% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 5: predicate.elim_not_effective 0.32% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000004s : 26: predicate.environ_add_const_eliminate 1.15% : 0.000004s : 26: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 26: predicate.environ_get_depend_swap 1.68% : 0.000005s : 36: predicate.environ_get_eliminate 1.17% : 0.000004s : 26: predicate.environ_get_set_eliminate 1.60% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.79% : 0.000009s : 33: predicate.float_depend_g_call 0.45% : 0.000001s : 10: predicate.float_environ_get_switch 0.70% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 5: predicate.fold_const_symbol 0.55% : 0.000002s : 10: predicate.get_grad_eliminate 0.17% : 0.000001s : 5: predicate.graph_param_transform 0.51% : 0.000002s : 10: predicate.incorporate_call 0.44% : 0.000001s : 10: predicate.incorporate_call_switch 6.63% : 0.000020s : 84: predicate.inline 0.78% : 0.000002s : 10: predicate.inline_without_move 0.26% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.75% : 0.000002s : 10: predicate.less_batch_normalization 1.69% : 0.000005s : 35: predicate.list_to_tuple_eliminator_ 2.41% : 0.000007s : 56: predicate.load_eliminater 0.81% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.37% : 0.000007s : 49: predicate.loop_unroll_before_grad 1.59% : 0.000005s : 31: predicate.make_slice_get_slice_eliminator 0.46% : 0.000001s : 10: predicate.merge_addn 0.48% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.46% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000003s : 21: predicate.minmaximum_grad 0.87% : 0.000003s : 5: predicate.mutable_eliminate 0.26% : 0.000001s : 5: predicate.opt_reshape 0.29% : 0.000001s : 5: predicate.parallel_virtual_node 2.11% : 0.000006s : 33: predicate.partial_defer_inline 1.62% : 0.000005s : 30: predicate.partial_eliminate 0.96% : 0.000003s : 21: predicate.print_const_string_wrapper 0.49% : 0.000002s : 10: predicate.reduce_all_const_elim 1.36% : 0.000004s : 21: predicate.reduce_eliminate 2.44% : 0.000007s : 56: predicate.redundant_stop_gradient_eliminater 0.31% : 0.000001s : 10: predicate.remove_not_recompute_node 1.31% : 0.000004s : 35: predicate.replace_applicator 0.76% : 0.000002s : 10: predicate.replace_old_param 0.40% : 0.000001s : 5: predicate.reset_defer_inline 1.24% : 0.000004s : 21: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.79% : 0.000002s : 10: predicate.same_eliminate 0.50% : 0.000002s : 10: predicate.set_cell_output_no_recompute 0.89% : 0.000003s : 10: predicate.shard_identity_eliminate 0.58% : 0.000002s : 10: predicate.special_op_eliminate 0.73% : 0.000002s : 10: predicate.specialize_transform 0.91% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.64% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.89% : 0.000006s : 33: predicate.switch_defer_inline 2.22% : 0.000007s : 43: predicate.switch_layer_defer_inline 5.47% : 0.000017s : 101: predicate.switch_simplify 1.06% : 0.000003s : 21: predicate.tile_eliminate 1.12% : 0.000003s : 21: predicate.transpose_eliminate 1.60% : 0.000005s : 31: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000005s : 31: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000004s : 31: predicate.tuple_list_get_item_depend_reorder 2.95% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.45% : 0.000004s : 31: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.66% : 0.000005s : 35: predicate.tuple_to_list_eliminator_ 2.45% : 0.000008s : 56: predicate.updatestate_pure_node_eliminater 3.02% : 0.000009s : 66: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 5: predicate.value_based_eliminate 0.63% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.50% : 0.000002s : 10: predicate.virtual_output_eliminate 0.22% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001213 16 50.16% : 0.000608s : 6: func_graph_cloner_run.FuncGraphClonerGraph 49.84% : 0.000604s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.079286 192 0.01% : 0.000006s : 1: ForceFp32Comm 4.26% : 0.003377s : 1: add_attr 4.24% : 0.003362s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.09% : 0.000071s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.14% : 0.000115s : 1: auto_monad 0.04% : 0.000035s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.61% : 0.000486s : 1: bootstrap 0.06% : 0.000044s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.03% : 0.000027s : 1: control_data_broadcast_order 0.02% : 0.000016s : 1: convert_after_rewriter 0.05% : 0.000041s : 1: cse_after_recomputation 0.01% : 0.000007s : 1: dataset_repeat_opt 0.03% : 0.000026s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.14% : 0.000108s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000015s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.65% : 0.000514s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 0.99% : 0.000785s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.02% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000022s : 1: opt.transform.mutable_eliminate 2.39% : 0.001893s : 78: opt.transform.opt_a 0.05% : 0.000038s : 1: opt.transform.opt_after_cconv 0.04% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.18% : 0.000139s : 28: opt.transform.opt_b 0.08% : 0.000064s : 2: opt.transform.opt_trans_graph 0.06% : 0.000051s : 4: opt.transform.symbol_engine_opt 21.74% : 0.017241s : 1: opt_a 0.20% : 0.000158s : 1: opt_after_cconv 0.70% : 0.000553s : 1: opt_after_jit_grad 0.43% : 0.000341s : 1: opt_b 25.96% : 0.020579s : 1: optimize 0.03% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000013s : 1: order_py_execute_after_rewriter 0.04% : 0.000034s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000019s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.07% : 0.000056s : 1: pre_auto_parallel 0.06% : 0.000044s : 1: py_interpret_to_execute 0.04% : 0.000028s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000023s : 1: remove_dup_value 16.51% : 0.013093s : 1: renormalize.infer 0.91% : 0.000724s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000061s : 1: rewriter_after_opt_a 0.17% : 0.000131s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000123s : 1: symbol_engine_optimizer 0.14% : 0.000112s : 1: tuple_transform 18.06% : 0.014320s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:12.954.289 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0359958, [21] [bootstrap]: 0.00044026 [type_inference]: 0.0135841 [event_method]: 0.00010654 [auto_monad]: 0.00010633 [graph_reusing]: 7.43e-06 [inline]: 2.70997e-06 [add_attr]: 0.00389071, [1] [add_attr_with_inline]: 0.00387807, [1] [Cycle 1]: 9.149e-05, [2] [tag_attr]: 3.472e-05 [meta_addattr_fg_expand]: 8.59998e-06 [parallel-infer-symbol]: 3.27002e-06 [pre_auto_parallel]: 5.172e-05 [insert-virtual-dataset]: 2.63e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 1.97999e-06 [pipeline_split]: 1.71e-06 [optimize]: 0.0170638, [53] [py_interpret_to_execute]: 4.11e-05 [rewriter_before_opt_a]: 0.00012902 [opt_a]: 0.0145775, [2] [Cycle 1]: 0.0137176, [45] [expand_dump_flag]: 4e-06 [switch_simplify]: 0.00014096 [loop_unroll]: 4.29e-05 [a_1]: 0.00106726 [with_stream_mark]: 2.336e-05 [recompute_prepare]: 1.349e-05 [updatestate_depend_eliminate]: 5.62999e-06 [updatestate_assign_eliminate]: 3.85e-06 [updatestate_loads_eliminate]: 3.57997e-06 [parameter_eliminate]: 2.24001e-06 [a_2]: 0.00010789 [accelerated_algorithm]: 1.076e-05 [shard]: 2.68998e-06 [meta_shard_fg_expand]: 3.36001e-06 [shard_inline]: 8.73001e-06 [merge_send_recv]: 1.063e-05 [auto_parallel]: 9.85002e-06 [parallel]: 2.121e-05 [flash_sp]: 1.088e-05 [merge_comm]: 5.22e-06 [allreduce_fusion]: 4.37998e-06 [matmul_add_comm_reduction]: 1.198e-05 [allreduce_slice_to_reducescatter]: 7.40023e-07 [virtual_shard_identity]: 1.204e-05 [virtual_dataset]: 9.19e-06 [get_grad_eliminate_]: 8.12e-06 [virtual_output]: 8.11002e-06 [merge_forward]: 4.66002e-06 [cell_reuse_recompute_pass]: 1.17999e-06 [offload_activation]: 1.214e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.786e-05 [merge_recompute_call_nodes]: 1.50001e-06 [before_grad]: 1.521e-05 [set_forward_comm_id_for_comm_node_pass]: 4.63999e-06 [meta_fg_expand]: 4.67998e-06 [flash_sp_send_recv_attached]: 2.60002e-06 [receive_attached]: 2.63e-06 [after_resolve]: 1.45e-05 [a_after_grad]: 1.401e-05 [renormalize]: 0.0116022 [add_forward_monad_depend]: 1.114e-05 [auto_monad_grad]: 2.84001e-06 [auto_monad_eliminator]: 2.572e-05 [cse]: 4.042e-05 [a_3]: 7.681e-05 [Cycle 2]: 0.00084447, [45] [expand_dump_flag]: 2.29999e-06 [switch_simplify]: 1.107e-05 [loop_unroll]: 7.92998e-06 [a_1]: 0.00020144 [with_stream_mark]: 1.974e-05 [recompute_prepare]: 8.54998e-06 [updatestate_depend_eliminate]: 5.37001e-06 [updatestate_assign_eliminate]: 3.53999e-06 [updatestate_loads_eliminate]: 3.35e-06 [parameter_eliminate]: 2.11e-06 [a_2]: 9.659e-05 [accelerated_algorithm]: 7.8e-06 [shard]: 2.86999e-06 [meta_shard_fg_expand]: 2.82002e-06 [shard_inline]: 7.92003e-06 [merge_send_recv]: 9.52001e-06 [auto_parallel]: 1.004e-05 [parallel]: 9.69e-06 [flash_sp]: 3.82998e-06 [merge_comm]: 4.12e-06 [allreduce_fusion]: 4.01001e-06 [matmul_add_comm_reduction]: 1.093e-05 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 8.99e-06 [virtual_dataset]: 7.45998e-06 [get_grad_eliminate_]: 7.56999e-06 [virtual_output]: 7.06999e-06 [merge_forward]: 5.05001e-06 [cell_reuse_recompute_pass]: 3.03e-06 [offload_activation]: 1.072e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.498e-05 [merge_recompute_call_nodes]: 1.60001e-06 [before_grad]: 1.293e-05 [set_forward_comm_id_for_comm_node_pass]: 4.92e-06 [meta_fg_expand]: 3.13998e-06 [flash_sp_send_recv_attached]: 1.97999e-06 [receive_attached]: 2.17999e-06 [after_resolve]: 1.339e-05 [a_after_grad]: 1.21e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.64e-06 [auto_monad_grad]: 9.70002e-07 [auto_monad_eliminator]: 8.75001e-06 [cse]: 1.955e-05 [a_3]: 4.639e-05 [py_interpret_to_execute_after_opt_a]: 1.948e-05 [slice_cell_reuse_recomputed_activation]: 1.87001e-06 [rewriter_after_opt_a]: 4.458e-05 [convert_after_rewriter]: 7.6e-06 [order_py_execute_after_rewriter]: 5.84e-06 [mutable_eliminate]: 0.0007331 [opt_b]: 0.000257, [1] [Cycle 1]: 0.00024881, [7] [b_1]: 0.00016 [b_2]: 9.49e-06 [updatestate_depend_eliminate]: 7.62998e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 3.16999e-06 [renormalize]: 7.00005e-07 [cse]: 2.706e-05 [optimize_parallel_all_gather_comm]: 1.876e-05 [overlap_param_gather]: 1.92001e-06 [cconv]: 2.823e-05 [loop_unroll]: 0.00044128 [opt_after_cconv]: 0.0001185, [1] [Cycle 1]: 0.00011269, [7] [c_1]: 3.946e-05 [parameter_eliminate]: 3.14001e-06 [updatestate_depend_eliminate]: 6.09001e-06 [updatestate_assign_eliminate]: 3.69002e-06 [updatestate_loads_eliminate]: 2.74999e-06 [cse]: 2.261e-05 [renormalize]: 3.59985e-07 [remove_dup_value]: 1.525e-05 [tuple_transform]: 8.904e-05, [1] [Cycle 1]: 8.364e-05, [4] [d_1]: 5.401e-05 [none_parameter_eliminate]: 2.06e-06 [renormalize]: 2.29978e-07 [switch_simplify]: 8.84e-06 [partial_unused_args_eliminate]: 2.19999e-06 [add_recomputation]: 5.782e-05 [cse_after_recomputation]: 2.589e-05, [1] [Cycle 1]: 2.108e-05, [1] [cse]: 1.509e-05 [environ_conv]: 6.07999e-06 [swap_dp_allreduce_reducescatter]: 6.02999e-06 [bias_add_comm_swap]: 2.88e-06 [label_micro_interleaved_index]: 4.21001e-06 [label_fine_grained_interleaved_index]: 2.66e-06 [merge_cast_opt]: 1.47001e-06 [slice_recompute_activation]: 2.30002e-06 [micro_interleaved_order_control]: 2.49999e-06 [assign_add_opt]: 1.09998e-06 [ForceFp32Comm]: 8.30012e-07 [remove_cast_before_assign_add]: 1.01002e-06 [full_micro_interleaved_order_control]: 2.05002e-06 [reorder_send_recv_between_fp_bp]: 2.74001e-06 [comm_op_add_attrs]: 9.70002e-07 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.37e-06 [overlap_opt_shard_in_pipeline]: 1.10001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.94e-06 [control_data_broadcast_order]: 1.388e-05 [grouped_pairwise_exchange_alltoall]: 1.48002e-06 [offloading_packed_experts]: 3.63999e-06 [overlap_recompute_and_grad_model_parallel]: 6.32001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.16997e-06 [overlap_recompute_allgather_and_fa_grad]: 1.60999e-06 [overlap_recompute_comm]: 2.26e-06 [overlap_grad_ring_attention]: 5.29e-06 [overlap_grad_flash_sp]: 2.57e-05 [begin_end_overlap_inline]: 6.40022e-07 [split_matmul_comm_elemetwise]: 2.09999e-06 [split_layernorm_comm]: 1.62001e-06 [handle_group_info]: 9.79984e-07 [symbol_engine_optimizer]: 9.058e-05, [1] [Cycle 1]: 8.543e-05, [6] [build]: 3.68e-06 [elim_shapecalc]: 1.253e-05 [elim_not_effective]: 1.742e-05 [opt_reshape]: 8.90001e-06 [fold_const_symbol]: 1.283e-05 [renormalize]: 2.3999e-07 [detach_backward]: 2.09999e-06 [pipeline_parallel_scheduler]: 1.54e-06 [auto_monad_reorder]: 2.071e-05 [get_jit_bprop_graph]: 1.67001e-06 [rewriter_after_jit_bprop_graph]: 5.52001e-06 [opt_after_jit_grad]: 0.00047858 [validate]: 4.692e-05 Sums bootstrap : 0.000440s : 1.42% type_inference : 0.013584s : 43.78% event_method : 0.000107s : 0.34% auto_monad : 0.000106s : 0.34% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000035s : 0.11% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.03% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000052s : 0.17% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000041s : 0.13% optimize.rewriter_before_opt_a : 0.000129s : 0.42% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000152s : 0.49% optimize.opt_a.loop_unroll : 0.000051s : 0.16% optimize.opt_a.a_1 : 0.001269s : 4.09% optimize.opt_a.with_stream_mark : 0.000043s : 0.14% optimize.opt_a.recompute_prepare : 0.000022s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000204s : 0.66% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.06% optimize.opt_a.shard : 0.000006s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.02% optimize.opt_a.shard_inline : 0.000017s : 0.05% optimize.opt_a.merge_send_recv : 0.000020s : 0.06% optimize.opt_a.auto_parallel : 0.000020s : 0.06% optimize.opt_a.parallel : 0.000031s : 0.10% optimize.opt_a.flash_sp : 0.000015s : 0.05% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.07% optimize.opt_a.virtual_dataset : 0.000017s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.05% optimize.opt_a.virtual_output : 0.000015s : 0.05% optimize.opt_a.merge_forward : 0.000010s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000028s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.03% optimize.opt_a.meta_fg_expand : 0.000008s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000028s : 0.09% optimize.opt_a.a_after_grad : 0.000026s : 0.08% optimize.opt_a.renormalize : 0.011602s : 37.39% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.11% optimize.opt_a.cse : 0.000060s : 0.19% optimize.opt_a.a_3 : 0.000123s : 0.40% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000045s : 0.14% optimize.convert_after_rewriter : 0.000008s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000733s : 2.36% optimize.opt_b.b_1 : 0.000160s : 0.52% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000028s : 0.09% optimize.loop_unroll : 0.000441s : 1.42% optimize.opt_after_cconv.c_1 : 0.000039s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000023s : 0.07% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.05% optimize.tuple_transform.d_1 : 0.000054s : 0.17% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000058s : 0.19% optimize.cse_after_recomputation.cse : 0.000015s : 0.05% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000026s : 0.08% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000479s : 1.54% validate : 0.000047s : 0.15% Time group info: ------[substitution.] 0.000306 44 10.09% : 0.000031s : 3: substitution.cast_eliminate 0.92% : 0.000003s : 3: substitution.elim_not_effective 0.65% : 0.000002s : 3: substitution.fold_const_symbol 2.21% : 0.000007s : 5: substitution.graph_param_transform 70.19% : 0.000215s : 8: substitution.inline 1.89% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.02% : 0.000006s : 6: substitution.remove_not_recompute_node 2.12% : 0.000006s : 4: substitution.replace_old_param 3.75% : 0.000011s : 2: substitution.switch_simplify 6.16% : 0.000019s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.013500 2 88.64% : 0.011966s : 1: type_inference.infer 11.36% : 0.001534s : 1: type_inference.specialize ------[replace.] 0.000143 14 40.31% : 0.000058s : 8: replace.inline 39.04% : 0.000056s : 2: replace.switch_simplify 20.65% : 0.000030s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000236 14 88.94% : 0.000210s : 8: match.inline 4.31% : 0.000010s : 2: match.switch_simplify 6.75% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000335 1838 0.97% : 0.000003s : 21: predicate.accumulaten_eliminater 0.65% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.41% : 0.000001s : 10: predicate.addn_check_dump 0.90% : 0.000003s : 21: predicate.addn_zero_filter 0.81% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.03% : 0.000007s : 31: predicate.arithmetic_simplify 1.07% : 0.000004s : 21: predicate.cast_eliminate 0.43% : 0.000001s : 10: predicate.check_bprop_eliminate 0.44% : 0.000001s : 10: predicate.compare_switch_simplify 0.14% : 0.000000s : 5: predicate.const_output_eliminate 0.46% : 0.000002s : 10: predicate.depend_value_elim 0.99% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.15% : 0.000004s : 21: predicate.dict_get_item_eliminator 0.88% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.64% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.14% : 0.000000s : 5: predicate.elim_not_effective 0.27% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000004s : 26: predicate.environ_add_const_eliminate 1.05% : 0.000004s : 26: predicate.environ_get_add_eliminate 1.18% : 0.000004s : 26: predicate.environ_get_depend_swap 1.55% : 0.000005s : 36: predicate.environ_get_eliminate 1.08% : 0.000004s : 26: predicate.environ_get_set_eliminate 1.50% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.28% : 0.000008s : 33: predicate.float_depend_g_call 0.39% : 0.000001s : 10: predicate.float_environ_get_switch 0.73% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.12% : 0.000000s : 5: predicate.fold_const_symbol 0.52% : 0.000002s : 10: predicate.get_grad_eliminate 0.16% : 0.000001s : 5: predicate.graph_param_transform 0.47% : 0.000002s : 10: predicate.incorporate_call 0.39% : 0.000001s : 10: predicate.incorporate_call_switch 5.61% : 0.000019s : 84: predicate.inline 0.70% : 0.000002s : 10: predicate.inline_without_move 0.24% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.66% : 0.000002s : 10: predicate.less_batch_normalization 1.86% : 0.000006s : 35: predicate.list_to_tuple_eliminator_ 2.50% : 0.000008s : 56: predicate.load_eliminater 0.66% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.14% : 0.000007s : 49: predicate.loop_unroll_before_grad 1.49% : 0.000005s : 31: predicate.make_slice_get_slice_eliminator 0.57% : 0.000002s : 10: predicate.merge_addn 0.45% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.44% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.90% : 0.000003s : 21: predicate.minmaximum_grad 0.84% : 0.000003s : 5: predicate.mutable_eliminate 0.31% : 0.000001s : 5: predicate.opt_reshape 0.28% : 0.000001s : 5: predicate.parallel_virtual_node 1.95% : 0.000007s : 33: predicate.partial_defer_inline 1.51% : 0.000005s : 30: predicate.partial_eliminate 0.98% : 0.000003s : 21: predicate.print_const_string_wrapper 0.45% : 0.000002s : 10: predicate.reduce_all_const_elim 1.36% : 0.000005s : 21: predicate.reduce_eliminate 2.32% : 0.000008s : 56: predicate.redundant_stop_gradient_eliminater 0.29% : 0.000001s : 10: predicate.remove_not_recompute_node 1.22% : 0.000004s : 35: predicate.replace_applicator 0.52% : 0.000002s : 10: predicate.replace_old_param 0.19% : 0.000001s : 5: predicate.reset_defer_inline 0.99% : 0.000003s : 21: predicate.reshape_eliminate 0.44% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.26% : 0.000001s : 5: predicate.row_tensor_eliminate 0.65% : 0.000002s : 10: predicate.same_eliminate 0.46% : 0.000002s : 10: predicate.set_cell_output_no_recompute 0.61% : 0.000002s : 10: predicate.shard_identity_eliminate 0.54% : 0.000002s : 10: predicate.special_op_eliminate 0.62% : 0.000002s : 10: predicate.specialize_transform 0.68% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.73% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.26% : 0.000001s : 5: predicate.switch_call_monad_eliminater 9.75% : 0.000033s : 33: predicate.switch_defer_inline 1.95% : 0.000007s : 43: predicate.switch_layer_defer_inline 4.98% : 0.000017s : 101: predicate.switch_simplify 1.11% : 0.000004s : 21: predicate.tile_eliminate 0.95% : 0.000003s : 21: predicate.transpose_eliminate 1.39% : 0.000005s : 31: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000006s : 31: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000005s : 31: predicate.tuple_list_get_item_depend_reorder 2.99% : 0.000010s : 45: predicate.tuple_list_get_item_eliminator 1.49% : 0.000005s : 31: predicate.tuple_list_get_set_item_eliminator 2.15% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.70% : 0.000006s : 35: predicate.tuple_to_list_eliminator_ 2.23% : 0.000007s : 56: predicate.updatestate_pure_node_eliminater 2.80% : 0.000009s : 66: predicate.updatestate_useless_node_eliminater 0.27% : 0.000001s : 5: predicate.value_based_eliminate 0.51% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.48% : 0.000002s : 10: predicate.virtual_output_eliminate 0.20% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.30% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001107 16 49.14% : 0.000544s : 6: func_graph_cloner_run.FuncGraphClonerGraph 50.86% : 0.000563s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.070658 192 0.00% : 0.000003s : 1: ForceFp32Comm 5.52% : 0.003898s : 1: add_attr 5.50% : 0.003883s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000062s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.16% : 0.000115s : 1: auto_monad 0.03% : 0.000025s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.66% : 0.000469s : 1: bootstrap 0.05% : 0.000032s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000017s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.04% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.16% : 0.000116s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.64% : 0.000450s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.05% : 0.000743s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000020s : 1: opt.transform.mutable_eliminate 2.73% : 0.001927s : 78: opt.transform.opt_a 0.05% : 0.000038s : 1: opt.transform.opt_after_cconv 0.04% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.19% : 0.000136s : 28: opt.transform.opt_b 0.09% : 0.000060s : 2: opt.transform.opt_trans_graph 0.07% : 0.000047s : 4: opt.transform.symbol_engine_opt 20.64% : 0.014581s : 1: opt_a 0.17% : 0.000122s : 1: opt_after_cconv 0.69% : 0.000488s : 1: opt_after_jit_grad 0.37% : 0.000261s : 1: opt_b 24.16% : 0.017070s : 1: optimize 0.03% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000015s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.08% : 0.000056s : 1: pre_auto_parallel 0.06% : 0.000045s : 1: py_interpret_to_execute 0.03% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000003s : 1: remove_cast_before_assign_add 0.03% : 0.000019s : 1: remove_dup_value 15.41% : 0.010889s : 1: renormalize.infer 0.98% : 0.000693s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000049s : 1: rewriter_after_opt_a 0.19% : 0.000134s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.000093s : 1: symbol_engine_optimizer 0.13% : 0.000092s : 1: tuple_transform 19.26% : 0.013609s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:13.458.960 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:13.459.231 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0552958, [21] [bootstrap]: 0.00045207 [type_inference]: 0.0420505 [event_method]: 8.988e-05 [auto_monad]: 0.00010444 [graph_reusing]: 7.66001e-06 [inline]: 2.46e-06 [add_attr]: 0.00319531, [1] [add_attr_with_inline]: 0.00318619, [1] [Cycle 1]: 9.75e-05, [2] [tag_attr]: 2.896e-05 [meta_addattr_fg_expand]: 9.17001e-06 [parallel-infer-symbol]: 3.04999e-06 [pre_auto_parallel]: 4.925e-05 [insert-virtual-dataset]: 2.42001e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 2.26e-06 [pipeline_split]: 1.50999e-06 [optimize]: 0.00775863, [53] [py_interpret_to_execute]: 3.737e-05 [rewriter_before_opt_a]: 0.0001281 [opt_a]: 0.00450422, [2] [Cycle 1]: 0.00322725, [45] [expand_dump_flag]: 4.10998e-06 [switch_simplify]: 0.0001288 [loop_unroll]: 4.474e-05 [a_1]: 0.00094785 [with_stream_mark]: 1.849e-05 [recompute_prepare]: 1.33e-05 [updatestate_depend_eliminate]: 5.61998e-06 [updatestate_assign_eliminate]: 4.95999e-06 [updatestate_loads_eliminate]: 4.22e-06 [parameter_eliminate]: 2.27999e-06 [a_2]: 0.0001522 [accelerated_algorithm]: 1.011e-05 [shard]: 2.26e-06 [meta_shard_fg_expand]: 2.40997e-06 [shard_inline]: 9.20999e-06 [merge_send_recv]: 1.034e-05 [auto_parallel]: 9.13002e-06 [parallel]: 1.863e-05 [flash_sp]: 9.66e-06 [merge_comm]: 5.35999e-06 [allreduce_fusion]: 5.35999e-06 [matmul_add_comm_reduction]: 1.156e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 1.15e-05 [virtual_dataset]: 9.27001e-06 [get_grad_eliminate_]: 8.77999e-06 [virtual_output]: 9.09998e-06 [merge_forward]: 4.86002e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 1.148e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.111e-05 [merge_recompute_call_nodes]: 1.67001e-06 [before_grad]: 1.6e-05 [set_forward_comm_id_for_comm_node_pass]: 5.72001e-06 [meta_fg_expand]: 4.67e-06 [flash_sp_send_recv_attached]: 2.81999e-06 [receive_attached]: 2.27999e-06 [after_resolve]: 1.386e-05 [a_after_grad]: 1.425e-05 [renormalize]: 0.00108531 [add_forward_monad_depend]: 7.23e-06 [auto_monad_grad]: 2.66e-06 [auto_monad_eliminator]: 2.422e-05 [cse]: 4.52e-05 [a_3]: 9.146e-05 [Cycle 2]: 0.00126097, [45] [expand_dump_flag]: 1.64998e-06 [switch_simplify]: 1.199e-05 [loop_unroll]: 9.65002e-06 [a_1]: 0.00024671 [with_stream_mark]: 1.719e-05 [recompute_prepare]: 9.81e-06 [updatestate_depend_eliminate]: 5.35001e-06 [updatestate_assign_eliminate]: 3.59002e-06 [updatestate_loads_eliminate]: 4.23999e-06 [parameter_eliminate]: 1.65001e-06 [a_2]: 0.00023616 [accelerated_algorithm]: 1.182e-05 [shard]: 3.04999e-06 [meta_shard_fg_expand]: 3.4e-06 [shard_inline]: 1.032e-05 [merge_send_recv]: 1.094e-05 [auto_parallel]: 1.004e-05 [parallel]: 7.28e-06 [flash_sp]: 3.68999e-06 [merge_comm]: 5.66003e-06 [allreduce_fusion]: 4.95999e-06 [matmul_add_comm_reduction]: 1.139e-05 [allreduce_slice_to_reducescatter]: 7.09988e-07 [virtual_shard_identity]: 1.126e-05 [virtual_dataset]: 8.90001e-06 [get_grad_eliminate_]: 8.92e-06 [virtual_output]: 8.40999e-06 [merge_forward]: 5.09e-06 [cell_reuse_recompute_pass]: 2.61999e-06 [offload_activation]: 1.079e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.307e-05 [merge_recompute_call_nodes]: 1.25999e-06 [before_grad]: 1.69e-05 [set_forward_comm_id_for_comm_node_pass]: 7.56001e-06 [meta_fg_expand]: 4.11001e-06 [flash_sp_send_recv_attached]: 1.35999e-06 [receive_attached]: 1.35001e-06 [after_resolve]: 1.588e-05 [a_after_grad]: 1.411e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.84001e-06 [auto_monad_grad]: 2.61e-06 [auto_monad_eliminator]: 1.44e-05 [cse]: 3.607e-05 [a_3]: 7.268e-05 [py_interpret_to_execute_after_opt_a]: 2.272e-05 [slice_cell_reuse_recomputed_activation]: 5.45001e-06 [rewriter_after_opt_a]: 5.976e-05 [convert_after_rewriter]: 1.19e-05 [order_py_execute_after_rewriter]: 1.009e-05 [mutable_eliminate]: 0.00079437 [opt_b]: 0.00038431, [1] [Cycle 1]: 0.00037184, [7] [b_1]: 0.00023894 [b_2]: 1.12e-05 [updatestate_depend_eliminate]: 1.008e-05 [updatestate_assign_eliminate]: 3.71999e-06 [updatestate_loads_eliminate]: 3.73999e-06 [renormalize]: 8.00006e-07 [cse]: 4.176e-05 [optimize_parallel_all_gather_comm]: 2.714e-05 [overlap_param_gather]: 5.76e-06 [cconv]: 3.884e-05 [loop_unroll]: 0.00063023 [opt_after_cconv]: 0.00017958, [1] [Cycle 1]: 0.00016902, [7] [c_1]: 5.334e-05 [parameter_eliminate]: 5.34998e-06 [updatestate_depend_eliminate]: 1.088e-05 [updatestate_assign_eliminate]: 4.08001e-06 [updatestate_loads_eliminate]: 3.64002e-06 [cse]: 3.152e-05 [renormalize]: 4.50003e-07 [remove_dup_value]: 5.378e-05 [tuple_transform]: 0.00013276, [1] [Cycle 1]: 0.00012398, [4] [d_1]: 7.635e-05 [none_parameter_eliminate]: 2.02001e-06 [renormalize]: 3.50003e-07 [switch_simplify]: 1.113e-05 [partial_unused_args_eliminate]: 6.68998e-06 [add_recomputation]: 7.306e-05 [cse_after_recomputation]: 3.742e-05, [1] [Cycle 1]: 2.972e-05, [1] [cse]: 1.988e-05 [environ_conv]: 1.15e-05 [swap_dp_allreduce_reducescatter]: 9.64e-06 [bias_add_comm_swap]: 5.87001e-06 [label_micro_interleaved_index]: 9.05001e-06 [label_fine_grained_interleaved_index]: 5.25001e-06 [merge_cast_opt]: 5.09e-06 [slice_recompute_activation]: 4.90001e-06 [micro_interleaved_order_control]: 4.94998e-06 [assign_add_opt]: 3.41001e-06 [ForceFp32Comm]: 3.41001e-06 [remove_cast_before_assign_add]: 4.246e-05 [full_micro_interleaved_order_control]: 5.24e-06 [reorder_send_recv_between_fp_bp]: 5.24e-06 [comm_op_add_attrs]: 3.38999e-06 [add_comm_op_reuse_tag]: 3.41001e-06 [interleave_split_concat_branches]: 3.53999e-06 [interleave_parallel_branches]: 3.60998e-06 [overlap_opt_shard_in_pipeline]: 3.81999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.45e-06 [control_data_broadcast_order]: 2.191e-05 [grouped_pairwise_exchange_alltoall]: 3.93999e-06 [offloading_packed_experts]: 7.5e-06 [overlap_recompute_and_grad_model_parallel]: 8.87e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.22998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.87002e-06 [overlap_recompute_comm]: 6.05002e-06 [overlap_grad_ring_attention]: 7.8e-06 [overlap_grad_flash_sp]: 3.054e-05 [begin_end_overlap_inline]: 3.28998e-06 [split_matmul_comm_elemetwise]: 4.89e-06 [split_layernorm_comm]: 4.13001e-06 [handle_group_info]: 3.45e-06 [symbol_engine_optimizer]: 0.00012849, [1] [Cycle 1]: 0.00012027, [6] [build]: 5.63002e-06 [elim_shapecalc]: 1.701e-05 [elim_not_effective]: 2.109e-05 [opt_reshape]: 1.106e-05 [fold_const_symbol]: 1.588e-05 [renormalize]: 2.19996e-07 [detach_backward]: 4.92e-06 [pipeline_parallel_scheduler]: 2.14e-06 [auto_monad_reorder]: 2.88e-05 [get_jit_bprop_graph]: 1.87999e-06 [rewriter_after_jit_bprop_graph]: 7.16999e-06 [opt_after_jit_grad]: 0.00071291 [validate]: 5.444e-05 Sums bootstrap : 0.000452s : 0.90% type_inference : 0.042051s : 83.95% event_method : 0.000090s : 0.18% auto_monad : 0.000104s : 0.21% graph_reusing : 0.000008s : 0.02% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000029s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000049s : 0.10% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000037s : 0.07% optimize.rewriter_before_opt_a : 0.000128s : 0.26% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000141s : 0.28% optimize.opt_a.loop_unroll : 0.000054s : 0.11% optimize.opt_a.a_1 : 0.001195s : 2.38% optimize.opt_a.with_stream_mark : 0.000036s : 0.07% optimize.opt_a.recompute_prepare : 0.000023s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000388s : 0.78% optimize.opt_a.accelerated_algorithm : 0.000022s : 0.04% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.01% optimize.opt_a.shard_inline : 0.000020s : 0.04% optimize.opt_a.merge_send_recv : 0.000021s : 0.04% optimize.opt_a.auto_parallel : 0.000019s : 0.04% optimize.opt_a.parallel : 0.000026s : 0.05% optimize.opt_a.flash_sp : 0.000013s : 0.03% optimize.opt_a.merge_comm : 0.000011s : 0.02% optimize.opt_a.allreduce_fusion : 0.000010s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.05% optimize.opt_a.virtual_dataset : 0.000018s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.04% optimize.opt_a.virtual_output : 0.000018s : 0.03% optimize.opt_a.merge_forward : 0.000010s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000022s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000044s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000033s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000013s : 0.03% optimize.opt_a.meta_fg_expand : 0.000009s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000030s : 0.06% optimize.opt_a.a_after_grad : 0.000028s : 0.06% optimize.opt_a.renormalize : 0.001085s : 2.17% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.02% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000039s : 0.08% optimize.opt_a.cse : 0.000081s : 0.16% optimize.opt_a.a_3 : 0.000164s : 0.33% optimize.py_interpret_to_execute_after_opt_a : 0.000023s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000060s : 0.12% optimize.convert_after_rewriter : 0.000012s : 0.02% optimize.order_py_execute_after_rewriter : 0.000010s : 0.02% optimize.mutable_eliminate : 0.000794s : 1.59% optimize.opt_b.b_1 : 0.000239s : 0.48% optimize.opt_b.b_2 : 0.000011s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000042s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000027s : 0.05% optimize.overlap_param_gather : 0.000006s : 0.01% optimize.cconv : 0.000039s : 0.08% optimize.loop_unroll : 0.000630s : 1.26% optimize.opt_after_cconv.c_1 : 0.000053s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000011s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000032s : 0.06% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000054s : 0.11% optimize.tuple_transform.d_1 : 0.000076s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.02% optimize.partial_unused_args_eliminate : 0.000007s : 0.01% optimize.add_recomputation : 0.000073s : 0.15% optimize.cse_after_recomputation.cse : 0.000020s : 0.04% optimize.environ_conv : 0.000012s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000009s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000005s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000003s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000042s : 0.08% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000022s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000006s : 0.01% optimize.overlap_grad_ring_attention : 0.000008s : 0.02% optimize.overlap_grad_flash_sp : 0.000031s : 0.06% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000006s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000021s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000029s : 0.06% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000713s : 1.42% validate : 0.000054s : 0.11% Time group info: ------[substitution.] 0.000276 54 12.98% : 0.000036s : 6: substitution.cast_eliminate 1.14% : 0.000003s : 4: substitution.elim_not_effective 0.80% : 0.000002s : 4: substitution.fold_const_symbol 2.92% : 0.000008s : 6: substitution.graph_param_transform 66.56% : 0.000184s : 8: substitution.inline 2.18% : 0.000006s : 8: substitution.j_node_and_user_rematch 2.76% : 0.000008s : 8: substitution.remove_not_recompute_node 2.34% : 0.000006s : 4: substitution.replace_old_param 3.44% : 0.000010s : 2: substitution.switch_simplify 4.86% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.041983 2 96.36% : 0.040454s : 1: type_inference.infer 3.64% : 0.001529s : 1: type_inference.specialize ------[replace.] 0.000124 14 43.20% : 0.000054s : 8: replace.inline 37.20% : 0.000046s : 2: replace.switch_simplify 19.60% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000199 14 90.09% : 0.000179s : 8: match.inline 4.17% : 0.000008s : 2: match.switch_simplify 5.75% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000330 1972 0.92% : 0.000003s : 21: predicate.accumulaten_eliminater 0.89% : 0.000003s : 6: predicate.ad_related_special_op_eliminate 0.54% : 0.000002s : 12: predicate.addn_check_dump 0.96% : 0.000003s : 21: predicate.addn_zero_filter 0.87% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.24% : 0.000007s : 33: predicate.arithmetic_simplify 1.17% : 0.000004s : 21: predicate.cast_eliminate 0.62% : 0.000002s : 12: predicate.check_bprop_eliminate 0.62% : 0.000002s : 12: predicate.compare_switch_simplify 0.17% : 0.000001s : 6: predicate.const_output_eliminate 0.59% : 0.000002s : 12: predicate.depend_value_elim 1.00% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.07% : 0.000004s : 21: predicate.dict_get_item_eliminator 1.00% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.85% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.30% : 0.000001s : 6: predicate.elim_not_effective 0.41% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.10% : 0.000004s : 27: predicate.environ_get_add_eliminate 1.16% : 0.000004s : 27: predicate.environ_get_depend_swap 1.74% : 0.000006s : 39: predicate.environ_get_eliminate 1.16% : 0.000004s : 27: predicate.environ_get_set_eliminate 1.50% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.20% : 0.000007s : 33: predicate.float_depend_g_call 0.54% : 0.000002s : 12: predicate.float_environ_get_switch 0.79% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 6: predicate.fold_const_symbol 0.59% : 0.000002s : 12: predicate.get_grad_eliminate 0.20% : 0.000001s : 6: predicate.graph_param_transform 0.59% : 0.000002s : 12: predicate.incorporate_call 0.49% : 0.000002s : 12: predicate.incorporate_call_switch 6.48% : 0.000021s : 90: predicate.inline 0.77% : 0.000003s : 12: predicate.inline_without_move 0.30% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.74% : 0.000002s : 12: predicate.less_batch_normalization 1.69% : 0.000006s : 37: predicate.list_to_tuple_eliminator_ 2.55% : 0.000008s : 58: predicate.load_eliminater 0.84% : 0.000003s : 6: predicate.loop_unroll_after_grad 2.27% : 0.000007s : 51: predicate.loop_unroll_before_grad 1.70% : 0.000006s : 33: predicate.make_slice_get_slice_eliminator 0.61% : 0.000002s : 12: predicate.merge_addn 0.61% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.66% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.83% : 0.000003s : 21: predicate.minmaximum_grad 0.94% : 0.000003s : 6: predicate.mutable_eliminate 0.32% : 0.000001s : 6: predicate.opt_reshape 0.39% : 0.000001s : 6: predicate.parallel_virtual_node 1.80% : 0.000006s : 33: predicate.partial_defer_inline 1.59% : 0.000005s : 31: predicate.partial_eliminate 0.99% : 0.000003s : 21: predicate.print_const_string_wrapper 0.57% : 0.000002s : 12: predicate.reduce_all_const_elim 1.19% : 0.000004s : 21: predicate.reduce_eliminate 2.44% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000001s : 12: predicate.remove_not_recompute_node 1.28% : 0.000004s : 37: predicate.replace_applicator 0.49% : 0.000002s : 12: predicate.replace_old_param 0.28% : 0.000001s : 6: predicate.reset_defer_inline 1.08% : 0.000004s : 21: predicate.reshape_eliminate 0.66% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 6: predicate.row_tensor_eliminate 0.76% : 0.000002s : 12: predicate.same_eliminate 0.37% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.70% : 0.000002s : 12: predicate.shard_identity_eliminate 0.75% : 0.000002s : 12: predicate.special_op_eliminate 0.75% : 0.000002s : 12: predicate.specialize_transform 0.85% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 0.66% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.68% : 0.000006s : 33: predicate.switch_defer_inline 2.06% : 0.000007s : 45: predicate.switch_layer_defer_inline 5.30% : 0.000017s : 106: predicate.switch_simplify 0.95% : 0.000003s : 21: predicate.tile_eliminate 1.05% : 0.000003s : 21: predicate.transpose_eliminate 1.58% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.71% : 0.000006s : 33: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 3.01% : 0.000010s : 49: predicate.tuple_list_get_item_eliminator 1.51% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.38% : 0.000008s : 45: predicate.tuple_list_set_item_eliminator 1.73% : 0.000006s : 37: predicate.tuple_to_list_eliminator_ 2.40% : 0.000008s : 58: predicate.updatestate_pure_node_eliminater 3.29% : 0.000011s : 70: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 6: predicate.value_based_eliminate 0.63% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.56% : 0.000002s : 12: predicate.virtual_output_eliminate 0.27% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.47% : 0.000002s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001028 16 53.38% : 0.000549s : 6: func_graph_cloner_run.FuncGraphClonerGraph 46.62% : 0.000479s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.069573 192 0.01% : 0.000006s : 1: ForceFp32Comm 4.61% : 0.003204s : 1: add_attr 4.59% : 0.003190s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.11% : 0.000077s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.17% : 0.000116s : 1: auto_monad 0.05% : 0.000036s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.71% : 0.000495s : 1: bootstrap 0.06% : 0.000043s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.04% : 0.000025s : 1: control_data_broadcast_order 0.02% : 0.000015s : 1: convert_after_rewriter 0.06% : 0.000041s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000028s : 1: detach_backward 0.02% : 0.000014s : 1: environ_conv 0.15% : 0.000104s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000014s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000012s : 1: label_micro_interleaved_index 0.92% : 0.000638s : 1: loop_unroll 0.01% : 0.000008s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.15% : 0.000803s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.03% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000024s : 1: opt.transform.mutable_eliminate 2.79% : 0.001944s : 78: opt.transform.opt_a 0.07% : 0.000052s : 1: opt.transform.opt_after_cconv 0.06% : 0.000043s : 1: opt.transform.opt_after_jit_grad 0.25% : 0.000172s : 28: opt.transform.opt_b 0.12% : 0.000083s : 2: opt.transform.opt_trans_graph 0.09% : 0.000060s : 4: opt.transform.symbol_engine_opt 6.48% : 0.004508s : 1: opt_a 0.26% : 0.000183s : 1: opt_after_cconv 1.04% : 0.000726s : 1: opt_after_jit_grad 0.56% : 0.000388s : 1: opt_b 11.86% : 0.008252s : 1: optimize 0.04% : 0.000031s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000013s : 1: order_py_execute_after_rewriter 0.05% : 0.000034s : 1: overlap_grad_flash_sp 0.01% : 0.000008s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000010s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000009s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000010s : 1: partial_unused_args_eliminate 0.01% : 0.000010s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.08% : 0.000057s : 1: pre_auto_parallel 0.06% : 0.000041s : 1: py_interpret_to_execute 0.04% : 0.000026s : 1: py_interpret_to_execute_after_opt_a 0.07% : 0.000045s : 1: remove_cast_before_assign_add 0.08% : 0.000058s : 1: remove_dup_value 0.80% : 0.000558s : 1: renormalize.infer 0.74% : 0.000518s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000065s : 1: rewriter_after_opt_a 0.19% : 0.000132s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.02% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.19% : 0.000131s : 1: symbol_engine_optimizer 0.20% : 0.000136s : 1: tuple_transform 60.50% : 0.042090s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:14.260.07 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0417778, [21] [bootstrap]: 0.00045789 [type_inference]: 0.0307156 [event_method]: 9.407e-05 [auto_monad]: 0.0001031 [graph_reusing]: 7.01001e-06 [inline]: 2.86e-06 [add_attr]: 0.00328631, [1] [add_attr_with_inline]: 0.00327623, [1] [Cycle 1]: 7.708e-05, [2] [tag_attr]: 3.01e-05 [meta_addattr_fg_expand]: 8.18999e-06 [parallel-infer-symbol]: 3.3e-06 [pre_auto_parallel]: 4.543e-05 [insert-virtual-dataset]: 2.26e-06 [parallel-infer-symbol-second]: 7.80012e-07 [dataset_repeat_opt]: 1.94e-06 [pipeline_split]: 1.50999e-06 [optimize]: 0.00634065, [53] [py_interpret_to_execute]: 3.29e-05 [rewriter_before_opt_a]: 0.00012031 [opt_a]: 0.00393436, [2] [Cycle 1]: 0.00299787, [45] [expand_dump_flag]: 4.12e-06 [switch_simplify]: 0.00013317 [loop_unroll]: 4.336e-05 [a_1]: 0.00094608 [with_stream_mark]: 1.834e-05 [recompute_prepare]: 1.115e-05 [updatestate_depend_eliminate]: 5.67999e-06 [updatestate_assign_eliminate]: 5.32001e-06 [updatestate_loads_eliminate]: 3.88999e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 0.00012276 [accelerated_algorithm]: 9.94001e-06 [shard]: 1.62001e-06 [meta_shard_fg_expand]: 2.60002e-06 [shard_inline]: 8.90999e-06 [merge_send_recv]: 9.62999e-06 [auto_parallel]: 7.15998e-06 [parallel]: 1.865e-05 [flash_sp]: 8.90999e-06 [merge_comm]: 5.59e-06 [allreduce_fusion]: 4.72e-06 [matmul_add_comm_reduction]: 1.173e-05 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 1.075e-05 [virtual_dataset]: 8.85999e-06 [get_grad_eliminate_]: 9.19998e-06 [virtual_output]: 8.95999e-06 [merge_forward]: 5.76998e-06 [cell_reuse_recompute_pass]: 1.20001e-06 [offload_activation]: 1.164e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.96e-05 [merge_recompute_call_nodes]: 1.66998e-06 [before_grad]: 1.561e-05 [set_forward_comm_id_for_comm_node_pass]: 5.18002e-06 [meta_fg_expand]: 4.45999e-06 [flash_sp_send_recv_attached]: 2.37999e-06 [receive_attached]: 2.18002e-06 [after_resolve]: 1.443e-05 [a_after_grad]: 1.368e-05 [renormalize]: 0.00101291 [add_forward_monad_depend]: 6.31e-06 [auto_monad_grad]: 2.65002e-06 [auto_monad_eliminator]: 2.045e-05 [cse]: 4.458e-05 [a_3]: 0.00013101 [Cycle 2]: 0.00092543, [45] [expand_dump_flag]: 1.34998e-06 [switch_simplify]: 1.186e-05 [loop_unroll]: 1.033e-05 [a_1]: 0.00022859 [with_stream_mark]: 1.58e-05 [recompute_prepare]: 1.071e-05 [updatestate_depend_eliminate]: 5.17999e-06 [updatestate_assign_eliminate]: 3.93001e-06 [updatestate_loads_eliminate]: 4.22e-06 [parameter_eliminate]: 1.42999e-06 [a_2]: 0.00011481 [accelerated_algorithm]: 9.39998e-06 [shard]: 1.50001e-06 [meta_shard_fg_expand]: 2.21998e-06 [shard_inline]: 8.87e-06 [merge_send_recv]: 7.67998e-06 [auto_parallel]: 7.05e-06 [parallel]: 5.78002e-06 [flash_sp]: 3.36001e-06 [merge_comm]: 4.91002e-06 [allreduce_fusion]: 5.10001e-06 [matmul_add_comm_reduction]: 9.35001e-06 [allreduce_slice_to_reducescatter]: 5.60016e-07 [virtual_shard_identity]: 1.079e-05 [virtual_dataset]: 9.02e-06 [get_grad_eliminate_]: 8.08999e-06 [virtual_output]: 8.55999e-06 [merge_forward]: 4.33001e-06 [cell_reuse_recompute_pass]: 2.01998e-06 [offload_activation]: 9.94001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.919e-05 [merge_recompute_call_nodes]: 1.29e-06 [before_grad]: 1.493e-05 [set_forward_comm_id_for_comm_node_pass]: 6.98998e-06 [meta_fg_expand]: 3.40998e-06 [flash_sp_send_recv_attached]: 9.70002e-07 [receive_attached]: 1.30001e-06 [after_resolve]: 1.335e-05 [a_after_grad]: 1.312e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.04999e-06 [auto_monad_grad]: 2.14e-06 [auto_monad_eliminator]: 1.25e-05 [cse]: 2.814e-05 [a_3]: 6.021e-05 [py_interpret_to_execute_after_opt_a]: 1.253e-05 [slice_cell_reuse_recomputed_activation]: 2.51e-06 [rewriter_after_opt_a]: 5.014e-05 [convert_after_rewriter]: 8.94998e-06 [order_py_execute_after_rewriter]: 7.05002e-06 [mutable_eliminate]: 0.00053853 [opt_b]: 0.00030529, [1] [Cycle 1]: 0.00029879, [7] [b_1]: 0.00019906 [b_2]: 1.166e-05 [updatestate_depend_eliminate]: 8.28001e-06 [updatestate_assign_eliminate]: 4.3e-06 [updatestate_loads_eliminate]: 3.9e-06 [renormalize]: 7.89994e-07 [cse]: 3.202e-05 [optimize_parallel_all_gather_comm]: 2.068e-05 [overlap_param_gather]: 1.86e-06 [cconv]: 3.041e-05 [loop_unroll]: 0.00046592 [opt_after_cconv]: 0.00013174, [1] [Cycle 1]: 0.00012611, [7] [c_1]: 4.492e-05 [parameter_eliminate]: 3.54002e-06 [updatestate_depend_eliminate]: 6.78e-06 [updatestate_assign_eliminate]: 3.56001e-06 [updatestate_loads_eliminate]: 3.41999e-06 [cse]: 2.864e-05 [renormalize]: 4.59986e-07 [remove_dup_value]: 3.859e-05 [tuple_transform]: 9.838e-05, [1] [Cycle 1]: 9.341e-05, [4] [d_1]: 6.345e-05 [none_parameter_eliminate]: 1.62999e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 9.51998e-06 [partial_unused_args_eliminate]: 1.74998e-06 [add_recomputation]: 6.168e-05 [cse_after_recomputation]: 2.761e-05, [1] [Cycle 1]: 2.281e-05, [1] [cse]: 1.73e-05 [environ_conv]: 6.29999e-06 [swap_dp_allreduce_reducescatter]: 7.16001e-06 [bias_add_comm_swap]: 3.01001e-06 [label_micro_interleaved_index]: 4.62e-06 [label_fine_grained_interleaved_index]: 2.68e-06 [merge_cast_opt]: 1.21002e-06 [slice_recompute_activation]: 1.96e-06 [micro_interleaved_order_control]: 2.14e-06 [assign_add_opt]: 1.15999e-06 [ForceFp32Comm]: 7.99977e-07 [remove_cast_before_assign_add]: 9.80013e-07 [full_micro_interleaved_order_control]: 2.18002e-06 [reorder_send_recv_between_fp_bp]: 2.98e-06 [comm_op_add_attrs]: 9.19972e-07 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.14998e-06 [interleave_parallel_branches]: 1.19e-06 [overlap_opt_shard_in_pipeline]: 1.12e-06 [overlap_opt_shard_grad_in_pipeline]: 1.74e-06 [control_data_broadcast_order]: 1.767e-05 [grouped_pairwise_exchange_alltoall]: 1.84998e-06 [offloading_packed_experts]: 4.58001e-06 [overlap_recompute_and_grad_model_parallel]: 5.09998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.16002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.38002e-06 [overlap_recompute_comm]: 2.41e-06 [overlap_grad_ring_attention]: 4.92999e-06 [overlap_grad_flash_sp]: 2.612e-05 [begin_end_overlap_inline]: 6.29982e-07 [split_matmul_comm_elemetwise]: 2.32999e-06 [split_layernorm_comm]: 1.96e-06 [handle_group_info]: 1.00999e-06 [symbol_engine_optimizer]: 9.458e-05, [1] [Cycle 1]: 9.005e-05, [6] [build]: 3.31001e-06 [elim_shapecalc]: 1.4e-05 [elim_not_effective]: 1.839e-05 [opt_reshape]: 1.003e-05 [fold_const_symbol]: 1.44e-05 [renormalize]: 1.8999e-07 [detach_backward]: 1.82999e-06 [pipeline_parallel_scheduler]: 1.92999e-06 [auto_monad_reorder]: 2.139e-05 [get_jit_bprop_graph]: 1.81998e-06 [rewriter_after_jit_bprop_graph]: 4.72998e-06 [opt_after_jit_grad]: 0.00047465 [validate]: 4.456e-05 Sums bootstrap : 0.000458s : 1.22% type_inference : 0.030716s : 81.94% event_method : 0.000094s : 0.25% auto_monad : 0.000103s : 0.28% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000030s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000045s : 0.12% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.09% optimize.rewriter_before_opt_a : 0.000120s : 0.32% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000145s : 0.39% optimize.opt_a.loop_unroll : 0.000054s : 0.14% optimize.opt_a.a_1 : 0.001175s : 3.13% optimize.opt_a.with_stream_mark : 0.000034s : 0.09% optimize.opt_a.recompute_prepare : 0.000022s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000238s : 0.63% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.05% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000018s : 0.05% optimize.opt_a.merge_send_recv : 0.000017s : 0.05% optimize.opt_a.auto_parallel : 0.000014s : 0.04% optimize.opt_a.parallel : 0.000024s : 0.07% optimize.opt_a.flash_sp : 0.000012s : 0.03% optimize.opt_a.merge_comm : 0.000011s : 0.03% optimize.opt_a.allreduce_fusion : 0.000010s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.06% optimize.opt_a.virtual_dataset : 0.000018s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.05% optimize.opt_a.virtual_output : 0.000018s : 0.05% optimize.opt_a.merge_forward : 0.000010s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000022s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000031s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.03% optimize.opt_a.meta_fg_expand : 0.000008s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000028s : 0.07% optimize.opt_a.a_after_grad : 0.000027s : 0.07% optimize.opt_a.renormalize : 0.001013s : 2.70% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.02% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.09% optimize.opt_a.cse : 0.000073s : 0.19% optimize.opt_a.a_3 : 0.000191s : 0.51% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000050s : 0.13% optimize.convert_after_rewriter : 0.000009s : 0.02% optimize.order_py_execute_after_rewriter : 0.000007s : 0.02% optimize.mutable_eliminate : 0.000539s : 1.44% optimize.opt_b.b_1 : 0.000199s : 0.53% optimize.opt_b.b_2 : 0.000012s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000032s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000030s : 0.08% optimize.loop_unroll : 0.000466s : 1.24% optimize.opt_after_cconv.c_1 : 0.000045s : 0.12% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000029s : 0.08% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000039s : 0.10% optimize.tuple_transform.d_1 : 0.000063s : 0.17% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000062s : 0.16% optimize.cse_after_recomputation.cse : 0.000017s : 0.05% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000026s : 0.07% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.06% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000475s : 1.27% validate : 0.000045s : 0.12% Time group info: ------[substitution.] 0.000270 54 12.19% : 0.000033s : 6: substitution.cast_eliminate 0.96% : 0.000003s : 4: substitution.elim_not_effective 0.75% : 0.000002s : 4: substitution.fold_const_symbol 2.73% : 0.000007s : 6: substitution.graph_param_transform 67.95% : 0.000184s : 8: substitution.inline 2.11% : 0.000006s : 8: substitution.j_node_and_user_rematch 2.72% : 0.000007s : 8: substitution.remove_not_recompute_node 1.82% : 0.000005s : 4: substitution.replace_old_param 3.69% : 0.000010s : 2: substitution.switch_simplify 5.07% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.030636 2 94.80% : 0.029044s : 1: type_inference.infer 5.20% : 0.001592s : 1: type_inference.specialize ------[replace.] 0.000123 14 41.62% : 0.000051s : 8: replace.inline 39.55% : 0.000049s : 2: replace.switch_simplify 18.83% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000199 14 89.84% : 0.000179s : 8: match.inline 4.34% : 0.000009s : 2: match.switch_simplify 5.82% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000315 1972 0.90% : 0.000003s : 21: predicate.accumulaten_eliminater 0.68% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.49% : 0.000002s : 12: predicate.addn_check_dump 0.94% : 0.000003s : 21: predicate.addn_zero_filter 0.86% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.00% : 0.000006s : 33: predicate.arithmetic_simplify 1.19% : 0.000004s : 21: predicate.cast_eliminate 0.61% : 0.000002s : 12: predicate.check_bprop_eliminate 0.50% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.58% : 0.000002s : 12: predicate.depend_value_elim 1.08% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.13% : 0.000004s : 21: predicate.dict_get_item_eliminator 0.93% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.79% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 6: predicate.elim_not_effective 0.35% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.18% : 0.000004s : 27: predicate.environ_get_add_eliminate 1.13% : 0.000004s : 27: predicate.environ_get_depend_swap 1.70% : 0.000005s : 39: predicate.environ_get_eliminate 1.19% : 0.000004s : 27: predicate.environ_get_set_eliminate 1.49% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.39% : 0.000008s : 33: predicate.float_depend_g_call 0.51% : 0.000002s : 12: predicate.float_environ_get_switch 0.77% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 6: predicate.fold_const_symbol 0.65% : 0.000002s : 12: predicate.get_grad_eliminate 0.20% : 0.000001s : 6: predicate.graph_param_transform 0.64% : 0.000002s : 12: predicate.incorporate_call 0.54% : 0.000002s : 12: predicate.incorporate_call_switch 6.59% : 0.000021s : 90: predicate.inline 0.74% : 0.000002s : 12: predicate.inline_without_move 0.31% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.69% : 0.000002s : 12: predicate.less_batch_normalization 1.87% : 0.000006s : 37: predicate.list_to_tuple_eliminator_ 2.54% : 0.000008s : 58: predicate.load_eliminater 0.84% : 0.000003s : 6: predicate.loop_unroll_after_grad 2.38% : 0.000007s : 51: predicate.loop_unroll_before_grad 1.60% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.56% : 0.000002s : 12: predicate.merge_addn 0.60% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.58% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.86% : 0.000003s : 21: predicate.minmaximum_grad 0.92% : 0.000003s : 6: predicate.mutable_eliminate 0.33% : 0.000001s : 6: predicate.opt_reshape 0.35% : 0.000001s : 6: predicate.parallel_virtual_node 1.84% : 0.000006s : 33: predicate.partial_defer_inline 1.61% : 0.000005s : 31: predicate.partial_eliminate 0.94% : 0.000003s : 21: predicate.print_const_string_wrapper 0.55% : 0.000002s : 12: predicate.reduce_all_const_elim 1.20% : 0.000004s : 21: predicate.reduce_eliminate 2.48% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 12: predicate.remove_not_recompute_node 1.36% : 0.000004s : 37: predicate.replace_applicator 0.39% : 0.000001s : 12: predicate.replace_old_param 0.30% : 0.000001s : 6: predicate.reset_defer_inline 1.00% : 0.000003s : 21: predicate.reshape_eliminate 0.54% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 6: predicate.row_tensor_eliminate 1.04% : 0.000003s : 12: predicate.same_eliminate 0.40% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.73% : 0.000002s : 12: predicate.shard_identity_eliminate 0.66% : 0.000002s : 12: predicate.special_op_eliminate 0.81% : 0.000003s : 12: predicate.specialize_transform 0.76% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.65% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.61% : 0.000005s : 33: predicate.switch_defer_inline 2.25% : 0.000007s : 45: predicate.switch_layer_defer_inline 5.51% : 0.000017s : 106: predicate.switch_simplify 0.99% : 0.000003s : 21: predicate.tile_eliminate 0.97% : 0.000003s : 21: predicate.transpose_eliminate 1.63% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 2.94% : 0.000009s : 49: predicate.tuple_list_get_item_eliminator 1.47% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.69% : 0.000005s : 37: predicate.tuple_to_list_eliminator_ 2.48% : 0.000008s : 58: predicate.updatestate_pure_node_eliminater 3.18% : 0.000010s : 70: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 6: predicate.value_based_eliminate 0.67% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.61% : 0.000002s : 12: predicate.virtual_output_eliminate 0.27% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.50% : 0.000002s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001102 16 55.17% : 0.000608s : 6: func_graph_cloner_run.FuncGraphClonerGraph 44.83% : 0.000494s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.054646 192 0.01% : 0.000003s : 1: ForceFp32Comm 6.03% : 0.003293s : 1: add_attr 6.00% : 0.003280s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.12% : 0.000066s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.20% : 0.000111s : 1: auto_monad 0.05% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.89% : 0.000489s : 1: bootstrap 0.06% : 0.000034s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000021s : 1: control_data_broadcast_order 0.02% : 0.000013s : 1: convert_after_rewriter 0.06% : 0.000031s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.19% : 0.000105s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000005s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.87% : 0.000475s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.00% : 0.000548s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.04% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000021s : 1: opt.transform.mutable_eliminate 3.59% : 0.001961s : 78: opt.transform.opt_a 0.08% : 0.000044s : 1: opt.transform.opt_after_cconv 0.06% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.32% : 0.000176s : 28: opt.transform.opt_b 0.13% : 0.000071s : 2: opt.transform.opt_trans_graph 0.10% : 0.000053s : 4: opt.transform.symbol_engine_opt 7.21% : 0.003937s : 1: opt_a 0.25% : 0.000135s : 1: opt_after_cconv 0.88% : 0.000483s : 1: opt_after_jit_grad 0.57% : 0.000310s : 1: opt_b 11.61% : 0.006346s : 1: optimize 0.04% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000010s : 1: order_py_execute_after_rewriter 0.05% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.09% : 0.000050s : 1: pre_auto_parallel 0.07% : 0.000037s : 1: py_interpret_to_execute 0.03% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000003s : 1: remove_cast_before_assign_add 0.08% : 0.000043s : 1: remove_dup_value 0.99% : 0.000539s : 1: renormalize.infer 0.85% : 0.000466s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000055s : 1: rewriter_after_opt_a 0.23% : 0.000125s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.18% : 0.000097s : 1: symbol_engine_optimizer 0.19% : 0.000101s : 1: tuple_transform 56.25% : 0.030737s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:14.499.308 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:14.499.581 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0252397, [21] [bootstrap]: 0.00043507 [type_inference]: 0.0132831 [event_method]: 9.703e-05 [auto_monad]: 9.948e-05 [graph_reusing]: 8.08001e-06 [inline]: 1.89e-06 [add_attr]: 0.00322317, [1] [add_attr_with_inline]: 0.00321314, [1] [Cycle 1]: 8.787e-05, [2] [tag_attr]: 3.024e-05 [meta_addattr_fg_expand]: 8.73001e-06 [parallel-infer-symbol]: 3.25002e-06 [pre_auto_parallel]: 4.633e-05 [insert-virtual-dataset]: 2.39001e-06 [parallel-infer-symbol-second]: 8.79983e-07 [dataset_repeat_opt]: 1.94999e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.0067192, [53] [py_interpret_to_execute]: 4.239e-05 [rewriter_before_opt_a]: 0.00012853 [opt_a]: 0.00412183, [2] [Cycle 1]: 0.0030658, [45] [expand_dump_flag]: 4.52e-06 [switch_simplify]: 0.00013573 [loop_unroll]: 4.436e-05 [a_1]: 0.00093003 [with_stream_mark]: 2.038e-05 [recompute_prepare]: 1.205e-05 [updatestate_depend_eliminate]: 4.94998e-06 [updatestate_assign_eliminate]: 4.10998e-06 [updatestate_loads_eliminate]: 3.8e-06 [parameter_eliminate]: 2.05002e-06 [a_2]: 0.00013627 [accelerated_algorithm]: 9.64e-06 [shard]: 1.84e-06 [meta_shard_fg_expand]: 2.41e-06 [shard_inline]: 8.96002e-06 [merge_send_recv]: 1.026e-05 [auto_parallel]: 7.73999e-06 [parallel]: 2.025e-05 [flash_sp]: 8.63001e-06 [merge_comm]: 5.42001e-06 [allreduce_fusion]: 4.74e-06 [matmul_add_comm_reduction]: 1.067e-05 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 1.031e-05 [virtual_dataset]: 8.52e-06 [get_grad_eliminate_]: 7.75998e-06 [virtual_output]: 8.40999e-06 [merge_forward]: 5.53002e-06 [cell_reuse_recompute_pass]: 1.30001e-06 [offload_activation]: 1.214e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.007e-05 [merge_recompute_call_nodes]: 1.66998e-06 [before_grad]: 1.464e-05 [set_forward_comm_id_for_comm_node_pass]: 4.90001e-06 [meta_fg_expand]: 4.08001e-06 [flash_sp_send_recv_attached]: 2.44001e-06 [receive_attached]: 2.09e-06 [after_resolve]: 1.463e-05 [a_after_grad]: 1.406e-05 [renormalize]: 0.0009865 [add_forward_monad_depend]: 6.28e-06 [auto_monad_grad]: 2.43e-06 [auto_monad_eliminator]: 1.855e-05 [cse]: 3.533e-05 [a_3]: 7.573e-05 [Cycle 2]: 0.00104155, [45] [expand_dump_flag]: 1.08001e-06 [switch_simplify]: 1.023e-05 [loop_unroll]: 8.25999e-06 [a_1]: 0.00021616 [with_stream_mark]: 1.463e-05 [recompute_prepare]: 8.79e-06 [updatestate_depend_eliminate]: 4.79998e-06 [updatestate_assign_eliminate]: 3.26001e-06 [updatestate_loads_eliminate]: 3.35998e-06 [parameter_eliminate]: 1.85001e-06 [a_2]: 0.0001337 [accelerated_algorithm]: 8.55001e-06 [shard]: 1.54e-06 [meta_shard_fg_expand]: 2.09999e-06 [shard_inline]: 8.55999e-06 [merge_send_recv]: 7.36001e-06 [auto_parallel]: 6.54001e-06 [parallel]: 6.44999e-06 [flash_sp]: 4.33999e-06 [merge_comm]: 4.57e-06 [allreduce_fusion]: 3.88001e-06 [matmul_add_comm_reduction]: 8.86002e-06 [allreduce_slice_to_reducescatter]: 4.50003e-07 [virtual_shard_identity]: 9.57001e-06 [virtual_dataset]: 8.15e-06 [get_grad_eliminate_]: 8.23001e-06 [virtual_output]: 8.15999e-06 [merge_forward]: 3.90998e-06 [cell_reuse_recompute_pass]: 2.15002e-06 [offload_activation]: 9.24998e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.032e-05 [merge_recompute_call_nodes]: 9.5999e-07 [before_grad]: 1.272e-05 [set_forward_comm_id_for_comm_node_pass]: 4.59002e-06 [meta_fg_expand]: 3.24001e-06 [flash_sp_send_recv_attached]: 9.00007e-07 [receive_attached]: 1.41998e-06 [after_resolve]: 1.282e-05 [a_after_grad]: 1.153e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.15002e-06 [auto_monad_grad]: 1.30001e-06 [auto_monad_eliminator]: 1.003e-05 [cse]: 2.184e-05 [a_3]: 6.689e-05 [py_interpret_to_execute_after_opt_a]: 1.791e-05 [slice_cell_reuse_recomputed_activation]: 4.74e-06 [rewriter_after_opt_a]: 4.999e-05 [convert_after_rewriter]: 1.21e-05 [order_py_execute_after_rewriter]: 1.057e-05 [mutable_eliminate]: 0.00057929 [opt_b]: 0.00031996, [1] [Cycle 1]: 0.00031149, [7] [b_1]: 0.00020505 [b_2]: 9.47999e-06 [updatestate_depend_eliminate]: 7.21001e-06 [updatestate_assign_eliminate]: 3.23e-06 [updatestate_loads_eliminate]: 3.13e-06 [renormalize]: 5.89993e-07 [cse]: 2.6e-05 [optimize_parallel_all_gather_comm]: 2.187e-05 [overlap_param_gather]: 4.72e-06 [cconv]: 3.118e-05 [loop_unroll]: 0.00046854 [opt_after_cconv]: 0.00014605, [1] [Cycle 1]: 0.00013718, [7] [c_1]: 3.9e-05 [parameter_eliminate]: 3.97998e-06 [updatestate_depend_eliminate]: 7.61999e-06 [updatestate_assign_eliminate]: 3.23e-06 [updatestate_loads_eliminate]: 2.84999e-06 [cse]: 2.357e-05 [renormalize]: 8.10018e-07 [remove_dup_value]: 1.726e-05 [tuple_transform]: 0.00010294, [1] [Cycle 1]: 9.59e-05, [4] [d_1]: 5.469e-05 [none_parameter_eliminate]: 1.77001e-06 [renormalize]: 2.89991e-07 [switch_simplify]: 9.12999e-06 [partial_unused_args_eliminate]: 4.27e-06 [add_recomputation]: 6.072e-05 [cse_after_recomputation]: 3.241e-05, [1] [Cycle 1]: 2.553e-05, [1] [cse]: 1.585e-05 [environ_conv]: 9.14998e-06 [swap_dp_allreduce_reducescatter]: 9.59e-06 [bias_add_comm_swap]: 5.22e-06 [label_micro_interleaved_index]: 7.35e-06 [label_fine_grained_interleaved_index]: 5.74e-06 [merge_cast_opt]: 3.56001e-06 [slice_recompute_activation]: 4.47e-06 [micro_interleaved_order_control]: 4.63999e-06 [assign_add_opt]: 3.44001e-06 [ForceFp32Comm]: 3.01999e-06 [remove_cast_before_assign_add]: 3.35e-06 [full_micro_interleaved_order_control]: 4.48999e-06 [reorder_send_recv_between_fp_bp]: 5.56e-06 [comm_op_add_attrs]: 3.14999e-06 [add_comm_op_reuse_tag]: 3.45003e-06 [interleave_split_concat_branches]: 3.41001e-06 [interleave_parallel_branches]: 1.97e-05 [overlap_opt_shard_in_pipeline]: 3.5e-06 [overlap_opt_shard_grad_in_pipeline]: 4.48001e-06 [control_data_broadcast_order]: 1.986e-05 [grouped_pairwise_exchange_alltoall]: 4.12003e-06 [offloading_packed_experts]: 7.21999e-06 [overlap_recompute_and_grad_model_parallel]: 7.65998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.48999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.78001e-06 [overlap_recompute_comm]: 4.68001e-06 [overlap_grad_ring_attention]: 6.82002e-06 [overlap_grad_flash_sp]: 2.534e-05 [begin_end_overlap_inline]: 2.82002e-06 [split_matmul_comm_elemetwise]: 4.55999e-06 [split_layernorm_comm]: 4.15999e-06 [handle_group_info]: 3.43999e-06 [symbol_engine_optimizer]: 0.00011373, [1] [Cycle 1]: 0.00010578, [6] [build]: 3.91999e-06 [elim_shapecalc]: 1.283e-05 [elim_not_effective]: 1.734e-05 [opt_reshape]: 9.11002e-06 [fold_const_symbol]: 1.321e-05 [renormalize]: 2.50002e-07 [detach_backward]: 3.86999e-06 [pipeline_parallel_scheduler]: 1.84998e-06 [auto_monad_reorder]: 2.499e-05 [get_jit_bprop_graph]: 1.42999e-06 [rewriter_after_jit_bprop_graph]: 5.89999e-06 [opt_after_jit_grad]: 0.0005742 [validate]: 4.35e-05 Sums bootstrap : 0.000435s : 2.16% type_inference : 0.013283s : 65.94% event_method : 0.000097s : 0.48% auto_monad : 0.000099s : 0.49% graph_reusing : 0.000008s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000030s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.04% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000046s : 0.23% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000042s : 0.21% optimize.rewriter_before_opt_a : 0.000129s : 0.64% optimize.opt_a.expand_dump_flag : 0.000006s : 0.03% optimize.opt_a.switch_simplify : 0.000146s : 0.72% optimize.opt_a.loop_unroll : 0.000053s : 0.26% optimize.opt_a.a_1 : 0.001146s : 5.69% optimize.opt_a.with_stream_mark : 0.000035s : 0.17% optimize.opt_a.recompute_prepare : 0.000021s : 0.10% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.04% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000270s : 1.34% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.09% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000018s : 0.09% optimize.opt_a.merge_send_recv : 0.000018s : 0.09% optimize.opt_a.auto_parallel : 0.000014s : 0.07% optimize.opt_a.parallel : 0.000027s : 0.13% optimize.opt_a.flash_sp : 0.000013s : 0.06% optimize.opt_a.merge_comm : 0.000010s : 0.05% optimize.opt_a.allreduce_fusion : 0.000009s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.10% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.10% optimize.opt_a.virtual_dataset : 0.000017s : 0.08% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.08% optimize.opt_a.virtual_output : 0.000017s : 0.08% optimize.opt_a.merge_forward : 0.000009s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000021s : 0.11% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000040s : 0.20% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.14% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.05% optimize.opt_a.meta_fg_expand : 0.000007s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000027s : 0.14% optimize.opt_a.a_after_grad : 0.000026s : 0.13% optimize.opt_a.renormalize : 0.000987s : 4.90% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.14% optimize.opt_a.cse : 0.000057s : 0.28% optimize.opt_a.a_3 : 0.000143s : 0.71% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000050s : 0.25% optimize.convert_after_rewriter : 0.000012s : 0.06% optimize.order_py_execute_after_rewriter : 0.000011s : 0.05% optimize.mutable_eliminate : 0.000579s : 2.88% optimize.opt_b.b_1 : 0.000205s : 1.02% optimize.opt_b.b_2 : 0.000009s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000026s : 0.13% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.11% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000031s : 0.15% optimize.loop_unroll : 0.000469s : 2.33% optimize.opt_after_cconv.c_1 : 0.000039s : 0.19% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000024s : 0.12% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.09% optimize.tuple_transform.d_1 : 0.000055s : 0.27% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.05% optimize.partial_unused_args_eliminate : 0.000004s : 0.02% optimize.add_recomputation : 0.000061s : 0.30% optimize.cse_after_recomputation.cse : 0.000016s : 0.08% optimize.environ_conv : 0.000009s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.05% optimize.bias_add_comm_swap : 0.000005s : 0.03% optimize.label_micro_interleaved_index : 0.000007s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.03% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000004s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000003s : 0.02% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.02% optimize.full_micro_interleaved_order_control : 0.000004s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.03% optimize.comm_op_add_attrs : 0.000003s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000003s : 0.02% optimize.interleave_parallel_branches : 0.000020s : 0.10% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000020s : 0.10% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000007s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.03% optimize.overlap_grad_flash_sp : 0.000025s : 0.13% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.02% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.09% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.05% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.07% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000025s : 0.12% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.03% opt_after_jit_grad : 0.000574s : 2.85% validate : 0.000044s : 0.22% Time group info: ------[substitution.] 0.000263 44 9.65% : 0.000025s : 3: substitution.cast_eliminate 0.98% : 0.000003s : 3: substitution.elim_not_effective 0.68% : 0.000002s : 3: substitution.fold_const_symbol 2.42% : 0.000006s : 5: substitution.graph_param_transform 71.13% : 0.000187s : 8: substitution.inline 1.86% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.53% : 0.000007s : 6: substitution.remove_not_recompute_node 1.81% : 0.000005s : 4: substitution.replace_old_param 3.95% : 0.000010s : 2: substitution.switch_simplify 4.98% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.013218 2 89.76% : 0.011864s : 1: type_inference.infer 10.24% : 0.001354s : 1: type_inference.specialize ------[replace.] 0.000128 14 42.55% : 0.000055s : 8: replace.inline 38.49% : 0.000049s : 2: replace.switch_simplify 18.96% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000203 14 90.09% : 0.000183s : 8: match.inline 4.45% : 0.000009s : 2: match.switch_simplify 5.46% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000286 1746 0.95% : 0.000003s : 19: predicate.accumulaten_eliminater 0.77% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000002s : 10: predicate.addn_check_dump 1.05% : 0.000003s : 19: predicate.addn_zero_filter 0.86% : 0.000002s : 19: predicate.adjust_all_reduce_mul_add 2.15% : 0.000006s : 29: predicate.arithmetic_simplify 1.09% : 0.000003s : 19: predicate.cast_eliminate 0.57% : 0.000002s : 10: predicate.check_bprop_eliminate 0.50% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000001s : 5: predicate.const_output_eliminate 0.57% : 0.000002s : 10: predicate.depend_value_elim 1.02% : 0.000003s : 19: predicate.dict_get_item_const_eliminator 1.07% : 0.000003s : 19: predicate.dict_get_item_eliminator 1.01% : 0.000003s : 19: predicate.dict_set_item_eliminator 0.81% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 0.33% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 24: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 24: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 24: predicate.environ_get_depend_swap 1.70% : 0.000005s : 34: predicate.environ_get_eliminate 1.12% : 0.000003s : 24: predicate.environ_get_set_eliminate 1.58% : 0.000005s : 31: predicate.exchange_switch_depend_value 2.45% : 0.000007s : 31: predicate.float_depend_g_call 0.51% : 0.000001s : 10: predicate.float_environ_get_switch 0.73% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.62% : 0.000002s : 10: predicate.get_grad_eliminate 0.17% : 0.000000s : 5: predicate.graph_param_transform 0.58% : 0.000002s : 10: predicate.incorporate_call 0.49% : 0.000001s : 10: predicate.incorporate_call_switch 6.43% : 0.000018s : 80: predicate.inline 0.70% : 0.000002s : 10: predicate.inline_without_move 0.28% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.81% : 0.000002s : 10: predicate.less_batch_normalization 1.78% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.42% : 0.000007s : 52: predicate.load_eliminater 0.86% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.54% : 0.000007s : 49: predicate.loop_unroll_before_grad 1.75% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.54% : 0.000002s : 10: predicate.merge_addn 0.49% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000003s : 19: predicate.minmaximum_grad 0.86% : 0.000002s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.31% : 0.000001s : 5: predicate.parallel_virtual_node 1.97% : 0.000006s : 31: predicate.partial_defer_inline 1.62% : 0.000005s : 28: predicate.partial_eliminate 0.89% : 0.000003s : 19: predicate.print_const_string_wrapper 0.59% : 0.000002s : 10: predicate.reduce_all_const_elim 1.29% : 0.000004s : 19: predicate.reduce_eliminate 2.46% : 0.000007s : 52: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 10: predicate.remove_not_recompute_node 1.34% : 0.000004s : 33: predicate.replace_applicator 0.52% : 0.000001s : 10: predicate.replace_old_param 0.29% : 0.000001s : 5: predicate.reset_defer_inline 1.07% : 0.000003s : 19: predicate.reshape_eliminate 0.52% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.31% : 0.000001s : 5: predicate.row_tensor_eliminate 0.69% : 0.000002s : 10: predicate.same_eliminate 0.36% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 10: predicate.shard_identity_eliminate 0.65% : 0.000002s : 10: predicate.special_op_eliminate 0.65% : 0.000002s : 10: predicate.specialize_transform 0.73% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.66% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.71% : 0.000005s : 31: predicate.switch_defer_inline 2.18% : 0.000006s : 41: predicate.switch_layer_defer_inline 6.02% : 0.000017s : 99: predicate.switch_simplify 1.04% : 0.000003s : 19: predicate.tile_eliminate 0.99% : 0.000003s : 19: predicate.transpose_eliminate 1.59% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.05% : 0.000009s : 43: predicate.tuple_list_get_item_eliminator 1.54% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000007s : 39: predicate.tuple_list_set_item_eliminator 1.76% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.49% : 0.000007s : 52: predicate.updatestate_pure_node_eliminater 3.08% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 5: predicate.value_based_eliminate 0.60% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.58% : 0.000002s : 10: predicate.virtual_output_eliminate 0.23% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000986 16 55.43% : 0.000547s : 6: func_graph_cloner_run.FuncGraphClonerGraph 44.57% : 0.000440s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.038193 192 0.01% : 0.000006s : 1: ForceFp32Comm 8.46% : 0.003232s : 1: add_attr 8.42% : 0.003218s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.17% : 0.000064s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.29% : 0.000111s : 1: auto_monad 0.09% : 0.000033s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.25% : 0.000478s : 1: bootstrap 0.09% : 0.000035s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.06% : 0.000023s : 1: control_data_broadcast_order 0.04% : 0.000016s : 1: convert_after_rewriter 0.09% : 0.000035s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000024s : 1: detach_backward 0.03% : 0.000012s : 1: environ_conv 0.30% : 0.000114s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000015s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.02% : 0.000007s : 1: inline 0.02% : 0.000008s : 1: insert-virtual-dataset 0.06% : 0.000023s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000009s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.24% : 0.000475s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 1.54% : 0.000586s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.05% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000018s : 1: opt.transform.mutable_eliminate 4.75% : 0.001814s : 78: opt.transform.opt_a 0.10% : 0.000038s : 1: opt.transform.opt_after_cconv 0.09% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.37% : 0.000141s : 28: opt.transform.opt_b 0.16% : 0.000062s : 2: opt.transform.opt_trans_graph 0.13% : 0.000048s : 4: opt.transform.symbol_engine_opt 10.80% : 0.004126s : 1: opt_a 0.39% : 0.000150s : 1: opt_after_cconv 1.53% : 0.000586s : 1: opt_after_jit_grad 0.85% : 0.000323s : 1: opt_b 18.63% : 0.007114s : 1: optimize 0.07% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000014s : 1: order_py_execute_after_rewriter 0.08% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000007s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000055s : 1: pre_auto_parallel 0.12% : 0.000047s : 1: py_interpret_to_execute 0.06% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.05% : 0.000020s : 1: remove_dup_value 1.37% : 0.000522s : 1: renormalize.infer 1.19% : 0.000455s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000054s : 1: rewriter_after_opt_a 0.35% : 0.000132s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.03% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.31% : 0.000117s : 1: symbol_engine_optimizer 0.28% : 0.000106s : 1: tuple_transform 34.88% : 0.013323s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:14.806.356 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0450683, [21] [bootstrap]: 0.00046683 [type_inference]: 0.0318197 [event_method]: 0.00016198 [auto_monad]: 0.00012418 [graph_reusing]: 8.89e-06 [inline]: 3.26999e-06 [add_attr]: 0.00422065, [1] [add_attr_with_inline]: 0.00420536, [1] [Cycle 1]: 0.00010047, [2] [tag_attr]: 3.76e-05 [meta_addattr_fg_expand]: 8.94998e-06 [parallel-infer-symbol]: 3.53e-06 [pre_auto_parallel]: 5.479e-05 [insert-virtual-dataset]: 2.48e-06 [parallel-infer-symbol-second]: 8.59989e-07 [dataset_repeat_opt]: 1.94999e-06 [pipeline_split]: 1.67999e-06 [optimize]: 0.0072562, [53] [py_interpret_to_execute]: 4.553e-05 [rewriter_before_opt_a]: 0.00013341 [opt_a]: 0.00437326, [2] [Cycle 1]: 0.0034544, [45] [expand_dump_flag]: 4.57e-06 [switch_simplify]: 0.00014335 [loop_unroll]: 4.275e-05 [a_1]: 0.00101629 [with_stream_mark]: 2.57e-05 [recompute_prepare]: 1.669e-05 [updatestate_depend_eliminate]: 5.71998e-06 [updatestate_assign_eliminate]: 3.8e-06 [updatestate_loads_eliminate]: 3.65998e-06 [parameter_eliminate]: 2.32001e-06 [a_2]: 0.00010903 [accelerated_algorithm]: 9.60001e-06 [shard]: 2.64999e-06 [meta_shard_fg_expand]: 2.89999e-06 [shard_inline]: 9.28002e-06 [merge_send_recv]: 1.107e-05 [auto_parallel]: 9.52999e-06 [parallel]: 2.067e-05 [flash_sp]: 9.96e-06 [merge_comm]: 5.51998e-06 [allreduce_fusion]: 4.39002e-06 [matmul_add_comm_reduction]: 1.188e-05 [allreduce_slice_to_reducescatter]: 8.89995e-07 [virtual_shard_identity]: 1.278e-05 [virtual_dataset]: 8.45999e-06 [get_grad_eliminate_]: 8.32998e-06 [virtual_output]: 8.34998e-06 [merge_forward]: 5.23002e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 1.152e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.189e-05 [merge_recompute_call_nodes]: 1.55999e-06 [before_grad]: 1.572e-05 [set_forward_comm_id_for_comm_node_pass]: 6.11e-06 [meta_fg_expand]: 3.91999e-06 [flash_sp_send_recv_attached]: 3.51001e-06 [receive_attached]: 2.59001e-06 [after_resolve]: 1.724e-05 [a_after_grad]: 1.402e-05 [renormalize]: 0.00137204 [add_forward_monad_depend]: 8.74998e-06 [auto_monad_grad]: 2.96001e-06 [auto_monad_eliminator]: 2.471e-05 [cse]: 3.933e-05 [a_3]: 7.356e-05 [Cycle 2]: 0.00090498, [45] [expand_dump_flag]: 2.06e-06 [switch_simplify]: 1.064e-05 [loop_unroll]: 7.83001e-06 [a_1]: 0.00020499 [with_stream_mark]: 2.311e-05 [recompute_prepare]: 1.004e-05 [updatestate_depend_eliminate]: 5.02e-06 [updatestate_assign_eliminate]: 3.31001e-06 [updatestate_loads_eliminate]: 3.35e-06 [parameter_eliminate]: 2.21e-06 [a_2]: 0.00010172 [accelerated_algorithm]: 1.05e-05 [shard]: 1.86e-06 [meta_shard_fg_expand]: 2.23002e-06 [shard_inline]: 8.47e-06 [merge_send_recv]: 9.19e-06 [auto_parallel]: 8.60001e-06 [parallel]: 8.79e-06 [flash_sp]: 4.26001e-06 [merge_comm]: 4e-06 [allreduce_fusion]: 4.05998e-06 [matmul_add_comm_reduction]: 1.176e-05 [allreduce_slice_to_reducescatter]: 7.10017e-07 [virtual_shard_identity]: 1.062e-05 [virtual_dataset]: 7.46001e-06 [get_grad_eliminate_]: 7.59002e-06 [virtual_output]: 7.66999e-06 [merge_forward]: 5.15001e-06 [cell_reuse_recompute_pass]: 2.93e-06 [offload_activation]: 1.116e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.802e-05 [merge_recompute_call_nodes]: 1.36002e-06 [before_grad]: 1.494e-05 [set_forward_comm_id_for_comm_node_pass]: 5.89999e-06 [meta_fg_expand]: 3.29001e-06 [flash_sp_send_recv_attached]: 1.67001e-06 [receive_attached]: 1.37e-06 [after_resolve]: 1.509e-05 [a_after_grad]: 1.223e-05 [renormalize]: 1.80007e-07 [add_forward_monad_depend]: 3.53999e-06 [auto_monad_grad]: 1.94e-06 [auto_monad_eliminator]: 1.309e-05 [cse]: 2.501e-05 [a_3]: 5.226e-05 [py_interpret_to_execute_after_opt_a]: 1.802e-05 [slice_cell_reuse_recomputed_activation]: 2.06e-06 [rewriter_after_opt_a]: 4.988e-05 [convert_after_rewriter]: 8.12e-06 [order_py_execute_after_rewriter]: 7.21999e-06 [mutable_eliminate]: 0.00078255 [opt_b]: 0.00028023, [1] [Cycle 1]: 0.0002715, [7] [b_1]: 0.0001697 [b_2]: 9.45001e-06 [updatestate_depend_eliminate]: 8.56002e-06 [updatestate_assign_eliminate]: 3.23998e-06 [updatestate_loads_eliminate]: 3.48e-06 [renormalize]: 8.40024e-07 [cse]: 3.7e-05 [optimize_parallel_all_gather_comm]: 2.281e-05 [overlap_param_gather]: 2.26e-06 [cconv]: 3.623e-05 [loop_unroll]: 0.00057055 [opt_after_cconv]: 0.00014587, [1] [Cycle 1]: 0.00013786, [7] [c_1]: 4.482e-05 [parameter_eliminate]: 5.76e-06 [updatestate_depend_eliminate]: 8.55999e-06 [updatestate_assign_eliminate]: 3.25998e-06 [updatestate_loads_eliminate]: 4.42003e-06 [cse]: 3.161e-05 [renormalize]: 7.7e-07 [remove_dup_value]: 1.716e-05 [tuple_transform]: 9.753e-05, [1] [Cycle 1]: 9.158e-05, [4] [d_1]: 6.101e-05 [none_parameter_eliminate]: 2.11e-06 [renormalize]: 1.39989e-07 [switch_simplify]: 8.79998e-06 [partial_unused_args_eliminate]: 2.38998e-06 [add_recomputation]: 6.83e-05 [cse_after_recomputation]: 3.034e-05, [1] [Cycle 1]: 2.439e-05, [1] [cse]: 1.81e-05 [environ_conv]: 7.87e-06 [swap_dp_allreduce_reducescatter]: 7.04001e-06 [bias_add_comm_swap]: 3.06999e-06 [label_micro_interleaved_index]: 6.06e-06 [label_fine_grained_interleaved_index]: 2.71e-06 [merge_cast_opt]: 1.27e-06 [slice_recompute_activation]: 2.29001e-06 [micro_interleaved_order_control]: 3.04999e-06 [assign_add_opt]: 1.44998e-06 [ForceFp32Comm]: 8.30012e-07 [remove_cast_before_assign_add]: 1.12e-06 [full_micro_interleaved_order_control]: 2.06e-06 [reorder_send_recv_between_fp_bp]: 2.79999e-06 [comm_op_add_attrs]: 1.05999e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.29e-06 [overlap_opt_shard_grad_in_pipeline]: 2.14999e-06 [control_data_broadcast_order]: 2.124e-05 [grouped_pairwise_exchange_alltoall]: 1.72999e-06 [offloading_packed_experts]: 4.91002e-06 [overlap_recompute_and_grad_model_parallel]: 6.48e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.70001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.67999e-06 [overlap_recompute_comm]: 2.47001e-06 [overlap_grad_ring_attention]: 5.53997e-06 [overlap_grad_flash_sp]: 2.565e-05 [begin_end_overlap_inline]: 9.00007e-07 [split_matmul_comm_elemetwise]: 2.41e-06 [split_layernorm_comm]: 1.82999e-06 [handle_group_info]: 1.27999e-06 [symbol_engine_optimizer]: 0.00010632, [1] [Cycle 1]: 0.00010085, [6] [build]: 5.40001e-06 [elim_shapecalc]: 1.919e-05 [elim_not_effective]: 1.842e-05 [opt_reshape]: 8.95001e-06 [fold_const_symbol]: 1.285e-05 [renormalize]: 4.00003e-07 [detach_backward]: 2.73998e-06 [pipeline_parallel_scheduler]: 1.49998e-06 [auto_monad_reorder]: 2.584e-05 [get_jit_bprop_graph]: 2.11e-06 [rewriter_after_jit_bprop_graph]: 6.12999e-06 [opt_after_jit_grad]: 0.00064072 [validate]: 5.411e-05 Sums bootstrap : 0.000467s : 1.18% type_inference : 0.031820s : 80.32% event_method : 0.000162s : 0.41% auto_monad : 0.000124s : 0.31% graph_reusing : 0.000009s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000038s : 0.09% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000055s : 0.14% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000046s : 0.11% optimize.rewriter_before_opt_a : 0.000133s : 0.34% optimize.opt_a.expand_dump_flag : 0.000007s : 0.02% optimize.opt_a.switch_simplify : 0.000154s : 0.39% optimize.opt_a.loop_unroll : 0.000051s : 0.13% optimize.opt_a.a_1 : 0.001221s : 3.08% optimize.opt_a.with_stream_mark : 0.000049s : 0.12% optimize.opt_a.recompute_prepare : 0.000027s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000211s : 0.53% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.05% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000018s : 0.04% optimize.opt_a.merge_send_recv : 0.000020s : 0.05% optimize.opt_a.auto_parallel : 0.000018s : 0.05% optimize.opt_a.parallel : 0.000029s : 0.07% optimize.opt_a.flash_sp : 0.000014s : 0.04% optimize.opt_a.merge_comm : 0.000010s : 0.02% optimize.opt_a.allreduce_fusion : 0.000008s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.06% optimize.opt_a.virtual_dataset : 0.000016s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.04% optimize.opt_a.virtual_output : 0.000016s : 0.04% optimize.opt_a.merge_forward : 0.000010s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000040s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000031s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000032s : 0.08% optimize.opt_a.a_after_grad : 0.000026s : 0.07% optimize.opt_a.renormalize : 0.001372s : 3.46% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.03% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000038s : 0.10% optimize.opt_a.cse : 0.000064s : 0.16% optimize.opt_a.a_3 : 0.000126s : 0.32% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000050s : 0.13% optimize.convert_after_rewriter : 0.000008s : 0.02% optimize.order_py_execute_after_rewriter : 0.000007s : 0.02% optimize.mutable_eliminate : 0.000783s : 1.98% optimize.opt_b.b_1 : 0.000170s : 0.43% optimize.opt_b.b_2 : 0.000009s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000037s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000036s : 0.09% optimize.loop_unroll : 0.000571s : 1.44% optimize.opt_after_cconv.c_1 : 0.000045s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000032s : 0.08% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.04% optimize.tuple_transform.d_1 : 0.000061s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000068s : 0.17% optimize.cse_after_recomputation.cse : 0.000018s : 0.05% optimize.environ_conv : 0.000008s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000006s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000021s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000006s : 0.01% optimize.overlap_grad_flash_sp : 0.000026s : 0.06% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000019s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000026s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000641s : 1.62% validate : 0.000054s : 0.14% Time group info: ------[substitution.] 0.000328 44 10.42% : 0.000034s : 3: substitution.cast_eliminate 0.83% : 0.000003s : 3: substitution.elim_not_effective 0.54% : 0.000002s : 3: substitution.fold_const_symbol 2.66% : 0.000009s : 5: substitution.graph_param_transform 70.79% : 0.000232s : 8: substitution.inline 2.12% : 0.000007s : 6: substitution.j_node_and_user_rematch 2.31% : 0.000008s : 6: substitution.remove_not_recompute_node 2.28% : 0.000007s : 4: substitution.replace_old_param 3.50% : 0.000011s : 2: substitution.switch_simplify 4.54% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.031716 2 43.56% : 0.013816s : 1: type_inference.infer 56.44% : 0.017900s : 1: type_inference.specialize ------[replace.] 0.000155 14 43.10% : 0.000067s : 8: replace.inline 36.26% : 0.000056s : 2: replace.switch_simplify 20.63% : 0.000032s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000249 14 90.82% : 0.000226s : 8: match.inline 4.12% : 0.000010s : 2: match.switch_simplify 5.05% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000301 1746 0.88% : 0.000003s : 19: predicate.accumulaten_eliminater 0.77% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000002s : 10: predicate.addn_check_dump 0.99% : 0.000003s : 19: predicate.addn_zero_filter 0.83% : 0.000002s : 19: predicate.adjust_all_reduce_mul_add 1.99% : 0.000006s : 29: predicate.arithmetic_simplify 1.20% : 0.000004s : 19: predicate.cast_eliminate 0.54% : 0.000002s : 10: predicate.check_bprop_eliminate 0.45% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000001s : 5: predicate.const_output_eliminate 0.54% : 0.000002s : 10: predicate.depend_value_elim 0.99% : 0.000003s : 19: predicate.dict_get_item_const_eliminator 1.14% : 0.000003s : 19: predicate.dict_get_item_eliminator 0.96% : 0.000003s : 19: predicate.dict_set_item_eliminator 0.86% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.29% : 0.000001s : 5: predicate.elim_not_effective 0.49% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000004s : 24: predicate.environ_add_const_eliminate 1.07% : 0.000003s : 24: predicate.environ_get_add_eliminate 1.07% : 0.000003s : 24: predicate.environ_get_depend_swap 1.57% : 0.000005s : 34: predicate.environ_get_eliminate 1.11% : 0.000003s : 24: predicate.environ_get_set_eliminate 1.51% : 0.000005s : 31: predicate.exchange_switch_depend_value 2.53% : 0.000008s : 31: predicate.float_depend_g_call 0.48% : 0.000001s : 10: predicate.float_environ_get_switch 0.70% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.56% : 0.000002s : 10: predicate.get_grad_eliminate 0.18% : 0.000001s : 5: predicate.graph_param_transform 0.52% : 0.000002s : 10: predicate.incorporate_call 0.45% : 0.000001s : 10: predicate.incorporate_call_switch 7.07% : 0.000021s : 80: predicate.inline 0.94% : 0.000003s : 10: predicate.inline_without_move 0.27% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.78% : 0.000002s : 10: predicate.less_batch_normalization 1.66% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.44% : 0.000007s : 52: predicate.load_eliminater 0.99% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.40% : 0.000007s : 49: predicate.loop_unroll_before_grad 1.57% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.53% : 0.000002s : 10: predicate.merge_addn 0.49% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.49% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.84% : 0.000003s : 19: predicate.minmaximum_grad 1.19% : 0.000004s : 5: predicate.mutable_eliminate 0.30% : 0.000001s : 5: predicate.opt_reshape 0.32% : 0.000001s : 5: predicate.parallel_virtual_node 2.11% : 0.000006s : 31: predicate.partial_defer_inline 1.53% : 0.000005s : 28: predicate.partial_eliminate 0.94% : 0.000003s : 19: predicate.print_const_string_wrapper 0.63% : 0.000002s : 10: predicate.reduce_all_const_elim 1.33% : 0.000004s : 19: predicate.reduce_eliminate 2.48% : 0.000007s : 52: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 10: predicate.remove_not_recompute_node 1.29% : 0.000004s : 33: predicate.replace_applicator 0.48% : 0.000001s : 10: predicate.replace_old_param 0.40% : 0.000001s : 5: predicate.reset_defer_inline 1.01% : 0.000003s : 19: predicate.reshape_eliminate 0.51% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 5: predicate.row_tensor_eliminate 1.00% : 0.000003s : 10: predicate.same_eliminate 0.58% : 0.000002s : 10: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 10: predicate.shard_identity_eliminate 0.63% : 0.000002s : 10: predicate.special_op_eliminate 0.60% : 0.000002s : 10: predicate.specialize_transform 0.84% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.73% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.56% : 0.000005s : 31: predicate.switch_defer_inline 2.08% : 0.000006s : 41: predicate.switch_layer_defer_inline 5.44% : 0.000016s : 99: predicate.switch_simplify 0.96% : 0.000003s : 19: predicate.tile_eliminate 0.98% : 0.000003s : 19: predicate.transpose_eliminate 1.59% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.69% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.00% : 0.000009s : 43: predicate.tuple_list_get_item_eliminator 1.67% : 0.000005s : 29: predicate.tuple_list_get_set_item_eliminator 2.48% : 0.000007s : 39: predicate.tuple_list_set_item_eliminator 1.71% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.26% : 0.000007s : 52: predicate.updatestate_pure_node_eliminater 2.87% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 5: predicate.value_based_eliminate 0.62% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001179 16 55.55% : 0.000655s : 6: func_graph_cloner_run.FuncGraphClonerGraph 44.45% : 0.000524s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.060039 192 0.01% : 0.000004s : 1: ForceFp32Comm 7.04% : 0.004228s : 1: add_attr 7.01% : 0.004210s : 1: add_attr_with_inline 0.01% : 0.000005s : 1: add_comm_op_reuse_tag 0.12% : 0.000074s : 1: add_recomputation 0.01% : 0.000005s : 1: assign_add_opt 0.22% : 0.000134s : 1: auto_monad 0.05% : 0.000030s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.83% : 0.000500s : 1: bootstrap 0.07% : 0.000040s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000026s : 1: control_data_broadcast_order 0.02% : 0.000012s : 1: convert_after_rewriter 0.06% : 0.000033s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.29% : 0.000177s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.02% : 0.000014s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000009s : 1: label_micro_interleaved_index 1.03% : 0.000619s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 1.32% : 0.000795s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.04% : 0.000022s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000025s : 1: opt.transform.mutable_eliminate 3.18% : 0.001909s : 78: opt.transform.opt_a 0.07% : 0.000042s : 1: opt.transform.opt_after_cconv 0.06% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.24% : 0.000141s : 28: opt.transform.opt_b 0.11% : 0.000067s : 2: opt.transform.opt_trans_graph 0.09% : 0.000054s : 4: opt.transform.symbol_engine_opt 7.29% : 0.004377s : 1: opt_a 0.25% : 0.000150s : 1: opt_after_cconv 1.09% : 0.000654s : 1: opt_after_jit_grad 0.47% : 0.000284s : 1: opt_b 12.10% : 0.007263s : 1: optimize 0.04% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000010s : 1: order_py_execute_after_rewriter 0.05% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.07% : 0.000045s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.10% : 0.000059s : 1: pre_auto_parallel 0.08% : 0.000050s : 1: py_interpret_to_execute 0.04% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000022s : 1: remove_dup_value 1.23% : 0.000739s : 1: renormalize.infer 1.03% : 0.000619s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000054s : 1: rewriter_after_opt_a 0.23% : 0.000139s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.18% : 0.000109s : 1: symbol_engine_optimizer 0.17% : 0.000100s : 1: tuple_transform 53.05% : 0.031853s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:15.251.416 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:15.251.705 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0401273, [21] [bootstrap]: 0.00047363 [type_inference]: 0.0293806 [event_method]: 4.754e-05 [auto_monad]: 9.661e-05 [graph_reusing]: 7.33e-06 [inline]: 2.80002e-06 [add_attr]: 0.00311674, [1] [add_attr_with_inline]: 0.00310841, [1] [Cycle 1]: 8.332e-05, [2] [tag_attr]: 2.685e-05 [meta_addattr_fg_expand]: 8.25e-06 [parallel-infer-symbol]: 3.3e-06 [pre_auto_parallel]: 4.477e-05 [insert-virtual-dataset]: 2.33002e-06 [parallel-infer-symbol-second]: 7.10017e-07 [dataset_repeat_opt]: 2.29999e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.00578823, [53] [py_interpret_to_execute]: 3.547e-05 [rewriter_before_opt_a]: 0.00011451 [opt_a]: 0.00349702, [2] [Cycle 1]: 0.0026371, [45] [expand_dump_flag]: 4.03001e-06 [switch_simplify]: 0.00012226 [loop_unroll]: 4.11e-05 [a_1]: 0.00083081 [with_stream_mark]: 1.603e-05 [recompute_prepare]: 9.14e-06 [updatestate_depend_eliminate]: 4.11001e-06 [updatestate_assign_eliminate]: 3.50998e-06 [updatestate_loads_eliminate]: 2.79001e-06 [parameter_eliminate]: 1.97999e-06 [a_2]: 0.00011121 [accelerated_algorithm]: 7.54002e-06 [shard]: 1.96e-06 [meta_shard_fg_expand]: 2.12999e-06 [shard_inline]: 6.73e-06 [merge_send_recv]: 8.95001e-06 [auto_parallel]: 6.74999e-06 [parallel]: 1.871e-05 [flash_sp]: 8.31002e-06 [merge_comm]: 4.26001e-06 [allreduce_fusion]: 3.63e-06 [matmul_add_comm_reduction]: 9.47999e-06 [allreduce_slice_to_reducescatter]: 5.99975e-07 [virtual_shard_identity]: 8.57e-06 [virtual_dataset]: 7.19001e-06 [get_grad_eliminate_]: 6.66e-06 [virtual_output]: 6.61999e-06 [merge_forward]: 3.76999e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 1.003e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.531e-05 [merge_recompute_call_nodes]: 1.67001e-06 [before_grad]: 1.086e-05 [set_forward_comm_id_for_comm_node_pass]: 3.6e-06 [meta_fg_expand]: 3.46001e-06 [flash_sp_send_recv_attached]: 2.64001e-06 [receive_attached]: 2.40002e-06 [after_resolve]: 1.181e-05 [a_after_grad]: 1.06e-05 [renormalize]: 0.0007974 [add_forward_monad_depend]: 5.34e-06 [auto_monad_grad]: 2.19001e-06 [auto_monad_eliminator]: 1.635e-05 [cse]: 2.653e-05 [a_3]: 6.32e-05 [Cycle 2]: 0.00084588, [45] [expand_dump_flag]: 9.89996e-07 [switch_simplify]: 7.82e-06 [loop_unroll]: 6.56e-06 [a_1]: 0.00013589 [with_stream_mark]: 1.143e-05 [recompute_prepare]: 6.56e-06 [updatestate_depend_eliminate]: 3.23998e-06 [updatestate_assign_eliminate]: 2.34001e-06 [updatestate_loads_eliminate]: 2.54999e-06 [parameter_eliminate]: 1.39998e-06 [a_2]: 0.000102 [accelerated_algorithm]: 6.60002e-06 [shard]: 1.11002e-06 [meta_shard_fg_expand]: 1.29998e-06 [shard_inline]: 6.64001e-06 [merge_send_recv]: 4.74e-06 [auto_parallel]: 5.49e-06 [parallel]: 5.05001e-06 [flash_sp]: 3.81999e-06 [merge_comm]: 3.20002e-06 [allreduce_fusion]: 3.04999e-06 [matmul_add_comm_reduction]: 6.56e-06 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 7.49002e-06 [virtual_dataset]: 7.16999e-06 [get_grad_eliminate_]: 6.94999e-06 [virtual_output]: 6.43998e-06 [merge_forward]: 3.46999e-06 [cell_reuse_recompute_pass]: 1.47999e-06 [offload_activation]: 6.55997e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.704e-05 [merge_recompute_call_nodes]: 9.09989e-07 [before_grad]: 9.78002e-06 [set_forward_comm_id_for_comm_node_pass]: 4.03001e-06 [meta_fg_expand]: 2.16e-06 [flash_sp_send_recv_attached]: 8.39995e-07 [receive_attached]: 1.09e-06 [after_resolve]: 1.15e-05 [a_after_grad]: 1.056e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.40999e-06 [auto_monad_grad]: 1.01002e-06 [auto_monad_eliminator]: 7.51999e-06 [cse]: 1.312e-05 [a_3]: 5.105e-05 [py_interpret_to_execute_after_opt_a]: 1.196e-05 [slice_cell_reuse_recomputed_activation]: 4.90999e-06 [rewriter_after_opt_a]: 3.79e-05 [convert_after_rewriter]: 9.80002e-06 [order_py_execute_after_rewriter]: 8.48999e-06 [mutable_eliminate]: 0.00049853 [opt_b]: 0.00027022, [1] [Cycle 1]: 0.00026164, [7] [b_1]: 0.00016855 [b_2]: 8.40999e-06 [updatestate_depend_eliminate]: 5.76e-06 [updatestate_assign_eliminate]: 2.51998e-06 [updatestate_loads_eliminate]: 2.26998e-06 [renormalize]: 6.50005e-07 [cse]: 1.674e-05 [optimize_parallel_all_gather_comm]: 1.877e-05 [overlap_param_gather]: 4.80999e-06 [cconv]: 2.81e-05 [loop_unroll]: 0.00043482 [opt_after_cconv]: 0.00012585, [1] [Cycle 1]: 0.00011724, [7] [c_1]: 3.206e-05 [parameter_eliminate]: 3.04999e-06 [updatestate_depend_eliminate]: 4.85001e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 2.17001e-06 [cse]: 1.625e-05 [renormalize]: 2.89991e-07 [remove_dup_value]: 1.575e-05 [tuple_transform]: 9.062e-05, [1] [Cycle 1]: 8.421e-05, [4] [d_1]: 4.533e-05 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.98e-06 [partial_unused_args_eliminate]: 4.08999e-06 [add_recomputation]: 5.061e-05 [cse_after_recomputation]: 2.743e-05, [1] [Cycle 1]: 2.01e-05, [1] [cse]: 1.077e-05 [environ_conv]: 7.94002e-06 [swap_dp_allreduce_reducescatter]: 7.95998e-06 [bias_add_comm_swap]: 5.64e-06 [label_micro_interleaved_index]: 7.08e-06 [label_fine_grained_interleaved_index]: 5.24998e-06 [merge_cast_opt]: 4.21001e-06 [slice_recompute_activation]: 4.38001e-06 [micro_interleaved_order_control]: 4.94998e-06 [assign_add_opt]: 3.84002e-06 [ForceFp32Comm]: 3.11001e-06 [remove_cast_before_assign_add]: 3.36001e-06 [full_micro_interleaved_order_control]: 4.52e-06 [reorder_send_recv_between_fp_bp]: 5.56e-06 [comm_op_add_attrs]: 3.60998e-06 [add_comm_op_reuse_tag]: 3.13998e-06 [interleave_split_concat_branches]: 3.52002e-06 [interleave_parallel_branches]: 1.669e-05 [overlap_opt_shard_in_pipeline]: 3.8e-06 [overlap_opt_shard_grad_in_pipeline]: 4.15e-06 [control_data_broadcast_order]: 1.564e-05 [grouped_pairwise_exchange_alltoall]: 4.07e-06 [offloading_packed_experts]: 6.55997e-06 [overlap_recompute_and_grad_model_parallel]: 6.88e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.55e-06 [overlap_recompute_allgather_and_fa_grad]: 4.12e-06 [overlap_recompute_comm]: 4.66002e-06 [overlap_grad_ring_attention]: 6.46e-06 [overlap_grad_flash_sp]: 2.144e-05 [begin_end_overlap_inline]: 2.97002e-06 [split_matmul_comm_elemetwise]: 4.63001e-06 [split_layernorm_comm]: 4.2e-06 [handle_group_info]: 3.23e-06 [symbol_engine_optimizer]: 9.686e-05, [1] [Cycle 1]: 9.005e-05, [6] [build]: 2.76999e-06 [elim_shapecalc]: 1.019e-05 [elim_not_effective]: 1.441e-05 [opt_reshape]: 7.21001e-06 [fold_const_symbol]: 1.051e-05 [renormalize]: 2.29978e-07 [detach_backward]: 3.76999e-06 [pipeline_parallel_scheduler]: 2.20002e-06 [auto_monad_reorder]: 1.949e-05 [get_jit_bprop_graph]: 1.64998e-06 [rewriter_after_jit_bprop_graph]: 4.41002e-06 [opt_after_jit_grad]: 0.00048333 [validate]: 3.644e-05 Sums bootstrap : 0.000474s : 1.34% type_inference : 0.029381s : 83.37% event_method : 0.000048s : 0.13% auto_monad : 0.000097s : 0.27% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000045s : 0.13% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000035s : 0.10% optimize.rewriter_before_opt_a : 0.000115s : 0.32% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000130s : 0.37% optimize.opt_a.loop_unroll : 0.000048s : 0.14% optimize.opt_a.a_1 : 0.000967s : 2.74% optimize.opt_a.with_stream_mark : 0.000027s : 0.08% optimize.opt_a.recompute_prepare : 0.000016s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000213s : 0.61% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.04% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.04% optimize.opt_a.merge_send_recv : 0.000014s : 0.04% optimize.opt_a.auto_parallel : 0.000012s : 0.03% optimize.opt_a.parallel : 0.000024s : 0.07% optimize.opt_a.flash_sp : 0.000012s : 0.03% optimize.opt_a.merge_comm : 0.000007s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.05% optimize.opt_a.virtual_dataset : 0.000014s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.04% optimize.opt_a.virtual_output : 0.000013s : 0.04% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000017s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000021s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000023s : 0.07% optimize.opt_a.a_after_grad : 0.000021s : 0.06% optimize.opt_a.renormalize : 0.000797s : 2.26% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.07% optimize.opt_a.cse : 0.000040s : 0.11% optimize.opt_a.a_3 : 0.000114s : 0.32% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000038s : 0.11% optimize.convert_after_rewriter : 0.000010s : 0.03% optimize.order_py_execute_after_rewriter : 0.000008s : 0.02% optimize.mutable_eliminate : 0.000499s : 1.41% optimize.opt_b.b_1 : 0.000169s : 0.48% optimize.opt_b.b_2 : 0.000008s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000017s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.05% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000028s : 0.08% optimize.loop_unroll : 0.000435s : 1.23% optimize.opt_after_cconv.c_1 : 0.000032s : 0.09% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000016s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.04% optimize.tuple_transform.d_1 : 0.000045s : 0.13% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000004s : 0.01% optimize.add_recomputation : 0.000051s : 0.14% optimize.cse_after_recomputation.cse : 0.000011s : 0.03% optimize.environ_conv : 0.000008s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000004s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000017s : 0.05% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000006s : 0.02% optimize.overlap_grad_flash_sp : 0.000021s : 0.06% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.06% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000483s : 1.37% validate : 0.000036s : 0.10% Time group info: ------[substitution.] 0.000214 34 0.99% : 0.000002s : 2: substitution.elim_not_effective 0.62% : 0.000001s : 2: substitution.fold_const_symbol 2.87% : 0.000006s : 4: substitution.graph_param_transform 77.84% : 0.000167s : 8: substitution.inline 1.68% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.15% : 0.000005s : 4: substitution.remove_not_recompute_node 2.05% : 0.000004s : 4: substitution.replace_old_param 4.52% : 0.000010s : 2: substitution.switch_simplify 7.27% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.029315 2 95.54% : 0.028007s : 1: type_inference.infer 4.46% : 0.001308s : 1: type_inference.specialize ------[replace.] 0.000117 14 43.87% : 0.000051s : 8: replace.inline 37.51% : 0.000044s : 2: replace.switch_simplify 18.62% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000184 14 88.09% : 0.000162s : 8: match.inline 4.61% : 0.000008s : 2: match.switch_simplify 7.30% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000238 1520 1.00% : 0.000002s : 17: predicate.accumulaten_eliminater 0.75% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.44% : 0.000001s : 8: predicate.addn_check_dump 0.98% : 0.000002s : 17: predicate.addn_zero_filter 0.91% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 1.99% : 0.000005s : 25: predicate.arithmetic_simplify 0.98% : 0.000002s : 17: predicate.cast_eliminate 0.52% : 0.000001s : 8: predicate.check_bprop_eliminate 0.44% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.53% : 0.000001s : 8: predicate.depend_value_elim 1.08% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.21% : 0.000003s : 17: predicate.dict_get_item_eliminator 1.11% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.75% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.32% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.21% : 0.000003s : 21: predicate.environ_get_depend_swap 1.84% : 0.000004s : 29: predicate.environ_get_eliminate 1.13% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.75% : 0.000004s : 29: predicate.exchange_switch_depend_value 2.69% : 0.000006s : 29: predicate.float_depend_g_call 0.44% : 0.000001s : 8: predicate.float_environ_get_switch 0.66% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.59% : 0.000001s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.53% : 0.000001s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 6.32% : 0.000015s : 70: predicate.inline 0.69% : 0.000002s : 8: predicate.inline_without_move 0.27% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.78% : 0.000002s : 8: predicate.less_batch_normalization 1.80% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.57% : 0.000006s : 46: predicate.load_eliminater 0.76% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.86% : 0.000007s : 47: predicate.loop_unroll_before_grad 1.62% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.48% : 0.000001s : 8: predicate.merge_addn 0.50% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.51% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 17: predicate.minmaximum_grad 0.83% : 0.000002s : 4: predicate.mutable_eliminate 0.30% : 0.000001s : 4: predicate.opt_reshape 0.41% : 0.000001s : 4: predicate.parallel_virtual_node 2.24% : 0.000005s : 29: predicate.partial_defer_inline 1.71% : 0.000004s : 25: predicate.partial_eliminate 1.00% : 0.000002s : 17: predicate.print_const_string_wrapper 0.52% : 0.000001s : 8: predicate.reduce_all_const_elim 1.26% : 0.000003s : 17: predicate.reduce_eliminate 2.50% : 0.000006s : 46: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 8: predicate.remove_not_recompute_node 1.41% : 0.000003s : 29: predicate.replace_applicator 0.45% : 0.000001s : 8: predicate.replace_old_param 0.24% : 0.000001s : 4: predicate.reset_defer_inline 1.01% : 0.000002s : 17: predicate.reshape_eliminate 0.50% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 4: predicate.row_tensor_eliminate 0.61% : 0.000001s : 8: predicate.same_eliminate 0.41% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.66% : 0.000002s : 8: predicate.shard_identity_eliminate 0.60% : 0.000001s : 8: predicate.special_op_eliminate 0.62% : 0.000001s : 8: predicate.specialize_transform 0.67% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.66% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.94% : 0.000005s : 29: predicate.switch_defer_inline 2.32% : 0.000005s : 37: predicate.switch_layer_defer_inline 6.03% : 0.000014s : 92: predicate.switch_simplify 0.97% : 0.000002s : 17: predicate.tile_eliminate 1.15% : 0.000003s : 17: predicate.transpose_eliminate 1.55% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.73% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.06% : 0.000007s : 37: predicate.tuple_list_get_item_eliminator 1.48% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.05% : 0.000005s : 33: predicate.tuple_list_set_item_eliminator 1.71% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.46% : 0.000006s : 46: predicate.updatestate_pure_node_eliminater 3.06% : 0.000007s : 54: predicate.updatestate_useless_node_eliminater 0.27% : 0.000001s : 4: predicate.value_based_eliminate 0.68% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.56% : 0.000001s : 8: predicate.virtual_output_eliminate 0.23% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.016982 16 97.73% : 0.016595s : 6: func_graph_cloner_run.FuncGraphClonerGraph 2.27% : 0.000386s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.051471 192 0.01% : 0.000006s : 1: ForceFp32Comm 6.07% : 0.003126s : 1: add_attr 6.05% : 0.003112s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.11% : 0.000054s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.21% : 0.000107s : 1: auto_monad 0.05% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.01% : 0.000518s : 1: bootstrap 0.06% : 0.000031s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.04% : 0.000019s : 1: control_data_broadcast_order 0.02% : 0.000013s : 1: convert_after_rewriter 0.06% : 0.000031s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000020s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.12% : 0.000060s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.03% : 0.000014s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.02% : 0.000009s : 1: inline 0.02% : 0.000008s : 1: insert-virtual-dataset 0.04% : 0.000020s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 0.86% : 0.000441s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.98% : 0.000504s : 1: mutable_eliminate 0.02% : 0.000009s : 1: offloading_packed_experts 0.03% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000014s : 1: opt.transform.mutable_eliminate 2.91% : 0.001496s : 78: opt.transform.opt_a 0.06% : 0.000031s : 1: opt.transform.opt_after_cconv 0.05% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000105s : 28: opt.transform.opt_b 0.10% : 0.000050s : 2: opt.transform.opt_trans_graph 0.08% : 0.000039s : 4: opt.transform.symbol_engine_opt 6.80% : 0.003500s : 1: opt_a 0.25% : 0.000129s : 1: opt_after_cconv 0.96% : 0.000494s : 1: opt_after_jit_grad 0.53% : 0.000274s : 1: opt_b 11.93% : 0.006139s : 1: optimize 0.04% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000011s : 1: order_py_execute_after_rewriter 0.05% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.10% : 0.000053s : 1: pre_auto_parallel 0.08% : 0.000039s : 1: py_interpret_to_execute 0.03% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.04% : 0.000019s : 1: remove_dup_value 0.79% : 0.000407s : 1: renormalize.infer 0.74% : 0.000382s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000041s : 1: rewriter_after_opt_a 0.23% : 0.000118s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.19% : 0.000100s : 1: symbol_engine_optimizer 0.18% : 0.000093s : 1: tuple_transform 57.16% : 0.029421s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:15.729.869 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0388296, [21] [bootstrap]: 0.00043547 [type_inference]: 0.0294812 [event_method]: 5.304e-05 [auto_monad]: 9.669e-05 [graph_reusing]: 7.55e-06 [inline]: 2.37999e-06 [add_attr]: 0.00306566, [1] [add_attr_with_inline]: 0.00305748, [1] [Cycle 1]: 6.427e-05, [2] [tag_attr]: 2.536e-05 [meta_addattr_fg_expand]: 7.71001e-06 [parallel-infer-symbol]: 2.97002e-06 [pre_auto_parallel]: 3.901e-05 [insert-virtual-dataset]: 2.65002e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 1.78997e-06 [pipeline_split]: 2.09e-06 [optimize]: 0.00496652, [53] [py_interpret_to_execute]: 3.06e-05 [rewriter_before_opt_a]: 0.00010802 [opt_a]: 0.00299171, [2] [Cycle 1]: 0.00233606, [45] [expand_dump_flag]: 4.15999e-06 [switch_simplify]: 0.00011907 [loop_unroll]: 4.407e-05 [a_1]: 0.00082557 [with_stream_mark]: 1.433e-05 [recompute_prepare]: 8.56002e-06 [updatestate_depend_eliminate]: 4.09997e-06 [updatestate_assign_eliminate]: 3.01001e-06 [updatestate_loads_eliminate]: 2.59001e-06 [parameter_eliminate]: 2.39999e-06 [a_2]: 8.32e-05 [accelerated_algorithm]: 6.91001e-06 [shard]: 1.71e-06 [meta_shard_fg_expand]: 1.99e-06 [shard_inline]: 6.51e-06 [merge_send_recv]: 8.45001e-06 [auto_parallel]: 5.97001e-06 [parallel]: 1.797e-05 [flash_sp]: 7.23e-06 [merge_comm]: 4.4e-06 [allreduce_fusion]: 3.88001e-06 [matmul_add_comm_reduction]: 8.60001e-06 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 7.94997e-06 [virtual_dataset]: 6.69001e-06 [get_grad_eliminate_]: 6.54999e-06 [virtual_output]: 6.46999e-06 [merge_forward]: 3.61999e-06 [cell_reuse_recompute_pass]: 1.27999e-06 [offload_activation]: 9.10999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.231e-05 [merge_recompute_call_nodes]: 1.76003e-06 [before_grad]: 1.073e-05 [set_forward_comm_id_for_comm_node_pass]: 3.70998e-06 [meta_fg_expand]: 3.21001e-06 [flash_sp_send_recv_attached]: 2.59001e-06 [receive_attached]: 2.24001e-06 [after_resolve]: 1.141e-05 [a_after_grad]: 9.81e-06 [renormalize]: 0.00070864 [add_forward_monad_depend]: 5.00001e-06 [auto_monad_grad]: 1.62001e-06 [auto_monad_eliminator]: 1.571e-05 [cse]: 2.674e-05 [a_3]: 4.845e-05 [Cycle 2]: 0.00064673, [45] [expand_dump_flag]: 1.04e-06 [switch_simplify]: 8.33001e-06 [loop_unroll]: 7.06001e-06 [a_1]: 0.00013587 [with_stream_mark]: 1.052e-05 [recompute_prepare]: 6.52001e-06 [updatestate_depend_eliminate]: 2.91e-06 [updatestate_assign_eliminate]: 2.26e-06 [updatestate_loads_eliminate]: 2.79001e-06 [parameter_eliminate]: 1.04998e-06 [a_2]: 7.5e-05 [accelerated_algorithm]: 6.65002e-06 [shard]: 1.26002e-06 [meta_shard_fg_expand]: 1.32999e-06 [shard_inline]: 6.63e-06 [merge_send_recv]: 4.43001e-06 [auto_parallel]: 5.25999e-06 [parallel]: 4.27e-06 [flash_sp]: 3.49001e-06 [merge_comm]: 3.16001e-06 [allreduce_fusion]: 2.98e-06 [matmul_add_comm_reduction]: 5.64e-06 [allreduce_slice_to_reducescatter]: 3.9002e-07 [virtual_shard_identity]: 6.68e-06 [virtual_dataset]: 6.07999e-06 [get_grad_eliminate_]: 5.92999e-06 [virtual_output]: 5.62001e-06 [merge_forward]: 2.53e-06 [cell_reuse_recompute_pass]: 1.54e-06 [offload_activation]: 5.92001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.323e-05 [merge_recompute_call_nodes]: 9.39996e-07 [before_grad]: 9.74e-06 [set_forward_comm_id_for_comm_node_pass]: 3.83001e-06 [meta_fg_expand]: 2.01e-06 [flash_sp_send_recv_attached]: 8.60018e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 1.059e-05 [a_after_grad]: 9.32999e-06 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.10001e-06 [auto_monad_grad]: 9.50007e-07 [auto_monad_eliminator]: 6.76e-06 [cse]: 1.251e-05 [a_3]: 3.719e-05 [py_interpret_to_execute_after_opt_a]: 8.10999e-06 [slice_cell_reuse_recomputed_activation]: 1.81998e-06 [rewriter_after_opt_a]: 3.297e-05 [convert_after_rewriter]: 6.64999e-06 [order_py_execute_after_rewriter]: 5.16998e-06 [mutable_eliminate]: 0.00045761 [opt_b]: 0.00020215, [1] [Cycle 1]: 0.00019623, [7] [b_1]: 0.00012635 [b_2]: 8.27e-06 [updatestate_depend_eliminate]: 4.89e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.14e-06 [renormalize]: 3.50003e-07 [cse]: 1.601e-05 [optimize_parallel_all_gather_comm]: 1.544e-05 [overlap_param_gather]: 1.76e-06 [cconv]: 2.468e-05 [loop_unroll]: 0.00041979 [opt_after_cconv]: 0.00010098, [1] [Cycle 1]: 9.561e-05, [7] [c_1]: 3.233e-05 [parameter_eliminate]: 2.84999e-06 [updatestate_depend_eliminate]: 4.83001e-06 [updatestate_assign_eliminate]: 2.33998e-06 [updatestate_loads_eliminate]: 2.26e-06 [cse]: 1.645e-05 [renormalize]: 6.19999e-07 [remove_dup_value]: 1.319e-05 [tuple_transform]: 7.511e-05, [1] [Cycle 1]: 7.113e-05, [4] [d_1]: 4.363e-05 [none_parameter_eliminate]: 1.79e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 6.84999e-06 [partial_unused_args_eliminate]: 1.76998e-06 [add_recomputation]: 4.627e-05 [cse_after_recomputation]: 2.024e-05, [1] [Cycle 1]: 1.61e-05, [1] [cse]: 1.054e-05 [environ_conv]: 4.70999e-06 [swap_dp_allreduce_reducescatter]: 4.77e-06 [bias_add_comm_swap]: 3.13e-06 [label_micro_interleaved_index]: 3.97998e-06 [label_fine_grained_interleaved_index]: 2.91e-06 [merge_cast_opt]: 1.37999e-06 [slice_recompute_activation]: 2.16e-06 [micro_interleaved_order_control]: 2.34999e-06 [assign_add_opt]: 1.15999e-06 [ForceFp32Comm]: 7.90023e-07 [remove_cast_before_assign_add]: 1.09e-06 [full_micro_interleaved_order_control]: 2.16e-06 [reorder_send_recv_between_fp_bp]: 2.76e-06 [comm_op_add_attrs]: 1.24e-06 [add_comm_op_reuse_tag]: 9.10019e-07 [interleave_split_concat_branches]: 1.33002e-06 [interleave_parallel_branches]: 1.07998e-06 [overlap_opt_shard_in_pipeline]: 1.26997e-06 [overlap_opt_shard_grad_in_pipeline]: 1.97001e-06 [control_data_broadcast_order]: 1.302e-05 [grouped_pairwise_exchange_alltoall]: 2.41e-06 [offloading_packed_experts]: 3.83999e-06 [overlap_recompute_and_grad_model_parallel]: 4.58001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.21002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32e-06 [overlap_recompute_comm]: 1.94e-06 [overlap_grad_ring_attention]: 3.76001e-06 [overlap_grad_flash_sp]: 1.776e-05 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 2.31998e-06 [split_layernorm_comm]: 1.75001e-06 [handle_group_info]: 8.60018e-07 [symbol_engine_optimizer]: 7.672e-05, [1] [Cycle 1]: 7.226e-05, [6] [build]: 2.94001e-06 [elim_shapecalc]: 1.035e-05 [elim_not_effective]: 1.311e-05 [opt_reshape]: 7.26001e-06 [fold_const_symbol]: 1.032e-05 [renormalize]: 3.09985e-07 [detach_backward]: 1.89999e-06 [pipeline_parallel_scheduler]: 1.69e-06 [auto_monad_reorder]: 1.612e-05 [get_jit_bprop_graph]: 1.19e-06 [rewriter_after_jit_bprop_graph]: 4.1e-06 [opt_after_jit_grad]: 0.00045832 [validate]: 3.521e-05 Sums bootstrap : 0.000435s : 1.25% type_inference : 0.029481s : 84.69% event_method : 0.000053s : 0.15% auto_monad : 0.000097s : 0.28% graph_reusing : 0.000008s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000039s : 0.11% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.09% optimize.rewriter_before_opt_a : 0.000108s : 0.31% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000127s : 0.37% optimize.opt_a.loop_unroll : 0.000051s : 0.15% optimize.opt_a.a_1 : 0.000961s : 2.76% optimize.opt_a.with_stream_mark : 0.000025s : 0.07% optimize.opt_a.recompute_prepare : 0.000015s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000158s : 0.45% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.04% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.04% optimize.opt_a.merge_send_recv : 0.000013s : 0.04% optimize.opt_a.auto_parallel : 0.000011s : 0.03% optimize.opt_a.parallel : 0.000022s : 0.06% optimize.opt_a.flash_sp : 0.000011s : 0.03% optimize.opt_a.merge_comm : 0.000008s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.04% optimize.opt_a.virtual_dataset : 0.000013s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.04% optimize.opt_a.virtual_output : 0.000012s : 0.03% optimize.opt_a.merge_forward : 0.000006s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000015s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000020s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000022s : 0.06% optimize.opt_a.a_after_grad : 0.000019s : 0.05% optimize.opt_a.renormalize : 0.000709s : 2.04% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.06% optimize.opt_a.cse : 0.000039s : 0.11% optimize.opt_a.a_3 : 0.000086s : 0.25% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000033s : 0.09% optimize.convert_after_rewriter : 0.000007s : 0.02% optimize.order_py_execute_after_rewriter : 0.000005s : 0.01% optimize.mutable_eliminate : 0.000458s : 1.31% optimize.opt_b.b_1 : 0.000126s : 0.36% optimize.opt_b.b_2 : 0.000008s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000016s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000015s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000025s : 0.07% optimize.loop_unroll : 0.000420s : 1.21% optimize.opt_after_cconv.c_1 : 0.000032s : 0.09% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000016s : 0.05% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000013s : 0.04% optimize.tuple_transform.d_1 : 0.000044s : 0.13% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000046s : 0.13% optimize.cse_after_recomputation.cse : 0.000011s : 0.03% optimize.environ_conv : 0.000005s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000013s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000018s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000016s : 0.05% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000458s : 1.32% validate : 0.000035s : 0.10% Time group info: ------[substitution.] 0.000209 34 0.90% : 0.000002s : 2: substitution.elim_not_effective 0.68% : 0.000001s : 2: substitution.fold_const_symbol 2.57% : 0.000005s : 4: substitution.graph_param_transform 78.14% : 0.000164s : 8: substitution.inline 1.65% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.23% : 0.000005s : 4: substitution.remove_not_recompute_node 1.83% : 0.000004s : 4: substitution.replace_old_param 4.60% : 0.000010s : 2: substitution.switch_simplify 7.41% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.029410 2 95.43% : 0.028066s : 1: type_inference.infer 4.57% : 0.001344s : 1: type_inference.specialize ------[replace.] 0.000115 14 44.43% : 0.000051s : 8: replace.inline 36.40% : 0.000042s : 2: replace.switch_simplify 19.17% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000181 14 87.94% : 0.000159s : 8: match.inline 4.67% : 0.000008s : 2: match.switch_simplify 7.39% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000236 1520 0.97% : 0.000002s : 17: predicate.accumulaten_eliminater 0.65% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 8: predicate.addn_check_dump 0.99% : 0.000002s : 17: predicate.addn_zero_filter 0.93% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.09% : 0.000005s : 25: predicate.arithmetic_simplify 1.07% : 0.000003s : 17: predicate.cast_eliminate 0.49% : 0.000001s : 8: predicate.check_bprop_eliminate 0.41% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.46% : 0.000001s : 8: predicate.depend_value_elim 1.07% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.17% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.99% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.76% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.35% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.19% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.16% : 0.000003s : 21: predicate.environ_get_depend_swap 1.71% : 0.000004s : 29: predicate.environ_get_eliminate 1.12% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.70% : 0.000004s : 29: predicate.exchange_switch_depend_value 2.80% : 0.000007s : 29: predicate.float_depend_g_call 0.46% : 0.000001s : 8: predicate.float_environ_get_switch 0.62% : 0.000001s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.66% : 0.000002s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.54% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.31% : 0.000015s : 70: predicate.inline 0.69% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.62% : 0.000001s : 8: predicate.less_batch_normalization 2.00% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.65% : 0.000006s : 46: predicate.load_eliminater 0.82% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.86% : 0.000007s : 47: predicate.loop_unroll_before_grad 1.67% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.43% : 0.000001s : 8: predicate.merge_addn 0.48% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.96% : 0.000002s : 17: predicate.minmaximum_grad 0.79% : 0.000002s : 4: predicate.mutable_eliminate 0.29% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 2.10% : 0.000005s : 29: predicate.partial_defer_inline 1.70% : 0.000004s : 25: predicate.partial_eliminate 1.01% : 0.000002s : 17: predicate.print_const_string_wrapper 0.43% : 0.000001s : 8: predicate.reduce_all_const_elim 1.27% : 0.000003s : 17: predicate.reduce_eliminate 2.55% : 0.000006s : 46: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 8: predicate.remove_not_recompute_node 1.35% : 0.000003s : 29: predicate.replace_applicator 0.52% : 0.000001s : 8: predicate.replace_old_param 0.25% : 0.000001s : 4: predicate.reset_defer_inline 1.10% : 0.000003s : 17: predicate.reshape_eliminate 0.52% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.29% : 0.000001s : 4: predicate.row_tensor_eliminate 0.59% : 0.000001s : 8: predicate.same_eliminate 0.39% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.62% : 0.000001s : 8: predicate.shard_identity_eliminate 0.55% : 0.000001s : 8: predicate.special_op_eliminate 0.57% : 0.000001s : 8: predicate.specialize_transform 0.64% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.61% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.89% : 0.000004s : 29: predicate.switch_defer_inline 2.29% : 0.000005s : 37: predicate.switch_layer_defer_inline 6.17% : 0.000015s : 92: predicate.switch_simplify 0.98% : 0.000002s : 17: predicate.tile_eliminate 1.02% : 0.000002s : 17: predicate.transpose_eliminate 1.52% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.10% : 0.000007s : 37: predicate.tuple_list_get_item_eliminator 1.48% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.14% : 0.000005s : 33: predicate.tuple_list_set_item_eliminator 1.73% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.52% : 0.000006s : 46: predicate.updatestate_pure_node_eliminater 3.11% : 0.000007s : 54: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.54% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.52% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.57% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000960 16 59.55% : 0.000572s : 6: func_graph_cloner_run.FuncGraphClonerGraph 40.45% : 0.000388s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.049196 192 0.01% : 0.000004s : 1: ForceFp32Comm 6.24% : 0.003071s : 1: add_attr 6.22% : 0.003061s : 1: add_attr_with_inline 0.03% : 0.000017s : 1: add_comm_op_reuse_tag 0.10% : 0.000050s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.21% : 0.000103s : 1: auto_monad 0.04% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.94% : 0.000465s : 1: bootstrap 0.06% : 0.000028s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000016s : 1: control_data_broadcast_order 0.02% : 0.000010s : 1: convert_after_rewriter 0.05% : 0.000023s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000008s : 1: environ_conv 0.12% : 0.000061s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.87% : 0.000428s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.95% : 0.000466s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000014s : 1: opt.transform.mutable_eliminate 3.00% : 0.001475s : 78: opt.transform.opt_a 0.06% : 0.000031s : 1: opt.transform.opt_after_cconv 0.05% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.21% : 0.000104s : 28: opt.transform.opt_b 0.10% : 0.000048s : 2: opt.transform.opt_trans_graph 0.08% : 0.000037s : 4: opt.transform.symbol_engine_opt 6.09% : 0.002995s : 1: opt_a 0.21% : 0.000104s : 1: opt_after_cconv 0.95% : 0.000467s : 1: opt_after_jit_grad 0.42% : 0.000205s : 1: opt_b 10.10% : 0.004971s : 1: optimize 0.04% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000008s : 1: order_py_execute_after_rewriter 0.04% : 0.000021s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000006s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.09% : 0.000043s : 1: pre_auto_parallel 0.07% : 0.000035s : 1: py_interpret_to_execute 0.02% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000017s : 1: remove_dup_value 0.72% : 0.000354s : 1: renormalize.infer 0.70% : 0.000346s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000037s : 1: rewriter_after_opt_a 0.23% : 0.000112s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.16% : 0.000079s : 1: symbol_engine_optimizer 0.16% : 0.000078s : 1: tuple_transform 59.96% : 0.029500s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:16.155.507 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:16.155.790 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0401954, [21] [bootstrap]: 0.00043377 [type_inference]: 0.0297215 [event_method]: 4.729e-05 [auto_monad]: 9.519e-05 [graph_reusing]: 7.26999e-06 [inline]: 2.56e-06 [add_attr]: 0.00306149, [1] [add_attr_with_inline]: 0.00305317, [1] [Cycle 1]: 7.747e-05, [2] [tag_attr]: 2.616e-05 [meta_addattr_fg_expand]: 7.68999e-06 [parallel-infer-symbol]: 2.89999e-06 [pre_auto_parallel]: 3.973e-05 [insert-virtual-dataset]: 2.19999e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 2.30002e-06 [pipeline_split]: 1.98002e-06 [optimize]: 0.00566142, [53] [py_interpret_to_execute]: 3.56e-05 [rewriter_before_opt_a]: 0.0001129 [opt_a]: 0.00337766, [2] [Cycle 1]: 0.00254729, [45] [expand_dump_flag]: 3.94002e-06 [switch_simplify]: 0.00011879 [loop_unroll]: 4.161e-05 [a_1]: 0.00082133 [with_stream_mark]: 1.456e-05 [recompute_prepare]: 8.74e-06 [updatestate_depend_eliminate]: 3.8e-06 [updatestate_assign_eliminate]: 2.88e-06 [updatestate_loads_eliminate]: 2.51e-06 [parameter_eliminate]: 1.66e-06 [a_2]: 0.00011937 [accelerated_algorithm]: 7.36999e-06 [shard]: 2.27999e-06 [meta_shard_fg_expand]: 1.87001e-06 [shard_inline]: 6.84001e-06 [merge_send_recv]: 8.50001e-06 [auto_parallel]: 6.33998e-06 [parallel]: 1.827e-05 [flash_sp]: 8.19998e-06 [merge_comm]: 3.91001e-06 [allreduce_fusion]: 3.45e-06 [matmul_add_comm_reduction]: 9.72001e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 7.66999e-06 [virtual_dataset]: 6.75002e-06 [get_grad_eliminate_]: 6.81001e-06 [virtual_output]: 6.53e-06 [merge_forward]: 3.68e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 1.002e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.469e-05 [merge_recompute_call_nodes]: 2.04e-06 [before_grad]: 1.016e-05 [set_forward_comm_id_for_comm_node_pass]: 3.53e-06 [meta_fg_expand]: 3.46999e-06 [flash_sp_send_recv_attached]: 2.51998e-06 [receive_attached]: 2.49999e-06 [after_resolve]: 1.137e-05 [a_after_grad]: 1.003e-05 [renormalize]: 0.00072963 [add_forward_monad_depend]: 5.63002e-06 [auto_monad_grad]: 2.39999e-06 [auto_monad_eliminator]: 1.652e-05 [cse]: 2.667e-05 [a_3]: 6.225e-05 [Cycle 2]: 0.00081724, [45] [expand_dump_flag]: 1.10001e-06 [switch_simplify]: 7.65998e-06 [loop_unroll]: 6.34001e-06 [a_1]: 0.00013516 [with_stream_mark]: 1.126e-05 [recompute_prepare]: 6.80998e-06 [updatestate_depend_eliminate]: 3.13e-06 [updatestate_assign_eliminate]: 2.57001e-06 [updatestate_loads_eliminate]: 2.49999e-06 [parameter_eliminate]: 1.24e-06 [a_2]: 0.00010202 [accelerated_algorithm]: 7.28999e-06 [shard]: 1.15999e-06 [meta_shard_fg_expand]: 1.27e-06 [shard_inline]: 6.53e-06 [merge_send_recv]: 4.85001e-06 [auto_parallel]: 5.40001e-06 [parallel]: 4.79e-06 [flash_sp]: 3.8e-06 [merge_comm]: 3.26999e-06 [allreduce_fusion]: 3.2e-06 [matmul_add_comm_reduction]: 5.96998e-06 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 6.86001e-06 [virtual_dataset]: 6.21e-06 [get_grad_eliminate_]: 5.91003e-06 [virtual_output]: 5.84e-06 [merge_forward]: 2.66e-06 [cell_reuse_recompute_pass]: 1.40001e-06 [offload_activation]: 6.11e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.498e-05 [merge_recompute_call_nodes]: 8.80013e-07 [before_grad]: 9.39e-06 [set_forward_comm_id_for_comm_node_pass]: 3.98999e-06 [meta_fg_expand]: 2.21e-06 [flash_sp_send_recv_attached]: 9.29984e-07 [receive_attached]: 1.24e-06 [after_resolve]: 1.079e-05 [a_after_grad]: 9.86e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.51998e-06 [auto_monad_grad]: 9.79984e-07 [auto_monad_eliminator]: 8.17e-06 [cse]: 1.204e-05 [a_3]: 5.017e-05 [py_interpret_to_execute_after_opt_a]: 1.175e-05 [slice_cell_reuse_recomputed_activation]: 5.35999e-06 [rewriter_after_opt_a]: 3.662e-05 [convert_after_rewriter]: 1.076e-05 [order_py_execute_after_rewriter]: 8.87999e-06 [mutable_eliminate]: 0.00048584 [opt_b]: 0.00026673, [1] [Cycle 1]: 0.00025842, [7] [b_1]: 0.00017047 [b_2]: 7.55998e-06 [updatestate_depend_eliminate]: 4.90999e-06 [updatestate_assign_eliminate]: 2.47001e-06 [updatestate_loads_eliminate]: 2.43e-06 [renormalize]: 4.00003e-07 [cse]: 1.597e-05 [optimize_parallel_all_gather_comm]: 1.927e-05 [overlap_param_gather]: 4.78001e-06 [cconv]: 2.623e-05 [loop_unroll]: 0.00042853 [opt_after_cconv]: 0.00012316, [1] [Cycle 1]: 0.00011474, [7] [c_1]: 3.227e-05 [parameter_eliminate]: 2.64001e-06 [updatestate_depend_eliminate]: 5.36998e-06 [updatestate_assign_eliminate]: 2.49001e-06 [updatestate_loads_eliminate]: 2.36e-06 [cse]: 1.565e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 1.547e-05 [tuple_transform]: 8.92e-05, [1] [Cycle 1]: 8.246e-05, [4] [d_1]: 4.357e-05 [none_parameter_eliminate]: 1.59e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 7.42002e-06 [partial_unused_args_eliminate]: 4.45e-06 [add_recomputation]: 4.859e-05 [cse_after_recomputation]: 2.637e-05, [1] [Cycle 1]: 1.943e-05, [1] [cse]: 1.04e-05 [environ_conv]: 7.48e-06 [swap_dp_allreduce_reducescatter]: 8.08999e-06 [bias_add_comm_swap]: 4.51002e-06 [label_micro_interleaved_index]: 6.49999e-06 [label_fine_grained_interleaved_index]: 5.44e-06 [merge_cast_opt]: 3.90998e-06 [slice_recompute_activation]: 4.46002e-06 [micro_interleaved_order_control]: 1.422e-05 [assign_add_opt]: 3.95998e-06 [ForceFp32Comm]: 3.26999e-06 [remove_cast_before_assign_add]: 3.21999e-06 [full_micro_interleaved_order_control]: 4.67e-06 [reorder_send_recv_between_fp_bp]: 5.29e-06 [comm_op_add_attrs]: 4.13001e-06 [add_comm_op_reuse_tag]: 3.75e-06 [interleave_split_concat_branches]: 4.59002e-06 [interleave_parallel_branches]: 4.24002e-06 [overlap_opt_shard_in_pipeline]: 3.91999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.35999e-06 [control_data_broadcast_order]: 1.714e-05 [grouped_pairwise_exchange_alltoall]: 4.58001e-06 [offloading_packed_experts]: 7.17002e-06 [overlap_recompute_and_grad_model_parallel]: 7.88999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.80998e-06 [overlap_recompute_allgather_and_fa_grad]: 4.11001e-06 [overlap_recompute_comm]: 5.10999e-06 [overlap_grad_ring_attention]: 6.67002e-06 [overlap_grad_flash_sp]: 2.2e-05 [begin_end_overlap_inline]: 3.28998e-06 [split_matmul_comm_elemetwise]: 4.63999e-06 [split_layernorm_comm]: 4.28001e-06 [handle_group_info]: 3.51999e-06 [symbol_engine_optimizer]: 9.982e-05, [1] [Cycle 1]: 9.301e-05, [6] [build]: 2.74001e-06 [elim_shapecalc]: 1.069e-05 [elim_not_effective]: 1.352e-05 [opt_reshape]: 7.65e-06 [fold_const_symbol]: 1.137e-05 [renormalize]: 2.09984e-07 [detach_backward]: 3.31999e-06 [pipeline_parallel_scheduler]: 2.04e-06 [auto_monad_reorder]: 1.899e-05 [get_jit_bprop_graph]: 1.24998e-06 [rewriter_after_jit_bprop_graph]: 3.84997e-06 [opt_after_jit_grad]: 0.00047532 [validate]: 3.666e-05 Sums bootstrap : 0.000434s : 1.23% type_inference : 0.029722s : 83.94% event_method : 0.000047s : 0.13% auto_monad : 0.000095s : 0.27% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000040s : 0.11% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000036s : 0.10% optimize.rewriter_before_opt_a : 0.000113s : 0.32% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000126s : 0.36% optimize.opt_a.loop_unroll : 0.000048s : 0.14% optimize.opt_a.a_1 : 0.000956s : 2.70% optimize.opt_a.with_stream_mark : 0.000026s : 0.07% optimize.opt_a.recompute_prepare : 0.000016s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000221s : 0.63% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.04% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.04% optimize.opt_a.merge_send_recv : 0.000013s : 0.04% optimize.opt_a.auto_parallel : 0.000012s : 0.03% optimize.opt_a.parallel : 0.000023s : 0.07% optimize.opt_a.flash_sp : 0.000012s : 0.03% optimize.opt_a.merge_comm : 0.000007s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.04% optimize.opt_a.virtual_dataset : 0.000013s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.04% optimize.opt_a.virtual_output : 0.000012s : 0.03% optimize.opt_a.merge_forward : 0.000006s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000016s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000020s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000022s : 0.06% optimize.opt_a.a_after_grad : 0.000020s : 0.06% optimize.opt_a.renormalize : 0.000730s : 2.06% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.07% optimize.opt_a.cse : 0.000039s : 0.11% optimize.opt_a.a_3 : 0.000112s : 0.32% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000037s : 0.10% optimize.convert_after_rewriter : 0.000011s : 0.03% optimize.order_py_execute_after_rewriter : 0.000009s : 0.03% optimize.mutable_eliminate : 0.000486s : 1.37% optimize.opt_b.b_1 : 0.000170s : 0.48% optimize.opt_b.b_2 : 0.000008s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000016s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.05% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000026s : 0.07% optimize.loop_unroll : 0.000429s : 1.21% optimize.opt_after_cconv.c_1 : 0.000032s : 0.09% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000016s : 0.04% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.04% optimize.tuple_transform.d_1 : 0.000044s : 0.12% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000004s : 0.01% optimize.add_recomputation : 0.000049s : 0.14% optimize.cse_after_recomputation.cse : 0.000010s : 0.03% optimize.environ_conv : 0.000007s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.02% optimize.bias_add_comm_swap : 0.000005s : 0.01% optimize.label_micro_interleaved_index : 0.000006s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000004s : 0.01% optimize.micro_interleaved_order_control : 0.000014s : 0.04% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000005s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000022s : 0.06% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.05% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000475s : 1.34% validate : 0.000037s : 0.10% Time group info: ------[substitution.] 0.000207 34 0.92% : 0.000002s : 2: substitution.elim_not_effective 0.69% : 0.000001s : 2: substitution.fold_const_symbol 2.55% : 0.000005s : 4: substitution.graph_param_transform 78.20% : 0.000162s : 8: substitution.inline 1.52% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.15% : 0.000004s : 4: substitution.remove_not_recompute_node 1.93% : 0.000004s : 4: substitution.replace_old_param 4.35% : 0.000009s : 2: substitution.switch_simplify 7.68% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.029656 2 95.66% : 0.028368s : 1: type_inference.infer 4.34% : 0.001288s : 1: type_inference.specialize ------[replace.] 0.000115 14 44.52% : 0.000051s : 8: replace.inline 36.60% : 0.000042s : 2: replace.switch_simplify 18.88% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000179 14 87.81% : 0.000157s : 8: match.inline 4.38% : 0.000008s : 2: match.switch_simplify 7.80% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000239 1520 0.97% : 0.000002s : 17: predicate.accumulaten_eliminater 0.59% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.44% : 0.000001s : 8: predicate.addn_check_dump 0.97% : 0.000002s : 17: predicate.addn_zero_filter 0.90% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.09% : 0.000005s : 25: predicate.arithmetic_simplify 1.12% : 0.000003s : 17: predicate.cast_eliminate 0.56% : 0.000001s : 8: predicate.check_bprop_eliminate 0.43% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.49% : 0.000001s : 8: predicate.depend_value_elim 1.08% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.17% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.98% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.81% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.15% : 0.000000s : 4: predicate.elim_not_effective 0.47% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.17% : 0.000003s : 21: predicate.environ_get_depend_swap 1.60% : 0.000004s : 29: predicate.environ_get_eliminate 1.16% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.75% : 0.000004s : 29: predicate.exchange_switch_depend_value 2.89% : 0.000007s : 29: predicate.float_depend_g_call 0.44% : 0.000001s : 8: predicate.float_environ_get_switch 0.65% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.58% : 0.000001s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.51% : 0.000001s : 8: predicate.incorporate_call 0.44% : 0.000001s : 8: predicate.incorporate_call_switch 6.49% : 0.000015s : 70: predicate.inline 0.72% : 0.000002s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.87% : 0.000002s : 8: predicate.less_batch_normalization 1.76% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.65% : 0.000006s : 46: predicate.load_eliminater 0.89% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.89% : 0.000007s : 47: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.48% : 0.000001s : 8: predicate.merge_addn 0.48% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.48% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 17: predicate.minmaximum_grad 0.74% : 0.000002s : 4: predicate.mutable_eliminate 0.27% : 0.000001s : 4: predicate.opt_reshape 0.46% : 0.000001s : 4: predicate.parallel_virtual_node 2.14% : 0.000005s : 29: predicate.partial_defer_inline 1.65% : 0.000004s : 25: predicate.partial_eliminate 1.01% : 0.000002s : 17: predicate.print_const_string_wrapper 0.50% : 0.000001s : 8: predicate.reduce_all_const_elim 1.26% : 0.000003s : 17: predicate.reduce_eliminate 2.60% : 0.000006s : 46: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000001s : 8: predicate.remove_not_recompute_node 1.38% : 0.000003s : 29: predicate.replace_applicator 0.46% : 0.000001s : 8: predicate.replace_old_param 0.23% : 0.000001s : 4: predicate.reset_defer_inline 1.04% : 0.000002s : 17: predicate.reshape_eliminate 0.52% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.31% : 0.000001s : 4: predicate.row_tensor_eliminate 0.76% : 0.000002s : 8: predicate.same_eliminate 0.38% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.62% : 0.000001s : 8: predicate.shard_identity_eliminate 0.57% : 0.000001s : 8: predicate.special_op_eliminate 0.66% : 0.000002s : 8: predicate.specialize_transform 0.65% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.62% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.27% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.85% : 0.000004s : 29: predicate.switch_defer_inline 2.28% : 0.000005s : 37: predicate.switch_layer_defer_inline 6.01% : 0.000014s : 92: predicate.switch_simplify 0.98% : 0.000002s : 17: predicate.tile_eliminate 0.98% : 0.000002s : 17: predicate.transpose_eliminate 1.62% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.02% : 0.000007s : 37: predicate.tuple_list_get_item_eliminator 1.47% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.07% : 0.000005s : 33: predicate.tuple_list_set_item_eliminator 1.70% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.60% : 0.000006s : 46: predicate.updatestate_pure_node_eliminater 3.08% : 0.000007s : 54: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 4: predicate.value_based_eliminate 0.55% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.49% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.32% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000930 16 61.11% : 0.000568s : 6: func_graph_cloner_run.FuncGraphClonerGraph 38.89% : 0.000362s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.051266 192 0.01% : 0.000006s : 1: ForceFp32Comm 5.99% : 0.003070s : 1: add_attr 5.96% : 0.003057s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.10% : 0.000052s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.20% : 0.000105s : 1: auto_monad 0.05% : 0.000028s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.93% : 0.000476s : 1: bootstrap 0.06% : 0.000029s : 1: cconv 0.02% : 0.000008s : 1: comm_op_add_attrs 0.04% : 0.000021s : 1: control_data_broadcast_order 0.03% : 0.000014s : 1: convert_after_rewriter 0.06% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000020s : 1: detach_backward 0.02% : 0.000010s : 1: environ_conv 0.12% : 0.000060s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000007s : 1: get_jit_bprop_graph 0.03% : 0.000014s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.02% : 0.000009s : 1: inline 0.02% : 0.000008s : 1: insert-virtual-dataset 0.01% : 0.000007s : 1: interleave_parallel_branches 0.02% : 0.000008s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000009s : 1: label_micro_interleaved_index 0.85% : 0.000434s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.04% : 0.000020s : 1: micro_interleaved_order_control 0.96% : 0.000492s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.03% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000014s : 1: opt.transform.mutable_eliminate 2.89% : 0.001481s : 78: opt.transform.opt_a 0.06% : 0.000031s : 1: opt.transform.opt_after_cconv 0.05% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.21% : 0.000106s : 28: opt.transform.opt_b 0.10% : 0.000049s : 2: opt.transform.opt_trans_graph 0.08% : 0.000039s : 4: opt.transform.symbol_engine_opt 6.60% : 0.003381s : 1: opt_a 0.25% : 0.000127s : 1: opt_after_cconv 0.95% : 0.000485s : 1: opt_after_jit_grad 0.53% : 0.000270s : 1: opt_b 11.68% : 0.005986s : 1: optimize 0.04% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.05% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000009s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.09% : 0.000047s : 1: pre_auto_parallel 0.08% : 0.000039s : 1: py_interpret_to_execute 0.03% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.04% : 0.000019s : 1: remove_dup_value 0.70% : 0.000359s : 1: renormalize.infer 0.71% : 0.000362s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000040s : 1: rewriter_after_opt_a 0.23% : 0.000116s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.20% : 0.000103s : 1: symbol_engine_optimizer 0.18% : 0.000092s : 1: tuple_transform 58.05% : 0.029761s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:16.574.441 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0429458, [21] [bootstrap]: 0.00043066 [type_inference]: 0.0331148 [event_method]: 4.917e-05 [auto_monad]: 9.782e-05 [graph_reusing]: 7.51999e-06 [inline]: 2.32999e-06 [add_attr]: 0.00318786, [1] [add_attr_with_inline]: 0.00317915, [1] [Cycle 1]: 7.23e-05, [2] [tag_attr]: 2.748e-05 [meta_addattr_fg_expand]: 7.89002e-06 [parallel-infer-symbol]: 3.04999e-06 [pre_auto_parallel]: 4.178e-05 [insert-virtual-dataset]: 2.99999e-06 [parallel-infer-symbol-second]: 8.40024e-07 [dataset_repeat_opt]: 1.84998e-06 [pipeline_split]: 2.48e-06 [optimize]: 0.00528368, [53] [py_interpret_to_execute]: 3.054e-05 [rewriter_before_opt_a]: 0.00011082 [opt_a]: 0.00318069, [2] [Cycle 1]: 0.00250564, [45] [expand_dump_flag]: 4.16001e-06 [switch_simplify]: 0.00012545 [loop_unroll]: 4.108e-05 [a_1]: 0.00084251 [with_stream_mark]: 1.73e-05 [recompute_prepare]: 1.178e-05 [updatestate_depend_eliminate]: 4.08001e-06 [updatestate_assign_eliminate]: 3.3e-06 [updatestate_loads_eliminate]: 2.86e-06 [parameter_eliminate]: 2.34999e-06 [a_2]: 8.436e-05 [accelerated_algorithm]: 6.98998e-06 [shard]: 2.17999e-06 [meta_shard_fg_expand]: 1.82999e-06 [shard_inline]: 6.78003e-06 [merge_send_recv]: 8.67e-06 [auto_parallel]: 6.71e-06 [parallel]: 1.933e-05 [flash_sp]: 7.82e-06 [merge_comm]: 4.27998e-06 [allreduce_fusion]: 3.51001e-06 [matmul_add_comm_reduction]: 9.89999e-06 [allreduce_slice_to_reducescatter]: 7.59988e-07 [virtual_shard_identity]: 8.75999e-06 [virtual_dataset]: 6.65002e-06 [get_grad_eliminate_]: 6.39999e-06 [virtual_output]: 6.62002e-06 [merge_forward]: 3.62002e-06 [cell_reuse_recompute_pass]: 1.17999e-06 [offload_activation]: 9.47001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.427e-05 [merge_recompute_call_nodes]: 1.56002e-06 [before_grad]: 1.063e-05 [set_forward_comm_id_for_comm_node_pass]: 3.48999e-06 [meta_fg_expand]: 3.18e-06 [flash_sp_send_recv_attached]: 2.56998e-06 [receive_attached]: 2.46998e-06 [after_resolve]: 1.198e-05 [a_after_grad]: 1.027e-05 [renormalize]: 0.00081956 [add_forward_monad_depend]: 5.85002e-06 [auto_monad_grad]: 2.53998e-06 [auto_monad_eliminator]: 1.636e-05 [cse]: 3.031e-05 [a_3]: 5.094e-05 [Cycle 2]: 0.00066407, [45] [expand_dump_flag]: 1.47001e-06 [switch_simplify]: 8.95001e-06 [loop_unroll]: 6.59001e-06 [a_1]: 0.00013702 [with_stream_mark]: 1.229e-05 [recompute_prepare]: 6.16e-06 [updatestate_depend_eliminate]: 3.09999e-06 [updatestate_assign_eliminate]: 2.12001e-06 [updatestate_loads_eliminate]: 2.68e-06 [parameter_eliminate]: 1.03001e-06 [a_2]: 7.398e-05 [accelerated_algorithm]: 6.76999e-06 [shard]: 1.79e-06 [meta_shard_fg_expand]: 1.40001e-06 [shard_inline]: 6.21e-06 [merge_send_recv]: 5.49e-06 [auto_parallel]: 5.80002e-06 [parallel]: 5.77999e-06 [flash_sp]: 3.86001e-06 [merge_comm]: 3.39001e-06 [allreduce_fusion]: 3.08e-06 [matmul_add_comm_reduction]: 6.51e-06 [allreduce_slice_to_reducescatter]: 5.59987e-07 [virtual_shard_identity]: 7.46999e-06 [virtual_dataset]: 5.90002e-06 [get_grad_eliminate_]: 6.17999e-06 [virtual_output]: 5.47999e-06 [merge_forward]: 3.11999e-06 [cell_reuse_recompute_pass]: 1.50001e-06 [offload_activation]: 6.73998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.346e-05 [merge_recompute_call_nodes]: 9.79984e-07 [before_grad]: 9.25001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.88001e-06 [meta_fg_expand]: 2.34999e-06 [flash_sp_send_recv_attached]: 8.29983e-07 [receive_attached]: 1.40999e-06 [after_resolve]: 1.07e-05 [a_after_grad]: 9.93998e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.19998e-06 [auto_monad_grad]: 1.00999e-06 [auto_monad_eliminator]: 7.98999e-06 [cse]: 1.366e-05 [a_3]: 3.789e-05 [py_interpret_to_execute_after_opt_a]: 9.41e-06 [slice_cell_reuse_recomputed_activation]: 1.85001e-06 [rewriter_after_opt_a]: 3.611e-05 [convert_after_rewriter]: 7.08e-06 [order_py_execute_after_rewriter]: 5.34e-06 [mutable_eliminate]: 0.0005211 [opt_b]: 0.00021285, [1] [Cycle 1]: 0.00020634, [7] [b_1]: 0.0001291 [b_2]: 8.33999e-06 [updatestate_depend_eliminate]: 6.69001e-06 [updatestate_assign_eliminate]: 2.43e-06 [updatestate_loads_eliminate]: 2.29999e-06 [renormalize]: 5.59987e-07 [cse]: 2.116e-05 [optimize_parallel_all_gather_comm]: 1.728e-05 [overlap_param_gather]: 1.95001e-06 [cconv]: 2.593e-05 [loop_unroll]: 0.00043766 [opt_after_cconv]: 0.00010405, [1] [Cycle 1]: 9.86e-05, [7] [c_1]: 3.277e-05 [parameter_eliminate]: 2.96001e-06 [updatestate_depend_eliminate]: 5.68002e-06 [updatestate_assign_eliminate]: 2.32999e-06 [updatestate_loads_eliminate]: 2.29001e-06 [cse]: 1.804e-05 [renormalize]: 5.79981e-07 [remove_dup_value]: 1.27e-05 [tuple_transform]: 7.75e-05, [1] [Cycle 1]: 7.325e-05, [4] [d_1]: 4.533e-05 [none_parameter_eliminate]: 1.71e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 6.81001e-06 [partial_unused_args_eliminate]: 1.89999e-06 [add_recomputation]: 4.945e-05 [cse_after_recomputation]: 2.184e-05, [1] [Cycle 1]: 1.667e-05, [1] [cse]: 1.087e-05 [environ_conv]: 5.19e-06 [swap_dp_allreduce_reducescatter]: 5.04998e-06 [bias_add_comm_swap]: 2.79999e-06 [label_micro_interleaved_index]: 4.35e-06 [label_fine_grained_interleaved_index]: 2.65002e-06 [merge_cast_opt]: 1.34e-06 [slice_recompute_activation]: 2.08998e-06 [micro_interleaved_order_control]: 2.27001e-06 [assign_add_opt]: 1.12e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 9.79984e-07 [full_micro_interleaved_order_control]: 2.02999e-06 [reorder_send_recv_between_fp_bp]: 2.78e-06 [comm_op_add_attrs]: 9.60019e-07 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.23002e-06 [interleave_parallel_branches]: 1.22e-06 [overlap_opt_shard_in_pipeline]: 1.29e-06 [overlap_opt_shard_grad_in_pipeline]: 1.72999e-06 [control_data_broadcast_order]: 1.278e-05 [grouped_pairwise_exchange_alltoall]: 1.58002e-06 [offloading_packed_experts]: 1.8e-05 [overlap_recompute_and_grad_model_parallel]: 5.09e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32999e-06 [overlap_recompute_comm]: 2.71999e-06 [overlap_grad_ring_attention]: 3.78999e-06 [overlap_grad_flash_sp]: 1.999e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.11e-06 [split_layernorm_comm]: 1.78002e-06 [handle_group_info]: 8.99978e-07 [symbol_engine_optimizer]: 7.966e-05, [1] [Cycle 1]: 7.538e-05, [6] [build]: 3.36001e-06 [elim_shapecalc]: 1.024e-05 [elim_not_effective]: 1.318e-05 [opt_reshape]: 7.48999e-06 [fold_const_symbol]: 1.056e-05 [renormalize]: 2.10013e-07 [detach_backward]: 1.85001e-06 [pipeline_parallel_scheduler]: 1.54e-06 [auto_monad_reorder]: 1.706e-05 [get_jit_bprop_graph]: 1.86e-06 [rewriter_after_jit_bprop_graph]: 4.63999e-06 [opt_after_jit_grad]: 0.00048875 [validate]: 3.829e-05 Sums bootstrap : 0.000431s : 1.11% type_inference : 0.033115s : 85.40% event_method : 0.000049s : 0.13% auto_monad : 0.000098s : 0.25% graph_reusing : 0.000008s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000042s : 0.11% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.08% optimize.rewriter_before_opt_a : 0.000111s : 0.29% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000134s : 0.35% optimize.opt_a.loop_unroll : 0.000048s : 0.12% optimize.opt_a.a_1 : 0.000980s : 2.53% optimize.opt_a.with_stream_mark : 0.000030s : 0.08% optimize.opt_a.recompute_prepare : 0.000018s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000158s : 0.41% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.04% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.03% optimize.opt_a.merge_send_recv : 0.000014s : 0.04% optimize.opt_a.auto_parallel : 0.000013s : 0.03% optimize.opt_a.parallel : 0.000025s : 0.06% optimize.opt_a.flash_sp : 0.000012s : 0.03% optimize.opt_a.merge_comm : 0.000008s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.04% optimize.opt_a.virtual_dataset : 0.000013s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.03% optimize.opt_a.virtual_output : 0.000012s : 0.03% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000016s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000020s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000023s : 0.06% optimize.opt_a.a_after_grad : 0.000020s : 0.05% optimize.opt_a.renormalize : 0.000820s : 2.11% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.06% optimize.opt_a.cse : 0.000044s : 0.11% optimize.opt_a.a_3 : 0.000089s : 0.23% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000036s : 0.09% optimize.convert_after_rewriter : 0.000007s : 0.02% optimize.order_py_execute_after_rewriter : 0.000005s : 0.01% optimize.mutable_eliminate : 0.000521s : 1.34% optimize.opt_b.b_1 : 0.000129s : 0.33% optimize.opt_b.b_2 : 0.000008s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000026s : 0.07% optimize.loop_unroll : 0.000438s : 1.13% optimize.opt_after_cconv.c_1 : 0.000033s : 0.08% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000018s : 0.05% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000013s : 0.03% optimize.tuple_transform.d_1 : 0.000045s : 0.12% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000049s : 0.13% optimize.cse_after_recomputation.cse : 0.000011s : 0.03% optimize.environ_conv : 0.000005s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000018s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000020s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000017s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000489s : 1.26% validate : 0.000038s : 0.10% Time group info: ------[substitution.] 0.000220 34 0.84% : 0.000002s : 2: substitution.elim_not_effective 0.63% : 0.000001s : 2: substitution.fold_const_symbol 2.61% : 0.000006s : 4: substitution.graph_param_transform 78.45% : 0.000172s : 8: substitution.inline 1.68% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.14% : 0.000005s : 4: substitution.remove_not_recompute_node 2.11% : 0.000005s : 4: substitution.replace_old_param 4.49% : 0.000010s : 2: substitution.switch_simplify 7.05% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.033042 2 95.95% : 0.031705s : 1: type_inference.infer 4.05% : 0.001337s : 1: type_inference.specialize ------[replace.] 0.000120 14 43.29% : 0.000052s : 8: replace.inline 37.87% : 0.000046s : 2: replace.switch_simplify 18.84% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000190 14 88.44% : 0.000168s : 8: match.inline 4.46% : 0.000008s : 2: match.switch_simplify 7.10% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000241 1520 0.96% : 0.000002s : 17: predicate.accumulaten_eliminater 0.66% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.43% : 0.000001s : 8: predicate.addn_check_dump 0.96% : 0.000002s : 17: predicate.addn_zero_filter 0.92% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.29% : 0.000006s : 25: predicate.arithmetic_simplify 1.04% : 0.000003s : 17: predicate.cast_eliminate 0.48% : 0.000001s : 8: predicate.check_bprop_eliminate 0.41% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.48% : 0.000001s : 8: predicate.depend_value_elim 1.03% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.20% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.95% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.88% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.31% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 21: predicate.environ_get_depend_swap 1.57% : 0.000004s : 29: predicate.environ_get_eliminate 1.18% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.68% : 0.000004s : 29: predicate.exchange_switch_depend_value 2.89% : 0.000007s : 29: predicate.float_depend_g_call 0.40% : 0.000001s : 8: predicate.float_environ_get_switch 0.64% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.50% : 0.000001s : 8: predicate.get_grad_eliminate 0.22% : 0.000001s : 4: predicate.graph_param_transform 0.52% : 0.000001s : 8: predicate.incorporate_call 0.40% : 0.000001s : 8: predicate.incorporate_call_switch 6.53% : 0.000016s : 70: predicate.inline 0.77% : 0.000002s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.62% : 0.000002s : 8: predicate.less_batch_normalization 1.79% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.72% : 0.000007s : 46: predicate.load_eliminater 0.84% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.80% : 0.000007s : 47: predicate.loop_unroll_before_grad 1.77% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.46% : 0.000001s : 8: predicate.merge_addn 0.46% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.48% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.91% : 0.000002s : 17: predicate.minmaximum_grad 0.98% : 0.000002s : 4: predicate.mutable_eliminate 0.31% : 0.000001s : 4: predicate.opt_reshape 0.27% : 0.000001s : 4: predicate.parallel_virtual_node 2.14% : 0.000005s : 29: predicate.partial_defer_inline 1.65% : 0.000004s : 25: predicate.partial_eliminate 0.94% : 0.000002s : 17: predicate.print_const_string_wrapper 0.46% : 0.000001s : 8: predicate.reduce_all_const_elim 1.41% : 0.000003s : 17: predicate.reduce_eliminate 2.49% : 0.000006s : 46: predicate.redundant_stop_gradient_eliminater 0.32% : 0.000001s : 8: predicate.remove_not_recompute_node 1.36% : 0.000003s : 29: predicate.replace_applicator 0.45% : 0.000001s : 8: predicate.replace_old_param 0.23% : 0.000001s : 4: predicate.reset_defer_inline 1.02% : 0.000002s : 17: predicate.reshape_eliminate 0.53% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 4: predicate.row_tensor_eliminate 0.61% : 0.000001s : 8: predicate.same_eliminate 0.49% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.67% : 0.000002s : 8: predicate.shard_identity_eliminate 0.56% : 0.000001s : 8: predicate.special_op_eliminate 0.67% : 0.000002s : 8: predicate.specialize_transform 0.68% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.60% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.27% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.82% : 0.000004s : 29: predicate.switch_defer_inline 2.26% : 0.000005s : 37: predicate.switch_layer_defer_inline 6.08% : 0.000015s : 92: predicate.switch_simplify 0.96% : 0.000002s : 17: predicate.tile_eliminate 1.05% : 0.000003s : 17: predicate.transpose_eliminate 1.55% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000008s : 37: predicate.tuple_list_get_item_eliminator 1.57% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.20% : 0.000005s : 33: predicate.tuple_list_set_item_eliminator 1.65% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.64% : 0.000006s : 46: predicate.updatestate_pure_node_eliminater 3.22% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.29% : 0.000001s : 4: predicate.value_based_eliminate 0.55% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.57% : 0.000001s : 8: predicate.virtual_output_eliminate 0.21% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001071 16 60.67% : 0.000650s : 6: func_graph_cloner_run.FuncGraphClonerGraph 39.33% : 0.000421s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.053888 192 0.01% : 0.000003s : 1: ForceFp32Comm 5.92% : 0.003193s : 1: add_attr 5.91% : 0.003183s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.10% : 0.000054s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.19% : 0.000104s : 1: auto_monad 0.04% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.85% : 0.000460s : 1: bootstrap 0.05% : 0.000029s : 1: cconv 0.01% : 0.000005s : 1: comm_op_add_attrs 0.03% : 0.000016s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.05% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.11% : 0.000058s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.83% : 0.000447s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.99% : 0.000531s : 1: mutable_eliminate 0.04% : 0.000022s : 1: offloading_packed_experts 0.03% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000017s : 1: opt.transform.mutable_eliminate 2.79% : 0.001503s : 78: opt.transform.opt_a 0.06% : 0.000032s : 1: opt.transform.opt_after_cconv 0.05% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.19% : 0.000105s : 28: opt.transform.opt_b 0.09% : 0.000050s : 2: opt.transform.opt_trans_graph 0.07% : 0.000037s : 4: opt.transform.symbol_engine_opt 5.91% : 0.003184s : 1: opt_a 0.20% : 0.000107s : 1: opt_after_cconv 0.93% : 0.000499s : 1: opt_after_jit_grad 0.40% : 0.000216s : 1: opt_b 9.81% : 0.005289s : 1: optimize 0.04% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000024s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.09% : 0.000046s : 1: pre_auto_parallel 0.07% : 0.000035s : 1: py_interpret_to_execute 0.03% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000016s : 1: remove_dup_value 0.78% : 0.000419s : 1: renormalize.infer 0.73% : 0.000391s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000041s : 1: rewriter_after_opt_a 0.21% : 0.000115s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000082s : 1: symbol_engine_optimizer 0.15% : 0.000080s : 1: tuple_transform 61.49% : 0.033137s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:17.513.59 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:17.516.25 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0408979, [21] [bootstrap]: 0.00042633 [type_inference]: 0.02926 [event_method]: 9.036e-05 [auto_monad]: 0.00010046 [graph_reusing]: 8.03999e-06 [inline]: 2.47001e-06 [add_attr]: 0.00321119, [1] [add_attr_with_inline]: 0.00320295, [1] [Cycle 1]: 8.184e-05, [2] [tag_attr]: 2.771e-05 [meta_addattr_fg_expand]: 8.25e-06 [parallel-infer-symbol]: 2.99001e-06 [pre_auto_parallel]: 4.416e-05 [insert-virtual-dataset]: 2.27001e-06 [parallel-infer-symbol-second]: 9.99979e-07 [dataset_repeat_opt]: 1.96003e-06 [pipeline_split]: 1.69e-06 [optimize]: 0.0065069, [53] [py_interpret_to_execute]: 4.002e-05 [rewriter_before_opt_a]: 0.00012995 [opt_a]: 0.00399206, [2] [Cycle 1]: 0.00299415, [45] [expand_dump_flag]: 4.32e-06 [switch_simplify]: 0.00012575 [loop_unroll]: 4.758e-05 [a_1]: 0.00092525 [with_stream_mark]: 1.794e-05 [recompute_prepare]: 1.221e-05 [updatestate_depend_eliminate]: 4.38001e-06 [updatestate_assign_eliminate]: 4.52e-06 [updatestate_loads_eliminate]: 3.76999e-06 [parameter_eliminate]: 2.24001e-06 [a_2]: 0.00014042 [accelerated_algorithm]: 9.19e-06 [shard]: 2.02001e-06 [meta_shard_fg_expand]: 2.26e-06 [shard_inline]: 8.25e-06 [merge_send_recv]: 9.76e-06 [auto_parallel]: 7.85998e-06 [parallel]: 1.983e-05 [flash_sp]: 8.42e-06 [merge_comm]: 4.92e-06 [allreduce_fusion]: 4.32e-06 [matmul_add_comm_reduction]: 1.019e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 1.051e-05 [virtual_dataset]: 8.00999e-06 [get_grad_eliminate_]: 7.98999e-06 [virtual_output]: 8.45001e-06 [merge_forward]: 4.37e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 1.097e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.019e-05 [merge_recompute_call_nodes]: 1.46998e-06 [before_grad]: 1.508e-05 [set_forward_comm_id_for_comm_node_pass]: 5.08002e-06 [meta_fg_expand]: 4.02998e-06 [flash_sp_send_recv_attached]: 2.96001e-06 [receive_attached]: 2.44999e-06 [after_resolve]: 1.428e-05 [a_after_grad]: 1.299e-05 [renormalize]: 0.0009211 [add_forward_monad_depend]: 6.17999e-06 [auto_monad_grad]: 2.34001e-06 [auto_monad_eliminator]: 1.827e-05 [cse]: 4.639e-05 [a_3]: 8.154e-05 [Cycle 2]: 0.00098429, [45] [expand_dump_flag]: 1.26002e-06 [switch_simplify]: 1.075e-05 [loop_unroll]: 7.92998e-06 [a_1]: 0.00018468 [with_stream_mark]: 1.361e-05 [recompute_prepare]: 8.71002e-06 [updatestate_depend_eliminate]: 4.69002e-06 [updatestate_assign_eliminate]: 3.15998e-06 [updatestate_loads_eliminate]: 2.98e-06 [parameter_eliminate]: 1.34e-06 [a_2]: 0.00012396 [accelerated_algorithm]: 8.18001e-06 [shard]: 2.14999e-06 [meta_shard_fg_expand]: 2.19999e-06 [shard_inline]: 8.99998e-06 [merge_send_recv]: 7.03e-06 [auto_parallel]: 6.88e-06 [parallel]: 6.04001e-06 [flash_sp]: 3.55e-06 [merge_comm]: 4.60001e-06 [allreduce_fusion]: 4.57e-06 [matmul_add_comm_reduction]: 8e-06 [allreduce_slice_to_reducescatter]: 5.69999e-07 [virtual_shard_identity]: 8.85001e-06 [virtual_dataset]: 7.35e-06 [get_grad_eliminate_]: 7.92998e-06 [virtual_output]: 6.92002e-06 [merge_forward]: 4.28001e-06 [cell_reuse_recompute_pass]: 1.52001e-06 [offload_activation]: 8.38999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.848e-05 [merge_recompute_call_nodes]: 9.49978e-07 [before_grad]: 1.28e-05 [set_forward_comm_id_for_comm_node_pass]: 4.96002e-06 [meta_fg_expand]: 3.17002e-06 [flash_sp_send_recv_attached]: 9.89996e-07 [receive_attached]: 1.06002e-06 [after_resolve]: 1.341e-05 [a_after_grad]: 1.161e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.67001e-06 [auto_monad_grad]: 1.27e-06 [auto_monad_eliminator]: 1.128e-05 [cse]: 2.274e-05 [a_3]: 6.162e-05 [py_interpret_to_execute_after_opt_a]: 1.516e-05 [slice_cell_reuse_recomputed_activation]: 5.51e-06 [rewriter_after_opt_a]: 4.762e-05 [convert_after_rewriter]: 1.129e-05 [order_py_execute_after_rewriter]: 8.72e-06 [mutable_eliminate]: 0.00053758 [opt_b]: 0.00032004, [1] [Cycle 1]: 0.00031065, [7] [b_1]: 0.00020435 [b_2]: 1.003e-05 [updatestate_depend_eliminate]: 6.96001e-06 [updatestate_assign_eliminate]: 3.15002e-06 [updatestate_loads_eliminate]: 4.05e-06 [renormalize]: 5.40022e-07 [cse]: 2.422e-05 [optimize_parallel_all_gather_comm]: 2.364e-05 [overlap_param_gather]: 5.22999e-06 [cconv]: 3.065e-05 [loop_unroll]: 0.00045443 [opt_after_cconv]: 0.00014389, [1] [Cycle 1]: 0.00013455, [7] [c_1]: 3.935e-05 [parameter_eliminate]: 2.99001e-06 [updatestate_depend_eliminate]: 6.12999e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 2.99999e-06 [cse]: 2.237e-05 [renormalize]: 4.89992e-07 [remove_dup_value]: 1.795e-05 [tuple_transform]: 0.00010159, [1] [Cycle 1]: 9.464e-05, [4] [d_1]: 5.366e-05 [none_parameter_eliminate]: 1.60001e-06 [renormalize]: 2.80008e-07 [switch_simplify]: 8.60999e-06 [partial_unused_args_eliminate]: 4.79e-06 [add_recomputation]: 5.713e-05 [cse_after_recomputation]: 3.199e-05, [1] [Cycle 1]: 2.486e-05, [1] [cse]: 1.564e-05 [environ_conv]: 9.84999e-06 [swap_dp_allreduce_reducescatter]: 9.07001e-06 [bias_add_comm_swap]: 4.85999e-06 [label_micro_interleaved_index]: 6.74999e-06 [label_fine_grained_interleaved_index]: 5.27001e-06 [merge_cast_opt]: 3.56001e-06 [slice_recompute_activation]: 4.23001e-06 [micro_interleaved_order_control]: 4.38001e-06 [assign_add_opt]: 3.43e-06 [ForceFp32Comm]: 3.25e-06 [remove_cast_before_assign_add]: 3.55e-06 [full_micro_interleaved_order_control]: 4.63999e-06 [reorder_send_recv_between_fp_bp]: 5.13002e-06 [comm_op_add_attrs]: 3.2e-06 [add_comm_op_reuse_tag]: 3.19001e-06 [interleave_split_concat_branches]: 3.54002e-06 [interleave_parallel_branches]: 3.31999e-06 [overlap_opt_shard_in_pipeline]: 3.45998e-06 [overlap_opt_shard_grad_in_pipeline]: 2.096e-05 [control_data_broadcast_order]: 1.952e-05 [grouped_pairwise_exchange_alltoall]: 4.1e-06 [offloading_packed_experts]: 7.63001e-06 [overlap_recompute_and_grad_model_parallel]: 7.89002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.57002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.66999e-06 [overlap_recompute_comm]: 5.37001e-06 [overlap_grad_ring_attention]: 6.91001e-06 [overlap_grad_flash_sp]: 2.462e-05 [begin_end_overlap_inline]: 2.91999e-06 [split_matmul_comm_elemetwise]: 4.57e-06 [split_layernorm_comm]: 4.01001e-06 [handle_group_info]: 3.53e-06 [symbol_engine_optimizer]: 0.00010822, [1] [Cycle 1]: 0.00010077, [6] [build]: 3.45e-06 [elim_shapecalc]: 1.218e-05 [elim_not_effective]: 1.651e-05 [opt_reshape]: 8.79e-06 [fold_const_symbol]: 1.304e-05 [renormalize]: 1.90019e-07 [detach_backward]: 3.71001e-06 [pipeline_parallel_scheduler]: 2.09999e-06 [auto_monad_reorder]: 2.39e-05 [get_jit_bprop_graph]: 1.42999e-06 [rewriter_after_jit_bprop_graph]: 5.04e-06 [opt_after_jit_grad]: 0.00053045 [validate]: 4.332e-05 Sums bootstrap : 0.000426s : 1.19% type_inference : 0.029260s : 81.58% event_method : 0.000090s : 0.25% auto_monad : 0.000100s : 0.28% graph_reusing : 0.000008s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000044s : 0.12% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000040s : 0.11% optimize.rewriter_before_opt_a : 0.000130s : 0.36% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000136s : 0.38% optimize.opt_a.loop_unroll : 0.000056s : 0.15% optimize.opt_a.a_1 : 0.001110s : 3.09% optimize.opt_a.with_stream_mark : 0.000032s : 0.09% optimize.opt_a.recompute_prepare : 0.000021s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000264s : 0.74% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.05% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000017s : 0.05% optimize.opt_a.merge_send_recv : 0.000017s : 0.05% optimize.opt_a.auto_parallel : 0.000015s : 0.04% optimize.opt_a.parallel : 0.000026s : 0.07% optimize.opt_a.flash_sp : 0.000012s : 0.03% optimize.opt_a.merge_comm : 0.000010s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.05% optimize.opt_a.virtual_dataset : 0.000015s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.04% optimize.opt_a.virtual_output : 0.000015s : 0.04% optimize.opt_a.merge_forward : 0.000009s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000028s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000028s : 0.08% optimize.opt_a.a_after_grad : 0.000025s : 0.07% optimize.opt_a.renormalize : 0.000921s : 2.57% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.08% optimize.opt_a.cse : 0.000069s : 0.19% optimize.opt_a.a_3 : 0.000143s : 0.40% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.02% optimize.rewriter_after_opt_a : 0.000048s : 0.13% optimize.convert_after_rewriter : 0.000011s : 0.03% optimize.order_py_execute_after_rewriter : 0.000009s : 0.02% optimize.mutable_eliminate : 0.000538s : 1.50% optimize.opt_b.b_1 : 0.000204s : 0.57% optimize.opt_b.b_2 : 0.000010s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.07% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000031s : 0.09% optimize.loop_unroll : 0.000454s : 1.27% optimize.opt_after_cconv.c_1 : 0.000039s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000022s : 0.06% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.05% optimize.tuple_transform.d_1 : 0.000054s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.02% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000057s : 0.16% optimize.cse_after_recomputation.cse : 0.000016s : 0.04% optimize.environ_conv : 0.000010s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000005s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000004s : 0.01% optimize.micro_interleaved_order_control : 0.000004s : 0.01% optimize.assign_add_opt : 0.000003s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.01% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000021s : 0.06% optimize.control_data_broadcast_order : 0.000020s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000025s : 0.07% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.07% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000530s : 1.48% validate : 0.000043s : 0.12% Time group info: ------[substitution.] 0.000254 44 9.70% : 0.000025s : 3: substitution.cast_eliminate 1.00% : 0.000003s : 3: substitution.elim_not_effective 0.70% : 0.000002s : 3: substitution.fold_const_symbol 2.51% : 0.000006s : 5: substitution.graph_param_transform 70.44% : 0.000179s : 8: substitution.inline 1.93% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.60% : 0.000007s : 6: substitution.remove_not_recompute_node 2.01% : 0.000005s : 4: substitution.replace_old_param 3.79% : 0.000010s : 2: substitution.switch_simplify 5.31% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.029196 2 95.26% : 0.027811s : 1: type_inference.infer 4.74% : 0.001385s : 1: type_inference.specialize ------[replace.] 0.000120 14 43.15% : 0.000052s : 8: replace.inline 36.95% : 0.000044s : 2: replace.switch_simplify 19.90% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000194 14 89.83% : 0.000175s : 8: match.inline 4.22% : 0.000008s : 2: match.switch_simplify 5.95% : 0.000012s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000285 1746 1.00% : 0.000003s : 19: predicate.accumulaten_eliminater 0.62% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 10: predicate.addn_check_dump 0.91% : 0.000003s : 19: predicate.addn_zero_filter 1.07% : 0.000003s : 19: predicate.adjust_all_reduce_mul_add 2.20% : 0.000006s : 29: predicate.arithmetic_simplify 1.19% : 0.000003s : 19: predicate.cast_eliminate 0.57% : 0.000002s : 10: predicate.check_bprop_eliminate 0.53% : 0.000002s : 10: predicate.compare_switch_simplify 0.18% : 0.000001s : 5: predicate.const_output_eliminate 0.55% : 0.000002s : 10: predicate.depend_value_elim 1.00% : 0.000003s : 19: predicate.dict_get_item_const_eliminator 1.05% : 0.000003s : 19: predicate.dict_get_item_eliminator 0.91% : 0.000003s : 19: predicate.dict_set_item_eliminator 0.79% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 5: predicate.elim_not_effective 0.38% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.27% : 0.000004s : 24: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 24: predicate.environ_get_add_eliminate 1.19% : 0.000003s : 24: predicate.environ_get_depend_swap 1.75% : 0.000005s : 34: predicate.environ_get_eliminate 1.18% : 0.000003s : 24: predicate.environ_get_set_eliminate 1.62% : 0.000005s : 31: predicate.exchange_switch_depend_value 2.52% : 0.000007s : 31: predicate.float_depend_g_call 0.49% : 0.000001s : 10: predicate.float_environ_get_switch 0.73% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.56% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.57% : 0.000002s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.28% : 0.000018s : 80: predicate.inline 0.69% : 0.000002s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.73% : 0.000002s : 10: predicate.less_batch_normalization 1.85% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.52% : 0.000007s : 52: predicate.load_eliminater 0.72% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.80% : 0.000008s : 49: predicate.loop_unroll_before_grad 1.63% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 10: predicate.merge_addn 0.52% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.53% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.91% : 0.000003s : 19: predicate.minmaximum_grad 0.76% : 0.000002s : 5: predicate.mutable_eliminate 0.29% : 0.000001s : 5: predicate.opt_reshape 0.31% : 0.000001s : 5: predicate.parallel_virtual_node 2.00% : 0.000006s : 31: predicate.partial_defer_inline 1.59% : 0.000005s : 28: predicate.partial_eliminate 1.04% : 0.000003s : 19: predicate.print_const_string_wrapper 0.53% : 0.000002s : 10: predicate.reduce_all_const_elim 1.25% : 0.000004s : 19: predicate.reduce_eliminate 2.65% : 0.000008s : 52: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 10: predicate.remove_not_recompute_node 1.35% : 0.000004s : 33: predicate.replace_applicator 0.46% : 0.000001s : 10: predicate.replace_old_param 0.20% : 0.000001s : 5: predicate.reset_defer_inline 1.05% : 0.000003s : 19: predicate.reshape_eliminate 0.60% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.66% : 0.000002s : 10: predicate.same_eliminate 0.44% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.82% : 0.000002s : 10: predicate.shard_identity_eliminate 0.65% : 0.000002s : 10: predicate.special_op_eliminate 0.65% : 0.000002s : 10: predicate.specialize_transform 0.77% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.64% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.71% : 0.000005s : 31: predicate.switch_defer_inline 2.41% : 0.000007s : 41: predicate.switch_layer_defer_inline 5.84% : 0.000017s : 99: predicate.switch_simplify 1.15% : 0.000003s : 19: predicate.tile_eliminate 1.03% : 0.000003s : 19: predicate.transpose_eliminate 1.53% : 0.000004s : 29: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000009s : 43: predicate.tuple_list_get_item_eliminator 1.48% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.33% : 0.000007s : 39: predicate.tuple_list_set_item_eliminator 1.80% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.41% : 0.000007s : 52: predicate.updatestate_pure_node_eliminater 3.00% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.28% : 0.000001s : 5: predicate.value_based_eliminate 0.54% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.51% : 0.000001s : 10: predicate.virtual_output_eliminate 0.24% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000969 16 55.37% : 0.000536s : 6: func_graph_cloner_run.FuncGraphClonerGraph 44.63% : 0.000432s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.053513 192 0.01% : 0.000006s : 1: ForceFp32Comm 6.02% : 0.003221s : 1: add_attr 5.99% : 0.003207s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.11% : 0.000061s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.21% : 0.000112s : 1: auto_monad 0.06% : 0.000031s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.88% : 0.000468s : 1: bootstrap 0.06% : 0.000034s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.04% : 0.000023s : 1: control_data_broadcast_order 0.03% : 0.000014s : 1: convert_after_rewriter 0.07% : 0.000035s : 1: cse_after_recomputation 0.01% : 0.000007s : 1: dataset_repeat_opt 0.04% : 0.000022s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.20% : 0.000105s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.03% : 0.000015s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.01% : 0.000008s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 0.86% : 0.000461s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 1.02% : 0.000544s : 1: mutable_eliminate 0.02% : 0.000011s : 1: offloading_packed_experts 0.03% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000018s : 1: opt.transform.mutable_eliminate 3.30% : 0.001764s : 78: opt.transform.opt_a 0.07% : 0.000038s : 1: opt.transform.opt_after_cconv 0.06% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.26% : 0.000141s : 28: opt.transform.opt_b 0.11% : 0.000059s : 2: opt.transform.opt_trans_graph 0.09% : 0.000047s : 4: opt.transform.symbol_engine_opt 7.47% : 0.003995s : 1: opt_a 0.28% : 0.000148s : 1: opt_after_cconv 1.01% : 0.000541s : 1: opt_after_jit_grad 0.61% : 0.000324s : 1: opt_b 12.86% : 0.006880s : 1: optimize 0.05% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.05% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.04% : 0.000024s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.10% : 0.000052s : 1: pre_auto_parallel 0.08% : 0.000044s : 1: py_interpret_to_execute 0.03% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.04% : 0.000021s : 1: remove_dup_value 0.90% : 0.000483s : 1: renormalize.infer 0.80% : 0.000429s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000052s : 1: rewriter_after_opt_a 0.25% : 0.000134s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.21% : 0.000111s : 1: symbol_engine_optimizer 0.20% : 0.000105s : 1: tuple_transform 54.75% : 0.029299s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:17.530.306 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0386415, [21] [bootstrap]: 0.00041814 [type_inference]: 0.0129647 [event_method]: 9.169e-05 [auto_monad]: 9.583e-05 [graph_reusing]: 7.03e-06 [inline]: 2.22999e-06 [add_attr]: 0.00315686, [1] [add_attr_with_inline]: 0.00314739, [1] [Cycle 1]: 6.821e-05, [2] [tag_attr]: 2.791e-05 [meta_addattr_fg_expand]: 8.02e-06 [parallel-infer-symbol]: 3.20002e-06 [pre_auto_parallel]: 4.266e-05 [insert-virtual-dataset]: 2.35002e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.00002e-06 [pipeline_split]: 1.82999e-06 [optimize]: 0.0211097, [53] [py_interpret_to_execute]: 3.183e-05 [rewriter_before_opt_a]: 0.00011515 [opt_a]: 0.0188602, [2] [Cycle 1]: 0.018063, [45] [expand_dump_flag]: 4.82998e-06 [switch_simplify]: 0.00011883 [loop_unroll]: 4.239e-05 [a_1]: 0.00089014 [with_stream_mark]: 1.465e-05 [recompute_prepare]: 1.01e-05 [updatestate_depend_eliminate]: 4.41002e-06 [updatestate_assign_eliminate]: 3.75e-06 [updatestate_loads_eliminate]: 3.98999e-06 [parameter_eliminate]: 1.91e-06 [a_2]: 0.00011229 [accelerated_algorithm]: 8.94e-06 [shard]: 1.88997e-06 [meta_shard_fg_expand]: 2.74999e-06 [shard_inline]: 8.32e-06 [merge_send_recv]: 9.71998e-06 [auto_parallel]: 7.16001e-06 [parallel]: 1.754e-05 [flash_sp]: 7.94002e-06 [merge_comm]: 5.41002e-06 [allreduce_fusion]: 4.2e-06 [matmul_add_comm_reduction]: 1.044e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 9.96e-06 [virtual_dataset]: 8.40001e-06 [get_grad_eliminate_]: 7.5e-06 [virtual_output]: 7.50998e-06 [merge_forward]: 4.72998e-06 [cell_reuse_recompute_pass]: 1.69e-06 [offload_activation]: 1.055e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.596e-05 [merge_recompute_call_nodes]: 1.66002e-06 [before_grad]: 1.3e-05 [set_forward_comm_id_for_comm_node_pass]: 4.44002e-06 [meta_fg_expand]: 3.98999e-06 [flash_sp_send_recv_attached]: 2.64001e-06 [receive_attached]: 2.18998e-06 [after_resolve]: 1.313e-05 [a_after_grad]: 1.225e-05 [renormalize]: 0.0162348 [add_forward_monad_depend]: 6.21e-06 [auto_monad_grad]: 2.27999e-06 [auto_monad_eliminator]: 1.986e-05 [cse]: 3.732e-05 [a_3]: 6.613e-05 [Cycle 2]: 0.00078667, [45] [expand_dump_flag]: 1.32e-06 [switch_simplify]: 1.007e-05 [loop_unroll]: 7.96001e-06 [a_1]: 0.00018799 [with_stream_mark]: 1.581e-05 [recompute_prepare]: 8.23999e-06 [updatestate_depend_eliminate]: 4.80001e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 2.98e-06 [parameter_eliminate]: 1.01002e-06 [a_2]: 9.666e-05 [accelerated_algorithm]: 7.77998e-06 [shard]: 1.92999e-06 [meta_shard_fg_expand]: 1.84e-06 [shard_inline]: 8.05e-06 [merge_send_recv]: 7.23e-06 [auto_parallel]: 7.07002e-06 [parallel]: 6.11998e-06 [flash_sp]: 3.38e-06 [merge_comm]: 4.13001e-06 [allreduce_fusion]: 4.08999e-06 [matmul_add_comm_reduction]: 7.6e-06 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 8.84e-06 [virtual_dataset]: 7.56001e-06 [get_grad_eliminate_]: 7.06001e-06 [virtual_output]: 6.98e-06 [merge_forward]: 3.97002e-06 [cell_reuse_recompute_pass]: 1.59e-06 [offload_activation]: 9.20999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.445e-05 [merge_recompute_call_nodes]: 1.00999e-06 [before_grad]: 1.256e-05 [set_forward_comm_id_for_comm_node_pass]: 4.82e-06 [meta_fg_expand]: 3.29001e-06 [flash_sp_send_recv_attached]: 8.80013e-07 [receive_attached]: 1.32999e-06 [after_resolve]: 1.225e-05 [a_after_grad]: 1.143e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.27e-06 [auto_monad_grad]: 1.01002e-06 [auto_monad_eliminator]: 8.39002e-06 [cse]: 1.781e-05 [a_3]: 4.772e-05 [py_interpret_to_execute_after_opt_a]: 1.249e-05 [slice_cell_reuse_recomputed_activation]: 2.19001e-06 [rewriter_after_opt_a]: 4.026e-05 [convert_after_rewriter]: 7.33e-06 [order_py_execute_after_rewriter]: 6.07001e-06 [mutable_eliminate]: 0.00054834 [opt_b]: 0.00024454, [1] [Cycle 1]: 0.00023813, [7] [b_1]: 0.00015946 [b_2]: 9.36e-06 [updatestate_depend_eliminate]: 5.87999e-06 [updatestate_assign_eliminate]: 3.01001e-06 [updatestate_loads_eliminate]: 3.06001e-06 [renormalize]: 5.3001e-07 [cse]: 2.165e-05 [optimize_parallel_all_gather_comm]: 1.679e-05 [overlap_param_gather]: 1.91e-06 [cconv]: 2.4e-05 [loop_unroll]: 0.00044321 [opt_after_cconv]: 0.00011731, [1] [Cycle 1]: 0.00011186, [7] [c_1]: 3.988e-05 [parameter_eliminate]: 2.47001e-06 [updatestate_depend_eliminate]: 5.93002e-06 [updatestate_assign_eliminate]: 2.87002e-06 [updatestate_loads_eliminate]: 2.98e-06 [cse]: 2.186e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 2.738e-05 [tuple_transform]: 9.02e-05, [1] [Cycle 1]: 8.557e-05, [4] [d_1]: 5.639e-05 [none_parameter_eliminate]: 1.97001e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 8.05e-06 [partial_unused_args_eliminate]: 1.82999e-06 [add_recomputation]: 5.487e-05 [cse_after_recomputation]: 2.624e-05, [1] [Cycle 1]: 2.182e-05, [1] [cse]: 1.633e-05 [environ_conv]: 6.28e-06 [swap_dp_allreduce_reducescatter]: 5.82001e-06 [bias_add_comm_swap]: 3.05002e-06 [label_micro_interleaved_index]: 4.48999e-06 [label_fine_grained_interleaved_index]: 2.84999e-06 [merge_cast_opt]: 1.22e-06 [slice_recompute_activation]: 2.06e-06 [micro_interleaved_order_control]: 2.41998e-06 [assign_add_opt]: 1.39e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 1.00001e-06 [full_micro_interleaved_order_control]: 1.94999e-06 [reorder_send_recv_between_fp_bp]: 2.95002e-06 [comm_op_add_attrs]: 9.50007e-07 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.07998e-06 [interleave_parallel_branches]: 1.00001e-06 [overlap_opt_shard_in_pipeline]: 1.12e-06 [overlap_opt_shard_grad_in_pipeline]: 2.06e-06 [control_data_broadcast_order]: 3.157e-05 [grouped_pairwise_exchange_alltoall]: 1.73002e-06 [offloading_packed_experts]: 4.71997e-06 [overlap_recompute_and_grad_model_parallel]: 5.59e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.54e-06 [overlap_recompute_allgather_and_fa_grad]: 1.45001e-06 [overlap_recompute_comm]: 2.63e-06 [overlap_grad_ring_attention]: 5.08002e-06 [overlap_grad_flash_sp]: 2.193e-05 [begin_end_overlap_inline]: 6.30011e-07 [split_matmul_comm_elemetwise]: 2.32999e-06 [split_layernorm_comm]: 1.56002e-06 [handle_group_info]: 1.05999e-06 [symbol_engine_optimizer]: 8.831e-05, [1] [Cycle 1]: 8.369e-05, [6] [build]: 3.19001e-06 [elim_shapecalc]: 1.19e-05 [elim_not_effective]: 1.674e-05 [opt_reshape]: 9.00999e-06 [fold_const_symbol]: 1.343e-05 [renormalize]: 3.4002e-07 [detach_backward]: 2.06998e-06 [pipeline_parallel_scheduler]: 1.50999e-06 [auto_monad_reorder]: 1.985e-05 [get_jit_bprop_graph]: 1.29998e-06 [rewriter_after_jit_bprop_graph]: 3.88001e-06 [opt_after_jit_grad]: 0.000516 [validate]: 4.013e-05 Sums bootstrap : 0.000418s : 1.21% type_inference : 0.012965s : 37.59% event_method : 0.000092s : 0.27% auto_monad : 0.000096s : 0.28% graph_reusing : 0.000007s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000043s : 0.12% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000032s : 0.09% optimize.rewriter_before_opt_a : 0.000115s : 0.33% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000129s : 0.37% optimize.opt_a.loop_unroll : 0.000050s : 0.15% optimize.opt_a.a_1 : 0.001078s : 3.13% optimize.opt_a.with_stream_mark : 0.000030s : 0.09% optimize.opt_a.recompute_prepare : 0.000018s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000209s : 0.61% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.05% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.05% optimize.opt_a.merge_send_recv : 0.000017s : 0.05% optimize.opt_a.auto_parallel : 0.000014s : 0.04% optimize.opt_a.parallel : 0.000024s : 0.07% optimize.opt_a.flash_sp : 0.000011s : 0.03% optimize.opt_a.merge_comm : 0.000010s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.05% optimize.opt_a.virtual_dataset : 0.000016s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000014s : 0.04% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000020s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.07% optimize.opt_a.a_after_grad : 0.000024s : 0.07% optimize.opt_a.renormalize : 0.016235s : 47.07% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.08% optimize.opt_a.cse : 0.000055s : 0.16% optimize.opt_a.a_3 : 0.000114s : 0.33% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000040s : 0.12% optimize.convert_after_rewriter : 0.000007s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000548s : 1.59% optimize.opt_b.b_1 : 0.000159s : 0.46% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.05% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000024s : 0.07% optimize.loop_unroll : 0.000443s : 1.29% optimize.opt_after_cconv.c_1 : 0.000040s : 0.12% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000022s : 0.06% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000027s : 0.08% optimize.tuple_transform.d_1 : 0.000056s : 0.16% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000055s : 0.16% optimize.cse_after_recomputation.cse : 0.000016s : 0.05% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000032s : 0.09% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000022s : 0.06% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.06% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000516s : 1.50% validate : 0.000040s : 0.12% Time group info: ------[substitution.] 0.000246 44 10.34% : 0.000025s : 3: substitution.cast_eliminate 0.96% : 0.000002s : 3: substitution.elim_not_effective 0.81% : 0.000002s : 3: substitution.fold_const_symbol 2.80% : 0.000007s : 5: substitution.graph_param_transform 69.95% : 0.000172s : 8: substitution.inline 2.05% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.37% : 0.000006s : 6: substitution.remove_not_recompute_node 1.86% : 0.000005s : 4: substitution.replace_old_param 3.65% : 0.000009s : 2: substitution.switch_simplify 5.21% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.012900 2 89.83% : 0.011588s : 1: type_inference.infer 10.17% : 0.001312s : 1: type_inference.specialize ------[replace.] 0.000115 14 44.52% : 0.000051s : 8: replace.inline 35.17% : 0.000040s : 2: replace.switch_simplify 20.31% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000187 14 90.00% : 0.000168s : 8: match.inline 4.17% : 0.000008s : 2: match.switch_simplify 5.84% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000278 1746 0.94% : 0.000003s : 19: predicate.accumulaten_eliminater 0.73% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 10: predicate.addn_check_dump 0.94% : 0.000003s : 19: predicate.addn_zero_filter 0.91% : 0.000003s : 19: predicate.adjust_all_reduce_mul_add 2.05% : 0.000006s : 29: predicate.arithmetic_simplify 1.21% : 0.000003s : 19: predicate.cast_eliminate 0.53% : 0.000001s : 10: predicate.check_bprop_eliminate 0.53% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.60% : 0.000002s : 10: predicate.depend_value_elim 1.19% : 0.000003s : 19: predicate.dict_get_item_const_eliminator 1.09% : 0.000003s : 19: predicate.dict_get_item_eliminator 0.97% : 0.000003s : 19: predicate.dict_set_item_eliminator 0.84% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 5: predicate.elim_not_effective 0.33% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 24: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 24: predicate.environ_get_add_eliminate 1.21% : 0.000003s : 24: predicate.environ_get_depend_swap 1.69% : 0.000005s : 34: predicate.environ_get_eliminate 1.16% : 0.000003s : 24: predicate.environ_get_set_eliminate 1.63% : 0.000005s : 31: predicate.exchange_switch_depend_value 2.51% : 0.000007s : 31: predicate.float_depend_g_call 0.51% : 0.000001s : 10: predicate.float_environ_get_switch 0.73% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 5: predicate.fold_const_symbol 0.57% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.62% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.32% : 0.000018s : 80: predicate.inline 0.81% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.69% : 0.000002s : 10: predicate.less_batch_normalization 1.78% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.52% : 0.000007s : 52: predicate.load_eliminater 0.80% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.63% : 0.000007s : 49: predicate.loop_unroll_before_grad 1.74% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.58% : 0.000002s : 10: predicate.merge_addn 0.51% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.49% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.90% : 0.000003s : 19: predicate.minmaximum_grad 0.73% : 0.000002s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.31% : 0.000001s : 5: predicate.parallel_virtual_node 2.00% : 0.000006s : 31: predicate.partial_defer_inline 1.64% : 0.000005s : 28: predicate.partial_eliminate 1.03% : 0.000003s : 19: predicate.print_const_string_wrapper 0.51% : 0.000001s : 10: predicate.reduce_all_const_elim 1.25% : 0.000003s : 19: predicate.reduce_eliminate 2.56% : 0.000007s : 52: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 10: predicate.remove_not_recompute_node 1.31% : 0.000004s : 33: predicate.replace_applicator 0.42% : 0.000001s : 10: predicate.replace_old_param 0.22% : 0.000001s : 5: predicate.reset_defer_inline 1.00% : 0.000003s : 19: predicate.reshape_eliminate 0.65% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.69% : 0.000002s : 10: predicate.same_eliminate 0.37% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.60% : 0.000002s : 10: predicate.shard_identity_eliminate 0.66% : 0.000002s : 10: predicate.special_op_eliminate 0.82% : 0.000002s : 10: predicate.specialize_transform 0.73% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.66% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.72% : 0.000005s : 31: predicate.switch_defer_inline 2.25% : 0.000006s : 41: predicate.switch_layer_defer_inline 5.87% : 0.000016s : 99: predicate.switch_simplify 1.01% : 0.000003s : 19: predicate.tile_eliminate 1.13% : 0.000003s : 19: predicate.transpose_eliminate 1.58% : 0.000004s : 29: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.04% : 0.000008s : 43: predicate.tuple_list_get_item_eliminator 1.46% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.29% : 0.000006s : 39: predicate.tuple_list_set_item_eliminator 1.71% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.50% : 0.000007s : 52: predicate.updatestate_pure_node_eliminater 3.14% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.28% : 0.000001s : 5: predicate.value_based_eliminate 0.59% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.57% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.34% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000965 16 55.89% : 0.000539s : 6: func_graph_cloner_run.FuncGraphClonerGraph 44.11% : 0.000426s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.081057 192 0.00% : 0.000004s : 1: ForceFp32Comm 3.90% : 0.003162s : 1: add_attr 3.89% : 0.003151s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.07% : 0.000059s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.13% : 0.000103s : 1: auto_monad 0.03% : 0.000023s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.55% : 0.000446s : 1: bootstrap 0.03% : 0.000027s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000035s : 1: control_data_broadcast_order 0.01% : 0.000010s : 1: convert_after_rewriter 0.04% : 0.000030s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.12% : 0.000101s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000005s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.56% : 0.000452s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.69% : 0.000557s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000016s : 1: opt.transform.mutable_eliminate 2.10% : 0.001701s : 78: opt.transform.opt_a 0.05% : 0.000038s : 1: opt.transform.opt_after_cconv 0.04% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.17% : 0.000137s : 28: opt.transform.opt_b 0.08% : 0.000062s : 2: opt.transform.opt_trans_graph 0.06% : 0.000047s : 4: opt.transform.symbol_engine_opt 23.27% : 0.018864s : 1: opt_a 0.15% : 0.000121s : 1: opt_after_cconv 0.65% : 0.000525s : 1: opt_after_jit_grad 0.31% : 0.000249s : 1: opt_b 26.05% : 0.021115s : 1: optimize 0.03% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.03% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.06% : 0.000047s : 1: pre_auto_parallel 0.04% : 0.000036s : 1: py_interpret_to_execute 0.02% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000003s : 1: remove_cast_before_assign_add 0.04% : 0.000032s : 1: remove_dup_value 19.42% : 0.015744s : 1: renormalize.infer 0.59% : 0.000480s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000044s : 1: rewriter_after_opt_a 0.15% : 0.000119s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.11% : 0.000091s : 1: symbol_engine_optimizer 0.12% : 0.000094s : 1: tuple_transform 16.02% : 0.012983s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:18.232.16 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:18.234.63 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0582646, [21] [bootstrap]: 0.00042636 [type_inference]: 0.0298116 [event_method]: 8.888e-05 [auto_monad]: 0.00011311 [graph_reusing]: 7.9e-06 [inline]: 2.49001e-06 [add_attr]: 0.00316905, [1] [add_attr_with_inline]: 0.00316013, [1] [Cycle 1]: 7.951e-05, [2] [tag_attr]: 2.791e-05 [meta_addattr_fg_expand]: 8.49998e-06 [parallel-infer-symbol]: 3.32002e-06 [pre_auto_parallel]: 4.288e-05 [insert-virtual-dataset]: 2.49999e-06 [parallel-infer-symbol-second]: 8.39995e-07 [dataset_repeat_opt]: 1.99999e-06 [pipeline_split]: 1.50999e-06 [optimize]: 0.0232877, [53] [py_interpret_to_execute]: 3.885e-05 [rewriter_before_opt_a]: 0.00012431 [opt_a]: 0.00408155, [2] [Cycle 1]: 0.00303507, [45] [expand_dump_flag]: 4.17998e-06 [switch_simplify]: 0.00012426 [loop_unroll]: 4.317e-05 [a_1]: 0.00098271 [with_stream_mark]: 1.722e-05 [recompute_prepare]: 1.109e-05 [updatestate_depend_eliminate]: 5.21998e-06 [updatestate_assign_eliminate]: 5.13002e-06 [updatestate_loads_eliminate]: 4.32e-06 [parameter_eliminate]: 1.82999e-06 [a_2]: 0.00015216 [accelerated_algorithm]: 9.99999e-06 [shard]: 2.07001e-06 [meta_shard_fg_expand]: 2.64001e-06 [shard_inline]: 8.99e-06 [merge_send_recv]: 1.03e-05 [auto_parallel]: 7.97998e-06 [parallel]: 1.814e-05 [flash_sp]: 8.35001e-06 [merge_comm]: 5.32001e-06 [allreduce_fusion]: 5.42999e-06 [matmul_add_comm_reduction]: 1.131e-05 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 1.11e-05 [virtual_dataset]: 9.44e-06 [get_grad_eliminate_]: 8.67e-06 [virtual_output]: 8.75999e-06 [merge_forward]: 4.92999e-06 [cell_reuse_recompute_pass]: 1.26002e-06 [offload_activation]: 1.114e-05 [cell_reuse_handle_not_recompute_node_pass]: 2e-05 [merge_recompute_call_nodes]: 1.43002e-06 [before_grad]: 1.538e-05 [set_forward_comm_id_for_comm_node_pass]: 5.52001e-06 [meta_fg_expand]: 5.04998e-06 [flash_sp_send_recv_attached]: 2.72001e-06 [receive_attached]: 2.04e-06 [after_resolve]: 1.442e-05 [a_after_grad]: 1.494e-05 [renormalize]: 0.00091352 [add_forward_monad_depend]: 5.86e-06 [auto_monad_grad]: 2.06e-06 [auto_monad_eliminator]: 1.987e-05 [cse]: 4.333e-05 [a_3]: 8.176e-05 [Cycle 2]: 0.00103391, [45] [expand_dump_flag]: 1.00999e-06 [switch_simplify]: 1.089e-05 [loop_unroll]: 9.36e-06 [a_1]: 0.00022103 [with_stream_mark]: 1.399e-05 [recompute_prepare]: 9.39e-06 [updatestate_depend_eliminate]: 4.53001e-06 [updatestate_assign_eliminate]: 3.71999e-06 [updatestate_loads_eliminate]: 3.55e-06 [parameter_eliminate]: 1.19998e-06 [a_2]: 0.00014161 [accelerated_algorithm]: 8.92999e-06 [shard]: 1.29e-06 [meta_shard_fg_expand]: 1.77001e-06 [shard_inline]: 8.97999e-06 [merge_send_recv]: 6.86999e-06 [auto_parallel]: 6.87002e-06 [parallel]: 4.62998e-06 [flash_sp]: 3.14999e-06 [merge_comm]: 4.75001e-06 [allreduce_fusion]: 4.28999e-06 [matmul_add_comm_reduction]: 7.73999e-06 [allreduce_slice_to_reducescatter]: 4.69998e-07 [virtual_shard_identity]: 9.54e-06 [virtual_dataset]: 8.43999e-06 [get_grad_eliminate_]: 8.35001e-06 [virtual_output]: 8.47e-06 [merge_forward]: 4.10998e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 8.25e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.911e-05 [merge_recompute_call_nodes]: 9.40025e-07 [before_grad]: 1.407e-05 [set_forward_comm_id_for_comm_node_pass]: 5.41998e-06 [meta_fg_expand]: 3.43e-06 [flash_sp_send_recv_attached]: 8.70001e-07 [receive_attached]: 1.01002e-06 [after_resolve]: 1.267e-05 [a_after_grad]: 1.341e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.27e-06 [auto_monad_grad]: 8.39995e-07 [auto_monad_eliminator]: 1.019e-05 [cse]: 2.211e-05 [a_3]: 6.945e-05 [py_interpret_to_execute_after_opt_a]: 1.323e-05 [slice_cell_reuse_recomputed_activation]: 5.30999e-06 [rewriter_after_opt_a]: 5.069e-05 [convert_after_rewriter]: 1.106e-05 [order_py_execute_after_rewriter]: 1.015e-05 [mutable_eliminate]: 0.0168282 [opt_b]: 0.00042473, [1] [Cycle 1]: 0.00040968, [7] [b_1]: 0.00025118 [b_2]: 1.255e-05 [updatestate_depend_eliminate]: 1.476e-05 [updatestate_assign_eliminate]: 5.16002e-06 [updatestate_loads_eliminate]: 4.68999e-06 [renormalize]: 1.09e-06 [cse]: 5.726e-05 [optimize_parallel_all_gather_comm]: 3.082e-05 [overlap_param_gather]: 5.34e-06 [cconv]: 4.582e-05 [loop_unroll]: 0.00054296 [opt_after_cconv]: 0.00017055, [1] [Cycle 1]: 0.00016158, [7] [c_1]: 4.841e-05 [parameter_eliminate]: 6.29999e-06 [updatestate_depend_eliminate]: 8.28001e-06 [updatestate_assign_eliminate]: 3.88001e-06 [updatestate_loads_eliminate]: 3.6e-06 [cse]: 3.239e-05 [renormalize]: 5.90022e-07 [remove_dup_value]: 5.931e-05 [tuple_transform]: 0.00012251, [1] [Cycle 1]: 0.00011423, [4] [d_1]: 6.94e-05 [none_parameter_eliminate]: 2.00002e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 9.92999e-06 [partial_unused_args_eliminate]: 4.50999e-06 [add_recomputation]: 7.631e-05 [cse_after_recomputation]: 3.798e-05, [1] [Cycle 1]: 3.1e-05, [1] [cse]: 2.064e-05 [environ_conv]: 1.128e-05 [swap_dp_allreduce_reducescatter]: 1.08e-05 [bias_add_comm_swap]: 6.12001e-06 [label_micro_interleaved_index]: 7.05e-06 [label_fine_grained_interleaved_index]: 5.20999e-06 [merge_cast_opt]: 3.86001e-06 [slice_recompute_activation]: 4.75001e-06 [micro_interleaved_order_control]: 4.99e-06 [assign_add_opt]: 3.43e-06 [ForceFp32Comm]: 3.06001e-06 [remove_cast_before_assign_add]: 3.24001e-06 [full_micro_interleaved_order_control]: 4.52e-06 [reorder_send_recv_between_fp_bp]: 5.95002e-06 [comm_op_add_attrs]: 3.25e-06 [add_comm_op_reuse_tag]: 4.03001e-06 [interleave_split_concat_branches]: 3.68999e-06 [interleave_parallel_branches]: 3.55e-06 [overlap_opt_shard_in_pipeline]: 3.53e-06 [overlap_opt_shard_grad_in_pipeline]: 4.47e-06 [control_data_broadcast_order]: 5.867e-05 [grouped_pairwise_exchange_alltoall]: 4.39002e-06 [offloading_packed_experts]: 8.72e-06 [overlap_recompute_and_grad_model_parallel]: 9.31e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.86001e-06 [overlap_recompute_allgather_and_fa_grad]: 4.21001e-06 [overlap_recompute_comm]: 4.84003e-06 [overlap_grad_ring_attention]: 7.66001e-06 [overlap_grad_flash_sp]: 2.991e-05 [begin_end_overlap_inline]: 3.11001e-06 [split_matmul_comm_elemetwise]: 5.06002e-06 [split_layernorm_comm]: 4.08999e-06 [handle_group_info]: 3.68e-06 [symbol_engine_optimizer]: 0.00012296, [1] [Cycle 1]: 0.00011552, [6] [build]: 4.87e-06 [elim_shapecalc]: 1.616e-05 [elim_not_effective]: 1.889e-05 [opt_reshape]: 1.039e-05 [fold_const_symbol]: 1.565e-05 [renormalize]: 2.10013e-07 [detach_backward]: 3.89002e-06 [pipeline_parallel_scheduler]: 2.06e-06 [auto_monad_reorder]: 2.524e-05 [get_jit_bprop_graph]: 2.14e-06 [rewriter_after_jit_bprop_graph]: 6.53998e-06 [opt_after_jit_grad]: 0.00050759 [validate]: 4.81e-05 Sums bootstrap : 0.000426s : 0.80% type_inference : 0.029812s : 56.04% event_method : 0.000089s : 0.17% auto_monad : 0.000113s : 0.21% graph_reusing : 0.000008s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000043s : 0.08% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000039s : 0.07% optimize.rewriter_before_opt_a : 0.000124s : 0.23% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000135s : 0.25% optimize.opt_a.loop_unroll : 0.000053s : 0.10% optimize.opt_a.a_1 : 0.001204s : 2.26% optimize.opt_a.with_stream_mark : 0.000031s : 0.06% optimize.opt_a.recompute_prepare : 0.000020s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000294s : 0.55% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.04% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000018s : 0.03% optimize.opt_a.merge_send_recv : 0.000017s : 0.03% optimize.opt_a.auto_parallel : 0.000015s : 0.03% optimize.opt_a.parallel : 0.000023s : 0.04% optimize.opt_a.flash_sp : 0.000012s : 0.02% optimize.opt_a.merge_comm : 0.000010s : 0.02% optimize.opt_a.allreduce_fusion : 0.000010s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.04% optimize.opt_a.virtual_dataset : 0.000018s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.03% optimize.opt_a.virtual_output : 0.000017s : 0.03% optimize.opt_a.merge_forward : 0.000009s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000029s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.02% optimize.opt_a.meta_fg_expand : 0.000008s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000027s : 0.05% optimize.opt_a.a_after_grad : 0.000028s : 0.05% optimize.opt_a.renormalize : 0.000914s : 1.72% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.06% optimize.opt_a.cse : 0.000065s : 0.12% optimize.opt_a.a_3 : 0.000151s : 0.28% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000051s : 0.10% optimize.convert_after_rewriter : 0.000011s : 0.02% optimize.order_py_execute_after_rewriter : 0.000010s : 0.02% optimize.mutable_eliminate : 0.016828s : 31.64% optimize.opt_b.b_1 : 0.000251s : 0.47% optimize.opt_b.b_2 : 0.000013s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000015s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000057s : 0.11% optimize.optimize_parallel_all_gather_comm : 0.000031s : 0.06% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000046s : 0.09% optimize.loop_unroll : 0.000543s : 1.02% optimize.opt_after_cconv.c_1 : 0.000048s : 0.09% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000032s : 0.06% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000059s : 0.11% optimize.tuple_transform.d_1 : 0.000069s : 0.13% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.02% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000076s : 0.14% optimize.cse_after_recomputation.cse : 0.000021s : 0.04% optimize.environ_conv : 0.000011s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000011s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000003s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000059s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000009s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000008s : 0.01% optimize.overlap_grad_flash_sp : 0.000030s : 0.06% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000025s : 0.05% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000508s : 0.95% validate : 0.000048s : 0.09% Time group info: ------[substitution.] 0.000279 54 11.42% : 0.000032s : 6: substitution.cast_eliminate 0.98% : 0.000003s : 4: substitution.elim_not_effective 0.76% : 0.000002s : 4: substitution.fold_const_symbol 2.90% : 0.000008s : 6: substitution.graph_param_transform 69.38% : 0.000194s : 8: substitution.inline 1.75% : 0.000005s : 8: substitution.j_node_and_user_rematch 2.81% : 0.000008s : 8: substitution.remove_not_recompute_node 1.70% : 0.000005s : 4: substitution.replace_old_param 3.36% : 0.000009s : 2: substitution.switch_simplify 4.93% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.029743 2 95.00% : 0.028258s : 1: type_inference.infer 5.00% : 0.001486s : 1: type_inference.specialize ------[replace.] 0.000123 14 44.31% : 0.000055s : 8: replace.inline 36.42% : 0.000045s : 2: replace.switch_simplify 19.27% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000209 14 90.75% : 0.000189s : 8: match.inline 3.91% : 0.000008s : 2: match.switch_simplify 5.34% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000330 1972 0.90% : 0.000003s : 21: predicate.accumulaten_eliminater 0.57% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.52% : 0.000002s : 12: predicate.addn_check_dump 0.93% : 0.000003s : 21: predicate.addn_zero_filter 0.88% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.13% : 0.000007s : 33: predicate.arithmetic_simplify 1.11% : 0.000004s : 21: predicate.cast_eliminate 0.59% : 0.000002s : 12: predicate.check_bprop_eliminate 0.51% : 0.000002s : 12: predicate.compare_switch_simplify 0.18% : 0.000001s : 6: predicate.const_output_eliminate 0.58% : 0.000002s : 12: predicate.depend_value_elim 0.98% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.03% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.92% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.87% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.17% : 0.000001s : 6: predicate.elim_not_effective 0.47% : 0.000002s : 6: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.15% : 0.000004s : 27: predicate.environ_get_add_eliminate 1.16% : 0.000004s : 27: predicate.environ_get_depend_swap 1.77% : 0.000006s : 39: predicate.environ_get_eliminate 1.14% : 0.000004s : 27: predicate.environ_get_set_eliminate 1.46% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.39% : 0.000008s : 33: predicate.float_depend_g_call 0.51% : 0.000002s : 12: predicate.float_environ_get_switch 0.76% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 6: predicate.fold_const_symbol 0.57% : 0.000002s : 12: predicate.get_grad_eliminate 0.19% : 0.000001s : 6: predicate.graph_param_transform 0.61% : 0.000002s : 12: predicate.incorporate_call 0.50% : 0.000002s : 12: predicate.incorporate_call_switch 6.39% : 0.000021s : 90: predicate.inline 0.75% : 0.000002s : 12: predicate.inline_without_move 0.28% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.78% : 0.000003s : 12: predicate.less_batch_normalization 1.79% : 0.000006s : 37: predicate.list_to_tuple_eliminator_ 2.55% : 0.000008s : 58: predicate.load_eliminater 0.97% : 0.000003s : 6: predicate.loop_unroll_after_grad 2.26% : 0.000007s : 51: predicate.loop_unroll_before_grad 1.69% : 0.000006s : 33: predicate.make_slice_get_slice_eliminator 0.55% : 0.000002s : 12: predicate.merge_addn 0.51% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.58% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.91% : 0.000003s : 21: predicate.minmaximum_grad 2.23% : 0.000007s : 6: predicate.mutable_eliminate 0.32% : 0.000001s : 6: predicate.opt_reshape 0.38% : 0.000001s : 6: predicate.parallel_virtual_node 1.91% : 0.000006s : 33: predicate.partial_defer_inline 1.55% : 0.000005s : 31: predicate.partial_eliminate 0.91% : 0.000003s : 21: predicate.print_const_string_wrapper 0.52% : 0.000002s : 12: predicate.reduce_all_const_elim 1.21% : 0.000004s : 21: predicate.reduce_eliminate 2.44% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 12: predicate.remove_not_recompute_node 1.25% : 0.000004s : 37: predicate.replace_applicator 0.44% : 0.000001s : 12: predicate.replace_old_param 0.41% : 0.000001s : 6: predicate.reset_defer_inline 1.02% : 0.000003s : 21: predicate.reshape_eliminate 0.58% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.47% : 0.000002s : 6: predicate.row_tensor_eliminate 0.63% : 0.000002s : 12: predicate.same_eliminate 0.36% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.67% : 0.000002s : 12: predicate.shard_identity_eliminate 0.57% : 0.000002s : 12: predicate.special_op_eliminate 0.79% : 0.000003s : 12: predicate.specialize_transform 0.74% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.62% : 0.000005s : 33: predicate.switch_defer_inline 2.12% : 0.000007s : 45: predicate.switch_layer_defer_inline 5.33% : 0.000018s : 106: predicate.switch_simplify 0.94% : 0.000003s : 21: predicate.tile_eliminate 0.92% : 0.000003s : 21: predicate.transpose_eliminate 1.57% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.65% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000010s : 49: predicate.tuple_list_get_item_eliminator 1.53% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.29% : 0.000008s : 45: predicate.tuple_list_set_item_eliminator 1.77% : 0.000006s : 37: predicate.tuple_to_list_eliminator_ 2.44% : 0.000008s : 58: predicate.updatestate_pure_node_eliminater 3.17% : 0.000010s : 70: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 6: predicate.value_based_eliminate 0.56% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.57% : 0.000002s : 12: predicate.virtual_output_eliminate 0.25% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000996 16 55.30% : 0.000551s : 6: func_graph_cloner_run.FuncGraphClonerGraph 44.70% : 0.000445s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.087861 192 0.01% : 0.000006s : 1: ForceFp32Comm 3.62% : 0.003178s : 1: add_attr 3.60% : 0.003164s : 1: add_attr_with_inline 0.01% : 0.000007s : 1: add_comm_op_reuse_tag 0.09% : 0.000080s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.14% : 0.000126s : 1: auto_monad 0.04% : 0.000034s : 1: auto_monad_reorder 0.01% : 0.000007s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.54% : 0.000470s : 1: bootstrap 0.06% : 0.000050s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000062s : 1: control_data_broadcast_order 0.02% : 0.000014s : 1: convert_after_rewriter 0.05% : 0.000041s : 1: cse_after_recomputation 0.01% : 0.000007s : 1: dataset_repeat_opt 0.03% : 0.000024s : 1: detach_backward 0.02% : 0.000014s : 1: environ_conv 0.12% : 0.000104s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000015s : 1: graph_reusing 0.01% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.63% : 0.000551s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 19.17% : 0.016842s : 1: mutable_eliminate 0.01% : 0.000012s : 1: offloading_packed_experts 0.03% : 0.000022s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000043s : 1: opt.transform.mutable_eliminate 2.18% : 0.001914s : 78: opt.transform.opt_a 0.05% : 0.000047s : 1: opt.transform.opt_after_cconv 0.04% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.21% : 0.000182s : 28: opt.transform.opt_b 0.09% : 0.000077s : 2: opt.transform.opt_trans_graph 0.06% : 0.000057s : 4: opt.transform.symbol_engine_opt 4.65% : 0.004085s : 1: opt_a 0.20% : 0.000175s : 1: opt_after_cconv 0.59% : 0.000518s : 1: opt_after_jit_grad 0.49% : 0.000429s : 1: opt_b 27.01% : 0.023735s : 1: optimize 0.04% : 0.000034s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000013s : 1: order_py_execute_after_rewriter 0.04% : 0.000033s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.06% : 0.000050s : 1: pre_auto_parallel 0.05% : 0.000043s : 1: py_interpret_to_execute 0.02% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000063s : 1: remove_dup_value 0.54% : 0.000477s : 1: renormalize.infer 0.49% : 0.000429s : 1: renormalize.specialize 0.01% : 0.000010s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000054s : 1: rewriter_after_opt_a 0.15% : 0.000128s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.02% : 0.000014s : 1: swap_dp_allreduce_reducescatter 0.14% : 0.000126s : 1: symbol_engine_optimizer 0.14% : 0.000125s : 1: tuple_transform 33.98% : 0.029851s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:18.526.304 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0565585, [21] [bootstrap]: 0.00042535 [type_inference]: 0.0296645 [event_method]: 8.654e-05 [auto_monad]: 9.951e-05 [graph_reusing]: 7.72002e-06 [inline]: 2.66e-06 [add_attr]: 0.00306417, [1] [add_attr_with_inline]: 0.00305583, [1] [Cycle 1]: 7.086e-05, [2] [tag_attr]: 2.731e-05 [meta_addattr_fg_expand]: 8.43999e-06 [parallel-infer-symbol]: 2.91e-06 [pre_auto_parallel]: 4.228e-05 [insert-virtual-dataset]: 2.28998e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.94e-06 [pipeline_split]: 1.63002e-06 [optimize]: 0.0224133, [53] [py_interpret_to_execute]: 3.214e-05 [rewriter_before_opt_a]: 0.00011702 [opt_a]: 0.00363553, [2] [Cycle 1]: 0.00276195, [45] [expand_dump_flag]: 3.97e-06 [switch_simplify]: 0.00015175 [loop_unroll]: 4.345e-05 [a_1]: 0.00092416 [with_stream_mark]: 1.611e-05 [recompute_prepare]: 1.177e-05 [updatestate_depend_eliminate]: 5.32001e-06 [updatestate_assign_eliminate]: 4.39002e-06 [updatestate_loads_eliminate]: 4.18999e-06 [parameter_eliminate]: 2.02999e-06 [a_2]: 0.00012323 [accelerated_algorithm]: 1.02e-05 [shard]: 2.36e-06 [meta_shard_fg_expand]: 2.51e-06 [shard_inline]: 1.101e-05 [merge_send_recv]: 1.072e-05 [auto_parallel]: 7.58001e-06 [parallel]: 1.919e-05 [flash_sp]: 8.28999e-06 [merge_comm]: 6.23e-06 [allreduce_fusion]: 5.54e-06 [matmul_add_comm_reduction]: 1.111e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 1.111e-05 [virtual_dataset]: 9.13002e-06 [get_grad_eliminate_]: 8.82e-06 [virtual_output]: 8.80001e-06 [merge_forward]: 4.75999e-06 [cell_reuse_recompute_pass]: 1.19e-06 [offload_activation]: 1.165e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.853e-05 [merge_recompute_call_nodes]: 1.40001e-06 [before_grad]: 1.463e-05 [set_forward_comm_id_for_comm_node_pass]: 5.10999e-06 [meta_fg_expand]: 4.22998e-06 [flash_sp_send_recv_attached]: 2.49999e-06 [receive_attached]: 2.29999e-06 [after_resolve]: 1.438e-05 [a_after_grad]: 1.399e-05 [renormalize]: 0.00085369 [add_forward_monad_depend]: 5.45001e-06 [auto_monad_grad]: 1.62999e-06 [auto_monad_eliminator]: 1.82e-05 [cse]: 4.29e-05 [a_3]: 6.787e-05 [Cycle 2]: 0.00086249, [45] [expand_dump_flag]: 9.5999e-07 [switch_simplify]: 1.04e-05 [loop_unroll]: 9.02999e-06 [a_1]: 0.00021784 [with_stream_mark]: 1.325e-05 [recompute_prepare]: 9.09998e-06 [updatestate_depend_eliminate]: 4.46002e-06 [updatestate_assign_eliminate]: 3.36999e-06 [updatestate_loads_eliminate]: 3.48e-06 [parameter_eliminate]: 1.04e-06 [a_2]: 0.00011345 [accelerated_algorithm]: 8.94998e-06 [shard]: 1.98002e-06 [meta_shard_fg_expand]: 1.97999e-06 [shard_inline]: 9.12001e-06 [merge_send_recv]: 6.23998e-06 [auto_parallel]: 7.03e-06 [parallel]: 4.55001e-06 [flash_sp]: 3.17002e-06 [merge_comm]: 4.55001e-06 [allreduce_fusion]: 4.63999e-06 [matmul_add_comm_reduction]: 8.13999e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 9.74e-06 [virtual_dataset]: 9.05999e-06 [get_grad_eliminate_]: 8.15e-06 [virtual_output]: 8.19002e-06 [merge_forward]: 3.68999e-06 [cell_reuse_recompute_pass]: 1.77001e-06 [offload_activation]: 8.42998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.777e-05 [merge_recompute_call_nodes]: 7.59988e-07 [before_grad]: 1.434e-05 [set_forward_comm_id_for_comm_node_pass]: 5.59e-06 [meta_fg_expand]: 3.71999e-06 [flash_sp_send_recv_attached]: 1.03001e-06 [receive_attached]: 1.03001e-06 [after_resolve]: 1.262e-05 [a_after_grad]: 1.306e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.39e-06 [auto_monad_grad]: 1.02998e-06 [auto_monad_eliminator]: 9.72999e-06 [cse]: 2.166e-05 [a_3]: 5.533e-05 [py_interpret_to_execute_after_opt_a]: 1.031e-05 [slice_cell_reuse_recomputed_activation]: 1.91e-06 [rewriter_after_opt_a]: 4.372e-05 [convert_after_rewriter]: 8.31002e-06 [order_py_execute_after_rewriter]: 6.33e-06 [mutable_eliminate]: 0.0166467 [opt_b]: 0.00034104, [1] [Cycle 1]: 0.00032946, [7] [b_1]: 0.00020167 [b_2]: 1.231e-05 [updatestate_depend_eliminate]: 1.344e-05 [updatestate_assign_eliminate]: 4.19002e-06 [updatestate_loads_eliminate]: 4.13999e-06 [renormalize]: 5.60016e-07 [cse]: 5.406e-05 [optimize_parallel_all_gather_comm]: 2.898e-05 [overlap_param_gather]: 2.26998e-06 [cconv]: 4.109e-05 [loop_unroll]: 0.00058271 [opt_after_cconv]: 0.00014475, [1] [Cycle 1]: 0.00013845, [7] [c_1]: 4.784e-05 [parameter_eliminate]: 6.10002e-06 [updatestate_depend_eliminate]: 7.8e-06 [updatestate_assign_eliminate]: 3.76001e-06 [updatestate_loads_eliminate]: 3.53e-06 [cse]: 3.195e-05 [renormalize]: 5.50004e-07 [remove_dup_value]: 5.046e-05 [tuple_transform]: 0.00010528, [1] [Cycle 1]: 0.00010064, [4] [d_1]: 7.055e-05 [none_parameter_eliminate]: 1.74998e-06 [renormalize]: 1.70025e-07 [switch_simplify]: 9.64999e-06 [partial_unused_args_eliminate]: 1.88997e-06 [add_recomputation]: 7.176e-05 [cse_after_recomputation]: 2.925e-05, [1] [Cycle 1]: 2.342e-05, [1] [cse]: 1.752e-05 [environ_conv]: 7.29001e-06 [swap_dp_allreduce_reducescatter]: 6.89001e-06 [bias_add_comm_swap]: 3.95e-06 [label_micro_interleaved_index]: 4.48001e-06 [label_fine_grained_interleaved_index]: 2.54999e-06 [merge_cast_opt]: 1.31002e-06 [slice_recompute_activation]: 2.51998e-06 [micro_interleaved_order_control]: 2.35002e-06 [assign_add_opt]: 1.52001e-06 [ForceFp32Comm]: 1.09e-06 [remove_cast_before_assign_add]: 1.05999e-06 [full_micro_interleaved_order_control]: 1.99e-06 [reorder_send_recv_between_fp_bp]: 2.83e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 1.02998e-06 [interleave_split_concat_branches]: 1.38002e-06 [interleave_parallel_branches]: 1.02998e-06 [overlap_opt_shard_in_pipeline]: 1.19e-06 [overlap_opt_shard_grad_in_pipeline]: 1.77001e-06 [control_data_broadcast_order]: 1.866e-05 [grouped_pairwise_exchange_alltoall]: 1.62001e-06 [offloading_packed_experts]: 4.89003e-06 [overlap_recompute_and_grad_model_parallel]: 6.04001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.16002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.67999e-06 [overlap_recompute_comm]: 2.45002e-06 [overlap_grad_ring_attention]: 4.99e-06 [overlap_grad_flash_sp]: 2.747e-05 [begin_end_overlap_inline]: 5.89993e-07 [split_matmul_comm_elemetwise]: 2.53e-06 [split_layernorm_comm]: 1.82999e-06 [handle_group_info]: 9.70002e-07 [symbol_engine_optimizer]: 0.0001031, [1] [Cycle 1]: 9.808e-05, [6] [build]: 4.42998e-06 [elim_shapecalc]: 1.579e-05 [elim_not_effective]: 1.959e-05 [opt_reshape]: 1.067e-05 [fold_const_symbol]: 1.604e-05 [renormalize]: 2.30008e-07 [detach_backward]: 2.53e-06 [pipeline_parallel_scheduler]: 1.76998e-06 [auto_monad_reorder]: 2.176e-05 [get_jit_bprop_graph]: 1.84998e-06 [rewriter_after_jit_bprop_graph]: 5.96e-06 [opt_after_jit_grad]: 0.00050363 [validate]: 4.859e-05 Sums bootstrap : 0.000425s : 0.81% type_inference : 0.029664s : 56.56% event_method : 0.000087s : 0.16% auto_monad : 0.000100s : 0.19% graph_reusing : 0.000008s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000042s : 0.08% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000032s : 0.06% optimize.rewriter_before_opt_a : 0.000117s : 0.22% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000162s : 0.31% optimize.opt_a.loop_unroll : 0.000052s : 0.10% optimize.opt_a.a_1 : 0.001142s : 2.18% optimize.opt_a.with_stream_mark : 0.000029s : 0.06% optimize.opt_a.recompute_prepare : 0.000021s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000237s : 0.45% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.04% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000020s : 0.04% optimize.opt_a.merge_send_recv : 0.000017s : 0.03% optimize.opt_a.auto_parallel : 0.000015s : 0.03% optimize.opt_a.parallel : 0.000024s : 0.05% optimize.opt_a.flash_sp : 0.000011s : 0.02% optimize.opt_a.merge_comm : 0.000011s : 0.02% optimize.opt_a.allreduce_fusion : 0.000010s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.04% optimize.opt_a.virtual_dataset : 0.000018s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.03% optimize.opt_a.virtual_output : 0.000017s : 0.03% optimize.opt_a.merge_forward : 0.000008s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000020s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000029s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.02% optimize.opt_a.meta_fg_expand : 0.000008s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000027s : 0.05% optimize.opt_a.a_after_grad : 0.000027s : 0.05% optimize.opt_a.renormalize : 0.000854s : 1.63% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.05% optimize.opt_a.cse : 0.000065s : 0.12% optimize.opt_a.a_3 : 0.000123s : 0.23% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000044s : 0.08% optimize.convert_after_rewriter : 0.000008s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.01% optimize.mutable_eliminate : 0.016647s : 31.74% optimize.opt_b.b_1 : 0.000202s : 0.38% optimize.opt_b.b_2 : 0.000012s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000013s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000054s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000029s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000041s : 0.08% optimize.loop_unroll : 0.000583s : 1.11% optimize.opt_after_cconv.c_1 : 0.000048s : 0.09% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000032s : 0.06% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000050s : 0.10% optimize.tuple_transform.d_1 : 0.000071s : 0.13% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000072s : 0.14% optimize.cse_after_recomputation.cse : 0.000018s : 0.03% optimize.environ_conv : 0.000007s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.01% optimize.bias_add_comm_swap : 0.000004s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000019s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000027s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000504s : 0.96% validate : 0.000049s : 0.09% Time group info: ------[substitution.] 0.000260 54 11.08% : 0.000029s : 6: substitution.cast_eliminate 1.11% : 0.000003s : 4: substitution.elim_not_effective 1.08% : 0.000003s : 4: substitution.fold_const_symbol 3.06% : 0.000008s : 6: substitution.graph_param_transform 67.93% : 0.000176s : 8: substitution.inline 2.01% : 0.000005s : 8: substitution.j_node_and_user_rematch 2.93% : 0.000008s : 8: substitution.remove_not_recompute_node 1.77% : 0.000005s : 4: substitution.replace_old_param 4.06% : 0.000011s : 2: substitution.switch_simplify 4.95% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.029593 2 94.96% : 0.028103s : 1: type_inference.infer 5.04% : 0.001490s : 1: type_inference.specialize ------[replace.] 0.000144 14 34.77% : 0.000050s : 8: replace.inline 49.05% : 0.000071s : 2: replace.switch_simplify 16.18% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000192 14 89.62% : 0.000172s : 8: match.inline 4.77% : 0.000009s : 2: match.switch_simplify 5.61% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000313 1972 0.93% : 0.000003s : 21: predicate.accumulaten_eliminater 0.62% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.50% : 0.000002s : 12: predicate.addn_check_dump 0.89% : 0.000003s : 21: predicate.addn_zero_filter 0.86% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 1.97% : 0.000006s : 33: predicate.arithmetic_simplify 1.11% : 0.000003s : 21: predicate.cast_eliminate 0.60% : 0.000002s : 12: predicate.check_bprop_eliminate 0.51% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.56% : 0.000002s : 12: predicate.depend_value_elim 0.98% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.02% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.97% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.86% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 6: predicate.elim_not_effective 0.49% : 0.000002s : 6: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.13% : 0.000004s : 27: predicate.environ_get_add_eliminate 1.17% : 0.000004s : 27: predicate.environ_get_depend_swap 1.73% : 0.000005s : 39: predicate.environ_get_eliminate 1.14% : 0.000004s : 27: predicate.environ_get_set_eliminate 1.47% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.21% : 0.000007s : 33: predicate.float_depend_g_call 0.54% : 0.000002s : 12: predicate.float_environ_get_switch 0.76% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 6: predicate.fold_const_symbol 0.60% : 0.000002s : 12: predicate.get_grad_eliminate 0.19% : 0.000001s : 6: predicate.graph_param_transform 0.60% : 0.000002s : 12: predicate.incorporate_call 0.53% : 0.000002s : 12: predicate.incorporate_call_switch 6.46% : 0.000020s : 90: predicate.inline 0.91% : 0.000003s : 12: predicate.inline_without_move 0.30% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.74% : 0.000002s : 12: predicate.less_batch_normalization 1.79% : 0.000006s : 37: predicate.list_to_tuple_eliminator_ 2.49% : 0.000008s : 58: predicate.load_eliminater 0.95% : 0.000003s : 6: predicate.loop_unroll_after_grad 2.33% : 0.000007s : 51: predicate.loop_unroll_before_grad 1.87% : 0.000006s : 33: predicate.make_slice_get_slice_eliminator 0.57% : 0.000002s : 12: predicate.merge_addn 0.56% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.56% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.86% : 0.000003s : 21: predicate.minmaximum_grad 2.12% : 0.000007s : 6: predicate.mutable_eliminate 0.37% : 0.000001s : 6: predicate.opt_reshape 0.35% : 0.000001s : 6: predicate.parallel_virtual_node 1.81% : 0.000006s : 33: predicate.partial_defer_inline 1.60% : 0.000005s : 31: predicate.partial_eliminate 0.92% : 0.000003s : 21: predicate.print_const_string_wrapper 0.57% : 0.000002s : 12: predicate.reduce_all_const_elim 1.19% : 0.000004s : 21: predicate.reduce_eliminate 2.50% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 12: predicate.remove_not_recompute_node 1.25% : 0.000004s : 37: predicate.replace_applicator 0.56% : 0.000002s : 12: predicate.replace_old_param 0.34% : 0.000001s : 6: predicate.reset_defer_inline 0.97% : 0.000003s : 21: predicate.reshape_eliminate 0.53% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 6: predicate.row_tensor_eliminate 0.66% : 0.000002s : 12: predicate.same_eliminate 0.39% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.71% : 0.000002s : 12: predicate.shard_identity_eliminate 0.68% : 0.000002s : 12: predicate.special_op_eliminate 0.71% : 0.000002s : 12: predicate.specialize_transform 0.71% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.64% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.60% : 0.000005s : 33: predicate.switch_defer_inline 2.13% : 0.000007s : 45: predicate.switch_layer_defer_inline 5.50% : 0.000017s : 106: predicate.switch_simplify 0.90% : 0.000003s : 21: predicate.tile_eliminate 0.95% : 0.000003s : 21: predicate.transpose_eliminate 1.67% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.58% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 3.06% : 0.000010s : 49: predicate.tuple_list_get_item_eliminator 1.53% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.78% : 0.000006s : 37: predicate.tuple_to_list_eliminator_ 2.41% : 0.000008s : 58: predicate.updatestate_pure_node_eliminater 3.03% : 0.000009s : 70: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 6: predicate.value_based_eliminate 0.59% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.60% : 0.000002s : 12: predicate.virtual_output_eliminate 0.28% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000940 16 55.24% : 0.000519s : 6: func_graph_cloner_run.FuncGraphClonerGraph 44.76% : 0.000421s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.085076 192 0.01% : 0.000004s : 1: ForceFp32Comm 3.61% : 0.003069s : 1: add_attr 3.60% : 0.003059s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000076s : 1: add_recomputation 0.01% : 0.000005s : 1: assign_add_opt 0.13% : 0.000107s : 1: auto_monad 0.03% : 0.000026s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.53% : 0.000451s : 1: bootstrap 0.05% : 0.000045s : 1: cconv 0.05% : 0.000044s : 1: comm_op_add_attrs 0.03% : 0.000022s : 1: control_data_broadcast_order 0.01% : 0.000012s : 1: convert_after_rewriter 0.04% : 0.000032s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.11% : 0.000096s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000012s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000005s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.70% : 0.000592s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 19.59% : 0.016666s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.03% : 0.000022s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000041s : 1: opt.transform.mutable_eliminate 2.20% : 0.001874s : 78: opt.transform.opt_a 0.05% : 0.000047s : 1: opt.transform.opt_after_cconv 0.04% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.21% : 0.000176s : 28: opt.transform.opt_b 0.09% : 0.000078s : 2: opt.transform.opt_trans_graph 0.07% : 0.000058s : 4: opt.transform.symbol_engine_opt 4.28% : 0.003639s : 1: opt_a 0.17% : 0.000149s : 1: opt_after_cconv 0.60% : 0.000512s : 1: opt_after_jit_grad 0.41% : 0.000345s : 1: opt_b 26.35% : 0.022420s : 1: optimize 0.04% : 0.000033s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000031s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000006s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.05% : 0.000047s : 1: pre_auto_parallel 0.04% : 0.000036s : 1: py_interpret_to_execute 0.02% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000056s : 1: remove_dup_value 0.51% : 0.000437s : 1: renormalize.infer 0.48% : 0.000410s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000048s : 1: rewriter_after_opt_a 0.14% : 0.000121s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.12% : 0.000106s : 1: symbol_engine_optimizer 0.13% : 0.000109s : 1: tuple_transform 34.89% : 0.029684s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:19.193.46 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:19.196.12 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0569153, [21] [bootstrap]: 0.00041132 [type_inference]: 0.0290643 [event_method]: 9.478e-05 [auto_monad]: 0.0001016 [graph_reusing]: 7.82e-06 [inline]: 2.49999e-06 [add_attr]: 0.00321645, [1] [add_attr_with_inline]: 0.00320787, [1] [Cycle 1]: 7.952e-05, [2] [tag_attr]: 2.805e-05 [meta_addattr_fg_expand]: 8.32e-06 [parallel-infer-symbol]: 3.21999e-06 [pre_auto_parallel]: 4.332e-05 [insert-virtual-dataset]: 2.31e-06 [parallel-infer-symbol-second]: 8.59989e-07 [dataset_repeat_opt]: 1.84e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.0225395, [53] [py_interpret_to_execute]: 3.586e-05 [rewriter_before_opt_a]: 0.0001196 [opt_a]: 0.00389071, [2] [Cycle 1]: 0.00294771, [45] [expand_dump_flag]: 3.76001e-06 [switch_simplify]: 0.0001226 [loop_unroll]: 4.285e-05 [a_1]: 0.0009644 [with_stream_mark]: 1.614e-05 [recompute_prepare]: 1.013e-05 [updatestate_depend_eliminate]: 4.57e-06 [updatestate_assign_eliminate]: 4.03001e-06 [updatestate_loads_eliminate]: 3.51001e-06 [parameter_eliminate]: 2.19001e-06 [a_2]: 0.00014651 [accelerated_algorithm]: 9.76998e-06 [shard]: 1.68002e-06 [meta_shard_fg_expand]: 2.75002e-06 [shard_inline]: 8.40001e-06 [merge_send_recv]: 9.44998e-06 [auto_parallel]: 7.80998e-06 [parallel]: 1.862e-05 [flash_sp]: 7.9e-06 [merge_comm]: 4.77998e-06 [allreduce_fusion]: 4.17e-06 [matmul_add_comm_reduction]: 9.92001e-06 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 9.86e-06 [virtual_dataset]: 8.42e-06 [get_grad_eliminate_]: 7.65e-06 [virtual_output]: 7.97e-06 [merge_forward]: 4.77e-06 [cell_reuse_recompute_pass]: 1.42999e-06 [offload_activation]: 1.155e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.772e-05 [merge_recompute_call_nodes]: 1.40999e-06 [before_grad]: 1.32e-05 [set_forward_comm_id_for_comm_node_pass]: 4.57e-06 [meta_fg_expand]: 4.1e-06 [flash_sp_send_recv_attached]: 2.58e-06 [receive_attached]: 3.09999e-06 [after_resolve]: 1.34e-05 [a_after_grad]: 1.258e-05 [renormalize]: 0.00088588 [add_forward_monad_depend]: 5.40999e-06 [auto_monad_grad]: 2.01e-06 [auto_monad_eliminator]: 1.798e-05 [cse]: 3.451e-05 [a_3]: 7.234e-05 [Cycle 2]: 0.00092989, [45] [expand_dump_flag]: 1.09998e-06 [switch_simplify]: 9.47999e-06 [loop_unroll]: 7.79002e-06 [a_1]: 0.00018198 [with_stream_mark]: 1.165e-05 [recompute_prepare]: 8.13001e-06 [updatestate_depend_eliminate]: 4.10998e-06 [updatestate_assign_eliminate]: 2.91e-06 [updatestate_loads_eliminate]: 2.79001e-06 [parameter_eliminate]: 1.25999e-06 [a_2]: 0.00012141 [accelerated_algorithm]: 7.77e-06 [shard]: 1.20999e-06 [meta_shard_fg_expand]: 1.69e-06 [shard_inline]: 8.02e-06 [merge_send_recv]: 5.67999e-06 [auto_parallel]: 6.09001e-06 [parallel]: 4.83001e-06 [flash_sp]: 3.86999e-06 [merge_comm]: 4.15e-06 [allreduce_fusion]: 3.86001e-06 [matmul_add_comm_reduction]: 6.58998e-06 [allreduce_slice_to_reducescatter]: 3.9002e-07 [virtual_shard_identity]: 8.38999e-06 [virtual_dataset]: 7.5e-06 [get_grad_eliminate_]: 8.40999e-06 [virtual_output]: 7.82998e-06 [merge_forward]: 3.43999e-06 [cell_reuse_recompute_pass]: 1.84998e-06 [offload_activation]: 7.45998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.653e-05 [merge_recompute_call_nodes]: 7.10017e-07 [before_grad]: 1.165e-05 [set_forward_comm_id_for_comm_node_pass]: 5.02e-06 [meta_fg_expand]: 2.79999e-06 [flash_sp_send_recv_attached]: 1.02e-06 [receive_attached]: 1.00999e-06 [after_resolve]: 1.142e-05 [a_after_grad]: 1.167e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.36998e-06 [auto_monad_grad]: 9.20001e-07 [auto_monad_eliminator]: 8.52e-06 [cse]: 1.764e-05 [a_3]: 5.872e-05 [py_interpret_to_execute_after_opt_a]: 1.277e-05 [slice_cell_reuse_recomputed_activation]: 5.21002e-06 [rewriter_after_opt_a]: 4.418e-05 [convert_after_rewriter]: 1.026e-05 [order_py_execute_after_rewriter]: 8.64003e-06 [mutable_eliminate]: 0.00051562 [opt_b]: 0.00031174, [1] [Cycle 1]: 0.00030304, [7] [b_1]: 0.00020068 [b_2]: 1.071e-05 [updatestate_depend_eliminate]: 6.33e-06 [updatestate_assign_eliminate]: 3.06001e-06 [updatestate_loads_eliminate]: 2.84999e-06 [renormalize]: 4.89992e-07 [cse]: 2.286e-05 [optimize_parallel_all_gather_comm]: 2.08e-05 [overlap_param_gather]: 5.02999e-06 [cconv]: 2.775e-05 [loop_unroll]: 0.0004583 [opt_after_cconv]: 0.0162578, [1] [Cycle 1]: 0.00022624, [7] [c_1]: 6.808e-05 [parameter_eliminate]: 7.00002e-06 [updatestate_depend_eliminate]: 1.29e-05 [updatestate_assign_eliminate]: 3.68e-06 [updatestate_loads_eliminate]: 4.45e-06 [cse]: 4.129e-05 [renormalize]: 8.00006e-07 [remove_dup_value]: 2.195e-05 [tuple_transform]: 0.00011556, [1] [Cycle 1]: 0.00010645, [4] [d_1]: 6.488e-05 [none_parameter_eliminate]: 1.56002e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 9.07999e-06 [partial_unused_args_eliminate]: 4.47e-06 [add_recomputation]: 7.029e-05 [cse_after_recomputation]: 3.384e-05, [1] [Cycle 1]: 2.469e-05, [1] [cse]: 1.506e-05 [environ_conv]: 1.031e-05 [swap_dp_allreduce_reducescatter]: 9.46e-06 [bias_add_comm_swap]: 6.28002e-06 [label_micro_interleaved_index]: 9.36e-06 [label_fine_grained_interleaved_index]: 5.05999e-06 [merge_cast_opt]: 4.60999e-06 [slice_recompute_activation]: 4.68001e-06 [micro_interleaved_order_control]: 4.67998e-06 [assign_add_opt]: 5.07999e-06 [ForceFp32Comm]: 3.08e-06 [remove_cast_before_assign_add]: 3.20002e-06 [full_micro_interleaved_order_control]: 4.42e-06 [reorder_send_recv_between_fp_bp]: 5.55001e-06 [comm_op_add_attrs]: 3.24001e-06 [add_comm_op_reuse_tag]: 3.56999e-06 [interleave_split_concat_branches]: 3.48999e-06 [interleave_parallel_branches]: 4.03999e-06 [overlap_opt_shard_in_pipeline]: 3.83999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.37e-06 [control_data_broadcast_order]: 2.395e-05 [grouped_pairwise_exchange_alltoall]: 4.43999e-06 [offloading_packed_experts]: 7.82e-06 [overlap_recompute_and_grad_model_parallel]: 7.74002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.86001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.88001e-06 [overlap_recompute_comm]: 5.05001e-06 [overlap_grad_ring_attention]: 7.46001e-06 [overlap_grad_flash_sp]: 2.711e-05 [begin_end_overlap_inline]: 2.88998e-06 [split_matmul_comm_elemetwise]: 5.21002e-06 [split_layernorm_comm]: 4.09002e-06 [handle_group_info]: 4.3e-06 [symbol_engine_optimizer]: 0.00011067, [1] [Cycle 1]: 0.00010383, [6] [build]: 4.63001e-06 [elim_shapecalc]: 1.305e-05 [elim_not_effective]: 1.627e-05 [opt_reshape]: 9.66e-06 [fold_const_symbol]: 1.417e-05 [renormalize]: 2.40019e-07 [detach_backward]: 4.37998e-06 [pipeline_parallel_scheduler]: 2.03002e-06 [auto_monad_reorder]: 2.42e-05 [get_jit_bprop_graph]: 1.67999e-06 [rewriter_after_jit_bprop_graph]: 6.05002e-06 [opt_after_jit_grad]: 0.00065853 [validate]: 4.492e-05 Sums bootstrap : 0.000411s : 1.15% type_inference : 0.029064s : 81.27% event_method : 0.000095s : 0.27% auto_monad : 0.000102s : 0.28% graph_reusing : 0.000008s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000043s : 0.12% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000036s : 0.10% optimize.rewriter_before_opt_a : 0.000120s : 0.33% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000132s : 0.37% optimize.opt_a.loop_unroll : 0.000051s : 0.14% optimize.opt_a.a_1 : 0.001146s : 3.21% optimize.opt_a.with_stream_mark : 0.000028s : 0.08% optimize.opt_a.recompute_prepare : 0.000018s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000268s : 0.75% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.05% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.05% optimize.opt_a.merge_send_recv : 0.000015s : 0.04% optimize.opt_a.auto_parallel : 0.000014s : 0.04% optimize.opt_a.parallel : 0.000023s : 0.07% optimize.opt_a.flash_sp : 0.000012s : 0.03% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000008s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.05% optimize.opt_a.virtual_dataset : 0.000016s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.04% optimize.opt_a.virtual_output : 0.000016s : 0.04% optimize.opt_a.merge_forward : 0.000008s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.07% optimize.opt_a.a_after_grad : 0.000024s : 0.07% optimize.opt_a.renormalize : 0.000886s : 2.48% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.07% optimize.opt_a.cse : 0.000052s : 0.15% optimize.opt_a.a_3 : 0.000131s : 0.37% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000044s : 0.12% optimize.convert_after_rewriter : 0.000010s : 0.03% optimize.order_py_execute_after_rewriter : 0.000009s : 0.02% optimize.mutable_eliminate : 0.000516s : 1.44% optimize.opt_b.b_1 : 0.000201s : 0.56% optimize.opt_b.b_2 : 0.000011s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.06% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000028s : 0.08% optimize.loop_unroll : 0.000458s : 1.28% optimize.opt_after_cconv.c_1 : 0.000068s : 0.19% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000013s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000041s : 0.12% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000022s : 0.06% optimize.tuple_transform.d_1 : 0.000065s : 0.18% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000004s : 0.01% optimize.add_recomputation : 0.000070s : 0.20% optimize.cse_after_recomputation.cse : 0.000015s : 0.04% optimize.environ_conv : 0.000010s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000006s : 0.02% optimize.label_micro_interleaved_index : 0.000009s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000005s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000005s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000004s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.02% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000003s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000024s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000027s : 0.08% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.07% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.02% opt_after_jit_grad : 0.000659s : 1.84% validate : 0.000045s : 0.13% Time group info: ------[substitution.] 0.000252 44 8.53% : 0.000022s : 3: substitution.cast_eliminate 1.00% : 0.000003s : 3: substitution.elim_not_effective 1.06% : 0.000003s : 3: substitution.fold_const_symbol 3.37% : 0.000008s : 5: substitution.graph_param_transform 69.91% : 0.000176s : 8: substitution.inline 1.63% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.29% : 0.000006s : 6: substitution.remove_not_recompute_node 1.94% : 0.000005s : 4: substitution.replace_old_param 3.95% : 0.000010s : 2: substitution.switch_simplify 6.31% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.029001 2 95.21% : 0.027611s : 1: type_inference.infer 4.79% : 0.001390s : 1: type_inference.specialize ------[replace.] 0.000121 14 42.91% : 0.000052s : 8: replace.inline 35.80% : 0.000043s : 2: replace.switch_simplify 21.30% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000194 14 88.43% : 0.000172s : 8: match.inline 4.50% : 0.000009s : 2: match.switch_simplify 7.07% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000311 1838 0.91% : 0.000003s : 21: predicate.accumulaten_eliminater 0.68% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.44% : 0.000001s : 10: predicate.addn_check_dump 0.97% : 0.000003s : 21: predicate.addn_zero_filter 0.90% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 1.99% : 0.000006s : 31: predicate.arithmetic_simplify 1.02% : 0.000003s : 21: predicate.cast_eliminate 0.47% : 0.000001s : 10: predicate.check_bprop_eliminate 0.45% : 0.000001s : 10: predicate.compare_switch_simplify 0.15% : 0.000000s : 5: predicate.const_output_eliminate 0.49% : 0.000002s : 10: predicate.depend_value_elim 0.99% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.08% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.94% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.78% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.17% : 0.000001s : 5: predicate.elim_not_effective 0.34% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000003s : 26: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.13% : 0.000004s : 26: predicate.environ_get_depend_swap 1.62% : 0.000005s : 36: predicate.environ_get_eliminate 1.08% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.53% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.50% : 0.000008s : 33: predicate.float_depend_g_call 0.47% : 0.000001s : 10: predicate.float_environ_get_switch 0.67% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 5: predicate.fold_const_symbol 0.55% : 0.000002s : 10: predicate.get_grad_eliminate 0.30% : 0.000001s : 5: predicate.graph_param_transform 0.51% : 0.000002s : 10: predicate.incorporate_call 0.42% : 0.000001s : 10: predicate.incorporate_call_switch 7.12% : 0.000022s : 84: predicate.inline 0.73% : 0.000002s : 10: predicate.inline_without_move 0.30% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.73% : 0.000002s : 10: predicate.less_batch_normalization 1.72% : 0.000005s : 35: predicate.list_to_tuple_eliminator_ 2.47% : 0.000008s : 56: predicate.load_eliminater 0.74% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.30% : 0.000007s : 49: predicate.loop_unroll_before_grad 1.86% : 0.000006s : 31: predicate.make_slice_get_slice_eliminator 0.51% : 0.000002s : 10: predicate.merge_addn 0.50% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.46% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.89% : 0.000003s : 21: predicate.minmaximum_grad 0.78% : 0.000002s : 5: predicate.mutable_eliminate 0.29% : 0.000001s : 5: predicate.opt_reshape 0.28% : 0.000001s : 5: predicate.parallel_virtual_node 1.95% : 0.000006s : 33: predicate.partial_defer_inline 1.61% : 0.000005s : 30: predicate.partial_eliminate 0.92% : 0.000003s : 21: predicate.print_const_string_wrapper 0.48% : 0.000001s : 10: predicate.reduce_all_const_elim 1.24% : 0.000004s : 21: predicate.reduce_eliminate 2.38% : 0.000007s : 56: predicate.redundant_stop_gradient_eliminater 0.31% : 0.000001s : 10: predicate.remove_not_recompute_node 1.23% : 0.000004s : 35: predicate.replace_applicator 0.34% : 0.000001s : 10: predicate.replace_old_param 0.21% : 0.000001s : 5: predicate.reset_defer_inline 0.99% : 0.000003s : 21: predicate.reshape_eliminate 0.50% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.29% : 0.000001s : 5: predicate.row_tensor_eliminate 0.57% : 0.000002s : 10: predicate.same_eliminate 0.35% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.57% : 0.000002s : 10: predicate.shard_identity_eliminate 0.56% : 0.000002s : 10: predicate.special_op_eliminate 0.64% : 0.000002s : 10: predicate.specialize_transform 0.67% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.60% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.77% : 0.000005s : 33: predicate.switch_defer_inline 2.06% : 0.000006s : 43: predicate.switch_layer_defer_inline 5.18% : 0.000016s : 101: predicate.switch_simplify 0.94% : 0.000003s : 21: predicate.tile_eliminate 1.01% : 0.000003s : 21: predicate.transpose_eliminate 1.51% : 0.000005s : 31: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000005s : 31: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000005s : 31: predicate.tuple_list_get_item_depend_reorder 6.99% : 0.000022s : 45: predicate.tuple_list_get_item_eliminator 1.47% : 0.000005s : 31: predicate.tuple_list_get_set_item_eliminator 2.13% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.67% : 0.000005s : 35: predicate.tuple_to_list_eliminator_ 2.40% : 0.000007s : 56: predicate.updatestate_pure_node_eliminater 3.52% : 0.000011s : 66: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 5: predicate.value_based_eliminate 0.52% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.52% : 0.000002s : 10: predicate.virtual_output_eliminate 0.24% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000955 16 55.94% : 0.000534s : 6: func_graph_cloner_run.FuncGraphClonerGraph 44.06% : 0.000421s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.085542 192 0.01% : 0.000006s : 1: ForceFp32Comm 3.77% : 0.003225s : 1: add_attr 3.75% : 0.003212s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.09% : 0.000074s : 1: add_recomputation 0.01% : 0.000008s : 1: assign_add_opt 0.13% : 0.000113s : 1: auto_monad 0.04% : 0.000031s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.53% : 0.000452s : 1: bootstrap 0.04% : 0.000031s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.03% : 0.000028s : 1: control_data_broadcast_order 0.02% : 0.000013s : 1: convert_after_rewriter 0.04% : 0.000037s : 1: cse_after_recomputation 0.01% : 0.000007s : 1: dataset_repeat_opt 0.03% : 0.000023s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.13% : 0.000110s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000015s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000008s : 1: insert-virtual-dataset 0.01% : 0.000008s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000012s : 1: label_micro_interleaved_index 0.54% : 0.000465s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 0.61% : 0.000522s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000017s : 1: opt.transform.mutable_eliminate 2.08% : 0.001779s : 78: opt.transform.opt_a 0.07% : 0.000063s : 1: opt.transform.opt_after_cconv 0.03% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.16% : 0.000139s : 28: opt.transform.opt_b 0.08% : 0.000071s : 2: opt.transform.opt_trans_graph 0.06% : 0.000049s : 4: opt.transform.symbol_engine_opt 4.55% : 0.003894s : 1: opt_a 19.01% : 0.016262s : 1: opt_after_cconv 0.78% : 0.000669s : 1: opt_after_jit_grad 0.37% : 0.000315s : 1: opt_b 26.85% : 0.022965s : 1: optimize 0.03% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000012s : 1: order_py_execute_after_rewriter 0.04% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.06% : 0.000051s : 1: pre_auto_parallel 0.05% : 0.000040s : 1: py_interpret_to_execute 0.02% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000025s : 1: remove_dup_value 0.53% : 0.000452s : 1: renormalize.infer 0.50% : 0.000425s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000048s : 1: rewriter_after_opt_a 0.14% : 0.000124s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.000114s : 1: symbol_engine_optimizer 0.14% : 0.000118s : 1: tuple_transform 34.02% : 0.029104s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:19.490.348 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0399121, [21] [bootstrap]: 0.00044959 [type_inference]: 0.0296729 [event_method]: 9.271e-05 [auto_monad]: 9.659e-05 [graph_reusing]: 7.56999e-06 [inline]: 2.54001e-06 [add_attr]: 0.00320292, [1] [add_attr_with_inline]: 0.00319394, [1] [Cycle 1]: 6.994e-05, [2] [tag_attr]: 2.699e-05 [meta_addattr_fg_expand]: 8.03999e-06 [parallel-infer-symbol]: 3.04999e-06 [pre_auto_parallel]: 4.253e-05 [insert-virtual-dataset]: 2.39001e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 2.09e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00563202, [53] [py_interpret_to_execute]: 3.278e-05 [rewriter_before_opt_a]: 0.00011702 [opt_a]: 0.00350056, [2] [Cycle 1]: 0.0027182, [45] [expand_dump_flag]: 4.37998e-06 [switch_simplify]: 0.00012603 [loop_unroll]: 4.26e-05 [a_1]: 0.00096435 [with_stream_mark]: 1.555e-05 [recompute_prepare]: 1.015e-05 [updatestate_depend_eliminate]: 4.60999e-06 [updatestate_assign_eliminate]: 3.66001e-06 [updatestate_loads_eliminate]: 3.74002e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 0.00010435 [accelerated_algorithm]: 8.60999e-06 [shard]: 1.79e-06 [meta_shard_fg_expand]: 2.19001e-06 [shard_inline]: 8.07e-06 [merge_send_recv]: 9.67001e-06 [auto_parallel]: 6.73998e-06 [parallel]: 1.849e-05 [flash_sp]: 7.92e-06 [merge_comm]: 4.77e-06 [allreduce_fusion]: 4.68001e-06 [matmul_add_comm_reduction]: 9.86e-06 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 9.96998e-06 [virtual_dataset]: 7.88999e-06 [get_grad_eliminate_]: 7.55e-06 [virtual_output]: 7.6e-06 [merge_forward]: 4.48999e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 1.077e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.486e-05 [merge_recompute_call_nodes]: 1.62001e-06 [before_grad]: 1.283e-05 [set_forward_comm_id_for_comm_node_pass]: 4.44998e-06 [meta_fg_expand]: 4.21001e-06 [flash_sp_send_recv_attached]: 2.27999e-06 [receive_attached]: 2.34001e-06 [after_resolve]: 1.322e-05 [a_after_grad]: 1.223e-05 [renormalize]: 0.00086879 [add_forward_monad_depend]: 5.45001e-06 [auto_monad_grad]: 1.79e-06 [auto_monad_eliminator]: 1.683e-05 [cse]: 3.49e-05 [a_3]: 5.892e-05 [Cycle 2]: 0.00077237, [45] [expand_dump_flag]: 1.05001e-06 [switch_simplify]: 9.19e-06 [loop_unroll]: 7.75e-06 [a_1]: 0.00018036 [with_stream_mark]: 1.124e-05 [recompute_prepare]: 7.75e-06 [updatestate_depend_eliminate]: 3.97e-06 [updatestate_assign_eliminate]: 2.93e-06 [updatestate_loads_eliminate]: 2.79001e-06 [parameter_eliminate]: 1.09e-06 [a_2]: 0.00010547 [accelerated_algorithm]: 8.40001e-06 [shard]: 1.52999e-06 [meta_shard_fg_expand]: 1.72001e-06 [shard_inline]: 7.85e-06 [merge_send_recv]: 6.09999e-06 [auto_parallel]: 6.29999e-06 [parallel]: 4.99e-06 [flash_sp]: 3.16999e-06 [merge_comm]: 3.9e-06 [allreduce_fusion]: 3.85e-06 [matmul_add_comm_reduction]: 6.56e-06 [allreduce_slice_to_reducescatter]: 3.89991e-07 [virtual_shard_identity]: 8.43999e-06 [virtual_dataset]: 7.25e-06 [get_grad_eliminate_]: 7.12997e-06 [virtual_output]: 6.89999e-06 [merge_forward]: 3.3e-06 [cell_reuse_recompute_pass]: 1.76998e-06 [offload_activation]: 6.96999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.471e-05 [merge_recompute_call_nodes]: 7.39994e-07 [before_grad]: 1.146e-05 [set_forward_comm_id_for_comm_node_pass]: 4.67998e-06 [meta_fg_expand]: 2.69001e-06 [flash_sp_send_recv_attached]: 9.09989e-07 [receive_attached]: 1.05001e-06 [after_resolve]: 1.175e-05 [a_after_grad]: 1.109e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.45999e-06 [auto_monad_grad]: 9.20001e-07 [auto_monad_eliminator]: 8.47e-06 [cse]: 1.76e-05 [a_3]: 4.739e-05 [py_interpret_to_execute_after_opt_a]: 9.57001e-06 [slice_cell_reuse_recomputed_activation]: 1.82001e-06 [rewriter_after_opt_a]: 3.932e-05 [convert_after_rewriter]: 7.13e-06 [order_py_execute_after_rewriter]: 5.92999e-06 [mutable_eliminate]: 0.00047789 [opt_b]: 0.00024118, [1] [Cycle 1]: 0.00023531, [7] [b_1]: 0.00015699 [b_2]: 9.42001e-06 [updatestate_depend_eliminate]: 5.81e-06 [updatestate_assign_eliminate]: 2.81999e-06 [updatestate_loads_eliminate]: 2.80002e-06 [renormalize]: 4.19997e-07 [cse]: 2.212e-05 [optimize_parallel_all_gather_comm]: 1.637e-05 [overlap_param_gather]: 2.21e-06 [cconv]: 2.378e-05 [loop_unroll]: 0.00043338 [opt_after_cconv]: 0.00011464, [1] [Cycle 1]: 0.00010911, [7] [c_1]: 3.897e-05 [parameter_eliminate]: 2.46e-06 [updatestate_depend_eliminate]: 5.34e-06 [updatestate_assign_eliminate]: 3.46999e-06 [updatestate_loads_eliminate]: 2.73e-06 [cse]: 2.103e-05 [renormalize]: 4.40021e-07 [remove_dup_value]: 1.571e-05 [tuple_transform]: 8.619e-05, [1] [Cycle 1]: 8.126e-05, [4] [d_1]: 5.361e-05 [none_parameter_eliminate]: 1.50999e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.03999e-06 [partial_unused_args_eliminate]: 2.07001e-06 [add_recomputation]: 5.441e-05 [cse_after_recomputation]: 2.427e-05, [1] [Cycle 1]: 1.961e-05, [1] [cse]: 1.421e-05 [environ_conv]: 6.94001e-06 [swap_dp_allreduce_reducescatter]: 6.22001e-06 [bias_add_comm_swap]: 2.63e-06 [label_micro_interleaved_index]: 4.56002e-06 [label_fine_grained_interleaved_index]: 2.94001e-06 [merge_cast_opt]: 1.22e-06 [slice_recompute_activation]: 2.16e-06 [micro_interleaved_order_control]: 2.59001e-06 [assign_add_opt]: 1.19998e-06 [ForceFp32Comm]: 1.40001e-06 [remove_cast_before_assign_add]: 9.90025e-07 [full_micro_interleaved_order_control]: 2.24999e-06 [reorder_send_recv_between_fp_bp]: 2.93e-06 [comm_op_add_attrs]: 9.30013e-07 [add_comm_op_reuse_tag]: 9.30013e-07 [interleave_split_concat_branches]: 1.28002e-06 [interleave_parallel_branches]: 1.13001e-06 [overlap_opt_shard_in_pipeline]: 1.24998e-06 [overlap_opt_shard_grad_in_pipeline]: 2.17999e-06 [control_data_broadcast_order]: 1.605e-05 [grouped_pairwise_exchange_alltoall]: 1.85001e-06 [offloading_packed_experts]: 4.11001e-06 [overlap_recompute_and_grad_model_parallel]: 4.76002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.57001e-06 [overlap_grad_ring_attention]: 4.2e-06 [overlap_grad_flash_sp]: 2.157e-05 [begin_end_overlap_inline]: 8.00006e-07 [split_matmul_comm_elemetwise]: 2.26e-06 [split_layernorm_comm]: 1.57001e-06 [handle_group_info]: 1.07998e-06 [symbol_engine_optimizer]: 8.617e-05, [1] [Cycle 1]: 8.17e-05, [6] [build]: 3.48e-06 [elim_shapecalc]: 1.155e-05 [elim_not_effective]: 1.61e-05 [opt_reshape]: 8.67e-06 [fold_const_symbol]: 1.287e-05 [renormalize]: 2.30008e-07 [detach_backward]: 1.47001e-06 [pipeline_parallel_scheduler]: 1.55999e-06 [auto_monad_reorder]: 1.98e-05 [get_jit_bprop_graph]: 1.24e-06 [rewriter_after_jit_bprop_graph]: 3.55e-06 [opt_after_jit_grad]: 0.00047552 [validate]: 3.851e-05 Sums bootstrap : 0.000450s : 1.26% type_inference : 0.029673s : 83.03% event_method : 0.000093s : 0.26% auto_monad : 0.000097s : 0.27% graph_reusing : 0.000008s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000043s : 0.12% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000033s : 0.09% optimize.rewriter_before_opt_a : 0.000117s : 0.33% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000135s : 0.38% optimize.opt_a.loop_unroll : 0.000050s : 0.14% optimize.opt_a.a_1 : 0.001145s : 3.20% optimize.opt_a.with_stream_mark : 0.000027s : 0.07% optimize.opt_a.recompute_prepare : 0.000018s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000210s : 0.59% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.05% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.04% optimize.opt_a.merge_send_recv : 0.000016s : 0.04% optimize.opt_a.auto_parallel : 0.000013s : 0.04% optimize.opt_a.parallel : 0.000023s : 0.07% optimize.opt_a.flash_sp : 0.000011s : 0.03% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.05% optimize.opt_a.virtual_dataset : 0.000015s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000014s : 0.04% optimize.opt_a.merge_forward : 0.000008s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000018s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000024s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.07% optimize.opt_a.a_after_grad : 0.000023s : 0.07% optimize.opt_a.renormalize : 0.000869s : 2.43% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.07% optimize.opt_a.cse : 0.000052s : 0.15% optimize.opt_a.a_3 : 0.000106s : 0.30% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000039s : 0.11% optimize.convert_after_rewriter : 0.000007s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000478s : 1.34% optimize.opt_b.b_1 : 0.000157s : 0.44% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.05% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000024s : 0.07% optimize.loop_unroll : 0.000433s : 1.21% optimize.opt_after_cconv.c_1 : 0.000039s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000021s : 0.06% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.04% optimize.tuple_transform.d_1 : 0.000054s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000054s : 0.15% optimize.cse_after_recomputation.cse : 0.000014s : 0.04% optimize.environ_conv : 0.000007s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000022s : 0.06% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000001s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.06% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000476s : 1.33% validate : 0.000039s : 0.11% Time group info: ------[substitution.] 0.000247 44 7.76% : 0.000019s : 3: substitution.cast_eliminate 1.03% : 0.000003s : 3: substitution.elim_not_effective 0.73% : 0.000002s : 3: substitution.fold_const_symbol 2.87% : 0.000007s : 5: substitution.graph_param_transform 71.52% : 0.000176s : 8: substitution.inline 1.72% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.30% : 0.000006s : 6: substitution.remove_not_recompute_node 1.78% : 0.000004s : 4: substitution.replace_old_param 3.84% : 0.000009s : 2: substitution.switch_simplify 6.44% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.029603 2 95.38% : 0.028235s : 1: type_inference.infer 4.62% : 0.001368s : 1: type_inference.specialize ------[replace.] 0.000121 14 43.17% : 0.000052s : 8: replace.inline 36.53% : 0.000044s : 2: replace.switch_simplify 20.31% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000194 14 88.67% : 0.000172s : 8: match.inline 4.25% : 0.000008s : 2: match.switch_simplify 7.08% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000291 1838 1.01% : 0.000003s : 21: predicate.accumulaten_eliminater 0.68% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 10: predicate.addn_check_dump 0.99% : 0.000003s : 21: predicate.addn_zero_filter 0.96% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.04% : 0.000006s : 31: predicate.arithmetic_simplify 1.10% : 0.000003s : 21: predicate.cast_eliminate 0.49% : 0.000001s : 10: predicate.check_bprop_eliminate 0.48% : 0.000001s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.50% : 0.000001s : 10: predicate.depend_value_elim 1.08% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.13% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.98% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.78% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 5: predicate.elim_not_effective 0.31% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 26: predicate.environ_add_const_eliminate 1.18% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.19% : 0.000003s : 26: predicate.environ_get_depend_swap 1.71% : 0.000005s : 36: predicate.environ_get_eliminate 1.22% : 0.000004s : 26: predicate.environ_get_set_eliminate 1.61% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.59% : 0.000008s : 33: predicate.float_depend_g_call 0.45% : 0.000001s : 10: predicate.float_environ_get_switch 0.65% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 5: predicate.fold_const_symbol 0.56% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.55% : 0.000002s : 10: predicate.incorporate_call 0.47% : 0.000001s : 10: predicate.incorporate_call_switch 6.42% : 0.000019s : 84: predicate.inline 0.68% : 0.000002s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.78% : 0.000002s : 10: predicate.less_batch_normalization 1.82% : 0.000005s : 35: predicate.list_to_tuple_eliminator_ 2.71% : 0.000008s : 56: predicate.load_eliminater 0.65% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.44% : 0.000007s : 49: predicate.loop_unroll_before_grad 1.56% : 0.000005s : 31: predicate.make_slice_get_slice_eliminator 0.53% : 0.000002s : 10: predicate.merge_addn 0.62% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.51% : 0.000001s : 10: predicate.mini_step_allgather_replace 1.00% : 0.000003s : 21: predicate.minmaximum_grad 0.77% : 0.000002s : 5: predicate.mutable_eliminate 0.31% : 0.000001s : 5: predicate.opt_reshape 0.27% : 0.000001s : 5: predicate.parallel_virtual_node 2.11% : 0.000006s : 33: predicate.partial_defer_inline 1.69% : 0.000005s : 30: predicate.partial_eliminate 1.03% : 0.000003s : 21: predicate.print_const_string_wrapper 0.50% : 0.000001s : 10: predicate.reduce_all_const_elim 1.37% : 0.000004s : 21: predicate.reduce_eliminate 2.60% : 0.000008s : 56: predicate.redundant_stop_gradient_eliminater 0.32% : 0.000001s : 10: predicate.remove_not_recompute_node 1.32% : 0.000004s : 35: predicate.replace_applicator 0.47% : 0.000001s : 10: predicate.replace_old_param 0.23% : 0.000001s : 5: predicate.reset_defer_inline 1.05% : 0.000003s : 21: predicate.reshape_eliminate 0.53% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 5: predicate.row_tensor_eliminate 0.59% : 0.000002s : 10: predicate.same_eliminate 0.35% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.63% : 0.000002s : 10: predicate.shard_identity_eliminate 0.69% : 0.000002s : 10: predicate.special_op_eliminate 0.67% : 0.000002s : 10: predicate.specialize_transform 0.67% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.62% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.77% : 0.000005s : 33: predicate.switch_defer_inline 2.23% : 0.000006s : 43: predicate.switch_layer_defer_inline 5.75% : 0.000017s : 101: predicate.switch_simplify 1.00% : 0.000003s : 21: predicate.tile_eliminate 1.05% : 0.000003s : 21: predicate.transpose_eliminate 1.62% : 0.000005s : 31: predicate.tuple_list_convert_item_index_to_positive 1.77% : 0.000005s : 31: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000005s : 31: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.60% : 0.000005s : 31: predicate.tuple_list_get_set_item_eliminator 2.39% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.79% : 0.000005s : 35: predicate.tuple_to_list_eliminator_ 2.68% : 0.000008s : 56: predicate.updatestate_pure_node_eliminater 3.17% : 0.000009s : 66: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 5: predicate.value_based_eliminate 0.60% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.51% : 0.000001s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.32% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000993 16 58.49% : 0.000581s : 6: func_graph_cloner_run.FuncGraphClonerGraph 41.51% : 0.000412s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.051595 192 0.01% : 0.000004s : 1: ForceFp32Comm 6.22% : 0.003208s : 1: add_attr 6.20% : 0.003198s : 1: add_attr_with_inline 0.04% : 0.000020s : 1: add_comm_op_reuse_tag 0.11% : 0.000058s : 1: add_recomputation 0.01% : 0.000005s : 1: assign_add_opt 0.20% : 0.000104s : 1: auto_monad 0.05% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.93% : 0.000482s : 1: bootstrap 0.05% : 0.000027s : 1: cconv 0.01% : 0.000003s : 1: comm_op_add_attrs 0.04% : 0.000019s : 1: control_data_broadcast_order 0.02% : 0.000010s : 1: convert_after_rewriter 0.05% : 0.000027s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000010s : 1: environ_conv 0.20% : 0.000102s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000005s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.86% : 0.000441s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.94% : 0.000486s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000016s : 1: opt.transform.mutable_eliminate 3.43% : 0.001769s : 78: opt.transform.opt_a 0.07% : 0.000038s : 1: opt.transform.opt_after_cconv 0.06% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.26% : 0.000134s : 28: opt.transform.opt_b 0.12% : 0.000060s : 2: opt.transform.opt_trans_graph 0.09% : 0.000045s : 4: opt.transform.symbol_engine_opt 6.79% : 0.003504s : 1: opt_a 0.23% : 0.000118s : 1: opt_after_cconv 0.94% : 0.000484s : 1: opt_after_jit_grad 0.47% : 0.000245s : 1: opt_b 10.92% : 0.005636s : 1: optimize 0.04% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.05% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.09% : 0.000047s : 1: pre_auto_parallel 0.07% : 0.000037s : 1: py_interpret_to_execute 0.02% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000003s : 1: remove_cast_before_assign_add 0.04% : 0.000019s : 1: remove_dup_value 0.87% : 0.000451s : 1: renormalize.infer 0.79% : 0.000409s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000043s : 1: rewriter_after_opt_a 0.24% : 0.000121s : 1: rewriter_before_opt_a 0.01% : 0.000004s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.17% : 0.000089s : 1: symbol_engine_optimizer 0.17% : 0.000089s : 1: tuple_transform 57.55% : 0.029693s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:19.999.180 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:19.999.450 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0421744, [21] [bootstrap]: 0.00045601 [type_inference]: 0.0141103 [event_method]: 5.467e-05 [auto_monad]: 9.913e-05 [graph_reusing]: 7.45e-06 [inline]: 2.67001e-06 [add_attr]: 0.0196169, [1] [add_attr_with_inline]: 0.0196057, [1] [Cycle 1]: 0.00010386, [2] [tag_attr]: 3.287e-05 [meta_addattr_fg_expand]: 8.99e-06 [parallel-infer-symbol]: 3.83001e-06 [pre_auto_parallel]: 4.941e-05 [insert-virtual-dataset]: 2.37001e-06 [parallel-infer-symbol-second]: 9.79984e-07 [dataset_repeat_opt]: 1.92001e-06 [pipeline_split]: 1.92999e-06 [optimize]: 0.00647421, [53] [py_interpret_to_execute]: 4.187e-05 [rewriter_before_opt_a]: 0.00013121 [opt_a]: 0.00389175, [2] [Cycle 1]: 0.00303358, [45] [expand_dump_flag]: 4.45e-06 [switch_simplify]: 0.00013093 [loop_unroll]: 4.222e-05 [a_1]: 0.00091881 [with_stream_mark]: 1.991e-05 [recompute_prepare]: 9.39998e-06 [updatestate_depend_eliminate]: 5.24e-06 [updatestate_assign_eliminate]: 3.43999e-06 [updatestate_loads_eliminate]: 3.08998e-06 [parameter_eliminate]: 2.16003e-06 [a_2]: 0.00011646 [accelerated_algorithm]: 7.39002e-06 [shard]: 2.01998e-06 [meta_shard_fg_expand]: 2.74999e-06 [shard_inline]: 7.28e-06 [merge_send_recv]: 8.73001e-06 [auto_parallel]: 7.84002e-06 [parallel]: 2.115e-05 [flash_sp]: 8.95999e-06 [merge_comm]: 4.05e-06 [allreduce_fusion]: 3.66001e-06 [matmul_add_comm_reduction]: 9.59999e-06 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 9.27999e-06 [virtual_dataset]: 6.94001e-06 [get_grad_eliminate_]: 6.96001e-06 [virtual_output]: 6.53e-06 [merge_forward]: 3.88999e-06 [cell_reuse_recompute_pass]: 1.37999e-06 [offload_activation]: 1.033e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.542e-05 [merge_recompute_call_nodes]: 1.74e-06 [before_grad]: 1.14e-05 [set_forward_comm_id_for_comm_node_pass]: 3.61001e-06 [meta_fg_expand]: 3.50998e-06 [flash_sp_send_recv_attached]: 2.75997e-06 [receive_attached]: 2.19001e-06 [after_resolve]: 1.252e-05 [a_after_grad]: 1.138e-05 [renormalize]: 0.00105354 [add_forward_monad_depend]: 5.92001e-06 [auto_monad_grad]: 2.39999e-06 [auto_monad_eliminator]: 1.775e-05 [cse]: 2.963e-05 [a_3]: 6.636e-05 [Cycle 2]: 0.00084265, [45] [expand_dump_flag]: 2.06998e-06 [switch_simplify]: 8.36002e-06 [loop_unroll]: 6.46e-06 [a_1]: 0.00014354 [with_stream_mark]: 1.275e-05 [recompute_prepare]: 7.1e-06 [updatestate_depend_eliminate]: 3.04001e-06 [updatestate_assign_eliminate]: 2.49999e-06 [updatestate_loads_eliminate]: 2.66999e-06 [parameter_eliminate]: 1.09e-06 [a_2]: 0.00010362 [accelerated_algorithm]: 6.43e-06 [shard]: 1.24e-06 [meta_shard_fg_expand]: 1.67001e-06 [shard_inline]: 6.71999e-06 [merge_send_recv]: 5.27001e-06 [auto_parallel]: 6.24001e-06 [parallel]: 5.19e-06 [flash_sp]: 3.6e-06 [merge_comm]: 3.41999e-06 [allreduce_fusion]: 3.08e-06 [matmul_add_comm_reduction]: 6.34001e-06 [allreduce_slice_to_reducescatter]: 5.19998e-07 [virtual_shard_identity]: 7.25998e-06 [virtual_dataset]: 6.29999e-06 [get_grad_eliminate_]: 6.18002e-06 [virtual_output]: 6.03002e-06 [merge_forward]: 2.84001e-06 [cell_reuse_recompute_pass]: 2.03002e-06 [offload_activation]: 7.51999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.601e-05 [merge_recompute_call_nodes]: 7.00005e-07 [before_grad]: 1.018e-05 [set_forward_comm_id_for_comm_node_pass]: 4.13001e-06 [meta_fg_expand]: 2.41e-06 [flash_sp_send_recv_attached]: 9.60019e-07 [receive_attached]: 1.54e-06 [after_resolve]: 1.184e-05 [a_after_grad]: 9.88998e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.33002e-06 [auto_monad_grad]: 8.59989e-07 [auto_monad_eliminator]: 8.2e-06 [cse]: 1.381e-05 [a_3]: 5.103e-05 [py_interpret_to_execute_after_opt_a]: 1.471e-05 [slice_cell_reuse_recomputed_activation]: 4.51002e-06 [rewriter_after_opt_a]: 4.159e-05 [convert_after_rewriter]: 9.59e-06 [order_py_execute_after_rewriter]: 9.24e-06 [mutable_eliminate]: 0.00062719 [opt_b]: 0.00029781, [1] [Cycle 1]: 0.00028711, [7] [b_1]: 0.00019357 [b_2]: 8.35999e-06 [updatestate_depend_eliminate]: 5.75001e-06 [updatestate_assign_eliminate]: 2.46998e-06 [updatestate_loads_eliminate]: 2.17001e-06 [renormalize]: 5.50004e-07 [cse]: 1.74e-05 [optimize_parallel_all_gather_comm]: 1.92e-05 [overlap_param_gather]: 5.48002e-06 [cconv]: 2.943e-05 [loop_unroll]: 0.00048893 [opt_after_cconv]: 0.00012926, [1] [Cycle 1]: 0.00012083, [7] [c_1]: 3.426e-05 [parameter_eliminate]: 2.81e-06 [updatestate_depend_eliminate]: 5.17999e-06 [updatestate_assign_eliminate]: 2.58e-06 [updatestate_loads_eliminate]: 2.39001e-06 [cse]: 1.67e-05 [renormalize]: 3.29979e-07 [remove_dup_value]: 1.707e-05 [tuple_transform]: 9.483e-05, [1] [Cycle 1]: 8.719e-05, [4] [d_1]: 4.803e-05 [none_parameter_eliminate]: 1.79998e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.98e-06 [partial_unused_args_eliminate]: 4.43999e-06 [add_recomputation]: 5.128e-05 [cse_after_recomputation]: 2.814e-05, [1] [Cycle 1]: 2.099e-05, [1] [cse]: 1.161e-05 [environ_conv]: 8.46002e-06 [swap_dp_allreduce_reducescatter]: 7.81001e-06 [bias_add_comm_swap]: 5.35999e-06 [label_micro_interleaved_index]: 7.60998e-06 [label_fine_grained_interleaved_index]: 6.05002e-06 [merge_cast_opt]: 3.91001e-06 [slice_recompute_activation]: 5.51002e-06 [micro_interleaved_order_control]: 5.24e-06 [assign_add_opt]: 3.78001e-06 [ForceFp32Comm]: 4.36002e-06 [remove_cast_before_assign_add]: 3.56999e-06 [full_micro_interleaved_order_control]: 4.60001e-06 [reorder_send_recv_between_fp_bp]: 5.47001e-06 [comm_op_add_attrs]: 3.32002e-06 [add_comm_op_reuse_tag]: 3.26001e-06 [interleave_split_concat_branches]: 3.40998e-06 [interleave_parallel_branches]: 3.31999e-06 [overlap_opt_shard_in_pipeline]: 3.91999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.10998e-06 [control_data_broadcast_order]: 1.669e-05 [grouped_pairwise_exchange_alltoall]: 3.73001e-06 [offloading_packed_experts]: 6.16e-06 [overlap_recompute_and_grad_model_parallel]: 3.464e-05 [overlap_grad_matmul_and_grad_allreduce]: 4.38999e-06 [overlap_recompute_allgather_and_fa_grad]: 4.13001e-06 [overlap_recompute_comm]: 4.78001e-06 [overlap_grad_ring_attention]: 7.30998e-06 [overlap_grad_flash_sp]: 2.363e-05 [begin_end_overlap_inline]: 2.82002e-06 [split_matmul_comm_elemetwise]: 4.79998e-06 [split_layernorm_comm]: 4.39002e-06 [handle_group_info]: 3.33e-06 [symbol_engine_optimizer]: 0.00010172, [1] [Cycle 1]: 9.47e-05, [6] [build]: 3.13e-06 [elim_shapecalc]: 1.098e-05 [elim_not_effective]: 1.45e-05 [opt_reshape]: 7.67002e-06 [fold_const_symbol]: 1.078e-05 [renormalize]: 2.40019e-07 [detach_backward]: 4.27e-06 [pipeline_parallel_scheduler]: 2.34001e-06 [auto_monad_reorder]: 1.905e-05 [get_jit_bprop_graph]: 1.83997e-06 [rewriter_after_jit_bprop_graph]: 4.92e-06 [opt_after_jit_grad]: 0.00057094 [validate]: 3.85e-05 Sums bootstrap : 0.000456s : 2.20% type_inference : 0.014110s : 68.05% event_method : 0.000055s : 0.26% auto_monad : 0.000099s : 0.48% graph_reusing : 0.000007s : 0.04% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000033s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.04% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000049s : 0.24% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000042s : 0.20% optimize.rewriter_before_opt_a : 0.000131s : 0.63% optimize.opt_a.expand_dump_flag : 0.000007s : 0.03% optimize.opt_a.switch_simplify : 0.000139s : 0.67% optimize.opt_a.loop_unroll : 0.000049s : 0.23% optimize.opt_a.a_1 : 0.001062s : 5.12% optimize.opt_a.with_stream_mark : 0.000033s : 0.16% optimize.opt_a.recompute_prepare : 0.000016s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000220s : 1.06% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.07% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000014s : 0.07% optimize.opt_a.merge_send_recv : 0.000014s : 0.07% optimize.opt_a.auto_parallel : 0.000014s : 0.07% optimize.opt_a.parallel : 0.000026s : 0.13% optimize.opt_a.flash_sp : 0.000013s : 0.06% optimize.opt_a.merge_comm : 0.000007s : 0.04% optimize.opt_a.allreduce_fusion : 0.000007s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.08% optimize.opt_a.virtual_dataset : 0.000013s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.06% optimize.opt_a.virtual_output : 0.000013s : 0.06% optimize.opt_a.merge_forward : 0.000007s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.15% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000022s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.04% optimize.opt_a.meta_fg_expand : 0.000006s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000024s : 0.12% optimize.opt_a.a_after_grad : 0.000021s : 0.10% optimize.opt_a.renormalize : 0.001054s : 5.08% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.03% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.13% optimize.opt_a.cse : 0.000043s : 0.21% optimize.opt_a.a_3 : 0.000117s : 0.57% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000042s : 0.20% optimize.convert_after_rewriter : 0.000010s : 0.05% optimize.order_py_execute_after_rewriter : 0.000009s : 0.04% optimize.mutable_eliminate : 0.000627s : 3.02% optimize.opt_b.b_1 : 0.000194s : 0.93% optimize.opt_b.b_2 : 0.000008s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000017s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.09% optimize.overlap_param_gather : 0.000005s : 0.03% optimize.cconv : 0.000029s : 0.14% optimize.loop_unroll : 0.000489s : 2.36% optimize.opt_after_cconv.c_1 : 0.000034s : 0.17% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000017s : 0.08% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.08% optimize.tuple_transform.d_1 : 0.000048s : 0.23% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000004s : 0.02% optimize.add_recomputation : 0.000051s : 0.25% optimize.cse_after_recomputation.cse : 0.000012s : 0.06% optimize.environ_conv : 0.000008s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.04% optimize.bias_add_comm_swap : 0.000005s : 0.03% optimize.label_micro_interleaved_index : 0.000008s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.03% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000006s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.03% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000004s : 0.02% optimize.remove_cast_before_assign_add : 0.000004s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.03% optimize.comm_op_add_attrs : 0.000003s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000003s : 0.02% optimize.interleave_parallel_branches : 0.000003s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000017s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000006s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000035s : 0.17% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.04% optimize.overlap_grad_flash_sp : 0.000024s : 0.11% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.02% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.09% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000571s : 2.75% validate : 0.000039s : 0.19% Time group info: ------[substitution.] 0.000248 34 0.84% : 0.000002s : 2: substitution.elim_not_effective 0.58% : 0.000001s : 2: substitution.fold_const_symbol 2.44% : 0.000006s : 4: substitution.graph_param_transform 79.81% : 0.000198s : 8: substitution.inline 1.60% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.02% : 0.000005s : 4: substitution.remove_not_recompute_node 2.03% : 0.000005s : 4: substitution.replace_old_param 3.87% : 0.000010s : 2: substitution.switch_simplify 6.82% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.014041 2 89.69% : 0.012594s : 1: type_inference.infer 10.31% : 0.001447s : 1: type_inference.specialize ------[replace.] 0.000131 14 42.65% : 0.000056s : 8: replace.inline 38.08% : 0.000050s : 2: replace.switch_simplify 19.27% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000216 14 89.33% : 0.000193s : 8: match.inline 3.84% : 0.000008s : 2: match.switch_simplify 6.83% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000255 1520 0.92% : 0.000002s : 17: predicate.accumulaten_eliminater 0.62% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.41% : 0.000001s : 8: predicate.addn_check_dump 0.95% : 0.000002s : 17: predicate.addn_zero_filter 0.89% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.20% : 0.000006s : 25: predicate.arithmetic_simplify 1.19% : 0.000003s : 17: predicate.cast_eliminate 0.51% : 0.000001s : 8: predicate.check_bprop_eliminate 0.46% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.51% : 0.000001s : 8: predicate.depend_value_elim 1.06% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.28% : 0.000003s : 17: predicate.dict_get_item_eliminator 1.22% : 0.000003s : 17: predicate.dict_set_item_eliminator 0.79% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 4: predicate.elim_not_effective 0.34% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 21: predicate.environ_get_depend_swap 1.66% : 0.000004s : 29: predicate.environ_get_eliminate 1.12% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.62% : 0.000004s : 29: predicate.exchange_switch_depend_value 2.71% : 0.000007s : 29: predicate.float_depend_g_call 0.43% : 0.000001s : 8: predicate.float_environ_get_switch 0.64% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.57% : 0.000001s : 8: predicate.get_grad_eliminate 0.17% : 0.000000s : 4: predicate.graph_param_transform 0.49% : 0.000001s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 6.29% : 0.000016s : 70: predicate.inline 0.62% : 0.000002s : 8: predicate.inline_without_move 0.27% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.69% : 0.000002s : 8: predicate.less_batch_normalization 1.92% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.58% : 0.000007s : 46: predicate.load_eliminater 0.81% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.76% : 0.000007s : 47: predicate.loop_unroll_before_grad 1.63% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.60% : 0.000002s : 8: predicate.merge_addn 0.51% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.91% : 0.000002s : 17: predicate.minmaximum_grad 0.82% : 0.000002s : 4: predicate.mutable_eliminate 0.29% : 0.000001s : 4: predicate.opt_reshape 0.27% : 0.000001s : 4: predicate.parallel_virtual_node 2.24% : 0.000006s : 29: predicate.partial_defer_inline 1.63% : 0.000004s : 25: predicate.partial_eliminate 1.07% : 0.000003s : 17: predicate.print_const_string_wrapper 0.49% : 0.000001s : 8: predicate.reduce_all_const_elim 1.45% : 0.000004s : 17: predicate.reduce_eliminate 2.47% : 0.000006s : 46: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 8: predicate.remove_not_recompute_node 1.31% : 0.000003s : 29: predicate.replace_applicator 0.35% : 0.000001s : 8: predicate.replace_old_param 0.22% : 0.000001s : 4: predicate.reset_defer_inline 1.08% : 0.000003s : 17: predicate.reshape_eliminate 0.55% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 4: predicate.row_tensor_eliminate 0.62% : 0.000002s : 8: predicate.same_eliminate 0.37% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 8: predicate.shard_identity_eliminate 0.60% : 0.000002s : 8: predicate.special_op_eliminate 0.60% : 0.000002s : 8: predicate.specialize_transform 0.71% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.66% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.25% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.80% : 0.000005s : 29: predicate.switch_defer_inline 2.33% : 0.000006s : 37: predicate.switch_layer_defer_inline 5.99% : 0.000015s : 92: predicate.switch_simplify 0.96% : 0.000002s : 17: predicate.tile_eliminate 1.00% : 0.000003s : 17: predicate.transpose_eliminate 1.59% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.75% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.21% : 0.000008s : 37: predicate.tuple_list_get_item_eliminator 1.51% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000006s : 33: predicate.tuple_list_set_item_eliminator 1.73% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.34% : 0.000006s : 46: predicate.updatestate_pure_node_eliminater 3.10% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 4: predicate.value_based_eliminate 0.60% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.50% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001027 16 57.36% : 0.000589s : 6: func_graph_cloner_run.FuncGraphClonerGraph 42.64% : 0.000438s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.071080 192 0.01% : 0.000008s : 1: ForceFp32Comm 27.61% : 0.019628s : 1: add_attr 27.59% : 0.019610s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.08% : 0.000055s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.15% : 0.000110s : 1: auto_monad 0.04% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.71% : 0.000501s : 1: bootstrap 0.05% : 0.000033s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.03% : 0.000020s : 1: control_data_broadcast_order 0.02% : 0.000013s : 1: convert_after_rewriter 0.04% : 0.000031s : 1: cse_after_recomputation 0.01% : 0.000007s : 1: dataset_repeat_opt 0.03% : 0.000023s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.10% : 0.000069s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000014s : 1: graph_reusing 0.01% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000008s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.70% : 0.000495s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.89% : 0.000635s : 1: mutable_eliminate 0.01% : 0.000009s : 1: offloading_packed_experts 0.02% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000016s : 1: opt.transform.mutable_eliminate 2.26% : 0.001609s : 78: opt.transform.opt_a 0.05% : 0.000032s : 1: opt.transform.opt_after_cconv 0.04% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.16% : 0.000113s : 28: opt.transform.opt_b 0.07% : 0.000053s : 2: opt.transform.opt_trans_graph 0.06% : 0.000040s : 4: opt.transform.symbol_engine_opt 5.48% : 0.003895s : 1: opt_a 0.19% : 0.000133s : 1: opt_after_cconv 0.82% : 0.000582s : 1: opt_after_jit_grad 0.42% : 0.000302s : 1: opt_b 9.65% : 0.006857s : 1: optimize 0.03% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.04% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.05% : 0.000038s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.08% : 0.000057s : 1: pre_auto_parallel 0.06% : 0.000046s : 1: py_interpret_to_execute 0.03% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000020s : 1: remove_dup_value 0.77% : 0.000545s : 1: renormalize.infer 0.70% : 0.000498s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000045s : 1: rewriter_after_opt_a 0.19% : 0.000135s : 1: rewriter_before_opt_a 0.01% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000105s : 1: symbol_engine_optimizer 0.14% : 0.000098s : 1: tuple_transform 19.92% : 0.014156s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:20.429.894 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0401406, [21] [bootstrap]: 0.00041316 [type_inference]: 0.0297889 [event_method]: 5.096e-05 [auto_monad]: 9.962e-05 [graph_reusing]: 7.2e-06 [inline]: 2.56e-06 [add_attr]: 0.00345657, [1] [add_attr_with_inline]: 0.00344717, [1] [Cycle 1]: 7.449e-05, [2] [tag_attr]: 2.867e-05 [meta_addattr_fg_expand]: 8.13999e-06 [parallel-infer-symbol]: 3.47002e-06 [pre_auto_parallel]: 4.519e-05 [insert-virtual-dataset]: 3.02002e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 1.92001e-06 [pipeline_split]: 2.42001e-06 [optimize]: 0.00549109, [53] [py_interpret_to_execute]: 3.396e-05 [rewriter_before_opt_a]: 0.00011507 [opt_a]: 0.00334474, [2] [Cycle 1]: 0.00266155, [45] [expand_dump_flag]: 4.15999e-06 [switch_simplify]: 0.00012974 [loop_unroll]: 4.482e-05 [a_1]: 0.00085778 [with_stream_mark]: 1.663e-05 [recompute_prepare]: 1.036e-05 [updatestate_depend_eliminate]: 4.15999e-06 [updatestate_assign_eliminate]: 3.15002e-06 [updatestate_loads_eliminate]: 3.15998e-06 [parameter_eliminate]: 1.99e-06 [a_2]: 8.511e-05 [accelerated_algorithm]: 8.28999e-06 [shard]: 1.65001e-06 [meta_shard_fg_expand]: 1.92001e-06 [shard_inline]: 6.93998e-06 [merge_send_recv]: 8.17003e-06 [auto_parallel]: 6.17999e-06 [parallel]: 1.923e-05 [flash_sp]: 8.29002e-06 [merge_comm]: 4.20999e-06 [allreduce_fusion]: 3.65998e-06 [matmul_add_comm_reduction]: 9.84999e-06 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 9.41e-06 [virtual_dataset]: 6.92002e-06 [get_grad_eliminate_]: 6.08002e-06 [virtual_output]: 6.71e-06 [merge_forward]: 4.13999e-06 [cell_reuse_recompute_pass]: 1.19998e-06 [offload_activation]: 1.037e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.371e-05 [merge_recompute_call_nodes]: 1.62999e-06 [before_grad]: 1.069e-05 [set_forward_comm_id_for_comm_node_pass]: 3.74002e-06 [meta_fg_expand]: 3.42002e-06 [flash_sp_send_recv_attached]: 2.39001e-06 [receive_attached]: 2.16e-06 [after_resolve]: 1.272e-05 [a_after_grad]: 1.017e-05 [renormalize]: 0.00094603 [add_forward_monad_depend]: 6.47001e-06 [auto_monad_grad]: 2.32001e-06 [auto_monad_eliminator]: 1.65e-05 [cse]: 3.108e-05 [a_3]: 5.122e-05 [Cycle 2]: 0.00067233, [45] [expand_dump_flag]: 1.40999e-06 [switch_simplify]: 8.08001e-06 [loop_unroll]: 6.50002e-06 [a_1]: 0.00013673 [with_stream_mark]: 1.364e-05 [recompute_prepare]: 6.85998e-06 [updatestate_depend_eliminate]: 3.06001e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 3.09001e-06 [parameter_eliminate]: 1.02e-06 [a_2]: 7.588e-05 [accelerated_algorithm]: 6.79999e-06 [shard]: 2.07001e-06 [meta_shard_fg_expand]: 1.62999e-06 [shard_inline]: 6.86001e-06 [merge_send_recv]: 5.25001e-06 [auto_parallel]: 5.51e-06 [parallel]: 5.49e-06 [flash_sp]: 4.19002e-06 [merge_comm]: 3.19001e-06 [allreduce_fusion]: 3.13998e-06 [matmul_add_comm_reduction]: 7.06999e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 6.94999e-06 [virtual_dataset]: 6.20002e-06 [get_grad_eliminate_]: 5.89e-06 [virtual_output]: 5.75001e-06 [merge_forward]: 2.89999e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 6.77002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.327e-05 [merge_recompute_call_nodes]: 1.12e-06 [before_grad]: 9.65002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.85998e-06 [meta_fg_expand]: 2.39999e-06 [flash_sp_send_recv_attached]: 9.50007e-07 [receive_attached]: 1.12999e-06 [after_resolve]: 1.203e-05 [a_after_grad]: 1.033e-05 [renormalize]: 1.20024e-07 [add_forward_monad_depend]: 1.67001e-06 [auto_monad_grad]: 1.26997e-06 [auto_monad_eliminator]: 7.45e-06 [cse]: 1.403e-05 [a_3]: 3.864e-05 [py_interpret_to_execute_after_opt_a]: 1.017e-05 [slice_cell_reuse_recomputed_activation]: 1.84998e-06 [rewriter_after_opt_a]: 3.635e-05 [convert_after_rewriter]: 6.82002e-06 [order_py_execute_after_rewriter]: 5.56e-06 [mutable_eliminate]: 0.00054076 [opt_b]: 0.00021169, [1] [Cycle 1]: 0.00020497, [7] [b_1]: 0.00012886 [b_2]: 7.92e-06 [updatestate_depend_eliminate]: 5.71e-06 [updatestate_assign_eliminate]: 2.41998e-06 [updatestate_loads_eliminate]: 2.93e-06 [renormalize]: 3.39991e-07 [cse]: 2.054e-05 [optimize_parallel_all_gather_comm]: 1.658e-05 [overlap_param_gather]: 1.91e-06 [cconv]: 2.733e-05 [loop_unroll]: 0.00044434 [opt_after_cconv]: 0.00010475, [1] [Cycle 1]: 9.914e-05, [7] [c_1]: 3.265e-05 [parameter_eliminate]: 2.86e-06 [updatestate_depend_eliminate]: 5.28002e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 2.86e-06 [cse]: 1.822e-05 [renormalize]: 3.30008e-07 [remove_dup_value]: 1.342e-05 [tuple_transform]: 7.839e-05, [1] [Cycle 1]: 7.332e-05, [4] [d_1]: 4.593e-05 [none_parameter_eliminate]: 1.68002e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 7e-06 [partial_unused_args_eliminate]: 2.00002e-06 [add_recomputation]: 4.805e-05 [cse_after_recomputation]: 2.155e-05, [1] [Cycle 1]: 1.715e-05, [1] [cse]: 1.092e-05 [environ_conv]: 5.35999e-06 [swap_dp_allreduce_reducescatter]: 4.90001e-06 [bias_add_comm_swap]: 3.02002e-06 [label_micro_interleaved_index]: 4.62e-06 [label_fine_grained_interleaved_index]: 2.88e-06 [merge_cast_opt]: 1.37999e-06 [slice_recompute_activation]: 2.29001e-06 [micro_interleaved_order_control]: 2.78e-06 [assign_add_opt]: 1.24e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 1.37999e-06 [full_micro_interleaved_order_control]: 2.01e-06 [reorder_send_recv_between_fp_bp]: 2.67001e-06 [comm_op_add_attrs]: 1.23002e-06 [add_comm_op_reuse_tag]: 9.5999e-07 [interleave_split_concat_branches]: 1.25999e-06 [interleave_parallel_branches]: 1.14e-06 [overlap_opt_shard_in_pipeline]: 1.62001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.31e-06 [control_data_broadcast_order]: 1.285e-05 [grouped_pairwise_exchange_alltoall]: 1.52999e-06 [offloading_packed_experts]: 3.67002e-06 [overlap_recompute_and_grad_model_parallel]: 2.215e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.65001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34998e-06 [overlap_recompute_comm]: 2.18002e-06 [overlap_grad_ring_attention]: 4.68001e-06 [overlap_grad_flash_sp]: 2.118e-05 [begin_end_overlap_inline]: 5.49975e-07 [split_matmul_comm_elemetwise]: 2.23998e-06 [split_layernorm_comm]: 1.55999e-06 [handle_group_info]: 9.09989e-07 [symbol_engine_optimizer]: 8.007e-05, [1] [Cycle 1]: 7.506e-05, [6] [build]: 3.04001e-06 [elim_shapecalc]: 1.067e-05 [elim_not_effective]: 1.429e-05 [opt_reshape]: 7.46001e-06 [fold_const_symbol]: 1.044e-05 [renormalize]: 2.3999e-07 [detach_backward]: 2.16e-06 [pipeline_parallel_scheduler]: 1.54e-06 [auto_monad_reorder]: 1.777e-05 [get_jit_bprop_graph]: 1.64e-06 [rewriter_after_jit_bprop_graph]: 4.58999e-06 [opt_after_jit_grad]: 0.00053836 [validate]: 4.233e-05 Sums bootstrap : 0.000413s : 1.16% type_inference : 0.029789s : 83.45% event_method : 0.000051s : 0.14% auto_monad : 0.000100s : 0.28% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000029s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000045s : 0.13% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000034s : 0.10% optimize.rewriter_before_opt_a : 0.000115s : 0.32% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000138s : 0.39% optimize.opt_a.loop_unroll : 0.000051s : 0.14% optimize.opt_a.a_1 : 0.000995s : 2.79% optimize.opt_a.with_stream_mark : 0.000030s : 0.08% optimize.opt_a.recompute_prepare : 0.000017s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000161s : 0.45% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.04% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.04% optimize.opt_a.merge_send_recv : 0.000013s : 0.04% optimize.opt_a.auto_parallel : 0.000012s : 0.03% optimize.opt_a.parallel : 0.000025s : 0.07% optimize.opt_a.flash_sp : 0.000012s : 0.03% optimize.opt_a.merge_comm : 0.000007s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.05% optimize.opt_a.virtual_dataset : 0.000013s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.03% optimize.opt_a.virtual_output : 0.000012s : 0.03% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000017s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000020s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.07% optimize.opt_a.a_after_grad : 0.000020s : 0.06% optimize.opt_a.renormalize : 0.000946s : 2.65% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.07% optimize.opt_a.cse : 0.000045s : 0.13% optimize.opt_a.a_3 : 0.000090s : 0.25% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000036s : 0.10% optimize.convert_after_rewriter : 0.000007s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000541s : 1.51% optimize.opt_b.b_1 : 0.000129s : 0.36% optimize.opt_b.b_2 : 0.000008s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000021s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.05% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000027s : 0.08% optimize.loop_unroll : 0.000444s : 1.24% optimize.opt_after_cconv.c_1 : 0.000033s : 0.09% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000018s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.04% optimize.tuple_transform.d_1 : 0.000046s : 0.13% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000048s : 0.13% optimize.cse_after_recomputation.cse : 0.000011s : 0.03% optimize.environ_conv : 0.000005s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000013s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000022s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000021s : 0.06% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.05% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000538s : 1.51% validate : 0.000042s : 0.12% Time group info: ------[substitution.] 0.000232 34 0.96% : 0.000002s : 2: substitution.elim_not_effective 0.62% : 0.000001s : 2: substitution.fold_const_symbol 2.95% : 0.000007s : 4: substitution.graph_param_transform 78.43% : 0.000182s : 8: substitution.inline 1.65% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.20% : 0.000005s : 4: substitution.remove_not_recompute_node 1.96% : 0.000005s : 4: substitution.replace_old_param 4.46% : 0.000010s : 2: substitution.switch_simplify 6.77% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.029712 2 94.79% : 0.028163s : 1: type_inference.infer 5.21% : 0.001549s : 1: type_inference.specialize ------[replace.] 0.000124 14 42.19% : 0.000052s : 8: replace.inline 38.95% : 0.000048s : 2: replace.switch_simplify 18.86% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000200 14 88.60% : 0.000177s : 8: match.inline 4.52% : 0.000009s : 2: match.switch_simplify 6.88% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000245 1520 0.97% : 0.000002s : 17: predicate.accumulaten_eliminater 0.81% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.43% : 0.000001s : 8: predicate.addn_check_dump 0.98% : 0.000002s : 17: predicate.addn_zero_filter 0.91% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.05% : 0.000005s : 25: predicate.arithmetic_simplify 1.01% : 0.000002s : 17: predicate.cast_eliminate 0.51% : 0.000001s : 8: predicate.check_bprop_eliminate 0.42% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.54% : 0.000001s : 8: predicate.depend_value_elim 1.07% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.17% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.92% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.87% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.33% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.28% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.10% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 21: predicate.environ_get_depend_swap 1.64% : 0.000004s : 29: predicate.environ_get_eliminate 1.11% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.72% : 0.000004s : 29: predicate.exchange_switch_depend_value 2.81% : 0.000007s : 29: predicate.float_depend_g_call 0.43% : 0.000001s : 8: predicate.float_environ_get_switch 0.67% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.55% : 0.000001s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.50% : 0.000001s : 8: predicate.incorporate_call 0.40% : 0.000001s : 8: predicate.incorporate_call_switch 6.43% : 0.000016s : 70: predicate.inline 0.61% : 0.000001s : 8: predicate.inline_without_move 0.27% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.04% : 0.000003s : 8: predicate.less_batch_normalization 1.74% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.54% : 0.000006s : 46: predicate.load_eliminater 0.82% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.02% : 0.000007s : 47: predicate.loop_unroll_before_grad 1.57% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.44% : 0.000001s : 8: predicate.merge_addn 0.48% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.92% : 0.000002s : 17: predicate.minmaximum_grad 0.92% : 0.000002s : 4: predicate.mutable_eliminate 0.29% : 0.000001s : 4: predicate.opt_reshape 0.28% : 0.000001s : 4: predicate.parallel_virtual_node 2.20% : 0.000005s : 29: predicate.partial_defer_inline 1.61% : 0.000004s : 25: predicate.partial_eliminate 1.01% : 0.000002s : 17: predicate.print_const_string_wrapper 0.50% : 0.000001s : 8: predicate.reduce_all_const_elim 1.33% : 0.000003s : 17: predicate.reduce_eliminate 2.63% : 0.000006s : 46: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 8: predicate.remove_not_recompute_node 1.33% : 0.000003s : 29: predicate.replace_applicator 0.39% : 0.000001s : 8: predicate.replace_old_param 0.24% : 0.000001s : 4: predicate.reset_defer_inline 1.06% : 0.000003s : 17: predicate.reshape_eliminate 0.53% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 4: predicate.row_tensor_eliminate 0.63% : 0.000002s : 8: predicate.same_eliminate 0.38% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.61% : 0.000001s : 8: predicate.shard_identity_eliminate 0.57% : 0.000001s : 8: predicate.special_op_eliminate 0.56% : 0.000001s : 8: predicate.specialize_transform 0.72% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.60% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.83% : 0.000004s : 29: predicate.switch_defer_inline 2.21% : 0.000005s : 37: predicate.switch_layer_defer_inline 6.19% : 0.000015s : 92: predicate.switch_simplify 1.00% : 0.000002s : 17: predicate.tile_eliminate 1.03% : 0.000003s : 17: predicate.transpose_eliminate 1.51% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.35% : 0.000008s : 37: predicate.tuple_list_get_item_eliminator 1.44% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.27% : 0.000006s : 33: predicate.tuple_list_set_item_eliminator 1.80% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.52% : 0.000006s : 46: predicate.updatestate_pure_node_eliminater 2.96% : 0.000007s : 54: predicate.updatestate_useless_node_eliminater 0.28% : 0.000001s : 4: predicate.value_based_eliminate 0.55% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.54% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001032 16 55.35% : 0.000571s : 6: func_graph_cloner_run.FuncGraphClonerGraph 44.65% : 0.000461s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.051715 192 0.01% : 0.000003s : 1: ForceFp32Comm 6.70% : 0.003462s : 1: add_attr 6.67% : 0.003452s : 1: add_attr_with_inline 0.01% : 0.000003s : 1: add_comm_op_reuse_tag 0.10% : 0.000052s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.20% : 0.000106s : 1: auto_monad 0.04% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.85% : 0.000441s : 1: bootstrap 0.06% : 0.000031s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000016s : 1: control_data_broadcast_order 0.02% : 0.000010s : 1: convert_after_rewriter 0.05% : 0.000026s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.11% : 0.000059s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000005s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.88% : 0.000453s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 1.06% : 0.000551s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000017s : 1: opt.transform.mutable_eliminate 2.96% : 0.001532s : 78: opt.transform.opt_a 0.06% : 0.000031s : 1: opt.transform.opt_after_cconv 0.06% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000105s : 28: opt.transform.opt_b 0.10% : 0.000051s : 2: opt.transform.opt_trans_graph 0.07% : 0.000039s : 4: opt.transform.symbol_engine_opt 6.47% : 0.003348s : 1: opt_a 0.21% : 0.000108s : 1: opt_after_cconv 1.06% : 0.000549s : 1: opt_after_jit_grad 0.42% : 0.000215s : 1: opt_b 10.63% : 0.005496s : 1: optimize 0.04% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.05% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.05% : 0.000026s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.10% : 0.000049s : 1: pre_auto_parallel 0.07% : 0.000038s : 1: py_interpret_to_execute 0.03% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000017s : 1: remove_dup_value 0.97% : 0.000500s : 1: renormalize.infer 0.84% : 0.000436s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000041s : 1: rewriter_after_opt_a 0.23% : 0.000120s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.16% : 0.000083s : 1: symbol_engine_optimizer 0.16% : 0.000081s : 1: tuple_transform 57.65% : 0.029815s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:20.847.277 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:20.847.548 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0608174, [21] [bootstrap]: 0.00045789 [type_inference]: 0.0310604 [event_method]: 9.521e-05 [auto_monad]: 0.00010991 [graph_reusing]: 7.9e-06 [inline]: 2.81e-06 [add_attr]: 0.00373265, [1] [add_attr_with_inline]: 0.00372022, [1] [Cycle 1]: 0.00010591, [2] [tag_attr]: 3.278e-05 [meta_addattr_fg_expand]: 8.17e-06 [parallel-infer-symbol]: 3.63999e-06 [pre_auto_parallel]: 5.012e-05 [insert-virtual-dataset]: 2.43998e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 2.54999e-06 [pipeline_split]: 1.55999e-06 [optimize]: 0.0238713, [53] [py_interpret_to_execute]: 4.683e-05 [rewriter_before_opt_a]: 0.0001292 [opt_a]: 0.00454038, [2] [Cycle 1]: 0.0034626, [45] [expand_dump_flag]: 4.70999e-06 [switch_simplify]: 0.00013771 [loop_unroll]: 4.488e-05 [a_1]: 0.0010105 [with_stream_mark]: 2.367e-05 [recompute_prepare]: 1.497e-05 [updatestate_depend_eliminate]: 5.77001e-06 [updatestate_assign_eliminate]: 4.43999e-06 [updatestate_loads_eliminate]: 3.65e-06 [parameter_eliminate]: 2.21e-06 [a_2]: 0.00015217 [accelerated_algorithm]: 1.054e-05 [shard]: 2.71999e-06 [meta_shard_fg_expand]: 2.59001e-06 [shard_inline]: 9.45001e-06 [merge_send_recv]: 1.242e-05 [auto_parallel]: 9.57001e-06 [parallel]: 2.108e-05 [flash_sp]: 1.02e-05 [merge_comm]: 5.74e-06 [allreduce_fusion]: 4.87998e-06 [matmul_add_comm_reduction]: 1.325e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 1.246e-05 [virtual_dataset]: 9.27001e-06 [get_grad_eliminate_]: 8.45001e-06 [virtual_output]: 8.95999e-06 [merge_forward]: 4.94e-06 [cell_reuse_recompute_pass]: 1.88997e-06 [offload_activation]: 1.26e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.009e-05 [merge_recompute_call_nodes]: 1.79e-06 [before_grad]: 1.452e-05 [set_forward_comm_id_for_comm_node_pass]: 5.17e-06 [meta_fg_expand]: 4.17e-06 [flash_sp_send_recv_attached]: 2.99999e-06 [receive_attached]: 2.81999e-06 [after_resolve]: 1.54e-05 [a_after_grad]: 1.384e-05 [renormalize]: 0.00119016 [add_forward_monad_depend]: 7.85998e-06 [auto_monad_grad]: 3.23e-06 [auto_monad_eliminator]: 2.162e-05 [cse]: 3.82e-05 [a_3]: 8.333e-05 [Cycle 2]: 0.00106112, [45] [expand_dump_flag]: 2.02001e-06 [switch_simplify]: 1.207e-05 [loop_unroll]: 9.12001e-06 [a_1]: 0.00019976 [with_stream_mark]: 1.749e-05 [recompute_prepare]: 9.22999e-06 [updatestate_depend_eliminate]: 5.28002e-06 [updatestate_assign_eliminate]: 3.52002e-06 [updatestate_loads_eliminate]: 3.37002e-06 [parameter_eliminate]: 1.91e-06 [a_2]: 0.00012585 [accelerated_algorithm]: 8.44998e-06 [shard]: 1.67999e-06 [meta_shard_fg_expand]: 2.13002e-06 [shard_inline]: 8.90999e-06 [merge_send_recv]: 7.68001e-06 [auto_parallel]: 8.50999e-06 [parallel]: 7.50998e-06 [flash_sp]: 4.48001e-06 [merge_comm]: 4.28001e-06 [allreduce_fusion]: 4.23001e-06 [matmul_add_comm_reduction]: 9.37001e-06 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 1.179e-05 [virtual_dataset]: 8e-06 [get_grad_eliminate_]: 7.42002e-06 [virtual_output]: 8.38001e-06 [merge_forward]: 4.90001e-06 [cell_reuse_recompute_pass]: 2.36998e-06 [offload_activation]: 9.62001e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.002e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.272e-05 [set_forward_comm_id_for_comm_node_pass]: 6.11e-06 [meta_fg_expand]: 3.43e-06 [flash_sp_send_recv_attached]: 1.07998e-06 [receive_attached]: 1.76e-06 [after_resolve]: 1.349e-05 [a_after_grad]: 1.266e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.95002e-06 [auto_monad_grad]: 1.77001e-06 [auto_monad_eliminator]: 1.287e-05 [cse]: 2.329e-05 [a_3]: 6.308e-05 [py_interpret_to_execute_after_opt_a]: 2.108e-05 [slice_cell_reuse_recomputed_activation]: 5.96003e-06 [rewriter_after_opt_a]: 5.599e-05 [convert_after_rewriter]: 1.136e-05 [order_py_execute_after_rewriter]: 9.79999e-06 [mutable_eliminate]: 0.0168564 [opt_b]: 0.00038851, [1] [Cycle 1]: 0.00037526, [7] [b_1]: 0.00022547 [b_2]: 1.306e-05 [updatestate_depend_eliminate]: 1.522e-05 [updatestate_assign_eliminate]: 3.98001e-06 [updatestate_loads_eliminate]: 3.51999e-06 [renormalize]: 1.27999e-06 [cse]: 4.999e-05 [optimize_parallel_all_gather_comm]: 3.008e-05 [overlap_param_gather]: 6.25002e-06 [cconv]: 4.525e-05 [loop_unroll]: 0.00067707 [opt_after_cconv]: 0.00016106, [1] [Cycle 1]: 0.00015153, [7] [c_1]: 4.066e-05 [parameter_eliminate]: 5.92999e-06 [updatestate_depend_eliminate]: 8.2e-06 [updatestate_assign_eliminate]: 3.48999e-06 [updatestate_loads_eliminate]: 4.38999e-06 [cse]: 2.965e-05 [renormalize]: 8.2e-07 [remove_dup_value]: 2.271e-05 [tuple_transform]: 0.00012004, [1] [Cycle 1]: 0.00011131, [4] [d_1]: 6.542e-05 [none_parameter_eliminate]: 1.91e-06 [renormalize]: 4.09986e-07 [switch_simplify]: 1.004e-05 [partial_unused_args_eliminate]: 4.38999e-06 [add_recomputation]: 7.061e-05 [cse_after_recomputation]: 3.824e-05, [1] [Cycle 1]: 2.978e-05, [1] [cse]: 1.847e-05 [environ_conv]: 1.086e-05 [swap_dp_allreduce_reducescatter]: 9.84999e-06 [bias_add_comm_swap]: 7.15998e-06 [label_micro_interleaved_index]: 8.15e-06 [label_fine_grained_interleaved_index]: 5.20999e-06 [merge_cast_opt]: 5.46e-06 [slice_recompute_activation]: 4.68999e-06 [micro_interleaved_order_control]: 4.61002e-06 [assign_add_opt]: 4.17e-06 [ForceFp32Comm]: 3.09999e-06 [remove_cast_before_assign_add]: 4.46002e-06 [full_micro_interleaved_order_control]: 4.60999e-06 [reorder_send_recv_between_fp_bp]: 5.58002e-06 [comm_op_add_attrs]: 3.53999e-06 [add_comm_op_reuse_tag]: 3.11001e-06 [interleave_split_concat_branches]: 3.65e-06 [interleave_parallel_branches]: 3.29001e-06 [overlap_opt_shard_in_pipeline]: 3.88001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.18001e-06 [control_data_broadcast_order]: 2.439e-05 [grouped_pairwise_exchange_alltoall]: 4.38999e-06 [offloading_packed_experts]: 8.95001e-06 [overlap_recompute_and_grad_model_parallel]: 7.96001e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.32e-06 [overlap_recompute_allgather_and_fa_grad]: 3.95998e-06 [overlap_recompute_comm]: 4.67998e-06 [overlap_grad_ring_attention]: 7.6e-06 [overlap_grad_flash_sp]: 2.93e-05 [begin_end_overlap_inline]: 3.22002e-06 [split_matmul_comm_elemetwise]: 4.95999e-06 [split_layernorm_comm]: 4.23999e-06 [handle_group_info]: 3.35003e-06 [symbol_engine_optimizer]: 0.0001244, [1] [Cycle 1]: 0.00011594, [6] [build]: 4.42e-06 [elim_shapecalc]: 1.613e-05 [elim_not_effective]: 1.929e-05 [opt_reshape]: 9.52001e-06 [fold_const_symbol]: 1.313e-05 [renormalize]: 6.00005e-07 [detach_backward]: 4.93001e-06 [pipeline_parallel_scheduler]: 2.06e-06 [auto_monad_reorder]: 2.73e-05 [get_jit_bprop_graph]: 2.42001e-06 [rewriter_after_jit_bprop_graph]: 7.06001e-06 [opt_after_jit_grad]: 0.00057051 [validate]: 4.865e-05 Sums bootstrap : 0.000458s : 0.83% type_inference : 0.031060s : 56.53% event_method : 0.000095s : 0.17% auto_monad : 0.000110s : 0.20% graph_reusing : 0.000008s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000033s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000050s : 0.09% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000047s : 0.09% optimize.rewriter_before_opt_a : 0.000129s : 0.24% optimize.opt_a.expand_dump_flag : 0.000007s : 0.01% optimize.opt_a.switch_simplify : 0.000150s : 0.27% optimize.opt_a.loop_unroll : 0.000054s : 0.10% optimize.opt_a.a_1 : 0.001210s : 2.20% optimize.opt_a.with_stream_mark : 0.000041s : 0.07% optimize.opt_a.recompute_prepare : 0.000024s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000278s : 0.51% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.03% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000018s : 0.03% optimize.opt_a.merge_send_recv : 0.000020s : 0.04% optimize.opt_a.auto_parallel : 0.000018s : 0.03% optimize.opt_a.parallel : 0.000029s : 0.05% optimize.opt_a.flash_sp : 0.000015s : 0.03% optimize.opt_a.merge_comm : 0.000010s : 0.02% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.04% optimize.opt_a.virtual_dataset : 0.000017s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.03% optimize.opt_a.virtual_output : 0.000017s : 0.03% optimize.opt_a.merge_forward : 0.000010s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000022s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000040s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.02% optimize.opt_a.meta_fg_expand : 0.000008s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000029s : 0.05% optimize.opt_a.a_after_grad : 0.000026s : 0.05% optimize.opt_a.renormalize : 0.001190s : 2.17% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.02% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.06% optimize.opt_a.cse : 0.000061s : 0.11% optimize.opt_a.a_3 : 0.000146s : 0.27% optimize.py_interpret_to_execute_after_opt_a : 0.000021s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.01% optimize.rewriter_after_opt_a : 0.000056s : 0.10% optimize.convert_after_rewriter : 0.000011s : 0.02% optimize.order_py_execute_after_rewriter : 0.000010s : 0.02% optimize.mutable_eliminate : 0.016856s : 30.68% optimize.opt_b.b_1 : 0.000225s : 0.41% optimize.opt_b.b_2 : 0.000013s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000015s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000050s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000030s : 0.05% optimize.overlap_param_gather : 0.000006s : 0.01% optimize.cconv : 0.000045s : 0.08% optimize.loop_unroll : 0.000677s : 1.23% optimize.opt_after_cconv.c_1 : 0.000041s : 0.07% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000030s : 0.05% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000023s : 0.04% optimize.tuple_transform.d_1 : 0.000065s : 0.12% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.02% optimize.partial_unused_args_eliminate : 0.000004s : 0.01% optimize.add_recomputation : 0.000071s : 0.13% optimize.cse_after_recomputation.cse : 0.000018s : 0.03% optimize.environ_conv : 0.000011s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.02% optimize.bias_add_comm_swap : 0.000007s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000005s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000024s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000009s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000008s : 0.01% optimize.overlap_grad_flash_sp : 0.000029s : 0.05% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000005s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000027s : 0.05% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000571s : 1.04% validate : 0.000049s : 0.09% Time group info: ------[substitution.] 0.000286 44 9.82% : 0.000028s : 3: substitution.cast_eliminate 0.87% : 0.000003s : 3: substitution.elim_not_effective 0.65% : 0.000002s : 3: substitution.fold_const_symbol 3.07% : 0.000009s : 5: substitution.graph_param_transform 69.86% : 0.000200s : 8: substitution.inline 1.93% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.17% : 0.000006s : 6: substitution.remove_not_recompute_node 2.03% : 0.000006s : 4: substitution.replace_old_param 3.79% : 0.000011s : 2: substitution.switch_simplify 5.81% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.030984 2 94.84% : 0.029386s : 1: type_inference.infer 5.16% : 0.001598s : 1: type_inference.specialize ------[replace.] 0.000141 14 41.65% : 0.000059s : 8: replace.inline 38.07% : 0.000054s : 2: replace.switch_simplify 20.27% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000218 14 89.07% : 0.000195s : 8: match.inline 4.39% : 0.000010s : 2: match.switch_simplify 6.54% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000320 1838 0.90% : 0.000003s : 21: predicate.accumulaten_eliminater 0.71% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 10: predicate.addn_check_dump 0.98% : 0.000003s : 21: predicate.addn_zero_filter 0.90% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.05% : 0.000007s : 31: predicate.arithmetic_simplify 0.98% : 0.000003s : 21: predicate.cast_eliminate 0.49% : 0.000002s : 10: predicate.check_bprop_eliminate 0.50% : 0.000002s : 10: predicate.compare_switch_simplify 0.15% : 0.000000s : 5: predicate.const_output_eliminate 0.53% : 0.000002s : 10: predicate.depend_value_elim 1.05% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.11% : 0.000004s : 21: predicate.dict_get_item_eliminator 0.89% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.64% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.31% : 0.000001s : 5: predicate.elim_not_effective 0.36% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000004s : 26: predicate.environ_add_const_eliminate 1.11% : 0.000004s : 26: predicate.environ_get_add_eliminate 1.12% : 0.000004s : 26: predicate.environ_get_depend_swap 1.63% : 0.000005s : 36: predicate.environ_get_eliminate 1.10% : 0.000004s : 26: predicate.environ_get_set_eliminate 1.48% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.45% : 0.000008s : 33: predicate.float_depend_g_call 0.50% : 0.000002s : 10: predicate.float_environ_get_switch 0.82% : 0.000003s : 15: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 5: predicate.fold_const_symbol 0.66% : 0.000002s : 10: predicate.get_grad_eliminate 0.24% : 0.000001s : 5: predicate.graph_param_transform 0.51% : 0.000002s : 10: predicate.incorporate_call 0.42% : 0.000001s : 10: predicate.incorporate_call_switch 6.60% : 0.000021s : 84: predicate.inline 0.86% : 0.000003s : 10: predicate.inline_without_move 0.26% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.71% : 0.000002s : 10: predicate.less_batch_normalization 1.65% : 0.000005s : 35: predicate.list_to_tuple_eliminator_ 2.45% : 0.000008s : 56: predicate.load_eliminater 0.81% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.42% : 0.000008s : 49: predicate.loop_unroll_before_grad 1.70% : 0.000005s : 31: predicate.make_slice_get_slice_eliminator 0.62% : 0.000002s : 10: predicate.merge_addn 0.47% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.89% : 0.000003s : 21: predicate.minmaximum_grad 2.24% : 0.000007s : 5: predicate.mutable_eliminate 0.32% : 0.000001s : 5: predicate.opt_reshape 0.39% : 0.000001s : 5: predicate.parallel_virtual_node 1.96% : 0.000006s : 33: predicate.partial_defer_inline 1.56% : 0.000005s : 30: predicate.partial_eliminate 1.07% : 0.000003s : 21: predicate.print_const_string_wrapper 0.60% : 0.000002s : 10: predicate.reduce_all_const_elim 1.16% : 0.000004s : 21: predicate.reduce_eliminate 2.32% : 0.000007s : 56: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 10: predicate.remove_not_recompute_node 1.29% : 0.000004s : 35: predicate.replace_applicator 0.35% : 0.000001s : 10: predicate.replace_old_param 0.32% : 0.000001s : 5: predicate.reset_defer_inline 1.13% : 0.000004s : 21: predicate.reshape_eliminate 0.66% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.52% : 0.000002s : 5: predicate.row_tensor_eliminate 0.65% : 0.000002s : 10: predicate.same_eliminate 0.37% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.87% : 0.000003s : 10: predicate.shard_identity_eliminate 0.69% : 0.000002s : 10: predicate.special_op_eliminate 0.77% : 0.000002s : 10: predicate.specialize_transform 0.70% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.66% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.63% : 0.000005s : 33: predicate.switch_defer_inline 2.03% : 0.000006s : 43: predicate.switch_layer_defer_inline 5.44% : 0.000017s : 101: predicate.switch_simplify 1.01% : 0.000003s : 21: predicate.tile_eliminate 1.00% : 0.000003s : 21: predicate.transpose_eliminate 1.53% : 0.000005s : 31: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000005s : 31: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000005s : 31: predicate.tuple_list_get_item_depend_reorder 3.43% : 0.000011s : 45: predicate.tuple_list_get_item_eliminator 1.51% : 0.000005s : 31: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.61% : 0.000005s : 35: predicate.tuple_to_list_eliminator_ 2.35% : 0.000008s : 56: predicate.updatestate_pure_node_eliminater 3.03% : 0.000010s : 66: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 5: predicate.value_based_eliminate 0.57% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.58% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.53% : 0.000002s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001136 16 55.31% : 0.000628s : 6: func_graph_cloner_run.FuncGraphClonerGraph 44.69% : 0.000507s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.091727 192 0.01% : 0.000006s : 1: ForceFp32Comm 4.08% : 0.003743s : 1: add_attr 4.06% : 0.003725s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.08% : 0.000075s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.13% : 0.000123s : 1: auto_monad 0.04% : 0.000036s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000010s : 1: bias_add_comm_swap 0.55% : 0.000506s : 1: bootstrap 0.05% : 0.000049s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.03% : 0.000028s : 1: control_data_broadcast_order 0.02% : 0.000015s : 1: convert_after_rewriter 0.05% : 0.000041s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000029s : 1: detach_backward 0.02% : 0.000014s : 1: environ_conv 0.12% : 0.000111s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000016s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000008s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000011s : 1: label_micro_interleaved_index 0.75% : 0.000684s : 1: loop_unroll 0.01% : 0.000008s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 18.39% : 0.016870s : 1: mutable_eliminate 0.01% : 0.000012s : 1: offloading_packed_experts 0.02% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000045s : 1: opt.transform.mutable_eliminate 2.07% : 0.001902s : 78: opt.transform.opt_a 0.04% : 0.000039s : 1: opt.transform.opt_after_cconv 0.04% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.17% : 0.000156s : 28: opt.transform.opt_b 0.08% : 0.000072s : 2: opt.transform.opt_trans_graph 0.06% : 0.000053s : 4: opt.transform.symbol_engine_opt 4.95% : 0.004545s : 1: opt_a 0.18% : 0.000165s : 1: opt_after_cconv 0.64% : 0.000583s : 1: opt_after_jit_grad 0.43% : 0.000394s : 1: opt_b 26.53% : 0.024336s : 1: optimize 0.04% : 0.000034s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000013s : 1: order_py_execute_after_rewriter 0.04% : 0.000034s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000012s : 1: overlap_grad_ring_attention 0.01% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000010s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.06% : 0.000059s : 1: pre_auto_parallel 0.06% : 0.000051s : 1: py_interpret_to_execute 0.03% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000008s : 1: remove_cast_before_assign_add 0.03% : 0.000026s : 1: remove_dup_value 0.70% : 0.000643s : 1: renormalize.infer 0.58% : 0.000536s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000061s : 1: rewriter_after_opt_a 0.15% : 0.000133s : 1: rewriter_before_opt_a 0.01% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.14% : 0.000128s : 1: symbol_engine_optimizer 0.13% : 0.000123s : 1: tuple_transform 33.92% : 0.031111s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:21.290.264 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0404312, [21] [bootstrap]: 0.00043512 [type_inference]: 0.0299151 [event_method]: 8.981e-05 [auto_monad]: 9.787e-05 [graph_reusing]: 7.82e-06 [inline]: 2.66999e-06 [add_attr]: 0.00321697, [1] [add_attr_with_inline]: 0.00320694, [1] [Cycle 1]: 7.22e-05, [2] [tag_attr]: 2.909e-05 [meta_addattr_fg_expand]: 8.36002e-06 [parallel-infer-symbol]: 2.94001e-06 [pre_auto_parallel]: 4.446e-05 [insert-virtual-dataset]: 2.43002e-06 [parallel-infer-symbol-second]: 6.60017e-07 [dataset_repeat_opt]: 2.08002e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.00588881, [53] [py_interpret_to_execute]: 3.46e-05 [rewriter_before_opt_a]: 0.00011687 [opt_a]: 0.00365132, [2] [Cycle 1]: 0.00287435, [45] [expand_dump_flag]: 4.02002e-06 [switch_simplify]: 0.00012564 [loop_unroll]: 4.197e-05 [a_1]: 0.00100238 [with_stream_mark]: 1.898e-05 [recompute_prepare]: 1.152e-05 [updatestate_depend_eliminate]: 4.76002e-06 [updatestate_assign_eliminate]: 3.65e-06 [updatestate_loads_eliminate]: 3.27997e-06 [parameter_eliminate]: 2.14e-06 [a_2]: 0.000104 [accelerated_algorithm]: 9.14e-06 [shard]: 1.81e-06 [meta_shard_fg_expand]: 2.66999e-06 [shard_inline]: 8.03001e-06 [merge_send_recv]: 1.016e-05 [auto_parallel]: 6.86001e-06 [parallel]: 1.793e-05 [flash_sp]: 8.95999e-06 [merge_comm]: 4.62e-06 [allreduce_fusion]: 4.08999e-06 [matmul_add_comm_reduction]: 1.086e-05 [allreduce_slice_to_reducescatter]: 7.09988e-07 [virtual_shard_identity]: 1.006e-05 [virtual_dataset]: 7.93001e-06 [get_grad_eliminate_]: 7.61001e-06 [virtual_output]: 7.5e-06 [merge_forward]: 5.04e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 1.083e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.659e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 1.253e-05 [set_forward_comm_id_for_comm_node_pass]: 5.14e-06 [meta_fg_expand]: 3.85e-06 [flash_sp_send_recv_attached]: 2.53998e-06 [receive_attached]: 2.14999e-06 [after_resolve]: 1.314e-05 [a_after_grad]: 1.181e-05 [renormalize]: 0.00096234 [add_forward_monad_depend]: 5.84e-06 [auto_monad_grad]: 2.21998e-06 [auto_monad_eliminator]: 1.811e-05 [cse]: 3.956e-05 [a_3]: 6.016e-05 [Cycle 2]: 0.00076681, [45] [expand_dump_flag]: 1.19e-06 [switch_simplify]: 9.80002e-06 [loop_unroll]: 7.6e-06 [a_1]: 0.00018368 [with_stream_mark]: 1.309e-05 [recompute_prepare]: 8.32998e-06 [updatestate_depend_eliminate]: 4.42003e-06 [updatestate_assign_eliminate]: 3.01001e-06 [updatestate_loads_eliminate]: 2.88e-06 [parameter_eliminate]: 1.15001e-06 [a_2]: 9.401e-05 [accelerated_algorithm]: 7.77e-06 [shard]: 1.50999e-06 [meta_shard_fg_expand]: 1.82999e-06 [shard_inline]: 7.75e-06 [merge_send_recv]: 5.90002e-06 [auto_parallel]: 6.80002e-06 [parallel]: 5.48002e-06 [flash_sp]: 3.36001e-06 [merge_comm]: 4.03001e-06 [allreduce_fusion]: 4.10998e-06 [matmul_add_comm_reduction]: 7.48e-06 [allreduce_slice_to_reducescatter]: 5.8001e-07 [virtual_shard_identity]: 8.16002e-06 [virtual_dataset]: 7.08e-06 [get_grad_eliminate_]: 7.18e-06 [virtual_output]: 6.73e-06 [merge_forward]: 3.66001e-06 [cell_reuse_recompute_pass]: 1.44998e-06 [offload_activation]: 7.58999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.526e-05 [merge_recompute_call_nodes]: 1.02e-06 [before_grad]: 1.237e-05 [set_forward_comm_id_for_comm_node_pass]: 4.53001e-06 [meta_fg_expand]: 3.21001e-06 [flash_sp_send_recv_attached]: 1.07e-06 [receive_attached]: 1.10001e-06 [after_resolve]: 1.159e-05 [a_after_grad]: 1.097e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.66e-06 [auto_monad_grad]: 1.22e-06 [auto_monad_eliminator]: 8.75001e-06 [cse]: 1.758e-05 [a_3]: 4.651e-05 [py_interpret_to_execute_after_opt_a]: 1.089e-05 [slice_cell_reuse_recomputed_activation]: 2.14999e-06 [rewriter_after_opt_a]: 3.994e-05 [convert_after_rewriter]: 7.26999e-06 [order_py_execute_after_rewriter]: 5.98002e-06 [mutable_eliminate]: 0.00052853 [opt_b]: 0.00024603, [1] [Cycle 1]: 0.00024022, [7] [b_1]: 0.00015929 [b_2]: 8.67e-06 [updatestate_depend_eliminate]: 6.26998e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 2.99999e-06 [renormalize]: 4.40021e-07 [cse]: 2.361e-05 [optimize_parallel_all_gather_comm]: 1.767e-05 [overlap_param_gather]: 1.97001e-06 [cconv]: 2.589e-05 [loop_unroll]: 0.00045959 [opt_after_cconv]: 0.00011871, [1] [Cycle 1]: 0.00011287, [7] [c_1]: 4.056e-05 [parameter_eliminate]: 2.70002e-06 [updatestate_depend_eliminate]: 5.81003e-06 [updatestate_assign_eliminate]: 3.06999e-06 [updatestate_loads_eliminate]: 3.28998e-06 [cse]: 2.167e-05 [renormalize]: 5.29981e-07 [remove_dup_value]: 1.575e-05 [tuple_transform]: 8.815e-05, [1] [Cycle 1]: 8.285e-05, [4] [d_1]: 5.447e-05 [none_parameter_eliminate]: 1.77001e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 7.87e-06 [partial_unused_args_eliminate]: 2.27999e-06 [add_recomputation]: 5.288e-05 [cse_after_recomputation]: 2.54e-05, [1] [Cycle 1]: 2e-05, [1] [cse]: 1.435e-05 [environ_conv]: 5.95002e-06 [swap_dp_allreduce_reducescatter]: 5.67999e-06 [bias_add_comm_swap]: 2.49999e-06 [label_micro_interleaved_index]: 4.72e-06 [label_fine_grained_interleaved_index]: 2.67001e-06 [merge_cast_opt]: 1.40999e-06 [slice_recompute_activation]: 1.96e-06 [micro_interleaved_order_control]: 2.14e-06 [assign_add_opt]: 1.27e-06 [ForceFp32Comm]: 7.89994e-07 [remove_cast_before_assign_add]: 1.27999e-06 [full_micro_interleaved_order_control]: 2.37001e-06 [reorder_send_recv_between_fp_bp]: 2.56e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.12999e-06 [interleave_parallel_branches]: 9.89996e-07 [overlap_opt_shard_in_pipeline]: 1.24e-06 [overlap_opt_shard_grad_in_pipeline]: 1.75001e-06 [control_data_broadcast_order]: 1.674e-05 [grouped_pairwise_exchange_alltoall]: 1.70001e-06 [offloading_packed_experts]: 4.42e-06 [overlap_recompute_and_grad_model_parallel]: 5.30999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.18001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.53e-06 [overlap_grad_ring_attention]: 4.38999e-06 [overlap_grad_flash_sp]: 2.307e-05 [begin_end_overlap_inline]: 6.19999e-07 [split_matmul_comm_elemetwise]: 2.07001e-06 [split_layernorm_comm]: 1.64998e-06 [handle_group_info]: 9.5999e-07 [symbol_engine_optimizer]: 9.117e-05, [1] [Cycle 1]: 8.689e-05, [6] [build]: 3.26001e-06 [elim_shapecalc]: 1.354e-05 [elim_not_effective]: 1.809e-05 [opt_reshape]: 8.79e-06 [fold_const_symbol]: 1.288e-05 [renormalize]: 3.09985e-07 [detach_backward]: 1.99e-06 [pipeline_parallel_scheduler]: 1.77001e-06 [auto_monad_reorder]: 2.06e-05 [get_jit_bprop_graph]: 1.99e-06 [rewriter_after_jit_bprop_graph]: 4.40999e-06 [opt_after_jit_grad]: 0.00048838 [validate]: 4.247e-05 Sums bootstrap : 0.000435s : 1.20% type_inference : 0.029915s : 82.57% event_method : 0.000090s : 0.25% auto_monad : 0.000098s : 0.27% graph_reusing : 0.000008s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000029s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000044s : 0.12% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000035s : 0.10% optimize.rewriter_before_opt_a : 0.000117s : 0.32% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000135s : 0.37% optimize.opt_a.loop_unroll : 0.000050s : 0.14% optimize.opt_a.a_1 : 0.001186s : 3.27% optimize.opt_a.with_stream_mark : 0.000032s : 0.09% optimize.opt_a.recompute_prepare : 0.000020s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000198s : 0.55% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.05% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.04% optimize.opt_a.merge_send_recv : 0.000016s : 0.04% optimize.opt_a.auto_parallel : 0.000014s : 0.04% optimize.opt_a.parallel : 0.000023s : 0.06% optimize.opt_a.flash_sp : 0.000012s : 0.03% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000008s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.05% optimize.opt_a.virtual_dataset : 0.000015s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.04% optimize.opt_a.virtual_output : 0.000014s : 0.04% optimize.opt_a.merge_forward : 0.000009s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000018s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.03% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.07% optimize.opt_a.a_after_grad : 0.000023s : 0.06% optimize.opt_a.renormalize : 0.000962s : 2.66% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.07% optimize.opt_a.cse : 0.000057s : 0.16% optimize.opt_a.a_3 : 0.000107s : 0.29% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000040s : 0.11% optimize.convert_after_rewriter : 0.000007s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000529s : 1.46% optimize.opt_b.b_1 : 0.000159s : 0.44% optimize.opt_b.b_2 : 0.000009s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.05% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000026s : 0.07% optimize.loop_unroll : 0.000460s : 1.27% optimize.opt_after_cconv.c_1 : 0.000041s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000022s : 0.06% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.04% optimize.tuple_transform.d_1 : 0.000054s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000053s : 0.15% optimize.cse_after_recomputation.cse : 0.000014s : 0.04% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000002s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000023s : 0.06% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.06% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000488s : 1.35% validate : 0.000042s : 0.12% Time group info: ------[substitution.] 0.000282 44 8.04% : 0.000023s : 3: substitution.cast_eliminate 0.87% : 0.000002s : 3: substitution.elim_not_effective 0.65% : 0.000002s : 3: substitution.fold_const_symbol 2.50% : 0.000007s : 5: substitution.graph_param_transform 73.93% : 0.000208s : 8: substitution.inline 1.57% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.12% : 0.000006s : 6: substitution.remove_not_recompute_node 1.64% : 0.000005s : 4: substitution.replace_old_param 3.30% : 0.000009s : 2: substitution.switch_simplify 5.39% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.029845 2 95.38% : 0.028468s : 1: type_inference.infer 4.62% : 0.001378s : 1: type_inference.specialize ------[replace.] 0.000127 14 43.75% : 0.000056s : 8: replace.inline 35.60% : 0.000045s : 2: replace.switch_simplify 20.65% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000225 14 90.57% : 0.000203s : 8: match.inline 3.62% : 0.000008s : 2: match.switch_simplify 5.81% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000292 1838 1.00% : 0.000003s : 21: predicate.accumulaten_eliminater 0.66% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 10: predicate.addn_check_dump 1.10% : 0.000003s : 21: predicate.addn_zero_filter 0.93% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.34% : 0.000007s : 31: predicate.arithmetic_simplify 1.09% : 0.000003s : 21: predicate.cast_eliminate 0.50% : 0.000001s : 10: predicate.check_bprop_eliminate 0.48% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.54% : 0.000002s : 10: predicate.depend_value_elim 1.05% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.23% : 0.000004s : 21: predicate.dict_get_item_eliminator 1.01% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.77% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 5: predicate.elim_not_effective 0.41% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.32% : 0.000004s : 26: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.20% : 0.000004s : 26: predicate.environ_get_depend_swap 1.80% : 0.000005s : 36: predicate.environ_get_eliminate 1.14% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.64% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.49% : 0.000007s : 33: predicate.float_depend_g_call 0.47% : 0.000001s : 10: predicate.float_environ_get_switch 0.68% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.56% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000002s : 10: predicate.incorporate_call 0.44% : 0.000001s : 10: predicate.incorporate_call_switch 6.64% : 0.000019s : 84: predicate.inline 0.70% : 0.000002s : 10: predicate.inline_without_move 0.27% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.67% : 0.000002s : 10: predicate.less_batch_normalization 1.78% : 0.000005s : 35: predicate.list_to_tuple_eliminator_ 2.56% : 0.000007s : 56: predicate.load_eliminater 0.80% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.47% : 0.000007s : 49: predicate.loop_unroll_before_grad 1.73% : 0.000005s : 31: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 10: predicate.merge_addn 0.54% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.92% : 0.000003s : 21: predicate.minmaximum_grad 0.75% : 0.000002s : 5: predicate.mutable_eliminate 0.30% : 0.000001s : 5: predicate.opt_reshape 0.28% : 0.000001s : 5: predicate.parallel_virtual_node 2.18% : 0.000006s : 33: predicate.partial_defer_inline 1.70% : 0.000005s : 30: predicate.partial_eliminate 0.97% : 0.000003s : 21: predicate.print_const_string_wrapper 0.49% : 0.000001s : 10: predicate.reduce_all_const_elim 1.37% : 0.000004s : 21: predicate.reduce_eliminate 2.69% : 0.000008s : 56: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 10: predicate.remove_not_recompute_node 1.29% : 0.000004s : 35: predicate.replace_applicator 0.37% : 0.000001s : 10: predicate.replace_old_param 0.22% : 0.000001s : 5: predicate.reset_defer_inline 1.04% : 0.000003s : 21: predicate.reshape_eliminate 0.55% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.29% : 0.000001s : 5: predicate.row_tensor_eliminate 0.65% : 0.000002s : 10: predicate.same_eliminate 0.40% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.65% : 0.000002s : 10: predicate.shard_identity_eliminate 0.61% : 0.000002s : 10: predicate.special_op_eliminate 0.64% : 0.000002s : 10: predicate.specialize_transform 0.72% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.66% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.75% : 0.000005s : 33: predicate.switch_defer_inline 2.14% : 0.000006s : 43: predicate.switch_layer_defer_inline 5.64% : 0.000016s : 101: predicate.switch_simplify 1.01% : 0.000003s : 21: predicate.tile_eliminate 1.05% : 0.000003s : 21: predicate.transpose_eliminate 1.66% : 0.000005s : 31: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000005s : 31: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 31: predicate.tuple_list_get_item_depend_reorder 3.02% : 0.000009s : 45: predicate.tuple_list_get_item_eliminator 1.64% : 0.000005s : 31: predicate.tuple_list_get_set_item_eliminator 2.20% : 0.000006s : 41: predicate.tuple_list_set_item_eliminator 1.75% : 0.000005s : 35: predicate.tuple_to_list_eliminator_ 2.51% : 0.000007s : 56: predicate.updatestate_pure_node_eliminater 3.13% : 0.000009s : 66: predicate.updatestate_useless_node_eliminater 0.30% : 0.000001s : 5: predicate.value_based_eliminate 0.55% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.52% : 0.000002s : 10: predicate.virtual_output_eliminate 0.24% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.31% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001024 16 59.25% : 0.000607s : 6: func_graph_cloner_run.FuncGraphClonerGraph 40.75% : 0.000417s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.052510 192 0.01% : 0.000003s : 1: ForceFp32Comm 6.13% : 0.003221s : 1: add_attr 6.11% : 0.003211s : 1: add_attr_with_inline 0.01% : 0.000003s : 1: add_comm_op_reuse_tag 0.11% : 0.000057s : 1: add_recomputation 0.01% : 0.000005s : 1: assign_add_opt 0.20% : 0.000105s : 1: auto_monad 0.05% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.88% : 0.000462s : 1: bootstrap 0.06% : 0.000029s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000020s : 1: control_data_broadcast_order 0.02% : 0.000010s : 1: convert_after_rewriter 0.05% : 0.000028s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.19% : 0.000100s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000012s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.89% : 0.000467s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.02% : 0.000537s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.03% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000017s : 1: opt.transform.mutable_eliminate 3.43% : 0.001801s : 78: opt.transform.opt_a 0.07% : 0.000039s : 1: opt.transform.opt_after_cconv 0.06% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.26% : 0.000136s : 28: opt.transform.opt_b 0.11% : 0.000060s : 2: opt.transform.opt_trans_graph 0.09% : 0.000049s : 4: opt.transform.symbol_engine_opt 6.96% : 0.003655s : 1: opt_a 0.23% : 0.000123s : 1: opt_after_cconv 0.95% : 0.000498s : 1: opt_after_jit_grad 0.48% : 0.000250s : 1: opt_b 11.23% : 0.005894s : 1: optimize 0.04% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.05% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.04% : 0.000021s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000006s : 1: parallel-infer-symbol 0.01% : 0.000003s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.09% : 0.000049s : 1: pre_auto_parallel 0.07% : 0.000039s : 1: py_interpret_to_execute 0.03% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000020s : 1: remove_dup_value 0.96% : 0.000503s : 1: renormalize.infer 0.86% : 0.000451s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000044s : 1: rewriter_after_opt_a 0.23% : 0.000121s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.18% : 0.000094s : 1: symbol_engine_optimizer 0.17% : 0.000091s : 1: tuple_transform 57.01% : 0.029934s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:21.715.361 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:21.715.631 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0593533, [21] [bootstrap]: 0.00044558 [type_inference]: 0.030286 [event_method]: 0.00010018 [auto_monad]: 0.00011018 [graph_reusing]: 8.18999e-06 [inline]: 3.16001e-06 [add_attr]: 0.00354651, [1] [add_attr_with_inline]: 0.00353719, [1] [Cycle 1]: 9.591e-05, [2] [tag_attr]: 3.079e-05 [meta_addattr_fg_expand]: 8.74998e-06 [parallel-infer-symbol]: 3.25998e-06 [pre_auto_parallel]: 4.627e-05 [insert-virtual-dataset]: 2.26e-06 [parallel-infer-symbol-second]: 8.80013e-07 [dataset_repeat_opt]: 2.17001e-06 [pipeline_split]: 1.87001e-06 [optimize]: 0.0233535, [53] [py_interpret_to_execute]: 4.409e-05 [rewriter_before_opt_a]: 0.00013094 [opt_a]: 0.00427867, [2] [Cycle 1]: 0.00322416, [45] [expand_dump_flag]: 5.39e-06 [switch_simplify]: 0.00013041 [loop_unroll]: 4.509e-05 [a_1]: 0.00096431 [with_stream_mark]: 1.784e-05 [recompute_prepare]: 1.182e-05 [updatestate_depend_eliminate]: 5.47001e-06 [updatestate_assign_eliminate]: 4.80999e-06 [updatestate_loads_eliminate]: 4.90001e-06 [parameter_eliminate]: 2.09e-06 [a_2]: 0.00015328 [accelerated_algorithm]: 9.79e-06 [shard]: 1.60001e-06 [meta_shard_fg_expand]: 2.59999e-06 [shard_inline]: 9.15999e-06 [merge_send_recv]: 1.004e-05 [auto_parallel]: 8.38999e-06 [parallel]: 1.86e-05 [flash_sp]: 8.79e-06 [merge_comm]: 5.12999e-06 [allreduce_fusion]: 4.82998e-06 [matmul_add_comm_reduction]: 1.145e-05 [allreduce_slice_to_reducescatter]: 9.99979e-07 [virtual_shard_identity]: 1.054e-05 [virtual_dataset]: 9.76003e-06 [get_grad_eliminate_]: 8.80999e-06 [virtual_output]: 8.91002e-06 [merge_forward]: 4.90001e-06 [cell_reuse_recompute_pass]: 1.66002e-06 [offload_activation]: 3.832e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.148e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.594e-05 [set_forward_comm_id_for_comm_node_pass]: 5.58002e-06 [meta_fg_expand]: 4.45999e-06 [flash_sp_send_recv_attached]: 2.67001e-06 [receive_attached]: 1.91e-06 [after_resolve]: 1.425e-05 [a_after_grad]: 1.47e-05 [renormalize]: 0.00107549 [add_forward_monad_depend]: 5.61e-06 [auto_monad_grad]: 2.17001e-06 [auto_monad_eliminator]: 1.937e-05 [cse]: 4.104e-05 [a_3]: 8.197e-05 [Cycle 2]: 0.00104034, [45] [expand_dump_flag]: 1.30999e-06 [switch_simplify]: 1.044e-05 [loop_unroll]: 8.84e-06 [a_1]: 0.00021931 [with_stream_mark]: 1.256e-05 [recompute_prepare]: 9.20001e-06 [updatestate_depend_eliminate]: 4.58999e-06 [updatestate_assign_eliminate]: 3.76999e-06 [updatestate_loads_eliminate]: 3.83001e-06 [parameter_eliminate]: 1.12e-06 [a_2]: 0.00014041 [accelerated_algorithm]: 8.92e-06 [shard]: 1.40999e-06 [meta_shard_fg_expand]: 2.12999e-06 [shard_inline]: 9.27001e-06 [merge_send_recv]: 6.79001e-06 [auto_parallel]: 6.78e-06 [parallel]: 5.92999e-06 [flash_sp]: 3.4e-06 [merge_comm]: 5.04e-06 [allreduce_fusion]: 4.82998e-06 [matmul_add_comm_reduction]: 8.43001e-06 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 9.59e-06 [virtual_dataset]: 8.67e-06 [get_grad_eliminate_]: 8.70999e-06 [virtual_output]: 8.23001e-06 [merge_forward]: 3.87998e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 8.38001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.922e-05 [merge_recompute_call_nodes]: 6.69999e-07 [before_grad]: 1.408e-05 [set_forward_comm_id_for_comm_node_pass]: 5.39998e-06 [meta_fg_expand]: 3.6e-06 [flash_sp_send_recv_attached]: 8.30012e-07 [receive_attached]: 1.20001e-06 [after_resolve]: 1.293e-05 [a_after_grad]: 1.403e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.40999e-06 [auto_monad_grad]: 1.09998e-06 [auto_monad_eliminator]: 1.021e-05 [cse]: 2.414e-05 [a_3]: 6.965e-05 [py_interpret_to_execute_after_opt_a]: 1.545e-05 [slice_cell_reuse_recomputed_activation]: 5.09e-06 [rewriter_after_opt_a]: 5.199e-05 [convert_after_rewriter]: 1.181e-05 [order_py_execute_after_rewriter]: 1.004e-05 [mutable_eliminate]: 0.00058199 [opt_b]: 0.00034905, [1] [Cycle 1]: 0.00034022, [7] [b_1]: 0.00022917 [b_2]: 1.097e-05 [updatestate_depend_eliminate]: 6.71999e-06 [updatestate_assign_eliminate]: 3.56001e-06 [updatestate_loads_eliminate]: 3.36999e-06 [renormalize]: 5.00004e-07 [cse]: 2.936e-05 [optimize_parallel_all_gather_comm]: 2.188e-05 [overlap_param_gather]: 4.95999e-06 [cconv]: 2.942e-05 [loop_unroll]: 0.0166895 [opt_after_cconv]: 0.00021323, [1] [Cycle 1]: 0.00020057, [7] [c_1]: 5.115e-05 [parameter_eliminate]: 7.28e-06 [updatestate_depend_eliminate]: 1.433e-05 [updatestate_assign_eliminate]: 4.57e-06 [updatestate_loads_eliminate]: 5.44e-06 [cse]: 5.707e-05 [renormalize]: 1.22e-06 [remove_dup_value]: 5.987e-05 [tuple_transform]: 0.00012761, [1] [Cycle 1]: 0.00012018, [4] [d_1]: 7.575e-05 [none_parameter_eliminate]: 1.57999e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 1.16e-05 [partial_unused_args_eliminate]: 5.10001e-06 [add_recomputation]: 7.582e-05 [cse_after_recomputation]: 3.533e-05, [1] [Cycle 1]: 2.857e-05, [1] [cse]: 1.902e-05 [environ_conv]: 1.024e-05 [swap_dp_allreduce_reducescatter]: 9.13002e-06 [bias_add_comm_swap]: 5.76e-06 [label_micro_interleaved_index]: 1.063e-05 [label_fine_grained_interleaved_index]: 6.19999e-06 [merge_cast_opt]: 3.5e-06 [slice_recompute_activation]: 4.58999e-06 [micro_interleaved_order_control]: 6.09999e-06 [assign_add_opt]: 3.73999e-06 [ForceFp32Comm]: 3.26001e-06 [remove_cast_before_assign_add]: 4.11001e-06 [full_micro_interleaved_order_control]: 4.31002e-06 [reorder_send_recv_between_fp_bp]: 5.91e-06 [comm_op_add_attrs]: 3.56001e-06 [add_comm_op_reuse_tag]: 3.21001e-06 [interleave_split_concat_branches]: 4.391e-05 [interleave_parallel_branches]: 3.93001e-06 [overlap_opt_shard_in_pipeline]: 3.51001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.51002e-06 [control_data_broadcast_order]: 2.476e-05 [grouped_pairwise_exchange_alltoall]: 4.03999e-06 [offloading_packed_experts]: 8.89998e-06 [overlap_recompute_and_grad_model_parallel]: 8.62e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.88001e-06 [overlap_recompute_allgather_and_fa_grad]: 4.05e-06 [overlap_recompute_comm]: 5.51e-06 [overlap_grad_ring_attention]: 8.62998e-06 [overlap_grad_flash_sp]: 3.08e-05 [begin_end_overlap_inline]: 3.18e-06 [split_matmul_comm_elemetwise]: 4.58999e-06 [split_layernorm_comm]: 4.69002e-06 [handle_group_info]: 3.34001e-06 [symbol_engine_optimizer]: 0.00012379, [1] [Cycle 1]: 0.00011677, [6] [build]: 4.57e-06 [elim_shapecalc]: 1.554e-05 [elim_not_effective]: 1.956e-05 [opt_reshape]: 1.153e-05 [fold_const_symbol]: 1.616e-05 [renormalize]: 2.19996e-07 [detach_backward]: 3.88999e-06 [pipeline_parallel_scheduler]: 1.74998e-06 [auto_monad_reorder]: 2.51e-05 [get_jit_bprop_graph]: 1.97001e-06 [rewriter_after_jit_bprop_graph]: 7.10998e-06 [opt_after_jit_grad]: 0.00063808 [validate]: 5.116e-05 Sums bootstrap : 0.000446s : 0.83% type_inference : 0.030286s : 56.18% event_method : 0.000100s : 0.19% auto_monad : 0.000110s : 0.20% graph_reusing : 0.000008s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000031s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000046s : 0.09% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000044s : 0.08% optimize.rewriter_before_opt_a : 0.000131s : 0.24% optimize.opt_a.expand_dump_flag : 0.000007s : 0.01% optimize.opt_a.switch_simplify : 0.000141s : 0.26% optimize.opt_a.loop_unroll : 0.000054s : 0.10% optimize.opt_a.a_1 : 0.001184s : 2.20% optimize.opt_a.with_stream_mark : 0.000030s : 0.06% optimize.opt_a.recompute_prepare : 0.000021s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000294s : 0.54% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.03% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000018s : 0.03% optimize.opt_a.merge_send_recv : 0.000017s : 0.03% optimize.opt_a.auto_parallel : 0.000015s : 0.03% optimize.opt_a.parallel : 0.000025s : 0.05% optimize.opt_a.flash_sp : 0.000012s : 0.02% optimize.opt_a.merge_comm : 0.000010s : 0.02% optimize.opt_a.allreduce_fusion : 0.000010s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.04% optimize.opt_a.virtual_dataset : 0.000018s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.03% optimize.opt_a.virtual_output : 0.000017s : 0.03% optimize.opt_a.merge_forward : 0.000009s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000047s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000041s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000030s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.02% optimize.opt_a.meta_fg_expand : 0.000008s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000027s : 0.05% optimize.opt_a.a_after_grad : 0.000029s : 0.05% optimize.opt_a.renormalize : 0.001076s : 2.00% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.05% optimize.opt_a.cse : 0.000065s : 0.12% optimize.opt_a.a_3 : 0.000152s : 0.28% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000052s : 0.10% optimize.convert_after_rewriter : 0.000012s : 0.02% optimize.order_py_execute_after_rewriter : 0.000010s : 0.02% optimize.mutable_eliminate : 0.000582s : 1.08% optimize.opt_b.b_1 : 0.000229s : 0.43% optimize.opt_b.b_2 : 0.000011s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000029s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.04% optimize.overlap_param_gather : 0.000005s : 0.01% optimize.cconv : 0.000029s : 0.05% optimize.loop_unroll : 0.016690s : 30.96% optimize.opt_after_cconv.c_1 : 0.000051s : 0.09% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000014s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.cse : 0.000057s : 0.11% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000060s : 0.11% optimize.tuple_transform.d_1 : 0.000076s : 0.14% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000012s : 0.02% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000076s : 0.14% optimize.cse_after_recomputation.cse : 0.000019s : 0.04% optimize.environ_conv : 0.000010s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000011s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000003s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000006s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000004s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000044s : 0.08% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.01% optimize.control_data_broadcast_order : 0.000025s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000009s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000006s : 0.01% optimize.overlap_grad_ring_attention : 0.000009s : 0.02% optimize.overlap_grad_flash_sp : 0.000031s : 0.06% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000005s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000012s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000025s : 0.05% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000638s : 1.18% validate : 0.000051s : 0.09% Time group info: ------[substitution.] 0.000274 54 11.60% : 0.000032s : 6: substitution.cast_eliminate 1.00% : 0.000003s : 4: substitution.elim_not_effective 0.87% : 0.000002s : 4: substitution.fold_const_symbol 3.01% : 0.000008s : 6: substitution.graph_param_transform 68.46% : 0.000188s : 8: substitution.inline 1.78% : 0.000005s : 8: substitution.j_node_and_user_rematch 3.02% : 0.000008s : 8: substitution.remove_not_recompute_node 1.62% : 0.000004s : 4: substitution.replace_old_param 3.72% : 0.000010s : 2: substitution.switch_simplify 4.91% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.030207 2 94.32% : 0.028490s : 1: type_inference.infer 5.68% : 0.001716s : 1: type_inference.specialize ------[replace.] 0.000126 14 42.03% : 0.000053s : 8: replace.inline 37.91% : 0.000048s : 2: replace.switch_simplify 20.06% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000203 14 89.96% : 0.000183s : 8: match.inline 4.48% : 0.000009s : 2: match.switch_simplify 5.56% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000327 1972 0.95% : 0.000003s : 21: predicate.accumulaten_eliminater 0.61% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.52% : 0.000002s : 12: predicate.addn_check_dump 1.13% : 0.000004s : 21: predicate.addn_zero_filter 0.93% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.08% : 0.000007s : 33: predicate.arithmetic_simplify 1.06% : 0.000003s : 21: predicate.cast_eliminate 0.63% : 0.000002s : 12: predicate.check_bprop_eliminate 0.53% : 0.000002s : 12: predicate.compare_switch_simplify 0.18% : 0.000001s : 6: predicate.const_output_eliminate 0.55% : 0.000002s : 12: predicate.depend_value_elim 1.06% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.15% : 0.000004s : 21: predicate.dict_get_item_eliminator 0.90% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.76% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 6: predicate.elim_not_effective 0.36% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.09% : 0.000004s : 27: predicate.environ_get_add_eliminate 1.10% : 0.000004s : 27: predicate.environ_get_depend_swap 1.95% : 0.000006s : 39: predicate.environ_get_eliminate 1.11% : 0.000004s : 27: predicate.environ_get_set_eliminate 1.49% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.29% : 0.000007s : 33: predicate.float_depend_g_call 0.53% : 0.000002s : 12: predicate.float_environ_get_switch 0.77% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 6: predicate.fold_const_symbol 0.59% : 0.000002s : 12: predicate.get_grad_eliminate 0.24% : 0.000001s : 6: predicate.graph_param_transform 0.59% : 0.000002s : 12: predicate.incorporate_call 0.51% : 0.000002s : 12: predicate.incorporate_call_switch 6.54% : 0.000021s : 90: predicate.inline 0.79% : 0.000003s : 12: predicate.inline_without_move 0.29% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.74% : 0.000002s : 12: predicate.less_batch_normalization 1.89% : 0.000006s : 37: predicate.list_to_tuple_eliminator_ 2.52% : 0.000008s : 58: predicate.load_eliminater 1.97% : 0.000006s : 6: predicate.loop_unroll_after_grad 2.30% : 0.000008s : 51: predicate.loop_unroll_before_grad 1.68% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.55% : 0.000002s : 12: predicate.merge_addn 0.55% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.54% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.84% : 0.000003s : 21: predicate.minmaximum_grad 0.77% : 0.000003s : 6: predicate.mutable_eliminate 0.39% : 0.000001s : 6: predicate.opt_reshape 0.29% : 0.000001s : 6: predicate.parallel_virtual_node 1.81% : 0.000006s : 33: predicate.partial_defer_inline 1.61% : 0.000005s : 31: predicate.partial_eliminate 0.96% : 0.000003s : 21: predicate.print_const_string_wrapper 0.55% : 0.000002s : 12: predicate.reduce_all_const_elim 1.21% : 0.000004s : 21: predicate.reduce_eliminate 2.73% : 0.000009s : 58: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 12: predicate.remove_not_recompute_node 1.24% : 0.000004s : 37: predicate.replace_applicator 0.42% : 0.000001s : 12: predicate.replace_old_param 0.23% : 0.000001s : 6: predicate.reset_defer_inline 0.99% : 0.000003s : 21: predicate.reshape_eliminate 0.56% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 6: predicate.row_tensor_eliminate 0.65% : 0.000002s : 12: predicate.same_eliminate 0.37% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.66% : 0.000002s : 12: predicate.shard_identity_eliminate 0.68% : 0.000002s : 12: predicate.special_op_eliminate 0.72% : 0.000002s : 12: predicate.specialize_transform 0.74% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.59% : 0.000005s : 33: predicate.switch_defer_inline 2.03% : 0.000007s : 45: predicate.switch_layer_defer_inline 5.44% : 0.000018s : 106: predicate.switch_simplify 0.88% : 0.000003s : 21: predicate.tile_eliminate 1.00% : 0.000003s : 21: predicate.transpose_eliminate 1.52% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.69% : 0.000006s : 33: predicate.tuple_list_get_item_const_eliminator 1.65% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 3.07% : 0.000010s : 49: predicate.tuple_list_get_item_eliminator 1.72% : 0.000006s : 33: predicate.tuple_list_get_set_item_eliminator 2.38% : 0.000008s : 45: predicate.tuple_list_set_item_eliminator 1.66% : 0.000005s : 37: predicate.tuple_to_list_eliminator_ 2.44% : 0.000008s : 58: predicate.updatestate_pure_node_eliminater 3.08% : 0.000010s : 70: predicate.updatestate_useless_node_eliminater 0.30% : 0.000001s : 6: predicate.value_based_eliminate 0.61% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.58% : 0.000002s : 12: predicate.virtual_output_eliminate 0.26% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001059 16 51.86% : 0.000549s : 6: func_graph_cloner_run.FuncGraphClonerGraph 48.14% : 0.000510s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.089535 192 0.01% : 0.000006s : 1: ForceFp32Comm 3.97% : 0.003557s : 1: add_attr 3.96% : 0.003541s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.09% : 0.000080s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.14% : 0.000123s : 1: auto_monad 0.04% : 0.000033s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000009s : 1: bias_add_comm_swap 0.55% : 0.000490s : 1: bootstrap 0.04% : 0.000033s : 1: cconv 0.01% : 0.000007s : 1: comm_op_add_attrs 0.03% : 0.000028s : 1: control_data_broadcast_order 0.02% : 0.000015s : 1: convert_after_rewriter 0.04% : 0.000038s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000023s : 1: detach_backward 0.01% : 0.000013s : 1: environ_conv 0.13% : 0.000116s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000015s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000011s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000007s : 1: interleave_parallel_branches 0.05% : 0.000047s : 1: interleave_split_concat_branches 0.01% : 0.000009s : 1: label_fine_grained_interleaved_index 0.01% : 0.000013s : 1: label_micro_interleaved_index 18.65% : 0.016703s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000009s : 1: micro_interleaved_order_control 0.66% : 0.000589s : 1: mutable_eliminate 0.01% : 0.000013s : 1: offloading_packed_experts 0.05% : 0.000043s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000018s : 1: opt.transform.mutable_eliminate 2.13% : 0.001904s : 78: opt.transform.opt_a 0.06% : 0.000049s : 1: opt.transform.opt_after_cconv 0.04% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.19% : 0.000168s : 28: opt.transform.opt_b 0.09% : 0.000085s : 2: opt.transform.opt_trans_graph 0.07% : 0.000059s : 4: opt.transform.symbol_engine_opt 4.78% : 0.004282s : 1: opt_a 0.24% : 0.000218s : 1: opt_after_cconv 0.72% : 0.000649s : 1: opt_after_jit_grad 0.39% : 0.000353s : 1: opt_b 26.57% : 0.023787s : 1: optimize 0.03% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000013s : 1: order_py_execute_after_rewriter 0.04% : 0.000034s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000012s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000009s : 1: overlap_recompute_comm 0.01% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000008s : 1: pipeline_split 0.06% : 0.000054s : 1: pre_auto_parallel 0.05% : 0.000048s : 1: py_interpret_to_execute 0.02% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.07% : 0.000064s : 1: remove_dup_value 0.65% : 0.000579s : 1: renormalize.infer 0.54% : 0.000488s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000056s : 1: rewriter_after_opt_a 0.15% : 0.000135s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000008s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.14% : 0.000127s : 1: symbol_engine_optimizer 0.15% : 0.000131s : 1: tuple_transform 33.88% : 0.030338s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:22.165.833 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0582618, [21] [bootstrap]: 0.00043444 [type_inference]: 0.0305739 [event_method]: 9.205e-05 [auto_monad]: 0.00010093 [graph_reusing]: 7.56001e-06 [inline]: 2.69999e-06 [add_attr]: 0.00344679, [1] [add_attr_with_inline]: 0.00343638, [1] [Cycle 1]: 7.733e-05, [2] [tag_attr]: 3.107e-05 [meta_addattr_fg_expand]: 8.42e-06 [parallel-infer-symbol]: 3.68999e-06 [pre_auto_parallel]: 4.429e-05 [insert-virtual-dataset]: 2.34999e-06 [parallel-infer-symbol-second]: 9.39996e-07 [dataset_repeat_opt]: 1.87001e-06 [pipeline_split]: 1.55999e-06 [optimize]: 0.0225448, [53] [py_interpret_to_execute]: 3.583e-05 [rewriter_before_opt_a]: 0.00012305 [opt_a]: 0.0039091, [2] [Cycle 1]: 0.00299802, [45] [expand_dump_flag]: 3.95998e-06 [switch_simplify]: 0.0001261 [loop_unroll]: 4.353e-05 [a_1]: 0.00094269 [with_stream_mark]: 1.658e-05 [recompute_prepare]: 1.182e-05 [updatestate_depend_eliminate]: 5.72001e-06 [updatestate_assign_eliminate]: 4.31002e-06 [updatestate_loads_eliminate]: 4.00998e-06 [parameter_eliminate]: 2.43e-06 [a_2]: 0.00012435 [accelerated_algorithm]: 9.90002e-06 [shard]: 2.02999e-06 [meta_shard_fg_expand]: 2.58003e-06 [shard_inline]: 9.49e-06 [merge_send_recv]: 1.005e-05 [auto_parallel]: 7.5e-06 [parallel]: 3.015e-05 [flash_sp]: 9.32001e-06 [merge_comm]: 6.09001e-06 [allreduce_fusion]: 5.03002e-06 [matmul_add_comm_reduction]: 1.111e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 1.162e-05 [virtual_dataset]: 9.20001e-06 [get_grad_eliminate_]: 9.54999e-06 [virtual_output]: 9.91998e-06 [merge_forward]: 4.94e-06 [cell_reuse_recompute_pass]: 1.87001e-06 [offload_activation]: 1.28e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.958e-05 [merge_recompute_call_nodes]: 1.72999e-06 [before_grad]: 1.647e-05 [set_forward_comm_id_for_comm_node_pass]: 5.49998e-06 [meta_fg_expand]: 4.52e-06 [flash_sp_send_recv_attached]: 2.98998e-06 [receive_attached]: 2.78e-06 [after_resolve]: 1.49e-05 [a_after_grad]: 1.56e-05 [renormalize]: 0.00105935 [add_forward_monad_depend]: 5.76998e-06 [auto_monad_grad]: 1.96998e-06 [auto_monad_eliminator]: 1.96e-05 [cse]: 4.548e-05 [a_3]: 7.185e-05 [Cycle 2]: 0.00090043, [45] [expand_dump_flag]: 1.18001e-06 [switch_simplify]: 1.173e-05 [loop_unroll]: 9.25999e-06 [a_1]: 0.00022591 [with_stream_mark]: 1.297e-05 [recompute_prepare]: 9.51998e-06 [updatestate_depend_eliminate]: 4.89e-06 [updatestate_assign_eliminate]: 3.91001e-06 [updatestate_loads_eliminate]: 3.9e-06 [parameter_eliminate]: 1.29e-06 [a_2]: 0.00011601 [accelerated_algorithm]: 9.02e-06 [shard]: 1.29e-06 [meta_shard_fg_expand]: 2.05002e-06 [shard_inline]: 8.99003e-06 [merge_send_recv]: 7.2e-06 [auto_parallel]: 7.64002e-06 [parallel]: 5.63002e-06 [flash_sp]: 3.58999e-06 [merge_comm]: 5.30999e-06 [allreduce_fusion]: 5.04e-06 [matmul_add_comm_reduction]: 8.27998e-06 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 1.029e-05 [virtual_dataset]: 8.65999e-06 [get_grad_eliminate_]: 9.20001e-06 [virtual_output]: 8.94998e-06 [merge_forward]: 4.4e-06 [cell_reuse_recompute_pass]: 1.87999e-06 [offload_activation]: 8.94e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.78e-05 [merge_recompute_call_nodes]: 9.80013e-07 [before_grad]: 1.475e-05 [set_forward_comm_id_for_comm_node_pass]: 5.38002e-06 [meta_fg_expand]: 3.51001e-06 [flash_sp_send_recv_attached]: 8.80013e-07 [receive_attached]: 1.19e-06 [after_resolve]: 1.396e-05 [a_after_grad]: 1.41e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.42e-06 [auto_monad_grad]: 1.05999e-06 [auto_monad_eliminator]: 1.102e-05 [cse]: 2.341e-05 [a_3]: 5.85e-05 [py_interpret_to_execute_after_opt_a]: 1.168e-05 [slice_cell_reuse_recomputed_activation]: 2.18002e-06 [rewriter_after_opt_a]: 4.655e-05 [convert_after_rewriter]: 8.52e-06 [order_py_execute_after_rewriter]: 7.15e-06 [mutable_eliminate]: 0.00054321 [opt_b]: 0.00028339, [1] [Cycle 1]: 0.00027689, [7] [b_1]: 0.00018597 [b_2]: 1.052e-05 [updatestate_depend_eliminate]: 7.15e-06 [updatestate_assign_eliminate]: 3.76999e-06 [updatestate_loads_eliminate]: 3.71001e-06 [renormalize]: 3.30008e-07 [cse]: 2.893e-05 [optimize_parallel_all_gather_comm]: 1.898e-05 [overlap_param_gather]: 1.99999e-06 [cconv]: 2.467e-05 [loop_unroll]: 0.00044163 [opt_after_cconv]: 0.016252, [1] [Cycle 1]: 0.0162437, [7] [c_1]: 4.56e-05 [parameter_eliminate]: 2.78e-06 [updatestate_depend_eliminate]: 6.84999e-06 [updatestate_assign_eliminate]: 0.0160389 [updatestate_loads_eliminate]: 8.57998e-06 [cse]: 6.14e-05 [renormalize]: 1.15001e-06 [remove_dup_value]: 6.23e-05 [tuple_transform]: 0.00014069, [1] [Cycle 1]: 0.0001314, [4] [d_1]: 9.368e-05 [none_parameter_eliminate]: 4.65999e-06 [renormalize]: 2.79979e-07 [switch_simplify]: 1.085e-05 [partial_unused_args_eliminate]: 2.37999e-06 [add_recomputation]: 8.005e-05 [cse_after_recomputation]: 3.165e-05, [1] [Cycle 1]: 2.544e-05, [1] [cse]: 1.861e-05 [environ_conv]: 8.08001e-06 [swap_dp_allreduce_reducescatter]: 6.86999e-06 [bias_add_comm_swap]: 3.44001e-06 [label_micro_interleaved_index]: 7.58001e-06 [label_fine_grained_interleaved_index]: 3.04001e-06 [merge_cast_opt]: 1.26997e-06 [slice_recompute_activation]: 2.43e-06 [micro_interleaved_order_control]: 2.44999e-06 [assign_add_opt]: 1.40001e-06 [ForceFp32Comm]: 1.18001e-06 [remove_cast_before_assign_add]: 1.09e-06 [full_micro_interleaved_order_control]: 2.04999e-06 [reorder_send_recv_between_fp_bp]: 2.78998e-06 [comm_op_add_attrs]: 1.02998e-06 [add_comm_op_reuse_tag]: 1.34e-06 [interleave_split_concat_branches]: 1.27999e-06 [interleave_parallel_branches]: 1.05999e-06 [overlap_opt_shard_in_pipeline]: 1.29998e-06 [overlap_opt_shard_grad_in_pipeline]: 2.50002e-06 [control_data_broadcast_order]: 2.298e-05 [grouped_pairwise_exchange_alltoall]: 1.86e-06 [offloading_packed_experts]: 4.62e-06 [overlap_recompute_and_grad_model_parallel]: 5.73002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.25999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.86e-06 [overlap_recompute_comm]: 2.31e-06 [overlap_grad_ring_attention]: 5.12e-06 [overlap_grad_flash_sp]: 2.7e-05 [begin_end_overlap_inline]: 6.59988e-07 [split_matmul_comm_elemetwise]: 2.07001e-06 [split_layernorm_comm]: 1.60999e-06 [handle_group_info]: 1.22e-06 [symbol_engine_optimizer]: 0.00010494, [1] [Cycle 1]: 9.976e-05, [6] [build]: 5.64e-06 [elim_shapecalc]: 1.628e-05 [elim_not_effective]: 1.983e-05 [opt_reshape]: 1.062e-05 [fold_const_symbol]: 1.568e-05 [renormalize]: 2.3999e-07 [detach_backward]: 2.32999e-06 [pipeline_parallel_scheduler]: 1.60999e-06 [auto_monad_reorder]: 2.198e-05 [get_jit_bprop_graph]: 2.56998e-06 [rewriter_after_jit_bprop_graph]: 6.36998e-06 [opt_after_jit_grad]: 0.00072993 [validate]: 5.213e-05 Sums bootstrap : 0.000434s : 0.81% type_inference : 0.030574s : 56.95% event_method : 0.000092s : 0.17% auto_monad : 0.000101s : 0.19% graph_reusing : 0.000008s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000031s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000044s : 0.08% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000036s : 0.07% optimize.rewriter_before_opt_a : 0.000123s : 0.23% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000138s : 0.26% optimize.opt_a.loop_unroll : 0.000053s : 0.10% optimize.opt_a.a_1 : 0.001169s : 2.18% optimize.opt_a.with_stream_mark : 0.000030s : 0.06% optimize.opt_a.recompute_prepare : 0.000021s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000240s : 0.45% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.04% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000018s : 0.03% optimize.opt_a.merge_send_recv : 0.000017s : 0.03% optimize.opt_a.auto_parallel : 0.000015s : 0.03% optimize.opt_a.parallel : 0.000036s : 0.07% optimize.opt_a.flash_sp : 0.000013s : 0.02% optimize.opt_a.merge_comm : 0.000011s : 0.02% optimize.opt_a.allreduce_fusion : 0.000010s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.04% optimize.opt_a.virtual_dataset : 0.000018s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.03% optimize.opt_a.virtual_output : 0.000019s : 0.04% optimize.opt_a.merge_forward : 0.000009s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000022s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000031s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.02% optimize.opt_a.meta_fg_expand : 0.000008s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000029s : 0.05% optimize.opt_a.a_after_grad : 0.000030s : 0.06% optimize.opt_a.renormalize : 0.001059s : 1.97% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.06% optimize.opt_a.cse : 0.000069s : 0.13% optimize.opt_a.a_3 : 0.000130s : 0.24% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000047s : 0.09% optimize.convert_after_rewriter : 0.000009s : 0.02% optimize.order_py_execute_after_rewriter : 0.000007s : 0.01% optimize.mutable_eliminate : 0.000543s : 1.01% optimize.opt_b.b_1 : 0.000186s : 0.35% optimize.opt_b.b_2 : 0.000011s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000029s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000025s : 0.05% optimize.loop_unroll : 0.000442s : 0.82% optimize.opt_after_cconv.c_1 : 0.000046s : 0.08% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.016039s : 29.87% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000009s : 0.02% optimize.opt_after_cconv.cse : 0.000061s : 0.11% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000062s : 0.12% optimize.tuple_transform.d_1 : 0.000094s : 0.17% optimize.tuple_transform.none_parameter_eliminate : 0.000005s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000080s : 0.15% optimize.cse_after_recomputation.cse : 0.000019s : 0.03% optimize.environ_conv : 0.000008s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.00% optimize.control_data_broadcast_order : 0.000023s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000027s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000006s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.04% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000730s : 1.36% validate : 0.000052s : 0.10% Time group info: ------[substitution.] 0.000272 54 12.31% : 0.000034s : 6: substitution.cast_eliminate 1.05% : 0.000003s : 4: substitution.elim_not_effective 0.91% : 0.000002s : 4: substitution.fold_const_symbol 3.43% : 0.000009s : 6: substitution.graph_param_transform 67.63% : 0.000184s : 8: substitution.inline 1.85% : 0.000005s : 8: substitution.j_node_and_user_rematch 2.83% : 0.000008s : 8: substitution.remove_not_recompute_node 1.83% : 0.000005s : 4: substitution.replace_old_param 3.43% : 0.000009s : 2: substitution.switch_simplify 4.74% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.030497 2 94.98% : 0.028968s : 1: type_inference.infer 5.02% : 0.001530s : 1: type_inference.specialize ------[replace.] 0.000124 14 42.61% : 0.000053s : 8: replace.inline 38.19% : 0.000047s : 2: replace.switch_simplify 19.20% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000199 14 90.39% : 0.000180s : 8: match.inline 4.09% : 0.000008s : 2: match.switch_simplify 5.52% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000317 1972 0.91% : 0.000003s : 21: predicate.accumulaten_eliminater 0.68% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.52% : 0.000002s : 12: predicate.addn_check_dump 0.95% : 0.000003s : 21: predicate.addn_zero_filter 0.85% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.12% : 0.000007s : 33: predicate.arithmetic_simplify 1.09% : 0.000003s : 21: predicate.cast_eliminate 0.71% : 0.000002s : 12: predicate.check_bprop_eliminate 0.51% : 0.000002s : 12: predicate.compare_switch_simplify 0.18% : 0.000001s : 6: predicate.const_output_eliminate 0.56% : 0.000002s : 12: predicate.depend_value_elim 1.00% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.10% : 0.000003s : 21: predicate.dict_get_item_eliminator 1.05% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.89% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 6: predicate.elim_not_effective 0.39% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.12% : 0.000004s : 27: predicate.environ_get_add_eliminate 1.15% : 0.000004s : 27: predicate.environ_get_depend_swap 1.85% : 0.000006s : 39: predicate.environ_get_eliminate 1.12% : 0.000004s : 27: predicate.environ_get_set_eliminate 1.47% : 0.000005s : 33: predicate.exchange_switch_depend_value 2.39% : 0.000008s : 33: predicate.float_depend_g_call 0.56% : 0.000002s : 12: predicate.float_environ_get_switch 0.83% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.16% : 0.000001s : 6: predicate.fold_const_symbol 0.64% : 0.000002s : 12: predicate.get_grad_eliminate 0.35% : 0.000001s : 6: predicate.graph_param_transform 0.61% : 0.000002s : 12: predicate.incorporate_call 0.51% : 0.000002s : 12: predicate.incorporate_call_switch 6.63% : 0.000021s : 90: predicate.inline 1.00% : 0.000003s : 12: predicate.inline_without_move 0.32% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.71% : 0.000002s : 12: predicate.less_batch_normalization 1.86% : 0.000006s : 37: predicate.list_to_tuple_eliminator_ 2.49% : 0.000008s : 58: predicate.load_eliminater 0.73% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.39% : 0.000008s : 51: predicate.loop_unroll_before_grad 1.60% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.56% : 0.000002s : 12: predicate.merge_addn 0.56% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.88% : 0.000003s : 21: predicate.minmaximum_grad 0.89% : 0.000003s : 6: predicate.mutable_eliminate 0.39% : 0.000001s : 6: predicate.opt_reshape 0.33% : 0.000001s : 6: predicate.parallel_virtual_node 1.83% : 0.000006s : 33: predicate.partial_defer_inline 1.63% : 0.000005s : 31: predicate.partial_eliminate 0.90% : 0.000003s : 21: predicate.print_const_string_wrapper 0.52% : 0.000002s : 12: predicate.reduce_all_const_elim 1.21% : 0.000004s : 21: predicate.reduce_eliminate 2.53% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 12: predicate.remove_not_recompute_node 1.31% : 0.000004s : 37: predicate.replace_applicator 0.45% : 0.000001s : 12: predicate.replace_old_param 0.21% : 0.000001s : 6: predicate.reset_defer_inline 1.05% : 0.000003s : 21: predicate.reshape_eliminate 0.67% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 6: predicate.row_tensor_eliminate 0.66% : 0.000002s : 12: predicate.same_eliminate 0.41% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.69% : 0.000002s : 12: predicate.shard_identity_eliminate 0.74% : 0.000002s : 12: predicate.special_op_eliminate 0.72% : 0.000002s : 12: predicate.specialize_transform 0.78% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.56% : 0.000005s : 33: predicate.switch_defer_inline 2.07% : 0.000007s : 45: predicate.switch_layer_defer_inline 5.33% : 0.000017s : 106: predicate.switch_simplify 1.08% : 0.000003s : 21: predicate.tile_eliminate 1.05% : 0.000003s : 21: predicate.transpose_eliminate 1.57% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.60% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 3.02% : 0.000010s : 49: predicate.tuple_list_get_item_eliminator 1.49% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.77% : 0.000006s : 37: predicate.tuple_to_list_eliminator_ 2.42% : 0.000008s : 58: predicate.updatestate_pure_node_eliminater 3.11% : 0.000010s : 70: predicate.updatestate_useless_node_eliminater 0.30% : 0.000001s : 6: predicate.value_based_eliminate 0.57% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 12: predicate.virtual_output_eliminate 0.25% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001063 16 55.98% : 0.000595s : 6: func_graph_cloner_run.FuncGraphClonerGraph 44.02% : 0.000468s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.087488 192 0.00% : 0.000004s : 1: ForceFp32Comm 3.95% : 0.003452s : 1: add_attr 3.93% : 0.003441s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.10% : 0.000085s : 1: add_recomputation 0.01% : 0.000005s : 1: assign_add_opt 0.12% : 0.000109s : 1: auto_monad 0.03% : 0.000026s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.55% : 0.000478s : 1: bootstrap 0.03% : 0.000028s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000026s : 1: control_data_broadcast_order 0.01% : 0.000012s : 1: convert_after_rewriter 0.04% : 0.000035s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000007s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.12% : 0.000103s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.05% : 0.000044s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.51% : 0.000450s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.63% : 0.000551s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.02% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000020s : 1: opt.transform.mutable_eliminate 2.17% : 0.001894s : 78: opt.transform.opt_a 0.05% : 0.000044s : 1: opt.transform.opt_after_cconv 0.04% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.19% : 0.000165s : 28: opt.transform.opt_b 0.11% : 0.000099s : 2: opt.transform.opt_trans_graph 0.07% : 0.000058s : 4: opt.transform.symbol_engine_opt 4.47% : 0.003912s : 1: opt_a 18.58% : 0.016258s : 1: opt_after_cconv 0.85% : 0.000740s : 1: opt_after_jit_grad 0.33% : 0.000287s : 1: opt_b 25.78% : 0.022552s : 1: optimize 0.03% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000011s : 1: order_py_execute_after_rewriter 0.03% : 0.000030s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000006s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.06% : 0.000049s : 1: pre_auto_parallel 0.05% : 0.000040s : 1: py_interpret_to_execute 0.02% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000068s : 1: remove_dup_value 0.64% : 0.000564s : 1: renormalize.infer 0.56% : 0.000487s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000051s : 1: rewriter_after_opt_a 0.15% : 0.000127s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.12% : 0.000108s : 1: symbol_engine_optimizer 0.16% : 0.000144s : 1: tuple_transform 34.97% : 0.030598s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:22.590.883 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:22.591.154 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0583038, [21] [bootstrap]: 0.00041555 [type_inference]: 0.045891 [event_method]: 9.969e-05 [auto_monad]: 0.00010396 [graph_reusing]: 7.6e-06 [inline]: 3.29001e-06 [add_attr]: 0.00359562, [1] [add_attr_with_inline]: 0.00358575, [1] [Cycle 1]: 8.929e-05, [2] [tag_attr]: 3.06e-05 [meta_addattr_fg_expand]: 8.50001e-06 [parallel-infer-symbol]: 3.08e-06 [pre_auto_parallel]: 4.677e-05 [insert-virtual-dataset]: 2.46998e-06 [parallel-infer-symbol-second]: 9.20001e-07 [dataset_repeat_opt]: 2.07001e-06 [pipeline_split]: 1.69998e-06 [optimize]: 0.00684226, [53] [py_interpret_to_execute]: 4.086e-05 [rewriter_before_opt_a]: 0.00012759 [opt_a]: 0.00419763, [2] [Cycle 1]: 0.00322372, [45] [expand_dump_flag]: 4.09002e-06 [switch_simplify]: 0.0001257 [loop_unroll]: 4.232e-05 [a_1]: 0.0009241 [with_stream_mark]: 2.082e-05 [recompute_prepare]: 1.469e-05 [updatestate_depend_eliminate]: 5.46002e-06 [updatestate_assign_eliminate]: 3.78999e-06 [updatestate_loads_eliminate]: 3.4e-06 [parameter_eliminate]: 2.03002e-06 [a_2]: 0.00013561 [accelerated_algorithm]: 9.30001e-06 [shard]: 2.74001e-06 [meta_shard_fg_expand]: 2.48e-06 [shard_inline]: 8.17e-06 [merge_send_recv]: 1.063e-05 [auto_parallel]: 8.19002e-06 [parallel]: 1.935e-05 [flash_sp]: 1.013e-05 [merge_comm]: 5.97001e-06 [allreduce_fusion]: 4.92999e-06 [matmul_add_comm_reduction]: 1.226e-05 [allreduce_slice_to_reducescatter]: 6.40022e-07 [virtual_shard_identity]: 1.171e-05 [virtual_dataset]: 8.43001e-06 [get_grad_eliminate_]: 7.65998e-06 [virtual_output]: 8.48999e-06 [merge_forward]: 5.08002e-06 [cell_reuse_recompute_pass]: 1.61998e-06 [offload_activation]: 1.183e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.145e-05 [merge_recompute_call_nodes]: 1.99e-06 [before_grad]: 1.421e-05 [set_forward_comm_id_for_comm_node_pass]: 5.76998e-06 [meta_fg_expand]: 3.83001e-06 [flash_sp_send_recv_attached]: 2.97002e-06 [receive_attached]: 2.53e-06 [after_resolve]: 1.481e-05 [a_after_grad]: 1.306e-05 [renormalize]: 0.00113248 [add_forward_monad_depend]: 6.46999e-06 [auto_monad_grad]: 2.61e-06 [auto_monad_eliminator]: 1.959e-05 [cse]: 3.355e-05 [a_3]: 7.807e-05 [Cycle 2]: 0.00095957, [45] [expand_dump_flag]: 1.44e-06 [switch_simplify]: 1.082e-05 [loop_unroll]: 7.87e-06 [a_1]: 0.00018867 [with_stream_mark]: 1.441e-05 [recompute_prepare]: 8.13001e-06 [updatestate_depend_eliminate]: 4.51002e-06 [updatestate_assign_eliminate]: 3.57002e-06 [updatestate_loads_eliminate]: 3.01999e-06 [parameter_eliminate]: 1.29e-06 [a_2]: 0.00012395 [accelerated_algorithm]: 8.27e-06 [shard]: 1.94e-06 [meta_shard_fg_expand]: 1.96e-06 [shard_inline]: 8.33999e-06 [merge_send_recv]: 6.79001e-06 [auto_parallel]: 6.64999e-06 [parallel]: 6.24999e-06 [flash_sp]: 3.81999e-06 [merge_comm]: 4.32e-06 [allreduce_fusion]: 4.17998e-06 [matmul_add_comm_reduction]: 7.71999e-06 [allreduce_slice_to_reducescatter]: 5.79981e-07 [virtual_shard_identity]: 9.07999e-06 [virtual_dataset]: 7.53999e-06 [get_grad_eliminate_]: 7.18e-06 [virtual_output]: 7.31999e-06 [merge_forward]: 3.97998e-06 [cell_reuse_recompute_pass]: 1.85001e-06 [offload_activation]: 8.50999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.681e-05 [merge_recompute_call_nodes]: 1.09003e-06 [before_grad]: 1.276e-05 [set_forward_comm_id_for_comm_node_pass]: 4.60001e-06 [meta_fg_expand]: 3.14001e-06 [flash_sp_send_recv_attached]: 1.12e-06 [receive_attached]: 1.12e-06 [after_resolve]: 1.234e-05 [a_after_grad]: 1.139e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.39e-06 [auto_monad_grad]: 9.09989e-07 [auto_monad_eliminator]: 8.50999e-06 [cse]: 1.819e-05 [a_3]: 6.043e-05 [py_interpret_to_execute_after_opt_a]: 1.75e-05 [slice_cell_reuse_recomputed_activation]: 4.52998e-06 [rewriter_after_opt_a]: 4.467e-05 [convert_after_rewriter]: 1.038e-05 [order_py_execute_after_rewriter]: 8.92e-06 [mutable_eliminate]: 0.0005997 [opt_b]: 0.0003201, [1] [Cycle 1]: 0.00031077, [7] [b_1]: 0.00020744 [b_2]: 8.97999e-06 [updatestate_depend_eliminate]: 6.49999e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 2.99999e-06 [renormalize]: 5.89993e-07 [cse]: 2.368e-05 [optimize_parallel_all_gather_comm]: 2.274e-05 [overlap_param_gather]: 5.55001e-06 [cconv]: 3.149e-05 [loop_unroll]: 0.00047525 [opt_after_cconv]: 0.00014209, [1] [Cycle 1]: 0.00013289, [7] [c_1]: 3.999e-05 [parameter_eliminate]: 3.2e-06 [updatestate_depend_eliminate]: 5.89e-06 [updatestate_assign_eliminate]: 3.3e-06 [updatestate_loads_eliminate]: 2.84999e-06 [cse]: 2.179e-05 [renormalize]: 2.49973e-07 [remove_dup_value]: 1.869e-05 [tuple_transform]: 0.00010429, [1] [Cycle 1]: 9.759e-05, [4] [d_1]: 5.658e-05 [none_parameter_eliminate]: 1.67999e-06 [renormalize]: 2.29978e-07 [switch_simplify]: 8.19998e-06 [partial_unused_args_eliminate]: 4.68999e-06 [add_recomputation]: 5.857e-05 [cse_after_recomputation]: 3.122e-05, [1] [Cycle 1]: 2.46e-05, [1] [cse]: 1.553e-05 [environ_conv]: 8.90999e-06 [swap_dp_allreduce_reducescatter]: 8.69e-06 [bias_add_comm_swap]: 5.82999e-06 [label_micro_interleaved_index]: 7.33e-06 [label_fine_grained_interleaved_index]: 5.61003e-06 [merge_cast_opt]: 3.55e-06 [slice_recompute_activation]: 4.37998e-06 [micro_interleaved_order_control]: 5.22999e-06 [assign_add_opt]: 3.65e-06 [ForceFp32Comm]: 3.2e-06 [remove_cast_before_assign_add]: 3.48e-06 [full_micro_interleaved_order_control]: 4.60001e-06 [reorder_send_recv_between_fp_bp]: 5.72999e-06 [comm_op_add_attrs]: 3.51001e-06 [add_comm_op_reuse_tag]: 3.56001e-06 [interleave_split_concat_branches]: 3.57997e-06 [interleave_parallel_branches]: 3.34001e-06 [overlap_opt_shard_in_pipeline]: 3.53999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.42998e-06 [control_data_broadcast_order]: 1.953e-05 [grouped_pairwise_exchange_alltoall]: 3.85998e-06 [offloading_packed_experts]: 6.84999e-06 [overlap_recompute_and_grad_model_parallel]: 7.48e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.63999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.71999e-06 [overlap_recompute_comm]: 4.79e-06 [overlap_grad_ring_attention]: 7.10998e-06 [overlap_grad_flash_sp]: 2.389e-05 [begin_end_overlap_inline]: 2.99001e-06 [split_matmul_comm_elemetwise]: 4.57998e-06 [split_layernorm_comm]: 4.31002e-06 [handle_group_info]: 3.5e-06 [symbol_engine_optimizer]: 0.0001542, [1] [Cycle 1]: 0.00010894, [6] [build]: 3.85998e-06 [elim_shapecalc]: 1.344e-05 [elim_not_effective]: 1.761e-05 [opt_reshape]: 9.94001e-06 [fold_const_symbol]: 1.35e-05 [renormalize]: 3.9002e-07 [detach_backward]: 4.4e-06 [pipeline_parallel_scheduler]: 1.94e-06 [auto_monad_reorder]: 2.451e-05 [get_jit_bprop_graph]: 1.94e-06 [rewriter_after_jit_bprop_graph]: 5.00999e-06 [opt_after_jit_grad]: 0.00054924 [validate]: 4.486e-05 Sums bootstrap : 0.000416s : 0.79% type_inference : 0.045891s : 86.93% event_method : 0.000100s : 0.19% auto_monad : 0.000104s : 0.20% graph_reusing : 0.000008s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000031s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000047s : 0.09% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000041s : 0.08% optimize.rewriter_before_opt_a : 0.000128s : 0.24% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000137s : 0.26% optimize.opt_a.loop_unroll : 0.000050s : 0.10% optimize.opt_a.a_1 : 0.001113s : 2.11% optimize.opt_a.with_stream_mark : 0.000035s : 0.07% optimize.opt_a.recompute_prepare : 0.000023s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000260s : 0.49% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.03% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000017s : 0.03% optimize.opt_a.merge_send_recv : 0.000017s : 0.03% optimize.opt_a.auto_parallel : 0.000015s : 0.03% optimize.opt_a.parallel : 0.000026s : 0.05% optimize.opt_a.flash_sp : 0.000014s : 0.03% optimize.opt_a.merge_comm : 0.000010s : 0.02% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.04% optimize.opt_a.virtual_dataset : 0.000016s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.03% optimize.opt_a.virtual_output : 0.000016s : 0.03% optimize.opt_a.merge_forward : 0.000009s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000020s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000027s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.02% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000027s : 0.05% optimize.opt_a.a_after_grad : 0.000024s : 0.05% optimize.opt_a.renormalize : 0.001133s : 2.15% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.05% optimize.opt_a.cse : 0.000052s : 0.10% optimize.opt_a.a_3 : 0.000138s : 0.26% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.01% optimize.rewriter_after_opt_a : 0.000045s : 0.08% optimize.convert_after_rewriter : 0.000010s : 0.02% optimize.order_py_execute_after_rewriter : 0.000009s : 0.02% optimize.mutable_eliminate : 0.000600s : 1.14% optimize.opt_b.b_1 : 0.000207s : 0.39% optimize.opt_b.b_2 : 0.000009s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.04% optimize.overlap_param_gather : 0.000006s : 0.01% optimize.cconv : 0.000031s : 0.06% optimize.loop_unroll : 0.000475s : 0.90% optimize.opt_after_cconv.c_1 : 0.000040s : 0.08% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000022s : 0.04% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.04% optimize.tuple_transform.d_1 : 0.000057s : 0.11% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000005s : 0.01% optimize.add_recomputation : 0.000059s : 0.11% optimize.cse_after_recomputation.cse : 0.000016s : 0.03% optimize.environ_conv : 0.000009s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.01% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000004s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.01% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000020s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000024s : 0.05% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000025s : 0.05% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000549s : 1.04% validate : 0.000045s : 0.08% Time group info: ------[substitution.] 0.000269 44 10.54% : 0.000028s : 3: substitution.cast_eliminate 0.87% : 0.000002s : 3: substitution.elim_not_effective 0.74% : 0.000002s : 3: substitution.fold_const_symbol 2.48% : 0.000007s : 5: substitution.graph_param_transform 70.53% : 0.000190s : 8: substitution.inline 1.97% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.43% : 0.000007s : 6: substitution.remove_not_recompute_node 2.15% : 0.000006s : 4: substitution.replace_old_param 3.54% : 0.000010s : 2: substitution.switch_simplify 4.77% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.045818 2 61.28% : 0.028076s : 1: type_inference.infer 38.72% : 0.017742s : 1: type_inference.specialize ------[replace.] 0.000128 14 42.99% : 0.000055s : 8: replace.inline 37.17% : 0.000048s : 2: replace.switch_simplify 19.84% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000204 14 90.57% : 0.000185s : 8: match.inline 4.10% : 0.000008s : 2: match.switch_simplify 5.33% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000280 1746 0.95% : 0.000003s : 19: predicate.accumulaten_eliminater 0.59% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 10: predicate.addn_check_dump 1.03% : 0.000003s : 19: predicate.addn_zero_filter 0.87% : 0.000002s : 19: predicate.adjust_all_reduce_mul_add 2.08% : 0.000006s : 29: predicate.arithmetic_simplify 1.06% : 0.000003s : 19: predicate.cast_eliminate 0.56% : 0.000002s : 10: predicate.check_bprop_eliminate 0.50% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000001s : 5: predicate.const_output_eliminate 0.59% : 0.000002s : 10: predicate.depend_value_elim 1.05% : 0.000003s : 19: predicate.dict_get_item_const_eliminator 1.05% : 0.000003s : 19: predicate.dict_get_item_eliminator 0.93% : 0.000003s : 19: predicate.dict_set_item_eliminator 0.86% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 0.37% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 24: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 24: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 24: predicate.environ_get_depend_swap 1.75% : 0.000005s : 34: predicate.environ_get_eliminate 1.18% : 0.000003s : 24: predicate.environ_get_set_eliminate 1.59% : 0.000004s : 31: predicate.exchange_switch_depend_value 2.46% : 0.000007s : 31: predicate.float_depend_g_call 0.50% : 0.000001s : 10: predicate.float_environ_get_switch 0.71% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.60% : 0.000002s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000002s : 10: predicate.incorporate_call 0.46% : 0.000001s : 10: predicate.incorporate_call_switch 6.57% : 0.000018s : 80: predicate.inline 0.78% : 0.000002s : 10: predicate.inline_without_move 0.28% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.73% : 0.000002s : 10: predicate.less_batch_normalization 1.87% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.73% : 0.000008s : 52: predicate.load_eliminater 0.75% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.63% : 0.000007s : 49: predicate.loop_unroll_before_grad 1.70% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.54% : 0.000002s : 10: predicate.merge_addn 0.54% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.54% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.90% : 0.000003s : 19: predicate.minmaximum_grad 0.76% : 0.000002s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.29% : 0.000001s : 5: predicate.parallel_virtual_node 2.03% : 0.000006s : 31: predicate.partial_defer_inline 1.64% : 0.000005s : 28: predicate.partial_eliminate 0.94% : 0.000003s : 19: predicate.print_const_string_wrapper 0.54% : 0.000001s : 10: predicate.reduce_all_const_elim 1.30% : 0.000004s : 19: predicate.reduce_eliminate 2.61% : 0.000007s : 52: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 10: predicate.remove_not_recompute_node 1.32% : 0.000004s : 33: predicate.replace_applicator 0.43% : 0.000001s : 10: predicate.replace_old_param 0.23% : 0.000001s : 5: predicate.reset_defer_inline 1.02% : 0.000003s : 19: predicate.reshape_eliminate 0.54% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 5: predicate.row_tensor_eliminate 0.66% : 0.000002s : 10: predicate.same_eliminate 0.46% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.82% : 0.000002s : 10: predicate.shard_identity_eliminate 0.64% : 0.000002s : 10: predicate.special_op_eliminate 0.76% : 0.000002s : 10: predicate.specialize_transform 0.74% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.62% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.76% : 0.000005s : 31: predicate.switch_defer_inline 2.13% : 0.000006s : 41: predicate.switch_layer_defer_inline 5.75% : 0.000016s : 99: predicate.switch_simplify 0.94% : 0.000003s : 19: predicate.tile_eliminate 1.01% : 0.000003s : 19: predicate.transpose_eliminate 1.56% : 0.000004s : 29: predicate.tuple_list_convert_item_index_to_positive 1.73% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 3.20% : 0.000009s : 43: predicate.tuple_list_get_item_eliminator 1.60% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.24% : 0.000006s : 39: predicate.tuple_list_set_item_eliminator 1.74% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.41% : 0.000007s : 52: predicate.updatestate_pure_node_eliminater 3.14% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 5: predicate.value_based_eliminate 0.56% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.55% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.34% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001059 16 53.26% : 0.000564s : 6: func_graph_cloner_run.FuncGraphClonerGraph 46.74% : 0.000495s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.071826 192 0.01% : 0.000006s : 1: ForceFp32Comm 5.02% : 0.003605s : 1: add_attr 5.00% : 0.003590s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.09% : 0.000062s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.16% : 0.000116s : 1: auto_monad 0.05% : 0.000032s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.64% : 0.000459s : 1: bootstrap 0.05% : 0.000035s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.03% : 0.000023s : 1: control_data_broadcast_order 0.02% : 0.000013s : 1: convert_after_rewriter 0.05% : 0.000034s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000024s : 1: detach_backward 0.02% : 0.000012s : 1: environ_conv 0.16% : 0.000116s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.02% : 0.000014s : 1: graph_reusing 0.01% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000007s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000008s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000007s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.67% : 0.000481s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.84% : 0.000606s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000017s : 1: opt.transform.mutable_eliminate 2.44% : 0.001755s : 78: opt.transform.opt_a 0.05% : 0.000039s : 1: opt.transform.opt_after_cconv 0.05% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000141s : 28: opt.transform.opt_b 0.09% : 0.000062s : 2: opt.transform.opt_trans_graph 0.07% : 0.000050s : 4: opt.transform.symbol_engine_opt 5.85% : 0.004201s : 1: opt_a 0.20% : 0.000146s : 1: opt_after_cconv 0.78% : 0.000561s : 1: opt_after_jit_grad 0.45% : 0.000324s : 1: opt_b 10.06% : 0.007227s : 1: optimize 0.04% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.04% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000009s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.08% : 0.000055s : 1: pre_auto_parallel 0.06% : 0.000044s : 1: py_interpret_to_execute 0.03% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000022s : 1: remove_dup_value 0.84% : 0.000602s : 1: renormalize.infer 0.72% : 0.000519s : 1: renormalize.specialize 0.01% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000048s : 1: rewriter_after_opt_a 0.18% : 0.000131s : 1: rewriter_before_opt_a 0.01% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.22% : 0.000157s : 1: symbol_engine_optimizer 0.15% : 0.000107s : 1: tuple_transform 63.96% : 0.045939s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:23.137.95 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0400371, [21] [bootstrap]: 0.00041938 [type_inference]: 0.029624 [event_method]: 9.134e-05 [auto_monad]: 9.903e-05 [graph_reusing]: 7.36999e-06 [inline]: 2.83e-06 [add_attr]: 0.00324807, [1] [add_attr_with_inline]: 0.00323829, [1] [Cycle 1]: 7.287e-05, [2] [tag_attr]: 2.959e-05 [meta_addattr_fg_expand]: 7.98001e-06 [parallel-infer-symbol]: 3.12002e-06 [pre_auto_parallel]: 4.392e-05 [insert-virtual-dataset]: 2.39001e-06 [parallel-infer-symbol-second]: 8.10018e-07 [dataset_repeat_opt]: 2.24001e-06 [pipeline_split]: 1.61998e-06 [optimize]: 0.00577484, [53] [py_interpret_to_execute]: 3.383e-05 [rewriter_before_opt_a]: 0.00011682 [opt_a]: 0.00356796, [2] [Cycle 1]: 0.00276157, [45] [expand_dump_flag]: 4.32e-06 [switch_simplify]: 0.00012872 [loop_unroll]: 4.212e-05 [a_1]: 0.00090287 [with_stream_mark]: 1.708e-05 [recompute_prepare]: 1.057e-05 [updatestate_depend_eliminate]: 4.63999e-06 [updatestate_assign_eliminate]: 3.67998e-06 [updatestate_loads_eliminate]: 3.50998e-06 [parameter_eliminate]: 1.89e-06 [a_2]: 0.00011373 [accelerated_algorithm]: 9.42001e-06 [shard]: 2.26998e-06 [meta_shard_fg_expand]: 2.57001e-06 [shard_inline]: 8.68001e-06 [merge_send_recv]: 9.75002e-06 [auto_parallel]: 7.2e-06 [parallel]: 1.885e-05 [flash_sp]: 8.90001e-06 [merge_comm]: 4.83001e-06 [allreduce_fusion]: 4.03001e-06 [matmul_add_comm_reduction]: 1.052e-05 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 9.61e-06 [virtual_dataset]: 7.98001e-06 [get_grad_eliminate_]: 7.78999e-06 [virtual_output]: 7.88001e-06 [merge_forward]: 4.32e-06 [cell_reuse_recompute_pass]: 1.40999e-06 [offload_activation]: 1.194e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.53e-05 [merge_recompute_call_nodes]: 1.40001e-06 [before_grad]: 1.346e-05 [set_forward_comm_id_for_comm_node_pass]: 4.57e-06 [meta_fg_expand]: 4.18001e-06 [flash_sp_send_recv_attached]: 3.04999e-06 [receive_attached]: 2.61999e-06 [after_resolve]: 1.333e-05 [a_after_grad]: 1.45e-05 [renormalize]: 0.0009384 [add_forward_monad_depend]: 5.40999e-06 [auto_monad_grad]: 2.22001e-06 [auto_monad_eliminator]: 1.788e-05 [cse]: 3.461e-05 [a_3]: 6.008e-05 [Cycle 2]: 0.00079636, [45] [expand_dump_flag]: 1.50999e-06 [switch_simplify]: 9.35001e-06 [loop_unroll]: 2.86e-05 [a_1]: 0.00018191 [with_stream_mark]: 1.336e-05 [recompute_prepare]: 8.12e-06 [updatestate_depend_eliminate]: 4.33999e-06 [updatestate_assign_eliminate]: 2.94001e-06 [updatestate_loads_eliminate]: 2.74999e-06 [parameter_eliminate]: 1.07998e-06 [a_2]: 9.642e-05 [accelerated_algorithm]: 7.77e-06 [shard]: 1.39e-06 [meta_shard_fg_expand]: 1.62999e-06 [shard_inline]: 7.78999e-06 [merge_send_recv]: 5.89999e-06 [auto_parallel]: 6.76e-06 [parallel]: 5.05999e-06 [flash_sp]: 3.23e-06 [merge_comm]: 3.78001e-06 [allreduce_fusion]: 3.78001e-06 [matmul_add_comm_reduction]: 6.61e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 8.70999e-06 [virtual_dataset]: 7.66001e-06 [get_grad_eliminate_]: 8.08999e-06 [virtual_output]: 7.61999e-06 [merge_forward]: 3.95e-06 [cell_reuse_recompute_pass]: 1.75001e-06 [offload_activation]: 8.12998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.454e-05 [merge_recompute_call_nodes]: 1.10001e-06 [before_grad]: 1.188e-05 [set_forward_comm_id_for_comm_node_pass]: 4.82e-06 [meta_fg_expand]: 3.5e-06 [flash_sp_send_recv_attached]: 8.39995e-07 [receive_attached]: 1.04e-06 [after_resolve]: 1.145e-05 [a_after_grad]: 1.162e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.30999e-06 [auto_monad_grad]: 1.17999e-06 [auto_monad_eliminator]: 8.3e-06 [cse]: 1.978e-05 [a_3]: 4.701e-05 [py_interpret_to_execute_after_opt_a]: 1.164e-05 [slice_cell_reuse_recomputed_activation]: 2.51e-06 [rewriter_after_opt_a]: 3.929e-05 [convert_after_rewriter]: 7.68999e-06 [order_py_execute_after_rewriter]: 6.37001e-06 [mutable_eliminate]: 0.0005348 [opt_b]: 0.0002448, [1] [Cycle 1]: 0.000239, [7] [b_1]: 0.00016017 [b_2]: 9.05999e-06 [updatestate_depend_eliminate]: 5.87001e-06 [updatestate_assign_eliminate]: 3.03e-06 [updatestate_loads_eliminate]: 3.21999e-06 [renormalize]: 3.59985e-07 [cse]: 2.248e-05 [optimize_parallel_all_gather_comm]: 1.673e-05 [overlap_param_gather]: 1.88002e-06 [cconv]: 2.618e-05 [loop_unroll]: 0.00043761 [opt_after_cconv]: 0.00011485, [1] [Cycle 1]: 0.00010945, [7] [c_1]: 3.865e-05 [parameter_eliminate]: 2.99999e-06 [updatestate_depend_eliminate]: 5.61e-06 [updatestate_assign_eliminate]: 2.96001e-06 [updatestate_loads_eliminate]: 3.38999e-06 [cse]: 2.179e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.491e-05 [tuple_transform]: 8.887e-05, [1] [Cycle 1]: 8.401e-05, [4] [d_1]: 5.582e-05 [none_parameter_eliminate]: 1.77001e-06 [renormalize]: 2.70025e-07 [switch_simplify]: 8.05e-06 [partial_unused_args_eliminate]: 1.94e-06 [add_recomputation]: 5.5e-05 [cse_after_recomputation]: 2.494e-05, [1] [Cycle 1]: 2.023e-05, [1] [cse]: 1.44e-05 [environ_conv]: 6.59999e-06 [swap_dp_allreduce_reducescatter]: 5.52999e-06 [bias_add_comm_swap]: 2.43002e-06 [label_micro_interleaved_index]: 4.66002e-06 [label_fine_grained_interleaved_index]: 2.83e-06 [merge_cast_opt]: 1.28002e-06 [slice_recompute_activation]: 2.34001e-06 [micro_interleaved_order_control]: 2.84001e-06 [assign_add_opt]: 1.12e-06 [ForceFp32Comm]: 9.30013e-07 [remove_cast_before_assign_add]: 1.52999e-06 [full_micro_interleaved_order_control]: 2.00002e-06 [reorder_send_recv_between_fp_bp]: 2.77002e-06 [comm_op_add_attrs]: 9.29984e-07 [add_comm_op_reuse_tag]: 9.80013e-07 [interleave_split_concat_branches]: 1.17999e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.18001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.90001e-06 [control_data_broadcast_order]: 1.648e-05 [grouped_pairwise_exchange_alltoall]: 1.59e-06 [offloading_packed_experts]: 4.56002e-06 [overlap_recompute_and_grad_model_parallel]: 5.47999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.31002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.50002e-06 [overlap_grad_ring_attention]: 4.79e-06 [overlap_grad_flash_sp]: 2.02e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.49001e-06 [split_layernorm_comm]: 1.62001e-06 [handle_group_info]: 1.09003e-06 [symbol_engine_optimizer]: 8.563e-05, [1] [Cycle 1]: 8.112e-05, [6] [build]: 3.41999e-06 [elim_shapecalc]: 1.205e-05 [elim_not_effective]: 1.624e-05 [opt_reshape]: 8.47e-06 [fold_const_symbol]: 1.282e-05 [renormalize]: 2.00002e-07 [detach_backward]: 1.91e-06 [pipeline_parallel_scheduler]: 1.57001e-06 [auto_monad_reorder]: 1.921e-05 [get_jit_bprop_graph]: 1.59998e-06 [rewriter_after_jit_bprop_graph]: 4.30999e-06 [opt_after_jit_grad]: 0.0004845 [validate]: 4.248e-05 Sums bootstrap : 0.000419s : 1.17% type_inference : 0.029624s : 82.72% event_method : 0.000091s : 0.26% auto_monad : 0.000099s : 0.28% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000030s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000044s : 0.12% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000034s : 0.09% optimize.rewriter_before_opt_a : 0.000117s : 0.33% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000138s : 0.39% optimize.opt_a.loop_unroll : 0.000071s : 0.20% optimize.opt_a.a_1 : 0.001085s : 3.03% optimize.opt_a.with_stream_mark : 0.000030s : 0.09% optimize.opt_a.recompute_prepare : 0.000019s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000210s : 0.59% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.05% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.05% optimize.opt_a.merge_send_recv : 0.000016s : 0.04% optimize.opt_a.auto_parallel : 0.000014s : 0.04% optimize.opt_a.parallel : 0.000024s : 0.07% optimize.opt_a.flash_sp : 0.000012s : 0.03% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000008s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.05% optimize.opt_a.virtual_dataset : 0.000016s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.04% optimize.opt_a.virtual_output : 0.000016s : 0.04% optimize.opt_a.merge_forward : 0.000008s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000020s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000008s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.07% optimize.opt_a.a_after_grad : 0.000026s : 0.07% optimize.opt_a.renormalize : 0.000938s : 2.62% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.07% optimize.opt_a.cse : 0.000054s : 0.15% optimize.opt_a.a_3 : 0.000107s : 0.30% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000039s : 0.11% optimize.convert_after_rewriter : 0.000008s : 0.02% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000535s : 1.49% optimize.opt_b.b_1 : 0.000160s : 0.45% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.05% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000026s : 0.07% optimize.loop_unroll : 0.000438s : 1.22% optimize.opt_after_cconv.c_1 : 0.000039s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000022s : 0.06% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.04% optimize.tuple_transform.d_1 : 0.000056s : 0.16% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000055s : 0.15% optimize.cse_after_recomputation.cse : 0.000014s : 0.04% optimize.environ_conv : 0.000007s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000002s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000002s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000020s : 0.06% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000019s : 0.05% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000484s : 1.35% validate : 0.000042s : 0.12% Time group info: ------[substitution.] 0.000253 44 9.23% : 0.000023s : 3: substitution.cast_eliminate 0.89% : 0.000002s : 3: substitution.elim_not_effective 0.82% : 0.000002s : 3: substitution.fold_const_symbol 2.96% : 0.000007s : 5: substitution.graph_param_transform 71.18% : 0.000180s : 8: substitution.inline 1.72% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.47% : 0.000006s : 6: substitution.remove_not_recompute_node 1.80% : 0.000005s : 4: substitution.replace_old_param 3.70% : 0.000009s : 2: substitution.switch_simplify 5.22% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.029554 2 95.45% : 0.028208s : 1: type_inference.infer 4.55% : 0.001345s : 1: type_inference.specialize ------[replace.] 0.000124 14 41.81% : 0.000052s : 8: replace.inline 38.92% : 0.000048s : 2: replace.switch_simplify 19.27% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000194 14 90.18% : 0.000175s : 8: match.inline 4.09% : 0.000008s : 2: match.switch_simplify 5.74% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000276 1746 0.95% : 0.000003s : 19: predicate.accumulaten_eliminater 0.60% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 10: predicate.addn_check_dump 1.04% : 0.000003s : 19: predicate.addn_zero_filter 0.99% : 0.000003s : 19: predicate.adjust_all_reduce_mul_add 2.43% : 0.000007s : 29: predicate.arithmetic_simplify 1.12% : 0.000003s : 19: predicate.cast_eliminate 0.59% : 0.000002s : 10: predicate.check_bprop_eliminate 0.53% : 0.000001s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.59% : 0.000002s : 10: predicate.depend_value_elim 1.00% : 0.000003s : 19: predicate.dict_get_item_const_eliminator 1.10% : 0.000003s : 19: predicate.dict_get_item_eliminator 1.02% : 0.000003s : 19: predicate.dict_set_item_eliminator 0.89% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 0.33% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 24: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 24: predicate.environ_get_add_eliminate 1.17% : 0.000003s : 24: predicate.environ_get_depend_swap 1.68% : 0.000005s : 34: predicate.environ_get_eliminate 1.13% : 0.000003s : 24: predicate.environ_get_set_eliminate 1.60% : 0.000004s : 31: predicate.exchange_switch_depend_value 2.57% : 0.000007s : 31: predicate.float_depend_g_call 0.50% : 0.000001s : 10: predicate.float_environ_get_switch 0.76% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.65% : 0.000002s : 10: predicate.get_grad_eliminate 0.18% : 0.000001s : 5: predicate.graph_param_transform 0.58% : 0.000002s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.44% : 0.000018s : 80: predicate.inline 0.84% : 0.000002s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.77% : 0.000002s : 10: predicate.less_batch_normalization 1.80% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.51% : 0.000007s : 52: predicate.load_eliminater 0.69% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.61% : 0.000007s : 49: predicate.loop_unroll_before_grad 1.65% : 0.000005s : 29: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 10: predicate.merge_addn 0.49% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.51% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.88% : 0.000002s : 19: predicate.minmaximum_grad 0.74% : 0.000002s : 5: predicate.mutable_eliminate 0.30% : 0.000001s : 5: predicate.opt_reshape 0.43% : 0.000001s : 5: predicate.parallel_virtual_node 1.88% : 0.000005s : 31: predicate.partial_defer_inline 1.70% : 0.000005s : 28: predicate.partial_eliminate 0.96% : 0.000003s : 19: predicate.print_const_string_wrapper 0.53% : 0.000001s : 10: predicate.reduce_all_const_elim 1.18% : 0.000003s : 19: predicate.reduce_eliminate 2.60% : 0.000007s : 52: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 10: predicate.remove_not_recompute_node 1.31% : 0.000004s : 33: predicate.replace_applicator 0.49% : 0.000001s : 10: predicate.replace_old_param 0.23% : 0.000001s : 5: predicate.reset_defer_inline 1.03% : 0.000003s : 19: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 5: predicate.row_tensor_eliminate 0.61% : 0.000002s : 10: predicate.same_eliminate 0.38% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.69% : 0.000002s : 10: predicate.shard_identity_eliminate 0.59% : 0.000002s : 10: predicate.special_op_eliminate 0.68% : 0.000002s : 10: predicate.specialize_transform 0.71% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.68% : 0.000005s : 31: predicate.switch_defer_inline 2.12% : 0.000006s : 41: predicate.switch_layer_defer_inline 5.81% : 0.000016s : 99: predicate.switch_simplify 1.04% : 0.000003s : 19: predicate.tile_eliminate 1.02% : 0.000003s : 19: predicate.transpose_eliminate 1.67% : 0.000005s : 29: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000005s : 29: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 2.99% : 0.000008s : 43: predicate.tuple_list_get_item_eliminator 1.53% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.24% : 0.000006s : 39: predicate.tuple_list_set_item_eliminator 1.68% : 0.000005s : 33: predicate.tuple_to_list_eliminator_ 2.46% : 0.000007s : 52: predicate.updatestate_pure_node_eliminater 3.06% : 0.000008s : 62: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 5: predicate.value_based_eliminate 0.58% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.58% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000995 16 57.84% : 0.000575s : 6: func_graph_cloner_run.FuncGraphClonerGraph 42.16% : 0.000419s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.051929 192 0.01% : 0.000004s : 1: ForceFp32Comm 6.26% : 0.003253s : 1: add_attr 6.24% : 0.003242s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.11% : 0.000059s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.20% : 0.000106s : 1: auto_monad 0.04% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.86% : 0.000448s : 1: bootstrap 0.06% : 0.000030s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000020s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.05% : 0.000028s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000010s : 1: environ_conv 0.19% : 0.000101s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.86% : 0.000445s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 1.05% : 0.000543s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.03% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000017s : 1: opt.transform.mutable_eliminate 3.34% : 0.001737s : 78: opt.transform.opt_a 0.07% : 0.000037s : 1: opt.transform.opt_after_cconv 0.06% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.27% : 0.000138s : 28: opt.transform.opt_b 0.12% : 0.000062s : 2: opt.transform.opt_trans_graph 0.09% : 0.000045s : 4: opt.transform.symbol_engine_opt 6.88% : 0.003571s : 1: opt_a 0.23% : 0.000118s : 1: opt_after_cconv 0.95% : 0.000493s : 1: opt_after_jit_grad 0.48% : 0.000248s : 1: opt_b 11.13% : 0.005780s : 1: optimize 0.04% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.05% : 0.000024s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.09% : 0.000048s : 1: pre_auto_parallel 0.07% : 0.000038s : 1: py_interpret_to_execute 0.03% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000019s : 1: remove_dup_value 0.95% : 0.000493s : 1: renormalize.infer 0.84% : 0.000436s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000043s : 1: rewriter_after_opt_a 0.23% : 0.000121s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.17% : 0.000088s : 1: symbol_engine_optimizer 0.18% : 0.000092s : 1: tuple_transform 57.09% : 0.029645s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:23.407.324 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:23.407.602 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0405403, [21] [bootstrap]: 0.0164734 [type_inference]: 0.013276 [event_method]: 5.255e-05 [auto_monad]: 9.993e-05 [graph_reusing]: 8.02e-06 [inline]: 2.70997e-06 [add_attr]: 0.00326317, [1] [add_attr_with_inline]: 0.00325405, [1] [Cycle 1]: 8.471e-05, [2] [tag_attr]: 2.881e-05 [meta_addattr_fg_expand]: 8.85999e-06 [parallel-infer-symbol]: 3.86999e-06 [pre_auto_parallel]: 4.406e-05 [insert-virtual-dataset]: 2.36e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.91998e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.00608493, [53] [py_interpret_to_execute]: 4.261e-05 [rewriter_before_opt_a]: 0.00012306 [opt_a]: 0.00366986, [2] [Cycle 1]: 0.00284019, [45] [expand_dump_flag]: 4.41002e-06 [switch_simplify]: 0.00012408 [loop_unroll]: 4.734e-05 [a_1]: 0.00086068 [with_stream_mark]: 1.58e-05 [recompute_prepare]: 8.77e-06 [updatestate_depend_eliminate]: 4e-06 [updatestate_assign_eliminate]: 3.21001e-06 [updatestate_loads_eliminate]: 2.96001e-06 [parameter_eliminate]: 1.94999e-06 [a_2]: 0.0001116 [accelerated_algorithm]: 7.39002e-06 [shard]: 1.94999e-06 [meta_shard_fg_expand]: 2.36e-06 [shard_inline]: 6.88e-06 [merge_send_recv]: 8.20999e-06 [auto_parallel]: 7.01001e-06 [parallel]: 1.95e-05 [flash_sp]: 8.72998e-06 [merge_comm]: 3.86999e-06 [allreduce_fusion]: 3.58e-06 [matmul_add_comm_reduction]: 9.71998e-06 [allreduce_slice_to_reducescatter]: 7.90023e-07 [virtual_shard_identity]: 8.30999e-06 [virtual_dataset]: 6.74999e-06 [get_grad_eliminate_]: 6.79001e-06 [virtual_output]: 6.59001e-06 [merge_forward]: 3.70998e-06 [cell_reuse_recompute_pass]: 1.12999e-06 [offload_activation]: 1.041e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.488e-05 [merge_recompute_call_nodes]: 1.51002e-06 [before_grad]: 1.06e-05 [set_forward_comm_id_for_comm_node_pass]: 3.83001e-06 [meta_fg_expand]: 3.38e-06 [flash_sp_send_recv_attached]: 2.78998e-06 [receive_attached]: 2.29001e-06 [after_resolve]: 1.191e-05 [a_after_grad]: 9.87001e-06 [renormalize]: 0.00095032 [add_forward_monad_depend]: 5.69e-06 [auto_monad_grad]: 2.09e-06 [auto_monad_eliminator]: 1.598e-05 [cse]: 3.919e-05 [a_3]: 6.283e-05 [Cycle 2]: 0.00081745, [45] [expand_dump_flag]: 1.12e-06 [switch_simplify]: 8.23001e-06 [loop_unroll]: 6.51999e-06 [a_1]: 0.00013619 [with_stream_mark]: 1.19e-05 [recompute_prepare]: 6.58e-06 [updatestate_depend_eliminate]: 3.05002e-06 [updatestate_assign_eliminate]: 2.43e-06 [updatestate_loads_eliminate]: 2.64999e-06 [parameter_eliminate]: 1.09998e-06 [a_2]: 0.00010138 [accelerated_algorithm]: 6.51e-06 [shard]: 1.39998e-06 [meta_shard_fg_expand]: 1.50999e-06 [shard_inline]: 6.57002e-06 [merge_send_recv]: 5.27001e-06 [auto_parallel]: 5.94e-06 [parallel]: 5.07e-06 [flash_sp]: 3.38e-06 [merge_comm]: 3.14001e-06 [allreduce_fusion]: 2.91e-06 [matmul_add_comm_reduction]: 6.03002e-06 [allreduce_slice_to_reducescatter]: 4.80009e-07 [virtual_shard_identity]: 6.88e-06 [virtual_dataset]: 6.26998e-06 [get_grad_eliminate_]: 6.12999e-06 [virtual_output]: 5.83002e-06 [merge_forward]: 2.64001e-06 [cell_reuse_recompute_pass]: 1.45001e-06 [offload_activation]: 6.39001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.553e-05 [merge_recompute_call_nodes]: 7.50006e-07 [before_grad]: 9.56e-06 [set_forward_comm_id_for_comm_node_pass]: 3.6e-06 [meta_fg_expand]: 2.03997e-06 [flash_sp_send_recv_attached]: 9.60019e-07 [receive_attached]: 1.08001e-06 [after_resolve]: 1.083e-05 [a_after_grad]: 9.25001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.60999e-06 [auto_monad_grad]: 1.02e-06 [auto_monad_eliminator]: 6.95998e-06 [cse]: 1.274e-05 [a_3]: 5.021e-05 [py_interpret_to_execute_after_opt_a]: 1.381e-05 [slice_cell_reuse_recomputed_activation]: 4.40999e-06 [rewriter_after_opt_a]: 3.819e-05 [convert_after_rewriter]: 9.50001e-06 [order_py_execute_after_rewriter]: 8.25999e-06 [mutable_eliminate]: 0.00056133 [opt_b]: 0.00027659, [1] [Cycle 1]: 0.00026667, [7] [b_1]: 0.00017263 [b_2]: 8.37998e-06 [updatestate_depend_eliminate]: 6.25002e-06 [updatestate_assign_eliminate]: 2.54999e-06 [updatestate_loads_eliminate]: 2.41e-06 [renormalize]: 4.50003e-07 [cse]: 1.887e-05 [optimize_parallel_all_gather_comm]: 2.022e-05 [overlap_param_gather]: 5.45001e-06 [cconv]: 2.774e-05 [loop_unroll]: 0.00045787 [opt_after_cconv]: 0.00012794, [1] [Cycle 1]: 0.00012003, [7] [c_1]: 3.399e-05 [parameter_eliminate]: 3.11001e-06 [updatestate_depend_eliminate]: 5.04e-06 [updatestate_assign_eliminate]: 2.37999e-06 [updatestate_loads_eliminate]: 2.56e-06 [cse]: 1.725e-05 [renormalize]: 2.89991e-07 [remove_dup_value]: 1.593e-05 [tuple_transform]: 9.327e-05, [1] [Cycle 1]: 8.635e-05, [4] [d_1]: 4.593e-05 [none_parameter_eliminate]: 1.97001e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 7.1e-06 [partial_unused_args_eliminate]: 4.30999e-06 [add_recomputation]: 4.799e-05 [cse_after_recomputation]: 2.738e-05, [1] [Cycle 1]: 2.013e-05, [1] [cse]: 1.095e-05 [environ_conv]: 8e-06 [swap_dp_allreduce_reducescatter]: 7.65e-06 [bias_add_comm_swap]: 5.00001e-06 [label_micro_interleaved_index]: 7.7e-06 [label_fine_grained_interleaved_index]: 5.19998e-06 [merge_cast_opt]: 3.5e-06 [slice_recompute_activation]: 4.87998e-06 [micro_interleaved_order_control]: 5.12999e-06 [assign_add_opt]: 3.46999e-06 [ForceFp32Comm]: 3.58e-06 [remove_cast_before_assign_add]: 3.45e-06 [full_micro_interleaved_order_control]: 4.60999e-06 [reorder_send_recv_between_fp_bp]: 5.79e-06 [comm_op_add_attrs]: 3.33e-06 [add_comm_op_reuse_tag]: 3.20998e-06 [interleave_split_concat_branches]: 2.132e-05 [interleave_parallel_branches]: 3.76001e-06 [overlap_opt_shard_in_pipeline]: 3.58e-06 [overlap_opt_shard_grad_in_pipeline]: 4.15e-06 [control_data_broadcast_order]: 1.628e-05 [grouped_pairwise_exchange_alltoall]: 4.22e-06 [offloading_packed_experts]: 6.28e-06 [overlap_recompute_and_grad_model_parallel]: 7.08998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.9e-06 [overlap_recompute_allgather_and_fa_grad]: 3.86001e-06 [overlap_recompute_comm]: 4.95999e-06 [overlap_grad_ring_attention]: 6.68e-06 [overlap_grad_flash_sp]: 2.123e-05 [begin_end_overlap_inline]: 3.09999e-06 [split_matmul_comm_elemetwise]: 4.70999e-06 [split_layernorm_comm]: 4.82e-06 [handle_group_info]: 3.39001e-06 [symbol_engine_optimizer]: 9.962e-05, [1] [Cycle 1]: 9.309e-05, [6] [build]: 2.89001e-06 [elim_shapecalc]: 1.041e-05 [elim_not_effective]: 1.42e-05 [opt_reshape]: 7.68001e-06 [fold_const_symbol]: 1.1e-05 [renormalize]: 1.80007e-07 [detach_backward]: 3.56001e-06 [pipeline_parallel_scheduler]: 1.96e-06 [auto_monad_reorder]: 1.927e-05 [get_jit_bprop_graph]: 1.73002e-06 [rewriter_after_jit_bprop_graph]: 5.00999e-06 [opt_after_jit_grad]: 0.00052358 [validate]: 3.868e-05 Sums bootstrap : 0.016473s : 46.40% type_inference : 0.013276s : 37.39% event_method : 0.000053s : 0.15% auto_monad : 0.000100s : 0.28% graph_reusing : 0.000008s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000029s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000044s : 0.12% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000043s : 0.12% optimize.rewriter_before_opt_a : 0.000123s : 0.35% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000132s : 0.37% optimize.opt_a.loop_unroll : 0.000054s : 0.15% optimize.opt_a.a_1 : 0.000997s : 2.81% optimize.opt_a.with_stream_mark : 0.000028s : 0.08% optimize.opt_a.recompute_prepare : 0.000015s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000213s : 0.60% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.04% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.04% optimize.opt_a.merge_send_recv : 0.000013s : 0.04% optimize.opt_a.auto_parallel : 0.000013s : 0.04% optimize.opt_a.parallel : 0.000025s : 0.07% optimize.opt_a.flash_sp : 0.000012s : 0.03% optimize.opt_a.merge_comm : 0.000007s : 0.02% optimize.opt_a.allreduce_fusion : 0.000006s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.04% optimize.opt_a.virtual_dataset : 0.000013s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.04% optimize.opt_a.virtual_output : 0.000012s : 0.03% optimize.opt_a.merge_forward : 0.000006s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000017s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000020s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000023s : 0.06% optimize.opt_a.a_after_grad : 0.000019s : 0.05% optimize.opt_a.renormalize : 0.000950s : 2.68% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.06% optimize.opt_a.cse : 0.000052s : 0.15% optimize.opt_a.a_3 : 0.000113s : 0.32% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.01% optimize.rewriter_after_opt_a : 0.000038s : 0.11% optimize.convert_after_rewriter : 0.000010s : 0.03% optimize.order_py_execute_after_rewriter : 0.000008s : 0.02% optimize.mutable_eliminate : 0.000561s : 1.58% optimize.opt_b.b_1 : 0.000173s : 0.49% optimize.opt_b.b_2 : 0.000008s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000019s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.06% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000028s : 0.08% optimize.loop_unroll : 0.000458s : 1.29% optimize.opt_after_cconv.c_1 : 0.000034s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000017s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.04% optimize.tuple_transform.d_1 : 0.000046s : 0.13% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000004s : 0.01% optimize.add_recomputation : 0.000048s : 0.14% optimize.cse_after_recomputation.cse : 0.000011s : 0.03% optimize.environ_conv : 0.000008s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.02% optimize.bias_add_comm_swap : 0.000005s : 0.01% optimize.label_micro_interleaved_index : 0.000008s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.01% optimize.merge_cast_opt : 0.000003s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.01% optimize.micro_interleaved_order_control : 0.000005s : 0.01% optimize.assign_add_opt : 0.000003s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.02% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000021s : 0.06% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000006s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.01% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000021s : 0.06% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.01% optimize.split_layernorm_comm : 0.000005s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.05% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000524s : 1.47% validate : 0.000039s : 0.11% Time group info: ------[substitution.] 0.000224 34 1.01% : 0.000002s : 2: substitution.elim_not_effective 0.66% : 0.000001s : 2: substitution.fold_const_symbol 2.66% : 0.000006s : 4: substitution.graph_param_transform 78.78% : 0.000176s : 8: substitution.inline 1.59% : 0.000004s : 4: substitution.j_node_and_user_rematch 1.95% : 0.000004s : 4: substitution.remove_not_recompute_node 2.06% : 0.000005s : 4: substitution.replace_old_param 4.62% : 0.000010s : 2: substitution.switch_simplify 6.67% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.013207 2 89.13% : 0.011771s : 1: type_inference.infer 10.87% : 0.001436s : 1: type_inference.specialize ------[replace.] 0.000120 14 44.33% : 0.000053s : 8: replace.inline 37.21% : 0.000045s : 2: replace.switch_simplify 18.47% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000193 14 88.68% : 0.000172s : 8: match.inline 4.72% : 0.000009s : 2: match.switch_simplify 6.60% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000247 1520 0.96% : 0.000002s : 17: predicate.accumulaten_eliminater 0.72% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.42% : 0.000001s : 8: predicate.addn_check_dump 1.08% : 0.000003s : 17: predicate.addn_zero_filter 0.88% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.10% : 0.000005s : 25: predicate.arithmetic_simplify 1.01% : 0.000002s : 17: predicate.cast_eliminate 0.52% : 0.000001s : 8: predicate.check_bprop_eliminate 0.43% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.47% : 0.000001s : 8: predicate.depend_value_elim 1.08% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.16% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.96% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.78% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.35% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 21: predicate.environ_get_depend_swap 1.70% : 0.000004s : 29: predicate.environ_get_eliminate 1.13% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.73% : 0.000004s : 29: predicate.exchange_switch_depend_value 2.66% : 0.000007s : 29: predicate.float_depend_g_call 0.43% : 0.000001s : 8: predicate.float_environ_get_switch 0.64% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.59% : 0.000001s : 8: predicate.get_grad_eliminate 0.17% : 0.000000s : 4: predicate.graph_param_transform 0.46% : 0.000001s : 8: predicate.incorporate_call 0.40% : 0.000001s : 8: predicate.incorporate_call_switch 6.44% : 0.000016s : 70: predicate.inline 0.65% : 0.000002s : 8: predicate.inline_without_move 0.26% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.65% : 0.000002s : 8: predicate.less_batch_normalization 1.92% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.77% : 0.000007s : 46: predicate.load_eliminater 0.70% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.11% : 0.000008s : 47: predicate.loop_unroll_before_grad 1.52% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.43% : 0.000001s : 8: predicate.merge_addn 0.51% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.92% : 0.000002s : 17: predicate.minmaximum_grad 0.97% : 0.000002s : 4: predicate.mutable_eliminate 0.28% : 0.000001s : 4: predicate.opt_reshape 0.43% : 0.000001s : 4: predicate.parallel_virtual_node 2.18% : 0.000005s : 29: predicate.partial_defer_inline 1.68% : 0.000004s : 25: predicate.partial_eliminate 1.10% : 0.000003s : 17: predicate.print_const_string_wrapper 0.50% : 0.000001s : 8: predicate.reduce_all_const_elim 1.25% : 0.000003s : 17: predicate.reduce_eliminate 2.65% : 0.000007s : 46: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 8: predicate.remove_not_recompute_node 1.38% : 0.000003s : 29: predicate.replace_applicator 0.42% : 0.000001s : 8: predicate.replace_old_param 0.22% : 0.000001s : 4: predicate.reset_defer_inline 1.01% : 0.000003s : 17: predicate.reshape_eliminate 0.54% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.31% : 0.000001s : 4: predicate.row_tensor_eliminate 0.64% : 0.000002s : 8: predicate.same_eliminate 0.39% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.62% : 0.000002s : 8: predicate.shard_identity_eliminate 0.69% : 0.000002s : 8: predicate.special_op_eliminate 0.60% : 0.000001s : 8: predicate.specialize_transform 0.67% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.56% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.25% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.84% : 0.000005s : 29: predicate.switch_defer_inline 2.34% : 0.000006s : 37: predicate.switch_layer_defer_inline 6.10% : 0.000015s : 92: predicate.switch_simplify 0.97% : 0.000002s : 17: predicate.tile_eliminate 1.05% : 0.000003s : 17: predicate.transpose_eliminate 1.48% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000008s : 37: predicate.tuple_list_get_item_eliminator 1.57% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000005s : 33: predicate.tuple_list_set_item_eliminator 1.77% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.48% : 0.000006s : 46: predicate.updatestate_pure_node_eliminater 3.23% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 4: predicate.value_based_eliminate 0.50% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.48% : 0.000001s : 8: predicate.virtual_output_eliminate 0.21% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000984 16 54.84% : 0.000540s : 6: func_graph_cloner_run.FuncGraphClonerGraph 45.16% : 0.000444s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.052518 192 0.01% : 0.000006s : 1: ForceFp32Comm 6.23% : 0.003273s : 1: add_attr 6.20% : 0.003258s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.10% : 0.000051s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.21% : 0.000111s : 1: auto_monad 0.05% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 31.46% : 0.016524s : 1: bootstrap 0.06% : 0.000031s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.04% : 0.000020s : 1: control_data_broadcast_order 0.02% : 0.000013s : 1: convert_after_rewriter 0.06% : 0.000030s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000021s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.12% : 0.000065s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000008s : 1: get_jit_bprop_graph 0.03% : 0.000015s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000008s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.05% : 0.000024s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 0.88% : 0.000464s : 1: loop_unroll 0.01% : 0.000006s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.08% : 0.000568s : 1: mutable_eliminate 0.02% : 0.000009s : 1: offloading_packed_experts 0.03% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000017s : 1: opt.transform.mutable_eliminate 2.91% : 0.001527s : 78: opt.transform.opt_a 0.06% : 0.000032s : 1: opt.transform.opt_after_cconv 0.05% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.21% : 0.000108s : 28: opt.transform.opt_b 0.10% : 0.000051s : 2: opt.transform.opt_trans_graph 0.08% : 0.000040s : 4: opt.transform.symbol_engine_opt 6.99% : 0.003673s : 1: opt_a 0.25% : 0.000131s : 1: opt_after_cconv 1.02% : 0.000534s : 1: opt_after_jit_grad 0.53% : 0.000280s : 1: opt_b 12.28% : 0.006447s : 1: optimize 0.04% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000011s : 1: order_py_execute_after_rewriter 0.05% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000007s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.10% : 0.000052s : 1: pre_auto_parallel 0.09% : 0.000047s : 1: py_interpret_to_execute 0.03% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.04% : 0.000019s : 1: remove_dup_value 0.95% : 0.000499s : 1: renormalize.infer 0.84% : 0.000441s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000042s : 1: rewriter_after_opt_a 0.24% : 0.000127s : 1: rewriter_before_opt_a 0.01% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000008s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.20% : 0.000103s : 1: symbol_engine_optimizer 0.18% : 0.000096s : 1: tuple_transform 25.36% : 0.013320s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:23.825.802 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0391779, [21] [bootstrap]: 0.00042942 [type_inference]: 0.0294663 [event_method]: 5.331e-05 [auto_monad]: 9.589e-05 [graph_reusing]: 7.46999e-06 [inline]: 2.41e-06 [add_attr]: 0.00310439, [1] [add_attr_with_inline]: 0.00309549, [1] [Cycle 1]: 6.69e-05, [2] [tag_attr]: 2.737e-05 [meta_addattr_fg_expand]: 8.32998e-06 [parallel-infer-symbol]: 3.03e-06 [pre_auto_parallel]: 4.052e-05 [insert-virtual-dataset]: 2.97002e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 1.94e-06 [pipeline_split]: 2.27999e-06 [optimize]: 0.00525267, [53] [py_interpret_to_execute]: 4.613e-05 [rewriter_before_opt_a]: 0.00011018 [opt_a]: 0.00313749, [2] [Cycle 1]: 0.00248094, [45] [expand_dump_flag]: 4.68001e-06 [switch_simplify]: 0.00011919 [loop_unroll]: 4.143e-05 [a_1]: 0.00083754 [with_stream_mark]: 1.429e-05 [recompute_prepare]: 8.81002e-06 [updatestate_depend_eliminate]: 4.32e-06 [updatestate_assign_eliminate]: 3.31001e-06 [updatestate_loads_eliminate]: 2.61999e-06 [parameter_eliminate]: 1.74e-06 [a_2]: 8.301e-05 [accelerated_algorithm]: 7.88001e-06 [shard]: 1.64e-06 [meta_shard_fg_expand]: 2.04999e-06 [shard_inline]: 6.73e-06 [merge_send_recv]: 8.11002e-06 [auto_parallel]: 7.04001e-06 [parallel]: 1.72e-05 [flash_sp]: 8.42e-06 [merge_comm]: 4.60001e-06 [allreduce_fusion]: 3.35e-06 [matmul_add_comm_reduction]: 9.13002e-06 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 8.58001e-06 [virtual_dataset]: 7.06999e-06 [get_grad_eliminate_]: 6.48998e-06 [virtual_output]: 6.74999e-06 [merge_forward]: 3.5e-06 [cell_reuse_recompute_pass]: 1.05001e-06 [offload_activation]: 9.70002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.25e-05 [merge_recompute_call_nodes]: 1.84e-06 [before_grad]: 1.035e-05 [set_forward_comm_id_for_comm_node_pass]: 3.41001e-06 [meta_fg_expand]: 3.37002e-06 [flash_sp_send_recv_attached]: 2.46e-06 [receive_attached]: 2.56998e-06 [after_resolve]: 1.158e-05 [a_after_grad]: 1.057e-05 [renormalize]: 0.00083181 [add_forward_monad_depend]: 5.34e-06 [auto_monad_grad]: 1.62999e-06 [auto_monad_eliminator]: 1.59e-05 [cse]: 2.666e-05 [a_3]: 4.971e-05 [Cycle 2]: 0.00064662, [45] [expand_dump_flag]: 1.10999e-06 [switch_simplify]: 8.23001e-06 [loop_unroll]: 6.35002e-06 [a_1]: 0.00013478 [with_stream_mark]: 1.191e-05 [recompute_prepare]: 6.51e-06 [updatestate_depend_eliminate]: 3.08e-06 [updatestate_assign_eliminate]: 2.29001e-06 [updatestate_loads_eliminate]: 2.69999e-06 [parameter_eliminate]: 1.14e-06 [a_2]: 7.459e-05 [accelerated_algorithm]: 6.46e-06 [shard]: 1.16002e-06 [meta_shard_fg_expand]: 1.52001e-06 [shard_inline]: 6.48998e-06 [merge_send_recv]: 5.14998e-06 [auto_parallel]: 5.16002e-06 [parallel]: 4.96002e-06 [flash_sp]: 3.5e-06 [merge_comm]: 3.01999e-06 [allreduce_fusion]: 2.98e-06 [matmul_add_comm_reduction]: 5.64e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 6.74999e-06 [virtual_dataset]: 6.02001e-06 [get_grad_eliminate_]: 5.90002e-06 [virtual_output]: 5.67001e-06 [merge_forward]: 2.68e-06 [cell_reuse_recompute_pass]: 1.47999e-06 [offload_activation]: 6.26e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.359e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 9.30001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.56001e-06 [meta_fg_expand]: 2.06e-06 [flash_sp_send_recv_attached]: 8.39995e-07 [receive_attached]: 1.05001e-06 [after_resolve]: 1.064e-05 [a_after_grad]: 9.54e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.20001e-06 [auto_monad_grad]: 8.50006e-07 [auto_monad_eliminator]: 7.16001e-06 [cse]: 1.261e-05 [a_3]: 3.777e-05 [py_interpret_to_execute_after_opt_a]: 8.55001e-06 [slice_cell_reuse_recomputed_activation]: 2.02999e-06 [rewriter_after_opt_a]: 3.348e-05 [convert_after_rewriter]: 6.56e-06 [order_py_execute_after_rewriter]: 5.19e-06 [mutable_eliminate]: 0.00050434 [opt_b]: 0.00023426, [1] [Cycle 1]: 0.00022834, [7] [b_1]: 0.00015516 [b_2]: 8.56997e-06 [updatestate_depend_eliminate]: 5.52001e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.24999e-06 [renormalize]: 6.39993e-07 [cse]: 1.677e-05 [optimize_parallel_all_gather_comm]: 1.719e-05 [overlap_param_gather]: 1.92001e-06 [cconv]: 2.573e-05 [loop_unroll]: 0.00044476 [opt_after_cconv]: 0.00010388, [1] [Cycle 1]: 9.791e-05, [7] [c_1]: 3.311e-05 [parameter_eliminate]: 2.80997e-06 [updatestate_depend_eliminate]: 5.47001e-06 [updatestate_assign_eliminate]: 2.37001e-06 [updatestate_loads_eliminate]: 2.27999e-06 [cse]: 1.735e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 1.345e-05 [tuple_transform]: 7.669e-05, [1] [Cycle 1]: 7.246e-05, [4] [d_1]: 4.498e-05 [none_parameter_eliminate]: 1.90001e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 6.93e-06 [partial_unused_args_eliminate]: 1.70001e-06 [add_recomputation]: 4.405e-05 [cse_after_recomputation]: 2.02e-05, [1] [Cycle 1]: 1.593e-05, [1] [cse]: 1.048e-05 [environ_conv]: 5.24998e-06 [swap_dp_allreduce_reducescatter]: 4.94998e-06 [bias_add_comm_swap]: 2.39001e-06 [label_micro_interleaved_index]: 4.52e-06 [label_fine_grained_interleaved_index]: 2.79999e-06 [merge_cast_opt]: 1.50999e-06 [slice_recompute_activation]: 2.35002e-06 [micro_interleaved_order_control]: 2.27999e-06 [assign_add_opt]: 1.10001e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 9.70002e-07 [full_micro_interleaved_order_control]: 2.26e-06 [reorder_send_recv_between_fp_bp]: 2.76999e-06 [comm_op_add_attrs]: 1.13001e-06 [add_comm_op_reuse_tag]: 1.07998e-06 [interleave_split_concat_branches]: 1.34e-06 [interleave_parallel_branches]: 1.18001e-06 [overlap_opt_shard_in_pipeline]: 1.15999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.66e-06 [control_data_broadcast_order]: 1.362e-05 [grouped_pairwise_exchange_alltoall]: 1.55001e-06 [offloading_packed_experts]: 3.97002e-06 [overlap_recompute_and_grad_model_parallel]: 4.17e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12e-06 [overlap_recompute_allgather_and_fa_grad]: 1.33002e-06 [overlap_recompute_comm]: 2.07001e-06 [overlap_grad_ring_attention]: 4.12e-06 [overlap_grad_flash_sp]: 1.732e-05 [begin_end_overlap_inline]: 6.50005e-07 [split_matmul_comm_elemetwise]: 2.28002e-06 [split_layernorm_comm]: 1.86e-06 [handle_group_info]: 1.06002e-06 [symbol_engine_optimizer]: 7.717e-05, [1] [Cycle 1]: 7.252e-05, [6] [build]: 3.14001e-06 [elim_shapecalc]: 1.056e-05 [elim_not_effective]: 1.37e-05 [opt_reshape]: 7.15e-06 [fold_const_symbol]: 9.96e-06 [renormalize]: 2.69996e-07 [detach_backward]: 2.02999e-06 [pipeline_parallel_scheduler]: 1.80001e-06 [auto_monad_reorder]: 1.605e-05 [get_jit_bprop_graph]: 1.47999e-06 [rewriter_after_jit_bprop_graph]: 3.97e-06 [opt_after_jit_grad]: 0.00049312 [validate]: 3.707e-05 Sums bootstrap : 0.000429s : 1.22% type_inference : 0.029466s : 83.95% event_method : 0.000053s : 0.15% auto_monad : 0.000096s : 0.27% graph_reusing : 0.000007s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000041s : 0.12% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000046s : 0.13% optimize.rewriter_before_opt_a : 0.000110s : 0.31% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000127s : 0.36% optimize.opt_a.loop_unroll : 0.000048s : 0.14% optimize.opt_a.a_1 : 0.000972s : 2.77% optimize.opt_a.with_stream_mark : 0.000026s : 0.07% optimize.opt_a.recompute_prepare : 0.000015s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000158s : 0.45% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.04% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.04% optimize.opt_a.merge_send_recv : 0.000013s : 0.04% optimize.opt_a.auto_parallel : 0.000012s : 0.03% optimize.opt_a.parallel : 0.000022s : 0.06% optimize.opt_a.flash_sp : 0.000012s : 0.03% optimize.opt_a.merge_comm : 0.000008s : 0.02% optimize.opt_a.allreduce_fusion : 0.000006s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.04% optimize.opt_a.virtual_dataset : 0.000013s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.04% optimize.opt_a.virtual_output : 0.000012s : 0.04% optimize.opt_a.merge_forward : 0.000006s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000016s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000020s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000022s : 0.06% optimize.opt_a.a_after_grad : 0.000020s : 0.06% optimize.opt_a.renormalize : 0.000832s : 2.37% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000002s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.07% optimize.opt_a.cse : 0.000039s : 0.11% optimize.opt_a.a_3 : 0.000087s : 0.25% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000033s : 0.10% optimize.convert_after_rewriter : 0.000007s : 0.02% optimize.order_py_execute_after_rewriter : 0.000005s : 0.01% optimize.mutable_eliminate : 0.000504s : 1.44% optimize.opt_b.b_1 : 0.000155s : 0.44% optimize.opt_b.b_2 : 0.000009s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000017s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.05% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000026s : 0.07% optimize.loop_unroll : 0.000445s : 1.27% optimize.opt_after_cconv.c_1 : 0.000033s : 0.09% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000017s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.04% optimize.tuple_transform.d_1 : 0.000045s : 0.13% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000044s : 0.13% optimize.cse_after_recomputation.cse : 0.000010s : 0.03% optimize.environ_conv : 0.000005s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000002s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000017s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000016s : 0.05% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000493s : 1.40% validate : 0.000037s : 0.11% Time group info: ------[substitution.] 0.000220 34 1.16% : 0.000003s : 2: substitution.elim_not_effective 0.64% : 0.000001s : 2: substitution.fold_const_symbol 2.72% : 0.000006s : 4: substitution.graph_param_transform 78.43% : 0.000173s : 8: substitution.inline 1.57% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.01% : 0.000004s : 4: substitution.remove_not_recompute_node 2.01% : 0.000004s : 4: substitution.replace_old_param 4.24% : 0.000009s : 2: substitution.switch_simplify 7.22% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.029396 2 95.36% : 0.028031s : 1: type_inference.infer 4.64% : 0.001365s : 1: type_inference.specialize ------[replace.] 0.000117 14 44.38% : 0.000052s : 8: replace.inline 35.78% : 0.000042s : 2: replace.switch_simplify 19.84% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000190 14 88.50% : 0.000168s : 8: match.inline 4.19% : 0.000008s : 2: match.switch_simplify 7.32% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000239 1520 1.02% : 0.000002s : 17: predicate.accumulaten_eliminater 0.59% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 1.06% : 0.000003s : 17: predicate.addn_zero_filter 0.92% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.00% : 0.000005s : 25: predicate.arithmetic_simplify 1.09% : 0.000003s : 17: predicate.cast_eliminate 0.50% : 0.000001s : 8: predicate.check_bprop_eliminate 0.44% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.50% : 0.000001s : 8: predicate.depend_value_elim 1.07% : 0.000003s : 17: predicate.dict_get_item_const_eliminator 1.22% : 0.000003s : 17: predicate.dict_get_item_eliminator 0.96% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.83% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 4: predicate.elim_not_effective 0.33% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.17% : 0.000003s : 21: predicate.environ_get_depend_swap 1.65% : 0.000004s : 29: predicate.environ_get_eliminate 1.17% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.72% : 0.000004s : 29: predicate.exchange_switch_depend_value 2.83% : 0.000007s : 29: predicate.float_depend_g_call 0.43% : 0.000001s : 8: predicate.float_environ_get_switch 0.62% : 0.000001s : 12: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 4: predicate.fold_const_symbol 0.58% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.51% : 0.000001s : 8: predicate.incorporate_call 0.41% : 0.000001s : 8: predicate.incorporate_call_switch 6.44% : 0.000015s : 70: predicate.inline 0.76% : 0.000002s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 8: predicate.less_batch_normalization 1.80% : 0.000004s : 29: predicate.list_to_tuple_eliminator_ 2.61% : 0.000006s : 46: predicate.load_eliminater 0.87% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.82% : 0.000007s : 47: predicate.loop_unroll_before_grad 1.79% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.44% : 0.000001s : 8: predicate.merge_addn 0.48% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.95% : 0.000002s : 17: predicate.minmaximum_grad 0.78% : 0.000002s : 4: predicate.mutable_eliminate 0.31% : 0.000001s : 4: predicate.opt_reshape 0.38% : 0.000001s : 4: predicate.parallel_virtual_node 2.17% : 0.000005s : 29: predicate.partial_defer_inline 1.67% : 0.000004s : 25: predicate.partial_eliminate 0.97% : 0.000002s : 17: predicate.print_const_string_wrapper 0.48% : 0.000001s : 8: predicate.reduce_all_const_elim 1.24% : 0.000003s : 17: predicate.reduce_eliminate 2.57% : 0.000006s : 46: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 8: predicate.remove_not_recompute_node 1.38% : 0.000003s : 29: predicate.replace_applicator 0.38% : 0.000001s : 8: predicate.replace_old_param 0.24% : 0.000001s : 4: predicate.reset_defer_inline 1.00% : 0.000002s : 17: predicate.reshape_eliminate 0.51% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 4: predicate.row_tensor_eliminate 0.59% : 0.000001s : 8: predicate.same_eliminate 0.38% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.67% : 0.000002s : 8: predicate.shard_identity_eliminate 0.66% : 0.000002s : 8: predicate.special_op_eliminate 0.59% : 0.000001s : 8: predicate.specialize_transform 0.72% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.66% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.27% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.85% : 0.000004s : 29: predicate.switch_defer_inline 2.31% : 0.000006s : 37: predicate.switch_layer_defer_inline 6.13% : 0.000015s : 92: predicate.switch_simplify 0.99% : 0.000002s : 17: predicate.tile_eliminate 1.01% : 0.000002s : 17: predicate.transpose_eliminate 1.47% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000004s : 25: predicate.tuple_list_get_item_depend_reorder 3.03% : 0.000007s : 37: predicate.tuple_list_get_item_eliminator 1.55% : 0.000004s : 25: predicate.tuple_list_get_set_item_eliminator 2.19% : 0.000005s : 33: predicate.tuple_list_set_item_eliminator 1.71% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.51% : 0.000006s : 46: predicate.updatestate_pure_node_eliminater 3.07% : 0.000007s : 54: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 4: predicate.value_based_eliminate 0.71% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.48% : 0.000001s : 8: predicate.virtual_output_eliminate 0.22% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.31% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000971 16 59.31% : 0.000576s : 6: func_graph_cloner_run.FuncGraphClonerGraph 40.69% : 0.000395s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.050030 192 0.01% : 0.000004s : 1: ForceFp32Comm 6.21% : 0.003109s : 1: add_attr 6.20% : 0.003099s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.10% : 0.000048s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.20% : 0.000102s : 1: auto_monad 0.04% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.92% : 0.000460s : 1: bootstrap 0.06% : 0.000029s : 1: cconv 0.04% : 0.000021s : 1: comm_op_add_attrs 0.03% : 0.000017s : 1: control_data_broadcast_order 0.02% : 0.000010s : 1: convert_after_rewriter 0.05% : 0.000023s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.12% : 0.000061s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 0.91% : 0.000453s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.03% : 0.000513s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000014s : 1: opt.transform.mutable_eliminate 2.97% : 0.001486s : 78: opt.transform.opt_a 0.06% : 0.000032s : 1: opt.transform.opt_after_cconv 0.05% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.26% : 0.000132s : 28: opt.transform.opt_b 0.10% : 0.000050s : 2: opt.transform.opt_trans_graph 0.08% : 0.000038s : 4: opt.transform.symbol_engine_opt 6.28% : 0.003141s : 1: opt_a 0.21% : 0.000107s : 1: opt_after_cconv 1.01% : 0.000503s : 1: opt_after_jit_grad 0.48% : 0.000238s : 1: opt_b 10.51% : 0.005257s : 1: optimize 0.04% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000008s : 1: order_py_execute_after_rewriter 0.04% : 0.000021s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.09% : 0.000045s : 1: pre_auto_parallel 0.10% : 0.000051s : 1: py_interpret_to_execute 0.02% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000017s : 1: remove_dup_value 0.85% : 0.000423s : 1: renormalize.infer 0.80% : 0.000400s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000037s : 1: rewriter_after_opt_a 0.23% : 0.000114s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.16% : 0.000080s : 1: symbol_engine_optimizer 0.16% : 0.000080s : 1: tuple_transform 58.94% : 0.029486s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:24.247.666 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:24.247.931 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0330127, [21] [bootstrap]: 0.00042968 [type_inference]: 0.0227918 [event_method]: 1.306e-05 [auto_monad]: 5.618e-05 [graph_reusing]: 5.63997e-06 [inline]: 2.75002e-06 [add_attr]: 0.00332112, [1] [add_attr_with_inline]: 0.00331065, [1] [Cycle 1]: 7.133e-05, [2] [tag_attr]: 1.442e-05 [meta_addattr_fg_expand]: 3.85e-06 [parallel-infer-symbol]: 3.61001e-06 [pre_auto_parallel]: 2.852e-05 [insert-virtual-dataset]: 2.43002e-06 [parallel-infer-symbol-second]: 9.10019e-07 [dataset_repeat_opt]: 2.61e-06 [pipeline_split]: 1.74e-06 [optimize]: 0.00519225, [53] [py_interpret_to_execute]: 2.461e-05 [rewriter_before_opt_a]: 5.355e-05 [opt_a]: 0.00280999, [2] [Cycle 1]: 0.00190379, [45] [expand_dump_flag]: 3.04001e-06 [switch_simplify]: 2.661e-05 [loop_unroll]: 1.525e-05 [a_1]: 0.00035652 [with_stream_mark]: 1.869e-05 [recompute_prepare]: 9.59999e-06 [updatestate_depend_eliminate]: 4.02e-06 [updatestate_assign_eliminate]: 3.00002e-06 [updatestate_loads_eliminate]: 2.84001e-06 [parameter_eliminate]: 2.21998e-06 [a_2]: 0.00013343 [accelerated_algorithm]: 2.117e-05 [shard]: 2.53e-06 [meta_shard_fg_expand]: 2.18998e-06 [shard_inline]: 7.33e-06 [merge_send_recv]: 8.89998e-06 [auto_parallel]: 7.14001e-06 [parallel]: 1.975e-05 [flash_sp]: 7.84002e-06 [merge_comm]: 3.7e-06 [allreduce_fusion]: 3.39001e-06 [matmul_add_comm_reduction]: 1.051e-05 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 7.87e-06 [virtual_dataset]: 6.64999e-06 [get_grad_eliminate_]: 6.33e-06 [virtual_output]: 6.61999e-06 [merge_forward]: 4.23001e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 1.124e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.587e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.07e-05 [set_forward_comm_id_for_comm_node_pass]: 4.02e-06 [meta_fg_expand]: 2.96001e-06 [flash_sp_send_recv_attached]: 5.08002e-06 [receive_attached]: 2.16e-06 [after_resolve]: 1.042e-05 [a_after_grad]: 9.27001e-06 [renormalize]: 0.00059718 [add_forward_monad_depend]: 5.74999e-06 [auto_monad_grad]: 2.60002e-06 [auto_monad_eliminator]: 1.558e-05 [cse]: 3.346e-05 [a_3]: 6.479e-05 [Cycle 2]: 0.00089249, [45] [expand_dump_flag]: 1.20999e-06 [switch_simplify]: 8.07003e-06 [loop_unroll]: 6.37001e-06 [a_1]: 0.00014236 [with_stream_mark]: 9.87999e-06 [recompute_prepare]: 7.9e-06 [updatestate_depend_eliminate]: 3.68e-06 [updatestate_assign_eliminate]: 2.84999e-06 [updatestate_loads_eliminate]: 2.88998e-06 [parameter_eliminate]: 1.01002e-06 [a_2]: 0.00011788 [accelerated_algorithm]: 1.04e-05 [shard]: 1.29e-06 [meta_shard_fg_expand]: 1.47999e-06 [shard_inline]: 7.38e-06 [merge_send_recv]: 5.75001e-06 [auto_parallel]: 5.94e-06 [parallel]: 6.26e-06 [flash_sp]: 3.68999e-06 [merge_comm]: 3.66001e-06 [allreduce_fusion]: 3.41999e-06 [matmul_add_comm_reduction]: 7.46999e-06 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 7.71001e-06 [virtual_dataset]: 6.43e-06 [get_grad_eliminate_]: 6.22001e-06 [virtual_output]: 5.96e-06 [merge_forward]: 3.45e-06 [cell_reuse_recompute_pass]: 1.52999e-06 [offload_activation]: 7.84002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.488e-05 [merge_recompute_call_nodes]: 1.23002e-06 [before_grad]: 1.005e-05 [set_forward_comm_id_for_comm_node_pass]: 4.90999e-06 [meta_fg_expand]: 2.41e-06 [flash_sp_send_recv_attached]: 1.25999e-06 [receive_attached]: 1.26002e-06 [after_resolve]: 9.79999e-06 [a_after_grad]: 9.39998e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.56998e-06 [auto_monad_grad]: 1.16997e-06 [auto_monad_eliminator]: 8.36002e-06 [cse]: 1.738e-05 [a_3]: 4.925e-05 [py_interpret_to_execute_after_opt_a]: 1.291e-05 [slice_cell_reuse_recomputed_activation]: 5.24e-06 [rewriter_after_opt_a]: 4.074e-05 [convert_after_rewriter]: 9.57999e-06 [order_py_execute_after_rewriter]: 8.55999e-06 [mutable_eliminate]: 0.00061729 [opt_b]: 0.0002842, [1] [Cycle 1]: 0.00027504, [7] [b_1]: 0.00017425 [b_2]: 7.95e-06 [updatestate_depend_eliminate]: 6.10002e-06 [updatestate_assign_eliminate]: 2.51998e-06 [updatestate_loads_eliminate]: 2.81999e-06 [renormalize]: 6.50005e-07 [cse]: 2.026e-05 [optimize_parallel_all_gather_comm]: 2.082e-05 [overlap_param_gather]: 4.82998e-06 [cconv]: 3.235e-05 [loop_unroll]: 0.00045351 [opt_after_cconv]: 0.00012594, [1] [Cycle 1]: 0.00011747, [7] [c_1]: 2.859e-05 [parameter_eliminate]: 3.26999e-06 [updatestate_depend_eliminate]: 5.12999e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.75002e-06 [cse]: 1.908e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 1.844e-05 [tuple_transform]: 9.487e-05, [1] [Cycle 1]: 8.674e-05, [4] [d_1]: 4.476e-05 [none_parameter_eliminate]: 1.77001e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 7.49002e-06 [partial_unused_args_eliminate]: 4.99e-06 [add_recomputation]: 4.889e-05 [cse_after_recomputation]: 2.812e-05, [1] [Cycle 1]: 2.103e-05, [1] [cse]: 1.236e-05 [environ_conv]: 8.38999e-06 [swap_dp_allreduce_reducescatter]: 8.1e-06 [bias_add_comm_swap]: 5.05001e-06 [label_micro_interleaved_index]: 7.46001e-06 [label_fine_grained_interleaved_index]: 5.49e-06 [merge_cast_opt]: 3.88999e-06 [slice_recompute_activation]: 4.62e-06 [micro_interleaved_order_control]: 4.58999e-06 [assign_add_opt]: 3.71001e-06 [ForceFp32Comm]: 3.09999e-06 [remove_cast_before_assign_add]: 3.58e-06 [full_micro_interleaved_order_control]: 4.85001e-06 [reorder_send_recv_between_fp_bp]: 5.05999e-06 [comm_op_add_attrs]: 3.79002e-06 [add_comm_op_reuse_tag]: 3.35e-06 [interleave_split_concat_branches]: 3.49001e-06 [interleave_parallel_branches]: 3.6e-06 [overlap_opt_shard_in_pipeline]: 4.23001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.22e-06 [control_data_broadcast_order]: 1.603e-05 [grouped_pairwise_exchange_alltoall]: 4.05998e-06 [offloading_packed_experts]: 6.68e-06 [overlap_recompute_and_grad_model_parallel]: 7.43999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.57002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.71001e-06 [overlap_recompute_comm]: 5.40001e-06 [overlap_grad_ring_attention]: 6.81001e-06 [overlap_grad_flash_sp]: 2.229e-05 [begin_end_overlap_inline]: 3.09999e-06 [split_matmul_comm_elemetwise]: 4.83001e-06 [split_layernorm_comm]: 4.12003e-06 [handle_group_info]: 3.57002e-06 [symbol_engine_optimizer]: 9.515e-05, [1] [Cycle 1]: 8.858e-05, [6] [build]: 2.64001e-06 [elim_shapecalc]: 9.55001e-06 [elim_not_effective]: 1.33e-05 [opt_reshape]: 6.94999e-06 [fold_const_symbol]: 9.89999e-06 [renormalize]: 2.19996e-07 [detach_backward]: 3.83001e-06 [pipeline_parallel_scheduler]: 1.87001e-06 [auto_monad_reorder]: 1.824e-05 [get_jit_bprop_graph]: 2.08998e-06 [rewriter_after_jit_bprop_graph]: 4.45999e-06 [opt_after_jit_grad]: 0.00049235 [validate]: 4.058e-05 Sums bootstrap : 0.000430s : 1.54% type_inference : 0.022792s : 81.77% event_method : 0.000013s : 0.05% auto_monad : 0.000056s : 0.20% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000029s : 0.10% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000025s : 0.09% optimize.rewriter_before_opt_a : 0.000054s : 0.19% optimize.opt_a.expand_dump_flag : 0.000004s : 0.02% optimize.opt_a.switch_simplify : 0.000035s : 0.12% optimize.opt_a.loop_unroll : 0.000022s : 0.08% optimize.opt_a.a_1 : 0.000499s : 1.79% optimize.opt_a.with_stream_mark : 0.000029s : 0.10% optimize.opt_a.recompute_prepare : 0.000017s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000251s : 0.90% optimize.opt_a.accelerated_algorithm : 0.000032s : 0.11% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000015s : 0.05% optimize.opt_a.merge_send_recv : 0.000015s : 0.05% optimize.opt_a.auto_parallel : 0.000013s : 0.05% optimize.opt_a.parallel : 0.000026s : 0.09% optimize.opt_a.flash_sp : 0.000012s : 0.04% optimize.opt_a.merge_comm : 0.000007s : 0.03% optimize.opt_a.allreduce_fusion : 0.000007s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.06% optimize.opt_a.virtual_dataset : 0.000013s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.05% optimize.opt_a.virtual_output : 0.000013s : 0.05% optimize.opt_a.merge_forward : 0.000008s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000021s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000020s : 0.07% optimize.opt_a.a_after_grad : 0.000019s : 0.07% optimize.opt_a.renormalize : 0.000597s : 2.14% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.03% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.09% optimize.opt_a.cse : 0.000051s : 0.18% optimize.opt_a.a_3 : 0.000114s : 0.41% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000041s : 0.15% optimize.convert_after_rewriter : 0.000010s : 0.03% optimize.order_py_execute_after_rewriter : 0.000009s : 0.03% optimize.mutable_eliminate : 0.000617s : 2.21% optimize.opt_b.b_1 : 0.000174s : 0.63% optimize.opt_b.b_2 : 0.000008s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000020s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.07% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000032s : 0.12% optimize.loop_unroll : 0.000454s : 1.63% optimize.opt_after_cconv.c_1 : 0.000029s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000019s : 0.07% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.07% optimize.tuple_transform.d_1 : 0.000045s : 0.16% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000049s : 0.18% optimize.cse_after_recomputation.cse : 0.000012s : 0.04% optimize.environ_conv : 0.000008s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.03% optimize.bias_add_comm_swap : 0.000005s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000003s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000007s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.02% optimize.overlap_grad_flash_sp : 0.000022s : 0.08% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000492s : 1.77% validate : 0.000041s : 0.15% Time group info: ------[substitution.] 0.000204 25 37.58% : 0.000077s : 4: substitution.arithmetic_simplify 1.01% : 0.000002s : 2: substitution.elim_not_effective 0.68% : 0.000001s : 2: substitution.fold_const_symbol 2.72% : 0.000006s : 3: substitution.graph_param_transform 46.05% : 0.000094s : 2: substitution.inline 1.89% : 0.000004s : 4: substitution.j_node_and_user_rematch 5.85% : 0.000012s : 2: substitution.less_batch_normalization 2.36% : 0.000005s : 4: substitution.remove_not_recompute_node 1.86% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.022739 2 97.99% : 0.022283s : 1: type_inference.infer 2.01% : 0.000456s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000092 2 100.00% : 0.000092s : 2: match.inline ------[predicate.] 0.000148 754 0.83% : 0.000001s : 7: predicate.accumulaten_eliminater 1.00% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.64% : 0.000001s : 6: predicate.addn_check_dump 0.78% : 0.000001s : 7: predicate.addn_zero_filter 0.72% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.69% : 0.000004s : 13: predicate.arithmetic_simplify 0.91% : 0.000001s : 7: predicate.cast_eliminate 0.80% : 0.000001s : 6: predicate.check_bprop_eliminate 0.64% : 0.000001s : 6: predicate.compare_switch_simplify 0.20% : 0.000000s : 3: predicate.const_output_eliminate 0.85% : 0.000001s : 6: predicate.depend_value_elim 0.70% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.98% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.65% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.19% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 3: predicate.elim_not_effective 0.46% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.00% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 10: predicate.environ_get_add_eliminate 0.95% : 0.000001s : 10: predicate.environ_get_depend_swap 1.90% : 0.000003s : 16: predicate.environ_get_eliminate 1.05% : 0.000002s : 10: predicate.environ_get_set_eliminate 0.93% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.08% : 0.000003s : 9: predicate.float_depend_g_call 0.65% : 0.000001s : 6: predicate.float_environ_get_switch 0.98% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 3: predicate.fold_const_symbol 0.83% : 0.000001s : 6: predicate.get_grad_eliminate 0.25% : 0.000000s : 3: predicate.graph_param_transform 0.81% : 0.000001s : 6: predicate.incorporate_call 0.64% : 0.000001s : 6: predicate.incorporate_call_switch 6.66% : 0.000010s : 34: predicate.inline 0.92% : 0.000001s : 6: predicate.inline_without_move 0.39% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.31% : 0.000002s : 6: predicate.less_batch_normalization 1.80% : 0.000003s : 13: predicate.list_to_tuple_eliminator_ 2.33% : 0.000003s : 20: predicate.load_eliminater 1.14% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.65% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.63% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.68% : 0.000001s : 6: predicate.merge_addn 0.69% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.77% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.64% : 0.000001s : 7: predicate.minmaximum_grad 1.58% : 0.000002s : 3: predicate.mutable_eliminate 0.43% : 0.000001s : 3: predicate.opt_reshape 0.47% : 0.000001s : 3: predicate.parallel_virtual_node 1.44% : 0.000002s : 9: predicate.partial_defer_inline 1.16% : 0.000002s : 10: predicate.partial_eliminate 0.97% : 0.000001s : 7: predicate.print_const_string_wrapper 0.82% : 0.000001s : 6: predicate.reduce_all_const_elim 1.08% : 0.000002s : 7: predicate.reduce_eliminate 1.84% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.81% : 0.000001s : 6: predicate.remove_not_recompute_node 1.04% : 0.000002s : 13: predicate.replace_applicator 0.60% : 0.000001s : 6: predicate.replace_old_param 0.32% : 0.000000s : 3: predicate.reset_defer_inline 0.81% : 0.000001s : 7: predicate.reshape_eliminate 0.74% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.58% : 0.000001s : 3: predicate.row_tensor_eliminate 1.12% : 0.000002s : 6: predicate.same_eliminate 0.58% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.06% : 0.000002s : 6: predicate.shard_identity_eliminate 0.79% : 0.000001s : 6: predicate.special_op_eliminate 1.00% : 0.000001s : 6: predicate.specialize_transform 1.06% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.99% : 0.000001s : 9: predicate.switch_defer_inline 1.98% : 0.000003s : 15: predicate.switch_layer_defer_inline 4.36% : 0.000006s : 32: predicate.switch_simplify 0.81% : 0.000001s : 7: predicate.tile_eliminate 0.78% : 0.000001s : 7: predicate.transpose_eliminate 1.55% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.84% : 0.000003s : 13: predicate.tuple_list_get_item_const_eliminator 1.39% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.30% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.29% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.57% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.55% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.86% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 3.00% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 3: predicate.value_based_eliminate 0.75% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.74% : 0.000001s : 6: predicate.virtual_output_eliminate 0.34% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.70% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000248 5 7.98% : 0.000020s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.02% : 0.000228s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.043192 192 0.01% : 0.000006s : 1: ForceFp32Comm 7.71% : 0.003331s : 1: add_attr 7.67% : 0.003315s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.12% : 0.000053s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.15% : 0.000065s : 1: auto_monad 0.06% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.10% : 0.000476s : 1: bootstrap 0.08% : 0.000035s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.04% : 0.000019s : 1: control_data_broadcast_order 0.03% : 0.000013s : 1: convert_after_rewriter 0.07% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000019s : 1: detach_backward 0.03% : 0.000011s : 1: environ_conv 0.05% : 0.000023s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.02% : 0.000009s : 1: get_jit_bprop_graph 0.03% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000010s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 1.06% : 0.000460s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 1.44% : 0.000624s : 1: mutable_eliminate 0.02% : 0.000009s : 1: offloading_packed_experts 0.03% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000017s : 1: opt.transform.mutable_eliminate 2.17% : 0.000936s : 78: opt.transform.opt_a 0.06% : 0.000027s : 1: opt.transform.opt_after_cconv 0.06% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.24% : 0.000104s : 28: opt.transform.opt_b 0.11% : 0.000049s : 2: opt.transform.opt_trans_graph 0.08% : 0.000036s : 4: opt.transform.symbol_engine_opt 6.51% : 0.002813s : 1: opt_a 0.30% : 0.000130s : 1: opt_after_cconv 1.16% : 0.000503s : 1: opt_after_jit_grad 0.67% : 0.000288s : 1: opt_b 12.82% : 0.005538s : 1: optimize 0.06% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000012s : 1: order_py_execute_after_rewriter 0.06% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000008s : 1: pipeline_split 0.08% : 0.000036s : 1: pre_auto_parallel 0.07% : 0.000028s : 1: py_interpret_to_execute 0.04% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.05% : 0.000021s : 1: remove_dup_value 0.75% : 0.000326s : 1: renormalize.infer 0.61% : 0.000262s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000044s : 1: rewriter_after_opt_a 0.13% : 0.000057s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.03% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.23% : 0.000098s : 1: symbol_engine_optimizer 0.23% : 0.000098s : 1: tuple_transform 52.86% : 0.022829s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:24.653.877 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0298852, [21] [bootstrap]: 0.00043887 [type_inference]: 0.00482658 [event_method]: 1.147e-05 [auto_monad]: 5.562e-05 [graph_reusing]: 6.02999e-06 [inline]: 2.39001e-06 [add_attr]: 0.0193985, [1] [add_attr_with_inline]: 0.0193863, [1] [Cycle 1]: 6.979e-05, [2] [tag_attr]: 1.805e-05 [meta_addattr_fg_expand]: 3.69002e-06 [parallel-infer-symbol]: 3.48e-06 [pre_auto_parallel]: 3.325e-05 [insert-virtual-dataset]: 2.59001e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 1.98002e-06 [pipeline_split]: 2.83e-06 [optimize]: 0.00442285, [53] [py_interpret_to_execute]: 2.478e-05 [rewriter_before_opt_a]: 5.58e-05 [opt_a]: 0.00242151, [2] [Cycle 1]: 0.00172925, [45] [expand_dump_flag]: 2.68998e-06 [switch_simplify]: 2.6e-05 [loop_unroll]: 1.358e-05 [a_1]: 0.00036045 [with_stream_mark]: 2.002e-05 [recompute_prepare]: 8.58001e-06 [updatestate_depend_eliminate]: 3.93001e-06 [updatestate_assign_eliminate]: 3.53e-06 [updatestate_loads_eliminate]: 3.30998e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 9.123e-05 [accelerated_algorithm]: 1.951e-05 [shard]: 2.27001e-06 [meta_shard_fg_expand]: 2.04999e-06 [shard_inline]: 6.74999e-06 [merge_send_recv]: 8.95999e-06 [auto_parallel]: 5.79999e-06 [parallel]: 2.048e-05 [flash_sp]: 7.50998e-06 [merge_comm]: 4.62e-06 [allreduce_fusion]: 3.45e-06 [matmul_add_comm_reduction]: 1.059e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 8.57998e-06 [virtual_dataset]: 6.38e-06 [get_grad_eliminate_]: 5.74e-06 [virtual_output]: 7.03998e-06 [merge_forward]: 4.07e-06 [cell_reuse_recompute_pass]: 1.41002e-06 [offload_activation]: 1.033e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.229e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.256e-05 [set_forward_comm_id_for_comm_node_pass]: 3.85e-06 [meta_fg_expand]: 2.58e-06 [flash_sp_send_recv_attached]: 4.97999e-06 [receive_attached]: 2.66999e-06 [after_resolve]: 1.016e-05 [a_after_grad]: 9.70002e-06 [renormalize]: 0.00065589 [add_forward_monad_depend]: 5.59e-06 [auto_monad_grad]: 2.59001e-06 [auto_monad_eliminator]: 1.424e-05 [cse]: 3.099e-05 [a_3]: 4.526e-05 [Cycle 2]: 0.00065691, [45] [expand_dump_flag]: 1.02e-06 [switch_simplify]: 7.41999e-06 [loop_unroll]: 5.92001e-06 [a_1]: 0.00012933 [with_stream_mark]: 9.38002e-06 [recompute_prepare]: 6.93998e-06 [updatestate_depend_eliminate]: 3.11001e-06 [updatestate_assign_eliminate]: 2.46998e-06 [updatestate_loads_eliminate]: 2.68003e-06 [parameter_eliminate]: 1.18001e-06 [a_2]: 8.101e-05 [accelerated_algorithm]: 9.84999e-06 [shard]: 1.05001e-06 [meta_shard_fg_expand]: 1.40001e-06 [shard_inline]: 6.25002e-06 [merge_send_recv]: 5.20001e-06 [auto_parallel]: 5.80002e-06 [parallel]: 5.62999e-06 [flash_sp]: 3.97e-06 [merge_comm]: 3.52002e-06 [allreduce_fusion]: 3.13998e-06 [matmul_add_comm_reduction]: 6.79001e-06 [allreduce_slice_to_reducescatter]: 5.49975e-07 [virtual_shard_identity]: 6.71999e-06 [virtual_dataset]: 5.44998e-06 [get_grad_eliminate_]: 5.49e-06 [virtual_output]: 5.45001e-06 [merge_forward]: 3.01001e-06 [cell_reuse_recompute_pass]: 1.33002e-06 [offload_activation]: 6.84001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.106e-05 [merge_recompute_call_nodes]: 9.50007e-07 [before_grad]: 9.22001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.95998e-06 [meta_fg_expand]: 2.02001e-06 [flash_sp_send_recv_attached]: 8.30012e-07 [receive_attached]: 1.34998e-06 [after_resolve]: 9.08002e-06 [a_after_grad]: 8.16002e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.77001e-06 [auto_monad_grad]: 1.19998e-06 [auto_monad_eliminator]: 7.46001e-06 [cse]: 1.528e-05 [a_3]: 3.492e-05 [py_interpret_to_execute_after_opt_a]: 1.209e-05 [slice_cell_reuse_recomputed_activation]: 2.01e-06 [rewriter_after_opt_a]: 3.475e-05 [convert_after_rewriter]: 6.18002e-06 [order_py_execute_after_rewriter]: 4.99e-06 [mutable_eliminate]: 0.00054322 [opt_b]: 0.00019906, [1] [Cycle 1]: 0.00019249, [7] [b_1]: 0.00011795 [b_2]: 7.3e-06 [updatestate_depend_eliminate]: 5.62999e-06 [updatestate_assign_eliminate]: 2.83003e-06 [updatestate_loads_eliminate]: 2.44001e-06 [renormalize]: 5.60016e-07 [cse]: 1.973e-05 [optimize_parallel_all_gather_comm]: 1.586e-05 [overlap_param_gather]: 2.17001e-06 [cconv]: 2.565e-05 [loop_unroll]: 0.00042433 [opt_after_cconv]: 0.00010101, [1] [Cycle 1]: 9.53e-05, [7] [c_1]: 2.741e-05 [parameter_eliminate]: 3.03e-06 [updatestate_depend_eliminate]: 5.64e-06 [updatestate_assign_eliminate]: 2.67001e-06 [updatestate_loads_eliminate]: 2.48e-06 [cse]: 1.847e-05 [renormalize]: 4.50003e-07 [remove_dup_value]: 1.636e-05 [tuple_transform]: 7.241e-05, [1] [Cycle 1]: 6.718e-05, [4] [d_1]: 3.99e-05 [none_parameter_eliminate]: 1.57999e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 6.64999e-06 [partial_unused_args_eliminate]: 1.94e-06 [add_recomputation]: 4.542e-05 [cse_after_recomputation]: 2.139e-05, [1] [Cycle 1]: 1.719e-05, [1] [cse]: 1.174e-05 [environ_conv]: 5.27999e-06 [swap_dp_allreduce_reducescatter]: 5.09e-06 [bias_add_comm_swap]: 2.83998e-06 [label_micro_interleaved_index]: 4.27e-06 [label_fine_grained_interleaved_index]: 2.85998e-06 [merge_cast_opt]: 1.60999e-06 [slice_recompute_activation]: 2.04999e-06 [micro_interleaved_order_control]: 2.27999e-06 [assign_add_opt]: 1.34998e-06 [ForceFp32Comm]: 1.15999e-06 [remove_cast_before_assign_add]: 1.03001e-06 [full_micro_interleaved_order_control]: 1.99e-06 [reorder_send_recv_between_fp_bp]: 2.63e-06 [comm_op_add_attrs]: 9.39996e-07 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.05001e-06 [interleave_parallel_branches]: 1.64e-06 [overlap_opt_shard_in_pipeline]: 1.50999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.27001e-06 [control_data_broadcast_order]: 1.229e-05 [grouped_pairwise_exchange_alltoall]: 1.82001e-06 [offloading_packed_experts]: 3.82998e-06 [overlap_recompute_and_grad_model_parallel]: 4.58999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.29e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32999e-06 [overlap_recompute_comm]: 2.58e-06 [overlap_grad_ring_attention]: 4.40999e-06 [overlap_grad_flash_sp]: 1.808e-05 [begin_end_overlap_inline]: 6.19999e-07 [split_matmul_comm_elemetwise]: 2.04999e-06 [split_layernorm_comm]: 1.55999e-06 [handle_group_info]: 1.03001e-06 [symbol_engine_optimizer]: 7.407e-05, [1] [Cycle 1]: 6.973e-05, [6] [build]: 2.29999e-06 [elim_shapecalc]: 9.72999e-06 [elim_not_effective]: 1.304e-05 [opt_reshape]: 6.78998e-06 [fold_const_symbol]: 9.57999e-06 [renormalize]: 2.29978e-07 [detach_backward]: 1.86e-06 [pipeline_parallel_scheduler]: 1.45999e-06 [auto_monad_reorder]: 1.567e-05 [get_jit_bprop_graph]: 1.86e-06 [rewriter_after_jit_bprop_graph]: 3.74002e-06 [opt_after_jit_grad]: 0.00046337 [validate]: 3.679e-05 Sums bootstrap : 0.000439s : 4.62% type_inference : 0.004827s : 50.83% event_method : 0.000011s : 0.12% auto_monad : 0.000056s : 0.59% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.04% pre_auto_parallel : 0.000033s : 0.35% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000003s : 0.03% optimize.py_interpret_to_execute : 0.000025s : 0.26% optimize.rewriter_before_opt_a : 0.000056s : 0.59% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.35% optimize.opt_a.loop_unroll : 0.000020s : 0.21% optimize.opt_a.a_1 : 0.000490s : 5.16% optimize.opt_a.with_stream_mark : 0.000029s : 0.31% optimize.opt_a.recompute_prepare : 0.000016s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000172s : 1.81% optimize.opt_a.accelerated_algorithm : 0.000029s : 0.31% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.14% optimize.opt_a.merge_send_recv : 0.000014s : 0.15% optimize.opt_a.auto_parallel : 0.000012s : 0.12% optimize.opt_a.parallel : 0.000026s : 0.27% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.09% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.03% optimize.opt_a.before_grad : 0.000022s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.06% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000019s : 0.20% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000656s : 6.91% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.23% optimize.opt_a.cse : 0.000046s : 0.49% optimize.opt_a.a_3 : 0.000080s : 0.84% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000035s : 0.37% optimize.convert_after_rewriter : 0.000006s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000543s : 5.72% optimize.opt_b.b_1 : 0.000118s : 1.24% optimize.opt_b.b_2 : 0.000007s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000020s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000026s : 0.27% optimize.loop_unroll : 0.000424s : 4.47% optimize.opt_after_cconv.c_1 : 0.000027s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.17% optimize.tuple_transform.d_1 : 0.000040s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000045s : 0.48% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000005s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.02% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000002s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.03% optimize.overlap_grad_ring_attention : 0.000004s : 0.05% optimize.overlap_grad_flash_sp : 0.000018s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000002s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.02% auto_monad_reorder : 0.000016s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000463s : 4.88% validate : 0.000037s : 0.39% Time group info: ------[substitution.] 0.000213 25 33.19% : 0.000071s : 4: substitution.arithmetic_simplify 0.92% : 0.000002s : 2: substitution.elim_not_effective 0.64% : 0.000001s : 2: substitution.fold_const_symbol 2.71% : 0.000006s : 3: substitution.graph_param_transform 51.59% : 0.000110s : 2: substitution.inline 2.23% : 0.000005s : 4: substitution.j_node_and_user_rematch 5.36% : 0.000011s : 2: substitution.less_batch_normalization 2.07% : 0.000004s : 4: substitution.remove_not_recompute_node 1.29% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004781 2 91.13% : 0.004357s : 1: type_inference.infer 8.87% : 0.000424s : 1: type_inference.specialize ------[replace.] 0.000024 2 100.00% : 0.000024s : 2: replace.inline ------[match.] 0.000108 2 100.00% : 0.000108s : 2: match.inline ------[predicate.] 0.000137 754 0.76% : 0.000001s : 7: predicate.accumulaten_eliminater 0.97% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.63% : 0.000001s : 6: predicate.addn_check_dump 0.81% : 0.000001s : 7: predicate.addn_zero_filter 0.63% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 3.05% : 0.000004s : 13: predicate.arithmetic_simplify 0.97% : 0.000001s : 7: predicate.cast_eliminate 0.84% : 0.000001s : 6: predicate.check_bprop_eliminate 0.67% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.81% : 0.000001s : 6: predicate.depend_value_elim 0.87% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.95% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.71% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.19% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.29% : 0.000000s : 3: predicate.elim_not_effective 0.44% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.08% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.20% : 0.000002s : 10: predicate.environ_get_depend_swap 1.67% : 0.000002s : 16: predicate.environ_get_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.97% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.88% : 0.000003s : 9: predicate.float_depend_g_call 0.68% : 0.000001s : 6: predicate.float_environ_get_switch 1.01% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.75% : 0.000001s : 6: predicate.get_grad_eliminate 0.29% : 0.000000s : 3: predicate.graph_param_transform 0.82% : 0.000001s : 6: predicate.incorporate_call 0.71% : 0.000001s : 6: predicate.incorporate_call_switch 6.57% : 0.000009s : 34: predicate.inline 1.16% : 0.000002s : 6: predicate.inline_without_move 0.45% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.16% : 0.000002s : 6: predicate.less_batch_normalization 1.59% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.07% : 0.000003s : 20: predicate.load_eliminater 1.21% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.66% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.75% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.73% : 0.000001s : 6: predicate.merge_addn 0.65% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.82% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.73% : 0.000001s : 7: predicate.minmaximum_grad 1.59% : 0.000002s : 3: predicate.mutable_eliminate 0.42% : 0.000001s : 3: predicate.opt_reshape 0.44% : 0.000001s : 3: predicate.parallel_virtual_node 1.60% : 0.000002s : 9: predicate.partial_defer_inline 1.20% : 0.000002s : 10: predicate.partial_eliminate 0.65% : 0.000001s : 7: predicate.print_const_string_wrapper 0.67% : 0.000001s : 6: predicate.reduce_all_const_elim 1.07% : 0.000001s : 7: predicate.reduce_eliminate 2.03% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.76% : 0.000001s : 6: predicate.remove_not_recompute_node 1.08% : 0.000001s : 13: predicate.replace_applicator 0.77% : 0.000001s : 6: predicate.replace_old_param 0.36% : 0.000001s : 3: predicate.reset_defer_inline 0.84% : 0.000001s : 7: predicate.reshape_eliminate 0.76% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.55% : 0.000001s : 3: predicate.row_tensor_eliminate 1.04% : 0.000001s : 6: predicate.same_eliminate 0.52% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.11% : 0.000002s : 6: predicate.shard_identity_eliminate 1.03% : 0.000001s : 6: predicate.special_op_eliminate 0.89% : 0.000001s : 6: predicate.specialize_transform 1.10% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.98% : 0.000001s : 9: predicate.switch_defer_inline 1.65% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.13% : 0.000006s : 32: predicate.switch_simplify 0.77% : 0.000001s : 7: predicate.tile_eliminate 0.80% : 0.000001s : 7: predicate.transpose_eliminate 1.45% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.31% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.40% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.32% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.96% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.40% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.91% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.81% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 3: predicate.value_based_eliminate 0.76% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.87% : 0.000001s : 6: predicate.virtual_output_eliminate 0.58% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000233 5 7.23% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.77% : 0.000216s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.055376 192 0.01% : 0.000004s : 1: ForceFp32Comm 35.04% : 0.019405s : 1: add_attr 35.02% : 0.019390s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000049s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.11% : 0.000061s : 1: auto_monad 0.03% : 0.000019s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.85% : 0.000468s : 1: bootstrap 0.05% : 0.000029s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000015s : 1: control_data_broadcast_order 0.02% : 0.000009s : 1: convert_after_rewriter 0.04% : 0.000024s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000008s : 1: environ_conv 0.03% : 0.000017s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000005s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.78% : 0.000432s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.00% : 0.000552s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000015s : 1: opt.transform.mutable_eliminate 1.61% : 0.000893s : 78: opt.transform.opt_a 0.05% : 0.000026s : 1: opt.transform.opt_after_cconv 0.04% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.17% : 0.000094s : 28: opt.transform.opt_b 0.08% : 0.000044s : 2: opt.transform.opt_trans_graph 0.06% : 0.000035s : 4: opt.transform.symbol_engine_opt 4.38% : 0.002426s : 1: opt_a 0.19% : 0.000105s : 1: opt_after_cconv 0.85% : 0.000472s : 1: opt_after_jit_grad 0.37% : 0.000202s : 1: opt_b 8.00% : 0.004428s : 1: optimize 0.03% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.04% : 0.000021s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000006s : 1: pipeline_split 0.07% : 0.000037s : 1: pre_auto_parallel 0.05% : 0.000028s : 1: py_interpret_to_execute 0.03% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000020s : 1: remove_dup_value 0.66% : 0.000364s : 1: renormalize.infer 0.51% : 0.000285s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000039s : 1: rewriter_after_opt_a 0.11% : 0.000060s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.14% : 0.000077s : 1: symbol_engine_optimizer 0.14% : 0.000075s : 1: tuple_transform 8.75% : 0.004844s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:25.181.350 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:25.181.651 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0309488, [21] [bootstrap]: 0.0004085 [type_inference]: 0.00471794 [event_method]: 1.2e-05 [auto_monad]: 5.708e-05 [graph_reusing]: 5.84e-06 [inline]: 2.06e-06 [add_attr]: 0.0030396, [1] [add_attr_with_inline]: 0.00303143, [1] [Cycle 1]: 6.334e-05, [2] [tag_attr]: 1.417e-05 [meta_addattr_fg_expand]: 4e-06 [parallel-infer-symbol]: 3.47002e-06 [pre_auto_parallel]: 2.406e-05 [insert-virtual-dataset]: 2.36e-06 [parallel-infer-symbol-second]: 6.69999e-07 [dataset_repeat_opt]: 2.35997e-06 [pipeline_split]: 1.62999e-06 [optimize]: 0.0050998, [53] [py_interpret_to_execute]: 2.031e-05 [rewriter_before_opt_a]: 5.059e-05 [opt_a]: 0.00282059, [2] [Cycle 1]: 0.00187561, [45] [expand_dump_flag]: 2.80002e-06 [switch_simplify]: 2.621e-05 [loop_unroll]: 1.573e-05 [a_1]: 0.0003765 [with_stream_mark]: 1.55e-05 [recompute_prepare]: 1.018e-05 [updatestate_depend_eliminate]: 4.39002e-06 [updatestate_assign_eliminate]: 4.08001e-06 [updatestate_loads_eliminate]: 3.8e-06 [parameter_eliminate]: 1.74e-06 [a_2]: 0.00014571 [accelerated_algorithm]: 2.194e-05 [shard]: 2.16e-06 [meta_shard_fg_expand]: 2.04e-06 [shard_inline]: 8.23999e-06 [merge_send_recv]: 9.05001e-06 [auto_parallel]: 6.71e-06 [parallel]: 1.918e-05 [flash_sp]: 7.6e-06 [merge_comm]: 4.65001e-06 [allreduce_fusion]: 4.45999e-06 [matmul_add_comm_reduction]: 1.225e-05 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 9.12001e-06 [virtual_dataset]: 7.71001e-06 [get_grad_eliminate_]: 7.46999e-06 [virtual_output]: 7.65e-06 [merge_forward]: 4.43001e-06 [cell_reuse_recompute_pass]: 1.48002e-06 [offload_activation]: 1.11e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.695e-05 [merge_recompute_call_nodes]: 1.60999e-06 [before_grad]: 1.269e-05 [set_forward_comm_id_for_comm_node_pass]: 4.63999e-06 [meta_fg_expand]: 3.07002e-06 [flash_sp_send_recv_attached]: 4.82e-06 [receive_attached]: 2.33002e-06 [after_resolve]: 1.148e-05 [a_after_grad]: 1.152e-05 [renormalize]: 0.0005418 [add_forward_monad_depend]: 5.23002e-06 [auto_monad_grad]: 1.82999e-06 [auto_monad_eliminator]: 1.606e-05 [cse]: 3.48e-05 [a_3]: 6.823e-05 [Cycle 2]: 0.0009307, [45] [expand_dump_flag]: 1.02e-06 [switch_simplify]: 8.42e-06 [loop_unroll]: 7.28e-06 [a_1]: 0.00016999 [with_stream_mark]: 9.15999e-06 [recompute_prepare]: 7.71999e-06 [updatestate_depend_eliminate]: 4.13999e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 2.89001e-06 [parameter_eliminate]: 1.20999e-06 [a_2]: 0.00013043 [accelerated_algorithm]: 1.09e-05 [shard]: 9.80013e-07 [meta_shard_fg_expand]: 1.70001e-06 [shard_inline]: 7.70998e-06 [merge_send_recv]: 5.71e-06 [auto_parallel]: 6.19001e-06 [parallel]: 5.15001e-06 [flash_sp]: 3.38e-06 [merge_comm]: 4.33999e-06 [allreduce_fusion]: 3.85e-06 [matmul_add_comm_reduction]: 7.33999e-06 [allreduce_slice_to_reducescatter]: 3.19997e-07 [virtual_shard_identity]: 8.47e-06 [virtual_dataset]: 6.91001e-06 [get_grad_eliminate_]: 6.79001e-06 [virtual_output]: 6.74001e-06 [merge_forward]: 3.46999e-06 [cell_reuse_recompute_pass]: 1.41002e-06 [offload_activation]: 7.15e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.524e-05 [merge_recompute_call_nodes]: 7.09988e-07 [before_grad]: 1.163e-05 [set_forward_comm_id_for_comm_node_pass]: 4.45e-06 [meta_fg_expand]: 3.15002e-06 [flash_sp_send_recv_attached]: 8.09989e-07 [receive_attached]: 1.03001e-06 [after_resolve]: 9.84999e-06 [a_after_grad]: 1.036e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.44e-06 [auto_monad_grad]: 8.00006e-07 [auto_monad_eliminator]: 8.97e-06 [cse]: 2.004e-05 [a_3]: 5.628e-05 [py_interpret_to_execute_after_opt_a]: 1.32e-05 [slice_cell_reuse_recomputed_activation]: 4.84e-06 [rewriter_after_opt_a]: 4.354e-05 [convert_after_rewriter]: 1.074e-05 [order_py_execute_after_rewriter]: 9.38002e-06 [mutable_eliminate]: 0.00049032 [opt_b]: 0.0002953, [1] [Cycle 1]: 0.00028694, [7] [b_1]: 0.00018762 [b_2]: 8.93002e-06 [updatestate_depend_eliminate]: 5.82999e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 2.84999e-06 [renormalize]: 3.89991e-07 [cse]: 2.287e-05 [optimize_parallel_all_gather_comm]: 2.046e-05 [overlap_param_gather]: 4.67e-06 [cconv]: 2.783e-05 [loop_unroll]: 0.00043967 [opt_after_cconv]: 0.00014113, [1] [Cycle 1]: 0.00013248, [7] [c_1]: 3.608e-05 [parameter_eliminate]: 3.28e-06 [updatestate_depend_eliminate]: 6.38e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 3.43999e-06 [cse]: 2.387e-05 [renormalize]: 4.60015e-07 [remove_dup_value]: 2.032e-05 [tuple_transform]: 9.608e-05, [1] [Cycle 1]: 8.887e-05, [4] [d_1]: 4.756e-05 [none_parameter_eliminate]: 1.84e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 8.35999e-06 [partial_unused_args_eliminate]: 4.75999e-06 [add_recomputation]: 5.728e-05 [cse_after_recomputation]: 3.246e-05, [1] [Cycle 1]: 2.526e-05, [1] [cse]: 1.646e-05 [environ_conv]: 9.12001e-06 [swap_dp_allreduce_reducescatter]: 8.66002e-06 [bias_add_comm_swap]: 4.97e-06 [label_micro_interleaved_index]: 6.68e-06 [label_fine_grained_interleaved_index]: 5.44e-06 [merge_cast_opt]: 4.32e-06 [slice_recompute_activation]: 4.65001e-06 [micro_interleaved_order_control]: 4.73001e-06 [assign_add_opt]: 3.48999e-06 [ForceFp32Comm]: 3.61999e-06 [remove_cast_before_assign_add]: 3.36001e-06 [full_micro_interleaved_order_control]: 4.43999e-06 [reorder_send_recv_between_fp_bp]: 5.12e-06 [comm_op_add_attrs]: 3.78999e-06 [add_comm_op_reuse_tag]: 3.21001e-06 [interleave_split_concat_branches]: 3.5e-06 [interleave_parallel_branches]: 3.35998e-06 [overlap_opt_shard_in_pipeline]: 3.75e-06 [overlap_opt_shard_grad_in_pipeline]: 4.15999e-06 [control_data_broadcast_order]: 1.752e-05 [grouped_pairwise_exchange_alltoall]: 3.98001e-06 [offloading_packed_experts]: 7.29001e-06 [overlap_recompute_and_grad_model_parallel]: 7.9e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.66999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.65e-06 [overlap_recompute_comm]: 5.56e-06 [overlap_grad_ring_attention]: 7.06001e-06 [overlap_grad_flash_sp]: 2.358e-05 [begin_end_overlap_inline]: 2.89999e-06 [split_matmul_comm_elemetwise]: 4.39002e-06 [split_layernorm_comm]: 3.9e-06 [handle_group_info]: 3.63e-06 [symbol_engine_optimizer]: 0.00010281, [1] [Cycle 1]: 9.601e-05, [6] [build]: 2.84001e-06 [elim_shapecalc]: 1.114e-05 [elim_not_effective]: 1.547e-05 [opt_reshape]: 8.04002e-06 [fold_const_symbol]: 1.254e-05 [renormalize]: 2.09984e-07 [detach_backward]: 3.2e-06 [pipeline_parallel_scheduler]: 2.01e-06 [auto_monad_reorder]: 2.212e-05 [get_jit_bprop_graph]: 1.17e-06 [rewriter_after_jit_bprop_graph]: 4.43999e-06 [opt_after_jit_grad]: 0.016912 [validate]: 5.656e-05 Sums bootstrap : 0.000408s : 1.56% type_inference : 0.004718s : 18.03% event_method : 0.000012s : 0.05% auto_monad : 0.000057s : 0.22% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000024s : 0.09% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000020s : 0.08% optimize.rewriter_before_opt_a : 0.000051s : 0.19% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000035s : 0.13% optimize.opt_a.loop_unroll : 0.000023s : 0.09% optimize.opt_a.a_1 : 0.000546s : 2.09% optimize.opt_a.with_stream_mark : 0.000025s : 0.09% optimize.opt_a.recompute_prepare : 0.000018s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000276s : 1.06% optimize.opt_a.accelerated_algorithm : 0.000033s : 0.13% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.06% optimize.opt_a.merge_send_recv : 0.000015s : 0.06% optimize.opt_a.auto_parallel : 0.000013s : 0.05% optimize.opt_a.parallel : 0.000024s : 0.09% optimize.opt_a.flash_sp : 0.000011s : 0.04% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.07% optimize.opt_a.virtual_dataset : 0.000015s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.05% optimize.opt_a.virtual_output : 0.000014s : 0.05% optimize.opt_a.merge_forward : 0.000008s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000018s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000024s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000021s : 0.08% optimize.opt_a.a_after_grad : 0.000022s : 0.08% optimize.opt_a.renormalize : 0.000542s : 2.07% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.03% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.10% optimize.opt_a.cse : 0.000055s : 0.21% optimize.opt_a.a_3 : 0.000125s : 0.48% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000044s : 0.17% optimize.convert_after_rewriter : 0.000011s : 0.04% optimize.order_py_execute_after_rewriter : 0.000009s : 0.04% optimize.mutable_eliminate : 0.000490s : 1.87% optimize.opt_b.b_1 : 0.000188s : 0.72% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.08% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000028s : 0.11% optimize.loop_unroll : 0.000440s : 1.68% optimize.opt_after_cconv.c_1 : 0.000036s : 0.14% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000024s : 0.09% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.08% optimize.tuple_transform.d_1 : 0.000048s : 0.18% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000057s : 0.22% optimize.cse_after_recomputation.cse : 0.000016s : 0.06% optimize.environ_conv : 0.000009s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000005s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000003s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000004s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000003s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000018s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000007s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000006s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.03% optimize.overlap_grad_flash_sp : 0.000024s : 0.09% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000004s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.08% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.016912s : 64.61% validate : 0.000057s : 0.22% Time group info: ------[substitution.] 0.000214 34 30.57% : 0.000065s : 4: substitution.arithmetic_simplify 8.84% : 0.000019s : 2: substitution.cast_eliminate 1.01% : 0.000002s : 3: substitution.elim_not_effective 0.78% : 0.000002s : 3: substitution.fold_const_symbol 2.96% : 0.000006s : 4: substitution.graph_param_transform 43.96% : 0.000094s : 2: substitution.inline 2.00% : 0.000004s : 6: substitution.j_node_and_user_rematch 5.32% : 0.000011s : 2: substitution.less_batch_normalization 2.84% : 0.000006s : 6: substitution.remove_not_recompute_node 1.72% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004678 2 90.55% : 0.004236s : 1: type_inference.infer 9.45% : 0.000442s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000092 2 100.00% : 0.000092s : 2: match.inline ------[predicate.] 0.000174 980 0.85% : 0.000001s : 9: predicate.accumulaten_eliminater 3.00% : 0.000005s : 4: predicate.ad_related_special_op_eliminate 0.66% : 0.000001s : 8: predicate.addn_check_dump 0.84% : 0.000001s : 9: predicate.addn_zero_filter 0.70% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.75% : 0.000005s : 17: predicate.arithmetic_simplify 0.84% : 0.000001s : 9: predicate.cast_eliminate 0.73% : 0.000001s : 8: predicate.check_bprop_eliminate 0.70% : 0.000001s : 8: predicate.compare_switch_simplify 0.23% : 0.000000s : 4: predicate.const_output_eliminate 0.84% : 0.000001s : 8: predicate.depend_value_elim 0.84% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.87% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.79% : 0.000001s : 9: predicate.dict_set_item_eliminator 2.42% : 0.000004s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 4: predicate.elim_not_effective 0.60% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.00% : 0.000002s : 13: predicate.environ_get_add_eliminate 0.99% : 0.000002s : 13: predicate.environ_get_depend_swap 1.83% : 0.000003s : 21: predicate.environ_get_eliminate 1.01% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.95% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.82% : 0.000003s : 11: predicate.float_depend_g_call 0.73% : 0.000001s : 8: predicate.float_environ_get_switch 1.01% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 4: predicate.fold_const_symbol 0.76% : 0.000001s : 8: predicate.get_grad_eliminate 0.22% : 0.000000s : 4: predicate.graph_param_transform 0.87% : 0.000002s : 8: predicate.incorporate_call 0.65% : 0.000001s : 8: predicate.incorporate_call_switch 6.53% : 0.000011s : 44: predicate.inline 1.00% : 0.000002s : 8: predicate.inline_without_move 0.38% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.27% : 0.000002s : 8: predicate.less_batch_normalization 1.54% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.06% : 0.000004s : 26: predicate.load_eliminater 1.05% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.46% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.72% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.88% : 0.000002s : 8: predicate.merge_addn 0.68% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.72% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.68% : 0.000001s : 9: predicate.minmaximum_grad 1.19% : 0.000002s : 4: predicate.mutable_eliminate 0.45% : 0.000001s : 4: predicate.opt_reshape 0.40% : 0.000001s : 4: predicate.parallel_virtual_node 1.13% : 0.000002s : 11: predicate.partial_defer_inline 1.28% : 0.000002s : 13: predicate.partial_eliminate 0.76% : 0.000001s : 9: predicate.print_const_string_wrapper 0.69% : 0.000001s : 8: predicate.reduce_all_const_elim 1.12% : 0.000002s : 9: predicate.reduce_eliminate 2.04% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 8: predicate.remove_not_recompute_node 1.18% : 0.000002s : 17: predicate.replace_applicator 0.63% : 0.000001s : 8: predicate.replace_old_param 0.33% : 0.000001s : 4: predicate.reset_defer_inline 0.86% : 0.000001s : 9: predicate.reshape_eliminate 0.79% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.53% : 0.000001s : 4: predicate.row_tensor_eliminate 0.90% : 0.000002s : 8: predicate.same_eliminate 0.52% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.95% : 0.000002s : 8: predicate.shard_identity_eliminate 0.96% : 0.000002s : 8: predicate.special_op_eliminate 0.94% : 0.000002s : 8: predicate.specialize_transform 1.03% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.88% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.91% : 0.000002s : 11: predicate.switch_defer_inline 1.65% : 0.000003s : 19: predicate.switch_layer_defer_inline 3.88% : 0.000007s : 39: predicate.switch_simplify 0.77% : 0.000001s : 9: predicate.tile_eliminate 0.95% : 0.000002s : 9: predicate.transpose_eliminate 1.55% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.34% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 2.87% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.36% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.52% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 1.98% : 0.000003s : 26: predicate.updatestate_pure_node_eliminater 2.91% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 4: predicate.value_based_eliminate 0.80% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.80% : 0.000001s : 8: predicate.virtual_output_eliminate 0.40% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000215 5 7.89% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.11% : 0.000198s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.040883 192 0.02% : 0.000006s : 1: ForceFp32Comm 7.46% : 0.003048s : 1: add_attr 7.42% : 0.003035s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.15% : 0.000061s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.16% : 0.000065s : 1: auto_monad 0.07% : 0.000030s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.10% : 0.000448s : 1: bootstrap 0.08% : 0.000031s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.05% : 0.000021s : 1: control_data_broadcast_order 0.03% : 0.000014s : 1: convert_after_rewriter 0.09% : 0.000036s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000017s : 1: detach_backward 0.03% : 0.000012s : 1: environ_conv 0.05% : 0.000021s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000007s : 1: get_jit_bprop_graph 0.03% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000009s : 1: label_micro_interleaved_index 1.09% : 0.000446s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 1.21% : 0.000496s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.04% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000016s : 1: opt.transform.mutable_eliminate 2.58% : 0.001057s : 78: opt.transform.opt_a 0.08% : 0.000035s : 1: opt.transform.opt_after_cconv 0.14% : 0.000056s : 1: opt.transform.opt_after_jit_grad 0.30% : 0.000124s : 28: opt.transform.opt_b 0.13% : 0.000054s : 2: opt.transform.opt_trans_graph 0.11% : 0.000043s : 4: opt.transform.symbol_engine_opt 6.91% : 0.002824s : 1: opt_a 0.35% : 0.000145s : 1: opt_after_cconv 41.42% : 0.016933s : 1: opt_after_jit_grad 0.73% : 0.000299s : 1: opt_b 13.30% : 0.005438s : 1: optimize 0.06% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000012s : 1: order_py_execute_after_rewriter 0.06% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.08% : 0.000032s : 1: pre_auto_parallel 0.06% : 0.000024s : 1: py_interpret_to_execute 0.04% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.06% : 0.000024s : 1: remove_dup_value 0.73% : 0.000299s : 1: renormalize.infer 0.58% : 0.000236s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000047s : 1: rewriter_after_opt_a 0.13% : 0.000054s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.03% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.26% : 0.000106s : 1: symbol_engine_optimizer 0.24% : 0.000099s : 1: tuple_transform 11.60% : 0.004742s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:25.593.930 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0301594, [21] [bootstrap]: 0.00044225 [type_inference]: 0.00482764 [event_method]: 1.269e-05 [auto_monad]: 5.885e-05 [graph_reusing]: 5.10001e-06 [inline]: 2.68003e-06 [add_attr]: 0.0192223, [1] [add_attr_with_inline]: 0.0192078, [1] [Cycle 1]: 6.208e-05, [2] [tag_attr]: 1.76e-05 [meta_addattr_fg_expand]: 3.93001e-06 [parallel-infer-symbol]: 3.14999e-06 [pre_auto_parallel]: 3.12e-05 [insert-virtual-dataset]: 2.46998e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 1.72001e-06 [pipeline_split]: 1.84e-06 [optimize]: 0.00486195, [53] [py_interpret_to_execute]: 2.274e-05 [rewriter_before_opt_a]: 5.713e-05 [opt_a]: 0.00264844, [2] [Cycle 1]: 0.00188028, [45] [expand_dump_flag]: 2.78998e-06 [switch_simplify]: 2.791e-05 [loop_unroll]: 1.586e-05 [a_1]: 0.00040907 [with_stream_mark]: 1.646e-05 [recompute_prepare]: 9.96e-06 [updatestate_depend_eliminate]: 4.99998e-06 [updatestate_assign_eliminate]: 3.88999e-06 [updatestate_loads_eliminate]: 3.72998e-06 [parameter_eliminate]: 1.76e-06 [a_2]: 0.00011293 [accelerated_algorithm]: 2.047e-05 [shard]: 2.06e-06 [meta_shard_fg_expand]: 2.19999e-06 [shard_inline]: 7.71999e-06 [merge_send_recv]: 9.04e-06 [auto_parallel]: 7.63001e-06 [parallel]: 1.926e-05 [flash_sp]: 7.16001e-06 [merge_comm]: 4.55001e-06 [allreduce_fusion]: 4.79e-06 [matmul_add_comm_reduction]: 1.11e-05 [allreduce_slice_to_reducescatter]: 9.49978e-07 [virtual_shard_identity]: 9.42001e-06 [virtual_dataset]: 7.87e-06 [get_grad_eliminate_]: 7.59002e-06 [virtual_output]: 7.43999e-06 [merge_forward]: 4.42e-06 [cell_reuse_recompute_pass]: 1.25999e-06 [offload_activation]: 1.155e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.458e-05 [merge_recompute_call_nodes]: 1.51002e-06 [before_grad]: 1.337e-05 [set_forward_comm_id_for_comm_node_pass]: 4.64998e-06 [meta_fg_expand]: 3.80998e-06 [flash_sp_send_recv_attached]: 4.53001e-06 [receive_attached]: 2.36e-06 [after_resolve]: 1.179e-05 [a_after_grad]: 1.205e-05 [renormalize]: 0.00069702 [add_forward_monad_depend]: 5.19998e-06 [auto_monad_grad]: 2.52001e-06 [auto_monad_eliminator]: 1.548e-05 [cse]: 3.541e-05 [a_3]: 5.728e-05 [Cycle 2]: 0.00075865, [45] [expand_dump_flag]: 1.10999e-06 [switch_simplify]: 8.45999e-06 [loop_unroll]: 7.45e-06 [a_1]: 0.00017203 [with_stream_mark]: 9.15001e-06 [recompute_prepare]: 7.73999e-06 [updatestate_depend_eliminate]: 4.25e-06 [updatestate_assign_eliminate]: 2.97002e-06 [updatestate_loads_eliminate]: 2.69999e-06 [parameter_eliminate]: 1.04998e-06 [a_2]: 9.993e-05 [accelerated_algorithm]: 1.079e-05 [shard]: 9.79984e-07 [meta_shard_fg_expand]: 1.86e-06 [shard_inline]: 7.72998e-06 [merge_send_recv]: 5.90002e-06 [auto_parallel]: 6.79999e-06 [parallel]: 4.53999e-06 [flash_sp]: 3.92998e-06 [merge_comm]: 4.17998e-06 [allreduce_fusion]: 4.17e-06 [matmul_add_comm_reduction]: 7.45e-06 [allreduce_slice_to_reducescatter]: 3.30008e-07 [virtual_shard_identity]: 8.38001e-06 [virtual_dataset]: 6.93e-06 [get_grad_eliminate_]: 6.72002e-06 [virtual_output]: 6.38e-06 [merge_forward]: 3.41001e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 7.31999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.311e-05 [merge_recompute_call_nodes]: 9.50007e-07 [before_grad]: 1.119e-05 [set_forward_comm_id_for_comm_node_pass]: 4.37e-06 [meta_fg_expand]: 2.94001e-06 [flash_sp_send_recv_attached]: 8.80013e-07 [receive_attached]: 9.80013e-07 [after_resolve]: 1.036e-05 [a_after_grad]: 1.001e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.20999e-06 [auto_monad_grad]: 8.70001e-07 [auto_monad_eliminator]: 8.40001e-06 [cse]: 1.943e-05 [a_3]: 4.338e-05 [py_interpret_to_execute_after_opt_a]: 9.02999e-06 [slice_cell_reuse_recomputed_activation]: 2.29001e-06 [rewriter_after_opt_a]: 4.051e-05 [convert_after_rewriter]: 7.16999e-06 [order_py_execute_after_rewriter]: 6.02999e-06 [mutable_eliminate]: 0.00065188 [opt_b]: 0.00023626, [1] [Cycle 1]: 0.00023035, [7] [b_1]: 0.00014798 [b_2]: 8.85999e-06 [updatestate_depend_eliminate]: 6.23998e-06 [updatestate_assign_eliminate]: 3.18998e-06 [updatestate_loads_eliminate]: 3.61999e-06 [renormalize]: 5.39992e-07 [cse]: 2.379e-05 [optimize_parallel_all_gather_comm]: 1.815e-05 [overlap_param_gather]: 1.84998e-06 [cconv]: 2.581e-05 [loop_unroll]: 0.00043777 [opt_after_cconv]: 0.00011345, [1] [Cycle 1]: 0.00010822, [7] [c_1]: 3.426e-05 [parameter_eliminate]: 2.54001e-06 [updatestate_depend_eliminate]: 6.21998e-06 [updatestate_assign_eliminate]: 3.06001e-06 [updatestate_loads_eliminate]: 3.3e-06 [cse]: 2.348e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.669e-05 [tuple_transform]: 8.066e-05, [1] [Cycle 1]: 7.616e-05, [4] [d_1]: 4.715e-05 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 2.49973e-07 [switch_simplify]: 7.9e-06 [partial_unused_args_eliminate]: 1.70001e-06 [add_recomputation]: 5.408e-05 [cse_after_recomputation]: 2.66e-05, [1] [Cycle 1]: 2.189e-05, [1] [cse]: 1.628e-05 [environ_conv]: 6.95002e-06 [swap_dp_allreduce_reducescatter]: 5.67001e-06 [bias_add_comm_swap]: 2.64999e-06 [label_micro_interleaved_index]: 4.10998e-06 [label_fine_grained_interleaved_index]: 2.66999e-06 [merge_cast_opt]: 1.29e-06 [slice_recompute_activation]: 2.26e-06 [micro_interleaved_order_control]: 2.32999e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 1.13001e-06 [remove_cast_before_assign_add]: 1.25001e-06 [full_micro_interleaved_order_control]: 2.14999e-06 [reorder_send_recv_between_fp_bp]: 2.51998e-06 [comm_op_add_attrs]: 9.5999e-07 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.12e-06 [overlap_opt_shard_in_pipeline]: 1.42e-06 [overlap_opt_shard_grad_in_pipeline]: 1.89e-06 [control_data_broadcast_order]: 1.439e-05 [grouped_pairwise_exchange_alltoall]: 1.48002e-06 [offloading_packed_experts]: 4.55001e-06 [overlap_recompute_and_grad_model_parallel]: 5.07999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.40999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.67001e-06 [overlap_recompute_comm]: 2.29001e-06 [overlap_grad_ring_attention]: 4.60999e-06 [overlap_grad_flash_sp]: 1.964e-05 [begin_end_overlap_inline]: 7.60017e-07 [split_matmul_comm_elemetwise]: 2.15002e-06 [split_layernorm_comm]: 1.80001e-06 [handle_group_info]: 1.07998e-06 [symbol_engine_optimizer]: 8.219e-05, [1] [Cycle 1]: 7.772e-05, [6] [build]: 3.18998e-06 [elim_shapecalc]: 1.122e-05 [elim_not_effective]: 1.46e-05 [opt_reshape]: 8.18999e-06 [fold_const_symbol]: 1.234e-05 [renormalize]: 1.8999e-07 [detach_backward]: 2.06e-06 [pipeline_parallel_scheduler]: 1.50999e-06 [auto_monad_reorder]: 2.066e-05 [get_jit_bprop_graph]: 1.62001e-06 [rewriter_after_jit_bprop_graph]: 3.97e-06 [opt_after_jit_grad]: 0.00046584 [validate]: 4.045e-05 Sums bootstrap : 0.000442s : 4.43% type_inference : 0.004828s : 48.39% event_method : 0.000013s : 0.13% auto_monad : 0.000059s : 0.59% graph_reusing : 0.000005s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000031s : 0.31% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000023s : 0.23% optimize.rewriter_before_opt_a : 0.000057s : 0.57% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000036s : 0.36% optimize.opt_a.loop_unroll : 0.000023s : 0.23% optimize.opt_a.a_1 : 0.000581s : 5.82% optimize.opt_a.with_stream_mark : 0.000026s : 0.26% optimize.opt_a.recompute_prepare : 0.000018s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000213s : 2.13% optimize.opt_a.accelerated_algorithm : 0.000031s : 0.31% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.15% optimize.opt_a.merge_send_recv : 0.000015s : 0.15% optimize.opt_a.auto_parallel : 0.000014s : 0.14% optimize.opt_a.parallel : 0.000024s : 0.24% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.19% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.18% optimize.opt_a.virtual_dataset : 0.000015s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.14% optimize.opt_a.virtual_output : 0.000014s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.07% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.05% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000022s : 0.22% optimize.opt_a.a_after_grad : 0.000022s : 0.22% optimize.opt_a.renormalize : 0.000697s : 6.99% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.24% optimize.opt_a.cse : 0.000055s : 0.55% optimize.opt_a.a_3 : 0.000101s : 1.01% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000041s : 0.41% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000652s : 6.53% optimize.opt_b.b_1 : 0.000148s : 1.48% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.04% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000024s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000026s : 0.26% optimize.loop_unroll : 0.000438s : 4.39% optimize.opt_after_cconv.c_1 : 0.000034s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.24% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.17% optimize.tuple_transform.d_1 : 0.000047s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000054s : 0.54% optimize.cse_after_recomputation.cse : 0.000016s : 0.16% optimize.environ_conv : 0.000007s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.02% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000020s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000021s : 0.21% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000466s : 4.67% validate : 0.000040s : 0.41% Time group info: ------[substitution.] 0.000234 34 27.98% : 0.000066s : 4: substitution.arithmetic_simplify 8.13% : 0.000019s : 2: substitution.cast_eliminate 0.93% : 0.000002s : 3: substitution.elim_not_effective 0.71% : 0.000002s : 3: substitution.fold_const_symbol 2.74% : 0.000006s : 4: substitution.graph_param_transform 48.46% : 0.000113s : 2: substitution.inline 1.95% : 0.000005s : 6: substitution.j_node_and_user_rematch 4.83% : 0.000011s : 2: substitution.less_batch_normalization 2.46% : 0.000006s : 6: substitution.remove_not_recompute_node 1.83% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004785 2 90.43% : 0.004328s : 1: type_inference.infer 9.57% : 0.000458s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000112 2 100.00% : 0.000112s : 2: match.inline ------[predicate.] 0.000177 980 0.89% : 0.000002s : 9: predicate.accumulaten_eliminater 0.97% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.68% : 0.000001s : 8: predicate.addn_check_dump 0.79% : 0.000001s : 9: predicate.addn_zero_filter 0.84% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.72% : 0.000005s : 17: predicate.arithmetic_simplify 0.89% : 0.000002s : 9: predicate.cast_eliminate 0.72% : 0.000001s : 8: predicate.check_bprop_eliminate 0.72% : 0.000001s : 8: predicate.compare_switch_simplify 0.22% : 0.000000s : 4: predicate.const_output_eliminate 0.75% : 0.000001s : 8: predicate.depend_value_elim 0.83% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 1.12% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.80% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.22% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.30% : 0.000001s : 4: predicate.elim_not_effective 0.59% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.08% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.18% : 0.000002s : 13: predicate.environ_get_depend_swap 1.82% : 0.000003s : 21: predicate.environ_get_eliminate 1.05% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.92% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.77% : 0.000003s : 11: predicate.float_depend_g_call 0.65% : 0.000001s : 8: predicate.float_environ_get_switch 1.02% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 4: predicate.fold_const_symbol 0.82% : 0.000001s : 8: predicate.get_grad_eliminate 0.25% : 0.000000s : 4: predicate.graph_param_transform 0.81% : 0.000001s : 8: predicate.incorporate_call 0.65% : 0.000001s : 8: predicate.incorporate_call_switch 6.16% : 0.000011s : 44: predicate.inline 1.00% : 0.000002s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.20% : 0.000002s : 8: predicate.less_batch_normalization 1.72% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.20% : 0.000004s : 26: predicate.load_eliminater 1.12% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.52% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.71% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.77% : 0.000001s : 8: predicate.merge_addn 0.72% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.78% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.77% : 0.000001s : 9: predicate.minmaximum_grad 1.43% : 0.000003s : 4: predicate.mutable_eliminate 0.42% : 0.000001s : 4: predicate.opt_reshape 0.47% : 0.000001s : 4: predicate.parallel_virtual_node 1.18% : 0.000002s : 11: predicate.partial_defer_inline 1.22% : 0.000002s : 13: predicate.partial_eliminate 0.80% : 0.000001s : 9: predicate.print_const_string_wrapper 0.97% : 0.000002s : 8: predicate.reduce_all_const_elim 1.13% : 0.000002s : 9: predicate.reduce_eliminate 2.17% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 8: predicate.remove_not_recompute_node 1.14% : 0.000002s : 17: predicate.replace_applicator 0.62% : 0.000001s : 8: predicate.replace_old_param 0.29% : 0.000001s : 4: predicate.reset_defer_inline 0.95% : 0.000002s : 9: predicate.reshape_eliminate 0.84% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.56% : 0.000001s : 4: predicate.row_tensor_eliminate 0.99% : 0.000002s : 8: predicate.same_eliminate 0.50% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.90% : 0.000002s : 8: predicate.shard_identity_eliminate 0.85% : 0.000001s : 8: predicate.special_op_eliminate 0.90% : 0.000002s : 8: predicate.specialize_transform 1.19% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.91% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.93% : 0.000002s : 11: predicate.switch_defer_inline 1.73% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.08% : 0.000007s : 39: predicate.switch_simplify 0.81% : 0.000001s : 9: predicate.tile_eliminate 0.84% : 0.000001s : 9: predicate.transpose_eliminate 1.57% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.77% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.01% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.54% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.43% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.60% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.10% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 3.03% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.58% : 0.000001s : 4: predicate.value_based_eliminate 0.89% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.84% : 0.000001s : 8: predicate.virtual_output_eliminate 0.35% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.60% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000246 5 7.42% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.58% : 0.000228s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.056201 192 0.01% : 0.000004s : 1: ForceFp32Comm 34.21% : 0.019229s : 1: add_attr 34.19% : 0.019213s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.10% : 0.000058s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.11% : 0.000064s : 1: auto_monad 0.04% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.84% : 0.000471s : 1: bootstrap 0.05% : 0.000029s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000018s : 1: control_data_broadcast_order 0.02% : 0.000011s : 1: convert_after_rewriter 0.05% : 0.000030s : 1: cse_after_recomputation 0.01% : 0.000004s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000010s : 1: environ_conv 0.03% : 0.000019s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000008s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.79% : 0.000446s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.18% : 0.000661s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000018s : 1: opt.transform.mutable_eliminate 1.92% : 0.001080s : 78: opt.transform.opt_a 0.06% : 0.000033s : 1: opt.transform.opt_after_cconv 0.05% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.22% : 0.000125s : 28: opt.transform.opt_b 0.09% : 0.000053s : 2: opt.transform.opt_trans_graph 0.08% : 0.000043s : 4: opt.transform.symbol_engine_opt 4.72% : 0.002651s : 1: opt_a 0.21% : 0.000117s : 1: opt_after_cconv 0.84% : 0.000475s : 1: opt_after_jit_grad 0.43% : 0.000240s : 1: opt_b 8.66% : 0.004866s : 1: optimize 0.04% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000023s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.06% : 0.000035s : 1: pre_auto_parallel 0.05% : 0.000026s : 1: py_interpret_to_execute 0.02% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000020s : 1: remove_dup_value 0.72% : 0.000404s : 1: renormalize.infer 0.51% : 0.000285s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000044s : 1: rewriter_after_opt_a 0.11% : 0.000061s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000085s : 1: symbol_engine_optimizer 0.15% : 0.000084s : 1: tuple_transform 8.62% : 0.004844s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:26.105.365 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:26.105.643 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0322949, [21] [bootstrap]: 0.00042987 [type_inference]: 0.0218589 [event_method]: 1.623e-05 [auto_monad]: 5.933e-05 [graph_reusing]: 5.39e-06 [inline]: 2.39001e-06 [add_attr]: 0.003491, [1] [add_attr_with_inline]: 0.00348216, [1] [Cycle 1]: 7.094e-05, [2] [tag_attr]: 1.6e-05 [meta_addattr_fg_expand]: 4.33999e-06 [parallel-infer-symbol]: 3.5e-06 [pre_auto_parallel]: 2.642e-05 [insert-virtual-dataset]: 2.79999e-06 [parallel-infer-symbol-second]: 8.39995e-07 [dataset_repeat_opt]: 2.07999e-06 [pipeline_split]: 1.96998e-06 [optimize]: 0.00525729, [53] [py_interpret_to_execute]: 2.153e-05 [rewriter_before_opt_a]: 5.255e-05 [opt_a]: 0.00292076, [2] [Cycle 1]: 0.0019668, [45] [expand_dump_flag]: 3.44001e-06 [switch_simplify]: 2.727e-05 [loop_unroll]: 1.593e-05 [a_1]: 0.0003846 [with_stream_mark]: 1.669e-05 [recompute_prepare]: 1.136e-05 [updatestate_depend_eliminate]: 4.67e-06 [updatestate_assign_eliminate]: 4.07e-06 [updatestate_loads_eliminate]: 4.15e-06 [parameter_eliminate]: 1.86e-06 [a_2]: 0.00014572 [accelerated_algorithm]: 2.095e-05 [shard]: 2.34001e-06 [meta_shard_fg_expand]: 2.17999e-06 [shard_inline]: 9.00001e-06 [merge_send_recv]: 8.92e-06 [auto_parallel]: 6.60002e-06 [parallel]: 1.958e-05 [flash_sp]: 8.35001e-06 [merge_comm]: 4.75001e-06 [allreduce_fusion]: 4.47e-06 [matmul_add_comm_reduction]: 1.183e-05 [allreduce_slice_to_reducescatter]: 8.40024e-07 [virtual_shard_identity]: 9.86998e-06 [virtual_dataset]: 7.77e-06 [get_grad_eliminate_]: 7.73001e-06 [virtual_output]: 7.55e-06 [merge_forward]: 4.55001e-06 [cell_reuse_recompute_pass]: 1.25001e-06 [offload_activation]: 1.156e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.758e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.345e-05 [set_forward_comm_id_for_comm_node_pass]: 5.14998e-06 [meta_fg_expand]: 3.26999e-06 [flash_sp_send_recv_attached]: 4.55001e-06 [receive_attached]: 2.78e-06 [after_resolve]: 1.235e-05 [a_after_grad]: 1.197e-05 [renormalize]: 0.00059842 [add_forward_monad_depend]: 5.38002e-06 [auto_monad_grad]: 2.31e-06 [auto_monad_eliminator]: 1.7e-05 [cse]: 3.736e-05 [a_3]: 7.049e-05 [Cycle 2]: 0.00093941, [45] [expand_dump_flag]: 1.10001e-06 [switch_simplify]: 8.64e-06 [loop_unroll]: 6.98e-06 [a_1]: 0.00017157 [with_stream_mark]: 9.99999e-06 [recompute_prepare]: 7.88999e-06 [updatestate_depend_eliminate]: 4.50001e-06 [updatestate_assign_eliminate]: 3.06999e-06 [updatestate_loads_eliminate]: 2.90998e-06 [parameter_eliminate]: 1.47001e-06 [a_2]: 0.00012769 [accelerated_algorithm]: 1.067e-05 [shard]: 9.29984e-07 [meta_shard_fg_expand]: 1.75001e-06 [shard_inline]: 7.49002e-06 [merge_send_recv]: 6.53998e-06 [auto_parallel]: 6.47001e-06 [parallel]: 5.59e-06 [flash_sp]: 3.81001e-06 [merge_comm]: 4.28001e-06 [allreduce_fusion]: 4.2e-06 [matmul_add_comm_reduction]: 8.89e-06 [allreduce_slice_to_reducescatter]: 5.60016e-07 [virtual_shard_identity]: 8.22e-06 [virtual_dataset]: 7.08998e-06 [get_grad_eliminate_]: 7.08e-06 [virtual_output]: 6.63998e-06 [merge_forward]: 3.75e-06 [cell_reuse_recompute_pass]: 1.55001e-06 [offload_activation]: 7.93001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.525e-05 [merge_recompute_call_nodes]: 9.00007e-07 [before_grad]: 1.164e-05 [set_forward_comm_id_for_comm_node_pass]: 4.74e-06 [meta_fg_expand]: 3.04999e-06 [flash_sp_send_recv_attached]: 1.03001e-06 [receive_attached]: 1.15999e-06 [after_resolve]: 9.32001e-06 [a_after_grad]: 1.023e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.30001e-06 [auto_monad_grad]: 9.50007e-07 [auto_monad_eliminator]: 9.25999e-06 [cse]: 2.207e-05 [a_3]: 5.703e-05 [py_interpret_to_execute_after_opt_a]: 1.373e-05 [slice_cell_reuse_recomputed_activation]: 4.59998e-06 [rewriter_after_opt_a]: 4.463e-05 [convert_after_rewriter]: 1.084e-05 [order_py_execute_after_rewriter]: 9.05999e-06 [mutable_eliminate]: 0.00053093 [opt_b]: 0.00030615, [1] [Cycle 1]: 0.00029727, [7] [b_1]: 0.00019126 [b_2]: 9.48002e-06 [updatestate_depend_eliminate]: 7.24001e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 2.81999e-06 [renormalize]: 4.30009e-07 [cse]: 2.551e-05 [optimize_parallel_all_gather_comm]: 2.046e-05 [overlap_param_gather]: 4.74e-06 [cconv]: 3.003e-05 [loop_unroll]: 0.00043582 [opt_after_cconv]: 0.00013909, [1] [Cycle 1]: 0.00013048, [7] [c_1]: 3.488e-05 [parameter_eliminate]: 2.94001e-06 [updatestate_depend_eliminate]: 6.28998e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 3.18e-06 [cse]: 2.341e-05 [renormalize]: 3.30008e-07 [remove_dup_value]: 1.958e-05 [tuple_transform]: 9.784e-05, [1] [Cycle 1]: 9.053e-05, [4] [d_1]: 4.923e-05 [none_parameter_eliminate]: 1.72999e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 8.69e-06 [partial_unused_args_eliminate]: 5.02999e-06 [add_recomputation]: 5.811e-05 [cse_after_recomputation]: 3.116e-05, [1] [Cycle 1]: 2.436e-05, [1] [cse]: 1.543e-05 [environ_conv]: 9.79e-06 [swap_dp_allreduce_reducescatter]: 9.07999e-06 [bias_add_comm_swap]: 5.15001e-06 [label_micro_interleaved_index]: 7.1e-06 [label_fine_grained_interleaved_index]: 5.30001e-06 [merge_cast_opt]: 3.74002e-06 [slice_recompute_activation]: 4.60001e-06 [micro_interleaved_order_control]: 4.75999e-06 [assign_add_opt]: 3.58999e-06 [ForceFp32Comm]: 3.46001e-06 [remove_cast_before_assign_add]: 3.54002e-06 [full_micro_interleaved_order_control]: 4.40999e-06 [reorder_send_recv_between_fp_bp]: 5.32001e-06 [comm_op_add_attrs]: 3.49001e-06 [add_comm_op_reuse_tag]: 3.33998e-06 [interleave_split_concat_branches]: 3.57002e-06 [interleave_parallel_branches]: 3.58e-06 [overlap_opt_shard_in_pipeline]: 3.73999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.28999e-06 [control_data_broadcast_order]: 1.799e-05 [grouped_pairwise_exchange_alltoall]: 3.85e-06 [offloading_packed_experts]: 7.53e-06 [overlap_recompute_and_grad_model_parallel]: 7.88999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.81999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.63999e-06 [overlap_recompute_comm]: 4.95001e-06 [overlap_grad_ring_attention]: 6.84001e-06 [overlap_grad_flash_sp]: 2.497e-05 [begin_end_overlap_inline]: 3.3e-06 [split_matmul_comm_elemetwise]: 4.60999e-06 [split_layernorm_comm]: 4.28999e-06 [handle_group_info]: 3.28998e-06 [symbol_engine_optimizer]: 0.00010236, [1] [Cycle 1]: 9.546e-05, [6] [build]: 2.98e-06 [elim_shapecalc]: 1.067e-05 [elim_not_effective]: 1.55e-05 [opt_reshape]: 8.28001e-06 [fold_const_symbol]: 1.222e-05 [renormalize]: 2.89991e-07 [detach_backward]: 3.25e-06 [pipeline_parallel_scheduler]: 2.01e-06 [auto_monad_reorder]: 2.238e-05 [get_jit_bprop_graph]: 2.08002e-06 [rewriter_after_jit_bprop_graph]: 4.06001e-06 [opt_after_jit_grad]: 0.00047589 [validate]: 4.095e-05 Sums bootstrap : 0.000430s : 1.59% type_inference : 0.021859s : 80.84% event_method : 0.000016s : 0.06% auto_monad : 0.000059s : 0.22% graph_reusing : 0.000005s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000026s : 0.10% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000022s : 0.08% optimize.rewriter_before_opt_a : 0.000053s : 0.19% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000036s : 0.13% optimize.opt_a.loop_unroll : 0.000023s : 0.08% optimize.opt_a.a_1 : 0.000556s : 2.06% optimize.opt_a.with_stream_mark : 0.000027s : 0.10% optimize.opt_a.recompute_prepare : 0.000019s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000273s : 1.01% optimize.opt_a.accelerated_algorithm : 0.000032s : 0.12% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.06% optimize.opt_a.merge_send_recv : 0.000015s : 0.06% optimize.opt_a.auto_parallel : 0.000013s : 0.05% optimize.opt_a.parallel : 0.000025s : 0.09% optimize.opt_a.flash_sp : 0.000012s : 0.04% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.07% optimize.opt_a.virtual_dataset : 0.000015s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.05% optimize.opt_a.virtual_output : 0.000014s : 0.05% optimize.opt_a.merge_forward : 0.000008s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.04% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000022s : 0.08% optimize.opt_a.a_after_grad : 0.000022s : 0.08% optimize.opt_a.renormalize : 0.000598s : 2.21% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.10% optimize.opt_a.cse : 0.000059s : 0.22% optimize.opt_a.a_3 : 0.000128s : 0.47% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000045s : 0.17% optimize.convert_after_rewriter : 0.000011s : 0.04% optimize.order_py_execute_after_rewriter : 0.000009s : 0.03% optimize.mutable_eliminate : 0.000531s : 1.96% optimize.opt_b.b_1 : 0.000191s : 0.71% optimize.opt_b.b_2 : 0.000009s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000026s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.08% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000030s : 0.11% optimize.loop_unroll : 0.000436s : 1.61% optimize.opt_after_cconv.c_1 : 0.000035s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000023s : 0.09% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.07% optimize.tuple_transform.d_1 : 0.000049s : 0.18% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000058s : 0.21% optimize.cse_after_recomputation.cse : 0.000015s : 0.06% optimize.environ_conv : 0.000010s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000005s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000004s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000003s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000018s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.01% optimize.offloading_packed_experts : 0.000008s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.03% optimize.overlap_grad_flash_sp : 0.000025s : 0.09% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.08% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000476s : 1.76% validate : 0.000041s : 0.15% Time group info: ------[substitution.] 0.000222 34 29.95% : 0.000066s : 4: substitution.arithmetic_simplify 8.71% : 0.000019s : 2: substitution.cast_eliminate 0.96% : 0.000002s : 3: substitution.elim_not_effective 0.79% : 0.000002s : 3: substitution.fold_const_symbol 2.82% : 0.000006s : 4: substitution.graph_param_transform 45.02% : 0.000100s : 2: substitution.inline 2.00% : 0.000004s : 6: substitution.j_node_and_user_rematch 5.05% : 0.000011s : 2: substitution.less_batch_normalization 3.03% : 0.000007s : 6: substitution.remove_not_recompute_node 1.67% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.021801 2 97.55% : 0.021267s : 1: type_inference.infer 2.45% : 0.000534s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000098 2 100.00% : 0.000098s : 2: match.inline ------[predicate.] 0.000169 980 0.99% : 0.000002s : 9: predicate.accumulaten_eliminater 0.85% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.71% : 0.000001s : 8: predicate.addn_check_dump 0.89% : 0.000002s : 9: predicate.addn_zero_filter 0.74% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.60% : 0.000004s : 17: predicate.arithmetic_simplify 0.94% : 0.000002s : 9: predicate.cast_eliminate 0.82% : 0.000001s : 8: predicate.check_bprop_eliminate 0.68% : 0.000001s : 8: predicate.compare_switch_simplify 0.23% : 0.000000s : 4: predicate.const_output_eliminate 0.76% : 0.000001s : 8: predicate.depend_value_elim 0.87% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 1.01% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.76% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.10% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 4: predicate.elim_not_effective 0.49% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.39% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.04% : 0.000002s : 13: predicate.environ_get_depend_swap 2.02% : 0.000003s : 21: predicate.environ_get_eliminate 1.03% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.90% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.77% : 0.000003s : 11: predicate.float_depend_g_call 0.74% : 0.000001s : 8: predicate.float_environ_get_switch 1.02% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.24% : 0.000000s : 4: predicate.fold_const_symbol 0.83% : 0.000001s : 8: predicate.get_grad_eliminate 0.26% : 0.000000s : 4: predicate.graph_param_transform 0.82% : 0.000001s : 8: predicate.incorporate_call 0.66% : 0.000001s : 8: predicate.incorporate_call_switch 6.31% : 0.000011s : 44: predicate.inline 1.02% : 0.000002s : 8: predicate.inline_without_move 0.38% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.39% : 0.000002s : 8: predicate.less_batch_normalization 1.63% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.14% : 0.000004s : 26: predicate.load_eliminater 1.07% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.58% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.79% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.77% : 0.000001s : 8: predicate.merge_addn 0.70% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.73% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.73% : 0.000001s : 9: predicate.minmaximum_grad 1.16% : 0.000002s : 4: predicate.mutable_eliminate 0.48% : 0.000001s : 4: predicate.opt_reshape 0.43% : 0.000001s : 4: predicate.parallel_virtual_node 1.23% : 0.000002s : 11: predicate.partial_defer_inline 1.29% : 0.000002s : 13: predicate.partial_eliminate 0.78% : 0.000001s : 9: predicate.print_const_string_wrapper 0.76% : 0.000001s : 8: predicate.reduce_all_const_elim 1.10% : 0.000002s : 9: predicate.reduce_eliminate 2.11% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 8: predicate.remove_not_recompute_node 1.11% : 0.000002s : 17: predicate.replace_applicator 0.63% : 0.000001s : 8: predicate.replace_old_param 0.34% : 0.000001s : 4: predicate.reset_defer_inline 0.87% : 0.000001s : 9: predicate.reshape_eliminate 0.97% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 4: predicate.row_tensor_eliminate 1.05% : 0.000002s : 8: predicate.same_eliminate 0.53% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.01% : 0.000002s : 8: predicate.shard_identity_eliminate 1.03% : 0.000002s : 8: predicate.special_op_eliminate 1.01% : 0.000002s : 8: predicate.specialize_transform 1.22% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.93% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.94% : 0.000002s : 11: predicate.switch_defer_inline 1.81% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.15% : 0.000007s : 39: predicate.switch_simplify 0.79% : 0.000001s : 9: predicate.tile_eliminate 0.78% : 0.000001s : 9: predicate.transpose_eliminate 1.62% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.72% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.12% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.52% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.45% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.62% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.07% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 3.07% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 4: predicate.value_based_eliminate 0.78% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.76% : 0.000001s : 8: predicate.virtual_output_eliminate 0.33% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.50% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000238 5 8.01% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.99% : 0.000219s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.042897 192 0.01% : 0.000006s : 1: ForceFp32Comm 8.16% : 0.003500s : 1: add_attr 8.13% : 0.003486s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.14% : 0.000062s : 1: add_recomputation 0.01% : 0.000006s : 1: assign_add_opt 0.16% : 0.000068s : 1: auto_monad 0.07% : 0.000030s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.10% : 0.000473s : 1: bootstrap 0.08% : 0.000033s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.05% : 0.000021s : 1: control_data_broadcast_order 0.03% : 0.000014s : 1: convert_after_rewriter 0.08% : 0.000034s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000018s : 1: detach_backward 0.03% : 0.000013s : 1: environ_conv 0.06% : 0.000028s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.03% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 1.03% : 0.000442s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 1.25% : 0.000537s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.04% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000017s : 1: opt.transform.mutable_eliminate 2.49% : 0.001068s : 78: opt.transform.opt_a 0.08% : 0.000033s : 1: opt.transform.opt_after_cconv 0.06% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.30% : 0.000127s : 28: opt.transform.opt_b 0.13% : 0.000056s : 2: opt.transform.opt_trans_graph 0.10% : 0.000043s : 4: opt.transform.symbol_engine_opt 6.82% : 0.002924s : 1: opt_a 0.33% : 0.000142s : 1: opt_after_cconv 1.13% : 0.000486s : 1: opt_after_jit_grad 0.72% : 0.000310s : 1: opt_b 13.05% : 0.005598s : 1: optimize 0.06% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000012s : 1: order_py_execute_after_rewriter 0.07% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.08% : 0.000034s : 1: pre_auto_parallel 0.06% : 0.000025s : 1: py_interpret_to_execute 0.04% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.05% : 0.000023s : 1: remove_dup_value 0.82% : 0.000350s : 1: renormalize.infer 0.56% : 0.000241s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000048s : 1: rewriter_after_opt_a 0.13% : 0.000056s : 1: rewriter_before_opt_a 0.02% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.03% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.25% : 0.000105s : 1: symbol_engine_optimizer 0.24% : 0.000101s : 1: tuple_transform 51.05% : 0.021898s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:26.510.228 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0307414, [21] [bootstrap]: 0.00041774 [type_inference]: 0.0216667 [event_method]: 1.282e-05 [auto_monad]: 5.67e-05 [graph_reusing]: 4.97999e-06 [inline]: 2.74001e-06 [add_attr]: 0.00331055, [1] [add_attr_with_inline]: 0.00330075, [1] [Cycle 1]: 5.355e-05, [2] [tag_attr]: 1.649e-05 [meta_addattr_fg_expand]: 4.27e-06 [parallel-infer-symbol]: 3.22002e-06 [pre_auto_parallel]: 2.551e-05 [insert-virtual-dataset]: 2.39999e-06 [parallel-infer-symbol-second]: 8.49977e-07 [dataset_repeat_opt]: 2.39001e-06 [pipeline_split]: 1.72999e-06 [optimize]: 0.00453537, [53] [py_interpret_to_execute]: 1.811e-05 [rewriter_before_opt_a]: 5.437e-05 [opt_a]: 0.00250973, [2] [Cycle 1]: 0.00175496, [45] [expand_dump_flag]: 2.88e-06 [switch_simplify]: 2.962e-05 [loop_unroll]: 1.8e-05 [a_1]: 0.00039698 [with_stream_mark]: 1.45e-05 [recompute_prepare]: 1.014e-05 [updatestate_depend_eliminate]: 5.51998e-06 [updatestate_assign_eliminate]: 4.39002e-06 [updatestate_loads_eliminate]: 4.18999e-06 [parameter_eliminate]: 2.24001e-06 [a_2]: 0.00012113 [accelerated_algorithm]: 2.155e-05 [shard]: 2.31e-06 [meta_shard_fg_expand]: 1.97001e-06 [shard_inline]: 7.88999e-06 [merge_send_recv]: 9.12001e-06 [auto_parallel]: 6.94001e-06 [parallel]: 1.896e-05 [flash_sp]: 7.77002e-06 [merge_comm]: 5.05001e-06 [allreduce_fusion]: 4.22e-06 [matmul_add_comm_reduction]: 1.244e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 9.00001e-06 [virtual_dataset]: 8.05999e-06 [get_grad_eliminate_]: 7.2e-06 [virtual_output]: 7.53999e-06 [merge_forward]: 4.30999e-06 [cell_reuse_recompute_pass]: 1.28002e-06 [offload_activation]: 1.096e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.462e-05 [merge_recompute_call_nodes]: 1.41002e-06 [before_grad]: 1.323e-05 [set_forward_comm_id_for_comm_node_pass]: 4.35e-06 [meta_fg_expand]: 3.34001e-06 [flash_sp_send_recv_attached]: 4.63001e-06 [receive_attached]: 2.42001e-06 [after_resolve]: 1.196e-05 [a_after_grad]: 1.157e-05 [renormalize]: 0.00056742 [add_forward_monad_depend]: 5.15001e-06 [auto_monad_grad]: 1.97001e-06 [auto_monad_eliminator]: 1.632e-05 [cse]: 3.51e-05 [a_3]: 5.452e-05 [Cycle 2]: 0.000745, [45] [expand_dump_flag]: 9.30013e-07 [switch_simplify]: 8.88002e-06 [loop_unroll]: 7.11001e-06 [a_1]: 0.00016783 [with_stream_mark]: 1.068e-05 [recompute_prepare]: 7.38e-06 [updatestate_depend_eliminate]: 4.67e-06 [updatestate_assign_eliminate]: 2.98998e-06 [updatestate_loads_eliminate]: 2.71999e-06 [parameter_eliminate]: 1.21002e-06 [a_2]: 9.922e-05 [accelerated_algorithm]: 1.051e-05 [shard]: 9.5999e-07 [meta_shard_fg_expand]: 1.37e-06 [shard_inline]: 7.41001e-06 [merge_send_recv]: 6.07999e-06 [auto_parallel]: 5.96e-06 [parallel]: 4.99e-06 [flash_sp]: 3.31999e-06 [merge_comm]: 4.05e-06 [allreduce_fusion]: 4.05e-06 [matmul_add_comm_reduction]: 7.55e-06 [allreduce_slice_to_reducescatter]: 3.30008e-07 [virtual_shard_identity]: 7.9e-06 [virtual_dataset]: 6.79001e-06 [get_grad_eliminate_]: 6.51999e-06 [virtual_output]: 6.51999e-06 [merge_forward]: 3.35e-06 [cell_reuse_recompute_pass]: 1.57001e-06 [offload_activation]: 6.96999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.286e-05 [merge_recompute_call_nodes]: 7.39994e-07 [before_grad]: 1.123e-05 [set_forward_comm_id_for_comm_node_pass]: 4.47e-06 [meta_fg_expand]: 2.51998e-06 [flash_sp_send_recv_attached]: 8.79983e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 9.27001e-06 [a_after_grad]: 9.83002e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.35001e-06 [auto_monad_grad]: 9.20001e-07 [auto_monad_eliminator]: 8.40001e-06 [cse]: 1.985e-05 [a_3]: 4.232e-05 [py_interpret_to_execute_after_opt_a]: 9.70002e-06 [slice_cell_reuse_recomputed_activation]: 1.90001e-06 [rewriter_after_opt_a]: 3.791e-05 [convert_after_rewriter]: 7.46999e-06 [order_py_execute_after_rewriter]: 5.87001e-06 [mutable_eliminate]: 0.00048803 [opt_b]: 0.00023201, [1] [Cycle 1]: 0.00022603, [7] [b_1]: 0.00014557 [b_2]: 8.54e-06 [updatestate_depend_eliminate]: 6.18998e-06 [updatestate_assign_eliminate]: 2.94999e-06 [updatestate_loads_eliminate]: 3.39001e-06 [renormalize]: 5.39992e-07 [cse]: 2.391e-05 [optimize_parallel_all_gather_comm]: 1.737e-05 [overlap_param_gather]: 2.36e-06 [cconv]: 2.443e-05 [loop_unroll]: 0.00042768 [opt_after_cconv]: 0.00011592, [1] [Cycle 1]: 0.00010998, [7] [c_1]: 3.419e-05 [parameter_eliminate]: 3.68999e-06 [updatestate_depend_eliminate]: 5.99e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 3.11999e-06 [cse]: 2.42e-05 [renormalize]: 5.09986e-07 [remove_dup_value]: 1.688e-05 [tuple_transform]: 8.076e-05, [1] [Cycle 1]: 7.637e-05, [4] [d_1]: 4.743e-05 [none_parameter_eliminate]: 1.70001e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 8.14002e-06 [partial_unused_args_eliminate]: 1.87999e-06 [add_recomputation]: 5.412e-05 [cse_after_recomputation]: 2.576e-05, [1] [Cycle 1]: 2.116e-05, [1] [cse]: 1.571e-05 [environ_conv]: 5.94999e-06 [swap_dp_allreduce_reducescatter]: 5.99999e-06 [bias_add_comm_swap]: 2.51e-06 [label_micro_interleaved_index]: 3.93999e-06 [label_fine_grained_interleaved_index]: 2.78e-06 [merge_cast_opt]: 1.51998e-06 [slice_recompute_activation]: 1.95001e-06 [micro_interleaved_order_control]: 2.46e-06 [assign_add_opt]: 1.59e-06 [ForceFp32Comm]: 8.29983e-07 [remove_cast_before_assign_add]: 1.02e-06 [full_micro_interleaved_order_control]: 2.34001e-06 [reorder_send_recv_between_fp_bp]: 2.67001e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 9.80013e-07 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.37999e-06 [overlap_opt_shard_in_pipeline]: 1.27999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.84e-06 [control_data_broadcast_order]: 1.569e-05 [grouped_pairwise_exchange_alltoall]: 1.44998e-06 [offloading_packed_experts]: 4.16001e-06 [overlap_recompute_and_grad_model_parallel]: 5.40001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.47999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.29e-06 [overlap_recompute_comm]: 2.14999e-06 [overlap_grad_ring_attention]: 4.35e-06 [overlap_grad_flash_sp]: 1.881e-05 [begin_end_overlap_inline]: 1.09e-06 [split_matmul_comm_elemetwise]: 2.32001e-06 [split_layernorm_comm]: 1.74998e-06 [handle_group_info]: 9.30013e-07 [symbol_engine_optimizer]: 8.467e-05, [1] [Cycle 1]: 8.025e-05, [6] [build]: 3.41999e-06 [elim_shapecalc]: 1.118e-05 [elim_not_effective]: 1.506e-05 [opt_reshape]: 8.66002e-06 [fold_const_symbol]: 1.258e-05 [renormalize]: 7.60017e-07 [detach_backward]: 1.82999e-06 [pipeline_parallel_scheduler]: 1.40999e-06 [auto_monad_reorder]: 1.969e-05 [get_jit_bprop_graph]: 1.30999e-06 [rewriter_after_jit_bprop_graph]: 3.86999e-06 [opt_after_jit_grad]: 0.00047133 [validate]: 4.037e-05 Sums bootstrap : 0.000418s : 1.58% type_inference : 0.021667s : 81.88% event_method : 0.000013s : 0.05% auto_monad : 0.000057s : 0.21% graph_reusing : 0.000005s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000026s : 0.10% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000018s : 0.07% optimize.rewriter_before_opt_a : 0.000054s : 0.21% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000039s : 0.15% optimize.opt_a.loop_unroll : 0.000025s : 0.09% optimize.opt_a.a_1 : 0.000565s : 2.13% optimize.opt_a.with_stream_mark : 0.000025s : 0.10% optimize.opt_a.recompute_prepare : 0.000018s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000220s : 0.83% optimize.opt_a.accelerated_algorithm : 0.000032s : 0.12% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000015s : 0.06% optimize.opt_a.merge_send_recv : 0.000015s : 0.06% optimize.opt_a.auto_parallel : 0.000013s : 0.05% optimize.opt_a.parallel : 0.000024s : 0.09% optimize.opt_a.flash_sp : 0.000011s : 0.04% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.06% optimize.opt_a.virtual_dataset : 0.000015s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.05% optimize.opt_a.virtual_output : 0.000014s : 0.05% optimize.opt_a.merge_forward : 0.000008s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000018s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000024s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000021s : 0.08% optimize.opt_a.a_after_grad : 0.000021s : 0.08% optimize.opt_a.renormalize : 0.000567s : 2.14% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.09% optimize.opt_a.cse : 0.000055s : 0.21% optimize.opt_a.a_3 : 0.000097s : 0.37% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000038s : 0.14% optimize.convert_after_rewriter : 0.000007s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000488s : 1.84% optimize.opt_b.b_1 : 0.000146s : 0.55% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000024s : 0.09% optimize.loop_unroll : 0.000428s : 1.62% optimize.opt_after_cconv.c_1 : 0.000034s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000024s : 0.09% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.06% optimize.tuple_transform.d_1 : 0.000047s : 0.18% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000054s : 0.20% optimize.cse_after_recomputation.cse : 0.000016s : 0.06% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000019s : 0.07% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000020s : 0.07% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000471s : 1.78% validate : 0.000040s : 0.15% Time group info: ------[substitution.] 0.000215 34 29.86% : 0.000064s : 4: substitution.arithmetic_simplify 8.33% : 0.000018s : 2: substitution.cast_eliminate 1.07% : 0.000002s : 3: substitution.elim_not_effective 0.79% : 0.000002s : 3: substitution.fold_const_symbol 3.00% : 0.000006s : 4: substitution.graph_param_transform 45.51% : 0.000098s : 2: substitution.inline 2.04% : 0.000004s : 6: substitution.j_node_and_user_rematch 5.30% : 0.000011s : 2: substitution.less_batch_normalization 2.69% : 0.000006s : 6: substitution.remove_not_recompute_node 1.41% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.021618 2 97.87% : 0.021156s : 1: type_inference.infer 2.13% : 0.000462s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000096 2 100.00% : 0.000096s : 2: match.inline ------[predicate.] 0.000175 980 0.85% : 0.000001s : 9: predicate.accumulaten_eliminater 0.98% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.65% : 0.000001s : 8: predicate.addn_check_dump 1.07% : 0.000002s : 9: predicate.addn_zero_filter 0.72% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.96% : 0.000005s : 17: predicate.arithmetic_simplify 0.79% : 0.000001s : 9: predicate.cast_eliminate 0.70% : 0.000001s : 8: predicate.check_bprop_eliminate 0.63% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000000s : 4: predicate.const_output_eliminate 0.71% : 0.000001s : 8: predicate.depend_value_elim 0.95% : 0.000002s : 9: predicate.dict_get_item_const_eliminator 1.02% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.78% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.15% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.27% : 0.000000s : 4: predicate.elim_not_effective 0.53% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.07% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.07% : 0.000002s : 13: predicate.environ_get_depend_swap 1.83% : 0.000003s : 21: predicate.environ_get_eliminate 1.01% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.97% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.95% : 0.000003s : 11: predicate.float_depend_g_call 0.69% : 0.000001s : 8: predicate.float_environ_get_switch 0.98% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 4: predicate.fold_const_symbol 0.78% : 0.000001s : 8: predicate.get_grad_eliminate 0.26% : 0.000000s : 4: predicate.graph_param_transform 0.83% : 0.000001s : 8: predicate.incorporate_call 0.75% : 0.000001s : 8: predicate.incorporate_call_switch 6.63% : 0.000012s : 44: predicate.inline 1.12% : 0.000002s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.16% : 0.000002s : 8: predicate.less_batch_normalization 1.54% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.23% : 0.000004s : 26: predicate.load_eliminater 1.09% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.57% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.65% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.74% : 0.000001s : 8: predicate.merge_addn 0.67% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.86% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.76% : 0.000001s : 9: predicate.minmaximum_grad 1.29% : 0.000002s : 4: predicate.mutable_eliminate 0.45% : 0.000001s : 4: predicate.opt_reshape 0.54% : 0.000001s : 4: predicate.parallel_virtual_node 1.30% : 0.000002s : 11: predicate.partial_defer_inline 1.30% : 0.000002s : 13: predicate.partial_eliminate 0.78% : 0.000001s : 9: predicate.print_const_string_wrapper 0.86% : 0.000002s : 8: predicate.reduce_all_const_elim 1.05% : 0.000002s : 9: predicate.reduce_eliminate 2.19% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 8: predicate.remove_not_recompute_node 1.12% : 0.000002s : 17: predicate.replace_applicator 0.59% : 0.000001s : 8: predicate.replace_old_param 0.31% : 0.000001s : 4: predicate.reset_defer_inline 0.89% : 0.000002s : 9: predicate.reshape_eliminate 0.74% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 4: predicate.row_tensor_eliminate 0.86% : 0.000001s : 8: predicate.same_eliminate 0.50% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.92% : 0.000002s : 8: predicate.shard_identity_eliminate 0.89% : 0.000002s : 8: predicate.special_op_eliminate 1.02% : 0.000002s : 8: predicate.specialize_transform 0.99% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.88% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.06% : 0.000002s : 11: predicate.switch_defer_inline 1.74% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.50% : 0.000008s : 39: predicate.switch_simplify 0.80% : 0.000001s : 9: predicate.tile_eliminate 0.84% : 0.000001s : 9: predicate.transpose_eliminate 1.53% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000006s : 25: predicate.tuple_list_get_item_eliminator 1.44% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.42% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.52% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.08% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 3.09% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 4: predicate.value_based_eliminate 0.79% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.75% : 0.000001s : 8: predicate.virtual_output_eliminate 0.36% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.50% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000210 5 7.98% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.02% : 0.000194s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.040409 192 0.01% : 0.000003s : 1: ForceFp32Comm 8.21% : 0.003316s : 1: add_attr 8.18% : 0.003305s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.14% : 0.000058s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.15% : 0.000062s : 1: auto_monad 0.06% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 1.11% : 0.000447s : 1: bootstrap 0.07% : 0.000028s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.05% : 0.000019s : 1: control_data_broadcast_order 0.03% : 0.000011s : 1: convert_after_rewriter 0.07% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000006s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.05% : 0.000020s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000008s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.08% : 0.000436s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.23% : 0.000496s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.04% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000016s : 1: opt.transform.mutable_eliminate 2.65% : 0.001070s : 78: opt.transform.opt_a 0.08% : 0.000033s : 1: opt.transform.opt_after_cconv 0.07% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.30% : 0.000123s : 28: opt.transform.opt_b 0.13% : 0.000053s : 2: opt.transform.opt_trans_graph 0.11% : 0.000043s : 4: opt.transform.symbol_engine_opt 6.22% : 0.002513s : 1: opt_a 0.30% : 0.000119s : 1: opt_after_cconv 1.19% : 0.000480s : 1: opt_after_jit_grad 0.58% : 0.000235s : 1: opt_b 11.23% : 0.004540s : 1: optimize 0.05% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.06% : 0.000022s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000004s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000030s : 1: pre_auto_parallel 0.06% : 0.000023s : 1: py_interpret_to_execute 0.03% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000020s : 1: remove_dup_value 0.76% : 0.000306s : 1: renormalize.infer 0.63% : 0.000254s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000042s : 1: rewriter_after_opt_a 0.15% : 0.000059s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.22% : 0.000087s : 1: symbol_engine_optimizer 0.21% : 0.000084s : 1: tuple_transform 53.67% : 0.021689s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:27.572.96 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:27.575.83 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0311389, [21] [bootstrap]: 0.00041937 [type_inference]: 0.0216677 [event_method]: 1.206e-05 [auto_monad]: 5.459e-05 [graph_reusing]: 5.04e-06 [inline]: 2.63998e-06 [add_attr]: 0.00316546, [1] [add_attr_with_inline]: 0.00315688, [1] [Cycle 1]: 5.954e-05, [2] [tag_attr]: 1.394e-05 [meta_addattr_fg_expand]: 3.73001e-06 [parallel-infer-symbol]: 3.28e-06 [pre_auto_parallel]: 2.324e-05 [insert-virtual-dataset]: 2.66999e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 1.92001e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.00468566, [53] [py_interpret_to_execute]: 1.899e-05 [rewriter_before_opt_a]: 4.661e-05 [opt_a]: 0.00246847, [2] [Cycle 1]: 0.00164326, [45] [expand_dump_flag]: 2.87002e-06 [switch_simplify]: 2.318e-05 [loop_unroll]: 1.425e-05 [a_1]: 0.00030944 [with_stream_mark]: 1.534e-05 [recompute_prepare]: 8.59002e-06 [updatestate_depend_eliminate]: 3.93999e-06 [updatestate_assign_eliminate]: 3.6e-06 [updatestate_loads_eliminate]: 2.88e-06 [parameter_eliminate]: 2.47001e-06 [a_2]: 0.00012266 [accelerated_algorithm]: 1.881e-05 [shard]: 1.81003e-06 [meta_shard_fg_expand]: 1.77001e-06 [shard_inline]: 6.41998e-06 [merge_send_recv]: 7.71001e-06 [auto_parallel]: 6.00002e-06 [parallel]: 1.893e-05 [flash_sp]: 7.00002e-06 [merge_comm]: 3.71001e-06 [allreduce_fusion]: 3.43e-06 [matmul_add_comm_reduction]: 1.011e-05 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 7.83999e-06 [virtual_dataset]: 6.43e-06 [get_grad_eliminate_]: 6.00002e-06 [virtual_output]: 6.92002e-06 [merge_forward]: 3.80998e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 8.96002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.336e-05 [merge_recompute_call_nodes]: 1.39998e-06 [before_grad]: 1.031e-05 [set_forward_comm_id_for_comm_node_pass]: 3.46999e-06 [meta_fg_expand]: 2.83e-06 [flash_sp_send_recv_attached]: 4.38999e-06 [receive_attached]: 2.35002e-06 [after_resolve]: 1.002e-05 [a_after_grad]: 9.22001e-06 [renormalize]: 0.00046612 [add_forward_monad_depend]: 5.11997e-06 [auto_monad_grad]: 2.07001e-06 [auto_monad_eliminator]: 1.532e-05 [cse]: 2.768e-05 [a_3]: 5.823e-05 [Cycle 2]: 0.00081248, [45] [expand_dump_flag]: 1.12999e-06 [switch_simplify]: 6.93e-06 [loop_unroll]: 6.00002e-06 [a_1]: 0.00012805 [with_stream_mark]: 8.53001e-06 [recompute_prepare]: 6.89001e-06 [updatestate_depend_eliminate]: 3.25e-06 [updatestate_assign_eliminate]: 2.49999e-06 [updatestate_loads_eliminate]: 2.39001e-06 [parameter_eliminate]: 1.15001e-06 [a_2]: 0.00010928 [accelerated_algorithm]: 9.32999e-06 [shard]: 9.89996e-07 [meta_shard_fg_expand]: 1.49e-06 [shard_inline]: 6.45002e-06 [merge_send_recv]: 4.89e-06 [auto_parallel]: 5.39e-06 [parallel]: 4.63001e-06 [flash_sp]: 3.4e-06 [merge_comm]: 3.18998e-06 [allreduce_fusion]: 2.92002e-06 [matmul_add_comm_reduction]: 6.93e-06 [allreduce_slice_to_reducescatter]: 3.20026e-07 [virtual_shard_identity]: 6.47001e-06 [virtual_dataset]: 5.74999e-06 [get_grad_eliminate_]: 5.62999e-06 [virtual_output]: 5.24e-06 [merge_forward]: 2.61e-06 [cell_reuse_recompute_pass]: 1.28002e-06 [offload_activation]: 6.44001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.234e-05 [merge_recompute_call_nodes]: 9.10019e-07 [before_grad]: 8.67e-06 [set_forward_comm_id_for_comm_node_pass]: 3.53e-06 [meta_fg_expand]: 2.27001e-06 [flash_sp_send_recv_attached]: 1.06002e-06 [receive_attached]: 9.70002e-07 [after_resolve]: 8.3e-06 [a_after_grad]: 8.09997e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.59e-06 [auto_monad_grad]: 9.30013e-07 [auto_monad_eliminator]: 7.55e-06 [cse]: 1.524e-05 [a_3]: 4.847e-05 [py_interpret_to_execute_after_opt_a]: 1.154e-05 [slice_cell_reuse_recomputed_activation]: 5.22e-06 [rewriter_after_opt_a]: 4.077e-05 [convert_after_rewriter]: 9.42999e-06 [order_py_execute_after_rewriter]: 8.03999e-06 [mutable_eliminate]: 0.00050658 [opt_b]: 0.00028399, [1] [Cycle 1]: 0.00027452, [7] [b_1]: 0.00017769 [b_2]: 8.69998e-06 [updatestate_depend_eliminate]: 5.68997e-06 [updatestate_assign_eliminate]: 2.88e-06 [updatestate_loads_eliminate]: 2.57001e-06 [renormalize]: 4.19997e-07 [cse]: 1.813e-05 [optimize_parallel_all_gather_comm]: 1.916e-05 [overlap_param_gather]: 5.20999e-06 [cconv]: 2.906e-05 [loop_unroll]: 0.00043707 [opt_after_cconv]: 0.00012294, [1] [Cycle 1]: 0.00011453, [7] [c_1]: 2.805e-05 [parameter_eliminate]: 2.78e-06 [updatestate_depend_eliminate]: 5.00999e-06 [updatestate_assign_eliminate]: 2.48998e-06 [updatestate_loads_eliminate]: 2.78e-06 [cse]: 1.819e-05 [renormalize]: 3.19997e-07 [remove_dup_value]: 1.814e-05 [tuple_transform]: 8.619e-05, [1] [Cycle 1]: 7.879e-05, [4] [d_1]: 3.869e-05 [none_parameter_eliminate]: 1.72001e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 6.99001e-06 [partial_unused_args_eliminate]: 4.49002e-06 [add_recomputation]: 4.77e-05 [cse_after_recomputation]: 2.722e-05, [1] [Cycle 1]: 2.052e-05, [1] [cse]: 1.156e-05 [environ_conv]: 8.07e-06 [swap_dp_allreduce_reducescatter]: 8.07e-06 [bias_add_comm_swap]: 5.34e-06 [label_micro_interleaved_index]: 6.56e-06 [label_fine_grained_interleaved_index]: 5.59e-06 [merge_cast_opt]: 3.65998e-06 [slice_recompute_activation]: 4.26001e-06 [micro_interleaved_order_control]: 4.79998e-06 [assign_add_opt]: 3.56999e-06 [ForceFp32Comm]: 3.51001e-06 [remove_cast_before_assign_add]: 3.26001e-06 [full_micro_interleaved_order_control]: 4.33999e-06 [reorder_send_recv_between_fp_bp]: 5.46002e-06 [comm_op_add_attrs]: 3.66001e-06 [add_comm_op_reuse_tag]: 3.35e-06 [interleave_split_concat_branches]: 3.76999e-06 [interleave_parallel_branches]: 3.60003e-06 [overlap_opt_shard_in_pipeline]: 3.93001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.23001e-06 [control_data_broadcast_order]: 1.532e-05 [grouped_pairwise_exchange_alltoall]: 4.84e-06 [offloading_packed_experts]: 7.28e-06 [overlap_recompute_and_grad_model_parallel]: 7.8e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.10998e-06 [overlap_recompute_allgather_and_fa_grad]: 4.27e-06 [overlap_recompute_comm]: 5.77999e-06 [overlap_grad_ring_attention]: 7.12002e-06 [overlap_grad_flash_sp]: 2.035e-05 [begin_end_overlap_inline]: 3.18e-06 [split_matmul_comm_elemetwise]: 4.92999e-06 [split_layernorm_comm]: 4.38999e-06 [handle_group_info]: 3.38e-06 [symbol_engine_optimizer]: 9.462e-05, [1] [Cycle 1]: 8.803e-05, [6] [build]: 2.47001e-06 [elim_shapecalc]: 1.012e-05 [elim_not_effective]: 1.309e-05 [opt_reshape]: 7.05998e-06 [fold_const_symbol]: 9.79999e-06 [renormalize]: 2.29978e-07 [detach_backward]: 3.48e-06 [pipeline_parallel_scheduler]: 1.78002e-06 [auto_monad_reorder]: 1.823e-05 [get_jit_bprop_graph]: 1.20999e-06 [rewriter_after_jit_bprop_graph]: 4.03999e-06 [opt_after_jit_grad]: 0.00049419 [validate]: 3.486e-05 Sums bootstrap : 0.000419s : 1.60% type_inference : 0.021668s : 82.43% event_method : 0.000012s : 0.05% auto_monad : 0.000055s : 0.21% graph_reusing : 0.000005s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000023s : 0.09% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000019s : 0.07% optimize.rewriter_before_opt_a : 0.000047s : 0.18% optimize.opt_a.expand_dump_flag : 0.000004s : 0.02% optimize.opt_a.switch_simplify : 0.000030s : 0.11% optimize.opt_a.loop_unroll : 0.000020s : 0.08% optimize.opt_a.a_1 : 0.000437s : 1.66% optimize.opt_a.with_stream_mark : 0.000024s : 0.09% optimize.opt_a.recompute_prepare : 0.000015s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000232s : 0.88% optimize.opt_a.accelerated_algorithm : 0.000028s : 0.11% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.05% optimize.opt_a.merge_send_recv : 0.000013s : 0.05% optimize.opt_a.auto_parallel : 0.000011s : 0.04% optimize.opt_a.parallel : 0.000024s : 0.09% optimize.opt_a.flash_sp : 0.000010s : 0.04% optimize.opt_a.merge_comm : 0.000007s : 0.03% optimize.opt_a.allreduce_fusion : 0.000006s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.05% optimize.opt_a.virtual_dataset : 0.000012s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.04% optimize.opt_a.virtual_output : 0.000012s : 0.05% optimize.opt_a.merge_forward : 0.000006s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000015s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000019s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.03% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000018s : 0.07% optimize.opt_a.a_after_grad : 0.000017s : 0.07% optimize.opt_a.renormalize : 0.000466s : 1.77% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.03% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.09% optimize.opt_a.cse : 0.000043s : 0.16% optimize.opt_a.a_3 : 0.000107s : 0.41% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000041s : 0.16% optimize.convert_after_rewriter : 0.000009s : 0.04% optimize.order_py_execute_after_rewriter : 0.000008s : 0.03% optimize.mutable_eliminate : 0.000507s : 1.93% optimize.opt_b.b_1 : 0.000178s : 0.68% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.07% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000029s : 0.11% optimize.loop_unroll : 0.000437s : 1.66% optimize.opt_after_cconv.c_1 : 0.000028s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000018s : 0.07% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.07% optimize.tuple_transform.d_1 : 0.000039s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000004s : 0.02% optimize.add_recomputation : 0.000048s : 0.18% optimize.cse_after_recomputation.cse : 0.000012s : 0.04% optimize.environ_conv : 0.000008s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.03% optimize.bias_add_comm_swap : 0.000005s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.01% optimize.slice_recompute_activation : 0.000004s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000004s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.02% optimize.offloading_packed_experts : 0.000007s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000006s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.03% optimize.overlap_grad_flash_sp : 0.000020s : 0.08% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000002s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.07% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000494s : 1.88% validate : 0.000035s : 0.13% Time group info: ------[substitution.] 0.000170 25 33.71% : 0.000057s : 4: substitution.arithmetic_simplify 1.08% : 0.000002s : 2: substitution.elim_not_effective 0.76% : 0.000001s : 2: substitution.fold_const_symbol 3.26% : 0.000006s : 3: substitution.graph_param_transform 48.65% : 0.000083s : 2: substitution.inline 1.92% : 0.000003s : 4: substitution.j_node_and_user_rematch 6.11% : 0.000010s : 2: substitution.less_batch_normalization 2.55% : 0.000004s : 4: substitution.remove_not_recompute_node 1.95% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.021618 2 97.93% : 0.021171s : 1: type_inference.infer 2.07% : 0.000447s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000081 2 100.00% : 0.000081s : 2: match.inline ------[predicate.] 0.000136 754 0.81% : 0.000001s : 7: predicate.accumulaten_eliminater 1.12% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.65% : 0.000001s : 6: predicate.addn_check_dump 0.79% : 0.000001s : 7: predicate.addn_zero_filter 0.66% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.84% : 0.000004s : 13: predicate.arithmetic_simplify 0.75% : 0.000001s : 7: predicate.cast_eliminate 0.80% : 0.000001s : 6: predicate.check_bprop_eliminate 0.67% : 0.000001s : 6: predicate.compare_switch_simplify 0.23% : 0.000000s : 3: predicate.const_output_eliminate 0.86% : 0.000001s : 6: predicate.depend_value_elim 0.76% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.98% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.74% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.33% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.34% : 0.000000s : 3: predicate.elim_not_effective 0.56% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.33% : 0.000002s : 10: predicate.environ_add_const_eliminate 1.10% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.01% : 0.000001s : 10: predicate.environ_get_depend_swap 1.85% : 0.000003s : 16: predicate.environ_get_eliminate 1.06% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.93% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.01% : 0.000003s : 9: predicate.float_depend_g_call 0.68% : 0.000001s : 6: predicate.float_environ_get_switch 1.00% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 3: predicate.fold_const_symbol 0.84% : 0.000001s : 6: predicate.get_grad_eliminate 0.27% : 0.000000s : 3: predicate.graph_param_transform 0.81% : 0.000001s : 6: predicate.incorporate_call 0.65% : 0.000001s : 6: predicate.incorporate_call_switch 6.34% : 0.000009s : 34: predicate.inline 0.99% : 0.000001s : 6: predicate.inline_without_move 0.35% : 0.000000s : 6: predicate.j_node_and_user_rematch 1.17% : 0.000002s : 6: predicate.less_batch_normalization 1.69% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.04% : 0.000003s : 20: predicate.load_eliminater 1.23% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.66% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.90% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.67% : 0.000001s : 6: predicate.merge_addn 0.70% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.73% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.66% : 0.000001s : 7: predicate.minmaximum_grad 1.56% : 0.000002s : 3: predicate.mutable_eliminate 0.44% : 0.000001s : 3: predicate.opt_reshape 0.45% : 0.000001s : 3: predicate.parallel_virtual_node 1.28% : 0.000002s : 9: predicate.partial_defer_inline 1.24% : 0.000002s : 10: predicate.partial_eliminate 0.73% : 0.000001s : 7: predicate.print_const_string_wrapper 0.73% : 0.000001s : 6: predicate.reduce_all_const_elim 0.93% : 0.000001s : 7: predicate.reduce_eliminate 2.04% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.56% : 0.000001s : 6: predicate.remove_not_recompute_node 1.06% : 0.000001s : 13: predicate.replace_applicator 0.59% : 0.000001s : 6: predicate.replace_old_param 0.34% : 0.000000s : 3: predicate.reset_defer_inline 0.92% : 0.000001s : 7: predicate.reshape_eliminate 0.99% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 3: predicate.row_tensor_eliminate 0.85% : 0.000001s : 6: predicate.same_eliminate 0.53% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.03% : 0.000001s : 6: predicate.shard_identity_eliminate 0.97% : 0.000001s : 6: predicate.special_op_eliminate 1.02% : 0.000001s : 6: predicate.specialize_transform 1.04% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.95% : 0.000001s : 9: predicate.switch_defer_inline 1.75% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.21% : 0.000006s : 32: predicate.switch_simplify 0.73% : 0.000001s : 7: predicate.tile_eliminate 0.78% : 0.000001s : 7: predicate.transpose_eliminate 1.45% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.69% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.35% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.77% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.39% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.63% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.47% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.02% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.83% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 3: predicate.value_based_eliminate 0.79% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.78% : 0.000001s : 6: predicate.virtual_output_eliminate 0.32% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.75% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000215 5 8.34% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.66% : 0.000197s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.040435 192 0.02% : 0.000006s : 1: ForceFp32Comm 7.85% : 0.003174s : 1: add_attr 7.82% : 0.003161s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.13% : 0.000051s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.16% : 0.000063s : 1: auto_monad 0.06% : 0.000026s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.14% : 0.000460s : 1: bootstrap 0.08% : 0.000033s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.05% : 0.000019s : 1: control_data_broadcast_order 0.03% : 0.000012s : 1: convert_after_rewriter 0.07% : 0.000030s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.04% : 0.000018s : 1: detach_backward 0.03% : 0.000011s : 1: environ_conv 0.06% : 0.000023s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000007s : 1: get_jit_bprop_graph 0.03% : 0.000011s : 1: graph_reusing 0.02% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.02% : 0.000009s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000009s : 1: label_micro_interleaved_index 1.09% : 0.000443s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 1.27% : 0.000513s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.03% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000016s : 1: opt.transform.mutable_eliminate 2.08% : 0.000840s : 78: opt.transform.opt_a 0.07% : 0.000027s : 1: opt.transform.opt_after_cconv 0.06% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.27% : 0.000107s : 28: opt.transform.opt_b 0.11% : 0.000043s : 2: opt.transform.opt_trans_graph 0.09% : 0.000036s : 4: opt.transform.symbol_engine_opt 6.11% : 0.002472s : 1: opt_a 0.31% : 0.000127s : 1: opt_after_cconv 1.25% : 0.000505s : 1: opt_after_jit_grad 0.71% : 0.000287s : 1: opt_b 12.33% : 0.004984s : 1: optimize 0.06% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000011s : 1: order_py_execute_after_rewriter 0.06% : 0.000024s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000009s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.08% : 0.000031s : 1: pre_auto_parallel 0.06% : 0.000022s : 1: py_interpret_to_execute 0.04% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.05% : 0.000022s : 1: remove_dup_value 0.58% : 0.000236s : 1: renormalize.infer 0.55% : 0.000222s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000044s : 1: rewriter_after_opt_a 0.12% : 0.000050s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.03% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.24% : 0.000097s : 1: symbol_engine_optimizer 0.22% : 0.000089s : 1: tuple_transform 53.67% : 0.021703s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:27.501.830 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0301098, [21] [bootstrap]: 0.00044193 [type_inference]: 0.0217674 [event_method]: 1.233e-05 [auto_monad]: 7.171e-05 [graph_reusing]: 5.35999e-06 [inline]: 2.48e-06 [add_attr]: 0.00308301, [1] [add_attr_with_inline]: 0.00307431, [1] [Cycle 1]: 4.901e-05, [2] [tag_attr]: 1.418e-05 [meta_addattr_fg_expand]: 3.88999e-06 [parallel-infer-symbol]: 3.03e-06 [pre_auto_parallel]: 2.477e-05 [insert-virtual-dataset]: 2.48998e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 2.08998e-06 [pipeline_split]: 1.54e-06 [optimize]: 0.00402453, [53] [py_interpret_to_execute]: 1.578e-05 [rewriter_before_opt_a]: 4.401e-05 [opt_a]: 0.00212951, [2] [Cycle 1]: 0.00147283, [45] [expand_dump_flag]: 2.59999e-06 [switch_simplify]: 2.451e-05 [loop_unroll]: 1.414e-05 [a_1]: 0.00031004 [with_stream_mark]: 1.618e-05 [recompute_prepare]: 8.64e-06 [updatestate_depend_eliminate]: 3.6e-06 [updatestate_assign_eliminate]: 3.03e-06 [updatestate_loads_eliminate]: 3.40998e-06 [parameter_eliminate]: 1.96998e-06 [a_2]: 8.993e-05 [accelerated_algorithm]: 1.86e-05 [shard]: 2.21998e-06 [meta_shard_fg_expand]: 1.70001e-06 [shard_inline]: 6.11998e-06 [merge_send_recv]: 8.58001e-06 [auto_parallel]: 6.02999e-06 [parallel]: 1.844e-05 [flash_sp]: 7.71001e-06 [merge_comm]: 3.73001e-06 [allreduce_fusion]: 3.71999e-06 [matmul_add_comm_reduction]: 1.039e-05 [allreduce_slice_to_reducescatter]: 6.40022e-07 [virtual_shard_identity]: 7.82e-06 [virtual_dataset]: 6.42001e-06 [get_grad_eliminate_]: 6.11998e-06 [virtual_output]: 7.1e-06 [merge_forward]: 4.51002e-06 [cell_reuse_recompute_pass]: 1.55999e-06 [offload_activation]: 1.008e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.258e-05 [merge_recompute_call_nodes]: 1.55999e-06 [before_grad]: 1.111e-05 [set_forward_comm_id_for_comm_node_pass]: 3.68999e-06 [meta_fg_expand]: 3.03998e-06 [flash_sp_send_recv_attached]: 5.17e-06 [receive_attached]: 2.34001e-06 [after_resolve]: 1.108e-05 [a_after_grad]: 9.62001e-06 [renormalize]: 0.00045566 [add_forward_monad_depend]: 4.63999e-06 [auto_monad_grad]: 2.41e-06 [auto_monad_eliminator]: 1.431e-05 [cse]: 2.738e-05 [a_3]: 4.456e-05 [Cycle 2]: 0.00064715, [45] [expand_dump_flag]: 1.14e-06 [switch_simplify]: 6.99001e-06 [loop_unroll]: 5.97999e-06 [a_1]: 0.00013051 [with_stream_mark]: 8.59002e-06 [recompute_prepare]: 6.58003e-06 [updatestate_depend_eliminate]: 3.08e-06 [updatestate_assign_eliminate]: 2.44999e-06 [updatestate_loads_eliminate]: 2.83e-06 [parameter_eliminate]: 1.66e-06 [a_2]: 8.001e-05 [accelerated_algorithm]: 9.57001e-06 [shard]: 1.09e-06 [meta_shard_fg_expand]: 1.61002e-06 [shard_inline]: 6.16e-06 [merge_send_recv]: 4.87e-06 [auto_parallel]: 5.45001e-06 [parallel]: 4.79002e-06 [flash_sp]: 3.46999e-06 [merge_comm]: 3.28e-06 [allreduce_fusion]: 3.01001e-06 [matmul_add_comm_reduction]: 6.41998e-06 [allreduce_slice_to_reducescatter]: 3.9002e-07 [virtual_shard_identity]: 6.71e-06 [virtual_dataset]: 5.59998e-06 [get_grad_eliminate_]: 5.49e-06 [virtual_output]: 5.17999e-06 [merge_forward]: 2.66e-06 [cell_reuse_recompute_pass]: 1.51998e-06 [offload_activation]: 6.96999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.05e-05 [merge_recompute_call_nodes]: 9.50007e-07 [before_grad]: 8.83001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.71001e-06 [meta_fg_expand]: 2.09e-06 [flash_sp_send_recv_attached]: 7.99977e-07 [receive_attached]: 9.89996e-07 [after_resolve]: 8.47998e-06 [a_after_grad]: 7.87e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.22e-06 [auto_monad_grad]: 8.60018e-07 [auto_monad_eliminator]: 7.92e-06 [cse]: 1.567e-05 [a_3]: 3.43e-05 [py_interpret_to_execute_after_opt_a]: 8.99003e-06 [slice_cell_reuse_recomputed_activation]: 2.14999e-06 [rewriter_after_opt_a]: 3.405e-05 [convert_after_rewriter]: 6.61e-06 [order_py_execute_after_rewriter]: 5.44e-06 [mutable_eliminate]: 0.00048228 [opt_b]: 0.00019741, [1] [Cycle 1]: 0.00019131, [7] [b_1]: 0.00011777 [b_2]: 7.15e-06 [updatestate_depend_eliminate]: 5.59998e-06 [updatestate_assign_eliminate]: 2.43e-06 [updatestate_loads_eliminate]: 2.46e-06 [renormalize]: 9.20001e-07 [cse]: 1.933e-05 [optimize_parallel_all_gather_comm]: 1.489e-05 [overlap_param_gather]: 2.06998e-06 [cconv]: 2.711e-05 [loop_unroll]: 0.00042335 [opt_after_cconv]: 9.841e-05, [1] [Cycle 1]: 9.321e-05, [7] [c_1]: 2.712e-05 [parameter_eliminate]: 3.25e-06 [updatestate_depend_eliminate]: 4.87e-06 [updatestate_assign_eliminate]: 2.53e-06 [updatestate_loads_eliminate]: 2.27999e-06 [cse]: 1.899e-05 [renormalize]: 3.59985e-07 [remove_dup_value]: 1.527e-05 [tuple_transform]: 6.919e-05, [1] [Cycle 1]: 6.474e-05, [4] [d_1]: 3.801e-05 [none_parameter_eliminate]: 1.60001e-06 [renormalize]: 2.29978e-07 [switch_simplify]: 6.56999e-06 [partial_unused_args_eliminate]: 2.12999e-06 [add_recomputation]: 4.447e-05 [cse_after_recomputation]: 2.144e-05, [1] [Cycle 1]: 1.725e-05, [1] [cse]: 1.147e-05 [environ_conv]: 5.36998e-06 [swap_dp_allreduce_reducescatter]: 5.04e-06 [bias_add_comm_swap]: 2.68e-06 [label_micro_interleaved_index]: 4.67e-06 [label_fine_grained_interleaved_index]: 2.82002e-06 [merge_cast_opt]: 1.20999e-06 [slice_recompute_activation]: 2.01e-06 [micro_interleaved_order_control]: 2.21998e-06 [assign_add_opt]: 1.28002e-06 [ForceFp32Comm]: 7.89994e-07 [remove_cast_before_assign_add]: 1.34e-06 [full_micro_interleaved_order_control]: 2.14e-06 [reorder_send_recv_between_fp_bp]: 2.74999e-06 [comm_op_add_attrs]: 1.05001e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.11997e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.04998e-06 [overlap_opt_shard_grad_in_pipeline]: 2.24001e-06 [control_data_broadcast_order]: 1.213e-05 [grouped_pairwise_exchange_alltoall]: 1.50999e-06 [offloading_packed_experts]: 3.71001e-06 [overlap_recompute_and_grad_model_parallel]: 4.67e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.46e-06 [overlap_grad_ring_attention]: 4.27e-06 [overlap_grad_flash_sp]: 1.756e-05 [begin_end_overlap_inline]: 5.79981e-07 [split_matmul_comm_elemetwise]: 2.05002e-06 [split_layernorm_comm]: 1.57001e-06 [handle_group_info]: 9.00007e-07 [symbol_engine_optimizer]: 7.191e-05, [1] [Cycle 1]: 6.786e-05, [6] [build]: 2.54001e-06 [elim_shapecalc]: 9.06002e-06 [elim_not_effective]: 1.191e-05 [opt_reshape]: 6.53e-06 [fold_const_symbol]: 9.52999e-06 [renormalize]: 2.19996e-07 [detach_backward]: 1.86e-06 [pipeline_parallel_scheduler]: 1.52999e-06 [auto_monad_reorder]: 1.637e-05 [get_jit_bprop_graph]: 1.45999e-06 [rewriter_after_jit_bprop_graph]: 3.3e-06 [opt_after_jit_grad]: 0.00045438 [validate]: 3.458e-05 Sums bootstrap : 0.000442s : 1.70% type_inference : 0.021767s : 83.52% event_method : 0.000012s : 0.05% auto_monad : 0.000072s : 0.28% graph_reusing : 0.000005s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000025s : 0.10% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000016s : 0.06% optimize.rewriter_before_opt_a : 0.000044s : 0.17% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000032s : 0.12% optimize.opt_a.loop_unroll : 0.000020s : 0.08% optimize.opt_a.a_1 : 0.000441s : 1.69% optimize.opt_a.with_stream_mark : 0.000025s : 0.10% optimize.opt_a.recompute_prepare : 0.000015s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000170s : 0.65% optimize.opt_a.accelerated_algorithm : 0.000028s : 0.11% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000012s : 0.05% optimize.opt_a.merge_send_recv : 0.000013s : 0.05% optimize.opt_a.auto_parallel : 0.000011s : 0.04% optimize.opt_a.parallel : 0.000023s : 0.09% optimize.opt_a.flash_sp : 0.000011s : 0.04% optimize.opt_a.merge_comm : 0.000007s : 0.03% optimize.opt_a.allreduce_fusion : 0.000007s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.06% optimize.opt_a.virtual_dataset : 0.000012s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.04% optimize.opt_a.virtual_output : 0.000012s : 0.05% optimize.opt_a.merge_forward : 0.000007s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000017s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000020s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.03% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000020s : 0.08% optimize.opt_a.a_after_grad : 0.000017s : 0.07% optimize.opt_a.renormalize : 0.000456s : 1.75% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.09% optimize.opt_a.cse : 0.000043s : 0.17% optimize.opt_a.a_3 : 0.000079s : 0.30% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000034s : 0.13% optimize.convert_after_rewriter : 0.000007s : 0.03% optimize.order_py_execute_after_rewriter : 0.000005s : 0.02% optimize.mutable_eliminate : 0.000482s : 1.85% optimize.opt_b.b_1 : 0.000118s : 0.45% optimize.opt_b.b_2 : 0.000007s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000019s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.000015s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000027s : 0.10% optimize.loop_unroll : 0.000423s : 1.62% optimize.opt_after_cconv.c_1 : 0.000027s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000019s : 0.07% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.06% optimize.tuple_transform.d_1 : 0.000038s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000044s : 0.17% optimize.cse_after_recomputation.cse : 0.000011s : 0.04% optimize.environ_conv : 0.000005s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000012s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000018s : 0.07% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000016s : 0.06% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.01% opt_after_jit_grad : 0.000454s : 1.74% validate : 0.000035s : 0.13% Time group info: ------[substitution.] 0.000171 25 32.90% : 0.000056s : 4: substitution.arithmetic_simplify 1.06% : 0.000002s : 2: substitution.elim_not_effective 0.79% : 0.000001s : 2: substitution.fold_const_symbol 3.11% : 0.000005s : 3: substitution.graph_param_transform 49.25% : 0.000084s : 2: substitution.inline 1.95% : 0.000003s : 4: substitution.j_node_and_user_rematch 6.59% : 0.000011s : 2: substitution.less_batch_normalization 2.56% : 0.000004s : 4: substitution.remove_not_recompute_node 1.79% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.021718 2 97.95% : 0.021272s : 1: type_inference.infer 2.05% : 0.000446s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000082 2 100.00% : 0.000082s : 2: match.inline ------[predicate.] 0.000131 754 0.84% : 0.000001s : 7: predicate.accumulaten_eliminater 1.19% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.63% : 0.000001s : 6: predicate.addn_check_dump 0.86% : 0.000001s : 7: predicate.addn_zero_filter 0.71% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.78% : 0.000004s : 13: predicate.arithmetic_simplify 0.75% : 0.000001s : 7: predicate.cast_eliminate 0.76% : 0.000001s : 6: predicate.check_bprop_eliminate 0.73% : 0.000001s : 6: predicate.compare_switch_simplify 0.20% : 0.000000s : 3: predicate.const_output_eliminate 0.71% : 0.000001s : 6: predicate.depend_value_elim 0.77% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.91% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.75% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.49% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.29% : 0.000000s : 3: predicate.elim_not_effective 0.47% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000002s : 10: predicate.environ_add_const_eliminate 1.03% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.03% : 0.000001s : 10: predicate.environ_get_depend_swap 1.76% : 0.000002s : 16: predicate.environ_get_eliminate 1.05% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.96% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.02% : 0.000003s : 9: predicate.float_depend_g_call 0.65% : 0.000001s : 6: predicate.float_environ_get_switch 1.03% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.87% : 0.000001s : 6: predicate.get_grad_eliminate 0.30% : 0.000000s : 3: predicate.graph_param_transform 0.81% : 0.000001s : 6: predicate.incorporate_call 0.67% : 0.000001s : 6: predicate.incorporate_call_switch 6.27% : 0.000008s : 34: predicate.inline 1.07% : 0.000001s : 6: predicate.inline_without_move 0.35% : 0.000000s : 6: predicate.j_node_and_user_rematch 1.25% : 0.000002s : 6: predicate.less_batch_normalization 1.65% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.08% : 0.000003s : 20: predicate.load_eliminater 1.22% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.67% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.91% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.71% : 0.000001s : 6: predicate.merge_addn 0.63% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.68% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 7: predicate.minmaximum_grad 1.53% : 0.000002s : 3: predicate.mutable_eliminate 0.46% : 0.000001s : 3: predicate.opt_reshape 0.46% : 0.000001s : 3: predicate.parallel_virtual_node 1.30% : 0.000002s : 9: predicate.partial_defer_inline 1.25% : 0.000002s : 10: predicate.partial_eliminate 0.80% : 0.000001s : 7: predicate.print_const_string_wrapper 0.70% : 0.000001s : 6: predicate.reduce_all_const_elim 1.02% : 0.000001s : 7: predicate.reduce_eliminate 2.06% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.60% : 0.000001s : 6: predicate.remove_not_recompute_node 1.16% : 0.000002s : 13: predicate.replace_applicator 0.71% : 0.000001s : 6: predicate.replace_old_param 0.34% : 0.000000s : 3: predicate.reset_defer_inline 0.83% : 0.000001s : 7: predicate.reshape_eliminate 0.80% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 3: predicate.row_tensor_eliminate 1.06% : 0.000001s : 6: predicate.same_eliminate 0.52% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.90% : 0.000001s : 6: predicate.shard_identity_eliminate 0.93% : 0.000001s : 6: predicate.special_op_eliminate 0.96% : 0.000001s : 6: predicate.specialize_transform 1.06% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.94% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.01% : 0.000001s : 9: predicate.switch_defer_inline 1.69% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.32% : 0.000006s : 32: predicate.switch_simplify 0.77% : 0.000001s : 7: predicate.tile_eliminate 0.84% : 0.000001s : 7: predicate.transpose_eliminate 1.54% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.17% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.40% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.57% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.56% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.02% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 3.12% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 3: predicate.value_based_eliminate 0.85% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.81% : 0.000001s : 6: predicate.virtual_output_eliminate 0.36% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000210 5 8.37% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.63% : 0.000192s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.038635 192 0.01% : 0.000004s : 1: ForceFp32Comm 7.99% : 0.003088s : 1: add_attr 7.97% : 0.003078s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.12% : 0.000048s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.20% : 0.000077s : 1: auto_monad 0.05% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 1.23% : 0.000474s : 1: bootstrap 0.08% : 0.000031s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000015s : 1: control_data_broadcast_order 0.03% : 0.000010s : 1: convert_after_rewriter 0.06% : 0.000024s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.05% : 0.000019s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 1.11% : 0.000431s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.27% : 0.000491s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000014s : 1: opt.transform.mutable_eliminate 2.16% : 0.000836s : 78: opt.transform.opt_a 0.07% : 0.000026s : 1: opt.transform.opt_after_cconv 0.06% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.24% : 0.000093s : 28: opt.transform.opt_b 0.11% : 0.000043s : 2: opt.transform.opt_trans_graph 0.09% : 0.000033s : 4: opt.transform.symbol_engine_opt 5.52% : 0.002133s : 1: opt_a 0.26% : 0.000102s : 1: opt_after_cconv 1.20% : 0.000463s : 1: opt_after_jit_grad 0.52% : 0.000201s : 1: opt_b 10.43% : 0.004029s : 1: optimize 0.05% : 0.000018s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000008s : 1: order_py_execute_after_rewriter 0.05% : 0.000021s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000006s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.07% : 0.000029s : 1: pre_auto_parallel 0.05% : 0.000019s : 1: py_interpret_to_execute 0.03% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000019s : 1: remove_dup_value 0.63% : 0.000242s : 1: renormalize.infer 0.53% : 0.000206s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000039s : 1: rewriter_after_opt_a 0.12% : 0.000048s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.19% : 0.000075s : 1: symbol_engine_optimizer 0.19% : 0.000072s : 1: tuple_transform 56.39% : 0.021786s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:28.212.95 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:28.215.85 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0145408, [21] [bootstrap]: 0.00039991 [type_inference]: 0.00516548 [event_method]: 1.173e-05 [auto_monad]: 5.455e-05 [graph_reusing]: 5.43002e-06 [inline]: 2.79999e-06 [add_attr]: 0.00306446, [1] [add_attr_with_inline]: 0.0030567, [1] [Cycle 1]: 6.371e-05, [2] [tag_attr]: 1.497e-05 [meta_addattr_fg_expand]: 3.86999e-06 [parallel-infer-symbol]: 3.45e-06 [pre_auto_parallel]: 2.455e-05 [insert-virtual-dataset]: 2.29999e-06 [parallel-infer-symbol-second]: 7.40023e-07 [dataset_repeat_opt]: 2.02001e-06 [pipeline_split]: 1.82999e-06 [optimize]: 0.00472316, [53] [py_interpret_to_execute]: 2.052e-05 [rewriter_before_opt_a]: 4.83e-05 [opt_a]: 0.00250924, [2] [Cycle 1]: 0.00167393, [45] [expand_dump_flag]: 2.91e-06 [switch_simplify]: 2.516e-05 [loop_unroll]: 1.432e-05 [a_1]: 0.00031563 [with_stream_mark]: 1.629e-05 [recompute_prepare]: 9.04e-06 [updatestate_depend_eliminate]: 4.36002e-06 [updatestate_assign_eliminate]: 3.38999e-06 [updatestate_loads_eliminate]: 3.41001e-06 [parameter_eliminate]: 2.24999e-06 [a_2]: 0.0001258 [accelerated_algorithm]: 1.871e-05 [shard]: 2.10002e-06 [meta_shard_fg_expand]: 1.77999e-06 [shard_inline]: 7.43e-06 [merge_send_recv]: 8.31002e-06 [auto_parallel]: 6.47001e-06 [parallel]: 1.906e-05 [flash_sp]: 6.91999e-06 [merge_comm]: 3.90998e-06 [allreduce_fusion]: 3.56999e-06 [matmul_add_comm_reduction]: 1.105e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 7.98999e-06 [virtual_dataset]: 6.69001e-06 [get_grad_eliminate_]: 6.50002e-06 [virtual_output]: 6.98998e-06 [merge_forward]: 4.18001e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 9.84001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.474e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.102e-05 [set_forward_comm_id_for_comm_node_pass]: 3.87998e-06 [meta_fg_expand]: 2.59999e-06 [flash_sp_send_recv_attached]: 5.12e-06 [receive_attached]: 2.41e-06 [after_resolve]: 1.023e-05 [a_after_grad]: 9.64999e-06 [renormalize]: 0.000465 [add_forward_monad_depend]: 4.72e-06 [auto_monad_grad]: 2.14e-06 [auto_monad_eliminator]: 1.388e-05 [cse]: 2.896e-05 [a_3]: 5.948e-05 [Cycle 2]: 0.00082221, [45] [expand_dump_flag]: 1.00001e-06 [switch_simplify]: 7.33e-06 [loop_unroll]: 5.93998e-06 [a_1]: 0.00012829 [with_stream_mark]: 9.39998e-06 [recompute_prepare]: 6.76999e-06 [updatestate_depend_eliminate]: 3.06001e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.63998e-06 [parameter_eliminate]: 1.34e-06 [a_2]: 0.00010913 [accelerated_algorithm]: 9.49999e-06 [shard]: 9.09989e-07 [meta_shard_fg_expand]: 1.54998e-06 [shard_inline]: 6.61e-06 [merge_send_recv]: 5.29e-06 [auto_parallel]: 6.10002e-06 [parallel]: 5.07999e-06 [flash_sp]: 3.69002e-06 [merge_comm]: 3.55998e-06 [allreduce_fusion]: 3.90998e-06 [matmul_add_comm_reduction]: 6.86001e-06 [allreduce_slice_to_reducescatter]: 3.39991e-07 [virtual_shard_identity]: 6.54001e-06 [virtual_dataset]: 5.97001e-06 [get_grad_eliminate_]: 5.49e-06 [virtual_output]: 5.37001e-06 [merge_forward]: 2.91999e-06 [cell_reuse_recompute_pass]: 1.32999e-06 [offload_activation]: 6.52001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.293e-05 [merge_recompute_call_nodes]: 1.02e-06 [before_grad]: 8.97e-06 [set_forward_comm_id_for_comm_node_pass]: 3.7e-06 [meta_fg_expand]: 2.12001e-06 [flash_sp_send_recv_attached]: 9.10019e-07 [receive_attached]: 1.13001e-06 [after_resolve]: 8.72998e-06 [a_after_grad]: 8.03001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.29e-06 [auto_monad_grad]: 1.07e-06 [auto_monad_eliminator]: 7.9e-06 [cse]: 1.525e-05 [a_3]: 4.796e-05 [py_interpret_to_execute_after_opt_a]: 1.238e-05 [slice_cell_reuse_recomputed_activation]: 4.48999e-06 [rewriter_after_opt_a]: 3.872e-05 [convert_after_rewriter]: 9.91998e-06 [order_py_execute_after_rewriter]: 8.15999e-06 [mutable_eliminate]: 0.00052546 [opt_b]: 0.00026713, [1] [Cycle 1]: 0.00025862, [7] [b_1]: 0.00016266 [b_2]: 8.19998e-06 [updatestate_depend_eliminate]: 6.19001e-06 [updatestate_assign_eliminate]: 2.74999e-06 [updatestate_loads_eliminate]: 2.27001e-06 [renormalize]: 8.2e-07 [cse]: 1.897e-05 [optimize_parallel_all_gather_comm]: 1.942e-05 [overlap_param_gather]: 4.73001e-06 [cconv]: 2.983e-05 [loop_unroll]: 0.00043706 [opt_after_cconv]: 0.00012373, [1] [Cycle 1]: 0.00011491, [7] [c_1]: 2.794e-05 [parameter_eliminate]: 2.71999e-06 [updatestate_depend_eliminate]: 5.10001e-06 [updatestate_assign_eliminate]: 2.49999e-06 [updatestate_loads_eliminate]: 2.81999e-06 [cse]: 1.785e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 1.793e-05 [tuple_transform]: 8.535e-05, [1] [Cycle 1]: 7.803e-05, [4] [d_1]: 3.939e-05 [none_parameter_eliminate]: 1.59e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 6.54001e-06 [partial_unused_args_eliminate]: 4.83001e-06 [add_recomputation]: 4.834e-05 [cse_after_recomputation]: 2.695e-05, [1] [Cycle 1]: 2.053e-05, [1] [cse]: 1.168e-05 [environ_conv]: 8.25e-06 [swap_dp_allreduce_reducescatter]: 7.63001e-06 [bias_add_comm_swap]: 5.40001e-06 [label_micro_interleaved_index]: 6.63e-06 [label_fine_grained_interleaved_index]: 5.12e-06 [merge_cast_opt]: 3.86999e-06 [slice_recompute_activation]: 4.52e-06 [micro_interleaved_order_control]: 4.67998e-06 [assign_add_opt]: 3.58e-06 [ForceFp32Comm]: 3.40998e-06 [remove_cast_before_assign_add]: 3.4e-06 [full_micro_interleaved_order_control]: 4.50001e-06 [reorder_send_recv_between_fp_bp]: 5.32001e-06 [comm_op_add_attrs]: 3.7e-06 [add_comm_op_reuse_tag]: 3.33998e-06 [interleave_split_concat_branches]: 3.8e-06 [interleave_parallel_branches]: 3.5e-06 [overlap_opt_shard_in_pipeline]: 4.07e-06 [overlap_opt_shard_grad_in_pipeline]: 4.42e-06 [control_data_broadcast_order]: 1.541e-05 [grouped_pairwise_exchange_alltoall]: 4.37e-06 [offloading_packed_experts]: 6.34001e-06 [overlap_recompute_and_grad_model_parallel]: 7.26999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.58e-06 [overlap_recompute_allgather_and_fa_grad]: 3.72002e-06 [overlap_recompute_comm]: 5.25999e-06 [overlap_grad_ring_attention]: 6.56e-06 [overlap_grad_flash_sp]: 2.12e-05 [begin_end_overlap_inline]: 2.92002e-06 [split_matmul_comm_elemetwise]: 4.53001e-06 [split_layernorm_comm]: 4.05e-06 [handle_group_info]: 3.48e-06 [symbol_engine_optimizer]: 9.332e-05, [1] [Cycle 1]: 8.671e-05, [6] [build]: 2.61e-06 [elim_shapecalc]: 9.19998e-06 [elim_not_effective]: 1.239e-05 [opt_reshape]: 6.79999e-06 [fold_const_symbol]: 1.028e-05 [renormalize]: 2.09984e-07 [detach_backward]: 3.58999e-06 [pipeline_parallel_scheduler]: 1.73002e-06 [auto_monad_reorder]: 1.776e-05 [get_jit_bprop_graph]: 1.35001e-06 [rewriter_after_jit_bprop_graph]: 4.55001e-06 [opt_after_jit_grad]: 0.00047329 [validate]: 3.497e-05 Sums bootstrap : 0.000400s : 4.09% type_inference : 0.005165s : 52.83% event_method : 0.000012s : 0.12% auto_monad : 0.000055s : 0.56% graph_reusing : 0.000005s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.04% pre_auto_parallel : 0.000025s : 0.25% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000021s : 0.21% optimize.rewriter_before_opt_a : 0.000048s : 0.49% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000032s : 0.33% optimize.opt_a.loop_unroll : 0.000020s : 0.21% optimize.opt_a.a_1 : 0.000444s : 4.54% optimize.opt_a.with_stream_mark : 0.000026s : 0.26% optimize.opt_a.recompute_prepare : 0.000016s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000235s : 2.40% optimize.opt_a.accelerated_algorithm : 0.000028s : 0.29% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.14% optimize.opt_a.merge_send_recv : 0.000014s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.25% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.15% optimize.opt_a.virtual_dataset : 0.000013s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.06% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000019s : 0.19% optimize.opt_a.a_after_grad : 0.000018s : 0.18% optimize.opt_a.renormalize : 0.000465s : 4.76% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.22% optimize.opt_a.cse : 0.000044s : 0.45% optimize.opt_a.a_3 : 0.000107s : 1.10% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.05% optimize.rewriter_after_opt_a : 0.000039s : 0.40% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000525s : 5.37% optimize.opt_b.b_1 : 0.000163s : 1.66% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000019s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000030s : 0.31% optimize.loop_unroll : 0.000437s : 4.47% optimize.opt_after_cconv.c_1 : 0.000028s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.18% optimize.tuple_transform.d_1 : 0.000039s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000048s : 0.49% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000008s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.06% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.05% optimize.control_data_broadcast_order : 0.000015s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000006s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000021s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.04% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000473s : 4.84% validate : 0.000035s : 0.36% Time group info: ------[substitution.] 0.000178 25 34.81% : 0.000062s : 4: substitution.arithmetic_simplify 1.01% : 0.000002s : 2: substitution.elim_not_effective 0.93% : 0.000002s : 2: substitution.fold_const_symbol 3.11% : 0.000006s : 3: substitution.graph_param_transform 47.87% : 0.000085s : 2: substitution.inline 1.99% : 0.000004s : 4: substitution.j_node_and_user_rematch 5.82% : 0.000010s : 2: substitution.less_batch_normalization 2.71% : 0.000005s : 4: substitution.remove_not_recompute_node 1.75% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.005122 2 91.47% : 0.004685s : 1: type_inference.infer 8.53% : 0.000437s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000083 2 100.00% : 0.000083s : 2: match.inline ------[predicate.] 0.000136 754 0.78% : 0.000001s : 7: predicate.accumulaten_eliminater 0.97% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.66% : 0.000001s : 6: predicate.addn_check_dump 0.84% : 0.000001s : 7: predicate.addn_zero_filter 0.69% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.61% : 0.000004s : 13: predicate.arithmetic_simplify 0.78% : 0.000001s : 7: predicate.cast_eliminate 0.74% : 0.000001s : 6: predicate.check_bprop_eliminate 0.69% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.71% : 0.000001s : 6: predicate.depend_value_elim 0.79% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 1.03% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.79% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.20% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.27% : 0.000000s : 3: predicate.elim_not_effective 0.43% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.96% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.95% : 0.000001s : 10: predicate.environ_get_depend_swap 1.80% : 0.000002s : 16: predicate.environ_get_eliminate 1.00% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.95% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.94% : 0.000003s : 9: predicate.float_depend_g_call 0.67% : 0.000001s : 6: predicate.float_environ_get_switch 1.00% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.26% : 0.000000s : 3: predicate.fold_const_symbol 0.81% : 0.000001s : 6: predicate.get_grad_eliminate 0.27% : 0.000000s : 3: predicate.graph_param_transform 0.98% : 0.000001s : 6: predicate.incorporate_call 0.65% : 0.000001s : 6: predicate.incorporate_call_switch 7.06% : 0.000010s : 34: predicate.inline 1.03% : 0.000001s : 6: predicate.inline_without_move 0.40% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.37% : 0.000002s : 6: predicate.less_batch_normalization 1.68% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.98% : 0.000003s : 20: predicate.load_eliminater 1.20% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.63% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.69% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.70% : 0.000001s : 6: predicate.merge_addn 0.70% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.85% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 7: predicate.minmaximum_grad 1.42% : 0.000002s : 3: predicate.mutable_eliminate 0.42% : 0.000001s : 3: predicate.opt_reshape 0.47% : 0.000001s : 3: predicate.parallel_virtual_node 1.26% : 0.000002s : 9: predicate.partial_defer_inline 1.20% : 0.000002s : 10: predicate.partial_eliminate 0.76% : 0.000001s : 7: predicate.print_const_string_wrapper 0.76% : 0.000001s : 6: predicate.reduce_all_const_elim 0.94% : 0.000001s : 7: predicate.reduce_eliminate 2.08% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.65% : 0.000001s : 6: predicate.remove_not_recompute_node 1.13% : 0.000002s : 13: predicate.replace_applicator 0.67% : 0.000001s : 6: predicate.replace_old_param 0.33% : 0.000000s : 3: predicate.reset_defer_inline 0.86% : 0.000001s : 7: predicate.reshape_eliminate 0.76% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.69% : 0.000001s : 3: predicate.row_tensor_eliminate 0.93% : 0.000001s : 6: predicate.same_eliminate 0.68% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.89% : 0.000001s : 6: predicate.shard_identity_eliminate 0.90% : 0.000001s : 6: predicate.special_op_eliminate 0.97% : 0.000001s : 6: predicate.specialize_transform 1.11% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.93% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.00% : 0.000001s : 9: predicate.switch_defer_inline 1.92% : 0.000003s : 15: predicate.switch_layer_defer_inline 4.42% : 0.000006s : 32: predicate.switch_simplify 0.73% : 0.000001s : 7: predicate.tile_eliminate 0.79% : 0.000001s : 7: predicate.transpose_eliminate 1.56% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.19% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.34% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.88% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.53% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.91% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.93% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 3: predicate.value_based_eliminate 0.91% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.85% : 0.000001s : 6: predicate.virtual_output_eliminate 0.33% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.50% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000211 5 8.19% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.81% : 0.000193s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023780 192 0.03% : 0.000006s : 1: ForceFp32Comm 12.92% : 0.003073s : 1: add_attr 12.87% : 0.003060s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000052s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.26% : 0.000062s : 1: auto_monad 0.10% : 0.000025s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.85% : 0.000441s : 1: bootstrap 0.14% : 0.000033s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000017s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.09% : 0.000021s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.04% : 0.000009s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000009s : 1: label_micro_interleaved_index 1.86% : 0.000443s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.24% : 0.000532s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 3.61% : 0.000859s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.10% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.41% : 0.000097s : 28: opt.transform.opt_b 0.18% : 0.000044s : 2: opt.transform.opt_trans_graph 0.15% : 0.000035s : 4: opt.transform.symbol_engine_opt 10.57% : 0.002513s : 1: opt_a 0.54% : 0.000127s : 1: opt_after_cconv 2.03% : 0.000484s : 1: opt_after_jit_grad 1.14% : 0.000271s : 1: opt_b 21.20% : 0.005041s : 1: optimize 0.10% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000024s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000008s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000032s : 1: pre_auto_parallel 0.10% : 0.000024s : 1: py_interpret_to_execute 0.07% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000021s : 1: remove_dup_value 1.04% : 0.000247s : 1: renormalize.infer 0.89% : 0.000211s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000042s : 1: rewriter_after_opt_a 0.22% : 0.000052s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000096s : 1: symbol_engine_optimizer 0.37% : 0.000088s : 1: tuple_transform 21.83% : 0.005192s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:28.441.819 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0297295, [21] [bootstrap]: 0.00041611 [type_inference]: 0.0051253 [event_method]: 1.152e-05 [auto_monad]: 5.39e-05 [graph_reusing]: 6.06998e-06 [inline]: 2.53e-06 [add_attr]: 0.00301762, [1] [add_attr_with_inline]: 0.00300977, [1] [Cycle 1]: 4.572e-05, [2] [tag_attr]: 1.317e-05 [meta_addattr_fg_expand]: 4.02e-06 [parallel-infer-symbol]: 3.36999e-06 [pre_auto_parallel]: 2.211e-05 [insert-virtual-dataset]: 2.51e-06 [parallel-infer-symbol-second]: 1.10999e-06 [dataset_repeat_opt]: 2.66999e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.00401595, [53] [py_interpret_to_execute]: 1.6e-05 [rewriter_before_opt_a]: 5.083e-05 [opt_a]: 0.00210548, [2] [Cycle 1]: 0.00145457, [45] [expand_dump_flag]: 3.23e-06 [switch_simplify]: 2.543e-05 [loop_unroll]: 1.524e-05 [a_1]: 0.00030802 [with_stream_mark]: 1.457e-05 [recompute_prepare]: 9.37999e-06 [updatestate_depend_eliminate]: 3.98999e-06 [updatestate_assign_eliminate]: 3.21999e-06 [updatestate_loads_eliminate]: 3.08e-06 [parameter_eliminate]: 1.97999e-06 [a_2]: 9.44e-05 [accelerated_algorithm]: 1.92e-05 [shard]: 2.32999e-06 [meta_shard_fg_expand]: 1.67999e-06 [shard_inline]: 6.36e-06 [merge_send_recv]: 8.87e-06 [auto_parallel]: 5.99e-06 [parallel]: 1.833e-05 [flash_sp]: 7.58999e-06 [merge_comm]: 4.44002e-06 [allreduce_fusion]: 3.36999e-06 [matmul_add_comm_reduction]: 1.13e-05 [allreduce_slice_to_reducescatter]: 7.79983e-07 [virtual_shard_identity]: 9.00001e-06 [virtual_dataset]: 6.84999e-06 [get_grad_eliminate_]: 6.21998e-06 [virtual_output]: 6.36e-06 [merge_forward]: 3.97e-06 [cell_reuse_recompute_pass]: 1.20999e-06 [offload_activation]: 9.56003e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.176e-05 [merge_recompute_call_nodes]: 1.55999e-06 [before_grad]: 1.088e-05 [set_forward_comm_id_for_comm_node_pass]: 3.85e-06 [meta_fg_expand]: 2.73e-06 [flash_sp_send_recv_attached]: 4.80001e-06 [receive_attached]: 2.27001e-06 [after_resolve]: 1.11e-05 [a_after_grad]: 1.052e-05 [renormalize]: 0.00044243 [add_forward_monad_depend]: 5.27999e-06 [auto_monad_grad]: 2.46e-06 [auto_monad_eliminator]: 1.382e-05 [cse]: 2.854e-05 [a_3]: 4.539e-05 [Cycle 2]: 0.00064076, [45] [expand_dump_flag]: 9.50007e-07 [switch_simplify]: 7.18e-06 [loop_unroll]: 5.67999e-06 [a_1]: 0.00012555 [with_stream_mark]: 9.19e-06 [recompute_prepare]: 6.28002e-06 [updatestate_depend_eliminate]: 3.48999e-06 [updatestate_assign_eliminate]: 2.27001e-06 [updatestate_loads_eliminate]: 2.83e-06 [parameter_eliminate]: 1.40001e-06 [a_2]: 8.089e-05 [accelerated_algorithm]: 9.39998e-06 [shard]: 1.07e-06 [meta_shard_fg_expand]: 1.32999e-06 [shard_inline]: 6.06998e-06 [merge_send_recv]: 5.16998e-06 [auto_parallel]: 5.20999e-06 [parallel]: 5.19e-06 [flash_sp]: 3.35003e-06 [merge_comm]: 3.16999e-06 [allreduce_fusion]: 3.71001e-06 [matmul_add_comm_reduction]: 6.18998e-06 [allreduce_slice_to_reducescatter]: 3.30008e-07 [virtual_shard_identity]: 6.81001e-06 [virtual_dataset]: 5.61e-06 [get_grad_eliminate_]: 5.63002e-06 [virtual_output]: 5.39e-06 [merge_forward]: 3.13998e-06 [cell_reuse_recompute_pass]: 1.35999e-06 [offload_activation]: 6.14999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.089e-05 [merge_recompute_call_nodes]: 7.39994e-07 [before_grad]: 8.52e-06 [set_forward_comm_id_for_comm_node_pass]: 3.51999e-06 [meta_fg_expand]: 2.01e-06 [flash_sp_send_recv_attached]: 1.02e-06 [receive_attached]: 1.01002e-06 [after_resolve]: 8.43999e-06 [a_after_grad]: 8.13001e-06 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 1.19998e-06 [auto_monad_grad]: 7.79983e-07 [auto_monad_eliminator]: 6.79999e-06 [cse]: 1.528e-05 [a_3]: 3.416e-05 [py_interpret_to_execute_after_opt_a]: 8.82999e-06 [slice_cell_reuse_recomputed_activation]: 1.96e-06 [rewriter_after_opt_a]: 3.282e-05 [convert_after_rewriter]: 6.40002e-06 [order_py_execute_after_rewriter]: 5.17999e-06 [mutable_eliminate]: 0.00047504 [opt_b]: 0.00019594, [1] [Cycle 1]: 0.00018995, [7] [b_1]: 0.00011789 [b_2]: 6.84999e-06 [updatestate_depend_eliminate]: 5.59998e-06 [updatestate_assign_eliminate]: 2.29999e-06 [updatestate_loads_eliminate]: 2.69999e-06 [renormalize]: 6.90023e-07 [cse]: 1.861e-05 [optimize_parallel_all_gather_comm]: 1.642e-05 [overlap_param_gather]: 2.24001e-06 [cconv]: 2.451e-05 [loop_unroll]: 0.00042644 [opt_after_cconv]: 0.0001009, [1] [Cycle 1]: 9.523e-05, [7] [c_1]: 2.689e-05 [parameter_eliminate]: 3.18e-06 [updatestate_depend_eliminate]: 5.69999e-06 [updatestate_assign_eliminate]: 2.53998e-06 [updatestate_loads_eliminate]: 2.41e-06 [cse]: 1.867e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.523e-05 [tuple_transform]: 7.064e-05, [1] [Cycle 1]: 6.619e-05, [4] [d_1]: 3.879e-05 [none_parameter_eliminate]: 1.65001e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 6.66e-06 [partial_unused_args_eliminate]: 1.92999e-06 [add_recomputation]: 4.67e-05 [cse_after_recomputation]: 2.221e-05, [1] [Cycle 1]: 1.747e-05, [1] [cse]: 1.188e-05 [environ_conv]: 5.20001e-06 [swap_dp_allreduce_reducescatter]: 5.00001e-06 [bias_add_comm_swap]: 2.34001e-06 [label_micro_interleaved_index]: 4.93001e-06 [label_fine_grained_interleaved_index]: 2.65002e-06 [merge_cast_opt]: 1.22999e-06 [slice_recompute_activation]: 2.01e-06 [micro_interleaved_order_control]: 2.48e-06 [assign_add_opt]: 1.15999e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 1.25999e-06 [full_micro_interleaved_order_control]: 2.39001e-06 [reorder_send_recv_between_fp_bp]: 2.66e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.60019e-07 [interleave_split_concat_branches]: 1.10999e-06 [interleave_parallel_branches]: 1.07998e-06 [overlap_opt_shard_in_pipeline]: 1.71e-06 [overlap_opt_shard_grad_in_pipeline]: 1.92001e-06 [control_data_broadcast_order]: 1.27e-05 [grouped_pairwise_exchange_alltoall]: 1.64e-06 [offloading_packed_experts]: 3.51001e-06 [overlap_recompute_and_grad_model_parallel]: 4.65999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.27e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.36e-06 [overlap_grad_ring_attention]: 4.23999e-06 [overlap_grad_flash_sp]: 1.672e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.03997e-06 [split_layernorm_comm]: 1.64e-06 [handle_group_info]: 1.62999e-06 [symbol_engine_optimizer]: 7.804e-05, [1] [Cycle 1]: 7.346e-05, [6] [build]: 2.46e-06 [elim_shapecalc]: 1.032e-05 [elim_not_effective]: 1.255e-05 [opt_reshape]: 7.08e-06 [fold_const_symbol]: 1.116e-05 [renormalize]: 4.59986e-07 [detach_backward]: 2.09999e-06 [pipeline_parallel_scheduler]: 2.01e-06 [auto_monad_reorder]: 1.748e-05 [get_jit_bprop_graph]: 1.77999e-06 [rewriter_after_jit_bprop_graph]: 1.723e-05 [opt_after_jit_grad]: 0.00074142 [validate]: 4.45e-05 Sums bootstrap : 0.000416s : 4.30% type_inference : 0.005125s : 52.93% event_method : 0.000012s : 0.12% auto_monad : 0.000054s : 0.56% graph_reusing : 0.000006s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000013s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000022s : 0.23% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000003s : 0.03% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000016s : 0.17% optimize.rewriter_before_opt_a : 0.000051s : 0.52% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.34% optimize.opt_a.loop_unroll : 0.000021s : 0.22% optimize.opt_a.a_1 : 0.000434s : 4.48% optimize.opt_a.with_stream_mark : 0.000024s : 0.25% optimize.opt_a.recompute_prepare : 0.000016s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000175s : 1.81% optimize.opt_a.accelerated_algorithm : 0.000029s : 0.30% optimize.opt_a.shard : 0.000003s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.15% optimize.opt_a.auto_parallel : 0.000011s : 0.12% optimize.opt_a.parallel : 0.000024s : 0.24% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.23% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000019s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.06% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000020s : 0.20% optimize.opt_a.a_after_grad : 0.000019s : 0.19% optimize.opt_a.renormalize : 0.000443s : 4.57% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.21% optimize.opt_a.cse : 0.000044s : 0.45% optimize.opt_a.a_3 : 0.000080s : 0.82% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000033s : 0.34% optimize.convert_after_rewriter : 0.000006s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000475s : 4.91% optimize.opt_b.b_1 : 0.000118s : 1.22% optimize.opt_b.b_2 : 0.000007s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000019s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000025s : 0.25% optimize.loop_unroll : 0.000426s : 4.40% optimize.opt_after_cconv.c_1 : 0.000027s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.16% optimize.tuple_transform.d_1 : 0.000039s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000047s : 0.48% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.03% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000017s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000002s : 0.02% optimize.symbol_engine_optimizer.build : 0.000002s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000017s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000017s : 0.18% opt_after_jit_grad : 0.000741s : 7.66% validate : 0.000045s : 0.46% Time group info: ------[substitution.] 0.000172 25 33.79% : 0.000058s : 4: substitution.arithmetic_simplify 1.20% : 0.000002s : 2: substitution.elim_not_effective 0.96% : 0.000002s : 2: substitution.fold_const_symbol 2.95% : 0.000005s : 3: substitution.graph_param_transform 48.61% : 0.000084s : 2: substitution.inline 1.94% : 0.000003s : 4: substitution.j_node_and_user_rematch 6.05% : 0.000010s : 2: substitution.less_batch_normalization 2.58% : 0.000004s : 4: substitution.remove_not_recompute_node 1.92% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.005082 2 91.58% : 0.004654s : 1: type_inference.infer 8.42% : 0.000428s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000082 2 100.00% : 0.000082s : 2: match.inline ------[predicate.] 0.000135 754 0.79% : 0.000001s : 7: predicate.accumulaten_eliminater 2.06% : 0.000003s : 3: predicate.ad_related_special_op_eliminate 0.65% : 0.000001s : 6: predicate.addn_check_dump 0.81% : 0.000001s : 7: predicate.addn_zero_filter 0.67% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.78% : 0.000004s : 13: predicate.arithmetic_simplify 0.77% : 0.000001s : 7: predicate.cast_eliminate 0.69% : 0.000001s : 6: predicate.check_bprop_eliminate 0.71% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.87% : 0.000001s : 6: predicate.depend_value_elim 0.79% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 1.01% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.72% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.64% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 3: predicate.elim_not_effective 0.50% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.04% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.96% : 0.000001s : 10: predicate.environ_get_depend_swap 1.78% : 0.000002s : 16: predicate.environ_get_eliminate 1.05% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.94% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.93% : 0.000003s : 9: predicate.float_depend_g_call 0.68% : 0.000001s : 6: predicate.float_environ_get_switch 0.88% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 3: predicate.fold_const_symbol 0.83% : 0.000001s : 6: predicate.get_grad_eliminate 0.25% : 0.000000s : 3: predicate.graph_param_transform 0.78% : 0.000001s : 6: predicate.incorporate_call 0.65% : 0.000001s : 6: predicate.incorporate_call_switch 6.54% : 0.000009s : 34: predicate.inline 1.22% : 0.000002s : 6: predicate.inline_without_move 0.42% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.29% : 0.000002s : 6: predicate.less_batch_normalization 1.49% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.16% : 0.000003s : 20: predicate.load_eliminater 1.20% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.67% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.72% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.64% : 0.000001s : 6: predicate.merge_addn 0.64% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.70% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.69% : 0.000001s : 7: predicate.minmaximum_grad 1.44% : 0.000002s : 3: predicate.mutable_eliminate 0.47% : 0.000001s : 3: predicate.opt_reshape 0.50% : 0.000001s : 3: predicate.parallel_virtual_node 1.32% : 0.000002s : 9: predicate.partial_defer_inline 1.30% : 0.000002s : 10: predicate.partial_eliminate 0.76% : 0.000001s : 7: predicate.print_const_string_wrapper 0.73% : 0.000001s : 6: predicate.reduce_all_const_elim 1.28% : 0.000002s : 7: predicate.reduce_eliminate 2.01% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.61% : 0.000001s : 6: predicate.remove_not_recompute_node 1.15% : 0.000002s : 13: predicate.replace_applicator 0.75% : 0.000001s : 6: predicate.replace_old_param 0.35% : 0.000000s : 3: predicate.reset_defer_inline 1.06% : 0.000001s : 7: predicate.reshape_eliminate 0.77% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 3: predicate.row_tensor_eliminate 0.93% : 0.000001s : 6: predicate.same_eliminate 0.58% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.07% : 0.000001s : 6: predicate.shard_identity_eliminate 0.95% : 0.000001s : 6: predicate.special_op_eliminate 0.95% : 0.000001s : 6: predicate.specialize_transform 1.08% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.86% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.98% : 0.000001s : 9: predicate.switch_defer_inline 1.62% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.42% : 0.000006s : 32: predicate.switch_simplify 0.77% : 0.000001s : 7: predicate.tile_eliminate 0.75% : 0.000001s : 7: predicate.transpose_eliminate 1.44% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.27% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 2.88% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.47% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.35% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.53% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.98% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.75% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.63% : 0.000001s : 3: predicate.value_based_eliminate 0.81% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.81% : 0.000001s : 6: predicate.virtual_output_eliminate 0.33% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.60% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000197 5 8.67% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.33% : 0.000180s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.038164 192 0.01% : 0.000004s : 1: ForceFp32Comm 7.92% : 0.003022s : 1: add_attr 7.90% : 0.003013s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.13% : 0.000050s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.15% : 0.000059s : 1: auto_monad 0.06% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 1.17% : 0.000445s : 1: bootstrap 0.07% : 0.000028s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000016s : 1: control_data_broadcast_order 0.03% : 0.000010s : 1: convert_after_rewriter 0.07% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000006s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.02% : 0.000008s : 1: environ_conv 0.05% : 0.000018s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 42.01% : 0.016031s : 1: get_jit_bprop_graph 0.02% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 1.14% : 0.000435s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.27% : 0.000483s : 1: mutable_eliminate 0.02% : 0.000006s : 1: offloading_packed_experts 0.04% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000014s : 1: opt.transform.mutable_eliminate 2.20% : 0.000839s : 78: opt.transform.opt_a 0.07% : 0.000025s : 1: opt.transform.opt_after_cconv 0.09% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.25% : 0.000094s : 28: opt.transform.opt_b 0.11% : 0.000043s : 2: opt.transform.opt_trans_graph 0.10% : 0.000037s : 4: opt.transform.symbol_engine_opt 5.53% : 0.002109s : 1: opt_a 0.27% : 0.000104s : 1: opt_after_cconv 1.97% : 0.000751s : 1: opt_after_jit_grad 0.52% : 0.000200s : 1: opt_b 10.54% : 0.004022s : 1: optimize 0.05% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000008s : 1: order_py_execute_after_rewriter 0.05% : 0.000020s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.07% : 0.000026s : 1: pre_auto_parallel 0.05% : 0.000020s : 1: py_interpret_to_execute 0.03% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000019s : 1: remove_dup_value 0.61% : 0.000234s : 1: renormalize.infer 0.53% : 0.000201s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.14% : 0.000052s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000037s : 1: rewriter_after_opt_a 0.14% : 0.000055s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.21% : 0.000081s : 1: symbol_engine_optimizer 0.19% : 0.000074s : 1: tuple_transform 13.47% : 0.005141s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:28.969.104 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:28.969.377 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0141746, [21] [bootstrap]: 0.00040659 [type_inference]: 0.00493802 [event_method]: 1.106e-05 [auto_monad]: 5.351e-05 [graph_reusing]: 5.26002e-06 [inline]: 1.79e-06 [add_attr]: 0.00303734, [1] [add_attr_with_inline]: 0.00302982, [1] [Cycle 1]: 6.008e-05, [2] [tag_attr]: 1.288e-05 [meta_addattr_fg_expand]: 3.58e-06 [parallel-infer-symbol]: 3.10998e-06 [pre_auto_parallel]: 2.312e-05 [insert-virtual-dataset]: 2.24001e-06 [parallel-infer-symbol-second]: 7.99977e-07 [dataset_repeat_opt]: 1.92999e-06 [pipeline_split]: 1.54e-06 [optimize]: 0.0046232, [53] [py_interpret_to_execute]: 1.931e-05 [rewriter_before_opt_a]: 4.679e-05 [opt_a]: 0.00245799, [2] [Cycle 1]: 0.00163415, [45] [expand_dump_flag]: 3.09999e-06 [switch_simplify]: 2.389e-05 [loop_unroll]: 1.361e-05 [a_1]: 0.00032162 [with_stream_mark]: 1.525e-05 [recompute_prepare]: 8.38999e-06 [updatestate_depend_eliminate]: 4.55001e-06 [updatestate_assign_eliminate]: 3.43e-06 [updatestate_loads_eliminate]: 3.05002e-06 [parameter_eliminate]: 2.02001e-06 [a_2]: 0.00012168 [accelerated_algorithm]: 1.819e-05 [shard]: 2.12001e-06 [meta_shard_fg_expand]: 1.87999e-06 [shard_inline]: 6.41e-06 [merge_send_recv]: 8.48999e-06 [auto_parallel]: 5.76e-06 [parallel]: 1.789e-05 [flash_sp]: 7.40003e-06 [merge_comm]: 4.12e-06 [allreduce_fusion]: 3.35e-06 [matmul_add_comm_reduction]: 1.054e-05 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 7.71999e-06 [virtual_dataset]: 6.57002e-06 [get_grad_eliminate_]: 6.29999e-06 [virtual_output]: 6.11e-06 [merge_forward]: 3.83999e-06 [cell_reuse_recompute_pass]: 1.37999e-06 [offload_activation]: 9.79999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.371e-05 [merge_recompute_call_nodes]: 1.37999e-06 [before_grad]: 1.127e-05 [set_forward_comm_id_for_comm_node_pass]: 3.88001e-06 [meta_fg_expand]: 2.74001e-06 [flash_sp_send_recv_attached]: 4.80999e-06 [receive_attached]: 1.99e-06 [after_resolve]: 1.008e-05 [a_after_grad]: 9.71e-06 [renormalize]: 0.00044173 [add_forward_monad_depend]: 4.53999e-06 [auto_monad_grad]: 2.04e-06 [auto_monad_eliminator]: 1.429e-05 [cse]: 2.62e-05 [a_3]: 5.849e-05 [Cycle 2]: 0.00081119, [45] [expand_dump_flag]: 9.5999e-07 [switch_simplify]: 7.08998e-06 [loop_unroll]: 5.77001e-06 [a_1]: 0.00012478 [with_stream_mark]: 7.82e-06 [recompute_prepare]: 6.44001e-06 [updatestate_depend_eliminate]: 2.99001e-06 [updatestate_assign_eliminate]: 2.47001e-06 [updatestate_loads_eliminate]: 2.51998e-06 [parameter_eliminate]: 1.04998e-06 [a_2]: 0.00010842 [accelerated_algorithm]: 9.05001e-06 [shard]: 9.49978e-07 [meta_shard_fg_expand]: 1.32e-06 [shard_inline]: 6.31e-06 [merge_send_recv]: 5.18002e-06 [auto_parallel]: 5.56e-06 [parallel]: 4.92e-06 [flash_sp]: 3.86999e-06 [merge_comm]: 3.58e-06 [allreduce_fusion]: 3.58e-06 [matmul_add_comm_reduction]: 7.03e-06 [allreduce_slice_to_reducescatter]: 3.30008e-07 [virtual_shard_identity]: 6.94001e-06 [virtual_dataset]: 5.76e-06 [get_grad_eliminate_]: 5.64e-06 [virtual_output]: 5.42999e-06 [merge_forward]: 2.64999e-06 [cell_reuse_recompute_pass]: 1.35001e-06 [offload_activation]: 6.43e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.242e-05 [merge_recompute_call_nodes]: 1.15999e-06 [before_grad]: 9.09e-06 [set_forward_comm_id_for_comm_node_pass]: 3.80998e-06 [meta_fg_expand]: 2.21998e-06 [flash_sp_send_recv_attached]: 8.10018e-07 [receive_attached]: 1.09003e-06 [after_resolve]: 8.74e-06 [a_after_grad]: 8.02e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.39998e-06 [auto_monad_grad]: 9.00007e-07 [auto_monad_eliminator]: 7.09001e-06 [cse]: 1.567e-05 [a_3]: 4.679e-05 [py_interpret_to_execute_after_opt_a]: 1.098e-05 [slice_cell_reuse_recomputed_activation]: 4.66002e-06 [rewriter_after_opt_a]: 3.703e-05 [convert_after_rewriter]: 9.43002e-06 [order_py_execute_after_rewriter]: 8.41002e-06 [mutable_eliminate]: 0.00049098 [opt_b]: 0.00026336, [1] [Cycle 1]: 0.00025502, [7] [b_1]: 0.00016255 [b_2]: 7.40998e-06 [updatestate_depend_eliminate]: 4.84e-06 [updatestate_assign_eliminate]: 2.51998e-06 [updatestate_loads_eliminate]: 2.27999e-06 [renormalize]: 4.30009e-07 [cse]: 1.911e-05 [optimize_parallel_all_gather_comm]: 1.883e-05 [overlap_param_gather]: 4.68999e-06 [cconv]: 2.81e-05 [loop_unroll]: 0.00043071 [opt_after_cconv]: 0.00012295, [1] [Cycle 1]: 0.00011438, [7] [c_1]: 2.809e-05 [parameter_eliminate]: 2.52001e-06 [updatestate_depend_eliminate]: 5.19e-06 [updatestate_assign_eliminate]: 2.57001e-06 [updatestate_loads_eliminate]: 2.38002e-06 [cse]: 1.758e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.804e-05 [tuple_transform]: 8.384e-05, [1] [Cycle 1]: 7.689e-05, [4] [d_1]: 3.802e-05 [none_parameter_eliminate]: 1.70001e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 6.84001e-06 [partial_unused_args_eliminate]: 4.43999e-06 [add_recomputation]: 4.484e-05 [cse_after_recomputation]: 3.939e-05, [1] [Cycle 1]: 3.238e-05, [1] [cse]: 2.223e-05 [environ_conv]: 1.014e-05 [swap_dp_allreduce_reducescatter]: 8.69998e-06 [bias_add_comm_swap]: 4.82e-06 [label_micro_interleaved_index]: 6.86999e-06 [label_fine_grained_interleaved_index]: 5.02999e-06 [merge_cast_opt]: 3.71999e-06 [slice_recompute_activation]: 4.47998e-06 [micro_interleaved_order_control]: 4.62e-06 [assign_add_opt]: 3.65e-06 [ForceFp32Comm]: 3.14001e-06 [remove_cast_before_assign_add]: 3.65e-06 [full_micro_interleaved_order_control]: 4.79e-06 [reorder_send_recv_between_fp_bp]: 5.04998e-06 [comm_op_add_attrs]: 3.63e-06 [add_comm_op_reuse_tag]: 3.26999e-06 [interleave_split_concat_branches]: 3.63999e-06 [interleave_parallel_branches]: 3.5e-06 [overlap_opt_shard_in_pipeline]: 3.41999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.1e-06 [control_data_broadcast_order]: 1.507e-05 [grouped_pairwise_exchange_alltoall]: 4.22998e-06 [offloading_packed_experts]: 6.53e-06 [overlap_recompute_and_grad_model_parallel]: 6.94999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.56001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.60998e-06 [overlap_recompute_comm]: 4.72e-06 [overlap_grad_ring_attention]: 6.43e-06 [overlap_grad_flash_sp]: 1.923e-05 [begin_end_overlap_inline]: 2.88998e-06 [split_matmul_comm_elemetwise]: 4.38001e-06 [split_layernorm_comm]: 4.32e-06 [handle_group_info]: 3.55e-06 [symbol_engine_optimizer]: 9.461e-05, [1] [Cycle 1]: 8.811e-05, [6] [build]: 2.42001e-06 [elim_shapecalc]: 9.72001e-06 [elim_not_effective]: 1.32e-05 [opt_reshape]: 7.13e-06 [fold_const_symbol]: 9.74e-06 [renormalize]: 2.09984e-07 [detach_backward]: 2.94999e-06 [pipeline_parallel_scheduler]: 1.75001e-06 [auto_monad_reorder]: 1.809e-05 [get_jit_bprop_graph]: 1.19e-06 [rewriter_after_jit_bprop_graph]: 4.17e-06 [opt_after_jit_grad]: 0.00047328 [validate]: 3.524e-05 Sums bootstrap : 0.000407s : 4.30% type_inference : 0.004938s : 52.22% event_method : 0.000011s : 0.12% auto_monad : 0.000054s : 0.57% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000013s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000023s : 0.24% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000019s : 0.20% optimize.rewriter_before_opt_a : 0.000047s : 0.49% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000031s : 0.33% optimize.opt_a.loop_unroll : 0.000019s : 0.20% optimize.opt_a.a_1 : 0.000446s : 4.72% optimize.opt_a.with_stream_mark : 0.000023s : 0.24% optimize.opt_a.recompute_prepare : 0.000015s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000230s : 2.43% optimize.opt_a.accelerated_algorithm : 0.000027s : 0.29% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.14% optimize.opt_a.auto_parallel : 0.000011s : 0.12% optimize.opt_a.parallel : 0.000023s : 0.24% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.19% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.13% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000006s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.06% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000019s : 0.20% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000442s : 4.67% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.23% optimize.opt_a.cse : 0.000042s : 0.44% optimize.opt_a.a_3 : 0.000105s : 1.11% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000037s : 0.39% optimize.convert_after_rewriter : 0.000009s : 0.10% optimize.order_py_execute_after_rewriter : 0.000008s : 0.09% optimize.mutable_eliminate : 0.000491s : 5.19% optimize.opt_b.b_1 : 0.000163s : 1.72% optimize.opt_b.b_2 : 0.000007s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000019s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000028s : 0.30% optimize.loop_unroll : 0.000431s : 4.56% optimize.opt_after_cconv.c_1 : 0.000028s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.19% optimize.tuple_transform.d_1 : 0.000038s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.05% optimize.add_recomputation : 0.000045s : 0.47% optimize.cse_after_recomputation.cse : 0.000022s : 0.24% optimize.environ_conv : 0.000010s : 0.11% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.09% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000015s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000006s : 0.07% optimize.overlap_grad_flash_sp : 0.000019s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.05% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000002s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.19% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000473s : 5.01% validate : 0.000035s : 0.37% Time group info: ------[substitution.] 0.000185 25 32.58% : 0.000060s : 4: substitution.arithmetic_simplify 1.01% : 0.000002s : 2: substitution.elim_not_effective 0.71% : 0.000001s : 2: substitution.fold_const_symbol 3.06% : 0.000006s : 3: substitution.graph_param_transform 50.80% : 0.000094s : 2: substitution.inline 2.10% : 0.000004s : 4: substitution.j_node_and_user_rematch 5.66% : 0.000010s : 2: substitution.less_batch_normalization 2.35% : 0.000004s : 4: substitution.remove_not_recompute_node 1.72% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004897 2 84.61% : 0.004143s : 1: type_inference.infer 15.39% : 0.000753s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000092 2 100.00% : 0.000092s : 2: match.inline ------[predicate.] 0.000134 754 0.88% : 0.000001s : 7: predicate.accumulaten_eliminater 1.09% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.66% : 0.000001s : 6: predicate.addn_check_dump 0.87% : 0.000001s : 7: predicate.addn_zero_filter 0.64% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.86% : 0.000004s : 13: predicate.arithmetic_simplify 0.86% : 0.000001s : 7: predicate.cast_eliminate 0.75% : 0.000001s : 6: predicate.check_bprop_eliminate 0.67% : 0.000001s : 6: predicate.compare_switch_simplify 0.26% : 0.000000s : 3: predicate.const_output_eliminate 0.82% : 0.000001s : 6: predicate.depend_value_elim 0.78% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.89% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.81% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.23% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 3: predicate.elim_not_effective 0.51% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.00% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.03% : 0.000001s : 10: predicate.environ_get_depend_swap 1.98% : 0.000003s : 16: predicate.environ_get_eliminate 1.03% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.91% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.91% : 0.000003s : 9: predicate.float_depend_g_call 0.66% : 0.000001s : 6: predicate.float_environ_get_switch 1.08% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.25% : 0.000000s : 3: predicate.fold_const_symbol 0.79% : 0.000001s : 6: predicate.get_grad_eliminate 0.40% : 0.000001s : 3: predicate.graph_param_transform 0.85% : 0.000001s : 6: predicate.incorporate_call 0.67% : 0.000001s : 6: predicate.incorporate_call_switch 6.37% : 0.000009s : 34: predicate.inline 1.03% : 0.000001s : 6: predicate.inline_without_move 0.37% : 0.000000s : 6: predicate.j_node_and_user_rematch 1.15% : 0.000002s : 6: predicate.less_batch_normalization 1.73% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.96% : 0.000003s : 20: predicate.load_eliminater 1.21% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.57% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.75% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.73% : 0.000001s : 6: predicate.merge_addn 0.90% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.74% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 7: predicate.minmaximum_grad 1.25% : 0.000002s : 3: predicate.mutable_eliminate 0.47% : 0.000001s : 3: predicate.opt_reshape 0.39% : 0.000001s : 3: predicate.parallel_virtual_node 1.30% : 0.000002s : 9: predicate.partial_defer_inline 1.20% : 0.000002s : 10: predicate.partial_eliminate 0.78% : 0.000001s : 7: predicate.print_const_string_wrapper 0.76% : 0.000001s : 6: predicate.reduce_all_const_elim 0.99% : 0.000001s : 7: predicate.reduce_eliminate 2.10% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.81% : 0.000001s : 6: predicate.remove_not_recompute_node 1.21% : 0.000002s : 13: predicate.replace_applicator 0.65% : 0.000001s : 6: predicate.replace_old_param 0.34% : 0.000000s : 3: predicate.reset_defer_inline 0.80% : 0.000001s : 7: predicate.reshape_eliminate 0.79% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 3: predicate.row_tensor_eliminate 1.03% : 0.000001s : 6: predicate.same_eliminate 0.54% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.97% : 0.000001s : 6: predicate.shard_identity_eliminate 0.78% : 0.000001s : 6: predicate.special_op_eliminate 0.92% : 0.000001s : 6: predicate.specialize_transform 1.04% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.99% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.02% : 0.000001s : 9: predicate.switch_defer_inline 1.75% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.42% : 0.000006s : 32: predicate.switch_simplify 0.84% : 0.000001s : 7: predicate.tile_eliminate 1.00% : 0.000001s : 7: predicate.transpose_eliminate 1.66% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.32% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.42% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.49% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.54% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.97% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 3.02% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.46% : 0.000001s : 3: predicate.value_based_eliminate 0.74% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.82% : 0.000001s : 6: predicate.virtual_output_eliminate 0.33% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.62% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000513 5 3.28% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 96.72% : 0.000496s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023256 192 0.03% : 0.000006s : 1: ForceFp32Comm 13.10% : 0.003046s : 1: add_attr 13.04% : 0.003033s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000049s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.26% : 0.000062s : 1: auto_monad 0.11% : 0.000025s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.92% : 0.000446s : 1: bootstrap 0.13% : 0.000031s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000012s : 1: convert_after_rewriter 0.18% : 0.000043s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.08% : 0.000018s : 1: detach_backward 0.06% : 0.000014s : 1: environ_conv 0.09% : 0.000021s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.05% : 0.000011s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.88% : 0.000436s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.14% : 0.000497s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 3.65% : 0.000848s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.10% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.42% : 0.000097s : 28: opt.transform.opt_b 0.18% : 0.000043s : 2: opt.transform.opt_trans_graph 0.16% : 0.000036s : 4: opt.transform.symbol_engine_opt 10.58% : 0.002462s : 1: opt_a 0.54% : 0.000126s : 1: opt_after_cconv 2.08% : 0.000483s : 1: opt_after_jit_grad 1.15% : 0.000267s : 1: opt_b 21.23% : 0.004937s : 1: optimize 0.10% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000022s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000008s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000030s : 1: pre_auto_parallel 0.10% : 0.000023s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000021s : 1: remove_dup_value 1.02% : 0.000238s : 1: renormalize.infer 0.85% : 0.000197s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000040s : 1: rewriter_after_opt_a 0.22% : 0.000050s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000097s : 1: symbol_engine_optimizer 0.37% : 0.000087s : 1: tuple_transform 21.33% : 0.004961s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:29.258.793 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0276082, [21] [bootstrap]: 0.0004281 [type_inference]: 0.00472693 [event_method]: 1.133e-05 [auto_monad]: 5.635e-05 [graph_reusing]: 5.14e-06 [inline]: 2.41e-06 [add_attr]: 0.00307242, [1] [add_attr_with_inline]: 0.00306429, [1] [Cycle 1]: 4.703e-05, [2] [tag_attr]: 1.337e-05 [meta_addattr_fg_expand]: 3.77002e-06 [parallel-infer-symbol]: 2.64999e-06 [pre_auto_parallel]: 2.415e-05 [insert-virtual-dataset]: 2.37001e-06 [parallel-infer-symbol-second]: 7.29982e-07 [dataset_repeat_opt]: 2.25002e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.0186045, [53] [py_interpret_to_execute]: 1.59e-05 [rewriter_before_opt_a]: 4.404e-05 [opt_a]: 0.0165459, [2] [Cycle 1]: 0.0158328, [45] [expand_dump_flag]: 2.66e-06 [switch_simplify]: 2.521e-05 [loop_unroll]: 1.376e-05 [a_1]: 0.00031694 [with_stream_mark]: 1.632e-05 [recompute_prepare]: 8.77999e-06 [updatestate_depend_eliminate]: 4.02998e-06 [updatestate_assign_eliminate]: 2.98e-06 [updatestate_loads_eliminate]: 3.18998e-06 [parameter_eliminate]: 1.86998e-06 [a_2]: 9.477e-05 [accelerated_algorithm]: 1.931e-05 [shard]: 2.12001e-06 [meta_shard_fg_expand]: 2.06998e-06 [shard_inline]: 6.52001e-06 [merge_send_recv]: 8.34002e-06 [auto_parallel]: 6.51999e-06 [parallel]: 1.789e-05 [flash_sp]: 7.14001e-06 [merge_comm]: 3.88999e-06 [allreduce_fusion]: 3.56001e-06 [matmul_add_comm_reduction]: 1.002e-05 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 8.57998e-06 [virtual_dataset]: 6.35002e-06 [get_grad_eliminate_]: 5.77999e-06 [virtual_output]: 5.91003e-06 [merge_forward]: 3.68e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 9.55001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.189e-05 [merge_recompute_call_nodes]: 1.63002e-06 [before_grad]: 1.069e-05 [set_forward_comm_id_for_comm_node_pass]: 4.07e-06 [meta_fg_expand]: 2.84001e-06 [flash_sp_send_recv_attached]: 4.40999e-06 [receive_attached]: 2.69001e-06 [after_resolve]: 9.82999e-06 [a_after_grad]: 9.04e-06 [renormalize]: 0.0147833 [add_forward_monad_depend]: 9.98002e-06 [auto_monad_grad]: 2.16e-06 [auto_monad_eliminator]: 1.865e-05 [cse]: 3.236e-05 [a_3]: 5.49e-05 [Cycle 2]: 0.00070202, [45] [expand_dump_flag]: 2.00002e-06 [switch_simplify]: 8.13999e-06 [loop_unroll]: 5.81e-06 [a_1]: 0.00014473 [with_stream_mark]: 1.291e-05 [recompute_prepare]: 6.74001e-06 [updatestate_depend_eliminate]: 3.78999e-06 [updatestate_assign_eliminate]: 2.76999e-06 [updatestate_loads_eliminate]: 3.03e-06 [parameter_eliminate]: 1.45001e-06 [a_2]: 8.487e-05 [accelerated_algorithm]: 1.056e-05 [shard]: 1.52999e-06 [meta_shard_fg_expand]: 2.12001e-06 [shard_inline]: 5.99e-06 [merge_send_recv]: 7.16001e-06 [auto_parallel]: 8.80001e-06 [parallel]: 6.93998e-06 [flash_sp]: 4.24002e-06 [merge_comm]: 3.51001e-06 [allreduce_fusion]: 3.39001e-06 [matmul_add_comm_reduction]: 9.96e-06 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 6.69999e-06 [virtual_dataset]: 5.84999e-06 [get_grad_eliminate_]: 5.52001e-06 [virtual_output]: 5.30999e-06 [merge_forward]: 3.60998e-06 [cell_reuse_recompute_pass]: 2.17999e-06 [offload_activation]: 9.34e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.157e-05 [merge_recompute_call_nodes]: 1.30999e-06 [before_grad]: 9.44e-06 [set_forward_comm_id_for_comm_node_pass]: 4.01001e-06 [meta_fg_expand]: 2.43998e-06 [flash_sp_send_recv_attached]: 1.09e-06 [receive_attached]: 1.22999e-06 [after_resolve]: 9.77001e-06 [a_after_grad]: 8.17998e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.62001e-06 [auto_monad_grad]: 1.12999e-06 [auto_monad_eliminator]: 7.28e-06 [cse]: 1.595e-05 [a_3]: 3.426e-05 [py_interpret_to_execute_after_opt_a]: 1.309e-05 [slice_cell_reuse_recomputed_activation]: 2.10002e-06 [rewriter_after_opt_a]: 3.663e-05 [convert_after_rewriter]: 6.51e-06 [order_py_execute_after_rewriter]: 5.22999e-06 [mutable_eliminate]: 0.0006255 [opt_b]: 0.00019662, [1] [Cycle 1]: 0.00019032, [7] [b_1]: 0.00011854 [b_2]: 7.45e-06 [updatestate_depend_eliminate]: 5.25999e-06 [updatestate_assign_eliminate]: 2.27001e-06 [updatestate_loads_eliminate]: 2.29001e-06 [renormalize]: 4.70027e-07 [cse]: 1.841e-05 [optimize_parallel_all_gather_comm]: 1.528e-05 [overlap_param_gather]: 2.14e-06 [cconv]: 2.705e-05 [loop_unroll]: 0.00042887 [opt_after_cconv]: 9.837e-05, [1] [Cycle 1]: 9.29e-05, [7] [c_1]: 2.761e-05 [parameter_eliminate]: 2.39999e-06 [updatestate_depend_eliminate]: 5.32001e-06 [updatestate_assign_eliminate]: 2.54999e-06 [updatestate_loads_eliminate]: 2.14e-06 [cse]: 1.747e-05 [renormalize]: 4.39992e-07 [remove_dup_value]: 1.495e-05 [tuple_transform]: 7.021e-05, [1] [Cycle 1]: 6.575e-05, [4] [d_1]: 3.885e-05 [none_parameter_eliminate]: 1.55999e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 6.71e-06 [partial_unused_args_eliminate]: 1.83997e-06 [add_recomputation]: 4.669e-05 [cse_after_recomputation]: 2.177e-05, [1] [Cycle 1]: 1.75e-05, [1] [cse]: 1.199e-05 [environ_conv]: 4.89e-06 [swap_dp_allreduce_reducescatter]: 5.09e-06 [bias_add_comm_swap]: 2.17999e-06 [label_micro_interleaved_index]: 4.37e-06 [label_fine_grained_interleaved_index]: 2.70002e-06 [merge_cast_opt]: 1.20999e-06 [slice_recompute_activation]: 2.24001e-06 [micro_interleaved_order_control]: 2.61e-06 [assign_add_opt]: 1.43002e-06 [ForceFp32Comm]: 1.07e-06 [remove_cast_before_assign_add]: 1.04003e-06 [full_micro_interleaved_order_control]: 2.06e-06 [reorder_send_recv_between_fp_bp]: 2.76999e-06 [comm_op_add_attrs]: 9.39996e-07 [add_comm_op_reuse_tag]: 9.29984e-07 [interleave_split_concat_branches]: 1.14003e-06 [interleave_parallel_branches]: 1.10999e-06 [overlap_opt_shard_in_pipeline]: 1.08001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.01e-06 [control_data_broadcast_order]: 1.169e-05 [grouped_pairwise_exchange_alltoall]: 1.74e-06 [offloading_packed_experts]: 3.80998e-06 [overlap_recompute_and_grad_model_parallel]: 4.52e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.42e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.01e-06 [overlap_grad_ring_attention]: 4.37e-06 [overlap_grad_flash_sp]: 1.88e-05 [begin_end_overlap_inline]: 6.10016e-07 [split_matmul_comm_elemetwise]: 2.74999e-06 [split_layernorm_comm]: 1.61998e-06 [handle_group_info]: 8.79983e-07 [symbol_engine_optimizer]: 7.397e-05, [1] [Cycle 1]: 6.949e-05, [6] [build]: 2.68e-06 [elim_shapecalc]: 9.38997e-06 [elim_not_effective]: 1.234e-05 [opt_reshape]: 6.91999e-06 [fold_const_symbol]: 9.54999e-06 [renormalize]: 2.09984e-07 [detach_backward]: 1.81998e-06 [pipeline_parallel_scheduler]: 1.63002e-06 [auto_monad_reorder]: 1.558e-05 [get_jit_bprop_graph]: 1.59e-06 [rewriter_after_jit_bprop_graph]: 3.9e-06 [opt_after_jit_grad]: 0.00045939 [validate]: 3.748e-05 Sums bootstrap : 0.000428s : 1.82% type_inference : 0.004727s : 20.06% event_method : 0.000011s : 0.05% auto_monad : 0.000056s : 0.24% graph_reusing : 0.000005s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000013s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000024s : 0.10% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000016s : 0.07% optimize.rewriter_before_opt_a : 0.000044s : 0.19% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000033s : 0.14% optimize.opt_a.loop_unroll : 0.000020s : 0.08% optimize.opt_a.a_1 : 0.000462s : 1.96% optimize.opt_a.with_stream_mark : 0.000029s : 0.12% optimize.opt_a.recompute_prepare : 0.000016s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000180s : 0.76% optimize.opt_a.accelerated_algorithm : 0.000030s : 0.13% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000013s : 0.05% optimize.opt_a.merge_send_recv : 0.000016s : 0.07% optimize.opt_a.auto_parallel : 0.000015s : 0.07% optimize.opt_a.parallel : 0.000025s : 0.11% optimize.opt_a.flash_sp : 0.000011s : 0.05% optimize.opt_a.merge_comm : 0.000007s : 0.03% optimize.opt_a.allreduce_fusion : 0.000007s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.06% optimize.opt_a.virtual_dataset : 0.000012s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.05% optimize.opt_a.virtual_output : 0.000011s : 0.05% optimize.opt_a.merge_forward : 0.000007s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000020s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.03% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000020s : 0.08% optimize.opt_a.a_after_grad : 0.000017s : 0.07% optimize.opt_a.renormalize : 0.014783s : 62.73% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.05% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.11% optimize.opt_a.cse : 0.000048s : 0.21% optimize.opt_a.a_3 : 0.000089s : 0.38% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000037s : 0.16% optimize.convert_after_rewriter : 0.000007s : 0.03% optimize.order_py_execute_after_rewriter : 0.000005s : 0.02% optimize.mutable_eliminate : 0.000625s : 2.65% optimize.opt_b.b_1 : 0.000119s : 0.50% optimize.opt_b.b_2 : 0.000007s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000015s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000027s : 0.11% optimize.loop_unroll : 0.000429s : 1.82% optimize.opt_after_cconv.c_1 : 0.000028s : 0.12% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000017s : 0.07% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.06% optimize.tuple_transform.d_1 : 0.000039s : 0.16% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000047s : 0.20% optimize.cse_after_recomputation.cse : 0.000012s : 0.05% optimize.environ_conv : 0.000005s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.02% optimize.bias_add_comm_swap : 0.000002s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000012s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000019s : 0.08% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000016s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000459s : 1.95% validate : 0.000037s : 0.16% Time group info: ------[substitution.] 0.000192 25 39.64% : 0.000076s : 4: substitution.arithmetic_simplify 0.91% : 0.000002s : 2: substitution.elim_not_effective 0.66% : 0.000001s : 2: substitution.fold_const_symbol 3.04% : 0.000006s : 3: substitution.graph_param_transform 42.60% : 0.000082s : 2: substitution.inline 2.16% : 0.000004s : 4: substitution.j_node_and_user_rematch 6.38% : 0.000012s : 2: substitution.less_batch_normalization 2.63% : 0.000005s : 4: substitution.remove_not_recompute_node 1.98% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004684 2 91.27% : 0.004275s : 1: type_inference.infer 8.73% : 0.000409s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000080 2 100.00% : 0.000080s : 2: match.inline ------[predicate.] 0.000137 754 0.87% : 0.000001s : 7: predicate.accumulaten_eliminater 1.08% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.69% : 0.000001s : 6: predicate.addn_check_dump 0.86% : 0.000001s : 7: predicate.addn_zero_filter 0.70% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.72% : 0.000004s : 13: predicate.arithmetic_simplify 0.80% : 0.000001s : 7: predicate.cast_eliminate 0.77% : 0.000001s : 6: predicate.check_bprop_eliminate 0.69% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.86% : 0.000001s : 6: predicate.depend_value_elim 0.77% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.94% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.76% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.10% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.37% : 0.000001s : 3: predicate.elim_not_effective 0.44% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000002s : 10: predicate.environ_add_const_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_depend_swap 1.71% : 0.000002s : 16: predicate.environ_get_eliminate 1.02% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.96% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.19% : 0.000003s : 9: predicate.float_depend_g_call 0.72% : 0.000001s : 6: predicate.float_environ_get_switch 1.00% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 3: predicate.fold_const_symbol 0.86% : 0.000001s : 6: predicate.get_grad_eliminate 0.26% : 0.000000s : 3: predicate.graph_param_transform 0.78% : 0.000001s : 6: predicate.incorporate_call 0.65% : 0.000001s : 6: predicate.incorporate_call_switch 6.23% : 0.000009s : 34: predicate.inline 0.99% : 0.000001s : 6: predicate.inline_without_move 0.40% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.36% : 0.000002s : 6: predicate.less_batch_normalization 1.63% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.98% : 0.000003s : 20: predicate.load_eliminater 1.42% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.65% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.59% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.63% : 0.000001s : 6: predicate.merge_addn 0.73% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.66% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.65% : 0.000001s : 7: predicate.minmaximum_grad 1.45% : 0.000002s : 3: predicate.mutable_eliminate 0.47% : 0.000001s : 3: predicate.opt_reshape 0.45% : 0.000001s : 3: predicate.parallel_virtual_node 1.20% : 0.000002s : 9: predicate.partial_defer_inline 1.19% : 0.000002s : 10: predicate.partial_eliminate 0.80% : 0.000001s : 7: predicate.print_const_string_wrapper 0.75% : 0.000001s : 6: predicate.reduce_all_const_elim 1.12% : 0.000002s : 7: predicate.reduce_eliminate 1.95% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.73% : 0.000001s : 6: predicate.remove_not_recompute_node 1.37% : 0.000002s : 13: predicate.replace_applicator 0.80% : 0.000001s : 6: predicate.replace_old_param 0.29% : 0.000000s : 3: predicate.reset_defer_inline 0.92% : 0.000001s : 7: predicate.reshape_eliminate 0.75% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 3: predicate.row_tensor_eliminate 1.04% : 0.000001s : 6: predicate.same_eliminate 0.46% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.35% : 0.000002s : 6: predicate.shard_identity_eliminate 0.91% : 0.000001s : 6: predicate.special_op_eliminate 0.92% : 0.000001s : 6: predicate.specialize_transform 1.22% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.91% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.00% : 0.000001s : 9: predicate.switch_defer_inline 1.76% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.13% : 0.000006s : 32: predicate.switch_simplify 0.71% : 0.000001s : 7: predicate.tile_eliminate 0.80% : 0.000001s : 7: predicate.transpose_eliminate 1.68% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.38% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.56% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.42% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.50% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.04% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.81% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 3: predicate.value_based_eliminate 1.08% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.72% : 0.000001s : 6: predicate.virtual_output_eliminate 0.31% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.59% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000242 5 7.63% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.37% : 0.000224s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.065065 192 0.01% : 0.000004s : 1: ForceFp32Comm 4.73% : 0.003078s : 1: add_attr 4.71% : 0.003068s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.08% : 0.000051s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.09% : 0.000061s : 1: auto_monad 0.03% : 0.000019s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.70% : 0.000457s : 1: bootstrap 0.05% : 0.000031s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000015s : 1: control_data_broadcast_order 0.01% : 0.000010s : 1: convert_after_rewriter 0.04% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000008s : 1: environ_conv 0.03% : 0.000017s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000008s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.67% : 0.000437s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 0.97% : 0.000633s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000014s : 1: opt.transform.mutable_eliminate 1.34% : 0.000872s : 78: opt.transform.opt_a 0.04% : 0.000026s : 1: opt.transform.opt_after_cconv 0.04% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.15% : 0.000095s : 28: opt.transform.opt_b 0.07% : 0.000043s : 2: opt.transform.opt_trans_graph 0.05% : 0.000034s : 4: opt.transform.symbol_engine_opt 25.44% : 0.016550s : 1: opt_a 0.16% : 0.000102s : 1: opt_after_cconv 0.72% : 0.000469s : 1: opt_after_jit_grad 0.31% : 0.000200s : 1: opt_b 28.60% : 0.018610s : 1: optimize 0.03% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.03% : 0.000022s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000006s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.04% : 0.000028s : 1: pre_auto_parallel 0.03% : 0.000019s : 1: py_interpret_to_execute 0.03% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000018s : 1: remove_dup_value 22.22% : 0.014457s : 1: renormalize.infer 0.48% : 0.000315s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000041s : 1: rewriter_after_opt_a 0.07% : 0.000048s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.12% : 0.000077s : 1: symbol_engine_optimizer 0.11% : 0.000073s : 1: tuple_transform 7.29% : 0.004742s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:29.773.039 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:29.773.303 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0312668, [21] [bootstrap]: 0.00042413 [type_inference]: 0.0216162 [event_method]: 1.272e-05 [auto_monad]: 5.703e-05 [graph_reusing]: 5.64998e-06 [inline]: 3.04999e-06 [add_attr]: 0.00317876, [1] [add_attr_with_inline]: 0.00317026, [1] [Cycle 1]: 8.759e-05, [2] [tag_attr]: 3.43e-05 [meta_addattr_fg_expand]: 3.93999e-06 [parallel-infer-symbol]: 2.95002e-06 [pre_auto_parallel]: 2.497e-05 [insert-virtual-dataset]: 2.37999e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 2.06998e-06 [pipeline_split]: 1.84e-06 [optimize]: 0.00476368, [53] [py_interpret_to_execute]: 2.023e-05 [rewriter_before_opt_a]: 4.739e-05 [opt_a]: 0.00251998, [2] [Cycle 1]: 0.00168083, [45] [expand_dump_flag]: 2.98e-06 [switch_simplify]: 2.569e-05 [loop_unroll]: 1.454e-05 [a_1]: 0.00031369 [with_stream_mark]: 1.636e-05 [recompute_prepare]: 8.90999e-06 [updatestate_depend_eliminate]: 4.51002e-06 [updatestate_assign_eliminate]: 3.46001e-06 [updatestate_loads_eliminate]: 2.88e-06 [parameter_eliminate]: 2.01e-06 [a_2]: 0.00012599 [accelerated_algorithm]: 1.958e-05 [shard]: 2.21e-06 [meta_shard_fg_expand]: 1.86998e-06 [shard_inline]: 6.68998e-06 [merge_send_recv]: 9.07999e-06 [auto_parallel]: 6.46e-06 [parallel]: 1.874e-05 [flash_sp]: 7.26001e-06 [merge_comm]: 3.76999e-06 [allreduce_fusion]: 3.40003e-06 [matmul_add_comm_reduction]: 1.044e-05 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 8.27e-06 [virtual_dataset]: 6.36998e-06 [get_grad_eliminate_]: 6.09001e-06 [virtual_output]: 6.51e-06 [merge_forward]: 3.86001e-06 [cell_reuse_recompute_pass]: 1.28002e-06 [offload_activation]: 1.102e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.396e-05 [merge_recompute_call_nodes]: 1.67001e-06 [before_grad]: 1.051e-05 [set_forward_comm_id_for_comm_node_pass]: 3.65998e-06 [meta_fg_expand]: 2.58998e-06 [flash_sp_send_recv_attached]: 4.44998e-06 [receive_attached]: 2.27001e-06 [after_resolve]: 9.86998e-06 [a_after_grad]: 9.74999e-06 [renormalize]: 0.00048389 [add_forward_monad_depend]: 4.97e-06 [auto_monad_grad]: 1.84998e-06 [auto_monad_eliminator]: 1.434e-05 [cse]: 2.793e-05 [a_3]: 5.899e-05 [Cycle 2]: 0.00082651, [45] [expand_dump_flag]: 1.19e-06 [switch_simplify]: 6.93e-06 [loop_unroll]: 5.96998e-06 [a_1]: 0.00012897 [with_stream_mark]: 8.06001e-06 [recompute_prepare]: 6.38e-06 [updatestate_depend_eliminate]: 3.3e-06 [updatestate_assign_eliminate]: 2.67001e-06 [updatestate_loads_eliminate]: 2.59001e-06 [parameter_eliminate]: 1.03001e-06 [a_2]: 0.00011019 [accelerated_algorithm]: 9.59e-06 [shard]: 1.08001e-06 [meta_shard_fg_expand]: 1.30999e-06 [shard_inline]: 6.53e-06 [merge_send_recv]: 5.04998e-06 [auto_parallel]: 5.64998e-06 [parallel]: 4.59002e-06 [flash_sp]: 3.31999e-06 [merge_comm]: 3.30998e-06 [allreduce_fusion]: 3.25e-06 [matmul_add_comm_reduction]: 6.53e-06 [allreduce_slice_to_reducescatter]: 3.4002e-07 [virtual_shard_identity]: 7.00002e-06 [virtual_dataset]: 5.74e-06 [get_grad_eliminate_]: 5.61e-06 [virtual_output]: 5.51e-06 [merge_forward]: 2.66e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 6.63e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.321e-05 [merge_recompute_call_nodes]: 8.89995e-07 [before_grad]: 9.45001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.58e-06 [meta_fg_expand]: 2.11998e-06 [flash_sp_send_recv_attached]: 7.60017e-07 [receive_attached]: 1.08001e-06 [after_resolve]: 8.79e-06 [a_after_grad]: 8.57e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.22e-06 [auto_monad_grad]: 9.50007e-07 [auto_monad_eliminator]: 7.53e-06 [cse]: 1.653e-05 [a_3]: 4.824e-05 [py_interpret_to_execute_after_opt_a]: 1.151e-05 [slice_cell_reuse_recomputed_activation]: 4.70001e-06 [rewriter_after_opt_a]: 4.015e-05 [convert_after_rewriter]: 1.004e-05 [order_py_execute_after_rewriter]: 8.45001e-06 [mutable_eliminate]: 0.00048699 [opt_b]: 0.0002691, [1] [Cycle 1]: 0.00026029, [7] [b_1]: 0.00016345 [b_2]: 8.22998e-06 [updatestate_depend_eliminate]: 6.29001e-06 [updatestate_assign_eliminate]: 2.69001e-06 [updatestate_loads_eliminate]: 2.32001e-06 [renormalize]: 8.09989e-07 [cse]: 1.938e-05 [optimize_parallel_all_gather_comm]: 1.892e-05 [overlap_param_gather]: 4.75999e-06 [cconv]: 2.851e-05 [loop_unroll]: 0.00045064 [opt_after_cconv]: 0.00012874, [1] [Cycle 1]: 0.00011994, [7] [c_1]: 2.845e-05 [parameter_eliminate]: 3.81001e-06 [updatestate_depend_eliminate]: 5.70001e-06 [updatestate_assign_eliminate]: 2.63e-06 [updatestate_loads_eliminate]: 3.22002e-06 [cse]: 2.055e-05 [renormalize]: 3.00002e-07 [remove_dup_value]: 1.942e-05 [tuple_transform]: 9.111e-05, [1] [Cycle 1]: 8.366e-05, [4] [d_1]: 4.062e-05 [none_parameter_eliminate]: 2.03002e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 7.32997e-06 [partial_unused_args_eliminate]: 4.84e-06 [add_recomputation]: 5.337e-05 [cse_after_recomputation]: 3.056e-05, [1] [Cycle 1]: 2.341e-05, [1] [cse]: 1.266e-05 [environ_conv]: 8.56002e-06 [swap_dp_allreduce_reducescatter]: 9.05999e-06 [bias_add_comm_swap]: 5.46e-06 [label_micro_interleaved_index]: 6.98998e-06 [label_fine_grained_interleaved_index]: 5.57001e-06 [merge_cast_opt]: 3.97998e-06 [slice_recompute_activation]: 4.90001e-06 [micro_interleaved_order_control]: 4.87998e-06 [assign_add_opt]: 3.68999e-06 [ForceFp32Comm]: 3.11999e-06 [remove_cast_before_assign_add]: 3.66999e-06 [full_micro_interleaved_order_control]: 4.59002e-06 [reorder_send_recv_between_fp_bp]: 5.73002e-06 [comm_op_add_attrs]: 3.69002e-06 [add_comm_op_reuse_tag]: 3.31001e-06 [interleave_split_concat_branches]: 3.45e-06 [interleave_parallel_branches]: 3.56999e-06 [overlap_opt_shard_in_pipeline]: 3.61999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.55001e-06 [control_data_broadcast_order]: 1.741e-05 [grouped_pairwise_exchange_alltoall]: 4.27e-06 [offloading_packed_experts]: 7.32002e-06 [overlap_recompute_and_grad_model_parallel]: 7.98999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.8e-06 [overlap_recompute_allgather_and_fa_grad]: 3.94002e-06 [overlap_recompute_comm]: 4.90999e-06 [overlap_grad_ring_attention]: 6.95002e-06 [overlap_grad_flash_sp]: 2.347e-05 [begin_end_overlap_inline]: 2.98e-06 [split_matmul_comm_elemetwise]: 4.72e-06 [split_layernorm_comm]: 4.37e-06 [handle_group_info]: 3.26001e-06 [symbol_engine_optimizer]: 0.00010721, [1] [Cycle 1]: 9.962e-05, [6] [build]: 3.63e-06 [elim_shapecalc]: 1.196e-05 [elim_not_effective]: 1.473e-05 [opt_reshape]: 7.21001e-06 [fold_const_symbol]: 1.051e-05 [renormalize]: 2.50002e-07 [detach_backward]: 4.2e-06 [pipeline_parallel_scheduler]: 2.18998e-06 [auto_monad_reorder]: 2.137e-05 [get_jit_bprop_graph]: 1.34e-06 [rewriter_after_jit_bprop_graph]: 5.15999e-06 [opt_after_jit_grad]: 0.00052445 [validate]: 3.887e-05 Sums bootstrap : 0.000424s : 1.61% type_inference : 0.021616s : 82.01% event_method : 0.000013s : 0.05% auto_monad : 0.000057s : 0.22% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000034s : 0.13% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000025s : 0.09% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000020s : 0.08% optimize.rewriter_before_opt_a : 0.000047s : 0.18% optimize.opt_a.expand_dump_flag : 0.000004s : 0.02% optimize.opt_a.switch_simplify : 0.000033s : 0.12% optimize.opt_a.loop_unroll : 0.000021s : 0.08% optimize.opt_a.a_1 : 0.000443s : 1.68% optimize.opt_a.with_stream_mark : 0.000024s : 0.09% optimize.opt_a.recompute_prepare : 0.000015s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000236s : 0.90% optimize.opt_a.accelerated_algorithm : 0.000029s : 0.11% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.05% optimize.opt_a.merge_send_recv : 0.000014s : 0.05% optimize.opt_a.auto_parallel : 0.000012s : 0.05% optimize.opt_a.parallel : 0.000023s : 0.09% optimize.opt_a.flash_sp : 0.000011s : 0.04% optimize.opt_a.merge_comm : 0.000007s : 0.03% optimize.opt_a.allreduce_fusion : 0.000007s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.06% optimize.opt_a.virtual_dataset : 0.000012s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.04% optimize.opt_a.virtual_output : 0.000012s : 0.05% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000018s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000020s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.03% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000019s : 0.07% optimize.opt_a.a_after_grad : 0.000018s : 0.07% optimize.opt_a.renormalize : 0.000484s : 1.84% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.08% optimize.opt_a.cse : 0.000044s : 0.17% optimize.opt_a.a_3 : 0.000107s : 0.41% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.15% optimize.convert_after_rewriter : 0.000010s : 0.04% optimize.order_py_execute_after_rewriter : 0.000008s : 0.03% optimize.mutable_eliminate : 0.000487s : 1.85% optimize.opt_b.b_1 : 0.000163s : 0.62% optimize.opt_b.b_2 : 0.000008s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000019s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.07% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000029s : 0.11% optimize.loop_unroll : 0.000451s : 1.71% optimize.opt_after_cconv.c_1 : 0.000028s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000021s : 0.08% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.07% optimize.tuple_transform.d_1 : 0.000041s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000053s : 0.20% optimize.cse_after_recomputation.cse : 0.000013s : 0.05% optimize.environ_conv : 0.000009s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000005s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000004s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000003s : 0.01% optimize.interleave_parallel_branches : 0.000004s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.02% optimize.control_data_broadcast_order : 0.000017s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000007s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.03% optimize.overlap_grad_flash_sp : 0.000023s : 0.09% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.08% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000524s : 1.99% validate : 0.000039s : 0.15% Time group info: ------[substitution.] 0.000178 25 34.07% : 0.000061s : 4: substitution.arithmetic_simplify 0.98% : 0.000002s : 2: substitution.elim_not_effective 0.78% : 0.000001s : 2: substitution.fold_const_symbol 3.26% : 0.000006s : 3: substitution.graph_param_transform 48.51% : 0.000086s : 2: substitution.inline 1.93% : 0.000003s : 4: substitution.j_node_and_user_rematch 6.25% : 0.000011s : 2: substitution.less_batch_normalization 2.43% : 0.000004s : 4: substitution.remove_not_recompute_node 1.79% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.021562 2 97.82% : 0.021091s : 1: type_inference.infer 2.18% : 0.000471s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000085 2 100.00% : 0.000085s : 2: match.inline ------[predicate.] 0.000135 754 0.83% : 0.000001s : 7: predicate.accumulaten_eliminater 1.33% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.66% : 0.000001s : 6: predicate.addn_check_dump 0.88% : 0.000001s : 7: predicate.addn_zero_filter 0.72% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.86% : 0.000004s : 13: predicate.arithmetic_simplify 0.80% : 0.000001s : 7: predicate.cast_eliminate 0.74% : 0.000001s : 6: predicate.check_bprop_eliminate 0.68% : 0.000001s : 6: predicate.compare_switch_simplify 0.24% : 0.000000s : 3: predicate.const_output_eliminate 0.82% : 0.000001s : 6: predicate.depend_value_elim 0.72% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.96% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.76% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.65% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 3: predicate.elim_not_effective 0.47% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000002s : 10: predicate.environ_add_const_eliminate 1.09% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.01% : 0.000001s : 10: predicate.environ_get_depend_swap 1.74% : 0.000002s : 16: predicate.environ_get_eliminate 1.08% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.91% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.91% : 0.000003s : 9: predicate.float_depend_g_call 0.69% : 0.000001s : 6: predicate.float_environ_get_switch 0.98% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.78% : 0.000001s : 6: predicate.get_grad_eliminate 0.28% : 0.000000s : 3: predicate.graph_param_transform 0.85% : 0.000001s : 6: predicate.incorporate_call 0.65% : 0.000001s : 6: predicate.incorporate_call_switch 7.03% : 0.000009s : 34: predicate.inline 1.01% : 0.000001s : 6: predicate.inline_without_move 0.39% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.16% : 0.000002s : 6: predicate.less_batch_normalization 1.60% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.97% : 0.000003s : 20: predicate.load_eliminater 1.39% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.65% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.65% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.72% : 0.000001s : 6: predicate.merge_addn 0.64% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.74% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.69% : 0.000001s : 7: predicate.minmaximum_grad 1.16% : 0.000002s : 3: predicate.mutable_eliminate 0.39% : 0.000001s : 3: predicate.opt_reshape 0.45% : 0.000001s : 3: predicate.parallel_virtual_node 1.32% : 0.000002s : 9: predicate.partial_defer_inline 1.24% : 0.000002s : 10: predicate.partial_eliminate 0.75% : 0.000001s : 7: predicate.print_const_string_wrapper 0.76% : 0.000001s : 6: predicate.reduce_all_const_elim 0.94% : 0.000001s : 7: predicate.reduce_eliminate 2.06% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.66% : 0.000001s : 6: predicate.remove_not_recompute_node 1.14% : 0.000002s : 13: predicate.replace_applicator 0.68% : 0.000001s : 6: predicate.replace_old_param 0.47% : 0.000001s : 3: predicate.reset_defer_inline 0.78% : 0.000001s : 7: predicate.reshape_eliminate 0.79% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.50% : 0.000001s : 3: predicate.row_tensor_eliminate 0.92% : 0.000001s : 6: predicate.same_eliminate 0.56% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.93% : 0.000001s : 6: predicate.shard_identity_eliminate 0.87% : 0.000001s : 6: predicate.special_op_eliminate 0.92% : 0.000001s : 6: predicate.specialize_transform 1.08% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 1.17% : 0.000002s : 6: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.02% : 0.000001s : 9: predicate.switch_defer_inline 1.70% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.38% : 0.000006s : 32: predicate.switch_simplify 0.88% : 0.000001s : 7: predicate.tile_eliminate 0.82% : 0.000001s : 7: predicate.transpose_eliminate 1.61% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.34% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.15% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.54% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.43% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.53% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.90% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.77% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.50% : 0.000001s : 3: predicate.value_based_eliminate 0.80% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.78% : 0.000001s : 6: predicate.virtual_output_eliminate 0.33% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.62% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000212 5 8.17% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.83% : 0.000195s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.040682 192 0.01% : 0.000006s : 1: ForceFp32Comm 7.84% : 0.003188s : 1: add_attr 7.80% : 0.003174s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.14% : 0.000058s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.16% : 0.000066s : 1: auto_monad 0.07% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.15% : 0.000468s : 1: bootstrap 0.08% : 0.000032s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.05% : 0.000021s : 1: control_data_broadcast_order 0.03% : 0.000013s : 1: convert_after_rewriter 0.09% : 0.000035s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000023s : 1: detach_backward 0.03% : 0.000012s : 1: environ_conv 0.06% : 0.000024s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.03% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.02% : 0.000009s : 1: inline 0.02% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 1.12% : 0.000457s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 1.21% : 0.000493s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.04% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000014s : 1: opt.transform.mutable_eliminate 2.10% : 0.000856s : 78: opt.transform.opt_a 0.07% : 0.000027s : 1: opt.transform.opt_after_cconv 0.07% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.24% : 0.000098s : 28: opt.transform.opt_b 0.11% : 0.000046s : 2: opt.transform.opt_trans_graph 0.10% : 0.000039s : 4: opt.transform.symbol_engine_opt 6.20% : 0.002523s : 1: opt_a 0.33% : 0.000132s : 1: opt_after_cconv 1.32% : 0.000537s : 1: opt_after_jit_grad 0.67% : 0.000273s : 1: opt_b 12.51% : 0.005088s : 1: optimize 0.05% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000011s : 1: order_py_execute_after_rewriter 0.07% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.08% : 0.000032s : 1: pre_auto_parallel 0.06% : 0.000023s : 1: py_interpret_to_execute 0.04% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.06% : 0.000023s : 1: remove_dup_value 0.66% : 0.000270s : 1: renormalize.infer 0.51% : 0.000206s : 1: renormalize.specialize 0.02% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000044s : 1: rewriter_after_opt_a 0.13% : 0.000051s : 1: rewriter_before_opt_a 0.02% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000008s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.03% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.27% : 0.000110s : 1: symbol_engine_optimizer 0.23% : 0.000094s : 1: tuple_transform 53.22% : 0.021651s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:30.146.099 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0132798, [21] [bootstrap]: 0.00041799 [type_inference]: 0.00503972 [event_method]: 1.219e-05 [auto_monad]: 5.343e-05 [graph_reusing]: 5.07999e-06 [inline]: 2.96001e-06 [add_attr]: 0.00305895, [1] [add_attr_with_inline]: 0.00305049, [1] [Cycle 1]: 4.677e-05, [2] [tag_attr]: 1.282e-05 [meta_addattr_fg_expand]: 3.90998e-06 [parallel-infer-symbol]: 2.88e-06 [pre_auto_parallel]: 2.54e-05 [insert-virtual-dataset]: 2.34001e-06 [parallel-infer-symbol-second]: 8.60018e-07 [dataset_repeat_opt]: 2.47001e-06 [pipeline_split]: 1.54998e-06 [optimize]: 0.00400798, [53] [py_interpret_to_execute]: 1.656e-05 [rewriter_before_opt_a]: 4.313e-05 [opt_a]: 0.00214836, [2] [Cycle 1]: 0.00144845, [45] [expand_dump_flag]: 2.94001e-06 [switch_simplify]: 2.696e-05 [loop_unroll]: 1.421e-05 [a_1]: 0.00030776 [with_stream_mark]: 1.645e-05 [recompute_prepare]: 9.66998e-06 [updatestate_depend_eliminate]: 3.98001e-06 [updatestate_assign_eliminate]: 3.34001e-06 [updatestate_loads_eliminate]: 3.02002e-06 [parameter_eliminate]: 1.72999e-06 [a_2]: 9.446e-05 [accelerated_algorithm]: 1.898e-05 [shard]: 2.07999e-06 [meta_shard_fg_expand]: 1.81e-06 [shard_inline]: 6.78998e-06 [merge_send_recv]: 7.87e-06 [auto_parallel]: 6.04001e-06 [parallel]: 1.83e-05 [flash_sp]: 7.33e-06 [merge_comm]: 3.88001e-06 [allreduce_fusion]: 3.59002e-06 [matmul_add_comm_reduction]: 1.029e-05 [allreduce_slice_to_reducescatter]: 5.8001e-07 [virtual_shard_identity]: 7.88001e-06 [virtual_dataset]: 6.56e-06 [get_grad_eliminate_]: 6.21998e-06 [virtual_output]: 6.76999e-06 [merge_forward]: 3.81001e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 9.89999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.263e-05 [merge_recompute_call_nodes]: 1.43002e-06 [before_grad]: 1.083e-05 [set_forward_comm_id_for_comm_node_pass]: 3.64002e-06 [meta_fg_expand]: 2.74001e-06 [flash_sp_send_recv_attached]: 4.42998e-06 [receive_attached]: 1.99e-06 [after_resolve]: 1.078e-05 [a_after_grad]: 9.92001e-06 [renormalize]: 0.00044275 [add_forward_monad_depend]: 5.13002e-06 [auto_monad_grad]: 2.03002e-06 [auto_monad_eliminator]: 1.371e-05 [cse]: 2.867e-05 [a_3]: 4.458e-05 [Cycle 2]: 0.00069006, [45] [expand_dump_flag]: 9.80013e-07 [switch_simplify]: 7.08e-06 [loop_unroll]: 5.72001e-06 [a_1]: 0.00012509 [with_stream_mark]: 8.17e-06 [recompute_prepare]: 6.38003e-06 [updatestate_depend_eliminate]: 2.94001e-06 [updatestate_assign_eliminate]: 2.34001e-06 [updatestate_loads_eliminate]: 2.77002e-06 [parameter_eliminate]: 9.30013e-07 [a_2]: 7.943e-05 [accelerated_algorithm]: 9.11002e-06 [shard]: 9.50007e-07 [meta_shard_fg_expand]: 1.32e-06 [shard_inline]: 6.04001e-06 [merge_send_recv]: 4.87e-06 [auto_parallel]: 5.52001e-06 [parallel]: 4.84e-06 [flash_sp]: 3.56999e-06 [merge_comm]: 3.32002e-06 [allreduce_fusion]: 3.08e-06 [matmul_add_comm_reduction]: 6.69999e-06 [allreduce_slice_to_reducescatter]: 3.19997e-07 [virtual_shard_identity]: 6.61e-06 [virtual_dataset]: 5.56e-06 [get_grad_eliminate_]: 5.54e-06 [virtual_output]: 5.33002e-06 [merge_forward]: 2.74999e-06 [cell_reuse_recompute_pass]: 1.23002e-06 [offload_activation]: 6.17999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.061e-05 [merge_recompute_call_nodes]: 7.59988e-07 [before_grad]: 8.67e-06 [set_forward_comm_id_for_comm_node_pass]: 3.58999e-06 [meta_fg_expand]: 2.11e-06 [flash_sp_send_recv_attached]: 9.79984e-07 [receive_attached]: 1.02e-06 [after_resolve]: 8.36002e-06 [a_after_grad]: 8e-06 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 1.30999e-06 [auto_monad_grad]: 1.07e-06 [auto_monad_eliminator]: 7.11001e-06 [cse]: 1.912e-05 [a_3]: 3.594e-05 [py_interpret_to_execute_after_opt_a]: 7.91001e-06 [slice_cell_reuse_recomputed_activation]: 2.25002e-06 [rewriter_after_opt_a]: 3.483e-05 [convert_after_rewriter]: 6.48e-06 [order_py_execute_after_rewriter]: 5.38002e-06 [mutable_eliminate]: 0.0004604 [opt_b]: 0.00019614, [1] [Cycle 1]: 0.00018914, [7] [b_1]: 0.00011785 [b_2]: 7.16999e-06 [updatestate_depend_eliminate]: 5.20999e-06 [updatestate_assign_eliminate]: 2.36e-06 [updatestate_loads_eliminate]: 2.65997e-06 [renormalize]: 4.00003e-07 [cse]: 1.774e-05 [optimize_parallel_all_gather_comm]: 1.548e-05 [overlap_param_gather]: 1.87001e-06 [cconv]: 2.303e-05 [loop_unroll]: 0.0004123 [opt_after_cconv]: 9.743e-05, [1] [Cycle 1]: 9.181e-05, [7] [c_1]: 2.798e-05 [parameter_eliminate]: 2.44999e-06 [updatestate_depend_eliminate]: 5.05999e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.30002e-06 [cse]: 1.73e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 1.435e-05 [tuple_transform]: 6.845e-05, [1] [Cycle 1]: 6.393e-05, [4] [d_1]: 3.763e-05 [none_parameter_eliminate]: 1.50001e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 6.64999e-06 [partial_unused_args_eliminate]: 2.25002e-06 [add_recomputation]: 4.572e-05 [cse_after_recomputation]: 2.138e-05, [1] [Cycle 1]: 1.709e-05, [1] [cse]: 1.165e-05 [environ_conv]: 4.95999e-06 [swap_dp_allreduce_reducescatter]: 5.11997e-06 [bias_add_comm_swap]: 2.63e-06 [label_micro_interleaved_index]: 4.13999e-06 [label_fine_grained_interleaved_index]: 2.89999e-06 [merge_cast_opt]: 1.47001e-06 [slice_recompute_activation]: 2.49999e-06 [micro_interleaved_order_control]: 2.66999e-06 [assign_add_opt]: 1.49998e-06 [ForceFp32Comm]: 8.39995e-07 [remove_cast_before_assign_add]: 1.49e-06 [full_micro_interleaved_order_control]: 2.19999e-06 [reorder_send_recv_between_fp_bp]: 2.64999e-06 [comm_op_add_attrs]: 1.15001e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.17999e-06 [interleave_parallel_branches]: 1.08001e-06 [overlap_opt_shard_in_pipeline]: 1.49e-06 [overlap_opt_shard_grad_in_pipeline]: 1.76e-06 [control_data_broadcast_order]: 1.27e-05 [grouped_pairwise_exchange_alltoall]: 1.94e-06 [offloading_packed_experts]: 3.85998e-06 [overlap_recompute_and_grad_model_parallel]: 4.62998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.17999e-06 [overlap_grad_ring_attention]: 4.36002e-06 [overlap_grad_flash_sp]: 1.732e-05 [begin_end_overlap_inline]: 6.29982e-07 [split_matmul_comm_elemetwise]: 2.32001e-06 [split_layernorm_comm]: 1.72999e-06 [handle_group_info]: 9.00007e-07 [symbol_engine_optimizer]: 7.353e-05, [1] [Cycle 1]: 6.954e-05, [6] [build]: 2.73e-06 [elim_shapecalc]: 9.43002e-06 [elim_not_effective]: 1.219e-05 [opt_reshape]: 6.39999e-06 [fold_const_symbol]: 9.92001e-06 [renormalize]: 2.10013e-07 [detach_backward]: 1.59e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 1.625e-05 [get_jit_bprop_graph]: 1.20999e-06 [rewriter_after_jit_bprop_graph]: 3.95e-06 [opt_after_jit_grad]: 0.00044605 [validate]: 3.501e-05 Sums bootstrap : 0.000418s : 4.53% type_inference : 0.005040s : 54.59% event_method : 0.000012s : 0.13% auto_monad : 0.000053s : 0.58% graph_reusing : 0.000005s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000013s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000025s : 0.28% insert-virtual-dataset : 0.000002s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.03% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000017s : 0.18% optimize.rewriter_before_opt_a : 0.000043s : 0.47% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000034s : 0.37% optimize.opt_a.loop_unroll : 0.000020s : 0.22% optimize.opt_a.a_1 : 0.000433s : 4.69% optimize.opt_a.with_stream_mark : 0.000025s : 0.27% optimize.opt_a.recompute_prepare : 0.000016s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000174s : 1.88% optimize.opt_a.accelerated_algorithm : 0.000028s : 0.30% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.14% optimize.opt_a.merge_send_recv : 0.000013s : 0.14% optimize.opt_a.auto_parallel : 0.000012s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.25% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.13% optimize.opt_a.virtual_output : 0.000012s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.06% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000019s : 0.21% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000443s : 4.80% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.23% optimize.opt_a.cse : 0.000048s : 0.52% optimize.opt_a.a_3 : 0.000081s : 0.87% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000035s : 0.38% optimize.convert_after_rewriter : 0.000006s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.06% optimize.mutable_eliminate : 0.000460s : 4.99% optimize.opt_b.b_1 : 0.000118s : 1.28% optimize.opt_b.b_2 : 0.000007s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000015s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000023s : 0.25% optimize.loop_unroll : 0.000412s : 4.47% optimize.opt_after_cconv.c_1 : 0.000028s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.16% optimize.tuple_transform.d_1 : 0.000038s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000046s : 0.50% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.02% optimize.slice_recompute_activation : 0.000002s : 0.03% optimize.micro_interleaved_order_control : 0.000003s : 0.03% optimize.assign_add_opt : 0.000001s : 0.02% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.02% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.05% optimize.overlap_grad_flash_sp : 0.000017s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.03% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000016s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000446s : 4.83% validate : 0.000035s : 0.38% Time group info: ------[substitution.] 0.000168 25 32.85% : 0.000055s : 4: substitution.arithmetic_simplify 1.18% : 0.000002s : 2: substitution.elim_not_effective 0.96% : 0.000002s : 2: substitution.fold_const_symbol 3.03% : 0.000005s : 3: substitution.graph_param_transform 49.08% : 0.000082s : 2: substitution.inline 1.89% : 0.000003s : 4: substitution.j_node_and_user_rematch 6.43% : 0.000011s : 2: substitution.less_batch_normalization 2.76% : 0.000005s : 4: substitution.remove_not_recompute_node 1.82% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004998 2 91.28% : 0.004562s : 1: type_inference.infer 8.72% : 0.000436s : 1: type_inference.specialize ------[replace.] 0.000019 2 100.00% : 0.000019s : 2: replace.inline ------[match.] 0.000081 2 100.00% : 0.000081s : 2: match.inline ------[predicate.] 0.000132 754 0.77% : 0.000001s : 7: predicate.accumulaten_eliminater 1.06% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.63% : 0.000001s : 6: predicate.addn_check_dump 0.93% : 0.000001s : 7: predicate.addn_zero_filter 0.71% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.74% : 0.000004s : 13: predicate.arithmetic_simplify 0.87% : 0.000001s : 7: predicate.cast_eliminate 0.78% : 0.000001s : 6: predicate.check_bprop_eliminate 0.66% : 0.000001s : 6: predicate.compare_switch_simplify 0.26% : 0.000000s : 3: predicate.const_output_eliminate 0.83% : 0.000001s : 6: predicate.depend_value_elim 0.77% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.99% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.76% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.23% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.35% : 0.000000s : 3: predicate.elim_not_effective 0.51% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.96% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.96% : 0.000001s : 10: predicate.environ_get_depend_swap 1.67% : 0.000002s : 16: predicate.environ_get_eliminate 1.02% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.95% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.94% : 0.000003s : 9: predicate.float_depend_g_call 0.68% : 0.000001s : 6: predicate.float_environ_get_switch 0.92% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.88% : 0.000001s : 6: predicate.get_grad_eliminate 0.31% : 0.000000s : 3: predicate.graph_param_transform 0.77% : 0.000001s : 6: predicate.incorporate_call 0.65% : 0.000001s : 6: predicate.incorporate_call_switch 6.32% : 0.000008s : 34: predicate.inline 1.16% : 0.000002s : 6: predicate.inline_without_move 0.42% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.20% : 0.000002s : 6: predicate.less_batch_normalization 1.88% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.02% : 0.000003s : 20: predicate.load_eliminater 1.17% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.58% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.82% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.68% : 0.000001s : 6: predicate.merge_addn 0.80% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.66% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.66% : 0.000001s : 7: predicate.minmaximum_grad 1.24% : 0.000002s : 3: predicate.mutable_eliminate 0.43% : 0.000001s : 3: predicate.opt_reshape 0.45% : 0.000001s : 3: predicate.parallel_virtual_node 1.27% : 0.000002s : 9: predicate.partial_defer_inline 1.27% : 0.000002s : 10: predicate.partial_eliminate 0.80% : 0.000001s : 7: predicate.print_const_string_wrapper 0.85% : 0.000001s : 6: predicate.reduce_all_const_elim 1.09% : 0.000001s : 7: predicate.reduce_eliminate 2.03% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.65% : 0.000001s : 6: predicate.remove_not_recompute_node 1.18% : 0.000002s : 13: predicate.replace_applicator 0.84% : 0.000001s : 6: predicate.replace_old_param 0.34% : 0.000000s : 3: predicate.reset_defer_inline 0.81% : 0.000001s : 7: predicate.reshape_eliminate 0.78% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 3: predicate.row_tensor_eliminate 0.93% : 0.000001s : 6: predicate.same_eliminate 0.56% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.01% : 0.000001s : 6: predicate.shard_identity_eliminate 0.87% : 0.000001s : 6: predicate.special_op_eliminate 0.98% : 0.000001s : 6: predicate.specialize_transform 1.04% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.46% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.03% : 0.000001s : 9: predicate.switch_defer_inline 1.72% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.57% : 0.000006s : 32: predicate.switch_simplify 0.77% : 0.000001s : 7: predicate.tile_eliminate 1.06% : 0.000001s : 7: predicate.transpose_eliminate 1.56% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.39% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.41% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.49% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.59% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.92% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.79% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.50% : 0.000001s : 3: predicate.value_based_eliminate 0.83% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.79% : 0.000001s : 6: predicate.virtual_output_eliminate 0.39% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.81% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000204 5 7.78% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.22% : 0.000188s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.021757 192 0.02% : 0.000003s : 1: ForceFp32Comm 14.08% : 0.003064s : 1: add_attr 14.04% : 0.003054s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000050s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000058s : 1: auto_monad 0.09% : 0.000020s : 1: auto_monad_reorder 0.02% : 0.000003s : 1: begin_end_overlap_inline 0.03% : 0.000005s : 1: bias_add_comm_swap 2.05% : 0.000445s : 1: bootstrap 0.12% : 0.000026s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000008s : 1: environ_conv 0.08% : 0.000018s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000008s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.03% : 0.000006s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.93% : 0.000420s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.03% : 0.000006s : 1: micro_interleaved_order_control 2.15% : 0.000469s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 3.85% : 0.000838s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.43% : 0.000094s : 28: opt.transform.opt_b 0.19% : 0.000042s : 2: opt.transform.opt_trans_graph 0.16% : 0.000034s : 4: opt.transform.symbol_engine_opt 9.89% : 0.002151s : 1: opt_a 0.46% : 0.000101s : 1: opt_after_cconv 2.09% : 0.000454s : 1: opt_after_jit_grad 0.92% : 0.000199s : 1: opt_b 18.44% : 0.004012s : 1: optimize 0.09% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000021s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000030s : 1: pre_auto_parallel 0.09% : 0.000020s : 1: py_interpret_to_execute 0.05% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000018s : 1: remove_dup_value 1.07% : 0.000232s : 1: renormalize.infer 0.94% : 0.000203s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000039s : 1: rewriter_after_opt_a 0.22% : 0.000047s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000076s : 1: symbol_engine_optimizer 0.33% : 0.000071s : 1: tuple_transform 23.24% : 0.005056s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:30.621.094 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:30.621.363 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0145804, [21] [bootstrap]: 0.00041884 [type_inference]: 0.00524374 [event_method]: 1.243e-05 [auto_monad]: 5.52e-05 [graph_reusing]: 5.25999e-06 [inline]: 2.14e-06 [add_attr]: 0.003062, [1] [add_attr_with_inline]: 0.00305368, [1] [Cycle 1]: 5.971e-05, [2] [tag_attr]: 1.435e-05 [meta_addattr_fg_expand]: 3.97002e-06 [parallel-infer-symbol]: 2.79999e-06 [pre_auto_parallel]: 2.377e-05 [insert-virtual-dataset]: 2.27001e-06 [parallel-infer-symbol-second]: 7.00005e-07 [dataset_repeat_opt]: 1.97999e-06 [pipeline_split]: 1.55999e-06 [optimize]: 0.00464079, [53] [py_interpret_to_execute]: 1.97e-05 [rewriter_before_opt_a]: 4.692e-05 [opt_a]: 0.00248602, [2] [Cycle 1]: 0.00165087, [45] [expand_dump_flag]: 2.78e-06 [switch_simplify]: 2.513e-05 [loop_unroll]: 1.414e-05 [a_1]: 0.00030531 [with_stream_mark]: 1.443e-05 [recompute_prepare]: 8.94e-06 [updatestate_depend_eliminate]: 3.84002e-06 [updatestate_assign_eliminate]: 3.63e-06 [updatestate_loads_eliminate]: 2.91e-06 [parameter_eliminate]: 2.32999e-06 [a_2]: 0.00012157 [accelerated_algorithm]: 1.88e-05 [shard]: 2.36e-06 [meta_shard_fg_expand]: 1.64e-06 [shard_inline]: 6.41998e-06 [merge_send_recv]: 7.88999e-06 [auto_parallel]: 5.94e-06 [parallel]: 1.76e-05 [flash_sp]: 7e-06 [merge_comm]: 4.06001e-06 [allreduce_fusion]: 3.3e-06 [matmul_add_comm_reduction]: 1.043e-05 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 7.68999e-06 [virtual_dataset]: 5.97001e-06 [get_grad_eliminate_]: 6.39001e-06 [virtual_output]: 6.81999e-06 [merge_forward]: 4.29002e-06 [cell_reuse_recompute_pass]: 1.61998e-06 [offload_activation]: 9.91e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.499e-05 [merge_recompute_call_nodes]: 1.57999e-06 [before_grad]: 1.091e-05 [set_forward_comm_id_for_comm_node_pass]: 3.66999e-06 [meta_fg_expand]: 2.64001e-06 [flash_sp_send_recv_attached]: 5.44e-06 [receive_attached]: 2.14999e-06 [after_resolve]: 1.009e-05 [a_after_grad]: 9.57001e-06 [renormalize]: 0.00045628 [add_forward_monad_depend]: 5.51e-06 [auto_monad_grad]: 1.87001e-06 [auto_monad_eliminator]: 1.409e-05 [cse]: 2.8e-05 [a_3]: 5.873e-05 [Cycle 2]: 0.00082164, [45] [expand_dump_flag]: 1.02e-06 [switch_simplify]: 7.07002e-06 [loop_unroll]: 5.96e-06 [a_1]: 0.00012849 [with_stream_mark]: 8.67e-06 [recompute_prepare]: 6.43e-06 [updatestate_depend_eliminate]: 3.25e-06 [updatestate_assign_eliminate]: 2.62001e-06 [updatestate_loads_eliminate]: 2.53e-06 [parameter_eliminate]: 9.39996e-07 [a_2]: 0.00011021 [accelerated_algorithm]: 9.32999e-06 [shard]: 9.00007e-07 [meta_shard_fg_expand]: 1.51998e-06 [shard_inline]: 6.90002e-06 [merge_send_recv]: 5.71e-06 [auto_parallel]: 5.98002e-06 [parallel]: 4.93001e-06 [flash_sp]: 3.63999e-06 [merge_comm]: 3.24001e-06 [allreduce_fusion]: 2.91e-06 [matmul_add_comm_reduction]: 7.48e-06 [allreduce_slice_to_reducescatter]: 3.39991e-07 [virtual_shard_identity]: 6.87002e-06 [virtual_dataset]: 5.92001e-06 [get_grad_eliminate_]: 5.89e-06 [virtual_output]: 5.40001e-06 [merge_forward]: 2.73003e-06 [cell_reuse_recompute_pass]: 1.37999e-06 [offload_activation]: 6.60002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.237e-05 [merge_recompute_call_nodes]: 8.89995e-07 [before_grad]: 9.07999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.7e-06 [meta_fg_expand]: 1.96998e-06 [flash_sp_send_recv_attached]: 8.30012e-07 [receive_attached]: 9.70002e-07 [after_resolve]: 9.00999e-06 [a_after_grad]: 8.62e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.14e-06 [auto_monad_grad]: 9.29984e-07 [auto_monad_eliminator]: 8.1e-06 [cse]: 1.649e-05 [a_3]: 4.777e-05 [py_interpret_to_execute_after_opt_a]: 1.175e-05 [slice_cell_reuse_recomputed_activation]: 4.62e-06 [rewriter_after_opt_a]: 3.733e-05 [convert_after_rewriter]: 9.54e-06 [order_py_execute_after_rewriter]: 8.27998e-06 [mutable_eliminate]: 0.00048236 [opt_b]: 0.00026684, [1] [Cycle 1]: 0.00025849, [7] [b_1]: 0.00016269 [b_2]: 8.70999e-06 [updatestate_depend_eliminate]: 5.39e-06 [updatestate_assign_eliminate]: 2.74001e-06 [updatestate_loads_eliminate]: 2.24001e-06 [renormalize]: 7.29982e-07 [cse]: 1.939e-05 [optimize_parallel_all_gather_comm]: 1.939e-05 [overlap_param_gather]: 4.84e-06 [cconv]: 2.835e-05 [loop_unroll]: 0.00043427 [opt_after_cconv]: 0.00012211, [1] [Cycle 1]: 0.00011374, [7] [c_1]: 2.765e-05 [parameter_eliminate]: 2.46e-06 [updatestate_depend_eliminate]: 5.25999e-06 [updatestate_assign_eliminate]: 2.80002e-06 [updatestate_loads_eliminate]: 2.33998e-06 [cse]: 1.726e-05 [renormalize]: 5.00004e-07 [remove_dup_value]: 1.778e-05 [tuple_transform]: 8.481e-05, [1] [Cycle 1]: 7.739e-05, [4] [d_1]: 3.812e-05 [none_parameter_eliminate]: 1.67001e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 6.81999e-06 [partial_unused_args_eliminate]: 4.73001e-06 [add_recomputation]: 4.649e-05 [cse_after_recomputation]: 2.769e-05, [1] [Cycle 1]: 2.089e-05, [1] [cse]: 1.155e-05 [environ_conv]: 7.95998e-06 [swap_dp_allreduce_reducescatter]: 7.61999e-06 [bias_add_comm_swap]: 5.44e-06 [label_micro_interleaved_index]: 7.13e-06 [label_fine_grained_interleaved_index]: 5.39e-06 [merge_cast_opt]: 3.75e-06 [slice_recompute_activation]: 4.37e-06 [micro_interleaved_order_control]: 4.85001e-06 [assign_add_opt]: 3.49001e-06 [ForceFp32Comm]: 3.05002e-06 [remove_cast_before_assign_add]: 3.33e-06 [full_micro_interleaved_order_control]: 4.58999e-06 [reorder_send_recv_between_fp_bp]: 5.32001e-06 [comm_op_add_attrs]: 3.75998e-06 [add_comm_op_reuse_tag]: 3.18e-06 [interleave_split_concat_branches]: 3.63e-06 [interleave_parallel_branches]: 3.51001e-06 [overlap_opt_shard_in_pipeline]: 3.9e-06 [overlap_opt_shard_grad_in_pipeline]: 4.62e-06 [control_data_broadcast_order]: 1.517e-05 [grouped_pairwise_exchange_alltoall]: 4.27e-06 [offloading_packed_experts]: 6.14001e-06 [overlap_recompute_and_grad_model_parallel]: 6.86999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.55e-06 [overlap_recompute_allgather_and_fa_grad]: 3.83001e-06 [overlap_recompute_comm]: 4.80999e-06 [overlap_grad_ring_attention]: 6.43e-06 [overlap_grad_flash_sp]: 2.054e-05 [begin_end_overlap_inline]: 2.91999e-06 [split_matmul_comm_elemetwise]: 4.43001e-06 [split_layernorm_comm]: 3.86999e-06 [handle_group_info]: 3.75e-06 [symbol_engine_optimizer]: 9.487e-05, [1] [Cycle 1]: 8.837e-05, [6] [build]: 2.66e-06 [elim_shapecalc]: 9.28002e-06 [elim_not_effective]: 1.243e-05 [opt_reshape]: 7.13e-06 [fold_const_symbol]: 1.015e-05 [renormalize]: 2.30008e-07 [detach_backward]: 3.21001e-06 [pipeline_parallel_scheduler]: 1.64998e-06 [auto_monad_reorder]: 1.809e-05 [get_jit_bprop_graph]: 1.37e-06 [rewriter_after_jit_bprop_graph]: 4.17e-06 [opt_after_jit_grad]: 0.00047529 [validate]: 3.351e-05 Sums bootstrap : 0.000419s : 4.28% type_inference : 0.005244s : 53.59% event_method : 0.000012s : 0.13% auto_monad : 0.000055s : 0.56% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000024s : 0.24% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000020s : 0.20% optimize.rewriter_before_opt_a : 0.000047s : 0.48% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000032s : 0.33% optimize.opt_a.loop_unroll : 0.000020s : 0.21% optimize.opt_a.a_1 : 0.000434s : 4.43% optimize.opt_a.with_stream_mark : 0.000023s : 0.24% optimize.opt_a.recompute_prepare : 0.000015s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000232s : 2.37% optimize.opt_a.accelerated_algorithm : 0.000028s : 0.29% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.14% optimize.opt_a.merge_send_recv : 0.000014s : 0.14% optimize.opt_a.auto_parallel : 0.000012s : 0.12% optimize.opt_a.parallel : 0.000023s : 0.23% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000006s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.15% optimize.opt_a.virtual_dataset : 0.000012s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.13% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.06% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000019s : 0.20% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000456s : 4.66% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.23% optimize.opt_a.cse : 0.000044s : 0.45% optimize.opt_a.a_3 : 0.000106s : 1.09% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000037s : 0.38% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000482s : 4.93% optimize.opt_b.b_1 : 0.000163s : 1.66% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000019s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000028s : 0.29% optimize.loop_unroll : 0.000434s : 4.44% optimize.opt_after_cconv.c_1 : 0.000028s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.18% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.18% optimize.tuple_transform.d_1 : 0.000038s : 0.39% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000046s : 0.48% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000008s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.06% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.06% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000003s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.05% optimize.control_data_broadcast_order : 0.000015s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000006s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000006s : 0.07% optimize.overlap_grad_flash_sp : 0.000021s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000475s : 4.86% validate : 0.000034s : 0.34% Time group info: ------[substitution.] 0.000172 25 34.88% : 0.000060s : 4: substitution.arithmetic_simplify 0.98% : 0.000002s : 2: substitution.elim_not_effective 0.81% : 0.000001s : 2: substitution.fold_const_symbol 3.07% : 0.000005s : 3: substitution.graph_param_transform 47.37% : 0.000081s : 2: substitution.inline 2.00% : 0.000003s : 4: substitution.j_node_and_user_rematch 6.09% : 0.000010s : 2: substitution.less_batch_normalization 2.87% : 0.000005s : 4: substitution.remove_not_recompute_node 1.94% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.005200 2 91.58% : 0.004762s : 1: type_inference.infer 8.42% : 0.000438s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000080 2 100.00% : 0.000080s : 2: match.inline ------[predicate.] 0.000134 754 0.81% : 0.000001s : 7: predicate.accumulaten_eliminater 1.12% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.63% : 0.000001s : 6: predicate.addn_check_dump 0.79% : 0.000001s : 7: predicate.addn_zero_filter 0.69% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 3.02% : 0.000004s : 13: predicate.arithmetic_simplify 0.74% : 0.000001s : 7: predicate.cast_eliminate 0.77% : 0.000001s : 6: predicate.check_bprop_eliminate 0.68% : 0.000001s : 6: predicate.compare_switch_simplify 0.23% : 0.000000s : 3: predicate.const_output_eliminate 0.80% : 0.000001s : 6: predicate.depend_value_elim 0.78% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.84% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.76% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.16% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.34% : 0.000000s : 3: predicate.elim_not_effective 0.51% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.95% : 0.000001s : 10: predicate.environ_get_depend_swap 1.85% : 0.000002s : 16: predicate.environ_get_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.92% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.91% : 0.000003s : 9: predicate.float_depend_g_call 0.67% : 0.000001s : 6: predicate.float_environ_get_switch 1.03% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 3: predicate.fold_const_symbol 0.83% : 0.000001s : 6: predicate.get_grad_eliminate 0.27% : 0.000000s : 3: predicate.graph_param_transform 0.80% : 0.000001s : 6: predicate.incorporate_call 0.68% : 0.000001s : 6: predicate.incorporate_call_switch 6.41% : 0.000009s : 34: predicate.inline 1.02% : 0.000001s : 6: predicate.inline_without_move 0.43% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.21% : 0.000002s : 6: predicate.less_batch_normalization 1.66% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.23% : 0.000003s : 20: predicate.load_eliminater 1.25% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.73% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.80% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.77% : 0.000001s : 6: predicate.merge_addn 0.67% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.77% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 7: predicate.minmaximum_grad 1.47% : 0.000002s : 3: predicate.mutable_eliminate 0.49% : 0.000001s : 3: predicate.opt_reshape 0.45% : 0.000001s : 3: predicate.parallel_virtual_node 1.39% : 0.000002s : 9: predicate.partial_defer_inline 1.25% : 0.000002s : 10: predicate.partial_eliminate 0.78% : 0.000001s : 7: predicate.print_const_string_wrapper 0.74% : 0.000001s : 6: predicate.reduce_all_const_elim 0.91% : 0.000001s : 7: predicate.reduce_eliminate 2.08% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.67% : 0.000001s : 6: predicate.remove_not_recompute_node 1.18% : 0.000002s : 13: predicate.replace_applicator 0.67% : 0.000001s : 6: predicate.replace_old_param 0.36% : 0.000000s : 3: predicate.reset_defer_inline 0.82% : 0.000001s : 7: predicate.reshape_eliminate 0.75% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.80% : 0.000001s : 3: predicate.row_tensor_eliminate 0.92% : 0.000001s : 6: predicate.same_eliminate 0.55% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.99% : 0.000001s : 6: predicate.shard_identity_eliminate 0.80% : 0.000001s : 6: predicate.special_op_eliminate 1.01% : 0.000001s : 6: predicate.specialize_transform 1.09% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 1.00% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.46% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.07% : 0.000001s : 9: predicate.switch_defer_inline 1.78% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.50% : 0.000006s : 32: predicate.switch_simplify 0.74% : 0.000001s : 7: predicate.tile_eliminate 0.76% : 0.000001s : 7: predicate.transpose_eliminate 1.77% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.39% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.11% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.41% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.43% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.52% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.92% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.84% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.50% : 0.000001s : 3: predicate.value_based_eliminate 0.80% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.80% : 0.000001s : 6: predicate.virtual_output_eliminate 0.29% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.62% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000207 5 8.17% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.83% : 0.000190s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023717 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.95% : 0.003071s : 1: add_attr 12.89% : 0.003057s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000050s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.27% : 0.000065s : 1: auto_monad 0.11% : 0.000025s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.94% : 0.000461s : 1: bootstrap 0.13% : 0.000032s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000031s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000018s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.09% : 0.000022s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.05% : 0.000011s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.85% : 0.000440s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.06% : 0.000488s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 3.56% : 0.000843s : 78: opt.transform.opt_a 0.11% : 0.000026s : 1: opt.transform.opt_after_cconv 0.10% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.42% : 0.000099s : 28: opt.transform.opt_b 0.18% : 0.000043s : 2: opt.transform.opt_trans_graph 0.15% : 0.000036s : 4: opt.transform.symbol_engine_opt 10.50% : 0.002489s : 1: opt_a 0.53% : 0.000126s : 1: opt_after_cconv 2.05% : 0.000486s : 1: opt_after_jit_grad 1.14% : 0.000271s : 1: opt_b 21.02% : 0.004985s : 1: optimize 0.10% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000024s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000031s : 1: pre_auto_parallel 0.10% : 0.000023s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000021s : 1: remove_dup_value 1.07% : 0.000253s : 1: renormalize.infer 0.83% : 0.000196s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000041s : 1: rewriter_after_opt_a 0.21% : 0.000050s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.41% : 0.000098s : 1: symbol_engine_optimizer 0.37% : 0.000088s : 1: tuple_transform 22.22% : 0.005270s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:31.176.5 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.030099, [21] [bootstrap]: 0.00042247 [type_inference]: 0.0215757 [event_method]: 1.254e-05 [auto_monad]: 5.474e-05 [graph_reusing]: 5.42999e-06 [inline]: 2.96001e-06 [add_attr]: 0.00314919, [1] [add_attr_with_inline]: 0.00313941, [1] [Cycle 1]: 5.281e-05, [2] [tag_attr]: 1.395e-05 [meta_addattr_fg_expand]: 3.7e-06 [parallel-infer-symbol]: 3.19001e-06 [pre_auto_parallel]: 2.493e-05 [insert-virtual-dataset]: 2.44999e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 1.82001e-06 [pipeline_split]: 1.69e-06 [optimize]: 0.00415996, [53] [py_interpret_to_execute]: 1.736e-05 [rewriter_before_opt_a]: 4.569e-05 [opt_a]: 0.00222197, [2] [Cycle 1]: 0.00156072, [45] [expand_dump_flag]: 3.04001e-06 [switch_simplify]: 2.8e-05 [loop_unroll]: 1.44e-05 [a_1]: 0.00031647 [with_stream_mark]: 1.67e-05 [recompute_prepare]: 9.59999e-06 [updatestate_depend_eliminate]: 4.13001e-06 [updatestate_assign_eliminate]: 3.33e-06 [updatestate_loads_eliminate]: 3.65e-06 [parameter_eliminate]: 1.86998e-06 [a_2]: 9.564e-05 [accelerated_algorithm]: 2.086e-05 [shard]: 2.25002e-06 [meta_shard_fg_expand]: 2.43e-06 [shard_inline]: 7.99997e-06 [merge_send_recv]: 8.93002e-06 [auto_parallel]: 7.02002e-06 [parallel]: 1.931e-05 [flash_sp]: 7.54002e-06 [merge_comm]: 3.91001e-06 [allreduce_fusion]: 3.85e-06 [matmul_add_comm_reduction]: 1.117e-05 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 9.09e-06 [virtual_dataset]: 8.37e-06 [get_grad_eliminate_]: 6.88e-06 [virtual_output]: 6.48e-06 [merge_forward]: 4.25e-06 [cell_reuse_recompute_pass]: 1.22999e-06 [offload_activation]: 9.89001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.344e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 1.097e-05 [set_forward_comm_id_for_comm_node_pass]: 3.81999e-06 [meta_fg_expand]: 2.66e-06 [flash_sp_send_recv_attached]: 5.18002e-06 [receive_attached]: 2.54001e-06 [after_resolve]: 1.107e-05 [a_after_grad]: 9.45001e-06 [renormalize]: 0.00051317 [add_forward_monad_depend]: 5.51e-06 [auto_monad_grad]: 2.08002e-06 [auto_monad_eliminator]: 1.504e-05 [cse]: 3.011e-05 [a_3]: 4.799e-05 [Cycle 2]: 0.00065176, [45] [expand_dump_flag]: 1.17e-06 [switch_simplify]: 7.39002e-06 [loop_unroll]: 5.82999e-06 [a_1]: 0.00013065 [with_stream_mark]: 8.64e-06 [recompute_prepare]: 6.78e-06 [updatestate_depend_eliminate]: 3.10002e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.94001e-06 [parameter_eliminate]: 1.30001e-06 [a_2]: 8.149e-05 [accelerated_algorithm]: 9.05999e-06 [shard]: 1.10001e-06 [meta_shard_fg_expand]: 1.68002e-06 [shard_inline]: 6.17001e-06 [merge_send_recv]: 5.47001e-06 [auto_parallel]: 5.62001e-06 [parallel]: 5.65001e-06 [flash_sp]: 3.88999e-06 [merge_comm]: 3.28e-06 [allreduce_fusion]: 3.08e-06 [matmul_add_comm_reduction]: 7.02002e-06 [allreduce_slice_to_reducescatter]: 7.30011e-07 [virtual_shard_identity]: 6.34001e-06 [virtual_dataset]: 5.77001e-06 [get_grad_eliminate_]: 5.45001e-06 [virtual_output]: 5.20001e-06 [merge_forward]: 3.15998e-06 [cell_reuse_recompute_pass]: 1.35001e-06 [offload_activation]: 6.64999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.074e-05 [merge_recompute_call_nodes]: 8.89995e-07 [before_grad]: 8.80001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.5e-06 [meta_fg_expand]: 2.05002e-06 [flash_sp_send_recv_attached]: 9.89996e-07 [receive_attached]: 1.32999e-06 [after_resolve]: 8.88002e-06 [a_after_grad]: 8.35001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.27e-06 [auto_monad_grad]: 1.14e-06 [auto_monad_eliminator]: 7.85998e-06 [cse]: 1.534e-05 [a_3]: 3.444e-05 [py_interpret_to_execute_after_opt_a]: 8.77e-06 [slice_cell_reuse_recomputed_activation]: 2.34999e-06 [rewriter_after_opt_a]: 3.39e-05 [convert_after_rewriter]: 7.15e-06 [order_py_execute_after_rewriter]: 5.70001e-06 [mutable_eliminate]: 0.00049614 [opt_b]: 0.00019916, [1] [Cycle 1]: 0.00019298, [7] [b_1]: 0.00011805 [b_2]: 7.23e-06 [updatestate_depend_eliminate]: 5.91998e-06 [updatestate_assign_eliminate]: 2.40002e-06 [updatestate_loads_eliminate]: 2.44001e-06 [renormalize]: 5.10016e-07 [cse]: 2.002e-05 [optimize_parallel_all_gather_comm]: 1.855e-05 [overlap_param_gather]: 1.91998e-06 [cconv]: 2.655e-05 [loop_unroll]: 0.00042849 [opt_after_cconv]: 0.00010086, [1] [Cycle 1]: 9.532e-05, [7] [c_1]: 2.806e-05 [parameter_eliminate]: 3.60003e-06 [updatestate_depend_eliminate]: 5.37001e-06 [updatestate_assign_eliminate]: 2.47001e-06 [updatestate_loads_eliminate]: 2.56e-06 [cse]: 1.837e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.524e-05 [tuple_transform]: 7.124e-05, [1] [Cycle 1]: 6.688e-05, [4] [d_1]: 3.945e-05 [none_parameter_eliminate]: 1.76998e-06 [renormalize]: 1.99972e-07 [switch_simplify]: 6.79999e-06 [partial_unused_args_eliminate]: 1.76e-06 [add_recomputation]: 4.718e-05 [cse_after_recomputation]: 2.122e-05, [1] [Cycle 1]: 1.687e-05, [1] [cse]: 1.143e-05 [environ_conv]: 4.58001e-06 [swap_dp_allreduce_reducescatter]: 5.87001e-06 [bias_add_comm_swap]: 2.62001e-06 [label_micro_interleaved_index]: 4.86002e-06 [label_fine_grained_interleaved_index]: 2.78e-06 [merge_cast_opt]: 1.43002e-06 [slice_recompute_activation]: 2.02001e-06 [micro_interleaved_order_control]: 2.07999e-06 [assign_add_opt]: 1.55999e-06 [ForceFp32Comm]: 7.89994e-07 [remove_cast_before_assign_add]: 1.14e-06 [full_micro_interleaved_order_control]: 2.28998e-06 [reorder_send_recv_between_fp_bp]: 2.40002e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.60019e-07 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.67999e-06 [overlap_opt_shard_in_pipeline]: 1.10999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.11e-06 [control_data_broadcast_order]: 1.296e-05 [grouped_pairwise_exchange_alltoall]: 1.74998e-06 [offloading_packed_experts]: 3.97e-06 [overlap_recompute_and_grad_model_parallel]: 4.62998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.13001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.39999e-06 [overlap_grad_ring_attention]: 4.1e-06 [overlap_grad_flash_sp]: 1.841e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 2.76999e-06 [split_layernorm_comm]: 1.92999e-06 [handle_group_info]: 1.02998e-06 [symbol_engine_optimizer]: 7.31e-05, [1] [Cycle 1]: 6.873e-05, [6] [build]: 2.73e-06 [elim_shapecalc]: 9.24e-06 [elim_not_effective]: 1.208e-05 [opt_reshape]: 6.74999e-06 [fold_const_symbol]: 9.87999e-06 [renormalize]: 2.3999e-07 [detach_backward]: 2.06998e-06 [pipeline_parallel_scheduler]: 1.49e-06 [auto_monad_reorder]: 1.55e-05 [get_jit_bprop_graph]: 1.59998e-06 [rewriter_after_jit_bprop_graph]: 3.98999e-06 [opt_after_jit_grad]: 0.00046237 [validate]: 3.901e-05 Sums bootstrap : 0.000422s : 1.63% type_inference : 0.021576s : 83.03% event_method : 0.000013s : 0.05% auto_monad : 0.000055s : 0.21% graph_reusing : 0.000005s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000025s : 0.10% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000017s : 0.07% optimize.rewriter_before_opt_a : 0.000046s : 0.18% optimize.opt_a.expand_dump_flag : 0.000004s : 0.02% optimize.opt_a.switch_simplify : 0.000035s : 0.14% optimize.opt_a.loop_unroll : 0.000020s : 0.08% optimize.opt_a.a_1 : 0.000447s : 1.72% optimize.opt_a.with_stream_mark : 0.000025s : 0.10% optimize.opt_a.recompute_prepare : 0.000016s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000177s : 0.68% optimize.opt_a.accelerated_algorithm : 0.000030s : 0.12% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000014s : 0.05% optimize.opt_a.merge_send_recv : 0.000014s : 0.06% optimize.opt_a.auto_parallel : 0.000013s : 0.05% optimize.opt_a.parallel : 0.000025s : 0.10% optimize.opt_a.flash_sp : 0.000011s : 0.04% optimize.opt_a.merge_comm : 0.000007s : 0.03% optimize.opt_a.allreduce_fusion : 0.000007s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.06% optimize.opt_a.virtual_dataset : 0.000014s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.05% optimize.opt_a.virtual_output : 0.000012s : 0.04% optimize.opt_a.merge_forward : 0.000007s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000017s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000020s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.03% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000020s : 0.08% optimize.opt_a.a_after_grad : 0.000018s : 0.07% optimize.opt_a.renormalize : 0.000513s : 1.98% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.03% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.09% optimize.opt_a.cse : 0.000045s : 0.17% optimize.opt_a.a_3 : 0.000082s : 0.32% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000034s : 0.13% optimize.convert_after_rewriter : 0.000007s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000496s : 1.91% optimize.opt_b.b_1 : 0.000118s : 0.45% optimize.opt_b.b_2 : 0.000007s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000020s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000027s : 0.10% optimize.loop_unroll : 0.000428s : 1.65% optimize.opt_after_cconv.c_1 : 0.000028s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000018s : 0.07% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.06% optimize.tuple_transform.d_1 : 0.000039s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000047s : 0.18% optimize.cse_after_recomputation.cse : 0.000011s : 0.04% optimize.environ_conv : 0.000005s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000002s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000013s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000018s : 0.07% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000016s : 0.06% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000462s : 1.78% validate : 0.000039s : 0.15% Time group info: ------[substitution.] 0.000181 25 32.72% : 0.000059s : 4: substitution.arithmetic_simplify 1.04% : 0.000002s : 2: substitution.elim_not_effective 0.82% : 0.000001s : 2: substitution.fold_const_symbol 3.01% : 0.000005s : 3: substitution.graph_param_transform 49.31% : 0.000089s : 2: substitution.inline 2.28% : 0.000004s : 4: substitution.j_node_and_user_rematch 6.38% : 0.000012s : 2: substitution.less_batch_normalization 2.62% : 0.000005s : 4: substitution.remove_not_recompute_node 1.82% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.021525 2 97.91% : 0.021076s : 1: type_inference.infer 2.09% : 0.000449s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000088 2 100.00% : 0.000088s : 2: match.inline ------[predicate.] 0.000136 754 0.78% : 0.000001s : 7: predicate.accumulaten_eliminater 1.13% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.68% : 0.000001s : 6: predicate.addn_check_dump 0.86% : 0.000001s : 7: predicate.addn_zero_filter 0.70% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 3.01% : 0.000004s : 13: predicate.arithmetic_simplify 0.73% : 0.000001s : 7: predicate.cast_eliminate 0.70% : 0.000001s : 6: predicate.check_bprop_eliminate 0.68% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.78% : 0.000001s : 6: predicate.depend_value_elim 0.80% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 1.03% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.69% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.27% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 3: predicate.elim_not_effective 0.51% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.00% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_depend_swap 1.83% : 0.000003s : 16: predicate.environ_get_eliminate 1.03% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.95% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.87% : 0.000003s : 9: predicate.float_depend_g_call 0.61% : 0.000001s : 6: predicate.float_environ_get_switch 0.88% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.26% : 0.000000s : 3: predicate.fold_const_symbol 0.93% : 0.000001s : 6: predicate.get_grad_eliminate 0.32% : 0.000000s : 3: predicate.graph_param_transform 0.81% : 0.000001s : 6: predicate.incorporate_call 0.63% : 0.000001s : 6: predicate.incorporate_call_switch 6.69% : 0.000009s : 34: predicate.inline 1.14% : 0.000002s : 6: predicate.inline_without_move 0.37% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.38% : 0.000002s : 6: predicate.less_batch_normalization 1.60% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.17% : 0.000003s : 20: predicate.load_eliminater 1.36% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.66% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.72% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.64% : 0.000001s : 6: predicate.merge_addn 0.68% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.65% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.78% : 0.000001s : 7: predicate.minmaximum_grad 1.77% : 0.000002s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.51% : 0.000001s : 3: predicate.parallel_virtual_node 1.19% : 0.000002s : 9: predicate.partial_defer_inline 1.23% : 0.000002s : 10: predicate.partial_eliminate 0.75% : 0.000001s : 7: predicate.print_const_string_wrapper 0.74% : 0.000001s : 6: predicate.reduce_all_const_elim 0.92% : 0.000001s : 7: predicate.reduce_eliminate 2.05% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.78% : 0.000001s : 6: predicate.remove_not_recompute_node 1.14% : 0.000002s : 13: predicate.replace_applicator 0.78% : 0.000001s : 6: predicate.replace_old_param 0.34% : 0.000000s : 3: predicate.reset_defer_inline 0.82% : 0.000001s : 7: predicate.reshape_eliminate 0.70% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 3: predicate.row_tensor_eliminate 1.06% : 0.000001s : 6: predicate.same_eliminate 0.53% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.93% : 0.000001s : 6: predicate.shard_identity_eliminate 0.90% : 0.000001s : 6: predicate.special_op_eliminate 0.89% : 0.000001s : 6: predicate.specialize_transform 0.98% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.98% : 0.000001s : 9: predicate.switch_defer_inline 1.63% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.24% : 0.000006s : 32: predicate.switch_simplify 0.84% : 0.000001s : 7: predicate.tile_eliminate 0.81% : 0.000001s : 7: predicate.transpose_eliminate 1.51% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.60% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.31% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.59% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.57% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.94% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.93% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 3: predicate.value_based_eliminate 1.09% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.77% : 0.000001s : 6: predicate.virtual_output_eliminate 0.32% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.59% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000215 5 8.40% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.60% : 0.000197s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.038911 192 0.01% : 0.000004s : 1: ForceFp32Comm 8.11% : 0.003155s : 1: add_attr 8.08% : 0.003143s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.13% : 0.000051s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.15% : 0.000060s : 1: auto_monad 0.05% : 0.000019s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 1.16% : 0.000451s : 1: bootstrap 0.08% : 0.000030s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000016s : 1: control_data_broadcast_order 0.03% : 0.000010s : 1: convert_after_rewriter 0.06% : 0.000024s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000008s : 1: environ_conv 0.05% : 0.000020s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000009s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 1.12% : 0.000437s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.30% : 0.000505s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.04% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000016s : 1: opt.transform.mutable_eliminate 2.21% : 0.000862s : 78: opt.transform.opt_a 0.07% : 0.000027s : 1: opt.transform.opt_after_cconv 0.06% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.24% : 0.000094s : 28: opt.transform.opt_b 0.11% : 0.000044s : 2: opt.transform.opt_trans_graph 0.09% : 0.000034s : 4: opt.transform.symbol_engine_opt 5.72% : 0.002225s : 1: opt_a 0.27% : 0.000105s : 1: opt_after_cconv 1.21% : 0.000472s : 1: opt_after_jit_grad 0.52% : 0.000203s : 1: opt_b 10.70% : 0.004165s : 1: optimize 0.06% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.06% : 0.000022s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.07% : 0.000029s : 1: pre_auto_parallel 0.05% : 0.000021s : 1: py_interpret_to_execute 0.03% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000019s : 1: remove_dup_value 0.73% : 0.000282s : 1: renormalize.infer 0.57% : 0.000223s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000038s : 1: rewriter_after_opt_a 0.13% : 0.000050s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.19% : 0.000076s : 1: symbol_engine_optimizer 0.19% : 0.000074s : 1: tuple_transform 55.50% : 0.021597s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:31.529.143 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:31.529.403 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0157307, [21] [bootstrap]: 0.00073336 [type_inference]: 0.00536316 [event_method]: 1.251e-05 [auto_monad]: 5.501e-05 [graph_reusing]: 5.76e-06 [inline]: 2.69001e-06 [add_attr]: 0.00315026, [1] [add_attr_with_inline]: 0.00314225, [1] [Cycle 1]: 6.462e-05, [2] [tag_attr]: 1.503e-05 [meta_addattr_fg_expand]: 3.92998e-06 [parallel-infer-symbol]: 3.09001e-06 [pre_auto_parallel]: 2.634e-05 [insert-virtual-dataset]: 2.55002e-06 [parallel-infer-symbol-second]: 7.80012e-07 [dataset_repeat_opt]: 2.10002e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.0052027, [53] [py_interpret_to_execute]: 2.143e-05 [rewriter_before_opt_a]: 5.339e-05 [opt_a]: 0.00285942, [2] [Cycle 1]: 0.00190439, [45] [expand_dump_flag]: 2.76999e-06 [switch_simplify]: 2.803e-05 [loop_unroll]: 1.638e-05 [a_1]: 0.00037488 [with_stream_mark]: 1.663e-05 [recompute_prepare]: 9.92001e-06 [updatestate_depend_eliminate]: 5.57001e-06 [updatestate_assign_eliminate]: 3.95e-06 [updatestate_loads_eliminate]: 3.54002e-06 [parameter_eliminate]: 2.27999e-06 [a_2]: 0.00014658 [accelerated_algorithm]: 2.207e-05 [shard]: 1.89e-06 [meta_shard_fg_expand]: 1.97001e-06 [shard_inline]: 8e-06 [merge_send_recv]: 9.86e-06 [auto_parallel]: 7.11999e-06 [parallel]: 1.831e-05 [flash_sp]: 8.05e-06 [merge_comm]: 4.93001e-06 [allreduce_fusion]: 4.52e-06 [matmul_add_comm_reduction]: 1.183e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 1.051e-05 [virtual_dataset]: 8.05999e-06 [get_grad_eliminate_]: 7.34002e-06 [virtual_output]: 7.77e-06 [merge_forward]: 4.62e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 1.123e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.767e-05 [merge_recompute_call_nodes]: 1.53002e-06 [before_grad]: 1.366e-05 [set_forward_comm_id_for_comm_node_pass]: 4.75999e-06 [meta_fg_expand]: 3.04999e-06 [flash_sp_send_recv_attached]: 4.47e-06 [receive_attached]: 2.29999e-06 [after_resolve]: 1.241e-05 [a_after_grad]: 1.189e-05 [renormalize]: 0.00055188 [add_forward_monad_depend]: 5.27999e-06 [auto_monad_grad]: 2.43002e-06 [auto_monad_eliminator]: 1.686e-05 [cse]: 3.657e-05 [a_3]: 7.08e-05 [Cycle 2]: 0.00094229, [45] [expand_dump_flag]: 1.00999e-06 [switch_simplify]: 9.16002e-06 [loop_unroll]: 7.18e-06 [a_1]: 0.00017098 [with_stream_mark]: 9.48002e-06 [recompute_prepare]: 7.67002e-06 [updatestate_depend_eliminate]: 4.44002e-06 [updatestate_assign_eliminate]: 3.09001e-06 [updatestate_loads_eliminate]: 2.93e-06 [parameter_eliminate]: 1.12999e-06 [a_2]: 0.00012681 [accelerated_algorithm]: 1.066e-05 [shard]: 1.32e-06 [meta_shard_fg_expand]: 1.71e-06 [shard_inline]: 7.65e-06 [merge_send_recv]: 1.64e-05 [auto_parallel]: 6.69999e-06 [parallel]: 5.54e-06 [flash_sp]: 3.55998e-06 [merge_comm]: 4.47998e-06 [allreduce_fusion]: 4.00998e-06 [matmul_add_comm_reduction]: 1.142e-05 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 8.84e-06 [virtual_dataset]: 6.98e-06 [get_grad_eliminate_]: 6.73e-06 [virtual_output]: 6.48e-06 [merge_forward]: 3.31001e-06 [cell_reuse_recompute_pass]: 1.45001e-06 [offload_activation]: 7.46001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.53e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 1.118e-05 [set_forward_comm_id_for_comm_node_pass]: 4.80001e-06 [meta_fg_expand]: 3.08e-06 [flash_sp_send_recv_attached]: 1.00001e-06 [receive_attached]: 1.00001e-06 [after_resolve]: 9.56e-06 [a_after_grad]: 9.97999e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.72001e-06 [auto_monad_grad]: 9.39996e-07 [auto_monad_eliminator]: 8.54998e-06 [cse]: 1.972e-05 [a_3]: 5.668e-05 [py_interpret_to_execute_after_opt_a]: 1.315e-05 [slice_cell_reuse_recomputed_activation]: 4.63001e-06 [rewriter_after_opt_a]: 4.423e-05 [convert_after_rewriter]: 1.05e-05 [order_py_execute_after_rewriter]: 8.67e-06 [mutable_eliminate]: 0.00050533 [opt_b]: 0.00030992, [1] [Cycle 1]: 0.00030085, [7] [b_1]: 0.00019191 [b_2]: 9.40001e-06 [updatestate_depend_eliminate]: 7.26999e-06 [updatestate_assign_eliminate]: 3.26999e-06 [updatestate_loads_eliminate]: 3.2e-06 [renormalize]: 4.2998e-07 [cse]: 2.758e-05 [optimize_parallel_all_gather_comm]: 2.217e-05 [overlap_param_gather]: 4.92999e-06 [cconv]: 2.95e-05 [loop_unroll]: 0.00045346 [opt_after_cconv]: 0.00013976, [1] [Cycle 1]: 0.000131, [7] [c_1]: 3.462e-05 [parameter_eliminate]: 3.23e-06 [updatestate_depend_eliminate]: 6.17999e-06 [updatestate_assign_eliminate]: 3.19001e-06 [updatestate_loads_eliminate]: 3.27002e-06 [cse]: 2.459e-05 [renormalize]: 4.7998e-07 [remove_dup_value]: 2.105e-05 [tuple_transform]: 9.652e-05, [1] [Cycle 1]: 8.957e-05, [4] [d_1]: 4.834e-05 [none_parameter_eliminate]: 2.00002e-06 [renormalize]: 7.7e-07 [switch_simplify]: 7.73001e-06 [partial_unused_args_eliminate]: 4.57e-06 [add_recomputation]: 5.904e-05 [cse_after_recomputation]: 3.354e-05, [1] [Cycle 1]: 2.644e-05, [1] [cse]: 1.621e-05 [environ_conv]: 9.24998e-06 [swap_dp_allreduce_reducescatter]: 9.02e-06 [bias_add_comm_swap]: 5.97999e-06 [label_micro_interleaved_index]: 6.89999e-06 [label_fine_grained_interleaved_index]: 5.07e-06 [merge_cast_opt]: 4.07e-06 [slice_recompute_activation]: 4.80999e-06 [micro_interleaved_order_control]: 4.70001e-06 [assign_add_opt]: 4.05e-06 [ForceFp32Comm]: 3.24001e-06 [remove_cast_before_assign_add]: 3.28e-06 [full_micro_interleaved_order_control]: 4.43001e-06 [reorder_send_recv_between_fp_bp]: 5.30999e-06 [comm_op_add_attrs]: 3.5e-06 [add_comm_op_reuse_tag]: 3.35e-06 [interleave_split_concat_branches]: 3.51999e-06 [interleave_parallel_branches]: 3.47002e-06 [overlap_opt_shard_in_pipeline]: 3.53999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.18999e-06 [control_data_broadcast_order]: 1.717e-05 [grouped_pairwise_exchange_alltoall]: 3.91999e-06 [offloading_packed_experts]: 7.15e-06 [overlap_recompute_and_grad_model_parallel]: 7.92e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.61999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.61001e-06 [overlap_recompute_comm]: 5.02e-06 [overlap_grad_ring_attention]: 6.79999e-06 [overlap_grad_flash_sp]: 2.391e-05 [begin_end_overlap_inline]: 2.98e-06 [split_matmul_comm_elemetwise]: 5.34998e-06 [split_layernorm_comm]: 3.85998e-06 [handle_group_info]: 3.65998e-06 [symbol_engine_optimizer]: 0.00010676, [1] [Cycle 1]: 9.995e-05, [6] [build]: 3.20998e-06 [elim_shapecalc]: 1.212e-05 [elim_not_effective]: 1.54e-05 [opt_reshape]: 8.45001e-06 [fold_const_symbol]: 1.237e-05 [renormalize]: 1.90019e-07 [detach_backward]: 3.52997e-06 [pipeline_parallel_scheduler]: 1.66e-06 [auto_monad_reorder]: 2.351e-05 [get_jit_bprop_graph]: 1.72001e-06 [rewriter_after_jit_bprop_graph]: 5.51998e-06 [opt_after_jit_grad]: 0.00050693 [validate]: 4.387e-05 Sums bootstrap : 0.000733s : 6.78% type_inference : 0.005363s : 49.56% event_method : 0.000013s : 0.12% auto_monad : 0.000055s : 0.51% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000026s : 0.24% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000021s : 0.20% optimize.rewriter_before_opt_a : 0.000053s : 0.49% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000037s : 0.34% optimize.opt_a.loop_unroll : 0.000024s : 0.22% optimize.opt_a.a_1 : 0.000546s : 5.04% optimize.opt_a.with_stream_mark : 0.000026s : 0.24% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000273s : 2.53% optimize.opt_a.accelerated_algorithm : 0.000033s : 0.30% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.14% optimize.opt_a.merge_send_recv : 0.000026s : 0.24% optimize.opt_a.auto_parallel : 0.000014s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.22% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.21% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.18% optimize.opt_a.virtual_dataset : 0.000015s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.05% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000022s : 0.20% optimize.opt_a.a_after_grad : 0.000022s : 0.20% optimize.opt_a.renormalize : 0.000552s : 5.10% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.23% optimize.opt_a.cse : 0.000056s : 0.52% optimize.opt_a.a_3 : 0.000127s : 1.18% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000044s : 0.41% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000505s : 4.67% optimize.opt_b.b_1 : 0.000192s : 1.77% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000028s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000029s : 0.27% optimize.loop_unroll : 0.000453s : 4.19% optimize.opt_after_cconv.c_1 : 0.000035s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000025s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000021s : 0.19% optimize.tuple_transform.d_1 : 0.000048s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000001s : 0.01% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000059s : 0.55% optimize.cse_after_recomputation.cse : 0.000016s : 0.15% optimize.environ_conv : 0.000009s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000006s : 0.06% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000024s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000024s : 0.22% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000507s : 4.68% validate : 0.000044s : 0.41% Time group info: ------[substitution.] 0.000214 34 29.06% : 0.000062s : 4: substitution.arithmetic_simplify 8.53% : 0.000018s : 2: substitution.cast_eliminate 1.07% : 0.000002s : 3: substitution.elim_not_effective 0.82% : 0.000002s : 3: substitution.fold_const_symbol 3.07% : 0.000007s : 4: substitution.graph_param_transform 45.29% : 0.000097s : 2: substitution.inline 2.08% : 0.000004s : 6: substitution.j_node_and_user_rematch 5.87% : 0.000013s : 2: substitution.less_batch_normalization 2.73% : 0.000006s : 6: substitution.remove_not_recompute_node 1.48% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.005316 2 91.45% : 0.004861s : 1: type_inference.infer 8.55% : 0.000454s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000095 2 100.00% : 0.000095s : 2: match.inline ------[predicate.] 0.000173 980 0.86% : 0.000001s : 9: predicate.accumulaten_eliminater 1.15% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.68% : 0.000001s : 8: predicate.addn_check_dump 0.87% : 0.000002s : 9: predicate.addn_zero_filter 0.69% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.56% : 0.000004s : 17: predicate.arithmetic_simplify 0.80% : 0.000001s : 9: predicate.cast_eliminate 0.79% : 0.000001s : 8: predicate.check_bprop_eliminate 0.79% : 0.000001s : 8: predicate.compare_switch_simplify 0.23% : 0.000000s : 4: predicate.const_output_eliminate 0.75% : 0.000001s : 8: predicate.depend_value_elim 0.82% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.94% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.75% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.22% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 4: predicate.elim_not_effective 0.47% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.11% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.01% : 0.000002s : 13: predicate.environ_get_depend_swap 2.01% : 0.000003s : 21: predicate.environ_get_eliminate 1.08% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.94% : 0.000002s : 11: predicate.exchange_switch_depend_value 2.02% : 0.000003s : 11: predicate.float_depend_g_call 0.75% : 0.000001s : 8: predicate.float_environ_get_switch 1.11% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.79% : 0.000001s : 8: predicate.get_grad_eliminate 0.25% : 0.000000s : 4: predicate.graph_param_transform 0.86% : 0.000001s : 8: predicate.incorporate_call 0.67% : 0.000001s : 8: predicate.incorporate_call_switch 6.45% : 0.000011s : 44: predicate.inline 1.03% : 0.000002s : 8: predicate.inline_without_move 0.36% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.04% : 0.000002s : 8: predicate.less_batch_normalization 1.61% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.09% : 0.000004s : 26: predicate.load_eliminater 1.20% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.52% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.68% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.73% : 0.000001s : 8: predicate.merge_addn 0.76% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.75% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.69% : 0.000001s : 9: predicate.minmaximum_grad 1.28% : 0.000002s : 4: predicate.mutable_eliminate 0.49% : 0.000001s : 4: predicate.opt_reshape 0.49% : 0.000001s : 4: predicate.parallel_virtual_node 1.16% : 0.000002s : 11: predicate.partial_defer_inline 1.28% : 0.000002s : 13: predicate.partial_eliminate 0.78% : 0.000001s : 9: predicate.print_const_string_wrapper 0.75% : 0.000001s : 8: predicate.reduce_all_const_elim 1.08% : 0.000002s : 9: predicate.reduce_eliminate 2.15% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 8: predicate.remove_not_recompute_node 1.10% : 0.000002s : 17: predicate.replace_applicator 0.74% : 0.000001s : 8: predicate.replace_old_param 0.32% : 0.000001s : 4: predicate.reset_defer_inline 0.91% : 0.000002s : 9: predicate.reshape_eliminate 0.79% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 4: predicate.row_tensor_eliminate 0.96% : 0.000002s : 8: predicate.same_eliminate 0.48% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.92% : 0.000002s : 8: predicate.shard_identity_eliminate 1.13% : 0.000002s : 8: predicate.special_op_eliminate 0.95% : 0.000002s : 8: predicate.specialize_transform 1.08% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.83% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.98% : 0.000002s : 11: predicate.switch_defer_inline 1.91% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.20% : 0.000007s : 39: predicate.switch_simplify 0.86% : 0.000001s : 9: predicate.tile_eliminate 0.81% : 0.000001s : 9: predicate.transpose_eliminate 1.71% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.38% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.65% : 0.000005s : 25: predicate.tuple_list_set_item_eliminator 1.65% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.07% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 3.04% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 4: predicate.value_based_eliminate 1.01% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.75% : 0.000001s : 8: predicate.virtual_output_eliminate 0.35% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.55% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000209 5 8.61% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.39% : 0.000191s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025881 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.21% : 0.003160s : 1: add_attr 12.16% : 0.003146s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.24% : 0.000063s : 1: add_recomputation 0.03% : 0.000007s : 1: assign_add_opt 0.24% : 0.000063s : 1: auto_monad 0.12% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 3.01% : 0.000778s : 1: bootstrap 0.13% : 0.000033s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.14% : 0.000037s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000020s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.09% : 0.000022s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.78% : 0.000460s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.98% : 0.000512s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 4.09% : 0.001058s : 78: opt.transform.opt_a 0.13% : 0.000033s : 1: opt.transform.opt_after_cconv 0.12% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.49% : 0.000128s : 28: opt.transform.opt_b 0.21% : 0.000054s : 2: opt.transform.opt_trans_graph 0.17% : 0.000044s : 4: opt.transform.symbol_engine_opt 11.06% : 0.002863s : 1: opt_a 0.55% : 0.000144s : 1: opt_after_cconv 2.00% : 0.000518s : 1: opt_after_jit_grad 1.21% : 0.000313s : 1: opt_b 21.44% : 0.005549s : 1: optimize 0.10% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000034s : 1: pre_auto_parallel 0.10% : 0.000025s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.10% : 0.000025s : 1: remove_dup_value 1.19% : 0.000309s : 1: renormalize.infer 0.91% : 0.000235s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000048s : 1: rewriter_after_opt_a 0.22% : 0.000057s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.05% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000110s : 1: symbol_engine_optimizer 0.38% : 0.000099s : 1: tuple_transform 20.83% : 0.005391s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:31.957.820 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0312982, [21] [bootstrap]: 0.0004376 [type_inference]: 0.0217673 [event_method]: 1.616e-05 [auto_monad]: 5.939e-05 [graph_reusing]: 5.35001e-06 [inline]: 2.30002e-06 [add_attr]: 0.00346402, [1] [add_attr_with_inline]: 0.00345506, [1] [Cycle 1]: 5.525e-05, [2] [tag_attr]: 1.59e-05 [meta_addattr_fg_expand]: 3.85e-06 [parallel-infer-symbol]: 3.46001e-06 [pre_auto_parallel]: 2.79e-05 [insert-virtual-dataset]: 2.34001e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 2.17001e-06 [pipeline_split]: 1.52999e-06 [optimize]: 0.00479121, [53] [py_interpret_to_execute]: 1.893e-05 [rewriter_before_opt_a]: 5.162e-05 [opt_a]: 0.00261847, [2] [Cycle 1]: 0.00183409, [45] [expand_dump_flag]: 2.91e-06 [switch_simplify]: 2.874e-05 [loop_unroll]: 1.539e-05 [a_1]: 0.00038889 [with_stream_mark]: 1.66e-05 [recompute_prepare]: 1.028e-05 [updatestate_depend_eliminate]: 4.79998e-06 [updatestate_assign_eliminate]: 4.03001e-06 [updatestate_loads_eliminate]: 3.94997e-06 [parameter_eliminate]: 1.90001e-06 [a_2]: 0.0001169 [accelerated_algorithm]: 2.36e-05 [shard]: 2.17001e-06 [meta_shard_fg_expand]: 2.16e-06 [shard_inline]: 8.19002e-06 [merge_send_recv]: 9.14e-06 [auto_parallel]: 7.65e-06 [parallel]: 1.864e-05 [flash_sp]: 7.53999e-06 [merge_comm]: 4.77e-06 [allreduce_fusion]: 4.38999e-06 [matmul_add_comm_reduction]: 1.171e-05 [allreduce_slice_to_reducescatter]: 7.99977e-07 [virtual_shard_identity]: 1.053e-05 [virtual_dataset]: 9.46e-06 [get_grad_eliminate_]: 7.62998e-06 [virtual_output]: 7.63999e-06 [merge_forward]: 4.94e-06 [cell_reuse_recompute_pass]: 1.38002e-06 [offload_activation]: 1.16e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.499e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.298e-05 [set_forward_comm_id_for_comm_node_pass]: 5.00999e-06 [meta_fg_expand]: 3.53e-06 [flash_sp_send_recv_attached]: 4.96002e-06 [receive_attached]: 2.01e-06 [after_resolve]: 1.182e-05 [a_after_grad]: 1.168e-05 [renormalize]: 0.00064297 [add_forward_monad_depend]: 5.21002e-06 [auto_monad_grad]: 2.01e-06 [auto_monad_eliminator]: 1.774e-05 [cse]: 3.668e-05 [a_3]: 5.688e-05 [Cycle 2]: 0.00077363, [45] [expand_dump_flag]: 1.07998e-06 [switch_simplify]: 8.95001e-06 [loop_unroll]: 7.06001e-06 [a_1]: 0.00017329 [with_stream_mark]: 1.1e-05 [recompute_prepare]: 7.93001e-06 [updatestate_depend_eliminate]: 4.35999e-06 [updatestate_assign_eliminate]: 2.96999e-06 [updatestate_loads_eliminate]: 2.94999e-06 [parameter_eliminate]: 1.12e-06 [a_2]: 9.962e-05 [accelerated_algorithm]: 1.072e-05 [shard]: 1.01997e-06 [meta_shard_fg_expand]: 2.53e-06 [shard_inline]: 7.38e-06 [merge_send_recv]: 6.73e-06 [auto_parallel]: 7.2e-06 [parallel]: 5.08002e-06 [flash_sp]: 3.94002e-06 [merge_comm]: 4.2e-06 [allreduce_fusion]: 4.15e-06 [matmul_add_comm_reduction]: 7.88999e-06 [allreduce_slice_to_reducescatter]: 5.69999e-07 [virtual_shard_identity]: 8.98002e-06 [virtual_dataset]: 7.29001e-06 [get_grad_eliminate_]: 6.69999e-06 [virtual_output]: 6.26e-06 [merge_forward]: 3.64002e-06 [cell_reuse_recompute_pass]: 1.49998e-06 [offload_activation]: 7.87e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.407e-05 [merge_recompute_call_nodes]: 9.10019e-07 [before_grad]: 1.181e-05 [set_forward_comm_id_for_comm_node_pass]: 4.65999e-06 [meta_fg_expand]: 2.96001e-06 [flash_sp_send_recv_attached]: 9.80013e-07 [receive_attached]: 9.39996e-07 [after_resolve]: 9.51e-06 [a_after_grad]: 9.72999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.50999e-06 [auto_monad_grad]: 8.89995e-07 [auto_monad_eliminator]: 9.63997e-06 [cse]: 2.153e-05 [a_3]: 4.351e-05 [py_interpret_to_execute_after_opt_a]: 1.061e-05 [slice_cell_reuse_recomputed_activation]: 1.82999e-06 [rewriter_after_opt_a]: 4.116e-05 [convert_after_rewriter]: 7.31999e-06 [order_py_execute_after_rewriter]: 6.94001e-06 [mutable_eliminate]: 0.00054124 [opt_b]: 0.00025025, [1] [Cycle 1]: 0.00024342, [7] [b_1]: 0.00015466 [b_2]: 8.99e-06 [updatestate_depend_eliminate]: 8.61002e-06 [updatestate_assign_eliminate]: 3.12002e-06 [updatestate_loads_eliminate]: 3.85998e-06 [renormalize]: 4.00003e-07 [cse]: 2.614e-05 [optimize_parallel_all_gather_comm]: 1.751e-05 [overlap_param_gather]: 1.81e-06 [cconv]: 2.713e-05 [loop_unroll]: 0.00047182 [opt_after_cconv]: 0.00012012, [1] [Cycle 1]: 0.00011448, [7] [c_1]: 3.51e-05 [parameter_eliminate]: 3.14999e-06 [updatestate_depend_eliminate]: 6.84999e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 3.78001e-06 [cse]: 2.454e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 1.739e-05 [tuple_transform]: 8.927e-05, [1] [Cycle 1]: 8.463e-05, [4] [d_1]: 5.354e-05 [none_parameter_eliminate]: 1.59e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 9.42001e-06 [partial_unused_args_eliminate]: 1.91e-06 [add_recomputation]: 5.893e-05 [cse_after_recomputation]: 2.553e-05, [1] [Cycle 1]: 2.095e-05, [1] [cse]: 1.533e-05 [environ_conv]: 6.64999e-06 [swap_dp_allreduce_reducescatter]: 5.79e-06 [bias_add_comm_swap]: 2.81999e-06 [label_micro_interleaved_index]: 4.3e-06 [label_fine_grained_interleaved_index]: 2.57001e-06 [merge_cast_opt]: 1.20999e-06 [slice_recompute_activation]: 2.24001e-06 [micro_interleaved_order_control]: 2.29999e-06 [assign_add_opt]: 1.18001e-06 [ForceFp32Comm]: 9.79984e-07 [remove_cast_before_assign_add]: 9.89996e-07 [full_micro_interleaved_order_control]: 2.43e-06 [reorder_send_recv_between_fp_bp]: 2.80002e-06 [comm_op_add_attrs]: 9.60019e-07 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.18001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.78002e-06 [control_data_broadcast_order]: 1.424e-05 [grouped_pairwise_exchange_alltoall]: 1.76998e-06 [offloading_packed_experts]: 4.12998e-06 [overlap_recompute_and_grad_model_parallel]: 4.86002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.13001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.49e-06 [overlap_recompute_comm]: 2.32001e-06 [overlap_grad_ring_attention]: 4.49002e-06 [overlap_grad_flash_sp]: 2.185e-05 [begin_end_overlap_inline]: 4.90021e-07 [split_matmul_comm_elemetwise]: 2.06998e-06 [split_layernorm_comm]: 1.92001e-06 [handle_group_info]: 1.24e-06 [symbol_engine_optimizer]: 8.245e-05, [1] [Cycle 1]: 7.788e-05, [6] [build]: 3.4e-06 [elim_shapecalc]: 1.049e-05 [elim_not_effective]: 1.549e-05 [opt_reshape]: 7.92e-06 [fold_const_symbol]: 1.195e-05 [renormalize]: 3.30008e-07 [detach_backward]: 1.97999e-06 [pipeline_parallel_scheduler]: 1.57001e-06 [auto_monad_reorder]: 1.99e-05 [get_jit_bprop_graph]: 1.39e-06 [rewriter_after_jit_bprop_graph]: 4.12e-06 [opt_after_jit_grad]: 0.00049115 [validate]: 4.355e-05 Sums bootstrap : 0.000438s : 1.63% type_inference : 0.021767s : 81.10% event_method : 0.000016s : 0.06% auto_monad : 0.000059s : 0.22% graph_reusing : 0.000005s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000028s : 0.10% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000019s : 0.07% optimize.rewriter_before_opt_a : 0.000052s : 0.19% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000038s : 0.14% optimize.opt_a.loop_unroll : 0.000022s : 0.08% optimize.opt_a.a_1 : 0.000562s : 2.09% optimize.opt_a.with_stream_mark : 0.000028s : 0.10% optimize.opt_a.recompute_prepare : 0.000018s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000217s : 0.81% optimize.opt_a.accelerated_algorithm : 0.000034s : 0.13% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000016s : 0.06% optimize.opt_a.merge_send_recv : 0.000016s : 0.06% optimize.opt_a.auto_parallel : 0.000015s : 0.06% optimize.opt_a.parallel : 0.000024s : 0.09% optimize.opt_a.flash_sp : 0.000011s : 0.04% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.07% optimize.opt_a.virtual_dataset : 0.000017s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.05% optimize.opt_a.virtual_output : 0.000014s : 0.05% optimize.opt_a.merge_forward : 0.000009s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.04% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000021s : 0.08% optimize.opt_a.a_after_grad : 0.000021s : 0.08% optimize.opt_a.renormalize : 0.000643s : 2.40% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.03% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.10% optimize.opt_a.cse : 0.000058s : 0.22% optimize.opt_a.a_3 : 0.000100s : 0.37% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000041s : 0.15% optimize.convert_after_rewriter : 0.000007s : 0.03% optimize.order_py_execute_after_rewriter : 0.000007s : 0.03% optimize.mutable_eliminate : 0.000541s : 2.02% optimize.opt_b.b_1 : 0.000155s : 0.58% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000026s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000027s : 0.10% optimize.loop_unroll : 0.000472s : 1.76% optimize.opt_after_cconv.c_1 : 0.000035s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000025s : 0.09% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.06% optimize.tuple_transform.d_1 : 0.000054s : 0.20% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.04% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000059s : 0.22% optimize.cse_after_recomputation.cse : 0.000015s : 0.06% optimize.environ_conv : 0.000007s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000022s : 0.08% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.07% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000491s : 1.83% validate : 0.000044s : 0.16% Time group info: ------[substitution.] 0.000232 34 29.49% : 0.000068s : 4: substitution.arithmetic_simplify 9.14% : 0.000021s : 2: substitution.cast_eliminate 1.08% : 0.000002s : 3: substitution.elim_not_effective 0.77% : 0.000002s : 3: substitution.fold_const_symbol 2.84% : 0.000007s : 4: substitution.graph_param_transform 44.85% : 0.000104s : 2: substitution.inline 2.06% : 0.000005s : 6: substitution.j_node_and_user_rematch 5.62% : 0.000013s : 2: substitution.less_batch_normalization 2.87% : 0.000007s : 6: substitution.remove_not_recompute_node 1.30% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.021715 2 97.51% : 0.021174s : 1: type_inference.infer 2.49% : 0.000542s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000102 2 100.00% : 0.000102s : 2: match.inline ------[predicate.] 0.000174 980 0.88% : 0.000002s : 9: predicate.accumulaten_eliminater 1.12% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.64% : 0.000001s : 8: predicate.addn_check_dump 0.83% : 0.000001s : 9: predicate.addn_zero_filter 0.68% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.96% : 0.000005s : 17: predicate.arithmetic_simplify 0.97% : 0.000002s : 9: predicate.cast_eliminate 0.70% : 0.000001s : 8: predicate.check_bprop_eliminate 0.68% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000000s : 4: predicate.const_output_eliminate 0.71% : 0.000001s : 8: predicate.depend_value_elim 0.75% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.95% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.76% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.14% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.29% : 0.000001s : 4: predicate.elim_not_effective 0.46% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.02% : 0.000002s : 13: predicate.environ_get_add_eliminate 0.98% : 0.000002s : 13: predicate.environ_get_depend_swap 2.15% : 0.000004s : 21: predicate.environ_get_eliminate 1.01% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.88% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.77% : 0.000003s : 11: predicate.float_depend_g_call 0.67% : 0.000001s : 8: predicate.float_environ_get_switch 1.07% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.25% : 0.000000s : 4: predicate.fold_const_symbol 0.79% : 0.000001s : 8: predicate.get_grad_eliminate 0.32% : 0.000001s : 4: predicate.graph_param_transform 0.81% : 0.000001s : 8: predicate.incorporate_call 0.66% : 0.000001s : 8: predicate.incorporate_call_switch 6.41% : 0.000011s : 44: predicate.inline 0.94% : 0.000002s : 8: predicate.inline_without_move 0.37% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.31% : 0.000002s : 8: predicate.less_batch_normalization 1.70% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.16% : 0.000004s : 26: predicate.load_eliminater 1.31% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.49% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.74% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.69% : 0.000001s : 8: predicate.merge_addn 0.63% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.73% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.79% : 0.000001s : 9: predicate.minmaximum_grad 1.39% : 0.000002s : 4: predicate.mutable_eliminate 0.44% : 0.000001s : 4: predicate.opt_reshape 0.42% : 0.000001s : 4: predicate.parallel_virtual_node 1.19% : 0.000002s : 11: predicate.partial_defer_inline 1.27% : 0.000002s : 13: predicate.partial_eliminate 0.75% : 0.000001s : 9: predicate.print_const_string_wrapper 0.91% : 0.000002s : 8: predicate.reduce_all_const_elim 1.07% : 0.000002s : 9: predicate.reduce_eliminate 2.14% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 8: predicate.remove_not_recompute_node 1.19% : 0.000002s : 17: predicate.replace_applicator 0.61% : 0.000001s : 8: predicate.replace_old_param 0.53% : 0.000001s : 4: predicate.reset_defer_inline 0.88% : 0.000002s : 9: predicate.reshape_eliminate 0.70% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.58% : 0.000001s : 4: predicate.row_tensor_eliminate 0.92% : 0.000002s : 8: predicate.same_eliminate 0.51% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.12% : 0.000002s : 8: predicate.shard_identity_eliminate 1.17% : 0.000002s : 8: predicate.special_op_eliminate 0.90% : 0.000002s : 8: predicate.specialize_transform 1.05% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.86% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.95% : 0.000002s : 11: predicate.switch_defer_inline 1.70% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.00% : 0.000007s : 39: predicate.switch_simplify 0.79% : 0.000001s : 9: predicate.tile_eliminate 0.80% : 0.000001s : 9: predicate.transpose_eliminate 1.57% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.72% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.05% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.58% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.62% : 0.000005s : 25: predicate.tuple_list_set_item_eliminator 1.54% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.00% : 0.000003s : 26: predicate.updatestate_pure_node_eliminater 2.93% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 4: predicate.value_based_eliminate 1.22% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.75% : 0.000001s : 8: predicate.virtual_output_eliminate 0.38% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.61% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000259 5 6.89% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 93.11% : 0.000241s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.041469 192 0.01% : 0.000004s : 1: ForceFp32Comm 8.37% : 0.003469s : 1: add_attr 8.34% : 0.003459s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.15% : 0.000063s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.16% : 0.000064s : 1: auto_monad 0.06% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.12% : 0.000465s : 1: bootstrap 0.07% : 0.000031s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000018s : 1: control_data_broadcast_order 0.03% : 0.000010s : 1: convert_after_rewriter 0.07% : 0.000028s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000010s : 1: environ_conv 0.06% : 0.000023s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000005s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.16% : 0.000481s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.33% : 0.000551s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.04% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000020s : 1: opt.transform.mutable_eliminate 2.58% : 0.001071s : 78: opt.transform.opt_a 0.08% : 0.000033s : 1: opt.transform.opt_after_cconv 0.08% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.31% : 0.000130s : 28: opt.transform.opt_b 0.15% : 0.000060s : 2: opt.transform.opt_trans_graph 0.10% : 0.000042s : 4: opt.transform.symbol_engine_opt 6.32% : 0.002622s : 1: opt_a 0.30% : 0.000124s : 1: opt_after_cconv 1.21% : 0.000500s : 1: opt_after_jit_grad 0.61% : 0.000254s : 1: opt_b 11.57% : 0.004796s : 1: optimize 0.05% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000010s : 1: order_py_execute_after_rewriter 0.06% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000032s : 1: pre_auto_parallel 0.06% : 0.000023s : 1: py_interpret_to_execute 0.03% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000021s : 1: remove_dup_value 0.89% : 0.000371s : 1: renormalize.infer 0.64% : 0.000264s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000045s : 1: rewriter_after_opt_a 0.13% : 0.000055s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.21% : 0.000085s : 1: symbol_engine_optimizer 0.22% : 0.000093s : 1: tuple_transform 52.54% : 0.021788s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:32.501.522 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:32.501.799 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0309432, [21] [bootstrap]: 0.0165133 [type_inference]: 0.00482646 [event_method]: 1.322e-05 [auto_monad]: 5.5e-05 [graph_reusing]: 6.09001e-06 [inline]: 2.63e-06 [add_attr]: 0.00316242, [1] [add_attr_with_inline]: 0.00315303, [1] [Cycle 1]: 6.583e-05, [2] [tag_attr]: 1.527e-05 [meta_addattr_fg_expand]: 3.85998e-06 [parallel-infer-symbol]: 3.03e-06 [pre_auto_parallel]: 2.622e-05 [insert-virtual-dataset]: 2.63998e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 2.06e-06 [pipeline_split]: 1.55001e-06 [optimize]: 0.00520683, [53] [py_interpret_to_execute]: 2.246e-05 [rewriter_before_opt_a]: 5.405e-05 [opt_a]: 0.00286835, [2] [Cycle 1]: 0.00192513, [45] [expand_dump_flag]: 2.80002e-06 [switch_simplify]: 2.628e-05 [loop_unroll]: 1.529e-05 [a_1]: 0.00038226 [with_stream_mark]: 1.619e-05 [recompute_prepare]: 9.57999e-06 [updatestate_depend_eliminate]: 4.43001e-06 [updatestate_assign_eliminate]: 4.05e-06 [updatestate_loads_eliminate]: 4.01001e-06 [parameter_eliminate]: 2.00002e-06 [a_2]: 0.00014005 [accelerated_algorithm]: 2.157e-05 [shard]: 2.34999e-06 [meta_shard_fg_expand]: 2.30002e-06 [shard_inline]: 8.07e-06 [merge_send_recv]: 9.42001e-06 [auto_parallel]: 6.63e-06 [parallel]: 1.949e-05 [flash_sp]: 8.32e-06 [merge_comm]: 4.64002e-06 [allreduce_fusion]: 4.32e-06 [matmul_add_comm_reduction]: 1.18e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 9.67999e-06 [virtual_dataset]: 8.46002e-06 [get_grad_eliminate_]: 7.34002e-06 [virtual_output]: 7.71999e-06 [merge_forward]: 4.90999e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 1.108e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.688e-05 [merge_recompute_call_nodes]: 1.60001e-06 [before_grad]: 1.315e-05 [set_forward_comm_id_for_comm_node_pass]: 5.18002e-06 [meta_fg_expand]: 3.61001e-06 [flash_sp_send_recv_attached]: 5.32999e-06 [receive_attached]: 2.64999e-06 [after_resolve]: 1.228e-05 [a_after_grad]: 1.169e-05 [renormalize]: 0.000578 [add_forward_monad_depend]: 5.35999e-06 [auto_monad_grad]: 2.03002e-06 [auto_monad_eliminator]: 1.702e-05 [cse]: 3.611e-05 [a_3]: 7.014e-05 [Cycle 2]: 0.00093062, [45] [expand_dump_flag]: 1.00001e-06 [switch_simplify]: 9.00999e-06 [loop_unroll]: 7.71999e-06 [a_1]: 0.00017333 [with_stream_mark]: 1.014e-05 [recompute_prepare]: 7.61999e-06 [updatestate_depend_eliminate]: 4.4e-06 [updatestate_assign_eliminate]: 3.16001e-06 [updatestate_loads_eliminate]: 3.08e-06 [parameter_eliminate]: 9.29984e-07 [a_2]: 0.00012697 [accelerated_algorithm]: 1.052e-05 [shard]: 1.16002e-06 [meta_shard_fg_expand]: 1.66e-06 [shard_inline]: 7.38999e-06 [merge_send_recv]: 5.84e-06 [auto_parallel]: 6.68e-06 [parallel]: 5.63997e-06 [flash_sp]: 3.41999e-06 [merge_comm]: 4.11001e-06 [allreduce_fusion]: 4.08001e-06 [matmul_add_comm_reduction]: 7.55e-06 [allreduce_slice_to_reducescatter]: 3.29979e-07 [virtual_shard_identity]: 8.27e-06 [virtual_dataset]: 7.31001e-06 [get_grad_eliminate_]: 6.94999e-06 [virtual_output]: 6.48e-06 [merge_forward]: 3.33e-06 [cell_reuse_recompute_pass]: 1.45999e-06 [offload_activation]: 7.59002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.576e-05 [merge_recompute_call_nodes]: 7.39994e-07 [before_grad]: 1.135e-05 [set_forward_comm_id_for_comm_node_pass]: 4.58999e-06 [meta_fg_expand]: 2.43e-06 [flash_sp_send_recv_attached]: 8.2e-07 [receive_attached]: 9.70002e-07 [after_resolve]: 9.58002e-06 [a_after_grad]: 1.022e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.49e-06 [auto_monad_grad]: 1.20999e-06 [auto_monad_eliminator]: 8.72998e-06 [cse]: 1.97e-05 [a_3]: 5.584e-05 [py_interpret_to_execute_after_opt_a]: 1.401e-05 [slice_cell_reuse_recomputed_activation]: 4.68999e-06 [rewriter_after_opt_a]: 4.362e-05 [convert_after_rewriter]: 1.031e-05 [order_py_execute_after_rewriter]: 9.10001e-06 [mutable_eliminate]: 0.00055021 [opt_b]: 0.00029682, [1] [Cycle 1]: 0.00028827, [7] [b_1]: 0.00018726 [b_2]: 8.59e-06 [updatestate_depend_eliminate]: 6.56e-06 [updatestate_assign_eliminate]: 3.06999e-06 [updatestate_loads_eliminate]: 3.21001e-06 [renormalize]: 4.19997e-07 [cse]: 2.391e-05 [optimize_parallel_all_gather_comm]: 2.064e-05 [overlap_param_gather]: 4.75999e-06 [cconv]: 2.714e-05 [loop_unroll]: 0.00044075 [opt_after_cconv]: 0.00013581, [1] [Cycle 1]: 0.00012751, [7] [c_1]: 3.471e-05 [parameter_eliminate]: 2.63998e-06 [updatestate_depend_eliminate]: 6.17999e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 3.63999e-06 [cse]: 2.237e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.915e-05 [tuple_transform]: 9.369e-05, [1] [Cycle 1]: 8.674e-05, [4] [d_1]: 4.709e-05 [none_parameter_eliminate]: 1.57001e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 8.03001e-06 [partial_unused_args_eliminate]: 4.35e-06 [add_recomputation]: 5.746e-05 [cse_after_recomputation]: 3.289e-05, [1] [Cycle 1]: 2.568e-05, [1] [cse]: 1.599e-05 [environ_conv]: 8.84998e-06 [swap_dp_allreduce_reducescatter]: 8.62e-06 [bias_add_comm_swap]: 5.19e-06 [label_micro_interleaved_index]: 6.76e-06 [label_fine_grained_interleaved_index]: 5.30001e-06 [merge_cast_opt]: 4.05e-06 [slice_recompute_activation]: 4.54998e-06 [micro_interleaved_order_control]: 4.97e-06 [assign_add_opt]: 3.78999e-06 [ForceFp32Comm]: 3.16001e-06 [remove_cast_before_assign_add]: 3.35003e-06 [full_micro_interleaved_order_control]: 4.90999e-06 [reorder_send_recv_between_fp_bp]: 5.52001e-06 [comm_op_add_attrs]: 4.03999e-06 [add_comm_op_reuse_tag]: 3.26999e-06 [interleave_split_concat_branches]: 3.58999e-06 [interleave_parallel_branches]: 3.48999e-06 [overlap_opt_shard_in_pipeline]: 3.76999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.26001e-06 [control_data_broadcast_order]: 1.736e-05 [grouped_pairwise_exchange_alltoall]: 4.07998e-06 [offloading_packed_experts]: 6.95998e-06 [overlap_recompute_and_grad_model_parallel]: 7.78999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.55e-06 [overlap_recompute_allgather_and_fa_grad]: 3.81999e-06 [overlap_recompute_comm]: 4.99e-06 [overlap_grad_ring_attention]: 6.66e-06 [overlap_grad_flash_sp]: 2.368e-05 [begin_end_overlap_inline]: 3.09999e-06 [split_matmul_comm_elemetwise]: 5.02e-06 [split_layernorm_comm]: 3.78001e-06 [handle_group_info]: 3.35e-06 [symbol_engine_optimizer]: 0.00010235, [1] [Cycle 1]: 9.536e-05, [6] [build]: 3.26999e-06 [elim_shapecalc]: 1.136e-05 [elim_not_effective]: 1.496e-05 [opt_reshape]: 7.87e-06 [fold_const_symbol]: 1.232e-05 [renormalize]: 2.3999e-07 [detach_backward]: 3.77002e-06 [pipeline_parallel_scheduler]: 1.57999e-06 [auto_monad_reorder]: 2.186e-05 [get_jit_bprop_graph]: 1.47001e-06 [rewriter_after_jit_bprop_graph]: 4.53999e-06 [opt_after_jit_grad]: 0.00048083 [validate]: 3.99e-05 Sums bootstrap : 0.016513s : 63.39% type_inference : 0.004826s : 18.53% event_method : 0.000013s : 0.05% auto_monad : 0.000055s : 0.21% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000026s : 0.10% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000022s : 0.09% optimize.rewriter_before_opt_a : 0.000054s : 0.21% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000035s : 0.14% optimize.opt_a.loop_unroll : 0.000023s : 0.09% optimize.opt_a.a_1 : 0.000556s : 2.13% optimize.opt_a.with_stream_mark : 0.000026s : 0.10% optimize.opt_a.recompute_prepare : 0.000017s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000267s : 1.02% optimize.opt_a.accelerated_algorithm : 0.000032s : 0.12% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000015s : 0.06% optimize.opt_a.merge_send_recv : 0.000015s : 0.06% optimize.opt_a.auto_parallel : 0.000013s : 0.05% optimize.opt_a.parallel : 0.000025s : 0.10% optimize.opt_a.flash_sp : 0.000012s : 0.05% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.07% optimize.opt_a.virtual_dataset : 0.000016s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.05% optimize.opt_a.virtual_output : 0.000014s : 0.05% optimize.opt_a.merge_forward : 0.000008s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.13% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.04% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000022s : 0.08% optimize.opt_a.a_after_grad : 0.000022s : 0.08% optimize.opt_a.renormalize : 0.000578s : 2.22% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.03% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.10% optimize.opt_a.cse : 0.000056s : 0.21% optimize.opt_a.a_3 : 0.000126s : 0.48% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000044s : 0.17% optimize.convert_after_rewriter : 0.000010s : 0.04% optimize.order_py_execute_after_rewriter : 0.000009s : 0.03% optimize.mutable_eliminate : 0.000550s : 2.11% optimize.opt_b.b_1 : 0.000187s : 0.72% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.08% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000027s : 0.10% optimize.loop_unroll : 0.000441s : 1.69% optimize.opt_after_cconv.c_1 : 0.000035s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000022s : 0.09% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.07% optimize.tuple_transform.d_1 : 0.000047s : 0.18% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000004s : 0.02% optimize.add_recomputation : 0.000057s : 0.22% optimize.cse_after_recomputation.cse : 0.000016s : 0.06% optimize.environ_conv : 0.000009s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000005s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000003s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000017s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000007s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.03% optimize.overlap_grad_flash_sp : 0.000024s : 0.09% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.01% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.08% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000481s : 1.85% validate : 0.000040s : 0.15% Time group info: ------[substitution.] 0.000226 34 31.92% : 0.000072s : 4: substitution.arithmetic_simplify 8.66% : 0.000020s : 2: substitution.cast_eliminate 1.03% : 0.000002s : 3: substitution.elim_not_effective 0.75% : 0.000002s : 3: substitution.fold_const_symbol 2.67% : 0.000006s : 4: substitution.graph_param_transform 43.36% : 0.000098s : 2: substitution.inline 2.02% : 0.000005s : 6: substitution.j_node_and_user_rematch 5.42% : 0.000012s : 2: substitution.less_batch_normalization 2.56% : 0.000006s : 6: substitution.remove_not_recompute_node 1.61% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004780 2 90.72% : 0.004337s : 1: type_inference.infer 9.28% : 0.000443s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000097 2 100.00% : 0.000097s : 2: match.inline ------[predicate.] 0.000169 980 0.84% : 0.000001s : 9: predicate.accumulaten_eliminater 1.01% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.68% : 0.000001s : 8: predicate.addn_check_dump 1.04% : 0.000002s : 9: predicate.addn_zero_filter 0.71% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.97% : 0.000005s : 17: predicate.arithmetic_simplify 0.84% : 0.000001s : 9: predicate.cast_eliminate 0.75% : 0.000001s : 8: predicate.check_bprop_eliminate 0.71% : 0.000001s : 8: predicate.compare_switch_simplify 0.22% : 0.000000s : 4: predicate.const_output_eliminate 0.78% : 0.000001s : 8: predicate.depend_value_elim 0.84% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.99% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.73% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.14% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 4: predicate.elim_not_effective 0.52% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.00% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 13: predicate.environ_get_depend_swap 1.85% : 0.000003s : 21: predicate.environ_get_eliminate 1.03% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.94% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.83% : 0.000003s : 11: predicate.float_depend_g_call 0.68% : 0.000001s : 8: predicate.float_environ_get_switch 1.01% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 4: predicate.fold_const_symbol 0.79% : 0.000001s : 8: predicate.get_grad_eliminate 0.27% : 0.000000s : 4: predicate.graph_param_transform 0.82% : 0.000001s : 8: predicate.incorporate_call 0.68% : 0.000001s : 8: predicate.incorporate_call_switch 6.74% : 0.000011s : 44: predicate.inline 1.03% : 0.000002s : 8: predicate.inline_without_move 0.37% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.20% : 0.000002s : 8: predicate.less_batch_normalization 1.59% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.24% : 0.000004s : 26: predicate.load_eliminater 1.08% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.72% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.79% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.78% : 0.000001s : 8: predicate.merge_addn 0.72% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.74% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.70% : 0.000001s : 9: predicate.minmaximum_grad 1.40% : 0.000002s : 4: predicate.mutable_eliminate 0.43% : 0.000001s : 4: predicate.opt_reshape 0.55% : 0.000001s : 4: predicate.parallel_virtual_node 1.18% : 0.000002s : 11: predicate.partial_defer_inline 1.29% : 0.000002s : 13: predicate.partial_eliminate 0.86% : 0.000001s : 9: predicate.print_const_string_wrapper 0.73% : 0.000001s : 8: predicate.reduce_all_const_elim 1.03% : 0.000002s : 9: predicate.reduce_eliminate 2.29% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 8: predicate.remove_not_recompute_node 1.17% : 0.000002s : 17: predicate.replace_applicator 0.59% : 0.000001s : 8: predicate.replace_old_param 0.36% : 0.000001s : 4: predicate.reset_defer_inline 0.82% : 0.000001s : 9: predicate.reshape_eliminate 0.75% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 4: predicate.row_tensor_eliminate 0.89% : 0.000002s : 8: predicate.same_eliminate 0.52% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.05% : 0.000002s : 8: predicate.shard_identity_eliminate 0.92% : 0.000002s : 8: predicate.special_op_eliminate 1.00% : 0.000002s : 8: predicate.specialize_transform 1.03% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.94% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.46% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.93% : 0.000002s : 11: predicate.switch_defer_inline 1.72% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.20% : 0.000007s : 39: predicate.switch_simplify 0.75% : 0.000001s : 9: predicate.tile_eliminate 0.79% : 0.000001s : 9: predicate.transpose_eliminate 1.58% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 2.93% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.42% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.65% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.51% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.07% : 0.000003s : 26: predicate.updatestate_pure_node_eliminater 3.09% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 4: predicate.value_based_eliminate 1.00% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.77% : 0.000001s : 8: predicate.virtual_output_eliminate 0.36% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000218 5 7.40% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.60% : 0.000202s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.041132 192 0.01% : 0.000006s : 1: ForceFp32Comm 7.71% : 0.003172s : 1: add_attr 7.68% : 0.003157s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.15% : 0.000061s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.15% : 0.000063s : 1: auto_monad 0.07% : 0.000029s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 40.25% : 0.016556s : 1: bootstrap 0.07% : 0.000030s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.05% : 0.000021s : 1: control_data_broadcast_order 0.03% : 0.000013s : 1: convert_after_rewriter 0.09% : 0.000036s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.05% : 0.000019s : 1: detach_backward 0.03% : 0.000012s : 1: environ_conv 0.06% : 0.000023s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.03% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 1.09% : 0.000446s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 1.35% : 0.000557s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.04% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000017s : 1: opt.transform.mutable_eliminate 2.57% : 0.001059s : 78: opt.transform.opt_a 0.08% : 0.000033s : 1: opt.transform.opt_after_cconv 0.07% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.30% : 0.000124s : 28: opt.transform.opt_b 0.13% : 0.000053s : 2: opt.transform.opt_trans_graph 0.10% : 0.000043s : 4: opt.transform.symbol_engine_opt 6.98% : 0.002872s : 1: opt_a 0.34% : 0.000140s : 1: opt_after_cconv 1.19% : 0.000491s : 1: opt_after_jit_grad 0.73% : 0.000300s : 1: opt_b 13.47% : 0.005541s : 1: optimize 0.06% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000012s : 1: order_py_execute_after_rewriter 0.07% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000009s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.02% : 0.000008s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.08% : 0.000033s : 1: pre_auto_parallel 0.06% : 0.000026s : 1: py_interpret_to_execute 0.04% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.05% : 0.000022s : 1: remove_dup_value 0.80% : 0.000327s : 1: renormalize.infer 0.59% : 0.000243s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000047s : 1: rewriter_after_opt_a 0.14% : 0.000058s : 1: rewriter_before_opt_a 0.02% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.03% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.26% : 0.000105s : 1: symbol_engine_optimizer 0.23% : 0.000097s : 1: tuple_transform 11.81% : 0.004856s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:32.894.237 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0302514, [21] [bootstrap]: 0.00043497 [type_inference]: 0.0209907 [event_method]: 1.429e-05 [auto_monad]: 5.909e-05 [graph_reusing]: 5.74e-06 [inline]: 3.29001e-06 [add_attr]: 0.00343032, [1] [add_attr_with_inline]: 0.00342114, [1] [Cycle 1]: 5.24e-05, [2] [tag_attr]: 1.573e-05 [meta_addattr_fg_expand]: 4.29997e-06 [parallel-infer-symbol]: 3.13e-06 [pre_auto_parallel]: 2.667e-05 [insert-virtual-dataset]: 2.49999e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.16e-06 [pipeline_split]: 1.56002e-06 [optimize]: 0.00458517, [53] [py_interpret_to_execute]: 1.916e-05 [rewriter_before_opt_a]: 5.405e-05 [opt_a]: 0.00251371, [2] [Cycle 1]: 0.00175812, [45] [expand_dump_flag]: 3.13e-06 [switch_simplify]: 2.679e-05 [loop_unroll]: 1.439e-05 [a_1]: 0.00038148 [with_stream_mark]: 1.495e-05 [recompute_prepare]: 9.76e-06 [updatestate_depend_eliminate]: 4.35e-06 [updatestate_assign_eliminate]: 3.60998e-06 [updatestate_loads_eliminate]: 3.56999e-06 [parameter_eliminate]: 1.86e-06 [a_2]: 0.00010819 [accelerated_algorithm]: 2.038e-05 [shard]: 1.96e-06 [meta_shard_fg_expand]: 2.12999e-06 [shard_inline]: 8.28999e-06 [merge_send_recv]: 9.16002e-06 [auto_parallel]: 6.87002e-06 [parallel]: 1.859e-05 [flash_sp]: 7.51999e-06 [merge_comm]: 4.57e-06 [allreduce_fusion]: 4.25999e-06 [matmul_add_comm_reduction]: 1.168e-05 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 8.89e-06 [virtual_dataset]: 7.61999e-06 [get_grad_eliminate_]: 7.32997e-06 [virtual_output]: 7.36001e-06 [merge_forward]: 4.80999e-06 [cell_reuse_recompute_pass]: 1.15001e-06 [offload_activation]: 1.188e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.482e-05 [merge_recompute_call_nodes]: 1.37e-06 [before_grad]: 1.308e-05 [set_forward_comm_id_for_comm_node_pass]: 4.43999e-06 [meta_fg_expand]: 3.44001e-06 [flash_sp_send_recv_attached]: 5.35001e-06 [receive_attached]: 2.00002e-06 [after_resolve]: 1.089e-05 [a_after_grad]: 1.176e-05 [renormalize]: 0.00061461 [add_forward_monad_depend]: 5.00999e-06 [auto_monad_grad]: 1.96e-06 [auto_monad_eliminator]: 1.665e-05 [cse]: 3.691e-05 [a_3]: 5.6e-05 [Cycle 2]: 0.00074612, [45] [expand_dump_flag]: 1.02e-06 [switch_simplify]: 8.23001e-06 [loop_unroll]: 7.04001e-06 [a_1]: 0.00016889 [with_stream_mark]: 9.01998e-06 [recompute_prepare]: 7.97e-06 [updatestate_depend_eliminate]: 4.4e-06 [updatestate_assign_eliminate]: 2.85002e-06 [updatestate_loads_eliminate]: 2.59999e-06 [parameter_eliminate]: 1.15001e-06 [a_2]: 9.792e-05 [accelerated_algorithm]: 1.033e-05 [shard]: 1.00999e-06 [meta_shard_fg_expand]: 1.76e-06 [shard_inline]: 7.55e-06 [merge_send_recv]: 5.76e-06 [auto_parallel]: 6.70002e-06 [parallel]: 4.82998e-06 [flash_sp]: 3.2e-06 [merge_comm]: 3.94002e-06 [allreduce_fusion]: 3.73001e-06 [matmul_add_comm_reduction]: 7.41999e-06 [allreduce_slice_to_reducescatter]: 3.19997e-07 [virtual_shard_identity]: 8.02998e-06 [virtual_dataset]: 6.89999e-06 [get_grad_eliminate_]: 6.48e-06 [virtual_output]: 6.34001e-06 [merge_forward]: 3.21001e-06 [cell_reuse_recompute_pass]: 1.43002e-06 [offload_activation]: 7.17002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.282e-05 [merge_recompute_call_nodes]: 8.80013e-07 [before_grad]: 1.089e-05 [set_forward_comm_id_for_comm_node_pass]: 4.33999e-06 [meta_fg_expand]: 2.75002e-06 [flash_sp_send_recv_attached]: 8.49977e-07 [receive_attached]: 1.05001e-06 [after_resolve]: 9.02999e-06 [a_after_grad]: 9.90002e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.14e-06 [auto_monad_grad]: 8.50006e-07 [auto_monad_eliminator]: 8.27e-06 [cse]: 2.037e-05 [a_3]: 4.374e-05 [py_interpret_to_execute_after_opt_a]: 1.033e-05 [slice_cell_reuse_recomputed_activation]: 1.84998e-06 [rewriter_after_opt_a]: 4.017e-05 [convert_after_rewriter]: 7.66999e-06 [order_py_execute_after_rewriter]: 6.17999e-06 [mutable_eliminate]: 0.00049903 [opt_b]: 0.00024413, [1] [Cycle 1]: 0.00023796, [7] [b_1]: 0.00015237 [b_2]: 9.70002e-06 [updatestate_depend_eliminate]: 6.78e-06 [updatestate_assign_eliminate]: 3.14001e-06 [updatestate_loads_eliminate]: 3.14999e-06 [renormalize]: 4.80009e-07 [cse]: 2.395e-05 [optimize_parallel_all_gather_comm]: 1.83e-05 [overlap_param_gather]: 1.96e-06 [cconv]: 2.442e-05 [loop_unroll]: 0.00045538 [opt_after_cconv]: 0.00011053, [1] [Cycle 1]: 0.00010513, [7] [c_1]: 3.307e-05 [parameter_eliminate]: 2.69999e-06 [updatestate_depend_eliminate]: 5.72999e-06 [updatestate_assign_eliminate]: 3.09001e-06 [updatestate_loads_eliminate]: 3.03e-06 [cse]: 2.325e-05 [renormalize]: 4.59986e-07 [remove_dup_value]: 1.628e-05 [tuple_transform]: 7.987e-05, [1] [Cycle 1]: 7.549e-05, [4] [d_1]: 4.654e-05 [none_parameter_eliminate]: 1.79998e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 7.83999e-06 [partial_unused_args_eliminate]: 1.87001e-06 [add_recomputation]: 5.441e-05 [cse_after_recomputation]: 2.55e-05, [1] [Cycle 1]: 2.101e-05, [1] [cse]: 1.551e-05 [environ_conv]: 6.04999e-06 [swap_dp_allreduce_reducescatter]: 5.61e-06 [bias_add_comm_swap]: 2.34999e-06 [label_micro_interleaved_index]: 4.1e-06 [label_fine_grained_interleaved_index]: 2.61e-06 [merge_cast_opt]: 1.24e-06 [slice_recompute_activation]: 2.02001e-06 [micro_interleaved_order_control]: 2.09999e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 8.39995e-07 [remove_cast_before_assign_add]: 9.79984e-07 [full_micro_interleaved_order_control]: 1.92999e-06 [reorder_send_recv_between_fp_bp]: 2.78998e-06 [comm_op_add_attrs]: 9.39996e-07 [add_comm_op_reuse_tag]: 9.09989e-07 [interleave_split_concat_branches]: 1.13001e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.42e-06 [overlap_opt_shard_grad_in_pipeline]: 1.79e-06 [control_data_broadcast_order]: 1.468e-05 [grouped_pairwise_exchange_alltoall]: 1.74998e-06 [offloading_packed_experts]: 4.48999e-06 [overlap_recompute_and_grad_model_parallel]: 5.10999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.40999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.31998e-06 [overlap_recompute_comm]: 2.06998e-06 [overlap_grad_ring_attention]: 4.72e-06 [overlap_grad_flash_sp]: 2.023e-05 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 2.26e-06 [split_layernorm_comm]: 1.61998e-06 [handle_group_info]: 1.32e-06 [symbol_engine_optimizer]: 8.218e-05, [1] [Cycle 1]: 7.809e-05, [6] [build]: 3.26001e-06 [elim_shapecalc]: 1.078e-05 [elim_not_effective]: 1.49e-05 [opt_reshape]: 8.18999e-06 [fold_const_symbol]: 1.248e-05 [renormalize]: 2.50002e-07 [detach_backward]: 1.77001e-06 [pipeline_parallel_scheduler]: 1.65001e-06 [auto_monad_reorder]: 1.967e-05 [get_jit_bprop_graph]: 1.35999e-06 [rewriter_after_jit_bprop_graph]: 3.79002e-06 [opt_after_jit_grad]: 0.00046351 [validate]: 4.085e-05 Sums bootstrap : 0.000435s : 1.68% type_inference : 0.020991s : 81.19% event_method : 0.000014s : 0.06% auto_monad : 0.000059s : 0.23% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000027s : 0.10% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000019s : 0.07% optimize.rewriter_before_opt_a : 0.000054s : 0.21% optimize.opt_a.expand_dump_flag : 0.000004s : 0.02% optimize.opt_a.switch_simplify : 0.000035s : 0.14% optimize.opt_a.loop_unroll : 0.000021s : 0.08% optimize.opt_a.a_1 : 0.000550s : 2.13% optimize.opt_a.with_stream_mark : 0.000024s : 0.09% optimize.opt_a.recompute_prepare : 0.000018s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000206s : 0.80% optimize.opt_a.accelerated_algorithm : 0.000031s : 0.12% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000016s : 0.06% optimize.opt_a.merge_send_recv : 0.000015s : 0.06% optimize.opt_a.auto_parallel : 0.000014s : 0.05% optimize.opt_a.parallel : 0.000023s : 0.09% optimize.opt_a.flash_sp : 0.000011s : 0.04% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.07% optimize.opt_a.virtual_dataset : 0.000015s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.05% optimize.opt_a.virtual_output : 0.000014s : 0.05% optimize.opt_a.merge_forward : 0.000008s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000024s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.03% optimize.opt_a.meta_fg_expand : 0.000006s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000020s : 0.08% optimize.opt_a.a_after_grad : 0.000022s : 0.08% optimize.opt_a.renormalize : 0.000615s : 2.38% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.10% optimize.opt_a.cse : 0.000057s : 0.22% optimize.opt_a.a_3 : 0.000100s : 0.39% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000040s : 0.16% optimize.convert_after_rewriter : 0.000008s : 0.03% optimize.order_py_execute_after_rewriter : 0.000006s : 0.02% optimize.mutable_eliminate : 0.000499s : 1.93% optimize.opt_b.b_1 : 0.000152s : 0.59% optimize.opt_b.b_2 : 0.000010s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.07% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000024s : 0.09% optimize.loop_unroll : 0.000455s : 1.76% optimize.opt_after_cconv.c_1 : 0.000033s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000023s : 0.09% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.06% optimize.tuple_transform.d_1 : 0.000047s : 0.18% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000054s : 0.21% optimize.cse_after_recomputation.cse : 0.000016s : 0.06% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000002s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000015s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000020s : 0.08% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.08% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000464s : 1.79% validate : 0.000041s : 0.16% Time group info: ------[substitution.] 0.000221 34 30.66% : 0.000068s : 4: substitution.arithmetic_simplify 9.22% : 0.000020s : 2: substitution.cast_eliminate 1.02% : 0.000002s : 3: substitution.elim_not_effective 0.91% : 0.000002s : 3: substitution.fold_const_symbol 2.81% : 0.000006s : 4: substitution.graph_param_transform 44.66% : 0.000099s : 2: substitution.inline 1.89% : 0.000004s : 6: substitution.j_node_and_user_rematch 4.99% : 0.000011s : 2: substitution.less_batch_normalization 2.59% : 0.000006s : 6: substitution.remove_not_recompute_node 1.25% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.020939 2 97.41% : 0.020397s : 1: type_inference.infer 2.59% : 0.000542s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000097 2 100.00% : 0.000097s : 2: match.inline ------[predicate.] 0.000169 980 0.88% : 0.000001s : 9: predicate.accumulaten_eliminater 0.92% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.67% : 0.000001s : 8: predicate.addn_check_dump 0.92% : 0.000002s : 9: predicate.addn_zero_filter 0.72% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.73% : 0.000005s : 17: predicate.arithmetic_simplify 0.81% : 0.000001s : 9: predicate.cast_eliminate 0.84% : 0.000001s : 8: predicate.check_bprop_eliminate 0.70% : 0.000001s : 8: predicate.compare_switch_simplify 0.25% : 0.000000s : 4: predicate.const_output_eliminate 0.76% : 0.000001s : 8: predicate.depend_value_elim 0.88% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.96% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.77% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.27% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 4: predicate.elim_not_effective 0.52% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.12% : 0.000002s : 13: predicate.environ_get_depend_swap 1.82% : 0.000003s : 21: predicate.environ_get_eliminate 1.15% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.93% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.87% : 0.000003s : 11: predicate.float_depend_g_call 0.69% : 0.000001s : 8: predicate.float_environ_get_switch 0.98% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.24% : 0.000000s : 4: predicate.fold_const_symbol 0.84% : 0.000001s : 8: predicate.get_grad_eliminate 0.28% : 0.000000s : 4: predicate.graph_param_transform 0.85% : 0.000001s : 8: predicate.incorporate_call 0.65% : 0.000001s : 8: predicate.incorporate_call_switch 6.45% : 0.000011s : 44: predicate.inline 1.28% : 0.000002s : 8: predicate.inline_without_move 0.38% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.24% : 0.000002s : 8: predicate.less_batch_normalization 1.61% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.12% : 0.000004s : 26: predicate.load_eliminater 1.25% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.48% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.75% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.76% : 0.000001s : 8: predicate.merge_addn 0.79% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.72% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.69% : 0.000001s : 9: predicate.minmaximum_grad 1.27% : 0.000002s : 4: predicate.mutable_eliminate 0.47% : 0.000001s : 4: predicate.opt_reshape 0.52% : 0.000001s : 4: predicate.parallel_virtual_node 1.20% : 0.000002s : 11: predicate.partial_defer_inline 1.24% : 0.000002s : 13: predicate.partial_eliminate 0.75% : 0.000001s : 9: predicate.print_const_string_wrapper 0.73% : 0.000001s : 8: predicate.reduce_all_const_elim 1.22% : 0.000002s : 9: predicate.reduce_eliminate 2.24% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 8: predicate.remove_not_recompute_node 1.16% : 0.000002s : 17: predicate.replace_applicator 0.65% : 0.000001s : 8: predicate.replace_old_param 0.34% : 0.000001s : 4: predicate.reset_defer_inline 0.84% : 0.000001s : 9: predicate.reshape_eliminate 0.95% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 4: predicate.row_tensor_eliminate 0.86% : 0.000001s : 8: predicate.same_eliminate 0.59% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.92% : 0.000002s : 8: predicate.shard_identity_eliminate 0.93% : 0.000002s : 8: predicate.special_op_eliminate 0.95% : 0.000002s : 8: predicate.specialize_transform 1.11% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.92% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.99% : 0.000002s : 11: predicate.switch_defer_inline 1.69% : 0.000003s : 19: predicate.switch_layer_defer_inline 3.97% : 0.000007s : 39: predicate.switch_simplify 0.82% : 0.000001s : 9: predicate.tile_eliminate 0.78% : 0.000001s : 9: predicate.transpose_eliminate 1.73% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 2.94% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.77% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.43% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.53% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.05% : 0.000003s : 26: predicate.updatestate_pure_node_eliminater 2.83% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.53% : 0.000001s : 4: predicate.value_based_eliminate 0.82% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.77% : 0.000001s : 8: predicate.virtual_output_eliminate 0.43% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.56% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000259 5 6.61% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 93.39% : 0.000242s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.040105 192 0.01% : 0.000004s : 1: ForceFp32Comm 8.57% : 0.003435s : 1: add_attr 8.54% : 0.003425s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.15% : 0.000058s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.16% : 0.000064s : 1: auto_monad 0.06% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 1.16% : 0.000465s : 1: bootstrap 0.07% : 0.000028s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000018s : 1: control_data_broadcast_order 0.03% : 0.000011s : 1: convert_after_rewriter 0.07% : 0.000028s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.05% : 0.000022s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.16% : 0.000464s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.26% : 0.000507s : 1: mutable_eliminate 0.02% : 0.000008s : 1: offloading_packed_experts 0.04% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000017s : 1: opt.transform.mutable_eliminate 2.58% : 0.001035s : 78: opt.transform.opt_a 0.08% : 0.000032s : 1: opt.transform.opt_after_cconv 0.07% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.32% : 0.000129s : 28: opt.transform.opt_b 0.13% : 0.000052s : 2: opt.transform.opt_trans_graph 0.11% : 0.000043s : 4: opt.transform.symbol_engine_opt 6.28% : 0.002517s : 1: opt_a 0.28% : 0.000114s : 1: opt_after_cconv 1.18% : 0.000472s : 1: opt_after_jit_grad 0.62% : 0.000248s : 1: opt_b 11.44% : 0.004590s : 1: optimize 0.06% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.06% : 0.000023s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000031s : 1: pre_auto_parallel 0.06% : 0.000023s : 1: py_interpret_to_execute 0.03% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000020s : 1: remove_dup_value 0.89% : 0.000356s : 1: renormalize.infer 0.63% : 0.000251s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000044s : 1: rewriter_after_opt_a 0.14% : 0.000058s : 1: rewriter_before_opt_a 0.01% : 0.000004s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.21% : 0.000085s : 1: symbol_engine_optimizer 0.21% : 0.000083s : 1: tuple_transform 52.40% : 0.021014s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:33.493.332 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:33.493.659 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0310998, [21] [bootstrap]: 0.00045199 [type_inference]: 0.0048626 [event_method]: 1.184e-05 [auto_monad]: 5.538e-05 [graph_reusing]: 5.06002e-06 [inline]: 2.66999e-06 [add_attr]: 0.0194139, [1] [add_attr_with_inline]: 0.0193995, [1] [Cycle 1]: 9.705e-05, [2] [tag_attr]: 1.974e-05 [meta_addattr_fg_expand]: 3.73999e-06 [parallel-infer-symbol]: 4.23999e-06 [pre_auto_parallel]: 3.481e-05 [insert-virtual-dataset]: 2.36e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 1.86e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00512506, [53] [py_interpret_to_execute]: 2.904e-05 [rewriter_before_opt_a]: 5.966e-05 [opt_a]: 0.00276729, [2] [Cycle 1]: 0.00192707, [45] [expand_dump_flag]: 2.95002e-06 [switch_simplify]: 2.576e-05 [loop_unroll]: 1.449e-05 [a_1]: 0.00036654 [with_stream_mark]: 2.097e-05 [recompute_prepare]: 9.08002e-06 [updatestate_depend_eliminate]: 4.11001e-06 [updatestate_assign_eliminate]: 3.47997e-06 [updatestate_loads_eliminate]: 3.46001e-06 [parameter_eliminate]: 1.94999e-06 [a_2]: 0.00012219 [accelerated_algorithm]: 2.026e-05 [shard]: 2.41e-06 [meta_shard_fg_expand]: 1.66e-06 [shard_inline]: 6.52001e-06 [merge_send_recv]: 8.43001e-06 [auto_parallel]: 6.56e-06 [parallel]: 1.953e-05 [flash_sp]: 7.48e-06 [merge_comm]: 3.88001e-06 [allreduce_fusion]: 3.43999e-06 [matmul_add_comm_reduction]: 1.156e-05 [allreduce_slice_to_reducescatter]: 9.00007e-07 [virtual_shard_identity]: 8.70999e-06 [virtual_dataset]: 6.66e-06 [get_grad_eliminate_]: 6.35002e-06 [virtual_output]: 6.53e-06 [merge_forward]: 3.83001e-06 [cell_reuse_recompute_pass]: 1.45999e-06 [offload_activation]: 1.045e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.532e-05 [merge_recompute_call_nodes]: 1.42999e-06 [before_grad]: 1.128e-05 [set_forward_comm_id_for_comm_node_pass]: 3.71001e-06 [meta_fg_expand]: 2.83e-06 [flash_sp_send_recv_attached]: 4.57e-06 [receive_attached]: 2.78998e-06 [after_resolve]: 1.108e-05 [a_after_grad]: 9.73002e-06 [renormalize]: 0.00065064 [add_forward_monad_depend]: 5.36998e-06 [auto_monad_grad]: 2.59999e-06 [auto_monad_eliminator]: 1.481e-05 [cse]: 3.131e-05 [a_3]: 6.134e-05 [Cycle 2]: 0.00082694, [45] [expand_dump_flag]: 1.39e-06 [switch_simplify]: 7.13998e-06 [loop_unroll]: 6.06e-06 [a_1]: 0.00012962 [with_stream_mark]: 9.00999e-06 [recompute_prepare]: 6.39999e-06 [updatestate_depend_eliminate]: 3.85998e-06 [updatestate_assign_eliminate]: 2.68e-06 [updatestate_loads_eliminate]: 2.57001e-06 [parameter_eliminate]: 1.04e-06 [a_2]: 0.00010823 [accelerated_algorithm]: 9.82001e-06 [shard]: 8.60018e-07 [meta_shard_fg_expand]: 1.71e-06 [shard_inline]: 6.49001e-06 [merge_send_recv]: 5.12e-06 [auto_parallel]: 5.49e-06 [parallel]: 5.19e-06 [flash_sp]: 3.5e-06 [merge_comm]: 3.43e-06 [allreduce_fusion]: 3.32002e-06 [matmul_add_comm_reduction]: 7.58999e-06 [allreduce_slice_to_reducescatter]: 3.19997e-07 [virtual_shard_identity]: 6.91001e-06 [virtual_dataset]: 6.04999e-06 [get_grad_eliminate_]: 5.69999e-06 [virtual_output]: 5.42999e-06 [merge_forward]: 2.61e-06 [cell_reuse_recompute_pass]: 1.50001e-06 [offload_activation]: 7.68001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.289e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 9.13002e-06 [set_forward_comm_id_for_comm_node_pass]: 4.11001e-06 [meta_fg_expand]: 2.16e-06 [flash_sp_send_recv_attached]: 9.39996e-07 [receive_attached]: 1.19998e-06 [after_resolve]: 8.60001e-06 [a_after_grad]: 8.3e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.19e-06 [auto_monad_grad]: 1.00001e-06 [auto_monad_eliminator]: 7.60998e-06 [cse]: 1.523e-05 [a_3]: 4.76e-05 [py_interpret_to_execute_after_opt_a]: 1.283e-05 [slice_cell_reuse_recomputed_activation]: 4.40999e-06 [rewriter_after_opt_a]: 3.898e-05 [convert_after_rewriter]: 9.18002e-06 [order_py_execute_after_rewriter]: 7.97e-06 [mutable_eliminate]: 0.0006248 [opt_b]: 0.00026928, [1] [Cycle 1]: 0.00026014, [7] [b_1]: 0.00016397 [b_2]: 7.77e-06 [updatestate_depend_eliminate]: 6.59999e-06 [updatestate_assign_eliminate]: 2.61999e-06 [updatestate_loads_eliminate]: 2.23002e-06 [renormalize]: 6.39993e-07 [cse]: 2.177e-05 [optimize_parallel_all_gather_comm]: 1.992e-05 [overlap_param_gather]: 5.30001e-06 [cconv]: 3.007e-05 [loop_unroll]: 0.00043914 [opt_after_cconv]: 0.00012599, [1] [Cycle 1]: 0.00011775, [7] [c_1]: 2.782e-05 [parameter_eliminate]: 3.55e-06 [updatestate_depend_eliminate]: 5.54e-06 [updatestate_assign_eliminate]: 2.47001e-06 [updatestate_loads_eliminate]: 2.57001e-06 [cse]: 1.87e-05 [renormalize]: 4.59986e-07 [remove_dup_value]: 1.903e-05 [tuple_transform]: 8.773e-05, [1] [Cycle 1]: 8.093e-05, [4] [d_1]: 4.076e-05 [none_parameter_eliminate]: 1.69998e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 7.73999e-06 [partial_unused_args_eliminate]: 4.56002e-06 [add_recomputation]: 4.949e-05 [cse_after_recomputation]: 2.779e-05, [1] [Cycle 1]: 2.089e-05, [1] [cse]: 1.199e-05 [environ_conv]: 8.50001e-06 [swap_dp_allreduce_reducescatter]: 7.65e-06 [bias_add_comm_swap]: 5.41998e-06 [label_micro_interleaved_index]: 7.28e-06 [label_fine_grained_interleaved_index]: 5.22e-06 [merge_cast_opt]: 4.00998e-06 [slice_recompute_activation]: 4.62998e-06 [micro_interleaved_order_control]: 4.94998e-06 [assign_add_opt]: 3.84002e-06 [ForceFp32Comm]: 3.16001e-06 [remove_cast_before_assign_add]: 3.33998e-06 [full_micro_interleaved_order_control]: 4.99998e-06 [reorder_send_recv_between_fp_bp]: 5.37001e-06 [comm_op_add_attrs]: 3.48e-06 [add_comm_op_reuse_tag]: 3.20002e-06 [interleave_split_concat_branches]: 3.75998e-06 [interleave_parallel_branches]: 3.43e-06 [overlap_opt_shard_in_pipeline]: 3.65998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.37e-06 [control_data_broadcast_order]: 1.65e-05 [grouped_pairwise_exchange_alltoall]: 4.42e-06 [offloading_packed_experts]: 7.48999e-06 [overlap_recompute_and_grad_model_parallel]: 7.20998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.53999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.7e-06 [overlap_recompute_comm]: 4.72e-06 [overlap_grad_ring_attention]: 6.72002e-06 [overlap_grad_flash_sp]: 2.297e-05 [begin_end_overlap_inline]: 3.06999e-06 [split_matmul_comm_elemetwise]: 4.40999e-06 [split_layernorm_comm]: 4.37e-06 [handle_group_info]: 3.33e-06 [symbol_engine_optimizer]: 9.879e-05, [1] [Cycle 1]: 9.204e-05, [6] [build]: 2.76e-06 [elim_shapecalc]: 1.062e-05 [elim_not_effective]: 1.29e-05 [opt_reshape]: 7.31001e-06 [fold_const_symbol]: 1.028e-05 [renormalize]: 2.3999e-07 [detach_backward]: 3.28e-06 [pipeline_parallel_scheduler]: 1.60001e-06 [auto_monad_reorder]: 1.873e-05 [get_jit_bprop_graph]: 1.94e-06 [rewriter_after_jit_bprop_graph]: 4.20999e-06 [opt_after_jit_grad]: 0.00048681 [validate]: 3.854e-05 Sums bootstrap : 0.000452s : 4.54% type_inference : 0.004863s : 48.86% event_method : 0.000012s : 0.12% auto_monad : 0.000055s : 0.56% graph_reusing : 0.000005s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000035s : 0.35% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000029s : 0.29% optimize.rewriter_before_opt_a : 0.000060s : 0.60% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.33% optimize.opt_a.loop_unroll : 0.000021s : 0.21% optimize.opt_a.a_1 : 0.000496s : 4.99% optimize.opt_a.with_stream_mark : 0.000030s : 0.30% optimize.opt_a.recompute_prepare : 0.000015s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000230s : 2.32% optimize.opt_a.accelerated_algorithm : 0.000030s : 0.30% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.14% optimize.opt_a.auto_parallel : 0.000012s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.25% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.19% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.16% optimize.opt_a.virtual_dataset : 0.000013s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000006s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.06% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000020s : 0.20% optimize.opt_a.a_after_grad : 0.000018s : 0.18% optimize.opt_a.renormalize : 0.000651s : 6.54% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.23% optimize.opt_a.cse : 0.000047s : 0.47% optimize.opt_a.a_3 : 0.000109s : 1.09% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.04% optimize.rewriter_after_opt_a : 0.000039s : 0.39% optimize.convert_after_rewriter : 0.000009s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000625s : 6.28% optimize.opt_b.b_1 : 0.000164s : 1.65% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000022s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000030s : 0.30% optimize.loop_unroll : 0.000439s : 4.41% optimize.opt_after_cconv.c_1 : 0.000028s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000019s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.19% optimize.tuple_transform.d_1 : 0.000041s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000049s : 0.50% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000009s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.08% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000023s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000019s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000487s : 4.89% validate : 0.000039s : 0.39% Time group info: ------[substitution.] 0.000221 25 33.64% : 0.000074s : 4: substitution.arithmetic_simplify 0.85% : 0.000002s : 2: substitution.elim_not_effective 0.60% : 0.000001s : 2: substitution.fold_const_symbol 2.42% : 0.000005s : 3: substitution.graph_param_transform 51.89% : 0.000115s : 2: substitution.inline 1.64% : 0.000004s : 4: substitution.j_node_and_user_rematch 5.22% : 0.000012s : 2: substitution.less_batch_normalization 2.28% : 0.000005s : 4: substitution.remove_not_recompute_node 1.46% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004816 2 91.22% : 0.004393s : 1: type_inference.infer 8.78% : 0.000423s : 1: type_inference.specialize ------[replace.] 0.000026 2 100.00% : 0.000026s : 2: replace.inline ------[match.] 0.000113 2 100.00% : 0.000113s : 2: match.inline ------[predicate.] 0.000141 754 0.78% : 0.000001s : 7: predicate.accumulaten_eliminater 0.92% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.69% : 0.000001s : 6: predicate.addn_check_dump 0.78% : 0.000001s : 7: predicate.addn_zero_filter 0.68% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.90% : 0.000004s : 13: predicate.arithmetic_simplify 0.69% : 0.000001s : 7: predicate.cast_eliminate 0.80% : 0.000001s : 6: predicate.check_bprop_eliminate 0.69% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.72% : 0.000001s : 6: predicate.depend_value_elim 0.87% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.98% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.70% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.29% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 3: predicate.elim_not_effective 0.52% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000002s : 10: predicate.environ_add_const_eliminate 1.04% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.04% : 0.000001s : 10: predicate.environ_get_depend_swap 1.75% : 0.000002s : 16: predicate.environ_get_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.94% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.85% : 0.000003s : 9: predicate.float_depend_g_call 0.72% : 0.000001s : 6: predicate.float_environ_get_switch 1.20% : 0.000002s : 9: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 3: predicate.fold_const_symbol 0.93% : 0.000001s : 6: predicate.get_grad_eliminate 0.26% : 0.000000s : 3: predicate.graph_param_transform 0.81% : 0.000001s : 6: predicate.incorporate_call 0.65% : 0.000001s : 6: predicate.incorporate_call_switch 6.78% : 0.000010s : 34: predicate.inline 0.90% : 0.000001s : 6: predicate.inline_without_move 0.36% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.34% : 0.000002s : 6: predicate.less_batch_normalization 1.59% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.10% : 0.000003s : 20: predicate.load_eliminater 1.13% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.63% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.72% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.64% : 0.000001s : 6: predicate.merge_addn 0.60% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.72% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.63% : 0.000001s : 7: predicate.minmaximum_grad 1.66% : 0.000002s : 3: predicate.mutable_eliminate 0.47% : 0.000001s : 3: predicate.opt_reshape 0.60% : 0.000001s : 3: predicate.parallel_virtual_node 1.41% : 0.000002s : 9: predicate.partial_defer_inline 1.17% : 0.000002s : 10: predicate.partial_eliminate 0.73% : 0.000001s : 7: predicate.print_const_string_wrapper 0.65% : 0.000001s : 6: predicate.reduce_all_const_elim 0.92% : 0.000001s : 7: predicate.reduce_eliminate 2.03% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.63% : 0.000001s : 6: predicate.remove_not_recompute_node 1.11% : 0.000002s : 13: predicate.replace_applicator 0.60% : 0.000001s : 6: predicate.replace_old_param 0.32% : 0.000000s : 3: predicate.reset_defer_inline 0.76% : 0.000001s : 7: predicate.reshape_eliminate 0.75% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.68% : 0.000001s : 3: predicate.row_tensor_eliminate 0.95% : 0.000001s : 6: predicate.same_eliminate 0.50% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.13% : 0.000002s : 6: predicate.shard_identity_eliminate 0.96% : 0.000001s : 6: predicate.special_op_eliminate 1.02% : 0.000001s : 6: predicate.specialize_transform 1.04% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.97% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.11% : 0.000002s : 9: predicate.switch_defer_inline 1.61% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.63% : 0.000007s : 32: predicate.switch_simplify 0.74% : 0.000001s : 7: predicate.tile_eliminate 0.75% : 0.000001s : 7: predicate.transpose_eliminate 1.59% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.77% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.64% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.19% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.48% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.58% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.54% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.98% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.77% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.49% : 0.000001s : 3: predicate.value_based_eliminate 0.91% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.77% : 0.000001s : 6: predicate.virtual_output_eliminate 0.33% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000240 5 7.73% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.27% : 0.000222s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.057323 192 0.01% : 0.000006s : 1: ForceFp32Comm 33.89% : 0.019427s : 1: add_attr 33.85% : 0.019404s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.09% : 0.000053s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.11% : 0.000064s : 1: auto_monad 0.05% : 0.000026s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.87% : 0.000496s : 1: bootstrap 0.06% : 0.000033s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.03% : 0.000020s : 1: control_data_broadcast_order 0.02% : 0.000012s : 1: convert_after_rewriter 0.05% : 0.000031s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.03% : 0.000018s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.04% : 0.000022s : 1: event_method 0.01% : 0.000008s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000009s : 1: inline 0.01% : 0.000008s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.02% : 0.000010s : 1: label_micro_interleaved_index 0.78% : 0.000446s : 1: loop_unroll 0.01% : 0.000007s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.10% : 0.000631s : 1: mutable_eliminate 0.02% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000017s : 1: opt.transform.mutable_eliminate 1.59% : 0.000909s : 78: opt.transform.opt_a 0.05% : 0.000026s : 1: opt.transform.opt_after_cconv 0.04% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.17% : 0.000098s : 28: opt.transform.opt_b 0.08% : 0.000046s : 2: opt.transform.opt_trans_graph 0.06% : 0.000037s : 4: opt.transform.symbol_engine_opt 4.83% : 0.002770s : 1: opt_a 0.23% : 0.000130s : 1: opt_after_cconv 0.87% : 0.000497s : 1: opt_after_jit_grad 0.48% : 0.000273s : 1: opt_b 9.53% : 0.005460s : 1: optimize 0.04% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000011s : 1: order_py_execute_after_rewriter 0.05% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.02% : 0.000011s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000008s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.07% : 0.000042s : 1: pre_auto_parallel 0.06% : 0.000032s : 1: py_interpret_to_execute 0.03% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.04% : 0.000022s : 1: remove_dup_value 0.63% : 0.000364s : 1: renormalize.infer 0.49% : 0.000279s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000042s : 1: rewriter_after_opt_a 0.11% : 0.000063s : 1: rewriter_before_opt_a 0.01% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000007s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.18% : 0.000102s : 1: symbol_engine_optimizer 0.16% : 0.000091s : 1: tuple_transform 8.53% : 0.004890s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:33.890.573 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0298637, [21] [bootstrap]: 0.00044001 [type_inference]: 0.00474959 [event_method]: 1.151e-05 [auto_monad]: 5.482e-05 [graph_reusing]: 5.14e-06 [inline]: 2.25002e-06 [add_attr]: 0.0193573, [1] [add_attr_with_inline]: 0.0193419, [1] [Cycle 1]: 7.834e-05, [2] [tag_attr]: 2.085e-05 [meta_addattr_fg_expand]: 3.76999e-06 [parallel-infer-symbol]: 3.61999e-06 [pre_auto_parallel]: 3.383e-05 [insert-virtual-dataset]: 2.33998e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 1.97999e-06 [pipeline_split]: 1.87001e-06 [optimize]: 0.00451679, [53] [py_interpret_to_execute]: 2.644e-05 [rewriter_before_opt_a]: 5.476e-05 [opt_a]: 0.0024328, [2] [Cycle 1]: 0.00176081, [45] [expand_dump_flag]: 2.62001e-06 [switch_simplify]: 2.682e-05 [loop_unroll]: 1.369e-05 [a_1]: 0.00036719 [with_stream_mark]: 2.064e-05 [recompute_prepare]: 9.15999e-06 [updatestate_depend_eliminate]: 4.38001e-06 [updatestate_assign_eliminate]: 4.23999e-06 [updatestate_loads_eliminate]: 3.06001e-06 [parameter_eliminate]: 2.04e-06 [a_2]: 9.365e-05 [accelerated_algorithm]: 2.086e-05 [shard]: 1.99999e-06 [meta_shard_fg_expand]: 1.86003e-06 [shard_inline]: 6.53e-06 [merge_send_recv]: 8.68001e-06 [auto_parallel]: 6.27001e-06 [parallel]: 1.935e-05 [flash_sp]: 7.98999e-06 [merge_comm]: 4.08001e-06 [allreduce_fusion]: 4.03999e-06 [matmul_add_comm_reduction]: 1.078e-05 [allreduce_slice_to_reducescatter]: 1.12e-06 [virtual_shard_identity]: 9.84001e-06 [virtual_dataset]: 6.33e-06 [get_grad_eliminate_]: 5.67999e-06 [virtual_output]: 6.73998e-06 [merge_forward]: 4.95001e-06 [cell_reuse_recompute_pass]: 1.47001e-06 [offload_activation]: 1.047e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.233e-05 [merge_recompute_call_nodes]: 1.39998e-06 [before_grad]: 1.154e-05 [set_forward_comm_id_for_comm_node_pass]: 4.09002e-06 [meta_fg_expand]: 3.01001e-06 [flash_sp_send_recv_attached]: 4.95001e-06 [receive_attached]: 2.61e-06 [after_resolve]: 1.075e-05 [a_after_grad]: 9.89999e-06 [renormalize]: 0.00066245 [add_forward_monad_depend]: 5.70001e-06 [auto_monad_grad]: 2.82002e-06 [auto_monad_eliminator]: 1.509e-05 [cse]: 3.083e-05 [a_3]: 4.865e-05 [Cycle 2]: 0.00066207, [45] [expand_dump_flag]: 1.08001e-06 [switch_simplify]: 7.11001e-06 [loop_unroll]: 5.86998e-06 [a_1]: 0.00013036 [with_stream_mark]: 9.94001e-06 [recompute_prepare]: 6.78e-06 [updatestate_depend_eliminate]: 3.19001e-06 [updatestate_assign_eliminate]: 2.34001e-06 [updatestate_loads_eliminate]: 2.87002e-06 [parameter_eliminate]: 1.09e-06 [a_2]: 8.338e-05 [accelerated_algorithm]: 9.47001e-06 [shard]: 1.10999e-06 [meta_shard_fg_expand]: 1.40999e-06 [shard_inline]: 6.23998e-06 [merge_send_recv]: 5.54998e-06 [auto_parallel]: 6.02999e-06 [parallel]: 5.05001e-06 [flash_sp]: 4.27e-06 [merge_comm]: 3.56999e-06 [allreduce_fusion]: 3.34001e-06 [matmul_add_comm_reduction]: 6.84001e-06 [allreduce_slice_to_reducescatter]: 5.3001e-07 [virtual_shard_identity]: 6.89001e-06 [virtual_dataset]: 5.47001e-06 [get_grad_eliminate_]: 5.61e-06 [virtual_output]: 5.60001e-06 [merge_forward]: 3.5e-06 [cell_reuse_recompute_pass]: 1.44998e-06 [offload_activation]: 7.66001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.039e-05 [merge_recompute_call_nodes]: 1.24003e-06 [before_grad]: 9.20001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.85998e-06 [meta_fg_expand]: 2.24001e-06 [flash_sp_send_recv_attached]: 9.70002e-07 [receive_attached]: 9.70002e-07 [after_resolve]: 8.84998e-06 [a_after_grad]: 8.64e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.63002e-06 [auto_monad_grad]: 8.30012e-07 [auto_monad_eliminator]: 7.61001e-06 [cse]: 1.55e-05 [a_3]: 3.544e-05 [py_interpret_to_execute_after_opt_a]: 9.79e-06 [slice_cell_reuse_recomputed_activation]: 1.87001e-06 [rewriter_after_opt_a]: 3.611e-05 [convert_after_rewriter]: 6.36998e-06 [order_py_execute_after_rewriter]: 5.55001e-06 [mutable_eliminate]: 0.00060905 [opt_b]: 0.0002082, [1] [Cycle 1]: 0.0002024, [7] [b_1]: 0.00012269 [b_2]: 7.29001e-06 [updatestate_depend_eliminate]: 6.88e-06 [updatestate_assign_eliminate]: 3.03998e-06 [updatestate_loads_eliminate]: 2.37999e-06 [renormalize]: 7.29982e-07 [cse]: 2.321e-05 [optimize_parallel_all_gather_comm]: 1.698e-05 [overlap_param_gather]: 2.06e-06 [cconv]: 2.818e-05 [loop_unroll]: 0.0004239 [opt_after_cconv]: 0.00010108, [1] [Cycle 1]: 9.555e-05, [7] [c_1]: 2.756e-05 [parameter_eliminate]: 3.32002e-06 [updatestate_depend_eliminate]: 5.59998e-06 [updatestate_assign_eliminate]: 2.40002e-06 [updatestate_loads_eliminate]: 2.36e-06 [cse]: 1.909e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.449e-05 [tuple_transform]: 7.279e-05, [1] [Cycle 1]: 6.823e-05, [4] [d_1]: 4.083e-05 [none_parameter_eliminate]: 1.55999e-06 [renormalize]: 2.29978e-07 [switch_simplify]: 6.81001e-06 [partial_unused_args_eliminate]: 2.16e-06 [add_recomputation]: 4.664e-05 [cse_after_recomputation]: 2.164e-05, [1] [Cycle 1]: 1.711e-05, [1] [cse]: 1.119e-05 [environ_conv]: 5.25999e-06 [swap_dp_allreduce_reducescatter]: 5.94e-06 [bias_add_comm_swap]: 2.69999e-06 [label_micro_interleaved_index]: 4.25e-06 [label_fine_grained_interleaved_index]: 2.79999e-06 [merge_cast_opt]: 1.44998e-06 [slice_recompute_activation]: 2.02999e-06 [micro_interleaved_order_control]: 2.69999e-06 [assign_add_opt]: 1.17999e-06 [ForceFp32Comm]: 7.80012e-07 [remove_cast_before_assign_add]: 9.70002e-07 [full_micro_interleaved_order_control]: 2.07999e-06 [reorder_send_recv_between_fp_bp]: 2.69001e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.04003e-06 [overlap_opt_shard_in_pipeline]: 1.14e-06 [overlap_opt_shard_grad_in_pipeline]: 1.69e-06 [control_data_broadcast_order]: 1.298e-05 [grouped_pairwise_exchange_alltoall]: 1.61998e-06 [offloading_packed_experts]: 3.91001e-06 [overlap_recompute_and_grad_model_parallel]: 4.70001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.56e-06 [overlap_grad_ring_attention]: 4.12998e-06 [overlap_grad_flash_sp]: 1.847e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.29001e-06 [split_layernorm_comm]: 1.50001e-06 [handle_group_info]: 1.10999e-06 [symbol_engine_optimizer]: 7.675e-05, [1] [Cycle 1]: 7.265e-05, [6] [build]: 2.84001e-06 [elim_shapecalc]: 1.086e-05 [elim_not_effective]: 1.295e-05 [opt_reshape]: 7.45e-06 [fold_const_symbol]: 9.79e-06 [renormalize]: 2.19996e-07 [detach_backward]: 1.96998e-06 [pipeline_parallel_scheduler]: 1.89e-06 [auto_monad_reorder]: 1.651e-05 [get_jit_bprop_graph]: 1.79998e-06 [rewriter_after_jit_bprop_graph]: 4.11001e-06 [opt_after_jit_grad]: 0.00046248 [validate]: 4.021e-05 Sums bootstrap : 0.000440s : 4.61% type_inference : 0.004750s : 49.79% event_method : 0.000012s : 0.12% auto_monad : 0.000055s : 0.57% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.22% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000034s : 0.35% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000026s : 0.28% optimize.rewriter_before_opt_a : 0.000055s : 0.57% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000034s : 0.36% optimize.opt_a.loop_unroll : 0.000020s : 0.21% optimize.opt_a.a_1 : 0.000498s : 5.22% optimize.opt_a.with_stream_mark : 0.000031s : 0.32% optimize.opt_a.recompute_prepare : 0.000016s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000177s : 1.86% optimize.opt_a.accelerated_algorithm : 0.000030s : 0.32% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.15% optimize.opt_a.auto_parallel : 0.000012s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.26% optimize.opt_a.flash_sp : 0.000012s : 0.13% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.02% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.18% optimize.opt_a.virtual_dataset : 0.000012s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.09% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.24% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000021s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.06% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000020s : 0.21% optimize.opt_a.a_after_grad : 0.000019s : 0.19% optimize.opt_a.renormalize : 0.000663s : 6.95% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.24% optimize.opt_a.cse : 0.000046s : 0.49% optimize.opt_a.a_3 : 0.000084s : 0.88% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000036s : 0.38% optimize.convert_after_rewriter : 0.000006s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000609s : 6.38% optimize.opt_b.b_1 : 0.000123s : 1.29% optimize.opt_b.b_2 : 0.000007s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000028s : 0.30% optimize.loop_unroll : 0.000424s : 4.44% optimize.opt_after_cconv.c_1 : 0.000028s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.15% optimize.tuple_transform.d_1 : 0.000041s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000047s : 0.49% optimize.cse_after_recomputation.cse : 0.000011s : 0.12% optimize.environ_conv : 0.000005s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.02% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.03% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.03% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000018s : 0.19% optimize.begin_end_overlap_inline : 0.000000s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000017s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000462s : 4.85% validate : 0.000040s : 0.42% Time group info: ------[substitution.] 0.000219 25 32.67% : 0.000072s : 4: substitution.arithmetic_simplify 0.87% : 0.000002s : 2: substitution.elim_not_effective 0.68% : 0.000001s : 2: substitution.fold_const_symbol 2.82% : 0.000006s : 3: substitution.graph_param_transform 51.41% : 0.000113s : 2: substitution.inline 2.14% : 0.000005s : 4: substitution.j_node_and_user_rematch 5.78% : 0.000013s : 2: substitution.less_batch_normalization 1.96% : 0.000004s : 4: substitution.remove_not_recompute_node 1.66% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004702 2 91.09% : 0.004283s : 1: type_inference.infer 8.91% : 0.000419s : 1: type_inference.specialize ------[replace.] 0.000024 2 100.00% : 0.000024s : 2: replace.inline ------[match.] 0.000111 2 100.00% : 0.000111s : 2: match.inline ------[predicate.] 0.000142 754 0.79% : 0.000001s : 7: predicate.accumulaten_eliminater 1.19% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.61% : 0.000001s : 6: predicate.addn_check_dump 1.02% : 0.000001s : 7: predicate.addn_zero_filter 0.66% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.75% : 0.000004s : 13: predicate.arithmetic_simplify 0.92% : 0.000001s : 7: predicate.cast_eliminate 0.72% : 0.000001s : 6: predicate.check_bprop_eliminate 0.81% : 0.000001s : 6: predicate.compare_switch_simplify 0.20% : 0.000000s : 3: predicate.const_output_eliminate 0.82% : 0.000001s : 6: predicate.depend_value_elim 0.71% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.93% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.73% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.31% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 3: predicate.elim_not_effective 0.66% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.03% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.07% : 0.000002s : 10: predicate.environ_get_depend_swap 1.92% : 0.000003s : 16: predicate.environ_get_eliminate 1.02% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.95% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.84% : 0.000003s : 9: predicate.float_depend_g_call 0.66% : 0.000001s : 6: predicate.float_environ_get_switch 0.95% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 3: predicate.fold_const_symbol 0.76% : 0.000001s : 6: predicate.get_grad_eliminate 0.25% : 0.000000s : 3: predicate.graph_param_transform 0.81% : 0.000001s : 6: predicate.incorporate_call 0.63% : 0.000001s : 6: predicate.incorporate_call_switch 6.73% : 0.000010s : 34: predicate.inline 1.31% : 0.000002s : 6: predicate.inline_without_move 0.35% : 0.000000s : 6: predicate.j_node_and_user_rematch 1.26% : 0.000002s : 6: predicate.less_batch_normalization 1.52% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.30% : 0.000003s : 20: predicate.load_eliminater 1.17% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.64% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.65% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.68% : 0.000001s : 6: predicate.merge_addn 0.64% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.71% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.70% : 0.000001s : 7: predicate.minmaximum_grad 1.49% : 0.000002s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.72% : 0.000001s : 3: predicate.parallel_virtual_node 1.24% : 0.000002s : 9: predicate.partial_defer_inline 1.22% : 0.000002s : 10: predicate.partial_eliminate 0.77% : 0.000001s : 7: predicate.print_const_string_wrapper 0.57% : 0.000001s : 6: predicate.reduce_all_const_elim 1.27% : 0.000002s : 7: predicate.reduce_eliminate 2.28% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.57% : 0.000001s : 6: predicate.remove_not_recompute_node 1.13% : 0.000002s : 13: predicate.replace_applicator 0.62% : 0.000001s : 6: predicate.replace_old_param 0.66% : 0.000001s : 3: predicate.reset_defer_inline 0.86% : 0.000001s : 7: predicate.reshape_eliminate 0.78% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.52% : 0.000001s : 3: predicate.row_tensor_eliminate 1.14% : 0.000002s : 6: predicate.same_eliminate 0.51% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.13% : 0.000002s : 6: predicate.shard_identity_eliminate 0.94% : 0.000001s : 6: predicate.special_op_eliminate 0.95% : 0.000001s : 6: predicate.specialize_transform 1.17% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.98% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.91% : 0.000001s : 9: predicate.switch_defer_inline 1.60% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.06% : 0.000006s : 32: predicate.switch_simplify 0.77% : 0.000001s : 7: predicate.tile_eliminate 0.79% : 0.000001s : 7: predicate.transpose_eliminate 1.63% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.28% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 2.94% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.97% : 0.000003s : 13: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.45% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.87% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.76% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 3: predicate.value_based_eliminate 0.74% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.74% : 0.000001s : 6: predicate.virtual_output_eliminate 0.30% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.68% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000236 5 8.10% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.90% : 0.000217s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.055425 192 0.01% : 0.000004s : 1: ForceFp32Comm 34.94% : 0.019364s : 1: add_attr 34.91% : 0.019347s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000050s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.11% : 0.000060s : 1: auto_monad 0.04% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.84% : 0.000468s : 1: bootstrap 0.06% : 0.000032s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000016s : 1: control_data_broadcast_order 0.02% : 0.000010s : 1: convert_after_rewriter 0.04% : 0.000024s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000008s : 1: environ_conv 0.03% : 0.000017s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.78% : 0.000432s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.12% : 0.000618s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000017s : 1: opt.transform.mutable_eliminate 1.63% : 0.000905s : 78: opt.transform.opt_a 0.05% : 0.000026s : 1: opt.transform.opt_after_cconv 0.04% : 0.000022s : 1: opt.transform.opt_after_jit_grad 0.17% : 0.000097s : 28: opt.transform.opt_b 0.08% : 0.000045s : 2: opt.transform.opt_trans_graph 0.07% : 0.000037s : 4: opt.transform.symbol_engine_opt 4.40% : 0.002436s : 1: opt_a 0.19% : 0.000104s : 1: opt_after_cconv 0.85% : 0.000472s : 1: opt_after_jit_grad 0.38% : 0.000212s : 1: opt_b 8.16% : 0.004522s : 1: optimize 0.04% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000009s : 1: order_py_execute_after_rewriter 0.04% : 0.000022s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000038s : 1: pre_auto_parallel 0.06% : 0.000031s : 1: py_interpret_to_execute 0.02% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000018s : 1: remove_dup_value 0.66% : 0.000365s : 1: renormalize.infer 0.52% : 0.000289s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000042s : 1: rewriter_after_opt_a 0.11% : 0.000059s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.14% : 0.000079s : 1: symbol_engine_optimizer 0.14% : 0.000076s : 1: tuple_transform 8.60% : 0.004766s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:34.437.350 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:34.437.651 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0149694, [21] [bootstrap]: 0.00044023 [type_inference]: 0.0054023 [event_method]: 1.235e-05 [auto_monad]: 5.342e-05 [graph_reusing]: 5.20999e-06 [inline]: 2.53e-06 [add_attr]: 0.00313917, [1] [add_attr_with_inline]: 0.00313151, [1] [Cycle 1]: 6.447e-05, [2] [tag_attr]: 1.456e-05 [meta_addattr_fg_expand]: 3.41001e-06 [parallel-infer-symbol]: 3.11999e-06 [pre_auto_parallel]: 2.492e-05 [insert-virtual-dataset]: 2.53998e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 2.16003e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.00474939, [53] [py_interpret_to_execute]: 1.985e-05 [rewriter_before_opt_a]: 4.927e-05 [opt_a]: 0.00256848, [2] [Cycle 1]: 0.00173316, [45] [expand_dump_flag]: 2.97002e-06 [switch_simplify]: 2.424e-05 [loop_unroll]: 1.441e-05 [a_1]: 0.00031256 [with_stream_mark]: 1.657e-05 [recompute_prepare]: 8.62998e-06 [updatestate_depend_eliminate]: 4e-06 [updatestate_assign_eliminate]: 4.05e-06 [updatestate_loads_eliminate]: 3.00998e-06 [parameter_eliminate]: 2.39001e-06 [a_2]: 0.00012189 [accelerated_algorithm]: 1.877e-05 [shard]: 2.21998e-06 [meta_shard_fg_expand]: 2.11e-06 [shard_inline]: 6.75002e-06 [merge_send_recv]: 8.27e-06 [auto_parallel]: 6.11e-06 [parallel]: 1.822e-05 [flash_sp]: 8.01001e-06 [merge_comm]: 4.37e-06 [allreduce_fusion]: 3.31999e-06 [matmul_add_comm_reduction]: 1.019e-05 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 7.93001e-06 [virtual_dataset]: 6.33e-06 [get_grad_eliminate_]: 5.74999e-06 [virtual_output]: 6.31998e-06 [merge_forward]: 4.67998e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 1.014e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.443e-05 [merge_recompute_call_nodes]: 1.40001e-06 [before_grad]: 1.016e-05 [set_forward_comm_id_for_comm_node_pass]: 3.86999e-06 [meta_fg_expand]: 2.52001e-06 [flash_sp_send_recv_attached]: 4.78001e-06 [receive_attached]: 2.27001e-06 [after_resolve]: 9.85002e-06 [a_after_grad]: 9.74999e-06 [renormalize]: 0.00053211 [add_forward_monad_depend]: 4.90999e-06 [auto_monad_grad]: 2.09e-06 [auto_monad_eliminator]: 1.486e-05 [cse]: 3.104e-05 [a_3]: 6.221e-05 [Cycle 2]: 0.00082237, [45] [expand_dump_flag]: 1.04998e-06 [switch_simplify]: 7.01001e-06 [loop_unroll]: 5.85002e-06 [a_1]: 0.00012844 [with_stream_mark]: 9.42999e-06 [recompute_prepare]: 6.46e-06 [updatestate_depend_eliminate]: 3.16001e-06 [updatestate_assign_eliminate]: 2.66e-06 [updatestate_loads_eliminate]: 2.89999e-06 [parameter_eliminate]: 1.25999e-06 [a_2]: 0.00011009 [accelerated_algorithm]: 9.20999e-06 [shard]: 1.19998e-06 [meta_shard_fg_expand]: 1.32e-06 [shard_inline]: 6.69999e-06 [merge_send_recv]: 5.21998e-06 [auto_parallel]: 5.66e-06 [parallel]: 5.20001e-06 [flash_sp]: 3.28e-06 [merge_comm]: 3.51999e-06 [allreduce_fusion]: 3.23e-06 [matmul_add_comm_reduction]: 7.07002e-06 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 6.71999e-06 [virtual_dataset]: 5.77001e-06 [get_grad_eliminate_]: 5.67999e-06 [virtual_output]: 5.36998e-06 [merge_forward]: 2.63998e-06 [cell_reuse_recompute_pass]: 1.91e-06 [offload_activation]: 7.01001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.287e-05 [merge_recompute_call_nodes]: 8.79983e-07 [before_grad]: 8.84e-06 [set_forward_comm_id_for_comm_node_pass]: 3.97998e-06 [meta_fg_expand]: 2.14e-06 [flash_sp_send_recv_attached]: 8.09989e-07 [receive_attached]: 1.02998e-06 [after_resolve]: 8.25999e-06 [a_after_grad]: 8.23001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.26997e-06 [auto_monad_grad]: 9.70002e-07 [auto_monad_eliminator]: 7.46999e-06 [cse]: 1.64e-05 [a_3]: 4.817e-05 [py_interpret_to_execute_after_opt_a]: 1.238e-05 [slice_cell_reuse_recomputed_activation]: 4.50001e-06 [rewriter_after_opt_a]: 3.913e-05 [convert_after_rewriter]: 9.29998e-06 [order_py_execute_after_rewriter]: 8.03001e-06 [mutable_eliminate]: 0.00050432 [opt_b]: 0.00026459, [1] [Cycle 1]: 0.00025575, [7] [b_1]: 0.00016189 [b_2]: 7.8e-06 [updatestate_depend_eliminate]: 6.23e-06 [updatestate_assign_eliminate]: 2.50997e-06 [updatestate_loads_eliminate]: 2.33998e-06 [renormalize]: 5.79981e-07 [cse]: 1.861e-05 [optimize_parallel_all_gather_comm]: 1.845e-05 [overlap_param_gather]: 5.02e-06 [cconv]: 2.82e-05 [loop_unroll]: 0.00042974 [opt_after_cconv]: 0.00012263, [1] [Cycle 1]: 0.00011422, [7] [c_1]: 2.821e-05 [parameter_eliminate]: 2.64001e-06 [updatestate_depend_eliminate]: 5.25999e-06 [updatestate_assign_eliminate]: 2.63e-06 [updatestate_loads_eliminate]: 2.58998e-06 [cse]: 1.824e-05 [renormalize]: 3.59985e-07 [remove_dup_value]: 1.792e-05 [tuple_transform]: 8.519e-05, [1] [Cycle 1]: 7.795e-05, [4] [d_1]: 3.903e-05 [none_parameter_eliminate]: 1.57001e-06 [renormalize]: 1.70025e-07 [switch_simplify]: 6.78e-06 [partial_unused_args_eliminate]: 4.40999e-06 [add_recomputation]: 4.788e-05 [cse_after_recomputation]: 2.774e-05, [1] [Cycle 1]: 2.045e-05, [1] [cse]: 1.142e-05 [environ_conv]: 8.07e-06 [swap_dp_allreduce_reducescatter]: 7.98001e-06 [bias_add_comm_swap]: 5.54998e-06 [label_micro_interleaved_index]: 6.61999e-06 [label_fine_grained_interleaved_index]: 5.21002e-06 [merge_cast_opt]: 3.76001e-06 [slice_recompute_activation]: 4.32e-06 [micro_interleaved_order_control]: 4.48999e-06 [assign_add_opt]: 3.45998e-06 [ForceFp32Comm]: 3.15998e-06 [remove_cast_before_assign_add]: 3.45e-06 [full_micro_interleaved_order_control]: 4.99e-06 [reorder_send_recv_between_fp_bp]: 4.92e-06 [comm_op_add_attrs]: 3.41999e-06 [add_comm_op_reuse_tag]: 3.28e-06 [interleave_split_concat_branches]: 3.43e-06 [interleave_parallel_branches]: 3.25e-06 [overlap_opt_shard_in_pipeline]: 3.86999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.03001e-06 [control_data_broadcast_order]: 1.55e-05 [grouped_pairwise_exchange_alltoall]: 4.4e-06 [offloading_packed_experts]: 6.66999e-06 [overlap_recompute_and_grad_model_parallel]: 7.2e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.55e-06 [overlap_recompute_allgather_and_fa_grad]: 3.91001e-06 [overlap_recompute_comm]: 4.82998e-06 [overlap_grad_ring_attention]: 6.81999e-06 [overlap_grad_flash_sp]: 2.173e-05 [begin_end_overlap_inline]: 2.93998e-06 [split_matmul_comm_elemetwise]: 4.84998e-06 [split_layernorm_comm]: 4.22998e-06 [handle_group_info]: 3.65998e-06 [symbol_engine_optimizer]: 9.716e-05, [1] [Cycle 1]: 9.005e-05, [6] [build]: 2.53e-06 [elim_shapecalc]: 9.57001e-06 [elim_not_effective]: 1.255e-05 [opt_reshape]: 6.94999e-06 [fold_const_symbol]: 1.022e-05 [renormalize]: 1.69995e-07 [detach_backward]: 3.6e-06 [pipeline_parallel_scheduler]: 2.26998e-06 [auto_monad_reorder]: 1.831e-05 [get_jit_bprop_graph]: 1.34998e-06 [rewriter_after_jit_bprop_graph]: 4.12998e-06 [opt_after_jit_grad]: 0.00049803 [validate]: 3.718e-05 Sums bootstrap : 0.000440s : 4.35% type_inference : 0.005402s : 53.43% event_method : 0.000012s : 0.12% auto_monad : 0.000053s : 0.53% graph_reusing : 0.000005s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000003s : 0.03% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000025s : 0.25% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000020s : 0.20% optimize.rewriter_before_opt_a : 0.000049s : 0.49% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000031s : 0.31% optimize.opt_a.loop_unroll : 0.000020s : 0.20% optimize.opt_a.a_1 : 0.000441s : 4.36% optimize.opt_a.with_stream_mark : 0.000026s : 0.26% optimize.opt_a.recompute_prepare : 0.000015s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000232s : 2.29% optimize.opt_a.accelerated_algorithm : 0.000028s : 0.28% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000013s : 0.13% optimize.opt_a.auto_parallel : 0.000012s : 0.12% optimize.opt_a.parallel : 0.000023s : 0.23% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.14% optimize.opt_a.virtual_dataset : 0.000012s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000019s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.06% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000018s : 0.18% optimize.opt_a.a_after_grad : 0.000018s : 0.18% optimize.opt_a.renormalize : 0.000532s : 5.26% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.22% optimize.opt_a.cse : 0.000047s : 0.47% optimize.opt_a.a_3 : 0.000110s : 1.09% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000039s : 0.39% optimize.convert_after_rewriter : 0.000009s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000504s : 4.99% optimize.opt_b.b_1 : 0.000162s : 1.60% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000019s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000028s : 0.28% optimize.loop_unroll : 0.000430s : 4.25% optimize.opt_after_cconv.c_1 : 0.000028s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.18% optimize.tuple_transform.d_1 : 0.000039s : 0.39% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000048s : 0.47% optimize.cse_after_recomputation.cse : 0.000011s : 0.11% optimize.environ_conv : 0.000008s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000004s : 0.04% optimize.assign_add_opt : 0.000003s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000016s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000022s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000498s : 4.93% validate : 0.000037s : 0.37% Time group info: ------[substitution.] 0.000180 25 35.04% : 0.000063s : 4: substitution.arithmetic_simplify 1.05% : 0.000002s : 2: substitution.elim_not_effective 0.75% : 0.000001s : 2: substitution.fold_const_symbol 3.14% : 0.000006s : 3: substitution.graph_param_transform 47.90% : 0.000086s : 2: substitution.inline 1.98% : 0.000004s : 4: substitution.j_node_and_user_rematch 6.02% : 0.000011s : 2: substitution.less_batch_normalization 2.50% : 0.000004s : 4: substitution.remove_not_recompute_node 1.62% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.005355 2 91.94% : 0.004924s : 1: type_inference.infer 8.06% : 0.000432s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000084 2 100.00% : 0.000084s : 2: match.inline ------[predicate.] 0.000133 754 0.81% : 0.000001s : 7: predicate.accumulaten_eliminater 0.83% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.67% : 0.000001s : 6: predicate.addn_check_dump 0.87% : 0.000001s : 7: predicate.addn_zero_filter 0.71% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.56% : 0.000003s : 13: predicate.arithmetic_simplify 0.74% : 0.000001s : 7: predicate.cast_eliminate 0.87% : 0.000001s : 6: predicate.check_bprop_eliminate 0.71% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.78% : 0.000001s : 6: predicate.depend_value_elim 0.77% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 1.05% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.75% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.24% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 3: predicate.elim_not_effective 0.50% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000002s : 10: predicate.environ_add_const_eliminate 1.15% : 0.000002s : 10: predicate.environ_get_add_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_depend_swap 1.77% : 0.000002s : 16: predicate.environ_get_eliminate 0.96% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.92% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.02% : 0.000003s : 9: predicate.float_depend_g_call 0.65% : 0.000001s : 6: predicate.float_environ_get_switch 1.05% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.26% : 0.000000s : 3: predicate.fold_const_symbol 0.77% : 0.000001s : 6: predicate.get_grad_eliminate 0.23% : 0.000000s : 3: predicate.graph_param_transform 0.87% : 0.000001s : 6: predicate.incorporate_call 0.68% : 0.000001s : 6: predicate.incorporate_call_switch 6.42% : 0.000009s : 34: predicate.inline 1.07% : 0.000001s : 6: predicate.inline_without_move 0.41% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.27% : 0.000002s : 6: predicate.less_batch_normalization 1.54% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.08% : 0.000003s : 20: predicate.load_eliminater 1.20% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.70% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.77% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.77% : 0.000001s : 6: predicate.merge_addn 0.76% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.74% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 7: predicate.minmaximum_grad 1.44% : 0.000002s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.44% : 0.000001s : 3: predicate.parallel_virtual_node 1.50% : 0.000002s : 9: predicate.partial_defer_inline 1.22% : 0.000002s : 10: predicate.partial_eliminate 0.76% : 0.000001s : 7: predicate.print_const_string_wrapper 0.77% : 0.000001s : 6: predicate.reduce_all_const_elim 1.01% : 0.000001s : 7: predicate.reduce_eliminate 2.17% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.79% : 0.000001s : 6: predicate.remove_not_recompute_node 1.13% : 0.000001s : 13: predicate.replace_applicator 0.62% : 0.000001s : 6: predicate.replace_old_param 0.35% : 0.000000s : 3: predicate.reset_defer_inline 0.75% : 0.000001s : 7: predicate.reshape_eliminate 0.76% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 3: predicate.row_tensor_eliminate 1.06% : 0.000001s : 6: predicate.same_eliminate 0.56% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.95% : 0.000001s : 6: predicate.shard_identity_eliminate 0.96% : 0.000001s : 6: predicate.special_op_eliminate 0.93% : 0.000001s : 6: predicate.specialize_transform 1.12% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.95% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.02% : 0.000001s : 9: predicate.switch_defer_inline 1.72% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.56% : 0.000006s : 32: predicate.switch_simplify 0.87% : 0.000001s : 7: predicate.tile_eliminate 0.74% : 0.000001s : 7: predicate.transpose_eliminate 1.59% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.32% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.56% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.52% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.54% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.02% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.76% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 3: predicate.value_based_eliminate 0.77% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.78% : 0.000001s : 6: predicate.virtual_output_eliminate 0.35% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.59% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000210 5 7.88% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.12% : 0.000194s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024366 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.92% : 0.003148s : 1: add_attr 12.87% : 0.003135s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000051s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000062s : 1: auto_monad 0.11% : 0.000026s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.98% : 0.000483s : 1: bootstrap 0.13% : 0.000031s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000012s : 1: convert_after_rewriter 0.13% : 0.000031s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000019s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.09% : 0.000023s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000009s : 1: label_micro_interleaved_index 1.79% : 0.000436s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.09% : 0.000510s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 3.48% : 0.000847s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.10% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000097s : 28: opt.transform.opt_b 0.18% : 0.000044s : 2: opt.transform.opt_trans_graph 0.15% : 0.000036s : 4: opt.transform.symbol_engine_opt 10.55% : 0.002572s : 1: opt_a 0.52% : 0.000126s : 1: opt_after_cconv 2.09% : 0.000509s : 1: opt_after_jit_grad 1.10% : 0.000268s : 1: opt_b 20.87% : 0.005086s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000032s : 1: pre_auto_parallel 0.10% : 0.000023s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000021s : 1: remove_dup_value 1.20% : 0.000293s : 1: renormalize.infer 0.95% : 0.000231s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000043s : 1: rewriter_after_opt_a 0.22% : 0.000053s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.41% : 0.000100s : 1: symbol_engine_optimizer 0.36% : 0.000088s : 1: tuple_transform 22.28% : 0.005428s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:34.838.654 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0159496, [21] [bootstrap]: 0.00043683 [type_inference]: 0.00552021 [event_method]: 1.406e-05 [auto_monad]: 5.949e-05 [graph_reusing]: 5.74999e-06 [inline]: 2.68e-06 [add_attr]: 0.0039014, [1] [add_attr_with_inline]: 0.00388775, [1] [Cycle 1]: 6.702e-05, [2] [tag_attr]: 1.834e-05 [meta_addattr_fg_expand]: 3.75e-06 [parallel-infer-symbol]: 3.68e-06 [pre_auto_parallel]: 3.335e-05 [insert-virtual-dataset]: 2.38002e-06 [parallel-infer-symbol-second]: 8.90024e-07 [dataset_repeat_opt]: 2.46e-06 [pipeline_split]: 1.99e-06 [optimize]: 0.00513069, [53] [py_interpret_to_execute]: 2.61e-05 [rewriter_before_opt_a]: 5.494e-05 [opt_a]: 0.00265698, [2] [Cycle 1]: 0.00187729, [45] [expand_dump_flag]: 2.78998e-06 [switch_simplify]: 2.777e-05 [loop_unroll]: 1.443e-05 [a_1]: 0.00036402 [with_stream_mark]: 2.322e-05 [recompute_prepare]: 1.008e-05 [updatestate_depend_eliminate]: 3.83999e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 3.35e-06 [parameter_eliminate]: 1.94999e-06 [a_2]: 0.00010343 [accelerated_algorithm]: 2.145e-05 [shard]: 2.66999e-06 [meta_shard_fg_expand]: 1.89e-06 [shard_inline]: 7.41001e-06 [merge_send_recv]: 8.77e-06 [auto_parallel]: 8.35001e-06 [parallel]: 2.026e-05 [flash_sp]: 8.38999e-06 [merge_comm]: 4.52e-06 [allreduce_fusion]: 3.44001e-06 [matmul_add_comm_reduction]: 1.109e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 1.071e-05 [virtual_dataset]: 7.2e-06 [get_grad_eliminate_]: 6.66e-06 [virtual_output]: 6.69999e-06 [merge_forward]: 4.25e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 1.062e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.34e-05 [merge_recompute_call_nodes]: 1.78002e-06 [before_grad]: 1.155e-05 [set_forward_comm_id_for_comm_node_pass]: 3.55e-06 [meta_fg_expand]: 3.23998e-06 [flash_sp_send_recv_attached]: 5.05999e-06 [receive_attached]: 2.43e-06 [after_resolve]: 1.196e-05 [a_after_grad]: 9.76003e-06 [renormalize]: 0.00072918 [add_forward_monad_depend]: 6.25002e-06 [auto_monad_grad]: 2.74999e-06 [auto_monad_eliminator]: 1.926e-05 [cse]: 3.432e-05 [a_3]: 5.561e-05 [Cycle 2]: 0.00076758, [45] [expand_dump_flag]: 1.42e-06 [switch_simplify]: 7.79002e-06 [loop_unroll]: 6.24001e-06 [a_1]: 0.00015397 [with_stream_mark]: 1.142e-05 [recompute_prepare]: 7.68999e-06 [updatestate_depend_eliminate]: 3.46001e-06 [updatestate_assign_eliminate]: 2.71e-06 [updatestate_loads_eliminate]: 3.88001e-06 [parameter_eliminate]: 1.14998e-06 [a_2]: 8.244e-05 [accelerated_algorithm]: 1.104e-05 [shard]: 1.21997e-06 [meta_shard_fg_expand]: 1.77999e-06 [shard_inline]: 6.26e-06 [merge_send_recv]: 7.01001e-06 [auto_parallel]: 6.88e-06 [parallel]: 6.74999e-06 [flash_sp]: 3.83001e-06 [merge_comm]: 4.82e-06 [allreduce_fusion]: 3.73001e-06 [matmul_add_comm_reduction]: 8.90001e-06 [allreduce_slice_to_reducescatter]: 3.89991e-07 [virtual_shard_identity]: 8.79e-06 [virtual_dataset]: 6.53e-06 [get_grad_eliminate_]: 6.16e-06 [virtual_output]: 5.82001e-06 [merge_forward]: 3.58e-06 [cell_reuse_recompute_pass]: 2.22001e-06 [offload_activation]: 9.92999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.228e-05 [merge_recompute_call_nodes]: 1.06997e-06 [before_grad]: 1.041e-05 [set_forward_comm_id_for_comm_node_pass]: 4.79e-06 [meta_fg_expand]: 2.64001e-06 [flash_sp_send_recv_attached]: 1.73002e-06 [receive_attached]: 1.57999e-06 [after_resolve]: 1.064e-05 [a_after_grad]: 8.69e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.07001e-06 [auto_monad_grad]: 1.36998e-06 [auto_monad_eliminator]: 9.29e-06 [cse]: 2.043e-05 [a_3]: 3.767e-05 [py_interpret_to_execute_after_opt_a]: 1.44e-05 [slice_cell_reuse_recomputed_activation]: 2.01e-06 [rewriter_after_opt_a]: 3.943e-05 [convert_after_rewriter]: 6.41998e-06 [order_py_execute_after_rewriter]: 6.06998e-06 [mutable_eliminate]: 0.00075885 [opt_b]: 0.00022762, [1] [Cycle 1]: 0.0002181, [7] [b_1]: 0.00012969 [b_2]: 7.59002e-06 [updatestate_depend_eliminate]: 7.96001e-06 [updatestate_assign_eliminate]: 2.56e-06 [updatestate_loads_eliminate]: 3.42002e-06 [renormalize]: 8.39995e-07 [cse]: 2.766e-05 [optimize_parallel_all_gather_comm]: 1.767e-05 [overlap_param_gather]: 1.94999e-06 [cconv]: 3.119e-05 [loop_unroll]: 0.00057971 [opt_after_cconv]: 0.00011537, [1] [Cycle 1]: 0.00010622, [7] [c_1]: 3.077e-05 [parameter_eliminate]: 4.2e-06 [updatestate_depend_eliminate]: 7.15998e-06 [updatestate_assign_eliminate]: 2.76999e-06 [updatestate_loads_eliminate]: 2.91e-06 [cse]: 2.271e-05 [renormalize]: 4.39992e-07 [remove_dup_value]: 1.69e-05 [tuple_transform]: 7.803e-05, [1] [Cycle 1]: 7.282e-05, [4] [d_1]: 4.406e-05 [none_parameter_eliminate]: 1.83002e-06 [renormalize]: 1.79978e-07 [switch_simplify]: 6.90002e-06 [partial_unused_args_eliminate]: 1.87999e-06 [add_recomputation]: 5.333e-05 [cse_after_recomputation]: 2.276e-05, [1] [Cycle 1]: 1.772e-05, [1] [cse]: 1.186e-05 [environ_conv]: 6.39001e-06 [swap_dp_allreduce_reducescatter]: 5.46e-06 [bias_add_comm_swap]: 2.62001e-06 [label_micro_interleaved_index]: 5.24998e-06 [label_fine_grained_interleaved_index]: 2.65997e-06 [merge_cast_opt]: 1.42999e-06 [slice_recompute_activation]: 2.31e-06 [micro_interleaved_order_control]: 2.64999e-06 [assign_add_opt]: 1.22999e-06 [ForceFp32Comm]: 1.20001e-06 [remove_cast_before_assign_add]: 1.12e-06 [full_micro_interleaved_order_control]: 2.06e-06 [reorder_send_recv_between_fp_bp]: 2.84001e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 9.49978e-07 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.27999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.15002e-06 [control_data_broadcast_order]: 1.404e-05 [grouped_pairwise_exchange_alltoall]: 1.57001e-06 [offloading_packed_experts]: 4.38001e-06 [overlap_recompute_and_grad_model_parallel]: 4.96002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39998e-06 [overlap_recompute_comm]: 2.67001e-06 [overlap_grad_ring_attention]: 4.04002e-06 [overlap_grad_flash_sp]: 2.106e-05 [begin_end_overlap_inline]: 4.59986e-07 [split_matmul_comm_elemetwise]: 2.12999e-06 [split_layernorm_comm]: 1.62999e-06 [handle_group_info]: 9.20001e-07 [symbol_engine_optimizer]: 8.078e-05, [1] [Cycle 1]: 7.582e-05, [6] [build]: 3.78001e-06 [elim_shapecalc]: 1.073e-05 [elim_not_effective]: 1.416e-05 [opt_reshape]: 7.45e-06 [fold_const_symbol]: 9.95002e-06 [renormalize]: 4.69998e-07 [detach_backward]: 2.63e-06 [pipeline_parallel_scheduler]: 1.87001e-06 [auto_monad_reorder]: 1.772e-05 [get_jit_bprop_graph]: 2.31998e-06 [rewriter_after_jit_bprop_graph]: 5.05001e-06 [opt_after_jit_grad]: 0.00059333 [validate]: 4.663e-05 Sums bootstrap : 0.000437s : 3.98% type_inference : 0.005520s : 50.27% event_method : 0.000014s : 0.13% auto_monad : 0.000059s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.03% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000033s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000026s : 0.24% optimize.rewriter_before_opt_a : 0.000055s : 0.50% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000036s : 0.32% optimize.opt_a.loop_unroll : 0.000021s : 0.19% optimize.opt_a.a_1 : 0.000518s : 4.72% optimize.opt_a.with_stream_mark : 0.000035s : 0.32% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000186s : 1.69% optimize.opt_a.accelerated_algorithm : 0.000032s : 0.30% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.12% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000015s : 0.14% optimize.opt_a.parallel : 0.000027s : 0.25% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.18% optimize.opt_a.virtual_dataset : 0.000014s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.12% optimize.opt_a.virtual_output : 0.000013s : 0.11% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.23% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000022s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.06% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000023s : 0.21% optimize.opt_a.a_after_grad : 0.000018s : 0.17% optimize.opt_a.renormalize : 0.000729s : 6.64% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.26% optimize.opt_a.cse : 0.000055s : 0.50% optimize.opt_a.a_3 : 0.000093s : 0.85% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000039s : 0.36% optimize.convert_after_rewriter : 0.000006s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000759s : 6.91% optimize.opt_b.b_1 : 0.000130s : 1.18% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000028s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000031s : 0.28% optimize.loop_unroll : 0.000580s : 5.28% optimize.opt_after_cconv.c_1 : 0.000031s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.15% optimize.tuple_transform.d_1 : 0.000044s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000053s : 0.49% optimize.cse_after_recomputation.cse : 0.000012s : 0.11% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.19% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.16% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000593s : 5.40% validate : 0.000047s : 0.42% Time group info: ------[substitution.] 0.000231 25 37.59% : 0.000087s : 4: substitution.arithmetic_simplify 0.90% : 0.000002s : 2: substitution.elim_not_effective 0.64% : 0.000001s : 2: substitution.fold_const_symbol 2.53% : 0.000006s : 3: substitution.graph_param_transform 46.16% : 0.000107s : 2: substitution.inline 2.01% : 0.000005s : 4: substitution.j_node_and_user_rematch 5.79% : 0.000013s : 2: substitution.less_batch_normalization 2.16% : 0.000005s : 4: substitution.remove_not_recompute_node 2.21% : 0.000005s : 2: substitution.replace_old_param ------[type_inference.] 0.005470 2 90.79% : 0.004966s : 1: type_inference.infer 9.21% : 0.000504s : 1: type_inference.specialize ------[replace.] 0.000024 2 100.00% : 0.000024s : 2: replace.inline ------[match.] 0.000105 2 100.00% : 0.000105s : 2: match.inline ------[predicate.] 0.000152 754 0.87% : 0.000001s : 7: predicate.accumulaten_eliminater 1.06% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.64% : 0.000001s : 6: predicate.addn_check_dump 0.81% : 0.000001s : 7: predicate.addn_zero_filter 0.65% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 3.16% : 0.000005s : 13: predicate.arithmetic_simplify 0.68% : 0.000001s : 7: predicate.cast_eliminate 1.00% : 0.000002s : 6: predicate.check_bprop_eliminate 0.64% : 0.000001s : 6: predicate.compare_switch_simplify 0.18% : 0.000000s : 3: predicate.const_output_eliminate 0.69% : 0.000001s : 6: predicate.depend_value_elim 0.78% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.99% : 0.000002s : 7: predicate.dict_get_item_eliminator 0.64% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.31% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.31% : 0.000000s : 3: predicate.elim_not_effective 0.47% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.02% : 0.000002s : 10: predicate.environ_add_const_eliminate 0.94% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_depend_swap 1.58% : 0.000002s : 16: predicate.environ_get_eliminate 0.89% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.92% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.85% : 0.000003s : 9: predicate.float_depend_g_call 0.63% : 0.000001s : 6: predicate.float_environ_get_switch 0.94% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 3: predicate.fold_const_symbol 0.69% : 0.000001s : 6: predicate.get_grad_eliminate 0.32% : 0.000000s : 3: predicate.graph_param_transform 0.74% : 0.000001s : 6: predicate.incorporate_call 0.58% : 0.000001s : 6: predicate.incorporate_call_switch 6.25% : 0.000010s : 34: predicate.inline 0.76% : 0.000001s : 6: predicate.inline_without_move 0.37% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.15% : 0.000002s : 6: predicate.less_batch_normalization 1.61% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.88% : 0.000003s : 20: predicate.load_eliminater 1.60% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.42% : 0.000002s : 14: predicate.loop_unroll_before_grad 2.06% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.66% : 0.000001s : 6: predicate.merge_addn 0.73% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.83% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.72% : 0.000001s : 7: predicate.minmaximum_grad 2.64% : 0.000004s : 3: predicate.mutable_eliminate 0.42% : 0.000001s : 3: predicate.opt_reshape 0.57% : 0.000001s : 3: predicate.parallel_virtual_node 1.17% : 0.000002s : 9: predicate.partial_defer_inline 1.13% : 0.000002s : 10: predicate.partial_eliminate 0.79% : 0.000001s : 7: predicate.print_const_string_wrapper 1.06% : 0.000002s : 6: predicate.reduce_all_const_elim 1.15% : 0.000002s : 7: predicate.reduce_eliminate 2.34% : 0.000004s : 20: predicate.redundant_stop_gradient_eliminater 0.70% : 0.000001s : 6: predicate.remove_not_recompute_node 1.23% : 0.000002s : 13: predicate.replace_applicator 0.62% : 0.000001s : 6: predicate.replace_old_param 0.41% : 0.000001s : 3: predicate.reset_defer_inline 1.00% : 0.000002s : 7: predicate.reshape_eliminate 0.65% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 3: predicate.row_tensor_eliminate 1.14% : 0.000002s : 6: predicate.same_eliminate 0.47% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.02% : 0.000002s : 6: predicate.shard_identity_eliminate 1.09% : 0.000002s : 6: predicate.special_op_eliminate 0.80% : 0.000001s : 6: predicate.specialize_transform 1.19% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.88% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.01% : 0.000002s : 9: predicate.switch_defer_inline 1.61% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.08% : 0.000006s : 32: predicate.switch_simplify 0.87% : 0.000001s : 7: predicate.tile_eliminate 0.78% : 0.000001s : 7: predicate.transpose_eliminate 1.65% : 0.000003s : 13: predicate.tuple_list_convert_item_index_to_positive 1.76% : 0.000003s : 13: predicate.tuple_list_get_item_const_eliminator 1.58% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.10% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.44% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.72% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.44% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.82% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.45% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.88% : 0.000001s : 3: predicate.value_based_eliminate 0.79% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.71% : 0.000001s : 6: predicate.virtual_output_eliminate 0.32% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.50% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000269 5 7.71% : 0.000021s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.29% : 0.000248s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026802 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.58% : 0.003908s : 1: add_attr 14.52% : 0.003892s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.21% : 0.000057s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.24% : 0.000065s : 1: auto_monad 0.08% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.74% : 0.000465s : 1: bootstrap 0.13% : 0.000035s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000017s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.10% : 0.000026s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.08% : 0.000021s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 2.20% : 0.000590s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 2.88% : 0.000772s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.09% : 0.000024s : 1: opt.transform.mutable_eliminate 3.57% : 0.000957s : 78: opt.transform.opt_a 0.11% : 0.000029s : 1: opt.transform.opt_after_cconv 0.10% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.38% : 0.000102s : 28: opt.transform.opt_b 0.18% : 0.000048s : 2: opt.transform.opt_trans_graph 0.14% : 0.000038s : 4: opt.transform.symbol_engine_opt 9.93% : 0.002661s : 1: opt_a 0.44% : 0.000119s : 1: opt_after_cconv 2.26% : 0.000605s : 1: opt_after_jit_grad 0.86% : 0.000231s : 1: opt_b 19.16% : 0.005137s : 1: optimize 0.08% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.14% : 0.000037s : 1: pre_auto_parallel 0.11% : 0.000030s : 1: py_interpret_to_execute 0.07% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000021s : 1: remove_dup_value 1.47% : 0.000394s : 1: renormalize.infer 1.21% : 0.000325s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000044s : 1: rewriter_after_opt_a 0.22% : 0.000059s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.31% : 0.000083s : 1: symbol_engine_optimizer 0.30% : 0.000081s : 1: tuple_transform 20.68% : 0.005544s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:35.377.350 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:35.377.661 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0149969, [21] [bootstrap]: 0.00043967 [type_inference]: 0.00483738 [event_method]: 1.229e-05 [auto_monad]: 5.602e-05 [graph_reusing]: 6.01e-06 [inline]: 2.49999e-06 [add_attr]: 0.00314268, [1] [add_attr_with_inline]: 0.00313348, [1] [Cycle 1]: 7.472e-05, [2] [tag_attr]: 1.716e-05 [meta_addattr_fg_expand]: 3.93999e-06 [parallel-infer-symbol]: 3.66999e-06 [pre_auto_parallel]: 2.696e-05 [insert-virtual-dataset]: 2.36e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 1.79998e-06 [pipeline_split]: 1.51998e-06 [optimize]: 0.00529414, [53] [py_interpret_to_execute]: 2.333e-05 [rewriter_before_opt_a]: 5.574e-05 [opt_a]: 0.00292012, [2] [Cycle 1]: 0.00195801, [45] [expand_dump_flag]: 3.09999e-06 [switch_simplify]: 2.743e-05 [loop_unroll]: 1.593e-05 [a_1]: 0.00038178 [with_stream_mark]: 1.64e-05 [recompute_prepare]: 1.021e-05 [updatestate_depend_eliminate]: 4.51002e-06 [updatestate_assign_eliminate]: 4.47e-06 [updatestate_loads_eliminate]: 3.55e-06 [parameter_eliminate]: 1.92001e-06 [a_2]: 0.00014292 [accelerated_algorithm]: 2.235e-05 [shard]: 2.01e-06 [meta_shard_fg_expand]: 2.04999e-06 [shard_inline]: 7.93001e-06 [merge_send_recv]: 9.29e-06 [auto_parallel]: 7.1e-06 [parallel]: 1.859e-05 [flash_sp]: 7.53e-06 [merge_comm]: 4.68999e-06 [allreduce_fusion]: 4.2e-06 [matmul_add_comm_reduction]: 1.208e-05 [allreduce_slice_to_reducescatter]: 9.20001e-07 [virtual_shard_identity]: 9.36998e-06 [virtual_dataset]: 8.07e-06 [get_grad_eliminate_]: 7.7e-06 [virtual_output]: 7.86001e-06 [merge_forward]: 4.75001e-06 [cell_reuse_recompute_pass]: 1.45999e-06 [offload_activation]: 1.126e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.754e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.362e-05 [set_forward_comm_id_for_comm_node_pass]: 4.72e-06 [meta_fg_expand]: 3.26999e-06 [flash_sp_send_recv_attached]: 5.09e-06 [receive_attached]: 2.45002e-06 [after_resolve]: 1.221e-05 [a_after_grad]: 1.18e-05 [renormalize]: 0.00059228 [add_forward_monad_depend]: 5.29e-06 [auto_monad_grad]: 2.04e-06 [auto_monad_eliminator]: 1.664e-05 [cse]: 4.83e-05 [a_3]: 7.044e-05 [Cycle 2]: 0.00094773, [45] [expand_dump_flag]: 1.12e-06 [switch_simplify]: 8.47998e-06 [loop_unroll]: 7.31999e-06 [a_1]: 0.00017212 [with_stream_mark]: 1.019e-05 [recompute_prepare]: 8.35001e-06 [updatestate_depend_eliminate]: 4.33001e-06 [updatestate_assign_eliminate]: 3.29001e-06 [updatestate_loads_eliminate]: 3.13e-06 [parameter_eliminate]: 1.19e-06 [a_2]: 0.00012855 [accelerated_algorithm]: 1.091e-05 [shard]: 9.20001e-07 [meta_shard_fg_expand]: 1.59998e-06 [shard_inline]: 7.29001e-06 [merge_send_recv]: 6.71e-06 [auto_parallel]: 6.72002e-06 [parallel]: 5.40001e-06 [flash_sp]: 3.26999e-06 [merge_comm]: 4.3e-06 [allreduce_fusion]: 3.86001e-06 [matmul_add_comm_reduction]: 8.59002e-06 [allreduce_slice_to_reducescatter]: 3.7998e-07 [virtual_shard_identity]: 8.50001e-06 [virtual_dataset]: 7.61001e-06 [get_grad_eliminate_]: 6.61999e-06 [virtual_output]: 6.53998e-06 [merge_forward]: 3.38999e-06 [cell_reuse_recompute_pass]: 1.62001e-06 [offload_activation]: 7.82002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.648e-05 [merge_recompute_call_nodes]: 9.09989e-07 [before_grad]: 1.197e-05 [set_forward_comm_id_for_comm_node_pass]: 4.68999e-06 [meta_fg_expand]: 2.63e-06 [flash_sp_send_recv_attached]: 8.70001e-07 [receive_attached]: 1.00999e-06 [after_resolve]: 9.64e-06 [a_after_grad]: 1.015e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.79e-06 [auto_monad_grad]: 1.03001e-06 [auto_monad_eliminator]: 9.46e-06 [cse]: 2.101e-05 [a_3]: 5.677e-05 [py_interpret_to_execute_after_opt_a]: 1.424e-05 [slice_cell_reuse_recomputed_activation]: 4.74e-06 [rewriter_after_opt_a]: 4.922e-05 [convert_after_rewriter]: 1.134e-05 [order_py_execute_after_rewriter]: 9.04998e-06 [mutable_eliminate]: 0.00053983 [opt_b]: 0.00030684, [1] [Cycle 1]: 0.00029803, [7] [b_1]: 0.0001901 [b_2]: 9.47999e-06 [updatestate_depend_eliminate]: 7.21999e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 2.98e-06 [renormalize]: 6.89994e-07 [cse]: 2.755e-05 [optimize_parallel_all_gather_comm]: 2.175e-05 [overlap_param_gather]: 4.86002e-06 [cconv]: 2.931e-05 [loop_unroll]: 0.00044562 [opt_after_cconv]: 0.00013748, [1] [Cycle 1]: 0.00012923, [7] [c_1]: 3.426e-05 [parameter_eliminate]: 3.55e-06 [updatestate_depend_eliminate]: 5.99e-06 [updatestate_assign_eliminate]: 3.07002e-06 [updatestate_loads_eliminate]: 3.56001e-06 [cse]: 2.332e-05 [renormalize]: 3.39991e-07 [remove_dup_value]: 2.021e-05 [tuple_transform]: 9.712e-05, [1] [Cycle 1]: 8.962e-05, [4] [d_1]: 4.89e-05 [none_parameter_eliminate]: 1.74e-06 [renormalize]: 2.60014e-07 [switch_simplify]: 7.75e-06 [partial_unused_args_eliminate]: 4.42e-06 [add_recomputation]: 6.045e-05 [cse_after_recomputation]: 3.303e-05, [1] [Cycle 1]: 2.562e-05, [1] [cse]: 1.633e-05 [environ_conv]: 1e-05 [swap_dp_allreduce_reducescatter]: 9.19998e-06 [bias_add_comm_swap]: 5.32001e-06 [label_micro_interleaved_index]: 7.05e-06 [label_fine_grained_interleaved_index]: 5.00999e-06 [merge_cast_opt]: 3.79002e-06 [slice_recompute_activation]: 4.77e-06 [micro_interleaved_order_control]: 4.90999e-06 [assign_add_opt]: 3.9e-06 [ForceFp32Comm]: 3.18998e-06 [remove_cast_before_assign_add]: 3.86999e-06 [full_micro_interleaved_order_control]: 4.62e-06 [reorder_send_recv_between_fp_bp]: 4.82998e-06 [comm_op_add_attrs]: 3.39001e-06 [add_comm_op_reuse_tag]: 3.41999e-06 [interleave_split_concat_branches]: 3.44001e-06 [interleave_parallel_branches]: 3.59002e-06 [overlap_opt_shard_in_pipeline]: 3.53e-06 [overlap_opt_shard_grad_in_pipeline]: 3.97998e-06 [control_data_broadcast_order]: 1.837e-05 [grouped_pairwise_exchange_alltoall]: 4.27998e-06 [offloading_packed_experts]: 7.30998e-06 [overlap_recompute_and_grad_model_parallel]: 7.48e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.56999e-06 [overlap_recompute_allgather_and_fa_grad]: 4.08999e-06 [overlap_recompute_comm]: 5.33002e-06 [overlap_grad_ring_attention]: 7.21999e-06 [overlap_grad_flash_sp]: 2.41e-05 [begin_end_overlap_inline]: 2.91e-06 [split_matmul_comm_elemetwise]: 4.69998e-06 [split_layernorm_comm]: 4.33001e-06 [handle_group_info]: 3.78001e-06 [symbol_engine_optimizer]: 0.00010501, [1] [Cycle 1]: 9.799e-05, [6] [build]: 2.96001e-06 [elim_shapecalc]: 1.158e-05 [elim_not_effective]: 1.561e-05 [opt_reshape]: 8.55001e-06 [fold_const_symbol]: 1.266e-05 [renormalize]: 2.50002e-07 [detach_backward]: 3.25e-06 [pipeline_parallel_scheduler]: 1.71e-06 [auto_monad_reorder]: 2.14e-05 [get_jit_bprop_graph]: 1.74e-06 [rewriter_after_jit_bprop_graph]: 4.45e-06 [opt_after_jit_grad]: 0.00051727 [validate]: 4.436e-05 Sums bootstrap : 0.000440s : 4.35% type_inference : 0.004837s : 47.90% event_method : 0.000012s : 0.12% auto_monad : 0.000056s : 0.55% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000017s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000027s : 0.27% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000023s : 0.23% optimize.rewriter_before_opt_a : 0.000056s : 0.55% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000036s : 0.36% optimize.opt_a.loop_unroll : 0.000023s : 0.23% optimize.opt_a.a_1 : 0.000554s : 5.48% optimize.opt_a.with_stream_mark : 0.000027s : 0.26% optimize.opt_a.recompute_prepare : 0.000019s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000271s : 2.69% optimize.opt_a.accelerated_algorithm : 0.000033s : 0.33% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.15% optimize.opt_a.merge_send_recv : 0.000016s : 0.16% optimize.opt_a.auto_parallel : 0.000014s : 0.14% optimize.opt_a.parallel : 0.000024s : 0.24% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.20% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.18% optimize.opt_a.virtual_dataset : 0.000016s : 0.16% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.14% optimize.opt_a.virtual_output : 0.000014s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.34% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.06% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000022s : 0.22% optimize.opt_a.a_after_grad : 0.000022s : 0.22% optimize.opt_a.renormalize : 0.000592s : 5.87% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.26% optimize.opt_a.cse : 0.000069s : 0.69% optimize.opt_a.a_3 : 0.000127s : 1.26% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000049s : 0.49% optimize.convert_after_rewriter : 0.000011s : 0.11% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000540s : 5.35% optimize.opt_b.b_1 : 0.000190s : 1.88% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000028s : 0.27% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.22% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000029s : 0.29% optimize.loop_unroll : 0.000446s : 4.41% optimize.opt_after_cconv.c_1 : 0.000034s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.cse : 0.000023s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.20% optimize.tuple_transform.d_1 : 0.000049s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000060s : 0.60% optimize.cse_after_recomputation.cse : 0.000016s : 0.16% optimize.environ_conv : 0.000010s : 0.10% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.09% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.18% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000024s : 0.24% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000021s : 0.21% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000517s : 5.12% validate : 0.000044s : 0.44% Time group info: ------[substitution.] 0.000225 34 30.14% : 0.000068s : 4: substitution.arithmetic_simplify 8.89% : 0.000020s : 2: substitution.cast_eliminate 1.00% : 0.000002s : 3: substitution.elim_not_effective 0.81% : 0.000002s : 3: substitution.fold_const_symbol 2.77% : 0.000006s : 4: substitution.graph_param_transform 44.39% : 0.000100s : 2: substitution.inline 2.25% : 0.000005s : 6: substitution.j_node_and_user_rematch 5.47% : 0.000012s : 2: substitution.less_batch_normalization 2.76% : 0.000006s : 6: substitution.remove_not_recompute_node 1.53% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004792 2 91.03% : 0.004362s : 1: type_inference.infer 8.97% : 0.000430s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000097 2 100.00% : 0.000097s : 2: match.inline ------[predicate.] 0.000173 980 0.99% : 0.000002s : 9: predicate.accumulaten_eliminater 1.10% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.68% : 0.000001s : 8: predicate.addn_check_dump 0.85% : 0.000001s : 9: predicate.addn_zero_filter 0.73% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.50% : 0.000004s : 17: predicate.arithmetic_simplify 0.81% : 0.000001s : 9: predicate.cast_eliminate 0.78% : 0.000001s : 8: predicate.check_bprop_eliminate 0.66% : 0.000001s : 8: predicate.compare_switch_simplify 0.22% : 0.000000s : 4: predicate.const_output_eliminate 0.88% : 0.000002s : 8: predicate.depend_value_elim 0.86% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 1.08% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.77% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.16% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 4: predicate.elim_not_effective 0.48% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.15% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.02% : 0.000002s : 13: predicate.environ_get_depend_swap 1.85% : 0.000003s : 21: predicate.environ_get_eliminate 1.06% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.89% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.76% : 0.000003s : 11: predicate.float_depend_g_call 0.82% : 0.000001s : 8: predicate.float_environ_get_switch 1.00% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.85% : 0.000001s : 8: predicate.get_grad_eliminate 0.25% : 0.000000s : 4: predicate.graph_param_transform 0.79% : 0.000001s : 8: predicate.incorporate_call 0.68% : 0.000001s : 8: predicate.incorporate_call_switch 6.34% : 0.000011s : 44: predicate.inline 0.97% : 0.000002s : 8: predicate.inline_without_move 0.39% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.25% : 0.000002s : 8: predicate.less_batch_normalization 1.53% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.34% : 0.000004s : 26: predicate.load_eliminater 1.11% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.55% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.84% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.88% : 0.000002s : 8: predicate.merge_addn 0.71% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.72% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.69% : 0.000001s : 9: predicate.minmaximum_grad 1.45% : 0.000003s : 4: predicate.mutable_eliminate 0.40% : 0.000001s : 4: predicate.opt_reshape 0.46% : 0.000001s : 4: predicate.parallel_virtual_node 1.18% : 0.000002s : 11: predicate.partial_defer_inline 1.29% : 0.000002s : 13: predicate.partial_eliminate 0.75% : 0.000001s : 9: predicate.print_const_string_wrapper 0.77% : 0.000001s : 8: predicate.reduce_all_const_elim 1.04% : 0.000002s : 9: predicate.reduce_eliminate 2.08% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.61% : 0.000001s : 8: predicate.remove_not_recompute_node 1.14% : 0.000002s : 17: predicate.replace_applicator 0.61% : 0.000001s : 8: predicate.replace_old_param 0.46% : 0.000001s : 4: predicate.reset_defer_inline 0.84% : 0.000001s : 9: predicate.reshape_eliminate 0.71% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.59% : 0.000001s : 4: predicate.row_tensor_eliminate 1.11% : 0.000002s : 8: predicate.same_eliminate 0.63% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.05% : 0.000002s : 8: predicate.shard_identity_eliminate 1.04% : 0.000002s : 8: predicate.special_op_eliminate 0.97% : 0.000002s : 8: predicate.specialize_transform 1.01% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 1.06% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.04% : 0.000002s : 11: predicate.switch_defer_inline 1.65% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.08% : 0.000007s : 39: predicate.switch_simplify 0.77% : 0.000001s : 9: predicate.tile_eliminate 0.76% : 0.000001s : 9: predicate.transpose_eliminate 1.55% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.79% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 2.83% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.61% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.46% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.56% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 1.97% : 0.000003s : 26: predicate.updatestate_pure_node_eliminater 2.95% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.46% : 0.000001s : 4: predicate.value_based_eliminate 0.93% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.95% : 0.000002s : 8: predicate.virtual_output_eliminate 0.38% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.58% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000214 5 8.41% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.59% : 0.000196s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025277 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.47% : 0.003152s : 1: add_attr 12.41% : 0.003137s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.26% : 0.000065s : 1: add_recomputation 0.03% : 0.000007s : 1: assign_add_opt 0.25% : 0.000064s : 1: auto_monad 0.12% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.91% : 0.000484s : 1: bootstrap 0.13% : 0.000032s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000021s : 1: control_data_broadcast_order 0.06% : 0.000014s : 1: convert_after_rewriter 0.14% : 0.000036s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000019s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.08% : 0.000021s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000007s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.79% : 0.000452s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.16% : 0.000546s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 4.22% : 0.001067s : 78: opt.transform.opt_a 0.13% : 0.000033s : 1: opt.transform.opt_after_cconv 0.12% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.50% : 0.000126s : 28: opt.transform.opt_b 0.21% : 0.000054s : 2: opt.transform.opt_trans_graph 0.18% : 0.000045s : 4: opt.transform.symbol_engine_opt 11.57% : 0.002924s : 1: opt_a 0.56% : 0.000141s : 1: opt_after_cconv 2.09% : 0.000528s : 1: opt_after_jit_grad 1.23% : 0.000310s : 1: opt_b 22.31% : 0.005640s : 1: optimize 0.10% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000035s : 1: pre_auto_parallel 0.10% : 0.000027s : 1: py_interpret_to_execute 0.07% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000007s : 1: remove_cast_before_assign_add 0.09% : 0.000023s : 1: remove_dup_value 1.33% : 0.000336s : 1: renormalize.infer 0.98% : 0.000248s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.21% : 0.000053s : 1: rewriter_after_opt_a 0.23% : 0.000059s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.05% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.43% : 0.000108s : 1: symbol_engine_optimizer 0.40% : 0.000100s : 1: tuple_transform 19.24% : 0.004862s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:35.785.866 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.031526, [21] [bootstrap]: 0.00048079 [type_inference]: 0.0219629 [event_method]: 1.457e-05 [auto_monad]: 6.12e-05 [graph_reusing]: 5.69999e-06 [inline]: 2.88e-06 [add_attr]: 0.00349295, [1] [add_attr_with_inline]: 0.00348344, [1] [Cycle 1]: 5.554e-05, [2] [tag_attr]: 1.606e-05 [meta_addattr_fg_expand]: 3.91999e-06 [parallel-infer-symbol]: 3.05998e-06 [pre_auto_parallel]: 2.808e-05 [insert-virtual-dataset]: 2.73e-06 [parallel-infer-symbol-second]: 1.19e-06 [dataset_repeat_opt]: 1.96e-06 [pipeline_split]: 1.97999e-06 [optimize]: 0.00473129, [53] [py_interpret_to_execute]: 1.858e-05 [rewriter_before_opt_a]: 5.127e-05 [opt_a]: 0.00261444, [2] [Cycle 1]: 0.00184831, [45] [expand_dump_flag]: 2.98e-06 [switch_simplify]: 3.016e-05 [loop_unroll]: 1.558e-05 [a_1]: 0.00038574 [with_stream_mark]: 1.64e-05 [recompute_prepare]: 9.63002e-06 [updatestate_depend_eliminate]: 4.31002e-06 [updatestate_assign_eliminate]: 4.55001e-06 [updatestate_loads_eliminate]: 4.07e-06 [parameter_eliminate]: 1.98002e-06 [a_2]: 0.00014858 [accelerated_algorithm]: 2.168e-05 [shard]: 2.78e-06 [meta_shard_fg_expand]: 2.19001e-06 [shard_inline]: 8.37e-06 [merge_send_recv]: 9.74999e-06 [auto_parallel]: 7.5e-06 [parallel]: 2.027e-05 [flash_sp]: 8.21002e-06 [merge_comm]: 5.42999e-06 [allreduce_fusion]: 4.25e-06 [matmul_add_comm_reduction]: 1.225e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 9.44998e-06 [virtual_dataset]: 7.81001e-06 [get_grad_eliminate_]: 7.5e-06 [virtual_output]: 7.73999e-06 [merge_forward]: 4.82e-06 [cell_reuse_recompute_pass]: 1.48002e-06 [offload_activation]: 1.146e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.547e-05 [merge_recompute_call_nodes]: 1.41002e-06 [before_grad]: 1.314e-05 [set_forward_comm_id_for_comm_node_pass]: 5.17e-06 [meta_fg_expand]: 3.40998e-06 [flash_sp_send_recv_attached]: 4.85001e-06 [receive_attached]: 2.14999e-06 [after_resolve]: 1.169e-05 [a_after_grad]: 1.162e-05 [renormalize]: 0.00063602 [add_forward_monad_depend]: 5.27999e-06 [auto_monad_grad]: 2.16e-06 [auto_monad_eliminator]: 1.666e-05 [cse]: 3.653e-05 [a_3]: 5.527e-05 [Cycle 2]: 0.00075707, [45] [expand_dump_flag]: 1.12e-06 [switch_simplify]: 8.17e-06 [loop_unroll]: 7.06999e-06 [a_1]: 0.00017073 [with_stream_mark]: 9.87001e-06 [recompute_prepare]: 7.88001e-06 [updatestate_depend_eliminate]: 4.34002e-06 [updatestate_assign_eliminate]: 2.84001e-06 [updatestate_loads_eliminate]: 2.68e-06 [parameter_eliminate]: 1.52001e-06 [a_2]: 9.769e-05 [accelerated_algorithm]: 1.025e-05 [shard]: 1.02e-06 [meta_shard_fg_expand]: 1.76e-06 [shard_inline]: 7.76001e-06 [merge_send_recv]: 5.85002e-06 [auto_parallel]: 5.92999e-06 [parallel]: 4.80999e-06 [flash_sp]: 3.83001e-06 [merge_comm]: 4.00998e-06 [allreduce_fusion]: 3.83999e-06 [matmul_add_comm_reduction]: 8.37e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 8.85999e-06 [virtual_dataset]: 6.73e-06 [get_grad_eliminate_]: 6.34001e-06 [virtual_output]: 6.44999e-06 [merge_forward]: 3.46999e-06 [cell_reuse_recompute_pass]: 1.35001e-06 [offload_activation]: 7.41001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.32e-05 [merge_recompute_call_nodes]: 7.7e-07 [before_grad]: 1.115e-05 [set_forward_comm_id_for_comm_node_pass]: 4.48999e-06 [meta_fg_expand]: 3.22002e-06 [flash_sp_send_recv_attached]: 1.33002e-06 [receive_attached]: 1.14e-06 [after_resolve]: 9.71998e-06 [a_after_grad]: 1.067e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.64e-06 [auto_monad_grad]: 1.03001e-06 [auto_monad_eliminator]: 8.72998e-06 [cse]: 2.059e-05 [a_3]: 4.285e-05 [py_interpret_to_execute_after_opt_a]: 1.135e-05 [slice_cell_reuse_recomputed_activation]: 2.37999e-06 [rewriter_after_opt_a]: 3.95e-05 [convert_after_rewriter]: 7.61001e-06 [order_py_execute_after_rewriter]: 6.54999e-06 [mutable_eliminate]: 0.00051073 [opt_b]: 0.000235, [1] [Cycle 1]: 0.00022902, [7] [b_1]: 0.00014569 [b_2]: 8.48001e-06 [updatestate_depend_eliminate]: 6.51e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 3.09001e-06 [renormalize]: 3.89991e-07 [cse]: 2.486e-05 [optimize_parallel_all_gather_comm]: 1.748e-05 [overlap_param_gather]: 1.89999e-06 [cconv]: 2.612e-05 [loop_unroll]: 0.00043934 [opt_after_cconv]: 0.00011243, [1] [Cycle 1]: 0.00010642, [7] [c_1]: 3.376e-05 [parameter_eliminate]: 2.72001e-06 [updatestate_depend_eliminate]: 5.96e-06 [updatestate_assign_eliminate]: 2.91e-06 [updatestate_loads_eliminate]: 2.83e-06 [cse]: 2.359e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 1.705e-05 [tuple_transform]: 8.193e-05, [1] [Cycle 1]: 7.765e-05, [4] [d_1]: 4.827e-05 [none_parameter_eliminate]: 1.72001e-06 [renormalize]: 2.20025e-07 [switch_simplify]: 7.99002e-06 [partial_unused_args_eliminate]: 2.17001e-06 [add_recomputation]: 5.684e-05 [cse_after_recomputation]: 2.593e-05, [1] [Cycle 1]: 2.13e-05, [1] [cse]: 1.571e-05 [environ_conv]: 5.86998e-06 [swap_dp_allreduce_reducescatter]: 5.66003e-06 [bias_add_comm_swap]: 2.49999e-06 [label_micro_interleaved_index]: 4.84e-06 [label_fine_grained_interleaved_index]: 2.69999e-06 [merge_cast_opt]: 1.50001e-06 [slice_recompute_activation]: 2.32001e-06 [micro_interleaved_order_control]: 2.21e-06 [assign_add_opt]: 1.20999e-06 [ForceFp32Comm]: 9.00007e-07 [remove_cast_before_assign_add]: 9.79984e-07 [full_micro_interleaved_order_control]: 2.14999e-06 [reorder_send_recv_between_fp_bp]: 2.66e-06 [comm_op_add_attrs]: 9.49978e-07 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.10999e-06 [interleave_parallel_branches]: 1.22e-06 [overlap_opt_shard_in_pipeline]: 1.07e-06 [overlap_opt_shard_grad_in_pipeline]: 1.67001e-06 [control_data_broadcast_order]: 1.488e-05 [grouped_pairwise_exchange_alltoall]: 1.46998e-06 [offloading_packed_experts]: 4.3e-06 [overlap_recompute_and_grad_model_parallel]: 5.39e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.40001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30999e-06 [overlap_recompute_comm]: 2.21e-06 [overlap_grad_ring_attention]: 4.79e-06 [overlap_grad_flash_sp]: 2.029e-05 [begin_end_overlap_inline]: 5.40022e-07 [split_matmul_comm_elemetwise]: 2.24999e-06 [split_layernorm_comm]: 2.14999e-06 [handle_group_info]: 9.39996e-07 [symbol_engine_optimizer]: 0.00013071, [1] [Cycle 1]: 0.00012638, [6] [build]: 2.86e-06 [elim_shapecalc]: 1.111e-05 [elim_not_effective]: 1.504e-05 [opt_reshape]: 8.02e-06 [fold_const_symbol]: 1.461e-05 [renormalize]: 2.30008e-07 [detach_backward]: 2.13998e-06 [pipeline_parallel_scheduler]: 1.77001e-06 [auto_monad_reorder]: 2.172e-05 [get_jit_bprop_graph]: 1.64998e-06 [rewriter_after_jit_bprop_graph]: 4.58999e-06 [opt_after_jit_grad]: 0.00049272 [validate]: 4.39e-05 Sums bootstrap : 0.000481s : 1.78% type_inference : 0.021963s : 81.34% event_method : 0.000015s : 0.05% auto_monad : 0.000061s : 0.23% graph_reusing : 0.000006s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000028s : 0.10% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000019s : 0.07% optimize.rewriter_before_opt_a : 0.000051s : 0.19% optimize.opt_a.expand_dump_flag : 0.000004s : 0.02% optimize.opt_a.switch_simplify : 0.000038s : 0.14% optimize.opt_a.loop_unroll : 0.000023s : 0.08% optimize.opt_a.a_1 : 0.000556s : 2.06% optimize.opt_a.with_stream_mark : 0.000026s : 0.10% optimize.opt_a.recompute_prepare : 0.000018s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000246s : 0.91% optimize.opt_a.accelerated_algorithm : 0.000032s : 0.12% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000016s : 0.06% optimize.opt_a.merge_send_recv : 0.000016s : 0.06% optimize.opt_a.auto_parallel : 0.000013s : 0.05% optimize.opt_a.parallel : 0.000025s : 0.09% optimize.opt_a.flash_sp : 0.000012s : 0.04% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000008s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.07% optimize.opt_a.virtual_dataset : 0.000015s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.05% optimize.opt_a.virtual_output : 0.000014s : 0.05% optimize.opt_a.merge_forward : 0.000008s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000024s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.04% optimize.opt_a.meta_fg_expand : 0.000007s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000021s : 0.08% optimize.opt_a.a_after_grad : 0.000022s : 0.08% optimize.opt_a.renormalize : 0.000636s : 2.36% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.03% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.09% optimize.opt_a.cse : 0.000057s : 0.21% optimize.opt_a.a_3 : 0.000098s : 0.36% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.04% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000040s : 0.15% optimize.convert_after_rewriter : 0.000008s : 0.03% optimize.order_py_execute_after_rewriter : 0.000007s : 0.02% optimize.mutable_eliminate : 0.000511s : 1.89% optimize.opt_b.b_1 : 0.000146s : 0.54% optimize.opt_b.b_2 : 0.000008s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000025s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000026s : 0.10% optimize.loop_unroll : 0.000439s : 1.63% optimize.opt_after_cconv.c_1 : 0.000034s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000024s : 0.09% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.06% optimize.tuple_transform.d_1 : 0.000048s : 0.18% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000057s : 0.21% optimize.cse_after_recomputation.cse : 0.000016s : 0.06% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000002s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000015s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000020s : 0.08% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.08% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000493s : 1.82% validate : 0.000044s : 0.16% Time group info: ------[substitution.] 0.000227 34 31.15% : 0.000071s : 4: substitution.arithmetic_simplify 8.50% : 0.000019s : 2: substitution.cast_eliminate 1.07% : 0.000002s : 3: substitution.elim_not_effective 0.85% : 0.000002s : 3: substitution.fold_const_symbol 2.80% : 0.000006s : 4: substitution.graph_param_transform 44.28% : 0.000101s : 2: substitution.inline 2.02% : 0.000005s : 6: substitution.j_node_and_user_rematch 5.40% : 0.000012s : 2: substitution.less_batch_normalization 2.48% : 0.000006s : 6: substitution.remove_not_recompute_node 1.44% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.021907 2 97.67% : 0.021397s : 1: type_inference.infer 2.33% : 0.000510s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000099 2 100.00% : 0.000099s : 2: match.inline ------[predicate.] 0.000169 980 0.85% : 0.000001s : 9: predicate.accumulaten_eliminater 1.22% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.73% : 0.000001s : 8: predicate.addn_check_dump 0.86% : 0.000001s : 9: predicate.addn_zero_filter 0.68% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.87% : 0.000005s : 17: predicate.arithmetic_simplify 0.83% : 0.000001s : 9: predicate.cast_eliminate 0.75% : 0.000001s : 8: predicate.check_bprop_eliminate 0.64% : 0.000001s : 8: predicate.compare_switch_simplify 0.23% : 0.000000s : 4: predicate.const_output_eliminate 0.78% : 0.000001s : 8: predicate.depend_value_elim 0.82% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.96% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.79% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.21% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 4: predicate.elim_not_effective 0.50% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.08% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 13: predicate.environ_get_depend_swap 1.75% : 0.000003s : 21: predicate.environ_get_eliminate 1.05% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.89% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.72% : 0.000003s : 11: predicate.float_depend_g_call 0.67% : 0.000001s : 8: predicate.float_environ_get_switch 0.98% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 4: predicate.fold_const_symbol 0.84% : 0.000001s : 8: predicate.get_grad_eliminate 0.27% : 0.000000s : 4: predicate.graph_param_transform 0.83% : 0.000001s : 8: predicate.incorporate_call 0.69% : 0.000001s : 8: predicate.incorporate_call_switch 6.49% : 0.000011s : 44: predicate.inline 1.03% : 0.000002s : 8: predicate.inline_without_move 0.41% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.16% : 0.000002s : 8: predicate.less_batch_normalization 1.60% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.21% : 0.000004s : 26: predicate.load_eliminater 1.29% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.50% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.65% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.77% : 0.000001s : 8: predicate.merge_addn 0.69% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.80% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.72% : 0.000001s : 9: predicate.minmaximum_grad 1.46% : 0.000002s : 4: predicate.mutable_eliminate 0.41% : 0.000001s : 4: predicate.opt_reshape 0.46% : 0.000001s : 4: predicate.parallel_virtual_node 1.19% : 0.000002s : 11: predicate.partial_defer_inline 1.33% : 0.000002s : 13: predicate.partial_eliminate 0.80% : 0.000001s : 9: predicate.print_const_string_wrapper 0.73% : 0.000001s : 8: predicate.reduce_all_const_elim 1.01% : 0.000002s : 9: predicate.reduce_eliminate 2.12% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.60% : 0.000001s : 8: predicate.remove_not_recompute_node 1.11% : 0.000002s : 17: predicate.replace_applicator 0.69% : 0.000001s : 8: predicate.replace_old_param 0.33% : 0.000001s : 4: predicate.reset_defer_inline 0.96% : 0.000002s : 9: predicate.reshape_eliminate 0.73% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.56% : 0.000001s : 4: predicate.row_tensor_eliminate 0.94% : 0.000002s : 8: predicate.same_eliminate 0.53% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.04% : 0.000002s : 8: predicate.shard_identity_eliminate 0.95% : 0.000002s : 8: predicate.special_op_eliminate 1.00% : 0.000002s : 8: predicate.specialize_transform 1.12% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.91% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.46% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.94% : 0.000002s : 11: predicate.switch_defer_inline 1.79% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.17% : 0.000007s : 39: predicate.switch_simplify 0.83% : 0.000001s : 9: predicate.tile_eliminate 0.82% : 0.000001s : 9: predicate.transpose_eliminate 1.56% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.04% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.58% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.65% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.52% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.12% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.94% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 4: predicate.value_based_eliminate 0.79% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.87% : 0.000001s : 8: predicate.virtual_output_eliminate 0.35% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.54% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000247 5 7.27% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.73% : 0.000229s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.041658 192 0.01% : 0.000004s : 1: ForceFp32Comm 8.40% : 0.003498s : 1: add_attr 8.37% : 0.003487s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.15% : 0.000061s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.16% : 0.000067s : 1: auto_monad 0.06% : 0.000026s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 1.23% : 0.000512s : 1: bootstrap 0.07% : 0.000029s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000018s : 1: control_data_broadcast_order 0.03% : 0.000012s : 1: convert_after_rewriter 0.07% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.05% : 0.000022s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 1.07% : 0.000447s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.25% : 0.000520s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.04% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000018s : 1: opt.transform.mutable_eliminate 2.62% : 0.001090s : 78: opt.transform.opt_a 0.08% : 0.000032s : 1: opt.transform.opt_after_cconv 0.07% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.29% : 0.000123s : 28: opt.transform.opt_b 0.13% : 0.000054s : 2: opt.transform.opt_trans_graph 0.11% : 0.000044s : 4: opt.transform.symbol_engine_opt 6.28% : 0.002617s : 1: opt_a 0.28% : 0.000116s : 1: opt_after_cconv 1.21% : 0.000502s : 1: opt_after_jit_grad 0.57% : 0.000239s : 1: opt_b 11.37% : 0.004737s : 1: optimize 0.05% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000010s : 1: order_py_execute_after_rewriter 0.06% : 0.000024s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000032s : 1: pre_auto_parallel 0.05% : 0.000022s : 1: py_interpret_to_execute 0.03% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000021s : 1: remove_dup_value 0.88% : 0.000368s : 1: renormalize.infer 0.62% : 0.000260s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000044s : 1: rewriter_after_opt_a 0.13% : 0.000055s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000134s : 1: symbol_engine_optimizer 0.20% : 0.000085s : 1: tuple_transform 52.78% : 0.021988s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:36.313.389 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:36.313.683 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0314821, [21] [bootstrap]: 0.00044624 [type_inference]: 0.00493743 [event_method]: 1.296e-05 [auto_monad]: 5.666e-05 [graph_reusing]: 5.71998e-06 [inline]: 2.09e-06 [add_attr]: 0.00329557, [1] [add_attr_with_inline]: 0.00328615, [1] [Cycle 1]: 6.81e-05, [2] [tag_attr]: 1.524e-05 [meta_addattr_fg_expand]: 4.04997e-06 [parallel-infer-symbol]: 3.29001e-06 [pre_auto_parallel]: 2.871e-05 [insert-virtual-dataset]: 2.41e-06 [parallel-infer-symbol-second]: 7.49977e-07 [dataset_repeat_opt]: 1.99e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.0215269, [53] [py_interpret_to_execute]: 2.158e-05 [rewriter_before_opt_a]: 5.493e-05 [opt_a]: 0.0188663, [2] [Cycle 1]: 0.0178051, [45] [expand_dump_flag]: 3.2e-06 [switch_simplify]: 2.593e-05 [loop_unroll]: 1.511e-05 [a_1]: 0.0003859 [with_stream_mark]: 1.724e-05 [recompute_prepare]: 9.56e-06 [updatestate_depend_eliminate]: 5.04e-06 [updatestate_assign_eliminate]: 3.97e-06 [updatestate_loads_eliminate]: 3.40998e-06 [parameter_eliminate]: 1.96e-06 [a_2]: 0.00014051 [accelerated_algorithm]: 2.089e-05 [shard]: 1.99e-06 [meta_shard_fg_expand]: 2.11e-06 [shard_inline]: 8.22e-06 [merge_send_recv]: 9.50001e-06 [auto_parallel]: 7.03998e-06 [parallel]: 1.9e-05 [flash_sp]: 8.06001e-06 [merge_comm]: 4.72e-06 [allreduce_fusion]: 4.50001e-06 [matmul_add_comm_reduction]: 1.151e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 9.25999e-06 [virtual_dataset]: 7.33e-06 [get_grad_eliminate_]: 7.78001e-06 [virtual_output]: 7.57998e-06 [merge_forward]: 4.61002e-06 [cell_reuse_recompute_pass]: 1.52001e-06 [offload_activation]: 1.104e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.702e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.273e-05 [set_forward_comm_id_for_comm_node_pass]: 4.84e-06 [meta_fg_expand]: 3.43999e-06 [flash_sp_send_recv_attached]: 4.90999e-06 [receive_attached]: 2.29001e-06 [after_resolve]: 1.216e-05 [a_after_grad]: 1.136e-05 [renormalize]: 0.0163701 [add_forward_monad_depend]: 1.114e-05 [auto_monad_grad]: 2.95002e-06 [auto_monad_eliminator]: 2.641e-05 [cse]: 4.192e-05 [a_3]: 8.594e-05 [Cycle 2]: 0.00104294, [45] [expand_dump_flag]: 2.72001e-06 [switch_simplify]: 1.082e-05 [loop_unroll]: 8.29998e-06 [a_1]: 0.00021853 [with_stream_mark]: 1.596e-05 [recompute_prepare]: 8.02998e-06 [updatestate_depend_eliminate]: 5.20001e-06 [updatestate_assign_eliminate]: 4.05e-06 [updatestate_loads_eliminate]: 3.91999e-06 [parameter_eliminate]: 1.74998e-06 [a_2]: 0.00012897 [accelerated_algorithm]: 1.303e-05 [shard]: 1.99e-06 [meta_shard_fg_expand]: 2.73998e-06 [shard_inline]: 7.78999e-06 [merge_send_recv]: 9.84001e-06 [auto_parallel]: 1.009e-05 [parallel]: 9.10999e-06 [flash_sp]: 4.15999e-06 [merge_comm]: 4.27e-06 [allreduce_fusion]: 4.76002e-06 [matmul_add_comm_reduction]: 1.095e-05 [allreduce_slice_to_reducescatter]: 7.49977e-07 [virtual_shard_identity]: 8.50999e-06 [virtual_dataset]: 7.48e-06 [get_grad_eliminate_]: 7.66001e-06 [virtual_output]: 7.13e-06 [merge_forward]: 4.97e-06 [cell_reuse_recompute_pass]: 2.98998e-06 [offload_activation]: 1.087e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.681e-05 [merge_recompute_call_nodes]: 1.81e-06 [before_grad]: 1.278e-05 [set_forward_comm_id_for_comm_node_pass]: 5.15999e-06 [meta_fg_expand]: 3.36001e-06 [flash_sp_send_recv_attached]: 1.77001e-06 [receive_attached]: 2.53e-06 [after_resolve]: 1.197e-05 [a_after_grad]: 1.093e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.71e-06 [auto_monad_grad]: 9.39996e-07 [auto_monad_eliminator]: 9.02999e-06 [cse]: 2.158e-05 [a_3]: 5.66e-05 [py_interpret_to_execute_after_opt_a]: 2.172e-05 [slice_cell_reuse_recomputed_activation]: 4.87e-06 [rewriter_after_opt_a]: 4.89e-05 [convert_after_rewriter]: 1.047e-05 [order_py_execute_after_rewriter]: 9.27999e-06 [mutable_eliminate]: 0.00074194 [opt_b]: 0.00031855, [1] [Cycle 1]: 0.00030809, [7] [b_1]: 0.00019378 [b_2]: 1.018e-05 [updatestate_depend_eliminate]: 8.06001e-06 [updatestate_assign_eliminate]: 3.6e-06 [updatestate_loads_eliminate]: 3.35e-06 [renormalize]: 6.00005e-07 [cse]: 2.674e-05 [optimize_parallel_all_gather_comm]: 2.306e-05 [overlap_param_gather]: 4.96002e-06 [cconv]: 3.073e-05 [loop_unroll]: 0.00048804 [opt_after_cconv]: 0.00014218, [1] [Cycle 1]: 0.00013224, [7] [c_1]: 3.59e-05 [parameter_eliminate]: 3.31001e-06 [updatestate_depend_eliminate]: 5.98002e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 3.25002e-06 [cse]: 2.326e-05 [renormalize]: 5.39992e-07 [remove_dup_value]: 2.132e-05 [tuple_transform]: 9.71e-05, [1] [Cycle 1]: 8.936e-05, [4] [d_1]: 4.906e-05 [none_parameter_eliminate]: 1.67999e-06 [renormalize]: 3.10014e-07 [switch_simplify]: 7.96001e-06 [partial_unused_args_eliminate]: 4.57e-06 [add_recomputation]: 5.809e-05 [cse_after_recomputation]: 3.34e-05, [1] [Cycle 1]: 2.624e-05, [1] [cse]: 1.693e-05 [environ_conv]: 9.39e-06 [swap_dp_allreduce_reducescatter]: 8.64e-06 [bias_add_comm_swap]: 5.22999e-06 [label_micro_interleaved_index]: 7.33e-06 [label_fine_grained_interleaved_index]: 5.57001e-06 [merge_cast_opt]: 4.08001e-06 [slice_recompute_activation]: 5.39e-06 [micro_interleaved_order_control]: 5.11002e-06 [assign_add_opt]: 3.73001e-06 [ForceFp32Comm]: 3.76999e-06 [remove_cast_before_assign_add]: 3.36001e-06 [full_micro_interleaved_order_control]: 4.52e-06 [reorder_send_recv_between_fp_bp]: 5.04e-06 [comm_op_add_attrs]: 3.56999e-06 [add_comm_op_reuse_tag]: 3.76999e-06 [interleave_split_concat_branches]: 3.81001e-06 [interleave_parallel_branches]: 3.45998e-06 [overlap_opt_shard_in_pipeline]: 3.51999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.12e-06 [control_data_broadcast_order]: 1.831e-05 [grouped_pairwise_exchange_alltoall]: 3.95998e-06 [offloading_packed_experts]: 7.26001e-06 [overlap_recompute_and_grad_model_parallel]: 7.67002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.56999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.67998e-06 [overlap_recompute_comm]: 4.90001e-06 [overlap_grad_ring_attention]: 6.94001e-06 [overlap_grad_flash_sp]: 2.651e-05 [begin_end_overlap_inline]: 2.88e-06 [split_matmul_comm_elemetwise]: 4.53001e-06 [split_layernorm_comm]: 4.42e-06 [handle_group_info]: 3.21999e-06 [symbol_engine_optimizer]: 0.00010767, [1] [Cycle 1]: 9.965e-05, [6] [build]: 4.29002e-06 [elim_shapecalc]: 1.227e-05 [elim_not_effective]: 1.598e-05 [opt_reshape]: 8.38999e-06 [fold_const_symbol]: 1.259e-05 [renormalize]: 1.8999e-07 [detach_backward]: 3.57997e-06 [pipeline_parallel_scheduler]: 1.61998e-06 [auto_monad_reorder]: 2.307e-05 [get_jit_bprop_graph]: 2.48e-06 [rewriter_after_jit_bprop_graph]: 4.80001e-06 [opt_after_jit_grad]: 0.00049857 [validate]: 4.367e-05 Sums bootstrap : 0.000446s : 1.69% type_inference : 0.004937s : 18.74% event_method : 0.000013s : 0.05% auto_monad : 0.000057s : 0.22% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000029s : 0.11% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000022s : 0.08% optimize.rewriter_before_opt_a : 0.000055s : 0.21% optimize.opt_a.expand_dump_flag : 0.000006s : 0.02% optimize.opt_a.switch_simplify : 0.000037s : 0.14% optimize.opt_a.loop_unroll : 0.000023s : 0.09% optimize.opt_a.a_1 : 0.000604s : 2.29% optimize.opt_a.with_stream_mark : 0.000033s : 0.13% optimize.opt_a.recompute_prepare : 0.000018s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000269s : 1.02% optimize.opt_a.accelerated_algorithm : 0.000034s : 0.13% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000016s : 0.06% optimize.opt_a.merge_send_recv : 0.000019s : 0.07% optimize.opt_a.auto_parallel : 0.000017s : 0.07% optimize.opt_a.parallel : 0.000028s : 0.11% optimize.opt_a.flash_sp : 0.000012s : 0.05% optimize.opt_a.merge_comm : 0.000009s : 0.03% optimize.opt_a.allreduce_fusion : 0.000009s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.07% optimize.opt_a.virtual_dataset : 0.000015s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.06% optimize.opt_a.virtual_output : 0.000015s : 0.06% optimize.opt_a.merge_forward : 0.000010s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.02% optimize.opt_a.offload_activation : 0.000022s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.13% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.04% optimize.opt_a.meta_fg_expand : 0.000007s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.03% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000024s : 0.09% optimize.opt_a.a_after_grad : 0.000022s : 0.08% optimize.opt_a.renormalize : 0.016370s : 62.13% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.05% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.13% optimize.opt_a.cse : 0.000063s : 0.24% optimize.opt_a.a_3 : 0.000143s : 0.54% optimize.py_interpret_to_execute_after_opt_a : 0.000022s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000049s : 0.19% optimize.convert_after_rewriter : 0.000010s : 0.04% optimize.order_py_execute_after_rewriter : 0.000009s : 0.04% optimize.mutable_eliminate : 0.000742s : 2.82% optimize.opt_b.b_1 : 0.000194s : 0.74% optimize.opt_b.b_2 : 0.000010s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.09% optimize.overlap_param_gather : 0.000005s : 0.02% optimize.cconv : 0.000031s : 0.12% optimize.loop_unroll : 0.000488s : 1.85% optimize.opt_after_cconv.c_1 : 0.000036s : 0.14% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000023s : 0.09% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000021s : 0.08% optimize.tuple_transform.d_1 : 0.000049s : 0.19% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000058s : 0.22% optimize.cse_after_recomputation.cse : 0.000017s : 0.06% optimize.environ_conv : 0.000009s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.03% optimize.bias_add_comm_swap : 0.000005s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.02% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.01% optimize.ForceFp32Comm : 0.000004s : 0.01% optimize.remove_cast_before_assign_add : 0.000003s : 0.01% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.02% optimize.comm_op_add_attrs : 0.000004s : 0.01% optimize.add_comm_op_reuse_tag : 0.000004s : 0.01% optimize.interleave_split_concat_branches : 0.000004s : 0.01% optimize.interleave_parallel_branches : 0.000003s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000018s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000007s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.01% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.03% optimize.overlap_grad_flash_sp : 0.000027s : 0.10% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000023s : 0.09% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.02% opt_after_jit_grad : 0.000499s : 1.89% validate : 0.000044s : 0.17% Time group info: ------[substitution.] 0.000257 34 34.52% : 0.000089s : 4: substitution.arithmetic_simplify 10.22% : 0.000026s : 2: substitution.cast_eliminate 0.96% : 0.000002s : 3: substitution.elim_not_effective 0.70% : 0.000002s : 3: substitution.fold_const_symbol 2.69% : 0.000007s : 4: substitution.graph_param_transform 39.23% : 0.000101s : 2: substitution.inline 2.00% : 0.000005s : 6: substitution.j_node_and_user_rematch 5.01% : 0.000013s : 2: substitution.less_batch_normalization 2.66% : 0.000007s : 6: substitution.remove_not_recompute_node 2.00% : 0.000005s : 2: substitution.replace_old_param ------[type_inference.] 0.004892 2 90.78% : 0.004441s : 1: type_inference.infer 9.22% : 0.000451s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000099 2 100.00% : 0.000099s : 2: match.inline ------[predicate.] 0.000187 980 0.94% : 0.000002s : 9: predicate.accumulaten_eliminater 0.75% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.65% : 0.000001s : 8: predicate.addn_check_dump 0.98% : 0.000002s : 9: predicate.addn_zero_filter 0.66% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 3.12% : 0.000006s : 17: predicate.arithmetic_simplify 0.81% : 0.000002s : 9: predicate.cast_eliminate 0.86% : 0.000002s : 8: predicate.check_bprop_eliminate 0.67% : 0.000001s : 8: predicate.compare_switch_simplify 0.20% : 0.000000s : 4: predicate.const_output_eliminate 0.95% : 0.000002s : 8: predicate.depend_value_elim 0.83% : 0.000002s : 9: predicate.dict_get_item_const_eliminator 1.11% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.76% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.10% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.32% : 0.000001s : 4: predicate.elim_not_effective 0.60% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000002s : 13: predicate.environ_add_const_eliminate 0.94% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.01% : 0.000002s : 13: predicate.environ_get_depend_swap 1.75% : 0.000003s : 21: predicate.environ_get_eliminate 0.98% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.84% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.89% : 0.000004s : 11: predicate.float_depend_g_call 0.66% : 0.000001s : 8: predicate.float_environ_get_switch 1.01% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 4: predicate.fold_const_symbol 1.02% : 0.000002s : 8: predicate.get_grad_eliminate 0.25% : 0.000000s : 4: predicate.graph_param_transform 0.78% : 0.000001s : 8: predicate.incorporate_call 0.60% : 0.000001s : 8: predicate.incorporate_call_switch 6.08% : 0.000011s : 44: predicate.inline 1.08% : 0.000002s : 8: predicate.inline_without_move 0.35% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.36% : 0.000003s : 8: predicate.less_batch_normalization 1.58% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.15% : 0.000004s : 26: predicate.load_eliminater 1.20% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.47% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.74% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.67% : 0.000001s : 8: predicate.merge_addn 0.70% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.74% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.68% : 0.000001s : 9: predicate.minmaximum_grad 1.12% : 0.000002s : 4: predicate.mutable_eliminate 0.46% : 0.000001s : 4: predicate.opt_reshape 0.58% : 0.000001s : 4: predicate.parallel_virtual_node 1.24% : 0.000002s : 11: predicate.partial_defer_inline 1.21% : 0.000002s : 13: predicate.partial_eliminate 0.81% : 0.000002s : 9: predicate.print_const_string_wrapper 0.73% : 0.000001s : 8: predicate.reduce_all_const_elim 1.14% : 0.000002s : 9: predicate.reduce_eliminate 2.05% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 8: predicate.remove_not_recompute_node 1.10% : 0.000002s : 17: predicate.replace_applicator 0.73% : 0.000001s : 8: predicate.replace_old_param 0.41% : 0.000001s : 4: predicate.reset_defer_inline 1.29% : 0.000002s : 9: predicate.reshape_eliminate 0.73% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.68% : 0.000001s : 4: predicate.row_tensor_eliminate 1.13% : 0.000002s : 8: predicate.same_eliminate 0.48% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.01% : 0.000002s : 8: predicate.shard_identity_eliminate 1.21% : 0.000002s : 8: predicate.special_op_eliminate 1.04% : 0.000002s : 8: predicate.specialize_transform 1.41% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.94% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.87% : 0.000002s : 11: predicate.switch_defer_inline 1.56% : 0.000003s : 19: predicate.switch_layer_defer_inline 3.96% : 0.000007s : 39: predicate.switch_simplify 0.72% : 0.000001s : 9: predicate.tile_eliminate 0.80% : 0.000002s : 9: predicate.transpose_eliminate 1.54% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.49% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.15% : 0.000006s : 25: predicate.tuple_list_get_item_eliminator 1.50% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.57% : 0.000005s : 25: predicate.tuple_list_set_item_eliminator 1.53% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.07% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 3.04% : 0.000006s : 34: predicate.updatestate_useless_node_eliminater 0.52% : 0.000001s : 4: predicate.value_based_eliminate 0.80% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.96% : 0.000002s : 8: predicate.virtual_output_eliminate 0.40% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000283 5 6.33% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 93.67% : 0.000265s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.073981 192 0.01% : 0.000007s : 1: ForceFp32Comm 4.47% : 0.003305s : 1: add_attr 4.45% : 0.003290s : 1: add_attr_with_inline 0.01% : 0.000006s : 1: add_comm_op_reuse_tag 0.08% : 0.000062s : 1: add_recomputation 0.01% : 0.000007s : 1: assign_add_opt 0.09% : 0.000065s : 1: auto_monad 0.04% : 0.000030s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000008s : 1: bias_add_comm_swap 0.66% : 0.000489s : 1: bootstrap 0.05% : 0.000034s : 1: cconv 0.01% : 0.000006s : 1: comm_op_add_attrs 0.03% : 0.000021s : 1: control_data_broadcast_order 0.02% : 0.000014s : 1: convert_after_rewriter 0.05% : 0.000036s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: dataset_repeat_opt 0.02% : 0.000018s : 1: detach_backward 0.02% : 0.000012s : 1: environ_conv 0.03% : 0.000022s : 1: event_method 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000009s : 1: get_jit_bprop_graph 0.02% : 0.000012s : 1: graph_reusing 0.01% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000006s : 1: handle_group_info 0.01% : 0.000008s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.01% : 0.000006s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000008s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.67% : 0.000494s : 1: loop_unroll 0.03% : 0.000019s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 1.01% : 0.000748s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000017s : 1: opt.transform.mutable_eliminate 1.52% : 0.001126s : 78: opt.transform.opt_a 0.05% : 0.000034s : 1: opt.transform.opt_after_cconv 0.04% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.17% : 0.000129s : 28: opt.transform.opt_b 0.07% : 0.000055s : 2: opt.transform.opt_trans_graph 0.06% : 0.000045s : 4: opt.transform.symbol_engine_opt 25.51% : 0.018870s : 1: opt_a 0.20% : 0.000146s : 1: opt_after_cconv 0.69% : 0.000509s : 1: opt_after_jit_grad 0.44% : 0.000322s : 1: opt_b 29.57% : 0.021880s : 1: optimize 0.04% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.04% : 0.000029s : 1: overlap_grad_flash_sp 0.01% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000009s : 1: overlap_param_gather 0.01% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000008s : 1: overlap_recompute_comm 0.01% : 0.000010s : 1: parallel-infer-symbol 0.01% : 0.000006s : 1: parallel-infer-symbol-second 0.01% : 0.000008s : 1: partial_unused_args_eliminate 0.01% : 0.000008s : 1: pipeline_parallel_scheduler 0.01% : 0.000007s : 1: pipeline_split 0.05% : 0.000036s : 1: pre_auto_parallel 0.03% : 0.000025s : 1: py_interpret_to_execute 0.03% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000006s : 1: remove_cast_before_assign_add 0.03% : 0.000025s : 1: remove_dup_value 21.55% : 0.015944s : 1: renormalize.infer 0.55% : 0.000410s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.07% : 0.000053s : 1: rewriter_after_opt_a 0.08% : 0.000059s : 1: rewriter_before_opt_a 0.01% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000008s : 1: slice_recompute_activation 0.01% : 0.000007s : 1: split_layernorm_comm 0.01% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000111s : 1: symbol_engine_optimizer 0.14% : 0.000100s : 1: tuple_transform 6.71% : 0.004966s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:36.690.954 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0168207, [21] [bootstrap]: 0.00046807 [type_inference]: 0.00578118 [event_method]: 1.475e-05 [auto_monad]: 5.904e-05 [graph_reusing]: 5.43002e-06 [inline]: 2.60002e-06 [add_attr]: 0.00386359, [1] [add_attr_with_inline]: 0.0038509, [1] [Cycle 1]: 6.67e-05, [2] [tag_attr]: 1.839e-05 [meta_addattr_fg_expand]: 4.32998e-06 [parallel-infer-symbol]: 3.78001e-06 [pre_auto_parallel]: 3.18e-05 [insert-virtual-dataset]: 2.83e-06 [parallel-infer-symbol-second]: 7.00005e-07 [dataset_repeat_opt]: 2.58003e-06 [pipeline_split]: 1.55001e-06 [optimize]: 0.00567971, [53] [py_interpret_to_execute]: 2.488e-05 [rewriter_before_opt_a]: 5.867e-05 [opt_a]: 0.00302324, [2] [Cycle 1]: 0.00213684, [45] [expand_dump_flag]: 3.20002e-06 [switch_simplify]: 2.911e-05 [loop_unroll]: 1.632e-05 [a_1]: 0.00042538 [with_stream_mark]: 1.907e-05 [recompute_prepare]: 1.282e-05 [updatestate_depend_eliminate]: 4.68001e-06 [updatestate_assign_eliminate]: 4.52998e-06 [updatestate_loads_eliminate]: 3.98001e-06 [parameter_eliminate]: 2.91999e-06 [a_2]: 0.00011974 [accelerated_algorithm]: 2.486e-05 [shard]: 2.70997e-06 [meta_shard_fg_expand]: 2.71e-06 [shard_inline]: 8.35999e-06 [merge_send_recv]: 9.56e-06 [auto_parallel]: 7.92e-06 [parallel]: 4.695e-05 [flash_sp]: 1.028e-05 [merge_comm]: 5.91e-06 [allreduce_fusion]: 4.27998e-06 [matmul_add_comm_reduction]: 1.358e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 1.334e-05 [virtual_dataset]: 8.42e-06 [get_grad_eliminate_]: 8.12998e-06 [virtual_output]: 7.4e-06 [merge_forward]: 4.83001e-06 [cell_reuse_recompute_pass]: 1.78002e-06 [offload_activation]: 1.148e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.823e-05 [merge_recompute_call_nodes]: 1.62999e-06 [before_grad]: 1.533e-05 [set_forward_comm_id_for_comm_node_pass]: 5.27001e-06 [meta_fg_expand]: 4.24997e-06 [flash_sp_send_recv_attached]: 5.07e-06 [receive_attached]: 2.43002e-06 [after_resolve]: 1.166e-05 [a_after_grad]: 1.247e-05 [renormalize]: 0.00080266 [add_forward_monad_depend]: 7.51999e-06 [auto_monad_grad]: 3.03998e-06 [auto_monad_eliminator]: 1.99e-05 [cse]: 3.992e-05 [a_3]: 6.507e-05 [Cycle 2]: 0.00087245, [45] [expand_dump_flag]: 2.24999e-06 [switch_simplify]: 9.57999e-06 [loop_unroll]: 7.8e-06 [a_1]: 0.00019676 [with_stream_mark]: 1.359e-05 [recompute_prepare]: 8.45001e-06 [updatestate_depend_eliminate]: 5.40999e-06 [updatestate_assign_eliminate]: 2.99001e-06 [updatestate_loads_eliminate]: 3.16999e-06 [parameter_eliminate]: 1.91998e-06 [a_2]: 0.00010248 [accelerated_algorithm]: 1.307e-05 [shard]: 1.32999e-06 [meta_shard_fg_expand]: 2.24001e-06 [shard_inline]: 8.53001e-06 [merge_send_recv]: 8.87999e-06 [auto_parallel]: 1.141e-05 [parallel]: 7.8e-06 [flash_sp]: 4.15e-06 [merge_comm]: 4.48999e-06 [allreduce_fusion]: 4.56002e-06 [matmul_add_comm_reduction]: 1.083e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 9.71e-06 [virtual_dataset]: 7.11001e-06 [get_grad_eliminate_]: 7.43999e-06 [virtual_output]: 7.11999e-06 [merge_forward]: 4.27e-06 [cell_reuse_recompute_pass]: 2.57001e-06 [offload_activation]: 9.41e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.712e-05 [merge_recompute_call_nodes]: 1.28002e-06 [before_grad]: 1.241e-05 [set_forward_comm_id_for_comm_node_pass]: 5.47001e-06 [meta_fg_expand]: 3.5e-06 [flash_sp_send_recv_attached]: 1.37e-06 [receive_attached]: 1.73002e-06 [after_resolve]: 1.251e-05 [a_after_grad]: 1.156e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.63998e-06 [auto_monad_grad]: 1.92001e-06 [auto_monad_eliminator]: 1.172e-05 [cse]: 2.473e-05 [a_3]: 4.716e-05 [py_interpret_to_execute_after_opt_a]: 1.547e-05 [slice_cell_reuse_recomputed_activation]: 2.68e-06 [rewriter_after_opt_a]: 4.507e-05 [convert_after_rewriter]: 8.15e-06 [order_py_execute_after_rewriter]: 6.31998e-06 [mutable_eliminate]: 0.00077955 [opt_b]: 0.00026646, [1] [Cycle 1]: 0.00025761, [7] [b_1]: 0.00015547 [b_2]: 9.92001e-06 [updatestate_depend_eliminate]: 9.99999e-06 [updatestate_assign_eliminate]: 3.43e-06 [updatestate_loads_eliminate]: 3.8e-06 [renormalize]: 6.19999e-07 [cse]: 3.681e-05 [optimize_parallel_all_gather_comm]: 2.142e-05 [overlap_param_gather]: 1.97999e-06 [cconv]: 3.451e-05 [loop_unroll]: 0.00059162 [opt_after_cconv]: 0.00013776, [1] [Cycle 1]: 0.00013046, [7] [c_1]: 3.768e-05 [parameter_eliminate]: 6.06998e-06 [updatestate_depend_eliminate]: 8.57e-06 [updatestate_assign_eliminate]: 3.74002e-06 [updatestate_loads_eliminate]: 3.06999e-06 [cse]: 3.279e-05 [renormalize]: 6.89994e-07 [remove_dup_value]: 1.974e-05 [tuple_transform]: 9.463e-05, [1] [Cycle 1]: 8.964e-05, [4] [d_1]: 5.795e-05 [none_parameter_eliminate]: 1.73002e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 9.14e-06 [partial_unused_args_eliminate]: 1.93002e-06 [add_recomputation]: 6.094e-05 [cse_after_recomputation]: 2.917e-05, [1] [Cycle 1]: 2.417e-05, [1] [cse]: 1.691e-05 [environ_conv]: 6.76e-06 [swap_dp_allreduce_reducescatter]: 6.20002e-06 [bias_add_comm_swap]: 2.78e-06 [label_micro_interleaved_index]: 5.95002e-06 [label_fine_grained_interleaved_index]: 2.68e-06 [merge_cast_opt]: 1.39e-06 [slice_recompute_activation]: 2.29001e-06 [micro_interleaved_order_control]: 2.24999e-06 [assign_add_opt]: 1.31002e-06 [ForceFp32Comm]: 9.10019e-07 [remove_cast_before_assign_add]: 1.07e-06 [full_micro_interleaved_order_control]: 2.24001e-06 [reorder_send_recv_between_fp_bp]: 2.61999e-06 [comm_op_add_attrs]: 1.32999e-06 [add_comm_op_reuse_tag]: 1.07998e-06 [interleave_split_concat_branches]: 1.14003e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.19e-06 [overlap_opt_shard_grad_in_pipeline]: 1.95001e-06 [control_data_broadcast_order]: 1.741e-05 [grouped_pairwise_exchange_alltoall]: 1.87001e-06 [offloading_packed_experts]: 4.95999e-06 [overlap_recompute_and_grad_model_parallel]: 6.41e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.72999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.38998e-06 [overlap_grad_ring_attention]: 4.82e-06 [overlap_grad_flash_sp]: 2.278e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.08998e-06 [split_layernorm_comm]: 1.57001e-06 [handle_group_info]: 1.26002e-06 [symbol_engine_optimizer]: 9.759e-05, [1] [Cycle 1]: 9.254e-05, [6] [build]: 4.10998e-06 [elim_shapecalc]: 1.517e-05 [elim_not_effective]: 1.702e-05 [opt_reshape]: 9.22999e-06 [fold_const_symbol]: 1.294e-05 [renormalize]: 1.99972e-07 [detach_backward]: 2.43e-06 [pipeline_parallel_scheduler]: 1.57001e-06 [auto_monad_reorder]: 2.333e-05 [get_jit_bprop_graph]: 1.92001e-06 [rewriter_after_jit_bprop_graph]: 6.09999e-06 [opt_after_jit_grad]: 0.00065069 [validate]: 4.932e-05 Sums bootstrap : 0.000468s : 3.95% type_inference : 0.005781s : 48.74% event_method : 0.000015s : 0.12% auto_monad : 0.000059s : 0.50% graph_reusing : 0.000005s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000032s : 0.27% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000003s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000025s : 0.21% optimize.rewriter_before_opt_a : 0.000059s : 0.49% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000039s : 0.33% optimize.opt_a.loop_unroll : 0.000024s : 0.20% optimize.opt_a.a_1 : 0.000622s : 5.24% optimize.opt_a.with_stream_mark : 0.000033s : 0.28% optimize.opt_a.recompute_prepare : 0.000021s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000005s : 0.04% optimize.opt_a.a_2 : 0.000222s : 1.87% optimize.opt_a.accelerated_algorithm : 0.000038s : 0.32% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000017s : 0.14% optimize.opt_a.merge_send_recv : 0.000018s : 0.16% optimize.opt_a.auto_parallel : 0.000019s : 0.16% optimize.opt_a.parallel : 0.000055s : 0.46% optimize.opt_a.flash_sp : 0.000014s : 0.12% optimize.opt_a.merge_comm : 0.000010s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.21% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.19% optimize.opt_a.virtual_dataset : 0.000016s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.13% optimize.opt_a.virtual_output : 0.000015s : 0.12% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.04% optimize.opt_a.offload_activation : 0.000021s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000028s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000008s : 0.07% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.05% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000024s : 0.20% optimize.opt_a.a_after_grad : 0.000024s : 0.20% optimize.opt_a.renormalize : 0.000803s : 6.77% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.09% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.27% optimize.opt_a.cse : 0.000065s : 0.55% optimize.opt_a.a_3 : 0.000112s : 0.95% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.02% optimize.rewriter_after_opt_a : 0.000045s : 0.38% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000780s : 6.57% optimize.opt_b.b_1 : 0.000155s : 1.31% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000037s : 0.31% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000035s : 0.29% optimize.loop_unroll : 0.000592s : 4.99% optimize.opt_after_cconv.c_1 : 0.000038s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000033s : 0.28% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000020s : 0.17% optimize.tuple_transform.d_1 : 0.000058s : 0.49% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000061s : 0.51% optimize.cse_after_recomputation.cse : 0.000017s : 0.14% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000006s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000017s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000023s : 0.19% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.13% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000023s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000651s : 5.49% validate : 0.000049s : 0.42% Time group info: ------[substitution.] 0.000274 34 30.26% : 0.000083s : 4: substitution.arithmetic_simplify 8.30% : 0.000023s : 2: substitution.cast_eliminate 0.90% : 0.000002s : 3: substitution.elim_not_effective 0.64% : 0.000002s : 3: substitution.fold_const_symbol 2.89% : 0.000008s : 4: substitution.graph_param_transform 45.26% : 0.000124s : 2: substitution.inline 2.15% : 0.000006s : 6: substitution.j_node_and_user_rematch 5.24% : 0.000014s : 2: substitution.less_batch_normalization 2.73% : 0.000007s : 6: substitution.remove_not_recompute_node 1.64% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.005727 2 91.20% : 0.005223s : 1: type_inference.infer 8.80% : 0.000504s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000122 2 100.00% : 0.000122s : 2: match.inline ------[predicate.] 0.000189 980 0.79% : 0.000001s : 9: predicate.accumulaten_eliminater 1.16% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.64% : 0.000001s : 8: predicate.addn_check_dump 0.84% : 0.000002s : 9: predicate.addn_zero_filter 0.67% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.88% : 0.000005s : 17: predicate.arithmetic_simplify 0.78% : 0.000001s : 9: predicate.cast_eliminate 0.65% : 0.000001s : 8: predicate.check_bprop_eliminate 0.61% : 0.000001s : 8: predicate.compare_switch_simplify 0.22% : 0.000000s : 4: predicate.const_output_eliminate 0.79% : 0.000001s : 8: predicate.depend_value_elim 0.78% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 1.13% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.74% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.31% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.33% : 0.000001s : 4: predicate.elim_not_effective 0.44% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 13: predicate.environ_get_add_eliminate 0.95% : 0.000002s : 13: predicate.environ_get_depend_swap 1.65% : 0.000003s : 21: predicate.environ_get_eliminate 0.90% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.87% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.91% : 0.000004s : 11: predicate.float_depend_g_call 0.66% : 0.000001s : 8: predicate.float_environ_get_switch 0.95% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 4: predicate.fold_const_symbol 0.83% : 0.000002s : 8: predicate.get_grad_eliminate 0.22% : 0.000000s : 4: predicate.graph_param_transform 0.76% : 0.000001s : 8: predicate.incorporate_call 0.59% : 0.000001s : 8: predicate.incorporate_call_switch 6.59% : 0.000012s : 44: predicate.inline 1.05% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.38% : 0.000003s : 8: predicate.less_batch_normalization 1.75% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.07% : 0.000004s : 26: predicate.load_eliminater 1.54% : 0.000003s : 4: predicate.loop_unroll_after_grad 1.50% : 0.000003s : 16: predicate.loop_unroll_before_grad 2.14% : 0.000004s : 17: predicate.make_slice_get_slice_eliminator 0.73% : 0.000001s : 8: predicate.merge_addn 0.67% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.70% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.70% : 0.000001s : 9: predicate.minmaximum_grad 1.77% : 0.000003s : 4: predicate.mutable_eliminate 0.47% : 0.000001s : 4: predicate.opt_reshape 0.54% : 0.000001s : 4: predicate.parallel_virtual_node 1.27% : 0.000002s : 11: predicate.partial_defer_inline 1.13% : 0.000002s : 13: predicate.partial_eliminate 0.92% : 0.000002s : 9: predicate.print_const_string_wrapper 0.76% : 0.000001s : 8: predicate.reduce_all_const_elim 1.09% : 0.000002s : 9: predicate.reduce_eliminate 2.09% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 8: predicate.remove_not_recompute_node 1.07% : 0.000002s : 17: predicate.replace_applicator 0.62% : 0.000001s : 8: predicate.replace_old_param 0.40% : 0.000001s : 4: predicate.reset_defer_inline 0.95% : 0.000002s : 9: predicate.reshape_eliminate 0.77% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 4: predicate.row_tensor_eliminate 1.14% : 0.000002s : 8: predicate.same_eliminate 0.47% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.92% : 0.000002s : 8: predicate.shard_identity_eliminate 0.81% : 0.000002s : 8: predicate.special_op_eliminate 0.92% : 0.000002s : 8: predicate.specialize_transform 1.29% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 1.03% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.95% : 0.000002s : 11: predicate.switch_defer_inline 1.53% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.05% : 0.000008s : 39: predicate.switch_simplify 0.71% : 0.000001s : 9: predicate.tile_eliminate 0.86% : 0.000002s : 9: predicate.transpose_eliminate 1.59% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.48% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.32% : 0.000006s : 25: predicate.tuple_list_get_item_eliminator 1.82% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.67% : 0.000005s : 25: predicate.tuple_list_set_item_eliminator 1.47% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 1.83% : 0.000003s : 26: predicate.updatestate_pure_node_eliminater 2.89% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.63% : 0.000001s : 4: predicate.value_based_eliminate 0.82% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.78% : 0.000001s : 8: predicate.virtual_output_eliminate 0.33% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000269 5 8.22% : 0.000022s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.78% : 0.000247s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028547 192 0.01% : 0.000004s : 1: ForceFp32Comm 13.56% : 0.003870s : 1: add_attr 13.50% : 0.003855s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000066s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.23% : 0.000065s : 1: auto_monad 0.10% : 0.000028s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.74% : 0.000497s : 1: bootstrap 0.13% : 0.000038s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000021s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.11% : 0.000032s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.07% : 0.000021s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 2.11% : 0.000603s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.77% : 0.000792s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.12% : 0.000034s : 1: opt.transform.loop_unroll_optimizer 0.09% : 0.000026s : 1: opt.transform.mutable_eliminate 4.07% : 0.001162s : 78: opt.transform.opt_a 0.13% : 0.000036s : 1: opt.transform.opt_after_cconv 0.12% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.45% : 0.000129s : 28: opt.transform.opt_b 0.22% : 0.000064s : 2: opt.transform.opt_trans_graph 0.17% : 0.000049s : 4: opt.transform.symbol_engine_opt 10.61% : 0.003028s : 1: opt_a 0.50% : 0.000142s : 1: opt_after_cconv 2.32% : 0.000663s : 1: opt_after_jit_grad 0.95% : 0.000271s : 1: opt_b 19.92% : 0.005686s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000010s : 1: order_py_execute_after_rewriter 0.09% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.13% : 0.000036s : 1: pre_auto_parallel 0.10% : 0.000028s : 1: py_interpret_to_execute 0.07% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000024s : 1: remove_dup_value 1.62% : 0.000462s : 1: renormalize.infer 1.16% : 0.000331s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000051s : 1: rewriter_after_opt_a 0.22% : 0.000063s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000100s : 1: symbol_engine_optimizer 0.34% : 0.000098s : 1: tuple_transform 20.34% : 0.005805s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:36.921.354 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:36.921.678 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0179609, [21] [bootstrap]: 0.00047353 [type_inference]: 0.00578676 [event_method]: 1.502e-05 [auto_monad]: 6.627e-05 [graph_reusing]: 5.81e-06 [inline]: 3.2e-06 [add_attr]: 0.00395788, [1] [add_attr_with_inline]: 0.00394648, [1] [Cycle 1]: 7.379e-05, [2] [tag_attr]: 1.611e-05 [meta_addattr_fg_expand]: 4.10998e-06 [parallel-infer-symbol]: 4.02e-06 [pre_auto_parallel]: 3.155e-05 [insert-virtual-dataset]: 2.37001e-06 [parallel-infer-symbol-second]: 6.69999e-07 [dataset_repeat_opt]: 1.89e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.00621775, [53] [py_interpret_to_execute]: 2.6e-05 [rewriter_before_opt_a]: 6.216e-05 [opt_a]: 0.00331262, [2] [Cycle 1]: 0.0022654, [45] [expand_dump_flag]: 2.53e-06 [switch_simplify]: 2.792e-05 [loop_unroll]: 1.741e-05 [a_1]: 0.00041971 [with_stream_mark]: 2.037e-05 [recompute_prepare]: 1.08e-05 [updatestate_depend_eliminate]: 5.25999e-06 [updatestate_assign_eliminate]: 4.37998e-06 [updatestate_loads_eliminate]: 4.19002e-06 [parameter_eliminate]: 1.94e-06 [a_2]: 0.00015754 [accelerated_algorithm]: 2.229e-05 [shard]: 2.17001e-06 [meta_shard_fg_expand]: 2.31e-06 [shard_inline]: 8.50999e-06 [merge_send_recv]: 9.92001e-06 [auto_parallel]: 8.40999e-06 [parallel]: 2.018e-05 [flash_sp]: 8.67e-06 [merge_comm]: 4.58999e-06 [allreduce_fusion]: 4.07e-06 [matmul_add_comm_reduction]: 1.279e-05 [allreduce_slice_to_reducescatter]: 8.59989e-07 [virtual_shard_identity]: 9.84001e-06 [virtual_dataset]: 8.06001e-06 [get_grad_eliminate_]: 7.24001e-06 [virtual_output]: 8.13999e-06 [merge_forward]: 5.56e-06 [cell_reuse_recompute_pass]: 2.51e-06 [offload_activation]: 1.046e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.844e-05 [merge_recompute_call_nodes]: 1.40001e-06 [before_grad]: 1.435e-05 [set_forward_comm_id_for_comm_node_pass]: 4.94003e-06 [meta_fg_expand]: 3.7e-06 [flash_sp_send_recv_attached]: 4.92999e-06 [receive_attached]: 2.26998e-06 [after_resolve]: 1.298e-05 [a_after_grad]: 1.376e-05 [renormalize]: 0.00079437 [add_forward_monad_depend]: 6.24999e-06 [auto_monad_grad]: 2.79001e-06 [auto_monad_eliminator]: 1.905e-05 [cse]: 3.761e-05 [a_3]: 8.327e-05 [Cycle 2]: 0.00103219, [45] [expand_dump_flag]: 1.52001e-06 [switch_simplify]: 1.024e-05 [loop_unroll]: 8.27e-06 [a_1]: 0.00020181 [with_stream_mark]: 1.191e-05 [recompute_prepare]: 8.48999e-06 [updatestate_depend_eliminate]: 4.86997e-06 [updatestate_assign_eliminate]: 3.43999e-06 [updatestate_loads_eliminate]: 3.35998e-06 [parameter_eliminate]: 1.33002e-06 [a_2]: 0.00013217 [accelerated_algorithm]: 1.253e-05 [shard]: 1.32e-06 [meta_shard_fg_expand]: 2.91e-06 [shard_inline]: 8.08001e-06 [merge_send_recv]: 8.50001e-06 [auto_parallel]: 8.33001e-06 [parallel]: 6.51e-06 [flash_sp]: 4.44002e-06 [merge_comm]: 5.07e-06 [allreduce_fusion]: 4.38999e-06 [matmul_add_comm_reduction]: 9.87001e-06 [allreduce_slice_to_reducescatter]: 4.60015e-07 [virtual_shard_identity]: 8.84e-06 [virtual_dataset]: 7.58001e-06 [get_grad_eliminate_]: 6.94999e-06 [virtual_output]: 7.06999e-06 [merge_forward]: 4.36002e-06 [cell_reuse_recompute_pass]: 2.29999e-06 [offload_activation]: 9.94001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.644e-05 [merge_recompute_call_nodes]: 1.50001e-06 [before_grad]: 1.252e-05 [set_forward_comm_id_for_comm_node_pass]: 6.01e-06 [meta_fg_expand]: 3.46999e-06 [flash_sp_send_recv_attached]: 1.30999e-06 [receive_attached]: 1.52999e-06 [after_resolve]: 1.125e-05 [a_after_grad]: 1.124e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.44001e-06 [auto_monad_grad]: 1.55999e-06 [auto_monad_eliminator]: 1.159e-05 [cse]: 2.724e-05 [a_3]: 5.969e-05 [py_interpret_to_execute_after_opt_a]: 1.834e-05 [slice_cell_reuse_recomputed_activation]: 5.33002e-06 [rewriter_after_opt_a]: 4.937e-05 [convert_after_rewriter]: 1.05e-05 [order_py_execute_after_rewriter]: 1.007e-05 [mutable_eliminate]: 0.00077346 [opt_b]: 0.00032944, [1] [Cycle 1]: 0.00031807, [7] [b_1]: 0.00020046 [b_2]: 9.86e-06 [updatestate_depend_eliminate]: 7.94997e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 3.38999e-06 [renormalize]: 6.79982e-07 [cse]: 3.365e-05 [optimize_parallel_all_gather_comm]: 2.384e-05 [overlap_param_gather]: 4.72e-06 [cconv]: 3.692e-05 [loop_unroll]: 0.00063776 [opt_after_cconv]: 0.0001517, [1] [Cycle 1]: 0.00014213, [7] [c_1]: 3.76e-05 [parameter_eliminate]: 5.07999e-06 [updatestate_depend_eliminate]: 7.70998e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 4.23999e-06 [cse]: 2.807e-05 [renormalize]: 3.19997e-07 [remove_dup_value]: 2.155e-05 [tuple_transform]: 0.00010752, [1] [Cycle 1]: 9.963e-05, [4] [d_1]: 5.657e-05 [none_parameter_eliminate]: 1.76e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 8.92999e-06 [partial_unused_args_eliminate]: 5.15001e-06 [add_recomputation]: 6.139e-05 [cse_after_recomputation]: 3.437e-05, [1] [Cycle 1]: 2.719e-05, [1] [cse]: 1.759e-05 [environ_conv]: 9.93002e-06 [swap_dp_allreduce_reducescatter]: 8.74e-06 [bias_add_comm_swap]: 5.55001e-06 [label_micro_interleaved_index]: 8.2e-06 [label_fine_grained_interleaved_index]: 5.66003e-06 [merge_cast_opt]: 4.43999e-06 [slice_recompute_activation]: 4.73001e-06 [micro_interleaved_order_control]: 4.75001e-06 [assign_add_opt]: 3.72002e-06 [ForceFp32Comm]: 3.3e-06 [remove_cast_before_assign_add]: 4.32e-06 [full_micro_interleaved_order_control]: 4.94e-06 [reorder_send_recv_between_fp_bp]: 5.17e-06 [comm_op_add_attrs]: 3.98999e-06 [add_comm_op_reuse_tag]: 3.3e-06 [interleave_split_concat_branches]: 3.7e-06 [interleave_parallel_branches]: 3.56001e-06 [overlap_opt_shard_in_pipeline]: 3.69002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.50999e-06 [control_data_broadcast_order]: 1.986e-05 [grouped_pairwise_exchange_alltoall]: 3.99002e-06 [offloading_packed_experts]: 7.55e-06 [overlap_recompute_and_grad_model_parallel]: 8.17e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.08999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.85998e-06 [overlap_recompute_comm]: 5.35999e-06 [overlap_grad_ring_attention]: 7.25998e-06 [overlap_grad_flash_sp]: 2.502e-05 [begin_end_overlap_inline]: 3.26001e-06 [split_matmul_comm_elemetwise]: 4.41002e-06 [split_layernorm_comm]: 4.07e-06 [handle_group_info]: 3.92998e-06 [symbol_engine_optimizer]: 0.00011516, [1] [Cycle 1]: 0.0001075, [6] [build]: 3.76999e-06 [elim_shapecalc]: 1.377e-05 [elim_not_effective]: 1.833e-05 [opt_reshape]: 8.93002e-06 [fold_const_symbol]: 1.304e-05 [renormalize]: 3.00002e-07 [detach_backward]: 4.37e-06 [pipeline_parallel_scheduler]: 1.76e-06 [auto_monad_reorder]: 2.436e-05 [get_jit_bprop_graph]: 1.96e-06 [rewriter_after_jit_bprop_graph]: 6.06e-06 [opt_after_jit_grad]: 0.00066408 [validate]: 4.847e-05 Sums bootstrap : 0.000474s : 3.91% type_inference : 0.005787s : 47.75% event_method : 0.000015s : 0.12% auto_monad : 0.000066s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.13% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.03% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000032s : 0.26% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000026s : 0.21% optimize.rewriter_before_opt_a : 0.000062s : 0.51% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000038s : 0.31% optimize.opt_a.loop_unroll : 0.000026s : 0.21% optimize.opt_a.a_1 : 0.000622s : 5.13% optimize.opt_a.with_stream_mark : 0.000032s : 0.27% optimize.opt_a.recompute_prepare : 0.000019s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000290s : 2.39% optimize.opt_a.accelerated_algorithm : 0.000035s : 0.29% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000017s : 0.14% optimize.opt_a.merge_send_recv : 0.000018s : 0.15% optimize.opt_a.auto_parallel : 0.000017s : 0.14% optimize.opt_a.parallel : 0.000027s : 0.22% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.19% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.15% optimize.opt_a.virtual_dataset : 0.000016s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.04% optimize.opt_a.offload_activation : 0.000020s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.05% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.20% optimize.opt_a.a_after_grad : 0.000025s : 0.21% optimize.opt_a.renormalize : 0.000794s : 6.56% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.25% optimize.opt_a.cse : 0.000065s : 0.54% optimize.opt_a.a_3 : 0.000143s : 1.18% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000049s : 0.41% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000010s : 0.08% optimize.mutable_eliminate : 0.000773s : 6.38% optimize.opt_b.b_1 : 0.000200s : 1.65% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000034s : 0.28% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000037s : 0.30% optimize.loop_unroll : 0.000638s : 5.26% optimize.opt_after_cconv.c_1 : 0.000038s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000028s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000022s : 0.18% optimize.tuple_transform.d_1 : 0.000057s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000061s : 0.51% optimize.cse_after_recomputation.cse : 0.000018s : 0.15% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000020s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000025s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000664s : 5.48% validate : 0.000048s : 0.40% Time group info: ------[substitution.] 0.000260 34 29.57% : 0.000077s : 4: substitution.arithmetic_simplify 8.42% : 0.000022s : 2: substitution.cast_eliminate 1.06% : 0.000003s : 3: substitution.elim_not_effective 0.68% : 0.000002s : 3: substitution.fold_const_symbol 2.83% : 0.000007s : 4: substitution.graph_param_transform 45.93% : 0.000120s : 2: substitution.inline 2.00% : 0.000005s : 6: substitution.j_node_and_user_rematch 4.88% : 0.000013s : 2: substitution.less_batch_normalization 2.88% : 0.000007s : 6: substitution.remove_not_recompute_node 1.75% : 0.000005s : 2: substitution.replace_old_param ------[type_inference.] 0.005736 2 90.95% : 0.005217s : 1: type_inference.infer 9.05% : 0.000519s : 1: type_inference.specialize ------[replace.] 0.000024 2 100.00% : 0.000024s : 2: replace.inline ------[match.] 0.000118 2 100.00% : 0.000118s : 2: match.inline ------[predicate.] 0.000195 980 0.79% : 0.000002s : 9: predicate.accumulaten_eliminater 1.16% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.77% : 0.000002s : 8: predicate.addn_check_dump 0.83% : 0.000002s : 9: predicate.addn_zero_filter 0.62% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 3.03% : 0.000006s : 17: predicate.arithmetic_simplify 0.91% : 0.000002s : 9: predicate.cast_eliminate 0.78% : 0.000002s : 8: predicate.check_bprop_eliminate 0.68% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.90% : 0.000002s : 8: predicate.depend_value_elim 0.80% : 0.000002s : 9: predicate.dict_get_item_const_eliminator 1.09% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.73% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.21% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.35% : 0.000001s : 4: predicate.elim_not_effective 0.52% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.29% : 0.000003s : 13: predicate.environ_add_const_eliminate 1.11% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.00% : 0.000002s : 13: predicate.environ_get_depend_swap 1.89% : 0.000004s : 21: predicate.environ_get_eliminate 0.95% : 0.000002s : 13: predicate.environ_get_set_eliminate 1.03% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.70% : 0.000003s : 11: predicate.float_depend_g_call 0.72% : 0.000001s : 8: predicate.float_environ_get_switch 1.10% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.72% : 0.000001s : 8: predicate.get_grad_eliminate 0.27% : 0.000001s : 4: predicate.graph_param_transform 0.79% : 0.000002s : 8: predicate.incorporate_call 0.59% : 0.000001s : 8: predicate.incorporate_call_switch 6.05% : 0.000012s : 44: predicate.inline 1.03% : 0.000002s : 8: predicate.inline_without_move 0.33% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.25% : 0.000002s : 8: predicate.less_batch_normalization 1.97% : 0.000004s : 17: predicate.list_to_tuple_eliminator_ 1.99% : 0.000004s : 26: predicate.load_eliminater 1.37% : 0.000003s : 4: predicate.loop_unroll_after_grad 1.54% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.67% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.73% : 0.000001s : 8: predicate.merge_addn 0.79% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.68% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 9: predicate.minmaximum_grad 1.28% : 0.000002s : 4: predicate.mutable_eliminate 0.48% : 0.000001s : 4: predicate.opt_reshape 0.51% : 0.000001s : 4: predicate.parallel_virtual_node 1.16% : 0.000002s : 11: predicate.partial_defer_inline 1.15% : 0.000002s : 13: predicate.partial_eliminate 0.76% : 0.000001s : 9: predicate.print_const_string_wrapper 0.79% : 0.000002s : 8: predicate.reduce_all_const_elim 1.26% : 0.000002s : 9: predicate.reduce_eliminate 2.07% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 8: predicate.remove_not_recompute_node 1.18% : 0.000002s : 17: predicate.replace_applicator 0.56% : 0.000001s : 8: predicate.replace_old_param 0.31% : 0.000001s : 4: predicate.reset_defer_inline 0.87% : 0.000002s : 9: predicate.reshape_eliminate 0.87% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.54% : 0.000001s : 4: predicate.row_tensor_eliminate 0.85% : 0.000002s : 8: predicate.same_eliminate 0.52% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.04% : 0.000002s : 8: predicate.shard_identity_eliminate 0.99% : 0.000002s : 8: predicate.special_op_eliminate 0.90% : 0.000002s : 8: predicate.specialize_transform 1.31% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.98% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.90% : 0.000002s : 11: predicate.switch_defer_inline 1.74% : 0.000003s : 19: predicate.switch_layer_defer_inline 3.82% : 0.000007s : 39: predicate.switch_simplify 0.72% : 0.000001s : 9: predicate.tile_eliminate 0.84% : 0.000002s : 9: predicate.transpose_eliminate 1.67% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.84% : 0.000004s : 17: predicate.tuple_list_get_item_depend_reorder 3.28% : 0.000006s : 25: predicate.tuple_list_get_item_eliminator 1.60% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.56% : 0.000005s : 25: predicate.tuple_list_set_item_eliminator 1.51% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.04% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.84% : 0.000006s : 34: predicate.updatestate_useless_node_eliminater 0.48% : 0.000001s : 4: predicate.value_based_eliminate 0.77% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.75% : 0.000001s : 8: predicate.virtual_output_eliminate 0.31% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.62% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000272 5 7.88% : 0.000021s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.12% : 0.000250s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030306 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.10% : 0.003969s : 1: add_attr 13.04% : 0.003950s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000065s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000076s : 1: auto_monad 0.10% : 0.000032s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.72% : 0.000521s : 1: bootstrap 0.13% : 0.000040s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000023s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000038s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000019s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.09% : 0.000026s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.03% : 0.000010s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 2.13% : 0.000645s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 2.58% : 0.000781s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000021s : 1: opt.transform.mutable_eliminate 3.87% : 0.001172s : 78: opt.transform.opt_a 0.12% : 0.000036s : 1: opt.transform.opt_after_cconv 0.11% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.44% : 0.000133s : 28: opt.transform.opt_b 0.21% : 0.000063s : 2: opt.transform.opt_trans_graph 0.16% : 0.000050s : 4: opt.transform.symbol_engine_opt 10.94% : 0.003317s : 1: opt_a 0.51% : 0.000155s : 1: opt_after_cconv 2.23% : 0.000676s : 1: opt_after_jit_grad 1.10% : 0.000334s : 1: opt_b 21.78% : 0.006600s : 1: optimize 0.09% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.09% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.13% : 0.000039s : 1: pre_auto_parallel 0.10% : 0.000029s : 1: py_interpret_to_execute 0.07% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.08% : 0.000025s : 1: remove_dup_value 1.48% : 0.000449s : 1: renormalize.infer 1.11% : 0.000335s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000054s : 1: rewriter_after_opt_a 0.22% : 0.000066s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000118s : 1: symbol_engine_optimizer 0.36% : 0.000111s : 1: tuple_transform 19.23% : 0.005828s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:37.144.944 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.013987, [21] [bootstrap]: 0.00047457 [type_inference]: 0.00492246 [event_method]: 1.219e-05 [auto_monad]: 5.773e-05 [graph_reusing]: 5.30001e-06 [inline]: 1.91998e-06 [add_attr]: 0.00314321, [1] [add_attr_with_inline]: 0.00313436, [1] [Cycle 1]: 5.089e-05, [2] [tag_attr]: 1.494e-05 [meta_addattr_fg_expand]: 4.18001e-06 [parallel-infer-symbol]: 2.86e-06 [pre_auto_parallel]: 2.797e-05 [insert-virtual-dataset]: 2.39001e-06 [parallel-infer-symbol-second]: 1.16997e-06 [dataset_repeat_opt]: 2.32999e-06 [pipeline_split]: 1.59998e-06 [optimize]: 0.00463327, [53] [py_interpret_to_execute]: 1.859e-05 [rewriter_before_opt_a]: 5.061e-05 [opt_a]: 0.00252113, [2] [Cycle 1]: 0.00175539, [45] [expand_dump_flag]: 2.68e-06 [switch_simplify]: 2.767e-05 [loop_unroll]: 1.532e-05 [a_1]: 0.00038444 [with_stream_mark]: 1.635e-05 [recompute_prepare]: 1.091e-05 [updatestate_depend_eliminate]: 4.75001e-06 [updatestate_assign_eliminate]: 3.93001e-06 [updatestate_loads_eliminate]: 3.78999e-06 [parameter_eliminate]: 2.28998e-06 [a_2]: 0.00011007 [accelerated_algorithm]: 2.192e-05 [shard]: 1.77001e-06 [meta_shard_fg_expand]: 2.14e-06 [shard_inline]: 8.08001e-06 [merge_send_recv]: 8.90999e-06 [auto_parallel]: 6.41e-06 [parallel]: 1.952e-05 [flash_sp]: 7.68001e-06 [merge_comm]: 4.70999e-06 [allreduce_fusion]: 4.21001e-06 [matmul_add_comm_reduction]: 1.184e-05 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 9.76e-06 [virtual_dataset]: 7.22002e-06 [get_grad_eliminate_]: 7.4e-06 [virtual_output]: 7.48e-06 [merge_forward]: 4.77e-06 [cell_reuse_recompute_pass]: 1.43002e-06 [offload_activation]: 1.066e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.468e-05 [merge_recompute_call_nodes]: 1.86003e-06 [before_grad]: 1.342e-05 [set_forward_comm_id_for_comm_node_pass]: 4.43999e-06 [meta_fg_expand]: 3.40998e-06 [flash_sp_send_recv_attached]: 4.86002e-06 [receive_attached]: 2.36998e-06 [after_resolve]: 1.122e-05 [a_after_grad]: 1.138e-05 [renormalize]: 0.00059985 [add_forward_monad_depend]: 5.35001e-06 [auto_monad_grad]: 2.34001e-06 [auto_monad_eliminator]: 1.683e-05 [cse]: 3.72e-05 [a_3]: 5.564e-05 [Cycle 2]: 0.000756, [45] [expand_dump_flag]: 1.09e-06 [switch_simplify]: 8.46002e-06 [loop_unroll]: 7e-06 [a_1]: 0.00017081 [with_stream_mark]: 1.083e-05 [recompute_prepare]: 7.95998e-06 [updatestate_depend_eliminate]: 4.27e-06 [updatestate_assign_eliminate]: 2.99999e-06 [updatestate_loads_eliminate]: 2.71e-06 [parameter_eliminate]: 1.04e-06 [a_2]: 9.865e-05 [accelerated_algorithm]: 1.06e-05 [shard]: 9.30013e-07 [meta_shard_fg_expand]: 1.86e-06 [shard_inline]: 7.62998e-06 [merge_send_recv]: 5.91e-06 [auto_parallel]: 6.61999e-06 [parallel]: 4.94e-06 [flash_sp]: 3.6e-06 [merge_comm]: 4.08999e-06 [allreduce_fusion]: 4.1e-06 [matmul_add_comm_reduction]: 7.9e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 7.55998e-06 [virtual_dataset]: 6.93e-06 [get_grad_eliminate_]: 6.92002e-06 [virtual_output]: 6.40002e-06 [merge_forward]: 3.43e-06 [cell_reuse_recompute_pass]: 1.36002e-06 [offload_activation]: 7.16001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.367e-05 [merge_recompute_call_nodes]: 8.89995e-07 [before_grad]: 1.12e-05 [set_forward_comm_id_for_comm_node_pass]: 4.75001e-06 [meta_fg_expand]: 2.76e-06 [flash_sp_send_recv_attached]: 9.30013e-07 [receive_attached]: 1.15999e-06 [after_resolve]: 1.027e-05 [a_after_grad]: 9.99001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.94999e-06 [auto_monad_grad]: 9.39996e-07 [auto_monad_eliminator]: 8.45001e-06 [cse]: 2.023e-05 [a_3]: 4.37e-05 [py_interpret_to_execute_after_opt_a]: 9.34998e-06 [slice_cell_reuse_recomputed_activation]: 1.87001e-06 [rewriter_after_opt_a]: 4.047e-05 [convert_after_rewriter]: 7.27002e-06 [order_py_execute_after_rewriter]: 5.66e-06 [mutable_eliminate]: 0.0005014 [opt_b]: 0.00026137, [1] [Cycle 1]: 0.00025485, [7] [b_1]: 0.00016243 [b_2]: 9.44e-06 [updatestate_depend_eliminate]: 7.11001e-06 [updatestate_assign_eliminate]: 3.48999e-06 [updatestate_loads_eliminate]: 3.6e-06 [renormalize]: 5.89993e-07 [cse]: 2.862e-05 [optimize_parallel_all_gather_comm]: 1.779e-05 [overlap_param_gather]: 1.91e-06 [cconv]: 2.827e-05 [loop_unroll]: 0.0004572 [opt_after_cconv]: 0.00011684, [1] [Cycle 1]: 0.00011117, [7] [c_1]: 3.431e-05 [parameter_eliminate]: 3.5e-06 [updatestate_depend_eliminate]: 6.97002e-06 [updatestate_assign_eliminate]: 3.24001e-06 [updatestate_loads_eliminate]: 3.33998e-06 [cse]: 2.517e-05 [renormalize]: 5.69999e-07 [remove_dup_value]: 1.85e-05 [tuple_transform]: 8.354e-05, [1] [Cycle 1]: 7.904e-05, [4] [d_1]: 5.064e-05 [none_parameter_eliminate]: 1.55001e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 7.82e-06 [partial_unused_args_eliminate]: 1.76003e-06 [add_recomputation]: 5.703e-05 [cse_after_recomputation]: 2.636e-05, [1] [Cycle 1]: 2.147e-05, [1] [cse]: 1.602e-05 [environ_conv]: 7.18e-06 [swap_dp_allreduce_reducescatter]: 6.10002e-06 [bias_add_comm_swap]: 2.75997e-06 [label_micro_interleaved_index]: 4.35999e-06 [label_fine_grained_interleaved_index]: 2.82002e-06 [merge_cast_opt]: 1.35001e-06 [slice_recompute_activation]: 1.99e-06 [micro_interleaved_order_control]: 2.31e-06 [assign_add_opt]: 1.17e-06 [ForceFp32Comm]: 7.99977e-07 [remove_cast_before_assign_add]: 9.89996e-07 [full_micro_interleaved_order_control]: 1.97001e-06 [reorder_send_recv_between_fp_bp]: 2.67001e-06 [comm_op_add_attrs]: 9.70002e-07 [add_comm_op_reuse_tag]: 9.99979e-07 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.15001e-06 [overlap_opt_shard_in_pipeline]: 1.14e-06 [overlap_opt_shard_grad_in_pipeline]: 2.09e-06 [control_data_broadcast_order]: 1.523e-05 [grouped_pairwise_exchange_alltoall]: 1.82001e-06 [offloading_packed_experts]: 4.50001e-06 [overlap_recompute_and_grad_model_parallel]: 5.81998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.27999e-06 [overlap_recompute_comm]: 2.22001e-06 [overlap_grad_ring_attention]: 4.70001e-06 [overlap_grad_flash_sp]: 2.136e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.07001e-06 [split_layernorm_comm]: 1.72999e-06 [handle_group_info]: 1.05999e-06 [symbol_engine_optimizer]: 8.169e-05, [1] [Cycle 1]: 7.74e-05, [6] [build]: 3.35e-06 [elim_shapecalc]: 1.037e-05 [elim_not_effective]: 1.475e-05 [opt_reshape]: 8.31002e-06 [fold_const_symbol]: 1.209e-05 [renormalize]: 1.8999e-07 [detach_backward]: 1.84e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 1.984e-05 [get_jit_bprop_graph]: 1.64e-06 [rewriter_after_jit_bprop_graph]: 4.53001e-06 [opt_after_jit_grad]: 0.00047678 [validate]: 4.238e-05 Sums bootstrap : 0.000475s : 4.80% type_inference : 0.004922s : 49.81% event_method : 0.000012s : 0.12% auto_monad : 0.000058s : 0.58% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000028s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000019s : 0.19% optimize.rewriter_before_opt_a : 0.000051s : 0.51% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000036s : 0.37% optimize.opt_a.loop_unroll : 0.000022s : 0.23% optimize.opt_a.a_1 : 0.000555s : 5.62% optimize.opt_a.with_stream_mark : 0.000027s : 0.28% optimize.opt_a.recompute_prepare : 0.000019s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000209s : 2.11% optimize.opt_a.accelerated_algorithm : 0.000033s : 0.33% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.16% optimize.opt_a.merge_send_recv : 0.000015s : 0.15% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.25% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.20% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.18% optimize.opt_a.virtual_dataset : 0.000014s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.14% optimize.opt_a.virtual_output : 0.000014s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000025s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.06% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000021s : 0.22% optimize.opt_a.a_after_grad : 0.000021s : 0.22% optimize.opt_a.renormalize : 0.000600s : 6.07% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.26% optimize.opt_a.cse : 0.000057s : 0.58% optimize.opt_a.a_3 : 0.000099s : 1.01% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.41% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000501s : 5.07% optimize.opt_b.b_1 : 0.000162s : 1.64% optimize.opt_b.b_2 : 0.000009s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.04% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.04% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000029s : 0.29% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000028s : 0.29% optimize.loop_unroll : 0.000457s : 4.63% optimize.opt_after_cconv.c_1 : 0.000034s : 0.35% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000025s : 0.25% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.19% optimize.tuple_transform.d_1 : 0.000051s : 0.51% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000057s : 0.58% optimize.cse_after_recomputation.cse : 0.000016s : 0.16% optimize.environ_conv : 0.000007s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000021s : 0.22% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000477s : 4.82% validate : 0.000042s : 0.43% Time group info: ------[substitution.] 0.000222 34 28.19% : 0.000063s : 4: substitution.arithmetic_simplify 9.16% : 0.000020s : 2: substitution.cast_eliminate 1.02% : 0.000002s : 3: substitution.elim_not_effective 0.75% : 0.000002s : 3: substitution.fold_const_symbol 3.13% : 0.000007s : 4: substitution.graph_param_transform 45.80% : 0.000102s : 2: substitution.inline 1.79% : 0.000004s : 6: substitution.j_node_and_user_rematch 5.61% : 0.000012s : 2: substitution.less_batch_normalization 2.68% : 0.000006s : 6: substitution.remove_not_recompute_node 1.86% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004877 2 90.73% : 0.004425s : 1: type_inference.infer 9.27% : 0.000452s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000100 2 100.00% : 0.000100s : 2: match.inline ------[predicate.] 0.000171 980 0.80% : 0.000001s : 9: predicate.accumulaten_eliminater 1.03% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.64% : 0.000001s : 8: predicate.addn_check_dump 0.73% : 0.000001s : 9: predicate.addn_zero_filter 0.73% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.79% : 0.000005s : 17: predicate.arithmetic_simplify 0.90% : 0.000002s : 9: predicate.cast_eliminate 0.75% : 0.000001s : 8: predicate.check_bprop_eliminate 0.69% : 0.000001s : 8: predicate.compare_switch_simplify 0.22% : 0.000000s : 4: predicate.const_output_eliminate 0.71% : 0.000001s : 8: predicate.depend_value_elim 0.87% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.96% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.83% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.31% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 4: predicate.elim_not_effective 0.42% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.01% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.04% : 0.000002s : 13: predicate.environ_get_depend_swap 1.76% : 0.000003s : 21: predicate.environ_get_eliminate 1.18% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.88% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.81% : 0.000003s : 11: predicate.float_depend_g_call 0.67% : 0.000001s : 8: predicate.float_environ_get_switch 1.00% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.28% : 0.000000s : 4: predicate.fold_const_symbol 0.85% : 0.000001s : 8: predicate.get_grad_eliminate 0.29% : 0.000000s : 4: predicate.graph_param_transform 0.88% : 0.000002s : 8: predicate.incorporate_call 0.66% : 0.000001s : 8: predicate.incorporate_call_switch 6.64% : 0.000011s : 44: predicate.inline 0.96% : 0.000002s : 8: predicate.inline_without_move 0.37% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.28% : 0.000002s : 8: predicate.less_batch_normalization 1.78% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.14% : 0.000004s : 26: predicate.load_eliminater 1.09% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.38% : 0.000002s : 16: predicate.loop_unroll_before_grad 1.62% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.72% : 0.000001s : 8: predicate.merge_addn 0.69% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.74% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.75% : 0.000001s : 9: predicate.minmaximum_grad 1.45% : 0.000002s : 4: predicate.mutable_eliminate 0.44% : 0.000001s : 4: predicate.opt_reshape 0.50% : 0.000001s : 4: predicate.parallel_virtual_node 1.27% : 0.000002s : 11: predicate.partial_defer_inline 1.25% : 0.000002s : 13: predicate.partial_eliminate 0.83% : 0.000001s : 9: predicate.print_const_string_wrapper 0.76% : 0.000001s : 8: predicate.reduce_all_const_elim 1.07% : 0.000002s : 9: predicate.reduce_eliminate 2.11% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 8: predicate.remove_not_recompute_node 1.17% : 0.000002s : 17: predicate.replace_applicator 0.62% : 0.000001s : 8: predicate.replace_old_param 0.35% : 0.000001s : 4: predicate.reset_defer_inline 0.95% : 0.000002s : 9: predicate.reshape_eliminate 0.74% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.59% : 0.000001s : 4: predicate.row_tensor_eliminate 0.93% : 0.000002s : 8: predicate.same_eliminate 0.54% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.94% : 0.000002s : 8: predicate.shard_identity_eliminate 0.87% : 0.000001s : 8: predicate.special_op_eliminate 0.93% : 0.000002s : 8: predicate.specialize_transform 1.03% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.94% : 0.000002s : 11: predicate.switch_defer_inline 1.65% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.08% : 0.000007s : 39: predicate.switch_simplify 0.81% : 0.000001s : 9: predicate.tile_eliminate 0.83% : 0.000001s : 9: predicate.transpose_eliminate 1.66% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.08% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.76% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.64% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.60% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.07% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 3.23% : 0.000006s : 34: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 4: predicate.value_based_eliminate 0.77% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.98% : 0.000002s : 8: predicate.virtual_output_eliminate 0.36% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.54% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000234 5 8.45% : 0.000020s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.55% : 0.000214s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023618 192 0.01% : 0.000003s : 1: ForceFp32Comm 13.33% : 0.003149s : 1: add_attr 13.29% : 0.003138s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.26% : 0.000061s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000063s : 1: auto_monad 0.10% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 2.14% : 0.000504s : 1: bootstrap 0.14% : 0.000032s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.12% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.08% : 0.000019s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.97% : 0.000466s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.16% : 0.000510s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.08% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 4.44% : 0.001049s : 78: opt.transform.opt_a 0.14% : 0.000033s : 1: opt.transform.opt_after_cconv 0.12% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.59% : 0.000140s : 28: opt.transform.opt_b 0.24% : 0.000056s : 2: opt.transform.opt_trans_graph 0.18% : 0.000042s : 4: opt.transform.symbol_engine_opt 10.69% : 0.002524s : 1: opt_a 0.51% : 0.000121s : 1: opt_after_cconv 2.06% : 0.000486s : 1: opt_after_jit_grad 1.12% : 0.000265s : 1: opt_b 19.64% : 0.004638s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000032s : 1: pre_auto_parallel 0.09% : 0.000022s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.10% : 0.000022s : 1: remove_dup_value 1.44% : 0.000340s : 1: renormalize.infer 1.07% : 0.000252s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000044s : 1: rewriter_after_opt_a 0.23% : 0.000055s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000085s : 1: symbol_engine_optimizer 0.37% : 0.000087s : 1: tuple_transform 20.91% : 0.004938s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:37.352.392 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:37.352.709 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0146083, [21] [bootstrap]: 0.00043995 [type_inference]: 0.00481261 [event_method]: 1.191e-05 [auto_monad]: 5.411e-05 [graph_reusing]: 5.29e-06 [inline]: 2.19001e-06 [add_attr]: 0.00320509, [1] [add_attr_with_inline]: 0.00319542, [1] [Cycle 1]: 8.469e-05, [2] [tag_attr]: 1.567e-05 [meta_addattr_fg_expand]: 3.78001e-06 [parallel-infer-symbol]: 3.88999e-06 [pre_auto_parallel]: 2.724e-05 [insert-virtual-dataset]: 2.83e-06 [parallel-infer-symbol-second]: 6.90023e-07 [dataset_repeat_opt]: 2.32999e-06 [pipeline_split]: 1.71998e-06 [optimize]: 0.00490504, [53] [py_interpret_to_execute]: 2.152e-05 [rewriter_before_opt_a]: 5.219e-05 [opt_a]: 0.00261758, [2] [Cycle 1]: 0.00176728, [45] [expand_dump_flag]: 2.65002e-06 [switch_simplify]: 2.531e-05 [loop_unroll]: 1.366e-05 [a_1]: 0.00033055 [with_stream_mark]: 1.733e-05 [recompute_prepare]: 8.52e-06 [updatestate_depend_eliminate]: 3.76001e-06 [updatestate_assign_eliminate]: 3.38999e-06 [updatestate_loads_eliminate]: 3.37002e-06 [parameter_eliminate]: 1.90001e-06 [a_2]: 0.00012202 [accelerated_algorithm]: 2.127e-05 [shard]: 2.12001e-06 [meta_shard_fg_expand]: 1.81e-06 [shard_inline]: 7.28999e-06 [merge_send_recv]: 7.9e-06 [auto_parallel]: 6.72002e-06 [parallel]: 1.785e-05 [flash_sp]: 7.66999e-06 [merge_comm]: 3.96001e-06 [allreduce_fusion]: 4.03999e-06 [matmul_add_comm_reduction]: 1.08e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 8.32e-06 [virtual_dataset]: 6.68e-06 [get_grad_eliminate_]: 5.87001e-06 [virtual_output]: 6.49001e-06 [merge_forward]: 3.58e-06 [cell_reuse_recompute_pass]: 1.67001e-06 [offload_activation]: 1.05e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.418e-05 [merge_recompute_call_nodes]: 1.55999e-06 [before_grad]: 1.122e-05 [set_forward_comm_id_for_comm_node_pass]: 4.22e-06 [meta_fg_expand]: 2.89999e-06 [flash_sp_send_recv_attached]: 5.30999e-06 [receive_attached]: 1.96e-06 [after_resolve]: 1.012e-05 [a_after_grad]: 9.14e-06 [renormalize]: 0.0005485 [add_forward_monad_depend]: 5.32001e-06 [auto_monad_grad]: 2.30002e-06 [auto_monad_eliminator]: 1.467e-05 [cse]: 2.811e-05 [a_3]: 5.98e-05 [Cycle 2]: 0.00083675, [45] [expand_dump_flag]: 1.12e-06 [switch_simplify]: 7.23e-06 [loop_unroll]: 6.12999e-06 [a_1]: 0.0001306 [with_stream_mark]: 1.009e-05 [recompute_prepare]: 7.07002e-06 [updatestate_depend_eliminate]: 3.59002e-06 [updatestate_assign_eliminate]: 2.74001e-06 [updatestate_loads_eliminate]: 2.76e-06 [parameter_eliminate]: 9.09989e-07 [a_2]: 0.0001087 [accelerated_algorithm]: 9.19998e-06 [shard]: 1.02e-06 [meta_shard_fg_expand]: 1.35001e-06 [shard_inline]: 6.11998e-06 [merge_send_recv]: 5.94999e-06 [auto_parallel]: 5.83002e-06 [parallel]: 5.34998e-06 [flash_sp]: 3.53e-06 [merge_comm]: 3.7e-06 [allreduce_fusion]: 3.4e-06 [matmul_add_comm_reduction]: 7.4e-06 [allreduce_slice_to_reducescatter]: 5.09986e-07 [virtual_shard_identity]: 6.88e-06 [virtual_dataset]: 5.96e-06 [get_grad_eliminate_]: 5.58002e-06 [virtual_output]: 5.56998e-06 [merge_forward]: 3.33e-06 [cell_reuse_recompute_pass]: 1.44998e-06 [offload_activation]: 7.71001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.368e-05 [merge_recompute_call_nodes]: 7.09988e-07 [before_grad]: 9.74e-06 [set_forward_comm_id_for_comm_node_pass]: 4.11001e-06 [meta_fg_expand]: 2.31e-06 [flash_sp_send_recv_attached]: 8.70001e-07 [receive_attached]: 1.47999e-06 [after_resolve]: 9.36e-06 [a_after_grad]: 8.66002e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.57001e-06 [auto_monad_grad]: 1.45999e-06 [auto_monad_eliminator]: 8.74e-06 [cse]: 1.571e-05 [a_3]: 4.804e-05 [py_interpret_to_execute_after_opt_a]: 1.329e-05 [slice_cell_reuse_recomputed_activation]: 4.65999e-06 [rewriter_after_opt_a]: 4.138e-05 [convert_after_rewriter]: 1.007e-05 [order_py_execute_after_rewriter]: 8.95001e-06 [mutable_eliminate]: 0.00054318 [opt_b]: 0.00026803, [1] [Cycle 1]: 0.00025762, [7] [b_1]: 0.00016198 [b_2]: 7.31001e-06 [updatestate_depend_eliminate]: 6.79001e-06 [updatestate_assign_eliminate]: 2.58e-06 [updatestate_loads_eliminate]: 2.75997e-06 [renormalize]: 4.90021e-07 [cse]: 1.979e-05 [optimize_parallel_all_gather_comm]: 1.963e-05 [overlap_param_gather]: 4.91002e-06 [cconv]: 3.012e-05 [loop_unroll]: 0.00047044 [opt_after_cconv]: 0.00012554, [1] [Cycle 1]: 0.00011731, [7] [c_1]: 2.824e-05 [parameter_eliminate]: 3.76999e-06 [updatestate_depend_eliminate]: 5.76998e-06 [updatestate_assign_eliminate]: 2.55002e-06 [updatestate_loads_eliminate]: 2.66e-06 [cse]: 1.777e-05 [renormalize]: 5.89993e-07 [remove_dup_value]: 1.886e-05 [tuple_transform]: 8.773e-05, [1] [Cycle 1]: 8.054e-05, [4] [d_1]: 4.175e-05 [none_parameter_eliminate]: 1.76e-06 [renormalize]: 2.60014e-07 [switch_simplify]: 6.48e-06 [partial_unused_args_eliminate]: 4.63999e-06 [add_recomputation]: 5.018e-05 [cse_after_recomputation]: 2.744e-05, [1] [Cycle 1]: 2.06e-05, [1] [cse]: 1.143e-05 [environ_conv]: 8.42e-06 [swap_dp_allreduce_reducescatter]: 8.27998e-06 [bias_add_comm_swap]: 5.20001e-06 [label_micro_interleaved_index]: 7.38e-06 [label_fine_grained_interleaved_index]: 4.92999e-06 [merge_cast_opt]: 3.7e-06 [slice_recompute_activation]: 4.48001e-06 [micro_interleaved_order_control]: 4.68001e-06 [assign_add_opt]: 3.76999e-06 [ForceFp32Comm]: 3.3e-06 [remove_cast_before_assign_add]: 3.23e-06 [full_micro_interleaved_order_control]: 4.33001e-06 [reorder_send_recv_between_fp_bp]: 5.67001e-06 [comm_op_add_attrs]: 3.32002e-06 [add_comm_op_reuse_tag]: 3.26999e-06 [interleave_split_concat_branches]: 3.57997e-06 [interleave_parallel_branches]: 3.43999e-06 [overlap_opt_shard_in_pipeline]: 3.98999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.14002e-06 [control_data_broadcast_order]: 1.72e-05 [grouped_pairwise_exchange_alltoall]: 4.22e-06 [offloading_packed_experts]: 6.29999e-06 [overlap_recompute_and_grad_model_parallel]: 7.21999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.7e-06 [overlap_recompute_allgather_and_fa_grad]: 3.55e-06 [overlap_recompute_comm]: 4.56002e-06 [overlap_grad_ring_attention]: 6.73998e-06 [overlap_grad_flash_sp]: 2.323e-05 [begin_end_overlap_inline]: 3.07002e-06 [split_matmul_comm_elemetwise]: 4.37e-06 [split_layernorm_comm]: 4.35e-06 [handle_group_info]: 3.81999e-06 [symbol_engine_optimizer]: 9.484e-05, [1] [Cycle 1]: 8.831e-05, [6] [build]: 2.89999e-06 [elim_shapecalc]: 9.95002e-06 [elim_not_effective]: 1.286e-05 [opt_reshape]: 6.73e-06 [fold_const_symbol]: 9.84999e-06 [renormalize]: 2.50002e-07 [detach_backward]: 3.33e-06 [pipeline_parallel_scheduler]: 1.97001e-06 [auto_monad_reorder]: 1.84e-05 [get_jit_bprop_graph]: 2.03002e-06 [rewriter_after_jit_bprop_graph]: 4.66002e-06 [opt_after_jit_grad]: 0.00049877 [validate]: 3.915e-05 Sums bootstrap : 0.000440s : 4.55% type_inference : 0.004813s : 49.72% event_method : 0.000012s : 0.12% auto_monad : 0.000054s : 0.56% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000027s : 0.28% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000022s : 0.22% optimize.rewriter_before_opt_a : 0.000052s : 0.54% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.34% optimize.opt_a.loop_unroll : 0.000020s : 0.20% optimize.opt_a.a_1 : 0.000461s : 4.76% optimize.opt_a.with_stream_mark : 0.000027s : 0.28% optimize.opt_a.recompute_prepare : 0.000016s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000231s : 2.38% optimize.opt_a.accelerated_algorithm : 0.000030s : 0.31% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.14% optimize.opt_a.merge_send_recv : 0.000014s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.24% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.19% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.16% optimize.opt_a.virtual_dataset : 0.000013s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.06% optimize.opt_a.receive_attached : 0.000003s : 0.04% optimize.opt_a.after_resolve : 0.000019s : 0.20% optimize.opt_a.a_after_grad : 0.000018s : 0.18% optimize.opt_a.renormalize : 0.000549s : 5.67% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.24% optimize.opt_a.cse : 0.000044s : 0.45% optimize.opt_a.a_3 : 0.000108s : 1.11% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000041s : 0.43% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000543s : 5.61% optimize.opt_b.b_1 : 0.000162s : 1.67% optimize.opt_b.b_2 : 0.000007s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.01% optimize.opt_b.cse : 0.000020s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000030s : 0.31% optimize.loop_unroll : 0.000470s : 4.86% optimize.opt_after_cconv.c_1 : 0.000028s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.18% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000019s : 0.19% optimize.tuple_transform.d_1 : 0.000042s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000050s : 0.52% optimize.cse_after_recomputation.cse : 0.000011s : 0.12% optimize.environ_conv : 0.000008s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.09% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.08% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.06% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.18% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000006s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000023s : 0.24% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000499s : 5.15% validate : 0.000039s : 0.40% Time group info: ------[substitution.] 0.000197 25 35.89% : 0.000071s : 4: substitution.arithmetic_simplify 0.94% : 0.000002s : 2: substitution.elim_not_effective 0.71% : 0.000001s : 2: substitution.fold_const_symbol 2.86% : 0.000006s : 3: substitution.graph_param_transform 47.54% : 0.000094s : 2: substitution.inline 1.76% : 0.000003s : 4: substitution.j_node_and_user_rematch 6.25% : 0.000012s : 2: substitution.less_batch_normalization 2.33% : 0.000005s : 4: substitution.remove_not_recompute_node 1.72% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004767 2 91.27% : 0.004350s : 1: type_inference.infer 8.73% : 0.000416s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000092 2 100.00% : 0.000092s : 2: match.inline ------[predicate.] 0.000136 754 0.77% : 0.000001s : 7: predicate.accumulaten_eliminater 0.92% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.66% : 0.000001s : 6: predicate.addn_check_dump 0.93% : 0.000001s : 7: predicate.addn_zero_filter 0.62% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.84% : 0.000004s : 13: predicate.arithmetic_simplify 0.95% : 0.000001s : 7: predicate.cast_eliminate 0.76% : 0.000001s : 6: predicate.check_bprop_eliminate 0.68% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.73% : 0.000001s : 6: predicate.depend_value_elim 0.79% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.84% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.78% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.13% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.32% : 0.000000s : 3: predicate.elim_not_effective 0.49% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 10: predicate.environ_add_const_eliminate 0.94% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.04% : 0.000001s : 10: predicate.environ_get_depend_swap 1.75% : 0.000002s : 16: predicate.environ_get_eliminate 0.96% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.90% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.92% : 0.000003s : 9: predicate.float_depend_g_call 0.65% : 0.000001s : 6: predicate.float_environ_get_switch 0.96% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.81% : 0.000001s : 6: predicate.get_grad_eliminate 0.26% : 0.000000s : 3: predicate.graph_param_transform 0.88% : 0.000001s : 6: predicate.incorporate_call 0.66% : 0.000001s : 6: predicate.incorporate_call_switch 6.53% : 0.000009s : 34: predicate.inline 1.17% : 0.000002s : 6: predicate.inline_without_move 0.40% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.18% : 0.000002s : 6: predicate.less_batch_normalization 1.62% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.97% : 0.000003s : 20: predicate.load_eliminater 1.12% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.71% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.80% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.71% : 0.000001s : 6: predicate.merge_addn 0.91% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.70% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.70% : 0.000001s : 7: predicate.minmaximum_grad 1.60% : 0.000002s : 3: predicate.mutable_eliminate 0.43% : 0.000001s : 3: predicate.opt_reshape 0.49% : 0.000001s : 3: predicate.parallel_virtual_node 1.26% : 0.000002s : 9: predicate.partial_defer_inline 1.16% : 0.000002s : 10: predicate.partial_eliminate 0.88% : 0.000001s : 7: predicate.print_const_string_wrapper 0.79% : 0.000001s : 6: predicate.reduce_all_const_elim 1.07% : 0.000001s : 7: predicate.reduce_eliminate 2.07% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.84% : 0.000001s : 6: predicate.remove_not_recompute_node 1.13% : 0.000002s : 13: predicate.replace_applicator 0.60% : 0.000001s : 6: predicate.replace_old_param 0.32% : 0.000000s : 3: predicate.reset_defer_inline 0.79% : 0.000001s : 7: predicate.reshape_eliminate 0.70% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 3: predicate.row_tensor_eliminate 0.99% : 0.000001s : 6: predicate.same_eliminate 0.66% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.20% : 0.000002s : 6: predicate.shard_identity_eliminate 0.98% : 0.000001s : 6: predicate.special_op_eliminate 0.98% : 0.000001s : 6: predicate.specialize_transform 1.01% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.94% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.01% : 0.000001s : 9: predicate.switch_defer_inline 1.76% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.37% : 0.000006s : 32: predicate.switch_simplify 0.71% : 0.000001s : 7: predicate.tile_eliminate 0.86% : 0.000001s : 7: predicate.transpose_eliminate 1.85% : 0.000003s : 13: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.32% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.30% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.47% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.47% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.53% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.95% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.89% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.55% : 0.000001s : 3: predicate.value_based_eliminate 0.79% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.81% : 0.000001s : 6: predicate.virtual_output_eliminate 0.32% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.62% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000227 5 7.72% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.28% : 0.000209s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024268 192 0.03% : 0.000006s : 1: ForceFp32Comm 13.25% : 0.003216s : 1: add_attr 13.19% : 0.003200s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000054s : 1: add_recomputation 0.03% : 0.000007s : 1: assign_add_opt 0.26% : 0.000062s : 1: auto_monad 0.10% : 0.000025s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.99% : 0.000483s : 1: bootstrap 0.14% : 0.000033s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000031s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000017s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.09% : 0.000022s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000011s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.96% : 0.000477s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.26% : 0.000549s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 3.59% : 0.000871s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.10% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000097s : 28: opt.transform.opt_b 0.19% : 0.000046s : 2: opt.transform.opt_trans_graph 0.15% : 0.000036s : 4: opt.transform.symbol_engine_opt 10.80% : 0.002621s : 1: opt_a 0.53% : 0.000129s : 1: opt_after_cconv 2.10% : 0.000510s : 1: opt_after_jit_grad 1.12% : 0.000272s : 1: opt_b 21.59% : 0.005239s : 1: optimize 0.09% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000026s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000011s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000034s : 1: pre_auto_parallel 0.10% : 0.000025s : 1: py_interpret_to_execute 0.07% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000022s : 1: remove_dup_value 1.22% : 0.000297s : 1: renormalize.infer 1.00% : 0.000244s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.05% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000045s : 1: rewriter_after_opt_a 0.23% : 0.000056s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000098s : 1: symbol_engine_optimizer 0.37% : 0.000091s : 1: tuple_transform 19.94% : 0.004840s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:37.548.476 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0147068, [21] [bootstrap]: 0.00044641 [type_inference]: 0.00531496 [event_method]: 1.292e-05 [auto_monad]: 5.484e-05 [graph_reusing]: 5.72001e-06 [inline]: 2.25002e-06 [add_attr]: 0.0033994, [1] [add_attr_with_inline]: 0.00338838, [1] [Cycle 1]: 5.247e-05, [2] [tag_attr]: 1.556e-05 [meta_addattr_fg_expand]: 3.5e-06 [parallel-infer-symbol]: 3.37002e-06 [pre_auto_parallel]: 3.022e-05 [insert-virtual-dataset]: 2.39001e-06 [parallel-infer-symbol-second]: 8.59989e-07 [dataset_repeat_opt]: 1.85001e-06 [pipeline_split]: 1.55999e-06 [optimize]: 0.00468333, [53] [py_interpret_to_execute]: 1.86e-05 [rewriter_before_opt_a]: 4.78e-05 [opt_a]: 0.00242602, [2] [Cycle 1]: 0.00169201, [45] [expand_dump_flag]: 3.14999e-06 [switch_simplify]: 2.556e-05 [loop_unroll]: 1.348e-05 [a_1]: 0.00033292 [with_stream_mark]: 1.933e-05 [recompute_prepare]: 9.12001e-06 [updatestate_depend_eliminate]: 4.07e-06 [updatestate_assign_eliminate]: 3.71999e-06 [updatestate_loads_eliminate]: 3.75e-06 [parameter_eliminate]: 1.94e-06 [a_2]: 9.32e-05 [accelerated_algorithm]: 2.021e-05 [shard]: 2.04e-06 [meta_shard_fg_expand]: 1.94e-06 [shard_inline]: 6.54001e-06 [merge_send_recv]: 8.37998e-06 [auto_parallel]: 7.41999e-06 [parallel]: 1.965e-05 [flash_sp]: 8.02003e-06 [merge_comm]: 4.19002e-06 [allreduce_fusion]: 3.65e-06 [matmul_add_comm_reduction]: 1.073e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 8.64003e-06 [virtual_dataset]: 6.93e-06 [get_grad_eliminate_]: 6.29001e-06 [virtual_output]: 6.41e-06 [merge_forward]: 4.15999e-06 [cell_reuse_recompute_pass]: 1.59998e-06 [offload_activation]: 9.80002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.435e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 1.168e-05 [set_forward_comm_id_for_comm_node_pass]: 3.71999e-06 [meta_fg_expand]: 2.76e-06 [flash_sp_send_recv_attached]: 5.49e-06 [receive_attached]: 2.32999e-06 [after_resolve]: 1.068e-05 [a_after_grad]: 9.99001e-06 [renormalize]: 0.00061686 [add_forward_monad_depend]: 7.18998e-06 [auto_monad_grad]: 3.28998e-06 [auto_monad_eliminator]: 1.99e-05 [cse]: 3.315e-05 [a_3]: 4.938e-05 [Cycle 2]: 0.000723, [45] [expand_dump_flag]: 1.47999e-06 [switch_simplify]: 8.22e-06 [loop_unroll]: 5.96e-06 [a_1]: 0.00014107 [with_stream_mark]: 1.228e-05 [recompute_prepare]: 7.11001e-06 [updatestate_depend_eliminate]: 3.51999e-06 [updatestate_assign_eliminate]: 2.56e-06 [updatestate_loads_eliminate]: 3.5e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 8.837e-05 [accelerated_algorithm]: 1.077e-05 [shard]: 1.05001e-06 [meta_shard_fg_expand]: 2.00002e-06 [shard_inline]: 6.57002e-06 [merge_send_recv]: 7.63001e-06 [auto_parallel]: 7.66001e-06 [parallel]: 6.78e-06 [flash_sp]: 3.4e-06 [merge_comm]: 3.85998e-06 [allreduce_fusion]: 3.43e-06 [matmul_add_comm_reduction]: 9.75002e-06 [allreduce_slice_to_reducescatter]: 4.80009e-07 [virtual_shard_identity]: 8.97999e-06 [virtual_dataset]: 5.87001e-06 [get_grad_eliminate_]: 5.56e-06 [virtual_output]: 5.45001e-06 [merge_forward]: 4.1e-06 [cell_reuse_recompute_pass]: 1.76e-06 [offload_activation]: 8.75001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.206e-05 [merge_recompute_call_nodes]: 9.39996e-07 [before_grad]: 9.94999e-06 [set_forward_comm_id_for_comm_node_pass]: 4.15e-06 [meta_fg_expand]: 2.44001e-06 [flash_sp_send_recv_attached]: 1.22999e-06 [receive_attached]: 1.28002e-06 [after_resolve]: 1.021e-05 [a_after_grad]: 8.73001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.01003e-06 [auto_monad_grad]: 1.59e-06 [auto_monad_eliminator]: 1.008e-05 [cse]: 1.946e-05 [a_3]: 3.716e-05 [py_interpret_to_execute_after_opt_a]: 1.436e-05 [slice_cell_reuse_recomputed_activation]: 2.06998e-06 [rewriter_after_opt_a]: 4.08e-05 [convert_after_rewriter]: 7.01999e-06 [order_py_execute_after_rewriter]: 5.66e-06 [mutable_eliminate]: 0.00063628 [opt_b]: 0.00022267, [1] [Cycle 1]: 0.00021513, [7] [b_1]: 0.00012186 [b_2]: 8.22e-06 [updatestate_depend_eliminate]: 8.70999e-06 [updatestate_assign_eliminate]: 3.09001e-06 [updatestate_loads_eliminate]: 3.47002e-06 [renormalize]: 8.70001e-07 [cse]: 2.652e-05 [optimize_parallel_all_gather_comm]: 1.758e-05 [overlap_param_gather]: 2.06e-06 [cconv]: 3.243e-05 [loop_unroll]: 0.00053006 [opt_after_cconv]: 0.00010969, [1] [Cycle 1]: 0.00010289, [7] [c_1]: 2.753e-05 [parameter_eliminate]: 5.05999e-06 [updatestate_depend_eliminate]: 7.38999e-06 [updatestate_assign_eliminate]: 2.61e-06 [updatestate_loads_eliminate]: 2.43002e-06 [cse]: 2.321e-05 [renormalize]: 9.20001e-07 [remove_dup_value]: 1.516e-05 [tuple_transform]: 7.561e-05, [1] [Cycle 1]: 7.125e-05, [4] [d_1]: 4.326e-05 [none_parameter_eliminate]: 1.81998e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 6.93998e-06 [partial_unused_args_eliminate]: 2.47001e-06 [add_recomputation]: 4.992e-05 [cse_after_recomputation]: 2.262e-05, [1] [Cycle 1]: 1.768e-05, [1] [cse]: 1.245e-05 [environ_conv]: 5.62999e-06 [swap_dp_allreduce_reducescatter]: 5.02e-06 [bias_add_comm_swap]: 2.85998e-06 [label_micro_interleaved_index]: 4.97999e-06 [label_fine_grained_interleaved_index]: 2.93e-06 [merge_cast_opt]: 1.42999e-06 [slice_recompute_activation]: 2.36e-06 [micro_interleaved_order_control]: 2.41e-06 [assign_add_opt]: 1.54e-06 [ForceFp32Comm]: 1.07998e-06 [remove_cast_before_assign_add]: 9.20001e-07 [full_micro_interleaved_order_control]: 2.07999e-06 [reorder_send_recv_between_fp_bp]: 2.77002e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.61998e-06 [overlap_opt_shard_grad_in_pipeline]: 1.78002e-06 [control_data_broadcast_order]: 1.426e-05 [grouped_pairwise_exchange_alltoall]: 1.41002e-06 [offloading_packed_experts]: 4.3e-06 [overlap_recompute_and_grad_model_parallel]: 5.59e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12e-06 [overlap_recompute_allgather_and_fa_grad]: 1.19e-06 [overlap_recompute_comm]: 2.46e-06 [overlap_grad_ring_attention]: 3.88001e-06 [overlap_grad_flash_sp]: 1.946e-05 [begin_end_overlap_inline]: 8.10018e-07 [split_matmul_comm_elemetwise]: 2.44001e-06 [split_layernorm_comm]: 1.67999e-06 [handle_group_info]: 1.00001e-06 [symbol_engine_optimizer]: 7.656e-05, [1] [Cycle 1]: 7.203e-05, [6] [build]: 2.46e-06 [elim_shapecalc]: 1.006e-05 [elim_not_effective]: 1.416e-05 [opt_reshape]: 7.02002e-06 [fold_const_symbol]: 1.024e-05 [renormalize]: 1.80007e-07 [detach_backward]: 2.44001e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 1.713e-05 [get_jit_bprop_graph]: 1.84e-06 [rewriter_after_jit_bprop_graph]: 6.06998e-06 [opt_after_jit_grad]: 0.0005234 [validate]: 4.157e-05 Sums bootstrap : 0.000446s : 4.33% type_inference : 0.005315s : 51.56% event_method : 0.000013s : 0.13% auto_monad : 0.000055s : 0.53% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000003s : 0.03% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000030s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000019s : 0.18% optimize.rewriter_before_opt_a : 0.000048s : 0.46% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000034s : 0.33% optimize.opt_a.loop_unroll : 0.000019s : 0.19% optimize.opt_a.a_1 : 0.000474s : 4.60% optimize.opt_a.with_stream_mark : 0.000032s : 0.31% optimize.opt_a.recompute_prepare : 0.000016s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000182s : 1.76% optimize.opt_a.accelerated_algorithm : 0.000031s : 0.30% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000016s : 0.16% optimize.opt_a.auto_parallel : 0.000015s : 0.15% optimize.opt_a.parallel : 0.000026s : 0.26% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.20% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.17% optimize.opt_a.virtual_dataset : 0.000013s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.07% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000021s : 0.20% optimize.opt_a.a_after_grad : 0.000019s : 0.18% optimize.opt_a.renormalize : 0.000617s : 5.99% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.09% optimize.opt_a.auto_monad_grad : 0.000005s : 0.05% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.29% optimize.opt_a.cse : 0.000053s : 0.51% optimize.opt_a.a_3 : 0.000087s : 0.84% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000041s : 0.40% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000636s : 6.17% optimize.opt_b.b_1 : 0.000122s : 1.18% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000027s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000032s : 0.31% optimize.loop_unroll : 0.000530s : 5.14% optimize.opt_after_cconv.c_1 : 0.000028s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000023s : 0.23% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000015s : 0.15% optimize.tuple_transform.d_1 : 0.000043s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000050s : 0.48% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000019s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000002s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000017s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.06% opt_after_jit_grad : 0.000523s : 5.08% validate : 0.000042s : 0.40% Time group info: ------[substitution.] 0.000203 25 35.14% : 0.000071s : 4: substitution.arithmetic_simplify 1.18% : 0.000002s : 2: substitution.elim_not_effective 0.63% : 0.000001s : 2: substitution.fold_const_symbol 2.76% : 0.000006s : 3: substitution.graph_param_transform 47.84% : 0.000097s : 2: substitution.inline 1.97% : 0.000004s : 4: substitution.j_node_and_user_rematch 5.88% : 0.000012s : 2: substitution.less_batch_normalization 2.54% : 0.000005s : 4: substitution.remove_not_recompute_node 2.06% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.005267 2 90.94% : 0.004790s : 1: type_inference.infer 9.06% : 0.000477s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000095 2 100.00% : 0.000095s : 2: match.inline ------[predicate.] 0.000142 754 0.91% : 0.000001s : 7: predicate.accumulaten_eliminater 1.09% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.75% : 0.000001s : 6: predicate.addn_check_dump 0.82% : 0.000001s : 7: predicate.addn_zero_filter 0.77% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 3.00% : 0.000004s : 13: predicate.arithmetic_simplify 0.75% : 0.000001s : 7: predicate.cast_eliminate 0.67% : 0.000001s : 6: predicate.check_bprop_eliminate 0.75% : 0.000001s : 6: predicate.compare_switch_simplify 0.23% : 0.000000s : 3: predicate.const_output_eliminate 0.80% : 0.000001s : 6: predicate.depend_value_elim 0.82% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.87% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.71% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.25% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.55% : 0.000001s : 3: predicate.elim_not_effective 0.45% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.06% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_depend_swap 1.63% : 0.000002s : 16: predicate.environ_get_eliminate 0.92% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.88% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.68% : 0.000002s : 9: predicate.float_depend_g_call 0.69% : 0.000001s : 6: predicate.float_environ_get_switch 1.08% : 0.000002s : 9: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 3: predicate.fold_const_symbol 0.86% : 0.000001s : 6: predicate.get_grad_eliminate 0.18% : 0.000000s : 3: predicate.graph_param_transform 0.81% : 0.000001s : 6: predicate.incorporate_call 0.61% : 0.000001s : 6: predicate.incorporate_call_switch 6.49% : 0.000009s : 34: predicate.inline 0.94% : 0.000001s : 6: predicate.inline_without_move 0.35% : 0.000000s : 6: predicate.j_node_and_user_rematch 1.39% : 0.000002s : 6: predicate.less_batch_normalization 1.74% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.02% : 0.000003s : 20: predicate.load_eliminater 1.44% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.56% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.73% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.67% : 0.000001s : 6: predicate.merge_addn 0.66% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.62% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.66% : 0.000001s : 7: predicate.minmaximum_grad 2.44% : 0.000003s : 3: predicate.mutable_eliminate 0.46% : 0.000001s : 3: predicate.opt_reshape 0.48% : 0.000001s : 3: predicate.parallel_virtual_node 1.42% : 0.000002s : 9: predicate.partial_defer_inline 1.15% : 0.000002s : 10: predicate.partial_eliminate 0.86% : 0.000001s : 7: predicate.print_const_string_wrapper 0.71% : 0.000001s : 6: predicate.reduce_all_const_elim 1.00% : 0.000001s : 7: predicate.reduce_eliminate 1.96% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.97% : 0.000001s : 6: predicate.remove_not_recompute_node 1.18% : 0.000002s : 13: predicate.replace_applicator 0.75% : 0.000001s : 6: predicate.replace_old_param 0.50% : 0.000001s : 3: predicate.reset_defer_inline 0.77% : 0.000001s : 7: predicate.reshape_eliminate 0.70% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.70% : 0.000001s : 3: predicate.row_tensor_eliminate 0.95% : 0.000001s : 6: predicate.same_eliminate 0.51% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.12% : 0.000002s : 6: predicate.shard_identity_eliminate 1.01% : 0.000001s : 6: predicate.special_op_eliminate 0.92% : 0.000001s : 6: predicate.specialize_transform 1.47% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.00% : 0.000001s : 9: predicate.switch_defer_inline 1.65% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.20% : 0.000006s : 32: predicate.switch_simplify 0.79% : 0.000001s : 7: predicate.tile_eliminate 0.73% : 0.000001s : 7: predicate.transpose_eliminate 1.71% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.28% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.25% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.29% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.42% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.46% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.85% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.70% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.49% : 0.000001s : 3: predicate.value_based_eliminate 0.77% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.78% : 0.000001s : 6: predicate.virtual_output_eliminate 0.31% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000255 5 8.35% : 0.000021s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.65% : 0.000233s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024438 192 0.02% : 0.000004s : 1: ForceFp32Comm 13.93% : 0.003405s : 1: add_attr 13.88% : 0.003392s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000054s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.24% : 0.000059s : 1: auto_monad 0.09% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.94% : 0.000475s : 1: bootstrap 0.15% : 0.000037s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000017s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.10% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.08% : 0.000019s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 2.21% : 0.000540s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.65% : 0.000647s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000020s : 1: opt.transform.mutable_eliminate 3.66% : 0.000894s : 78: opt.transform.opt_a 0.11% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000097s : 28: opt.transform.opt_b 0.20% : 0.000048s : 2: opt.transform.opt_trans_graph 0.15% : 0.000038s : 4: opt.transform.symbol_engine_opt 9.94% : 0.002430s : 1: opt_a 0.47% : 0.000114s : 1: opt_after_cconv 2.18% : 0.000534s : 1: opt_after_jit_grad 0.93% : 0.000227s : 1: opt_b 19.19% : 0.004689s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000034s : 1: pre_auto_parallel 0.09% : 0.000022s : 1: py_interpret_to_execute 0.07% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000019s : 1: remove_dup_value 1.35% : 0.000330s : 1: renormalize.infer 1.14% : 0.000278s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000045s : 1: rewriter_after_opt_a 0.21% : 0.000052s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000079s : 1: symbol_engine_optimizer 0.32% : 0.000078s : 1: tuple_transform 21.82% : 0.005333s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:37.746.114 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:37.746.378 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0144206, [21] [bootstrap]: 0.00043757 [type_inference]: 0.00479264 [event_method]: 1.256e-05 [auto_monad]: 5.561e-05 [graph_reusing]: 5.81998e-06 [inline]: 2.45002e-06 [add_attr]: 0.00321771, [1] [add_attr_with_inline]: 0.00320782, [1] [Cycle 1]: 6.59e-05, [2] [tag_attr]: 1.417e-05 [meta_addattr_fg_expand]: 3.87002e-06 [parallel-infer-symbol]: 3.50003e-06 [pre_auto_parallel]: 2.659e-05 [insert-virtual-dataset]: 2.94999e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 1.91e-06 [pipeline_split]: 1.77001e-06 [optimize]: 0.00472393, [53] [py_interpret_to_execute]: 2.039e-05 [rewriter_before_opt_a]: 4.982e-05 [opt_a]: 0.00248679, [2] [Cycle 1]: 0.00167997, [45] [expand_dump_flag]: 2.78e-06 [switch_simplify]: 2.482e-05 [loop_unroll]: 1.387e-05 [a_1]: 0.00028759 [with_stream_mark]: 1.794e-05 [recompute_prepare]: 8.62998e-06 [updatestate_depend_eliminate]: 3.74002e-06 [updatestate_assign_eliminate]: 3.45e-06 [updatestate_loads_eliminate]: 3.33998e-06 [parameter_eliminate]: 1.99e-06 [a_2]: 0.00010803 [accelerated_algorithm]: 6.66e-06 [shard]: 2.03002e-06 [meta_shard_fg_expand]: 1.69998e-06 [shard_inline]: 6.34999e-06 [merge_send_recv]: 8.23001e-06 [auto_parallel]: 6.64999e-06 [parallel]: 1.901e-05 [flash_sp]: 7.84002e-06 [merge_comm]: 4.12998e-06 [allreduce_fusion]: 3.48999e-06 [matmul_add_comm_reduction]: 9.67001e-06 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 7.85e-06 [virtual_dataset]: 6.56e-06 [get_grad_eliminate_]: 5.99e-06 [virtual_output]: 6.06e-06 [merge_forward]: 4.33999e-06 [cell_reuse_recompute_pass]: 1.31998e-06 [offload_activation]: 1.052e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.514e-05 [merge_recompute_call_nodes]: 1.69e-06 [before_grad]: 1.042e-05 [set_forward_comm_id_for_comm_node_pass]: 3.82002e-06 [meta_fg_expand]: 2.67001e-06 [flash_sp_send_recv_attached]: 2.88e-06 [receive_attached]: 2.67001e-06 [after_resolve]: 9.51e-06 [a_after_grad]: 9.62999e-06 [renormalize]: 0.00054475 [add_forward_monad_depend]: 5.17e-06 [auto_monad_grad]: 2.42001e-06 [auto_monad_eliminator]: 1.429e-05 [cse]: 2.781e-05 [a_3]: 5.868e-05 [Cycle 2]: 0.0007936, [45] [expand_dump_flag]: 1.32e-06 [switch_simplify]: 7.29001e-06 [loop_unroll]: 5.96e-06 [a_1]: 0.00010637 [with_stream_mark]: 1.081e-05 [recompute_prepare]: 6.24001e-06 [updatestate_depend_eliminate]: 3.01001e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.83e-06 [parameter_eliminate]: 1.07998e-06 [a_2]: 9.703e-05 [accelerated_algorithm]: 5.73002e-06 [shard]: 1.22e-06 [meta_shard_fg_expand]: 1.50999e-06 [shard_inline]: 6.23002e-06 [merge_send_recv]: 5.02e-06 [auto_parallel]: 6.09001e-06 [parallel]: 4.85999e-06 [flash_sp]: 3.45e-06 [merge_comm]: 3.65998e-06 [allreduce_fusion]: 3.48999e-06 [matmul_add_comm_reduction]: 6.23e-06 [allreduce_slice_to_reducescatter]: 3.30008e-07 [virtual_shard_identity]: 7.01999e-06 [virtual_dataset]: 6.04999e-06 [get_grad_eliminate_]: 5.78002e-06 [virtual_output]: 5.51e-06 [merge_forward]: 2.78e-06 [cell_reuse_recompute_pass]: 1.35999e-06 [offload_activation]: 7.15998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.537e-05 [merge_recompute_call_nodes]: 7.2e-07 [before_grad]: 9.67999e-06 [set_forward_comm_id_for_comm_node_pass]: 4.32e-06 [meta_fg_expand]: 2.29999e-06 [flash_sp_send_recv_attached]: 9.40025e-07 [receive_attached]: 1.07e-06 [after_resolve]: 9.54e-06 [a_after_grad]: 8.49998e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.42999e-06 [auto_monad_grad]: 1.67001e-06 [auto_monad_eliminator]: 7.58999e-06 [cse]: 1.472e-05 [a_3]: 5.025e-05 [py_interpret_to_execute_after_opt_a]: 1.136e-05 [slice_cell_reuse_recomputed_activation]: 5.34e-06 [rewriter_after_opt_a]: 4.012e-05 [convert_after_rewriter]: 9.76998e-06 [order_py_execute_after_rewriter]: 8.57e-06 [mutable_eliminate]: 0.00053482 [opt_b]: 0.00026496, [1] [Cycle 1]: 0.00025631, [7] [b_1]: 0.00016205 [b_2]: 7.49002e-06 [updatestate_depend_eliminate]: 5.35999e-06 [updatestate_assign_eliminate]: 2.58e-06 [updatestate_loads_eliminate]: 2.21998e-06 [renormalize]: 7.59988e-07 [cse]: 1.976e-05 [optimize_parallel_all_gather_comm]: 1.95e-05 [overlap_param_gather]: 4.90999e-06 [cconv]: 2.946e-05 [loop_unroll]: 0.00043971 [opt_after_cconv]: 0.00012242, [1] [Cycle 1]: 0.00011439, [7] [c_1]: 2.776e-05 [parameter_eliminate]: 3.19001e-06 [updatestate_depend_eliminate]: 5.40999e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.37999e-06 [cse]: 1.905e-05 [renormalize]: 6.50005e-07 [remove_dup_value]: 1.824e-05 [tuple_transform]: 8.853e-05, [1] [Cycle 1]: 8.143e-05, [4] [d_1]: 4.145e-05 [none_parameter_eliminate]: 1.97001e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 6.90998e-06 [partial_unused_args_eliminate]: 4.48999e-06 [add_recomputation]: 4.956e-05 [cse_after_recomputation]: 2.776e-05, [1] [Cycle 1]: 2.077e-05, [1] [cse]: 1.16e-05 [environ_conv]: 8.25999e-06 [swap_dp_allreduce_reducescatter]: 7.54002e-06 [bias_add_comm_swap]: 6.14999e-06 [label_micro_interleaved_index]: 7.23e-06 [label_fine_grained_interleaved_index]: 5.52999e-06 [merge_cast_opt]: 3.9e-06 [slice_recompute_activation]: 5.05999e-06 [micro_interleaved_order_control]: 4.52e-06 [assign_add_opt]: 3.71999e-06 [ForceFp32Comm]: 3.23e-06 [remove_cast_before_assign_add]: 3.65998e-06 [full_micro_interleaved_order_control]: 4.62e-06 [reorder_send_recv_between_fp_bp]: 5.15999e-06 [comm_op_add_attrs]: 3.86001e-06 [add_comm_op_reuse_tag]: 3.31001e-06 [interleave_split_concat_branches]: 3.53999e-06 [interleave_parallel_branches]: 3.55e-06 [overlap_opt_shard_in_pipeline]: 4e-06 [overlap_opt_shard_grad_in_pipeline]: 4.54998e-06 [control_data_broadcast_order]: 1.625e-05 [grouped_pairwise_exchange_alltoall]: 3.85e-06 [offloading_packed_experts]: 7.23999e-06 [overlap_recompute_and_grad_model_parallel]: 7.48e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.56999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.53999e-06 [overlap_recompute_comm]: 5.29e-06 [overlap_grad_ring_attention]: 6.70002e-06 [overlap_grad_flash_sp]: 1.928e-05 [begin_end_overlap_inline]: 3.03e-06 [split_matmul_comm_elemetwise]: 4.92999e-06 [split_layernorm_comm]: 3.88001e-06 [handle_group_info]: 3.88001e-06 [symbol_engine_optimizer]: 9.743e-05, [1] [Cycle 1]: 9e-05, [6] [build]: 2.36e-06 [elim_shapecalc]: 9.50001e-06 [elim_not_effective]: 1.312e-05 [opt_reshape]: 6.98e-06 [fold_const_symbol]: 1.015e-05 [renormalize]: 2.20025e-07 [detach_backward]: 3.67998e-06 [pipeline_parallel_scheduler]: 2.09e-06 [auto_monad_reorder]: 1.833e-05 [get_jit_bprop_graph]: 1.70001e-06 [rewriter_after_jit_bprop_graph]: 4.47e-06 [opt_after_jit_grad]: 0.00050346 [validate]: 3.642e-05 Sums bootstrap : 0.000438s : 4.61% type_inference : 0.004793s : 50.52% event_method : 0.000013s : 0.13% auto_monad : 0.000056s : 0.59% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000027s : 0.28% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000020s : 0.21% optimize.rewriter_before_opt_a : 0.000050s : 0.53% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000032s : 0.34% optimize.opt_a.loop_unroll : 0.000020s : 0.21% optimize.opt_a.a_1 : 0.000394s : 4.15% optimize.opt_a.with_stream_mark : 0.000029s : 0.30% optimize.opt_a.recompute_prepare : 0.000015s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000205s : 2.16% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000013s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.25% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.16% optimize.opt_a.virtual_dataset : 0.000013s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000019s : 0.20% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000545s : 5.74% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.23% optimize.opt_a.cse : 0.000043s : 0.45% optimize.opt_a.a_3 : 0.000109s : 1.15% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.06% optimize.rewriter_after_opt_a : 0.000040s : 0.42% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000535s : 5.64% optimize.opt_b.b_1 : 0.000162s : 1.71% optimize.opt_b.b_2 : 0.000007s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000020s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000029s : 0.31% optimize.loop_unroll : 0.000440s : 4.63% optimize.opt_after_cconv.c_1 : 0.000028s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.cse : 0.000019s : 0.20% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.19% optimize.tuple_transform.d_1 : 0.000041s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.05% optimize.add_recomputation : 0.000050s : 0.52% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000008s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000006s : 0.06% optimize.label_micro_interleaved_index : 0.000007s : 0.08% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.06% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.05% optimize.control_data_broadcast_order : 0.000016s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.08% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.06% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000019s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000002s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.05% opt_after_jit_grad : 0.000503s : 5.31% validate : 0.000036s : 0.38% Time group info: ------[substitution.] 0.000110 19 1.75% : 0.000002s : 2: substitution.elim_not_effective 1.26% : 0.000001s : 2: substitution.fold_const_symbol 5.56% : 0.000006s : 3: substitution.graph_param_transform 80.58% : 0.000088s : 2: substitution.inline 3.13% : 0.000003s : 4: substitution.j_node_and_user_rematch 4.58% : 0.000005s : 4: substitution.remove_not_recompute_node 3.15% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004746 2 90.70% : 0.004304s : 1: type_inference.infer 9.30% : 0.000441s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000087 2 100.00% : 0.000087s : 2: match.inline ------[predicate.] 0.000133 754 0.87% : 0.000001s : 7: predicate.accumulaten_eliminater 1.15% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.69% : 0.000001s : 6: predicate.addn_check_dump 0.93% : 0.000001s : 7: predicate.addn_zero_filter 0.72% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.35% : 0.000003s : 13: predicate.arithmetic_simplify 0.78% : 0.000001s : 7: predicate.cast_eliminate 0.84% : 0.000001s : 6: predicate.check_bprop_eliminate 0.69% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.81% : 0.000001s : 6: predicate.depend_value_elim 0.77% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.90% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.81% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.21% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 3: predicate.elim_not_effective 0.53% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.02% : 0.000001s : 10: predicate.environ_get_depend_swap 1.86% : 0.000002s : 16: predicate.environ_get_eliminate 1.02% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.99% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.03% : 0.000003s : 9: predicate.float_depend_g_call 0.65% : 0.000001s : 6: predicate.float_environ_get_switch 0.96% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 3: predicate.fold_const_symbol 0.81% : 0.000001s : 6: predicate.get_grad_eliminate 0.27% : 0.000000s : 3: predicate.graph_param_transform 0.76% : 0.000001s : 6: predicate.incorporate_call 0.66% : 0.000001s : 6: predicate.incorporate_call_switch 6.29% : 0.000008s : 34: predicate.inline 1.01% : 0.000001s : 6: predicate.inline_without_move 0.47% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.07% : 0.000001s : 6: predicate.less_batch_normalization 1.80% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.18% : 0.000003s : 20: predicate.load_eliminater 1.30% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.64% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.85% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.73% : 0.000001s : 6: predicate.merge_addn 0.69% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.72% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.70% : 0.000001s : 7: predicate.minmaximum_grad 1.17% : 0.000002s : 3: predicate.mutable_eliminate 0.52% : 0.000001s : 3: predicate.opt_reshape 0.42% : 0.000001s : 3: predicate.parallel_virtual_node 1.51% : 0.000002s : 9: predicate.partial_defer_inline 1.21% : 0.000002s : 10: predicate.partial_eliminate 0.76% : 0.000001s : 7: predicate.print_const_string_wrapper 0.79% : 0.000001s : 6: predicate.reduce_all_const_elim 0.92% : 0.000001s : 7: predicate.reduce_eliminate 2.04% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.73% : 0.000001s : 6: predicate.remove_not_recompute_node 1.17% : 0.000002s : 13: predicate.replace_applicator 0.60% : 0.000001s : 6: predicate.replace_old_param 0.34% : 0.000000s : 3: predicate.reset_defer_inline 1.11% : 0.000001s : 7: predicate.reshape_eliminate 0.75% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 3: predicate.row_tensor_eliminate 1.00% : 0.000001s : 6: predicate.same_eliminate 0.56% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.01% : 0.000001s : 6: predicate.shard_identity_eliminate 1.06% : 0.000001s : 6: predicate.special_op_eliminate 1.01% : 0.000001s : 6: predicate.specialize_transform 1.17% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.99% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.01% : 0.000001s : 9: predicate.switch_defer_inline 1.70% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.30% : 0.000006s : 32: predicate.switch_simplify 0.78% : 0.000001s : 7: predicate.tile_eliminate 0.83% : 0.000001s : 7: predicate.transpose_eliminate 1.48% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.13% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.45% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.44% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.71% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.11% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 3.01% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 3: predicate.value_based_eliminate 0.81% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.74% : 0.000001s : 6: predicate.virtual_output_eliminate 0.39% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.59% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000217 5 7.80% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.20% : 0.000200s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023797 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.56% : 0.003228s : 1: add_attr 13.50% : 0.003212s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000053s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.27% : 0.000065s : 1: auto_monad 0.11% : 0.000026s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.04% : 0.000009s : 1: bias_add_comm_swap 2.01% : 0.000477s : 1: bootstrap 0.14% : 0.000033s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000031s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000018s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.10% : 0.000023s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000007s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.87% : 0.000445s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.27% : 0.000541s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 3.21% : 0.000764s : 78: opt.transform.opt_a 0.11% : 0.000026s : 1: opt.transform.opt_after_cconv 0.10% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.41% : 0.000097s : 28: opt.transform.opt_b 0.19% : 0.000046s : 2: opt.transform.opt_trans_graph 0.15% : 0.000036s : 4: opt.transform.symbol_engine_opt 10.46% : 0.002490s : 1: opt_a 0.53% : 0.000126s : 1: opt_after_cconv 2.16% : 0.000514s : 1: opt_after_jit_grad 1.13% : 0.000268s : 1: opt_b 21.24% : 0.005054s : 1: optimize 0.10% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.09% : 0.000023s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.05% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000034s : 1: pre_auto_parallel 0.10% : 0.000024s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000022s : 1: remove_dup_value 1.21% : 0.000287s : 1: renormalize.infer 1.05% : 0.000250s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000044s : 1: rewriter_after_opt_a 0.22% : 0.000053s : 1: rewriter_before_opt_a 0.04% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000100s : 1: symbol_engine_optimizer 0.38% : 0.000092s : 1: tuple_transform 20.27% : 0.004824s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:37.994.274 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0134595, [21] [bootstrap]: 0.00042212 [type_inference]: 0.00478412 [event_method]: 1.198e-05 [auto_monad]: 5.693e-05 [graph_reusing]: 5.83002e-06 [inline]: 2.47001e-06 [add_attr]: 0.00328383, [1] [add_attr_with_inline]: 0.00327478, [1] [Cycle 1]: 5.003e-05, [2] [tag_attr]: 1.492e-05 [meta_addattr_fg_expand]: 3.57997e-06 [parallel-infer-symbol]: 3.38999e-06 [pre_auto_parallel]: 2.709e-05 [insert-virtual-dataset]: 2.39001e-06 [parallel-infer-symbol-second]: 9.5999e-07 [dataset_repeat_opt]: 1.81e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.0041751, [53] [py_interpret_to_execute]: 1.8e-05 [rewriter_before_opt_a]: 4.661e-05 [opt_a]: 0.00216169, [2] [Cycle 1]: 0.00153141, [45] [expand_dump_flag]: 2.71e-06 [switch_simplify]: 2.399e-05 [loop_unroll]: 1.368e-05 [a_1]: 0.00029076 [with_stream_mark]: 1.673e-05 [recompute_prepare]: 7.92e-06 [updatestate_depend_eliminate]: 4.06001e-06 [updatestate_assign_eliminate]: 3.46999e-06 [updatestate_loads_eliminate]: 2.94999e-06 [parameter_eliminate]: 1.91e-06 [a_2]: 7.775e-05 [accelerated_algorithm]: 6.86001e-06 [shard]: 1.95001e-06 [meta_shard_fg_expand]: 2.11e-06 [shard_inline]: 6.76e-06 [merge_send_recv]: 8.67e-06 [auto_parallel]: 6.85998e-06 [parallel]: 1.867e-05 [flash_sp]: 7.7e-06 [merge_comm]: 3.75e-06 [allreduce_fusion]: 3.38e-06 [matmul_add_comm_reduction]: 8.48999e-06 [allreduce_slice_to_reducescatter]: 8.60018e-07 [virtual_shard_identity]: 7.71999e-06 [virtual_dataset]: 6.18998e-06 [get_grad_eliminate_]: 6.09001e-06 [virtual_output]: 5.87999e-06 [merge_forward]: 3.95998e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 9.72999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.316e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.057e-05 [set_forward_comm_id_for_comm_node_pass]: 3.8e-06 [meta_fg_expand]: 2.83998e-06 [flash_sp_send_recv_attached]: 2.69999e-06 [receive_attached]: 2.68998e-06 [after_resolve]: 9.84001e-06 [a_after_grad]: 9.26002e-06 [renormalize]: 0.000573 [add_forward_monad_depend]: 5.59e-06 [auto_monad_grad]: 2.68e-06 [auto_monad_eliminator]: 1.57e-05 [cse]: 2.823e-05 [a_3]: 4.708e-05 [Cycle 2]: 0.00062038, [45] [expand_dump_flag]: 9.79984e-07 [switch_simplify]: 8.11002e-06 [loop_unroll]: 5.99e-06 [a_1]: 0.00010595 [with_stream_mark]: 1.139e-05 [recompute_prepare]: 6.59999e-06 [updatestate_depend_eliminate]: 3.26999e-06 [updatestate_assign_eliminate]: 2.24001e-06 [updatestate_loads_eliminate]: 2.78e-06 [parameter_eliminate]: 1.20999e-06 [a_2]: 6.871e-05 [accelerated_algorithm]: 6.43e-06 [shard]: 1.92001e-06 [meta_shard_fg_expand]: 1.71e-06 [shard_inline]: 5.97001e-06 [merge_send_recv]: 4.79998e-06 [auto_parallel]: 5.73002e-06 [parallel]: 5.15999e-06 [flash_sp]: 3.5e-06 [merge_comm]: 3.85e-06 [allreduce_fusion]: 3.53999e-06 [matmul_add_comm_reduction]: 6.09001e-06 [allreduce_slice_to_reducescatter]: 2.89991e-07 [virtual_shard_identity]: 6.95002e-06 [virtual_dataset]: 5.90002e-06 [get_grad_eliminate_]: 5.30999e-06 [virtual_output]: 5.20999e-06 [merge_forward]: 3.11999e-06 [cell_reuse_recompute_pass]: 1.48002e-06 [offload_activation]: 6.90998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.332e-05 [merge_recompute_call_nodes]: 8.09989e-07 [before_grad]: 9.56e-06 [set_forward_comm_id_for_comm_node_pass]: 4.08999e-06 [meta_fg_expand]: 2.48998e-06 [flash_sp_send_recv_attached]: 1.08001e-06 [receive_attached]: 1.37e-06 [after_resolve]: 8.70001e-06 [a_after_grad]: 8.37998e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 2.46e-06 [auto_monad_grad]: 1.00001e-06 [auto_monad_eliminator]: 6.94999e-06 [cse]: 1.531e-05 [a_3]: 3.422e-05 [py_interpret_to_execute_after_opt_a]: 1.054e-05 [slice_cell_reuse_recomputed_activation]: 1.96e-06 [rewriter_after_opt_a]: 3.515e-05 [convert_after_rewriter]: 6.53998e-06 [order_py_execute_after_rewriter]: 5.02e-06 [mutable_eliminate]: 0.00055626 [opt_b]: 0.00020374, [1] [Cycle 1]: 0.00019697, [7] [b_1]: 0.00011946 [b_2]: 7.15998e-06 [updatestate_depend_eliminate]: 6.68e-06 [updatestate_assign_eliminate]: 2.68e-06 [updatestate_loads_eliminate]: 2.64999e-06 [renormalize]: 5.59987e-07 [cse]: 2.101e-05 [optimize_parallel_all_gather_comm]: 1.587e-05 [overlap_param_gather]: 1.93002e-06 [cconv]: 2.662e-05 [loop_unroll]: 0.00043692 [opt_after_cconv]: 9.978e-05, [1] [Cycle 1]: 9.418e-05, [7] [c_1]: 2.743e-05 [parameter_eliminate]: 3.03998e-06 [updatestate_depend_eliminate]: 5.14998e-06 [updatestate_assign_eliminate]: 2.49999e-06 [updatestate_loads_eliminate]: 2.43e-06 [cse]: 1.849e-05 [renormalize]: 3.10014e-07 [remove_dup_value]: 1.638e-05 [tuple_transform]: 7.039e-05, [1] [Cycle 1]: 6.558e-05, [4] [d_1]: 3.826e-05 [none_parameter_eliminate]: 1.96e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.93998e-06 [partial_unused_args_eliminate]: 1.78002e-06 [add_recomputation]: 4.716e-05 [cse_after_recomputation]: 2.197e-05, [1] [Cycle 1]: 1.744e-05, [1] [cse]: 1.196e-05 [environ_conv]: 5.44998e-06 [swap_dp_allreduce_reducescatter]: 5.42001e-06 [bias_add_comm_swap]: 3.03e-06 [label_micro_interleaved_index]: 4.43001e-06 [label_fine_grained_interleaved_index]: 3.06001e-06 [merge_cast_opt]: 1.29e-06 [slice_recompute_activation]: 2.12001e-06 [micro_interleaved_order_control]: 2.14e-06 [assign_add_opt]: 1.18001e-06 [ForceFp32Comm]: 8.00006e-07 [remove_cast_before_assign_add]: 9.89996e-07 [full_micro_interleaved_order_control]: 2.54001e-06 [reorder_send_recv_between_fp_bp]: 2.78e-06 [comm_op_add_attrs]: 1.03001e-06 [add_comm_op_reuse_tag]: 1.03001e-06 [interleave_split_concat_branches]: 1.09998e-06 [interleave_parallel_branches]: 1.00999e-06 [overlap_opt_shard_in_pipeline]: 1.16002e-06 [overlap_opt_shard_grad_in_pipeline]: 2.09e-06 [control_data_broadcast_order]: 1.195e-05 [grouped_pairwise_exchange_alltoall]: 1.57999e-06 [offloading_packed_experts]: 4.03999e-06 [overlap_recompute_and_grad_model_parallel]: 4.75999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.25001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32e-06 [overlap_recompute_comm]: 2.11e-06 [overlap_grad_ring_attention]: 4.32e-06 [overlap_grad_flash_sp]: 1.838e-05 [begin_end_overlap_inline]: 5.09986e-07 [split_matmul_comm_elemetwise]: 2.11e-06 [split_layernorm_comm]: 1.96e-06 [handle_group_info]: 1.35999e-06 [symbol_engine_optimizer]: 7.442e-05, [1] [Cycle 1]: 6.998e-05, [6] [build]: 2.61e-06 [elim_shapecalc]: 9.92001e-06 [elim_not_effective]: 1.236e-05 [opt_reshape]: 6.74999e-06 [fold_const_symbol]: 9.50001e-06 [renormalize]: 2.19996e-07 [detach_backward]: 2.46e-06 [pipeline_parallel_scheduler]: 1.77001e-06 [auto_monad_reorder]: 1.622e-05 [get_jit_bprop_graph]: 1.81e-06 [rewriter_after_jit_bprop_graph]: 4.55999e-06 [opt_after_jit_grad]: 0.00046608 [validate]: 4.206e-05 Sums bootstrap : 0.000422s : 4.58% type_inference : 0.004784s : 51.88% event_method : 0.000012s : 0.13% auto_monad : 0.000057s : 0.62% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.04% pre_auto_parallel : 0.000027s : 0.29% insert-virtual-dataset : 0.000002s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000018s : 0.20% optimize.rewriter_before_opt_a : 0.000047s : 0.51% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000032s : 0.35% optimize.opt_a.loop_unroll : 0.000020s : 0.21% optimize.opt_a.a_1 : 0.000397s : 4.30% optimize.opt_a.with_stream_mark : 0.000028s : 0.30% optimize.opt_a.recompute_prepare : 0.000015s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000146s : 1.59% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.14% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.14% optimize.opt_a.merge_send_recv : 0.000013s : 0.15% optimize.opt_a.auto_parallel : 0.000013s : 0.14% optimize.opt_a.parallel : 0.000024s : 0.26% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000011s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000019s : 0.20% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000573s : 6.21% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.09% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.25% optimize.opt_a.cse : 0.000044s : 0.47% optimize.opt_a.a_3 : 0.000081s : 0.88% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000035s : 0.38% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000556s : 6.03% optimize.opt_b.b_1 : 0.000119s : 1.30% optimize.opt_b.b_2 : 0.000007s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000021s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000027s : 0.29% optimize.loop_unroll : 0.000437s : 4.74% optimize.opt_after_cconv.c_1 : 0.000027s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.18% optimize.tuple_transform.d_1 : 0.000038s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000047s : 0.51% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000005s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.05% optimize.overlap_grad_flash_sp : 0.000018s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000016s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000466s : 5.05% validate : 0.000042s : 0.46% Time group info: ------[substitution.] 0.000112 19 1.71% : 0.000002s : 2: substitution.elim_not_effective 1.22% : 0.000001s : 2: substitution.fold_const_symbol 4.95% : 0.000006s : 3: substitution.graph_param_transform 81.80% : 0.000092s : 2: substitution.inline 3.28% : 0.000004s : 4: substitution.j_node_and_user_rematch 4.12% : 0.000005s : 4: substitution.remove_not_recompute_node 2.93% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004740 2 91.06% : 0.004316s : 1: type_inference.infer 8.94% : 0.000424s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000090 2 100.00% : 0.000090s : 2: match.inline ------[predicate.] 0.000134 754 0.81% : 0.000001s : 7: predicate.accumulaten_eliminater 1.23% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.63% : 0.000001s : 6: predicate.addn_check_dump 0.81% : 0.000001s : 7: predicate.addn_zero_filter 0.69% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.29% : 0.000003s : 13: predicate.arithmetic_simplify 0.89% : 0.000001s : 7: predicate.cast_eliminate 0.72% : 0.000001s : 6: predicate.check_bprop_eliminate 0.62% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.73% : 0.000001s : 6: predicate.depend_value_elim 0.77% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 1.03% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.76% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.23% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.31% : 0.000000s : 3: predicate.elim_not_effective 0.67% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.02% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.96% : 0.000001s : 10: predicate.environ_get_depend_swap 1.71% : 0.000002s : 16: predicate.environ_get_eliminate 1.01% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.91% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.01% : 0.000003s : 9: predicate.float_depend_g_call 0.61% : 0.000001s : 6: predicate.float_environ_get_switch 0.90% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 3: predicate.fold_const_symbol 0.75% : 0.000001s : 6: predicate.get_grad_eliminate 0.37% : 0.000000s : 3: predicate.graph_param_transform 0.77% : 0.000001s : 6: predicate.incorporate_call 0.60% : 0.000001s : 6: predicate.incorporate_call_switch 6.61% : 0.000009s : 34: predicate.inline 0.98% : 0.000001s : 6: predicate.inline_without_move 0.41% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.37% : 0.000002s : 6: predicate.less_batch_normalization 1.58% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.47% : 0.000003s : 20: predicate.load_eliminater 1.39% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.58% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.65% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.64% : 0.000001s : 6: predicate.merge_addn 0.72% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.67% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 7: predicate.minmaximum_grad 1.88% : 0.000003s : 3: predicate.mutable_eliminate 0.47% : 0.000001s : 3: predicate.opt_reshape 0.50% : 0.000001s : 3: predicate.parallel_virtual_node 1.23% : 0.000002s : 9: predicate.partial_defer_inline 1.23% : 0.000002s : 10: predicate.partial_eliminate 0.74% : 0.000001s : 7: predicate.print_const_string_wrapper 0.69% : 0.000001s : 6: predicate.reduce_all_const_elim 1.31% : 0.000002s : 7: predicate.reduce_eliminate 2.02% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 1.05% : 0.000001s : 6: predicate.remove_not_recompute_node 1.10% : 0.000001s : 13: predicate.replace_applicator 0.65% : 0.000001s : 6: predicate.replace_old_param 0.66% : 0.000001s : 3: predicate.reset_defer_inline 0.80% : 0.000001s : 7: predicate.reshape_eliminate 0.85% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 3: predicate.row_tensor_eliminate 1.04% : 0.000001s : 6: predicate.same_eliminate 0.56% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.96% : 0.000001s : 6: predicate.shard_identity_eliminate 0.85% : 0.000001s : 6: predicate.special_op_eliminate 0.90% : 0.000001s : 6: predicate.specialize_transform 1.10% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.99% : 0.000001s : 9: predicate.switch_defer_inline 1.71% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.48% : 0.000006s : 32: predicate.switch_simplify 0.75% : 0.000001s : 7: predicate.tile_eliminate 0.81% : 0.000001s : 7: predicate.transpose_eliminate 1.62% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.20% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.41% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.20% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.71% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.91% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.67% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 3: predicate.value_based_eliminate 0.81% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.79% : 0.000001s : 6: predicate.virtual_output_eliminate 0.34% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.68% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000214 5 8.24% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.76% : 0.000196s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022368 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.71% : 0.003289s : 1: add_attr 14.66% : 0.003278s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000051s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.28% : 0.000062s : 1: auto_monad 0.09% : 0.000020s : 1: auto_monad_reorder 0.02% : 0.000003s : 1: begin_end_overlap_inline 0.03% : 0.000006s : 1: bias_add_comm_swap 1.99% : 0.000446s : 1: bootstrap 0.14% : 0.000030s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000015s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.03% : 0.000006s : 1: detach_backward 0.04% : 0.000008s : 1: environ_conv 0.08% : 0.000018s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.99% : 0.000445s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.53% : 0.000565s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 3.39% : 0.000758s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.10% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.42% : 0.000094s : 28: opt.transform.opt_b 0.19% : 0.000043s : 2: opt.transform.opt_trans_graph 0.15% : 0.000035s : 4: opt.transform.symbol_engine_opt 9.68% : 0.002165s : 1: opt_a 0.46% : 0.000103s : 1: opt_after_cconv 2.12% : 0.000475s : 1: opt_after_jit_grad 0.93% : 0.000207s : 1: opt_b 18.69% : 0.004180s : 1: optimize 0.09% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000022s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000031s : 1: pre_auto_parallel 0.10% : 0.000022s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000020s : 1: remove_dup_value 1.35% : 0.000303s : 1: renormalize.infer 1.17% : 0.000262s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000039s : 1: rewriter_after_opt_a 0.23% : 0.000050s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000077s : 1: symbol_engine_optimizer 0.33% : 0.000073s : 1: tuple_transform 21.47% : 0.004802s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:38.215.111 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:38.215.424 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0159603, [21] [bootstrap]: 0.00045904 [type_inference]: 0.00511516 [event_method]: 1.378e-05 [auto_monad]: 6.174e-05 [graph_reusing]: 6.16998e-06 [inline]: 2.94001e-06 [add_attr]: 0.00347792, [1] [add_attr_with_inline]: 0.00346795, [1] [Cycle 1]: 7.993e-05, [2] [tag_attr]: 1.76e-05 [meta_addattr_fg_expand]: 4.18999e-06 [parallel-infer-symbol]: 3.16999e-06 [pre_auto_parallel]: 3.066e-05 [insert-virtual-dataset]: 2.34001e-06 [parallel-infer-symbol-second]: 7.60017e-07 [dataset_repeat_opt]: 1.72999e-06 [pipeline_split]: 1.65001e-06 [optimize]: 0.00554586, [53] [py_interpret_to_execute]: 2.497e-05 [rewriter_before_opt_a]: 5.919e-05 [opt_a]: 0.00298301, [2] [Cycle 1]: 0.00206693, [45] [expand_dump_flag]: 2.99999e-06 [switch_simplify]: 2.68e-05 [loop_unroll]: 1.588e-05 [a_1]: 0.00036945 [with_stream_mark]: 1.769e-05 [recompute_prepare]: 1.049e-05 [updatestate_depend_eliminate]: 4.94998e-06 [updatestate_assign_eliminate]: 3.98999e-06 [updatestate_loads_eliminate]: 3.70998e-06 [parameter_eliminate]: 1.94e-06 [a_2]: 0.00014468 [accelerated_algorithm]: 8.42e-06 [shard]: 2.34001e-06 [meta_shard_fg_expand]: 2.21e-06 [shard_inline]: 7.8e-06 [merge_send_recv]: 9.87001e-06 [auto_parallel]: 8.46002e-06 [parallel]: 1.92e-05 [flash_sp]: 9.42001e-06 [merge_comm]: 5.61003e-06 [allreduce_fusion]: 4.73001e-06 [matmul_add_comm_reduction]: 1.065e-05 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 9.20001e-06 [virtual_dataset]: 7.45e-06 [get_grad_eliminate_]: 7.51001e-06 [virtual_output]: 7.45e-06 [merge_forward]: 4.37e-06 [cell_reuse_recompute_pass]: 1.61998e-06 [offload_activation]: 1.187e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.844e-05 [merge_recompute_call_nodes]: 1.64998e-06 [before_grad]: 1.371e-05 [set_forward_comm_id_for_comm_node_pass]: 4.80001e-06 [meta_fg_expand]: 3.43999e-06 [flash_sp_send_recv_attached]: 2.39999e-06 [receive_attached]: 2.39001e-06 [after_resolve]: 1.133e-05 [a_after_grad]: 1.146e-05 [renormalize]: 0.00073348 [add_forward_monad_depend]: 5.82001e-06 [auto_monad_grad]: 2.88998e-06 [auto_monad_eliminator]: 1.739e-05 [cse]: 3.418e-05 [a_3]: 7.271e-05 [Cycle 2]: 0.00090218, [45] [expand_dump_flag]: 1.80001e-06 [switch_simplify]: 9.15001e-06 [loop_unroll]: 7.58001e-06 [a_1]: 0.00015315 [with_stream_mark]: 1.357e-05 [recompute_prepare]: 7.7e-06 [updatestate_depend_eliminate]: 4.31002e-06 [updatestate_assign_eliminate]: 2.97002e-06 [updatestate_loads_eliminate]: 2.93e-06 [parameter_eliminate]: 1.36002e-06 [a_2]: 0.00011469 [accelerated_algorithm]: 7.31999e-06 [shard]: 1.76e-06 [meta_shard_fg_expand]: 1.87001e-06 [shard_inline]: 7.51001e-06 [merge_send_recv]: 7.15e-06 [auto_parallel]: 6.61e-06 [parallel]: 5.88002e-06 [flash_sp]: 3.45e-06 [merge_comm]: 5.00001e-06 [allreduce_fusion]: 4.09002e-06 [matmul_add_comm_reduction]: 7.23e-06 [allreduce_slice_to_reducescatter]: 5.69999e-07 [virtual_shard_identity]: 8.15999e-06 [virtual_dataset]: 6.98998e-06 [get_grad_eliminate_]: 6.66e-06 [virtual_output]: 6.66e-06 [merge_forward]: 3.79002e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 7.56999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.702e-05 [merge_recompute_call_nodes]: 9.49978e-07 [before_grad]: 1.135e-05 [set_forward_comm_id_for_comm_node_pass]: 5.77999e-06 [meta_fg_expand]: 3.09001e-06 [flash_sp_send_recv_attached]: 1.12e-06 [receive_attached]: 1.37999e-06 [after_resolve]: 9.54999e-06 [a_after_grad]: 1.046e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.94999e-06 [auto_monad_grad]: 1.49e-06 [auto_monad_eliminator]: 8.84e-06 [cse]: 1.977e-05 [a_3]: 5.597e-05 [py_interpret_to_execute_after_opt_a]: 1.455e-05 [slice_cell_reuse_recomputed_activation]: 4.68999e-06 [rewriter_after_opt_a]: 4.486e-05 [convert_after_rewriter]: 1.03e-05 [order_py_execute_after_rewriter]: 9.05001e-06 [mutable_eliminate]: 0.00065012 [opt_b]: 0.00030446, [1] [Cycle 1]: 0.00029535, [7] [b_1]: 0.00019181 [b_2]: 9.07001e-06 [updatestate_depend_eliminate]: 6.91001e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 3.31999e-06 [renormalize]: 4.50003e-07 [cse]: 2.43e-05 [optimize_parallel_all_gather_comm]: 2.093e-05 [overlap_param_gather]: 4.91002e-06 [cconv]: 3.131e-05 [loop_unroll]: 0.00047077 [opt_after_cconv]: 0.00017555, [1] [Cycle 1]: 0.0001657, [7] [c_1]: 3.56e-05 [parameter_eliminate]: 3.21999e-06 [updatestate_depend_eliminate]: 6.39999e-06 [updatestate_assign_eliminate]: 3.53999e-06 [updatestate_loads_eliminate]: 3.58999e-06 [cse]: 2.402e-05 [renormalize]: 3.00002e-07 [remove_dup_value]: 2.115e-05 [tuple_transform]: 0.00010163, [1] [Cycle 1]: 9.324e-05, [4] [d_1]: 5.065e-05 [none_parameter_eliminate]: 2.48e-06 [renormalize]: 2.89991e-07 [switch_simplify]: 8.55001e-06 [partial_unused_args_eliminate]: 5.04e-06 [add_recomputation]: 5.802e-05 [cse_after_recomputation]: 3.333e-05, [1] [Cycle 1]: 2.587e-05, [1] [cse]: 1.672e-05 [environ_conv]: 1.1e-05 [swap_dp_allreduce_reducescatter]: 8.69998e-06 [bias_add_comm_swap]: 5.16998e-06 [label_micro_interleaved_index]: 7.35e-06 [label_fine_grained_interleaved_index]: 5.20999e-06 [merge_cast_opt]: 4.4e-06 [slice_recompute_activation]: 4.58999e-06 [micro_interleaved_order_control]: 4.79002e-06 [assign_add_opt]: 3.99002e-06 [ForceFp32Comm]: 3.57002e-06 [remove_cast_before_assign_add]: 3.53999e-06 [full_micro_interleaved_order_control]: 4.86997e-06 [reorder_send_recv_between_fp_bp]: 5.77999e-06 [comm_op_add_attrs]: 3.68999e-06 [add_comm_op_reuse_tag]: 3.35e-06 [interleave_split_concat_branches]: 3.46999e-06 [interleave_parallel_branches]: 3.5e-06 [overlap_opt_shard_in_pipeline]: 3.85e-06 [overlap_opt_shard_grad_in_pipeline]: 4.46002e-06 [control_data_broadcast_order]: 1.852e-05 [grouped_pairwise_exchange_alltoall]: 4.37e-06 [offloading_packed_experts]: 6.91999e-06 [overlap_recompute_and_grad_model_parallel]: 7.63001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.58e-06 [overlap_recompute_allgather_and_fa_grad]: 3.71001e-06 [overlap_recompute_comm]: 5.20001e-06 [overlap_grad_ring_attention]: 7.25e-06 [overlap_grad_flash_sp]: 2.379e-05 [begin_end_overlap_inline]: 3.06999e-06 [split_matmul_comm_elemetwise]: 4.67e-06 [split_layernorm_comm]: 4.37e-06 [handle_group_info]: 3.88999e-06 [symbol_engine_optimizer]: 0.00010839, [1] [Cycle 1]: 0.00010084, [6] [build]: 3.8e-06 [elim_shapecalc]: 1.224e-05 [elim_not_effective]: 1.522e-05 [opt_reshape]: 8.42e-06 [fold_const_symbol]: 1.352e-05 [renormalize]: 2.00002e-07 [detach_backward]: 4.58001e-06 [pipeline_parallel_scheduler]: 1.70001e-06 [auto_monad_reorder]: 2.466e-05 [get_jit_bprop_graph]: 1.87999e-06 [rewriter_after_jit_bprop_graph]: 5.07e-06 [opt_after_jit_grad]: 0.00054691 [validate]: 4.387e-05 Sums bootstrap : 0.000459s : 4.30% type_inference : 0.005115s : 47.96% event_method : 0.000014s : 0.13% auto_monad : 0.000062s : 0.58% graph_reusing : 0.000006s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000031s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000025s : 0.23% optimize.rewriter_before_opt_a : 0.000059s : 0.56% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000036s : 0.34% optimize.opt_a.loop_unroll : 0.000023s : 0.22% optimize.opt_a.a_1 : 0.000523s : 4.90% optimize.opt_a.with_stream_mark : 0.000031s : 0.29% optimize.opt_a.recompute_prepare : 0.000018s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000259s : 2.43% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.15% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.14% optimize.opt_a.merge_send_recv : 0.000017s : 0.16% optimize.opt_a.auto_parallel : 0.000015s : 0.14% optimize.opt_a.parallel : 0.000025s : 0.24% optimize.opt_a.flash_sp : 0.000013s : 0.12% optimize.opt_a.merge_comm : 0.000011s : 0.10% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.16% optimize.opt_a.virtual_dataset : 0.000014s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.10% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000021s : 0.20% optimize.opt_a.a_after_grad : 0.000022s : 0.21% optimize.opt_a.renormalize : 0.000734s : 6.88% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.25% optimize.opt_a.cse : 0.000054s : 0.51% optimize.opt_a.a_3 : 0.000129s : 1.21% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000045s : 0.42% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000650s : 6.10% optimize.opt_b.b_1 : 0.000192s : 1.80% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000031s : 0.29% optimize.loop_unroll : 0.000471s : 4.41% optimize.opt_after_cconv.c_1 : 0.000036s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000021s : 0.20% optimize.tuple_transform.d_1 : 0.000051s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000058s : 0.54% optimize.cse_after_recomputation.cse : 0.000017s : 0.16% optimize.environ_conv : 0.000011s : 0.10% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000019s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000024s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000025s : 0.23% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000547s : 5.13% validate : 0.000044s : 0.41% Time group info: ------[substitution.] 0.000161 28 15.18% : 0.000025s : 2: substitution.cast_eliminate 1.40% : 0.000002s : 3: substitution.elim_not_effective 1.34% : 0.000002s : 3: substitution.fold_const_symbol 4.10% : 0.000007s : 4: substitution.graph_param_transform 69.38% : 0.000112s : 2: substitution.inline 2.63% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.90% : 0.000006s : 6: substitution.remove_not_recompute_node 2.07% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.005063 2 86.46% : 0.004378s : 1: type_inference.infer 13.54% : 0.000686s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000110 2 100.00% : 0.000110s : 2: match.inline ------[predicate.] 0.000174 980 0.83% : 0.000001s : 9: predicate.accumulaten_eliminater 1.00% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.66% : 0.000001s : 8: predicate.addn_check_dump 0.85% : 0.000001s : 9: predicate.addn_zero_filter 0.66% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.40% : 0.000004s : 17: predicate.arithmetic_simplify 0.89% : 0.000002s : 9: predicate.cast_eliminate 0.81% : 0.000001s : 8: predicate.check_bprop_eliminate 0.72% : 0.000001s : 8: predicate.compare_switch_simplify 0.23% : 0.000000s : 4: predicate.const_output_eliminate 0.78% : 0.000001s : 8: predicate.depend_value_elim 0.80% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.91% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.80% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.38% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 4: predicate.elim_not_effective 0.50% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.02% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.08% : 0.000002s : 13: predicate.environ_get_depend_swap 1.79% : 0.000003s : 21: predicate.environ_get_eliminate 1.07% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.88% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.66% : 0.000003s : 11: predicate.float_depend_g_call 0.78% : 0.000001s : 8: predicate.float_environ_get_switch 1.03% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 4: predicate.fold_const_symbol 0.86% : 0.000002s : 8: predicate.get_grad_eliminate 0.29% : 0.000001s : 4: predicate.graph_param_transform 0.78% : 0.000001s : 8: predicate.incorporate_call 0.64% : 0.000001s : 8: predicate.incorporate_call_switch 6.64% : 0.000012s : 44: predicate.inline 1.05% : 0.000002s : 8: predicate.inline_without_move 0.37% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.04% : 0.000002s : 8: predicate.less_batch_normalization 1.71% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.17% : 0.000004s : 26: predicate.load_eliminater 1.28% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.67% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.73% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.80% : 0.000001s : 8: predicate.merge_addn 0.68% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.71% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.75% : 0.000001s : 9: predicate.minmaximum_grad 1.51% : 0.000003s : 4: predicate.mutable_eliminate 0.43% : 0.000001s : 4: predicate.opt_reshape 0.50% : 0.000001s : 4: predicate.parallel_virtual_node 1.27% : 0.000002s : 11: predicate.partial_defer_inline 1.25% : 0.000002s : 13: predicate.partial_eliminate 0.73% : 0.000001s : 9: predicate.print_const_string_wrapper 0.78% : 0.000001s : 8: predicate.reduce_all_const_elim 1.22% : 0.000002s : 9: predicate.reduce_eliminate 2.11% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 8: predicate.remove_not_recompute_node 1.12% : 0.000002s : 17: predicate.replace_applicator 0.68% : 0.000001s : 8: predicate.replace_old_param 0.35% : 0.000001s : 4: predicate.reset_defer_inline 0.80% : 0.000001s : 9: predicate.reshape_eliminate 0.81% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 4: predicate.row_tensor_eliminate 1.04% : 0.000002s : 8: predicate.same_eliminate 0.49% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.01% : 0.000002s : 8: predicate.shard_identity_eliminate 0.87% : 0.000002s : 8: predicate.special_op_eliminate 0.95% : 0.000002s : 8: predicate.specialize_transform 1.04% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.54% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.15% : 0.000002s : 11: predicate.switch_defer_inline 1.67% : 0.000003s : 19: predicate.switch_layer_defer_inline 3.99% : 0.000007s : 39: predicate.switch_simplify 0.77% : 0.000001s : 9: predicate.tile_eliminate 0.83% : 0.000001s : 9: predicate.transpose_eliminate 1.75% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.04% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.45% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.56% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.64% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.17% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.92% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.62% : 0.000001s : 4: predicate.value_based_eliminate 0.75% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.95% : 0.000002s : 8: predicate.virtual_output_eliminate 0.43% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000236 5 8.17% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.83% : 0.000216s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026893 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.97% : 0.003488s : 1: add_attr 12.91% : 0.003472s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000062s : 1: add_recomputation 0.03% : 0.000007s : 1: assign_add_opt 0.26% : 0.000070s : 1: auto_monad 0.12% : 0.000032s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.87% : 0.000502s : 1: bootstrap 0.13% : 0.000035s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000022s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.14% : 0.000036s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000020s : 1: detach_backward 0.05% : 0.000014s : 1: environ_conv 0.09% : 0.000024s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000013s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.78% : 0.000477s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.44% : 0.000657s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 3.68% : 0.000988s : 78: opt.transform.opt_a 0.13% : 0.000034s : 1: opt.transform.opt_after_cconv 0.12% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.47% : 0.000127s : 28: opt.transform.opt_b 0.21% : 0.000056s : 2: opt.transform.opt_trans_graph 0.17% : 0.000046s : 4: opt.transform.symbol_engine_opt 11.10% : 0.002986s : 1: opt_a 0.67% : 0.000179s : 1: opt_after_cconv 2.07% : 0.000558s : 1: opt_after_jit_grad 1.15% : 0.000308s : 1: opt_b 21.97% : 0.005908s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000038s : 1: pre_auto_parallel 0.11% : 0.000028s : 1: py_interpret_to_execute 0.07% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000025s : 1: remove_dup_value 1.59% : 0.000429s : 1: renormalize.infer 1.10% : 0.000297s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000049s : 1: rewriter_after_opt_a 0.23% : 0.000063s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.41% : 0.000111s : 1: symbol_engine_optimizer 0.39% : 0.000104s : 1: tuple_transform 19.16% : 0.005154s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:38.419.586 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0145404, [21] [bootstrap]: 0.00046005 [type_inference]: 0.00513373 [event_method]: 1.294e-05 [auto_monad]: 5.57e-05 [graph_reusing]: 4.79e-06 [inline]: 3.08e-06 [add_attr]: 0.00318707, [1] [add_attr_with_inline]: 0.00317713, [1] [Cycle 1]: 5.885e-05, [2] [tag_attr]: 1.563e-05 [meta_addattr_fg_expand]: 4.03999e-06 [parallel-infer-symbol]: 3.63e-06 [pre_auto_parallel]: 2.986e-05 [insert-virtual-dataset]: 2.69001e-06 [parallel-infer-symbol-second]: 1.29e-06 [dataset_repeat_opt]: 2.01e-06 [pipeline_split]: 1.84e-06 [optimize]: 0.0049344, [53] [py_interpret_to_execute]: 2.142e-05 [rewriter_before_opt_a]: 7.328e-05 [opt_a]: 0.00269533, [2] [Cycle 1]: 0.00191918, [45] [expand_dump_flag]: 3.51001e-06 [switch_simplify]: 2.67e-05 [loop_unroll]: 1.489e-05 [a_1]: 0.00037416 [with_stream_mark]: 1.865e-05 [recompute_prepare]: 1.043e-05 [updatestate_depend_eliminate]: 4.67e-06 [updatestate_assign_eliminate]: 3.87998e-06 [updatestate_loads_eliminate]: 3.68e-06 [parameter_eliminate]: 1.99999e-06 [a_2]: 0.00010055 [accelerated_algorithm]: 9.02999e-06 [shard]: 2.39999e-06 [meta_shard_fg_expand]: 2.70002e-06 [shard_inline]: 8.83001e-06 [merge_send_recv]: 1.056e-05 [auto_parallel]: 7.71001e-06 [parallel]: 1.913e-05 [flash_sp]: 8.85999e-06 [merge_comm]: 4.89e-06 [allreduce_fusion]: 4.74e-06 [matmul_add_comm_reduction]: 1.167e-05 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 1.024e-05 [virtual_dataset]: 7.70998e-06 [get_grad_eliminate_]: 7.4e-06 [virtual_output]: 7.98999e-06 [merge_forward]: 5.21002e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 1.197e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.588e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.456e-05 [set_forward_comm_id_for_comm_node_pass]: 5.00001e-06 [meta_fg_expand]: 3.55998e-06 [flash_sp_send_recv_attached]: 2.85998e-06 [receive_attached]: 1.96e-06 [after_resolve]: 1.116e-05 [a_after_grad]: 1.133e-05 [renormalize]: 0.00076041 [add_forward_monad_depend]: 5.32001e-06 [auto_monad_grad]: 2.68003e-06 [auto_monad_eliminator]: 1.737e-05 [cse]: 3.946e-05 [a_3]: 5.872e-05 [Cycle 2]: 0.00076558, [45] [expand_dump_flag]: 2.02001e-06 [switch_simplify]: 8.74003e-06 [loop_unroll]: 8.17e-06 [a_1]: 0.00016312 [with_stream_mark]: 1.379e-05 [recompute_prepare]: 8.13001e-06 [updatestate_depend_eliminate]: 5.29e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 2.89001e-06 [parameter_eliminate]: 1.14e-06 [a_2]: 9.07e-05 [accelerated_algorithm]: 8.13999e-06 [shard]: 1.24998e-06 [meta_shard_fg_expand]: 1.79e-06 [shard_inline]: 7.21999e-06 [merge_send_recv]: 5.76e-06 [auto_parallel]: 6.42001e-06 [parallel]: 6.01e-06 [flash_sp]: 3.5e-06 [merge_comm]: 4.32e-06 [allreduce_fusion]: 3.94002e-06 [matmul_add_comm_reduction]: 7e-06 [allreduce_slice_to_reducescatter]: 5.3001e-07 [virtual_shard_identity]: 9.23002e-06 [virtual_dataset]: 7.21001e-06 [get_grad_eliminate_]: 6.58998e-06 [virtual_output]: 6.48998e-06 [merge_forward]: 3.51999e-06 [cell_reuse_recompute_pass]: 1.57999e-06 [offload_activation]: 7.9e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.604e-05 [merge_recompute_call_nodes]: 8.10018e-07 [before_grad]: 1.253e-05 [set_forward_comm_id_for_comm_node_pass]: 4.68999e-06 [meta_fg_expand]: 3.33998e-06 [flash_sp_send_recv_attached]: 9.29984e-07 [receive_attached]: 1.81e-06 [after_resolve]: 1.065e-05 [a_after_grad]: 1.079e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.73002e-06 [auto_monad_grad]: 9.89996e-07 [auto_monad_eliminator]: 1.039e-05 [cse]: 1.984e-05 [a_3]: 4.558e-05 [py_interpret_to_execute_after_opt_a]: 1.412e-05 [slice_cell_reuse_recomputed_activation]: 2.42001e-06 [rewriter_after_opt_a]: 4.497e-05 [convert_after_rewriter]: 8.38001e-06 [order_py_execute_after_rewriter]: 6.14999e-06 [mutable_eliminate]: 0.00060848 [opt_b]: 0.00023783, [1] [Cycle 1]: 0.00023086, [7] [b_1]: 0.0001465 [b_2]: 8.23999e-06 [updatestate_depend_eliminate]: 7.35e-06 [updatestate_assign_eliminate]: 2.99999e-06 [updatestate_loads_eliminate]: 3.46001e-06 [renormalize]: 7.30011e-07 [cse]: 2.57e-05 [optimize_parallel_all_gather_comm]: 1.726e-05 [overlap_param_gather]: 1.82999e-06 [cconv]: 2.865e-05 [loop_unroll]: 0.00045748 [opt_after_cconv]: 0.00011335, [1] [Cycle 1]: 0.00010788, [7] [c_1]: 3.394e-05 [parameter_eliminate]: 3.17002e-06 [updatestate_depend_eliminate]: 6.01998e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 2.96001e-06 [cse]: 2.327e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 1.787e-05 [tuple_transform]: 8.069e-05, [1] [Cycle 1]: 7.619e-05, [4] [d_1]: 4.777e-05 [none_parameter_eliminate]: 1.75001e-06 [renormalize]: 3.29979e-07 [switch_simplify]: 7.77e-06 [partial_unused_args_eliminate]: 1.93002e-06 [add_recomputation]: 5.923e-05 [cse_after_recomputation]: 2.572e-05, [1] [Cycle 1]: 2.147e-05, [1] [cse]: 1.591e-05 [environ_conv]: 6.98998e-06 [swap_dp_allreduce_reducescatter]: 6.23e-06 [bias_add_comm_swap]: 2.57001e-06 [label_micro_interleaved_index]: 4.39998e-06 [label_fine_grained_interleaved_index]: 2.78e-06 [merge_cast_opt]: 1.37999e-06 [slice_recompute_activation]: 2.23002e-06 [micro_interleaved_order_control]: 2.11e-06 [assign_add_opt]: 1.33002e-06 [ForceFp32Comm]: 1.24e-06 [remove_cast_before_assign_add]: 1.02e-06 [full_micro_interleaved_order_control]: 1.99e-06 [reorder_send_recv_between_fp_bp]: 2.71999e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.09998e-06 [interleave_parallel_branches]: 1.37e-06 [overlap_opt_shard_in_pipeline]: 1.16002e-06 [overlap_opt_shard_grad_in_pipeline]: 1.67999e-06 [control_data_broadcast_order]: 1.468e-05 [grouped_pairwise_exchange_alltoall]: 1.57001e-06 [offloading_packed_experts]: 4.12e-06 [overlap_recompute_and_grad_model_parallel]: 5.49e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.24999e-06 [overlap_grad_ring_attention]: 4.80001e-06 [overlap_grad_flash_sp]: 2.297e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.26e-06 [split_layernorm_comm]: 1.55001e-06 [handle_group_info]: 1.03001e-06 [symbol_engine_optimizer]: 8.491e-05, [1] [Cycle 1]: 8.013e-05, [6] [build]: 2.99999e-06 [elim_shapecalc]: 1.201e-05 [elim_not_effective]: 1.542e-05 [opt_reshape]: 8.12998e-06 [fold_const_symbol]: 1.195e-05 [renormalize]: 2.10013e-07 [detach_backward]: 2.46e-06 [pipeline_parallel_scheduler]: 1.59e-06 [auto_monad_reorder]: 2.072e-05 [get_jit_bprop_graph]: 1.81e-06 [rewriter_after_jit_bprop_graph]: 4.97999e-06 [opt_after_jit_grad]: 0.00047393 [validate]: 4.421e-05 Sums bootstrap : 0.000460s : 4.45% type_inference : 0.005134s : 49.61% event_method : 0.000013s : 0.13% auto_monad : 0.000056s : 0.54% graph_reusing : 0.000005s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000030s : 0.29% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000021s : 0.21% optimize.rewriter_before_opt_a : 0.000073s : 0.71% optimize.opt_a.expand_dump_flag : 0.000006s : 0.05% optimize.opt_a.switch_simplify : 0.000035s : 0.34% optimize.opt_a.loop_unroll : 0.000023s : 0.22% optimize.opt_a.a_1 : 0.000537s : 5.19% optimize.opt_a.with_stream_mark : 0.000032s : 0.31% optimize.opt_a.recompute_prepare : 0.000019s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.10% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000191s : 1.85% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.17% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.16% optimize.opt_a.merge_send_recv : 0.000016s : 0.16% optimize.opt_a.auto_parallel : 0.000014s : 0.14% optimize.opt_a.parallel : 0.000025s : 0.24% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.19% optimize.opt_a.virtual_dataset : 0.000015s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.14% optimize.opt_a.virtual_output : 0.000014s : 0.14% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.26% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.07% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000022s : 0.21% optimize.opt_a.a_after_grad : 0.000022s : 0.21% optimize.opt_a.renormalize : 0.000760s : 7.35% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.27% optimize.opt_a.cse : 0.000059s : 0.57% optimize.opt_a.a_3 : 0.000104s : 1.01% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000045s : 0.43% optimize.convert_after_rewriter : 0.000008s : 0.08% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000608s : 5.88% optimize.opt_b.b_1 : 0.000146s : 1.42% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000026s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000029s : 0.28% optimize.loop_unroll : 0.000457s : 4.42% optimize.opt_after_cconv.c_1 : 0.000034s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.22% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.17% optimize.tuple_transform.d_1 : 0.000048s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000059s : 0.57% optimize.cse_after_recomputation.cse : 0.000016s : 0.15% optimize.environ_conv : 0.000007s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000023s : 0.22% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000021s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000474s : 4.58% validate : 0.000044s : 0.43% Time group info: ------[substitution.] 0.000161 28 14.23% : 0.000023s : 2: substitution.cast_eliminate 1.62% : 0.000003s : 3: substitution.elim_not_effective 1.03% : 0.000002s : 3: substitution.fold_const_symbol 4.01% : 0.000006s : 4: substitution.graph_param_transform 70.27% : 0.000113s : 2: substitution.inline 2.91% : 0.000005s : 6: substitution.j_node_and_user_rematch 4.02% : 0.000006s : 6: substitution.remove_not_recompute_node 1.92% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.005084 2 90.85% : 0.004619s : 1: type_inference.infer 9.15% : 0.000465s : 1: type_inference.specialize ------[replace.] 0.000024 2 100.00% : 0.000024s : 2: replace.inline ------[match.] 0.000112 2 100.00% : 0.000112s : 2: match.inline ------[predicate.] 0.000175 980 0.81% : 0.000001s : 9: predicate.accumulaten_eliminater 1.00% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.83% : 0.000001s : 8: predicate.addn_check_dump 0.94% : 0.000002s : 9: predicate.addn_zero_filter 0.69% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.48% : 0.000004s : 17: predicate.arithmetic_simplify 0.97% : 0.000002s : 9: predicate.cast_eliminate 0.66% : 0.000001s : 8: predicate.check_bprop_eliminate 0.65% : 0.000001s : 8: predicate.compare_switch_simplify 0.23% : 0.000000s : 4: predicate.const_output_eliminate 0.75% : 0.000001s : 8: predicate.depend_value_elim 0.83% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.94% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.82% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.11% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 4: predicate.elim_not_effective 0.50% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.05% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.07% : 0.000002s : 13: predicate.environ_get_depend_swap 1.78% : 0.000003s : 21: predicate.environ_get_eliminate 1.08% : 0.000002s : 13: predicate.environ_get_set_eliminate 1.03% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.80% : 0.000003s : 11: predicate.float_depend_g_call 0.74% : 0.000001s : 8: predicate.float_environ_get_switch 0.97% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 4: predicate.fold_const_symbol 0.73% : 0.000001s : 8: predicate.get_grad_eliminate 0.27% : 0.000000s : 4: predicate.graph_param_transform 0.77% : 0.000001s : 8: predicate.incorporate_call 0.61% : 0.000001s : 8: predicate.incorporate_call_switch 6.56% : 0.000011s : 44: predicate.inline 1.31% : 0.000002s : 8: predicate.inline_without_move 0.42% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.18% : 0.000002s : 8: predicate.less_batch_normalization 1.88% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.07% : 0.000004s : 26: predicate.load_eliminater 1.22% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.36% : 0.000002s : 16: predicate.loop_unroll_before_grad 1.67% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.69% : 0.000001s : 8: predicate.merge_addn 0.98% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.80% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.70% : 0.000001s : 9: predicate.minmaximum_grad 1.18% : 0.000002s : 4: predicate.mutable_eliminate 0.39% : 0.000001s : 4: predicate.opt_reshape 0.50% : 0.000001s : 4: predicate.parallel_virtual_node 1.40% : 0.000002s : 11: predicate.partial_defer_inline 1.27% : 0.000002s : 13: predicate.partial_eliminate 0.79% : 0.000001s : 9: predicate.print_const_string_wrapper 0.75% : 0.000001s : 8: predicate.reduce_all_const_elim 1.19% : 0.000002s : 9: predicate.reduce_eliminate 2.19% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.53% : 0.000001s : 8: predicate.remove_not_recompute_node 1.14% : 0.000002s : 17: predicate.replace_applicator 0.60% : 0.000001s : 8: predicate.replace_old_param 0.29% : 0.000001s : 4: predicate.reset_defer_inline 0.90% : 0.000002s : 9: predicate.reshape_eliminate 0.82% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 4: predicate.row_tensor_eliminate 0.88% : 0.000002s : 8: predicate.same_eliminate 0.65% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.93% : 0.000002s : 8: predicate.shard_identity_eliminate 0.97% : 0.000002s : 8: predicate.special_op_eliminate 1.07% : 0.000002s : 8: predicate.specialize_transform 1.14% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.05% : 0.000002s : 11: predicate.switch_defer_inline 1.71% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.39% : 0.000008s : 39: predicate.switch_simplify 0.85% : 0.000001s : 9: predicate.tile_eliminate 0.85% : 0.000001s : 9: predicate.transpose_eliminate 1.53% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.02% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.51% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.74% : 0.000005s : 25: predicate.tuple_list_set_item_eliminator 1.56% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 1.96% : 0.000003s : 26: predicate.updatestate_pure_node_eliminater 2.96% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 4: predicate.value_based_eliminate 0.82% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.81% : 0.000001s : 8: predicate.virtual_output_eliminate 0.36% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.51% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000250 5 7.59% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.41% : 0.000231s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024607 192 0.02% : 0.000004s : 1: ForceFp32Comm 12.98% : 0.003193s : 1: add_attr 12.93% : 0.003181s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.26% : 0.000063s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.25% : 0.000061s : 1: auto_monad 0.10% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 2.00% : 0.000492s : 1: bootstrap 0.13% : 0.000032s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000012s : 1: convert_after_rewriter 0.12% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.08% : 0.000019s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000008s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000005s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.89% : 0.000465s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.51% : 0.000617s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 4.10% : 0.001008s : 78: opt.transform.opt_a 0.13% : 0.000032s : 1: opt.transform.opt_after_cconv 0.11% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.50% : 0.000123s : 28: opt.transform.opt_b 0.22% : 0.000053s : 2: opt.transform.opt_trans_graph 0.18% : 0.000043s : 4: opt.transform.symbol_engine_opt 10.97% : 0.002699s : 1: opt_a 0.47% : 0.000117s : 1: opt_after_cconv 1.96% : 0.000483s : 1: opt_after_jit_grad 0.98% : 0.000242s : 1: opt_b 20.07% : 0.004940s : 1: optimize 0.08% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000010s : 1: order_py_execute_after_rewriter 0.11% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.14% : 0.000034s : 1: pre_auto_parallel 0.10% : 0.000025s : 1: py_interpret_to_execute 0.07% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000022s : 1: remove_dup_value 1.81% : 0.000446s : 1: renormalize.infer 1.24% : 0.000305s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000050s : 1: rewriter_after_opt_a 0.32% : 0.000078s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000006s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000088s : 1: symbol_engine_optimizer 0.34% : 0.000084s : 1: tuple_transform 20.94% : 0.005154s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:38.623.797 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:38.624.060 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0164147, [21] [bootstrap]: 0.00045041 [type_inference]: 0.00562153 [event_method]: 1.427e-05 [auto_monad]: 5.735e-05 [graph_reusing]: 5.34e-06 [inline]: 2.49999e-06 [add_attr]: 0.00335276, [1] [add_attr_with_inline]: 0.00334384, [1] [Cycle 1]: 7.632e-05, [2] [tag_attr]: 1.732e-05 [meta_addattr_fg_expand]: 3.78999e-06 [parallel-infer-symbol]: 3.78001e-06 [pre_auto_parallel]: 2.882e-05 [insert-virtual-dataset]: 2.31e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 1.81e-06 [pipeline_split]: 1.95001e-06 [optimize]: 0.00558002, [53] [py_interpret_to_execute]: 2.432e-05 [rewriter_before_opt_a]: 5.689e-05 [opt_a]: 0.0030437, [2] [Cycle 1]: 0.00211271, [45] [expand_dump_flag]: 3.03e-06 [switch_simplify]: 2.791e-05 [loop_unroll]: 1.616e-05 [a_1]: 0.00035995 [with_stream_mark]: 1.758e-05 [recompute_prepare]: 1.027e-05 [updatestate_depend_eliminate]: 4.68001e-06 [updatestate_assign_eliminate]: 4.29997e-06 [updatestate_loads_eliminate]: 4.22e-06 [parameter_eliminate]: 1.87999e-06 [a_2]: 0.00013006 [accelerated_algorithm]: 9.42001e-06 [shard]: 2.61e-06 [meta_shard_fg_expand]: 2.93e-06 [shard_inline]: 8.43999e-06 [merge_send_recv]: 1.062e-05 [auto_parallel]: 8e-06 [parallel]: 2.133e-05 [flash_sp]: 1.18e-05 [merge_comm]: 1.793e-05 [allreduce_fusion]: 4.89998e-06 [matmul_add_comm_reduction]: 1.218e-05 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 1.311e-05 [virtual_dataset]: 8.37e-06 [get_grad_eliminate_]: 8e-06 [virtual_output]: 7.78001e-06 [merge_forward]: 4.70999e-06 [cell_reuse_recompute_pass]: 1.35999e-06 [offload_activation]: 1.206e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.99e-05 [merge_recompute_call_nodes]: 1.66998e-06 [before_grad]: 1.333e-05 [set_forward_comm_id_for_comm_node_pass]: 4.87998e-06 [meta_fg_expand]: 3.51001e-06 [flash_sp_send_recv_attached]: 2.53003e-06 [receive_attached]: 2.16998e-06 [after_resolve]: 1.223e-05 [a_after_grad]: 1.168e-05 [renormalize]: 0.0007295 [add_forward_monad_depend]: 5.90002e-06 [auto_monad_grad]: 2.58e-06 [auto_monad_eliminator]: 1.774e-05 [cse]: 3.783e-05 [a_3]: 7.066e-05 [Cycle 2]: 0.00091636, [45] [expand_dump_flag]: 1.58002e-06 [switch_simplify]: 9.04e-06 [loop_unroll]: 7.93001e-06 [a_1]: 0.00015314 [with_stream_mark]: 1.833e-05 [recompute_prepare]: 8.1e-06 [updatestate_depend_eliminate]: 4.38999e-06 [updatestate_assign_eliminate]: 3.06999e-06 [updatestate_loads_eliminate]: 2.91999e-06 [parameter_eliminate]: 1.51998e-06 [a_2]: 0.00011408 [accelerated_algorithm]: 6.93998e-06 [shard]: 1.46002e-06 [meta_shard_fg_expand]: 2.04e-06 [shard_inline]: 7.42002e-06 [merge_send_recv]: 6.86001e-06 [auto_parallel]: 6.93e-06 [parallel]: 6.38e-06 [flash_sp]: 3.77998e-06 [merge_comm]: 4.75999e-06 [allreduce_fusion]: 3.97e-06 [matmul_add_comm_reduction]: 7.8e-06 [allreduce_slice_to_reducescatter]: 8.49977e-07 [virtual_shard_identity]: 7.83001e-06 [virtual_dataset]: 6.96001e-06 [get_grad_eliminate_]: 6.89001e-06 [virtual_output]: 6.69001e-06 [merge_forward]: 3.50998e-06 [cell_reuse_recompute_pass]: 1.60999e-06 [offload_activation]: 8.83001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.715e-05 [merge_recompute_call_nodes]: 1.03001e-06 [before_grad]: 1.212e-05 [set_forward_comm_id_for_comm_node_pass]: 5.28002e-06 [meta_fg_expand]: 2.90002e-06 [flash_sp_send_recv_attached]: 1.14e-06 [receive_attached]: 1.05001e-06 [after_resolve]: 1.118e-05 [a_after_grad]: 9.96e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.04e-06 [auto_monad_grad]: 1.45999e-06 [auto_monad_eliminator]: 9.57999e-06 [cse]: 2.186e-05 [a_3]: 5.696e-05 [py_interpret_to_execute_after_opt_a]: 1.748e-05 [slice_cell_reuse_recomputed_activation]: 4.57998e-06 [rewriter_after_opt_a]: 4.941e-05 [convert_after_rewriter]: 1.163e-05 [order_py_execute_after_rewriter]: 9.47001e-06 [mutable_eliminate]: 0.00060645 [opt_b]: 0.00031019, [1] [Cycle 1]: 0.0003004, [7] [b_1]: 0.00019141 [b_2]: 9.19e-06 [updatestate_depend_eliminate]: 7.74002e-06 [updatestate_assign_eliminate]: 2.98998e-06 [updatestate_loads_eliminate]: 3.14999e-06 [renormalize]: 5.59987e-07 [cse]: 2.77e-05 [optimize_parallel_all_gather_comm]: 2.233e-05 [overlap_param_gather]: 4.63999e-06 [cconv]: 3.354e-05 [loop_unroll]: 0.00046205 [opt_after_cconv]: 0.00014702, [1] [Cycle 1]: 0.00013737, [7] [c_1]: 3.48e-05 [parameter_eliminate]: 4.1e-06 [updatestate_depend_eliminate]: 6.54999e-06 [updatestate_assign_eliminate]: 3.05998e-06 [updatestate_loads_eliminate]: 3.6e-06 [cse]: 2.638e-05 [renormalize]: 5.60016e-07 [remove_dup_value]: 2.411e-05 [tuple_transform]: 0.0001049, [1] [Cycle 1]: 9.71e-05, [4] [d_1]: 5.369e-05 [none_parameter_eliminate]: 1.80001e-06 [renormalize]: 3.09985e-07 [switch_simplify]: 8.37e-06 [partial_unused_args_eliminate]: 4.91002e-06 [add_recomputation]: 6.206e-05 [cse_after_recomputation]: 3.456e-05, [1] [Cycle 1]: 2.725e-05, [1] [cse]: 1.726e-05 [environ_conv]: 1.161e-05 [swap_dp_allreduce_reducescatter]: 8.77999e-06 [bias_add_comm_swap]: 5.59e-06 [label_micro_interleaved_index]: 7.87e-06 [label_fine_grained_interleaved_index]: 5.02999e-06 [merge_cast_opt]: 3.71999e-06 [slice_recompute_activation]: 4.70001e-06 [micro_interleaved_order_control]: 4.80001e-06 [assign_add_opt]: 3.75998e-06 [ForceFp32Comm]: 3.18e-06 [remove_cast_before_assign_add]: 3.54002e-06 [full_micro_interleaved_order_control]: 4.80999e-06 [reorder_send_recv_between_fp_bp]: 4.94e-06 [comm_op_add_attrs]: 3.60998e-06 [add_comm_op_reuse_tag]: 3.26001e-06 [interleave_split_concat_branches]: 4.07998e-06 [interleave_parallel_branches]: 3.75e-06 [overlap_opt_shard_in_pipeline]: 3.66999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.57998e-06 [control_data_broadcast_order]: 1.956e-05 [grouped_pairwise_exchange_alltoall]: 3.88999e-06 [offloading_packed_experts]: 6.92002e-06 [overlap_recompute_and_grad_model_parallel]: 8.09002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.71999e-06 [overlap_recompute_allgather_and_fa_grad]: 4.07e-06 [overlap_recompute_comm]: 5.44e-06 [overlap_grad_ring_attention]: 7e-06 [overlap_grad_flash_sp]: 2.854e-05 [begin_end_overlap_inline]: 2.94999e-06 [split_matmul_comm_elemetwise]: 4.97e-06 [split_layernorm_comm]: 4.12998e-06 [handle_group_info]: 3.54002e-06 [symbol_engine_optimizer]: 0.00010657, [1] [Cycle 1]: 9.918e-05, [6] [build]: 3.39001e-06 [elim_shapecalc]: 1.21e-05 [elim_not_effective]: 1.541e-05 [opt_reshape]: 8.47998e-06 [fold_const_symbol]: 1.241e-05 [renormalize]: 2.30008e-07 [detach_backward]: 4.08001e-06 [pipeline_parallel_scheduler]: 1.60999e-06 [auto_monad_reorder]: 2.358e-05 [get_jit_bprop_graph]: 1.84e-06 [rewriter_after_jit_bprop_graph]: 5.87001e-06 [opt_after_jit_grad]: 0.00058811 [validate]: 4.891e-05 Sums bootstrap : 0.000450s : 4.02% type_inference : 0.005622s : 50.21% event_method : 0.000014s : 0.13% auto_monad : 0.000057s : 0.51% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000017s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.03% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000029s : 0.26% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000024s : 0.22% optimize.rewriter_before_opt_a : 0.000057s : 0.51% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000037s : 0.33% optimize.opt_a.loop_unroll : 0.000024s : 0.22% optimize.opt_a.a_1 : 0.000513s : 4.58% optimize.opt_a.with_stream_mark : 0.000036s : 0.32% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000244s : 2.18% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.15% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.14% optimize.opt_a.merge_send_recv : 0.000017s : 0.16% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000028s : 0.25% optimize.opt_a.flash_sp : 0.000016s : 0.14% optimize.opt_a.merge_comm : 0.000023s : 0.20% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.19% optimize.opt_a.virtual_dataset : 0.000015s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.13% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000023s : 0.21% optimize.opt_a.a_after_grad : 0.000022s : 0.19% optimize.opt_a.renormalize : 0.000730s : 6.52% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.24% optimize.opt_a.cse : 0.000060s : 0.53% optimize.opt_a.a_3 : 0.000128s : 1.14% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.16% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000049s : 0.44% optimize.convert_after_rewriter : 0.000012s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000606s : 5.42% optimize.opt_b.b_1 : 0.000191s : 1.71% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000028s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000034s : 0.30% optimize.loop_unroll : 0.000462s : 4.13% optimize.opt_after_cconv.c_1 : 0.000035s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000026s : 0.24% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000024s : 0.22% optimize.tuple_transform.d_1 : 0.000054s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000062s : 0.55% optimize.cse_after_recomputation.cse : 0.000017s : 0.15% optimize.environ_conv : 0.000012s : 0.10% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000020s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000029s : 0.25% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.21% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000588s : 5.25% validate : 0.000049s : 0.44% Time group info: ------[substitution.] 0.000156 28 14.59% : 0.000023s : 2: substitution.cast_eliminate 1.41% : 0.000002s : 3: substitution.elim_not_effective 1.18% : 0.000002s : 3: substitution.fold_const_symbol 4.75% : 0.000007s : 4: substitution.graph_param_transform 67.98% : 0.000106s : 2: substitution.inline 3.27% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.91% : 0.000006s : 6: substitution.remove_not_recompute_node 2.91% : 0.000005s : 2: substitution.replace_old_param ------[type_inference.] 0.005569 2 91.11% : 0.005074s : 1: type_inference.infer 8.89% : 0.000495s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000104 2 100.00% : 0.000104s : 2: match.inline ------[predicate.] 0.000173 980 0.81% : 0.000001s : 9: predicate.accumulaten_eliminater 1.11% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.69% : 0.000001s : 8: predicate.addn_check_dump 0.82% : 0.000001s : 9: predicate.addn_zero_filter 0.73% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.40% : 0.000004s : 17: predicate.arithmetic_simplify 0.89% : 0.000002s : 9: predicate.cast_eliminate 0.82% : 0.000001s : 8: predicate.check_bprop_eliminate 0.66% : 0.000001s : 8: predicate.compare_switch_simplify 0.22% : 0.000000s : 4: predicate.const_output_eliminate 0.81% : 0.000001s : 8: predicate.depend_value_elim 0.74% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.85% : 0.000001s : 9: predicate.dict_get_item_eliminator 0.86% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.37% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 4: predicate.elim_not_effective 0.63% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 13: predicate.environ_get_add_eliminate 0.97% : 0.000002s : 13: predicate.environ_get_depend_swap 1.83% : 0.000003s : 21: predicate.environ_get_eliminate 1.00% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.85% : 0.000001s : 11: predicate.exchange_switch_depend_value 1.84% : 0.000003s : 11: predicate.float_depend_g_call 0.66% : 0.000001s : 8: predicate.float_environ_get_switch 1.09% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 4: predicate.fold_const_symbol 0.89% : 0.000002s : 8: predicate.get_grad_eliminate 0.27% : 0.000000s : 4: predicate.graph_param_transform 0.81% : 0.000001s : 8: predicate.incorporate_call 0.65% : 0.000001s : 8: predicate.incorporate_call_switch 6.76% : 0.000012s : 44: predicate.inline 0.95% : 0.000002s : 8: predicate.inline_without_move 0.35% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.14% : 0.000002s : 8: predicate.less_batch_normalization 1.65% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.08% : 0.000004s : 26: predicate.load_eliminater 1.03% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.48% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.85% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.77% : 0.000001s : 8: predicate.merge_addn 0.73% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.74% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.69% : 0.000001s : 9: predicate.minmaximum_grad 1.59% : 0.000003s : 4: predicate.mutable_eliminate 0.46% : 0.000001s : 4: predicate.opt_reshape 0.54% : 0.000001s : 4: predicate.parallel_virtual_node 1.22% : 0.000002s : 11: predicate.partial_defer_inline 1.25% : 0.000002s : 13: predicate.partial_eliminate 0.81% : 0.000001s : 9: predicate.print_const_string_wrapper 0.89% : 0.000002s : 8: predicate.reduce_all_const_elim 1.17% : 0.000002s : 9: predicate.reduce_eliminate 2.17% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.55% : 0.000001s : 8: predicate.remove_not_recompute_node 1.13% : 0.000002s : 17: predicate.replace_applicator 0.61% : 0.000001s : 8: predicate.replace_old_param 0.41% : 0.000001s : 4: predicate.reset_defer_inline 0.85% : 0.000001s : 9: predicate.reshape_eliminate 0.84% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 4: predicate.row_tensor_eliminate 0.99% : 0.000002s : 8: predicate.same_eliminate 0.65% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.19% : 0.000002s : 8: predicate.shard_identity_eliminate 1.00% : 0.000002s : 8: predicate.special_op_eliminate 0.92% : 0.000002s : 8: predicate.specialize_transform 1.08% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.94% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.95% : 0.000002s : 11: predicate.switch_defer_inline 1.64% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.06% : 0.000007s : 39: predicate.switch_simplify 0.83% : 0.000001s : 9: predicate.tile_eliminate 0.87% : 0.000002s : 9: predicate.transpose_eliminate 1.67% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.52% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.53% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.58% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.00% : 0.000003s : 26: predicate.updatestate_pure_node_eliminater 2.95% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.48% : 0.000001s : 4: predicate.value_based_eliminate 0.81% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.74% : 0.000001s : 8: predicate.virtual_output_eliminate 0.33% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000248 5 7.98% : 0.000020s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.02% : 0.000229s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027257 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.34% : 0.003363s : 1: add_attr 12.28% : 0.003347s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.24% : 0.000066s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.24% : 0.000066s : 1: auto_monad 0.12% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.83% : 0.000499s : 1: bootstrap 0.14% : 0.000037s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000023s : 1: control_data_broadcast_order 0.06% : 0.000015s : 1: convert_after_rewriter 0.14% : 0.000038s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.08% : 0.000022s : 1: detach_backward 0.05% : 0.000015s : 1: environ_conv 0.09% : 0.000025s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000011s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.72% : 0.000469s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.25% : 0.000615s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000021s : 1: opt.transform.mutable_eliminate 3.64% : 0.000991s : 78: opt.transform.opt_a 0.12% : 0.000033s : 1: opt.transform.opt_after_cconv 0.11% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.47% : 0.000127s : 28: opt.transform.opt_b 0.22% : 0.000059s : 2: opt.transform.opt_trans_graph 0.16% : 0.000044s : 4: opt.transform.symbol_engine_opt 11.18% : 0.003047s : 1: opt_a 0.63% : 0.000173s : 1: opt_after_cconv 2.20% : 0.000599s : 1: opt_after_jit_grad 1.15% : 0.000314s : 1: opt_b 21.83% : 0.005949s : 1: optimize 0.09% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000013s : 1: order_py_execute_after_rewriter 0.12% : 0.000032s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000009s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000036s : 1: pre_auto_parallel 0.10% : 0.000028s : 1: py_interpret_to_execute 0.08% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.10% : 0.000029s : 1: remove_dup_value 1.56% : 0.000424s : 1: renormalize.infer 1.09% : 0.000297s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000053s : 1: rewriter_after_opt_a 0.22% : 0.000061s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000109s : 1: symbol_engine_optimizer 0.40% : 0.000108s : 1: tuple_transform 20.75% : 0.005655s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:38.826.447 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0151814, [21] [bootstrap]: 0.00045535 [type_inference]: 0.00553169 [event_method]: 1.275e-05 [auto_monad]: 5.958e-05 [graph_reusing]: 5.61e-06 [inline]: 2.14999e-06 [add_attr]: 0.00339111, [1] [add_attr_with_inline]: 0.00337934, [1] [Cycle 1]: 6.047e-05, [2] [tag_attr]: 1.711e-05 [meta_addattr_fg_expand]: 3.93001e-06 [parallel-infer-symbol]: 3.04001e-06 [pre_auto_parallel]: 2.917e-05 [insert-virtual-dataset]: 2.54999e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 3.55e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00489831, [53] [py_interpret_to_execute]: 1.974e-05 [rewriter_before_opt_a]: 5.308e-05 [opt_a]: 0.00259423, [2] [Cycle 1]: 0.00182398, [45] [expand_dump_flag]: 3.14001e-06 [switch_simplify]: 2.741e-05 [loop_unroll]: 1.543e-05 [a_1]: 0.00035914 [with_stream_mark]: 1.695e-05 [recompute_prepare]: 9.32001e-06 [updatestate_depend_eliminate]: 5.77999e-06 [updatestate_assign_eliminate]: 3.99002e-06 [updatestate_loads_eliminate]: 3.83001e-06 [parameter_eliminate]: 1.82001e-06 [a_2]: 0.00011075 [accelerated_algorithm]: 8.14002e-06 [shard]: 2.19999e-06 [meta_shard_fg_expand]: 2.12001e-06 [shard_inline]: 7.41999e-06 [merge_send_recv]: 1.036e-05 [auto_parallel]: 7.53e-06 [parallel]: 1.802e-05 [flash_sp]: 8.03001e-06 [merge_comm]: 5.30999e-06 [allreduce_fusion]: 4.40999e-06 [matmul_add_comm_reduction]: 1.042e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 9.11002e-06 [virtual_dataset]: 7.71999e-06 [get_grad_eliminate_]: 6.66999e-06 [virtual_output]: 7.83001e-06 [merge_forward]: 4.53001e-06 [cell_reuse_recompute_pass]: 1.37999e-06 [offload_activation]: 1.098e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.605e-05 [merge_recompute_call_nodes]: 1.69e-06 [before_grad]: 1.344e-05 [set_forward_comm_id_for_comm_node_pass]: 4.66002e-06 [meta_fg_expand]: 3.04999e-06 [flash_sp_send_recv_attached]: 2.71e-06 [receive_attached]: 2.55997e-06 [after_resolve]: 1.148e-05 [a_after_grad]: 1.131e-05 [renormalize]: 0.00069804 [add_forward_monad_depend]: 5.89999e-06 [auto_monad_grad]: 2.68998e-06 [auto_monad_eliminator]: 1.594e-05 [cse]: 3.886e-05 [a_3]: 6.029e-05 [Cycle 2]: 0.0007594, [45] [expand_dump_flag]: 1.01997e-06 [switch_simplify]: 9.87999e-06 [loop_unroll]: 8.05999e-06 [a_1]: 0.00015503 [with_stream_mark]: 1.364e-05 [recompute_prepare]: 8e-06 [updatestate_depend_eliminate]: 4.4e-06 [updatestate_assign_eliminate]: 3.65e-06 [updatestate_loads_eliminate]: 2.79001e-06 [parameter_eliminate]: 1.32e-06 [a_2]: 9.502e-05 [accelerated_algorithm]: 7.40998e-06 [shard]: 1.44e-06 [meta_shard_fg_expand]: 1.59e-06 [shard_inline]: 7.66001e-06 [merge_send_recv]: 6.26998e-06 [auto_parallel]: 7.28e-06 [parallel]: 5.42001e-06 [flash_sp]: 3.81001e-06 [merge_comm]: 4.60999e-06 [allreduce_fusion]: 4.45e-06 [matmul_add_comm_reduction]: 8.13999e-06 [allreduce_slice_to_reducescatter]: 5.29981e-07 [virtual_shard_identity]: 8.02e-06 [virtual_dataset]: 6.64999e-06 [get_grad_eliminate_]: 6.36e-06 [virtual_output]: 7.65e-06 [merge_forward]: 3.6e-06 [cell_reuse_recompute_pass]: 2.31e-06 [offload_activation]: 8.05e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.539e-05 [merge_recompute_call_nodes]: 1.12e-06 [before_grad]: 1.177e-05 [set_forward_comm_id_for_comm_node_pass]: 5.18002e-06 [meta_fg_expand]: 2.83e-06 [flash_sp_send_recv_attached]: 1.08001e-06 [receive_attached]: 1.15999e-06 [after_resolve]: 1.022e-05 [a_after_grad]: 1.088e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.45999e-06 [auto_monad_grad]: 1.34e-06 [auto_monad_eliminator]: 8.74e-06 [cse]: 2.004e-05 [a_3]: 4.618e-05 [py_interpret_to_execute_after_opt_a]: 1.21e-05 [slice_cell_reuse_recomputed_activation]: 2.36e-06 [rewriter_after_opt_a]: 4.297e-05 [convert_after_rewriter]: 7.82e-06 [order_py_execute_after_rewriter]: 6.31e-06 [mutable_eliminate]: 0.00061345 [opt_b]: 0.00024289, [1] [Cycle 1]: 0.00023631, [7] [b_1]: 0.0001491 [b_2]: 8.59002e-06 [updatestate_depend_eliminate]: 6.68e-06 [updatestate_assign_eliminate]: 3.3e-06 [updatestate_loads_eliminate]: 3.5e-06 [renormalize]: 5.50004e-07 [cse]: 2.635e-05 [optimize_parallel_all_gather_comm]: 1.912e-05 [overlap_param_gather]: 1.89e-06 [cconv]: 2.849e-05 [loop_unroll]: 0.00046032 [opt_after_cconv]: 0.00012211, [1] [Cycle 1]: 0.00011582, [7] [c_1]: 3.597e-05 [parameter_eliminate]: 4.17e-06 [updatestate_depend_eliminate]: 6.19999e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 3.06001e-06 [cse]: 2.572e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.68e-05 [tuple_transform]: 8.451e-05, [1] [Cycle 1]: 7.959e-05, [4] [d_1]: 5.022e-05 [none_parameter_eliminate]: 1.59998e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 7.85e-06 [partial_unused_args_eliminate]: 2.26e-06 [add_recomputation]: 6.046e-05 [cse_after_recomputation]: 3.137e-05, [1] [Cycle 1]: 2.622e-05, [1] [cse]: 1.789e-05 [environ_conv]: 6.63e-06 [swap_dp_allreduce_reducescatter]: 6.22001e-06 [bias_add_comm_swap]: 2.75997e-06 [label_micro_interleaved_index]: 4.50999e-06 [label_fine_grained_interleaved_index]: 2.63e-06 [merge_cast_opt]: 1.36002e-06 [slice_recompute_activation]: 2.06e-06 [micro_interleaved_order_control]: 2.05002e-06 [assign_add_opt]: 1.38002e-06 [ForceFp32Comm]: 8.10018e-07 [remove_cast_before_assign_add]: 1.35001e-06 [full_micro_interleaved_order_control]: 1.99999e-06 [reorder_send_recv_between_fp_bp]: 2.84999e-06 [comm_op_add_attrs]: 1.02998e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.39998e-06 [interleave_parallel_branches]: 1.00999e-06 [overlap_opt_shard_in_pipeline]: 1.44998e-06 [overlap_opt_shard_grad_in_pipeline]: 2.64999e-06 [control_data_broadcast_order]: 1.539e-05 [grouped_pairwise_exchange_alltoall]: 1.60001e-06 [offloading_packed_experts]: 4.47e-06 [overlap_recompute_and_grad_model_parallel]: 6.25002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39998e-06 [overlap_recompute_comm]: 2.29999e-06 [overlap_grad_ring_attention]: 4.89e-06 [overlap_grad_flash_sp]: 2.574e-05 [begin_end_overlap_inline]: 5.79981e-07 [split_matmul_comm_elemetwise]: 2.61e-06 [split_layernorm_comm]: 1.96998e-06 [handle_group_info]: 1.09998e-06 [symbol_engine_optimizer]: 9.801e-05, [1] [Cycle 1]: 9.304e-05, [6] [build]: 3.46999e-06 [elim_shapecalc]: 1.671e-05 [elim_not_effective]: 1.708e-05 [opt_reshape]: 8.57e-06 [fold_const_symbol]: 1.298e-05 [renormalize]: 2.3999e-07 [detach_backward]: 2.07999e-06 [pipeline_parallel_scheduler]: 1.77999e-06 [auto_monad_reorder]: 2.229e-05 [get_jit_bprop_graph]: 1.92999e-06 [rewriter_after_jit_bprop_graph]: 4.58999e-06 [opt_after_jit_grad]: 0.00054468 [validate]: 4.842e-05 Sums bootstrap : 0.000455s : 4.24% type_inference : 0.005532s : 51.46% event_method : 0.000013s : 0.12% auto_monad : 0.000060s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000017s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000029s : 0.27% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000004s : 0.03% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000020s : 0.18% optimize.rewriter_before_opt_a : 0.000053s : 0.49% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000037s : 0.35% optimize.opt_a.loop_unroll : 0.000023s : 0.22% optimize.opt_a.a_1 : 0.000514s : 4.78% optimize.opt_a.with_stream_mark : 0.000031s : 0.28% optimize.opt_a.recompute_prepare : 0.000017s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000206s : 1.91% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.14% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.14% optimize.opt_a.merge_send_recv : 0.000017s : 0.15% optimize.opt_a.auto_parallel : 0.000015s : 0.14% optimize.opt_a.parallel : 0.000023s : 0.22% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.16% optimize.opt_a.virtual_dataset : 0.000014s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000025s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000022s : 0.20% optimize.opt_a.a_after_grad : 0.000022s : 0.21% optimize.opt_a.renormalize : 0.000698s : 6.49% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.23% optimize.opt_a.cse : 0.000059s : 0.55% optimize.opt_a.a_3 : 0.000106s : 0.99% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000043s : 0.40% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000613s : 5.71% optimize.opt_b.b_1 : 0.000149s : 1.39% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000026s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000028s : 0.27% optimize.loop_unroll : 0.000460s : 4.28% optimize.opt_after_cconv.c_1 : 0.000036s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000026s : 0.24% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.16% optimize.tuple_transform.d_1 : 0.000050s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000060s : 0.56% optimize.cse_after_recomputation.cse : 0.000018s : 0.17% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000026s : 0.24% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.16% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.16% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000022s : 0.21% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000545s : 5.07% validate : 0.000048s : 0.45% Time group info: ------[substitution.] 0.000155 28 13.76% : 0.000021s : 2: substitution.cast_eliminate 1.45% : 0.000002s : 3: substitution.elim_not_effective 1.29% : 0.000002s : 3: substitution.fold_const_symbol 4.41% : 0.000007s : 4: substitution.graph_param_transform 69.28% : 0.000108s : 2: substitution.inline 3.57% : 0.000006s : 6: substitution.j_node_and_user_rematch 4.38% : 0.000007s : 6: substitution.remove_not_recompute_node 1.85% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.005481 2 91.08% : 0.004993s : 1: type_inference.infer 8.92% : 0.000489s : 1: type_inference.specialize ------[replace.] 0.000024 2 100.00% : 0.000024s : 2: replace.inline ------[match.] 0.000106 2 100.00% : 0.000106s : 2: match.inline ------[predicate.] 0.000176 980 0.83% : 0.000001s : 9: predicate.accumulaten_eliminater 1.18% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.68% : 0.000001s : 8: predicate.addn_check_dump 0.81% : 0.000001s : 9: predicate.addn_zero_filter 0.63% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.33% : 0.000004s : 17: predicate.arithmetic_simplify 1.02% : 0.000002s : 9: predicate.cast_eliminate 1.10% : 0.000002s : 8: predicate.check_bprop_eliminate 0.71% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000000s : 4: predicate.const_output_eliminate 0.70% : 0.000001s : 8: predicate.depend_value_elim 0.77% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.89% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 9: predicate.dict_set_item_eliminator 1.31% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.36% : 0.000001s : 4: predicate.elim_not_effective 0.85% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.10% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 13: predicate.environ_get_depend_swap 1.83% : 0.000003s : 21: predicate.environ_get_eliminate 0.99% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.89% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.87% : 0.000003s : 11: predicate.float_depend_g_call 0.73% : 0.000001s : 8: predicate.float_environ_get_switch 1.12% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.29% : 0.000001s : 4: predicate.fold_const_symbol 0.74% : 0.000001s : 8: predicate.get_grad_eliminate 0.24% : 0.000000s : 4: predicate.graph_param_transform 0.81% : 0.000001s : 8: predicate.incorporate_call 0.64% : 0.000001s : 8: predicate.incorporate_call_switch 6.37% : 0.000011s : 44: predicate.inline 1.10% : 0.000002s : 8: predicate.inline_without_move 0.35% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.12% : 0.000002s : 8: predicate.less_batch_normalization 1.63% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.13% : 0.000004s : 26: predicate.load_eliminater 1.19% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.48% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.72% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.74% : 0.000001s : 8: predicate.merge_addn 0.81% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.77% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.95% : 0.000002s : 9: predicate.minmaximum_grad 1.34% : 0.000002s : 4: predicate.mutable_eliminate 0.46% : 0.000001s : 4: predicate.opt_reshape 0.48% : 0.000001s : 4: predicate.parallel_virtual_node 1.19% : 0.000002s : 11: predicate.partial_defer_inline 1.26% : 0.000002s : 13: predicate.partial_eliminate 0.93% : 0.000002s : 9: predicate.print_const_string_wrapper 0.70% : 0.000001s : 8: predicate.reduce_all_const_elim 0.97% : 0.000002s : 9: predicate.reduce_eliminate 2.13% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 8: predicate.remove_not_recompute_node 1.09% : 0.000002s : 17: predicate.replace_applicator 0.70% : 0.000001s : 8: predicate.replace_old_param 0.29% : 0.000001s : 4: predicate.reset_defer_inline 0.77% : 0.000001s : 9: predicate.reshape_eliminate 0.96% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 4: predicate.row_tensor_eliminate 0.95% : 0.000002s : 8: predicate.same_eliminate 0.51% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.02% : 0.000002s : 8: predicate.shard_identity_eliminate 0.94% : 0.000002s : 8: predicate.special_op_eliminate 0.99% : 0.000002s : 8: predicate.specialize_transform 1.14% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.94% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.06% : 0.000002s : 11: predicate.switch_defer_inline 1.70% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.14% : 0.000007s : 39: predicate.switch_simplify 0.77% : 0.000001s : 9: predicate.tile_eliminate 0.83% : 0.000001s : 9: predicate.transpose_eliminate 1.50% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 2.91% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.53% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.53% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.67% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.09% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.81% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.69% : 0.000001s : 4: predicate.value_based_eliminate 0.78% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.84% : 0.000001s : 8: predicate.virtual_output_eliminate 0.35% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.55% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000236 5 8.13% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.87% : 0.000217s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025342 192 0.01% : 0.000004s : 1: ForceFp32Comm 13.40% : 0.003397s : 1: add_attr 13.35% : 0.003384s : 1: add_attr_with_inline 0.15% : 0.000039s : 1: add_comm_op_reuse_tag 0.26% : 0.000065s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.25% : 0.000065s : 1: auto_monad 0.10% : 0.000026s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.91% : 0.000483s : 1: bootstrap 0.13% : 0.000032s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.14% : 0.000034s : 1: cse_after_recomputation 0.03% : 0.000006s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.08% : 0.000019s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000005s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.85% : 0.000468s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.46% : 0.000623s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000019s : 1: opt.transform.mutable_eliminate 3.88% : 0.000983s : 78: opt.transform.opt_a 0.14% : 0.000035s : 1: opt.transform.opt_after_cconv 0.13% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.49% : 0.000125s : 28: opt.transform.opt_b 0.22% : 0.000055s : 2: opt.transform.opt_trans_graph 0.20% : 0.000050s : 4: opt.transform.symbol_engine_opt 10.25% : 0.002598s : 1: opt_a 0.50% : 0.000126s : 1: opt_after_cconv 2.19% : 0.000555s : 1: opt_after_jit_grad 0.97% : 0.000247s : 1: opt_b 19.35% : 0.004904s : 1: optimize 0.09% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000010s : 1: order_py_execute_after_rewriter 0.12% : 0.000030s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000034s : 1: pre_auto_parallel 0.09% : 0.000024s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000021s : 1: remove_dup_value 1.61% : 0.000407s : 1: renormalize.infer 1.12% : 0.000283s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000047s : 1: rewriter_after_opt_a 0.22% : 0.000057s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000101s : 1: symbol_engine_optimizer 0.35% : 0.000088s : 1: tuple_transform 21.91% : 0.005552s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:39.324.42 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:39.327.17 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0160001, [21] [bootstrap]: 0.00045088 [type_inference]: 0.00554218 [event_method]: 1.311e-05 [auto_monad]: 5.486e-05 [graph_reusing]: 5.71003e-06 [inline]: 2.86999e-06 [add_attr]: 0.00344443, [1] [add_attr_with_inline]: 0.00343476, [1] [Cycle 1]: 7.718e-05, [2] [tag_attr]: 1.568e-05 [meta_addattr_fg_expand]: 3.83001e-06 [parallel-infer-symbol]: 4.03999e-06 [pre_auto_parallel]: 2.9e-05 [insert-virtual-dataset]: 2.45002e-06 [parallel-infer-symbol-second]: 8.39995e-07 [dataset_repeat_opt]: 1.71e-06 [pipeline_split]: 1.63002e-06 [optimize]: 0.00518566, [53] [py_interpret_to_execute]: 2.351e-05 [rewriter_before_opt_a]: 5.504e-05 [opt_a]: 0.00270442, [2] [Cycle 1]: 0.00184399, [45] [expand_dump_flag]: 3.6e-06 [switch_simplify]: 2.672e-05 [loop_unroll]: 1.432e-05 [a_1]: 0.00030603 [with_stream_mark]: 1.976e-05 [recompute_prepare]: 8.92e-06 [updatestate_depend_eliminate]: 3.89002e-06 [updatestate_assign_eliminate]: 3.60998e-06 [updatestate_loads_eliminate]: 3.09999e-06 [parameter_eliminate]: 2.05002e-06 [a_2]: 0.00010946 [accelerated_algorithm]: 8.27e-06 [shard]: 2.19001e-06 [meta_shard_fg_expand]: 2.18002e-06 [shard_inline]: 6.64999e-06 [merge_send_recv]: 9.76003e-06 [auto_parallel]: 7.7e-06 [parallel]: 1.956e-05 [flash_sp]: 8.22e-06 [merge_comm]: 4.47e-06 [allreduce_fusion]: 3.68e-06 [matmul_add_comm_reduction]: 1.061e-05 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 8.84e-06 [virtual_dataset]: 7.11999e-06 [get_grad_eliminate_]: 6.48e-06 [virtual_output]: 6.24001e-06 [merge_forward]: 4.42998e-06 [cell_reuse_recompute_pass]: 1.54998e-06 [offload_activation]: 1.077e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.53e-05 [merge_recompute_call_nodes]: 1.71e-06 [before_grad]: 1.14e-05 [set_forward_comm_id_for_comm_node_pass]: 4.22e-06 [meta_fg_expand]: 2.76e-06 [flash_sp_send_recv_attached]: 2.49001e-06 [receive_attached]: 2.70002e-06 [after_resolve]: 1.057e-05 [a_after_grad]: 8.87e-06 [renormalize]: 0.00064399 [add_forward_monad_depend]: 6.12999e-06 [auto_monad_grad]: 2.68e-06 [auto_monad_eliminator]: 1.559e-05 [cse]: 3.25e-05 [a_3]: 6.301e-05 [Cycle 2]: 0.00084602, [45] [expand_dump_flag]: 1.37999e-06 [switch_simplify]: 7.38999e-06 [loop_unroll]: 6.28998e-06 [a_1]: 0.00011172 [with_stream_mark]: 1.675e-05 [recompute_prepare]: 6.78e-06 [updatestate_depend_eliminate]: 3.67002e-06 [updatestate_assign_eliminate]: 2.49999e-06 [updatestate_loads_eliminate]: 2.60002e-06 [parameter_eliminate]: 1.35001e-06 [a_2]: 0.00010533 [accelerated_algorithm]: 6.34001e-06 [shard]: 1.76e-06 [meta_shard_fg_expand]: 1.94e-06 [shard_inline]: 6.84001e-06 [merge_send_recv]: 5.70001e-06 [auto_parallel]: 6.36e-06 [parallel]: 6.21e-06 [flash_sp]: 3.59002e-06 [merge_comm]: 3.91001e-06 [allreduce_fusion]: 3.58e-06 [matmul_add_comm_reduction]: 6.50997e-06 [allreduce_slice_to_reducescatter]: 5.60016e-07 [virtual_shard_identity]: 7.15998e-06 [virtual_dataset]: 5.84e-06 [get_grad_eliminate_]: 5.67001e-06 [virtual_output]: 5.81e-06 [merge_forward]: 3.29001e-06 [cell_reuse_recompute_pass]: 1.84e-06 [offload_activation]: 7.04001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.749e-05 [merge_recompute_call_nodes]: 1.19e-06 [before_grad]: 1.142e-05 [set_forward_comm_id_for_comm_node_pass]: 3.98001e-06 [meta_fg_expand]: 2.69001e-06 [flash_sp_send_recv_attached]: 9.20001e-07 [receive_attached]: 1.56002e-06 [after_resolve]: 1.063e-05 [a_after_grad]: 9.02e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.45001e-06 [auto_monad_grad]: 1.44e-06 [auto_monad_eliminator]: 9.10001e-06 [cse]: 1.735e-05 [a_3]: 5.023e-05 [py_interpret_to_execute_after_opt_a]: 1.478e-05 [slice_cell_reuse_recomputed_activation]: 5.17999e-06 [rewriter_after_opt_a]: 4.31e-05 [convert_after_rewriter]: 1.015e-05 [order_py_execute_after_rewriter]: 8.82e-06 [mutable_eliminate]: 0.00065911 [opt_b]: 0.00028224, [1] [Cycle 1]: 0.00027185, [7] [b_1]: 0.00016697 [b_2]: 8.18999e-06 [updatestate_depend_eliminate]: 6.83e-06 [updatestate_assign_eliminate]: 2.56998e-06 [updatestate_loads_eliminate]: 2.39999e-06 [renormalize]: 4.30009e-07 [cse]: 2.43e-05 [optimize_parallel_all_gather_comm]: 2.088e-05 [overlap_param_gather]: 5.09e-06 [cconv]: 3.203e-05 [loop_unroll]: 0.00049919 [opt_after_cconv]: 0.00012988, [1] [Cycle 1]: 0.00012026, [7] [c_1]: 2.843e-05 [parameter_eliminate]: 4.43001e-06 [updatestate_depend_eliminate]: 5.67999e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.69001e-06 [cse]: 2.016e-05 [renormalize]: 5.59987e-07 [remove_dup_value]: 1.869e-05 [tuple_transform]: 8.951e-05, [1] [Cycle 1]: 8.215e-05, [4] [d_1]: 4.145e-05 [none_parameter_eliminate]: 2.07001e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 7.15e-06 [partial_unused_args_eliminate]: 4.72998e-06 [add_recomputation]: 5.009e-05 [cse_after_recomputation]: 2.832e-05, [1] [Cycle 1]: 2.112e-05, [1] [cse]: 1.158e-05 [environ_conv]: 8.72998e-06 [swap_dp_allreduce_reducescatter]: 7.70998e-06 [bias_add_comm_swap]: 5.80002e-06 [label_micro_interleaved_index]: 7.58001e-06 [label_fine_grained_interleaved_index]: 5.71003e-06 [merge_cast_opt]: 4.25e-06 [slice_recompute_activation]: 4.91002e-06 [micro_interleaved_order_control]: 4.62e-06 [assign_add_opt]: 3.67998e-06 [ForceFp32Comm]: 3.21999e-06 [remove_cast_before_assign_add]: 3.75e-06 [full_micro_interleaved_order_control]: 4.80001e-06 [reorder_send_recv_between_fp_bp]: 5.40999e-06 [comm_op_add_attrs]: 3.81999e-06 [add_comm_op_reuse_tag]: 3.24001e-06 [interleave_split_concat_branches]: 3.44001e-06 [interleave_parallel_branches]: 3.75e-06 [overlap_opt_shard_in_pipeline]: 3.8e-06 [overlap_opt_shard_grad_in_pipeline]: 4.40999e-06 [control_data_broadcast_order]: 1.736e-05 [grouped_pairwise_exchange_alltoall]: 4.09002e-06 [offloading_packed_experts]: 6.50002e-06 [overlap_recompute_and_grad_model_parallel]: 7.39002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.49001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.68e-06 [overlap_recompute_comm]: 5.15001e-06 [overlap_grad_ring_attention]: 6.83e-06 [overlap_grad_flash_sp]: 2.323e-05 [begin_end_overlap_inline]: 3.08998e-06 [split_matmul_comm_elemetwise]: 4.37e-06 [split_layernorm_comm]: 4.40999e-06 [handle_group_info]: 3.14999e-06 [symbol_engine_optimizer]: 0.00010083, [1] [Cycle 1]: 9.356e-05, [6] [build]: 3.16999e-06 [elim_shapecalc]: 1.042e-05 [elim_not_effective]: 1.36e-05 [opt_reshape]: 7.06001e-06 [fold_const_symbol]: 1.024e-05 [renormalize]: 2.50002e-07 [detach_backward]: 4.15999e-06 [pipeline_parallel_scheduler]: 1.94e-06 [auto_monad_reorder]: 2.031e-05 [get_jit_bprop_graph]: 1.70001e-06 [rewriter_after_jit_bprop_graph]: 6.02001e-06 [opt_after_jit_grad]: 0.00054307 [validate]: 4.225e-05 Sums bootstrap : 0.000451s : 4.21% type_inference : 0.005542s : 51.72% event_method : 0.000013s : 0.12% auto_monad : 0.000055s : 0.51% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000029s : 0.27% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000024s : 0.22% optimize.rewriter_before_opt_a : 0.000055s : 0.51% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000034s : 0.32% optimize.opt_a.loop_unroll : 0.000021s : 0.19% optimize.opt_a.a_1 : 0.000418s : 3.90% optimize.opt_a.with_stream_mark : 0.000037s : 0.34% optimize.opt_a.recompute_prepare : 0.000016s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000215s : 2.00% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.14% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000015s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.13% optimize.opt_a.parallel : 0.000026s : 0.24% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.15% optimize.opt_a.virtual_dataset : 0.000013s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000023s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000021s : 0.20% optimize.opt_a.a_after_grad : 0.000018s : 0.17% optimize.opt_a.renormalize : 0.000644s : 6.01% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.23% optimize.opt_a.cse : 0.000050s : 0.47% optimize.opt_a.a_3 : 0.000113s : 1.06% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000043s : 0.40% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000659s : 6.15% optimize.opt_b.b_1 : 0.000167s : 1.56% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000032s : 0.30% optimize.loop_unroll : 0.000499s : 4.66% optimize.opt_after_cconv.c_1 : 0.000028s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000020s : 0.19% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000019s : 0.17% optimize.tuple_transform.d_1 : 0.000041s : 0.39% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000050s : 0.47% optimize.cse_after_recomputation.cse : 0.000012s : 0.11% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000023s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.06% opt_after_jit_grad : 0.000543s : 5.07% validate : 0.000042s : 0.39% Time group info: ------[substitution.] 0.000121 19 1.63% : 0.000002s : 2: substitution.elim_not_effective 1.26% : 0.000002s : 2: substitution.fold_const_symbol 4.87% : 0.000006s : 3: substitution.graph_param_transform 81.51% : 0.000099s : 2: substitution.inline 3.53% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.64% : 0.000004s : 4: substitution.remove_not_recompute_node 3.56% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.005488 2 91.45% : 0.005018s : 1: type_inference.infer 8.55% : 0.000469s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000097 2 100.00% : 0.000097s : 2: match.inline ------[predicate.] 0.000146 754 0.89% : 0.000001s : 7: predicate.accumulaten_eliminater 0.96% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.64% : 0.000001s : 6: predicate.addn_check_dump 0.88% : 0.000001s : 7: predicate.addn_zero_filter 0.73% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.52% : 0.000004s : 13: predicate.arithmetic_simplify 0.79% : 0.000001s : 7: predicate.cast_eliminate 0.88% : 0.000001s : 6: predicate.check_bprop_eliminate 0.73% : 0.000001s : 6: predicate.compare_switch_simplify 0.19% : 0.000000s : 3: predicate.const_output_eliminate 0.77% : 0.000001s : 6: predicate.depend_value_elim 0.77% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.92% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.78% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.35% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 3: predicate.elim_not_effective 0.53% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.47% : 0.000002s : 10: predicate.environ_add_const_eliminate 0.96% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.12% : 0.000002s : 10: predicate.environ_get_depend_swap 1.75% : 0.000003s : 16: predicate.environ_get_eliminate 1.09% : 0.000002s : 10: predicate.environ_get_set_eliminate 0.98% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.04% : 0.000003s : 9: predicate.float_depend_g_call 0.64% : 0.000001s : 6: predicate.float_environ_get_switch 1.07% : 0.000002s : 9: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 3: predicate.fold_const_symbol 0.83% : 0.000001s : 6: predicate.get_grad_eliminate 0.34% : 0.000000s : 3: predicate.graph_param_transform 0.70% : 0.000001s : 6: predicate.incorporate_call 0.66% : 0.000001s : 6: predicate.incorporate_call_switch 6.44% : 0.000009s : 34: predicate.inline 1.05% : 0.000002s : 6: predicate.inline_without_move 0.40% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.23% : 0.000002s : 6: predicate.less_batch_normalization 1.68% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.14% : 0.000003s : 20: predicate.load_eliminater 1.38% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.77% : 0.000003s : 14: predicate.loop_unroll_before_grad 1.85% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.68% : 0.000001s : 6: predicate.merge_addn 0.70% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.68% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 7: predicate.minmaximum_grad 1.58% : 0.000002s : 3: predicate.mutable_eliminate 0.46% : 0.000001s : 3: predicate.opt_reshape 0.48% : 0.000001s : 3: predicate.parallel_virtual_node 1.39% : 0.000002s : 9: predicate.partial_defer_inline 1.17% : 0.000002s : 10: predicate.partial_eliminate 0.99% : 0.000001s : 7: predicate.print_const_string_wrapper 0.86% : 0.000001s : 6: predicate.reduce_all_const_elim 1.21% : 0.000002s : 7: predicate.reduce_eliminate 2.15% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.74% : 0.000001s : 6: predicate.remove_not_recompute_node 1.03% : 0.000002s : 13: predicate.replace_applicator 0.58% : 0.000001s : 6: predicate.replace_old_param 0.33% : 0.000000s : 3: predicate.reset_defer_inline 0.95% : 0.000001s : 7: predicate.reshape_eliminate 0.75% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 3: predicate.row_tensor_eliminate 1.07% : 0.000002s : 6: predicate.same_eliminate 0.47% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.92% : 0.000001s : 6: predicate.shard_identity_eliminate 0.92% : 0.000001s : 6: predicate.special_op_eliminate 1.05% : 0.000002s : 6: predicate.specialize_transform 0.99% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.97% : 0.000001s : 9: predicate.switch_defer_inline 1.57% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.04% : 0.000006s : 32: predicate.switch_simplify 0.83% : 0.000001s : 7: predicate.tile_eliminate 0.74% : 0.000001s : 7: predicate.transpose_eliminate 1.37% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.46% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.30% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.01% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.28% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.41% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.59% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.92% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.78% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.81% : 0.000001s : 3: predicate.value_based_eliminate 1.30% : 0.000002s : 6: predicate.virtual_dataset_eliminate 0.74% : 0.000001s : 6: predicate.virtual_output_eliminate 0.35% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.64% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000238 5 8.19% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.81% : 0.000218s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026214 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.18% : 0.003455s : 1: add_attr 13.12% : 0.003439s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000054s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.24% : 0.000064s : 1: auto_monad 0.10% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.90% : 0.000499s : 1: bootstrap 0.13% : 0.000035s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000031s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000022s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.09% : 0.000023s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.93% : 0.000506s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.54% : 0.000666s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 3.08% : 0.000808s : 78: opt.transform.opt_a 0.10% : 0.000027s : 1: opt.transform.opt_after_cconv 0.10% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.38% : 0.000099s : 28: opt.transform.opt_b 0.18% : 0.000046s : 2: opt.transform.opt_trans_graph 0.14% : 0.000038s : 4: opt.transform.symbol_engine_opt 10.33% : 0.002708s : 1: opt_a 0.51% : 0.000133s : 1: opt_after_cconv 2.11% : 0.000554s : 1: opt_after_jit_grad 1.09% : 0.000286s : 1: opt_b 21.26% : 0.005573s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000011s : 1: parallel-infer-symbol 0.03% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000037s : 1: pre_auto_parallel 0.10% : 0.000027s : 1: py_interpret_to_execute 0.07% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.37% : 0.000358s : 1: renormalize.infer 1.06% : 0.000278s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.05% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000047s : 1: rewriter_after_opt_a 0.22% : 0.000059s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000104s : 1: symbol_engine_optimizer 0.35% : 0.000093s : 1: tuple_transform 21.28% : 0.005577s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:39.233.485 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0135768, [21] [bootstrap]: 0.00042985 [type_inference]: 0.00523688 [event_method]: 1.164e-05 [auto_monad]: 5.446e-05 [graph_reusing]: 5.69e-06 [inline]: 2.15002e-06 [add_attr]: 0.00314259, [1] [add_attr_with_inline]: 0.00313424, [1] [Cycle 1]: 5.109e-05, [2] [tag_attr]: 1.341e-05 [meta_addattr_fg_expand]: 3.66999e-06 [parallel-infer-symbol]: 3.41001e-06 [pre_auto_parallel]: 2.539e-05 [insert-virtual-dataset]: 2.45997e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 1.74e-06 [pipeline_split]: 1.58002e-06 [optimize]: 0.00398194, [53] [py_interpret_to_execute]: 1.729e-05 [rewriter_before_opt_a]: 4.543e-05 [opt_a]: 0.00206151, [2] [Cycle 1]: 0.00144473, [45] [expand_dump_flag]: 2.89001e-06 [switch_simplify]: 2.545e-05 [loop_unroll]: 1.414e-05 [a_1]: 0.00028139 [with_stream_mark]: 1.615e-05 [recompute_prepare]: 8.60999e-06 [updatestate_depend_eliminate]: 3.69002e-06 [updatestate_assign_eliminate]: 3.36001e-06 [updatestate_loads_eliminate]: 3.27002e-06 [parameter_eliminate]: 1.75001e-06 [a_2]: 8.208e-05 [accelerated_algorithm]: 7.21001e-06 [shard]: 2.07001e-06 [meta_shard_fg_expand]: 1.69e-06 [shard_inline]: 6.38e-06 [merge_send_recv]: 8.64e-06 [auto_parallel]: 6.12001e-06 [parallel]: 1.79e-05 [flash_sp]: 8.14997e-06 [merge_comm]: 3.97e-06 [allreduce_fusion]: 4.09002e-06 [matmul_add_comm_reduction]: 1.02e-05 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 7.87e-06 [virtual_dataset]: 6.81001e-06 [get_grad_eliminate_]: 6.38e-06 [virtual_output]: 6.23e-06 [merge_forward]: 4.28999e-06 [cell_reuse_recompute_pass]: 1.20999e-06 [offload_activation]: 9.61e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.359e-05 [merge_recompute_call_nodes]: 1.95001e-06 [before_grad]: 1.078e-05 [set_forward_comm_id_for_comm_node_pass]: 3.67002e-06 [meta_fg_expand]: 2.54999e-06 [flash_sp_send_recv_attached]: 2.41998e-06 [receive_attached]: 2.43e-06 [after_resolve]: 1.018e-05 [a_after_grad]: 9.39998e-06 [renormalize]: 0.00049242 [add_forward_monad_depend]: 4.72998e-06 [auto_monad_grad]: 2.01e-06 [auto_monad_eliminator]: 1.481e-05 [cse]: 3.032e-05 [a_3]: 4.477e-05 [Cycle 2]: 0.00060688, [45] [expand_dump_flag]: 1.03001e-06 [switch_simplify]: 7.2e-06 [loop_unroll]: 5.94e-06 [a_1]: 0.00010485 [with_stream_mark]: 1.528e-05 [recompute_prepare]: 6.16998e-06 [updatestate_depend_eliminate]: 2.89999e-06 [updatestate_assign_eliminate]: 2.14e-06 [updatestate_loads_eliminate]: 3.03e-06 [parameter_eliminate]: 1.44998e-06 [a_2]: 7.001e-05 [accelerated_algorithm]: 5.87999e-06 [shard]: 1.10001e-06 [meta_shard_fg_expand]: 1.59e-06 [shard_inline]: 6.11e-06 [merge_send_recv]: 4.81002e-06 [auto_parallel]: 5.62001e-06 [parallel]: 5.12e-06 [flash_sp]: 3.65998e-06 [merge_comm]: 3.06999e-06 [allreduce_fusion]: 3.04001e-06 [matmul_add_comm_reduction]: 5.74e-06 [allreduce_slice_to_reducescatter]: 3.60014e-07 [virtual_shard_identity]: 6.77002e-06 [virtual_dataset]: 5.60001e-06 [get_grad_eliminate_]: 5.42001e-06 [virtual_output]: 5.25999e-06 [merge_forward]: 2.51e-06 [cell_reuse_recompute_pass]: 1.52001e-06 [offload_activation]: 6.02999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.212e-05 [merge_recompute_call_nodes]: 7.2e-07 [before_grad]: 8.52998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.58999e-06 [meta_fg_expand]: 1.94e-06 [flash_sp_send_recv_attached]: 8.40024e-07 [receive_attached]: 1.04e-06 [after_resolve]: 8.70001e-06 [a_after_grad]: 8.33001e-06 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.45999e-06 [auto_monad_grad]: 1.07998e-06 [auto_monad_eliminator]: 7.18998e-06 [cse]: 1.452e-05 [a_3]: 3.454e-05 [py_interpret_to_execute_after_opt_a]: 8.40999e-06 [slice_cell_reuse_recomputed_activation]: 2.27999e-06 [rewriter_after_opt_a]: 3.524e-05 [convert_after_rewriter]: 6.14001e-06 [order_py_execute_after_rewriter]: 5.39e-06 [mutable_eliminate]: 0.00048026 [opt_b]: 0.00022243, [1] [Cycle 1]: 0.0002164, [7] [b_1]: 0.00014235 [b_2]: 7.73999e-06 [updatestate_depend_eliminate]: 5.05999e-06 [updatestate_assign_eliminate]: 2.32001e-06 [updatestate_loads_eliminate]: 2.57001e-06 [renormalize]: 5.59987e-07 [cse]: 1.905e-05 [optimize_parallel_all_gather_comm]: 1.558e-05 [overlap_param_gather]: 1.83002e-06 [cconv]: 2.377e-05 [loop_unroll]: 0.00042099 [opt_after_cconv]: 9.687e-05, [1] [Cycle 1]: 9.131e-05, [7] [c_1]: 2.671e-05 [parameter_eliminate]: 2.71e-06 [updatestate_depend_eliminate]: 5.35999e-06 [updatestate_assign_eliminate]: 2.29999e-06 [updatestate_loads_eliminate]: 2.14999e-06 [cse]: 1.789e-05 [renormalize]: 3.60014e-07 [remove_dup_value]: 1.573e-05 [tuple_transform]: 7.074e-05, [1] [Cycle 1]: 6.643e-05, [4] [d_1]: 3.914e-05 [none_parameter_eliminate]: 1.86e-06 [renormalize]: 1.99972e-07 [switch_simplify]: 6.53e-06 [partial_unused_args_eliminate]: 1.86e-06 [add_recomputation]: 4.462e-05 [cse_after_recomputation]: 2.152e-05, [1] [Cycle 1]: 1.681e-05, [1] [cse]: 1.125e-05 [environ_conv]: 5.25001e-06 [swap_dp_allreduce_reducescatter]: 4.87e-06 [bias_add_comm_swap]: 2.86999e-06 [label_micro_interleaved_index]: 4.13999e-06 [label_fine_grained_interleaved_index]: 2.51998e-06 [merge_cast_opt]: 1.25001e-06 [slice_recompute_activation]: 1.96e-06 [micro_interleaved_order_control]: 2.39001e-06 [assign_add_opt]: 1.31998e-06 [ForceFp32Comm]: 7.80012e-07 [remove_cast_before_assign_add]: 9.5999e-07 [full_micro_interleaved_order_control]: 2.31998e-06 [reorder_send_recv_between_fp_bp]: 2.56e-06 [comm_op_add_attrs]: 9.60019e-07 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.02998e-06 [overlap_opt_shard_in_pipeline]: 1.20999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.99999e-06 [control_data_broadcast_order]: 1.233e-05 [grouped_pairwise_exchange_alltoall]: 1.78002e-06 [offloading_packed_experts]: 3.89002e-06 [overlap_recompute_and_grad_model_parallel]: 4.56002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.37e-06 [overlap_recompute_allgather_and_fa_grad]: 1.24e-06 [overlap_recompute_comm]: 2.24001e-06 [overlap_grad_ring_attention]: 4.27e-06 [overlap_grad_flash_sp]: 1.824e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.24999e-06 [split_layernorm_comm]: 1.92999e-06 [handle_group_info]: 9.79984e-07 [symbol_engine_optimizer]: 7.437e-05, [1] [Cycle 1]: 7.007e-05, [6] [build]: 2.62001e-06 [elim_shapecalc]: 9.56998e-06 [elim_not_effective]: 1.22e-05 [opt_reshape]: 6.69999e-06 [fold_const_symbol]: 9.88998e-06 [renormalize]: 2.10013e-07 [detach_backward]: 1.87001e-06 [pipeline_parallel_scheduler]: 1.48002e-06 [auto_monad_reorder]: 1.653e-05 [get_jit_bprop_graph]: 1.40999e-06 [rewriter_after_jit_bprop_graph]: 3.83001e-06 [opt_after_jit_grad]: 0.00046422 [validate]: 3.843e-05 Sums bootstrap : 0.000430s : 4.53% type_inference : 0.005237s : 55.18% event_method : 0.000012s : 0.12% auto_monad : 0.000054s : 0.57% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000013s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.04% pre_auto_parallel : 0.000025s : 0.27% insert-virtual-dataset : 0.000002s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000017s : 0.18% optimize.rewriter_before_opt_a : 0.000045s : 0.48% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.34% optimize.opt_a.loop_unroll : 0.000020s : 0.21% optimize.opt_a.a_1 : 0.000386s : 4.07% optimize.opt_a.with_stream_mark : 0.000031s : 0.33% optimize.opt_a.recompute_prepare : 0.000015s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000152s : 1.60% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.13% optimize.opt_a.merge_send_recv : 0.000013s : 0.14% optimize.opt_a.auto_parallel : 0.000012s : 0.12% optimize.opt_a.parallel : 0.000023s : 0.24% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.15% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000011s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000019s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000004s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.04% optimize.opt_a.after_resolve : 0.000019s : 0.20% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000493s : 5.19% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.23% optimize.opt_a.cse : 0.000045s : 0.47% optimize.opt_a.a_3 : 0.000079s : 0.84% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000035s : 0.37% optimize.convert_after_rewriter : 0.000006s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.06% optimize.mutable_eliminate : 0.000480s : 5.06% optimize.opt_b.b_1 : 0.000142s : 1.50% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000019s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.25% optimize.loop_unroll : 0.000421s : 4.44% optimize.opt_after_cconv.c_1 : 0.000027s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.17% optimize.tuple_transform.d_1 : 0.000039s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000045s : 0.47% optimize.cse_after_recomputation.cse : 0.000011s : 0.12% optimize.environ_conv : 0.000005s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.03% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000018s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.02% auto_monad_reorder : 0.000017s : 0.17% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000464s : 4.89% validate : 0.000038s : 0.40% Time group info: ------[substitution.] 0.000104 19 1.79% : 0.000002s : 2: substitution.elim_not_effective 1.52% : 0.000002s : 2: substitution.fold_const_symbol 5.29% : 0.000005s : 3: substitution.graph_param_transform 81.26% : 0.000084s : 2: substitution.inline 2.77% : 0.000003s : 4: substitution.j_node_and_user_rematch 4.64% : 0.000005s : 4: substitution.remove_not_recompute_node 2.74% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.005189 2 91.57% : 0.004752s : 1: type_inference.infer 8.43% : 0.000437s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000082 2 100.00% : 0.000082s : 2: match.inline ------[predicate.] 0.000130 754 0.77% : 0.000001s : 7: predicate.accumulaten_eliminater 1.34% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.65% : 0.000001s : 6: predicate.addn_check_dump 0.84% : 0.000001s : 7: predicate.addn_zero_filter 0.66% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.25% : 0.000003s : 13: predicate.arithmetic_simplify 0.86% : 0.000001s : 7: predicate.cast_eliminate 0.72% : 0.000001s : 6: predicate.check_bprop_eliminate 0.71% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.69% : 0.000001s : 6: predicate.depend_value_elim 0.75% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.94% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.78% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.15% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.29% : 0.000000s : 3: predicate.elim_not_effective 0.55% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.01% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.01% : 0.000001s : 10: predicate.environ_get_depend_swap 1.90% : 0.000002s : 16: predicate.environ_get_eliminate 1.05% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.95% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.95% : 0.000003s : 9: predicate.float_depend_g_call 0.66% : 0.000001s : 6: predicate.float_environ_get_switch 1.01% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.91% : 0.000001s : 6: predicate.get_grad_eliminate 0.24% : 0.000000s : 3: predicate.graph_param_transform 0.81% : 0.000001s : 6: predicate.incorporate_call 0.66% : 0.000001s : 6: predicate.incorporate_call_switch 6.40% : 0.000008s : 34: predicate.inline 1.20% : 0.000002s : 6: predicate.inline_without_move 0.43% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.00% : 0.000001s : 6: predicate.less_batch_normalization 1.59% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.24% : 0.000003s : 20: predicate.load_eliminater 1.30% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.71% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.62% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.66% : 0.000001s : 6: predicate.merge_addn 0.73% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.68% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.66% : 0.000001s : 7: predicate.minmaximum_grad 1.38% : 0.000002s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.47% : 0.000001s : 3: predicate.parallel_virtual_node 1.13% : 0.000001s : 9: predicate.partial_defer_inline 1.25% : 0.000002s : 10: predicate.partial_eliminate 0.84% : 0.000001s : 7: predicate.print_const_string_wrapper 0.70% : 0.000001s : 6: predicate.reduce_all_const_elim 1.07% : 0.000001s : 7: predicate.reduce_eliminate 2.05% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.63% : 0.000001s : 6: predicate.remove_not_recompute_node 1.18% : 0.000002s : 13: predicate.replace_applicator 0.65% : 0.000001s : 6: predicate.replace_old_param 0.35% : 0.000000s : 3: predicate.reset_defer_inline 0.86% : 0.000001s : 7: predicate.reshape_eliminate 0.72% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 3: predicate.row_tensor_eliminate 0.95% : 0.000001s : 6: predicate.same_eliminate 0.55% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.13% : 0.000001s : 6: predicate.shard_identity_eliminate 1.65% : 0.000002s : 6: predicate.special_op_eliminate 0.89% : 0.000001s : 6: predicate.specialize_transform 1.12% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.95% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.46% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.01% : 0.000001s : 9: predicate.switch_defer_inline 1.74% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.40% : 0.000006s : 32: predicate.switch_simplify 0.75% : 0.000001s : 7: predicate.tile_eliminate 0.79% : 0.000001s : 7: predicate.transpose_eliminate 1.60% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.41% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.64% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.52% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.03% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.78% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.46% : 0.000001s : 3: predicate.value_based_eliminate 1.01% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.83% : 0.000001s : 6: predicate.virtual_output_eliminate 0.35% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.56% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000202 5 8.06% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.94% : 0.000186s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022094 192 0.02% : 0.000003s : 1: ForceFp32Comm 14.25% : 0.003148s : 1: add_attr 14.20% : 0.003138s : 1: add_attr_with_inline 0.02% : 0.000003s : 1: add_comm_op_reuse_tag 0.22% : 0.000049s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000059s : 1: auto_monad 0.09% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.03% : 0.000006s : 1: bias_add_comm_swap 2.08% : 0.000460s : 1: bootstrap 0.12% : 0.000027s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000015s : 1: control_data_broadcast_order 0.04% : 0.000009s : 1: convert_after_rewriter 0.11% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000004s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.08% : 0.000018s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000005s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.94% : 0.000429s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.21% : 0.000489s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 3.40% : 0.000752s : 78: opt.transform.opt_a 0.11% : 0.000025s : 1: opt.transform.opt_after_cconv 0.11% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.53% : 0.000118s : 28: opt.transform.opt_b 0.20% : 0.000043s : 2: opt.transform.opt_trans_graph 0.16% : 0.000035s : 4: opt.transform.symbol_engine_opt 9.34% : 0.002064s : 1: opt_a 0.45% : 0.000100s : 1: opt_after_cconv 2.14% : 0.000474s : 1: opt_after_jit_grad 1.02% : 0.000226s : 1: opt_b 18.04% : 0.003987s : 1: optimize 0.09% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000021s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000029s : 1: pre_auto_parallel 0.10% : 0.000021s : 1: py_interpret_to_execute 0.05% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000019s : 1: remove_dup_value 1.21% : 0.000268s : 1: renormalize.infer 0.98% : 0.000216s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000039s : 1: rewriter_after_opt_a 0.22% : 0.000049s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000077s : 1: symbol_engine_optimizer 0.33% : 0.000074s : 1: tuple_transform 23.78% : 0.005254s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:39.430.015 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:39.430.304 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.017513, [21] [bootstrap]: 0.00048596 [type_inference]: 0.00639716 [event_method]: 1.577e-05 [auto_monad]: 6.464e-05 [graph_reusing]: 5.87001e-06 [inline]: 2.81e-06 [add_attr]: 0.00379282, [1] [add_attr_with_inline]: 0.00378208, [1] [Cycle 1]: 8.902e-05, [2] [tag_attr]: 1.842e-05 [meta_addattr_fg_expand]: 4.22e-06 [parallel-infer-symbol]: 3.41001e-06 [pre_auto_parallel]: 3.147e-05 [insert-virtual-dataset]: 2.37001e-06 [parallel-infer-symbol-second]: 9.70002e-07 [dataset_repeat_opt]: 2.69001e-06 [pipeline_split]: 1.68002e-06 [optimize]: 0.00547707, [53] [py_interpret_to_execute]: 2.832e-05 [rewriter_before_opt_a]: 5.989e-05 [opt_a]: 0.00294625, [2] [Cycle 1]: 0.00199586, [45] [expand_dump_flag]: 3.43e-06 [switch_simplify]: 2.697e-05 [loop_unroll]: 1.699e-05 [a_1]: 0.0003218 [with_stream_mark]: 2.594e-05 [recompute_prepare]: 1.212e-05 [updatestate_depend_eliminate]: 4.70001e-06 [updatestate_assign_eliminate]: 4.07e-06 [updatestate_loads_eliminate]: 2.91999e-06 [parameter_eliminate]: 2.56e-06 [a_2]: 0.00011538 [accelerated_algorithm]: 7.36999e-06 [shard]: 2.49001e-06 [meta_shard_fg_expand]: 2.32999e-06 [shard_inline]: 6.31998e-06 [merge_send_recv]: 9.25001e-06 [auto_parallel]: 8.38999e-06 [parallel]: 1.902e-05 [flash_sp]: 1.024e-05 [merge_comm]: 4.93001e-06 [allreduce_fusion]: 4.29002e-06 [matmul_add_comm_reduction]: 1.003e-05 [allreduce_slice_to_reducescatter]: 8.50006e-07 [virtual_shard_identity]: 1.025e-05 [virtual_dataset]: 7.3e-06 [get_grad_eliminate_]: 6.12999e-06 [virtual_output]: 8.41002e-06 [merge_forward]: 4.62e-06 [cell_reuse_recompute_pass]: 1.27e-06 [offload_activation]: 1.114e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.969e-05 [merge_recompute_call_nodes]: 1.63002e-06 [before_grad]: 1.268e-05 [set_forward_comm_id_for_comm_node_pass]: 5.11002e-06 [meta_fg_expand]: 2.79001e-06 [flash_sp_send_recv_attached]: 3.39001e-06 [receive_attached]: 1.92001e-06 [after_resolve]: 1.372e-05 [a_after_grad]: 1.021e-05 [renormalize]: 0.00068827 [add_forward_monad_depend]: 7.92998e-06 [auto_monad_grad]: 2.49001e-06 [auto_monad_eliminator]: 1.782e-05 [cse]: 3.052e-05 [a_3]: 6.458e-05 [Cycle 2]: 0.0009341, [45] [expand_dump_flag]: 1.44e-06 [switch_simplify]: 8.03999e-06 [loop_unroll]: 6.71e-06 [a_1]: 0.00010599 [with_stream_mark]: 1.634e-05 [recompute_prepare]: 6.93998e-06 [updatestate_depend_eliminate]: 3.16999e-06 [updatestate_assign_eliminate]: 3.11999e-06 [updatestate_loads_eliminate]: 2.94999e-06 [parameter_eliminate]: 1.80001e-06 [a_2]: 9.975e-05 [accelerated_algorithm]: 6.28998e-06 [shard]: 2.31998e-06 [meta_shard_fg_expand]: 2.06e-06 [shard_inline]: 7.23999e-06 [merge_send_recv]: 6.84999e-06 [auto_parallel]: 7.96001e-06 [parallel]: 6.59001e-06 [flash_sp]: 4.68001e-06 [merge_comm]: 3.97e-06 [allreduce_fusion]: 3.75e-06 [matmul_add_comm_reduction]: 1.142e-05 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 9.70002e-06 [virtual_dataset]: 6.01998e-06 [get_grad_eliminate_]: 6.18002e-06 [virtual_output]: 5.99999e-06 [merge_forward]: 5.25001e-06 [cell_reuse_recompute_pass]: 1.59e-06 [offload_activation]: 9.25999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.966e-05 [merge_recompute_call_nodes]: 1.69998e-06 [before_grad]: 1.108e-05 [set_forward_comm_id_for_comm_node_pass]: 5.34e-06 [meta_fg_expand]: 2.38002e-06 [flash_sp_send_recv_attached]: 1.63002e-06 [receive_attached]: 1.44e-06 [after_resolve]: 1.13e-05 [a_after_grad]: 9.76e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 4.617e-05 [auto_monad_grad]: 1.88002e-06 [auto_monad_eliminator]: 1.385e-05 [cse]: 2.295e-05 [a_3]: 5.393e-05 [py_interpret_to_execute_after_opt_a]: 1.777e-05 [slice_cell_reuse_recomputed_activation]: 5.38002e-06 [rewriter_after_opt_a]: 4.968e-05 [convert_after_rewriter]: 1.008e-05 [order_py_execute_after_rewriter]: 8.72e-06 [mutable_eliminate]: 0.000621 [opt_b]: 0.00029431, [1] [Cycle 1]: 0.00028382, [7] [b_1]: 0.00016722 [b_2]: 9.93998e-06 [updatestate_depend_eliminate]: 8.41002e-06 [updatestate_assign_eliminate]: 2.89001e-06 [updatestate_loads_eliminate]: 3.03e-06 [renormalize]: 1.12999e-06 [cse]: 2.876e-05 [optimize_parallel_all_gather_comm]: 2.221e-05 [overlap_param_gather]: 5.64e-06 [cconv]: 3.808e-05 [loop_unroll]: 0.00050549 [opt_after_cconv]: 0.00013829, [1] [Cycle 1]: 0.00012886, [7] [c_1]: 2.858e-05 [parameter_eliminate]: 4.72e-06 [updatestate_depend_eliminate]: 7.01001e-06 [updatestate_assign_eliminate]: 2.78e-06 [updatestate_loads_eliminate]: 2.68e-06 [cse]: 2.403e-05 [renormalize]: 7.50006e-07 [remove_dup_value]: 1.87e-05 [tuple_transform]: 9.288e-05, [1] [Cycle 1]: 8.558e-05, [4] [d_1]: 4.364e-05 [none_parameter_eliminate]: 1.76e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 7.35e-06 [partial_unused_args_eliminate]: 4.47e-06 [add_recomputation]: 5.523e-05 [cse_after_recomputation]: 3.121e-05, [1] [Cycle 1]: 2.368e-05, [1] [cse]: 1.28e-05 [environ_conv]: 9.24998e-06 [swap_dp_allreduce_reducescatter]: 8.64e-06 [bias_add_comm_swap]: 5.77999e-06 [label_micro_interleaved_index]: 7.45003e-06 [label_fine_grained_interleaved_index]: 5.04998e-06 [merge_cast_opt]: 3.63e-06 [slice_recompute_activation]: 4.45e-06 [micro_interleaved_order_control]: 5.12999e-06 [assign_add_opt]: 3.61999e-06 [ForceFp32Comm]: 3.46001e-06 [remove_cast_before_assign_add]: 3.78001e-06 [full_micro_interleaved_order_control]: 4.85999e-06 [reorder_send_recv_between_fp_bp]: 5.35001e-06 [comm_op_add_attrs]: 3.75998e-06 [add_comm_op_reuse_tag]: 3.28e-06 [interleave_split_concat_branches]: 3.53e-06 [interleave_parallel_branches]: 3.43e-06 [overlap_opt_shard_in_pipeline]: 3.53999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.59002e-06 [control_data_broadcast_order]: 1.653e-05 [grouped_pairwise_exchange_alltoall]: 3.9e-06 [offloading_packed_experts]: 6.91999e-06 [overlap_recompute_and_grad_model_parallel]: 7.35e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.43999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.44001e-06 [overlap_recompute_comm]: 4.85001e-06 [overlap_grad_ring_attention]: 6.66e-06 [overlap_grad_flash_sp]: 2.505e-05 [begin_end_overlap_inline]: 3.02002e-06 [split_matmul_comm_elemetwise]: 4.53999e-06 [split_layernorm_comm]: 4.1e-06 [handle_group_info]: 3.65e-06 [symbol_engine_optimizer]: 0.00010528, [1] [Cycle 1]: 9.769e-05, [6] [build]: 2.71e-06 [elim_shapecalc]: 1.209e-05 [elim_not_effective]: 1.425e-05 [opt_reshape]: 7.36999e-06 [fold_const_symbol]: 1.016e-05 [renormalize]: 3.80009e-07 [detach_backward]: 4.79998e-06 [pipeline_parallel_scheduler]: 1.91e-06 [auto_monad_reorder]: 1.989e-05 [get_jit_bprop_graph]: 1.96e-06 [rewriter_after_jit_bprop_graph]: 5.59e-06 [opt_after_jit_grad]: 0.00052947 [validate]: 4.325e-05 Sums bootstrap : 0.000486s : 4.12% type_inference : 0.006397s : 54.18% event_method : 0.000016s : 0.13% auto_monad : 0.000065s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000031s : 0.27% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000003s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.24% optimize.rewriter_before_opt_a : 0.000060s : 0.51% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000035s : 0.30% optimize.opt_a.loop_unroll : 0.000024s : 0.20% optimize.opt_a.a_1 : 0.000428s : 3.62% optimize.opt_a.with_stream_mark : 0.000042s : 0.36% optimize.opt_a.recompute_prepare : 0.000019s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000215s : 1.82% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.12% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000014s : 0.11% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000016s : 0.14% optimize.opt_a.parallel : 0.000026s : 0.22% optimize.opt_a.flash_sp : 0.000015s : 0.13% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.17% optimize.opt_a.virtual_dataset : 0.000013s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.10% optimize.opt_a.virtual_output : 0.000014s : 0.12% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000024s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.21% optimize.opt_a.a_after_grad : 0.000020s : 0.17% optimize.opt_a.renormalize : 0.000688s : 5.83% optimize.opt_a.add_forward_monad_depend : 0.000054s : 0.46% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.27% optimize.opt_a.cse : 0.000053s : 0.45% optimize.opt_a.a_3 : 0.000119s : 1.00% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000050s : 0.42% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000621s : 5.26% optimize.opt_b.b_1 : 0.000167s : 1.42% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000029s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.19% optimize.overlap_param_gather : 0.000006s : 0.05% optimize.cconv : 0.000038s : 0.32% optimize.loop_unroll : 0.000505s : 4.28% optimize.opt_after_cconv.c_1 : 0.000029s : 0.24% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000024s : 0.20% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000019s : 0.16% optimize.tuple_transform.d_1 : 0.000044s : 0.37% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000055s : 0.47% optimize.cse_after_recomputation.cse : 0.000013s : 0.11% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000003s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000025s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000529s : 4.48% validate : 0.000043s : 0.37% Time group info: ------[substitution.] 0.000136 19 1.34% : 0.000002s : 2: substitution.elim_not_effective 1.01% : 0.000001s : 2: substitution.fold_const_symbol 4.56% : 0.000006s : 3: substitution.graph_param_transform 81.52% : 0.000111s : 2: substitution.inline 3.66% : 0.000005s : 4: substitution.j_node_and_user_rematch 4.22% : 0.000006s : 4: substitution.remove_not_recompute_node 3.69% : 0.000005s : 2: substitution.replace_old_param ------[type_inference.] 0.006331 2 91.52% : 0.005795s : 1: type_inference.infer 8.48% : 0.000537s : 1: type_inference.specialize ------[replace.] 0.000025 2 100.00% : 0.000025s : 2: replace.inline ------[match.] 0.000109 2 100.00% : 0.000109s : 2: match.inline ------[predicate.] 0.000146 754 0.75% : 0.000001s : 7: predicate.accumulaten_eliminater 1.26% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.64% : 0.000001s : 6: predicate.addn_check_dump 0.72% : 0.000001s : 7: predicate.addn_zero_filter 0.61% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.18% : 0.000003s : 13: predicate.arithmetic_simplify 0.86% : 0.000001s : 7: predicate.cast_eliminate 0.80% : 0.000001s : 6: predicate.check_bprop_eliminate 0.94% : 0.000001s : 6: predicate.compare_switch_simplify 0.20% : 0.000000s : 3: predicate.const_output_eliminate 0.74% : 0.000001s : 6: predicate.depend_value_elim 0.78% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.82% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.74% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.50% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.38% : 0.000001s : 3: predicate.elim_not_effective 0.46% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 10: predicate.environ_add_const_eliminate 0.92% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_depend_swap 2.00% : 0.000003s : 16: predicate.environ_get_eliminate 0.96% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.93% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.82% : 0.000003s : 9: predicate.float_depend_g_call 0.61% : 0.000001s : 6: predicate.float_environ_get_switch 0.96% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 3: predicate.fold_const_symbol 0.76% : 0.000001s : 6: predicate.get_grad_eliminate 0.26% : 0.000000s : 3: predicate.graph_param_transform 0.74% : 0.000001s : 6: predicate.incorporate_call 0.61% : 0.000001s : 6: predicate.incorporate_call_switch 6.75% : 0.000010s : 34: predicate.inline 1.39% : 0.000002s : 6: predicate.inline_without_move 0.50% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.98% : 0.000001s : 6: predicate.less_batch_normalization 1.50% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.91% : 0.000003s : 20: predicate.load_eliminater 1.44% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.07% : 0.000003s : 14: predicate.loop_unroll_before_grad 1.48% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.64% : 0.000001s : 6: predicate.merge_addn 0.66% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.70% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.61% : 0.000001s : 7: predicate.minmaximum_grad 2.24% : 0.000003s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.42% : 0.000001s : 3: predicate.parallel_virtual_node 1.42% : 0.000002s : 9: predicate.partial_defer_inline 1.12% : 0.000002s : 10: predicate.partial_eliminate 0.78% : 0.000001s : 7: predicate.print_const_string_wrapper 0.83% : 0.000001s : 6: predicate.reduce_all_const_elim 0.95% : 0.000001s : 7: predicate.reduce_eliminate 2.00% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.84% : 0.000001s : 6: predicate.remove_not_recompute_node 1.22% : 0.000002s : 13: predicate.replace_applicator 0.85% : 0.000001s : 6: predicate.replace_old_param 0.45% : 0.000001s : 3: predicate.reset_defer_inline 0.92% : 0.000001s : 7: predicate.reshape_eliminate 0.70% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.68% : 0.000001s : 3: predicate.row_tensor_eliminate 1.33% : 0.000002s : 6: predicate.same_eliminate 0.59% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.15% : 0.000002s : 6: predicate.shard_identity_eliminate 0.89% : 0.000001s : 6: predicate.special_op_eliminate 0.97% : 0.000001s : 6: predicate.specialize_transform 1.41% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.95% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.95% : 0.000001s : 9: predicate.switch_defer_inline 1.77% : 0.000003s : 15: predicate.switch_layer_defer_inline 4.38% : 0.000006s : 32: predicate.switch_simplify 0.68% : 0.000001s : 7: predicate.tile_eliminate 0.68% : 0.000001s : 7: predicate.transpose_eliminate 1.50% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.50% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.38% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.34% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.80% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.69% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 3: predicate.value_based_eliminate 0.79% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.76% : 0.000001s : 6: predicate.virtual_output_eliminate 0.29% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.61% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000268 5 8.75% : 0.000023s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.25% : 0.000244s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028453 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.37% : 0.003805s : 1: add_attr 13.31% : 0.003786s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000060s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000076s : 1: auto_monad 0.09% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.88% : 0.000534s : 1: bootstrap 0.15% : 0.000042s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000009s : 1: dataset_repeat_opt 0.08% : 0.000022s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.09% : 0.000027s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.80% : 0.000514s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.22% : 0.000630s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.07% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000024s : 1: opt.transform.mutable_eliminate 2.96% : 0.000842s : 78: opt.transform.opt_a 0.09% : 0.000027s : 1: opt.transform.opt_after_cconv 0.10% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.36% : 0.000102s : 28: opt.transform.opt_b 0.17% : 0.000048s : 2: opt.transform.opt_trans_graph 0.14% : 0.000040s : 4: opt.transform.symbol_engine_opt 10.37% : 0.002950s : 1: opt_a 0.50% : 0.000142s : 1: opt_after_cconv 1.90% : 0.000540s : 1: opt_after_jit_grad 1.05% : 0.000299s : 1: opt_b 20.50% : 0.005834s : 1: optimize 0.10% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000010s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000039s : 1: pre_auto_parallel 0.11% : 0.000032s : 1: py_interpret_to_execute 0.08% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.36% : 0.000386s : 1: renormalize.infer 1.02% : 0.000291s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000054s : 1: rewriter_after_opt_a 0.22% : 0.000064s : 1: rewriter_before_opt_a 0.03% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000109s : 1: symbol_engine_optimizer 0.34% : 0.000096s : 1: tuple_transform 22.63% : 0.006438s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:39.634.940 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0148527, [21] [bootstrap]: 0.00044556 [type_inference]: 0.00556886 [event_method]: 1.325e-05 [auto_monad]: 5.647e-05 [graph_reusing]: 5.82001e-06 [inline]: 3.04001e-06 [add_attr]: 0.00338602, [1] [add_attr_with_inline]: 0.00337649, [1] [Cycle 1]: 5.982e-05, [2] [tag_attr]: 1.607e-05 [meta_addattr_fg_expand]: 3.56999e-06 [parallel-infer-symbol]: 3.04999e-06 [pre_auto_parallel]: 2.812e-05 [insert-virtual-dataset]: 2.51998e-06 [parallel-infer-symbol-second]: 8.10018e-07 [dataset_repeat_opt]: 1.89e-06 [pipeline_split]: 1.64998e-06 [optimize]: 0.00450134, [53] [py_interpret_to_execute]: 2.05e-05 [rewriter_before_opt_a]: 4.921e-05 [opt_a]: 0.00230838, [2] [Cycle 1]: 0.00164364, [45] [expand_dump_flag]: 2.78998e-06 [switch_simplify]: 2.642e-05 [loop_unroll]: 1.341e-05 [a_1]: 0.00030219 [with_stream_mark]: 1.828e-05 [recompute_prepare]: 8.08001e-06 [updatestate_depend_eliminate]: 4.4e-06 [updatestate_assign_eliminate]: 3.51001e-06 [updatestate_loads_eliminate]: 2.87002e-06 [parameter_eliminate]: 2.11e-06 [a_2]: 8.066e-05 [accelerated_algorithm]: 7.28e-06 [shard]: 2.26998e-06 [meta_shard_fg_expand]: 1.71e-06 [shard_inline]: 6.59999e-06 [merge_send_recv]: 9.50001e-06 [auto_parallel]: 6.79999e-06 [parallel]: 1.874e-05 [flash_sp]: 8.08999e-06 [merge_comm]: 3.93999e-06 [allreduce_fusion]: 3.46999e-06 [matmul_add_comm_reduction]: 1.009e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 9.33997e-06 [virtual_dataset]: 6.24001e-06 [get_grad_eliminate_]: 6.41e-06 [virtual_output]: 6.80998e-06 [merge_forward]: 3.88001e-06 [cell_reuse_recompute_pass]: 1.38002e-06 [offload_activation]: 1.053e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.342e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 1.125e-05 [set_forward_comm_id_for_comm_node_pass]: 3.48999e-06 [meta_fg_expand]: 2.68e-06 [flash_sp_send_recv_attached]: 3.86999e-06 [receive_attached]: 2.34001e-06 [after_resolve]: 1.047e-05 [a_after_grad]: 8.78001e-06 [renormalize]: 0.00060872 [add_forward_monad_depend]: 5.81998e-06 [auto_monad_grad]: 2.62001e-06 [auto_monad_eliminator]: 1.596e-05 [cse]: 3.254e-05 [a_3]: 4.948e-05 [Cycle 2]: 0.00065419, [45] [expand_dump_flag]: 1.52001e-06 [switch_simplify]: 7.53999e-06 [loop_unroll]: 5.94e-06 [a_1]: 0.00010666 [with_stream_mark]: 1.269e-05 [recompute_prepare]: 6.40002e-06 [updatestate_depend_eliminate]: 3.78001e-06 [updatestate_assign_eliminate]: 2.36e-06 [updatestate_loads_eliminate]: 3.01001e-06 [parameter_eliminate]: 1.27999e-06 [a_2]: 7.053e-05 [accelerated_algorithm]: 6.07999e-06 [shard]: 1.40999e-06 [meta_shard_fg_expand]: 1.54e-06 [shard_inline]: 5.99e-06 [merge_send_recv]: 5.81998e-06 [auto_parallel]: 6.73e-06 [parallel]: 6.02999e-06 [flash_sp]: 3.83001e-06 [merge_comm]: 3.25002e-06 [allreduce_fusion]: 3.16999e-06 [matmul_add_comm_reduction]: 6.87002e-06 [allreduce_slice_to_reducescatter]: 8.39995e-07 [virtual_shard_identity]: 8.48001e-06 [virtual_dataset]: 6.28e-06 [get_grad_eliminate_]: 5.98998e-06 [virtual_output]: 5.77001e-06 [merge_forward]: 3.45998e-06 [cell_reuse_recompute_pass]: 1.56002e-06 [offload_activation]: 8.12e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.415e-05 [merge_recompute_call_nodes]: 1.78002e-06 [before_grad]: 1.013e-05 [set_forward_comm_id_for_comm_node_pass]: 4.69002e-06 [meta_fg_expand]: 2.33998e-06 [flash_sp_send_recv_attached]: 1.05999e-06 [receive_attached]: 1.19e-06 [after_resolve]: 1.073e-05 [a_after_grad]: 9.07001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.51002e-06 [auto_monad_grad]: 1.37e-06 [auto_monad_eliminator]: 8.13001e-06 [cse]: 1.805e-05 [a_3]: 3.505e-05 [py_interpret_to_execute_after_opt_a]: 1.06e-05 [slice_cell_reuse_recomputed_activation]: 2.24001e-06 [rewriter_after_opt_a]: 3.913e-05 [convert_after_rewriter]: 6.36998e-06 [order_py_execute_after_rewriter]: 6.48998e-06 [mutable_eliminate]: 0.00056591 [opt_b]: 0.00022, [1] [Cycle 1]: 0.0002126, [7] [b_1]: 0.00012295 [b_2]: 7.82998e-06 [updatestate_depend_eliminate]: 8.55999e-06 [updatestate_assign_eliminate]: 2.63e-06 [updatestate_loads_eliminate]: 3.07002e-06 [renormalize]: 7.90023e-07 [cse]: 2.776e-05 [optimize_parallel_all_gather_comm]: 1.877e-05 [overlap_param_gather]: 2.36e-06 [cconv]: 3.125e-05 [loop_unroll]: 0.00049766 [opt_after_cconv]: 0.00010928, [1] [Cycle 1]: 0.00010306, [7] [c_1]: 2.963e-05 [parameter_eliminate]: 4.32e-06 [updatestate_depend_eliminate]: 6.61e-06 [updatestate_assign_eliminate]: 2.59001e-06 [updatestate_loads_eliminate]: 2.31e-06 [cse]: 2.144e-05 [renormalize]: 5.3001e-07 [remove_dup_value]: 1.613e-05 [tuple_transform]: 7.889e-05, [1] [Cycle 1]: 7.394e-05, [4] [d_1]: 4.475e-05 [none_parameter_eliminate]: 1.70001e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 6.52001e-06 [partial_unused_args_eliminate]: 1.83002e-06 [add_recomputation]: 5.452e-05 [cse_after_recomputation]: 2.714e-05, [1] [Cycle 1]: 2.13e-05, [1] [cse]: 1.43e-05 [environ_conv]: 5.69e-06 [swap_dp_allreduce_reducescatter]: 5.48002e-06 [bias_add_comm_swap]: 3.09999e-06 [label_micro_interleaved_index]: 5.19e-06 [label_fine_grained_interleaved_index]: 3.29001e-06 [merge_cast_opt]: 1.35999e-06 [slice_recompute_activation]: 2.22001e-06 [micro_interleaved_order_control]: 2.39999e-06 [assign_add_opt]: 1.72999e-06 [ForceFp32Comm]: 1.22e-06 [remove_cast_before_assign_add]: 9.5999e-07 [full_micro_interleaved_order_control]: 2.64001e-06 [reorder_send_recv_between_fp_bp]: 2.96001e-06 [comm_op_add_attrs]: 1.32e-06 [add_comm_op_reuse_tag]: 1.25999e-06 [interleave_split_concat_branches]: 1.30001e-06 [interleave_parallel_branches]: 1.12e-06 [overlap_opt_shard_in_pipeline]: 1.27e-06 [overlap_opt_shard_grad_in_pipeline]: 1.92001e-06 [control_data_broadcast_order]: 1.456e-05 [grouped_pairwise_exchange_alltoall]: 1.45001e-06 [offloading_packed_experts]: 4.79e-06 [overlap_recompute_and_grad_model_parallel]: 4.84e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.54e-06 [overlap_recompute_allgather_and_fa_grad]: 1.29e-06 [overlap_recompute_comm]: 2.53e-06 [overlap_grad_ring_attention]: 4.86002e-06 [overlap_grad_flash_sp]: 2.104e-05 [begin_end_overlap_inline]: 6.00005e-07 [split_matmul_comm_elemetwise]: 2.31e-06 [split_layernorm_comm]: 1.64e-06 [handle_group_info]: 1.24e-06 [symbol_engine_optimizer]: 8.749e-05, [1] [Cycle 1]: 8.288e-05, [6] [build]: 3.51001e-06 [elim_shapecalc]: 1.302e-05 [elim_not_effective]: 1.478e-05 [opt_reshape]: 7.2e-06 [fold_const_symbol]: 1.122e-05 [renormalize]: 4.40021e-07 [detach_backward]: 2.37001e-06 [pipeline_parallel_scheduler]: 1.94e-06 [auto_monad_reorder]: 1.875e-05 [get_jit_bprop_graph]: 2.01e-06 [rewriter_after_jit_bprop_graph]: 5.56e-06 [opt_after_jit_grad]: 0.00059092 [validate]: 4.515e-05 Sums bootstrap : 0.000446s : 4.28% type_inference : 0.005569s : 53.54% event_method : 0.000013s : 0.13% auto_monad : 0.000056s : 0.54% graph_reusing : 0.000006s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.03% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000028s : 0.27% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000021s : 0.20% optimize.rewriter_before_opt_a : 0.000049s : 0.47% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000034s : 0.33% optimize.opt_a.loop_unroll : 0.000019s : 0.19% optimize.opt_a.a_1 : 0.000409s : 3.93% optimize.opt_a.with_stream_mark : 0.000031s : 0.30% optimize.opt_a.recompute_prepare : 0.000014s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000151s : 1.45% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.13% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.12% optimize.opt_a.merge_send_recv : 0.000015s : 0.15% optimize.opt_a.auto_parallel : 0.000014s : 0.13% optimize.opt_a.parallel : 0.000025s : 0.24% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.17% optimize.opt_a.virtual_dataset : 0.000013s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000013s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000021s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.05% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000021s : 0.20% optimize.opt_a.a_after_grad : 0.000018s : 0.17% optimize.opt_a.renormalize : 0.000609s : 5.85% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.23% optimize.opt_a.cse : 0.000051s : 0.49% optimize.opt_a.a_3 : 0.000085s : 0.81% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000039s : 0.38% optimize.convert_after_rewriter : 0.000006s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000566s : 5.44% optimize.opt_b.b_1 : 0.000123s : 1.18% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000028s : 0.27% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000031s : 0.30% optimize.loop_unroll : 0.000498s : 4.78% optimize.opt_after_cconv.c_1 : 0.000030s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000021s : 0.21% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000016s : 0.16% optimize.tuple_transform.d_1 : 0.000045s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000055s : 0.52% optimize.cse_after_recomputation.cse : 0.000014s : 0.14% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.02% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000021s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.13% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000019s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000591s : 5.68% validate : 0.000045s : 0.43% Time group info: ------[substitution.] 0.000118 19 1.71% : 0.000002s : 2: substitution.elim_not_effective 1.47% : 0.000002s : 2: substitution.fold_const_symbol 5.29% : 0.000006s : 3: substitution.graph_param_transform 81.48% : 0.000096s : 2: substitution.inline 3.17% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.90% : 0.000005s : 4: substitution.remove_not_recompute_node 2.98% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.005517 2 91.35% : 0.005040s : 1: type_inference.infer 8.65% : 0.000477s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000094 2 100.00% : 0.000094s : 2: match.inline ------[predicate.] 0.000139 754 0.95% : 0.000001s : 7: predicate.accumulaten_eliminater 1.76% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.65% : 0.000001s : 6: predicate.addn_check_dump 0.95% : 0.000001s : 7: predicate.addn_zero_filter 0.70% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.03% : 0.000003s : 13: predicate.arithmetic_simplify 0.77% : 0.000001s : 7: predicate.cast_eliminate 0.71% : 0.000001s : 6: predicate.check_bprop_eliminate 0.69% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.68% : 0.000001s : 6: predicate.depend_value_elim 0.76% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.97% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.75% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.36% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.52% : 0.000001s : 3: predicate.elim_not_effective 0.49% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.14% : 0.000002s : 10: predicate.environ_get_add_eliminate 1.00% : 0.000001s : 10: predicate.environ_get_depend_swap 1.79% : 0.000002s : 16: predicate.environ_get_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.87% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.84% : 0.000003s : 9: predicate.float_depend_g_call 0.62% : 0.000001s : 6: predicate.float_environ_get_switch 0.91% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.77% : 0.000001s : 6: predicate.get_grad_eliminate 0.24% : 0.000000s : 3: predicate.graph_param_transform 0.73% : 0.000001s : 6: predicate.incorporate_call 0.61% : 0.000001s : 6: predicate.incorporate_call_switch 6.30% : 0.000009s : 34: predicate.inline 0.96% : 0.000001s : 6: predicate.inline_without_move 0.37% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.20% : 0.000002s : 6: predicate.less_batch_normalization 1.54% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.01% : 0.000003s : 20: predicate.load_eliminater 1.76% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.51% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.67% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.65% : 0.000001s : 6: predicate.merge_addn 0.70% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.72% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 7: predicate.minmaximum_grad 1.84% : 0.000003s : 3: predicate.mutable_eliminate 0.47% : 0.000001s : 3: predicate.opt_reshape 0.71% : 0.000001s : 3: predicate.parallel_virtual_node 1.31% : 0.000002s : 9: predicate.partial_defer_inline 1.21% : 0.000002s : 10: predicate.partial_eliminate 0.82% : 0.000001s : 7: predicate.print_const_string_wrapper 0.65% : 0.000001s : 6: predicate.reduce_all_const_elim 0.96% : 0.000001s : 7: predicate.reduce_eliminate 1.99% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.76% : 0.000001s : 6: predicate.remove_not_recompute_node 1.10% : 0.000002s : 13: predicate.replace_applicator 1.00% : 0.000001s : 6: predicate.replace_old_param 0.47% : 0.000001s : 3: predicate.reset_defer_inline 0.85% : 0.000001s : 7: predicate.reshape_eliminate 0.75% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 3: predicate.row_tensor_eliminate 0.90% : 0.000001s : 6: predicate.same_eliminate 0.67% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.26% : 0.000002s : 6: predicate.shard_identity_eliminate 1.02% : 0.000001s : 6: predicate.special_op_eliminate 0.89% : 0.000001s : 6: predicate.specialize_transform 1.11% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.96% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.05% : 0.000001s : 9: predicate.switch_defer_inline 1.71% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.25% : 0.000006s : 32: predicate.switch_simplify 0.80% : 0.000001s : 7: predicate.tile_eliminate 0.74% : 0.000001s : 7: predicate.transpose_eliminate 1.53% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.28% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.14% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.65% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.58% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.46% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.90% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.73% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.51% : 0.000001s : 3: predicate.value_based_eliminate 0.85% : 0.000001s : 6: predicate.virtual_dataset_eliminate 1.07% : 0.000001s : 6: predicate.virtual_output_eliminate 0.33% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.57% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000229 5 9.30% : 0.000021s : 1: func_graph_cloner_run.FuncGraphClonerGraph 90.70% : 0.000208s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024277 192 0.02% : 0.000005s : 1: ForceFp32Comm 13.97% : 0.003392s : 1: add_attr 13.92% : 0.003380s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000059s : 1: add_recomputation 0.02% : 0.000005s : 1: assign_add_opt 0.25% : 0.000061s : 1: auto_monad 0.09% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.96% : 0.000476s : 1: bootstrap 0.14% : 0.000035s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.13% : 0.000030s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.03% : 0.000006s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.08% : 0.000020s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.03% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 2.09% : 0.000508s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.38% : 0.000577s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.08% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000019s : 1: opt.transform.mutable_eliminate 3.23% : 0.000785s : 78: opt.transform.opt_a 0.12% : 0.000028s : 1: opt.transform.opt_after_cconv 0.13% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000098s : 28: opt.transform.opt_b 0.20% : 0.000049s : 2: opt.transform.opt_trans_graph 0.17% : 0.000042s : 4: opt.transform.symbol_engine_opt 9.52% : 0.002312s : 1: opt_a 0.47% : 0.000113s : 1: opt_after_cconv 2.49% : 0.000604s : 1: opt_after_jit_grad 0.92% : 0.000224s : 1: opt_b 18.56% : 0.004507s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000010s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000032s : 1: pre_auto_parallel 0.10% : 0.000024s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000020s : 1: remove_dup_value 1.39% : 0.000338s : 1: renormalize.infer 1.08% : 0.000263s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000045s : 1: rewriter_after_opt_a 0.22% : 0.000053s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.37% : 0.000090s : 1: symbol_engine_optimizer 0.34% : 0.000082s : 1: tuple_transform 23.03% : 0.005591s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:39.838.266 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:39.838.559 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0140946, [21] [bootstrap]: 0.00042099 [type_inference]: 0.00458492 [event_method]: 1.188e-05 [auto_monad]: 5.216e-05 [graph_reusing]: 5.08002e-06 [inline]: 2.16003e-06 [add_attr]: 0.00313548, [1] [add_attr_with_inline]: 0.00312687, [1] [Cycle 1]: 6.64e-05, [2] [tag_attr]: 1.32e-05 [meta_addattr_fg_expand]: 3.69002e-06 [parallel-infer-symbol]: 3.26001e-06 [pre_auto_parallel]: 2.56e-05 [insert-virtual-dataset]: 2.57001e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.97001e-06 [pipeline_split]: 1.51002e-06 [optimize]: 0.00472324, [53] [py_interpret_to_execute]: 2.037e-05 [rewriter_before_opt_a]: 4.847e-05 [opt_a]: 0.00247173, [2] [Cycle 1]: 0.0016722, [45] [expand_dump_flag]: 3.01001e-06 [switch_simplify]: 2.387e-05 [loop_unroll]: 1.409e-05 [a_1]: 0.00028606 [with_stream_mark]: 1.638e-05 [recompute_prepare]: 7.75e-06 [updatestate_depend_eliminate]: 4.10998e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 2.88e-06 [parameter_eliminate]: 2.04e-06 [a_2]: 0.00010806 [accelerated_algorithm]: 6.93e-06 [shard]: 2.19001e-06 [meta_shard_fg_expand]: 1.76e-06 [shard_inline]: 6.49999e-06 [merge_send_recv]: 8.05e-06 [auto_parallel]: 6.94999e-06 [parallel]: 1.831e-05 [flash_sp]: 8.12e-06 [merge_comm]: 3.68e-06 [allreduce_fusion]: 3.37002e-06 [matmul_add_comm_reduction]: 9.24e-06 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 7.36999e-06 [virtual_dataset]: 5.99999e-06 [get_grad_eliminate_]: 6.36e-06 [virtual_output]: 6.40002e-06 [merge_forward]: 3.90998e-06 [cell_reuse_recompute_pass]: 1.30001e-06 [offload_activation]: 9.91e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.415e-05 [merge_recompute_call_nodes]: 1.44998e-06 [before_grad]: 1.007e-05 [set_forward_comm_id_for_comm_node_pass]: 3.76001e-06 [meta_fg_expand]: 2.68e-06 [flash_sp_send_recv_attached]: 2.81999e-06 [receive_attached]: 2.49001e-06 [after_resolve]: 9.71998e-06 [a_after_grad]: 8.87e-06 [renormalize]: 0.00054689 [add_forward_monad_depend]: 4.55001e-06 [auto_monad_grad]: 2.12001e-06 [auto_monad_eliminator]: 1.47e-05 [cse]: 2.983e-05 [a_3]: 5.919e-05 [Cycle 2]: 0.00078675, [45] [expand_dump_flag]: 1.05001e-06 [switch_simplify]: 7.43e-06 [loop_unroll]: 5.98002e-06 [a_1]: 0.00010548 [with_stream_mark]: 1.132e-05 [recompute_prepare]: 6.21e-06 [updatestate_depend_eliminate]: 3.19001e-06 [updatestate_assign_eliminate]: 2.29001e-06 [updatestate_loads_eliminate]: 2.37001e-06 [parameter_eliminate]: 1.13001e-06 [a_2]: 9.735e-05 [accelerated_algorithm]: 5.94999e-06 [shard]: 1.46002e-06 [meta_shard_fg_expand]: 1.37999e-06 [shard_inline]: 6.53003e-06 [merge_send_recv]: 4.66002e-06 [auto_parallel]: 5.52999e-06 [parallel]: 5.64e-06 [flash_sp]: 3.44001e-06 [merge_comm]: 3.36001e-06 [allreduce_fusion]: 2.95998e-06 [matmul_add_comm_reduction]: 5.44e-06 [allreduce_slice_to_reducescatter]: 5.09986e-07 [virtual_shard_identity]: 6.74999e-06 [virtual_dataset]: 5.74999e-06 [get_grad_eliminate_]: 5.49998e-06 [virtual_output]: 5.81e-06 [merge_forward]: 3.31001e-06 [cell_reuse_recompute_pass]: 1.59e-06 [offload_activation]: 7.11999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.48e-05 [merge_recompute_call_nodes]: 8.70001e-07 [before_grad]: 9.29e-06 [set_forward_comm_id_for_comm_node_pass]: 4.2e-06 [meta_fg_expand]: 2.74999e-06 [flash_sp_send_recv_attached]: 9.09989e-07 [receive_attached]: 1.08001e-06 [after_resolve]: 8.56997e-06 [a_after_grad]: 8.48999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.21997e-06 [auto_monad_grad]: 1.05001e-06 [auto_monad_eliminator]: 7.60998e-06 [cse]: 1.558e-05 [a_3]: 4.889e-05 [py_interpret_to_execute_after_opt_a]: 1.28e-05 [slice_cell_reuse_recomputed_activation]: 4.91002e-06 [rewriter_after_opt_a]: 4.023e-05 [convert_after_rewriter]: 1.012e-05 [order_py_execute_after_rewriter]: 8.15999e-06 [mutable_eliminate]: 0.00052585 [opt_b]: 0.00027912, [1] [Cycle 1]: 0.00027084, [7] [b_1]: 0.00017484 [b_2]: 8.10999e-06 [updatestate_depend_eliminate]: 6.42001e-06 [updatestate_assign_eliminate]: 2.54001e-06 [updatestate_loads_eliminate]: 2.51e-06 [renormalize]: 5.59987e-07 [cse]: 2.017e-05 [optimize_parallel_all_gather_comm]: 1.955e-05 [overlap_param_gather]: 4.87e-06 [cconv]: 2.805e-05 [loop_unroll]: 0.00044919 [opt_after_cconv]: 0.00012411, [1] [Cycle 1]: 0.0001156, [7] [c_1]: 2.827e-05 [parameter_eliminate]: 2.66e-06 [updatestate_depend_eliminate]: 5.89999e-06 [updatestate_assign_eliminate]: 2.55997e-06 [updatestate_loads_eliminate]: 2.50002e-06 [cse]: 1.762e-05 [renormalize]: 5.49975e-07 [remove_dup_value]: 1.884e-05 [tuple_transform]: 8.598e-05, [1] [Cycle 1]: 7.882e-05, [4] [d_1]: 3.987e-05 [none_parameter_eliminate]: 1.59e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 6.53e-06 [partial_unused_args_eliminate]: 4.35e-06 [add_recomputation]: 4.978e-05 [cse_after_recomputation]: 2.815e-05, [1] [Cycle 1]: 2.112e-05, [1] [cse]: 1.196e-05 [environ_conv]: 8.1e-06 [swap_dp_allreduce_reducescatter]: 8.46002e-06 [bias_add_comm_swap]: 5.25999e-06 [label_micro_interleaved_index]: 7.24001e-06 [label_fine_grained_interleaved_index]: 5.27001e-06 [merge_cast_opt]: 4.18999e-06 [slice_recompute_activation]: 4.63001e-06 [micro_interleaved_order_control]: 5.22e-06 [assign_add_opt]: 3.56999e-06 [ForceFp32Comm]: 3.51001e-06 [remove_cast_before_assign_add]: 3.33e-06 [full_micro_interleaved_order_control]: 4.57e-06 [reorder_send_recv_between_fp_bp]: 4.89e-06 [comm_op_add_attrs]: 4.02002e-06 [add_comm_op_reuse_tag]: 3.2e-06 [interleave_split_concat_branches]: 3.56999e-06 [interleave_parallel_branches]: 3.46999e-06 [overlap_opt_shard_in_pipeline]: 3.81999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.19002e-06 [control_data_broadcast_order]: 1.526e-05 [grouped_pairwise_exchange_alltoall]: 4.3e-06 [offloading_packed_experts]: 6.70002e-06 [overlap_recompute_and_grad_model_parallel]: 7.47998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.65998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.81001e-06 [overlap_recompute_comm]: 5.00001e-06 [overlap_grad_ring_attention]: 6.77002e-06 [overlap_grad_flash_sp]: 2.226e-05 [begin_end_overlap_inline]: 2.84001e-06 [split_matmul_comm_elemetwise]: 4.52e-06 [split_layernorm_comm]: 4.09002e-06 [handle_group_info]: 3.79002e-06 [symbol_engine_optimizer]: 9.944e-05, [1] [Cycle 1]: 9.217e-05, [6] [build]: 2.82002e-06 [elim_shapecalc]: 9.92999e-06 [elim_not_effective]: 1.294e-05 [opt_reshape]: 7.09001e-06 [fold_const_symbol]: 1.077e-05 [renormalize]: 2.30008e-07 [detach_backward]: 3.99002e-06 [pipeline_parallel_scheduler]: 1.96998e-06 [auto_monad_reorder]: 1.826e-05 [get_jit_bprop_graph]: 1.72999e-06 [rewriter_after_jit_bprop_graph]: 5.37999e-06 [opt_after_jit_grad]: 0.00050529 [validate]: 3.945e-05 Sums bootstrap : 0.000421s : 4.54% type_inference : 0.004585s : 49.49% event_method : 0.000012s : 0.13% auto_monad : 0.000052s : 0.56% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000013s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.04% pre_auto_parallel : 0.000026s : 0.28% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000020s : 0.22% optimize.rewriter_before_opt_a : 0.000048s : 0.52% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000031s : 0.34% optimize.opt_a.loop_unroll : 0.000020s : 0.22% optimize.opt_a.a_1 : 0.000392s : 4.23% optimize.opt_a.with_stream_mark : 0.000028s : 0.30% optimize.opt_a.recompute_prepare : 0.000014s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000205s : 2.22% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.14% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.14% optimize.opt_a.merge_send_recv : 0.000013s : 0.14% optimize.opt_a.auto_parallel : 0.000012s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.26% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000006s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.15% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.13% optimize.opt_a.virtual_output : 0.000012s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.03% optimize.opt_a.before_grad : 0.000019s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000018s : 0.20% optimize.opt_a.a_after_grad : 0.000017s : 0.19% optimize.opt_a.renormalize : 0.000547s : 5.90% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.24% optimize.opt_a.cse : 0.000045s : 0.49% optimize.opt_a.a_3 : 0.000108s : 1.17% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000040s : 0.43% optimize.convert_after_rewriter : 0.000010s : 0.11% optimize.order_py_execute_after_rewriter : 0.000008s : 0.09% optimize.mutable_eliminate : 0.000526s : 5.68% optimize.opt_b.b_1 : 0.000175s : 1.89% optimize.opt_b.b_2 : 0.000008s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000020s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000028s : 0.30% optimize.loop_unroll : 0.000449s : 4.85% optimize.opt_after_cconv.c_1 : 0.000028s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.19% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000019s : 0.20% optimize.tuple_transform.d_1 : 0.000040s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.05% optimize.add_recomputation : 0.000050s : 0.54% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000008s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.09% optimize.bias_add_comm_swap : 0.000005s : 0.06% optimize.label_micro_interleaved_index : 0.000007s : 0.08% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.06% optimize.merge_cast_opt : 0.000004s : 0.05% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.06% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000004s : 0.04% optimize.remove_cast_before_assign_add : 0.000003s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.05% optimize.control_data_broadcast_order : 0.000015s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.05% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000022s : 0.24% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.06% opt_after_jit_grad : 0.000505s : 5.45% validate : 0.000039s : 0.43% Time group info: ------[substitution.] 0.000107 19 1.73% : 0.000002s : 2: substitution.elim_not_effective 1.32% : 0.000001s : 2: substitution.fold_const_symbol 5.43% : 0.000006s : 3: substitution.graph_param_transform 81.60% : 0.000088s : 2: substitution.inline 2.94% : 0.000003s : 4: substitution.j_node_and_user_rematch 4.29% : 0.000005s : 4: substitution.remove_not_recompute_node 2.69% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004541 2 90.83% : 0.004125s : 1: type_inference.infer 9.17% : 0.000417s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000086 2 100.00% : 0.000086s : 2: match.inline ------[predicate.] 0.000135 754 0.83% : 0.000001s : 7: predicate.accumulaten_eliminater 1.09% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.74% : 0.000001s : 6: predicate.addn_check_dump 0.82% : 0.000001s : 7: predicate.addn_zero_filter 0.68% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.26% : 0.000003s : 13: predicate.arithmetic_simplify 1.01% : 0.000001s : 7: predicate.cast_eliminate 0.75% : 0.000001s : 6: predicate.check_bprop_eliminate 0.68% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.79% : 0.000001s : 6: predicate.depend_value_elim 0.71% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.88% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.73% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.42% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 3: predicate.elim_not_effective 0.68% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000002s : 10: predicate.environ_add_const_eliminate 1.09% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_depend_swap 1.71% : 0.000002s : 16: predicate.environ_get_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.98% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.90% : 0.000003s : 9: predicate.float_depend_g_call 0.69% : 0.000001s : 6: predicate.float_environ_get_switch 1.11% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 3: predicate.fold_const_symbol 0.79% : 0.000001s : 6: predicate.get_grad_eliminate 0.29% : 0.000000s : 3: predicate.graph_param_transform 0.78% : 0.000001s : 6: predicate.incorporate_call 0.67% : 0.000001s : 6: predicate.incorporate_call_switch 6.43% : 0.000009s : 34: predicate.inline 0.99% : 0.000001s : 6: predicate.inline_without_move 0.45% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.05% : 0.000001s : 6: predicate.less_batch_normalization 1.51% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.07% : 0.000003s : 20: predicate.load_eliminater 1.31% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.65% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.86% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.69% : 0.000001s : 6: predicate.merge_addn 0.68% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.85% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.66% : 0.000001s : 7: predicate.minmaximum_grad 1.50% : 0.000002s : 3: predicate.mutable_eliminate 0.42% : 0.000001s : 3: predicate.opt_reshape 0.78% : 0.000001s : 3: predicate.parallel_virtual_node 1.23% : 0.000002s : 9: predicate.partial_defer_inline 1.25% : 0.000002s : 10: predicate.partial_eliminate 0.82% : 0.000001s : 7: predicate.print_const_string_wrapper 0.71% : 0.000001s : 6: predicate.reduce_all_const_elim 1.11% : 0.000001s : 7: predicate.reduce_eliminate 2.20% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.64% : 0.000001s : 6: predicate.remove_not_recompute_node 1.09% : 0.000001s : 13: predicate.replace_applicator 0.66% : 0.000001s : 6: predicate.replace_old_param 0.34% : 0.000000s : 3: predicate.reset_defer_inline 0.90% : 0.000001s : 7: predicate.reshape_eliminate 0.76% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.65% : 0.000001s : 3: predicate.row_tensor_eliminate 0.97% : 0.000001s : 6: predicate.same_eliminate 0.52% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.11% : 0.000001s : 6: predicate.shard_identity_eliminate 1.35% : 0.000002s : 6: predicate.special_op_eliminate 0.92% : 0.000001s : 6: predicate.specialize_transform 0.99% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.91% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.99% : 0.000001s : 9: predicate.switch_defer_inline 1.68% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.21% : 0.000006s : 32: predicate.switch_simplify 0.87% : 0.000001s : 7: predicate.tile_eliminate 0.79% : 0.000001s : 7: predicate.transpose_eliminate 1.72% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.71% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.06% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.33% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.66% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.54% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.96% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.81% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 3: predicate.value_based_eliminate 0.82% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.76% : 0.000001s : 6: predicate.virtual_output_eliminate 0.34% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.61% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000211 5 8.23% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.77% : 0.000194s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023400 192 0.03% : 0.000006s : 1: ForceFp32Comm 13.44% : 0.003145s : 1: add_attr 13.38% : 0.003130s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000053s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.26% : 0.000061s : 1: auto_monad 0.11% : 0.000026s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.98% : 0.000464s : 1: bootstrap 0.13% : 0.000031s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.06% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000031s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000018s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.09% : 0.000021s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000011s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000007s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.94% : 0.000455s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.27% : 0.000532s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.07% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 3.23% : 0.000756s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.11% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.47% : 0.000110s : 28: opt.transform.opt_b 0.19% : 0.000044s : 2: opt.transform.opt_trans_graph 0.16% : 0.000037s : 4: opt.transform.symbol_engine_opt 10.58% : 0.002475s : 1: opt_a 0.55% : 0.000128s : 1: opt_after_cconv 2.21% : 0.000516s : 1: opt_after_jit_grad 1.21% : 0.000282s : 1: opt_b 21.55% : 0.005043s : 1: optimize 0.10% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.11% : 0.000025s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000033s : 1: pre_auto_parallel 0.10% : 0.000024s : 1: py_interpret_to_execute 0.07% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.10% : 0.000023s : 1: remove_dup_value 1.35% : 0.000316s : 1: renormalize.infer 0.96% : 0.000224s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.05% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000044s : 1: rewriter_after_opt_a 0.22% : 0.000052s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.44% : 0.000102s : 1: symbol_engine_optimizer 0.38% : 0.000089s : 1: tuple_transform 19.70% : 0.004610s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:40.369.18 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.013881, [21] [bootstrap]: 0.00042462 [type_inference]: 0.00494378 [event_method]: 1.186e-05 [auto_monad]: 5.578e-05 [graph_reusing]: 5.12999e-06 [inline]: 2.24999e-06 [add_attr]: 0.00327659, [1] [add_attr_with_inline]: 0.00326538, [1] [Cycle 1]: 5.894e-05, [2] [tag_attr]: 1.558e-05 [meta_addattr_fg_expand]: 3.45e-06 [parallel-infer-symbol]: 2.88e-06 [pre_auto_parallel]: 2.975e-05 [insert-virtual-dataset]: 2.34001e-06 [parallel-infer-symbol-second]: 9.09989e-07 [dataset_repeat_opt]: 2.26e-06 [pipeline_split]: 1.82999e-06 [optimize]: 0.00440564, [53] [py_interpret_to_execute]: 1.957e-05 [rewriter_before_opt_a]: 4.768e-05 [opt_a]: 0.00235738, [2] [Cycle 1]: 0.00171305, [45] [expand_dump_flag]: 2.71999e-06 [switch_simplify]: 2.592e-05 [loop_unroll]: 1.353e-05 [a_1]: 0.00031281 [with_stream_mark]: 1.966e-05 [recompute_prepare]: 8.03999e-06 [updatestate_depend_eliminate]: 4.69002e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 3.03e-06 [parameter_eliminate]: 1.76e-06 [a_2]: 7.81e-05 [accelerated_algorithm]: 7.26999e-06 [shard]: 2.31e-06 [meta_shard_fg_expand]: 1.62001e-06 [shard_inline]: 6.23002e-06 [merge_send_recv]: 9.48002e-06 [auto_parallel]: 6.93998e-06 [parallel]: 1.836e-05 [flash_sp]: 9.15999e-06 [merge_comm]: 4e-06 [allreduce_fusion]: 3.40003e-06 [matmul_add_comm_reduction]: 9.64999e-06 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 8.72998e-06 [virtual_dataset]: 6.25002e-06 [get_grad_eliminate_]: 5.96e-06 [virtual_output]: 5.82001e-06 [merge_forward]: 4.19002e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 9.87001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.307e-05 [merge_recompute_call_nodes]: 1.37e-06 [before_grad]: 1.098e-05 [set_forward_comm_id_for_comm_node_pass]: 3.43999e-06 [meta_fg_expand]: 2.56998e-06 [flash_sp_send_recv_attached]: 2.83e-06 [receive_attached]: 2.44001e-06 [after_resolve]: 9.52999e-06 [a_after_grad]: 9.60001e-06 [renormalize]: 0.00070675 [add_forward_monad_depend]: 6.33002e-06 [auto_monad_grad]: 3.09999e-06 [auto_monad_eliminator]: 1.746e-05 [cse]: 3.218e-05 [a_3]: 5.014e-05 [Cycle 2]: 0.0006327, [45] [expand_dump_flag]: 1.84e-06 [switch_simplify]: 7.75998e-06 [loop_unroll]: 5.76e-06 [a_1]: 0.00010861 [with_stream_mark]: 1.453e-05 [recompute_prepare]: 6.12001e-06 [updatestate_depend_eliminate]: 3.33998e-06 [updatestate_assign_eliminate]: 2.62001e-06 [updatestate_loads_eliminate]: 3.5e-06 [parameter_eliminate]: 1.34e-06 [a_2]: 6.862e-05 [accelerated_algorithm]: 5.95002e-06 [shard]: 1.45001e-06 [meta_shard_fg_expand]: 1.59e-06 [shard_inline]: 5.72999e-06 [merge_send_recv]: 5.68002e-06 [auto_parallel]: 6.77002e-06 [parallel]: 6.41998e-06 [flash_sp]: 3.55e-06 [merge_comm]: 3.76999e-06 [allreduce_fusion]: 3.23e-06 [matmul_add_comm_reduction]: 7.18e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 6.51999e-06 [virtual_dataset]: 5.34e-06 [get_grad_eliminate_]: 5.27999e-06 [virtual_output]: 5.64e-06 [merge_forward]: 3.20998e-06 [cell_reuse_recompute_pass]: 2.46e-06 [offload_activation]: 7.43999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.311e-05 [merge_recompute_call_nodes]: 1.03001e-06 [before_grad]: 9.85002e-06 [set_forward_comm_id_for_comm_node_pass]: 4.1e-06 [meta_fg_expand]: 2.51e-06 [flash_sp_send_recv_attached]: 1.55999e-06 [receive_attached]: 1.47001e-06 [after_resolve]: 1.083e-05 [a_after_grad]: 8.65001e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.50999e-06 [auto_monad_grad]: 1.25999e-06 [auto_monad_eliminator]: 7.52998e-06 [cse]: 1.612e-05 [a_3]: 3.404e-05 [py_interpret_to_execute_after_opt_a]: 1.031e-05 [slice_cell_reuse_recomputed_activation]: 2.09999e-06 [rewriter_after_opt_a]: 3.746e-05 [convert_after_rewriter]: 6.61e-06 [order_py_execute_after_rewriter]: 5.22999e-06 [mutable_eliminate]: 0.00058502 [opt_b]: 0.00020259, [1] [Cycle 1]: 0.00019622, [7] [b_1]: 0.00011833 [b_2]: 7.63001e-06 [updatestate_depend_eliminate]: 7.33e-06 [updatestate_assign_eliminate]: 2.46998e-06 [updatestate_loads_eliminate]: 2.94999e-06 [renormalize]: 3.39991e-07 [cse]: 2.135e-05 [optimize_parallel_all_gather_comm]: 1.732e-05 [overlap_param_gather]: 1.92999e-06 [cconv]: 2.772e-05 [loop_unroll]: 0.00043163 [opt_after_cconv]: 9.846e-05, [1] [Cycle 1]: 9.262e-05, [7] [c_1]: 2.758e-05 [parameter_eliminate]: 3.17002e-06 [updatestate_depend_eliminate]: 5.47001e-06 [updatestate_assign_eliminate]: 2.58998e-06 [updatestate_loads_eliminate]: 2.37001e-06 [cse]: 1.779e-05 [renormalize]: 3.19997e-07 [remove_dup_value]: 1.554e-05 [tuple_transform]: 7.261e-05, [1] [Cycle 1]: 6.777e-05, [4] [d_1]: 4.042e-05 [none_parameter_eliminate]: 1.78997e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 6.71e-06 [partial_unused_args_eliminate]: 1.97001e-06 [add_recomputation]: 4.868e-05 [cse_after_recomputation]: 2.207e-05, [1] [Cycle 1]: 1.757e-05, [1] [cse]: 1.159e-05 [environ_conv]: 5.72001e-06 [swap_dp_allreduce_reducescatter]: 5.12e-06 [bias_add_comm_swap]: 2.56e-06 [label_micro_interleaved_index]: 3.72002e-06 [label_fine_grained_interleaved_index]: 2.94001e-06 [merge_cast_opt]: 1.55999e-06 [slice_recompute_activation]: 2.16998e-06 [micro_interleaved_order_control]: 2.27001e-06 [assign_add_opt]: 1.30999e-06 [ForceFp32Comm]: 1.00999e-06 [remove_cast_before_assign_add]: 9.39996e-07 [full_micro_interleaved_order_control]: 2.05002e-06 [reorder_send_recv_between_fp_bp]: 2.34999e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 9.29984e-07 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 9.89996e-07 [overlap_opt_shard_in_pipeline]: 1.10999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.72999e-06 [control_data_broadcast_order]: 1.305e-05 [grouped_pairwise_exchange_alltoall]: 1.67999e-06 [offloading_packed_experts]: 3.93001e-06 [overlap_recompute_and_grad_model_parallel]: 4.92999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.04998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.54e-06 [overlap_recompute_comm]: 2.57001e-06 [overlap_grad_ring_attention]: 3.87998e-06 [overlap_grad_flash_sp]: 2.022e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 1.97999e-06 [split_layernorm_comm]: 1.59e-06 [handle_group_info]: 9.50007e-07 [symbol_engine_optimizer]: 7.57e-05, [1] [Cycle 1]: 7.129e-05, [6] [build]: 2.86e-06 [elim_shapecalc]: 9.49e-06 [elim_not_effective]: 1.301e-05 [opt_reshape]: 7.02002e-06 [fold_const_symbol]: 9.63002e-06 [renormalize]: 2.60014e-07 [detach_backward]: 1.91998e-06 [pipeline_parallel_scheduler]: 1.47999e-06 [auto_monad_reorder]: 1.618e-05 [get_jit_bprop_graph]: 1.77001e-06 [rewriter_after_jit_bprop_graph]: 4.75999e-06 [opt_after_jit_grad]: 0.00049103 [validate]: 4.376e-05 Sums bootstrap : 0.000425s : 4.41% type_inference : 0.004944s : 51.31% event_method : 0.000012s : 0.12% auto_monad : 0.000056s : 0.58% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000003s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000030s : 0.31% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000020s : 0.20% optimize.rewriter_before_opt_a : 0.000048s : 0.49% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000034s : 0.35% optimize.opt_a.loop_unroll : 0.000019s : 0.20% optimize.opt_a.a_1 : 0.000421s : 4.37% optimize.opt_a.with_stream_mark : 0.000034s : 0.35% optimize.opt_a.recompute_prepare : 0.000014s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000147s : 1.52% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.14% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.12% optimize.opt_a.merge_send_recv : 0.000015s : 0.16% optimize.opt_a.auto_parallel : 0.000014s : 0.14% optimize.opt_a.parallel : 0.000025s : 0.26% optimize.opt_a.flash_sp : 0.000013s : 0.13% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000011s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.04% optimize.opt_a.offload_activation : 0.000017s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.05% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000020s : 0.21% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000707s : 7.34% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.05% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.26% optimize.opt_a.cse : 0.000048s : 0.50% optimize.opt_a.a_3 : 0.000084s : 0.87% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000037s : 0.39% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000585s : 6.07% optimize.opt_b.b_1 : 0.000118s : 1.23% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000021s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000028s : 0.29% optimize.loop_unroll : 0.000432s : 4.48% optimize.opt_after_cconv.c_1 : 0.000028s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.16% optimize.tuple_transform.d_1 : 0.000040s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000049s : 0.51% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.02% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.02% optimize.overlap_recompute_comm : 0.000003s : 0.03% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000020s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.02% auto_monad_reorder : 0.000016s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000491s : 5.10% validate : 0.000044s : 0.45% Time group info: ------[substitution.] 0.000121 19 1.39% : 0.000002s : 2: substitution.elim_not_effective 1.12% : 0.000001s : 2: substitution.fold_const_symbol 4.96% : 0.000006s : 3: substitution.graph_param_transform 81.87% : 0.000099s : 2: substitution.inline 3.56% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.59% : 0.000004s : 4: substitution.remove_not_recompute_node 3.50% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004892 2 90.94% : 0.004449s : 1: type_inference.infer 9.06% : 0.000443s : 1: type_inference.specialize ------[replace.] 0.000024 2 100.00% : 0.000024s : 2: replace.inline ------[match.] 0.000097 2 100.00% : 0.000097s : 2: match.inline ------[predicate.] 0.000138 754 0.85% : 0.000001s : 7: predicate.accumulaten_eliminater 1.61% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.62% : 0.000001s : 6: predicate.addn_check_dump 0.83% : 0.000001s : 7: predicate.addn_zero_filter 0.70% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.56% : 0.000004s : 13: predicate.arithmetic_simplify 0.95% : 0.000001s : 7: predicate.cast_eliminate 0.63% : 0.000001s : 6: predicate.check_bprop_eliminate 0.68% : 0.000001s : 6: predicate.compare_switch_simplify 0.25% : 0.000000s : 3: predicate.const_output_eliminate 0.62% : 0.000001s : 6: predicate.depend_value_elim 0.83% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.93% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.79% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.39% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.27% : 0.000000s : 3: predicate.elim_not_effective 0.44% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.04% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.02% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.03% : 0.000001s : 10: predicate.environ_get_depend_swap 1.71% : 0.000002s : 16: predicate.environ_get_eliminate 1.02% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.99% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.08% : 0.000003s : 9: predicate.float_depend_g_call 0.62% : 0.000001s : 6: predicate.float_environ_get_switch 0.92% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.80% : 0.000001s : 6: predicate.get_grad_eliminate 0.28% : 0.000000s : 3: predicate.graph_param_transform 0.74% : 0.000001s : 6: predicate.incorporate_call 0.64% : 0.000001s : 6: predicate.incorporate_call_switch 6.50% : 0.000009s : 34: predicate.inline 1.00% : 0.000001s : 6: predicate.inline_without_move 0.40% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.95% : 0.000001s : 6: predicate.less_batch_normalization 1.76% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.08% : 0.000003s : 20: predicate.load_eliminater 1.62% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.56% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.83% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.71% : 0.000001s : 6: predicate.merge_addn 0.68% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.69% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 7: predicate.minmaximum_grad 1.61% : 0.000002s : 3: predicate.mutable_eliminate 0.41% : 0.000001s : 3: predicate.opt_reshape 0.66% : 0.000001s : 3: predicate.parallel_virtual_node 1.25% : 0.000002s : 9: predicate.partial_defer_inline 1.18% : 0.000002s : 10: predicate.partial_eliminate 0.81% : 0.000001s : 7: predicate.print_const_string_wrapper 0.73% : 0.000001s : 6: predicate.reduce_all_const_elim 1.13% : 0.000002s : 7: predicate.reduce_eliminate 2.30% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.76% : 0.000001s : 6: predicate.remove_not_recompute_node 1.31% : 0.000002s : 13: predicate.replace_applicator 0.68% : 0.000001s : 6: predicate.replace_old_param 0.49% : 0.000001s : 3: predicate.reset_defer_inline 0.78% : 0.000001s : 7: predicate.reshape_eliminate 0.68% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.58% : 0.000001s : 3: predicate.row_tensor_eliminate 1.14% : 0.000002s : 6: predicate.same_eliminate 0.49% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.86% : 0.000001s : 6: predicate.shard_identity_eliminate 1.09% : 0.000002s : 6: predicate.special_op_eliminate 1.08% : 0.000001s : 6: predicate.specialize_transform 1.14% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.96% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.00% : 0.000001s : 9: predicate.switch_defer_inline 1.65% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.20% : 0.000006s : 32: predicate.switch_simplify 0.83% : 0.000001s : 7: predicate.tile_eliminate 0.86% : 0.000001s : 7: predicate.transpose_eliminate 1.50% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.34% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.32% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.58% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.43% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.87% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.96% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 3: predicate.value_based_eliminate 0.77% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.71% : 0.000001s : 6: predicate.virtual_output_eliminate 0.37% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000237 5 7.91% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.09% : 0.000218s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023177 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.16% : 0.003282s : 1: add_attr 14.11% : 0.003270s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000053s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000061s : 1: auto_monad 0.09% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.96% : 0.000453s : 1: bootstrap 0.13% : 0.000031s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.08% : 0.000018s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000006s : 1: label_micro_interleaved_index 1.90% : 0.000439s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.56% : 0.000593s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 3.39% : 0.000786s : 78: opt.transform.opt_a 0.11% : 0.000026s : 1: opt.transform.opt_after_cconv 0.12% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.41% : 0.000095s : 28: opt.transform.opt_b 0.19% : 0.000045s : 2: opt.transform.opt_trans_graph 0.15% : 0.000035s : 4: opt.transform.symbol_engine_opt 10.18% : 0.002360s : 1: opt_a 0.44% : 0.000102s : 1: opt_after_cconv 2.16% : 0.000501s : 1: opt_after_jit_grad 0.89% : 0.000206s : 1: opt_b 19.03% : 0.004411s : 1: optimize 0.09% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000024s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000034s : 1: pre_auto_parallel 0.10% : 0.000023s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000019s : 1: remove_dup_value 1.76% : 0.000408s : 1: renormalize.infer 1.25% : 0.000290s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000042s : 1: rewriter_after_opt_a 0.22% : 0.000052s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000078s : 1: symbol_engine_optimizer 0.33% : 0.000075s : 1: tuple_transform 21.41% : 0.004962s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:40.232.451 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:40.232.716 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0161477, [21] [bootstrap]: 0.00045159 [type_inference]: 0.00559965 [event_method]: 1.468e-05 [auto_monad]: 6.015e-05 [graph_reusing]: 5.51e-06 [inline]: 3.16001e-06 [add_attr]: 0.00341228, [1] [add_attr_with_inline]: 0.00340253, [1] [Cycle 1]: 8.106e-05, [2] [tag_attr]: 1.674e-05 [meta_addattr_fg_expand]: 3.73001e-06 [parallel-infer-symbol]: 3.45e-06 [pre_auto_parallel]: 3.084e-05 [insert-virtual-dataset]: 2.39001e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 1.73002e-06 [pipeline_split]: 1.78002e-06 [optimize]: 0.00531335, [53] [py_interpret_to_execute]: 2.726e-05 [rewriter_before_opt_a]: 5.599e-05 [opt_a]: 0.00277951, [2] [Cycle 1]: 0.00190412, [45] [expand_dump_flag]: 2.99999e-06 [switch_simplify]: 2.565e-05 [loop_unroll]: 1.532e-05 [a_1]: 0.00030924 [with_stream_mark]: 2.238e-05 [recompute_prepare]: 1.033e-05 [updatestate_depend_eliminate]: 4.41002e-06 [updatestate_assign_eliminate]: 3.86001e-06 [updatestate_loads_eliminate]: 3.10002e-06 [parameter_eliminate]: 1.89999e-06 [a_2]: 0.00011097 [accelerated_algorithm]: 7.58999e-06 [shard]: 2.12999e-06 [meta_shard_fg_expand]: 1.99999e-06 [shard_inline]: 7.21999e-06 [merge_send_recv]: 9.00999e-06 [auto_parallel]: 7.95998e-06 [parallel]: 1.87e-05 [flash_sp]: 1.104e-05 [merge_comm]: 5.25999e-06 [allreduce_fusion]: 3.71999e-06 [matmul_add_comm_reduction]: 1.031e-05 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 9.69999e-06 [virtual_dataset]: 6.74001e-06 [get_grad_eliminate_]: 6.63e-06 [virtual_output]: 6.63e-06 [merge_forward]: 4.22998e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 1.094e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.02e-05 [merge_recompute_call_nodes]: 1.80001e-06 [before_grad]: 1.17e-05 [set_forward_comm_id_for_comm_node_pass]: 5.15001e-06 [meta_fg_expand]: 2.61e-06 [flash_sp_send_recv_attached]: 3.93999e-06 [receive_attached]: 2.73e-06 [after_resolve]: 1.2e-05 [a_after_grad]: 1.003e-05 [renormalize]: 0.00065028 [add_forward_monad_depend]: 6.49999e-06 [auto_monad_grad]: 2.97002e-06 [auto_monad_eliminator]: 1.691e-05 [cse]: 3.068e-05 [a_3]: 6.7e-05 [Cycle 2]: 0.00085904, [45] [expand_dump_flag]: 1.54e-06 [switch_simplify]: 8.03001e-06 [loop_unroll]: 5.90002e-06 [a_1]: 0.00010867 [with_stream_mark]: 1.487e-05 [recompute_prepare]: 6.48e-06 [updatestate_depend_eliminate]: 3.55e-06 [updatestate_assign_eliminate]: 2.85002e-06 [updatestate_loads_eliminate]: 2.74001e-06 [parameter_eliminate]: 1.44e-06 [a_2]: 0.00010011 [accelerated_algorithm]: 6.17001e-06 [shard]: 1.66e-06 [meta_shard_fg_expand]: 1.81998e-06 [shard_inline]: 5.97001e-06 [merge_send_recv]: 6.52001e-06 [auto_parallel]: 7.54002e-06 [parallel]: 7.95e-06 [flash_sp]: 3.81001e-06 [merge_comm]: 4.55001e-06 [allreduce_fusion]: 3.53e-06 [matmul_add_comm_reduction]: 9.19e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 8.00999e-06 [virtual_dataset]: 5.97999e-06 [get_grad_eliminate_]: 5.77001e-06 [virtual_output]: 5.89e-06 [merge_forward]: 4.52998e-06 [cell_reuse_recompute_pass]: 1.64998e-06 [offload_activation]: 8.57e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.864e-05 [merge_recompute_call_nodes]: 1.22e-06 [before_grad]: 1.083e-05 [set_forward_comm_id_for_comm_node_pass]: 4.94e-06 [meta_fg_expand]: 2.11998e-06 [flash_sp_send_recv_attached]: 1.55001e-06 [receive_attached]: 1.55001e-06 [after_resolve]: 1.047e-05 [a_after_grad]: 8.92e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 3.62002e-06 [auto_monad_grad]: 1.96e-06 [auto_monad_eliminator]: 9.94999e-06 [cse]: 1.963e-05 [a_3]: 5.15e-05 [py_interpret_to_execute_after_opt_a]: 1.718e-05 [slice_cell_reuse_recomputed_activation]: 5.18002e-06 [rewriter_after_opt_a]: 4.765e-05 [convert_after_rewriter]: 9.67001e-06 [order_py_execute_after_rewriter]: 8.28001e-06 [mutable_eliminate]: 0.00066093 [opt_b]: 0.00028836, [1] [Cycle 1]: 0.00027808, [7] [b_1]: 0.00016823 [b_2]: 8.60999e-06 [updatestate_depend_eliminate]: 8.38999e-06 [updatestate_assign_eliminate]: 2.89001e-06 [updatestate_loads_eliminate]: 2.96001e-06 [renormalize]: 1.10001e-06 [cse]: 2.8e-05 [optimize_parallel_all_gather_comm]: 2.143e-05 [overlap_param_gather]: 5.18002e-06 [cconv]: 3.897e-05 [loop_unroll]: 0.00048886 [opt_after_cconv]: 0.00013979, [1] [Cycle 1]: 0.00013001, [7] [c_1]: 2.859e-05 [parameter_eliminate]: 5.31998e-06 [updatestate_depend_eliminate]: 7.26999e-06 [updatestate_assign_eliminate]: 2.83e-06 [updatestate_loads_eliminate]: 2.63e-06 [cse]: 2.499e-05 [renormalize]: 8.29983e-07 [remove_dup_value]: 1.996e-05 [tuple_transform]: 8.991e-05, [1] [Cycle 1]: 8.248e-05, [4] [d_1]: 4.246e-05 [none_parameter_eliminate]: 1.69998e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 7.18e-06 [partial_unused_args_eliminate]: 4.46002e-06 [add_recomputation]: 5.748e-05 [cse_after_recomputation]: 2.966e-05, [1] [Cycle 1]: 2.272e-05, [1] [cse]: 1.323e-05 [environ_conv]: 9.48002e-06 [swap_dp_allreduce_reducescatter]: 8.19002e-06 [bias_add_comm_swap]: 5.06997e-06 [label_micro_interleaved_index]: 8.13001e-06 [label_fine_grained_interleaved_index]: 5.04998e-06 [merge_cast_opt]: 3.75998e-06 [slice_recompute_activation]: 4.94e-06 [micro_interleaved_order_control]: 5.02999e-06 [assign_add_opt]: 3.73001e-06 [ForceFp32Comm]: 3.48e-06 [remove_cast_before_assign_add]: 3.42002e-06 [full_micro_interleaved_order_control]: 5.07e-06 [reorder_send_recv_between_fp_bp]: 4.89998e-06 [comm_op_add_attrs]: 3.45e-06 [add_comm_op_reuse_tag]: 3.7e-06 [interleave_split_concat_branches]: 3.51001e-06 [interleave_parallel_branches]: 3.38e-06 [overlap_opt_shard_in_pipeline]: 3.53e-06 [overlap_opt_shard_grad_in_pipeline]: 4.27e-06 [control_data_broadcast_order]: 1.748e-05 [grouped_pairwise_exchange_alltoall]: 4.54002e-06 [offloading_packed_experts]: 7.53e-06 [overlap_recompute_and_grad_model_parallel]: 7.00998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.63999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.66999e-06 [overlap_recompute_comm]: 4.94e-06 [overlap_grad_ring_attention]: 7.08998e-06 [overlap_grad_flash_sp]: 2.375e-05 [begin_end_overlap_inline]: 3.23e-06 [split_matmul_comm_elemetwise]: 4.74002e-06 [split_layernorm_comm]: 4.30999e-06 [handle_group_info]: 3.29001e-06 [symbol_engine_optimizer]: 0.00010537, [1] [Cycle 1]: 9.73e-05, [6] [build]: 3.18998e-06 [elim_shapecalc]: 1.261e-05 [elim_not_effective]: 1.37e-05 [opt_reshape]: 6.82002e-06 [fold_const_symbol]: 9.84001e-06 [renormalize]: 6.00005e-07 [detach_backward]: 4.33001e-06 [pipeline_parallel_scheduler]: 1.92001e-06 [auto_monad_reorder]: 2.191e-05 [get_jit_bprop_graph]: 1.78002e-06 [rewriter_after_jit_bprop_graph]: 6.60997e-06 [opt_after_jit_grad]: 0.00056003 [validate]: 4.405e-05 Sums bootstrap : 0.000452s : 4.15% type_inference : 0.005600s : 51.44% event_method : 0.000015s : 0.13% auto_monad : 0.000060s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000017s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.03% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000031s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000027s : 0.25% optimize.rewriter_before_opt_a : 0.000056s : 0.51% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000034s : 0.31% optimize.opt_a.loop_unroll : 0.000021s : 0.19% optimize.opt_a.a_1 : 0.000418s : 3.84% optimize.opt_a.with_stream_mark : 0.000037s : 0.34% optimize.opt_a.recompute_prepare : 0.000017s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000211s : 1.94% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.12% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000016s : 0.14% optimize.opt_a.parallel : 0.000027s : 0.24% optimize.opt_a.flash_sp : 0.000015s : 0.14% optimize.opt_a.merge_comm : 0.000010s : 0.09% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.16% optimize.opt_a.virtual_dataset : 0.000013s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000013s : 0.12% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.36% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000023s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.05% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000022s : 0.21% optimize.opt_a.a_after_grad : 0.000019s : 0.17% optimize.opt_a.renormalize : 0.000650s : 5.97% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.09% optimize.opt_a.auto_monad_grad : 0.000005s : 0.05% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.25% optimize.opt_a.cse : 0.000050s : 0.46% optimize.opt_a.a_3 : 0.000119s : 1.09% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.16% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000048s : 0.44% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000661s : 6.07% optimize.opt_b.b_1 : 0.000168s : 1.55% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000028s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000039s : 0.36% optimize.loop_unroll : 0.000489s : 4.49% optimize.opt_after_cconv.c_1 : 0.000029s : 0.26% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000025s : 0.23% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000020s : 0.18% optimize.tuple_transform.d_1 : 0.000042s : 0.39% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000057s : 0.53% optimize.cse_after_recomputation.cse : 0.000013s : 0.12% optimize.environ_conv : 0.000009s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.04% optimize.offloading_packed_experts : 0.000008s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000024s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.01% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000022s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000007s : 0.06% opt_after_jit_grad : 0.000560s : 5.14% validate : 0.000044s : 0.40% Time group info: ------[substitution.] 0.000128 19 1.65% : 0.000002s : 2: substitution.elim_not_effective 1.06% : 0.000001s : 2: substitution.fold_const_symbol 4.76% : 0.000006s : 3: substitution.graph_param_transform 82.23% : 0.000106s : 2: substitution.inline 3.13% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.75% : 0.000005s : 4: substitution.remove_not_recompute_node 3.42% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.005545 2 91.41% : 0.005069s : 1: type_inference.infer 8.59% : 0.000476s : 1: type_inference.specialize ------[replace.] 0.000024 2 100.00% : 0.000024s : 2: replace.inline ------[match.] 0.000103 2 100.00% : 0.000103s : 2: match.inline ------[predicate.] 0.000140 754 0.79% : 0.000001s : 7: predicate.accumulaten_eliminater 1.33% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.63% : 0.000001s : 6: predicate.addn_check_dump 0.76% : 0.000001s : 7: predicate.addn_zero_filter 0.68% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.56% : 0.000004s : 13: predicate.arithmetic_simplify 0.73% : 0.000001s : 7: predicate.cast_eliminate 0.95% : 0.000001s : 6: predicate.check_bprop_eliminate 0.72% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.73% : 0.000001s : 6: predicate.depend_value_elim 0.74% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.92% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.71% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.16% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.41% : 0.000001s : 3: predicate.elim_not_effective 0.51% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 10: predicate.environ_add_const_eliminate 1.25% : 0.000002s : 10: predicate.environ_get_add_eliminate 1.05% : 0.000001s : 10: predicate.environ_get_depend_swap 1.72% : 0.000002s : 16: predicate.environ_get_eliminate 0.95% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.85% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.08% : 0.000003s : 9: predicate.float_depend_g_call 0.78% : 0.000001s : 6: predicate.float_environ_get_switch 0.94% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.75% : 0.000001s : 6: predicate.get_grad_eliminate 0.24% : 0.000000s : 3: predicate.graph_param_transform 0.78% : 0.000001s : 6: predicate.incorporate_call 0.63% : 0.000001s : 6: predicate.incorporate_call_switch 6.88% : 0.000010s : 34: predicate.inline 1.20% : 0.000002s : 6: predicate.inline_without_move 0.36% : 0.000000s : 6: predicate.j_node_and_user_rematch 1.07% : 0.000002s : 6: predicate.less_batch_normalization 1.50% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.04% : 0.000003s : 20: predicate.load_eliminater 1.31% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.62% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.77% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.63% : 0.000001s : 6: predicate.merge_addn 0.67% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.74% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.65% : 0.000001s : 7: predicate.minmaximum_grad 1.85% : 0.000003s : 3: predicate.mutable_eliminate 0.43% : 0.000001s : 3: predicate.opt_reshape 0.40% : 0.000001s : 3: predicate.parallel_virtual_node 1.34% : 0.000002s : 9: predicate.partial_defer_inline 1.16% : 0.000002s : 10: predicate.partial_eliminate 0.80% : 0.000001s : 7: predicate.print_const_string_wrapper 0.72% : 0.000001s : 6: predicate.reduce_all_const_elim 0.95% : 0.000001s : 7: predicate.reduce_eliminate 1.99% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.94% : 0.000001s : 6: predicate.remove_not_recompute_node 1.23% : 0.000002s : 13: predicate.replace_applicator 0.80% : 0.000001s : 6: predicate.replace_old_param 0.63% : 0.000001s : 3: predicate.reset_defer_inline 0.79% : 0.000001s : 7: predicate.reshape_eliminate 1.12% : 0.000002s : 6: predicate.row_tensor_add_zeros_like 0.58% : 0.000001s : 3: predicate.row_tensor_eliminate 1.25% : 0.000002s : 6: predicate.same_eliminate 0.49% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.90% : 0.000001s : 6: predicate.shard_identity_eliminate 1.00% : 0.000001s : 6: predicate.special_op_eliminate 0.92% : 0.000001s : 6: predicate.specialize_transform 1.39% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.94% : 0.000001s : 9: predicate.switch_defer_inline 1.68% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.09% : 0.000006s : 32: predicate.switch_simplify 0.73% : 0.000001s : 7: predicate.tile_eliminate 0.80% : 0.000001s : 7: predicate.transpose_eliminate 1.45% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.35% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.37% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.35% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.39% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.37% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.89% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.67% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 3: predicate.value_based_eliminate 0.86% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.80% : 0.000001s : 6: predicate.virtual_output_eliminate 0.34% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.69% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000248 5 9.43% : 0.000023s : 1: func_graph_cloner_run.FuncGraphClonerGraph 90.57% : 0.000225s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026480 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.92% : 0.003423s : 1: add_attr 12.86% : 0.003407s : 1: add_attr_with_inline 0.03% : 0.000007s : 1: add_comm_op_reuse_tag 0.23% : 0.000061s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000069s : 1: auto_monad 0.11% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.88% : 0.000498s : 1: bootstrap 0.16% : 0.000043s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.12% : 0.000033s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.09% : 0.000024s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.10% : 0.000026s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.88% : 0.000497s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.53% : 0.000669s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000021s : 1: opt.transform.mutable_eliminate 3.10% : 0.000822s : 78: opt.transform.opt_a 0.10% : 0.000027s : 1: opt.transform.opt_after_cconv 0.11% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.38% : 0.000102s : 28: opt.transform.opt_b 0.18% : 0.000047s : 2: opt.transform.opt_trans_graph 0.15% : 0.000039s : 4: opt.transform.symbol_engine_opt 10.51% : 0.002783s : 1: opt_a 0.54% : 0.000144s : 1: opt_after_cconv 2.16% : 0.000573s : 1: opt_after_jit_grad 1.10% : 0.000292s : 1: opt_b 21.39% : 0.005663s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000039s : 1: pre_auto_parallel 0.12% : 0.000031s : 1: py_interpret_to_execute 0.08% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000023s : 1: remove_dup_value 1.31% : 0.000348s : 1: renormalize.infer 1.10% : 0.000292s : 1: renormalize.specialize 0.03% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.05% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000052s : 1: rewriter_after_opt_a 0.22% : 0.000060s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.41% : 0.000109s : 1: symbol_engine_optimizer 0.35% : 0.000093s : 1: tuple_transform 21.27% : 0.005633s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:40.435.351 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0144055, [21] [bootstrap]: 0.00044799 [type_inference]: 0.00542429 [event_method]: 1.428e-05 [auto_monad]: 5.706e-05 [graph_reusing]: 5.41998e-06 [inline]: 1.99e-06 [add_attr]: 0.00324422, [1] [add_attr_with_inline]: 0.00323378, [1] [Cycle 1]: 6.111e-05, [2] [tag_attr]: 1.597e-05 [meta_addattr_fg_expand]: 3.90998e-06 [parallel-infer-symbol]: 3.01999e-06 [pre_auto_parallel]: 2.884e-05 [insert-virtual-dataset]: 2.59999e-06 [parallel-infer-symbol-second]: 1.19e-06 [dataset_repeat_opt]: 2.43998e-06 [pipeline_split]: 1.73002e-06 [optimize]: 0.00443028, [53] [py_interpret_to_execute]: 2.018e-05 [rewriter_before_opt_a]: 4.911e-05 [opt_a]: 0.00238384, [2] [Cycle 1]: 0.00167598, [45] [expand_dump_flag]: 3.4e-06 [switch_simplify]: 2.658e-05 [loop_unroll]: 1.35e-05 [a_1]: 0.00030684 [with_stream_mark]: 2.066e-05 [recompute_prepare]: 1.061e-05 [updatestate_depend_eliminate]: 4.28001e-06 [updatestate_assign_eliminate]: 3.06999e-06 [updatestate_loads_eliminate]: 3.35003e-06 [parameter_eliminate]: 1.82999e-06 [a_2]: 0.00010359 [accelerated_algorithm]: 7.86001e-06 [shard]: 2.49001e-06 [meta_shard_fg_expand]: 1.77001e-06 [shard_inline]: 7.5e-06 [merge_send_recv]: 9.64999e-06 [auto_parallel]: 7.56999e-06 [parallel]: 1.999e-05 [flash_sp]: 9.73998e-06 [merge_comm]: 4.35e-06 [allreduce_fusion]: 3.96001e-06 [matmul_add_comm_reduction]: 1.098e-05 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 1.039e-05 [virtual_dataset]: 6.91001e-06 [get_grad_eliminate_]: 6.48e-06 [virtual_output]: 6.28998e-06 [merge_forward]: 4.67998e-06 [cell_reuse_recompute_pass]: 1.12999e-06 [offload_activation]: 1.073e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.703e-05 [merge_recompute_call_nodes]: 1.76e-06 [before_grad]: 1.179e-05 [set_forward_comm_id_for_comm_node_pass]: 5.66e-06 [meta_fg_expand]: 2.65002e-06 [flash_sp_send_recv_attached]: 3.76001e-06 [receive_attached]: 2.22999e-06 [after_resolve]: 1.105e-05 [a_after_grad]: 9.20999e-06 [renormalize]: 0.00060127 [add_forward_monad_depend]: 6.84999e-06 [auto_monad_grad]: 2.71e-06 [auto_monad_eliminator]: 1.785e-05 [cse]: 3.064e-05 [a_3]: 5.082e-05 [Cycle 2]: 0.00069703, [45] [expand_dump_flag]: 1.02e-06 [switch_simplify]: 8.48001e-06 [loop_unroll]: 5.96e-06 [a_1]: 0.00010744 [with_stream_mark]: 1.663e-05 [recompute_prepare]: 6.89001e-06 [updatestate_depend_eliminate]: 3.56999e-06 [updatestate_assign_eliminate]: 2.67001e-06 [updatestate_loads_eliminate]: 2.93e-06 [parameter_eliminate]: 2.01998e-06 [a_2]: 7.195e-05 [accelerated_algorithm]: 6.63998e-06 [shard]: 2.07999e-06 [meta_shard_fg_expand]: 2.11e-06 [shard_inline]: 6.61999e-06 [merge_send_recv]: 7.37997e-06 [auto_parallel]: 6.61999e-06 [parallel]: 6.71e-06 [flash_sp]: 3.43e-06 [merge_comm]: 3.80998e-06 [allreduce_fusion]: 3.20998e-06 [matmul_add_comm_reduction]: 1.13e-05 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 9.60001e-06 [virtual_dataset]: 5.72001e-06 [get_grad_eliminate_]: 5.76003e-06 [virtual_output]: 5.37999e-06 [merge_forward]: 3.78001e-06 [cell_reuse_recompute_pass]: 2.22001e-06 [offload_activation]: 8.80999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.671e-05 [merge_recompute_call_nodes]: 1.22e-06 [before_grad]: 1.153e-05 [set_forward_comm_id_for_comm_node_pass]: 5.32999e-06 [meta_fg_expand]: 2.74999e-06 [flash_sp_send_recv_attached]: 1.39998e-06 [receive_attached]: 1.17999e-06 [after_resolve]: 1.014e-05 [a_after_grad]: 8.27e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 3.47002e-06 [auto_monad_grad]: 1.64e-06 [auto_monad_eliminator]: 1.078e-05 [cse]: 2.017e-05 [a_3]: 3.908e-05 [py_interpret_to_execute_after_opt_a]: 1.062e-05 [slice_cell_reuse_recomputed_activation]: 1.86e-06 [rewriter_after_opt_a]: 4.042e-05 [convert_after_rewriter]: 6.91999e-06 [order_py_execute_after_rewriter]: 5.43002e-06 [mutable_eliminate]: 0.00055806 [opt_b]: 0.00021136, [1] [Cycle 1]: 0.00020451, [7] [b_1]: 0.00011911 [b_2]: 8.18001e-06 [updatestate_depend_eliminate]: 8.51002e-06 [updatestate_assign_eliminate]: 2.99001e-06 [updatestate_loads_eliminate]: 2.78e-06 [renormalize]: 8.79983e-07 [cse]: 2.551e-05 [optimize_parallel_all_gather_comm]: 1.875e-05 [overlap_param_gather]: 1.83002e-06 [cconv]: 3.026e-05 [loop_unroll]: 0.0004363 [opt_after_cconv]: 0.00010222, [1] [Cycle 1]: 9.586e-05, [7] [c_1]: 2.76e-05 [parameter_eliminate]: 3.57002e-06 [updatestate_depend_eliminate]: 5.80002e-06 [updatestate_assign_eliminate]: 2.64001e-06 [updatestate_loads_eliminate]: 2.52001e-06 [cse]: 1.88e-05 [renormalize]: 4.2998e-07 [remove_dup_value]: 1.525e-05 [tuple_transform]: 7.188e-05, [1] [Cycle 1]: 6.695e-05, [4] [d_1]: 3.995e-05 [none_parameter_eliminate]: 1.69e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 6.85002e-06 [partial_unused_args_eliminate]: 1.89999e-06 [add_recomputation]: 4.974e-05 [cse_after_recomputation]: 2.196e-05, [1] [Cycle 1]: 1.736e-05, [1] [cse]: 1.212e-05 [environ_conv]: 5.02999e-06 [swap_dp_allreduce_reducescatter]: 4.85001e-06 [bias_add_comm_swap]: 2.61e-06 [label_micro_interleaved_index]: 4.12e-06 [label_fine_grained_interleaved_index]: 2.83998e-06 [merge_cast_opt]: 1.26997e-06 [slice_recompute_activation]: 2.07999e-06 [micro_interleaved_order_control]: 2.02001e-06 [assign_add_opt]: 1.29e-06 [ForceFp32Comm]: 7.80012e-07 [remove_cast_before_assign_add]: 1.19e-06 [full_micro_interleaved_order_control]: 1.97001e-06 [reorder_send_recv_between_fp_bp]: 2.83e-06 [comm_op_add_attrs]: 1.03001e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.07e-06 [interleave_parallel_branches]: 1.00999e-06 [overlap_opt_shard_in_pipeline]: 1.15999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.99e-06 [control_data_broadcast_order]: 1.264e-05 [grouped_pairwise_exchange_alltoall]: 1.40999e-06 [offloading_packed_experts]: 4.23999e-06 [overlap_recompute_and_grad_model_parallel]: 4.74998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.19e-06 [overlap_recompute_comm]: 2.52001e-06 [overlap_grad_ring_attention]: 4.20999e-06 [overlap_grad_flash_sp]: 1.979e-05 [begin_end_overlap_inline]: 5.79981e-07 [split_matmul_comm_elemetwise]: 2.09e-06 [split_layernorm_comm]: 1.74e-06 [handle_group_info]: 9.60019e-07 [symbol_engine_optimizer]: 7.478e-05, [1] [Cycle 1]: 7.055e-05, [6] [build]: 2.93e-06 [elim_shapecalc]: 9.46e-06 [elim_not_effective]: 1.247e-05 [opt_reshape]: 6.82002e-06 [fold_const_symbol]: 9.94001e-06 [renormalize]: 4.00003e-07 [detach_backward]: 2.11e-06 [pipeline_parallel_scheduler]: 1.55001e-06 [auto_monad_reorder]: 1.662e-05 [get_jit_bprop_graph]: 1.77999e-06 [rewriter_after_jit_bprop_graph]: 4.68999e-06 [opt_after_jit_grad]: 0.0005062 [validate]: 4.016e-05 Sums bootstrap : 0.000448s : 4.42% type_inference : 0.005424s : 53.54% event_method : 0.000014s : 0.14% auto_monad : 0.000057s : 0.56% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000029s : 0.28% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000020s : 0.20% optimize.rewriter_before_opt_a : 0.000049s : 0.48% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000035s : 0.35% optimize.opt_a.loop_unroll : 0.000019s : 0.19% optimize.opt_a.a_1 : 0.000414s : 4.09% optimize.opt_a.with_stream_mark : 0.000037s : 0.37% optimize.opt_a.recompute_prepare : 0.000018s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000176s : 1.73% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.14% optimize.opt_a.shard : 0.000005s : 0.05% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000014s : 0.14% optimize.opt_a.merge_send_recv : 0.000017s : 0.17% optimize.opt_a.auto_parallel : 0.000014s : 0.14% optimize.opt_a.parallel : 0.000027s : 0.26% optimize.opt_a.flash_sp : 0.000013s : 0.13% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.22% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.20% optimize.opt_a.virtual_dataset : 0.000013s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000023s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.11% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.05% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000021s : 0.21% optimize.opt_a.a_after_grad : 0.000017s : 0.17% optimize.opt_a.renormalize : 0.000601s : 5.94% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.10% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.28% optimize.opt_a.cse : 0.000051s : 0.50% optimize.opt_a.a_3 : 0.000090s : 0.89% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.40% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000558s : 5.51% optimize.opt_b.b_1 : 0.000119s : 1.18% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000026s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.19% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000030s : 0.30% optimize.loop_unroll : 0.000436s : 4.31% optimize.opt_after_cconv.c_1 : 0.000028s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.15% optimize.tuple_transform.d_1 : 0.000040s : 0.39% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000050s : 0.49% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000020s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000017s : 0.16% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000506s : 5.00% validate : 0.000040s : 0.40% Time group info: ------[substitution.] 0.000121 19 1.43% : 0.000002s : 2: substitution.elim_not_effective 1.08% : 0.000001s : 2: substitution.fold_const_symbol 4.68% : 0.000006s : 3: substitution.graph_param_transform 81.82% : 0.000099s : 2: substitution.inline 3.69% : 0.000004s : 4: substitution.j_node_and_user_rematch 4.14% : 0.000005s : 4: substitution.remove_not_recompute_node 3.17% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.005370 2 91.14% : 0.004894s : 1: type_inference.infer 8.86% : 0.000476s : 1: type_inference.specialize ------[replace.] 0.000025 2 100.00% : 0.000025s : 2: replace.inline ------[match.] 0.000097 2 100.00% : 0.000097s : 2: match.inline ------[predicate.] 0.000136 754 0.92% : 0.000001s : 7: predicate.accumulaten_eliminater 1.06% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.62% : 0.000001s : 6: predicate.addn_check_dump 0.92% : 0.000001s : 7: predicate.addn_zero_filter 0.73% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.03% : 0.000003s : 13: predicate.arithmetic_simplify 0.99% : 0.000001s : 7: predicate.cast_eliminate 0.70% : 0.000001s : 6: predicate.check_bprop_eliminate 0.65% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.71% : 0.000001s : 6: predicate.depend_value_elim 0.77% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.86% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.76% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.31% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 3: predicate.elim_not_effective 0.59% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.06% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.06% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.13% : 0.000002s : 10: predicate.environ_get_depend_swap 1.70% : 0.000002s : 16: predicate.environ_get_eliminate 1.00% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.98% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.92% : 0.000003s : 9: predicate.float_depend_g_call 0.62% : 0.000001s : 6: predicate.float_environ_get_switch 0.92% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 3: predicate.fold_const_symbol 0.79% : 0.000001s : 6: predicate.get_grad_eliminate 0.24% : 0.000000s : 3: predicate.graph_param_transform 0.82% : 0.000001s : 6: predicate.incorporate_call 0.63% : 0.000001s : 6: predicate.incorporate_call_switch 6.76% : 0.000009s : 34: predicate.inline 0.96% : 0.000001s : 6: predicate.inline_without_move 0.41% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.04% : 0.000001s : 6: predicate.less_batch_normalization 1.73% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.00% : 0.000003s : 20: predicate.load_eliminater 1.18% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.49% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.75% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.70% : 0.000001s : 6: predicate.merge_addn 0.69% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.75% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.70% : 0.000001s : 7: predicate.minmaximum_grad 1.49% : 0.000002s : 3: predicate.mutable_eliminate 0.40% : 0.000001s : 3: predicate.opt_reshape 0.63% : 0.000001s : 3: predicate.parallel_virtual_node 1.24% : 0.000002s : 9: predicate.partial_defer_inline 1.20% : 0.000002s : 10: predicate.partial_eliminate 0.71% : 0.000001s : 7: predicate.print_const_string_wrapper 0.70% : 0.000001s : 6: predicate.reduce_all_const_elim 0.88% : 0.000001s : 7: predicate.reduce_eliminate 2.08% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 1.09% : 0.000001s : 6: predicate.remove_not_recompute_node 1.27% : 0.000002s : 13: predicate.replace_applicator 0.74% : 0.000001s : 6: predicate.replace_old_param 0.60% : 0.000001s : 3: predicate.reset_defer_inline 0.81% : 0.000001s : 7: predicate.reshape_eliminate 0.71% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.53% : 0.000001s : 3: predicate.row_tensor_eliminate 1.01% : 0.000001s : 6: predicate.same_eliminate 0.56% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.29% : 0.000002s : 6: predicate.shard_identity_eliminate 0.90% : 0.000001s : 6: predicate.special_op_eliminate 1.24% : 0.000002s : 6: predicate.specialize_transform 1.15% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.99% : 0.000001s : 9: predicate.switch_defer_inline 1.69% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.93% : 0.000007s : 32: predicate.switch_simplify 0.79% : 0.000001s : 7: predicate.tile_eliminate 0.82% : 0.000001s : 7: predicate.transpose_eliminate 1.44% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.46% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.37% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.25% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.50% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.01% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.75% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.70% : 0.000001s : 3: predicate.value_based_eliminate 0.85% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.79% : 0.000001s : 6: predicate.virtual_output_eliminate 0.36% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000241 5 8.35% : 0.000020s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.65% : 0.000221s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023633 192 0.01% : 0.000003s : 1: ForceFp32Comm 13.75% : 0.003249s : 1: add_attr 13.70% : 0.003238s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000054s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000062s : 1: auto_monad 0.09% : 0.000021s : 1: auto_monad_reorder 0.02% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 2.03% : 0.000480s : 1: bootstrap 0.14% : 0.000034s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000016s : 1: control_data_broadcast_order 0.05% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.09% : 0.000021s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.88% : 0.000444s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.40% : 0.000568s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 3.50% : 0.000827s : 78: opt.transform.opt_a 0.11% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000095s : 28: opt.transform.opt_b 0.19% : 0.000044s : 2: opt.transform.opt_trans_graph 0.15% : 0.000035s : 4: opt.transform.symbol_engine_opt 10.10% : 0.002387s : 1: opt_a 0.45% : 0.000106s : 1: opt_after_cconv 2.18% : 0.000515s : 1: opt_after_jit_grad 0.91% : 0.000215s : 1: opt_b 18.77% : 0.004435s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000033s : 1: pre_auto_parallel 0.10% : 0.000024s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000019s : 1: remove_dup_value 1.39% : 0.000328s : 1: renormalize.infer 1.11% : 0.000263s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000047s : 1: rewriter_after_opt_a 0.23% : 0.000053s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000078s : 1: symbol_engine_optimizer 0.32% : 0.000075s : 1: tuple_transform 23.08% : 0.005455s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:40.633.362 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:40.633.665 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0150472, [21] [bootstrap]: 0.00043877 [type_inference]: 0.00537376 [event_method]: 1.213e-05 [auto_monad]: 5.744e-05 [graph_reusing]: 6.01e-06 [inline]: 2.59001e-06 [add_attr]: 0.00321977, [1] [add_attr_with_inline]: 0.00321083, [1] [Cycle 1]: 7.018e-05, [2] [tag_attr]: 1.429e-05 [meta_addattr_fg_expand]: 3.88999e-06 [parallel-infer-symbol]: 3.55e-06 [pre_auto_parallel]: 2.647e-05 [insert-virtual-dataset]: 2.32001e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 1.70001e-06 [pipeline_split]: 1.86e-06 [optimize]: 0.00475858, [53] [py_interpret_to_execute]: 2.233e-05 [rewriter_before_opt_a]: 4.9e-05 [opt_a]: 0.00250619, [2] [Cycle 1]: 0.0016766, [45] [expand_dump_flag]: 3.14999e-06 [switch_simplify]: 2.471e-05 [loop_unroll]: 1.41e-05 [a_1]: 0.00028888 [with_stream_mark]: 1.793e-05 [recompute_prepare]: 8.60001e-06 [updatestate_depend_eliminate]: 4.12e-06 [updatestate_assign_eliminate]: 3.55998e-06 [updatestate_loads_eliminate]: 3.11999e-06 [parameter_eliminate]: 1.84998e-06 [a_2]: 0.00010749 [accelerated_algorithm]: 6.54999e-06 [shard]: 2.49999e-06 [meta_shard_fg_expand]: 1.90001e-06 [shard_inline]: 6.59001e-06 [merge_send_recv]: 7.92003e-06 [auto_parallel]: 5.91e-06 [parallel]: 1.907e-05 [flash_sp]: 8.30999e-06 [merge_comm]: 3.93999e-06 [allreduce_fusion]: 3.51999e-06 [matmul_add_comm_reduction]: 9.93002e-06 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 7.88001e-06 [virtual_dataset]: 6.52001e-06 [get_grad_eliminate_]: 5.99999e-06 [virtual_output]: 5.92999e-06 [merge_forward]: 4.08001e-06 [cell_reuse_recompute_pass]: 1.13001e-06 [offload_activation]: 9.79999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.468e-05 [merge_recompute_call_nodes]: 1.37999e-06 [before_grad]: 1.02e-05 [set_forward_comm_id_for_comm_node_pass]: 3.48999e-06 [meta_fg_expand]: 2.54999e-06 [flash_sp_send_recv_attached]: 2.73998e-06 [receive_attached]: 2.48002e-06 [after_resolve]: 9.63002e-06 [a_after_grad]: 9.89999e-06 [renormalize]: 0.00053517 [add_forward_monad_depend]: 5.38002e-06 [auto_monad_grad]: 2.34001e-06 [auto_monad_eliminator]: 1.557e-05 [cse]: 2.96e-05 [a_3]: 5.973e-05 [Cycle 2]: 0.00081545, [45] [expand_dump_flag]: 1.57999e-06 [switch_simplify]: 7.78001e-06 [loop_unroll]: 6.07001e-06 [a_1]: 0.00010584 [with_stream_mark]: 1.323e-05 [recompute_prepare]: 6.16e-06 [updatestate_depend_eliminate]: 3.01001e-06 [updatestate_assign_eliminate]: 2.63e-06 [updatestate_loads_eliminate]: 2.83e-06 [parameter_eliminate]: 1.05999e-06 [a_2]: 9.769e-05 [accelerated_algorithm]: 5.81998e-06 [shard]: 1.39e-06 [meta_shard_fg_expand]: 1.73002e-06 [shard_inline]: 7.08e-06 [merge_send_recv]: 6.34001e-06 [auto_parallel]: 6.21e-06 [parallel]: 6.09001e-06 [flash_sp]: 3.8e-06 [merge_comm]: 3.14001e-06 [allreduce_fusion]: 3.42002e-06 [matmul_add_comm_reduction]: 6.86999e-06 [allreduce_slice_to_reducescatter]: 5.59987e-07 [virtual_shard_identity]: 7.1e-06 [virtual_dataset]: 5.71e-06 [get_grad_eliminate_]: 6.25002e-06 [virtual_output]: 6.44999e-06 [merge_forward]: 3.36001e-06 [cell_reuse_recompute_pass]: 1.60001e-06 [offload_activation]: 7.21999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.551e-05 [merge_recompute_call_nodes]: 6.89994e-07 [before_grad]: 9.63997e-06 [set_forward_comm_id_for_comm_node_pass]: 4.22998e-06 [meta_fg_expand]: 2.16e-06 [flash_sp_send_recv_attached]: 9.5999e-07 [receive_attached]: 1.35999e-06 [after_resolve]: 8.83001e-06 [a_after_grad]: 8.43001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.41002e-06 [auto_monad_grad]: 1.39003e-06 [auto_monad_eliminator]: 8.59998e-06 [cse]: 1.548e-05 [a_3]: 4.916e-05 [py_interpret_to_execute_after_opt_a]: 1.365e-05 [slice_cell_reuse_recomputed_activation]: 4.90999e-06 [rewriter_after_opt_a]: 4.14e-05 [convert_after_rewriter]: 9.44e-06 [order_py_execute_after_rewriter]: 8.42e-06 [mutable_eliminate]: 0.00054843 [opt_b]: 0.00026491, [1] [Cycle 1]: 0.00025534, [7] [b_1]: 0.00016242 [b_2]: 7.43e-06 [updatestate_depend_eliminate]: 6.74001e-06 [updatestate_assign_eliminate]: 2.74999e-06 [updatestate_loads_eliminate]: 2.34001e-06 [renormalize]: 6.50005e-07 [cse]: 1.882e-05 [optimize_parallel_all_gather_comm]: 1.852e-05 [overlap_param_gather]: 4.89003e-06 [cconv]: 3.179e-05 [loop_unroll]: 0.00043284 [opt_after_cconv]: 0.00012508, [1] [Cycle 1]: 0.0001159, [7] [c_1]: 2.837e-05 [parameter_eliminate]: 2.86e-06 [updatestate_depend_eliminate]: 5.42001e-06 [updatestate_assign_eliminate]: 2.56998e-06 [updatestate_loads_eliminate]: 2.35002e-06 [cse]: 1.793e-05 [renormalize]: 7.2e-07 [remove_dup_value]: 1.927e-05 [tuple_transform]: 8.807e-05, [1] [Cycle 1]: 8.082e-05, [4] [d_1]: 4.106e-05 [none_parameter_eliminate]: 1.66e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 7.05e-06 [partial_unused_args_eliminate]: 4.75999e-06 [add_recomputation]: 5.111e-05 [cse_after_recomputation]: 2.842e-05, [1] [Cycle 1]: 2.094e-05, [1] [cse]: 1.173e-05 [environ_conv]: 8.25999e-06 [swap_dp_allreduce_reducescatter]: 7.61001e-06 [bias_add_comm_swap]: 4.67998e-06 [label_micro_interleaved_index]: 6.88998e-06 [label_fine_grained_interleaved_index]: 5.07999e-06 [merge_cast_opt]: 3.68e-06 [slice_recompute_activation]: 4.35999e-06 [micro_interleaved_order_control]: 4.63001e-06 [assign_add_opt]: 3.53e-06 [ForceFp32Comm]: 3.16001e-06 [remove_cast_before_assign_add]: 3.38999e-06 [full_micro_interleaved_order_control]: 4.52e-06 [reorder_send_recv_between_fp_bp]: 4.97999e-06 [comm_op_add_attrs]: 3.61999e-06 [add_comm_op_reuse_tag]: 3.21001e-06 [interleave_split_concat_branches]: 3.38e-06 [interleave_parallel_branches]: 3.59002e-06 [overlap_opt_shard_in_pipeline]: 3.9e-06 [overlap_opt_shard_grad_in_pipeline]: 4.36002e-06 [control_data_broadcast_order]: 1.574e-05 [grouped_pairwise_exchange_alltoall]: 4.48001e-06 [offloading_packed_experts]: 6.58e-06 [overlap_recompute_and_grad_model_parallel]: 8.15999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.60998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.58999e-06 [overlap_recompute_comm]: 5.25001e-06 [overlap_grad_ring_attention]: 6.36998e-06 [overlap_grad_flash_sp]: 2.187e-05 [begin_end_overlap_inline]: 3.01001e-06 [split_matmul_comm_elemetwise]: 4.65001e-06 [split_layernorm_comm]: 3.98999e-06 [handle_group_info]: 3.56999e-06 [symbol_engine_optimizer]: 9.574e-05, [1] [Cycle 1]: 8.892e-05, [6] [build]: 2.69999e-06 [elim_shapecalc]: 9.41998e-06 [elim_not_effective]: 1.262e-05 [opt_reshape]: 6.88e-06 [fold_const_symbol]: 1.02e-05 [renormalize]: 2.50002e-07 [detach_backward]: 3.55e-06 [pipeline_parallel_scheduler]: 1.87999e-06 [auto_monad_reorder]: 1.828e-05 [get_jit_bprop_graph]: 2.02999e-06 [rewriter_after_jit_bprop_graph]: 5.28002e-06 [opt_after_jit_grad]: 0.00049324 [validate]: 3.976e-05 Sums bootstrap : 0.000439s : 4.35% type_inference : 0.005374s : 53.31% event_method : 0.000012s : 0.12% auto_monad : 0.000057s : 0.57% graph_reusing : 0.000006s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000026s : 0.26% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000022s : 0.22% optimize.rewriter_before_opt_a : 0.000049s : 0.49% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000032s : 0.32% optimize.opt_a.loop_unroll : 0.000020s : 0.20% optimize.opt_a.a_1 : 0.000395s : 3.92% optimize.opt_a.with_stream_mark : 0.000031s : 0.31% optimize.opt_a.recompute_prepare : 0.000015s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000205s : 2.04% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.12% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000014s : 0.14% optimize.opt_a.merge_send_recv : 0.000014s : 0.14% optimize.opt_a.auto_parallel : 0.000012s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.25% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.15% optimize.opt_a.virtual_dataset : 0.000012s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000018s : 0.18% optimize.opt_a.a_after_grad : 0.000018s : 0.18% optimize.opt_a.renormalize : 0.000535s : 5.31% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.24% optimize.opt_a.cse : 0.000045s : 0.45% optimize.opt_a.a_3 : 0.000109s : 1.08% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000041s : 0.41% optimize.convert_after_rewriter : 0.000009s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000548s : 5.44% optimize.opt_b.b_1 : 0.000162s : 1.61% optimize.opt_b.b_2 : 0.000007s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000019s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000032s : 0.32% optimize.loop_unroll : 0.000433s : 4.29% optimize.opt_after_cconv.c_1 : 0.000028s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.18% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000019s : 0.19% optimize.tuple_transform.d_1 : 0.000041s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000051s : 0.51% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000008s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000016s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000006s : 0.06% optimize.overlap_grad_flash_sp : 0.000022s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000493s : 4.89% validate : 0.000040s : 0.39% Time group info: ------[substitution.] 0.000111 19 1.73% : 0.000002s : 2: substitution.elim_not_effective 1.45% : 0.000002s : 2: substitution.fold_const_symbol 5.32% : 0.000006s : 3: substitution.graph_param_transform 81.30% : 0.000090s : 2: substitution.inline 3.07% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.99% : 0.000004s : 4: substitution.remove_not_recompute_node 3.14% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.005327 2 91.29% : 0.004863s : 1: type_inference.infer 8.71% : 0.000464s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000089 2 100.00% : 0.000089s : 2: match.inline ------[predicate.] 0.000137 754 0.77% : 0.000001s : 7: predicate.accumulaten_eliminater 1.61% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.61% : 0.000001s : 6: predicate.addn_check_dump 0.82% : 0.000001s : 7: predicate.addn_zero_filter 0.66% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.27% : 0.000003s : 13: predicate.arithmetic_simplify 0.87% : 0.000001s : 7: predicate.cast_eliminate 0.79% : 0.000001s : 6: predicate.check_bprop_eliminate 0.69% : 0.000001s : 6: predicate.compare_switch_simplify 0.23% : 0.000000s : 3: predicate.const_output_eliminate 0.88% : 0.000001s : 6: predicate.depend_value_elim 0.77% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.94% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.75% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.37% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 3: predicate.elim_not_effective 0.46% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.96% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.04% : 0.000001s : 10: predicate.environ_get_depend_swap 1.71% : 0.000002s : 16: predicate.environ_get_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.96% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.89% : 0.000003s : 9: predicate.float_depend_g_call 0.68% : 0.000001s : 6: predicate.float_environ_get_switch 0.96% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.85% : 0.000001s : 6: predicate.get_grad_eliminate 0.26% : 0.000000s : 3: predicate.graph_param_transform 0.79% : 0.000001s : 6: predicate.incorporate_call 0.62% : 0.000001s : 6: predicate.incorporate_call_switch 6.63% : 0.000009s : 34: predicate.inline 1.07% : 0.000001s : 6: predicate.inline_without_move 0.36% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.09% : 0.000001s : 6: predicate.less_batch_normalization 1.63% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.98% : 0.000003s : 20: predicate.load_eliminater 1.36% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.71% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.70% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.63% : 0.000001s : 6: predicate.merge_addn 0.75% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.74% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 7: predicate.minmaximum_grad 2.14% : 0.000003s : 3: predicate.mutable_eliminate 0.43% : 0.000001s : 3: predicate.opt_reshape 0.50% : 0.000001s : 3: predicate.parallel_virtual_node 1.25% : 0.000002s : 9: predicate.partial_defer_inline 1.18% : 0.000002s : 10: predicate.partial_eliminate 0.81% : 0.000001s : 7: predicate.print_const_string_wrapper 0.71% : 0.000001s : 6: predicate.reduce_all_const_elim 1.04% : 0.000001s : 7: predicate.reduce_eliminate 2.09% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.61% : 0.000001s : 6: predicate.remove_not_recompute_node 1.12% : 0.000002s : 13: predicate.replace_applicator 0.59% : 0.000001s : 6: predicate.replace_old_param 0.35% : 0.000000s : 3: predicate.reset_defer_inline 0.79% : 0.000001s : 7: predicate.reshape_eliminate 0.75% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.57% : 0.000001s : 3: predicate.row_tensor_eliminate 1.02% : 0.000001s : 6: predicate.same_eliminate 0.55% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.93% : 0.000001s : 6: predicate.shard_identity_eliminate 0.93% : 0.000001s : 6: predicate.special_op_eliminate 1.05% : 0.000001s : 6: predicate.specialize_transform 1.09% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.94% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.03% : 0.000001s : 9: predicate.switch_defer_inline 1.65% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.35% : 0.000006s : 32: predicate.switch_simplify 0.72% : 0.000001s : 7: predicate.tile_eliminate 0.78% : 0.000001s : 7: predicate.transpose_eliminate 1.47% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.30% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.47% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.56% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.60% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.98% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 3.08% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 3: predicate.value_based_eliminate 0.74% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.95% : 0.000001s : 6: predicate.virtual_output_eliminate 0.34% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.58% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000226 5 8.54% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.46% : 0.000207s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024462 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.20% : 0.003229s : 1: add_attr 13.14% : 0.003214s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000055s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.27% : 0.000066s : 1: auto_monad 0.10% : 0.000025s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.98% : 0.000484s : 1: bootstrap 0.14% : 0.000035s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000032s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.08% : 0.000019s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.09% : 0.000023s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.79% : 0.000439s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.27% : 0.000556s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 3.14% : 0.000768s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.10% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000098s : 28: opt.transform.opt_b 0.19% : 0.000046s : 2: opt.transform.opt_trans_graph 0.15% : 0.000036s : 4: opt.transform.symbol_engine_opt 10.26% : 0.002509s : 1: opt_a 0.53% : 0.000128s : 1: opt_after_cconv 2.06% : 0.000504s : 1: opt_after_jit_grad 1.10% : 0.000268s : 1: opt_b 20.85% : 0.005101s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000008s : 1: pipeline_split 0.14% : 0.000034s : 1: pre_auto_parallel 0.11% : 0.000026s : 1: py_interpret_to_execute 0.07% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000023s : 1: remove_dup_value 1.19% : 0.000291s : 1: renormalize.infer 0.97% : 0.000236s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000045s : 1: rewriter_after_opt_a 0.22% : 0.000053s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000099s : 1: symbol_engine_optimizer 0.37% : 0.000091s : 1: tuple_transform 22.09% : 0.005403s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:40.830.394 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0137622, [21] [bootstrap]: 0.00044261 [type_inference]: 0.00528399 [event_method]: 1.228e-05 [auto_monad]: 5.442e-05 [graph_reusing]: 5.39e-06 [inline]: 2.35002e-06 [add_attr]: 0.00318888, [1] [add_attr_with_inline]: 0.00317929, [1] [Cycle 1]: 5.397e-05, [2] [tag_attr]: 1.463e-05 [meta_addattr_fg_expand]: 3.51001e-06 [parallel-infer-symbol]: 3.11001e-06 [pre_auto_parallel]: 2.693e-05 [insert-virtual-dataset]: 2.82002e-06 [parallel-infer-symbol-second]: 6.50005e-07 [dataset_repeat_opt]: 1.99e-06 [pipeline_split]: 1.56998e-06 [optimize]: 0.00407029, [53] [py_interpret_to_execute]: 1.91e-05 [rewriter_before_opt_a]: 4.461e-05 [opt_a]: 0.00212622, [2] [Cycle 1]: 0.00150348, [45] [expand_dump_flag]: 2.79001e-06 [switch_simplify]: 2.541e-05 [loop_unroll]: 1.37e-05 [a_1]: 0.00028895 [with_stream_mark]: 1.816e-05 [recompute_prepare]: 8.65001e-06 [updatestate_depend_eliminate]: 4.13999e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 3.13e-06 [parameter_eliminate]: 1.79e-06 [a_2]: 7.79e-05 [accelerated_algorithm]: 7.1e-06 [shard]: 2.12001e-06 [meta_shard_fg_expand]: 1.84e-06 [shard_inline]: 6.46e-06 [merge_send_recv]: 7.75e-06 [auto_parallel]: 6.56e-06 [parallel]: 1.729e-05 [flash_sp]: 7.8e-06 [merge_comm]: 4.05998e-06 [allreduce_fusion]: 3.40998e-06 [matmul_add_comm_reduction]: 9.22001e-06 [allreduce_slice_to_reducescatter]: 5.59987e-07 [virtual_shard_identity]: 8.42e-06 [virtual_dataset]: 6.14999e-06 [get_grad_eliminate_]: 6.51e-06 [virtual_output]: 5.99e-06 [merge_forward]: 3.67002e-06 [cell_reuse_recompute_pass]: 1.23002e-06 [offload_activation]: 1.024e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.367e-05 [merge_recompute_call_nodes]: 1.38002e-06 [before_grad]: 1.03e-05 [set_forward_comm_id_for_comm_node_pass]: 3.51001e-06 [meta_fg_expand]: 2.60997e-06 [flash_sp_send_recv_attached]: 2.33002e-06 [receive_attached]: 2.78e-06 [after_resolve]: 1.009e-05 [a_after_grad]: 8.82e-06 [renormalize]: 0.00054419 [add_forward_monad_depend]: 6.27001e-06 [auto_monad_grad]: 2.48998e-06 [auto_monad_eliminator]: 1.52e-05 [cse]: 3.052e-05 [a_3]: 4.549e-05 [Cycle 2]: 0.00061289, [45] [expand_dump_flag]: 1.54e-06 [switch_simplify]: 7.24001e-06 [loop_unroll]: 6.21e-06 [a_1]: 0.00010797 [with_stream_mark]: 1.124e-05 [recompute_prepare]: 6.24001e-06 [updatestate_depend_eliminate]: 2.84999e-06 [updatestate_assign_eliminate]: 2.37999e-06 [updatestate_loads_eliminate]: 2.84001e-06 [parameter_eliminate]: 9.89996e-07 [a_2]: 6.886e-05 [accelerated_algorithm]: 5.96e-06 [shard]: 1.24998e-06 [meta_shard_fg_expand]: 1.32e-06 [shard_inline]: 5.67999e-06 [merge_send_recv]: 5.06997e-06 [auto_parallel]: 5.25001e-06 [parallel]: 4.94998e-06 [flash_sp]: 3.38999e-06 [merge_comm]: 3.25e-06 [allreduce_fusion]: 3.55e-06 [matmul_add_comm_reduction]: 5.56998e-06 [allreduce_slice_to_reducescatter]: 3.19997e-07 [virtual_shard_identity]: 6.81001e-06 [virtual_dataset]: 5.58002e-06 [get_grad_eliminate_]: 5.19e-06 [virtual_output]: 5.10001e-06 [merge_forward]: 3.06001e-06 [cell_reuse_recompute_pass]: 1.69e-06 [offload_activation]: 6.21998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.27e-05 [merge_recompute_call_nodes]: 7.60017e-07 [before_grad]: 9.84001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.85998e-06 [meta_fg_expand]: 2.74001e-06 [flash_sp_send_recv_attached]: 1.04e-06 [receive_attached]: 1.27e-06 [after_resolve]: 9.89001e-06 [a_after_grad]: 8.08999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.42999e-06 [auto_monad_grad]: 1.29e-06 [auto_monad_eliminator]: 7e-06 [cse]: 1.391e-05 [a_3]: 3.42e-05 [py_interpret_to_execute_after_opt_a]: 9.84001e-06 [slice_cell_reuse_recomputed_activation]: 1.93002e-06 [rewriter_after_opt_a]: 3.522e-05 [convert_after_rewriter]: 6.29001e-06 [order_py_execute_after_rewriter]: 5.36002e-06 [mutable_eliminate]: 0.00050445 [opt_b]: 0.00019616, [1] [Cycle 1]: 0.00019066, [7] [b_1]: 0.00011872 [b_2]: 7.06999e-06 [updatestate_depend_eliminate]: 5.39998e-06 [updatestate_assign_eliminate]: 2.40002e-06 [updatestate_loads_eliminate]: 2.67001e-06 [renormalize]: 6.09987e-07 [cse]: 1.89e-05 [optimize_parallel_all_gather_comm]: 1.505e-05 [overlap_param_gather]: 2.27001e-06 [cconv]: 2.513e-05 [loop_unroll]: 0.00041536 [opt_after_cconv]: 0.00012422, [1] [Cycle 1]: 0.00011848, [7] [c_1]: 5.122e-05 [parameter_eliminate]: 3.19001e-06 [updatestate_depend_eliminate]: 5.00999e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.18002e-06 [cse]: 1.762e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 1.445e-05 [tuple_transform]: 7.186e-05, [1] [Cycle 1]: 6.723e-05, [4] [d_1]: 4.022e-05 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 6.66999e-06 [partial_unused_args_eliminate]: 1.74998e-06 [add_recomputation]: 4.815e-05 [cse_after_recomputation]: 2.105e-05, [1] [Cycle 1]: 1.656e-05, [1] [cse]: 1.135e-05 [environ_conv]: 5.42001e-06 [swap_dp_allreduce_reducescatter]: 5.14e-06 [bias_add_comm_swap]: 2.54001e-06 [label_micro_interleaved_index]: 4.03001e-06 [label_fine_grained_interleaved_index]: 2.91e-06 [merge_cast_opt]: 1.39998e-06 [slice_recompute_activation]: 2.26e-06 [micro_interleaved_order_control]: 2.24001e-06 [assign_add_opt]: 1.66002e-06 [ForceFp32Comm]: 9.50007e-07 [remove_cast_before_assign_add]: 9.20001e-07 [full_micro_interleaved_order_control]: 2.09999e-06 [reorder_send_recv_between_fp_bp]: 2.69001e-06 [comm_op_add_attrs]: 1.19998e-06 [add_comm_op_reuse_tag]: 9.80013e-07 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.15001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.71e-06 [control_data_broadcast_order]: 1.204e-05 [grouped_pairwise_exchange_alltoall]: 1.39998e-06 [offloading_packed_experts]: 4.02e-06 [overlap_recompute_and_grad_model_parallel]: 4.52e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.30999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.49e-06 [overlap_recompute_comm]: 2.26e-06 [overlap_grad_ring_attention]: 4.27e-06 [overlap_grad_flash_sp]: 1.881e-05 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 2.11003e-06 [split_layernorm_comm]: 1.52001e-06 [handle_group_info]: 1.03001e-06 [symbol_engine_optimizer]: 7.302e-05, [1] [Cycle 1]: 6.901e-05, [6] [build]: 3.10998e-06 [elim_shapecalc]: 8.85999e-06 [elim_not_effective]: 1.226e-05 [opt_reshape]: 7.21999e-06 [fold_const_symbol]: 9.51e-06 [renormalize]: 2.50002e-07 [detach_backward]: 2.12999e-06 [pipeline_parallel_scheduler]: 1.72001e-06 [auto_monad_reorder]: 1.551e-05 [get_jit_bprop_graph]: 1.65001e-06 [rewriter_after_jit_bprop_graph]: 3.97e-06 [opt_after_jit_grad]: 0.00045003 [validate]: 3.868e-05 Sums bootstrap : 0.000443s : 4.60% type_inference : 0.005284s : 54.90% event_method : 0.000012s : 0.13% auto_monad : 0.000054s : 0.57% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000027s : 0.28% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000019s : 0.20% optimize.rewriter_before_opt_a : 0.000045s : 0.46% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.34% optimize.opt_a.loop_unroll : 0.000020s : 0.21% optimize.opt_a.a_1 : 0.000397s : 4.12% optimize.opt_a.with_stream_mark : 0.000029s : 0.31% optimize.opt_a.recompute_prepare : 0.000015s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000147s : 1.52% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.14% optimize.opt_a.shard : 0.000003s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.13% optimize.opt_a.merge_send_recv : 0.000013s : 0.13% optimize.opt_a.auto_parallel : 0.000012s : 0.12% optimize.opt_a.parallel : 0.000022s : 0.23% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000011s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000020s : 0.21% optimize.opt_a.a_after_grad : 0.000017s : 0.18% optimize.opt_a.renormalize : 0.000544s : 5.66% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.23% optimize.opt_a.cse : 0.000044s : 0.46% optimize.opt_a.a_3 : 0.000080s : 0.83% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000035s : 0.37% optimize.convert_after_rewriter : 0.000006s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.06% optimize.mutable_eliminate : 0.000504s : 5.24% optimize.opt_b.b_1 : 0.000119s : 1.23% optimize.opt_b.b_2 : 0.000007s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000019s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000015s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000025s : 0.26% optimize.loop_unroll : 0.000415s : 4.32% optimize.opt_after_cconv.c_1 : 0.000051s : 0.53% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.15% optimize.tuple_transform.d_1 : 0.000040s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000048s : 0.50% optimize.cse_after_recomputation.cse : 0.000011s : 0.12% optimize.environ_conv : 0.000005s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.02% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.02% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000019s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000016s : 0.16% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000450s : 4.68% validate : 0.000039s : 0.40% Time group info: ------[substitution.] 0.000111 19 1.61% : 0.000002s : 2: substitution.elim_not_effective 1.19% : 0.000001s : 2: substitution.fold_const_symbol 5.27% : 0.000006s : 3: substitution.graph_param_transform 80.98% : 0.000090s : 2: substitution.inline 3.27% : 0.000004s : 4: substitution.j_node_and_user_rematch 4.03% : 0.000004s : 4: substitution.remove_not_recompute_node 3.65% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.005233 2 91.44% : 0.004785s : 1: type_inference.infer 8.56% : 0.000448s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000088 2 100.00% : 0.000088s : 2: match.inline ------[predicate.] 0.000131 754 1.08% : 0.000001s : 7: predicate.accumulaten_eliminater 1.02% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.66% : 0.000001s : 6: predicate.addn_check_dump 0.82% : 0.000001s : 7: predicate.addn_zero_filter 0.68% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.25% : 0.000003s : 13: predicate.arithmetic_simplify 0.73% : 0.000001s : 7: predicate.cast_eliminate 0.73% : 0.000001s : 6: predicate.check_bprop_eliminate 0.69% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.72% : 0.000001s : 6: predicate.depend_value_elim 0.88% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.99% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.77% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.41% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 3: predicate.elim_not_effective 0.53% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_depend_swap 1.79% : 0.000002s : 16: predicate.environ_get_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.94% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.99% : 0.000003s : 9: predicate.float_depend_g_call 0.65% : 0.000001s : 6: predicate.float_environ_get_switch 0.94% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 3: predicate.fold_const_symbol 1.05% : 0.000001s : 6: predicate.get_grad_eliminate 0.27% : 0.000000s : 3: predicate.graph_param_transform 0.78% : 0.000001s : 6: predicate.incorporate_call 0.67% : 0.000001s : 6: predicate.incorporate_call_switch 6.47% : 0.000009s : 34: predicate.inline 1.04% : 0.000001s : 6: predicate.inline_without_move 0.43% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.05% : 0.000001s : 6: predicate.less_batch_normalization 1.59% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.15% : 0.000003s : 20: predicate.load_eliminater 1.17% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.83% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.71% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.67% : 0.000001s : 6: predicate.merge_addn 0.63% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.68% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.75% : 0.000001s : 7: predicate.minmaximum_grad 1.30% : 0.000002s : 3: predicate.mutable_eliminate 0.43% : 0.000001s : 3: predicate.opt_reshape 0.42% : 0.000001s : 3: predicate.parallel_virtual_node 1.35% : 0.000002s : 9: predicate.partial_defer_inline 1.19% : 0.000002s : 10: predicate.partial_eliminate 0.82% : 0.000001s : 7: predicate.print_const_string_wrapper 0.85% : 0.000001s : 6: predicate.reduce_all_const_elim 1.15% : 0.000002s : 7: predicate.reduce_eliminate 2.12% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.65% : 0.000001s : 6: predicate.remove_not_recompute_node 1.23% : 0.000002s : 13: predicate.replace_applicator 0.72% : 0.000001s : 6: predicate.replace_old_param 0.37% : 0.000000s : 3: predicate.reset_defer_inline 1.06% : 0.000001s : 7: predicate.reshape_eliminate 0.72% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 3: predicate.row_tensor_eliminate 0.94% : 0.000001s : 6: predicate.same_eliminate 0.51% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.28% : 0.000002s : 6: predicate.shard_identity_eliminate 0.98% : 0.000001s : 6: predicate.special_op_eliminate 0.92% : 0.000001s : 6: predicate.specialize_transform 0.99% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.47% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.05% : 0.000001s : 9: predicate.switch_defer_inline 1.75% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.32% : 0.000006s : 32: predicate.switch_simplify 0.78% : 0.000001s : 7: predicate.tile_eliminate 0.80% : 0.000001s : 7: predicate.transpose_eliminate 1.65% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.75% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.22% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.56% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.49% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.42% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.28% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.79% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.53% : 0.000001s : 3: predicate.value_based_eliminate 0.85% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.76% : 0.000001s : 6: predicate.virtual_output_eliminate 0.33% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000224 5 7.46% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.54% : 0.000207s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022471 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.22% : 0.003195s : 1: add_attr 14.16% : 0.003183s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000052s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000059s : 1: auto_monad 0.09% : 0.000019s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.10% : 0.000473s : 1: bootstrap 0.13% : 0.000029s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000009s : 1: convert_after_rewriter 0.11% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000008s : 1: environ_conv 0.09% : 0.000020s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.88% : 0.000423s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.28% : 0.000512s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000015s : 1: opt.transform.mutable_eliminate 3.38% : 0.000759s : 78: opt.transform.opt_a 0.22% : 0.000050s : 1: opt.transform.opt_after_cconv 0.10% : 0.000022s : 1: opt.transform.opt_after_jit_grad 0.42% : 0.000094s : 28: opt.transform.opt_b 0.20% : 0.000045s : 2: opt.transform.opt_trans_graph 0.15% : 0.000034s : 4: opt.transform.symbol_engine_opt 9.48% : 0.002129s : 1: opt_a 0.57% : 0.000128s : 1: opt_after_cconv 2.04% : 0.000459s : 1: opt_after_jit_grad 0.89% : 0.000200s : 1: opt_b 18.13% : 0.004075s : 1: optimize 0.08% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000022s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000031s : 1: pre_auto_parallel 0.10% : 0.000023s : 1: py_interpret_to_execute 0.06% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000003s : 1: remove_cast_before_assign_add 0.08% : 0.000018s : 1: remove_dup_value 1.35% : 0.000302s : 1: renormalize.infer 1.04% : 0.000234s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000039s : 1: rewriter_after_opt_a 0.22% : 0.000049s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000076s : 1: symbol_engine_optimizer 0.33% : 0.000075s : 1: tuple_transform 23.59% : 0.005301s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:41.251.09 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:41.253.78 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0160005, [21] [bootstrap]: 0.00044838 [type_inference]: 0.00540056 [event_method]: 1.305e-05 [auto_monad]: 6.918e-05 [graph_reusing]: 6.01e-06 [inline]: 2.17001e-06 [add_attr]: 0.00336379, [1] [add_attr_with_inline]: 0.00335429, [1] [Cycle 1]: 7.377e-05, [2] [tag_attr]: 1.651e-05 [meta_addattr_fg_expand]: 3.84002e-06 [parallel-infer-symbol]: 3.04999e-06 [pre_auto_parallel]: 2.953e-05 [insert-virtual-dataset]: 2.64001e-06 [parallel-infer-symbol-second]: 7.90023e-07 [dataset_repeat_opt]: 1.90001e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00541642, [53] [py_interpret_to_execute]: 2.305e-05 [rewriter_before_opt_a]: 5.637e-05 [opt_a]: 0.00294797, [2] [Cycle 1]: 0.00202671, [45] [expand_dump_flag]: 4.03999e-06 [switch_simplify]: 2.714e-05 [loop_unroll]: 1.557e-05 [a_1]: 0.0003771 [with_stream_mark]: 1.769e-05 [recompute_prepare]: 1.026e-05 [updatestate_depend_eliminate]: 4.89003e-06 [updatestate_assign_eliminate]: 4.25999e-06 [updatestate_loads_eliminate]: 3.86001e-06 [parameter_eliminate]: 1.90001e-06 [a_2]: 0.00013403 [accelerated_algorithm]: 1.003e-05 [shard]: 2.20002e-06 [meta_shard_fg_expand]: 2.27001e-06 [shard_inline]: 7.6e-06 [merge_send_recv]: 9.15999e-06 [auto_parallel]: 7.2e-06 [parallel]: 1.86e-05 [flash_sp]: 8.27e-06 [merge_comm]: 5.56002e-06 [allreduce_fusion]: 4.72998e-06 [matmul_add_comm_reduction]: 1.14e-05 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 9.24e-06 [virtual_dataset]: 8.20999e-06 [get_grad_eliminate_]: 7.78001e-06 [virtual_output]: 7.41999e-06 [merge_forward]: 4.27e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 1.081e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.807e-05 [merge_recompute_call_nodes]: 1.74998e-06 [before_grad]: 1.338e-05 [set_forward_comm_id_for_comm_node_pass]: 4.67e-06 [meta_fg_expand]: 3.26999e-06 [flash_sp_send_recv_attached]: 3.03e-06 [receive_attached]: 2.43e-06 [after_resolve]: 1.156e-05 [a_after_grad]: 1.134e-05 [renormalize]: 0.00069549 [add_forward_monad_depend]: 5.00999e-06 [auto_monad_grad]: 2.56998e-06 [auto_monad_eliminator]: 1.69e-05 [cse]: 3.513e-05 [a_3]: 7.068e-05 [Cycle 2]: 0.00090679, [45] [expand_dump_flag]: 1.17e-06 [switch_simplify]: 9.09e-06 [loop_unroll]: 7.56001e-06 [a_1]: 0.00015282 [with_stream_mark]: 1.737e-05 [recompute_prepare]: 8.15e-06 [updatestate_depend_eliminate]: 4.45e-06 [updatestate_assign_eliminate]: 3.19001e-06 [updatestate_loads_eliminate]: 2.83e-06 [parameter_eliminate]: 1.14003e-06 [a_2]: 0.00011693 [accelerated_algorithm]: 7.71999e-06 [shard]: 1.32e-06 [meta_shard_fg_expand]: 1.76e-06 [shard_inline]: 7.50998e-06 [merge_send_recv]: 6.01e-06 [auto_parallel]: 6.34999e-06 [parallel]: 6.38e-06 [flash_sp]: 3.81999e-06 [merge_comm]: 4.18999e-06 [allreduce_fusion]: 3.96001e-06 [matmul_add_comm_reduction]: 7.58001e-06 [allreduce_slice_to_reducescatter]: 7.09988e-07 [virtual_shard_identity]: 8.72e-06 [virtual_dataset]: 7.10998e-06 [get_grad_eliminate_]: 6.60997e-06 [virtual_output]: 6.41e-06 [merge_forward]: 4.01001e-06 [cell_reuse_recompute_pass]: 1.60999e-06 [offload_activation]: 7.71999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.626e-05 [merge_recompute_call_nodes]: 8.70001e-07 [before_grad]: 1.163e-05 [set_forward_comm_id_for_comm_node_pass]: 4.80001e-06 [meta_fg_expand]: 2.99001e-06 [flash_sp_send_recv_attached]: 9.50007e-07 [receive_attached]: 1.79e-06 [after_resolve]: 1.07e-05 [a_after_grad]: 1.034e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.30001e-06 [auto_monad_grad]: 1.39e-06 [auto_monad_eliminator]: 9.38002e-06 [cse]: 2.008e-05 [a_3]: 5.651e-05 [py_interpret_to_execute_after_opt_a]: 1.616e-05 [slice_cell_reuse_recomputed_activation]: 4.50001e-06 [rewriter_after_opt_a]: 4.69e-05 [convert_after_rewriter]: 1.048e-05 [order_py_execute_after_rewriter]: 9.65002e-06 [mutable_eliminate]: 0.00058039 [opt_b]: 0.00031006, [1] [Cycle 1]: 0.00030091, [7] [b_1]: 0.0001921 [b_2]: 1.004e-05 [updatestate_depend_eliminate]: 7.45e-06 [updatestate_assign_eliminate]: 3.32002e-06 [updatestate_loads_eliminate]: 3.01001e-06 [renormalize]: 6.39993e-07 [cse]: 2.631e-05 [optimize_parallel_all_gather_comm]: 2.131e-05 [overlap_param_gather]: 4.92999e-06 [cconv]: 3.064e-05 [loop_unroll]: 0.00047978 [opt_after_cconv]: 0.00014265, [1] [Cycle 1]: 0.00013336, [7] [c_1]: 3.501e-05 [parameter_eliminate]: 3.62998e-06 [updatestate_depend_eliminate]: 6.68e-06 [updatestate_assign_eliminate]: 3.09999e-06 [updatestate_loads_eliminate]: 3.71999e-06 [cse]: 2.433e-05 [renormalize]: 5.8001e-07 [remove_dup_value]: 1.967e-05 [tuple_transform]: 9.913e-05, [1] [Cycle 1]: 9.174e-05, [4] [d_1]: 4.914e-05 [none_parameter_eliminate]: 1.67001e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 8.40999e-06 [partial_unused_args_eliminate]: 4.57e-06 [add_recomputation]: 5.97e-05 [cse_after_recomputation]: 3.3e-05, [1] [Cycle 1]: 2.581e-05, [1] [cse]: 1.68e-05 [environ_conv]: 9.88002e-06 [swap_dp_allreduce_reducescatter]: 8.99e-06 [bias_add_comm_swap]: 5.31002e-06 [label_micro_interleaved_index]: 7.4e-06 [label_fine_grained_interleaved_index]: 5.38002e-06 [merge_cast_opt]: 3.85e-06 [slice_recompute_activation]: 4.67e-06 [micro_interleaved_order_control]: 4.88001e-06 [assign_add_opt]: 3.65e-06 [ForceFp32Comm]: 3.18e-06 [remove_cast_before_assign_add]: 3.95e-06 [full_micro_interleaved_order_control]: 4.74002e-06 [reorder_send_recv_between_fp_bp]: 5.44e-06 [comm_op_add_attrs]: 4.02998e-06 [add_comm_op_reuse_tag]: 3.46001e-06 [interleave_split_concat_branches]: 3.43999e-06 [interleave_parallel_branches]: 3.81999e-06 [overlap_opt_shard_in_pipeline]: 3.54002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.50001e-06 [control_data_broadcast_order]: 1.85e-05 [grouped_pairwise_exchange_alltoall]: 4.16001e-06 [offloading_packed_experts]: 7.43999e-06 [overlap_recompute_and_grad_model_parallel]: 7.77e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.78001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.61999e-06 [overlap_recompute_comm]: 5.39e-06 [overlap_grad_ring_attention]: 7.46999e-06 [overlap_grad_flash_sp]: 2.609e-05 [begin_end_overlap_inline]: 3.11001e-06 [split_matmul_comm_elemetwise]: 4.53001e-06 [split_layernorm_comm]: 4.09002e-06 [handle_group_info]: 3.31001e-06 [symbol_engine_optimizer]: 0.00010667, [1] [Cycle 1]: 9.961e-05, [6] [build]: 2.98e-06 [elim_shapecalc]: 1.246e-05 [elim_not_effective]: 1.582e-05 [opt_reshape]: 8.60999e-06 [fold_const_symbol]: 1.27e-05 [renormalize]: 2.50002e-07 [detach_backward]: 3.95e-06 [pipeline_parallel_scheduler]: 2.01e-06 [auto_monad_reorder]: 2.237e-05 [get_jit_bprop_graph]: 1.84998e-06 [rewriter_after_jit_bprop_graph]: 5.51e-06 [opt_after_jit_grad]: 0.00056262 [validate]: 4.8e-05 Sums bootstrap : 0.000448s : 4.13% type_inference : 0.005401s : 49.72% event_method : 0.000013s : 0.12% auto_monad : 0.000069s : 0.64% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000017s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000030s : 0.27% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000023s : 0.21% optimize.rewriter_before_opt_a : 0.000056s : 0.52% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000036s : 0.33% optimize.opt_a.loop_unroll : 0.000023s : 0.21% optimize.opt_a.a_1 : 0.000530s : 4.88% optimize.opt_a.with_stream_mark : 0.000035s : 0.32% optimize.opt_a.recompute_prepare : 0.000018s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000251s : 2.31% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.16% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.14% optimize.opt_a.merge_send_recv : 0.000015s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.23% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.17% optimize.opt_a.virtual_dataset : 0.000015s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000022s : 0.20% optimize.opt_a.a_after_grad : 0.000022s : 0.20% optimize.opt_a.renormalize : 0.000696s : 6.40% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.24% optimize.opt_a.cse : 0.000055s : 0.51% optimize.opt_a.a_3 : 0.000127s : 1.17% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000047s : 0.43% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000010s : 0.09% optimize.mutable_eliminate : 0.000580s : 5.34% optimize.opt_b.b_1 : 0.000192s : 1.77% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000026s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000031s : 0.28% optimize.loop_unroll : 0.000480s : 4.42% optimize.opt_after_cconv.c_1 : 0.000035s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.22% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000020s : 0.18% optimize.tuple_transform.d_1 : 0.000049s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000060s : 0.55% optimize.cse_after_recomputation.cse : 0.000017s : 0.15% optimize.environ_conv : 0.000010s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000026s : 0.24% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000022s : 0.21% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000563s : 5.18% validate : 0.000048s : 0.44% Time group info: ------[substitution.] 0.000151 28 13.73% : 0.000021s : 2: substitution.cast_eliminate 1.53% : 0.000002s : 3: substitution.elim_not_effective 1.20% : 0.000002s : 3: substitution.fold_const_symbol 4.47% : 0.000007s : 4: substitution.graph_param_transform 69.62% : 0.000105s : 2: substitution.inline 3.02% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.93% : 0.000006s : 6: substitution.remove_not_recompute_node 2.50% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.005348 2 90.94% : 0.004863s : 1: type_inference.infer 9.06% : 0.000485s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000103 2 100.00% : 0.000103s : 2: match.inline ------[predicate.] 0.000172 980 0.77% : 0.000001s : 9: predicate.accumulaten_eliminater 1.32% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.79% : 0.000001s : 8: predicate.addn_check_dump 0.75% : 0.000001s : 9: predicate.addn_zero_filter 0.77% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.44% : 0.000004s : 17: predicate.arithmetic_simplify 0.96% : 0.000002s : 9: predicate.cast_eliminate 0.81% : 0.000001s : 8: predicate.check_bprop_eliminate 0.69% : 0.000001s : 8: predicate.compare_switch_simplify 0.22% : 0.000000s : 4: predicate.const_output_eliminate 0.82% : 0.000001s : 8: predicate.depend_value_elim 0.85% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.97% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.77% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.16% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 4: predicate.elim_not_effective 0.55% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.06% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 13: predicate.environ_get_depend_swap 1.83% : 0.000003s : 21: predicate.environ_get_eliminate 1.06% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.91% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.74% : 0.000003s : 11: predicate.float_depend_g_call 0.71% : 0.000001s : 8: predicate.float_environ_get_switch 1.07% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 4: predicate.fold_const_symbol 0.81% : 0.000001s : 8: predicate.get_grad_eliminate 0.30% : 0.000001s : 4: predicate.graph_param_transform 0.85% : 0.000001s : 8: predicate.incorporate_call 0.64% : 0.000001s : 8: predicate.incorporate_call_switch 6.26% : 0.000011s : 44: predicate.inline 0.98% : 0.000002s : 8: predicate.inline_without_move 0.36% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.10% : 0.000002s : 8: predicate.less_batch_normalization 1.75% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.11% : 0.000004s : 26: predicate.load_eliminater 1.53% : 0.000003s : 4: predicate.loop_unroll_after_grad 1.41% : 0.000002s : 16: predicate.loop_unroll_before_grad 1.77% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.74% : 0.000001s : 8: predicate.merge_addn 0.75% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.76% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.72% : 0.000001s : 9: predicate.minmaximum_grad 1.64% : 0.000003s : 4: predicate.mutable_eliminate 0.47% : 0.000001s : 4: predicate.opt_reshape 0.42% : 0.000001s : 4: predicate.parallel_virtual_node 1.25% : 0.000002s : 11: predicate.partial_defer_inline 1.24% : 0.000002s : 13: predicate.partial_eliminate 0.78% : 0.000001s : 9: predicate.print_const_string_wrapper 0.78% : 0.000001s : 8: predicate.reduce_all_const_elim 1.02% : 0.000002s : 9: predicate.reduce_eliminate 2.08% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000001s : 8: predicate.remove_not_recompute_node 1.17% : 0.000002s : 17: predicate.replace_applicator 0.61% : 0.000001s : 8: predicate.replace_old_param 0.31% : 0.000001s : 4: predicate.reset_defer_inline 0.95% : 0.000002s : 9: predicate.reshape_eliminate 0.85% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.53% : 0.000001s : 4: predicate.row_tensor_eliminate 1.09% : 0.000002s : 8: predicate.same_eliminate 0.52% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.89% : 0.000002s : 8: predicate.shard_identity_eliminate 1.00% : 0.000002s : 8: predicate.special_op_eliminate 1.01% : 0.000002s : 8: predicate.specialize_transform 1.05% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 1.01% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.04% : 0.000002s : 11: predicate.switch_defer_inline 1.65% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.14% : 0.000007s : 39: predicate.switch_simplify 0.85% : 0.000001s : 9: predicate.tile_eliminate 0.92% : 0.000002s : 9: predicate.transpose_eliminate 1.61% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.03% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.51% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.54% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.50% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.08% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.96% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.55% : 0.000001s : 4: predicate.value_based_eliminate 0.77% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.77% : 0.000001s : 8: predicate.virtual_output_eliminate 0.34% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000237 5 7.97% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.03% : 0.000218s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026665 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.65% : 0.003374s : 1: add_attr 12.59% : 0.003358s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.24% : 0.000063s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.30% : 0.000079s : 1: auto_monad 0.11% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.84% : 0.000492s : 1: bootstrap 0.13% : 0.000034s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000022s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.14% : 0.000036s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.08% : 0.000020s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.09% : 0.000024s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.82% : 0.000486s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.20% : 0.000587s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 3.77% : 0.001005s : 78: opt.transform.opt_a 0.13% : 0.000034s : 1: opt.transform.opt_after_cconv 0.12% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.48% : 0.000129s : 28: opt.transform.opt_b 0.21% : 0.000055s : 2: opt.transform.opt_trans_graph 0.17% : 0.000046s : 4: opt.transform.symbol_engine_opt 11.07% : 0.002952s : 1: opt_a 0.55% : 0.000146s : 1: opt_after_cconv 2.15% : 0.000574s : 1: opt_after_jit_grad 1.18% : 0.000313s : 1: opt_b 21.62% : 0.005764s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000013s : 1: order_py_execute_after_rewriter 0.11% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000037s : 1: pre_auto_parallel 0.10% : 0.000027s : 1: py_interpret_to_execute 0.07% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.09% : 0.000023s : 1: remove_dup_value 1.51% : 0.000402s : 1: renormalize.infer 1.07% : 0.000285s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000051s : 1: rewriter_after_opt_a 0.22% : 0.000060s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.41% : 0.000110s : 1: symbol_engine_optimizer 0.38% : 0.000102s : 1: tuple_transform 20.38% : 0.005436s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:41.222.480 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.014933, [21] [bootstrap]: 0.00046493 [type_inference]: 0.00561695 [event_method]: 1.465e-05 [auto_monad]: 5.976e-05 [graph_reusing]: 5.93002e-06 [inline]: 2.48e-06 [add_attr]: 0.00332744, [1] [add_attr_with_inline]: 0.0033177, [1] [Cycle 1]: 5.782e-05, [2] [tag_attr]: 1.58e-05 [meta_addattr_fg_expand]: 3.93001e-06 [parallel-infer-symbol]: 3.36999e-06 [pre_auto_parallel]: 3.03e-05 [insert-virtual-dataset]: 2.39999e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 1.92999e-06 [pipeline_split]: 1.67999e-06 [optimize]: 0.00469675, [53] [py_interpret_to_execute]: 1.958e-05 [rewriter_before_opt_a]: 5.125e-05 [opt_a]: 0.00260958, [2] [Cycle 1]: 0.00186457, [45] [expand_dump_flag]: 2.94001e-06 [switch_simplify]: 2.734e-05 [loop_unroll]: 1.497e-05 [a_1]: 0.00037407 [with_stream_mark]: 1.767e-05 [recompute_prepare]: 1.095e-05 [updatestate_depend_eliminate]: 4.4e-06 [updatestate_assign_eliminate]: 4.17003e-06 [updatestate_loads_eliminate]: 3.73001e-06 [parameter_eliminate]: 2.01e-06 [a_2]: 0.00010244 [accelerated_algorithm]: 9.43997e-06 [shard]: 2.14999e-06 [meta_shard_fg_expand]: 1.94e-06 [shard_inline]: 8.06001e-06 [merge_send_recv]: 9.64999e-06 [auto_parallel]: 7.22002e-06 [parallel]: 1.795e-05 [flash_sp]: 8.65001e-06 [merge_comm]: 4.71002e-06 [allreduce_fusion]: 4.83001e-06 [matmul_add_comm_reduction]: 1.069e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 9.37001e-06 [virtual_dataset]: 7.58001e-06 [get_grad_eliminate_]: 7.38e-06 [virtual_output]: 7.25e-06 [merge_forward]: 4.37e-06 [cell_reuse_recompute_pass]: 1.28002e-06 [offload_activation]: 1.132e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.703e-05 [merge_recompute_call_nodes]: 1.87001e-06 [before_grad]: 1.359e-05 [set_forward_comm_id_for_comm_node_pass]: 4.94e-06 [meta_fg_expand]: 3.66999e-06 [flash_sp_send_recv_attached]: 2.51e-06 [receive_attached]: 2.61999e-06 [after_resolve]: 1.227e-05 [a_after_grad]: 1.134e-05 [renormalize]: 0.00071378 [add_forward_monad_depend]: 5.91e-06 [auto_monad_grad]: 2.38998e-06 [auto_monad_eliminator]: 1.679e-05 [cse]: 3.666e-05 [a_3]: 6.045e-05 [Cycle 2]: 0.00073471, [45] [expand_dump_flag]: 1.37999e-06 [switch_simplify]: 9.25999e-06 [loop_unroll]: 8.23999e-06 [a_1]: 0.00015541 [with_stream_mark]: 1.253e-05 [recompute_prepare]: 8.48999e-06 [updatestate_depend_eliminate]: 4.28999e-06 [updatestate_assign_eliminate]: 3.24001e-06 [updatestate_loads_eliminate]: 3.09999e-06 [parameter_eliminate]: 1.45001e-06 [a_2]: 8.694e-05 [accelerated_algorithm]: 7.71999e-06 [shard]: 1.14e-06 [meta_shard_fg_expand]: 2.26e-06 [shard_inline]: 7.21001e-06 [merge_send_recv]: 6.83e-06 [auto_parallel]: 6.49999e-06 [parallel]: 5.97999e-06 [flash_sp]: 3.36999e-06 [merge_comm]: 4.15e-06 [allreduce_fusion]: 4.15e-06 [matmul_add_comm_reduction]: 8.13001e-06 [allreduce_slice_to_reducescatter]: 3.19997e-07 [virtual_shard_identity]: 7.98001e-06 [virtual_dataset]: 6.69999e-06 [get_grad_eliminate_]: 6.54999e-06 [virtual_output]: 6.62002e-06 [merge_forward]: 3.41001e-06 [cell_reuse_recompute_pass]: 1.26997e-06 [offload_activation]: 8.20999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.397e-05 [merge_recompute_call_nodes]: 1.04998e-06 [before_grad]: 1.115e-05 [set_forward_comm_id_for_comm_node_pass]: 4.43001e-06 [meta_fg_expand]: 2.68e-06 [flash_sp_send_recv_attached]: 8.30012e-07 [receive_attached]: 1.12999e-06 [after_resolve]: 1.05e-05 [a_after_grad]: 1.025e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.36002e-06 [auto_monad_grad]: 1.19998e-06 [auto_monad_eliminator]: 9.09998e-06 [cse]: 1.912e-05 [a_3]: 4.289e-05 [py_interpret_to_execute_after_opt_a]: 1.042e-05 [slice_cell_reuse_recomputed_activation]: 2.06e-06 [rewriter_after_opt_a]: 4.123e-05 [convert_after_rewriter]: 7.68999e-06 [order_py_execute_after_rewriter]: 6.21e-06 [mutable_eliminate]: 0.00050834 [opt_b]: 0.00023941, [1] [Cycle 1]: 0.00023329, [7] [b_1]: 0.00014689 [b_2]: 8.79998e-06 [updatestate_depend_eliminate]: 7.67002e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 3.37002e-06 [renormalize]: 6.20028e-07 [cse]: 2.677e-05 [optimize_parallel_all_gather_comm]: 1.754e-05 [overlap_param_gather]: 2.11e-06 [cconv]: 2.671e-05 [loop_unroll]: 0.00042786 [opt_after_cconv]: 0.00011357, [1] [Cycle 1]: 0.00010785, [7] [c_1]: 3.339e-05 [parameter_eliminate]: 3.26001e-06 [updatestate_depend_eliminate]: 6.34001e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 2.76e-06 [cse]: 2.401e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.769e-05 [tuple_transform]: 0.00010116, [1] [Cycle 1]: 9.641e-05, [4] [d_1]: 6.558e-05 [none_parameter_eliminate]: 2.06e-06 [renormalize]: 2.49973e-07 [switch_simplify]: 8.18999e-06 [partial_unused_args_eliminate]: 1.92001e-06 [add_recomputation]: 5.747e-05 [cse_after_recomputation]: 2.625e-05, [1] [Cycle 1]: 2.123e-05, [1] [cse]: 1.557e-05 [environ_conv]: 6.68e-06 [swap_dp_allreduce_reducescatter]: 6.71e-06 [bias_add_comm_swap]: 2.73e-06 [label_micro_interleaved_index]: 4.11001e-06 [label_fine_grained_interleaved_index]: 2.72001e-06 [merge_cast_opt]: 1.27999e-06 [slice_recompute_activation]: 2.11e-06 [micro_interleaved_order_control]: 2.29999e-06 [assign_add_opt]: 1.26997e-06 [ForceFp32Comm]: 8.09989e-07 [remove_cast_before_assign_add]: 9.50007e-07 [full_micro_interleaved_order_control]: 2.24001e-06 [reorder_send_recv_between_fp_bp]: 2.51e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 9.79984e-07 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.04998e-06 [overlap_opt_shard_in_pipeline]: 1.14e-06 [overlap_opt_shard_grad_in_pipeline]: 1.80001e-06 [control_data_broadcast_order]: 1.502e-05 [grouped_pairwise_exchange_alltoall]: 1.64998e-06 [offloading_packed_experts]: 4.48001e-06 [overlap_recompute_and_grad_model_parallel]: 5.63002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.10999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.20999e-06 [overlap_recompute_comm]: 2.70002e-06 [overlap_grad_ring_attention]: 4.30999e-06 [overlap_grad_flash_sp]: 2.194e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.02001e-06 [split_layernorm_comm]: 1.81003e-06 [handle_group_info]: 1.30001e-06 [symbol_engine_optimizer]: 8.272e-05, [1] [Cycle 1]: 7.837e-05, [6] [build]: 2.87002e-06 [elim_shapecalc]: 1.094e-05 [elim_not_effective]: 1.596e-05 [opt_reshape]: 8.19002e-06 [fold_const_symbol]: 1.224e-05 [renormalize]: 2.10013e-07 [detach_backward]: 1.79e-06 [pipeline_parallel_scheduler]: 1.71e-06 [auto_monad_reorder]: 1.892e-05 [get_jit_bprop_graph]: 1.87001e-06 [rewriter_after_jit_bprop_graph]: 3.6e-06 [opt_after_jit_grad]: 0.00047528 [validate]: 4.437e-05 Sums bootstrap : 0.000465s : 4.38% type_inference : 0.005617s : 52.90% event_method : 0.000015s : 0.14% auto_monad : 0.000060s : 0.56% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000030s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000020s : 0.18% optimize.rewriter_before_opt_a : 0.000051s : 0.48% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000037s : 0.34% optimize.opt_a.loop_unroll : 0.000023s : 0.22% optimize.opt_a.a_1 : 0.000529s : 4.99% optimize.opt_a.with_stream_mark : 0.000030s : 0.28% optimize.opt_a.recompute_prepare : 0.000019s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000189s : 1.78% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.16% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.14% optimize.opt_a.merge_send_recv : 0.000016s : 0.16% optimize.opt_a.auto_parallel : 0.000014s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.23% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.16% optimize.opt_a.virtual_dataset : 0.000014s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000025s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000023s : 0.21% optimize.opt_a.a_after_grad : 0.000022s : 0.20% optimize.opt_a.renormalize : 0.000714s : 6.72% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.24% optimize.opt_a.cse : 0.000056s : 0.53% optimize.opt_a.a_3 : 0.000103s : 0.97% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000041s : 0.39% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000508s : 4.79% optimize.opt_b.b_1 : 0.000147s : 1.38% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000027s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000027s : 0.25% optimize.loop_unroll : 0.000428s : 4.03% optimize.opt_after_cconv.c_1 : 0.000033s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.17% optimize.tuple_transform.d_1 : 0.000066s : 0.62% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000057s : 0.54% optimize.cse_after_recomputation.cse : 0.000016s : 0.15% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.03% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000022s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000019s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000475s : 4.48% validate : 0.000044s : 0.42% Time group info: ------[substitution.] 0.000169 28 12.38% : 0.000021s : 2: substitution.cast_eliminate 1.58% : 0.000003s : 3: substitution.elim_not_effective 0.99% : 0.000002s : 3: substitution.fold_const_symbol 3.82% : 0.000006s : 4: substitution.graph_param_transform 73.07% : 0.000124s : 2: substitution.inline 2.60% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.52% : 0.000006s : 6: substitution.remove_not_recompute_node 2.05% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.005561 2 90.50% : 0.005032s : 1: type_inference.infer 9.50% : 0.000528s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000122 2 100.00% : 0.000122s : 2: match.inline ------[predicate.] 0.000168 980 0.78% : 0.000001s : 9: predicate.accumulaten_eliminater 1.09% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.68% : 0.000001s : 8: predicate.addn_check_dump 0.80% : 0.000001s : 9: predicate.addn_zero_filter 0.71% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.27% : 0.000004s : 17: predicate.arithmetic_simplify 0.95% : 0.000002s : 9: predicate.cast_eliminate 0.88% : 0.000001s : 8: predicate.check_bprop_eliminate 0.71% : 0.000001s : 8: predicate.compare_switch_simplify 0.24% : 0.000000s : 4: predicate.const_output_eliminate 0.78% : 0.000001s : 8: predicate.depend_value_elim 0.87% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.95% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.80% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.38% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 4: predicate.elim_not_effective 0.55% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.04% : 0.000002s : 13: predicate.environ_get_depend_swap 1.85% : 0.000003s : 21: predicate.environ_get_eliminate 1.02% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.94% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.74% : 0.000003s : 11: predicate.float_depend_g_call 0.68% : 0.000001s : 8: predicate.float_environ_get_switch 1.03% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 4: predicate.fold_const_symbol 0.95% : 0.000002s : 8: predicate.get_grad_eliminate 0.28% : 0.000000s : 4: predicate.graph_param_transform 0.81% : 0.000001s : 8: predicate.incorporate_call 0.65% : 0.000001s : 8: predicate.incorporate_call_switch 6.44% : 0.000011s : 44: predicate.inline 1.05% : 0.000002s : 8: predicate.inline_without_move 0.40% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.18% : 0.000002s : 8: predicate.less_batch_normalization 1.69% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.16% : 0.000004s : 26: predicate.load_eliminater 1.11% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.63% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.89% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.76% : 0.000001s : 8: predicate.merge_addn 0.70% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.70% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 9: predicate.minmaximum_grad 1.47% : 0.000002s : 4: predicate.mutable_eliminate 0.45% : 0.000001s : 4: predicate.opt_reshape 0.43% : 0.000001s : 4: predicate.parallel_virtual_node 1.31% : 0.000002s : 11: predicate.partial_defer_inline 1.28% : 0.000002s : 13: predicate.partial_eliminate 0.89% : 0.000002s : 9: predicate.print_const_string_wrapper 0.71% : 0.000001s : 8: predicate.reduce_all_const_elim 1.00% : 0.000002s : 9: predicate.reduce_eliminate 2.17% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 8: predicate.remove_not_recompute_node 1.26% : 0.000002s : 17: predicate.replace_applicator 0.59% : 0.000001s : 8: predicate.replace_old_param 0.33% : 0.000001s : 4: predicate.reset_defer_inline 0.87% : 0.000001s : 9: predicate.reshape_eliminate 0.74% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.58% : 0.000001s : 4: predicate.row_tensor_eliminate 0.99% : 0.000002s : 8: predicate.same_eliminate 0.53% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.92% : 0.000002s : 8: predicate.shard_identity_eliminate 0.95% : 0.000002s : 8: predicate.special_op_eliminate 1.01% : 0.000002s : 8: predicate.specialize_transform 1.00% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.96% : 0.000002s : 11: predicate.switch_defer_inline 1.73% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.12% : 0.000007s : 39: predicate.switch_simplify 0.78% : 0.000001s : 9: predicate.tile_eliminate 0.81% : 0.000001s : 9: predicate.transpose_eliminate 1.62% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.63% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.49% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.50% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.61% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.14% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.99% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.65% : 0.000001s : 4: predicate.value_based_eliminate 0.80% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.81% : 0.000001s : 8: predicate.virtual_output_eliminate 0.33% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.54% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000240 5 7.50% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.50% : 0.000222s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024871 192 0.01% : 0.000004s : 1: ForceFp32Comm 13.40% : 0.003333s : 1: add_attr 13.36% : 0.003322s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.25% : 0.000061s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000066s : 1: auto_monad 0.09% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.99% : 0.000495s : 1: bootstrap 0.12% : 0.000030s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.12% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.09% : 0.000023s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.75% : 0.000436s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.08% : 0.000517s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000019s : 1: opt.transform.mutable_eliminate 4.01% : 0.000997s : 78: opt.transform.opt_a 0.13% : 0.000032s : 1: opt.transform.opt_after_cconv 0.11% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.50% : 0.000124s : 28: opt.transform.opt_b 0.29% : 0.000071s : 2: opt.transform.opt_trans_graph 0.18% : 0.000044s : 4: opt.transform.symbol_engine_opt 10.51% : 0.002613s : 1: opt_a 0.47% : 0.000117s : 1: opt_after_cconv 1.95% : 0.000484s : 1: opt_after_jit_grad 0.98% : 0.000243s : 1: opt_b 18.90% : 0.004701s : 1: optimize 0.09% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000034s : 1: pre_auto_parallel 0.09% : 0.000023s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000021s : 1: remove_dup_value 1.74% : 0.000433s : 1: renormalize.infer 1.09% : 0.000272s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000046s : 1: rewriter_after_opt_a 0.22% : 0.000055s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000086s : 1: symbol_engine_optimizer 0.42% : 0.000105s : 1: tuple_transform 22.66% : 0.005637s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:41.434.308 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:41.434.581 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0155172, [21] [bootstrap]: 0.00044852 [type_inference]: 0.00501724 [event_method]: 1.476e-05 [auto_monad]: 5.95e-05 [graph_reusing]: 5.35999e-06 [inline]: 2.18998e-06 [add_attr]: 0.00324974, [1] [add_attr_with_inline]: 0.00323967, [1] [Cycle 1]: 7.78e-05, [2] [tag_attr]: 1.656e-05 [meta_addattr_fg_expand]: 3.98001e-06 [parallel-infer-symbol]: 2.99999e-06 [pre_auto_parallel]: 2.885e-05 [insert-virtual-dataset]: 2.91999e-06 [parallel-infer-symbol-second]: 9.50007e-07 [dataset_repeat_opt]: 2.19999e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.00546464, [53] [py_interpret_to_execute]: 2.466e-05 [rewriter_before_opt_a]: 5.584e-05 [opt_a]: 0.00298837, [2] [Cycle 1]: 0.00201639, [45] [expand_dump_flag]: 3.16999e-06 [switch_simplify]: 2.72e-05 [loop_unroll]: 1.536e-05 [a_1]: 0.00036063 [with_stream_mark]: 1.776e-05 [recompute_prepare]: 1.174e-05 [updatestate_depend_eliminate]: 4.75001e-06 [updatestate_assign_eliminate]: 4.15e-06 [updatestate_loads_eliminate]: 3.53999e-06 [parameter_eliminate]: 1.92001e-06 [a_2]: 0.00012799 [accelerated_algorithm]: 8.90999e-06 [shard]: 2.47001e-06 [meta_shard_fg_expand]: 2.36e-06 [shard_inline]: 8.02e-06 [merge_send_recv]: 1.002e-05 [auto_parallel]: 7.97e-06 [parallel]: 2.092e-05 [flash_sp]: 1.114e-05 [merge_comm]: 5.67999e-06 [allreduce_fusion]: 4.80001e-06 [matmul_add_comm_reduction]: 1.05e-05 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 1.101e-05 [virtual_dataset]: 7.82e-06 [get_grad_eliminate_]: 7.81001e-06 [virtual_output]: 8.35001e-06 [merge_forward]: 5.33002e-06 [cell_reuse_recompute_pass]: 1.40001e-06 [offload_activation]: 1.177e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.98e-05 [merge_recompute_call_nodes]: 1.86998e-06 [before_grad]: 1.605e-05 [set_forward_comm_id_for_comm_node_pass]: 5.24e-06 [meta_fg_expand]: 3.35003e-06 [flash_sp_send_recv_attached]: 2.61999e-06 [receive_attached]: 2.76999e-06 [after_resolve]: 1.147e-05 [a_after_grad]: 1.299e-05 [renormalize]: 0.00065909 [add_forward_monad_depend]: 6.26e-06 [auto_monad_grad]: 2.74999e-06 [auto_monad_eliminator]: 1.816e-05 [cse]: 3.696e-05 [a_3]: 7.306e-05 [Cycle 2]: 0.00095806, [45] [expand_dump_flag]: 1.44e-06 [switch_simplify]: 8.83001e-06 [loop_unroll]: 7.67998e-06 [a_1]: 0.00015241 [with_stream_mark]: 1.852e-05 [recompute_prepare]: 9.70002e-06 [updatestate_depend_eliminate]: 5.52001e-06 [updatestate_assign_eliminate]: 3.8e-06 [updatestate_loads_eliminate]: 3.80998e-06 [parameter_eliminate]: 1.54e-06 [a_2]: 0.00012916 [accelerated_algorithm]: 7.61999e-06 [shard]: 1.64e-06 [meta_shard_fg_expand]: 2.15002e-06 [shard_inline]: 7.55e-06 [merge_send_recv]: 7.99002e-06 [auto_parallel]: 8.08999e-06 [parallel]: 5.69e-06 [flash_sp]: 3.88001e-06 [merge_comm]: 4.33999e-06 [allreduce_fusion]: 4.00998e-06 [matmul_add_comm_reduction]: 8.74e-06 [allreduce_slice_to_reducescatter]: 5.59987e-07 [virtual_shard_identity]: 9.09e-06 [virtual_dataset]: 7.03e-06 [get_grad_eliminate_]: 6.83e-06 [virtual_output]: 6.54999e-06 [merge_forward]: 4.67e-06 [cell_reuse_recompute_pass]: 1.66e-06 [offload_activation]: 8.82e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.875e-05 [merge_recompute_call_nodes]: 8.89995e-07 [before_grad]: 1.218e-05 [set_forward_comm_id_for_comm_node_pass]: 4.97e-06 [meta_fg_expand]: 2.73e-06 [flash_sp_send_recv_attached]: 8.80013e-07 [receive_attached]: 1.09003e-06 [after_resolve]: 1.102e-05 [a_after_grad]: 1.055e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.88e-06 [auto_monad_grad]: 1.62001e-06 [auto_monad_eliminator]: 1.027e-05 [cse]: 2.277e-05 [a_3]: 5.737e-05 [py_interpret_to_execute_after_opt_a]: 1.595e-05 [slice_cell_reuse_recomputed_activation]: 4.49998e-06 [rewriter_after_opt_a]: 4.765e-05 [convert_after_rewriter]: 1.019e-05 [order_py_execute_after_rewriter]: 9.58002e-06 [mutable_eliminate]: 0.0005746 [opt_b]: 0.0003217, [1] [Cycle 1]: 0.00031129, [7] [b_1]: 0.00019679 [b_2]: 8.40001e-06 [updatestate_depend_eliminate]: 8.07998e-06 [updatestate_assign_eliminate]: 3.36001e-06 [updatestate_loads_eliminate]: 3.30998e-06 [renormalize]: 1.10001e-06 [cse]: 3.084e-05 [optimize_parallel_all_gather_comm]: 2.231e-05 [overlap_param_gather]: 5.12999e-06 [cconv]: 3.185e-05 [loop_unroll]: 0.00047149 [opt_after_cconv]: 0.0001431, [1] [Cycle 1]: 0.00013409, [7] [c_1]: 3.419e-05 [parameter_eliminate]: 3.49001e-06 [updatestate_depend_eliminate]: 6.96999e-06 [updatestate_assign_eliminate]: 3.35e-06 [updatestate_loads_eliminate]: 3.66001e-06 [cse]: 2.522e-05 [renormalize]: 3.30008e-07 [remove_dup_value]: 2.006e-05 [tuple_transform]: 9.626e-05, [1] [Cycle 1]: 8.835e-05, [4] [d_1]: 4.882e-05 [none_parameter_eliminate]: 1.54e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 7.93001e-06 [partial_unused_args_eliminate]: 4.48999e-06 [add_recomputation]: 6.062e-05 [cse_after_recomputation]: 3.338e-05, [1] [Cycle 1]: 2.638e-05, [1] [cse]: 1.596e-05 [environ_conv]: 8.89e-06 [swap_dp_allreduce_reducescatter]: 9.15001e-06 [bias_add_comm_swap]: 5.47001e-06 [label_micro_interleaved_index]: 7.25003e-06 [label_fine_grained_interleaved_index]: 5.32999e-06 [merge_cast_opt]: 3.81999e-06 [slice_recompute_activation]: 4.91002e-06 [micro_interleaved_order_control]: 4.55001e-06 [assign_add_opt]: 3.97e-06 [ForceFp32Comm]: 3.21999e-06 [remove_cast_before_assign_add]: 3.43999e-06 [full_micro_interleaved_order_control]: 4.48001e-06 [reorder_send_recv_between_fp_bp]: 4.90999e-06 [comm_op_add_attrs]: 3.62998e-06 [add_comm_op_reuse_tag]: 3.55e-06 [interleave_split_concat_branches]: 4.03001e-06 [interleave_parallel_branches]: 3.73999e-06 [overlap_opt_shard_in_pipeline]: 3.63e-06 [overlap_opt_shard_grad_in_pipeline]: 4.40999e-06 [control_data_broadcast_order]: 1.899e-05 [grouped_pairwise_exchange_alltoall]: 4.15e-06 [offloading_packed_experts]: 7.4e-06 [overlap_recompute_and_grad_model_parallel]: 8.33999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.6e-06 [overlap_recompute_allgather_and_fa_grad]: 3.55e-06 [overlap_recompute_comm]: 4.81002e-06 [overlap_grad_ring_attention]: 7.34002e-06 [overlap_grad_flash_sp]: 2.691e-05 [begin_end_overlap_inline]: 2.94999e-06 [split_matmul_comm_elemetwise]: 4.43999e-06 [split_layernorm_comm]: 4.23999e-06 [handle_group_info]: 3.41001e-06 [symbol_engine_optimizer]: 0.0001111, [1] [Cycle 1]: 0.00010369, [6] [build]: 2.95998e-06 [elim_shapecalc]: 1.373e-05 [elim_not_effective]: 1.626e-05 [opt_reshape]: 8.43001e-06 [fold_const_symbol]: 1.279e-05 [renormalize]: 4.09986e-07 [detach_backward]: 3.76999e-06 [pipeline_parallel_scheduler]: 1.75001e-06 [auto_monad_reorder]: 2.365e-05 [get_jit_bprop_graph]: 2.12001e-06 [rewriter_after_jit_bprop_graph]: 5.71003e-06 [opt_after_jit_grad]: 0.00052812 [validate]: 4.69e-05 Sums bootstrap : 0.000449s : 4.30% type_inference : 0.005017s : 48.08% event_method : 0.000015s : 0.14% auto_monad : 0.000059s : 0.57% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000017s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000029s : 0.28% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000025s : 0.24% optimize.rewriter_before_opt_a : 0.000056s : 0.54% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000036s : 0.35% optimize.opt_a.loop_unroll : 0.000023s : 0.22% optimize.opt_a.a_1 : 0.000513s : 4.92% optimize.opt_a.with_stream_mark : 0.000036s : 0.35% optimize.opt_a.recompute_prepare : 0.000021s : 0.21% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.10% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000257s : 2.46% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.16% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.15% optimize.opt_a.merge_send_recv : 0.000018s : 0.17% optimize.opt_a.auto_parallel : 0.000016s : 0.15% optimize.opt_a.parallel : 0.000027s : 0.26% optimize.opt_a.flash_sp : 0.000015s : 0.14% optimize.opt_a.merge_comm : 0.000010s : 0.10% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.19% optimize.opt_a.virtual_dataset : 0.000015s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.14% optimize.opt_a.virtual_output : 0.000015s : 0.14% optimize.opt_a.merge_forward : 0.000010s : 0.10% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.20% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.37% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000028s : 0.27% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.10% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000022s : 0.22% optimize.opt_a.a_after_grad : 0.000024s : 0.23% optimize.opt_a.renormalize : 0.000659s : 6.32% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.09% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.27% optimize.opt_a.cse : 0.000060s : 0.57% optimize.opt_a.a_3 : 0.000130s : 1.25% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.04% optimize.rewriter_after_opt_a : 0.000048s : 0.46% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000010s : 0.09% optimize.mutable_eliminate : 0.000575s : 5.51% optimize.opt_b.b_1 : 0.000197s : 1.89% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000031s : 0.30% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000032s : 0.31% optimize.loop_unroll : 0.000471s : 4.52% optimize.opt_after_cconv.c_1 : 0.000034s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.cse : 0.000025s : 0.24% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.19% optimize.tuple_transform.d_1 : 0.000049s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000061s : 0.58% optimize.cse_after_recomputation.cse : 0.000016s : 0.15% optimize.environ_conv : 0.000009s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.09% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000019s : 0.18% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000027s : 0.26% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.13% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.16% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000024s : 0.23% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000528s : 5.06% validate : 0.000047s : 0.45% Time group info: ------[substitution.] 0.000159 28 15.09% : 0.000024s : 2: substitution.cast_eliminate 1.44% : 0.000002s : 3: substitution.elim_not_effective 1.17% : 0.000002s : 3: substitution.fold_const_symbol 4.00% : 0.000006s : 4: substitution.graph_param_transform 68.12% : 0.000108s : 2: substitution.inline 3.33% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.94% : 0.000006s : 6: substitution.remove_not_recompute_node 2.90% : 0.000005s : 2: substitution.replace_old_param ------[type_inference.] 0.004965 2 90.67% : 0.004502s : 1: type_inference.infer 9.33% : 0.000463s : 1: type_inference.specialize ------[replace.] 0.000025 2 100.00% : 0.000025s : 2: replace.inline ------[match.] 0.000106 2 100.00% : 0.000106s : 2: match.inline ------[predicate.] 0.000173 980 0.82% : 0.000001s : 9: predicate.accumulaten_eliminater 1.04% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.69% : 0.000001s : 8: predicate.addn_check_dump 0.99% : 0.000002s : 9: predicate.addn_zero_filter 0.67% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.16% : 0.000004s : 17: predicate.arithmetic_simplify 0.91% : 0.000002s : 9: predicate.cast_eliminate 0.77% : 0.000001s : 8: predicate.check_bprop_eliminate 0.66% : 0.000001s : 8: predicate.compare_switch_simplify 0.24% : 0.000000s : 4: predicate.const_output_eliminate 0.82% : 0.000001s : 8: predicate.depend_value_elim 0.80% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.89% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.76% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.27% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 4: predicate.elim_not_effective 0.61% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.17% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.02% : 0.000002s : 13: predicate.environ_get_depend_swap 1.82% : 0.000003s : 21: predicate.environ_get_eliminate 1.04% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.90% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.66% : 0.000003s : 11: predicate.float_depend_g_call 0.66% : 0.000001s : 8: predicate.float_environ_get_switch 1.01% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.24% : 0.000000s : 4: predicate.fold_const_symbol 0.81% : 0.000001s : 8: predicate.get_grad_eliminate 0.25% : 0.000000s : 4: predicate.graph_param_transform 0.80% : 0.000001s : 8: predicate.incorporate_call 0.65% : 0.000001s : 8: predicate.incorporate_call_switch 6.64% : 0.000012s : 44: predicate.inline 1.53% : 0.000003s : 8: predicate.inline_without_move 0.40% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.99% : 0.000002s : 8: predicate.less_batch_normalization 1.96% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.12% : 0.000004s : 26: predicate.load_eliminater 1.21% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.58% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.72% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.77% : 0.000001s : 8: predicate.merge_addn 0.72% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.72% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.70% : 0.000001s : 9: predicate.minmaximum_grad 1.49% : 0.000003s : 4: predicate.mutable_eliminate 0.55% : 0.000001s : 4: predicate.opt_reshape 0.43% : 0.000001s : 4: predicate.parallel_virtual_node 1.25% : 0.000002s : 11: predicate.partial_defer_inline 1.23% : 0.000002s : 13: predicate.partial_eliminate 0.74% : 0.000001s : 9: predicate.print_const_string_wrapper 0.74% : 0.000001s : 8: predicate.reduce_all_const_elim 0.97% : 0.000002s : 9: predicate.reduce_eliminate 2.23% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.57% : 0.000001s : 8: predicate.remove_not_recompute_node 1.20% : 0.000002s : 17: predicate.replace_applicator 0.58% : 0.000001s : 8: predicate.replace_old_param 0.47% : 0.000001s : 4: predicate.reset_defer_inline 0.88% : 0.000002s : 9: predicate.reshape_eliminate 0.87% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 4: predicate.row_tensor_eliminate 0.82% : 0.000001s : 8: predicate.same_eliminate 0.73% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.12% : 0.000002s : 8: predicate.shard_identity_eliminate 0.97% : 0.000002s : 8: predicate.special_op_eliminate 0.97% : 0.000002s : 8: predicate.specialize_transform 1.00% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.92% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.97% : 0.000002s : 11: predicate.switch_defer_inline 1.62% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.44% : 0.000008s : 39: predicate.switch_simplify 0.72% : 0.000001s : 9: predicate.tile_eliminate 0.79% : 0.000001s : 9: predicate.transpose_eliminate 1.53% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 2.91% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.42% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.47% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.52% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.09% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.99% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.48% : 0.000001s : 4: predicate.value_based_eliminate 0.76% : 0.000001s : 8: predicate.virtual_dataset_eliminate 1.07% : 0.000002s : 8: predicate.virtual_output_eliminate 0.40% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.63% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000234 5 8.31% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.69% : 0.000215s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026082 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.50% : 0.003260s : 1: add_attr 12.44% : 0.003244s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.25% : 0.000064s : 1: add_recomputation 0.03% : 0.000007s : 1: assign_add_opt 0.26% : 0.000068s : 1: auto_monad 0.12% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.90% : 0.000495s : 1: bootstrap 0.14% : 0.000035s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.09% : 0.000022s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.14% : 0.000037s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.09% : 0.000023s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.09% : 0.000025s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000007s : 1: interleave_parallel_branches 0.03% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.83% : 0.000478s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.23% : 0.000583s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.07% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000020s : 1: opt.transform.mutable_eliminate 3.82% : 0.000997s : 78: opt.transform.opt_a 0.13% : 0.000033s : 1: opt.transform.opt_after_cconv 0.12% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.50% : 0.000130s : 28: opt.transform.opt_b 0.21% : 0.000054s : 2: opt.transform.opt_trans_graph 0.18% : 0.000047s : 4: opt.transform.symbol_engine_opt 11.47% : 0.002992s : 1: opt_a 0.56% : 0.000147s : 1: opt_after_cconv 2.07% : 0.000539s : 1: opt_after_jit_grad 1.25% : 0.000325s : 1: opt_b 22.34% : 0.005826s : 1: optimize 0.10% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000013s : 1: order_py_execute_after_rewriter 0.12% : 0.000031s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.05% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000036s : 1: pre_auto_parallel 0.11% : 0.000028s : 1: py_interpret_to_execute 0.07% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000024s : 1: remove_dup_value 1.44% : 0.000376s : 1: renormalize.infer 1.05% : 0.000274s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000051s : 1: rewriter_after_opt_a 0.23% : 0.000060s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.44% : 0.000114s : 1: symbol_engine_optimizer 0.38% : 0.000100s : 1: tuple_transform 19.35% : 0.005046s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:41.635.656 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0150462, [21] [bootstrap]: 0.00044584 [type_inference]: 0.00489505 [event_method]: 1.298e-05 [auto_monad]: 5.932e-05 [graph_reusing]: 5.63002e-06 [inline]: 2.81e-06 [add_attr]: 0.00351056, [1] [add_attr_with_inline]: 0.00349764, [1] [Cycle 1]: 6.697e-05, [2] [tag_attr]: 1.715e-05 [meta_addattr_fg_expand]: 4.09002e-06 [parallel-infer-symbol]: 3.16999e-06 [pre_auto_parallel]: 3.212e-05 [insert-virtual-dataset]: 2.41998e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 2.02001e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00524243, [53] [py_interpret_to_execute]: 2.349e-05 [rewriter_before_opt_a]: 5.759e-05 [opt_a]: 0.00264858, [2] [Cycle 1]: 0.00189021, [45] [expand_dump_flag]: 3.39001e-06 [switch_simplify]: 2.732e-05 [loop_unroll]: 1.535e-05 [a_1]: 0.00039313 [with_stream_mark]: 1.856e-05 [recompute_prepare]: 9.95002e-06 [updatestate_depend_eliminate]: 4.80999e-06 [updatestate_assign_eliminate]: 3.84002e-06 [updatestate_loads_eliminate]: 3.49001e-06 [parameter_eliminate]: 1.92999e-06 [a_2]: 0.00010175 [accelerated_algorithm]: 9.18002e-06 [shard]: 2.31e-06 [meta_shard_fg_expand]: 1.96e-06 [shard_inline]: 7.40998e-06 [merge_send_recv]: 1.014e-05 [auto_parallel]: 6.91999e-06 [parallel]: 1.876e-05 [flash_sp]: 9.70002e-06 [merge_comm]: 4.55999e-06 [allreduce_fusion]: 4.2e-06 [matmul_add_comm_reduction]: 1.012e-05 [allreduce_slice_to_reducescatter]: 8.09989e-07 [virtual_shard_identity]: 1.019e-05 [virtual_dataset]: 7.67998e-06 [get_grad_eliminate_]: 7.4e-06 [virtual_output]: 7.66999e-06 [merge_forward]: 4.53999e-06 [cell_reuse_recompute_pass]: 1.68002e-06 [offload_activation]: 1.155e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.577e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.359e-05 [set_forward_comm_id_for_comm_node_pass]: 4.79e-06 [meta_fg_expand]: 3.63e-06 [flash_sp_send_recv_attached]: 2.91999e-06 [receive_attached]: 2.27001e-06 [after_resolve]: 1.171e-05 [a_after_grad]: 1.197e-05 [renormalize]: 0.00073068 [add_forward_monad_depend]: 5.72001e-06 [auto_monad_grad]: 2.59001e-06 [auto_monad_eliminator]: 1.69e-05 [cse]: 4.022e-05 [a_3]: 5.699e-05 [Cycle 2]: 0.00074776, [45] [expand_dump_flag]: 2.00002e-06 [switch_simplify]: 8.47e-06 [loop_unroll]: 7.18998e-06 [a_1]: 0.00015411 [with_stream_mark]: 1.456e-05 [recompute_prepare]: 7.50998e-06 [updatestate_depend_eliminate]: 4.45e-06 [updatestate_assign_eliminate]: 2.81999e-06 [updatestate_loads_eliminate]: 2.73e-06 [parameter_eliminate]: 1.24e-06 [a_2]: 8.931e-05 [accelerated_algorithm]: 7.33e-06 [shard]: 1.27e-06 [meta_shard_fg_expand]: 1.86e-06 [shard_inline]: 7.38999e-06 [merge_send_recv]: 6.17001e-06 [auto_parallel]: 6.17999e-06 [parallel]: 6.21e-06 [flash_sp]: 4.02998e-06 [merge_comm]: 4.15e-06 [allreduce_fusion]: 3.71999e-06 [matmul_add_comm_reduction]: 7.58001e-06 [allreduce_slice_to_reducescatter]: 4.80009e-07 [virtual_shard_identity]: 8.22e-06 [virtual_dataset]: 7e-06 [get_grad_eliminate_]: 6.33e-06 [virtual_output]: 6.49999e-06 [merge_forward]: 4.23001e-06 [cell_reuse_recompute_pass]: 1.87999e-06 [offload_activation]: 9.09e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.426e-05 [merge_recompute_call_nodes]: 1.15999e-06 [before_grad]: 1.178e-05 [set_forward_comm_id_for_comm_node_pass]: 4.67e-06 [meta_fg_expand]: 2.94999e-06 [flash_sp_send_recv_attached]: 1.24998e-06 [receive_attached]: 1.59998e-06 [after_resolve]: 1.036e-05 [a_after_grad]: 1.042e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.42999e-06 [auto_monad_grad]: 1.19003e-06 [auto_monad_eliminator]: 9.85002e-06 [cse]: 2.206e-05 [a_3]: 4.703e-05 [py_interpret_to_execute_after_opt_a]: 1.589e-05 [slice_cell_reuse_recomputed_activation]: 2.31e-06 [rewriter_after_opt_a]: 4.403e-05 [convert_after_rewriter]: 7.73999e-06 [order_py_execute_after_rewriter]: 6.06998e-06 [mutable_eliminate]: 0.00073439 [opt_b]: 0.0002579, [1] [Cycle 1]: 0.00024923, [7] [b_1]: 0.000154 [b_2]: 9.63997e-06 [updatestate_depend_eliminate]: 8e-06 [updatestate_assign_eliminate]: 3.74002e-06 [updatestate_loads_eliminate]: 3.76001e-06 [renormalize]: 7.89994e-07 [cse]: 3.145e-05 [optimize_parallel_all_gather_comm]: 2.058e-05 [overlap_param_gather]: 1.90001e-06 [cconv]: 3.068e-05 [loop_unroll]: 0.0006016 [opt_after_cconv]: 0.00013512, [1] [Cycle 1]: 0.00012835, [7] [c_1]: 3.757e-05 [parameter_eliminate]: 5.03002e-06 [updatestate_depend_eliminate]: 8.18999e-06 [updatestate_assign_eliminate]: 4e-06 [updatestate_loads_eliminate]: 3.35003e-06 [cse]: 2.837e-05 [renormalize]: 7.30011e-07 [remove_dup_value]: 1.842e-05 [tuple_transform]: 9.381e-05, [1] [Cycle 1]: 8.828e-05, [4] [d_1]: 5.527e-05 [none_parameter_eliminate]: 2.32001e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 9.10001e-06 [partial_unused_args_eliminate]: 2.17999e-06 [add_recomputation]: 6.63e-05 [cse_after_recomputation]: 3.017e-05, [1] [Cycle 1]: 2.497e-05, [1] [cse]: 1.85e-05 [environ_conv]: 8.28999e-06 [swap_dp_allreduce_reducescatter]: 5.99999e-06 [bias_add_comm_swap]: 3.09001e-06 [label_micro_interleaved_index]: 5.20001e-06 [label_fine_grained_interleaved_index]: 2.72001e-06 [merge_cast_opt]: 1.71e-06 [slice_recompute_activation]: 2.14e-06 [micro_interleaved_order_control]: 2.44001e-06 [assign_add_opt]: 1.32e-06 [ForceFp32Comm]: 8.59989e-07 [remove_cast_before_assign_add]: 1.05001e-06 [full_micro_interleaved_order_control]: 2.31e-06 [reorder_send_recv_between_fp_bp]: 2.61e-06 [comm_op_add_attrs]: 1.32999e-06 [add_comm_op_reuse_tag]: 9.10019e-07 [interleave_split_concat_branches]: 1.14998e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.27e-06 [overlap_opt_shard_grad_in_pipeline]: 1.90001e-06 [control_data_broadcast_order]: 1.642e-05 [grouped_pairwise_exchange_alltoall]: 1.44e-06 [offloading_packed_experts]: 4.80001e-06 [overlap_recompute_and_grad_model_parallel]: 5.51e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.24e-06 [overlap_recompute_allgather_and_fa_grad]: 1.29e-06 [overlap_recompute_comm]: 2.84001e-06 [overlap_grad_ring_attention]: 4.45e-06 [overlap_grad_flash_sp]: 2.394e-05 [begin_end_overlap_inline]: 4.90021e-07 [split_matmul_comm_elemetwise]: 2.24999e-06 [split_layernorm_comm]: 2.01e-06 [handle_group_info]: 1.14e-06 [symbol_engine_optimizer]: 9.166e-05, [1] [Cycle 1]: 8.623e-05, [6] [build]: 4.73001e-06 [elim_shapecalc]: 1.228e-05 [elim_not_effective]: 1.593e-05 [opt_reshape]: 8.45999e-06 [fold_const_symbol]: 1.289e-05 [renormalize]: 1.8999e-07 [detach_backward]: 2.42001e-06 [pipeline_parallel_scheduler]: 1.33002e-06 [auto_monad_reorder]: 2.137e-05 [get_jit_bprop_graph]: 1.59e-06 [rewriter_after_jit_bprop_graph]: 5.81e-06 [opt_after_jit_grad]: 0.00056846 [validate]: 5.229e-05 Sums bootstrap : 0.000446s : 4.25% type_inference : 0.004895s : 46.62% event_method : 0.000013s : 0.12% auto_monad : 0.000059s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000017s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000032s : 0.31% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000023s : 0.22% optimize.rewriter_before_opt_a : 0.000058s : 0.55% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000036s : 0.34% optimize.opt_a.loop_unroll : 0.000023s : 0.21% optimize.opt_a.a_1 : 0.000547s : 5.21% optimize.opt_a.with_stream_mark : 0.000033s : 0.32% optimize.opt_a.recompute_prepare : 0.000017s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000191s : 1.82% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.16% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.14% optimize.opt_a.merge_send_recv : 0.000016s : 0.16% optimize.opt_a.auto_parallel : 0.000013s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.24% optimize.opt_a.flash_sp : 0.000014s : 0.13% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.18% optimize.opt_a.virtual_dataset : 0.000015s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.20% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000022s : 0.21% optimize.opt_a.a_after_grad : 0.000022s : 0.21% optimize.opt_a.renormalize : 0.000731s : 6.96% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.25% optimize.opt_a.cse : 0.000062s : 0.59% optimize.opt_a.a_3 : 0.000104s : 0.99% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000044s : 0.42% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000734s : 6.99% optimize.opt_b.b_1 : 0.000154s : 1.47% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.04% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.04% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000031s : 0.30% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.20% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000031s : 0.29% optimize.loop_unroll : 0.000602s : 5.73% optimize.opt_after_cconv.c_1 : 0.000038s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000028s : 0.27% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.18% optimize.tuple_transform.d_1 : 0.000055s : 0.53% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.09% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000066s : 0.63% optimize.cse_after_recomputation.cse : 0.000018s : 0.18% optimize.environ_conv : 0.000008s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.02% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.03% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000024s : 0.23% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.05% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000021s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.06% opt_after_jit_grad : 0.000568s : 5.41% validate : 0.000052s : 0.50% Time group info: ------[substitution.] 0.000183 28 11.84% : 0.000022s : 2: substitution.cast_eliminate 1.37% : 0.000003s : 3: substitution.elim_not_effective 1.05% : 0.000002s : 3: substitution.fold_const_symbol 3.93% : 0.000007s : 4: substitution.graph_param_transform 73.58% : 0.000135s : 2: substitution.inline 2.77% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.38% : 0.000006s : 6: substitution.remove_not_recompute_node 2.09% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004814 2 90.57% : 0.004361s : 1: type_inference.infer 9.43% : 0.000454s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000133 2 100.00% : 0.000133s : 2: match.inline ------[predicate.] 0.000180 980 0.85% : 0.000002s : 9: predicate.accumulaten_eliminater 0.96% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.64% : 0.000001s : 8: predicate.addn_check_dump 0.89% : 0.000002s : 9: predicate.addn_zero_filter 0.71% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.13% : 0.000004s : 17: predicate.arithmetic_simplify 0.91% : 0.000002s : 9: predicate.cast_eliminate 0.71% : 0.000001s : 8: predicate.check_bprop_eliminate 0.69% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000000s : 4: predicate.const_output_eliminate 0.74% : 0.000001s : 8: predicate.depend_value_elim 0.89% : 0.000002s : 9: predicate.dict_get_item_const_eliminator 0.93% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.81% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.32% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 4: predicate.elim_not_effective 0.59% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.01% : 0.000002s : 13: predicate.environ_get_depend_swap 1.78% : 0.000003s : 21: predicate.environ_get_eliminate 0.99% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.91% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.77% : 0.000003s : 11: predicate.float_depend_g_call 0.72% : 0.000001s : 8: predicate.float_environ_get_switch 1.05% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.25% : 0.000000s : 4: predicate.fold_const_symbol 0.75% : 0.000001s : 8: predicate.get_grad_eliminate 0.24% : 0.000000s : 4: predicate.graph_param_transform 0.77% : 0.000001s : 8: predicate.incorporate_call 0.61% : 0.000001s : 8: predicate.incorporate_call_switch 6.17% : 0.000011s : 44: predicate.inline 1.12% : 0.000002s : 8: predicate.inline_without_move 0.36% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.26% : 0.000002s : 8: predicate.less_batch_normalization 1.60% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.22% : 0.000004s : 26: predicate.load_eliminater 1.49% : 0.000003s : 4: predicate.loop_unroll_after_grad 1.52% : 0.000003s : 16: predicate.loop_unroll_before_grad 2.18% : 0.000004s : 17: predicate.make_slice_get_slice_eliminator 0.83% : 0.000001s : 8: predicate.merge_addn 0.84% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.79% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.69% : 0.000001s : 9: predicate.minmaximum_grad 2.15% : 0.000004s : 4: predicate.mutable_eliminate 0.49% : 0.000001s : 4: predicate.opt_reshape 0.48% : 0.000001s : 4: predicate.parallel_virtual_node 1.20% : 0.000002s : 11: predicate.partial_defer_inline 1.22% : 0.000002s : 13: predicate.partial_eliminate 0.73% : 0.000001s : 9: predicate.print_const_string_wrapper 0.76% : 0.000001s : 8: predicate.reduce_all_const_elim 1.03% : 0.000002s : 9: predicate.reduce_eliminate 2.07% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 8: predicate.remove_not_recompute_node 1.10% : 0.000002s : 17: predicate.replace_applicator 0.57% : 0.000001s : 8: predicate.replace_old_param 0.49% : 0.000001s : 4: predicate.reset_defer_inline 0.94% : 0.000002s : 9: predicate.reshape_eliminate 0.79% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.55% : 0.000001s : 4: predicate.row_tensor_eliminate 0.82% : 0.000001s : 8: predicate.same_eliminate 0.48% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.11% : 0.000002s : 8: predicate.shard_identity_eliminate 0.91% : 0.000002s : 8: predicate.special_op_eliminate 0.91% : 0.000002s : 8: predicate.specialize_transform 1.13% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 1.04% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.46% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.92% : 0.000002s : 11: predicate.switch_defer_inline 1.68% : 0.000003s : 19: predicate.switch_layer_defer_inline 3.82% : 0.000007s : 39: predicate.switch_simplify 0.77% : 0.000001s : 9: predicate.tile_eliminate 0.78% : 0.000001s : 9: predicate.transpose_eliminate 1.62% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.74% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000006s : 25: predicate.tuple_list_get_item_eliminator 1.73% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.68% : 0.000005s : 25: predicate.tuple_list_set_item_eliminator 1.47% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.03% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.79% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.77% : 0.000001s : 4: predicate.value_based_eliminate 0.73% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.73% : 0.000001s : 8: predicate.virtual_output_eliminate 0.34% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.61% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000237 5 8.49% : 0.000020s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.51% : 0.000217s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025728 192 0.01% : 0.000004s : 1: ForceFp32Comm 13.67% : 0.003517s : 1: add_attr 13.61% : 0.003502s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.28% : 0.000071s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.25% : 0.000065s : 1: auto_monad 0.10% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.84% : 0.000474s : 1: bootstrap 0.13% : 0.000034s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.13% : 0.000033s : 1: cse_after_recomputation 0.02% : 0.000006s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.08% : 0.000020s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 2.38% : 0.000611s : 1: loop_unroll 0.02% : 0.000005s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.90% : 0.000745s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.08% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000021s : 1: opt.transform.mutable_eliminate 3.94% : 0.001013s : 78: opt.transform.opt_a 0.14% : 0.000036s : 1: opt.transform.opt_after_cconv 0.12% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.50% : 0.000129s : 28: opt.transform.opt_b 0.24% : 0.000061s : 2: opt.transform.opt_trans_graph 0.18% : 0.000045s : 4: opt.transform.symbol_engine_opt 10.31% : 0.002652s : 1: opt_a 0.54% : 0.000139s : 1: opt_after_cconv 2.25% : 0.000580s : 1: opt_after_jit_grad 1.02% : 0.000262s : 1: opt_b 20.40% : 0.005248s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.11% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000006s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000037s : 1: pre_auto_parallel 0.11% : 0.000027s : 1: py_interpret_to_execute 0.08% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000022s : 1: remove_dup_value 1.66% : 0.000427s : 1: renormalize.infer 1.14% : 0.000294s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000049s : 1: rewriter_after_opt_a 0.24% : 0.000062s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.37% : 0.000094s : 1: symbol_engine_optimizer 0.38% : 0.000097s : 1: tuple_transform 19.15% : 0.004927s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:41.835.999 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:41.836.275 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0141382, [21] [bootstrap]: 0.00045962 [type_inference]: 0.00471857 [event_method]: 1.132e-05 [auto_monad]: 5.539e-05 [graph_reusing]: 5.59e-06 [inline]: 2.09e-06 [add_attr]: 0.00311404, [1] [add_attr_with_inline]: 0.00310571, [1] [Cycle 1]: 6.383e-05, [2] [tag_attr]: 1.488e-05 [meta_addattr_fg_expand]: 3.78999e-06 [parallel-infer-symbol]: 3.25e-06 [pre_auto_parallel]: 2.495e-05 [insert-virtual-dataset]: 2.40002e-06 [parallel-infer-symbol-second]: 6.79982e-07 [dataset_repeat_opt]: 2.01998e-06 [pipeline_split]: 1.72001e-06 [optimize]: 0.00460862, [53] [py_interpret_to_execute]: 2.008e-05 [rewriter_before_opt_a]: 4.893e-05 [opt_a]: 0.00242747, [2] [Cycle 1]: 0.00164388, [45] [expand_dump_flag]: 2.76e-06 [switch_simplify]: 2.508e-05 [loop_unroll]: 1.366e-05 [a_1]: 0.00028409 [with_stream_mark]: 1.639e-05 [recompute_prepare]: 7.94002e-06 [updatestate_depend_eliminate]: 4.3e-06 [updatestate_assign_eliminate]: 3.48e-06 [updatestate_loads_eliminate]: 2.86999e-06 [parameter_eliminate]: 1.87999e-06 [a_2]: 0.00010755 [accelerated_algorithm]: 6.41e-06 [shard]: 2.22001e-06 [meta_shard_fg_expand]: 1.86003e-06 [shard_inline]: 5.89999e-06 [merge_send_recv]: 8.72e-06 [auto_parallel]: 6.54999e-06 [parallel]: 1.745e-05 [flash_sp]: 7.7e-06 [merge_comm]: 4.25999e-06 [allreduce_fusion]: 3.46001e-06 [matmul_add_comm_reduction]: 8.95999e-06 [allreduce_slice_to_reducescatter]: 5.99975e-07 [virtual_shard_identity]: 7.66001e-06 [virtual_dataset]: 6.07001e-06 [get_grad_eliminate_]: 5.82001e-06 [virtual_output]: 6.32001e-06 [merge_forward]: 3.66999e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 9.77001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.398e-05 [merge_recompute_call_nodes]: 1.43002e-06 [before_grad]: 1.051e-05 [set_forward_comm_id_for_comm_node_pass]: 3.51001e-06 [meta_fg_expand]: 2.76999e-06 [flash_sp_send_recv_attached]: 2.51998e-06 [receive_attached]: 1.99e-06 [after_resolve]: 9.59999e-06 [a_after_grad]: 8.96998e-06 [renormalize]: 0.00052166 [add_forward_monad_depend]: 5.00999e-06 [auto_monad_grad]: 2.27999e-06 [auto_monad_eliminator]: 1.41e-05 [cse]: 3.057e-05 [a_3]: 5.91e-05 [Cycle 2]: 0.00077087, [45] [expand_dump_flag]: 1.13001e-06 [switch_simplify]: 6.84999e-06 [loop_unroll]: 5.72999e-06 [a_1]: 0.00010658 [with_stream_mark]: 1.101e-05 [recompute_prepare]: 6.22001e-06 [updatestate_depend_eliminate]: 3.06001e-06 [updatestate_assign_eliminate]: 2.47001e-06 [updatestate_loads_eliminate]: 2.66e-06 [parameter_eliminate]: 1.10999e-06 [a_2]: 9.577e-05 [accelerated_algorithm]: 6.23e-06 [shard]: 1.08001e-06 [meta_shard_fg_expand]: 1.31002e-06 [shard_inline]: 5.93998e-06 [merge_send_recv]: 4.97e-06 [auto_parallel]: 5.68997e-06 [parallel]: 4.44002e-06 [flash_sp]: 3.48999e-06 [merge_comm]: 3.3e-06 [allreduce_fusion]: 3.06001e-06 [matmul_add_comm_reduction]: 5.89999e-06 [allreduce_slice_to_reducescatter]: 3.89991e-07 [virtual_shard_identity]: 6.29999e-06 [virtual_dataset]: 5.77999e-06 [get_grad_eliminate_]: 5.53002e-06 [virtual_output]: 5.44e-06 [merge_forward]: 2.66e-06 [cell_reuse_recompute_pass]: 1.25999e-06 [offload_activation]: 6.21e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.492e-05 [merge_recompute_call_nodes]: 8.70001e-07 [before_grad]: 9.84001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.56001e-06 [meta_fg_expand]: 2.09999e-06 [flash_sp_send_recv_attached]: 7.89994e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 8.88002e-06 [a_after_grad]: 8.00999e-06 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.29e-06 [auto_monad_grad]: 7.80012e-07 [auto_monad_eliminator]: 7.81001e-06 [cse]: 1.345e-05 [a_3]: 4.779e-05 [py_interpret_to_execute_after_opt_a]: 1.064e-05 [slice_cell_reuse_recomputed_activation]: 4.47e-06 [rewriter_after_opt_a]: 3.668e-05 [convert_after_rewriter]: 9.83002e-06 [order_py_execute_after_rewriter]: 8.41002e-06 [mutable_eliminate]: 0.00048798 [opt_b]: 0.00026082, [1] [Cycle 1]: 0.00025243, [7] [b_1]: 0.00016255 [b_2]: 7.20003e-06 [updatestate_depend_eliminate]: 5.32999e-06 [updatestate_assign_eliminate]: 2.54999e-06 [updatestate_loads_eliminate]: 2.16998e-06 [renormalize]: 3.30008e-07 [cse]: 1.818e-05 [optimize_parallel_all_gather_comm]: 1.773e-05 [overlap_param_gather]: 4.80999e-06 [cconv]: 2.678e-05 [loop_unroll]: 0.00046215 [opt_after_cconv]: 0.00011984, [1] [Cycle 1]: 0.00011167, [7] [c_1]: 2.692e-05 [parameter_eliminate]: 2.31e-06 [updatestate_depend_eliminate]: 5.05999e-06 [updatestate_assign_eliminate]: 2.37001e-06 [updatestate_loads_eliminate]: 2.43e-06 [cse]: 1.713e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 1.726e-05 [tuple_transform]: 8.73e-05, [1] [Cycle 1]: 8.02e-05, [4] [d_1]: 4.043e-05 [none_parameter_eliminate]: 1.54998e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 7.04001e-06 [partial_unused_args_eliminate]: 4.45e-06 [add_recomputation]: 4.898e-05 [cse_after_recomputation]: 2.818e-05, [1] [Cycle 1]: 2.108e-05, [1] [cse]: 1.209e-05 [environ_conv]: 8.40001e-06 [swap_dp_allreduce_reducescatter]: 7.88001e-06 [bias_add_comm_swap]: 5.36002e-06 [label_micro_interleaved_index]: 6.79999e-06 [label_fine_grained_interleaved_index]: 5.24e-06 [merge_cast_opt]: 3.76001e-06 [slice_recompute_activation]: 4.53999e-06 [micro_interleaved_order_control]: 4.36002e-06 [assign_add_opt]: 3.61001e-06 [ForceFp32Comm]: 3.41001e-06 [remove_cast_before_assign_add]: 3.23e-06 [full_micro_interleaved_order_control]: 4.45999e-06 [reorder_send_recv_between_fp_bp]: 5.13002e-06 [comm_op_add_attrs]: 3.93001e-06 [add_comm_op_reuse_tag]: 3.40003e-06 [interleave_split_concat_branches]: 3.64002e-06 [interleave_parallel_branches]: 3.42002e-06 [overlap_opt_shard_in_pipeline]: 3.58e-06 [overlap_opt_shard_grad_in_pipeline]: 4.13001e-06 [control_data_broadcast_order]: 1.608e-05 [grouped_pairwise_exchange_alltoall]: 4.28001e-06 [offloading_packed_experts]: 6.33002e-06 [overlap_recompute_and_grad_model_parallel]: 6.83e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.41999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.36001e-06 [overlap_recompute_comm]: 4.60001e-06 [overlap_grad_ring_attention]: 6.91001e-06 [overlap_grad_flash_sp]: 2.135e-05 [begin_end_overlap_inline]: 3.21001e-06 [split_matmul_comm_elemetwise]: 4.37998e-06 [split_layernorm_comm]: 4.15999e-06 [handle_group_info]: 3.44001e-06 [symbol_engine_optimizer]: 9.494e-05, [1] [Cycle 1]: 8.814e-05, [6] [build]: 2.93e-06 [elim_shapecalc]: 9.61e-06 [elim_not_effective]: 1.308e-05 [opt_reshape]: 6.79999e-06 [fold_const_symbol]: 9.68997e-06 [renormalize]: 2.29978e-07 [detach_backward]: 3.45998e-06 [pipeline_parallel_scheduler]: 1.72999e-06 [auto_monad_reorder]: 1.949e-05 [get_jit_bprop_graph]: 1.39998e-06 [rewriter_after_jit_bprop_graph]: 4.60999e-06 [opt_after_jit_grad]: 0.00048893 [validate]: 3.667e-05 Sums bootstrap : 0.000460s : 4.93% type_inference : 0.004719s : 50.62% event_method : 0.000011s : 0.12% auto_monad : 0.000055s : 0.59% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000025s : 0.27% insert-virtual-dataset : 0.000002s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000020s : 0.22% optimize.rewriter_before_opt_a : 0.000049s : 0.52% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000032s : 0.34% optimize.opt_a.loop_unroll : 0.000019s : 0.21% optimize.opt_a.a_1 : 0.000391s : 4.19% optimize.opt_a.with_stream_mark : 0.000027s : 0.29% optimize.opt_a.recompute_prepare : 0.000014s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000203s : 2.18% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.14% optimize.opt_a.shard : 0.000003s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.15% optimize.opt_a.auto_parallel : 0.000012s : 0.13% optimize.opt_a.parallel : 0.000022s : 0.23% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.15% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.13% optimize.opt_a.merge_forward : 0.000006s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000018s : 0.20% optimize.opt_a.a_after_grad : 0.000017s : 0.18% optimize.opt_a.renormalize : 0.000522s : 5.60% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.24% optimize.opt_a.cse : 0.000044s : 0.47% optimize.opt_a.a_3 : 0.000107s : 1.15% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.05% optimize.rewriter_after_opt_a : 0.000037s : 0.39% optimize.convert_after_rewriter : 0.000010s : 0.11% optimize.order_py_execute_after_rewriter : 0.000008s : 0.09% optimize.mutable_eliminate : 0.000488s : 5.23% optimize.opt_b.b_1 : 0.000163s : 1.74% optimize.opt_b.b_2 : 0.000007s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000027s : 0.29% optimize.loop_unroll : 0.000462s : 4.96% optimize.opt_after_cconv.c_1 : 0.000027s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.cse : 0.000017s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.19% optimize.tuple_transform.d_1 : 0.000040s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.05% optimize.add_recomputation : 0.000049s : 0.53% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000008s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.06% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.06% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000004s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.04% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.06% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.04% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000016s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.05% optimize.offloading_packed_experts : 0.000006s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000003s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000021s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.04% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000019s : 0.21% get_jit_bprop_graph : 0.000001s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000489s : 5.24% validate : 0.000037s : 0.39% Time group info: ------[substitution.] 0.000107 19 1.67% : 0.000002s : 2: substitution.elim_not_effective 1.22% : 0.000001s : 2: substitution.fold_const_symbol 5.34% : 0.000006s : 3: substitution.graph_param_transform 80.54% : 0.000086s : 2: substitution.inline 3.31% : 0.000004s : 4: substitution.j_node_and_user_rematch 4.18% : 0.000004s : 4: substitution.remove_not_recompute_node 3.73% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004671 2 91.18% : 0.004259s : 1: type_inference.infer 8.82% : 0.000412s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000085 2 100.00% : 0.000085s : 2: match.inline ------[predicate.] 0.000131 754 0.83% : 0.000001s : 7: predicate.accumulaten_eliminater 0.92% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.70% : 0.000001s : 6: predicate.addn_check_dump 0.75% : 0.000001s : 7: predicate.addn_zero_filter 0.73% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.22% : 0.000003s : 13: predicate.arithmetic_simplify 0.86% : 0.000001s : 7: predicate.cast_eliminate 0.73% : 0.000001s : 6: predicate.check_bprop_eliminate 0.64% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.74% : 0.000001s : 6: predicate.depend_value_elim 0.80% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.86% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.77% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.22% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 3: predicate.elim_not_effective 0.47% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.06% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.08% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.03% : 0.000001s : 10: predicate.environ_get_depend_swap 1.81% : 0.000002s : 16: predicate.environ_get_eliminate 1.04% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.96% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.01% : 0.000003s : 9: predicate.float_depend_g_call 0.67% : 0.000001s : 6: predicate.float_environ_get_switch 0.96% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 3: predicate.fold_const_symbol 1.00% : 0.000001s : 6: predicate.get_grad_eliminate 0.28% : 0.000000s : 3: predicate.graph_param_transform 0.87% : 0.000001s : 6: predicate.incorporate_call 0.68% : 0.000001s : 6: predicate.incorporate_call_switch 6.39% : 0.000008s : 34: predicate.inline 0.93% : 0.000001s : 6: predicate.inline_without_move 0.42% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.06% : 0.000001s : 6: predicate.less_batch_normalization 1.76% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.04% : 0.000003s : 20: predicate.load_eliminater 1.35% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.68% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.80% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.69% : 0.000001s : 6: predicate.merge_addn 0.71% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.76% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.71% : 0.000001s : 7: predicate.minmaximum_grad 1.42% : 0.000002s : 3: predicate.mutable_eliminate 0.44% : 0.000001s : 3: predicate.opt_reshape 0.63% : 0.000001s : 3: predicate.parallel_virtual_node 1.24% : 0.000002s : 9: predicate.partial_defer_inline 1.24% : 0.000002s : 10: predicate.partial_eliminate 0.83% : 0.000001s : 7: predicate.print_const_string_wrapper 0.78% : 0.000001s : 6: predicate.reduce_all_const_elim 1.07% : 0.000001s : 7: predicate.reduce_eliminate 2.05% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.66% : 0.000001s : 6: predicate.remove_not_recompute_node 1.20% : 0.000002s : 13: predicate.replace_applicator 0.72% : 0.000001s : 6: predicate.replace_old_param 0.35% : 0.000000s : 3: predicate.reset_defer_inline 0.80% : 0.000001s : 7: predicate.reshape_eliminate 0.79% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.69% : 0.000001s : 3: predicate.row_tensor_eliminate 1.22% : 0.000002s : 6: predicate.same_eliminate 0.54% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.03% : 0.000001s : 6: predicate.shard_identity_eliminate 0.88% : 0.000001s : 6: predicate.special_op_eliminate 0.89% : 0.000001s : 6: predicate.specialize_transform 1.38% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.49% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.98% : 0.000001s : 9: predicate.switch_defer_inline 1.74% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.65% : 0.000006s : 32: predicate.switch_simplify 0.74% : 0.000001s : 7: predicate.tile_eliminate 0.98% : 0.000001s : 7: predicate.transpose_eliminate 1.58% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.42% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.36% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.54% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.98% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.85% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 3: predicate.value_based_eliminate 0.77% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.77% : 0.000001s : 6: predicate.virtual_output_eliminate 0.31% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.64% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000202 5 8.34% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.66% : 0.000185s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023267 192 0.03% : 0.000006s : 1: ForceFp32Comm 13.42% : 0.003123s : 1: add_attr 13.36% : 0.003109s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000052s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.28% : 0.000064s : 1: auto_monad 0.11% : 0.000026s : 1: auto_monad_reorder 0.03% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 2.17% : 0.000504s : 1: bootstrap 0.13% : 0.000030s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000019s : 1: control_data_broadcast_order 0.06% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000031s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000019s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.09% : 0.000021s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 2.01% : 0.000468s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.12% : 0.000494s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 3.25% : 0.000756s : 78: opt.transform.opt_a 0.11% : 0.000026s : 1: opt.transform.opt_after_cconv 0.10% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.41% : 0.000097s : 28: opt.transform.opt_b 0.19% : 0.000045s : 2: opt.transform.opt_trans_graph 0.15% : 0.000035s : 4: opt.transform.symbol_engine_opt 10.45% : 0.002431s : 1: opt_a 0.53% : 0.000123s : 1: opt_after_cconv 2.15% : 0.000500s : 1: opt_after_jit_grad 1.14% : 0.000264s : 1: opt_b 21.26% : 0.004947s : 1: optimize 0.09% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.11% : 0.000024s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000032s : 1: pre_auto_parallel 0.10% : 0.000023s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000020s : 1: remove_dup_value 1.32% : 0.000308s : 1: renormalize.infer 0.89% : 0.000207s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000040s : 1: rewriter_after_opt_a 0.23% : 0.000052s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000098s : 1: symbol_engine_optimizer 0.39% : 0.000090s : 1: tuple_transform 20.40% : 0.004747s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:42.295.11 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0129814, [21] [bootstrap]: 0.00041112 [type_inference]: 0.00465023 [event_method]: 1.144e-05 [auto_monad]: 5.352e-05 [graph_reusing]: 4.80001e-06 [inline]: 2.07999e-06 [add_attr]: 0.00313171, [1] [add_attr_with_inline]: 0.00312302, [1] [Cycle 1]: 5.165e-05, [2] [tag_attr]: 1.418e-05 [meta_addattr_fg_expand]: 3.58e-06 [parallel-infer-symbol]: 2.98998e-06 [pre_auto_parallel]: 2.501e-05 [insert-virtual-dataset]: 2.44001e-06 [parallel-infer-symbol-second]: 7.80012e-07 [dataset_repeat_opt]: 1.89e-06 [pipeline_split]: 1.92999e-06 [optimize]: 0.00401188, [53] [py_interpret_to_execute]: 1.722e-05 [rewriter_before_opt_a]: 4.472e-05 [opt_a]: 0.0020693, [2] [Cycle 1]: 0.00145031, [45] [expand_dump_flag]: 2.91e-06 [switch_simplify]: 2.478e-05 [loop_unroll]: 1.404e-05 [a_1]: 0.00028165 [with_stream_mark]: 1.516e-05 [recompute_prepare]: 7.80998e-06 [updatestate_depend_eliminate]: 4.16001e-06 [updatestate_assign_eliminate]: 3.3e-06 [updatestate_loads_eliminate]: 3.75e-06 [parameter_eliminate]: 2.00002e-06 [a_2]: 7.835e-05 [accelerated_algorithm]: 6.49999e-06 [shard]: 1.90001e-06 [meta_shard_fg_expand]: 1.58002e-06 [shard_inline]: 5.69e-06 [merge_send_recv]: 8.34998e-06 [auto_parallel]: 6.44999e-06 [parallel]: 1.836e-05 [flash_sp]: 6.76e-06 [merge_comm]: 3.82002e-06 [allreduce_fusion]: 3.38999e-06 [matmul_add_comm_reduction]: 9.25999e-06 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 7.69002e-06 [virtual_dataset]: 6.11e-06 [get_grad_eliminate_]: 5.61003e-06 [virtual_output]: 5.92001e-06 [merge_forward]: 3.56999e-06 [cell_reuse_recompute_pass]: 1.40001e-06 [offload_activation]: 9.87999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.356e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 9.77999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.81001e-06 [meta_fg_expand]: 2.85002e-06 [flash_sp_send_recv_attached]: 2.56e-06 [receive_attached]: 1.93002e-06 [after_resolve]: 9.70002e-06 [a_after_grad]: 8.90001e-06 [renormalize]: 0.00051133 [add_forward_monad_depend]: 4.93001e-06 [auto_monad_grad]: 2.39001e-06 [auto_monad_eliminator]: 1.376e-05 [cse]: 3.049e-05 [a_3]: 4.661e-05 [Cycle 2]: 0.00060898, [45] [expand_dump_flag]: 1.41002e-06 [switch_simplify]: 7.3e-06 [loop_unroll]: 6.05002e-06 [a_1]: 0.00010678 [with_stream_mark]: 1.044e-05 [recompute_prepare]: 6.12999e-06 [updatestate_depend_eliminate]: 3.14999e-06 [updatestate_assign_eliminate]: 2.25002e-06 [updatestate_loads_eliminate]: 2.93e-06 [parameter_eliminate]: 9.20001e-07 [a_2]: 6.861e-05 [accelerated_algorithm]: 5.92001e-06 [shard]: 1.13001e-06 [meta_shard_fg_expand]: 1.35999e-06 [shard_inline]: 5.97999e-06 [merge_send_recv]: 4.92e-06 [auto_parallel]: 5.50001e-06 [parallel]: 4.81997e-06 [flash_sp]: 3.55e-06 [merge_comm]: 3.15002e-06 [allreduce_fusion]: 3.04001e-06 [matmul_add_comm_reduction]: 8.35001e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 6.55002e-06 [virtual_dataset]: 6.26998e-06 [get_grad_eliminate_]: 6.15002e-06 [virtual_output]: 5.82999e-06 [merge_forward]: 2.48002e-06 [cell_reuse_recompute_pass]: 1.41998e-06 [offload_activation]: 6.48e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.334e-05 [merge_recompute_call_nodes]: 1.27e-06 [before_grad]: 8.77e-06 [set_forward_comm_id_for_comm_node_pass]: 3.85e-06 [meta_fg_expand]: 2.01003e-06 [flash_sp_send_recv_attached]: 9.20001e-07 [receive_attached]: 1.15001e-06 [after_resolve]: 8.99e-06 [a_after_grad]: 8.12e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.35001e-06 [auto_monad_grad]: 1.22999e-06 [auto_monad_eliminator]: 6.89999e-06 [cse]: 1.364e-05 [a_3]: 3.455e-05 [py_interpret_to_execute_after_opt_a]: 8.32e-06 [slice_cell_reuse_recomputed_activation]: 2.43e-06 [rewriter_after_opt_a]: 3.434e-05 [convert_after_rewriter]: 6.29001e-06 [order_py_execute_after_rewriter]: 5.02999e-06 [mutable_eliminate]: 0.00051979 [opt_b]: 0.00019675, [1] [Cycle 1]: 0.00019062, [7] [b_1]: 0.0001181 [b_2]: 7.43999e-06 [updatestate_depend_eliminate]: 5.15999e-06 [updatestate_assign_eliminate]: 2.29001e-06 [updatestate_loads_eliminate]: 2.74999e-06 [renormalize]: 5.8001e-07 [cse]: 1.927e-05 [optimize_parallel_all_gather_comm]: 1.56e-05 [overlap_param_gather]: 1.96e-06 [cconv]: 2.485e-05 [loop_unroll]: 0.00042961 [opt_after_cconv]: 9.707e-05, [1] [Cycle 1]: 9.148e-05, [7] [c_1]: 2.796e-05 [parameter_eliminate]: 2.56e-06 [updatestate_depend_eliminate]: 4.67998e-06 [updatestate_assign_eliminate]: 2.40002e-06 [updatestate_loads_eliminate]: 2.18998e-06 [cse]: 1.73e-05 [renormalize]: 3.70026e-07 [remove_dup_value]: 1.5e-05 [tuple_transform]: 6.988e-05, [1] [Cycle 1]: 6.547e-05, [4] [d_1]: 3.906e-05 [none_parameter_eliminate]: 1.55999e-06 [renormalize]: 3.00002e-07 [switch_simplify]: 6.34999e-06 [partial_unused_args_eliminate]: 1.70001e-06 [add_recomputation]: 4.6e-05 [cse_after_recomputation]: 2.159e-05, [1] [Cycle 1]: 1.724e-05, [1] [cse]: 1.202e-05 [environ_conv]: 5.25001e-06 [swap_dp_allreduce_reducescatter]: 5.30999e-06 [bias_add_comm_swap]: 2.67001e-06 [label_micro_interleaved_index]: 3.84002e-06 [label_fine_grained_interleaved_index]: 2.57001e-06 [merge_cast_opt]: 1.30999e-06 [slice_recompute_activation]: 2.32999e-06 [micro_interleaved_order_control]: 2.44001e-06 [assign_add_opt]: 1.52001e-06 [ForceFp32Comm]: 7.89994e-07 [remove_cast_before_assign_add]: 9.39996e-07 [full_micro_interleaved_order_control]: 2.31e-06 [reorder_send_recv_between_fp_bp]: 2.84999e-06 [comm_op_add_attrs]: 1.14998e-06 [add_comm_op_reuse_tag]: 9.99979e-07 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.01002e-06 [overlap_opt_shard_in_pipeline]: 1.17999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.36e-06 [control_data_broadcast_order]: 1.184e-05 [grouped_pairwise_exchange_alltoall]: 1.38002e-06 [offloading_packed_experts]: 3.86001e-06 [overlap_recompute_and_grad_model_parallel]: 4.52e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12e-06 [overlap_recompute_allgather_and_fa_grad]: 1.47001e-06 [overlap_recompute_comm]: 2.02999e-06 [overlap_grad_ring_attention]: 4.06001e-06 [overlap_grad_flash_sp]: 1.867e-05 [begin_end_overlap_inline]: 5.09986e-07 [split_matmul_comm_elemetwise]: 2.07999e-06 [split_layernorm_comm]: 1.76e-06 [handle_group_info]: 1.30999e-06 [symbol_engine_optimizer]: 7.257e-05, [1] [Cycle 1]: 6.837e-05, [6] [build]: 2.69001e-06 [elim_shapecalc]: 9.04e-06 [elim_not_effective]: 1.2e-05 [opt_reshape]: 6.84999e-06 [fold_const_symbol]: 9.70002e-06 [renormalize]: 2.00002e-07 [detach_backward]: 2.03997e-06 [pipeline_parallel_scheduler]: 1.48002e-06 [auto_monad_reorder]: 1.667e-05 [get_jit_bprop_graph]: 1.86e-06 [rewriter_after_jit_bprop_graph]: 4.08999e-06 [opt_after_jit_grad]: 0.00046 [validate]: 3.836e-05 Sums bootstrap : 0.000411s : 4.61% type_inference : 0.004650s : 52.17% event_method : 0.000011s : 0.13% auto_monad : 0.000054s : 0.60% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000025s : 0.28% insert-virtual-dataset : 0.000002s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000017s : 0.19% optimize.rewriter_before_opt_a : 0.000045s : 0.50% optimize.opt_a.expand_dump_flag : 0.000004s : 0.05% optimize.opt_a.switch_simplify : 0.000032s : 0.36% optimize.opt_a.loop_unroll : 0.000020s : 0.23% optimize.opt_a.a_1 : 0.000388s : 4.36% optimize.opt_a.with_stream_mark : 0.000026s : 0.29% optimize.opt_a.recompute_prepare : 0.000014s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000147s : 1.65% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.13% optimize.opt_a.merge_send_recv : 0.000013s : 0.15% optimize.opt_a.auto_parallel : 0.000012s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.26% optimize.opt_a.flash_sp : 0.000010s : 0.12% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000006s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.20% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.13% optimize.opt_a.virtual_output : 0.000012s : 0.13% optimize.opt_a.merge_forward : 0.000006s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000019s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000019s : 0.21% optimize.opt_a.a_after_grad : 0.000017s : 0.19% optimize.opt_a.renormalize : 0.000511s : 5.74% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.23% optimize.opt_a.cse : 0.000044s : 0.50% optimize.opt_a.a_3 : 0.000081s : 0.91% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.03% optimize.rewriter_after_opt_a : 0.000034s : 0.39% optimize.convert_after_rewriter : 0.000006s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.06% optimize.mutable_eliminate : 0.000520s : 5.83% optimize.opt_b.b_1 : 0.000118s : 1.33% optimize.opt_b.b_2 : 0.000007s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000019s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000025s : 0.28% optimize.loop_unroll : 0.000430s : 4.82% optimize.opt_after_cconv.c_1 : 0.000028s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.17% optimize.tuple_transform.d_1 : 0.000039s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000046s : 0.52% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000005s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.03% optimize.micro_interleaved_order_control : 0.000002s : 0.03% optimize.assign_add_opt : 0.000002s : 0.02% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.03% optimize.control_data_broadcast_order : 0.000012s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.02% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.05% optimize.overlap_grad_flash_sp : 0.000019s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.02% auto_monad_reorder : 0.000017s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.05% opt_after_jit_grad : 0.000460s : 5.16% validate : 0.000038s : 0.43% Time group info: ------[substitution.] 0.000105 19 1.63% : 0.000002s : 2: substitution.elim_not_effective 1.53% : 0.000002s : 2: substitution.fold_const_symbol 5.51% : 0.000006s : 3: substitution.graph_param_transform 80.03% : 0.000084s : 2: substitution.inline 2.87% : 0.000003s : 4: substitution.j_node_and_user_rematch 5.19% : 0.000005s : 4: substitution.remove_not_recompute_node 3.23% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004604 2 90.96% : 0.004188s : 1: type_inference.infer 9.04% : 0.000416s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000082 2 100.00% : 0.000082s : 2: match.inline ------[predicate.] 0.000131 754 0.89% : 0.000001s : 7: predicate.accumulaten_eliminater 1.27% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.63% : 0.000001s : 6: predicate.addn_check_dump 0.95% : 0.000001s : 7: predicate.addn_zero_filter 0.68% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.24% : 0.000003s : 13: predicate.arithmetic_simplify 0.82% : 0.000001s : 7: predicate.cast_eliminate 0.76% : 0.000001s : 6: predicate.check_bprop_eliminate 0.75% : 0.000001s : 6: predicate.compare_switch_simplify 0.23% : 0.000000s : 3: predicate.const_output_eliminate 0.73% : 0.000001s : 6: predicate.depend_value_elim 0.91% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.95% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.72% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.37% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.27% : 0.000000s : 3: predicate.elim_not_effective 0.52% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_depend_swap 1.67% : 0.000002s : 16: predicate.environ_get_eliminate 1.05% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.92% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.96% : 0.000003s : 9: predicate.float_depend_g_call 0.63% : 0.000001s : 6: predicate.float_environ_get_switch 0.97% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.24% : 0.000000s : 3: predicate.fold_const_symbol 0.84% : 0.000001s : 6: predicate.get_grad_eliminate 0.27% : 0.000000s : 3: predicate.graph_param_transform 0.76% : 0.000001s : 6: predicate.incorporate_call 0.65% : 0.000001s : 6: predicate.incorporate_call_switch 6.48% : 0.000008s : 34: predicate.inline 1.05% : 0.000001s : 6: predicate.inline_without_move 0.47% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.99% : 0.000001s : 6: predicate.less_batch_normalization 1.83% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.16% : 0.000003s : 20: predicate.load_eliminater 1.35% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.86% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.75% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.73% : 0.000001s : 6: predicate.merge_addn 0.67% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.73% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.70% : 0.000001s : 7: predicate.minmaximum_grad 1.45% : 0.000002s : 3: predicate.mutable_eliminate 0.41% : 0.000001s : 3: predicate.opt_reshape 0.47% : 0.000001s : 3: predicate.parallel_virtual_node 1.22% : 0.000002s : 9: predicate.partial_defer_inline 1.23% : 0.000002s : 10: predicate.partial_eliminate 0.80% : 0.000001s : 7: predicate.print_const_string_wrapper 0.73% : 0.000001s : 6: predicate.reduce_all_const_elim 0.99% : 0.000001s : 7: predicate.reduce_eliminate 2.16% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.67% : 0.000001s : 6: predicate.remove_not_recompute_node 1.24% : 0.000002s : 13: predicate.replace_applicator 0.59% : 0.000001s : 6: predicate.replace_old_param 0.37% : 0.000000s : 3: predicate.reset_defer_inline 0.99% : 0.000001s : 7: predicate.reshape_eliminate 0.75% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.50% : 0.000001s : 3: predicate.row_tensor_eliminate 0.99% : 0.000001s : 6: predicate.same_eliminate 0.51% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.15% : 0.000002s : 6: predicate.shard_identity_eliminate 0.88% : 0.000001s : 6: predicate.special_op_eliminate 0.89% : 0.000001s : 6: predicate.specialize_transform 1.01% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 1.00% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.04% : 0.000001s : 9: predicate.switch_defer_inline 1.64% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.33% : 0.000006s : 32: predicate.switch_simplify 0.75% : 0.000001s : 7: predicate.tile_eliminate 0.83% : 0.000001s : 7: predicate.transpose_eliminate 1.66% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.35% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.17% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.56% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.33% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.57% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.98% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.82% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.49% : 0.000001s : 3: predicate.value_based_eliminate 0.84% : 0.000001s : 6: predicate.virtual_dataset_eliminate 1.12% : 0.000001s : 6: predicate.virtual_output_eliminate 0.38% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.66% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000208 5 7.99% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.01% : 0.000191s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.021511 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.58% : 0.003136s : 1: add_attr 14.53% : 0.003126s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000050s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000059s : 1: auto_monad 0.09% : 0.000020s : 1: auto_monad_reorder 0.02% : 0.000003s : 1: begin_end_overlap_inline 0.03% : 0.000005s : 1: bias_add_comm_swap 2.04% : 0.000438s : 1: bootstrap 0.13% : 0.000028s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000015s : 1: control_data_broadcast_order 0.04% : 0.000009s : 1: convert_after_rewriter 0.11% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.03% : 0.000006s : 1: detach_backward 0.04% : 0.000008s : 1: environ_conv 0.08% : 0.000018s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000008s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000005s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 2.03% : 0.000437s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.46% : 0.000529s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000016s : 1: opt.transform.mutable_eliminate 3.47% : 0.000746s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.44% : 0.000094s : 28: opt.transform.opt_b 0.20% : 0.000043s : 2: opt.transform.opt_trans_graph 0.16% : 0.000034s : 4: opt.transform.symbol_engine_opt 9.63% : 0.002073s : 1: opt_a 0.47% : 0.000101s : 1: opt_after_cconv 2.18% : 0.000469s : 1: opt_after_jit_grad 0.93% : 0.000200s : 1: opt_b 18.67% : 0.004016s : 1: optimize 0.09% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000022s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.13% : 0.000029s : 1: pre_auto_parallel 0.10% : 0.000021s : 1: py_interpret_to_execute 0.05% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000019s : 1: remove_dup_value 1.30% : 0.000280s : 1: renormalize.infer 1.04% : 0.000224s : 1: renormalize.specialize 0.03% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000038s : 1: rewriter_after_opt_a 0.23% : 0.000049s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000075s : 1: symbol_engine_optimizer 0.34% : 0.000073s : 1: tuple_transform 21.70% : 0.004668s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:42.223.088 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:42.223.360 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0161397, [21] [bootstrap]: 0.00044435 [type_inference]: 0.00575359 [event_method]: 1.411e-05 [auto_monad]: 5.899e-05 [graph_reusing]: 5.84e-06 [inline]: 2.87002e-06 [add_attr]: 0.00356212, [1] [add_attr_with_inline]: 0.00355136, [1] [Cycle 1]: 8.09e-05, [2] [tag_attr]: 1.648e-05 [meta_addattr_fg_expand]: 4.13001e-06 [parallel-infer-symbol]: 3.36999e-06 [pre_auto_parallel]: 2.983e-05 [insert-virtual-dataset]: 2.58e-06 [parallel-infer-symbol-second]: 6.59988e-07 [dataset_repeat_opt]: 1.97999e-06 [pipeline_split]: 1.87001e-06 [optimize]: 0.00508611, [53] [py_interpret_to_execute]: 2.404e-05 [rewriter_before_opt_a]: 5.49e-05 [opt_a]: 0.00266677, [2] [Cycle 1]: 0.00184507, [45] [expand_dump_flag]: 3.00998e-06 [switch_simplify]: 2.346e-05 [loop_unroll]: 1.438e-05 [a_1]: 0.0003059 [with_stream_mark]: 1.967e-05 [recompute_prepare]: 1.032e-05 [updatestate_depend_eliminate]: 4.90001e-06 [updatestate_assign_eliminate]: 3.45998e-06 [updatestate_loads_eliminate]: 2.88e-06 [parameter_eliminate]: 2.06e-06 [a_2]: 0.00010841 [accelerated_algorithm]: 7.31001e-06 [shard]: 2.24999e-06 [meta_shard_fg_expand]: 2.31e-06 [shard_inline]: 7.00002e-06 [merge_send_recv]: 9.02e-06 [auto_parallel]: 8.25e-06 [parallel]: 1.993e-05 [flash_sp]: 1.039e-05 [merge_comm]: 4.07e-06 [allreduce_fusion]: 3.65e-06 [matmul_add_comm_reduction]: 9.82999e-06 [allreduce_slice_to_reducescatter]: 8.30012e-07 [virtual_shard_identity]: 9.25001e-06 [virtual_dataset]: 6.11e-06 [get_grad_eliminate_]: 6.36e-06 [virtual_output]: 6.24001e-06 [merge_forward]: 4.1e-06 [cell_reuse_recompute_pass]: 1.71e-06 [offload_activation]: 1.108e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.654e-05 [merge_recompute_call_nodes]: 1.76e-06 [before_grad]: 1.128e-05 [set_forward_comm_id_for_comm_node_pass]: 4.43999e-06 [meta_fg_expand]: 2.73e-06 [flash_sp_send_recv_attached]: 3.3e-06 [receive_attached]: 2.40002e-06 [after_resolve]: 9.57001e-06 [a_after_grad]: 9.67001e-06 [renormalize]: 0.00064617 [add_forward_monad_depend]: 6.05002e-06 [auto_monad_grad]: 2.48e-06 [auto_monad_eliminator]: 1.635e-05 [cse]: 2.986e-05 [a_3]: 6.2e-05 [Cycle 2]: 0.00080759, [45] [expand_dump_flag]: 1.47001e-06 [switch_simplify]: 7.21999e-06 [loop_unroll]: 5.97001e-06 [a_1]: 0.00010603 [with_stream_mark]: 1.439e-05 [recompute_prepare]: 7.37997e-06 [updatestate_depend_eliminate]: 3.04999e-06 [updatestate_assign_eliminate]: 2.94001e-06 [updatestate_loads_eliminate]: 2.84999e-06 [parameter_eliminate]: 1.47999e-06 [a_2]: 9.699e-05 [accelerated_algorithm]: 6.34999e-06 [shard]: 1.24e-06 [meta_shard_fg_expand]: 2.12001e-06 [shard_inline]: 6.28998e-06 [merge_send_recv]: 5.72001e-06 [auto_parallel]: 5.92999e-06 [parallel]: 6.09999e-06 [flash_sp]: 3.74002e-06 [merge_comm]: 3.45e-06 [allreduce_fusion]: 3.25e-06 [matmul_add_comm_reduction]: 7.26001e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 7.21001e-06 [virtual_dataset]: 5.75001e-06 [get_grad_eliminate_]: 5.28002e-06 [virtual_output]: 5.30999e-06 [merge_forward]: 3.11001e-06 [cell_reuse_recompute_pass]: 1.56998e-06 [offload_activation]: 7.43e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.615e-05 [merge_recompute_call_nodes]: 7.79983e-07 [before_grad]: 9.69e-06 [set_forward_comm_id_for_comm_node_pass]: 4.08999e-06 [meta_fg_expand]: 2.14e-06 [flash_sp_send_recv_attached]: 1.25999e-06 [receive_attached]: 1.86e-06 [after_resolve]: 9.49e-06 [a_after_grad]: 8.67e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.47001e-06 [auto_monad_grad]: 1.35999e-06 [auto_monad_eliminator]: 9.39e-06 [cse]: 1.549e-05 [a_3]: 4.936e-05 [py_interpret_to_execute_after_opt_a]: 1.299e-05 [slice_cell_reuse_recomputed_activation]: 4.70001e-06 [rewriter_after_opt_a]: 4.212e-05 [convert_after_rewriter]: 9.99999e-06 [order_py_execute_after_rewriter]: 8.98002e-06 [mutable_eliminate]: 0.000624 [opt_b]: 0.00027766, [1] [Cycle 1]: 0.00026695, [7] [b_1]: 0.00016487 [b_2]: 7.73001e-06 [updatestate_depend_eliminate]: 7.53e-06 [updatestate_assign_eliminate]: 2.58003e-06 [updatestate_loads_eliminate]: 2.51e-06 [renormalize]: 6.59988e-07 [cse]: 2.341e-05 [optimize_parallel_all_gather_comm]: 2.047e-05 [overlap_param_gather]: 4.80001e-06 [cconv]: 3.386e-05 [loop_unroll]: 0.00047926 [opt_after_cconv]: 0.00013054, [1] [Cycle 1]: 0.00012067, [7] [c_1]: 2.891e-05 [parameter_eliminate]: 3.81001e-06 [updatestate_depend_eliminate]: 6.32001e-06 [updatestate_assign_eliminate]: 2.35002e-06 [updatestate_loads_eliminate]: 2.64001e-06 [cse]: 2.012e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 1.809e-05 [tuple_transform]: 8.786e-05, [1] [Cycle 1]: 8.067e-05, [4] [d_1]: 4.101e-05 [none_parameter_eliminate]: 1.67999e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 6.76999e-06 [partial_unused_args_eliminate]: 4.55999e-06 [add_recomputation]: 5.09e-05 [cse_after_recomputation]: 2.913e-05, [1] [Cycle 1]: 2.23e-05, [1] [cse]: 1.202e-05 [environ_conv]: 8.41002e-06 [swap_dp_allreduce_reducescatter]: 7.95e-06 [bias_add_comm_swap]: 4.99e-06 [label_micro_interleaved_index]: 6.74999e-06 [label_fine_grained_interleaved_index]: 5.47001e-06 [merge_cast_opt]: 3.73999e-06 [slice_recompute_activation]: 4.57998e-06 [micro_interleaved_order_control]: 4.52998e-06 [assign_add_opt]: 3.76999e-06 [ForceFp32Comm]: 3.49001e-06 [remove_cast_before_assign_add]: 3.46001e-06 [full_micro_interleaved_order_control]: 4.53999e-06 [reorder_send_recv_between_fp_bp]: 5.74e-06 [comm_op_add_attrs]: 4.05998e-06 [add_comm_op_reuse_tag]: 3.38999e-06 [interleave_split_concat_branches]: 3.66001e-06 [interleave_parallel_branches]: 3.85e-06 [overlap_opt_shard_in_pipeline]: 3.66999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.1e-06 [control_data_broadcast_order]: 1.619e-05 [grouped_pairwise_exchange_alltoall]: 3.95e-06 [offloading_packed_experts]: 6.81999e-06 [overlap_recompute_and_grad_model_parallel]: 7.52998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.93999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.63e-06 [overlap_recompute_comm]: 4.75999e-06 [overlap_grad_ring_attention]: 6.54999e-06 [overlap_grad_flash_sp]: 2.301e-05 [begin_end_overlap_inline]: 3.3e-06 [split_matmul_comm_elemetwise]: 5.47999e-06 [split_layernorm_comm]: 3.91999e-06 [handle_group_info]: 3.41999e-06 [symbol_engine_optimizer]: 0.00010237, [1] [Cycle 1]: 9.48e-05, [6] [build]: 3.05998e-06 [elim_shapecalc]: 1.166e-05 [elim_not_effective]: 1.314e-05 [opt_reshape]: 7.03998e-06 [fold_const_symbol]: 1.069e-05 [renormalize]: 2.80008e-07 [detach_backward]: 4.62e-06 [pipeline_parallel_scheduler]: 2.00002e-06 [auto_monad_reorder]: 1.868e-05 [get_jit_bprop_graph]: 1.95001e-06 [rewriter_after_jit_bprop_graph]: 5.73002e-06 [opt_after_jit_grad]: 0.00052086 [validate]: 4.092e-05 Sums bootstrap : 0.000444s : 4.11% type_inference : 0.005754s : 53.20% event_method : 0.000014s : 0.13% auto_monad : 0.000059s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000030s : 0.28% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000024s : 0.22% optimize.rewriter_before_opt_a : 0.000055s : 0.51% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000031s : 0.28% optimize.opt_a.loop_unroll : 0.000020s : 0.19% optimize.opt_a.a_1 : 0.000412s : 3.81% optimize.opt_a.with_stream_mark : 0.000034s : 0.31% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000205s : 1.90% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.12% optimize.opt_a.merge_send_recv : 0.000015s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.13% optimize.opt_a.parallel : 0.000026s : 0.24% optimize.opt_a.flash_sp : 0.000014s : 0.13% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.15% optimize.opt_a.virtual_dataset : 0.000012s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000019s : 0.18% optimize.opt_a.a_after_grad : 0.000018s : 0.17% optimize.opt_a.renormalize : 0.000646s : 5.98% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.24% optimize.opt_a.cse : 0.000045s : 0.42% optimize.opt_a.a_3 : 0.000111s : 1.03% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000042s : 0.39% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000624s : 5.77% optimize.opt_b.b_1 : 0.000165s : 1.52% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000034s : 0.31% optimize.loop_unroll : 0.000479s : 4.43% optimize.opt_after_cconv.c_1 : 0.000029s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000020s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.17% optimize.tuple_transform.d_1 : 0.000041s : 0.38% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000051s : 0.47% optimize.cse_after_recomputation.cse : 0.000012s : 0.11% optimize.environ_conv : 0.000008s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000016s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000023s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000019s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000521s : 4.82% validate : 0.000041s : 0.38% Time group info: ------[substitution.] 0.000125 19 1.44% : 0.000002s : 2: substitution.elim_not_effective 1.24% : 0.000002s : 2: substitution.fold_const_symbol 4.65% : 0.000006s : 3: substitution.graph_param_transform 82.78% : 0.000103s : 2: substitution.inline 3.06% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.85% : 0.000005s : 4: substitution.remove_not_recompute_node 2.99% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.005691 2 91.16% : 0.005188s : 1: type_inference.infer 8.84% : 0.000503s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000101 2 100.00% : 0.000101s : 2: match.inline ------[predicate.] 0.000141 754 0.85% : 0.000001s : 7: predicate.accumulaten_eliminater 0.98% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.90% : 0.000001s : 6: predicate.addn_check_dump 0.78% : 0.000001s : 7: predicate.addn_zero_filter 0.67% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.30% : 0.000003s : 13: predicate.arithmetic_simplify 0.90% : 0.000001s : 7: predicate.cast_eliminate 0.87% : 0.000001s : 6: predicate.check_bprop_eliminate 0.66% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.79% : 0.000001s : 6: predicate.depend_value_elim 0.76% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.94% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.72% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.38% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.30% : 0.000000s : 3: predicate.elim_not_effective 0.58% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 10: predicate.environ_add_const_eliminate 0.92% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.94% : 0.000001s : 10: predicate.environ_get_depend_swap 1.77% : 0.000002s : 16: predicate.environ_get_eliminate 1.01% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.87% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.89% : 0.000003s : 9: predicate.float_depend_g_call 0.64% : 0.000001s : 6: predicate.float_environ_get_switch 0.97% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.85% : 0.000001s : 6: predicate.get_grad_eliminate 0.35% : 0.000000s : 3: predicate.graph_param_transform 0.79% : 0.000001s : 6: predicate.incorporate_call 0.61% : 0.000001s : 6: predicate.incorporate_call_switch 6.84% : 0.000010s : 34: predicate.inline 1.30% : 0.000002s : 6: predicate.inline_without_move 0.36% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.10% : 0.000002s : 6: predicate.less_batch_normalization 1.48% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.94% : 0.000003s : 20: predicate.load_eliminater 1.84% : 0.000003s : 3: predicate.loop_unroll_after_grad 1.74% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.62% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.77% : 0.000001s : 6: predicate.merge_addn 0.72% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.74% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.62% : 0.000001s : 7: predicate.minmaximum_grad 1.65% : 0.000002s : 3: predicate.mutable_eliminate 0.47% : 0.000001s : 3: predicate.opt_reshape 0.49% : 0.000001s : 3: predicate.parallel_virtual_node 1.34% : 0.000002s : 9: predicate.partial_defer_inline 1.17% : 0.000002s : 10: predicate.partial_eliminate 0.72% : 0.000001s : 7: predicate.print_const_string_wrapper 0.72% : 0.000001s : 6: predicate.reduce_all_const_elim 1.03% : 0.000001s : 7: predicate.reduce_eliminate 1.95% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.70% : 0.000001s : 6: predicate.remove_not_recompute_node 1.40% : 0.000002s : 13: predicate.replace_applicator 0.63% : 0.000001s : 6: predicate.replace_old_param 0.34% : 0.000000s : 3: predicate.reset_defer_inline 0.82% : 0.000001s : 7: predicate.reshape_eliminate 0.84% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.57% : 0.000001s : 3: predicate.row_tensor_eliminate 1.13% : 0.000002s : 6: predicate.same_eliminate 0.49% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.15% : 0.000002s : 6: predicate.shard_identity_eliminate 0.91% : 0.000001s : 6: predicate.special_op_eliminate 0.89% : 0.000001s : 6: predicate.specialize_transform 1.21% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.92% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.46% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.04% : 0.000001s : 9: predicate.switch_defer_inline 1.66% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.22% : 0.000006s : 32: predicate.switch_simplify 0.70% : 0.000001s : 7: predicate.tile_eliminate 0.81% : 0.000001s : 7: predicate.transpose_eliminate 1.56% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.89% : 0.000003s : 13: predicate.tuple_list_get_item_const_eliminator 1.26% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.13% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.30% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.62% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.52% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.88% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.75% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.59% : 0.000001s : 3: predicate.value_based_eliminate 0.76% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.76% : 0.000001s : 6: predicate.virtual_output_eliminate 0.32% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.66% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000247 5 7.91% : 0.000020s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.09% : 0.000227s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026355 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.56% : 0.003573s : 1: add_attr 13.49% : 0.003555s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000054s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000070s : 1: auto_monad 0.10% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.86% : 0.000491s : 1: bootstrap 0.14% : 0.000037s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000032s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.08% : 0.000020s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000007s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.84% : 0.000486s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.40% : 0.000632s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 3.00% : 0.000792s : 78: opt.transform.opt_a 0.10% : 0.000027s : 1: opt.transform.opt_after_cconv 0.10% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.37% : 0.000098s : 28: opt.transform.opt_b 0.17% : 0.000045s : 2: opt.transform.opt_trans_graph 0.14% : 0.000038s : 4: opt.transform.symbol_engine_opt 10.13% : 0.002670s : 1: opt_a 0.51% : 0.000135s : 1: opt_after_cconv 2.02% : 0.000533s : 1: opt_after_jit_grad 1.07% : 0.000282s : 1: opt_b 20.52% : 0.005408s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000026s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000008s : 1: pipeline_split 0.14% : 0.000037s : 1: pre_auto_parallel 0.10% : 0.000027s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000021s : 1: remove_dup_value 1.37% : 0.000360s : 1: renormalize.infer 1.05% : 0.000277s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000046s : 1: rewriter_after_opt_a 0.22% : 0.000058s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000105s : 1: symbol_engine_optimizer 0.34% : 0.000091s : 1: tuple_transform 21.98% : 0.005792s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:42.427.168 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0139287, [21] [bootstrap]: 0.00043403 [type_inference]: 0.00536581 [event_method]: 1.224e-05 [auto_monad]: 5.564e-05 [graph_reusing]: 5.39e-06 [inline]: 2.63e-06 [add_attr]: 0.00321637, [1] [add_attr_with_inline]: 0.00320764, [1] [Cycle 1]: 4.953e-05, [2] [tag_attr]: 1.431e-05 [meta_addattr_fg_expand]: 3.50003e-06 [parallel-infer-symbol]: 3.48e-06 [pre_auto_parallel]: 2.493e-05 [insert-virtual-dataset]: 2.65002e-06 [parallel-infer-symbol-second]: 7.60017e-07 [dataset_repeat_opt]: 2.41e-06 [pipeline_split]: 1.55999e-06 [optimize]: 0.00410942, [53] [py_interpret_to_execute]: 1.73e-05 [rewriter_before_opt_a]: 6.97e-05 [opt_a]: 0.00210235, [2] [Cycle 1]: 0.0014772, [45] [expand_dump_flag]: 2.56e-06 [switch_simplify]: 2.559e-05 [loop_unroll]: 1.362e-05 [a_1]: 0.0002934 [with_stream_mark]: 1.69e-05 [recompute_prepare]: 8.64e-06 [updatestate_depend_eliminate]: 3.73001e-06 [updatestate_assign_eliminate]: 2.94001e-06 [updatestate_loads_eliminate]: 3.00002e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 7.923e-05 [accelerated_algorithm]: 6.86001e-06 [shard]: 2.29999e-06 [meta_shard_fg_expand]: 1.67001e-06 [shard_inline]: 6.49999e-06 [merge_send_recv]: 8.18001e-06 [auto_parallel]: 6.16998e-06 [parallel]: 1.753e-05 [flash_sp]: 8.36002e-06 [merge_comm]: 4.60001e-06 [allreduce_fusion]: 3.34001e-06 [matmul_add_comm_reduction]: 9.76998e-06 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 7.75998e-06 [virtual_dataset]: 6.58998e-06 [get_grad_eliminate_]: 6.16e-06 [virtual_output]: 5.84999e-06 [merge_forward]: 3.68e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 1.007e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.284e-05 [merge_recompute_call_nodes]: 1.55001e-06 [before_grad]: 1.052e-05 [set_forward_comm_id_for_comm_node_pass]: 3.53999e-06 [meta_fg_expand]: 2.48e-06 [flash_sp_send_recv_attached]: 2.44001e-06 [receive_attached]: 2.29001e-06 [after_resolve]: 9.87999e-06 [a_after_grad]: 9.25999e-06 [renormalize]: 0.00051566 [add_forward_monad_depend]: 5.40999e-06 [auto_monad_grad]: 2.49001e-06 [auto_monad_eliminator]: 1.488e-05 [cse]: 3.064e-05 [a_3]: 4.611e-05 [Cycle 2]: 0.00061492, [45] [expand_dump_flag]: 1.03001e-06 [switch_simplify]: 7.25e-06 [loop_unroll]: 5.89e-06 [a_1]: 0.00010591 [with_stream_mark]: 1.265e-05 [recompute_prepare]: 6.02999e-06 [updatestate_depend_eliminate]: 2.97002e-06 [updatestate_assign_eliminate]: 2.16998e-06 [updatestate_loads_eliminate]: 2.83998e-06 [parameter_eliminate]: 1.13001e-06 [a_2]: 7.255e-05 [accelerated_algorithm]: 5.76e-06 [shard]: 1.09003e-06 [meta_shard_fg_expand]: 1.57001e-06 [shard_inline]: 5.88002e-06 [merge_send_recv]: 5.04e-06 [auto_parallel]: 5.46e-06 [parallel]: 4.77998e-06 [flash_sp]: 3.31001e-06 [merge_comm]: 3.21001e-06 [allreduce_fusion]: 2.78998e-06 [matmul_add_comm_reduction]: 6.29001e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 6.53e-06 [virtual_dataset]: 6.02999e-06 [get_grad_eliminate_]: 5.53002e-06 [virtual_output]: 5.63002e-06 [merge_forward]: 3.13998e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 6.63e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.277e-05 [merge_recompute_call_nodes]: 9.00007e-07 [before_grad]: 9.27999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.86999e-06 [meta_fg_expand]: 2.19001e-06 [flash_sp_send_recv_attached]: 9.79984e-07 [receive_attached]: 9.80013e-07 [after_resolve]: 8.57e-06 [a_after_grad]: 7.78001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.89e-06 [auto_monad_grad]: 8.39995e-07 [auto_monad_eliminator]: 8.00999e-06 [cse]: 1.479e-05 [a_3]: 3.451e-05 [py_interpret_to_execute_after_opt_a]: 9.07001e-06 [slice_cell_reuse_recomputed_activation]: 1.86e-06 [rewriter_after_opt_a]: 3.448e-05 [convert_after_rewriter]: 6.94001e-06 [order_py_execute_after_rewriter]: 5.85002e-06 [mutable_eliminate]: 0.00050416 [opt_b]: 0.00019942, [1] [Cycle 1]: 0.00019288, [7] [b_1]: 0.00011866 [b_2]: 7.09001e-06 [updatestate_depend_eliminate]: 5.82001e-06 [updatestate_assign_eliminate]: 2.31998e-06 [updatestate_loads_eliminate]: 2.77002e-06 [renormalize]: 6.79982e-07 [cse]: 2.053e-05 [optimize_parallel_all_gather_comm]: 1.569e-05 [overlap_param_gather]: 1.94999e-06 [cconv]: 2.774e-05 [loop_unroll]: 0.00043352 [opt_after_cconv]: 0.00010137, [1] [Cycle 1]: 9.56e-05, [7] [c_1]: 2.717e-05 [parameter_eliminate]: 3.4e-06 [updatestate_depend_eliminate]: 5.68002e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.29001e-06 [cse]: 1.962e-05 [renormalize]: 4.89992e-07 [remove_dup_value]: 1.459e-05 [tuple_transform]: 7.178e-05, [1] [Cycle 1]: 6.727e-05, [4] [d_1]: 3.924e-05 [none_parameter_eliminate]: 1.66998e-06 [renormalize]: 2.70025e-07 [switch_simplify]: 6.56e-06 [partial_unused_args_eliminate]: 1.97999e-06 [add_recomputation]: 4.916e-05 [cse_after_recomputation]: 2.133e-05, [1] [Cycle 1]: 1.675e-05, [1] [cse]: 1.108e-05 [environ_conv]: 4.82e-06 [swap_dp_allreduce_reducescatter]: 5.09e-06 [bias_add_comm_swap]: 2.54999e-06 [label_micro_interleaved_index]: 4.55001e-06 [label_fine_grained_interleaved_index]: 2.63e-06 [merge_cast_opt]: 1.29e-06 [slice_recompute_activation]: 2.61e-06 [micro_interleaved_order_control]: 2.11998e-06 [assign_add_opt]: 1.30999e-06 [ForceFp32Comm]: 7.79983e-07 [remove_cast_before_assign_add]: 1.12e-06 [full_micro_interleaved_order_control]: 2.26e-06 [reorder_send_recv_between_fp_bp]: 2.81e-06 [comm_op_add_attrs]: 1.29998e-06 [add_comm_op_reuse_tag]: 1.32999e-06 [interleave_split_concat_branches]: 1.08001e-06 [interleave_parallel_branches]: 1.17e-06 [overlap_opt_shard_in_pipeline]: 1.17e-06 [overlap_opt_shard_grad_in_pipeline]: 1.71e-06 [control_data_broadcast_order]: 1.218e-05 [grouped_pairwise_exchange_alltoall]: 1.57001e-06 [offloading_packed_experts]: 3.96001e-06 [overlap_recompute_and_grad_model_parallel]: 4.58999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.37999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.57999e-06 [overlap_recompute_comm]: 2.64999e-06 [overlap_grad_ring_attention]: 4.25999e-06 [overlap_grad_flash_sp]: 1.832e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.49999e-06 [split_layernorm_comm]: 1.96e-06 [handle_group_info]: 1.02e-06 [symbol_engine_optimizer]: 0.00010226, [1] [Cycle 1]: 9.772e-05, [6] [build]: 2.95002e-06 [elim_shapecalc]: 9.52001e-06 [elim_not_effective]: 1.303e-05 [opt_reshape]: 3.105e-05 [fold_const_symbol]: 1.105e-05 [renormalize]: 2.10013e-07 [detach_backward]: 1.99e-06 [pipeline_parallel_scheduler]: 1.71e-06 [auto_monad_reorder]: 1.612e-05 [get_jit_bprop_graph]: 1.66e-06 [rewriter_after_jit_bprop_graph]: 4.12e-06 [opt_after_jit_grad]: 0.0004807 [validate]: 3.669e-05 Sums bootstrap : 0.000434s : 4.45% type_inference : 0.005366s : 54.99% event_method : 0.000012s : 0.13% auto_monad : 0.000056s : 0.57% graph_reusing : 0.000005s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.04% pre_auto_parallel : 0.000025s : 0.26% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000017s : 0.18% optimize.rewriter_before_opt_a : 0.000070s : 0.71% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.34% optimize.opt_a.loop_unroll : 0.000020s : 0.20% optimize.opt_a.a_1 : 0.000399s : 4.09% optimize.opt_a.with_stream_mark : 0.000030s : 0.30% optimize.opt_a.recompute_prepare : 0.000015s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000152s : 1.56% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.13% optimize.opt_a.merge_send_recv : 0.000013s : 0.14% optimize.opt_a.auto_parallel : 0.000012s : 0.12% optimize.opt_a.parallel : 0.000022s : 0.23% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000006s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.15% optimize.opt_a.virtual_dataset : 0.000013s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000011s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000018s : 0.19% optimize.opt_a.a_after_grad : 0.000017s : 0.17% optimize.opt_a.renormalize : 0.000516s : 5.29% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.23% optimize.opt_a.cse : 0.000045s : 0.47% optimize.opt_a.a_3 : 0.000081s : 0.83% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000034s : 0.35% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000504s : 5.17% optimize.opt_b.b_1 : 0.000119s : 1.22% optimize.opt_b.b_2 : 0.000007s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000021s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000028s : 0.28% optimize.loop_unroll : 0.000434s : 4.44% optimize.opt_after_cconv.c_1 : 0.000027s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000020s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.01% optimize.remove_dup_value : 0.000015s : 0.15% optimize.tuple_transform.d_1 : 0.000039s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000049s : 0.50% optimize.cse_after_recomputation.cse : 0.000011s : 0.11% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000003s : 0.03% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.02% optimize.overlap_recompute_comm : 0.000003s : 0.03% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000018s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.03% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000031s : 0.32% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000016s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000481s : 4.93% validate : 0.000037s : 0.38% Time group info: ------[substitution.] 0.000111 19 1.82% : 0.000002s : 2: substitution.elim_not_effective 1.33% : 0.000001s : 2: substitution.fold_const_symbol 4.99% : 0.000006s : 3: substitution.graph_param_transform 82.33% : 0.000092s : 2: substitution.inline 3.10% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.93% : 0.000004s : 4: substitution.remove_not_recompute_node 2.50% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.005314 2 91.38% : 0.004856s : 1: type_inference.infer 8.62% : 0.000458s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000090 2 100.00% : 0.000090s : 2: match.inline ------[predicate.] 0.000132 754 0.86% : 0.000001s : 7: predicate.accumulaten_eliminater 1.02% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.67% : 0.000001s : 6: predicate.addn_check_dump 0.86% : 0.000001s : 7: predicate.addn_zero_filter 0.77% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.27% : 0.000003s : 13: predicate.arithmetic_simplify 0.85% : 0.000001s : 7: predicate.cast_eliminate 0.71% : 0.000001s : 6: predicate.check_bprop_eliminate 0.70% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.73% : 0.000001s : 6: predicate.depend_value_elim 0.80% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.98% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.80% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.25% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.30% : 0.000000s : 3: predicate.elim_not_effective 0.45% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.01% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_depend_swap 1.94% : 0.000003s : 16: predicate.environ_get_eliminate 1.00% : 0.000001s : 10: predicate.environ_get_set_eliminate 1.04% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.86% : 0.000002s : 9: predicate.float_depend_g_call 0.63% : 0.000001s : 6: predicate.float_environ_get_switch 0.95% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.24% : 0.000000s : 3: predicate.fold_const_symbol 0.95% : 0.000001s : 6: predicate.get_grad_eliminate 0.39% : 0.000001s : 3: predicate.graph_param_transform 0.83% : 0.000001s : 6: predicate.incorporate_call 0.66% : 0.000001s : 6: predicate.incorporate_call_switch 6.05% : 0.000008s : 34: predicate.inline 1.05% : 0.000001s : 6: predicate.inline_without_move 0.38% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.10% : 0.000001s : 6: predicate.less_batch_normalization 1.90% : 0.000003s : 13: predicate.list_to_tuple_eliminator_ 1.95% : 0.000003s : 20: predicate.load_eliminater 1.19% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.70% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.75% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.67% : 0.000001s : 6: predicate.merge_addn 0.65% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.80% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.70% : 0.000001s : 7: predicate.minmaximum_grad 1.27% : 0.000002s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.45% : 0.000001s : 3: predicate.parallel_virtual_node 1.26% : 0.000002s : 9: predicate.partial_defer_inline 1.24% : 0.000002s : 10: predicate.partial_eliminate 0.93% : 0.000001s : 7: predicate.print_const_string_wrapper 0.67% : 0.000001s : 6: predicate.reduce_all_const_elim 1.06% : 0.000001s : 7: predicate.reduce_eliminate 2.52% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.62% : 0.000001s : 6: predicate.remove_not_recompute_node 1.21% : 0.000002s : 13: predicate.replace_applicator 0.68% : 0.000001s : 6: predicate.replace_old_param 0.39% : 0.000001s : 3: predicate.reset_defer_inline 0.81% : 0.000001s : 7: predicate.reshape_eliminate 0.73% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 3: predicate.row_tensor_eliminate 0.94% : 0.000001s : 6: predicate.same_eliminate 0.52% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.00% : 0.000001s : 6: predicate.shard_identity_eliminate 1.14% : 0.000001s : 6: predicate.special_op_eliminate 0.96% : 0.000001s : 6: predicate.specialize_transform 1.08% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.01% : 0.000001s : 9: predicate.switch_defer_inline 1.76% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.58% : 0.000006s : 32: predicate.switch_simplify 0.83% : 0.000001s : 7: predicate.tile_eliminate 0.81% : 0.000001s : 7: predicate.transpose_eliminate 1.57% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.17% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.45% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.43% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.55% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.99% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 3.11% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.48% : 0.000001s : 3: predicate.value_based_eliminate 1.02% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.77% : 0.000001s : 6: predicate.virtual_output_eliminate 0.34% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.76% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000222 5 8.42% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.58% : 0.000204s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022686 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.20% : 0.003222s : 1: add_attr 14.16% : 0.003211s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000053s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000061s : 1: auto_monad 0.09% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.04% : 0.000462s : 1: bootstrap 0.14% : 0.000031s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000008s : 1: environ_conv 0.08% : 0.000018s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.95% : 0.000441s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.26% : 0.000514s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000015s : 1: opt.transform.mutable_eliminate 3.35% : 0.000760s : 78: opt.transform.opt_a 0.11% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.41% : 0.000094s : 28: opt.transform.opt_b 0.19% : 0.000043s : 2: opt.transform.opt_trans_graph 0.27% : 0.000061s : 4: opt.transform.symbol_engine_opt 9.28% : 0.002106s : 1: opt_a 0.46% : 0.000105s : 1: opt_after_cconv 2.16% : 0.000490s : 1: opt_after_jit_grad 0.89% : 0.000203s : 1: opt_b 18.14% : 0.004114s : 1: optimize 0.09% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000022s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000029s : 1: pre_auto_parallel 0.09% : 0.000021s : 1: py_interpret_to_execute 0.06% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000018s : 1: remove_dup_value 1.24% : 0.000282s : 1: renormalize.infer 1.00% : 0.000227s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000039s : 1: rewriter_after_opt_a 0.33% : 0.000074s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.46% : 0.000105s : 1: symbol_engine_optimizer 0.33% : 0.000075s : 1: tuple_transform 23.73% : 0.005384s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:42.625.473 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:42.625.757 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0158905, [21] [bootstrap]: 0.00044345 [type_inference]: 0.00504351 [event_method]: 1.5e-05 [auto_monad]: 5.938e-05 [graph_reusing]: 5.44e-06 [inline]: 2.78998e-06 [add_attr]: 0.00332261, [1] [add_attr_with_inline]: 0.00331318, [1] [Cycle 1]: 7.54e-05, [2] [tag_attr]: 1.754e-05 [meta_addattr_fg_expand]: 4.17e-06 [parallel-infer-symbol]: 3.3e-06 [pre_auto_parallel]: 3.078e-05 [insert-virtual-dataset]: 2.68e-06 [parallel-infer-symbol-second]: 1.00001e-06 [dataset_repeat_opt]: 1.71e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00569399, [53] [py_interpret_to_execute]: 2.423e-05 [rewriter_before_opt_a]: 5.821e-05 [opt_a]: 0.00307663, [2] [Cycle 1]: 0.00211586, [45] [expand_dump_flag]: 3.09999e-06 [switch_simplify]: 2.762e-05 [loop_unroll]: 1.558e-05 [a_1]: 0.00038795 [with_stream_mark]: 1.908e-05 [recompute_prepare]: 1.251e-05 [updatestate_depend_eliminate]: 4.87998e-06 [updatestate_assign_eliminate]: 4.47e-06 [updatestate_loads_eliminate]: 3.67998e-06 [parameter_eliminate]: 2.02999e-06 [a_2]: 0.00012747 [accelerated_algorithm]: 8.99998e-06 [shard]: 2.33998e-06 [meta_shard_fg_expand]: 2.21998e-06 [shard_inline]: 7.77002e-06 [merge_send_recv]: 1.016e-05 [auto_parallel]: 7.78999e-06 [parallel]: 1.84e-05 [flash_sp]: 1.099e-05 [merge_comm]: 4.62998e-06 [allreduce_fusion]: 4.03001e-06 [matmul_add_comm_reduction]: 1.03e-05 [allreduce_slice_to_reducescatter]: 5.90022e-07 [virtual_shard_identity]: 1.071e-05 [virtual_dataset]: 7.71001e-06 [get_grad_eliminate_]: 8.84998e-06 [virtual_output]: 7.15e-06 [merge_forward]: 5.02e-06 [cell_reuse_recompute_pass]: 1.53002e-06 [offload_activation]: 1.228e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.035e-05 [merge_recompute_call_nodes]: 1.58002e-06 [before_grad]: 1.353e-05 [set_forward_comm_id_for_comm_node_pass]: 6.17999e-06 [meta_fg_expand]: 3.5e-06 [flash_sp_send_recv_attached]: 2.79001e-06 [receive_attached]: 2.06e-06 [after_resolve]: 1.182e-05 [a_after_grad]: 1.183e-05 [renormalize]: 0.00074356 [add_forward_monad_depend]: 6.46e-06 [auto_monad_grad]: 3.14999e-06 [auto_monad_eliminator]: 1.845e-05 [cse]: 3.776e-05 [a_3]: 7.359e-05 [Cycle 2]: 0.00094651, [45] [expand_dump_flag]: 1.21002e-06 [switch_simplify]: 8.84e-06 [loop_unroll]: 7.04001e-06 [a_1]: 0.00015537 [with_stream_mark]: 1.576e-05 [recompute_prepare]: 8.87e-06 [updatestate_depend_eliminate]: 4.74e-06 [updatestate_assign_eliminate]: 3.37002e-06 [updatestate_loads_eliminate]: 2.91e-06 [parameter_eliminate]: 1.65001e-06 [a_2]: 0.00011675 [accelerated_algorithm]: 8.22e-06 [shard]: 1.22e-06 [meta_shard_fg_expand]: 1.97001e-06 [shard_inline]: 7.65e-06 [merge_send_recv]: 7.36001e-06 [auto_parallel]: 6.92002e-06 [parallel]: 5.86e-06 [flash_sp]: 3.44001e-06 [merge_comm]: 4.41002e-06 [allreduce_fusion]: 4.52003e-06 [matmul_add_comm_reduction]: 8.65999e-06 [allreduce_slice_to_reducescatter]: 4.40021e-07 [virtual_shard_identity]: 8.97e-06 [virtual_dataset]: 7.36999e-06 [get_grad_eliminate_]: 6.86001e-06 [virtual_output]: 7.1e-06 [merge_forward]: 3.75998e-06 [cell_reuse_recompute_pass]: 1.65001e-06 [offload_activation]: 9.19998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.892e-05 [merge_recompute_call_nodes]: 1.12999e-06 [before_grad]: 1.27e-05 [set_forward_comm_id_for_comm_node_pass]: 5.35999e-06 [meta_fg_expand]: 2.71999e-06 [flash_sp_send_recv_attached]: 9.89996e-07 [receive_attached]: 1.29998e-06 [after_resolve]: 1.053e-05 [a_after_grad]: 1.032e-05 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 1.86e-06 [auto_monad_grad]: 1.74e-06 [auto_monad_eliminator]: 1.071e-05 [cse]: 2.462e-05 [a_3]: 5.729e-05 [py_interpret_to_execute_after_opt_a]: 1.61e-05 [slice_cell_reuse_recomputed_activation]: 4.99e-06 [rewriter_after_opt_a]: 5.077e-05 [convert_after_rewriter]: 1.189e-05 [order_py_execute_after_rewriter]: 9.22999e-06 [mutable_eliminate]: 0.00065367 [opt_b]: 0.00031741, [1] [Cycle 1]: 0.00030774, [7] [b_1]: 0.00019203 [b_2]: 8.90999e-06 [updatestate_depend_eliminate]: 7.99002e-06 [updatestate_assign_eliminate]: 3.53e-06 [updatestate_loads_eliminate]: 3.39001e-06 [renormalize]: 1.12e-06 [cse]: 3.028e-05 [optimize_parallel_all_gather_comm]: 2.232e-05 [overlap_param_gather]: 5.40999e-06 [cconv]: 3.472e-05 [loop_unroll]: 0.00050657 [opt_after_cconv]: 0.0001523, [1] [Cycle 1]: 0.00014266, [7] [c_1]: 3.659e-05 [parameter_eliminate]: 4.21001e-06 [updatestate_depend_eliminate]: 8.03999e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 3.46999e-06 [cse]: 2.765e-05 [renormalize]: 6.80011e-07 [remove_dup_value]: 2.214e-05 [tuple_transform]: 0.00010135, [1] [Cycle 1]: 9.399e-05, [4] [d_1]: 5.046e-05 [none_parameter_eliminate]: 1.67001e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 8.52e-06 [partial_unused_args_eliminate]: 5.09998e-06 [add_recomputation]: 6.248e-05 [cse_after_recomputation]: 3.418e-05, [1] [Cycle 1]: 2.688e-05, [1] [cse]: 1.756e-05 [environ_conv]: 1.001e-05 [swap_dp_allreduce_reducescatter]: 8.37998e-06 [bias_add_comm_swap]: 5.31998e-06 [label_micro_interleaved_index]: 7.52002e-06 [label_fine_grained_interleaved_index]: 5.40001e-06 [merge_cast_opt]: 3.98001e-06 [slice_recompute_activation]: 4.40999e-06 [micro_interleaved_order_control]: 4.52e-06 [assign_add_opt]: 3.98001e-06 [ForceFp32Comm]: 3.21999e-06 [remove_cast_before_assign_add]: 3.38999e-06 [full_micro_interleaved_order_control]: 4.38999e-06 [reorder_send_recv_between_fp_bp]: 5.32999e-06 [comm_op_add_attrs]: 3.75e-06 [add_comm_op_reuse_tag]: 3.26001e-06 [interleave_split_concat_branches]: 3.78001e-06 [interleave_parallel_branches]: 3.41999e-06 [overlap_opt_shard_in_pipeline]: 3.75e-06 [overlap_opt_shard_grad_in_pipeline]: 4.18999e-06 [control_data_broadcast_order]: 1.783e-05 [grouped_pairwise_exchange_alltoall]: 4.02e-06 [offloading_packed_experts]: 7.38e-06 [overlap_recompute_and_grad_model_parallel]: 7.63001e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.01001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.68e-06 [overlap_recompute_comm]: 4.97e-06 [overlap_grad_ring_attention]: 7.41999e-06 [overlap_grad_flash_sp]: 2.685e-05 [begin_end_overlap_inline]: 2.84001e-06 [split_matmul_comm_elemetwise]: 4.75001e-06 [split_layernorm_comm]: 4.08001e-06 [handle_group_info]: 3.86001e-06 [symbol_engine_optimizer]: 0.00011442, [1] [Cycle 1]: 0.0001069, [6] [build]: 3.53e-06 [elim_shapecalc]: 1.42e-05 [elim_not_effective]: 1.637e-05 [opt_reshape]: 8.63001e-06 [fold_const_symbol]: 1.324e-05 [renormalize]: 2.70025e-07 [detach_backward]: 3.95e-06 [pipeline_parallel_scheduler]: 1.84998e-06 [auto_monad_reorder]: 2.279e-05 [get_jit_bprop_graph]: 1.94999e-06 [rewriter_after_jit_bprop_graph]: 5.37999e-06 [opt_after_jit_grad]: 0.0005511 [validate]: 4.775e-05 Sums bootstrap : 0.000443s : 4.14% type_inference : 0.005044s : 47.09% event_method : 0.000015s : 0.14% auto_monad : 0.000059s : 0.55% graph_reusing : 0.000005s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000031s : 0.29% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000024s : 0.23% optimize.rewriter_before_opt_a : 0.000058s : 0.54% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000036s : 0.34% optimize.opt_a.loop_unroll : 0.000023s : 0.21% optimize.opt_a.a_1 : 0.000543s : 5.07% optimize.opt_a.with_stream_mark : 0.000035s : 0.33% optimize.opt_a.recompute_prepare : 0.000021s : 0.20% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000244s : 2.28% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.16% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.14% optimize.opt_a.merge_send_recv : 0.000018s : 0.16% optimize.opt_a.auto_parallel : 0.000015s : 0.14% optimize.opt_a.parallel : 0.000024s : 0.23% optimize.opt_a.flash_sp : 0.000014s : 0.13% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.18% optimize.opt_a.virtual_dataset : 0.000015s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.15% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.20% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.37% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000026s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.11% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000022s : 0.21% optimize.opt_a.a_after_grad : 0.000022s : 0.21% optimize.opt_a.renormalize : 0.000744s : 6.94% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000005s : 0.05% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.27% optimize.opt_a.cse : 0.000062s : 0.58% optimize.opt_a.a_3 : 0.000131s : 1.22% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000051s : 0.47% optimize.convert_after_rewriter : 0.000012s : 0.11% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000654s : 6.10% optimize.opt_b.b_1 : 0.000192s : 1.79% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000030s : 0.28% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000035s : 0.32% optimize.loop_unroll : 0.000507s : 4.73% optimize.opt_after_cconv.c_1 : 0.000037s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000028s : 0.26% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000022s : 0.21% optimize.tuple_transform.d_1 : 0.000050s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000062s : 0.58% optimize.cse_after_recomputation.cse : 0.000018s : 0.16% optimize.environ_conv : 0.000010s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000027s : 0.25% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.13% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.21% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000551s : 5.15% validate : 0.000048s : 0.45% Time group info: ------[substitution.] 0.000164 28 15.32% : 0.000025s : 2: substitution.cast_eliminate 1.45% : 0.000002s : 3: substitution.elim_not_effective 1.14% : 0.000002s : 3: substitution.fold_const_symbol 3.92% : 0.000006s : 4: substitution.graph_param_transform 68.93% : 0.000113s : 2: substitution.inline 3.12% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.78% : 0.000006s : 6: substitution.remove_not_recompute_node 2.35% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004990 2 90.52% : 0.004517s : 1: type_inference.infer 9.48% : 0.000473s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000111 2 100.00% : 0.000111s : 2: match.inline ------[predicate.] 0.000176 980 0.81% : 0.000001s : 9: predicate.accumulaten_eliminater 1.33% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.67% : 0.000001s : 8: predicate.addn_check_dump 0.81% : 0.000001s : 9: predicate.addn_zero_filter 0.71% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.42% : 0.000004s : 17: predicate.arithmetic_simplify 0.91% : 0.000002s : 9: predicate.cast_eliminate 0.77% : 0.000001s : 8: predicate.check_bprop_eliminate 0.68% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000000s : 4: predicate.const_output_eliminate 0.75% : 0.000001s : 8: predicate.depend_value_elim 0.77% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.92% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.74% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.09% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.38% : 0.000001s : 4: predicate.elim_not_effective 0.60% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.01% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.10% : 0.000002s : 13: predicate.environ_get_depend_swap 1.79% : 0.000003s : 21: predicate.environ_get_eliminate 1.00% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.89% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.62% : 0.000003s : 11: predicate.float_depend_g_call 0.67% : 0.000001s : 8: predicate.float_environ_get_switch 0.96% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 4: predicate.fold_const_symbol 0.87% : 0.000002s : 8: predicate.get_grad_eliminate 0.34% : 0.000001s : 4: predicate.graph_param_transform 0.78% : 0.000001s : 8: predicate.incorporate_call 0.62% : 0.000001s : 8: predicate.incorporate_call_switch 6.21% : 0.000011s : 44: predicate.inline 1.20% : 0.000002s : 8: predicate.inline_without_move 0.35% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.03% : 0.000002s : 8: predicate.less_batch_normalization 1.71% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.16% : 0.000004s : 26: predicate.load_eliminater 1.15% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.37% : 0.000002s : 16: predicate.loop_unroll_before_grad 1.72% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.74% : 0.000001s : 8: predicate.merge_addn 0.77% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.96% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.66% : 0.000001s : 9: predicate.minmaximum_grad 1.64% : 0.000003s : 4: predicate.mutable_eliminate 0.56% : 0.000001s : 4: predicate.opt_reshape 0.46% : 0.000001s : 4: predicate.parallel_virtual_node 1.23% : 0.000002s : 11: predicate.partial_defer_inline 1.24% : 0.000002s : 13: predicate.partial_eliminate 0.95% : 0.000002s : 9: predicate.print_const_string_wrapper 0.70% : 0.000001s : 8: predicate.reduce_all_const_elim 1.16% : 0.000002s : 9: predicate.reduce_eliminate 2.12% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.99% : 0.000002s : 8: predicate.remove_not_recompute_node 1.19% : 0.000002s : 17: predicate.replace_applicator 0.61% : 0.000001s : 8: predicate.replace_old_param 0.43% : 0.000001s : 4: predicate.reset_defer_inline 0.90% : 0.000002s : 9: predicate.reshape_eliminate 0.82% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.52% : 0.000001s : 4: predicate.row_tensor_eliminate 1.08% : 0.000002s : 8: predicate.same_eliminate 0.94% : 0.000002s : 8: predicate.set_cell_output_no_recompute 1.08% : 0.000002s : 8: predicate.shard_identity_eliminate 0.91% : 0.000002s : 8: predicate.special_op_eliminate 1.12% : 0.000002s : 8: predicate.specialize_transform 1.20% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.94% : 0.000002s : 11: predicate.switch_defer_inline 1.65% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.02% : 0.000007s : 39: predicate.switch_simplify 0.76% : 0.000001s : 9: predicate.tile_eliminate 0.92% : 0.000002s : 9: predicate.transpose_eliminate 1.55% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.03% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.49% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.54% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.50% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.01% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.83% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 4: predicate.value_based_eliminate 0.87% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.81% : 0.000001s : 8: predicate.virtual_output_eliminate 0.33% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.57% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000242 5 8.47% : 0.000020s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.53% : 0.000221s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026860 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.41% : 0.003332s : 1: add_attr 12.35% : 0.003317s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.25% : 0.000066s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.25% : 0.000068s : 1: auto_monad 0.11% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.81% : 0.000488s : 1: bootstrap 0.14% : 0.000038s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000021s : 1: control_data_broadcast_order 0.06% : 0.000015s : 1: convert_after_rewriter 0.14% : 0.000037s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000023s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.09% : 0.000025s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.91% : 0.000514s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.46% : 0.000661s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.07% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 3.81% : 0.001022s : 78: opt.transform.opt_a 0.13% : 0.000035s : 1: opt.transform.opt_after_cconv 0.12% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.47% : 0.000127s : 28: opt.transform.opt_b 0.21% : 0.000056s : 2: opt.transform.opt_trans_graph 0.18% : 0.000048s : 4: opt.transform.symbol_engine_opt 11.47% : 0.003080s : 1: opt_a 0.58% : 0.000156s : 1: opt_after_cconv 2.10% : 0.000563s : 1: opt_after_jit_grad 1.19% : 0.000321s : 1: opt_b 22.56% : 0.006060s : 1: optimize 0.10% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000030s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000039s : 1: pre_auto_parallel 0.10% : 0.000028s : 1: py_interpret_to_execute 0.07% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.10% : 0.000026s : 1: remove_dup_value 1.62% : 0.000434s : 1: renormalize.infer 1.12% : 0.000301s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000054s : 1: rewriter_after_opt_a 0.23% : 0.000062s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.44% : 0.000117s : 1: symbol_engine_optimizer 0.39% : 0.000105s : 1: tuple_transform 18.95% : 0.005090s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:42.827.413 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0154273, [21] [bootstrap]: 0.00047535 [type_inference]: 0.00532146 [event_method]: 1.333e-05 [auto_monad]: 6.117e-05 [graph_reusing]: 5.05001e-06 [inline]: 2.60002e-06 [add_attr]: 0.0035008, [1] [add_attr_with_inline]: 0.00348913, [1] [Cycle 1]: 6.762e-05, [2] [tag_attr]: 1.882e-05 [meta_addattr_fg_expand]: 4.29997e-06 [parallel-infer-symbol]: 3.66001e-06 [pre_auto_parallel]: 3.176e-05 [insert-virtual-dataset]: 2.49999e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 2.02999e-06 [pipeline_split]: 1.87999e-06 [optimize]: 0.00523794, [53] [py_interpret_to_execute]: 2.171e-05 [rewriter_before_opt_a]: 5.791e-05 [opt_a]: 0.00276159, [2] [Cycle 1]: 0.0019227, [45] [expand_dump_flag]: 2.79999e-06 [switch_simplify]: 2.815e-05 [loop_unroll]: 1.528e-05 [a_1]: 0.00036819 [with_stream_mark]: 2.055e-05 [recompute_prepare]: 1.155e-05 [updatestate_depend_eliminate]: 4.30999e-06 [updatestate_assign_eliminate]: 3.6e-06 [updatestate_loads_eliminate]: 4.02002e-06 [parameter_eliminate]: 2.36e-06 [a_2]: 9.715e-05 [accelerated_algorithm]: 8.71002e-06 [shard]: 2.16998e-06 [meta_shard_fg_expand]: 2.19001e-06 [shard_inline]: 8.52e-06 [merge_send_recv]: 9.46003e-06 [auto_parallel]: 7.73001e-06 [parallel]: 1.948e-05 [flash_sp]: 1.001e-05 [merge_comm]: 4.55999e-06 [allreduce_fusion]: 4.32e-06 [matmul_add_comm_reduction]: 1.083e-05 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 1.097e-05 [virtual_dataset]: 7.7e-06 [get_grad_eliminate_]: 7.41999e-06 [virtual_output]: 7.4e-06 [merge_forward]: 4.55001e-06 [cell_reuse_recompute_pass]: 1.17e-06 [offload_activation]: 1.078e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.618e-05 [merge_recompute_call_nodes]: 1.79998e-06 [before_grad]: 1.327e-05 [set_forward_comm_id_for_comm_node_pass]: 5.17e-06 [meta_fg_expand]: 3.50003e-06 [flash_sp_send_recv_attached]: 2.99999e-06 [receive_attached]: 2.27999e-06 [after_resolve]: 1.163e-05 [a_after_grad]: 1.08e-05 [renormalize]: 0.00076587 [add_forward_monad_depend]: 7.01001e-06 [auto_monad_grad]: 2.58e-06 [auto_monad_eliminator]: 1.917e-05 [cse]: 4.141e-05 [a_3]: 6.321e-05 [Cycle 2]: 0.00082563, [45] [expand_dump_flag]: 1.92999e-06 [switch_simplify]: 1.024e-05 [loop_unroll]: 7.03998e-06 [a_1]: 0.00015722 [with_stream_mark]: 1.678e-05 [recompute_prepare]: 7.70998e-06 [updatestate_depend_eliminate]: 4.32998e-06 [updatestate_assign_eliminate]: 3.30003e-06 [updatestate_loads_eliminate]: 3.04001e-06 [parameter_eliminate]: 1.79998e-06 [a_2]: 8.772e-05 [accelerated_algorithm]: 8.68001e-06 [shard]: 1.58002e-06 [meta_shard_fg_expand]: 2.09999e-06 [shard_inline]: 7.2e-06 [merge_send_recv]: 8e-06 [auto_parallel]: 8.13999e-06 [parallel]: 7.28e-06 [flash_sp]: 3.92002e-06 [merge_comm]: 4.12e-06 [allreduce_fusion]: 4.29002e-06 [matmul_add_comm_reduction]: 9.41e-06 [allreduce_slice_to_reducescatter]: 3.9002e-07 [virtual_shard_identity]: 9.22001e-06 [virtual_dataset]: 6.60997e-06 [get_grad_eliminate_]: 6.32001e-06 [virtual_output]: 6.45002e-06 [merge_forward]: 4.53001e-06 [cell_reuse_recompute_pass]: 2.22999e-06 [offload_activation]: 9.89001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.719e-05 [merge_recompute_call_nodes]: 1.06997e-06 [before_grad]: 1.226e-05 [set_forward_comm_id_for_comm_node_pass]: 4.90999e-06 [meta_fg_expand]: 2.98e-06 [flash_sp_send_recv_attached]: 1.60999e-06 [receive_attached]: 1.76998e-06 [after_resolve]: 1.138e-05 [a_after_grad]: 1.019e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 3.35e-06 [auto_monad_grad]: 1.59998e-06 [auto_monad_eliminator]: 1.246e-05 [cse]: 6.003e-05 [a_3]: 4.673e-05 [py_interpret_to_execute_after_opt_a]: 1.638e-05 [slice_cell_reuse_recomputed_activation]: 2.27999e-06 [rewriter_after_opt_a]: 4.773e-05 [convert_after_rewriter]: 7.65e-06 [order_py_execute_after_rewriter]: 6.65998e-06 [mutable_eliminate]: 0.00071574 [opt_b]: 0.00026187, [1] [Cycle 1]: 0.00025356, [7] [b_1]: 0.00014867 [b_2]: 9.57999e-06 [updatestate_depend_eliminate]: 9.79999e-06 [updatestate_assign_eliminate]: 3.34001e-06 [updatestate_loads_eliminate]: 3.63e-06 [renormalize]: 7.30011e-07 [cse]: 3.846e-05 [optimize_parallel_all_gather_comm]: 2.111e-05 [overlap_param_gather]: 2.07001e-06 [cconv]: 3.657e-05 [loop_unroll]: 0.00050365 [opt_after_cconv]: 0.00012771, [1] [Cycle 1]: 0.00012081, [7] [c_1]: 3.433e-05 [parameter_eliminate]: 4.48001e-06 [updatestate_depend_eliminate]: 8.15999e-06 [updatestate_assign_eliminate]: 3.48e-06 [updatestate_loads_eliminate]: 3.16001e-06 [cse]: 2.971e-05 [renormalize]: 5.40022e-07 [remove_dup_value]: 1.732e-05 [tuple_transform]: 8.796e-05, [1] [Cycle 1]: 8.288e-05, [4] [d_1]: 5.243e-05 [none_parameter_eliminate]: 1.72999e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 8.36002e-06 [partial_unused_args_eliminate]: 2.30002e-06 [add_recomputation]: 6.436e-05 [cse_after_recomputation]: 2.824e-05, [1] [Cycle 1]: 2.293e-05, [1] [cse]: 1.648e-05 [environ_conv]: 8.07e-06 [swap_dp_allreduce_reducescatter]: 6.33e-06 [bias_add_comm_swap]: 3.13e-06 [label_micro_interleaved_index]: 4.99998e-06 [label_fine_grained_interleaved_index]: 2.68e-06 [merge_cast_opt]: 1.35999e-06 [slice_recompute_activation]: 1.93002e-06 [micro_interleaved_order_control]: 2.34999e-06 [assign_add_opt]: 1.59998e-06 [ForceFp32Comm]: 8.29983e-07 [remove_cast_before_assign_add]: 1.07e-06 [full_micro_interleaved_order_control]: 2.43002e-06 [reorder_send_recv_between_fp_bp]: 2.43e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 9.29984e-07 [interleave_split_concat_branches]: 1.12999e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.84998e-06 [overlap_opt_shard_grad_in_pipeline]: 2.11998e-06 [control_data_broadcast_order]: 1.551e-05 [grouped_pairwise_exchange_alltoall]: 1.89e-06 [offloading_packed_experts]: 5.49e-06 [overlap_recompute_and_grad_model_parallel]: 5.71e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.21997e-06 [overlap_recompute_allgather_and_fa_grad]: 1.36002e-06 [overlap_recompute_comm]: 2.11e-06 [overlap_grad_ring_attention]: 4.95001e-06 [overlap_grad_flash_sp]: 2.407e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.17001e-06 [split_layernorm_comm]: 1.54e-06 [handle_group_info]: 9.80013e-07 [symbol_engine_optimizer]: 9.282e-05, [1] [Cycle 1]: 8.78e-05, [6] [build]: 3.83999e-06 [elim_shapecalc]: 1.297e-05 [elim_not_effective]: 1.568e-05 [opt_reshape]: 9.12001e-06 [fold_const_symbol]: 1.287e-05 [renormalize]: 5.50004e-07 [detach_backward]: 2.19999e-06 [pipeline_parallel_scheduler]: 1.77999e-06 [auto_monad_reorder]: 2.08e-05 [get_jit_bprop_graph]: 2.19999e-06 [rewriter_after_jit_bprop_graph]: 6.01e-06 [opt_after_jit_grad]: 0.00051428 [validate]: 4.7e-05 Sums bootstrap : 0.000475s : 4.37% type_inference : 0.005321s : 48.95% event_method : 0.000013s : 0.12% auto_monad : 0.000061s : 0.56% graph_reusing : 0.000005s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000032s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000022s : 0.20% optimize.rewriter_before_opt_a : 0.000058s : 0.53% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000038s : 0.35% optimize.opt_a.loop_unroll : 0.000022s : 0.21% optimize.opt_a.a_1 : 0.000525s : 4.83% optimize.opt_a.with_stream_mark : 0.000037s : 0.34% optimize.opt_a.recompute_prepare : 0.000019s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000185s : 1.70% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.16% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.14% optimize.opt_a.merge_send_recv : 0.000017s : 0.16% optimize.opt_a.auto_parallel : 0.000016s : 0.15% optimize.opt_a.parallel : 0.000027s : 0.25% optimize.opt_a.flash_sp : 0.000014s : 0.13% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.19% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.19% optimize.opt_a.virtual_dataset : 0.000014s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000026s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000023s : 0.21% optimize.opt_a.a_after_grad : 0.000021s : 0.19% optimize.opt_a.renormalize : 0.000766s : 7.05% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.10% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.29% optimize.opt_a.cse : 0.000101s : 0.93% optimize.opt_a.a_3 : 0.000110s : 1.01% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000048s : 0.44% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000007s : 0.06% optimize.mutable_eliminate : 0.000716s : 6.58% optimize.opt_b.b_1 : 0.000149s : 1.37% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000038s : 0.35% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.19% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000037s : 0.34% optimize.loop_unroll : 0.000504s : 4.63% optimize.opt_after_cconv.c_1 : 0.000034s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000030s : 0.27% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.16% optimize.tuple_transform.d_1 : 0.000052s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000064s : 0.59% optimize.cse_after_recomputation.cse : 0.000016s : 0.15% optimize.environ_conv : 0.000008s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000024s : 0.22% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.01% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000021s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.06% opt_after_jit_grad : 0.000514s : 4.73% validate : 0.000047s : 0.43% Time group info: ------[substitution.] 0.000168 28 14.17% : 0.000024s : 2: substitution.cast_eliminate 1.38% : 0.000002s : 3: substitution.elim_not_effective 1.00% : 0.000002s : 3: substitution.fold_const_symbol 4.16% : 0.000007s : 4: substitution.graph_param_transform 69.91% : 0.000118s : 2: substitution.inline 2.91% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.82% : 0.000006s : 6: substitution.remove_not_recompute_node 2.64% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.005265 2 90.74% : 0.004777s : 1: type_inference.infer 9.26% : 0.000488s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000116 2 100.00% : 0.000116s : 2: match.inline ------[predicate.] 0.000176 980 0.77% : 0.000001s : 9: predicate.accumulaten_eliminater 1.10% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.70% : 0.000001s : 8: predicate.addn_check_dump 0.88% : 0.000002s : 9: predicate.addn_zero_filter 0.66% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.15% : 0.000004s : 17: predicate.arithmetic_simplify 0.97% : 0.000002s : 9: predicate.cast_eliminate 0.91% : 0.000002s : 8: predicate.check_bprop_eliminate 0.69% : 0.000001s : 8: predicate.compare_switch_simplify 0.22% : 0.000000s : 4: predicate.const_output_eliminate 0.77% : 0.000001s : 8: predicate.depend_value_elim 0.79% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.92% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.76% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.20% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.27% : 0.000000s : 4: predicate.elim_not_effective 0.64% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.12% : 0.000002s : 13: predicate.environ_get_add_eliminate 0.98% : 0.000002s : 13: predicate.environ_get_depend_swap 1.69% : 0.000003s : 21: predicate.environ_get_eliminate 1.02% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.90% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.70% : 0.000003s : 11: predicate.float_depend_g_call 0.66% : 0.000001s : 8: predicate.float_environ_get_switch 0.93% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 4: predicate.fold_const_symbol 0.87% : 0.000002s : 8: predicate.get_grad_eliminate 0.33% : 0.000001s : 4: predicate.graph_param_transform 0.77% : 0.000001s : 8: predicate.incorporate_call 0.62% : 0.000001s : 8: predicate.incorporate_call_switch 6.43% : 0.000011s : 44: predicate.inline 1.05% : 0.000002s : 8: predicate.inline_without_move 0.37% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.11% : 0.000002s : 8: predicate.less_batch_normalization 1.54% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.07% : 0.000004s : 26: predicate.load_eliminater 1.59% : 0.000003s : 4: predicate.loop_unroll_after_grad 1.45% : 0.000003s : 16: predicate.loop_unroll_before_grad 2.00% : 0.000004s : 17: predicate.make_slice_get_slice_eliminator 0.70% : 0.000001s : 8: predicate.merge_addn 0.77% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.72% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.68% : 0.000001s : 9: predicate.minmaximum_grad 1.88% : 0.000003s : 4: predicate.mutable_eliminate 0.37% : 0.000001s : 4: predicate.opt_reshape 0.47% : 0.000001s : 4: predicate.parallel_virtual_node 1.26% : 0.000002s : 11: predicate.partial_defer_inline 1.29% : 0.000002s : 13: predicate.partial_eliminate 0.81% : 0.000001s : 9: predicate.print_const_string_wrapper 0.74% : 0.000001s : 8: predicate.reduce_all_const_elim 1.12% : 0.000002s : 9: predicate.reduce_eliminate 2.02% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.59% : 0.000001s : 8: predicate.remove_not_recompute_node 1.41% : 0.000002s : 17: predicate.replace_applicator 0.82% : 0.000001s : 8: predicate.replace_old_param 0.45% : 0.000001s : 4: predicate.reset_defer_inline 0.86% : 0.000002s : 9: predicate.reshape_eliminate 0.79% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 4: predicate.row_tensor_eliminate 1.18% : 0.000002s : 8: predicate.same_eliminate 0.51% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.08% : 0.000002s : 8: predicate.shard_identity_eliminate 0.87% : 0.000002s : 8: predicate.special_op_eliminate 0.93% : 0.000002s : 8: predicate.specialize_transform 1.00% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.96% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.93% : 0.000002s : 11: predicate.switch_defer_inline 1.59% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.50% : 0.000008s : 39: predicate.switch_simplify 0.73% : 0.000001s : 9: predicate.tile_eliminate 0.82% : 0.000001s : 9: predicate.transpose_eliminate 1.52% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.39% : 0.000006s : 25: predicate.tuple_list_get_item_eliminator 1.63% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.36% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.46% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.02% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.86% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.52% : 0.000001s : 4: predicate.value_based_eliminate 0.83% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.76% : 0.000001s : 8: predicate.virtual_output_eliminate 0.34% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.51% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000253 5 9.15% : 0.000023s : 1: func_graph_cloner_run.FuncGraphClonerGraph 90.85% : 0.000230s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026116 192 0.01% : 0.000004s : 1: ForceFp32Comm 13.43% : 0.003507s : 1: add_attr 13.38% : 0.003494s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.26% : 0.000069s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.25% : 0.000066s : 1: auto_monad 0.09% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.95% : 0.000509s : 1: bootstrap 0.16% : 0.000041s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.12% : 0.000032s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.07% : 0.000019s : 1: event_method 0.02% : 0.000006s : 1: full_micro_interleaved_order_control 0.02% : 0.000006s : 1: get_jit_bprop_graph 0.03% : 0.000008s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.97% : 0.000514s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.79% : 0.000729s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.09% : 0.000023s : 1: opt.transform.mutable_eliminate 3.81% : 0.000996s : 78: opt.transform.opt_a 0.13% : 0.000033s : 1: opt.transform.opt_after_cconv 0.12% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.48% : 0.000125s : 28: opt.transform.opt_b 0.22% : 0.000058s : 2: opt.transform.opt_trans_graph 0.18% : 0.000046s : 4: opt.transform.symbol_engine_opt 10.59% : 0.002766s : 1: opt_a 0.50% : 0.000131s : 1: opt_after_cconv 2.01% : 0.000524s : 1: opt_after_jit_grad 1.02% : 0.000266s : 1: opt_b 20.09% : 0.005246s : 1: optimize 0.10% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000010s : 1: order_py_execute_after_rewriter 0.11% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.14% : 0.000036s : 1: pre_auto_parallel 0.10% : 0.000025s : 1: py_interpret_to_execute 0.08% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000021s : 1: remove_dup_value 1.71% : 0.000447s : 1: renormalize.infer 1.19% : 0.000310s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000053s : 1: rewriter_after_opt_a 0.24% : 0.000062s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.37% : 0.000096s : 1: symbol_engine_optimizer 0.35% : 0.000091s : 1: tuple_transform 20.47% : 0.005346s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:43.341.16 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:43.344.09 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.017658, [21] [bootstrap]: 0.00046364 [type_inference]: 0.00548708 [event_method]: 1.515e-05 [auto_monad]: 6.458e-05 [graph_reusing]: 5.41002e-06 [inline]: 3.01999e-06 [add_attr]: 0.00377897, [1] [add_attr_with_inline]: 0.00376691, [1] [Cycle 1]: 9.315e-05, [2] [tag_attr]: 1.845e-05 [meta_addattr_fg_expand]: 4.07003e-06 [parallel-infer-symbol]: 3.54002e-06 [pre_auto_parallel]: 3.19e-05 [insert-virtual-dataset]: 2.68e-06 [parallel-infer-symbol-second]: 8.09989e-07 [dataset_repeat_opt]: 2.07999e-06 [pipeline_split]: 1.54e-06 [optimize]: 0.00632485, [53] [py_interpret_to_execute]: 2.467e-05 [rewriter_before_opt_a]: 6.148e-05 [opt_a]: 0.00327831, [2] [Cycle 1]: 0.00227144, [45] [expand_dump_flag]: 2.93e-06 [switch_simplify]: 2.68e-05 [loop_unroll]: 1.493e-05 [a_1]: 0.00038702 [with_stream_mark]: 1.812e-05 [recompute_prepare]: 1.137e-05 [updatestate_depend_eliminate]: 4.68001e-06 [updatestate_assign_eliminate]: 4.73001e-06 [updatestate_loads_eliminate]: 3.83999e-06 [parameter_eliminate]: 1.82001e-06 [a_2]: 0.00012854 [accelerated_algorithm]: 8.94e-06 [shard]: 2.44001e-06 [meta_shard_fg_expand]: 2.06e-06 [shard_inline]: 7.66999e-06 [merge_send_recv]: 9.81e-06 [auto_parallel]: 8.40999e-06 [parallel]: 1.891e-05 [flash_sp]: 9.97999e-06 [merge_comm]: 5.09e-06 [allreduce_fusion]: 5.36002e-06 [matmul_add_comm_reduction]: 1.13e-05 [allreduce_slice_to_reducescatter]: 7.59988e-07 [virtual_shard_identity]: 1.086e-05 [virtual_dataset]: 9.14998e-06 [get_grad_eliminate_]: 7.13998e-06 [virtual_output]: 8.74e-06 [merge_forward]: 4.70999e-06 [cell_reuse_recompute_pass]: 1.91e-06 [offload_activation]: 1.119e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.986e-05 [merge_recompute_call_nodes]: 1.71e-06 [before_grad]: 1.434e-05 [set_forward_comm_id_for_comm_node_pass]: 5.70001e-06 [meta_fg_expand]: 3.65e-06 [flash_sp_send_recv_attached]: 2.54999e-06 [receive_attached]: 2.10002e-06 [after_resolve]: 1.369e-05 [a_after_grad]: 1.184e-05 [renormalize]: 0.0008769 [add_forward_monad_depend]: 7.41001e-06 [auto_monad_grad]: 2.98e-06 [auto_monad_eliminator]: 2.014e-05 [cse]: 3.85e-05 [a_3]: 7.663e-05 [Cycle 2]: 0.00099033, [45] [expand_dump_flag]: 2.17999e-06 [switch_simplify]: 9.30001e-06 [loop_unroll]: 7.23e-06 [a_1]: 0.00016611 [with_stream_mark]: 2.065e-05 [recompute_prepare]: 7.75e-06 [updatestate_depend_eliminate]: 5.48002e-06 [updatestate_assign_eliminate]: 3.7e-06 [updatestate_loads_eliminate]: 3.24001e-06 [parameter_eliminate]: 2.00002e-06 [a_2]: 0.00011663 [accelerated_algorithm]: 7.88001e-06 [shard]: 2.44001e-06 [meta_shard_fg_expand]: 2.19001e-06 [shard_inline]: 7.57002e-06 [merge_send_recv]: 8.43001e-06 [auto_parallel]: 9.44e-06 [parallel]: 7.42002e-06 [flash_sp]: 4.27e-06 [merge_comm]: 4.22e-06 [allreduce_fusion]: 4.47998e-06 [matmul_add_comm_reduction]: 1.018e-05 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 8.56002e-06 [virtual_dataset]: 7.43e-06 [get_grad_eliminate_]: 7.33e-06 [virtual_output]: 8.97999e-06 [merge_forward]: 5.34998e-06 [cell_reuse_recompute_pass]: 3.29001e-06 [offload_activation]: 1.141e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.825e-05 [merge_recompute_call_nodes]: 1.40001e-06 [before_grad]: 1.304e-05 [set_forward_comm_id_for_comm_node_pass]: 6.49001e-06 [meta_fg_expand]: 3.51999e-06 [flash_sp_send_recv_attached]: 1.37e-06 [receive_attached]: 1.62999e-06 [after_resolve]: 1.11e-05 [a_after_grad]: 1.055e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.73e-06 [auto_monad_grad]: 1.15001e-06 [auto_monad_eliminator]: 1.222e-05 [cse]: 2.733e-05 [a_3]: 5.804e-05 [py_interpret_to_execute_after_opt_a]: 2.033e-05 [slice_cell_reuse_recomputed_activation]: 4.92e-06 [rewriter_after_opt_a]: 5.308e-05 [convert_after_rewriter]: 1.118e-05 [order_py_execute_after_rewriter]: 1.046e-05 [mutable_eliminate]: 0.00083441 [opt_b]: 0.00034317, [1] [Cycle 1]: 0.00033039, [7] [b_1]: 0.00019993 [b_2]: 9.99999e-06 [updatestate_depend_eliminate]: 9.81e-06 [updatestate_assign_eliminate]: 3.83001e-06 [updatestate_loads_eliminate]: 3.38e-06 [renormalize]: 7.10017e-07 [cse]: 4.068e-05 [optimize_parallel_all_gather_comm]: 2.657e-05 [overlap_param_gather]: 5.39998e-06 [cconv]: 4.076e-05 [loop_unroll]: 0.0006053 [opt_after_cconv]: 0.00016248, [1] [Cycle 1]: 0.00015055, [7] [c_1]: 3.735e-05 [parameter_eliminate]: 6.19001e-06 [updatestate_depend_eliminate]: 8.37998e-06 [updatestate_assign_eliminate]: 3.21001e-06 [updatestate_loads_eliminate]: 4.23999e-06 [cse]: 3.124e-05 [renormalize]: 8.09989e-07 [remove_dup_value]: 2.259e-05 [tuple_transform]: 0.00013876, [1] [Cycle 1]: 0.00013041, [4] [d_1]: 8.553e-05 [none_parameter_eliminate]: 2.41e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.90001e-06 [partial_unused_args_eliminate]: 4.95999e-06 [add_recomputation]: 6.847e-05 [cse_after_recomputation]: 3.92e-05, [1] [Cycle 1]: 3.192e-05, [1] [cse]: 2.166e-05 [environ_conv]: 1.162e-05 [swap_dp_allreduce_reducescatter]: 1.004e-05 [bias_add_comm_swap]: 5.75001e-06 [label_micro_interleaved_index]: 1.015e-05 [label_fine_grained_interleaved_index]: 5.15001e-06 [merge_cast_opt]: 4.22e-06 [slice_recompute_activation]: 4.99998e-06 [micro_interleaved_order_control]: 5.04e-06 [assign_add_opt]: 4.13999e-06 [ForceFp32Comm]: 3.9e-06 [remove_cast_before_assign_add]: 3.63e-06 [full_micro_interleaved_order_control]: 4.80001e-06 [reorder_send_recv_between_fp_bp]: 5.45001e-06 [comm_op_add_attrs]: 3.62998e-06 [add_comm_op_reuse_tag]: 3.64002e-06 [interleave_split_concat_branches]: 3.51999e-06 [interleave_parallel_branches]: 3.56001e-06 [overlap_opt_shard_in_pipeline]: 4.02002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.50001e-06 [control_data_broadcast_order]: 1.998e-05 [grouped_pairwise_exchange_alltoall]: 4.27e-06 [offloading_packed_experts]: 7.63001e-06 [overlap_recompute_and_grad_model_parallel]: 7.84002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.54002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.99002e-06 [overlap_recompute_comm]: 5.74999e-06 [overlap_grad_ring_attention]: 6.86999e-06 [overlap_grad_flash_sp]: 2.875e-05 [begin_end_overlap_inline]: 3.45e-06 [split_matmul_comm_elemetwise]: 4.89003e-06 [split_layernorm_comm]: 4.22998e-06 [handle_group_info]: 3.51001e-06 [symbol_engine_optimizer]: 0.00012489, [1] [Cycle 1]: 0.00011636, [6] [build]: 3.84002e-06 [elim_shapecalc]: 1.739e-05 [elim_not_effective]: 1.764e-05 [opt_reshape]: 9.50001e-06 [fold_const_symbol]: 1.366e-05 [renormalize]: 2.19996e-07 [detach_backward]: 4.88001e-06 [pipeline_parallel_scheduler]: 2.17999e-06 [auto_monad_reorder]: 2.738e-05 [get_jit_bprop_graph]: 2.16e-06 [rewriter_after_jit_bprop_graph]: 7.47998e-06 [opt_after_jit_grad]: 0.00071772 [validate]: 5.382e-05 Sums bootstrap : 0.000464s : 3.88% type_inference : 0.005487s : 45.94% event_method : 0.000015s : 0.13% auto_monad : 0.000065s : 0.54% graph_reusing : 0.000005s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.03% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000032s : 0.27% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000025s : 0.21% optimize.rewriter_before_opt_a : 0.000061s : 0.51% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000036s : 0.30% optimize.opt_a.loop_unroll : 0.000022s : 0.19% optimize.opt_a.a_1 : 0.000553s : 4.63% optimize.opt_a.with_stream_mark : 0.000039s : 0.32% optimize.opt_a.recompute_prepare : 0.000019s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000245s : 2.05% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.14% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.13% optimize.opt_a.merge_send_recv : 0.000018s : 0.15% optimize.opt_a.auto_parallel : 0.000018s : 0.15% optimize.opt_a.parallel : 0.000026s : 0.22% optimize.opt_a.flash_sp : 0.000014s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000010s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.16% optimize.opt_a.virtual_dataset : 0.000017s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.12% optimize.opt_a.virtual_output : 0.000018s : 0.15% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.04% optimize.opt_a.offload_activation : 0.000023s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000027s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.10% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.21% optimize.opt_a.a_after_grad : 0.000022s : 0.19% optimize.opt_a.renormalize : 0.000877s : 7.34% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.27% optimize.opt_a.cse : 0.000066s : 0.55% optimize.opt_a.a_3 : 0.000135s : 1.13% optimize.py_interpret_to_execute_after_opt_a : 0.000020s : 0.17% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000053s : 0.44% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000010s : 0.09% optimize.mutable_eliminate : 0.000834s : 6.99% optimize.opt_b.b_1 : 0.000200s : 1.67% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000041s : 0.34% optimize.optimize_parallel_all_gather_comm : 0.000027s : 0.22% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000041s : 0.34% optimize.loop_unroll : 0.000605s : 5.07% optimize.opt_after_cconv.c_1 : 0.000037s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.cse : 0.000031s : 0.26% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000023s : 0.19% optimize.tuple_transform.d_1 : 0.000086s : 0.72% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000068s : 0.57% optimize.cse_after_recomputation.cse : 0.000022s : 0.18% optimize.environ_conv : 0.000012s : 0.10% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.08% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000010s : 0.08% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000020s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000008s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000006s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000029s : 0.24% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.15% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000027s : 0.23% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000007s : 0.06% opt_after_jit_grad : 0.000718s : 6.01% validate : 0.000054s : 0.45% Time group info: ------[substitution.] 0.000178 28 15.40% : 0.000027s : 2: substitution.cast_eliminate 1.48% : 0.000003s : 3: substitution.elim_not_effective 1.17% : 0.000002s : 3: substitution.fold_const_symbol 4.14% : 0.000007s : 4: substitution.graph_param_transform 68.59% : 0.000122s : 2: substitution.inline 2.92% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.64% : 0.000006s : 6: substitution.remove_not_recompute_node 2.67% : 0.000005s : 2: substitution.replace_old_param ------[type_inference.] 0.005428 2 90.66% : 0.004921s : 1: type_inference.infer 9.34% : 0.000507s : 1: type_inference.specialize ------[replace.] 0.000025 2 100.00% : 0.000025s : 2: replace.inline ------[match.] 0.000121 2 100.00% : 0.000121s : 2: match.inline ------[predicate.] 0.000192 980 0.84% : 0.000002s : 9: predicate.accumulaten_eliminater 1.08% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.69% : 0.000001s : 8: predicate.addn_check_dump 1.13% : 0.000002s : 9: predicate.addn_zero_filter 0.67% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.44% : 0.000005s : 17: predicate.arithmetic_simplify 0.93% : 0.000002s : 9: predicate.cast_eliminate 0.67% : 0.000001s : 8: predicate.check_bprop_eliminate 0.65% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000000s : 4: predicate.const_output_eliminate 0.73% : 0.000001s : 8: predicate.depend_value_elim 0.73% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.86% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.72% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.36% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.41% : 0.000001s : 4: predicate.elim_not_effective 0.48% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000002s : 13: predicate.environ_add_const_eliminate 0.95% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.00% : 0.000002s : 13: predicate.environ_get_depend_swap 1.64% : 0.000003s : 21: predicate.environ_get_eliminate 1.19% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.97% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.74% : 0.000003s : 11: predicate.float_depend_g_call 0.66% : 0.000001s : 8: predicate.float_environ_get_switch 0.93% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.80% : 0.000002s : 8: predicate.get_grad_eliminate 0.31% : 0.000001s : 4: predicate.graph_param_transform 0.72% : 0.000001s : 8: predicate.incorporate_call 0.56% : 0.000001s : 8: predicate.incorporate_call_switch 6.05% : 0.000012s : 44: predicate.inline 1.02% : 0.000002s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.00% : 0.000002s : 8: predicate.less_batch_normalization 1.59% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.32% : 0.000004s : 26: predicate.load_eliminater 1.01% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.37% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.79% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.95% : 0.000002s : 8: predicate.merge_addn 0.77% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.72% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.68% : 0.000001s : 9: predicate.minmaximum_grad 1.97% : 0.000004s : 4: predicate.mutable_eliminate 0.41% : 0.000001s : 4: predicate.opt_reshape 0.48% : 0.000001s : 4: predicate.parallel_virtual_node 1.25% : 0.000002s : 11: predicate.partial_defer_inline 1.13% : 0.000002s : 13: predicate.partial_eliminate 0.82% : 0.000002s : 9: predicate.print_const_string_wrapper 0.78% : 0.000001s : 8: predicate.reduce_all_const_elim 1.12% : 0.000002s : 9: predicate.reduce_eliminate 1.90% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 8: predicate.remove_not_recompute_node 1.11% : 0.000002s : 17: predicate.replace_applicator 0.70% : 0.000001s : 8: predicate.replace_old_param 0.39% : 0.000001s : 4: predicate.reset_defer_inline 0.92% : 0.000002s : 9: predicate.reshape_eliminate 0.81% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 4: predicate.row_tensor_eliminate 0.99% : 0.000002s : 8: predicate.same_eliminate 0.47% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.17% : 0.000002s : 8: predicate.shard_identity_eliminate 1.08% : 0.000002s : 8: predicate.special_op_eliminate 1.24% : 0.000002s : 8: predicate.specialize_transform 1.13% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.98% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.49% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.88% : 0.000002s : 11: predicate.switch_defer_inline 1.56% : 0.000003s : 19: predicate.switch_layer_defer_inline 3.87% : 0.000007s : 39: predicate.switch_simplify 0.74% : 0.000001s : 9: predicate.tile_eliminate 0.82% : 0.000002s : 9: predicate.transpose_eliminate 1.64% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.65% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.55% : 0.000007s : 25: predicate.tuple_list_get_item_eliminator 1.65% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.98% : 0.000006s : 25: predicate.tuple_list_set_item_eliminator 1.35% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 1.95% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.77% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 4: predicate.value_based_eliminate 0.95% : 0.000002s : 8: predicate.virtual_dataset_eliminate 1.35% : 0.000003s : 8: predicate.virtual_output_eliminate 0.32% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.62% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000267 5 9.05% : 0.000024s : 1: func_graph_cloner_run.FuncGraphClonerGraph 90.95% : 0.000243s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.029901 192 0.02% : 0.000007s : 1: ForceFp32Comm 12.68% : 0.003791s : 1: add_attr 12.61% : 0.003771s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.24% : 0.000073s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.25% : 0.000074s : 1: auto_monad 0.12% : 0.000034s : 1: auto_monad_reorder 0.02% : 0.000007s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.70% : 0.000509s : 1: bootstrap 0.15% : 0.000044s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000024s : 1: control_data_broadcast_order 0.05% : 0.000015s : 1: convert_after_rewriter 0.14% : 0.000043s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000025s : 1: detach_backward 0.05% : 0.000015s : 1: environ_conv 0.09% : 0.000026s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000013s : 1: label_micro_interleaved_index 2.05% : 0.000614s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.82% : 0.000843s : 1: mutable_eliminate 0.04% : 0.000011s : 1: offloading_packed_experts 0.08% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.09% : 0.000026s : 1: opt.transform.mutable_eliminate 3.46% : 0.001034s : 78: opt.transform.opt_a 0.12% : 0.000035s : 1: opt.transform.opt_after_cconv 0.12% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.44% : 0.000132s : 28: opt.transform.opt_b 0.30% : 0.000091s : 2: opt.transform.opt_trans_graph 0.18% : 0.000053s : 4: opt.transform.symbol_engine_opt 10.98% : 0.003283s : 1: opt_a 0.56% : 0.000166s : 1: opt_after_cconv 2.45% : 0.000732s : 1: opt_after_jit_grad 1.16% : 0.000347s : 1: opt_b 22.46% : 0.006716s : 1: optimize 0.10% : 0.000030s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000013s : 1: order_py_execute_after_rewriter 0.11% : 0.000033s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000009s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.13% : 0.000039s : 1: pre_auto_parallel 0.10% : 0.000028s : 1: py_interpret_to_execute 0.08% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000026s : 1: remove_dup_value 1.76% : 0.000526s : 1: renormalize.infer 1.13% : 0.000338s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.05% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000057s : 1: rewriter_after_opt_a 0.22% : 0.000065s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.43% : 0.000128s : 1: symbol_engine_optimizer 0.47% : 0.000142s : 1: tuple_transform 18.48% : 0.005526s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:43.240.489 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0147803, [21] [bootstrap]: 0.00044642 [type_inference]: 0.00505714 [event_method]: 1.3e-05 [auto_monad]: 5.638e-05 [graph_reusing]: 5.49998e-06 [inline]: 2.26e-06 [add_attr]: 0.00344793, [1] [add_attr_with_inline]: 0.00343647, [1] [Cycle 1]: 6.446e-05, [2] [tag_attr]: 1.716e-05 [meta_addattr_fg_expand]: 3.95998e-06 [parallel-infer-symbol]: 3.53e-06 [pre_auto_parallel]: 3.067e-05 [insert-virtual-dataset]: 2.46998e-06 [parallel-infer-symbol-second]: 1.40001e-06 [dataset_repeat_opt]: 2.05002e-06 [pipeline_split]: 1.79e-06 [optimize]: 0.00496911, [53] [py_interpret_to_execute]: 2.17e-05 [rewriter_before_opt_a]: 5.259e-05 [opt_a]: 0.00267945, [2] [Cycle 1]: 0.00193076, [45] [expand_dump_flag]: 2.92002e-06 [switch_simplify]: 2.82e-05 [loop_unroll]: 1.481e-05 [a_1]: 0.0003934 [with_stream_mark]: 1.813e-05 [recompute_prepare]: 1.091e-05 [updatestate_depend_eliminate]: 5.14e-06 [updatestate_assign_eliminate]: 4.25e-06 [updatestate_loads_eliminate]: 4.05e-06 [parameter_eliminate]: 1.97999e-06 [a_2]: 0.00012134 [accelerated_algorithm]: 9.98998e-06 [shard]: 2.63e-06 [meta_shard_fg_expand]: 2.29001e-06 [shard_inline]: 7.51999e-06 [merge_send_recv]: 9.34998e-06 [auto_parallel]: 8.17e-06 [parallel]: 1.873e-05 [flash_sp]: 9.47999e-06 [merge_comm]: 4.84e-06 [allreduce_fusion]: 4.77998e-06 [matmul_add_comm_reduction]: 1.018e-05 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 1.021e-05 [virtual_dataset]: 7.45e-06 [get_grad_eliminate_]: 7.64997e-06 [virtual_output]: 7.18e-06 [merge_forward]: 4.74e-06 [cell_reuse_recompute_pass]: 1.40999e-06 [offload_activation]: 1.065e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.623e-05 [merge_recompute_call_nodes]: 1.37999e-06 [before_grad]: 1.293e-05 [set_forward_comm_id_for_comm_node_pass]: 4.62998e-06 [meta_fg_expand]: 3.42002e-06 [flash_sp_send_recv_attached]: 2.94999e-06 [receive_attached]: 2.34001e-06 [after_resolve]: 1.081e-05 [a_after_grad]: 1.078e-05 [renormalize]: 0.00074931 [add_forward_monad_depend]: 4.85001e-06 [auto_monad_grad]: 2.14999e-06 [auto_monad_eliminator]: 1.665e-05 [cse]: 3.694e-05 [a_3]: 5.79e-05 [Cycle 2]: 0.00073907, [45] [expand_dump_flag]: 1.52001e-06 [switch_simplify]: 9.19e-06 [loop_unroll]: 7.26001e-06 [a_1]: 0.00015491 [with_stream_mark]: 1.658e-05 [recompute_prepare]: 7.9e-06 [updatestate_depend_eliminate]: 3.98999e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 2.94999e-06 [parameter_eliminate]: 1.04e-06 [a_2]: 8.857e-05 [accelerated_algorithm]: 7.82e-06 [shard]: 1.54e-06 [meta_shard_fg_expand]: 1.84998e-06 [shard_inline]: 7.36001e-06 [merge_send_recv]: 6.68e-06 [auto_parallel]: 7.45e-06 [parallel]: 4.87e-06 [flash_sp]: 3.58e-06 [merge_comm]: 4.03001e-06 [allreduce_fusion]: 4.08001e-06 [matmul_add_comm_reduction]: 7.11001e-06 [allreduce_slice_to_reducescatter]: 5.00004e-07 [virtual_shard_identity]: 8.11002e-06 [virtual_dataset]: 6.83e-06 [get_grad_eliminate_]: 6.87002e-06 [virtual_output]: 6.44001e-06 [merge_forward]: 3.61999e-06 [cell_reuse_recompute_pass]: 1.72999e-06 [offload_activation]: 8.29002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.428e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 1.233e-05 [set_forward_comm_id_for_comm_node_pass]: 4.67e-06 [meta_fg_expand]: 2.79001e-06 [flash_sp_send_recv_attached]: 8.70001e-07 [receive_attached]: 1.16002e-06 [after_resolve]: 9.59e-06 [a_after_grad]: 1.033e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.42999e-06 [auto_monad_grad]: 1.69e-06 [auto_monad_eliminator]: 8.87e-06 [cse]: 1.923e-05 [a_3]: 4.43e-05 [py_interpret_to_execute_after_opt_a]: 1.243e-05 [slice_cell_reuse_recomputed_activation]: 2.21e-06 [rewriter_after_opt_a]: 4.258e-05 [convert_after_rewriter]: 7.7e-06 [order_py_execute_after_rewriter]: 6.53e-06 [mutable_eliminate]: 0.00062902 [opt_b]: 0.0002418, [1] [Cycle 1]: 0.00023509, [7] [b_1]: 0.00014866 [b_2]: 9.71e-06 [updatestate_depend_eliminate]: 6.38e-06 [updatestate_assign_eliminate]: 2.99001e-06 [updatestate_loads_eliminate]: 2.79001e-06 [renormalize]: 6.30011e-07 [cse]: 2.644e-05 [optimize_parallel_all_gather_comm]: 1.765e-05 [overlap_param_gather]: 2.39001e-06 [cconv]: 2.695e-05 [loop_unroll]: 0.00047619 [opt_after_cconv]: 0.00011701, [1] [Cycle 1]: 0.00011124, [7] [c_1]: 3.583e-05 [parameter_eliminate]: 2.72001e-06 [updatestate_depend_eliminate]: 6.24999e-06 [updatestate_assign_eliminate]: 3.02002e-06 [updatestate_loads_eliminate]: 3.58e-06 [cse]: 2.442e-05 [renormalize]: 7.2e-07 [remove_dup_value]: 1.843e-05 [tuple_transform]: 8.255e-05, [1] [Cycle 1]: 7.809e-05, [4] [d_1]: 4.909e-05 [none_parameter_eliminate]: 1.76e-06 [renormalize]: 1.39989e-07 [switch_simplify]: 8.54e-06 [partial_unused_args_eliminate]: 1.75001e-06 [add_recomputation]: 5.603e-05 [cse_after_recomputation]: 2.766e-05, [1] [Cycle 1]: 2.273e-05, [1] [cse]: 1.689e-05 [environ_conv]: 6.90002e-06 [swap_dp_allreduce_reducescatter]: 5.95002e-06 [bias_add_comm_swap]: 2.66e-06 [label_micro_interleaved_index]: 5.27999e-06 [label_fine_grained_interleaved_index]: 2.76999e-06 [merge_cast_opt]: 1.44998e-06 [slice_recompute_activation]: 2.17999e-06 [micro_interleaved_order_control]: 2.43998e-06 [assign_add_opt]: 1.32999e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 9.79984e-07 [full_micro_interleaved_order_control]: 2.52001e-06 [reorder_send_recv_between_fp_bp]: 1.517e-05 [comm_op_add_attrs]: 1.07998e-06 [add_comm_op_reuse_tag]: 1.01997e-06 [interleave_split_concat_branches]: 1.33002e-06 [interleave_parallel_branches]: 1.13001e-06 [overlap_opt_shard_in_pipeline]: 1.12999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.96e-06 [control_data_broadcast_order]: 1.579e-05 [grouped_pairwise_exchange_alltoall]: 1.54998e-06 [offloading_packed_experts]: 4.80999e-06 [overlap_recompute_and_grad_model_parallel]: 5.89999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.26002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.60999e-06 [overlap_recompute_comm]: 2.53e-06 [overlap_grad_ring_attention]: 4.62998e-06 [overlap_grad_flash_sp]: 2.136e-05 [begin_end_overlap_inline]: 5.49975e-07 [split_matmul_comm_elemetwise]: 2.43e-06 [split_layernorm_comm]: 1.87999e-06 [handle_group_info]: 1.05001e-06 [symbol_engine_optimizer]: 9.422e-05, [1] [Cycle 1]: 8.928e-05, [6] [build]: 3.81999e-06 [elim_shapecalc]: 1.368e-05 [elim_not_effective]: 1.804e-05 [opt_reshape]: 9.78002e-06 [fold_const_symbol]: 1.237e-05 [renormalize]: 1.80007e-07 [detach_backward]: 1.95001e-06 [pipeline_parallel_scheduler]: 2.07999e-06 [auto_monad_reorder]: 2.038e-05 [get_jit_bprop_graph]: 1.66002e-06 [rewriter_after_jit_bprop_graph]: 4.35e-06 [opt_after_jit_grad]: 0.00050626 [validate]: 4.572e-05 Sums bootstrap : 0.000446s : 4.32% type_inference : 0.005057s : 48.92% event_method : 0.000013s : 0.13% auto_monad : 0.000056s : 0.55% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000017s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000031s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000022s : 0.21% optimize.rewriter_before_opt_a : 0.000053s : 0.51% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000037s : 0.36% optimize.opt_a.loop_unroll : 0.000022s : 0.21% optimize.opt_a.a_1 : 0.000548s : 5.30% optimize.opt_a.with_stream_mark : 0.000035s : 0.34% optimize.opt_a.recompute_prepare : 0.000019s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000210s : 2.03% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.17% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.14% optimize.opt_a.merge_send_recv : 0.000016s : 0.16% optimize.opt_a.auto_parallel : 0.000016s : 0.15% optimize.opt_a.parallel : 0.000024s : 0.23% optimize.opt_a.flash_sp : 0.000013s : 0.13% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.18% optimize.opt_a.virtual_dataset : 0.000014s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.14% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000020s : 0.20% optimize.opt_a.a_after_grad : 0.000021s : 0.20% optimize.opt_a.renormalize : 0.000749s : 7.25% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.25% optimize.opt_a.cse : 0.000056s : 0.54% optimize.opt_a.a_3 : 0.000102s : 0.99% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000043s : 0.41% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000007s : 0.06% optimize.mutable_eliminate : 0.000629s : 6.09% optimize.opt_b.b_1 : 0.000149s : 1.44% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000026s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000027s : 0.26% optimize.loop_unroll : 0.000476s : 4.61% optimize.opt_after_cconv.c_1 : 0.000036s : 0.35% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.24% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.18% optimize.tuple_transform.d_1 : 0.000049s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000056s : 0.54% optimize.cse_after_recomputation.cse : 0.000017s : 0.16% optimize.environ_conv : 0.000007s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000015s : 0.15% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.02% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.13% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.17% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.09% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000506s : 4.90% validate : 0.000046s : 0.44% Time group info: ------[substitution.] 0.000154 28 13.72% : 0.000021s : 2: substitution.cast_eliminate 1.71% : 0.000003s : 3: substitution.elim_not_effective 1.23% : 0.000002s : 3: substitution.fold_const_symbol 3.98% : 0.000006s : 4: substitution.graph_param_transform 69.65% : 0.000107s : 2: substitution.inline 3.22% : 0.000005s : 6: substitution.j_node_and_user_rematch 4.28% : 0.000007s : 6: substitution.remove_not_recompute_node 2.20% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.005007 2 90.61% : 0.004536s : 1: type_inference.infer 9.39% : 0.000470s : 1: type_inference.specialize ------[replace.] 0.000024 2 100.00% : 0.000024s : 2: replace.inline ------[match.] 0.000105 2 100.00% : 0.000105s : 2: match.inline ------[predicate.] 0.000181 980 0.80% : 0.000001s : 9: predicate.accumulaten_eliminater 0.91% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.82% : 0.000001s : 8: predicate.addn_check_dump 0.82% : 0.000001s : 9: predicate.addn_zero_filter 0.71% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.51% : 0.000005s : 17: predicate.arithmetic_simplify 1.28% : 0.000002s : 9: predicate.cast_eliminate 0.71% : 0.000001s : 8: predicate.check_bprop_eliminate 0.69% : 0.000001s : 8: predicate.compare_switch_simplify 0.23% : 0.000000s : 4: predicate.const_output_eliminate 0.84% : 0.000002s : 8: predicate.depend_value_elim 0.98% : 0.000002s : 9: predicate.dict_get_item_const_eliminator 1.06% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 9: predicate.dict_set_item_eliminator 1.13% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 4: predicate.elim_not_effective 0.72% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.05% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.05% : 0.000002s : 13: predicate.environ_get_depend_swap 1.98% : 0.000004s : 21: predicate.environ_get_eliminate 1.02% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.90% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.78% : 0.000003s : 11: predicate.float_depend_g_call 0.68% : 0.000001s : 8: predicate.float_environ_get_switch 0.93% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 1.04% : 0.000002s : 8: predicate.get_grad_eliminate 0.25% : 0.000000s : 4: predicate.graph_param_transform 0.85% : 0.000002s : 8: predicate.incorporate_call 0.66% : 0.000001s : 8: predicate.incorporate_call_switch 6.26% : 0.000011s : 44: predicate.inline 1.02% : 0.000002s : 8: predicate.inline_without_move 0.36% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.12% : 0.000002s : 8: predicate.less_batch_normalization 1.88% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.22% : 0.000004s : 26: predicate.load_eliminater 1.15% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.34% : 0.000002s : 16: predicate.loop_unroll_before_grad 1.76% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.83% : 0.000002s : 8: predicate.merge_addn 0.67% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.67% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.88% : 0.000002s : 9: predicate.minmaximum_grad 1.26% : 0.000002s : 4: predicate.mutable_eliminate 0.51% : 0.000001s : 4: predicate.opt_reshape 0.59% : 0.000001s : 4: predicate.parallel_virtual_node 1.24% : 0.000002s : 11: predicate.partial_defer_inline 1.25% : 0.000002s : 13: predicate.partial_eliminate 0.88% : 0.000002s : 9: predicate.print_const_string_wrapper 0.83% : 0.000002s : 8: predicate.reduce_all_const_elim 1.23% : 0.000002s : 9: predicate.reduce_eliminate 2.18% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 8: predicate.remove_not_recompute_node 1.13% : 0.000002s : 17: predicate.replace_applicator 0.59% : 0.000001s : 8: predicate.replace_old_param 0.33% : 0.000001s : 4: predicate.reset_defer_inline 0.96% : 0.000002s : 9: predicate.reshape_eliminate 0.68% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 4: predicate.row_tensor_eliminate 0.96% : 0.000002s : 8: predicate.same_eliminate 0.54% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.97% : 0.000002s : 8: predicate.shard_identity_eliminate 0.92% : 0.000002s : 8: predicate.special_op_eliminate 1.02% : 0.000002s : 8: predicate.specialize_transform 1.01% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.91% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.13% : 0.000002s : 11: predicate.switch_defer_inline 1.69% : 0.000003s : 19: predicate.switch_layer_defer_inline 3.91% : 0.000007s : 39: predicate.switch_simplify 0.83% : 0.000001s : 9: predicate.tile_eliminate 0.95% : 0.000002s : 9: predicate.transpose_eliminate 1.59% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.59% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.01% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.61% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.66% : 0.000005s : 25: predicate.tuple_list_set_item_eliminator 1.51% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.00% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.89% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 4: predicate.value_based_eliminate 0.77% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.76% : 0.000001s : 8: predicate.virtual_output_eliminate 0.33% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.54% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000235 5 7.32% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.68% : 0.000218s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025166 192 0.01% : 0.000004s : 1: ForceFp32Comm 13.73% : 0.003454s : 1: add_attr 13.67% : 0.003441s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000060s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.24% : 0.000061s : 1: auto_monad 0.10% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.89% : 0.000476s : 1: bootstrap 0.12% : 0.000031s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000011s : 1: convert_after_rewriter 0.12% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.08% : 0.000020s : 1: event_method 0.02% : 0.000006s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.93% : 0.000486s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 2.53% : 0.000638s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 4.10% : 0.001032s : 78: opt.transform.opt_a 0.14% : 0.000034s : 1: opt.transform.opt_after_cconv 0.12% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.50% : 0.000126s : 28: opt.transform.opt_b 0.22% : 0.000055s : 2: opt.transform.opt_trans_graph 0.20% : 0.000049s : 4: opt.transform.symbol_engine_opt 10.66% : 0.002683s : 1: opt_a 0.48% : 0.000121s : 1: opt_after_cconv 2.05% : 0.000516s : 1: opt_after_jit_grad 0.97% : 0.000245s : 1: opt_b 19.77% : 0.004974s : 1: optimize 0.08% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000010s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.14% : 0.000035s : 1: pre_auto_parallel 0.10% : 0.000025s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000022s : 1: remove_dup_value 1.80% : 0.000453s : 1: renormalize.infer 1.14% : 0.000288s : 1: renormalize.specialize 0.07% : 0.000019s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000046s : 1: rewriter_after_opt_a 0.22% : 0.000056s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000097s : 1: symbol_engine_optimizer 0.34% : 0.000085s : 1: tuple_transform 20.18% : 0.005077s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:43.442.796 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:43.443.045 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0167506, [21] [bootstrap]: 0.00044737 [type_inference]: 0.00543401 [event_method]: 1.588e-05 [auto_monad]: 6.334e-05 [graph_reusing]: 5.17e-06 [inline]: 2.43e-06 [add_attr]: 0.00364989, [1] [add_attr_with_inline]: 0.00363863, [1] [Cycle 1]: 8.626e-05, [2] [tag_attr]: 1.674e-05 [meta_addattr_fg_expand]: 3.91001e-06 [parallel-infer-symbol]: 3.59002e-06 [pre_auto_parallel]: 3.338e-05 [insert-virtual-dataset]: 2.68003e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 2.02999e-06 [pipeline_split]: 1.81e-06 [optimize]: 0.00581034, [53] [py_interpret_to_execute]: 2.482e-05 [rewriter_before_opt_a]: 6.094e-05 [opt_a]: 0.00317002, [2] [Cycle 1]: 0.00218436, [45] [expand_dump_flag]: 2.63e-06 [switch_simplify]: 2.828e-05 [loop_unroll]: 1.564e-05 [a_1]: 0.00037289 [with_stream_mark]: 1.891e-05 [recompute_prepare]: 9.52999e-06 [updatestate_depend_eliminate]: 5.48002e-06 [updatestate_assign_eliminate]: 4.08001e-06 [updatestate_loads_eliminate]: 3.92002e-06 [parameter_eliminate]: 2.44001e-06 [a_2]: 0.00012839 [accelerated_algorithm]: 8.13001e-06 [shard]: 2.01998e-06 [meta_shard_fg_expand]: 2.06998e-06 [shard_inline]: 8.74003e-06 [merge_send_recv]: 1.051e-05 [auto_parallel]: 8.43999e-06 [parallel]: 1.917e-05 [flash_sp]: 1.075e-05 [merge_comm]: 5.97001e-06 [allreduce_fusion]: 4.28999e-06 [matmul_add_comm_reduction]: 1.091e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 1.145e-05 [virtual_dataset]: 7.51999e-06 [get_grad_eliminate_]: 7.48e-06 [virtual_output]: 7.9e-06 [merge_forward]: 4.70001e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 1.126e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.872e-05 [merge_recompute_call_nodes]: 1.84e-06 [before_grad]: 1.412e-05 [set_forward_comm_id_for_comm_node_pass]: 5.22e-06 [meta_fg_expand]: 3.60998e-06 [flash_sp_send_recv_attached]: 2.68e-06 [receive_attached]: 2.17999e-06 [after_resolve]: 1.116e-05 [a_after_grad]: 1.194e-05 [renormalize]: 0.00083407 [add_forward_monad_depend]: 6.01e-06 [auto_monad_grad]: 2.56e-06 [auto_monad_eliminator]: 1.904e-05 [cse]: 3.778e-05 [a_3]: 7.527e-05 [Cycle 2]: 0.00097013, [45] [expand_dump_flag]: 1.38002e-06 [switch_simplify]: 8.99e-06 [loop_unroll]: 7.38999e-06 [a_1]: 0.00015621 [with_stream_mark]: 1.656e-05 [recompute_prepare]: 8.52e-06 [updatestate_depend_eliminate]: 4.77998e-06 [updatestate_assign_eliminate]: 3.03998e-06 [updatestate_loads_eliminate]: 3.08e-06 [parameter_eliminate]: 1.37e-06 [a_2]: 0.00011522 [accelerated_algorithm]: 7.96001e-06 [shard]: 1.76e-06 [meta_shard_fg_expand]: 2.23998e-06 [shard_inline]: 7.69002e-06 [merge_send_recv]: 7.95998e-06 [auto_parallel]: 8.24998e-06 [parallel]: 6.34001e-06 [flash_sp]: 3.77002e-06 [merge_comm]: 4.46002e-06 [allreduce_fusion]: 4.24002e-06 [matmul_add_comm_reduction]: 8.42e-06 [allreduce_slice_to_reducescatter]: 5.69999e-07 [virtual_shard_identity]: 9.42001e-06 [virtual_dataset]: 7.23999e-06 [get_grad_eliminate_]: 6.48998e-06 [virtual_output]: 7.17002e-06 [merge_forward]: 4.3e-06 [cell_reuse_recompute_pass]: 1.54e-06 [offload_activation]: 9.62999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.773e-05 [merge_recompute_call_nodes]: 1.04998e-06 [before_grad]: 1.184e-05 [set_forward_comm_id_for_comm_node_pass]: 5.51998e-06 [meta_fg_expand]: 3.13e-06 [flash_sp_send_recv_attached]: 1.38002e-06 [receive_attached]: 1.17e-06 [after_resolve]: 1.242e-05 [a_after_grad]: 1.146e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.46e-06 [auto_monad_grad]: 1.25999e-06 [auto_monad_eliminator]: 1.288e-05 [cse]: 2.438e-05 [a_3]: 5.811e-05 [py_interpret_to_execute_after_opt_a]: 1.894e-05 [slice_cell_reuse_recomputed_activation]: 5.17999e-06 [rewriter_after_opt_a]: 5.179e-05 [convert_after_rewriter]: 1.039e-05 [order_py_execute_after_rewriter]: 9.71e-06 [mutable_eliminate]: 0.00068149 [opt_b]: 0.00031442, [1] [Cycle 1]: 0.000304, [7] [b_1]: 0.00019256 [b_2]: 9.15999e-06 [updatestate_depend_eliminate]: 8.55999e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 3.06001e-06 [renormalize]: 6.50005e-07 [cse]: 2.911e-05 [optimize_parallel_all_gather_comm]: 2.133e-05 [overlap_param_gather]: 5.04e-06 [cconv]: 3.475e-05 [loop_unroll]: 0.00050429 [opt_after_cconv]: 0.00014798, [1] [Cycle 1]: 0.00013849, [7] [c_1]: 3.573e-05 [parameter_eliminate]: 3.63e-06 [updatestate_depend_eliminate]: 7.26001e-06 [updatestate_assign_eliminate]: 3.3e-06 [updatestate_loads_eliminate]: 3.76001e-06 [cse]: 2.704e-05 [renormalize]: 3.29979e-07 [remove_dup_value]: 2.066e-05 [tuple_transform]: 0.00010085, [1] [Cycle 1]: 9.341e-05, [4] [d_1]: 5.159e-05 [none_parameter_eliminate]: 1.84998e-06 [renormalize]: 2.29978e-07 [switch_simplify]: 7.89002e-06 [partial_unused_args_eliminate]: 4.85001e-06 [add_recomputation]: 6.165e-05 [cse_after_recomputation]: 3.486e-05, [1] [Cycle 1]: 2.742e-05, [1] [cse]: 1.796e-05 [environ_conv]: 1.047e-05 [swap_dp_allreduce_reducescatter]: 8.69e-06 [bias_add_comm_swap]: 5.36002e-06 [label_micro_interleaved_index]: 7.51999e-06 [label_fine_grained_interleaved_index]: 4.94998e-06 [merge_cast_opt]: 4.38001e-06 [slice_recompute_activation]: 5.07e-06 [micro_interleaved_order_control]: 4.62e-06 [assign_add_opt]: 4.09002e-06 [ForceFp32Comm]: 3.5e-06 [remove_cast_before_assign_add]: 3.56001e-06 [full_micro_interleaved_order_control]: 5.19998e-06 [reorder_send_recv_between_fp_bp]: 5.40001e-06 [comm_op_add_attrs]: 3.8e-06 [add_comm_op_reuse_tag]: 3.67998e-06 [interleave_split_concat_branches]: 3.66999e-06 [interleave_parallel_branches]: 4.20999e-06 [overlap_opt_shard_in_pipeline]: 3.65e-06 [overlap_opt_shard_grad_in_pipeline]: 4.04997e-06 [control_data_broadcast_order]: 1.966e-05 [grouped_pairwise_exchange_alltoall]: 4.18001e-06 [offloading_packed_experts]: 6.79001e-06 [overlap_recompute_and_grad_model_parallel]: 8.08001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.51001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.91001e-06 [overlap_recompute_comm]: 5.36002e-06 [overlap_grad_ring_attention]: 6.56999e-06 [overlap_grad_flash_sp]: 2.708e-05 [begin_end_overlap_inline]: 2.85998e-06 [split_matmul_comm_elemetwise]: 4.63001e-06 [split_layernorm_comm]: 4.13001e-06 [handle_group_info]: 3.4e-06 [symbol_engine_optimizer]: 0.00011106, [1] [Cycle 1]: 0.00010383, [6] [build]: 4.18999e-06 [elim_shapecalc]: 1.309e-05 [elim_not_effective]: 1.627e-05 [opt_reshape]: 9.09998e-06 [fold_const_symbol]: 1.234e-05 [renormalize]: 3.09985e-07 [detach_backward]: 4.26001e-06 [pipeline_parallel_scheduler]: 1.99e-06 [auto_monad_reorder]: 2.523e-05 [get_jit_bprop_graph]: 2.04e-06 [rewriter_after_jit_bprop_graph]: 6.17999e-06 [opt_after_jit_grad]: 0.0005697 [validate]: 4.945e-05 Sums bootstrap : 0.000447s : 3.98% type_inference : 0.005434s : 48.31% event_method : 0.000016s : 0.14% auto_monad : 0.000063s : 0.56% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000017s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.03% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000033s : 0.30% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000025s : 0.22% optimize.rewriter_before_opt_a : 0.000061s : 0.54% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000037s : 0.33% optimize.opt_a.loop_unroll : 0.000023s : 0.20% optimize.opt_a.a_1 : 0.000529s : 4.70% optimize.opt_a.with_stream_mark : 0.000035s : 0.32% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000244s : 2.17% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.14% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.15% optimize.opt_a.merge_send_recv : 0.000018s : 0.16% optimize.opt_a.auto_parallel : 0.000017s : 0.15% optimize.opt_a.parallel : 0.000026s : 0.23% optimize.opt_a.flash_sp : 0.000015s : 0.13% optimize.opt_a.merge_comm : 0.000010s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.19% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000021s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000026s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.10% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.21% optimize.opt_a.a_after_grad : 0.000023s : 0.21% optimize.opt_a.renormalize : 0.000834s : 7.42% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.28% optimize.opt_a.cse : 0.000062s : 0.55% optimize.opt_a.a_3 : 0.000133s : 1.19% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.17% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000052s : 0.46% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000010s : 0.09% optimize.mutable_eliminate : 0.000681s : 6.06% optimize.opt_b.b_1 : 0.000193s : 1.71% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000029s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000035s : 0.31% optimize.loop_unroll : 0.000504s : 4.48% optimize.opt_after_cconv.c_1 : 0.000036s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000027s : 0.24% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000021s : 0.18% optimize.tuple_transform.d_1 : 0.000052s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000062s : 0.55% optimize.cse_after_recomputation.cse : 0.000018s : 0.16% optimize.environ_conv : 0.000010s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000020s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000027s : 0.24% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000025s : 0.22% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000570s : 5.06% validate : 0.000049s : 0.44% Time group info: ------[substitution.] 0.000167 28 15.29% : 0.000026s : 2: substitution.cast_eliminate 1.40% : 0.000002s : 3: substitution.elim_not_effective 1.15% : 0.000002s : 3: substitution.fold_const_symbol 3.82% : 0.000006s : 4: substitution.graph_param_transform 69.32% : 0.000116s : 2: substitution.inline 3.19% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.70% : 0.000006s : 6: substitution.remove_not_recompute_node 2.14% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.005374 2 90.51% : 0.004865s : 1: type_inference.infer 9.49% : 0.000510s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000114 2 100.00% : 0.000114s : 2: match.inline ------[predicate.] 0.000176 980 0.78% : 0.000001s : 9: predicate.accumulaten_eliminater 1.02% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.75% : 0.000001s : 8: predicate.addn_check_dump 0.83% : 0.000001s : 9: predicate.addn_zero_filter 0.68% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.30% : 0.000004s : 17: predicate.arithmetic_simplify 0.93% : 0.000002s : 9: predicate.cast_eliminate 0.78% : 0.000001s : 8: predicate.check_bprop_eliminate 0.65% : 0.000001s : 8: predicate.compare_switch_simplify 0.23% : 0.000000s : 4: predicate.const_output_eliminate 0.81% : 0.000001s : 8: predicate.depend_value_elim 0.79% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.96% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.74% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.42% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 4: predicate.elim_not_effective 0.78% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.02% : 0.000002s : 13: predicate.environ_add_const_eliminate 0.99% : 0.000002s : 13: predicate.environ_get_add_eliminate 0.99% : 0.000002s : 13: predicate.environ_get_depend_swap 1.87% : 0.000003s : 21: predicate.environ_get_eliminate 1.04% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.90% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.77% : 0.000003s : 11: predicate.float_depend_g_call 0.65% : 0.000001s : 8: predicate.float_environ_get_switch 1.06% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 4: predicate.fold_const_symbol 0.85% : 0.000001s : 8: predicate.get_grad_eliminate 0.33% : 0.000001s : 4: predicate.graph_param_transform 0.81% : 0.000001s : 8: predicate.incorporate_call 0.65% : 0.000001s : 8: predicate.incorporate_call_switch 6.78% : 0.000012s : 44: predicate.inline 1.26% : 0.000002s : 8: predicate.inline_without_move 0.40% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.98% : 0.000002s : 8: predicate.less_batch_normalization 1.69% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.05% : 0.000004s : 26: predicate.load_eliminater 1.44% : 0.000003s : 4: predicate.loop_unroll_after_grad 1.47% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.91% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.72% : 0.000001s : 8: predicate.merge_addn 0.77% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.89% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.65% : 0.000001s : 9: predicate.minmaximum_grad 1.67% : 0.000003s : 4: predicate.mutable_eliminate 0.49% : 0.000001s : 4: predicate.opt_reshape 0.51% : 0.000001s : 4: predicate.parallel_virtual_node 1.19% : 0.000002s : 11: predicate.partial_defer_inline 1.20% : 0.000002s : 13: predicate.partial_eliminate 0.90% : 0.000002s : 9: predicate.print_const_string_wrapper 0.73% : 0.000001s : 8: predicate.reduce_all_const_elim 1.05% : 0.000002s : 9: predicate.reduce_eliminate 2.21% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 8: predicate.remove_not_recompute_node 1.09% : 0.000002s : 17: predicate.replace_applicator 0.62% : 0.000001s : 8: predicate.replace_old_param 0.33% : 0.000001s : 4: predicate.reset_defer_inline 0.82% : 0.000001s : 9: predicate.reshape_eliminate 0.77% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 4: predicate.row_tensor_eliminate 1.07% : 0.000002s : 8: predicate.same_eliminate 0.51% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.04% : 0.000002s : 8: predicate.shard_identity_eliminate 1.01% : 0.000002s : 8: predicate.special_op_eliminate 1.07% : 0.000002s : 8: predicate.specialize_transform 1.00% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.97% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.91% : 0.000002s : 11: predicate.switch_defer_inline 1.64% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.15% : 0.000007s : 39: predicate.switch_simplify 0.74% : 0.000001s : 9: predicate.tile_eliminate 0.83% : 0.000001s : 9: predicate.transpose_eliminate 1.60% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.00% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.38% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.24% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.61% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.13% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.95% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.49% : 0.000001s : 4: predicate.value_based_eliminate 0.92% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.91% : 0.000002s : 8: predicate.virtual_output_eliminate 0.33% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.58% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000258 5 8.07% : 0.000021s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.93% : 0.000237s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028236 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.96% : 0.003661s : 1: add_attr 12.90% : 0.003643s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.23% : 0.000066s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.26% : 0.000073s : 1: auto_monad 0.11% : 0.000032s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.76% : 0.000496s : 1: bootstrap 0.14% : 0.000038s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000023s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000038s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000022s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.10% : 0.000027s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000011s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000007s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.81% : 0.000511s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.44% : 0.000689s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.07% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000023s : 1: opt.transform.mutable_eliminate 3.56% : 0.001006s : 78: opt.transform.opt_a 0.12% : 0.000034s : 1: opt.transform.opt_after_cconv 0.11% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.45% : 0.000127s : 28: opt.transform.opt_b 0.20% : 0.000056s : 2: opt.transform.opt_trans_graph 0.16% : 0.000046s : 4: opt.transform.symbol_engine_opt 11.24% : 0.003173s : 1: opt_a 0.54% : 0.000152s : 1: opt_after_cconv 2.06% : 0.000582s : 1: opt_after_jit_grad 1.13% : 0.000318s : 1: opt_b 21.84% : 0.006167s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.11% : 0.000031s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000041s : 1: pre_auto_parallel 0.10% : 0.000028s : 1: py_interpret_to_execute 0.08% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000024s : 1: remove_dup_value 1.83% : 0.000517s : 1: renormalize.infer 1.09% : 0.000308s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000056s : 1: rewriter_after_opt_a 0.23% : 0.000065s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000114s : 1: symbol_engine_optimizer 0.37% : 0.000104s : 1: tuple_transform 19.39% : 0.005475s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:43.645.566 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0136387, [21] [bootstrap]: 0.00043977 [type_inference]: 0.00477649 [event_method]: 1.288e-05 [auto_monad]: 5.54e-05 [graph_reusing]: 5.55001e-06 [inline]: 2.51998e-06 [add_attr]: 0.00321, [1] [add_attr_with_inline]: 0.00320186, [1] [Cycle 1]: 5.335e-05, [2] [tag_attr]: 1.536e-05 [meta_addattr_fg_expand]: 3.8e-06 [parallel-infer-symbol]: 3.87002e-06 [pre_auto_parallel]: 2.64e-05 [insert-virtual-dataset]: 2.61e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 2.21998e-06 [pipeline_split]: 1.57999e-06 [optimize]: 0.00441028, [53] [py_interpret_to_execute]: 1.828e-05 [rewriter_before_opt_a]: 4.854e-05 [opt_a]: 0.00237345, [2] [Cycle 1]: 0.00165139, [45] [expand_dump_flag]: 2.71999e-06 [switch_simplify]: 2.766e-05 [loop_unroll]: 1.592e-05 [a_1]: 0.00034118 [with_stream_mark]: 1.536e-05 [recompute_prepare]: 9.32001e-06 [updatestate_depend_eliminate]: 4.35999e-06 [updatestate_assign_eliminate]: 4.03001e-06 [updatestate_loads_eliminate]: 3.93001e-06 [parameter_eliminate]: 1.79998e-06 [a_2]: 9.527e-05 [accelerated_algorithm]: 8.17e-06 [shard]: 1.98002e-06 [meta_shard_fg_expand]: 2.04e-06 [shard_inline]: 7.82e-06 [merge_send_recv]: 9.27999e-06 [auto_parallel]: 7.29001e-06 [parallel]: 1.718e-05 [flash_sp]: 8.22e-06 [merge_comm]: 4.69002e-06 [allreduce_fusion]: 4.10998e-06 [matmul_add_comm_reduction]: 1.018e-05 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 9.02999e-06 [virtual_dataset]: 7.02002e-06 [get_grad_eliminate_]: 7.38999e-06 [virtual_output]: 7.05e-06 [merge_forward]: 4.22998e-06 [cell_reuse_recompute_pass]: 1.26002e-06 [offload_activation]: 1.058e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.472e-05 [merge_recompute_call_nodes]: 1.55999e-06 [before_grad]: 1.24e-05 [set_forward_comm_id_for_comm_node_pass]: 4.43999e-06 [meta_fg_expand]: 3.03998e-06 [flash_sp_send_recv_attached]: 2.54999e-06 [receive_attached]: 1.94999e-06 [after_resolve]: 1.133e-05 [a_after_grad]: 1.072e-05 [renormalize]: 0.00058623 [add_forward_monad_depend]: 4.51002e-06 [auto_monad_grad]: 2.22001e-06 [auto_monad_eliminator]: 1.611e-05 [cse]: 3.642e-05 [a_3]: 5.547e-05 [Cycle 2]: 0.00071274, [45] [expand_dump_flag]: 9.80013e-07 [switch_simplify]: 8.3e-06 [loop_unroll]: 7.35e-06 [a_1]: 0.00015043 [with_stream_mark]: 1.216e-05 [recompute_prepare]: 7.71001e-06 [updatestate_depend_eliminate]: 4.12e-06 [updatestate_assign_eliminate]: 2.83998e-06 [updatestate_loads_eliminate]: 3.03e-06 [parameter_eliminate]: 1.06002e-06 [a_2]: 8.694e-05 [accelerated_algorithm]: 7.03e-06 [shard]: 9.89996e-07 [meta_shard_fg_expand]: 1.44998e-06 [shard_inline]: 7.28e-06 [merge_send_recv]: 5.35999e-06 [auto_parallel]: 6.79001e-06 [parallel]: 4.67e-06 [flash_sp]: 3.20002e-06 [merge_comm]: 4.07e-06 [allreduce_fusion]: 3.75e-06 [matmul_add_comm_reduction]: 6.49001e-06 [allreduce_slice_to_reducescatter]: 3.39991e-07 [virtual_shard_identity]: 7.6e-06 [virtual_dataset]: 6.83e-06 [get_grad_eliminate_]: 6.66999e-06 [virtual_output]: 6.46e-06 [merge_forward]: 3.27997e-06 [cell_reuse_recompute_pass]: 1.39998e-06 [offload_activation]: 7.29001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.358e-05 [merge_recompute_call_nodes]: 8.89995e-07 [before_grad]: 1.107e-05 [set_forward_comm_id_for_comm_node_pass]: 4.55001e-06 [meta_fg_expand]: 2.49001e-06 [flash_sp_send_recv_attached]: 7.7e-07 [receive_attached]: 1.02e-06 [after_resolve]: 9.61e-06 [a_after_grad]: 1.006e-05 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 1.24e-06 [auto_monad_grad]: 8.49977e-07 [auto_monad_eliminator]: 8.26002e-06 [cse]: 1.842e-05 [a_3]: 4.389e-05 [py_interpret_to_execute_after_opt_a]: 8.91002e-06 [slice_cell_reuse_recomputed_activation]: 2.53e-06 [rewriter_after_opt_a]: 3.93e-05 [convert_after_rewriter]: 7.95e-06 [order_py_execute_after_rewriter]: 6.24999e-06 [mutable_eliminate]: 0.00048578 [opt_b]: 0.00024863, [1] [Cycle 1]: 0.0002424, [7] [b_1]: 0.00014676 [b_2]: 8.75999e-06 [updatestate_depend_eliminate]: 5.79e-06 [updatestate_assign_eliminate]: 2.96999e-06 [updatestate_loads_eliminate]: 2.89999e-06 [renormalize]: 3.7998e-07 [cse]: 2.431e-05 [optimize_parallel_all_gather_comm]: 1.766e-05 [overlap_param_gather]: 2.06e-06 [cconv]: 2.532e-05 [loop_unroll]: 0.00043721 [opt_after_cconv]: 0.00011281, [1] [Cycle 1]: 0.00010707, [7] [c_1]: 3.458e-05 [parameter_eliminate]: 2.54001e-06 [updatestate_depend_eliminate]: 5.74e-06 [updatestate_assign_eliminate]: 3.06999e-06 [updatestate_loads_eliminate]: 3.21999e-06 [cse]: 2.382e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.627e-05 [tuple_transform]: 7.976e-05, [1] [Cycle 1]: 7.54e-05, [4] [d_1]: 4.725e-05 [none_parameter_eliminate]: 1.55999e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 7.86001e-06 [partial_unused_args_eliminate]: 1.59e-06 [add_recomputation]: 4.935e-05 [cse_after_recomputation]: 2.535e-05, [1] [Cycle 1]: 2.072e-05, [1] [cse]: 1.532e-05 [environ_conv]: 5.89e-06 [swap_dp_allreduce_reducescatter]: 5.79999e-06 [bias_add_comm_swap]: 3.08e-06 [label_micro_interleaved_index]: 4.15e-06 [label_fine_grained_interleaved_index]: 2.69999e-06 [merge_cast_opt]: 1.38002e-06 [slice_recompute_activation]: 2.46e-06 [micro_interleaved_order_control]: 1.99999e-06 [assign_add_opt]: 1.32e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 1.14e-06 [full_micro_interleaved_order_control]: 1.99e-06 [reorder_send_recv_between_fp_bp]: 2.99001e-06 [comm_op_add_attrs]: 9.70002e-07 [add_comm_op_reuse_tag]: 9.60019e-07 [interleave_split_concat_branches]: 1.30999e-06 [interleave_parallel_branches]: 1.10999e-06 [overlap_opt_shard_in_pipeline]: 1.12e-06 [overlap_opt_shard_grad_in_pipeline]: 1.87001e-06 [control_data_broadcast_order]: 1.409e-05 [grouped_pairwise_exchange_alltoall]: 1.52001e-06 [offloading_packed_experts]: 4.63999e-06 [overlap_recompute_and_grad_model_parallel]: 5.07e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.09998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.72999e-06 [overlap_recompute_comm]: 2.16e-06 [overlap_grad_ring_attention]: 4.42998e-06 [overlap_grad_flash_sp]: 2.067e-05 [begin_end_overlap_inline]: 4.59986e-07 [split_matmul_comm_elemetwise]: 2.41998e-06 [split_layernorm_comm]: 1.59e-06 [handle_group_info]: 1.07998e-06 [symbol_engine_optimizer]: 8.119e-05, [1] [Cycle 1]: 7.664e-05, [6] [build]: 3.16001e-06 [elim_shapecalc]: 1.05e-05 [elim_not_effective]: 1.47e-05 [opt_reshape]: 8.40001e-06 [fold_const_symbol]: 1.2e-05 [renormalize]: 2.50002e-07 [detach_backward]: 1.94e-06 [pipeline_parallel_scheduler]: 1.92001e-06 [auto_monad_reorder]: 1.935e-05 [get_jit_bprop_graph]: 1.02e-06 [rewriter_after_jit_bprop_graph]: 3.58e-06 [opt_after_jit_grad]: 0.00047315 [validate]: 4.21e-05 Sums bootstrap : 0.000440s : 4.65% type_inference : 0.004776s : 50.46% event_method : 0.000013s : 0.14% auto_monad : 0.000055s : 0.59% graph_reusing : 0.000006s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000026s : 0.28% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000018s : 0.19% optimize.rewriter_before_opt_a : 0.000049s : 0.51% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000036s : 0.38% optimize.opt_a.loop_unroll : 0.000023s : 0.25% optimize.opt_a.a_1 : 0.000492s : 5.19% optimize.opt_a.with_stream_mark : 0.000028s : 0.29% optimize.opt_a.recompute_prepare : 0.000017s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000182s : 1.92% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.16% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.16% optimize.opt_a.merge_send_recv : 0.000015s : 0.15% optimize.opt_a.auto_parallel : 0.000014s : 0.15% optimize.opt_a.parallel : 0.000022s : 0.23% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.18% optimize.opt_a.virtual_dataset : 0.000014s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.15% optimize.opt_a.virtual_output : 0.000014s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.03% optimize.opt_a.before_grad : 0.000023s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000021s : 0.22% optimize.opt_a.a_after_grad : 0.000021s : 0.22% optimize.opt_a.renormalize : 0.000586s : 6.19% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.26% optimize.opt_a.cse : 0.000055s : 0.58% optimize.opt_a.a_3 : 0.000099s : 1.05% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.03% optimize.rewriter_after_opt_a : 0.000039s : 0.42% optimize.convert_after_rewriter : 0.000008s : 0.08% optimize.order_py_execute_after_rewriter : 0.000006s : 0.07% optimize.mutable_eliminate : 0.000486s : 5.13% optimize.opt_b.b_1 : 0.000147s : 1.55% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.19% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000025s : 0.27% optimize.loop_unroll : 0.000437s : 4.62% optimize.opt_after_cconv.c_1 : 0.000035s : 0.37% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.25% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.17% optimize.tuple_transform.d_1 : 0.000047s : 0.50% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000049s : 0.52% optimize.cse_after_recomputation.cse : 0.000015s : 0.16% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.03% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.02% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.05% optimize.overlap_grad_flash_sp : 0.000021s : 0.22% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.03% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.16% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.09% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000019s : 0.20% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000473s : 5.00% validate : 0.000042s : 0.44% Time group info: ------[substitution.] 0.000137 28 13.07% : 0.000018s : 2: substitution.cast_eliminate 1.67% : 0.000002s : 3: substitution.elim_not_effective 1.19% : 0.000002s : 3: substitution.fold_const_symbol 3.84% : 0.000005s : 4: substitution.graph_param_transform 71.10% : 0.000098s : 2: substitution.inline 3.00% : 0.000004s : 6: substitution.j_node_and_user_rematch 4.06% : 0.000006s : 6: substitution.remove_not_recompute_node 2.08% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004729 2 90.70% : 0.004289s : 1: type_inference.infer 9.30% : 0.000440s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000096 2 100.00% : 0.000096s : 2: match.inline ------[predicate.] 0.000166 980 0.86% : 0.000001s : 9: predicate.accumulaten_eliminater 1.03% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.70% : 0.000001s : 8: predicate.addn_check_dump 0.95% : 0.000002s : 9: predicate.addn_zero_filter 0.71% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.37% : 0.000004s : 17: predicate.arithmetic_simplify 0.97% : 0.000002s : 9: predicate.cast_eliminate 0.74% : 0.000001s : 8: predicate.check_bprop_eliminate 0.68% : 0.000001s : 8: predicate.compare_switch_simplify 0.28% : 0.000000s : 4: predicate.const_output_eliminate 0.74% : 0.000001s : 8: predicate.depend_value_elim 0.80% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 1.00% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.78% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.17% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 4: predicate.elim_not_effective 0.47% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.06% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.11% : 0.000002s : 13: predicate.environ_get_depend_swap 1.79% : 0.000003s : 21: predicate.environ_get_eliminate 1.06% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.96% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.81% : 0.000003s : 11: predicate.float_depend_g_call 0.70% : 0.000001s : 8: predicate.float_environ_get_switch 1.04% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.25% : 0.000000s : 4: predicate.fold_const_symbol 0.84% : 0.000001s : 8: predicate.get_grad_eliminate 0.37% : 0.000001s : 4: predicate.graph_param_transform 0.85% : 0.000001s : 8: predicate.incorporate_call 0.68% : 0.000001s : 8: predicate.incorporate_call_switch 6.22% : 0.000010s : 44: predicate.inline 1.10% : 0.000002s : 8: predicate.inline_without_move 0.39% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.00% : 0.000002s : 8: predicate.less_batch_normalization 1.72% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.26% : 0.000004s : 26: predicate.load_eliminater 1.20% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.55% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.81% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.76% : 0.000001s : 8: predicate.merge_addn 0.72% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.77% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.73% : 0.000001s : 9: predicate.minmaximum_grad 1.20% : 0.000002s : 4: predicate.mutable_eliminate 0.47% : 0.000001s : 4: predicate.opt_reshape 0.43% : 0.000001s : 4: predicate.parallel_virtual_node 1.24% : 0.000002s : 11: predicate.partial_defer_inline 1.29% : 0.000002s : 13: predicate.partial_eliminate 0.82% : 0.000001s : 9: predicate.print_const_string_wrapper 0.72% : 0.000001s : 8: predicate.reduce_all_const_elim 1.05% : 0.000002s : 9: predicate.reduce_eliminate 2.20% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 8: predicate.remove_not_recompute_node 1.21% : 0.000002s : 17: predicate.replace_applicator 0.63% : 0.000001s : 8: predicate.replace_old_param 0.33% : 0.000001s : 4: predicate.reset_defer_inline 0.87% : 0.000001s : 9: predicate.reshape_eliminate 0.83% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 4: predicate.row_tensor_eliminate 0.97% : 0.000002s : 8: predicate.same_eliminate 0.53% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.91% : 0.000002s : 8: predicate.shard_identity_eliminate 0.98% : 0.000002s : 8: predicate.special_op_eliminate 0.97% : 0.000002s : 8: predicate.specialize_transform 1.21% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.94% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.47% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.98% : 0.000002s : 11: predicate.switch_defer_inline 1.73% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.32% : 0.000007s : 39: predicate.switch_simplify 0.80% : 0.000001s : 9: predicate.tile_eliminate 0.90% : 0.000001s : 9: predicate.transpose_eliminate 1.61% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.54% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.60% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.59% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.08% : 0.000003s : 26: predicate.updatestate_pure_node_eliminater 3.07% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 4: predicate.value_based_eliminate 0.76% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.85% : 0.000001s : 8: predicate.virtual_output_eliminate 0.33% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.56% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000216 5 7.95% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.05% : 0.000199s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022969 192 0.02% : 0.000004s : 1: ForceFp32Comm 13.99% : 0.003214s : 1: add_attr 13.95% : 0.003205s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000053s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000060s : 1: auto_monad 0.10% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.03% : 0.000006s : 1: bias_add_comm_swap 2.04% : 0.000468s : 1: bootstrap 0.12% : 0.000029s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000017s : 1: control_data_broadcast_order 0.05% : 0.000011s : 1: convert_after_rewriter 0.12% : 0.000028s : 1: cse_after_recomputation 0.02% : 0.000006s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.08% : 0.000018s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.94% : 0.000445s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.15% : 0.000494s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 4.09% : 0.000940s : 78: opt.transform.opt_a 0.14% : 0.000033s : 1: opt.transform.opt_after_cconv 0.12% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.54% : 0.000123s : 28: opt.transform.opt_b 0.23% : 0.000053s : 2: opt.transform.opt_trans_graph 0.18% : 0.000042s : 4: opt.transform.symbol_engine_opt 10.35% : 0.002376s : 1: opt_a 0.51% : 0.000116s : 1: opt_after_cconv 2.10% : 0.000482s : 1: opt_after_jit_grad 1.10% : 0.000252s : 1: opt_b 19.22% : 0.004415s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000024s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000030s : 1: pre_auto_parallel 0.10% : 0.000022s : 1: py_interpret_to_execute 0.05% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000020s : 1: remove_dup_value 1.47% : 0.000339s : 1: renormalize.infer 1.05% : 0.000240s : 1: renormalize.specialize 0.03% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000043s : 1: rewriter_after_opt_a 0.23% : 0.000053s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.37% : 0.000084s : 1: symbol_engine_optimizer 0.36% : 0.000083s : 1: tuple_transform 20.87% : 0.004794s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:43.839.326 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:43.839.599 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0142226, [21] [bootstrap]: 0.00043018 [type_inference]: 0.00462988 [event_method]: 1.281e-05 [auto_monad]: 5.857e-05 [graph_reusing]: 5.32999e-06 [inline]: 2.15002e-06 [add_attr]: 0.0031728, [1] [add_attr_with_inline]: 0.00316383, [1] [Cycle 1]: 7.033e-05, [2] [tag_attr]: 1.49e-05 [meta_addattr_fg_expand]: 3.58e-06 [parallel-infer-symbol]: 3.75e-06 [pre_auto_parallel]: 2.757e-05 [insert-virtual-dataset]: 2.51998e-06 [parallel-infer-symbol-second]: 7.40023e-07 [dataset_repeat_opt]: 1.82001e-06 [pipeline_split]: 1.55999e-06 [optimize]: 0.00476683, [53] [py_interpret_to_execute]: 2.285e-05 [rewriter_before_opt_a]: 5.044e-05 [opt_a]: 0.00252053, [2] [Cycle 1]: 0.00170982, [45] [expand_dump_flag]: 3.22002e-06 [switch_simplify]: 2.554e-05 [loop_unroll]: 1.376e-05 [a_1]: 0.00028974 [with_stream_mark]: 1.861e-05 [recompute_prepare]: 8.43001e-06 [updatestate_depend_eliminate]: 4.33999e-06 [updatestate_assign_eliminate]: 3.11999e-06 [updatestate_loads_eliminate]: 2.86999e-06 [parameter_eliminate]: 2.03002e-06 [a_2]: 0.00010719 [accelerated_algorithm]: 7.55e-06 [shard]: 2.51e-06 [meta_shard_fg_expand]: 1.66e-06 [shard_inline]: 6.81999e-06 [merge_send_recv]: 8.52e-06 [auto_parallel]: 7.1e-06 [parallel]: 1.922e-05 [flash_sp]: 7.93001e-06 [merge_comm]: 4.40999e-06 [allreduce_fusion]: 3.71001e-06 [matmul_add_comm_reduction]: 8.90001e-06 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 8.52998e-06 [virtual_dataset]: 6.24001e-06 [get_grad_eliminate_]: 6.26e-06 [virtual_output]: 6.39999e-06 [merge_forward]: 3.68999e-06 [cell_reuse_recompute_pass]: 1.33002e-06 [offload_activation]: 9.67999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.628e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.089e-05 [set_forward_comm_id_for_comm_node_pass]: 4.33999e-06 [meta_fg_expand]: 2.63998e-06 [flash_sp_send_recv_attached]: 3.15002e-06 [receive_attached]: 2.50002e-06 [after_resolve]: 9.74e-06 [a_after_grad]: 9.67001e-06 [renormalize]: 0.00054222 [add_forward_monad_depend]: 5.10999e-06 [auto_monad_grad]: 2.33002e-06 [auto_monad_eliminator]: 1.588e-05 [cse]: 3.004e-05 [a_3]: 6.031e-05 [Cycle 2]: 0.00079665, [45] [expand_dump_flag]: 9.09989e-07 [switch_simplify]: 7.31999e-06 [loop_unroll]: 5.59998e-06 [a_1]: 0.00010516 [with_stream_mark]: 1.178e-05 [recompute_prepare]: 6.17001e-06 [updatestate_depend_eliminate]: 3.31001e-06 [updatestate_assign_eliminate]: 2.55002e-06 [updatestate_loads_eliminate]: 2.64999e-06 [parameter_eliminate]: 1.26002e-06 [a_2]: 9.614e-05 [accelerated_algorithm]: 6.06998e-06 [shard]: 1.55999e-06 [meta_shard_fg_expand]: 1.59e-06 [shard_inline]: 5.98002e-06 [merge_send_recv]: 5.69e-06 [auto_parallel]: 6.10002e-06 [parallel]: 5.56e-06 [flash_sp]: 3.53999e-06 [merge_comm]: 3.45e-06 [allreduce_fusion]: 3.26999e-06 [matmul_add_comm_reduction]: 6.63e-06 [allreduce_slice_to_reducescatter]: 5.69999e-07 [virtual_shard_identity]: 6.36e-06 [virtual_dataset]: 5.75001e-06 [get_grad_eliminate_]: 5.43002e-06 [virtual_output]: 5.39998e-06 [merge_forward]: 2.86999e-06 [cell_reuse_recompute_pass]: 1.49998e-06 [offload_activation]: 7.28999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.499e-05 [merge_recompute_call_nodes]: 1.26997e-06 [before_grad]: 9.92001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.65e-06 [meta_fg_expand]: 2.24001e-06 [flash_sp_send_recv_attached]: 1.00001e-06 [receive_attached]: 9.99979e-07 [after_resolve]: 9.40001e-06 [a_after_grad]: 8.07e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.47001e-06 [auto_monad_grad]: 8.10018e-07 [auto_monad_eliminator]: 7.03e-06 [cse]: 1.37e-05 [a_3]: 4.72e-05 [py_interpret_to_execute_after_opt_a]: 1.465e-05 [slice_cell_reuse_recomputed_activation]: 4.87998e-06 [rewriter_after_opt_a]: 4.041e-05 [convert_after_rewriter]: 9.68002e-06 [order_py_execute_after_rewriter]: 9.36e-06 [mutable_eliminate]: 0.0005363 [opt_b]: 0.00026884, [1] [Cycle 1]: 0.00025953, [7] [b_1]: 0.00016214 [b_2]: 7.83001e-06 [updatestate_depend_eliminate]: 5.72999e-06 [updatestate_assign_eliminate]: 2.81999e-06 [updatestate_loads_eliminate]: 2.33002e-06 [renormalize]: 5.59987e-07 [cse]: 2.157e-05 [optimize_parallel_all_gather_comm]: 1.895e-05 [overlap_param_gather]: 4.95999e-06 [cconv]: 2.955e-05 [loop_unroll]: 0.00043875 [opt_after_cconv]: 0.00012368, [1] [Cycle 1]: 0.00011522, [7] [c_1]: 2.796e-05 [parameter_eliminate]: 3.16001e-06 [updatestate_depend_eliminate]: 5.22999e-06 [updatestate_assign_eliminate]: 2.50002e-06 [updatestate_loads_eliminate]: 2.71999e-06 [cse]: 1.824e-05 [renormalize]: 3.39991e-07 [remove_dup_value]: 1.865e-05 [tuple_transform]: 8.64e-05, [1] [Cycle 1]: 7.903e-05, [4] [d_1]: 4.053e-05 [none_parameter_eliminate]: 1.69998e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 6.35002e-06 [partial_unused_args_eliminate]: 4.38999e-06 [add_recomputation]: 4.894e-05 [cse_after_recomputation]: 2.793e-05, [1] [Cycle 1]: 2.109e-05, [1] [cse]: 1.235e-05 [environ_conv]: 8.13999e-06 [swap_dp_allreduce_reducescatter]: 8.07e-06 [bias_add_comm_swap]: 4.91002e-06 [label_micro_interleaved_index]: 6.79999e-06 [label_fine_grained_interleaved_index]: 5.20999e-06 [merge_cast_opt]: 3.85e-06 [slice_recompute_activation]: 4.70001e-06 [micro_interleaved_order_control]: 4.50999e-06 [assign_add_opt]: 3.73001e-06 [ForceFp32Comm]: 3.16999e-06 [remove_cast_before_assign_add]: 3.78999e-06 [full_micro_interleaved_order_control]: 4.62e-06 [reorder_send_recv_between_fp_bp]: 5.00001e-06 [comm_op_add_attrs]: 3.4e-06 [add_comm_op_reuse_tag]: 3.25e-06 [interleave_split_concat_branches]: 3.41999e-06 [interleave_parallel_branches]: 3.81001e-06 [overlap_opt_shard_in_pipeline]: 3.53e-06 [overlap_opt_shard_grad_in_pipeline]: 4.58001e-06 [control_data_broadcast_order]: 1.555e-05 [grouped_pairwise_exchange_alltoall]: 3.99002e-06 [offloading_packed_experts]: 6.48003e-06 [overlap_recompute_and_grad_model_parallel]: 7.6e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.65e-06 [overlap_recompute_allgather_and_fa_grad]: 3.99997e-06 [overlap_recompute_comm]: 4.82998e-06 [overlap_grad_ring_attention]: 6.73e-06 [overlap_grad_flash_sp]: 2.266e-05 [begin_end_overlap_inline]: 2.98e-06 [split_matmul_comm_elemetwise]: 4.74e-06 [split_layernorm_comm]: 4.13999e-06 [handle_group_info]: 3.41001e-06 [symbol_engine_optimizer]: 9.647e-05, [1] [Cycle 1]: 8.902e-05, [6] [build]: 2.66e-06 [elim_shapecalc]: 1.057e-05 [elim_not_effective]: 1.33e-05 [opt_reshape]: 6.91999e-06 [fold_const_symbol]: 1.011e-05 [renormalize]: 3.10014e-07 [detach_backward]: 3.48999e-06 [pipeline_parallel_scheduler]: 1.81e-06 [auto_monad_reorder]: 1.722e-05 [get_jit_bprop_graph]: 2.61e-06 [rewriter_after_jit_bprop_graph]: 5.47001e-06 [opt_after_jit_grad]: 0.00047011 [validate]: 3.764e-05 Sums bootstrap : 0.000430s : 4.63% type_inference : 0.004630s : 49.79% event_method : 0.000013s : 0.14% auto_monad : 0.000059s : 0.63% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000028s : 0.30% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000023s : 0.25% optimize.rewriter_before_opt_a : 0.000050s : 0.54% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.35% optimize.opt_a.loop_unroll : 0.000019s : 0.21% optimize.opt_a.a_1 : 0.000395s : 4.25% optimize.opt_a.with_stream_mark : 0.000030s : 0.33% optimize.opt_a.recompute_prepare : 0.000015s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.04% optimize.opt_a.a_2 : 0.000203s : 2.19% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.15% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.14% optimize.opt_a.merge_send_recv : 0.000014s : 0.15% optimize.opt_a.auto_parallel : 0.000013s : 0.14% optimize.opt_a.parallel : 0.000025s : 0.27% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.13% optimize.opt_a.virtual_output : 0.000012s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.34% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000021s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.04% optimize.opt_a.after_resolve : 0.000019s : 0.21% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000542s : 5.83% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.25% optimize.opt_a.cse : 0.000044s : 0.47% optimize.opt_a.a_3 : 0.000108s : 1.16% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.16% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000040s : 0.43% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.10% optimize.mutable_eliminate : 0.000536s : 5.77% optimize.opt_b.b_1 : 0.000162s : 1.74% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000022s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000030s : 0.32% optimize.loop_unroll : 0.000439s : 4.72% optimize.opt_after_cconv.c_1 : 0.000028s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.20% optimize.tuple_transform.d_1 : 0.000041s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.05% optimize.add_recomputation : 0.000049s : 0.53% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000008s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.09% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.06% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.05% optimize.control_data_broadcast_order : 0.000016s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000006s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000023s : 0.24% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.04% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000017s : 0.19% get_jit_bprop_graph : 0.000003s : 0.03% rewriter_after_jit_bprop_graph : 0.000005s : 0.06% opt_after_jit_grad : 0.000470s : 5.06% validate : 0.000038s : 0.40% Time group info: ------[substitution.] 0.000112 19 1.62% : 0.000002s : 2: substitution.elim_not_effective 1.35% : 0.000002s : 2: substitution.fold_const_symbol 4.84% : 0.000005s : 3: substitution.graph_param_transform 80.89% : 0.000091s : 2: substitution.inline 4.06% : 0.000005s : 4: substitution.j_node_and_user_rematch 4.03% : 0.000005s : 4: substitution.remove_not_recompute_node 3.22% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004586 2 90.76% : 0.004163s : 1: type_inference.infer 9.24% : 0.000424s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000089 2 100.00% : 0.000089s : 2: match.inline ------[predicate.] 0.000134 754 0.87% : 0.000001s : 7: predicate.accumulaten_eliminater 0.99% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.70% : 0.000001s : 6: predicate.addn_check_dump 0.85% : 0.000001s : 7: predicate.addn_zero_filter 0.73% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.41% : 0.000003s : 13: predicate.arithmetic_simplify 0.85% : 0.000001s : 7: predicate.cast_eliminate 0.76% : 0.000001s : 6: predicate.check_bprop_eliminate 0.79% : 0.000001s : 6: predicate.compare_switch_simplify 0.24% : 0.000000s : 3: predicate.const_output_eliminate 0.78% : 0.000001s : 6: predicate.depend_value_elim 0.78% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.94% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.79% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.29% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 3: predicate.elim_not_effective 0.60% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.05% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_depend_swap 2.00% : 0.000003s : 16: predicate.environ_get_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.95% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.98% : 0.000003s : 9: predicate.float_depend_g_call 0.65% : 0.000001s : 6: predicate.float_environ_get_switch 1.02% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.24% : 0.000000s : 3: predicate.fold_const_symbol 0.75% : 0.000001s : 6: predicate.get_grad_eliminate 0.22% : 0.000000s : 3: predicate.graph_param_transform 0.75% : 0.000001s : 6: predicate.incorporate_call 0.62% : 0.000001s : 6: predicate.incorporate_call_switch 6.51% : 0.000009s : 34: predicate.inline 1.15% : 0.000002s : 6: predicate.inline_without_move 0.54% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.08% : 0.000001s : 6: predicate.less_batch_normalization 1.75% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.09% : 0.000003s : 20: predicate.load_eliminater 1.07% : 0.000001s : 3: predicate.loop_unroll_after_grad 1.68% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.78% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.73% : 0.000001s : 6: predicate.merge_addn 0.69% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.73% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.74% : 0.000001s : 7: predicate.minmaximum_grad 1.32% : 0.000002s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.47% : 0.000001s : 3: predicate.parallel_virtual_node 1.28% : 0.000002s : 9: predicate.partial_defer_inline 1.26% : 0.000002s : 10: predicate.partial_eliminate 0.79% : 0.000001s : 7: predicate.print_const_string_wrapper 0.75% : 0.000001s : 6: predicate.reduce_all_const_elim 1.02% : 0.000001s : 7: predicate.reduce_eliminate 2.09% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.66% : 0.000001s : 6: predicate.remove_not_recompute_node 1.17% : 0.000002s : 13: predicate.replace_applicator 0.69% : 0.000001s : 6: predicate.replace_old_param 0.36% : 0.000000s : 3: predicate.reset_defer_inline 0.85% : 0.000001s : 7: predicate.reshape_eliminate 0.95% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 3: predicate.row_tensor_eliminate 0.92% : 0.000001s : 6: predicate.same_eliminate 0.70% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.03% : 0.000001s : 6: predicate.shard_identity_eliminate 0.85% : 0.000001s : 6: predicate.special_op_eliminate 0.90% : 0.000001s : 6: predicate.specialize_transform 1.13% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.88% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.46% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.02% : 0.000001s : 9: predicate.switch_defer_inline 1.77% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.42% : 0.000006s : 32: predicate.switch_simplify 0.75% : 0.000001s : 7: predicate.tile_eliminate 0.80% : 0.000001s : 7: predicate.transpose_eliminate 1.53% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.31% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.49% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.46% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.43% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.01% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 3.21% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.50% : 0.000001s : 3: predicate.value_based_eliminate 0.77% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.77% : 0.000001s : 6: predicate.virtual_output_eliminate 0.31% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.55% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000222 5 7.82% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.18% : 0.000204s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023594 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.49% : 0.003182s : 1: add_attr 13.42% : 0.003167s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000052s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.28% : 0.000067s : 1: auto_monad 0.10% : 0.000024s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 2.01% : 0.000474s : 1: bootstrap 0.14% : 0.000033s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000031s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000017s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.10% : 0.000023s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.04% : 0.000009s : 1: get_jit_bprop_graph 0.05% : 0.000011s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.88% : 0.000444s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.30% : 0.000543s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 3.24% : 0.000763s : 78: opt.transform.opt_a 0.11% : 0.000026s : 1: opt.transform.opt_after_cconv 0.10% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.41% : 0.000096s : 28: opt.transform.opt_b 0.19% : 0.000045s : 2: opt.transform.opt_trans_graph 0.15% : 0.000036s : 4: opt.transform.symbol_engine_opt 10.70% : 0.002524s : 1: opt_a 0.54% : 0.000127s : 1: opt_after_cconv 2.04% : 0.000480s : 1: opt_after_jit_grad 1.15% : 0.000272s : 1: opt_b 21.64% : 0.005106s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000026s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000035s : 1: pre_auto_parallel 0.11% : 0.000026s : 1: py_interpret_to_execute 0.08% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000007s : 1: remove_cast_before_assign_add 0.09% : 0.000022s : 1: remove_dup_value 1.26% : 0.000298s : 1: renormalize.infer 1.00% : 0.000235s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.05% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000044s : 1: rewriter_after_opt_a 0.23% : 0.000054s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000099s : 1: symbol_engine_optimizer 0.38% : 0.000089s : 1: tuple_transform 19.73% : 0.004656s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:44.299.57 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0134457, [21] [bootstrap]: 0.0004464 [type_inference]: 0.00473758 [event_method]: 1.166e-05 [auto_monad]: 5.534e-05 [graph_reusing]: 5.12999e-06 [inline]: 2.22999e-06 [add_attr]: 0.00317959, [1] [add_attr_with_inline]: 0.00317087, [1] [Cycle 1]: 5.424e-05, [2] [tag_attr]: 1.433e-05 [meta_addattr_fg_expand]: 3.5e-06 [parallel-infer-symbol]: 3.03e-06 [pre_auto_parallel]: 2.68e-05 [insert-virtual-dataset]: 2.43998e-06 [parallel-infer-symbol-second]: 8.89995e-07 [dataset_repeat_opt]: 2.67001e-06 [pipeline_split]: 1.71002e-06 [optimize]: 0.00425748, [53] [py_interpret_to_execute]: 1.874e-05 [rewriter_before_opt_a]: 4.53e-05 [opt_a]: 0.00223026, [2] [Cycle 1]: 0.00156993, [45] [expand_dump_flag]: 3.09001e-06 [switch_simplify]: 2.646e-05 [loop_unroll]: 1.334e-05 [a_1]: 0.00028429 [with_stream_mark]: 1.774e-05 [recompute_prepare]: 9.07999e-06 [updatestate_depend_eliminate]: 3.76999e-06 [updatestate_assign_eliminate]: 3.61001e-06 [updatestate_loads_eliminate]: 3.5e-06 [parameter_eliminate]: 2.09999e-06 [a_2]: 7.995e-05 [accelerated_algorithm]: 7.18998e-06 [shard]: 2.59001e-06 [meta_shard_fg_expand]: 1.86998e-06 [shard_inline]: 6.02001e-06 [merge_send_recv]: 9.72999e-06 [auto_parallel]: 6.72002e-06 [parallel]: 1.812e-05 [flash_sp]: 8.54e-06 [merge_comm]: 4.37998e-06 [allreduce_fusion]: 4.19002e-06 [matmul_add_comm_reduction]: 9.20999e-06 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 9.29e-06 [virtual_dataset]: 6.53e-06 [get_grad_eliminate_]: 5.47999e-06 [virtual_output]: 6.33e-06 [merge_forward]: 3.86001e-06 [cell_reuse_recompute_pass]: 1.22999e-06 [offload_activation]: 9.10999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.425e-05 [merge_recompute_call_nodes]: 1.42999e-06 [before_grad]: 1.05e-05 [set_forward_comm_id_for_comm_node_pass]: 4.23999e-06 [meta_fg_expand]: 2.59001e-06 [flash_sp_send_recv_attached]: 2.54001e-06 [receive_attached]: 2.17001e-06 [after_resolve]: 1.039e-05 [a_after_grad]: 9.31998e-06 [renormalize]: 0.00058131 [add_forward_monad_depend]: 5.61e-06 [auto_monad_grad]: 2.73998e-06 [auto_monad_eliminator]: 1.616e-05 [cse]: 3.219e-05 [a_3]: 4.676e-05 [Cycle 2]: 0.0006502, [45] [expand_dump_flag]: 1.15999e-06 [switch_simplify]: 7.25998e-06 [loop_unroll]: 5.77001e-06 [a_1]: 0.00010444 [with_stream_mark]: 1.545e-05 [recompute_prepare]: 7.41999e-06 [updatestate_depend_eliminate]: 3.63e-06 [updatestate_assign_eliminate]: 2.27999e-06 [updatestate_loads_eliminate]: 2.83e-06 [parameter_eliminate]: 1.59e-06 [a_2]: 6.956e-05 [accelerated_algorithm]: 6.36998e-06 [shard]: 1.92999e-06 [meta_shard_fg_expand]: 1.47001e-06 [shard_inline]: 6.47001e-06 [merge_send_recv]: 7.04001e-06 [auto_parallel]: 5.71e-06 [parallel]: 5.95002e-06 [flash_sp]: 4.05e-06 [merge_comm]: 3.36001e-06 [allreduce_fusion]: 3.46999e-06 [matmul_add_comm_reduction]: 6.55002e-06 [allreduce_slice_to_reducescatter]: 5.40022e-07 [virtual_shard_identity]: 6.89001e-06 [virtual_dataset]: 6.30002e-06 [get_grad_eliminate_]: 5.81e-06 [virtual_output]: 5.52999e-06 [merge_forward]: 3.14999e-06 [cell_reuse_recompute_pass]: 1.30001e-06 [offload_activation]: 7.01999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.493e-05 [merge_recompute_call_nodes]: 1.19998e-06 [before_grad]: 1.008e-05 [set_forward_comm_id_for_comm_node_pass]: 3.98999e-06 [meta_fg_expand]: 2.38002e-06 [flash_sp_send_recv_attached]: 1.20001e-06 [receive_attached]: 1.30999e-06 [after_resolve]: 9.25001e-06 [a_after_grad]: 8.38001e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.79999e-06 [auto_monad_grad]: 9.89996e-07 [auto_monad_eliminator]: 9.35001e-06 [cse]: 1.75e-05 [a_3]: 3.55e-05 [py_interpret_to_execute_after_opt_a]: 1.013e-05 [slice_cell_reuse_recomputed_activation]: 2.14999e-06 [rewriter_after_opt_a]: 3.64e-05 [convert_after_rewriter]: 6.58e-06 [order_py_execute_after_rewriter]: 5.54e-06 [mutable_eliminate]: 0.00052978 [opt_b]: 0.00020639, [1] [Cycle 1]: 0.00019982, [7] [b_1]: 0.00011754 [b_2]: 7.87e-06 [updatestate_depend_eliminate]: 7.38999e-06 [updatestate_assign_eliminate]: 2.26e-06 [updatestate_loads_eliminate]: 2.68998e-06 [renormalize]: 5.60016e-07 [cse]: 2.428e-05 [optimize_parallel_all_gather_comm]: 1.732e-05 [overlap_param_gather]: 2.01998e-06 [cconv]: 2.746e-05 [loop_unroll]: 0.00045013 [opt_after_cconv]: 0.00010457, [1] [Cycle 1]: 9.92e-05, [7] [c_1]: 2.766e-05 [parameter_eliminate]: 3.09999e-06 [updatestate_depend_eliminate]: 6.51e-06 [updatestate_assign_eliminate]: 2.20002e-06 [updatestate_loads_eliminate]: 2.59999e-06 [cse]: 2.241e-05 [renormalize]: 3.19997e-07 [remove_dup_value]: 1.594e-05 [tuple_transform]: 7.24e-05, [1] [Cycle 1]: 6.808e-05, [4] [d_1]: 4.089e-05 [none_parameter_eliminate]: 1.72001e-06 [renormalize]: 2.80008e-07 [switch_simplify]: 6.54999e-06 [partial_unused_args_eliminate]: 2.26998e-06 [add_recomputation]: 4.632e-05 [cse_after_recomputation]: 2.315e-05, [1] [Cycle 1]: 1.843e-05, [1] [cse]: 1.203e-05 [environ_conv]: 5.68002e-06 [swap_dp_allreduce_reducescatter]: 5.02e-06 [bias_add_comm_swap]: 2.32999e-06 [label_micro_interleaved_index]: 4.18001e-06 [label_fine_grained_interleaved_index]: 2.69001e-06 [merge_cast_opt]: 1.30001e-06 [slice_recompute_activation]: 2.00002e-06 [micro_interleaved_order_control]: 2.07001e-06 [assign_add_opt]: 1.17e-06 [ForceFp32Comm]: 1.19e-06 [remove_cast_before_assign_add]: 1.24e-06 [full_micro_interleaved_order_control]: 2.21e-06 [reorder_send_recv_between_fp_bp]: 2.71e-06 [comm_op_add_attrs]: 1.04003e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.25999e-06 [interleave_parallel_branches]: 1.12e-06 [overlap_opt_shard_in_pipeline]: 1.32999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.96e-06 [control_data_broadcast_order]: 1.368e-05 [grouped_pairwise_exchange_alltoall]: 1.65001e-06 [offloading_packed_experts]: 3.88001e-06 [overlap_recompute_and_grad_model_parallel]: 5.65001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.31998e-06 [overlap_recompute_comm]: 2.53998e-06 [overlap_grad_ring_attention]: 3.9e-06 [overlap_grad_flash_sp]: 1.849e-05 [begin_end_overlap_inline]: 6.00005e-07 [split_matmul_comm_elemetwise]: 2.32999e-06 [split_layernorm_comm]: 1.65001e-06 [handle_group_info]: 1.05001e-06 [symbol_engine_optimizer]: 8.105e-05, [1] [Cycle 1]: 7.697e-05, [6] [build]: 2.98998e-06 [elim_shapecalc]: 1.104e-05 [elim_not_effective]: 1.386e-05 [opt_reshape]: 7.30998e-06 [fold_const_symbol]: 1.033e-05 [renormalize]: 2.50002e-07 [detach_backward]: 2.59999e-06 [pipeline_parallel_scheduler]: 1.56998e-06 [auto_monad_reorder]: 1.872e-05 [get_jit_bprop_graph]: 1.87999e-06 [rewriter_after_jit_bprop_graph]: 4.53001e-06 [opt_after_jit_grad]: 0.00048731 [validate]: 3.994e-05 Sums bootstrap : 0.000446s : 4.82% type_inference : 0.004738s : 51.16% event_method : 0.000012s : 0.13% auto_monad : 0.000055s : 0.60% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000003s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000027s : 0.29% insert-virtual-dataset : 0.000002s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000003s : 0.03% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000019s : 0.20% optimize.rewriter_before_opt_a : 0.000045s : 0.49% optimize.opt_a.expand_dump_flag : 0.000004s : 0.05% optimize.opt_a.switch_simplify : 0.000034s : 0.36% optimize.opt_a.loop_unroll : 0.000019s : 0.21% optimize.opt_a.a_1 : 0.000389s : 4.20% optimize.opt_a.with_stream_mark : 0.000033s : 0.36% optimize.opt_a.recompute_prepare : 0.000016s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.07% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000150s : 1.61% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.15% optimize.opt_a.shard : 0.000005s : 0.05% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.04% optimize.opt_a.shard_inline : 0.000012s : 0.13% optimize.opt_a.merge_send_recv : 0.000017s : 0.18% optimize.opt_a.auto_parallel : 0.000012s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.26% optimize.opt_a.flash_sp : 0.000013s : 0.14% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.17% optimize.opt_a.virtual_dataset : 0.000013s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000021s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.04% optimize.opt_a.after_resolve : 0.000020s : 0.21% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000581s : 6.28% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.09% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.28% optimize.opt_a.cse : 0.000050s : 0.54% optimize.opt_a.a_3 : 0.000082s : 0.89% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000036s : 0.39% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000530s : 5.72% optimize.opt_b.b_1 : 0.000118s : 1.27% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000024s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.19% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000027s : 0.30% optimize.loop_unroll : 0.000450s : 4.86% optimize.opt_after_cconv.c_1 : 0.000028s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.24% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.17% optimize.tuple_transform.d_1 : 0.000041s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000046s : 0.50% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.03% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000018s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.03% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000019s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000487s : 5.26% validate : 0.000040s : 0.43% Time group info: ------[substitution.] 0.000109 19 1.94% : 0.000002s : 2: substitution.elim_not_effective 1.15% : 0.000001s : 2: substitution.fold_const_symbol 5.43% : 0.000006s : 3: substitution.graph_param_transform 80.53% : 0.000088s : 2: substitution.inline 3.82% : 0.000004s : 4: substitution.j_node_and_user_rematch 4.20% : 0.000005s : 4: substitution.remove_not_recompute_node 2.95% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004689 2 90.94% : 0.004264s : 1: type_inference.infer 9.06% : 0.000425s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000086 2 100.00% : 0.000086s : 2: match.inline ------[predicate.] 0.000135 754 0.80% : 0.000001s : 7: predicate.accumulaten_eliminater 1.27% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.65% : 0.000001s : 6: predicate.addn_check_dump 0.80% : 0.000001s : 7: predicate.addn_zero_filter 0.70% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.16% : 0.000003s : 13: predicate.arithmetic_simplify 0.78% : 0.000001s : 7: predicate.cast_eliminate 0.76% : 0.000001s : 6: predicate.check_bprop_eliminate 0.70% : 0.000001s : 6: predicate.compare_switch_simplify 0.24% : 0.000000s : 3: predicate.const_output_eliminate 0.71% : 0.000001s : 6: predicate.depend_value_elim 0.78% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 1.00% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.72% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.30% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 3: predicate.elim_not_effective 0.58% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.03% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.95% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_depend_swap 1.74% : 0.000002s : 16: predicate.environ_get_eliminate 0.96% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.88% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.90% : 0.000003s : 9: predicate.float_depend_g_call 0.61% : 0.000001s : 6: predicate.float_environ_get_switch 0.88% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 3: predicate.fold_const_symbol 0.82% : 0.000001s : 6: predicate.get_grad_eliminate 0.35% : 0.000000s : 3: predicate.graph_param_transform 0.76% : 0.000001s : 6: predicate.incorporate_call 0.63% : 0.000001s : 6: predicate.incorporate_call_switch 6.44% : 0.000009s : 34: predicate.inline 1.31% : 0.000002s : 6: predicate.inline_without_move 0.39% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.23% : 0.000002s : 6: predicate.less_batch_normalization 1.53% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.02% : 0.000003s : 20: predicate.load_eliminater 1.37% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.66% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.66% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.71% : 0.000001s : 6: predicate.merge_addn 0.62% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.68% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.68% : 0.000001s : 7: predicate.minmaximum_grad 1.75% : 0.000002s : 3: predicate.mutable_eliminate 0.83% : 0.000001s : 3: predicate.opt_reshape 0.72% : 0.000001s : 3: predicate.parallel_virtual_node 1.13% : 0.000002s : 9: predicate.partial_defer_inline 1.19% : 0.000002s : 10: predicate.partial_eliminate 0.75% : 0.000001s : 7: predicate.print_const_string_wrapper 0.75% : 0.000001s : 6: predicate.reduce_all_const_elim 0.95% : 0.000001s : 7: predicate.reduce_eliminate 2.00% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.64% : 0.000001s : 6: predicate.remove_not_recompute_node 1.17% : 0.000002s : 13: predicate.replace_applicator 0.78% : 0.000001s : 6: predicate.replace_old_param 0.35% : 0.000000s : 3: predicate.reset_defer_inline 0.79% : 0.000001s : 7: predicate.reshape_eliminate 0.66% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.57% : 0.000001s : 3: predicate.row_tensor_eliminate 1.04% : 0.000001s : 6: predicate.same_eliminate 0.83% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.22% : 0.000002s : 6: predicate.shard_identity_eliminate 0.82% : 0.000001s : 6: predicate.special_op_eliminate 1.20% : 0.000002s : 6: predicate.specialize_transform 1.26% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.83% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.97% : 0.000001s : 9: predicate.switch_defer_inline 1.62% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.51% : 0.000006s : 32: predicate.switch_simplify 0.69% : 0.000001s : 7: predicate.tile_eliminate 0.75% : 0.000001s : 7: predicate.transpose_eliminate 1.58% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.37% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.32% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.50% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.47% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.47% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.90% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.95% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.66% : 0.000001s : 3: predicate.value_based_eliminate 0.95% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.92% : 0.000001s : 6: predicate.virtual_output_eliminate 0.32% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.56% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000213 5 8.69% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.31% : 0.000195s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022356 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.24% : 0.003184s : 1: add_attr 14.20% : 0.003174s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000051s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000061s : 1: auto_monad 0.11% : 0.000024s : 1: auto_monad_reorder 0.02% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.13% : 0.000477s : 1: bootstrap 0.14% : 0.000031s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000017s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.12% : 0.000026s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.03% : 0.000006s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.08% : 0.000019s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000008s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 2.05% : 0.000459s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.41% : 0.000539s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 3.39% : 0.000759s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.12% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.42% : 0.000094s : 28: opt.transform.opt_b 0.20% : 0.000045s : 2: opt.transform.opt_trans_graph 0.17% : 0.000038s : 4: opt.transform.symbol_engine_opt 9.99% : 0.002233s : 1: opt_a 0.48% : 0.000108s : 1: opt_after_cconv 2.23% : 0.000498s : 1: opt_after_jit_grad 0.94% : 0.000210s : 1: opt_b 19.06% : 0.004262s : 1: optimize 0.10% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000022s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.14% : 0.000031s : 1: pre_auto_parallel 0.10% : 0.000022s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000005s : 1: remove_cast_before_assign_add 0.09% : 0.000019s : 1: remove_dup_value 1.56% : 0.000349s : 1: renormalize.infer 1.00% : 0.000224s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000041s : 1: rewriter_after_opt_a 0.22% : 0.000050s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.37% : 0.000084s : 1: symbol_engine_optimizer 0.34% : 0.000075s : 1: tuple_transform 21.27% : 0.004755s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:44.221.664 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:44.221.928 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.014027, [21] [bootstrap]: 0.000438 [type_inference]: 0.00464892 [event_method]: 1.145e-05 [auto_monad]: 6.506e-05 [graph_reusing]: 5.64e-06 [inline]: 2.33998e-06 [add_attr]: 0.00307215, [1] [add_attr_with_inline]: 0.00306392, [1] [Cycle 1]: 6.084e-05, [2] [tag_attr]: 1.411e-05 [meta_addattr_fg_expand]: 3.63999e-06 [parallel-infer-symbol]: 3.11999e-06 [pre_auto_parallel]: 2.426e-05 [insert-virtual-dataset]: 2.68e-06 [parallel-infer-symbol-second]: 7.10017e-07 [dataset_repeat_opt]: 1.95001e-06 [pipeline_split]: 1.55001e-06 [optimize]: 0.00463165, [53] [py_interpret_to_execute]: 2.062e-05 [rewriter_before_opt_a]: 4.719e-05 [opt_a]: 0.002475, [2] [Cycle 1]: 0.00164909, [45] [expand_dump_flag]: 3.41001e-06 [switch_simplify]: 2.48e-05 [loop_unroll]: 1.388e-05 [a_1]: 0.00031863 [with_stream_mark]: 1.491e-05 [recompute_prepare]: 8.49002e-06 [updatestate_depend_eliminate]: 4.1e-06 [updatestate_assign_eliminate]: 4.38001e-06 [updatestate_loads_eliminate]: 2.78998e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 0.00011938 [accelerated_algorithm]: 6.93998e-06 [shard]: 2.21e-06 [meta_shard_fg_expand]: 1.74e-06 [shard_inline]: 5.94999e-06 [merge_send_recv]: 8.39002e-06 [auto_parallel]: 6.56e-06 [parallel]: 1.928e-05 [flash_sp]: 7.6e-06 [merge_comm]: 4.17e-06 [allreduce_fusion]: 3.93999e-06 [matmul_add_comm_reduction]: 9.44e-06 [allreduce_slice_to_reducescatter]: 5.8001e-07 [virtual_shard_identity]: 8.33999e-06 [virtual_dataset]: 6.48e-06 [get_grad_eliminate_]: 5.59e-06 [virtual_output]: 6.72002e-06 [merge_forward]: 3.61999e-06 [cell_reuse_recompute_pass]: 1.19e-06 [offload_activation]: 9.16998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.449e-05 [merge_recompute_call_nodes]: 1.59e-06 [before_grad]: 1.017e-05 [set_forward_comm_id_for_comm_node_pass]: 3.92998e-06 [meta_fg_expand]: 2.52001e-06 [flash_sp_send_recv_attached]: 2.62001e-06 [receive_attached]: 2.26e-06 [after_resolve]: 9.67999e-06 [a_after_grad]: 9.43002e-06 [renormalize]: 0.00045359 [add_forward_monad_depend]: 4.48999e-06 [auto_monad_grad]: 1.94999e-06 [auto_monad_eliminator]: 1.634e-05 [cse]: 3.006e-05 [a_3]: 5.739e-05 [Cycle 2]: 0.00081342, [45] [expand_dump_flag]: 9.29984e-07 [switch_simplify]: 7.38e-06 [loop_unroll]: 6.66e-06 [a_1]: 0.0001298 [with_stream_mark]: 1.305e-05 [recompute_prepare]: 6.43e-06 [updatestate_depend_eliminate]: 3.09999e-06 [updatestate_assign_eliminate]: 2.52001e-06 [updatestate_loads_eliminate]: 2.56e-06 [parameter_eliminate]: 1.01002e-06 [a_2]: 0.00010983 [accelerated_algorithm]: 6.32001e-06 [shard]: 1.05001e-06 [meta_shard_fg_expand]: 1.34998e-06 [shard_inline]: 6.14001e-06 [merge_send_recv]: 4.72998e-06 [auto_parallel]: 5.67001e-06 [parallel]: 4.55001e-06 [flash_sp]: 3.23e-06 [merge_comm]: 3.09999e-06 [allreduce_fusion]: 2.82002e-06 [matmul_add_comm_reduction]: 6.08998e-06 [allreduce_slice_to_reducescatter]: 3.29979e-07 [virtual_shard_identity]: 6.39001e-06 [virtual_dataset]: 5.74e-06 [get_grad_eliminate_]: 5.24998e-06 [virtual_output]: 5.42999e-06 [merge_forward]: 2.89999e-06 [cell_reuse_recompute_pass]: 1.27e-06 [offload_activation]: 6.39001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.291e-05 [merge_recompute_call_nodes]: 9.00007e-07 [before_grad]: 9.02e-06 [set_forward_comm_id_for_comm_node_pass]: 3.6e-06 [meta_fg_expand]: 2.14e-06 [flash_sp_send_recv_attached]: 9.10019e-07 [receive_attached]: 1.08001e-06 [after_resolve]: 8.76002e-06 [a_after_grad]: 8.05999e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.19e-06 [auto_monad_grad]: 9.80013e-07 [auto_monad_eliminator]: 7.75998e-06 [cse]: 1.419e-05 [a_3]: 4.768e-05 [py_interpret_to_execute_after_opt_a]: 1.163e-05 [slice_cell_reuse_recomputed_activation]: 5.07e-06 [rewriter_after_opt_a]: 3.792e-05 [convert_after_rewriter]: 9.92001e-06 [order_py_execute_after_rewriter]: 8.65999e-06 [mutable_eliminate]: 0.00048568 [opt_b]: 0.00026154, [1] [Cycle 1]: 0.00025262, [7] [b_1]: 0.00016115 [b_2]: 7.9e-06 [updatestate_depend_eliminate]: 4.98001e-06 [updatestate_assign_eliminate]: 2.55002e-06 [updatestate_loads_eliminate]: 2.32999e-06 [renormalize]: 4.89992e-07 [cse]: 1.834e-05 [optimize_parallel_all_gather_comm]: 1.881e-05 [overlap_param_gather]: 4.77e-06 [cconv]: 2.596e-05 [loop_unroll]: 0.00043966 [opt_after_cconv]: 0.00012285, [1] [Cycle 1]: 0.00011468, [7] [c_1]: 2.705e-05 [parameter_eliminate]: 2.34001e-06 [updatestate_depend_eliminate]: 4.99003e-06 [updatestate_assign_eliminate]: 2.61e-06 [updatestate_loads_eliminate]: 2.50997e-06 [cse]: 1.82e-05 [renormalize]: 4.2998e-07 [remove_dup_value]: 1.624e-05 [tuple_transform]: 8.537e-05, [1] [Cycle 1]: 7.859e-05, [4] [d_1]: 4.073e-05 [none_parameter_eliminate]: 1.62999e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 6.69999e-06 [partial_unused_args_eliminate]: 4.43999e-06 [add_recomputation]: 4.661e-05 [cse_after_recomputation]: 2.738e-05, [1] [Cycle 1]: 2.099e-05, [1] [cse]: 1.217e-05 [environ_conv]: 8.00999e-06 [swap_dp_allreduce_reducescatter]: 8.09002e-06 [bias_add_comm_swap]: 5.11997e-06 [label_micro_interleaved_index]: 6.74001e-06 [label_fine_grained_interleaved_index]: 5.44e-06 [merge_cast_opt]: 3.97002e-06 [slice_recompute_activation]: 4.31002e-06 [micro_interleaved_order_control]: 4.69002e-06 [assign_add_opt]: 3.8e-06 [ForceFp32Comm]: 3.45e-06 [remove_cast_before_assign_add]: 3.56999e-06 [full_micro_interleaved_order_control]: 4.96002e-06 [reorder_send_recv_between_fp_bp]: 5.39e-06 [comm_op_add_attrs]: 3.77002e-06 [add_comm_op_reuse_tag]: 3.33e-06 [interleave_split_concat_branches]: 3.65e-06 [interleave_parallel_branches]: 3.53e-06 [overlap_opt_shard_in_pipeline]: 3.80998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.39998e-06 [control_data_broadcast_order]: 1.467e-05 [grouped_pairwise_exchange_alltoall]: 4.2e-06 [offloading_packed_experts]: 6.59999e-06 [overlap_recompute_and_grad_model_parallel]: 7.05998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.49001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.65e-06 [overlap_recompute_comm]: 4.99e-06 [overlap_grad_ring_attention]: 6.68e-06 [overlap_grad_flash_sp]: 2.032e-05 [begin_end_overlap_inline]: 2.89999e-06 [split_matmul_comm_elemetwise]: 4.59998e-06 [split_layernorm_comm]: 4.48999e-06 [handle_group_info]: 3.63999e-06 [symbol_engine_optimizer]: 9.408e-05, [1] [Cycle 1]: 8.702e-05, [6] [build]: 2.39001e-06 [elim_shapecalc]: 9.05001e-06 [elim_not_effective]: 1.269e-05 [opt_reshape]: 7.03e-06 [fold_const_symbol]: 9.91998e-06 [renormalize]: 2.29978e-07 [detach_backward]: 3.23e-06 [pipeline_parallel_scheduler]: 1.62999e-06 [auto_monad_reorder]: 1.793e-05 [get_jit_bprop_graph]: 1.14e-06 [rewriter_after_jit_bprop_graph]: 4.60001e-06 [opt_after_jit_grad]: 0.00047085 [validate]: 3.573e-05 Sums bootstrap : 0.000438s : 4.75% type_inference : 0.004649s : 50.46% event_method : 0.000011s : 0.12% auto_monad : 0.000065s : 0.71% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000024s : 0.26% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000021s : 0.22% optimize.rewriter_before_opt_a : 0.000047s : 0.51% optimize.opt_a.expand_dump_flag : 0.000004s : 0.05% optimize.opt_a.switch_simplify : 0.000032s : 0.35% optimize.opt_a.loop_unroll : 0.000021s : 0.22% optimize.opt_a.a_1 : 0.000448s : 4.87% optimize.opt_a.with_stream_mark : 0.000028s : 0.30% optimize.opt_a.recompute_prepare : 0.000015s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000229s : 2.49% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.14% optimize.opt_a.shard : 0.000003s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.13% optimize.opt_a.merge_send_recv : 0.000013s : 0.14% optimize.opt_a.auto_parallel : 0.000012s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.26% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.03% optimize.opt_a.before_grad : 0.000019s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.04% optimize.opt_a.after_resolve : 0.000018s : 0.20% optimize.opt_a.a_after_grad : 0.000017s : 0.19% optimize.opt_a.renormalize : 0.000454s : 4.92% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.26% optimize.opt_a.cse : 0.000044s : 0.48% optimize.opt_a.a_3 : 0.000105s : 1.14% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.06% optimize.rewriter_after_opt_a : 0.000038s : 0.41% optimize.convert_after_rewriter : 0.000010s : 0.11% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000486s : 5.27% optimize.opt_b.b_1 : 0.000161s : 1.75% optimize.opt_b.b_2 : 0.000008s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.01% optimize.opt_b.cse : 0.000018s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000026s : 0.28% optimize.loop_unroll : 0.000440s : 4.77% optimize.opt_after_cconv.c_1 : 0.000027s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.18% optimize.tuple_transform.d_1 : 0.000041s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.05% optimize.add_recomputation : 0.000047s : 0.51% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000008s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.09% optimize.bias_add_comm_swap : 0.000005s : 0.06% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.06% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.04% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.06% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.04% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.05% optimize.control_data_broadcast_order : 0.000015s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.05% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000020s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.05% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000002s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.19% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000471s : 5.11% validate : 0.000036s : 0.39% Time group info: ------[substitution.] 0.000171 23 38.76% : 0.000066s : 4: substitution.arithmetic_simplify 1.14% : 0.000002s : 2: substitution.elim_not_effective 0.77% : 0.000001s : 2: substitution.fold_const_symbol 3.17% : 0.000005s : 3: substitution.graph_param_transform 49.70% : 0.000085s : 2: substitution.inline 1.82% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.68% : 0.000005s : 4: substitution.remove_not_recompute_node 1.95% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004605 2 91.18% : 0.004199s : 1: type_inference.infer 8.82% : 0.000406s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000083 2 100.00% : 0.000083s : 2: match.inline ------[predicate.] 0.000136 754 0.97% : 0.000001s : 7: predicate.accumulaten_eliminater 1.06% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.66% : 0.000001s : 6: predicate.addn_check_dump 0.93% : 0.000001s : 7: predicate.addn_zero_filter 0.69% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.92% : 0.000004s : 13: predicate.arithmetic_simplify 0.85% : 0.000001s : 7: predicate.cast_eliminate 0.76% : 0.000001s : 6: predicate.check_bprop_eliminate 0.67% : 0.000001s : 6: predicate.compare_switch_simplify 0.20% : 0.000000s : 3: predicate.const_output_eliminate 0.80% : 0.000001s : 6: predicate.depend_value_elim 0.76% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 1.08% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.74% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.27% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 3: predicate.elim_not_effective 0.49% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.05% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.00% : 0.000001s : 10: predicate.environ_get_depend_swap 1.75% : 0.000002s : 16: predicate.environ_get_eliminate 1.00% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.97% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.92% : 0.000003s : 9: predicate.float_depend_g_call 0.71% : 0.000001s : 6: predicate.float_environ_get_switch 0.98% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 3: predicate.fold_const_symbol 0.78% : 0.000001s : 6: predicate.get_grad_eliminate 0.35% : 0.000000s : 3: predicate.graph_param_transform 0.77% : 0.000001s : 6: predicate.incorporate_call 0.63% : 0.000001s : 6: predicate.incorporate_call_switch 6.43% : 0.000009s : 34: predicate.inline 0.97% : 0.000001s : 6: predicate.inline_without_move 0.37% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.03% : 0.000001s : 6: predicate.less_batch_normalization 1.64% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.06% : 0.000003s : 20: predicate.load_eliminater 1.13% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.67% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.81% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.70% : 0.000001s : 6: predicate.merge_addn 0.70% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.68% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.66% : 0.000001s : 7: predicate.minmaximum_grad 1.13% : 0.000002s : 3: predicate.mutable_eliminate 0.53% : 0.000001s : 3: predicate.opt_reshape 0.49% : 0.000001s : 3: predicate.parallel_virtual_node 1.38% : 0.000002s : 9: predicate.partial_defer_inline 1.18% : 0.000002s : 10: predicate.partial_eliminate 0.77% : 0.000001s : 7: predicate.print_const_string_wrapper 0.71% : 0.000001s : 6: predicate.reduce_all_const_elim 1.19% : 0.000002s : 7: predicate.reduce_eliminate 2.02% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.64% : 0.000001s : 6: predicate.remove_not_recompute_node 1.17% : 0.000002s : 13: predicate.replace_applicator 0.70% : 0.000001s : 6: predicate.replace_old_param 0.33% : 0.000000s : 3: predicate.reset_defer_inline 0.84% : 0.000001s : 7: predicate.reshape_eliminate 0.84% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.57% : 0.000001s : 3: predicate.row_tensor_eliminate 0.88% : 0.000001s : 6: predicate.same_eliminate 0.55% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.00% : 0.000001s : 6: predicate.shard_identity_eliminate 0.94% : 0.000001s : 6: predicate.special_op_eliminate 1.08% : 0.000001s : 6: predicate.specialize_transform 1.19% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.99% : 0.000001s : 9: predicate.switch_defer_inline 1.72% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.61% : 0.000006s : 32: predicate.switch_simplify 0.70% : 0.000001s : 7: predicate.tile_eliminate 0.78% : 0.000001s : 7: predicate.transpose_eliminate 1.64% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.02% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.71% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.80% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.72% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.95% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.80% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.53% : 0.000001s : 3: predicate.value_based_eliminate 0.82% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.77% : 0.000001s : 6: predicate.virtual_output_eliminate 0.33% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000193 5 8.21% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.79% : 0.000177s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023153 192 0.03% : 0.000006s : 1: ForceFp32Comm 13.31% : 0.003081s : 1: add_attr 13.25% : 0.003067s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000050s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.32% : 0.000074s : 1: auto_monad 0.11% : 0.000025s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 2.08% : 0.000482s : 1: bootstrap 0.13% : 0.000029s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.06% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.08% : 0.000018s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.09% : 0.000021s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.04% : 0.000008s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.04% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000009s : 1: label_micro_interleaved_index 1.92% : 0.000445s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.12% : 0.000492s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000013s : 1: opt.transform.mutable_eliminate 3.61% : 0.000836s : 78: opt.transform.opt_a 0.11% : 0.000026s : 1: opt.transform.opt_after_cconv 0.10% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.42% : 0.000097s : 28: opt.transform.opt_b 0.20% : 0.000045s : 2: opt.transform.opt_trans_graph 0.15% : 0.000035s : 4: opt.transform.symbol_engine_opt 10.70% : 0.002478s : 1: opt_a 0.54% : 0.000126s : 1: opt_after_cconv 2.08% : 0.000481s : 1: opt_after_jit_grad 1.14% : 0.000265s : 1: opt_b 21.54% : 0.004988s : 1: optimize 0.10% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000023s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000032s : 1: pre_auto_parallel 0.10% : 0.000024s : 1: py_interpret_to_execute 0.07% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000020s : 1: remove_dup_value 1.07% : 0.000248s : 1: renormalize.infer 0.86% : 0.000199s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000041s : 1: rewriter_after_opt_a 0.22% : 0.000051s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000097s : 1: symbol_engine_optimizer 0.38% : 0.000089s : 1: tuple_transform 20.19% : 0.004674s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:44.410.473 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0135924, [21] [bootstrap]: 0.00043418 [type_inference]: 0.00484155 [event_method]: 1.204e-05 [auto_monad]: 5.585e-05 [graph_reusing]: 5.75001e-06 [inline]: 2.67001e-06 [add_attr]: 0.0031502, [1] [add_attr_with_inline]: 0.00314044, [1] [Cycle 1]: 5.269e-05, [2] [tag_attr]: 1.413e-05 [meta_addattr_fg_expand]: 4.18999e-06 [parallel-infer-symbol]: 2.96999e-06 [pre_auto_parallel]: 2.711e-05 [insert-virtual-dataset]: 2.70002e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.69999e-06 [pipeline_split]: 1.54e-06 [optimize]: 0.0043303, [53] [py_interpret_to_execute]: 1.934e-05 [rewriter_before_opt_a]: 4.374e-05 [opt_a]: 0.00229475, [2] [Cycle 1]: 0.00161275, [45] [expand_dump_flag]: 2.57001e-06 [switch_simplify]: 2.571e-05 [loop_unroll]: 1.374e-05 [a_1]: 0.00032668 [with_stream_mark]: 1.672e-05 [recompute_prepare]: 9.10001e-06 [updatestate_depend_eliminate]: 4.02998e-06 [updatestate_assign_eliminate]: 3.76001e-06 [updatestate_loads_eliminate]: 3.08e-06 [parameter_eliminate]: 2.149e-05 [a_2]: 9.633e-05 [accelerated_algorithm]: 7.96001e-06 [shard]: 2.16e-06 [meta_shard_fg_expand]: 1.72999e-06 [shard_inline]: 6.17001e-06 [merge_send_recv]: 8.82e-06 [auto_parallel]: 7.16999e-06 [parallel]: 1.837e-05 [flash_sp]: 9.02999e-06 [merge_comm]: 4e-06 [allreduce_fusion]: 3.93001e-06 [matmul_add_comm_reduction]: 9.55001e-06 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 8.35999e-06 [virtual_dataset]: 6.58e-06 [get_grad_eliminate_]: 5.94e-06 [virtual_output]: 6.46e-06 [merge_forward]: 3.83999e-06 [cell_reuse_recompute_pass]: 1.43002e-06 [offload_activation]: 1.009e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.361e-05 [merge_recompute_call_nodes]: 1.59998e-06 [before_grad]: 1.088e-05 [set_forward_comm_id_for_comm_node_pass]: 3.88999e-06 [meta_fg_expand]: 3.21001e-06 [flash_sp_send_recv_attached]: 2.71e-06 [receive_attached]: 2.23998e-06 [after_resolve]: 1.072e-05 [a_after_grad]: 9.70002e-06 [renormalize]: 0.00054873 [add_forward_monad_depend]: 5.29998e-06 [auto_monad_grad]: 2.34001e-06 [auto_monad_eliminator]: 1.574e-05 [cse]: 3.131e-05 [a_3]: 4.818e-05 [Cycle 2]: 0.00067209, [45] [expand_dump_flag]: 1.22e-06 [switch_simplify]: 7.29001e-06 [loop_unroll]: 5.89e-06 [a_1]: 0.00013065 [with_stream_mark]: 1.222e-05 [recompute_prepare]: 6.52001e-06 [updatestate_depend_eliminate]: 4.04002e-06 [updatestate_assign_eliminate]: 2.44999e-06 [updatestate_loads_eliminate]: 2.85998e-06 [parameter_eliminate]: 1.20001e-06 [a_2]: 8.135e-05 [accelerated_algorithm]: 6.62002e-06 [shard]: 1.34e-06 [meta_shard_fg_expand]: 1.37e-06 [shard_inline]: 5.86998e-06 [merge_send_recv]: 6.13998e-06 [auto_parallel]: 5.56e-06 [parallel]: 5.62999e-06 [flash_sp]: 4.02e-06 [merge_comm]: 3.86999e-06 [allreduce_fusion]: 3.2e-06 [matmul_add_comm_reduction]: 6.58998e-06 [allreduce_slice_to_reducescatter]: 5.40022e-07 [virtual_shard_identity]: 6.61e-06 [virtual_dataset]: 5.52001e-06 [get_grad_eliminate_]: 5.24998e-06 [virtual_output]: 5.25001e-06 [merge_forward]: 3.16999e-06 [cell_reuse_recompute_pass]: 1.44998e-06 [offload_activation]: 7.02002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.136e-05 [merge_recompute_call_nodes]: 9.29984e-07 [before_grad]: 8.82999e-06 [set_forward_comm_id_for_comm_node_pass]: 4.08999e-06 [meta_fg_expand]: 2.55002e-06 [flash_sp_send_recv_attached]: 1.19003e-06 [receive_attached]: 1.50999e-06 [after_resolve]: 9.67001e-06 [a_after_grad]: 8.61002e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.87999e-06 [auto_monad_grad]: 9.70002e-07 [auto_monad_eliminator]: 9.42001e-06 [cse]: 1.718e-05 [a_3]: 3.561e-05 [py_interpret_to_execute_after_opt_a]: 1.015e-05 [slice_cell_reuse_recomputed_activation]: 2.27999e-06 [rewriter_after_opt_a]: 3.66e-05 [convert_after_rewriter]: 7.29001e-06 [order_py_execute_after_rewriter]: 5.19998e-06 [mutable_eliminate]: 0.00052569 [opt_b]: 0.00021866, [1] [Cycle 1]: 0.00021174, [7] [b_1]: 0.00012627 [b_2]: 8.08001e-06 [updatestate_depend_eliminate]: 6.98998e-06 [updatestate_assign_eliminate]: 2.57001e-06 [updatestate_loads_eliminate]: 2.63998e-06 [renormalize]: 5.60016e-07 [cse]: 2.291e-05 [optimize_parallel_all_gather_comm]: 1.712e-05 [overlap_param_gather]: 1.93002e-06 [cconv]: 2.689e-05 [loop_unroll]: 0.00045768 [opt_after_cconv]: 0.00010315, [1] [Cycle 1]: 9.752e-05, [7] [c_1]: 2.733e-05 [parameter_eliminate]: 2.65002e-06 [updatestate_depend_eliminate]: 6.61e-06 [updatestate_assign_eliminate]: 2.32001e-06 [updatestate_loads_eliminate]: 2.66999e-06 [cse]: 2.101e-05 [renormalize]: 6.19999e-07 [remove_dup_value]: 1.471e-05 [tuple_transform]: 7.16e-05, [1] [Cycle 1]: 6.737e-05, [4] [d_1]: 3.985e-05 [none_parameter_eliminate]: 1.81e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 6.86001e-06 [partial_unused_args_eliminate]: 2.15002e-06 [add_recomputation]: 4.762e-05 [cse_after_recomputation]: 2.207e-05, [1] [Cycle 1]: 1.756e-05, [1] [cse]: 1.197e-05 [environ_conv]: 5.66e-06 [swap_dp_allreduce_reducescatter]: 5.12e-06 [bias_add_comm_swap]: 2.49999e-06 [label_micro_interleaved_index]: 4.48001e-06 [label_fine_grained_interleaved_index]: 2.79999e-06 [merge_cast_opt]: 1.44e-06 [slice_recompute_activation]: 2.04e-06 [micro_interleaved_order_control]: 2.02001e-06 [assign_add_opt]: 1.24e-06 [ForceFp32Comm]: 1.04998e-06 [remove_cast_before_assign_add]: 1.27e-06 [full_micro_interleaved_order_control]: 2.16e-06 [reorder_send_recv_between_fp_bp]: 2.60002e-06 [comm_op_add_attrs]: 1.02998e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.19003e-06 [overlap_opt_shard_in_pipeline]: 1.12e-06 [overlap_opt_shard_grad_in_pipeline]: 2.09999e-06 [control_data_broadcast_order]: 1.259e-05 [grouped_pairwise_exchange_alltoall]: 1.71e-06 [offloading_packed_experts]: 3.88001e-06 [overlap_recompute_and_grad_model_parallel]: 4.84998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.07e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.49001e-06 [overlap_grad_ring_attention]: 3.91999e-06 [overlap_grad_flash_sp]: 2.037e-05 [begin_end_overlap_inline]: 1.00001e-06 [split_matmul_comm_elemetwise]: 2.22001e-06 [split_layernorm_comm]: 1.65001e-06 [handle_group_info]: 9.30013e-07 [symbol_engine_optimizer]: 7.895e-05, [1] [Cycle 1]: 7.448e-05, [6] [build]: 2.78e-06 [elim_shapecalc]: 1.11e-05 [elim_not_effective]: 1.275e-05 [opt_reshape]: 7.06001e-06 [fold_const_symbol]: 9.89001e-06 [renormalize]: 3.50003e-07 [detach_backward]: 2.13002e-06 [pipeline_parallel_scheduler]: 1.56002e-06 [auto_monad_reorder]: 1.737e-05 [get_jit_bprop_graph]: 1.77999e-06 [rewriter_after_jit_bprop_graph]: 4.02998e-06 [opt_after_jit_grad]: 0.00050067 [validate]: 3.791e-05 Sums bootstrap : 0.000434s : 4.60% type_inference : 0.004842s : 51.29% event_method : 0.000012s : 0.13% auto_monad : 0.000056s : 0.59% graph_reusing : 0.000006s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000027s : 0.29% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000003s : 0.03% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000019s : 0.20% optimize.rewriter_before_opt_a : 0.000044s : 0.46% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.35% optimize.opt_a.loop_unroll : 0.000020s : 0.21% optimize.opt_a.a_1 : 0.000457s : 4.84% optimize.opt_a.with_stream_mark : 0.000029s : 0.31% optimize.opt_a.recompute_prepare : 0.000016s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000023s : 0.24% optimize.opt_a.a_2 : 0.000178s : 1.88% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.15% optimize.opt_a.shard : 0.000003s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.13% optimize.opt_a.merge_send_recv : 0.000015s : 0.16% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.25% optimize.opt_a.flash_sp : 0.000013s : 0.14% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000020s : 0.22% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000549s : 5.81% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.08% optimize.opt_a.auto_monad_grad : 0.000003s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.27% optimize.opt_a.cse : 0.000048s : 0.51% optimize.opt_a.a_3 : 0.000084s : 0.89% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000037s : 0.39% optimize.convert_after_rewriter : 0.000007s : 0.08% optimize.order_py_execute_after_rewriter : 0.000005s : 0.06% optimize.mutable_eliminate : 0.000526s : 5.57% optimize.opt_b.b_1 : 0.000126s : 1.34% optimize.opt_b.b_2 : 0.000008s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000027s : 0.28% optimize.loop_unroll : 0.000458s : 4.85% optimize.opt_after_cconv.c_1 : 0.000027s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000021s : 0.22% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000015s : 0.16% optimize.tuple_transform.d_1 : 0.000040s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000048s : 0.50% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.02% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.03% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000020s : 0.22% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000017s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000501s : 5.30% validate : 0.000038s : 0.40% Time group info: ------[substitution.] 0.000180 23 39.17% : 0.000071s : 4: substitution.arithmetic_simplify 0.99% : 0.000002s : 2: substitution.elim_not_effective 0.72% : 0.000001s : 2: substitution.fold_const_symbol 3.15% : 0.000006s : 3: substitution.graph_param_transform 49.96% : 0.000090s : 2: substitution.inline 1.85% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.36% : 0.000004s : 4: substitution.remove_not_recompute_node 1.80% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004788 2 90.30% : 0.004324s : 1: type_inference.infer 9.70% : 0.000464s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000088 2 100.00% : 0.000088s : 2: match.inline ------[predicate.] 0.000137 754 0.78% : 0.000001s : 7: predicate.accumulaten_eliminater 1.34% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.70% : 0.000001s : 6: predicate.addn_check_dump 0.87% : 0.000001s : 7: predicate.addn_zero_filter 0.67% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 3.21% : 0.000004s : 13: predicate.arithmetic_simplify 0.74% : 0.000001s : 7: predicate.cast_eliminate 0.72% : 0.000001s : 6: predicate.check_bprop_eliminate 0.79% : 0.000001s : 6: predicate.compare_switch_simplify 0.25% : 0.000000s : 3: predicate.const_output_eliminate 0.73% : 0.000001s : 6: predicate.depend_value_elim 0.73% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.87% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.90% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.48% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 3: predicate.elim_not_effective 0.62% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000002s : 10: predicate.environ_add_const_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.94% : 0.000001s : 10: predicate.environ_get_depend_swap 1.79% : 0.000002s : 16: predicate.environ_get_eliminate 1.00% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.90% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.01% : 0.000003s : 9: predicate.float_depend_g_call 0.63% : 0.000001s : 6: predicate.float_environ_get_switch 0.92% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 3: predicate.fold_const_symbol 0.76% : 0.000001s : 6: predicate.get_grad_eliminate 0.28% : 0.000000s : 3: predicate.graph_param_transform 0.75% : 0.000001s : 6: predicate.incorporate_call 0.62% : 0.000001s : 6: predicate.incorporate_call_switch 6.52% : 0.000009s : 34: predicate.inline 1.08% : 0.000001s : 6: predicate.inline_without_move 0.55% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.32% : 0.000002s : 6: predicate.less_batch_normalization 1.65% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.07% : 0.000003s : 20: predicate.load_eliminater 1.53% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.56% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.73% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.65% : 0.000001s : 6: predicate.merge_addn 0.73% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.65% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 7: predicate.minmaximum_grad 1.51% : 0.000002s : 3: predicate.mutable_eliminate 0.39% : 0.000001s : 3: predicate.opt_reshape 0.47% : 0.000001s : 3: predicate.parallel_virtual_node 1.58% : 0.000002s : 9: predicate.partial_defer_inline 1.22% : 0.000002s : 10: predicate.partial_eliminate 0.74% : 0.000001s : 7: predicate.print_const_string_wrapper 0.72% : 0.000001s : 6: predicate.reduce_all_const_elim 1.04% : 0.000001s : 7: predicate.reduce_eliminate 1.96% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.64% : 0.000001s : 6: predicate.remove_not_recompute_node 1.16% : 0.000002s : 13: predicate.replace_applicator 0.78% : 0.000001s : 6: predicate.replace_old_param 0.37% : 0.000001s : 3: predicate.reset_defer_inline 0.76% : 0.000001s : 7: predicate.reshape_eliminate 0.68% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 3: predicate.row_tensor_eliminate 1.00% : 0.000001s : 6: predicate.same_eliminate 0.52% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.03% : 0.000001s : 6: predicate.shard_identity_eliminate 0.92% : 0.000001s : 6: predicate.special_op_eliminate 0.87% : 0.000001s : 6: predicate.specialize_transform 1.11% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.02% : 0.000001s : 9: predicate.switch_defer_inline 1.68% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.48% : 0.000006s : 32: predicate.switch_simplify 0.71% : 0.000001s : 7: predicate.tile_eliminate 0.74% : 0.000001s : 7: predicate.transpose_eliminate 1.46% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.39% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.32% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.69% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.46% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.94% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.75% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.67% : 0.000001s : 3: predicate.value_based_eliminate 0.83% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.81% : 0.000001s : 6: predicate.virtual_output_eliminate 0.35% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.60% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000222 5 8.39% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.61% : 0.000204s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022608 192 0.02% : 0.000004s : 1: ForceFp32Comm 13.96% : 0.003155s : 1: add_attr 13.91% : 0.003144s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000052s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000061s : 1: auto_monad 0.09% : 0.000021s : 1: auto_monad_reorder 0.02% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.06% : 0.000465s : 1: bootstrap 0.13% : 0.000030s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000016s : 1: control_data_broadcast_order 0.05% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000006s : 1: dataset_repeat_opt 0.03% : 0.000006s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.08% : 0.000019s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.03% : 0.000006s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 2.07% : 0.000467s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.37% : 0.000535s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 3.76% : 0.000849s : 78: opt.transform.opt_a 0.11% : 0.000026s : 1: opt.transform.opt_after_cconv 0.12% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.44% : 0.000100s : 28: opt.transform.opt_b 0.20% : 0.000044s : 2: opt.transform.opt_trans_graph 0.16% : 0.000037s : 4: opt.transform.symbol_engine_opt 10.16% : 0.002298s : 1: opt_a 0.47% : 0.000107s : 1: opt_after_cconv 2.26% : 0.000511s : 1: opt_after_jit_grad 0.99% : 0.000223s : 1: opt_b 19.18% : 0.004335s : 1: optimize 0.09% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.11% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000031s : 1: pre_auto_parallel 0.10% : 0.000023s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000018s : 1: remove_dup_value 1.36% : 0.000307s : 1: renormalize.infer 1.04% : 0.000234s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000042s : 1: rewriter_after_opt_a 0.21% : 0.000047s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000082s : 1: symbol_engine_optimizer 0.33% : 0.000075s : 1: tuple_transform 21.50% : 0.004860s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:44.604.562 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:44.604.816 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0162946, [21] [bootstrap]: 0.00048122 [type_inference]: 0.00531236 [event_method]: 1.312e-05 [auto_monad]: 5.808e-05 [graph_reusing]: 5.62999e-06 [inline]: 2.84001e-06 [add_attr]: 0.00350377, [1] [add_attr_with_inline]: 0.00349259, [1] [Cycle 1]: 8.344e-05, [2] [tag_attr]: 1.578e-05 [meta_addattr_fg_expand]: 3.68e-06 [parallel-infer-symbol]: 3.71999e-06 [pre_auto_parallel]: 3.154e-05 [insert-virtual-dataset]: 2.57001e-06 [parallel-infer-symbol-second]: 8.30012e-07 [dataset_repeat_opt]: 1.92001e-06 [pipeline_split]: 2.24001e-06 [optimize]: 0.00560375, [53] [py_interpret_to_execute]: 2.6e-05 [rewriter_before_opt_a]: 5.667e-05 [opt_a]: 0.00304871, [2] [Cycle 1]: 0.00206141, [45] [expand_dump_flag]: 2.48e-06 [switch_simplify]: 2.684e-05 [loop_unroll]: 1.452e-05 [a_1]: 0.00035809 [with_stream_mark]: 2.115e-05 [recompute_prepare]: 1.18e-05 [updatestate_depend_eliminate]: 4.65999e-06 [updatestate_assign_eliminate]: 4.05e-06 [updatestate_loads_eliminate]: 3.09001e-06 [parameter_eliminate]: 2.24001e-06 [a_2]: 0.0001289 [accelerated_algorithm]: 8.41002e-06 [shard]: 2.05002e-06 [meta_shard_fg_expand]: 1.82001e-06 [shard_inline]: 7.38999e-06 [merge_send_recv]: 9.09e-06 [auto_parallel]: 8.92999e-06 [parallel]: 1.985e-05 [flash_sp]: 1.045e-05 [merge_comm]: 5.09e-06 [allreduce_fusion]: 3.28998e-06 [matmul_add_comm_reduction]: 1.01e-05 [allreduce_slice_to_reducescatter]: 8.09989e-07 [virtual_shard_identity]: 1.173e-05 [virtual_dataset]: 7.13e-06 [get_grad_eliminate_]: 6.54001e-06 [virtual_output]: 7.21999e-06 [merge_forward]: 4.85001e-06 [cell_reuse_recompute_pass]: 1.39998e-06 [offload_activation]: 1.082e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.721e-05 [merge_recompute_call_nodes]: 1.65001e-06 [before_grad]: 1.241e-05 [set_forward_comm_id_for_comm_node_pass]: 5.00999e-06 [meta_fg_expand]: 2.71999e-06 [flash_sp_send_recv_attached]: 3.14999e-06 [receive_attached]: 2.74001e-06 [after_resolve]: 1.191e-05 [a_after_grad]: 9.96e-06 [renormalize]: 0.00065848 [add_forward_monad_depend]: 8.27e-06 [auto_monad_grad]: 2.66999e-06 [auto_monad_eliminator]: 1.956e-05 [cse]: 3.282e-05 [a_3]: 6.769e-05 [Cycle 2]: 0.00097031, [45] [expand_dump_flag]: 1.56998e-06 [switch_simplify]: 8.74e-06 [loop_unroll]: 6.44001e-06 [a_1]: 0.00014568 [with_stream_mark]: 1.927e-05 [recompute_prepare]: 9.74999e-06 [updatestate_depend_eliminate]: 3.78001e-06 [updatestate_assign_eliminate]: 3.16999e-06 [updatestate_loads_eliminate]: 3.04999e-06 [parameter_eliminate]: 2.05002e-06 [a_2]: 0.00011482 [accelerated_algorithm]: 8.34002e-06 [shard]: 1.76003e-06 [meta_shard_fg_expand]: 1.92999e-06 [shard_inline]: 7.31999e-06 [merge_send_recv]: 7.82e-06 [auto_parallel]: 8.67e-06 [parallel]: 7.25998e-06 [flash_sp]: 4.05e-06 [merge_comm]: 3.80998e-06 [allreduce_fusion]: 3.31001e-06 [matmul_add_comm_reduction]: 7.89002e-06 [allreduce_slice_to_reducescatter]: 5.10016e-07 [virtual_shard_identity]: 9.84001e-06 [virtual_dataset]: 5.85002e-06 [get_grad_eliminate_]: 5.74999e-06 [virtual_output]: 6.50002e-06 [merge_forward]: 5.00999e-06 [cell_reuse_recompute_pass]: 2.10002e-06 [offload_activation]: 9.67001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.728e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.147e-05 [set_forward_comm_id_for_comm_node_pass]: 5.23002e-06 [meta_fg_expand]: 2.83998e-06 [flash_sp_send_recv_attached]: 1.64998e-06 [receive_attached]: 1.57001e-06 [after_resolve]: 1.128e-05 [a_after_grad]: 8.80001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 3.79002e-06 [auto_monad_grad]: 2.17999e-06 [auto_monad_eliminator]: 1.337e-05 [cse]: 2.316e-05 [a_3]: 5.193e-05 [py_interpret_to_execute_after_opt_a]: 1.753e-05 [slice_cell_reuse_recomputed_activation]: 5.12e-06 [rewriter_after_opt_a]: 4.936e-05 [convert_after_rewriter]: 1.085e-05 [order_py_execute_after_rewriter]: 9.04998e-06 [mutable_eliminate]: 0.00062639 [opt_b]: 0.00030424, [1] [Cycle 1]: 0.00029171, [7] [b_1]: 0.00017201 [b_2]: 9.74999e-06 [updatestate_depend_eliminate]: 9.76998e-06 [updatestate_assign_eliminate]: 3.08998e-06 [updatestate_loads_eliminate]: 2.72001e-06 [renormalize]: 8.80013e-07 [cse]: 2.677e-05 [optimize_parallel_all_gather_comm]: 2.38e-05 [overlap_param_gather]: 4.84998e-06 [cconv]: 3.738e-05 [loop_unroll]: 0.00048975 [opt_after_cconv]: 0.00014365, [1] [Cycle 1]: 0.00013328, [7] [c_1]: 2.947e-05 [parameter_eliminate]: 5.27001e-06 [updatestate_depend_eliminate]: 7.47002e-06 [updatestate_assign_eliminate]: 2.83e-06 [updatestate_loads_eliminate]: 3.11001e-06 [cse]: 2.431e-05 [renormalize]: 5.00004e-07 [remove_dup_value]: 1.976e-05 [tuple_transform]: 9.535e-05, [1] [Cycle 1]: 8.778e-05, [4] [d_1]: 4.431e-05 [none_parameter_eliminate]: 1.94e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 8.57e-06 [partial_unused_args_eliminate]: 4.92e-06 [add_recomputation]: 5.488e-05 [cse_after_recomputation]: 3.3e-05, [1] [Cycle 1]: 2.524e-05, [1] [cse]: 1.447e-05 [environ_conv]: 9.29e-06 [swap_dp_allreduce_reducescatter]: 9.24e-06 [bias_add_comm_swap]: 5.61e-06 [label_micro_interleaved_index]: 7.35e-06 [label_fine_grained_interleaved_index]: 5.93998e-06 [merge_cast_opt]: 4.04002e-06 [slice_recompute_activation]: 4.85001e-06 [micro_interleaved_order_control]: 4.87e-06 [assign_add_opt]: 3.83001e-06 [ForceFp32Comm]: 3.33998e-06 [remove_cast_before_assign_add]: 3.33e-06 [full_micro_interleaved_order_control]: 4.57e-06 [reorder_send_recv_between_fp_bp]: 5.05999e-06 [comm_op_add_attrs]: 3.66001e-06 [add_comm_op_reuse_tag]: 3.89002e-06 [interleave_split_concat_branches]: 3.52997e-06 [interleave_parallel_branches]: 3.7e-06 [overlap_opt_shard_in_pipeline]: 3.50998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.48001e-06 [control_data_broadcast_order]: 1.777e-05 [grouped_pairwise_exchange_alltoall]: 4.47e-06 [offloading_packed_experts]: 6.94001e-06 [overlap_recompute_and_grad_model_parallel]: 7.92e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.5e-06 [overlap_recompute_allgather_and_fa_grad]: 3.86999e-06 [overlap_recompute_comm]: 5.49998e-06 [overlap_grad_ring_attention]: 7.40998e-06 [overlap_grad_flash_sp]: 2.602e-05 [begin_end_overlap_inline]: 3.07002e-06 [split_matmul_comm_elemetwise]: 4.80001e-06 [split_layernorm_comm]: 4.27e-06 [handle_group_info]: 3.36999e-06 [symbol_engine_optimizer]: 0.00011179, [1] [Cycle 1]: 0.00010385, [6] [build]: 4.1e-06 [elim_shapecalc]: 1.345e-05 [elim_not_effective]: 1.54e-05 [opt_reshape]: 8.03999e-06 [fold_const_symbol]: 1.022e-05 [renormalize]: 5.89993e-07 [detach_backward]: 4.72e-06 [pipeline_parallel_scheduler]: 2.10002e-06 [auto_monad_reorder]: 2.042e-05 [get_jit_bprop_graph]: 1.96e-06 [rewriter_after_jit_bprop_graph]: 5.25999e-06 [opt_after_jit_grad]: 0.00054404 [validate]: 4.373e-05 Sums bootstrap : 0.000481s : 4.47% type_inference : 0.005312s : 49.38% event_method : 0.000013s : 0.12% auto_monad : 0.000058s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.03% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000032s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000026s : 0.24% optimize.rewriter_before_opt_a : 0.000057s : 0.53% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000036s : 0.33% optimize.opt_a.loop_unroll : 0.000021s : 0.19% optimize.opt_a.a_1 : 0.000504s : 4.68% optimize.opt_a.with_stream_mark : 0.000040s : 0.38% optimize.opt_a.recompute_prepare : 0.000022s : 0.20% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000244s : 2.27% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.16% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.14% optimize.opt_a.merge_send_recv : 0.000017s : 0.16% optimize.opt_a.auto_parallel : 0.000018s : 0.16% optimize.opt_a.parallel : 0.000027s : 0.25% optimize.opt_a.flash_sp : 0.000014s : 0.13% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.20% optimize.opt_a.virtual_dataset : 0.000013s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000010s : 0.09% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000024s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.10% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000023s : 0.22% optimize.opt_a.a_after_grad : 0.000019s : 0.17% optimize.opt_a.renormalize : 0.000659s : 6.12% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.11% optimize.opt_a.auto_monad_grad : 0.000005s : 0.05% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.31% optimize.opt_a.cse : 0.000056s : 0.52% optimize.opt_a.a_3 : 0.000120s : 1.11% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.16% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000049s : 0.46% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000626s : 5.82% optimize.opt_b.b_1 : 0.000172s : 1.60% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000027s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.22% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000037s : 0.35% optimize.loop_unroll : 0.000490s : 4.55% optimize.opt_after_cconv.c_1 : 0.000029s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.23% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.18% optimize.tuple_transform.d_1 : 0.000044s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000055s : 0.51% optimize.cse_after_recomputation.cse : 0.000014s : 0.13% optimize.environ_conv : 0.000009s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.09% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.06% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.04% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000026s : 0.24% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.13% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.01% detach_backward : 0.000005s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000544s : 5.06% validate : 0.000044s : 0.41% Time group info: ------[substitution.] 0.000221 23 41.63% : 0.000092s : 4: substitution.arithmetic_simplify 0.96% : 0.000002s : 2: substitution.elim_not_effective 0.61% : 0.000001s : 2: substitution.fold_const_symbol 2.68% : 0.000006s : 3: substitution.graph_param_transform 47.48% : 0.000105s : 2: substitution.inline 2.12% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.22% : 0.000005s : 4: substitution.remove_not_recompute_node 2.30% : 0.000005s : 2: substitution.replace_old_param ------[type_inference.] 0.005254 2 90.40% : 0.004749s : 1: type_inference.infer 9.60% : 0.000505s : 1: type_inference.specialize ------[replace.] 0.000024 2 100.00% : 0.000024s : 2: replace.inline ------[match.] 0.000103 2 100.00% : 0.000103s : 2: match.inline ------[predicate.] 0.000147 754 0.78% : 0.000001s : 7: predicate.accumulaten_eliminater 1.20% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.60% : 0.000001s : 6: predicate.addn_check_dump 0.79% : 0.000001s : 7: predicate.addn_zero_filter 0.67% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 3.22% : 0.000005s : 13: predicate.arithmetic_simplify 0.93% : 0.000001s : 7: predicate.cast_eliminate 0.72% : 0.000001s : 6: predicate.check_bprop_eliminate 0.65% : 0.000001s : 6: predicate.compare_switch_simplify 0.20% : 0.000000s : 3: predicate.const_output_eliminate 0.69% : 0.000001s : 6: predicate.depend_value_elim 0.67% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.90% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.71% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.36% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 3: predicate.elim_not_effective 0.58% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000002s : 10: predicate.environ_add_const_eliminate 1.00% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.93% : 0.000001s : 10: predicate.environ_get_depend_swap 1.67% : 0.000002s : 16: predicate.environ_get_eliminate 0.92% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.83% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.05% : 0.000003s : 9: predicate.float_depend_g_call 0.60% : 0.000001s : 6: predicate.float_environ_get_switch 0.90% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 3: predicate.fold_const_symbol 0.68% : 0.000001s : 6: predicate.get_grad_eliminate 0.22% : 0.000000s : 3: predicate.graph_param_transform 0.75% : 0.000001s : 6: predicate.incorporate_call 0.58% : 0.000001s : 6: predicate.incorporate_call_switch 6.83% : 0.000010s : 34: predicate.inline 1.08% : 0.000002s : 6: predicate.inline_without_move 0.40% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.28% : 0.000002s : 6: predicate.less_batch_normalization 1.43% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.85% : 0.000003s : 20: predicate.load_eliminater 1.52% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.62% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.78% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.66% : 0.000001s : 6: predicate.merge_addn 0.94% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.71% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.66% : 0.000001s : 7: predicate.minmaximum_grad 1.82% : 0.000003s : 3: predicate.mutable_eliminate 0.37% : 0.000001s : 3: predicate.opt_reshape 0.40% : 0.000001s : 3: predicate.parallel_virtual_node 1.26% : 0.000002s : 9: predicate.partial_defer_inline 1.13% : 0.000002s : 10: predicate.partial_eliminate 0.90% : 0.000001s : 7: predicate.print_const_string_wrapper 0.68% : 0.000001s : 6: predicate.reduce_all_const_elim 0.93% : 0.000001s : 7: predicate.reduce_eliminate 1.95% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.57% : 0.000001s : 6: predicate.remove_not_recompute_node 1.05% : 0.000002s : 13: predicate.replace_applicator 0.62% : 0.000001s : 6: predicate.replace_old_param 0.47% : 0.000001s : 3: predicate.reset_defer_inline 0.78% : 0.000001s : 7: predicate.reshape_eliminate 0.86% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.64% : 0.000001s : 3: predicate.row_tensor_eliminate 1.15% : 0.000002s : 6: predicate.same_eliminate 0.71% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.38% : 0.000002s : 6: predicate.shard_identity_eliminate 0.92% : 0.000001s : 6: predicate.special_op_eliminate 1.18% : 0.000002s : 6: predicate.specialize_transform 1.24% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.92% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.93% : 0.000001s : 9: predicate.switch_defer_inline 1.73% : 0.000003s : 15: predicate.switch_layer_defer_inline 4.62% : 0.000007s : 32: predicate.switch_simplify 0.73% : 0.000001s : 7: predicate.tile_eliminate 0.73% : 0.000001s : 7: predicate.transpose_eliminate 1.38% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.23% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.32% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.38% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.53% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.55% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.78% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.68% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.48% : 0.000001s : 3: predicate.value_based_eliminate 1.09% : 0.000002s : 6: predicate.virtual_dataset_eliminate 0.92% : 0.000001s : 6: predicate.virtual_output_eliminate 0.28% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.55% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000261 5 8.07% : 0.000021s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.93% : 0.000240s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027143 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.95% : 0.003514s : 1: add_attr 12.88% : 0.003497s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.22% : 0.000060s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.25% : 0.000068s : 1: auto_monad 0.10% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.95% : 0.000530s : 1: bootstrap 0.15% : 0.000041s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000022s : 1: control_data_broadcast_order 0.06% : 0.000016s : 1: convert_after_rewriter 0.13% : 0.000036s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.14% : 0.000037s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.09% : 0.000024s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.03% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.83% : 0.000497s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.34% : 0.000635s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.07% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000022s : 1: opt.transform.mutable_eliminate 3.48% : 0.000944s : 78: opt.transform.opt_a 0.10% : 0.000028s : 1: opt.transform.opt_after_cconv 0.11% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.37% : 0.000100s : 28: opt.transform.opt_b 0.18% : 0.000049s : 2: opt.transform.opt_trans_graph 0.15% : 0.000042s : 4: opt.transform.symbol_engine_opt 11.25% : 0.003053s : 1: opt_a 0.54% : 0.000148s : 1: opt_after_cconv 2.06% : 0.000558s : 1: opt_after_jit_grad 1.14% : 0.000309s : 1: opt_b 22.02% : 0.005978s : 1: optimize 0.11% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000030s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000040s : 1: pre_auto_parallel 0.11% : 0.000029s : 1: py_interpret_to_execute 0.08% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000023s : 1: remove_dup_value 1.33% : 0.000360s : 1: renormalize.infer 1.06% : 0.000287s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000054s : 1: rewriter_after_opt_a 0.23% : 0.000062s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.05% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000115s : 1: symbol_engine_optimizer 0.36% : 0.000098s : 1: tuple_transform 19.70% : 0.005346s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:44.809.555 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0170521, [21] [bootstrap]: 0.00046592 [type_inference]: 0.00557222 [event_method]: 1.465e-05 [auto_monad]: 5.974e-05 [graph_reusing]: 5.56e-06 [inline]: 3.00002e-06 [add_attr]: 0.00402684, [1] [add_attr_with_inline]: 0.00401273, [1] [Cycle 1]: 7.144e-05, [2] [tag_attr]: 1.756e-05 [meta_addattr_fg_expand]: 4.20999e-06 [parallel-infer-symbol]: 3.68e-06 [pre_auto_parallel]: 3.29e-05 [insert-virtual-dataset]: 2.61e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.11e-06 [pipeline_split]: 1.84e-06 [optimize]: 0.00579134, [53] [py_interpret_to_execute]: 2.615e-05 [rewriter_before_opt_a]: 5.703e-05 [opt_a]: 0.0028323, [2] [Cycle 1]: 0.00198758, [45] [expand_dump_flag]: 3.01999e-06 [switch_simplify]: 2.896e-05 [loop_unroll]: 1.521e-05 [a_1]: 0.00038261 [with_stream_mark]: 2.356e-05 [recompute_prepare]: 6.066e-05 [updatestate_depend_eliminate]: 5.30001e-06 [updatestate_assign_eliminate]: 3.09001e-06 [updatestate_loads_eliminate]: 3.13e-06 [parameter_eliminate]: 2.81e-06 [a_2]: 0.00010548 [accelerated_algorithm]: 8.75999e-06 [shard]: 3.42002e-06 [meta_shard_fg_expand]: 2.16e-06 [shard_inline]: 6.89001e-06 [merge_send_recv]: 9.02e-06 [auto_parallel]: 8.20999e-06 [parallel]: 2.047e-05 [flash_sp]: 1.032e-05 [merge_comm]: 4.37e-06 [allreduce_fusion]: 3.76999e-06 [matmul_add_comm_reduction]: 1.136e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 1.035e-05 [virtual_dataset]: 6.94999e-06 [get_grad_eliminate_]: 7.2e-06 [virtual_output]: 6.46999e-06 [merge_forward]: 4.36002e-06 [cell_reuse_recompute_pass]: 1.66998e-06 [offload_activation]: 9.56998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.48e-05 [merge_recompute_call_nodes]: 1.75001e-06 [before_grad]: 1.234e-05 [set_forward_comm_id_for_comm_node_pass]: 3.95998e-06 [meta_fg_expand]: 2.69001e-06 [flash_sp_send_recv_attached]: 2.70997e-06 [receive_attached]: 2.98e-06 [after_resolve]: 1.319e-05 [a_after_grad]: 1.003e-05 [renormalize]: 0.00073389 [add_forward_monad_depend]: 7.33e-06 [auto_monad_grad]: 3.08e-06 [auto_monad_eliminator]: 1.977e-05 [cse]: 3.53e-05 [a_3]: 5.707e-05 [Cycle 2]: 0.00083158, [45] [expand_dump_flag]: 2.26e-06 [switch_simplify]: 8.32e-06 [loop_unroll]: 6.24001e-06 [a_1]: 0.00015598 [with_stream_mark]: 2.016e-05 [recompute_prepare]: 7.6e-06 [updatestate_depend_eliminate]: 4.05998e-06 [updatestate_assign_eliminate]: 2.93e-06 [updatestate_loads_eliminate]: 3.65e-06 [parameter_eliminate]: 1.96998e-06 [a_2]: 9.068e-05 [accelerated_algorithm]: 8.14002e-06 [shard]: 1.99e-06 [meta_shard_fg_expand]: 2.41e-06 [shard_inline]: 7.01999e-06 [merge_send_recv]: 9.32999e-06 [auto_parallel]: 9.89999e-06 [parallel]: 1.024e-05 [flash_sp]: 4.97e-06 [merge_comm]: 3.7e-06 [allreduce_fusion]: 3.33998e-06 [matmul_add_comm_reduction]: 1.051e-05 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 8.69e-06 [virtual_dataset]: 6.27001e-06 [get_grad_eliminate_]: 6.54001e-06 [virtual_output]: 6.06e-06 [merge_forward]: 4.33999e-06 [cell_reuse_recompute_pass]: 3.04999e-06 [offload_activation]: 1.168e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.548e-05 [merge_recompute_call_nodes]: 1.64998e-06 [before_grad]: 1.138e-05 [set_forward_comm_id_for_comm_node_pass]: 5.19998e-06 [meta_fg_expand]: 3.51999e-06 [flash_sp_send_recv_attached]: 1.81003e-06 [receive_attached]: 2.31e-06 [after_resolve]: 1.341e-05 [a_after_grad]: 8.81997e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 4.20999e-06 [auto_monad_grad]: 2.41e-06 [auto_monad_eliminator]: 1.344e-05 [cse]: 2.994e-05 [a_3]: 4.105e-05 [py_interpret_to_execute_after_opt_a]: 1.938e-05 [slice_cell_reuse_recomputed_activation]: 2.16e-06 [rewriter_after_opt_a]: 4.741e-05 [convert_after_rewriter]: 6.99001e-06 [order_py_execute_after_rewriter]: 5.34998e-06 [mutable_eliminate]: 0.0009107 [opt_b]: 0.00026258, [1] [Cycle 1]: 0.00025404, [7] [b_1]: 0.00014104 [b_2]: 9.99999e-06 [updatestate_depend_eliminate]: 1.105e-05 [updatestate_assign_eliminate]: 2.89001e-06 [updatestate_loads_eliminate]: 4.22998e-06 [renormalize]: 1.29e-06 [cse]: 3.946e-05 [optimize_parallel_all_gather_comm]: 2.375e-05 [overlap_param_gather]: 2.71999e-06 [cconv]: 4.124e-05 [loop_unroll]: 0.00073117 [opt_after_cconv]: 0.00013902, [1] [Cycle 1]: 0.00013132, [7] [c_1]: 3.285e-05 [parameter_eliminate]: 6.89999e-06 [updatestate_depend_eliminate]: 1.009e-05 [updatestate_assign_eliminate]: 2.85002e-06 [updatestate_loads_eliminate]: 4.50001e-06 [cse]: 3.501e-05 [renormalize]: 1.21002e-06 [remove_dup_value]: 1.797e-05 [tuple_transform]: 8.727e-05, [1] [Cycle 1]: 8.121e-05, [4] [d_1]: 4.981e-05 [none_parameter_eliminate]: 1.91998e-06 [renormalize]: 4.30009e-07 [switch_simplify]: 7.62002e-06 [partial_unused_args_eliminate]: 2.21e-06 [add_recomputation]: 6.26e-05 [cse_after_recomputation]: 2.921e-05, [1] [Cycle 1]: 2.391e-05, [1] [cse]: 1.498e-05 [environ_conv]: 6.82002e-06 [swap_dp_allreduce_reducescatter]: 5.91e-06 [bias_add_comm_swap]: 5.39e-06 [label_micro_interleaved_index]: 5.70001e-06 [label_fine_grained_interleaved_index]: 3.05002e-06 [merge_cast_opt]: 1.59e-06 [slice_recompute_activation]: 2.25002e-06 [micro_interleaved_order_control]: 2.27001e-06 [assign_add_opt]: 1.43002e-06 [ForceFp32Comm]: 9.80013e-07 [remove_cast_before_assign_add]: 1.31002e-06 [full_micro_interleaved_order_control]: 2.58003e-06 [reorder_send_recv_between_fp_bp]: 2.69999e-06 [comm_op_add_attrs]: 1.06997e-06 [add_comm_op_reuse_tag]: 1.00999e-06 [interleave_split_concat_branches]: 1.18001e-06 [interleave_parallel_branches]: 1.09998e-06 [overlap_opt_shard_in_pipeline]: 1.23002e-06 [overlap_opt_shard_grad_in_pipeline]: 2.54001e-06 [control_data_broadcast_order]: 1.601e-05 [grouped_pairwise_exchange_alltoall]: 1.76e-06 [offloading_packed_experts]: 5.18002e-06 [overlap_recompute_and_grad_model_parallel]: 5.81e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.25999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.36002e-06 [overlap_recompute_comm]: 2.36e-06 [overlap_grad_ring_attention]: 4.77998e-06 [overlap_grad_flash_sp]: 2.405e-05 [begin_end_overlap_inline]: 8.2e-07 [split_matmul_comm_elemetwise]: 2.64001e-06 [split_layernorm_comm]: 1.94e-06 [handle_group_info]: 1.59e-06 [symbol_engine_optimizer]: 9.879e-05, [1] [Cycle 1]: 9.271e-05, [6] [build]: 3.48e-06 [elim_shapecalc]: 1.703e-05 [elim_not_effective]: 1.524e-05 [opt_reshape]: 8.57e-06 [fold_const_symbol]: 1.106e-05 [renormalize]: 1.69995e-07 [detach_backward]: 2.71e-06 [pipeline_parallel_scheduler]: 1.96e-06 [auto_monad_reorder]: 2.29e-05 [get_jit_bprop_graph]: 2.63e-06 [rewriter_after_jit_bprop_graph]: 7.77e-06 [opt_after_jit_grad]: 0.00079007 [validate]: 5.394e-05 Sums bootstrap : 0.000466s : 3.93% type_inference : 0.005572s : 47.03% event_method : 0.000015s : 0.12% auto_monad : 0.000060s : 0.50% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000033s : 0.28% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000026s : 0.22% optimize.rewriter_before_opt_a : 0.000057s : 0.48% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000037s : 0.31% optimize.opt_a.loop_unroll : 0.000021s : 0.18% optimize.opt_a.a_1 : 0.000539s : 4.55% optimize.opt_a.with_stream_mark : 0.000044s : 0.37% optimize.opt_a.recompute_prepare : 0.000068s : 0.58% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000005s : 0.04% optimize.opt_a.a_2 : 0.000196s : 1.66% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.14% optimize.opt_a.shard : 0.000005s : 0.05% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000014s : 0.12% optimize.opt_a.merge_send_recv : 0.000018s : 0.15% optimize.opt_a.auto_parallel : 0.000018s : 0.15% optimize.opt_a.parallel : 0.000031s : 0.26% optimize.opt_a.flash_sp : 0.000015s : 0.13% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.16% optimize.opt_a.virtual_dataset : 0.000013s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.12% optimize.opt_a.virtual_output : 0.000013s : 0.11% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.04% optimize.opt_a.offload_activation : 0.000021s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000024s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000027s : 0.22% optimize.opt_a.a_after_grad : 0.000019s : 0.16% optimize.opt_a.renormalize : 0.000734s : 6.20% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.10% optimize.opt_a.auto_monad_grad : 0.000005s : 0.05% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.28% optimize.opt_a.cse : 0.000065s : 0.55% optimize.opt_a.a_3 : 0.000098s : 0.83% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.16% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000047s : 0.40% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000911s : 7.69% optimize.opt_b.b_1 : 0.000141s : 1.19% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.09% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.04% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000039s : 0.33% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.20% optimize.overlap_param_gather : 0.000003s : 0.02% optimize.cconv : 0.000041s : 0.35% optimize.loop_unroll : 0.000731s : 6.17% optimize.opt_after_cconv.c_1 : 0.000033s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.cse : 0.000035s : 0.30% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.15% optimize.tuple_transform.d_1 : 0.000050s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000063s : 0.53% optimize.cse_after_recomputation.cse : 0.000015s : 0.13% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000006s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000024s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000002s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.14% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.19% get_jit_bprop_graph : 0.000003s : 0.02% rewriter_after_jit_bprop_graph : 0.000008s : 0.07% opt_after_jit_grad : 0.000790s : 6.67% validate : 0.000054s : 0.46% Time group info: ------[substitution.] 0.000245 23 40.20% : 0.000099s : 4: substitution.arithmetic_simplify 0.95% : 0.000002s : 2: substitution.elim_not_effective 0.71% : 0.000002s : 2: substitution.fold_const_symbol 2.59% : 0.000006s : 3: substitution.graph_param_transform 48.60% : 0.000119s : 2: substitution.inline 2.21% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.33% : 0.000006s : 4: substitution.remove_not_recompute_node 2.41% : 0.000006s : 2: substitution.replace_old_param ------[type_inference.] 0.005519 2 91.30% : 0.005039s : 1: type_inference.infer 8.70% : 0.000480s : 1: type_inference.specialize ------[replace.] 0.000026 2 100.00% : 0.000026s : 2: replace.inline ------[match.] 0.000117 2 100.00% : 0.000117s : 2: match.inline ------[predicate.] 0.000167 754 0.82% : 0.000001s : 7: predicate.accumulaten_eliminater 1.99% : 0.000003s : 3: predicate.ad_related_special_op_eliminate 0.67% : 0.000001s : 6: predicate.addn_check_dump 0.88% : 0.000001s : 7: predicate.addn_zero_filter 0.74% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 3.40% : 0.000006s : 13: predicate.arithmetic_simplify 0.94% : 0.000002s : 7: predicate.cast_eliminate 0.67% : 0.000001s : 6: predicate.check_bprop_eliminate 0.58% : 0.000001s : 6: predicate.compare_switch_simplify 0.18% : 0.000000s : 3: predicate.const_output_eliminate 0.71% : 0.000001s : 6: predicate.depend_value_elim 0.62% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.86% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.73% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.52% : 0.000003s : 6: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 3: predicate.elim_not_effective 0.43% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000002s : 10: predicate.environ_add_const_eliminate 0.82% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.83% : 0.000001s : 10: predicate.environ_get_depend_swap 1.61% : 0.000003s : 16: predicate.environ_get_eliminate 0.86% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.73% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.56% : 0.000003s : 9: predicate.float_depend_g_call 0.65% : 0.000001s : 6: predicate.float_environ_get_switch 0.81% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 3: predicate.fold_const_symbol 0.86% : 0.000001s : 6: predicate.get_grad_eliminate 0.29% : 0.000000s : 3: predicate.graph_param_transform 0.63% : 0.000001s : 6: predicate.incorporate_call 0.53% : 0.000001s : 6: predicate.incorporate_call_switch 6.10% : 0.000010s : 34: predicate.inline 0.95% : 0.000002s : 6: predicate.inline_without_move 0.32% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.37% : 0.000002s : 6: predicate.less_batch_normalization 1.57% : 0.000003s : 13: predicate.list_to_tuple_eliminator_ 1.75% : 0.000003s : 20: predicate.load_eliminater 2.17% : 0.000004s : 3: predicate.loop_unroll_after_grad 1.58% : 0.000003s : 14: predicate.loop_unroll_before_grad 1.55% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.60% : 0.000001s : 6: predicate.merge_addn 0.56% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.73% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.60% : 0.000001s : 7: predicate.minmaximum_grad 2.08% : 0.000003s : 3: predicate.mutable_eliminate 0.40% : 0.000001s : 3: predicate.opt_reshape 0.70% : 0.000001s : 3: predicate.parallel_virtual_node 1.21% : 0.000002s : 9: predicate.partial_defer_inline 0.94% : 0.000002s : 10: predicate.partial_eliminate 0.75% : 0.000001s : 7: predicate.print_const_string_wrapper 0.91% : 0.000002s : 6: predicate.reduce_all_const_elim 1.50% : 0.000003s : 7: predicate.reduce_eliminate 2.29% : 0.000004s : 20: predicate.redundant_stop_gradient_eliminater 0.74% : 0.000001s : 6: predicate.remove_not_recompute_node 1.08% : 0.000002s : 13: predicate.replace_applicator 0.58% : 0.000001s : 6: predicate.replace_old_param 0.40% : 0.000001s : 3: predicate.reset_defer_inline 0.78% : 0.000001s : 7: predicate.reshape_eliminate 0.95% : 0.000002s : 6: predicate.row_tensor_add_zeros_like 0.79% : 0.000001s : 3: predicate.row_tensor_eliminate 0.95% : 0.000002s : 6: predicate.same_eliminate 0.56% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.03% : 0.000002s : 6: predicate.shard_identity_eliminate 1.15% : 0.000002s : 6: predicate.special_op_eliminate 0.75% : 0.000001s : 6: predicate.specialize_transform 1.47% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 1.06% : 0.000002s : 6: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.86% : 0.000001s : 9: predicate.switch_defer_inline 1.34% : 0.000002s : 15: predicate.switch_layer_defer_inline 3.91% : 0.000007s : 32: predicate.switch_simplify 0.80% : 0.000001s : 7: predicate.tile_eliminate 0.81% : 0.000001s : 7: predicate.transpose_eliminate 1.63% : 0.000003s : 13: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000003s : 13: predicate.tuple_list_get_item_const_eliminator 1.30% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.73% : 0.000006s : 19: predicate.tuple_list_get_item_eliminator 1.27% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 3.05% : 0.000005s : 19: predicate.tuple_list_set_item_eliminator 1.67% : 0.000003s : 13: predicate.tuple_to_list_eliminator_ 1.69% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.39% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.50% : 0.000001s : 3: predicate.value_based_eliminate 0.96% : 0.000002s : 6: predicate.virtual_dataset_eliminate 0.86% : 0.000001s : 6: predicate.virtual_output_eliminate 0.35% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.58% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000258 5 9.43% : 0.000024s : 1: func_graph_cloner_run.FuncGraphClonerGraph 90.57% : 0.000234s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028808 192 0.01% : 0.000004s : 1: ForceFp32Comm 14.00% : 0.004034s : 1: add_attr 13.95% : 0.004017s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000068s : 1: add_recomputation 0.02% : 0.000005s : 1: assign_add_opt 0.23% : 0.000065s : 1: auto_monad 0.09% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.73% : 0.000499s : 1: bootstrap 0.16% : 0.000046s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000032s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.08% : 0.000022s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000007s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 2.59% : 0.000747s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 3.22% : 0.000927s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.10% : 0.000029s : 1: opt.transform.loop_unroll_optimizer 0.11% : 0.000031s : 1: opt.transform.mutable_eliminate 3.58% : 0.001032s : 78: opt.transform.opt_a 0.11% : 0.000031s : 1: opt.transform.opt_after_cconv 0.13% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.38% : 0.000111s : 28: opt.transform.opt_b 0.19% : 0.000055s : 2: opt.transform.opt_trans_graph 0.16% : 0.000046s : 4: opt.transform.symbol_engine_opt 9.85% : 0.002837s : 1: opt_a 0.50% : 0.000143s : 1: opt_after_cconv 2.80% : 0.000807s : 1: opt_after_jit_grad 0.93% : 0.000267s : 1: opt_b 20.13% : 0.005799s : 1: optimize 0.10% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.02% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000006s : 1: partial_unused_args_eliminate 0.02% : 0.000006s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.13% : 0.000038s : 1: pre_auto_parallel 0.10% : 0.000030s : 1: py_interpret_to_execute 0.09% : 0.000025s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.39% : 0.000400s : 1: renormalize.infer 1.12% : 0.000322s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000053s : 1: rewriter_after_opt_a 0.21% : 0.000061s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000101s : 1: symbol_engine_optimizer 0.31% : 0.000091s : 1: tuple_transform 19.43% : 0.005598s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:45.249.36 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:45.252.32 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0150103, [21] [bootstrap]: 0.00044901 [type_inference]: 0.00482696 [event_method]: 1.274e-05 [auto_monad]: 5.647e-05 [graph_reusing]: 5.61998e-06 [inline]: 2.78e-06 [add_attr]: 0.00325564, [1] [add_attr_with_inline]: 0.00324537, [1] [Cycle 1]: 7.337e-05, [2] [tag_attr]: 1.512e-05 [meta_addattr_fg_expand]: 3.31999e-06 [parallel-infer-symbol]: 3.75998e-06 [pre_auto_parallel]: 2.844e-05 [insert-virtual-dataset]: 2.40002e-06 [parallel-infer-symbol-second]: 6.69999e-07 [dataset_repeat_opt]: 2.11e-06 [pipeline_split]: 2.02001e-06 [optimize]: 0.00515724, [53] [py_interpret_to_execute]: 2.269e-05 [rewriter_before_opt_a]: 5.083e-05 [opt_a]: 0.00277336, [2] [Cycle 1]: 0.00187704, [45] [expand_dump_flag]: 3.31999e-06 [switch_simplify]: 2.605e-05 [loop_unroll]: 1.415e-05 [a_1]: 0.00033536 [with_stream_mark]: 2.032e-05 [recompute_prepare]: 9.64999e-06 [updatestate_depend_eliminate]: 4.64998e-06 [updatestate_assign_eliminate]: 3.83999e-06 [updatestate_loads_eliminate]: 2.86999e-06 [parameter_eliminate]: 2.06e-06 [a_2]: 0.00012527 [accelerated_algorithm]: 7.51999e-06 [shard]: 2.27999e-06 [meta_shard_fg_expand]: 1.67001e-06 [shard_inline]: 6.32001e-06 [merge_send_recv]: 9.36e-06 [auto_parallel]: 7.10998e-06 [parallel]: 1.948e-05 [flash_sp]: 9.66e-06 [merge_comm]: 4.37e-06 [allreduce_fusion]: 3.86001e-06 [matmul_add_comm_reduction]: 9.97999e-06 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 8.74e-06 [virtual_dataset]: 6.81999e-06 [get_grad_eliminate_]: 5.64e-06 [virtual_output]: 6.64999e-06 [merge_forward]: 4.43999e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 1.102e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.605e-05 [merge_recompute_call_nodes]: 1.84e-06 [before_grad]: 1.166e-05 [set_forward_comm_id_for_comm_node_pass]: 5.20999e-06 [meta_fg_expand]: 2.64999e-06 [flash_sp_send_recv_attached]: 3.2e-06 [receive_attached]: 2.51e-06 [after_resolve]: 1.181e-05 [a_after_grad]: 1.102e-05 [renormalize]: 0.00060128 [add_forward_monad_depend]: 5.14e-06 [auto_monad_grad]: 2.21e-06 [auto_monad_eliminator]: 1.664e-05 [cse]: 3.086e-05 [a_3]: 6.244e-05 [Cycle 2]: 0.00088194, [45] [expand_dump_flag]: 1.32999e-06 [switch_simplify]: 8.12e-06 [loop_unroll]: 6.08002e-06 [a_1]: 0.00013657 [with_stream_mark]: 1.746e-05 [recompute_prepare]: 7.35998e-06 [updatestate_depend_eliminate]: 3.23e-06 [updatestate_assign_eliminate]: 2.97002e-06 [updatestate_loads_eliminate]: 2.76e-06 [parameter_eliminate]: 1.45999e-06 [a_2]: 0.00011242 [accelerated_algorithm]: 6.04999e-06 [shard]: 1.50001e-06 [meta_shard_fg_expand]: 1.44998e-06 [shard_inline]: 6.78998e-06 [merge_send_recv]: 6.74001e-06 [auto_parallel]: 7.4e-06 [parallel]: 6.40002e-06 [flash_sp]: 3.75998e-06 [merge_comm]: 4.13001e-06 [allreduce_fusion]: 3.43999e-06 [matmul_add_comm_reduction]: 7.23e-06 [allreduce_slice_to_reducescatter]: 8.50006e-07 [virtual_shard_identity]: 7.53999e-06 [virtual_dataset]: 6.02999e-06 [get_grad_eliminate_]: 5.83002e-06 [virtual_output]: 5.25001e-06 [merge_forward]: 3.64002e-06 [cell_reuse_recompute_pass]: 1.64e-06 [offload_activation]: 7.76001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.45e-05 [merge_recompute_call_nodes]: 1.29e-06 [before_grad]: 9.91e-06 [set_forward_comm_id_for_comm_node_pass]: 5.07999e-06 [meta_fg_expand]: 2.80002e-06 [flash_sp_send_recv_attached]: 1.43002e-06 [receive_attached]: 1.22e-06 [after_resolve]: 1.016e-05 [a_after_grad]: 7.93999e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.56e-06 [auto_monad_grad]: 1.13001e-06 [auto_monad_eliminator]: 1.122e-05 [cse]: 1.96e-05 [a_3]: 4.877e-05 [py_interpret_to_execute_after_opt_a]: 1.462e-05 [slice_cell_reuse_recomputed_activation]: 5.35999e-06 [rewriter_after_opt_a]: 4.503e-05 [convert_after_rewriter]: 9.93002e-06 [order_py_execute_after_rewriter]: 7.87e-06 [mutable_eliminate]: 0.00058232 [opt_b]: 0.00027794, [1] [Cycle 1]: 0.00026764, [7] [b_1]: 0.00016582 [b_2]: 7.3e-06 [updatestate_depend_eliminate]: 7.41999e-06 [updatestate_assign_eliminate]: 2.43998e-06 [updatestate_loads_eliminate]: 2.39999e-06 [renormalize]: 5.29981e-07 [cse]: 2.431e-05 [optimize_parallel_all_gather_comm]: 2.022e-05 [overlap_param_gather]: 4.68999e-06 [cconv]: 3.412e-05 [loop_unroll]: 0.0004668 [opt_after_cconv]: 0.00012971, [1] [Cycle 1]: 0.00012011, [7] [c_1]: 2.719e-05 [parameter_eliminate]: 3.65998e-06 [updatestate_depend_eliminate]: 6.36998e-06 [updatestate_assign_eliminate]: 2.73e-06 [updatestate_loads_eliminate]: 2.61e-06 [cse]: 2.185e-05 [renormalize]: 3.60014e-07 [remove_dup_value]: 1.821e-05 [tuple_transform]: 8.758e-05, [1] [Cycle 1]: 7.994e-05, [4] [d_1]: 4.104e-05 [none_parameter_eliminate]: 1.81e-06 [renormalize]: 2.80008e-07 [switch_simplify]: 6.56999e-06 [partial_unused_args_eliminate]: 5.40999e-06 [add_recomputation]: 5.552e-05 [cse_after_recomputation]: 2.889e-05, [1] [Cycle 1]: 2.186e-05, [1] [cse]: 1.29e-05 [environ_conv]: 8.2e-06 [swap_dp_allreduce_reducescatter]: 8.12e-06 [bias_add_comm_swap]: 5.05001e-06 [label_micro_interleaved_index]: 7.55e-06 [label_fine_grained_interleaved_index]: 4.92999e-06 [merge_cast_opt]: 4.02e-06 [slice_recompute_activation]: 4.70001e-06 [micro_interleaved_order_control]: 4.80001e-06 [assign_add_opt]: 3.67002e-06 [ForceFp32Comm]: 3.13e-06 [remove_cast_before_assign_add]: 3.81999e-06 [full_micro_interleaved_order_control]: 4.70001e-06 [reorder_send_recv_between_fp_bp]: 4.95999e-06 [comm_op_add_attrs]: 3.9e-06 [add_comm_op_reuse_tag]: 3.9e-06 [interleave_split_concat_branches]: 3.79002e-06 [interleave_parallel_branches]: 3.6e-06 [overlap_opt_shard_in_pipeline]: 3.98001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.67e-06 [control_data_broadcast_order]: 1.724e-05 [grouped_pairwise_exchange_alltoall]: 4.11001e-06 [offloading_packed_experts]: 7.38e-06 [overlap_recompute_and_grad_model_parallel]: 7.55003e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.92002e-06 [overlap_recompute_allgather_and_fa_grad]: 4.42e-06 [overlap_recompute_comm]: 5.40999e-06 [overlap_grad_ring_attention]: 7.19001e-06 [overlap_grad_flash_sp]: 2.591e-05 [begin_end_overlap_inline]: 2.84999e-06 [split_matmul_comm_elemetwise]: 4.62e-06 [split_layernorm_comm]: 4.1e-06 [handle_group_info]: 3.33e-06 [symbol_engine_optimizer]: 0.00010551, [1] [Cycle 1]: 9.742e-05, [6] [build]: 3.91999e-06 [elim_shapecalc]: 1.211e-05 [elim_not_effective]: 1.345e-05 [opt_reshape]: 7.08e-06 [fold_const_symbol]: 1.073e-05 [renormalize]: 1.8999e-07 [detach_backward]: 3.93001e-06 [pipeline_parallel_scheduler]: 2.09e-06 [auto_monad_reorder]: 1.961e-05 [get_jit_bprop_graph]: 1.94e-06 [rewriter_after_jit_bprop_graph]: 5.68997e-06 [opt_after_jit_grad]: 0.00053071 [validate]: 4.093e-05 Sums bootstrap : 0.000449s : 4.52% type_inference : 0.004827s : 48.64% event_method : 0.000013s : 0.13% auto_monad : 0.000056s : 0.57% graph_reusing : 0.000006s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000003s : 0.03% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000028s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000023s : 0.23% optimize.rewriter_before_opt_a : 0.000051s : 0.51% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000034s : 0.34% optimize.opt_a.loop_unroll : 0.000020s : 0.20% optimize.opt_a.a_1 : 0.000472s : 4.76% optimize.opt_a.with_stream_mark : 0.000038s : 0.38% optimize.opt_a.recompute_prepare : 0.000017s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000238s : 2.39% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.14% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000016s : 0.16% optimize.opt_a.auto_parallel : 0.000015s : 0.15% optimize.opt_a.parallel : 0.000026s : 0.26% optimize.opt_a.flash_sp : 0.000013s : 0.14% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.02% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.16% optimize.opt_a.virtual_dataset : 0.000013s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000022s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.10% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.05% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000022s : 0.22% optimize.opt_a.a_after_grad : 0.000019s : 0.19% optimize.opt_a.renormalize : 0.000601s : 6.06% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.28% optimize.opt_a.cse : 0.000050s : 0.51% optimize.opt_a.a_3 : 0.000111s : 1.12% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000045s : 0.45% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000582s : 5.87% optimize.opt_b.b_1 : 0.000166s : 1.67% optimize.opt_b.b_2 : 0.000007s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000024s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000034s : 0.34% optimize.loop_unroll : 0.000467s : 4.70% optimize.opt_after_cconv.c_1 : 0.000027s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.22% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.18% optimize.tuple_transform.d_1 : 0.000041s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000056s : 0.56% optimize.cse_after_recomputation.cse : 0.000013s : 0.13% optimize.environ_conv : 0.000008s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.08% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000004s : 0.04% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.05% optimize.control_data_broadcast_order : 0.000017s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000026s : 0.26% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.06% opt_after_jit_grad : 0.000531s : 5.35% validate : 0.000041s : 0.41% Time group info: ------[substitution.] 0.000195 23 39.98% : 0.000078s : 4: substitution.arithmetic_simplify 0.95% : 0.000002s : 2: substitution.elim_not_effective 0.70% : 0.000001s : 2: substitution.fold_const_symbol 2.85% : 0.000006s : 3: substitution.graph_param_transform 48.98% : 0.000095s : 2: substitution.inline 2.07% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.44% : 0.000005s : 4: substitution.remove_not_recompute_node 2.02% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004778 2 91.02% : 0.004349s : 1: type_inference.infer 8.98% : 0.000429s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000094 2 100.00% : 0.000094s : 2: match.inline ------[predicate.] 0.000141 754 0.76% : 0.000001s : 7: predicate.accumulaten_eliminater 1.29% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.64% : 0.000001s : 6: predicate.addn_check_dump 0.74% : 0.000001s : 7: predicate.addn_zero_filter 0.62% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 3.10% : 0.000004s : 13: predicate.arithmetic_simplify 0.84% : 0.000001s : 7: predicate.cast_eliminate 0.73% : 0.000001s : 6: predicate.check_bprop_eliminate 0.69% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.73% : 0.000001s : 6: predicate.depend_value_elim 0.73% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 1.01% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.83% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.37% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.29% : 0.000000s : 3: predicate.elim_not_effective 0.55% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 0.98% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.95% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.92% : 0.000001s : 10: predicate.environ_get_depend_swap 1.72% : 0.000002s : 16: predicate.environ_get_eliminate 0.90% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.90% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.03% : 0.000003s : 9: predicate.float_depend_g_call 0.89% : 0.000001s : 6: predicate.float_environ_get_switch 0.98% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 3: predicate.fold_const_symbol 0.76% : 0.000001s : 6: predicate.get_grad_eliminate 0.38% : 0.000001s : 3: predicate.graph_param_transform 0.82% : 0.000001s : 6: predicate.incorporate_call 0.64% : 0.000001s : 6: predicate.incorporate_call_switch 6.54% : 0.000009s : 34: predicate.inline 1.49% : 0.000002s : 6: predicate.inline_without_move 0.38% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.93% : 0.000001s : 6: predicate.less_batch_normalization 1.80% : 0.000003s : 13: predicate.list_to_tuple_eliminator_ 1.87% : 0.000003s : 20: predicate.load_eliminater 1.61% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.81% : 0.000003s : 14: predicate.loop_unroll_before_grad 1.73% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.82% : 0.000001s : 6: predicate.merge_addn 0.67% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.87% : 0.000001s : 6: predicate.mini_step_allgather_replace 1.07% : 0.000002s : 7: predicate.minmaximum_grad 1.72% : 0.000002s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.55% : 0.000001s : 3: predicate.parallel_virtual_node 1.14% : 0.000002s : 9: predicate.partial_defer_inline 1.15% : 0.000002s : 10: predicate.partial_eliminate 0.73% : 0.000001s : 7: predicate.print_const_string_wrapper 0.72% : 0.000001s : 6: predicate.reduce_all_const_elim 0.99% : 0.000001s : 7: predicate.reduce_eliminate 2.05% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.64% : 0.000001s : 6: predicate.remove_not_recompute_node 1.19% : 0.000002s : 13: predicate.replace_applicator 0.69% : 0.000001s : 6: predicate.replace_old_param 0.47% : 0.000001s : 3: predicate.reset_defer_inline 0.80% : 0.000001s : 7: predicate.reshape_eliminate 0.78% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 3: predicate.row_tensor_eliminate 0.93% : 0.000001s : 6: predicate.same_eliminate 0.45% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.10% : 0.000002s : 6: predicate.shard_identity_eliminate 1.01% : 0.000001s : 6: predicate.special_op_eliminate 1.19% : 0.000002s : 6: predicate.specialize_transform 1.19% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.93% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.96% : 0.000001s : 9: predicate.switch_defer_inline 1.69% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.04% : 0.000006s : 32: predicate.switch_simplify 0.69% : 0.000001s : 7: predicate.tile_eliminate 0.79% : 0.000001s : 7: predicate.transpose_eliminate 1.40% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.29% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.22% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.31% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.45% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.85% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.97% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.55% : 0.000001s : 3: predicate.value_based_eliminate 0.76% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.73% : 0.000001s : 6: predicate.virtual_output_eliminate 0.32% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000230 5 8.78% : 0.000020s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.22% : 0.000210s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025032 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.05% : 0.003266s : 1: add_attr 12.98% : 0.003249s : 1: add_attr_with_inline 0.03% : 0.000007s : 1: add_comm_op_reuse_tag 0.24% : 0.000059s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.26% : 0.000065s : 1: auto_monad 0.11% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.96% : 0.000492s : 1: bootstrap 0.15% : 0.000037s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000032s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.09% : 0.000022s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.09% : 0.000023s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.90% : 0.000475s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.35% : 0.000589s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 3.52% : 0.000882s : 78: opt.transform.opt_a 0.10% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.38% : 0.000096s : 28: opt.transform.opt_b 0.18% : 0.000045s : 2: opt.transform.opt_trans_graph 0.15% : 0.000039s : 4: opt.transform.symbol_engine_opt 11.10% : 0.002777s : 1: opt_a 0.53% : 0.000133s : 1: opt_after_cconv 2.17% : 0.000543s : 1: opt_after_jit_grad 1.12% : 0.000282s : 1: opt_b 22.03% : 0.005514s : 1: optimize 0.10% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.12% : 0.000030s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000036s : 1: pre_auto_parallel 0.10% : 0.000026s : 1: py_interpret_to_execute 0.07% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000022s : 1: remove_dup_value 1.33% : 0.000332s : 1: renormalize.infer 1.04% : 0.000260s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.05% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000049s : 1: rewriter_after_opt_a 0.22% : 0.000054s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.43% : 0.000108s : 1: symbol_engine_optimizer 0.36% : 0.000091s : 1: tuple_transform 19.41% : 0.004858s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:45.232.456 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0133766, [21] [bootstrap]: 0.0004452 [type_inference]: 0.00469457 [event_method]: 1.282e-05 [auto_monad]: 5.488e-05 [graph_reusing]: 5.89e-06 [inline]: 2.28998e-06 [add_attr]: 0.00318744, [1] [add_attr_with_inline]: 0.00317902, [1] [Cycle 1]: 5.138e-05, [2] [tag_attr]: 1.457e-05 [meta_addattr_fg_expand]: 3.70998e-06 [parallel-infer-symbol]: 3.59002e-06 [pre_auto_parallel]: 2.669e-05 [insert-virtual-dataset]: 2.40002e-06 [parallel-infer-symbol-second]: 6.70028e-07 [dataset_repeat_opt]: 1.97999e-06 [pipeline_split]: 1.79e-06 [optimize]: 0.00425697, [53] [py_interpret_to_execute]: 1.925e-05 [rewriter_before_opt_a]: 4.662e-05 [opt_a]: 0.00227023, [2] [Cycle 1]: 0.00160371, [45] [expand_dump_flag]: 2.76999e-06 [switch_simplify]: 2.768e-05 [loop_unroll]: 1.366e-05 [a_1]: 0.00032741 [with_stream_mark]: 1.722e-05 [recompute_prepare]: 9.05999e-06 [updatestate_depend_eliminate]: 3.56999e-06 [updatestate_assign_eliminate]: 2.99999e-06 [updatestate_loads_eliminate]: 3.28e-06 [parameter_eliminate]: 1.92999e-06 [a_2]: 9.064e-05 [accelerated_algorithm]: 7.01999e-06 [shard]: 2.02999e-06 [meta_shard_fg_expand]: 1.74e-06 [shard_inline]: 6.16e-06 [merge_send_recv]: 9.71e-06 [auto_parallel]: 6.38e-06 [parallel]: 1.884e-05 [flash_sp]: 8.78001e-06 [merge_comm]: 4.18001e-06 [allreduce_fusion]: 3.41999e-06 [matmul_add_comm_reduction]: 1.058e-05 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 8.25999e-06 [virtual_dataset]: 6.59999e-06 [get_grad_eliminate_]: 6.44999e-06 [virtual_output]: 6.47001e-06 [merge_forward]: 3.60003e-06 [cell_reuse_recompute_pass]: 1.33002e-06 [offload_activation]: 9.76998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.204e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.052e-05 [set_forward_comm_id_for_comm_node_pass]: 3.49001e-06 [meta_fg_expand]: 2.52001e-06 [flash_sp_send_recv_attached]: 2.71999e-06 [receive_attached]: 2.34999e-06 [after_resolve]: 1.024e-05 [a_after_grad]: 9.56003e-06 [renormalize]: 0.00058157 [add_forward_monad_depend]: 5.94e-06 [auto_monad_grad]: 2.13998e-06 [auto_monad_eliminator]: 1.477e-05 [cse]: 3.118e-05 [a_3]: 4.783e-05 [Cycle 2]: 0.00065573, [45] [expand_dump_flag]: 9.70002e-07 [switch_simplify]: 7.44002e-06 [loop_unroll]: 6.08002e-06 [a_1]: 0.00012995 [with_stream_mark]: 1.137e-05 [recompute_prepare]: 6.45002e-06 [updatestate_depend_eliminate]: 3.46001e-06 [updatestate_assign_eliminate]: 2.54001e-06 [updatestate_loads_eliminate]: 2.79001e-06 [parameter_eliminate]: 1.20999e-06 [a_2]: 8.185e-05 [accelerated_algorithm]: 6.44001e-06 [shard]: 1.18001e-06 [meta_shard_fg_expand]: 1.49e-06 [shard_inline]: 5.97001e-06 [merge_send_recv]: 5.78997e-06 [auto_parallel]: 5.48997e-06 [parallel]: 5.20999e-06 [flash_sp]: 3.49001e-06 [merge_comm]: 3.18e-06 [allreduce_fusion]: 3.46001e-06 [matmul_add_comm_reduction]: 6.88e-06 [allreduce_slice_to_reducescatter]: 3.00002e-07 [virtual_shard_identity]: 6.63998e-06 [virtual_dataset]: 5.89e-06 [get_grad_eliminate_]: 5.55001e-06 [virtual_output]: 5.17999e-06 [merge_forward]: 2.94001e-06 [cell_reuse_recompute_pass]: 1.57999e-06 [offload_activation]: 6.61999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.125e-05 [merge_recompute_call_nodes]: 1.04e-06 [before_grad]: 9.41e-06 [set_forward_comm_id_for_comm_node_pass]: 4.17998e-06 [meta_fg_expand]: 2.32001e-06 [flash_sp_send_recv_attached]: 1.04998e-06 [receive_attached]: 9.60019e-07 [after_resolve]: 9.00001e-06 [a_after_grad]: 8.05e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.04e-06 [auto_monad_grad]: 1.11002e-06 [auto_monad_eliminator]: 7.50003e-06 [cse]: 1.539e-05 [a_3]: 3.551e-05 [py_interpret_to_execute_after_opt_a]: 9.43002e-06 [slice_cell_reuse_recomputed_activation]: 2.19999e-06 [rewriter_after_opt_a]: 3.533e-05 [convert_after_rewriter]: 6.59001e-06 [order_py_execute_after_rewriter]: 5.43002e-06 [mutable_eliminate]: 0.00050382 [opt_b]: 0.00020271, [1] [Cycle 1]: 0.00019674, [7] [b_1]: 0.00011754 [b_2]: 7.14001e-06 [updatestate_depend_eliminate]: 6.93e-06 [updatestate_assign_eliminate]: 2.64001e-06 [updatestate_loads_eliminate]: 2.22999e-06 [renormalize]: 9.39996e-07 [cse]: 2.121e-05 [optimize_parallel_all_gather_comm]: 1.701e-05 [overlap_param_gather]: 2.32999e-06 [cconv]: 2.879e-05 [loop_unroll]: 0.00045887 [opt_after_cconv]: 0.00010136, [1] [Cycle 1]: 9.591e-05, [7] [c_1]: 2.694e-05 [parameter_eliminate]: 3.51999e-06 [updatestate_depend_eliminate]: 6.06998e-06 [updatestate_assign_eliminate]: 2.49001e-06 [updatestate_loads_eliminate]: 2.58e-06 [cse]: 1.995e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.543e-05 [tuple_transform]: 7.223e-05, [1] [Cycle 1]: 6.783e-05, [4] [d_1]: 4.089e-05 [none_parameter_eliminate]: 1.65001e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 7.09001e-06 [partial_unused_args_eliminate]: 1.69998e-06 [add_recomputation]: 4.721e-05 [cse_after_recomputation]: 2.173e-05, [1] [Cycle 1]: 1.675e-05, [1] [cse]: 1.14e-05 [environ_conv]: 5.74999e-06 [swap_dp_allreduce_reducescatter]: 5.11002e-06 [bias_add_comm_swap]: 3.35e-06 [label_micro_interleaved_index]: 4.43999e-06 [label_fine_grained_interleaved_index]: 3.03e-06 [merge_cast_opt]: 1.28002e-06 [slice_recompute_activation]: 1.92999e-06 [micro_interleaved_order_control]: 2.14e-06 [assign_add_opt]: 1.20999e-06 [ForceFp32Comm]: 8.09989e-07 [remove_cast_before_assign_add]: 1.32999e-06 [full_micro_interleaved_order_control]: 3.08998e-06 [reorder_send_recv_between_fp_bp]: 2.39999e-06 [comm_op_add_attrs]: 1.09e-06 [add_comm_op_reuse_tag]: 1.04e-06 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.08001e-06 [overlap_opt_shard_in_pipeline]: 1.07998e-06 [overlap_opt_shard_grad_in_pipeline]: 2.02999e-06 [control_data_broadcast_order]: 1.303e-05 [grouped_pairwise_exchange_alltoall]: 1.99999e-06 [offloading_packed_experts]: 4.13001e-06 [overlap_recompute_and_grad_model_parallel]: 4.18001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.10999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.26e-06 [overlap_grad_ring_attention]: 3.91999e-06 [overlap_grad_flash_sp]: 2.025e-05 [begin_end_overlap_inline]: 6.19999e-07 [split_matmul_comm_elemetwise]: 2.44001e-06 [split_layernorm_comm]: 1.63002e-06 [handle_group_info]: 9.89996e-07 [symbol_engine_optimizer]: 7.206e-05, [1] [Cycle 1]: 6.742e-05, [6] [build]: 2.55002e-06 [elim_shapecalc]: 9.25001e-06 [elim_not_effective]: 1.212e-05 [opt_reshape]: 6.66e-06 [fold_const_symbol]: 9.42001e-06 [renormalize]: 2.19996e-07 [detach_backward]: 1.88997e-06 [pipeline_parallel_scheduler]: 1.62001e-06 [auto_monad_reorder]: 1.668e-05 [get_jit_bprop_graph]: 2.19001e-06 [rewriter_after_jit_bprop_graph]: 4.17998e-06 [opt_after_jit_grad]: 0.00046809 [validate]: 3.659e-05 Sums bootstrap : 0.000445s : 4.82% type_inference : 0.004695s : 50.85% event_method : 0.000013s : 0.14% auto_monad : 0.000055s : 0.59% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000027s : 0.29% insert-virtual-dataset : 0.000002s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000019s : 0.21% optimize.rewriter_before_opt_a : 0.000047s : 0.50% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000035s : 0.38% optimize.opt_a.loop_unroll : 0.000020s : 0.21% optimize.opt_a.a_1 : 0.000457s : 4.95% optimize.opt_a.with_stream_mark : 0.000029s : 0.31% optimize.opt_a.recompute_prepare : 0.000016s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000172s : 1.87% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.13% optimize.opt_a.merge_send_recv : 0.000015s : 0.17% optimize.opt_a.auto_parallel : 0.000012s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.26% optimize.opt_a.flash_sp : 0.000012s : 0.13% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.19% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.13% optimize.opt_a.virtual_output : 0.000012s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.04% optimize.opt_a.after_resolve : 0.000019s : 0.21% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000582s : 6.30% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.09% optimize.opt_a.auto_monad_grad : 0.000003s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.24% optimize.opt_a.cse : 0.000047s : 0.50% optimize.opt_a.a_3 : 0.000083s : 0.90% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000035s : 0.38% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.06% optimize.mutable_eliminate : 0.000504s : 5.46% optimize.opt_b.b_1 : 0.000118s : 1.27% optimize.opt_b.b_2 : 0.000007s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000021s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.03% optimize.cconv : 0.000029s : 0.31% optimize.loop_unroll : 0.000459s : 4.97% optimize.opt_after_cconv.c_1 : 0.000027s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000020s : 0.22% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.17% optimize.tuple_transform.d_1 : 0.000041s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000047s : 0.51% optimize.cse_after_recomputation.cse : 0.000011s : 0.12% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.04% optimize.label_micro_interleaved_index : 0.000004s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000020s : 0.22% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.03% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000017s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.05% opt_after_jit_grad : 0.000468s : 5.07% validate : 0.000037s : 0.40% Time group info: ------[substitution.] 0.000183 23 38.24% : 0.000070s : 4: substitution.arithmetic_simplify 1.05% : 0.000002s : 2: substitution.elim_not_effective 0.73% : 0.000001s : 2: substitution.fold_const_symbol 2.95% : 0.000005s : 3: substitution.graph_param_transform 50.53% : 0.000092s : 2: substitution.inline 2.23% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.37% : 0.000004s : 4: substitution.remove_not_recompute_node 1.91% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004647 2 90.81% : 0.004220s : 1: type_inference.infer 9.19% : 0.000427s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000091 2 100.00% : 0.000091s : 2: match.inline ------[predicate.] 0.000132 754 0.87% : 0.000001s : 7: predicate.accumulaten_eliminater 1.19% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.67% : 0.000001s : 6: predicate.addn_check_dump 0.86% : 0.000001s : 7: predicate.addn_zero_filter 0.72% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.78% : 0.000004s : 13: predicate.arithmetic_simplify 0.95% : 0.000001s : 7: predicate.cast_eliminate 0.74% : 0.000001s : 6: predicate.check_bprop_eliminate 0.64% : 0.000001s : 6: predicate.compare_switch_simplify 0.24% : 0.000000s : 3: predicate.const_output_eliminate 0.70% : 0.000001s : 6: predicate.depend_value_elim 0.77% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.89% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.76% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.23% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.27% : 0.000000s : 3: predicate.elim_not_effective 0.49% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_depend_swap 1.70% : 0.000002s : 16: predicate.environ_get_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.96% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.05% : 0.000003s : 9: predicate.float_depend_g_call 0.64% : 0.000001s : 6: predicate.float_environ_get_switch 1.02% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.82% : 0.000001s : 6: predicate.get_grad_eliminate 0.36% : 0.000000s : 3: predicate.graph_param_transform 0.86% : 0.000001s : 6: predicate.incorporate_call 0.66% : 0.000001s : 6: predicate.incorporate_call_switch 6.56% : 0.000009s : 34: predicate.inline 1.09% : 0.000001s : 6: predicate.inline_without_move 0.43% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.02% : 0.000001s : 6: predicate.less_batch_normalization 1.54% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.10% : 0.000003s : 20: predicate.load_eliminater 1.29% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.66% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.62% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.70% : 0.000001s : 6: predicate.merge_addn 0.67% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.67% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.68% : 0.000001s : 7: predicate.minmaximum_grad 1.58% : 0.000002s : 3: predicate.mutable_eliminate 0.42% : 0.000001s : 3: predicate.opt_reshape 0.44% : 0.000001s : 3: predicate.parallel_virtual_node 1.26% : 0.000002s : 9: predicate.partial_defer_inline 1.23% : 0.000002s : 10: predicate.partial_eliminate 0.75% : 0.000001s : 7: predicate.print_const_string_wrapper 0.80% : 0.000001s : 6: predicate.reduce_all_const_elim 0.92% : 0.000001s : 7: predicate.reduce_eliminate 2.02% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.65% : 0.000001s : 6: predicate.remove_not_recompute_node 1.57% : 0.000002s : 13: predicate.replace_applicator 0.70% : 0.000001s : 6: predicate.replace_old_param 0.37% : 0.000000s : 3: predicate.reset_defer_inline 0.89% : 0.000001s : 7: predicate.reshape_eliminate 0.74% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 3: predicate.row_tensor_eliminate 0.87% : 0.000001s : 6: predicate.same_eliminate 0.58% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.90% : 0.000001s : 6: predicate.shard_identity_eliminate 0.89% : 0.000001s : 6: predicate.special_op_eliminate 0.93% : 0.000001s : 6: predicate.specialize_transform 1.07% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.97% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.01% : 0.000001s : 9: predicate.switch_defer_inline 1.76% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.49% : 0.000006s : 32: predicate.switch_simplify 0.76% : 0.000001s : 7: predicate.tile_eliminate 0.89% : 0.000001s : 7: predicate.transpose_eliminate 1.52% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.39% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.22% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.46% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.90% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.54% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.94% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.86% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 3: predicate.value_based_eliminate 0.87% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.81% : 0.000001s : 6: predicate.virtual_output_eliminate 0.33% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.61% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000220 5 8.79% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.21% : 0.000201s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022379 192 0.02% : 0.000003s : 1: ForceFp32Comm 14.26% : 0.003192s : 1: add_attr 14.22% : 0.003182s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000051s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000060s : 1: auto_monad 0.09% : 0.000020s : 1: auto_monad_reorder 0.02% : 0.000004s : 1: begin_end_overlap_inline 0.03% : 0.000006s : 1: bias_add_comm_swap 2.12% : 0.000474s : 1: bootstrap 0.14% : 0.000032s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000017s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.08% : 0.000018s : 1: event_method 0.03% : 0.000006s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 2.09% : 0.000467s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.29% : 0.000513s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 3.79% : 0.000847s : 78: opt.transform.opt_a 0.11% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.42% : 0.000094s : 28: opt.transform.opt_b 0.20% : 0.000046s : 2: opt.transform.opt_trans_graph 0.15% : 0.000034s : 4: opt.transform.symbol_engine_opt 10.16% : 0.002273s : 1: opt_a 0.47% : 0.000105s : 1: opt_after_cconv 2.13% : 0.000477s : 1: opt_after_jit_grad 0.92% : 0.000206s : 1: opt_b 19.04% : 0.004262s : 1: optimize 0.09% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000031s : 1: pre_auto_parallel 0.10% : 0.000023s : 1: py_interpret_to_execute 0.06% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000019s : 1: remove_dup_value 1.53% : 0.000342s : 1: renormalize.infer 1.04% : 0.000232s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000040s : 1: rewriter_after_opt_a 0.22% : 0.000050s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000075s : 1: symbol_engine_optimizer 0.34% : 0.000075s : 1: tuple_transform 21.05% : 0.004711s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:45.429.663 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:45.429.956 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0151515, [21] [bootstrap]: 0.00046039 [type_inference]: 0.00495611 [event_method]: 1.256e-05 [auto_monad]: 5.563e-05 [graph_reusing]: 6.29001e-06 [inline]: 2.50002e-06 [add_attr]: 0.00326905, [1] [add_attr_with_inline]: 0.00325979, [1] [Cycle 1]: 7.956e-05, [2] [tag_attr]: 1.53e-05 [meta_addattr_fg_expand]: 4e-06 [parallel-infer-symbol]: 3.73001e-06 [pre_auto_parallel]: 2.74e-05 [insert-virtual-dataset]: 2.43002e-06 [parallel-infer-symbol-second]: 8.39995e-07 [dataset_repeat_opt]: 2.09e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.00515678, [53] [py_interpret_to_execute]: 2.352e-05 [rewriter_before_opt_a]: 5.126e-05 [opt_a]: 0.00275359, [2] [Cycle 1]: 0.00186149, [45] [expand_dump_flag]: 3.21001e-06 [switch_simplify]: 2.625e-05 [loop_unroll]: 1.407e-05 [a_1]: 0.00033221 [with_stream_mark]: 1.969e-05 [recompute_prepare]: 9.69e-06 [updatestate_depend_eliminate]: 4.07e-06 [updatestate_assign_eliminate]: 3.78999e-06 [updatestate_loads_eliminate]: 2.89999e-06 [parameter_eliminate]: 2.07999e-06 [a_2]: 0.00012636 [accelerated_algorithm]: 7.80998e-06 [shard]: 2.02001e-06 [meta_shard_fg_expand]: 1.92001e-06 [shard_inline]: 6.44999e-06 [merge_send_recv]: 9.65002e-06 [auto_parallel]: 7.23e-06 [parallel]: 1.978e-05 [flash_sp]: 9.90002e-06 [merge_comm]: 4.18999e-06 [allreduce_fusion]: 3.66001e-06 [matmul_add_comm_reduction]: 9.74999e-06 [allreduce_slice_to_reducescatter]: 5.69999e-07 [virtual_shard_identity]: 9.54999e-06 [virtual_dataset]: 6.89999e-06 [get_grad_eliminate_]: 6.43e-06 [virtual_output]: 6.96999e-06 [merge_forward]: 4.46002e-06 [cell_reuse_recompute_pass]: 1.42999e-06 [offload_activation]: 1.069e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.769e-05 [merge_recompute_call_nodes]: 1.61998e-06 [before_grad]: 1.19e-05 [set_forward_comm_id_for_comm_node_pass]: 4.02e-06 [meta_fg_expand]: 2.67001e-06 [flash_sp_send_recv_attached]: 3.13998e-06 [receive_attached]: 2.37999e-06 [after_resolve]: 1.151e-05 [a_after_grad]: 9.45001e-06 [renormalize]: 0.00056523 [add_forward_monad_depend]: 5.96e-06 [auto_monad_grad]: 3.53e-06 [auto_monad_eliminator]: 1.578e-05 [cse]: 3.052e-05 [a_3]: 6.354e-05 [Cycle 2]: 0.00087758, [45] [expand_dump_flag]: 1.02e-06 [switch_simplify]: 7.97e-06 [loop_unroll]: 5.79e-06 [a_1]: 0.00013651 [with_stream_mark]: 1.799e-05 [recompute_prepare]: 7.67002e-06 [updatestate_depend_eliminate]: 3.21001e-06 [updatestate_assign_eliminate]: 2.93998e-06 [updatestate_loads_eliminate]: 2.95998e-06 [parameter_eliminate]: 1.59e-06 [a_2]: 0.00011143 [accelerated_algorithm]: 6.98e-06 [shard]: 1.14e-06 [meta_shard_fg_expand]: 1.59998e-06 [shard_inline]: 6.34999e-06 [merge_send_recv]: 6.79999e-06 [auto_parallel]: 6.27001e-06 [parallel]: 5.92999e-06 [flash_sp]: 3.63e-06 [merge_comm]: 3.73999e-06 [allreduce_fusion]: 3.63999e-06 [matmul_add_comm_reduction]: 7.56999e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 8.55001e-06 [virtual_dataset]: 6.23e-06 [get_grad_eliminate_]: 5.99999e-06 [virtual_output]: 5.42999e-06 [merge_forward]: 3.33e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 7.78999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.56e-05 [merge_recompute_call_nodes]: 1.22999e-06 [before_grad]: 9.66e-06 [set_forward_comm_id_for_comm_node_pass]: 4.68999e-06 [meta_fg_expand]: 2.68e-06 [flash_sp_send_recv_attached]: 1.04e-06 [receive_attached]: 1.30001e-06 [after_resolve]: 1e-05 [a_after_grad]: 8.77999e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.48e-06 [auto_monad_grad]: 2.25002e-06 [auto_monad_eliminator]: 9.68997e-06 [cse]: 1.932e-05 [a_3]: 4.807e-05 [py_interpret_to_execute_after_opt_a]: 1.424e-05 [slice_cell_reuse_recomputed_activation]: 4.60999e-06 [rewriter_after_opt_a]: 4.32e-05 [convert_after_rewriter]: 1.026e-05 [order_py_execute_after_rewriter]: 8.26002e-06 [mutable_eliminate]: 0.00057644 [opt_b]: 0.00028069, [1] [Cycle 1]: 0.00027088, [7] [b_1]: 0.00016644 [b_2]: 7.92e-06 [updatestate_depend_eliminate]: 7.73001e-06 [updatestate_assign_eliminate]: 2.80002e-06 [updatestate_loads_eliminate]: 2.49999e-06 [renormalize]: 7.00005e-07 [cse]: 2.261e-05 [optimize_parallel_all_gather_comm]: 2.102e-05 [overlap_param_gather]: 5.24e-06 [cconv]: 3.452e-05 [loop_unroll]: 0.00048641 [opt_after_cconv]: 0.0001359, [1] [Cycle 1]: 0.00012601, [7] [c_1]: 2.886e-05 [parameter_eliminate]: 4.28001e-06 [updatestate_depend_eliminate]: 6.69999e-06 [updatestate_assign_eliminate]: 2.86e-06 [updatestate_loads_eliminate]: 3.09999e-06 [cse]: 2.207e-05 [renormalize]: 7.59988e-07 [remove_dup_value]: 1.765e-05 [tuple_transform]: 8.802e-05, [1] [Cycle 1]: 8.1e-05, [4] [d_1]: 4.012e-05 [none_parameter_eliminate]: 1.90001e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 7.77e-06 [partial_unused_args_eliminate]: 5.63002e-06 [add_recomputation]: 5.386e-05 [cse_after_recomputation]: 3.182e-05, [1] [Cycle 1]: 2.387e-05, [1] [cse]: 1.37e-05 [environ_conv]: 8.77e-06 [swap_dp_allreduce_reducescatter]: 8.47e-06 [bias_add_comm_swap]: 5.35999e-06 [label_micro_interleaved_index]: 8.21002e-06 [label_fine_grained_interleaved_index]: 5.74e-06 [merge_cast_opt]: 3.7e-06 [slice_recompute_activation]: 4.52e-06 [micro_interleaved_order_control]: 4.77998e-06 [assign_add_opt]: 3.55998e-06 [ForceFp32Comm]: 3.41999e-06 [remove_cast_before_assign_add]: 3.54002e-06 [full_micro_interleaved_order_control]: 4.57e-06 [reorder_send_recv_between_fp_bp]: 5.67001e-06 [comm_op_add_attrs]: 3.95e-06 [add_comm_op_reuse_tag]: 3.37997e-06 [interleave_split_concat_branches]: 3.56999e-06 [interleave_parallel_branches]: 3.48e-06 [overlap_opt_shard_in_pipeline]: 3.54002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.53001e-06 [control_data_broadcast_order]: 1.548e-05 [grouped_pairwise_exchange_alltoall]: 4.11001e-06 [offloading_packed_experts]: 6.56e-06 [overlap_recompute_and_grad_model_parallel]: 7.73999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.51001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.8e-06 [overlap_recompute_comm]: 4.77e-06 [overlap_grad_ring_attention]: 7.41999e-06 [overlap_grad_flash_sp]: 2.326e-05 [begin_end_overlap_inline]: 2.93e-06 [split_matmul_comm_elemetwise]: 4.43001e-06 [split_layernorm_comm]: 3.94002e-06 [handle_group_info]: 3.39001e-06 [symbol_engine_optimizer]: 0.00010447, [1] [Cycle 1]: 9.744e-05, [6] [build]: 2.81e-06 [elim_shapecalc]: 1.271e-05 [elim_not_effective]: 1.401e-05 [opt_reshape]: 7.12002e-06 [fold_const_symbol]: 9.96e-06 [renormalize]: 2.50002e-07 [detach_backward]: 4.79e-06 [pipeline_parallel_scheduler]: 2.16e-06 [auto_monad_reorder]: 2.018e-05 [get_jit_bprop_graph]: 1.87001e-06 [rewriter_after_jit_bprop_graph]: 5.39e-06 [opt_after_jit_grad]: 0.00052847 [validate]: 4.072e-05 Sums bootstrap : 0.000460s : 4.59% type_inference : 0.004956s : 49.38% event_method : 0.000013s : 0.13% auto_monad : 0.000056s : 0.55% graph_reusing : 0.000006s : 0.06% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000027s : 0.27% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000024s : 0.23% optimize.rewriter_before_opt_a : 0.000051s : 0.51% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000034s : 0.34% optimize.opt_a.loop_unroll : 0.000020s : 0.20% optimize.opt_a.a_1 : 0.000469s : 4.67% optimize.opt_a.with_stream_mark : 0.000038s : 0.38% optimize.opt_a.recompute_prepare : 0.000017s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000238s : 2.37% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000016s : 0.16% optimize.opt_a.auto_parallel : 0.000014s : 0.13% optimize.opt_a.parallel : 0.000026s : 0.26% optimize.opt_a.flash_sp : 0.000014s : 0.13% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.18% optimize.opt_a.virtual_dataset : 0.000013s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000022s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000022s : 0.21% optimize.opt_a.a_after_grad : 0.000018s : 0.18% optimize.opt_a.renormalize : 0.000565s : 5.63% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000006s : 0.06% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.25% optimize.opt_a.cse : 0.000050s : 0.50% optimize.opt_a.a_3 : 0.000112s : 1.11% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000043s : 0.43% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000576s : 5.74% optimize.opt_b.b_1 : 0.000166s : 1.66% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000035s : 0.34% optimize.loop_unroll : 0.000486s : 4.85% optimize.opt_after_cconv.c_1 : 0.000029s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.22% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.18% optimize.tuple_transform.d_1 : 0.000040s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000006s : 0.06% optimize.add_recomputation : 0.000054s : 0.54% optimize.cse_after_recomputation.cse : 0.000014s : 0.14% optimize.environ_conv : 0.000009s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.08% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.06% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.06% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.05% optimize.control_data_broadcast_order : 0.000015s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000023s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.13% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.05% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000528s : 5.27% validate : 0.000041s : 0.41% Time group info: ------[substitution.] 0.000200 23 41.70% : 0.000083s : 4: substitution.arithmetic_simplify 0.95% : 0.000002s : 2: substitution.elim_not_effective 0.67% : 0.000001s : 2: substitution.fold_const_symbol 3.00% : 0.000006s : 3: substitution.graph_param_transform 47.36% : 0.000095s : 2: substitution.inline 2.07% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.37% : 0.000005s : 4: substitution.remove_not_recompute_node 1.89% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004904 2 91.29% : 0.004477s : 1: type_inference.infer 8.71% : 0.000427s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000093 2 100.00% : 0.000093s : 2: match.inline ------[predicate.] 0.000138 754 0.88% : 0.000001s : 7: predicate.accumulaten_eliminater 1.16% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.65% : 0.000001s : 6: predicate.addn_check_dump 0.87% : 0.000001s : 7: predicate.addn_zero_filter 0.63% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.68% : 0.000004s : 13: predicate.arithmetic_simplify 0.83% : 0.000001s : 7: predicate.cast_eliminate 0.77% : 0.000001s : 6: predicate.check_bprop_eliminate 0.69% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.75% : 0.000001s : 6: predicate.depend_value_elim 0.72% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.88% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.71% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.22% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.44% : 0.000001s : 3: predicate.elim_not_effective 0.68% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.94% : 0.000001s : 10: predicate.environ_get_depend_swap 1.77% : 0.000002s : 16: predicate.environ_get_eliminate 1.14% : 0.000002s : 10: predicate.environ_get_set_eliminate 0.93% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.82% : 0.000003s : 9: predicate.float_depend_g_call 0.66% : 0.000001s : 6: predicate.float_environ_get_switch 0.93% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 3: predicate.fold_const_symbol 0.77% : 0.000001s : 6: predicate.get_grad_eliminate 0.37% : 0.000001s : 3: predicate.graph_param_transform 0.78% : 0.000001s : 6: predicate.incorporate_call 0.64% : 0.000001s : 6: predicate.incorporate_call_switch 6.52% : 0.000009s : 34: predicate.inline 1.42% : 0.000002s : 6: predicate.inline_without_move 0.37% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.00% : 0.000001s : 6: predicate.less_batch_normalization 1.56% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.11% : 0.000003s : 20: predicate.load_eliminater 1.55% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.59% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.63% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.67% : 0.000001s : 6: predicate.merge_addn 0.72% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.87% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 7: predicate.minmaximum_grad 1.36% : 0.000002s : 3: predicate.mutable_eliminate 0.40% : 0.000001s : 3: predicate.opt_reshape 0.94% : 0.000001s : 3: predicate.parallel_virtual_node 1.18% : 0.000002s : 9: predicate.partial_defer_inline 1.24% : 0.000002s : 10: predicate.partial_eliminate 0.69% : 0.000001s : 7: predicate.print_const_string_wrapper 0.74% : 0.000001s : 6: predicate.reduce_all_const_elim 1.05% : 0.000001s : 7: predicate.reduce_eliminate 1.99% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.89% : 0.000001s : 6: predicate.remove_not_recompute_node 1.29% : 0.000002s : 13: predicate.replace_applicator 0.77% : 0.000001s : 6: predicate.replace_old_param 0.50% : 0.000001s : 3: predicate.reset_defer_inline 0.78% : 0.000001s : 7: predicate.reshape_eliminate 0.75% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.53% : 0.000001s : 3: predicate.row_tensor_eliminate 0.93% : 0.000001s : 6: predicate.same_eliminate 0.66% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.30% : 0.000002s : 6: predicate.shard_identity_eliminate 0.84% : 0.000001s : 6: predicate.special_op_eliminate 0.91% : 0.000001s : 6: predicate.specialize_transform 1.20% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.01% : 0.000001s : 9: predicate.switch_defer_inline 1.68% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.39% : 0.000006s : 32: predicate.switch_simplify 0.73% : 0.000001s : 7: predicate.tile_eliminate 0.72% : 0.000001s : 7: predicate.transpose_eliminate 1.51% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.30% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.15% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.45% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.44% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.61% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.99% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.90% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 3: predicate.value_based_eliminate 0.75% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.72% : 0.000001s : 6: predicate.virtual_output_eliminate 0.29% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.73% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000222 5 9.24% : 0.000020s : 1: func_graph_cloner_run.FuncGraphClonerGraph 90.76% : 0.000201s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025158 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.03% : 0.003279s : 1: add_attr 12.97% : 0.003264s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000059s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000064s : 1: auto_monad 0.11% : 0.000028s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 2.00% : 0.000504s : 1: bootstrap 0.15% : 0.000038s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.14% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.10% : 0.000024s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.09% : 0.000023s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000013s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.96% : 0.000494s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.32% : 0.000584s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.07% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 3.51% : 0.000883s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.10% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.39% : 0.000098s : 28: opt.transform.opt_b 0.18% : 0.000045s : 2: opt.transform.opt_trans_graph 0.16% : 0.000039s : 4: opt.transform.symbol_engine_opt 10.96% : 0.002757s : 1: opt_a 0.55% : 0.000139s : 1: opt_after_cconv 2.15% : 0.000540s : 1: opt_after_jit_grad 1.13% : 0.000284s : 1: opt_b 21.88% : 0.005504s : 1: optimize 0.10% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.11% : 0.000027s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.04% : 0.000009s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000008s : 1: pipeline_split 0.14% : 0.000035s : 1: pre_auto_parallel 0.11% : 0.000027s : 1: py_interpret_to_execute 0.07% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000021s : 1: remove_dup_value 1.23% : 0.000310s : 1: renormalize.infer 0.98% : 0.000246s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.05% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000047s : 1: rewriter_after_opt_a 0.22% : 0.000054s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.43% : 0.000107s : 1: symbol_engine_optimizer 0.36% : 0.000091s : 1: tuple_transform 19.82% : 0.004986s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:45.631.520 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0133319, [21] [bootstrap]: 0.00046112 [type_inference]: 0.00459461 [event_method]: 1.149e-05 [auto_monad]: 5.343e-05 [graph_reusing]: 4.97999e-06 [inline]: 2.35002e-06 [add_attr]: 0.00311427, [1] [add_attr_with_inline]: 0.00310435, [1] [Cycle 1]: 5.483e-05, [2] [tag_attr]: 1.511e-05 [meta_addattr_fg_expand]: 3.81001e-06 [parallel-infer-symbol]: 3.26999e-06 [pre_auto_parallel]: 2.797e-05 [insert-virtual-dataset]: 2.79999e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 2.48e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.00437873, [53] [py_interpret_to_execute]: 1.896e-05 [rewriter_before_opt_a]: 4.701e-05 [opt_a]: 0.00229415, [2] [Cycle 1]: 0.00160615, [45] [expand_dump_flag]: 3.23e-06 [switch_simplify]: 2.9e-05 [loop_unroll]: 1.391e-05 [a_1]: 0.00034105 [with_stream_mark]: 1.946e-05 [recompute_prepare]: 1.039e-05 [updatestate_depend_eliminate]: 4.02998e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 3.95998e-06 [parameter_eliminate]: 1.93002e-06 [a_2]: 9.779e-05 [accelerated_algorithm]: 7.53e-06 [shard]: 2.19001e-06 [meta_shard_fg_expand]: 2.46998e-06 [shard_inline]: 6.49001e-06 [merge_send_recv]: 9.44998e-06 [auto_parallel]: 7.8e-06 [parallel]: 1.981e-05 [flash_sp]: 9.37001e-06 [merge_comm]: 4.47e-06 [allreduce_fusion]: 3.54002e-06 [matmul_add_comm_reduction]: 1.1e-05 [allreduce_slice_to_reducescatter]: 5.8001e-07 [virtual_shard_identity]: 8.45001e-06 [virtual_dataset]: 6.64001e-06 [get_grad_eliminate_]: 6.28e-06 [virtual_output]: 5.77001e-06 [merge_forward]: 3.91999e-06 [cell_reuse_recompute_pass]: 1.38002e-06 [offload_activation]: 9.79999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.276e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.148e-05 [set_forward_comm_id_for_comm_node_pass]: 4.08999e-06 [meta_fg_expand]: 2.61e-06 [flash_sp_send_recv_attached]: 2.27999e-06 [receive_attached]: 2.37999e-06 [after_resolve]: 1.161e-05 [a_after_grad]: 9.15999e-06 [renormalize]: 0.00052838 [add_forward_monad_depend]: 5.79999e-06 [auto_monad_grad]: 2.44001e-06 [auto_monad_eliminator]: 1.471e-05 [cse]: 3.247e-05 [a_3]: 4.723e-05 [Cycle 2]: 0.00067825, [45] [expand_dump_flag]: 1.19e-06 [switch_simplify]: 7.05002e-06 [loop_unroll]: 5.78002e-06 [a_1]: 0.00012879 [with_stream_mark]: 1.086e-05 [recompute_prepare]: 7.58001e-06 [updatestate_depend_eliminate]: 2.96999e-06 [updatestate_assign_eliminate]: 2.69999e-06 [updatestate_loads_eliminate]: 3.28e-06 [parameter_eliminate]: 1.62001e-06 [a_2]: 8.301e-05 [accelerated_algorithm]: 6.93e-06 [shard]: 1.50001e-06 [meta_shard_fg_expand]: 1.66e-06 [shard_inline]: 6.33e-06 [merge_send_recv]: 6.33e-06 [auto_parallel]: 6.51999e-06 [parallel]: 6.28e-06 [flash_sp]: 3.3e-06 [merge_comm]: 3.45e-06 [allreduce_fusion]: 3.14001e-06 [matmul_add_comm_reduction]: 7.20998e-06 [allreduce_slice_to_reducescatter]: 2.99973e-07 [virtual_shard_identity]: 8.18999e-06 [virtual_dataset]: 5.40999e-06 [get_grad_eliminate_]: 5.40001e-06 [virtual_output]: 5.60001e-06 [merge_forward]: 3.8e-06 [cell_reuse_recompute_pass]: 1.68002e-06 [offload_activation]: 7.62998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.207e-05 [merge_recompute_call_nodes]: 1.12e-06 [before_grad]: 9.05001e-06 [set_forward_comm_id_for_comm_node_pass]: 4.55999e-06 [meta_fg_expand]: 2.16e-06 [flash_sp_send_recv_attached]: 8.99978e-07 [receive_attached]: 1.04998e-06 [after_resolve]: 1.039e-05 [a_after_grad]: 7.98999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.22001e-06 [auto_monad_grad]: 1.74e-06 [auto_monad_eliminator]: 8.77999e-06 [cse]: 1.882e-05 [a_3]: 3.61e-05 [py_interpret_to_execute_after_opt_a]: 9.89999e-06 [slice_cell_reuse_recomputed_activation]: 1.98997e-06 [rewriter_after_opt_a]: 3.698e-05 [convert_after_rewriter]: 6.58e-06 [order_py_execute_after_rewriter]: 5.22e-06 [mutable_eliminate]: 0.00054923 [opt_b]: 0.00020879, [1] [Cycle 1]: 0.00020227, [7] [b_1]: 0.00012074 [b_2]: 8.34002e-06 [updatestate_depend_eliminate]: 7.25e-06 [updatestate_assign_eliminate]: 2.56e-06 [updatestate_loads_eliminate]: 3.00998e-06 [renormalize]: 1.19e-06 [cse]: 2.245e-05 [optimize_parallel_all_gather_comm]: 1.716e-05 [overlap_param_gather]: 2.02001e-06 [cconv]: 2.92e-05 [loop_unroll]: 0.00045418 [opt_after_cconv]: 0.00013436, [1] [Cycle 1]: 0.00012773, [7] [c_1]: 2.754e-05 [parameter_eliminate]: 3.36001e-06 [updatestate_depend_eliminate]: 6.48e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 2.60002e-06 [cse]: 4.693e-05 [renormalize]: 6.69999e-07 [remove_dup_value]: 1.648e-05 [tuple_transform]: 7.37e-05, [1] [Cycle 1]: 6.878e-05, [4] [d_1]: 4.023e-05 [none_parameter_eliminate]: 2.02001e-06 [renormalize]: 2.60014e-07 [switch_simplify]: 7.14001e-06 [partial_unused_args_eliminate]: 1.77999e-06 [add_recomputation]: 4.767e-05 [cse_after_recomputation]: 2.463e-05, [1] [Cycle 1]: 1.916e-05, [1] [cse]: 1.309e-05 [environ_conv]: 5.66e-06 [swap_dp_allreduce_reducescatter]: 4.90999e-06 [bias_add_comm_swap]: 2.58e-06 [label_micro_interleaved_index]: 4.58001e-06 [label_fine_grained_interleaved_index]: 2.64999e-06 [merge_cast_opt]: 1.29998e-06 [slice_recompute_activation]: 1.91e-06 [micro_interleaved_order_control]: 2.24999e-06 [assign_add_opt]: 1.22e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 9.79984e-07 [full_micro_interleaved_order_control]: 2.51998e-06 [reorder_send_recv_between_fp_bp]: 3.08e-06 [comm_op_add_attrs]: 1.07e-06 [add_comm_op_reuse_tag]: 1.13001e-06 [interleave_split_concat_branches]: 1.19e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.37e-06 [overlap_opt_shard_grad_in_pipeline]: 2.19999e-06 [control_data_broadcast_order]: 1.318e-05 [grouped_pairwise_exchange_alltoall]: 1.49e-06 [offloading_packed_experts]: 4.31002e-06 [overlap_recompute_and_grad_model_parallel]: 4.90999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.36002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 1.99999e-06 [overlap_grad_ring_attention]: 4.11001e-06 [overlap_grad_flash_sp]: 1.934e-05 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 2.46e-06 [split_layernorm_comm]: 1.73002e-06 [handle_group_info]: 9.89996e-07 [symbol_engine_optimizer]: 7.46e-05, [1] [Cycle 1]: 6.996e-05, [6] [build]: 2.58e-06 [elim_shapecalc]: 1.004e-05 [elim_not_effective]: 1.208e-05 [opt_reshape]: 6.98e-06 [fold_const_symbol]: 9.59e-06 [renormalize]: 2.30008e-07 [detach_backward]: 1.89e-06 [pipeline_parallel_scheduler]: 1.67999e-06 [auto_monad_reorder]: 1.57e-05 [get_jit_bprop_graph]: 2.50002e-06 [rewriter_after_jit_bprop_graph]: 4.17003e-06 [opt_after_jit_grad]: 0.00046666 [validate]: 3.707e-05 Sums bootstrap : 0.000461s : 5.00% type_inference : 0.004595s : 49.80% event_method : 0.000011s : 0.12% auto_monad : 0.000053s : 0.58% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.04% pre_auto_parallel : 0.000028s : 0.30% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.03% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000019s : 0.21% optimize.rewriter_before_opt_a : 0.000047s : 0.51% optimize.opt_a.expand_dump_flag : 0.000004s : 0.05% optimize.opt_a.switch_simplify : 0.000036s : 0.39% optimize.opt_a.loop_unroll : 0.000020s : 0.21% optimize.opt_a.a_1 : 0.000470s : 5.09% optimize.opt_a.with_stream_mark : 0.000030s : 0.33% optimize.opt_a.recompute_prepare : 0.000018s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.08% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000181s : 1.96% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.16% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.14% optimize.opt_a.merge_send_recv : 0.000016s : 0.17% optimize.opt_a.auto_parallel : 0.000014s : 0.16% optimize.opt_a.parallel : 0.000026s : 0.28% optimize.opt_a.flash_sp : 0.000013s : 0.14% optimize.opt_a.merge_comm : 0.000008s : 0.09% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.20% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.18% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.13% optimize.opt_a.virtual_output : 0.000011s : 0.12% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000021s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.04% optimize.opt_a.after_resolve : 0.000022s : 0.24% optimize.opt_a.a_after_grad : 0.000017s : 0.19% optimize.opt_a.renormalize : 0.000528s : 5.73% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.09% optimize.opt_a.auto_monad_grad : 0.000004s : 0.05% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.25% optimize.opt_a.cse : 0.000051s : 0.56% optimize.opt_a.a_3 : 0.000083s : 0.90% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000037s : 0.40% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.06% optimize.mutable_eliminate : 0.000549s : 5.95% optimize.opt_b.b_1 : 0.000121s : 1.31% optimize.opt_b.b_2 : 0.000008s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000022s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.19% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000029s : 0.32% optimize.loop_unroll : 0.000454s : 4.92% optimize.opt_after_cconv.c_1 : 0.000028s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.04% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000047s : 0.51% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000016s : 0.18% optimize.tuple_transform.d_1 : 0.000040s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000048s : 0.52% optimize.cse_after_recomputation.cse : 0.000013s : 0.14% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000019s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.03% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000016s : 0.17% get_jit_bprop_graph : 0.000003s : 0.03% rewriter_after_jit_bprop_graph : 0.000004s : 0.05% opt_after_jit_grad : 0.000467s : 5.06% validate : 0.000037s : 0.40% Time group info: ------[substitution.] 0.000193 23 39.16% : 0.000075s : 4: substitution.arithmetic_simplify 0.93% : 0.000002s : 2: substitution.elim_not_effective 0.72% : 0.000001s : 2: substitution.fold_const_symbol 2.90% : 0.000006s : 3: substitution.graph_param_transform 49.95% : 0.000096s : 2: substitution.inline 2.10% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.31% : 0.000004s : 4: substitution.remove_not_recompute_node 1.93% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004549 2 91.16% : 0.004147s : 1: type_inference.infer 8.84% : 0.000402s : 1: type_inference.specialize ------[replace.] 0.000024 2 100.00% : 0.000024s : 2: replace.inline ------[match.] 0.000094 2 100.00% : 0.000094s : 2: match.inline ------[predicate.] 0.000136 754 0.79% : 0.000001s : 7: predicate.accumulaten_eliminater 0.91% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.64% : 0.000001s : 6: predicate.addn_check_dump 0.87% : 0.000001s : 7: predicate.addn_zero_filter 0.69% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.57% : 0.000003s : 13: predicate.arithmetic_simplify 0.83% : 0.000001s : 7: predicate.cast_eliminate 0.79% : 0.000001s : 6: predicate.check_bprop_eliminate 0.67% : 0.000001s : 6: predicate.compare_switch_simplify 0.24% : 0.000000s : 3: predicate.const_output_eliminate 0.74% : 0.000001s : 6: predicate.depend_value_elim 0.75% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.94% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.82% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.23% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 3: predicate.elim_not_effective 0.52% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000002s : 10: predicate.environ_add_const_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.04% : 0.000001s : 10: predicate.environ_get_depend_swap 1.85% : 0.000003s : 16: predicate.environ_get_eliminate 0.96% : 0.000001s : 10: predicate.environ_get_set_eliminate 1.02% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.00% : 0.000003s : 9: predicate.float_depend_g_call 0.72% : 0.000001s : 6: predicate.float_environ_get_switch 0.98% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 3: predicate.fold_const_symbol 0.85% : 0.000001s : 6: predicate.get_grad_eliminate 0.24% : 0.000000s : 3: predicate.graph_param_transform 0.82% : 0.000001s : 6: predicate.incorporate_call 0.65% : 0.000001s : 6: predicate.incorporate_call_switch 6.33% : 0.000009s : 34: predicate.inline 0.96% : 0.000001s : 6: predicate.inline_without_move 0.38% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.33% : 0.000002s : 6: predicate.less_batch_normalization 1.57% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.94% : 0.000003s : 20: predicate.load_eliminater 1.44% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.66% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.78% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.68% : 0.000001s : 6: predicate.merge_addn 0.68% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.66% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.68% : 0.000001s : 7: predicate.minmaximum_grad 1.63% : 0.000002s : 3: predicate.mutable_eliminate 0.74% : 0.000001s : 3: predicate.opt_reshape 0.49% : 0.000001s : 3: predicate.parallel_virtual_node 1.26% : 0.000002s : 9: predicate.partial_defer_inline 1.18% : 0.000002s : 10: predicate.partial_eliminate 0.79% : 0.000001s : 7: predicate.print_const_string_wrapper 0.71% : 0.000001s : 6: predicate.reduce_all_const_elim 1.38% : 0.000002s : 7: predicate.reduce_eliminate 1.99% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.63% : 0.000001s : 6: predicate.remove_not_recompute_node 1.28% : 0.000002s : 13: predicate.replace_applicator 0.92% : 0.000001s : 6: predicate.replace_old_param 0.71% : 0.000001s : 3: predicate.reset_defer_inline 0.79% : 0.000001s : 7: predicate.reshape_eliminate 0.70% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.57% : 0.000001s : 3: predicate.row_tensor_eliminate 1.09% : 0.000001s : 6: predicate.same_eliminate 0.54% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.13% : 0.000002s : 6: predicate.shard_identity_eliminate 0.90% : 0.000001s : 6: predicate.special_op_eliminate 0.93% : 0.000001s : 6: predicate.specialize_transform 1.18% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.98% : 0.000001s : 9: predicate.switch_defer_inline 1.83% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.18% : 0.000006s : 32: predicate.switch_simplify 0.80% : 0.000001s : 7: predicate.tile_eliminate 0.73% : 0.000001s : 7: predicate.transpose_eliminate 1.73% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.74% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.30% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.05% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.46% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.47% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.63% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.88% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.75% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 3: predicate.value_based_eliminate 0.71% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.75% : 0.000001s : 6: predicate.virtual_output_eliminate 0.36% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.54% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000209 5 9.16% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 90.84% : 0.000190s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022356 192 0.02% : 0.000004s : 1: ForceFp32Comm 13.95% : 0.003119s : 1: add_attr 13.90% : 0.003109s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000052s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000058s : 1: auto_monad 0.09% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.18% : 0.000488s : 1: bootstrap 0.15% : 0.000033s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000017s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.12% : 0.000028s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.08% : 0.000017s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000006s : 1: get_jit_bprop_graph 0.04% : 0.000008s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000005s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 2.07% : 0.000463s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.50% : 0.000559s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000017s : 1: opt.transform.mutable_eliminate 3.90% : 0.000873s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.10% : 0.000022s : 1: opt.transform.opt_after_jit_grad 0.43% : 0.000095s : 28: opt.transform.opt_b 0.20% : 0.000045s : 2: opt.transform.opt_trans_graph 0.16% : 0.000035s : 4: opt.transform.symbol_engine_opt 10.28% : 0.002298s : 1: opt_a 0.62% : 0.000139s : 1: opt_after_cconv 2.13% : 0.000475s : 1: opt_after_jit_grad 0.95% : 0.000212s : 1: opt_b 19.61% : 0.004383s : 1: optimize 0.09% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000033s : 1: pre_auto_parallel 0.10% : 0.000023s : 1: py_interpret_to_execute 0.06% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000020s : 1: remove_dup_value 1.27% : 0.000283s : 1: renormalize.infer 1.06% : 0.000237s : 1: renormalize.specialize 0.03% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000042s : 1: rewriter_after_opt_a 0.23% : 0.000051s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000077s : 1: symbol_engine_optimizer 0.34% : 0.000077s : 1: tuple_transform 20.62% : 0.004609s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:45.828.609 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:45.828.869 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0139747, [21] [bootstrap]: 0.00044065 [type_inference]: 0.00458479 [event_method]: 1.127e-05 [auto_monad]: 5.492e-05 [graph_reusing]: 5.44e-06 [inline]: 2.32999e-06 [add_attr]: 0.00305712, [1] [add_attr_with_inline]: 0.0030485, [1] [Cycle 1]: 6.488e-05, [2] [tag_attr]: 1.382e-05 [meta_addattr_fg_expand]: 3.51999e-06 [parallel-infer-symbol]: 3.11999e-06 [pre_auto_parallel]: 2.435e-05 [insert-virtual-dataset]: 2.28002e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 2.21e-06 [pipeline_split]: 1.51002e-06 [optimize]: 0.0046862, [53] [py_interpret_to_execute]: 1.933e-05 [rewriter_before_opt_a]: 4.738e-05 [opt_a]: 0.00247615, [2] [Cycle 1]: 0.00162475, [45] [expand_dump_flag]: 3.13998e-06 [switch_simplify]: 2.441e-05 [loop_unroll]: 1.386e-05 [a_1]: 0.00031482 [with_stream_mark]: 1.586e-05 [recompute_prepare]: 8.25e-06 [updatestate_depend_eliminate]: 4.08999e-06 [updatestate_assign_eliminate]: 3.33e-06 [updatestate_loads_eliminate]: 3.46999e-06 [parameter_eliminate]: 1.91e-06 [a_2]: 0.00012085 [accelerated_algorithm]: 7.56999e-06 [shard]: 2.19999e-06 [meta_shard_fg_expand]: 1.86e-06 [shard_inline]: 6.61e-06 [merge_send_recv]: 7.82e-06 [auto_parallel]: 6.66e-06 [parallel]: 1.726e-05 [flash_sp]: 7.51999e-06 [merge_comm]: 4.10998e-06 [allreduce_fusion]: 3.56999e-06 [matmul_add_comm_reduction]: 9.84001e-06 [allreduce_slice_to_reducescatter]: 5.69999e-07 [virtual_shard_identity]: 8.17e-06 [virtual_dataset]: 6.83998e-06 [get_grad_eliminate_]: 5.89e-06 [virtual_output]: 6.16e-06 [merge_forward]: 4e-06 [cell_reuse_recompute_pass]: 1.35999e-06 [offload_activation]: 9.15999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.493e-05 [merge_recompute_call_nodes]: 1.70001e-06 [before_grad]: 1.08e-05 [set_forward_comm_id_for_comm_node_pass]: 4.23001e-06 [meta_fg_expand]: 2.52001e-06 [flash_sp_send_recv_attached]: 2.54999e-06 [receive_attached]: 2.76999e-06 [after_resolve]: 1.044e-05 [a_after_grad]: 1.008e-05 [renormalize]: 0.00043511 [add_forward_monad_depend]: 5.35999e-06 [auto_monad_grad]: 2.66e-06 [auto_monad_eliminator]: 1.558e-05 [cse]: 2.994e-05 [a_3]: 6.05e-05 [Cycle 2]: 0.00083856, [45] [expand_dump_flag]: 1.04e-06 [switch_simplify]: 7.5e-06 [loop_unroll]: 6.14999e-06 [a_1]: 0.00012973 [with_stream_mark]: 1.271e-05 [recompute_prepare]: 7.2e-06 [updatestate_depend_eliminate]: 3.45e-06 [updatestate_assign_eliminate]: 2.89999e-06 [updatestate_loads_eliminate]: 2.66999e-06 [parameter_eliminate]: 1.49e-06 [a_2]: 0.00011013 [accelerated_algorithm]: 6.19001e-06 [shard]: 1.12999e-06 [meta_shard_fg_expand]: 1.62999e-06 [shard_inline]: 6.54001e-06 [merge_send_recv]: 5.69e-06 [auto_parallel]: 5.67001e-06 [parallel]: 5.39e-06 [flash_sp]: 3.23998e-06 [merge_comm]: 3.55e-06 [allreduce_fusion]: 3.53e-06 [matmul_add_comm_reduction]: 7.09001e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 7.15e-06 [virtual_dataset]: 5.82999e-06 [get_grad_eliminate_]: 5.52001e-06 [virtual_output]: 5.35999e-06 [merge_forward]: 3.27002e-06 [cell_reuse_recompute_pass]: 1.48002e-06 [offload_activation]: 7.66001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.35e-05 [merge_recompute_call_nodes]: 1.06002e-06 [before_grad]: 9.17001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.79002e-06 [meta_fg_expand]: 2.34999e-06 [flash_sp_send_recv_attached]: 1.03001e-06 [receive_attached]: 9.70002e-07 [after_resolve]: 8.75001e-06 [a_after_grad]: 7.83001e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.87999e-06 [auto_monad_grad]: 1.07998e-06 [auto_monad_eliminator]: 8.72e-06 [cse]: 1.755e-05 [a_3]: 4.906e-05 [py_interpret_to_execute_after_opt_a]: 1.342e-05 [slice_cell_reuse_recomputed_activation]: 5.22e-06 [rewriter_after_opt_a]: 3.862e-05 [convert_after_rewriter]: 9.24e-06 [order_py_execute_after_rewriter]: 8.18999e-06 [mutable_eliminate]: 0.00053565 [opt_b]: 0.00026663, [1] [Cycle 1]: 0.00025815, [7] [b_1]: 0.00016396 [b_2]: 7.46999e-06 [updatestate_depend_eliminate]: 6.49999e-06 [updatestate_assign_eliminate]: 2.46998e-06 [updatestate_loads_eliminate]: 2.27999e-06 [renormalize]: 4.7998e-07 [cse]: 1.89e-05 [optimize_parallel_all_gather_comm]: 1.861e-05 [overlap_param_gather]: 4.96002e-06 [cconv]: 2.766e-05 [loop_unroll]: 0.00043296 [opt_after_cconv]: 0.00012526, [1] [Cycle 1]: 0.00011684, [7] [c_1]: 2.817e-05 [parameter_eliminate]: 3.81999e-06 [updatestate_depend_eliminate]: 6.01998e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.68e-06 [cse]: 1.836e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 1.706e-05 [tuple_transform]: 8.677e-05, [1] [Cycle 1]: 7.964e-05, [4] [d_1]: 4.097e-05 [none_parameter_eliminate]: 1.61998e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 6.94001e-06 [partial_unused_args_eliminate]: 4.58001e-06 [add_recomputation]: 4.876e-05 [cse_after_recomputation]: 2.732e-05, [1] [Cycle 1]: 2.045e-05, [1] [cse]: 1.145e-05 [environ_conv]: 7.53999e-06 [swap_dp_allreduce_reducescatter]: 7.78999e-06 [bias_add_comm_swap]: 5.39e-06 [label_micro_interleaved_index]: 6.36998e-06 [label_fine_grained_interleaved_index]: 5.36002e-06 [merge_cast_opt]: 3.58999e-06 [slice_recompute_activation]: 4.33999e-06 [micro_interleaved_order_control]: 4.79e-06 [assign_add_opt]: 3.65998e-06 [ForceFp32Comm]: 3.21001e-06 [remove_cast_before_assign_add]: 3.56001e-06 [full_micro_interleaved_order_control]: 4.76002e-06 [reorder_send_recv_between_fp_bp]: 5.00999e-06 [comm_op_add_attrs]: 3.76999e-06 [add_comm_op_reuse_tag]: 3.22002e-06 [interleave_split_concat_branches]: 3.49001e-06 [interleave_parallel_branches]: 3.46999e-06 [overlap_opt_shard_in_pipeline]: 3.67002e-06 [overlap_opt_shard_grad_in_pipeline]: 3.89002e-06 [control_data_broadcast_order]: 1.43e-05 [grouped_pairwise_exchange_alltoall]: 3.66001e-06 [offloading_packed_experts]: 6.81001e-06 [overlap_recompute_and_grad_model_parallel]: 6.70002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.58999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.70998e-06 [overlap_recompute_comm]: 4.79e-06 [overlap_grad_ring_attention]: 6.24999e-06 [overlap_grad_flash_sp]: 2.004e-05 [begin_end_overlap_inline]: 2.83e-06 [split_matmul_comm_elemetwise]: 4.52e-06 [split_layernorm_comm]: 4.01001e-06 [handle_group_info]: 3.45e-06 [symbol_engine_optimizer]: 9.471e-05, [1] [Cycle 1]: 8.815e-05, [6] [build]: 2.43e-06 [elim_shapecalc]: 9.56003e-06 [elim_not_effective]: 1.276e-05 [opt_reshape]: 6.84999e-06 [fold_const_symbol]: 1.029e-05 [renormalize]: 2.50002e-07 [detach_backward]: 3.35e-06 [pipeline_parallel_scheduler]: 1.65001e-06 [auto_monad_reorder]: 1.828e-05 [get_jit_bprop_graph]: 1.66998e-06 [rewriter_after_jit_bprop_graph]: 3.33998e-06 [opt_after_jit_grad]: 0.00049329 [validate]: 3.59e-05 Sums bootstrap : 0.000441s : 4.78% type_inference : 0.004585s : 49.74% event_method : 0.000011s : 0.12% auto_monad : 0.000055s : 0.60% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000024s : 0.26% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000019s : 0.21% optimize.rewriter_before_opt_a : 0.000047s : 0.51% optimize.opt_a.expand_dump_flag : 0.000004s : 0.05% optimize.opt_a.switch_simplify : 0.000032s : 0.35% optimize.opt_a.loop_unroll : 0.000020s : 0.22% optimize.opt_a.a_1 : 0.000445s : 4.82% optimize.opt_a.with_stream_mark : 0.000029s : 0.31% optimize.opt_a.recompute_prepare : 0.000015s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.04% optimize.opt_a.a_2 : 0.000231s : 2.51% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.15% optimize.opt_a.shard : 0.000003s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.14% optimize.opt_a.merge_send_recv : 0.000014s : 0.15% optimize.opt_a.auto_parallel : 0.000012s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.25% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.17% optimize.opt_a.virtual_dataset : 0.000013s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000019s : 0.21% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000435s : 4.72% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.26% optimize.opt_a.cse : 0.000047s : 0.52% optimize.opt_a.a_3 : 0.000110s : 1.19% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.06% optimize.rewriter_after_opt_a : 0.000039s : 0.42% optimize.convert_after_rewriter : 0.000009s : 0.10% optimize.order_py_execute_after_rewriter : 0.000008s : 0.09% optimize.mutable_eliminate : 0.000536s : 5.81% optimize.opt_b.b_1 : 0.000164s : 1.78% optimize.opt_b.b_2 : 0.000007s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.01% optimize.opt_b.cse : 0.000019s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000028s : 0.30% optimize.loop_unroll : 0.000433s : 4.70% optimize.opt_after_cconv.c_1 : 0.000028s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.19% optimize.tuple_transform.d_1 : 0.000041s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000049s : 0.53% optimize.cse_after_recomputation.cse : 0.000011s : 0.12% optimize.environ_conv : 0.000008s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.06% optimize.label_micro_interleaved_index : 0.000006s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.06% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000014s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000006s : 0.07% optimize.overlap_grad_flash_sp : 0.000020s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.04% optimize.symbol_engine_optimizer.build : 0.000002s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000003s : 0.04% opt_after_jit_grad : 0.000493s : 5.35% validate : 0.000036s : 0.39% Time group info: ------[substitution.] 0.000176 23 39.71% : 0.000070s : 4: substitution.arithmetic_simplify 0.98% : 0.000002s : 2: substitution.elim_not_effective 0.76% : 0.000001s : 2: substitution.fold_const_symbol 3.25% : 0.000006s : 3: substitution.graph_param_transform 48.37% : 0.000085s : 2: substitution.inline 1.96% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.98% : 0.000005s : 4: substitution.remove_not_recompute_node 1.99% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004542 2 91.13% : 0.004139s : 1: type_inference.infer 8.87% : 0.000403s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000083 2 100.00% : 0.000083s : 2: match.inline ------[predicate.] 0.000134 754 0.86% : 0.000001s : 7: predicate.accumulaten_eliminater 1.18% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.66% : 0.000001s : 6: predicate.addn_check_dump 0.77% : 0.000001s : 7: predicate.addn_zero_filter 0.70% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.66% : 0.000004s : 13: predicate.arithmetic_simplify 0.81% : 0.000001s : 7: predicate.cast_eliminate 0.77% : 0.000001s : 6: predicate.check_bprop_eliminate 0.67% : 0.000001s : 6: predicate.compare_switch_simplify 0.25% : 0.000000s : 3: predicate.const_output_eliminate 0.74% : 0.000001s : 6: predicate.depend_value_elim 0.77% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.94% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.75% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.16% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 3: predicate.elim_not_effective 0.53% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.09% : 0.000001s : 10: predicate.environ_get_depend_swap 1.72% : 0.000002s : 16: predicate.environ_get_eliminate 1.03% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.91% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.95% : 0.000003s : 9: predicate.float_depend_g_call 0.68% : 0.000001s : 6: predicate.float_environ_get_switch 1.03% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 3: predicate.fold_const_symbol 0.82% : 0.000001s : 6: predicate.get_grad_eliminate 0.40% : 0.000001s : 3: predicate.graph_param_transform 0.85% : 0.000001s : 6: predicate.incorporate_call 0.65% : 0.000001s : 6: predicate.incorporate_call_switch 6.66% : 0.000009s : 34: predicate.inline 1.09% : 0.000001s : 6: predicate.inline_without_move 0.41% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.07% : 0.000001s : 6: predicate.less_batch_normalization 1.46% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.13% : 0.000003s : 20: predicate.load_eliminater 1.17% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.62% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.79% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.71% : 0.000001s : 6: predicate.merge_addn 0.77% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.83% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.65% : 0.000001s : 7: predicate.minmaximum_grad 1.28% : 0.000002s : 3: predicate.mutable_eliminate 0.41% : 0.000001s : 3: predicate.opt_reshape 0.43% : 0.000001s : 3: predicate.parallel_virtual_node 1.20% : 0.000002s : 9: predicate.partial_defer_inline 1.27% : 0.000002s : 10: predicate.partial_eliminate 1.06% : 0.000001s : 7: predicate.print_const_string_wrapper 1.04% : 0.000001s : 6: predicate.reduce_all_const_elim 1.03% : 0.000001s : 7: predicate.reduce_eliminate 2.06% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.64% : 0.000001s : 6: predicate.remove_not_recompute_node 1.32% : 0.000002s : 13: predicate.replace_applicator 0.69% : 0.000001s : 6: predicate.replace_old_param 0.49% : 0.000001s : 3: predicate.reset_defer_inline 0.80% : 0.000001s : 7: predicate.reshape_eliminate 0.80% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 3: predicate.row_tensor_eliminate 1.04% : 0.000001s : 6: predicate.same_eliminate 0.60% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.96% : 0.000001s : 6: predicate.shard_identity_eliminate 0.95% : 0.000001s : 6: predicate.special_op_eliminate 1.01% : 0.000001s : 6: predicate.specialize_transform 1.12% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.98% : 0.000001s : 9: predicate.switch_defer_inline 1.78% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.44% : 0.000006s : 32: predicate.switch_simplify 0.80% : 0.000001s : 7: predicate.tile_eliminate 0.75% : 0.000001s : 7: predicate.transpose_eliminate 1.46% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.29% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.49% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.43% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.45% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.98% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.88% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.56% : 0.000001s : 3: predicate.value_based_eliminate 0.78% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.78% : 0.000001s : 6: predicate.virtual_output_eliminate 0.33% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.70% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000195 5 7.89% : 0.000015s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.11% : 0.000179s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023127 192 0.03% : 0.000006s : 1: ForceFp32Comm 13.26% : 0.003066s : 1: add_attr 13.20% : 0.003053s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000052s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.27% : 0.000063s : 1: auto_monad 0.11% : 0.000025s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.04% : 0.000008s : 1: bias_add_comm_swap 2.10% : 0.000485s : 1: bootstrap 0.13% : 0.000031s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000017s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000030s : 1: cse_after_recomputation 0.04% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000019s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.09% : 0.000020s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000011s : 1: graph_reusing 0.03% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.04% : 0.000008s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000009s : 1: label_micro_interleaved_index 1.90% : 0.000439s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.34% : 0.000542s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 3.63% : 0.000840s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.11% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.42% : 0.000097s : 28: opt.transform.opt_b 0.20% : 0.000045s : 2: opt.transform.opt_trans_graph 0.15% : 0.000036s : 4: opt.transform.symbol_engine_opt 10.72% : 0.002479s : 1: opt_a 0.56% : 0.000129s : 1: opt_after_cconv 2.18% : 0.000504s : 1: opt_after_jit_grad 1.17% : 0.000270s : 1: opt_b 21.64% : 0.005004s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000023s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.04% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000008s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000032s : 1: pre_auto_parallel 0.10% : 0.000023s : 1: py_interpret_to_execute 0.07% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000020s : 1: remove_dup_value 0.99% : 0.000230s : 1: renormalize.infer 0.86% : 0.000198s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000042s : 1: rewriter_after_opt_a 0.22% : 0.000051s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000098s : 1: symbol_engine_optimizer 0.39% : 0.000090s : 1: tuple_transform 19.92% : 0.004607s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:46.168.44 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0129152, [21] [bootstrap]: 0.00043273 [type_inference]: 0.00465309 [event_method]: 1.12e-05 [auto_monad]: 5.485e-05 [graph_reusing]: 5.04e-06 [inline]: 2.81999e-06 [add_attr]: 0.00305713, [1] [add_attr_with_inline]: 0.00304931, [1] [Cycle 1]: 5.129e-05, [2] [tag_attr]: 1.393e-05 [meta_addattr_fg_expand]: 3.78999e-06 [parallel-infer-symbol]: 3.3e-06 [pre_auto_parallel]: 2.365e-05 [insert-virtual-dataset]: 2.49999e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 2.36e-06 [pipeline_split]: 1.54e-06 [optimize]: 0.00400983, [53] [py_interpret_to_execute]: 1.632e-05 [rewriter_before_opt_a]: 4.507e-05 [opt_a]: 0.00214015, [2] [Cycle 1]: 0.00150024, [45] [expand_dump_flag]: 2.74999e-06 [switch_simplify]: 2.597e-05 [loop_unroll]: 1.352e-05 [a_1]: 0.00031301 [with_stream_mark]: 1.501e-05 [recompute_prepare]: 8.03001e-06 [updatestate_depend_eliminate]: 3.85e-06 [updatestate_assign_eliminate]: 3.55e-06 [updatestate_loads_eliminate]: 3.66999e-06 [parameter_eliminate]: 1.82001e-06 [a_2]: 9.095e-05 [accelerated_algorithm]: 6.76e-06 [shard]: 2.17001e-06 [meta_shard_fg_expand]: 1.75001e-06 [shard_inline]: 5.89999e-06 [merge_send_recv]: 7.78001e-06 [auto_parallel]: 6.12999e-06 [parallel]: 1.782e-05 [flash_sp]: 7.59002e-06 [merge_comm]: 3.66999e-06 [allreduce_fusion]: 3.51999e-06 [matmul_add_comm_reduction]: 9.07001e-06 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 7.83001e-06 [virtual_dataset]: 6.87002e-06 [get_grad_eliminate_]: 5.86003e-06 [virtual_output]: 5.69e-06 [merge_forward]: 3.67998e-06 [cell_reuse_recompute_pass]: 1.21997e-06 [offload_activation]: 9.04e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.164e-05 [merge_recompute_call_nodes]: 1.79998e-06 [before_grad]: 9.99999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.54002e-06 [meta_fg_expand]: 2.53e-06 [flash_sp_send_recv_attached]: 2.42001e-06 [receive_attached]: 2.29001e-06 [after_resolve]: 9.94999e-06 [a_after_grad]: 8.84e-06 [renormalize]: 0.0005217 [add_forward_monad_depend]: 5.05001e-06 [auto_monad_grad]: 2.11e-06 [auto_monad_eliminator]: 1.372e-05 [cse]: 2.888e-05 [a_3]: 4.399e-05 [Cycle 2]: 0.00063085, [45] [expand_dump_flag]: 9.89996e-07 [switch_simplify]: 6.88e-06 [loop_unroll]: 5.64e-06 [a_1]: 0.00012557 [with_stream_mark]: 1.322e-05 [recompute_prepare]: 6.53003e-06 [updatestate_depend_eliminate]: 2.91999e-06 [updatestate_assign_eliminate]: 2.23002e-06 [updatestate_loads_eliminate]: 2.86999e-06 [parameter_eliminate]: 9.89996e-07 [a_2]: 8.001e-05 [accelerated_algorithm]: 5.92999e-06 [shard]: 1.07e-06 [meta_shard_fg_expand]: 1.33002e-06 [shard_inline]: 6.02999e-06 [merge_send_recv]: 5.08002e-06 [auto_parallel]: 5.92999e-06 [parallel]: 4.4e-06 [flash_sp]: 3.66999e-06 [merge_comm]: 3.11999e-06 [allreduce_fusion]: 2.84001e-06 [matmul_add_comm_reduction]: 5.75001e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 6.14001e-06 [virtual_dataset]: 5.49e-06 [get_grad_eliminate_]: 5.44e-06 [virtual_output]: 5.35001e-06 [merge_forward]: 2.61e-06 [cell_reuse_recompute_pass]: 1.27e-06 [offload_activation]: 6.34999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.05e-05 [merge_recompute_call_nodes]: 7.50006e-07 [before_grad]: 8.52998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.56001e-06 [meta_fg_expand]: 1.94999e-06 [flash_sp_send_recv_attached]: 8.49977e-07 [receive_attached]: 1.07998e-06 [after_resolve]: 8.25e-06 [a_after_grad]: 7.78999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.25001e-06 [auto_monad_grad]: 9.5999e-07 [auto_monad_eliminator]: 7.25e-06 [cse]: 1.487e-05 [a_3]: 3.422e-05 [py_interpret_to_execute_after_opt_a]: 7.53e-06 [slice_cell_reuse_recomputed_activation]: 2.43e-06 [rewriter_after_opt_a]: 3.473e-05 [convert_after_rewriter]: 6.53e-06 [order_py_execute_after_rewriter]: 5.18002e-06 [mutable_eliminate]: 0.00045781 [opt_b]: 0.00019674, [1] [Cycle 1]: 0.00019063, [7] [b_1]: 0.00011736 [b_2]: 7.15e-06 [updatestate_depend_eliminate]: 5.12e-06 [updatestate_assign_eliminate]: 2.37001e-06 [updatestate_loads_eliminate]: 2.31e-06 [renormalize]: 3.69997e-07 [cse]: 1.927e-05 [optimize_parallel_all_gather_comm]: 1.621e-05 [overlap_param_gather]: 2.02001e-06 [cconv]: 2.384e-05 [loop_unroll]: 0.00042471 [opt_after_cconv]: 9.824e-05, [1] [Cycle 1]: 9.269e-05, [7] [c_1]: 2.72e-05 [parameter_eliminate]: 2.99999e-06 [updatestate_depend_eliminate]: 4.89e-06 [updatestate_assign_eliminate]: 2.24999e-06 [updatestate_loads_eliminate]: 2.43e-06 [cse]: 1.845e-05 [renormalize]: 5.69999e-07 [remove_dup_value]: 1.408e-05 [tuple_transform]: 6.992e-05, [1] [Cycle 1]: 6.563e-05, [4] [d_1]: 3.905e-05 [none_parameter_eliminate]: 1.55999e-06 [renormalize]: 1.70025e-07 [switch_simplify]: 6.36e-06 [partial_unused_args_eliminate]: 1.79998e-06 [add_recomputation]: 4.377e-05 [cse_after_recomputation]: 2.099e-05, [1] [Cycle 1]: 1.663e-05, [1] [cse]: 1.133e-05 [environ_conv]: 5.36998e-06 [swap_dp_allreduce_reducescatter]: 4.72e-06 [bias_add_comm_swap]: 2.88e-06 [label_micro_interleaved_index]: 4.13999e-06 [label_fine_grained_interleaved_index]: 2.80997e-06 [merge_cast_opt]: 1.57999e-06 [slice_recompute_activation]: 2.21e-06 [micro_interleaved_order_control]: 2.57001e-06 [assign_add_opt]: 1.22999e-06 [ForceFp32Comm]: 1.08001e-06 [remove_cast_before_assign_add]: 1.39e-06 [full_micro_interleaved_order_control]: 2.04999e-06 [reorder_send_recv_between_fp_bp]: 2.98e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 9.60019e-07 [interleave_split_concat_branches]: 1.19e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.19e-06 [overlap_opt_shard_grad_in_pipeline]: 1.97001e-06 [control_data_broadcast_order]: 1.208e-05 [grouped_pairwise_exchange_alltoall]: 1.43002e-06 [offloading_packed_experts]: 3.9e-06 [overlap_recompute_and_grad_model_parallel]: 4.66002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.07998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.38002e-06 [overlap_recompute_comm]: 1.92999e-06 [overlap_grad_ring_attention]: 3.98001e-06 [overlap_grad_flash_sp]: 1.874e-05 [begin_end_overlap_inline]: 4.80009e-07 [split_matmul_comm_elemetwise]: 2.04999e-06 [split_layernorm_comm]: 2.14999e-06 [handle_group_info]: 1.00999e-06 [symbol_engine_optimizer]: 7.213e-05, [1] [Cycle 1]: 6.797e-05, [6] [build]: 2.52001e-06 [elim_shapecalc]: 9.35001e-06 [elim_not_effective]: 1.247e-05 [opt_reshape]: 6.50002e-06 [fold_const_symbol]: 9.49e-06 [renormalize]: 2.3999e-07 [detach_backward]: 2.17999e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 1.605e-05 [get_jit_bprop_graph]: 1.27999e-06 [rewriter_after_jit_bprop_graph]: 4.55001e-06 [opt_after_jit_grad]: 0.00045064 [validate]: 3.704e-05 Sums bootstrap : 0.000433s : 4.85% type_inference : 0.004653s : 52.12% event_method : 0.000011s : 0.13% auto_monad : 0.000055s : 0.61% graph_reusing : 0.000005s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.04% pre_auto_parallel : 0.000024s : 0.26% insert-virtual-dataset : 0.000002s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.03% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000016s : 0.18% optimize.rewriter_before_opt_a : 0.000045s : 0.50% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.37% optimize.opt_a.loop_unroll : 0.000019s : 0.21% optimize.opt_a.a_1 : 0.000439s : 4.91% optimize.opt_a.with_stream_mark : 0.000028s : 0.32% optimize.opt_a.recompute_prepare : 0.000015s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000171s : 1.92% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.14% optimize.opt_a.shard : 0.000003s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.13% optimize.opt_a.merge_send_recv : 0.000013s : 0.14% optimize.opt_a.auto_parallel : 0.000012s : 0.14% optimize.opt_a.parallel : 0.000022s : 0.25% optimize.opt_a.flash_sp : 0.000011s : 0.13% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000006s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.13% optimize.opt_a.virtual_output : 0.000011s : 0.12% optimize.opt_a.merge_forward : 0.000006s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.03% optimize.opt_a.offload_activation : 0.000015s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000022s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000019s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000004s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.04% optimize.opt_a.after_resolve : 0.000018s : 0.20% optimize.opt_a.a_after_grad : 0.000017s : 0.19% optimize.opt_a.renormalize : 0.000522s : 5.84% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.23% optimize.opt_a.cse : 0.000044s : 0.49% optimize.opt_a.a_3 : 0.000078s : 0.88% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.03% optimize.rewriter_after_opt_a : 0.000035s : 0.39% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.06% optimize.mutable_eliminate : 0.000458s : 5.13% optimize.opt_b.b_1 : 0.000117s : 1.31% optimize.opt_b.b_2 : 0.000007s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000019s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.27% optimize.loop_unroll : 0.000425s : 4.76% optimize.opt_after_cconv.c_1 : 0.000027s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.21% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000014s : 0.16% optimize.tuple_transform.d_1 : 0.000039s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000044s : 0.49% optimize.cse_after_recomputation.cse : 0.000011s : 0.13% optimize.environ_conv : 0.000005s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.02% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.03% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.02% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.02% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000019s : 0.21% optimize.begin_end_overlap_inline : 0.000000s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000016s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000451s : 5.05% validate : 0.000037s : 0.41% Time group info: ------[substitution.] 0.000166 23 38.40% : 0.000064s : 4: substitution.arithmetic_simplify 1.20% : 0.000002s : 2: substitution.elim_not_effective 0.83% : 0.000001s : 2: substitution.fold_const_symbol 3.41% : 0.000006s : 3: substitution.graph_param_transform 49.58% : 0.000082s : 2: substitution.inline 2.04% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.77% : 0.000005s : 4: substitution.remove_not_recompute_node 1.78% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004606 2 91.03% : 0.004193s : 1: type_inference.infer 8.97% : 0.000413s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000081 2 100.00% : 0.000081s : 2: match.inline ------[predicate.] 0.000131 754 0.82% : 0.000001s : 7: predicate.accumulaten_eliminater 0.95% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.64% : 0.000001s : 6: predicate.addn_check_dump 0.87% : 0.000001s : 7: predicate.addn_zero_filter 0.72% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.79% : 0.000004s : 13: predicate.arithmetic_simplify 0.85% : 0.000001s : 7: predicate.cast_eliminate 0.75% : 0.000001s : 6: predicate.check_bprop_eliminate 0.74% : 0.000001s : 6: predicate.compare_switch_simplify 0.20% : 0.000000s : 3: predicate.const_output_eliminate 0.75% : 0.000001s : 6: predicate.depend_value_elim 0.82% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.89% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.94% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.22% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 3: predicate.elim_not_effective 0.52% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.06% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.02% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.05% : 0.000001s : 10: predicate.environ_get_depend_swap 1.82% : 0.000002s : 16: predicate.environ_get_eliminate 1.03% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.92% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.23% : 0.000003s : 9: predicate.float_depend_g_call 0.66% : 0.000001s : 6: predicate.float_environ_get_switch 0.93% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.27% : 0.000000s : 3: predicate.fold_const_symbol 0.84% : 0.000001s : 6: predicate.get_grad_eliminate 0.41% : 0.000001s : 3: predicate.graph_param_transform 0.84% : 0.000001s : 6: predicate.incorporate_call 0.70% : 0.000001s : 6: predicate.incorporate_call_switch 6.24% : 0.000008s : 34: predicate.inline 0.98% : 0.000001s : 6: predicate.inline_without_move 0.40% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.00% : 0.000001s : 6: predicate.less_batch_normalization 1.89% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.34% : 0.000003s : 20: predicate.load_eliminater 1.26% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.59% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.77% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.66% : 0.000001s : 6: predicate.merge_addn 0.75% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.67% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.71% : 0.000001s : 7: predicate.minmaximum_grad 1.50% : 0.000002s : 3: predicate.mutable_eliminate 0.43% : 0.000001s : 3: predicate.opt_reshape 0.42% : 0.000001s : 3: predicate.parallel_virtual_node 1.37% : 0.000002s : 9: predicate.partial_defer_inline 1.31% : 0.000002s : 10: predicate.partial_eliminate 0.89% : 0.000001s : 7: predicate.print_const_string_wrapper 0.67% : 0.000001s : 6: predicate.reduce_all_const_elim 1.08% : 0.000001s : 7: predicate.reduce_eliminate 2.08% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 6: predicate.remove_not_recompute_node 1.17% : 0.000002s : 13: predicate.replace_applicator 0.66% : 0.000001s : 6: predicate.replace_old_param 0.37% : 0.000000s : 3: predicate.reset_defer_inline 0.80% : 0.000001s : 7: predicate.reshape_eliminate 0.73% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 3: predicate.row_tensor_eliminate 1.07% : 0.000001s : 6: predicate.same_eliminate 0.55% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.89% : 0.000001s : 6: predicate.shard_identity_eliminate 0.94% : 0.000001s : 6: predicate.special_op_eliminate 0.93% : 0.000001s : 6: predicate.specialize_transform 1.03% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.86% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.47% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.03% : 0.000001s : 9: predicate.switch_defer_inline 1.73% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.38% : 0.000006s : 32: predicate.switch_simplify 0.79% : 0.000001s : 7: predicate.tile_eliminate 0.79% : 0.000001s : 7: predicate.transpose_eliminate 1.53% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.73% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.32% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.37% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.38% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.47% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.51% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.98% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.92% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 3: predicate.value_based_eliminate 0.98% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.75% : 0.000001s : 6: predicate.virtual_output_eliminate 0.37% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.54% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000201 5 7.95% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.05% : 0.000185s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.021443 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.28% : 0.003062s : 1: add_attr 14.24% : 0.003053s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000048s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.28% : 0.000060s : 1: auto_monad 0.09% : 0.000020s : 1: auto_monad_reorder 0.02% : 0.000003s : 1: begin_end_overlap_inline 0.03% : 0.000006s : 1: bias_add_comm_swap 2.15% : 0.000460s : 1: bootstrap 0.13% : 0.000027s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000015s : 1: control_data_broadcast_order 0.05% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.03% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.08% : 0.000017s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000008s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.03% : 0.000006s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 2.01% : 0.000432s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.03% : 0.000005s : 1: micro_interleaved_order_control 2.17% : 0.000466s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000014s : 1: opt.transform.mutable_eliminate 3.79% : 0.000812s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.10% : 0.000022s : 1: opt.transform.opt_after_jit_grad 0.44% : 0.000094s : 28: opt.transform.opt_b 0.20% : 0.000043s : 2: opt.transform.opt_trans_graph 0.16% : 0.000034s : 4: opt.transform.symbol_engine_opt 9.99% : 0.002143s : 1: opt_a 0.47% : 0.000102s : 1: opt_after_cconv 2.14% : 0.000459s : 1: opt_after_jit_grad 0.93% : 0.000200s : 1: opt_b 18.72% : 0.004014s : 1: optimize 0.09% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000022s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000028s : 1: pre_auto_parallel 0.09% : 0.000020s : 1: py_interpret_to_execute 0.05% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000018s : 1: remove_dup_value 1.36% : 0.000291s : 1: renormalize.infer 1.04% : 0.000224s : 1: renormalize.specialize 0.03% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000039s : 1: rewriter_after_opt_a 0.23% : 0.000049s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000075s : 1: symbol_engine_optimizer 0.34% : 0.000073s : 1: tuple_transform 21.77% : 0.004668s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:46.206.733 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:46.206.990 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0139522, [21] [bootstrap]: 0.00043532 [type_inference]: 0.00463004 [event_method]: 1.22e-05 [auto_monad]: 5.259e-05 [graph_reusing]: 5.29e-06 [inline]: 2.36e-06 [add_attr]: 0.00304902, [1] [add_attr_with_inline]: 0.0030414, [1] [Cycle 1]: 5.978e-05, [2] [tag_attr]: 1.393e-05 [meta_addattr_fg_expand]: 3.6e-06 [parallel-infer-symbol]: 3.09001e-06 [pre_auto_parallel]: 2.326e-05 [insert-virtual-dataset]: 2.49001e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 2.17999e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.00464542, [53] [py_interpret_to_execute]: 2.197e-05 [rewriter_before_opt_a]: 4.662e-05 [opt_a]: 0.00243574, [2] [Cycle 1]: 0.0016016, [45] [expand_dump_flag]: 3.13e-06 [switch_simplify]: 2.4e-05 [loop_unroll]: 1.373e-05 [a_1]: 0.00031732 [with_stream_mark]: 1.436e-05 [recompute_prepare]: 8.12e-06 [updatestate_depend_eliminate]: 3.73999e-06 [updatestate_assign_eliminate]: 3.63e-06 [updatestate_loads_eliminate]: 2.91999e-06 [parameter_eliminate]: 1.80001e-06 [a_2]: 0.00012068 [accelerated_algorithm]: 7.23999e-06 [shard]: 2.26e-06 [meta_shard_fg_expand]: 1.67999e-06 [shard_inline]: 6.68e-06 [merge_send_recv]: 7.87003e-06 [auto_parallel]: 5.80002e-06 [parallel]: 1.937e-05 [flash_sp]: 7.53e-06 [merge_comm]: 3.90998e-06 [allreduce_fusion]: 3.61999e-06 [matmul_add_comm_reduction]: 9.81e-06 [allreduce_slice_to_reducescatter]: 5.50004e-07 [virtual_shard_identity]: 7.73001e-06 [virtual_dataset]: 6.41e-06 [get_grad_eliminate_]: 6.28e-06 [virtual_output]: 6.04001e-06 [merge_forward]: 3.67998e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 9.12999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.338e-05 [merge_recompute_call_nodes]: 1.59e-06 [before_grad]: 1.034e-05 [set_forward_comm_id_for_comm_node_pass]: 3.67998e-06 [meta_fg_expand]: 2.63e-06 [flash_sp_send_recv_attached]: 3.00998e-06 [receive_attached]: 3.36001e-06 [after_resolve]: 9.76e-06 [a_after_grad]: 8.70999e-06 [renormalize]: 0.00043113 [add_forward_monad_depend]: 4.75999e-06 [auto_monad_grad]: 2.22999e-06 [auto_monad_eliminator]: 1.407e-05 [cse]: 2.918e-05 [a_3]: 5.982e-05 [Cycle 2]: 0.00082175, [45] [expand_dump_flag]: 8.89995e-07 [switch_simplify]: 7.26001e-06 [loop_unroll]: 5.72001e-06 [a_1]: 0.00013227 [with_stream_mark]: 1.064e-05 [recompute_prepare]: 6.91999e-06 [updatestate_depend_eliminate]: 2.96999e-06 [updatestate_assign_eliminate]: 2.46998e-06 [updatestate_loads_eliminate]: 2.68e-06 [parameter_eliminate]: 1.10001e-06 [a_2]: 0.00011226 [accelerated_algorithm]: 6.48e-06 [shard]: 1.04e-06 [meta_shard_fg_expand]: 1.27999e-06 [shard_inline]: 6.43998e-06 [merge_send_recv]: 4.76002e-06 [auto_parallel]: 5.52001e-06 [parallel]: 4.27998e-06 [flash_sp]: 3.23998e-06 [merge_comm]: 3.36999e-06 [allreduce_fusion]: 3.14999e-06 [matmul_add_comm_reduction]: 5.49e-06 [allreduce_slice_to_reducescatter]: 3.7998e-07 [virtual_shard_identity]: 6.42001e-06 [virtual_dataset]: 5.75001e-06 [get_grad_eliminate_]: 5.52999e-06 [virtual_output]: 5.56e-06 [merge_forward]: 2.73998e-06 [cell_reuse_recompute_pass]: 1.45001e-06 [offload_activation]: 6.49001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.283e-05 [merge_recompute_call_nodes]: 7.00005e-07 [before_grad]: 8.84998e-06 [set_forward_comm_id_for_comm_node_pass]: 4.15e-06 [meta_fg_expand]: 2.01e-06 [flash_sp_send_recv_attached]: 8.50006e-07 [receive_attached]: 9.70002e-07 [after_resolve]: 8.76997e-06 [a_after_grad]: 8.38999e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.30999e-06 [auto_monad_grad]: 1.12e-06 [auto_monad_eliminator]: 7.4e-06 [cse]: 1.476e-05 [a_3]: 4.948e-05 [py_interpret_to_execute_after_opt_a]: 1.102e-05 [slice_cell_reuse_recomputed_activation]: 4.65999e-06 [rewriter_after_opt_a]: 3.848e-05 [convert_after_rewriter]: 9.57001e-06 [order_py_execute_after_rewriter]: 8.02e-06 [mutable_eliminate]: 0.00049338 [opt_b]: 0.00026129, [1] [Cycle 1]: 0.00025218, [7] [b_1]: 0.00016054 [b_2]: 7.50003e-06 [updatestate_depend_eliminate]: 5.51e-06 [updatestate_assign_eliminate]: 2.44001e-06 [updatestate_loads_eliminate]: 2.31e-06 [renormalize]: 4.7998e-07 [cse]: 1.851e-05 [optimize_parallel_all_gather_comm]: 1.974e-05 [overlap_param_gather]: 4.80001e-06 [cconv]: 5.324e-05 [loop_unroll]: 0.00044686 [opt_after_cconv]: 0.0001238, [1] [Cycle 1]: 0.00011547, [7] [c_1]: 2.815e-05 [parameter_eliminate]: 2.58e-06 [updatestate_depend_eliminate]: 5.00001e-06 [updatestate_assign_eliminate]: 2.61999e-06 [updatestate_loads_eliminate]: 2.72001e-06 [cse]: 1.805e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.606e-05 [tuple_transform]: 8.681e-05, [1] [Cycle 1]: 7.992e-05, [4] [d_1]: 4.037e-05 [none_parameter_eliminate]: 1.63002e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 6.91001e-06 [partial_unused_args_eliminate]: 4.94e-06 [add_recomputation]: 5e-05 [cse_after_recomputation]: 2.729e-05, [1] [Cycle 1]: 2.04e-05, [1] [cse]: 1.159e-05 [environ_conv]: 8.58001e-06 [swap_dp_allreduce_reducescatter]: 7.68001e-06 [bias_add_comm_swap]: 5.03002e-06 [label_micro_interleaved_index]: 6.91999e-06 [label_fine_grained_interleaved_index]: 5.22e-06 [merge_cast_opt]: 3.81001e-06 [slice_recompute_activation]: 4.72998e-06 [micro_interleaved_order_control]: 4.56002e-06 [assign_add_opt]: 3.77002e-06 [ForceFp32Comm]: 3.3e-06 [remove_cast_before_assign_add]: 3.50998e-06 [full_micro_interleaved_order_control]: 4.93001e-06 [reorder_send_recv_between_fp_bp]: 5.34e-06 [comm_op_add_attrs]: 3.58e-06 [add_comm_op_reuse_tag]: 3.31999e-06 [interleave_split_concat_branches]: 3.52997e-06 [interleave_parallel_branches]: 3.7e-06 [overlap_opt_shard_in_pipeline]: 3.56999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.12e-06 [control_data_broadcast_order]: 1.495e-05 [grouped_pairwise_exchange_alltoall]: 3.86999e-06 [offloading_packed_experts]: 6.57002e-06 [overlap_recompute_and_grad_model_parallel]: 7.50003e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.66001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.76001e-06 [overlap_recompute_comm]: 4.85001e-06 [overlap_grad_ring_attention]: 6.77002e-06 [overlap_grad_flash_sp]: 2.147e-05 [begin_end_overlap_inline]: 2.94001e-06 [split_matmul_comm_elemetwise]: 4.52998e-06 [split_layernorm_comm]: 3.85e-06 [handle_group_info]: 3.39001e-06 [symbol_engine_optimizer]: 9.468e-05, [1] [Cycle 1]: 8.812e-05, [6] [build]: 2.42001e-06 [elim_shapecalc]: 1.008e-05 [elim_not_effective]: 1.247e-05 [opt_reshape]: 6.94999e-06 [fold_const_symbol]: 9.77999e-06 [renormalize]: 2.19996e-07 [detach_backward]: 3.31001e-06 [pipeline_parallel_scheduler]: 1.93002e-06 [auto_monad_reorder]: 1.821e-05 [get_jit_bprop_graph]: 1.64e-06 [rewriter_after_jit_bprop_graph]: 4.82e-06 [opt_after_jit_grad]: 0.00047475 [validate]: 3.503e-05 Sums bootstrap : 0.000435s : 4.73% type_inference : 0.004630s : 50.28% event_method : 0.000012s : 0.13% auto_monad : 0.000053s : 0.57% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000023s : 0.25% insert-virtual-dataset : 0.000002s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000022s : 0.24% optimize.rewriter_before_opt_a : 0.000047s : 0.51% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000031s : 0.34% optimize.opt_a.loop_unroll : 0.000019s : 0.21% optimize.opt_a.a_1 : 0.000450s : 4.88% optimize.opt_a.with_stream_mark : 0.000025s : 0.27% optimize.opt_a.recompute_prepare : 0.000015s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000233s : 2.53% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.15% optimize.opt_a.shard : 0.000003s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.14% optimize.opt_a.merge_send_recv : 0.000013s : 0.14% optimize.opt_a.auto_parallel : 0.000011s : 0.12% optimize.opt_a.parallel : 0.000024s : 0.26% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.15% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.13% optimize.opt_a.virtual_output : 0.000012s : 0.13% optimize.opt_a.merge_forward : 0.000006s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000019s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.05% optimize.opt_a.after_resolve : 0.000019s : 0.20% optimize.opt_a.a_after_grad : 0.000017s : 0.19% optimize.opt_a.renormalize : 0.000431s : 4.68% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.23% optimize.opt_a.cse : 0.000044s : 0.48% optimize.opt_a.a_3 : 0.000109s : 1.19% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000038s : 0.42% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000008s : 0.09% optimize.mutable_eliminate : 0.000493s : 5.36% optimize.opt_b.b_1 : 0.000161s : 1.74% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.01% optimize.opt_b.cse : 0.000019s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000053s : 0.58% optimize.loop_unroll : 0.000447s : 4.85% optimize.opt_after_cconv.c_1 : 0.000028s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.17% optimize.tuple_transform.d_1 : 0.000040s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000050s : 0.54% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000009s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.08% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.06% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.04% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.06% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.04% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000015s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000021s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.04% optimize.symbol_engine_optimizer.build : 0.000002s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000475s : 5.16% validate : 0.000035s : 0.38% Time group info: ------[substitution.] 0.000171 23 39.78% : 0.000068s : 4: substitution.arithmetic_simplify 1.07% : 0.000002s : 2: substitution.elim_not_effective 0.77% : 0.000001s : 2: substitution.fold_const_symbol 3.32% : 0.000006s : 3: substitution.graph_param_transform 48.92% : 0.000083s : 2: substitution.inline 1.77% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.50% : 0.000004s : 4: substitution.remove_not_recompute_node 1.88% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004585 2 91.15% : 0.004179s : 1: type_inference.infer 8.85% : 0.000406s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000082 2 100.00% : 0.000082s : 2: match.inline ------[predicate.] 0.000135 754 0.83% : 0.000001s : 7: predicate.accumulaten_eliminater 0.94% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.73% : 0.000001s : 6: predicate.addn_check_dump 0.92% : 0.000001s : 7: predicate.addn_zero_filter 0.67% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.54% : 0.000003s : 13: predicate.arithmetic_simplify 0.79% : 0.000001s : 7: predicate.cast_eliminate 0.76% : 0.000001s : 6: predicate.check_bprop_eliminate 0.78% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.77% : 0.000001s : 6: predicate.depend_value_elim 0.78% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.89% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.82% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.24% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 3: predicate.elim_not_effective 0.50% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.05% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_depend_swap 1.84% : 0.000002s : 16: predicate.environ_get_eliminate 1.01% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.92% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.15% : 0.000003s : 9: predicate.float_depend_g_call 0.69% : 0.000001s : 6: predicate.float_environ_get_switch 0.98% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.82% : 0.000001s : 6: predicate.get_grad_eliminate 0.24% : 0.000000s : 3: predicate.graph_param_transform 0.77% : 0.000001s : 6: predicate.incorporate_call 0.67% : 0.000001s : 6: predicate.incorporate_call_switch 6.44% : 0.000009s : 34: predicate.inline 1.10% : 0.000001s : 6: predicate.inline_without_move 0.39% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.06% : 0.000001s : 6: predicate.less_batch_normalization 1.66% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.03% : 0.000003s : 20: predicate.load_eliminater 1.18% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.68% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.78% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.73% : 0.000001s : 6: predicate.merge_addn 1.00% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.71% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.70% : 0.000001s : 7: predicate.minmaximum_grad 1.31% : 0.000002s : 3: predicate.mutable_eliminate 0.41% : 0.000001s : 3: predicate.opt_reshape 0.65% : 0.000001s : 3: predicate.parallel_virtual_node 1.32% : 0.000002s : 9: predicate.partial_defer_inline 1.28% : 0.000002s : 10: predicate.partial_eliminate 0.74% : 0.000001s : 7: predicate.print_const_string_wrapper 0.73% : 0.000001s : 6: predicate.reduce_all_const_elim 1.20% : 0.000002s : 7: predicate.reduce_eliminate 2.09% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.64% : 0.000001s : 6: predicate.remove_not_recompute_node 1.15% : 0.000002s : 13: predicate.replace_applicator 0.66% : 0.000001s : 6: predicate.replace_old_param 0.36% : 0.000000s : 3: predicate.reset_defer_inline 0.84% : 0.000001s : 7: predicate.reshape_eliminate 0.90% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 3: predicate.row_tensor_eliminate 1.07% : 0.000001s : 6: predicate.same_eliminate 0.49% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.95% : 0.000001s : 6: predicate.shard_identity_eliminate 1.05% : 0.000001s : 6: predicate.special_op_eliminate 1.33% : 0.000002s : 6: predicate.specialize_transform 1.10% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.46% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.04% : 0.000001s : 9: predicate.switch_defer_inline 1.86% : 0.000003s : 15: predicate.switch_layer_defer_inline 4.24% : 0.000006s : 32: predicate.switch_simplify 0.76% : 0.000001s : 7: predicate.tile_eliminate 0.93% : 0.000001s : 7: predicate.transpose_eliminate 1.56% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.71% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.35% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.42% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.45% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.42% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.95% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.90% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 3: predicate.value_based_eliminate 0.84% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.78% : 0.000001s : 6: predicate.virtual_output_eliminate 0.30% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000220 5 19.27% : 0.000042s : 1: func_graph_cloner_run.FuncGraphClonerGraph 80.73% : 0.000178s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023051 192 0.03% : 0.000006s : 1: ForceFp32Comm 13.27% : 0.003058s : 1: add_attr 13.21% : 0.003045s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000054s : 1: add_recomputation 0.03% : 0.000007s : 1: assign_add_opt 0.26% : 0.000061s : 1: auto_monad 0.11% : 0.000025s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 2.07% : 0.000478s : 1: bootstrap 0.25% : 0.000057s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000018s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.09% : 0.000022s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.04% : 0.000008s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.96% : 0.000453s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.17% : 0.000500s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 3.64% : 0.000839s : 78: opt.transform.opt_a 0.12% : 0.000027s : 1: opt.transform.opt_after_cconv 0.11% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.42% : 0.000096s : 28: opt.transform.opt_b 0.19% : 0.000045s : 2: opt.transform.opt_trans_graph 0.15% : 0.000036s : 4: opt.transform.symbol_engine_opt 10.58% : 0.002439s : 1: opt_a 0.55% : 0.000127s : 1: opt_after_cconv 2.10% : 0.000485s : 1: opt_after_jit_grad 1.15% : 0.000265s : 1: opt_b 21.55% : 0.004968s : 1: optimize 0.10% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.11% : 0.000024s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000031s : 1: pre_auto_parallel 0.11% : 0.000025s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000019s : 1: remove_dup_value 1.01% : 0.000233s : 1: renormalize.infer 0.83% : 0.000191s : 1: renormalize.specialize 0.04% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.05% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000042s : 1: rewriter_after_opt_a 0.22% : 0.000050s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000097s : 1: symbol_engine_optimizer 0.39% : 0.000090s : 1: tuple_transform 20.19% : 0.004654s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:46.396.402 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0130166, [21] [bootstrap]: 0.00045116 [type_inference]: 0.00467915 [event_method]: 1.186e-05 [auto_monad]: 5.44e-05 [graph_reusing]: 5.19998e-06 [inline]: 2.35002e-06 [add_attr]: 0.00305249, [1] [add_attr_with_inline]: 0.0030441, [1] [Cycle 1]: 5.066e-05, [2] [tag_attr]: 1.409e-05 [meta_addattr_fg_expand]: 3.4e-06 [parallel-infer-symbol]: 3.04001e-06 [pre_auto_parallel]: 2.459e-05 [insert-virtual-dataset]: 2.43002e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.94999e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.00404598, [53] [py_interpret_to_execute]: 1.623e-05 [rewriter_before_opt_a]: 4.322e-05 [opt_a]: 0.00213156, [2] [Cycle 1]: 0.00144276, [45] [expand_dump_flag]: 3.15998e-06 [switch_simplify]: 2.601e-05 [loop_unroll]: 1.329e-05 [a_1]: 0.00031974 [with_stream_mark]: 1.534e-05 [recompute_prepare]: 8.60999e-06 [updatestate_depend_eliminate]: 3.92998e-06 [updatestate_assign_eliminate]: 3.68999e-06 [updatestate_loads_eliminate]: 3.31999e-06 [parameter_eliminate]: 1.96e-06 [a_2]: 9.189e-05 [accelerated_algorithm]: 7.28e-06 [shard]: 2.21e-06 [meta_shard_fg_expand]: 1.70001e-06 [shard_inline]: 6.17001e-06 [merge_send_recv]: 8.52998e-06 [auto_parallel]: 6.59999e-06 [parallel]: 1.831e-05 [flash_sp]: 8.00999e-06 [merge_comm]: 4.01001e-06 [allreduce_fusion]: 3.85e-06 [matmul_add_comm_reduction]: 8.89e-06 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 8.23999e-06 [virtual_dataset]: 6.34001e-06 [get_grad_eliminate_]: 5.69e-06 [virtual_output]: 6.26998e-06 [merge_forward]: 3.61999e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 9.58002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.272e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.018e-05 [set_forward_comm_id_for_comm_node_pass]: 3.98999e-06 [meta_fg_expand]: 2.83e-06 [flash_sp_send_recv_attached]: 2.63e-06 [receive_attached]: 1.85001e-06 [after_resolve]: 9.91998e-06 [a_after_grad]: 9.20999e-06 [renormalize]: 0.00044306 [add_forward_monad_depend]: 5.10001e-06 [auto_monad_grad]: 2.07999e-06 [auto_monad_eliminator]: 1.479e-05 [cse]: 2.951e-05 [a_3]: 4.557e-05 [Cycle 2]: 0.00067855, [45] [expand_dump_flag]: 8.90024e-07 [switch_simplify]: 7.83001e-06 [loop_unroll]: 5.84e-06 [a_1]: 0.00015032 [with_stream_mark]: 1.233e-05 [recompute_prepare]: 6.88e-06 [updatestate_depend_eliminate]: 3.10998e-06 [updatestate_assign_eliminate]: 2.36998e-06 [updatestate_loads_eliminate]: 3.2e-06 [parameter_eliminate]: 1.04e-06 [a_2]: 8.214e-05 [accelerated_algorithm]: 6.27001e-06 [shard]: 1.43002e-06 [meta_shard_fg_expand]: 1.43002e-06 [shard_inline]: 6.17999e-06 [merge_send_recv]: 6.12001e-06 [auto_parallel]: 5.72001e-06 [parallel]: 5.15999e-06 [flash_sp]: 3.51999e-06 [merge_comm]: 3.24001e-06 [allreduce_fusion]: 3.32997e-06 [matmul_add_comm_reduction]: 6.76e-06 [allreduce_slice_to_reducescatter]: 4.39992e-07 [virtual_shard_identity]: 6.73e-06 [virtual_dataset]: 5.71003e-06 [get_grad_eliminate_]: 5.50001e-06 [virtual_output]: 5.03002e-06 [merge_forward]: 2.56e-06 [cell_reuse_recompute_pass]: 1.28002e-06 [offload_activation]: 7.01001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.074e-05 [merge_recompute_call_nodes]: 1.00999e-06 [before_grad]: 9.29e-06 [set_forward_comm_id_for_comm_node_pass]: 4.02e-06 [meta_fg_expand]: 2.60002e-06 [flash_sp_send_recv_attached]: 9.10019e-07 [receive_attached]: 9.89996e-07 [after_resolve]: 8.62e-06 [a_after_grad]: 7.9e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.82999e-06 [auto_monad_grad]: 1.13001e-06 [auto_monad_eliminator]: 8.19002e-06 [cse]: 1.546e-05 [a_3]: 3.486e-05 [py_interpret_to_execute_after_opt_a]: 9.91e-06 [slice_cell_reuse_recomputed_activation]: 1.96e-06 [rewriter_after_opt_a]: 3.405e-05 [convert_after_rewriter]: 6.53e-06 [order_py_execute_after_rewriter]: 5.45001e-06 [mutable_eliminate]: 0.00048392 [opt_b]: 0.00020076, [1] [Cycle 1]: 0.00019419, [7] [b_1]: 0.00011816 [b_2]: 6.96999e-06 [updatestate_depend_eliminate]: 6.65998e-06 [updatestate_assign_eliminate]: 2.39999e-06 [updatestate_loads_eliminate]: 2.37001e-06 [renormalize]: 8.19971e-07 [cse]: 1.979e-05 [optimize_parallel_all_gather_comm]: 1.605e-05 [overlap_param_gather]: 1.82001e-06 [cconv]: 2.549e-05 [loop_unroll]: 0.00042526 [opt_after_cconv]: 9.861e-05, [1] [Cycle 1]: 9.321e-05, [7] [c_1]: 2.692e-05 [parameter_eliminate]: 2.66e-06 [updatestate_depend_eliminate]: 5.05001e-06 [updatestate_assign_eliminate]: 2.33998e-06 [updatestate_loads_eliminate]: 2.55002e-06 [cse]: 1.813e-05 [renormalize]: 8.70001e-07 [remove_dup_value]: 1.422e-05 [tuple_transform]: 7.136e-05, [1] [Cycle 1]: 6.685e-05, [4] [d_1]: 4.011e-05 [none_parameter_eliminate]: 1.56998e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 6.39001e-06 [partial_unused_args_eliminate]: 1.84e-06 [add_recomputation]: 4.839e-05 [cse_after_recomputation]: 2.145e-05, [1] [Cycle 1]: 1.709e-05, [1] [cse]: 1.143e-05 [environ_conv]: 5.07e-06 [swap_dp_allreduce_reducescatter]: 5.77001e-06 [bias_add_comm_swap]: 2.91999e-06 [label_micro_interleaved_index]: 4.32e-06 [label_fine_grained_interleaved_index]: 2.75997e-06 [merge_cast_opt]: 1.42999e-06 [slice_recompute_activation]: 2.30002e-06 [micro_interleaved_order_control]: 2.11998e-06 [assign_add_opt]: 1.26002e-06 [ForceFp32Comm]: 7.80012e-07 [remove_cast_before_assign_add]: 9.70002e-07 [full_micro_interleaved_order_control]: 2.29999e-06 [reorder_send_recv_between_fp_bp]: 2.71e-06 [comm_op_add_attrs]: 9.5999e-07 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.53002e-06 [interleave_parallel_branches]: 1.06002e-06 [overlap_opt_shard_in_pipeline]: 1.13001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.74e-06 [control_data_broadcast_order]: 1.232e-05 [grouped_pairwise_exchange_alltoall]: 1.53002e-06 [offloading_packed_experts]: 3.91001e-06 [overlap_recompute_and_grad_model_parallel]: 4.97e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.48002e-06 [overlap_recompute_comm]: 2.31e-06 [overlap_grad_ring_attention]: 4.12998e-06 [overlap_grad_flash_sp]: 1.809e-05 [begin_end_overlap_inline]: 7.10017e-07 [split_matmul_comm_elemetwise]: 2.13002e-06 [split_layernorm_comm]: 1.95001e-06 [handle_group_info]: 1.04003e-06 [symbol_engine_optimizer]: 7.317e-05, [1] [Cycle 1]: 6.877e-05, [6] [build]: 2.83003e-06 [elim_shapecalc]: 8.59998e-06 [elim_not_effective]: 1.18e-05 [opt_reshape]: 6.86001e-06 [fold_const_symbol]: 9.47001e-06 [renormalize]: 2.40019e-07 [detach_backward]: 1.91003e-06 [pipeline_parallel_scheduler]: 1.74e-06 [auto_monad_reorder]: 1.662e-05 [get_jit_bprop_graph]: 1.62999e-06 [rewriter_after_jit_bprop_graph]: 3.95998e-06 [opt_after_jit_grad]: 0.00046936 [validate]: 3.632e-05 Sums bootstrap : 0.000451s : 5.01% type_inference : 0.004679s : 51.96% event_method : 0.000012s : 0.13% auto_monad : 0.000054s : 0.60% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000003s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000025s : 0.27% insert-virtual-dataset : 0.000002s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000016s : 0.18% optimize.rewriter_before_opt_a : 0.000043s : 0.48% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000034s : 0.38% optimize.opt_a.loop_unroll : 0.000019s : 0.21% optimize.opt_a.a_1 : 0.000470s : 5.22% optimize.opt_a.with_stream_mark : 0.000028s : 0.31% optimize.opt_a.recompute_prepare : 0.000015s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000174s : 1.93% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.15% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.14% optimize.opt_a.merge_send_recv : 0.000015s : 0.16% optimize.opt_a.auto_parallel : 0.000012s : 0.14% optimize.opt_a.parallel : 0.000023s : 0.26% optimize.opt_a.flash_sp : 0.000012s : 0.13% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.17% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000011s : 0.13% optimize.opt_a.merge_forward : 0.000006s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000019s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000019s : 0.21% optimize.opt_a.a_after_grad : 0.000017s : 0.19% optimize.opt_a.renormalize : 0.000443s : 4.92% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.08% optimize.opt_a.auto_monad_grad : 0.000003s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.26% optimize.opt_a.cse : 0.000045s : 0.50% optimize.opt_a.a_3 : 0.000080s : 0.89% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000034s : 0.38% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.06% optimize.mutable_eliminate : 0.000484s : 5.37% optimize.opt_b.b_1 : 0.000118s : 1.31% optimize.opt_b.b_2 : 0.000007s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000020s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000025s : 0.28% optimize.loop_unroll : 0.000425s : 4.72% optimize.opt_after_cconv.c_1 : 0.000027s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.20% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000014s : 0.16% optimize.tuple_transform.d_1 : 0.000040s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000048s : 0.54% optimize.cse_after_recomputation.cse : 0.000011s : 0.13% optimize.environ_conv : 0.000005s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.02% optimize.slice_recompute_activation : 0.000002s : 0.03% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000002s : 0.02% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.02% optimize.overlap_recompute_comm : 0.000002s : 0.03% optimize.overlap_grad_ring_attention : 0.000004s : 0.05% optimize.overlap_grad_flash_sp : 0.000018s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000017s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000469s : 5.21% validate : 0.000036s : 0.40% Time group info: ------[substitution.] 0.000176 23 39.61% : 0.000070s : 4: substitution.arithmetic_simplify 1.09% : 0.000002s : 2: substitution.elim_not_effective 0.73% : 0.000001s : 2: substitution.fold_const_symbol 3.23% : 0.000006s : 3: substitution.graph_param_transform 48.98% : 0.000086s : 2: substitution.inline 2.00% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.39% : 0.000004s : 4: substitution.remove_not_recompute_node 1.96% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004632 2 91.36% : 0.004232s : 1: type_inference.infer 8.64% : 0.000400s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000084 2 100.00% : 0.000084s : 2: match.inline ------[predicate.] 0.000132 754 1.02% : 0.000001s : 7: predicate.accumulaten_eliminater 0.98% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.67% : 0.000001s : 6: predicate.addn_check_dump 0.86% : 0.000001s : 7: predicate.addn_zero_filter 0.69% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.83% : 0.000004s : 13: predicate.arithmetic_simplify 0.91% : 0.000001s : 7: predicate.cast_eliminate 0.74% : 0.000001s : 6: predicate.check_bprop_eliminate 0.69% : 0.000001s : 6: predicate.compare_switch_simplify 0.23% : 0.000000s : 3: predicate.const_output_eliminate 0.78% : 0.000001s : 6: predicate.depend_value_elim 0.81% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.94% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.76% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.32% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 3: predicate.elim_not_effective 0.47% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.01% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.05% : 0.000001s : 10: predicate.environ_get_depend_swap 1.78% : 0.000002s : 16: predicate.environ_get_eliminate 1.03% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.94% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.06% : 0.000003s : 9: predicate.float_depend_g_call 0.65% : 0.000001s : 6: predicate.float_environ_get_switch 0.96% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.81% : 0.000001s : 6: predicate.get_grad_eliminate 0.39% : 0.000001s : 3: predicate.graph_param_transform 0.81% : 0.000001s : 6: predicate.incorporate_call 0.67% : 0.000001s : 6: predicate.incorporate_call_switch 6.73% : 0.000009s : 34: predicate.inline 0.97% : 0.000001s : 6: predicate.inline_without_move 0.40% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.05% : 0.000001s : 6: predicate.less_batch_normalization 1.75% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.05% : 0.000003s : 20: predicate.load_eliminater 1.36% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.66% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.80% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.64% : 0.000001s : 6: predicate.merge_addn 0.67% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.67% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 7: predicate.minmaximum_grad 1.70% : 0.000002s : 3: predicate.mutable_eliminate 0.49% : 0.000001s : 3: predicate.opt_reshape 0.44% : 0.000001s : 3: predicate.parallel_virtual_node 1.32% : 0.000002s : 9: predicate.partial_defer_inline 1.26% : 0.000002s : 10: predicate.partial_eliminate 0.74% : 0.000001s : 7: predicate.print_const_string_wrapper 0.92% : 0.000001s : 6: predicate.reduce_all_const_elim 0.94% : 0.000001s : 7: predicate.reduce_eliminate 1.98% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.64% : 0.000001s : 6: predicate.remove_not_recompute_node 1.16% : 0.000002s : 13: predicate.replace_applicator 0.67% : 0.000001s : 6: predicate.replace_old_param 0.40% : 0.000001s : 3: predicate.reset_defer_inline 0.88% : 0.000001s : 7: predicate.reshape_eliminate 0.81% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 3: predicate.row_tensor_eliminate 0.92% : 0.000001s : 6: predicate.same_eliminate 0.57% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.17% : 0.000002s : 6: predicate.shard_identity_eliminate 0.86% : 0.000001s : 6: predicate.special_op_eliminate 0.94% : 0.000001s : 6: predicate.specialize_transform 1.09% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.05% : 0.000001s : 9: predicate.switch_defer_inline 1.63% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.23% : 0.000006s : 32: predicate.switch_simplify 0.73% : 0.000001s : 7: predicate.tile_eliminate 0.79% : 0.000001s : 7: predicate.transpose_eliminate 1.64% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.84% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.43% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.49% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.52% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.98% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.86% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 3: predicate.value_based_eliminate 0.76% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.75% : 0.000001s : 6: predicate.virtual_output_eliminate 0.36% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000194 5 9.13% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 90.87% : 0.000176s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.021539 192 0.02% : 0.000003s : 1: ForceFp32Comm 14.19% : 0.003057s : 1: add_attr 14.15% : 0.003048s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000053s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000059s : 1: auto_monad 0.09% : 0.000020s : 1: auto_monad_reorder 0.02% : 0.000004s : 1: begin_end_overlap_inline 0.03% : 0.000006s : 1: bias_add_comm_swap 2.23% : 0.000480s : 1: bootstrap 0.13% : 0.000029s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000015s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000008s : 1: environ_conv 0.09% : 0.000018s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 2.01% : 0.000433s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.29% : 0.000494s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 3.96% : 0.000853s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.43% : 0.000093s : 28: opt.transform.opt_b 0.21% : 0.000044s : 2: opt.transform.opt_trans_graph 0.15% : 0.000033s : 4: opt.transform.symbol_engine_opt 9.91% : 0.002135s : 1: opt_a 0.48% : 0.000103s : 1: opt_after_cconv 2.22% : 0.000479s : 1: opt_after_jit_grad 0.95% : 0.000205s : 1: opt_b 18.81% : 0.004051s : 1: optimize 0.09% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000021s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000029s : 1: pre_auto_parallel 0.09% : 0.000020s : 1: py_interpret_to_execute 0.06% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000018s : 1: remove_dup_value 1.12% : 0.000240s : 1: renormalize.infer 0.91% : 0.000195s : 1: renormalize.specialize 0.03% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000038s : 1: rewriter_after_opt_a 0.22% : 0.000047s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000076s : 1: symbol_engine_optimizer 0.34% : 0.000074s : 1: tuple_transform 21.80% : 0.004696s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:46.590.771 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:46.591.052 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0147977, [21] [bootstrap]: 0.00044364 [type_inference]: 0.00482157 [event_method]: 1.247e-05 [auto_monad]: 5.562e-05 [graph_reusing]: 5.62999e-06 [inline]: 2.32999e-06 [add_attr]: 0.00324648, [1] [add_attr_with_inline]: 0.00323767, [1] [Cycle 1]: 6.766e-05, [2] [tag_attr]: 1.423e-05 [meta_addattr_fg_expand]: 3.73999e-06 [parallel-infer-symbol]: 3.21999e-06 [pre_auto_parallel]: 2.601e-05 [insert-virtual-dataset]: 2.64001e-06 [parallel-infer-symbol-second]: 7.60017e-07 [dataset_repeat_opt]: 1.92001e-06 [pipeline_split]: 1.68002e-06 [optimize]: 0.00499515, [53] [py_interpret_to_execute]: 2.124e-05 [rewriter_before_opt_a]: 4.966e-05 [opt_a]: 0.00270116, [2] [Cycle 1]: 0.00179812, [45] [expand_dump_flag]: 2.46e-06 [switch_simplify]: 2.505e-05 [loop_unroll]: 1.418e-05 [a_1]: 0.00032647 [with_stream_mark]: 1.653e-05 [recompute_prepare]: 9.51e-06 [updatestate_depend_eliminate]: 3.65e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 3.33e-06 [parameter_eliminate]: 2.09999e-06 [a_2]: 0.00012394 [accelerated_algorithm]: 7.83001e-06 [shard]: 2.35002e-06 [meta_shard_fg_expand]: 1.93002e-06 [shard_inline]: 6.31e-06 [merge_send_recv]: 8.85001e-06 [auto_parallel]: 7.33999e-06 [parallel]: 1.85e-05 [flash_sp]: 8.46002e-06 [merge_comm]: 4.48001e-06 [allreduce_fusion]: 3.9e-06 [matmul_add_comm_reduction]: 9.62999e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 8.91002e-06 [virtual_dataset]: 6.49999e-06 [get_grad_eliminate_]: 5.86e-06 [virtual_output]: 6.39001e-06 [merge_forward]: 4.06001e-06 [cell_reuse_recompute_pass]: 1.62001e-06 [offload_activation]: 9.67001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.508e-05 [merge_recompute_call_nodes]: 1.59998e-06 [before_grad]: 1.13e-05 [set_forward_comm_id_for_comm_node_pass]: 4.52998e-06 [meta_fg_expand]: 2.73e-06 [flash_sp_send_recv_attached]: 2.84999e-06 [receive_attached]: 2.94999e-06 [after_resolve]: 1.102e-05 [a_after_grad]: 9.89001e-06 [renormalize]: 0.00056727 [add_forward_monad_depend]: 5.19e-06 [auto_monad_grad]: 2.58998e-06 [auto_monad_eliminator]: 1.552e-05 [cse]: 2.899e-05 [a_3]: 6.208e-05 [Cycle 2]: 0.00088938, [45] [expand_dump_flag]: 9.60019e-07 [switch_simplify]: 7.18e-06 [loop_unroll]: 6.01e-06 [a_1]: 0.00015049 [with_stream_mark]: 1.71e-05 [recompute_prepare]: 7.3e-06 [updatestate_depend_eliminate]: 3.55998e-06 [updatestate_assign_eliminate]: 2.54001e-06 [updatestate_loads_eliminate]: 2.83e-06 [parameter_eliminate]: 1.38002e-06 [a_2]: 0.00011348 [accelerated_algorithm]: 6.89999e-06 [shard]: 1.39e-06 [meta_shard_fg_expand]: 1.59e-06 [shard_inline]: 6.59001e-06 [merge_send_recv]: 5.57999e-06 [auto_parallel]: 6.63998e-06 [parallel]: 5.61e-06 [flash_sp]: 3.58e-06 [merge_comm]: 3.7e-06 [allreduce_fusion]: 3.41999e-06 [matmul_add_comm_reduction]: 6.58e-06 [allreduce_slice_to_reducescatter]: 4.69998e-07 [virtual_shard_identity]: 7.07002e-06 [virtual_dataset]: 5.68997e-06 [get_grad_eliminate_]: 6.21998e-06 [virtual_output]: 5.59998e-06 [merge_forward]: 3.33e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 7.45e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.432e-05 [merge_recompute_call_nodes]: 1.04e-06 [before_grad]: 9.80002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.93999e-06 [meta_fg_expand]: 2.40002e-06 [flash_sp_send_recv_attached]: 1.50001e-06 [receive_attached]: 1.39e-06 [after_resolve]: 9.52999e-06 [a_after_grad]: 8.32003e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.54999e-06 [auto_monad_grad]: 1.28002e-06 [auto_monad_eliminator]: 9.57999e-06 [cse]: 1.915e-05 [a_3]: 5.108e-05 [py_interpret_to_execute_after_opt_a]: 1.334e-05 [slice_cell_reuse_recomputed_activation]: 4.65001e-06 [rewriter_after_opt_a]: 4.06e-05 [convert_after_rewriter]: 1.021e-05 [order_py_execute_after_rewriter]: 8.05e-06 [mutable_eliminate]: 0.00052409 [opt_b]: 0.0002817, [1] [Cycle 1]: 0.00027162, [7] [b_1]: 0.00016747 [b_2]: 7.18e-06 [updatestate_depend_eliminate]: 7.24001e-06 [updatestate_assign_eliminate]: 2.71999e-06 [updatestate_loads_eliminate]: 2.55002e-06 [renormalize]: 5.59987e-07 [cse]: 2.293e-05 [optimize_parallel_all_gather_comm]: 2.006e-05 [overlap_param_gather]: 4.75001e-06 [cconv]: 3.012e-05 [loop_unroll]: 0.00046153 [opt_after_cconv]: 0.00013061, [1] [Cycle 1]: 0.00012191, [7] [c_1]: 2.882e-05 [parameter_eliminate]: 3.32997e-06 [updatestate_depend_eliminate]: 5.97001e-06 [updatestate_assign_eliminate]: 2.69001e-06 [updatestate_loads_eliminate]: 3.06999e-06 [cse]: 2.199e-05 [renormalize]: 2.69996e-07 [remove_dup_value]: 1.766e-05 [tuple_transform]: 8.986e-05, [1] [Cycle 1]: 8.295e-05, [4] [d_1]: 4.203e-05 [none_parameter_eliminate]: 1.89e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 6.90998e-06 [partial_unused_args_eliminate]: 4.95001e-06 [add_recomputation]: 5.058e-05 [cse_after_recomputation]: 2.873e-05, [1] [Cycle 1]: 2.211e-05, [1] [cse]: 1.281e-05 [environ_conv]: 8.66997e-06 [swap_dp_allreduce_reducescatter]: 7.70998e-06 [bias_add_comm_swap]: 4.68999e-06 [label_micro_interleaved_index]: 6.74999e-06 [label_fine_grained_interleaved_index]: 4.89e-06 [merge_cast_opt]: 3.71999e-06 [slice_recompute_activation]: 4.53999e-06 [micro_interleaved_order_control]: 4.83001e-06 [assign_add_opt]: 3.8e-06 [ForceFp32Comm]: 3.16001e-06 [remove_cast_before_assign_add]: 3.66999e-06 [full_micro_interleaved_order_control]: 4.68999e-06 [reorder_send_recv_between_fp_bp]: 5.49e-06 [comm_op_add_attrs]: 3.83001e-06 [add_comm_op_reuse_tag]: 3.2e-06 [interleave_split_concat_branches]: 3.53e-06 [interleave_parallel_branches]: 3.48999e-06 [overlap_opt_shard_in_pipeline]: 3.85e-06 [overlap_opt_shard_grad_in_pipeline]: 4.32998e-06 [control_data_broadcast_order]: 1.637e-05 [grouped_pairwise_exchange_alltoall]: 4.19997e-06 [offloading_packed_experts]: 7.43999e-06 [overlap_recompute_and_grad_model_parallel]: 7.16001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.48999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.73999e-06 [overlap_recompute_comm]: 5.43002e-06 [overlap_grad_ring_attention]: 6.76999e-06 [overlap_grad_flash_sp]: 2.363e-05 [begin_end_overlap_inline]: 3.21999e-06 [split_matmul_comm_elemetwise]: 4.45e-06 [split_layernorm_comm]: 3.95e-06 [handle_group_info]: 3.2e-06 [symbol_engine_optimizer]: 0.00010347, [1] [Cycle 1]: 9.628e-05, [6] [build]: 3.28e-06 [elim_shapecalc]: 1.195e-05 [elim_not_effective]: 1.361e-05 [opt_reshape]: 7.21001e-06 [fold_const_symbol]: 1.028e-05 [renormalize]: 1.50001e-07 [detach_backward]: 4.23999e-06 [pipeline_parallel_scheduler]: 1.74998e-06 [auto_monad_reorder]: 2.018e-05 [get_jit_bprop_graph]: 1.77001e-06 [rewriter_after_jit_bprop_graph]: 4.79e-06 [opt_after_jit_grad]: 0.00050093 [validate]: 6.393e-05 Sums bootstrap : 0.000444s : 4.55% type_inference : 0.004822s : 49.41% event_method : 0.000012s : 0.13% auto_monad : 0.000056s : 0.57% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000026s : 0.27% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000021s : 0.22% optimize.rewriter_before_opt_a : 0.000050s : 0.51% optimize.opt_a.expand_dump_flag : 0.000003s : 0.04% optimize.opt_a.switch_simplify : 0.000032s : 0.33% optimize.opt_a.loop_unroll : 0.000020s : 0.21% optimize.opt_a.a_1 : 0.000477s : 4.89% optimize.opt_a.with_stream_mark : 0.000034s : 0.34% optimize.opt_a.recompute_prepare : 0.000017s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.04% optimize.opt_a.a_2 : 0.000237s : 2.43% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.15% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.15% optimize.opt_a.auto_parallel : 0.000014s : 0.14% optimize.opt_a.parallel : 0.000024s : 0.25% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000021s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000021s : 0.21% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000567s : 5.81% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.26% optimize.opt_a.cse : 0.000048s : 0.49% optimize.opt_a.a_3 : 0.000113s : 1.16% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000041s : 0.42% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000524s : 5.37% optimize.opt_b.b_1 : 0.000167s : 1.72% optimize.opt_b.b_2 : 0.000007s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000030s : 0.31% optimize.loop_unroll : 0.000462s : 4.73% optimize.opt_after_cconv.c_1 : 0.000029s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.18% optimize.tuple_transform.d_1 : 0.000042s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000051s : 0.52% optimize.cse_after_recomputation.cse : 0.000013s : 0.13% optimize.environ_conv : 0.000009s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.06% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000016s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.08% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.06% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000024s : 0.24% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.21% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000501s : 5.13% validate : 0.000064s : 0.66% Time group info: ------[substitution.] 0.000185 23 40.27% : 0.000074s : 4: substitution.arithmetic_simplify 1.04% : 0.000002s : 2: substitution.elim_not_effective 0.72% : 0.000001s : 2: substitution.fold_const_symbol 3.05% : 0.000006s : 3: substitution.graph_param_transform 47.85% : 0.000088s : 2: substitution.inline 2.53% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.50% : 0.000005s : 4: substitution.remove_not_recompute_node 2.04% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004770 2 91.16% : 0.004349s : 1: type_inference.infer 8.84% : 0.000422s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000087 2 100.00% : 0.000087s : 2: match.inline ------[predicate.] 0.000138 754 0.82% : 0.000001s : 7: predicate.accumulaten_eliminater 1.01% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.63% : 0.000001s : 6: predicate.addn_check_dump 0.85% : 0.000001s : 7: predicate.addn_zero_filter 0.67% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.88% : 0.000004s : 13: predicate.arithmetic_simplify 0.74% : 0.000001s : 7: predicate.cast_eliminate 0.99% : 0.000001s : 6: predicate.check_bprop_eliminate 0.67% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.93% : 0.000001s : 6: predicate.depend_value_elim 0.74% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.90% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.72% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.26% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 3: predicate.elim_not_effective 0.48% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.03% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.00% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.93% : 0.000001s : 10: predicate.environ_get_depend_swap 1.91% : 0.000003s : 16: predicate.environ_get_eliminate 0.93% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.89% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.97% : 0.000003s : 9: predicate.float_depend_g_call 0.64% : 0.000001s : 6: predicate.float_environ_get_switch 1.03% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 3: predicate.fold_const_symbol 0.76% : 0.000001s : 6: predicate.get_grad_eliminate 0.22% : 0.000000s : 3: predicate.graph_param_transform 0.82% : 0.000001s : 6: predicate.incorporate_call 0.66% : 0.000001s : 6: predicate.incorporate_call_switch 7.09% : 0.000010s : 34: predicate.inline 1.00% : 0.000001s : 6: predicate.inline_without_move 0.40% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.04% : 0.000001s : 6: predicate.less_batch_normalization 1.59% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.99% : 0.000003s : 20: predicate.load_eliminater 1.62% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.79% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.80% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.79% : 0.000001s : 6: predicate.merge_addn 0.74% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.67% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.66% : 0.000001s : 7: predicate.minmaximum_grad 1.76% : 0.000002s : 3: predicate.mutable_eliminate 0.48% : 0.000001s : 3: predicate.opt_reshape 0.44% : 0.000001s : 3: predicate.parallel_virtual_node 1.24% : 0.000002s : 9: predicate.partial_defer_inline 1.19% : 0.000002s : 10: predicate.partial_eliminate 0.74% : 0.000001s : 7: predicate.print_const_string_wrapper 0.71% : 0.000001s : 6: predicate.reduce_all_const_elim 0.96% : 0.000001s : 7: predicate.reduce_eliminate 2.06% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.62% : 0.000001s : 6: predicate.remove_not_recompute_node 1.14% : 0.000002s : 13: predicate.replace_applicator 0.67% : 0.000001s : 6: predicate.replace_old_param 0.35% : 0.000000s : 3: predicate.reset_defer_inline 0.81% : 0.000001s : 7: predicate.reshape_eliminate 0.77% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.57% : 0.000001s : 3: predicate.row_tensor_eliminate 1.17% : 0.000002s : 6: predicate.same_eliminate 0.65% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.96% : 0.000001s : 6: predicate.shard_identity_eliminate 0.97% : 0.000001s : 6: predicate.special_op_eliminate 1.11% : 0.000002s : 6: predicate.specialize_transform 1.11% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.99% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.00% : 0.000001s : 9: predicate.switch_defer_inline 1.68% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.33% : 0.000006s : 32: predicate.switch_simplify 0.75% : 0.000001s : 7: predicate.tile_eliminate 0.81% : 0.000001s : 7: predicate.transpose_eliminate 1.49% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.50% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.29% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.20% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.40% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.52% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.97% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.91% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.35% : 0.000000s : 3: predicate.value_based_eliminate 0.72% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.80% : 0.000001s : 6: predicate.virtual_output_eliminate 0.37% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.72% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000214 5 8.46% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.54% : 0.000196s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024603 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.23% : 0.003256s : 1: add_attr 13.18% : 0.003242s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000055s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.26% : 0.000064s : 1: auto_monad 0.11% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000007s : 1: bias_add_comm_swap 1.98% : 0.000488s : 1: bootstrap 0.14% : 0.000034s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000032s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.09% : 0.000022s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.09% : 0.000023s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.90% : 0.000469s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.16% : 0.000531s : 1: mutable_eliminate 0.04% : 0.000011s : 1: offloading_packed_experts 0.07% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 3.60% : 0.000885s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.10% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000098s : 28: opt.transform.opt_b 0.18% : 0.000045s : 2: opt.transform.opt_trans_graph 0.16% : 0.000038s : 4: opt.transform.symbol_engine_opt 10.99% : 0.002705s : 1: opt_a 0.55% : 0.000134s : 1: opt_after_cconv 2.08% : 0.000512s : 1: opt_after_jit_grad 1.16% : 0.000286s : 1: opt_b 21.70% : 0.005340s : 1: optimize 0.10% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.11% : 0.000027s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000033s : 1: pre_auto_parallel 0.10% : 0.000025s : 1: py_interpret_to_execute 0.07% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000021s : 1: remove_dup_value 1.37% : 0.000337s : 1: renormalize.infer 0.90% : 0.000221s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000044s : 1: rewriter_after_opt_a 0.22% : 0.000053s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.43% : 0.000107s : 1: symbol_engine_optimizer 0.38% : 0.000093s : 1: tuple_transform 19.71% : 0.004849s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:46.782.166 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0136671, [21] [bootstrap]: 0.00044339 [type_inference]: 0.00487507 [event_method]: 1.23e-05 [auto_monad]: 5.6e-05 [graph_reusing]: 5.37001e-06 [inline]: 2.48998e-06 [add_attr]: 0.0031378, [1] [add_attr_with_inline]: 0.00312901, [1] [Cycle 1]: 5.435e-05, [2] [tag_attr]: 1.491e-05 [meta_addattr_fg_expand]: 4.47e-06 [parallel-infer-symbol]: 3.78001e-06 [pre_auto_parallel]: 2.685e-05 [insert-virtual-dataset]: 2.68998e-06 [parallel-infer-symbol-second]: 6.50005e-07 [dataset_repeat_opt]: 1.84e-06 [pipeline_split]: 1.87999e-06 [optimize]: 0.00438949, [53] [py_interpret_to_execute]: 1.897e-05 [rewriter_before_opt_a]: 4.61e-05 [opt_a]: 0.00237971, [2] [Cycle 1]: 0.00170317, [45] [expand_dump_flag]: 2.67001e-06 [switch_simplify]: 2.555e-05 [loop_unroll]: 1.357e-05 [a_1]: 0.00038819 [with_stream_mark]: 1.914e-05 [recompute_prepare]: 8.92999e-06 [updatestate_depend_eliminate]: 4.25e-06 [updatestate_assign_eliminate]: 3.24001e-06 [updatestate_loads_eliminate]: 3.23e-06 [parameter_eliminate]: 2.06e-06 [a_2]: 9.255e-05 [accelerated_algorithm]: 7.77e-06 [shard]: 2.93e-06 [meta_shard_fg_expand]: 1.82001e-06 [shard_inline]: 6.66e-06 [merge_send_recv]: 9.13002e-06 [auto_parallel]: 6.30002e-06 [parallel]: 1.921e-05 [flash_sp]: 9.85002e-06 [merge_comm]: 3.81001e-06 [allreduce_fusion]: 3.3e-06 [matmul_add_comm_reduction]: 8.78001e-06 [allreduce_slice_to_reducescatter]: 7.79983e-07 [virtual_shard_identity]: 8.28999e-06 [virtual_dataset]: 6.21e-06 [get_grad_eliminate_]: 6.01e-06 [virtual_output]: 6.13998e-06 [merge_forward]: 4.17e-06 [cell_reuse_recompute_pass]: 1.60999e-06 [offload_activation]: 1.025e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.211e-05 [merge_recompute_call_nodes]: 1.67999e-06 [before_grad]: 1.077e-05 [set_forward_comm_id_for_comm_node_pass]: 3.65998e-06 [meta_fg_expand]: 2.89001e-06 [flash_sp_send_recv_attached]: 2.69001e-06 [receive_attached]: 2.42001e-06 [after_resolve]: 1.011e-05 [a_after_grad]: 9.19e-06 [renormalize]: 0.0006125 [add_forward_monad_depend]: 5.19e-06 [auto_monad_grad]: 2.54001e-06 [auto_monad_eliminator]: 1.564e-05 [cse]: 3.124e-05 [a_3]: 4.622e-05 [Cycle 2]: 0.00066704, [45] [expand_dump_flag]: 1.12e-06 [switch_simplify]: 7.55003e-06 [loop_unroll]: 5.53997e-06 [a_1]: 0.00013036 [with_stream_mark]: 1.316e-05 [recompute_prepare]: 6.69999e-06 [updatestate_depend_eliminate]: 3.17002e-06 [updatestate_assign_eliminate]: 2.29001e-06 [updatestate_loads_eliminate]: 3.03e-06 [parameter_eliminate]: 9.60019e-07 [a_2]: 8.207e-05 [accelerated_algorithm]: 6.30002e-06 [shard]: 1.45001e-06 [meta_shard_fg_expand]: 1.42e-06 [shard_inline]: 6.12999e-06 [merge_send_recv]: 5.30999e-06 [auto_parallel]: 5.72001e-06 [parallel]: 5.30999e-06 [flash_sp]: 3.71999e-06 [merge_comm]: 3.25002e-06 [allreduce_fusion]: 3.65e-06 [matmul_add_comm_reduction]: 6.31e-06 [allreduce_slice_to_reducescatter]: 4.89992e-07 [virtual_shard_identity]: 6.82002e-06 [virtual_dataset]: 5.74e-06 [get_grad_eliminate_]: 5.46e-06 [virtual_output]: 5.43002e-06 [merge_forward]: 2.95998e-06 [cell_reuse_recompute_pass]: 1.56998e-06 [offload_activation]: 7.45998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.132e-05 [merge_recompute_call_nodes]: 1.02e-06 [before_grad]: 9.67001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.9e-06 [meta_fg_expand]: 2.32001e-06 [flash_sp_send_recv_attached]: 1.15001e-06 [receive_attached]: 1.12999e-06 [after_resolve]: 8.67998e-06 [a_after_grad]: 8.55999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.94e-06 [auto_monad_grad]: 1.35999e-06 [auto_monad_eliminator]: 8.95001e-06 [cse]: 1.641e-05 [a_3]: 3.547e-05 [py_interpret_to_execute_after_opt_a]: 8.85001e-06 [slice_cell_reuse_recomputed_activation]: 2.58e-06 [rewriter_after_opt_a]: 3.629e-05 [convert_after_rewriter]: 6.32001e-06 [order_py_execute_after_rewriter]: 5.29e-06 [mutable_eliminate]: 0.00049466 [opt_b]: 0.00020462, [1] [Cycle 1]: 0.00019817, [7] [b_1]: 0.0001193 [b_2]: 7.21001e-06 [updatestate_depend_eliminate]: 6.19001e-06 [updatestate_assign_eliminate]: 2.39999e-06 [updatestate_loads_eliminate]: 2.73998e-06 [renormalize]: 6.00005e-07 [cse]: 2.297e-05 [optimize_parallel_all_gather_comm]: 1.767e-05 [overlap_param_gather]: 2.58998e-06 [cconv]: 2.808e-05 [loop_unroll]: 0.00043752 [opt_after_cconv]: 0.00010062, [1] [Cycle 1]: 9.464e-05, [7] [c_1]: 2.697e-05 [parameter_eliminate]: 2.69999e-06 [updatestate_depend_eliminate]: 6.26e-06 [updatestate_assign_eliminate]: 2.61e-06 [updatestate_loads_eliminate]: 2.53e-06 [cse]: 1.899e-05 [renormalize]: 5.69999e-07 [remove_dup_value]: 1.468e-05 [tuple_transform]: 7.049e-05, [1] [Cycle 1]: 6.629e-05, [4] [d_1]: 3.923e-05 [none_parameter_eliminate]: 1.55001e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 6.51e-06 [partial_unused_args_eliminate]: 1.84e-06 [add_recomputation]: 4.761e-05 [cse_after_recomputation]: 2.101e-05, [1] [Cycle 1]: 1.646e-05, [1] [cse]: 1.106e-05 [environ_conv]: 5.17e-06 [swap_dp_allreduce_reducescatter]: 5.19e-06 [bias_add_comm_swap]: 2.88e-06 [label_micro_interleaved_index]: 4.3e-06 [label_fine_grained_interleaved_index]: 2.64999e-06 [merge_cast_opt]: 1.64998e-06 [slice_recompute_activation]: 2.31e-06 [micro_interleaved_order_control]: 2.72001e-06 [assign_add_opt]: 1.29998e-06 [ForceFp32Comm]: 8.89995e-07 [remove_cast_before_assign_add]: 1.06002e-06 [full_micro_interleaved_order_control]: 2.01e-06 [reorder_send_recv_between_fp_bp]: 2.62001e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 1.32e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.06002e-06 [overlap_opt_shard_in_pipeline]: 1.15001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.96e-06 [control_data_broadcast_order]: 1.284e-05 [grouped_pairwise_exchange_alltoall]: 1.89999e-06 [offloading_packed_experts]: 4.08001e-06 [overlap_recompute_and_grad_model_parallel]: 4.72e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.10999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32999e-06 [overlap_recompute_comm]: 2.18002e-06 [overlap_grad_ring_attention]: 4.75999e-06 [overlap_grad_flash_sp]: 1.918e-05 [begin_end_overlap_inline]: 7.09988e-07 [split_matmul_comm_elemetwise]: 2.26e-06 [split_layernorm_comm]: 2.02999e-06 [handle_group_info]: 1.17999e-06 [symbol_engine_optimizer]: 7.983e-05, [1] [Cycle 1]: 7.53e-05, [6] [build]: 3.18e-06 [elim_shapecalc]: 1.031e-05 [elim_not_effective]: 1.38e-05 [opt_reshape]: 7.1e-06 [fold_const_symbol]: 1.023e-05 [renormalize]: 1.8999e-07 [detach_backward]: 1.89999e-06 [pipeline_parallel_scheduler]: 1.72001e-06 [auto_monad_reorder]: 1.686e-05 [get_jit_bprop_graph]: 1.89999e-06 [rewriter_after_jit_bprop_graph]: 4.65001e-06 [opt_after_jit_grad]: 0.00048927 [validate]: 3.838e-05 Sums bootstrap : 0.000443s : 4.66% type_inference : 0.004875s : 51.27% event_method : 0.000012s : 0.13% auto_monad : 0.000056s : 0.59% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.05% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000027s : 0.28% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000019s : 0.20% optimize.rewriter_before_opt_a : 0.000046s : 0.48% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.35% optimize.opt_a.loop_unroll : 0.000019s : 0.20% optimize.opt_a.a_1 : 0.000519s : 5.45% optimize.opt_a.with_stream_mark : 0.000032s : 0.34% optimize.opt_a.recompute_prepare : 0.000016s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000175s : 1.84% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.15% optimize.opt_a.shard : 0.000004s : 0.05% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.15% optimize.opt_a.auto_parallel : 0.000012s : 0.13% optimize.opt_a.parallel : 0.000025s : 0.26% optimize.opt_a.flash_sp : 0.000014s : 0.14% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000019s : 0.20% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000613s : 6.44% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.26% optimize.opt_a.cse : 0.000048s : 0.50% optimize.opt_a.a_3 : 0.000082s : 0.86% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.03% optimize.rewriter_after_opt_a : 0.000036s : 0.38% optimize.convert_after_rewriter : 0.000006s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.06% optimize.mutable_eliminate : 0.000495s : 5.20% optimize.opt_b.b_1 : 0.000119s : 1.25% optimize.opt_b.b_2 : 0.000007s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.19% optimize.overlap_param_gather : 0.000003s : 0.03% optimize.cconv : 0.000028s : 0.30% optimize.loop_unroll : 0.000438s : 4.60% optimize.opt_after_cconv.c_1 : 0.000027s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000019s : 0.20% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000015s : 0.15% optimize.tuple_transform.d_1 : 0.000039s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000048s : 0.50% optimize.cse_after_recomputation.cse : 0.000011s : 0.12% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.02% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.03% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000019s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000017s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000489s : 5.15% validate : 0.000038s : 0.40% Time group info: ------[substitution.] 0.000237 23 30.66% : 0.000073s : 4: substitution.arithmetic_simplify 0.91% : 0.000002s : 2: substitution.elim_not_effective 0.60% : 0.000001s : 2: substitution.fold_const_symbol 2.41% : 0.000006s : 3: substitution.graph_param_transform 60.42% : 0.000143s : 2: substitution.inline 1.74% : 0.000004s : 4: substitution.j_node_and_user_rematch 1.92% : 0.000005s : 4: substitution.remove_not_recompute_node 1.34% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004823 2 90.06% : 0.004344s : 1: type_inference.infer 9.94% : 0.000480s : 1: type_inference.specialize ------[replace.] 0.000024 2 100.00% : 0.000024s : 2: replace.inline ------[match.] 0.000142 2 100.00% : 0.000142s : 2: match.inline ------[predicate.] 0.000136 754 0.79% : 0.000001s : 7: predicate.accumulaten_eliminater 1.20% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.64% : 0.000001s : 6: predicate.addn_check_dump 0.78% : 0.000001s : 7: predicate.addn_zero_filter 0.65% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 3.08% : 0.000004s : 13: predicate.arithmetic_simplify 0.80% : 0.000001s : 7: predicate.cast_eliminate 0.70% : 0.000001s : 6: predicate.check_bprop_eliminate 0.67% : 0.000001s : 6: predicate.compare_switch_simplify 0.25% : 0.000000s : 3: predicate.const_output_eliminate 0.70% : 0.000001s : 6: predicate.depend_value_elim 0.79% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.87% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.73% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.35% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 3: predicate.elim_not_effective 0.64% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 10: predicate.environ_add_const_eliminate 1.00% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.92% : 0.000001s : 10: predicate.environ_get_depend_swap 1.74% : 0.000002s : 16: predicate.environ_get_eliminate 0.95% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.92% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.88% : 0.000003s : 9: predicate.float_depend_g_call 0.65% : 0.000001s : 6: predicate.float_environ_get_switch 0.92% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 3: predicate.fold_const_symbol 0.75% : 0.000001s : 6: predicate.get_grad_eliminate 0.42% : 0.000001s : 3: predicate.graph_param_transform 0.81% : 0.000001s : 6: predicate.incorporate_call 0.64% : 0.000001s : 6: predicate.incorporate_call_switch 6.77% : 0.000009s : 34: predicate.inline 1.31% : 0.000002s : 6: predicate.inline_without_move 0.40% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.14% : 0.000002s : 6: predicate.less_batch_normalization 1.72% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.07% : 0.000003s : 20: predicate.load_eliminater 1.53% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.64% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.71% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.64% : 0.000001s : 6: predicate.merge_addn 0.64% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.76% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 7: predicate.minmaximum_grad 1.58% : 0.000002s : 3: predicate.mutable_eliminate 0.41% : 0.000001s : 3: predicate.opt_reshape 0.43% : 0.000001s : 3: predicate.parallel_virtual_node 1.30% : 0.000002s : 9: predicate.partial_defer_inline 1.19% : 0.000002s : 10: predicate.partial_eliminate 0.81% : 0.000001s : 7: predicate.print_const_string_wrapper 0.70% : 0.000001s : 6: predicate.reduce_all_const_elim 0.94% : 0.000001s : 7: predicate.reduce_eliminate 2.04% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.59% : 0.000001s : 6: predicate.remove_not_recompute_node 1.12% : 0.000002s : 13: predicate.replace_applicator 0.70% : 0.000001s : 6: predicate.replace_old_param 0.38% : 0.000001s : 3: predicate.reset_defer_inline 0.78% : 0.000001s : 7: predicate.reshape_eliminate 0.68% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 3: predicate.row_tensor_eliminate 0.95% : 0.000001s : 6: predicate.same_eliminate 0.53% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.22% : 0.000002s : 6: predicate.shard_identity_eliminate 0.84% : 0.000001s : 6: predicate.special_op_eliminate 0.95% : 0.000001s : 6: predicate.specialize_transform 1.09% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.86% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.96% : 0.000001s : 9: predicate.switch_defer_inline 1.69% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.52% : 0.000006s : 32: predicate.switch_simplify 0.71% : 0.000001s : 7: predicate.tile_eliminate 0.78% : 0.000001s : 7: predicate.transpose_eliminate 1.49% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.39% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.43% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.91% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.45% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.91% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.86% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.67% : 0.000001s : 3: predicate.value_based_eliminate 0.75% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.72% : 0.000001s : 6: predicate.virtual_output_eliminate 0.32% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.65% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000224 5 8.57% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.43% : 0.000205s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022844 192 0.02% : 0.000004s : 1: ForceFp32Comm 13.76% : 0.003143s : 1: add_attr 13.71% : 0.003133s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000052s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000061s : 1: auto_monad 0.09% : 0.000021s : 1: auto_monad_reorder 0.02% : 0.000004s : 1: begin_end_overlap_inline 0.03% : 0.000006s : 1: bias_add_comm_swap 2.07% : 0.000473s : 1: bootstrap 0.14% : 0.000032s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000009s : 1: convert_after_rewriter 0.10% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000008s : 1: environ_conv 0.08% : 0.000019s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.95% : 0.000446s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.03% : 0.000006s : 1: micro_interleaved_order_control 2.21% : 0.000505s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 3.97% : 0.000907s : 78: opt.transform.opt_a 0.11% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.41% : 0.000094s : 28: opt.transform.opt_b 0.19% : 0.000043s : 2: opt.transform.opt_trans_graph 0.16% : 0.000037s : 4: opt.transform.symbol_engine_opt 10.43% : 0.002383s : 1: opt_a 0.46% : 0.000104s : 1: opt_after_cconv 2.19% : 0.000500s : 1: opt_after_jit_grad 0.91% : 0.000208s : 1: opt_b 19.24% : 0.004395s : 1: optimize 0.09% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.14% : 0.000031s : 1: pre_auto_parallel 0.10% : 0.000022s : 1: py_interpret_to_execute 0.05% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000018s : 1: remove_dup_value 1.60% : 0.000366s : 1: renormalize.infer 1.04% : 0.000238s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000040s : 1: rewriter_after_opt_a 0.22% : 0.000050s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.20% : 0.000046s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000083s : 1: symbol_engine_optimizer 0.32% : 0.000073s : 1: tuple_transform 21.41% : 0.004892s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:46.976.973 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:46.977.252 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0147305, [21] [bootstrap]: 0.00047635 [type_inference]: 0.00485032 [event_method]: 1.243e-05 [auto_monad]: 5.484e-05 [graph_reusing]: 5.34998e-06 [inline]: 2.68e-06 [add_attr]: 0.00316237, [1] [add_attr_with_inline]: 0.00315307, [1] [Cycle 1]: 7.184e-05, [2] [tag_attr]: 1.464e-05 [meta_addattr_fg_expand]: 3.43e-06 [parallel-infer-symbol]: 3.00002e-06 [pre_auto_parallel]: 2.509e-05 [insert-virtual-dataset]: 2.32999e-06 [parallel-infer-symbol-second]: 7.29982e-07 [dataset_repeat_opt]: 1.89e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00496526, [53] [py_interpret_to_execute]: 2.242e-05 [rewriter_before_opt_a]: 5.06e-05 [opt_a]: 0.00269293, [2] [Cycle 1]: 0.00183538, [45] [expand_dump_flag]: 2.89999e-06 [switch_simplify]: 2.541e-05 [loop_unroll]: 1.345e-05 [a_1]: 0.00032835 [with_stream_mark]: 1.652e-05 [recompute_prepare]: 9.77001e-06 [updatestate_depend_eliminate]: 3.88001e-06 [updatestate_assign_eliminate]: 3.6e-06 [updatestate_loads_eliminate]: 3.07002e-06 [parameter_eliminate]: 2.02001e-06 [a_2]: 0.00012123 [accelerated_algorithm]: 8.05e-06 [shard]: 2.37001e-06 [meta_shard_fg_expand]: 1.81e-06 [shard_inline]: 6.71999e-06 [merge_send_recv]: 9.05001e-06 [auto_parallel]: 7.01001e-06 [parallel]: 1.901e-05 [flash_sp]: 8.60999e-06 [merge_comm]: 4.13001e-06 [allreduce_fusion]: 3.73999e-06 [matmul_add_comm_reduction]: 9.27001e-06 [allreduce_slice_to_reducescatter]: 5.60016e-07 [virtual_shard_identity]: 8.82999e-06 [virtual_dataset]: 6.56999e-06 [get_grad_eliminate_]: 6.26998e-06 [virtual_output]: 5.98002e-06 [merge_forward]: 3.93001e-06 [cell_reuse_recompute_pass]: 1.36002e-06 [offload_activation]: 1.007e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.476e-05 [merge_recompute_call_nodes]: 1.67001e-06 [before_grad]: 1.092e-05 [set_forward_comm_id_for_comm_node_pass]: 3.78001e-06 [meta_fg_expand]: 2.61999e-06 [flash_sp_send_recv_attached]: 2.79001e-06 [receive_attached]: 2.34001e-06 [after_resolve]: 1.014e-05 [a_after_grad]: 1.03e-05 [renormalize]: 0.00061706 [add_forward_monad_depend]: 5.73002e-06 [auto_monad_grad]: 2.46e-06 [auto_monad_eliminator]: 1.592e-05 [cse]: 2.879e-05 [a_3]: 5.99e-05 [Cycle 2]: 0.00084449, [45] [expand_dump_flag]: 1.00001e-06 [switch_simplify]: 8.07e-06 [loop_unroll]: 6.01e-06 [a_1]: 0.00013013 [with_stream_mark]: 1.239e-05 [recompute_prepare]: 6.49001e-06 [updatestate_depend_eliminate]: 3.25998e-06 [updatestate_assign_eliminate]: 2.51998e-06 [updatestate_loads_eliminate]: 2.86999e-06 [parameter_eliminate]: 1.17e-06 [a_2]: 0.00010881 [accelerated_algorithm]: 6.46999e-06 [shard]: 1.66e-06 [meta_shard_fg_expand]: 1.50999e-06 [shard_inline]: 6.52001e-06 [merge_send_recv]: 5.41998e-06 [auto_parallel]: 5.91e-06 [parallel]: 5.81998e-06 [flash_sp]: 3.64002e-06 [merge_comm]: 3.65e-06 [allreduce_fusion]: 3.35998e-06 [matmul_add_comm_reduction]: 7.04001e-06 [allreduce_slice_to_reducescatter]: 5.09986e-07 [virtual_shard_identity]: 6.91999e-06 [virtual_dataset]: 5.72001e-06 [get_grad_eliminate_]: 5.52999e-06 [virtual_output]: 5.70001e-06 [merge_forward]: 3.02002e-06 [cell_reuse_recompute_pass]: 1.91e-06 [offload_activation]: 7.65998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.318e-05 [merge_recompute_call_nodes]: 1.06997e-06 [before_grad]: 1.008e-05 [set_forward_comm_id_for_comm_node_pass]: 4.08999e-06 [meta_fg_expand]: 2.37001e-06 [flash_sp_send_recv_attached]: 9.29984e-07 [receive_attached]: 9.70002e-07 [after_resolve]: 9.12999e-06 [a_after_grad]: 8.79998e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.79998e-06 [auto_monad_grad]: 1.07998e-06 [auto_monad_eliminator]: 8.89e-06 [cse]: 1.74e-05 [a_3]: 4.763e-05 [py_interpret_to_execute_after_opt_a]: 1.279e-05 [slice_cell_reuse_recomputed_activation]: 4.85001e-06 [rewriter_after_opt_a]: 3.83e-05 [convert_after_rewriter]: 1.051e-05 [order_py_execute_after_rewriter]: 8.2e-06 [mutable_eliminate]: 0.00052052 [opt_b]: 0.0002898, [1] [Cycle 1]: 0.00028043, [7] [b_1]: 0.00015982 [b_2]: 7.85e-06 [updatestate_depend_eliminate]: 6.48e-06 [updatestate_assign_eliminate]: 2.57001e-06 [updatestate_loads_eliminate]: 2.44001e-06 [renormalize]: 5.19998e-07 [cse]: 2.027e-05 [optimize_parallel_all_gather_comm]: 2.082e-05 [overlap_param_gather]: 4.79e-06 [cconv]: 2.995e-05 [loop_unroll]: 0.00044803 [opt_after_cconv]: 0.00012615, [1] [Cycle 1]: 0.000118, [7] [c_1]: 2.807e-05 [parameter_eliminate]: 2.99001e-06 [updatestate_depend_eliminate]: 5.58997e-06 [updatestate_assign_eliminate]: 2.58998e-06 [updatestate_loads_eliminate]: 2.66999e-06 [cse]: 1.854e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.754e-05 [tuple_transform]: 8.618e-05, [1] [Cycle 1]: 7.973e-05, [4] [d_1]: 4.091e-05 [none_parameter_eliminate]: 1.86e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 6.54001e-06 [partial_unused_args_eliminate]: 5.27999e-06 [add_recomputation]: 5.268e-05 [cse_after_recomputation]: 2.894e-05, [1] [Cycle 1]: 2.186e-05, [1] [cse]: 1.232e-05 [environ_conv]: 8.57e-06 [swap_dp_allreduce_reducescatter]: 7.96001e-06 [bias_add_comm_swap]: 4.94e-06 [label_micro_interleaved_index]: 7.35e-06 [label_fine_grained_interleaved_index]: 5.02999e-06 [merge_cast_opt]: 4.07e-06 [slice_recompute_activation]: 4.70999e-06 [micro_interleaved_order_control]: 4.61002e-06 [assign_add_opt]: 3.55e-06 [ForceFp32Comm]: 3.46999e-06 [remove_cast_before_assign_add]: 3.46999e-06 [full_micro_interleaved_order_control]: 4.75001e-06 [reorder_send_recv_between_fp_bp]: 5.33002e-06 [comm_op_add_attrs]: 3.58999e-06 [add_comm_op_reuse_tag]: 3.31001e-06 [interleave_split_concat_branches]: 3.80998e-06 [interleave_parallel_branches]: 3.76001e-06 [overlap_opt_shard_in_pipeline]: 3.53999e-06 [overlap_opt_shard_grad_in_pipeline]: 5.05001e-06 [control_data_broadcast_order]: 1.635e-05 [grouped_pairwise_exchange_alltoall]: 3.90998e-06 [offloading_packed_experts]: 7.39002e-06 [overlap_recompute_and_grad_model_parallel]: 7.28999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.48999e-06 [overlap_recompute_allgather_and_fa_grad]: 4.05998e-06 [overlap_recompute_comm]: 4.95001e-06 [overlap_grad_ring_attention]: 6.52001e-06 [overlap_grad_flash_sp]: 2.211e-05 [begin_end_overlap_inline]: 2.90998e-06 [split_matmul_comm_elemetwise]: 4.93001e-06 [split_layernorm_comm]: 4.53001e-06 [handle_group_info]: 3.32997e-06 [symbol_engine_optimizer]: 9.668e-05, [1] [Cycle 1]: 9.017e-05, [6] [build]: 3.53999e-06 [elim_shapecalc]: 9.50001e-06 [elim_not_effective]: 1.279e-05 [opt_reshape]: 7.13e-06 [fold_const_symbol]: 9.94001e-06 [renormalize]: 3.00002e-07 [detach_backward]: 3.97e-06 [pipeline_parallel_scheduler]: 2.81999e-06 [auto_monad_reorder]: 1.923e-05 [get_jit_bprop_graph]: 1.95001e-06 [rewriter_after_jit_bprop_graph]: 4.61002e-06 [opt_after_jit_grad]: 0.00049915 [validate]: 3.859e-05 Sums bootstrap : 0.000476s : 4.88% type_inference : 0.004850s : 49.69% event_method : 0.000012s : 0.13% auto_monad : 0.000055s : 0.56% graph_reusing : 0.000005s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000003s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000025s : 0.26% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000022s : 0.23% optimize.rewriter_before_opt_a : 0.000051s : 0.52% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.34% optimize.opt_a.loop_unroll : 0.000019s : 0.20% optimize.opt_a.a_1 : 0.000458s : 4.70% optimize.opt_a.with_stream_mark : 0.000029s : 0.30% optimize.opt_a.recompute_prepare : 0.000016s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000230s : 2.36% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.15% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.14% optimize.opt_a.merge_send_recv : 0.000014s : 0.15% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000025s : 0.25% optimize.opt_a.flash_sp : 0.000012s : 0.13% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000021s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000019s : 0.20% optimize.opt_a.a_after_grad : 0.000019s : 0.20% optimize.opt_a.renormalize : 0.000617s : 6.32% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.25% optimize.opt_a.cse : 0.000046s : 0.47% optimize.opt_a.a_3 : 0.000108s : 1.10% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000038s : 0.39% optimize.convert_after_rewriter : 0.000011s : 0.11% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000521s : 5.33% optimize.opt_b.b_1 : 0.000160s : 1.64% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000020s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000030s : 0.31% optimize.loop_unroll : 0.000448s : 4.59% optimize.opt_after_cconv.c_1 : 0.000028s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000019s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.18% optimize.tuple_transform.d_1 : 0.000041s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000053s : 0.54% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000009s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.08% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.04% optimize.remove_cast_before_assign_add : 0.000003s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.05% optimize.control_data_broadcast_order : 0.000016s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.08% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000022s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000005s : 0.05% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000003s : 0.03% auto_monad_reorder : 0.000019s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000499s : 5.11% validate : 0.000039s : 0.40% Time group info: ------[substitution.] 0.000187 23 38.73% : 0.000072s : 4: substitution.arithmetic_simplify 1.02% : 0.000002s : 2: substitution.elim_not_effective 0.76% : 0.000001s : 2: substitution.fold_const_symbol 3.03% : 0.000006s : 3: substitution.graph_param_transform 50.04% : 0.000093s : 2: substitution.inline 1.97% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.46% : 0.000005s : 4: substitution.remove_not_recompute_node 2.00% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004800 2 91.08% : 0.004372s : 1: type_inference.infer 8.92% : 0.000428s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000092 2 100.00% : 0.000092s : 2: match.inline ------[predicate.] 0.000136 754 0.75% : 0.000001s : 7: predicate.accumulaten_eliminater 1.65% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.64% : 0.000001s : 6: predicate.addn_check_dump 0.79% : 0.000001s : 7: predicate.addn_zero_filter 0.64% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 3.06% : 0.000004s : 13: predicate.arithmetic_simplify 0.99% : 0.000001s : 7: predicate.cast_eliminate 0.77% : 0.000001s : 6: predicate.check_bprop_eliminate 0.64% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.77% : 0.000001s : 6: predicate.depend_value_elim 0.78% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.85% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.72% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.19% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 3: predicate.elim_not_effective 0.40% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.02% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.95% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_depend_swap 1.87% : 0.000003s : 16: predicate.environ_get_eliminate 0.94% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.88% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.00% : 0.000003s : 9: predicate.float_depend_g_call 0.68% : 0.000001s : 6: predicate.float_environ_get_switch 0.94% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.34% : 0.000000s : 3: predicate.fold_const_symbol 0.76% : 0.000001s : 6: predicate.get_grad_eliminate 0.24% : 0.000000s : 3: predicate.graph_param_transform 0.85% : 0.000001s : 6: predicate.incorporate_call 0.66% : 0.000001s : 6: predicate.incorporate_call_switch 6.82% : 0.000009s : 34: predicate.inline 1.44% : 0.000002s : 6: predicate.inline_without_move 0.52% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.10% : 0.000001s : 6: predicate.less_batch_normalization 1.60% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.99% : 0.000003s : 20: predicate.load_eliminater 1.08% : 0.000001s : 3: predicate.loop_unroll_after_grad 1.60% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.74% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.72% : 0.000001s : 6: predicate.merge_addn 0.76% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.68% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.68% : 0.000001s : 7: predicate.minmaximum_grad 1.35% : 0.000002s : 3: predicate.mutable_eliminate 0.46% : 0.000001s : 3: predicate.opt_reshape 0.43% : 0.000001s : 3: predicate.parallel_virtual_node 1.31% : 0.000002s : 9: predicate.partial_defer_inline 1.24% : 0.000002s : 10: predicate.partial_eliminate 0.83% : 0.000001s : 7: predicate.print_const_string_wrapper 0.71% : 0.000001s : 6: predicate.reduce_all_const_elim 0.90% : 0.000001s : 7: predicate.reduce_eliminate 2.06% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.63% : 0.000001s : 6: predicate.remove_not_recompute_node 1.18% : 0.000002s : 13: predicate.replace_applicator 0.89% : 0.000001s : 6: predicate.replace_old_param 0.34% : 0.000000s : 3: predicate.reset_defer_inline 0.78% : 0.000001s : 7: predicate.reshape_eliminate 0.75% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 3: predicate.row_tensor_eliminate 0.90% : 0.000001s : 6: predicate.same_eliminate 0.68% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.16% : 0.000002s : 6: predicate.shard_identity_eliminate 0.93% : 0.000001s : 6: predicate.special_op_eliminate 1.05% : 0.000001s : 6: predicate.specialize_transform 1.05% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 1.21% : 0.000002s : 6: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.96% : 0.000001s : 9: predicate.switch_defer_inline 1.86% : 0.000003s : 15: predicate.switch_layer_defer_inline 4.23% : 0.000006s : 32: predicate.switch_simplify 0.74% : 0.000001s : 7: predicate.tile_eliminate 0.79% : 0.000001s : 7: predicate.transpose_eliminate 1.49% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.14% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.50% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.43% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.48% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.94% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.72% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.46% : 0.000001s : 3: predicate.value_based_eliminate 0.95% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.81% : 0.000001s : 6: predicate.virtual_output_eliminate 0.35% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.51% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000220 5 8.50% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.50% : 0.000201s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024455 192 0.03% : 0.000006s : 1: ForceFp32Comm 12.97% : 0.003172s : 1: add_attr 12.91% : 0.003157s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000056s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.26% : 0.000063s : 1: auto_monad 0.11% : 0.000026s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 2.16% : 0.000528s : 1: bootstrap 0.14% : 0.000033s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.06% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000032s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000020s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.09% : 0.000023s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000011s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.86% : 0.000455s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.16% : 0.000528s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 3.49% : 0.000854s : 78: opt.transform.opt_a 0.11% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.39% : 0.000096s : 28: opt.transform.opt_b 0.18% : 0.000045s : 2: opt.transform.opt_trans_graph 0.14% : 0.000035s : 4: opt.transform.symbol_engine_opt 11.03% : 0.002696s : 1: opt_a 0.53% : 0.000130s : 1: opt_after_cconv 2.09% : 0.000510s : 1: opt_after_jit_grad 1.20% : 0.000294s : 1: opt_b 21.75% : 0.005318s : 1: optimize 0.10% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000026s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000032s : 1: pre_auto_parallel 0.11% : 0.000026s : 1: py_interpret_to_execute 0.07% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000007s : 1: remove_cast_before_assign_add 0.09% : 0.000021s : 1: remove_dup_value 1.54% : 0.000376s : 1: renormalize.infer 0.95% : 0.000233s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000042s : 1: rewriter_after_opt_a 0.22% : 0.000054s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.41% : 0.000100s : 1: symbol_engine_optimizer 0.36% : 0.000089s : 1: tuple_transform 19.95% : 0.004879s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:47.173.531 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0143498, [21] [bootstrap]: 0.00044994 [type_inference]: 0.00495013 [event_method]: 1.249e-05 [auto_monad]: 5.705e-05 [graph_reusing]: 6.25002e-06 [inline]: 2.83998e-06 [add_attr]: 0.00342564, [1] [add_attr_with_inline]: 0.00341623, [1] [Cycle 1]: 6.235e-05, [2] [tag_attr]: 1.496e-05 [meta_addattr_fg_expand]: 3.37002e-06 [parallel-infer-symbol]: 3.56001e-06 [pre_auto_parallel]: 3.016e-05 [insert-virtual-dataset]: 2.71999e-06 [parallel-infer-symbol-second]: 7.80012e-07 [dataset_repeat_opt]: 2.63e-06 [pipeline_split]: 1.66998e-06 [optimize]: 0.00467491, [53] [py_interpret_to_execute]: 1.932e-05 [rewriter_before_opt_a]: 5.023e-05 [opt_a]: 0.00250968, [2] [Cycle 1]: 0.00178923, [45] [expand_dump_flag]: 3.26999e-06 [switch_simplify]: 2.768e-05 [loop_unroll]: 1.324e-05 [a_1]: 0.0003391 [with_stream_mark]: 1.97e-05 [recompute_prepare]: 1.005e-05 [updatestate_depend_eliminate]: 3.88001e-06 [updatestate_assign_eliminate]: 3.01001e-06 [updatestate_loads_eliminate]: 3.55998e-06 [parameter_eliminate]: 1.92001e-06 [a_2]: 9.296e-05 [accelerated_algorithm]: 7.89002e-06 [shard]: 2.11e-06 [meta_shard_fg_expand]: 1.74e-06 [shard_inline]: 6.06e-06 [merge_send_recv]: 9.41998e-06 [auto_parallel]: 7.15e-06 [parallel]: 2.039e-05 [flash_sp]: 9.12001e-06 [merge_comm]: 3.90998e-06 [allreduce_fusion]: 3.36001e-06 [matmul_add_comm_reduction]: 1.024e-05 [allreduce_slice_to_reducescatter]: 1.22999e-06 [virtual_shard_identity]: 7.61999e-06 [virtual_dataset]: 6.76e-06 [get_grad_eliminate_]: 5.89e-06 [virtual_output]: 5.77001e-06 [merge_forward]: 4.17998e-06 [cell_reuse_recompute_pass]: 1.16997e-06 [offload_activation]: 9.64999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.562e-05 [merge_recompute_call_nodes]: 1.82001e-06 [before_grad]: 1.081e-05 [set_forward_comm_id_for_comm_node_pass]: 4.25e-06 [meta_fg_expand]: 3.09001e-06 [flash_sp_send_recv_attached]: 2.77002e-06 [receive_attached]: 2.37001e-06 [after_resolve]: 1.13e-05 [a_after_grad]: 1.05e-05 [renormalize]: 0.00071248 [add_forward_monad_depend]: 5.98002e-06 [auto_monad_grad]: 2.46e-06 [auto_monad_eliminator]: 1.824e-05 [cse]: 3.037e-05 [a_3]: 5.036e-05 [Cycle 2]: 0.00070961, [45] [expand_dump_flag]: 1.39998e-06 [switch_simplify]: 7.48e-06 [loop_unroll]: 6.19001e-06 [a_1]: 0.00013736 [with_stream_mark]: 1.316e-05 [recompute_prepare]: 7.06999e-06 [updatestate_depend_eliminate]: 3.81001e-06 [updatestate_assign_eliminate]: 2.46998e-06 [updatestate_loads_eliminate]: 3.09999e-06 [parameter_eliminate]: 1.55001e-06 [a_2]: 8.404e-05 [accelerated_algorithm]: 6.41998e-06 [shard]: 1.46998e-06 [meta_shard_fg_expand]: 2.07999e-06 [shard_inline]: 6.54999e-06 [merge_send_recv]: 6.36998e-06 [auto_parallel]: 6.18998e-06 [parallel]: 6.74001e-06 [flash_sp]: 4.07003e-06 [merge_comm]: 3.55e-06 [allreduce_fusion]: 3.41999e-06 [matmul_add_comm_reduction]: 8.31002e-06 [allreduce_slice_to_reducescatter]: 5.10016e-07 [virtual_shard_identity]: 8.15e-06 [virtual_dataset]: 5.79e-06 [get_grad_eliminate_]: 5.98002e-06 [virtual_output]: 5.54e-06 [merge_forward]: 3.66001e-06 [cell_reuse_recompute_pass]: 1.75001e-06 [offload_activation]: 8.94e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.221e-05 [merge_recompute_call_nodes]: 1.46002e-06 [before_grad]: 9.96e-06 [set_forward_comm_id_for_comm_node_pass]: 4.23999e-06 [meta_fg_expand]: 2.51998e-06 [flash_sp_send_recv_attached]: 9.29984e-07 [receive_attached]: 1.30999e-06 [after_resolve]: 1.012e-05 [a_after_grad]: 9.07999e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.39999e-06 [auto_monad_grad]: 1.08001e-06 [auto_monad_eliminator]: 1.057e-05 [cse]: 1.851e-05 [a_3]: 3.511e-05 [py_interpret_to_execute_after_opt_a]: 1.157e-05 [slice_cell_reuse_recomputed_activation]: 2.68e-06 [rewriter_after_opt_a]: 3.826e-05 [convert_after_rewriter]: 7.56999e-06 [order_py_execute_after_rewriter]: 5.65001e-06 [mutable_eliminate]: 0.00062304 [opt_b]: 0.00021493, [1] [Cycle 1]: 0.00020738, [7] [b_1]: 0.00012123 [b_2]: 7.56999e-06 [updatestate_depend_eliminate]: 8.62998e-06 [updatestate_assign_eliminate]: 3.01001e-06 [updatestate_loads_eliminate]: 2.66e-06 [renormalize]: 7.59988e-07 [cse]: 2.461e-05 [optimize_parallel_all_gather_comm]: 1.871e-05 [overlap_param_gather]: 2.16e-06 [cconv]: 3.139e-05 [loop_unroll]: 0.00045538 [opt_after_cconv]: 0.00010389, [1] [Cycle 1]: 9.789e-05, [7] [c_1]: 2.811e-05 [parameter_eliminate]: 4.64002e-06 [updatestate_depend_eliminate]: 5.51002e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.58e-06 [cse]: 1.979e-05 [renormalize]: 2.80008e-07 [remove_dup_value]: 1.558e-05 [tuple_transform]: 7.565e-05, [1] [Cycle 1]: 7.118e-05, [4] [d_1]: 4.259e-05 [none_parameter_eliminate]: 1.97001e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 7.6e-06 [partial_unused_args_eliminate]: 1.97999e-06 [add_recomputation]: 5.129e-05 [cse_after_recomputation]: 2.319e-05, [1] [Cycle 1]: 1.89e-05, [1] [cse]: 1.218e-05 [environ_conv]: 5.76e-06 [swap_dp_allreduce_reducescatter]: 5.69999e-06 [bias_add_comm_swap]: 2.85002e-06 [label_micro_interleaved_index]: 5.71998e-06 [label_fine_grained_interleaved_index]: 3.06001e-06 [merge_cast_opt]: 1.29998e-06 [slice_recompute_activation]: 2.23002e-06 [micro_interleaved_order_control]: 2.16e-06 [assign_add_opt]: 1.24e-06 [ForceFp32Comm]: 8.09989e-07 [remove_cast_before_assign_add]: 1.31002e-06 [full_micro_interleaved_order_control]: 2.78e-06 [reorder_send_recv_between_fp_bp]: 3.04001e-06 [comm_op_add_attrs]: 1.56002e-06 [add_comm_op_reuse_tag]: 1.09998e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.08001e-06 [overlap_opt_shard_in_pipeline]: 1.15999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.11998e-06 [control_data_broadcast_order]: 1.386e-05 [grouped_pairwise_exchange_alltoall]: 1.66e-06 [offloading_packed_experts]: 4.13001e-06 [overlap_recompute_and_grad_model_parallel]: 5.18002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34998e-06 [overlap_recompute_comm]: 2.27001e-06 [overlap_grad_ring_attention]: 4.03999e-06 [overlap_grad_flash_sp]: 2.127e-05 [begin_end_overlap_inline]: 6.10016e-07 [split_matmul_comm_elemetwise]: 2.70002e-06 [split_layernorm_comm]: 1.79998e-06 [handle_group_info]: 1.04e-06 [symbol_engine_optimizer]: 7.756e-05, [1] [Cycle 1]: 7.329e-05, [6] [build]: 3.41001e-06 [elim_shapecalc]: 1.102e-05 [elim_not_effective]: 1.288e-05 [opt_reshape]: 6.80002e-06 [fold_const_symbol]: 9.71e-06 [renormalize]: 1.69995e-07 [detach_backward]: 2.27999e-06 [pipeline_parallel_scheduler]: 2.12001e-06 [auto_monad_reorder]: 1.634e-05 [get_jit_bprop_graph]: 1.88002e-06 [rewriter_after_jit_bprop_graph]: 5.54e-06 [opt_after_jit_grad]: 0.0004992 [validate]: 4.237e-05 Sums bootstrap : 0.000450s : 4.55% type_inference : 0.004950s : 50.01% event_method : 0.000012s : 0.13% auto_monad : 0.000057s : 0.58% graph_reusing : 0.000006s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000003s : 0.03% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000030s : 0.30% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000003s : 0.03% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000019s : 0.20% optimize.rewriter_before_opt_a : 0.000050s : 0.51% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000035s : 0.36% optimize.opt_a.loop_unroll : 0.000019s : 0.20% optimize.opt_a.a_1 : 0.000476s : 4.81% optimize.opt_a.with_stream_mark : 0.000033s : 0.33% optimize.opt_a.recompute_prepare : 0.000017s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.04% optimize.opt_a.a_2 : 0.000177s : 1.79% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.14% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000016s : 0.16% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000027s : 0.27% optimize.opt_a.flash_sp : 0.000013s : 0.13% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.19% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.02% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.16% optimize.opt_a.virtual_dataset : 0.000013s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000011s : 0.11% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000021s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000021s : 0.22% optimize.opt_a.a_after_grad : 0.000020s : 0.20% optimize.opt_a.renormalize : 0.000713s : 7.20% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.29% optimize.opt_a.cse : 0.000049s : 0.49% optimize.opt_a.a_3 : 0.000085s : 0.86% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.03% optimize.rewriter_after_opt_a : 0.000038s : 0.39% optimize.convert_after_rewriter : 0.000008s : 0.08% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000623s : 6.29% optimize.opt_b.b_1 : 0.000121s : 1.22% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000025s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.19% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000031s : 0.32% optimize.loop_unroll : 0.000455s : 4.60% optimize.opt_after_cconv.c_1 : 0.000028s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000020s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.16% optimize.tuple_transform.d_1 : 0.000043s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000051s : 0.52% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000006s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000002s : 0.02% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.03% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000016s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.06% opt_after_jit_grad : 0.000499s : 5.04% validate : 0.000042s : 0.43% Time group info: ------[substitution.] 0.000200 23 40.95% : 0.000082s : 4: substitution.arithmetic_simplify 0.97% : 0.000002s : 2: substitution.elim_not_effective 0.68% : 0.000001s : 2: substitution.fold_const_symbol 3.13% : 0.000006s : 3: substitution.graph_param_transform 47.43% : 0.000095s : 2: substitution.inline 2.22% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.72% : 0.000005s : 4: substitution.remove_not_recompute_node 1.88% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004897 2 90.75% : 0.004444s : 1: type_inference.infer 9.25% : 0.000453s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000093 2 100.00% : 0.000093s : 2: match.inline ------[predicate.] 0.000142 754 1.07% : 0.000002s : 7: predicate.accumulaten_eliminater 1.35% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.69% : 0.000001s : 6: predicate.addn_check_dump 0.79% : 0.000001s : 7: predicate.addn_zero_filter 0.68% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.99% : 0.000004s : 13: predicate.arithmetic_simplify 0.79% : 0.000001s : 7: predicate.cast_eliminate 1.13% : 0.000002s : 6: predicate.check_bprop_eliminate 0.64% : 0.000001s : 6: predicate.compare_switch_simplify 0.23% : 0.000000s : 3: predicate.const_output_eliminate 0.78% : 0.000001s : 6: predicate.depend_value_elim 0.84% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.96% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.73% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.30% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 3: predicate.elim_not_effective 0.53% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 0.98% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.94% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.93% : 0.000001s : 10: predicate.environ_get_depend_swap 1.64% : 0.000002s : 16: predicate.environ_get_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.87% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.82% : 0.000003s : 9: predicate.float_depend_g_call 0.63% : 0.000001s : 6: predicate.float_environ_get_switch 0.91% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 3: predicate.fold_const_symbol 0.76% : 0.000001s : 6: predicate.get_grad_eliminate 0.42% : 0.000001s : 3: predicate.graph_param_transform 0.73% : 0.000001s : 6: predicate.incorporate_call 0.65% : 0.000001s : 6: predicate.incorporate_call_switch 6.37% : 0.000009s : 34: predicate.inline 1.15% : 0.000002s : 6: predicate.inline_without_move 0.39% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.14% : 0.000002s : 6: predicate.less_batch_normalization 1.52% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.95% : 0.000003s : 20: predicate.load_eliminater 1.48% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.69% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.69% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.65% : 0.000001s : 6: predicate.merge_addn 0.59% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.71% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.64% : 0.000001s : 7: predicate.minmaximum_grad 2.10% : 0.000003s : 3: predicate.mutable_eliminate 0.41% : 0.000001s : 3: predicate.opt_reshape 0.66% : 0.000001s : 3: predicate.parallel_virtual_node 1.19% : 0.000002s : 9: predicate.partial_defer_inline 1.15% : 0.000002s : 10: predicate.partial_eliminate 0.74% : 0.000001s : 7: predicate.print_const_string_wrapper 0.76% : 0.000001s : 6: predicate.reduce_all_const_elim 1.23% : 0.000002s : 7: predicate.reduce_eliminate 2.03% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.86% : 0.000001s : 6: predicate.remove_not_recompute_node 1.31% : 0.000002s : 13: predicate.replace_applicator 0.62% : 0.000001s : 6: predicate.replace_old_param 0.36% : 0.000001s : 3: predicate.reset_defer_inline 0.77% : 0.000001s : 7: predicate.reshape_eliminate 0.74% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 3: predicate.row_tensor_eliminate 1.14% : 0.000002s : 6: predicate.same_eliminate 0.67% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.13% : 0.000002s : 6: predicate.shard_identity_eliminate 0.89% : 0.000001s : 6: predicate.special_op_eliminate 0.86% : 0.000001s : 6: predicate.specialize_transform 1.14% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 1.05% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.96% : 0.000001s : 9: predicate.switch_defer_inline 1.65% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.36% : 0.000006s : 32: predicate.switch_simplify 0.80% : 0.000001s : 7: predicate.tile_eliminate 0.79% : 0.000001s : 7: predicate.transpose_eliminate 1.78% : 0.000003s : 13: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.51% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.39% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.36% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.78% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.70% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.57% : 0.000001s : 3: predicate.value_based_eliminate 0.90% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.67% : 0.000001s : 6: predicate.virtual_output_eliminate 0.30% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000251 5 7.99% : 0.000020s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.01% : 0.000231s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024177 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.19% : 0.003431s : 1: add_attr 14.15% : 0.003420s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000055s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000062s : 1: auto_monad 0.08% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.98% : 0.000480s : 1: bootstrap 0.15% : 0.000035s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000017s : 1: control_data_broadcast_order 0.05% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000026s : 1: cse_after_recomputation 0.02% : 0.000006s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.08% : 0.000020s : 1: event_method 0.02% : 0.000006s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.04% : 0.000009s : 1: label_micro_interleaved_index 1.92% : 0.000464s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.62% : 0.000634s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000020s : 1: opt.transform.mutable_eliminate 3.64% : 0.000880s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.11% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000096s : 28: opt.transform.opt_b 0.20% : 0.000048s : 2: opt.transform.opt_trans_graph 0.15% : 0.000036s : 4: opt.transform.symbol_engine_opt 10.39% : 0.002513s : 1: opt_a 0.44% : 0.000107s : 1: opt_after_cconv 2.11% : 0.000510s : 1: opt_after_jit_grad 0.91% : 0.000219s : 1: opt_b 19.36% : 0.004681s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000006s : 1: partial_unused_args_eliminate 0.02% : 0.000006s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000034s : 1: pre_auto_parallel 0.10% : 0.000023s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000019s : 1: remove_dup_value 1.76% : 0.000424s : 1: renormalize.infer 1.15% : 0.000279s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000044s : 1: rewriter_after_opt_a 0.23% : 0.000055s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000080s : 1: symbol_engine_optimizer 0.32% : 0.000078s : 1: tuple_transform 20.56% : 0.004972s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:47.371.185 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:47.371.454 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0151226, [21] [bootstrap]: 0.00050256 [type_inference]: 0.00496942 [event_method]: 1.273e-05 [auto_monad]: 5.494e-05 [graph_reusing]: 5.86e-06 [inline]: 2.57001e-06 [add_attr]: 0.00332348, [1] [add_attr_with_inline]: 0.00331412, [1] [Cycle 1]: 7.931e-05, [2] [tag_attr]: 1.628e-05 [meta_addattr_fg_expand]: 4.03999e-06 [parallel-infer-symbol]: 3.56999e-06 [pre_auto_parallel]: 2.732e-05 [insert-virtual-dataset]: 2.64001e-06 [parallel-infer-symbol-second]: 8.30012e-07 [dataset_repeat_opt]: 1.89999e-06 [pipeline_split]: 1.59998e-06 [optimize]: 0.00506739, [53] [py_interpret_to_execute]: 2.308e-05 [rewriter_before_opt_a]: 5.043e-05 [opt_a]: 0.00269468, [2] [Cycle 1]: 0.00182189, [45] [expand_dump_flag]: 2.76e-06 [switch_simplify]: 2.583e-05 [loop_unroll]: 1.411e-05 [a_1]: 0.00033466 [with_stream_mark]: 1.775e-05 [recompute_prepare]: 9.26002e-06 [updatestate_depend_eliminate]: 3.83001e-06 [updatestate_assign_eliminate]: 3.31999e-06 [updatestate_loads_eliminate]: 3.57997e-06 [parameter_eliminate]: 1.86e-06 [a_2]: 0.00012395 [accelerated_algorithm]: 7.51999e-06 [shard]: 2.02999e-06 [meta_shard_fg_expand]: 1.87999e-06 [shard_inline]: 6.04999e-06 [merge_send_recv]: 8.94998e-06 [auto_parallel]: 7.37997e-06 [parallel]: 1.778e-05 [flash_sp]: 8.84e-06 [merge_comm]: 4.27e-06 [allreduce_fusion]: 3.76001e-06 [matmul_add_comm_reduction]: 9.57999e-06 [allreduce_slice_to_reducescatter]: 5.69999e-07 [virtual_shard_identity]: 7.76001e-06 [virtual_dataset]: 6.60002e-06 [get_grad_eliminate_]: 5.64998e-06 [virtual_output]: 6.25002e-06 [merge_forward]: 4.52e-06 [cell_reuse_recompute_pass]: 1.32999e-06 [offload_activation]: 1.085e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.566e-05 [merge_recompute_call_nodes]: 1.94e-06 [before_grad]: 1.082e-05 [set_forward_comm_id_for_comm_node_pass]: 3.97e-06 [meta_fg_expand]: 2.76e-06 [flash_sp_send_recv_attached]: 2.88e-06 [receive_attached]: 2.35002e-06 [after_resolve]: 1.044e-05 [a_after_grad]: 9.54e-06 [renormalize]: 0.00058955 [add_forward_monad_depend]: 5.16002e-06 [auto_monad_grad]: 2.47001e-06 [auto_monad_eliminator]: 1.628e-05 [cse]: 2.849e-05 [a_3]: 6.183e-05 [Cycle 2]: 0.00085816, [45] [expand_dump_flag]: 1.20001e-06 [switch_simplify]: 7.61001e-06 [loop_unroll]: 6.44999e-06 [a_1]: 0.0001326 [with_stream_mark]: 1.196e-05 [recompute_prepare]: 7.03e-06 [updatestate_depend_eliminate]: 3.3e-06 [updatestate_assign_eliminate]: 2.69001e-06 [updatestate_loads_eliminate]: 2.82002e-06 [parameter_eliminate]: 1.92001e-06 [a_2]: 0.00011202 [accelerated_algorithm]: 6.31e-06 [shard]: 1.14e-06 [meta_shard_fg_expand]: 1.42999e-06 [shard_inline]: 6.63998e-06 [merge_send_recv]: 6.24999e-06 [auto_parallel]: 5.49e-06 [parallel]: 6.13998e-06 [flash_sp]: 3.44001e-06 [merge_comm]: 3.45e-06 [allreduce_fusion]: 3.08e-06 [matmul_add_comm_reduction]: 7.71001e-06 [allreduce_slice_to_reducescatter]: 4.50003e-07 [virtual_shard_identity]: 7.63001e-06 [virtual_dataset]: 5.84999e-06 [get_grad_eliminate_]: 5.63002e-06 [virtual_output]: 5.65001e-06 [merge_forward]: 3.31999e-06 [cell_reuse_recompute_pass]: 1.57001e-06 [offload_activation]: 7.4e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.402e-05 [merge_recompute_call_nodes]: 1.12e-06 [before_grad]: 9.56998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.83001e-06 [meta_fg_expand]: 2.31e-06 [flash_sp_send_recv_attached]: 9.49978e-07 [receive_attached]: 1.24998e-06 [after_resolve]: 1.006e-05 [a_after_grad]: 8.18001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.39001e-06 [auto_monad_grad]: 1.84e-06 [auto_monad_eliminator]: 9.45001e-06 [cse]: 1.8e-05 [a_3]: 4.951e-05 [py_interpret_to_execute_after_opt_a]: 1.372e-05 [slice_cell_reuse_recomputed_activation]: 5.32999e-06 [rewriter_after_opt_a]: 4.32e-05 [convert_after_rewriter]: 9.65002e-06 [order_py_execute_after_rewriter]: 8.50999e-06 [mutable_eliminate]: 0.00055713 [opt_b]: 0.00028315, [1] [Cycle 1]: 0.0002739, [7] [b_1]: 0.00017087 [b_2]: 7.63001e-06 [updatestate_depend_eliminate]: 6.79999e-06 [updatestate_assign_eliminate]: 2.71e-06 [updatestate_loads_eliminate]: 2.43e-06 [renormalize]: 5.99975e-07 [cse]: 2.334e-05 [optimize_parallel_all_gather_comm]: 2.462e-05 [overlap_param_gather]: 5.21002e-06 [cconv]: 3.374e-05 [loop_unroll]: 0.0004695 [opt_after_cconv]: 0.00013058, [1] [Cycle 1]: 0.00012174, [7] [c_1]: 2.877e-05 [parameter_eliminate]: 3.51001e-06 [updatestate_depend_eliminate]: 5.74e-06 [updatestate_assign_eliminate]: 2.58998e-06 [updatestate_loads_eliminate]: 2.87002e-06 [cse]: 2.151e-05 [renormalize]: 5.40022e-07 [remove_dup_value]: 1.817e-05 [tuple_transform]: 8.699e-05, [1] [Cycle 1]: 8.01e-05, [4] [d_1]: 4.105e-05 [none_parameter_eliminate]: 1.86998e-06 [renormalize]: 1.99972e-07 [switch_simplify]: 6.58003e-06 [partial_unused_args_eliminate]: 4.53999e-06 [add_recomputation]: 5.117e-05 [cse_after_recomputation]: 3.029e-05, [1] [Cycle 1]: 2.266e-05, [1] [cse]: 1.291e-05 [environ_conv]: 8.48999e-06 [swap_dp_allreduce_reducescatter]: 7.43999e-06 [bias_add_comm_swap]: 4.92e-06 [label_micro_interleaved_index]: 7.93999e-06 [label_fine_grained_interleaved_index]: 5.20001e-06 [merge_cast_opt]: 4.61002e-06 [slice_recompute_activation]: 4.87998e-06 [micro_interleaved_order_control]: 4.84e-06 [assign_add_opt]: 3.66001e-06 [ForceFp32Comm]: 3.13e-06 [remove_cast_before_assign_add]: 3.75998e-06 [full_micro_interleaved_order_control]: 5.12e-06 [reorder_send_recv_between_fp_bp]: 5.45001e-06 [comm_op_add_attrs]: 3.56999e-06 [add_comm_op_reuse_tag]: 3.63e-06 [interleave_split_concat_branches]: 3.82002e-06 [interleave_parallel_branches]: 3.43e-06 [overlap_opt_shard_in_pipeline]: 3.7e-06 [overlap_opt_shard_grad_in_pipeline]: 4.32998e-06 [control_data_broadcast_order]: 1.599e-05 [grouped_pairwise_exchange_alltoall]: 3.78999e-06 [offloading_packed_experts]: 6.81001e-06 [overlap_recompute_and_grad_model_parallel]: 6.98998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.61999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.73999e-06 [overlap_recompute_comm]: 4.79e-06 [overlap_grad_ring_attention]: 6.51e-06 [overlap_grad_flash_sp]: 2.352e-05 [begin_end_overlap_inline]: 3.21001e-06 [split_matmul_comm_elemetwise]: 4.71002e-06 [split_layernorm_comm]: 4.43001e-06 [handle_group_info]: 3.35e-06 [symbol_engine_optimizer]: 0.00010007, [1] [Cycle 1]: 9.311e-05, [6] [build]: 2.81e-06 [elim_shapecalc]: 1.004e-05 [elim_not_effective]: 1.424e-05 [opt_reshape]: 6.85002e-06 [fold_const_symbol]: 9.92999e-06 [renormalize]: 4.39992e-07 [detach_backward]: 4.3e-06 [pipeline_parallel_scheduler]: 1.79e-06 [auto_monad_reorder]: 1.941e-05 [get_jit_bprop_graph]: 1.99e-06 [rewriter_after_jit_bprop_graph]: 4.37e-06 [opt_after_jit_grad]: 0.00051166 [validate]: 3.777e-05 Sums bootstrap : 0.000503s : 5.02% type_inference : 0.004969s : 49.63% event_method : 0.000013s : 0.13% auto_monad : 0.000055s : 0.55% graph_reusing : 0.000006s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000027s : 0.27% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000023s : 0.23% optimize.rewriter_before_opt_a : 0.000050s : 0.50% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.33% optimize.opt_a.loop_unroll : 0.000021s : 0.21% optimize.opt_a.a_1 : 0.000467s : 4.67% optimize.opt_a.with_stream_mark : 0.000030s : 0.30% optimize.opt_a.recompute_prepare : 0.000016s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000236s : 2.36% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000015s : 0.15% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.24% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.15% optimize.opt_a.virtual_dataset : 0.000012s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000020s : 0.20% optimize.opt_a.a_after_grad : 0.000018s : 0.18% optimize.opt_a.renormalize : 0.000590s : 5.89% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.26% optimize.opt_a.cse : 0.000046s : 0.46% optimize.opt_a.a_3 : 0.000111s : 1.11% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000043s : 0.43% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000557s : 5.56% optimize.opt_b.b_1 : 0.000171s : 1.71% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.25% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000034s : 0.34% optimize.loop_unroll : 0.000469s : 4.69% optimize.opt_after_cconv.c_1 : 0.000029s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.21% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.18% optimize.tuple_transform.d_1 : 0.000041s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000051s : 0.51% optimize.cse_after_recomputation.cse : 0.000013s : 0.13% optimize.environ_conv : 0.000008s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.08% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000005s : 0.05% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000004s : 0.04% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000016s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000024s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000019s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000512s : 5.11% validate : 0.000038s : 0.38% Time group info: ------[substitution.] 0.000192 23 39.02% : 0.000075s : 4: substitution.arithmetic_simplify 0.94% : 0.000002s : 2: substitution.elim_not_effective 0.69% : 0.000001s : 2: substitution.fold_const_symbol 3.02% : 0.000006s : 3: substitution.graph_param_transform 49.91% : 0.000096s : 2: substitution.inline 2.11% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.33% : 0.000004s : 4: substitution.remove_not_recompute_node 1.99% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004918 2 90.99% : 0.004475s : 1: type_inference.infer 9.01% : 0.000443s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000094 2 100.00% : 0.000094s : 2: match.inline ------[predicate.] 0.000137 754 0.82% : 0.000001s : 7: predicate.accumulaten_eliminater 1.11% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.68% : 0.000001s : 6: predicate.addn_check_dump 0.79% : 0.000001s : 7: predicate.addn_zero_filter 0.68% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.79% : 0.000004s : 13: predicate.arithmetic_simplify 0.78% : 0.000001s : 7: predicate.cast_eliminate 0.78% : 0.000001s : 6: predicate.check_bprop_eliminate 0.67% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.71% : 0.000001s : 6: predicate.depend_value_elim 0.72% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.88% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.73% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.11% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 3: predicate.elim_not_effective 0.54% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 10: predicate.environ_add_const_eliminate 1.00% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.30% : 0.000002s : 10: predicate.environ_get_depend_swap 1.81% : 0.000002s : 16: predicate.environ_get_eliminate 1.00% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.87% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.91% : 0.000003s : 9: predicate.float_depend_g_call 0.65% : 0.000001s : 6: predicate.float_environ_get_switch 0.95% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.25% : 0.000000s : 3: predicate.fold_const_symbol 0.79% : 0.000001s : 6: predicate.get_grad_eliminate 0.27% : 0.000000s : 3: predicate.graph_param_transform 0.77% : 0.000001s : 6: predicate.incorporate_call 0.67% : 0.000001s : 6: predicate.incorporate_call_switch 6.42% : 0.000009s : 34: predicate.inline 1.07% : 0.000001s : 6: predicate.inline_without_move 0.39% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.01% : 0.000001s : 6: predicate.less_batch_normalization 1.54% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.11% : 0.000003s : 20: predicate.load_eliminater 1.19% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.87% : 0.000003s : 14: predicate.loop_unroll_before_grad 1.98% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.69% : 0.000001s : 6: predicate.merge_addn 0.67% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.74% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.66% : 0.000001s : 7: predicate.minmaximum_grad 1.32% : 0.000002s : 3: predicate.mutable_eliminate 0.44% : 0.000001s : 3: predicate.opt_reshape 0.73% : 0.000001s : 3: predicate.parallel_virtual_node 1.33% : 0.000002s : 9: predicate.partial_defer_inline 1.23% : 0.000002s : 10: predicate.partial_eliminate 0.82% : 0.000001s : 7: predicate.print_const_string_wrapper 0.86% : 0.000001s : 6: predicate.reduce_all_const_elim 0.91% : 0.000001s : 7: predicate.reduce_eliminate 2.12% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.75% : 0.000001s : 6: predicate.remove_not_recompute_node 1.49% : 0.000002s : 13: predicate.replace_applicator 0.78% : 0.000001s : 6: predicate.replace_old_param 0.47% : 0.000001s : 3: predicate.reset_defer_inline 0.88% : 0.000001s : 7: predicate.reshape_eliminate 0.84% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 3: predicate.row_tensor_eliminate 1.09% : 0.000001s : 6: predicate.same_eliminate 0.47% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.08% : 0.000001s : 6: predicate.shard_identity_eliminate 0.96% : 0.000001s : 6: predicate.special_op_eliminate 1.01% : 0.000001s : 6: predicate.specialize_transform 1.07% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.97% : 0.000001s : 9: predicate.switch_defer_inline 1.84% : 0.000003s : 15: predicate.switch_layer_defer_inline 4.28% : 0.000006s : 32: predicate.switch_simplify 0.78% : 0.000001s : 7: predicate.tile_eliminate 0.81% : 0.000001s : 7: predicate.transpose_eliminate 1.49% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 2.96% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.36% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.64% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.47% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.02% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.96% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 3: predicate.value_based_eliminate 0.82% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.82% : 0.000001s : 6: predicate.virtual_output_eliminate 0.40% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.59% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000232 5 8.82% : 0.000020s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.18% : 0.000211s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025076 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.29% : 0.003333s : 1: add_attr 13.23% : 0.003318s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000055s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.25% : 0.000064s : 1: auto_monad 0.11% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 2.20% : 0.000551s : 1: bootstrap 0.15% : 0.000037s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000033s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.09% : 0.000022s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.09% : 0.000022s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.90% : 0.000476s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.25% : 0.000564s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 3.47% : 0.000871s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.10% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.39% : 0.000098s : 28: opt.transform.opt_b 0.18% : 0.000045s : 2: opt.transform.opt_trans_graph 0.15% : 0.000037s : 4: opt.transform.symbol_engine_opt 10.76% : 0.002698s : 1: opt_a 0.53% : 0.000134s : 1: opt_after_cconv 2.08% : 0.000523s : 1: opt_after_jit_grad 1.14% : 0.000287s : 1: opt_b 21.48% : 0.005386s : 1: optimize 0.11% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000027s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000035s : 1: pre_auto_parallel 0.10% : 0.000026s : 1: py_interpret_to_execute 0.07% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000022s : 1: remove_dup_value 1.30% : 0.000325s : 1: renormalize.infer 1.02% : 0.000256s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000047s : 1: rewriter_after_opt_a 0.22% : 0.000054s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.41% : 0.000103s : 1: symbol_engine_optimizer 0.36% : 0.000090s : 1: tuple_transform 19.94% : 0.005001s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:47.570.332 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0129125, [21] [bootstrap]: 0.00044557 [type_inference]: 0.00463482 [event_method]: 1.104e-05 [auto_monad]: 5.259e-05 [graph_reusing]: 5.86998e-06 [inline]: 2.34001e-06 [add_attr]: 0.0030831, [1] [add_attr_with_inline]: 0.00307452, [1] [Cycle 1]: 4.911e-05, [2] [tag_attr]: 1.397e-05 [meta_addattr_fg_expand]: 3.71001e-06 [parallel-infer-symbol]: 3.50998e-06 [pre_auto_parallel]: 2.409e-05 [insert-virtual-dataset]: 2.39001e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 2.02999e-06 [pipeline_split]: 1.49e-06 [optimize]: 0.00399475, [53] [py_interpret_to_execute]: 1.607e-05 [rewriter_before_opt_a]: 4.405e-05 [opt_a]: 0.00211614, [2] [Cycle 1]: 0.00144372, [45] [expand_dump_flag]: 2.88e-06 [switch_simplify]: 2.506e-05 [loop_unroll]: 1.385e-05 [a_1]: 0.00032259 [with_stream_mark]: 1.532e-05 [recompute_prepare]: 8.69e-06 [updatestate_depend_eliminate]: 3.51999e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 3.91999e-06 [parameter_eliminate]: 1.89e-06 [a_2]: 9.114e-05 [accelerated_algorithm]: 6.87002e-06 [shard]: 1.89999e-06 [meta_shard_fg_expand]: 1.81e-06 [shard_inline]: 5.96e-06 [merge_send_recv]: 8.52e-06 [auto_parallel]: 6.04001e-06 [parallel]: 1.833e-05 [flash_sp]: 7.71999e-06 [merge_comm]: 4.10998e-06 [allreduce_fusion]: 3.78001e-06 [matmul_add_comm_reduction]: 9.46e-06 [allreduce_slice_to_reducescatter]: 7.30011e-07 [virtual_shard_identity]: 7.46999e-06 [virtual_dataset]: 7.26999e-06 [get_grad_eliminate_]: 6.04001e-06 [virtual_output]: 6.28998e-06 [merge_forward]: 3.78999e-06 [cell_reuse_recompute_pass]: 1.30001e-06 [offload_activation]: 9.38002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.256e-05 [merge_recompute_call_nodes]: 1.69998e-06 [before_grad]: 1.027e-05 [set_forward_comm_id_for_comm_node_pass]: 3.8e-06 [meta_fg_expand]: 3.02002e-06 [flash_sp_send_recv_attached]: 2.73e-06 [receive_attached]: 2.09e-06 [after_resolve]: 1.024e-05 [a_after_grad]: 9.12001e-06 [renormalize]: 0.00044638 [add_forward_monad_depend]: 4.85999e-06 [auto_monad_grad]: 2.24001e-06 [auto_monad_eliminator]: 1.369e-05 [cse]: 2.959e-05 [a_3]: 4.512e-05 [Cycle 2]: 0.0006629, [45] [expand_dump_flag]: 1.03001e-06 [switch_simplify]: 6.93e-06 [loop_unroll]: 5.73002e-06 [a_1]: 0.00012772 [with_stream_mark]: 9.92999e-06 [recompute_prepare]: 6.66e-06 [updatestate_depend_eliminate]: 2.96999e-06 [updatestate_assign_eliminate]: 2.43e-06 [updatestate_loads_eliminate]: 2.91999e-06 [parameter_eliminate]: 1.00999e-06 [a_2]: 9.915e-05 [accelerated_algorithm]: 6.49999e-06 [shard]: 1.15999e-06 [meta_shard_fg_expand]: 1.66e-06 [shard_inline]: 6.24999e-06 [merge_send_recv]: 5.14998e-06 [auto_parallel]: 5.44e-06 [parallel]: 4.81002e-06 [flash_sp]: 3.69002e-06 [merge_comm]: 3.11001e-06 [allreduce_fusion]: 3.09001e-06 [matmul_add_comm_reduction]: 6.12999e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 6.46e-06 [virtual_dataset]: 5.69e-06 [get_grad_eliminate_]: 5.52001e-06 [virtual_output]: 5.32001e-06 [merge_forward]: 2.71e-06 [cell_reuse_recompute_pass]: 1.32999e-06 [offload_activation]: 6.61999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.06e-05 [merge_recompute_call_nodes]: 7.89994e-07 [before_grad]: 8.69e-06 [set_forward_comm_id_for_comm_node_pass]: 3.54002e-06 [meta_fg_expand]: 2.11e-06 [flash_sp_send_recv_attached]: 8.50006e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 9.01998e-06 [a_after_grad]: 8.12e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.47999e-06 [auto_monad_grad]: 9.10019e-07 [auto_monad_eliminator]: 7.1e-06 [cse]: 1.549e-05 [a_3]: 3.51e-05 [py_interpret_to_execute_after_opt_a]: 8.07e-06 [slice_cell_reuse_recomputed_activation]: 2.12999e-06 [rewriter_after_opt_a]: 3.394e-05 [convert_after_rewriter]: 6.39001e-06 [order_py_execute_after_rewriter]: 5.25999e-06 [mutable_eliminate]: 0.00047341 [opt_b]: 0.00019729, [1] [Cycle 1]: 0.00019102, [7] [b_1]: 0.00011735 [b_2]: 7.3e-06 [updatestate_depend_eliminate]: 5.28002e-06 [updatestate_assign_eliminate]: 2.44001e-06 [updatestate_loads_eliminate]: 2.42001e-06 [renormalize]: 4.00003e-07 [cse]: 1.738e-05 [optimize_parallel_all_gather_comm]: 1.601e-05 [overlap_param_gather]: 2.41e-06 [cconv]: 2.369e-05 [loop_unroll]: 0.00041374 [opt_after_cconv]: 9.689e-05, [1] [Cycle 1]: 9.133e-05, [7] [c_1]: 2.735e-05 [parameter_eliminate]: 2.55002e-06 [updatestate_depend_eliminate]: 5.10001e-06 [updatestate_assign_eliminate]: 2.34999e-06 [updatestate_loads_eliminate]: 2.91e-06 [cse]: 1.722e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.449e-05 [tuple_transform]: 7.059e-05, [1] [Cycle 1]: 6.624e-05, [4] [d_1]: 3.906e-05 [none_parameter_eliminate]: 1.66e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 7.00002e-06 [partial_unused_args_eliminate]: 1.72999e-06 [add_recomputation]: 4.711e-05 [cse_after_recomputation]: 2.176e-05, [1] [Cycle 1]: 1.753e-05, [1] [cse]: 1.196e-05 [environ_conv]: 5.17e-06 [swap_dp_allreduce_reducescatter]: 5.24998e-06 [bias_add_comm_swap]: 2.26e-06 [label_micro_interleaved_index]: 3.91999e-06 [label_fine_grained_interleaved_index]: 2.58e-06 [merge_cast_opt]: 1.21002e-06 [slice_recompute_activation]: 2.18002e-06 [micro_interleaved_order_control]: 2.05002e-06 [assign_add_opt]: 1.57001e-06 [ForceFp32Comm]: 1.16002e-06 [remove_cast_before_assign_add]: 1.02e-06 [full_micro_interleaved_order_control]: 1.96e-06 [reorder_send_recv_between_fp_bp]: 3.04999e-06 [comm_op_add_attrs]: 1.05999e-06 [add_comm_op_reuse_tag]: 9.79984e-07 [interleave_split_concat_branches]: 1.20001e-06 [interleave_parallel_branches]: 1.34e-06 [overlap_opt_shard_in_pipeline]: 1.15999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.76998e-06 [control_data_broadcast_order]: 1.226e-05 [grouped_pairwise_exchange_alltoall]: 1.97001e-06 [offloading_packed_experts]: 3.71999e-06 [overlap_recompute_and_grad_model_parallel]: 4.72e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.69001e-06 [overlap_grad_ring_attention]: 4e-06 [overlap_grad_flash_sp]: 1.732e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.28002e-06 [split_layernorm_comm]: 1.79e-06 [handle_group_info]: 9.80013e-07 [symbol_engine_optimizer]: 7.388e-05, [1] [Cycle 1]: 6.964e-05, [6] [build]: 2.47001e-06 [elim_shapecalc]: 9.46e-06 [elim_not_effective]: 1.187e-05 [opt_reshape]: 6.81001e-06 [fold_const_symbol]: 1.005e-05 [renormalize]: 2.10013e-07 [detach_backward]: 1.86e-06 [pipeline_parallel_scheduler]: 1.60999e-06 [auto_monad_reorder]: 1.488e-05 [get_jit_bprop_graph]: 1.39998e-06 [rewriter_after_jit_bprop_graph]: 4.13001e-06 [opt_after_jit_grad]: 0.00044985 [validate]: 3.488e-05 Sums bootstrap : 0.000446s : 5.01% type_inference : 0.004635s : 52.15% event_method : 0.000011s : 0.12% auto_monad : 0.000053s : 0.59% graph_reusing : 0.000006s : 0.07% inline : 0.000002s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000024s : 0.27% insert-virtual-dataset : 0.000002s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000001s : 0.02% optimize.py_interpret_to_execute : 0.000016s : 0.18% optimize.rewriter_before_opt_a : 0.000044s : 0.50% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000032s : 0.36% optimize.opt_a.loop_unroll : 0.000020s : 0.22% optimize.opt_a.a_1 : 0.000450s : 5.07% optimize.opt_a.with_stream_mark : 0.000025s : 0.28% optimize.opt_a.recompute_prepare : 0.000015s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.08% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000190s : 2.14% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.04% optimize.opt_a.shard_inline : 0.000012s : 0.14% optimize.opt_a.merge_send_recv : 0.000014s : 0.15% optimize.opt_a.auto_parallel : 0.000011s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.26% optimize.opt_a.flash_sp : 0.000011s : 0.13% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.16% optimize.opt_a.virtual_dataset : 0.000013s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.13% optimize.opt_a.virtual_output : 0.000012s : 0.13% optimize.opt_a.merge_forward : 0.000006s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.03% optimize.opt_a.before_grad : 0.000019s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000019s : 0.22% optimize.opt_a.a_after_grad : 0.000017s : 0.19% optimize.opt_a.renormalize : 0.000446s : 5.02% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.23% optimize.opt_a.cse : 0.000045s : 0.51% optimize.opt_a.a_3 : 0.000080s : 0.90% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000034s : 0.38% optimize.convert_after_rewriter : 0.000006s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.06% optimize.mutable_eliminate : 0.000473s : 5.33% optimize.opt_b.b_1 : 0.000117s : 1.32% optimize.opt_b.b_2 : 0.000007s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000017s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.03% optimize.cconv : 0.000024s : 0.27% optimize.loop_unroll : 0.000414s : 4.66% optimize.opt_after_cconv.c_1 : 0.000027s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000017s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.16% optimize.tuple_transform.d_1 : 0.000039s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000047s : 0.53% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000005s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.06% optimize.bias_add_comm_swap : 0.000002s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.02% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.02% optimize.overlap_recompute_comm : 0.000003s : 0.03% optimize.overlap_grad_ring_attention : 0.000004s : 0.05% optimize.overlap_grad_flash_sp : 0.000017s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.03% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000002s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000015s : 0.17% get_jit_bprop_graph : 0.000001s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.05% opt_after_jit_grad : 0.000450s : 5.06% validate : 0.000035s : 0.39% Time group info: ------[substitution.] 0.000174 23 39.16% : 0.000068s : 4: substitution.arithmetic_simplify 1.01% : 0.000002s : 2: substitution.elim_not_effective 0.77% : 0.000001s : 2: substitution.fold_const_symbol 3.24% : 0.000006s : 3: substitution.graph_param_transform 49.40% : 0.000086s : 2: substitution.inline 1.83% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.83% : 0.000005s : 4: substitution.remove_not_recompute_node 1.77% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004591 2 91.07% : 0.004181s : 1: type_inference.infer 8.93% : 0.000410s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000084 2 100.00% : 0.000084s : 2: match.inline ------[predicate.] 0.000131 754 0.84% : 0.000001s : 7: predicate.accumulaten_eliminater 1.00% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.68% : 0.000001s : 6: predicate.addn_check_dump 0.85% : 0.000001s : 7: predicate.addn_zero_filter 0.72% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.69% : 0.000004s : 13: predicate.arithmetic_simplify 0.81% : 0.000001s : 7: predicate.cast_eliminate 0.87% : 0.000001s : 6: predicate.check_bprop_eliminate 0.71% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.74% : 0.000001s : 6: predicate.depend_value_elim 0.78% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 1.03% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.74% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.27% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.27% : 0.000000s : 3: predicate.elim_not_effective 0.46% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.03% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.15% : 0.000002s : 10: predicate.environ_get_depend_swap 1.74% : 0.000002s : 16: predicate.environ_get_eliminate 1.03% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.96% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.12% : 0.000003s : 9: predicate.float_depend_g_call 0.65% : 0.000001s : 6: predicate.float_environ_get_switch 1.03% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.24% : 0.000000s : 3: predicate.fold_const_symbol 0.82% : 0.000001s : 6: predicate.get_grad_eliminate 0.40% : 0.000001s : 3: predicate.graph_param_transform 0.85% : 0.000001s : 6: predicate.incorporate_call 0.69% : 0.000001s : 6: predicate.incorporate_call_switch 6.58% : 0.000009s : 34: predicate.inline 1.04% : 0.000001s : 6: predicate.inline_without_move 0.42% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.99% : 0.000001s : 6: predicate.less_batch_normalization 1.67% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.01% : 0.000003s : 20: predicate.load_eliminater 1.20% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.83% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.77% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.82% : 0.000001s : 6: predicate.merge_addn 0.67% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.70% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.68% : 0.000001s : 7: predicate.minmaximum_grad 1.46% : 0.000002s : 3: predicate.mutable_eliminate 0.42% : 0.000001s : 3: predicate.opt_reshape 0.56% : 0.000001s : 3: predicate.parallel_virtual_node 1.32% : 0.000002s : 9: predicate.partial_defer_inline 1.22% : 0.000002s : 10: predicate.partial_eliminate 0.81% : 0.000001s : 7: predicate.print_const_string_wrapper 0.74% : 0.000001s : 6: predicate.reduce_all_const_elim 0.92% : 0.000001s : 7: predicate.reduce_eliminate 2.07% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.62% : 0.000001s : 6: predicate.remove_not_recompute_node 1.20% : 0.000002s : 13: predicate.replace_applicator 0.67% : 0.000001s : 6: predicate.replace_old_param 0.38% : 0.000001s : 3: predicate.reset_defer_inline 0.81% : 0.000001s : 7: predicate.reshape_eliminate 0.74% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 3: predicate.row_tensor_eliminate 0.92% : 0.000001s : 6: predicate.same_eliminate 0.60% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.98% : 0.000001s : 6: predicate.shard_identity_eliminate 0.84% : 0.000001s : 6: predicate.special_op_eliminate 1.03% : 0.000001s : 6: predicate.specialize_transform 1.24% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.09% : 0.000001s : 9: predicate.switch_defer_inline 1.70% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.31% : 0.000006s : 32: predicate.switch_simplify 0.79% : 0.000001s : 7: predicate.tile_eliminate 0.80% : 0.000001s : 7: predicate.transpose_eliminate 1.74% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.64% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 2.98% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.43% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.53% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.49% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.00% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.88% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 3: predicate.value_based_eliminate 0.88% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.78% : 0.000001s : 6: predicate.virtual_output_eliminate 0.37% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000194 5 8.47% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.53% : 0.000177s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.021402 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.43% : 0.003087s : 1: add_attr 14.38% : 0.003078s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000051s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000058s : 1: auto_monad 0.09% : 0.000019s : 1: auto_monad_reorder 0.02% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.21% : 0.000473s : 1: bootstrap 0.13% : 0.000027s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000009s : 1: convert_after_rewriter 0.12% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.08% : 0.000017s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.03% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.97% : 0.000421s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.25% : 0.000482s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000014s : 1: opt.transform.mutable_eliminate 3.90% : 0.000835s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.44% : 0.000094s : 28: opt.transform.opt_b 0.21% : 0.000044s : 2: opt.transform.opt_trans_graph 0.16% : 0.000035s : 4: opt.transform.symbol_engine_opt 9.90% : 0.002119s : 1: opt_a 0.47% : 0.000101s : 1: opt_after_cconv 2.14% : 0.000458s : 1: opt_after_jit_grad 0.94% : 0.000201s : 1: opt_b 18.68% : 0.003999s : 1: optimize 0.09% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000021s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000006s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000028s : 1: pre_auto_parallel 0.09% : 0.000020s : 1: py_interpret_to_execute 0.05% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000018s : 1: remove_dup_value 1.13% : 0.000242s : 1: renormalize.infer 0.92% : 0.000197s : 1: renormalize.specialize 0.03% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000038s : 1: rewriter_after_opt_a 0.22% : 0.000048s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000077s : 1: symbol_engine_optimizer 0.34% : 0.000073s : 1: tuple_transform 21.72% : 0.004649s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:47.763.220 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:47.763.485 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0139418, [21] [bootstrap]: 0.00043906 [type_inference]: 0.00465022 [event_method]: 1.101e-05 [auto_monad]: 5.612e-05 [graph_reusing]: 5.72001e-06 [inline]: 2.49001e-06 [add_attr]: 0.00305084, [1] [add_attr_with_inline]: 0.00304326, [1] [Cycle 1]: 6.153e-05, [2] [tag_attr]: 1.329e-05 [meta_addattr_fg_expand]: 3.67998e-06 [parallel-infer-symbol]: 3.13998e-06 [pre_auto_parallel]: 2.399e-05 [insert-virtual-dataset]: 2.73998e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.74e-06 [pipeline_split]: 1.72001e-06 [optimize]: 0.00462407, [53] [py_interpret_to_execute]: 1.961e-05 [rewriter_before_opt_a]: 4.533e-05 [opt_a]: 0.00246364, [2] [Cycle 1]: 0.00161255, [45] [expand_dump_flag]: 3.32002e-06 [switch_simplify]: 2.366e-05 [loop_unroll]: 1.38e-05 [a_1]: 0.00031161 [with_stream_mark]: 1.54e-05 [recompute_prepare]: 8.77999e-06 [updatestate_depend_eliminate]: 3.42997e-06 [updatestate_assign_eliminate]: 3.98999e-06 [updatestate_loads_eliminate]: 2.84999e-06 [parameter_eliminate]: 2.52001e-06 [a_2]: 0.00012339 [accelerated_algorithm]: 6.86999e-06 [shard]: 2.16e-06 [meta_shard_fg_expand]: 1.79e-06 [shard_inline]: 6.53998e-06 [merge_send_recv]: 8.40001e-06 [auto_parallel]: 6.43e-06 [parallel]: 1.869e-05 [flash_sp]: 8.15e-06 [merge_comm]: 3.73001e-06 [allreduce_fusion]: 3.49001e-06 [matmul_add_comm_reduction]: 9.31e-06 [allreduce_slice_to_reducescatter]: 8.80013e-07 [virtual_shard_identity]: 8.02e-06 [virtual_dataset]: 6.54001e-06 [get_grad_eliminate_]: 6.10002e-06 [virtual_output]: 6.43998e-06 [merge_forward]: 3.61001e-06 [cell_reuse_recompute_pass]: 1.25001e-06 [offload_activation]: 9.19e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.363e-05 [merge_recompute_call_nodes]: 1.79998e-06 [before_grad]: 1.085e-05 [set_forward_comm_id_for_comm_node_pass]: 3.96001e-06 [meta_fg_expand]: 2.48e-06 [flash_sp_send_recv_attached]: 2.46e-06 [receive_attached]: 2.04e-06 [after_resolve]: 9.99999e-06 [a_after_grad]: 9.04e-06 [renormalize]: 0.00044271 [add_forward_monad_depend]: 4.48999e-06 [auto_monad_grad]: 2.16e-06 [auto_monad_eliminator]: 1.423e-05 [cse]: 2.693e-05 [a_3]: 5.835e-05 [Cycle 2]: 0.00083809, [45] [expand_dump_flag]: 9.29984e-07 [switch_simplify]: 7.28e-06 [loop_unroll]: 6.45002e-06 [a_1]: 0.00012765 [with_stream_mark]: 9.92001e-06 [recompute_prepare]: 7.01001e-06 [updatestate_depend_eliminate]: 3.10002e-06 [updatestate_assign_eliminate]: 2.44999e-06 [updatestate_loads_eliminate]: 2.66e-06 [parameter_eliminate]: 9.90025e-07 [a_2]: 0.00012664 [accelerated_algorithm]: 6.66e-06 [shard]: 9.80013e-07 [meta_shard_fg_expand]: 1.31002e-06 [shard_inline]: 6.34001e-06 [merge_send_recv]: 5.02999e-06 [auto_parallel]: 5.60001e-06 [parallel]: 4.97e-06 [flash_sp]: 3.99002e-06 [merge_comm]: 3.44001e-06 [allreduce_fusion]: 2.88998e-06 [matmul_add_comm_reduction]: 5.87001e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 6.68e-06 [virtual_dataset]: 6.06e-06 [get_grad_eliminate_]: 5.77999e-06 [virtual_output]: 5.37999e-06 [merge_forward]: 2.86e-06 [cell_reuse_recompute_pass]: 1.38002e-06 [offload_activation]: 6.36e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.301e-05 [merge_recompute_call_nodes]: 8.80013e-07 [before_grad]: 8.59e-06 [set_forward_comm_id_for_comm_node_pass]: 3.79002e-06 [meta_fg_expand]: 2.22001e-06 [flash_sp_send_recv_attached]: 8.29983e-07 [receive_attached]: 9.50007e-07 [after_resolve]: 8.29998e-06 [a_after_grad]: 8.35001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.36002e-06 [auto_monad_grad]: 9.80013e-07 [auto_monad_eliminator]: 7.44002e-06 [cse]: 1.525e-05 [a_3]: 4.913e-05 [py_interpret_to_execute_after_opt_a]: 1.155e-05 [slice_cell_reuse_recomputed_activation]: 4.49002e-06 [rewriter_after_opt_a]: 3.942e-05 [convert_after_rewriter]: 9.47001e-06 [order_py_execute_after_rewriter]: 8.17e-06 [mutable_eliminate]: 0.00049116 [opt_b]: 0.00026487, [1] [Cycle 1]: 0.00025497, [7] [b_1]: 0.00016074 [b_2]: 8.30999e-06 [updatestate_depend_eliminate]: 5.61e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.29999e-06 [renormalize]: 4.19997e-07 [cse]: 1.742e-05 [optimize_parallel_all_gather_comm]: 1.964e-05 [overlap_param_gather]: 4.85999e-06 [cconv]: 2.718e-05 [loop_unroll]: 0.00043341 [opt_after_cconv]: 0.00012212, [1] [Cycle 1]: 0.00011373, [7] [c_1]: 2.846e-05 [parameter_eliminate]: 2.61999e-06 [updatestate_depend_eliminate]: 4.99e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.73e-06 [cse]: 1.746e-05 [renormalize]: 5.69999e-07 [remove_dup_value]: 1.749e-05 [tuple_transform]: 8.698e-05, [1] [Cycle 1]: 7.979e-05, [4] [d_1]: 4.024e-05 [none_parameter_eliminate]: 1.82999e-06 [renormalize]: 2.29978e-07 [switch_simplify]: 7.05e-06 [partial_unused_args_eliminate]: 4.70001e-06 [add_recomputation]: 4.676e-05 [cse_after_recomputation]: 2.622e-05, [1] [Cycle 1]: 1.957e-05, [1] [cse]: 1.107e-05 [environ_conv]: 8.49002e-06 [swap_dp_allreduce_reducescatter]: 7.67998e-06 [bias_add_comm_swap]: 4.75999e-06 [label_micro_interleaved_index]: 6.46e-06 [label_fine_grained_interleaved_index]: 5.11002e-06 [merge_cast_opt]: 3.83001e-06 [slice_recompute_activation]: 4.28001e-06 [micro_interleaved_order_control]: 4.46002e-06 [assign_add_opt]: 3.56999e-06 [ForceFp32Comm]: 3.11001e-06 [remove_cast_before_assign_add]: 3.62002e-06 [full_micro_interleaved_order_control]: 4.50001e-06 [reorder_send_recv_between_fp_bp]: 5.44998e-06 [comm_op_add_attrs]: 3.75e-06 [add_comm_op_reuse_tag]: 3.29001e-06 [interleave_split_concat_branches]: 3.44001e-06 [interleave_parallel_branches]: 3.43999e-06 [overlap_opt_shard_in_pipeline]: 3.78999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.27e-06 [control_data_broadcast_order]: 1.571e-05 [grouped_pairwise_exchange_alltoall]: 3.75e-06 [offloading_packed_experts]: 6.86999e-06 [overlap_recompute_and_grad_model_parallel]: 7.31999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.6e-06 [overlap_recompute_allgather_and_fa_grad]: 3.9e-06 [overlap_recompute_comm]: 4.78001e-06 [overlap_grad_ring_attention]: 6.83998e-06 [overlap_grad_flash_sp]: 2.097e-05 [begin_end_overlap_inline]: 2.84001e-06 [split_matmul_comm_elemetwise]: 4.74998e-06 [split_layernorm_comm]: 4.45e-06 [handle_group_info]: 3.32002e-06 [symbol_engine_optimizer]: 9.336e-05, [1] [Cycle 1]: 8.68e-05, [6] [build]: 2.64001e-06 [elim_shapecalc]: 9.02999e-06 [elim_not_effective]: 1.217e-05 [opt_reshape]: 7.1e-06 [fold_const_symbol]: 1.013e-05 [renormalize]: 2.20025e-07 [detach_backward]: 3.53e-06 [pipeline_parallel_scheduler]: 1.97001e-06 [auto_monad_reorder]: 1.732e-05 [get_jit_bprop_graph]: 1.10999e-06 [rewriter_after_jit_bprop_graph]: 4.18999e-06 [opt_after_jit_grad]: 0.00047342 [validate]: 3.315e-05 Sums bootstrap : 0.000439s : 4.77% type_inference : 0.004650s : 50.54% event_method : 0.000011s : 0.12% auto_monad : 0.000056s : 0.61% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000013s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000024s : 0.26% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000020s : 0.21% optimize.rewriter_before_opt_a : 0.000045s : 0.49% optimize.opt_a.expand_dump_flag : 0.000004s : 0.05% optimize.opt_a.switch_simplify : 0.000031s : 0.34% optimize.opt_a.loop_unroll : 0.000020s : 0.22% optimize.opt_a.a_1 : 0.000439s : 4.77% optimize.opt_a.with_stream_mark : 0.000025s : 0.28% optimize.opt_a.recompute_prepare : 0.000016s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000250s : 2.72% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.14% optimize.opt_a.merge_send_recv : 0.000013s : 0.15% optimize.opt_a.auto_parallel : 0.000012s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.26% optimize.opt_a.flash_sp : 0.000012s : 0.13% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000006s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.16% optimize.opt_a.virtual_dataset : 0.000013s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.13% optimize.opt_a.virtual_output : 0.000012s : 0.13% optimize.opt_a.merge_forward : 0.000006s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000019s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000018s : 0.20% optimize.opt_a.a_after_grad : 0.000017s : 0.19% optimize.opt_a.renormalize : 0.000443s : 4.81% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.24% optimize.opt_a.cse : 0.000042s : 0.46% optimize.opt_a.a_3 : 0.000107s : 1.17% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.05% optimize.rewriter_after_opt_a : 0.000039s : 0.43% optimize.convert_after_rewriter : 0.000009s : 0.10% optimize.order_py_execute_after_rewriter : 0.000008s : 0.09% optimize.mutable_eliminate : 0.000491s : 5.34% optimize.opt_b.b_1 : 0.000161s : 1.75% optimize.opt_b.b_2 : 0.000008s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000017s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000027s : 0.30% optimize.loop_unroll : 0.000433s : 4.71% optimize.opt_after_cconv.c_1 : 0.000028s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000017s : 0.19% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000017s : 0.19% optimize.tuple_transform.d_1 : 0.000040s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000047s : 0.51% optimize.cse_after_recomputation.cse : 0.000011s : 0.12% optimize.environ_conv : 0.000008s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000006s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.06% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.05% optimize.micro_interleaved_order_control : 0.000004s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.06% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.04% optimize.interleave_split_concat_branches : 0.000003s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.05% optimize.control_data_broadcast_order : 0.000016s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000021s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.05% optimize.handle_group_info : 0.000003s : 0.04% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000017s : 0.19% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.05% opt_after_jit_grad : 0.000473s : 5.14% validate : 0.000033s : 0.36% Time group info: ------[substitution.] 0.000168 23 40.19% : 0.000068s : 4: substitution.arithmetic_simplify 1.07% : 0.000002s : 2: substitution.elim_not_effective 0.84% : 0.000001s : 2: substitution.fold_const_symbol 3.42% : 0.000006s : 3: substitution.graph_param_transform 48.21% : 0.000081s : 2: substitution.inline 1.93% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.51% : 0.000004s : 4: substitution.remove_not_recompute_node 1.83% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004605 2 91.08% : 0.004194s : 1: type_inference.infer 8.92% : 0.000411s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000079 2 100.00% : 0.000079s : 2: match.inline ------[predicate.] 0.000134 754 0.86% : 0.000001s : 7: predicate.accumulaten_eliminater 0.81% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.74% : 0.000001s : 6: predicate.addn_check_dump 0.86% : 0.000001s : 7: predicate.addn_zero_filter 0.70% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.66% : 0.000004s : 13: predicate.arithmetic_simplify 0.76% : 0.000001s : 7: predicate.cast_eliminate 0.78% : 0.000001s : 6: predicate.check_bprop_eliminate 0.74% : 0.000001s : 6: predicate.compare_switch_simplify 0.24% : 0.000000s : 3: predicate.const_output_eliminate 1.07% : 0.000001s : 6: predicate.depend_value_elim 0.76% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.87% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.79% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.23% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 3: predicate.elim_not_effective 0.52% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.01% : 0.000001s : 10: predicate.environ_get_depend_swap 1.91% : 0.000003s : 16: predicate.environ_get_eliminate 1.02% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.90% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.99% : 0.000003s : 9: predicate.float_depend_g_call 0.75% : 0.000001s : 6: predicate.float_environ_get_switch 0.99% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 3: predicate.fold_const_symbol 0.78% : 0.000001s : 6: predicate.get_grad_eliminate 0.37% : 0.000001s : 3: predicate.graph_param_transform 0.81% : 0.000001s : 6: predicate.incorporate_call 0.69% : 0.000001s : 6: predicate.incorporate_call_switch 6.45% : 0.000009s : 34: predicate.inline 0.96% : 0.000001s : 6: predicate.inline_without_move 0.41% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.14% : 0.000002s : 6: predicate.less_batch_normalization 1.60% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.04% : 0.000003s : 20: predicate.load_eliminater 1.20% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.37% : 0.000003s : 14: predicate.loop_unroll_before_grad 1.77% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.69% : 0.000001s : 6: predicate.merge_addn 0.74% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.73% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.66% : 0.000001s : 7: predicate.minmaximum_grad 1.30% : 0.000002s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.46% : 0.000001s : 3: predicate.parallel_virtual_node 1.23% : 0.000002s : 9: predicate.partial_defer_inline 1.26% : 0.000002s : 10: predicate.partial_eliminate 0.75% : 0.000001s : 7: predicate.print_const_string_wrapper 0.73% : 0.000001s : 6: predicate.reduce_all_const_elim 0.96% : 0.000001s : 7: predicate.reduce_eliminate 2.02% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.62% : 0.000001s : 6: predicate.remove_not_recompute_node 1.13% : 0.000002s : 13: predicate.replace_applicator 0.66% : 0.000001s : 6: predicate.replace_old_param 0.38% : 0.000001s : 3: predicate.reset_defer_inline 0.80% : 0.000001s : 7: predicate.reshape_eliminate 0.75% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 3: predicate.row_tensor_eliminate 1.12% : 0.000001s : 6: predicate.same_eliminate 0.57% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.99% : 0.000001s : 6: predicate.shard_identity_eliminate 1.10% : 0.000001s : 6: predicate.special_op_eliminate 1.02% : 0.000001s : 6: predicate.specialize_transform 1.07% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.88% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.01% : 0.000001s : 9: predicate.switch_defer_inline 1.80% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.41% : 0.000006s : 32: predicate.switch_simplify 0.72% : 0.000001s : 7: predicate.tile_eliminate 0.80% : 0.000001s : 7: predicate.transpose_eliminate 1.58% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.20% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.41% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.47% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.55% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.97% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.91% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.49% : 0.000001s : 3: predicate.value_based_eliminate 0.87% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.81% : 0.000001s : 6: predicate.virtual_output_eliminate 0.35% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.58% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000200 5 8.29% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.71% : 0.000184s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023038 192 0.03% : 0.000006s : 1: ForceFp32Comm 13.28% : 0.003060s : 1: add_attr 13.23% : 0.003047s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000050s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.28% : 0.000064s : 1: auto_monad 0.11% : 0.000024s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 2.09% : 0.000482s : 1: bootstrap 0.13% : 0.000030s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000029s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.08% : 0.000018s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.09% : 0.000021s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000009s : 1: label_micro_interleaved_index 1.91% : 0.000439s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.16% : 0.000497s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 3.67% : 0.000845s : 78: opt.transform.opt_a 0.12% : 0.000027s : 1: opt.transform.opt_after_cconv 0.10% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.42% : 0.000097s : 28: opt.transform.opt_b 0.19% : 0.000045s : 2: opt.transform.opt_trans_graph 0.15% : 0.000035s : 4: opt.transform.symbol_engine_opt 10.71% : 0.002467s : 1: opt_a 0.55% : 0.000126s : 1: opt_after_cconv 2.10% : 0.000484s : 1: opt_after_jit_grad 1.16% : 0.000268s : 1: opt_b 21.41% : 0.004932s : 1: optimize 0.10% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000024s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000008s : 1: pipeline_split 0.14% : 0.000031s : 1: pre_auto_parallel 0.10% : 0.000023s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000021s : 1: remove_dup_value 1.02% : 0.000236s : 1: renormalize.infer 0.87% : 0.000199s : 1: renormalize.specialize 0.04% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000043s : 1: rewriter_after_opt_a 0.21% : 0.000049s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.05% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000096s : 1: symbol_engine_optimizer 0.39% : 0.000090s : 1: tuple_transform 20.29% : 0.004674s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:47.953.517 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0129169, [21] [bootstrap]: 0.0004507 [type_inference]: 0.0046511 [event_method]: 1.059e-05 [auto_monad]: 5.247e-05 [graph_reusing]: 5.29e-06 [inline]: 2.55002e-06 [add_attr]: 0.00308814, [1] [add_attr_with_inline]: 0.00308025, [1] [Cycle 1]: 5.166e-05, [2] [tag_attr]: 1.421e-05 [meta_addattr_fg_expand]: 3.80998e-06 [parallel-infer-symbol]: 3.61999e-06 [pre_auto_parallel]: 2.3e-05 [insert-virtual-dataset]: 2.96999e-06 [parallel-infer-symbol-second]: 1.47999e-06 [dataset_repeat_opt]: 2.01e-06 [pipeline_split]: 1.65001e-06 [optimize]: 0.00395791, [53] [py_interpret_to_execute]: 1.768e-05 [rewriter_before_opt_a]: 4.411e-05 [opt_a]: 0.0020578, [2] [Cycle 1]: 0.00141218, [45] [expand_dump_flag]: 2.70002e-06 [switch_simplify]: 2.588e-05 [loop_unroll]: 1.39e-05 [a_1]: 0.00031805 [with_stream_mark]: 1.48e-05 [recompute_prepare]: 8.54e-06 [updatestate_depend_eliminate]: 4.13999e-06 [updatestate_assign_eliminate]: 3.24001e-06 [updatestate_loads_eliminate]: 3.17002e-06 [parameter_eliminate]: 1.93002e-06 [a_2]: 9.249e-05 [accelerated_algorithm]: 7.26001e-06 [shard]: 2.30002e-06 [meta_shard_fg_expand]: 1.89999e-06 [shard_inline]: 6.24001e-06 [merge_send_recv]: 7.88001e-06 [auto_parallel]: 6.16998e-06 [parallel]: 2.199e-05 [flash_sp]: 7.13e-06 [merge_comm]: 4.33001e-06 [allreduce_fusion]: 3.34001e-06 [matmul_add_comm_reduction]: 9.00999e-06 [allreduce_slice_to_reducescatter]: 5.50004e-07 [virtual_shard_identity]: 7.56999e-06 [virtual_dataset]: 6.29999e-06 [get_grad_eliminate_]: 5.67001e-06 [virtual_output]: 6.27001e-06 [merge_forward]: 4.01001e-06 [cell_reuse_recompute_pass]: 1.25001e-06 [offload_activation]: 1.003e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.167e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 1.13e-05 [set_forward_comm_id_for_comm_node_pass]: 3.50998e-06 [meta_fg_expand]: 2.86e-06 [flash_sp_send_recv_attached]: 2.43e-06 [receive_attached]: 2.15002e-06 [after_resolve]: 1.004e-05 [a_after_grad]: 9.09003e-06 [renormalize]: 0.00041816 [add_forward_monad_depend]: 4.52e-06 [auto_monad_grad]: 1.72001e-06 [auto_monad_eliminator]: 1.4e-05 [cse]: 2.961e-05 [a_3]: 4.439e-05 [Cycle 2]: 0.00063593, [45] [expand_dump_flag]: 8.89995e-07 [switch_simplify]: 7.2e-06 [loop_unroll]: 5.87001e-06 [a_1]: 0.00012616 [with_stream_mark]: 9.75002e-06 [recompute_prepare]: 6.54999e-06 [updatestate_depend_eliminate]: 2.90002e-06 [updatestate_assign_eliminate]: 2.26e-06 [updatestate_loads_eliminate]: 2.80002e-06 [parameter_eliminate]: 9.49978e-07 [a_2]: 8.053e-05 [accelerated_algorithm]: 6.20002e-06 [shard]: 1.00999e-06 [meta_shard_fg_expand]: 1.27999e-06 [shard_inline]: 5.84e-06 [merge_send_recv]: 4.65001e-06 [auto_parallel]: 5.61e-06 [parallel]: 4.90001e-06 [flash_sp]: 3.48e-06 [merge_comm]: 3.2e-06 [allreduce_fusion]: 2.97002e-06 [matmul_add_comm_reduction]: 5.64e-06 [allreduce_slice_to_reducescatter]: 3.09985e-07 [virtual_shard_identity]: 6.39001e-06 [virtual_dataset]: 5.62001e-06 [get_grad_eliminate_]: 5.46e-06 [virtual_output]: 5.23002e-06 [merge_forward]: 2.69999e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 6.17999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.074e-05 [merge_recompute_call_nodes]: 6.89994e-07 [before_grad]: 8.73001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.56999e-06 [meta_fg_expand]: 2.09e-06 [flash_sp_send_recv_attached]: 7.79983e-07 [receive_attached]: 9.5999e-07 [after_resolve]: 8.38999e-06 [a_after_grad]: 8.35001e-06 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.32e-06 [auto_monad_grad]: 8.89995e-07 [auto_monad_eliminator]: 7.1e-06 [cse]: 1.46e-05 [a_3]: 3.508e-05 [py_interpret_to_execute_after_opt_a]: 7.77e-06 [slice_cell_reuse_recomputed_activation]: 2.25002e-06 [rewriter_after_opt_a]: 3.353e-05 [convert_after_rewriter]: 6.58e-06 [order_py_execute_after_rewriter]: 5.25999e-06 [mutable_eliminate]: 0.00045706 [opt_b]: 0.00019407, [1] [Cycle 1]: 0.00018784, [7] [b_1]: 0.00011805 [b_2]: 6.95002e-06 [updatestate_depend_eliminate]: 4.96002e-06 [updatestate_assign_eliminate]: 2.27999e-06 [updatestate_loads_eliminate]: 2.49999e-06 [renormalize]: 3.39991e-07 [cse]: 1.85e-05 [optimize_parallel_all_gather_comm]: 1.51e-05 [overlap_param_gather]: 4.05998e-06 [cconv]: 2.233e-05 [loop_unroll]: 0.00041578 [opt_after_cconv]: 0.00013089, [1] [Cycle 1]: 9.274e-05, [7] [c_1]: 2.697e-05 [parameter_eliminate]: 2.37001e-06 [updatestate_depend_eliminate]: 5.15001e-06 [updatestate_assign_eliminate]: 2.45002e-06 [updatestate_loads_eliminate]: 2.32001e-06 [cse]: 1.788e-05 [renormalize]: 4.89992e-07 [remove_dup_value]: 1.468e-05 [tuple_transform]: 7.305e-05, [1] [Cycle 1]: 6.841e-05, [4] [d_1]: 3.95e-05 [none_parameter_eliminate]: 2.13998e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 6.94001e-06 [partial_unused_args_eliminate]: 1.76e-06 [add_recomputation]: 4.276e-05 [cse_after_recomputation]: 2.226e-05, [1] [Cycle 1]: 1.766e-05, [1] [cse]: 1.182e-05 [environ_conv]: 5.39998e-06 [swap_dp_allreduce_reducescatter]: 5.39e-06 [bias_add_comm_swap]: 2.21998e-06 [label_micro_interleaved_index]: 4.35999e-06 [label_fine_grained_interleaved_index]: 2.53e-06 [merge_cast_opt]: 1.34e-06 [slice_recompute_activation]: 1.94e-06 [micro_interleaved_order_control]: 2.06998e-06 [assign_add_opt]: 1.53002e-06 [ForceFp32Comm]: 7.60017e-07 [remove_cast_before_assign_add]: 1.27999e-06 [full_micro_interleaved_order_control]: 2.22999e-06 [reorder_send_recv_between_fp_bp]: 2.69001e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.10999e-06 [interleave_parallel_branches]: 1.40001e-06 [overlap_opt_shard_in_pipeline]: 3.51999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.14999e-06 [control_data_broadcast_order]: 1.219e-05 [grouped_pairwise_exchange_alltoall]: 1.50999e-06 [offloading_packed_experts]: 3.81001e-06 [overlap_recompute_and_grad_model_parallel]: 5.10999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.21002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.67001e-06 [overlap_recompute_comm]: 2.17999e-06 [overlap_grad_ring_attention]: 4.17e-06 [overlap_grad_flash_sp]: 1.918e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.10002e-06 [split_layernorm_comm]: 1.62001e-06 [handle_group_info]: 9.50007e-07 [symbol_engine_optimizer]: 7.369e-05, [1] [Cycle 1]: 6.956e-05, [6] [build]: 2.53e-06 [elim_shapecalc]: 9.76998e-06 [elim_not_effective]: 1.255e-05 [opt_reshape]: 6.71999e-06 [fold_const_symbol]: 9.81998e-06 [renormalize]: 2.3999e-07 [detach_backward]: 1.72999e-06 [pipeline_parallel_scheduler]: 1.84e-06 [auto_monad_reorder]: 1.612e-05 [get_jit_bprop_graph]: 1.09e-06 [rewriter_after_jit_bprop_graph]: 3.48e-06 [opt_after_jit_grad]: 0.00046524 [validate]: 3.341e-05 Sums bootstrap : 0.000451s : 5.09% type_inference : 0.004651s : 52.53% event_method : 0.000011s : 0.12% auto_monad : 0.000052s : 0.59% graph_reusing : 0.000005s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000023s : 0.26% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.02% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000018s : 0.20% optimize.rewriter_before_opt_a : 0.000044s : 0.50% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.37% optimize.opt_a.loop_unroll : 0.000020s : 0.22% optimize.opt_a.a_1 : 0.000444s : 5.02% optimize.opt_a.with_stream_mark : 0.000025s : 0.28% optimize.opt_a.recompute_prepare : 0.000015s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000173s : 1.95% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.15% optimize.opt_a.shard : 0.000003s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.04% optimize.opt_a.shard_inline : 0.000012s : 0.14% optimize.opt_a.merge_send_recv : 0.000013s : 0.14% optimize.opt_a.auto_parallel : 0.000012s : 0.13% optimize.opt_a.parallel : 0.000027s : 0.30% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.09% optimize.opt_a.allreduce_fusion : 0.000006s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.13% optimize.opt_a.virtual_output : 0.000012s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000022s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.04% optimize.opt_a.after_resolve : 0.000018s : 0.21% optimize.opt_a.a_after_grad : 0.000017s : 0.20% optimize.opt_a.renormalize : 0.000418s : 4.72% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.24% optimize.opt_a.cse : 0.000044s : 0.50% optimize.opt_a.a_3 : 0.000079s : 0.90% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.03% optimize.rewriter_after_opt_a : 0.000034s : 0.38% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.06% optimize.mutable_eliminate : 0.000457s : 5.16% optimize.opt_b.b_1 : 0.000118s : 1.33% optimize.opt_b.b_2 : 0.000007s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000015s : 0.17% optimize.overlap_param_gather : 0.000004s : 0.05% optimize.cconv : 0.000022s : 0.25% optimize.loop_unroll : 0.000416s : 4.70% optimize.opt_after_cconv.c_1 : 0.000027s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.01% optimize.remove_dup_value : 0.000015s : 0.17% optimize.tuple_transform.d_1 : 0.000040s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000043s : 0.48% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000005s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.06% optimize.bias_add_comm_swap : 0.000002s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.02% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.02% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.02% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.05% optimize.overlap_grad_flash_sp : 0.000019s : 0.22% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000016s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.04% opt_after_jit_grad : 0.000465s : 5.25% validate : 0.000033s : 0.38% Time group info: ------[substitution.] 0.000171 23 39.47% : 0.000067s : 4: substitution.arithmetic_simplify 1.05% : 0.000002s : 2: substitution.elim_not_effective 0.80% : 0.000001s : 2: substitution.fold_const_symbol 3.07% : 0.000005s : 3: substitution.graph_param_transform 49.08% : 0.000084s : 2: substitution.inline 2.19% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.53% : 0.000004s : 4: substitution.remove_not_recompute_node 1.81% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004607 2 91.07% : 0.004196s : 1: type_inference.infer 8.93% : 0.000412s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000082 2 100.00% : 0.000082s : 2: match.inline ------[predicate.] 0.000132 754 0.83% : 0.000001s : 7: predicate.accumulaten_eliminater 1.01% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.77% : 0.000001s : 6: predicate.addn_check_dump 0.81% : 0.000001s : 7: predicate.addn_zero_filter 0.75% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.83% : 0.000004s : 13: predicate.arithmetic_simplify 0.81% : 0.000001s : 7: predicate.cast_eliminate 0.86% : 0.000001s : 6: predicate.check_bprop_eliminate 0.73% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.76% : 0.000001s : 6: predicate.depend_value_elim 0.80% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 1.01% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.78% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.44% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.29% : 0.000000s : 3: predicate.elim_not_effective 0.51% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000002s : 10: predicate.environ_add_const_eliminate 1.00% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_depend_swap 1.84% : 0.000002s : 16: predicate.environ_get_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.95% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.93% : 0.000003s : 9: predicate.float_depend_g_call 0.69% : 0.000001s : 6: predicate.float_environ_get_switch 0.95% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 3: predicate.fold_const_symbol 0.78% : 0.000001s : 6: predicate.get_grad_eliminate 0.36% : 0.000000s : 3: predicate.graph_param_transform 0.80% : 0.000001s : 6: predicate.incorporate_call 0.67% : 0.000001s : 6: predicate.incorporate_call_switch 6.33% : 0.000008s : 34: predicate.inline 1.02% : 0.000001s : 6: predicate.inline_without_move 0.40% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.02% : 0.000001s : 6: predicate.less_batch_normalization 1.63% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.98% : 0.000003s : 20: predicate.load_eliminater 1.22% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.66% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.66% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.64% : 0.000001s : 6: predicate.merge_addn 0.72% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.68% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.73% : 0.000001s : 7: predicate.minmaximum_grad 1.26% : 0.000002s : 3: predicate.mutable_eliminate 0.44% : 0.000001s : 3: predicate.opt_reshape 0.47% : 0.000001s : 3: predicate.parallel_virtual_node 1.28% : 0.000002s : 9: predicate.partial_defer_inline 1.22% : 0.000002s : 10: predicate.partial_eliminate 0.76% : 0.000001s : 7: predicate.print_const_string_wrapper 0.77% : 0.000001s : 6: predicate.reduce_all_const_elim 1.16% : 0.000002s : 7: predicate.reduce_eliminate 2.16% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.65% : 0.000001s : 6: predicate.remove_not_recompute_node 1.22% : 0.000002s : 13: predicate.replace_applicator 0.65% : 0.000001s : 6: predicate.replace_old_param 0.39% : 0.000001s : 3: predicate.reset_defer_inline 0.85% : 0.000001s : 7: predicate.reshape_eliminate 0.75% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 3: predicate.row_tensor_eliminate 0.95% : 0.000001s : 6: predicate.same_eliminate 0.53% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.91% : 0.000001s : 6: predicate.shard_identity_eliminate 1.07% : 0.000001s : 6: predicate.special_op_eliminate 0.95% : 0.000001s : 6: predicate.specialize_transform 1.00% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.88% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.00% : 0.000001s : 9: predicate.switch_defer_inline 1.67% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.49% : 0.000006s : 32: predicate.switch_simplify 0.94% : 0.000001s : 7: predicate.tile_eliminate 0.82% : 0.000001s : 7: predicate.transpose_eliminate 1.53% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.74% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.03% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.44% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.84% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.53% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.01% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.76% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.51% : 0.000001s : 3: predicate.value_based_eliminate 0.85% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.80% : 0.000001s : 6: predicate.virtual_output_eliminate 0.39% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.70% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000192 5 8.09% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.91% : 0.000177s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.021338 192 0.02% : 0.000003s : 1: ForceFp32Comm 14.49% : 0.003092s : 1: add_attr 14.45% : 0.003084s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000047s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000057s : 1: auto_monad 0.09% : 0.000020s : 1: auto_monad_reorder 0.02% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.25% : 0.000480s : 1: bootstrap 0.12% : 0.000026s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000015s : 1: control_data_broadcast_order 0.05% : 0.000010s : 1: convert_after_rewriter 0.12% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.07% : 0.000016s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.03% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.98% : 0.000423s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.18% : 0.000465s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 3.87% : 0.000826s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.44% : 0.000094s : 28: opt.transform.opt_b 0.21% : 0.000044s : 2: opt.transform.opt_trans_graph 0.16% : 0.000035s : 4: opt.transform.symbol_engine_opt 9.66% : 0.002061s : 1: opt_a 0.64% : 0.000136s : 1: opt_after_cconv 2.22% : 0.000474s : 1: opt_after_jit_grad 0.93% : 0.000197s : 1: opt_b 18.57% : 0.003962s : 1: optimize 0.09% : 0.000018s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.11% : 0.000022s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000007s : 1: overlap_param_gather 0.02% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000027s : 1: pre_auto_parallel 0.10% : 0.000021s : 1: py_interpret_to_execute 0.05% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000018s : 1: remove_dup_value 1.05% : 0.000224s : 1: renormalize.infer 0.88% : 0.000188s : 1: renormalize.specialize 0.03% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000037s : 1: rewriter_after_opt_a 0.22% : 0.000048s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000077s : 1: symbol_engine_optimizer 0.36% : 0.000076s : 1: tuple_transform 21.86% : 0.004664s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:48.149.488 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:48.149.751 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0143806, [21] [bootstrap]: 0.00043032 [type_inference]: 0.00474424 [event_method]: 1.242e-05 [auto_monad]: 5.616e-05 [graph_reusing]: 5.99999e-06 [inline]: 2.77002e-06 [add_attr]: 0.00318307, [1] [add_attr_with_inline]: 0.00317516, [1] [Cycle 1]: 6.99e-05, [2] [tag_attr]: 1.384e-05 [meta_addattr_fg_expand]: 4e-06 [parallel-infer-symbol]: 3.08e-06 [pre_auto_parallel]: 2.527e-05 [insert-virtual-dataset]: 2.53e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 2.02001e-06 [pipeline_split]: 1.54e-06 [optimize]: 0.00478143, [53] [py_interpret_to_execute]: 2.16e-05 [rewriter_before_opt_a]: 4.989e-05 [opt_a]: 0.0025215, [2] [Cycle 1]: 0.00168785, [45] [expand_dump_flag]: 4.13001e-06 [switch_simplify]: 2.603e-05 [loop_unroll]: 1.495e-05 [a_1]: 0.00032591 [with_stream_mark]: 1.61e-05 [recompute_prepare]: 8.88002e-06 [updatestate_depend_eliminate]: 4.03001e-06 [updatestate_assign_eliminate]: 3.66999e-06 [updatestate_loads_eliminate]: 3.04999e-06 [parameter_eliminate]: 2.11e-06 [a_2]: 0.00012465 [accelerated_algorithm]: 7.25e-06 [shard]: 2.17999e-06 [meta_shard_fg_expand]: 1.82001e-06 [shard_inline]: 6.50002e-06 [merge_send_recv]: 8.71002e-06 [auto_parallel]: 6.86001e-06 [parallel]: 1.832e-05 [flash_sp]: 8.53001e-06 [merge_comm]: 4.17e-06 [allreduce_fusion]: 3.68e-06 [matmul_add_comm_reduction]: 9.25999e-06 [allreduce_slice_to_reducescatter]: 7.39994e-07 [virtual_shard_identity]: 8.07998e-06 [virtual_dataset]: 6.17999e-06 [get_grad_eliminate_]: 6.61e-06 [virtual_output]: 6.46999e-06 [merge_forward]: 4.18001e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 9.50001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.436e-05 [merge_recompute_call_nodes]: 1.59e-06 [before_grad]: 1.131e-05 [set_forward_comm_id_for_comm_node_pass]: 3.83001e-06 [meta_fg_expand]: 2.76999e-06 [flash_sp_send_recv_attached]: 2.56e-06 [receive_attached]: 2.02001e-06 [after_resolve]: 9.86e-06 [a_after_grad]: 9.47999e-06 [renormalize]: 0.00048426 [add_forward_monad_depend]: 5.02e-06 [auto_monad_grad]: 2.06e-06 [auto_monad_eliminator]: 1.448e-05 [cse]: 2.929e-05 [a_3]: 5.842e-05 [Cycle 2]: 0.00082042, [45] [expand_dump_flag]: 1.00999e-06 [switch_simplify]: 6.93998e-06 [loop_unroll]: 5.94999e-06 [a_1]: 0.00012788 [with_stream_mark]: 1.064e-05 [recompute_prepare]: 7.14001e-06 [updatestate_depend_eliminate]: 3.11001e-06 [updatestate_assign_eliminate]: 2.55002e-06 [updatestate_loads_eliminate]: 2.83998e-06 [parameter_eliminate]: 9.80013e-07 [a_2]: 0.0001101 [accelerated_algorithm]: 6.27001e-06 [shard]: 1.20001e-06 [meta_shard_fg_expand]: 1.43002e-06 [shard_inline]: 6.14001e-06 [merge_send_recv]: 4.98001e-06 [auto_parallel]: 5.64e-06 [parallel]: 5.12e-06 [flash_sp]: 3.30998e-06 [merge_comm]: 3.46999e-06 [allreduce_fusion]: 3.21999e-06 [matmul_add_comm_reduction]: 6.06998e-06 [allreduce_slice_to_reducescatter]: 2.89991e-07 [virtual_shard_identity]: 6.59001e-06 [virtual_dataset]: 5.91e-06 [get_grad_eliminate_]: 5.60001e-06 [virtual_output]: 5.59e-06 [merge_forward]: 2.69001e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 6.79999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.314e-05 [merge_recompute_call_nodes]: 1.00001e-06 [before_grad]: 8.98002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.93999e-06 [meta_fg_expand]: 2.18998e-06 [flash_sp_send_recv_attached]: 8.39995e-07 [receive_attached]: 1.12e-06 [after_resolve]: 9.23002e-06 [a_after_grad]: 8.54e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.49e-06 [auto_monad_grad]: 1.00999e-06 [auto_monad_eliminator]: 7.37997e-06 [cse]: 1.554e-05 [a_3]: 4.734e-05 [py_interpret_to_execute_after_opt_a]: 1.199e-05 [slice_cell_reuse_recomputed_activation]: 4.77e-06 [rewriter_after_opt_a]: 3.726e-05 [convert_after_rewriter]: 1.025e-05 [order_py_execute_after_rewriter]: 8.48001e-06 [mutable_eliminate]: 0.00050125 [opt_b]: 0.00026731, [1] [Cycle 1]: 0.00025798, [7] [b_1]: 0.0001617 [b_2]: 7.82002e-06 [updatestate_depend_eliminate]: 5.82999e-06 [updatestate_assign_eliminate]: 2.64001e-06 [updatestate_loads_eliminate]: 2.38998e-06 [renormalize]: 5.89993e-07 [cse]: 2.07e-05 [optimize_parallel_all_gather_comm]: 1.853e-05 [overlap_param_gather]: 4.73001e-06 [cconv]: 2.921e-05 [loop_unroll]: 0.0004421 [opt_after_cconv]: 0.00014662, [1] [Cycle 1]: 0.00013763, [7] [c_1]: 2.892e-05 [parameter_eliminate]: 2.63e-06 [updatestate_depend_eliminate]: 2.1e-05 [updatestate_assign_eliminate]: 2.54001e-06 [updatestate_loads_eliminate]: 2.88998e-06 [cse]: 1.946e-05 [renormalize]: 4.10015e-07 [remove_dup_value]: 1.869e-05 [tuple_transform]: 8.839e-05, [1] [Cycle 1]: 8.086e-05, [4] [d_1]: 4.033e-05 [none_parameter_eliminate]: 1.86e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 7.15e-06 [partial_unused_args_eliminate]: 4.48999e-06 [add_recomputation]: 5.031e-05 [cse_after_recomputation]: 2.737e-05, [1] [Cycle 1]: 2.061e-05, [1] [cse]: 1.161e-05 [environ_conv]: 8.23999e-06 [swap_dp_allreduce_reducescatter]: 7.73999e-06 [bias_add_comm_swap]: 5.29e-06 [label_micro_interleaved_index]: 6.67002e-06 [label_fine_grained_interleaved_index]: 5.92999e-06 [merge_cast_opt]: 3.98999e-06 [slice_recompute_activation]: 4.52998e-06 [micro_interleaved_order_control]: 4.77998e-06 [assign_add_opt]: 3.84002e-06 [ForceFp32Comm]: 3.38999e-06 [remove_cast_before_assign_add]: 3.48999e-06 [full_micro_interleaved_order_control]: 4.57e-06 [reorder_send_recv_between_fp_bp]: 5.38002e-06 [comm_op_add_attrs]: 3.98001e-06 [add_comm_op_reuse_tag]: 3.75e-06 [interleave_split_concat_branches]: 3.91001e-06 [interleave_parallel_branches]: 3.51001e-06 [overlap_opt_shard_in_pipeline]: 3.75998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.14997e-06 [control_data_broadcast_order]: 1.528e-05 [grouped_pairwise_exchange_alltoall]: 4.07998e-06 [offloading_packed_experts]: 6.56999e-06 [overlap_recompute_and_grad_model_parallel]: 7.92e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.07e-06 [overlap_recompute_allgather_and_fa_grad]: 3.78999e-06 [overlap_recompute_comm]: 4.85999e-06 [overlap_grad_ring_attention]: 6.73998e-06 [overlap_grad_flash_sp]: 2.095e-05 [begin_end_overlap_inline]: 2.86999e-06 [split_matmul_comm_elemetwise]: 4.53001e-06 [split_layernorm_comm]: 4.03001e-06 [handle_group_info]: 3.28e-06 [symbol_engine_optimizer]: 9.482e-05, [1] [Cycle 1]: 8.816e-05, [6] [build]: 2.58e-06 [elim_shapecalc]: 1.006e-05 [elim_not_effective]: 1.252e-05 [opt_reshape]: 7.18998e-06 [fold_const_symbol]: 9.82001e-06 [renormalize]: 2.19996e-07 [detach_backward]: 3.39001e-06 [pipeline_parallel_scheduler]: 2.09e-06 [auto_monad_reorder]: 1.844e-05 [get_jit_bprop_graph]: 1.39998e-06 [rewriter_after_jit_bprop_graph]: 4.1e-06 [opt_after_jit_grad]: 0.00047536 [validate]: 3.492e-05 Sums bootstrap : 0.000430s : 4.57% type_inference : 0.004744s : 50.42% event_method : 0.000012s : 0.13% auto_monad : 0.000056s : 0.60% graph_reusing : 0.000006s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000025s : 0.27% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000022s : 0.23% optimize.rewriter_before_opt_a : 0.000050s : 0.53% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000033s : 0.35% optimize.opt_a.loop_unroll : 0.000021s : 0.22% optimize.opt_a.a_1 : 0.000454s : 4.82% optimize.opt_a.with_stream_mark : 0.000027s : 0.28% optimize.opt_a.recompute_prepare : 0.000016s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000235s : 2.49% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.14% optimize.opt_a.shard : 0.000003s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.15% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.25% optimize.opt_a.flash_sp : 0.000012s : 0.13% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.13% optimize.opt_a.virtual_output : 0.000012s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000019s : 0.20% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000484s : 5.15% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.23% optimize.opt_a.cse : 0.000045s : 0.48% optimize.opt_a.a_3 : 0.000106s : 1.12% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000037s : 0.40% optimize.convert_after_rewriter : 0.000010s : 0.11% optimize.order_py_execute_after_rewriter : 0.000008s : 0.09% optimize.mutable_eliminate : 0.000501s : 5.33% optimize.opt_b.b_1 : 0.000162s : 1.72% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000021s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000029s : 0.31% optimize.loop_unroll : 0.000442s : 4.70% optimize.opt_after_cconv.c_1 : 0.000029s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000021s : 0.22% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000019s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.20% optimize.tuple_transform.d_1 : 0.000040s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.05% optimize.add_recomputation : 0.000050s : 0.53% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000008s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.06% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.06% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.04% optimize.remove_cast_before_assign_add : 0.000003s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.06% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000004s : 0.04% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000015s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000021s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.20% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000475s : 5.05% validate : 0.000035s : 0.37% Time group info: ------[substitution.] 0.000180 23 37.93% : 0.000068s : 4: substitution.arithmetic_simplify 1.16% : 0.000002s : 2: substitution.elim_not_effective 0.79% : 0.000001s : 2: substitution.fold_const_symbol 3.26% : 0.000006s : 3: substitution.graph_param_transform 50.28% : 0.000090s : 2: substitution.inline 2.16% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.47% : 0.000004s : 4: substitution.remove_not_recompute_node 1.96% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004679 2 90.19% : 0.004220s : 1: type_inference.infer 9.81% : 0.000459s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000089 2 100.00% : 0.000089s : 2: match.inline ------[predicate.] 0.000137 754 0.82% : 0.000001s : 7: predicate.accumulaten_eliminater 1.05% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.66% : 0.000001s : 6: predicate.addn_check_dump 0.76% : 0.000001s : 7: predicate.addn_zero_filter 0.68% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.60% : 0.000004s : 13: predicate.arithmetic_simplify 0.84% : 0.000001s : 7: predicate.cast_eliminate 0.75% : 0.000001s : 6: predicate.check_bprop_eliminate 0.73% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.85% : 0.000001s : 6: predicate.depend_value_elim 0.90% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.97% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.72% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.12% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.29% : 0.000000s : 3: predicate.elim_not_effective 0.59% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000002s : 10: predicate.environ_add_const_eliminate 1.05% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_depend_swap 1.75% : 0.000002s : 16: predicate.environ_get_eliminate 1.03% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.90% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.92% : 0.000003s : 9: predicate.float_depend_g_call 0.67% : 0.000001s : 6: predicate.float_environ_get_switch 0.97% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 3: predicate.fold_const_symbol 0.75% : 0.000001s : 6: predicate.get_grad_eliminate 0.27% : 0.000000s : 3: predicate.graph_param_transform 0.83% : 0.000001s : 6: predicate.incorporate_call 0.69% : 0.000001s : 6: predicate.incorporate_call_switch 6.33% : 0.000009s : 34: predicate.inline 1.14% : 0.000002s : 6: predicate.inline_without_move 0.39% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.20% : 0.000002s : 6: predicate.less_batch_normalization 1.55% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.19% : 0.000003s : 20: predicate.load_eliminater 1.38% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.79% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.66% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.70% : 0.000001s : 6: predicate.merge_addn 0.66% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.73% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.66% : 0.000001s : 7: predicate.minmaximum_grad 1.23% : 0.000002s : 3: predicate.mutable_eliminate 0.53% : 0.000001s : 3: predicate.opt_reshape 0.69% : 0.000001s : 3: predicate.parallel_virtual_node 1.27% : 0.000002s : 9: predicate.partial_defer_inline 1.28% : 0.000002s : 10: predicate.partial_eliminate 0.75% : 0.000001s : 7: predicate.print_const_string_wrapper 0.73% : 0.000001s : 6: predicate.reduce_all_const_elim 1.22% : 0.000002s : 7: predicate.reduce_eliminate 2.10% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.63% : 0.000001s : 6: predicate.remove_not_recompute_node 1.23% : 0.000002s : 13: predicate.replace_applicator 0.76% : 0.000001s : 6: predicate.replace_old_param 0.33% : 0.000000s : 3: predicate.reset_defer_inline 0.80% : 0.000001s : 7: predicate.reshape_eliminate 0.76% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 3: predicate.row_tensor_eliminate 1.06% : 0.000001s : 6: predicate.same_eliminate 0.51% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.97% : 0.000001s : 6: predicate.shard_identity_eliminate 1.13% : 0.000002s : 6: predicate.special_op_eliminate 0.94% : 0.000001s : 6: predicate.specialize_transform 1.04% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.97% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.47% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.99% : 0.000001s : 9: predicate.switch_defer_inline 1.90% : 0.000003s : 15: predicate.switch_layer_defer_inline 4.34% : 0.000006s : 32: predicate.switch_simplify 0.91% : 0.000001s : 7: predicate.tile_eliminate 0.83% : 0.000001s : 7: predicate.transpose_eliminate 1.59% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.15% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.43% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.45% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.65% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.04% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.81% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.48% : 0.000001s : 3: predicate.value_based_eliminate 0.80% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.79% : 0.000001s : 6: predicate.virtual_output_eliminate 0.34% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.54% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000222 5 8.17% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.83% : 0.000204s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023812 192 0.03% : 0.000006s : 1: ForceFp32Comm 13.40% : 0.003191s : 1: add_attr 13.35% : 0.003179s : 1: add_attr_with_inline 0.03% : 0.000007s : 1: add_comm_op_reuse_tag 0.23% : 0.000054s : 1: add_recomputation 0.03% : 0.000007s : 1: assign_add_opt 0.27% : 0.000064s : 1: auto_monad 0.11% : 0.000026s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.99% : 0.000474s : 1: bootstrap 0.14% : 0.000032s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.06% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000019s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.09% : 0.000022s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.04% : 0.000009s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000007s : 1: interleave_split_concat_branches 0.04% : 0.000009s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.88% : 0.000448s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.13% : 0.000508s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 3.57% : 0.000850s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.10% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.41% : 0.000097s : 28: opt.transform.opt_b 0.19% : 0.000045s : 2: opt.transform.opt_trans_graph 0.15% : 0.000036s : 4: opt.transform.symbol_engine_opt 10.60% : 0.002525s : 1: opt_a 0.63% : 0.000150s : 1: opt_after_cconv 2.04% : 0.000487s : 1: opt_after_jit_grad 1.14% : 0.000271s : 1: opt_b 21.57% : 0.005137s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000024s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.05% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000033s : 1: pre_auto_parallel 0.10% : 0.000025s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000022s : 1: remove_dup_value 1.09% : 0.000260s : 1: renormalize.infer 0.91% : 0.000217s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000041s : 1: rewriter_after_opt_a 0.35% : 0.000083s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.41% : 0.000098s : 1: symbol_engine_optimizer 0.38% : 0.000092s : 1: tuple_transform 20.04% : 0.004773s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:48.335.892 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0131803, [21] [bootstrap]: 0.00042422 [type_inference]: 0.00470778 [event_method]: 1.079e-05 [auto_monad]: 5.344e-05 [graph_reusing]: 5.12e-06 [inline]: 2.38002e-06 [add_attr]: 0.00311327, [1] [add_attr_with_inline]: 0.00310505, [1] [Cycle 1]: 4.654e-05, [2] [tag_attr]: 1.37e-05 [meta_addattr_fg_expand]: 3.42002e-06 [parallel-infer-symbol]: 2.74001e-06 [pre_auto_parallel]: 2.405e-05 [insert-virtual-dataset]: 2.56998e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 2.12999e-06 [pipeline_split]: 1.82001e-06 [optimize]: 0.00414825, [53] [py_interpret_to_execute]: 1.58e-05 [rewriter_before_opt_a]: 4.332e-05 [opt_a]: 0.00219807, [2] [Cycle 1]: 0.00148348, [45] [expand_dump_flag]: 2.98e-06 [switch_simplify]: 2.517e-05 [loop_unroll]: 1.433e-05 [a_1]: 0.00032679 [with_stream_mark]: 1.562e-05 [recompute_prepare]: 8.37e-06 [updatestate_depend_eliminate]: 3.82998e-06 [updatestate_assign_eliminate]: 3.34001e-06 [updatestate_loads_eliminate]: 3.51001e-06 [parameter_eliminate]: 1.81e-06 [a_2]: 9.252e-05 [accelerated_algorithm]: 6.89999e-06 [shard]: 1.97999e-06 [meta_shard_fg_expand]: 1.81e-06 [shard_inline]: 6.09001e-06 [merge_send_recv]: 9.09003e-06 [auto_parallel]: 5.98002e-06 [parallel]: 1.837e-05 [flash_sp]: 8.62e-06 [merge_comm]: 4.43001e-06 [allreduce_fusion]: 3.81001e-06 [matmul_add_comm_reduction]: 9.20001e-06 [allreduce_slice_to_reducescatter]: 5.8001e-07 [virtual_shard_identity]: 7.55998e-06 [virtual_dataset]: 6.36998e-06 [get_grad_eliminate_]: 5.82001e-06 [virtual_output]: 6.11e-06 [merge_forward]: 3.76001e-06 [cell_reuse_recompute_pass]: 1.25999e-06 [offload_activation]: 9.99001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.287e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.054e-05 [set_forward_comm_id_for_comm_node_pass]: 3.59002e-06 [meta_fg_expand]: 2.75002e-06 [flash_sp_send_recv_attached]: 2.44999e-06 [receive_attached]: 2.31998e-06 [after_resolve]: 9.76998e-06 [a_after_grad]: 9.76e-06 [renormalize]: 0.00046459 [add_forward_monad_depend]: 5.77999e-06 [auto_monad_grad]: 1.86e-06 [auto_monad_eliminator]: 1.39e-05 [cse]: 3.117e-05 [a_3]: 4.744e-05 [Cycle 2]: 0.00070488, [45] [expand_dump_flag]: 8.89995e-07 [switch_simplify]: 7.45998e-06 [loop_unroll]: 5.88002e-06 [a_1]: 0.00013022 [with_stream_mark]: 1.145e-05 [recompute_prepare]: 6.29001e-06 [updatestate_depend_eliminate]: 3.03998e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.93e-06 [parameter_eliminate]: 1.30001e-06 [a_2]: 0.00010246 [accelerated_algorithm]: 6.59001e-06 [shard]: 1.17e-06 [meta_shard_fg_expand]: 1.87999e-06 [shard_inline]: 7.35e-06 [merge_send_recv]: 6.91001e-06 [auto_parallel]: 7e-06 [parallel]: 6.15002e-06 [flash_sp]: 3.63e-06 [merge_comm]: 3.49001e-06 [allreduce_fusion]: 3.41999e-06 [matmul_add_comm_reduction]: 7.55e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 6.78998e-06 [virtual_dataset]: 5.57001e-06 [get_grad_eliminate_]: 5.46e-06 [virtual_output]: 5.37999e-06 [merge_forward]: 3.06001e-06 [cell_reuse_recompute_pass]: 1.89e-06 [offload_activation]: 7.6e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.19e-05 [merge_recompute_call_nodes]: 1.07998e-06 [before_grad]: 8.92e-06 [set_forward_comm_id_for_comm_node_pass]: 4.02002e-06 [meta_fg_expand]: 2.14e-06 [flash_sp_send_recv_attached]: 8.2e-07 [receive_attached]: 1.15999e-06 [after_resolve]: 9.24e-06 [a_after_grad]: 8.75999e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 2.73e-06 [auto_monad_grad]: 1.84e-06 [auto_monad_eliminator]: 9.62999e-06 [cse]: 1.878e-05 [a_3]: 3.609e-05 [py_interpret_to_execute_after_opt_a]: 9.05001e-06 [slice_cell_reuse_recomputed_activation]: 1.99e-06 [rewriter_after_opt_a]: 3.737e-05 [convert_after_rewriter]: 6.89999e-06 [order_py_execute_after_rewriter]: 5.56998e-06 [mutable_eliminate]: 0.00051996 [opt_b]: 0.00019979, [1] [Cycle 1]: 0.00019365, [7] [b_1]: 0.00011773 [b_2]: 7.38999e-06 [updatestate_depend_eliminate]: 6.23998e-06 [updatestate_assign_eliminate]: 2.61e-06 [updatestate_loads_eliminate]: 2.41e-06 [renormalize]: 5.39992e-07 [cse]: 1.995e-05 [optimize_parallel_all_gather_comm]: 1.668e-05 [overlap_param_gather]: 2.19001e-06 [cconv]: 2.608e-05 [loop_unroll]: 0.00041776 [opt_after_cconv]: 9.951e-05, [1] [Cycle 1]: 9.434e-05, [7] [c_1]: 2.759e-05 [parameter_eliminate]: 2.76e-06 [updatestate_depend_eliminate]: 5.00999e-06 [updatestate_assign_eliminate]: 2.47001e-06 [updatestate_loads_eliminate]: 2.83e-06 [cse]: 1.87e-05 [renormalize]: 3.59985e-07 [remove_dup_value]: 1.359e-05 [tuple_transform]: 7.265e-05, [1] [Cycle 1]: 6.8e-05, [4] [d_1]: 3.995e-05 [none_parameter_eliminate]: 1.84e-06 [renormalize]: 1.70025e-07 [switch_simplify]: 6.66e-06 [partial_unused_args_eliminate]: 1.76e-06 [add_recomputation]: 4.617e-05 [cse_after_recomputation]: 2.202e-05, [1] [Cycle 1]: 1.741e-05, [1] [cse]: 1.171e-05 [environ_conv]: 5.17999e-06 [swap_dp_allreduce_reducescatter]: 5.06997e-06 [bias_add_comm_swap]: 2.44999e-06 [label_micro_interleaved_index]: 4.10998e-06 [label_fine_grained_interleaved_index]: 2.65997e-06 [merge_cast_opt]: 1.25001e-06 [slice_recompute_activation]: 2.04999e-06 [micro_interleaved_order_control]: 1.99e-06 [assign_add_opt]: 1.49e-06 [ForceFp32Comm]: 1.12999e-06 [remove_cast_before_assign_add]: 9.80013e-07 [full_micro_interleaved_order_control]: 2.02001e-06 [reorder_send_recv_between_fp_bp]: 2.82002e-06 [comm_op_add_attrs]: 1.22e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.05999e-06 [overlap_opt_shard_in_pipeline]: 1.84e-06 [overlap_opt_shard_grad_in_pipeline]: 1.86003e-06 [control_data_broadcast_order]: 1.205e-05 [grouped_pairwise_exchange_alltoall]: 1.58002e-06 [offloading_packed_experts]: 4.28999e-06 [overlap_recompute_and_grad_model_parallel]: 4.77998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.37e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.24001e-06 [overlap_grad_ring_attention]: 4.25999e-06 [overlap_grad_flash_sp]: 1.947e-05 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 2.54999e-06 [split_layernorm_comm]: 1.96e-06 [handle_group_info]: 1.30999e-06 [symbol_engine_optimizer]: 7.517e-05, [1] [Cycle 1]: 7.082e-05, [6] [build]: 2.81999e-06 [elim_shapecalc]: 1.078e-05 [elim_not_effective]: 1.204e-05 [opt_reshape]: 6.71999e-06 [fold_const_symbol]: 9.62999e-06 [renormalize]: 2.80008e-07 [detach_backward]: 2.21e-06 [pipeline_parallel_scheduler]: 1.62001e-06 [auto_monad_reorder]: 1.699e-05 [get_jit_bprop_graph]: 1.72001e-06 [rewriter_after_jit_bprop_graph]: 4.03001e-06 [opt_after_jit_grad]: 0.00047351 [validate]: 3.564e-05 Sums bootstrap : 0.000424s : 4.67% type_inference : 0.004708s : 51.78% event_method : 0.000011s : 0.12% auto_monad : 0.000053s : 0.59% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000003s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000024s : 0.26% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000016s : 0.17% optimize.rewriter_before_opt_a : 0.000043s : 0.48% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.36% optimize.opt_a.loop_unroll : 0.000020s : 0.22% optimize.opt_a.a_1 : 0.000457s : 5.03% optimize.opt_a.with_stream_mark : 0.000027s : 0.30% optimize.opt_a.recompute_prepare : 0.000015s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000195s : 2.14% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.15% optimize.opt_a.merge_send_recv : 0.000016s : 0.18% optimize.opt_a.auto_parallel : 0.000013s : 0.14% optimize.opt_a.parallel : 0.000025s : 0.27% optimize.opt_a.flash_sp : 0.000012s : 0.13% optimize.opt_a.merge_comm : 0.000008s : 0.09% optimize.opt_a.allreduce_fusion : 0.000007s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000011s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000019s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.04% optimize.opt_a.after_resolve : 0.000019s : 0.21% optimize.opt_a.a_after_grad : 0.000019s : 0.20% optimize.opt_a.renormalize : 0.000465s : 5.11% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.09% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.26% optimize.opt_a.cse : 0.000050s : 0.55% optimize.opt_a.a_3 : 0.000084s : 0.92% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000037s : 0.41% optimize.convert_after_rewriter : 0.000007s : 0.08% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000520s : 5.72% optimize.opt_b.b_1 : 0.000118s : 1.29% optimize.opt_b.b_2 : 0.000007s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000020s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000026s : 0.29% optimize.loop_unroll : 0.000418s : 4.59% optimize.opt_after_cconv.c_1 : 0.000028s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000019s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.15% optimize.tuple_transform.d_1 : 0.000040s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000046s : 0.51% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000005s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.06% optimize.bias_add_comm_swap : 0.000002s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.02% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.05% optimize.overlap_grad_flash_sp : 0.000019s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.03% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000017s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000474s : 5.21% validate : 0.000036s : 0.39% Time group info: ------[substitution.] 0.000181 23 40.11% : 0.000072s : 4: substitution.arithmetic_simplify 1.05% : 0.000002s : 2: substitution.elim_not_effective 0.73% : 0.000001s : 2: substitution.fold_const_symbol 3.19% : 0.000006s : 3: substitution.graph_param_transform 48.64% : 0.000088s : 2: substitution.inline 1.98% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.63% : 0.000005s : 4: substitution.remove_not_recompute_node 1.68% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004662 2 90.78% : 0.004233s : 1: type_inference.infer 9.22% : 0.000430s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000086 2 100.00% : 0.000086s : 2: match.inline ------[predicate.] 0.000134 754 1.14% : 0.000002s : 7: predicate.accumulaten_eliminater 1.03% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.74% : 0.000001s : 6: predicate.addn_check_dump 0.95% : 0.000001s : 7: predicate.addn_zero_filter 0.74% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.79% : 0.000004s : 13: predicate.arithmetic_simplify 0.96% : 0.000001s : 7: predicate.cast_eliminate 0.69% : 0.000001s : 6: predicate.check_bprop_eliminate 0.66% : 0.000001s : 6: predicate.compare_switch_simplify 0.20% : 0.000000s : 3: predicate.const_output_eliminate 0.79% : 0.000001s : 6: predicate.depend_value_elim 0.80% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.97% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.77% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.27% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 3: predicate.elim_not_effective 0.64% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.04% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.06% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_depend_swap 1.65% : 0.000002s : 16: predicate.environ_get_eliminate 1.03% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.93% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.85% : 0.000002s : 9: predicate.float_depend_g_call 0.67% : 0.000001s : 6: predicate.float_environ_get_switch 1.01% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.25% : 0.000000s : 3: predicate.fold_const_symbol 0.78% : 0.000001s : 6: predicate.get_grad_eliminate 0.22% : 0.000000s : 3: predicate.graph_param_transform 0.82% : 0.000001s : 6: predicate.incorporate_call 0.66% : 0.000001s : 6: predicate.incorporate_call_switch 6.29% : 0.000008s : 34: predicate.inline 1.06% : 0.000001s : 6: predicate.inline_without_move 0.42% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.38% : 0.000002s : 6: predicate.less_batch_normalization 1.47% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.05% : 0.000003s : 20: predicate.load_eliminater 1.16% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.67% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.81% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.69% : 0.000001s : 6: predicate.merge_addn 0.68% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.70% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.70% : 0.000001s : 7: predicate.minmaximum_grad 1.02% : 0.000001s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.43% : 0.000001s : 3: predicate.parallel_virtual_node 1.32% : 0.000002s : 9: predicate.partial_defer_inline 1.17% : 0.000002s : 10: predicate.partial_eliminate 0.79% : 0.000001s : 7: predicate.print_const_string_wrapper 0.94% : 0.000001s : 6: predicate.reduce_all_const_elim 1.16% : 0.000002s : 7: predicate.reduce_eliminate 2.11% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.62% : 0.000001s : 6: predicate.remove_not_recompute_node 1.43% : 0.000002s : 13: predicate.replace_applicator 0.67% : 0.000001s : 6: predicate.replace_old_param 0.35% : 0.000000s : 3: predicate.reset_defer_inline 0.90% : 0.000001s : 7: predicate.reshape_eliminate 0.77% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.50% : 0.000001s : 3: predicate.row_tensor_eliminate 0.96% : 0.000001s : 6: predicate.same_eliminate 0.57% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.06% : 0.000001s : 6: predicate.shard_identity_eliminate 0.91% : 0.000001s : 6: predicate.special_op_eliminate 0.93% : 0.000001s : 6: predicate.specialize_transform 1.09% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 1.01% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.24% : 0.000002s : 9: predicate.switch_defer_inline 1.68% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.25% : 0.000006s : 32: predicate.switch_simplify 0.80% : 0.000001s : 7: predicate.tile_eliminate 0.79% : 0.000001s : 7: predicate.transpose_eliminate 1.63% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.48% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.45% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.56% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.99% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.86% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 3: predicate.value_based_eliminate 0.74% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.73% : 0.000001s : 6: predicate.virtual_output_eliminate 0.38% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.54% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000208 5 8.42% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.58% : 0.000191s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.021903 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.23% : 0.003118s : 1: add_attr 14.19% : 0.003109s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000050s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000059s : 1: auto_monad 0.10% : 0.000021s : 1: auto_monad_reorder 0.02% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.06% : 0.000452s : 1: bootstrap 0.13% : 0.000029s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000015s : 1: control_data_broadcast_order 0.05% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.03% : 0.000006s : 1: detach_backward 0.04% : 0.000008s : 1: environ_conv 0.08% : 0.000016s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.03% : 0.000006s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.94% : 0.000425s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.41% : 0.000529s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000014s : 1: opt.transform.mutable_eliminate 3.95% : 0.000866s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.43% : 0.000094s : 28: opt.transform.opt_b 0.20% : 0.000044s : 2: opt.transform.opt_trans_graph 0.16% : 0.000036s : 4: opt.transform.symbol_engine_opt 10.05% : 0.002201s : 1: opt_a 0.47% : 0.000104s : 1: opt_after_cconv 2.20% : 0.000482s : 1: opt_after_jit_grad 0.93% : 0.000203s : 1: opt_b 18.96% : 0.004153s : 1: optimize 0.09% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.13% : 0.000028s : 1: pre_auto_parallel 0.09% : 0.000019s : 1: py_interpret_to_execute 0.06% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000017s : 1: remove_dup_value 1.10% : 0.000241s : 1: renormalize.infer 0.99% : 0.000216s : 1: renormalize.specialize 0.03% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000043s : 1: rewriter_after_opt_a 0.22% : 0.000047s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.03% : 0.000006s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000078s : 1: symbol_engine_optimizer 0.35% : 0.000076s : 1: tuple_transform 21.57% : 0.004724s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:48.523.175 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:48.523.430 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0141986, [21] [bootstrap]: 0.00044694 [type_inference]: 0.00473596 [event_method]: 1.205e-05 [auto_monad]: 5.379e-05 [graph_reusing]: 5.35999e-06 [inline]: 2.19001e-06 [add_attr]: 0.00310851, [1] [add_attr_with_inline]: 0.00310079, [1] [Cycle 1]: 6.659e-05, [2] [tag_attr]: 1.383e-05 [meta_addattr_fg_expand]: 4.13999e-06 [parallel-infer-symbol]: 2.91e-06 [pre_auto_parallel]: 2.389e-05 [insert-virtual-dataset]: 2.54999e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 2.37999e-06 [pipeline_split]: 1.81e-06 [optimize]: 0.00465045, [53] [py_interpret_to_execute]: 2.082e-05 [rewriter_before_opt_a]: 4.63e-05 [opt_a]: 0.00244946, [2] [Cycle 1]: 0.00161663, [45] [expand_dump_flag]: 2.85998e-06 [switch_simplify]: 2.456e-05 [loop_unroll]: 1.372e-05 [a_1]: 0.00031779 [with_stream_mark]: 1.612e-05 [recompute_prepare]: 8.22e-06 [updatestate_depend_eliminate]: 3.7e-06 [updatestate_assign_eliminate]: 3.86001e-06 [updatestate_loads_eliminate]: 3.45e-06 [parameter_eliminate]: 1.99999e-06 [a_2]: 0.00012174 [accelerated_algorithm]: 7.11001e-06 [shard]: 2.02999e-06 [meta_shard_fg_expand]: 1.72999e-06 [shard_inline]: 6.29999e-06 [merge_send_recv]: 8.42e-06 [auto_parallel]: 6.44001e-06 [parallel]: 1.871e-05 [flash_sp]: 7.84002e-06 [merge_comm]: 4.07e-06 [allreduce_fusion]: 3.68999e-06 [matmul_add_comm_reduction]: 9.08002e-06 [allreduce_slice_to_reducescatter]: 5.50004e-07 [virtual_shard_identity]: 7.14001e-06 [virtual_dataset]: 6.43e-06 [get_grad_eliminate_]: 5.91003e-06 [virtual_output]: 6.09999e-06 [merge_forward]: 3.58e-06 [cell_reuse_recompute_pass]: 1.41002e-06 [offload_activation]: 9.91998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.481e-05 [merge_recompute_call_nodes]: 1.55001e-06 [before_grad]: 1.051e-05 [set_forward_comm_id_for_comm_node_pass]: 3.68e-06 [meta_fg_expand]: 2.68e-06 [flash_sp_send_recv_attached]: 2.49999e-06 [receive_attached]: 2.37999e-06 [after_resolve]: 9.59999e-06 [a_after_grad]: 9.71998e-06 [renormalize]: 0.00044437 [add_forward_monad_depend]: 4.77e-06 [auto_monad_grad]: 2.16003e-06 [auto_monad_eliminator]: 1.308e-05 [cse]: 2.786e-05 [a_3]: 5.833e-05 [Cycle 2]: 0.0008205, [45] [expand_dump_flag]: 1.07998e-06 [switch_simplify]: 6.87002e-06 [loop_unroll]: 5.94e-06 [a_1]: 0.00012953 [with_stream_mark]: 1.096e-05 [recompute_prepare]: 6.46e-06 [updatestate_depend_eliminate]: 3.09999e-06 [updatestate_assign_eliminate]: 2.41e-06 [updatestate_loads_eliminate]: 2.58e-06 [parameter_eliminate]: 9.39996e-07 [a_2]: 0.00011051 [accelerated_algorithm]: 6.34999e-06 [shard]: 1.02998e-06 [meta_shard_fg_expand]: 1.30999e-06 [shard_inline]: 6.36e-06 [merge_send_recv]: 5.29998e-06 [auto_parallel]: 5.76998e-06 [parallel]: 4.36002e-06 [flash_sp]: 3.7e-06 [merge_comm]: 3.21999e-06 [allreduce_fusion]: 2.99999e-06 [matmul_add_comm_reduction]: 5.62001e-06 [allreduce_slice_to_reducescatter]: 3.00002e-07 [virtual_shard_identity]: 6.44001e-06 [virtual_dataset]: 5.67001e-06 [get_grad_eliminate_]: 5.67999e-06 [virtual_output]: 5.53002e-06 [merge_forward]: 2.48e-06 [cell_reuse_recompute_pass]: 1.51998e-06 [offload_activation]: 6.61e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.308e-05 [merge_recompute_call_nodes]: 1.00001e-06 [before_grad]: 9.20999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.7e-06 [meta_fg_expand]: 2.14e-06 [flash_sp_send_recv_attached]: 9.20001e-07 [receive_attached]: 9.70002e-07 [after_resolve]: 8.65999e-06 [a_after_grad]: 8.25999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.49e-06 [auto_monad_grad]: 1.04e-06 [auto_monad_eliminator]: 7.35e-06 [cse]: 1.598e-05 [a_3]: 4.927e-05 [py_interpret_to_execute_after_opt_a]: 1.174e-05 [slice_cell_reuse_recomputed_activation]: 4.72998e-06 [rewriter_after_opt_a]: 3.761e-05 [convert_after_rewriter]: 9.66e-06 [order_py_execute_after_rewriter]: 8.15e-06 [mutable_eliminate]: 0.0005103 [opt_b]: 0.00026234, [1] [Cycle 1]: 0.00025322, [7] [b_1]: 0.00015933 [b_2]: 7.6e-06 [updatestate_depend_eliminate]: 5.78997e-06 [updatestate_assign_eliminate]: 2.57001e-06 [updatestate_loads_eliminate]: 2.64001e-06 [renormalize]: 4.89992e-07 [cse]: 1.851e-05 [optimize_parallel_all_gather_comm]: 1.912e-05 [overlap_param_gather]: 5.31002e-06 [cconv]: 2.86e-05 [loop_unroll]: 0.00043678 [opt_after_cconv]: 0.00012967, [1] [Cycle 1]: 0.00012044, [7] [c_1]: 2.751e-05 [parameter_eliminate]: 2.80997e-06 [updatestate_depend_eliminate]: 5.19998e-06 [updatestate_assign_eliminate]: 2.71999e-06 [updatestate_loads_eliminate]: 2.74999e-06 [cse]: 2.079e-05 [renormalize]: 4.39992e-07 [remove_dup_value]: 1.708e-05 [tuple_transform]: 8.693e-05, [1] [Cycle 1]: 7.966e-05, [4] [d_1]: 3.915e-05 [none_parameter_eliminate]: 1.56002e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 7.01999e-06 [partial_unused_args_eliminate]: 4.63999e-06 [add_recomputation]: 5.059e-05 [cse_after_recomputation]: 2.772e-05, [1] [Cycle 1]: 2.099e-05, [1] [cse]: 1.171e-05 [environ_conv]: 7.66999e-06 [swap_dp_allreduce_reducescatter]: 7.9e-06 [bias_add_comm_swap]: 5.23002e-06 [label_micro_interleaved_index]: 7.05998e-06 [label_fine_grained_interleaved_index]: 4.95999e-06 [merge_cast_opt]: 3.8e-06 [slice_recompute_activation]: 4.3e-06 [micro_interleaved_order_control]: 4.40999e-06 [assign_add_opt]: 3.95e-06 [ForceFp32Comm]: 3.16001e-06 [remove_cast_before_assign_add]: 3.36999e-06 [full_micro_interleaved_order_control]: 4.45e-06 [reorder_send_recv_between_fp_bp]: 5.46e-06 [comm_op_add_attrs]: 3.56001e-06 [add_comm_op_reuse_tag]: 3.29001e-06 [interleave_split_concat_branches]: 3.88999e-06 [interleave_parallel_branches]: 3.52997e-06 [overlap_opt_shard_in_pipeline]: 3.58e-06 [overlap_opt_shard_grad_in_pipeline]: 4.07e-06 [control_data_broadcast_order]: 1.537e-05 [grouped_pairwise_exchange_alltoall]: 3.8e-06 [offloading_packed_experts]: 6.62002e-06 [overlap_recompute_and_grad_model_parallel]: 7.23e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.56999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.69002e-06 [overlap_recompute_comm]: 4.97e-06 [overlap_grad_ring_attention]: 6.71e-06 [overlap_grad_flash_sp]: 2.16e-05 [begin_end_overlap_inline]: 3.03e-06 [split_matmul_comm_elemetwise]: 5.04998e-06 [split_layernorm_comm]: 3.98999e-06 [handle_group_info]: 3.55e-06 [symbol_engine_optimizer]: 9.65e-05, [1] [Cycle 1]: 8.986e-05, [6] [build]: 2.56e-06 [elim_shapecalc]: 1.01e-05 [elim_not_effective]: 1.334e-05 [opt_reshape]: 7.17002e-06 [fold_const_symbol]: 1.042e-05 [renormalize]: 2.69996e-07 [detach_backward]: 3.55998e-06 [pipeline_parallel_scheduler]: 1.64e-06 [auto_monad_reorder]: 1.845e-05 [get_jit_bprop_graph]: 1.77999e-06 [rewriter_after_jit_bprop_graph]: 4.12e-06 [opt_after_jit_grad]: 0.00049958 [validate]: 4.209e-05 Sums bootstrap : 0.000447s : 4.78% type_inference : 0.004736s : 50.61% event_method : 0.000012s : 0.13% auto_monad : 0.000054s : 0.57% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000024s : 0.26% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.03% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000021s : 0.22% optimize.rewriter_before_opt_a : 0.000046s : 0.49% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000031s : 0.34% optimize.opt_a.loop_unroll : 0.000020s : 0.21% optimize.opt_a.a_1 : 0.000447s : 4.78% optimize.opt_a.with_stream_mark : 0.000027s : 0.29% optimize.opt_a.recompute_prepare : 0.000015s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000232s : 2.48% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.14% optimize.opt_a.merge_send_recv : 0.000014s : 0.15% optimize.opt_a.auto_parallel : 0.000012s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.25% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.15% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000006s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.04% optimize.opt_a.after_resolve : 0.000018s : 0.20% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000444s : 4.75% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000020s : 0.22% optimize.opt_a.cse : 0.000044s : 0.47% optimize.opt_a.a_3 : 0.000108s : 1.15% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000038s : 0.40% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000008s : 0.09% optimize.mutable_eliminate : 0.000510s : 5.45% optimize.opt_b.b_1 : 0.000159s : 1.70% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.01% optimize.opt_b.cse : 0.000019s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.06% optimize.cconv : 0.000029s : 0.31% optimize.loop_unroll : 0.000437s : 4.67% optimize.opt_after_cconv.c_1 : 0.000028s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000021s : 0.22% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.18% optimize.tuple_transform.d_1 : 0.000039s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000051s : 0.54% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000008s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.06% optimize.label_micro_interleaved_index : 0.000007s : 0.08% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.05% optimize.micro_interleaved_order_control : 0.000004s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.04% optimize.full_micro_interleaved_order_control : 0.000004s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.06% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.04% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000015s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000022s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000500s : 5.34% validate : 0.000042s : 0.45% Time group info: ------[substitution.] 0.000171 23 39.91% : 0.000068s : 4: substitution.arithmetic_simplify 1.11% : 0.000002s : 2: substitution.elim_not_effective 0.82% : 0.000001s : 2: substitution.fold_const_symbol 3.11% : 0.000005s : 3: substitution.graph_param_transform 48.41% : 0.000083s : 2: substitution.inline 2.02% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.94% : 0.000005s : 4: substitution.remove_not_recompute_node 1.69% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004688 2 91.08% : 0.004270s : 1: type_inference.infer 8.92% : 0.000418s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000081 2 100.00% : 0.000081s : 2: match.inline ------[predicate.] 0.000135 754 0.87% : 0.000001s : 7: predicate.accumulaten_eliminater 1.35% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.65% : 0.000001s : 6: predicate.addn_check_dump 0.90% : 0.000001s : 7: predicate.addn_zero_filter 0.67% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.96% : 0.000004s : 13: predicate.arithmetic_simplify 0.82% : 0.000001s : 7: predicate.cast_eliminate 0.75% : 0.000001s : 6: predicate.check_bprop_eliminate 0.66% : 0.000001s : 6: predicate.compare_switch_simplify 0.23% : 0.000000s : 3: predicate.const_output_eliminate 0.75% : 0.000001s : 6: predicate.depend_value_elim 0.74% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.87% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.73% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.18% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 3: predicate.elim_not_effective 0.77% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.03% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.95% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_depend_swap 1.78% : 0.000002s : 16: predicate.environ_get_eliminate 1.04% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.96% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.03% : 0.000003s : 9: predicate.float_depend_g_call 0.64% : 0.000001s : 6: predicate.float_environ_get_switch 0.98% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.24% : 0.000000s : 3: predicate.fold_const_symbol 0.87% : 0.000001s : 6: predicate.get_grad_eliminate 0.24% : 0.000000s : 3: predicate.graph_param_transform 0.76% : 0.000001s : 6: predicate.incorporate_call 0.67% : 0.000001s : 6: predicate.incorporate_call_switch 6.30% : 0.000009s : 34: predicate.inline 0.98% : 0.000001s : 6: predicate.inline_without_move 0.44% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.18% : 0.000002s : 6: predicate.less_batch_normalization 1.52% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.06% : 0.000003s : 20: predicate.load_eliminater 1.39% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.69% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.63% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.72% : 0.000001s : 6: predicate.merge_addn 0.73% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.72% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.72% : 0.000001s : 7: predicate.minmaximum_grad 1.22% : 0.000002s : 3: predicate.mutable_eliminate 0.50% : 0.000001s : 3: predicate.opt_reshape 0.50% : 0.000001s : 3: predicate.parallel_virtual_node 1.36% : 0.000002s : 9: predicate.partial_defer_inline 1.18% : 0.000002s : 10: predicate.partial_eliminate 0.77% : 0.000001s : 7: predicate.print_const_string_wrapper 0.78% : 0.000001s : 6: predicate.reduce_all_const_elim 0.96% : 0.000001s : 7: predicate.reduce_eliminate 2.11% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.85% : 0.000001s : 6: predicate.remove_not_recompute_node 1.16% : 0.000002s : 13: predicate.replace_applicator 0.67% : 0.000001s : 6: predicate.replace_old_param 0.34% : 0.000000s : 3: predicate.reset_defer_inline 0.77% : 0.000001s : 7: predicate.reshape_eliminate 0.78% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 3: predicate.row_tensor_eliminate 1.04% : 0.000001s : 6: predicate.same_eliminate 0.55% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.21% : 0.000002s : 6: predicate.shard_identity_eliminate 0.90% : 0.000001s : 6: predicate.special_op_eliminate 0.98% : 0.000001s : 6: predicate.specialize_transform 1.08% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 1.01% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.99% : 0.000001s : 9: predicate.switch_defer_inline 1.72% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.43% : 0.000006s : 32: predicate.switch_simplify 0.72% : 0.000001s : 7: predicate.tile_eliminate 0.78% : 0.000001s : 7: predicate.transpose_eliminate 1.63% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.71% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.43% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.60% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.57% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.97% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.84% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 3: predicate.value_based_eliminate 0.82% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.83% : 0.000001s : 6: predicate.virtual_output_eliminate 0.33% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.55% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000201 5 8.35% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.65% : 0.000184s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023364 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.34% : 0.003117s : 1: add_attr 13.29% : 0.003104s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000054s : 1: add_recomputation 0.03% : 0.000007s : 1: assign_add_opt 0.27% : 0.000062s : 1: auto_monad 0.11% : 0.000025s : 1: auto_monad_reorder 0.03% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 2.09% : 0.000489s : 1: bootstrap 0.14% : 0.000032s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000031s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000018s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.09% : 0.000021s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.04% : 0.000008s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.89% : 0.000443s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.21% : 0.000516s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 3.59% : 0.000838s : 78: opt.transform.opt_a 0.11% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.41% : 0.000096s : 28: opt.transform.opt_b 0.19% : 0.000044s : 2: opt.transform.opt_trans_graph 0.16% : 0.000037s : 4: opt.transform.symbol_engine_opt 10.50% : 0.002453s : 1: opt_a 0.57% : 0.000134s : 1: opt_after_cconv 2.19% : 0.000512s : 1: opt_after_jit_grad 1.14% : 0.000266s : 1: opt_b 21.41% : 0.005002s : 1: optimize 0.10% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.11% : 0.000025s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.04% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000032s : 1: pre_auto_parallel 0.10% : 0.000024s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000020s : 1: remove_dup_value 1.02% : 0.000237s : 1: renormalize.infer 0.85% : 0.000199s : 1: renormalize.specialize 0.04% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000041s : 1: rewriter_after_opt_a 0.21% : 0.000050s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.05% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.43% : 0.000099s : 1: symbol_engine_optimizer 0.38% : 0.000090s : 1: tuple_transform 20.38% : 0.004761s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:48.713.784 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0130642, [21] [bootstrap]: 0.00043515 [type_inference]: 0.00472844 [event_method]: 1.199e-05 [auto_monad]: 5.399e-05 [graph_reusing]: 5.37001e-06 [inline]: 2.55002e-06 [add_attr]: 0.00308708, [1] [add_attr_with_inline]: 0.0030789, [1] [Cycle 1]: 5.461e-05, [2] [tag_attr]: 1.406e-05 [meta_addattr_fg_expand]: 4e-06 [parallel-infer-symbol]: 3.45998e-06 [pre_auto_parallel]: 2.505e-05 [insert-virtual-dataset]: 2.78e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.82999e-06 [pipeline_split]: 1.58002e-06 [optimize]: 0.0040235, [53] [py_interpret_to_execute]: 1.772e-05 [rewriter_before_opt_a]: 4.479e-05 [opt_a]: 0.00213802, [2] [Cycle 1]: 0.00147584, [45] [expand_dump_flag]: 2.83e-06 [switch_simplify]: 2.652e-05 [loop_unroll]: 1.358e-05 [a_1]: 0.0003336 [with_stream_mark]: 1.495e-05 [recompute_prepare]: 8.61997e-06 [updatestate_depend_eliminate]: 4.18001e-06 [updatestate_assign_eliminate]: 3.73001e-06 [updatestate_loads_eliminate]: 3.50998e-06 [parameter_eliminate]: 1.79998e-06 [a_2]: 9.253e-05 [accelerated_algorithm]: 7.09001e-06 [shard]: 2.19999e-06 [meta_shard_fg_expand]: 1.66e-06 [shard_inline]: 6.04999e-06 [merge_send_recv]: 8.22998e-06 [auto_parallel]: 6.30002e-06 [parallel]: 1.756e-05 [flash_sp]: 8.11002e-06 [merge_comm]: 3.86999e-06 [allreduce_fusion]: 3.29001e-06 [matmul_add_comm_reduction]: 9.25999e-06 [allreduce_slice_to_reducescatter]: 8.89995e-07 [virtual_shard_identity]: 8.14002e-06 [virtual_dataset]: 6.30002e-06 [get_grad_eliminate_]: 6.07999e-06 [virtual_output]: 5.80002e-06 [merge_forward]: 4.08999e-06 [cell_reuse_recompute_pass]: 1.45001e-06 [offload_activation]: 9.22999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.239e-05 [merge_recompute_call_nodes]: 1.60001e-06 [before_grad]: 9.57001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.81001e-06 [meta_fg_expand]: 2.68e-06 [flash_sp_send_recv_attached]: 2.39001e-06 [receive_attached]: 2.34001e-06 [after_resolve]: 1.002e-05 [a_after_grad]: 8.77999e-06 [renormalize]: 0.00045794 [add_forward_monad_depend]: 4.96002e-06 [auto_monad_grad]: 2.09e-06 [auto_monad_eliminator]: 1.448e-05 [cse]: 3.058e-05 [a_3]: 4.545e-05 [Cycle 2]: 0.00065259, [45] [expand_dump_flag]: 1.09e-06 [switch_simplify]: 7.6e-06 [loop_unroll]: 5.78002e-06 [a_1]: 0.00013062 [with_stream_mark]: 1.079e-05 [recompute_prepare]: 6.33e-06 [updatestate_depend_eliminate]: 3.34001e-06 [updatestate_assign_eliminate]: 2.26e-06 [updatestate_loads_eliminate]: 3.04999e-06 [parameter_eliminate]: 1.01002e-06 [a_2]: 8.301e-05 [accelerated_algorithm]: 6.28e-06 [shard]: 1.21002e-06 [meta_shard_fg_expand]: 1.37e-06 [shard_inline]: 6.11e-06 [merge_send_recv]: 5.07e-06 [auto_parallel]: 5.81e-06 [parallel]: 4.77998e-06 [flash_sp]: 3.61001e-06 [merge_comm]: 3.27997e-06 [allreduce_fusion]: 3.06001e-06 [matmul_add_comm_reduction]: 5.79e-06 [allreduce_slice_to_reducescatter]: 3.19997e-07 [virtual_shard_identity]: 6.71999e-06 [virtual_dataset]: 5.69e-06 [get_grad_eliminate_]: 5.41998e-06 [virtual_output]: 5.29e-06 [merge_forward]: 3.2e-06 [cell_reuse_recompute_pass]: 1.45001e-06 [offload_activation]: 7.25e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.045e-05 [merge_recompute_call_nodes]: 8.29983e-07 [before_grad]: 8.94998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.96001e-06 [meta_fg_expand]: 2.21e-06 [flash_sp_send_recv_attached]: 9.70002e-07 [receive_attached]: 1.03001e-06 [after_resolve]: 9.00999e-06 [a_after_grad]: 8.20999e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.49e-06 [auto_monad_grad]: 9.50007e-07 [auto_monad_eliminator]: 7.61001e-06 [cse]: 1.486e-05 [a_3]: 3.442e-05 [py_interpret_to_execute_after_opt_a]: 8.79003e-06 [slice_cell_reuse_recomputed_activation]: 1.95001e-06 [rewriter_after_opt_a]: 3.409e-05 [convert_after_rewriter]: 6.68998e-06 [order_py_execute_after_rewriter]: 5.30001e-06 [mutable_eliminate]: 0.0004798 [opt_b]: 0.00019642, [1] [Cycle 1]: 0.00019017, [7] [b_1]: 0.00011703 [b_2]: 7.52002e-06 [updatestate_depend_eliminate]: 5.67999e-06 [updatestate_assign_eliminate]: 2.30002e-06 [updatestate_loads_eliminate]: 2.22001e-06 [renormalize]: 4.69998e-07 [cse]: 1.856e-05 [optimize_parallel_all_gather_comm]: 1.566e-05 [overlap_param_gather]: 2.00002e-06 [cconv]: 2.511e-05 [loop_unroll]: 0.00041585 [opt_after_cconv]: 9.616e-05, [1] [Cycle 1]: 9.092e-05, [7] [c_1]: 2.754e-05 [parameter_eliminate]: 2.70997e-06 [updatestate_depend_eliminate]: 5.08002e-06 [updatestate_assign_eliminate]: 2.39001e-06 [updatestate_loads_eliminate]: 2.37999e-06 [cse]: 1.719e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.365e-05 [tuple_transform]: 7.199e-05, [1] [Cycle 1]: 6.733e-05, [4] [d_1]: 4.037e-05 [none_parameter_eliminate]: 1.62999e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.51e-06 [partial_unused_args_eliminate]: 1.91e-06 [add_recomputation]: 4.403e-05 [cse_after_recomputation]: 2.149e-05, [1] [Cycle 1]: 1.685e-05, [1] [cse]: 1.165e-05 [environ_conv]: 5.39998e-06 [swap_dp_allreduce_reducescatter]: 5.26998e-06 [bias_add_comm_swap]: 2.40002e-06 [label_micro_interleaved_index]: 4.08001e-06 [label_fine_grained_interleaved_index]: 2.69001e-06 [merge_cast_opt]: 1.32e-06 [slice_recompute_activation]: 2.23002e-06 [micro_interleaved_order_control]: 2.06998e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 1.09003e-06 [remove_cast_before_assign_add]: 1.24998e-06 [full_micro_interleaved_order_control]: 2.21e-06 [reorder_send_recv_between_fp_bp]: 2.79999e-06 [comm_op_add_attrs]: 1.08001e-06 [add_comm_op_reuse_tag]: 1.40999e-06 [interleave_split_concat_branches]: 1.09e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.12999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.67999e-06 [control_data_broadcast_order]: 1.19e-05 [grouped_pairwise_exchange_alltoall]: 1.54e-06 [offloading_packed_experts]: 3.76001e-06 [overlap_recompute_and_grad_model_parallel]: 5.00001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.16002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.52001e-06 [overlap_recompute_comm]: 2.23002e-06 [overlap_grad_ring_attention]: 4.2e-06 [overlap_grad_flash_sp]: 1.734e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.49001e-06 [split_layernorm_comm]: 1.61002e-06 [handle_group_info]: 1.47999e-06 [symbol_engine_optimizer]: 7.282e-05, [1] [Cycle 1]: 6.833e-05, [6] [build]: 2.51e-06 [elim_shapecalc]: 9.47999e-06 [elim_not_effective]: 1.239e-05 [opt_reshape]: 6.61e-06 [fold_const_symbol]: 9.44e-06 [renormalize]: 2.50002e-07 [detach_backward]: 2.27001e-06 [pipeline_parallel_scheduler]: 1.57999e-06 [auto_monad_reorder]: 1.666e-05 [get_jit_bprop_graph]: 1.20001e-06 [rewriter_after_jit_bprop_graph]: 4.23999e-06 [opt_after_jit_grad]: 0.00047223 [validate]: 3.641e-05 Sums bootstrap : 0.000435s : 4.82% type_inference : 0.004728s : 52.40% event_method : 0.000012s : 0.13% auto_monad : 0.000054s : 0.60% graph_reusing : 0.000005s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.04% pre_auto_parallel : 0.000025s : 0.28% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000018s : 0.20% optimize.rewriter_before_opt_a : 0.000045s : 0.50% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000034s : 0.38% optimize.opt_a.loop_unroll : 0.000019s : 0.21% optimize.opt_a.a_1 : 0.000464s : 5.14% optimize.opt_a.with_stream_mark : 0.000026s : 0.29% optimize.opt_a.recompute_prepare : 0.000015s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000176s : 1.95% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.15% optimize.opt_a.shard : 0.000003s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.13% optimize.opt_a.merge_send_recv : 0.000013s : 0.15% optimize.opt_a.auto_parallel : 0.000012s : 0.13% optimize.opt_a.parallel : 0.000022s : 0.25% optimize.opt_a.flash_sp : 0.000012s : 0.13% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000006s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.13% optimize.opt_a.virtual_output : 0.000011s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.03% optimize.opt_a.before_grad : 0.000019s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.04% optimize.opt_a.after_resolve : 0.000019s : 0.21% optimize.opt_a.a_after_grad : 0.000017s : 0.19% optimize.opt_a.renormalize : 0.000458s : 5.08% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.24% optimize.opt_a.cse : 0.000045s : 0.50% optimize.opt_a.a_3 : 0.000080s : 0.89% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000034s : 0.38% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.06% optimize.mutable_eliminate : 0.000480s : 5.32% optimize.opt_b.b_1 : 0.000117s : 1.30% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.01% optimize.opt_b.cse : 0.000019s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000025s : 0.28% optimize.loop_unroll : 0.000416s : 4.61% optimize.opt_after_cconv.c_1 : 0.000028s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.cse : 0.000017s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.15% optimize.tuple_transform.d_1 : 0.000040s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000044s : 0.49% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000005s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.06% optimize.bias_add_comm_swap : 0.000002s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.02% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.02% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.05% optimize.overlap_grad_flash_sp : 0.000017s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.03% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.02% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000017s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.05% opt_after_jit_grad : 0.000472s : 5.23% validate : 0.000036s : 0.40% Time group info: ------[substitution.] 0.000176 23 40.40% : 0.000071s : 4: substitution.arithmetic_simplify 1.06% : 0.000002s : 2: substitution.elim_not_effective 0.71% : 0.000001s : 2: substitution.fold_const_symbol 3.19% : 0.000006s : 3: substitution.graph_param_transform 48.21% : 0.000085s : 2: substitution.inline 1.82% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.63% : 0.000005s : 4: substitution.remove_not_recompute_node 1.99% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004682 2 90.35% : 0.004230s : 1: type_inference.infer 9.65% : 0.000452s : 1: type_inference.specialize ------[replace.] 0.000033 2 100.00% : 0.000033s : 2: replace.inline ------[match.] 0.000083 2 100.00% : 0.000083s : 2: match.inline ------[predicate.] 0.000134 754 0.83% : 0.000001s : 7: predicate.accumulaten_eliminater 1.46% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.63% : 0.000001s : 6: predicate.addn_check_dump 0.82% : 0.000001s : 7: predicate.addn_zero_filter 0.70% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.70% : 0.000004s : 13: predicate.arithmetic_simplify 0.79% : 0.000001s : 7: predicate.cast_eliminate 0.68% : 0.000001s : 6: predicate.check_bprop_eliminate 0.71% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.71% : 0.000001s : 6: predicate.depend_value_elim 0.78% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.96% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.79% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.28% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 3: predicate.elim_not_effective 0.48% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.01% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.10% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.92% : 0.000001s : 10: predicate.environ_get_depend_swap 1.70% : 0.000002s : 16: predicate.environ_get_eliminate 1.02% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.94% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.98% : 0.000003s : 9: predicate.float_depend_g_call 0.68% : 0.000001s : 6: predicate.float_environ_get_switch 1.18% : 0.000002s : 9: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 3: predicate.fold_const_symbol 0.77% : 0.000001s : 6: predicate.get_grad_eliminate 0.41% : 0.000001s : 3: predicate.graph_param_transform 0.81% : 0.000001s : 6: predicate.incorporate_call 0.67% : 0.000001s : 6: predicate.incorporate_call_switch 6.50% : 0.000009s : 34: predicate.inline 1.01% : 0.000001s : 6: predicate.inline_without_move 0.39% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.08% : 0.000001s : 6: predicate.less_batch_normalization 1.65% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.06% : 0.000003s : 20: predicate.load_eliminater 1.13% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.64% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.73% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.75% : 0.000001s : 6: predicate.merge_addn 0.77% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.72% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.93% : 0.000001s : 7: predicate.minmaximum_grad 1.38% : 0.000002s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.42% : 0.000001s : 3: predicate.parallel_virtual_node 1.33% : 0.000002s : 9: predicate.partial_defer_inline 1.23% : 0.000002s : 10: predicate.partial_eliminate 0.75% : 0.000001s : 7: predicate.print_const_string_wrapper 0.76% : 0.000001s : 6: predicate.reduce_all_const_elim 1.08% : 0.000001s : 7: predicate.reduce_eliminate 2.06% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.63% : 0.000001s : 6: predicate.remove_not_recompute_node 1.19% : 0.000002s : 13: predicate.replace_applicator 0.75% : 0.000001s : 6: predicate.replace_old_param 0.34% : 0.000000s : 3: predicate.reset_defer_inline 0.98% : 0.000001s : 7: predicate.reshape_eliminate 0.70% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 3: predicate.row_tensor_eliminate 0.95% : 0.000001s : 6: predicate.same_eliminate 0.55% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.04% : 0.000001s : 6: predicate.shard_identity_eliminate 0.89% : 0.000001s : 6: predicate.special_op_eliminate 0.92% : 0.000001s : 6: predicate.specialize_transform 1.23% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.98% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.01% : 0.000001s : 9: predicate.switch_defer_inline 1.71% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.66% : 0.000006s : 32: predicate.switch_simplify 0.81% : 0.000001s : 7: predicate.tile_eliminate 0.82% : 0.000001s : 7: predicate.transpose_eliminate 1.51% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.64% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.03% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.35% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.43% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.56% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.94% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.91% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.52% : 0.000001s : 3: predicate.value_based_eliminate 0.80% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.76% : 0.000001s : 6: predicate.virtual_output_eliminate 0.37% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000230 5 7.35% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.65% : 0.000213s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.021610 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.31% : 0.003092s : 1: add_attr 14.26% : 0.003082s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000048s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000059s : 1: auto_monad 0.10% : 0.000021s : 1: auto_monad_reorder 0.02% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.15% : 0.000465s : 1: bootstrap 0.13% : 0.000028s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000015s : 1: control_data_broadcast_order 0.05% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.03% : 0.000006s : 1: detach_backward 0.04% : 0.000008s : 1: environ_conv 0.08% : 0.000018s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.03% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.96% : 0.000424s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.26% : 0.000488s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000014s : 1: opt.transform.mutable_eliminate 3.92% : 0.000848s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.43% : 0.000094s : 28: opt.transform.opt_b 0.21% : 0.000044s : 2: opt.transform.opt_trans_graph 0.16% : 0.000034s : 4: opt.transform.symbol_engine_opt 9.91% : 0.002141s : 1: opt_a 0.46% : 0.000100s : 1: opt_after_cconv 2.23% : 0.000481s : 1: opt_after_jit_grad 0.92% : 0.000200s : 1: opt_b 18.64% : 0.004028s : 1: optimize 0.09% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000021s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000029s : 1: pre_auto_parallel 0.10% : 0.000021s : 1: py_interpret_to_execute 0.06% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000017s : 1: remove_dup_value 1.13% : 0.000245s : 1: renormalize.infer 0.95% : 0.000206s : 1: renormalize.specialize 0.03% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000038s : 1: rewriter_after_opt_a 0.23% : 0.000049s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000076s : 1: symbol_engine_optimizer 0.35% : 0.000075s : 1: tuple_transform 21.95% : 0.004743s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:48.904.413 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:48.904.693 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0150458, [21] [bootstrap]: 0.00043474 [type_inference]: 0.00493707 [event_method]: 1.331e-05 [auto_monad]: 5.517e-05 [graph_reusing]: 5.20001e-06 [inline]: 2.59001e-06 [add_attr]: 0.0032779, [1] [add_attr_with_inline]: 0.00326905, [1] [Cycle 1]: 7.376e-05, [2] [tag_attr]: 1.457e-05 [meta_addattr_fg_expand]: 4.24997e-06 [parallel-infer-symbol]: 3.76999e-06 [pre_auto_parallel]: 2.663e-05 [insert-virtual-dataset]: 2.59001e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 2.07001e-06 [pipeline_split]: 1.81e-06 [optimize]: 0.00508305, [53] [py_interpret_to_execute]: 2.277e-05 [rewriter_before_opt_a]: 5.074e-05 [opt_a]: 0.0027369, [2] [Cycle 1]: 0.0018137, [45] [expand_dump_flag]: 2.73e-06 [switch_simplify]: 2.54e-05 [loop_unroll]: 1.395e-05 [a_1]: 0.00033375 [with_stream_mark]: 1.866e-05 [recompute_prepare]: 8.50999e-06 [updatestate_depend_eliminate]: 4.08001e-06 [updatestate_assign_eliminate]: 3.6e-06 [updatestate_loads_eliminate]: 3.34001e-06 [parameter_eliminate]: 2.06998e-06 [a_2]: 0.00012605 [accelerated_algorithm]: 7.25e-06 [shard]: 2.26e-06 [meta_shard_fg_expand]: 1.71e-06 [shard_inline]: 6.93e-06 [merge_send_recv]: 8.85001e-06 [auto_parallel]: 7.23999e-06 [parallel]: 1.796e-05 [flash_sp]: 8.53001e-06 [merge_comm]: 3.99002e-06 [allreduce_fusion]: 3.68e-06 [matmul_add_comm_reduction]: 9.41998e-06 [allreduce_slice_to_reducescatter]: 9.70002e-07 [virtual_shard_identity]: 8.66002e-06 [virtual_dataset]: 6.50002e-06 [get_grad_eliminate_]: 6.58e-06 [virtual_output]: 6.02999e-06 [merge_forward]: 4.2e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 1.164e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.451e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 1.155e-05 [set_forward_comm_id_for_comm_node_pass]: 4.79e-06 [meta_fg_expand]: 3.06999e-06 [flash_sp_send_recv_attached]: 2.76e-06 [receive_attached]: 2.04e-06 [after_resolve]: 1.133e-05 [a_after_grad]: 9.80002e-06 [renormalize]: 0.00056256 [add_forward_monad_depend]: 5.71e-06 [auto_monad_grad]: 2.20002e-06 [auto_monad_eliminator]: 1.538e-05 [cse]: 2.887e-05 [a_3]: 6.365e-05 [Cycle 2]: 0.00090874, [45] [expand_dump_flag]: 1.20999e-06 [switch_simplify]: 7.54002e-06 [loop_unroll]: 6.19001e-06 [a_1]: 0.00013701 [with_stream_mark]: 1.316e-05 [recompute_prepare]: 7.6e-06 [updatestate_depend_eliminate]: 3.66001e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.71999e-06 [parameter_eliminate]: 1.35999e-06 [a_2]: 0.00011348 [accelerated_algorithm]: 7.14001e-06 [shard]: 1.38002e-06 [meta_shard_fg_expand]: 1.41002e-06 [shard_inline]: 6.79999e-06 [merge_send_recv]: 7.18e-06 [auto_parallel]: 6.49999e-06 [parallel]: 5.59e-06 [flash_sp]: 3.66001e-06 [merge_comm]: 3.78999e-06 [allreduce_fusion]: 3.38999e-06 [matmul_add_comm_reduction]: 6.73e-06 [allreduce_slice_to_reducescatter]: 5.50004e-07 [virtual_shard_identity]: 7.25e-06 [virtual_dataset]: 6.33e-06 [get_grad_eliminate_]: 5.52001e-06 [virtual_output]: 5.56e-06 [merge_forward]: 3.31001e-06 [cell_reuse_recompute_pass]: 1.47999e-06 [offload_activation]: 8.37e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.405e-05 [merge_recompute_call_nodes]: 1.20001e-06 [before_grad]: 9.86e-06 [set_forward_comm_id_for_comm_node_pass]: 4.35e-06 [meta_fg_expand]: 2.71999e-06 [flash_sp_send_recv_attached]: 1.04e-06 [receive_attached]: 1.60001e-06 [after_resolve]: 1.089e-05 [a_after_grad]: 9.15999e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.78e-06 [auto_monad_grad]: 1.69998e-06 [auto_monad_eliminator]: 1.118e-05 [cse]: 1.972e-05 [a_3]: 5.015e-05 [py_interpret_to_execute_after_opt_a]: 1.394e-05 [slice_cell_reuse_recomputed_activation]: 4.60999e-06 [rewriter_after_opt_a]: 4.271e-05 [convert_after_rewriter]: 1.011e-05 [order_py_execute_after_rewriter]: 8.22998e-06 [mutable_eliminate]: 0.00055216 [opt_b]: 0.00027862, [1] [Cycle 1]: 0.00026854, [7] [b_1]: 0.00016576 [b_2]: 7.89002e-06 [updatestate_depend_eliminate]: 7.06999e-06 [updatestate_assign_eliminate]: 2.84001e-06 [updatestate_loads_eliminate]: 2.53e-06 [renormalize]: 6.69999e-07 [cse]: 2.522e-05 [optimize_parallel_all_gather_comm]: 2.075e-05 [overlap_param_gather]: 5.05001e-06 [cconv]: 3.208e-05 [loop_unroll]: 0.00046409 [opt_after_cconv]: 0.00013355, [1] [Cycle 1]: 0.00012423, [7] [c_1]: 2.856e-05 [parameter_eliminate]: 3.86999e-06 [updatestate_depend_eliminate]: 6.02999e-06 [updatestate_assign_eliminate]: 2.76e-06 [updatestate_loads_eliminate]: 2.91999e-06 [cse]: 2.26e-05 [renormalize]: 4.50003e-07 [remove_dup_value]: 1.776e-05 [tuple_transform]: 8.983e-05, [1] [Cycle 1]: 8.227e-05, [4] [d_1]: 4.225e-05 [none_parameter_eliminate]: 2.02999e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 6.74001e-06 [partial_unused_args_eliminate]: 4.61002e-06 [add_recomputation]: 5.111e-05 [cse_after_recomputation]: 2.774e-05, [1] [Cycle 1]: 2.107e-05, [1] [cse]: 1.162e-05 [environ_conv]: 8.10999e-06 [swap_dp_allreduce_reducescatter]: 8.05e-06 [bias_add_comm_swap]: 5.24998e-06 [label_micro_interleaved_index]: 6.83e-06 [label_fine_grained_interleaved_index]: 5.25999e-06 [merge_cast_opt]: 3.9e-06 [slice_recompute_activation]: 4.85999e-06 [micro_interleaved_order_control]: 5.17e-06 [assign_add_opt]: 3.68999e-06 [ForceFp32Comm]: 3.18e-06 [remove_cast_before_assign_add]: 3.92998e-06 [full_micro_interleaved_order_control]: 4.97e-06 [reorder_send_recv_between_fp_bp]: 5.65001e-06 [comm_op_add_attrs]: 3.93001e-06 [add_comm_op_reuse_tag]: 3.33e-06 [interleave_split_concat_branches]: 3.45e-06 [interleave_parallel_branches]: 3.60998e-06 [overlap_opt_shard_in_pipeline]: 3.76001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.35e-06 [control_data_broadcast_order]: 1.636e-05 [grouped_pairwise_exchange_alltoall]: 4.24002e-06 [offloading_packed_experts]: 7.44002e-06 [overlap_recompute_and_grad_model_parallel]: 7.7e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.04002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.71001e-06 [overlap_recompute_comm]: 5.27001e-06 [overlap_grad_ring_attention]: 7.25003e-06 [overlap_grad_flash_sp]: 2.377e-05 [begin_end_overlap_inline]: 3.06999e-06 [split_matmul_comm_elemetwise]: 4.64002e-06 [split_layernorm_comm]: 4.28999e-06 [handle_group_info]: 3.38999e-06 [symbol_engine_optimizer]: 0.00010717, [1] [Cycle 1]: 9.928e-05, [6] [build]: 3.23e-06 [elim_shapecalc]: 1.226e-05 [elim_not_effective]: 1.366e-05 [opt_reshape]: 7.66999e-06 [fold_const_symbol]: 1.071e-05 [renormalize]: 2.09984e-07 [detach_backward]: 3.28e-06 [pipeline_parallel_scheduler]: 1.84998e-06 [auto_monad_reorder]: 2.068e-05 [get_jit_bprop_graph]: 1.89e-06 [rewriter_after_jit_bprop_graph]: 5.17e-06 [opt_after_jit_grad]: 0.00052103 [validate]: 4.061e-05 Sums bootstrap : 0.000435s : 4.39% type_inference : 0.004937s : 49.82% event_method : 0.000013s : 0.13% auto_monad : 0.000055s : 0.56% graph_reusing : 0.000005s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000027s : 0.27% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000023s : 0.23% optimize.rewriter_before_opt_a : 0.000051s : 0.51% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.33% optimize.opt_a.loop_unroll : 0.000020s : 0.20% optimize.opt_a.a_1 : 0.000471s : 4.75% optimize.opt_a.with_stream_mark : 0.000032s : 0.32% optimize.opt_a.recompute_prepare : 0.000016s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000240s : 2.42% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.15% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.14% optimize.opt_a.merge_send_recv : 0.000016s : 0.16% optimize.opt_a.auto_parallel : 0.000014s : 0.14% optimize.opt_a.parallel : 0.000024s : 0.24% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.02% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.16% optimize.opt_a.virtual_dataset : 0.000013s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.20% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000021s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000022s : 0.22% optimize.opt_a.a_after_grad : 0.000019s : 0.19% optimize.opt_a.renormalize : 0.000563s : 5.68% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.09% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.27% optimize.opt_a.cse : 0.000049s : 0.49% optimize.opt_a.a_3 : 0.000114s : 1.15% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000043s : 0.43% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000552s : 5.57% optimize.opt_b.b_1 : 0.000166s : 1.67% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000025s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000032s : 0.32% optimize.loop_unroll : 0.000464s : 4.68% optimize.opt_after_cconv.c_1 : 0.000029s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.18% optimize.tuple_transform.d_1 : 0.000042s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000051s : 0.52% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000008s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.06% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000016s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.08% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000024s : 0.24% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000021s : 0.21% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000521s : 5.26% validate : 0.000041s : 0.41% Time group info: ------[substitution.] 0.000191 23 40.04% : 0.000077s : 4: substitution.arithmetic_simplify 1.02% : 0.000002s : 2: substitution.elim_not_effective 0.68% : 0.000001s : 2: substitution.fold_const_symbol 2.78% : 0.000005s : 3: substitution.graph_param_transform 48.56% : 0.000093s : 2: substitution.inline 2.04% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.30% : 0.000004s : 4: substitution.remove_not_recompute_node 2.58% : 0.000005s : 2: substitution.replace_old_param ------[type_inference.] 0.004884 2 90.98% : 0.004443s : 1: type_inference.infer 9.02% : 0.000441s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000091 2 100.00% : 0.000091s : 2: match.inline ------[predicate.] 0.000143 754 0.73% : 0.000001s : 7: predicate.accumulaten_eliminater 1.55% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.60% : 0.000001s : 6: predicate.addn_check_dump 1.01% : 0.000001s : 7: predicate.addn_zero_filter 0.62% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 3.62% : 0.000005s : 13: predicate.arithmetic_simplify 0.78% : 0.000001s : 7: predicate.cast_eliminate 0.95% : 0.000001s : 6: predicate.check_bprop_eliminate 0.67% : 0.000001s : 6: predicate.compare_switch_simplify 0.23% : 0.000000s : 3: predicate.const_output_eliminate 0.79% : 0.000001s : 6: predicate.depend_value_elim 0.80% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.90% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.73% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.36% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 3: predicate.elim_not_effective 0.52% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.06% : 0.000002s : 10: predicate.environ_add_const_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.95% : 0.000001s : 10: predicate.environ_get_depend_swap 1.80% : 0.000003s : 16: predicate.environ_get_eliminate 0.91% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.88% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.81% : 0.000003s : 9: predicate.float_depend_g_call 0.64% : 0.000001s : 6: predicate.float_environ_get_switch 0.97% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 3: predicate.fold_const_symbol 0.76% : 0.000001s : 6: predicate.get_grad_eliminate 0.50% : 0.000001s : 3: predicate.graph_param_transform 0.83% : 0.000001s : 6: predicate.incorporate_call 0.63% : 0.000001s : 6: predicate.incorporate_call_switch 6.50% : 0.000009s : 34: predicate.inline 1.34% : 0.000002s : 6: predicate.inline_without_move 0.40% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.93% : 0.000001s : 6: predicate.less_batch_normalization 1.46% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.90% : 0.000003s : 20: predicate.load_eliminater 1.50% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.71% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.68% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.63% : 0.000001s : 6: predicate.merge_addn 0.69% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.74% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.63% : 0.000001s : 7: predicate.minmaximum_grad 1.98% : 0.000003s : 3: predicate.mutable_eliminate 0.60% : 0.000001s : 3: predicate.opt_reshape 0.46% : 0.000001s : 3: predicate.parallel_virtual_node 1.22% : 0.000002s : 9: predicate.partial_defer_inline 1.14% : 0.000002s : 10: predicate.partial_eliminate 0.73% : 0.000001s : 7: predicate.print_const_string_wrapper 0.66% : 0.000001s : 6: predicate.reduce_all_const_elim 0.96% : 0.000001s : 7: predicate.reduce_eliminate 2.03% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.56% : 0.000001s : 6: predicate.remove_not_recompute_node 1.09% : 0.000002s : 13: predicate.replace_applicator 0.75% : 0.000001s : 6: predicate.replace_old_param 0.35% : 0.000001s : 3: predicate.reset_defer_inline 1.04% : 0.000001s : 7: predicate.reshape_eliminate 0.66% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.64% : 0.000001s : 3: predicate.row_tensor_eliminate 1.08% : 0.000002s : 6: predicate.same_eliminate 0.46% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.92% : 0.000001s : 6: predicate.shard_identity_eliminate 0.82% : 0.000001s : 6: predicate.special_op_eliminate 1.06% : 0.000002s : 6: predicate.specialize_transform 1.20% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.92% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.94% : 0.000001s : 9: predicate.switch_defer_inline 1.61% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.32% : 0.000006s : 32: predicate.switch_simplify 0.77% : 0.000001s : 7: predicate.tile_eliminate 0.75% : 0.000001s : 7: predicate.transpose_eliminate 1.42% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.33% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.28% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.40% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.60% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.45% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.91% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.88% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 3: predicate.value_based_eliminate 0.76% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.81% : 0.000001s : 6: predicate.virtual_output_eliminate 0.29% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.60% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000224 5 8.52% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.48% : 0.000205s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024988 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.16% : 0.003288s : 1: add_attr 13.10% : 0.003273s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000055s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.25% : 0.000064s : 1: auto_monad 0.11% : 0.000028s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.90% : 0.000476s : 1: bootstrap 0.14% : 0.000035s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.12% : 0.000031s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.08% : 0.000021s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.09% : 0.000023s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.04% : 0.000009s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.88% : 0.000471s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.24% : 0.000560s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 3.53% : 0.000882s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.12% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000099s : 28: opt.transform.opt_b 0.18% : 0.000046s : 2: opt.transform.opt_trans_graph 0.16% : 0.000040s : 4: opt.transform.symbol_engine_opt 10.97% : 0.002740s : 1: opt_a 0.55% : 0.000137s : 1: opt_after_cconv 2.13% : 0.000533s : 1: opt_after_jit_grad 1.13% : 0.000283s : 1: opt_b 21.79% : 0.005446s : 1: optimize 0.10% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.11% : 0.000028s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000034s : 1: pre_auto_parallel 0.11% : 0.000026s : 1: py_interpret_to_execute 0.07% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000007s : 1: remove_cast_before_assign_add 0.08% : 0.000021s : 1: remove_dup_value 1.23% : 0.000308s : 1: renormalize.infer 0.98% : 0.000246s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.05% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000047s : 1: rewriter_after_opt_a 0.22% : 0.000054s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.44% : 0.000110s : 1: symbol_engine_optimizer 0.37% : 0.000093s : 1: tuple_transform 19.89% : 0.004970s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:49.991.41 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0144543, [21] [bootstrap]: 0.00044768 [type_inference]: 0.00514033 [event_method]: 1.389e-05 [auto_monad]: 5.869e-05 [graph_reusing]: 5.54998e-06 [inline]: 2.78e-06 [add_attr]: 0.0034898, [1] [add_attr_with_inline]: 0.00347974, [1] [Cycle 1]: 5.946e-05, [2] [tag_attr]: 1.645e-05 [meta_addattr_fg_expand]: 3.48e-06 [parallel-infer-symbol]: 3.24001e-06 [pre_auto_parallel]: 2.909e-05 [insert-virtual-dataset]: 2.79001e-06 [parallel-infer-symbol-second]: 7.60017e-07 [dataset_repeat_opt]: 2.61999e-06 [pipeline_split]: 1.54998e-06 [optimize]: 0.00448637, [53] [py_interpret_to_execute]: 2.01e-05 [rewriter_before_opt_a]: 4.857e-05 [opt_a]: 0.00239959, [2] [Cycle 1]: 0.00171396, [45] [expand_dump_flag]: 2.95998e-06 [switch_simplify]: 2.652e-05 [loop_unroll]: 1.388e-05 [a_1]: 0.00034597 [with_stream_mark]: 1.864e-05 [recompute_prepare]: 8.64e-06 [updatestate_depend_eliminate]: 4.79998e-06 [updatestate_assign_eliminate]: 3.67002e-06 [updatestate_loads_eliminate]: 4.2e-06 [parameter_eliminate]: 1.94e-06 [a_2]: 9.19e-05 [accelerated_algorithm]: 7.40998e-06 [shard]: 2.50002e-06 [meta_shard_fg_expand]: 1.96e-06 [shard_inline]: 6.39001e-06 [merge_send_recv]: 8.70001e-06 [auto_parallel]: 6.72002e-06 [parallel]: 1.879e-05 [flash_sp]: 8.45999e-06 [merge_comm]: 3.127e-05 [allreduce_fusion]: 3.59002e-06 [matmul_add_comm_reduction]: 9.54e-06 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 9.09e-06 [virtual_dataset]: 6.94999e-06 [get_grad_eliminate_]: 6.06e-06 [virtual_output]: 6.56e-06 [merge_forward]: 3.97e-06 [cell_reuse_recompute_pass]: 2.27999e-06 [offload_activation]: 1.066e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.321e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.093e-05 [set_forward_comm_id_for_comm_node_pass]: 3.80003e-06 [meta_fg_expand]: 3.09999e-06 [flash_sp_send_recv_attached]: 2.58e-06 [receive_attached]: 2.27001e-06 [after_resolve]: 1.034e-05 [a_after_grad]: 9.82999e-06 [renormalize]: 0.00062186 [add_forward_monad_depend]: 6.58e-06 [auto_monad_grad]: 2.63e-06 [auto_monad_eliminator]: 1.596e-05 [cse]: 3.068e-05 [a_3]: 4.795e-05 [Cycle 2]: 0.00067449, [45] [expand_dump_flag]: 1.27e-06 [switch_simplify]: 7.2e-06 [loop_unroll]: 5.75001e-06 [a_1]: 0.0001322 [with_stream_mark]: 1.339e-05 [recompute_prepare]: 6.88e-06 [updatestate_depend_eliminate]: 3.11001e-06 [updatestate_assign_eliminate]: 2.32001e-06 [updatestate_loads_eliminate]: 2.81e-06 [parameter_eliminate]: 1.42e-06 [a_2]: 8.03e-05 [accelerated_algorithm]: 6.12001e-06 [shard]: 1.44e-06 [meta_shard_fg_expand]: 1.57999e-06 [shard_inline]: 6.37001e-06 [merge_send_recv]: 5.70001e-06 [auto_parallel]: 6.64001e-06 [parallel]: 5.07999e-06 [flash_sp]: 3.66001e-06 [merge_comm]: 3.53999e-06 [allreduce_fusion]: 3.5e-06 [matmul_add_comm_reduction]: 6.54001e-06 [allreduce_slice_to_reducescatter]: 5.39992e-07 [virtual_shard_identity]: 6.86001e-06 [virtual_dataset]: 5.99e-06 [get_grad_eliminate_]: 5.71e-06 [virtual_output]: 5.31998e-06 [merge_forward]: 3.21999e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 7.90998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.139e-05 [merge_recompute_call_nodes]: 9.5999e-07 [before_grad]: 9.27001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.77998e-06 [meta_fg_expand]: 2.50002e-06 [flash_sp_send_recv_attached]: 8.90024e-07 [receive_attached]: 9.89996e-07 [after_resolve]: 9.42001e-06 [a_after_grad]: 8.17998e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 2.39999e-06 [auto_monad_grad]: 9.20001e-07 [auto_monad_eliminator]: 8.79e-06 [cse]: 1.693e-05 [a_3]: 3.576e-05 [py_interpret_to_execute_after_opt_a]: 1.119e-05 [slice_cell_reuse_recomputed_activation]: 2.37001e-06 [rewriter_after_opt_a]: 3.669e-05 [convert_after_rewriter]: 6.65002e-06 [order_py_execute_after_rewriter]: 5.25999e-06 [mutable_eliminate]: 0.00057105 [opt_b]: 0.00021022, [1] [Cycle 1]: 0.00020309, [7] [b_1]: 0.00011902 [b_2]: 8.65999e-06 [updatestate_depend_eliminate]: 7.3e-06 [updatestate_assign_eliminate]: 2.33002e-06 [updatestate_loads_eliminate]: 2.73e-06 [renormalize]: 6.50005e-07 [cse]: 2.456e-05 [optimize_parallel_all_gather_comm]: 1.74e-05 [overlap_param_gather]: 1.92999e-06 [cconv]: 2.64e-05 [loop_unroll]: 0.00046163 [opt_after_cconv]: 0.00010477, [1] [Cycle 1]: 9.883e-05, [7] [c_1]: 2.909e-05 [parameter_eliminate]: 3.48e-06 [updatestate_depend_eliminate]: 5.56e-06 [updatestate_assign_eliminate]: 2.52001e-06 [updatestate_loads_eliminate]: 2.51e-06 [cse]: 2.116e-05 [renormalize]: 5.00004e-07 [remove_dup_value]: 1.388e-05 [tuple_transform]: 7.161e-05, [1] [Cycle 1]: 6.73e-05, [4] [d_1]: 3.92e-05 [none_parameter_eliminate]: 1.89e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.61999e-06 [partial_unused_args_eliminate]: 2.06e-06 [add_recomputation]: 4.809e-05 [cse_after_recomputation]: 2.245e-05, [1] [Cycle 1]: 1.733e-05, [1] [cse]: 1.152e-05 [environ_conv]: 5.35999e-06 [swap_dp_allreduce_reducescatter]: 6.04001e-06 [bias_add_comm_swap]: 2.34999e-06 [label_micro_interleaved_index]: 4.37998e-06 [label_fine_grained_interleaved_index]: 2.58e-06 [merge_cast_opt]: 1.22e-06 [slice_recompute_activation]: 2.18998e-06 [micro_interleaved_order_control]: 2.46e-06 [assign_add_opt]: 1.50001e-06 [ForceFp32Comm]: 8.59989e-07 [remove_cast_before_assign_add]: 9.89996e-07 [full_micro_interleaved_order_control]: 2.15002e-06 [reorder_send_recv_between_fp_bp]: 2.81e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.01997e-06 [overlap_opt_shard_in_pipeline]: 1.07e-06 [overlap_opt_shard_grad_in_pipeline]: 2.15002e-06 [control_data_broadcast_order]: 1.358e-05 [grouped_pairwise_exchange_alltoall]: 1.74998e-06 [offloading_packed_experts]: 4.03001e-06 [overlap_recompute_and_grad_model_parallel]: 4.84e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.10999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.94999e-06 [overlap_grad_ring_attention]: 4.42e-06 [overlap_grad_flash_sp]: 2.108e-05 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 2.23002e-06 [split_layernorm_comm]: 1.84998e-06 [handle_group_info]: 1.43002e-06 [symbol_engine_optimizer]: 7.963e-05, [1] [Cycle 1]: 7.491e-05, [6] [build]: 3.30998e-06 [elim_shapecalc]: 1.035e-05 [elim_not_effective]: 1.286e-05 [opt_reshape]: 6.75002e-06 [fold_const_symbol]: 1.041e-05 [renormalize]: 3.00002e-07 [detach_backward]: 2.17001e-06 [pipeline_parallel_scheduler]: 1.66e-06 [auto_monad_reorder]: 1.845e-05 [get_jit_bprop_graph]: 2.02999e-06 [rewriter_after_jit_bprop_graph]: 4.84e-06 [opt_after_jit_grad]: 0.00054268 [validate]: 3.978e-05 Sums bootstrap : 0.000448s : 4.49% type_inference : 0.005140s : 51.59% event_method : 0.000014s : 0.14% auto_monad : 0.000059s : 0.59% graph_reusing : 0.000006s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000003s : 0.03% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000029s : 0.29% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000003s : 0.03% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000020s : 0.20% optimize.rewriter_before_opt_a : 0.000049s : 0.49% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000034s : 0.34% optimize.opt_a.loop_unroll : 0.000020s : 0.20% optimize.opt_a.a_1 : 0.000478s : 4.80% optimize.opt_a.with_stream_mark : 0.000032s : 0.32% optimize.opt_a.recompute_prepare : 0.000016s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000172s : 1.73% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.14% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.24% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000035s : 0.35% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.16% optimize.opt_a.virtual_dataset : 0.000013s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.04% optimize.opt_a.offload_activation : 0.000019s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000020s : 0.20% optimize.opt_a.a_after_grad : 0.000018s : 0.18% optimize.opt_a.renormalize : 0.000622s : 6.24% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.09% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.25% optimize.opt_a.cse : 0.000048s : 0.48% optimize.opt_a.a_3 : 0.000084s : 0.84% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000037s : 0.37% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000571s : 5.73% optimize.opt_b.b_1 : 0.000119s : 1.19% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000025s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000026s : 0.26% optimize.loop_unroll : 0.000462s : 4.63% optimize.opt_after_cconv.c_1 : 0.000029s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000021s : 0.21% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000014s : 0.14% optimize.tuple_transform.d_1 : 0.000039s : 0.39% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000048s : 0.48% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.02% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.03% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000543s : 5.45% validate : 0.000040s : 0.40% Time group info: ------[substitution.] 0.000193 23 37.79% : 0.000073s : 4: substitution.arithmetic_simplify 0.99% : 0.000002s : 2: substitution.elim_not_effective 0.71% : 0.000001s : 2: substitution.fold_const_symbol 2.94% : 0.000006s : 3: substitution.graph_param_transform 51.05% : 0.000099s : 2: substitution.inline 2.02% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.71% : 0.000005s : 4: substitution.remove_not_recompute_node 1.79% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.005087 2 91.03% : 0.004631s : 1: type_inference.infer 8.97% : 0.000456s : 1: type_inference.specialize ------[replace.] 0.000024 2 100.00% : 0.000024s : 2: replace.inline ------[match.] 0.000097 2 100.00% : 0.000097s : 2: match.inline ------[predicate.] 0.000140 754 0.89% : 0.000001s : 7: predicate.accumulaten_eliminater 1.33% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.60% : 0.000001s : 6: predicate.addn_check_dump 0.89% : 0.000001s : 7: predicate.addn_zero_filter 0.78% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.69% : 0.000004s : 13: predicate.arithmetic_simplify 0.80% : 0.000001s : 7: predicate.cast_eliminate 0.75% : 0.000001s : 6: predicate.check_bprop_eliminate 0.67% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.73% : 0.000001s : 6: predicate.depend_value_elim 0.73% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.82% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.99% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.13% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 3: predicate.elim_not_effective 0.51% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.95% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.93% : 0.000001s : 10: predicate.environ_get_depend_swap 1.65% : 0.000002s : 16: predicate.environ_get_eliminate 0.93% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.94% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.82% : 0.000003s : 9: predicate.float_depend_g_call 0.61% : 0.000001s : 6: predicate.float_environ_get_switch 0.92% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 3: predicate.fold_const_symbol 0.79% : 0.000001s : 6: predicate.get_grad_eliminate 0.34% : 0.000000s : 3: predicate.graph_param_transform 0.78% : 0.000001s : 6: predicate.incorporate_call 0.60% : 0.000001s : 6: predicate.incorporate_call_switch 6.87% : 0.000010s : 34: predicate.inline 1.08% : 0.000002s : 6: predicate.inline_without_move 0.59% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.05% : 0.000001s : 6: predicate.less_batch_normalization 1.51% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.99% : 0.000003s : 20: predicate.load_eliminater 1.51% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.63% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.53% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.68% : 0.000001s : 6: predicate.merge_addn 0.73% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.73% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.70% : 0.000001s : 7: predicate.minmaximum_grad 1.93% : 0.000003s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.48% : 0.000001s : 3: predicate.parallel_virtual_node 1.46% : 0.000002s : 9: predicate.partial_defer_inline 1.18% : 0.000002s : 10: predicate.partial_eliminate 0.78% : 0.000001s : 7: predicate.print_const_string_wrapper 0.70% : 0.000001s : 6: predicate.reduce_all_const_elim 1.04% : 0.000001s : 7: predicate.reduce_eliminate 2.05% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.60% : 0.000001s : 6: predicate.remove_not_recompute_node 1.13% : 0.000002s : 13: predicate.replace_applicator 0.66% : 0.000001s : 6: predicate.replace_old_param 0.33% : 0.000000s : 3: predicate.reset_defer_inline 1.01% : 0.000001s : 7: predicate.reshape_eliminate 0.73% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.81% : 0.000001s : 3: predicate.row_tensor_eliminate 1.28% : 0.000002s : 6: predicate.same_eliminate 0.55% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.87% : 0.000001s : 6: predicate.shard_identity_eliminate 0.92% : 0.000001s : 6: predicate.special_op_eliminate 1.34% : 0.000002s : 6: predicate.specialize_transform 1.15% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.96% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.98% : 0.000001s : 9: predicate.switch_defer_inline 1.73% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.41% : 0.000006s : 32: predicate.switch_simplify 0.80% : 0.000001s : 7: predicate.tile_eliminate 0.89% : 0.000001s : 7: predicate.transpose_eliminate 1.52% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.40% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.23% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.07% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.33% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.66% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.95% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.72% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.52% : 0.000001s : 3: predicate.value_based_eliminate 0.82% : 0.000001s : 6: predicate.virtual_dataset_eliminate 1.02% : 0.000001s : 6: predicate.virtual_output_eliminate 0.31% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.73% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000241 5 7.51% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.49% : 0.000223s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024053 192 0.01% : 0.000003s : 1: ForceFp32Comm 14.53% : 0.003495s : 1: add_attr 14.48% : 0.003484s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000052s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000064s : 1: auto_monad 0.09% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.98% : 0.000477s : 1: bootstrap 0.12% : 0.000030s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000017s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000026s : 1: cse_after_recomputation 0.02% : 0.000006s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.09% : 0.000021s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.96% : 0.000471s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.42% : 0.000581s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 3.61% : 0.000869s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.11% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000095s : 28: opt.transform.opt_b 0.18% : 0.000044s : 2: opt.transform.opt_trans_graph 0.15% : 0.000036s : 4: opt.transform.symbol_engine_opt 9.99% : 0.002403s : 1: opt_a 0.45% : 0.000108s : 1: opt_after_cconv 2.30% : 0.000552s : 1: opt_after_jit_grad 0.89% : 0.000214s : 1: opt_b 18.68% : 0.004492s : 1: optimize 0.09% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000033s : 1: pre_auto_parallel 0.10% : 0.000024s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000003s : 1: remove_cast_before_assign_add 0.07% : 0.000018s : 1: remove_dup_value 1.43% : 0.000345s : 1: renormalize.infer 1.11% : 0.000267s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000042s : 1: rewriter_after_opt_a 0.22% : 0.000052s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000082s : 1: symbol_engine_optimizer 0.31% : 0.000075s : 1: tuple_transform 21.46% : 0.005162s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:49.296.856 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:49.297.143 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0148272, [21] [bootstrap]: 0.00044634 [type_inference]: 0.00486774 [event_method]: 1.293e-05 [auto_monad]: 5.524e-05 [graph_reusing]: 5.79e-06 [inline]: 2.21e-06 [add_attr]: 0.00319286, [1] [add_attr_with_inline]: 0.00318359, [1] [Cycle 1]: 7.412e-05, [2] [tag_attr]: 1.509e-05 [meta_addattr_fg_expand]: 3.91999e-06 [parallel-infer-symbol]: 3.29001e-06 [pre_auto_parallel]: 2.867e-05 [insert-virtual-dataset]: 2.54001e-06 [parallel-infer-symbol-second]: 6.79982e-07 [dataset_repeat_opt]: 2.03997e-06 [pipeline_split]: 1.69e-06 [optimize]: 0.00503998, [53] [py_interpret_to_execute]: 2.298e-05 [rewriter_before_opt_a]: 4.979e-05 [opt_a]: 0.00269514, [2] [Cycle 1]: 0.0018151, [45] [expand_dump_flag]: 2.98e-06 [switch_simplify]: 2.473e-05 [loop_unroll]: 1.405e-05 [a_1]: 0.00033889 [with_stream_mark]: 1.935e-05 [recompute_prepare]: 9.85002e-06 [updatestate_depend_eliminate]: 4.11001e-06 [updatestate_assign_eliminate]: 3.55e-06 [updatestate_loads_eliminate]: 3.14001e-06 [parameter_eliminate]: 2.02001e-06 [a_2]: 0.00012901 [accelerated_algorithm]: 7.91001e-06 [shard]: 2.47001e-06 [meta_shard_fg_expand]: 1.92999e-06 [shard_inline]: 6.32001e-06 [merge_send_recv]: 9.19998e-06 [auto_parallel]: 6.78e-06 [parallel]: 1.909e-05 [flash_sp]: 8.79e-06 [merge_comm]: 3.92998e-06 [allreduce_fusion]: 3.4e-06 [matmul_add_comm_reduction]: 9.42001e-06 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 8.43001e-06 [virtual_dataset]: 6.14001e-06 [get_grad_eliminate_]: 6.46e-06 [virtual_output]: 6.42001e-06 [merge_forward]: 3.91999e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 1.12e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.432e-05 [merge_recompute_call_nodes]: 1.48002e-06 [before_grad]: 1.117e-05 [set_forward_comm_id_for_comm_node_pass]: 4.27998e-06 [meta_fg_expand]: 2.56998e-06 [flash_sp_send_recv_attached]: 2.51e-06 [receive_attached]: 2.39001e-06 [after_resolve]: 1.08e-05 [a_after_grad]: 9.31002e-06 [renormalize]: 0.0005656 [add_forward_monad_depend]: 5.52999e-06 [auto_monad_grad]: 2.66e-06 [auto_monad_eliminator]: 1.559e-05 [cse]: 2.863e-05 [a_3]: 6.207e-05 [Cycle 2]: 0.00086711, [45] [expand_dump_flag]: 1.09e-06 [switch_simplify]: 7.36999e-06 [loop_unroll]: 5.87001e-06 [a_1]: 0.00013211 [with_stream_mark]: 1.226e-05 [recompute_prepare]: 7.26001e-06 [updatestate_depend_eliminate]: 3.21001e-06 [updatestate_assign_eliminate]: 2.86e-06 [updatestate_loads_eliminate]: 2.71e-06 [parameter_eliminate]: 1.30999e-06 [a_2]: 0.00011384 [accelerated_algorithm]: 6.77002e-06 [shard]: 1.50999e-06 [meta_shard_fg_expand]: 1.44e-06 [shard_inline]: 6.66e-06 [merge_send_recv]: 6.35002e-06 [auto_parallel]: 6.24001e-06 [parallel]: 5.52001e-06 [flash_sp]: 3.98999e-06 [merge_comm]: 3.47002e-06 [allreduce_fusion]: 3.35e-06 [matmul_add_comm_reduction]: 6.80998e-06 [allreduce_slice_to_reducescatter]: 5.49975e-07 [virtual_shard_identity]: 7.2e-06 [virtual_dataset]: 5.67001e-06 [get_grad_eliminate_]: 5.86e-06 [virtual_output]: 5.40001e-06 [merge_forward]: 3.25002e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 8.12e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.391e-05 [merge_recompute_call_nodes]: 9.70002e-07 [before_grad]: 9.68002e-06 [set_forward_comm_id_for_comm_node_pass]: 5.07e-06 [meta_fg_expand]: 2.44999e-06 [flash_sp_send_recv_attached]: 1.20999e-06 [receive_attached]: 1.54e-06 [after_resolve]: 9.51e-06 [a_after_grad]: 8.97e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.44001e-06 [auto_monad_grad]: 1.32e-06 [auto_monad_eliminator]: 8.64003e-06 [cse]: 1.833e-05 [a_3]: 4.964e-05 [py_interpret_to_execute_after_opt_a]: 1.317e-05 [slice_cell_reuse_recomputed_activation]: 5.05999e-06 [rewriter_after_opt_a]: 4.198e-05 [convert_after_rewriter]: 9.22999e-06 [order_py_execute_after_rewriter]: 8.65999e-06 [mutable_eliminate]: 0.00056009 [opt_b]: 0.00027819, [1] [Cycle 1]: 0.00026822, [7] [b_1]: 0.00016678 [b_2]: 8.33001e-06 [updatestate_depend_eliminate]: 7.1e-06 [updatestate_assign_eliminate]: 2.71e-06 [updatestate_loads_eliminate]: 2.76e-06 [renormalize]: 6.19999e-07 [cse]: 2.309e-05 [optimize_parallel_all_gather_comm]: 2.102e-05 [overlap_param_gather]: 4.95001e-06 [cconv]: 3.065e-05 [loop_unroll]: 0.0004807 [opt_after_cconv]: 0.00012766, [1] [Cycle 1]: 0.0001194, [7] [c_1]: 2.827e-05 [parameter_eliminate]: 2.91e-06 [updatestate_depend_eliminate]: 6.21998e-06 [updatestate_assign_eliminate]: 2.41e-06 [updatestate_loads_eliminate]: 2.77002e-06 [cse]: 2.07e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 1.729e-05 [tuple_transform]: 8.786e-05, [1] [Cycle 1]: 8.101e-05, [4] [d_1]: 4.025e-05 [none_parameter_eliminate]: 1.92001e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 6.62002e-06 [partial_unused_args_eliminate]: 4.55999e-06 [add_recomputation]: 5.131e-05 [cse_after_recomputation]: 2.809e-05, [1] [Cycle 1]: 2.089e-05, [1] [cse]: 1.189e-05 [environ_conv]: 8.25e-06 [swap_dp_allreduce_reducescatter]: 7.88001e-06 [bias_add_comm_swap]: 5.44998e-06 [label_micro_interleaved_index]: 7.32002e-06 [label_fine_grained_interleaved_index]: 5.81e-06 [merge_cast_opt]: 3.78999e-06 [slice_recompute_activation]: 4.51002e-06 [micro_interleaved_order_control]: 4.55001e-06 [assign_add_opt]: 3.75e-06 [ForceFp32Comm]: 3.23e-06 [remove_cast_before_assign_add]: 3.49001e-06 [full_micro_interleaved_order_control]: 4.69998e-06 [reorder_send_recv_between_fp_bp]: 5.12999e-06 [comm_op_add_attrs]: 4.11001e-06 [add_comm_op_reuse_tag]: 3.25e-06 [interleave_split_concat_branches]: 3.55e-06 [interleave_parallel_branches]: 3.48999e-06 [overlap_opt_shard_in_pipeline]: 3.51001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.25e-06 [control_data_broadcast_order]: 1.66e-05 [grouped_pairwise_exchange_alltoall]: 4.03999e-06 [offloading_packed_experts]: 6.83998e-06 [overlap_recompute_and_grad_model_parallel]: 7.05e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.76999e-06 [overlap_recompute_allgather_and_fa_grad]: 4.12e-06 [overlap_recompute_comm]: 5.34998e-06 [overlap_grad_ring_attention]: 7.55e-06 [overlap_grad_flash_sp]: 2.388e-05 [begin_end_overlap_inline]: 3.04001e-06 [split_matmul_comm_elemetwise]: 4.61002e-06 [split_layernorm_comm]: 4.18999e-06 [handle_group_info]: 3.99002e-06 [symbol_engine_optimizer]: 0.00010015, [1] [Cycle 1]: 9.264e-05, [6] [build]: 3.07002e-06 [elim_shapecalc]: 1.089e-05 [elim_not_effective]: 1.319e-05 [opt_reshape]: 7.09001e-06 [fold_const_symbol]: 1.02e-05 [renormalize]: 2.50002e-07 [detach_backward]: 3.63e-06 [pipeline_parallel_scheduler]: 2.17001e-06 [auto_monad_reorder]: 1.834e-05 [get_jit_bprop_graph]: 1.60001e-06 [rewriter_after_jit_bprop_graph]: 4.68999e-06 [opt_after_jit_grad]: 0.00049721 [validate]: 3.868e-05 Sums bootstrap : 0.000446s : 4.54% type_inference : 0.004868s : 49.52% event_method : 0.000013s : 0.13% auto_monad : 0.000055s : 0.56% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000029s : 0.29% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000023s : 0.23% optimize.rewriter_before_opt_a : 0.000050s : 0.51% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000032s : 0.33% optimize.opt_a.loop_unroll : 0.000020s : 0.20% optimize.opt_a.a_1 : 0.000471s : 4.79% optimize.opt_a.with_stream_mark : 0.000032s : 0.32% optimize.opt_a.recompute_prepare : 0.000017s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000243s : 2.47% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.15% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000016s : 0.16% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000025s : 0.25% optimize.opt_a.flash_sp : 0.000013s : 0.13% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.13% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.20% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.10% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000020s : 0.21% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000566s : 5.76% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.25% optimize.opt_a.cse : 0.000047s : 0.48% optimize.opt_a.a_3 : 0.000112s : 1.14% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000042s : 0.43% optimize.convert_after_rewriter : 0.000009s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000560s : 5.70% optimize.opt_b.b_1 : 0.000167s : 1.70% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000031s : 0.31% optimize.loop_unroll : 0.000481s : 4.89% optimize.opt_after_cconv.c_1 : 0.000028s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000021s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.18% optimize.tuple_transform.d_1 : 0.000040s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000051s : 0.52% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000008s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.06% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.06% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000008s : 0.08% optimize.overlap_grad_flash_sp : 0.000024s : 0.24% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000497s : 5.06% validate : 0.000039s : 0.39% Time group info: ------[substitution.] 0.000194 23 40.88% : 0.000079s : 4: substitution.arithmetic_simplify 1.03% : 0.000002s : 2: substitution.elim_not_effective 0.69% : 0.000001s : 2: substitution.fold_const_symbol 2.84% : 0.000006s : 3: substitution.graph_param_transform 48.43% : 0.000094s : 2: substitution.inline 2.18% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.23% : 0.000004s : 4: substitution.remove_not_recompute_node 1.72% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004817 2 90.77% : 0.004373s : 1: type_inference.infer 9.23% : 0.000445s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000092 2 100.00% : 0.000092s : 2: match.inline ------[predicate.] 0.000138 754 0.81% : 0.000001s : 7: predicate.accumulaten_eliminater 0.93% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.65% : 0.000001s : 6: predicate.addn_check_dump 0.91% : 0.000001s : 7: predicate.addn_zero_filter 0.67% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 3.11% : 0.000004s : 13: predicate.arithmetic_simplify 0.83% : 0.000001s : 7: predicate.cast_eliminate 0.88% : 0.000001s : 6: predicate.check_bprop_eliminate 0.65% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.82% : 0.000001s : 6: predicate.depend_value_elim 0.78% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.89% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.77% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.50% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 3: predicate.elim_not_effective 0.50% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.01% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.12% : 0.000002s : 10: predicate.environ_get_add_eliminate 1.07% : 0.000001s : 10: predicate.environ_get_depend_swap 1.88% : 0.000003s : 16: predicate.environ_get_eliminate 1.10% : 0.000002s : 10: predicate.environ_get_set_eliminate 0.89% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.92% : 0.000003s : 9: predicate.float_depend_g_call 0.68% : 0.000001s : 6: predicate.float_environ_get_switch 1.00% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.86% : 0.000001s : 6: predicate.get_grad_eliminate 0.23% : 0.000000s : 3: predicate.graph_param_transform 0.84% : 0.000001s : 6: predicate.incorporate_call 0.65% : 0.000001s : 6: predicate.incorporate_call_switch 6.64% : 0.000009s : 34: predicate.inline 0.99% : 0.000001s : 6: predicate.inline_without_move 0.40% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.04% : 0.000001s : 6: predicate.less_batch_normalization 1.64% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.11% : 0.000003s : 20: predicate.load_eliminater 1.49% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.76% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.62% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.65% : 0.000001s : 6: predicate.merge_addn 0.75% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.69% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.66% : 0.000001s : 7: predicate.minmaximum_grad 1.48% : 0.000002s : 3: predicate.mutable_eliminate 0.44% : 0.000001s : 3: predicate.opt_reshape 0.42% : 0.000001s : 3: predicate.parallel_virtual_node 1.38% : 0.000002s : 9: predicate.partial_defer_inline 1.23% : 0.000002s : 10: predicate.partial_eliminate 0.70% : 0.000001s : 7: predicate.print_const_string_wrapper 0.82% : 0.000001s : 6: predicate.reduce_all_const_elim 0.93% : 0.000001s : 7: predicate.reduce_eliminate 2.11% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.61% : 0.000001s : 6: predicate.remove_not_recompute_node 1.16% : 0.000002s : 13: predicate.replace_applicator 0.62% : 0.000001s : 6: predicate.replace_old_param 0.34% : 0.000000s : 3: predicate.reset_defer_inline 1.00% : 0.000001s : 7: predicate.reshape_eliminate 0.98% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.70% : 0.000001s : 3: predicate.row_tensor_eliminate 1.22% : 0.000002s : 6: predicate.same_eliminate 0.57% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.98% : 0.000001s : 6: predicate.shard_identity_eliminate 0.89% : 0.000001s : 6: predicate.special_op_eliminate 0.94% : 0.000001s : 6: predicate.specialize_transform 1.37% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.36% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.02% : 0.000001s : 9: predicate.switch_defer_inline 1.63% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.16% : 0.000006s : 32: predicate.switch_simplify 0.79% : 0.000001s : 7: predicate.tile_eliminate 0.77% : 0.000001s : 7: predicate.transpose_eliminate 1.42% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.34% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.04% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.42% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.62% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.49% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.96% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.84% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 3: predicate.value_based_eliminate 0.76% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.80% : 0.000001s : 6: predicate.virtual_output_eliminate 0.32% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000226 5 8.18% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.82% : 0.000208s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024632 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.00% : 0.003202s : 1: add_attr 12.94% : 0.003187s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000055s : 1: add_recomputation 0.03% : 0.000007s : 1: assign_add_opt 0.26% : 0.000063s : 1: auto_monad 0.11% : 0.000026s : 1: auto_monad_reorder 0.03% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.99% : 0.000490s : 1: bootstrap 0.14% : 0.000034s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000012s : 1: convert_after_rewriter 0.13% : 0.000031s : 1: cse_after_recomputation 0.04% : 0.000009s : 1: dataset_repeat_opt 0.08% : 0.000020s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.10% : 0.000024s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000007s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.04% : 0.000009s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.98% : 0.000487s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.30% : 0.000567s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 3.56% : 0.000877s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.10% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000098s : 28: opt.transform.opt_b 0.18% : 0.000045s : 2: opt.transform.opt_trans_graph 0.15% : 0.000037s : 4: opt.transform.symbol_engine_opt 10.95% : 0.002698s : 1: opt_a 0.53% : 0.000131s : 1: opt_after_cconv 2.06% : 0.000509s : 1: opt_after_jit_grad 1.14% : 0.000282s : 1: opt_b 21.87% : 0.005386s : 1: optimize 0.10% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000028s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000011s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.07% : 0.000017s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000036s : 1: pre_auto_parallel 0.11% : 0.000026s : 1: py_interpret_to_execute 0.07% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000021s : 1: remove_dup_value 1.29% : 0.000317s : 1: renormalize.infer 0.98% : 0.000240s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000045s : 1: rewriter_after_opt_a 0.22% : 0.000053s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000103s : 1: symbol_engine_optimizer 0.37% : 0.000091s : 1: tuple_transform 19.89% : 0.004900s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:49.497.424 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0133245, [21] [bootstrap]: 0.00043911 [type_inference]: 0.00465763 [event_method]: 1.273e-05 [auto_monad]: 5.354e-05 [graph_reusing]: 5.79999e-06 [inline]: 2.47001e-06 [add_attr]: 0.00320921, [1] [add_attr_with_inline]: 0.00320006, [1] [Cycle 1]: 5.285e-05, [2] [tag_attr]: 1.375e-05 [meta_addattr_fg_expand]: 4.80001e-06 [parallel-infer-symbol]: 3.31001e-06 [pre_auto_parallel]: 2.539e-05 [insert-virtual-dataset]: 2.35002e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.96e-06 [pipeline_split]: 1.78002e-06 [optimize]: 0.00421465, [53] [py_interpret_to_execute]: 1.831e-05 [rewriter_before_opt_a]: 4.958e-05 [opt_a]: 0.00226741, [2] [Cycle 1]: 0.00158389, [45] [expand_dump_flag]: 2.88e-06 [switch_simplify]: 2.527e-05 [loop_unroll]: 1.432e-05 [a_1]: 0.00034724 [with_stream_mark]: 1.75e-05 [recompute_prepare]: 1.053e-05 [updatestate_depend_eliminate]: 4.26001e-06 [updatestate_assign_eliminate]: 3.16001e-06 [updatestate_loads_eliminate]: 3.48e-06 [parameter_eliminate]: 1.94e-06 [a_2]: 0.00010528 [accelerated_algorithm]: 8.25999e-06 [shard]: 2.48e-06 [meta_shard_fg_expand]: 1.82001e-06 [shard_inline]: 7.1e-06 [merge_send_recv]: 8.38999e-06 [auto_parallel]: 6.39999e-06 [parallel]: 1.883e-05 [flash_sp]: 8.59e-06 [merge_comm]: 3.86999e-06 [allreduce_fusion]: 3.3e-06 [matmul_add_comm_reduction]: 9.77999e-06 [allreduce_slice_to_reducescatter]: 7.10017e-07 [virtual_shard_identity]: 8.22e-06 [virtual_dataset]: 6.17001e-06 [get_grad_eliminate_]: 6.48e-06 [virtual_output]: 5.97001e-06 [merge_forward]: 3.81001e-06 [cell_reuse_recompute_pass]: 1.58002e-06 [offload_activation]: 9.32001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.438e-05 [merge_recompute_call_nodes]: 1.39998e-06 [before_grad]: 1.05e-05 [set_forward_comm_id_for_comm_node_pass]: 3.66001e-06 [meta_fg_expand]: 2.58e-06 [flash_sp_send_recv_attached]: 2.50997e-06 [receive_attached]: 2.39001e-06 [after_resolve]: 9.69999e-06 [a_after_grad]: 9.08002e-06 [renormalize]: 0.00050691 [add_forward_monad_depend]: 5.39e-06 [auto_monad_grad]: 2.23998e-06 [auto_monad_eliminator]: 1.604e-05 [cse]: 3.089e-05 [a_3]: 4.915e-05 [Cycle 2]: 0.00067356, [45] [expand_dump_flag]: 1.02e-06 [switch_simplify]: 7.17002e-06 [loop_unroll]: 5.61e-06 [a_1]: 0.00013172 [with_stream_mark]: 1.217e-05 [recompute_prepare]: 7.21001e-06 [updatestate_depend_eliminate]: 3.54002e-06 [updatestate_assign_eliminate]: 2.32001e-06 [updatestate_loads_eliminate]: 3.06001e-06 [parameter_eliminate]: 1.09e-06 [a_2]: 8.272e-05 [accelerated_algorithm]: 6.35002e-06 [shard]: 1.35999e-06 [meta_shard_fg_expand]: 1.67999e-06 [shard_inline]: 6.15997e-06 [merge_send_recv]: 5.37999e-06 [auto_parallel]: 6.01e-06 [parallel]: 5.56998e-06 [flash_sp]: 3.68999e-06 [merge_comm]: 3.21999e-06 [allreduce_fusion]: 3.77002e-06 [matmul_add_comm_reduction]: 6.46999e-06 [allreduce_slice_to_reducescatter]: 6.90023e-07 [virtual_shard_identity]: 7.61001e-06 [virtual_dataset]: 5.77999e-06 [get_grad_eliminate_]: 5.44e-06 [virtual_output]: 5.39e-06 [merge_forward]: 2.79999e-06 [cell_reuse_recompute_pass]: 1.87001e-06 [offload_activation]: 7.33e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.099e-05 [merge_recompute_call_nodes]: 9.60019e-07 [before_grad]: 9.19998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.81001e-06 [meta_fg_expand]: 2.27999e-06 [flash_sp_send_recv_attached]: 9.89996e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 9.02999e-06 [a_after_grad]: 7.92e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.81e-06 [auto_monad_grad]: 1.35001e-06 [auto_monad_eliminator]: 9.11998e-06 [cse]: 1.687e-05 [a_3]: 3.467e-05 [py_interpret_to_execute_after_opt_a]: 9.76e-06 [slice_cell_reuse_recomputed_activation]: 2.05002e-06 [rewriter_after_opt_a]: 3.56e-05 [convert_after_rewriter]: 7.31001e-06 [order_py_execute_after_rewriter]: 5.25999e-06 [mutable_eliminate]: 0.00048777 [opt_b]: 0.00020396, [1] [Cycle 1]: 0.00019761, [7] [b_1]: 0.00011906 [b_2]: 7.43e-06 [updatestate_depend_eliminate]: 6.66999e-06 [updatestate_assign_eliminate]: 2.45997e-06 [updatestate_loads_eliminate]: 2.30002e-06 [renormalize]: 5.50004e-07 [cse]: 2.145e-05 [optimize_parallel_all_gather_comm]: 1.736e-05 [overlap_param_gather]: 1.89e-06 [cconv]: 2.525e-05 [loop_unroll]: 0.00043482 [opt_after_cconv]: 0.00010135, [1] [Cycle 1]: 9.574e-05, [7] [c_1]: 2.813e-05 [parameter_eliminate]: 2.88003e-06 [updatestate_depend_eliminate]: 5.73002e-06 [updatestate_assign_eliminate]: 2.29001e-06 [updatestate_loads_eliminate]: 2.24999e-06 [cse]: 2.04e-05 [renormalize]: 4.60015e-07 [remove_dup_value]: 1.438e-05 [tuple_transform]: 6.968e-05, [1] [Cycle 1]: 6.517e-05, [4] [d_1]: 3.812e-05 [none_parameter_eliminate]: 1.60001e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 6.49001e-06 [partial_unused_args_eliminate]: 2.53e-06 [add_recomputation]: 4.679e-05 [cse_after_recomputation]: 2.193e-05, [1] [Cycle 1]: 1.749e-05, [1] [cse]: 1.222e-05 [environ_conv]: 5.48002e-06 [swap_dp_allreduce_reducescatter]: 4.84998e-06 [bias_add_comm_swap]: 2.19999e-06 [label_micro_interleaved_index]: 3.88999e-06 [label_fine_grained_interleaved_index]: 2.93e-06 [merge_cast_opt]: 1.32e-06 [slice_recompute_activation]: 2.16e-06 [micro_interleaved_order_control]: 2.31e-06 [assign_add_opt]: 1.15999e-06 [ForceFp32Comm]: 8.00006e-07 [remove_cast_before_assign_add]: 9.5999e-07 [full_micro_interleaved_order_control]: 2.08002e-06 [reorder_send_recv_between_fp_bp]: 2.71e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.60019e-07 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.55001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.83997e-06 [control_data_broadcast_order]: 1.27e-05 [grouped_pairwise_exchange_alltoall]: 1.79e-06 [offloading_packed_experts]: 3.91001e-06 [overlap_recompute_and_grad_model_parallel]: 4.95001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.60999e-06 [overlap_recompute_comm]: 2.04999e-06 [overlap_grad_ring_attention]: 4.08999e-06 [overlap_grad_flash_sp]: 1.873e-05 [begin_end_overlap_inline]: 4.59986e-07 [split_matmul_comm_elemetwise]: 2.31998e-06 [split_layernorm_comm]: 1.77001e-06 [handle_group_info]: 1.20001e-06 [symbol_engine_optimizer]: 7.591e-05, [1] [Cycle 1]: 7.174e-05, [6] [build]: 2.91999e-06 [elim_shapecalc]: 1.041e-05 [elim_not_effective]: 1.228e-05 [opt_reshape]: 6.39999e-06 [fold_const_symbol]: 9.77999e-06 [renormalize]: 2.10013e-07 [detach_backward]: 2.37001e-06 [pipeline_parallel_scheduler]: 1.61002e-06 [auto_monad_reorder]: 1.642e-05 [get_jit_bprop_graph]: 1.69e-06 [rewriter_after_jit_bprop_graph]: 4.57998e-06 [opt_after_jit_grad]: 0.00047715 [validate]: 3.919e-05 Sums bootstrap : 0.000439s : 4.81% type_inference : 0.004658s : 51.04% event_method : 0.000013s : 0.14% auto_monad : 0.000054s : 0.59% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000005s : 0.05% parallel-infer-symbol : 0.000003s : 0.04% pre_auto_parallel : 0.000025s : 0.28% insert-virtual-dataset : 0.000002s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000018s : 0.20% optimize.rewriter_before_opt_a : 0.000050s : 0.54% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000032s : 0.36% optimize.opt_a.loop_unroll : 0.000020s : 0.22% optimize.opt_a.a_1 : 0.000479s : 5.25% optimize.opt_a.with_stream_mark : 0.000030s : 0.33% optimize.opt_a.recompute_prepare : 0.000018s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000188s : 2.06% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.16% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.15% optimize.opt_a.merge_send_recv : 0.000014s : 0.15% optimize.opt_a.auto_parallel : 0.000012s : 0.14% optimize.opt_a.parallel : 0.000024s : 0.27% optimize.opt_a.flash_sp : 0.000012s : 0.13% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.02% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.17% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.13% optimize.opt_a.virtual_output : 0.000011s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.04% optimize.opt_a.offload_activation : 0.000017s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.04% optimize.opt_a.after_resolve : 0.000019s : 0.21% optimize.opt_a.a_after_grad : 0.000017s : 0.19% optimize.opt_a.renormalize : 0.000507s : 5.56% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.28% optimize.opt_a.cse : 0.000048s : 0.52% optimize.opt_a.a_3 : 0.000084s : 0.92% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000036s : 0.39% optimize.convert_after_rewriter : 0.000007s : 0.08% optimize.order_py_execute_after_rewriter : 0.000005s : 0.06% optimize.mutable_eliminate : 0.000488s : 5.35% optimize.opt_b.b_1 : 0.000119s : 1.30% optimize.opt_b.b_2 : 0.000007s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000021s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.19% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000025s : 0.28% optimize.loop_unroll : 0.000435s : 4.77% optimize.opt_after_cconv.c_1 : 0.000028s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000020s : 0.22% optimize.opt_after_cconv.renormalize : 0.000000s : 0.01% optimize.remove_dup_value : 0.000014s : 0.16% optimize.tuple_transform.d_1 : 0.000038s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.07% optimize.partial_unused_args_eliminate : 0.000003s : 0.03% optimize.add_recomputation : 0.000047s : 0.51% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000005s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.03% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.02% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000019s : 0.21% optimize.begin_end_overlap_inline : 0.000000s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.03% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000016s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000477s : 5.23% validate : 0.000039s : 0.43% Time group info: ------[substitution.] 0.000188 23 40.66% : 0.000076s : 4: substitution.arithmetic_simplify 1.11% : 0.000002s : 2: substitution.elim_not_effective 0.88% : 0.000002s : 2: substitution.fold_const_symbol 2.80% : 0.000005s : 3: substitution.graph_param_transform 48.38% : 0.000091s : 2: substitution.inline 1.79% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.60% : 0.000005s : 4: substitution.remove_not_recompute_node 1.79% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004611 2 91.17% : 0.004203s : 1: type_inference.infer 8.83% : 0.000407s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000089 2 100.00% : 0.000089s : 2: match.inline ------[predicate.] 0.000136 754 0.81% : 0.000001s : 7: predicate.accumulaten_eliminater 1.03% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.74% : 0.000001s : 6: predicate.addn_check_dump 0.88% : 0.000001s : 7: predicate.addn_zero_filter 0.74% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 3.16% : 0.000004s : 13: predicate.arithmetic_simplify 0.76% : 0.000001s : 7: predicate.cast_eliminate 0.92% : 0.000001s : 6: predicate.check_bprop_eliminate 0.65% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.79% : 0.000001s : 6: predicate.depend_value_elim 0.81% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.90% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.81% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.34% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 3: predicate.elim_not_effective 0.57% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000002s : 10: predicate.environ_add_const_eliminate 1.04% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_depend_swap 1.82% : 0.000002s : 16: predicate.environ_get_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.92% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.86% : 0.000003s : 9: predicate.float_depend_g_call 0.70% : 0.000001s : 6: predicate.float_environ_get_switch 1.04% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.76% : 0.000001s : 6: predicate.get_grad_eliminate 0.23% : 0.000000s : 3: predicate.graph_param_transform 0.85% : 0.000001s : 6: predicate.incorporate_call 0.69% : 0.000001s : 6: predicate.incorporate_call_switch 6.61% : 0.000009s : 34: predicate.inline 0.92% : 0.000001s : 6: predicate.inline_without_move 0.39% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.99% : 0.000001s : 6: predicate.less_batch_normalization 1.62% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.03% : 0.000003s : 20: predicate.load_eliminater 1.31% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.76% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.79% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.76% : 0.000001s : 6: predicate.merge_addn 0.62% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.72% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.73% : 0.000001s : 7: predicate.minmaximum_grad 1.59% : 0.000002s : 3: predicate.mutable_eliminate 0.41% : 0.000001s : 3: predicate.opt_reshape 0.43% : 0.000001s : 3: predicate.parallel_virtual_node 1.37% : 0.000002s : 9: predicate.partial_defer_inline 1.28% : 0.000002s : 10: predicate.partial_eliminate 0.76% : 0.000001s : 7: predicate.print_const_string_wrapper 0.83% : 0.000001s : 6: predicate.reduce_all_const_elim 0.98% : 0.000001s : 7: predicate.reduce_eliminate 2.06% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.63% : 0.000001s : 6: predicate.remove_not_recompute_node 1.20% : 0.000002s : 13: predicate.replace_applicator 0.62% : 0.000001s : 6: predicate.replace_old_param 0.36% : 0.000000s : 3: predicate.reset_defer_inline 0.93% : 0.000001s : 7: predicate.reshape_eliminate 0.70% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.55% : 0.000001s : 3: predicate.row_tensor_eliminate 1.18% : 0.000002s : 6: predicate.same_eliminate 0.69% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.00% : 0.000001s : 6: predicate.shard_identity_eliminate 0.84% : 0.000001s : 6: predicate.special_op_eliminate 0.97% : 0.000001s : 6: predicate.specialize_transform 0.95% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.48% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.03% : 0.000001s : 9: predicate.switch_defer_inline 1.70% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.29% : 0.000006s : 32: predicate.switch_simplify 0.75% : 0.000001s : 7: predicate.tile_eliminate 0.83% : 0.000001s : 7: predicate.transpose_eliminate 1.56% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.26% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.45% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.49% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.55% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.00% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 3.00% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 3: predicate.value_based_eliminate 0.78% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.74% : 0.000001s : 6: predicate.virtual_output_eliminate 0.39% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000202 5 8.75% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.25% : 0.000185s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022265 192 0.02% : 0.000003s : 1: ForceFp32Comm 14.44% : 0.003215s : 1: add_attr 14.39% : 0.003204s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000051s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000058s : 1: auto_monad 0.09% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.10% : 0.000468s : 1: bootstrap 0.13% : 0.000029s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000017s : 1: control_data_broadcast_order 0.05% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.03% : 0.000006s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.09% : 0.000019s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.99% : 0.000443s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.23% : 0.000497s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 3.96% : 0.000881s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.42% : 0.000095s : 28: opt.transform.opt_b 0.19% : 0.000042s : 2: opt.transform.opt_trans_graph 0.16% : 0.000035s : 4: opt.transform.symbol_engine_opt 10.20% : 0.002270s : 1: opt_a 0.47% : 0.000105s : 1: opt_after_cconv 2.19% : 0.000487s : 1: opt_after_jit_grad 0.93% : 0.000207s : 1: opt_b 18.95% : 0.004220s : 1: optimize 0.09% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000022s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.03% : 0.000006s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000030s : 1: pre_auto_parallel 0.10% : 0.000022s : 1: py_interpret_to_execute 0.06% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000018s : 1: remove_dup_value 1.21% : 0.000269s : 1: renormalize.infer 1.04% : 0.000230s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000040s : 1: rewriter_after_opt_a 0.25% : 0.000055s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000079s : 1: symbol_engine_optimizer 0.33% : 0.000073s : 1: tuple_transform 20.99% : 0.004673s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:49.691.508 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:49.691.769 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0153025, [21] [bootstrap]: 0.00045326 [type_inference]: 0.00491934 [event_method]: 1.321e-05 [auto_monad]: 5.829e-05 [graph_reusing]: 5.91998e-06 [inline]: 2.45002e-06 [add_attr]: 0.00324983, [1] [add_attr_with_inline]: 0.00323982, [1] [Cycle 1]: 8.029e-05, [2] [tag_attr]: 1.584e-05 [meta_addattr_fg_expand]: 4e-06 [parallel-infer-symbol]: 3.37002e-06 [pre_auto_parallel]: 2.833e-05 [insert-virtual-dataset]: 3.02002e-06 [parallel-infer-symbol-second]: 9.5999e-07 [dataset_repeat_opt]: 2.16998e-06 [pipeline_split]: 1.66998e-06 [optimize]: 0.00530583, [53] [py_interpret_to_execute]: 2.523e-05 [rewriter_before_opt_a]: 5.31e-05 [opt_a]: 0.00284901, [2] [Cycle 1]: 0.0019336, [45] [expand_dump_flag]: 2.84999e-06 [switch_simplify]: 2.639e-05 [loop_unroll]: 1.416e-05 [a_1]: 0.00036529 [with_stream_mark]: 2.286e-05 [recompute_prepare]: 1.084e-05 [updatestate_depend_eliminate]: 4.03001e-06 [updatestate_assign_eliminate]: 3.41999e-06 [updatestate_loads_eliminate]: 2.99001e-06 [parameter_eliminate]: 1.99e-06 [a_2]: 0.00012755 [accelerated_algorithm]: 8.48001e-06 [shard]: 2.76e-06 [meta_shard_fg_expand]: 1.95001e-06 [shard_inline]: 6.05002e-06 [merge_send_recv]: 9.75002e-06 [auto_parallel]: 7.9e-06 [parallel]: 1.795e-05 [flash_sp]: 9.74999e-06 [merge_comm]: 4.52e-06 [allreduce_fusion]: 3.83001e-06 [matmul_add_comm_reduction]: 9.76e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 1.207e-05 [virtual_dataset]: 6.53998e-06 [get_grad_eliminate_]: 6.36998e-06 [virtual_output]: 6.79999e-06 [merge_forward]: 3.99002e-06 [cell_reuse_recompute_pass]: 1.64e-06 [offload_activation]: 1.102e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.789e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.243e-05 [set_forward_comm_id_for_comm_node_pass]: 5.59e-06 [meta_fg_expand]: 2.74001e-06 [flash_sp_send_recv_attached]: 2.93e-06 [receive_attached]: 2.69001e-06 [after_resolve]: 1.284e-05 [a_after_grad]: 1.013e-05 [renormalize]: 0.00060434 [add_forward_monad_depend]: 6.81999e-06 [auto_monad_grad]: 2.73e-06 [auto_monad_eliminator]: 1.723e-05 [cse]: 2.866e-05 [a_3]: 6.041e-05 [Cycle 2]: 0.00090031, [45] [expand_dump_flag]: 1.17e-06 [switch_simplify]: 7.71001e-06 [loop_unroll]: 5.82999e-06 [a_1]: 0.00013343 [with_stream_mark]: 1.593e-05 [recompute_prepare]: 7.03e-06 [updatestate_depend_eliminate]: 3.34001e-06 [updatestate_assign_eliminate]: 2.92002e-06 [updatestate_loads_eliminate]: 2.74999e-06 [parameter_eliminate]: 2.01e-06 [a_2]: 0.00011309 [accelerated_algorithm]: 7.30998e-06 [shard]: 1.97001e-06 [meta_shard_fg_expand]: 1.83002e-06 [shard_inline]: 7.3e-06 [merge_send_recv]: 7.88999e-06 [auto_parallel]: 6.89001e-06 [parallel]: 6.42001e-06 [flash_sp]: 4.12e-06 [merge_comm]: 3.98999e-06 [allreduce_fusion]: 3.48e-06 [matmul_add_comm_reduction]: 7.3e-06 [allreduce_slice_to_reducescatter]: 5.3001e-07 [virtual_shard_identity]: 8.27e-06 [virtual_dataset]: 6.21e-06 [get_grad_eliminate_]: 5.32999e-06 [virtual_output]: 6.29999e-06 [merge_forward]: 3.58e-06 [cell_reuse_recompute_pass]: 1.66e-06 [offload_activation]: 7.61999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.475e-05 [merge_recompute_call_nodes]: 1.14e-06 [before_grad]: 9.72999e-06 [set_forward_comm_id_for_comm_node_pass]: 5.51e-06 [meta_fg_expand]: 3.06001e-06 [flash_sp_send_recv_attached]: 1.29998e-06 [receive_attached]: 1.17999e-06 [after_resolve]: 9.99001e-06 [a_after_grad]: 9.49999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.46998e-06 [auto_monad_grad]: 1.59e-06 [auto_monad_eliminator]: 1.116e-05 [cse]: 2.042e-05 [a_3]: 5.044e-05 [py_interpret_to_execute_after_opt_a]: 1.601e-05 [slice_cell_reuse_recomputed_activation]: 4.84998e-06 [rewriter_after_opt_a]: 4.596e-05 [convert_after_rewriter]: 1.053e-05 [order_py_execute_after_rewriter]: 8.91002e-06 [mutable_eliminate]: 0.00060519 [opt_b]: 0.0002844, [1] [Cycle 1]: 0.00027378, [7] [b_1]: 0.00016723 [b_2]: 8.44998e-06 [updatestate_depend_eliminate]: 7.64002e-06 [updatestate_assign_eliminate]: 2.45002e-06 [updatestate_loads_eliminate]: 2.41998e-06 [renormalize]: 5.99975e-07 [cse]: 2.388e-05 [optimize_parallel_all_gather_comm]: 2.091e-05 [overlap_param_gather]: 4.84003e-06 [cconv]: 3.442e-05 [loop_unroll]: 0.00047419 [opt_after_cconv]: 0.00013585, [1] [Cycle 1]: 0.00012569, [7] [c_1]: 2.781e-05 [parameter_eliminate]: 5.28002e-06 [updatestate_depend_eliminate]: 7e-06 [updatestate_assign_eliminate]: 2.52001e-06 [updatestate_loads_eliminate]: 2.84999e-06 [cse]: 2.248e-05 [renormalize]: 5.59987e-07 [remove_dup_value]: 1.791e-05 [tuple_transform]: 9.01e-05, [1] [Cycle 1]: 8.263e-05, [4] [d_1]: 4.183e-05 [none_parameter_eliminate]: 1.69998e-06 [renormalize]: 1.39989e-07 [switch_simplify]: 7.15e-06 [partial_unused_args_eliminate]: 4.78001e-06 [add_recomputation]: 5.313e-05 [cse_after_recomputation]: 3.071e-05, [1] [Cycle 1]: 2.354e-05, [1] [cse]: 1.344e-05 [environ_conv]: 9.35001e-06 [swap_dp_allreduce_reducescatter]: 8.18999e-06 [bias_add_comm_swap]: 4.89e-06 [label_micro_interleaved_index]: 7.16999e-06 [label_fine_grained_interleaved_index]: 5.25999e-06 [merge_cast_opt]: 3.86001e-06 [slice_recompute_activation]: 4.66002e-06 [micro_interleaved_order_control]: 5.34e-06 [assign_add_opt]: 3.71001e-06 [ForceFp32Comm]: 3.32002e-06 [remove_cast_before_assign_add]: 3.45e-06 [full_micro_interleaved_order_control]: 4.89e-06 [reorder_send_recv_between_fp_bp]: 5.29e-06 [comm_op_add_attrs]: 4.1e-06 [add_comm_op_reuse_tag]: 3.28e-06 [interleave_split_concat_branches]: 3.75e-06 [interleave_parallel_branches]: 3.48e-06 [overlap_opt_shard_in_pipeline]: 3.68e-06 [overlap_opt_shard_grad_in_pipeline]: 4.62e-06 [control_data_broadcast_order]: 1.756e-05 [grouped_pairwise_exchange_alltoall]: 4.17e-06 [offloading_packed_experts]: 6.96001e-06 [overlap_recompute_and_grad_model_parallel]: 7.61999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.45e-06 [overlap_recompute_allgather_and_fa_grad]: 3.83999e-06 [overlap_recompute_comm]: 5.09e-06 [overlap_grad_ring_attention]: 7.93001e-06 [overlap_grad_flash_sp]: 2.613e-05 [begin_end_overlap_inline]: 2.83e-06 [split_matmul_comm_elemetwise]: 4.32e-06 [split_layernorm_comm]: 3.90998e-06 [handle_group_info]: 3.63999e-06 [symbol_engine_optimizer]: 0.00010415, [1] [Cycle 1]: 9.673e-05, [6] [build]: 3.97e-06 [elim_shapecalc]: 1.27e-05 [elim_not_effective]: 1.393e-05 [opt_reshape]: 6.87002e-06 [fold_const_symbol]: 1.031e-05 [renormalize]: 2.19996e-07 [detach_backward]: 4.66002e-06 [pipeline_parallel_scheduler]: 2.14999e-06 [auto_monad_reorder]: 1.981e-05 [get_jit_bprop_graph]: 2.09999e-06 [rewriter_after_jit_bprop_graph]: 5.92999e-06 [opt_after_jit_grad]: 0.00052341 [validate]: 4.052e-05 Sums bootstrap : 0.000453s : 4.48% type_inference : 0.004919s : 48.65% event_method : 0.000013s : 0.13% auto_monad : 0.000058s : 0.58% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000028s : 0.28% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000025s : 0.25% optimize.rewriter_before_opt_a : 0.000053s : 0.53% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000034s : 0.34% optimize.opt_a.loop_unroll : 0.000020s : 0.20% optimize.opt_a.a_1 : 0.000499s : 4.93% optimize.opt_a.with_stream_mark : 0.000039s : 0.38% optimize.opt_a.recompute_prepare : 0.000018s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000241s : 2.38% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.16% optimize.opt_a.shard : 0.000005s : 0.05% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000018s : 0.17% optimize.opt_a.auto_parallel : 0.000015s : 0.15% optimize.opt_a.parallel : 0.000024s : 0.24% optimize.opt_a.flash_sp : 0.000014s : 0.14% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.20% optimize.opt_a.virtual_dataset : 0.000013s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000013s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000022s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.11% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000023s : 0.23% optimize.opt_a.a_after_grad : 0.000020s : 0.19% optimize.opt_a.renormalize : 0.000604s : 5.98% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.09% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.28% optimize.opt_a.cse : 0.000049s : 0.49% optimize.opt_a.a_3 : 0.000111s : 1.10% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.16% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000046s : 0.45% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000605s : 5.98% optimize.opt_b.b_1 : 0.000167s : 1.65% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000024s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000034s : 0.34% optimize.loop_unroll : 0.000474s : 4.69% optimize.opt_after_cconv.c_1 : 0.000028s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.22% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.18% optimize.tuple_transform.d_1 : 0.000042s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000053s : 0.53% optimize.cse_after_recomputation.cse : 0.000013s : 0.13% optimize.environ_conv : 0.000009s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.05% optimize.control_data_broadcast_order : 0.000018s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000008s : 0.08% optimize.overlap_grad_flash_sp : 0.000026s : 0.26% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.13% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.05% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.06% opt_after_jit_grad : 0.000523s : 5.18% validate : 0.000041s : 0.40% Time group info: ------[substitution.] 0.000203 23 41.04% : 0.000083s : 4: substitution.arithmetic_simplify 1.03% : 0.000002s : 2: substitution.elim_not_effective 0.67% : 0.000001s : 2: substitution.fold_const_symbol 2.69% : 0.000005s : 3: substitution.graph_param_transform 48.13% : 0.000098s : 2: substitution.inline 2.24% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.20% : 0.000004s : 4: substitution.remove_not_recompute_node 2.00% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.004868 2 90.89% : 0.004424s : 1: type_inference.infer 9.11% : 0.000444s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000096 2 100.00% : 0.000096s : 2: match.inline ------[predicate.] 0.000143 754 0.76% : 0.000001s : 7: predicate.accumulaten_eliminater 1.17% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.67% : 0.000001s : 6: predicate.addn_check_dump 0.75% : 0.000001s : 7: predicate.addn_zero_filter 0.65% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 3.16% : 0.000005s : 13: predicate.arithmetic_simplify 0.74% : 0.000001s : 7: predicate.cast_eliminate 0.70% : 0.000001s : 6: predicate.check_bprop_eliminate 0.65% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.72% : 0.000001s : 6: predicate.depend_value_elim 0.71% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.83% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.70% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.48% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 3: predicate.elim_not_effective 0.46% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.00% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.90% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.93% : 0.000001s : 10: predicate.environ_get_depend_swap 1.80% : 0.000003s : 16: predicate.environ_get_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.88% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.76% : 0.000003s : 9: predicate.float_depend_g_call 0.64% : 0.000001s : 6: predicate.float_environ_get_switch 0.93% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 3: predicate.fold_const_symbol 0.72% : 0.000001s : 6: predicate.get_grad_eliminate 0.39% : 0.000001s : 3: predicate.graph_param_transform 0.81% : 0.000001s : 6: predicate.incorporate_call 0.63% : 0.000001s : 6: predicate.incorporate_call_switch 6.72% : 0.000010s : 34: predicate.inline 1.28% : 0.000002s : 6: predicate.inline_without_move 0.36% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.09% : 0.000002s : 6: predicate.less_batch_normalization 1.48% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.91% : 0.000003s : 20: predicate.load_eliminater 1.49% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.89% : 0.000003s : 14: predicate.loop_unroll_before_grad 1.70% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.67% : 0.000001s : 6: predicate.merge_addn 0.64% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.77% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.65% : 0.000001s : 7: predicate.minmaximum_grad 1.56% : 0.000002s : 3: predicate.mutable_eliminate 0.57% : 0.000001s : 3: predicate.opt_reshape 0.59% : 0.000001s : 3: predicate.parallel_virtual_node 1.30% : 0.000002s : 9: predicate.partial_defer_inline 1.19% : 0.000002s : 10: predicate.partial_eliminate 0.74% : 0.000001s : 7: predicate.print_const_string_wrapper 0.67% : 0.000001s : 6: predicate.reduce_all_const_elim 0.84% : 0.000001s : 7: predicate.reduce_eliminate 1.97% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.74% : 0.000001s : 6: predicate.remove_not_recompute_node 1.03% : 0.000001s : 13: predicate.replace_applicator 1.38% : 0.000002s : 6: predicate.replace_old_param 0.79% : 0.000001s : 3: predicate.reset_defer_inline 0.80% : 0.000001s : 7: predicate.reshape_eliminate 0.74% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 3: predicate.row_tensor_eliminate 1.53% : 0.000002s : 6: predicate.same_eliminate 0.87% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.26% : 0.000002s : 6: predicate.shard_identity_eliminate 0.97% : 0.000001s : 6: predicate.special_op_eliminate 1.16% : 0.000002s : 6: predicate.specialize_transform 1.02% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.88% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.93% : 0.000001s : 9: predicate.switch_defer_inline 1.59% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.40% : 0.000006s : 32: predicate.switch_simplify 0.69% : 0.000001s : 7: predicate.tile_eliminate 0.71% : 0.000001s : 7: predicate.transpose_eliminate 1.40% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 2.89% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.28% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.52% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.44% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.82% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.68% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.66% : 0.000001s : 3: predicate.value_based_eliminate 1.09% : 0.000002s : 6: predicate.virtual_dataset_eliminate 0.74% : 0.000001s : 6: predicate.virtual_output_eliminate 0.34% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.58% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000238 5 8.76% : 0.000021s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.24% : 0.000217s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025520 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.77% : 0.003260s : 1: add_attr 12.71% : 0.003244s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000058s : 1: add_recomputation 0.03% : 0.000007s : 1: assign_add_opt 0.26% : 0.000067s : 1: auto_monad 0.11% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.96% : 0.000500s : 1: bootstrap 0.15% : 0.000038s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000022s : 1: control_data_broadcast_order 0.06% : 0.000015s : 1: convert_after_rewriter 0.14% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.10% : 0.000024s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.09% : 0.000023s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000007s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.93% : 0.000494s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.40% : 0.000613s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.07% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 3.61% : 0.000922s : 78: opt.transform.opt_a 0.10% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.39% : 0.000099s : 28: opt.transform.opt_b 0.18% : 0.000046s : 2: opt.transform.opt_trans_graph 0.15% : 0.000039s : 4: opt.transform.symbol_engine_opt 11.18% : 0.002853s : 1: opt_a 0.55% : 0.000140s : 1: opt_after_cconv 2.10% : 0.000536s : 1: opt_after_jit_grad 1.13% : 0.000288s : 1: opt_b 22.32% : 0.005696s : 1: optimize 0.10% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.12% : 0.000031s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000011s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000008s : 1: pipeline_split 0.14% : 0.000036s : 1: pre_auto_parallel 0.11% : 0.000029s : 1: py_interpret_to_execute 0.08% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.30% : 0.000331s : 1: renormalize.infer 1.03% : 0.000263s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.05% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000051s : 1: rewriter_after_opt_a 0.22% : 0.000057s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000107s : 1: symbol_engine_optimizer 0.36% : 0.000093s : 1: tuple_transform 19.47% : 0.004968s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:49.890.868 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0128391, [21] [bootstrap]: 0.00043703 [type_inference]: 0.00466267 [event_method]: 1.151e-05 [auto_monad]: 5.337e-05 [graph_reusing]: 5.48002e-06 [inline]: 1.99999e-06 [add_attr]: 0.00304161, [1] [add_attr_with_inline]: 0.00303367, [1] [Cycle 1]: 4.652e-05, [2] [tag_attr]: 1.462e-05 [meta_addattr_fg_expand]: 3.45e-06 [parallel-infer-symbol]: 2.98e-06 [pre_auto_parallel]: 2.189e-05 [insert-virtual-dataset]: 2.31e-06 [parallel-infer-symbol-second]: 7.00005e-07 [dataset_repeat_opt]: 1.95001e-06 [pipeline_split]: 1.92001e-06 [optimize]: 0.00394938, [53] [py_interpret_to_execute]: 1.598e-05 [rewriter_before_opt_a]: 4.225e-05 [opt_a]: 0.00205137, [2] [Cycle 1]: 0.00140653, [45] [expand_dump_flag]: 2.62001e-06 [switch_simplify]: 2.572e-05 [loop_unroll]: 1.371e-05 [a_1]: 0.00031494 [with_stream_mark]: 1.415e-05 [recompute_prepare]: 8.32998e-06 [updatestate_depend_eliminate]: 3.6e-06 [updatestate_assign_eliminate]: 2.93e-06 [updatestate_loads_eliminate]: 3.09999e-06 [parameter_eliminate]: 2.11e-06 [a_2]: 9.066e-05 [accelerated_algorithm]: 7.6e-06 [shard]: 1.96e-06 [meta_shard_fg_expand]: 1.80001e-06 [shard_inline]: 6.79999e-06 [merge_send_recv]: 8.38001e-06 [auto_parallel]: 6.00002e-06 [parallel]: 1.778e-05 [flash_sp]: 8.05999e-06 [merge_comm]: 3.76001e-06 [allreduce_fusion]: 3.76001e-06 [matmul_add_comm_reduction]: 9.53002e-06 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 8.15999e-06 [virtual_dataset]: 6.11998e-06 [get_grad_eliminate_]: 6.56e-06 [virtual_output]: 6.02001e-06 [merge_forward]: 3.83001e-06 [cell_reuse_recompute_pass]: 1.19003e-06 [offload_activation]: 9.56e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.207e-05 [merge_recompute_call_nodes]: 1.65001e-06 [before_grad]: 1.008e-05 [set_forward_comm_id_for_comm_node_pass]: 3.55e-06 [meta_fg_expand]: 2.76e-06 [flash_sp_send_recv_attached]: 2.43e-06 [receive_attached]: 2.18998e-06 [after_resolve]: 9.83998e-06 [a_after_grad]: 8.58001e-06 [renormalize]: 0.00042098 [add_forward_monad_depend]: 4.22e-06 [auto_monad_grad]: 2.05002e-06 [auto_monad_eliminator]: 1.503e-05 [cse]: 3.08e-05 [a_3]: 4.439e-05 [Cycle 2]: 0.00063519, [45] [expand_dump_flag]: 1.19e-06 [switch_simplify]: 6.79001e-06 [loop_unroll]: 5.63002e-06 [a_1]: 0.00012615 [with_stream_mark]: 1.029e-05 [recompute_prepare]: 6.68003e-06 [updatestate_depend_eliminate]: 2.82002e-06 [updatestate_assign_eliminate]: 2.31998e-06 [updatestate_loads_eliminate]: 2.86e-06 [parameter_eliminate]: 9.79984e-07 [a_2]: 8.009e-05 [accelerated_algorithm]: 6.07999e-06 [shard]: 1.04e-06 [meta_shard_fg_expand]: 1.27999e-06 [shard_inline]: 6.12999e-06 [merge_send_recv]: 4.74998e-06 [auto_parallel]: 5.56002e-06 [parallel]: 4.32e-06 [flash_sp]: 3.54002e-06 [merge_comm]: 3.39001e-06 [allreduce_fusion]: 3.06001e-06 [matmul_add_comm_reduction]: 5.76e-06 [allreduce_slice_to_reducescatter]: 4.30009e-07 [virtual_shard_identity]: 6.33e-06 [virtual_dataset]: 5.44e-06 [get_grad_eliminate_]: 5.30001e-06 [virtual_output]: 5.18002e-06 [merge_forward]: 2.61e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 6.02001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.039e-05 [merge_recompute_call_nodes]: 9.39996e-07 [before_grad]: 8.83001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.71001e-06 [meta_fg_expand]: 2.41e-06 [flash_sp_send_recv_attached]: 9.50007e-07 [receive_attached]: 9.70002e-07 [after_resolve]: 8.3e-06 [a_after_grad]: 7.56001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.19e-06 [auto_monad_grad]: 9.09989e-07 [auto_monad_eliminator]: 7.27002e-06 [cse]: 1.467e-05 [a_3]: 3.417e-05 [py_interpret_to_execute_after_opt_a]: 7.51999e-06 [slice_cell_reuse_recomputed_activation]: 2.02999e-06 [rewriter_after_opt_a]: 3.301e-05 [convert_after_rewriter]: 6.47001e-06 [order_py_execute_after_rewriter]: 5.19e-06 [mutable_eliminate]: 0.00049609 [opt_b]: 0.00019647, [1] [Cycle 1]: 0.0001906, [7] [b_1]: 0.00011853 [b_2]: 7.13e-06 [updatestate_depend_eliminate]: 5.17999e-06 [updatestate_assign_eliminate]: 2.53e-06 [updatestate_loads_eliminate]: 2.73e-06 [renormalize]: 5.79981e-07 [cse]: 1.856e-05 [optimize_parallel_all_gather_comm]: 1.664e-05 [overlap_param_gather]: 2.27999e-06 [cconv]: 2.448e-05 [loop_unroll]: 0.00041944 [opt_after_cconv]: 9.736e-05, [1] [Cycle 1]: 9.199e-05, [7] [c_1]: 2.68e-05 [parameter_eliminate]: 2.71e-06 [updatestate_depend_eliminate]: 4.92999e-06 [updatestate_assign_eliminate]: 2.43e-06 [updatestate_loads_eliminate]: 2.37001e-06 [cse]: 1.757e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 1.401e-05 [tuple_transform]: 6.981e-05, [1] [Cycle 1]: 6.573e-05, [4] [d_1]: 3.891e-05 [none_parameter_eliminate]: 1.76003e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 6.44999e-06 [partial_unused_args_eliminate]: 1.71e-06 [add_recomputation]: 4.44e-05 [cse_after_recomputation]: 2.103e-05, [1] [Cycle 1]: 1.64e-05, [1] [cse]: 1.123e-05 [environ_conv]: 4.63001e-06 [swap_dp_allreduce_reducescatter]: 4.90999e-06 [bias_add_comm_swap]: 3.27002e-06 [label_micro_interleaved_index]: 4.08999e-06 [label_fine_grained_interleaved_index]: 2.99999e-06 [merge_cast_opt]: 1.22999e-06 [slice_recompute_activation]: 2.00002e-06 [micro_interleaved_order_control]: 2.11e-06 [assign_add_opt]: 1.25999e-06 [ForceFp32Comm]: 8.00006e-07 [remove_cast_before_assign_add]: 9.70002e-07 [full_micro_interleaved_order_control]: 2.09e-06 [reorder_send_recv_between_fp_bp]: 2.74001e-06 [comm_op_add_attrs]: 9.80013e-07 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.17999e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.74e-06 [overlap_opt_shard_grad_in_pipeline]: 2.01998e-06 [control_data_broadcast_order]: 1.145e-05 [grouped_pairwise_exchange_alltoall]: 1.57999e-06 [offloading_packed_experts]: 3.78001e-06 [overlap_recompute_and_grad_model_parallel]: 4.62e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.37e-06 [overlap_recompute_allgather_and_fa_grad]: 1.69e-06 [overlap_recompute_comm]: 2.76999e-06 [overlap_grad_ring_attention]: 4.21001e-06 [overlap_grad_flash_sp]: 1.786e-05 [begin_end_overlap_inline]: 4.80009e-07 [split_matmul_comm_elemetwise]: 2.17001e-06 [split_layernorm_comm]: 1.82001e-06 [handle_group_info]: 9.89996e-07 [symbol_engine_optimizer]: 7.395e-05, [1] [Cycle 1]: 6.995e-05, [6] [build]: 2.93003e-06 [elim_shapecalc]: 9.69e-06 [elim_not_effective]: 1.267e-05 [opt_reshape]: 6.74001e-06 [fold_const_symbol]: 9.84001e-06 [renormalize]: 2.00002e-07 [detach_backward]: 2.04999e-06 [pipeline_parallel_scheduler]: 1.62001e-06 [auto_monad_reorder]: 1.568e-05 [get_jit_bprop_graph]: 1.54998e-06 [rewriter_after_jit_bprop_graph]: 3.7e-06 [opt_after_jit_grad]: 0.00044522 [validate]: 3.487e-05 Sums bootstrap : 0.000437s : 4.93% type_inference : 0.004663s : 52.61% event_method : 0.000012s : 0.13% auto_monad : 0.000053s : 0.60% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000003s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000022s : 0.25% insert-virtual-dataset : 0.000002s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000016s : 0.18% optimize.rewriter_before_opt_a : 0.000042s : 0.48% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.37% optimize.opt_a.loop_unroll : 0.000019s : 0.22% optimize.opt_a.a_1 : 0.000441s : 4.98% optimize.opt_a.with_stream_mark : 0.000024s : 0.28% optimize.opt_a.recompute_prepare : 0.000015s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000171s : 1.93% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.15% optimize.opt_a.merge_send_recv : 0.000013s : 0.15% optimize.opt_a.auto_parallel : 0.000012s : 0.13% optimize.opt_a.parallel : 0.000022s : 0.25% optimize.opt_a.flash_sp : 0.000012s : 0.13% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.13% optimize.opt_a.virtual_output : 0.000011s : 0.13% optimize.opt_a.merge_forward : 0.000006s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000022s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000019s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.04% optimize.opt_a.after_resolve : 0.000018s : 0.20% optimize.opt_a.a_after_grad : 0.000016s : 0.18% optimize.opt_a.renormalize : 0.000421s : 4.75% optimize.opt_a.add_forward_monad_depend : 0.000005s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.25% optimize.opt_a.cse : 0.000045s : 0.51% optimize.opt_a.a_3 : 0.000079s : 0.89% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000033s : 0.37% optimize.convert_after_rewriter : 0.000006s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.06% optimize.mutable_eliminate : 0.000496s : 5.60% optimize.opt_b.b_1 : 0.000119s : 1.34% optimize.opt_b.b_2 : 0.000007s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000019s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.19% optimize.overlap_param_gather : 0.000002s : 0.03% optimize.cconv : 0.000024s : 0.28% optimize.loop_unroll : 0.000419s : 4.73% optimize.opt_after_cconv.c_1 : 0.000027s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.16% optimize.tuple_transform.d_1 : 0.000039s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000044s : 0.50% optimize.cse_after_recomputation.cse : 0.000011s : 0.13% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.04% optimize.label_micro_interleaved_index : 0.000004s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000011s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.02% optimize.overlap_recompute_comm : 0.000003s : 0.03% optimize.overlap_grad_ring_attention : 0.000004s : 0.05% optimize.overlap_grad_flash_sp : 0.000018s : 0.20% optimize.begin_end_overlap_inline : 0.000000s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000016s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000445s : 5.02% validate : 0.000035s : 0.39% Time group info: ------[substitution.] 0.000168 23 37.84% : 0.000064s : 4: substitution.arithmetic_simplify 1.09% : 0.000002s : 2: substitution.elim_not_effective 0.85% : 0.000001s : 2: substitution.fold_const_symbol 3.27% : 0.000005s : 3: substitution.graph_param_transform 50.67% : 0.000085s : 2: substitution.inline 1.72% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.93% : 0.000005s : 4: substitution.remove_not_recompute_node 1.63% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004620 2 90.94% : 0.004201s : 1: type_inference.infer 9.06% : 0.000419s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000083 2 100.00% : 0.000083s : 2: match.inline ------[predicate.] 0.000131 754 0.83% : 0.000001s : 7: predicate.accumulaten_eliminater 0.94% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.66% : 0.000001s : 6: predicate.addn_check_dump 0.85% : 0.000001s : 7: predicate.addn_zero_filter 0.75% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.65% : 0.000003s : 13: predicate.arithmetic_simplify 1.00% : 0.000001s : 7: predicate.cast_eliminate 0.84% : 0.000001s : 6: predicate.check_bprop_eliminate 0.69% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.73% : 0.000001s : 6: predicate.depend_value_elim 0.78% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.87% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.73% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.27% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.30% : 0.000000s : 3: predicate.elim_not_effective 0.50% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.07% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_depend_swap 1.77% : 0.000002s : 16: predicate.environ_get_eliminate 1.04% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.92% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.97% : 0.000003s : 9: predicate.float_depend_g_call 0.68% : 0.000001s : 6: predicate.float_environ_get_switch 1.04% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.24% : 0.000000s : 3: predicate.fold_const_symbol 0.84% : 0.000001s : 6: predicate.get_grad_eliminate 0.38% : 0.000001s : 3: predicate.graph_param_transform 0.89% : 0.000001s : 6: predicate.incorporate_call 0.72% : 0.000001s : 6: predicate.incorporate_call_switch 6.39% : 0.000008s : 34: predicate.inline 1.01% : 0.000001s : 6: predicate.inline_without_move 0.37% : 0.000000s : 6: predicate.j_node_and_user_rematch 1.20% : 0.000002s : 6: predicate.less_batch_normalization 1.65% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.13% : 0.000003s : 20: predicate.load_eliminater 1.13% : 0.000001s : 3: predicate.loop_unroll_after_grad 1.61% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.80% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.66% : 0.000001s : 6: predicate.merge_addn 0.72% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.68% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.70% : 0.000001s : 7: predicate.minmaximum_grad 1.40% : 0.000002s : 3: predicate.mutable_eliminate 0.42% : 0.000001s : 3: predicate.opt_reshape 0.44% : 0.000001s : 3: predicate.parallel_virtual_node 1.25% : 0.000002s : 9: predicate.partial_defer_inline 1.29% : 0.000002s : 10: predicate.partial_eliminate 0.78% : 0.000001s : 7: predicate.print_const_string_wrapper 0.85% : 0.000001s : 6: predicate.reduce_all_const_elim 1.10% : 0.000001s : 7: predicate.reduce_eliminate 2.19% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.61% : 0.000001s : 6: predicate.remove_not_recompute_node 1.19% : 0.000002s : 13: predicate.replace_applicator 0.82% : 0.000001s : 6: predicate.replace_old_param 0.40% : 0.000001s : 3: predicate.reset_defer_inline 0.88% : 0.000001s : 7: predicate.reshape_eliminate 0.76% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 3: predicate.row_tensor_eliminate 0.91% : 0.000001s : 6: predicate.same_eliminate 0.53% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.29% : 0.000002s : 6: predicate.shard_identity_eliminate 0.84% : 0.000001s : 6: predicate.special_op_eliminate 0.98% : 0.000001s : 6: predicate.specialize_transform 1.10% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.46% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.05% : 0.000001s : 9: predicate.switch_defer_inline 1.82% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.27% : 0.000006s : 32: predicate.switch_simplify 0.73% : 0.000001s : 7: predicate.tile_eliminate 0.82% : 0.000001s : 7: predicate.transpose_eliminate 1.56% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.75% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.31% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.52% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.71% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.55% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.00% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.88% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 3: predicate.value_based_eliminate 0.73% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.78% : 0.000001s : 6: predicate.virtual_output_eliminate 0.34% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000197 5 8.57% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.43% : 0.000181s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.021200 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.37% : 0.003046s : 1: add_attr 14.33% : 0.003037s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000048s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.28% : 0.000058s : 1: auto_monad 0.09% : 0.000019s : 1: auto_monad_reorder 0.02% : 0.000003s : 1: begin_end_overlap_inline 0.03% : 0.000006s : 1: bias_add_comm_swap 2.20% : 0.000466s : 1: bootstrap 0.13% : 0.000028s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000015s : 1: control_data_broadcast_order 0.05% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.03% : 0.000005s : 1: detach_backward 0.04% : 0.000008s : 1: environ_conv 0.08% : 0.000017s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 2.01% : 0.000427s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.38% : 0.000504s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000014s : 1: opt.transform.mutable_eliminate 3.86% : 0.000819s : 78: opt.transform.opt_a 0.12% : 0.000025s : 1: opt.transform.opt_after_cconv 0.10% : 0.000022s : 1: opt.transform.opt_after_jit_grad 0.44% : 0.000094s : 28: opt.transform.opt_b 0.20% : 0.000043s : 2: opt.transform.opt_trans_graph 0.17% : 0.000035s : 4: opt.transform.symbol_engine_opt 9.69% : 0.002054s : 1: opt_a 0.48% : 0.000101s : 1: opt_after_cconv 2.14% : 0.000453s : 1: opt_after_jit_grad 0.94% : 0.000200s : 1: opt_b 18.65% : 0.003953s : 1: optimize 0.10% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000021s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.12% : 0.000026s : 1: pre_auto_parallel 0.09% : 0.000020s : 1: py_interpret_to_execute 0.05% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000017s : 1: remove_dup_value 1.05% : 0.000223s : 1: renormalize.infer 0.90% : 0.000191s : 1: renormalize.specialize 0.03% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000037s : 1: rewriter_after_opt_a 0.22% : 0.000046s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000077s : 1: symbol_engine_optimizer 0.34% : 0.000072s : 1: tuple_transform 22.06% : 0.004676s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:50.896.37 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:50.898.94 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0137871, [21] [bootstrap]: 0.00043153 [type_inference]: 0.00452044 [event_method]: 1.136e-05 [auto_monad]: 5.167e-05 [graph_reusing]: 5.46998e-06 [inline]: 2.22999e-06 [add_attr]: 0.00302362, [1] [add_attr_with_inline]: 0.00301586, [1] [Cycle 1]: 5.674e-05, [2] [tag_attr]: 1.33e-05 [meta_addattr_fg_expand]: 3.54002e-06 [parallel-infer-symbol]: 2.90998e-06 [pre_auto_parallel]: 2.234e-05 [insert-virtual-dataset]: 2.36e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 1.71002e-06 [pipeline_split]: 1.59998e-06 [optimize]: 0.00459069, [53] [py_interpret_to_execute]: 2.036e-05 [rewriter_before_opt_a]: 4.691e-05 [opt_a]: 0.00238745, [2] [Cycle 1]: 0.00156391, [45] [expand_dump_flag]: 2.84001e-06 [switch_simplify]: 2.441e-05 [loop_unroll]: 1.359e-05 [a_1]: 0.00030864 [with_stream_mark]: 1.472e-05 [recompute_prepare]: 8.56002e-06 [updatestate_depend_eliminate]: 3.53999e-06 [updatestate_assign_eliminate]: 3.32997e-06 [updatestate_loads_eliminate]: 2.91e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 0.00012315 [accelerated_algorithm]: 6.82002e-06 [shard]: 1.91e-06 [meta_shard_fg_expand]: 1.69e-06 [shard_inline]: 6.86999e-06 [merge_send_recv]: 8.65001e-06 [auto_parallel]: 5.96e-06 [parallel]: 1.76e-05 [flash_sp]: 7.71001e-06 [merge_comm]: 4.03001e-06 [allreduce_fusion]: 3.33e-06 [matmul_add_comm_reduction]: 9.41e-06 [allreduce_slice_to_reducescatter]: 5.79981e-07 [virtual_shard_identity]: 7.82e-06 [virtual_dataset]: 6.63e-06 [get_grad_eliminate_]: 6.41998e-06 [virtual_output]: 6.02999e-06 [merge_forward]: 3.80998e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 9.51e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.374e-05 [merge_recompute_call_nodes]: 1.56002e-06 [before_grad]: 1.034e-05 [set_forward_comm_id_for_comm_node_pass]: 3.52002e-06 [meta_fg_expand]: 2.69999e-06 [flash_sp_send_recv_attached]: 2.59999e-06 [receive_attached]: 2.19001e-06 [after_resolve]: 9.79e-06 [a_after_grad]: 9.05001e-06 [renormalize]: 0.00040286 [add_forward_monad_depend]: 4.52e-06 [auto_monad_grad]: 2.48e-06 [auto_monad_eliminator]: 1.361e-05 [cse]: 2.842e-05 [a_3]: 5.756e-05 [Cycle 2]: 0.00081107, [45] [expand_dump_flag]: 9.79984e-07 [switch_simplify]: 7.28e-06 [loop_unroll]: 5.82999e-06 [a_1]: 0.00012811 [with_stream_mark]: 1.041e-05 [recompute_prepare]: 6.30002e-06 [updatestate_depend_eliminate]: 3.26001e-06 [updatestate_assign_eliminate]: 2.64999e-06 [updatestate_loads_eliminate]: 2.56e-06 [parameter_eliminate]: 1.06002e-06 [a_2]: 0.00010848 [accelerated_algorithm]: 6.28002e-06 [shard]: 1.11997e-06 [meta_shard_fg_expand]: 1.34e-06 [shard_inline]: 6.33e-06 [merge_send_recv]: 4.86002e-06 [auto_parallel]: 5.19e-06 [parallel]: 4.28999e-06 [flash_sp]: 3.35998e-06 [merge_comm]: 3.24001e-06 [allreduce_fusion]: 3.25e-06 [matmul_add_comm_reduction]: 6.63e-06 [allreduce_slice_to_reducescatter]: 3.29979e-07 [virtual_shard_identity]: 6.91001e-06 [virtual_dataset]: 5.89e-06 [get_grad_eliminate_]: 5.74e-06 [virtual_output]: 5.29e-06 [merge_forward]: 2.74999e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 6.83998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.279e-05 [merge_recompute_call_nodes]: 9.39996e-07 [before_grad]: 8.90001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.53999e-06 [meta_fg_expand]: 2.29999e-06 [flash_sp_send_recv_attached]: 9.50007e-07 [receive_attached]: 9.50007e-07 [after_resolve]: 8.50999e-06 [a_after_grad]: 8.46002e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.12e-06 [auto_monad_grad]: 1.00001e-06 [auto_monad_eliminator]: 7.48e-06 [cse]: 1.48e-05 [a_3]: 4.697e-05 [py_interpret_to_execute_after_opt_a]: 1.102e-05 [slice_cell_reuse_recomputed_activation]: 4.81002e-06 [rewriter_after_opt_a]: 3.666e-05 [convert_after_rewriter]: 9.64e-06 [order_py_execute_after_rewriter]: 8.17e-06 [mutable_eliminate]: 0.00052021 [opt_b]: 0.000267, [1] [Cycle 1]: 0.00025838, [7] [b_1]: 0.00016234 [b_2]: 7.71001e-06 [updatestate_depend_eliminate]: 6.91001e-06 [updatestate_assign_eliminate]: 2.60002e-06 [updatestate_loads_eliminate]: 2.37999e-06 [renormalize]: 5.19998e-07 [cse]: 1.796e-05 [optimize_parallel_all_gather_comm]: 1.941e-05 [overlap_param_gather]: 5.15999e-06 [cconv]: 2.692e-05 [loop_unroll]: 0.00043723 [opt_after_cconv]: 0.00012587, [1] [Cycle 1]: 0.0001176, [7] [c_1]: 2.879e-05 [parameter_eliminate]: 3.4e-06 [updatestate_depend_eliminate]: 5.20999e-06 [updatestate_assign_eliminate]: 2.46998e-06 [updatestate_loads_eliminate]: 2.38998e-06 [cse]: 1.786e-05 [renormalize]: 5.09986e-07 [remove_dup_value]: 1.801e-05 [tuple_transform]: 8.692e-05, [1] [Cycle 1]: 7.92e-05, [4] [d_1]: 3.974e-05 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 2.40019e-07 [switch_simplify]: 6.83e-06 [partial_unused_args_eliminate]: 4.71002e-06 [add_recomputation]: 4.828e-05 [cse_after_recomputation]: 2.818e-05, [1] [Cycle 1]: 2.098e-05, [1] [cse]: 1.186e-05 [environ_conv]: 7.8e-06 [swap_dp_allreduce_reducescatter]: 8.13001e-06 [bias_add_comm_swap]: 4.92999e-06 [label_micro_interleaved_index]: 6.63e-06 [label_fine_grained_interleaved_index]: 5.34998e-06 [merge_cast_opt]: 4.29002e-06 [slice_recompute_activation]: 4.51002e-06 [micro_interleaved_order_control]: 4.65999e-06 [assign_add_opt]: 3.76001e-06 [ForceFp32Comm]: 3.51001e-06 [remove_cast_before_assign_add]: 3.32002e-06 [full_micro_interleaved_order_control]: 4.42003e-06 [reorder_send_recv_between_fp_bp]: 4.89998e-06 [comm_op_add_attrs]: 3.71999e-06 [add_comm_op_reuse_tag]: 3.44001e-06 [interleave_split_concat_branches]: 3.61001e-06 [interleave_parallel_branches]: 3.58e-06 [overlap_opt_shard_in_pipeline]: 3.51999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.45999e-06 [control_data_broadcast_order]: 1.497e-05 [grouped_pairwise_exchange_alltoall]: 3.91001e-06 [offloading_packed_experts]: 7.2e-06 [overlap_recompute_and_grad_model_parallel]: 7.15e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.5e-06 [overlap_recompute_allgather_and_fa_grad]: 3.68e-06 [overlap_recompute_comm]: 5.08002e-06 [overlap_grad_ring_attention]: 6.56e-06 [overlap_grad_flash_sp]: 2.197e-05 [begin_end_overlap_inline]: 3.36001e-06 [split_matmul_comm_elemetwise]: 4.67e-06 [split_layernorm_comm]: 3.95e-06 [handle_group_info]: 3.53e-06 [symbol_engine_optimizer]: 9.524e-05, [1] [Cycle 1]: 8.865e-05, [6] [build]: 2.46e-06 [elim_shapecalc]: 9.45001e-06 [elim_not_effective]: 1.287e-05 [opt_reshape]: 6.98e-06 [fold_const_symbol]: 9.87001e-06 [renormalize]: 2.50002e-07 [detach_backward]: 3.18e-06 [pipeline_parallel_scheduler]: 1.80001e-06 [auto_monad_reorder]: 1.898e-05 [get_jit_bprop_graph]: 1.44e-06 [rewriter_after_jit_bprop_graph]: 3.90998e-06 [opt_after_jit_grad]: 0.00050602 [validate]: 3.531e-05 Sums bootstrap : 0.000432s : 4.76% type_inference : 0.004520s : 49.86% event_method : 0.000011s : 0.13% auto_monad : 0.000052s : 0.57% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000013s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000022s : 0.25% insert-virtual-dataset : 0.000002s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000020s : 0.22% optimize.rewriter_before_opt_a : 0.000047s : 0.52% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000032s : 0.35% optimize.opt_a.loop_unroll : 0.000019s : 0.21% optimize.opt_a.a_1 : 0.000437s : 4.82% optimize.opt_a.with_stream_mark : 0.000025s : 0.28% optimize.opt_a.recompute_prepare : 0.000015s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000232s : 2.55% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.15% optimize.opt_a.merge_send_recv : 0.000014s : 0.15% optimize.opt_a.auto_parallel : 0.000011s : 0.12% optimize.opt_a.parallel : 0.000022s : 0.24% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.16% optimize.opt_a.virtual_dataset : 0.000013s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.13% optimize.opt_a.virtual_output : 0.000011s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000019s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000018s : 0.20% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000403s : 4.44% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.23% optimize.opt_a.cse : 0.000043s : 0.48% optimize.opt_a.a_3 : 0.000105s : 1.15% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000037s : 0.40% optimize.convert_after_rewriter : 0.000010s : 0.11% optimize.order_py_execute_after_rewriter : 0.000008s : 0.09% optimize.mutable_eliminate : 0.000520s : 5.74% optimize.opt_b.b_1 : 0.000162s : 1.79% optimize.opt_b.b_2 : 0.000008s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000018s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.06% optimize.cconv : 0.000027s : 0.30% optimize.loop_unroll : 0.000437s : 4.82% optimize.opt_after_cconv.c_1 : 0.000029s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.20% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.20% optimize.tuple_transform.d_1 : 0.000040s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000048s : 0.53% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000008s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.09% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.06% optimize.merge_cast_opt : 0.000004s : 0.05% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000004s : 0.04% optimize.remove_cast_before_assign_add : 0.000003s : 0.04% optimize.full_micro_interleaved_order_control : 0.000004s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.04% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.05% optimize.control_data_broadcast_order : 0.000015s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.08% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.06% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000022s : 0.24% optimize.begin_end_overlap_inline : 0.000003s : 0.04% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000002s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000019s : 0.21% get_jit_bprop_graph : 0.000001s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000506s : 5.58% validate : 0.000035s : 0.39% Time group info: ------[substitution.] 0.000167 23 39.79% : 0.000066s : 4: substitution.arithmetic_simplify 1.28% : 0.000002s : 2: substitution.elim_not_effective 0.88% : 0.000001s : 2: substitution.fold_const_symbol 3.16% : 0.000005s : 3: substitution.graph_param_transform 48.58% : 0.000081s : 2: substitution.inline 1.79% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.69% : 0.000004s : 4: substitution.remove_not_recompute_node 1.83% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004477 2 90.97% : 0.004072s : 1: type_inference.infer 9.03% : 0.000404s : 1: type_inference.specialize ------[replace.] 0.000019 2 100.00% : 0.000019s : 2: replace.inline ------[match.] 0.000080 2 100.00% : 0.000080s : 2: match.inline ------[predicate.] 0.000134 754 0.80% : 0.000001s : 7: predicate.accumulaten_eliminater 1.08% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.72% : 0.000001s : 6: predicate.addn_check_dump 0.81% : 0.000001s : 7: predicate.addn_zero_filter 0.66% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 3.00% : 0.000004s : 13: predicate.arithmetic_simplify 0.79% : 0.000001s : 7: predicate.cast_eliminate 0.75% : 0.000001s : 6: predicate.check_bprop_eliminate 0.66% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.76% : 0.000001s : 6: predicate.depend_value_elim 0.81% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.90% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.74% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.26% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.41% : 0.000001s : 3: predicate.elim_not_effective 0.49% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.04% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.08% : 0.000001s : 10: predicate.environ_get_depend_swap 1.85% : 0.000002s : 16: predicate.environ_get_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.90% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.14% : 0.000003s : 9: predicate.float_depend_g_call 0.69% : 0.000001s : 6: predicate.float_environ_get_switch 1.02% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 3: predicate.fold_const_symbol 0.77% : 0.000001s : 6: predicate.get_grad_eliminate 0.41% : 0.000001s : 3: predicate.graph_param_transform 0.84% : 0.000001s : 6: predicate.incorporate_call 0.68% : 0.000001s : 6: predicate.incorporate_call_switch 6.39% : 0.000009s : 34: predicate.inline 1.10% : 0.000001s : 6: predicate.inline_without_move 0.41% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.14% : 0.000002s : 6: predicate.less_batch_normalization 1.67% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.99% : 0.000003s : 20: predicate.load_eliminater 1.20% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.65% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.60% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.72% : 0.000001s : 6: predicate.merge_addn 0.68% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.71% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 7: predicate.minmaximum_grad 1.28% : 0.000002s : 3: predicate.mutable_eliminate 0.43% : 0.000001s : 3: predicate.opt_reshape 0.42% : 0.000001s : 3: predicate.parallel_virtual_node 1.25% : 0.000002s : 9: predicate.partial_defer_inline 1.26% : 0.000002s : 10: predicate.partial_eliminate 0.73% : 0.000001s : 7: predicate.print_const_string_wrapper 0.93% : 0.000001s : 6: predicate.reduce_all_const_elim 1.25% : 0.000002s : 7: predicate.reduce_eliminate 2.14% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.61% : 0.000001s : 6: predicate.remove_not_recompute_node 1.20% : 0.000002s : 13: predicate.replace_applicator 0.68% : 0.000001s : 6: predicate.replace_old_param 0.37% : 0.000000s : 3: predicate.reset_defer_inline 0.78% : 0.000001s : 7: predicate.reshape_eliminate 0.78% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 3: predicate.row_tensor_eliminate 0.95% : 0.000001s : 6: predicate.same_eliminate 0.55% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.93% : 0.000001s : 6: predicate.shard_identity_eliminate 0.91% : 0.000001s : 6: predicate.special_op_eliminate 1.08% : 0.000001s : 6: predicate.specialize_transform 1.05% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.49% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.99% : 0.000001s : 9: predicate.switch_defer_inline 1.74% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.34% : 0.000006s : 32: predicate.switch_simplify 0.84% : 0.000001s : 7: predicate.tile_eliminate 0.81% : 0.000001s : 7: predicate.transpose_eliminate 1.52% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.60% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.29% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.54% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.50% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.48% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.99% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.87% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.50% : 0.000001s : 3: predicate.value_based_eliminate 0.76% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.68% : 0.000001s : 6: predicate.virtual_output_eliminate 0.33% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.66% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000190 5 8.48% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.52% : 0.000174s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022763 192 0.03% : 0.000006s : 1: ForceFp32Comm 13.32% : 0.003032s : 1: add_attr 13.27% : 0.003020s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000052s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.26% : 0.000060s : 1: auto_monad 0.11% : 0.000026s : 1: auto_monad_reorder 0.03% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 2.08% : 0.000473s : 1: bootstrap 0.13% : 0.000030s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.06% : 0.000013s : 1: convert_after_rewriter 0.14% : 0.000031s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.08% : 0.000019s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.09% : 0.000020s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.04% : 0.000008s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.04% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000009s : 1: label_micro_interleaved_index 1.95% : 0.000443s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.31% : 0.000527s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 3.62% : 0.000824s : 78: opt.transform.opt_a 0.12% : 0.000027s : 1: opt.transform.opt_after_cconv 0.11% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.42% : 0.000097s : 28: opt.transform.opt_b 0.20% : 0.000045s : 2: opt.transform.opt_trans_graph 0.16% : 0.000036s : 4: opt.transform.symbol_engine_opt 10.50% : 0.002391s : 1: opt_a 0.57% : 0.000129s : 1: opt_after_cconv 2.28% : 0.000518s : 1: opt_after_jit_grad 1.19% : 0.000271s : 1: opt_b 21.61% : 0.004919s : 1: optimize 0.10% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.11% : 0.000025s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.04% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000030s : 1: pre_auto_parallel 0.10% : 0.000024s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000021s : 1: remove_dup_value 0.92% : 0.000209s : 1: renormalize.infer 0.82% : 0.000186s : 1: renormalize.specialize 0.03% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000040s : 1: rewriter_after_opt_a 0.22% : 0.000050s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.43% : 0.000098s : 1: symbol_engine_optimizer 0.39% : 0.000090s : 1: tuple_transform 19.95% : 0.004541s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:50.276.593 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0128688, [21] [bootstrap]: 0.00046705 [type_inference]: 0.00466183 [event_method]: 1.175e-05 [auto_monad]: 5.313e-05 [graph_reusing]: 5.80002e-06 [inline]: 2.48e-06 [add_attr]: 0.00304894, [1] [add_attr_with_inline]: 0.00302384, [1] [Cycle 1]: 5.021e-05, [2] [tag_attr]: 1.338e-05 [meta_addattr_fg_expand]: 3.98001e-06 [parallel-infer-symbol]: 3.01001e-06 [pre_auto_parallel]: 2.43e-05 [insert-virtual-dataset]: 2.37999e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 2.05002e-06 [pipeline_split]: 1.92001e-06 [optimize]: 0.00390327, [53] [py_interpret_to_execute]: 1.732e-05 [rewriter_before_opt_a]: 4.387e-05 [opt_a]: 0.0020524, [2] [Cycle 1]: 0.00140109, [45] [expand_dump_flag]: 2.74999e-06 [switch_simplify]: 2.459e-05 [loop_unroll]: 1.352e-05 [a_1]: 0.00031655 [with_stream_mark]: 1.598e-05 [recompute_prepare]: 9.00999e-06 [updatestate_depend_eliminate]: 4.07e-06 [updatestate_assign_eliminate]: 3.45e-06 [updatestate_loads_eliminate]: 3.23998e-06 [parameter_eliminate]: 2.20002e-06 [a_2]: 9.218e-05 [accelerated_algorithm]: 7.32997e-06 [shard]: 2.01e-06 [meta_shard_fg_expand]: 1.76003e-06 [shard_inline]: 6.51999e-06 [merge_send_recv]: 8.28001e-06 [auto_parallel]: 6.70998e-06 [parallel]: 1.808e-05 [flash_sp]: 8.75999e-06 [merge_comm]: 3.71999e-06 [allreduce_fusion]: 3.75e-06 [matmul_add_comm_reduction]: 9.32001e-06 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 8.2e-06 [virtual_dataset]: 6.89999e-06 [get_grad_eliminate_]: 5.92999e-06 [virtual_output]: 6.21e-06 [merge_forward]: 3.93999e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 9.00001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.206e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 9.92001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.61999e-06 [meta_fg_expand]: 2.53003e-06 [flash_sp_send_recv_attached]: 2.54001e-06 [receive_attached]: 2.34001e-06 [after_resolve]: 1.041e-05 [a_after_grad]: 9.53997e-06 [renormalize]: 0.0004146 [add_forward_monad_depend]: 4.63999e-06 [auto_monad_grad]: 2.03002e-06 [auto_monad_eliminator]: 1.346e-05 [cse]: 2.726e-05 [a_3]: 4.414e-05 [Cycle 2]: 0.00064169, [45] [expand_dump_flag]: 8.99978e-07 [switch_simplify]: 7.53e-06 [loop_unroll]: 5.59e-06 [a_1]: 0.00012606 [with_stream_mark]: 1.011e-05 [recompute_prepare]: 6.80002e-06 [updatestate_depend_eliminate]: 2.89001e-06 [updatestate_assign_eliminate]: 2.36998e-06 [updatestate_loads_eliminate]: 2.79001e-06 [parameter_eliminate]: 9.50007e-07 [a_2]: 8.321e-05 [accelerated_algorithm]: 6.24001e-06 [shard]: 1.19e-06 [meta_shard_fg_expand]: 1.34998e-06 [shard_inline]: 6.27001e-06 [merge_send_recv]: 5.35999e-06 [auto_parallel]: 5.57001e-06 [parallel]: 4.70001e-06 [flash_sp]: 3.33e-06 [merge_comm]: 3.2e-06 [allreduce_fusion]: 3.07002e-06 [matmul_add_comm_reduction]: 5.92999e-06 [allreduce_slice_to_reducescatter]: 3.09985e-07 [virtual_shard_identity]: 6.61e-06 [virtual_dataset]: 5.66e-06 [get_grad_eliminate_]: 5.52001e-06 [virtual_output]: 5.24e-06 [merge_forward]: 2.83e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 6.27001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.075e-05 [merge_recompute_call_nodes]: 9.00007e-07 [before_grad]: 8.69e-06 [set_forward_comm_id_for_comm_node_pass]: 3.85e-06 [meta_fg_expand]: 2.11e-06 [flash_sp_send_recv_attached]: 8.49977e-07 [receive_attached]: 9.30013e-07 [after_resolve]: 8.21002e-06 [a_after_grad]: 8.13999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.54998e-06 [auto_monad_grad]: 9.00007e-07 [auto_monad_eliminator]: 7.13e-06 [cse]: 1.533e-05 [a_3]: 3.364e-05 [py_interpret_to_execute_after_opt_a]: 7.51001e-06 [slice_cell_reuse_recomputed_activation]: 1.90001e-06 [rewriter_after_opt_a]: 3.248e-05 [convert_after_rewriter]: 6.62002e-06 [order_py_execute_after_rewriter]: 5.02e-06 [mutable_eliminate]: 0.00045234 [opt_b]: 0.00019368, [1] [Cycle 1]: 0.00018785, [7] [b_1]: 0.00011746 [b_2]: 7.18998e-06 [updatestate_depend_eliminate]: 4.74e-06 [updatestate_assign_eliminate]: 2.22001e-06 [updatestate_loads_eliminate]: 2.27001e-06 [renormalize]: 4.19997e-07 [cse]: 1.834e-05 [optimize_parallel_all_gather_comm]: 1.576e-05 [overlap_param_gather]: 1.92999e-06 [cconv]: 2.428e-05 [loop_unroll]: 0.00041666 [opt_after_cconv]: 9.873e-05, [1] [Cycle 1]: 9.32e-05, [7] [c_1]: 2.836e-05 [parameter_eliminate]: 2.44999e-06 [updatestate_depend_eliminate]: 5.05999e-06 [updatestate_assign_eliminate]: 2.32001e-06 [updatestate_loads_eliminate]: 2.10002e-06 [cse]: 1.784e-05 [renormalize]: 3.19997e-07 [remove_dup_value]: 1.354e-05 [tuple_transform]: 7.034e-05, [1] [Cycle 1]: 6.59e-05, [4] [d_1]: 3.854e-05 [none_parameter_eliminate]: 1.58002e-06 [renormalize]: 7.30011e-07 [switch_simplify]: 6.59001e-06 [partial_unused_args_eliminate]: 2.01e-06 [add_recomputation]: 4.28e-05 [cse_after_recomputation]: 2.171e-05, [1] [Cycle 1]: 1.717e-05, [1] [cse]: 1.141e-05 [environ_conv]: 4.68001e-06 [swap_dp_allreduce_reducescatter]: 5.02e-06 [bias_add_comm_swap]: 2.48e-06 [label_micro_interleaved_index]: 4.32e-06 [label_fine_grained_interleaved_index]: 2.76e-06 [merge_cast_opt]: 1.30999e-06 [slice_recompute_activation]: 1.99e-06 [micro_interleaved_order_control]: 2.34001e-06 [assign_add_opt]: 1.20001e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 1.07e-06 [full_micro_interleaved_order_control]: 2.39001e-06 [reorder_send_recv_between_fp_bp]: 2.98e-06 [comm_op_add_attrs]: 1.02998e-06 [add_comm_op_reuse_tag]: 1.30001e-06 [interleave_split_concat_branches]: 1.37999e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.19e-06 [overlap_opt_shard_grad_in_pipeline]: 2.07999e-06 [control_data_broadcast_order]: 1.165e-05 [grouped_pairwise_exchange_alltoall]: 1.79e-06 [offloading_packed_experts]: 4.08001e-06 [overlap_recompute_and_grad_model_parallel]: 4.84e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.13001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.24999e-06 [overlap_grad_ring_attention]: 4.09002e-06 [overlap_grad_flash_sp]: 1.745e-05 [begin_end_overlap_inline]: 6.50005e-07 [split_matmul_comm_elemetwise]: 2.68e-06 [split_layernorm_comm]: 1.70001e-06 [handle_group_info]: 1.12999e-06 [symbol_engine_optimizer]: 7.255e-05, [1] [Cycle 1]: 6.82e-05, [6] [build]: 2.39999e-06 [elim_shapecalc]: 9.36002e-06 [elim_not_effective]: 1.227e-05 [opt_reshape]: 6.59001e-06 [fold_const_symbol]: 9.67001e-06 [renormalize]: 2.50002e-07 [detach_backward]: 1.82999e-06 [pipeline_parallel_scheduler]: 2.36998e-06 [auto_monad_reorder]: 1.675e-05 [get_jit_bprop_graph]: 1.27e-06 [rewriter_after_jit_bprop_graph]: 3.48e-06 [opt_after_jit_grad]: 0.00045635 [validate]: 3.591e-05 Sums bootstrap : 0.000467s : 5.27% type_inference : 0.004662s : 52.61% event_method : 0.000012s : 0.13% auto_monad : 0.000053s : 0.60% graph_reusing : 0.000006s : 0.07% inline : 0.000002s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000013s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000024s : 0.27% insert-virtual-dataset : 0.000002s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000017s : 0.20% optimize.rewriter_before_opt_a : 0.000044s : 0.50% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000032s : 0.36% optimize.opt_a.loop_unroll : 0.000019s : 0.22% optimize.opt_a.a_1 : 0.000443s : 4.99% optimize.opt_a.with_stream_mark : 0.000026s : 0.29% optimize.opt_a.recompute_prepare : 0.000016s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.04% optimize.opt_a.a_2 : 0.000175s : 1.98% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.15% optimize.opt_a.shard : 0.000003s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.14% optimize.opt_a.merge_send_recv : 0.000014s : 0.15% optimize.opt_a.auto_parallel : 0.000012s : 0.14% optimize.opt_a.parallel : 0.000023s : 0.26% optimize.opt_a.flash_sp : 0.000012s : 0.14% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.17% optimize.opt_a.virtual_dataset : 0.000013s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.13% optimize.opt_a.virtual_output : 0.000011s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000015s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.03% optimize.opt_a.before_grad : 0.000019s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.04% optimize.opt_a.after_resolve : 0.000019s : 0.21% optimize.opt_a.a_after_grad : 0.000018s : 0.20% optimize.opt_a.renormalize : 0.000415s : 4.68% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.23% optimize.opt_a.cse : 0.000043s : 0.48% optimize.opt_a.a_3 : 0.000078s : 0.88% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000032s : 0.37% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.06% optimize.mutable_eliminate : 0.000452s : 5.10% optimize.opt_b.b_1 : 0.000117s : 1.33% optimize.opt_b.b_2 : 0.000007s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.27% optimize.loop_unroll : 0.000417s : 4.70% optimize.opt_after_cconv.c_1 : 0.000028s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.15% optimize.tuple_transform.d_1 : 0.000039s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000001s : 0.01% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000043s : 0.48% optimize.cse_after_recomputation.cse : 0.000011s : 0.13% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.06% optimize.bias_add_comm_swap : 0.000002s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.03% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.02% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.02% optimize.overlap_recompute_comm : 0.000002s : 0.03% optimize.overlap_grad_ring_attention : 0.000004s : 0.05% optimize.overlap_grad_flash_sp : 0.000017s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.03% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000002s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.03% auto_monad_reorder : 0.000017s : 0.19% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.04% opt_after_jit_grad : 0.000456s : 5.15% validate : 0.000036s : 0.41% Time group info: ------[substitution.] 0.000171 23 40.04% : 0.000068s : 4: substitution.arithmetic_simplify 1.10% : 0.000002s : 2: substitution.elim_not_effective 0.79% : 0.000001s : 2: substitution.fold_const_symbol 3.21% : 0.000005s : 3: substitution.graph_param_transform 48.98% : 0.000084s : 2: substitution.inline 1.80% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.41% : 0.000004s : 4: substitution.remove_not_recompute_node 1.68% : 0.000003s : 2: substitution.replace_old_param ------[type_inference.] 0.004618 2 91.34% : 0.004218s : 1: type_inference.infer 8.66% : 0.000400s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000082 2 100.00% : 0.000082s : 2: match.inline ------[predicate.] 0.000130 754 0.83% : 0.000001s : 7: predicate.accumulaten_eliminater 0.94% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.69% : 0.000001s : 6: predicate.addn_check_dump 0.90% : 0.000001s : 7: predicate.addn_zero_filter 0.77% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.85% : 0.000004s : 13: predicate.arithmetic_simplify 0.83% : 0.000001s : 7: predicate.cast_eliminate 0.73% : 0.000001s : 6: predicate.check_bprop_eliminate 0.69% : 0.000001s : 6: predicate.compare_switch_simplify 0.25% : 0.000000s : 3: predicate.const_output_eliminate 0.72% : 0.000001s : 6: predicate.depend_value_elim 0.86% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.84% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.80% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.21% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 3: predicate.elim_not_effective 0.52% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.09% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.03% : 0.000001s : 10: predicate.environ_get_depend_swap 1.94% : 0.000003s : 16: predicate.environ_get_eliminate 1.04% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.99% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.05% : 0.000003s : 9: predicate.float_depend_g_call 0.62% : 0.000001s : 6: predicate.float_environ_get_switch 1.03% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 3: predicate.fold_const_symbol 0.86% : 0.000001s : 6: predicate.get_grad_eliminate 0.36% : 0.000000s : 3: predicate.graph_param_transform 0.87% : 0.000001s : 6: predicate.incorporate_call 0.69% : 0.000001s : 6: predicate.incorporate_call_switch 6.42% : 0.000008s : 34: predicate.inline 0.99% : 0.000001s : 6: predicate.inline_without_move 0.39% : 0.000000s : 6: predicate.j_node_and_user_rematch 1.07% : 0.000001s : 6: predicate.less_batch_normalization 1.90% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.14% : 0.000003s : 20: predicate.load_eliminater 1.15% : 0.000001s : 3: predicate.loop_unroll_after_grad 1.68% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.76% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.72% : 0.000001s : 6: predicate.merge_addn 0.69% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.76% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.68% : 0.000001s : 7: predicate.minmaximum_grad 1.17% : 0.000002s : 3: predicate.mutable_eliminate 0.46% : 0.000001s : 3: predicate.opt_reshape 0.41% : 0.000001s : 3: predicate.parallel_virtual_node 1.40% : 0.000002s : 9: predicate.partial_defer_inline 1.26% : 0.000002s : 10: predicate.partial_eliminate 0.77% : 0.000001s : 7: predicate.print_const_string_wrapper 0.77% : 0.000001s : 6: predicate.reduce_all_const_elim 1.04% : 0.000001s : 7: predicate.reduce_eliminate 2.11% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.77% : 0.000001s : 6: predicate.remove_not_recompute_node 1.20% : 0.000002s : 13: predicate.replace_applicator 0.69% : 0.000001s : 6: predicate.replace_old_param 0.39% : 0.000001s : 3: predicate.reset_defer_inline 0.83% : 0.000001s : 7: predicate.reshape_eliminate 0.74% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 3: predicate.row_tensor_eliminate 0.99% : 0.000001s : 6: predicate.same_eliminate 0.56% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.95% : 0.000001s : 6: predicate.shard_identity_eliminate 0.91% : 0.000001s : 6: predicate.special_op_eliminate 0.97% : 0.000001s : 6: predicate.specialize_transform 1.16% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.91% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.05% : 0.000001s : 9: predicate.switch_defer_inline 1.72% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.36% : 0.000006s : 32: predicate.switch_simplify 0.79% : 0.000001s : 7: predicate.tile_eliminate 0.83% : 0.000001s : 7: predicate.transpose_eliminate 1.53% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.40% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.52% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.04% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.87% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 3: predicate.value_based_eliminate 0.87% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.79% : 0.000001s : 6: predicate.virtual_output_eliminate 0.40% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000194 5 8.88% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.12% : 0.000177s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.021156 192 0.02% : 0.000003s : 1: ForceFp32Comm 14.44% : 0.003055s : 1: add_attr 14.31% : 0.003027s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000047s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000058s : 1: auto_monad 0.10% : 0.000021s : 1: auto_monad_reorder 0.02% : 0.000004s : 1: begin_end_overlap_inline 0.03% : 0.000005s : 1: bias_add_comm_swap 2.35% : 0.000496s : 1: bootstrap 0.13% : 0.000028s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000015s : 1: control_data_broadcast_order 0.05% : 0.000010s : 1: convert_after_rewriter 0.12% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000008s : 1: environ_conv 0.08% : 0.000018s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.03% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 2.01% : 0.000424s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.17% : 0.000460s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 3.90% : 0.000825s : 78: opt.transform.opt_a 0.13% : 0.000027s : 1: opt.transform.opt_after_cconv 0.11% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.44% : 0.000094s : 28: opt.transform.opt_b 0.20% : 0.000043s : 2: opt.transform.opt_trans_graph 0.16% : 0.000034s : 4: opt.transform.symbol_engine_opt 9.71% : 0.002055s : 1: opt_a 0.48% : 0.000102s : 1: opt_after_cconv 2.20% : 0.000465s : 1: opt_after_jit_grad 0.93% : 0.000197s : 1: opt_b 18.47% : 0.003907s : 1: optimize 0.09% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000021s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.03% : 0.000006s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.13% : 0.000029s : 1: pre_auto_parallel 0.10% : 0.000021s : 1: py_interpret_to_execute 0.05% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000017s : 1: remove_dup_value 1.04% : 0.000220s : 1: renormalize.infer 0.89% : 0.000188s : 1: renormalize.specialize 0.03% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000036s : 1: rewriter_after_opt_a 0.22% : 0.000048s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.03% : 0.000006s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000075s : 1: symbol_engine_optimizer 0.35% : 0.000073s : 1: tuple_transform 22.10% : 0.004676s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:50.464.035 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:50.464.303 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0142605, [21] [bootstrap]: 0.00045618 [type_inference]: 0.00491877 [event_method]: 1.167e-05 [auto_monad]: 5.487e-05 [graph_reusing]: 5.29e-06 [inline]: 2.49001e-06 [add_attr]: 0.00307327, [1] [add_attr_with_inline]: 0.00306579, [1] [Cycle 1]: 6.586e-05, [2] [tag_attr]: 1.456e-05 [meta_addattr_fg_expand]: 3.85e-06 [parallel-infer-symbol]: 3.03e-06 [pre_auto_parallel]: 2.388e-05 [insert-virtual-dataset]: 2.76e-06 [parallel-infer-symbol-second]: 7.00005e-07 [dataset_repeat_opt]: 1.88002e-06 [pipeline_split]: 1.73002e-06 [optimize]: 0.00461951, [53] [py_interpret_to_execute]: 2.089e-05 [rewriter_before_opt_a]: 4.879e-05 [opt_a]: 0.00237329, [2] [Cycle 1]: 0.00158548, [45] [expand_dump_flag]: 2.88e-06 [switch_simplify]: 2.44e-05 [loop_unroll]: 1.427e-05 [a_1]: 0.00028224 [with_stream_mark]: 1.49e-05 [recompute_prepare]: 8.17e-06 [updatestate_depend_eliminate]: 4.08999e-06 [updatestate_assign_eliminate]: 3.53999e-06 [updatestate_loads_eliminate]: 3.34001e-06 [parameter_eliminate]: 1.76e-06 [a_2]: 0.0001059 [accelerated_algorithm]: 6.63998e-06 [shard]: 1.96003e-06 [meta_shard_fg_expand]: 1.65001e-06 [shard_inline]: 6.04999e-06 [merge_send_recv]: 7.96001e-06 [auto_parallel]: 6.83e-06 [parallel]: 1.716e-05 [flash_sp]: 7.40998e-06 [merge_comm]: 3.73999e-06 [allreduce_fusion]: 3.54002e-06 [matmul_add_comm_reduction]: 9.38002e-06 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 7.61999e-06 [virtual_dataset]: 6.81999e-06 [get_grad_eliminate_]: 5.79e-06 [virtual_output]: 6.19001e-06 [merge_forward]: 4.47998e-06 [cell_reuse_recompute_pass]: 1.40999e-06 [offload_activation]: 9.77999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.582e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 1.019e-05 [set_forward_comm_id_for_comm_node_pass]: 3.89002e-06 [meta_fg_expand]: 2.76e-06 [flash_sp_send_recv_attached]: 2.73e-06 [receive_attached]: 2.41e-06 [after_resolve]: 1.01e-05 [a_after_grad]: 9.04e-06 [renormalize]: 0.00045636 [add_forward_monad_depend]: 4.92e-06 [auto_monad_grad]: 2.08002e-06 [auto_monad_eliminator]: 1.513e-05 [cse]: 2.785e-05 [a_3]: 6.033e-05 [Cycle 2]: 0.00077516, [45] [expand_dump_flag]: 1.40999e-06 [switch_simplify]: 7.26999e-06 [loop_unroll]: 5.97001e-06 [a_1]: 0.00010578 [with_stream_mark]: 1.251e-05 [recompute_prepare]: 5.97001e-06 [updatestate_depend_eliminate]: 3.09001e-06 [updatestate_assign_eliminate]: 2.43002e-06 [updatestate_loads_eliminate]: 2.54001e-06 [parameter_eliminate]: 1.34e-06 [a_2]: 9.687e-05 [accelerated_algorithm]: 6.02999e-06 [shard]: 1.06002e-06 [meta_shard_fg_expand]: 1.49e-06 [shard_inline]: 6.07001e-06 [merge_send_recv]: 4.70999e-06 [auto_parallel]: 5.51e-06 [parallel]: 4.70999e-06 [flash_sp]: 3.43e-06 [merge_comm]: 3.33e-06 [allreduce_fusion]: 3.00998e-06 [matmul_add_comm_reduction]: 6.16e-06 [allreduce_slice_to_reducescatter]: 3.49974e-07 [virtual_shard_identity]: 6.24999e-06 [virtual_dataset]: 5.64e-06 [get_grad_eliminate_]: 5.59998e-06 [virtual_output]: 5.80002e-06 [merge_forward]: 3.11001e-06 [cell_reuse_recompute_pass]: 1.57999e-06 [offload_activation]: 6.68e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.468e-05 [merge_recompute_call_nodes]: 7.59988e-07 [before_grad]: 8.89e-06 [set_forward_comm_id_for_comm_node_pass]: 3.75998e-06 [meta_fg_expand]: 2.06998e-06 [flash_sp_send_recv_attached]: 8.39995e-07 [receive_attached]: 1.17e-06 [after_resolve]: 8.57998e-06 [a_after_grad]: 8.05e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.29e-06 [auto_monad_grad]: 9.10019e-07 [auto_monad_eliminator]: 7.24001e-06 [cse]: 1.43e-05 [a_3]: 4.732e-05 [py_interpret_to_execute_after_opt_a]: 1.117e-05 [slice_cell_reuse_recomputed_activation]: 5.14e-06 [rewriter_after_opt_a]: 4.027e-05 [convert_after_rewriter]: 9.67001e-06 [order_py_execute_after_rewriter]: 8.08001e-06 [mutable_eliminate]: 0.00050546 [opt_b]: 0.00028004, [1] [Cycle 1]: 0.00027148, [7] [b_1]: 0.00017745 [b_2]: 7.21999e-06 [updatestate_depend_eliminate]: 6.29999e-06 [updatestate_assign_eliminate]: 2.54001e-06 [updatestate_loads_eliminate]: 2.54999e-06 [renormalize]: 5.59987e-07 [cse]: 1.843e-05 [optimize_parallel_all_gather_comm]: 1.916e-05 [overlap_param_gather]: 4.99e-06 [cconv]: 2.898e-05 [loop_unroll]: 0.00043128 [opt_after_cconv]: 0.00014656, [1] [Cycle 1]: 0.00013626, [7] [c_1]: 2.707e-05 [parameter_eliminate]: 2.44999e-06 [updatestate_depend_eliminate]: 5.40001e-06 [updatestate_assign_eliminate]: 2.91e-06 [updatestate_loads_eliminate]: 2.40002e-06 [cse]: 1.727e-05 [renormalize]: 8.10018e-07 [remove_dup_value]: 1.723e-05 [tuple_transform]: 8.309e-05, [1] [Cycle 1]: 7.609e-05, [4] [d_1]: 3.775e-05 [none_parameter_eliminate]: 1.81998e-06 [renormalize]: 3.00002e-07 [switch_simplify]: 6.56e-06 [partial_unused_args_eliminate]: 4.37e-06 [add_recomputation]: 4.757e-05 [cse_after_recomputation]: 2.678e-05, [1] [Cycle 1]: 2.028e-05, [1] [cse]: 1.174e-05 [environ_conv]: 7.66001e-06 [swap_dp_allreduce_reducescatter]: 7.51001e-06 [bias_add_comm_swap]: 4.77998e-06 [label_micro_interleaved_index]: 6.43e-06 [label_fine_grained_interleaved_index]: 5.04e-06 [merge_cast_opt]: 3.58999e-06 [slice_recompute_activation]: 4.35e-06 [micro_interleaved_order_control]: 4.53001e-06 [assign_add_opt]: 3.86001e-06 [ForceFp32Comm]: 3.31001e-06 [remove_cast_before_assign_add]: 3.33998e-06 [full_micro_interleaved_order_control]: 4.51002e-06 [reorder_send_recv_between_fp_bp]: 5.00001e-06 [comm_op_add_attrs]: 3.65e-06 [add_comm_op_reuse_tag]: 3.31001e-06 [interleave_split_concat_branches]: 3.46001e-06 [interleave_parallel_branches]: 3.6e-06 [overlap_opt_shard_in_pipeline]: 3.62002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.85001e-06 [control_data_broadcast_order]: 1.472e-05 [grouped_pairwise_exchange_alltoall]: 3.97e-06 [offloading_packed_experts]: 7.15998e-06 [overlap_recompute_and_grad_model_parallel]: 7.82e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.59002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.68e-06 [overlap_recompute_comm]: 4.80999e-06 [overlap_grad_ring_attention]: 6.27001e-06 [overlap_grad_flash_sp]: 2.151e-05 [begin_end_overlap_inline]: 2.81e-06 [split_matmul_comm_elemetwise]: 4.35e-06 [split_layernorm_comm]: 4e-06 [handle_group_info]: 3.36001e-06 [symbol_engine_optimizer]: 9.319e-05, [1] [Cycle 1]: 8.656e-05, [6] [build]: 2.37999e-06 [elim_shapecalc]: 8.84e-06 [elim_not_effective]: 1.249e-05 [opt_reshape]: 7.21001e-06 [fold_const_symbol]: 9.71e-06 [renormalize]: 2.50002e-07 [detach_backward]: 3.17002e-06 [pipeline_parallel_scheduler]: 2.30002e-06 [auto_monad_reorder]: 1.809e-05 [get_jit_bprop_graph]: 1.17999e-06 [rewriter_after_jit_bprop_graph]: 3.7e-06 [opt_after_jit_grad]: 0.00047266 [validate]: 3.587e-05 Sums bootstrap : 0.000456s : 4.83% type_inference : 0.004919s : 52.11% event_method : 0.000012s : 0.12% auto_monad : 0.000055s : 0.58% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000024s : 0.25% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000021s : 0.22% optimize.rewriter_before_opt_a : 0.000049s : 0.52% optimize.opt_a.expand_dump_flag : 0.000004s : 0.05% optimize.opt_a.switch_simplify : 0.000032s : 0.34% optimize.opt_a.loop_unroll : 0.000020s : 0.21% optimize.opt_a.a_1 : 0.000388s : 4.11% optimize.opt_a.with_stream_mark : 0.000027s : 0.29% optimize.opt_a.recompute_prepare : 0.000014s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000203s : 2.15% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.13% optimize.opt_a.merge_send_recv : 0.000013s : 0.13% optimize.opt_a.auto_parallel : 0.000012s : 0.13% optimize.opt_a.parallel : 0.000022s : 0.23% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.15% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000019s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000019s : 0.20% optimize.opt_a.a_after_grad : 0.000017s : 0.18% optimize.opt_a.renormalize : 0.000456s : 4.84% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.24% optimize.opt_a.cse : 0.000042s : 0.45% optimize.opt_a.a_3 : 0.000108s : 1.14% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000040s : 0.43% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000008s : 0.09% optimize.mutable_eliminate : 0.000505s : 5.35% optimize.opt_b.b_1 : 0.000177s : 1.88% optimize.opt_b.b_2 : 0.000007s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000018s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000029s : 0.31% optimize.loop_unroll : 0.000431s : 4.57% optimize.opt_after_cconv.c_1 : 0.000027s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.cse : 0.000017s : 0.18% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000017s : 0.18% optimize.tuple_transform.d_1 : 0.000038s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.05% optimize.add_recomputation : 0.000048s : 0.50% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000008s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000006s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.04% optimize.remove_cast_before_assign_add : 0.000003s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.04% optimize.interleave_split_concat_branches : 0.000003s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.05% optimize.control_data_broadcast_order : 0.000015s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.08% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000006s : 0.07% optimize.overlap_grad_flash_sp : 0.000022s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.04% optimize.symbol_engine_optimizer.build : 0.000002s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.19% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000473s : 5.01% validate : 0.000036s : 0.38% Time group info: ------[substitution.] 0.000116 20 1.53% : 0.000002s : 2: substitution.elim_not_effective 1.15% : 0.000001s : 2: substitution.fold_const_symbol 4.57% : 0.000005s : 3: substitution.graph_param_transform 74.18% : 0.000086s : 2: substitution.inline 2.68% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.94% : 0.000005s : 4: substitution.remove_not_recompute_node 2.57% : 0.000003s : 2: substitution.replace_old_param 9.39% : 0.000011s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004871 2 91.45% : 0.004455s : 1: type_inference.infer 8.55% : 0.000417s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000084 2 100.00% : 0.000084s : 2: match.inline ------[predicate.] 0.000133 754 1.03% : 0.000001s : 7: predicate.accumulaten_eliminater 1.04% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.67% : 0.000001s : 6: predicate.addn_check_dump 1.07% : 0.000001s : 7: predicate.addn_zero_filter 0.70% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.44% : 0.000003s : 13: predicate.arithmetic_simplify 0.79% : 0.000001s : 7: predicate.cast_eliminate 0.80% : 0.000001s : 6: predicate.check_bprop_eliminate 0.73% : 0.000001s : 6: predicate.compare_switch_simplify 0.24% : 0.000000s : 3: predicate.const_output_eliminate 0.80% : 0.000001s : 6: predicate.depend_value_elim 0.76% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 1.15% : 0.000002s : 7: predicate.dict_get_item_eliminator 0.76% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.24% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 3: predicate.elim_not_effective 0.43% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.01% : 0.000001s : 10: predicate.environ_get_depend_swap 1.80% : 0.000002s : 16: predicate.environ_get_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.97% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.01% : 0.000003s : 9: predicate.float_depend_g_call 0.71% : 0.000001s : 6: predicate.float_environ_get_switch 1.07% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 3: predicate.fold_const_symbol 0.84% : 0.000001s : 6: predicate.get_grad_eliminate 0.26% : 0.000000s : 3: predicate.graph_param_transform 0.76% : 0.000001s : 6: predicate.incorporate_call 0.65% : 0.000001s : 6: predicate.incorporate_call_switch 6.13% : 0.000008s : 34: predicate.inline 0.98% : 0.000001s : 6: predicate.inline_without_move 0.43% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.00% : 0.000001s : 6: predicate.less_batch_normalization 1.70% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.06% : 0.000003s : 20: predicate.load_eliminater 1.20% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.63% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.78% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.69% : 0.000001s : 6: predicate.merge_addn 0.75% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.74% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 7: predicate.minmaximum_grad 1.47% : 0.000002s : 3: predicate.mutable_eliminate 0.43% : 0.000001s : 3: predicate.opt_reshape 0.52% : 0.000001s : 3: predicate.parallel_virtual_node 1.33% : 0.000002s : 9: predicate.partial_defer_inline 1.21% : 0.000002s : 10: predicate.partial_eliminate 0.74% : 0.000001s : 7: predicate.print_const_string_wrapper 0.76% : 0.000001s : 6: predicate.reduce_all_const_elim 1.01% : 0.000001s : 7: predicate.reduce_eliminate 2.02% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.61% : 0.000001s : 6: predicate.remove_not_recompute_node 1.12% : 0.000002s : 13: predicate.replace_applicator 0.69% : 0.000001s : 6: predicate.replace_old_param 0.38% : 0.000001s : 3: predicate.reset_defer_inline 0.85% : 0.000001s : 7: predicate.reshape_eliminate 0.80% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 3: predicate.row_tensor_eliminate 1.12% : 0.000001s : 6: predicate.same_eliminate 0.52% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.95% : 0.000001s : 6: predicate.shard_identity_eliminate 0.95% : 0.000001s : 6: predicate.special_op_eliminate 0.92% : 0.000001s : 6: predicate.specialize_transform 1.34% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.37% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.06% : 0.000001s : 9: predicate.switch_defer_inline 1.75% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.54% : 0.000006s : 32: predicate.switch_simplify 0.83% : 0.000001s : 7: predicate.tile_eliminate 0.77% : 0.000001s : 7: predicate.transpose_eliminate 1.57% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.72% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.66% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.33% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.49% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.46% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.99% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.75% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.57% : 0.000001s : 3: predicate.value_based_eliminate 0.80% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.84% : 0.000001s : 6: predicate.virtual_output_eliminate 0.35% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.58% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000203 5 8.92% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.08% : 0.000185s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023274 192 0.03% : 0.000006s : 1: ForceFp32Comm 13.24% : 0.003081s : 1: add_attr 13.19% : 0.003069s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000051s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.27% : 0.000063s : 1: auto_monad 0.11% : 0.000026s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 2.16% : 0.000502s : 1: bootstrap 0.14% : 0.000032s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000017s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.09% : 0.000020s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.05% : 0.000011s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.04% : 0.000008s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000009s : 1: label_micro_interleaved_index 1.88% : 0.000437s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.20% : 0.000512s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 3.24% : 0.000754s : 78: opt.transform.opt_a 0.11% : 0.000026s : 1: opt.transform.opt_after_cconv 0.10% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.48% : 0.000112s : 28: opt.transform.opt_b 0.18% : 0.000042s : 2: opt.transform.opt_trans_graph 0.15% : 0.000035s : 4: opt.transform.symbol_engine_opt 10.21% : 0.002376s : 1: opt_a 0.65% : 0.000151s : 1: opt_after_cconv 2.08% : 0.000483s : 1: opt_after_jit_grad 1.22% : 0.000283s : 1: opt_b 21.23% : 0.004941s : 1: optimize 0.10% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.11% : 0.000025s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.05% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000031s : 1: pre_auto_parallel 0.11% : 0.000025s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000021s : 1: remove_dup_value 1.06% : 0.000246s : 1: renormalize.infer 0.87% : 0.000203s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000044s : 1: rewriter_after_opt_a 0.22% : 0.000052s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.41% : 0.000096s : 1: symbol_engine_optimizer 0.37% : 0.000086s : 1: tuple_transform 21.24% : 0.004944s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:50.655.366 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0128205, [21] [bootstrap]: 0.00043222 [type_inference]: 0.00465549 [event_method]: 1.13e-05 [auto_monad]: 5.33e-05 [graph_reusing]: 5.09998e-06 [inline]: 2.00002e-06 [add_attr]: 0.00309032, [1] [add_attr_with_inline]: 0.00308221, [1] [Cycle 1]: 5.426e-05, [2] [tag_attr]: 1.442e-05 [meta_addattr_fg_expand]: 4.08001e-06 [parallel-infer-symbol]: 3.01001e-06 [pre_auto_parallel]: 2.27e-05 [insert-virtual-dataset]: 2.39999e-06 [parallel-infer-symbol-second]: 9.90025e-07 [dataset_repeat_opt]: 2.15002e-06 [pipeline_split]: 1.82999e-06 [optimize]: 0.00387591, [53] [py_interpret_to_execute]: 1.6e-05 [rewriter_before_opt_a]: 4.311e-05 [opt_a]: 0.00199471, [2] [Cycle 1]: 0.00138752, [45] [expand_dump_flag]: 2.68e-06 [switch_simplify]: 2.54e-05 [loop_unroll]: 1.379e-05 [a_1]: 0.00027798 [with_stream_mark]: 1.478e-05 [recompute_prepare]: 7.82e-06 [updatestate_depend_eliminate]: 3.48e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 2.98998e-06 [parameter_eliminate]: 1.73997e-06 [a_2]: 7.823e-05 [accelerated_algorithm]: 6.64001e-06 [shard]: 2.31e-06 [meta_shard_fg_expand]: 1.66998e-06 [shard_inline]: 6.07001e-06 [merge_send_recv]: 8.07003e-06 [auto_parallel]: 5.77999e-06 [parallel]: 1.751e-05 [flash_sp]: 7.63999e-06 [merge_comm]: 4e-06 [allreduce_fusion]: 3.38e-06 [matmul_add_comm_reduction]: 9.99001e-06 [allreduce_slice_to_reducescatter]: 5.60016e-07 [virtual_shard_identity]: 7.4e-06 [virtual_dataset]: 6.16e-06 [get_grad_eliminate_]: 5.89999e-06 [virtual_output]: 5.71998e-06 [merge_forward]: 4.4e-06 [cell_reuse_recompute_pass]: 1.20001e-06 [offload_activation]: 9.52001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.316e-05 [merge_recompute_call_nodes]: 1.44998e-06 [before_grad]: 1.018e-05 [set_forward_comm_id_for_comm_node_pass]: 3.56001e-06 [meta_fg_expand]: 2.62001e-06 [flash_sp_send_recv_attached]: 2.96001e-06 [receive_attached]: 2.81999e-06 [after_resolve]: 9.68002e-06 [a_after_grad]: 8.65001e-06 [renormalize]: 0.00045452 [add_forward_monad_depend]: 4.72e-06 [auto_monad_grad]: 2.07999e-06 [auto_monad_eliminator]: 1.409e-05 [cse]: 2.998e-05 [a_3]: 4.45e-05 [Cycle 2]: 0.00059717, [45] [expand_dump_flag]: 9.10019e-07 [switch_simplify]: 7.08e-06 [loop_unroll]: 5.99e-06 [a_1]: 0.00010561 [with_stream_mark]: 9.82999e-06 [recompute_prepare]: 5.82999e-06 [updatestate_depend_eliminate]: 2.98e-06 [updatestate_assign_eliminate]: 2.27001e-06 [updatestate_loads_eliminate]: 2.64999e-06 [parameter_eliminate]: 1.02e-06 [a_2]: 6.918e-05 [accelerated_algorithm]: 5.79e-06 [shard]: 1.10001e-06 [meta_shard_fg_expand]: 1.27e-06 [shard_inline]: 5.92999e-06 [merge_send_recv]: 4.92999e-06 [auto_parallel]: 5.27999e-06 [parallel]: 4.33999e-06 [flash_sp]: 3.56001e-06 [merge_comm]: 3.41999e-06 [allreduce_fusion]: 2.74001e-06 [matmul_add_comm_reduction]: 5.27999e-06 [allreduce_slice_to_reducescatter]: 3.19997e-07 [virtual_shard_identity]: 6.34001e-06 [virtual_dataset]: 5.63002e-06 [get_grad_eliminate_]: 5.27999e-06 [virtual_output]: 5.11997e-06 [merge_forward]: 2.59001e-06 [cell_reuse_recompute_pass]: 1.27999e-06 [offload_activation]: 6.26e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.185e-05 [merge_recompute_call_nodes]: 7.09988e-07 [before_grad]: 8.76002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.58e-06 [meta_fg_expand]: 1.91e-06 [flash_sp_send_recv_attached]: 8.50006e-07 [receive_attached]: 1.00999e-06 [after_resolve]: 8.45999e-06 [a_after_grad]: 7.98999e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.07e-06 [auto_monad_grad]: 8.70001e-07 [auto_monad_eliminator]: 6.73998e-06 [cse]: 1.279e-05 [a_3]: 3.479e-05 [py_interpret_to_execute_after_opt_a]: 7.16999e-06 [slice_cell_reuse_recomputed_activation]: 2.49999e-06 [rewriter_after_opt_a]: 3.381e-05 [convert_after_rewriter]: 6.51999e-06 [order_py_execute_after_rewriter]: 4.90001e-06 [mutable_eliminate]: 0.00045608 [opt_b]: 0.00021041, [1] [Cycle 1]: 0.00020385, [7] [b_1]: 0.00013192 [b_2]: 7.53e-06 [updatestate_depend_eliminate]: 5.15001e-06 [updatestate_assign_eliminate]: 2.37999e-06 [updatestate_loads_eliminate]: 2.28002e-06 [renormalize]: 6.09987e-07 [cse]: 1.818e-05 [optimize_parallel_all_gather_comm]: 1.627e-05 [overlap_param_gather]: 2.19999e-06 [cconv]: 2.338e-05 [loop_unroll]: 0.00043016 [opt_after_cconv]: 9.638e-05, [1] [Cycle 1]: 9.078e-05, [7] [c_1]: 2.741e-05 [parameter_eliminate]: 2.43002e-06 [updatestate_depend_eliminate]: 4.89003e-06 [updatestate_assign_eliminate]: 2.22001e-06 [updatestate_loads_eliminate]: 2.19001e-06 [cse]: 1.767e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 1.384e-05 [tuple_transform]: 6.87e-05, [1] [Cycle 1]: 6.402e-05, [4] [d_1]: 3.719e-05 [none_parameter_eliminate]: 1.67999e-06 [renormalize]: 3.70026e-07 [switch_simplify]: 6.53e-06 [partial_unused_args_eliminate]: 1.71e-06 [add_recomputation]: 4.629e-05 [cse_after_recomputation]: 2.19e-05, [1] [Cycle 1]: 1.737e-05, [1] [cse]: 1.151e-05 [environ_conv]: 4.70001e-06 [swap_dp_allreduce_reducescatter]: 4.84e-06 [bias_add_comm_swap]: 2.26e-06 [label_micro_interleaved_index]: 4.42998e-06 [label_fine_grained_interleaved_index]: 2.63e-06 [merge_cast_opt]: 1.54e-06 [slice_recompute_activation]: 2.21e-06 [micro_interleaved_order_control]: 2.01e-06 [assign_add_opt]: 1.40001e-06 [ForceFp32Comm]: 7.79983e-07 [remove_cast_before_assign_add]: 9.70002e-07 [full_micro_interleaved_order_control]: 2.16e-06 [reorder_send_recv_between_fp_bp]: 3.01001e-06 [comm_op_add_attrs]: 1.17e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.28002e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.05001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.91e-06 [control_data_broadcast_order]: 1.202e-05 [grouped_pairwise_exchange_alltoall]: 1.55001e-06 [offloading_packed_experts]: 4.01001e-06 [overlap_recompute_and_grad_model_parallel]: 4.74e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.10999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.43e-06 [overlap_grad_ring_attention]: 4.05998e-06 [overlap_grad_flash_sp]: 1.801e-05 [begin_end_overlap_inline]: 7.2e-07 [split_matmul_comm_elemetwise]: 2.04e-06 [split_layernorm_comm]: 1.78002e-06 [handle_group_info]: 9.89996e-07 [symbol_engine_optimizer]: 7.313e-05, [1] [Cycle 1]: 6.9e-05, [6] [build]: 2.60997e-06 [elim_shapecalc]: 9.68002e-06 [elim_not_effective]: 1.294e-05 [opt_reshape]: 6.58e-06 [fold_const_symbol]: 9.48997e-06 [renormalize]: 2.09984e-07 [detach_backward]: 1.89e-06 [pipeline_parallel_scheduler]: 1.67999e-06 [auto_monad_reorder]: 1.593e-05 [get_jit_bprop_graph]: 1.55001e-06 [rewriter_after_jit_bprop_graph]: 4.08001e-06 [opt_after_jit_grad]: 0.0004606 [validate]: 3.71e-05 Sums bootstrap : 0.000432s : 4.91% type_inference : 0.004655s : 52.91% event_method : 0.000011s : 0.13% auto_monad : 0.000053s : 0.61% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000023s : 0.26% insert-virtual-dataset : 0.000002s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000016s : 0.18% optimize.rewriter_before_opt_a : 0.000043s : 0.49% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000032s : 0.37% optimize.opt_a.loop_unroll : 0.000020s : 0.22% optimize.opt_a.a_1 : 0.000384s : 4.36% optimize.opt_a.with_stream_mark : 0.000025s : 0.28% optimize.opt_a.recompute_prepare : 0.000014s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000147s : 1.68% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.14% optimize.opt_a.shard : 0.000003s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.14% optimize.opt_a.merge_send_recv : 0.000013s : 0.15% optimize.opt_a.auto_parallel : 0.000011s : 0.13% optimize.opt_a.parallel : 0.000022s : 0.25% optimize.opt_a.flash_sp : 0.000011s : 0.13% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000006s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.13% optimize.opt_a.virtual_output : 0.000011s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000019s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000018s : 0.21% optimize.opt_a.a_after_grad : 0.000017s : 0.19% optimize.opt_a.renormalize : 0.000455s : 5.17% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.24% optimize.opt_a.cse : 0.000043s : 0.49% optimize.opt_a.a_3 : 0.000079s : 0.90% optimize.py_interpret_to_execute_after_opt_a : 0.000007s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.03% optimize.rewriter_after_opt_a : 0.000034s : 0.38% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.06% optimize.mutable_eliminate : 0.000456s : 5.18% optimize.opt_b.b_1 : 0.000132s : 1.50% optimize.opt_b.b_2 : 0.000008s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000018s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.03% optimize.cconv : 0.000023s : 0.27% optimize.loop_unroll : 0.000430s : 4.89% optimize.opt_after_cconv.c_1 : 0.000027s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.16% optimize.tuple_transform.d_1 : 0.000037s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000046s : 0.53% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.06% optimize.bias_add_comm_swap : 0.000002s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.02% optimize.slice_recompute_activation : 0.000002s : 0.03% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.02% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.02% optimize.overlap_recompute_comm : 0.000002s : 0.03% optimize.overlap_grad_ring_attention : 0.000004s : 0.05% optimize.overlap_grad_flash_sp : 0.000018s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000016s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.05% opt_after_jit_grad : 0.000461s : 5.23% validate : 0.000037s : 0.42% Time group info: ------[substitution.] 0.000111 20 1.85% : 0.000002s : 2: substitution.elim_not_effective 1.15% : 0.000001s : 2: substitution.fold_const_symbol 4.92% : 0.000005s : 3: substitution.graph_param_transform 74.01% : 0.000082s : 2: substitution.inline 2.82% : 0.000003s : 4: substitution.j_node_and_user_rematch 4.08% : 0.000005s : 4: substitution.remove_not_recompute_node 2.86% : 0.000003s : 2: substitution.replace_old_param 8.32% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004614 2 91.30% : 0.004213s : 1: type_inference.infer 8.70% : 0.000401s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000080 2 100.00% : 0.000080s : 2: match.inline ------[predicate.] 0.000131 754 0.77% : 0.000001s : 7: predicate.accumulaten_eliminater 1.20% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.70% : 0.000001s : 6: predicate.addn_check_dump 1.00% : 0.000001s : 7: predicate.addn_zero_filter 0.67% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.09% : 0.000003s : 13: predicate.arithmetic_simplify 0.82% : 0.000001s : 7: predicate.cast_eliminate 0.74% : 0.000001s : 6: predicate.check_bprop_eliminate 0.71% : 0.000001s : 6: predicate.compare_switch_simplify 0.24% : 0.000000s : 3: predicate.const_output_eliminate 0.95% : 0.000001s : 6: predicate.depend_value_elim 0.77% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.96% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.80% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.50% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 3: predicate.elim_not_effective 0.63% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000002s : 10: predicate.environ_add_const_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.05% : 0.000001s : 10: predicate.environ_get_depend_swap 1.73% : 0.000002s : 16: predicate.environ_get_eliminate 1.00% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.92% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.82% : 0.000002s : 9: predicate.float_depend_g_call 0.71% : 0.000001s : 6: predicate.float_environ_get_switch 0.95% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 3: predicate.fold_const_symbol 0.80% : 0.000001s : 6: predicate.get_grad_eliminate 0.26% : 0.000000s : 3: predicate.graph_param_transform 0.83% : 0.000001s : 6: predicate.incorporate_call 0.67% : 0.000001s : 6: predicate.incorporate_call_switch 6.31% : 0.000008s : 34: predicate.inline 0.98% : 0.000001s : 6: predicate.inline_without_move 0.40% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.10% : 0.000001s : 6: predicate.less_batch_normalization 1.60% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.01% : 0.000003s : 20: predicate.load_eliminater 1.23% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.82% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.76% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.74% : 0.000001s : 6: predicate.merge_addn 0.70% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.66% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.70% : 0.000001s : 7: predicate.minmaximum_grad 1.36% : 0.000002s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.57% : 0.000001s : 3: predicate.parallel_virtual_node 1.26% : 0.000002s : 9: predicate.partial_defer_inline 1.28% : 0.000002s : 10: predicate.partial_eliminate 0.75% : 0.000001s : 7: predicate.print_const_string_wrapper 0.72% : 0.000001s : 6: predicate.reduce_all_const_elim 1.29% : 0.000002s : 7: predicate.reduce_eliminate 2.15% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.61% : 0.000001s : 6: predicate.remove_not_recompute_node 1.26% : 0.000002s : 13: predicate.replace_applicator 0.73% : 0.000001s : 6: predicate.replace_old_param 0.37% : 0.000000s : 3: predicate.reset_defer_inline 0.88% : 0.000001s : 7: predicate.reshape_eliminate 0.81% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.58% : 0.000001s : 3: predicate.row_tensor_eliminate 0.91% : 0.000001s : 6: predicate.same_eliminate 0.54% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.97% : 0.000001s : 6: predicate.shard_identity_eliminate 0.93% : 0.000001s : 6: predicate.special_op_eliminate 0.87% : 0.000001s : 6: predicate.specialize_transform 1.13% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.07% : 0.000001s : 9: predicate.switch_defer_inline 1.63% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.60% : 0.000006s : 32: predicate.switch_simplify 0.76% : 0.000001s : 7: predicate.tile_eliminate 0.84% : 0.000001s : 7: predicate.transpose_eliminate 1.54% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.43% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.45% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.53% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.94% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 3.19% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.64% : 0.000001s : 3: predicate.value_based_eliminate 0.80% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.84% : 0.000001s : 6: predicate.virtual_output_eliminate 0.34% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.60% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000193 5 8.03% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.97% : 0.000178s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.021120 192 0.02% : 0.000003s : 1: ForceFp32Comm 14.65% : 0.003095s : 1: add_attr 14.61% : 0.003085s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000050s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000058s : 1: auto_monad 0.09% : 0.000020s : 1: auto_monad_reorder 0.02% : 0.000004s : 1: begin_end_overlap_inline 0.03% : 0.000005s : 1: bias_add_comm_swap 2.18% : 0.000460s : 1: bootstrap 0.13% : 0.000027s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000015s : 1: control_data_broadcast_order 0.05% : 0.000010s : 1: convert_after_rewriter 0.12% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000008s : 1: environ_conv 0.08% : 0.000016s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000005s : 1: label_fine_grained_interleaved_index 0.04% : 0.000007s : 1: label_micro_interleaved_index 2.07% : 0.000438s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.20% : 0.000464s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000013s : 1: opt.transform.mutable_eliminate 3.49% : 0.000737s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.12% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.51% : 0.000108s : 28: opt.transform.opt_b 0.20% : 0.000042s : 2: opt.transform.opt_trans_graph 0.17% : 0.000035s : 4: opt.transform.symbol_engine_opt 9.46% : 0.001998s : 1: opt_a 0.47% : 0.000100s : 1: opt_after_cconv 2.22% : 0.000470s : 1: opt_after_jit_grad 1.01% : 0.000214s : 1: opt_b 18.37% : 0.003880s : 1: optimize 0.09% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000021s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.13% : 0.000027s : 1: pre_auto_parallel 0.09% : 0.000019s : 1: py_interpret_to_execute 0.05% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000017s : 1: remove_dup_value 1.19% : 0.000252s : 1: renormalize.infer 0.93% : 0.000196s : 1: renormalize.specialize 0.03% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000038s : 1: rewriter_after_opt_a 0.22% : 0.000047s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000076s : 1: symbol_engine_optimizer 0.34% : 0.000072s : 1: tuple_transform 22.10% : 0.004668s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:50.845.778 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:50.846.045 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0150802, [21] [bootstrap]: 0.00046682 [type_inference]: 0.00484436 [event_method]: 1.257e-05 [auto_monad]: 5.772e-05 [graph_reusing]: 5.29e-06 [inline]: 1.94e-06 [add_attr]: 0.00309238, [1] [add_attr_with_inline]: 0.00308418, [1] [Cycle 1]: 6.537e-05, [2] [tag_attr]: 1.505e-05 [meta_addattr_fg_expand]: 3.61999e-06 [parallel-infer-symbol]: 2.88e-06 [pre_auto_parallel]: 2.597e-05 [insert-virtual-dataset]: 2.86e-06 [parallel-infer-symbol-second]: 7.00005e-07 [dataset_repeat_opt]: 2.14999e-06 [pipeline_split]: 1.76e-06 [optimize]: 0.00539748, [53] [py_interpret_to_execute]: 2.113e-05 [rewriter_before_opt_a]: 5.333e-05 [opt_a]: 0.00286686, [2] [Cycle 1]: 0.00194311, [45] [expand_dump_flag]: 2.79999e-06 [switch_simplify]: 2.653e-05 [loop_unroll]: 1.529e-05 [a_1]: 0.00034485 [with_stream_mark]: 1.635e-05 [recompute_prepare]: 1.01e-05 [updatestate_depend_eliminate]: 4.48999e-06 [updatestate_assign_eliminate]: 4.58001e-06 [updatestate_loads_eliminate]: 4.48999e-06 [parameter_eliminate]: 1.97001e-06 [a_2]: 0.00012638 [accelerated_algorithm]: 7.83001e-06 [shard]: 2.37001e-06 [meta_shard_fg_expand]: 2.26e-06 [shard_inline]: 7.88001e-06 [merge_send_recv]: 9.67999e-06 [auto_parallel]: 7.08e-06 [parallel]: 1.768e-05 [flash_sp]: 8.94003e-06 [merge_comm]: 4.69002e-06 [allreduce_fusion]: 4.94998e-06 [matmul_add_comm_reduction]: 1.085e-05 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 8.65999e-06 [virtual_dataset]: 7.61999e-06 [get_grad_eliminate_]: 7.47998e-06 [virtual_output]: 7.28999e-06 [merge_forward]: 4.27998e-06 [cell_reuse_recompute_pass]: 1.23002e-06 [offload_activation]: 1.157e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.704e-05 [merge_recompute_call_nodes]: 1.39998e-06 [before_grad]: 1.3e-05 [set_forward_comm_id_for_comm_node_pass]: 4.89e-06 [meta_fg_expand]: 3.39001e-06 [flash_sp_send_recv_attached]: 2.49001e-06 [receive_attached]: 1.96e-06 [after_resolve]: 1.107e-05 [a_after_grad]: 1.11e-05 [renormalize]: 0.00067254 [add_forward_monad_depend]: 5.12999e-06 [auto_monad_grad]: 2.32999e-06 [auto_monad_eliminator]: 1.644e-05 [cse]: 3.595e-05 [a_3]: 7.154e-05 [Cycle 2]: 0.00091031, [45] [expand_dump_flag]: 1.41998e-06 [switch_simplify]: 9.07999e-06 [loop_unroll]: 7.55998e-06 [a_1]: 0.00015381 [with_stream_mark]: 1.6e-05 [recompute_prepare]: 7.79002e-06 [updatestate_depend_eliminate]: 4.53999e-06 [updatestate_assign_eliminate]: 3.03e-06 [updatestate_loads_eliminate]: 2.96999e-06 [parameter_eliminate]: 1.00999e-06 [a_2]: 0.00011679 [accelerated_algorithm]: 7.18e-06 [shard]: 1.04e-06 [meta_shard_fg_expand]: 1.77001e-06 [shard_inline]: 7.38e-06 [merge_send_recv]: 6.12999e-06 [auto_parallel]: 6.58e-06 [parallel]: 5.12999e-06 [flash_sp]: 3.93999e-06 [merge_comm]: 4.23999e-06 [allreduce_fusion]: 4.55001e-06 [matmul_add_comm_reduction]: 7.68999e-06 [allreduce_slice_to_reducescatter]: 5.3001e-07 [virtual_shard_identity]: 8.29998e-06 [virtual_dataset]: 6.86999e-06 [get_grad_eliminate_]: 6.62002e-06 [virtual_output]: 6.73e-06 [merge_forward]: 4.07998e-06 [cell_reuse_recompute_pass]: 1.37999e-06 [offload_activation]: 8.43001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.686e-05 [merge_recompute_call_nodes]: 1.09e-06 [before_grad]: 1.236e-05 [set_forward_comm_id_for_comm_node_pass]: 4.43001e-06 [meta_fg_expand]: 2.81e-06 [flash_sp_send_recv_attached]: 1.35001e-06 [receive_attached]: 1.09e-06 [after_resolve]: 9.81e-06 [a_after_grad]: 1.029e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.30999e-06 [auto_monad_grad]: 1.10001e-06 [auto_monad_eliminator]: 9.32001e-06 [cse]: 1.963e-05 [a_3]: 5.623e-05 [py_interpret_to_execute_after_opt_a]: 1.424e-05 [slice_cell_reuse_recomputed_activation]: 4.48999e-06 [rewriter_after_opt_a]: 4.652e-05 [convert_after_rewriter]: 1.105e-05 [order_py_execute_after_rewriter]: 9.42999e-06 [mutable_eliminate]: 0.00051323 [opt_b]: 0.00046791, [1] [Cycle 1]: 0.00045765, [7] [b_1]: 0.00034306 [b_2]: 9.47999e-06 [updatestate_depend_eliminate]: 8.16002e-06 [updatestate_assign_eliminate]: 3.37002e-06 [updatestate_loads_eliminate]: 3.85e-06 [renormalize]: 9.30013e-07 [cse]: 2.772e-05 [optimize_parallel_all_gather_comm]: 2.167e-05 [overlap_param_gather]: 5.35999e-06 [cconv]: 3.166e-05 [loop_unroll]: 0.00046301 [opt_after_cconv]: 0.00013953, [1] [Cycle 1]: 0.0001312, [7] [c_1]: 3.427e-05 [parameter_eliminate]: 3.08e-06 [updatestate_depend_eliminate]: 6.55997e-06 [updatestate_assign_eliminate]: 3.14001e-06 [updatestate_loads_eliminate]: 3.51999e-06 [cse]: 2.383e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 2.071e-05 [tuple_transform]: 9.707e-05, [1] [Cycle 1]: 9e-05, [4] [d_1]: 4.778e-05 [none_parameter_eliminate]: 1.76e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 8.55999e-06 [partial_unused_args_eliminate]: 4.82e-06 [add_recomputation]: 5.985e-05 [cse_after_recomputation]: 3.158e-05, [1] [Cycle 1]: 2.479e-05, [1] [cse]: 1.598e-05 [environ_conv]: 1.04e-05 [swap_dp_allreduce_reducescatter]: 9.21998e-06 [bias_add_comm_swap]: 5.79e-06 [label_micro_interleaved_index]: 7.63001e-06 [label_fine_grained_interleaved_index]: 5.69999e-06 [merge_cast_opt]: 4.38001e-06 [slice_recompute_activation]: 4.90999e-06 [micro_interleaved_order_control]: 4.87e-06 [assign_add_opt]: 3.76999e-06 [ForceFp32Comm]: 3.26999e-06 [remove_cast_before_assign_add]: 3.33998e-06 [full_micro_interleaved_order_control]: 4.53001e-06 [reorder_send_recv_between_fp_bp]: 5.10999e-06 [comm_op_add_attrs]: 3.86001e-06 [add_comm_op_reuse_tag]: 3.18998e-06 [interleave_split_concat_branches]: 3.51001e-06 [interleave_parallel_branches]: 3.91999e-06 [overlap_opt_shard_in_pipeline]: 3.61001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.40999e-06 [control_data_broadcast_order]: 1.765e-05 [grouped_pairwise_exchange_alltoall]: 4.00998e-06 [offloading_packed_experts]: 7.03e-06 [overlap_recompute_and_grad_model_parallel]: 7.9e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.64002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.63999e-06 [overlap_recompute_comm]: 4.84e-06 [overlap_grad_ring_attention]: 7.17002e-06 [overlap_grad_flash_sp]: 2.564e-05 [begin_end_overlap_inline]: 3.01999e-06 [split_matmul_comm_elemetwise]: 4.37e-06 [split_layernorm_comm]: 4.25e-06 [handle_group_info]: 3.79002e-06 [symbol_engine_optimizer]: 0.00010512, [1] [Cycle 1]: 9.856e-05, [6] [build]: 2.73e-06 [elim_shapecalc]: 1.175e-05 [elim_not_effective]: 1.582e-05 [opt_reshape]: 8.08001e-06 [fold_const_symbol]: 1.222e-05 [renormalize]: 2.30008e-07 [detach_backward]: 3.41999e-06 [pipeline_parallel_scheduler]: 1.75001e-06 [auto_monad_reorder]: 2.207e-05 [get_jit_bprop_graph]: 1.59e-06 [rewriter_after_jit_bprop_graph]: 4.89e-06 [opt_after_jit_grad]: 0.00049857 [validate]: 4.239e-05 Sums bootstrap : 0.000467s : 4.57% type_inference : 0.004844s : 47.40% event_method : 0.000013s : 0.12% auto_monad : 0.000058s : 0.56% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000026s : 0.25% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000021s : 0.21% optimize.rewriter_before_opt_a : 0.000053s : 0.52% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000036s : 0.35% optimize.opt_a.loop_unroll : 0.000023s : 0.22% optimize.opt_a.a_1 : 0.000499s : 4.88% optimize.opt_a.with_stream_mark : 0.000032s : 0.32% optimize.opt_a.recompute_prepare : 0.000018s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000243s : 2.38% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.15% optimize.opt_a.merge_send_recv : 0.000016s : 0.15% optimize.opt_a.auto_parallel : 0.000014s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.22% optimize.opt_a.flash_sp : 0.000013s : 0.13% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.17% optimize.opt_a.virtual_dataset : 0.000014s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.14% optimize.opt_a.virtual_output : 0.000014s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.20% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000021s : 0.20% optimize.opt_a.a_after_grad : 0.000021s : 0.21% optimize.opt_a.renormalize : 0.000673s : 6.58% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.25% optimize.opt_a.cse : 0.000056s : 0.54% optimize.opt_a.a_3 : 0.000128s : 1.25% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.04% optimize.rewriter_after_opt_a : 0.000047s : 0.46% optimize.convert_after_rewriter : 0.000011s : 0.11% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000513s : 5.02% optimize.opt_b.b_1 : 0.000343s : 3.36% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.04% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000028s : 0.27% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000032s : 0.31% optimize.loop_unroll : 0.000463s : 4.53% optimize.opt_after_cconv.c_1 : 0.000034s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000021s : 0.20% optimize.tuple_transform.d_1 : 0.000048s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000060s : 0.59% optimize.cse_after_recomputation.cse : 0.000016s : 0.16% optimize.environ_conv : 0.000010s : 0.10% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.09% optimize.bias_add_comm_swap : 0.000006s : 0.06% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.06% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000026s : 0.25% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000022s : 0.22% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000499s : 4.88% validate : 0.000042s : 0.41% Time group info: ------[substitution.] 0.000171 29 11.77% : 0.000020s : 2: substitution.cast_eliminate 1.35% : 0.000002s : 3: substitution.elim_not_effective 1.01% : 0.000002s : 3: substitution.fold_const_symbol 3.71% : 0.000006s : 4: substitution.graph_param_transform 57.41% : 0.000098s : 2: substitution.inline 2.50% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.47% : 0.000006s : 6: substitution.remove_not_recompute_node 2.00% : 0.000003s : 2: substitution.replace_old_param 16.78% : 0.000029s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004798 2 90.74% : 0.004354s : 1: type_inference.infer 9.26% : 0.000444s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000097 2 100.00% : 0.000097s : 2: match.inline ------[predicate.] 0.000170 980 0.78% : 0.000001s : 9: predicate.accumulaten_eliminater 0.96% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.70% : 0.000001s : 8: predicate.addn_check_dump 0.77% : 0.000001s : 9: predicate.addn_zero_filter 0.76% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.18% : 0.000004s : 17: predicate.arithmetic_simplify 1.15% : 0.000002s : 9: predicate.cast_eliminate 0.86% : 0.000001s : 8: predicate.check_bprop_eliminate 0.74% : 0.000001s : 8: predicate.compare_switch_simplify 0.25% : 0.000000s : 4: predicate.const_output_eliminate 0.89% : 0.000002s : 8: predicate.depend_value_elim 0.79% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.94% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.80% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.22% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.32% : 0.000001s : 4: predicate.elim_not_effective 0.55% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.08% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.05% : 0.000002s : 13: predicate.environ_get_depend_swap 1.85% : 0.000003s : 21: predicate.environ_get_eliminate 1.03% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.96% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.89% : 0.000003s : 11: predicate.float_depend_g_call 0.68% : 0.000001s : 8: predicate.float_environ_get_switch 1.00% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.24% : 0.000000s : 4: predicate.fold_const_symbol 0.78% : 0.000001s : 8: predicate.get_grad_eliminate 0.27% : 0.000000s : 4: predicate.graph_param_transform 0.80% : 0.000001s : 8: predicate.incorporate_call 0.69% : 0.000001s : 8: predicate.incorporate_call_switch 6.43% : 0.000011s : 44: predicate.inline 1.04% : 0.000002s : 8: predicate.inline_without_move 0.37% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.99% : 0.000002s : 8: predicate.less_batch_normalization 1.75% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.13% : 0.000004s : 26: predicate.load_eliminater 1.17% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.40% : 0.000002s : 16: predicate.loop_unroll_before_grad 1.88% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.79% : 0.000001s : 8: predicate.merge_addn 0.74% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.77% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.72% : 0.000001s : 9: predicate.minmaximum_grad 1.21% : 0.000002s : 4: predicate.mutable_eliminate 0.47% : 0.000001s : 4: predicate.opt_reshape 0.42% : 0.000001s : 4: predicate.parallel_virtual_node 1.23% : 0.000002s : 11: predicate.partial_defer_inline 1.35% : 0.000002s : 13: predicate.partial_eliminate 0.75% : 0.000001s : 9: predicate.print_const_string_wrapper 0.74% : 0.000001s : 8: predicate.reduce_all_const_elim 1.16% : 0.000002s : 9: predicate.reduce_eliminate 2.19% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 8: predicate.remove_not_recompute_node 1.15% : 0.000002s : 17: predicate.replace_applicator 0.65% : 0.000001s : 8: predicate.replace_old_param 0.34% : 0.000001s : 4: predicate.reset_defer_inline 0.89% : 0.000002s : 9: predicate.reshape_eliminate 0.76% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 4: predicate.row_tensor_eliminate 0.89% : 0.000002s : 8: predicate.same_eliminate 0.54% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.85% : 0.000001s : 8: predicate.shard_identity_eliminate 0.96% : 0.000002s : 8: predicate.special_op_eliminate 1.06% : 0.000002s : 8: predicate.specialize_transform 1.10% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.91% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.00% : 0.000002s : 11: predicate.switch_defer_inline 1.65% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.08% : 0.000007s : 39: predicate.switch_simplify 0.84% : 0.000001s : 9: predicate.tile_eliminate 0.84% : 0.000001s : 9: predicate.transpose_eliminate 1.63% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.39% : 0.000006s : 25: predicate.tuple_list_get_item_eliminator 1.47% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.45% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.52% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.19% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.96% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.91% : 0.000002s : 4: predicate.value_based_eliminate 0.99% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.85% : 0.000001s : 8: predicate.virtual_output_eliminate 0.34% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000220 5 8.81% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.19% : 0.000200s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025444 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.19% : 0.003102s : 1: add_attr 12.14% : 0.003088s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.25% : 0.000064s : 1: add_recomputation 0.03% : 0.000007s : 1: assign_add_opt 0.26% : 0.000066s : 1: auto_monad 0.11% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 2.01% : 0.000511s : 1: bootstrap 0.14% : 0.000035s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000021s : 1: control_data_broadcast_order 0.06% : 0.000015s : 1: convert_after_rewriter 0.14% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000019s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.09% : 0.000022s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000007s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.84% : 0.000469s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.04% : 0.000520s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.07% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 3.78% : 0.000961s : 78: opt.transform.opt_a 0.13% : 0.000033s : 1: opt.transform.opt_after_cconv 0.11% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.71% : 0.000180s : 28: opt.transform.opt_b 0.21% : 0.000054s : 2: opt.transform.opt_trans_graph 0.17% : 0.000044s : 4: opt.transform.symbol_engine_opt 11.28% : 0.002870s : 1: opt_a 0.56% : 0.000143s : 1: opt_after_cconv 2.00% : 0.000509s : 1: opt_after_jit_grad 1.86% : 0.000472s : 1: opt_b 22.63% : 0.005759s : 1: optimize 0.10% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000029s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000033s : 1: pre_auto_parallel 0.10% : 0.000024s : 1: py_interpret_to_execute 0.07% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000024s : 1: remove_dup_value 1.66% : 0.000423s : 1: renormalize.infer 0.95% : 0.000242s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000050s : 1: rewriter_after_opt_a 0.22% : 0.000057s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000108s : 1: symbol_engine_optimizer 0.39% : 0.000100s : 1: tuple_transform 19.14% : 0.004870s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:51.375.55 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.014077, [21] [bootstrap]: 0.00044632 [type_inference]: 0.00504814 [event_method]: 1.306e-05 [auto_monad]: 5.745e-05 [graph_reusing]: 5.15001e-06 [inline]: 2.19001e-06 [add_attr]: 0.00316468, [1] [add_attr_with_inline]: 0.00315552, [1] [Cycle 1]: 5.705e-05, [2] [tag_attr]: 1.51e-05 [meta_addattr_fg_expand]: 3.83001e-06 [parallel-infer-symbol]: 3.13e-06 [pre_auto_parallel]: 2.987e-05 [insert-virtual-dataset]: 2.45002e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 1.78002e-06 [pipeline_split]: 1.74e-06 [optimize]: 0.00460147, [53] [py_interpret_to_execute]: 1.968e-05 [rewriter_before_opt_a]: 5.018e-05 [opt_a]: 0.00247869, [2] [Cycle 1]: 0.00174059, [45] [expand_dump_flag]: 3.14999e-06 [switch_simplify]: 2.683e-05 [loop_unroll]: 1.437e-05 [a_1]: 0.00035083 [with_stream_mark]: 1.525e-05 [recompute_prepare]: 9.67999e-06 [updatestate_depend_eliminate]: 4.66002e-06 [updatestate_assign_eliminate]: 5.09e-06 [updatestate_loads_eliminate]: 3.96001e-06 [parameter_eliminate]: 1.72001e-06 [a_2]: 9.599e-05 [accelerated_algorithm]: 7.9e-06 [shard]: 2.41998e-06 [meta_shard_fg_expand]: 1.89999e-06 [shard_inline]: 7.45e-06 [merge_send_recv]: 9.10001e-06 [auto_parallel]: 8.68001e-06 [parallel]: 1.869e-05 [flash_sp]: 8.37e-06 [merge_comm]: 4.50001e-06 [allreduce_fusion]: 4.47e-06 [matmul_add_comm_reduction]: 9.99999e-06 [allreduce_slice_to_reducescatter]: 7.49977e-07 [virtual_shard_identity]: 9.41e-06 [virtual_dataset]: 8.21002e-06 [get_grad_eliminate_]: 7.17002e-06 [virtual_output]: 7.9e-06 [merge_forward]: 4.80999e-06 [cell_reuse_recompute_pass]: 1.29998e-06 [offload_activation]: 1.14e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.554e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.23e-05 [set_forward_comm_id_for_comm_node_pass]: 4.72e-06 [meta_fg_expand]: 3.09999e-06 [flash_sp_send_recv_attached]: 2.45002e-06 [receive_attached]: 2.16e-06 [after_resolve]: 1.13e-05 [a_after_grad]: 1.194e-05 [renormalize]: 0.00064853 [add_forward_monad_depend]: 5.20999e-06 [auto_monad_grad]: 2.24001e-06 [auto_monad_eliminator]: 1.692e-05 [cse]: 3.734e-05 [a_3]: 5.628e-05 [Cycle 2]: 0.00072777, [45] [expand_dump_flag]: 1.35001e-06 [switch_simplify]: 8.70001e-06 [loop_unroll]: 7.25e-06 [a_1]: 0.00015281 [with_stream_mark]: 1.66e-05 [recompute_prepare]: 8.32e-06 [updatestate_depend_eliminate]: 4.58999e-06 [updatestate_assign_eliminate]: 3.10998e-06 [updatestate_loads_eliminate]: 3.06001e-06 [parameter_eliminate]: 1.14e-06 [a_2]: 8.647e-05 [accelerated_algorithm]: 6.88e-06 [shard]: 1.23002e-06 [meta_shard_fg_expand]: 1.96998e-06 [shard_inline]: 7.38e-06 [merge_send_recv]: 6.58e-06 [auto_parallel]: 6.44999e-06 [parallel]: 5.84e-06 [flash_sp]: 4.45e-06 [merge_comm]: 4.27e-06 [allreduce_fusion]: 4.08999e-06 [matmul_add_comm_reduction]: 7.45998e-06 [allreduce_slice_to_reducescatter]: 5.09986e-07 [virtual_shard_identity]: 8.23001e-06 [virtual_dataset]: 6.68e-06 [get_grad_eliminate_]: 6.56999e-06 [virtual_output]: 6.48e-06 [merge_forward]: 3.58e-06 [cell_reuse_recompute_pass]: 1.47001e-06 [offload_activation]: 7.65e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.348e-05 [merge_recompute_call_nodes]: 9.89996e-07 [before_grad]: 1.144e-05 [set_forward_comm_id_for_comm_node_pass]: 4.37e-06 [meta_fg_expand]: 2.75997e-06 [flash_sp_send_recv_attached]: 8.59989e-07 [receive_attached]: 1.22e-06 [after_resolve]: 9.73998e-06 [a_after_grad]: 1.023e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.52001e-06 [auto_monad_grad]: 1.52999e-06 [auto_monad_eliminator]: 9.00999e-06 [cse]: 2.035e-05 [a_3]: 4.33e-05 [py_interpret_to_execute_after_opt_a]: 1.123e-05 [slice_cell_reuse_recomputed_activation]: 2.38998e-06 [rewriter_after_opt_a]: 4.187e-05 [convert_after_rewriter]: 6.93998e-06 [order_py_execute_after_rewriter]: 6.21e-06 [mutable_eliminate]: 0.00051789 [opt_b]: 0.00027716, [1] [Cycle 1]: 0.00027059, [7] [b_1]: 0.00018404 [b_2]: 9.00999e-06 [updatestate_depend_eliminate]: 7.48e-06 [updatestate_assign_eliminate]: 3.21999e-06 [updatestate_loads_eliminate]: 2.89001e-06 [renormalize]: 3.10014e-07 [cse]: 2.577e-05 [optimize_parallel_all_gather_comm]: 1.784e-05 [overlap_param_gather]: 2.01e-06 [cconv]: 2.743e-05 [loop_unroll]: 0.00043003 [opt_after_cconv]: 0.00011434, [1] [Cycle 1]: 0.00010834, [7] [c_1]: 3.41e-05 [parameter_eliminate]: 3.18998e-06 [updatestate_depend_eliminate]: 6.38998e-06 [updatestate_assign_eliminate]: 3.23e-06 [updatestate_loads_eliminate]: 2.92002e-06 [cse]: 2.331e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 1.75e-05 [tuple_transform]: 8.023e-05, [1] [Cycle 1]: 7.576e-05, [4] [d_1]: 4.758e-05 [none_parameter_eliminate]: 1.90001e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 8.15e-06 [partial_unused_args_eliminate]: 2.02999e-06 [add_recomputation]: 5.661e-05 [cse_after_recomputation]: 2.664e-05, [1] [Cycle 1]: 2.195e-05, [1] [cse]: 1.593e-05 [environ_conv]: 6.96001e-06 [swap_dp_allreduce_reducescatter]: 5.74e-06 [bias_add_comm_swap]: 2.74001e-06 [label_micro_interleaved_index]: 5.51e-06 [label_fine_grained_interleaved_index]: 3.3e-06 [merge_cast_opt]: 1.41002e-06 [slice_recompute_activation]: 2.24999e-06 [micro_interleaved_order_control]: 2.32001e-06 [assign_add_opt]: 1.19998e-06 [ForceFp32Comm]: 7.79983e-07 [remove_cast_before_assign_add]: 1.01002e-06 [full_micro_interleaved_order_control]: 2.22001e-06 [reorder_send_recv_between_fp_bp]: 2.93e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 9.99979e-07 [overlap_opt_shard_in_pipeline]: 1.18001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.81e-06 [control_data_broadcast_order]: 1.463e-05 [grouped_pairwise_exchange_alltoall]: 2.09e-06 [offloading_packed_experts]: 4.45999e-06 [overlap_recompute_and_grad_model_parallel]: 5.25001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40001e-06 [overlap_recompute_comm]: 2.54999e-06 [overlap_grad_ring_attention]: 4.64998e-06 [overlap_grad_flash_sp]: 2.141e-05 [begin_end_overlap_inline]: 5.70028e-07 [split_matmul_comm_elemetwise]: 2.54999e-06 [split_layernorm_comm]: 1.72999e-06 [handle_group_info]: 1.25001e-06 [symbol_engine_optimizer]: 8.34e-05, [1] [Cycle 1]: 7.909e-05, [6] [build]: 3.07002e-06 [elim_shapecalc]: 1.056e-05 [elim_not_effective]: 1.549e-05 [opt_reshape]: 8.35001e-06 [fold_const_symbol]: 1.213e-05 [renormalize]: 2.09984e-07 [detach_backward]: 2.07999e-06 [pipeline_parallel_scheduler]: 1.48002e-06 [auto_monad_reorder]: 2.058e-05 [get_jit_bprop_graph]: 1.58002e-06 [rewriter_after_jit_bprop_graph]: 5.02e-06 [opt_after_jit_grad]: 0.0004714 [validate]: 4.175e-05 Sums bootstrap : 0.000446s : 4.49% type_inference : 0.005048s : 50.77% event_method : 0.000013s : 0.13% auto_monad : 0.000057s : 0.58% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000030s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000020s : 0.20% optimize.rewriter_before_opt_a : 0.000050s : 0.50% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000036s : 0.36% optimize.opt_a.loop_unroll : 0.000022s : 0.22% optimize.opt_a.a_1 : 0.000504s : 5.07% optimize.opt_a.with_stream_mark : 0.000032s : 0.32% optimize.opt_a.recompute_prepare : 0.000018s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000182s : 1.84% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.15% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.15% optimize.opt_a.merge_send_recv : 0.000016s : 0.16% optimize.opt_a.auto_parallel : 0.000015s : 0.15% optimize.opt_a.parallel : 0.000025s : 0.25% optimize.opt_a.flash_sp : 0.000013s : 0.13% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.18% optimize.opt_a.virtual_dataset : 0.000015s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.14% optimize.opt_a.virtual_output : 0.000014s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000021s : 0.21% optimize.opt_a.a_after_grad : 0.000022s : 0.22% optimize.opt_a.renormalize : 0.000649s : 6.52% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.26% optimize.opt_a.cse : 0.000058s : 0.58% optimize.opt_a.a_3 : 0.000100s : 1.00% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000042s : 0.42% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000518s : 5.21% optimize.opt_b.b_1 : 0.000184s : 1.85% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000026s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000027s : 0.28% optimize.loop_unroll : 0.000430s : 4.32% optimize.opt_after_cconv.c_1 : 0.000034s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.18% optimize.tuple_transform.d_1 : 0.000048s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000057s : 0.57% optimize.cse_after_recomputation.cse : 0.000016s : 0.16% optimize.environ_conv : 0.000007s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000006s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.03% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000021s : 0.22% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.03% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.16% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000021s : 0.21% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000471s : 4.74% validate : 0.000042s : 0.42% Time group info: ------[substitution.] 0.000174 29 11.70% : 0.000020s : 2: substitution.cast_eliminate 1.28% : 0.000002s : 3: substitution.elim_not_effective 0.99% : 0.000002s : 3: substitution.fold_const_symbol 3.67% : 0.000006s : 4: substitution.graph_param_transform 59.04% : 0.000103s : 2: substitution.inline 2.37% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.24% : 0.000006s : 6: substitution.remove_not_recompute_node 1.80% : 0.000003s : 2: substitution.replace_old_param 15.90% : 0.000028s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004998 2 90.88% : 0.004542s : 1: type_inference.infer 9.12% : 0.000456s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000101 2 100.00% : 0.000101s : 2: match.inline ------[predicate.] 0.000168 980 0.92% : 0.000002s : 9: predicate.accumulaten_eliminater 1.00% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.68% : 0.000001s : 8: predicate.addn_check_dump 0.83% : 0.000001s : 9: predicate.addn_zero_filter 0.73% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.45% : 0.000004s : 17: predicate.arithmetic_simplify 0.90% : 0.000002s : 9: predicate.cast_eliminate 0.83% : 0.000001s : 8: predicate.check_bprop_eliminate 0.68% : 0.000001s : 8: predicate.compare_switch_simplify 0.24% : 0.000000s : 4: predicate.const_output_eliminate 0.74% : 0.000001s : 8: predicate.depend_value_elim 0.80% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.92% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.78% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.28% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.27% : 0.000000s : 4: predicate.elim_not_effective 0.51% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.08% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.04% : 0.000002s : 13: predicate.environ_get_depend_swap 1.98% : 0.000003s : 21: predicate.environ_get_eliminate 1.01% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.93% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.77% : 0.000003s : 11: predicate.float_depend_g_call 0.68% : 0.000001s : 8: predicate.float_environ_get_switch 0.99% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.27% : 0.000000s : 4: predicate.fold_const_symbol 0.78% : 0.000001s : 8: predicate.get_grad_eliminate 0.24% : 0.000000s : 4: predicate.graph_param_transform 0.82% : 0.000001s : 8: predicate.incorporate_call 0.63% : 0.000001s : 8: predicate.incorporate_call_switch 6.16% : 0.000010s : 44: predicate.inline 1.10% : 0.000002s : 8: predicate.inline_without_move 0.36% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.04% : 0.000002s : 8: predicate.less_batch_normalization 1.60% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.21% : 0.000004s : 26: predicate.load_eliminater 1.06% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.48% : 0.000002s : 16: predicate.loop_unroll_before_grad 1.70% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.72% : 0.000001s : 8: predicate.merge_addn 0.73% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.71% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.74% : 0.000001s : 9: predicate.minmaximum_grad 1.60% : 0.000003s : 4: predicate.mutable_eliminate 0.47% : 0.000001s : 4: predicate.opt_reshape 0.43% : 0.000001s : 4: predicate.parallel_virtual_node 1.15% : 0.000002s : 11: predicate.partial_defer_inline 1.25% : 0.000002s : 13: predicate.partial_eliminate 0.77% : 0.000001s : 9: predicate.print_const_string_wrapper 0.77% : 0.000001s : 8: predicate.reduce_all_const_elim 0.96% : 0.000002s : 9: predicate.reduce_eliminate 2.08% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.53% : 0.000001s : 8: predicate.remove_not_recompute_node 1.16% : 0.000002s : 17: predicate.replace_applicator 0.92% : 0.000002s : 8: predicate.replace_old_param 0.31% : 0.000001s : 4: predicate.reset_defer_inline 1.19% : 0.000002s : 9: predicate.reshape_eliminate 0.75% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 4: predicate.row_tensor_eliminate 1.04% : 0.000002s : 8: predicate.same_eliminate 0.49% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.87% : 0.000001s : 8: predicate.shard_identity_eliminate 0.89% : 0.000001s : 8: predicate.special_op_eliminate 1.08% : 0.000002s : 8: predicate.specialize_transform 1.04% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 1.59% : 0.000003s : 8: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.95% : 0.000002s : 11: predicate.switch_defer_inline 1.70% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.23% : 0.000007s : 39: predicate.switch_simplify 0.78% : 0.000001s : 9: predicate.tile_eliminate 0.78% : 0.000001s : 9: predicate.transpose_eliminate 1.57% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 2.96% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.44% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.69% : 0.000005s : 25: predicate.tuple_list_set_item_eliminator 1.52% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.11% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.97% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.81% : 0.000001s : 4: predicate.value_based_eliminate 0.77% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.85% : 0.000001s : 8: predicate.virtual_output_eliminate 0.37% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000225 5 7.67% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.33% : 0.000208s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023663 192 0.01% : 0.000004s : 1: ForceFp32Comm 13.40% : 0.003170s : 1: add_attr 13.35% : 0.003159s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.26% : 0.000061s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000063s : 1: auto_monad 0.11% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 2.02% : 0.000478s : 1: bootstrap 0.13% : 0.000031s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.13% : 0.000030s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.08% : 0.000019s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.04% : 0.000008s : 1: label_micro_interleaved_index 1.85% : 0.000438s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.23% : 0.000527s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000019s : 1: opt.transform.mutable_eliminate 4.03% : 0.000954s : 78: opt.transform.opt_a 0.14% : 0.000032s : 1: opt.transform.opt_after_cconv 0.12% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.67% : 0.000159s : 28: opt.transform.opt_b 0.23% : 0.000054s : 2: opt.transform.opt_trans_graph 0.18% : 0.000043s : 4: opt.transform.symbol_engine_opt 10.49% : 0.002482s : 1: opt_a 0.50% : 0.000118s : 1: opt_after_cconv 2.03% : 0.000480s : 1: opt_after_jit_grad 1.19% : 0.000281s : 1: opt_b 19.47% : 0.004606s : 1: optimize 0.09% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.14% : 0.000034s : 1: pre_auto_parallel 0.10% : 0.000023s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000021s : 1: remove_dup_value 1.52% : 0.000359s : 1: renormalize.infer 1.19% : 0.000281s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000046s : 1: rewriter_after_opt_a 0.23% : 0.000054s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000086s : 1: symbol_engine_optimizer 0.35% : 0.000083s : 1: tuple_transform 21.41% : 0.005066s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:51.232.758 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:51.233.026 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0152275, [21] [bootstrap]: 0.00045259 [type_inference]: 0.0053439 [event_method]: 1.308e-05 [auto_monad]: 5.735e-05 [graph_reusing]: 5.74999e-06 [inline]: 2.32999e-06 [add_attr]: 0.00311668, [1] [add_attr_with_inline]: 0.00310811, [1] [Cycle 1]: 6.882e-05, [2] [tag_attr]: 1.52e-05 [meta_addattr_fg_expand]: 4.34002e-06 [parallel-infer-symbol]: 3.30998e-06 [pre_auto_parallel]: 2.552e-05 [insert-virtual-dataset]: 2.37999e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 2.12999e-06 [pipeline_split]: 1.67999e-06 [optimize]: 0.0050519, [53] [py_interpret_to_execute]: 2.23e-05 [rewriter_before_opt_a]: 5.216e-05 [opt_a]: 0.00269982, [2] [Cycle 1]: 0.00179725, [45] [expand_dump_flag]: 2.99001e-06 [switch_simplify]: 2.532e-05 [loop_unroll]: 1.537e-05 [a_1]: 0.00033667 [with_stream_mark]: 1.53e-05 [recompute_prepare]: 9.64e-06 [updatestate_depend_eliminate]: 5.22e-06 [updatestate_assign_eliminate]: 4.48999e-06 [updatestate_loads_eliminate]: 3.55e-06 [parameter_eliminate]: 2.18002e-06 [a_2]: 0.00012506 [accelerated_algorithm]: 8.33001e-06 [shard]: 2.39001e-06 [meta_shard_fg_expand]: 2.01998e-06 [shard_inline]: 7.54002e-06 [merge_send_recv]: 8.69e-06 [auto_parallel]: 7.93001e-06 [parallel]: 1.779e-05 [flash_sp]: 7.72002e-06 [merge_comm]: 5.17e-06 [allreduce_fusion]: 4.48999e-06 [matmul_add_comm_reduction]: 1.11e-05 [allreduce_slice_to_reducescatter]: 9.5999e-07 [virtual_shard_identity]: 9.47001e-06 [virtual_dataset]: 7.33999e-06 [get_grad_eliminate_]: 7.02002e-06 [virtual_output]: 7.55e-06 [merge_forward]: 4.62e-06 [cell_reuse_recompute_pass]: 1.53002e-06 [offload_activation]: 1.074e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.64e-05 [merge_recompute_call_nodes]: 1.80001e-06 [before_grad]: 1.332e-05 [set_forward_comm_id_for_comm_node_pass]: 4.48999e-06 [meta_fg_expand]: 3.69002e-06 [flash_sp_send_recv_attached]: 2.75002e-06 [receive_attached]: 2.06998e-06 [after_resolve]: 1.094e-05 [a_after_grad]: 1.143e-05 [renormalize]: 0.0005413 [add_forward_monad_depend]: 4.77e-06 [auto_monad_grad]: 1.92999e-06 [auto_monad_eliminator]: 1.653e-05 [cse]: 3.641e-05 [a_3]: 6.863e-05 [Cycle 2]: 0.00088965, [45] [expand_dump_flag]: 1.01997e-06 [switch_simplify]: 8.63001e-06 [loop_unroll]: 7.46999e-06 [a_1]: 0.0001508 [with_stream_mark]: 1.223e-05 [recompute_prepare]: 7.83001e-06 [updatestate_depend_eliminate]: 4.87e-06 [updatestate_assign_eliminate]: 2.94999e-06 [updatestate_loads_eliminate]: 2.80997e-06 [parameter_eliminate]: 9.5999e-07 [a_2]: 0.0001148 [accelerated_algorithm]: 7.41001e-06 [shard]: 1.64e-06 [meta_shard_fg_expand]: 1.47999e-06 [shard_inline]: 7.32997e-06 [merge_send_recv]: 5.94999e-06 [auto_parallel]: 6.81999e-06 [parallel]: 5.07e-06 [flash_sp]: 3.76001e-06 [merge_comm]: 4.12998e-06 [allreduce_fusion]: 4.2e-06 [matmul_add_comm_reduction]: 7.04001e-06 [allreduce_slice_to_reducescatter]: 5.19998e-07 [virtual_shard_identity]: 7.91001e-06 [virtual_dataset]: 6.96001e-06 [get_grad_eliminate_]: 6.93e-06 [virtual_output]: 6.50002e-06 [merge_forward]: 3.41001e-06 [cell_reuse_recompute_pass]: 1.26002e-06 [offload_activation]: 7.43999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.564e-05 [merge_recompute_call_nodes]: 8.30012e-07 [before_grad]: 1.202e-05 [set_forward_comm_id_for_comm_node_pass]: 4.45e-06 [meta_fg_expand]: 2.71999e-06 [flash_sp_send_recv_attached]: 9.00007e-07 [receive_attached]: 1.33002e-06 [after_resolve]: 1.007e-05 [a_after_grad]: 1.043e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.15001e-06 [auto_monad_grad]: 1.45999e-06 [auto_monad_eliminator]: 8.52998e-06 [cse]: 1.884e-05 [a_3]: 5.623e-05 [py_interpret_to_execute_after_opt_a]: 1.425e-05 [slice_cell_reuse_recomputed_activation]: 4.75001e-06 [rewriter_after_opt_a]: 4.601e-05 [convert_after_rewriter]: 1.024e-05 [order_py_execute_after_rewriter]: 9.01998e-06 [mutable_eliminate]: 0.00052123 [opt_b]: 0.0003347, [1] [Cycle 1]: 0.00032551, [7] [b_1]: 0.00021811 [b_2]: 9.15001e-06 [updatestate_depend_eliminate]: 7.71001e-06 [updatestate_assign_eliminate]: 3.09999e-06 [updatestate_loads_eliminate]: 2.89999e-06 [renormalize]: 4.2998e-07 [cse]: 2.567e-05 [optimize_parallel_all_gather_comm]: 2.042e-05 [overlap_param_gather]: 5.05999e-06 [cconv]: 2.92e-05 [loop_unroll]: 0.00044321 [opt_after_cconv]: 0.00013745, [1] [Cycle 1]: 0.00012857, [7] [c_1]: 3.378e-05 [parameter_eliminate]: 2.80997e-06 [updatestate_depend_eliminate]: 5.92001e-06 [updatestate_assign_eliminate]: 3.23e-06 [updatestate_loads_eliminate]: 3.09001e-06 [cse]: 2.33e-05 [renormalize]: 3.39991e-07 [remove_dup_value]: 2.067e-05 [tuple_transform]: 9.371e-05, [1] [Cycle 1]: 8.658e-05, [4] [d_1]: 4.647e-05 [none_parameter_eliminate]: 1.62001e-06 [renormalize]: 1.70025e-07 [switch_simplify]: 8e-06 [partial_unused_args_eliminate]: 4.84e-06 [add_recomputation]: 5.541e-05 [cse_after_recomputation]: 3.197e-05, [1] [Cycle 1]: 2.487e-05, [1] [cse]: 1.593e-05 [environ_conv]: 9.87999e-06 [swap_dp_allreduce_reducescatter]: 8.64e-06 [bias_add_comm_swap]: 4.79e-06 [label_micro_interleaved_index]: 7.08e-06 [label_fine_grained_interleaved_index]: 5.55001e-06 [merge_cast_opt]: 4.17e-06 [slice_recompute_activation]: 4.82998e-06 [micro_interleaved_order_control]: 4.48999e-06 [assign_add_opt]: 3.64002e-06 [ForceFp32Comm]: 3.12002e-06 [remove_cast_before_assign_add]: 3.5e-06 [full_micro_interleaved_order_control]: 4.63001e-06 [reorder_send_recv_between_fp_bp]: 5.27999e-06 [comm_op_add_attrs]: 3.68999e-06 [add_comm_op_reuse_tag]: 3.21001e-06 [interleave_split_concat_branches]: 3.5e-06 [interleave_parallel_branches]: 3.86999e-06 [overlap_opt_shard_in_pipeline]: 3.69002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.58001e-06 [control_data_broadcast_order]: 1.8e-05 [grouped_pairwise_exchange_alltoall]: 3.91999e-06 [offloading_packed_experts]: 6.98e-06 [overlap_recompute_and_grad_model_parallel]: 7.85998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.50998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.61001e-06 [overlap_recompute_comm]: 4.48001e-06 [overlap_grad_ring_attention]: 7.71999e-06 [overlap_grad_flash_sp]: 2.347e-05 [begin_end_overlap_inline]: 3.08e-06 [split_matmul_comm_elemetwise]: 4.64998e-06 [split_layernorm_comm]: 4.11001e-06 [handle_group_info]: 3.21999e-06 [symbol_engine_optimizer]: 0.00010277, [1] [Cycle 1]: 9.592e-05, [6] [build]: 2.96001e-06 [elim_shapecalc]: 1.082e-05 [elim_not_effective]: 1.493e-05 [opt_reshape]: 8.35001e-06 [fold_const_symbol]: 1.243e-05 [renormalize]: 2.10013e-07 [detach_backward]: 3.93001e-06 [pipeline_parallel_scheduler]: 1.96e-06 [auto_monad_reorder]: 2.122e-05 [get_jit_bprop_graph]: 1.59e-06 [rewriter_after_jit_bprop_graph]: 4.50001e-06 [opt_after_jit_grad]: 0.00049801 [validate]: 4.022e-05 Sums bootstrap : 0.000453s : 4.36% type_inference : 0.005344s : 51.49% event_method : 0.000013s : 0.13% auto_monad : 0.000057s : 0.55% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000026s : 0.25% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000022s : 0.21% optimize.rewriter_before_opt_a : 0.000052s : 0.50% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000034s : 0.33% optimize.opt_a.loop_unroll : 0.000023s : 0.22% optimize.opt_a.a_1 : 0.000487s : 4.70% optimize.opt_a.with_stream_mark : 0.000028s : 0.27% optimize.opt_a.recompute_prepare : 0.000017s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.10% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000240s : 2.31% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.15% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.14% optimize.opt_a.merge_send_recv : 0.000015s : 0.14% optimize.opt_a.auto_parallel : 0.000015s : 0.14% optimize.opt_a.parallel : 0.000023s : 0.22% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.17% optimize.opt_a.virtual_dataset : 0.000014s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000014s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000025s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000021s : 0.20% optimize.opt_a.a_after_grad : 0.000022s : 0.21% optimize.opt_a.renormalize : 0.000541s : 5.22% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.24% optimize.opt_a.cse : 0.000055s : 0.53% optimize.opt_a.a_3 : 0.000125s : 1.20% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000046s : 0.44% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000521s : 5.02% optimize.opt_b.b_1 : 0.000218s : 2.10% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000026s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000029s : 0.28% optimize.loop_unroll : 0.000443s : 4.27% optimize.opt_after_cconv.c_1 : 0.000034s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.22% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000021s : 0.20% optimize.tuple_transform.d_1 : 0.000046s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000055s : 0.53% optimize.cse_after_recomputation.cse : 0.000016s : 0.15% optimize.environ_conv : 0.000010s : 0.10% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000004s : 0.04% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000004s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.07% optimize.overlap_grad_flash_sp : 0.000023s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000021s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000498s : 4.80% validate : 0.000040s : 0.39% Time group info: ------[substitution.] 0.000159 29 11.67% : 0.000019s : 2: substitution.cast_eliminate 1.34% : 0.000002s : 3: substitution.elim_not_effective 1.24% : 0.000002s : 3: substitution.fold_const_symbol 4.14% : 0.000007s : 4: substitution.graph_param_transform 59.01% : 0.000094s : 2: substitution.inline 2.97% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.45% : 0.000005s : 6: substitution.remove_not_recompute_node 2.11% : 0.000003s : 2: substitution.replace_old_param 14.07% : 0.000022s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005295 2 91.00% : 0.004819s : 1: type_inference.infer 9.00% : 0.000477s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000092 2 100.00% : 0.000092s : 2: match.inline ------[predicate.] 0.000169 980 0.83% : 0.000001s : 9: predicate.accumulaten_eliminater 1.38% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.67% : 0.000001s : 8: predicate.addn_check_dump 0.89% : 0.000002s : 9: predicate.addn_zero_filter 0.68% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.13% : 0.000004s : 17: predicate.arithmetic_simplify 1.22% : 0.000002s : 9: predicate.cast_eliminate 0.78% : 0.000001s : 8: predicate.check_bprop_eliminate 0.71% : 0.000001s : 8: predicate.compare_switch_simplify 0.24% : 0.000000s : 4: predicate.const_output_eliminate 0.83% : 0.000001s : 8: predicate.depend_value_elim 0.83% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.94% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.80% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.22% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 4: predicate.elim_not_effective 0.49% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.06% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.04% : 0.000002s : 13: predicate.environ_get_depend_swap 1.83% : 0.000003s : 21: predicate.environ_get_eliminate 1.01% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.92% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.74% : 0.000003s : 11: predicate.float_depend_g_call 0.74% : 0.000001s : 8: predicate.float_environ_get_switch 1.02% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 4: predicate.fold_const_symbol 1.01% : 0.000002s : 8: predicate.get_grad_eliminate 0.33% : 0.000001s : 4: predicate.graph_param_transform 0.79% : 0.000001s : 8: predicate.incorporate_call 0.67% : 0.000001s : 8: predicate.incorporate_call_switch 6.16% : 0.000010s : 44: predicate.inline 1.09% : 0.000002s : 8: predicate.inline_without_move 0.37% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.26% : 0.000002s : 8: predicate.less_batch_normalization 1.57% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.25% : 0.000004s : 26: predicate.load_eliminater 1.18% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.61% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.62% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.80% : 0.000001s : 8: predicate.merge_addn 0.81% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.77% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.69% : 0.000001s : 9: predicate.minmaximum_grad 1.39% : 0.000002s : 4: predicate.mutable_eliminate 0.43% : 0.000001s : 4: predicate.opt_reshape 0.57% : 0.000001s : 4: predicate.parallel_virtual_node 1.18% : 0.000002s : 11: predicate.partial_defer_inline 1.28% : 0.000002s : 13: predicate.partial_eliminate 0.99% : 0.000002s : 9: predicate.print_const_string_wrapper 0.77% : 0.000001s : 8: predicate.reduce_all_const_elim 1.07% : 0.000002s : 9: predicate.reduce_eliminate 2.10% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 8: predicate.remove_not_recompute_node 1.12% : 0.000002s : 17: predicate.replace_applicator 0.60% : 0.000001s : 8: predicate.replace_old_param 0.32% : 0.000001s : 4: predicate.reset_defer_inline 0.83% : 0.000001s : 9: predicate.reshape_eliminate 0.79% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.50% : 0.000001s : 4: predicate.row_tensor_eliminate 0.99% : 0.000002s : 8: predicate.same_eliminate 0.60% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.93% : 0.000002s : 8: predicate.shard_identity_eliminate 0.98% : 0.000002s : 8: predicate.special_op_eliminate 1.02% : 0.000002s : 8: predicate.specialize_transform 1.05% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.95% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.96% : 0.000002s : 11: predicate.switch_defer_inline 1.87% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.05% : 0.000007s : 39: predicate.switch_simplify 0.75% : 0.000001s : 9: predicate.tile_eliminate 0.79% : 0.000001s : 9: predicate.transpose_eliminate 1.71% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.57% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 2.88% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.48% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.35% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.61% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.12% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 3.03% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.65% : 0.000001s : 4: predicate.value_based_eliminate 0.81% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.80% : 0.000001s : 8: predicate.virtual_output_eliminate 0.44% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.64% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000213 5 8.35% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.65% : 0.000195s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025097 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.46% : 0.003126s : 1: add_attr 12.40% : 0.003112s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.24% : 0.000059s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000066s : 1: auto_monad 0.12% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000007s : 1: bias_add_comm_swap 1.98% : 0.000497s : 1: bootstrap 0.13% : 0.000032s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.14% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000019s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.09% : 0.000023s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.79% : 0.000449s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.10% : 0.000528s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 3.76% : 0.000944s : 78: opt.transform.opt_a 0.13% : 0.000032s : 1: opt.transform.opt_after_cconv 0.12% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.61% : 0.000153s : 28: opt.transform.opt_b 0.21% : 0.000052s : 2: opt.transform.opt_trans_graph 0.17% : 0.000043s : 4: opt.transform.symbol_engine_opt 10.77% : 0.002703s : 1: opt_a 0.56% : 0.000141s : 1: opt_after_cconv 2.03% : 0.000509s : 1: opt_after_jit_grad 1.35% : 0.000338s : 1: opt_b 21.50% : 0.005396s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000027s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000011s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000033s : 1: pre_auto_parallel 0.10% : 0.000026s : 1: py_interpret_to_execute 0.07% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.10% : 0.000024s : 1: remove_dup_value 1.24% : 0.000310s : 1: renormalize.infer 0.89% : 0.000224s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000049s : 1: rewriter_after_opt_a 0.22% : 0.000056s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000106s : 1: symbol_engine_optimizer 0.38% : 0.000097s : 1: tuple_transform 21.41% : 0.005373s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:51.425.958 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0143511, [21] [bootstrap]: 0.0004467 [type_inference]: 0.00540803 [event_method]: 1.322e-05 [auto_monad]: 5.68e-05 [graph_reusing]: 5.09e-06 [inline]: 2.36e-06 [add_attr]: 0.00314337, [1] [add_attr_with_inline]: 0.00313413, [1] [Cycle 1]: 5.376e-05, [2] [tag_attr]: 1.533e-05 [meta_addattr_fg_expand]: 3.93001e-06 [parallel-infer-symbol]: 3.11001e-06 [pre_auto_parallel]: 2.743e-05 [insert-virtual-dataset]: 2.36e-06 [parallel-infer-symbol-second]: 7.90023e-07 [dataset_repeat_opt]: 2.91e-06 [pipeline_split]: 1.84e-06 [optimize]: 0.00455585, [53] [py_interpret_to_execute]: 1.759e-05 [rewriter_before_opt_a]: 4.904e-05 [opt_a]: 0.00241452, [2] [Cycle 1]: 0.00168934, [45] [expand_dump_flag]: 3.19001e-06 [switch_simplify]: 2.739e-05 [loop_unroll]: 1.484e-05 [a_1]: 0.0003439 [with_stream_mark]: 1.649e-05 [recompute_prepare]: 9.57999e-06 [updatestate_depend_eliminate]: 5.05001e-06 [updatestate_assign_eliminate]: 3.63e-06 [updatestate_loads_eliminate]: 3.95e-06 [parameter_eliminate]: 1.99e-06 [a_2]: 9.789e-05 [accelerated_algorithm]: 7.93999e-06 [shard]: 2.35002e-06 [meta_shard_fg_expand]: 1.99999e-06 [shard_inline]: 7.43999e-06 [merge_send_recv]: 9.66e-06 [auto_parallel]: 6.83e-06 [parallel]: 1.79e-05 [flash_sp]: 7.86001e-06 [merge_comm]: 4.96002e-06 [allreduce_fusion]: 4.13001e-06 [matmul_add_comm_reduction]: 1.067e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 9.75002e-06 [virtual_dataset]: 7.57998e-06 [get_grad_eliminate_]: 6.76999e-06 [virtual_output]: 7.74002e-06 [merge_forward]: 4.58999e-06 [cell_reuse_recompute_pass]: 1.25999e-06 [offload_activation]: 1.043e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.442e-05 [merge_recompute_call_nodes]: 1.59e-06 [before_grad]: 1.334e-05 [set_forward_comm_id_for_comm_node_pass]: 4.37e-06 [meta_fg_expand]: 3.16001e-06 [flash_sp_send_recv_attached]: 2.91e-06 [receive_attached]: 2.16e-06 [after_resolve]: 1.038e-05 [a_after_grad]: 1.132e-05 [renormalize]: 0.00060945 [add_forward_monad_depend]: 4.94003e-06 [auto_monad_grad]: 2.48e-06 [auto_monad_eliminator]: 1.666e-05 [cse]: 3.778e-05 [a_3]: 5.689e-05 [Cycle 2]: 0.0007153, [45] [expand_dump_flag]: 1.51002e-06 [switch_simplify]: 8.87e-06 [loop_unroll]: 7.03e-06 [a_1]: 0.00015084 [with_stream_mark]: 1.243e-05 [recompute_prepare]: 8.15999e-06 [updatestate_depend_eliminate]: 4.2e-06 [updatestate_assign_eliminate]: 3.10998e-06 [updatestate_loads_eliminate]: 2.84999e-06 [parameter_eliminate]: 9.60019e-07 [a_2]: 8.671e-05 [accelerated_algorithm]: 7.29001e-06 [shard]: 1.22e-06 [meta_shard_fg_expand]: 1.66002e-06 [shard_inline]: 7.46001e-06 [merge_send_recv]: 5.86e-06 [auto_parallel]: 6.29999e-06 [parallel]: 5.25999e-06 [flash_sp]: 3.46999e-06 [merge_comm]: 3.88999e-06 [allreduce_fusion]: 3.53e-06 [matmul_add_comm_reduction]: 1.015e-05 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 8.15999e-06 [virtual_dataset]: 6.57002e-06 [get_grad_eliminate_]: 6.49999e-06 [virtual_output]: 6.24999e-06 [merge_forward]: 3.36001e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 7.51999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.41e-05 [merge_recompute_call_nodes]: 8.70001e-07 [before_grad]: 1.139e-05 [set_forward_comm_id_for_comm_node_pass]: 4.50999e-06 [meta_fg_expand]: 2.56e-06 [flash_sp_send_recv_attached]: 1.04998e-06 [receive_attached]: 1.13001e-06 [after_resolve]: 9.56998e-06 [a_after_grad]: 1.003e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.26002e-06 [auto_monad_grad]: 9.10019e-07 [auto_monad_eliminator]: 8.19002e-06 [cse]: 1.853e-05 [a_3]: 4.274e-05 [py_interpret_to_execute_after_opt_a]: 1.027e-05 [slice_cell_reuse_recomputed_activation]: 2.16e-06 [rewriter_after_opt_a]: 4.055e-05 [convert_after_rewriter]: 7.77e-06 [order_py_execute_after_rewriter]: 6.09999e-06 [mutable_eliminate]: 0.00051708 [opt_b]: 0.00029675, [1] [Cycle 1]: 0.00029017, [7] [b_1]: 0.00018028 [b_2]: 9.07001e-06 [updatestate_depend_eliminate]: 6.81001e-06 [updatestate_assign_eliminate]: 3.22002e-06 [updatestate_loads_eliminate]: 3.03e-06 [renormalize]: 9.80013e-07 [cse]: 2.834e-05 [optimize_parallel_all_gather_comm]: 1.877e-05 [overlap_param_gather]: 2.06e-06 [cconv]: 2.748e-05 [loop_unroll]: 0.00044725 [opt_after_cconv]: 0.00011465, [1] [Cycle 1]: 0.00010867, [7] [c_1]: 3.346e-05 [parameter_eliminate]: 3.49001e-06 [updatestate_depend_eliminate]: 6.14999e-06 [updatestate_assign_eliminate]: 2.99001e-06 [updatestate_loads_eliminate]: 3.39001e-06 [cse]: 2.392e-05 [renormalize]: 5.19998e-07 [remove_dup_value]: 1.711e-05 [tuple_transform]: 7.968e-05, [1] [Cycle 1]: 7.551e-05, [4] [d_1]: 4.717e-05 [none_parameter_eliminate]: 1.60001e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 7.71999e-06 [partial_unused_args_eliminate]: 1.66e-06 [add_recomputation]: 5.446e-05 [cse_after_recomputation]: 2.597e-05, [1] [Cycle 1]: 2.114e-05, [1] [cse]: 1.559e-05 [environ_conv]: 7.21001e-06 [swap_dp_allreduce_reducescatter]: 6.07999e-06 [bias_add_comm_swap]: 2.73e-06 [label_micro_interleaved_index]: 4.18999e-06 [label_fine_grained_interleaved_index]: 3.18998e-06 [merge_cast_opt]: 1.28002e-06 [slice_recompute_activation]: 2.25002e-06 [micro_interleaved_order_control]: 2.44999e-06 [assign_add_opt]: 1.56002e-06 [ForceFp32Comm]: 7.79983e-07 [remove_cast_before_assign_add]: 1.24e-06 [full_micro_interleaved_order_control]: 1.99e-06 [reorder_send_recv_between_fp_bp]: 2.71e-06 [comm_op_add_attrs]: 1.20001e-06 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.13001e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.05001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.08998e-06 [control_data_broadcast_order]: 1.427e-05 [grouped_pairwise_exchange_alltoall]: 1.70001e-06 [offloading_packed_experts]: 4.50999e-06 [overlap_recompute_and_grad_model_parallel]: 5.13002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.59e-06 [overlap_recompute_comm]: 2.17999e-06 [overlap_grad_ring_attention]: 4.54998e-06 [overlap_grad_flash_sp]: 2.201e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.09999e-06 [split_layernorm_comm]: 1.82001e-06 [handle_group_info]: 9.80013e-07 [symbol_engine_optimizer]: 8.117e-05, [1] [Cycle 1]: 7.685e-05, [6] [build]: 3.18e-06 [elim_shapecalc]: 1.061e-05 [elim_not_effective]: 1.44e-05 [opt_reshape]: 7.51001e-06 [fold_const_symbol]: 1.182e-05 [renormalize]: 2.30008e-07 [detach_backward]: 2.11e-06 [pipeline_parallel_scheduler]: 1.54e-06 [auto_monad_reorder]: 1.973e-05 [get_jit_bprop_graph]: 2.04999e-06 [rewriter_after_jit_bprop_graph]: 4.52e-06 [opt_after_jit_grad]: 0.00045919 [validate]: 4.282e-05 Sums bootstrap : 0.000447s : 4.36% type_inference : 0.005408s : 52.84% event_method : 0.000013s : 0.13% auto_monad : 0.000057s : 0.55% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000027s : 0.27% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000003s : 0.03% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000018s : 0.17% optimize.rewriter_before_opt_a : 0.000049s : 0.48% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000036s : 0.35% optimize.opt_a.loop_unroll : 0.000022s : 0.21% optimize.opt_a.a_1 : 0.000495s : 4.83% optimize.opt_a.with_stream_mark : 0.000029s : 0.28% optimize.opt_a.recompute_prepare : 0.000018s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000185s : 1.80% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.15% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.15% optimize.opt_a.merge_send_recv : 0.000016s : 0.15% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.23% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.20% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.17% optimize.opt_a.virtual_dataset : 0.000014s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.13% optimize.opt_a.virtual_output : 0.000014s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000020s : 0.19% optimize.opt_a.a_after_grad : 0.000021s : 0.21% optimize.opt_a.renormalize : 0.000610s : 5.96% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.24% optimize.opt_a.cse : 0.000056s : 0.55% optimize.opt_a.a_3 : 0.000100s : 0.97% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000041s : 0.40% optimize.convert_after_rewriter : 0.000008s : 0.08% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000517s : 5.05% optimize.opt_b.b_1 : 0.000180s : 1.76% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000028s : 0.28% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000027s : 0.27% optimize.loop_unroll : 0.000447s : 4.37% optimize.opt_after_cconv.c_1 : 0.000033s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.23% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000017s : 0.17% optimize.tuple_transform.d_1 : 0.000047s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000054s : 0.53% optimize.cse_after_recomputation.cse : 0.000016s : 0.15% optimize.environ_conv : 0.000007s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.02% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.02% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000022s : 0.22% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000459s : 4.49% validate : 0.000043s : 0.42% Time group info: ------[substitution.] 0.000170 29 11.57% : 0.000020s : 2: substitution.cast_eliminate 1.24% : 0.000002s : 3: substitution.elim_not_effective 1.04% : 0.000002s : 3: substitution.fold_const_symbol 3.99% : 0.000007s : 4: substitution.graph_param_transform 57.76% : 0.000098s : 2: substitution.inline 2.75% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.31% : 0.000006s : 6: substitution.remove_not_recompute_node 1.83% : 0.000003s : 2: substitution.replace_old_param 16.52% : 0.000028s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005362 2 91.46% : 0.004904s : 1: type_inference.infer 8.54% : 0.000458s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000096 2 100.00% : 0.000096s : 2: match.inline ------[predicate.] 0.000166 980 0.79% : 0.000001s : 9: predicate.accumulaten_eliminater 0.93% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.70% : 0.000001s : 8: predicate.addn_check_dump 1.11% : 0.000002s : 9: predicate.addn_zero_filter 0.75% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.18% : 0.000004s : 17: predicate.arithmetic_simplify 0.96% : 0.000002s : 9: predicate.cast_eliminate 0.75% : 0.000001s : 8: predicate.check_bprop_eliminate 0.70% : 0.000001s : 8: predicate.compare_switch_simplify 0.23% : 0.000000s : 4: predicate.const_output_eliminate 0.85% : 0.000001s : 8: predicate.depend_value_elim 0.79% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.90% : 0.000001s : 9: predicate.dict_get_item_eliminator 0.80% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.14% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 4: predicate.elim_not_effective 0.51% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.08% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.04% : 0.000002s : 13: predicate.environ_get_depend_swap 1.80% : 0.000003s : 21: predicate.environ_get_eliminate 1.05% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.94% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.81% : 0.000003s : 11: predicate.float_depend_g_call 0.73% : 0.000001s : 8: predicate.float_environ_get_switch 1.01% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.24% : 0.000000s : 4: predicate.fold_const_symbol 0.84% : 0.000001s : 8: predicate.get_grad_eliminate 0.35% : 0.000001s : 4: predicate.graph_param_transform 0.87% : 0.000001s : 8: predicate.incorporate_call 0.67% : 0.000001s : 8: predicate.incorporate_call_switch 6.48% : 0.000011s : 44: predicate.inline 1.00% : 0.000002s : 8: predicate.inline_without_move 0.40% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.09% : 0.000002s : 8: predicate.less_batch_normalization 1.60% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.21% : 0.000004s : 26: predicate.load_eliminater 1.30% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.50% : 0.000002s : 16: predicate.loop_unroll_before_grad 1.68% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.76% : 0.000001s : 8: predicate.merge_addn 0.72% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.70% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.73% : 0.000001s : 9: predicate.minmaximum_grad 1.31% : 0.000002s : 4: predicate.mutable_eliminate 0.43% : 0.000001s : 4: predicate.opt_reshape 0.43% : 0.000001s : 4: predicate.parallel_virtual_node 1.20% : 0.000002s : 11: predicate.partial_defer_inline 1.33% : 0.000002s : 13: predicate.partial_eliminate 0.85% : 0.000001s : 9: predicate.print_const_string_wrapper 0.71% : 0.000001s : 8: predicate.reduce_all_const_elim 1.09% : 0.000002s : 9: predicate.reduce_eliminate 2.13% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.53% : 0.000001s : 8: predicate.remove_not_recompute_node 1.19% : 0.000002s : 17: predicate.replace_applicator 0.59% : 0.000001s : 8: predicate.replace_old_param 0.30% : 0.000001s : 4: predicate.reset_defer_inline 0.84% : 0.000001s : 9: predicate.reshape_eliminate 0.81% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 4: predicate.row_tensor_eliminate 0.96% : 0.000002s : 8: predicate.same_eliminate 0.55% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.98% : 0.000002s : 8: predicate.shard_identity_eliminate 1.09% : 0.000002s : 8: predicate.special_op_eliminate 1.01% : 0.000002s : 8: predicate.specialize_transform 1.26% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.48% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.95% : 0.000002s : 11: predicate.switch_defer_inline 1.75% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.29% : 0.000007s : 39: predicate.switch_simplify 0.82% : 0.000001s : 9: predicate.tile_eliminate 0.83% : 0.000001s : 9: predicate.transpose_eliminate 1.75% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.06% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.49% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.61% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.53% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.13% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 3.03% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.59% : 0.000001s : 4: predicate.value_based_eliminate 0.76% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.87% : 0.000001s : 8: predicate.virtual_output_eliminate 0.32% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.57% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000218 5 8.78% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.22% : 0.000199s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023819 192 0.01% : 0.000003s : 1: ForceFp32Comm 13.22% : 0.003148s : 1: add_attr 13.17% : 0.003138s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.25% : 0.000058s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000062s : 1: auto_monad 0.10% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 2.01% : 0.000478s : 1: bootstrap 0.13% : 0.000031s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000017s : 1: control_data_broadcast_order 0.05% : 0.000011s : 1: convert_after_rewriter 0.12% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000006s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.08% : 0.000020s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.91% : 0.000455s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.20% : 0.000525s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 3.97% : 0.000946s : 78: opt.transform.opt_a 0.13% : 0.000032s : 1: opt.transform.opt_after_cconv 0.11% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.66% : 0.000157s : 28: opt.transform.opt_b 0.22% : 0.000053s : 2: opt.transform.opt_trans_graph 0.17% : 0.000041s : 4: opt.transform.symbol_engine_opt 10.15% : 0.002417s : 1: opt_a 0.50% : 0.000118s : 1: opt_after_cconv 1.96% : 0.000468s : 1: opt_after_jit_grad 1.26% : 0.000300s : 1: opt_b 19.15% : 0.004561s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.11% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.13% : 0.000031s : 1: pre_auto_parallel 0.09% : 0.000021s : 1: py_interpret_to_execute 0.06% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000021s : 1: remove_dup_value 1.48% : 0.000352s : 1: renormalize.infer 1.05% : 0.000250s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000045s : 1: rewriter_after_opt_a 0.22% : 0.000053s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000084s : 1: symbol_engine_optimizer 0.35% : 0.000083s : 1: tuple_transform 22.77% : 0.005425s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:51.626.576 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:51.626.843 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0159275, [21] [bootstrap]: 0.00042989 [type_inference]: 0.00567361 [event_method]: 1.386e-05 [auto_monad]: 5.684e-05 [graph_reusing]: 5.47001e-06 [inline]: 2.61999e-06 [add_attr]: 0.00347897, [1] [add_attr_with_inline]: 0.00346979, [1] [Cycle 1]: 7.013e-05, [2] [tag_attr]: 1.515e-05 [meta_addattr_fg_expand]: 4.2e-06 [parallel-infer-symbol]: 3.26999e-06 [pre_auto_parallel]: 2.925e-05 [insert-virtual-dataset]: 2.68998e-06 [parallel-infer-symbol-second]: 7.79983e-07 [dataset_repeat_opt]: 2.04e-06 [pipeline_split]: 1.67999e-06 [optimize]: 0.00500034, [53] [py_interpret_to_execute]: 2.238e-05 [rewriter_before_opt_a]: 5.452e-05 [opt_a]: 0.00256515, [2] [Cycle 1]: 0.00174054, [45] [expand_dump_flag]: 2.69999e-06 [switch_simplify]: 2.631e-05 [loop_unroll]: 1.42e-05 [a_1]: 0.00030784 [with_stream_mark]: 1.504e-05 [recompute_prepare]: 9.05001e-06 [updatestate_depend_eliminate]: 4.35999e-06 [updatestate_assign_eliminate]: 3.56999e-06 [updatestate_loads_eliminate]: 3.35003e-06 [parameter_eliminate]: 1.99e-06 [a_2]: 0.00011441 [accelerated_algorithm]: 7.04001e-06 [shard]: 1.95001e-06 [meta_shard_fg_expand]: 2.02001e-06 [shard_inline]: 5.97001e-06 [merge_send_recv]: 8.75001e-06 [auto_parallel]: 6.76999e-06 [parallel]: 1.706e-05 [flash_sp]: 8.54998e-06 [merge_comm]: 4.57e-06 [allreduce_fusion]: 3.72002e-06 [matmul_add_comm_reduction]: 9.57001e-06 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 7.55e-06 [virtual_dataset]: 6.34001e-06 [get_grad_eliminate_]: 5.89e-06 [virtual_output]: 6.47001e-06 [merge_forward]: 3.54002e-06 [cell_reuse_recompute_pass]: 1.35001e-06 [offload_activation]: 1.034e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.578e-05 [merge_recompute_call_nodes]: 1.60999e-06 [before_grad]: 1.088e-05 [set_forward_comm_id_for_comm_node_pass]: 3.63e-06 [meta_fg_expand]: 2.66e-06 [flash_sp_send_recv_attached]: 3.13998e-06 [receive_attached]: 2.21e-06 [after_resolve]: 1.011e-05 [a_after_grad]: 9.39998e-06 [renormalize]: 0.00055446 [add_forward_monad_depend]: 5.89e-06 [auto_monad_grad]: 2.46e-06 [auto_monad_eliminator]: 1.612e-05 [cse]: 3.098e-05 [a_3]: 6.183e-05 [Cycle 2]: 0.00081065, [45] [expand_dump_flag]: 1.25001e-06 [switch_simplify]: 7.33e-06 [loop_unroll]: 6.29999e-06 [a_1]: 0.0001096 [with_stream_mark]: 1.257e-05 [recompute_prepare]: 6.39001e-06 [updatestate_depend_eliminate]: 3.2e-06 [updatestate_assign_eliminate]: 2.55002e-06 [updatestate_loads_eliminate]: 2.82002e-06 [parameter_eliminate]: 1.13001e-06 [a_2]: 9.747e-05 [accelerated_algorithm]: 6.04001e-06 [shard]: 1.47001e-06 [meta_shard_fg_expand]: 1.60999e-06 [shard_inline]: 6.34001e-06 [merge_send_recv]: 6.02999e-06 [auto_parallel]: 6.31998e-06 [parallel]: 5.52001e-06 [flash_sp]: 3.88001e-06 [merge_comm]: 3.42997e-06 [allreduce_fusion]: 3.55e-06 [matmul_add_comm_reduction]: 6.99001e-06 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 7.74002e-06 [virtual_dataset]: 5.60001e-06 [get_grad_eliminate_]: 5.82999e-06 [virtual_output]: 5.35999e-06 [merge_forward]: 3.70998e-06 [cell_reuse_recompute_pass]: 1.90001e-06 [offload_activation]: 8.30999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.541e-05 [merge_recompute_call_nodes]: 9.00007e-07 [before_grad]: 9.24e-06 [set_forward_comm_id_for_comm_node_pass]: 4.25e-06 [meta_fg_expand]: 2.53e-06 [flash_sp_send_recv_attached]: 8.89995e-07 [receive_attached]: 1.10999e-06 [after_resolve]: 9.31e-06 [a_after_grad]: 8.28999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.14e-06 [auto_monad_grad]: 1.20999e-06 [auto_monad_eliminator]: 8.45001e-06 [cse]: 1.736e-05 [a_3]: 4.893e-05 [py_interpret_to_execute_after_opt_a]: 1.3e-05 [slice_cell_reuse_recomputed_activation]: 4.75001e-06 [rewriter_after_opt_a]: 4.056e-05 [convert_after_rewriter]: 9.70002e-06 [order_py_execute_after_rewriter]: 9.14e-06 [mutable_eliminate]: 0.00062058 [opt_b]: 0.00029013, [1] [Cycle 1]: 0.00028029, [7] [b_1]: 0.00017862 [b_2]: 8.02003e-06 [updatestate_depend_eliminate]: 6.04001e-06 [updatestate_assign_eliminate]: 2.53998e-06 [updatestate_loads_eliminate]: 2.41e-06 [renormalize]: 6.00005e-07 [cse]: 2.178e-05 [optimize_parallel_all_gather_comm]: 2.15e-05 [overlap_param_gather]: 4.68001e-06 [cconv]: 2.759e-05 [loop_unroll]: 0.00049518 [opt_after_cconv]: 0.00012899, [1] [Cycle 1]: 0.00011935, [7] [c_1]: 2.807e-05 [parameter_eliminate]: 3.46999e-06 [updatestate_depend_eliminate]: 6.28e-06 [updatestate_assign_eliminate]: 2.78e-06 [updatestate_loads_eliminate]: 2.36998e-06 [cse]: 1.968e-05 [renormalize]: 5.50004e-07 [remove_dup_value]: 1.852e-05 [tuple_transform]: 8.85e-05, [1] [Cycle 1]: 8.132e-05, [4] [d_1]: 4.123e-05 [none_parameter_eliminate]: 1.70001e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 7.57002e-06 [partial_unused_args_eliminate]: 4.57e-06 [add_recomputation]: 4.983e-05 [cse_after_recomputation]: 2.864e-05, [1] [Cycle 1]: 2.153e-05, [1] [cse]: 1.247e-05 [environ_conv]: 8.77e-06 [swap_dp_allreduce_reducescatter]: 7.93001e-06 [bias_add_comm_swap]: 5.30999e-06 [label_micro_interleaved_index]: 7.09001e-06 [label_fine_grained_interleaved_index]: 5.20001e-06 [merge_cast_opt]: 4.28001e-06 [slice_recompute_activation]: 4.51002e-06 [micro_interleaved_order_control]: 4.39002e-06 [assign_add_opt]: 3.78001e-06 [ForceFp32Comm]: 3.28998e-06 [remove_cast_before_assign_add]: 3.51999e-06 [full_micro_interleaved_order_control]: 4.95001e-06 [reorder_send_recv_between_fp_bp]: 5.42001e-06 [comm_op_add_attrs]: 3.8e-06 [add_comm_op_reuse_tag]: 3.53e-06 [interleave_split_concat_branches]: 3.74002e-06 [interleave_parallel_branches]: 3.52002e-06 [overlap_opt_shard_in_pipeline]: 3.72002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.07998e-06 [control_data_broadcast_order]: 1.61e-05 [grouped_pairwise_exchange_alltoall]: 3.86999e-06 [offloading_packed_experts]: 6.83e-06 [overlap_recompute_and_grad_model_parallel]: 7.23999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.80998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.78001e-06 [overlap_recompute_comm]: 4.97e-06 [overlap_grad_ring_attention]: 6.54001e-06 [overlap_grad_flash_sp]: 2.284e-05 [begin_end_overlap_inline]: 2.89001e-06 [split_matmul_comm_elemetwise]: 4.87e-06 [split_layernorm_comm]: 4.28999e-06 [handle_group_info]: 3.36999e-06 [symbol_engine_optimizer]: 0.00010199, [1] [Cycle 1]: 9.48e-05, [6] [build]: 2.91e-06 [elim_shapecalc]: 1.211e-05 [elim_not_effective]: 1.322e-05 [opt_reshape]: 7.11001e-06 [fold_const_symbol]: 1.02e-05 [renormalize]: 2.19996e-07 [detach_backward]: 3.98001e-06 [pipeline_parallel_scheduler]: 1.97999e-06 [auto_monad_reorder]: 2.2e-05 [get_jit_bprop_graph]: 1.57999e-06 [rewriter_after_jit_bprop_graph]: 5.54998e-06 [opt_after_jit_grad]: 0.00055712 [validate]: 3.964e-05 Sums bootstrap : 0.000430s : 4.03% type_inference : 0.005674s : 53.19% event_method : 0.000014s : 0.13% auto_monad : 0.000057s : 0.53% graph_reusing : 0.000005s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000029s : 0.27% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000022s : 0.21% optimize.rewriter_before_opt_a : 0.000055s : 0.51% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000034s : 0.32% optimize.opt_a.loop_unroll : 0.000021s : 0.19% optimize.opt_a.a_1 : 0.000417s : 3.91% optimize.opt_a.with_stream_mark : 0.000028s : 0.26% optimize.opt_a.recompute_prepare : 0.000015s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000212s : 1.99% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.12% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.12% optimize.opt_a.merge_send_recv : 0.000015s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.12% optimize.opt_a.parallel : 0.000023s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.14% optimize.opt_a.virtual_dataset : 0.000012s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000019s : 0.18% optimize.opt_a.a_after_grad : 0.000018s : 0.17% optimize.opt_a.renormalize : 0.000555s : 5.20% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.23% optimize.opt_a.cse : 0.000048s : 0.45% optimize.opt_a.a_3 : 0.000111s : 1.04% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000041s : 0.38% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000621s : 5.82% optimize.opt_b.b_1 : 0.000179s : 1.67% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000022s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000028s : 0.26% optimize.loop_unroll : 0.000495s : 4.64% optimize.opt_after_cconv.c_1 : 0.000028s : 0.26% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000020s : 0.18% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000019s : 0.17% optimize.tuple_transform.d_1 : 0.000041s : 0.39% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000050s : 0.47% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000004s : 0.04% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000016s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000023s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000022s : 0.21% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000557s : 5.22% validate : 0.000040s : 0.37% Time group info: ------[substitution.] 0.000121 20 1.57% : 0.000002s : 2: substitution.elim_not_effective 1.27% : 0.000002s : 2: substitution.fold_const_symbol 4.58% : 0.000006s : 3: substitution.graph_param_transform 74.95% : 0.000091s : 2: substitution.inline 3.06% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.64% : 0.000004s : 4: substitution.remove_not_recompute_node 2.75% : 0.000003s : 2: substitution.replace_old_param 8.17% : 0.000010s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005618 2 91.23% : 0.005126s : 1: type_inference.infer 8.77% : 0.000493s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000089 2 100.00% : 0.000089s : 2: match.inline ------[predicate.] 0.000142 754 0.87% : 0.000001s : 7: predicate.accumulaten_eliminater 1.18% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.67% : 0.000001s : 6: predicate.addn_check_dump 0.84% : 0.000001s : 7: predicate.addn_zero_filter 0.73% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.11% : 0.000003s : 13: predicate.arithmetic_simplify 0.79% : 0.000001s : 7: predicate.cast_eliminate 0.74% : 0.000001s : 6: predicate.check_bprop_eliminate 0.73% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.73% : 0.000001s : 6: predicate.depend_value_elim 0.73% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.94% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.85% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.31% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 3: predicate.elim_not_effective 0.71% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.96% : 0.000001s : 10: predicate.environ_get_depend_swap 1.74% : 0.000002s : 16: predicate.environ_get_eliminate 1.11% : 0.000002s : 10: predicate.environ_get_set_eliminate 0.92% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.99% : 0.000003s : 9: predicate.float_depend_g_call 0.63% : 0.000001s : 6: predicate.float_environ_get_switch 0.95% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 3: predicate.fold_const_symbol 0.80% : 0.000001s : 6: predicate.get_grad_eliminate 0.34% : 0.000000s : 3: predicate.graph_param_transform 0.75% : 0.000001s : 6: predicate.incorporate_call 0.62% : 0.000001s : 6: predicate.incorporate_call_switch 6.71% : 0.000010s : 34: predicate.inline 0.95% : 0.000001s : 6: predicate.inline_without_move 0.37% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.13% : 0.000002s : 6: predicate.less_batch_normalization 1.75% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.01% : 0.000003s : 20: predicate.load_eliminater 1.37% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.63% : 0.000002s : 14: predicate.loop_unroll_before_grad 2.25% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.77% : 0.000001s : 6: predicate.merge_addn 0.77% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.80% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.66% : 0.000001s : 7: predicate.minmaximum_grad 1.65% : 0.000002s : 3: predicate.mutable_eliminate 0.39% : 0.000001s : 3: predicate.opt_reshape 0.53% : 0.000001s : 3: predicate.parallel_virtual_node 1.20% : 0.000002s : 9: predicate.partial_defer_inline 1.16% : 0.000002s : 10: predicate.partial_eliminate 0.87% : 0.000001s : 7: predicate.print_const_string_wrapper 0.66% : 0.000001s : 6: predicate.reduce_all_const_elim 1.06% : 0.000001s : 7: predicate.reduce_eliminate 2.20% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.69% : 0.000001s : 6: predicate.remove_not_recompute_node 1.13% : 0.000002s : 13: predicate.replace_applicator 0.64% : 0.000001s : 6: predicate.replace_old_param 0.29% : 0.000000s : 3: predicate.reset_defer_inline 0.90% : 0.000001s : 7: predicate.reshape_eliminate 0.77% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 3: predicate.row_tensor_eliminate 0.97% : 0.000001s : 6: predicate.same_eliminate 0.50% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.22% : 0.000002s : 6: predicate.shard_identity_eliminate 0.84% : 0.000001s : 6: predicate.special_op_eliminate 1.20% : 0.000002s : 6: predicate.specialize_transform 1.04% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.06% : 0.000002s : 9: predicate.switch_defer_inline 1.66% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.11% : 0.000006s : 32: predicate.switch_simplify 0.89% : 0.000001s : 7: predicate.tile_eliminate 0.89% : 0.000001s : 7: predicate.transpose_eliminate 1.64% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.06% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.43% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.41% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.49% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.02% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.89% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.66% : 0.000001s : 3: predicate.value_based_eliminate 0.77% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.77% : 0.000001s : 6: predicate.virtual_output_eliminate 0.38% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.48% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000222 5 8.00% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.00% : 0.000204s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025899 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.47% : 0.003489s : 1: add_attr 13.41% : 0.003473s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000054s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000065s : 1: auto_monad 0.11% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.84% : 0.000478s : 1: bootstrap 0.12% : 0.000031s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.12% : 0.000032s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000021s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.94% : 0.000502s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.42% : 0.000628s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 3.07% : 0.000794s : 78: opt.transform.opt_a 0.10% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.43% : 0.000112s : 28: opt.transform.opt_b 0.18% : 0.000047s : 2: opt.transform.opt_trans_graph 0.15% : 0.000038s : 4: opt.transform.symbol_engine_opt 9.92% : 0.002569s : 1: opt_a 0.51% : 0.000133s : 1: opt_after_cconv 2.19% : 0.000568s : 1: opt_after_jit_grad 1.14% : 0.000294s : 1: opt_b 20.64% : 0.005346s : 1: optimize 0.10% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000038s : 1: pre_auto_parallel 0.10% : 0.000026s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000022s : 1: remove_dup_value 1.11% : 0.000288s : 1: renormalize.infer 1.00% : 0.000259s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000045s : 1: rewriter_after_opt_a 0.22% : 0.000058s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000105s : 1: symbol_engine_optimizer 0.35% : 0.000091s : 1: tuple_transform 22.03% : 0.005705s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:51.824.247 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0132618, [21] [bootstrap]: 0.0003865 [type_inference]: 0.0050344 [event_method]: 1.082e-05 [auto_monad]: 5.087e-05 [graph_reusing]: 5.40001e-06 [inline]: 1.82999e-06 [add_attr]: 0.00310007, [1] [add_attr_with_inline]: 0.00309019, [1] [Cycle 1]: 4.549e-05, [2] [tag_attr]: 1.321e-05 [meta_addattr_fg_expand]: 3.76999e-06 [parallel-infer-symbol]: 3.01001e-06 [pre_auto_parallel]: 2.395e-05 [insert-virtual-dataset]: 2.57001e-06 [parallel-infer-symbol-second]: 6.90023e-07 [dataset_repeat_opt]: 1.87001e-06 [pipeline_split]: 2.03002e-06 [optimize]: 0.00398638, [53] [py_interpret_to_execute]: 1.729e-05 [rewriter_before_opt_a]: 4.42e-05 [opt_a]: 0.00203934, [2] [Cycle 1]: 0.0014267, [45] [expand_dump_flag]: 2.76999e-06 [switch_simplify]: 2.346e-05 [loop_unroll]: 1.402e-05 [a_1]: 0.00028176 [with_stream_mark]: 1.533e-05 [recompute_prepare]: 7.63001e-06 [updatestate_depend_eliminate]: 3.62002e-06 [updatestate_assign_eliminate]: 3.81999e-06 [updatestate_loads_eliminate]: 3.04001e-06 [parameter_eliminate]: 1.56998e-06 [a_2]: 7.893e-05 [accelerated_algorithm]: 6.28998e-06 [shard]: 1.95001e-06 [meta_shard_fg_expand]: 1.87999e-06 [shard_inline]: 5.84999e-06 [merge_send_recv]: 8.14997e-06 [auto_parallel]: 7.50998e-06 [parallel]: 1.78e-05 [flash_sp]: 8.42998e-06 [merge_comm]: 3.81999e-06 [allreduce_fusion]: 4e-06 [matmul_add_comm_reduction]: 8.89998e-06 [allreduce_slice_to_reducescatter]: 5.69999e-07 [virtual_shard_identity]: 8.18001e-06 [virtual_dataset]: 6.51e-06 [get_grad_eliminate_]: 5.87999e-06 [virtual_output]: 6.41e-06 [merge_forward]: 3.65e-06 [cell_reuse_recompute_pass]: 1.20001e-06 [offload_activation]: 9.14e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.246e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 9.87999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.87998e-06 [meta_fg_expand]: 2.61999e-06 [flash_sp_send_recv_attached]: 2.46e-06 [receive_attached]: 2.36e-06 [after_resolve]: 9.05001e-06 [a_after_grad]: 9.82999e-06 [renormalize]: 0.00049836 [add_forward_monad_depend]: 4.77e-06 [auto_monad_grad]: 1.93002e-06 [auto_monad_eliminator]: 1.386e-05 [cse]: 2.16e-05 [a_3]: 4.43e-05 [Cycle 2]: 0.00060319, [45] [expand_dump_flag]: 1.15001e-06 [switch_simplify]: 7.06001e-06 [loop_unroll]: 5.81998e-06 [a_1]: 0.00010714 [with_stream_mark]: 1.102e-05 [recompute_prepare]: 6.29001e-06 [updatestate_depend_eliminate]: 3.01001e-06 [updatestate_assign_eliminate]: 2.20002e-06 [updatestate_loads_eliminate]: 2.62001e-06 [parameter_eliminate]: 9.89996e-07 [a_2]: 6.912e-05 [accelerated_algorithm]: 5.82001e-06 [shard]: 1.42e-06 [meta_shard_fg_expand]: 1.34e-06 [shard_inline]: 6.24001e-06 [merge_send_recv]: 4.85999e-06 [auto_parallel]: 5.55001e-06 [parallel]: 4.48001e-06 [flash_sp]: 3.38999e-06 [merge_comm]: 3.14999e-06 [allreduce_fusion]: 3.10998e-06 [matmul_add_comm_reduction]: 5.62999e-06 [allreduce_slice_to_reducescatter]: 3.39991e-07 [virtual_shard_identity]: 6.69999e-06 [virtual_dataset]: 5.81e-06 [get_grad_eliminate_]: 5.34e-06 [virtual_output]: 5.18002e-06 [merge_forward]: 2.78e-06 [cell_reuse_recompute_pass]: 1.57999e-06 [offload_activation]: 6.53e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.215e-05 [merge_recompute_call_nodes]: 7.2e-07 [before_grad]: 9.24e-06 [set_forward_comm_id_for_comm_node_pass]: 3.6e-06 [meta_fg_expand]: 1.97001e-06 [flash_sp_send_recv_attached]: 9.50007e-07 [receive_attached]: 1.05001e-06 [after_resolve]: 8.42e-06 [a_after_grad]: 8.1e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.19e-06 [auto_monad_grad]: 9.20001e-07 [auto_monad_eliminator]: 6.89999e-06 [cse]: 1.333e-05 [a_3]: 3.377e-05 [py_interpret_to_execute_after_opt_a]: 1.772e-05 [slice_cell_reuse_recomputed_activation]: 2.17999e-06 [rewriter_after_opt_a]: 3.396e-05 [convert_after_rewriter]: 6.27001e-06 [order_py_execute_after_rewriter]: 5.64e-06 [mutable_eliminate]: 0.0004871 [opt_b]: 0.00021635, [1] [Cycle 1]: 0.00021028, [7] [b_1]: 0.00013438 [b_2]: 8.92e-06 [updatestate_depend_eliminate]: 5.34e-06 [updatestate_assign_eliminate]: 2.63e-06 [updatestate_loads_eliminate]: 2.21e-06 [renormalize]: 6.00005e-07 [cse]: 1.891e-05 [optimize_parallel_all_gather_comm]: 1.693e-05 [overlap_param_gather]: 2.01e-06 [cconv]: 2.43e-05 [loop_unroll]: 0.00042679 [opt_after_cconv]: 0.00010234, [1] [Cycle 1]: 9.67e-05, [7] [c_1]: 2.709e-05 [parameter_eliminate]: 2.79999e-06 [updatestate_depend_eliminate]: 5.94e-06 [updatestate_assign_eliminate]: 2.49999e-06 [updatestate_loads_eliminate]: 2.27999e-06 [cse]: 1.933e-05 [renormalize]: 3.4002e-07 [remove_dup_value]: 1.343e-05 [tuple_transform]: 7.253e-05, [1] [Cycle 1]: 6.794e-05, [4] [d_1]: 3.967e-05 [none_parameter_eliminate]: 1.57999e-06 [renormalize]: 4.60015e-07 [switch_simplify]: 6.92002e-06 [partial_unused_args_eliminate]: 1.94999e-06 [add_recomputation]: 4.338e-05 [cse_after_recomputation]: 1.99e-05, [1] [Cycle 1]: 1.559e-05, [1] [cse]: 1.033e-05 [environ_conv]: 5.30001e-06 [swap_dp_allreduce_reducescatter]: 5.59e-06 [bias_add_comm_swap]: 2.44001e-06 [label_micro_interleaved_index]: 4.51002e-06 [label_fine_grained_interleaved_index]: 2.67001e-06 [merge_cast_opt]: 1.40999e-06 [slice_recompute_activation]: 2.11998e-06 [micro_interleaved_order_control]: 2.78e-06 [assign_add_opt]: 1.22999e-06 [ForceFp32Comm]: 7.90023e-07 [remove_cast_before_assign_add]: 1.02e-06 [full_micro_interleaved_order_control]: 2.27001e-06 [reorder_send_recv_between_fp_bp]: 2.83998e-06 [comm_op_add_attrs]: 1.11997e-06 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.29e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.07e-06 [overlap_opt_shard_grad_in_pipeline]: 2.06e-06 [control_data_broadcast_order]: 1.211e-05 [grouped_pairwise_exchange_alltoall]: 1.77001e-06 [offloading_packed_experts]: 4.22e-06 [overlap_recompute_and_grad_model_parallel]: 4.95999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.24e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40001e-06 [overlap_recompute_comm]: 2.50002e-06 [overlap_grad_ring_attention]: 4.17e-06 [overlap_grad_flash_sp]: 1.738e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 2.63e-06 [split_layernorm_comm]: 1.89999e-06 [handle_group_info]: 1.07e-06 [symbol_engine_optimizer]: 7.524e-05, [1] [Cycle 1]: 7.078e-05, [6] [build]: 2.47001e-06 [elim_shapecalc]: 9.82999e-06 [elim_not_effective]: 1.293e-05 [opt_reshape]: 7.30998e-06 [fold_const_symbol]: 9.67999e-06 [renormalize]: 1.8999e-07 [detach_backward]: 1.84e-06 [pipeline_parallel_scheduler]: 1.66002e-06 [auto_monad_reorder]: 1.586e-05 [get_jit_bprop_graph]: 1.60001e-06 [rewriter_after_jit_bprop_graph]: 4.63001e-06 [opt_after_jit_grad]: 0.00045451 [validate]: 3.614e-05 Sums bootstrap : 0.000387s : 4.19% type_inference : 0.005034s : 54.59% event_method : 0.000011s : 0.12% auto_monad : 0.000051s : 0.55% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000013s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000024s : 0.26% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000017s : 0.19% optimize.rewriter_before_opt_a : 0.000044s : 0.48% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000031s : 0.33% optimize.opt_a.loop_unroll : 0.000020s : 0.22% optimize.opt_a.a_1 : 0.000389s : 4.22% optimize.opt_a.with_stream_mark : 0.000026s : 0.29% optimize.opt_a.recompute_prepare : 0.000014s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000148s : 1.61% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.13% optimize.opt_a.shard : 0.000003s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.13% optimize.opt_a.merge_send_recv : 0.000013s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.14% optimize.opt_a.parallel : 0.000022s : 0.24% optimize.opt_a.flash_sp : 0.000012s : 0.13% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.13% optimize.opt_a.merge_forward : 0.000006s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000019s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.04% optimize.opt_a.after_resolve : 0.000017s : 0.19% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000498s : 5.40% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.23% optimize.opt_a.cse : 0.000035s : 0.38% optimize.opt_a.a_3 : 0.000078s : 0.85% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.19% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000034s : 0.37% optimize.convert_after_rewriter : 0.000006s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000487s : 5.28% optimize.opt_b.b_1 : 0.000134s : 1.46% optimize.opt_b.b_2 : 0.000009s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000019s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.26% optimize.loop_unroll : 0.000427s : 4.63% optimize.opt_after_cconv.c_1 : 0.000027s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.15% optimize.tuple_transform.d_1 : 0.000040s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000043s : 0.47% optimize.cse_after_recomputation.cse : 0.000010s : 0.11% optimize.environ_conv : 0.000005s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000002s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.02% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.03% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.02% optimize.overlap_recompute_comm : 0.000003s : 0.03% optimize.overlap_grad_ring_attention : 0.000004s : 0.05% optimize.overlap_grad_flash_sp : 0.000017s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.03% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000002s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000016s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000455s : 4.93% validate : 0.000036s : 0.39% Time group info: ------[substitution.] 0.000113 20 1.59% : 0.000002s : 2: substitution.elim_not_effective 1.05% : 0.000001s : 2: substitution.fold_const_symbol 4.89% : 0.000006s : 3: substitution.graph_param_transform 74.94% : 0.000085s : 2: substitution.inline 2.72% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.86% : 0.000004s : 4: substitution.remove_not_recompute_node 2.46% : 0.000003s : 2: substitution.replace_old_param 8.48% : 0.000010s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004993 2 91.74% : 0.004580s : 1: type_inference.infer 8.26% : 0.000412s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000083 2 100.00% : 0.000083s : 2: match.inline ------[predicate.] 0.000132 754 0.77% : 0.000001s : 7: predicate.accumulaten_eliminater 1.31% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.63% : 0.000001s : 6: predicate.addn_check_dump 0.78% : 0.000001s : 7: predicate.addn_zero_filter 0.70% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.29% : 0.000003s : 13: predicate.arithmetic_simplify 0.77% : 0.000001s : 7: predicate.cast_eliminate 0.72% : 0.000001s : 6: predicate.check_bprop_eliminate 0.75% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.73% : 0.000001s : 6: predicate.depend_value_elim 0.76% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.88% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.71% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.27% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 3: predicate.elim_not_effective 0.51% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.03% : 0.000001s : 10: predicate.environ_get_depend_swap 1.68% : 0.000002s : 16: predicate.environ_get_eliminate 1.03% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.93% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.95% : 0.000003s : 9: predicate.float_depend_g_call 0.61% : 0.000001s : 6: predicate.float_environ_get_switch 0.96% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.81% : 0.000001s : 6: predicate.get_grad_eliminate 0.41% : 0.000001s : 3: predicate.graph_param_transform 0.81% : 0.000001s : 6: predicate.incorporate_call 0.65% : 0.000001s : 6: predicate.incorporate_call_switch 6.35% : 0.000008s : 34: predicate.inline 1.19% : 0.000002s : 6: predicate.inline_without_move 0.41% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.04% : 0.000001s : 6: predicate.less_batch_normalization 1.68% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.12% : 0.000003s : 20: predicate.load_eliminater 1.22% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.73% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.71% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.72% : 0.000001s : 6: predicate.merge_addn 0.66% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.80% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.68% : 0.000001s : 7: predicate.minmaximum_grad 1.33% : 0.000002s : 3: predicate.mutable_eliminate 0.50% : 0.000001s : 3: predicate.opt_reshape 0.60% : 0.000001s : 3: predicate.parallel_virtual_node 1.25% : 0.000002s : 9: predicate.partial_defer_inline 1.21% : 0.000002s : 10: predicate.partial_eliminate 1.00% : 0.000001s : 7: predicate.print_const_string_wrapper 0.76% : 0.000001s : 6: predicate.reduce_all_const_elim 1.07% : 0.000001s : 7: predicate.reduce_eliminate 2.09% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.69% : 0.000001s : 6: predicate.remove_not_recompute_node 1.22% : 0.000002s : 13: predicate.replace_applicator 0.65% : 0.000001s : 6: predicate.replace_old_param 0.43% : 0.000001s : 3: predicate.reset_defer_inline 0.88% : 0.000001s : 7: predicate.reshape_eliminate 0.75% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.66% : 0.000001s : 3: predicate.row_tensor_eliminate 0.91% : 0.000001s : 6: predicate.same_eliminate 0.63% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.94% : 0.000001s : 6: predicate.shard_identity_eliminate 0.91% : 0.000001s : 6: predicate.special_op_eliminate 0.86% : 0.000001s : 6: predicate.specialize_transform 1.07% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.94% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.08% : 0.000001s : 9: predicate.switch_defer_inline 1.68% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.55% : 0.000006s : 32: predicate.switch_simplify 0.81% : 0.000001s : 7: predicate.tile_eliminate 0.81% : 0.000001s : 7: predicate.transpose_eliminate 1.77% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 2.98% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.42% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.68% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.56% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.00% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.99% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.66% : 0.000001s : 3: predicate.value_based_eliminate 0.77% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.97% : 0.000001s : 6: predicate.virtual_output_eliminate 0.36% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.59% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000198 5 8.37% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.63% : 0.000181s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.021735 192 0.02% : 0.000003s : 1: ForceFp32Comm 14.28% : 0.003104s : 1: add_attr 14.24% : 0.003094s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000047s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000056s : 1: auto_monad 0.09% : 0.000020s : 1: auto_monad_reorder 0.02% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.88% : 0.000409s : 1: bootstrap 0.13% : 0.000028s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000009s : 1: convert_after_rewriter 0.11% : 0.000023s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000008s : 1: environ_conv 0.08% : 0.000017s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 2.00% : 0.000435s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.03% : 0.000005s : 1: micro_interleaved_order_control 2.28% : 0.000496s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 3.42% : 0.000744s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.51% : 0.000111s : 28: opt.transform.opt_b 0.20% : 0.000044s : 2: opt.transform.opt_trans_graph 0.17% : 0.000036s : 4: opt.transform.symbol_engine_opt 9.40% : 0.002042s : 1: opt_a 0.49% : 0.000106s : 1: opt_after_cconv 2.13% : 0.000463s : 1: opt_after_jit_grad 1.01% : 0.000220s : 1: opt_b 18.36% : 0.003991s : 1: optimize 0.10% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000021s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.13% : 0.000028s : 1: pre_auto_parallel 0.10% : 0.000021s : 1: py_interpret_to_execute 0.10% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000017s : 1: remove_dup_value 1.26% : 0.000274s : 1: renormalize.infer 1.00% : 0.000218s : 1: renormalize.specialize 0.03% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000038s : 1: rewriter_after_opt_a 0.22% : 0.000048s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000078s : 1: symbol_engine_optimizer 0.35% : 0.000076s : 1: tuple_transform 23.23% : 0.005049s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:52.188.84 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:52.191.61 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0153634, [21] [bootstrap]: 0.00037764 [type_inference]: 0.00548025 [event_method]: 1.201e-05 [auto_monad]: 5.705e-05 [graph_reusing]: 5.46e-06 [inline]: 2.34999e-06 [add_attr]: 0.00336246, [1] [add_attr_with_inline]: 0.00335272, [1] [Cycle 1]: 6.911e-05, [2] [tag_attr]: 1.506e-05 [meta_addattr_fg_expand]: 3.68e-06 [parallel-infer-symbol]: 3.46001e-06 [pre_auto_parallel]: 2.668e-05 [insert-virtual-dataset]: 2.82002e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 1.97001e-06 [pipeline_split]: 1.92999e-06 [optimize]: 0.00489851, [53] [py_interpret_to_execute]: 2.281e-05 [rewriter_before_opt_a]: 4.958e-05 [opt_a]: 0.00250269, [2] [Cycle 1]: 0.00170807, [45] [expand_dump_flag]: 3.41001e-06 [switch_simplify]: 2.412e-05 [loop_unroll]: 1.409e-05 [a_1]: 0.00031073 [with_stream_mark]: 1.769e-05 [recompute_prepare]: 7.85e-06 [updatestate_depend_eliminate]: 4.22998e-06 [updatestate_assign_eliminate]: 3.79002e-06 [updatestate_loads_eliminate]: 2.96999e-06 [parameter_eliminate]: 2.03002e-06 [a_2]: 0.00011401 [accelerated_algorithm]: 7.75e-06 [shard]: 1.94999e-06 [meta_shard_fg_expand]: 2.24001e-06 [shard_inline]: 6.12999e-06 [merge_send_recv]: 9.14998e-06 [auto_parallel]: 6.11998e-06 [parallel]: 1.958e-05 [flash_sp]: 7.63999e-06 [merge_comm]: 3.72002e-06 [allreduce_fusion]: 3.82002e-06 [matmul_add_comm_reduction]: 9.67999e-06 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 7.58001e-06 [virtual_dataset]: 7.39002e-06 [get_grad_eliminate_]: 7.55998e-06 [virtual_output]: 6.58e-06 [merge_forward]: 4.43999e-06 [cell_reuse_recompute_pass]: 1.71e-06 [offload_activation]: 1.021e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.481e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.073e-05 [set_forward_comm_id_for_comm_node_pass]: 3.77002e-06 [meta_fg_expand]: 2.80002e-06 [flash_sp_send_recv_attached]: 2.81e-06 [receive_attached]: 2.49999e-06 [after_resolve]: 9.96e-06 [a_after_grad]: 9.51998e-06 [renormalize]: 0.00052169 [add_forward_monad_depend]: 5.22e-06 [auto_monad_grad]: 2.34001e-06 [auto_monad_eliminator]: 1.526e-05 [cse]: 3.047e-05 [a_3]: 5.93e-05 [Cycle 2]: 0.00078175, [45] [expand_dump_flag]: 9.50007e-07 [switch_simplify]: 6.98998e-06 [loop_unroll]: 5.98998e-06 [a_1]: 0.00010961 [with_stream_mark]: 1.085e-05 [recompute_prepare]: 6.23e-06 [updatestate_depend_eliminate]: 3.14999e-06 [updatestate_assign_eliminate]: 2.41e-06 [updatestate_loads_eliminate]: 2.71e-06 [parameter_eliminate]: 1.02998e-06 [a_2]: 9.641e-05 [accelerated_algorithm]: 6.04001e-06 [shard]: 1.04e-06 [meta_shard_fg_expand]: 1.34998e-06 [shard_inline]: 6.14001e-06 [merge_send_recv]: 4.93001e-06 [auto_parallel]: 5.93998e-06 [parallel]: 5.97999e-06 [flash_sp]: 3.26001e-06 [merge_comm]: 3.45e-06 [allreduce_fusion]: 2.96001e-06 [matmul_add_comm_reduction]: 9.44998e-06 [allreduce_slice_to_reducescatter]: 3.00002e-07 [virtual_shard_identity]: 6.16e-06 [virtual_dataset]: 5.86e-06 [get_grad_eliminate_]: 5.49998e-06 [virtual_output]: 5.32999e-06 [merge_forward]: 2.84999e-06 [cell_reuse_recompute_pass]: 2.31e-06 [offload_activation]: 7.06999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.431e-05 [merge_recompute_call_nodes]: 7.60017e-07 [before_grad]: 9.04998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.71001e-06 [meta_fg_expand]: 2.06e-06 [flash_sp_send_recv_attached]: 9.09989e-07 [receive_attached]: 1.24e-06 [after_resolve]: 8.72998e-06 [a_after_grad]: 8.55999e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.18001e-06 [auto_monad_grad]: 8.39995e-07 [auto_monad_eliminator]: 7.15e-06 [cse]: 1.465e-05 [a_3]: 4.729e-05 [py_interpret_to_execute_after_opt_a]: 1.274e-05 [slice_cell_reuse_recomputed_activation]: 4.85001e-06 [rewriter_after_opt_a]: 4.174e-05 [convert_after_rewriter]: 1.003e-05 [order_py_execute_after_rewriter]: 8.35999e-06 [mutable_eliminate]: 0.00056595 [opt_b]: 0.00030607, [1] [Cycle 1]: 0.00029661, [7] [b_1]: 0.00020022 [b_2]: 8.36002e-06 [updatestate_depend_eliminate]: 5.45001e-06 [updatestate_assign_eliminate]: 2.66999e-06 [updatestate_loads_eliminate]: 2.26998e-06 [renormalize]: 4.2998e-07 [cse]: 1.956e-05 [optimize_parallel_all_gather_comm]: 1.974e-05 [overlap_param_gather]: 4.58999e-06 [cconv]: 2.779e-05 [loop_unroll]: 0.00049136 [opt_after_cconv]: 0.00012995, [1] [Cycle 1]: 0.00012085, [7] [c_1]: 3.073e-05 [parameter_eliminate]: 2.50002e-06 [updatestate_depend_eliminate]: 5.45001e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 2.86999e-06 [cse]: 2.063e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.841e-05 [tuple_transform]: 9.17e-05, [1] [Cycle 1]: 8.438e-05, [4] [d_1]: 4.449e-05 [none_parameter_eliminate]: 1.66002e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 6.97002e-06 [partial_unused_args_eliminate]: 4.52e-06 [add_recomputation]: 6.218e-05 [cse_after_recomputation]: 2.944e-05, [1] [Cycle 1]: 2.259e-05, [1] [cse]: 1.368e-05 [environ_conv]: 8.55999e-06 [swap_dp_allreduce_reducescatter]: 8.02e-06 [bias_add_comm_swap]: 5.30999e-06 [label_micro_interleaved_index]: 7.25998e-06 [label_fine_grained_interleaved_index]: 5.44e-06 [merge_cast_opt]: 4.10998e-06 [slice_recompute_activation]: 5.09e-06 [micro_interleaved_order_control]: 4.52e-06 [assign_add_opt]: 4.03999e-06 [ForceFp32Comm]: 3.45e-06 [remove_cast_before_assign_add]: 3.45e-06 [full_micro_interleaved_order_control]: 4.45999e-06 [reorder_send_recv_between_fp_bp]: 5.22e-06 [comm_op_add_attrs]: 3.5e-06 [add_comm_op_reuse_tag]: 3.4e-06 [interleave_split_concat_branches]: 3.81001e-06 [interleave_parallel_branches]: 3.40998e-06 [overlap_opt_shard_in_pipeline]: 3.7e-06 [overlap_opt_shard_grad_in_pipeline]: 4.32e-06 [control_data_broadcast_order]: 1.558e-05 [grouped_pairwise_exchange_alltoall]: 4.45e-06 [offloading_packed_experts]: 6.86001e-06 [overlap_recompute_and_grad_model_parallel]: 7.08e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.63e-06 [overlap_recompute_allgather_and_fa_grad]: 4e-06 [overlap_recompute_comm]: 5.36002e-06 [overlap_grad_ring_attention]: 6.58e-06 [overlap_grad_flash_sp]: 2.214e-05 [begin_end_overlap_inline]: 2.89999e-06 [split_matmul_comm_elemetwise]: 4.67998e-06 [split_layernorm_comm]: 4.29002e-06 [handle_group_info]: 3.29001e-06 [symbol_engine_optimizer]: 9.802e-05, [1] [Cycle 1]: 9.133e-05, [6] [build]: 3.23998e-06 [elim_shapecalc]: 1.021e-05 [elim_not_effective]: 1.344e-05 [opt_reshape]: 7.38999e-06 [fold_const_symbol]: 1.025e-05 [renormalize]: 2.50002e-07 [detach_backward]: 3.64002e-06 [pipeline_parallel_scheduler]: 1.86e-06 [auto_monad_reorder]: 1.743e-05 [get_jit_bprop_graph]: 2.39999e-06 [rewriter_after_jit_bprop_graph]: 4.09002e-06 [opt_after_jit_grad]: 0.00052311 [validate]: 3.697e-05 Sums bootstrap : 0.000378s : 3.67% type_inference : 0.005480s : 53.22% event_method : 0.000012s : 0.12% auto_monad : 0.000057s : 0.55% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000027s : 0.26% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000023s : 0.22% optimize.rewriter_before_opt_a : 0.000050s : 0.48% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000031s : 0.30% optimize.opt_a.loop_unroll : 0.000020s : 0.20% optimize.opt_a.a_1 : 0.000420s : 4.08% optimize.opt_a.with_stream_mark : 0.000029s : 0.28% optimize.opt_a.recompute_prepare : 0.000014s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000210s : 2.04% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.12% optimize.opt_a.merge_send_recv : 0.000014s : 0.14% optimize.opt_a.auto_parallel : 0.000012s : 0.12% optimize.opt_a.parallel : 0.000026s : 0.25% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.19% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.13% optimize.opt_a.virtual_dataset : 0.000013s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.13% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.04% optimize.opt_a.offload_activation : 0.000017s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000019s : 0.18% optimize.opt_a.a_after_grad : 0.000018s : 0.18% optimize.opt_a.renormalize : 0.000522s : 5.07% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.22% optimize.opt_a.cse : 0.000045s : 0.44% optimize.opt_a.a_3 : 0.000107s : 1.04% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000042s : 0.41% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000566s : 5.50% optimize.opt_b.b_1 : 0.000200s : 1.94% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000028s : 0.27% optimize.loop_unroll : 0.000491s : 4.77% optimize.opt_after_cconv.c_1 : 0.000031s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000021s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.18% optimize.tuple_transform.d_1 : 0.000044s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000062s : 0.60% optimize.cse_after_recomputation.cse : 0.000014s : 0.13% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000016s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000022s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000017s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000523s : 5.08% validate : 0.000037s : 0.36% Time group info: ------[substitution.] 0.000135 20 1.49% : 0.000002s : 2: substitution.elim_not_effective 1.04% : 0.000001s : 2: substitution.fold_const_symbol 4.41% : 0.000006s : 3: substitution.graph_param_transform 65.11% : 0.000088s : 2: substitution.inline 2.49% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.58% : 0.000005s : 4: substitution.remove_not_recompute_node 2.75% : 0.000004s : 2: substitution.replace_old_param 19.13% : 0.000026s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005430 2 91.03% : 0.004943s : 1: type_inference.infer 8.97% : 0.000487s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000086 2 100.00% : 0.000086s : 2: match.inline ------[predicate.] 0.000141 754 0.80% : 0.000001s : 7: predicate.accumulaten_eliminater 0.94% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.71% : 0.000001s : 6: predicate.addn_check_dump 0.80% : 0.000001s : 7: predicate.addn_zero_filter 0.66% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.24% : 0.000003s : 13: predicate.arithmetic_simplify 0.83% : 0.000001s : 7: predicate.cast_eliminate 0.83% : 0.000001s : 6: predicate.check_bprop_eliminate 0.67% : 0.000001s : 6: predicate.compare_switch_simplify 0.20% : 0.000000s : 3: predicate.const_output_eliminate 0.76% : 0.000001s : 6: predicate.depend_value_elim 0.85% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.90% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.86% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.21% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 3: predicate.elim_not_effective 0.46% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000002s : 10: predicate.environ_add_const_eliminate 1.01% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.92% : 0.000001s : 10: predicate.environ_get_depend_swap 1.86% : 0.000003s : 16: predicate.environ_get_eliminate 1.10% : 0.000002s : 10: predicate.environ_get_set_eliminate 1.02% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.94% : 0.000003s : 9: predicate.float_depend_g_call 0.68% : 0.000001s : 6: predicate.float_environ_get_switch 1.00% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 3: predicate.fold_const_symbol 1.12% : 0.000002s : 6: predicate.get_grad_eliminate 0.21% : 0.000000s : 3: predicate.graph_param_transform 0.77% : 0.000001s : 6: predicate.incorporate_call 0.65% : 0.000001s : 6: predicate.incorporate_call_switch 6.61% : 0.000009s : 34: predicate.inline 0.98% : 0.000001s : 6: predicate.inline_without_move 0.36% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.10% : 0.000002s : 6: predicate.less_batch_normalization 1.55% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.06% : 0.000003s : 20: predicate.load_eliminater 1.18% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.69% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.89% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.83% : 0.000001s : 6: predicate.merge_addn 0.73% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.65% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 7: predicate.minmaximum_grad 1.32% : 0.000002s : 3: predicate.mutable_eliminate 0.48% : 0.000001s : 3: predicate.opt_reshape 0.69% : 0.000001s : 3: predicate.parallel_virtual_node 1.42% : 0.000002s : 9: predicate.partial_defer_inline 1.17% : 0.000002s : 10: predicate.partial_eliminate 0.85% : 0.000001s : 7: predicate.print_const_string_wrapper 0.73% : 0.000001s : 6: predicate.reduce_all_const_elim 0.86% : 0.000001s : 7: predicate.reduce_eliminate 2.26% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.69% : 0.000001s : 6: predicate.remove_not_recompute_node 1.16% : 0.000002s : 13: predicate.replace_applicator 0.63% : 0.000001s : 6: predicate.replace_old_param 0.33% : 0.000000s : 3: predicate.reset_defer_inline 0.78% : 0.000001s : 7: predicate.reshape_eliminate 0.85% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.56% : 0.000001s : 3: predicate.row_tensor_eliminate 0.91% : 0.000001s : 6: predicate.same_eliminate 0.49% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.91% : 0.000001s : 6: predicate.shard_identity_eliminate 0.89% : 0.000001s : 6: predicate.special_op_eliminate 0.85% : 0.000001s : 6: predicate.specialize_transform 1.22% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.97% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.09% : 0.000002s : 9: predicate.switch_defer_inline 1.78% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.45% : 0.000006s : 32: predicate.switch_simplify 0.91% : 0.000001s : 7: predicate.tile_eliminate 0.76% : 0.000001s : 7: predicate.transpose_eliminate 1.59% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.73% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.59% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.29% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.39% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.38% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.48% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.89% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.97% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.70% : 0.000001s : 3: predicate.value_based_eliminate 0.99% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.77% : 0.000001s : 6: predicate.virtual_output_eliminate 0.32% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.65% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000216 5 7.88% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.12% : 0.000199s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025107 192 0.03% : 0.000006s : 1: ForceFp32Comm 13.43% : 0.003371s : 1: add_attr 13.37% : 0.003356s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.26% : 0.000066s : 1: add_recomputation 0.03% : 0.000007s : 1: assign_add_opt 0.26% : 0.000066s : 1: auto_monad 0.10% : 0.000024s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.67% : 0.000420s : 1: bootstrap 0.12% : 0.000031s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000032s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000018s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.13% : 0.000033s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000009s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.04% : 0.000010s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.98% : 0.000498s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.28% : 0.000572s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 3.16% : 0.000792s : 78: opt.transform.opt_a 0.11% : 0.000029s : 1: opt.transform.opt_after_cconv 0.10% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.52% : 0.000131s : 28: opt.transform.opt_b 0.19% : 0.000048s : 2: opt.transform.opt_trans_graph 0.15% : 0.000037s : 4: opt.transform.symbol_engine_opt 9.98% : 0.002506s : 1: opt_a 0.53% : 0.000133s : 1: opt_after_cconv 2.12% : 0.000533s : 1: opt_after_jit_grad 1.23% : 0.000310s : 1: opt_b 20.68% : 0.005191s : 1: optimize 0.09% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000008s : 1: pipeline_split 0.14% : 0.000035s : 1: pre_auto_parallel 0.10% : 0.000026s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000022s : 1: remove_dup_value 1.14% : 0.000287s : 1: renormalize.infer 0.90% : 0.000227s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000045s : 1: rewriter_after_opt_a 0.21% : 0.000053s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000101s : 1: symbol_engine_optimizer 0.38% : 0.000095s : 1: tuple_transform 21.95% : 0.005511s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:52.217.421 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0172656, [21] [bootstrap]: 0.00054045 [type_inference]: 0.00663833 [event_method]: 1.485e-05 [auto_monad]: 6.097e-05 [graph_reusing]: 6.24999e-06 [inline]: 3.48e-06 [add_attr]: 0.00389101, [1] [add_attr_with_inline]: 0.00387824, [1] [Cycle 1]: 6.997e-05, [2] [tag_attr]: 1.851e-05 [meta_addattr_fg_expand]: 3.88001e-06 [parallel-infer-symbol]: 3.88001e-06 [pre_auto_parallel]: 3.201e-05 [insert-virtual-dataset]: 2.58e-06 [parallel-infer-symbol-second]: 6.80011e-07 [dataset_repeat_opt]: 3.55e-06 [pipeline_split]: 1.76998e-06 [optimize]: 0.00517656, [53] [py_interpret_to_execute]: 2.137e-05 [rewriter_before_opt_a]: 5.269e-05 [opt_a]: 0.00254441, [2] [Cycle 1]: 0.00181161, [45] [expand_dump_flag]: 2.83e-06 [switch_simplify]: 2.81e-05 [loop_unroll]: 1.373e-05 [a_1]: 0.00032494 [with_stream_mark]: 2.121e-05 [recompute_prepare]: 1.083e-05 [updatestate_depend_eliminate]: 4.05e-06 [updatestate_assign_eliminate]: 2.99999e-06 [updatestate_loads_eliminate]: 3.33e-06 [parameter_eliminate]: 2.25002e-06 [a_2]: 8.291e-05 [accelerated_algorithm]: 7.71999e-06 [shard]: 2.67001e-06 [meta_shard_fg_expand]: 1.96e-06 [shard_inline]: 6.93998e-06 [merge_send_recv]: 9.01002e-06 [auto_parallel]: 8e-06 [parallel]: 1.966e-05 [flash_sp]: 1.148e-05 [merge_comm]: 3.98001e-06 [allreduce_fusion]: 4.12e-06 [matmul_add_comm_reduction]: 1.082e-05 [allreduce_slice_to_reducescatter]: 9.79984e-07 [virtual_shard_identity]: 8.93002e-06 [virtual_dataset]: 6.45002e-06 [get_grad_eliminate_]: 7.38999e-06 [virtual_output]: 6.14001e-06 [merge_forward]: 4.34997e-06 [cell_reuse_recompute_pass]: 1.59998e-06 [offload_activation]: 1.152e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.431e-05 [merge_recompute_call_nodes]: 1.72999e-06 [before_grad]: 1.212e-05 [set_forward_comm_id_for_comm_node_pass]: 3.75e-06 [meta_fg_expand]: 3.2e-06 [flash_sp_send_recv_attached]: 3.03e-06 [receive_attached]: 2.29999e-06 [after_resolve]: 1.264e-05 [a_after_grad]: 9.48002e-06 [renormalize]: 0.00072711 [add_forward_monad_depend]: 7.48e-06 [auto_monad_grad]: 3.41001e-06 [auto_monad_eliminator]: 1.925e-05 [cse]: 3.537e-05 [a_3]: 5.264e-05 [Cycle 2]: 0.00072094, [45] [expand_dump_flag]: 1.52999e-06 [switch_simplify]: 8.54002e-06 [loop_unroll]: 6.47001e-06 [a_1]: 0.000114 [with_stream_mark]: 1.594e-05 [recompute_prepare]: 7.1e-06 [updatestate_depend_eliminate]: 3.05998e-06 [updatestate_assign_eliminate]: 2.65002e-06 [updatestate_loads_eliminate]: 3.50998e-06 [parameter_eliminate]: 1.64e-06 [a_2]: 7.144e-05 [accelerated_algorithm]: 6.69999e-06 [shard]: 1.62999e-06 [meta_shard_fg_expand]: 2.09e-06 [shard_inline]: 5.87001e-06 [merge_send_recv]: 7.01001e-06 [auto_parallel]: 6.96999e-06 [parallel]: 7.25998e-06 [flash_sp]: 4.23001e-06 [merge_comm]: 4.49998e-06 [allreduce_fusion]: 3.31001e-06 [matmul_add_comm_reduction]: 8.28999e-06 [allreduce_slice_to_reducescatter]: 3.09985e-07 [virtual_shard_identity]: 8.37e-06 [virtual_dataset]: 6.98998e-06 [get_grad_eliminate_]: 5.79e-06 [virtual_output]: 5.46e-06 [merge_forward]: 5.17999e-06 [cell_reuse_recompute_pass]: 2.62001e-06 [offload_activation]: 9.00999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.883e-05 [merge_recompute_call_nodes]: 1.84e-06 [before_grad]: 1.064e-05 [set_forward_comm_id_for_comm_node_pass]: 4.90001e-06 [meta_fg_expand]: 2.49001e-06 [flash_sp_send_recv_attached]: 1.06002e-06 [receive_attached]: 1.66e-06 [after_resolve]: 1.138e-05 [a_after_grad]: 9.01002e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.72001e-06 [auto_monad_grad]: 2.04e-06 [auto_monad_eliminator]: 1.17e-05 [cse]: 2.23e-05 [a_3]: 3.757e-05 [py_interpret_to_execute_after_opt_a]: 1.394e-05 [slice_cell_reuse_recomputed_activation]: 2.46e-06 [rewriter_after_opt_a]: 4.282e-05 [convert_after_rewriter]: 7.13e-06 [order_py_execute_after_rewriter]: 5.80002e-06 [mutable_eliminate]: 0.00079091 [opt_b]: 0.00028076, [1] [Cycle 1]: 0.0002709, [7] [b_1]: 0.00017272 [b_2]: 9.59e-06 [updatestate_depend_eliminate]: 8.97e-06 [updatestate_assign_eliminate]: 2.79999e-06 [updatestate_loads_eliminate]: 2.79999e-06 [renormalize]: 9.30013e-07 [cse]: 3.108e-05 [optimize_parallel_all_gather_comm]: 2.099e-05 [overlap_param_gather]: 1.94999e-06 [cconv]: 3.562e-05 [loop_unroll]: 0.00060771 [opt_after_cconv]: 0.00012387, [1] [Cycle 1]: 0.00011654, [7] [c_1]: 3.152e-05 [parameter_eliminate]: 5.74999e-06 [updatestate_depend_eliminate]: 7.26999e-06 [updatestate_assign_eliminate]: 4.52e-06 [updatestate_loads_eliminate]: 2.44001e-06 [cse]: 2.655e-05 [renormalize]: 6.00005e-07 [remove_dup_value]: 1.564e-05 [tuple_transform]: 7.943e-05, [1] [Cycle 1]: 7.369e-05, [4] [d_1]: 4.457e-05 [none_parameter_eliminate]: 2.16e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 7.36999e-06 [partial_unused_args_eliminate]: 1.84e-06 [add_recomputation]: 5.464e-05 [cse_after_recomputation]: 2.661e-05, [1] [Cycle 1]: 2.044e-05, [1] [cse]: 1.35e-05 [environ_conv]: 6.16e-06 [swap_dp_allreduce_reducescatter]: 5.34003e-06 [bias_add_comm_swap]: 2.76e-06 [label_micro_interleaved_index]: 6.30002e-06 [label_fine_grained_interleaved_index]: 2.94001e-06 [merge_cast_opt]: 1.69e-06 [slice_recompute_activation]: 2.11e-06 [micro_interleaved_order_control]: 2.26e-06 [assign_add_opt]: 1.45001e-06 [ForceFp32Comm]: 9.70002e-07 [remove_cast_before_assign_add]: 1.10001e-06 [full_micro_interleaved_order_control]: 2.14e-06 [reorder_send_recv_between_fp_bp]: 2.94999e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 9.5999e-07 [interleave_split_concat_branches]: 1.31002e-06 [interleave_parallel_branches]: 1.09e-06 [overlap_opt_shard_in_pipeline]: 1.27e-06 [overlap_opt_shard_grad_in_pipeline]: 2.45002e-06 [control_data_broadcast_order]: 1.47e-05 [grouped_pairwise_exchange_alltoall]: 1.81998e-06 [offloading_packed_experts]: 4.59998e-06 [overlap_recompute_and_grad_model_parallel]: 5.10999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.10999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40999e-06 [overlap_recompute_comm]: 2.31e-06 [overlap_grad_ring_attention]: 4.62e-06 [overlap_grad_flash_sp]: 2.185e-05 [begin_end_overlap_inline]: 6.19999e-07 [split_matmul_comm_elemetwise]: 2.22999e-06 [split_layernorm_comm]: 1.98002e-06 [handle_group_info]: 1.22e-06 [symbol_engine_optimizer]: 8.999e-05, [1] [Cycle 1]: 8.398e-05, [6] [build]: 3.6e-06 [elim_shapecalc]: 1.353e-05 [elim_not_effective]: 1.356e-05 [opt_reshape]: 7.54002e-06 [fold_const_symbol]: 1.089e-05 [renormalize]: 4.19997e-07 [detach_backward]: 2.31e-06 [pipeline_parallel_scheduler]: 1.84e-06 [auto_monad_reorder]: 1.954e-05 [get_jit_bprop_graph]: 2.06e-06 [rewriter_after_jit_bprop_graph]: 6.48e-06 [opt_after_jit_grad]: 0.00062889 [validate]: 4.903e-05 Sums bootstrap : 0.000540s : 4.41% type_inference : 0.006638s : 54.12% event_method : 0.000015s : 0.12% auto_monad : 0.000061s : 0.50% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.03% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000032s : 0.26% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000004s : 0.03% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000021s : 0.17% optimize.rewriter_before_opt_a : 0.000053s : 0.43% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000037s : 0.30% optimize.opt_a.loop_unroll : 0.000020s : 0.16% optimize.opt_a.a_1 : 0.000439s : 3.58% optimize.opt_a.with_stream_mark : 0.000037s : 0.30% optimize.opt_a.recompute_prepare : 0.000018s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000154s : 1.26% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.12% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.10% optimize.opt_a.merge_send_recv : 0.000016s : 0.13% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000027s : 0.22% optimize.opt_a.flash_sp : 0.000016s : 0.13% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.14% optimize.opt_a.virtual_dataset : 0.000013s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.09% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.03% optimize.opt_a.before_grad : 0.000023s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000024s : 0.20% optimize.opt_a.a_after_grad : 0.000018s : 0.15% optimize.opt_a.renormalize : 0.000727s : 5.93% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.08% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.25% optimize.opt_a.cse : 0.000058s : 0.47% optimize.opt_a.a_3 : 0.000090s : 0.74% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000043s : 0.35% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000791s : 6.45% optimize.opt_b.b_1 : 0.000173s : 1.41% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000031s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000036s : 0.29% optimize.loop_unroll : 0.000608s : 4.95% optimize.opt_after_cconv.c_1 : 0.000032s : 0.26% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000027s : 0.22% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.13% optimize.tuple_transform.d_1 : 0.000045s : 0.36% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000055s : 0.45% optimize.cse_after_recomputation.cse : 0.000013s : 0.11% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.04% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000006s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000022s : 0.18% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.16% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000629s : 5.13% validate : 0.000049s : 0.40% Time group info: ------[substitution.] 0.000167 20 1.24% : 0.000002s : 2: substitution.elim_not_effective 0.83% : 0.000001s : 2: substitution.fold_const_symbol 3.59% : 0.000006s : 3: substitution.graph_param_transform 65.28% : 0.000109s : 2: substitution.inline 2.83% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.93% : 0.000005s : 4: substitution.remove_not_recompute_node 3.11% : 0.000005s : 2: substitution.replace_old_param 20.19% : 0.000034s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006575 2 92.00% : 0.006049s : 1: type_inference.infer 8.00% : 0.000526s : 1: type_inference.specialize ------[replace.] 0.000025 2 100.00% : 0.000025s : 2: replace.inline ------[match.] 0.000107 2 100.00% : 0.000107s : 2: match.inline ------[predicate.] 0.000152 754 0.75% : 0.000001s : 7: predicate.accumulaten_eliminater 1.58% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.64% : 0.000001s : 6: predicate.addn_check_dump 0.83% : 0.000001s : 7: predicate.addn_zero_filter 0.63% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.33% : 0.000004s : 13: predicate.arithmetic_simplify 0.85% : 0.000001s : 7: predicate.cast_eliminate 0.72% : 0.000001s : 6: predicate.check_bprop_eliminate 0.64% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.75% : 0.000001s : 6: predicate.depend_value_elim 0.67% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.79% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.69% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.20% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 3: predicate.elim_not_effective 0.60% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.04% : 0.000002s : 10: predicate.environ_add_const_eliminate 1.02% : 0.000002s : 10: predicate.environ_get_add_eliminate 0.87% : 0.000001s : 10: predicate.environ_get_depend_swap 1.54% : 0.000002s : 16: predicate.environ_get_eliminate 0.89% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.84% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.86% : 0.000003s : 9: predicate.float_depend_g_call 0.56% : 0.000001s : 6: predicate.float_environ_get_switch 0.91% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 3: predicate.fold_const_symbol 0.81% : 0.000001s : 6: predicate.get_grad_eliminate 0.37% : 0.000001s : 3: predicate.graph_param_transform 0.70% : 0.000001s : 6: predicate.incorporate_call 0.56% : 0.000001s : 6: predicate.incorporate_call_switch 5.90% : 0.000009s : 34: predicate.inline 1.22% : 0.000002s : 6: predicate.inline_without_move 0.53% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.68% : 0.000003s : 6: predicate.less_batch_normalization 1.94% : 0.000003s : 13: predicate.list_to_tuple_eliminator_ 1.93% : 0.000003s : 20: predicate.load_eliminater 1.59% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.48% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.68% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.58% : 0.000001s : 6: predicate.merge_addn 0.76% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.63% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.62% : 0.000001s : 7: predicate.minmaximum_grad 2.15% : 0.000003s : 3: predicate.mutable_eliminate 0.43% : 0.000001s : 3: predicate.opt_reshape 0.62% : 0.000001s : 3: predicate.parallel_virtual_node 1.20% : 0.000002s : 9: predicate.partial_defer_inline 1.12% : 0.000002s : 10: predicate.partial_eliminate 0.78% : 0.000001s : 7: predicate.print_const_string_wrapper 0.68% : 0.000001s : 6: predicate.reduce_all_const_elim 1.39% : 0.000002s : 7: predicate.reduce_eliminate 1.96% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.79% : 0.000001s : 6: predicate.remove_not_recompute_node 1.12% : 0.000002s : 13: predicate.replace_applicator 0.78% : 0.000001s : 6: predicate.replace_old_param 0.46% : 0.000001s : 3: predicate.reset_defer_inline 0.89% : 0.000001s : 7: predicate.reshape_eliminate 0.77% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.74% : 0.000001s : 3: predicate.row_tensor_eliminate 1.03% : 0.000002s : 6: predicate.same_eliminate 0.58% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.22% : 0.000002s : 6: predicate.shard_identity_eliminate 0.80% : 0.000001s : 6: predicate.special_op_eliminate 0.84% : 0.000001s : 6: predicate.specialize_transform 1.35% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.93% : 0.000001s : 9: predicate.switch_defer_inline 1.58% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.07% : 0.000006s : 32: predicate.switch_simplify 0.81% : 0.000001s : 7: predicate.tile_eliminate 0.93% : 0.000001s : 7: predicate.transpose_eliminate 1.41% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.92% : 0.000003s : 13: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.42% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.67% : 0.000003s : 13: predicate.tuple_list_get_set_item_eliminator 2.67% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.66% : 0.000003s : 13: predicate.tuple_to_list_eliminator_ 1.79% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.61% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.73% : 0.000001s : 3: predicate.value_based_eliminate 0.90% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.78% : 0.000001s : 6: predicate.virtual_output_eliminate 0.31% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.58% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000270 5 8.52% : 0.000023s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.48% : 0.000247s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028077 192 0.01% : 0.000004s : 1: ForceFp32Comm 13.88% : 0.003898s : 1: add_attr 13.83% : 0.003883s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.21% : 0.000059s : 1: add_recomputation 0.02% : 0.000005s : 1: assign_add_opt 0.24% : 0.000066s : 1: auto_monad 0.08% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 2.05% : 0.000576s : 1: bootstrap 0.14% : 0.000039s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000030s : 1: cse_after_recomputation 0.02% : 0.000006s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.08% : 0.000022s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000007s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000005s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 2.21% : 0.000620s : 1: loop_unroll 0.02% : 0.000005s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.86% : 0.000804s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000023s : 1: opt.transform.mutable_eliminate 2.97% : 0.000835s : 78: opt.transform.opt_a 0.11% : 0.000030s : 1: opt.transform.opt_after_cconv 0.11% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.51% : 0.000144s : 28: opt.transform.opt_b 0.17% : 0.000049s : 2: opt.transform.opt_trans_graph 0.15% : 0.000041s : 4: opt.transform.symbol_engine_opt 9.08% : 0.002548s : 1: opt_a 0.46% : 0.000128s : 1: opt_after_cconv 2.28% : 0.000641s : 1: opt_after_jit_grad 1.01% : 0.000285s : 1: opt_b 18.46% : 0.005184s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.13% : 0.000036s : 1: pre_auto_parallel 0.09% : 0.000026s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.43% : 0.000400s : 1: renormalize.infer 1.13% : 0.000317s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000048s : 1: rewriter_after_opt_a 0.20% : 0.000057s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000006s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000093s : 1: symbol_engine_optimizer 0.29% : 0.000082s : 1: tuple_transform 23.73% : 0.006664s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:52.430.912 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:52.431.231 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0174303, [21] [bootstrap]: 0.00047535 [type_inference]: 0.00574322 [event_method]: 1.437e-05 [auto_monad]: 6.021e-05 [graph_reusing]: 5.61e-06 [inline]: 2.36998e-06 [add_attr]: 0.00384268, [1] [add_attr_with_inline]: 0.00383103, [1] [Cycle 1]: 8.881e-05, [2] [tag_attr]: 1.815e-05 [meta_addattr_fg_expand]: 3.97002e-06 [parallel-infer-symbol]: 3.86001e-06 [pre_auto_parallel]: 3.204e-05 [insert-virtual-dataset]: 2.56998e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 2.56e-06 [pipeline_split]: 1.88997e-06 [optimize]: 0.00582149, [53] [py_interpret_to_execute]: 2.652e-05 [rewriter_before_opt_a]: 5.611e-05 [opt_a]: 0.0028952, [2] [Cycle 1]: 0.00201225, [45] [expand_dump_flag]: 3.38e-06 [switch_simplify]: 2.715e-05 [loop_unroll]: 1.451e-05 [a_1]: 0.00032086 [with_stream_mark]: 2.361e-05 [recompute_prepare]: 1.047e-05 [updatestate_depend_eliminate]: 5.09e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 3.06999e-06 [parameter_eliminate]: 2.29999e-06 [a_2]: 0.00011126 [accelerated_algorithm]: 6.73e-06 [shard]: 2.86e-06 [meta_shard_fg_expand]: 1.71e-06 [shard_inline]: 6.06e-06 [merge_send_recv]: 8.85001e-06 [auto_parallel]: 8.79e-06 [parallel]: 2.015e-05 [flash_sp]: 9.61e-06 [merge_comm]: 5.52001e-06 [allreduce_fusion]: 3.8e-06 [matmul_add_comm_reduction]: 9.99001e-06 [allreduce_slice_to_reducescatter]: 8.10018e-07 [virtual_shard_identity]: 9.56e-06 [virtual_dataset]: 7e-06 [get_grad_eliminate_]: 6.04001e-06 [virtual_output]: 7.11001e-06 [merge_forward]: 3.98999e-06 [cell_reuse_recompute_pass]: 1.56002e-06 [offload_activation]: 1.166e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.681e-05 [merge_recompute_call_nodes]: 1.75001e-06 [before_grad]: 1.254e-05 [set_forward_comm_id_for_comm_node_pass]: 3.93999e-06 [meta_fg_expand]: 2.94001e-06 [flash_sp_send_recv_attached]: 2.68e-06 [receive_attached]: 2.36998e-06 [after_resolve]: 1.114e-05 [a_after_grad]: 1.134e-05 [renormalize]: 0.00074659 [add_forward_monad_depend]: 6.11e-06 [auto_monad_grad]: 2.64999e-06 [auto_monad_eliminator]: 1.905e-05 [cse]: 3.111e-05 [a_3]: 6.787e-05 [Cycle 2]: 0.00086556, [45] [expand_dump_flag]: 1.84e-06 [switch_simplify]: 8.25e-06 [loop_unroll]: 6.36e-06 [a_1]: 0.00011573 [with_stream_mark]: 1.754e-05 [recompute_prepare]: 7.66001e-06 [updatestate_depend_eliminate]: 3.75e-06 [updatestate_assign_eliminate]: 2.71e-06 [updatestate_loads_eliminate]: 3.06001e-06 [parameter_eliminate]: 1.18001e-06 [a_2]: 9.909e-05 [accelerated_algorithm]: 6.12999e-06 [shard]: 2.36e-06 [meta_shard_fg_expand]: 2.66e-06 [shard_inline]: 6.88e-06 [merge_send_recv]: 6.16e-06 [auto_parallel]: 6.76e-06 [parallel]: 8.13999e-06 [flash_sp]: 4.57e-06 [merge_comm]: 3.23998e-06 [allreduce_fusion]: 3.61001e-06 [matmul_add_comm_reduction]: 8.50001e-06 [allreduce_slice_to_reducescatter]: 5.00004e-07 [virtual_shard_identity]: 7.97998e-06 [virtual_dataset]: 6.51e-06 [get_grad_eliminate_]: 5.67999e-06 [virtual_output]: 5.72001e-06 [merge_forward]: 3.9e-06 [cell_reuse_recompute_pass]: 2.06e-06 [offload_activation]: 9.00001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.752e-05 [merge_recompute_call_nodes]: 1.33002e-06 [before_grad]: 1.08e-05 [set_forward_comm_id_for_comm_node_pass]: 5.66e-06 [meta_fg_expand]: 2.81e-06 [flash_sp_send_recv_attached]: 1.59e-06 [receive_attached]: 1.65001e-06 [after_resolve]: 1.051e-05 [a_after_grad]: 9.39998e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 2.04999e-06 [auto_monad_grad]: 1.44998e-06 [auto_monad_eliminator]: 1.024e-05 [cse]: 1.979e-05 [a_3]: 5.057e-05 [py_interpret_to_execute_after_opt_a]: 1.803e-05 [slice_cell_reuse_recomputed_activation]: 4.89003e-06 [rewriter_after_opt_a]: 4.886e-05 [convert_after_rewriter]: 1.042e-05 [order_py_execute_after_rewriter]: 9.31e-06 [mutable_eliminate]: 0.00078474 [opt_b]: 0.00034344, [1] [Cycle 1]: 0.00033118, [7] [b_1]: 0.0002166 [b_2]: 8.63001e-06 [updatestate_depend_eliminate]: 8.80001e-06 [updatestate_assign_eliminate]: 2.80997e-06 [updatestate_loads_eliminate]: 2.50002e-06 [renormalize]: 1.02e-06 [cse]: 2.946e-05 [optimize_parallel_all_gather_comm]: 2.301e-05 [overlap_param_gather]: 5.12e-06 [cconv]: 3.696e-05 [loop_unroll]: 0.00061337 [opt_after_cconv]: 0.00014917, [1] [Cycle 1]: 0.00013886, [7] [c_1]: 3.209e-05 [parameter_eliminate]: 5.24e-06 [updatestate_depend_eliminate]: 7.58001e-06 [updatestate_assign_eliminate]: 2.94001e-06 [updatestate_loads_eliminate]: 3.24001e-06 [cse]: 2.787e-05 [renormalize]: 1.8999e-07 [remove_dup_value]: 1.96e-05 [tuple_transform]: 0.00013667, [1] [Cycle 1]: 0.00012806, [4] [d_1]: 8.051e-05 [none_parameter_eliminate]: 1.94e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 9.05999e-06 [partial_unused_args_eliminate]: 5.67001e-06 [add_recomputation]: 5.812e-05 [cse_after_recomputation]: 3.448e-05, [1] [Cycle 1]: 2.626e-05, [1] [cse]: 1.601e-05 [environ_conv]: 1.002e-05 [swap_dp_allreduce_reducescatter]: 8.45999e-06 [bias_add_comm_swap]: 5.74999e-06 [label_micro_interleaved_index]: 8.11002e-06 [label_fine_grained_interleaved_index]: 5.30999e-06 [merge_cast_opt]: 4.07e-06 [slice_recompute_activation]: 5.10001e-06 [micro_interleaved_order_control]: 4.63001e-06 [assign_add_opt]: 4.32998e-06 [ForceFp32Comm]: 3.55e-06 [remove_cast_before_assign_add]: 3.53e-06 [full_micro_interleaved_order_control]: 4.74e-06 [reorder_send_recv_between_fp_bp]: 5.34e-06 [comm_op_add_attrs]: 3.58e-06 [add_comm_op_reuse_tag]: 3.44001e-06 [interleave_split_concat_branches]: 3.9e-06 [interleave_parallel_branches]: 3.60998e-06 [overlap_opt_shard_in_pipeline]: 3.69002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.23999e-06 [control_data_broadcast_order]: 1.838e-05 [grouped_pairwise_exchange_alltoall]: 4.1e-06 [offloading_packed_experts]: 8.05e-06 [overlap_recompute_and_grad_model_parallel]: 7.73001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.68999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.73999e-06 [overlap_recompute_comm]: 5.08002e-06 [overlap_grad_ring_attention]: 7.01999e-06 [overlap_grad_flash_sp]: 2.67e-05 [begin_end_overlap_inline]: 2.82002e-06 [split_matmul_comm_elemetwise]: 4.79002e-06 [split_layernorm_comm]: 4.43999e-06 [handle_group_info]: 3.44001e-06 [symbol_engine_optimizer]: 0.00011241, [1] [Cycle 1]: 0.00010478, [6] [build]: 4.18999e-06 [elim_shapecalc]: 1.477e-05 [elim_not_effective]: 1.559e-05 [opt_reshape]: 8.35001e-06 [fold_const_symbol]: 1.033e-05 [renormalize]: 2.40019e-07 [detach_backward]: 4.97999e-06 [pipeline_parallel_scheduler]: 2.06003e-06 [auto_monad_reorder]: 2.286e-05 [get_jit_bprop_graph]: 2.11e-06 [rewriter_after_jit_bprop_graph]: 5.91998e-06 [opt_after_jit_grad]: 0.00067695 [validate]: 4.832e-05 Sums bootstrap : 0.000475s : 4.08% type_inference : 0.005743s : 49.23% event_method : 0.000014s : 0.12% auto_monad : 0.000060s : 0.52% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.03% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000032s : 0.27% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000003s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000027s : 0.23% optimize.rewriter_before_opt_a : 0.000056s : 0.48% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000035s : 0.30% optimize.opt_a.loop_unroll : 0.000021s : 0.18% optimize.opt_a.a_1 : 0.000437s : 3.74% optimize.opt_a.with_stream_mark : 0.000041s : 0.35% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000210s : 1.80% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.11% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.11% optimize.opt_a.merge_send_recv : 0.000015s : 0.13% optimize.opt_a.auto_parallel : 0.000016s : 0.13% optimize.opt_a.parallel : 0.000028s : 0.24% optimize.opt_a.flash_sp : 0.000014s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.15% optimize.opt_a.virtual_dataset : 0.000014s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.10% optimize.opt_a.virtual_output : 0.000013s : 0.11% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000023s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000022s : 0.19% optimize.opt_a.a_after_grad : 0.000021s : 0.18% optimize.opt_a.renormalize : 0.000747s : 6.40% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.25% optimize.opt_a.cse : 0.000051s : 0.44% optimize.opt_a.a_3 : 0.000118s : 1.02% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000049s : 0.42% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000785s : 6.73% optimize.opt_b.b_1 : 0.000217s : 1.86% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000029s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000037s : 0.32% optimize.loop_unroll : 0.000613s : 5.26% optimize.opt_after_cconv.c_1 : 0.000032s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000028s : 0.24% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.17% optimize.tuple_transform.d_1 : 0.000081s : 0.69% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000006s : 0.05% optimize.add_recomputation : 0.000058s : 0.50% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000010s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000008s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000027s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.13% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000677s : 5.80% validate : 0.000048s : 0.41% Time group info: ------[substitution.] 0.000162 20 1.20% : 0.000002s : 2: substitution.elim_not_effective 0.84% : 0.000001s : 2: substitution.fold_const_symbol 3.71% : 0.000006s : 3: substitution.graph_param_transform 65.58% : 0.000106s : 2: substitution.inline 3.46% : 0.000006s : 4: substitution.j_node_and_user_rematch 3.12% : 0.000005s : 4: substitution.remove_not_recompute_node 2.48% : 0.000004s : 2: substitution.replace_old_param 19.61% : 0.000032s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005685 2 91.38% : 0.005195s : 1: type_inference.infer 8.62% : 0.000490s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000104 2 100.00% : 0.000104s : 2: match.inline ------[predicate.] 0.000156 754 0.71% : 0.000001s : 7: predicate.accumulaten_eliminater 1.30% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.60% : 0.000001s : 6: predicate.addn_check_dump 1.03% : 0.000002s : 7: predicate.addn_zero_filter 0.60% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.40% : 0.000004s : 13: predicate.arithmetic_simplify 0.83% : 0.000001s : 7: predicate.cast_eliminate 0.74% : 0.000001s : 6: predicate.check_bprop_eliminate 0.78% : 0.000001s : 6: predicate.compare_switch_simplify 0.18% : 0.000000s : 3: predicate.const_output_eliminate 0.77% : 0.000001s : 6: predicate.depend_value_elim 0.69% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 1.14% : 0.000002s : 7: predicate.dict_get_item_eliminator 0.72% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.61% : 0.000003s : 6: predicate.dumpgradient_eliminate 0.33% : 0.000001s : 3: predicate.elim_not_effective 0.57% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000002s : 10: predicate.environ_add_const_eliminate 0.94% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.22% : 0.000002s : 10: predicate.environ_get_depend_swap 1.70% : 0.000003s : 16: predicate.environ_get_eliminate 0.87% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.88% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.76% : 0.000003s : 9: predicate.float_depend_g_call 0.60% : 0.000001s : 6: predicate.float_environ_get_switch 0.95% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 3: predicate.fold_const_symbol 0.78% : 0.000001s : 6: predicate.get_grad_eliminate 0.33% : 0.000001s : 3: predicate.graph_param_transform 0.65% : 0.000001s : 6: predicate.incorporate_call 0.60% : 0.000001s : 6: predicate.incorporate_call_switch 6.11% : 0.000010s : 34: predicate.inline 0.99% : 0.000002s : 6: predicate.inline_without_move 0.37% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.03% : 0.000002s : 6: predicate.less_batch_normalization 1.87% : 0.000003s : 13: predicate.list_to_tuple_eliminator_ 2.09% : 0.000003s : 20: predicate.load_eliminater 1.93% : 0.000003s : 3: predicate.loop_unroll_after_grad 1.65% : 0.000003s : 14: predicate.loop_unroll_before_grad 1.71% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.67% : 0.000001s : 6: predicate.merge_addn 0.76% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.74% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.62% : 0.000001s : 7: predicate.minmaximum_grad 1.83% : 0.000003s : 3: predicate.mutable_eliminate 0.44% : 0.000001s : 3: predicate.opt_reshape 0.58% : 0.000001s : 3: predicate.parallel_virtual_node 1.13% : 0.000002s : 9: predicate.partial_defer_inline 1.10% : 0.000002s : 10: predicate.partial_eliminate 0.78% : 0.000001s : 7: predicate.print_const_string_wrapper 0.82% : 0.000001s : 6: predicate.reduce_all_const_elim 0.99% : 0.000002s : 7: predicate.reduce_eliminate 1.95% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.66% : 0.000001s : 6: predicate.remove_not_recompute_node 0.94% : 0.000001s : 13: predicate.replace_applicator 0.60% : 0.000001s : 6: predicate.replace_old_param 0.42% : 0.000001s : 3: predicate.reset_defer_inline 0.96% : 0.000001s : 7: predicate.reshape_eliminate 0.77% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.56% : 0.000001s : 3: predicate.row_tensor_eliminate 0.99% : 0.000002s : 6: predicate.same_eliminate 0.68% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.22% : 0.000002s : 6: predicate.shard_identity_eliminate 0.92% : 0.000001s : 6: predicate.special_op_eliminate 0.79% : 0.000001s : 6: predicate.specialize_transform 1.24% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 1.28% : 0.000002s : 6: predicate.stack_unstack_eliminate 0.53% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.95% : 0.000001s : 9: predicate.switch_defer_inline 1.68% : 0.000003s : 15: predicate.switch_layer_defer_inline 4.73% : 0.000007s : 32: predicate.switch_simplify 0.78% : 0.000001s : 7: predicate.tile_eliminate 0.76% : 0.000001s : 7: predicate.transpose_eliminate 1.58% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.85% : 0.000003s : 13: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.19% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.33% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.43% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.44% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.80% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.46% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.83% : 0.000001s : 3: predicate.value_based_eliminate 0.92% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.78% : 0.000001s : 6: predicate.virtual_output_eliminate 0.33% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000265 5 8.01% : 0.000021s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.99% : 0.000244s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028894 192 0.02% : 0.000007s : 1: ForceFp32Comm 13.34% : 0.003854s : 1: add_attr 13.27% : 0.003835s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000063s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.24% : 0.000069s : 1: auto_monad 0.11% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.81% : 0.000523s : 1: bootstrap 0.14% : 0.000041s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000022s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000037s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.09% : 0.000025s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.09% : 0.000026s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000007s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 2.15% : 0.000621s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.74% : 0.000793s : 1: mutable_eliminate 0.04% : 0.000011s : 1: offloading_packed_experts 0.07% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000021s : 1: opt.transform.mutable_eliminate 2.89% : 0.000835s : 78: opt.transform.opt_a 0.10% : 0.000030s : 1: opt.transform.opt_after_cconv 0.11% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.49% : 0.000142s : 28: opt.transform.opt_b 0.30% : 0.000086s : 2: opt.transform.opt_trans_graph 0.15% : 0.000044s : 4: opt.transform.symbol_engine_opt 10.03% : 0.002899s : 1: opt_a 0.53% : 0.000153s : 1: opt_after_cconv 2.39% : 0.000690s : 1: opt_after_jit_grad 1.20% : 0.000348s : 1: opt_b 21.49% : 0.006210s : 1: optimize 0.09% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000030s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000009s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000008s : 1: pipeline_split 0.14% : 0.000040s : 1: pre_auto_parallel 0.10% : 0.000030s : 1: py_interpret_to_execute 0.08% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000024s : 1: remove_dup_value 1.48% : 0.000428s : 1: renormalize.infer 1.07% : 0.000310s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000053s : 1: rewriter_after_opt_a 0.21% : 0.000060s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000008s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000115s : 1: symbol_engine_optimizer 0.48% : 0.000140s : 1: tuple_transform 20.02% : 0.005785s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:52.647.092 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0152606, [21] [bootstrap]: 0.00045088 [type_inference]: 0.00537997 [event_method]: 1.334e-05 [auto_monad]: 5.846e-05 [graph_reusing]: 5.35999e-06 [inline]: 2.63e-06 [add_attr]: 0.00351355, [1] [add_attr_with_inline]: 0.0035022, [1] [Cycle 1]: 6.655e-05, [2] [tag_attr]: 1.596e-05 [meta_addattr_fg_expand]: 3.9e-06 [parallel-infer-symbol]: 3.68999e-06 [pre_auto_parallel]: 3.246e-05 [insert-virtual-dataset]: 2.40002e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 2.04e-06 [pipeline_split]: 1.51002e-06 [optimize]: 0.00495027, [53] [py_interpret_to_execute]: 2.212e-05 [rewriter_before_opt_a]: 4.897e-05 [opt_a]: 0.0024016, [2] [Cycle 1]: 0.00170298, [45] [expand_dump_flag]: 2.39999e-06 [switch_simplify]: 3.997e-05 [loop_unroll]: 1.428e-05 [a_1]: 0.00030686 [with_stream_mark]: 1.909e-05 [recompute_prepare]: 8.75001e-06 [updatestate_depend_eliminate]: 4.10998e-06 [updatestate_assign_eliminate]: 3.30998e-06 [updatestate_loads_eliminate]: 2.91999e-06 [parameter_eliminate]: 2.31e-06 [a_2]: 7.977e-05 [accelerated_algorithm]: 7.5e-06 [shard]: 2.84001e-06 [meta_shard_fg_expand]: 2.06e-06 [shard_inline]: 6.58e-06 [merge_send_recv]: 8.50999e-06 [auto_parallel]: 7.53e-06 [parallel]: 2.23e-05 [flash_sp]: 9.13002e-06 [merge_comm]: 5.09e-06 [allreduce_fusion]: 3.88001e-06 [matmul_add_comm_reduction]: 1.092e-05 [allreduce_slice_to_reducescatter]: 6.20028e-07 [virtual_shard_identity]: 1.041e-05 [virtual_dataset]: 6.76e-06 [get_grad_eliminate_]: 5.97999e-06 [virtual_output]: 6.20002e-06 [merge_forward]: 4.89e-06 [cell_reuse_recompute_pass]: 1.40999e-06 [offload_activation]: 1.033e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.49e-05 [merge_recompute_call_nodes]: 1.40999e-06 [before_grad]: 1.119e-05 [set_forward_comm_id_for_comm_node_pass]: 3.68999e-06 [meta_fg_expand]: 2.60002e-06 [flash_sp_send_recv_attached]: 2.66e-06 [receive_attached]: 2.44001e-06 [after_resolve]: 1.128e-05 [a_after_grad]: 9.66998e-06 [renormalize]: 0.00066208 [add_forward_monad_depend]: 5.76e-06 [auto_monad_grad]: 3.02002e-06 [auto_monad_eliminator]: 1.727e-05 [cse]: 3.125e-05 [a_3]: 5.076e-05 [Cycle 2]: 0.0006867, [45] [expand_dump_flag]: 1.42999e-06 [switch_simplify]: 7.59002e-06 [loop_unroll]: 5.81998e-06 [a_1]: 0.00011045 [with_stream_mark]: 1.336e-05 [recompute_prepare]: 7.30998e-06 [updatestate_depend_eliminate]: 3.68999e-06 [updatestate_assign_eliminate]: 2.41998e-06 [updatestate_loads_eliminate]: 3.26999e-06 [parameter_eliminate]: 1.25999e-06 [a_2]: 7.226e-05 [accelerated_algorithm]: 6.29001e-06 [shard]: 1.86e-06 [meta_shard_fg_expand]: 1.53002e-06 [shard_inline]: 6.02999e-06 [merge_send_recv]: 6.21e-06 [auto_parallel]: 6.89999e-06 [parallel]: 6.53003e-06 [flash_sp]: 3.58999e-06 [merge_comm]: 3.13e-06 [allreduce_fusion]: 3.55e-06 [matmul_add_comm_reduction]: 1.171e-05 [allreduce_slice_to_reducescatter]: 5.50004e-07 [virtual_shard_identity]: 7.03e-06 [virtual_dataset]: 6.07001e-06 [get_grad_eliminate_]: 5.88002e-06 [virtual_output]: 5.52001e-06 [merge_forward]: 3.76001e-06 [cell_reuse_recompute_pass]: 2.30002e-06 [offload_activation]: 9.36998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.581e-05 [merge_recompute_call_nodes]: 1.35999e-06 [before_grad]: 1.037e-05 [set_forward_comm_id_for_comm_node_pass]: 4.61002e-06 [meta_fg_expand]: 2.69001e-06 [flash_sp_send_recv_attached]: 1.08001e-06 [receive_attached]: 1.59e-06 [after_resolve]: 1.085e-05 [a_after_grad]: 9.07001e-06 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 3.71999e-06 [auto_monad_grad]: 1.57001e-06 [auto_monad_eliminator]: 1.206e-05 [cse]: 2.116e-05 [a_3]: 3.874e-05 [py_interpret_to_execute_after_opt_a]: 1.307e-05 [slice_cell_reuse_recomputed_activation]: 2.73e-06 [rewriter_after_opt_a]: 4.318e-05 [convert_after_rewriter]: 7.28999e-06 [order_py_execute_after_rewriter]: 5.51998e-06 [mutable_eliminate]: 0.00072945 [opt_b]: 0.00027611, [1] [Cycle 1]: 0.00026714, [7] [b_1]: 0.00017159 [b_2]: 8.89e-06 [updatestate_depend_eliminate]: 7.36999e-06 [updatestate_assign_eliminate]: 2.83e-06 [updatestate_loads_eliminate]: 3.04001e-06 [renormalize]: 1.30999e-06 [cse]: 3.005e-05 [optimize_parallel_all_gather_comm]: 1.941e-05 [overlap_param_gather]: 2.26e-06 [cconv]: 3.343e-05 [loop_unroll]: 0.00060585 [opt_after_cconv]: 0.00011846, [1] [Cycle 1]: 0.00011174, [7] [c_1]: 3.09e-05 [parameter_eliminate]: 4.60001e-06 [updatestate_depend_eliminate]: 8.07e-06 [updatestate_assign_eliminate]: 3.15002e-06 [updatestate_loads_eliminate]: 2.53e-06 [cse]: 2.49e-05 [renormalize]: 6.89994e-07 [remove_dup_value]: 1.579e-05 [tuple_transform]: 7.857e-05, [1] [Cycle 1]: 7.339e-05, [4] [d_1]: 4.451e-05 [none_parameter_eliminate]: 1.84e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 7.68001e-06 [partial_unused_args_eliminate]: 2.04e-06 [add_recomputation]: 5.492e-05 [cse_after_recomputation]: 2.404e-05, [1] [Cycle 1]: 1.92e-05, [1] [cse]: 1.288e-05 [environ_conv]: 6.37001e-06 [swap_dp_allreduce_reducescatter]: 4.99e-06 [bias_add_comm_swap]: 3.05002e-06 [label_micro_interleaved_index]: 5.67999e-06 [label_fine_grained_interleaved_index]: 3.01001e-06 [merge_cast_opt]: 1.77999e-06 [slice_recompute_activation]: 2.19999e-06 [micro_interleaved_order_control]: 2.69999e-06 [assign_add_opt]: 1.32999e-06 [ForceFp32Comm]: 1.10001e-06 [remove_cast_before_assign_add]: 1.07998e-06 [full_micro_interleaved_order_control]: 2.26e-06 [reorder_send_recv_between_fp_bp]: 2.76999e-06 [comm_op_add_attrs]: 1.24998e-06 [add_comm_op_reuse_tag]: 1.04e-06 [interleave_split_concat_branches]: 1.13001e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.37e-06 [overlap_opt_shard_grad_in_pipeline]: 2.31e-06 [control_data_broadcast_order]: 1.525e-05 [grouped_pairwise_exchange_alltoall]: 1.81998e-06 [offloading_packed_experts]: 4.92999e-06 [overlap_recompute_and_grad_model_parallel]: 5.19e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.36998e-06 [overlap_recompute_comm]: 1.97999e-06 [overlap_grad_ring_attention]: 4.46002e-06 [overlap_grad_flash_sp]: 2.103e-05 [begin_end_overlap_inline]: 5.8001e-07 [split_matmul_comm_elemetwise]: 2.24001e-06 [split_layernorm_comm]: 1.91e-06 [handle_group_info]: 1.40999e-06 [symbol_engine_optimizer]: 9.064e-05, [1] [Cycle 1]: 8.509e-05, [6] [build]: 3.7e-06 [elim_shapecalc]: 1.373e-05 [elim_not_effective]: 1.448e-05 [opt_reshape]: 7.60998e-06 [fold_const_symbol]: 1.17e-05 [renormalize]: 4.10015e-07 [detach_backward]: 2.84999e-06 [pipeline_parallel_scheduler]: 1.71e-06 [auto_monad_reorder]: 1.914e-05 [get_jit_bprop_graph]: 2.73e-06 [rewriter_after_jit_bprop_graph]: 6.19999e-06 [opt_after_jit_grad]: 0.00060104 [validate]: 4.684e-05 Sums bootstrap : 0.000451s : 4.22% type_inference : 0.005380s : 50.30% event_method : 0.000013s : 0.12% auto_monad : 0.000058s : 0.55% graph_reusing : 0.000005s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000032s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000022s : 0.21% optimize.rewriter_before_opt_a : 0.000049s : 0.46% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000048s : 0.44% optimize.opt_a.loop_unroll : 0.000020s : 0.19% optimize.opt_a.a_1 : 0.000417s : 3.90% optimize.opt_a.with_stream_mark : 0.000032s : 0.30% optimize.opt_a.recompute_prepare : 0.000016s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000152s : 1.42% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.13% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.12% optimize.opt_a.merge_send_recv : 0.000015s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.13% optimize.opt_a.parallel : 0.000029s : 0.27% optimize.opt_a.flash_sp : 0.000013s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.21% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.16% optimize.opt_a.virtual_dataset : 0.000013s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000022s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000022s : 0.21% optimize.opt_a.a_after_grad : 0.000019s : 0.18% optimize.opt_a.renormalize : 0.000662s : 6.19% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.09% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.27% optimize.opt_a.cse : 0.000052s : 0.49% optimize.opt_a.a_3 : 0.000089s : 0.84% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.03% optimize.rewriter_after_opt_a : 0.000043s : 0.40% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000729s : 6.82% optimize.opt_b.b_1 : 0.000172s : 1.60% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000030s : 0.28% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000033s : 0.31% optimize.loop_unroll : 0.000606s : 5.66% optimize.opt_after_cconv.c_1 : 0.000031s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000025s : 0.23% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000016s : 0.15% optimize.tuple_transform.d_1 : 0.000045s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000055s : 0.51% optimize.cse_after_recomputation.cse : 0.000013s : 0.12% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000006s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.02% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.03% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.13% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000019s : 0.18% get_jit_bprop_graph : 0.000003s : 0.03% rewriter_after_jit_bprop_graph : 0.000006s : 0.06% opt_after_jit_grad : 0.000601s : 5.62% validate : 0.000047s : 0.44% Time group info: ------[substitution.] 0.000153 20 1.28% : 0.000002s : 2: substitution.elim_not_effective 0.99% : 0.000002s : 2: substitution.fold_const_symbol 3.98% : 0.000006s : 3: substitution.graph_param_transform 63.58% : 0.000097s : 2: substitution.inline 2.72% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.48% : 0.000005s : 4: substitution.remove_not_recompute_node 3.10% : 0.000005s : 2: substitution.replace_old_param 20.87% : 0.000032s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005325 2 91.11% : 0.004852s : 1: type_inference.infer 8.89% : 0.000474s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000096 2 100.00% : 0.000096s : 2: match.inline ------[predicate.] 0.000146 754 0.80% : 0.000001s : 7: predicate.accumulaten_eliminater 1.41% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.57% : 0.000001s : 6: predicate.addn_check_dump 0.77% : 0.000001s : 7: predicate.addn_zero_filter 0.61% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.32% : 0.000003s : 13: predicate.arithmetic_simplify 0.75% : 0.000001s : 7: predicate.cast_eliminate 0.71% : 0.000001s : 6: predicate.check_bprop_eliminate 0.71% : 0.000001s : 6: predicate.compare_switch_simplify 0.24% : 0.000000s : 3: predicate.const_output_eliminate 0.73% : 0.000001s : 6: predicate.depend_value_elim 0.72% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 1.10% : 0.000002s : 7: predicate.dict_get_item_eliminator 0.69% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.11% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.46% : 0.000001s : 3: predicate.elim_not_effective 0.49% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000002s : 10: predicate.environ_add_const_eliminate 0.95% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.10% : 0.000002s : 10: predicate.environ_get_depend_swap 1.93% : 0.000003s : 16: predicate.environ_get_eliminate 0.92% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.93% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.72% : 0.000003s : 9: predicate.float_depend_g_call 0.59% : 0.000001s : 6: predicate.float_environ_get_switch 0.90% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 3: predicate.fold_const_symbol 0.79% : 0.000001s : 6: predicate.get_grad_eliminate 0.34% : 0.000000s : 3: predicate.graph_param_transform 0.74% : 0.000001s : 6: predicate.incorporate_call 0.62% : 0.000001s : 6: predicate.incorporate_call_switch 6.76% : 0.000010s : 34: predicate.inline 1.19% : 0.000002s : 6: predicate.inline_without_move 0.39% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.31% : 0.000002s : 6: predicate.less_batch_normalization 1.67% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.92% : 0.000003s : 20: predicate.load_eliminater 1.75% : 0.000003s : 3: predicate.loop_unroll_after_grad 1.60% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.78% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.62% : 0.000001s : 6: predicate.merge_addn 0.64% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.71% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.64% : 0.000001s : 7: predicate.minmaximum_grad 2.33% : 0.000003s : 3: predicate.mutable_eliminate 0.58% : 0.000001s : 3: predicate.opt_reshape 0.48% : 0.000001s : 3: predicate.parallel_virtual_node 1.21% : 0.000002s : 9: predicate.partial_defer_inline 1.21% : 0.000002s : 10: predicate.partial_eliminate 0.77% : 0.000001s : 7: predicate.print_const_string_wrapper 0.63% : 0.000001s : 6: predicate.reduce_all_const_elim 0.97% : 0.000001s : 7: predicate.reduce_eliminate 2.35% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.72% : 0.000001s : 6: predicate.remove_not_recompute_node 1.05% : 0.000002s : 13: predicate.replace_applicator 0.64% : 0.000001s : 6: predicate.replace_old_param 0.45% : 0.000001s : 3: predicate.reset_defer_inline 0.71% : 0.000001s : 7: predicate.reshape_eliminate 0.73% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.75% : 0.000001s : 3: predicate.row_tensor_eliminate 1.13% : 0.000002s : 6: predicate.same_eliminate 0.46% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.94% : 0.000001s : 6: predicate.shard_identity_eliminate 0.94% : 0.000001s : 6: predicate.special_op_eliminate 0.88% : 0.000001s : 6: predicate.specialize_transform 1.08% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.98% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.93% : 0.000001s : 9: predicate.switch_defer_inline 1.62% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.51% : 0.000007s : 32: predicate.switch_simplify 0.69% : 0.000001s : 7: predicate.tile_eliminate 0.74% : 0.000001s : 7: predicate.transpose_eliminate 1.51% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.73% : 0.000003s : 13: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.01% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.38% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.45% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.47% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.11% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.75% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.80% : 0.000001s : 3: predicate.value_based_eliminate 0.84% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.73% : 0.000001s : 6: predicate.virtual_output_eliminate 0.31% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000244 5 9.06% : 0.000022s : 1: func_graph_cloner_run.FuncGraphClonerGraph 90.94% : 0.000222s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025389 192 0.02% : 0.000004s : 1: ForceFp32Comm 13.86% : 0.003519s : 1: add_attr 13.81% : 0.003507s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000059s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.25% : 0.000063s : 1: auto_monad 0.09% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.89% : 0.000480s : 1: bootstrap 0.15% : 0.000038s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000027s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.03% : 0.000006s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.08% : 0.000020s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000006s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 2.43% : 0.000617s : 1: loop_unroll 0.02% : 0.000005s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 2.92% : 0.000742s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.08% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.09% : 0.000022s : 1: opt.transform.mutable_eliminate 3.21% : 0.000815s : 78: opt.transform.opt_a 0.11% : 0.000029s : 1: opt.transform.opt_after_cconv 0.11% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.57% : 0.000144s : 28: opt.transform.opt_b 0.20% : 0.000050s : 2: opt.transform.opt_trans_graph 0.17% : 0.000043s : 4: opt.transform.symbol_engine_opt 9.47% : 0.002405s : 1: opt_a 0.48% : 0.000122s : 1: opt_after_cconv 2.42% : 0.000614s : 1: opt_after_jit_grad 1.10% : 0.000280s : 1: opt_b 19.52% : 0.004957s : 1: optimize 0.09% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000036s : 1: pre_auto_parallel 0.10% : 0.000026s : 1: py_interpret_to_execute 0.07% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000020s : 1: remove_dup_value 1.46% : 0.000370s : 1: renormalize.infer 1.11% : 0.000283s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000048s : 1: rewriter_after_opt_a 0.21% : 0.000052s : 1: rewriter_before_opt_a 0.02% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.37% : 0.000093s : 1: symbol_engine_optimizer 0.33% : 0.000083s : 1: tuple_transform 21.27% : 0.005400s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:52.851.063 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:52.851.375 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0154894, [21] [bootstrap]: 0.00047605 [type_inference]: 0.00547106 [event_method]: 1.262e-05 [auto_monad]: 5.707e-05 [graph_reusing]: 6.31e-06 [inline]: 2.24001e-06 [add_attr]: 0.00327717, [1] [add_attr_with_inline]: 0.00326824, [1] [Cycle 1]: 7.195e-05, [2] [tag_attr]: 1.456e-05 [meta_addattr_fg_expand]: 3.46001e-06 [parallel-infer-symbol]: 3.75e-06 [pre_auto_parallel]: 2.705e-05 [insert-virtual-dataset]: 2.51998e-06 [parallel-infer-symbol-second]: 7.10017e-07 [dataset_repeat_opt]: 1.84e-06 [pipeline_split]: 1.97999e-06 [optimize]: 0.00498067, [53] [py_interpret_to_execute]: 2.023e-05 [rewriter_before_opt_a]: 5.248e-05 [opt_a]: 0.00261534, [2] [Cycle 1]: 0.00175449, [45] [expand_dump_flag]: 2.91999e-06 [switch_simplify]: 2.372e-05 [loop_unroll]: 1.37e-05 [a_1]: 0.0003005 [with_stream_mark]: 1.804e-05 [recompute_prepare]: 8.70001e-06 [updatestate_depend_eliminate]: 4.65001e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 2.81999e-06 [parameter_eliminate]: 1.94e-06 [a_2]: 0.00010917 [accelerated_algorithm]: 7.28e-06 [shard]: 2.31998e-06 [meta_shard_fg_expand]: 1.75001e-06 [shard_inline]: 6.01e-06 [merge_send_recv]: 9.28002e-06 [auto_parallel]: 6.51e-06 [parallel]: 1.776e-05 [flash_sp]: 8.17e-06 [merge_comm]: 4.79e-06 [allreduce_fusion]: 3.54002e-06 [matmul_add_comm_reduction]: 9.69e-06 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 8.35999e-06 [virtual_dataset]: 6.33e-06 [get_grad_eliminate_]: 5.81e-06 [virtual_output]: 6.41e-06 [merge_forward]: 4.70999e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 9.86998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.608e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.148e-05 [set_forward_comm_id_for_comm_node_pass]: 3.86999e-06 [meta_fg_expand]: 2.66e-06 [flash_sp_send_recv_attached]: 2.45002e-06 [receive_attached]: 2.49999e-06 [after_resolve]: 1.073e-05 [a_after_grad]: 1.034e-05 [renormalize]: 0.00056631 [add_forward_monad_depend]: 5.46e-06 [auto_monad_grad]: 2.72001e-06 [auto_monad_eliminator]: 1.652e-05 [cse]: 3.085e-05 [a_3]: 6.366e-05 [Cycle 2]: 0.00084718, [45] [expand_dump_flag]: 1.07e-06 [switch_simplify]: 7.75e-06 [loop_unroll]: 6.14999e-06 [a_1]: 0.00010801 [with_stream_mark]: 1.284e-05 [recompute_prepare]: 6.83998e-06 [updatestate_depend_eliminate]: 3.08e-06 [updatestate_assign_eliminate]: 2.74999e-06 [updatestate_loads_eliminate]: 3.14999e-06 [parameter_eliminate]: 1.10999e-06 [a_2]: 9.781e-05 [accelerated_algorithm]: 6.39999e-06 [shard]: 1.98002e-06 [meta_shard_fg_expand]: 1.49998e-06 [shard_inline]: 6.29001e-06 [merge_send_recv]: 6.04001e-06 [auto_parallel]: 6.57002e-06 [parallel]: 6.24001e-06 [flash_sp]: 3.87002e-06 [merge_comm]: 3.77998e-06 [allreduce_fusion]: 3.17002e-06 [matmul_add_comm_reduction]: 7.59002e-06 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 6.89999e-06 [virtual_dataset]: 6.26e-06 [get_grad_eliminate_]: 5.64998e-06 [virtual_output]: 5.48002e-06 [merge_forward]: 3.51999e-06 [cell_reuse_recompute_pass]: 1.45001e-06 [offload_activation]: 7.98999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.581e-05 [merge_recompute_call_nodes]: 1.10001e-06 [before_grad]: 1.105e-05 [set_forward_comm_id_for_comm_node_pass]: 4.50999e-06 [meta_fg_expand]: 2.68998e-06 [flash_sp_send_recv_attached]: 1.47001e-06 [receive_attached]: 1.17999e-06 [after_resolve]: 2.279e-05 [a_after_grad]: 9.07001e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 2.39999e-06 [auto_monad_grad]: 9.5999e-07 [auto_monad_eliminator]: 1.008e-05 [cse]: 1.784e-05 [a_3]: 4.963e-05 [py_interpret_to_execute_after_opt_a]: 1.451e-05 [slice_cell_reuse_recomputed_activation]: 4.87998e-06 [rewriter_after_opt_a]: 4.211e-05 [convert_after_rewriter]: 1.044e-05 [order_py_execute_after_rewriter]: 9.03002e-06 [mutable_eliminate]: 0.00054003 [opt_b]: 0.00031914, [1] [Cycle 1]: 0.00030993, [7] [b_1]: 0.00020463 [b_2]: 8.69e-06 [updatestate_depend_eliminate]: 7.21999e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 2.74999e-06 [renormalize]: 4.50003e-07 [cse]: 2.297e-05 [optimize_parallel_all_gather_comm]: 2.09e-05 [overlap_param_gather]: 4.73001e-06 [cconv]: 3.081e-05 [loop_unroll]: 0.00046897 [opt_after_cconv]: 0.00013089, [1] [Cycle 1]: 0.00012216, [7] [c_1]: 2.842e-05 [parameter_eliminate]: 4.12e-06 [updatestate_depend_eliminate]: 6.69999e-06 [updatestate_assign_eliminate]: 2.58e-06 [updatestate_loads_eliminate]: 2.82002e-06 [cse]: 2.151e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.72e-05 [tuple_transform]: 8.904e-05, [1] [Cycle 1]: 8.16e-05, [4] [d_1]: 4.194e-05 [none_parameter_eliminate]: 1.87999e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 6.83003e-06 [partial_unused_args_eliminate]: 4.58999e-06 [add_recomputation]: 5.134e-05 [cse_after_recomputation]: 2.817e-05, [1] [Cycle 1]: 2.148e-05, [1] [cse]: 1.285e-05 [environ_conv]: 9.49999e-06 [swap_dp_allreduce_reducescatter]: 8.60001e-06 [bias_add_comm_swap]: 4.89998e-06 [label_micro_interleaved_index]: 8.46002e-06 [label_fine_grained_interleaved_index]: 5.09e-06 [merge_cast_opt]: 3.66001e-06 [slice_recompute_activation]: 4.97e-06 [micro_interleaved_order_control]: 4.94998e-06 [assign_add_opt]: 3.48999e-06 [ForceFp32Comm]: 3.33e-06 [remove_cast_before_assign_add]: 3.64002e-06 [full_micro_interleaved_order_control]: 4.55001e-06 [reorder_send_recv_between_fp_bp]: 5.85002e-06 [comm_op_add_attrs]: 3.93001e-06 [add_comm_op_reuse_tag]: 3.51999e-06 [interleave_split_concat_branches]: 3.48e-06 [interleave_parallel_branches]: 3.65e-06 [overlap_opt_shard_in_pipeline]: 3.55e-06 [overlap_opt_shard_grad_in_pipeline]: 4.93001e-06 [control_data_broadcast_order]: 1.608e-05 [grouped_pairwise_exchange_alltoall]: 3.95e-06 [offloading_packed_experts]: 6.61999e-06 [overlap_recompute_and_grad_model_parallel]: 7.45998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.61001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.76999e-06 [overlap_recompute_comm]: 4.81002e-06 [overlap_grad_ring_attention]: 6.87002e-06 [overlap_grad_flash_sp]: 2.332e-05 [begin_end_overlap_inline]: 2.99001e-06 [split_matmul_comm_elemetwise]: 4.53999e-06 [split_layernorm_comm]: 4.22998e-06 [handle_group_info]: 3.61001e-06 [symbol_engine_optimizer]: 0.00010131, [1] [Cycle 1]: 9.446e-05, [6] [build]: 3.43e-06 [elim_shapecalc]: 1.181e-05 [elim_not_effective]: 1.354e-05 [opt_reshape]: 6.93e-06 [fold_const_symbol]: 9.92999e-06 [renormalize]: 2.9002e-07 [detach_backward]: 3.95e-06 [pipeline_parallel_scheduler]: 1.74e-06 [auto_monad_reorder]: 2.079e-05 [get_jit_bprop_graph]: 1.67001e-06 [rewriter_after_jit_bprop_graph]: 4.55999e-06 [opt_after_jit_grad]: 0.00051061 [validate]: 3.922e-05 Sums bootstrap : 0.000476s : 4.57% type_inference : 0.005471s : 52.52% event_method : 0.000013s : 0.12% auto_monad : 0.000057s : 0.55% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000003s : 0.03% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000027s : 0.26% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000020s : 0.19% optimize.rewriter_before_opt_a : 0.000052s : 0.50% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000031s : 0.30% optimize.opt_a.loop_unroll : 0.000020s : 0.19% optimize.opt_a.a_1 : 0.000409s : 3.92% optimize.opt_a.with_stream_mark : 0.000031s : 0.30% optimize.opt_a.recompute_prepare : 0.000016s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000207s : 1.99% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.13% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.12% optimize.opt_a.merge_send_recv : 0.000015s : 0.15% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.23% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.15% optimize.opt_a.virtual_dataset : 0.000013s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000023s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000034s : 0.32% optimize.opt_a.a_after_grad : 0.000019s : 0.19% optimize.opt_a.renormalize : 0.000566s : 5.44% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.26% optimize.opt_a.cse : 0.000049s : 0.47% optimize.opt_a.a_3 : 0.000113s : 1.09% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000042s : 0.40% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000540s : 5.18% optimize.opt_b.b_1 : 0.000205s : 1.96% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000031s : 0.30% optimize.loop_unroll : 0.000469s : 4.50% optimize.opt_after_cconv.c_1 : 0.000028s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.17% optimize.tuple_transform.d_1 : 0.000042s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000051s : 0.49% optimize.cse_after_recomputation.cse : 0.000013s : 0.12% optimize.environ_conv : 0.000009s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.08% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000003s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.06% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.05% optimize.control_data_broadcast_order : 0.000016s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000023s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000021s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000511s : 4.90% validate : 0.000039s : 0.38% Time group info: ------[substitution.] 0.000140 20 1.47% : 0.000002s : 2: substitution.elim_not_effective 0.95% : 0.000001s : 2: substitution.fold_const_symbol 3.94% : 0.000006s : 3: substitution.graph_param_transform 65.74% : 0.000092s : 2: substitution.inline 3.37% : 0.000005s : 4: substitution.j_node_and_user_rematch 3.32% : 0.000005s : 4: substitution.remove_not_recompute_node 2.60% : 0.000004s : 2: substitution.replace_old_param 18.62% : 0.000026s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005426 2 91.52% : 0.004966s : 1: type_inference.infer 8.48% : 0.000460s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000091 2 100.00% : 0.000091s : 2: match.inline ------[predicate.] 0.000141 754 0.79% : 0.000001s : 7: predicate.accumulaten_eliminater 1.31% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.80% : 0.000001s : 6: predicate.addn_check_dump 0.94% : 0.000001s : 7: predicate.addn_zero_filter 0.65% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.24% : 0.000003s : 13: predicate.arithmetic_simplify 0.81% : 0.000001s : 7: predicate.cast_eliminate 0.77% : 0.000001s : 6: predicate.check_bprop_eliminate 0.65% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.81% : 0.000001s : 6: predicate.depend_value_elim 1.05% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.87% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.77% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.31% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 3: predicate.elim_not_effective 0.60% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000002s : 10: predicate.environ_add_const_eliminate 0.95% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.00% : 0.000001s : 10: predicate.environ_get_depend_swap 1.84% : 0.000003s : 16: predicate.environ_get_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.98% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.85% : 0.000003s : 9: predicate.float_depend_g_call 0.67% : 0.000001s : 6: predicate.float_environ_get_switch 0.97% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.74% : 0.000001s : 6: predicate.get_grad_eliminate 0.26% : 0.000000s : 3: predicate.graph_param_transform 0.76% : 0.000001s : 6: predicate.incorporate_call 0.67% : 0.000001s : 6: predicate.incorporate_call_switch 6.45% : 0.000009s : 34: predicate.inline 1.17% : 0.000002s : 6: predicate.inline_without_move 0.39% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.07% : 0.000001s : 6: predicate.less_batch_normalization 1.68% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.11% : 0.000003s : 20: predicate.load_eliminater 1.32% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.65% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.90% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.70% : 0.000001s : 6: predicate.merge_addn 1.02% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.68% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 7: predicate.minmaximum_grad 1.84% : 0.000003s : 3: predicate.mutable_eliminate 0.46% : 0.000001s : 3: predicate.opt_reshape 0.48% : 0.000001s : 3: predicate.parallel_virtual_node 1.36% : 0.000002s : 9: predicate.partial_defer_inline 1.20% : 0.000002s : 10: predicate.partial_eliminate 0.84% : 0.000001s : 7: predicate.print_const_string_wrapper 0.68% : 0.000001s : 6: predicate.reduce_all_const_elim 1.11% : 0.000002s : 7: predicate.reduce_eliminate 1.98% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.62% : 0.000001s : 6: predicate.remove_not_recompute_node 1.10% : 0.000002s : 13: predicate.replace_applicator 0.65% : 0.000001s : 6: predicate.replace_old_param 0.33% : 0.000000s : 3: predicate.reset_defer_inline 0.86% : 0.000001s : 7: predicate.reshape_eliminate 0.80% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.63% : 0.000001s : 3: predicate.row_tensor_eliminate 0.90% : 0.000001s : 6: predicate.same_eliminate 0.50% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.06% : 0.000001s : 6: predicate.shard_identity_eliminate 0.99% : 0.000001s : 6: predicate.special_op_eliminate 0.84% : 0.000001s : 6: predicate.specialize_transform 1.15% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.05% : 0.000001s : 9: predicate.switch_defer_inline 1.73% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.23% : 0.000006s : 32: predicate.switch_simplify 0.70% : 0.000001s : 7: predicate.tile_eliminate 0.84% : 0.000001s : 7: predicate.transpose_eliminate 1.47% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.46% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.30% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.38% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.59% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.47% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.02% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.99% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.68% : 0.000001s : 3: predicate.value_based_eliminate 0.77% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.76% : 0.000001s : 6: predicate.virtual_output_eliminate 0.33% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.58% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000226 5 8.37% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.63% : 0.000207s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025284 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.00% : 0.003287s : 1: add_attr 12.94% : 0.003272s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000055s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000066s : 1: auto_monad 0.11% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 2.06% : 0.000521s : 1: bootstrap 0.14% : 0.000034s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000031s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.08% : 0.000021s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.09% : 0.000023s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000013s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.88% : 0.000475s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.17% : 0.000548s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 3.17% : 0.000801s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.11% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.52% : 0.000132s : 28: opt.transform.opt_b 0.18% : 0.000046s : 2: opt.transform.opt_trans_graph 0.15% : 0.000038s : 4: opt.transform.symbol_engine_opt 10.36% : 0.002619s : 1: opt_a 0.53% : 0.000135s : 1: opt_after_cconv 2.07% : 0.000523s : 1: opt_after_jit_grad 1.28% : 0.000323s : 1: opt_b 21.06% : 0.005325s : 1: optimize 0.10% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000013s : 1: order_py_execute_after_rewriter 0.11% : 0.000027s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000034s : 1: pre_auto_parallel 0.09% : 0.000024s : 1: py_interpret_to_execute 0.07% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000020s : 1: remove_dup_value 1.25% : 0.000315s : 1: renormalize.infer 0.96% : 0.000244s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000046s : 1: rewriter_after_opt_a 0.22% : 0.000056s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.41% : 0.000104s : 1: symbol_engine_optimizer 0.36% : 0.000092s : 1: tuple_transform 21.75% : 0.005500s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:53.508.11 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0154501, [21] [bootstrap]: 0.00047134 [type_inference]: 0.00578744 [event_method]: 1.369e-05 [auto_monad]: 5.974e-05 [graph_reusing]: 5.25999e-06 [inline]: 2.63998e-06 [add_attr]: 0.00359315, [1] [add_attr_with_inline]: 0.00358278, [1] [Cycle 1]: 6.21e-05, [2] [tag_attr]: 1.664e-05 [meta_addattr_fg_expand]: 3.75e-06 [parallel-infer-symbol]: 3.7e-06 [pre_auto_parallel]: 2.896e-05 [insert-virtual-dataset]: 2.74001e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 1.63002e-06 [pipeline_split]: 1.96998e-06 [optimize]: 0.00467473, [53] [py_interpret_to_execute]: 2.045e-05 [rewriter_before_opt_a]: 5.017e-05 [opt_a]: 0.0023895, [2] [Cycle 1]: 0.00173037, [45] [expand_dump_flag]: 2.96999e-06 [switch_simplify]: 2.717e-05 [loop_unroll]: 1.386e-05 [a_1]: 0.00030753 [with_stream_mark]: 1.823e-05 [recompute_prepare]: 8.67e-06 [updatestate_depend_eliminate]: 4.03999e-06 [updatestate_assign_eliminate]: 3.41001e-06 [updatestate_loads_eliminate]: 2.99001e-06 [parameter_eliminate]: 2.01003e-06 [a_2]: 8.154e-05 [accelerated_algorithm]: 7.26999e-06 [shard]: 2.58e-06 [meta_shard_fg_expand]: 2.04e-06 [shard_inline]: 7.03e-06 [merge_send_recv]: 8.43999e-06 [auto_parallel]: 6.92002e-06 [parallel]: 1.775e-05 [flash_sp]: 9.19e-06 [merge_comm]: 4.2e-06 [allreduce_fusion]: 3.73001e-06 [matmul_add_comm_reduction]: 1.049e-05 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 9.32001e-06 [virtual_dataset]: 6.20002e-06 [get_grad_eliminate_]: 6.50002e-06 [virtual_output]: 6.38e-06 [merge_forward]: 4.23999e-06 [cell_reuse_recompute_pass]: 1.39998e-06 [offload_activation]: 1.018e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.311e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.128e-05 [set_forward_comm_id_for_comm_node_pass]: 3.65e-06 [meta_fg_expand]: 2.74999e-06 [flash_sp_send_recv_attached]: 2.71999e-06 [receive_attached]: 2.32999e-06 [after_resolve]: 1.065e-05 [a_after_grad]: 9.81e-06 [renormalize]: 0.00070949 [add_forward_monad_depend]: 5.70001e-06 [auto_monad_grad]: 2.53e-06 [auto_monad_eliminator]: 1.554e-05 [cse]: 3.314e-05 [a_3]: 5.039e-05 [Cycle 2]: 0.00064779, [45] [expand_dump_flag]: 1.04e-06 [switch_simplify]: 7.68999e-06 [loop_unroll]: 5.86e-06 [a_1]: 0.00010699 [with_stream_mark]: 1.551e-05 [recompute_prepare]: 6.16e-06 [updatestate_depend_eliminate]: 3.01001e-06 [updatestate_assign_eliminate]: 2.19999e-06 [updatestate_loads_eliminate]: 3.06999e-06 [parameter_eliminate]: 1.17999e-06 [a_2]: 6.967e-05 [accelerated_algorithm]: 6.00002e-06 [shard]: 1.72999e-06 [meta_shard_fg_expand]: 1.84998e-06 [shard_inline]: 6.14001e-06 [merge_send_recv]: 5.67999e-06 [auto_parallel]: 6.78e-06 [parallel]: 6.04001e-06 [flash_sp]: 3.55e-06 [merge_comm]: 3.67002e-06 [allreduce_fusion]: 3.06001e-06 [matmul_add_comm_reduction]: 7.36001e-06 [allreduce_slice_to_reducescatter]: 9.70002e-07 [virtual_shard_identity]: 6.92002e-06 [virtual_dataset]: 5.71998e-06 [get_grad_eliminate_]: 5.43997e-06 [virtual_output]: 5.74999e-06 [merge_forward]: 3.08998e-06 [cell_reuse_recompute_pass]: 2.63e-06 [offload_activation]: 7.45e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.487e-05 [merge_recompute_call_nodes]: 1.17e-06 [before_grad]: 9.38997e-06 [set_forward_comm_id_for_comm_node_pass]: 4.30999e-06 [meta_fg_expand]: 2.54001e-06 [flash_sp_send_recv_attached]: 1.07e-06 [receive_attached]: 1.35999e-06 [after_resolve]: 9.61998e-06 [a_after_grad]: 8.31002e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.29001e-06 [auto_monad_grad]: 1.36002e-06 [auto_monad_eliminator]: 8.70001e-06 [cse]: 1.552e-05 [a_3]: 3.584e-05 [py_interpret_to_execute_after_opt_a]: 1.092e-05 [slice_cell_reuse_recomputed_activation]: 1.83002e-06 [rewriter_after_opt_a]: 4.034e-05 [convert_after_rewriter]: 6.91999e-06 [order_py_execute_after_rewriter]: 5.51002e-06 [mutable_eliminate]: 0.00062647 [opt_b]: 0.00027827, [1] [Cycle 1]: 0.00027094, [7] [b_1]: 0.0001834 [b_2]: 9.36e-06 [updatestate_depend_eliminate]: 6.76e-06 [updatestate_assign_eliminate]: 2.53e-06 [updatestate_loads_eliminate]: 2.53e-06 [renormalize]: 7.90023e-07 [cse]: 2.499e-05 [optimize_parallel_all_gather_comm]: 1.808e-05 [overlap_param_gather]: 1.84e-06 [cconv]: 2.869e-05 [loop_unroll]: 0.00050233 [opt_after_cconv]: 0.00010931, [1] [Cycle 1]: 0.00010184, [7] [c_1]: 2.843e-05 [parameter_eliminate]: 3.63e-06 [updatestate_depend_eliminate]: 6.48e-06 [updatestate_assign_eliminate]: 2.33002e-06 [updatestate_loads_eliminate]: 2.35002e-06 [cse]: 2.217e-05 [renormalize]: 5.19998e-07 [remove_dup_value]: 1.529e-05 [tuple_transform]: 7.392e-05, [1] [Cycle 1]: 6.942e-05, [4] [d_1]: 4.122e-05 [none_parameter_eliminate]: 1.60999e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.59999e-06 [partial_unused_args_eliminate]: 1.96e-06 [add_recomputation]: 5.186e-05 [cse_after_recomputation]: 2.292e-05, [1] [Cycle 1]: 1.764e-05, [1] [cse]: 1.145e-05 [environ_conv]: 5.70001e-06 [swap_dp_allreduce_reducescatter]: 5.42001e-06 [bias_add_comm_swap]: 3.33e-06 [label_micro_interleaved_index]: 4.42e-06 [label_fine_grained_interleaved_index]: 3.19001e-06 [merge_cast_opt]: 1.35999e-06 [slice_recompute_activation]: 1.97001e-06 [micro_interleaved_order_control]: 2.71e-06 [assign_add_opt]: 1.32e-06 [ForceFp32Comm]: 9.20001e-07 [remove_cast_before_assign_add]: 1.14e-06 [full_micro_interleaved_order_control]: 2.29001e-06 [reorder_send_recv_between_fp_bp]: 2.78e-06 [comm_op_add_attrs]: 1.05999e-06 [add_comm_op_reuse_tag]: 9.99979e-07 [interleave_split_concat_branches]: 1.49e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.59998e-06 [overlap_opt_shard_grad_in_pipeline]: 2.41998e-06 [control_data_broadcast_order]: 1.297e-05 [grouped_pairwise_exchange_alltoall]: 1.59e-06 [offloading_packed_experts]: 3.91001e-06 [overlap_recompute_and_grad_model_parallel]: 4.94e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.24e-06 [overlap_recompute_allgather_and_fa_grad]: 1.43002e-06 [overlap_recompute_comm]: 2.41998e-06 [overlap_grad_ring_attention]: 4.35999e-06 [overlap_grad_flash_sp]: 2.118e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 2.81e-06 [split_layernorm_comm]: 1.72001e-06 [handle_group_info]: 1.17e-06 [symbol_engine_optimizer]: 8.242e-05, [1] [Cycle 1]: 7.793e-05, [6] [build]: 3.14001e-06 [elim_shapecalc]: 1.124e-05 [elim_not_effective]: 1.362e-05 [opt_reshape]: 7.76001e-06 [fold_const_symbol]: 1.05e-05 [renormalize]: 7.99977e-07 [detach_backward]: 2.32999e-06 [pipeline_parallel_scheduler]: 1.60001e-06 [auto_monad_reorder]: 1.789e-05 [get_jit_bprop_graph]: 2.08002e-06 [rewriter_after_jit_bprop_graph]: 4.87998e-06 [opt_after_jit_grad]: 0.00056372 [validate]: 4.403e-05 Sums bootstrap : 0.000471s : 4.35% type_inference : 0.005787s : 53.44% event_method : 0.000014s : 0.13% auto_monad : 0.000060s : 0.55% graph_reusing : 0.000005s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000017s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.03% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000029s : 0.27% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000020s : 0.19% optimize.rewriter_before_opt_a : 0.000050s : 0.46% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000035s : 0.32% optimize.opt_a.loop_unroll : 0.000020s : 0.18% optimize.opt_a.a_1 : 0.000415s : 3.83% optimize.opt_a.with_stream_mark : 0.000034s : 0.31% optimize.opt_a.recompute_prepare : 0.000015s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000151s : 1.40% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.12% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.12% optimize.opt_a.merge_send_recv : 0.000014s : 0.13% optimize.opt_a.auto_parallel : 0.000014s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.22% optimize.opt_a.flash_sp : 0.000013s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.02% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.15% optimize.opt_a.virtual_dataset : 0.000012s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.04% optimize.opt_a.offload_activation : 0.000018s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000020s : 0.19% optimize.opt_a.a_after_grad : 0.000018s : 0.17% optimize.opt_a.renormalize : 0.000710s : 6.55% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.22% optimize.opt_a.cse : 0.000049s : 0.45% optimize.opt_a.a_3 : 0.000086s : 0.80% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.37% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000626s : 5.78% optimize.opt_b.b_1 : 0.000183s : 1.69% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000025s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000029s : 0.26% optimize.loop_unroll : 0.000502s : 4.64% optimize.opt_after_cconv.c_1 : 0.000028s : 0.26% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000022s : 0.20% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.14% optimize.tuple_transform.d_1 : 0.000041s : 0.38% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000052s : 0.48% optimize.cse_after_recomputation.cse : 0.000011s : 0.11% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.03% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.03% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.01% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000564s : 5.21% validate : 0.000044s : 0.41% Time group info: ------[substitution.] 0.000150 20 1.23% : 0.000002s : 2: substitution.elim_not_effective 1.22% : 0.000002s : 2: substitution.fold_const_symbol 3.94% : 0.000006s : 3: substitution.graph_param_transform 66.94% : 0.000100s : 2: substitution.inline 2.29% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.19% : 0.000005s : 4: substitution.remove_not_recompute_node 2.27% : 0.000003s : 2: substitution.replace_old_param 18.92% : 0.000028s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005730 2 91.32% : 0.005233s : 1: type_inference.infer 8.68% : 0.000497s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000099 2 100.00% : 0.000099s : 2: match.inline ------[predicate.] 0.000143 754 1.01% : 0.000001s : 7: predicate.accumulaten_eliminater 1.38% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.62% : 0.000001s : 6: predicate.addn_check_dump 0.83% : 0.000001s : 7: predicate.addn_zero_filter 0.61% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.29% : 0.000003s : 13: predicate.arithmetic_simplify 0.78% : 0.000001s : 7: predicate.cast_eliminate 0.72% : 0.000001s : 6: predicate.check_bprop_eliminate 0.70% : 0.000001s : 6: predicate.compare_switch_simplify 0.24% : 0.000000s : 3: predicate.const_output_eliminate 0.73% : 0.000001s : 6: predicate.depend_value_elim 0.69% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.84% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.75% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.47% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.27% : 0.000000s : 3: predicate.elim_not_effective 0.60% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.43% : 0.000002s : 10: predicate.environ_add_const_eliminate 0.95% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.90% : 0.000001s : 10: predicate.environ_get_depend_swap 1.83% : 0.000003s : 16: predicate.environ_get_eliminate 1.00% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.94% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.83% : 0.000003s : 9: predicate.float_depend_g_call 0.84% : 0.000001s : 6: predicate.float_environ_get_switch 0.89% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 3: predicate.fold_const_symbol 0.73% : 0.000001s : 6: predicate.get_grad_eliminate 0.22% : 0.000000s : 3: predicate.graph_param_transform 0.75% : 0.000001s : 6: predicate.incorporate_call 0.60% : 0.000001s : 6: predicate.incorporate_call_switch 6.68% : 0.000010s : 34: predicate.inline 1.06% : 0.000002s : 6: predicate.inline_without_move 0.43% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.01% : 0.000001s : 6: predicate.less_batch_normalization 1.47% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.89% : 0.000003s : 20: predicate.load_eliminater 1.53% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.55% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.76% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.65% : 0.000001s : 6: predicate.merge_addn 0.80% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.62% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.85% : 0.000001s : 7: predicate.minmaximum_grad 1.46% : 0.000002s : 3: predicate.mutable_eliminate 0.41% : 0.000001s : 3: predicate.opt_reshape 0.65% : 0.000001s : 3: predicate.parallel_virtual_node 1.20% : 0.000002s : 9: predicate.partial_defer_inline 1.15% : 0.000002s : 10: predicate.partial_eliminate 0.74% : 0.000001s : 7: predicate.print_const_string_wrapper 0.71% : 0.000001s : 6: predicate.reduce_all_const_elim 1.12% : 0.000002s : 7: predicate.reduce_eliminate 1.95% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.71% : 0.000001s : 6: predicate.remove_not_recompute_node 1.28% : 0.000002s : 13: predicate.replace_applicator 0.78% : 0.000001s : 6: predicate.replace_old_param 0.34% : 0.000000s : 3: predicate.reset_defer_inline 0.81% : 0.000001s : 7: predicate.reshape_eliminate 0.80% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.83% : 0.000001s : 3: predicate.row_tensor_eliminate 1.03% : 0.000001s : 6: predicate.same_eliminate 0.47% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.56% : 0.000002s : 6: predicate.shard_identity_eliminate 0.90% : 0.000001s : 6: predicate.special_op_eliminate 0.86% : 0.000001s : 6: predicate.specialize_transform 1.13% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.89% : 0.000001s : 9: predicate.switch_defer_inline 1.55% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.36% : 0.000006s : 32: predicate.switch_simplify 0.73% : 0.000001s : 7: predicate.tile_eliminate 0.79% : 0.000001s : 7: predicate.transpose_eliminate 1.71% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.80% : 0.000003s : 13: predicate.tuple_list_get_item_const_eliminator 1.30% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.43% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.59% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.39% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.78% : 0.000003s : 13: predicate.tuple_to_list_eliminator_ 1.91% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.47% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.55% : 0.000001s : 3: predicate.value_based_eliminate 0.78% : 0.000001s : 6: predicate.virtual_dataset_eliminate 1.03% : 0.000001s : 6: predicate.virtual_output_eliminate 0.33% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.56% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000250 5 7.75% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.25% : 0.000231s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025408 192 0.01% : 0.000004s : 1: ForceFp32Comm 14.17% : 0.003599s : 1: add_attr 14.12% : 0.003587s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000056s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000065s : 1: auto_monad 0.09% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.03% : 0.000006s : 1: bias_add_comm_swap 1.98% : 0.000503s : 1: bootstrap 0.13% : 0.000032s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.10% : 0.000026s : 1: cse_after_recomputation 0.02% : 0.000004s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.08% : 0.000021s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000006s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000005s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 2.01% : 0.000511s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.51% : 0.000637s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 3.11% : 0.000791s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.11% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.61% : 0.000156s : 28: opt.transform.opt_b 0.17% : 0.000044s : 2: opt.transform.opt_trans_graph 0.15% : 0.000038s : 4: opt.transform.symbol_engine_opt 9.42% : 0.002393s : 1: opt_a 0.44% : 0.000113s : 1: opt_after_cconv 2.26% : 0.000575s : 1: opt_after_jit_grad 1.11% : 0.000283s : 1: opt_b 18.42% : 0.004681s : 1: optimize 0.08% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000006s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.13% : 0.000033s : 1: pre_auto_parallel 0.10% : 0.000024s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.67% : 0.000423s : 1: renormalize.infer 1.09% : 0.000278s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000046s : 1: rewriter_after_opt_a 0.21% : 0.000055s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000006s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000085s : 1: symbol_engine_optimizer 0.30% : 0.000077s : 1: tuple_transform 22.86% : 0.005807s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:53.252.614 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:53.252.912 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0163848, [21] [bootstrap]: 0.00045801 [type_inference]: 0.00570609 [event_method]: 1.295e-05 [auto_monad]: 5.618e-05 [graph_reusing]: 5.87001e-06 [inline]: 2.39999e-06 [add_attr]: 0.0034421, [1] [add_attr_with_inline]: 0.00343104, [1] [Cycle 1]: 8.445e-05, [2] [tag_attr]: 1.736e-05 [meta_addattr_fg_expand]: 3.86999e-06 [parallel-infer-symbol]: 3.81999e-06 [pre_auto_parallel]: 3.302e-05 [insert-virtual-dataset]: 2.46e-06 [parallel-infer-symbol-second]: 6.99976e-07 [dataset_repeat_opt]: 2.21998e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.0054133, [53] [py_interpret_to_execute]: 2.543e-05 [rewriter_before_opt_a]: 5.508e-05 [opt_a]: 0.00283217, [2] [Cycle 1]: 0.0019707, [45] [expand_dump_flag]: 3.26999e-06 [switch_simplify]: 2.642e-05 [loop_unroll]: 1.437e-05 [a_1]: 0.00035136 [with_stream_mark]: 2.155e-05 [recompute_prepare]: 1.038e-05 [updatestate_depend_eliminate]: 4.05e-06 [updatestate_assign_eliminate]: 4.01001e-06 [updatestate_loads_eliminate]: 3.04999e-06 [parameter_eliminate]: 2.50997e-06 [a_2]: 0.00010968 [accelerated_algorithm]: 7.58999e-06 [shard]: 3.18e-06 [meta_shard_fg_expand]: 2.19001e-06 [shard_inline]: 6.29999e-06 [merge_send_recv]: 9.73998e-06 [auto_parallel]: 8.59e-06 [parallel]: 1.979e-05 [flash_sp]: 1.022e-05 [merge_comm]: 3.76001e-06 [allreduce_fusion]: 3.70998e-06 [matmul_add_comm_reduction]: 1.131e-05 [allreduce_slice_to_reducescatter]: 6.40022e-07 [virtual_shard_identity]: 1.094e-05 [virtual_dataset]: 6.73e-06 [get_grad_eliminate_]: 6.68e-06 [virtual_output]: 6.68e-06 [merge_forward]: 5.25001e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 1.132e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.123e-05 [merge_recompute_call_nodes]: 1.87999e-06 [before_grad]: 1.301e-05 [set_forward_comm_id_for_comm_node_pass]: 5.56002e-06 [meta_fg_expand]: 2.79999e-06 [flash_sp_send_recv_attached]: 3.84002e-06 [receive_attached]: 2.73e-06 [after_resolve]: 1.318e-05 [a_after_grad]: 9.49e-06 [renormalize]: 0.00066624 [add_forward_monad_depend]: 6.23998e-06 [auto_monad_grad]: 2.97002e-06 [auto_monad_eliminator]: 1.767e-05 [cse]: 3.072e-05 [a_3]: 6.325e-05 [Cycle 2]: 0.00084535, [45] [expand_dump_flag]: 1.31002e-06 [switch_simplify]: 7.58001e-06 [loop_unroll]: 6.01e-06 [a_1]: 0.00010717 [with_stream_mark]: 1.364e-05 [recompute_prepare]: 6.19001e-06 [updatestate_depend_eliminate]: 3.03998e-06 [updatestate_assign_eliminate]: 2.58e-06 [updatestate_loads_eliminate]: 3.03e-06 [parameter_eliminate]: 1.60999e-06 [a_2]: 9.864e-05 [accelerated_algorithm]: 7.71999e-06 [shard]: 1.35001e-06 [meta_shard_fg_expand]: 1.55999e-06 [shard_inline]: 6.38e-06 [merge_send_recv]: 6.51e-06 [auto_parallel]: 7.28e-06 [parallel]: 5.52001e-06 [flash_sp]: 3.61001e-06 [merge_comm]: 3.58e-06 [allreduce_fusion]: 3.67998e-06 [matmul_add_comm_reduction]: 7.66999e-06 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 7.98999e-06 [virtual_dataset]: 6.07999e-06 [get_grad_eliminate_]: 5.62001e-06 [virtual_output]: 5.51e-06 [merge_forward]: 3.66001e-06 [cell_reuse_recompute_pass]: 1.71002e-06 [offload_activation]: 8.70999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.737e-05 [merge_recompute_call_nodes]: 1.62001e-06 [before_grad]: 9.94001e-06 [set_forward_comm_id_for_comm_node_pass]: 4.73001e-06 [meta_fg_expand]: 2.41e-06 [flash_sp_send_recv_attached]: 1.15999e-06 [receive_attached]: 1.32999e-06 [after_resolve]: 9.53997e-06 [a_after_grad]: 9.20001e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.59001e-06 [auto_monad_grad]: 1.70001e-06 [auto_monad_eliminator]: 1.074e-05 [cse]: 1.841e-05 [a_3]: 5.098e-05 [py_interpret_to_execute_after_opt_a]: 1.683e-05 [slice_cell_reuse_recomputed_activation]: 4.74998e-06 [rewriter_after_opt_a]: 4.848e-05 [convert_after_rewriter]: 1.085e-05 [order_py_execute_after_rewriter]: 8.65999e-06 [mutable_eliminate]: 0.00063109 [opt_b]: 0.00033063, [1] [Cycle 1]: 0.00031967, [7] [b_1]: 0.00020584 [b_2]: 9.00001e-06 [updatestate_depend_eliminate]: 8.38999e-06 [updatestate_assign_eliminate]: 2.88e-06 [updatestate_loads_eliminate]: 2.77002e-06 [renormalize]: 6.80011e-07 [cse]: 2.786e-05 [optimize_parallel_all_gather_comm]: 2.184e-05 [overlap_param_gather]: 4.80999e-06 [cconv]: 3.594e-05 [loop_unroll]: 0.00053185 [opt_after_cconv]: 0.00013344, [1] [Cycle 1]: 0.00012355, [7] [c_1]: 2.84e-05 [parameter_eliminate]: 5.22999e-06 [updatestate_depend_eliminate]: 6.37001e-06 [updatestate_assign_eliminate]: 3.01999e-06 [updatestate_loads_eliminate]: 2.48998e-06 [cse]: 2.171e-05 [renormalize]: 5.79981e-07 [remove_dup_value]: 1.874e-05 [tuple_transform]: 8.84e-05, [1] [Cycle 1]: 8.061e-05, [4] [d_1]: 4.03e-05 [none_parameter_eliminate]: 1.60999e-06 [renormalize]: 3.60014e-07 [switch_simplify]: 6.81999e-06 [partial_unused_args_eliminate]: 5.16998e-06 [add_recomputation]: 5.508e-05 [cse_after_recomputation]: 3.214e-05, [1] [Cycle 1]: 2.481e-05, [1] [cse]: 1.434e-05 [environ_conv]: 8.77999e-06 [swap_dp_allreduce_reducescatter]: 8.28001e-06 [bias_add_comm_swap]: 5.15001e-06 [label_micro_interleaved_index]: 7.78001e-06 [label_fine_grained_interleaved_index]: 6.02999e-06 [merge_cast_opt]: 4.01001e-06 [slice_recompute_activation]: 4.67e-06 [micro_interleaved_order_control]: 4.48001e-06 [assign_add_opt]: 3.73001e-06 [ForceFp32Comm]: 3.11999e-06 [remove_cast_before_assign_add]: 3.33998e-06 [full_micro_interleaved_order_control]: 4.57998e-06 [reorder_send_recv_between_fp_bp]: 5.63002e-06 [comm_op_add_attrs]: 3.71999e-06 [add_comm_op_reuse_tag]: 3.3e-06 [interleave_split_concat_branches]: 3.45e-06 [interleave_parallel_branches]: 3.6e-06 [overlap_opt_shard_in_pipeline]: 3.76999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.66002e-06 [control_data_broadcast_order]: 1.866e-05 [grouped_pairwise_exchange_alltoall]: 4.12e-06 [offloading_packed_experts]: 7.13e-06 [overlap_recompute_and_grad_model_parallel]: 7.6e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.57997e-06 [overlap_recompute_allgather_and_fa_grad]: 3.98999e-06 [overlap_recompute_comm]: 5.17999e-06 [overlap_grad_ring_attention]: 6.83e-06 [overlap_grad_flash_sp]: 2.466e-05 [begin_end_overlap_inline]: 2.99999e-06 [split_matmul_comm_elemetwise]: 4.4e-06 [split_layernorm_comm]: 4.37e-06 [handle_group_info]: 3.28e-06 [symbol_engine_optimizer]: 0.00010598, [1] [Cycle 1]: 9.915e-05, [6] [build]: 3.29001e-06 [elim_shapecalc]: 1.214e-05 [elim_not_effective]: 1.442e-05 [opt_reshape]: 7.34002e-06 [fold_const_symbol]: 1.076e-05 [renormalize]: 2.10013e-07 [detach_backward]: 4.87e-06 [pipeline_parallel_scheduler]: 1.99e-06 [auto_monad_reorder]: 2.105e-05 [get_jit_bprop_graph]: 2.34001e-06 [rewriter_after_jit_bprop_graph]: 6.63e-06 [opt_after_jit_grad]: 0.00054514 [validate]: 3.994e-05 Sums bootstrap : 0.000458s : 4.14% type_inference : 0.005706s : 51.58% event_method : 0.000013s : 0.12% auto_monad : 0.000056s : 0.51% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000017s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.03% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000033s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000025s : 0.23% optimize.rewriter_before_opt_a : 0.000055s : 0.50% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000034s : 0.31% optimize.opt_a.loop_unroll : 0.000020s : 0.18% optimize.opt_a.a_1 : 0.000459s : 4.14% optimize.opt_a.with_stream_mark : 0.000035s : 0.32% optimize.opt_a.recompute_prepare : 0.000017s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000208s : 1.88% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.14% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.11% optimize.opt_a.merge_send_recv : 0.000016s : 0.15% optimize.opt_a.auto_parallel : 0.000016s : 0.14% optimize.opt_a.parallel : 0.000025s : 0.23% optimize.opt_a.flash_sp : 0.000014s : 0.13% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.17% optimize.opt_a.virtual_dataset : 0.000013s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.35% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000023s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.05% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000023s : 0.21% optimize.opt_a.a_after_grad : 0.000019s : 0.17% optimize.opt_a.renormalize : 0.000666s : 6.02% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.08% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.26% optimize.opt_a.cse : 0.000049s : 0.44% optimize.opt_a.a_3 : 0.000114s : 1.03% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000048s : 0.44% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000631s : 5.70% optimize.opt_b.b_1 : 0.000206s : 1.86% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000028s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000036s : 0.32% optimize.loop_unroll : 0.000532s : 4.81% optimize.opt_after_cconv.c_1 : 0.000028s : 0.26% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000022s : 0.20% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000019s : 0.17% optimize.tuple_transform.d_1 : 0.000040s : 0.36% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000055s : 0.50% optimize.cse_after_recomputation.cse : 0.000014s : 0.13% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000004s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000019s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000025s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000021s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000007s : 0.06% opt_after_jit_grad : 0.000545s : 4.93% validate : 0.000040s : 0.36% Time group info: ------[substitution.] 0.000165 20 1.10% : 0.000002s : 2: substitution.elim_not_effective 0.98% : 0.000002s : 2: substitution.fold_const_symbol 3.69% : 0.000006s : 3: substitution.graph_param_transform 66.04% : 0.000109s : 2: substitution.inline 2.57% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.43% : 0.000006s : 4: substitution.remove_not_recompute_node 2.51% : 0.000004s : 2: substitution.replace_old_param 19.66% : 0.000032s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005650 2 91.58% : 0.005174s : 1: type_inference.infer 8.42% : 0.000476s : 1: type_inference.specialize ------[replace.] 0.000025 2 100.00% : 0.000025s : 2: replace.inline ------[match.] 0.000106 2 100.00% : 0.000106s : 2: match.inline ------[predicate.] 0.000143 754 0.96% : 0.000001s : 7: predicate.accumulaten_eliminater 1.09% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.59% : 0.000001s : 6: predicate.addn_check_dump 0.69% : 0.000001s : 7: predicate.addn_zero_filter 0.68% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.14% : 0.000003s : 13: predicate.arithmetic_simplify 0.72% : 0.000001s : 7: predicate.cast_eliminate 0.77% : 0.000001s : 6: predicate.check_bprop_eliminate 0.64% : 0.000001s : 6: predicate.compare_switch_simplify 0.20% : 0.000000s : 3: predicate.const_output_eliminate 0.91% : 0.000001s : 6: predicate.depend_value_elim 0.70% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.96% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.91% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.41% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 3: predicate.elim_not_effective 0.45% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000002s : 10: predicate.environ_add_const_eliminate 1.02% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.01% : 0.000001s : 10: predicate.environ_get_depend_swap 1.79% : 0.000003s : 16: predicate.environ_get_eliminate 0.95% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.84% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.86% : 0.000003s : 9: predicate.float_depend_g_call 0.68% : 0.000001s : 6: predicate.float_environ_get_switch 0.98% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.80% : 0.000001s : 6: predicate.get_grad_eliminate 0.36% : 0.000001s : 3: predicate.graph_param_transform 0.77% : 0.000001s : 6: predicate.incorporate_call 0.66% : 0.000001s : 6: predicate.incorporate_call_switch 6.97% : 0.000010s : 34: predicate.inline 1.35% : 0.000002s : 6: predicate.inline_without_move 0.42% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.34% : 0.000002s : 6: predicate.less_batch_normalization 1.52% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.87% : 0.000003s : 20: predicate.load_eliminater 1.35% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.62% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.81% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.73% : 0.000001s : 6: predicate.merge_addn 0.69% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.70% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.64% : 0.000001s : 7: predicate.minmaximum_grad 1.97% : 0.000003s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.65% : 0.000001s : 3: predicate.parallel_virtual_node 1.21% : 0.000002s : 9: predicate.partial_defer_inline 1.19% : 0.000002s : 10: predicate.partial_eliminate 0.70% : 0.000001s : 7: predicate.print_const_string_wrapper 0.70% : 0.000001s : 6: predicate.reduce_all_const_elim 1.01% : 0.000001s : 7: predicate.reduce_eliminate 1.93% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.78% : 0.000001s : 6: predicate.remove_not_recompute_node 1.23% : 0.000002s : 13: predicate.replace_applicator 0.79% : 0.000001s : 6: predicate.replace_old_param 0.47% : 0.000001s : 3: predicate.reset_defer_inline 0.82% : 0.000001s : 7: predicate.reshape_eliminate 0.79% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.66% : 0.000001s : 3: predicate.row_tensor_eliminate 1.15% : 0.000002s : 6: predicate.same_eliminate 0.49% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.19% : 0.000002s : 6: predicate.shard_identity_eliminate 0.88% : 0.000001s : 6: predicate.special_op_eliminate 0.94% : 0.000001s : 6: predicate.specialize_transform 1.25% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 1.15% : 0.000002s : 6: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.97% : 0.000001s : 9: predicate.switch_defer_inline 1.61% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.26% : 0.000006s : 32: predicate.switch_simplify 0.90% : 0.000001s : 7: predicate.tile_eliminate 1.01% : 0.000001s : 7: predicate.transpose_eliminate 1.39% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.38% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.30% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 2.92% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.45% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.27% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.44% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.18% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.85% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.65% : 0.000001s : 3: predicate.value_based_eliminate 0.84% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.71% : 0.000001s : 6: predicate.virtual_output_eliminate 0.36% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.59% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000247 5 8.54% : 0.000021s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.46% : 0.000226s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026936 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.82% : 0.003453s : 1: add_attr 12.75% : 0.003435s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000059s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.29% : 0.000078s : 1: auto_monad 0.11% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.87% : 0.000504s : 1: bootstrap 0.15% : 0.000039s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000022s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.09% : 0.000024s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.09% : 0.000024s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000013s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 2.00% : 0.000540s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.37% : 0.000639s : 1: mutable_eliminate 0.04% : 0.000011s : 1: offloading_packed_experts 0.07% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000021s : 1: opt.transform.mutable_eliminate 3.17% : 0.000853s : 78: opt.transform.opt_a 0.10% : 0.000027s : 1: opt.transform.opt_after_cconv 0.11% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.51% : 0.000138s : 28: opt.transform.opt_b 0.17% : 0.000045s : 2: opt.transform.opt_trans_graph 0.15% : 0.000040s : 4: opt.transform.symbol_engine_opt 10.53% : 0.002836s : 1: opt_a 0.51% : 0.000137s : 1: opt_after_cconv 2.07% : 0.000557s : 1: opt_after_jit_grad 1.24% : 0.000335s : 1: opt_b 21.42% : 0.005769s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000041s : 1: pre_auto_parallel 0.11% : 0.000029s : 1: py_interpret_to_execute 0.07% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.38% : 0.000371s : 1: renormalize.infer 1.06% : 0.000287s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.05% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000052s : 1: rewriter_after_opt_a 0.22% : 0.000059s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.41% : 0.000109s : 1: symbol_engine_optimizer 0.34% : 0.000092s : 1: tuple_transform 21.31% : 0.005739s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:53.462.617 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.014517, [21] [bootstrap]: 0.00044553 [type_inference]: 0.005456 [event_method]: 1.262e-05 [auto_monad]: 5.519e-05 [graph_reusing]: 5.22999e-06 [inline]: 2.77002e-06 [add_attr]: 0.00328831, [1] [add_attr_with_inline]: 0.00327905, [1] [Cycle 1]: 5.756e-05, [2] [tag_attr]: 1.536e-05 [meta_addattr_fg_expand]: 3.63999e-06 [parallel-infer-symbol]: 3.63999e-06 [pre_auto_parallel]: 2.804e-05 [insert-virtual-dataset]: 2.48e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 2.64001e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00448726, [53] [py_interpret_to_execute]: 1.879e-05 [rewriter_before_opt_a]: 4.768e-05 [opt_a]: 0.00234029, [2] [Cycle 1]: 0.00167765, [45] [expand_dump_flag]: 2.81e-06 [switch_simplify]: 2.606e-05 [loop_unroll]: 1.391e-05 [a_1]: 0.00030184 [with_stream_mark]: 1.97e-05 [recompute_prepare]: 1.028e-05 [updatestate_depend_eliminate]: 4e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 3.12002e-06 [parameter_eliminate]: 2.34001e-06 [a_2]: 8.038e-05 [accelerated_algorithm]: 7.48e-06 [shard]: 2.74999e-06 [meta_shard_fg_expand]: 1.73997e-06 [shard_inline]: 6.38e-06 [merge_send_recv]: 9.62999e-06 [auto_parallel]: 6.88998e-06 [parallel]: 2.027e-05 [flash_sp]: 8.99e-06 [merge_comm]: 4.63001e-06 [allreduce_fusion]: 3.30998e-06 [matmul_add_comm_reduction]: 1.032e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 1.023e-05 [virtual_dataset]: 6.60002e-06 [get_grad_eliminate_]: 6.06e-06 [virtual_output]: 5.66e-06 [merge_forward]: 4.15999e-06 [cell_reuse_recompute_pass]: 1.32999e-06 [offload_activation]: 1.053e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.57e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 1.204e-05 [set_forward_comm_id_for_comm_node_pass]: 3.86001e-06 [meta_fg_expand]: 2.79001e-06 [flash_sp_send_recv_attached]: 2.81999e-06 [receive_attached]: 2.58e-06 [after_resolve]: 1.06e-05 [a_after_grad]: 9.15001e-06 [renormalize]: 0.00065291 [add_forward_monad_depend]: 6.11998e-06 [auto_monad_grad]: 2.63e-06 [auto_monad_eliminator]: 1.623e-05 [cse]: 3.217e-05 [a_3]: 4.753e-05 [Cycle 2]: 0.00065205, [45] [expand_dump_flag]: 1.05999e-06 [switch_simplify]: 7.6e-06 [loop_unroll]: 6.09999e-06 [a_1]: 0.00010454 [with_stream_mark]: 1.724e-05 [recompute_prepare]: 6.86999e-06 [updatestate_depend_eliminate]: 3.81001e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.99999e-06 [parameter_eliminate]: 1.66e-06 [a_2]: 7.093e-05 [accelerated_algorithm]: 7.7e-06 [shard]: 1.91e-06 [meta_shard_fg_expand]: 1.54e-06 [shard_inline]: 6.19001e-06 [merge_send_recv]: 5.37001e-06 [auto_parallel]: 6.98e-06 [parallel]: 5.51e-06 [flash_sp]: 3.61001e-06 [merge_comm]: 3.63999e-06 [allreduce_fusion]: 3.09001e-06 [matmul_add_comm_reduction]: 6.84999e-06 [allreduce_slice_to_reducescatter]: 5.3001e-07 [virtual_shard_identity]: 8.58001e-06 [virtual_dataset]: 5.87999e-06 [get_grad_eliminate_]: 5.35999e-06 [virtual_output]: 5.46e-06 [merge_forward]: 3.63e-06 [cell_reuse_recompute_pass]: 1.55999e-06 [offload_activation]: 7.28e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.392e-05 [merge_recompute_call_nodes]: 9.50007e-07 [before_grad]: 9.77001e-06 [set_forward_comm_id_for_comm_node_pass]: 4.1e-06 [meta_fg_expand]: 2.48998e-06 [flash_sp_send_recv_attached]: 1.13001e-06 [receive_attached]: 1.29998e-06 [after_resolve]: 9.36e-06 [a_after_grad]: 8.21002e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 2.26998e-06 [auto_monad_grad]: 1.19e-06 [auto_monad_eliminator]: 9.77001e-06 [cse]: 1.654e-05 [a_3]: 3.634e-05 [py_interpret_to_execute_after_opt_a]: 1.083e-05 [slice_cell_reuse_recomputed_activation]: 2.01e-06 [rewriter_after_opt_a]: 3.81e-05 [convert_after_rewriter]: 6.68e-06 [order_py_execute_after_rewriter]: 5.19e-06 [mutable_eliminate]: 0.00057242 [opt_b]: 0.00024687, [1] [Cycle 1]: 0.00024029, [7] [b_1]: 0.00015597 [b_2]: 8.70001e-06 [updatestate_depend_eliminate]: 7.05002e-06 [updatestate_assign_eliminate]: 2.27999e-06 [updatestate_loads_eliminate]: 2.36998e-06 [renormalize]: 6.10016e-07 [cse]: 2.358e-05 [optimize_parallel_all_gather_comm]: 1.667e-05 [overlap_param_gather]: 1.94999e-06 [cconv]: 2.813e-05 [loop_unroll]: 0.00046428 [opt_after_cconv]: 0.00010901, [1] [Cycle 1]: 0.00010308, [7] [c_1]: 2.807e-05 [parameter_eliminate]: 3.78999e-06 [updatestate_depend_eliminate]: 6.05002e-06 [updatestate_assign_eliminate]: 3.49001e-06 [updatestate_loads_eliminate]: 2.31e-06 [cse]: 2.416e-05 [renormalize]: 6.80011e-07 [remove_dup_value]: 1.457e-05 [tuple_transform]: 7.175e-05, [1] [Cycle 1]: 6.665e-05, [4] [d_1]: 3.919e-05 [none_parameter_eliminate]: 1.67999e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 6.58e-06 [partial_unused_args_eliminate]: 1.94999e-06 [add_recomputation]: 5.227e-05 [cse_after_recomputation]: 2.294e-05, [1] [Cycle 1]: 1.866e-05, [1] [cse]: 1.248e-05 [environ_conv]: 5.49e-06 [swap_dp_allreduce_reducescatter]: 4.98001e-06 [bias_add_comm_swap]: 2.67001e-06 [label_micro_interleaved_index]: 4.4e-06 [label_fine_grained_interleaved_index]: 2.78e-06 [merge_cast_opt]: 1.32e-06 [slice_recompute_activation]: 1.97001e-06 [micro_interleaved_order_control]: 2.34001e-06 [assign_add_opt]: 1.34e-06 [ForceFp32Comm]: 8.09989e-07 [remove_cast_before_assign_add]: 9.89996e-07 [full_micro_interleaved_order_control]: 2.51998e-06 [reorder_send_recv_between_fp_bp]: 2.73e-06 [comm_op_add_attrs]: 1.03001e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.32e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.18001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.09999e-06 [control_data_broadcast_order]: 1.295e-05 [grouped_pairwise_exchange_alltoall]: 2.11e-06 [offloading_packed_experts]: 4.23999e-06 [overlap_recompute_and_grad_model_parallel]: 5.17e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.30999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.45999e-06 [overlap_recompute_comm]: 2.14e-06 [overlap_grad_ring_attention]: 3.95e-06 [overlap_grad_flash_sp]: 2.124e-05 [begin_end_overlap_inline]: 5.59987e-07 [split_matmul_comm_elemetwise]: 2.36e-06 [split_layernorm_comm]: 1.92999e-06 [handle_group_info]: 1.25999e-06 [symbol_engine_optimizer]: 8.408e-05, [1] [Cycle 1]: 7.923e-05, [6] [build]: 3.16999e-06 [elim_shapecalc]: 1.255e-05 [elim_not_effective]: 1.428e-05 [opt_reshape]: 7.23999e-06 [fold_const_symbol]: 9.97001e-06 [renormalize]: 2.50002e-07 [detach_backward]: 2.19999e-06 [pipeline_parallel_scheduler]: 1.76e-06 [auto_monad_reorder]: 1.899e-05 [get_jit_bprop_graph]: 2.17999e-06 [rewriter_after_jit_bprop_graph]: 4.49002e-06 [opt_after_jit_grad]: 0.00049501 [validate]: 4.067e-05 Sums bootstrap : 0.000446s : 4.36% type_inference : 0.005456s : 53.44% event_method : 0.000013s : 0.12% auto_monad : 0.000055s : 0.54% graph_reusing : 0.000005s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000028s : 0.27% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000003s : 0.03% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000019s : 0.18% optimize.rewriter_before_opt_a : 0.000048s : 0.47% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000034s : 0.33% optimize.opt_a.loop_unroll : 0.000020s : 0.20% optimize.opt_a.a_1 : 0.000406s : 3.98% optimize.opt_a.with_stream_mark : 0.000037s : 0.36% optimize.opt_a.recompute_prepare : 0.000017s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000151s : 1.48% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.15% optimize.opt_a.shard : 0.000005s : 0.05% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.12% optimize.opt_a.merge_send_recv : 0.000015s : 0.15% optimize.opt_a.auto_parallel : 0.000014s : 0.14% optimize.opt_a.parallel : 0.000026s : 0.25% optimize.opt_a.flash_sp : 0.000013s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000006s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.18% optimize.opt_a.virtual_dataset : 0.000012s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.11% optimize.opt_a.virtual_output : 0.000011s : 0.11% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000020s : 0.20% optimize.opt_a.a_after_grad : 0.000017s : 0.17% optimize.opt_a.renormalize : 0.000653s : 6.40% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.25% optimize.opt_a.cse : 0.000049s : 0.48% optimize.opt_a.a_3 : 0.000084s : 0.82% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000038s : 0.37% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000572s : 5.61% optimize.opt_b.b_1 : 0.000156s : 1.53% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000024s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000028s : 0.28% optimize.loop_unroll : 0.000464s : 4.55% optimize.opt_after_cconv.c_1 : 0.000028s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000024s : 0.24% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000015s : 0.14% optimize.tuple_transform.d_1 : 0.000039s : 0.38% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000052s : 0.51% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000019s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000495s : 4.85% validate : 0.000041s : 0.40% Time group info: ------[substitution.] 0.000148 20 1.32% : 0.000002s : 2: substitution.elim_not_effective 0.90% : 0.000001s : 2: substitution.fold_const_symbol 3.62% : 0.000005s : 3: substitution.graph_param_transform 65.92% : 0.000097s : 2: substitution.inline 3.19% : 0.000005s : 4: substitution.j_node_and_user_rematch 3.39% : 0.000005s : 4: substitution.remove_not_recompute_node 2.41% : 0.000004s : 2: substitution.replace_old_param 19.25% : 0.000028s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005405 2 91.57% : 0.004949s : 1: type_inference.infer 8.43% : 0.000456s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000096 2 100.00% : 0.000096s : 2: match.inline ------[predicate.] 0.000137 754 0.82% : 0.000001s : 7: predicate.accumulaten_eliminater 1.17% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.63% : 0.000001s : 6: predicate.addn_check_dump 0.79% : 0.000001s : 7: predicate.addn_zero_filter 0.66% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.23% : 0.000003s : 13: predicate.arithmetic_simplify 0.73% : 0.000001s : 7: predicate.cast_eliminate 0.82% : 0.000001s : 6: predicate.check_bprop_eliminate 0.64% : 0.000001s : 6: predicate.compare_switch_simplify 0.23% : 0.000000s : 3: predicate.const_output_eliminate 0.73% : 0.000001s : 6: predicate.depend_value_elim 0.79% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.90% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.76% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.34% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 3: predicate.elim_not_effective 0.53% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000002s : 10: predicate.environ_add_const_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.12% : 0.000002s : 10: predicate.environ_get_depend_swap 1.91% : 0.000003s : 16: predicate.environ_get_eliminate 0.94% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.92% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.68% : 0.000002s : 9: predicate.float_depend_g_call 0.61% : 0.000001s : 6: predicate.float_environ_get_switch 0.94% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 3: predicate.fold_const_symbol 0.76% : 0.000001s : 6: predicate.get_grad_eliminate 0.23% : 0.000000s : 3: predicate.graph_param_transform 0.79% : 0.000001s : 6: predicate.incorporate_call 0.60% : 0.000001s : 6: predicate.incorporate_call_switch 6.70% : 0.000009s : 34: predicate.inline 1.26% : 0.000002s : 6: predicate.inline_without_move 0.35% : 0.000000s : 6: predicate.j_node_and_user_rematch 1.85% : 0.000003s : 6: predicate.less_batch_normalization 1.56% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.02% : 0.000003s : 20: predicate.load_eliminater 1.70% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.59% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.65% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.71% : 0.000001s : 6: predicate.merge_addn 0.71% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.76% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.65% : 0.000001s : 7: predicate.minmaximum_grad 1.58% : 0.000002s : 3: predicate.mutable_eliminate 0.59% : 0.000001s : 3: predicate.opt_reshape 0.57% : 0.000001s : 3: predicate.parallel_virtual_node 1.57% : 0.000002s : 9: predicate.partial_defer_inline 1.30% : 0.000002s : 10: predicate.partial_eliminate 0.74% : 0.000001s : 7: predicate.print_const_string_wrapper 0.71% : 0.000001s : 6: predicate.reduce_all_const_elim 1.06% : 0.000001s : 7: predicate.reduce_eliminate 2.02% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.71% : 0.000001s : 6: predicate.remove_not_recompute_node 1.19% : 0.000002s : 13: predicate.replace_applicator 0.66% : 0.000001s : 6: predicate.replace_old_param 0.36% : 0.000001s : 3: predicate.reset_defer_inline 0.86% : 0.000001s : 7: predicate.reshape_eliminate 0.73% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.55% : 0.000001s : 3: predicate.row_tensor_eliminate 0.87% : 0.000001s : 6: predicate.same_eliminate 0.63% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.36% : 0.000002s : 6: predicate.shard_identity_eliminate 0.86% : 0.000001s : 6: predicate.special_op_eliminate 0.86% : 0.000001s : 6: predicate.specialize_transform 1.27% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.02% : 0.000001s : 9: predicate.switch_defer_inline 1.56% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.40% : 0.000006s : 32: predicate.switch_simplify 0.74% : 0.000001s : 7: predicate.tile_eliminate 0.87% : 0.000001s : 7: predicate.transpose_eliminate 1.47% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.24% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.31% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.45% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.47% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.37% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.87% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.78% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.72% : 0.000001s : 3: predicate.value_based_eliminate 0.77% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.76% : 0.000001s : 6: predicate.virtual_output_eliminate 0.39% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.54% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000236 5 8.12% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.88% : 0.000217s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023896 192 0.02% : 0.000004s : 1: ForceFp32Comm 13.79% : 0.003294s : 1: add_attr 13.74% : 0.003283s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000057s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.25% : 0.000060s : 1: auto_monad 0.10% : 0.000024s : 1: auto_monad_reorder 0.02% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.99% : 0.000475s : 1: bootstrap 0.14% : 0.000032s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000017s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000026s : 1: cse_after_recomputation 0.02% : 0.000006s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.08% : 0.000020s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.98% : 0.000473s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.44% : 0.000582s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 3.28% : 0.000784s : 78: opt.transform.opt_a 0.11% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.55% : 0.000131s : 28: opt.transform.opt_b 0.18% : 0.000043s : 2: opt.transform.opt_trans_graph 0.16% : 0.000039s : 4: opt.transform.symbol_engine_opt 9.81% : 0.002344s : 1: opt_a 0.47% : 0.000113s : 1: opt_after_cconv 2.12% : 0.000506s : 1: opt_after_jit_grad 1.05% : 0.000251s : 1: opt_b 18.80% : 0.004492s : 1: optimize 0.09% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.11% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000032s : 1: pre_auto_parallel 0.09% : 0.000022s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000018s : 1: remove_dup_value 1.58% : 0.000377s : 1: renormalize.infer 1.12% : 0.000267s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000042s : 1: rewriter_after_opt_a 0.22% : 0.000052s : 1: rewriter_before_opt_a 0.02% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000087s : 1: symbol_engine_optimizer 0.31% : 0.000075s : 1: tuple_transform 22.91% : 0.005474s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:53.657.663 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:53.658.122 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0157577, [21] [bootstrap]: 0.00045599 [type_inference]: 0.00553652 [event_method]: 1.332e-05 [auto_monad]: 5.778e-05 [graph_reusing]: 5.17e-06 [inline]: 2.84999e-06 [add_attr]: 0.00313221, [1] [add_attr_with_inline]: 0.00312363, [1] [Cycle 1]: 6.803e-05, [2] [tag_attr]: 1.547e-05 [meta_addattr_fg_expand]: 4.05e-06 [parallel-infer-symbol]: 2.99001e-06 [pre_auto_parallel]: 2.63e-05 [insert-virtual-dataset]: 2.90002e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 2.18002e-06 [pipeline_split]: 1.77001e-06 [optimize]: 0.0053531, [53] [py_interpret_to_execute]: 2.128e-05 [rewriter_before_opt_a]: 5.207e-05 [opt_a]: 0.00290451, [2] [Cycle 1]: 0.00197218, [45] [expand_dump_flag]: 3.08998e-06 [switch_simplify]: 2.63e-05 [loop_unroll]: 1.494e-05 [a_1]: 0.00038072 [with_stream_mark]: 1.552e-05 [recompute_prepare]: 9.59999e-06 [updatestate_depend_eliminate]: 5.34e-06 [updatestate_assign_eliminate]: 3.79002e-06 [updatestate_loads_eliminate]: 3.58999e-06 [parameter_eliminate]: 2.21e-06 [a_2]: 0.0001243 [accelerated_algorithm]: 8.09997e-06 [shard]: 2.46e-06 [meta_shard_fg_expand]: 2.11e-06 [shard_inline]: 7.61001e-06 [merge_send_recv]: 9.21002e-06 [auto_parallel]: 7.21001e-06 [parallel]: 1.727e-05 [flash_sp]: 8.16002e-06 [merge_comm]: 4.62e-06 [allreduce_fusion]: 4.07e-06 [matmul_add_comm_reduction]: 1.055e-05 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 9.44e-06 [virtual_dataset]: 7.31001e-06 [get_grad_eliminate_]: 7.48e-06 [virtual_output]: 7.43e-06 [merge_forward]: 4.79002e-06 [cell_reuse_recompute_pass]: 1.45999e-06 [offload_activation]: 1.016e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.783e-05 [merge_recompute_call_nodes]: 1.37e-06 [before_grad]: 1.335e-05 [set_forward_comm_id_for_comm_node_pass]: 4.76002e-06 [meta_fg_expand]: 3.03998e-06 [flash_sp_send_recv_attached]: 2.49001e-06 [receive_attached]: 2.14e-06 [after_resolve]: 1.153e-05 [a_after_grad]: 1.09e-05 [renormalize]: 0.00066673 [add_forward_monad_depend]: 5.56e-06 [auto_monad_grad]: 2.36e-06 [auto_monad_eliminator]: 1.804e-05 [cse]: 3.505e-05 [a_3]: 7.206e-05 [Cycle 2]: 0.00091931, [45] [expand_dump_flag]: 1.15999e-06 [switch_simplify]: 8.90999e-06 [loop_unroll]: 7.29001e-06 [a_1]: 0.00015272 [with_stream_mark]: 1.38e-05 [recompute_prepare]: 7.93999e-06 [updatestate_depend_eliminate]: 4.24002e-06 [updatestate_assign_eliminate]: 3.01001e-06 [updatestate_loads_eliminate]: 2.91999e-06 [parameter_eliminate]: 1.10999e-06 [a_2]: 0.00011642 [accelerated_algorithm]: 7.1e-06 [shard]: 1.67999e-06 [meta_shard_fg_expand]: 2.02999e-06 [shard_inline]: 7.68001e-06 [merge_send_recv]: 6.12001e-06 [auto_parallel]: 7.03e-06 [parallel]: 5.15001e-06 [flash_sp]: 3.42997e-06 [merge_comm]: 4.26001e-06 [allreduce_fusion]: 4.47e-06 [matmul_add_comm_reduction]: 7.62002e-06 [allreduce_slice_to_reducescatter]: 5.19998e-07 [virtual_shard_identity]: 8.47998e-06 [virtual_dataset]: 6.99001e-06 [get_grad_eliminate_]: 6.76999e-06 [virtual_output]: 6.58e-06 [merge_forward]: 3.75e-06 [cell_reuse_recompute_pass]: 1.57001e-06 [offload_activation]: 8.17e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.639e-05 [merge_recompute_call_nodes]: 9.10019e-07 [before_grad]: 1.152e-05 [set_forward_comm_id_for_comm_node_pass]: 5.12999e-06 [meta_fg_expand]: 2.89999e-06 [flash_sp_send_recv_attached]: 1.32e-06 [receive_attached]: 1.30001e-06 [after_resolve]: 1.005e-05 [a_after_grad]: 1.066e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.91e-06 [auto_monad_grad]: 1.11002e-06 [auto_monad_eliminator]: 1.039e-05 [cse]: 2.236e-05 [a_3]: 5.811e-05 [py_interpret_to_execute_after_opt_a]: 1.451e-05 [slice_cell_reuse_recomputed_activation]: 4.70001e-06 [rewriter_after_opt_a]: 4.673e-05 [convert_after_rewriter]: 1.07e-05 [order_py_execute_after_rewriter]: 9.24e-06 [mutable_eliminate]: 0.00051401 [opt_b]: 0.00034369, [1] [Cycle 1]: 0.00033451, [7] [b_1]: 0.00022541 [b_2]: 9.05001e-06 [updatestate_depend_eliminate]: 7.18e-06 [updatestate_assign_eliminate]: 3.35e-06 [updatestate_loads_eliminate]: 3.14999e-06 [renormalize]: 6.40022e-07 [cse]: 2.583e-05 [optimize_parallel_all_gather_comm]: 2.243e-05 [overlap_param_gather]: 4.53001e-06 [cconv]: 2.889e-05 [loop_unroll]: 0.00050328 [opt_after_cconv]: 0.00014576, [1] [Cycle 1]: 0.00013722, [7] [c_1]: 3.574e-05 [parameter_eliminate]: 3.01001e-06 [updatestate_depend_eliminate]: 7.59002e-06 [updatestate_assign_eliminate]: 3.12002e-06 [updatestate_loads_eliminate]: 3.68999e-06 [cse]: 2.661e-05 [renormalize]: 6.10016e-07 [remove_dup_value]: 1.947e-05 [tuple_transform]: 9.908e-05, [1] [Cycle 1]: 9.219e-05, [4] [d_1]: 5.129e-05 [none_parameter_eliminate]: 1.76e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 8.22e-06 [partial_unused_args_eliminate]: 4.72998e-06 [add_recomputation]: 6.061e-05 [cse_after_recomputation]: 3.32e-05, [1] [Cycle 1]: 2.602e-05, [1] [cse]: 1.664e-05 [environ_conv]: 9.46e-06 [swap_dp_allreduce_reducescatter]: 8.99e-06 [bias_add_comm_swap]: 4.96997e-06 [label_micro_interleaved_index]: 7.99002e-06 [label_fine_grained_interleaved_index]: 5.81998e-06 [merge_cast_opt]: 4.08999e-06 [slice_recompute_activation]: 4.88001e-06 [micro_interleaved_order_control]: 4.55001e-06 [assign_add_opt]: 3.78001e-06 [ForceFp32Comm]: 3.23998e-06 [remove_cast_before_assign_add]: 3.45e-06 [full_micro_interleaved_order_control]: 4.47998e-06 [reorder_send_recv_between_fp_bp]: 5.10999e-06 [comm_op_add_attrs]: 3.7e-06 [add_comm_op_reuse_tag]: 3.25e-06 [interleave_split_concat_branches]: 3.83001e-06 [interleave_parallel_branches]: 3.41001e-06 [overlap_opt_shard_in_pipeline]: 3.5e-06 [overlap_opt_shard_grad_in_pipeline]: 4.43001e-06 [control_data_broadcast_order]: 1.829e-05 [grouped_pairwise_exchange_alltoall]: 4.37e-06 [offloading_packed_experts]: 7.26001e-06 [overlap_recompute_and_grad_model_parallel]: 8.57e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.71001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.67998e-06 [overlap_recompute_comm]: 4.49002e-06 [overlap_grad_ring_attention]: 7.08e-06 [overlap_grad_flash_sp]: 2.507e-05 [begin_end_overlap_inline]: 2.93998e-06 [split_matmul_comm_elemetwise]: 4.3e-06 [split_layernorm_comm]: 4.13001e-06 [handle_group_info]: 3.26001e-06 [symbol_engine_optimizer]: 0.00010769, [1] [Cycle 1]: 0.00010062, [6] [build]: 2.96999e-06 [elim_shapecalc]: 1.351e-05 [elim_not_effective]: 1.624e-05 [opt_reshape]: 8.27e-06 [fold_const_symbol]: 1.249e-05 [renormalize]: 3.00002e-07 [detach_backward]: 3.96001e-06 [pipeline_parallel_scheduler]: 1.94999e-06 [auto_monad_reorder]: 2.427e-05 [get_jit_bprop_graph]: 1.88002e-06 [rewriter_after_jit_bprop_graph]: 5.46998e-06 [opt_after_jit_grad]: 0.00052258 [validate]: 4.446e-05 Sums bootstrap : 0.000456s : 4.19% type_inference : 0.005537s : 50.86% event_method : 0.000013s : 0.12% auto_monad : 0.000058s : 0.53% graph_reusing : 0.000005s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000026s : 0.24% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000021s : 0.20% optimize.rewriter_before_opt_a : 0.000052s : 0.48% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000035s : 0.32% optimize.opt_a.loop_unroll : 0.000022s : 0.20% optimize.opt_a.a_1 : 0.000533s : 4.90% optimize.opt_a.with_stream_mark : 0.000029s : 0.27% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000241s : 2.21% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.14% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.14% optimize.opt_a.merge_send_recv : 0.000015s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.13% optimize.opt_a.parallel : 0.000022s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.16% optimize.opt_a.virtual_dataset : 0.000014s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000022s : 0.20% optimize.opt_a.a_after_grad : 0.000022s : 0.20% optimize.opt_a.renormalize : 0.000667s : 6.13% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.26% optimize.opt_a.cse : 0.000057s : 0.53% optimize.opt_a.a_3 : 0.000130s : 1.20% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000047s : 0.43% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000514s : 4.72% optimize.opt_b.b_1 : 0.000225s : 2.07% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000026s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000029s : 0.27% optimize.loop_unroll : 0.000503s : 4.62% optimize.opt_after_cconv.c_1 : 0.000036s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000027s : 0.24% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000019s : 0.18% optimize.tuple_transform.d_1 : 0.000051s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000061s : 0.56% optimize.cse_after_recomputation.cse : 0.000017s : 0.15% optimize.environ_conv : 0.000009s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000004s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000025s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000024s : 0.22% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000523s : 4.80% validate : 0.000044s : 0.41% Time group info: ------[substitution.] 0.000202 29 10.67% : 0.000022s : 2: substitution.cast_eliminate 1.30% : 0.000003s : 3: substitution.elim_not_effective 0.80% : 0.000002s : 3: substitution.fold_const_symbol 3.08% : 0.000006s : 4: substitution.graph_param_transform 64.99% : 0.000131s : 2: substitution.inline 2.25% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.09% : 0.000006s : 6: substitution.remove_not_recompute_node 1.78% : 0.000004s : 2: substitution.replace_old_param 12.03% : 0.000024s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005488 2 90.86% : 0.004986s : 1: type_inference.infer 9.14% : 0.000502s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000129 2 100.00% : 0.000129s : 2: match.inline ------[predicate.] 0.000171 980 0.83% : 0.000001s : 9: predicate.accumulaten_eliminater 1.03% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.64% : 0.000001s : 8: predicate.addn_check_dump 0.82% : 0.000001s : 9: predicate.addn_zero_filter 0.73% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.35% : 0.000004s : 17: predicate.arithmetic_simplify 0.91% : 0.000002s : 9: predicate.cast_eliminate 0.74% : 0.000001s : 8: predicate.check_bprop_eliminate 0.67% : 0.000001s : 8: predicate.compare_switch_simplify 0.22% : 0.000000s : 4: predicate.const_output_eliminate 0.77% : 0.000001s : 8: predicate.depend_value_elim 0.83% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.99% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.76% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.29% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.29% : 0.000000s : 4: predicate.elim_not_effective 0.59% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.32% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.11% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.14% : 0.000002s : 13: predicate.environ_get_depend_swap 1.91% : 0.000003s : 21: predicate.environ_get_eliminate 1.04% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.88% : 0.000001s : 11: predicate.exchange_switch_depend_value 1.71% : 0.000003s : 11: predicate.float_depend_g_call 0.71% : 0.000001s : 8: predicate.float_environ_get_switch 1.01% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.27% : 0.000000s : 4: predicate.fold_const_symbol 0.80% : 0.000001s : 8: predicate.get_grad_eliminate 0.36% : 0.000001s : 4: predicate.graph_param_transform 0.84% : 0.000001s : 8: predicate.incorporate_call 0.67% : 0.000001s : 8: predicate.incorporate_call_switch 6.72% : 0.000011s : 44: predicate.inline 1.07% : 0.000002s : 8: predicate.inline_without_move 0.38% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.98% : 0.000002s : 8: predicate.less_batch_normalization 1.80% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.10% : 0.000004s : 26: predicate.load_eliminater 1.14% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.48% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.81% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.73% : 0.000001s : 8: predicate.merge_addn 0.69% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.71% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.71% : 0.000001s : 9: predicate.minmaximum_grad 1.26% : 0.000002s : 4: predicate.mutable_eliminate 0.52% : 0.000001s : 4: predicate.opt_reshape 0.54% : 0.000001s : 4: predicate.parallel_virtual_node 1.18% : 0.000002s : 11: predicate.partial_defer_inline 1.24% : 0.000002s : 13: predicate.partial_eliminate 0.81% : 0.000001s : 9: predicate.print_const_string_wrapper 0.77% : 0.000001s : 8: predicate.reduce_all_const_elim 1.06% : 0.000002s : 9: predicate.reduce_eliminate 2.25% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 8: predicate.remove_not_recompute_node 1.12% : 0.000002s : 17: predicate.replace_applicator 0.73% : 0.000001s : 8: predicate.replace_old_param 0.35% : 0.000001s : 4: predicate.reset_defer_inline 0.99% : 0.000002s : 9: predicate.reshape_eliminate 0.78% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 4: predicate.row_tensor_eliminate 0.97% : 0.000002s : 8: predicate.same_eliminate 0.52% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.97% : 0.000002s : 8: predicate.shard_identity_eliminate 0.91% : 0.000002s : 8: predicate.special_op_eliminate 1.09% : 0.000002s : 8: predicate.specialize_transform 1.14% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.92% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.46% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.93% : 0.000002s : 11: predicate.switch_defer_inline 1.70% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.06% : 0.000007s : 39: predicate.switch_simplify 0.74% : 0.000001s : 9: predicate.tile_eliminate 0.82% : 0.000001s : 9: predicate.transpose_eliminate 1.67% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.54% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.03% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.50% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.41% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.57% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.02% : 0.000003s : 26: predicate.updatestate_pure_node_eliminater 3.07% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.68% : 0.000001s : 4: predicate.value_based_eliminate 0.74% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.92% : 0.000002s : 8: predicate.virtual_output_eliminate 0.35% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000229 5 7.82% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.18% : 0.000211s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026127 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.02% : 0.003141s : 1: add_attr 11.97% : 0.003127s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.25% : 0.000064s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000066s : 1: auto_monad 0.12% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.92% : 0.000501s : 1: bootstrap 0.12% : 0.000032s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.14% : 0.000036s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000021s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.09% : 0.000023s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000011s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.95% : 0.000510s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.99% : 0.000521s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 3.81% : 0.000995s : 78: opt.transform.opt_a 0.13% : 0.000034s : 1: opt.transform.opt_after_cconv 0.12% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.60% : 0.000156s : 28: opt.transform.opt_b 0.22% : 0.000057s : 2: opt.transform.opt_trans_graph 0.18% : 0.000046s : 4: opt.transform.symbol_engine_opt 11.13% : 0.002908s : 1: opt_a 0.57% : 0.000150s : 1: opt_after_cconv 2.04% : 0.000534s : 1: opt_after_jit_grad 1.33% : 0.000348s : 1: opt_b 21.72% : 0.005675s : 1: optimize 0.10% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000007s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000033s : 1: pre_auto_parallel 0.09% : 0.000025s : 1: py_interpret_to_execute 0.07% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000023s : 1: remove_dup_value 1.59% : 0.000416s : 1: renormalize.infer 0.93% : 0.000242s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000050s : 1: rewriter_after_opt_a 0.21% : 0.000055s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000111s : 1: symbol_engine_optimizer 0.39% : 0.000102s : 1: tuple_transform 21.30% : 0.005564s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:53.857.604 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0144073, [21] [bootstrap]: 0.00044044 [type_inference]: 0.00547401 [event_method]: 1.304e-05 [auto_monad]: 5.887e-05 [graph_reusing]: 4.97999e-06 [inline]: 2.46e-06 [add_attr]: 0.00310634, [1] [add_attr_with_inline]: 0.00309742, [1] [Cycle 1]: 5.447e-05, [2] [tag_attr]: 1.487e-05 [meta_addattr_fg_expand]: 3.77002e-06 [parallel-infer-symbol]: 3.06001e-06 [pre_auto_parallel]: 2.57e-05 [insert-virtual-dataset]: 2.59999e-06 [parallel-infer-symbol-second]: 6.50005e-07 [dataset_repeat_opt]: 1.86003e-06 [pipeline_split]: 1.57999e-06 [optimize]: 0.00451127, [53] [py_interpret_to_execute]: 1.802e-05 [rewriter_before_opt_a]: 5.06e-05 [opt_a]: 0.00243136, [2] [Cycle 1]: 0.00169139, [45] [expand_dump_flag]: 2.91e-06 [switch_simplify]: 2.776e-05 [loop_unroll]: 1.59e-05 [a_1]: 0.00034731 [with_stream_mark]: 1.61e-05 [recompute_prepare]: 9.81e-06 [updatestate_depend_eliminate]: 4.35e-06 [updatestate_assign_eliminate]: 4.17998e-06 [updatestate_loads_eliminate]: 4.68001e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 9.734e-05 [accelerated_algorithm]: 8.07e-06 [shard]: 1.97999e-06 [meta_shard_fg_expand]: 2.21e-06 [shard_inline]: 7.66999e-06 [merge_send_recv]: 8.95001e-06 [auto_parallel]: 6.89999e-06 [parallel]: 1.787e-05 [flash_sp]: 8.2e-06 [merge_comm]: 4.65001e-06 [allreduce_fusion]: 4.25999e-06 [matmul_add_comm_reduction]: 1.113e-05 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 9.52001e-06 [virtual_dataset]: 7.93999e-06 [get_grad_eliminate_]: 7.11001e-06 [virtual_output]: 1.005e-05 [merge_forward]: 4.70001e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 1.051e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.742e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.322e-05 [set_forward_comm_id_for_comm_node_pass]: 4.71002e-06 [meta_fg_expand]: 3.04999e-06 [flash_sp_send_recv_attached]: 2.99999e-06 [receive_attached]: 2.47001e-06 [after_resolve]: 1.107e-05 [a_after_grad]: 1.158e-05 [renormalize]: 0.0005627 [add_forward_monad_depend]: 5.50001e-06 [auto_monad_grad]: 2.31998e-06 [auto_monad_eliminator]: 1.683e-05 [cse]: 3.682e-05 [a_3]: 5.731e-05 [Cycle 2]: 0.00073021, [45] [expand_dump_flag]: 9.80013e-07 [switch_simplify]: 8.63001e-06 [loop_unroll]: 7.4e-06 [a_1]: 0.00015405 [with_stream_mark]: 1.578e-05 [recompute_prepare]: 7.65e-06 [updatestate_depend_eliminate]: 4.2e-06 [updatestate_assign_eliminate]: 2.98e-06 [updatestate_loads_eliminate]: 2.99999e-06 [parameter_eliminate]: 1.52001e-06 [a_2]: 8.805e-05 [accelerated_algorithm]: 7.16999e-06 [shard]: 1.10001e-06 [meta_shard_fg_expand]: 1.82001e-06 [shard_inline]: 7.51999e-06 [merge_send_recv]: 5.85002e-06 [auto_parallel]: 6.34001e-06 [parallel]: 5.31002e-06 [flash_sp]: 3.75998e-06 [merge_comm]: 3.93999e-06 [allreduce_fusion]: 3.98001e-06 [matmul_add_comm_reduction]: 7e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 8.26002e-06 [virtual_dataset]: 6.94001e-06 [get_grad_eliminate_]: 6.84999e-06 [virtual_output]: 6.56999e-06 [merge_forward]: 3.53999e-06 [cell_reuse_recompute_pass]: 1.71e-06 [offload_activation]: 9.02e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.556e-05 [merge_recompute_call_nodes]: 9.40025e-07 [before_grad]: 1.241e-05 [set_forward_comm_id_for_comm_node_pass]: 5.01002e-06 [meta_fg_expand]: 2.73e-06 [flash_sp_send_recv_attached]: 9.10019e-07 [receive_attached]: 1.14998e-06 [after_resolve]: 1.001e-05 [a_after_grad]: 1.022e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.07998e-06 [auto_monad_grad]: 9.30013e-07 [auto_monad_eliminator]: 8.01001e-06 [cse]: 1.895e-05 [a_3]: 4.291e-05 [py_interpret_to_execute_after_opt_a]: 9.62999e-06 [slice_cell_reuse_recomputed_activation]: 2.84999e-06 [rewriter_after_opt_a]: 4.081e-05 [convert_after_rewriter]: 7.64002e-06 [order_py_execute_after_rewriter]: 6.07999e-06 [mutable_eliminate]: 0.00048276 [opt_b]: 0.0002652, [1] [Cycle 1]: 0.00025894, [7] [b_1]: 0.00017553 [b_2]: 9.34e-06 [updatestate_depend_eliminate]: 7.05002e-06 [updatestate_assign_eliminate]: 2.94999e-06 [updatestate_loads_eliminate]: 3.03e-06 [renormalize]: 7.09988e-07 [cse]: 2.338e-05 [optimize_parallel_all_gather_comm]: 1.711e-05 [overlap_param_gather]: 2.12999e-06 [cconv]: 2.599e-05 [loop_unroll]: 0.00042963 [opt_after_cconv]: 0.00011391, [1] [Cycle 1]: 0.0001086, [7] [c_1]: 3.455e-05 [parameter_eliminate]: 3.43e-06 [updatestate_depend_eliminate]: 5.89999e-06 [updatestate_assign_eliminate]: 2.86999e-06 [updatestate_loads_eliminate]: 3.21999e-06 [cse]: 2.283e-05 [renormalize]: 3.39991e-07 [remove_dup_value]: 1.535e-05 [tuple_transform]: 8.068e-05, [1] [Cycle 1]: 7.607e-05, [4] [d_1]: 4.819e-05 [none_parameter_eliminate]: 1.59e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 8.24002e-06 [partial_unused_args_eliminate]: 1.73002e-06 [add_recomputation]: 5.5e-05 [cse_after_recomputation]: 2.602e-05, [1] [Cycle 1]: 2.111e-05, [1] [cse]: 1.56e-05 [environ_conv]: 7.11001e-06 [swap_dp_allreduce_reducescatter]: 7.35e-06 [bias_add_comm_swap]: 2.92002e-06 [label_micro_interleaved_index]: 5.19e-06 [label_fine_grained_interleaved_index]: 2.85998e-06 [merge_cast_opt]: 1.24998e-06 [slice_recompute_activation]: 2.03002e-06 [micro_interleaved_order_control]: 2.62001e-06 [assign_add_opt]: 1.29e-06 [ForceFp32Comm]: 8.40024e-07 [remove_cast_before_assign_add]: 1.17e-06 [full_micro_interleaved_order_control]: 2.16e-06 [reorder_send_recv_between_fp_bp]: 3.06999e-06 [comm_op_add_attrs]: 1.19998e-06 [add_comm_op_reuse_tag]: 9.79984e-07 [interleave_split_concat_branches]: 1.12999e-06 [interleave_parallel_branches]: 1.07998e-06 [overlap_opt_shard_in_pipeline]: 1.60001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.78002e-06 [control_data_broadcast_order]: 1.449e-05 [grouped_pairwise_exchange_alltoall]: 1.65001e-06 [offloading_packed_experts]: 5.32001e-06 [overlap_recompute_and_grad_model_parallel]: 5.46998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.47999e-06 [overlap_recompute_comm]: 2.17001e-06 [overlap_grad_ring_attention]: 4.81002e-06 [overlap_grad_flash_sp]: 2.186e-05 [begin_end_overlap_inline]: 8.10018e-07 [split_matmul_comm_elemetwise]: 2.26998e-06 [split_layernorm_comm]: 2.05002e-06 [handle_group_info]: 9.70002e-07 [symbol_engine_optimizer]: 8.998e-05, [1] [Cycle 1]: 8.553e-05, [6] [build]: 2.96001e-06 [elim_shapecalc]: 1.396e-05 [elim_not_effective]: 1.539e-05 [opt_reshape]: 8.14002e-06 [fold_const_symbol]: 1.239e-05 [renormalize]: 6.29982e-07 [detach_backward]: 1.86e-06 [pipeline_parallel_scheduler]: 1.74998e-06 [auto_monad_reorder]: 2.055e-05 [get_jit_bprop_graph]: 1.42999e-06 [rewriter_after_jit_bprop_graph]: 4.03999e-06 [opt_after_jit_grad]: 0.00053682 [validate]: 4.241e-05 Sums bootstrap : 0.000440s : 4.28% type_inference : 0.005474s : 53.18% event_method : 0.000013s : 0.13% auto_monad : 0.000059s : 0.57% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000026s : 0.25% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000018s : 0.18% optimize.rewriter_before_opt_a : 0.000051s : 0.49% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000036s : 0.35% optimize.opt_a.loop_unroll : 0.000023s : 0.23% optimize.opt_a.a_1 : 0.000501s : 4.87% optimize.opt_a.with_stream_mark : 0.000032s : 0.31% optimize.opt_a.recompute_prepare : 0.000017s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000185s : 1.80% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.15% optimize.opt_a.merge_send_recv : 0.000015s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.23% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.17% optimize.opt_a.virtual_dataset : 0.000015s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.14% optimize.opt_a.virtual_output : 0.000017s : 0.16% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000021s : 0.20% optimize.opt_a.a_after_grad : 0.000022s : 0.21% optimize.opt_a.renormalize : 0.000563s : 5.47% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.24% optimize.opt_a.cse : 0.000056s : 0.54% optimize.opt_a.a_3 : 0.000100s : 0.97% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.03% optimize.rewriter_after_opt_a : 0.000041s : 0.40% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000483s : 4.69% optimize.opt_b.b_1 : 0.000176s : 1.71% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000026s : 0.25% optimize.loop_unroll : 0.000430s : 4.17% optimize.opt_after_cconv.c_1 : 0.000035s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.22% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.15% optimize.tuple_transform.d_1 : 0.000048s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000055s : 0.53% optimize.cse_after_recomputation.cse : 0.000016s : 0.15% optimize.environ_conv : 0.000007s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.07% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.03% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000022s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.14% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.01% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000021s : 0.20% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000537s : 5.21% validate : 0.000042s : 0.41% Time group info: ------[substitution.] 0.000166 29 11.65% : 0.000019s : 2: substitution.cast_eliminate 1.40% : 0.000002s : 3: substitution.elim_not_effective 1.00% : 0.000002s : 3: substitution.fold_const_symbol 3.91% : 0.000006s : 4: substitution.graph_param_transform 59.86% : 0.000099s : 2: substitution.inline 2.74% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.93% : 0.000007s : 6: substitution.remove_not_recompute_node 2.26% : 0.000004s : 2: substitution.replace_old_param 13.24% : 0.000022s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005424 2 91.16% : 0.004944s : 1: type_inference.infer 8.84% : 0.000480s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000097 2 100.00% : 0.000097s : 2: match.inline ------[predicate.] 0.000169 980 0.78% : 0.000001s : 9: predicate.accumulaten_eliminater 1.13% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.66% : 0.000001s : 8: predicate.addn_check_dump 0.78% : 0.000001s : 9: predicate.addn_zero_filter 0.71% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.09% : 0.000004s : 17: predicate.arithmetic_simplify 0.94% : 0.000002s : 9: predicate.cast_eliminate 0.78% : 0.000001s : 8: predicate.check_bprop_eliminate 0.71% : 0.000001s : 8: predicate.compare_switch_simplify 0.28% : 0.000000s : 4: predicate.const_output_eliminate 0.79% : 0.000001s : 8: predicate.depend_value_elim 0.85% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.94% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.77% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.28% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 4: predicate.elim_not_effective 0.54% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.01% : 0.000002s : 13: predicate.environ_get_depend_swap 1.91% : 0.000003s : 21: predicate.environ_get_eliminate 1.10% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.95% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.78% : 0.000003s : 11: predicate.float_depend_g_call 0.68% : 0.000001s : 8: predicate.float_environ_get_switch 0.97% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.83% : 0.000001s : 8: predicate.get_grad_eliminate 0.26% : 0.000000s : 4: predicate.graph_param_transform 0.78% : 0.000001s : 8: predicate.incorporate_call 0.67% : 0.000001s : 8: predicate.incorporate_call_switch 6.32% : 0.000011s : 44: predicate.inline 1.10% : 0.000002s : 8: predicate.inline_without_move 0.44% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.98% : 0.000002s : 8: predicate.less_batch_normalization 1.61% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.09% : 0.000004s : 26: predicate.load_eliminater 1.26% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.59% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.79% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.79% : 0.000001s : 8: predicate.merge_addn 0.70% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.92% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.75% : 0.000001s : 9: predicate.minmaximum_grad 1.29% : 0.000002s : 4: predicate.mutable_eliminate 0.45% : 0.000001s : 4: predicate.opt_reshape 0.44% : 0.000001s : 4: predicate.parallel_virtual_node 1.28% : 0.000002s : 11: predicate.partial_defer_inline 1.30% : 0.000002s : 13: predicate.partial_eliminate 0.78% : 0.000001s : 9: predicate.print_const_string_wrapper 0.70% : 0.000001s : 8: predicate.reduce_all_const_elim 1.00% : 0.000002s : 9: predicate.reduce_eliminate 2.14% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.53% : 0.000001s : 8: predicate.remove_not_recompute_node 1.51% : 0.000003s : 17: predicate.replace_applicator 0.63% : 0.000001s : 8: predicate.replace_old_param 0.39% : 0.000001s : 4: predicate.reset_defer_inline 0.86% : 0.000001s : 9: predicate.reshape_eliminate 0.77% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.50% : 0.000001s : 4: predicate.row_tensor_eliminate 0.99% : 0.000002s : 8: predicate.same_eliminate 0.54% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.04% : 0.000002s : 8: predicate.shard_identity_eliminate 0.85% : 0.000001s : 8: predicate.special_op_eliminate 0.96% : 0.000002s : 8: predicate.specialize_transform 0.99% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.00% : 0.000002s : 11: predicate.switch_defer_inline 1.73% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.18% : 0.000007s : 39: predicate.switch_simplify 0.78% : 0.000001s : 9: predicate.tile_eliminate 0.87% : 0.000001s : 9: predicate.transpose_eliminate 1.51% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.22% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.54% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.69% : 0.000005s : 25: predicate.tuple_list_set_item_eliminator 1.58% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.06% : 0.000003s : 26: predicate.updatestate_pure_node_eliminater 2.93% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.69% : 0.000001s : 4: predicate.value_based_eliminate 0.88% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.91% : 0.000002s : 8: predicate.virtual_output_eliminate 0.38% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.72% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000226 5 8.12% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.88% : 0.000207s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023767 192 0.02% : 0.000004s : 1: ForceFp32Comm 13.09% : 0.003111s : 1: add_attr 13.05% : 0.003101s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.25% : 0.000059s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000063s : 1: auto_monad 0.11% : 0.000025s : 1: auto_monad_reorder 0.02% : 0.000004s : 1: begin_end_overlap_inline 0.03% : 0.000006s : 1: bias_add_comm_swap 1.98% : 0.000471s : 1: bootstrap 0.12% : 0.000030s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000011s : 1: convert_after_rewriter 0.12% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.08% : 0.000019s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000008s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.04% : 0.000008s : 1: label_micro_interleaved_index 1.84% : 0.000437s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 2.06% : 0.000491s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.07% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 4.06% : 0.000965s : 78: opt.transform.opt_a 0.14% : 0.000033s : 1: opt.transform.opt_after_cconv 0.13% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.64% : 0.000152s : 28: opt.transform.opt_b 0.23% : 0.000054s : 2: opt.transform.opt_trans_graph 0.19% : 0.000045s : 4: opt.transform.symbol_engine_opt 10.24% : 0.002435s : 1: opt_a 0.50% : 0.000118s : 1: opt_after_cconv 2.31% : 0.000549s : 1: opt_after_jit_grad 1.13% : 0.000269s : 1: opt_b 19.00% : 0.004516s : 1: optimize 0.09% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.11% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.12% : 0.000030s : 1: pre_auto_parallel 0.09% : 0.000022s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000005s : 1: remove_cast_before_assign_add 0.08% : 0.000019s : 1: remove_dup_value 1.34% : 0.000319s : 1: renormalize.infer 0.99% : 0.000236s : 1: renormalize.specialize 0.03% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000045s : 1: rewriter_after_opt_a 0.23% : 0.000054s : 1: rewriter_before_opt_a 0.02% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.05% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000093s : 1: symbol_engine_optimizer 0.35% : 0.000084s : 1: tuple_transform 23.10% : 0.005490s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:54.534.19 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:54.537.07 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0160321, [21] [bootstrap]: 0.00044839 [type_inference]: 0.00516837 [event_method]: 1.554e-05 [auto_monad]: 5.872e-05 [graph_reusing]: 6.06998e-06 [inline]: 2.57001e-06 [add_attr]: 0.00328948, [1] [add_attr_with_inline]: 0.0032793, [1] [Cycle 1]: 8.142e-05, [2] [tag_attr]: 1.623e-05 [meta_addattr_fg_expand]: 4.18001e-06 [parallel-infer-symbol]: 3.48e-06 [pre_auto_parallel]: 2.898e-05 [insert-virtual-dataset]: 2.56e-06 [parallel-infer-symbol-second]: 6.90023e-07 [dataset_repeat_opt]: 2.04e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.00570772, [53] [py_interpret_to_execute]: 2.478e-05 [rewriter_before_opt_a]: 5.282e-05 [opt_a]: 0.0031087, [2] [Cycle 1]: 0.00212823, [45] [expand_dump_flag]: 2.81e-06 [switch_simplify]: 2.754e-05 [loop_unroll]: 1.473e-05 [a_1]: 0.00035468 [with_stream_mark]: 1.682e-05 [recompute_prepare]: 1.045e-05 [updatestate_depend_eliminate]: 4.95001e-06 [updatestate_assign_eliminate]: 4.23999e-06 [updatestate_loads_eliminate]: 3.81999e-06 [parameter_eliminate]: 2.24001e-06 [a_2]: 0.00012464 [accelerated_algorithm]: 8.54002e-06 [shard]: 3.14001e-06 [meta_shard_fg_expand]: 2.44999e-06 [shard_inline]: 8.52e-06 [merge_send_recv]: 9.67999e-06 [auto_parallel]: 7.97e-06 [parallel]: 1.869e-05 [flash_sp]: 9.34e-06 [merge_comm]: 4.94e-06 [allreduce_fusion]: 4.16001e-06 [matmul_add_comm_reduction]: 1.077e-05 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 1.058e-05 [virtual_dataset]: 7.55e-06 [get_grad_eliminate_]: 8.34998e-06 [virtual_output]: 7.7e-06 [merge_forward]: 4.60001e-06 [cell_reuse_recompute_pass]: 1.38002e-06 [offload_activation]: 1.154e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.826e-05 [merge_recompute_call_nodes]: 1.67001e-06 [before_grad]: 1.444e-05 [set_forward_comm_id_for_comm_node_pass]: 4.65001e-06 [meta_fg_expand]: 3.53999e-06 [flash_sp_send_recv_attached]: 3.09001e-06 [receive_attached]: 2.27999e-06 [after_resolve]: 1.173e-05 [a_after_grad]: 1.091e-05 [renormalize]: 0.00080157 [add_forward_monad_depend]: 6.09999e-06 [auto_monad_grad]: 2.48e-06 [auto_monad_eliminator]: 1.932e-05 [cse]: 3.852e-05 [a_3]: 7.455e-05 [Cycle 2]: 0.0009651, [45] [expand_dump_flag]: 1.67999e-06 [switch_simplify]: 9.53002e-06 [loop_unroll]: 7.38999e-06 [a_1]: 0.00015521 [with_stream_mark]: 1.474e-05 [recompute_prepare]: 8.90999e-06 [updatestate_depend_eliminate]: 4.60999e-06 [updatestate_assign_eliminate]: 3.47002e-06 [updatestate_loads_eliminate]: 3.26001e-06 [parameter_eliminate]: 1.34e-06 [a_2]: 0.00011602 [accelerated_algorithm]: 7.35998e-06 [shard]: 1.55001e-06 [meta_shard_fg_expand]: 1.79e-06 [shard_inline]: 7.71999e-06 [merge_send_recv]: 7.63001e-06 [auto_parallel]: 7.82e-06 [parallel]: 5.85002e-06 [flash_sp]: 4.20999e-06 [merge_comm]: 4.51002e-06 [allreduce_fusion]: 4.38999e-06 [matmul_add_comm_reduction]: 9.41998e-06 [allreduce_slice_to_reducescatter]: 8.89995e-07 [virtual_shard_identity]: 1.014e-05 [virtual_dataset]: 6.94001e-06 [get_grad_eliminate_]: 6.67002e-06 [virtual_output]: 6.34001e-06 [merge_forward]: 4.57e-06 [cell_reuse_recompute_pass]: 1.72001e-06 [offload_activation]: 1.038e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.749e-05 [merge_recompute_call_nodes]: 1.23002e-06 [before_grad]: 1.295e-05 [set_forward_comm_id_for_comm_node_pass]: 5.60001e-06 [meta_fg_expand]: 2.88998e-06 [flash_sp_send_recv_attached]: 1.52001e-06 [receive_attached]: 1.57999e-06 [after_resolve]: 1.155e-05 [a_after_grad]: 1.226e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 2.72001e-06 [auto_monad_grad]: 1.44e-06 [auto_monad_eliminator]: 1.378e-05 [cse]: 2.507e-05 [a_3]: 5.945e-05 [py_interpret_to_execute_after_opt_a]: 1.727e-05 [slice_cell_reuse_recomputed_activation]: 5.34e-06 [rewriter_after_opt_a]: 5.075e-05 [convert_after_rewriter]: 1.119e-05 [order_py_execute_after_rewriter]: 9.01002e-06 [mutable_eliminate]: 0.00061246 [opt_b]: 0.00036798, [1] [Cycle 1]: 0.00035689, [7] [b_1]: 0.00023595 [b_2]: 9.59e-06 [updatestate_depend_eliminate]: 9.02999e-06 [updatestate_assign_eliminate]: 3.43e-06 [updatestate_loads_eliminate]: 3.59002e-06 [renormalize]: 7.2e-07 [cse]: 3.256e-05 [optimize_parallel_all_gather_comm]: 2.392e-05 [overlap_param_gather]: 4.60999e-06 [cconv]: 3.361e-05 [loop_unroll]: 0.00049187 [opt_after_cconv]: 0.00014844, [1] [Cycle 1]: 0.0001385, [7] [c_1]: 3.342e-05 [parameter_eliminate]: 4.00998e-06 [updatestate_depend_eliminate]: 7.3e-06 [updatestate_assign_eliminate]: 2.94999e-06 [updatestate_loads_eliminate]: 3.34001e-06 [cse]: 2.786e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 2.051e-05 [tuple_transform]: 9.796e-05, [1] [Cycle 1]: 9.061e-05, [4] [d_1]: 4.979e-05 [none_parameter_eliminate]: 1.67999e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 8.03001e-06 [partial_unused_args_eliminate]: 4.77e-06 [add_recomputation]: 6.03e-05 [cse_after_recomputation]: 3.343e-05, [1] [Cycle 1]: 2.688e-05, [1] [cse]: 1.69e-05 [environ_conv]: 1.001e-05 [swap_dp_allreduce_reducescatter]: 8.58001e-06 [bias_add_comm_swap]: 5.25999e-06 [label_micro_interleaved_index]: 7.71999e-06 [label_fine_grained_interleaved_index]: 5.00999e-06 [merge_cast_opt]: 3.73001e-06 [slice_recompute_activation]: 4.90999e-06 [micro_interleaved_order_control]: 4.48001e-06 [assign_add_opt]: 3.69002e-06 [ForceFp32Comm]: 3.09001e-06 [remove_cast_before_assign_add]: 3.45e-06 [full_micro_interleaved_order_control]: 4.92999e-06 [reorder_send_recv_between_fp_bp]: 5.23002e-06 [comm_op_add_attrs]: 3.58e-06 [add_comm_op_reuse_tag]: 3.43e-06 [interleave_split_concat_branches]: 3.49001e-06 [interleave_parallel_branches]: 3.58e-06 [overlap_opt_shard_in_pipeline]: 3.63e-06 [overlap_opt_shard_grad_in_pipeline]: 4.68999e-06 [control_data_broadcast_order]: 2.074e-05 [grouped_pairwise_exchange_alltoall]: 4.2e-06 [offloading_packed_experts]: 7.86001e-06 [overlap_recompute_and_grad_model_parallel]: 8.1e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.65e-06 [overlap_recompute_allgather_and_fa_grad]: 3.72998e-06 [overlap_recompute_comm]: 4.60999e-06 [overlap_grad_ring_attention]: 7.02002e-06 [overlap_grad_flash_sp]: 2.644e-05 [begin_end_overlap_inline]: 2.94001e-06 [split_matmul_comm_elemetwise]: 5.17e-06 [split_layernorm_comm]: 4.68001e-06 [handle_group_info]: 4.18001e-06 [symbol_engine_optimizer]: 0.00010999, [1] [Cycle 1]: 0.00010203, [6] [build]: 4.1e-06 [elim_shapecalc]: 1.204e-05 [elim_not_effective]: 1.642e-05 [opt_reshape]: 8.47998e-06 [fold_const_symbol]: 1.225e-05 [renormalize]: 2.30008e-07 [detach_backward]: 4.40999e-06 [pipeline_parallel_scheduler]: 2.17999e-06 [auto_monad_reorder]: 2.492e-05 [get_jit_bprop_graph]: 2.06998e-06 [rewriter_after_jit_bprop_graph]: 6.01e-06 [opt_after_jit_grad]: 0.0005461 [validate]: 4.558e-05 Sums bootstrap : 0.000448s : 4.14% type_inference : 0.005168s : 47.68% event_method : 0.000016s : 0.14% auto_monad : 0.000059s : 0.54% graph_reusing : 0.000006s : 0.06% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000029s : 0.27% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000025s : 0.23% optimize.rewriter_before_opt_a : 0.000053s : 0.49% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000037s : 0.34% optimize.opt_a.loop_unroll : 0.000022s : 0.20% optimize.opt_a.a_1 : 0.000510s : 4.70% optimize.opt_a.with_stream_mark : 0.000032s : 0.29% optimize.opt_a.recompute_prepare : 0.000019s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000241s : 2.22% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.15% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.15% optimize.opt_a.merge_send_recv : 0.000017s : 0.16% optimize.opt_a.auto_parallel : 0.000016s : 0.15% optimize.opt_a.parallel : 0.000025s : 0.23% optimize.opt_a.flash_sp : 0.000014s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.19% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.19% optimize.opt_a.virtual_dataset : 0.000014s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.14% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000022s : 0.20% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000027s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000023s : 0.21% optimize.opt_a.a_after_grad : 0.000023s : 0.21% optimize.opt_a.renormalize : 0.000802s : 7.39% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.31% optimize.opt_a.cse : 0.000064s : 0.59% optimize.opt_a.a_3 : 0.000134s : 1.24% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.16% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000051s : 0.47% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000612s : 5.65% optimize.opt_b.b_1 : 0.000236s : 2.18% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000033s : 0.30% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.22% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000034s : 0.31% optimize.loop_unroll : 0.000492s : 4.54% optimize.opt_after_cconv.c_1 : 0.000033s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000028s : 0.26% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000021s : 0.19% optimize.tuple_transform.d_1 : 0.000050s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000060s : 0.56% optimize.cse_after_recomputation.cse : 0.000017s : 0.16% optimize.environ_conv : 0.000010s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000004s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000021s : 0.19% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000008s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000026s : 0.24% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000005s : 0.04% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000025s : 0.23% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.06% opt_after_jit_grad : 0.000546s : 5.04% validate : 0.000046s : 0.42% Time group info: ------[substitution.] 0.000186 29 12.89% : 0.000024s : 2: substitution.cast_eliminate 1.43% : 0.000003s : 3: substitution.elim_not_effective 0.96% : 0.000002s : 3: substitution.fold_const_symbol 3.42% : 0.000006s : 4: substitution.graph_param_transform 56.85% : 0.000106s : 2: substitution.inline 2.88% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.05% : 0.000006s : 6: substitution.remove_not_recompute_node 2.30% : 0.000004s : 2: substitution.replace_old_param 16.22% : 0.000030s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005113 2 90.61% : 0.004633s : 1: type_inference.infer 9.39% : 0.000480s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000104 2 100.00% : 0.000104s : 2: match.inline ------[predicate.] 0.000176 980 0.74% : 0.000001s : 9: predicate.accumulaten_eliminater 1.00% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.66% : 0.000001s : 8: predicate.addn_check_dump 0.77% : 0.000001s : 9: predicate.addn_zero_filter 0.68% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.06% : 0.000004s : 17: predicate.arithmetic_simplify 0.90% : 0.000002s : 9: predicate.cast_eliminate 1.01% : 0.000002s : 8: predicate.check_bprop_eliminate 0.67% : 0.000001s : 8: predicate.compare_switch_simplify 0.23% : 0.000000s : 4: predicate.const_output_eliminate 0.80% : 0.000001s : 8: predicate.depend_value_elim 0.77% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.86% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.74% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.33% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 4: predicate.elim_not_effective 0.45% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.02% : 0.000002s : 13: predicate.environ_get_add_eliminate 0.97% : 0.000002s : 13: predicate.environ_get_depend_swap 1.87% : 0.000003s : 21: predicate.environ_get_eliminate 0.99% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.88% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.66% : 0.000003s : 11: predicate.float_depend_g_call 0.71% : 0.000001s : 8: predicate.float_environ_get_switch 1.02% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.78% : 0.000001s : 8: predicate.get_grad_eliminate 0.23% : 0.000000s : 4: predicate.graph_param_transform 0.79% : 0.000001s : 8: predicate.incorporate_call 0.65% : 0.000001s : 8: predicate.incorporate_call_switch 7.20% : 0.000013s : 44: predicate.inline 1.30% : 0.000002s : 8: predicate.inline_without_move 0.39% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.09% : 0.000002s : 8: predicate.less_batch_normalization 1.64% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.09% : 0.000004s : 26: predicate.load_eliminater 1.65% : 0.000003s : 4: predicate.loop_unroll_after_grad 1.43% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.62% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.71% : 0.000001s : 8: predicate.merge_addn 0.71% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.70% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.68% : 0.000001s : 9: predicate.minmaximum_grad 1.88% : 0.000003s : 4: predicate.mutable_eliminate 0.71% : 0.000001s : 4: predicate.opt_reshape 0.39% : 0.000001s : 4: predicate.parallel_virtual_node 1.30% : 0.000002s : 11: predicate.partial_defer_inline 1.20% : 0.000002s : 13: predicate.partial_eliminate 0.75% : 0.000001s : 9: predicate.print_const_string_wrapper 0.71% : 0.000001s : 8: predicate.reduce_all_const_elim 0.90% : 0.000002s : 9: predicate.reduce_eliminate 2.38% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 8: predicate.remove_not_recompute_node 1.10% : 0.000002s : 17: predicate.replace_applicator 0.66% : 0.000001s : 8: predicate.replace_old_param 0.32% : 0.000001s : 4: predicate.reset_defer_inline 0.96% : 0.000002s : 9: predicate.reshape_eliminate 0.76% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.54% : 0.000001s : 4: predicate.row_tensor_eliminate 1.16% : 0.000002s : 8: predicate.same_eliminate 0.47% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.48% : 0.000003s : 8: predicate.shard_identity_eliminate 0.89% : 0.000002s : 8: predicate.special_op_eliminate 0.95% : 0.000002s : 8: predicate.specialize_transform 1.42% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 1.01% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.91% : 0.000002s : 11: predicate.switch_defer_inline 1.73% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.40% : 0.000008s : 39: predicate.switch_simplify 0.75% : 0.000001s : 9: predicate.tile_eliminate 0.75% : 0.000001s : 9: predicate.transpose_eliminate 1.50% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 2.88% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.46% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.39% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.45% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.01% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.88% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.56% : 0.000001s : 4: predicate.value_based_eliminate 0.78% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.76% : 0.000001s : 8: predicate.virtual_output_eliminate 0.33% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.72% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000258 5 7.77% : 0.000020s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.23% : 0.000238s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027043 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.20% : 0.003300s : 1: add_attr 12.14% : 0.003283s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.24% : 0.000064s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000068s : 1: auto_monad 0.12% : 0.000032s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.83% : 0.000495s : 1: bootstrap 0.14% : 0.000037s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.09% : 0.000024s : 1: control_data_broadcast_order 0.05% : 0.000015s : 1: convert_after_rewriter 0.13% : 0.000036s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.09% : 0.000025s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.10% : 0.000026s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000007s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.84% : 0.000499s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.30% : 0.000621s : 1: mutable_eliminate 0.04% : 0.000011s : 1: offloading_packed_experts 0.07% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000020s : 1: opt.transform.mutable_eliminate 3.65% : 0.000988s : 78: opt.transform.opt_a 0.12% : 0.000032s : 1: opt.transform.opt_after_cconv 0.12% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.61% : 0.000164s : 28: opt.transform.opt_b 0.21% : 0.000056s : 2: opt.transform.opt_trans_graph 0.17% : 0.000045s : 4: opt.transform.symbol_engine_opt 11.51% : 0.003112s : 1: opt_a 0.56% : 0.000152s : 1: opt_after_cconv 2.06% : 0.000558s : 1: opt_after_jit_grad 1.37% : 0.000372s : 1: opt_b 22.64% : 0.006123s : 1: optimize 0.10% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.12% : 0.000031s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000007s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000037s : 1: pre_auto_parallel 0.10% : 0.000028s : 1: py_interpret_to_execute 0.08% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000024s : 1: remove_dup_value 1.81% : 0.000490s : 1: renormalize.infer 1.12% : 0.000302s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.21% : 0.000056s : 1: rewriter_after_opt_a 0.21% : 0.000057s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000008s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000113s : 1: symbol_engine_optimizer 0.37% : 0.000101s : 1: tuple_transform 19.23% : 0.005200s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:54.257.405 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0145372, [21] [bootstrap]: 0.00046903 [type_inference]: 0.00527722 [event_method]: 1.315e-05 [auto_monad]: 5.915e-05 [graph_reusing]: 5.06002e-06 [inline]: 2.07999e-06 [add_attr]: 0.00317766, [1] [add_attr_with_inline]: 0.00316898, [1] [Cycle 1]: 5.745e-05, [2] [tag_attr]: 1.587e-05 [meta_addattr_fg_expand]: 3.93001e-06 [parallel-infer-symbol]: 2.93e-06 [pre_auto_parallel]: 2.687e-05 [insert-virtual-dataset]: 2.49001e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 1.89e-06 [pipeline_split]: 1.64998e-06 [optimize]: 0.00477522, [53] [py_interpret_to_execute]: 1.876e-05 [rewriter_before_opt_a]: 4.953e-05 [opt_a]: 0.00254772, [2] [Cycle 1]: 0.00178261, [45] [expand_dump_flag]: 3.36999e-06 [switch_simplify]: 2.795e-05 [loop_unroll]: 1.488e-05 [a_1]: 0.00034914 [with_stream_mark]: 1.644e-05 [recompute_prepare]: 1.126e-05 [updatestate_depend_eliminate]: 4.70001e-06 [updatestate_assign_eliminate]: 3.63e-06 [updatestate_loads_eliminate]: 3.62998e-06 [parameter_eliminate]: 2.40002e-06 [a_2]: 9.815e-05 [accelerated_algorithm]: 8.95999e-06 [shard]: 2.26e-06 [meta_shard_fg_expand]: 1.94e-06 [shard_inline]: 8.62e-06 [merge_send_recv]: 9.77001e-06 [auto_parallel]: 7.07002e-06 [parallel]: 1.974e-05 [flash_sp]: 8.04002e-06 [merge_comm]: 5.09e-06 [allreduce_fusion]: 4.00998e-06 [matmul_add_comm_reduction]: 1.003e-05 [allreduce_slice_to_reducescatter]: 9.50007e-07 [virtual_shard_identity]: 1.051e-05 [virtual_dataset]: 7.05002e-06 [get_grad_eliminate_]: 7.30998e-06 [virtual_output]: 7.15e-06 [merge_forward]: 5.02e-06 [cell_reuse_recompute_pass]: 1.19e-06 [offload_activation]: 1.077e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.638e-05 [merge_recompute_call_nodes]: 1.38002e-06 [before_grad]: 1.398e-05 [set_forward_comm_id_for_comm_node_pass]: 4.60999e-06 [meta_fg_expand]: 3.08998e-06 [flash_sp_send_recv_attached]: 2.51e-06 [receive_attached]: 1.99e-06 [after_resolve]: 1.131e-05 [a_after_grad]: 1.13e-05 [renormalize]: 0.00067189 [add_forward_monad_depend]: 5.27999e-06 [auto_monad_grad]: 2.35002e-06 [auto_monad_eliminator]: 1.728e-05 [cse]: 3.842e-05 [a_3]: 5.654e-05 [Cycle 2]: 0.00075438, [45] [expand_dump_flag]: 1.00999e-06 [switch_simplify]: 8.33001e-06 [loop_unroll]: 7.1e-06 [a_1]: 0.00015257 [with_stream_mark]: 1.635e-05 [recompute_prepare]: 7.75e-06 [updatestate_depend_eliminate]: 4.80999e-06 [updatestate_assign_eliminate]: 2.94999e-06 [updatestate_loads_eliminate]: 2.93e-06 [parameter_eliminate]: 1.32e-06 [a_2]: 8.684e-05 [accelerated_algorithm]: 7.47002e-06 [shard]: 1.67001e-06 [meta_shard_fg_expand]: 1.89999e-06 [shard_inline]: 7.62002e-06 [merge_send_recv]: 7.03e-06 [auto_parallel]: 7.08998e-06 [parallel]: 5.60001e-06 [flash_sp]: 3.89002e-06 [merge_comm]: 3.95e-06 [allreduce_fusion]: 3.94997e-06 [matmul_add_comm_reduction]: 7.33e-06 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 9.52001e-06 [virtual_dataset]: 7.15e-06 [get_grad_eliminate_]: 6.88e-06 [virtual_output]: 6.39999e-06 [merge_forward]: 3.97998e-06 [cell_reuse_recompute_pass]: 1.54e-06 [offload_activation]: 8.14997e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.464e-05 [merge_recompute_call_nodes]: 9.50007e-07 [before_grad]: 1.226e-05 [set_forward_comm_id_for_comm_node_pass]: 5.17999e-06 [meta_fg_expand]: 3.45e-06 [flash_sp_send_recv_attached]: 1.26997e-06 [receive_attached]: 9.70002e-07 [after_resolve]: 1.116e-05 [a_after_grad]: 1.042e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.84999e-06 [auto_monad_grad]: 1.10001e-06 [auto_monad_eliminator]: 1.15e-05 [cse]: 2.379e-05 [a_3]: 4.423e-05 [py_interpret_to_execute_after_opt_a]: 1.121e-05 [slice_cell_reuse_recomputed_activation]: 2.25002e-06 [rewriter_after_opt_a]: 4.326e-05 [convert_after_rewriter]: 7.33999e-06 [order_py_execute_after_rewriter]: 5.82001e-06 [mutable_eliminate]: 0.00052961 [opt_b]: 0.00030698, [1] [Cycle 1]: 0.0003, [7] [b_1]: 0.00020735 [b_2]: 9.94999e-06 [updatestate_depend_eliminate]: 7.46001e-06 [updatestate_assign_eliminate]: 2.94001e-06 [updatestate_loads_eliminate]: 3.01001e-06 [renormalize]: 5.79981e-07 [cse]: 2.866e-05 [optimize_parallel_all_gather_comm]: 1.881e-05 [overlap_param_gather]: 2.01998e-06 [cconv]: 2.694e-05 [loop_unroll]: 0.00046616 [opt_after_cconv]: 0.00012287, [1] [Cycle 1]: 0.00011693, [7] [c_1]: 3.504e-05 [parameter_eliminate]: 3.7e-06 [updatestate_depend_eliminate]: 8.35999e-06 [updatestate_assign_eliminate]: 3.25998e-06 [updatestate_loads_eliminate]: 2.96999e-06 [cse]: 2.878e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 1.727e-05 [tuple_transform]: 8.098e-05, [1] [Cycle 1]: 7.654e-05, [4] [d_1]: 4.748e-05 [none_parameter_eliminate]: 1.78002e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 7.58001e-06 [partial_unused_args_eliminate]: 2.10002e-06 [add_recomputation]: 5.686e-05 [cse_after_recomputation]: 2.788e-05, [1] [Cycle 1]: 2.275e-05, [1] [cse]: 1.716e-05 [environ_conv]: 6.69999e-06 [swap_dp_allreduce_reducescatter]: 6.14999e-06 [bias_add_comm_swap]: 2.59001e-06 [label_micro_interleaved_index]: 4.02e-06 [label_fine_grained_interleaved_index]: 2.68e-06 [merge_cast_opt]: 1.24e-06 [slice_recompute_activation]: 2.04e-06 [micro_interleaved_order_control]: 2.41e-06 [assign_add_opt]: 1.24e-06 [ForceFp32Comm]: 1.17e-06 [remove_cast_before_assign_add]: 1.27e-06 [full_micro_interleaved_order_control]: 2.24999e-06 [reorder_send_recv_between_fp_bp]: 2.71999e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.23002e-06 [overlap_opt_shard_in_pipeline]: 1.14003e-06 [overlap_opt_shard_grad_in_pipeline]: 2.04e-06 [control_data_broadcast_order]: 1.556e-05 [grouped_pairwise_exchange_alltoall]: 2.01e-06 [offloading_packed_experts]: 5.00001e-06 [overlap_recompute_and_grad_model_parallel]: 5.42999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.47001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.33002e-06 [overlap_recompute_comm]: 2.56e-06 [overlap_grad_ring_attention]: 5.00001e-06 [overlap_grad_flash_sp]: 2.214e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 2.09e-06 [split_layernorm_comm]: 2.07001e-06 [handle_group_info]: 1.15001e-06 [symbol_engine_optimizer]: 9.381e-05, [1] [Cycle 1]: 8.884e-05, [6] [build]: 3.79002e-06 [elim_shapecalc]: 1.346e-05 [elim_not_effective]: 1.66e-05 [opt_reshape]: 8.81002e-06 [fold_const_symbol]: 1.209e-05 [renormalize]: 2.20025e-07 [detach_backward]: 2.06e-06 [pipeline_parallel_scheduler]: 1.77999e-06 [auto_monad_reorder]: 2.228e-05 [get_jit_bprop_graph]: 1.55001e-06 [rewriter_after_jit_bprop_graph]: 4.48001e-06 [opt_after_jit_grad]: 0.00048862 [validate]: 4.438e-05 Sums bootstrap : 0.000469s : 4.53% type_inference : 0.005277s : 50.97% event_method : 0.000013s : 0.13% auto_monad : 0.000059s : 0.57% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000027s : 0.26% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000019s : 0.18% optimize.rewriter_before_opt_a : 0.000050s : 0.48% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000036s : 0.35% optimize.opt_a.loop_unroll : 0.000022s : 0.21% optimize.opt_a.a_1 : 0.000502s : 4.85% optimize.opt_a.with_stream_mark : 0.000033s : 0.32% optimize.opt_a.recompute_prepare : 0.000019s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000185s : 1.79% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.16% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.16% optimize.opt_a.merge_send_recv : 0.000017s : 0.16% optimize.opt_a.auto_parallel : 0.000014s : 0.14% optimize.opt_a.parallel : 0.000025s : 0.24% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.02% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.19% optimize.opt_a.virtual_dataset : 0.000014s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.14% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.09% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000022s : 0.22% optimize.opt_a.a_after_grad : 0.000022s : 0.21% optimize.opt_a.renormalize : 0.000672s : 6.49% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.28% optimize.opt_a.cse : 0.000062s : 0.60% optimize.opt_a.a_3 : 0.000101s : 0.97% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000043s : 0.42% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000530s : 5.12% optimize.opt_b.b_1 : 0.000207s : 2.00% optimize.opt_b.b_2 : 0.000010s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000029s : 0.28% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000027s : 0.26% optimize.loop_unroll : 0.000466s : 4.50% optimize.opt_after_cconv.c_1 : 0.000035s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000029s : 0.28% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.17% optimize.tuple_transform.d_1 : 0.000047s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000057s : 0.55% optimize.cse_after_recomputation.cse : 0.000017s : 0.17% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000022s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.13% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.16% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.09% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000022s : 0.22% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000489s : 4.72% validate : 0.000044s : 0.43% Time group info: ------[substitution.] 0.000175 29 12.24% : 0.000021s : 2: substitution.cast_eliminate 1.29% : 0.000002s : 3: substitution.elim_not_effective 0.91% : 0.000002s : 3: substitution.fold_const_symbol 3.59% : 0.000006s : 4: substitution.graph_param_transform 57.17% : 0.000100s : 2: substitution.inline 3.08% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.29% : 0.000006s : 6: substitution.remove_not_recompute_node 2.00% : 0.000004s : 2: substitution.replace_old_param 16.43% : 0.000029s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005223 2 91.06% : 0.004756s : 1: type_inference.infer 8.94% : 0.000467s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000098 2 100.00% : 0.000098s : 2: match.inline ------[predicate.] 0.000168 980 0.93% : 0.000002s : 9: predicate.accumulaten_eliminater 1.21% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.64% : 0.000001s : 8: predicate.addn_check_dump 0.86% : 0.000001s : 9: predicate.addn_zero_filter 0.67% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.12% : 0.000004s : 17: predicate.arithmetic_simplify 0.96% : 0.000002s : 9: predicate.cast_eliminate 0.72% : 0.000001s : 8: predicate.check_bprop_eliminate 0.65% : 0.000001s : 8: predicate.compare_switch_simplify 0.24% : 0.000000s : 4: predicate.const_output_eliminate 0.74% : 0.000001s : 8: predicate.depend_value_elim 0.81% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.93% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.80% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.31% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 4: predicate.elim_not_effective 0.50% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.21% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.04% : 0.000002s : 13: predicate.environ_get_depend_swap 1.79% : 0.000003s : 21: predicate.environ_get_eliminate 1.07% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.92% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.63% : 0.000003s : 11: predicate.float_depend_g_call 0.63% : 0.000001s : 8: predicate.float_environ_get_switch 0.97% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 4: predicate.fold_const_symbol 0.78% : 0.000001s : 8: predicate.get_grad_eliminate 0.37% : 0.000001s : 4: predicate.graph_param_transform 0.80% : 0.000001s : 8: predicate.incorporate_call 0.66% : 0.000001s : 8: predicate.incorporate_call_switch 6.43% : 0.000011s : 44: predicate.inline 1.11% : 0.000002s : 8: predicate.inline_without_move 0.39% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.14% : 0.000002s : 8: predicate.less_batch_normalization 1.59% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.27% : 0.000004s : 26: predicate.load_eliminater 1.25% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.57% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.76% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.74% : 0.000001s : 8: predicate.merge_addn 0.69% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.72% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.71% : 0.000001s : 9: predicate.minmaximum_grad 1.57% : 0.000003s : 4: predicate.mutable_eliminate 0.58% : 0.000001s : 4: predicate.opt_reshape 0.52% : 0.000001s : 4: predicate.parallel_virtual_node 1.17% : 0.000002s : 11: predicate.partial_defer_inline 1.31% : 0.000002s : 13: predicate.partial_eliminate 0.86% : 0.000001s : 9: predicate.print_const_string_wrapper 0.70% : 0.000001s : 8: predicate.reduce_all_const_elim 1.08% : 0.000002s : 9: predicate.reduce_eliminate 2.21% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 8: predicate.remove_not_recompute_node 1.14% : 0.000002s : 17: predicate.replace_applicator 0.81% : 0.000001s : 8: predicate.replace_old_param 0.35% : 0.000001s : 4: predicate.reset_defer_inline 0.86% : 0.000001s : 9: predicate.reshape_eliminate 0.76% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.53% : 0.000001s : 4: predicate.row_tensor_eliminate 0.86% : 0.000001s : 8: predicate.same_eliminate 0.63% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.98% : 0.000002s : 8: predicate.shard_identity_eliminate 0.91% : 0.000002s : 8: predicate.special_op_eliminate 0.95% : 0.000002s : 8: predicate.specialize_transform 1.05% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.92% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.94% : 0.000002s : 11: predicate.switch_defer_inline 1.72% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.18% : 0.000007s : 39: predicate.switch_simplify 0.81% : 0.000001s : 9: predicate.tile_eliminate 0.82% : 0.000001s : 9: predicate.transpose_eliminate 1.66% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.53% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.67% : 0.000005s : 25: predicate.tuple_list_set_item_eliminator 1.50% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.10% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 3.02% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.70% : 0.000001s : 4: predicate.value_based_eliminate 0.84% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.80% : 0.000001s : 8: predicate.virtual_output_eliminate 0.34% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.54% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000228 5 7.94% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.06% : 0.000210s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024374 192 0.02% : 0.000004s : 1: ForceFp32Comm 13.06% : 0.003183s : 1: add_attr 13.02% : 0.003173s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.25% : 0.000061s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000064s : 1: auto_monad 0.11% : 0.000026s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 2.06% : 0.000502s : 1: bootstrap 0.13% : 0.000031s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.13% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.08% : 0.000021s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.95% : 0.000475s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.21% : 0.000540s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.08% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000019s : 1: opt.transform.mutable_eliminate 3.95% : 0.000962s : 78: opt.transform.opt_a 0.14% : 0.000033s : 1: opt.transform.opt_after_cconv 0.12% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.75% : 0.000182s : 28: opt.transform.opt_b 0.22% : 0.000053s : 2: opt.transform.opt_trans_graph 0.19% : 0.000046s : 4: opt.transform.symbol_engine_opt 10.47% : 0.002551s : 1: opt_a 0.52% : 0.000126s : 1: opt_after_cconv 2.05% : 0.000499s : 1: opt_after_jit_grad 1.28% : 0.000311s : 1: opt_b 19.61% : 0.004780s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.11% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000031s : 1: pre_auto_parallel 0.09% : 0.000022s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000021s : 1: remove_dup_value 1.69% : 0.000411s : 1: renormalize.infer 1.04% : 0.000253s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000049s : 1: rewriter_after_opt_a 0.22% : 0.000054s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000097s : 1: symbol_engine_optimizer 0.34% : 0.000084s : 1: tuple_transform 21.72% : 0.005295s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:54.460.563 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:54.460.833 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0151393, [21] [bootstrap]: 0.00045408 [type_inference]: 0.00501433 [event_method]: 1.295e-05 [auto_monad]: 5.768e-05 [graph_reusing]: 5.32999e-06 [inline]: 2.49999e-06 [add_attr]: 0.00324161, [1] [add_attr_with_inline]: 0.00323231, [1] [Cycle 1]: 7.077e-05, [2] [tag_attr]: 1.453e-05 [meta_addattr_fg_expand]: 3.86001e-06 [parallel-infer-symbol]: 3.22002e-06 [pre_auto_parallel]: 2.805e-05 [insert-virtual-dataset]: 2.48e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 2.48e-06 [pipeline_split]: 1.74998e-06 [optimize]: 0.00513717, [53] [py_interpret_to_execute]: 2.36e-05 [rewriter_before_opt_a]: 4.921e-05 [opt_a]: 0.00267779, [2] [Cycle 1]: 0.00178716, [45] [expand_dump_flag]: 3.36001e-06 [switch_simplify]: 2.633e-05 [loop_unroll]: 1.421e-05 [a_1]: 0.00029421 [with_stream_mark]: 1.918e-05 [recompute_prepare]: 8.96998e-06 [updatestate_depend_eliminate]: 4.51002e-06 [updatestate_assign_eliminate]: 3.37997e-06 [updatestate_loads_eliminate]: 3.04999e-06 [parameter_eliminate]: 2.25002e-06 [a_2]: 0.00011122 [accelerated_algorithm]: 7.13e-06 [shard]: 2.31998e-06 [meta_shard_fg_expand]: 1.89999e-06 [shard_inline]: 6.12999e-06 [merge_send_recv]: 8.37e-06 [auto_parallel]: 6.88e-06 [parallel]: 1.789e-05 [flash_sp]: 9.56e-06 [merge_comm]: 4.57e-06 [allreduce_fusion]: 3.58e-06 [matmul_add_comm_reduction]: 9.71998e-06 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 9.52999e-06 [virtual_dataset]: 6.17001e-06 [get_grad_eliminate_]: 5.89e-06 [virtual_output]: 6.83e-06 [merge_forward]: 4.52e-06 [cell_reuse_recompute_pass]: 1.54998e-06 [offload_activation]: 1.179e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.776e-05 [merge_recompute_call_nodes]: 1.62999e-06 [before_grad]: 1.142e-05 [set_forward_comm_id_for_comm_node_pass]: 4.32998e-06 [meta_fg_expand]: 2.89001e-06 [flash_sp_send_recv_attached]: 2.68998e-06 [receive_attached]: 2.34001e-06 [after_resolve]: 1.023e-05 [a_after_grad]: 9.47001e-06 [renormalize]: 0.00058153 [add_forward_monad_depend]: 6.11998e-06 [auto_monad_grad]: 2.26e-06 [auto_monad_eliminator]: 1.741e-05 [cse]: 2.978e-05 [a_3]: 6.321e-05 [Cycle 2]: 0.00087659, [45] [expand_dump_flag]: 1.51002e-06 [switch_simplify]: 7.81001e-06 [loop_unroll]: 6.07999e-06 [a_1]: 0.00010689 [with_stream_mark]: 1.385e-05 [recompute_prepare]: 7.95e-06 [updatestate_depend_eliminate]: 3.73001e-06 [updatestate_assign_eliminate]: 2.81e-06 [updatestate_loads_eliminate]: 2.79999e-06 [parameter_eliminate]: 1.66002e-06 [a_2]: 0.00012057 [accelerated_algorithm]: 7.23e-06 [shard]: 2.14999e-06 [meta_shard_fg_expand]: 1.50999e-06 [shard_inline]: 7.03998e-06 [merge_send_recv]: 7.43999e-06 [auto_parallel]: 6.66999e-06 [parallel]: 6.53e-06 [flash_sp]: 4.00998e-06 [merge_comm]: 3.31999e-06 [allreduce_fusion]: 3.71001e-06 [matmul_add_comm_reduction]: 8.08001e-06 [allreduce_slice_to_reducescatter]: 5.00004e-07 [virtual_shard_identity]: 7.95e-06 [virtual_dataset]: 6.00002e-06 [get_grad_eliminate_]: 5.62001e-06 [virtual_output]: 5.52001e-06 [merge_forward]: 3.33e-06 [cell_reuse_recompute_pass]: 1.89999e-06 [offload_activation]: 8.37e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.651e-05 [merge_recompute_call_nodes]: 1.16002e-06 [before_grad]: 1.004e-05 [set_forward_comm_id_for_comm_node_pass]: 4.57e-06 [meta_fg_expand]: 2.16998e-06 [flash_sp_send_recv_attached]: 1.32999e-06 [receive_attached]: 1.10001e-06 [after_resolve]: 1.014e-05 [a_after_grad]: 8.80999e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.79999e-06 [auto_monad_grad]: 1.70001e-06 [auto_monad_eliminator]: 1.005e-05 [cse]: 1.894e-05 [a_3]: 5.021e-05 [py_interpret_to_execute_after_opt_a]: 1.398e-05 [slice_cell_reuse_recomputed_activation]: 5.11002e-06 [rewriter_after_opt_a]: 4.427e-05 [convert_after_rewriter]: 1.017e-05 [order_py_execute_after_rewriter]: 8.85001e-06 [mutable_eliminate]: 0.00059101 [opt_b]: 0.00030459, [1] [Cycle 1]: 0.00029379, [7] [b_1]: 0.00018408 [b_2]: 9.15001e-06 [updatestate_depend_eliminate]: 6.96001e-06 [updatestate_assign_eliminate]: 2.57001e-06 [updatestate_loads_eliminate]: 2.49001e-06 [renormalize]: 1.57999e-06 [cse]: 2.58e-05 [optimize_parallel_all_gather_comm]: 2.14e-05 [overlap_param_gather]: 5.24e-06 [cconv]: 3.328e-05 [loop_unroll]: 0.00047821 [opt_after_cconv]: 0.00013676, [1] [Cycle 1]: 0.0001272, [7] [c_1]: 2.888e-05 [parameter_eliminate]: 4.33001e-06 [updatestate_depend_eliminate]: 6.86001e-06 [updatestate_assign_eliminate]: 2.56e-06 [updatestate_loads_eliminate]: 2.83e-06 [cse]: 2.301e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.737e-05 [tuple_transform]: 9.148e-05, [1] [Cycle 1]: 8.453e-05, [4] [d_1]: 4.293e-05 [none_parameter_eliminate]: 2.12999e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 7.2e-06 [partial_unused_args_eliminate]: 5.42999e-06 [add_recomputation]: 5.109e-05 [cse_after_recomputation]: 3.201e-05, [1] [Cycle 1]: 2.547e-05, [1] [cse]: 1.41e-05 [environ_conv]: 9.82999e-06 [swap_dp_allreduce_reducescatter]: 9.06998e-06 [bias_add_comm_swap]: 5.40001e-06 [label_micro_interleaved_index]: 7.15e-06 [label_fine_grained_interleaved_index]: 6.01e-06 [merge_cast_opt]: 4.17e-06 [slice_recompute_activation]: 4.97e-06 [micro_interleaved_order_control]: 4.57998e-06 [assign_add_opt]: 3.68e-06 [ForceFp32Comm]: 3.28e-06 [remove_cast_before_assign_add]: 3.65e-06 [full_micro_interleaved_order_control]: 4.87e-06 [reorder_send_recv_between_fp_bp]: 6.04001e-06 [comm_op_add_attrs]: 3.58e-06 [add_comm_op_reuse_tag]: 3.42002e-06 [interleave_split_concat_branches]: 3.85e-06 [interleave_parallel_branches]: 3.41001e-06 [overlap_opt_shard_in_pipeline]: 3.98001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.87e-06 [control_data_broadcast_order]: 1.97e-05 [grouped_pairwise_exchange_alltoall]: 4.35e-06 [offloading_packed_experts]: 7.21001e-06 [overlap_recompute_and_grad_model_parallel]: 8.04002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.5e-06 [overlap_recompute_allgather_and_fa_grad]: 3.72002e-06 [overlap_recompute_comm]: 5.34998e-06 [overlap_grad_ring_attention]: 6.85002e-06 [overlap_grad_flash_sp]: 2.453e-05 [begin_end_overlap_inline]: 2.89999e-06 [split_matmul_comm_elemetwise]: 5.57001e-06 [split_layernorm_comm]: 4.69002e-06 [handle_group_info]: 3.78001e-06 [symbol_engine_optimizer]: 0.00011274, [1] [Cycle 1]: 0.00010433, [6] [build]: 3.70998e-06 [elim_shapecalc]: 1.411e-05 [elim_not_effective]: 1.543e-05 [opt_reshape]: 7.70998e-06 [fold_const_symbol]: 9.81e-06 [renormalize]: 3.19997e-07 [detach_backward]: 3.82002e-06 [pipeline_parallel_scheduler]: 1.84998e-06 [auto_monad_reorder]: 2.012e-05 [get_jit_bprop_graph]: 2.00002e-06 [rewriter_after_jit_bprop_graph]: 5.81e-06 [opt_after_jit_grad]: 0.00052307 [validate]: 4.026e-05 Sums bootstrap : 0.000454s : 4.51% type_inference : 0.005014s : 49.82% event_method : 0.000013s : 0.13% auto_monad : 0.000058s : 0.57% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000028s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000024s : 0.23% optimize.rewriter_before_opt_a : 0.000049s : 0.49% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000034s : 0.34% optimize.opt_a.loop_unroll : 0.000020s : 0.20% optimize.opt_a.a_1 : 0.000401s : 3.98% optimize.opt_a.with_stream_mark : 0.000033s : 0.33% optimize.opt_a.recompute_prepare : 0.000017s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000232s : 2.30% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.14% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000016s : 0.16% optimize.opt_a.auto_parallel : 0.000014s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.24% optimize.opt_a.flash_sp : 0.000014s : 0.13% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.17% optimize.opt_a.virtual_dataset : 0.000012s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.20% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.34% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000021s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000020s : 0.20% optimize.opt_a.a_after_grad : 0.000018s : 0.18% optimize.opt_a.renormalize : 0.000582s : 5.78% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.09% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.27% optimize.opt_a.cse : 0.000049s : 0.48% optimize.opt_a.a_3 : 0.000113s : 1.13% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000044s : 0.44% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000591s : 5.87% optimize.opt_b.b_1 : 0.000184s : 1.83% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000002s : 0.02% optimize.opt_b.cse : 0.000026s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000033s : 0.33% optimize.loop_unroll : 0.000478s : 4.75% optimize.opt_after_cconv.c_1 : 0.000029s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.17% optimize.tuple_transform.d_1 : 0.000043s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000051s : 0.51% optimize.cse_after_recomputation.cse : 0.000014s : 0.14% optimize.environ_conv : 0.000010s : 0.10% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.09% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.06% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.06% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.05% optimize.control_data_broadcast_order : 0.000020s : 0.20% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000025s : 0.24% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000006s : 0.06% optimize.split_layernorm_comm : 0.000005s : 0.05% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.14% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.06% opt_after_jit_grad : 0.000523s : 5.20% validate : 0.000040s : 0.40% Time group info: ------[substitution.] 0.000128 20 1.41% : 0.000002s : 2: substitution.elim_not_effective 1.03% : 0.000001s : 2: substitution.fold_const_symbol 4.61% : 0.000006s : 3: substitution.graph_param_transform 74.71% : 0.000095s : 2: substitution.inline 3.24% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.61% : 0.000005s : 4: substitution.remove_not_recompute_node 2.94% : 0.000004s : 2: substitution.replace_old_param 8.45% : 0.000011s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004962 2 90.95% : 0.004513s : 1: type_inference.infer 9.05% : 0.000449s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000093 2 100.00% : 0.000093s : 2: match.inline ------[predicate.] 0.000139 754 0.78% : 0.000001s : 7: predicate.accumulaten_eliminater 1.12% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.64% : 0.000001s : 6: predicate.addn_check_dump 0.75% : 0.000001s : 7: predicate.addn_zero_filter 0.73% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.05% : 0.000003s : 13: predicate.arithmetic_simplify 0.77% : 0.000001s : 7: predicate.cast_eliminate 0.82% : 0.000001s : 6: predicate.check_bprop_eliminate 0.67% : 0.000001s : 6: predicate.compare_switch_simplify 0.20% : 0.000000s : 3: predicate.const_output_eliminate 0.73% : 0.000001s : 6: predicate.depend_value_elim 0.75% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.89% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.79% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.15% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.47% : 0.000001s : 3: predicate.elim_not_effective 0.69% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.04% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.95% : 0.000001s : 10: predicate.environ_get_depend_swap 1.92% : 0.000003s : 16: predicate.environ_get_eliminate 0.95% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.91% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.76% : 0.000002s : 9: predicate.float_depend_g_call 0.65% : 0.000001s : 6: predicate.float_environ_get_switch 0.94% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 3: predicate.fold_const_symbol 0.74% : 0.000001s : 6: predicate.get_grad_eliminate 0.37% : 0.000001s : 3: predicate.graph_param_transform 0.74% : 0.000001s : 6: predicate.incorporate_call 0.65% : 0.000001s : 6: predicate.incorporate_call_switch 6.88% : 0.000010s : 34: predicate.inline 1.27% : 0.000002s : 6: predicate.inline_without_move 0.44% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.20% : 0.000002s : 6: predicate.less_batch_normalization 1.51% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.97% : 0.000003s : 20: predicate.load_eliminater 1.60% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.59% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.74% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.69% : 0.000001s : 6: predicate.merge_addn 0.66% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.67% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 7: predicate.minmaximum_grad 1.84% : 0.000003s : 3: predicate.mutable_eliminate 0.57% : 0.000001s : 3: predicate.opt_reshape 0.58% : 0.000001s : 3: predicate.parallel_virtual_node 1.33% : 0.000002s : 9: predicate.partial_defer_inline 1.14% : 0.000002s : 10: predicate.partial_eliminate 0.70% : 0.000001s : 7: predicate.print_const_string_wrapper 0.81% : 0.000001s : 6: predicate.reduce_all_const_elim 1.00% : 0.000001s : 7: predicate.reduce_eliminate 2.04% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.76% : 0.000001s : 6: predicate.remove_not_recompute_node 1.24% : 0.000002s : 13: predicate.replace_applicator 0.98% : 0.000001s : 6: predicate.replace_old_param 0.47% : 0.000001s : 3: predicate.reset_defer_inline 0.80% : 0.000001s : 7: predicate.reshape_eliminate 0.82% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.60% : 0.000001s : 3: predicate.row_tensor_eliminate 0.90% : 0.000001s : 6: predicate.same_eliminate 0.67% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.31% : 0.000002s : 6: predicate.shard_identity_eliminate 0.83% : 0.000001s : 6: predicate.special_op_eliminate 1.22% : 0.000002s : 6: predicate.specialize_transform 1.08% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.96% : 0.000001s : 9: predicate.switch_defer_inline 1.61% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.44% : 0.000006s : 32: predicate.switch_simplify 0.81% : 0.000001s : 7: predicate.tile_eliminate 0.85% : 0.000001s : 7: predicate.transpose_eliminate 1.46% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.36% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.36% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.46% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.46% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.94% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.79% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.70% : 0.000001s : 3: predicate.value_based_eliminate 0.69% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.72% : 0.000001s : 6: predicate.virtual_output_eliminate 0.31% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.50% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000236 5 8.60% : 0.000020s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.40% : 0.000216s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025062 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.97% : 0.003251s : 1: add_attr 12.91% : 0.003236s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000055s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.27% : 0.000067s : 1: auto_monad 0.12% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 2.00% : 0.000500s : 1: bootstrap 0.15% : 0.000037s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.09% : 0.000023s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.14% : 0.000036s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000021s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.09% : 0.000024s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000011s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000007s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000007s : 1: interleave_split_concat_branches 0.04% : 0.000009s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.94% : 0.000486s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.39% : 0.000600s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.07% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 3.23% : 0.000809s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.11% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.46% : 0.000115s : 28: opt.transform.opt_b 0.19% : 0.000047s : 2: opt.transform.opt_trans_graph 0.17% : 0.000041s : 4: opt.transform.symbol_engine_opt 10.70% : 0.002682s : 1: opt_a 0.56% : 0.000140s : 1: opt_after_cconv 2.14% : 0.000535s : 1: opt_after_jit_grad 1.23% : 0.000309s : 1: opt_b 21.81% : 0.005467s : 1: optimize 0.10% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000036s : 1: pre_auto_parallel 0.11% : 0.000027s : 1: py_interpret_to_execute 0.07% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000021s : 1: remove_dup_value 1.29% : 0.000324s : 1: renormalize.infer 0.99% : 0.000248s : 1: renormalize.specialize 0.04% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.05% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000048s : 1: rewriter_after_opt_a 0.21% : 0.000053s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000008s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.05% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.46% : 0.000116s : 1: symbol_engine_optimizer 0.38% : 0.000095s : 1: tuple_transform 20.14% : 0.005047s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:54.664.410 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0133041, [21] [bootstrap]: 0.00043083 [type_inference]: 0.00468422 [event_method]: 1.208e-05 [auto_monad]: 5.42e-05 [graph_reusing]: 4.94e-06 [inline]: 2.26e-06 [add_attr]: 0.00315774, [1] [add_attr_with_inline]: 0.00314933, [1] [Cycle 1]: 5.385e-05, [2] [tag_attr]: 1.427e-05 [meta_addattr_fg_expand]: 3.65e-06 [parallel-infer-symbol]: 3.32002e-06 [pre_auto_parallel]: 2.61e-05 [insert-virtual-dataset]: 2.68e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 2.42001e-06 [pipeline_split]: 1.48002e-06 [optimize]: 0.00418892, [53] [py_interpret_to_execute]: 1.707e-05 [rewriter_before_opt_a]: 4.483e-05 [opt_a]: 0.00213167, [2] [Cycle 1]: 0.00147077, [45] [expand_dump_flag]: 2.79001e-06 [switch_simplify]: 2.592e-05 [loop_unroll]: 1.374e-05 [a_1]: 0.00028085 [with_stream_mark]: 1.615e-05 [recompute_prepare]: 7.58999e-06 [updatestate_depend_eliminate]: 3.56999e-06 [updatestate_assign_eliminate]: 3.58e-06 [updatestate_loads_eliminate]: 2.86999e-06 [parameter_eliminate]: 1.79e-06 [a_2]: 7.768e-05 [accelerated_algorithm]: 6.83e-06 [shard]: 2.01e-06 [meta_shard_fg_expand]: 1.62999e-06 [shard_inline]: 6.16e-06 [merge_send_recv]: 7.85998e-06 [auto_parallel]: 5.96e-06 [parallel]: 1.842e-05 [flash_sp]: 7.91001e-06 [merge_comm]: 3.85e-06 [allreduce_fusion]: 3.65e-06 [matmul_add_comm_reduction]: 9.77001e-06 [allreduce_slice_to_reducescatter]: 5.60016e-07 [virtual_shard_identity]: 8.3e-06 [virtual_dataset]: 5.92999e-06 [get_grad_eliminate_]: 5.75001e-06 [virtual_output]: 5.74999e-06 [merge_forward]: 4.15e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 1.021e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.399e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.019e-05 [set_forward_comm_id_for_comm_node_pass]: 3.88001e-06 [meta_fg_expand]: 2.53e-06 [flash_sp_send_recv_attached]: 2.76999e-06 [receive_attached]: 2.66999e-06 [after_resolve]: 9.03002e-06 [a_after_grad]: 8.94998e-06 [renormalize]: 0.00051734 [add_forward_monad_depend]: 5.78997e-06 [auto_monad_grad]: 2.36e-06 [auto_monad_eliminator]: 1.597e-05 [cse]: 3.283e-05 [a_3]: 4.738e-05 [Cycle 2]: 0.0006503, [45] [expand_dump_flag]: 1.09e-06 [switch_simplify]: 7.43e-06 [loop_unroll]: 5.54e-06 [a_1]: 0.0001053 [with_stream_mark]: 1.408e-05 [recompute_prepare]: 5.76e-06 [updatestate_depend_eliminate]: 3.06001e-06 [updatestate_assign_eliminate]: 2.63e-06 [updatestate_loads_eliminate]: 3.28e-06 [parameter_eliminate]: 1.58002e-06 [a_2]: 6.949e-05 [accelerated_algorithm]: 6.16998e-06 [shard]: 1.19e-06 [meta_shard_fg_expand]: 1.68002e-06 [shard_inline]: 6.77002e-06 [merge_send_recv]: 7.3e-06 [auto_parallel]: 5.71e-06 [parallel]: 6.07999e-06 [flash_sp]: 3.43e-06 [merge_comm]: 3.41999e-06 [allreduce_fusion]: 3.46001e-06 [matmul_add_comm_reduction]: 8.01001e-06 [allreduce_slice_to_reducescatter]: 5.09986e-07 [virtual_shard_identity]: 9.45001e-06 [virtual_dataset]: 6.20002e-06 [get_grad_eliminate_]: 5.46998e-06 [virtual_output]: 5.10999e-06 [merge_forward]: 3.61999e-06 [cell_reuse_recompute_pass]: 1.74e-06 [offload_activation]: 7.04001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.583e-05 [merge_recompute_call_nodes]: 1.25001e-06 [before_grad]: 9.87001e-06 [set_forward_comm_id_for_comm_node_pass]: 5.19003e-06 [meta_fg_expand]: 2.15002e-06 [flash_sp_send_recv_attached]: 8.79983e-07 [receive_attached]: 1.04e-06 [after_resolve]: 9.05999e-06 [a_after_grad]: 8.50001e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 2.58e-06 [auto_monad_grad]: 1.35001e-06 [auto_monad_eliminator]: 8.97999e-06 [cse]: 1.673e-05 [a_3]: 3.655e-05 [py_interpret_to_execute_after_opt_a]: 9.57999e-06 [slice_cell_reuse_recomputed_activation]: 2.17001e-06 [rewriter_after_opt_a]: 3.571e-05 [convert_after_rewriter]: 7.03e-06 [order_py_execute_after_rewriter]: 5.72999e-06 [mutable_eliminate]: 0.00050802 [opt_b]: 0.00022204, [1] [Cycle 1]: 0.00021593, [7] [b_1]: 0.00013355 [b_2]: 9.39e-06 [updatestate_depend_eliminate]: 6.88998e-06 [updatestate_assign_eliminate]: 2.63e-06 [updatestate_loads_eliminate]: 2.43e-06 [renormalize]: 7.2e-07 [cse]: 2.256e-05 [optimize_parallel_all_gather_comm]: 1.687e-05 [overlap_param_gather]: 2.08002e-06 [cconv]: 2.908e-05 [loop_unroll]: 0.0004656 [opt_after_cconv]: 0.0001105, [1] [Cycle 1]: 0.0001044, [7] [c_1]: 2.794e-05 [parameter_eliminate]: 3.35e-06 [updatestate_depend_eliminate]: 6.61999e-06 [updatestate_assign_eliminate]: 2.59001e-06 [updatestate_loads_eliminate]: 2.91999e-06 [cse]: 2.422e-05 [renormalize]: 6.89994e-07 [remove_dup_value]: 1.494e-05 [tuple_transform]: 7.436e-05, [1] [Cycle 1]: 6.925e-05, [4] [d_1]: 4.034e-05 [none_parameter_eliminate]: 1.75001e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 6.98e-06 [partial_unused_args_eliminate]: 1.92001e-06 [add_recomputation]: 5.013e-05 [cse_after_recomputation]: 2.329e-05, [1] [Cycle 1]: 1.858e-05, [1] [cse]: 1.222e-05 [environ_conv]: 5.59e-06 [swap_dp_allreduce_reducescatter]: 5.42999e-06 [bias_add_comm_swap]: 2.55997e-06 [label_micro_interleaved_index]: 5.02e-06 [label_fine_grained_interleaved_index]: 2.61e-06 [merge_cast_opt]: 1.47999e-06 [slice_recompute_activation]: 2.24001e-06 [micro_interleaved_order_control]: 2.14999e-06 [assign_add_opt]: 1.29e-06 [ForceFp32Comm]: 7.89994e-07 [remove_cast_before_assign_add]: 9.89996e-07 [full_micro_interleaved_order_control]: 2.43002e-06 [reorder_send_recv_between_fp_bp]: 2.91999e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 9.40025e-07 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.29e-06 [overlap_opt_shard_in_pipeline]: 1.20999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.02001e-06 [control_data_broadcast_order]: 1.464e-05 [grouped_pairwise_exchange_alltoall]: 1.55999e-06 [offloading_packed_experts]: 4.15999e-06 [overlap_recompute_and_grad_model_parallel]: 5.27001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.10999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39998e-06 [overlap_recompute_comm]: 2.02001e-06 [overlap_grad_ring_attention]: 4.33001e-06 [overlap_grad_flash_sp]: 2.04e-05 [begin_end_overlap_inline]: 7.89994e-07 [split_matmul_comm_elemetwise]: 2.12001e-06 [split_layernorm_comm]: 1.63002e-06 [handle_group_info]: 1.20001e-06 [symbol_engine_optimizer]: 8.768e-05, [1] [Cycle 1]: 8.289e-05, [6] [build]: 2.93e-06 [elim_shapecalc]: 1.496e-05 [elim_not_effective]: 1.498e-05 [opt_reshape]: 6.84999e-06 [fold_const_symbol]: 1.019e-05 [renormalize]: 4.00003e-07 [detach_backward]: 2.31998e-06 [pipeline_parallel_scheduler]: 1.49e-06 [auto_monad_reorder]: 1.9e-05 [get_jit_bprop_graph]: 1.76e-06 [rewriter_after_jit_bprop_graph]: 4.42e-06 [opt_after_jit_grad]: 0.00051338 [validate]: 4.082e-05 Sums bootstrap : 0.000431s : 4.70% type_inference : 0.004684s : 51.14% event_method : 0.000012s : 0.13% auto_monad : 0.000054s : 0.59% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.04% pre_auto_parallel : 0.000026s : 0.28% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.03% pipeline_split : 0.000001s : 0.02% optimize.py_interpret_to_execute : 0.000017s : 0.19% optimize.rewriter_before_opt_a : 0.000045s : 0.49% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.36% optimize.opt_a.loop_unroll : 0.000019s : 0.21% optimize.opt_a.a_1 : 0.000386s : 4.22% optimize.opt_a.with_stream_mark : 0.000030s : 0.33% optimize.opt_a.recompute_prepare : 0.000013s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.04% optimize.opt_a.a_2 : 0.000147s : 1.61% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.14% optimize.opt_a.merge_send_recv : 0.000015s : 0.17% optimize.opt_a.auto_parallel : 0.000012s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.27% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.19% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.19% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000011s : 0.12% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.10% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000018s : 0.20% optimize.opt_a.a_after_grad : 0.000017s : 0.19% optimize.opt_a.renormalize : 0.000517s : 5.65% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.09% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.27% optimize.opt_a.cse : 0.000050s : 0.54% optimize.opt_a.a_3 : 0.000084s : 0.92% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000036s : 0.39% optimize.convert_after_rewriter : 0.000007s : 0.08% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000508s : 5.55% optimize.opt_b.b_1 : 0.000134s : 1.46% optimize.opt_b.b_2 : 0.000009s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000029s : 0.32% optimize.loop_unroll : 0.000466s : 5.08% optimize.opt_after_cconv.c_1 : 0.000028s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.26% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000015s : 0.16% optimize.tuple_transform.d_1 : 0.000040s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000050s : 0.55% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.02% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.02% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.05% optimize.overlap_grad_flash_sp : 0.000020s : 0.22% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.16% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.16% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.03% pipeline_parallel_scheduler : 0.000001s : 0.02% auto_monad_reorder : 0.000019s : 0.21% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.05% opt_after_jit_grad : 0.000513s : 5.60% validate : 0.000041s : 0.45% Time group info: ------[substitution.] 0.000116 20 1.65% : 0.000002s : 2: substitution.elim_not_effective 1.29% : 0.000001s : 2: substitution.fold_const_symbol 5.03% : 0.000006s : 3: substitution.graph_param_transform 73.34% : 0.000085s : 2: substitution.inline 3.07% : 0.000004s : 4: substitution.j_node_and_user_rematch 4.39% : 0.000005s : 4: substitution.remove_not_recompute_node 2.72% : 0.000003s : 2: substitution.replace_old_param 8.50% : 0.000010s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004637 2 91.06% : 0.004222s : 1: type_inference.infer 8.94% : 0.000415s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000084 2 100.00% : 0.000084s : 2: match.inline ------[predicate.] 0.000133 754 0.83% : 0.000001s : 7: predicate.accumulaten_eliminater 1.26% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.66% : 0.000001s : 6: predicate.addn_check_dump 0.81% : 0.000001s : 7: predicate.addn_zero_filter 0.64% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.08% : 0.000003s : 13: predicate.arithmetic_simplify 0.82% : 0.000001s : 7: predicate.cast_eliminate 0.75% : 0.000001s : 6: predicate.check_bprop_eliminate 0.68% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.76% : 0.000001s : 6: predicate.depend_value_elim 0.79% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 1.06% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.79% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.35% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.59% : 0.000001s : 3: predicate.elim_not_effective 0.92% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.96% : 0.000001s : 10: predicate.environ_get_depend_swap 1.99% : 0.000003s : 16: predicate.environ_get_eliminate 0.95% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.97% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.01% : 0.000003s : 9: predicate.float_depend_g_call 0.63% : 0.000001s : 6: predicate.float_environ_get_switch 0.91% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.78% : 0.000001s : 6: predicate.get_grad_eliminate 0.37% : 0.000000s : 3: predicate.graph_param_transform 0.79% : 0.000001s : 6: predicate.incorporate_call 0.63% : 0.000001s : 6: predicate.incorporate_call_switch 6.29% : 0.000008s : 34: predicate.inline 1.21% : 0.000002s : 6: predicate.inline_without_move 0.38% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.14% : 0.000002s : 6: predicate.less_batch_normalization 1.59% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.10% : 0.000003s : 20: predicate.load_eliminater 1.67% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.63% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.69% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.64% : 0.000001s : 6: predicate.merge_addn 0.68% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.70% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.71% : 0.000001s : 7: predicate.minmaximum_grad 1.65% : 0.000002s : 3: predicate.mutable_eliminate 0.53% : 0.000001s : 3: predicate.opt_reshape 0.51% : 0.000001s : 3: predicate.parallel_virtual_node 1.28% : 0.000002s : 9: predicate.partial_defer_inline 1.21% : 0.000002s : 10: predicate.partial_eliminate 0.71% : 0.000001s : 7: predicate.print_const_string_wrapper 0.69% : 0.000001s : 6: predicate.reduce_all_const_elim 0.97% : 0.000001s : 7: predicate.reduce_eliminate 2.08% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.80% : 0.000001s : 6: predicate.remove_not_recompute_node 1.36% : 0.000002s : 13: predicate.replace_applicator 0.65% : 0.000001s : 6: predicate.replace_old_param 0.53% : 0.000001s : 3: predicate.reset_defer_inline 0.86% : 0.000001s : 7: predicate.reshape_eliminate 0.78% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 3: predicate.row_tensor_eliminate 1.36% : 0.000002s : 6: predicate.same_eliminate 0.51% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.04% : 0.000001s : 6: predicate.shard_identity_eliminate 0.89% : 0.000001s : 6: predicate.special_op_eliminate 0.93% : 0.000001s : 6: predicate.specialize_transform 1.11% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.88% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.02% : 0.000001s : 9: predicate.switch_defer_inline 1.71% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.35% : 0.000006s : 32: predicate.switch_simplify 0.77% : 0.000001s : 7: predicate.tile_eliminate 0.85% : 0.000001s : 7: predicate.transpose_eliminate 1.59% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.39% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.05% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.35% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.35% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.39% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.95% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.65% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.53% : 0.000001s : 3: predicate.value_based_eliminate 0.77% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.74% : 0.000001s : 6: predicate.virtual_output_eliminate 0.33% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.54% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000213 5 8.73% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.27% : 0.000194s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022072 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.33% : 0.003162s : 1: add_attr 14.28% : 0.003153s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.25% : 0.000055s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000059s : 1: auto_monad 0.11% : 0.000024s : 1: auto_monad_reorder 0.02% : 0.000004s : 1: begin_end_overlap_inline 0.03% : 0.000006s : 1: bias_add_comm_swap 2.08% : 0.000459s : 1: bootstrap 0.15% : 0.000032s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000011s : 1: convert_after_rewriter 0.12% : 0.000026s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.08% : 0.000018s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000008s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.04% : 0.000008s : 1: label_micro_interleaved_index 2.15% : 0.000475s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.34% : 0.000517s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000017s : 1: opt.transform.mutable_eliminate 3.40% : 0.000751s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.12% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.50% : 0.000110s : 28: opt.transform.opt_b 0.20% : 0.000045s : 2: opt.transform.opt_trans_graph 0.19% : 0.000042s : 4: opt.transform.symbol_engine_opt 9.67% : 0.002135s : 1: opt_a 0.52% : 0.000114s : 1: opt_after_cconv 2.38% : 0.000525s : 1: opt_after_jit_grad 1.02% : 0.000226s : 1: opt_b 19.00% : 0.004195s : 1: optimize 0.09% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.12% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000030s : 1: pre_auto_parallel 0.09% : 0.000020s : 1: py_interpret_to_execute 0.06% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000019s : 1: remove_dup_value 1.26% : 0.000279s : 1: renormalize.infer 1.05% : 0.000231s : 1: renormalize.specialize 0.03% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000040s : 1: rewriter_after_opt_a 0.22% : 0.000048s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.41% : 0.000090s : 1: symbol_engine_optimizer 0.35% : 0.000077s : 1: tuple_transform 21.29% : 0.004698s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:54.854.777 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:54.855.057 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.014743, [21] [bootstrap]: 0.00042449 [type_inference]: 0.0053848 [event_method]: 1.126e-05 [auto_monad]: 5.424e-05 [graph_reusing]: 5.14e-06 [inline]: 2.27999e-06 [add_attr]: 0.00310935, [1] [add_attr_with_inline]: 0.00310099, [1] [Cycle 1]: 6.572e-05, [2] [tag_attr]: 1.475e-05 [meta_addattr_fg_expand]: 3.85e-06 [parallel-infer-symbol]: 3.78001e-06 [pre_auto_parallel]: 2.538e-05 [insert-virtual-dataset]: 2.51e-06 [parallel-infer-symbol-second]: 7.10017e-07 [dataset_repeat_opt]: 1.81e-06 [pipeline_split]: 1.76e-06 [optimize]: 0.00460211, [53] [py_interpret_to_execute]: 1.993e-05 [rewriter_before_opt_a]: 4.757e-05 [opt_a]: 0.00242338, [2] [Cycle 1]: 0.00161526, [45] [expand_dump_flag]: 2.83998e-06 [switch_simplify]: 2.546e-05 [loop_unroll]: 1.449e-05 [a_1]: 0.00028843 [with_stream_mark]: 1.632e-05 [recompute_prepare]: 8.28001e-06 [updatestate_depend_eliminate]: 4.45999e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 3.23e-06 [parameter_eliminate]: 1.94e-06 [a_2]: 0.00010735 [accelerated_algorithm]: 7.48999e-06 [shard]: 2.23002e-06 [meta_shard_fg_expand]: 1.71998e-06 [shard_inline]: 6.48e-06 [merge_send_recv]: 7.7e-06 [auto_parallel]: 6.37001e-06 [parallel]: 1.674e-05 [flash_sp]: 9.02e-06 [merge_comm]: 3.53e-06 [allreduce_fusion]: 3.64002e-06 [matmul_add_comm_reduction]: 9.91e-06 [allreduce_slice_to_reducescatter]: 8.09989e-07 [virtual_shard_identity]: 7.91001e-06 [virtual_dataset]: 6.71e-06 [get_grad_eliminate_]: 6.36998e-06 [virtual_output]: 6.51e-06 [merge_forward]: 3.9e-06 [cell_reuse_recompute_pass]: 1.45001e-06 [offload_activation]: 9.37001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.451e-05 [merge_recompute_call_nodes]: 1.37999e-06 [before_grad]: 9.88998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.62998e-06 [meta_fg_expand]: 3.05998e-06 [flash_sp_send_recv_attached]: 2.46e-06 [receive_attached]: 1.97999e-06 [after_resolve]: 9.54e-06 [a_after_grad]: 8.77999e-06 [renormalize]: 0.00048683 [add_forward_monad_depend]: 4.68999e-06 [auto_monad_grad]: 2.27001e-06 [auto_monad_eliminator]: 1.456e-05 [cse]: 2.743e-05 [a_3]: 5.863e-05 [Cycle 2]: 0.00079475, [45] [expand_dump_flag]: 1.23002e-06 [switch_simplify]: 6.88e-06 [loop_unroll]: 5.76e-06 [a_1]: 0.00012041 [with_stream_mark]: 1.32e-05 [recompute_prepare]: 6.48e-06 [updatestate_depend_eliminate]: 3.26999e-06 [updatestate_assign_eliminate]: 2.44999e-06 [updatestate_loads_eliminate]: 2.66e-06 [parameter_eliminate]: 1.10001e-06 [a_2]: 9.859e-05 [accelerated_algorithm]: 5.87001e-06 [shard]: 1.12e-06 [meta_shard_fg_expand]: 1.22999e-06 [shard_inline]: 6.59999e-06 [merge_send_recv]: 4.85001e-06 [auto_parallel]: 5.80002e-06 [parallel]: 4.37e-06 [flash_sp]: 3.68e-06 [merge_comm]: 3.4e-06 [allreduce_fusion]: 3.06999e-06 [matmul_add_comm_reduction]: 5.95002e-06 [allreduce_slice_to_reducescatter]: 3.39991e-07 [virtual_shard_identity]: 6.41e-06 [virtual_dataset]: 5.75001e-06 [get_grad_eliminate_]: 5.91e-06 [virtual_output]: 5.50001e-06 [merge_forward]: 3.04999e-06 [cell_reuse_recompute_pass]: 1.25001e-06 [offload_activation]: 6.48998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.501e-05 [merge_recompute_call_nodes]: 9.80013e-07 [before_grad]: 9.22001e-06 [set_forward_comm_id_for_comm_node_pass]: 4.06001e-06 [meta_fg_expand]: 2.05002e-06 [flash_sp_send_recv_attached]: 8.89995e-07 [receive_attached]: 1.20999e-06 [after_resolve]: 9.26998e-06 [a_after_grad]: 8.33001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.20001e-06 [auto_monad_grad]: 8.50006e-07 [auto_monad_eliminator]: 7.23999e-06 [cse]: 1.421e-05 [a_3]: 4.752e-05 [py_interpret_to_execute_after_opt_a]: 1.182e-05 [slice_cell_reuse_recomputed_activation]: 4.42e-06 [rewriter_after_opt_a]: 3.913e-05 [convert_after_rewriter]: 9.79e-06 [order_py_execute_after_rewriter]: 8.43999e-06 [mutable_eliminate]: 0.00049883 [opt_b]: 0.00027767, [1] [Cycle 1]: 0.00026898, [7] [b_1]: 0.00017416 [b_2]: 7.64002e-06 [updatestate_depend_eliminate]: 5.62001e-06 [updatestate_assign_eliminate]: 2.46998e-06 [updatestate_loads_eliminate]: 2.28002e-06 [renormalize]: 3.19997e-07 [cse]: 1.821e-05 [optimize_parallel_all_gather_comm]: 1.981e-05 [overlap_param_gather]: 4.85999e-06 [cconv]: 2.693e-05 [loop_unroll]: 0.00043325 [opt_after_cconv]: 0.00012063, [1] [Cycle 1]: 0.00011264, [7] [c_1]: 2.791e-05 [parameter_eliminate]: 2.88e-06 [updatestate_depend_eliminate]: 5.00001e-06 [updatestate_assign_eliminate]: 2.37999e-06 [updatestate_loads_eliminate]: 2.74999e-06 [cse]: 1.651e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 1.704e-05 [tuple_transform]: 8.234e-05, [1] [Cycle 1]: 7.534e-05, [4] [d_1]: 3.749e-05 [none_parameter_eliminate]: 1.60001e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 6.44999e-06 [partial_unused_args_eliminate]: 4.41002e-06 [add_recomputation]: 4.738e-05 [cse_after_recomputation]: 2.691e-05, [1] [Cycle 1]: 2.042e-05, [1] [cse]: 1.167e-05 [environ_conv]: 8.43999e-06 [swap_dp_allreduce_reducescatter]: 8.11002e-06 [bias_add_comm_swap]: 5.00001e-06 [label_micro_interleaved_index]: 6.89001e-06 [label_fine_grained_interleaved_index]: 5.07e-06 [merge_cast_opt]: 3.63e-06 [slice_recompute_activation]: 4.56002e-06 [micro_interleaved_order_control]: 4.65001e-06 [assign_add_opt]: 3.64002e-06 [ForceFp32Comm]: 3.12002e-06 [remove_cast_before_assign_add]: 3.27002e-06 [full_micro_interleaved_order_control]: 4.38001e-06 [reorder_send_recv_between_fp_bp]: 5.19e-06 [comm_op_add_attrs]: 3.5e-06 [add_comm_op_reuse_tag]: 3.45e-06 [interleave_split_concat_branches]: 3.55e-06 [interleave_parallel_branches]: 3.54002e-06 [overlap_opt_shard_in_pipeline]: 3.7e-06 [overlap_opt_shard_grad_in_pipeline]: 4.28999e-06 [control_data_broadcast_order]: 1.528e-05 [grouped_pairwise_exchange_alltoall]: 3.82998e-06 [offloading_packed_experts]: 6.81001e-06 [overlap_recompute_and_grad_model_parallel]: 8.18001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.78999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.66001e-06 [overlap_recompute_comm]: 4.96002e-06 [overlap_grad_ring_attention]: 6.81001e-06 [overlap_grad_flash_sp]: 2.05e-05 [begin_end_overlap_inline]: 2.99999e-06 [split_matmul_comm_elemetwise]: 4.62e-06 [split_layernorm_comm]: 4.15e-06 [handle_group_info]: 3.63e-06 [symbol_engine_optimizer]: 9.578e-05, [1] [Cycle 1]: 8.896e-05, [6] [build]: 2.57001e-06 [elim_shapecalc]: 9.70002e-06 [elim_not_effective]: 1.312e-05 [opt_reshape]: 6.97002e-06 [fold_const_symbol]: 1.015e-05 [renormalize]: 2.3999e-07 [detach_backward]: 3.35e-06 [pipeline_parallel_scheduler]: 1.84e-06 [auto_monad_reorder]: 1.833e-05 [get_jit_bprop_graph]: 1.32e-06 [rewriter_after_jit_bprop_graph]: 3.80998e-06 [opt_after_jit_grad]: 0.00050098 [validate]: 3.679e-05 Sums bootstrap : 0.000424s : 4.27% type_inference : 0.005385s : 54.11% event_method : 0.000011s : 0.11% auto_monad : 0.000054s : 0.55% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000025s : 0.26% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000020s : 0.20% optimize.rewriter_before_opt_a : 0.000048s : 0.48% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000032s : 0.32% optimize.opt_a.loop_unroll : 0.000020s : 0.20% optimize.opt_a.a_1 : 0.000409s : 4.11% optimize.opt_a.with_stream_mark : 0.000030s : 0.30% optimize.opt_a.recompute_prepare : 0.000015s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000206s : 2.07% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000013s : 0.13% optimize.opt_a.auto_parallel : 0.000012s : 0.12% optimize.opt_a.parallel : 0.000021s : 0.21% optimize.opt_a.flash_sp : 0.000013s : 0.13% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.14% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000019s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000019s : 0.19% optimize.opt_a.a_after_grad : 0.000017s : 0.17% optimize.opt_a.renormalize : 0.000487s : 4.89% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.22% optimize.opt_a.cse : 0.000042s : 0.42% optimize.opt_a.a_3 : 0.000106s : 1.07% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.04% optimize.rewriter_after_opt_a : 0.000039s : 0.39% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000499s : 5.01% optimize.opt_b.b_1 : 0.000174s : 1.75% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000027s : 0.27% optimize.loop_unroll : 0.000433s : 4.35% optimize.opt_after_cconv.c_1 : 0.000028s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000017s : 0.17% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.17% optimize.tuple_transform.d_1 : 0.000037s : 0.38% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.06% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000047s : 0.48% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000008s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000015s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000021s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000501s : 5.03% validate : 0.000037s : 0.37% Time group info: ------[substitution.] 0.000115 20 1.60% : 0.000002s : 2: substitution.elim_not_effective 1.17% : 0.000001s : 2: substitution.fold_const_symbol 4.62% : 0.000005s : 3: substitution.graph_param_transform 75.17% : 0.000086s : 2: substitution.inline 2.66% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.71% : 0.000004s : 4: substitution.remove_not_recompute_node 2.77% : 0.000003s : 2: substitution.replace_old_param 8.30% : 0.000010s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005341 2 91.68% : 0.004896s : 1: type_inference.infer 8.32% : 0.000444s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000085 2 100.00% : 0.000085s : 2: match.inline ------[predicate.] 0.000136 754 0.82% : 0.000001s : 7: predicate.accumulaten_eliminater 1.03% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.68% : 0.000001s : 6: predicate.addn_check_dump 1.01% : 0.000001s : 7: predicate.addn_zero_filter 0.68% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.60% : 0.000004s : 13: predicate.arithmetic_simplify 0.84% : 0.000001s : 7: predicate.cast_eliminate 0.79% : 0.000001s : 6: predicate.check_bprop_eliminate 0.66% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.73% : 0.000001s : 6: predicate.depend_value_elim 0.76% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.97% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.75% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.14% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.29% : 0.000000s : 3: predicate.elim_not_effective 0.44% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000002s : 10: predicate.environ_add_const_eliminate 1.02% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.05% : 0.000001s : 10: predicate.environ_get_depend_swap 1.77% : 0.000002s : 16: predicate.environ_get_eliminate 1.06% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.99% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.00% : 0.000003s : 9: predicate.float_depend_g_call 0.62% : 0.000001s : 6: predicate.float_environ_get_switch 0.99% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.79% : 0.000001s : 6: predicate.get_grad_eliminate 0.23% : 0.000000s : 3: predicate.graph_param_transform 0.76% : 0.000001s : 6: predicate.incorporate_call 0.65% : 0.000001s : 6: predicate.incorporate_call_switch 6.30% : 0.000009s : 34: predicate.inline 0.97% : 0.000001s : 6: predicate.inline_without_move 0.43% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.39% : 0.000002s : 6: predicate.less_batch_normalization 1.66% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.15% : 0.000003s : 20: predicate.load_eliminater 1.09% : 0.000001s : 3: predicate.loop_unroll_after_grad 1.59% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.67% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.82% : 0.000001s : 6: predicate.merge_addn 0.69% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.70% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.84% : 0.000001s : 7: predicate.minmaximum_grad 1.71% : 0.000002s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.45% : 0.000001s : 3: predicate.parallel_virtual_node 1.24% : 0.000002s : 9: predicate.partial_defer_inline 1.28% : 0.000002s : 10: predicate.partial_eliminate 0.84% : 0.000001s : 7: predicate.print_const_string_wrapper 0.68% : 0.000001s : 6: predicate.reduce_all_const_elim 0.98% : 0.000001s : 7: predicate.reduce_eliminate 2.04% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.76% : 0.000001s : 6: predicate.remove_not_recompute_node 1.11% : 0.000002s : 13: predicate.replace_applicator 0.65% : 0.000001s : 6: predicate.replace_old_param 0.49% : 0.000001s : 3: predicate.reset_defer_inline 0.89% : 0.000001s : 7: predicate.reshape_eliminate 0.93% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 3: predicate.row_tensor_eliminate 0.97% : 0.000001s : 6: predicate.same_eliminate 0.51% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.01% : 0.000001s : 6: predicate.shard_identity_eliminate 1.01% : 0.000001s : 6: predicate.special_op_eliminate 1.17% : 0.000002s : 6: predicate.specialize_transform 1.09% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.03% : 0.000001s : 9: predicate.switch_defer_inline 1.74% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.29% : 0.000006s : 32: predicate.switch_simplify 0.73% : 0.000001s : 7: predicate.tile_eliminate 0.78% : 0.000001s : 7: predicate.transpose_eliminate 1.66% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 2.98% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.43% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.38% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.54% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.10% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.81% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.55% : 0.000001s : 3: predicate.value_based_eliminate 1.14% : 0.000002s : 6: predicate.virtual_dataset_eliminate 0.82% : 0.000001s : 6: predicate.virtual_output_eliminate 0.34% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.58% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000210 5 7.77% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.23% : 0.000194s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023867 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.07% : 0.003119s : 1: add_attr 13.01% : 0.003105s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000051s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.26% : 0.000063s : 1: auto_monad 0.11% : 0.000025s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.96% : 0.000468s : 1: bootstrap 0.13% : 0.000030s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000018s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.09% : 0.000020s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.04% : 0.000008s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.84% : 0.000439s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.12% : 0.000505s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 3.26% : 0.000778s : 78: opt.transform.opt_a 0.11% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.46% : 0.000110s : 28: opt.transform.opt_b 0.18% : 0.000042s : 2: opt.transform.opt_trans_graph 0.15% : 0.000036s : 4: opt.transform.symbol_engine_opt 10.17% : 0.002427s : 1: opt_a 0.52% : 0.000124s : 1: opt_after_cconv 2.15% : 0.000512s : 1: opt_after_jit_grad 1.18% : 0.000281s : 1: opt_b 20.64% : 0.004926s : 1: optimize 0.10% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000024s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.05% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000033s : 1: pre_auto_parallel 0.10% : 0.000023s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000020s : 1: remove_dup_value 1.11% : 0.000266s : 1: renormalize.infer 0.89% : 0.000213s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000043s : 1: rewriter_after_opt_a 0.21% : 0.000051s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.41% : 0.000099s : 1: symbol_engine_optimizer 0.36% : 0.000085s : 1: tuple_transform 22.66% : 0.005408s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:55.466.62 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0134696, [21] [bootstrap]: 0.00042922 [type_inference]: 0.00528159 [event_method]: 1.141e-05 [auto_monad]: 5.225e-05 [graph_reusing]: 5.10001e-06 [inline]: 2.19999e-06 [add_attr]: 0.00307166, [1] [add_attr_with_inline]: 0.00306334, [1] [Cycle 1]: 5.225e-05, [2] [tag_attr]: 1.389e-05 [meta_addattr_fg_expand]: 4.10998e-06 [parallel-infer-symbol]: 3.40998e-06 [pre_auto_parallel]: 2.376e-05 [insert-virtual-dataset]: 2.21998e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.46998e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00392528, [53] [py_interpret_to_execute]: 1.607e-05 [rewriter_before_opt_a]: 4.217e-05 [opt_a]: 0.00206086, [2] [Cycle 1]: 0.00145117, [45] [expand_dump_flag]: 2.77002e-06 [switch_simplify]: 2.459e-05 [loop_unroll]: 1.347e-05 [a_1]: 0.00027802 [with_stream_mark]: 1.521e-05 [recompute_prepare]: 7.99002e-06 [updatestate_depend_eliminate]: 3.56001e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 3.29001e-06 [parameter_eliminate]: 2.11e-06 [a_2]: 7.741e-05 [accelerated_algorithm]: 6.58e-06 [shard]: 2.21998e-06 [meta_shard_fg_expand]: 1.63002e-06 [shard_inline]: 5.84999e-06 [merge_send_recv]: 8.54e-06 [auto_parallel]: 6.69999e-06 [parallel]: 1.738e-05 [flash_sp]: 7.21001e-06 [merge_comm]: 3.9e-06 [allreduce_fusion]: 3.61999e-06 [matmul_add_comm_reduction]: 8.98002e-06 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 7.68001e-06 [virtual_dataset]: 6.17001e-06 [get_grad_eliminate_]: 5.96998e-06 [virtual_output]: 6.23e-06 [merge_forward]: 4.74e-06 [cell_reuse_recompute_pass]: 1.19003e-06 [offload_activation]: 1.009e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.283e-05 [merge_recompute_call_nodes]: 1.79998e-06 [before_grad]: 1.043e-05 [set_forward_comm_id_for_comm_node_pass]: 3.62002e-06 [meta_fg_expand]: 2.71e-06 [flash_sp_send_recv_attached]: 2.55002e-06 [receive_attached]: 2.22001e-06 [after_resolve]: 9.49e-06 [a_after_grad]: 8.52e-06 [renormalize]: 0.00051541 [add_forward_monad_depend]: 5.21998e-06 [auto_monad_grad]: 2.30002e-06 [auto_monad_eliminator]: 1.406e-05 [cse]: 2.942e-05 [a_3]: 4.617e-05 [Cycle 2]: 0.00060048, [45] [expand_dump_flag]: 9.89996e-07 [switch_simplify]: 6.67002e-06 [loop_unroll]: 5.74e-06 [a_1]: 0.00010518 [with_stream_mark]: 1.07e-05 [recompute_prepare]: 5.99999e-06 [updatestate_depend_eliminate]: 2.83e-06 [updatestate_assign_eliminate]: 2.38002e-06 [updatestate_loads_eliminate]: 2.66e-06 [parameter_eliminate]: 1.03001e-06 [a_2]: 6.897e-05 [accelerated_algorithm]: 5.88002e-06 [shard]: 1.09e-06 [meta_shard_fg_expand]: 1.22999e-06 [shard_inline]: 5.94e-06 [merge_send_recv]: 4.68999e-06 [auto_parallel]: 5.37999e-06 [parallel]: 4.53999e-06 [flash_sp]: 3.51001e-06 [merge_comm]: 3.09999e-06 [allreduce_fusion]: 3.16999e-06 [matmul_add_comm_reduction]: 5.58002e-06 [allreduce_slice_to_reducescatter]: 3.30008e-07 [virtual_shard_identity]: 6.69999e-06 [virtual_dataset]: 5.64998e-06 [get_grad_eliminate_]: 5.46e-06 [virtual_output]: 5.27999e-06 [merge_forward]: 2.63e-06 [cell_reuse_recompute_pass]: 1.43002e-06 [offload_activation]: 6.51999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.274e-05 [merge_recompute_call_nodes]: 7.39994e-07 [before_grad]: 8.89e-06 [set_forward_comm_id_for_comm_node_pass]: 3.66999e-06 [meta_fg_expand]: 2.01998e-06 [flash_sp_send_recv_attached]: 8.79983e-07 [receive_attached]: 1.14998e-06 [after_resolve]: 8.87999e-06 [a_after_grad]: 8.05e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.15999e-06 [auto_monad_grad]: 9.29984e-07 [auto_monad_eliminator]: 6.88998e-06 [cse]: 1.322e-05 [a_3]: 3.462e-05 [py_interpret_to_execute_after_opt_a]: 7.58999e-06 [slice_cell_reuse_recomputed_activation]: 2.16e-06 [rewriter_after_opt_a]: 3.41e-05 [convert_after_rewriter]: 6.76e-06 [order_py_execute_after_rewriter]: 5.09998e-06 [mutable_eliminate]: 0.00045774 [opt_b]: 0.00020905, [1] [Cycle 1]: 0.00020305, [7] [b_1]: 0.00013195 [b_2]: 7.78999e-06 [updatestate_depend_eliminate]: 4.85001e-06 [updatestate_assign_eliminate]: 2.29001e-06 [updatestate_loads_eliminate]: 2.42001e-06 [renormalize]: 4.69998e-07 [cse]: 1.766e-05 [optimize_parallel_all_gather_comm]: 1.496e-05 [overlap_param_gather]: 1.96e-06 [cconv]: 2.358e-05 [loop_unroll]: 0.00041629 [opt_after_cconv]: 9.633e-05, [1] [Cycle 1]: 9.09e-05, [7] [c_1]: 2.711e-05 [parameter_eliminate]: 2.29999e-06 [updatestate_depend_eliminate]: 4.77998e-06 [updatestate_assign_eliminate]: 2.71e-06 [updatestate_loads_eliminate]: 2.22999e-06 [cse]: 1.743e-05 [renormalize]: 3.30008e-07 [remove_dup_value]: 1.426e-05 [tuple_transform]: 6.956e-05, [1] [Cycle 1]: 6.468e-05, [4] [d_1]: 3.801e-05 [none_parameter_eliminate]: 1.59998e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 6.49001e-06 [partial_unused_args_eliminate]: 1.72001e-06 [add_recomputation]: 4.403e-05 [cse_after_recomputation]: 2.185e-05, [1] [Cycle 1]: 1.729e-05, [1] [cse]: 1.167e-05 [environ_conv]: 4.68999e-06 [swap_dp_allreduce_reducescatter]: 5.27999e-06 [bias_add_comm_swap]: 2.44001e-06 [label_micro_interleaved_index]: 4.08001e-06 [label_fine_grained_interleaved_index]: 3.19001e-06 [merge_cast_opt]: 1.55999e-06 [slice_recompute_activation]: 2.17999e-06 [micro_interleaved_order_control]: 2.05002e-06 [assign_add_opt]: 1.57001e-06 [ForceFp32Comm]: 1.15999e-06 [remove_cast_before_assign_add]: 9.30013e-07 [full_micro_interleaved_order_control]: 2.21e-06 [reorder_send_recv_between_fp_bp]: 2.70002e-06 [comm_op_add_attrs]: 1.49e-06 [add_comm_op_reuse_tag]: 1.05001e-06 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.06002e-06 [overlap_opt_shard_in_pipeline]: 1.08001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.85001e-06 [control_data_broadcast_order]: 1.211e-05 [grouped_pairwise_exchange_alltoall]: 1.50999e-06 [offloading_packed_experts]: 3.81999e-06 [overlap_recompute_and_grad_model_parallel]: 4.17e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.09998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.58998e-06 [overlap_grad_ring_attention]: 4.31002e-06 [overlap_grad_flash_sp]: 1.748e-05 [begin_end_overlap_inline]: 5.60016e-07 [split_matmul_comm_elemetwise]: 2.38002e-06 [split_layernorm_comm]: 1.62999e-06 [handle_group_info]: 9.99979e-07 [symbol_engine_optimizer]: 7.277e-05, [1] [Cycle 1]: 6.776e-05, [6] [build]: 2.32999e-06 [elim_shapecalc]: 9.39e-06 [elim_not_effective]: 1.253e-05 [opt_reshape]: 6.39001e-06 [fold_const_symbol]: 9.51998e-06 [renormalize]: 2.09984e-07 [detach_backward]: 1.87999e-06 [pipeline_parallel_scheduler]: 1.81998e-06 [auto_monad_reorder]: 1.538e-05 [get_jit_bprop_graph]: 1.35999e-06 [rewriter_after_jit_bprop_graph]: 3.27002e-06 [opt_after_jit_grad]: 0.00045773 [validate]: 3.544e-05 Sums bootstrap : 0.000429s : 4.53% type_inference : 0.005282s : 55.79% event_method : 0.000011s : 0.12% auto_monad : 0.000052s : 0.55% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.04% pre_auto_parallel : 0.000024s : 0.25% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.03% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000016s : 0.17% optimize.rewriter_before_opt_a : 0.000042s : 0.45% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000031s : 0.33% optimize.opt_a.loop_unroll : 0.000019s : 0.20% optimize.opt_a.a_1 : 0.000383s : 4.05% optimize.opt_a.with_stream_mark : 0.000026s : 0.27% optimize.opt_a.recompute_prepare : 0.000014s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000146s : 1.55% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.12% optimize.opt_a.merge_send_recv : 0.000013s : 0.14% optimize.opt_a.auto_parallel : 0.000012s : 0.13% optimize.opt_a.parallel : 0.000022s : 0.23% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.15% optimize.opt_a.virtual_dataset : 0.000012s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000019s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.04% optimize.opt_a.after_resolve : 0.000018s : 0.19% optimize.opt_a.a_after_grad : 0.000017s : 0.18% optimize.opt_a.renormalize : 0.000515s : 5.45% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.22% optimize.opt_a.cse : 0.000043s : 0.45% optimize.opt_a.a_3 : 0.000081s : 0.85% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000034s : 0.36% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000458s : 4.84% optimize.opt_b.b_1 : 0.000132s : 1.39% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000015s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.25% optimize.loop_unroll : 0.000416s : 4.40% optimize.opt_after_cconv.c_1 : 0.000027s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.15% optimize.tuple_transform.d_1 : 0.000038s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000044s : 0.47% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.06% optimize.bias_add_comm_swap : 0.000002s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.02% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.02% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.02% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.03% optimize.overlap_grad_ring_attention : 0.000004s : 0.05% optimize.overlap_grad_flash_sp : 0.000017s : 0.18% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.03% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000002s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000015s : 0.16% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.03% opt_after_jit_grad : 0.000458s : 4.84% validate : 0.000035s : 0.37% Time group info: ------[substitution.] 0.000112 20 1.65% : 0.000002s : 2: substitution.elim_not_effective 1.29% : 0.000001s : 2: substitution.fold_const_symbol 4.71% : 0.000005s : 3: substitution.graph_param_transform 74.11% : 0.000083s : 2: substitution.inline 2.58% : 0.000003s : 4: substitution.j_node_and_user_rematch 4.45% : 0.000005s : 4: substitution.remove_not_recompute_node 2.78% : 0.000003s : 2: substitution.replace_old_param 8.44% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005237 2 91.80% : 0.004807s : 1: type_inference.infer 8.20% : 0.000429s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000081 2 100.00% : 0.000081s : 2: match.inline ------[predicate.] 0.000130 754 0.87% : 0.000001s : 7: predicate.accumulaten_eliminater 1.10% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.68% : 0.000001s : 6: predicate.addn_check_dump 0.87% : 0.000001s : 7: predicate.addn_zero_filter 0.76% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.12% : 0.000003s : 13: predicate.arithmetic_simplify 0.72% : 0.000001s : 7: predicate.cast_eliminate 0.72% : 0.000001s : 6: predicate.check_bprop_eliminate 0.67% : 0.000001s : 6: predicate.compare_switch_simplify 0.24% : 0.000000s : 3: predicate.const_output_eliminate 0.76% : 0.000001s : 6: predicate.depend_value_elim 0.75% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.89% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.73% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.32% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 3: predicate.elim_not_effective 0.53% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.03% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_depend_swap 1.80% : 0.000002s : 16: predicate.environ_get_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.94% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.89% : 0.000002s : 9: predicate.float_depend_g_call 0.71% : 0.000001s : 6: predicate.float_environ_get_switch 0.94% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.98% : 0.000001s : 6: predicate.get_grad_eliminate 0.27% : 0.000000s : 3: predicate.graph_param_transform 0.83% : 0.000001s : 6: predicate.incorporate_call 0.64% : 0.000001s : 6: predicate.incorporate_call_switch 6.46% : 0.000008s : 34: predicate.inline 1.08% : 0.000001s : 6: predicate.inline_without_move 0.48% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.02% : 0.000001s : 6: predicate.less_batch_normalization 1.61% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.14% : 0.000003s : 20: predicate.load_eliminater 1.25% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.67% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.76% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.68% : 0.000001s : 6: predicate.merge_addn 0.69% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.69% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.71% : 0.000001s : 7: predicate.minmaximum_grad 1.29% : 0.000002s : 3: predicate.mutable_eliminate 0.44% : 0.000001s : 3: predicate.opt_reshape 0.51% : 0.000001s : 3: predicate.parallel_virtual_node 1.29% : 0.000002s : 9: predicate.partial_defer_inline 1.24% : 0.000002s : 10: predicate.partial_eliminate 1.03% : 0.000001s : 7: predicate.print_const_string_wrapper 0.73% : 0.000001s : 6: predicate.reduce_all_const_elim 1.11% : 0.000001s : 7: predicate.reduce_eliminate 2.10% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.97% : 0.000001s : 6: predicate.remove_not_recompute_node 1.24% : 0.000002s : 13: predicate.replace_applicator 0.73% : 0.000001s : 6: predicate.replace_old_param 0.38% : 0.000000s : 3: predicate.reset_defer_inline 0.81% : 0.000001s : 7: predicate.reshape_eliminate 0.82% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 3: predicate.row_tensor_eliminate 1.04% : 0.000001s : 6: predicate.same_eliminate 0.55% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.91% : 0.000001s : 6: predicate.shard_identity_eliminate 0.91% : 0.000001s : 6: predicate.special_op_eliminate 0.93% : 0.000001s : 6: predicate.specialize_transform 1.22% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.88% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.04% : 0.000001s : 9: predicate.switch_defer_inline 1.76% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.25% : 0.000006s : 32: predicate.switch_simplify 0.83% : 0.000001s : 7: predicate.tile_eliminate 0.80% : 0.000001s : 7: predicate.transpose_eliminate 1.53% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.61% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.56% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.51% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.47% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.04% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.82% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.57% : 0.000001s : 3: predicate.value_based_eliminate 0.81% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.79% : 0.000001s : 6: predicate.virtual_output_eliminate 0.31% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.51% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000195 5 8.10% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.90% : 0.000179s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.021864 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.07% : 0.003076s : 1: add_attr 14.03% : 0.003067s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000048s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000057s : 1: auto_monad 0.09% : 0.000019s : 1: auto_monad_reorder 0.02% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.09% : 0.000457s : 1: bootstrap 0.12% : 0.000027s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000015s : 1: control_data_broadcast_order 0.05% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000008s : 1: environ_conv 0.08% : 0.000018s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000008s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.94% : 0.000424s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.13% : 0.000465s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 3.38% : 0.000739s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.50% : 0.000109s : 28: opt.transform.opt_b 0.19% : 0.000042s : 2: opt.transform.opt_trans_graph 0.16% : 0.000034s : 4: opt.transform.symbol_engine_opt 9.44% : 0.002064s : 1: opt_a 0.46% : 0.000100s : 1: opt_after_cconv 2.13% : 0.000467s : 1: opt_after_jit_grad 0.97% : 0.000213s : 1: opt_b 17.97% : 0.003929s : 1: optimize 0.08% : 0.000018s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000021s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000028s : 1: pre_auto_parallel 0.09% : 0.000020s : 1: py_interpret_to_execute 0.05% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000018s : 1: remove_dup_value 1.40% : 0.000306s : 1: renormalize.infer 0.92% : 0.000202s : 1: renormalize.specialize 0.03% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000038s : 1: rewriter_after_opt_a 0.21% : 0.000046s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000075s : 1: symbol_engine_optimizer 0.33% : 0.000072s : 1: tuple_transform 24.22% : 0.005296s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:55.235.862 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:55.236.126 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.014419, [21] [bootstrap]: 0.0004315 [type_inference]: 0.00476369 [event_method]: 1.279e-05 [auto_monad]: 5.451e-05 [graph_reusing]: 5.53002e-06 [inline]: 2.07001e-06 [add_attr]: 0.0030621, [1] [add_attr_with_inline]: 0.00305447, [1] [Cycle 1]: 6.434e-05, [2] [tag_attr]: 1.406e-05 [meta_addattr_fg_expand]: 4.47998e-06 [parallel-infer-symbol]: 2.78998e-06 [pre_auto_parallel]: 2.451e-05 [insert-virtual-dataset]: 2.34999e-06 [parallel-infer-symbol-second]: 8.50006e-07 [dataset_repeat_opt]: 1.72999e-06 [pipeline_split]: 1.52001e-06 [optimize]: 0.00495251, [53] [py_interpret_to_execute]: 2.196e-05 [rewriter_before_opt_a]: 5.144e-05 [opt_a]: 0.00264942, [2] [Cycle 1]: 0.00175291, [45] [expand_dump_flag]: 2.79999e-06 [switch_simplify]: 2.67e-05 [loop_unroll]: 1.515e-05 [a_1]: 0.00033916 [with_stream_mark]: 1.487e-05 [recompute_prepare]: 9.25999e-06 [updatestate_depend_eliminate]: 4.77998e-06 [updatestate_assign_eliminate]: 4.07e-06 [updatestate_loads_eliminate]: 4.05e-06 [parameter_eliminate]: 2.19999e-06 [a_2]: 0.00012559 [accelerated_algorithm]: 8.28999e-06 [shard]: 2.12001e-06 [meta_shard_fg_expand]: 1.99e-06 [shard_inline]: 7.48e-06 [merge_send_recv]: 8.58001e-06 [auto_parallel]: 7.66001e-06 [parallel]: 1.713e-05 [flash_sp]: 8.25e-06 [merge_comm]: 4.65999e-06 [allreduce_fusion]: 4.24002e-06 [matmul_add_comm_reduction]: 9.61e-06 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 8.90999e-06 [virtual_dataset]: 7.75998e-06 [get_grad_eliminate_]: 7.02002e-06 [virtual_output]: 7.1e-06 [merge_forward]: 4.31002e-06 [cell_reuse_recompute_pass]: 1.17e-06 [offload_activation]: 9.89001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.702e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.329e-05 [set_forward_comm_id_for_comm_node_pass]: 4.51002e-06 [meta_fg_expand]: 3.43999e-06 [flash_sp_send_recv_attached]: 2.40002e-06 [receive_attached]: 2.27001e-06 [after_resolve]: 1.092e-05 [a_after_grad]: 1.145e-05 [renormalize]: 0.0005109 [add_forward_monad_depend]: 5.07999e-06 [auto_monad_grad]: 1.92001e-06 [auto_monad_eliminator]: 1.623e-05 [cse]: 3.728e-05 [a_3]: 6.736e-05 [Cycle 2]: 0.00088333, [45] [expand_dump_flag]: 9.80013e-07 [switch_simplify]: 8.65001e-06 [loop_unroll]: 7.51999e-06 [a_1]: 0.00015002 [with_stream_mark]: 1.119e-05 [recompute_prepare]: 7.4e-06 [updatestate_depend_eliminate]: 4.27998e-06 [updatestate_assign_eliminate]: 3.23e-06 [updatestate_loads_eliminate]: 3.18e-06 [parameter_eliminate]: 9.89996e-07 [a_2]: 0.0001153 [accelerated_algorithm]: 7.08998e-06 [shard]: 1.09e-06 [meta_shard_fg_expand]: 1.54e-06 [shard_inline]: 7.16999e-06 [merge_send_recv]: 5.46e-06 [auto_parallel]: 6.48e-06 [parallel]: 5.03002e-06 [flash_sp]: 3.34001e-06 [merge_comm]: 4e-06 [allreduce_fusion]: 3.61999e-06 [matmul_add_comm_reduction]: 6.93998e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 7.93001e-06 [virtual_dataset]: 7.16001e-06 [get_grad_eliminate_]: 6.74001e-06 [virtual_output]: 6.53e-06 [merge_forward]: 3.20998e-06 [cell_reuse_recompute_pass]: 1.25001e-06 [offload_activation]: 7.31001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.603e-05 [merge_recompute_call_nodes]: 7.29982e-07 [before_grad]: 1.109e-05 [set_forward_comm_id_for_comm_node_pass]: 4.52e-06 [meta_fg_expand]: 2.61e-06 [flash_sp_send_recv_attached]: 8.29983e-07 [receive_attached]: 1.01002e-06 [after_resolve]: 9.51e-06 [a_after_grad]: 1.041e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.66e-06 [auto_monad_grad]: 9.29984e-07 [auto_monad_eliminator]: 8.37998e-06 [cse]: 1.922e-05 [a_3]: 5.674e-05 [py_interpret_to_execute_after_opt_a]: 1.322e-05 [slice_cell_reuse_recomputed_activation]: 4.55001e-06 [rewriter_after_opt_a]: 4.333e-05 [convert_after_rewriter]: 1.039e-05 [order_py_execute_after_rewriter]: 9.25999e-06 [mutable_eliminate]: 0.00048651 [opt_b]: 0.00033595, [1] [Cycle 1]: 0.00032714, [7] [b_1]: 0.00022437 [b_2]: 9.41e-06 [updatestate_depend_eliminate]: 6.48e-06 [updatestate_assign_eliminate]: 3.18998e-06 [updatestate_loads_eliminate]: 2.96001e-06 [renormalize]: 3.80009e-07 [cse]: 2.324e-05 [optimize_parallel_all_gather_comm]: 2.102e-05 [overlap_param_gather]: 4.57998e-06 [cconv]: 2.788e-05 [loop_unroll]: 0.00043949 [opt_after_cconv]: 0.00013573, [1] [Cycle 1]: 0.00012668, [7] [c_1]: 3.37e-05 [parameter_eliminate]: 2.46e-06 [updatestate_depend_eliminate]: 6.01e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 3.43e-06 [cse]: 2.23e-05 [renormalize]: 5.09986e-07 [remove_dup_value]: 1.987e-05 [tuple_transform]: 9.401e-05, [1] [Cycle 1]: 8.736e-05, [4] [d_1]: 4.705e-05 [none_parameter_eliminate]: 1.71998e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 8e-06 [partial_unused_args_eliminate]: 4.74e-06 [add_recomputation]: 5.711e-05 [cse_after_recomputation]: 3.115e-05, [1] [Cycle 1]: 2.445e-05, [1] [cse]: 1.541e-05 [environ_conv]: 9.07001e-06 [swap_dp_allreduce_reducescatter]: 8.50001e-06 [bias_add_comm_swap]: 5.56e-06 [label_micro_interleaved_index]: 6.46e-06 [label_fine_grained_interleaved_index]: 4.94e-06 [merge_cast_opt]: 3.82002e-06 [slice_recompute_activation]: 4.60001e-06 [micro_interleaved_order_control]: 4.84998e-06 [assign_add_opt]: 3.78999e-06 [ForceFp32Comm]: 3.16001e-06 [remove_cast_before_assign_add]: 3.31999e-06 [full_micro_interleaved_order_control]: 4.63001e-06 [reorder_send_recv_between_fp_bp]: 5.05999e-06 [comm_op_add_attrs]: 3.64002e-06 [add_comm_op_reuse_tag]: 3.13e-06 [interleave_split_concat_branches]: 3.55998e-06 [interleave_parallel_branches]: 3.42997e-06 [overlap_opt_shard_in_pipeline]: 3.58e-06 [overlap_opt_shard_grad_in_pipeline]: 4.33999e-06 [control_data_broadcast_order]: 1.725e-05 [grouped_pairwise_exchange_alltoall]: 4e-06 [offloading_packed_experts]: 7.07997e-06 [overlap_recompute_and_grad_model_parallel]: 7.73999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.72002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.66999e-06 [overlap_recompute_comm]: 5.08002e-06 [overlap_grad_ring_attention]: 6.86999e-06 [overlap_grad_flash_sp]: 2.287e-05 [begin_end_overlap_inline]: 2.81e-06 [split_matmul_comm_elemetwise]: 4.53999e-06 [split_layernorm_comm]: 4.25999e-06 [handle_group_info]: 3.33998e-06 [symbol_engine_optimizer]: 0.0001024, [1] [Cycle 1]: 9.575e-05, [6] [build]: 2.80002e-06 [elim_shapecalc]: 1.092e-05 [elim_not_effective]: 1.524e-05 [opt_reshape]: 7.85e-06 [fold_const_symbol]: 1.233e-05 [renormalize]: 2.09984e-07 [detach_backward]: 3.06001e-06 [pipeline_parallel_scheduler]: 1.63002e-06 [auto_monad_reorder]: 2.19e-05 [get_jit_bprop_graph]: 1.49e-06 [rewriter_after_jit_bprop_graph]: 4.06001e-06 [opt_after_jit_grad]: 0.00047604 [validate]: 3.942e-05 Sums bootstrap : 0.000431s : 4.47% type_inference : 0.004764s : 49.31% event_method : 0.000013s : 0.13% auto_monad : 0.000055s : 0.56% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000025s : 0.25% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000022s : 0.23% optimize.rewriter_before_opt_a : 0.000051s : 0.53% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000035s : 0.37% optimize.opt_a.loop_unroll : 0.000023s : 0.23% optimize.opt_a.a_1 : 0.000489s : 5.06% optimize.opt_a.with_stream_mark : 0.000026s : 0.27% optimize.opt_a.recompute_prepare : 0.000017s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000241s : 2.49% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.16% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.15% optimize.opt_a.merge_send_recv : 0.000014s : 0.15% optimize.opt_a.auto_parallel : 0.000014s : 0.15% optimize.opt_a.parallel : 0.000022s : 0.23% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.17% optimize.opt_a.virtual_dataset : 0.000015s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.14% optimize.opt_a.virtual_output : 0.000014s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.34% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000020s : 0.21% optimize.opt_a.a_after_grad : 0.000022s : 0.23% optimize.opt_a.renormalize : 0.000511s : 5.29% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.25% optimize.opt_a.cse : 0.000056s : 0.58% optimize.opt_a.a_3 : 0.000124s : 1.28% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000043s : 0.45% optimize.convert_after_rewriter : 0.000010s : 0.11% optimize.order_py_execute_after_rewriter : 0.000009s : 0.10% optimize.mutable_eliminate : 0.000487s : 5.04% optimize.opt_b.b_1 : 0.000224s : 2.32% optimize.opt_b.b_2 : 0.000009s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.22% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000028s : 0.29% optimize.loop_unroll : 0.000439s : 4.55% optimize.opt_after_cconv.c_1 : 0.000034s : 0.35% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.04% optimize.opt_after_cconv.cse : 0.000022s : 0.23% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000020s : 0.21% optimize.tuple_transform.d_1 : 0.000047s : 0.49% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000057s : 0.59% optimize.cse_after_recomputation.cse : 0.000015s : 0.16% optimize.environ_conv : 0.000009s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.09% optimize.bias_add_comm_swap : 0.000006s : 0.06% optimize.label_micro_interleaved_index : 0.000006s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.18% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000023s : 0.24% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.16% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000022s : 0.23% get_jit_bprop_graph : 0.000001s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000476s : 4.93% validate : 0.000039s : 0.41% Time group info: ------[substitution.] 0.000145 29 12.33% : 0.000018s : 2: substitution.cast_eliminate 1.62% : 0.000002s : 3: substitution.elim_not_effective 1.15% : 0.000002s : 3: substitution.fold_const_symbol 4.11% : 0.000006s : 4: substitution.graph_param_transform 65.94% : 0.000096s : 2: substitution.inline 2.65% : 0.000004s : 6: substitution.j_node_and_user_rematch 4.07% : 0.000006s : 6: substitution.remove_not_recompute_node 2.09% : 0.000003s : 2: substitution.replace_old_param 6.05% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004719 2 90.58% : 0.004274s : 1: type_inference.infer 9.42% : 0.000444s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000094 2 100.00% : 0.000094s : 2: match.inline ------[predicate.] 0.000167 980 0.82% : 0.000001s : 9: predicate.accumulaten_eliminater 0.98% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.80% : 0.000001s : 8: predicate.addn_check_dump 0.98% : 0.000002s : 9: predicate.addn_zero_filter 0.70% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.39% : 0.000004s : 17: predicate.arithmetic_simplify 1.01% : 0.000002s : 9: predicate.cast_eliminate 0.78% : 0.000001s : 8: predicate.check_bprop_eliminate 0.82% : 0.000001s : 8: predicate.compare_switch_simplify 0.23% : 0.000000s : 4: predicate.const_output_eliminate 0.81% : 0.000001s : 8: predicate.depend_value_elim 0.80% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.91% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.97% : 0.000002s : 9: predicate.dict_set_item_eliminator 1.16% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 4: predicate.elim_not_effective 0.49% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.06% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.15% : 0.000002s : 13: predicate.environ_get_depend_swap 1.98% : 0.000003s : 21: predicate.environ_get_eliminate 1.10% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.90% : 0.000001s : 11: predicate.exchange_switch_depend_value 2.00% : 0.000003s : 11: predicate.float_depend_g_call 0.71% : 0.000001s : 8: predicate.float_environ_get_switch 1.04% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 4: predicate.fold_const_symbol 0.82% : 0.000001s : 8: predicate.get_grad_eliminate 0.33% : 0.000001s : 4: predicate.graph_param_transform 0.82% : 0.000001s : 8: predicate.incorporate_call 0.68% : 0.000001s : 8: predicate.incorporate_call_switch 6.14% : 0.000010s : 44: predicate.inline 1.10% : 0.000002s : 8: predicate.inline_without_move 0.40% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.09% : 0.000002s : 8: predicate.less_batch_normalization 1.96% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.10% : 0.000003s : 26: predicate.load_eliminater 1.13% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.49% : 0.000002s : 16: predicate.loop_unroll_before_grad 1.80% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.79% : 0.000001s : 8: predicate.merge_addn 0.73% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.74% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.68% : 0.000001s : 9: predicate.minmaximum_grad 1.20% : 0.000002s : 4: predicate.mutable_eliminate 0.42% : 0.000001s : 4: predicate.opt_reshape 0.53% : 0.000001s : 4: predicate.parallel_virtual_node 1.18% : 0.000002s : 11: predicate.partial_defer_inline 1.30% : 0.000002s : 13: predicate.partial_eliminate 0.78% : 0.000001s : 9: predicate.print_const_string_wrapper 0.73% : 0.000001s : 8: predicate.reduce_all_const_elim 1.26% : 0.000002s : 9: predicate.reduce_eliminate 2.11% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 8: predicate.remove_not_recompute_node 1.15% : 0.000002s : 17: predicate.replace_applicator 0.57% : 0.000001s : 8: predicate.replace_old_param 0.34% : 0.000001s : 4: predicate.reset_defer_inline 0.87% : 0.000001s : 9: predicate.reshape_eliminate 0.83% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.53% : 0.000001s : 4: predicate.row_tensor_eliminate 0.89% : 0.000001s : 8: predicate.same_eliminate 0.53% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.94% : 0.000002s : 8: predicate.shard_identity_eliminate 0.88% : 0.000001s : 8: predicate.special_op_eliminate 1.06% : 0.000002s : 8: predicate.specialize_transform 1.02% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.95% : 0.000002s : 11: predicate.switch_defer_inline 1.66% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.15% : 0.000007s : 39: predicate.switch_simplify 0.77% : 0.000001s : 9: predicate.tile_eliminate 0.82% : 0.000001s : 9: predicate.transpose_eliminate 1.60% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.20% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.47% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.53% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.56% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.07% : 0.000003s : 26: predicate.updatestate_pure_node_eliminater 3.02% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.58% : 0.000001s : 4: predicate.value_based_eliminate 0.82% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.82% : 0.000001s : 8: predicate.virtual_output_eliminate 0.40% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000205 5 8.48% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.52% : 0.000187s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024109 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.74% : 0.003071s : 1: add_attr 12.68% : 0.003058s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.25% : 0.000061s : 1: add_recomputation 0.03% : 0.000007s : 1: assign_add_opt 0.26% : 0.000063s : 1: auto_monad 0.12% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.97% : 0.000475s : 1: bootstrap 0.13% : 0.000031s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.06% : 0.000013s : 1: convert_after_rewriter 0.14% : 0.000034s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000018s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.09% : 0.000022s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000009s : 1: label_micro_interleaved_index 1.85% : 0.000445s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.04% : 0.000492s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 3.93% : 0.000947s : 78: opt.transform.opt_a 0.13% : 0.000032s : 1: opt.transform.opt_after_cconv 0.12% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.65% : 0.000158s : 28: opt.transform.opt_b 0.22% : 0.000052s : 2: opt.transform.opt_trans_graph 0.18% : 0.000043s : 4: opt.transform.symbol_engine_opt 11.00% : 0.002653s : 1: opt_a 0.58% : 0.000139s : 1: opt_after_cconv 2.02% : 0.000486s : 1: opt_after_jit_grad 1.41% : 0.000339s : 1: opt_b 21.92% : 0.005285s : 1: optimize 0.10% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000026s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000007s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.03% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000008s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000032s : 1: pre_auto_parallel 0.10% : 0.000025s : 1: py_interpret_to_execute 0.07% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.10% : 0.000023s : 1: remove_dup_value 1.20% : 0.000290s : 1: renormalize.infer 0.89% : 0.000214s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000047s : 1: rewriter_after_opt_a 0.23% : 0.000055s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.44% : 0.000105s : 1: symbol_engine_optimizer 0.40% : 0.000097s : 1: tuple_transform 19.85% : 0.004785s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:55.429.722 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0143482, [21] [bootstrap]: 0.00048547 [type_inference]: 0.00505816 [event_method]: 1.324e-05 [auto_monad]: 6.08e-05 [graph_reusing]: 4.94e-06 [inline]: 2.69999e-06 [add_attr]: 0.00321901, [1] [add_attr_with_inline]: 0.00320967, [1] [Cycle 1]: 5.678e-05, [2] [tag_attr]: 1.579e-05 [meta_addattr_fg_expand]: 3.84002e-06 [parallel-infer-symbol]: 3.29001e-06 [pre_auto_parallel]: 2.836e-05 [insert-virtual-dataset]: 2.43998e-06 [parallel-infer-symbol-second]: 7.60017e-07 [dataset_repeat_opt]: 2.14e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.004738, [53] [py_interpret_to_execute]: 1.924e-05 [rewriter_before_opt_a]: 5.147e-05 [opt_a]: 0.002545, [2] [Cycle 1]: 0.00179089, [45] [expand_dump_flag]: 3.56999e-06 [switch_simplify]: 2.724e-05 [loop_unroll]: 1.535e-05 [a_1]: 0.00035661 [with_stream_mark]: 1.657e-05 [recompute_prepare]: 1.083e-05 [updatestate_depend_eliminate]: 4.4e-06 [updatestate_assign_eliminate]: 3.97e-06 [updatestate_loads_eliminate]: 4.61002e-06 [parameter_eliminate]: 1.77999e-06 [a_2]: 9.818e-05 [accelerated_algorithm]: 8.27998e-06 [shard]: 2.04e-06 [meta_shard_fg_expand]: 2.18998e-06 [shard_inline]: 7.82e-06 [merge_send_recv]: 9.85002e-06 [auto_parallel]: 7.65998e-06 [parallel]: 1.866e-05 [flash_sp]: 1.075e-05 [merge_comm]: 4.67e-06 [allreduce_fusion]: 4.22e-06 [matmul_add_comm_reduction]: 1.04e-05 [allreduce_slice_to_reducescatter]: 9.19972e-07 [virtual_shard_identity]: 1.042e-05 [virtual_dataset]: 7.56001e-06 [get_grad_eliminate_]: 7.5e-06 [virtual_output]: 7.04001e-06 [merge_forward]: 4.3e-06 [cell_reuse_recompute_pass]: 1.50001e-06 [offload_activation]: 1.214e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.609e-05 [merge_recompute_call_nodes]: 1.39998e-06 [before_grad]: 1.289e-05 [set_forward_comm_id_for_comm_node_pass]: 4.94e-06 [meta_fg_expand]: 3.33998e-06 [flash_sp_send_recv_attached]: 2.58e-06 [receive_attached]: 2.36e-06 [after_resolve]: 1.207e-05 [a_after_grad]: 1.143e-05 [renormalize]: 0.00066912 [add_forward_monad_depend]: 6.23998e-06 [auto_monad_grad]: 2.59999e-06 [auto_monad_eliminator]: 1.824e-05 [cse]: 3.807e-05 [a_3]: 5.682e-05 [Cycle 2]: 0.00074308, [45] [expand_dump_flag]: 9.89996e-07 [switch_simplify]: 8.58001e-06 [loop_unroll]: 7.39002e-06 [a_1]: 0.00015304 [with_stream_mark]: 1.68e-05 [recompute_prepare]: 8.18001e-06 [updatestate_depend_eliminate]: 4.22e-06 [updatestate_assign_eliminate]: 3.26001e-06 [updatestate_loads_eliminate]: 2.81e-06 [parameter_eliminate]: 1.44e-06 [a_2]: 8.737e-05 [accelerated_algorithm]: 7.29001e-06 [shard]: 1.28002e-06 [meta_shard_fg_expand]: 1.67001e-06 [shard_inline]: 7.15003e-06 [merge_send_recv]: 6.81001e-06 [auto_parallel]: 6.39999e-06 [parallel]: 5.51e-06 [flash_sp]: 3.31999e-06 [merge_comm]: 3.93001e-06 [allreduce_fusion]: 4.05998e-06 [matmul_add_comm_reduction]: 7.63999e-06 [allreduce_slice_to_reducescatter]: 7.39994e-07 [virtual_shard_identity]: 8.32998e-06 [virtual_dataset]: 6.78998e-06 [get_grad_eliminate_]: 6.74999e-06 [virtual_output]: 6.32001e-06 [merge_forward]: 3.5e-06 [cell_reuse_recompute_pass]: 1.73002e-06 [offload_activation]: 9.20999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.488e-05 [merge_recompute_call_nodes]: 1.02e-06 [before_grad]: 1.19e-05 [set_forward_comm_id_for_comm_node_pass]: 4.83001e-06 [meta_fg_expand]: 2.77002e-06 [flash_sp_send_recv_attached]: 1.00001e-06 [receive_attached]: 1.09998e-06 [after_resolve]: 1.023e-05 [a_after_grad]: 1.006e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.49e-06 [auto_monad_grad]: 1.86e-06 [auto_monad_eliminator]: 1.111e-05 [cse]: 2.056e-05 [a_3]: 4.3e-05 [py_interpret_to_execute_after_opt_a]: 1.117e-05 [slice_cell_reuse_recomputed_activation]: 2.19999e-06 [rewriter_after_opt_a]: 4.242e-05 [convert_after_rewriter]: 7.69002e-06 [order_py_execute_after_rewriter]: 6.58e-06 [mutable_eliminate]: 0.00053769 [opt_b]: 0.00028062, [1] [Cycle 1]: 0.0002731, [7] [b_1]: 0.0001629 [b_2]: 8.84998e-06 [updatestate_depend_eliminate]: 8.01001e-06 [updatestate_assign_eliminate]: 3.24001e-06 [updatestate_loads_eliminate]: 3.14999e-06 [renormalize]: 6.40022e-07 [cse]: 2.821e-05 [optimize_parallel_all_gather_comm]: 1.897e-05 [overlap_param_gather]: 2.07001e-06 [cconv]: 2.772e-05 [loop_unroll]: 0.00045096 [opt_after_cconv]: 0.00012047, [1] [Cycle 1]: 0.00011432, [7] [c_1]: 3.615e-05 [parameter_eliminate]: 3.41999e-06 [updatestate_depend_eliminate]: 6.26e-06 [updatestate_assign_eliminate]: 3.04001e-06 [updatestate_loads_eliminate]: 3.38e-06 [cse]: 2.568e-05 [renormalize]: 4.89992e-07 [remove_dup_value]: 1.733e-05 [tuple_transform]: 8.245e-05, [1] [Cycle 1]: 7.798e-05, [4] [d_1]: 4.83e-05 [none_parameter_eliminate]: 1.69e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 8.87e-06 [partial_unused_args_eliminate]: 1.72001e-06 [add_recomputation]: 5.6e-05 [cse_after_recomputation]: 2.78e-05, [1] [Cycle 1]: 2.233e-05, [1] [cse]: 1.61e-05 [environ_conv]: 6.93e-06 [swap_dp_allreduce_reducescatter]: 6.54999e-06 [bias_add_comm_swap]: 2.59001e-06 [label_micro_interleaved_index]: 5.00999e-06 [label_fine_grained_interleaved_index]: 2.81e-06 [merge_cast_opt]: 1.27e-06 [slice_recompute_activation]: 2.11003e-06 [micro_interleaved_order_control]: 2.14e-06 [assign_add_opt]: 1.52999e-06 [ForceFp32Comm]: 8.09989e-07 [remove_cast_before_assign_add]: 9.50007e-07 [full_micro_interleaved_order_control]: 2.01e-06 [reorder_send_recv_between_fp_bp]: 2.68e-06 [comm_op_add_attrs]: 1.10001e-06 [add_comm_op_reuse_tag]: 1.01997e-06 [interleave_split_concat_branches]: 1.19e-06 [interleave_parallel_branches]: 1.19e-06 [overlap_opt_shard_in_pipeline]: 1.25999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.01e-06 [control_data_broadcast_order]: 1.535e-05 [grouped_pairwise_exchange_alltoall]: 1.80001e-06 [offloading_packed_experts]: 4.84998e-06 [overlap_recompute_and_grad_model_parallel]: 5.32001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.47001e-06 [overlap_recompute_comm]: 2.39001e-06 [overlap_grad_ring_attention]: 5.16998e-06 [overlap_grad_flash_sp]: 2.343e-05 [begin_end_overlap_inline]: 5.10016e-07 [split_matmul_comm_elemetwise]: 2.44001e-06 [split_layernorm_comm]: 1.61998e-06 [handle_group_info]: 1.39e-06 [symbol_engine_optimizer]: 9.196e-05, [1] [Cycle 1]: 8.687e-05, [6] [build]: 3.06001e-06 [elim_shapecalc]: 1.388e-05 [elim_not_effective]: 1.604e-05 [opt_reshape]: 8.18999e-06 [fold_const_symbol]: 1.219e-05 [renormalize]: 6.59988e-07 [detach_backward]: 1.99e-06 [pipeline_parallel_scheduler]: 1.44e-06 [auto_monad_reorder]: 2.317e-05 [get_jit_bprop_graph]: 2.09e-06 [rewriter_after_jit_bprop_graph]: 4.00998e-06 [opt_after_jit_grad]: 0.00049792 [validate]: 4.333e-05 Sums bootstrap : 0.000485s : 4.80% type_inference : 0.005058s : 50.02% event_method : 0.000013s : 0.13% auto_monad : 0.000061s : 0.60% graph_reusing : 0.000005s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000028s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000019s : 0.19% optimize.rewriter_before_opt_a : 0.000051s : 0.51% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000036s : 0.35% optimize.opt_a.loop_unroll : 0.000023s : 0.22% optimize.opt_a.a_1 : 0.000510s : 5.04% optimize.opt_a.with_stream_mark : 0.000033s : 0.33% optimize.opt_a.recompute_prepare : 0.000019s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000186s : 1.83% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.15% optimize.opt_a.merge_send_recv : 0.000017s : 0.16% optimize.opt_a.auto_parallel : 0.000014s : 0.14% optimize.opt_a.parallel : 0.000024s : 0.24% optimize.opt_a.flash_sp : 0.000014s : 0.14% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.02% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.19% optimize.opt_a.virtual_dataset : 0.000014s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.14% optimize.opt_a.virtual_output : 0.000013s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.21% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.10% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000022s : 0.22% optimize.opt_a.a_after_grad : 0.000021s : 0.21% optimize.opt_a.renormalize : 0.000669s : 6.62% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.29% optimize.opt_a.cse : 0.000059s : 0.58% optimize.opt_a.a_3 : 0.000100s : 0.99% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000042s : 0.42% optimize.convert_after_rewriter : 0.000008s : 0.08% optimize.order_py_execute_after_rewriter : 0.000007s : 0.07% optimize.mutable_eliminate : 0.000538s : 5.32% optimize.opt_b.b_1 : 0.000163s : 1.61% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000028s : 0.28% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.19% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000028s : 0.27% optimize.loop_unroll : 0.000451s : 4.46% optimize.opt_after_cconv.c_1 : 0.000036s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000026s : 0.25% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.17% optimize.tuple_transform.d_1 : 0.000048s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.09% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000056s : 0.55% optimize.cse_after_recomputation.cse : 0.000016s : 0.16% optimize.environ_conv : 0.000007s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.02% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000023s : 0.23% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.14% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.16% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.01% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000023s : 0.23% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000498s : 4.92% validate : 0.000043s : 0.43% Time group info: ------[substitution.] 0.000161 29 12.98% : 0.000021s : 2: substitution.cast_eliminate 1.36% : 0.000002s : 3: substitution.elim_not_effective 1.00% : 0.000002s : 3: substitution.fold_const_symbol 3.78% : 0.000006s : 4: substitution.graph_param_transform 65.76% : 0.000106s : 2: substitution.inline 3.11% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.60% : 0.000006s : 6: substitution.remove_not_recompute_node 2.38% : 0.000004s : 2: substitution.replace_old_param 6.02% : 0.000010s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005006 2 90.58% : 0.004534s : 1: type_inference.infer 9.42% : 0.000472s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000104 2 100.00% : 0.000104s : 2: match.inline ------[predicate.] 0.000170 980 0.98% : 0.000002s : 9: predicate.accumulaten_eliminater 1.18% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.67% : 0.000001s : 8: predicate.addn_check_dump 0.87% : 0.000001s : 9: predicate.addn_zero_filter 0.73% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.30% : 0.000004s : 17: predicate.arithmetic_simplify 0.97% : 0.000002s : 9: predicate.cast_eliminate 0.81% : 0.000001s : 8: predicate.check_bprop_eliminate 0.67% : 0.000001s : 8: predicate.compare_switch_simplify 0.24% : 0.000000s : 4: predicate.const_output_eliminate 0.77% : 0.000001s : 8: predicate.depend_value_elim 0.81% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.97% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.78% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.13% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.40% : 0.000001s : 4: predicate.elim_not_effective 0.56% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.05% : 0.000002s : 13: predicate.environ_get_depend_swap 1.90% : 0.000003s : 21: predicate.environ_get_eliminate 1.04% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.92% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.76% : 0.000003s : 11: predicate.float_depend_g_call 0.66% : 0.000001s : 8: predicate.float_environ_get_switch 1.00% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.85% : 0.000001s : 8: predicate.get_grad_eliminate 0.29% : 0.000000s : 4: predicate.graph_param_transform 0.82% : 0.000001s : 8: predicate.incorporate_call 0.67% : 0.000001s : 8: predicate.incorporate_call_switch 6.34% : 0.000011s : 44: predicate.inline 1.04% : 0.000002s : 8: predicate.inline_without_move 0.36% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.17% : 0.000002s : 8: predicate.less_batch_normalization 1.63% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.20% : 0.000004s : 26: predicate.load_eliminater 1.47% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.61% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.73% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.72% : 0.000001s : 8: predicate.merge_addn 0.70% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.70% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 9: predicate.minmaximum_grad 1.27% : 0.000002s : 4: predicate.mutable_eliminate 0.44% : 0.000001s : 4: predicate.opt_reshape 0.42% : 0.000001s : 4: predicate.parallel_virtual_node 1.15% : 0.000002s : 11: predicate.partial_defer_inline 1.30% : 0.000002s : 13: predicate.partial_eliminate 0.77% : 0.000001s : 9: predicate.print_const_string_wrapper 0.74% : 0.000001s : 8: predicate.reduce_all_const_elim 1.07% : 0.000002s : 9: predicate.reduce_eliminate 2.21% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.59% : 0.000001s : 8: predicate.remove_not_recompute_node 1.24% : 0.000002s : 17: predicate.replace_applicator 0.71% : 0.000001s : 8: predicate.replace_old_param 0.32% : 0.000001s : 4: predicate.reset_defer_inline 0.97% : 0.000002s : 9: predicate.reshape_eliminate 0.76% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 4: predicate.row_tensor_eliminate 0.96% : 0.000002s : 8: predicate.same_eliminate 0.52% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.14% : 0.000002s : 8: predicate.shard_identity_eliminate 0.90% : 0.000002s : 8: predicate.special_op_eliminate 0.96% : 0.000002s : 8: predicate.specialize_transform 1.10% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 1.07% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.95% : 0.000002s : 11: predicate.switch_defer_inline 1.79% : 0.000003s : 19: predicate.switch_layer_defer_inline 3.99% : 0.000007s : 39: predicate.switch_simplify 0.81% : 0.000001s : 9: predicate.tile_eliminate 0.83% : 0.000001s : 9: predicate.transpose_eliminate 1.55% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.57% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.52% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.54% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.05% : 0.000003s : 26: predicate.updatestate_pure_node_eliminater 2.94% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.73% : 0.000001s : 4: predicate.value_based_eliminate 0.78% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.80% : 0.000001s : 8: predicate.virtual_output_eliminate 0.34% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.50% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000237 5 7.90% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.10% : 0.000218s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024151 192 0.01% : 0.000004s : 1: ForceFp32Comm 13.35% : 0.003224s : 1: add_attr 13.31% : 0.003213s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.25% : 0.000061s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000066s : 1: auto_monad 0.11% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 2.14% : 0.000516s : 1: bootstrap 0.13% : 0.000031s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000011s : 1: convert_after_rewriter 0.13% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000006s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.09% : 0.000021s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000008s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.90% : 0.000459s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.26% : 0.000547s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.08% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000020s : 1: opt.transform.mutable_eliminate 4.01% : 0.000967s : 78: opt.transform.opt_a 0.14% : 0.000034s : 1: opt.transform.opt_after_cconv 0.12% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.57% : 0.000138s : 28: opt.transform.opt_b 0.23% : 0.000055s : 2: opt.transform.opt_trans_graph 0.19% : 0.000046s : 4: opt.transform.symbol_engine_opt 10.55% : 0.002548s : 1: opt_a 0.52% : 0.000124s : 1: opt_after_cconv 2.10% : 0.000508s : 1: opt_after_jit_grad 1.18% : 0.000284s : 1: opt_b 19.64% : 0.004743s : 1: optimize 0.09% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000010s : 1: order_py_execute_after_rewriter 0.11% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000033s : 1: pre_auto_parallel 0.10% : 0.000023s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000021s : 1: remove_dup_value 1.62% : 0.000391s : 1: renormalize.infer 1.12% : 0.000271s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000047s : 1: rewriter_after_opt_a 0.23% : 0.000056s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000095s : 1: symbol_engine_optimizer 0.35% : 0.000085s : 1: tuple_transform 21.01% : 0.005075s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:55.628.141 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:55.628.415 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0153476, [21] [bootstrap]: 0.00044694 [type_inference]: 0.00503799 [event_method]: 1.318e-05 [auto_monad]: 5.961e-05 [graph_reusing]: 5.20001e-06 [inline]: 2.21e-06 [add_attr]: 0.00324476, [1] [add_attr_with_inline]: 0.00323538, [1] [Cycle 1]: 7.35e-05, [2] [tag_attr]: 1.54e-05 [meta_addattr_fg_expand]: 3.78999e-06 [parallel-infer-symbol]: 3.01001e-06 [pre_auto_parallel]: 2.687e-05 [insert-virtual-dataset]: 2.48e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.11e-06 [pipeline_split]: 1.79998e-06 [optimize]: 0.00535917, [53] [py_interpret_to_execute]: 2.345e-05 [rewriter_before_opt_a]: 5.443e-05 [opt_a]: 0.00301181, [2] [Cycle 1]: 0.0020638, [45] [expand_dump_flag]: 3.81001e-06 [switch_simplify]: 2.715e-05 [loop_unroll]: 1.53e-05 [a_1]: 0.00036505 [with_stream_mark]: 1.858e-05 [recompute_prepare]: 1.22e-05 [updatestate_depend_eliminate]: 4.57e-06 [updatestate_assign_eliminate]: 3.77998e-06 [updatestate_loads_eliminate]: 4.3e-06 [parameter_eliminate]: 2.16e-06 [a_2]: 0.00012591 [accelerated_algorithm]: 8.72e-06 [shard]: 2.27001e-06 [meta_shard_fg_expand]: 2.01e-06 [shard_inline]: 8.17e-06 [merge_send_recv]: 1.01e-05 [auto_parallel]: 7.8e-06 [parallel]: 1.875e-05 [flash_sp]: 9.47001e-06 [merge_comm]: 5.27999e-06 [allreduce_fusion]: 4.62998e-06 [matmul_add_comm_reduction]: 1.053e-05 [allreduce_slice_to_reducescatter]: 5.69999e-07 [virtual_shard_identity]: 1.158e-05 [virtual_dataset]: 8.17003e-06 [get_grad_eliminate_]: 7.73001e-06 [virtual_output]: 7.33e-06 [merge_forward]: 4.80999e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 1.146e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.032e-05 [merge_recompute_call_nodes]: 1.42999e-06 [before_grad]: 1.304e-05 [set_forward_comm_id_for_comm_node_pass]: 5.79999e-06 [meta_fg_expand]: 3.39001e-06 [flash_sp_send_recv_attached]: 2.77002e-06 [receive_attached]: 2.63e-06 [after_resolve]: 1.223e-05 [a_after_grad]: 1.187e-05 [renormalize]: 0.00071809 [add_forward_monad_depend]: 5.82001e-06 [auto_monad_grad]: 2.19001e-06 [auto_monad_eliminator]: 1.833e-05 [cse]: 3.684e-05 [a_3]: 7.092e-05 [Cycle 2]: 0.00093519, [45] [expand_dump_flag]: 1.70001e-06 [switch_simplify]: 8.66002e-06 [loop_unroll]: 7.17002e-06 [a_1]: 0.00017937 [with_stream_mark]: 1.759e-05 [recompute_prepare]: 8.18999e-06 [updatestate_depend_eliminate]: 4.56002e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 3.11999e-06 [parameter_eliminate]: 1.27999e-06 [a_2]: 0.00011548 [accelerated_algorithm]: 7.13998e-06 [shard]: 1.69e-06 [meta_shard_fg_expand]: 1.72001e-06 [shard_inline]: 7.33e-06 [merge_send_recv]: 6.51999e-06 [auto_parallel]: 7.05e-06 [parallel]: 5.87999e-06 [flash_sp]: 3.80998e-06 [merge_comm]: 4.33001e-06 [allreduce_fusion]: 4.36002e-06 [matmul_add_comm_reduction]: 7.03e-06 [allreduce_slice_to_reducescatter]: 5.00004e-07 [virtual_shard_identity]: 7.77e-06 [virtual_dataset]: 6.93998e-06 [get_grad_eliminate_]: 6.55002e-06 [virtual_output]: 6.53e-06 [merge_forward]: 3.86001e-06 [cell_reuse_recompute_pass]: 1.51002e-06 [offload_activation]: 8.28001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.596e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 1.149e-05 [set_forward_comm_id_for_comm_node_pass]: 4.87e-06 [meta_fg_expand]: 3.01001e-06 [flash_sp_send_recv_attached]: 1.03001e-06 [receive_attached]: 1.40001e-06 [after_resolve]: 9.74e-06 [a_after_grad]: 1.039e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.81e-06 [auto_monad_grad]: 1.07e-06 [auto_monad_eliminator]: 9.26998e-06 [cse]: 1.954e-05 [a_3]: 5.702e-05 [py_interpret_to_execute_after_opt_a]: 1.467e-05 [slice_cell_reuse_recomputed_activation]: 4.42e-06 [rewriter_after_opt_a]: 4.679e-05 [convert_after_rewriter]: 1.035e-05 [order_py_execute_after_rewriter]: 8.81002e-06 [mutable_eliminate]: 0.00051432 [opt_b]: 0.00033688, [1] [Cycle 1]: 0.0003279, [7] [b_1]: 0.00022242 [b_2]: 9.13002e-06 [updatestate_depend_eliminate]: 6.96999e-06 [updatestate_assign_eliminate]: 3.15998e-06 [updatestate_loads_eliminate]: 2.91999e-06 [renormalize]: 6.19999e-07 [cse]: 2.345e-05 [optimize_parallel_all_gather_comm]: 2.083e-05 [overlap_param_gather]: 4.62e-06 [cconv]: 2.878e-05 [loop_unroll]: 0.00044001 [opt_after_cconv]: 0.00013449, [1] [Cycle 1]: 0.00012606, [7] [c_1]: 3.336e-05 [parameter_eliminate]: 2.41998e-06 [updatestate_depend_eliminate]: 5.74e-06 [updatestate_assign_eliminate]: 3.01001e-06 [updatestate_loads_eliminate]: 3.38e-06 [cse]: 2.238e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.973e-05 [tuple_transform]: 9.443e-05, [1] [Cycle 1]: 8.74e-05, [4] [d_1]: 4.722e-05 [none_parameter_eliminate]: 1.55999e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.38001e-06 [partial_unused_args_eliminate]: 4.60001e-06 [add_recomputation]: 5.704e-05 [cse_after_recomputation]: 3.084e-05, [1] [Cycle 1]: 2.409e-05, [1] [cse]: 1.493e-05 [environ_conv]: 9.19e-06 [swap_dp_allreduce_reducescatter]: 8.70001e-06 [bias_add_comm_swap]: 5.06002e-06 [label_micro_interleaved_index]: 6.58e-06 [label_fine_grained_interleaved_index]: 5.20999e-06 [merge_cast_opt]: 3.66999e-06 [slice_recompute_activation]: 4.54998e-06 [micro_interleaved_order_control]: 4.57998e-06 [assign_add_opt]: 3.7e-06 [ForceFp32Comm]: 3.51001e-06 [remove_cast_before_assign_add]: 3.74002e-06 [full_micro_interleaved_order_control]: 4.75999e-06 [reorder_send_recv_between_fp_bp]: 5.22999e-06 [comm_op_add_attrs]: 3.79002e-06 [add_comm_op_reuse_tag]: 3.33998e-06 [interleave_split_concat_branches]: 3.47002e-06 [interleave_parallel_branches]: 3.94002e-06 [overlap_opt_shard_in_pipeline]: 3.45e-06 [overlap_opt_shard_grad_in_pipeline]: 4.02e-06 [control_data_broadcast_order]: 1.716e-05 [grouped_pairwise_exchange_alltoall]: 4.13001e-06 [offloading_packed_experts]: 8.00999e-06 [overlap_recompute_and_grad_model_parallel]: 8.00999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.58e-06 [overlap_recompute_allgather_and_fa_grad]: 3.66999e-06 [overlap_recompute_comm]: 4.50999e-06 [overlap_grad_ring_attention]: 6.80998e-06 [overlap_grad_flash_sp]: 2.21e-05 [begin_end_overlap_inline]: 3.06001e-06 [split_matmul_comm_elemetwise]: 4.63999e-06 [split_layernorm_comm]: 4.27003e-06 [handle_group_info]: 3.36999e-06 [symbol_engine_optimizer]: 0.00010298, [1] [Cycle 1]: 9.607e-05, [6] [build]: 2.98998e-06 [elim_shapecalc]: 1.125e-05 [elim_not_effective]: 1.542e-05 [opt_reshape]: 8.17e-06 [fold_const_symbol]: 1.235e-05 [renormalize]: 3.00002e-07 [detach_backward]: 3.53999e-06 [pipeline_parallel_scheduler]: 1.92999e-06 [auto_monad_reorder]: 2.134e-05 [get_jit_bprop_graph]: 1.59e-06 [rewriter_after_jit_bprop_graph]: 4.67e-06 [opt_after_jit_grad]: 0.00048013 [validate]: 4.198e-05 Sums bootstrap : 0.000447s : 4.33% type_inference : 0.005038s : 48.81% event_method : 0.000013s : 0.13% auto_monad : 0.000060s : 0.58% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000027s : 0.26% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000023s : 0.23% optimize.rewriter_before_opt_a : 0.000054s : 0.53% optimize.opt_a.expand_dump_flag : 0.000006s : 0.05% optimize.opt_a.switch_simplify : 0.000036s : 0.35% optimize.opt_a.loop_unroll : 0.000022s : 0.22% optimize.opt_a.a_1 : 0.000544s : 5.27% optimize.opt_a.with_stream_mark : 0.000036s : 0.35% optimize.opt_a.recompute_prepare : 0.000020s : 0.20% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000241s : 2.34% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.15% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.15% optimize.opt_a.merge_send_recv : 0.000017s : 0.16% optimize.opt_a.auto_parallel : 0.000015s : 0.14% optimize.opt_a.parallel : 0.000025s : 0.24% optimize.opt_a.flash_sp : 0.000013s : 0.13% optimize.opt_a.merge_comm : 0.000010s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.19% optimize.opt_a.virtual_dataset : 0.000015s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.14% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.35% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.10% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000022s : 0.21% optimize.opt_a.a_after_grad : 0.000022s : 0.22% optimize.opt_a.renormalize : 0.000718s : 6.96% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.27% optimize.opt_a.cse : 0.000056s : 0.55% optimize.opt_a.a_3 : 0.000128s : 1.24% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.04% optimize.rewriter_after_opt_a : 0.000047s : 0.45% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000514s : 4.98% optimize.opt_b.b_1 : 0.000222s : 2.15% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000029s : 0.28% optimize.loop_unroll : 0.000440s : 4.26% optimize.opt_after_cconv.c_1 : 0.000033s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.22% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.19% optimize.tuple_transform.d_1 : 0.000047s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000057s : 0.55% optimize.cse_after_recomputation.cse : 0.000015s : 0.14% optimize.environ_conv : 0.000009s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000008s : 0.08% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000022s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000021s : 0.21% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000480s : 4.65% validate : 0.000042s : 0.41% Time group info: ------[substitution.] 0.000176 29 13.33% : 0.000024s : 2: substitution.cast_eliminate 1.27% : 0.000002s : 3: substitution.elim_not_effective 1.03% : 0.000002s : 3: substitution.fold_const_symbol 3.61% : 0.000006s : 4: substitution.graph_param_transform 57.71% : 0.000102s : 2: substitution.inline 2.46% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.25% : 0.000006s : 6: substitution.remove_not_recompute_node 2.00% : 0.000004s : 2: substitution.replace_old_param 15.35% : 0.000027s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004987 2 90.79% : 0.004528s : 1: type_inference.infer 9.21% : 0.000459s : 1: type_inference.specialize ------[replace.] 0.000024 2 100.00% : 0.000024s : 2: replace.inline ------[match.] 0.000100 2 100.00% : 0.000100s : 2: match.inline ------[predicate.] 0.000195 980 0.74% : 0.000001s : 9: predicate.accumulaten_eliminater 0.78% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.59% : 0.000001s : 8: predicate.addn_check_dump 0.85% : 0.000002s : 9: predicate.addn_zero_filter 0.60% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.08% : 0.000004s : 17: predicate.arithmetic_simplify 0.91% : 0.000002s : 9: predicate.cast_eliminate 0.66% : 0.000001s : 8: predicate.check_bprop_eliminate 0.67% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.68% : 0.000001s : 8: predicate.depend_value_elim 0.69% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.81% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.68% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.15% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 4: predicate.elim_not_effective 0.39% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 12.67% : 0.000025s : 13: predicate.environ_add_const_eliminate 0.94% : 0.000002s : 13: predicate.environ_get_add_eliminate 0.96% : 0.000002s : 13: predicate.environ_get_depend_swap 1.55% : 0.000003s : 21: predicate.environ_get_eliminate 0.93% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.83% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.57% : 0.000003s : 11: predicate.float_depend_g_call 0.57% : 0.000001s : 8: predicate.float_environ_get_switch 0.89% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 4: predicate.fold_const_symbol 0.68% : 0.000001s : 8: predicate.get_grad_eliminate 0.23% : 0.000000s : 4: predicate.graph_param_transform 0.69% : 0.000001s : 8: predicate.incorporate_call 0.58% : 0.000001s : 8: predicate.incorporate_call_switch 5.95% : 0.000012s : 44: predicate.inline 1.16% : 0.000002s : 8: predicate.inline_without_move 0.33% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.95% : 0.000002s : 8: predicate.less_batch_normalization 1.57% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.07% : 0.000004s : 26: predicate.load_eliminater 0.89% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.29% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.56% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.70% : 0.000001s : 8: predicate.merge_addn 0.85% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.64% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.60% : 0.000001s : 9: predicate.minmaximum_grad 0.93% : 0.000002s : 4: predicate.mutable_eliminate 0.39% : 0.000001s : 4: predicate.opt_reshape 0.54% : 0.000001s : 4: predicate.parallel_virtual_node 1.16% : 0.000002s : 11: predicate.partial_defer_inline 1.13% : 0.000002s : 13: predicate.partial_eliminate 0.80% : 0.000002s : 9: predicate.print_const_string_wrapper 0.60% : 0.000001s : 8: predicate.reduce_all_const_elim 0.82% : 0.000002s : 9: predicate.reduce_eliminate 1.84% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.52% : 0.000001s : 8: predicate.remove_not_recompute_node 0.99% : 0.000002s : 17: predicate.replace_applicator 0.55% : 0.000001s : 8: predicate.replace_old_param 0.24% : 0.000000s : 4: predicate.reset_defer_inline 0.84% : 0.000002s : 9: predicate.reshape_eliminate 0.69% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 4: predicate.row_tensor_eliminate 0.82% : 0.000002s : 8: predicate.same_eliminate 0.56% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.97% : 0.000002s : 8: predicate.shard_identity_eliminate 0.97% : 0.000002s : 8: predicate.special_op_eliminate 1.00% : 0.000002s : 8: predicate.specialize_transform 0.94% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.86% : 0.000002s : 11: predicate.switch_defer_inline 1.48% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.00% : 0.000008s : 39: predicate.switch_simplify 0.76% : 0.000001s : 9: predicate.tile_eliminate 0.70% : 0.000001s : 9: predicate.transpose_eliminate 1.38% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.22% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 2.77% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.30% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.10% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.37% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 1.82% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.61% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.61% : 0.000001s : 4: predicate.value_based_eliminate 0.68% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.70% : 0.000001s : 8: predicate.virtual_output_eliminate 0.31% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000234 5 7.52% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.48% : 0.000216s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025893 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.57% : 0.003254s : 1: add_attr 12.51% : 0.003239s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000061s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000068s : 1: auto_monad 0.11% : 0.000028s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.90% : 0.000491s : 1: bootstrap 0.12% : 0.000032s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000034s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000019s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.09% : 0.000023s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000011s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000007s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000009s : 1: label_micro_interleaved_index 1.72% : 0.000446s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.01% : 0.000521s : 1: mutable_eliminate 0.04% : 0.000011s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 3.91% : 0.001014s : 78: opt.transform.opt_a 0.12% : 0.000032s : 1: opt.transform.opt_after_cconv 0.10% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.61% : 0.000157s : 28: opt.transform.opt_b 0.21% : 0.000053s : 2: opt.transform.opt_trans_graph 0.17% : 0.000043s : 4: opt.transform.symbol_engine_opt 11.64% : 0.003015s : 1: opt_a 0.53% : 0.000138s : 1: opt_after_cconv 1.95% : 0.000505s : 1: opt_after_jit_grad 1.32% : 0.000341s : 1: opt_b 22.01% : 0.005700s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.03% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000034s : 1: pre_auto_parallel 0.10% : 0.000027s : 1: py_interpret_to_execute 0.07% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000023s : 1: remove_dup_value 1.71% : 0.000442s : 1: renormalize.infer 1.03% : 0.000267s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000051s : 1: rewriter_after_opt_a 0.22% : 0.000058s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.41% : 0.000106s : 1: symbol_engine_optimizer 0.38% : 0.000097s : 1: tuple_transform 19.56% : 0.005065s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:55.822.830 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0134724, [21] [bootstrap]: 0.00042864 [type_inference]: 0.00474826 [event_method]: 1.175e-05 [auto_monad]: 5.568e-05 [graph_reusing]: 5.20999e-06 [inline]: 2.34999e-06 [add_attr]: 0.00305798, [1] [add_attr_with_inline]: 0.00304951, [1] [Cycle 1]: 5.231e-05, [2] [tag_attr]: 1.501e-05 [meta_addattr_fg_expand]: 3.78999e-06 [parallel-infer-symbol]: 3.56999e-06 [pre_auto_parallel]: 2.498e-05 [insert-virtual-dataset]: 2.36998e-06 [parallel-infer-symbol-second]: 6.49976e-07 [dataset_repeat_opt]: 2.36998e-06 [pipeline_split]: 1.85001e-06 [optimize]: 0.00445181, [53] [py_interpret_to_execute]: 1.734e-05 [rewriter_before_opt_a]: 4.708e-05 [opt_a]: 0.00240627, [2] [Cycle 1]: 0.00166608, [45] [expand_dump_flag]: 2.94001e-06 [switch_simplify]: 2.719e-05 [loop_unroll]: 1.5e-05 [a_1]: 0.00034106 [with_stream_mark]: 1.544e-05 [recompute_prepare]: 9.56e-06 [updatestate_depend_eliminate]: 4.28001e-06 [updatestate_assign_eliminate]: 4.1e-06 [updatestate_loads_eliminate]: 3.58e-06 [parameter_eliminate]: 1.71998e-06 [a_2]: 9.718e-05 [accelerated_algorithm]: 7.74997e-06 [shard]: 2.07999e-06 [meta_shard_fg_expand]: 1.85001e-06 [shard_inline]: 8.12e-06 [merge_send_recv]: 8.89e-06 [auto_parallel]: 7.26001e-06 [parallel]: 1.838e-05 [flash_sp]: 8.69e-06 [merge_comm]: 4.88001e-06 [allreduce_fusion]: 4.12e-06 [matmul_add_comm_reduction]: 1.025e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 9.56e-06 [virtual_dataset]: 7.59002e-06 [get_grad_eliminate_]: 6.99001e-06 [virtual_output]: 7.08998e-06 [merge_forward]: 4.32e-06 [cell_reuse_recompute_pass]: 1.25001e-06 [offload_activation]: 1.09e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.444e-05 [merge_recompute_call_nodes]: 1.69e-06 [before_grad]: 1.222e-05 [set_forward_comm_id_for_comm_node_pass]: 4.48999e-06 [meta_fg_expand]: 3.13998e-06 [flash_sp_send_recv_attached]: 2.41e-06 [receive_attached]: 2.16998e-06 [after_resolve]: 1.095e-05 [a_after_grad]: 1.046e-05 [renormalize]: 0.00059822 [add_forward_monad_depend]: 5.16002e-06 [auto_monad_grad]: 2.09e-06 [auto_monad_eliminator]: 1.653e-05 [cse]: 3.431e-05 [a_3]: 5.554e-05 [Cycle 2]: 0.00073051, [45] [expand_dump_flag]: 1.04e-06 [switch_simplify]: 8.67e-06 [loop_unroll]: 7.31999e-06 [a_1]: 0.00016138 [with_stream_mark]: 1.243e-05 [recompute_prepare]: 7.38e-06 [updatestate_depend_eliminate]: 4.37e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 2.78e-06 [parameter_eliminate]: 1.04998e-06 [a_2]: 8.667e-05 [accelerated_algorithm]: 7.35e-06 [shard]: 1.40999e-06 [meta_shard_fg_expand]: 1.49e-06 [shard_inline]: 7.61001e-06 [merge_send_recv]: 6.09999e-06 [auto_parallel]: 6.05002e-06 [parallel]: 5.12e-06 [flash_sp]: 3.81999e-06 [merge_comm]: 4.02e-06 [allreduce_fusion]: 4.22e-06 [matmul_add_comm_reduction]: 7.03e-06 [allreduce_slice_to_reducescatter]: 3.19997e-07 [virtual_shard_identity]: 7.8e-06 [virtual_dataset]: 6.74999e-06 [get_grad_eliminate_]: 6.78e-06 [virtual_output]: 6.50002e-06 [merge_forward]: 3.3e-06 [cell_reuse_recompute_pass]: 1.59998e-06 [offload_activation]: 7.55e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.412e-05 [merge_recompute_call_nodes]: 9.00007e-07 [before_grad]: 1.104e-05 [set_forward_comm_id_for_comm_node_pass]: 4.33999e-06 [meta_fg_expand]: 2.51998e-06 [flash_sp_send_recv_attached]: 9.10019e-07 [receive_attached]: 1.07e-06 [after_resolve]: 9.68002e-06 [a_after_grad]: 1.02e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.29e-06 [auto_monad_grad]: 9.39996e-07 [auto_monad_eliminator]: 8.23001e-06 [cse]: 1.929e-05 [a_3]: 4.347e-05 [py_interpret_to_execute_after_opt_a]: 9.97999e-06 [slice_cell_reuse_recomputed_activation]: 2.01003e-06 [rewriter_after_opt_a]: 4.107e-05 [convert_after_rewriter]: 7.05e-06 [order_py_execute_after_rewriter]: 5.82001e-06 [mutable_eliminate]: 0.00048802 [opt_b]: 0.00026713, [1] [Cycle 1]: 0.0002611, [7] [b_1]: 0.00017794 [b_2]: 9.15001e-06 [updatestate_depend_eliminate]: 6.71e-06 [updatestate_assign_eliminate]: 3.11999e-06 [updatestate_loads_eliminate]: 3.08e-06 [renormalize]: 4.39992e-07 [cse]: 2.429e-05 [optimize_parallel_all_gather_comm]: 1.712e-05 [overlap_param_gather]: 1.92001e-06 [cconv]: 2.448e-05 [loop_unroll]: 0.00042382 [opt_after_cconv]: 0.00011164, [1] [Cycle 1]: 0.00010638, [7] [c_1]: 3.34e-05 [parameter_eliminate]: 2.58e-06 [updatestate_depend_eliminate]: 5.92001e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 3.01999e-06 [cse]: 2.365e-05 [renormalize]: 3.9002e-07 [remove_dup_value]: 1.696e-05 [tuple_transform]: 8.005e-05, [1] [Cycle 1]: 7.5e-05, [4] [d_1]: 4.641e-05 [none_parameter_eliminate]: 1.60999e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 7.93001e-06 [partial_unused_args_eliminate]: 2.01e-06 [add_recomputation]: 5.479e-05 [cse_after_recomputation]: 2.541e-05, [1] [Cycle 1]: 2.044e-05, [1] [cse]: 1.494e-05 [environ_conv]: 6.31e-06 [swap_dp_allreduce_reducescatter]: 6.34001e-06 [bias_add_comm_swap]: 2.20002e-06 [label_micro_interleaved_index]: 4.38999e-06 [label_fine_grained_interleaved_index]: 2.62001e-06 [merge_cast_opt]: 1.34e-06 [slice_recompute_activation]: 1.92001e-06 [micro_interleaved_order_control]: 2.29001e-06 [assign_add_opt]: 1.17e-06 [ForceFp32Comm]: 1.18001e-06 [remove_cast_before_assign_add]: 1.14e-06 [full_micro_interleaved_order_control]: 2.14999e-06 [reorder_send_recv_between_fp_bp]: 2.71e-06 [comm_op_add_attrs]: 9.70002e-07 [add_comm_op_reuse_tag]: 9.5999e-07 [interleave_split_concat_branches]: 1.16002e-06 [interleave_parallel_branches]: 1.37e-06 [overlap_opt_shard_in_pipeline]: 1.12999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.88002e-06 [control_data_broadcast_order]: 1.424e-05 [grouped_pairwise_exchange_alltoall]: 1.71e-06 [offloading_packed_experts]: 4.45e-06 [overlap_recompute_and_grad_model_parallel]: 5.07999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.12999e-06 [overlap_grad_ring_attention]: 4.61002e-06 [overlap_grad_flash_sp]: 2.079e-05 [begin_end_overlap_inline]: 9.5999e-07 [split_matmul_comm_elemetwise]: 2.32999e-06 [split_layernorm_comm]: 1.72001e-06 [handle_group_info]: 1.15001e-06 [symbol_engine_optimizer]: 8.381e-05, [1] [Cycle 1]: 7.971e-05, [6] [build]: 3.09001e-06 [elim_shapecalc]: 1.177e-05 [elim_not_effective]: 1.506e-05 [opt_reshape]: 8.26002e-06 [fold_const_symbol]: 1.222e-05 [renormalize]: 3.00002e-07 [detach_backward]: 1.80001e-06 [pipeline_parallel_scheduler]: 1.47001e-06 [auto_monad_reorder]: 1.882e-05 [get_jit_bprop_graph]: 1.50001e-06 [rewriter_after_jit_bprop_graph]: 4.15e-06 [opt_after_jit_grad]: 0.00046707 [validate]: 3.977e-05 Sums bootstrap : 0.000429s : 4.53% type_inference : 0.004748s : 50.14% event_method : 0.000012s : 0.12% auto_monad : 0.000056s : 0.59% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000025s : 0.26% insert-virtual-dataset : 0.000002s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.03% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000017s : 0.18% optimize.rewriter_before_opt_a : 0.000047s : 0.50% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000036s : 0.38% optimize.opt_a.loop_unroll : 0.000022s : 0.24% optimize.opt_a.a_1 : 0.000502s : 5.31% optimize.opt_a.with_stream_mark : 0.000028s : 0.29% optimize.opt_a.recompute_prepare : 0.000017s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000184s : 1.94% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.16% optimize.opt_a.shard : 0.000003s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.17% optimize.opt_a.merge_send_recv : 0.000015s : 0.16% optimize.opt_a.auto_parallel : 0.000013s : 0.14% optimize.opt_a.parallel : 0.000024s : 0.25% optimize.opt_a.flash_sp : 0.000013s : 0.13% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.18% optimize.opt_a.virtual_dataset : 0.000014s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.15% optimize.opt_a.virtual_output : 0.000014s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000023s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000021s : 0.22% optimize.opt_a.a_after_grad : 0.000021s : 0.22% optimize.opt_a.renormalize : 0.000598s : 6.32% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.26% optimize.opt_a.cse : 0.000054s : 0.57% optimize.opt_a.a_3 : 0.000099s : 1.05% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000041s : 0.43% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000488s : 5.15% optimize.opt_b.b_1 : 0.000178s : 1.88% optimize.opt_b.b_2 : 0.000009s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.26% optimize.loop_unroll : 0.000424s : 4.48% optimize.opt_after_cconv.c_1 : 0.000033s : 0.35% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.25% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.18% optimize.tuple_transform.d_1 : 0.000046s : 0.49% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000055s : 0.58% optimize.cse_after_recomputation.cse : 0.000015s : 0.16% optimize.environ_conv : 0.000006s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.07% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000021s : 0.22% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.16% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.09% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.02% auto_monad_reorder : 0.000019s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000467s : 4.93% validate : 0.000040s : 0.42% Time group info: ------[substitution.] 0.000163 29 11.93% : 0.000019s : 2: substitution.cast_eliminate 1.39% : 0.000002s : 3: substitution.elim_not_effective 1.11% : 0.000002s : 3: substitution.fold_const_symbol 3.70% : 0.000006s : 4: substitution.graph_param_transform 58.11% : 0.000095s : 2: substitution.inline 2.45% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.33% : 0.000005s : 6: substitution.remove_not_recompute_node 1.99% : 0.000003s : 2: substitution.replace_old_param 16.00% : 0.000026s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004703 2 90.67% : 0.004264s : 1: type_inference.infer 9.33% : 0.000439s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000093 2 100.00% : 0.000093s : 2: match.inline ------[predicate.] 0.000167 980 0.84% : 0.000001s : 9: predicate.accumulaten_eliminater 0.96% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.66% : 0.000001s : 8: predicate.addn_check_dump 0.84% : 0.000001s : 9: predicate.addn_zero_filter 0.68% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.53% : 0.000004s : 17: predicate.arithmetic_simplify 1.01% : 0.000002s : 9: predicate.cast_eliminate 0.74% : 0.000001s : 8: predicate.check_bprop_eliminate 0.66% : 0.000001s : 8: predicate.compare_switch_simplify 0.26% : 0.000000s : 4: predicate.const_output_eliminate 0.79% : 0.000001s : 8: predicate.depend_value_elim 0.82% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.91% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.82% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.19% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.32% : 0.000001s : 4: predicate.elim_not_effective 0.53% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.05% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.04% : 0.000002s : 13: predicate.environ_get_depend_swap 2.05% : 0.000003s : 21: predicate.environ_get_eliminate 1.03% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.91% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.61% : 0.000003s : 11: predicate.float_depend_g_call 0.69% : 0.000001s : 8: predicate.float_environ_get_switch 1.00% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.25% : 0.000000s : 4: predicate.fold_const_symbol 0.83% : 0.000001s : 8: predicate.get_grad_eliminate 0.26% : 0.000000s : 4: predicate.graph_param_transform 0.84% : 0.000001s : 8: predicate.incorporate_call 0.64% : 0.000001s : 8: predicate.incorporate_call_switch 6.59% : 0.000011s : 44: predicate.inline 0.99% : 0.000002s : 8: predicate.inline_without_move 0.41% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.03% : 0.000002s : 8: predicate.less_batch_normalization 1.73% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.21% : 0.000004s : 26: predicate.load_eliminater 1.27% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.56% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.73% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.78% : 0.000001s : 8: predicate.merge_addn 0.73% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.72% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.72% : 0.000001s : 9: predicate.minmaximum_grad 1.31% : 0.000002s : 4: predicate.mutable_eliminate 0.44% : 0.000001s : 4: predicate.opt_reshape 0.65% : 0.000001s : 4: predicate.parallel_virtual_node 1.34% : 0.000002s : 11: predicate.partial_defer_inline 1.28% : 0.000002s : 13: predicate.partial_eliminate 0.78% : 0.000001s : 9: predicate.print_const_string_wrapper 0.74% : 0.000001s : 8: predicate.reduce_all_const_elim 1.24% : 0.000002s : 9: predicate.reduce_eliminate 2.16% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.60% : 0.000001s : 8: predicate.remove_not_recompute_node 1.21% : 0.000002s : 17: predicate.replace_applicator 0.63% : 0.000001s : 8: predicate.replace_old_param 0.33% : 0.000001s : 4: predicate.reset_defer_inline 0.87% : 0.000001s : 9: predicate.reshape_eliminate 0.78% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 4: predicate.row_tensor_eliminate 0.88% : 0.000001s : 8: predicate.same_eliminate 0.54% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.92% : 0.000002s : 8: predicate.shard_identity_eliminate 0.89% : 0.000001s : 8: predicate.special_op_eliminate 1.01% : 0.000002s : 8: predicate.specialize_transform 1.08% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.96% : 0.000002s : 11: predicate.switch_defer_inline 1.65% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.21% : 0.000007s : 39: predicate.switch_simplify 0.79% : 0.000001s : 9: predicate.tile_eliminate 0.79% : 0.000001s : 9: predicate.transpose_eliminate 1.65% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 2.96% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.64% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.48% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.70% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.06% : 0.000003s : 26: predicate.updatestate_pure_node_eliminater 2.94% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.63% : 0.000001s : 4: predicate.value_based_eliminate 0.84% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.91% : 0.000002s : 8: predicate.virtual_output_eliminate 0.33% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000209 5 8.10% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.90% : 0.000192s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022752 192 0.02% : 0.000004s : 1: ForceFp32Comm 13.46% : 0.003062s : 1: add_attr 13.42% : 0.003053s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.26% : 0.000059s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000061s : 1: auto_monad 0.10% : 0.000023s : 1: auto_monad_reorder 0.02% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.01% : 0.000457s : 1: bootstrap 0.12% : 0.000028s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.12% : 0.000028s : 1: cse_after_recomputation 0.03% : 0.000006s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.08% : 0.000017s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.90% : 0.000432s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.18% : 0.000497s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 4.18% : 0.000951s : 78: opt.transform.opt_a 0.14% : 0.000032s : 1: opt.transform.opt_after_cconv 0.12% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.68% : 0.000155s : 28: opt.transform.opt_b 0.23% : 0.000052s : 2: opt.transform.opt_trans_graph 0.19% : 0.000044s : 4: opt.transform.symbol_engine_opt 10.59% : 0.002409s : 1: opt_a 0.50% : 0.000115s : 1: opt_after_cconv 2.09% : 0.000476s : 1: opt_after_jit_grad 1.19% : 0.000271s : 1: opt_b 19.59% : 0.004456s : 1: optimize 0.09% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.11% : 0.000024s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.13% : 0.000029s : 1: pre_auto_parallel 0.09% : 0.000021s : 1: py_interpret_to_execute 0.06% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000020s : 1: remove_dup_value 1.59% : 0.000362s : 1: renormalize.infer 1.00% : 0.000228s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000045s : 1: rewriter_after_opt_a 0.22% : 0.000051s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000087s : 1: symbol_engine_optimizer 0.36% : 0.000083s : 1: tuple_transform 20.93% : 0.004763s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:56.158.80 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:56.161.35 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0151836, [21] [bootstrap]: 0.0004586 [type_inference]: 0.00498498 [event_method]: 1.306e-05 [auto_monad]: 5.73e-05 [graph_reusing]: 5.71e-06 [inline]: 2.49001e-06 [add_attr]: 0.00329786, [1] [add_attr_with_inline]: 0.00328895, [1] [Cycle 1]: 7.431e-05, [2] [tag_attr]: 1.497e-05 [meta_addattr_fg_expand]: 3.91999e-06 [parallel-infer-symbol]: 3.12002e-06 [pre_auto_parallel]: 2.709e-05 [insert-virtual-dataset]: 2.46998e-06 [parallel-infer-symbol-second]: 7.40023e-07 [dataset_repeat_opt]: 2.24001e-06 [pipeline_split]: 1.76998e-06 [optimize]: 0.00515503, [53] [py_interpret_to_execute]: 2.19e-05 [rewriter_before_opt_a]: 5.264e-05 [opt_a]: 0.00281624, [2] [Cycle 1]: 0.00191594, [45] [expand_dump_flag]: 3.32002e-06 [switch_simplify]: 2.675e-05 [loop_unroll]: 1.509e-05 [a_1]: 0.00034622 [with_stream_mark]: 1.619e-05 [recompute_prepare]: 9.74e-06 [updatestate_depend_eliminate]: 4.48999e-06 [updatestate_assign_eliminate]: 3.80998e-06 [updatestate_loads_eliminate]: 4.4e-06 [parameter_eliminate]: 1.94e-06 [a_2]: 0.0001235 [accelerated_algorithm]: 8.05e-06 [shard]: 2.17001e-06 [meta_shard_fg_expand]: 2.07001e-06 [shard_inline]: 7.18e-06 [merge_send_recv]: 9.76e-06 [auto_parallel]: 7.82e-06 [parallel]: 1.878e-05 [flash_sp]: 7.9e-06 [merge_comm]: 4.79e-06 [allreduce_fusion]: 4.50999e-06 [matmul_add_comm_reduction]: 1.08e-05 [allreduce_slice_to_reducescatter]: 8.29983e-07 [virtual_shard_identity]: 9.36e-06 [virtual_dataset]: 7.85e-06 [get_grad_eliminate_]: 6.91001e-06 [virtual_output]: 7.13e-06 [merge_forward]: 4.38999e-06 [cell_reuse_recompute_pass]: 1.23002e-06 [offload_activation]: 1.067e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.718e-05 [merge_recompute_call_nodes]: 1.41002e-06 [before_grad]: 1.31e-05 [set_forward_comm_id_for_comm_node_pass]: 4.66002e-06 [meta_fg_expand]: 3.24001e-06 [flash_sp_send_recv_attached]: 2.71e-06 [receive_attached]: 2.48e-06 [after_resolve]: 1.095e-05 [a_after_grad]: 1.2e-05 [renormalize]: 0.00065096 [add_forward_monad_depend]: 5.64e-06 [auto_monad_grad]: 2.41e-06 [auto_monad_eliminator]: 1.694e-05 [cse]: 3.469e-05 [a_3]: 6.931e-05 [Cycle 2]: 0.00088771, [45] [expand_dump_flag]: 1.25001e-06 [switch_simplify]: 8.74998e-06 [loop_unroll]: 7.16001e-06 [a_1]: 0.00015032 [with_stream_mark]: 1.273e-05 [recompute_prepare]: 7.4e-06 [updatestate_depend_eliminate]: 4.71002e-06 [updatestate_assign_eliminate]: 3.03e-06 [updatestate_loads_eliminate]: 2.76999e-06 [parameter_eliminate]: 1.20999e-06 [a_2]: 0.00011451 [accelerated_algorithm]: 6.96001e-06 [shard]: 1.05999e-06 [meta_shard_fg_expand]: 1.52999e-06 [shard_inline]: 7.25e-06 [merge_send_recv]: 5.71998e-06 [auto_parallel]: 6.76999e-06 [parallel]: 4.44002e-06 [flash_sp]: 3.75e-06 [merge_comm]: 4.13999e-06 [allreduce_fusion]: 4e-06 [matmul_add_comm_reduction]: 7.01999e-06 [allreduce_slice_to_reducescatter]: 3.60014e-07 [virtual_shard_identity]: 8.02998e-06 [virtual_dataset]: 6.87002e-06 [get_grad_eliminate_]: 6.64001e-06 [virtual_output]: 6.41998e-06 [merge_forward]: 3.45e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 7.95e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.563e-05 [merge_recompute_call_nodes]: 8.50006e-07 [before_grad]: 1.098e-05 [set_forward_comm_id_for_comm_node_pass]: 4.66002e-06 [meta_fg_expand]: 2.81999e-06 [flash_sp_send_recv_attached]: 9.79984e-07 [receive_attached]: 1.16002e-06 [after_resolve]: 9.69e-06 [a_after_grad]: 1.031e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.40001e-06 [auto_monad_grad]: 9.40025e-07 [auto_monad_eliminator]: 8.52998e-06 [cse]: 1.792e-05 [a_3]: 5.703e-05 [py_interpret_to_execute_after_opt_a]: 1.325e-05 [slice_cell_reuse_recomputed_activation]: 4.64002e-06 [rewriter_after_opt_a]: 4.411e-05 [convert_after_rewriter]: 1.05e-05 [order_py_execute_after_rewriter]: 9.39e-06 [mutable_eliminate]: 0.00051156 [opt_b]: 0.00031739, [1] [Cycle 1]: 0.0003077, [7] [b_1]: 0.00020449 [b_2]: 9.17999e-06 [updatestate_depend_eliminate]: 6.06e-06 [updatestate_assign_eliminate]: 3.03998e-06 [updatestate_loads_eliminate]: 2.79001e-06 [renormalize]: 4.59986e-07 [cse]: 2.485e-05 [optimize_parallel_all_gather_comm]: 2.005e-05 [overlap_param_gather]: 4.63001e-06 [cconv]: 2.72e-05 [loop_unroll]: 0.00045373 [opt_after_cconv]: 0.00013733, [1] [Cycle 1]: 0.00012861, [7] [c_1]: 3.448e-05 [parameter_eliminate]: 3.5e-06 [updatestate_depend_eliminate]: 6.16e-06 [updatestate_assign_eliminate]: 3.12002e-06 [updatestate_loads_eliminate]: 3.6e-06 [cse]: 2.354e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.922e-05 [tuple_transform]: 9.517e-05, [1] [Cycle 1]: 8.786e-05, [4] [d_1]: 4.819e-05 [none_parameter_eliminate]: 1.79998e-06 [renormalize]: 3.00002e-07 [switch_simplify]: 7.67998e-06 [partial_unused_args_eliminate]: 4.95999e-06 [add_recomputation]: 5.745e-05 [cse_after_recomputation]: 3.18e-05, [1] [Cycle 1]: 2.477e-05, [1] [cse]: 1.584e-05 [environ_conv]: 9.86e-06 [swap_dp_allreduce_reducescatter]: 8.94998e-06 [bias_add_comm_swap]: 4.95001e-06 [label_micro_interleaved_index]: 7.15e-06 [label_fine_grained_interleaved_index]: 5.19e-06 [merge_cast_opt]: 3.78001e-06 [slice_recompute_activation]: 4.45e-06 [micro_interleaved_order_control]: 4.67e-06 [assign_add_opt]: 4.02e-06 [ForceFp32Comm]: 3.21001e-06 [remove_cast_before_assign_add]: 3.37997e-06 [full_micro_interleaved_order_control]: 4.70001e-06 [reorder_send_recv_between_fp_bp]: 5.42999e-06 [comm_op_add_attrs]: 3.6e-06 [add_comm_op_reuse_tag]: 3.41999e-06 [interleave_split_concat_branches]: 3.61999e-06 [interleave_parallel_branches]: 3.46999e-06 [overlap_opt_shard_in_pipeline]: 3.56001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.28001e-06 [control_data_broadcast_order]: 1.76e-05 [grouped_pairwise_exchange_alltoall]: 3.93999e-06 [offloading_packed_experts]: 7.31001e-06 [overlap_recompute_and_grad_model_parallel]: 8.08001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.54002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.67002e-06 [overlap_recompute_comm]: 4.57998e-06 [overlap_grad_ring_attention]: 7.36001e-06 [overlap_grad_flash_sp]: 2.429e-05 [begin_end_overlap_inline]: 2.91e-06 [split_matmul_comm_elemetwise]: 4.39002e-06 [split_layernorm_comm]: 4.07998e-06 [handle_group_info]: 3.45e-06 [symbol_engine_optimizer]: 0.00010574, [1] [Cycle 1]: 9.898e-05, [6] [build]: 3.24001e-06 [elim_shapecalc]: 1.155e-05 [elim_not_effective]: 1.562e-05 [opt_reshape]: 9.01998e-06 [fold_const_symbol]: 1.347e-05 [renormalize]: 2.40019e-07 [detach_backward]: 3.76001e-06 [pipeline_parallel_scheduler]: 1.64e-06 [auto_monad_reorder]: 2.212e-05 [get_jit_bprop_graph]: 2.59999e-06 [rewriter_after_jit_bprop_graph]: 5.57999e-06 [opt_after_jit_grad]: 0.00050391 [validate]: 4.245e-05 Sums bootstrap : 0.000459s : 4.52% type_inference : 0.004985s : 49.16% event_method : 0.000013s : 0.13% auto_monad : 0.000057s : 0.57% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000027s : 0.27% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000022s : 0.22% optimize.rewriter_before_opt_a : 0.000053s : 0.52% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000035s : 0.35% optimize.opt_a.loop_unroll : 0.000022s : 0.22% optimize.opt_a.a_1 : 0.000497s : 4.90% optimize.opt_a.with_stream_mark : 0.000029s : 0.29% optimize.opt_a.recompute_prepare : 0.000017s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000238s : 2.35% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000014s : 0.14% optimize.opt_a.merge_send_recv : 0.000015s : 0.15% optimize.opt_a.auto_parallel : 0.000015s : 0.14% optimize.opt_a.parallel : 0.000023s : 0.23% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.17% optimize.opt_a.virtual_dataset : 0.000015s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000021s : 0.20% optimize.opt_a.a_after_grad : 0.000022s : 0.22% optimize.opt_a.renormalize : 0.000651s : 6.42% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.25% optimize.opt_a.cse : 0.000053s : 0.52% optimize.opt_a.a_3 : 0.000126s : 1.25% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000044s : 0.43% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000512s : 5.04% optimize.opt_b.b_1 : 0.000204s : 2.02% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000025s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000027s : 0.27% optimize.loop_unroll : 0.000454s : 4.47% optimize.opt_after_cconv.c_1 : 0.000034s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.cse : 0.000024s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.19% optimize.tuple_transform.d_1 : 0.000048s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000057s : 0.57% optimize.cse_after_recomputation.cse : 0.000016s : 0.16% optimize.environ_conv : 0.000010s : 0.10% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.09% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000024s : 0.24% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.09% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000022s : 0.22% get_jit_bprop_graph : 0.000003s : 0.03% rewriter_after_jit_bprop_graph : 0.000006s : 0.06% opt_after_jit_grad : 0.000504s : 4.97% validate : 0.000042s : 0.42% Time group info: ------[substitution.] 0.000155 29 13.00% : 0.000020s : 2: substitution.cast_eliminate 1.55% : 0.000002s : 3: substitution.elim_not_effective 1.28% : 0.000002s : 3: substitution.fold_const_symbol 3.95% : 0.000006s : 4: substitution.graph_param_transform 65.19% : 0.000101s : 2: substitution.inline 2.67% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.97% : 0.000006s : 6: substitution.remove_not_recompute_node 2.20% : 0.000003s : 2: substitution.replace_old_param 6.19% : 0.000010s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004934 2 90.64% : 0.004473s : 1: type_inference.infer 9.36% : 0.000462s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000099 2 100.00% : 0.000099s : 2: match.inline ------[predicate.] 0.000168 980 0.80% : 0.000001s : 9: predicate.accumulaten_eliminater 0.92% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.67% : 0.000001s : 8: predicate.addn_check_dump 0.83% : 0.000001s : 9: predicate.addn_zero_filter 0.71% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.26% : 0.000004s : 17: predicate.arithmetic_simplify 1.01% : 0.000002s : 9: predicate.cast_eliminate 0.73% : 0.000001s : 8: predicate.check_bprop_eliminate 0.70% : 0.000001s : 8: predicate.compare_switch_simplify 0.24% : 0.000000s : 4: predicate.const_output_eliminate 0.79% : 0.000001s : 8: predicate.depend_value_elim 0.82% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 1.01% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.86% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.22% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.29% : 0.000000s : 4: predicate.elim_not_effective 0.44% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 13: predicate.environ_get_depend_swap 1.86% : 0.000003s : 21: predicate.environ_get_eliminate 1.11% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.92% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.98% : 0.000003s : 11: predicate.float_depend_g_call 0.71% : 0.000001s : 8: predicate.float_environ_get_switch 1.03% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 4: predicate.fold_const_symbol 0.79% : 0.000001s : 8: predicate.get_grad_eliminate 0.29% : 0.000000s : 4: predicate.graph_param_transform 0.84% : 0.000001s : 8: predicate.incorporate_call 0.66% : 0.000001s : 8: predicate.incorporate_call_switch 6.33% : 0.000011s : 44: predicate.inline 1.06% : 0.000002s : 8: predicate.inline_without_move 0.40% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.97% : 0.000002s : 8: predicate.less_batch_normalization 1.99% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.11% : 0.000004s : 26: predicate.load_eliminater 1.16% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.49% : 0.000002s : 16: predicate.loop_unroll_before_grad 1.74% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.79% : 0.000001s : 8: predicate.merge_addn 0.74% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.76% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.70% : 0.000001s : 9: predicate.minmaximum_grad 1.19% : 0.000002s : 4: predicate.mutable_eliminate 0.43% : 0.000001s : 4: predicate.opt_reshape 0.54% : 0.000001s : 4: predicate.parallel_virtual_node 1.37% : 0.000002s : 11: predicate.partial_defer_inline 1.26% : 0.000002s : 13: predicate.partial_eliminate 0.90% : 0.000002s : 9: predicate.print_const_string_wrapper 0.79% : 0.000001s : 8: predicate.reduce_all_const_elim 1.14% : 0.000002s : 9: predicate.reduce_eliminate 2.11% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 8: predicate.remove_not_recompute_node 1.19% : 0.000002s : 17: predicate.replace_applicator 0.61% : 0.000001s : 8: predicate.replace_old_param 0.33% : 0.000001s : 4: predicate.reset_defer_inline 0.83% : 0.000001s : 9: predicate.reshape_eliminate 0.86% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 4: predicate.row_tensor_eliminate 1.01% : 0.000002s : 8: predicate.same_eliminate 0.55% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.02% : 0.000002s : 8: predicate.shard_identity_eliminate 0.97% : 0.000002s : 8: predicate.special_op_eliminate 1.01% : 0.000002s : 8: predicate.specialize_transform 1.14% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.94% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.02% : 0.000002s : 11: predicate.switch_defer_inline 1.83% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.28% : 0.000007s : 39: predicate.switch_simplify 0.73% : 0.000001s : 9: predicate.tile_eliminate 0.81% : 0.000001s : 9: predicate.transpose_eliminate 1.50% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 2.97% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.43% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.49% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.54% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.29% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 3.22% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.65% : 0.000001s : 4: predicate.value_based_eliminate 0.84% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.79% : 0.000001s : 8: predicate.virtual_output_eliminate 0.35% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000223 5 7.43% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.57% : 0.000206s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025438 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.00% : 0.003307s : 1: add_attr 12.94% : 0.003292s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.24% : 0.000061s : 1: add_recomputation 0.03% : 0.000007s : 1: assign_add_opt 0.26% : 0.000066s : 1: auto_monad 0.11% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.97% : 0.000502s : 1: bootstrap 0.12% : 0.000030s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.14% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000020s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.09% : 0.000022s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.81% : 0.000460s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.03% : 0.000518s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 3.74% : 0.000952s : 78: opt.transform.opt_a 0.13% : 0.000033s : 1: opt.transform.opt_after_cconv 0.12% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.55% : 0.000139s : 28: opt.transform.opt_b 0.21% : 0.000053s : 2: opt.transform.opt_trans_graph 0.18% : 0.000045s : 4: opt.transform.symbol_engine_opt 11.08% : 0.002820s : 1: opt_a 0.55% : 0.000141s : 1: opt_after_cconv 2.02% : 0.000514s : 1: opt_after_jit_grad 1.26% : 0.000321s : 1: opt_b 21.67% : 0.005513s : 1: optimize 0.09% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000007s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000035s : 1: pre_auto_parallel 0.10% : 0.000025s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000023s : 1: remove_dup_value 1.55% : 0.000394s : 1: renormalize.infer 0.98% : 0.000249s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000048s : 1: rewriter_after_opt_a 0.22% : 0.000056s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.43% : 0.000109s : 1: symbol_engine_optimizer 0.39% : 0.000098s : 1: tuple_transform 19.71% : 0.005013s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:56.208.731 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0133564, [21] [bootstrap]: 0.00043988 [type_inference]: 0.00480415 [event_method]: 1.201e-05 [auto_monad]: 5.779e-05 [graph_reusing]: 5.39998e-06 [inline]: 2.17001e-06 [add_attr]: 0.0030184, [1] [add_attr_with_inline]: 0.00301033, [1] [Cycle 1]: 4.957e-05, [2] [tag_attr]: 1.514e-05 [meta_addattr_fg_expand]: 4.05998e-06 [parallel-infer-symbol]: 3.26999e-06 [pre_auto_parallel]: 2.44e-05 [insert-virtual-dataset]: 2.34001e-06 [parallel-infer-symbol-second]: 1.14e-06 [dataset_repeat_opt]: 1.87999e-06 [pipeline_split]: 1.51998e-06 [optimize]: 0.00430466, [53] [py_interpret_to_execute]: 1.661e-05 [rewriter_before_opt_a]: 4.678e-05 [opt_a]: 0.00227793, [2] [Cycle 1]: 0.00156569, [45] [expand_dump_flag]: 2.78e-06 [switch_simplify]: 2.675e-05 [loop_unroll]: 1.476e-05 [a_1]: 0.0003494 [with_stream_mark]: 1.524e-05 [recompute_prepare]: 8.69e-06 [updatestate_depend_eliminate]: 4.18999e-06 [updatestate_assign_eliminate]: 3.56001e-06 [updatestate_loads_eliminate]: 4.08999e-06 [parameter_eliminate]: 2.12001e-06 [a_2]: 9.542e-05 [accelerated_algorithm]: 7.75e-06 [shard]: 2.04e-06 [meta_shard_fg_expand]: 1.87999e-06 [shard_inline]: 7.00998e-06 [merge_send_recv]: 9.10001e-06 [auto_parallel]: 6.71e-06 [parallel]: 1.778e-05 [flash_sp]: 7.34002e-06 [merge_comm]: 4.38999e-06 [allreduce_fusion]: 4.07e-06 [matmul_add_comm_reduction]: 1.027e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 8.79003e-06 [virtual_dataset]: 7.61999e-06 [get_grad_eliminate_]: 6.67002e-06 [virtual_output]: 6.80002e-06 [merge_forward]: 4.48999e-06 [cell_reuse_recompute_pass]: 1.20999e-06 [offload_activation]: 1.063e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.51e-05 [merge_recompute_call_nodes]: 2.01e-06 [before_grad]: 1.226e-05 [set_forward_comm_id_for_comm_node_pass]: 4.32003e-06 [meta_fg_expand]: 3.2e-06 [flash_sp_send_recv_attached]: 2.46e-06 [receive_attached]: 2.07999e-06 [after_resolve]: 1.082e-05 [a_after_grad]: 1.072e-05 [renormalize]: 0.0005026 [add_forward_monad_depend]: 4.50999e-06 [auto_monad_grad]: 2.44999e-06 [auto_monad_eliminator]: 1.585e-05 [cse]: 3.531e-05 [a_3]: 5.423e-05 [Cycle 2]: 0.0007026, [45] [expand_dump_flag]: 1.07e-06 [switch_simplify]: 8.44998e-06 [loop_unroll]: 7.00998e-06 [a_1]: 0.00014915 [with_stream_mark]: 1.109e-05 [recompute_prepare]: 7.16999e-06 [updatestate_depend_eliminate]: 3.86999e-06 [updatestate_assign_eliminate]: 2.80997e-06 [updatestate_loads_eliminate]: 2.63e-06 [parameter_eliminate]: 1.04e-06 [a_2]: 8.703e-05 [accelerated_algorithm]: 7.15e-06 [shard]: 1.05001e-06 [meta_shard_fg_expand]: 1.57001e-06 [shard_inline]: 7.41001e-06 [merge_send_recv]: 5.52001e-06 [auto_parallel]: 6.09001e-06 [parallel]: 4.45999e-06 [flash_sp]: 3.31001e-06 [merge_comm]: 3.85e-06 [allreduce_fusion]: 4.13001e-06 [matmul_add_comm_reduction]: 6.54001e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 7.98999e-06 [virtual_dataset]: 6.84999e-06 [get_grad_eliminate_]: 6.61e-06 [virtual_output]: 6.53e-06 [merge_forward]: 3.25002e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 7.40998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.336e-05 [merge_recompute_call_nodes]: 7.2e-07 [before_grad]: 1.087e-05 [set_forward_comm_id_for_comm_node_pass]: 4.3e-06 [meta_fg_expand]: 2.66e-06 [flash_sp_send_recv_attached]: 8.79983e-07 [receive_attached]: 9.80013e-07 [after_resolve]: 9.54999e-06 [a_after_grad]: 1.003e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.35001e-06 [auto_monad_grad]: 8.70001e-07 [auto_monad_eliminator]: 8.12e-06 [cse]: 1.85e-05 [a_3]: 4.279e-05 [py_interpret_to_execute_after_opt_a]: 8.58001e-06 [slice_cell_reuse_recomputed_activation]: 2.00002e-06 [rewriter_after_opt_a]: 3.895e-05 [convert_after_rewriter]: 7.47002e-06 [order_py_execute_after_rewriter]: 5.89e-06 [mutable_eliminate]: 0.00047598 [opt_b]: 0.00025017, [1] [Cycle 1]: 0.00024417, [7] [b_1]: 0.00016112 [b_2]: 9.05001e-06 [updatestate_depend_eliminate]: 7.08998e-06 [updatestate_assign_eliminate]: 2.91999e-06 [updatestate_loads_eliminate]: 3.12002e-06 [renormalize]: 4.50003e-07 [cse]: 2.41e-05 [optimize_parallel_all_gather_comm]: 1.779e-05 [overlap_param_gather]: 2.21998e-06 [cconv]: 2.682e-05 [loop_unroll]: 0.00042245 [opt_after_cconv]: 0.00011192, [1] [Cycle 1]: 0.00010628, [7] [c_1]: 3.422e-05 [parameter_eliminate]: 2.76e-06 [updatestate_depend_eliminate]: 6.17999e-06 [updatestate_assign_eliminate]: 3.02002e-06 [updatestate_loads_eliminate]: 3.14999e-06 [cse]: 2.243e-05 [renormalize]: 3.59985e-07 [remove_dup_value]: 1.557e-05 [tuple_transform]: 7.931e-05, [1] [Cycle 1]: 7.486e-05, [4] [d_1]: 4.621e-05 [none_parameter_eliminate]: 1.55999e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 7.94002e-06 [partial_unused_args_eliminate]: 1.92999e-06 [add_recomputation]: 5.366e-05 [cse_after_recomputation]: 2.496e-05, [1] [Cycle 1]: 2.055e-05, [1] [cse]: 1.525e-05 [environ_conv]: 6.36998e-06 [swap_dp_allreduce_reducescatter]: 5.86e-06 [bias_add_comm_swap]: 2.14999e-06 [label_micro_interleaved_index]: 3.77002e-06 [label_fine_grained_interleaved_index]: 2.96001e-06 [merge_cast_opt]: 1.29e-06 [slice_recompute_activation]: 2.38998e-06 [micro_interleaved_order_control]: 2.27999e-06 [assign_add_opt]: 1.19998e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 9.70002e-07 [full_micro_interleaved_order_control]: 2.00002e-06 [reorder_send_recv_between_fp_bp]: 2.64999e-06 [comm_op_add_attrs]: 1.09e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.00999e-06 [overlap_opt_shard_in_pipeline]: 1.17e-06 [overlap_opt_shard_grad_in_pipeline]: 1.91e-06 [control_data_broadcast_order]: 1.513e-05 [grouped_pairwise_exchange_alltoall]: 1.49998e-06 [offloading_packed_experts]: 4.57e-06 [overlap_recompute_and_grad_model_parallel]: 5.10999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32e-06 [overlap_recompute_comm]: 1.664e-05 [overlap_grad_ring_attention]: 5.09e-06 [overlap_grad_flash_sp]: 2.162e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.44001e-06 [split_layernorm_comm]: 1.91e-06 [handle_group_info]: 1.04003e-06 [symbol_engine_optimizer]: 8.344e-05, [1] [Cycle 1]: 7.922e-05, [6] [build]: 3.55e-06 [elim_shapecalc]: 1.159e-05 [elim_not_effective]: 1.541e-05 [opt_reshape]: 7.9e-06 [fold_const_symbol]: 1.183e-05 [renormalize]: 1.90019e-07 [detach_backward]: 1.86e-06 [pipeline_parallel_scheduler]: 1.60999e-06 [auto_monad_reorder]: 1.957e-05 [get_jit_bprop_graph]: 1.74e-06 [rewriter_after_jit_bprop_graph]: 3.72998e-06 [opt_after_jit_grad]: 0.0004677 [validate]: 4.22e-05 Sums bootstrap : 0.000440s : 4.68% type_inference : 0.004804s : 51.08% event_method : 0.000012s : 0.13% auto_monad : 0.000058s : 0.61% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000024s : 0.26% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000017s : 0.18% optimize.rewriter_before_opt_a : 0.000047s : 0.50% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000035s : 0.37% optimize.opt_a.loop_unroll : 0.000022s : 0.23% optimize.opt_a.a_1 : 0.000499s : 5.30% optimize.opt_a.with_stream_mark : 0.000026s : 0.28% optimize.opt_a.recompute_prepare : 0.000016s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000182s : 1.94% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.16% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.04% optimize.opt_a.shard_inline : 0.000014s : 0.15% optimize.opt_a.merge_send_recv : 0.000015s : 0.16% optimize.opt_a.auto_parallel : 0.000013s : 0.14% optimize.opt_a.parallel : 0.000022s : 0.24% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.18% optimize.opt_a.virtual_dataset : 0.000014s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.14% optimize.opt_a.virtual_output : 0.000013s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000023s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000020s : 0.22% optimize.opt_a.a_after_grad : 0.000021s : 0.22% optimize.opt_a.renormalize : 0.000503s : 5.34% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.25% optimize.opt_a.cse : 0.000054s : 0.57% optimize.opt_a.a_3 : 0.000097s : 1.03% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000039s : 0.41% optimize.convert_after_rewriter : 0.000007s : 0.08% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000476s : 5.06% optimize.opt_b.b_1 : 0.000161s : 1.71% optimize.opt_b.b_2 : 0.000009s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.19% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000027s : 0.29% optimize.loop_unroll : 0.000422s : 4.49% optimize.opt_after_cconv.c_1 : 0.000034s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.24% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.17% optimize.tuple_transform.d_1 : 0.000046s : 0.49% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000054s : 0.57% optimize.cse_after_recomputation.cse : 0.000015s : 0.16% optimize.environ_conv : 0.000006s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.03% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000017s : 0.18% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000022s : 0.23% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.03% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.16% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.21% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000468s : 4.97% validate : 0.000042s : 0.45% Time group info: ------[substitution.] 0.000143 29 11.78% : 0.000017s : 2: substitution.cast_eliminate 1.57% : 0.000002s : 3: substitution.elim_not_effective 1.22% : 0.000002s : 3: substitution.fold_const_symbol 4.48% : 0.000006s : 4: substitution.graph_param_transform 65.52% : 0.000094s : 2: substitution.inline 2.69% : 0.000004s : 6: substitution.j_node_and_user_rematch 4.28% : 0.000006s : 6: substitution.remove_not_recompute_node 2.26% : 0.000003s : 2: substitution.replace_old_param 6.21% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004761 2 90.91% : 0.004329s : 1: type_inference.infer 9.09% : 0.000433s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000092 2 100.00% : 0.000092s : 2: match.inline ------[predicate.] 0.000177 980 0.76% : 0.000001s : 9: predicate.accumulaten_eliminater 0.96% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.61% : 0.000001s : 8: predicate.addn_check_dump 0.79% : 0.000001s : 9: predicate.addn_zero_filter 0.68% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.01% : 0.000004s : 17: predicate.arithmetic_simplify 0.91% : 0.000002s : 9: predicate.cast_eliminate 0.68% : 0.000001s : 8: predicate.check_bprop_eliminate 0.63% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000000s : 4: predicate.const_output_eliminate 0.73% : 0.000001s : 8: predicate.depend_value_elim 0.80% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.88% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.73% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.19% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 4: predicate.elim_not_effective 0.45% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.03% : 0.000002s : 13: predicate.environ_add_const_eliminate 0.96% : 0.000002s : 13: predicate.environ_get_add_eliminate 0.98% : 0.000002s : 13: predicate.environ_get_depend_swap 1.73% : 0.000003s : 21: predicate.environ_get_eliminate 1.02% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.88% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.66% : 0.000003s : 11: predicate.float_depend_g_call 0.63% : 0.000001s : 8: predicate.float_environ_get_switch 0.94% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 4: predicate.fold_const_symbol 0.75% : 0.000001s : 8: predicate.get_grad_eliminate 0.24% : 0.000000s : 4: predicate.graph_param_transform 0.81% : 0.000001s : 8: predicate.incorporate_call 0.63% : 0.000001s : 8: predicate.incorporate_call_switch 5.87% : 0.000010s : 44: predicate.inline 1.03% : 0.000002s : 8: predicate.inline_without_move 0.37% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.99% : 0.000002s : 8: predicate.less_batch_normalization 1.51% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.10% : 0.000004s : 26: predicate.load_eliminater 1.07% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.35% : 0.000002s : 16: predicate.loop_unroll_before_grad 1.57% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.71% : 0.000001s : 8: predicate.merge_addn 0.62% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.90% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.66% : 0.000001s : 9: predicate.minmaximum_grad 1.11% : 0.000002s : 4: predicate.mutable_eliminate 0.41% : 0.000001s : 4: predicate.opt_reshape 0.59% : 0.000001s : 4: predicate.parallel_virtual_node 1.08% : 0.000002s : 11: predicate.partial_defer_inline 1.24% : 0.000002s : 13: predicate.partial_eliminate 0.80% : 0.000001s : 9: predicate.print_const_string_wrapper 0.67% : 0.000001s : 8: predicate.reduce_all_const_elim 0.99% : 0.000002s : 9: predicate.reduce_eliminate 2.38% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 8: predicate.remove_not_recompute_node 1.17% : 0.000002s : 17: predicate.replace_applicator 0.63% : 0.000001s : 8: predicate.replace_old_param 0.34% : 0.000001s : 4: predicate.reset_defer_inline 0.89% : 0.000002s : 9: predicate.reshape_eliminate 0.72% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 4: predicate.row_tensor_eliminate 0.92% : 0.000002s : 8: predicate.same_eliminate 0.49% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.99% : 0.000002s : 8: predicate.shard_identity_eliminate 0.81% : 0.000001s : 8: predicate.special_op_eliminate 0.90% : 0.000002s : 8: predicate.specialize_transform 1.06% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 4: predicate.switch_call_monad_eliminater 7.39% : 0.000013s : 11: predicate.switch_defer_inline 1.63% : 0.000003s : 19: predicate.switch_layer_defer_inline 3.92% : 0.000007s : 39: predicate.switch_simplify 0.73% : 0.000001s : 9: predicate.tile_eliminate 0.77% : 0.000001s : 9: predicate.transpose_eliminate 1.60% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.50% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 2.88% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.41% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.36% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.60% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 1.99% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.92% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.50% : 0.000001s : 4: predicate.value_based_eliminate 0.84% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.73% : 0.000001s : 8: predicate.virtual_output_eliminate 0.36% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000205 5 9.03% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 90.97% : 0.000187s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022325 192 0.02% : 0.000003s : 1: ForceFp32Comm 13.54% : 0.003023s : 1: add_attr 13.50% : 0.003014s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.26% : 0.000058s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.28% : 0.000063s : 1: auto_monad 0.10% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.09% : 0.000467s : 1: bootstrap 0.14% : 0.000030s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000011s : 1: convert_after_rewriter 0.12% : 0.000028s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.08% : 0.000017s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.93% : 0.000430s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.17% : 0.000484s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 4.22% : 0.000943s : 78: opt.transform.opt_a 0.15% : 0.000033s : 1: opt.transform.opt_after_cconv 0.12% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.62% : 0.000138s : 28: opt.transform.opt_b 0.23% : 0.000052s : 2: opt.transform.opt_trans_graph 0.19% : 0.000043s : 4: opt.transform.symbol_engine_opt 10.22% : 0.002281s : 1: opt_a 0.52% : 0.000116s : 1: opt_after_cconv 2.14% : 0.000477s : 1: opt_after_jit_grad 1.14% : 0.000254s : 1: opt_b 19.30% : 0.004309s : 1: optimize 0.10% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.11% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.09% : 0.000020s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000029s : 1: pre_auto_parallel 0.09% : 0.000020s : 1: py_interpret_to_execute 0.05% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000019s : 1: remove_dup_value 1.26% : 0.000282s : 1: renormalize.infer 0.95% : 0.000213s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000043s : 1: rewriter_after_opt_a 0.23% : 0.000051s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000086s : 1: symbol_engine_optimizer 0.37% : 0.000082s : 1: tuple_transform 21.58% : 0.004817s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:56.402.010 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:56.402.273 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0144822, [21] [bootstrap]: 0.00043876 [type_inference]: 0.00490832 [event_method]: 1.215e-05 [auto_monad]: 5.8e-05 [graph_reusing]: 6.08998e-06 [inline]: 2.43e-06 [add_attr]: 0.00320612, [1] [add_attr_with_inline]: 0.00319801, [1] [Cycle 1]: 6.378e-05, [2] [tag_attr]: 1.403e-05 [meta_addattr_fg_expand]: 3.84002e-06 [parallel-infer-symbol]: 3.22002e-06 [pre_auto_parallel]: 2.45e-05 [insert-virtual-dataset]: 2.36e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 1.87001e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.00472179, [53] [py_interpret_to_execute]: 2.007e-05 [rewriter_before_opt_a]: 4.935e-05 [opt_a]: 0.00251566, [2] [Cycle 1]: 0.00167801, [45] [expand_dump_flag]: 3.09999e-06 [switch_simplify]: 2.45e-05 [loop_unroll]: 1.341e-05 [a_1]: 0.00028431 [with_stream_mark]: 1.567e-05 [recompute_prepare]: 7.90998e-06 [updatestate_depend_eliminate]: 4.03001e-06 [updatestate_assign_eliminate]: 3.83001e-06 [updatestate_loads_eliminate]: 3.08e-06 [parameter_eliminate]: 1.89e-06 [a_2]: 0.00010825 [accelerated_algorithm]: 6.67002e-06 [shard]: 2.16e-06 [meta_shard_fg_expand]: 1.77999e-06 [shard_inline]: 6.15002e-06 [merge_send_recv]: 8.37998e-06 [auto_parallel]: 6.98e-06 [parallel]: 1.784e-05 [flash_sp]: 7.76001e-06 [merge_comm]: 3.90998e-06 [allreduce_fusion]: 4e-06 [matmul_add_comm_reduction]: 9.32001e-06 [allreduce_slice_to_reducescatter]: 9.50007e-07 [virtual_shard_identity]: 7.6e-06 [virtual_dataset]: 6.29001e-06 [get_grad_eliminate_]: 5.91998e-06 [virtual_output]: 6.17001e-06 [merge_forward]: 4.3e-06 [cell_reuse_recompute_pass]: 1.14e-06 [offload_activation]: 1.068e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.517e-05 [merge_recompute_call_nodes]: 1.52999e-06 [before_grad]: 1.053e-05 [set_forward_comm_id_for_comm_node_pass]: 5.05999e-06 [meta_fg_expand]: 2.53998e-06 [flash_sp_send_recv_attached]: 2.56e-06 [receive_attached]: 2.04e-06 [after_resolve]: 1.111e-05 [a_after_grad]: 9.98998e-06 [renormalize]: 0.00053035 [add_forward_monad_depend]: 6.49001e-06 [auto_monad_grad]: 2.19001e-06 [auto_monad_eliminator]: 1.658e-05 [cse]: 2.934e-05 [a_3]: 6.189e-05 [Cycle 2]: 0.00082417, [45] [expand_dump_flag]: 1.15999e-06 [switch_simplify]: 7.03998e-06 [loop_unroll]: 5.67001e-06 [a_1]: 0.00010758 [with_stream_mark]: 1.241e-05 [recompute_prepare]: 6.16e-06 [updatestate_depend_eliminate]: 3.18e-06 [updatestate_assign_eliminate]: 2.47001e-06 [updatestate_loads_eliminate]: 2.69001e-06 [parameter_eliminate]: 1.50999e-06 [a_2]: 9.668e-05 [accelerated_algorithm]: 5.89999e-06 [shard]: 1.41998e-06 [meta_shard_fg_expand]: 1.72001e-06 [shard_inline]: 6.51e-06 [merge_send_recv]: 5.70001e-06 [auto_parallel]: 5.86e-06 [parallel]: 6.26e-06 [flash_sp]: 3.45e-06 [merge_comm]: 3.14001e-06 [allreduce_fusion]: 3.13998e-06 [matmul_add_comm_reduction]: 6.95002e-06 [allreduce_slice_to_reducescatter]: 3.60014e-07 [virtual_shard_identity]: 6.25002e-06 [virtual_dataset]: 5.95002e-06 [get_grad_eliminate_]: 5.47001e-06 [virtual_output]: 5.52001e-06 [merge_forward]: 2.51998e-06 [cell_reuse_recompute_pass]: 1.40999e-06 [offload_activation]: 7.7e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.436e-05 [merge_recompute_call_nodes]: 1.08001e-06 [before_grad]: 9.89001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.53e-06 [meta_fg_expand]: 2.37999e-06 [flash_sp_send_recv_attached]: 1.00999e-06 [receive_attached]: 1.62001e-06 [after_resolve]: 8.62e-06 [a_after_grad]: 8.35999e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.45001e-06 [auto_monad_grad]: 9.50007e-07 [auto_monad_eliminator]: 7.55e-06 [cse]: 1.462e-05 [a_3]: 4.796e-05 [py_interpret_to_execute_after_opt_a]: 1.229e-05 [slice_cell_reuse_recomputed_activation]: 4.58001e-06 [rewriter_after_opt_a]: 4.05e-05 [convert_after_rewriter]: 9.51e-06 [order_py_execute_after_rewriter]: 8.11002e-06 [mutable_eliminate]: 0.00051557 [opt_b]: 0.00028099, [1] [Cycle 1]: 0.00027179, [7] [b_1]: 0.00017688 [b_2]: 7.83999e-06 [updatestate_depend_eliminate]: 5.94999e-06 [updatestate_assign_eliminate]: 2.59001e-06 [updatestate_loads_eliminate]: 2.24999e-06 [renormalize]: 4.39992e-07 [cse]: 1.827e-05 [optimize_parallel_all_gather_comm]: 1.979e-05 [overlap_param_gather]: 4.84e-06 [cconv]: 2.859e-05 [loop_unroll]: 0.00043538 [opt_after_cconv]: 0.00012039, [1] [Cycle 1]: 0.00011192, [7] [c_1]: 2.709e-05 [parameter_eliminate]: 2.53e-06 [updatestate_depend_eliminate]: 5.07e-06 [updatestate_assign_eliminate]: 2.39001e-06 [updatestate_loads_eliminate]: 2.47001e-06 [cse]: 1.817e-05 [renormalize]: 3.39991e-07 [remove_dup_value]: 1.697e-05 [tuple_transform]: 8.409e-05, [1] [Cycle 1]: 7.71e-05, [4] [d_1]: 3.855e-05 [none_parameter_eliminate]: 1.55001e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.86001e-06 [partial_unused_args_eliminate]: 4.43999e-06 [add_recomputation]: 4.726e-05 [cse_after_recomputation]: 2.656e-05, [1] [Cycle 1]: 1.993e-05, [1] [cse]: 1.136e-05 [environ_conv]: 7.75e-06 [swap_dp_allreduce_reducescatter]: 7.75e-06 [bias_add_comm_swap]: 4.95999e-06 [label_micro_interleaved_index]: 6.39999e-06 [label_fine_grained_interleaved_index]: 5.30999e-06 [merge_cast_opt]: 3.81001e-06 [slice_recompute_activation]: 4.4e-06 [micro_interleaved_order_control]: 4.45999e-06 [assign_add_opt]: 3.65998e-06 [ForceFp32Comm]: 3.23e-06 [remove_cast_before_assign_add]: 3.35998e-06 [full_micro_interleaved_order_control]: 4.43999e-06 [reorder_send_recv_between_fp_bp]: 5.20999e-06 [comm_op_add_attrs]: 3.63999e-06 [add_comm_op_reuse_tag]: 3.23e-06 [interleave_split_concat_branches]: 3.66999e-06 [interleave_parallel_branches]: 3.63e-06 [overlap_opt_shard_in_pipeline]: 3.76001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.27998e-06 [control_data_broadcast_order]: 1.43e-05 [grouped_pairwise_exchange_alltoall]: 3.83001e-06 [offloading_packed_experts]: 6.75998e-06 [overlap_recompute_and_grad_model_parallel]: 7.05e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.85e-06 [overlap_recompute_allgather_and_fa_grad]: 3.65998e-06 [overlap_recompute_comm]: 4.85001e-06 [overlap_grad_ring_attention]: 6.43e-06 [overlap_grad_flash_sp]: 2.231e-05 [begin_end_overlap_inline]: 2.94999e-06 [split_matmul_comm_elemetwise]: 4.56002e-06 [split_layernorm_comm]: 4.15e-06 [handle_group_info]: 3.28e-06 [symbol_engine_optimizer]: 9.425e-05, [1] [Cycle 1]: 8.767e-05, [6] [build]: 3.05002e-06 [elim_shapecalc]: 9.69e-06 [elim_not_effective]: 1.303e-05 [opt_reshape]: 6.74999e-06 [fold_const_symbol]: 9.79999e-06 [renormalize]: 2.19996e-07 [detach_backward]: 3.03e-06 [pipeline_parallel_scheduler]: 1.81998e-06 [auto_monad_reorder]: 1.803e-05 [get_jit_bprop_graph]: 1.63002e-06 [rewriter_after_jit_bprop_graph]: 4.05998e-06 [opt_after_jit_grad]: 0.00047393 [validate]: 3.69e-05 Sums bootstrap : 0.000439s : 4.60% type_inference : 0.004908s : 51.49% event_method : 0.000012s : 0.13% auto_monad : 0.000058s : 0.61% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000025s : 0.26% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000020s : 0.21% optimize.rewriter_before_opt_a : 0.000049s : 0.52% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000032s : 0.33% optimize.opt_a.loop_unroll : 0.000019s : 0.20% optimize.opt_a.a_1 : 0.000392s : 4.11% optimize.opt_a.with_stream_mark : 0.000028s : 0.29% optimize.opt_a.recompute_prepare : 0.000014s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.04% optimize.opt_a.a_2 : 0.000205s : 2.15% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.13% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.15% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.25% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.15% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000020s : 0.21% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000530s : 5.56% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.25% optimize.opt_a.cse : 0.000044s : 0.46% optimize.opt_a.a_3 : 0.000110s : 1.15% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000040s : 0.42% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000008s : 0.09% optimize.mutable_eliminate : 0.000516s : 5.41% optimize.opt_b.b_1 : 0.000177s : 1.86% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000029s : 0.30% optimize.loop_unroll : 0.000435s : 4.57% optimize.opt_after_cconv.c_1 : 0.000027s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.18% optimize.tuple_transform.d_1 : 0.000039s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.05% optimize.add_recomputation : 0.000047s : 0.50% optimize.cse_after_recomputation.cse : 0.000011s : 0.12% optimize.environ_conv : 0.000008s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000006s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.06% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.05% optimize.micro_interleaved_order_control : 0.000004s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.04% optimize.full_micro_interleaved_order_control : 0.000004s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000014s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000006s : 0.07% optimize.overlap_grad_flash_sp : 0.000022s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000474s : 4.97% validate : 0.000037s : 0.39% Time group info: ------[substitution.] 0.000117 20 1.81% : 0.000002s : 2: substitution.elim_not_effective 1.13% : 0.000001s : 2: substitution.fold_const_symbol 4.75% : 0.000006s : 3: substitution.graph_param_transform 73.89% : 0.000086s : 2: substitution.inline 3.49% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.90% : 0.000005s : 4: substitution.remove_not_recompute_node 3.11% : 0.000004s : 2: substitution.replace_old_param 7.93% : 0.000009s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004855 2 91.02% : 0.004419s : 1: type_inference.infer 8.98% : 0.000436s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000084 2 100.00% : 0.000084s : 2: match.inline ------[predicate.] 0.000133 754 0.89% : 0.000001s : 7: predicate.accumulaten_eliminater 1.22% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.68% : 0.000001s : 6: predicate.addn_check_dump 0.78% : 0.000001s : 7: predicate.addn_zero_filter 0.66% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.32% : 0.000003s : 13: predicate.arithmetic_simplify 0.78% : 0.000001s : 7: predicate.cast_eliminate 0.68% : 0.000001s : 6: predicate.check_bprop_eliminate 0.66% : 0.000001s : 6: predicate.compare_switch_simplify 0.25% : 0.000000s : 3: predicate.const_output_eliminate 0.75% : 0.000001s : 6: predicate.depend_value_elim 0.74% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.88% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.79% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.18% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 3: predicate.elim_not_effective 0.54% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.03% : 0.000001s : 10: predicate.environ_get_depend_swap 1.74% : 0.000002s : 16: predicate.environ_get_eliminate 1.04% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.97% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.01% : 0.000003s : 9: predicate.float_depend_g_call 0.68% : 0.000001s : 6: predicate.float_environ_get_switch 0.98% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.88% : 0.000001s : 6: predicate.get_grad_eliminate 0.24% : 0.000000s : 3: predicate.graph_param_transform 0.83% : 0.000001s : 6: predicate.incorporate_call 0.69% : 0.000001s : 6: predicate.incorporate_call_switch 6.85% : 0.000009s : 34: predicate.inline 1.02% : 0.000001s : 6: predicate.inline_without_move 0.40% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.09% : 0.000001s : 6: predicate.less_batch_normalization 1.83% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.10% : 0.000003s : 20: predicate.load_eliminater 1.34% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.67% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.90% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.75% : 0.000001s : 6: predicate.merge_addn 0.81% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.69% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.64% : 0.000001s : 7: predicate.minmaximum_grad 1.31% : 0.000002s : 3: predicate.mutable_eliminate 0.42% : 0.000001s : 3: predicate.opt_reshape 0.54% : 0.000001s : 3: predicate.parallel_virtual_node 1.31% : 0.000002s : 9: predicate.partial_defer_inline 1.24% : 0.000002s : 10: predicate.partial_eliminate 0.78% : 0.000001s : 7: predicate.print_const_string_wrapper 0.77% : 0.000001s : 6: predicate.reduce_all_const_elim 1.06% : 0.000001s : 7: predicate.reduce_eliminate 2.03% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.80% : 0.000001s : 6: predicate.remove_not_recompute_node 1.27% : 0.000002s : 13: predicate.replace_applicator 0.69% : 0.000001s : 6: predicate.replace_old_param 0.34% : 0.000000s : 3: predicate.reset_defer_inline 0.85% : 0.000001s : 7: predicate.reshape_eliminate 0.73% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 3: predicate.row_tensor_eliminate 0.94% : 0.000001s : 6: predicate.same_eliminate 0.52% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.12% : 0.000001s : 6: predicate.shard_identity_eliminate 1.02% : 0.000001s : 6: predicate.special_op_eliminate 0.94% : 0.000001s : 6: predicate.specialize_transform 1.10% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.03% : 0.000001s : 9: predicate.switch_defer_inline 1.74% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.50% : 0.000006s : 32: predicate.switch_simplify 0.72% : 0.000001s : 7: predicate.tile_eliminate 0.85% : 0.000001s : 7: predicate.transpose_eliminate 1.62% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.57% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.40% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.52% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.97% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.83% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.51% : 0.000001s : 3: predicate.value_based_eliminate 0.76% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.80% : 0.000001s : 6: predicate.virtual_output_eliminate 0.35% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000219 5 7.83% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.17% : 0.000202s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023841 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.49% : 0.003215s : 1: add_attr 13.43% : 0.003202s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000051s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.28% : 0.000067s : 1: auto_monad 0.11% : 0.000025s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 2.02% : 0.000481s : 1: bootstrap 0.13% : 0.000032s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000017s : 1: control_data_broadcast_order 0.05% : 0.000012s : 1: convert_after_rewriter 0.12% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.08% : 0.000018s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.10% : 0.000024s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.04% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000009s : 1: label_micro_interleaved_index 1.85% : 0.000441s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.19% : 0.000522s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 3.19% : 0.000761s : 78: opt.transform.opt_a 0.11% : 0.000026s : 1: opt.transform.opt_after_cconv 0.10% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.47% : 0.000111s : 28: opt.transform.opt_b 0.18% : 0.000043s : 2: opt.transform.opt_trans_graph 0.15% : 0.000035s : 4: opt.transform.symbol_engine_opt 10.57% : 0.002519s : 1: opt_a 0.52% : 0.000124s : 1: opt_after_cconv 2.03% : 0.000484s : 1: opt_after_jit_grad 1.19% : 0.000285s : 1: opt_b 21.15% : 0.005042s : 1: optimize 0.10% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.11% : 0.000025s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000032s : 1: pre_auto_parallel 0.10% : 0.000024s : 1: py_interpret_to_execute 0.07% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000020s : 1: remove_dup_value 1.22% : 0.000292s : 1: renormalize.infer 0.97% : 0.000231s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000044s : 1: rewriter_after_opt_a 0.22% : 0.000053s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.41% : 0.000097s : 1: symbol_engine_optimizer 0.37% : 0.000087s : 1: tuple_transform 20.71% : 0.004937s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:56.589.897 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0132508, [21] [bootstrap]: 0.00043222 [type_inference]: 0.00479085 [event_method]: 1.118e-05 [auto_monad]: 5.405e-05 [graph_reusing]: 5.20999e-06 [inline]: 2.37001e-06 [add_attr]: 0.00310683, [1] [add_attr_with_inline]: 0.00309789, [1] [Cycle 1]: 5.376e-05, [2] [tag_attr]: 1.391e-05 [meta_addattr_fg_expand]: 3.66001e-06 [parallel-infer-symbol]: 3.27002e-06 [pre_auto_parallel]: 2.597e-05 [insert-virtual-dataset]: 2.91e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 1.83997e-06 [pipeline_split]: 1.70001e-06 [optimize]: 0.00413933, [53] [py_interpret_to_execute]: 1.802e-05 [rewriter_before_opt_a]: 4.518e-05 [opt_a]: 0.00218839, [2] [Cycle 1]: 0.00152827, [45] [expand_dump_flag]: 3.31999e-06 [switch_simplify]: 2.488e-05 [loop_unroll]: 1.362e-05 [a_1]: 0.00028465 [with_stream_mark]: 1.587e-05 [recompute_prepare]: 8.43001e-06 [updatestate_depend_eliminate]: 3.61001e-06 [updatestate_assign_eliminate]: 4.12003e-06 [updatestate_loads_eliminate]: 3.39001e-06 [parameter_eliminate]: 1.78002e-06 [a_2]: 7.783e-05 [accelerated_algorithm]: 6.88e-06 [shard]: 2.14e-06 [meta_shard_fg_expand]: 1.64e-06 [shard_inline]: 5.83002e-06 [merge_send_recv]: 7.93001e-06 [auto_parallel]: 7.36001e-06 [parallel]: 1.954e-05 [flash_sp]: 7.35e-06 [merge_comm]: 3.97e-06 [allreduce_fusion]: 4.74998e-06 [matmul_add_comm_reduction]: 9.32999e-06 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 7.46001e-06 [virtual_dataset]: 6.59001e-06 [get_grad_eliminate_]: 5.65001e-06 [virtual_output]: 6.00002e-06 [merge_forward]: 3.56001e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 9.49e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.261e-05 [merge_recompute_call_nodes]: 1.78002e-06 [before_grad]: 1.019e-05 [set_forward_comm_id_for_comm_node_pass]: 4.19002e-06 [meta_fg_expand]: 2.43e-06 [flash_sp_send_recv_attached]: 2.39999e-06 [receive_attached]: 2.24001e-06 [after_resolve]: 9.14998e-06 [a_after_grad]: 8.83001e-06 [renormalize]: 0.000582 [add_forward_monad_depend]: 4.87e-06 [auto_monad_grad]: 2.31e-06 [auto_monad_eliminator]: 1.564e-05 [cse]: 3.017e-05 [a_3]: 4.449e-05 [Cycle 2]: 0.00065064, [45] [expand_dump_flag]: 1.13001e-06 [switch_simplify]: 7.43999e-06 [loop_unroll]: 5.91e-06 [a_1]: 0.00010644 [with_stream_mark]: 1.067e-05 [recompute_prepare]: 5.81e-06 [updatestate_depend_eliminate]: 3.09001e-06 [updatestate_assign_eliminate]: 2.56998e-06 [updatestate_loads_eliminate]: 2.88998e-06 [parameter_eliminate]: 1.39e-06 [a_2]: 6.931e-05 [accelerated_algorithm]: 5.70001e-06 [shard]: 1.34e-06 [meta_shard_fg_expand]: 1.57001e-06 [shard_inline]: 6.09999e-06 [merge_send_recv]: 5.15999e-06 [auto_parallel]: 5.92999e-06 [parallel]: 4.53999e-06 [flash_sp]: 3.44001e-06 [merge_comm]: 3.13998e-06 [allreduce_fusion]: 3.14001e-06 [matmul_add_comm_reduction]: 6.21998e-06 [allreduce_slice_to_reducescatter]: 4.69998e-07 [virtual_shard_identity]: 6.34001e-06 [virtual_dataset]: 5.50001e-06 [get_grad_eliminate_]: 5.50001e-06 [virtual_output]: 5.19998e-06 [merge_forward]: 2.58003e-06 [cell_reuse_recompute_pass]: 1.83002e-06 [offload_activation]: 6.59999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.228e-05 [merge_recompute_call_nodes]: 7.29982e-07 [before_grad]: 9.58002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.75998e-06 [meta_fg_expand]: 2.46e-06 [flash_sp_send_recv_attached]: 1.28002e-06 [receive_attached]: 1.39e-06 [after_resolve]: 9.91e-06 [a_after_grad]: 8.46002e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.45001e-06 [auto_monad_grad]: 1.28002e-06 [auto_monad_eliminator]: 7.48999e-06 [cse]: 1.569e-05 [a_3]: 3.477e-05 [py_interpret_to_execute_after_opt_a]: 8.58001e-06 [slice_cell_reuse_recomputed_activation]: 1.99999e-06 [rewriter_after_opt_a]: 3.452e-05 [convert_after_rewriter]: 6.29999e-06 [order_py_execute_after_rewriter]: 5.65001e-06 [mutable_eliminate]: 0.00050321 [opt_b]: 0.00021753, [1] [Cycle 1]: 0.00021163, [7] [b_1]: 0.00013525 [b_2]: 7.83999e-06 [updatestate_depend_eliminate]: 5.82001e-06 [updatestate_assign_eliminate]: 2.64999e-06 [updatestate_loads_eliminate]: 2.47001e-06 [renormalize]: 3.39991e-07 [cse]: 2.133e-05 [optimize_parallel_all_gather_comm]: 1.608e-05 [overlap_param_gather]: 1.84e-06 [cconv]: 2.567e-05 [loop_unroll]: 0.00042363 [opt_after_cconv]: 0.00010161, [1] [Cycle 1]: 9.554e-05, [7] [c_1]: 2.723e-05 [parameter_eliminate]: 3.73999e-06 [updatestate_depend_eliminate]: 4.95999e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.19999e-06 [cse]: 1.913e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 1.329e-05 [tuple_transform]: 7.201e-05, [1] [Cycle 1]: 6.759e-05, [4] [d_1]: 3.958e-05 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 7.56001e-06 [partial_unused_args_eliminate]: 1.89999e-06 [add_recomputation]: 4.713e-05 [cse_after_recomputation]: 2.213e-05, [1] [Cycle 1]: 1.739e-05, [1] [cse]: 1.206e-05 [environ_conv]: 4.95001e-06 [swap_dp_allreduce_reducescatter]: 5.39998e-06 [bias_add_comm_swap]: 2.21e-06 [label_micro_interleaved_index]: 4.45999e-06 [label_fine_grained_interleaved_index]: 2.71999e-06 [merge_cast_opt]: 1.34998e-06 [slice_recompute_activation]: 2.16998e-06 [micro_interleaved_order_control]: 2.43e-06 [assign_add_opt]: 1.49e-06 [ForceFp32Comm]: 9.00007e-07 [remove_cast_before_assign_add]: 9.70002e-07 [full_micro_interleaved_order_control]: 1.94e-06 [reorder_send_recv_between_fp_bp]: 2.69001e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.79984e-07 [interleave_split_concat_branches]: 1.20999e-06 [interleave_parallel_branches]: 1.09e-06 [overlap_opt_shard_in_pipeline]: 1.12999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.92001e-06 [control_data_broadcast_order]: 1.281e-05 [grouped_pairwise_exchange_alltoall]: 1.93997e-06 [offloading_packed_experts]: 3.73999e-06 [overlap_recompute_and_grad_model_parallel]: 4.93001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.54e-06 [overlap_recompute_comm]: 2.26e-06 [overlap_grad_ring_attention]: 4.1e-06 [overlap_grad_flash_sp]: 1.914e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 2.19999e-06 [split_layernorm_comm]: 1.87001e-06 [handle_group_info]: 9.90025e-07 [symbol_engine_optimizer]: 7.367e-05, [1] [Cycle 1]: 6.951e-05, [6] [build]: 2.86999e-06 [elim_shapecalc]: 9.69e-06 [elim_not_effective]: 1.234e-05 [opt_reshape]: 6.59999e-06 [fold_const_symbol]: 9.84001e-06 [renormalize]: 2.20025e-07 [detach_backward]: 1.74998e-06 [pipeline_parallel_scheduler]: 1.59e-06 [auto_monad_reorder]: 1.556e-05 [get_jit_bprop_graph]: 1.40999e-06 [rewriter_after_jit_bprop_graph]: 4.93001e-06 [opt_after_jit_grad]: 0.00046422 [validate]: 3.805e-05 Sums bootstrap : 0.000432s : 4.72% type_inference : 0.004791s : 52.26% event_method : 0.000011s : 0.12% auto_monad : 0.000054s : 0.59% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.04% pre_auto_parallel : 0.000026s : 0.28% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000018s : 0.20% optimize.rewriter_before_opt_a : 0.000045s : 0.49% optimize.opt_a.expand_dump_flag : 0.000004s : 0.05% optimize.opt_a.switch_simplify : 0.000032s : 0.35% optimize.opt_a.loop_unroll : 0.000020s : 0.21% optimize.opt_a.a_1 : 0.000391s : 4.27% optimize.opt_a.with_stream_mark : 0.000027s : 0.29% optimize.opt_a.recompute_prepare : 0.000014s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000147s : 1.61% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.14% optimize.opt_a.shard : 0.000003s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.04% optimize.opt_a.shard_inline : 0.000012s : 0.13% optimize.opt_a.merge_send_recv : 0.000013s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.14% optimize.opt_a.parallel : 0.000024s : 0.26% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.15% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000011s : 0.12% optimize.opt_a.merge_forward : 0.000006s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000019s : 0.21% optimize.opt_a.a_after_grad : 0.000017s : 0.19% optimize.opt_a.renormalize : 0.000582s : 6.35% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.25% optimize.opt_a.cse : 0.000046s : 0.50% optimize.opt_a.a_3 : 0.000079s : 0.86% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000035s : 0.38% optimize.convert_after_rewriter : 0.000006s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000503s : 5.49% optimize.opt_b.b_1 : 0.000135s : 1.48% optimize.opt_b.b_2 : 0.000008s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000021s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000026s : 0.28% optimize.loop_unroll : 0.000424s : 4.62% optimize.opt_after_cconv.c_1 : 0.000027s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.14% optimize.tuple_transform.d_1 : 0.000040s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000047s : 0.51% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.06% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.03% optimize.assign_add_opt : 0.000001s : 0.02% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.02% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000019s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000016s : 0.17% get_jit_bprop_graph : 0.000001s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000464s : 5.06% validate : 0.000038s : 0.42% Time group info: ------[substitution.] 0.000118 20 1.61% : 0.000002s : 2: substitution.elim_not_effective 1.16% : 0.000001s : 2: substitution.fold_const_symbol 4.64% : 0.000005s : 3: substitution.graph_param_transform 74.04% : 0.000087s : 2: substitution.inline 2.76% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.60% : 0.000004s : 4: substitution.remove_not_recompute_node 2.93% : 0.000003s : 2: substitution.replace_old_param 9.27% : 0.000011s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004743 2 91.30% : 0.004331s : 1: type_inference.infer 8.70% : 0.000413s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000085 2 100.00% : 0.000085s : 2: match.inline ------[predicate.] 0.000131 754 0.83% : 0.000001s : 7: predicate.accumulaten_eliminater 1.08% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.68% : 0.000001s : 6: predicate.addn_check_dump 0.88% : 0.000001s : 7: predicate.addn_zero_filter 0.69% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.17% : 0.000003s : 13: predicate.arithmetic_simplify 0.81% : 0.000001s : 7: predicate.cast_eliminate 0.72% : 0.000001s : 6: predicate.check_bprop_eliminate 0.69% : 0.000001s : 6: predicate.compare_switch_simplify 0.24% : 0.000000s : 3: predicate.const_output_eliminate 0.72% : 0.000001s : 6: predicate.depend_value_elim 0.81% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.87% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.76% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.25% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.27% : 0.000000s : 3: predicate.elim_not_effective 0.70% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.04% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.04% : 0.000001s : 10: predicate.environ_get_depend_swap 1.78% : 0.000002s : 16: predicate.environ_get_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.95% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.87% : 0.000002s : 9: predicate.float_depend_g_call 0.61% : 0.000001s : 6: predicate.float_environ_get_switch 0.95% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.85% : 0.000001s : 6: predicate.get_grad_eliminate 0.34% : 0.000000s : 3: predicate.graph_param_transform 0.76% : 0.000001s : 6: predicate.incorporate_call 0.63% : 0.000001s : 6: predicate.incorporate_call_switch 6.67% : 0.000009s : 34: predicate.inline 1.14% : 0.000001s : 6: predicate.inline_without_move 0.37% : 0.000000s : 6: predicate.j_node_and_user_rematch 1.17% : 0.000002s : 6: predicate.less_batch_normalization 1.81% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.07% : 0.000003s : 20: predicate.load_eliminater 1.38% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.69% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.62% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.65% : 0.000001s : 6: predicate.merge_addn 0.65% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.72% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.71% : 0.000001s : 7: predicate.minmaximum_grad 1.79% : 0.000002s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.56% : 0.000001s : 3: predicate.parallel_virtual_node 1.26% : 0.000002s : 9: predicate.partial_defer_inline 1.25% : 0.000002s : 10: predicate.partial_eliminate 0.94% : 0.000001s : 7: predicate.print_const_string_wrapper 0.72% : 0.000001s : 6: predicate.reduce_all_const_elim 1.03% : 0.000001s : 7: predicate.reduce_eliminate 2.09% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.60% : 0.000001s : 6: predicate.remove_not_recompute_node 1.18% : 0.000002s : 13: predicate.replace_applicator 0.69% : 0.000001s : 6: predicate.replace_old_param 0.32% : 0.000000s : 3: predicate.reset_defer_inline 0.80% : 0.000001s : 7: predicate.reshape_eliminate 0.73% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 3: predicate.row_tensor_eliminate 1.04% : 0.000001s : 6: predicate.same_eliminate 0.52% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.04% : 0.000001s : 6: predicate.shard_identity_eliminate 0.95% : 0.000001s : 6: predicate.special_op_eliminate 0.92% : 0.000001s : 6: predicate.specialize_transform 1.14% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.04% : 0.000001s : 9: predicate.switch_defer_inline 1.72% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.36% : 0.000006s : 32: predicate.switch_simplify 0.74% : 0.000001s : 7: predicate.tile_eliminate 0.85% : 0.000001s : 7: predicate.transpose_eliminate 1.53% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.15% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.48% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.34% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.50% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.98% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 3.11% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.71% : 0.000001s : 3: predicate.value_based_eliminate 0.86% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.85% : 0.000001s : 6: predicate.virtual_output_eliminate 0.32% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000211 5 8.22% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.78% : 0.000194s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.021971 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.16% : 0.003112s : 1: add_attr 14.12% : 0.003101s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000051s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000059s : 1: auto_monad 0.09% : 0.000019s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.10% : 0.000461s : 1: bootstrap 0.13% : 0.000029s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000008s : 1: environ_conv 0.08% : 0.000018s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.97% : 0.000432s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.33% : 0.000511s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000015s : 1: opt.transform.mutable_eliminate 3.40% : 0.000748s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.10% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.50% : 0.000111s : 28: opt.transform.opt_b 0.20% : 0.000045s : 2: opt.transform.opt_trans_graph 0.16% : 0.000035s : 4: opt.transform.symbol_engine_opt 9.97% : 0.002191s : 1: opt_a 0.48% : 0.000105s : 1: opt_after_cconv 2.15% : 0.000473s : 1: opt_after_jit_grad 1.01% : 0.000221s : 1: opt_b 18.86% : 0.004144s : 1: optimize 0.09% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000022s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000003s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.14% : 0.000030s : 1: pre_auto_parallel 0.10% : 0.000021s : 1: py_interpret_to_execute 0.05% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000017s : 1: remove_dup_value 1.63% : 0.000359s : 1: renormalize.infer 0.98% : 0.000215s : 1: renormalize.specialize 0.03% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000038s : 1: rewriter_after_opt_a 0.22% : 0.000049s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000077s : 1: symbol_engine_optimizer 0.34% : 0.000075s : 1: tuple_transform 21.87% : 0.004805s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:56.782.662 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:56.782.939 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0155213, [21] [bootstrap]: 0.0004427 [type_inference]: 0.00528186 [event_method]: 1.218e-05 [auto_monad]: 5.826e-05 [graph_reusing]: 5.57999e-06 [inline]: 2.58998e-06 [add_attr]: 0.00350323, [1] [add_attr_with_inline]: 0.00349367, [1] [Cycle 1]: 7.871e-05, [2] [tag_attr]: 1.517e-05 [meta_addattr_fg_expand]: 4.11001e-06 [parallel-infer-symbol]: 3.33998e-06 [pre_auto_parallel]: 2.743e-05 [insert-virtual-dataset]: 2.63e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 2.18998e-06 [pipeline_split]: 1.65001e-06 [optimize]: 0.00496745, [53] [py_interpret_to_execute]: 2.127e-05 [rewriter_before_opt_a]: 5.252e-05 [opt_a]: 0.00255501, [2] [Cycle 1]: 0.0017451, [45] [expand_dump_flag]: 3.08998e-06 [switch_simplify]: 2.425e-05 [loop_unroll]: 1.376e-05 [a_1]: 0.00030791 [with_stream_mark]: 1.72e-05 [recompute_prepare]: 8.03001e-06 [updatestate_depend_eliminate]: 4.23999e-06 [updatestate_assign_eliminate]: 3.75e-06 [updatestate_loads_eliminate]: 2.85002e-06 [parameter_eliminate]: 1.86998e-06 [a_2]: 0.00010805 [accelerated_algorithm]: 6.83e-06 [shard]: 2.19001e-06 [meta_shard_fg_expand]: 1.96e-06 [shard_inline]: 6.58e-06 [merge_send_recv]: 9.82999e-06 [auto_parallel]: 6.94001e-06 [parallel]: 1.836e-05 [flash_sp]: 9.36e-06 [merge_comm]: 3.66999e-06 [allreduce_fusion]: 3.71001e-06 [matmul_add_comm_reduction]: 1.057e-05 [allreduce_slice_to_reducescatter]: 8.49977e-07 [virtual_shard_identity]: 8.51002e-06 [virtual_dataset]: 6.29001e-06 [get_grad_eliminate_]: 6.14999e-06 [virtual_output]: 5.76e-06 [merge_forward]: 4.28999e-06 [cell_reuse_recompute_pass]: 1.30001e-06 [offload_activation]: 1.081e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.563e-05 [merge_recompute_call_nodes]: 1.42999e-06 [before_grad]: 1.078e-05 [set_forward_comm_id_for_comm_node_pass]: 4.12003e-06 [meta_fg_expand]: 2.68e-06 [flash_sp_send_recv_attached]: 2.76e-06 [receive_attached]: 2.38998e-06 [after_resolve]: 1.017e-05 [a_after_grad]: 1.003e-05 [renormalize]: 0.00055888 [add_forward_monad_depend]: 6.69999e-06 [auto_monad_grad]: 2.22999e-06 [auto_monad_eliminator]: 1.571e-05 [cse]: 2.997e-05 [a_3]: 6.377e-05 [Cycle 2]: 0.00079607, [45] [expand_dump_flag]: 1.18001e-06 [switch_simplify]: 7.35e-06 [loop_unroll]: 6.11e-06 [a_1]: 0.00010703 [with_stream_mark]: 1.26e-05 [recompute_prepare]: 6.19001e-06 [updatestate_depend_eliminate]: 3.16001e-06 [updatestate_assign_eliminate]: 2.59999e-06 [updatestate_loads_eliminate]: 3.02002e-06 [parameter_eliminate]: 1.12999e-06 [a_2]: 9.734e-05 [accelerated_algorithm]: 5.81998e-06 [shard]: 1.40999e-06 [meta_shard_fg_expand]: 1.52999e-06 [shard_inline]: 6.28998e-06 [merge_send_recv]: 5.69e-06 [auto_parallel]: 6.18002e-06 [parallel]: 5.38002e-06 [flash_sp]: 3.46999e-06 [merge_comm]: 3.48999e-06 [allreduce_fusion]: 3.48e-06 [matmul_add_comm_reduction]: 6.54001e-06 [allreduce_slice_to_reducescatter]: 4.50003e-07 [virtual_shard_identity]: 6.72002e-06 [virtual_dataset]: 6.51e-06 [get_grad_eliminate_]: 5.48997e-06 [virtual_output]: 5.50001e-06 [merge_forward]: 3.26001e-06 [cell_reuse_recompute_pass]: 1.51002e-06 [offload_activation]: 7.41001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.565e-05 [merge_recompute_call_nodes]: 1.10999e-06 [before_grad]: 9.39998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.74002e-06 [meta_fg_expand]: 2.31e-06 [flash_sp_send_recv_attached]: 1.32e-06 [receive_attached]: 1.09998e-06 [after_resolve]: 9.02e-06 [a_after_grad]: 8.11002e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.37e-06 [auto_monad_grad]: 8.89995e-07 [auto_monad_eliminator]: 7.51999e-06 [cse]: 1.701e-05 [a_3]: 4.864e-05 [py_interpret_to_execute_after_opt_a]: 1.411e-05 [slice_cell_reuse_recomputed_activation]: 5.27001e-06 [rewriter_after_opt_a]: 4.101e-05 [convert_after_rewriter]: 9.44e-06 [order_py_execute_after_rewriter]: 8.98002e-06 [mutable_eliminate]: 0.00056921 [opt_b]: 0.00032461, [1] [Cycle 1]: 0.00031501, [7] [b_1]: 0.00020825 [b_2]: 8.14002e-06 [updatestate_depend_eliminate]: 7.85998e-06 [updatestate_assign_eliminate]: 2.78e-06 [updatestate_loads_eliminate]: 2.60002e-06 [renormalize]: 7.89994e-07 [cse]: 2.469e-05 [optimize_parallel_all_gather_comm]: 2.132e-05 [overlap_param_gather]: 4.53001e-06 [cconv]: 3.144e-05 [loop_unroll]: 0.00048145 [opt_after_cconv]: 0.00012931, [1] [Cycle 1]: 0.00012053, [7] [c_1]: 2.924e-05 [parameter_eliminate]: 3.46001e-06 [updatestate_depend_eliminate]: 5.79e-06 [updatestate_assign_eliminate]: 2.70002e-06 [updatestate_loads_eliminate]: 2.48e-06 [cse]: 2.124e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.674e-05 [tuple_transform]: 0.00010754, [1] [Cycle 1]: 0.0001002, [4] [d_1]: 3.909e-05 [none_parameter_eliminate]: 1.53002e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 7.94002e-06 [partial_unused_args_eliminate]: 4.43001e-06 [add_recomputation]: 5.182e-05 [cse_after_recomputation]: 2.82e-05, [1] [Cycle 1]: 2.14e-05, [1] [cse]: 1.252e-05 [environ_conv]: 8.55001e-06 [swap_dp_allreduce_reducescatter]: 7.85e-06 [bias_add_comm_swap]: 5.25999e-06 [label_micro_interleaved_index]: 8.33999e-06 [label_fine_grained_interleaved_index]: 5.05001e-06 [merge_cast_opt]: 3.97e-06 [slice_recompute_activation]: 4.41002e-06 [micro_interleaved_order_control]: 4.65001e-06 [assign_add_opt]: 3.8e-06 [ForceFp32Comm]: 3.09999e-06 [remove_cast_before_assign_add]: 3.26999e-06 [full_micro_interleaved_order_control]: 4.52e-06 [reorder_send_recv_between_fp_bp]: 5.05001e-06 [comm_op_add_attrs]: 3.61999e-06 [add_comm_op_reuse_tag]: 3.61001e-06 [interleave_split_concat_branches]: 3.51999e-06 [interleave_parallel_branches]: 3.65e-06 [overlap_opt_shard_in_pipeline]: 3.52997e-06 [overlap_opt_shard_grad_in_pipeline]: 4.13999e-06 [control_data_broadcast_order]: 1.535e-05 [grouped_pairwise_exchange_alltoall]: 4.2e-06 [offloading_packed_experts]: 6.55002e-06 [overlap_recompute_and_grad_model_parallel]: 7.48999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.70998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.73001e-06 [overlap_recompute_comm]: 4.73001e-06 [overlap_grad_ring_attention]: 7.03e-06 [overlap_grad_flash_sp]: 2.158e-05 [begin_end_overlap_inline]: 2.95002e-06 [split_matmul_comm_elemetwise]: 4.67998e-06 [split_layernorm_comm]: 3.97998e-06 [handle_group_info]: 3.43999e-06 [symbol_engine_optimizer]: 9.823e-05, [1] [Cycle 1]: 9.075e-05, [6] [build]: 3.22002e-06 [elim_shapecalc]: 9.63002e-06 [elim_not_effective]: 1.339e-05 [opt_reshape]: 7.40003e-06 [fold_const_symbol]: 1.071e-05 [renormalize]: 1.69995e-07 [detach_backward]: 4.48999e-06 [pipeline_parallel_scheduler]: 1.71002e-06 [auto_monad_reorder]: 1.98e-05 [get_jit_bprop_graph]: 1.76e-06 [rewriter_after_jit_bprop_graph]: 5.29e-06 [opt_after_jit_grad]: 0.0005368 [validate]: 3.802e-05 Sums bootstrap : 0.000443s : 4.33% type_inference : 0.005282s : 51.66% event_method : 0.000012s : 0.12% auto_monad : 0.000058s : 0.57% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000027s : 0.27% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000021s : 0.21% optimize.rewriter_before_opt_a : 0.000053s : 0.51% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000032s : 0.31% optimize.opt_a.loop_unroll : 0.000020s : 0.19% optimize.opt_a.a_1 : 0.000415s : 4.06% optimize.opt_a.with_stream_mark : 0.000030s : 0.29% optimize.opt_a.recompute_prepare : 0.000014s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000205s : 2.01% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.12% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000016s : 0.15% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.23% optimize.opt_a.flash_sp : 0.000013s : 0.13% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.15% optimize.opt_a.virtual_dataset : 0.000013s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.11% optimize.opt_a.virtual_output : 0.000011s : 0.11% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000019s : 0.19% optimize.opt_a.a_after_grad : 0.000018s : 0.18% optimize.opt_a.renormalize : 0.000559s : 5.47% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.23% optimize.opt_a.cse : 0.000047s : 0.46% optimize.opt_a.a_3 : 0.000112s : 1.10% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000041s : 0.40% optimize.convert_after_rewriter : 0.000009s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000569s : 5.57% optimize.opt_b.b_1 : 0.000208s : 2.04% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000025s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000031s : 0.31% optimize.loop_unroll : 0.000481s : 4.71% optimize.opt_after_cconv.c_1 : 0.000029s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000021s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.16% optimize.tuple_transform.d_1 : 0.000039s : 0.38% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000052s : 0.51% optimize.cse_after_recomputation.cse : 0.000013s : 0.12% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.08% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000004s : 0.04% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000015s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000022s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000537s : 5.25% validate : 0.000038s : 0.37% Time group info: ------[substitution.] 0.000149 20 1.35% : 0.000002s : 2: substitution.elim_not_effective 1.09% : 0.000002s : 2: substitution.fold_const_symbol 3.56% : 0.000005s : 3: substitution.graph_param_transform 63.03% : 0.000094s : 2: substitution.inline 2.63% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.98% : 0.000004s : 4: substitution.remove_not_recompute_node 2.46% : 0.000004s : 2: substitution.replace_old_param 22.89% : 0.000034s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005225 2 90.40% : 0.004724s : 1: type_inference.infer 9.60% : 0.000501s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000092 2 100.00% : 0.000092s : 2: match.inline ------[predicate.] 0.000140 754 0.81% : 0.000001s : 7: predicate.accumulaten_eliminater 1.17% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.69% : 0.000001s : 6: predicate.addn_check_dump 0.88% : 0.000001s : 7: predicate.addn_zero_filter 0.66% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.44% : 0.000003s : 13: predicate.arithmetic_simplify 0.87% : 0.000001s : 7: predicate.cast_eliminate 0.76% : 0.000001s : 6: predicate.check_bprop_eliminate 0.68% : 0.000001s : 6: predicate.compare_switch_simplify 0.26% : 0.000000s : 3: predicate.const_output_eliminate 0.75% : 0.000001s : 6: predicate.depend_value_elim 0.74% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.97% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.69% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.39% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 3: predicate.elim_not_effective 0.49% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.04% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.95% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_depend_swap 1.68% : 0.000002s : 16: predicate.environ_get_eliminate 1.01% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.90% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.93% : 0.000003s : 9: predicate.float_depend_g_call 0.67% : 0.000001s : 6: predicate.float_environ_get_switch 0.92% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 3: predicate.fold_const_symbol 0.81% : 0.000001s : 6: predicate.get_grad_eliminate 0.28% : 0.000000s : 3: predicate.graph_param_transform 0.74% : 0.000001s : 6: predicate.incorporate_call 0.66% : 0.000001s : 6: predicate.incorporate_call_switch 6.21% : 0.000009s : 34: predicate.inline 0.94% : 0.000001s : 6: predicate.inline_without_move 0.39% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.97% : 0.000001s : 6: predicate.less_batch_normalization 1.69% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.95% : 0.000003s : 20: predicate.load_eliminater 1.37% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.64% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.82% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.66% : 0.000001s : 6: predicate.merge_addn 0.71% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.71% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.64% : 0.000001s : 7: predicate.minmaximum_grad 1.88% : 0.000003s : 3: predicate.mutable_eliminate 0.43% : 0.000001s : 3: predicate.opt_reshape 0.64% : 0.000001s : 3: predicate.parallel_virtual_node 1.35% : 0.000002s : 9: predicate.partial_defer_inline 1.17% : 0.000002s : 10: predicate.partial_eliminate 0.75% : 0.000001s : 7: predicate.print_const_string_wrapper 0.74% : 0.000001s : 6: predicate.reduce_all_const_elim 1.21% : 0.000002s : 7: predicate.reduce_eliminate 1.98% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.86% : 0.000001s : 6: predicate.remove_not_recompute_node 1.17% : 0.000002s : 13: predicate.replace_applicator 0.64% : 0.000001s : 6: predicate.replace_old_param 0.74% : 0.000001s : 3: predicate.reset_defer_inline 0.76% : 0.000001s : 7: predicate.reshape_eliminate 0.72% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 3: predicate.row_tensor_eliminate 1.47% : 0.000002s : 6: predicate.same_eliminate 0.57% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.14% : 0.000002s : 6: predicate.shard_identity_eliminate 0.79% : 0.000001s : 6: predicate.special_op_eliminate 0.95% : 0.000001s : 6: predicate.specialize_transform 1.42% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 1.34% : 0.000002s : 6: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.94% : 0.000001s : 9: predicate.switch_defer_inline 1.61% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.45% : 0.000006s : 32: predicate.switch_simplify 0.89% : 0.000001s : 7: predicate.tile_eliminate 0.78% : 0.000001s : 7: predicate.transpose_eliminate 1.54% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.71% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.00% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.33% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.50% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.47% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.85% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.74% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.63% : 0.000001s : 3: predicate.value_based_eliminate 0.76% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.74% : 0.000001s : 6: predicate.virtual_output_eliminate 0.41% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.63% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000249 5 7.41% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.59% : 0.000231s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025515 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.77% : 0.003513s : 1: add_attr 13.71% : 0.003497s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000055s : 1: add_recomputation 0.03% : 0.000007s : 1: assign_add_opt 0.26% : 0.000066s : 1: auto_monad 0.11% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.91% : 0.000487s : 1: bootstrap 0.14% : 0.000034s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000012s : 1: convert_after_rewriter 0.12% : 0.000031s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000021s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.09% : 0.000023s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.03% : 0.000007s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.91% : 0.000488s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.26% : 0.000576s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000020s : 1: opt.transform.mutable_eliminate 3.08% : 0.000787s : 78: opt.transform.opt_a 0.11% : 0.000028s : 1: opt.transform.opt_after_cconv 0.10% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.55% : 0.000140s : 28: opt.transform.opt_b 0.18% : 0.000045s : 2: opt.transform.opt_trans_graph 0.15% : 0.000038s : 4: opt.transform.symbol_engine_opt 10.03% : 0.002559s : 1: opt_a 0.52% : 0.000133s : 1: opt_after_cconv 2.15% : 0.000548s : 1: opt_after_jit_grad 1.29% : 0.000328s : 1: opt_b 20.86% : 0.005324s : 1: optimize 0.10% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000035s : 1: pre_auto_parallel 0.10% : 0.000025s : 1: py_interpret_to_execute 0.07% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000020s : 1: remove_dup_value 1.18% : 0.000302s : 1: renormalize.infer 0.97% : 0.000249s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000045s : 1: rewriter_after_opt_a 0.22% : 0.000056s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000101s : 1: symbol_engine_optimizer 0.43% : 0.000111s : 1: tuple_transform 20.84% : 0.005318s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:56.977.073 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0134334, [21] [bootstrap]: 0.00043687 [type_inference]: 0.00495709 [event_method]: 1.178e-05 [auto_monad]: 5.708e-05 [graph_reusing]: 5.14e-06 [inline]: 2.46e-06 [add_attr]: 0.00318175, [1] [add_attr_with_inline]: 0.0031724, [1] [Cycle 1]: 5.1e-05, [2] [tag_attr]: 1.4e-05 [meta_addattr_fg_expand]: 3.31999e-06 [parallel-infer-symbol]: 2.96001e-06 [pre_auto_parallel]: 2.709e-05 [insert-virtual-dataset]: 2.29999e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 1.97001e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.00406719, [53] [py_interpret_to_execute]: 1.903e-05 [rewriter_before_opt_a]: 4.599e-05 [opt_a]: 0.00208744, [2] [Cycle 1]: 0.00145856, [45] [expand_dump_flag]: 3.21999e-06 [switch_simplify]: 2.586e-05 [loop_unroll]: 1.358e-05 [a_1]: 0.00029035 [with_stream_mark]: 1.768e-05 [recompute_prepare]: 7.9e-06 [updatestate_depend_eliminate]: 3.48999e-06 [updatestate_assign_eliminate]: 3.35e-06 [updatestate_loads_eliminate]: 3.46001e-06 [parameter_eliminate]: 1.91e-06 [a_2]: 8.34e-05 [accelerated_algorithm]: 6.23e-06 [shard]: 2.26e-06 [meta_shard_fg_expand]: 1.69998e-06 [shard_inline]: 5.89e-06 [merge_send_recv]: 7.84002e-06 [auto_parallel]: 6.21998e-06 [parallel]: 1.743e-05 [flash_sp]: 7.61001e-06 [merge_comm]: 3.83001e-06 [allreduce_fusion]: 3.45003e-06 [matmul_add_comm_reduction]: 9.87001e-06 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 6.98998e-06 [virtual_dataset]: 6.21e-06 [get_grad_eliminate_]: 5.59e-06 [virtual_output]: 5.48002e-06 [merge_forward]: 3.65e-06 [cell_reuse_recompute_pass]: 1.16997e-06 [offload_activation]: 9.91e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.328e-05 [merge_recompute_call_nodes]: 1.37e-06 [before_grad]: 9.97001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.53e-06 [meta_fg_expand]: 2.60002e-06 [flash_sp_send_recv_attached]: 2.64001e-06 [receive_attached]: 1.99999e-06 [after_resolve]: 9.36e-06 [a_after_grad]: 8.90999e-06 [renormalize]: 0.00050016 [add_forward_monad_depend]: 4.75001e-06 [auto_monad_grad]: 2.25002e-06 [auto_monad_eliminator]: 1.534e-05 [cse]: 3.052e-05 [a_3]: 4.726e-05 [Cycle 2]: 0.00061847, [45] [expand_dump_flag]: 1.19e-06 [switch_simplify]: 6.91999e-06 [loop_unroll]: 5.96998e-06 [a_1]: 0.00010863 [with_stream_mark]: 1.193e-05 [recompute_prepare]: 6.40002e-06 [updatestate_depend_eliminate]: 3.38999e-06 [updatestate_assign_eliminate]: 2.51998e-06 [updatestate_loads_eliminate]: 3.01001e-06 [parameter_eliminate]: 1.39e-06 [a_2]: 7.01e-05 [accelerated_algorithm]: 6.10002e-06 [shard]: 1.28002e-06 [meta_shard_fg_expand]: 1.55001e-06 [shard_inline]: 6.27001e-06 [merge_send_recv]: 5.67999e-06 [auto_parallel]: 5.76998e-06 [parallel]: 5.38002e-06 [flash_sp]: 4.19002e-06 [merge_comm]: 3.09999e-06 [allreduce_fusion]: 3.05998e-06 [matmul_add_comm_reduction]: 6.41e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 6.54001e-06 [virtual_dataset]: 5.64998e-06 [get_grad_eliminate_]: 5.50001e-06 [virtual_output]: 5.25999e-06 [merge_forward]: 2.74999e-06 [cell_reuse_recompute_pass]: 1.54e-06 [offload_activation]: 6.83e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.311e-05 [merge_recompute_call_nodes]: 1.00001e-06 [before_grad]: 9.54e-06 [set_forward_comm_id_for_comm_node_pass]: 3.66999e-06 [meta_fg_expand]: 2.15002e-06 [flash_sp_send_recv_attached]: 8.59989e-07 [receive_attached]: 1.02998e-06 [after_resolve]: 8.45001e-06 [a_after_grad]: 7.98001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.33002e-06 [auto_monad_grad]: 8.29983e-07 [auto_monad_eliminator]: 7.08998e-06 [cse]: 1.506e-05 [a_3]: 3.445e-05 [py_interpret_to_execute_after_opt_a]: 9.14e-06 [slice_cell_reuse_recomputed_activation]: 1.95001e-06 [rewriter_after_opt_a]: 3.435e-05 [convert_after_rewriter]: 6.55002e-06 [order_py_execute_after_rewriter]: 5.36002e-06 [mutable_eliminate]: 0.00049731 [opt_b]: 0.00024125, [1] [Cycle 1]: 0.00023509, [7] [b_1]: 0.00015663 [b_2]: 8.03001e-06 [updatestate_depend_eliminate]: 6.16998e-06 [updatestate_assign_eliminate]: 2.49001e-06 [updatestate_loads_eliminate]: 2.44001e-06 [renormalize]: 9.09989e-07 [cse]: 2.091e-05 [optimize_parallel_all_gather_comm]: 1.658e-05 [overlap_param_gather]: 2.07001e-06 [cconv]: 2.733e-05 [loop_unroll]: 0.00043086 [opt_after_cconv]: 0.00010065, [1] [Cycle 1]: 9.468e-05, [7] [c_1]: 2.733e-05 [parameter_eliminate]: 3.38e-06 [updatestate_depend_eliminate]: 5.84e-06 [updatestate_assign_eliminate]: 2.39001e-06 [updatestate_loads_eliminate]: 2.69999e-06 [cse]: 1.865e-05 [renormalize]: 2.80008e-07 [remove_dup_value]: 1.421e-05 [tuple_transform]: 7.023e-05, [1] [Cycle 1]: 6.555e-05, [4] [d_1]: 3.869e-05 [none_parameter_eliminate]: 1.51998e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.90998e-06 [partial_unused_args_eliminate]: 1.96e-06 [add_recomputation]: 4.559e-05 [cse_after_recomputation]: 2.168e-05, [1] [Cycle 1]: 1.678e-05, [1] [cse]: 1.144e-05 [environ_conv]: 5.41002e-06 [swap_dp_allreduce_reducescatter]: 5.03002e-06 [bias_add_comm_swap]: 2.64999e-06 [label_micro_interleaved_index]: 4.72e-06 [label_fine_grained_interleaved_index]: 2.54001e-06 [merge_cast_opt]: 1.34e-06 [slice_recompute_activation]: 1.94e-06 [micro_interleaved_order_control]: 2.53e-06 [assign_add_opt]: 1.15999e-06 [ForceFp32Comm]: 1.00001e-06 [remove_cast_before_assign_add]: 1.35999e-06 [full_micro_interleaved_order_control]: 2.28998e-06 [reorder_send_recv_between_fp_bp]: 2.61e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.50999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.80001e-06 [control_data_broadcast_order]: 1.209e-05 [grouped_pairwise_exchange_alltoall]: 1.91998e-06 [offloading_packed_experts]: 4.04002e-06 [overlap_recompute_and_grad_model_parallel]: 4.51002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.34998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.47999e-06 [overlap_recompute_comm]: 2.48e-06 [overlap_grad_ring_attention]: 4.12e-06 [overlap_grad_flash_sp]: 1.828e-05 [begin_end_overlap_inline]: 5.79981e-07 [split_matmul_comm_elemetwise]: 2.28002e-06 [split_layernorm_comm]: 1.60001e-06 [handle_group_info]: 1.05999e-06 [symbol_engine_optimizer]: 7.403e-05, [1] [Cycle 1]: 6.962e-05, [6] [build]: 3.03998e-06 [elim_shapecalc]: 9.39e-06 [elim_not_effective]: 1.284e-05 [opt_reshape]: 7.04001e-06 [fold_const_symbol]: 9.47999e-06 [renormalize]: 2.20025e-07 [detach_backward]: 1.87001e-06 [pipeline_parallel_scheduler]: 1.72999e-06 [auto_monad_reorder]: 1.691e-05 [get_jit_bprop_graph]: 1.87999e-06 [rewriter_after_jit_bprop_graph]: 4.07003e-06 [opt_after_jit_grad]: 0.00046885 [validate]: 3.592e-05 Sums bootstrap : 0.000437s : 4.70% type_inference : 0.004957s : 53.29% event_method : 0.000012s : 0.13% auto_monad : 0.000057s : 0.61% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000003s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000027s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000019s : 0.20% optimize.rewriter_before_opt_a : 0.000046s : 0.49% optimize.opt_a.expand_dump_flag : 0.000004s : 0.05% optimize.opt_a.switch_simplify : 0.000033s : 0.35% optimize.opt_a.loop_unroll : 0.000020s : 0.21% optimize.opt_a.a_1 : 0.000399s : 4.29% optimize.opt_a.with_stream_mark : 0.000030s : 0.32% optimize.opt_a.recompute_prepare : 0.000014s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.04% optimize.opt_a.a_2 : 0.000153s : 1.65% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.13% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.15% optimize.opt_a.auto_parallel : 0.000012s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.25% optimize.opt_a.flash_sp : 0.000012s : 0.13% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.15% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000011s : 0.12% optimize.opt_a.merge_forward : 0.000006s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000018s : 0.19% optimize.opt_a.a_after_grad : 0.000017s : 0.18% optimize.opt_a.renormalize : 0.000500s : 5.38% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.24% optimize.opt_a.cse : 0.000046s : 0.49% optimize.opt_a.a_3 : 0.000082s : 0.88% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000034s : 0.37% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.06% optimize.mutable_eliminate : 0.000497s : 5.35% optimize.opt_b.b_1 : 0.000157s : 1.68% optimize.opt_b.b_2 : 0.000008s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000021s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000027s : 0.29% optimize.loop_unroll : 0.000431s : 4.63% optimize.opt_after_cconv.c_1 : 0.000027s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000019s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.15% optimize.tuple_transform.d_1 : 0.000039s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000046s : 0.49% optimize.cse_after_recomputation.cse : 0.000011s : 0.12% optimize.environ_conv : 0.000005s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.03% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.02% optimize.overlap_recompute_comm : 0.000002s : 0.03% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000018s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000017s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000469s : 5.04% validate : 0.000036s : 0.39% Time group info: ------[substitution.] 0.000137 20 1.45% : 0.000002s : 2: substitution.elim_not_effective 0.95% : 0.000001s : 2: substitution.fold_const_symbol 3.89% : 0.000005s : 3: substitution.graph_param_transform 63.70% : 0.000087s : 2: substitution.inline 2.49% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.53% : 0.000005s : 4: substitution.remove_not_recompute_node 1.84% : 0.000003s : 2: substitution.replace_old_param 22.16% : 0.000030s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004908 2 91.26% : 0.004479s : 1: type_inference.infer 8.74% : 0.000429s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000085 2 100.00% : 0.000085s : 2: match.inline ------[predicate.] 0.000133 754 1.08% : 0.000001s : 7: predicate.accumulaten_eliminater 1.20% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.78% : 0.000001s : 6: predicate.addn_check_dump 0.85% : 0.000001s : 7: predicate.addn_zero_filter 0.65% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.19% : 0.000003s : 13: predicate.arithmetic_simplify 0.96% : 0.000001s : 7: predicate.cast_eliminate 1.08% : 0.000001s : 6: predicate.check_bprop_eliminate 0.79% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.74% : 0.000001s : 6: predicate.depend_value_elim 0.89% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 1.01% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.80% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.23% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.36% : 0.000000s : 3: predicate.elim_not_effective 0.57% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_depend_swap 1.81% : 0.000002s : 16: predicate.environ_get_eliminate 1.04% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.98% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.93% : 0.000003s : 9: predicate.float_depend_g_call 0.70% : 0.000001s : 6: predicate.float_environ_get_switch 0.96% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 3: predicate.fold_const_symbol 0.87% : 0.000001s : 6: predicate.get_grad_eliminate 0.26% : 0.000000s : 3: predicate.graph_param_transform 0.77% : 0.000001s : 6: predicate.incorporate_call 0.65% : 0.000001s : 6: predicate.incorporate_call_switch 6.29% : 0.000008s : 34: predicate.inline 1.02% : 0.000001s : 6: predicate.inline_without_move 0.41% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.02% : 0.000001s : 6: predicate.less_batch_normalization 1.62% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.99% : 0.000003s : 20: predicate.load_eliminater 1.40% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.73% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.80% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.80% : 0.000001s : 6: predicate.merge_addn 0.76% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.66% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.70% : 0.000001s : 7: predicate.minmaximum_grad 1.25% : 0.000002s : 3: predicate.mutable_eliminate 0.44% : 0.000001s : 3: predicate.opt_reshape 0.60% : 0.000001s : 3: predicate.parallel_virtual_node 1.32% : 0.000002s : 9: predicate.partial_defer_inline 1.23% : 0.000002s : 10: predicate.partial_eliminate 0.79% : 0.000001s : 7: predicate.print_const_string_wrapper 0.75% : 0.000001s : 6: predicate.reduce_all_const_elim 0.98% : 0.000001s : 7: predicate.reduce_eliminate 2.07% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.65% : 0.000001s : 6: predicate.remove_not_recompute_node 1.22% : 0.000002s : 13: predicate.replace_applicator 0.65% : 0.000001s : 6: predicate.replace_old_param 0.39% : 0.000001s : 3: predicate.reset_defer_inline 0.80% : 0.000001s : 7: predicate.reshape_eliminate 0.70% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 3: predicate.row_tensor_eliminate 0.94% : 0.000001s : 6: predicate.same_eliminate 0.52% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.96% : 0.000001s : 6: predicate.shard_identity_eliminate 0.87% : 0.000001s : 6: predicate.special_op_eliminate 0.91% : 0.000001s : 6: predicate.specialize_transform 1.05% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.88% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.08% : 0.000001s : 9: predicate.switch_defer_inline 1.74% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.24% : 0.000006s : 32: predicate.switch_simplify 0.80% : 0.000001s : 7: predicate.tile_eliminate 0.77% : 0.000001s : 7: predicate.transpose_eliminate 1.66% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.14% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.47% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.68% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.61% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.93% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 3.02% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.57% : 0.000001s : 3: predicate.value_based_eliminate 0.80% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.82% : 0.000001s : 6: predicate.virtual_output_eliminate 0.35% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000221 5 8.15% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.85% : 0.000203s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022111 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.41% : 0.003186s : 1: add_attr 14.37% : 0.003176s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000049s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.28% : 0.000062s : 1: auto_monad 0.09% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.10% : 0.000465s : 1: bootstrap 0.14% : 0.000031s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000015s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.08% : 0.000018s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000005s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.99% : 0.000439s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.29% : 0.000507s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000015s : 1: opt.transform.mutable_eliminate 3.44% : 0.000762s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.10% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.60% : 0.000132s : 28: opt.transform.opt_b 0.19% : 0.000043s : 2: opt.transform.opt_trans_graph 0.16% : 0.000035s : 4: opt.transform.symbol_engine_opt 9.46% : 0.002091s : 1: opt_a 0.47% : 0.000104s : 1: opt_after_cconv 2.17% : 0.000479s : 1: opt_after_jit_grad 1.11% : 0.000245s : 1: opt_b 18.42% : 0.004072s : 1: optimize 0.09% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000022s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000031s : 1: pre_auto_parallel 0.10% : 0.000023s : 1: py_interpret_to_execute 0.06% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000018s : 1: remove_dup_value 1.15% : 0.000254s : 1: renormalize.infer 1.08% : 0.000240s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000039s : 1: rewriter_after_opt_a 0.22% : 0.000050s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000077s : 1: symbol_engine_optimizer 0.33% : 0.000073s : 1: tuple_transform 22.50% : 0.004974s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:57.171.209 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:57.171.469 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0154858, [21] [bootstrap]: 0.00044752 [type_inference]: 0.00520217 [event_method]: 1.213e-05 [auto_monad]: 5.55e-05 [graph_reusing]: 4.99e-06 [inline]: 2.29001e-06 [add_attr]: 0.00314728, [1] [add_attr_with_inline]: 0.00313769, [1] [Cycle 1]: 7.3e-05, [2] [tag_attr]: 1.506e-05 [meta_addattr_fg_expand]: 3.88999e-06 [parallel-infer-symbol]: 2.94001e-06 [pre_auto_parallel]: 2.713e-05 [insert-virtual-dataset]: 2.43e-06 [parallel-infer-symbol-second]: 9.20001e-07 [dataset_repeat_opt]: 2.18998e-06 [pipeline_split]: 1.55001e-06 [optimize]: 0.00537144, [53] [py_interpret_to_execute]: 2.339e-05 [rewriter_before_opt_a]: 5.434e-05 [opt_a]: 0.00290898, [2] [Cycle 1]: 0.00194107, [45] [expand_dump_flag]: 2.81e-06 [switch_simplify]: 2.754e-05 [loop_unroll]: 1.572e-05 [a_1]: 0.00035118 [with_stream_mark]: 1.816e-05 [recompute_prepare]: 1.122e-05 [updatestate_depend_eliminate]: 4.72e-06 [updatestate_assign_eliminate]: 3.93999e-06 [updatestate_loads_eliminate]: 3.93999e-06 [parameter_eliminate]: 2.01e-06 [a_2]: 0.00012627 [accelerated_algorithm]: 8.2e-06 [shard]: 2.12999e-06 [meta_shard_fg_expand]: 2.02999e-06 [shard_inline]: 7.78001e-06 [merge_send_recv]: 9.67001e-06 [auto_parallel]: 7.58001e-06 [parallel]: 1.782e-05 [flash_sp]: 8.87e-06 [merge_comm]: 4.82e-06 [allreduce_fusion]: 4.22003e-06 [matmul_add_comm_reduction]: 1.121e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 9.58002e-06 [virtual_dataset]: 7.31999e-06 [get_grad_eliminate_]: 7.53e-06 [virtual_output]: 7.01001e-06 [merge_forward]: 4.63999e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 1.108e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.775e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.329e-05 [set_forward_comm_id_for_comm_node_pass]: 4.87e-06 [meta_fg_expand]: 3.21999e-06 [flash_sp_send_recv_attached]: 2.64001e-06 [receive_attached]: 2.16e-06 [after_resolve]: 1.115e-05 [a_after_grad]: 1.108e-05 [renormalize]: 0.0006385 [add_forward_monad_depend]: 5.98998e-06 [auto_monad_grad]: 2.32001e-06 [auto_monad_eliminator]: 1.769e-05 [cse]: 3.697e-05 [a_3]: 7.346e-05 [Cycle 2]: 0.00095388, [45] [expand_dump_flag]: 1.22e-06 [switch_simplify]: 8.82999e-06 [loop_unroll]: 7.06001e-06 [a_1]: 0.00015418 [with_stream_mark]: 1.314e-05 [recompute_prepare]: 8.12e-06 [updatestate_depend_eliminate]: 4.60001e-06 [updatestate_assign_eliminate]: 3.02002e-06 [updatestate_loads_eliminate]: 3.03e-06 [parameter_eliminate]: 1.64e-06 [a_2]: 0.00011778 [accelerated_algorithm]: 7.23e-06 [shard]: 1.66998e-06 [meta_shard_fg_expand]: 2.00002e-06 [shard_inline]: 8.03999e-06 [merge_send_recv]: 6.72002e-06 [auto_parallel]: 6.79999e-06 [parallel]: 6.43e-06 [flash_sp]: 3.51001e-06 [merge_comm]: 4.2e-06 [allreduce_fusion]: 4.43001e-06 [matmul_add_comm_reduction]: 8.1e-06 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 1.114e-05 [virtual_dataset]: 7.36999e-06 [get_grad_eliminate_]: 7.04001e-06 [virtual_output]: 7.1e-06 [merge_forward]: 3.66001e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 9.39998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.872e-05 [merge_recompute_call_nodes]: 1.17999e-06 [before_grad]: 1.231e-05 [set_forward_comm_id_for_comm_node_pass]: 6.19999e-06 [meta_fg_expand]: 2.99999e-06 [flash_sp_send_recv_attached]: 1.37999e-06 [receive_attached]: 1.22999e-06 [after_resolve]: 1.101e-05 [a_after_grad]: 1.095e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 2.74999e-06 [auto_monad_grad]: 1.12e-06 [auto_monad_eliminator]: 1.169e-05 [cse]: 2.441e-05 [a_3]: 5.923e-05 [py_interpret_to_execute_after_opt_a]: 1.595e-05 [slice_cell_reuse_recomputed_activation]: 5.34e-06 [rewriter_after_opt_a]: 4.994e-05 [convert_after_rewriter]: 1.022e-05 [order_py_execute_after_rewriter]: 8.84e-06 [mutable_eliminate]: 0.00056201 [opt_b]: 0.00035592, [1] [Cycle 1]: 0.00034625, [7] [b_1]: 0.00023491 [b_2]: 9.28002e-06 [updatestate_depend_eliminate]: 8.15999e-06 [updatestate_assign_eliminate]: 3.3e-06 [updatestate_loads_eliminate]: 2.99999e-06 [renormalize]: 6.19999e-07 [cse]: 2.902e-05 [optimize_parallel_all_gather_comm]: 2.173e-05 [overlap_param_gather]: 4.83001e-06 [cconv]: 2.956e-05 [loop_unroll]: 0.00044685 [opt_after_cconv]: 0.00013984, [1] [Cycle 1]: 0.00013128, [7] [c_1]: 3.523e-05 [parameter_eliminate]: 3.33998e-06 [updatestate_depend_eliminate]: 6.72002e-06 [updatestate_assign_eliminate]: 2.94999e-06 [updatestate_loads_eliminate]: 3.41001e-06 [cse]: 2.387e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 2.052e-05 [tuple_transform]: 9.816e-05, [1] [Cycle 1]: 9.117e-05, [4] [d_1]: 4.968e-05 [none_parameter_eliminate]: 1.55001e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 8.53001e-06 [partial_unused_args_eliminate]: 4.95999e-06 [add_recomputation]: 5.905e-05 [cse_after_recomputation]: 3.272e-05, [1] [Cycle 1]: 2.609e-05, [1] [cse]: 1.697e-05 [environ_conv]: 1.022e-05 [swap_dp_allreduce_reducescatter]: 8.35001e-06 [bias_add_comm_swap]: 4.82e-06 [label_micro_interleaved_index]: 6.96001e-06 [label_fine_grained_interleaved_index]: 5.11997e-06 [merge_cast_opt]: 3.88001e-06 [slice_recompute_activation]: 4.44998e-06 [micro_interleaved_order_control]: 5.09e-06 [assign_add_opt]: 3.6e-06 [ForceFp32Comm]: 3.09999e-06 [remove_cast_before_assign_add]: 3.33e-06 [full_micro_interleaved_order_control]: 4.63999e-06 [reorder_send_recv_between_fp_bp]: 4.97999e-06 [comm_op_add_attrs]: 3.66001e-06 [add_comm_op_reuse_tag]: 3.6e-06 [interleave_split_concat_branches]: 3.91999e-06 [interleave_parallel_branches]: 3.4e-06 [overlap_opt_shard_in_pipeline]: 3.63999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.63999e-06 [control_data_broadcast_order]: 1.846e-05 [grouped_pairwise_exchange_alltoall]: 4.23999e-06 [offloading_packed_experts]: 6.93e-06 [overlap_recompute_and_grad_model_parallel]: 7.26001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.78001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.58e-06 [overlap_recompute_comm]: 5.05999e-06 [overlap_grad_ring_attention]: 7.53999e-06 [overlap_grad_flash_sp]: 2.607e-05 [begin_end_overlap_inline]: 3.16999e-06 [split_matmul_comm_elemetwise]: 4.51002e-06 [split_layernorm_comm]: 4.07998e-06 [handle_group_info]: 3.61001e-06 [symbol_engine_optimizer]: 0.0001108, [1] [Cycle 1]: 0.00010265, [6] [build]: 3.44001e-06 [elim_shapecalc]: 1.296e-05 [elim_not_effective]: 1.564e-05 [opt_reshape]: 8.58001e-06 [fold_const_symbol]: 1.253e-05 [renormalize]: 2.89991e-07 [detach_backward]: 3.98001e-06 [pipeline_parallel_scheduler]: 2.11998e-06 [auto_monad_reorder]: 2.375e-05 [get_jit_bprop_graph]: 1.70001e-06 [rewriter_after_jit_bprop_graph]: 5.04998e-06 [opt_after_jit_grad]: 0.00052304 [validate]: 4.622e-05 Sums bootstrap : 0.000448s : 4.25% type_inference : 0.005202s : 49.42% event_method : 0.000012s : 0.12% auto_monad : 0.000055s : 0.53% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000027s : 0.26% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000023s : 0.22% optimize.rewriter_before_opt_a : 0.000054s : 0.52% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000036s : 0.35% optimize.opt_a.loop_unroll : 0.000023s : 0.22% optimize.opt_a.a_1 : 0.000505s : 4.80% optimize.opt_a.with_stream_mark : 0.000031s : 0.30% optimize.opt_a.recompute_prepare : 0.000019s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000244s : 2.32% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.15% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.15% optimize.opt_a.merge_send_recv : 0.000016s : 0.16% optimize.opt_a.auto_parallel : 0.000014s : 0.14% optimize.opt_a.parallel : 0.000024s : 0.23% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.20% optimize.opt_a.virtual_dataset : 0.000015s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.14% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.35% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.11% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000022s : 0.21% optimize.opt_a.a_after_grad : 0.000022s : 0.21% optimize.opt_a.renormalize : 0.000639s : 6.07% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.08% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.28% optimize.opt_a.cse : 0.000061s : 0.58% optimize.opt_a.a_3 : 0.000133s : 1.26% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000050s : 0.47% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000562s : 5.34% optimize.opt_b.b_1 : 0.000235s : 2.23% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000029s : 0.28% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000030s : 0.28% optimize.loop_unroll : 0.000447s : 4.24% optimize.opt_after_cconv.c_1 : 0.000035s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000021s : 0.19% optimize.tuple_transform.d_1 : 0.000050s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000059s : 0.56% optimize.cse_after_recomputation.cse : 0.000017s : 0.16% optimize.environ_conv : 0.000010s : 0.10% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.18% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000008s : 0.07% optimize.overlap_grad_flash_sp : 0.000026s : 0.25% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000024s : 0.23% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000523s : 4.97% validate : 0.000046s : 0.44% Time group info: ------[substitution.] 0.000180 29 11.91% : 0.000021s : 2: substitution.cast_eliminate 1.29% : 0.000002s : 3: substitution.elim_not_effective 0.96% : 0.000002s : 3: substitution.fold_const_symbol 3.69% : 0.000007s : 4: substitution.graph_param_transform 56.21% : 0.000101s : 2: substitution.inline 2.63% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.20% : 0.000006s : 6: substitution.remove_not_recompute_node 1.77% : 0.000003s : 2: substitution.replace_old_param 18.35% : 0.000033s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005152 2 91.42% : 0.004710s : 1: type_inference.infer 8.58% : 0.000442s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000100 2 100.00% : 0.000100s : 2: match.inline ------[predicate.] 0.000173 980 0.81% : 0.000001s : 9: predicate.accumulaten_eliminater 1.14% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.73% : 0.000001s : 8: predicate.addn_check_dump 0.80% : 0.000001s : 9: predicate.addn_zero_filter 0.65% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.27% : 0.000004s : 17: predicate.arithmetic_simplify 0.91% : 0.000002s : 9: predicate.cast_eliminate 0.88% : 0.000002s : 8: predicate.check_bprop_eliminate 0.71% : 0.000001s : 8: predicate.compare_switch_simplify 0.23% : 0.000000s : 4: predicate.const_output_eliminate 0.76% : 0.000001s : 8: predicate.depend_value_elim 0.81% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.99% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.76% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.13% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 4: predicate.elim_not_effective 0.54% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.07% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.02% : 0.000002s : 13: predicate.environ_get_depend_swap 1.82% : 0.000003s : 21: predicate.environ_get_eliminate 1.08% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.90% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.76% : 0.000003s : 11: predicate.float_depend_g_call 0.71% : 0.000001s : 8: predicate.float_environ_get_switch 1.03% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.24% : 0.000000s : 4: predicate.fold_const_symbol 0.86% : 0.000001s : 8: predicate.get_grad_eliminate 0.26% : 0.000000s : 4: predicate.graph_param_transform 0.80% : 0.000001s : 8: predicate.incorporate_call 0.64% : 0.000001s : 8: predicate.incorporate_call_switch 6.76% : 0.000012s : 44: predicate.inline 1.37% : 0.000002s : 8: predicate.inline_without_move 0.38% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.96% : 0.000002s : 8: predicate.less_batch_normalization 1.63% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.14% : 0.000004s : 26: predicate.load_eliminater 1.35% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.48% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.69% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.79% : 0.000001s : 8: predicate.merge_addn 0.69% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.72% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.68% : 0.000001s : 9: predicate.minmaximum_grad 1.64% : 0.000003s : 4: predicate.mutable_eliminate 0.44% : 0.000001s : 4: predicate.opt_reshape 0.46% : 0.000001s : 4: predicate.parallel_virtual_node 1.16% : 0.000002s : 11: predicate.partial_defer_inline 1.28% : 0.000002s : 13: predicate.partial_eliminate 0.75% : 0.000001s : 9: predicate.print_const_string_wrapper 0.85% : 0.000001s : 8: predicate.reduce_all_const_elim 0.91% : 0.000002s : 9: predicate.reduce_eliminate 2.02% : 0.000003s : 26: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 8: predicate.remove_not_recompute_node 1.24% : 0.000002s : 17: predicate.replace_applicator 0.58% : 0.000001s : 8: predicate.replace_old_param 0.34% : 0.000001s : 4: predicate.reset_defer_inline 0.88% : 0.000002s : 9: predicate.reshape_eliminate 0.79% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 4: predicate.row_tensor_eliminate 1.01% : 0.000002s : 8: predicate.same_eliminate 0.61% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.14% : 0.000002s : 8: predicate.shard_identity_eliminate 0.97% : 0.000002s : 8: predicate.special_op_eliminate 1.10% : 0.000002s : 8: predicate.specialize_transform 1.09% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.93% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.97% : 0.000002s : 11: predicate.switch_defer_inline 1.63% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.42% : 0.000008s : 39: predicate.switch_simplify 0.78% : 0.000001s : 9: predicate.tile_eliminate 0.77% : 0.000001s : 9: predicate.transpose_eliminate 1.61% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.13% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.43% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.59% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.46% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 1.99% : 0.000003s : 26: predicate.updatestate_pure_node_eliminater 2.97% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.69% : 0.000001s : 4: predicate.value_based_eliminate 0.82% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.76% : 0.000001s : 8: predicate.virtual_output_eliminate 0.34% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.70% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000223 5 8.00% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.00% : 0.000205s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025851 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.21% : 0.003157s : 1: add_attr 12.15% : 0.003142s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.24% : 0.000063s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000064s : 1: auto_monad 0.12% : 0.000032s : 1: auto_monad_reorder 0.03% : 0.000007s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.91% : 0.000494s : 1: bootstrap 0.13% : 0.000033s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000022s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.14% : 0.000036s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000021s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.08% : 0.000022s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000011s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.75% : 0.000453s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.20% : 0.000570s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000020s : 1: opt.transform.mutable_eliminate 3.79% : 0.000981s : 78: opt.transform.opt_a 0.13% : 0.000034s : 1: opt.transform.opt_after_cconv 0.11% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.64% : 0.000166s : 28: opt.transform.opt_b 0.22% : 0.000056s : 2: opt.transform.opt_trans_graph 0.18% : 0.000046s : 4: opt.transform.symbol_engine_opt 11.27% : 0.002912s : 1: opt_a 0.55% : 0.000143s : 1: opt_after_cconv 2.07% : 0.000535s : 1: opt_after_jit_grad 1.39% : 0.000360s : 1: opt_b 22.15% : 0.005727s : 1: optimize 0.10% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.12% : 0.000030s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000035s : 1: pre_auto_parallel 0.10% : 0.000027s : 1: py_interpret_to_execute 0.08% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000024s : 1: remove_dup_value 1.44% : 0.000373s : 1: renormalize.infer 0.99% : 0.000257s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.21% : 0.000054s : 1: rewriter_after_opt_a 0.22% : 0.000058s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.44% : 0.000114s : 1: symbol_engine_optimizer 0.39% : 0.000101s : 1: tuple_transform 20.24% : 0.005233s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:57.361.810 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0137416, [21] [bootstrap]: 0.00044076 [type_inference]: 0.00497496 [event_method]: 1.209e-05 [auto_monad]: 5.506e-05 [graph_reusing]: 5.04e-06 [inline]: 2.63003e-06 [add_attr]: 0.00309668, [1] [add_attr_with_inline]: 0.00308842, [1] [Cycle 1]: 4.951e-05, [2] [tag_attr]: 1.403e-05 [meta_addattr_fg_expand]: 3.87002e-06 [parallel-infer-symbol]: 3.38e-06 [pre_auto_parallel]: 2.439e-05 [insert-virtual-dataset]: 2.35002e-06 [parallel-infer-symbol-second]: 1.15999e-06 [dataset_repeat_opt]: 1.81e-06 [pipeline_split]: 1.54e-06 [optimize]: 0.00443595, [53] [py_interpret_to_execute]: 1.847e-05 [rewriter_before_opt_a]: 4.996e-05 [opt_a]: 0.00236071, [2] [Cycle 1]: 0.00163265, [45] [expand_dump_flag]: 3.08e-06 [switch_simplify]: 2.689e-05 [loop_unroll]: 1.486e-05 [a_1]: 0.00034639 [with_stream_mark]: 1.672e-05 [recompute_prepare]: 8.69e-06 [updatestate_depend_eliminate]: 4.50001e-06 [updatestate_assign_eliminate]: 3.66999e-06 [updatestate_loads_eliminate]: 4.18999e-06 [parameter_eliminate]: 2.09e-06 [a_2]: 9.924e-05 [accelerated_algorithm]: 8.07e-06 [shard]: 2.58998e-06 [meta_shard_fg_expand]: 1.94e-06 [shard_inline]: 7.33e-06 [merge_send_recv]: 8.89998e-06 [auto_parallel]: 6.81999e-06 [parallel]: 1.897e-05 [flash_sp]: 8.79e-06 [merge_comm]: 4.50999e-06 [allreduce_fusion]: 4.11001e-06 [matmul_add_comm_reduction]: 1.006e-05 [allreduce_slice_to_reducescatter]: 9.00007e-07 [virtual_shard_identity]: 8.97999e-06 [virtual_dataset]: 7.63999e-06 [get_grad_eliminate_]: 6.86001e-06 [virtual_output]: 6.98998e-06 [merge_forward]: 4.43999e-06 [cell_reuse_recompute_pass]: 1.52999e-06 [offload_activation]: 1.066e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.553e-05 [merge_recompute_call_nodes]: 1.41002e-06 [before_grad]: 1.254e-05 [set_forward_comm_id_for_comm_node_pass]: 4.85001e-06 [meta_fg_expand]: 3.74002e-06 [flash_sp_send_recv_attached]: 2.65002e-06 [receive_attached]: 2.32999e-06 [after_resolve]: 1.078e-05 [a_after_grad]: 1.112e-05 [renormalize]: 0.00053766 [add_forward_monad_depend]: 5.42999e-06 [auto_monad_grad]: 2.04999e-06 [auto_monad_eliminator]: 1.611e-05 [cse]: 4.571e-05 [a_3]: 5.682e-05 [Cycle 2]: 0.00071822, [45] [expand_dump_flag]: 1.18001e-06 [switch_simplify]: 8.54e-06 [loop_unroll]: 7.38999e-06 [a_1]: 0.00015318 [with_stream_mark]: 1.234e-05 [recompute_prepare]: 7.94997e-06 [updatestate_depend_eliminate]: 4.23001e-06 [updatestate_assign_eliminate]: 2.92002e-06 [updatestate_loads_eliminate]: 3.03998e-06 [parameter_eliminate]: 1.07998e-06 [a_2]: 8.775e-05 [accelerated_algorithm]: 7.08998e-06 [shard]: 1.11002e-06 [meta_shard_fg_expand]: 1.66e-06 [shard_inline]: 7.15998e-06 [merge_send_recv]: 6.11998e-06 [auto_parallel]: 6.28998e-06 [parallel]: 4.63001e-06 [flash_sp]: 3.14001e-06 [merge_comm]: 3.86001e-06 [allreduce_fusion]: 3.79002e-06 [matmul_add_comm_reduction]: 7.94002e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 7.93001e-06 [virtual_dataset]: 6.64999e-06 [get_grad_eliminate_]: 6.46e-06 [virtual_output]: 6.24999e-06 [merge_forward]: 3.77998e-06 [cell_reuse_recompute_pass]: 1.62001e-06 [offload_activation]: 7.55e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.358e-05 [merge_recompute_call_nodes]: 9.80013e-07 [before_grad]: 1.133e-05 [set_forward_comm_id_for_comm_node_pass]: 4.57e-06 [meta_fg_expand]: 2.76e-06 [flash_sp_send_recv_attached]: 8.29983e-07 [receive_attached]: 1.02998e-06 [after_resolve]: 9.37001e-06 [a_after_grad]: 1.069e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.25001e-06 [auto_monad_grad]: 1.26002e-06 [auto_monad_eliminator]: 8.60999e-06 [cse]: 1.882e-05 [a_3]: 4.313e-05 [py_interpret_to_execute_after_opt_a]: 1.02e-05 [slice_cell_reuse_recomputed_activation]: 1.95001e-06 [rewriter_after_opt_a]: 4.074e-05 [convert_after_rewriter]: 7.08e-06 [order_py_execute_after_rewriter]: 5.78997e-06 [mutable_eliminate]: 0.00048873 [opt_b]: 0.00027891, [1] [Cycle 1]: 0.00027202, [7] [b_1]: 0.00018495 [b_2]: 9.07001e-06 [updatestate_depend_eliminate]: 6.94999e-06 [updatestate_assign_eliminate]: 3.06001e-06 [updatestate_loads_eliminate]: 2.84999e-06 [renormalize]: 5.3001e-07 [cse]: 2.55e-05 [optimize_parallel_all_gather_comm]: 1.792e-05 [overlap_param_gather]: 2.19001e-06 [cconv]: 2.553e-05 [loop_unroll]: 0.00043108 [opt_after_cconv]: 0.00011228, [1] [Cycle 1]: 0.00010699, [7] [c_1]: 3.315e-05 [parameter_eliminate]: 3.26001e-06 [updatestate_depend_eliminate]: 6.43998e-06 [updatestate_assign_eliminate]: 2.99999e-06 [updatestate_loads_eliminate]: 3.04999e-06 [cse]: 2.395e-05 [renormalize]: 4.89992e-07 [remove_dup_value]: 1.634e-05 [tuple_transform]: 8.109e-05, [1] [Cycle 1]: 7.653e-05, [4] [d_1]: 4.786e-05 [none_parameter_eliminate]: 1.59e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 7.97e-06 [partial_unused_args_eliminate]: 2.06998e-06 [add_recomputation]: 5.534e-05 [cse_after_recomputation]: 2.552e-05, [1] [Cycle 1]: 2.108e-05, [1] [cse]: 1.554e-05 [environ_conv]: 6.64999e-06 [swap_dp_allreduce_reducescatter]: 6.01998e-06 [bias_add_comm_swap]: 2.54999e-06 [label_micro_interleaved_index]: 4.65999e-06 [label_fine_grained_interleaved_index]: 2.79001e-06 [merge_cast_opt]: 1.50999e-06 [slice_recompute_activation]: 1.95001e-06 [micro_interleaved_order_control]: 2.30002e-06 [assign_add_opt]: 1.34e-06 [ForceFp32Comm]: 8.09989e-07 [remove_cast_before_assign_add]: 9.89996e-07 [full_micro_interleaved_order_control]: 2.16e-06 [reorder_send_recv_between_fp_bp]: 2.59001e-06 [comm_op_add_attrs]: 9.80013e-07 [add_comm_op_reuse_tag]: 9.09989e-07 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.48002e-06 [overlap_opt_shard_grad_in_pipeline]: 1.86e-06 [control_data_broadcast_order]: 1.452e-05 [grouped_pairwise_exchange_alltoall]: 1.63002e-06 [offloading_packed_experts]: 4.23001e-06 [overlap_recompute_and_grad_model_parallel]: 5.12e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.48002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.19001e-06 [overlap_grad_ring_attention]: 4.57e-06 [overlap_grad_flash_sp]: 2.019e-05 [begin_end_overlap_inline]: 5.49975e-07 [split_matmul_comm_elemetwise]: 2.29001e-06 [split_layernorm_comm]: 1.67999e-06 [handle_group_info]: 1.23002e-06 [symbol_engine_optimizer]: 8.249e-05, [1] [Cycle 1]: 7.805e-05, [6] [build]: 2.88e-06 [elim_shapecalc]: 1.083e-05 [elim_not_effective]: 1.483e-05 [opt_reshape]: 8.34002e-06 [fold_const_symbol]: 1.234e-05 [renormalize]: 2.00002e-07 [detach_backward]: 1.89999e-06 [pipeline_parallel_scheduler]: 1.86e-06 [auto_monad_reorder]: 2.048e-05 [get_jit_bprop_graph]: 1.34998e-06 [rewriter_after_jit_bprop_graph]: 3.68999e-06 [opt_after_jit_grad]: 0.00046887 [validate]: 4.124e-05 Sums bootstrap : 0.000441s : 4.55% type_inference : 0.004975s : 51.34% event_method : 0.000012s : 0.12% auto_monad : 0.000055s : 0.57% graph_reusing : 0.000005s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000024s : 0.25% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000018s : 0.19% optimize.rewriter_before_opt_a : 0.000050s : 0.52% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000035s : 0.37% optimize.opt_a.loop_unroll : 0.000022s : 0.23% optimize.opt_a.a_1 : 0.000500s : 5.16% optimize.opt_a.with_stream_mark : 0.000029s : 0.30% optimize.opt_a.recompute_prepare : 0.000017s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000187s : 1.93% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.16% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000014s : 0.15% optimize.opt_a.merge_send_recv : 0.000015s : 0.16% optimize.opt_a.auto_parallel : 0.000013s : 0.14% optimize.opt_a.parallel : 0.000024s : 0.24% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.19% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.17% optimize.opt_a.virtual_dataset : 0.000014s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.14% optimize.opt_a.virtual_output : 0.000013s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.10% optimize.opt_a.meta_fg_expand : 0.000007s : 0.07% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000020s : 0.21% optimize.opt_a.a_after_grad : 0.000022s : 0.23% optimize.opt_a.renormalize : 0.000538s : 5.55% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.26% optimize.opt_a.cse : 0.000065s : 0.67% optimize.opt_a.a_3 : 0.000100s : 1.03% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000041s : 0.42% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000489s : 5.04% optimize.opt_b.b_1 : 0.000185s : 1.91% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000025s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000026s : 0.26% optimize.loop_unroll : 0.000431s : 4.45% optimize.opt_after_cconv.c_1 : 0.000033s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.25% optimize.opt_after_cconv.renormalize : 0.000000s : 0.01% optimize.remove_dup_value : 0.000016s : 0.17% optimize.tuple_transform.d_1 : 0.000048s : 0.49% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000055s : 0.57% optimize.cse_after_recomputation.cse : 0.000016s : 0.16% optimize.environ_conv : 0.000007s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.02% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000020s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.09% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.21% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000469s : 4.84% validate : 0.000041s : 0.43% Time group info: ------[substitution.] 0.000174 29 11.29% : 0.000020s : 2: substitution.cast_eliminate 1.34% : 0.000002s : 3: substitution.elim_not_effective 1.09% : 0.000002s : 3: substitution.fold_const_symbol 3.39% : 0.000006s : 4: substitution.graph_param_transform 56.48% : 0.000098s : 2: substitution.inline 2.72% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.47% : 0.000006s : 6: substitution.remove_not_recompute_node 1.78% : 0.000003s : 2: substitution.replace_old_param 18.44% : 0.000032s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004930 2 90.99% : 0.004485s : 1: type_inference.infer 9.01% : 0.000444s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000097 2 100.00% : 0.000097s : 2: match.inline ------[predicate.] 0.000168 980 0.78% : 0.000001s : 9: predicate.accumulaten_eliminater 1.19% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.69% : 0.000001s : 8: predicate.addn_check_dump 0.84% : 0.000001s : 9: predicate.addn_zero_filter 0.72% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.21% : 0.000004s : 17: predicate.arithmetic_simplify 0.96% : 0.000002s : 9: predicate.cast_eliminate 0.91% : 0.000002s : 8: predicate.check_bprop_eliminate 0.68% : 0.000001s : 8: predicate.compare_switch_simplify 0.24% : 0.000000s : 4: predicate.const_output_eliminate 0.81% : 0.000001s : 8: predicate.depend_value_elim 0.89% : 0.000002s : 9: predicate.dict_get_item_const_eliminator 0.95% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.83% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.17% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 4: predicate.elim_not_effective 0.55% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.07% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 13: predicate.environ_get_depend_swap 1.84% : 0.000003s : 21: predicate.environ_get_eliminate 1.03% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.89% : 0.000001s : 11: predicate.exchange_switch_depend_value 1.98% : 0.000003s : 11: predicate.float_depend_g_call 0.70% : 0.000001s : 8: predicate.float_environ_get_switch 1.08% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 4: predicate.fold_const_symbol 0.82% : 0.000001s : 8: predicate.get_grad_eliminate 0.24% : 0.000000s : 4: predicate.graph_param_transform 0.85% : 0.000001s : 8: predicate.incorporate_call 0.67% : 0.000001s : 8: predicate.incorporate_call_switch 6.08% : 0.000010s : 44: predicate.inline 1.07% : 0.000002s : 8: predicate.inline_without_move 0.39% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.01% : 0.000002s : 8: predicate.less_batch_normalization 1.62% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.19% : 0.000004s : 26: predicate.load_eliminater 1.14% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.45% : 0.000002s : 16: predicate.loop_unroll_before_grad 1.67% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.73% : 0.000001s : 8: predicate.merge_addn 0.90% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.71% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.78% : 0.000001s : 9: predicate.minmaximum_grad 1.11% : 0.000002s : 4: predicate.mutable_eliminate 0.40% : 0.000001s : 4: predicate.opt_reshape 0.48% : 0.000001s : 4: predicate.parallel_virtual_node 1.28% : 0.000002s : 11: predicate.partial_defer_inline 1.28% : 0.000002s : 13: predicate.partial_eliminate 0.87% : 0.000001s : 9: predicate.print_const_string_wrapper 0.66% : 0.000001s : 8: predicate.reduce_all_const_elim 1.06% : 0.000002s : 9: predicate.reduce_eliminate 2.10% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 8: predicate.remove_not_recompute_node 1.17% : 0.000002s : 17: predicate.replace_applicator 0.63% : 0.000001s : 8: predicate.replace_old_param 0.43% : 0.000001s : 4: predicate.reset_defer_inline 0.86% : 0.000001s : 9: predicate.reshape_eliminate 0.73% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 4: predicate.row_tensor_eliminate 1.02% : 0.000002s : 8: predicate.same_eliminate 0.51% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.99% : 0.000002s : 8: predicate.shard_identity_eliminate 0.95% : 0.000002s : 8: predicate.special_op_eliminate 1.02% : 0.000002s : 8: predicate.specialize_transform 1.16% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.96% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.47% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.98% : 0.000002s : 11: predicate.switch_defer_inline 1.65% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.15% : 0.000007s : 39: predicate.switch_simplify 0.77% : 0.000001s : 9: predicate.tile_eliminate 0.96% : 0.000002s : 9: predicate.transpose_eliminate 1.81% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.13% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.43% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.60% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.80% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.10% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 3.04% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.62% : 0.000001s : 4: predicate.value_based_eliminate 0.89% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.79% : 0.000001s : 8: predicate.virtual_output_eliminate 0.32% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.68% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000209 5 8.25% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.75% : 0.000192s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022987 192 0.02% : 0.000004s : 1: ForceFp32Comm 13.49% : 0.003101s : 1: add_attr 13.45% : 0.003092s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.26% : 0.000059s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000060s : 1: auto_monad 0.11% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.04% : 0.000469s : 1: bootstrap 0.13% : 0.000029s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000011s : 1: convert_after_rewriter 0.12% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.08% : 0.000018s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000008s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.91% : 0.000439s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.16% : 0.000497s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000017s : 1: opt.transform.mutable_eliminate 4.14% : 0.000952s : 78: opt.transform.opt_a 0.14% : 0.000032s : 1: opt.transform.opt_after_cconv 0.12% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.70% : 0.000161s : 28: opt.transform.opt_b 0.23% : 0.000054s : 2: opt.transform.opt_trans_graph 0.19% : 0.000043s : 4: opt.transform.symbol_engine_opt 10.28% : 0.002364s : 1: opt_a 0.51% : 0.000116s : 1: opt_after_cconv 2.08% : 0.000478s : 1: opt_after_jit_grad 1.23% : 0.000282s : 1: opt_b 19.32% : 0.004440s : 1: optimize 0.09% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000024s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.12% : 0.000029s : 1: pre_auto_parallel 0.10% : 0.000022s : 1: py_interpret_to_execute 0.06% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000020s : 1: remove_dup_value 1.30% : 0.000300s : 1: renormalize.infer 1.00% : 0.000231s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000045s : 1: rewriter_after_opt_a 0.23% : 0.000054s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.37% : 0.000085s : 1: symbol_engine_optimizer 0.37% : 0.000084s : 1: tuple_transform 21.71% : 0.004991s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:57.556.420 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:57.556.675 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0152515, [21] [bootstrap]: 0.00042936 [type_inference]: 0.00538397 [event_method]: 1.262e-05 [auto_monad]: 5.556e-05 [graph_reusing]: 5.35999e-06 [inline]: 2.13002e-06 [add_attr]: 0.00311229, [1] [add_attr_with_inline]: 0.00310471, [1] [Cycle 1]: 6.234e-05, [2] [tag_attr]: 1.421e-05 [meta_addattr_fg_expand]: 4.05e-06 [parallel-infer-symbol]: 2.93e-06 [pre_auto_parallel]: 2.513e-05 [insert-virtual-dataset]: 2.26003e-06 [parallel-infer-symbol-second]: 7.29982e-07 [dataset_repeat_opt]: 1.92001e-06 [pipeline_split]: 1.64998e-06 [optimize]: 0.0050654, [53] [py_interpret_to_execute]: 2.282e-05 [rewriter_before_opt_a]: 5.137e-05 [opt_a]: 0.00273848, [2] [Cycle 1]: 0.00180873, [45] [expand_dump_flag]: 3.33998e-06 [switch_simplify]: 2.557e-05 [loop_unroll]: 1.484e-05 [a_1]: 0.00033771 [with_stream_mark]: 1.55e-05 [recompute_prepare]: 9.29e-06 [updatestate_depend_eliminate]: 4.05e-06 [updatestate_assign_eliminate]: 4.03001e-06 [updatestate_loads_eliminate]: 3.67998e-06 [parameter_eliminate]: 1.99999e-06 [a_2]: 0.0001251 [accelerated_algorithm]: 8.33001e-06 [shard]: 2.38002e-06 [meta_shard_fg_expand]: 1.99999e-06 [shard_inline]: 7.23e-06 [merge_send_recv]: 9.76e-06 [auto_parallel]: 7.12002e-06 [parallel]: 1.773e-05 [flash_sp]: 8.41002e-06 [merge_comm]: 4.55001e-06 [allreduce_fusion]: 4.27998e-06 [matmul_add_comm_reduction]: 9.81e-06 [allreduce_slice_to_reducescatter]: 6.29982e-07 [virtual_shard_identity]: 9.30001e-06 [virtual_dataset]: 7.28999e-06 [get_grad_eliminate_]: 6.93998e-06 [virtual_output]: 6.91001e-06 [merge_forward]: 4.23001e-06 [cell_reuse_recompute_pass]: 1.21002e-06 [offload_activation]: 1.129e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.68e-05 [merge_recompute_call_nodes]: 1.49998e-06 [before_grad]: 1.264e-05 [set_forward_comm_id_for_comm_node_pass]: 4.39002e-06 [meta_fg_expand]: 3.2e-06 [flash_sp_send_recv_attached]: 2.39001e-06 [receive_attached]: 2.32001e-06 [after_resolve]: 1.078e-05 [a_after_grad]: 1.175e-05 [renormalize]: 0.00056023 [add_forward_monad_depend]: 5.25999e-06 [auto_monad_grad]: 2.23002e-06 [auto_monad_eliminator]: 1.653e-05 [cse]: 3.546e-05 [a_3]: 7.058e-05 [Cycle 2]: 0.00091722, [45] [expand_dump_flag]: 1.17e-06 [switch_simplify]: 8.67998e-06 [loop_unroll]: 7.35e-06 [a_1]: 0.00015092 [with_stream_mark]: 1.164e-05 [recompute_prepare]: 7.83999e-06 [updatestate_depend_eliminate]: 4.25e-06 [updatestate_assign_eliminate]: 2.94999e-06 [updatestate_loads_eliminate]: 2.84999e-06 [parameter_eliminate]: 1.00999e-06 [a_2]: 0.00011466 [accelerated_algorithm]: 7.26001e-06 [shard]: 1.07998e-06 [meta_shard_fg_expand]: 1.63002e-06 [shard_inline]: 7.5e-06 [merge_send_recv]: 6.23e-06 [auto_parallel]: 5.97999e-06 [parallel]: 5.12e-06 [flash_sp]: 3.56999e-06 [merge_comm]: 4.68001e-06 [allreduce_fusion]: 4.58001e-06 [matmul_add_comm_reduction]: 1.103e-05 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 8.72e-06 [virtual_dataset]: 7.38999e-06 [get_grad_eliminate_]: 6.64999e-06 [virtual_output]: 6.51e-06 [merge_forward]: 3.34001e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 7.83999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.637e-05 [merge_recompute_call_nodes]: 8.89995e-07 [before_grad]: 1.156e-05 [set_forward_comm_id_for_comm_node_pass]: 4.75001e-06 [meta_fg_expand]: 2.64999e-06 [flash_sp_send_recv_attached]: 8.79983e-07 [receive_attached]: 1.04e-06 [after_resolve]: 9.24e-06 [a_after_grad]: 1.026e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.27e-06 [auto_monad_grad]: 8.50006e-07 [auto_monad_eliminator]: 8.97e-06 [cse]: 1.965e-05 [a_3]: 5.685e-05 [py_interpret_to_execute_after_opt_a]: 1.321e-05 [slice_cell_reuse_recomputed_activation]: 4.49002e-06 [rewriter_after_opt_a]: 4.48e-05 [convert_after_rewriter]: 9.84001e-06 [order_py_execute_after_rewriter]: 9.20999e-06 [mutable_eliminate]: 0.0004961 [opt_b]: 0.00034108, [1] [Cycle 1]: 0.0003323, [7] [b_1]: 0.00022713 [b_2]: 8.87e-06 [updatestate_depend_eliminate]: 6.91001e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 2.88998e-06 [renormalize]: 5.3001e-07 [cse]: 2.34e-05 [optimize_parallel_all_gather_comm]: 2.075e-05 [overlap_param_gather]: 4.55001e-06 [cconv]: 2.898e-05 [loop_unroll]: 0.00044 [opt_after_cconv]: 0.00013817, [1] [Cycle 1]: 0.00012952, [7] [c_1]: 3.426e-05 [parameter_eliminate]: 3.01999e-06 [updatestate_depend_eliminate]: 6.19001e-06 [updatestate_assign_eliminate]: 2.94001e-06 [updatestate_loads_eliminate]: 3.20002e-06 [cse]: 2.389e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 2.023e-05 [tuple_transform]: 9.538e-05, [1] [Cycle 1]: 8.806e-05, [4] [d_1]: 4.78e-05 [none_parameter_eliminate]: 1.86998e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 7.99997e-06 [partial_unused_args_eliminate]: 4.12e-06 [add_recomputation]: 5.571e-05 [cse_after_recomputation]: 3.121e-05, [1] [Cycle 1]: 2.435e-05, [1] [cse]: 1.543e-05 [environ_conv]: 9.69e-06 [swap_dp_allreduce_reducescatter]: 8.75999e-06 [bias_add_comm_swap]: 5.46e-06 [label_micro_interleaved_index]: 6.83e-06 [label_fine_grained_interleaved_index]: 5.12e-06 [merge_cast_opt]: 4.02e-06 [slice_recompute_activation]: 4.92e-06 [micro_interleaved_order_control]: 5.42001e-06 [assign_add_opt]: 3.93001e-06 [ForceFp32Comm]: 3.40998e-06 [remove_cast_before_assign_add]: 3.68e-06 [full_micro_interleaved_order_control]: 4.60001e-06 [reorder_send_recv_between_fp_bp]: 4.97e-06 [comm_op_add_attrs]: 3.36001e-06 [add_comm_op_reuse_tag]: 3.24001e-06 [interleave_split_concat_branches]: 3.66999e-06 [interleave_parallel_branches]: 3.45e-06 [overlap_opt_shard_in_pipeline]: 3.83001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.31002e-06 [control_data_broadcast_order]: 1.822e-05 [grouped_pairwise_exchange_alltoall]: 3.97e-06 [offloading_packed_experts]: 7.10002e-06 [overlap_recompute_and_grad_model_parallel]: 7.63001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.88999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.73999e-06 [overlap_recompute_comm]: 4.82998e-06 [overlap_grad_ring_attention]: 6.77002e-06 [overlap_grad_flash_sp]: 2.348e-05 [begin_end_overlap_inline]: 2.82002e-06 [split_matmul_comm_elemetwise]: 4.65999e-06 [split_layernorm_comm]: 4.25999e-06 [handle_group_info]: 3.24001e-06 [symbol_engine_optimizer]: 0.00010286, [1] [Cycle 1]: 9.638e-05, [6] [build]: 2.84999e-06 [elim_shapecalc]: 1.088e-05 [elim_not_effective]: 1.493e-05 [opt_reshape]: 8.57e-06 [fold_const_symbol]: 1.212e-05 [renormalize]: 2.00002e-07 [detach_backward]: 3.01001e-06 [pipeline_parallel_scheduler]: 1.75001e-06 [auto_monad_reorder]: 2.261e-05 [get_jit_bprop_graph]: 1.40999e-06 [rewriter_after_jit_bprop_graph]: 4.07e-06 [opt_after_jit_grad]: 0.00049456 [validate]: 4.214e-05 Sums bootstrap : 0.000429s : 4.13% type_inference : 0.005384s : 51.85% event_method : 0.000013s : 0.12% auto_monad : 0.000056s : 0.54% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000025s : 0.24% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000023s : 0.22% optimize.rewriter_before_opt_a : 0.000051s : 0.49% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000034s : 0.33% optimize.opt_a.loop_unroll : 0.000022s : 0.21% optimize.opt_a.a_1 : 0.000489s : 4.71% optimize.opt_a.with_stream_mark : 0.000027s : 0.26% optimize.opt_a.recompute_prepare : 0.000017s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000240s : 2.31% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.14% optimize.opt_a.merge_send_recv : 0.000016s : 0.15% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.22% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.20% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.17% optimize.opt_a.virtual_dataset : 0.000015s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000013s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000020s : 0.19% optimize.opt_a.a_after_grad : 0.000022s : 0.21% optimize.opt_a.renormalize : 0.000560s : 5.40% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.25% optimize.opt_a.cse : 0.000055s : 0.53% optimize.opt_a.a_3 : 0.000127s : 1.23% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.04% optimize.rewriter_after_opt_a : 0.000045s : 0.43% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000496s : 4.78% optimize.opt_b.b_1 : 0.000227s : 2.19% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000029s : 0.28% optimize.loop_unroll : 0.000440s : 4.24% optimize.opt_after_cconv.c_1 : 0.000034s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.19% optimize.tuple_transform.d_1 : 0.000048s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000056s : 0.54% optimize.cse_after_recomputation.cse : 0.000015s : 0.15% optimize.environ_conv : 0.000010s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.18% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000023s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.22% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000495s : 4.76% validate : 0.000042s : 0.41% Time group info: ------[substitution.] 0.000165 29 11.15% : 0.000018s : 2: substitution.cast_eliminate 1.37% : 0.000002s : 3: substitution.elim_not_effective 1.04% : 0.000002s : 3: substitution.fold_const_symbol 4.01% : 0.000007s : 4: substitution.graph_param_transform 57.05% : 0.000094s : 2: substitution.inline 2.72% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.83% : 0.000006s : 6: substitution.remove_not_recompute_node 1.84% : 0.000003s : 2: substitution.replace_old_param 16.97% : 0.000028s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005338 2 91.25% : 0.004871s : 1: type_inference.infer 8.75% : 0.000467s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000093 2 100.00% : 0.000093s : 2: match.inline ------[predicate.] 0.000168 980 0.84% : 0.000001s : 9: predicate.accumulaten_eliminater 1.15% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.68% : 0.000001s : 8: predicate.addn_check_dump 0.83% : 0.000001s : 9: predicate.addn_zero_filter 0.73% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.28% : 0.000004s : 17: predicate.arithmetic_simplify 0.93% : 0.000002s : 9: predicate.cast_eliminate 0.82% : 0.000001s : 8: predicate.check_bprop_eliminate 0.71% : 0.000001s : 8: predicate.compare_switch_simplify 0.25% : 0.000000s : 4: predicate.const_output_eliminate 0.80% : 0.000001s : 8: predicate.depend_value_elim 0.80% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.90% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.75% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.17% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 4: predicate.elim_not_effective 0.49% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.01% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 13: predicate.environ_get_depend_swap 1.89% : 0.000003s : 21: predicate.environ_get_eliminate 1.05% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.95% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.89% : 0.000003s : 11: predicate.float_depend_g_call 0.68% : 0.000001s : 8: predicate.float_environ_get_switch 1.02% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.78% : 0.000001s : 8: predicate.get_grad_eliminate 0.26% : 0.000000s : 4: predicate.graph_param_transform 0.79% : 0.000001s : 8: predicate.incorporate_call 0.68% : 0.000001s : 8: predicate.incorporate_call_switch 6.34% : 0.000011s : 44: predicate.inline 1.05% : 0.000002s : 8: predicate.inline_without_move 0.37% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.12% : 0.000002s : 8: predicate.less_batch_normalization 1.65% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.14% : 0.000004s : 26: predicate.load_eliminater 1.16% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.53% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.77% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.73% : 0.000001s : 8: predicate.merge_addn 0.74% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.76% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.73% : 0.000001s : 9: predicate.minmaximum_grad 1.22% : 0.000002s : 4: predicate.mutable_eliminate 0.47% : 0.000001s : 4: predicate.opt_reshape 0.49% : 0.000001s : 4: predicate.parallel_virtual_node 1.30% : 0.000002s : 11: predicate.partial_defer_inline 1.30% : 0.000002s : 13: predicate.partial_eliminate 0.81% : 0.000001s : 9: predicate.print_const_string_wrapper 0.77% : 0.000001s : 8: predicate.reduce_all_const_elim 1.05% : 0.000002s : 9: predicate.reduce_eliminate 2.09% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 8: predicate.remove_not_recompute_node 1.18% : 0.000002s : 17: predicate.replace_applicator 0.62% : 0.000001s : 8: predicate.replace_old_param 0.35% : 0.000001s : 4: predicate.reset_defer_inline 0.90% : 0.000002s : 9: predicate.reshape_eliminate 0.78% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 4: predicate.row_tensor_eliminate 0.95% : 0.000002s : 8: predicate.same_eliminate 0.54% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.98% : 0.000002s : 8: predicate.shard_identity_eliminate 1.15% : 0.000002s : 8: predicate.special_op_eliminate 0.99% : 0.000002s : 8: predicate.specialize_transform 1.15% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 1.09% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.01% : 0.000002s : 11: predicate.switch_defer_inline 1.65% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.08% : 0.000007s : 39: predicate.switch_simplify 0.79% : 0.000001s : 9: predicate.tile_eliminate 0.96% : 0.000002s : 9: predicate.transpose_eliminate 1.69% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.25% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.49% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.37% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.63% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.13% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 3.09% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.60% : 0.000001s : 4: predicate.value_based_eliminate 0.87% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.93% : 0.000002s : 8: predicate.virtual_output_eliminate 0.37% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.62% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000220 5 7.96% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.04% : 0.000202s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025153 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.41% : 0.003121s : 1: add_attr 12.36% : 0.003108s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.24% : 0.000059s : 1: add_recomputation 0.03% : 0.000007s : 1: assign_add_opt 0.25% : 0.000064s : 1: auto_monad 0.12% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.89% : 0.000476s : 1: bootstrap 0.13% : 0.000032s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.14% : 0.000034s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000019s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.09% : 0.000022s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.77% : 0.000446s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.99% : 0.000502s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 3.75% : 0.000944s : 78: opt.transform.opt_a 0.13% : 0.000033s : 1: opt.transform.opt_after_cconv 0.12% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.64% : 0.000160s : 28: opt.transform.opt_b 0.21% : 0.000054s : 2: opt.transform.opt_trans_graph 0.17% : 0.000043s : 4: opt.transform.symbol_engine_opt 10.90% : 0.002742s : 1: opt_a 0.56% : 0.000142s : 1: opt_after_cconv 2.01% : 0.000505s : 1: opt_after_jit_grad 1.37% : 0.000345s : 1: opt_b 21.52% : 0.005413s : 1: optimize 0.10% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000027s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000007s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.03% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000033s : 1: pre_auto_parallel 0.11% : 0.000026s : 1: py_interpret_to_execute 0.07% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000024s : 1: remove_dup_value 1.27% : 0.000320s : 1: renormalize.infer 0.92% : 0.000233s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000048s : 1: rewriter_after_opt_a 0.22% : 0.000055s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000106s : 1: symbol_engine_optimizer 0.39% : 0.000098s : 1: tuple_transform 21.51% : 0.005411s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:57.746.222 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0141197, [21] [bootstrap]: 0.00042814 [type_inference]: 0.00542738 [event_method]: 1.255e-05 [auto_monad]: 5.529e-05 [graph_reusing]: 5.40001e-06 [inline]: 2.45002e-06 [add_attr]: 0.0031014, [1] [add_attr_with_inline]: 0.00309345, [1] [Cycle 1]: 4.702e-05, [2] [tag_attr]: 1.401e-05 [meta_addattr_fg_expand]: 3.71999e-06 [parallel-infer-symbol]: 3.03e-06 [pre_auto_parallel]: 2.5e-05 [insert-virtual-dataset]: 2.41e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 2.06e-06 [pipeline_split]: 1.49998e-06 [optimize]: 0.00438326, [53] [py_interpret_to_execute]: 1.63e-05 [rewriter_before_opt_a]: 4.801e-05 [opt_a]: 0.00235629, [2] [Cycle 1]: 0.00164225, [45] [expand_dump_flag]: 2.89001e-06 [switch_simplify]: 2.692e-05 [loop_unroll]: 1.514e-05 [a_1]: 0.00033614 [with_stream_mark]: 1.504e-05 [recompute_prepare]: 8.83001e-06 [updatestate_depend_eliminate]: 4.16001e-06 [updatestate_assign_eliminate]: 4.25e-06 [updatestate_loads_eliminate]: 3.52002e-06 [parameter_eliminate]: 1.77001e-06 [a_2]: 9.604e-05 [accelerated_algorithm]: 7.85e-06 [shard]: 2.27999e-06 [meta_shard_fg_expand]: 2.31e-06 [shard_inline]: 7.18e-06 [merge_send_recv]: 9.10999e-06 [auto_parallel]: 6.98998e-06 [parallel]: 1.828e-05 [flash_sp]: 8.25999e-06 [merge_comm]: 4.43001e-06 [allreduce_fusion]: 4.57e-06 [matmul_add_comm_reduction]: 1.06e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 8.74e-06 [virtual_dataset]: 7.43e-06 [get_grad_eliminate_]: 6.73e-06 [virtual_output]: 7.73999e-06 [merge_forward]: 4.1e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 1.124e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.458e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.276e-05 [set_forward_comm_id_for_comm_node_pass]: 4.70999e-06 [meta_fg_expand]: 3.45998e-06 [flash_sp_send_recv_attached]: 2.85002e-06 [receive_attached]: 2.22001e-06 [after_resolve]: 1.06e-05 [a_after_grad]: 1.109e-05 [renormalize]: 0.00058579 [add_forward_monad_depend]: 5.36002e-06 [auto_monad_grad]: 2.19999e-06 [auto_monad_eliminator]: 1.652e-05 [cse]: 3.528e-05 [a_3]: 5.402e-05 [Cycle 2]: 0.00070418, [45] [expand_dump_flag]: 1.10999e-06 [switch_simplify]: 8.83001e-06 [loop_unroll]: 7.13e-06 [a_1]: 0.00014979 [with_stream_mark]: 1.176e-05 [recompute_prepare]: 7.35998e-06 [updatestate_depend_eliminate]: 4.52e-06 [updatestate_assign_eliminate]: 2.96001e-06 [updatestate_loads_eliminate]: 2.68998e-06 [parameter_eliminate]: 9.00007e-07 [a_2]: 8.611e-05 [accelerated_algorithm]: 7.05998e-06 [shard]: 1.00001e-06 [meta_shard_fg_expand]: 1.70001e-06 [shard_inline]: 7.05e-06 [merge_send_recv]: 5.35001e-06 [auto_parallel]: 6.05002e-06 [parallel]: 4.35999e-06 [flash_sp]: 3.26001e-06 [merge_comm]: 3.93999e-06 [allreduce_fusion]: 3.79002e-06 [matmul_add_comm_reduction]: 9.54e-06 [allreduce_slice_to_reducescatter]: 4.10015e-07 [virtual_shard_identity]: 7.82e-06 [virtual_dataset]: 6.71e-06 [get_grad_eliminate_]: 6.66e-06 [virtual_output]: 6.21e-06 [merge_forward]: 3.23e-06 [cell_reuse_recompute_pass]: 1.50999e-06 [offload_activation]: 7.31999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.385e-05 [merge_recompute_call_nodes]: 7.39994e-07 [before_grad]: 1.143e-05 [set_forward_comm_id_for_comm_node_pass]: 4.40999e-06 [meta_fg_expand]: 2.51998e-06 [flash_sp_send_recv_attached]: 7.7e-07 [receive_attached]: 1.02e-06 [after_resolve]: 9.17999e-06 [a_after_grad]: 9.76e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.35999e-06 [auto_monad_grad]: 8.50006e-07 [auto_monad_eliminator]: 8.15999e-06 [cse]: 1.776e-05 [a_3]: 4.366e-05 [py_interpret_to_execute_after_opt_a]: 9.17999e-06 [slice_cell_reuse_recomputed_activation]: 1.89e-06 [rewriter_after_opt_a]: 4.046e-05 [convert_after_rewriter]: 6.83003e-06 [order_py_execute_after_rewriter]: 5.89e-06 [mutable_eliminate]: 0.00047344 [opt_b]: 0.0002679, [1] [Cycle 1]: 0.00026155, [7] [b_1]: 0.00018031 [b_2]: 9.04e-06 [updatestate_depend_eliminate]: 6.12001e-06 [updatestate_assign_eliminate]: 3.03e-06 [updatestate_loads_eliminate]: 2.74999e-06 [renormalize]: 4.69998e-07 [cse]: 2.277e-05 [optimize_parallel_all_gather_comm]: 1.669e-05 [overlap_param_gather]: 1.96998e-06 [cconv]: 2.433e-05 [loop_unroll]: 0.00043065 [opt_after_cconv]: 0.00011211, [1] [Cycle 1]: 0.00010695, [7] [c_1]: 3.397e-05 [parameter_eliminate]: 2.79999e-06 [updatestate_depend_eliminate]: 5.64e-06 [updatestate_assign_eliminate]: 3.04001e-06 [updatestate_loads_eliminate]: 3.53e-06 [cse]: 2.32e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.634e-05 [tuple_transform]: 7.841e-05, [1] [Cycle 1]: 7.382e-05, [4] [d_1]: 4.564e-05 [none_parameter_eliminate]: 1.71e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 7.93999e-06 [partial_unused_args_eliminate]: 2.27999e-06 [add_recomputation]: 5.201e-05 [cse_after_recomputation]: 2.544e-05, [1] [Cycle 1]: 2.106e-05, [1] [cse]: 1.522e-05 [environ_conv]: 6.34001e-06 [swap_dp_allreduce_reducescatter]: 5.99999e-06 [bias_add_comm_swap]: 2.96001e-06 [label_micro_interleaved_index]: 4.20999e-06 [label_fine_grained_interleaved_index]: 2.69999e-06 [merge_cast_opt]: 1.27999e-06 [slice_recompute_activation]: 2.06e-06 [micro_interleaved_order_control]: 2.49999e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 7.49977e-07 [remove_cast_before_assign_add]: 9.80013e-07 [full_micro_interleaved_order_control]: 1.99999e-06 [reorder_send_recv_between_fp_bp]: 2.88998e-06 [comm_op_add_attrs]: 9.50007e-07 [add_comm_op_reuse_tag]: 9.19972e-07 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.22e-06 [overlap_opt_shard_grad_in_pipeline]: 1.99e-06 [control_data_broadcast_order]: 1.472e-05 [grouped_pairwise_exchange_alltoall]: 1.61998e-06 [offloading_packed_experts]: 4.25e-06 [overlap_recompute_and_grad_model_parallel]: 5.09998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32999e-06 [overlap_recompute_comm]: 2.31e-06 [overlap_grad_ring_attention]: 4.62e-06 [overlap_grad_flash_sp]: 2.093e-05 [begin_end_overlap_inline]: 6.80011e-07 [split_matmul_comm_elemetwise]: 1.99e-06 [split_layernorm_comm]: 1.49998e-06 [handle_group_info]: 1.02998e-06 [symbol_engine_optimizer]: 8.087e-05, [1] [Cycle 1]: 7.658e-05, [6] [build]: 2.91e-06 [elim_shapecalc]: 1.099e-05 [elim_not_effective]: 1.462e-05 [opt_reshape]: 8.05e-06 [fold_const_symbol]: 1.165e-05 [renormalize]: 2.50002e-07 [detach_backward]: 1.88002e-06 [pipeline_parallel_scheduler]: 1.87999e-06 [auto_monad_reorder]: 1.999e-05 [get_jit_bprop_graph]: 1.20999e-06 [rewriter_after_jit_bprop_graph]: 3.83999e-06 [opt_after_jit_grad]: 0.00045846 [validate]: 3.888e-05 Sums bootstrap : 0.000428s : 4.24% type_inference : 0.005427s : 53.81% event_method : 0.000013s : 0.12% auto_monad : 0.000055s : 0.55% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000025s : 0.25% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000001s : 0.01% optimize.py_interpret_to_execute : 0.000016s : 0.16% optimize.rewriter_before_opt_a : 0.000048s : 0.48% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000036s : 0.35% optimize.opt_a.loop_unroll : 0.000022s : 0.22% optimize.opt_a.a_1 : 0.000486s : 4.82% optimize.opt_a.with_stream_mark : 0.000027s : 0.27% optimize.opt_a.recompute_prepare : 0.000016s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000182s : 1.81% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000014s : 0.14% optimize.opt_a.merge_send_recv : 0.000014s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.22% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.20% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.16% optimize.opt_a.virtual_dataset : 0.000014s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.13% optimize.opt_a.virtual_output : 0.000014s : 0.14% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000020s : 0.20% optimize.opt_a.a_after_grad : 0.000021s : 0.21% optimize.opt_a.renormalize : 0.000586s : 5.81% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.24% optimize.opt_a.cse : 0.000053s : 0.53% optimize.opt_a.a_3 : 0.000098s : 0.97% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.40% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000473s : 4.69% optimize.opt_b.b_1 : 0.000180s : 1.79% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.24% optimize.loop_unroll : 0.000431s : 4.27% optimize.opt_after_cconv.c_1 : 0.000034s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.16% optimize.tuple_transform.d_1 : 0.000046s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000052s : 0.52% optimize.cse_after_recomputation.cse : 0.000015s : 0.15% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000021s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000001s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.20% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000458s : 4.55% validate : 0.000039s : 0.39% Time group info: ------[substitution.] 0.000161 29 10.83% : 0.000017s : 2: substitution.cast_eliminate 1.25% : 0.000002s : 3: substitution.elim_not_effective 1.03% : 0.000002s : 3: substitution.fold_const_symbol 3.76% : 0.000006s : 4: substitution.graph_param_transform 58.34% : 0.000094s : 2: substitution.inline 2.77% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.54% : 0.000006s : 6: substitution.remove_not_recompute_node 1.74% : 0.000003s : 2: substitution.replace_old_param 16.74% : 0.000027s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005383 2 91.40% : 0.004920s : 1: type_inference.infer 8.60% : 0.000463s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000092 2 100.00% : 0.000092s : 2: match.inline ------[predicate.] 0.000165 980 0.81% : 0.000001s : 9: predicate.accumulaten_eliminater 0.98% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.70% : 0.000001s : 8: predicate.addn_check_dump 0.91% : 0.000001s : 9: predicate.addn_zero_filter 0.72% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.19% : 0.000004s : 17: predicate.arithmetic_simplify 0.98% : 0.000002s : 9: predicate.cast_eliminate 0.74% : 0.000001s : 8: predicate.check_bprop_eliminate 0.66% : 0.000001s : 8: predicate.compare_switch_simplify 0.23% : 0.000000s : 4: predicate.const_output_eliminate 0.98% : 0.000002s : 8: predicate.depend_value_elim 0.81% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 1.01% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.75% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.10% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.32% : 0.000001s : 4: predicate.elim_not_effective 0.49% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.06% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.04% : 0.000002s : 13: predicate.environ_get_depend_swap 1.91% : 0.000003s : 21: predicate.environ_get_eliminate 1.14% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.95% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.74% : 0.000003s : 11: predicate.float_depend_g_call 0.72% : 0.000001s : 8: predicate.float_environ_get_switch 1.02% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.24% : 0.000000s : 4: predicate.fold_const_symbol 0.85% : 0.000001s : 8: predicate.get_grad_eliminate 0.26% : 0.000000s : 4: predicate.graph_param_transform 0.88% : 0.000001s : 8: predicate.incorporate_call 0.69% : 0.000001s : 8: predicate.incorporate_call_switch 6.35% : 0.000010s : 44: predicate.inline 1.05% : 0.000002s : 8: predicate.inline_without_move 0.39% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.19% : 0.000002s : 8: predicate.less_batch_normalization 1.65% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.13% : 0.000004s : 26: predicate.load_eliminater 1.04% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.49% : 0.000002s : 16: predicate.loop_unroll_before_grad 2.02% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.73% : 0.000001s : 8: predicate.merge_addn 0.71% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.70% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.74% : 0.000001s : 9: predicate.minmaximum_grad 1.06% : 0.000002s : 4: predicate.mutable_eliminate 0.46% : 0.000001s : 4: predicate.opt_reshape 0.52% : 0.000001s : 4: predicate.parallel_virtual_node 1.26% : 0.000002s : 11: predicate.partial_defer_inline 1.29% : 0.000002s : 13: predicate.partial_eliminate 0.86% : 0.000001s : 9: predicate.print_const_string_wrapper 0.71% : 0.000001s : 8: predicate.reduce_all_const_elim 0.98% : 0.000002s : 9: predicate.reduce_eliminate 2.35% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 8: predicate.remove_not_recompute_node 1.17% : 0.000002s : 17: predicate.replace_applicator 0.64% : 0.000001s : 8: predicate.replace_old_param 0.31% : 0.000001s : 4: predicate.reset_defer_inline 0.87% : 0.000001s : 9: predicate.reshape_eliminate 0.83% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 4: predicate.row_tensor_eliminate 0.95% : 0.000002s : 8: predicate.same_eliminate 0.52% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.92% : 0.000002s : 8: predicate.shard_identity_eliminate 0.87% : 0.000001s : 8: predicate.special_op_eliminate 0.96% : 0.000002s : 8: predicate.specialize_transform 1.22% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.95% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.46% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.05% : 0.000002s : 11: predicate.switch_defer_inline 1.69% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.29% : 0.000007s : 39: predicate.switch_simplify 0.77% : 0.000001s : 9: predicate.tile_eliminate 0.83% : 0.000001s : 9: predicate.transpose_eliminate 1.77% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.75% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 2.94% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.60% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.56% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.54% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.09% : 0.000003s : 26: predicate.updatestate_pure_node_eliminater 3.07% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.64% : 0.000001s : 4: predicate.value_based_eliminate 0.84% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.84% : 0.000001s : 8: predicate.virtual_output_eliminate 0.36% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000230 5 7.20% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.80% : 0.000213s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023338 192 0.01% : 0.000003s : 1: ForceFp32Comm 13.31% : 0.003106s : 1: add_attr 13.27% : 0.003097s : 1: add_attr_with_inline 0.01% : 0.000003s : 1: add_comm_op_reuse_tag 0.24% : 0.000056s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000060s : 1: auto_monad 0.10% : 0.000024s : 1: auto_monad_reorder 0.02% : 0.000004s : 1: begin_end_overlap_inline 0.03% : 0.000006s : 1: bias_add_comm_swap 1.96% : 0.000458s : 1: bootstrap 0.12% : 0.000028s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.12% : 0.000028s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.08% : 0.000018s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.88% : 0.000438s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.06% : 0.000482s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000015s : 1: opt.transform.mutable_eliminate 3.98% : 0.000930s : 78: opt.transform.opt_a 0.14% : 0.000033s : 1: opt.transform.opt_after_cconv 0.11% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.67% : 0.000157s : 28: opt.transform.opt_b 0.22% : 0.000051s : 2: opt.transform.opt_trans_graph 0.18% : 0.000041s : 4: opt.transform.symbol_engine_opt 10.11% : 0.002359s : 1: opt_a 0.50% : 0.000116s : 1: opt_after_cconv 2.00% : 0.000467s : 1: opt_after_jit_grad 1.16% : 0.000271s : 1: opt_b 18.80% : 0.004387s : 1: optimize 0.09% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000024s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.12% : 0.000029s : 1: pre_auto_parallel 0.09% : 0.000020s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000020s : 1: remove_dup_value 1.43% : 0.000333s : 1: renormalize.infer 1.05% : 0.000246s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000044s : 1: rewriter_after_opt_a 0.22% : 0.000052s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000084s : 1: symbol_engine_optimizer 0.35% : 0.000081s : 1: tuple_transform 23.32% : 0.005443s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:57.936.045 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:57.936.315 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0153892, [21] [bootstrap]: 0.00042868 [type_inference]: 0.00556868 [event_method]: 1.202e-05 [auto_monad]: 5.413e-05 [graph_reusing]: 5.40001e-06 [inline]: 2.17999e-06 [add_attr]: 0.00328758, [1] [add_attr_with_inline]: 0.0032789, [1] [Cycle 1]: 7.402e-05, [2] [tag_attr]: 1.498e-05 [meta_addattr_fg_expand]: 3.78999e-06 [parallel-infer-symbol]: 3.6e-06 [pre_auto_parallel]: 2.722e-05 [insert-virtual-dataset]: 2.14e-06 [parallel-infer-symbol-second]: 7.49977e-07 [dataset_repeat_opt]: 2.17999e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.00482788, [53] [py_interpret_to_execute]: 2.197e-05 [rewriter_before_opt_a]: 4.931e-05 [opt_a]: 0.00253887, [2] [Cycle 1]: 0.00174, [45] [expand_dump_flag]: 2.80002e-06 [switch_simplify]: 2.56e-05 [loop_unroll]: 1.393e-05 [a_1]: 0.00028919 [with_stream_mark]: 1.766e-05 [recompute_prepare]: 8e-06 [updatestate_depend_eliminate]: 3.76001e-06 [updatestate_assign_eliminate]: 3.28998e-06 [updatestate_loads_eliminate]: 3.55e-06 [parameter_eliminate]: 1.89999e-06 [a_2]: 0.00010713 [accelerated_algorithm]: 6.59001e-06 [shard]: 2.21e-06 [meta_shard_fg_expand]: 1.92001e-06 [shard_inline]: 5.99e-06 [merge_send_recv]: 8.12e-06 [auto_parallel]: 7.01999e-06 [parallel]: 1.786e-05 [flash_sp]: 8.03999e-06 [merge_comm]: 4.43999e-06 [allreduce_fusion]: 3.29001e-06 [matmul_add_comm_reduction]: 9.62001e-06 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 7.56999e-06 [virtual_dataset]: 6.92002e-06 [get_grad_eliminate_]: 6.53e-06 [virtual_output]: 5.97999e-06 [merge_forward]: 3.76001e-06 [cell_reuse_recompute_pass]: 1.33002e-06 [offload_activation]: 1.077e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.557e-05 [merge_recompute_call_nodes]: 1.44998e-06 [before_grad]: 1.079e-05 [set_forward_comm_id_for_comm_node_pass]: 3.86999e-06 [meta_fg_expand]: 2.68998e-06 [flash_sp_send_recv_attached]: 3.20998e-06 [receive_attached]: 1.96998e-06 [after_resolve]: 1.042e-05 [a_after_grad]: 9.99999e-06 [renormalize]: 0.00058749 [add_forward_monad_depend]: 5.31002e-06 [auto_monad_grad]: 2.44001e-06 [auto_monad_eliminator]: 1.503e-05 [cse]: 3.056e-05 [a_3]: 6.054e-05 [Cycle 2]: 0.00078526, [45] [expand_dump_flag]: 1.10001e-06 [switch_simplify]: 7.65998e-06 [loop_unroll]: 6.13002e-06 [a_1]: 0.00010374 [with_stream_mark]: 1.229e-05 [recompute_prepare]: 6.07001e-06 [updatestate_depend_eliminate]: 3.25e-06 [updatestate_assign_eliminate]: 2.78e-06 [updatestate_loads_eliminate]: 2.99999e-06 [parameter_eliminate]: 1.33002e-06 [a_2]: 9.632e-05 [accelerated_algorithm]: 5.97999e-06 [shard]: 1.24e-06 [meta_shard_fg_expand]: 1.33002e-06 [shard_inline]: 6.15002e-06 [merge_send_recv]: 5.42999e-06 [auto_parallel]: 5.67999e-06 [parallel]: 5.33002e-06 [flash_sp]: 3.51999e-06 [merge_comm]: 3.23998e-06 [allreduce_fusion]: 2.82002e-06 [matmul_add_comm_reduction]: 6.16998e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 6.32001e-06 [virtual_dataset]: 5.48002e-06 [get_grad_eliminate_]: 5.39e-06 [virtual_output]: 5.69e-06 [merge_forward]: 2.50002e-06 [cell_reuse_recompute_pass]: 1.60001e-06 [offload_activation]: 6.63e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.46e-05 [merge_recompute_call_nodes]: 9.30013e-07 [before_grad]: 9.99001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.56999e-06 [meta_fg_expand]: 2.13002e-06 [flash_sp_send_recv_attached]: 1.02e-06 [receive_attached]: 1.24e-06 [after_resolve]: 9.41998e-06 [a_after_grad]: 8.88002e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.42e-06 [auto_monad_grad]: 7.60017e-07 [auto_monad_eliminator]: 7.31001e-06 [cse]: 1.551e-05 [a_3]: 4.825e-05 [py_interpret_to_execute_after_opt_a]: 1.285e-05 [slice_cell_reuse_recomputed_activation]: 4.89e-06 [rewriter_after_opt_a]: 4.098e-05 [convert_after_rewriter]: 9.23002e-06 [order_py_execute_after_rewriter]: 8.26002e-06 [mutable_eliminate]: 0.00052821 [opt_b]: 0.00031274, [1] [Cycle 1]: 0.00030318, [7] [b_1]: 0.00020187 [b_2]: 8.05999e-06 [updatestate_depend_eliminate]: 7.56999e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 2.27001e-06 [renormalize]: 4.69998e-07 [cse]: 2.084e-05 [optimize_parallel_all_gather_comm]: 2.042e-05 [overlap_param_gather]: 4.37e-06 [cconv]: 3.077e-05 [loop_unroll]: 0.00044883 [opt_after_cconv]: 0.00012776, [1] [Cycle 1]: 0.00011849, [7] [c_1]: 2.727e-05 [parameter_eliminate]: 3.26999e-06 [updatestate_depend_eliminate]: 6.36998e-06 [updatestate_assign_eliminate]: 2.43998e-06 [updatestate_loads_eliminate]: 2.27999e-06 [cse]: 1.987e-05 [renormalize]: 3.7998e-07 [remove_dup_value]: 1.769e-05 [tuple_transform]: 8.589e-05, [1] [Cycle 1]: 7.838e-05, [4] [d_1]: 3.893e-05 [none_parameter_eliminate]: 1.79998e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 7.26999e-06 [partial_unused_args_eliminate]: 4.35999e-06 [add_recomputation]: 4.916e-05 [cse_after_recomputation]: 2.861e-05, [1] [Cycle 1]: 2.161e-05, [1] [cse]: 1.278e-05 [environ_conv]: 8.52e-06 [swap_dp_allreduce_reducescatter]: 7.56999e-06 [bias_add_comm_swap]: 4.85999e-06 [label_micro_interleaved_index]: 7.05e-06 [label_fine_grained_interleaved_index]: 5.35999e-06 [merge_cast_opt]: 3.81001e-06 [slice_recompute_activation]: 4.48001e-06 [micro_interleaved_order_control]: 4.67998e-06 [assign_add_opt]: 3.9e-06 [ForceFp32Comm]: 3.29001e-06 [remove_cast_before_assign_add]: 3.43e-06 [full_micro_interleaved_order_control]: 5.00999e-06 [reorder_send_recv_between_fp_bp]: 4.99e-06 [comm_op_add_attrs]: 3.68e-06 [add_comm_op_reuse_tag]: 3.86001e-06 [interleave_split_concat_branches]: 3.70998e-06 [interleave_parallel_branches]: 3.73999e-06 [overlap_opt_shard_in_pipeline]: 4.00998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.68999e-06 [control_data_broadcast_order]: 1.464e-05 [grouped_pairwise_exchange_alltoall]: 4.55001e-06 [offloading_packed_experts]: 6.97002e-06 [overlap_recompute_and_grad_model_parallel]: 7.26999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.62002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.95e-06 [overlap_recompute_comm]: 4.87e-06 [overlap_grad_ring_attention]: 6.78998e-06 [overlap_grad_flash_sp]: 2.255e-05 [begin_end_overlap_inline]: 2.79999e-06 [split_matmul_comm_elemetwise]: 4.30999e-06 [split_layernorm_comm]: 3.98001e-06 [handle_group_info]: 3.32002e-06 [symbol_engine_optimizer]: 9.505e-05, [1] [Cycle 1]: 8.869e-05, [6] [build]: 2.83e-06 [elim_shapecalc]: 9.73002e-06 [elim_not_effective]: 1.321e-05 [opt_reshape]: 6.46e-06 [fold_const_symbol]: 9.81e-06 [renormalize]: 1.50001e-07 [detach_backward]: 3.55e-06 [pipeline_parallel_scheduler]: 1.79e-06 [auto_monad_reorder]: 1.806e-05 [get_jit_bprop_graph]: 1.81998e-06 [rewriter_after_jit_bprop_graph]: 5.40001e-06 [opt_after_jit_grad]: 0.00051743 [validate]: 4.039e-05 Sums bootstrap : 0.000429s : 4.14% type_inference : 0.005569s : 53.73% event_method : 0.000012s : 0.12% auto_monad : 0.000054s : 0.52% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000027s : 0.26% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000022s : 0.21% optimize.rewriter_before_opt_a : 0.000049s : 0.48% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.32% optimize.opt_a.loop_unroll : 0.000020s : 0.19% optimize.opt_a.a_1 : 0.000393s : 3.79% optimize.opt_a.with_stream_mark : 0.000030s : 0.29% optimize.opt_a.recompute_prepare : 0.000014s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000203s : 1.96% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.12% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.12% optimize.opt_a.merge_send_recv : 0.000014s : 0.13% optimize.opt_a.auto_parallel : 0.000013s : 0.12% optimize.opt_a.parallel : 0.000023s : 0.22% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000006s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.13% optimize.opt_a.virtual_dataset : 0.000012s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000006s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000020s : 0.19% optimize.opt_a.a_after_grad : 0.000019s : 0.18% optimize.opt_a.renormalize : 0.000588s : 5.67% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.22% optimize.opt_a.cse : 0.000046s : 0.44% optimize.opt_a.a_3 : 0.000109s : 1.05% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000041s : 0.40% optimize.convert_after_rewriter : 0.000009s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000528s : 5.10% optimize.opt_b.b_1 : 0.000202s : 1.95% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000021s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.20% optimize.overlap_param_gather : 0.000004s : 0.04% optimize.cconv : 0.000031s : 0.30% optimize.loop_unroll : 0.000449s : 4.33% optimize.opt_after_cconv.c_1 : 0.000027s : 0.26% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000020s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.17% optimize.tuple_transform.d_1 : 0.000039s : 0.38% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000049s : 0.47% optimize.cse_after_recomputation.cse : 0.000013s : 0.12% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000004s : 0.04% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.05% optimize.control_data_broadcast_order : 0.000015s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000023s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000517s : 4.99% validate : 0.000040s : 0.39% Time group info: ------[substitution.] 0.000143 20 1.32% : 0.000002s : 2: substitution.elim_not_effective 0.91% : 0.000001s : 2: substitution.fold_const_symbol 3.59% : 0.000005s : 3: substitution.graph_param_transform 63.35% : 0.000091s : 2: substitution.inline 2.80% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.22% : 0.000005s : 4: substitution.remove_not_recompute_node 2.24% : 0.000003s : 2: substitution.replace_old_param 22.57% : 0.000032s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005520 2 91.60% : 0.005056s : 1: type_inference.infer 8.40% : 0.000464s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000089 2 100.00% : 0.000089s : 2: match.inline ------[predicate.] 0.000135 754 0.90% : 0.000001s : 7: predicate.accumulaten_eliminater 1.01% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.67% : 0.000001s : 6: predicate.addn_check_dump 0.85% : 0.000001s : 7: predicate.addn_zero_filter 0.69% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.31% : 0.000003s : 13: predicate.arithmetic_simplify 0.83% : 0.000001s : 7: predicate.cast_eliminate 0.82% : 0.000001s : 6: predicate.check_bprop_eliminate 0.79% : 0.000001s : 6: predicate.compare_switch_simplify 0.24% : 0.000000s : 3: predicate.const_output_eliminate 0.73% : 0.000001s : 6: predicate.depend_value_elim 0.81% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.96% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.72% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.39% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.43% : 0.000001s : 3: predicate.elim_not_effective 0.59% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.03% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.05% : 0.000001s : 10: predicate.environ_get_depend_swap 1.67% : 0.000002s : 16: predicate.environ_get_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.95% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.97% : 0.000003s : 9: predicate.float_depend_g_call 0.65% : 0.000001s : 6: predicate.float_environ_get_switch 0.95% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 3: predicate.fold_const_symbol 1.06% : 0.000001s : 6: predicate.get_grad_eliminate 0.31% : 0.000000s : 3: predicate.graph_param_transform 0.78% : 0.000001s : 6: predicate.incorporate_call 0.66% : 0.000001s : 6: predicate.incorporate_call_switch 6.21% : 0.000008s : 34: predicate.inline 1.14% : 0.000002s : 6: predicate.inline_without_move 0.44% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.99% : 0.000001s : 6: predicate.less_batch_normalization 1.68% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.96% : 0.000003s : 20: predicate.load_eliminater 1.33% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.54% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.65% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.67% : 0.000001s : 6: predicate.merge_addn 0.71% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.71% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 7: predicate.minmaximum_grad 1.94% : 0.000003s : 3: predicate.mutable_eliminate 0.42% : 0.000001s : 3: predicate.opt_reshape 0.50% : 0.000001s : 3: predicate.parallel_virtual_node 1.30% : 0.000002s : 9: predicate.partial_defer_inline 1.16% : 0.000002s : 10: predicate.partial_eliminate 0.82% : 0.000001s : 7: predicate.print_const_string_wrapper 0.80% : 0.000001s : 6: predicate.reduce_all_const_elim 1.04% : 0.000001s : 7: predicate.reduce_eliminate 1.99% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.67% : 0.000001s : 6: predicate.remove_not_recompute_node 1.15% : 0.000002s : 13: predicate.replace_applicator 0.79% : 0.000001s : 6: predicate.replace_old_param 0.34% : 0.000000s : 3: predicate.reset_defer_inline 0.87% : 0.000001s : 7: predicate.reshape_eliminate 0.94% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 3: predicate.row_tensor_eliminate 1.08% : 0.000001s : 6: predicate.same_eliminate 0.54% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.05% : 0.000001s : 6: predicate.shard_identity_eliminate 0.96% : 0.000001s : 6: predicate.special_op_eliminate 0.96% : 0.000001s : 6: predicate.specialize_transform 1.27% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 1.03% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.99% : 0.000001s : 9: predicate.switch_defer_inline 1.61% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.42% : 0.000006s : 32: predicate.switch_simplify 0.71% : 0.000001s : 7: predicate.tile_eliminate 0.79% : 0.000001s : 7: predicate.transpose_eliminate 1.52% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.49% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.33% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.54% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.48% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.42% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.50% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.94% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 3.02% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.57% : 0.000001s : 3: predicate.value_based_eliminate 0.77% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.76% : 0.000001s : 6: predicate.virtual_output_eliminate 0.30% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.66% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000250 5 7.10% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.90% : 0.000232s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025017 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.18% : 0.003297s : 1: add_attr 13.12% : 0.003282s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000053s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.25% : 0.000063s : 1: auto_monad 0.10% : 0.000025s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.89% : 0.000472s : 1: bootstrap 0.14% : 0.000034s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000012s : 1: convert_after_rewriter 0.13% : 0.000032s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000019s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.09% : 0.000022s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000011s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.82% : 0.000455s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.14% : 0.000534s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 3.05% : 0.000762s : 78: opt.transform.opt_a 0.10% : 0.000026s : 1: opt.transform.opt_after_cconv 0.10% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.54% : 0.000134s : 28: opt.transform.opt_b 0.18% : 0.000044s : 2: opt.transform.opt_trans_graph 0.14% : 0.000035s : 4: opt.transform.symbol_engine_opt 10.16% : 0.002542s : 1: opt_a 0.52% : 0.000131s : 1: opt_after_cconv 2.11% : 0.000529s : 1: opt_after_jit_grad 1.27% : 0.000317s : 1: opt_b 20.67% : 0.005171s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000026s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000007s : 1: overlap_param_gather 0.03% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000008s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000034s : 1: pre_auto_parallel 0.10% : 0.000025s : 1: py_interpret_to_execute 0.07% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000021s : 1: remove_dup_value 1.28% : 0.000321s : 1: renormalize.infer 1.03% : 0.000258s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.05% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000044s : 1: rewriter_after_opt_a 0.21% : 0.000053s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000098s : 1: symbol_engine_optimizer 0.36% : 0.000089s : 1: tuple_transform 22.38% : 0.005599s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:58.126.600 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0144275, [21] [bootstrap]: 0.00046393 [type_inference]: 0.00573939 [event_method]: 1.235e-05 [auto_monad]: 5.612e-05 [graph_reusing]: 6.46e-06 [inline]: 2.31e-06 [add_attr]: 0.00327005, [1] [add_attr_with_inline]: 0.00326017, [1] [Cycle 1]: 5.898e-05, [2] [tag_attr]: 1.476e-05 [meta_addattr_fg_expand]: 3.75e-06 [parallel-infer-symbol]: 2.99001e-06 [pre_auto_parallel]: 2.686e-05 [insert-virtual-dataset]: 2.27999e-06 [parallel-infer-symbol-second]: 6.99976e-07 [dataset_repeat_opt]: 2.33998e-06 [pipeline_split]: 1.60001e-06 [optimize]: 0.0041407, [53] [py_interpret_to_execute]: 1.75e-05 [rewriter_before_opt_a]: 4.564e-05 [opt_a]: 0.0021412, [2] [Cycle 1]: 0.00151294, [45] [expand_dump_flag]: 2.89999e-06 [switch_simplify]: 2.431e-05 [loop_unroll]: 1.37e-05 [a_1]: 0.00031822 [with_stream_mark]: 1.694e-05 [recompute_prepare]: 7.65e-06 [updatestate_depend_eliminate]: 4.89e-06 [updatestate_assign_eliminate]: 2.99001e-06 [updatestate_loads_eliminate]: 2.93e-06 [parameter_eliminate]: 1.85001e-06 [a_2]: 7.985e-05 [accelerated_algorithm]: 6.16998e-06 [shard]: 1.94e-06 [meta_shard_fg_expand]: 1.74e-06 [shard_inline]: 6.44001e-06 [merge_send_recv]: 7.93001e-06 [auto_parallel]: 6.53e-06 [parallel]: 1.82e-05 [flash_sp]: 8.29002e-06 [merge_comm]: 4.23999e-06 [allreduce_fusion]: 3.35e-06 [matmul_add_comm_reduction]: 1.032e-05 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 8.27e-06 [virtual_dataset]: 6.01e-06 [get_grad_eliminate_]: 5.87999e-06 [virtual_output]: 6.31998e-06 [merge_forward]: 3.78001e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 9.55001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.262e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.034e-05 [set_forward_comm_id_for_comm_node_pass]: 3.53999e-06 [meta_fg_expand]: 2.68e-06 [flash_sp_send_recv_attached]: 2.53e-06 [receive_attached]: 1.96003e-06 [after_resolve]: 9.63002e-06 [a_after_grad]: 9.05999e-06 [renormalize]: 0.00052526 [add_forward_monad_depend]: 5.37001e-06 [auto_monad_grad]: 2.22001e-06 [auto_monad_eliminator]: 1.357e-05 [cse]: 2.993e-05 [a_3]: 4.639e-05 [Cycle 2]: 0.0006182, [45] [expand_dump_flag]: 1.49998e-06 [switch_simplify]: 6.94999e-06 [loop_unroll]: 5.94e-06 [a_1]: 0.00010684 [with_stream_mark]: 1.178e-05 [recompute_prepare]: 6.17999e-06 [updatestate_depend_eliminate]: 3.03998e-06 [updatestate_assign_eliminate]: 2.44001e-06 [updatestate_loads_eliminate]: 3.10998e-06 [parameter_eliminate]: 1.15999e-06 [a_2]: 6.979e-05 [accelerated_algorithm]: 6.07001e-06 [shard]: 1.27999e-06 [meta_shard_fg_expand]: 1.70001e-06 [shard_inline]: 5.82001e-06 [merge_send_recv]: 5.29e-06 [auto_parallel]: 5.42001e-06 [parallel]: 5.47001e-06 [flash_sp]: 3.18e-06 [merge_comm]: 3.51001e-06 [allreduce_fusion]: 2.91e-06 [matmul_add_comm_reduction]: 9.46998e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 6.84001e-06 [virtual_dataset]: 5.81e-06 [get_grad_eliminate_]: 5.84e-06 [virtual_output]: 5.27001e-06 [merge_forward]: 2.73e-06 [cell_reuse_recompute_pass]: 1.57999e-06 [offload_activation]: 6.70998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.283e-05 [merge_recompute_call_nodes]: 9.89996e-07 [before_grad]: 9.17001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.61001e-06 [meta_fg_expand]: 2.20002e-06 [flash_sp_send_recv_attached]: 8.59989e-07 [receive_attached]: 1.41002e-06 [after_resolve]: 8.75999e-06 [a_after_grad]: 8.99e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.14998e-06 [auto_monad_grad]: 1.03001e-06 [auto_monad_eliminator]: 8.07e-06 [cse]: 1.525e-05 [a_3]: 3.497e-05 [py_interpret_to_execute_after_opt_a]: 8.72e-06 [slice_cell_reuse_recomputed_activation]: 1.82001e-06 [rewriter_after_opt_a]: 3.57e-05 [convert_after_rewriter]: 6.63e-06 [order_py_execute_after_rewriter]: 5.10001e-06 [mutable_eliminate]: 0.00051565 [opt_b]: 0.00024293, [1] [Cycle 1]: 0.00023667, [7] [b_1]: 0.00015666 [b_2]: 7.93001e-06 [updatestate_depend_eliminate]: 6.68e-06 [updatestate_assign_eliminate]: 2.64999e-06 [updatestate_loads_eliminate]: 2.32999e-06 [renormalize]: 4.59986e-07 [cse]: 2.185e-05 [optimize_parallel_all_gather_comm]: 1.577e-05 [overlap_param_gather]: 1.93997e-06 [cconv]: 2.794e-05 [loop_unroll]: 0.00042691 [opt_after_cconv]: 9.942e-05, [1] [Cycle 1]: 9.338e-05, [7] [c_1]: 2.669e-05 [parameter_eliminate]: 4.02e-06 [updatestate_depend_eliminate]: 4.95001e-06 [updatestate_assign_eliminate]: 2.34001e-06 [updatestate_loads_eliminate]: 2.85998e-06 [cse]: 1.905e-05 [renormalize]: 3.49974e-07 [remove_dup_value]: 1.513e-05 [tuple_transform]: 6.944e-05, [1] [Cycle 1]: 6.466e-05, [4] [d_1]: 3.814e-05 [none_parameter_eliminate]: 2.19999e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.17999e-06 [partial_unused_args_eliminate]: 2.37999e-06 [add_recomputation]: 4.786e-05 [cse_after_recomputation]: 2.157e-05, [1] [Cycle 1]: 1.737e-05, [1] [cse]: 1.178e-05 [environ_conv]: 4.91002e-06 [swap_dp_allreduce_reducescatter]: 4.94e-06 [bias_add_comm_swap]: 2.48e-06 [label_micro_interleaved_index]: 4.58999e-06 [label_fine_grained_interleaved_index]: 2.63e-06 [merge_cast_opt]: 1.55999e-06 [slice_recompute_activation]: 2.61e-06 [micro_interleaved_order_control]: 2.71999e-06 [assign_add_opt]: 1.29e-06 [ForceFp32Comm]: 8.70001e-07 [remove_cast_before_assign_add]: 1.29998e-06 [full_micro_interleaved_order_control]: 2.49999e-06 [reorder_send_recv_between_fp_bp]: 3.03e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 9.60019e-07 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.34e-06 [overlap_opt_shard_in_pipeline]: 1.19e-06 [overlap_opt_shard_grad_in_pipeline]: 1.99e-06 [control_data_broadcast_order]: 1.279e-05 [grouped_pairwise_exchange_alltoall]: 1.71e-06 [offloading_packed_experts]: 4e-06 [overlap_recompute_and_grad_model_parallel]: 4.84e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.45002e-06 [overlap_grad_ring_attention]: 4.15999e-06 [overlap_grad_flash_sp]: 1.936e-05 [begin_end_overlap_inline]: 4.69998e-07 [split_matmul_comm_elemetwise]: 2.04999e-06 [split_layernorm_comm]: 1.91e-06 [handle_group_info]: 9.40025e-07 [symbol_engine_optimizer]: 7.456e-05, [1] [Cycle 1]: 7.021e-05, [6] [build]: 2.84999e-06 [elim_shapecalc]: 9.74e-06 [elim_not_effective]: 1.27e-05 [opt_reshape]: 6.98e-06 [fold_const_symbol]: 1.002e-05 [renormalize]: 1.79978e-07 [detach_backward]: 1.87999e-06 [pipeline_parallel_scheduler]: 1.79998e-06 [auto_monad_reorder]: 1.596e-05 [get_jit_bprop_graph]: 2.24001e-06 [rewriter_after_jit_bprop_graph]: 4.95001e-06 [opt_after_jit_grad]: 0.00046699 [validate]: 3.742e-05 Sums bootstrap : 0.000464s : 4.56% type_inference : 0.005739s : 56.35% event_method : 0.000012s : 0.12% auto_monad : 0.000056s : 0.55% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000027s : 0.26% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000017s : 0.17% optimize.rewriter_before_opt_a : 0.000046s : 0.45% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000031s : 0.31% optimize.opt_a.loop_unroll : 0.000020s : 0.19% optimize.opt_a.a_1 : 0.000425s : 4.17% optimize.opt_a.with_stream_mark : 0.000029s : 0.28% optimize.opt_a.recompute_prepare : 0.000014s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000150s : 1.47% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.12% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.12% optimize.opt_a.merge_send_recv : 0.000013s : 0.13% optimize.opt_a.auto_parallel : 0.000012s : 0.12% optimize.opt_a.parallel : 0.000024s : 0.23% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000006s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.19% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.15% optimize.opt_a.virtual_dataset : 0.000012s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000018s : 0.18% optimize.opt_a.a_after_grad : 0.000018s : 0.18% optimize.opt_a.renormalize : 0.000525s : 5.16% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.21% optimize.opt_a.cse : 0.000045s : 0.44% optimize.opt_a.a_3 : 0.000081s : 0.80% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000036s : 0.35% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000516s : 5.06% optimize.opt_b.b_1 : 0.000157s : 1.54% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000028s : 0.27% optimize.loop_unroll : 0.000427s : 4.19% optimize.opt_after_cconv.c_1 : 0.000027s : 0.26% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000019s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.15% optimize.tuple_transform.d_1 : 0.000038s : 0.37% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000048s : 0.47% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.02% optimize.slice_recompute_activation : 0.000003s : 0.03% optimize.micro_interleaved_order_control : 0.000003s : 0.03% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000019s : 0.19% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000016s : 0.16% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000467s : 4.59% validate : 0.000037s : 0.37% Time group info: ------[substitution.] 0.000141 20 1.35% : 0.000002s : 2: substitution.elim_not_effective 0.99% : 0.000001s : 2: substitution.fold_const_symbol 4.37% : 0.000006s : 3: substitution.graph_param_transform 63.59% : 0.000090s : 2: substitution.inline 2.67% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.09% : 0.000004s : 4: substitution.remove_not_recompute_node 2.65% : 0.000004s : 2: substitution.replace_old_param 21.30% : 0.000030s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005687 2 91.31% : 0.005193s : 1: type_inference.infer 8.69% : 0.000494s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000088 2 100.00% : 0.000088s : 2: match.inline ------[predicate.] 0.000137 754 0.91% : 0.000001s : 7: predicate.accumulaten_eliminater 1.21% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.65% : 0.000001s : 6: predicate.addn_check_dump 0.83% : 0.000001s : 7: predicate.addn_zero_filter 0.77% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.87% : 0.000004s : 13: predicate.arithmetic_simplify 0.70% : 0.000001s : 7: predicate.cast_eliminate 0.79% : 0.000001s : 6: predicate.check_bprop_eliminate 0.67% : 0.000001s : 6: predicate.compare_switch_simplify 0.24% : 0.000000s : 3: predicate.const_output_eliminate 0.73% : 0.000001s : 6: predicate.depend_value_elim 0.78% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 1.00% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.74% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.32% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 3: predicate.elim_not_effective 0.76% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000002s : 10: predicate.environ_add_const_eliminate 1.05% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.94% : 0.000001s : 10: predicate.environ_get_depend_swap 1.68% : 0.000002s : 16: predicate.environ_get_eliminate 0.92% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.92% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.06% : 0.000003s : 9: predicate.float_depend_g_call 0.58% : 0.000001s : 6: predicate.float_environ_get_switch 0.89% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 3: predicate.fold_const_symbol 0.84% : 0.000001s : 6: predicate.get_grad_eliminate 0.26% : 0.000000s : 3: predicate.graph_param_transform 0.73% : 0.000001s : 6: predicate.incorporate_call 0.64% : 0.000001s : 6: predicate.incorporate_call_switch 6.26% : 0.000009s : 34: predicate.inline 0.99% : 0.000001s : 6: predicate.inline_without_move 0.38% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.98% : 0.000001s : 6: predicate.less_batch_normalization 1.75% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.94% : 0.000003s : 20: predicate.load_eliminater 1.42% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.71% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.80% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.67% : 0.000001s : 6: predicate.merge_addn 0.65% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.70% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.72% : 0.000001s : 7: predicate.minmaximum_grad 2.00% : 0.000003s : 3: predicate.mutable_eliminate 0.44% : 0.000001s : 3: predicate.opt_reshape 0.55% : 0.000001s : 3: predicate.parallel_virtual_node 1.18% : 0.000002s : 9: predicate.partial_defer_inline 1.21% : 0.000002s : 10: predicate.partial_eliminate 1.08% : 0.000001s : 7: predicate.print_const_string_wrapper 0.71% : 0.000001s : 6: predicate.reduce_all_const_elim 1.07% : 0.000001s : 7: predicate.reduce_eliminate 1.97% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.62% : 0.000001s : 6: predicate.remove_not_recompute_node 1.13% : 0.000002s : 13: predicate.replace_applicator 0.82% : 0.000001s : 6: predicate.replace_old_param 0.36% : 0.000000s : 3: predicate.reset_defer_inline 0.88% : 0.000001s : 7: predicate.reshape_eliminate 0.70% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 3: predicate.row_tensor_eliminate 1.04% : 0.000001s : 6: predicate.same_eliminate 0.56% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.64% : 0.000002s : 6: predicate.shard_identity_eliminate 1.05% : 0.000001s : 6: predicate.special_op_eliminate 1.02% : 0.000001s : 6: predicate.specialize_transform 1.08% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.94% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.98% : 0.000001s : 9: predicate.switch_defer_inline 1.61% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.16% : 0.000006s : 32: predicate.switch_simplify 0.80% : 0.000001s : 7: predicate.tile_eliminate 0.81% : 0.000001s : 7: predicate.transpose_eliminate 1.53% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.32% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.06% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.37% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.54% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.49% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.92% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.79% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.65% : 0.000001s : 3: predicate.value_based_eliminate 0.77% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.81% : 0.000001s : 6: predicate.virtual_output_eliminate 0.32% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.57% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000257 5 6.95% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 93.05% : 0.000239s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023300 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.06% : 0.003275s : 1: add_attr 14.01% : 0.003264s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000052s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000061s : 1: auto_monad 0.08% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 2.12% : 0.000495s : 1: bootstrap 0.13% : 0.000031s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.10% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.08% : 0.000020s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000005s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.87% : 0.000435s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 2.25% : 0.000524s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 3.37% : 0.000785s : 78: opt.transform.opt_a 0.11% : 0.000025s : 1: opt.transform.opt_after_cconv 0.11% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.56% : 0.000131s : 28: opt.transform.opt_b 0.18% : 0.000042s : 2: opt.transform.opt_trans_graph 0.15% : 0.000036s : 4: opt.transform.symbol_engine_opt 9.20% : 0.002144s : 1: opt_a 0.44% : 0.000103s : 1: opt_after_cconv 2.04% : 0.000476s : 1: opt_after_jit_grad 1.06% : 0.000247s : 1: opt_b 17.79% : 0.004146s : 1: optimize 0.08% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000031s : 1: pre_auto_parallel 0.09% : 0.000021s : 1: py_interpret_to_execute 0.05% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000019s : 1: remove_dup_value 1.21% : 0.000282s : 1: renormalize.infer 1.01% : 0.000236s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000040s : 1: rewriter_after_opt_a 0.21% : 0.000050s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000077s : 1: symbol_engine_optimizer 0.31% : 0.000072s : 1: tuple_transform 24.72% : 0.005759s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:58.324.688 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:58.324.967 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0156473, [21] [bootstrap]: 0.00044466 [type_inference]: 0.00575615 [event_method]: 1.301e-05 [auto_monad]: 5.443e-05 [graph_reusing]: 5.04998e-06 [inline]: 2.47001e-06 [add_attr]: 0.00323413, [1] [add_attr_with_inline]: 0.00322533, [1] [Cycle 1]: 7.314e-05, [2] [tag_attr]: 1.38e-05 [meta_addattr_fg_expand]: 3.51999e-06 [parallel-infer-symbol]: 3.06001e-06 [pre_auto_parallel]: 2.805e-05 [insert-virtual-dataset]: 2.49001e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 1.67001e-06 [pipeline_split]: 1.74e-06 [optimize]: 0.00490501, [53] [py_interpret_to_execute]: 2.193e-05 [rewriter_before_opt_a]: 5.077e-05 [opt_a]: 0.00257836, [2] [Cycle 1]: 0.00175114, [45] [expand_dump_flag]: 2.74999e-06 [switch_simplify]: 2.486e-05 [loop_unroll]: 1.401e-05 [a_1]: 0.00028686 [with_stream_mark]: 1.647e-05 [recompute_prepare]: 8.70999e-06 [updatestate_depend_eliminate]: 4.33001e-06 [updatestate_assign_eliminate]: 3.85998e-06 [updatestate_loads_eliminate]: 2.94001e-06 [parameter_eliminate]: 2.48e-06 [a_2]: 0.00010845 [accelerated_algorithm]: 6.73e-06 [shard]: 2.46e-06 [meta_shard_fg_expand]: 1.99999e-06 [shard_inline]: 6.16e-06 [merge_send_recv]: 8.82e-06 [auto_parallel]: 6.29001e-06 [parallel]: 1.816e-05 [flash_sp]: 8.23999e-06 [merge_comm]: 3.9e-06 [allreduce_fusion]: 3.41001e-06 [matmul_add_comm_reduction]: 1.047e-05 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 8.94e-06 [virtual_dataset]: 6.61999e-06 [get_grad_eliminate_]: 5.84e-06 [virtual_output]: 5.91e-06 [merge_forward]: 4.33001e-06 [cell_reuse_recompute_pass]: 1.47001e-06 [offload_activation]: 9.86998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.527e-05 [merge_recompute_call_nodes]: 1.74e-06 [before_grad]: 1.026e-05 [set_forward_comm_id_for_comm_node_pass]: 4.25e-06 [meta_fg_expand]: 2.62001e-06 [flash_sp_send_recv_attached]: 2.69999e-06 [receive_attached]: 2.46e-06 [after_resolve]: 9.74e-06 [a_after_grad]: 9.67001e-06 [renormalize]: 0.00059296 [add_forward_monad_depend]: 5.20999e-06 [auto_monad_grad]: 2.32999e-06 [auto_monad_eliminator]: 1.506e-05 [cse]: 3.12e-05 [a_3]: 6.152e-05 [Cycle 2]: 0.00081451, [45] [expand_dump_flag]: 9.30013e-07 [switch_simplify]: 7.39002e-06 [loop_unroll]: 6.01e-06 [a_1]: 0.00010714 [with_stream_mark]: 1.186e-05 [recompute_prepare]: 7.23e-06 [updatestate_depend_eliminate]: 3.27002e-06 [updatestate_assign_eliminate]: 2.64999e-06 [updatestate_loads_eliminate]: 2.69999e-06 [parameter_eliminate]: 1.84998e-06 [a_2]: 9.822e-05 [accelerated_algorithm]: 6.28e-06 [shard]: 2.08002e-06 [meta_shard_fg_expand]: 1.69e-06 [shard_inline]: 6.30002e-06 [merge_send_recv]: 5.49e-06 [auto_parallel]: 6.16e-06 [parallel]: 5.43002e-06 [flash_sp]: 3.60998e-06 [merge_comm]: 3.21001e-06 [allreduce_fusion]: 3.35e-06 [matmul_add_comm_reduction]: 1.006e-05 [allreduce_slice_to_reducescatter]: 5.19998e-07 [virtual_shard_identity]: 6.81001e-06 [virtual_dataset]: 6.04001e-06 [get_grad_eliminate_]: 5.33002e-06 [virtual_output]: 5.42001e-06 [merge_forward]: 2.96001e-06 [cell_reuse_recompute_pass]: 1.53002e-06 [offload_activation]: 7.19001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.482e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 9.52001e-06 [set_forward_comm_id_for_comm_node_pass]: 4.57998e-06 [meta_fg_expand]: 2.16e-06 [flash_sp_send_recv_attached]: 1.11002e-06 [receive_attached]: 1.05001e-06 [after_resolve]: 9.32999e-06 [a_after_grad]: 8.64e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.02999e-06 [auto_monad_grad]: 1.39e-06 [auto_monad_eliminator]: 9.87999e-06 [cse]: 1.787e-05 [a_3]: 4.901e-05 [py_interpret_to_execute_after_opt_a]: 1.269e-05 [slice_cell_reuse_recomputed_activation]: 4.93001e-06 [rewriter_after_opt_a]: 4.359e-05 [convert_after_rewriter]: 1.008e-05 [order_py_execute_after_rewriter]: 8.43001e-06 [mutable_eliminate]: 0.00053376 [opt_b]: 0.00031754, [1] [Cycle 1]: 0.00030744, [7] [b_1]: 0.00020268 [b_2]: 7.95998e-06 [updatestate_depend_eliminate]: 8.18999e-06 [updatestate_assign_eliminate]: 2.71e-06 [updatestate_loads_eliminate]: 2.36e-06 [renormalize]: 5.00004e-07 [cse]: 2.209e-05 [optimize_parallel_all_gather_comm]: 2.003e-05 [overlap_param_gather]: 4.58999e-06 [cconv]: 2.96e-05 [loop_unroll]: 0.00046735 [opt_after_cconv]: 0.0001267, [1] [Cycle 1]: 0.00011752, [7] [c_1]: 2.807e-05 [parameter_eliminate]: 3.25998e-06 [updatestate_depend_eliminate]: 5.57001e-06 [updatestate_assign_eliminate]: 2.67001e-06 [updatestate_loads_eliminate]: 2.46e-06 [cse]: 1.932e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.855e-05 [tuple_transform]: 8.485e-05, [1] [Cycle 1]: 7.752e-05, [4] [d_1]: 3.812e-05 [none_parameter_eliminate]: 2.03997e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 6.78e-06 [partial_unused_args_eliminate]: 4.48999e-06 [add_recomputation]: 4.775e-05 [cse_after_recomputation]: 2.774e-05, [1] [Cycle 1]: 2.111e-05, [1] [cse]: 1.205e-05 [environ_conv]: 8.02e-06 [swap_dp_allreduce_reducescatter]: 7.8e-06 [bias_add_comm_swap]: 5.04003e-06 [label_micro_interleaved_index]: 7.28e-06 [label_fine_grained_interleaved_index]: 5.00001e-06 [merge_cast_opt]: 3.77002e-06 [slice_recompute_activation]: 4.68001e-06 [micro_interleaved_order_control]: 5.03002e-06 [assign_add_opt]: 3.76999e-06 [ForceFp32Comm]: 3.55e-06 [remove_cast_before_assign_add]: 3.5e-06 [full_micro_interleaved_order_control]: 4.42998e-06 [reorder_send_recv_between_fp_bp]: 5.46e-06 [comm_op_add_attrs]: 3.80998e-06 [add_comm_op_reuse_tag]: 3.31001e-06 [interleave_split_concat_branches]: 3.54002e-06 [interleave_parallel_branches]: 3.69002e-06 [overlap_opt_shard_in_pipeline]: 3.81999e-06 [overlap_opt_shard_grad_in_pipeline]: 5.00001e-06 [control_data_broadcast_order]: 1.565e-05 [grouped_pairwise_exchange_alltoall]: 4.27998e-06 [offloading_packed_experts]: 6.66999e-06 [overlap_recompute_and_grad_model_parallel]: 7.90998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.86001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.63e-06 [overlap_recompute_comm]: 4.88001e-06 [overlap_grad_ring_attention]: 6.87002e-06 [overlap_grad_flash_sp]: 2.414e-05 [begin_end_overlap_inline]: 3.13e-06 [split_matmul_comm_elemetwise]: 4.82998e-06 [split_layernorm_comm]: 4.1e-06 [handle_group_info]: 3.37997e-06 [symbol_engine_optimizer]: 9.822e-05, [1] [Cycle 1]: 9.096e-05, [6] [build]: 3.38e-06 [elim_shapecalc]: 1.085e-05 [elim_not_effective]: 1.264e-05 [opt_reshape]: 7.33e-06 [fold_const_symbol]: 9.92001e-06 [renormalize]: 1.50001e-07 [detach_backward]: 4.03001e-06 [pipeline_parallel_scheduler]: 1.72999e-06 [auto_monad_reorder]: 1.977e-05 [get_jit_bprop_graph]: 1.47001e-06 [rewriter_after_jit_bprop_graph]: 4.95999e-06 [opt_after_jit_grad]: 0.00051465 [validate]: 3.81e-05 Sums bootstrap : 0.000445s : 4.19% type_inference : 0.005756s : 54.18% event_method : 0.000013s : 0.12% auto_monad : 0.000054s : 0.51% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.13% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.03% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000028s : 0.26% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000022s : 0.21% optimize.rewriter_before_opt_a : 0.000051s : 0.48% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000032s : 0.30% optimize.opt_a.loop_unroll : 0.000020s : 0.19% optimize.opt_a.a_1 : 0.000394s : 3.71% optimize.opt_a.with_stream_mark : 0.000028s : 0.27% optimize.opt_a.recompute_prepare : 0.000016s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000207s : 1.95% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.12% optimize.opt_a.shard : 0.000005s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.12% optimize.opt_a.merge_send_recv : 0.000014s : 0.13% optimize.opt_a.auto_parallel : 0.000012s : 0.12% optimize.opt_a.parallel : 0.000024s : 0.22% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.19% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.15% optimize.opt_a.virtual_dataset : 0.000013s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.11% optimize.opt_a.virtual_output : 0.000011s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000019s : 0.18% optimize.opt_a.a_after_grad : 0.000018s : 0.17% optimize.opt_a.renormalize : 0.000593s : 5.58% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.23% optimize.opt_a.cse : 0.000049s : 0.46% optimize.opt_a.a_3 : 0.000111s : 1.04% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000044s : 0.41% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000534s : 5.02% optimize.opt_b.b_1 : 0.000203s : 1.91% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000030s : 0.28% optimize.loop_unroll : 0.000467s : 4.40% optimize.opt_after_cconv.c_1 : 0.000028s : 0.26% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.17% optimize.tuple_transform.d_1 : 0.000038s : 0.36% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000048s : 0.45% optimize.cse_after_recomputation.cse : 0.000012s : 0.11% optimize.environ_conv : 0.000008s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.05% optimize.control_data_broadcast_order : 0.000016s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000024s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.19% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000515s : 4.84% validate : 0.000038s : 0.36% Time group info: ------[substitution.] 0.000139 20 1.31% : 0.000002s : 2: substitution.elim_not_effective 0.97% : 0.000001s : 2: substitution.fold_const_symbol 3.87% : 0.000005s : 3: substitution.graph_param_transform 63.12% : 0.000087s : 2: substitution.inline 2.87% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.25% : 0.000005s : 4: substitution.remove_not_recompute_node 2.51% : 0.000003s : 2: substitution.replace_old_param 22.11% : 0.000031s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005704 2 91.94% : 0.005245s : 1: type_inference.infer 8.06% : 0.000460s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000086 2 100.00% : 0.000086s : 2: match.inline ------[predicate.] 0.000138 754 0.84% : 0.000001s : 7: predicate.accumulaten_eliminater 1.06% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.65% : 0.000001s : 6: predicate.addn_check_dump 0.93% : 0.000001s : 7: predicate.addn_zero_filter 0.65% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.40% : 0.000003s : 13: predicate.arithmetic_simplify 0.71% : 0.000001s : 7: predicate.cast_eliminate 0.91% : 0.000001s : 6: predicate.check_bprop_eliminate 0.72% : 0.000001s : 6: predicate.compare_switch_simplify 0.23% : 0.000000s : 3: predicate.const_output_eliminate 0.71% : 0.000001s : 6: predicate.depend_value_elim 0.82% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.95% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.75% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.17% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.29% : 0.000000s : 3: predicate.elim_not_effective 0.47% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.04% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.09% : 0.000002s : 10: predicate.environ_get_depend_swap 1.67% : 0.000002s : 16: predicate.environ_get_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.87% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.85% : 0.000003s : 9: predicate.float_depend_g_call 0.63% : 0.000001s : 6: predicate.float_environ_get_switch 0.91% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.24% : 0.000000s : 3: predicate.fold_const_symbol 0.74% : 0.000001s : 6: predicate.get_grad_eliminate 0.26% : 0.000000s : 3: predicate.graph_param_transform 0.76% : 0.000001s : 6: predicate.incorporate_call 0.63% : 0.000001s : 6: predicate.incorporate_call_switch 6.41% : 0.000009s : 34: predicate.inline 1.14% : 0.000002s : 6: predicate.inline_without_move 0.41% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.10% : 0.000002s : 6: predicate.less_batch_normalization 1.74% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.01% : 0.000003s : 20: predicate.load_eliminater 1.27% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.62% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.85% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.69% : 0.000001s : 6: predicate.merge_addn 0.72% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.77% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.63% : 0.000001s : 7: predicate.minmaximum_grad 1.65% : 0.000002s : 3: predicate.mutable_eliminate 0.44% : 0.000001s : 3: predicate.opt_reshape 0.49% : 0.000001s : 3: predicate.parallel_virtual_node 1.25% : 0.000002s : 9: predicate.partial_defer_inline 1.20% : 0.000002s : 10: predicate.partial_eliminate 0.75% : 0.000001s : 7: predicate.print_const_string_wrapper 0.67% : 0.000001s : 6: predicate.reduce_all_const_elim 0.96% : 0.000001s : 7: predicate.reduce_eliminate 2.00% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.61% : 0.000001s : 6: predicate.remove_not_recompute_node 1.15% : 0.000002s : 13: predicate.replace_applicator 0.66% : 0.000001s : 6: predicate.replace_old_param 0.33% : 0.000000s : 3: predicate.reset_defer_inline 1.03% : 0.000001s : 7: predicate.reshape_eliminate 0.74% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 3: predicate.row_tensor_eliminate 1.17% : 0.000002s : 6: predicate.same_eliminate 0.74% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.43% : 0.000002s : 6: predicate.shard_identity_eliminate 1.14% : 0.000002s : 6: predicate.special_op_eliminate 0.93% : 0.000001s : 6: predicate.specialize_transform 1.18% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.96% : 0.000001s : 9: predicate.switch_defer_inline 1.69% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.33% : 0.000006s : 32: predicate.switch_simplify 0.88% : 0.000001s : 7: predicate.tile_eliminate 0.76% : 0.000001s : 7: predicate.transpose_eliminate 1.56% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.28% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.36% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.48% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.56% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.61% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.90% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.80% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.84% : 0.000001s : 3: predicate.value_based_eliminate 1.30% : 0.000002s : 6: predicate.virtual_dataset_eliminate 0.71% : 0.000001s : 6: predicate.virtual_output_eliminate 0.32% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.57% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000215 5 8.08% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.92% : 0.000198s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025286 192 0.03% : 0.000006s : 1: ForceFp32Comm 12.83% : 0.003244s : 1: add_attr 12.77% : 0.003229s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.20% : 0.000051s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.25% : 0.000063s : 1: auto_monad 0.11% : 0.000028s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.94% : 0.000490s : 1: bootstrap 0.13% : 0.000033s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.12% : 0.000031s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.08% : 0.000021s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.09% : 0.000024s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000011s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.87% : 0.000473s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.14% : 0.000540s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 3.03% : 0.000766s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.10% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.54% : 0.000135s : 28: opt.transform.opt_b 0.17% : 0.000042s : 2: opt.transform.opt_trans_graph 0.15% : 0.000037s : 4: opt.transform.symbol_engine_opt 10.21% : 0.002581s : 1: opt_a 0.52% : 0.000130s : 1: opt_after_cconv 2.08% : 0.000526s : 1: opt_after_jit_grad 1.27% : 0.000321s : 1: opt_b 20.83% : 0.005267s : 1: optimize 0.09% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.11% : 0.000027s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000036s : 1: pre_auto_parallel 0.10% : 0.000025s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000022s : 1: remove_dup_value 1.31% : 0.000331s : 1: renormalize.infer 0.90% : 0.000227s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.05% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000047s : 1: rewriter_after_opt_a 0.21% : 0.000054s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000101s : 1: symbol_engine_optimizer 0.35% : 0.000088s : 1: tuple_transform 22.90% : 0.005790s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:58.527.259 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0144724, [21] [bootstrap]: 0.00044252 [type_inference]: 0.00559153 [event_method]: 1.349e-05 [auto_monad]: 5.893e-05 [graph_reusing]: 5.56e-06 [inline]: 2.34001e-06 [add_attr]: 0.00323343, [1] [add_attr_with_inline]: 0.00322462, [1] [Cycle 1]: 5.834e-05, [2] [tag_attr]: 1.504e-05 [meta_addattr_fg_expand]: 3.63e-06 [parallel-infer-symbol]: 3.19001e-06 [pre_auto_parallel]: 3.017e-05 [insert-virtual-dataset]: 2.38002e-06 [parallel-infer-symbol-second]: 9.20001e-07 [dataset_repeat_opt]: 2.23002e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00435982, [53] [py_interpret_to_execute]: 1.917e-05 [rewriter_before_opt_a]: 4.713e-05 [opt_a]: 0.0022631, [2] [Cycle 1]: 0.00158769, [45] [expand_dump_flag]: 3.36001e-06 [switch_simplify]: 2.551e-05 [loop_unroll]: 1.418e-05 [a_1]: 0.0002923 [with_stream_mark]: 1.771e-05 [recompute_prepare]: 7.98001e-06 [updatestate_depend_eliminate]: 3.97998e-06 [updatestate_assign_eliminate]: 3.54002e-06 [updatestate_loads_eliminate]: 2.96001e-06 [parameter_eliminate]: 1.96e-06 [a_2]: 7.796e-05 [accelerated_algorithm]: 7.23e-06 [shard]: 2.64999e-06 [meta_shard_fg_expand]: 1.66e-06 [shard_inline]: 6.43003e-06 [merge_send_recv]: 8.82999e-06 [auto_parallel]: 6.89001e-06 [parallel]: 1.843e-05 [flash_sp]: 1.012e-05 [merge_comm]: 4.22e-06 [allreduce_fusion]: 3.51001e-06 [matmul_add_comm_reduction]: 9.17001e-06 [allreduce_slice_to_reducescatter]: 7.09988e-07 [virtual_shard_identity]: 8.88002e-06 [virtual_dataset]: 6.26e-06 [get_grad_eliminate_]: 6.81001e-06 [virtual_output]: 6.55002e-06 [merge_forward]: 4.09002e-06 [cell_reuse_recompute_pass]: 1.14998e-06 [offload_activation]: 1.017e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.308e-05 [merge_recompute_call_nodes]: 1.59998e-06 [before_grad]: 1.186e-05 [set_forward_comm_id_for_comm_node_pass]: 3.87998e-06 [meta_fg_expand]: 2.61e-06 [flash_sp_send_recv_attached]: 2.80002e-06 [receive_attached]: 2.19001e-06 [after_resolve]: 1.1e-05 [a_after_grad]: 8.97e-06 [renormalize]: 0.00059402 [add_forward_monad_depend]: 5.35001e-06 [auto_monad_grad]: 1.98002e-06 [auto_monad_eliminator]: 1.695e-05 [cse]: 3.153e-05 [a_3]: 4.797e-05 [Cycle 2]: 0.00066554, [45] [expand_dump_flag]: 9.00007e-07 [switch_simplify]: 7.32002e-06 [loop_unroll]: 5.95002e-06 [a_1]: 0.00010605 [with_stream_mark]: 1.53e-05 [recompute_prepare]: 6.34999e-06 [updatestate_depend_eliminate]: 3.78001e-06 [updatestate_assign_eliminate]: 2.27001e-06 [updatestate_loads_eliminate]: 2.84999e-06 [parameter_eliminate]: 1.20999e-06 [a_2]: 6.977e-05 [accelerated_algorithm]: 6.19999e-06 [shard]: 2.46e-06 [meta_shard_fg_expand]: 1.73002e-06 [shard_inline]: 1.978e-05 [merge_send_recv]: 5.87001e-06 [auto_parallel]: 6.31e-06 [parallel]: 6.39999e-06 [flash_sp]: 3.63e-06 [merge_comm]: 3.61999e-06 [allreduce_fusion]: 3.03003e-06 [matmul_add_comm_reduction]: 6.44001e-06 [allreduce_slice_to_reducescatter]: 5.09986e-07 [virtual_shard_identity]: 7.02997e-06 [virtual_dataset]: 5.99e-06 [get_grad_eliminate_]: 5.77001e-06 [virtual_output]: 5.72999e-06 [merge_forward]: 3.73001e-06 [cell_reuse_recompute_pass]: 1.37999e-06 [offload_activation]: 7.11001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.416e-05 [merge_recompute_call_nodes]: 9.29984e-07 [before_grad]: 9.86e-06 [set_forward_comm_id_for_comm_node_pass]: 4.30999e-06 [meta_fg_expand]: 2.44001e-06 [flash_sp_send_recv_attached]: 1.30999e-06 [receive_attached]: 1.25001e-06 [after_resolve]: 9.31998e-06 [a_after_grad]: 8.59e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.26e-06 [auto_monad_grad]: 1.49e-06 [auto_monad_eliminator]: 9.41e-06 [cse]: 1.948e-05 [a_3]: 3.613e-05 [py_interpret_to_execute_after_opt_a]: 1.01e-05 [slice_cell_reuse_recomputed_activation]: 2.56e-06 [rewriter_after_opt_a]: 3.805e-05 [convert_after_rewriter]: 6.36e-06 [order_py_execute_after_rewriter]: 5.34998e-06 [mutable_eliminate]: 0.00052039 [opt_b]: 0.00025613, [1] [Cycle 1]: 0.00024817, [7] [b_1]: 0.00015788 [b_2]: 8.47e-06 [updatestate_depend_eliminate]: 7.35998e-06 [updatestate_assign_eliminate]: 2.63e-06 [updatestate_loads_eliminate]: 2.36e-06 [renormalize]: 7.60017e-07 [cse]: 2.7e-05 [optimize_parallel_all_gather_comm]: 1.702e-05 [overlap_param_gather]: 2.66e-06 [cconv]: 2.846e-05 [loop_unroll]: 0.00046445 [opt_after_cconv]: 0.0001072, [1] [Cycle 1]: 0.000101, [7] [c_1]: 2.754e-05 [parameter_eliminate]: 3.43999e-06 [updatestate_depend_eliminate]: 7.21001e-06 [updatestate_assign_eliminate]: 2.39001e-06 [updatestate_loads_eliminate]: 2.24001e-06 [cse]: 2.237e-05 [renormalize]: 3.7998e-07 [remove_dup_value]: 1.565e-05 [tuple_transform]: 7.161e-05, [1] [Cycle 1]: 6.679e-05, [4] [d_1]: 3.905e-05 [none_parameter_eliminate]: 1.80001e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 6.53e-06 [partial_unused_args_eliminate]: 2.28002e-06 [add_recomputation]: 5.082e-05 [cse_after_recomputation]: 2.401e-05, [1] [Cycle 1]: 1.905e-05, [1] [cse]: 1.314e-05 [environ_conv]: 5.73002e-06 [swap_dp_allreduce_reducescatter]: 5.07e-06 [bias_add_comm_swap]: 2.61999e-06 [label_micro_interleaved_index]: 5.24998e-06 [label_fine_grained_interleaved_index]: 2.79001e-06 [merge_cast_opt]: 1.34e-06 [slice_recompute_activation]: 1.94e-06 [micro_interleaved_order_control]: 2.24001e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 8.30012e-07 [remove_cast_before_assign_add]: 1.29e-06 [full_micro_interleaved_order_control]: 2.48002e-06 [reorder_send_recv_between_fp_bp]: 2.83e-06 [comm_op_add_attrs]: 1.14e-06 [add_comm_op_reuse_tag]: 1.14998e-06 [interleave_split_concat_branches]: 1.22999e-06 [interleave_parallel_branches]: 1.12e-06 [overlap_opt_shard_in_pipeline]: 1.10001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.84998e-06 [control_data_broadcast_order]: 1.419e-05 [grouped_pairwise_exchange_alltoall]: 1.59e-06 [offloading_packed_experts]: 4.29002e-06 [overlap_recompute_and_grad_model_parallel]: 4.85001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.21002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.54e-06 [overlap_recompute_comm]: 2.48998e-06 [overlap_grad_ring_attention]: 4.37e-06 [overlap_grad_flash_sp]: 2.098e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.07999e-06 [split_layernorm_comm]: 1.66e-06 [handle_group_info]: 1.15999e-06 [symbol_engine_optimizer]: 7.689e-05, [1] [Cycle 1]: 7.252e-05, [6] [build]: 2.92002e-06 [elim_shapecalc]: 1.029e-05 [elim_not_effective]: 1.192e-05 [opt_reshape]: 6.76e-06 [fold_const_symbol]: 9.61e-06 [renormalize]: 2.80008e-07 [detach_backward]: 2.16e-06 [pipeline_parallel_scheduler]: 1.57999e-06 [auto_monad_reorder]: 1.776e-05 [get_jit_bprop_graph]: 1.92999e-06 [rewriter_after_jit_bprop_graph]: 5.40999e-06 [opt_after_jit_grad]: 0.00049289 [validate]: 4.04e-05 Sums bootstrap : 0.000443s : 4.33% type_inference : 0.005592s : 54.71% event_method : 0.000013s : 0.13% auto_monad : 0.000059s : 0.58% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000030s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000019s : 0.19% optimize.rewriter_before_opt_a : 0.000047s : 0.46% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.32% optimize.opt_a.loop_unroll : 0.000020s : 0.20% optimize.opt_a.a_1 : 0.000398s : 3.90% optimize.opt_a.with_stream_mark : 0.000033s : 0.32% optimize.opt_a.recompute_prepare : 0.000014s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000148s : 1.45% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.13% optimize.opt_a.shard : 0.000005s : 0.05% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000026s : 0.26% optimize.opt_a.merge_send_recv : 0.000015s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000025s : 0.24% optimize.opt_a.flash_sp : 0.000014s : 0.13% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.16% optimize.opt_a.virtual_dataset : 0.000012s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000020s : 0.20% optimize.opt_a.a_after_grad : 0.000018s : 0.17% optimize.opt_a.renormalize : 0.000594s : 5.81% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.26% optimize.opt_a.cse : 0.000051s : 0.50% optimize.opt_a.a_3 : 0.000084s : 0.82% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.03% optimize.rewriter_after_opt_a : 0.000038s : 0.37% optimize.convert_after_rewriter : 0.000006s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000520s : 5.09% optimize.opt_b.b_1 : 0.000158s : 1.54% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000027s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.17% optimize.overlap_param_gather : 0.000003s : 0.03% optimize.cconv : 0.000028s : 0.28% optimize.loop_unroll : 0.000464s : 4.54% optimize.opt_after_cconv.c_1 : 0.000028s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000022s : 0.22% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.15% optimize.tuple_transform.d_1 : 0.000039s : 0.38% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000051s : 0.50% optimize.cse_after_recomputation.cse : 0.000013s : 0.13% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.02% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000493s : 4.82% validate : 0.000040s : 0.40% Time group info: ------[substitution.] 0.000145 20 1.29% : 0.000002s : 2: substitution.elim_not_effective 0.91% : 0.000001s : 2: substitution.fold_const_symbol 3.79% : 0.000006s : 3: substitution.graph_param_transform 63.00% : 0.000092s : 2: substitution.inline 3.05% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.01% : 0.000004s : 4: substitution.remove_not_recompute_node 2.35% : 0.000003s : 2: substitution.replace_old_param 22.60% : 0.000033s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005542 2 91.61% : 0.005077s : 1: type_inference.infer 8.39% : 0.000465s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000090 2 100.00% : 0.000090s : 2: match.inline ------[predicate.] 0.000134 754 0.81% : 0.000001s : 7: predicate.accumulaten_eliminater 1.32% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.64% : 0.000001s : 6: predicate.addn_check_dump 0.78% : 0.000001s : 7: predicate.addn_zero_filter 0.66% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.20% : 0.000003s : 13: predicate.arithmetic_simplify 0.73% : 0.000001s : 7: predicate.cast_eliminate 0.69% : 0.000001s : 6: predicate.check_bprop_eliminate 0.67% : 0.000001s : 6: predicate.compare_switch_simplify 0.23% : 0.000000s : 3: predicate.const_output_eliminate 0.70% : 0.000001s : 6: predicate.depend_value_elim 0.77% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.84% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.75% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.25% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.34% : 0.000000s : 3: predicate.elim_not_effective 0.52% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.02% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_depend_swap 1.87% : 0.000003s : 16: predicate.environ_get_eliminate 0.93% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.94% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.05% : 0.000003s : 9: predicate.float_depend_g_call 0.61% : 0.000001s : 6: predicate.float_environ_get_switch 0.96% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.24% : 0.000000s : 3: predicate.fold_const_symbol 0.84% : 0.000001s : 6: predicate.get_grad_eliminate 0.25% : 0.000000s : 3: predicate.graph_param_transform 0.75% : 0.000001s : 6: predicate.incorporate_call 0.70% : 0.000001s : 6: predicate.incorporate_call_switch 7.02% : 0.000009s : 34: predicate.inline 1.40% : 0.000002s : 6: predicate.inline_without_move 0.40% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.09% : 0.000001s : 6: predicate.less_batch_normalization 1.63% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.99% : 0.000003s : 20: predicate.load_eliminater 1.65% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.65% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.87% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.64% : 0.000001s : 6: predicate.merge_addn 0.82% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.72% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.65% : 0.000001s : 7: predicate.minmaximum_grad 1.72% : 0.000002s : 3: predicate.mutable_eliminate 0.47% : 0.000001s : 3: predicate.opt_reshape 0.49% : 0.000001s : 3: predicate.parallel_virtual_node 1.27% : 0.000002s : 9: predicate.partial_defer_inline 1.24% : 0.000002s : 10: predicate.partial_eliminate 0.79% : 0.000001s : 7: predicate.print_const_string_wrapper 0.77% : 0.000001s : 6: predicate.reduce_all_const_elim 0.99% : 0.000001s : 7: predicate.reduce_eliminate 2.08% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.76% : 0.000001s : 6: predicate.remove_not_recompute_node 1.15% : 0.000002s : 13: predicate.replace_applicator 0.74% : 0.000001s : 6: predicate.replace_old_param 0.32% : 0.000000s : 3: predicate.reset_defer_inline 0.87% : 0.000001s : 7: predicate.reshape_eliminate 0.79% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 3: predicate.row_tensor_eliminate 0.84% : 0.000001s : 6: predicate.same_eliminate 0.52% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.90% : 0.000001s : 6: predicate.shard_identity_eliminate 1.08% : 0.000001s : 6: predicate.special_op_eliminate 0.90% : 0.000001s : 6: predicate.specialize_transform 1.09% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.12% : 0.000001s : 9: predicate.switch_defer_inline 1.87% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.31% : 0.000006s : 32: predicate.switch_simplify 0.69% : 0.000001s : 7: predicate.tile_eliminate 0.79% : 0.000001s : 7: predicate.transpose_eliminate 1.52% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.31% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.22% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.43% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.48% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.43% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.91% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.78% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.77% : 0.000001s : 3: predicate.value_based_eliminate 0.77% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.82% : 0.000001s : 6: predicate.virtual_output_eliminate 0.32% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.62% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000230 5 8.24% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.76% : 0.000211s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023606 192 0.01% : 0.000004s : 1: ForceFp32Comm 13.72% : 0.003239s : 1: add_attr 13.67% : 0.003228s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000056s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000064s : 1: auto_monad 0.09% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 2.00% : 0.000473s : 1: bootstrap 0.14% : 0.000032s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000027s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.09% : 0.000020s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 2.01% : 0.000475s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.24% : 0.000530s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 3.31% : 0.000782s : 78: opt.transform.opt_a 0.11% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.56% : 0.000133s : 28: opt.transform.opt_b 0.18% : 0.000043s : 2: opt.transform.opt_trans_graph 0.15% : 0.000035s : 4: opt.transform.symbol_engine_opt 9.60% : 0.002266s : 1: opt_a 0.47% : 0.000111s : 1: opt_after_cconv 2.13% : 0.000503s : 1: opt_after_jit_grad 1.10% : 0.000260s : 1: opt_b 18.49% : 0.004365s : 1: optimize 0.09% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000006s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000006s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000035s : 1: pre_auto_parallel 0.10% : 0.000023s : 1: py_interpret_to_execute 0.06% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000019s : 1: remove_dup_value 1.45% : 0.000341s : 1: renormalize.infer 1.03% : 0.000244s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000042s : 1: rewriter_after_opt_a 0.22% : 0.000051s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000080s : 1: symbol_engine_optimizer 0.31% : 0.000074s : 1: tuple_transform 23.77% : 0.005612s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:58.723.134 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:58.723.403 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0152713, [21] [bootstrap]: 0.00043836 [type_inference]: 0.00508091 [event_method]: 1.325e-05 [auto_monad]: 5.759e-05 [graph_reusing]: 6.09001e-06 [inline]: 2.32999e-06 [add_attr]: 0.0032795, [1] [add_attr_with_inline]: 0.00327081, [1] [Cycle 1]: 7.875e-05, [2] [tag_attr]: 1.493e-05 [meta_addattr_fg_expand]: 3.97e-06 [parallel-infer-symbol]: 3.21001e-06 [pre_auto_parallel]: 2.849e-05 [insert-virtual-dataset]: 2.32999e-06 [parallel-infer-symbol-second]: 7.29982e-07 [dataset_repeat_opt]: 2.05002e-06 [pipeline_split]: 1.54e-06 [optimize]: 0.00515519, [53] [py_interpret_to_execute]: 2.248e-05 [rewriter_before_opt_a]: 5.176e-05 [opt_a]: 0.00269665, [2] [Cycle 1]: 0.00180312, [45] [expand_dump_flag]: 3.52002e-06 [switch_simplify]: 2.49e-05 [loop_unroll]: 1.371e-05 [a_1]: 0.00030658 [with_stream_mark]: 1.972e-05 [recompute_prepare]: 1.116e-05 [updatestate_depend_eliminate]: 5.17e-06 [updatestate_assign_eliminate]: 3.93999e-06 [updatestate_loads_eliminate]: 2.78e-06 [parameter_eliminate]: 2.07999e-06 [a_2]: 0.0001064 [accelerated_algorithm]: 6.74999e-06 [shard]: 2.71e-06 [meta_shard_fg_expand]: 2.16e-06 [shard_inline]: 6.39999e-06 [merge_send_recv]: 8.82e-06 [auto_parallel]: 7.33e-06 [parallel]: 2.054e-05 [flash_sp]: 1.007e-05 [merge_comm]: 4.61002e-06 [allreduce_fusion]: 3.26999e-06 [matmul_add_comm_reduction]: 9.39e-06 [allreduce_slice_to_reducescatter]: 7.29982e-07 [virtual_shard_identity]: 9.54999e-06 [virtual_dataset]: 6.36e-06 [get_grad_eliminate_]: 6.48003e-06 [virtual_output]: 6.16998e-06 [merge_forward]: 3.91001e-06 [cell_reuse_recompute_pass]: 1.15999e-06 [offload_activation]: 1.099e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.613e-05 [merge_recompute_call_nodes]: 2.06e-06 [before_grad]: 1.121e-05 [set_forward_comm_id_for_comm_node_pass]: 3.84002e-06 [meta_fg_expand]: 2.89999e-06 [flash_sp_send_recv_attached]: 2.94001e-06 [receive_attached]: 2.37999e-06 [after_resolve]: 1.093e-05 [a_after_grad]: 8.87e-06 [renormalize]: 0.00057433 [add_forward_monad_depend]: 6.02999e-06 [auto_monad_grad]: 2.37999e-06 [auto_monad_eliminator]: 1.761e-05 [cse]: 3.16e-05 [a_3]: 6.551e-05 [Cycle 2]: 0.00087955, [45] [expand_dump_flag]: 1.15001e-06 [switch_simplify]: 7.44002e-06 [loop_unroll]: 6.01998e-06 [a_1]: 0.00010825 [with_stream_mark]: 1.384e-05 [recompute_prepare]: 7.05e-06 [updatestate_depend_eliminate]: 3.2e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 3.14001e-06 [parameter_eliminate]: 1.67999e-06 [a_2]: 9.765e-05 [accelerated_algorithm]: 7.23999e-06 [shard]: 2.06e-06 [meta_shard_fg_expand]: 1.94e-06 [shard_inline]: 7.03998e-06 [merge_send_recv]: 7.33999e-06 [auto_parallel]: 6.33e-06 [parallel]: 6.69001e-06 [flash_sp]: 3.9e-06 [merge_comm]: 3.75e-06 [allreduce_fusion]: 3.36001e-06 [matmul_add_comm_reduction]: 6.54999e-06 [allreduce_slice_to_reducescatter]: 5.10016e-07 [virtual_shard_identity]: 7.95e-06 [virtual_dataset]: 6.07001e-06 [get_grad_eliminate_]: 3.248e-05 [virtual_output]: 6.05002e-06 [merge_forward]: 4.02e-06 [cell_reuse_recompute_pass]: 1.79998e-06 [offload_activation]: 7.71001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.609e-05 [merge_recompute_call_nodes]: 1.34998e-06 [before_grad]: 1.089e-05 [set_forward_comm_id_for_comm_node_pass]: 4.84e-06 [meta_fg_expand]: 2.12999e-06 [flash_sp_send_recv_attached]: 1.44e-06 [receive_attached]: 1.02e-06 [after_resolve]: 1.033e-05 [a_after_grad]: 8.17e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.69001e-06 [auto_monad_grad]: 1.36002e-06 [auto_monad_eliminator]: 1.026e-05 [cse]: 1.934e-05 [a_3]: 4.843e-05 [py_interpret_to_execute_after_opt_a]: 1.509e-05 [slice_cell_reuse_recomputed_activation]: 4.96002e-06 [rewriter_after_opt_a]: 4.244e-05 [convert_after_rewriter]: 1.054e-05 [order_py_execute_after_rewriter]: 8.78001e-06 [mutable_eliminate]: 0.00057715 [opt_b]: 0.00033159, [1] [Cycle 1]: 0.00032131, [7] [b_1]: 0.00021125 [b_2]: 8.05999e-06 [updatestate_depend_eliminate]: 6.87002e-06 [updatestate_assign_eliminate]: 2.61999e-06 [updatestate_loads_eliminate]: 2.40002e-06 [renormalize]: 5.69999e-07 [cse]: 2.531e-05 [optimize_parallel_all_gather_comm]: 2.039e-05 [overlap_param_gather]: 5.38002e-06 [cconv]: 3.3e-05 [loop_unroll]: 0.00047701 [opt_after_cconv]: 0.00013701, [1] [Cycle 1]: 0.00012794, [7] [c_1]: 2.866e-05 [parameter_eliminate]: 3.92998e-06 [updatestate_depend_eliminate]: 7.28e-06 [updatestate_assign_eliminate]: 2.85002e-06 [updatestate_loads_eliminate]: 2.85998e-06 [cse]: 2.422e-05 [renormalize]: 6.59988e-07 [remove_dup_value]: 1.828e-05 [tuple_transform]: 9.107e-05, [1] [Cycle 1]: 8.339e-05, [4] [d_1]: 4.16e-05 [none_parameter_eliminate]: 1.98002e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 6.94001e-06 [partial_unused_args_eliminate]: 5.02e-06 [add_recomputation]: 5.41e-05 [cse_after_recomputation]: 3.148e-05, [1] [Cycle 1]: 2.465e-05, [1] [cse]: 1.35e-05 [environ_conv]: 8.70001e-06 [swap_dp_allreduce_reducescatter]: 8.23001e-06 [bias_add_comm_swap]: 5.22999e-06 [label_micro_interleaved_index]: 6.95002e-06 [label_fine_grained_interleaved_index]: 5.47001e-06 [merge_cast_opt]: 3.71999e-06 [slice_recompute_activation]: 4.97e-06 [micro_interleaved_order_control]: 4.60999e-06 [assign_add_opt]: 3.55e-06 [ForceFp32Comm]: 3.03e-06 [remove_cast_before_assign_add]: 3.39001e-06 [full_micro_interleaved_order_control]: 4.40999e-06 [reorder_send_recv_between_fp_bp]: 5.57001e-06 [comm_op_add_attrs]: 3.8e-06 [add_comm_op_reuse_tag]: 3.63999e-06 [interleave_split_concat_branches]: 3.46999e-06 [interleave_parallel_branches]: 3.39001e-06 [overlap_opt_shard_in_pipeline]: 3.53999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.33999e-06 [control_data_broadcast_order]: 1.711e-05 [grouped_pairwise_exchange_alltoall]: 4.45e-06 [offloading_packed_experts]: 7.3e-06 [overlap_recompute_and_grad_model_parallel]: 8.12003e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.80998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.84002e-06 [overlap_recompute_comm]: 5.20999e-06 [overlap_grad_ring_attention]: 8.30999e-06 [overlap_grad_flash_sp]: 2.374e-05 [begin_end_overlap_inline]: 3.26999e-06 [split_matmul_comm_elemetwise]: 4.67e-06 [split_layernorm_comm]: 4.22998e-06 [handle_group_info]: 3.31999e-06 [symbol_engine_optimizer]: 0.00011019, [1] [Cycle 1]: 0.0001021, [6] [build]: 3.80998e-06 [elim_shapecalc]: 1.336e-05 [elim_not_effective]: 1.441e-05 [opt_reshape]: 7.2e-06 [fold_const_symbol]: 1.114e-05 [renormalize]: 2.89991e-07 [detach_backward]: 3.93001e-06 [pipeline_parallel_scheduler]: 1.82999e-06 [auto_monad_reorder]: 2.191e-05 [get_jit_bprop_graph]: 1.97999e-06 [rewriter_after_jit_bprop_graph]: 5.25999e-06 [opt_after_jit_grad]: 0.00051544 [validate]: 3.965e-05 Sums bootstrap : 0.000438s : 4.33% type_inference : 0.005081s : 50.20% event_method : 0.000013s : 0.13% auto_monad : 0.000058s : 0.57% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000028s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000022s : 0.22% optimize.rewriter_before_opt_a : 0.000052s : 0.51% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000032s : 0.32% optimize.opt_a.loop_unroll : 0.000020s : 0.19% optimize.opt_a.a_1 : 0.000415s : 4.10% optimize.opt_a.with_stream_mark : 0.000034s : 0.33% optimize.opt_a.recompute_prepare : 0.000018s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000204s : 2.02% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.14% optimize.opt_a.shard : 0.000005s : 0.05% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000016s : 0.16% optimize.opt_a.auto_parallel : 0.000014s : 0.13% optimize.opt_a.parallel : 0.000027s : 0.27% optimize.opt_a.flash_sp : 0.000014s : 0.14% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.17% optimize.opt_a.virtual_dataset : 0.000012s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000039s : 0.38% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000022s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000021s : 0.21% optimize.opt_a.a_after_grad : 0.000017s : 0.17% optimize.opt_a.renormalize : 0.000574s : 5.68% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.09% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.28% optimize.opt_a.cse : 0.000051s : 0.50% optimize.opt_a.a_3 : 0.000114s : 1.13% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000042s : 0.42% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000577s : 5.70% optimize.opt_b.b_1 : 0.000211s : 2.09% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000025s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000033s : 0.33% optimize.loop_unroll : 0.000477s : 4.71% optimize.opt_after_cconv.c_1 : 0.000029s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.24% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.18% optimize.tuple_transform.d_1 : 0.000042s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000054s : 0.53% optimize.cse_after_recomputation.cse : 0.000013s : 0.13% optimize.environ_conv : 0.000009s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.06% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000004s : 0.04% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000008s : 0.08% optimize.overlap_grad_flash_sp : 0.000024s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.13% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000022s : 0.22% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000515s : 5.09% validate : 0.000040s : 0.39% Time group info: ------[substitution.] 0.000150 20 1.26% : 0.000002s : 2: substitution.elim_not_effective 1.05% : 0.000002s : 2: substitution.fold_const_symbol 4.03% : 0.000006s : 3: substitution.graph_param_transform 62.16% : 0.000093s : 2: substitution.inline 3.16% : 0.000005s : 4: substitution.j_node_and_user_rematch 3.14% : 0.000005s : 4: substitution.remove_not_recompute_node 2.18% : 0.000003s : 2: substitution.replace_old_param 23.03% : 0.000035s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005031 2 91.40% : 0.004599s : 1: type_inference.infer 8.60% : 0.000433s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000091 2 100.00% : 0.000091s : 2: match.inline ------[predicate.] 0.000141 754 0.80% : 0.000001s : 7: predicate.accumulaten_eliminater 1.19% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.64% : 0.000001s : 6: predicate.addn_check_dump 0.81% : 0.000001s : 7: predicate.addn_zero_filter 0.65% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.20% : 0.000003s : 13: predicate.arithmetic_simplify 0.76% : 0.000001s : 7: predicate.cast_eliminate 0.87% : 0.000001s : 6: predicate.check_bprop_eliminate 0.60% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.69% : 0.000001s : 6: predicate.depend_value_elim 0.79% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.97% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.80% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.28% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 3: predicate.elim_not_effective 0.44% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.06% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.02% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.95% : 0.000001s : 10: predicate.environ_get_depend_swap 1.76% : 0.000002s : 16: predicate.environ_get_eliminate 1.24% : 0.000002s : 10: predicate.environ_get_set_eliminate 0.92% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.93% : 0.000003s : 9: predicate.float_depend_g_call 0.62% : 0.000001s : 6: predicate.float_environ_get_switch 0.90% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.29% : 0.000000s : 3: predicate.fold_const_symbol 0.80% : 0.000001s : 6: predicate.get_grad_eliminate 0.28% : 0.000000s : 3: predicate.graph_param_transform 0.81% : 0.000001s : 6: predicate.incorporate_call 0.64% : 0.000001s : 6: predicate.incorporate_call_switch 7.01% : 0.000010s : 34: predicate.inline 0.95% : 0.000001s : 6: predicate.inline_without_move 0.41% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.98% : 0.000001s : 6: predicate.less_batch_normalization 1.73% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.96% : 0.000003s : 20: predicate.load_eliminater 1.55% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.46% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.85% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.65% : 0.000001s : 6: predicate.merge_addn 0.71% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.76% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.63% : 0.000001s : 7: predicate.minmaximum_grad 1.81% : 0.000003s : 3: predicate.mutable_eliminate 0.66% : 0.000001s : 3: predicate.opt_reshape 0.40% : 0.000001s : 3: predicate.parallel_virtual_node 1.30% : 0.000002s : 9: predicate.partial_defer_inline 1.27% : 0.000002s : 10: predicate.partial_eliminate 0.78% : 0.000001s : 7: predicate.print_const_string_wrapper 0.74% : 0.000001s : 6: predicate.reduce_all_const_elim 1.55% : 0.000002s : 7: predicate.reduce_eliminate 2.02% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.70% : 0.000001s : 6: predicate.remove_not_recompute_node 1.19% : 0.000002s : 13: predicate.replace_applicator 0.79% : 0.000001s : 6: predicate.replace_old_param 0.33% : 0.000000s : 3: predicate.reset_defer_inline 0.83% : 0.000001s : 7: predicate.reshape_eliminate 0.79% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 3: predicate.row_tensor_eliminate 0.99% : 0.000001s : 6: predicate.same_eliminate 0.82% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.98% : 0.000001s : 6: predicate.shard_identity_eliminate 0.92% : 0.000001s : 6: predicate.special_op_eliminate 1.04% : 0.000001s : 6: predicate.specialize_transform 1.34% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.99% : 0.000001s : 9: predicate.switch_defer_inline 1.66% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.16% : 0.000006s : 32: predicate.switch_simplify 0.85% : 0.000001s : 7: predicate.tile_eliminate 0.87% : 0.000001s : 7: predicate.transpose_eliminate 1.46% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.45% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.01% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.32% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.60% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.36% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.95% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.74% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.80% : 0.000001s : 3: predicate.value_based_eliminate 0.97% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.67% : 0.000001s : 6: predicate.virtual_output_eliminate 0.31% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000233 5 8.32% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.68% : 0.000214s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025282 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.01% : 0.003290s : 1: add_attr 12.95% : 0.003274s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000059s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000066s : 1: auto_monad 0.12% : 0.000030s : 1: auto_monad_reorder 0.03% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.92% : 0.000485s : 1: bootstrap 0.14% : 0.000036s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.09% : 0.000021s : 1: control_data_broadcast_order 0.06% : 0.000014s : 1: convert_after_rewriter 0.14% : 0.000034s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.09% : 0.000024s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.09% : 0.000023s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.91% : 0.000484s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.31% : 0.000585s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.07% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 3.26% : 0.000823s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.10% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.55% : 0.000138s : 28: opt.transform.opt_b 0.18% : 0.000046s : 2: opt.transform.opt_trans_graph 0.16% : 0.000041s : 4: opt.transform.symbol_engine_opt 10.68% : 0.002700s : 1: opt_a 0.56% : 0.000141s : 1: opt_after_cconv 2.09% : 0.000527s : 1: opt_after_jit_grad 1.33% : 0.000336s : 1: opt_b 21.82% : 0.005517s : 1: optimize 0.10% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000028s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.05% : 0.000011s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.03% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000036s : 1: pre_auto_parallel 0.10% : 0.000026s : 1: py_interpret_to_execute 0.08% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000022s : 1: remove_dup_value 1.21% : 0.000306s : 1: renormalize.infer 1.03% : 0.000259s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.05% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000046s : 1: rewriter_after_opt_a 0.22% : 0.000055s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.45% : 0.000113s : 1: symbol_engine_optimizer 0.37% : 0.000094s : 1: tuple_transform 20.22% : 0.005113s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:58.923.030 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0130396, [21] [bootstrap]: 0.00043691 [type_inference]: 0.00483179 [event_method]: 1.167e-05 [auto_monad]: 5.331e-05 [graph_reusing]: 5.21998e-06 [inline]: 2.16e-06 [add_attr]: 0.00306011, [1] [add_attr_with_inline]: 0.00305183, [1] [Cycle 1]: 5.294e-05, [2] [tag_attr]: 1.374e-05 [meta_addattr_fg_expand]: 3.70998e-06 [parallel-infer-symbol]: 3.12997e-06 [pre_auto_parallel]: 2.284e-05 [insert-virtual-dataset]: 2.45002e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 1.89999e-06 [pipeline_split]: 1.51998e-06 [optimize]: 0.003949, [53] [py_interpret_to_execute]: 1.685e-05 [rewriter_before_opt_a]: 4.286e-05 [opt_a]: 0.00202758, [2] [Cycle 1]: 0.00140789, [45] [expand_dump_flag]: 3.15998e-06 [switch_simplify]: 2.536e-05 [loop_unroll]: 1.408e-05 [a_1]: 0.00027952 [with_stream_mark]: 1.526e-05 [recompute_prepare]: 7.74002e-06 [updatestate_depend_eliminate]: 3.64002e-06 [updatestate_assign_eliminate]: 3.24001e-06 [updatestate_loads_eliminate]: 3.18e-06 [parameter_eliminate]: 2.25002e-06 [a_2]: 7.755e-05 [accelerated_algorithm]: 6.78e-06 [shard]: 2.37999e-06 [meta_shard_fg_expand]: 1.77999e-06 [shard_inline]: 6.13998e-06 [merge_send_recv]: 8.05e-06 [auto_parallel]: 6.06998e-06 [parallel]: 1.851e-05 [flash_sp]: 7.92e-06 [merge_comm]: 3.45e-06 [allreduce_fusion]: 3.40998e-06 [matmul_add_comm_reduction]: 9.73002e-06 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 7.41999e-06 [virtual_dataset]: 5.97999e-06 [get_grad_eliminate_]: 5.74999e-06 [virtual_output]: 5.61e-06 [merge_forward]: 4.51002e-06 [cell_reuse_recompute_pass]: 1.38002e-06 [offload_activation]: 9.82999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.283e-05 [merge_recompute_call_nodes]: 1.49998e-06 [before_grad]: 9.84001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.55e-06 [meta_fg_expand]: 2.74001e-06 [flash_sp_send_recv_attached]: 2.47001e-06 [receive_attached]: 2.14999e-06 [after_resolve]: 9.18002e-06 [a_after_grad]: 8.38001e-06 [renormalize]: 0.00047483 [add_forward_monad_depend]: 4.97999e-06 [auto_monad_grad]: 2.37999e-06 [auto_monad_eliminator]: 1.46e-05 [cse]: 2.943e-05 [a_3]: 4.369e-05 [Cycle 2]: 0.00061065, [45] [expand_dump_flag]: 1.37999e-06 [switch_simplify]: 7.1e-06 [loop_unroll]: 6.28e-06 [a_1]: 0.00010637 [with_stream_mark]: 1.069e-05 [recompute_prepare]: 6.02999e-06 [updatestate_depend_eliminate]: 3.11001e-06 [updatestate_assign_eliminate]: 2.30002e-06 [updatestate_loads_eliminate]: 2.78e-06 [parameter_eliminate]: 1.03001e-06 [a_2]: 7.181e-05 [accelerated_algorithm]: 5.87999e-06 [shard]: 1.20001e-06 [meta_shard_fg_expand]: 1.37e-06 [shard_inline]: 6.07999e-06 [merge_send_recv]: 4.78001e-06 [auto_parallel]: 5.59e-06 [parallel]: 4.48999e-06 [flash_sp]: 3.61001e-06 [merge_comm]: 3.39001e-06 [allreduce_fusion]: 2.98e-06 [matmul_add_comm_reduction]: 5.67999e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 6.43e-06 [virtual_dataset]: 5.65001e-06 [get_grad_eliminate_]: 6.02001e-06 [virtual_output]: 5.65001e-06 [merge_forward]: 2.68998e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 9.11002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.236e-05 [merge_recompute_call_nodes]: 8.70001e-07 [before_grad]: 8.95001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.73001e-06 [meta_fg_expand]: 2.09e-06 [flash_sp_send_recv_attached]: 8.39995e-07 [receive_attached]: 1.04e-06 [after_resolve]: 8.37e-06 [a_after_grad]: 8.40001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.17e-06 [auto_monad_grad]: 8.30012e-07 [auto_monad_eliminator]: 6.64001e-06 [cse]: 1.386e-05 [a_3]: 3.473e-05 [py_interpret_to_execute_after_opt_a]: 8.17e-06 [slice_cell_reuse_recomputed_activation]: 1.89e-06 [rewriter_after_opt_a]: 3.405e-05 [convert_after_rewriter]: 6.36e-06 [order_py_execute_after_rewriter]: 5.54998e-06 [mutable_eliminate]: 0.00047938 [opt_b]: 0.00023596, [1] [Cycle 1]: 0.00023026, [7] [b_1]: 0.00015657 [b_2]: 8.08999e-06 [updatestate_depend_eliminate]: 5.40999e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.18002e-06 [renormalize]: 4.89992e-07 [cse]: 1.792e-05 [optimize_parallel_all_gather_comm]: 1.664e-05 [overlap_param_gather]: 2.21e-06 [cconv]: 2.321e-05 [loop_unroll]: 0.00041965 [opt_after_cconv]: 9.818e-05, [1] [Cycle 1]: 9.275e-05, [7] [c_1]: 2.68e-05 [parameter_eliminate]: 2.66999e-06 [updatestate_depend_eliminate]: 5.24e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 2.27001e-06 [cse]: 1.85e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 1.416e-05 [tuple_transform]: 7.081e-05, [1] [Cycle 1]: 6.623e-05, [4] [d_1]: 3.904e-05 [none_parameter_eliminate]: 1.59e-06 [renormalize]: 4.30009e-07 [switch_simplify]: 6.68e-06 [partial_unused_args_eliminate]: 1.70001e-06 [add_recomputation]: 4.503e-05 [cse_after_recomputation]: 2.088e-05, [1] [Cycle 1]: 1.63e-05, [1] [cse]: 1.083e-05 [environ_conv]: 5.32999e-06 [swap_dp_allreduce_reducescatter]: 5.00001e-06 [bias_add_comm_swap]: 2.36e-06 [label_micro_interleaved_index]: 4.22e-06 [label_fine_grained_interleaved_index]: 2.46e-06 [merge_cast_opt]: 1.48002e-06 [slice_recompute_activation]: 2.16e-06 [micro_interleaved_order_control]: 2.32001e-06 [assign_add_opt]: 1.21002e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 9.70002e-07 [full_micro_interleaved_order_control]: 2.40002e-06 [reorder_send_recv_between_fp_bp]: 2.67001e-06 [comm_op_add_attrs]: 1.02998e-06 [add_comm_op_reuse_tag]: 1.00999e-06 [interleave_split_concat_branches]: 1.24e-06 [interleave_parallel_branches]: 1.00001e-06 [overlap_opt_shard_in_pipeline]: 1.22e-06 [overlap_opt_shard_grad_in_pipeline]: 2.34999e-06 [control_data_broadcast_order]: 1.186e-05 [grouped_pairwise_exchange_alltoall]: 1.64998e-06 [offloading_packed_experts]: 3.88999e-06 [overlap_recompute_and_grad_model_parallel]: 4.60999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.66998e-06 [overlap_recompute_comm]: 2.26e-06 [overlap_grad_ring_attention]: 3.81999e-06 [overlap_grad_flash_sp]: 1.783e-05 [begin_end_overlap_inline]: 7.10017e-07 [split_matmul_comm_elemetwise]: 2.13002e-06 [split_layernorm_comm]: 1.55001e-06 [handle_group_info]: 8.99978e-07 [symbol_engine_optimizer]: 7.188e-05, [1] [Cycle 1]: 6.787e-05, [6] [build]: 2.43e-06 [elim_shapecalc]: 8.87e-06 [elim_not_effective]: 1.227e-05 [opt_reshape]: 6.74999e-06 [fold_const_symbol]: 9.87001e-06 [renormalize]: 2.80008e-07 [detach_backward]: 1.71e-06 [pipeline_parallel_scheduler]: 1.45999e-06 [auto_monad_reorder]: 1.538e-05 [get_jit_bprop_graph]: 1.47999e-06 [rewriter_after_jit_bprop_graph]: 3.44001e-06 [opt_after_jit_grad]: 0.00045595 [validate]: 3.507e-05 Sums bootstrap : 0.000437s : 4.83% type_inference : 0.004832s : 53.41% event_method : 0.000012s : 0.13% auto_monad : 0.000053s : 0.59% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000023s : 0.25% insert-virtual-dataset : 0.000002s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000017s : 0.19% optimize.rewriter_before_opt_a : 0.000043s : 0.47% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000032s : 0.36% optimize.opt_a.loop_unroll : 0.000020s : 0.23% optimize.opt_a.a_1 : 0.000386s : 4.27% optimize.opt_a.with_stream_mark : 0.000026s : 0.29% optimize.opt_a.recompute_prepare : 0.000014s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.04% optimize.opt_a.a_2 : 0.000149s : 1.65% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.14% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.14% optimize.opt_a.merge_send_recv : 0.000013s : 0.14% optimize.opt_a.auto_parallel : 0.000012s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.25% optimize.opt_a.flash_sp : 0.000012s : 0.13% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000006s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.15% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.13% optimize.opt_a.virtual_output : 0.000011s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.21% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.03% optimize.opt_a.before_grad : 0.000019s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.04% optimize.opt_a.after_resolve : 0.000018s : 0.19% optimize.opt_a.a_after_grad : 0.000017s : 0.19% optimize.opt_a.renormalize : 0.000475s : 5.25% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.23% optimize.opt_a.cse : 0.000043s : 0.48% optimize.opt_a.a_3 : 0.000078s : 0.87% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000034s : 0.38% optimize.convert_after_rewriter : 0.000006s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000479s : 5.30% optimize.opt_b.b_1 : 0.000157s : 1.73% optimize.opt_b.b_2 : 0.000008s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.01% optimize.opt_b.cse : 0.000018s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000023s : 0.26% optimize.loop_unroll : 0.000420s : 4.64% optimize.opt_after_cconv.c_1 : 0.000027s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.01% optimize.remove_dup_value : 0.000014s : 0.16% optimize.tuple_transform.d_1 : 0.000039s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000045s : 0.50% optimize.cse_after_recomputation.cse : 0.000011s : 0.12% optimize.environ_conv : 0.000005s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.06% optimize.bias_add_comm_swap : 0.000002s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.02% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.03% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.03% optimize.control_data_broadcast_order : 0.000012s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.02% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000018s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000002s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.02% auto_monad_reorder : 0.000015s : 0.17% get_jit_bprop_graph : 0.000001s : 0.02% rewriter_after_jit_bprop_graph : 0.000003s : 0.04% opt_after_jit_grad : 0.000456s : 5.04% validate : 0.000035s : 0.39% Time group info: ------[substitution.] 0.000134 20 1.34% : 0.000002s : 2: substitution.elim_not_effective 1.17% : 0.000002s : 2: substitution.fold_const_symbol 4.50% : 0.000006s : 3: substitution.graph_param_transform 62.01% : 0.000083s : 2: substitution.inline 2.20% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.79% : 0.000005s : 4: substitution.remove_not_recompute_node 1.89% : 0.000003s : 2: substitution.replace_old_param 23.10% : 0.000031s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004786 2 91.39% : 0.004374s : 1: type_inference.infer 8.61% : 0.000412s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000081 2 100.00% : 0.000081s : 2: match.inline ------[predicate.] 0.000132 754 0.77% : 0.000001s : 7: predicate.accumulaten_eliminater 0.96% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.71% : 0.000001s : 6: predicate.addn_check_dump 0.84% : 0.000001s : 7: predicate.addn_zero_filter 0.69% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.24% : 0.000003s : 13: predicate.arithmetic_simplify 0.90% : 0.000001s : 7: predicate.cast_eliminate 0.63% : 0.000001s : 6: predicate.check_bprop_eliminate 0.71% : 0.000001s : 6: predicate.compare_switch_simplify 0.24% : 0.000000s : 3: predicate.const_output_eliminate 0.77% : 0.000001s : 6: predicate.depend_value_elim 0.83% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.94% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.78% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.28% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.27% : 0.000000s : 3: predicate.elim_not_effective 0.53% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.96% : 0.000001s : 10: predicate.environ_get_depend_swap 1.81% : 0.000002s : 16: predicate.environ_get_eliminate 1.02% : 0.000001s : 10: predicate.environ_get_set_eliminate 1.03% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.10% : 0.000003s : 9: predicate.float_depend_g_call 0.65% : 0.000001s : 6: predicate.float_environ_get_switch 0.93% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 3: predicate.fold_const_symbol 0.79% : 0.000001s : 6: predicate.get_grad_eliminate 0.25% : 0.000000s : 3: predicate.graph_param_transform 0.78% : 0.000001s : 6: predicate.incorporate_call 0.65% : 0.000001s : 6: predicate.incorporate_call_switch 6.32% : 0.000008s : 34: predicate.inline 1.16% : 0.000002s : 6: predicate.inline_without_move 0.46% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.20% : 0.000002s : 6: predicate.less_batch_normalization 1.66% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.01% : 0.000003s : 20: predicate.load_eliminater 1.25% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.77% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.70% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.67% : 0.000001s : 6: predicate.merge_addn 0.68% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.68% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.70% : 0.000001s : 7: predicate.minmaximum_grad 1.63% : 0.000002s : 3: predicate.mutable_eliminate 0.47% : 0.000001s : 3: predicate.opt_reshape 0.77% : 0.000001s : 3: predicate.parallel_virtual_node 1.26% : 0.000002s : 9: predicate.partial_defer_inline 1.29% : 0.000002s : 10: predicate.partial_eliminate 0.81% : 0.000001s : 7: predicate.print_const_string_wrapper 0.76% : 0.000001s : 6: predicate.reduce_all_const_elim 1.06% : 0.000001s : 7: predicate.reduce_eliminate 2.13% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.66% : 0.000001s : 6: predicate.remove_not_recompute_node 1.24% : 0.000002s : 13: predicate.replace_applicator 0.69% : 0.000001s : 6: predicate.replace_old_param 0.36% : 0.000000s : 3: predicate.reset_defer_inline 0.86% : 0.000001s : 7: predicate.reshape_eliminate 0.71% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 3: predicate.row_tensor_eliminate 1.02% : 0.000001s : 6: predicate.same_eliminate 0.52% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.97% : 0.000001s : 6: predicate.shard_identity_eliminate 0.77% : 0.000001s : 6: predicate.special_op_eliminate 0.90% : 0.000001s : 6: predicate.specialize_transform 1.06% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.83% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.49% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.05% : 0.000001s : 9: predicate.switch_defer_inline 1.66% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.47% : 0.000006s : 32: predicate.switch_simplify 0.77% : 0.000001s : 7: predicate.tile_eliminate 0.82% : 0.000001s : 7: predicate.transpose_eliminate 1.59% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.88% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.33% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.55% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.51% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.63% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.07% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.79% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.62% : 0.000001s : 3: predicate.value_based_eliminate 0.82% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.74% : 0.000001s : 6: predicate.virtual_output_eliminate 0.36% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.56% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000205 5 7.78% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.22% : 0.000189s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.021436 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.30% : 0.003064s : 1: add_attr 14.25% : 0.003055s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000049s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000058s : 1: auto_monad 0.09% : 0.000019s : 1: auto_monad_reorder 0.02% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.18% : 0.000466s : 1: bootstrap 0.12% : 0.000027s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000015s : 1: control_data_broadcast_order 0.04% : 0.000009s : 1: convert_after_rewriter 0.11% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000008s : 1: environ_conv 0.08% : 0.000018s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000005s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.99% : 0.000427s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.27% : 0.000487s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000015s : 1: opt.transform.mutable_eliminate 3.47% : 0.000743s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.62% : 0.000132s : 28: opt.transform.opt_b 0.20% : 0.000044s : 2: opt.transform.opt_trans_graph 0.16% : 0.000034s : 4: opt.transform.symbol_engine_opt 9.47% : 0.002031s : 1: opt_a 0.47% : 0.000101s : 1: opt_after_cconv 2.17% : 0.000465s : 1: opt_after_jit_grad 1.12% : 0.000239s : 1: opt_b 18.44% : 0.003953s : 1: optimize 0.09% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000021s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.12% : 0.000027s : 1: pre_auto_parallel 0.10% : 0.000021s : 1: py_interpret_to_execute 0.05% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000018s : 1: remove_dup_value 1.27% : 0.000272s : 1: renormalize.infer 0.91% : 0.000196s : 1: renormalize.specialize 0.03% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000038s : 1: rewriter_after_opt_a 0.22% : 0.000047s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000075s : 1: symbol_engine_optimizer 0.34% : 0.000074s : 1: tuple_transform 22.61% : 0.004847s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:59.109.639 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:59.109.893 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0153791, [21] [bootstrap]: 0.00043998 [type_inference]: 0.00556032 [event_method]: 1.299e-05 [auto_monad]: 5.864e-05 [graph_reusing]: 5.99e-06 [inline]: 2.21998e-06 [add_attr]: 0.00326453, [1] [add_attr_with_inline]: 0.00325561, [1] [Cycle 1]: 7.256e-05, [2] [tag_attr]: 1.397e-05 [meta_addattr_fg_expand]: 3.53e-06 [parallel-infer-symbol]: 3.01999e-06 [pre_auto_parallel]: 2.709e-05 [insert-virtual-dataset]: 2.40002e-06 [parallel-infer-symbol-second]: 9.89996e-07 [dataset_repeat_opt]: 1.92001e-06 [pipeline_split]: 1.92999e-06 [optimize]: 0.00485778, [53] [py_interpret_to_execute]: 2.206e-05 [rewriter_before_opt_a]: 5.289e-05 [opt_a]: 0.00255734, [2] [Cycle 1]: 0.00174334, [45] [expand_dump_flag]: 3.03e-06 [switch_simplify]: 2.459e-05 [loop_unroll]: 1.391e-05 [a_1]: 0.00029761 [with_stream_mark]: 1.731e-05 [recompute_prepare]: 8.07e-06 [updatestate_depend_eliminate]: 3.82998e-06 [updatestate_assign_eliminate]: 3.95e-06 [updatestate_loads_eliminate]: 3.04999e-06 [parameter_eliminate]: 1.99e-06 [a_2]: 0.00010872 [accelerated_algorithm]: 6.47001e-06 [shard]: 2.44001e-06 [meta_shard_fg_expand]: 1.86e-06 [shard_inline]: 6.22001e-06 [merge_send_recv]: 9.76e-06 [auto_parallel]: 6.07999e-06 [parallel]: 1.888e-05 [flash_sp]: 9.19e-06 [merge_comm]: 4.47e-06 [allreduce_fusion]: 3.89002e-06 [matmul_add_comm_reduction]: 1.054e-05 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 8.57998e-06 [virtual_dataset]: 6.17001e-06 [get_grad_eliminate_]: 5.96e-06 [virtual_output]: 6.11998e-06 [merge_forward]: 3.95e-06 [cell_reuse_recompute_pass]: 1.18001e-06 [offload_activation]: 1.021e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.501e-05 [merge_recompute_call_nodes]: 1.79e-06 [before_grad]: 1.031e-05 [set_forward_comm_id_for_comm_node_pass]: 3.66999e-06 [meta_fg_expand]: 2.94999e-06 [flash_sp_send_recv_attached]: 2.49001e-06 [receive_attached]: 2.69001e-06 [after_resolve]: 1.068e-05 [a_after_grad]: 9.00001e-06 [renormalize]: 0.00057317 [add_forward_monad_depend]: 5.84e-06 [auto_monad_grad]: 2.87002e-06 [auto_monad_eliminator]: 1.537e-05 [cse]: 3.175e-05 [a_3]: 6.126e-05 [Cycle 2]: 0.00080075, [45] [expand_dump_flag]: 1.29e-06 [switch_simplify]: 7.25e-06 [loop_unroll]: 5.97001e-06 [a_1]: 0.00010679 [with_stream_mark]: 1.218e-05 [recompute_prepare]: 6.83e-06 [updatestate_depend_eliminate]: 3.13998e-06 [updatestate_assign_eliminate]: 2.61e-06 [updatestate_loads_eliminate]: 2.71e-06 [parameter_eliminate]: 1.62999e-06 [a_2]: 9.805e-05 [accelerated_algorithm]: 5.89e-06 [shard]: 1.22e-06 [meta_shard_fg_expand]: 1.64998e-06 [shard_inline]: 6.21e-06 [merge_send_recv]: 6.01e-06 [auto_parallel]: 5.52999e-06 [parallel]: 5.83002e-06 [flash_sp]: 3.69002e-06 [merge_comm]: 3.47002e-06 [allreduce_fusion]: 3.06001e-06 [matmul_add_comm_reduction]: 7.41001e-06 [allreduce_slice_to_reducescatter]: 3.49974e-07 [virtual_shard_identity]: 6.30997e-06 [virtual_dataset]: 5.70001e-06 [get_grad_eliminate_]: 5.47999e-06 [virtual_output]: 5.52001e-06 [merge_forward]: 2.71999e-06 [cell_reuse_recompute_pass]: 1.80001e-06 [offload_activation]: 7.41001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.61e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 9.46003e-06 [set_forward_comm_id_for_comm_node_pass]: 4.12e-06 [meta_fg_expand]: 2.02999e-06 [flash_sp_send_recv_attached]: 1.00001e-06 [receive_attached]: 1.22999e-06 [after_resolve]: 8.81997e-06 [a_after_grad]: 8.67998e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.01e-06 [auto_monad_grad]: 1.75001e-06 [auto_monad_eliminator]: 8.58001e-06 [cse]: 1.659e-05 [a_3]: 4.848e-05 [py_interpret_to_execute_after_opt_a]: 1.286e-05 [slice_cell_reuse_recomputed_activation]: 5.35999e-06 [rewriter_after_opt_a]: 4.179e-05 [convert_after_rewriter]: 9.44e-06 [order_py_execute_after_rewriter]: 8.2e-06 [mutable_eliminate]: 0.00053135 [opt_b]: 0.00031466, [1] [Cycle 1]: 0.00030505, [7] [b_1]: 0.00020471 [b_2]: 7.7e-06 [updatestate_depend_eliminate]: 6.89001e-06 [updatestate_assign_eliminate]: 2.58998e-06 [updatestate_loads_eliminate]: 2.84999e-06 [renormalize]: 1.09998e-06 [cse]: 1.975e-05 [optimize_parallel_all_gather_comm]: 2.086e-05 [overlap_param_gather]: 4.73001e-06 [cconv]: 3.051e-05 [loop_unroll]: 0.00045072 [opt_after_cconv]: 0.00012786, [1] [Cycle 1]: 0.00011879, [7] [c_1]: 2.811e-05 [parameter_eliminate]: 3.61999e-06 [updatestate_depend_eliminate]: 5.89999e-06 [updatestate_assign_eliminate]: 2.84001e-06 [updatestate_loads_eliminate]: 2.44001e-06 [cse]: 1.901e-05 [renormalize]: 3.29979e-07 [remove_dup_value]: 1.733e-05 [tuple_transform]: 8.569e-05, [1] [Cycle 1]: 7.834e-05, [4] [d_1]: 3.894e-05 [none_parameter_eliminate]: 1.57999e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 6.89999e-06 [partial_unused_args_eliminate]: 4.3e-06 [add_recomputation]: 4.669e-05 [cse_after_recomputation]: 2.717e-05, [1] [Cycle 1]: 2.022e-05, [1] [cse]: 1.17e-05 [environ_conv]: 8.10999e-06 [swap_dp_allreduce_reducescatter]: 7.80998e-06 [bias_add_comm_swap]: 5.17999e-06 [label_micro_interleaved_index]: 6.73e-06 [label_fine_grained_interleaved_index]: 5.51e-06 [merge_cast_opt]: 3.73999e-06 [slice_recompute_activation]: 4.25999e-06 [micro_interleaved_order_control]: 4.70999e-06 [assign_add_opt]: 3.84002e-06 [ForceFp32Comm]: 3.09999e-06 [remove_cast_before_assign_add]: 3.4e-06 [full_micro_interleaved_order_control]: 4.38001e-06 [reorder_send_recv_between_fp_bp]: 5.46998e-06 [comm_op_add_attrs]: 3.43e-06 [add_comm_op_reuse_tag]: 3.46001e-06 [interleave_split_concat_branches]: 3.89002e-06 [interleave_parallel_branches]: 3.44001e-06 [overlap_opt_shard_in_pipeline]: 4.05e-06 [overlap_opt_shard_grad_in_pipeline]: 4.83001e-06 [control_data_broadcast_order]: 1.586e-05 [grouped_pairwise_exchange_alltoall]: 4.07e-06 [offloading_packed_experts]: 6.46e-06 [overlap_recompute_and_grad_model_parallel]: 7.11999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.64002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.80998e-06 [overlap_recompute_comm]: 5.22999e-06 [overlap_grad_ring_attention]: 6.45002e-06 [overlap_grad_flash_sp]: 2.198e-05 [begin_end_overlap_inline]: 2.99999e-06 [split_matmul_comm_elemetwise]: 4.70999e-06 [split_layernorm_comm]: 4.35e-06 [handle_group_info]: 3.3e-06 [symbol_engine_optimizer]: 9.585e-05, [1] [Cycle 1]: 8.861e-05, [6] [build]: 2.71e-06 [elim_shapecalc]: 9.89001e-06 [elim_not_effective]: 1.329e-05 [opt_reshape]: 6.64001e-06 [fold_const_symbol]: 1.032e-05 [renormalize]: 2.3999e-07 [detach_backward]: 3.46001e-06 [pipeline_parallel_scheduler]: 1.86998e-06 [auto_monad_reorder]: 1.969e-05 [get_jit_bprop_graph]: 1.77001e-06 [rewriter_after_jit_bprop_graph]: 4.32e-06 [opt_after_jit_grad]: 0.00052208 [validate]: 3.549e-05 Sums bootstrap : 0.000440s : 4.23% type_inference : 0.005560s : 53.50% event_method : 0.000013s : 0.12% auto_monad : 0.000059s : 0.56% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.13% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.03% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000027s : 0.26% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000022s : 0.21% optimize.rewriter_before_opt_a : 0.000053s : 0.51% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000032s : 0.31% optimize.opt_a.loop_unroll : 0.000020s : 0.19% optimize.opt_a.a_1 : 0.000404s : 3.89% optimize.opt_a.with_stream_mark : 0.000029s : 0.28% optimize.opt_a.recompute_prepare : 0.000015s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000207s : 1.99% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.12% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.12% optimize.opt_a.merge_send_recv : 0.000016s : 0.15% optimize.opt_a.auto_parallel : 0.000012s : 0.11% optimize.opt_a.parallel : 0.000025s : 0.24% optimize.opt_a.flash_sp : 0.000013s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.14% optimize.opt_a.virtual_dataset : 0.000012s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.11% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000019s : 0.19% optimize.opt_a.a_after_grad : 0.000018s : 0.17% optimize.opt_a.renormalize : 0.000573s : 5.52% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.23% optimize.opt_a.cse : 0.000048s : 0.47% optimize.opt_a.a_3 : 0.000110s : 1.06% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000042s : 0.40% optimize.convert_after_rewriter : 0.000009s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000531s : 5.11% optimize.opt_b.b_1 : 0.000205s : 1.97% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000020s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000031s : 0.29% optimize.loop_unroll : 0.000451s : 4.34% optimize.opt_after_cconv.c_1 : 0.000028s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.17% optimize.tuple_transform.d_1 : 0.000039s : 0.37% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000047s : 0.45% optimize.cse_after_recomputation.cse : 0.000012s : 0.11% optimize.environ_conv : 0.000008s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.05% optimize.control_data_broadcast_order : 0.000016s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000006s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000006s : 0.06% optimize.overlap_grad_flash_sp : 0.000022s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000522s : 5.02% validate : 0.000035s : 0.34% Time group info: ------[substitution.] 0.000149 20 1.50% : 0.000002s : 2: substitution.elim_not_effective 1.18% : 0.000002s : 2: substitution.fold_const_symbol 3.53% : 0.000005s : 3: substitution.graph_param_transform 62.80% : 0.000094s : 2: substitution.inline 2.48% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.07% : 0.000005s : 4: substitution.remove_not_recompute_node 2.55% : 0.000004s : 2: substitution.replace_old_param 22.89% : 0.000034s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005506 2 91.55% : 0.005041s : 1: type_inference.infer 8.45% : 0.000465s : 1: type_inference.specialize ------[replace.] 0.000024 2 100.00% : 0.000024s : 2: replace.inline ------[match.] 0.000092 2 100.00% : 0.000092s : 2: match.inline ------[predicate.] 0.000135 754 0.93% : 0.000001s : 7: predicate.accumulaten_eliminater 1.07% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.66% : 0.000001s : 6: predicate.addn_check_dump 0.73% : 0.000001s : 7: predicate.addn_zero_filter 0.67% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.26% : 0.000003s : 13: predicate.arithmetic_simplify 0.82% : 0.000001s : 7: predicate.cast_eliminate 0.76% : 0.000001s : 6: predicate.check_bprop_eliminate 0.67% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.78% : 0.000001s : 6: predicate.depend_value_elim 0.74% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.89% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.70% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.41% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 3: predicate.elim_not_effective 0.53% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000002s : 10: predicate.environ_add_const_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.00% : 0.000001s : 10: predicate.environ_get_depend_swap 1.78% : 0.000002s : 16: predicate.environ_get_eliminate 1.00% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.96% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.02% : 0.000003s : 9: predicate.float_depend_g_call 0.68% : 0.000001s : 6: predicate.float_environ_get_switch 0.99% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 3: predicate.fold_const_symbol 0.90% : 0.000001s : 6: predicate.get_grad_eliminate 0.27% : 0.000000s : 3: predicate.graph_param_transform 0.80% : 0.000001s : 6: predicate.incorporate_call 0.64% : 0.000001s : 6: predicate.incorporate_call_switch 6.42% : 0.000009s : 34: predicate.inline 1.04% : 0.000001s : 6: predicate.inline_without_move 0.40% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.03% : 0.000001s : 6: predicate.less_batch_normalization 1.57% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.11% : 0.000003s : 20: predicate.load_eliminater 1.17% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.56% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.74% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.70% : 0.000001s : 6: predicate.merge_addn 0.73% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.69% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.70% : 0.000001s : 7: predicate.minmaximum_grad 1.54% : 0.000002s : 3: predicate.mutable_eliminate 0.47% : 0.000001s : 3: predicate.opt_reshape 0.47% : 0.000001s : 3: predicate.parallel_virtual_node 1.24% : 0.000002s : 9: predicate.partial_defer_inline 1.22% : 0.000002s : 10: predicate.partial_eliminate 0.76% : 0.000001s : 7: predicate.print_const_string_wrapper 0.74% : 0.000001s : 6: predicate.reduce_all_const_elim 1.01% : 0.000001s : 7: predicate.reduce_eliminate 2.09% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.77% : 0.000001s : 6: predicate.remove_not_recompute_node 1.30% : 0.000002s : 13: predicate.replace_applicator 0.66% : 0.000001s : 6: predicate.replace_old_param 0.49% : 0.000001s : 3: predicate.reset_defer_inline 0.96% : 0.000001s : 7: predicate.reshape_eliminate 0.77% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 3: predicate.row_tensor_eliminate 1.06% : 0.000001s : 6: predicate.same_eliminate 0.65% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.08% : 0.000001s : 6: predicate.shard_identity_eliminate 0.93% : 0.000001s : 6: predicate.special_op_eliminate 0.99% : 0.000001s : 6: predicate.specialize_transform 1.38% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.92% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.41% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.00% : 0.000001s : 9: predicate.switch_defer_inline 1.64% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.87% : 0.000007s : 32: predicate.switch_simplify 0.81% : 0.000001s : 7: predicate.tile_eliminate 0.83% : 0.000001s : 7: predicate.transpose_eliminate 1.51% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.31% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.04% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.36% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.53% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.57% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.97% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.99% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.57% : 0.000001s : 3: predicate.value_based_eliminate 0.90% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.82% : 0.000001s : 6: predicate.virtual_output_eliminate 0.31% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000233 5 7.95% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.05% : 0.000214s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025016 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.08% : 0.003273s : 1: add_attr 13.03% : 0.003259s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.20% : 0.000050s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.27% : 0.000067s : 1: auto_monad 0.11% : 0.000026s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.93% : 0.000484s : 1: bootstrap 0.14% : 0.000034s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.12% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000019s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.10% : 0.000024s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000013s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000009s : 1: label_micro_interleaved_index 1.83% : 0.000458s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.15% : 0.000538s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.05% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 3.09% : 0.000774s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.10% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.55% : 0.000138s : 28: opt.transform.opt_b 0.17% : 0.000044s : 2: opt.transform.opt_trans_graph 0.15% : 0.000036s : 4: opt.transform.symbol_engine_opt 10.24% : 0.002561s : 1: opt_a 0.52% : 0.000131s : 1: opt_after_cconv 2.13% : 0.000533s : 1: opt_after_jit_grad 1.27% : 0.000318s : 1: opt_b 20.67% : 0.005172s : 1: optimize 0.10% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000034s : 1: pre_auto_parallel 0.10% : 0.000026s : 1: py_interpret_to_execute 0.07% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000020s : 1: remove_dup_value 1.22% : 0.000306s : 1: renormalize.infer 1.04% : 0.000260s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000046s : 1: rewriter_after_opt_a 0.23% : 0.000057s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000099s : 1: symbol_engine_optimizer 0.35% : 0.000089s : 1: tuple_transform 22.35% : 0.005591s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:59.298.388 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.013645, [21] [bootstrap]: 0.00043644 [type_inference]: 0.00530113 [event_method]: 1.249e-05 [auto_monad]: 5.449e-05 [graph_reusing]: 5.37001e-06 [inline]: 2.06998e-06 [add_attr]: 0.00310759, [1] [add_attr_with_inline]: 0.00309997, [1] [Cycle 1]: 5.079e-05, [2] [tag_attr]: 1.381e-05 [meta_addattr_fg_expand]: 3.86001e-06 [parallel-infer-symbol]: 3.14999e-06 [pre_auto_parallel]: 2.303e-05 [insert-virtual-dataset]: 2.67001e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 1.87999e-06 [pipeline_split]: 1.56998e-06 [optimize]: 0.00403198, [53] [py_interpret_to_execute]: 1.705e-05 [rewriter_before_opt_a]: 4.317e-05 [opt_a]: 0.00211268, [2] [Cycle 1]: 0.00149544, [45] [expand_dump_flag]: 2.98e-06 [switch_simplify]: 2.58e-05 [loop_unroll]: 1.493e-05 [a_1]: 0.00027678 [with_stream_mark]: 1.442e-05 [recompute_prepare]: 8.77e-06 [updatestate_depend_eliminate]: 3.61999e-06 [updatestate_assign_eliminate]: 3.64002e-06 [updatestate_loads_eliminate]: 2.88998e-06 [parameter_eliminate]: 1.87001e-06 [a_2]: 8.254e-05 [accelerated_algorithm]: 6.71e-06 [shard]: 2.14e-06 [meta_shard_fg_expand]: 1.74e-06 [shard_inline]: 6.21e-06 [merge_send_recv]: 7.82e-06 [auto_parallel]: 7.09001e-06 [parallel]: 1.805e-05 [flash_sp]: 7.58001e-06 [merge_comm]: 3.84002e-06 [allreduce_fusion]: 3.89002e-06 [matmul_add_comm_reduction]: 9.99999e-06 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 7.43999e-06 [virtual_dataset]: 6.89999e-06 [get_grad_eliminate_]: 5.71e-06 [virtual_output]: 6.37001e-06 [merge_forward]: 3.9e-06 [cell_reuse_recompute_pass]: 1.35999e-06 [offload_activation]: 1.052e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.307e-05 [merge_recompute_call_nodes]: 1.86998e-06 [before_grad]: 1.06e-05 [set_forward_comm_id_for_comm_node_pass]: 3.91999e-06 [meta_fg_expand]: 2.82002e-06 [flash_sp_send_recv_attached]: 2.48002e-06 [receive_attached]: 2.26e-06 [after_resolve]: 1.047e-05 [a_after_grad]: 9.84001e-06 [renormalize]: 0.00055193 [add_forward_monad_depend]: 4.98001e-06 [auto_monad_grad]: 1.60999e-06 [auto_monad_eliminator]: 1.407e-05 [cse]: 3.003e-05 [a_3]: 4.555e-05 [Cycle 2]: 0.00060778, [45] [expand_dump_flag]: 8.70001e-07 [switch_simplify]: 7.25e-06 [loop_unroll]: 5.77999e-06 [a_1]: 0.00010707 [with_stream_mark]: 1.341e-05 [recompute_prepare]: 6.28e-06 [updatestate_depend_eliminate]: 2.89999e-06 [updatestate_assign_eliminate]: 2.29001e-06 [updatestate_loads_eliminate]: 2.98998e-06 [parameter_eliminate]: 1.07e-06 [a_2]: 6.994e-05 [accelerated_algorithm]: 5.80002e-06 [shard]: 1.10001e-06 [meta_shard_fg_expand]: 1.32e-06 [shard_inline]: 6.05002e-06 [merge_send_recv]: 5.07e-06 [auto_parallel]: 5.44998e-06 [parallel]: 4.74e-06 [flash_sp]: 3.31001e-06 [merge_comm]: 3.58999e-06 [allreduce_fusion]: 3.01001e-06 [matmul_add_comm_reduction]: 5.54e-06 [allreduce_slice_to_reducescatter]: 3.30008e-07 [virtual_shard_identity]: 6.36e-06 [virtual_dataset]: 5.66e-06 [get_grad_eliminate_]: 5.59e-06 [virtual_output]: 5.22e-06 [merge_forward]: 2.59001e-06 [cell_reuse_recompute_pass]: 1.36002e-06 [offload_activation]: 6.56e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.19e-05 [merge_recompute_call_nodes]: 7.7e-07 [before_grad]: 8.99e-06 [set_forward_comm_id_for_comm_node_pass]: 3.96001e-06 [meta_fg_expand]: 2.17001e-06 [flash_sp_send_recv_attached]: 9.50007e-07 [receive_attached]: 9.89996e-07 [after_resolve]: 8.57e-06 [a_after_grad]: 8.37e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.34e-06 [auto_monad_grad]: 8.30012e-07 [auto_monad_eliminator]: 6.59999e-06 [cse]: 1.416e-05 [a_3]: 3.493e-05 [py_interpret_to_execute_after_opt_a]: 7.73999e-06 [slice_cell_reuse_recomputed_activation]: 2.48998e-06 [rewriter_after_opt_a]: 3.346e-05 [convert_after_rewriter]: 7.06001e-06 [order_py_execute_after_rewriter]: 5.52001e-06 [mutable_eliminate]: 0.0004691 [opt_b]: 0.00023124, [1] [Cycle 1]: 0.00022554, [7] [b_1]: 0.0001517 [b_2]: 8.32998e-06 [updatestate_depend_eliminate]: 5.09e-06 [updatestate_assign_eliminate]: 2.54001e-06 [updatestate_loads_eliminate]: 2.20002e-06 [renormalize]: 4.10015e-07 [cse]: 1.759e-05 [optimize_parallel_all_gather_comm]: 1.565e-05 [overlap_param_gather]: 1.97999e-06 [cconv]: 2.403e-05 [loop_unroll]: 0.00041913 [opt_after_cconv]: 9.842e-05, [1] [Cycle 1]: 9.292e-05, [7] [c_1]: 2.772e-05 [parameter_eliminate]: 2.37999e-06 [updatestate_depend_eliminate]: 4.80999e-06 [updatestate_assign_eliminate]: 2.49999e-06 [updatestate_loads_eliminate]: 2.73998e-06 [cse]: 1.774e-05 [renormalize]: 3.30008e-07 [remove_dup_value]: 1.387e-05 [tuple_transform]: 7.1e-05, [1] [Cycle 1]: 6.571e-05, [4] [d_1]: 3.895e-05 [none_parameter_eliminate]: 1.59e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 6.46999e-06 [partial_unused_args_eliminate]: 2.44001e-06 [add_recomputation]: 4.475e-05 [cse_after_recomputation]: 2.082e-05, [1] [Cycle 1]: 1.669e-05, [1] [cse]: 1.111e-05 [environ_conv]: 4.77e-06 [swap_dp_allreduce_reducescatter]: 5.42001e-06 [bias_add_comm_swap]: 2.59999e-06 [label_micro_interleaved_index]: 3.88999e-06 [label_fine_grained_interleaved_index]: 2.61e-06 [merge_cast_opt]: 1.25001e-06 [slice_recompute_activation]: 1.94999e-06 [micro_interleaved_order_control]: 2.46e-06 [assign_add_opt]: 1.50001e-06 [ForceFp32Comm]: 7.79983e-07 [remove_cast_before_assign_add]: 1.00001e-06 [full_micro_interleaved_order_control]: 2.06e-06 [reorder_send_recv_between_fp_bp]: 2.84001e-06 [comm_op_add_attrs]: 1.42e-06 [add_comm_op_reuse_tag]: 1.06997e-06 [interleave_split_concat_branches]: 1.34e-06 [interleave_parallel_branches]: 1.07998e-06 [overlap_opt_shard_in_pipeline]: 1.17999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.33998e-06 [control_data_broadcast_order]: 1.22e-05 [grouped_pairwise_exchange_alltoall]: 1.47001e-06 [offloading_packed_experts]: 3.86999e-06 [overlap_recompute_and_grad_model_parallel]: 4.57e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.29e-06 [overlap_recompute_allgather_and_fa_grad]: 1.42e-06 [overlap_recompute_comm]: 2.44001e-06 [overlap_grad_ring_attention]: 4.30999e-06 [overlap_grad_flash_sp]: 1.745e-05 [begin_end_overlap_inline]: 5.8001e-07 [split_matmul_comm_elemetwise]: 2.05002e-06 [split_layernorm_comm]: 1.58002e-06 [handle_group_info]: 9.29984e-07 [symbol_engine_optimizer]: 7.434e-05, [1] [Cycle 1]: 7.011e-05, [6] [build]: 2.40002e-06 [elim_shapecalc]: 9.65002e-06 [elim_not_effective]: 1.294e-05 [opt_reshape]: 6.98998e-06 [fold_const_symbol]: 1.007e-05 [renormalize]: 2.19996e-07 [detach_backward]: 1.71e-06 [pipeline_parallel_scheduler]: 1.49e-06 [auto_monad_reorder]: 1.531e-05 [get_jit_bprop_graph]: 1.54e-06 [rewriter_after_jit_bprop_graph]: 3.62002e-06 [opt_after_jit_grad]: 0.00046026 [validate]: 3.536e-05 Sums bootstrap : 0.000436s : 4.55% type_inference : 0.005301s : 55.24% event_method : 0.000012s : 0.13% auto_monad : 0.000054s : 0.57% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000023s : 0.24% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000017s : 0.18% optimize.rewriter_before_opt_a : 0.000043s : 0.45% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.34% optimize.opt_a.loop_unroll : 0.000021s : 0.22% optimize.opt_a.a_1 : 0.000384s : 4.00% optimize.opt_a.with_stream_mark : 0.000028s : 0.29% optimize.opt_a.recompute_prepare : 0.000015s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000152s : 1.59% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.13% optimize.opt_a.merge_send_recv : 0.000013s : 0.13% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.24% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.14% optimize.opt_a.virtual_dataset : 0.000013s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000006s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000019s : 0.20% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000552s : 5.75% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000002s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.22% optimize.opt_a.cse : 0.000044s : 0.46% optimize.opt_a.a_3 : 0.000080s : 0.84% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.03% optimize.rewriter_after_opt_a : 0.000033s : 0.35% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000469s : 4.89% optimize.opt_b.b_1 : 0.000152s : 1.58% optimize.opt_b.b_2 : 0.000008s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.25% optimize.loop_unroll : 0.000419s : 4.37% optimize.opt_after_cconv.c_1 : 0.000028s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.14% optimize.tuple_transform.d_1 : 0.000039s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.03% optimize.add_recomputation : 0.000045s : 0.47% optimize.cse_after_recomputation.cse : 0.000011s : 0.12% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.03% optimize.assign_add_opt : 0.000002s : 0.02% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.03% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000017s : 0.18% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000002s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.02% auto_monad_reorder : 0.000015s : 0.16% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000460s : 4.80% validate : 0.000035s : 0.37% Time group info: ------[substitution.] 0.000125 20 1.67% : 0.000002s : 2: substitution.elim_not_effective 1.16% : 0.000001s : 2: substitution.fold_const_symbol 4.87% : 0.000006s : 3: substitution.graph_param_transform 64.58% : 0.000081s : 2: substitution.inline 2.32% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.48% : 0.000004s : 4: substitution.remove_not_recompute_node 2.09% : 0.000003s : 2: substitution.replace_old_param 19.84% : 0.000025s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005257 2 91.65% : 0.004819s : 1: type_inference.infer 8.35% : 0.000439s : 1: type_inference.specialize ------[replace.] 0.000019 2 100.00% : 0.000019s : 2: replace.inline ------[match.] 0.000079 2 100.00% : 0.000079s : 2: match.inline ------[predicate.] 0.000132 754 1.11% : 0.000001s : 7: predicate.accumulaten_eliminater 1.01% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.67% : 0.000001s : 6: predicate.addn_check_dump 0.85% : 0.000001s : 7: predicate.addn_zero_filter 0.71% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.04% : 0.000003s : 13: predicate.arithmetic_simplify 0.85% : 0.000001s : 7: predicate.cast_eliminate 0.87% : 0.000001s : 6: predicate.check_bprop_eliminate 0.67% : 0.000001s : 6: predicate.compare_switch_simplify 0.25% : 0.000000s : 3: predicate.const_output_eliminate 0.68% : 0.000001s : 6: predicate.depend_value_elim 0.76% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.97% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.79% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.20% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 3: predicate.elim_not_effective 0.48% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.01% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.02% : 0.000001s : 10: predicate.environ_get_depend_swap 1.76% : 0.000002s : 16: predicate.environ_get_eliminate 1.01% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.97% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.16% : 0.000003s : 9: predicate.float_depend_g_call 0.69% : 0.000001s : 6: predicate.float_environ_get_switch 0.92% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.26% : 0.000000s : 3: predicate.fold_const_symbol 0.79% : 0.000001s : 6: predicate.get_grad_eliminate 0.29% : 0.000000s : 3: predicate.graph_param_transform 0.79% : 0.000001s : 6: predicate.incorporate_call 0.62% : 0.000001s : 6: predicate.incorporate_call_switch 6.35% : 0.000008s : 34: predicate.inline 1.09% : 0.000001s : 6: predicate.inline_without_move 0.41% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.11% : 0.000001s : 6: predicate.less_batch_normalization 1.67% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.19% : 0.000003s : 20: predicate.load_eliminater 1.25% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.63% : 0.000002s : 14: predicate.loop_unroll_before_grad 2.01% : 0.000003s : 13: predicate.make_slice_get_slice_eliminator 0.73% : 0.000001s : 6: predicate.merge_addn 0.63% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.68% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.76% : 0.000001s : 7: predicate.minmaximum_grad 1.45% : 0.000002s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.55% : 0.000001s : 3: predicate.parallel_virtual_node 1.27% : 0.000002s : 9: predicate.partial_defer_inline 1.24% : 0.000002s : 10: predicate.partial_eliminate 0.80% : 0.000001s : 7: predicate.print_const_string_wrapper 0.87% : 0.000001s : 6: predicate.reduce_all_const_elim 1.05% : 0.000001s : 7: predicate.reduce_eliminate 2.01% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.65% : 0.000001s : 6: predicate.remove_not_recompute_node 1.18% : 0.000002s : 13: predicate.replace_applicator 0.69% : 0.000001s : 6: predicate.replace_old_param 0.42% : 0.000001s : 3: predicate.reset_defer_inline 0.83% : 0.000001s : 7: predicate.reshape_eliminate 0.78% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 3: predicate.row_tensor_eliminate 1.01% : 0.000001s : 6: predicate.same_eliminate 0.52% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.00% : 0.000001s : 6: predicate.shard_identity_eliminate 0.92% : 0.000001s : 6: predicate.special_op_eliminate 0.93% : 0.000001s : 6: predicate.specialize_transform 1.09% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.04% : 0.000001s : 9: predicate.switch_defer_inline 1.68% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.44% : 0.000006s : 32: predicate.switch_simplify 0.81% : 0.000001s : 7: predicate.tile_eliminate 0.79% : 0.000001s : 7: predicate.transpose_eliminate 1.57% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.34% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.49% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.54% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.57% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.01% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.91% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.70% : 0.000001s : 3: predicate.value_based_eliminate 1.00% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.82% : 0.000001s : 6: predicate.virtual_output_eliminate 0.34% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000208 5 7.89% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.11% : 0.000192s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022251 192 0.02% : 0.000004s : 1: ForceFp32Comm 13.99% : 0.003112s : 1: add_attr 13.95% : 0.003103s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000049s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000059s : 1: auto_monad 0.08% : 0.000019s : 1: auto_monad_reorder 0.02% : 0.000003s : 1: begin_end_overlap_inline 0.03% : 0.000006s : 1: bias_add_comm_swap 2.08% : 0.000464s : 1: bootstrap 0.12% : 0.000027s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000016s : 1: control_data_broadcast_order 0.05% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000008s : 1: environ_conv 0.08% : 0.000018s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.92% : 0.000427s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.15% : 0.000479s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 3.37% : 0.000750s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.10% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.57% : 0.000127s : 28: opt.transform.opt_b 0.19% : 0.000043s : 2: opt.transform.opt_trans_graph 0.16% : 0.000036s : 4: opt.transform.symbol_engine_opt 9.51% : 0.002116s : 1: opt_a 0.46% : 0.000102s : 1: opt_after_cconv 2.11% : 0.000469s : 1: opt_after_jit_grad 1.06% : 0.000235s : 1: opt_b 18.14% : 0.004037s : 1: optimize 0.09% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000021s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.12% : 0.000027s : 1: pre_auto_parallel 0.09% : 0.000021s : 1: py_interpret_to_execute 0.05% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000018s : 1: remove_dup_value 1.53% : 0.000340s : 1: renormalize.infer 0.92% : 0.000204s : 1: renormalize.specialize 0.03% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000038s : 1: rewriter_after_opt_a 0.21% : 0.000047s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000077s : 1: symbol_engine_optimizer 0.33% : 0.000074s : 1: tuple_transform 23.90% : 0.005317s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:59.486.999 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:59.487.260 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0146504, [21] [bootstrap]: 0.00043181 [type_inference]: 0.00540276 [event_method]: 1.199e-05 [auto_monad]: 5.219e-05 [graph_reusing]: 5.43002e-06 [inline]: 1.87999e-06 [add_attr]: 0.0030698, [1] [add_attr_with_inline]: 0.00306239, [1] [Cycle 1]: 6.485e-05, [2] [tag_attr]: 1.46e-05 [meta_addattr_fg_expand]: 4.15999e-06 [parallel-infer-symbol]: 3.21001e-06 [pre_auto_parallel]: 2.327e-05 [insert-virtual-dataset]: 2.64001e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 1.92999e-06 [pipeline_split]: 1.67999e-06 [optimize]: 0.00453502, [53] [py_interpret_to_execute]: 2.158e-05 [rewriter_before_opt_a]: 4.62e-05 [opt_a]: 0.00234121, [2] [Cycle 1]: 0.00155625, [45] [expand_dump_flag]: 3.41999e-06 [switch_simplify]: 2.371e-05 [loop_unroll]: 1.39e-05 [a_1]: 0.00027612 [with_stream_mark]: 1.456e-05 [recompute_prepare]: 8.23001e-06 [updatestate_depend_eliminate]: 4.50999e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 2.84999e-06 [parameter_eliminate]: 2.05002e-06 [a_2]: 0.00012768 [accelerated_algorithm]: 6.89999e-06 [shard]: 2.34001e-06 [meta_shard_fg_expand]: 1.84e-06 [shard_inline]: 6.07001e-06 [merge_send_recv]: 8.33999e-06 [auto_parallel]: 5.77999e-06 [parallel]: 1.869e-05 [flash_sp]: 7.66001e-06 [merge_comm]: 4.30999e-06 [allreduce_fusion]: 3.4e-06 [matmul_add_comm_reduction]: 9.27001e-06 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 7.60998e-06 [virtual_dataset]: 6.78e-06 [get_grad_eliminate_]: 5.65001e-06 [virtual_output]: 6.54001e-06 [merge_forward]: 4.44002e-06 [cell_reuse_recompute_pass]: 1.40001e-06 [offload_activation]: 9.61e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.433e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.058e-05 [set_forward_comm_id_for_comm_node_pass]: 3.80998e-06 [meta_fg_expand]: 2.54001e-06 [flash_sp_send_recv_attached]: 2.40002e-06 [receive_attached]: 1.89999e-06 [after_resolve]: 9.42001e-06 [a_after_grad]: 9.12001e-06 [renormalize]: 0.00042115 [add_forward_monad_depend]: 5.02e-06 [auto_monad_grad]: 1.86998e-06 [auto_monad_eliminator]: 1.362e-05 [cse]: 2.887e-05 [a_3]: 5.823e-05 [Cycle 2]: 0.00077286, [45] [expand_dump_flag]: 1.12e-06 [switch_simplify]: 7e-06 [loop_unroll]: 5.89e-06 [a_1]: 0.00010576 [with_stream_mark]: 1.04e-05 [recompute_prepare]: 6.31e-06 [updatestate_depend_eliminate]: 2.83e-06 [updatestate_assign_eliminate]: 2.53003e-06 [updatestate_loads_eliminate]: 2.50002e-06 [parameter_eliminate]: 1.04e-06 [a_2]: 9.763e-05 [accelerated_algorithm]: 6.06998e-06 [shard]: 1.13001e-06 [meta_shard_fg_expand]: 1.27e-06 [shard_inline]: 6.19001e-06 [merge_send_recv]: 4.83001e-06 [auto_parallel]: 5.49e-06 [parallel]: 5.20999e-06 [flash_sp]: 3.36001e-06 [merge_comm]: 3.39001e-06 [allreduce_fusion]: 2.89999e-06 [matmul_add_comm_reduction]: 5.72999e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 6.61999e-06 [virtual_dataset]: 5.74999e-06 [get_grad_eliminate_]: 5.72999e-06 [virtual_output]: 5.61e-06 [merge_forward]: 2.72001e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 5.95002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.382e-05 [merge_recompute_call_nodes]: 8.59989e-07 [before_grad]: 8.88002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.67002e-06 [meta_fg_expand]: 2.08998e-06 [flash_sp_send_recv_attached]: 8.89995e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 8.67998e-06 [a_after_grad]: 8.01001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.19003e-06 [auto_monad_grad]: 9.29984e-07 [auto_monad_eliminator]: 6.54999e-06 [cse]: 1.398e-05 [a_3]: 4.783e-05 [py_interpret_to_execute_after_opt_a]: 1.122e-05 [slice_cell_reuse_recomputed_activation]: 4.60999e-06 [rewriter_after_opt_a]: 3.891e-05 [convert_after_rewriter]: 9.18002e-06 [order_py_execute_after_rewriter]: 8.19002e-06 [mutable_eliminate]: 0.00047891 [opt_b]: 0.00030778, [1] [Cycle 1]: 0.00029834, [7] [b_1]: 0.00020047 [b_2]: 7.84002e-06 [updatestate_depend_eliminate]: 5.34e-06 [updatestate_assign_eliminate]: 2.51998e-06 [updatestate_loads_eliminate]: 2.33002e-06 [renormalize]: 4.30009e-07 [cse]: 1.841e-05 [optimize_parallel_all_gather_comm]: 1.853e-05 [overlap_param_gather]: 4.82998e-06 [cconv]: 2.555e-05 [loop_unroll]: 0.00043865 [opt_after_cconv]: 0.00012091, [1] [Cycle 1]: 0.00011228, [7] [c_1]: 2.732e-05 [parameter_eliminate]: 2.32001e-06 [updatestate_depend_eliminate]: 4.92e-06 [updatestate_assign_eliminate]: 2.40002e-06 [updatestate_loads_eliminate]: 2.56e-06 [cse]: 1.817e-05 [renormalize]: 3.20026e-07 [remove_dup_value]: 1.737e-05 [tuple_transform]: 8.627e-05, [1] [Cycle 1]: 7.927e-05, [4] [d_1]: 4.051e-05 [none_parameter_eliminate]: 1.59e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 6.68e-06 [partial_unused_args_eliminate]: 4.63999e-06 [add_recomputation]: 4.726e-05 [cse_after_recomputation]: 2.747e-05, [1] [Cycle 1]: 2.017e-05, [1] [cse]: 1.132e-05 [environ_conv]: 8.08001e-06 [swap_dp_allreduce_reducescatter]: 7.77e-06 [bias_add_comm_swap]: 5.45001e-06 [label_micro_interleaved_index]: 6.86999e-06 [label_fine_grained_interleaved_index]: 5.32001e-06 [merge_cast_opt]: 3.7e-06 [slice_recompute_activation]: 4.84998e-06 [micro_interleaved_order_control]: 4.70999e-06 [assign_add_opt]: 3.59002e-06 [ForceFp32Comm]: 3.08998e-06 [remove_cast_before_assign_add]: 3.81999e-06 [full_micro_interleaved_order_control]: 4.79e-06 [reorder_send_recv_between_fp_bp]: 5.65001e-06 [comm_op_add_attrs]: 3.58999e-06 [add_comm_op_reuse_tag]: 3.38e-06 [interleave_split_concat_branches]: 3.57002e-06 [interleave_parallel_branches]: 3.51001e-06 [overlap_opt_shard_in_pipeline]: 3.6e-06 [overlap_opt_shard_grad_in_pipeline]: 4.11001e-06 [control_data_broadcast_order]: 1.469e-05 [grouped_pairwise_exchange_alltoall]: 4.16001e-06 [offloading_packed_experts]: 6.56e-06 [overlap_recompute_and_grad_model_parallel]: 7.36001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.64002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.88999e-06 [overlap_recompute_comm]: 5.02e-06 [overlap_grad_ring_attention]: 6.58e-06 [overlap_grad_flash_sp]: 1.993e-05 [begin_end_overlap_inline]: 2.89001e-06 [split_matmul_comm_elemetwise]: 4.98001e-06 [split_layernorm_comm]: 4.13999e-06 [handle_group_info]: 3.18e-06 [symbol_engine_optimizer]: 9.354e-05, [1] [Cycle 1]: 8.656e-05, [6] [build]: 2.71999e-06 [elim_shapecalc]: 9.25999e-06 [elim_not_effective]: 1.244e-05 [opt_reshape]: 6.84999e-06 [fold_const_symbol]: 1.024e-05 [renormalize]: 2.3999e-07 [detach_backward]: 3.36999e-06 [pipeline_parallel_scheduler]: 1.79998e-06 [auto_monad_reorder]: 1.955e-05 [get_jit_bprop_graph]: 1.24e-06 [rewriter_after_jit_bprop_graph]: 4.12e-06 [opt_after_jit_grad]: 0.00047342 [validate]: 3.344e-05 Sums bootstrap : 0.000432s : 4.38% type_inference : 0.005403s : 54.75% event_method : 0.000012s : 0.12% auto_monad : 0.000052s : 0.53% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000023s : 0.24% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000022s : 0.22% optimize.rewriter_before_opt_a : 0.000046s : 0.47% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000031s : 0.31% optimize.opt_a.loop_unroll : 0.000020s : 0.20% optimize.opt_a.a_1 : 0.000382s : 3.87% optimize.opt_a.with_stream_mark : 0.000025s : 0.25% optimize.opt_a.recompute_prepare : 0.000015s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000225s : 2.28% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.13% optimize.opt_a.shard : 0.000003s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.12% optimize.opt_a.merge_send_recv : 0.000013s : 0.13% optimize.opt_a.auto_parallel : 0.000011s : 0.11% optimize.opt_a.parallel : 0.000024s : 0.24% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000006s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.14% optimize.opt_a.virtual_dataset : 0.000013s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000019s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000018s : 0.18% optimize.opt_a.a_after_grad : 0.000017s : 0.17% optimize.opt_a.renormalize : 0.000421s : 4.27% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000020s : 0.20% optimize.opt_a.cse : 0.000043s : 0.43% optimize.opt_a.a_3 : 0.000106s : 1.07% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000039s : 0.39% optimize.convert_after_rewriter : 0.000009s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000479s : 4.85% optimize.opt_b.b_1 : 0.000200s : 2.03% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000026s : 0.26% optimize.loop_unroll : 0.000439s : 4.45% optimize.opt_after_cconv.c_1 : 0.000027s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.18% optimize.tuple_transform.d_1 : 0.000041s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000047s : 0.48% optimize.cse_after_recomputation.cse : 0.000011s : 0.11% optimize.environ_conv : 0.000008s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.06% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.06% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000015s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000020s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.20% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000473s : 4.80% validate : 0.000033s : 0.34% Time group info: ------[substitution.] 0.000128 20 1.41% : 0.000002s : 2: substitution.elim_not_effective 1.01% : 0.000001s : 2: substitution.fold_const_symbol 3.72% : 0.000005s : 3: substitution.graph_param_transform 63.85% : 0.000082s : 2: substitution.inline 2.68% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.50% : 0.000004s : 4: substitution.remove_not_recompute_node 2.26% : 0.000003s : 2: substitution.replace_old_param 21.59% : 0.000028s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005359 2 91.30% : 0.004893s : 1: type_inference.infer 8.70% : 0.000466s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000080 2 100.00% : 0.000080s : 2: match.inline ------[predicate.] 0.000131 754 0.85% : 0.000001s : 7: predicate.accumulaten_eliminater 1.03% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.67% : 0.000001s : 6: predicate.addn_check_dump 0.79% : 0.000001s : 7: predicate.addn_zero_filter 0.68% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.42% : 0.000003s : 13: predicate.arithmetic_simplify 0.76% : 0.000001s : 7: predicate.cast_eliminate 0.75% : 0.000001s : 6: predicate.check_bprop_eliminate 0.79% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.80% : 0.000001s : 6: predicate.depend_value_elim 0.81% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.91% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.77% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.35% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.30% : 0.000000s : 3: predicate.elim_not_effective 0.49% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000002s : 10: predicate.environ_add_const_eliminate 1.07% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.01% : 0.000001s : 10: predicate.environ_get_depend_swap 1.83% : 0.000002s : 16: predicate.environ_get_eliminate 1.00% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.92% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.92% : 0.000003s : 9: predicate.float_depend_g_call 0.75% : 0.000001s : 6: predicate.float_environ_get_switch 1.02% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 3: predicate.fold_const_symbol 0.86% : 0.000001s : 6: predicate.get_grad_eliminate 0.25% : 0.000000s : 3: predicate.graph_param_transform 0.83% : 0.000001s : 6: predicate.incorporate_call 0.65% : 0.000001s : 6: predicate.incorporate_call_switch 6.47% : 0.000008s : 34: predicate.inline 0.95% : 0.000001s : 6: predicate.inline_without_move 0.40% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.00% : 0.000001s : 6: predicate.less_batch_normalization 1.61% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.05% : 0.000003s : 20: predicate.load_eliminater 1.34% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.64% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.85% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.73% : 0.000001s : 6: predicate.merge_addn 0.73% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.75% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.68% : 0.000001s : 7: predicate.minmaximum_grad 1.35% : 0.000002s : 3: predicate.mutable_eliminate 0.46% : 0.000001s : 3: predicate.opt_reshape 0.56% : 0.000001s : 3: predicate.parallel_virtual_node 1.29% : 0.000002s : 9: predicate.partial_defer_inline 1.23% : 0.000002s : 10: predicate.partial_eliminate 0.76% : 0.000001s : 7: predicate.print_const_string_wrapper 0.71% : 0.000001s : 6: predicate.reduce_all_const_elim 0.98% : 0.000001s : 7: predicate.reduce_eliminate 2.12% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.60% : 0.000001s : 6: predicate.remove_not_recompute_node 1.19% : 0.000002s : 13: predicate.replace_applicator 0.67% : 0.000001s : 6: predicate.replace_old_param 0.33% : 0.000000s : 3: predicate.reset_defer_inline 0.81% : 0.000001s : 7: predicate.reshape_eliminate 0.79% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 3: predicate.row_tensor_eliminate 1.02% : 0.000001s : 6: predicate.same_eliminate 0.53% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.06% : 0.000001s : 6: predicate.shard_identity_eliminate 1.02% : 0.000001s : 6: predicate.special_op_eliminate 0.91% : 0.000001s : 6: predicate.specialize_transform 1.04% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.08% : 0.000001s : 9: predicate.switch_defer_inline 1.72% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.56% : 0.000006s : 32: predicate.switch_simplify 0.87% : 0.000001s : 7: predicate.tile_eliminate 0.96% : 0.000001s : 7: predicate.transpose_eliminate 1.53% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.37% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.42% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.33% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.67% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.97% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.93% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.68% : 0.000001s : 3: predicate.value_based_eliminate 0.84% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.89% : 0.000001s : 6: predicate.virtual_output_eliminate 0.37% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.56% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000203 5 7.90% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.10% : 0.000187s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023588 192 0.02% : 0.000006s : 1: ForceFp32Comm 13.05% : 0.003078s : 1: add_attr 13.00% : 0.003066s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000051s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.26% : 0.000061s : 1: auto_monad 0.12% : 0.000028s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 2.01% : 0.000475s : 1: bootstrap 0.12% : 0.000029s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000012s : 1: convert_after_rewriter 0.13% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.08% : 0.000018s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.09% : 0.000022s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.04% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.88% : 0.000445s : 1: loop_unroll 0.03% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.05% : 0.000485s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 3.16% : 0.000745s : 78: opt.transform.opt_a 0.11% : 0.000026s : 1: opt.transform.opt_after_cconv 0.10% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.54% : 0.000128s : 28: opt.transform.opt_b 0.19% : 0.000044s : 2: opt.transform.opt_trans_graph 0.15% : 0.000035s : 4: opt.transform.symbol_engine_opt 9.94% : 0.002344s : 1: opt_a 0.53% : 0.000124s : 1: opt_after_cconv 2.05% : 0.000483s : 1: opt_after_jit_grad 1.32% : 0.000312s : 1: opt_b 20.68% : 0.004877s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000023s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000008s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000030s : 1: pre_auto_parallel 0.11% : 0.000025s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000007s : 1: remove_cast_before_assign_add 0.09% : 0.000021s : 1: remove_dup_value 0.92% : 0.000218s : 1: renormalize.infer 0.83% : 0.000196s : 1: renormalize.specialize 0.04% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000042s : 1: rewriter_after_opt_a 0.21% : 0.000050s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.41% : 0.000097s : 1: symbol_engine_optimizer 0.38% : 0.000089s : 1: tuple_transform 23.01% : 0.005429s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:59.674.861 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0137119, [21] [bootstrap]: 0.0004376 [type_inference]: 0.0053916 [event_method]: 1.075e-05 [auto_monad]: 5.239e-05 [graph_reusing]: 5.45001e-06 [inline]: 2.17001e-06 [add_attr]: 0.00306414, [1] [add_attr_with_inline]: 0.00305543, [1] [Cycle 1]: 5.262e-05, [2] [tag_attr]: 1.34e-05 [meta_addattr_fg_expand]: 3.71001e-06 [parallel-infer-symbol]: 2.89001e-06 [pre_auto_parallel]: 2.314e-05 [insert-virtual-dataset]: 2.31998e-06 [parallel-infer-symbol-second]: 8.09989e-07 [dataset_repeat_opt]: 1.66e-06 [pipeline_split]: 1.52999e-06 [optimize]: 0.00402929, [53] [py_interpret_to_execute]: 1.689e-05 [rewriter_before_opt_a]: 4.409e-05 [opt_a]: 0.00211006, [2] [Cycle 1]: 0.00149412, [45] [expand_dump_flag]: 2.74999e-06 [switch_simplify]: 2.47e-05 [loop_unroll]: 1.352e-05 [a_1]: 0.00031917 [with_stream_mark]: 1.622e-05 [recompute_prepare]: 8.60999e-06 [updatestate_depend_eliminate]: 3.56001e-06 [updatestate_assign_eliminate]: 3.36999e-06 [updatestate_loads_eliminate]: 3.09999e-06 [parameter_eliminate]: 2.16998e-06 [a_2]: 7.886e-05 [accelerated_algorithm]: 6.83e-06 [shard]: 1.95001e-06 [meta_shard_fg_expand]: 1.87001e-06 [shard_inline]: 6.68e-06 [merge_send_recv]: 8.35999e-06 [auto_parallel]: 5.92001e-06 [parallel]: 1.8e-05 [flash_sp]: 7.71999e-06 [merge_comm]: 3.92998e-06 [allreduce_fusion]: 3.5e-06 [matmul_add_comm_reduction]: 8.99e-06 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 7.56001e-06 [virtual_dataset]: 6.06e-06 [get_grad_eliminate_]: 6.16e-06 [virtual_output]: 5.86998e-06 [merge_forward]: 3.86999e-06 [cell_reuse_recompute_pass]: 1.35001e-06 [offload_activation]: 9.64999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.236e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 1.092e-05 [set_forward_comm_id_for_comm_node_pass]: 3.51999e-06 [meta_fg_expand]: 2.93e-06 [flash_sp_send_recv_attached]: 2.91e-06 [receive_attached]: 2.28002e-06 [after_resolve]: 1.028e-05 [a_after_grad]: 8.85999e-06 [renormalize]: 0.00051424 [add_forward_monad_depend]: 4.37e-06 [auto_monad_grad]: 1.74e-06 [auto_monad_eliminator]: 1.405e-05 [cse]: 3.064e-05 [a_3]: 4.55e-05 [Cycle 2]: 0.00060696, [45] [expand_dump_flag]: 9.00007e-07 [switch_simplify]: 6.89999e-06 [loop_unroll]: 5.87999e-06 [a_1]: 0.00010681 [with_stream_mark]: 1.028e-05 [recompute_prepare]: 6.14001e-06 [updatestate_depend_eliminate]: 2.76e-06 [updatestate_assign_eliminate]: 2.46998e-06 [updatestate_loads_eliminate]: 2.71e-06 [parameter_eliminate]: 1.04e-06 [a_2]: 7.107e-05 [accelerated_algorithm]: 6.00002e-06 [shard]: 1.42e-06 [meta_shard_fg_expand]: 1.27999e-06 [shard_inline]: 6.74999e-06 [merge_send_recv]: 4.42e-06 [auto_parallel]: 5.45001e-06 [parallel]: 5.22e-06 [flash_sp]: 3.19001e-06 [merge_comm]: 3.18998e-06 [allreduce_fusion]: 3.05998e-06 [matmul_add_comm_reduction]: 5.79999e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 6.47001e-06 [virtual_dataset]: 5.78002e-06 [get_grad_eliminate_]: 5.38002e-06 [virtual_output]: 5.13002e-06 [merge_forward]: 2.71e-06 [cell_reuse_recompute_pass]: 1.45001e-06 [offload_activation]: 6.07001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.226e-05 [merge_recompute_call_nodes]: 7.59988e-07 [before_grad]: 8.65001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.56999e-06 [meta_fg_expand]: 2.06e-06 [flash_sp_send_recv_attached]: 8.2e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 8.62e-06 [a_after_grad]: 8.01001e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.20999e-06 [auto_monad_grad]: 8.60018e-07 [auto_monad_eliminator]: 6.58998e-06 [cse]: 1.343e-05 [a_3]: 3.591e-05 [py_interpret_to_execute_after_opt_a]: 8.17e-06 [slice_cell_reuse_recomputed_activation]: 1.99e-06 [rewriter_after_opt_a]: 3.399e-05 [convert_after_rewriter]: 6.73e-06 [order_py_execute_after_rewriter]: 5.65001e-06 [mutable_eliminate]: 0.00047178 [opt_b]: 0.00023276, [1] [Cycle 1]: 0.00022715, [7] [b_1]: 0.00015239 [b_2]: 8.35999e-06 [updatestate_depend_eliminate]: 5.34e-06 [updatestate_assign_eliminate]: 2.53e-06 [updatestate_loads_eliminate]: 2.14e-06 [renormalize]: 4.00003e-07 [cse]: 1.885e-05 [optimize_parallel_all_gather_comm]: 1.487e-05 [overlap_param_gather]: 1.84e-06 [cconv]: 2.305e-05 [loop_unroll]: 0.00042404 [opt_after_cconv]: 9.829e-05, [1] [Cycle 1]: 9.272e-05, [7] [c_1]: 2.817e-05 [parameter_eliminate]: 2.36998e-06 [updatestate_depend_eliminate]: 5.17999e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.29001e-06 [cse]: 1.777e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 1.436e-05 [tuple_transform]: 6.832e-05, [1] [Cycle 1]: 6.409e-05, [4] [d_1]: 3.753e-05 [none_parameter_eliminate]: 1.62001e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 6.33e-06 [partial_unused_args_eliminate]: 1.64998e-06 [add_recomputation]: 4.643e-05 [cse_after_recomputation]: 2.191e-05, [1] [Cycle 1]: 1.757e-05, [1] [cse]: 1.197e-05 [environ_conv]: 5.32999e-06 [swap_dp_allreduce_reducescatter]: 5.09e-06 [bias_add_comm_swap]: 2.53e-06 [label_micro_interleaved_index]: 4.83001e-06 [label_fine_grained_interleaved_index]: 2.91e-06 [merge_cast_opt]: 1.24e-06 [slice_recompute_activation]: 2.06e-06 [micro_interleaved_order_control]: 2.26e-06 [assign_add_opt]: 1.16002e-06 [ForceFp32Comm]: 9.10019e-07 [remove_cast_before_assign_add]: 1.14e-06 [full_micro_interleaved_order_control]: 2.03997e-06 [reorder_send_recv_between_fp_bp]: 2.64999e-06 [comm_op_add_attrs]: 9.79984e-07 [add_comm_op_reuse_tag]: 9.20001e-07 [interleave_split_concat_branches]: 1.23002e-06 [interleave_parallel_branches]: 1.34e-06 [overlap_opt_shard_in_pipeline]: 1.37e-06 [overlap_opt_shard_grad_in_pipeline]: 1.77001e-06 [control_data_broadcast_order]: 1.275e-05 [grouped_pairwise_exchange_alltoall]: 1.64e-06 [offloading_packed_experts]: 4e-06 [overlap_recompute_and_grad_model_parallel]: 4.82e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.25001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.16e-06 [overlap_grad_ring_attention]: 4.07003e-06 [overlap_grad_flash_sp]: 1.707e-05 [begin_end_overlap_inline]: 5.8001e-07 [split_matmul_comm_elemetwise]: 1.92001e-06 [split_layernorm_comm]: 1.77001e-06 [handle_group_info]: 9.09989e-07 [symbol_engine_optimizer]: 7.282e-05, [1] [Cycle 1]: 6.84e-05, [6] [build]: 2.68e-06 [elim_shapecalc]: 9.34e-06 [elim_not_effective]: 1.224e-05 [opt_reshape]: 6.78e-06 [fold_const_symbol]: 9.23997e-06 [renormalize]: 1.90019e-07 [detach_backward]: 2.21998e-06 [pipeline_parallel_scheduler]: 1.96998e-06 [auto_monad_reorder]: 1.663e-05 [get_jit_bprop_graph]: 1.15001e-06 [rewriter_after_jit_bprop_graph]: 3.50003e-06 [opt_after_jit_grad]: 0.00048259 [validate]: 3.458e-05 Sums bootstrap : 0.000438s : 4.51% type_inference : 0.005392s : 55.54% event_method : 0.000011s : 0.11% auto_monad : 0.000052s : 0.54% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000013s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000023s : 0.24% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000017s : 0.17% optimize.rewriter_before_opt_a : 0.000044s : 0.45% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000032s : 0.33% optimize.opt_a.loop_unroll : 0.000019s : 0.20% optimize.opt_a.a_1 : 0.000426s : 4.39% optimize.opt_a.with_stream_mark : 0.000026s : 0.27% optimize.opt_a.recompute_prepare : 0.000015s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000150s : 1.54% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.14% optimize.opt_a.merge_send_recv : 0.000013s : 0.13% optimize.opt_a.auto_parallel : 0.000011s : 0.12% optimize.opt_a.parallel : 0.000023s : 0.24% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.14% optimize.opt_a.virtual_dataset : 0.000012s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000011s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000019s : 0.19% optimize.opt_a.a_after_grad : 0.000017s : 0.17% optimize.opt_a.renormalize : 0.000514s : 5.30% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.21% optimize.opt_a.cse : 0.000044s : 0.45% optimize.opt_a.a_3 : 0.000081s : 0.84% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000034s : 0.35% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000472s : 4.86% optimize.opt_b.b_1 : 0.000152s : 1.57% optimize.opt_b.b_2 : 0.000008s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000019s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000015s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000023s : 0.24% optimize.loop_unroll : 0.000424s : 4.37% optimize.opt_after_cconv.c_1 : 0.000028s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.15% optimize.tuple_transform.d_1 : 0.000038s : 0.39% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000046s : 0.48% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000017s : 0.18% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000017s : 0.17% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000483s : 4.97% validate : 0.000035s : 0.36% Time group info: ------[substitution.] 0.000132 20 1.36% : 0.000002s : 2: substitution.elim_not_effective 0.96% : 0.000001s : 2: substitution.fold_const_symbol 4.07% : 0.000005s : 3: substitution.graph_param_transform 65.55% : 0.000087s : 2: substitution.inline 2.26% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.26% : 0.000004s : 4: substitution.remove_not_recompute_node 2.14% : 0.000003s : 2: substitution.replace_old_param 20.40% : 0.000027s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005345 2 91.84% : 0.004909s : 1: type_inference.infer 8.16% : 0.000436s : 1: type_inference.specialize ------[replace.] 0.000055 2 100.00% : 0.000055s : 2: replace.inline ------[match.] 0.000085 2 100.00% : 0.000085s : 2: match.inline ------[predicate.] 0.000132 754 0.79% : 0.000001s : 7: predicate.accumulaten_eliminater 1.22% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.64% : 0.000001s : 6: predicate.addn_check_dump 0.85% : 0.000001s : 7: predicate.addn_zero_filter 0.67% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.22% : 0.000003s : 13: predicate.arithmetic_simplify 0.79% : 0.000001s : 7: predicate.cast_eliminate 0.73% : 0.000001s : 6: predicate.check_bprop_eliminate 0.67% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.76% : 0.000001s : 6: predicate.depend_value_elim 0.80% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 1.00% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.85% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.19% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 3: predicate.elim_not_effective 0.42% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000002s : 10: predicate.environ_add_const_eliminate 1.01% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.02% : 0.000001s : 10: predicate.environ_get_depend_swap 1.66% : 0.000002s : 16: predicate.environ_get_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.95% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.91% : 0.000003s : 9: predicate.float_depend_g_call 0.67% : 0.000001s : 6: predicate.float_environ_get_switch 1.04% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.77% : 0.000001s : 6: predicate.get_grad_eliminate 0.26% : 0.000000s : 3: predicate.graph_param_transform 0.80% : 0.000001s : 6: predicate.incorporate_call 0.64% : 0.000001s : 6: predicate.incorporate_call_switch 6.91% : 0.000009s : 34: predicate.inline 1.01% : 0.000001s : 6: predicate.inline_without_move 0.42% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.19% : 0.000002s : 6: predicate.less_batch_normalization 1.69% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.07% : 0.000003s : 20: predicate.load_eliminater 1.25% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.66% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.63% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.66% : 0.000001s : 6: predicate.merge_addn 0.70% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.69% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.67% : 0.000001s : 7: predicate.minmaximum_grad 1.29% : 0.000002s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.60% : 0.000001s : 3: predicate.parallel_virtual_node 1.37% : 0.000002s : 9: predicate.partial_defer_inline 1.22% : 0.000002s : 10: predicate.partial_eliminate 0.74% : 0.000001s : 7: predicate.print_const_string_wrapper 0.65% : 0.000001s : 6: predicate.reduce_all_const_elim 0.90% : 0.000001s : 7: predicate.reduce_eliminate 2.06% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.67% : 0.000001s : 6: predicate.remove_not_recompute_node 1.18% : 0.000002s : 13: predicate.replace_applicator 0.82% : 0.000001s : 6: predicate.replace_old_param 0.36% : 0.000000s : 3: predicate.reset_defer_inline 0.86% : 0.000001s : 7: predicate.reshape_eliminate 0.75% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.65% : 0.000001s : 3: predicate.row_tensor_eliminate 0.95% : 0.000001s : 6: predicate.same_eliminate 0.56% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.95% : 0.000001s : 6: predicate.shard_identity_eliminate 1.03% : 0.000001s : 6: predicate.special_op_eliminate 0.95% : 0.000001s : 6: predicate.specialize_transform 1.14% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.88% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.01% : 0.000001s : 9: predicate.switch_defer_inline 1.70% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.38% : 0.000006s : 32: predicate.switch_simplify 0.79% : 0.000001s : 7: predicate.tile_eliminate 1.06% : 0.000001s : 7: predicate.transpose_eliminate 1.55% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.79% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.32% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.47% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.68% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.44% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.97% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.94% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.63% : 0.000001s : 3: predicate.value_based_eliminate 0.83% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.93% : 0.000001s : 6: predicate.virtual_output_eliminate 0.31% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000202 5 7.79% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.21% : 0.000186s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022270 192 0.02% : 0.000004s : 1: ForceFp32Comm 13.78% : 0.003069s : 1: add_attr 13.74% : 0.003059s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000050s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000057s : 1: auto_monad 0.09% : 0.000020s : 1: auto_monad_reorder 0.02% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.09% : 0.000466s : 1: bootstrap 0.12% : 0.000026s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000004s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000008s : 1: environ_conv 0.07% : 0.000016s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.94% : 0.000432s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.15% : 0.000480s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000013s : 1: opt.transform.mutable_eliminate 3.53% : 0.000787s : 78: opt.transform.opt_a 0.12% : 0.000027s : 1: opt.transform.opt_after_cconv 0.11% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.58% : 0.000128s : 28: opt.transform.opt_b 0.19% : 0.000042s : 2: opt.transform.opt_trans_graph 0.15% : 0.000034s : 4: opt.transform.symbol_engine_opt 9.49% : 0.002113s : 1: opt_a 0.46% : 0.000102s : 1: opt_after_cconv 2.21% : 0.000491s : 1: opt_after_jit_grad 1.06% : 0.000236s : 1: opt_b 18.11% : 0.004034s : 1: optimize 0.08% : 0.000018s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000021s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.12% : 0.000027s : 1: pre_auto_parallel 0.09% : 0.000021s : 1: py_interpret_to_execute 0.05% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000018s : 1: remove_dup_value 1.35% : 0.000301s : 1: renormalize.infer 0.93% : 0.000206s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000038s : 1: rewriter_after_opt_a 0.22% : 0.000048s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000075s : 1: symbol_engine_optimizer 0.32% : 0.000072s : 1: tuple_transform 24.28% : 0.005407s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:59.863.700 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:46:59.863.957 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0149352, [21] [bootstrap]: 0.0004291 [type_inference]: 0.00535812 [event_method]: 1.236e-05 [auto_monad]: 5.49e-05 [graph_reusing]: 5.51998e-06 [inline]: 1.66e-06 [add_attr]: 0.00303529, [1] [add_attr_with_inline]: 0.00302775, [1] [Cycle 1]: 6.341e-05, [2] [tag_attr]: 1.382e-05 [meta_addattr_fg_expand]: 4.28001e-06 [parallel-infer-symbol]: 3.09001e-06 [pre_auto_parallel]: 2.332e-05 [insert-virtual-dataset]: 2.73e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 1.89e-06 [pipeline_split]: 1.54e-06 [optimize]: 0.00489161, [53] [py_interpret_to_execute]: 2.064e-05 [rewriter_before_opt_a]: 5.059e-05 [opt_a]: 0.0025918, [2] [Cycle 1]: 0.00170027, [45] [expand_dump_flag]: 2.93e-06 [switch_simplify]: 2.634e-05 [loop_unroll]: 1.596e-05 [a_1]: 0.00032954 [with_stream_mark]: 1.46e-05 [recompute_prepare]: 9.52001e-06 [updatestate_depend_eliminate]: 4.38001e-06 [updatestate_assign_eliminate]: 4.05998e-06 [updatestate_loads_eliminate]: 3.41999e-06 [parameter_eliminate]: 1.88002e-06 [a_2]: 0.00012974 [accelerated_algorithm]: 8.23999e-06 [shard]: 2.26e-06 [meta_shard_fg_expand]: 2.04999e-06 [shard_inline]: 7.56999e-06 [merge_send_recv]: 9.77999e-06 [auto_parallel]: 6.74001e-06 [parallel]: 1.627e-05 [flash_sp]: 8.65001e-06 [merge_comm]: 4.71002e-06 [allreduce_fusion]: 4.1e-06 [matmul_add_comm_reduction]: 1.031e-05 [allreduce_slice_to_reducescatter]: 8.60018e-07 [virtual_shard_identity]: 1.022e-05 [virtual_dataset]: 7.48e-06 [get_grad_eliminate_]: 7.1e-06 [virtual_output]: 7.05998e-06 [merge_forward]: 4.23999e-06 [cell_reuse_recompute_pass]: 1.10999e-06 [offload_activation]: 1.036e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.746e-05 [merge_recompute_call_nodes]: 1.37999e-06 [before_grad]: 1.284e-05 [set_forward_comm_id_for_comm_node_pass]: 4.25e-06 [meta_fg_expand]: 3.55e-06 [flash_sp_send_recv_attached]: 2.64999e-06 [receive_attached]: 2.56998e-06 [after_resolve]: 1.089e-05 [a_after_grad]: 1.135e-05 [renormalize]: 0.00046664 [add_forward_monad_depend]: 4.97e-06 [auto_monad_grad]: 1.72999e-06 [auto_monad_eliminator]: 1.535e-05 [cse]: 3.356e-05 [a_3]: 6.84e-05 [Cycle 2]: 0.00087966, [45] [expand_dump_flag]: 9.80013e-07 [switch_simplify]: 8.27998e-06 [loop_unroll]: 7.31001e-06 [a_1]: 0.00014832 [with_stream_mark]: 1.433e-05 [recompute_prepare]: 7.91001e-06 [updatestate_depend_eliminate]: 3.95e-06 [updatestate_assign_eliminate]: 3.12002e-06 [updatestate_loads_eliminate]: 2.68003e-06 [parameter_eliminate]: 1.09e-06 [a_2]: 0.00011501 [accelerated_algorithm]: 7.36001e-06 [shard]: 1.02e-06 [meta_shard_fg_expand]: 1.59e-06 [shard_inline]: 7.61999e-06 [merge_send_recv]: 5.94e-06 [auto_parallel]: 6.45002e-06 [parallel]: 4.52e-06 [flash_sp]: 3.15002e-06 [merge_comm]: 4.34002e-06 [allreduce_fusion]: 3.92002e-06 [matmul_add_comm_reduction]: 6.36998e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 8.07e-06 [virtual_dataset]: 6.90998e-06 [get_grad_eliminate_]: 6.53998e-06 [virtual_output]: 6.47001e-06 [merge_forward]: 3.3e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 6.93998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.596e-05 [merge_recompute_call_nodes]: 7.2e-07 [before_grad]: 1.128e-05 [set_forward_comm_id_for_comm_node_pass]: 4.72e-06 [meta_fg_expand]: 2.64999e-06 [flash_sp_send_recv_attached]: 8.50006e-07 [receive_attached]: 9.89996e-07 [after_resolve]: 9.68997e-06 [a_after_grad]: 1.01e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.22e-06 [auto_monad_grad]: 9.49978e-07 [auto_monad_eliminator]: 8.39998e-06 [cse]: 1.812e-05 [a_3]: 5.683e-05 [py_interpret_to_execute_after_opt_a]: 1.161e-05 [slice_cell_reuse_recomputed_activation]: 4.84e-06 [rewriter_after_opt_a]: 4.314e-05 [convert_after_rewriter]: 1.019e-05 [order_py_execute_after_rewriter]: 8.71002e-06 [mutable_eliminate]: 0.000488 [opt_b]: 0.00033192, [1] [Cycle 1]: 0.00032335, [7] [b_1]: 0.00022174 [b_2]: 9.02999e-06 [updatestate_depend_eliminate]: 6.05002e-06 [updatestate_assign_eliminate]: 3.03e-06 [updatestate_loads_eliminate]: 2.81e-06 [renormalize]: 6.90023e-07 [cse]: 2.246e-05 [optimize_parallel_all_gather_comm]: 2.109e-05 [overlap_param_gather]: 4.52998e-06 [cconv]: 2.64e-05 [loop_unroll]: 0.00044029 [opt_after_cconv]: 0.00013708, [1] [Cycle 1]: 0.00012909, [7] [c_1]: 3.42e-05 [parameter_eliminate]: 2.69999e-06 [updatestate_depend_eliminate]: 6.02999e-06 [updatestate_assign_eliminate]: 3.26999e-06 [updatestate_loads_eliminate]: 2.99999e-06 [cse]: 2.25e-05 [renormalize]: 3.30008e-07 [remove_dup_value]: 1.923e-05 [tuple_transform]: 9.244e-05, [1] [Cycle 1]: 8.524e-05, [4] [d_1]: 4.484e-05 [none_parameter_eliminate]: 1.65001e-06 [renormalize]: 2.29978e-07 [switch_simplify]: 7.93001e-06 [partial_unused_args_eliminate]: 4.47998e-06 [add_recomputation]: 5.725e-05 [cse_after_recomputation]: 3.143e-05, [1] [Cycle 1]: 2.479e-05, [1] [cse]: 1.577e-05 [environ_conv]: 8.95999e-06 [swap_dp_allreduce_reducescatter]: 9.05001e-06 [bias_add_comm_swap]: 4.97e-06 [label_micro_interleaved_index]: 6.98e-06 [label_fine_grained_interleaved_index]: 5.19998e-06 [merge_cast_opt]: 4.01001e-06 [slice_recompute_activation]: 4.76002e-06 [micro_interleaved_order_control]: 4.68001e-06 [assign_add_opt]: 3.97002e-06 [ForceFp32Comm]: 3.26999e-06 [remove_cast_before_assign_add]: 3.46999e-06 [full_micro_interleaved_order_control]: 4.37998e-06 [reorder_send_recv_between_fp_bp]: 5.29e-06 [comm_op_add_attrs]: 3.61001e-06 [add_comm_op_reuse_tag]: 3.34001e-06 [interleave_split_concat_branches]: 3.94002e-06 [interleave_parallel_branches]: 3.76001e-06 [overlap_opt_shard_in_pipeline]: 3.55e-06 [overlap_opt_shard_grad_in_pipeline]: 4.35e-06 [control_data_broadcast_order]: 1.778e-05 [grouped_pairwise_exchange_alltoall]: 4.08001e-06 [offloading_packed_experts]: 7.1e-06 [overlap_recompute_and_grad_model_parallel]: 7.57002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.76001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.7e-06 [overlap_recompute_comm]: 4.63999e-06 [overlap_grad_ring_attention]: 6.93998e-06 [overlap_grad_flash_sp]: 2.303e-05 [begin_end_overlap_inline]: 2.96001e-06 [split_matmul_comm_elemetwise]: 4.47e-06 [split_layernorm_comm]: 4.23001e-06 [handle_group_info]: 3.21999e-06 [symbol_engine_optimizer]: 0.00010303, [1] [Cycle 1]: 9.601e-05, [6] [build]: 2.79999e-06 [elim_shapecalc]: 1.097e-05 [elim_not_effective]: 1.559e-05 [opt_reshape]: 8.38001e-06 [fold_const_symbol]: 1.222e-05 [renormalize]: 2.50002e-07 [detach_backward]: 3.03e-06 [pipeline_parallel_scheduler]: 1.71002e-06 [auto_monad_reorder]: 2.242e-05 [get_jit_bprop_graph]: 1.15001e-06 [rewriter_after_jit_bprop_graph]: 3.97e-06 [opt_after_jit_grad]: 0.00047743 [validate]: 3.812e-05 Sums bootstrap : 0.000429s : 4.21% type_inference : 0.005358s : 52.57% event_method : 0.000012s : 0.12% auto_monad : 0.000055s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000023s : 0.23% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000021s : 0.20% optimize.rewriter_before_opt_a : 0.000051s : 0.50% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000035s : 0.34% optimize.opt_a.loop_unroll : 0.000023s : 0.23% optimize.opt_a.a_1 : 0.000478s : 4.69% optimize.opt_a.with_stream_mark : 0.000029s : 0.28% optimize.opt_a.recompute_prepare : 0.000017s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000245s : 2.40% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.15% optimize.opt_a.merge_send_recv : 0.000016s : 0.15% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000021s : 0.20% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.18% optimize.opt_a.virtual_dataset : 0.000014s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000021s : 0.20% optimize.opt_a.a_after_grad : 0.000021s : 0.21% optimize.opt_a.renormalize : 0.000467s : 4.58% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.23% optimize.opt_a.cse : 0.000052s : 0.51% optimize.opt_a.a_3 : 0.000125s : 1.23% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000043s : 0.42% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000488s : 4.79% optimize.opt_b.b_1 : 0.000222s : 2.18% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000022s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.21% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000026s : 0.26% optimize.loop_unroll : 0.000440s : 4.32% optimize.opt_after_cconv.c_1 : 0.000034s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.22% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.19% optimize.tuple_transform.d_1 : 0.000045s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000057s : 0.56% optimize.cse_after_recomputation.cse : 0.000016s : 0.15% optimize.environ_conv : 0.000009s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.09% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000023s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000022s : 0.22% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000477s : 4.68% validate : 0.000038s : 0.37% Time group info: ------[substitution.] 0.000154 29 11.32% : 0.000017s : 2: substitution.cast_eliminate 1.47% : 0.000002s : 3: substitution.elim_not_effective 1.12% : 0.000002s : 3: substitution.fold_const_symbol 3.71% : 0.000006s : 4: substitution.graph_param_transform 57.67% : 0.000089s : 2: substitution.inline 2.75% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.87% : 0.000006s : 6: substitution.remove_not_recompute_node 1.95% : 0.000003s : 2: substitution.replace_old_param 16.14% : 0.000025s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005316 2 91.40% : 0.004859s : 1: type_inference.infer 8.60% : 0.000457s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000087 2 100.00% : 0.000087s : 2: match.inline ------[predicate.] 0.000167 980 0.78% : 0.000001s : 9: predicate.accumulaten_eliminater 0.92% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.72% : 0.000001s : 8: predicate.addn_check_dump 1.00% : 0.000002s : 9: predicate.addn_zero_filter 0.77% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.12% : 0.000004s : 17: predicate.arithmetic_simplify 0.97% : 0.000002s : 9: predicate.cast_eliminate 0.83% : 0.000001s : 8: predicate.check_bprop_eliminate 0.68% : 0.000001s : 8: predicate.compare_switch_simplify 0.24% : 0.000000s : 4: predicate.const_output_eliminate 0.83% : 0.000001s : 8: predicate.depend_value_elim 0.82% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.92% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.75% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.38% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.27% : 0.000000s : 4: predicate.elim_not_effective 0.49% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.11% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 13: predicate.environ_get_depend_swap 1.86% : 0.000003s : 21: predicate.environ_get_eliminate 1.08% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.93% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.79% : 0.000003s : 11: predicate.float_depend_g_call 0.71% : 0.000001s : 8: predicate.float_environ_get_switch 1.02% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 4: predicate.fold_const_symbol 0.82% : 0.000001s : 8: predicate.get_grad_eliminate 0.26% : 0.000000s : 4: predicate.graph_param_transform 0.83% : 0.000001s : 8: predicate.incorporate_call 0.68% : 0.000001s : 8: predicate.incorporate_call_switch 6.40% : 0.000011s : 44: predicate.inline 0.99% : 0.000002s : 8: predicate.inline_without_move 0.41% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.02% : 0.000002s : 8: predicate.less_batch_normalization 1.66% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.13% : 0.000004s : 26: predicate.load_eliminater 1.11% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.46% : 0.000002s : 16: predicate.loop_unroll_before_grad 1.72% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.78% : 0.000001s : 8: predicate.merge_addn 0.76% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.75% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.75% : 0.000001s : 9: predicate.minmaximum_grad 1.16% : 0.000002s : 4: predicate.mutable_eliminate 0.50% : 0.000001s : 4: predicate.opt_reshape 0.56% : 0.000001s : 4: predicate.parallel_virtual_node 1.20% : 0.000002s : 11: predicate.partial_defer_inline 1.31% : 0.000002s : 13: predicate.partial_eliminate 0.75% : 0.000001s : 9: predicate.print_const_string_wrapper 0.75% : 0.000001s : 8: predicate.reduce_all_const_elim 1.05% : 0.000002s : 9: predicate.reduce_eliminate 2.28% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000001s : 8: predicate.remove_not_recompute_node 1.14% : 0.000002s : 17: predicate.replace_applicator 0.60% : 0.000001s : 8: predicate.replace_old_param 0.34% : 0.000001s : 4: predicate.reset_defer_inline 0.85% : 0.000001s : 9: predicate.reshape_eliminate 0.80% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.56% : 0.000001s : 4: predicate.row_tensor_eliminate 0.95% : 0.000002s : 8: predicate.same_eliminate 0.57% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.09% : 0.000002s : 8: predicate.shard_identity_eliminate 0.87% : 0.000001s : 8: predicate.special_op_eliminate 1.05% : 0.000002s : 8: predicate.specialize_transform 1.13% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.88% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.46% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.01% : 0.000002s : 11: predicate.switch_defer_inline 1.75% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.20% : 0.000007s : 39: predicate.switch_simplify 0.85% : 0.000001s : 9: predicate.tile_eliminate 0.83% : 0.000001s : 9: predicate.transpose_eliminate 1.65% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.12% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.45% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.44% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.63% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.11% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 3.11% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.66% : 0.000001s : 4: predicate.value_based_eliminate 0.83% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.86% : 0.000001s : 8: predicate.virtual_output_eliminate 0.34% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.63% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000206 5 7.88% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.12% : 0.000190s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024479 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.43% : 0.003043s : 1: add_attr 12.38% : 0.003031s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.25% : 0.000061s : 1: add_recomputation 0.03% : 0.000007s : 1: assign_add_opt 0.26% : 0.000063s : 1: auto_monad 0.12% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.93% : 0.000471s : 1: bootstrap 0.12% : 0.000029s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.14% : 0.000034s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000018s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.08% : 0.000021s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000007s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.82% : 0.000446s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.02% : 0.000494s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 3.83% : 0.000937s : 78: opt.transform.opt_a 0.13% : 0.000033s : 1: opt.transform.opt_after_cconv 0.11% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.63% : 0.000155s : 28: opt.transform.opt_b 0.21% : 0.000050s : 2: opt.transform.opt_trans_graph 0.18% : 0.000044s : 4: opt.transform.symbol_engine_opt 10.60% : 0.002595s : 1: opt_a 0.57% : 0.000141s : 1: opt_after_cconv 1.99% : 0.000487s : 1: opt_after_jit_grad 1.37% : 0.000335s : 1: opt_b 21.40% : 0.005239s : 1: optimize 0.10% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000026s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000007s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000031s : 1: pre_auto_parallel 0.10% : 0.000024s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000023s : 1: remove_dup_value 1.04% : 0.000255s : 1: renormalize.infer 0.83% : 0.000204s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000047s : 1: rewriter_after_opt_a 0.22% : 0.000054s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.43% : 0.000106s : 1: symbol_engine_optimizer 0.39% : 0.000095s : 1: tuple_transform 21.97% : 0.005379s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:00.529.63 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0145172, [21] [bootstrap]: 0.00043668 [type_inference]: 0.00561518 [event_method]: 1.252e-05 [auto_monad]: 5.683e-05 [graph_reusing]: 5.29e-06 [inline]: 2.89999e-06 [add_attr]: 0.00315971, [1] [add_attr_with_inline]: 0.00315051, [1] [Cycle 1]: 6.028e-05, [2] [tag_attr]: 1.587e-05 [meta_addattr_fg_expand]: 3.63e-06 [parallel-infer-symbol]: 3.11001e-06 [pre_auto_parallel]: 2.709e-05 [insert-virtual-dataset]: 2.24999e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 2.01e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00448297, [53] [py_interpret_to_execute]: 1.889e-05 [rewriter_before_opt_a]: 4.85e-05 [opt_a]: 0.00240103, [2] [Cycle 1]: 0.00167046, [45] [expand_dump_flag]: 2.79999e-06 [switch_simplify]: 2.75e-05 [loop_unroll]: 1.519e-05 [a_1]: 0.00034928 [with_stream_mark]: 1.592e-05 [recompute_prepare]: 8.94e-06 [updatestate_depend_eliminate]: 4.33999e-06 [updatestate_assign_eliminate]: 3.85e-06 [updatestate_loads_eliminate]: 3.43999e-06 [parameter_eliminate]: 1.75001e-06 [a_2]: 9.536e-05 [accelerated_algorithm]: 7.85e-06 [shard]: 2.46998e-06 [meta_shard_fg_expand]: 1.94e-06 [shard_inline]: 7.33e-06 [merge_send_recv]: 9.77001e-06 [auto_parallel]: 7.23999e-06 [parallel]: 1.843e-05 [flash_sp]: 7.8e-06 [merge_comm]: 4.76002e-06 [allreduce_fusion]: 4.13001e-06 [matmul_add_comm_reduction]: 1.085e-05 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 3.679e-05 [virtual_dataset]: 7.48e-06 [get_grad_eliminate_]: 7.06999e-06 [virtual_output]: 6.59001e-06 [merge_forward]: 4.53999e-06 [cell_reuse_recompute_pass]: 1.53002e-06 [offload_activation]: 1.209e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.556e-05 [merge_recompute_call_nodes]: 1.60999e-06 [before_grad]: 1.275e-05 [set_forward_comm_id_for_comm_node_pass]: 4.36002e-06 [meta_fg_expand]: 3.38e-06 [flash_sp_send_recv_attached]: 2.88e-06 [receive_attached]: 2.28998e-06 [after_resolve]: 1.03e-05 [a_after_grad]: 1.124e-05 [renormalize]: 0.00055711 [add_forward_monad_depend]: 4.87e-06 [auto_monad_grad]: 2.26e-06 [auto_monad_eliminator]: 1.651e-05 [cse]: 3.778e-05 [a_3]: 5.513e-05 [Cycle 2]: 0.00072029, [45] [expand_dump_flag]: 1.06002e-06 [switch_simplify]: 8.39998e-06 [loop_unroll]: 7.03e-06 [a_1]: 0.00015305 [with_stream_mark]: 1.482e-05 [recompute_prepare]: 7.6e-06 [updatestate_depend_eliminate]: 4.27998e-06 [updatestate_assign_eliminate]: 3.00002e-06 [updatestate_loads_eliminate]: 2.99001e-06 [parameter_eliminate]: 1.25999e-06 [a_2]: 8.672e-05 [accelerated_algorithm]: 7.25e-06 [shard]: 1.17e-06 [meta_shard_fg_expand]: 1.85001e-06 [shard_inline]: 7.16999e-06 [merge_send_recv]: 6.27001e-06 [auto_parallel]: 6.31e-06 [parallel]: 4.35e-06 [flash_sp]: 3.42002e-06 [merge_comm]: 4.48001e-06 [allreduce_fusion]: 3.98001e-06 [matmul_add_comm_reduction]: 7.21999e-06 [allreduce_slice_to_reducescatter]: 5.59987e-07 [virtual_shard_identity]: 7.58001e-06 [virtual_dataset]: 6.74001e-06 [get_grad_eliminate_]: 6.51999e-06 [virtual_output]: 6.44999e-06 [merge_forward]: 3.10998e-06 [cell_reuse_recompute_pass]: 1.58002e-06 [offload_activation]: 7.93999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.432e-05 [merge_recompute_call_nodes]: 7.59988e-07 [before_grad]: 1.165e-05 [set_forward_comm_id_for_comm_node_pass]: 4.36002e-06 [meta_fg_expand]: 3.58e-06 [flash_sp_send_recv_attached]: 1.07e-06 [receive_attached]: 1.20999e-06 [after_resolve]: 9.97999e-06 [a_after_grad]: 1.028e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.35001e-06 [auto_monad_grad]: 9.20001e-07 [auto_monad_eliminator]: 8.03001e-06 [cse]: 1.947e-05 [a_3]: 4.306e-05 [py_interpret_to_execute_after_opt_a]: 1.021e-05 [slice_cell_reuse_recomputed_activation]: 2.38002e-06 [rewriter_after_opt_a]: 4.196e-05 [convert_after_rewriter]: 7.58001e-06 [order_py_execute_after_rewriter]: 5.85002e-06 [mutable_eliminate]: 0.00049845 [opt_b]: 0.00027394, [1] [Cycle 1]: 0.00026745, [7] [b_1]: 0.00017978 [b_2]: 9.61e-06 [updatestate_depend_eliminate]: 7.16999e-06 [updatestate_assign_eliminate]: 3.12002e-06 [updatestate_loads_eliminate]: 3.56999e-06 [renormalize]: 3.50003e-07 [cse]: 2.537e-05 [optimize_parallel_all_gather_comm]: 1.802e-05 [overlap_param_gather]: 1.96e-06 [cconv]: 2.588e-05 [loop_unroll]: 0.00043011 [opt_after_cconv]: 0.0001129, [1] [Cycle 1]: 0.00010715, [7] [c_1]: 3.372e-05 [parameter_eliminate]: 2.96001e-06 [updatestate_depend_eliminate]: 5.52999e-06 [updatestate_assign_eliminate]: 2.92002e-06 [updatestate_loads_eliminate]: 3.08e-06 [cse]: 2.375e-05 [renormalize]: 3.19997e-07 [remove_dup_value]: 1.689e-05 [tuple_transform]: 7.938e-05, [1] [Cycle 1]: 7.487e-05, [4] [d_1]: 4.745e-05 [none_parameter_eliminate]: 1.83002e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 7.65e-06 [partial_unused_args_eliminate]: 2.17999e-06 [add_recomputation]: 5.495e-05 [cse_after_recomputation]: 2.773e-05, [1] [Cycle 1]: 2.307e-05, [1] [cse]: 1.673e-05 [environ_conv]: 6.46e-06 [swap_dp_allreduce_reducescatter]: 5.86e-06 [bias_add_comm_swap]: 2.29999e-06 [label_micro_interleaved_index]: 4.18999e-06 [label_fine_grained_interleaved_index]: 2.63e-06 [merge_cast_opt]: 1.23002e-06 [slice_recompute_activation]: 1.89999e-06 [micro_interleaved_order_control]: 2.22999e-06 [assign_add_opt]: 1.34e-06 [ForceFp32Comm]: 7.59988e-07 [remove_cast_before_assign_add]: 9.70002e-07 [full_micro_interleaved_order_control]: 2.11998e-06 [reorder_send_recv_between_fp_bp]: 2.82002e-06 [comm_op_add_attrs]: 9.50007e-07 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.14998e-06 [interleave_parallel_branches]: 1.37999e-06 [overlap_opt_shard_in_pipeline]: 1.13001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.94999e-06 [control_data_broadcast_order]: 1.535e-05 [grouped_pairwise_exchange_alltoall]: 1.51998e-06 [offloading_packed_experts]: 4.23999e-06 [overlap_recompute_and_grad_model_parallel]: 5.21002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.50999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30001e-06 [overlap_recompute_comm]: 2.19001e-06 [overlap_grad_ring_attention]: 4.37998e-06 [overlap_grad_flash_sp]: 2.05e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.19001e-06 [split_layernorm_comm]: 1.59e-06 [handle_group_info]: 1.33002e-06 [symbol_engine_optimizer]: 8.2e-05, [1] [Cycle 1]: 7.769e-05, [6] [build]: 2.91999e-06 [elim_shapecalc]: 1.139e-05 [elim_not_effective]: 1.488e-05 [opt_reshape]: 7.91001e-06 [fold_const_symbol]: 1.201e-05 [renormalize]: 2.50002e-07 [detach_backward]: 1.86e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 1.98e-05 [get_jit_bprop_graph]: 1.31998e-06 [rewriter_after_jit_bprop_graph]: 4e-06 [opt_after_jit_grad]: 0.00048782 [validate]: 3.987e-05 Sums bootstrap : 0.000437s : 4.20% type_inference : 0.005615s : 54.02% event_method : 0.000013s : 0.12% auto_monad : 0.000057s : 0.55% graph_reusing : 0.000005s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.03% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000027s : 0.26% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000019s : 0.18% optimize.rewriter_before_opt_a : 0.000049s : 0.47% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000036s : 0.35% optimize.opt_a.loop_unroll : 0.000022s : 0.21% optimize.opt_a.a_1 : 0.000502s : 4.83% optimize.opt_a.with_stream_mark : 0.000031s : 0.30% optimize.opt_a.recompute_prepare : 0.000017s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000182s : 1.75% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.15% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000014s : 0.14% optimize.opt_a.merge_send_recv : 0.000016s : 0.15% optimize.opt_a.auto_parallel : 0.000014s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.22% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000044s : 0.43% optimize.opt_a.virtual_dataset : 0.000014s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000013s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.07% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000020s : 0.20% optimize.opt_a.a_after_grad : 0.000022s : 0.21% optimize.opt_a.renormalize : 0.000557s : 5.36% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.24% optimize.opt_a.cse : 0.000057s : 0.55% optimize.opt_a.a_3 : 0.000098s : 0.94% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000042s : 0.40% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000498s : 4.80% optimize.opt_b.b_1 : 0.000180s : 1.73% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000025s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000026s : 0.25% optimize.loop_unroll : 0.000430s : 4.14% optimize.opt_after_cconv.c_1 : 0.000034s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.16% optimize.tuple_transform.d_1 : 0.000047s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000055s : 0.53% optimize.cse_after_recomputation.cse : 0.000017s : 0.16% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000020s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.19% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000488s : 4.69% validate : 0.000040s : 0.38% Time group info: ------[substitution.] 0.000175 29 11.28% : 0.000020s : 2: substitution.cast_eliminate 1.26% : 0.000002s : 3: substitution.elim_not_effective 0.96% : 0.000002s : 3: substitution.fold_const_symbol 3.71% : 0.000007s : 4: substitution.graph_param_transform 57.57% : 0.000101s : 2: substitution.inline 2.71% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.44% : 0.000006s : 6: substitution.remove_not_recompute_node 1.98% : 0.000003s : 2: substitution.replace_old_param 17.09% : 0.000030s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005568 2 91.35% : 0.005087s : 1: type_inference.infer 8.65% : 0.000481s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000099 2 100.00% : 0.000099s : 2: match.inline ------[predicate.] 0.000168 980 0.87% : 0.000001s : 9: predicate.accumulaten_eliminater 0.98% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.70% : 0.000001s : 8: predicate.addn_check_dump 0.97% : 0.000002s : 9: predicate.addn_zero_filter 0.73% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.50% : 0.000004s : 17: predicate.arithmetic_simplify 0.89% : 0.000002s : 9: predicate.cast_eliminate 0.71% : 0.000001s : 8: predicate.check_bprop_eliminate 0.68% : 0.000001s : 8: predicate.compare_switch_simplify 0.25% : 0.000000s : 4: predicate.const_output_eliminate 0.73% : 0.000001s : 8: predicate.depend_value_elim 0.81% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.91% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.75% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.40% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 4: predicate.elim_not_effective 0.58% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.07% : 0.000002s : 13: predicate.environ_get_depend_swap 1.89% : 0.000003s : 21: predicate.environ_get_eliminate 1.06% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.94% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.84% : 0.000003s : 11: predicate.float_depend_g_call 0.66% : 0.000001s : 8: predicate.float_environ_get_switch 0.98% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 4: predicate.fold_const_symbol 0.85% : 0.000001s : 8: predicate.get_grad_eliminate 0.27% : 0.000000s : 4: predicate.graph_param_transform 0.82% : 0.000001s : 8: predicate.incorporate_call 0.66% : 0.000001s : 8: predicate.incorporate_call_switch 6.34% : 0.000011s : 44: predicate.inline 1.02% : 0.000002s : 8: predicate.inline_without_move 0.38% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.00% : 0.000002s : 8: predicate.less_batch_normalization 1.56% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.08% : 0.000003s : 26: predicate.load_eliminater 1.21% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.56% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.84% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.73% : 0.000001s : 8: predicate.merge_addn 0.71% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.73% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.72% : 0.000001s : 9: predicate.minmaximum_grad 1.55% : 0.000003s : 4: predicate.mutable_eliminate 0.47% : 0.000001s : 4: predicate.opt_reshape 0.46% : 0.000001s : 4: predicate.parallel_virtual_node 1.14% : 0.000002s : 11: predicate.partial_defer_inline 1.25% : 0.000002s : 13: predicate.partial_eliminate 0.81% : 0.000001s : 9: predicate.print_const_string_wrapper 0.76% : 0.000001s : 8: predicate.reduce_all_const_elim 1.08% : 0.000002s : 9: predicate.reduce_eliminate 2.13% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.54% : 0.000001s : 8: predicate.remove_not_recompute_node 1.22% : 0.000002s : 17: predicate.replace_applicator 0.71% : 0.000001s : 8: predicate.replace_old_param 0.35% : 0.000001s : 4: predicate.reset_defer_inline 0.87% : 0.000001s : 9: predicate.reshape_eliminate 0.76% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 4: predicate.row_tensor_eliminate 0.98% : 0.000002s : 8: predicate.same_eliminate 0.53% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.26% : 0.000002s : 8: predicate.shard_identity_eliminate 0.87% : 0.000001s : 8: predicate.special_op_eliminate 1.00% : 0.000002s : 8: predicate.specialize_transform 1.27% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.96% : 0.000002s : 11: predicate.switch_defer_inline 1.68% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.18% : 0.000007s : 39: predicate.switch_simplify 0.79% : 0.000001s : 9: predicate.tile_eliminate 0.79% : 0.000001s : 9: predicate.transpose_eliminate 1.60% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.73% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.17% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.56% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.43% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.50% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.08% : 0.000003s : 26: predicate.updatestate_pure_node_eliminater 2.96% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.64% : 0.000001s : 4: predicate.value_based_eliminate 0.84% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.81% : 0.000001s : 8: predicate.virtual_output_eliminate 0.36% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.57% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000226 5 7.92% : 0.000018s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.08% : 0.000208s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023912 192 0.01% : 0.000003s : 1: ForceFp32Comm 13.24% : 0.003166s : 1: add_attr 13.19% : 0.003154s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.25% : 0.000059s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000062s : 1: auto_monad 0.10% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.95% : 0.000467s : 1: bootstrap 0.12% : 0.000029s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.13% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.08% : 0.000019s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.83% : 0.000438s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.12% : 0.000508s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000018s : 1: opt.transform.mutable_eliminate 4.09% : 0.000977s : 78: opt.transform.opt_a 0.14% : 0.000032s : 1: opt.transform.opt_after_cconv 0.12% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.66% : 0.000157s : 28: opt.transform.opt_b 0.22% : 0.000053s : 2: opt.transform.opt_trans_graph 0.18% : 0.000042s : 4: opt.transform.symbol_engine_opt 10.05% : 0.002404s : 1: opt_a 0.49% : 0.000116s : 1: opt_after_cconv 2.08% : 0.000497s : 1: opt_after_jit_grad 1.16% : 0.000278s : 1: opt_b 18.77% : 0.004487s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000024s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000032s : 1: pre_auto_parallel 0.09% : 0.000023s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000021s : 1: remove_dup_value 1.31% : 0.000314s : 1: renormalize.infer 0.98% : 0.000235s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000046s : 1: rewriter_after_opt_a 0.22% : 0.000053s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000085s : 1: symbol_engine_optimizer 0.34% : 0.000082s : 1: tuple_transform 23.55% : 0.005632s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:00.245.039 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:00.245.317 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0150206, [21] [bootstrap]: 0.00043359 [type_inference]: 0.0050401 [event_method]: 1.311e-05 [auto_monad]: 5.857e-05 [graph_reusing]: 5.07e-06 [inline]: 2.27999e-06 [add_attr]: 0.00310944, [1] [add_attr_with_inline]: 0.00310092, [1] [Cycle 1]: 7.176e-05, [2] [tag_attr]: 1.574e-05 [meta_addattr_fg_expand]: 4.03999e-06 [parallel-infer-symbol]: 3.19001e-06 [pre_auto_parallel]: 2.564e-05 [insert-virtual-dataset]: 2.64999e-06 [parallel-infer-symbol-second]: 7.10017e-07 [dataset_repeat_opt]: 2.12999e-06 [pipeline_split]: 1.55001e-06 [optimize]: 0.00519128, [53] [py_interpret_to_execute]: 2.108e-05 [rewriter_before_opt_a]: 5.214e-05 [opt_a]: 0.00282143, [2] [Cycle 1]: 0.00191114, [45] [expand_dump_flag]: 2.56998e-06 [switch_simplify]: 2.668e-05 [loop_unroll]: 1.515e-05 [a_1]: 0.00038482 [with_stream_mark]: 1.599e-05 [recompute_prepare]: 9.92999e-06 [updatestate_depend_eliminate]: 4.47998e-06 [updatestate_assign_eliminate]: 3.98999e-06 [updatestate_loads_eliminate]: 4.48001e-06 [parameter_eliminate]: 2.37001e-06 [a_2]: 0.00012678 [accelerated_algorithm]: 8.64998e-06 [shard]: 1.96e-06 [meta_shard_fg_expand]: 1.97999e-06 [shard_inline]: 8.05e-06 [merge_send_recv]: 9.45001e-06 [auto_parallel]: 7.53999e-06 [parallel]: 1.908e-05 [flash_sp]: 8.69e-06 [merge_comm]: 4.50001e-06 [allreduce_fusion]: 4.67998e-06 [matmul_add_comm_reduction]: 1.089e-05 [allreduce_slice_to_reducescatter]: 9.09989e-07 [virtual_shard_identity]: 9.15999e-06 [virtual_dataset]: 8.46002e-06 [get_grad_eliminate_]: 7.18998e-06 [virtual_output]: 7.16999e-06 [merge_forward]: 4.39998e-06 [cell_reuse_recompute_pass]: 1.22e-06 [offload_activation]: 1.18e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.78e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 1.262e-05 [set_forward_comm_id_for_comm_node_pass]: 4.90999e-06 [meta_fg_expand]: 3.33e-06 [flash_sp_send_recv_attached]: 2.88e-06 [receive_attached]: 2.39001e-06 [after_resolve]: 1.276e-05 [a_after_grad]: 1.14e-05 [renormalize]: 0.00059139 [add_forward_monad_depend]: 5.52001e-06 [auto_monad_grad]: 2.19001e-06 [auto_monad_eliminator]: 1.732e-05 [cse]: 3.562e-05 [a_3]: 7.097e-05 [Cycle 2]: 0.00089708, [45] [expand_dump_flag]: 1.22999e-06 [switch_simplify]: 8.43999e-06 [loop_unroll]: 6.99001e-06 [a_1]: 0.00015458 [with_stream_mark]: 1.15e-05 [recompute_prepare]: 7.53999e-06 [updatestate_depend_eliminate]: 4.57e-06 [updatestate_assign_eliminate]: 3.16001e-06 [updatestate_loads_eliminate]: 3.16001e-06 [parameter_eliminate]: 1.22e-06 [a_2]: 0.00011514 [accelerated_algorithm]: 7.41001e-06 [shard]: 1.27e-06 [meta_shard_fg_expand]: 1.69998e-06 [shard_inline]: 7.38e-06 [merge_send_recv]: 5.82001e-06 [auto_parallel]: 6.36998e-06 [parallel]: 5.87999e-06 [flash_sp]: 3.55e-06 [merge_comm]: 4.2e-06 [allreduce_fusion]: 4.11001e-06 [matmul_add_comm_reduction]: 1.06e-05 [allreduce_slice_to_reducescatter]: 4.60015e-07 [virtual_shard_identity]: 8.50999e-06 [virtual_dataset]: 6.88e-06 [get_grad_eliminate_]: 6.51999e-06 [virtual_output]: 6.39001e-06 [merge_forward]: 4.10998e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 7.82e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.614e-05 [merge_recompute_call_nodes]: 8.99978e-07 [before_grad]: 1.133e-05 [set_forward_comm_id_for_comm_node_pass]: 4.90999e-06 [meta_fg_expand]: 2.79999e-06 [flash_sp_send_recv_attached]: 1.04e-06 [receive_attached]: 1.29003e-06 [after_resolve]: 1.005e-05 [a_after_grad]: 1.024e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.34e-06 [auto_monad_grad]: 1.09003e-06 [auto_monad_eliminator]: 8.65001e-06 [cse]: 2.022e-05 [a_3]: 5.585e-05 [py_interpret_to_execute_after_opt_a]: 1.375e-05 [slice_cell_reuse_recomputed_activation]: 4.53001e-06 [rewriter_after_opt_a]: 4.675e-05 [convert_after_rewriter]: 1.028e-05 [order_py_execute_after_rewriter]: 8.89e-06 [mutable_eliminate]: 0.00050624 [opt_b]: 0.00035077, [1] [Cycle 1]: 0.00034079, [7] [b_1]: 0.00023519 [b_2]: 9.51003e-06 [updatestate_depend_eliminate]: 6.53e-06 [updatestate_assign_eliminate]: 3.3e-06 [updatestate_loads_eliminate]: 3.15002e-06 [renormalize]: 4.89992e-07 [cse]: 2.378e-05 [optimize_parallel_all_gather_comm]: 1.993e-05 [overlap_param_gather]: 4.67e-06 [cconv]: 2.804e-05 [loop_unroll]: 0.00046091 [opt_after_cconv]: 0.00013778, [1] [Cycle 1]: 0.00012942, [7] [c_1]: 3.427e-05 [parameter_eliminate]: 2.68998e-06 [updatestate_depend_eliminate]: 5.99e-06 [updatestate_assign_eliminate]: 3.41001e-06 [updatestate_loads_eliminate]: 3.30998e-06 [cse]: 2.256e-05 [renormalize]: 6.00005e-07 [remove_dup_value]: 1.915e-05 [tuple_transform]: 9.446e-05, [1] [Cycle 1]: 8.745e-05, [4] [d_1]: 4.788e-05 [none_parameter_eliminate]: 1.54e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 8.03001e-06 [partial_unused_args_eliminate]: 4.78001e-06 [add_recomputation]: 5.639e-05 [cse_after_recomputation]: 3.173e-05, [1] [Cycle 1]: 2.477e-05, [1] [cse]: 1.562e-05 [environ_conv]: 8.56002e-06 [swap_dp_allreduce_reducescatter]: 8.64e-06 [bias_add_comm_swap]: 5.03002e-06 [label_micro_interleaved_index]: 6.36e-06 [label_fine_grained_interleaved_index]: 5.52999e-06 [merge_cast_opt]: 4.07998e-06 [slice_recompute_activation]: 4.48999e-06 [micro_interleaved_order_control]: 4.47e-06 [assign_add_opt]: 3.41999e-06 [ForceFp32Comm]: 3.01999e-06 [remove_cast_before_assign_add]: 3.46001e-06 [full_micro_interleaved_order_control]: 4.34002e-06 [reorder_send_recv_between_fp_bp]: 5.00999e-06 [comm_op_add_attrs]: 3.85e-06 [add_comm_op_reuse_tag]: 3.21001e-06 [interleave_split_concat_branches]: 3.64002e-06 [interleave_parallel_branches]: 3.70998e-06 [overlap_opt_shard_in_pipeline]: 3.91999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.55001e-06 [control_data_broadcast_order]: 1.692e-05 [grouped_pairwise_exchange_alltoall]: 3.9e-06 [offloading_packed_experts]: 6.93e-06 [overlap_recompute_and_grad_model_parallel]: 7.41001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.72998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.7e-06 [overlap_recompute_comm]: 4.77998e-06 [overlap_grad_ring_attention]: 7.14001e-06 [overlap_grad_flash_sp]: 2.428e-05 [begin_end_overlap_inline]: 3.09999e-06 [split_matmul_comm_elemetwise]: 4.48001e-06 [split_layernorm_comm]: 4.22e-06 [handle_group_info]: 3.56999e-06 [symbol_engine_optimizer]: 0.00010422, [1] [Cycle 1]: 9.716e-05, [6] [build]: 3.95e-06 [elim_shapecalc]: 1.099e-05 [elim_not_effective]: 1.474e-05 [opt_reshape]: 8.43001e-06 [fold_const_symbol]: 1.219e-05 [renormalize]: 2.50002e-07 [detach_backward]: 3.23e-06 [pipeline_parallel_scheduler]: 1.79e-06 [auto_monad_reorder]: 2.128e-05 [get_jit_bprop_graph]: 1.48002e-06 [rewriter_after_jit_bprop_graph]: 4.98001e-06 [opt_after_jit_grad]: 0.00048807 [validate]: 3.967e-05 Sums bootstrap : 0.000434s : 4.26% type_inference : 0.005040s : 49.52% event_method : 0.000013s : 0.13% auto_monad : 0.000059s : 0.58% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000026s : 0.25% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000021s : 0.21% optimize.rewriter_before_opt_a : 0.000052s : 0.51% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000035s : 0.35% optimize.opt_a.loop_unroll : 0.000022s : 0.22% optimize.opt_a.a_1 : 0.000539s : 5.30% optimize.opt_a.with_stream_mark : 0.000027s : 0.27% optimize.opt_a.recompute_prepare : 0.000017s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.08% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000242s : 2.38% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.16% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.15% optimize.opt_a.merge_send_recv : 0.000015s : 0.15% optimize.opt_a.auto_parallel : 0.000014s : 0.14% optimize.opt_a.parallel : 0.000025s : 0.25% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.21% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.17% optimize.opt_a.virtual_dataset : 0.000015s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.10% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000023s : 0.22% optimize.opt_a.a_after_grad : 0.000022s : 0.21% optimize.opt_a.renormalize : 0.000591s : 5.81% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.26% optimize.opt_a.cse : 0.000056s : 0.55% optimize.opt_a.a_3 : 0.000127s : 1.25% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000047s : 0.46% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000506s : 4.97% optimize.opt_b.b_1 : 0.000235s : 2.31% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000028s : 0.28% optimize.loop_unroll : 0.000461s : 4.53% optimize.opt_after_cconv.c_1 : 0.000034s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.22% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000019s : 0.19% optimize.tuple_transform.d_1 : 0.000048s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000056s : 0.55% optimize.cse_after_recomputation.cse : 0.000016s : 0.15% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000006s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000004s : 0.04% optimize.assign_add_opt : 0.000003s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000024s : 0.24% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000021s : 0.21% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000488s : 4.80% validate : 0.000040s : 0.39% Time group info: ------[substitution.] 0.000213 29 9.92% : 0.000021s : 2: substitution.cast_eliminate 0.97% : 0.000002s : 3: substitution.elim_not_effective 0.76% : 0.000002s : 3: substitution.fold_const_symbol 3.07% : 0.000007s : 4: substitution.graph_param_transform 62.95% : 0.000134s : 2: substitution.inline 1.98% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.83% : 0.000006s : 6: substitution.remove_not_recompute_node 1.79% : 0.000004s : 2: substitution.replace_old_param 15.73% : 0.000033s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004994 2 90.58% : 0.004523s : 1: type_inference.infer 9.42% : 0.000471s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000132 2 100.00% : 0.000132s : 2: match.inline ------[predicate.] 0.000170 980 0.79% : 0.000001s : 9: predicate.accumulaten_eliminater 0.99% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.71% : 0.000001s : 8: predicate.addn_check_dump 0.84% : 0.000001s : 9: predicate.addn_zero_filter 0.73% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.18% : 0.000004s : 17: predicate.arithmetic_simplify 0.92% : 0.000002s : 9: predicate.cast_eliminate 0.81% : 0.000001s : 8: predicate.check_bprop_eliminate 0.68% : 0.000001s : 8: predicate.compare_switch_simplify 0.23% : 0.000000s : 4: predicate.const_output_eliminate 0.75% : 0.000001s : 8: predicate.depend_value_elim 0.81% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 1.00% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.77% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.22% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 4: predicate.elim_not_effective 0.48% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.06% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 13: predicate.environ_get_depend_swap 1.92% : 0.000003s : 21: predicate.environ_get_eliminate 1.14% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.92% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.87% : 0.000003s : 11: predicate.float_depend_g_call 0.72% : 0.000001s : 8: predicate.float_environ_get_switch 1.15% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.24% : 0.000000s : 4: predicate.fold_const_symbol 0.84% : 0.000001s : 8: predicate.get_grad_eliminate 0.24% : 0.000000s : 4: predicate.graph_param_transform 0.85% : 0.000001s : 8: predicate.incorporate_call 0.67% : 0.000001s : 8: predicate.incorporate_call_switch 6.35% : 0.000011s : 44: predicate.inline 0.93% : 0.000002s : 8: predicate.inline_without_move 0.35% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.21% : 0.000002s : 8: predicate.less_batch_normalization 1.61% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.20% : 0.000004s : 26: predicate.load_eliminater 1.21% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.56% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.82% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.77% : 0.000001s : 8: predicate.merge_addn 0.70% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.77% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.71% : 0.000001s : 9: predicate.minmaximum_grad 1.18% : 0.000002s : 4: predicate.mutable_eliminate 0.48% : 0.000001s : 4: predicate.opt_reshape 0.45% : 0.000001s : 4: predicate.parallel_virtual_node 1.16% : 0.000002s : 11: predicate.partial_defer_inline 1.37% : 0.000002s : 13: predicate.partial_eliminate 0.77% : 0.000001s : 9: predicate.print_const_string_wrapper 0.90% : 0.000002s : 8: predicate.reduce_all_const_elim 1.00% : 0.000002s : 9: predicate.reduce_eliminate 2.22% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 8: predicate.remove_not_recompute_node 1.23% : 0.000002s : 17: predicate.replace_applicator 0.90% : 0.000002s : 8: predicate.replace_old_param 0.34% : 0.000001s : 4: predicate.reset_defer_inline 0.98% : 0.000002s : 9: predicate.reshape_eliminate 0.75% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 4: predicate.row_tensor_eliminate 0.97% : 0.000002s : 8: predicate.same_eliminate 0.55% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.11% : 0.000002s : 8: predicate.shard_identity_eliminate 0.89% : 0.000002s : 8: predicate.special_op_eliminate 0.95% : 0.000002s : 8: predicate.specialize_transform 1.30% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.96% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.94% : 0.000002s : 11: predicate.switch_defer_inline 1.79% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.12% : 0.000007s : 39: predicate.switch_simplify 0.78% : 0.000001s : 9: predicate.tile_eliminate 0.80% : 0.000001s : 9: predicate.transpose_eliminate 1.52% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.44% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.63% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.66% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.02% : 0.000003s : 26: predicate.updatestate_pure_node_eliminater 2.94% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.61% : 0.000001s : 4: predicate.value_based_eliminate 0.80% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.76% : 0.000001s : 8: predicate.virtual_output_eliminate 0.36% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.55% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000223 5 7.76% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.24% : 0.000206s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025139 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.41% : 0.003119s : 1: add_attr 12.35% : 0.003105s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.24% : 0.000060s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000067s : 1: auto_monad 0.11% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.90% : 0.000477s : 1: bootstrap 0.12% : 0.000031s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.14% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000018s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.09% : 0.000023s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000011s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000009s : 1: label_micro_interleaved_index 1.86% : 0.000467s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.04% : 0.000512s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 3.98% : 0.001000s : 78: opt.transform.opt_a 0.13% : 0.000033s : 1: opt.transform.opt_after_cconv 0.12% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.65% : 0.000164s : 28: opt.transform.opt_b 0.21% : 0.000053s : 2: opt.transform.opt_trans_graph 0.17% : 0.000043s : 4: opt.transform.symbol_engine_opt 11.24% : 0.002825s : 1: opt_a 0.56% : 0.000141s : 1: opt_after_cconv 1.98% : 0.000499s : 1: opt_after_jit_grad 1.41% : 0.000354s : 1: opt_b 22.01% : 0.005534s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000027s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000034s : 1: pre_auto_parallel 0.10% : 0.000024s : 1: py_interpret_to_execute 0.07% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000022s : 1: remove_dup_value 1.34% : 0.000336s : 1: renormalize.infer 0.98% : 0.000247s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000051s : 1: rewriter_after_opt_a 0.22% : 0.000056s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.43% : 0.000107s : 1: symbol_engine_optimizer 0.39% : 0.000097s : 1: tuple_transform 20.15% : 0.005066s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:00.436.425 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0134837, [21] [bootstrap]: 0.00043312 [type_inference]: 0.0049149 [event_method]: 1.199e-05 [auto_monad]: 5.506e-05 [graph_reusing]: 5.05999e-06 [inline]: 2.27999e-06 [add_attr]: 0.00302046, [1] [add_attr_with_inline]: 0.00301269, [1] [Cycle 1]: 5.321e-05, [2] [tag_attr]: 1.379e-05 [meta_addattr_fg_expand]: 3.53e-06 [parallel-infer-symbol]: 2.73e-06 [pre_auto_parallel]: 2.426e-05 [insert-virtual-dataset]: 2.33002e-06 [parallel-infer-symbol-second]: 6.90023e-07 [dataset_repeat_opt]: 1.86998e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.00431276, [53] [py_interpret_to_execute]: 1.804e-05 [rewriter_before_opt_a]: 4.736e-05 [opt_a]: 0.0022966, [2] [Cycle 1]: 0.00157771, [45] [expand_dump_flag]: 2.89999e-06 [switch_simplify]: 2.584e-05 [loop_unroll]: 1.469e-05 [a_1]: 0.00035706 [with_stream_mark]: 1.556e-05 [recompute_prepare]: 9.31e-06 [updatestate_depend_eliminate]: 4.15999e-06 [updatestate_assign_eliminate]: 3.55998e-06 [updatestate_loads_eliminate]: 3.43999e-06 [parameter_eliminate]: 1.82999e-06 [a_2]: 9.563e-05 [accelerated_algorithm]: 7.65e-06 [shard]: 2.17999e-06 [meta_shard_fg_expand]: 2.15002e-06 [shard_inline]: 7.48e-06 [merge_send_recv]: 8.80001e-06 [auto_parallel]: 6.76e-06 [parallel]: 1.906e-05 [flash_sp]: 7.77e-06 [merge_comm]: 4.82e-06 [allreduce_fusion]: 4.25e-06 [matmul_add_comm_reduction]: 1.074e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 8.57e-06 [virtual_dataset]: 7.33e-06 [get_grad_eliminate_]: 6.75002e-06 [virtual_output]: 6.98e-06 [merge_forward]: 5.24998e-06 [cell_reuse_recompute_pass]: 1.41002e-06 [offload_activation]: 1.061e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.607e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.267e-05 [set_forward_comm_id_for_comm_node_pass]: 4.37998e-06 [meta_fg_expand]: 3.18e-06 [flash_sp_send_recv_attached]: 2.39001e-06 [receive_attached]: 2.36e-06 [after_resolve]: 1.146e-05 [a_after_grad]: 1.098e-05 [renormalize]: 0.00049567 [add_forward_monad_depend]: 4.96002e-06 [auto_monad_grad]: 1.74998e-06 [auto_monad_eliminator]: 1.545e-05 [cse]: 3.676e-05 [a_3]: 5.413e-05 [Cycle 2]: 0.00070965, [45] [expand_dump_flag]: 1.05999e-06 [switch_simplify]: 8.77e-06 [loop_unroll]: 6.99001e-06 [a_1]: 0.00015153 [with_stream_mark]: 1.154e-05 [recompute_prepare]: 7.34002e-06 [updatestate_depend_eliminate]: 4.02e-06 [updatestate_assign_eliminate]: 2.92002e-06 [updatestate_loads_eliminate]: 2.68e-06 [parameter_eliminate]: 1.10001e-06 [a_2]: 8.761e-05 [accelerated_algorithm]: 7.15003e-06 [shard]: 1.09e-06 [meta_shard_fg_expand]: 1.50999e-06 [shard_inline]: 7.18998e-06 [merge_send_recv]: 5.56e-06 [auto_parallel]: 6.04001e-06 [parallel]: 4.42e-06 [flash_sp]: 3.21999e-06 [merge_comm]: 4.05e-06 [allreduce_fusion]: 3.61001e-06 [matmul_add_comm_reduction]: 6.81001e-06 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 7.87e-06 [virtual_dataset]: 6.58e-06 [get_grad_eliminate_]: 6.76e-06 [virtual_output]: 6.36998e-06 [merge_forward]: 3.19001e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 7.73999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.328e-05 [merge_recompute_call_nodes]: 7.59988e-07 [before_grad]: 1.098e-05 [set_forward_comm_id_for_comm_node_pass]: 4.50999e-06 [meta_fg_expand]: 2.71e-06 [flash_sp_send_recv_attached]: 8.70001e-07 [receive_attached]: 9.89996e-07 [after_resolve]: 9.30001e-06 [a_after_grad]: 1.061e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.20999e-06 [auto_monad_grad]: 8.30012e-07 [auto_monad_eliminator]: 8.07e-06 [cse]: 1.822e-05 [a_3]: 4.307e-05 [py_interpret_to_execute_after_opt_a]: 9.13002e-06 [slice_cell_reuse_recomputed_activation]: 2.17001e-06 [rewriter_after_opt_a]: 3.969e-05 [convert_after_rewriter]: 7.77e-06 [order_py_execute_after_rewriter]: 6.15002e-06 [mutable_eliminate]: 0.00046862 [opt_b]: 0.0002705, [1] [Cycle 1]: 0.0002647, [7] [b_1]: 0.0001825 [b_2]: 9.29e-06 [updatestate_depend_eliminate]: 5.81e-06 [updatestate_assign_eliminate]: 2.91e-06 [updatestate_loads_eliminate]: 2.78003e-06 [renormalize]: 3.99974e-07 [cse]: 2.31e-05 [optimize_parallel_all_gather_comm]: 1.652e-05 [overlap_param_gather]: 2.02001e-06 [cconv]: 2.302e-05 [loop_unroll]: 0.00042325 [opt_after_cconv]: 0.00011131, [1] [Cycle 1]: 0.00010541, [7] [c_1]: 3.373e-05 [parameter_eliminate]: 2.45002e-06 [updatestate_depend_eliminate]: 5.72999e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 2.78998e-06 [cse]: 2.202e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.585e-05 [tuple_transform]: 7.871e-05, [1] [Cycle 1]: 7.443e-05, [4] [d_1]: 4.578e-05 [none_parameter_eliminate]: 1.66e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 7.73999e-06 [partial_unused_args_eliminate]: 1.97999e-06 [add_recomputation]: 5.222e-05 [cse_after_recomputation]: 2.589e-05, [1] [Cycle 1]: 2.129e-05, [1] [cse]: 1.575e-05 [environ_conv]: 5.92001e-06 [swap_dp_allreduce_reducescatter]: 5.85002e-06 [bias_add_comm_swap]: 2.69001e-06 [label_micro_interleaved_index]: 4.40999e-06 [label_fine_grained_interleaved_index]: 2.69999e-06 [merge_cast_opt]: 1.32e-06 [slice_recompute_activation]: 2.27001e-06 [micro_interleaved_order_control]: 2.40002e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 7.80012e-07 [remove_cast_before_assign_add]: 9.70002e-07 [full_micro_interleaved_order_control]: 2.31e-06 [reorder_send_recv_between_fp_bp]: 2.58e-06 [comm_op_add_attrs]: 1.25001e-06 [add_comm_op_reuse_tag]: 9.00007e-07 [interleave_split_concat_branches]: 1.31998e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.20001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.05002e-06 [control_data_broadcast_order]: 1.425e-05 [grouped_pairwise_exchange_alltoall]: 1.54e-06 [offloading_packed_experts]: 4.37e-06 [overlap_recompute_and_grad_model_parallel]: 4.97999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.32001e-06 [overlap_grad_ring_attention]: 4.4e-06 [overlap_grad_flash_sp]: 2.004e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.36e-06 [split_layernorm_comm]: 1.62001e-06 [handle_group_info]: 8.59989e-07 [symbol_engine_optimizer]: 8.248e-05, [1] [Cycle 1]: 7.804e-05, [6] [build]: 2.83e-06 [elim_shapecalc]: 1.129e-05 [elim_not_effective]: 1.49e-05 [opt_reshape]: 8.02998e-06 [fold_const_symbol]: 1.224e-05 [renormalize]: 1.69995e-07 [detach_backward]: 2.02999e-06 [pipeline_parallel_scheduler]: 1.50999e-06 [auto_monad_reorder]: 1.914e-05 [get_jit_bprop_graph]: 1.27999e-06 [rewriter_after_jit_bprop_graph]: 3.46001e-06 [opt_after_jit_grad]: 0.0004862 [validate]: 3.798e-05 Sums bootstrap : 0.000433s : 4.55% type_inference : 0.004915s : 51.64% event_method : 0.000012s : 0.13% auto_monad : 0.000055s : 0.58% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.14% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000024s : 0.25% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000018s : 0.19% optimize.rewriter_before_opt_a : 0.000047s : 0.50% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000035s : 0.36% optimize.opt_a.loop_unroll : 0.000022s : 0.23% optimize.opt_a.a_1 : 0.000509s : 5.34% optimize.opt_a.with_stream_mark : 0.000027s : 0.28% optimize.opt_a.recompute_prepare : 0.000017s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000183s : 1.93% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.16% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.15% optimize.opt_a.merge_send_recv : 0.000014s : 0.15% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.25% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.17% optimize.opt_a.virtual_dataset : 0.000014s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.14% optimize.opt_a.virtual_output : 0.000013s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.09% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.04% optimize.opt_a.after_resolve : 0.000021s : 0.22% optimize.opt_a.a_after_grad : 0.000022s : 0.23% optimize.opt_a.renormalize : 0.000496s : 5.21% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.25% optimize.opt_a.cse : 0.000055s : 0.58% optimize.opt_a.a_3 : 0.000097s : 1.02% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.42% optimize.convert_after_rewriter : 0.000008s : 0.08% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000469s : 4.92% optimize.opt_b.b_1 : 0.000182s : 1.92% optimize.opt_b.b_2 : 0.000009s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000023s : 0.24% optimize.loop_unroll : 0.000423s : 4.45% optimize.opt_after_cconv.c_1 : 0.000034s : 0.35% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.17% optimize.tuple_transform.d_1 : 0.000046s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000052s : 0.55% optimize.cse_after_recomputation.cse : 0.000016s : 0.17% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.03% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.05% optimize.overlap_grad_flash_sp : 0.000020s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.16% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000019s : 0.20% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.04% opt_after_jit_grad : 0.000486s : 5.11% validate : 0.000038s : 0.40% Time group info: ------[substitution.] 0.000163 29 11.22% : 0.000018s : 2: substitution.cast_eliminate 1.32% : 0.000002s : 3: substitution.elim_not_effective 1.13% : 0.000002s : 3: substitution.fold_const_symbol 3.65% : 0.000006s : 4: substitution.graph_param_transform 56.07% : 0.000092s : 2: substitution.inline 2.34% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.83% : 0.000006s : 6: substitution.remove_not_recompute_node 1.73% : 0.000003s : 2: substitution.replace_old_param 18.69% : 0.000031s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004871 2 90.49% : 0.004408s : 1: type_inference.infer 9.51% : 0.000463s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000090 2 100.00% : 0.000090s : 2: match.inline ------[predicate.] 0.000183 980 0.79% : 0.000001s : 9: predicate.accumulaten_eliminater 0.82% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.62% : 0.000001s : 8: predicate.addn_check_dump 0.74% : 0.000001s : 9: predicate.addn_zero_filter 0.65% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.23% : 0.000004s : 17: predicate.arithmetic_simplify 11.06% : 0.000020s : 9: predicate.cast_eliminate 0.68% : 0.000001s : 8: predicate.check_bprop_eliminate 0.67% : 0.000001s : 8: predicate.compare_switch_simplify 0.24% : 0.000000s : 4: predicate.const_output_eliminate 0.92% : 0.000002s : 8: predicate.depend_value_elim 0.70% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 1.01% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.70% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.14% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.31% : 0.000001s : 4: predicate.elim_not_effective 0.47% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.01% : 0.000002s : 13: predicate.environ_add_const_eliminate 0.97% : 0.000002s : 13: predicate.environ_get_add_eliminate 0.92% : 0.000002s : 13: predicate.environ_get_depend_swap 1.64% : 0.000003s : 21: predicate.environ_get_eliminate 0.94% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.83% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.59% : 0.000003s : 11: predicate.float_depend_g_call 0.63% : 0.000001s : 8: predicate.float_environ_get_switch 0.90% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 4: predicate.fold_const_symbol 0.74% : 0.000001s : 8: predicate.get_grad_eliminate 0.25% : 0.000000s : 4: predicate.graph_param_transform 0.83% : 0.000002s : 8: predicate.incorporate_call 0.61% : 0.000001s : 8: predicate.incorporate_call_switch 5.89% : 0.000011s : 44: predicate.inline 0.90% : 0.000002s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.96% : 0.000002s : 8: predicate.less_batch_normalization 1.45% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.07% : 0.000004s : 26: predicate.load_eliminater 1.00% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.31% : 0.000002s : 16: predicate.loop_unroll_before_grad 1.54% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.68% : 0.000001s : 8: predicate.merge_addn 0.60% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.64% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.63% : 0.000001s : 9: predicate.minmaximum_grad 1.01% : 0.000002s : 4: predicate.mutable_eliminate 0.39% : 0.000001s : 4: predicate.opt_reshape 0.39% : 0.000001s : 4: predicate.parallel_virtual_node 1.16% : 0.000002s : 11: predicate.partial_defer_inline 1.21% : 0.000002s : 13: predicate.partial_eliminate 0.75% : 0.000001s : 9: predicate.print_const_string_wrapper 0.66% : 0.000001s : 8: predicate.reduce_all_const_elim 0.99% : 0.000002s : 9: predicate.reduce_eliminate 1.93% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 8: predicate.remove_not_recompute_node 1.07% : 0.000002s : 17: predicate.replace_applicator 0.60% : 0.000001s : 8: predicate.replace_old_param 0.33% : 0.000001s : 4: predicate.reset_defer_inline 0.74% : 0.000001s : 9: predicate.reshape_eliminate 0.66% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 4: predicate.row_tensor_eliminate 0.86% : 0.000002s : 8: predicate.same_eliminate 0.51% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.87% : 0.000002s : 8: predicate.shard_identity_eliminate 0.84% : 0.000002s : 8: predicate.special_op_eliminate 0.90% : 0.000002s : 8: predicate.specialize_transform 0.95% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.83% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.95% : 0.000002s : 11: predicate.switch_defer_inline 1.49% : 0.000003s : 19: predicate.switch_layer_defer_inline 3.80% : 0.000007s : 39: predicate.switch_simplify 0.68% : 0.000001s : 9: predicate.tile_eliminate 0.72% : 0.000001s : 9: predicate.transpose_eliminate 1.43% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.49% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.32% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 2.63% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.39% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.38% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.14% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.72% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.54% : 0.000001s : 4: predicate.value_based_eliminate 0.73% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.72% : 0.000001s : 8: predicate.virtual_output_eliminate 0.32% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000209 5 7.81% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.19% : 0.000193s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022489 192 0.02% : 0.000004s : 1: ForceFp32Comm 13.45% : 0.003025s : 1: add_attr 13.41% : 0.003016s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.25% : 0.000056s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000061s : 1: auto_monad 0.10% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.03% : 0.000006s : 1: bias_add_comm_swap 2.05% : 0.000462s : 1: bootstrap 0.12% : 0.000026s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000017s : 1: control_data_broadcast_order 0.05% : 0.000011s : 1: convert_after_rewriter 0.13% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.08% : 0.000017s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.91% : 0.000431s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.12% : 0.000476s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 4.24% : 0.000954s : 78: opt.transform.opt_a 0.14% : 0.000032s : 1: opt.transform.opt_after_cconv 0.12% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.71% : 0.000160s : 28: opt.transform.opt_b 0.23% : 0.000051s : 2: opt.transform.opt_trans_graph 0.19% : 0.000043s : 4: opt.transform.symbol_engine_opt 10.23% : 0.002300s : 1: opt_a 0.51% : 0.000115s : 1: opt_after_cconv 2.20% : 0.000495s : 1: opt_after_jit_grad 1.22% : 0.000274s : 1: opt_b 19.20% : 0.004317s : 1: optimize 0.09% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000028s : 1: pre_auto_parallel 0.10% : 0.000022s : 1: py_interpret_to_execute 0.06% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000019s : 1: remove_dup_value 1.22% : 0.000274s : 1: renormalize.infer 0.95% : 0.000215s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000044s : 1: rewriter_after_opt_a 0.23% : 0.000051s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000085s : 1: symbol_engine_optimizer 0.36% : 0.000081s : 1: tuple_transform 21.91% : 0.004928s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:00.624.821 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:00.625.088 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0144726, [21] [bootstrap]: 0.00042737 [type_inference]: 0.00480228 [event_method]: 1.103e-05 [auto_monad]: 5.235e-05 [graph_reusing]: 5.72999e-06 [inline]: 2.12999e-06 [add_attr]: 0.00305938, [1] [add_attr_with_inline]: 0.00305224, [1] [Cycle 1]: 6.046e-05, [2] [tag_attr]: 1.431e-05 [meta_addattr_fg_expand]: 3.73999e-06 [parallel-infer-symbol]: 2.86999e-06 [pre_auto_parallel]: 2.354e-05 [insert-virtual-dataset]: 2.43e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 1.83002e-06 [pipeline_split]: 1.57999e-06 [optimize]: 0.00490265, [53] [py_interpret_to_execute]: 2.085e-05 [rewriter_before_opt_a]: 4.821e-05 [opt_a]: 0.002499, [2] [Cycle 1]: 0.00165833, [45] [expand_dump_flag]: 2.93998e-06 [switch_simplify]: 2.592e-05 [loop_unroll]: 1.413e-05 [a_1]: 0.00028047 [with_stream_mark]: 1.693e-05 [recompute_prepare]: 8.62e-06 [updatestate_depend_eliminate]: 4.63999e-06 [updatestate_assign_eliminate]: 3.49001e-06 [updatestate_loads_eliminate]: 3.16001e-06 [parameter_eliminate]: 1.97001e-06 [a_2]: 0.00010873 [accelerated_algorithm]: 6.74001e-06 [shard]: 2.61e-06 [meta_shard_fg_expand]: 1.99e-06 [shard_inline]: 6.58003e-06 [merge_send_recv]: 9.13002e-06 [auto_parallel]: 7.45e-06 [parallel]: 1.903e-05 [flash_sp]: 8.18999e-06 [merge_comm]: 5.62999e-06 [allreduce_fusion]: 3.35e-06 [matmul_add_comm_reduction]: 9.53002e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 9.07001e-06 [virtual_dataset]: 6.38e-06 [get_grad_eliminate_]: 6.04001e-06 [virtual_output]: 6.52001e-06 [merge_forward]: 3.78999e-06 [cell_reuse_recompute_pass]: 1.35999e-06 [offload_activation]: 1.023e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.753e-05 [merge_recompute_call_nodes]: 1.49998e-06 [before_grad]: 1.214e-05 [set_forward_comm_id_for_comm_node_pass]: 4.32e-06 [meta_fg_expand]: 2.76e-06 [flash_sp_send_recv_attached]: 2.53003e-06 [receive_attached]: 2.49999e-06 [after_resolve]: 1.042e-05 [a_after_grad]: 9.56998e-06 [renormalize]: 0.00045352 [add_forward_monad_depend]: 4.73001e-06 [auto_monad_grad]: 2.26e-06 [auto_monad_eliminator]: 1.517e-05 [cse]: 3.062e-05 [a_3]: 6.117e-05 [Cycle 2]: 0.00082675, [45] [expand_dump_flag]: 1.15001e-06 [switch_simplify]: 7.93999e-06 [loop_unroll]: 6.11998e-06 [a_1]: 0.00010713 [with_stream_mark]: 1.201e-05 [recompute_prepare]: 6.78e-06 [updatestate_depend_eliminate]: 3.61999e-06 [updatestate_assign_eliminate]: 2.53003e-06 [updatestate_loads_eliminate]: 2.63e-06 [parameter_eliminate]: 1.08001e-06 [a_2]: 9.812e-05 [accelerated_algorithm]: 6.18998e-06 [shard]: 1.97001e-06 [meta_shard_fg_expand]: 1.60999e-06 [shard_inline]: 6.21998e-06 [merge_send_recv]: 5.74999e-06 [auto_parallel]: 6.52001e-06 [parallel]: 6.09001e-06 [flash_sp]: 3.88001e-06 [merge_comm]: 3.97e-06 [allreduce_fusion]: 3.21999e-06 [matmul_add_comm_reduction]: 6.86999e-06 [allreduce_slice_to_reducescatter]: 4.89992e-07 [virtual_shard_identity]: 8.34002e-06 [virtual_dataset]: 5.56002e-06 [get_grad_eliminate_]: 6.07999e-06 [virtual_output]: 5.59e-06 [merge_forward]: 3.24001e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 7.07002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.686e-05 [merge_recompute_call_nodes]: 1.12999e-06 [before_grad]: 9.91998e-06 [set_forward_comm_id_for_comm_node_pass]: 4.63001e-06 [meta_fg_expand]: 2.76999e-06 [flash_sp_send_recv_attached]: 1.20001e-06 [receive_attached]: 1.44e-06 [after_resolve]: 1.007e-05 [a_after_grad]: 8.29002e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.56e-06 [auto_monad_grad]: 1.37999e-06 [auto_monad_eliminator]: 1.122e-05 [cse]: 1.798e-05 [a_3]: 4.864e-05 [py_interpret_to_execute_after_opt_a]: 1.399e-05 [slice_cell_reuse_recomputed_activation]: 5.65001e-06 [rewriter_after_opt_a]: 4.189e-05 [convert_after_rewriter]: 1.044e-05 [order_py_execute_after_rewriter]: 8.74e-06 [mutable_eliminate]: 0.00055516 [opt_b]: 0.00032122, [1] [Cycle 1]: 0.00031144, [7] [b_1]: 0.00020637 [b_2]: 8.57998e-06 [updatestate_depend_eliminate]: 7.38e-06 [updatestate_assign_eliminate]: 2.61e-06 [updatestate_loads_eliminate]: 2.53e-06 [renormalize]: 8.89995e-07 [cse]: 2.22e-05 [optimize_parallel_all_gather_comm]: 2.075e-05 [overlap_param_gather]: 4.67e-06 [cconv]: 2.993e-05 [loop_unroll]: 0.00047222 [opt_after_cconv]: 0.00013479, [1] [Cycle 1]: 0.00012521, [7] [c_1]: 2.882e-05 [parameter_eliminate]: 3.65e-06 [updatestate_depend_eliminate]: 6.76e-06 [updatestate_assign_eliminate]: 2.64001e-06 [updatestate_loads_eliminate]: 2.79999e-06 [cse]: 2.186e-05 [renormalize]: 5.8001e-07 [remove_dup_value]: 1.92e-05 [tuple_transform]: 9.113e-05, [1] [Cycle 1]: 8.426e-05, [4] [d_1]: 4.342e-05 [none_parameter_eliminate]: 1.53002e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 7.33e-06 [partial_unused_args_eliminate]: 5.05999e-06 [add_recomputation]: 5.183e-05 [cse_after_recomputation]: 3.035e-05, [1] [Cycle 1]: 2.296e-05, [1] [cse]: 1.324e-05 [environ_conv]: 8.93002e-06 [swap_dp_allreduce_reducescatter]: 8.05e-06 [bias_add_comm_swap]: 4.93001e-06 [label_micro_interleaved_index]: 6.89999e-06 [label_fine_grained_interleaved_index]: 5.35999e-06 [merge_cast_opt]: 3.61001e-06 [slice_recompute_activation]: 4.72998e-06 [micro_interleaved_order_control]: 4.58999e-06 [assign_add_opt]: 3.48e-06 [ForceFp32Comm]: 3.14999e-06 [remove_cast_before_assign_add]: 3.43e-06 [full_micro_interleaved_order_control]: 4.88001e-06 [reorder_send_recv_between_fp_bp]: 5.49e-06 [comm_op_add_attrs]: 3.48e-06 [add_comm_op_reuse_tag]: 3.38999e-06 [interleave_split_concat_branches]: 3.41999e-06 [interleave_parallel_branches]: 3.71001e-06 [overlap_opt_shard_in_pipeline]: 3.71999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.79e-06 [control_data_broadcast_order]: 1.724e-05 [grouped_pairwise_exchange_alltoall]: 3.96001e-06 [offloading_packed_experts]: 6.61e-06 [overlap_recompute_and_grad_model_parallel]: 7.85e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.10998e-06 [overlap_recompute_allgather_and_fa_grad]: 4.03999e-06 [overlap_recompute_comm]: 5.27999e-06 [overlap_grad_ring_attention]: 6.74999e-06 [overlap_grad_flash_sp]: 2.45e-05 [begin_end_overlap_inline]: 3.01001e-06 [split_matmul_comm_elemetwise]: 4.62e-06 [split_layernorm_comm]: 4.2e-06 [handle_group_info]: 4.12e-06 [symbol_engine_optimizer]: 0.00010658, [1] [Cycle 1]: 9.881e-05, [6] [build]: 3.51001e-06 [elim_shapecalc]: 1.307e-05 [elim_not_effective]: 1.333e-05 [opt_reshape]: 7.23e-06 [fold_const_symbol]: 9.92999e-06 [renormalize]: 4.50003e-07 [detach_backward]: 4.45e-06 [pipeline_parallel_scheduler]: 1.92001e-06 [auto_monad_reorder]: 2.024e-05 [get_jit_bprop_graph]: 1.99999e-06 [rewriter_after_jit_bprop_graph]: 4.92e-06 [opt_after_jit_grad]: 0.00050938 [validate]: 3.928e-05 Sums bootstrap : 0.000427s : 4.47% type_inference : 0.004802s : 50.22% event_method : 0.000011s : 0.12% auto_monad : 0.000052s : 0.55% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000024s : 0.25% insert-virtual-dataset : 0.000002s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000021s : 0.22% optimize.rewriter_before_opt_a : 0.000048s : 0.50% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000034s : 0.35% optimize.opt_a.loop_unroll : 0.000020s : 0.21% optimize.opt_a.a_1 : 0.000388s : 4.05% optimize.opt_a.with_stream_mark : 0.000029s : 0.30% optimize.opt_a.recompute_prepare : 0.000015s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000207s : 2.16% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.14% optimize.opt_a.shard : 0.000005s : 0.05% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000013s : 0.13% optimize.opt_a.merge_send_recv : 0.000015s : 0.16% optimize.opt_a.auto_parallel : 0.000014s : 0.15% optimize.opt_a.parallel : 0.000025s : 0.26% optimize.opt_a.flash_sp : 0.000012s : 0.13% optimize.opt_a.merge_comm : 0.000010s : 0.10% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.18% optimize.opt_a.virtual_dataset : 0.000012s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.13% optimize.opt_a.virtual_output : 0.000012s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.36% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000022s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000020s : 0.21% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000454s : 4.74% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.28% optimize.opt_a.cse : 0.000049s : 0.51% optimize.opt_a.a_3 : 0.000110s : 1.15% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.06% optimize.rewriter_after_opt_a : 0.000042s : 0.44% optimize.convert_after_rewriter : 0.000010s : 0.11% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000555s : 5.81% optimize.opt_b.b_1 : 0.000206s : 2.16% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000022s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.22% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000030s : 0.31% optimize.loop_unroll : 0.000472s : 4.94% optimize.opt_after_cconv.c_1 : 0.000029s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.23% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000019s : 0.20% optimize.tuple_transform.d_1 : 0.000043s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000052s : 0.54% optimize.cse_after_recomputation.cse : 0.000013s : 0.14% optimize.environ_conv : 0.000009s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.06% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000003s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.06% optimize.comm_op_add_attrs : 0.000003s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.04% optimize.interleave_split_concat_branches : 0.000003s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.05% optimize.control_data_broadcast_order : 0.000017s : 0.18% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.06% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000024s : 0.26% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.14% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.05% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.21% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000509s : 5.33% validate : 0.000039s : 0.41% Time group info: ------[substitution.] 0.000140 20 1.43% : 0.000002s : 2: substitution.elim_not_effective 0.94% : 0.000001s : 2: substitution.fold_const_symbol 3.95% : 0.000006s : 3: substitution.graph_param_transform 60.98% : 0.000085s : 2: substitution.inline 3.03% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.36% : 0.000005s : 4: substitution.remove_not_recompute_node 2.63% : 0.000004s : 2: substitution.replace_old_param 23.68% : 0.000033s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004757 2 91.44% : 0.004350s : 1: type_inference.infer 8.56% : 0.000407s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000083 2 100.00% : 0.000083s : 2: match.inline ------[predicate.] 0.000135 754 0.78% : 0.000001s : 7: predicate.accumulaten_eliminater 1.00% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.69% : 0.000001s : 6: predicate.addn_check_dump 0.87% : 0.000001s : 7: predicate.addn_zero_filter 0.64% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.14% : 0.000003s : 13: predicate.arithmetic_simplify 0.69% : 0.000001s : 7: predicate.cast_eliminate 0.75% : 0.000001s : 6: predicate.check_bprop_eliminate 0.64% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.75% : 0.000001s : 6: predicate.depend_value_elim 1.04% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.94% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.78% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.24% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 3: predicate.elim_not_effective 0.49% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.04% : 0.000001s : 10: predicate.environ_get_depend_swap 1.75% : 0.000002s : 16: predicate.environ_get_eliminate 0.96% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.89% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.05% : 0.000003s : 9: predicate.float_depend_g_call 0.72% : 0.000001s : 6: predicate.float_environ_get_switch 0.95% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.81% : 0.000001s : 6: predicate.get_grad_eliminate 0.31% : 0.000000s : 3: predicate.graph_param_transform 0.80% : 0.000001s : 6: predicate.incorporate_call 0.66% : 0.000001s : 6: predicate.incorporate_call_switch 6.67% : 0.000009s : 34: predicate.inline 1.16% : 0.000002s : 6: predicate.inline_without_move 0.55% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.11% : 0.000002s : 6: predicate.less_batch_normalization 1.69% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.91% : 0.000003s : 20: predicate.load_eliminater 1.62% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.74% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.59% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.71% : 0.000001s : 6: predicate.merge_addn 0.68% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.69% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.69% : 0.000001s : 7: predicate.minmaximum_grad 2.11% : 0.000003s : 3: predicate.mutable_eliminate 0.40% : 0.000001s : 3: predicate.opt_reshape 0.69% : 0.000001s : 3: predicate.parallel_virtual_node 1.20% : 0.000002s : 9: predicate.partial_defer_inline 1.20% : 0.000002s : 10: predicate.partial_eliminate 0.89% : 0.000001s : 7: predicate.print_const_string_wrapper 0.67% : 0.000001s : 6: predicate.reduce_all_const_elim 0.92% : 0.000001s : 7: predicate.reduce_eliminate 2.00% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 6: predicate.remove_not_recompute_node 1.09% : 0.000001s : 13: predicate.replace_applicator 0.69% : 0.000001s : 6: predicate.replace_old_param 0.33% : 0.000000s : 3: predicate.reset_defer_inline 0.81% : 0.000001s : 7: predicate.reshape_eliminate 0.75% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 3: predicate.row_tensor_eliminate 1.37% : 0.000002s : 6: predicate.same_eliminate 0.85% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.23% : 0.000002s : 6: predicate.shard_identity_eliminate 0.92% : 0.000001s : 6: predicate.special_op_eliminate 0.94% : 0.000001s : 6: predicate.specialize_transform 1.26% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.96% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.05% : 0.000001s : 9: predicate.switch_defer_inline 1.63% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.34% : 0.000006s : 32: predicate.switch_simplify 0.75% : 0.000001s : 7: predicate.tile_eliminate 0.85% : 0.000001s : 7: predicate.transpose_eliminate 1.54% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.71% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 2.96% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.36% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.41% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.49% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.88% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.87% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.78% : 0.000001s : 3: predicate.value_based_eliminate 0.69% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.75% : 0.000001s : 6: predicate.virtual_output_eliminate 0.31% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.55% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000200 5 8.70% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.30% : 0.000183s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.023836 192 0.03% : 0.000006s : 1: ForceFp32Comm 12.87% : 0.003068s : 1: add_attr 12.82% : 0.003056s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.24% : 0.000056s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.25% : 0.000060s : 1: auto_monad 0.12% : 0.000028s : 1: auto_monad_reorder 0.03% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.97% : 0.000469s : 1: bootstrap 0.14% : 0.000033s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.09% : 0.000021s : 1: control_data_broadcast_order 0.06% : 0.000014s : 1: convert_after_rewriter 0.14% : 0.000033s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.09% : 0.000023s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.08% : 0.000020s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000007s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.04% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.04% : 0.000009s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 2.01% : 0.000480s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.36% : 0.000563s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.07% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000019s : 1: opt.transform.mutable_eliminate 3.23% : 0.000769s : 78: opt.transform.opt_a 0.11% : 0.000027s : 1: opt.transform.opt_after_cconv 0.11% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.57% : 0.000136s : 28: opt.transform.opt_b 0.20% : 0.000048s : 2: opt.transform.opt_trans_graph 0.16% : 0.000039s : 4: opt.transform.symbol_engine_opt 10.50% : 0.002502s : 1: opt_a 0.58% : 0.000138s : 1: opt_after_cconv 2.19% : 0.000521s : 1: opt_after_jit_grad 1.36% : 0.000325s : 1: opt_b 22.11% : 0.005271s : 1: optimize 0.10% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.12% : 0.000029s : 1: overlap_grad_flash_sp 0.03% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.05% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000031s : 1: pre_auto_parallel 0.10% : 0.000024s : 1: py_interpret_to_execute 0.07% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.10% : 0.000023s : 1: remove_dup_value 1.00% : 0.000238s : 1: renormalize.infer 0.87% : 0.000207s : 1: renormalize.specialize 0.04% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.05% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000046s : 1: rewriter_after_opt_a 0.22% : 0.000052s : 1: rewriter_before_opt_a 0.04% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.46% : 0.000109s : 1: symbol_engine_optimizer 0.40% : 0.000094s : 1: tuple_transform 20.24% : 0.004824s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:00.812.189 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0131679, [21] [bootstrap]: 0.00043726 [type_inference]: 0.00489367 [event_method]: 1.143e-05 [auto_monad]: 5.395e-05 [graph_reusing]: 5.24e-06 [inline]: 2.16e-06 [add_attr]: 0.00310149, [1] [add_attr_with_inline]: 0.00309347, [1] [Cycle 1]: 5.476e-05, [2] [tag_attr]: 1.427e-05 [meta_addattr_fg_expand]: 4.16001e-06 [parallel-infer-symbol]: 3.02002e-06 [pre_auto_parallel]: 2.587e-05 [insert-virtual-dataset]: 2.21e-06 [parallel-infer-symbol-second]: 6.90023e-07 [dataset_repeat_opt]: 2.69001e-06 [pipeline_split]: 1.57999e-06 [optimize]: 0.00397319, [53] [py_interpret_to_execute]: 1.822e-05 [rewriter_before_opt_a]: 4.374e-05 [opt_a]: 0.00202523, [2] [Cycle 1]: 0.00141127, [45] [expand_dump_flag]: 2.96001e-06 [switch_simplify]: 2.54e-05 [loop_unroll]: 1.433e-05 [a_1]: 0.00028005 [with_stream_mark]: 1.515e-05 [recompute_prepare]: 7.5e-06 [updatestate_depend_eliminate]: 3.42997e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 2.82002e-06 [parameter_eliminate]: 1.98002e-06 [a_2]: 7.865e-05 [accelerated_algorithm]: 6.86001e-06 [shard]: 2.18998e-06 [meta_shard_fg_expand]: 1.72001e-06 [shard_inline]: 6.38e-06 [merge_send_recv]: 8.50001e-06 [auto_parallel]: 5.76998e-06 [parallel]: 1.629e-05 [flash_sp]: 7.43999e-06 [merge_comm]: 4.24002e-06 [allreduce_fusion]: 3.54002e-06 [matmul_add_comm_reduction]: 9.74e-06 [allreduce_slice_to_reducescatter]: 6.40022e-07 [virtual_shard_identity]: 7.56001e-06 [virtual_dataset]: 6.14999e-06 [get_grad_eliminate_]: 6.09001e-06 [virtual_output]: 5.90002e-06 [merge_forward]: 3.45998e-06 [cell_reuse_recompute_pass]: 1.19e-06 [offload_activation]: 1.008e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.246e-05 [merge_recompute_call_nodes]: 1.56002e-06 [before_grad]: 1.062e-05 [set_forward_comm_id_for_comm_node_pass]: 3.46999e-06 [meta_fg_expand]: 2.84001e-06 [flash_sp_send_recv_attached]: 2.58e-06 [receive_attached]: 2.31e-06 [after_resolve]: 9.41998e-06 [a_after_grad]: 9.11998e-06 [renormalize]: 0.00047579 [add_forward_monad_depend]: 4.95001e-06 [auto_monad_grad]: 1.75001e-06 [auto_monad_eliminator]: 1.508e-05 [cse]: 3.084e-05 [a_3]: 4.407e-05 [Cycle 2]: 0.00060507, [45] [expand_dump_flag]: 1.16002e-06 [switch_simplify]: 7.08e-06 [loop_unroll]: 5.94999e-06 [a_1]: 0.0001067 [with_stream_mark]: 1.32e-05 [recompute_prepare]: 6.46e-06 [updatestate_depend_eliminate]: 3.25e-06 [updatestate_assign_eliminate]: 2.22999e-06 [updatestate_loads_eliminate]: 2.86999e-06 [parameter_eliminate]: 1.04e-06 [a_2]: 6.945e-05 [accelerated_algorithm]: 5.94e-06 [shard]: 1.07998e-06 [meta_shard_fg_expand]: 1.27e-06 [shard_inline]: 5.96e-06 [merge_send_recv]: 4.87998e-06 [auto_parallel]: 5.51e-06 [parallel]: 4.75001e-06 [flash_sp]: 3.45998e-06 [merge_comm]: 3.31001e-06 [allreduce_fusion]: 3.18998e-06 [matmul_add_comm_reduction]: 5.60001e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 6.44001e-06 [virtual_dataset]: 5.70001e-06 [get_grad_eliminate_]: 5.34998e-06 [virtual_output]: 5.38002e-06 [merge_forward]: 2.57001e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 6.09001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.245e-05 [merge_recompute_call_nodes]: 9.29984e-07 [before_grad]: 9.41998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.63e-06 [meta_fg_expand]: 2.16998e-06 [flash_sp_send_recv_attached]: 9.09989e-07 [receive_attached]: 1.02998e-06 [after_resolve]: 8.46002e-06 [a_after_grad]: 8.60001e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.15001e-06 [auto_monad_grad]: 9.10019e-07 [auto_monad_eliminator]: 6.98e-06 [cse]: 1.358e-05 [a_3]: 3.464e-05 [py_interpret_to_execute_after_opt_a]: 7.20998e-06 [slice_cell_reuse_recomputed_activation]: 2.45002e-06 [rewriter_after_opt_a]: 3.562e-05 [convert_after_rewriter]: 6.61999e-06 [order_py_execute_after_rewriter]: 4.99e-06 [mutable_eliminate]: 0.00049821 [opt_b]: 0.0002354, [1] [Cycle 1]: 0.00022931, [7] [b_1]: 0.00015541 [b_2]: 8.11002e-06 [updatestate_depend_eliminate]: 5.32999e-06 [updatestate_assign_eliminate]: 2.38002e-06 [updatestate_loads_eliminate]: 2.32001e-06 [renormalize]: 4.09986e-07 [cse]: 1.818e-05 [optimize_parallel_all_gather_comm]: 1.573e-05 [overlap_param_gather]: 1.96998e-06 [cconv]: 2.319e-05 [loop_unroll]: 0.00042025 [opt_after_cconv]: 9.644e-05, [1] [Cycle 1]: 9.113e-05, [7] [c_1]: 2.661e-05 [parameter_eliminate]: 2.34001e-06 [updatestate_depend_eliminate]: 4.90999e-06 [updatestate_assign_eliminate]: 2.34999e-06 [updatestate_loads_eliminate]: 2.59999e-06 [cse]: 1.828e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 1.429e-05 [tuple_transform]: 7.029e-05, [1] [Cycle 1]: 6.55e-05, [4] [d_1]: 3.892e-05 [none_parameter_eliminate]: 1.70001e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 6.54999e-06 [partial_unused_args_eliminate]: 1.79e-06 [add_recomputation]: 4.574e-05 [cse_after_recomputation]: 2.19e-05, [1] [Cycle 1]: 1.708e-05, [1] [cse]: 1.127e-05 [environ_conv]: 4.82998e-06 [swap_dp_allreduce_reducescatter]: 4.65001e-06 [bias_add_comm_swap]: 2.30002e-06 [label_micro_interleaved_index]: 4.25999e-06 [label_fine_grained_interleaved_index]: 2.66e-06 [merge_cast_opt]: 1.29e-06 [slice_recompute_activation]: 2.17999e-06 [micro_interleaved_order_control]: 2.24001e-06 [assign_add_opt]: 1.41998e-06 [ForceFp32Comm]: 8.80013e-07 [remove_cast_before_assign_add]: 1.03001e-06 [full_micro_interleaved_order_control]: 2.17999e-06 [reorder_send_recv_between_fp_bp]: 2.46e-06 [comm_op_add_attrs]: 9.79984e-07 [add_comm_op_reuse_tag]: 9.5999e-07 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.12999e-06 [overlap_opt_shard_in_pipeline]: 1.10001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.06998e-06 [control_data_broadcast_order]: 1.314e-05 [grouped_pairwise_exchange_alltoall]: 1.81998e-06 [offloading_packed_experts]: 3.85e-06 [overlap_recompute_and_grad_model_parallel]: 5.04e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34998e-06 [overlap_recompute_comm]: 1.99999e-06 [overlap_grad_ring_attention]: 4.06001e-06 [overlap_grad_flash_sp]: 1.834e-05 [begin_end_overlap_inline]: 5.60016e-07 [split_matmul_comm_elemetwise]: 2.14999e-06 [split_layernorm_comm]: 1.60001e-06 [handle_group_info]: 8.90024e-07 [symbol_engine_optimizer]: 7.471e-05, [1] [Cycle 1]: 7.012e-05, [6] [build]: 2.54999e-06 [elim_shapecalc]: 9.52999e-06 [elim_not_effective]: 1.286e-05 [opt_reshape]: 6.76e-06 [fold_const_symbol]: 9.97999e-06 [renormalize]: 1.90019e-07 [detach_backward]: 2.06e-06 [pipeline_parallel_scheduler]: 1.87001e-06 [auto_monad_reorder]: 1.644e-05 [get_jit_bprop_graph]: 1.62001e-06 [rewriter_after_jit_bprop_graph]: 3.95e-06 [opt_after_jit_grad]: 0.00045105 [validate]: 3.466e-05 Sums bootstrap : 0.000437s : 4.79% type_inference : 0.004894s : 53.57% event_method : 0.000011s : 0.13% auto_monad : 0.000054s : 0.59% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000026s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000003s : 0.03% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000018s : 0.20% optimize.rewriter_before_opt_a : 0.000044s : 0.48% optimize.opt_a.expand_dump_flag : 0.000004s : 0.05% optimize.opt_a.switch_simplify : 0.000032s : 0.36% optimize.opt_a.loop_unroll : 0.000020s : 0.22% optimize.opt_a.a_1 : 0.000387s : 4.23% optimize.opt_a.with_stream_mark : 0.000028s : 0.31% optimize.opt_a.recompute_prepare : 0.000014s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000148s : 1.62% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.14% optimize.opt_a.shard : 0.000003s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.14% optimize.opt_a.merge_send_recv : 0.000013s : 0.15% optimize.opt_a.auto_parallel : 0.000011s : 0.12% optimize.opt_a.parallel : 0.000021s : 0.23% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.15% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.13% optimize.opt_a.virtual_output : 0.000011s : 0.12% optimize.opt_a.merge_forward : 0.000006s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.04% optimize.opt_a.after_resolve : 0.000018s : 0.20% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000476s : 5.21% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.24% optimize.opt_a.cse : 0.000044s : 0.49% optimize.opt_a.a_3 : 0.000079s : 0.86% optimize.py_interpret_to_execute_after_opt_a : 0.000007s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.03% optimize.rewriter_after_opt_a : 0.000036s : 0.39% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000498s : 5.45% optimize.opt_b.b_1 : 0.000155s : 1.70% optimize.opt_b.b_2 : 0.000008s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000023s : 0.25% optimize.loop_unroll : 0.000420s : 4.60% optimize.opt_after_cconv.c_1 : 0.000027s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000018s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.16% optimize.tuple_transform.d_1 : 0.000039s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000046s : 0.50% optimize.cse_after_recomputation.cse : 0.000011s : 0.12% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.02% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000018s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000016s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000451s : 4.94% validate : 0.000035s : 0.38% Time group info: ------[substitution.] 0.000134 20 1.33% : 0.000002s : 2: substitution.elim_not_effective 0.97% : 0.000001s : 2: substitution.fold_const_symbol 4.50% : 0.000006s : 3: substitution.graph_param_transform 63.14% : 0.000085s : 2: substitution.inline 2.62% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.43% : 0.000005s : 4: substitution.remove_not_recompute_node 2.15% : 0.000003s : 2: substitution.replace_old_param 21.87% : 0.000029s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004850 2 91.40% : 0.004433s : 1: type_inference.infer 8.60% : 0.000417s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000083 2 100.00% : 0.000083s : 2: match.inline ------[predicate.] 0.000131 754 0.78% : 0.000001s : 7: predicate.accumulaten_eliminater 1.11% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.67% : 0.000001s : 6: predicate.addn_check_dump 0.82% : 0.000001s : 7: predicate.addn_zero_filter 0.68% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.40% : 0.000003s : 13: predicate.arithmetic_simplify 0.83% : 0.000001s : 7: predicate.cast_eliminate 0.73% : 0.000001s : 6: predicate.check_bprop_eliminate 0.72% : 0.000001s : 6: predicate.compare_switch_simplify 0.24% : 0.000000s : 3: predicate.const_output_eliminate 0.71% : 0.000001s : 6: predicate.depend_value_elim 0.80% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 1.16% : 0.000002s : 7: predicate.dict_get_item_eliminator 0.75% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.32% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 3: predicate.elim_not_effective 0.48% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.03% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.00% : 0.000001s : 10: predicate.environ_get_depend_swap 1.86% : 0.000002s : 16: predicate.environ_get_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.97% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.81% : 0.000002s : 9: predicate.float_depend_g_call 0.65% : 0.000001s : 6: predicate.float_environ_get_switch 1.00% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.79% : 0.000001s : 6: predicate.get_grad_eliminate 0.27% : 0.000000s : 3: predicate.graph_param_transform 0.81% : 0.000001s : 6: predicate.incorporate_call 0.68% : 0.000001s : 6: predicate.incorporate_call_switch 6.21% : 0.000008s : 34: predicate.inline 1.06% : 0.000001s : 6: predicate.inline_without_move 0.44% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.07% : 0.000001s : 6: predicate.less_batch_normalization 1.59% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.09% : 0.000003s : 20: predicate.load_eliminater 1.28% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.84% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.72% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.70% : 0.000001s : 6: predicate.merge_addn 0.62% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.71% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.73% : 0.000001s : 7: predicate.minmaximum_grad 1.41% : 0.000002s : 3: predicate.mutable_eliminate 0.40% : 0.000001s : 3: predicate.opt_reshape 0.55% : 0.000001s : 3: predicate.parallel_virtual_node 1.63% : 0.000002s : 9: predicate.partial_defer_inline 1.27% : 0.000002s : 10: predicate.partial_eliminate 0.84% : 0.000001s : 7: predicate.print_const_string_wrapper 0.72% : 0.000001s : 6: predicate.reduce_all_const_elim 1.11% : 0.000001s : 7: predicate.reduce_eliminate 1.99% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.64% : 0.000001s : 6: predicate.remove_not_recompute_node 1.25% : 0.000002s : 13: predicate.replace_applicator 0.52% : 0.000001s : 6: predicate.replace_old_param 0.37% : 0.000000s : 3: predicate.reset_defer_inline 0.86% : 0.000001s : 7: predicate.reshape_eliminate 0.75% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.62% : 0.000001s : 3: predicate.row_tensor_eliminate 0.91% : 0.000001s : 6: predicate.same_eliminate 0.49% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.91% : 0.000001s : 6: predicate.shard_identity_eliminate 1.00% : 0.000001s : 6: predicate.special_op_eliminate 1.07% : 0.000001s : 6: predicate.specialize_transform 1.13% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.95% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.03% : 0.000001s : 9: predicate.switch_defer_inline 1.67% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.55% : 0.000006s : 32: predicate.switch_simplify 0.71% : 0.000001s : 7: predicate.tile_eliminate 0.76% : 0.000001s : 7: predicate.transpose_eliminate 1.73% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.75% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.21% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.40% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.65% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.49% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 2.04% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.89% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.65% : 0.000001s : 3: predicate.value_based_eliminate 0.82% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.81% : 0.000001s : 6: predicate.virtual_output_eliminate 0.32% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.58% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000200 5 8.01% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.99% : 0.000184s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.021630 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.36% : 0.003106s : 1: add_attr 14.32% : 0.003097s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000050s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000059s : 1: auto_monad 0.09% : 0.000020s : 1: auto_monad_reorder 0.02% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.15% : 0.000464s : 1: bootstrap 0.12% : 0.000027s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000025s : 1: cse_after_recomputation 0.03% : 0.000006s : 1: dataset_repeat_opt 0.03% : 0.000005s : 1: detach_backward 0.04% : 0.000008s : 1: environ_conv 0.08% : 0.000017s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000005s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.98% : 0.000428s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.34% : 0.000506s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000014s : 1: opt.transform.mutable_eliminate 3.44% : 0.000744s : 78: opt.transform.opt_a 0.12% : 0.000025s : 1: opt.transform.opt_after_cconv 0.10% : 0.000022s : 1: opt.transform.opt_after_jit_grad 0.60% : 0.000130s : 28: opt.transform.opt_b 0.20% : 0.000043s : 2: opt.transform.opt_trans_graph 0.16% : 0.000035s : 4: opt.transform.symbol_engine_opt 9.38% : 0.002028s : 1: opt_a 0.46% : 0.000100s : 1: opt_after_cconv 2.13% : 0.000460s : 1: opt_after_jit_grad 1.11% : 0.000239s : 1: opt_b 18.39% : 0.003978s : 1: optimize 0.09% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000022s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.14% : 0.000030s : 1: pre_auto_parallel 0.10% : 0.000022s : 1: py_interpret_to_execute 0.05% : 0.000010s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000018s : 1: remove_dup_value 1.26% : 0.000272s : 1: renormalize.infer 0.91% : 0.000196s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000040s : 1: rewriter_after_opt_a 0.22% : 0.000048s : 1: rewriter_before_opt_a 0.03% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000078s : 1: symbol_engine_optimizer 0.34% : 0.000073s : 1: tuple_transform 22.69% : 0.004908s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:01.190.0 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:01.216.3 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0148758, [21] [bootstrap]: 0.00043615 [type_inference]: 0.0054034 [event_method]: 1.176e-05 [auto_monad]: 5.485e-05 [graph_reusing]: 4.75999e-06 [inline]: 2.27001e-06 [add_attr]: 0.00307079, [1] [add_attr_with_inline]: 0.00306305, [1] [Cycle 1]: 6.605e-05, [2] [tag_attr]: 1.486e-05 [meta_addattr_fg_expand]: 3.9e-06 [parallel-infer-symbol]: 2.94001e-06 [pre_auto_parallel]: 2.317e-05 [insert-virtual-dataset]: 2.36998e-06 [parallel-infer-symbol-second]: 7.90023e-07 [dataset_repeat_opt]: 1.84998e-06 [pipeline_split]: 1.84e-06 [optimize]: 0.00473166, [53] [py_interpret_to_execute]: 2.16e-05 [rewriter_before_opt_a]: 4.768e-05 [opt_a]: 0.00247179, [2] [Cycle 1]: 0.00166474, [45] [expand_dump_flag]: 2.89001e-06 [switch_simplify]: 2.48e-05 [loop_unroll]: 1.378e-05 [a_1]: 0.00028511 [with_stream_mark]: 1.716e-05 [recompute_prepare]: 8.42e-06 [updatestate_depend_eliminate]: 4.15999e-06 [updatestate_assign_eliminate]: 4.08001e-06 [updatestate_loads_eliminate]: 2.91e-06 [parameter_eliminate]: 2.59001e-06 [a_2]: 0.00010924 [accelerated_algorithm]: 6.66e-06 [shard]: 2.00002e-06 [meta_shard_fg_expand]: 1.64998e-06 [shard_inline]: 6.81001e-06 [merge_send_recv]: 8.55999e-06 [auto_parallel]: 6.17999e-06 [parallel]: 1.746e-05 [flash_sp]: 8.13999e-06 [merge_comm]: 3.60998e-06 [allreduce_fusion]: 3.36999e-06 [matmul_add_comm_reduction]: 9.20001e-06 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 9.07999e-06 [virtual_dataset]: 6.15002e-06 [get_grad_eliminate_]: 6.29001e-06 [virtual_output]: 5.61e-06 [merge_forward]: 3.6e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 1.013e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.443e-05 [merge_recompute_call_nodes]: 1.74998e-06 [before_grad]: 1.168e-05 [set_forward_comm_id_for_comm_node_pass]: 3.61999e-06 [meta_fg_expand]: 2.68e-06 [flash_sp_send_recv_attached]: 3.20998e-06 [receive_attached]: 2.12999e-06 [after_resolve]: 9.91e-06 [a_after_grad]: 9.17001e-06 [renormalize]: 0.00049325 [add_forward_monad_depend]: 4.99e-06 [auto_monad_grad]: 2.06e-06 [auto_monad_eliminator]: 1.523e-05 [cse]: 2.887e-05 [a_3]: 6.094e-05 [Cycle 2]: 0.00079311, [45] [expand_dump_flag]: 1.15001e-06 [switch_simplify]: 7.04001e-06 [loop_unroll]: 6.01e-06 [a_1]: 0.00010899 [with_stream_mark]: 1.115e-05 [recompute_prepare]: 6.64999e-06 [updatestate_depend_eliminate]: 3.21001e-06 [updatestate_assign_eliminate]: 2.44999e-06 [updatestate_loads_eliminate]: 2.73003e-06 [parameter_eliminate]: 1.01002e-06 [a_2]: 9.945e-05 [accelerated_algorithm]: 5.92001e-06 [shard]: 1.51002e-06 [meta_shard_fg_expand]: 1.59998e-06 [shard_inline]: 6.70002e-06 [merge_send_recv]: 5.51e-06 [auto_parallel]: 6.26998e-06 [parallel]: 5.27001e-06 [flash_sp]: 3.74002e-06 [merge_comm]: 3.21999e-06 [allreduce_fusion]: 3.16001e-06 [matmul_add_comm_reduction]: 6.39001e-06 [allreduce_slice_to_reducescatter]: 5.59987e-07 [virtual_shard_identity]: 6.29999e-06 [virtual_dataset]: 6.11e-06 [get_grad_eliminate_]: 5.61e-06 [virtual_output]: 5.44e-06 [merge_forward]: 2.73e-06 [cell_reuse_recompute_pass]: 1.40001e-06 [offload_activation]: 6.61e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.51e-05 [merge_recompute_call_nodes]: 7.7e-07 [before_grad]: 9.24e-06 [set_forward_comm_id_for_comm_node_pass]: 3.95e-06 [meta_fg_expand]: 2.11e-06 [flash_sp_send_recv_attached]: 8.70001e-07 [receive_attached]: 9.50007e-07 [after_resolve]: 8.84e-06 [a_after_grad]: 8.54e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.27e-06 [auto_monad_grad]: 1.04e-06 [auto_monad_eliminator]: 8.23001e-06 [cse]: 1.414e-05 [a_3]: 4.96e-05 [py_interpret_to_execute_after_opt_a]: 1.191e-05 [slice_cell_reuse_recomputed_activation]: 4.77e-06 [rewriter_after_opt_a]: 4.093e-05 [convert_after_rewriter]: 9.57999e-06 [order_py_execute_after_rewriter]: 8.35999e-06 [mutable_eliminate]: 0.00050148 [opt_b]: 0.00030509, [1] [Cycle 1]: 0.00029597, [7] [b_1]: 0.00019728 [b_2]: 8.77e-06 [updatestate_depend_eliminate]: 6.46999e-06 [updatestate_assign_eliminate]: 2.44001e-06 [updatestate_loads_eliminate]: 2.48998e-06 [renormalize]: 4.80009e-07 [cse]: 1.955e-05 [optimize_parallel_all_gather_comm]: 1.982e-05 [overlap_param_gather]: 4.58999e-06 [cconv]: 2.793e-05 [loop_unroll]: 0.00044838 [opt_after_cconv]: 0.00012163, [1] [Cycle 1]: 0.00011373, [7] [c_1]: 2.72e-05 [parameter_eliminate]: 2.82002e-06 [updatestate_depend_eliminate]: 5.46e-06 [updatestate_assign_eliminate]: 3.21999e-06 [updatestate_loads_eliminate]: 2.19999e-06 [cse]: 1.762e-05 [renormalize]: 3.59985e-07 [remove_dup_value]: 1.628e-05 [tuple_transform]: 8.45e-05, [1] [Cycle 1]: 7.742e-05, [4] [d_1]: 3.856e-05 [none_parameter_eliminate]: 1.89e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 6.73e-06 [partial_unused_args_eliminate]: 4.15999e-06 [add_recomputation]: 4.979e-05 [cse_after_recomputation]: 2.748e-05, [1] [Cycle 1]: 2.073e-05, [1] [cse]: 1.181e-05 [environ_conv]: 8.3e-06 [swap_dp_allreduce_reducescatter]: 7.90998e-06 [bias_add_comm_swap]: 4.82e-06 [label_micro_interleaved_index]: 7.51001e-06 [label_fine_grained_interleaved_index]: 5.09e-06 [merge_cast_opt]: 4.23999e-06 [slice_recompute_activation]: 4.68001e-06 [micro_interleaved_order_control]: 4.92e-06 [assign_add_opt]: 3.63e-06 [ForceFp32Comm]: 3.46001e-06 [remove_cast_before_assign_add]: 3.29001e-06 [full_micro_interleaved_order_control]: 4.50999e-06 [reorder_send_recv_between_fp_bp]: 5.26998e-06 [comm_op_add_attrs]: 3.66001e-06 [add_comm_op_reuse_tag]: 1.229e-05 [interleave_split_concat_branches]: 4.25999e-06 [interleave_parallel_branches]: 4.21001e-06 [overlap_opt_shard_in_pipeline]: 3.61999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.74e-06 [control_data_broadcast_order]: 1.68e-05 [grouped_pairwise_exchange_alltoall]: 5.12e-06 [offloading_packed_experts]: 7e-06 [overlap_recompute_and_grad_model_parallel]: 7.53e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.7e-06 [overlap_recompute_allgather_and_fa_grad]: 3.68999e-06 [overlap_recompute_comm]: 5.00001e-06 [overlap_grad_ring_attention]: 6.86999e-06 [overlap_grad_flash_sp]: 2.307e-05 [begin_end_overlap_inline]: 2.94999e-06 [split_matmul_comm_elemetwise]: 5.02e-06 [split_layernorm_comm]: 4.09002e-06 [handle_group_info]: 3.43e-06 [symbol_engine_optimizer]: 9.766e-05, [1] [Cycle 1]: 9.027e-05, [6] [build]: 2.96001e-06 [elim_shapecalc]: 1.072e-05 [elim_not_effective]: 1.345e-05 [opt_reshape]: 6.71e-06 [fold_const_symbol]: 1.024e-05 [renormalize]: 2.40019e-07 [detach_backward]: 3.26999e-06 [pipeline_parallel_scheduler]: 2.32999e-06 [auto_monad_reorder]: 1.901e-05 [get_jit_bprop_graph]: 1.40001e-06 [rewriter_after_jit_bprop_graph]: 4.31002e-06 [opt_after_jit_grad]: 0.0004973 [validate]: 3.479e-05 Sums bootstrap : 0.000436s : 4.34% type_inference : 0.005403s : 53.75% event_method : 0.000012s : 0.12% auto_monad : 0.000055s : 0.55% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000023s : 0.23% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000022s : 0.21% optimize.rewriter_before_opt_a : 0.000048s : 0.47% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000032s : 0.32% optimize.opt_a.loop_unroll : 0.000020s : 0.20% optimize.opt_a.a_1 : 0.000394s : 3.92% optimize.opt_a.with_stream_mark : 0.000028s : 0.28% optimize.opt_a.recompute_prepare : 0.000015s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000209s : 2.08% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.14% optimize.opt_a.auto_parallel : 0.000012s : 0.12% optimize.opt_a.parallel : 0.000023s : 0.23% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.15% optimize.opt_a.virtual_dataset : 0.000012s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000011s : 0.11% optimize.opt_a.merge_forward : 0.000006s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000021s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000019s : 0.19% optimize.opt_a.a_after_grad : 0.000018s : 0.18% optimize.opt_a.renormalize : 0.000493s : 4.91% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.23% optimize.opt_a.cse : 0.000043s : 0.43% optimize.opt_a.a_3 : 0.000111s : 1.10% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000041s : 0.41% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000501s : 4.99% optimize.opt_b.b_1 : 0.000197s : 1.96% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000028s : 0.28% optimize.loop_unroll : 0.000448s : 4.46% optimize.opt_after_cconv.c_1 : 0.000027s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.16% optimize.tuple_transform.d_1 : 0.000039s : 0.38% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000050s : 0.50% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000008s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000012s : 0.12% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.05% optimize.control_data_broadcast_order : 0.000017s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.05% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000023s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000019s : 0.19% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000497s : 4.95% validate : 0.000035s : 0.35% Time group info: ------[substitution.] 0.000132 20 1.56% : 0.000002s : 2: substitution.elim_not_effective 1.03% : 0.000001s : 2: substitution.fold_const_symbol 4.44% : 0.000006s : 3: substitution.graph_param_transform 65.87% : 0.000087s : 2: substitution.inline 2.35% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.50% : 0.000005s : 4: substitution.remove_not_recompute_node 2.35% : 0.000003s : 2: substitution.replace_old_param 18.90% : 0.000025s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005359 2 90.85% : 0.004869s : 1: type_inference.infer 9.15% : 0.000490s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000086 2 100.00% : 0.000086s : 2: match.inline ------[predicate.] 0.000136 754 0.84% : 0.000001s : 7: predicate.accumulaten_eliminater 1.23% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.68% : 0.000001s : 6: predicate.addn_check_dump 0.78% : 0.000001s : 7: predicate.addn_zero_filter 0.71% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.21% : 0.000003s : 13: predicate.arithmetic_simplify 0.94% : 0.000001s : 7: predicate.cast_eliminate 1.17% : 0.000002s : 6: predicate.check_bprop_eliminate 0.70% : 0.000001s : 6: predicate.compare_switch_simplify 0.26% : 0.000000s : 3: predicate.const_output_eliminate 0.78% : 0.000001s : 6: predicate.depend_value_elim 0.78% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.88% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.70% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.07% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 3: predicate.elim_not_effective 0.46% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000001s : 10: predicate.environ_add_const_eliminate 1.02% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.05% : 0.000001s : 10: predicate.environ_get_depend_swap 1.80% : 0.000002s : 16: predicate.environ_get_eliminate 1.03% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.93% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.99% : 0.000003s : 9: predicate.float_depend_g_call 0.62% : 0.000001s : 6: predicate.float_environ_get_switch 1.03% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 3: predicate.fold_const_symbol 0.85% : 0.000001s : 6: predicate.get_grad_eliminate 0.26% : 0.000000s : 3: predicate.graph_param_transform 0.80% : 0.000001s : 6: predicate.incorporate_call 0.69% : 0.000001s : 6: predicate.incorporate_call_switch 6.69% : 0.000009s : 34: predicate.inline 0.98% : 0.000001s : 6: predicate.inline_without_move 0.46% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.04% : 0.000001s : 6: predicate.less_batch_normalization 1.50% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.11% : 0.000003s : 20: predicate.load_eliminater 1.50% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.59% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.77% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.75% : 0.000001s : 6: predicate.merge_addn 0.66% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.74% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.72% : 0.000001s : 7: predicate.minmaximum_grad 1.40% : 0.000002s : 3: predicate.mutable_eliminate 0.46% : 0.000001s : 3: predicate.opt_reshape 0.50% : 0.000001s : 3: predicate.parallel_virtual_node 1.24% : 0.000002s : 9: predicate.partial_defer_inline 1.24% : 0.000002s : 10: predicate.partial_eliminate 0.71% : 0.000001s : 7: predicate.print_const_string_wrapper 0.76% : 0.000001s : 6: predicate.reduce_all_const_elim 0.99% : 0.000001s : 7: predicate.reduce_eliminate 2.00% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.62% : 0.000001s : 6: predicate.remove_not_recompute_node 1.10% : 0.000002s : 13: predicate.replace_applicator 0.64% : 0.000001s : 6: predicate.replace_old_param 0.35% : 0.000000s : 3: predicate.reset_defer_inline 0.89% : 0.000001s : 7: predicate.reshape_eliminate 0.76% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 3: predicate.row_tensor_eliminate 0.96% : 0.000001s : 6: predicate.same_eliminate 0.64% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.30% : 0.000002s : 6: predicate.shard_identity_eliminate 1.00% : 0.000001s : 6: predicate.special_op_eliminate 1.15% : 0.000002s : 6: predicate.specialize_transform 1.07% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.93% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 3: predicate.switch_call_monad_eliminater 0.96% : 0.000001s : 9: predicate.switch_defer_inline 1.75% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.37% : 0.000006s : 32: predicate.switch_simplify 0.76% : 0.000001s : 7: predicate.tile_eliminate 0.93% : 0.000001s : 7: predicate.transpose_eliminate 1.46% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.39% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.00% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.43% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.79% : 0.000004s : 19: predicate.tuple_list_set_item_eliminator 1.64% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.92% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.94% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.68% : 0.000001s : 3: predicate.value_based_eliminate 0.81% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.77% : 0.000001s : 6: predicate.virtual_output_eliminate 0.32% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000239 5 6.86% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 93.14% : 0.000223s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024103 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.77% : 0.003079s : 1: add_attr 12.72% : 0.003066s : 1: add_attr_with_inline 0.06% : 0.000016s : 1: add_comm_op_reuse_tag 0.22% : 0.000053s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.26% : 0.000063s : 1: auto_monad 0.11% : 0.000026s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.99% : 0.000479s : 1: bootstrap 0.13% : 0.000031s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.08% : 0.000020s : 1: detach_backward 0.05% : 0.000011s : 1: environ_conv 0.09% : 0.000021s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000011s : 1: graph_reusing 0.03% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.03% : 0.000007s : 1: interleave_parallel_branches 0.03% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.89% : 0.000455s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.11% : 0.000509s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 3.18% : 0.000767s : 78: opt.transform.opt_a 0.11% : 0.000026s : 1: opt.transform.opt_after_cconv 0.10% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.54% : 0.000130s : 28: opt.transform.opt_b 0.18% : 0.000043s : 2: opt.transform.opt_trans_graph 0.15% : 0.000037s : 4: opt.transform.symbol_engine_opt 10.27% : 0.002475s : 1: opt_a 0.52% : 0.000125s : 1: opt_after_cconv 2.11% : 0.000508s : 1: opt_after_jit_grad 1.28% : 0.000309s : 1: opt_b 21.02% : 0.005067s : 1: optimize 0.09% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000011s : 1: order_py_execute_after_rewriter 0.11% : 0.000026s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000007s : 1: overlap_param_gather 0.03% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000008s : 1: pipeline_split 0.13% : 0.000031s : 1: pre_auto_parallel 0.10% : 0.000025s : 1: py_interpret_to_execute 0.07% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000019s : 1: remove_dup_value 1.18% : 0.000284s : 1: renormalize.infer 0.84% : 0.000202s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000044s : 1: rewriter_after_opt_a 0.21% : 0.000051s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000101s : 1: symbol_engine_optimizer 0.36% : 0.000087s : 1: tuple_transform 22.52% : 0.005429s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:01.191.902 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0137071, [21] [bootstrap]: 0.00043666 [type_inference]: 0.005395 [event_method]: 1.173e-05 [auto_monad]: 5.174e-05 [graph_reusing]: 5.05999e-06 [inline]: 2.12999e-06 [add_attr]: 0.00313564, [1] [add_attr_with_inline]: 0.00312701, [1] [Cycle 1]: 5.704e-05, [2] [tag_attr]: 1.495e-05 [meta_addattr_fg_expand]: 3.78001e-06 [parallel-infer-symbol]: 3.62998e-06 [pre_auto_parallel]: 2.555e-05 [insert-virtual-dataset]: 2.37001e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 2.21e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.00397142, [53] [py_interpret_to_execute]: 1.785e-05 [rewriter_before_opt_a]: 4.451e-05 [opt_a]: 0.00202289, [2] [Cycle 1]: 0.00141051, [45] [expand_dump_flag]: 2.73998e-06 [switch_simplify]: 2.55e-05 [loop_unroll]: 1.383e-05 [a_1]: 0.00028242 [with_stream_mark]: 1.612e-05 [recompute_prepare]: 7.78999e-06 [updatestate_depend_eliminate]: 3.59002e-06 [updatestate_assign_eliminate]: 3.00002e-06 [updatestate_loads_eliminate]: 3.2e-06 [parameter_eliminate]: 1.99e-06 [a_2]: 7.916e-05 [accelerated_algorithm]: 7.05e-06 [shard]: 2.32001e-06 [meta_shard_fg_expand]: 1.64998e-06 [shard_inline]: 6.11e-06 [merge_send_recv]: 7.71999e-06 [auto_parallel]: 6.21e-06 [parallel]: 1.783e-05 [flash_sp]: 8.08001e-06 [merge_comm]: 4.03001e-06 [allreduce_fusion]: 3.52002e-06 [matmul_add_comm_reduction]: 8.80999e-06 [allreduce_slice_to_reducescatter]: 1.02e-06 [virtual_shard_identity]: 7.46001e-06 [virtual_dataset]: 5.79e-06 [get_grad_eliminate_]: 6.24001e-06 [virtual_output]: 5.56e-06 [merge_forward]: 3.95e-06 [cell_reuse_recompute_pass]: 1.75001e-06 [offload_activation]: 9.86e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.373e-05 [merge_recompute_call_nodes]: 1.69998e-06 [before_grad]: 1.024e-05 [set_forward_comm_id_for_comm_node_pass]: 3.76001e-06 [meta_fg_expand]: 2.61999e-06 [flash_sp_send_recv_attached]: 2.36e-06 [receive_attached]: 1.96e-06 [after_resolve]: 1.058e-05 [a_after_grad]: 9.05001e-06 [renormalize]: 0.00046341 [add_forward_monad_depend]: 5.37001e-06 [auto_monad_grad]: 2.04999e-06 [auto_monad_eliminator]: 1.373e-05 [cse]: 3.028e-05 [a_3]: 4.479e-05 [Cycle 2]: 0.00060279, [45] [expand_dump_flag]: 1.19e-06 [switch_simplify]: 6.81001e-06 [loop_unroll]: 5.62999e-06 [a_1]: 0.00010766 [with_stream_mark]: 1.365e-05 [recompute_prepare]: 6.46999e-06 [updatestate_depend_eliminate]: 2.91999e-06 [updatestate_assign_eliminate]: 2.37001e-06 [updatestate_loads_eliminate]: 2.78e-06 [parameter_eliminate]: 1.02e-06 [a_2]: 7.019e-05 [accelerated_algorithm]: 5.81e-06 [shard]: 1.42e-06 [meta_shard_fg_expand]: 1.34e-06 [shard_inline]: 6.14001e-06 [merge_send_recv]: 4.60001e-06 [auto_parallel]: 5.50001e-06 [parallel]: 4.64002e-06 [flash_sp]: 3.35998e-06 [merge_comm]: 3.22002e-06 [allreduce_fusion]: 2.83e-06 [matmul_add_comm_reduction]: 5.49998e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 6.17001e-06 [virtual_dataset]: 5.36002e-06 [get_grad_eliminate_]: 5.26998e-06 [virtual_output]: 5.22e-06 [merge_forward]: 2.39999e-06 [cell_reuse_recompute_pass]: 1.35999e-06 [offload_activation]: 6.14001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.279e-05 [merge_recompute_call_nodes]: 7.79983e-07 [before_grad]: 8.65001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.45e-06 [meta_fg_expand]: 2.14999e-06 [flash_sp_send_recv_attached]: 8.70001e-07 [receive_attached]: 1.03001e-06 [after_resolve]: 8.40001e-06 [a_after_grad]: 8.16002e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.47999e-06 [auto_monad_grad]: 8.00006e-07 [auto_monad_eliminator]: 6.68e-06 [cse]: 1.343e-05 [a_3]: 3.395e-05 [py_interpret_to_execute_after_opt_a]: 7.88999e-06 [slice_cell_reuse_recomputed_activation]: 1.94e-06 [rewriter_after_opt_a]: 3.487e-05 [convert_after_rewriter]: 6.39999e-06 [order_py_execute_after_rewriter]: 6.19001e-06 [mutable_eliminate]: 0.00050002 [opt_b]: 0.00023312, [1] [Cycle 1]: 0.00022686, [7] [b_1]: 0.00015074 [b_2]: 7.9e-06 [updatestate_depend_eliminate]: 5.92001e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.27999e-06 [renormalize]: 5.69999e-07 [cse]: 2.036e-05 [optimize_parallel_all_gather_comm]: 1.545e-05 [overlap_param_gather]: 1.97001e-06 [cconv]: 2.473e-05 [loop_unroll]: 0.0004218 [opt_after_cconv]: 9.914e-05, [1] [Cycle 1]: 9.366e-05, [7] [c_1]: 2.761e-05 [parameter_eliminate]: 2.83e-06 [updatestate_depend_eliminate]: 5.00999e-06 [updatestate_assign_eliminate]: 2.43e-06 [updatestate_loads_eliminate]: 2.29999e-06 [cse]: 1.858e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 1.403e-05 [tuple_transform]: 6.918e-05, [1] [Cycle 1]: 6.482e-05, [4] [d_1]: 3.841e-05 [none_parameter_eliminate]: 1.89999e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 6.37001e-06 [partial_unused_args_eliminate]: 1.66e-06 [add_recomputation]: 4.359e-05 [cse_after_recomputation]: 2.168e-05, [1] [Cycle 1]: 1.705e-05, [1] [cse]: 1.167e-05 [environ_conv]: 4.68999e-06 [swap_dp_allreduce_reducescatter]: 5.10999e-06 [bias_add_comm_swap]: 2.32999e-06 [label_micro_interleaved_index]: 4.07e-06 [label_fine_grained_interleaved_index]: 2.57001e-06 [merge_cast_opt]: 1.25999e-06 [slice_recompute_activation]: 2.01998e-06 [micro_interleaved_order_control]: 2.22999e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 8.29983e-07 [remove_cast_before_assign_add]: 1.00001e-06 [full_micro_interleaved_order_control]: 2.21e-06 [reorder_send_recv_between_fp_bp]: 2.64001e-06 [comm_op_add_attrs]: 9.50007e-07 [add_comm_op_reuse_tag]: 9.5999e-07 [interleave_split_concat_branches]: 1.11002e-06 [interleave_parallel_branches]: 1.24e-06 [overlap_opt_shard_in_pipeline]: 1.20001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.77999e-06 [control_data_broadcast_order]: 1.211e-05 [grouped_pairwise_exchange_alltoall]: 1.84998e-06 [offloading_packed_experts]: 3.98001e-06 [overlap_recompute_and_grad_model_parallel]: 4.47e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.33002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.63e-06 [overlap_grad_ring_attention]: 4.2e-06 [overlap_grad_flash_sp]: 1.892e-05 [begin_end_overlap_inline]: 5.09986e-07 [split_matmul_comm_elemetwise]: 2.17001e-06 [split_layernorm_comm]: 1.57999e-06 [handle_group_info]: 8.59989e-07 [symbol_engine_optimizer]: 7.326e-05, [1] [Cycle 1]: 6.921e-05, [6] [build]: 2.44999e-06 [elim_shapecalc]: 9.22999e-06 [elim_not_effective]: 1.285e-05 [opt_reshape]: 6.79999e-06 [fold_const_symbol]: 9.72999e-06 [renormalize]: 1.69995e-07 [detach_backward]: 1.92001e-06 [pipeline_parallel_scheduler]: 1.57001e-06 [auto_monad_reorder]: 1.549e-05 [get_jit_bprop_graph]: 1.14998e-06 [rewriter_after_jit_bprop_graph]: 3.76999e-06 [opt_after_jit_grad]: 0.00045677 [validate]: 3.65e-05 Sums bootstrap : 0.000437s : 4.53% type_inference : 0.005395s : 56.01% event_method : 0.000012s : 0.12% auto_monad : 0.000052s : 0.54% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000004s : 0.04% pre_auto_parallel : 0.000026s : 0.27% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000018s : 0.19% optimize.rewriter_before_opt_a : 0.000045s : 0.46% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000032s : 0.34% optimize.opt_a.loop_unroll : 0.000019s : 0.20% optimize.opt_a.a_1 : 0.000390s : 4.05% optimize.opt_a.with_stream_mark : 0.000030s : 0.31% optimize.opt_a.recompute_prepare : 0.000014s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000149s : 1.55% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.13% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.13% optimize.opt_a.merge_send_recv : 0.000012s : 0.13% optimize.opt_a.auto_parallel : 0.000012s : 0.12% optimize.opt_a.parallel : 0.000022s : 0.23% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000007s : 0.08% optimize.opt_a.allreduce_fusion : 0.000006s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.14% optimize.opt_a.virtual_dataset : 0.000011s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000011s : 0.11% optimize.opt_a.merge_forward : 0.000006s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.03% optimize.opt_a.before_grad : 0.000019s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000019s : 0.20% optimize.opt_a.a_after_grad : 0.000017s : 0.18% optimize.opt_a.renormalize : 0.000463s : 4.81% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000020s : 0.21% optimize.opt_a.cse : 0.000044s : 0.45% optimize.opt_a.a_3 : 0.000079s : 0.82% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000035s : 0.36% optimize.convert_after_rewriter : 0.000006s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000500s : 5.19% optimize.opt_b.b_1 : 0.000151s : 1.57% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000020s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000015s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000025s : 0.26% optimize.loop_unroll : 0.000422s : 4.38% optimize.opt_after_cconv.c_1 : 0.000028s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.15% optimize.tuple_transform.d_1 : 0.000038s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000044s : 0.45% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.03% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000019s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000002s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000015s : 0.16% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000457s : 4.74% validate : 0.000036s : 0.38% Time group info: ------[substitution.] 0.000131 20 1.32% : 0.000002s : 2: substitution.elim_not_effective 1.02% : 0.000001s : 2: substitution.fold_const_symbol 4.23% : 0.000006s : 3: substitution.graph_param_transform 64.85% : 0.000085s : 2: substitution.inline 2.29% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.89% : 0.000005s : 4: substitution.remove_not_recompute_node 2.85% : 0.000004s : 2: substitution.replace_old_param 19.55% : 0.000026s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005352 2 91.80% : 0.004913s : 1: type_inference.infer 8.20% : 0.000439s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000083 2 100.00% : 0.000083s : 2: match.inline ------[predicate.] 0.000134 754 0.96% : 0.000001s : 7: predicate.accumulaten_eliminater 0.98% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.70% : 0.000001s : 6: predicate.addn_check_dump 1.05% : 0.000001s : 7: predicate.addn_zero_filter 0.72% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.24% : 0.000003s : 13: predicate.arithmetic_simplify 0.75% : 0.000001s : 7: predicate.cast_eliminate 0.78% : 0.000001s : 6: predicate.check_bprop_eliminate 0.75% : 0.000001s : 6: predicate.compare_switch_simplify 0.22% : 0.000000s : 3: predicate.const_output_eliminate 0.74% : 0.000001s : 6: predicate.depend_value_elim 0.79% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.96% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.75% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.20% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.36% : 0.000000s : 3: predicate.elim_not_effective 0.52% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.96% : 0.000001s : 10: predicate.environ_get_add_eliminate 1.08% : 0.000001s : 10: predicate.environ_get_depend_swap 1.79% : 0.000002s : 16: predicate.environ_get_eliminate 0.99% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.96% : 0.000001s : 9: predicate.exchange_switch_depend_value 2.22% : 0.000003s : 9: predicate.float_depend_g_call 0.74% : 0.000001s : 6: predicate.float_environ_get_switch 0.98% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 3: predicate.fold_const_symbol 0.77% : 0.000001s : 6: predicate.get_grad_eliminate 0.26% : 0.000000s : 3: predicate.graph_param_transform 0.78% : 0.000001s : 6: predicate.incorporate_call 0.67% : 0.000001s : 6: predicate.incorporate_call_switch 6.23% : 0.000008s : 34: predicate.inline 1.18% : 0.000002s : 6: predicate.inline_without_move 0.39% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.90% : 0.000001s : 6: predicate.less_batch_normalization 1.82% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.06% : 0.000003s : 20: predicate.load_eliminater 1.49% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.57% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.70% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.65% : 0.000001s : 6: predicate.merge_addn 0.67% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.79% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.72% : 0.000001s : 7: predicate.minmaximum_grad 1.51% : 0.000002s : 3: predicate.mutable_eliminate 0.58% : 0.000001s : 3: predicate.opt_reshape 0.72% : 0.000001s : 3: predicate.parallel_virtual_node 1.23% : 0.000002s : 9: predicate.partial_defer_inline 1.25% : 0.000002s : 10: predicate.partial_eliminate 0.89% : 0.000001s : 7: predicate.print_const_string_wrapper 0.67% : 0.000001s : 6: predicate.reduce_all_const_elim 1.02% : 0.000001s : 7: predicate.reduce_eliminate 2.12% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.63% : 0.000001s : 6: predicate.remove_not_recompute_node 1.13% : 0.000002s : 13: predicate.replace_applicator 0.67% : 0.000001s : 6: predicate.replace_old_param 0.37% : 0.000000s : 3: predicate.reset_defer_inline 0.89% : 0.000001s : 7: predicate.reshape_eliminate 0.84% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 3: predicate.row_tensor_eliminate 0.89% : 0.000001s : 6: predicate.same_eliminate 0.52% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.96% : 0.000001s : 6: predicate.shard_identity_eliminate 0.85% : 0.000001s : 6: predicate.special_op_eliminate 0.93% : 0.000001s : 6: predicate.specialize_transform 1.08% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.06% : 0.000001s : 9: predicate.switch_defer_inline 1.79% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.52% : 0.000006s : 32: predicate.switch_simplify 0.82% : 0.000001s : 7: predicate.tile_eliminate 0.91% : 0.000001s : 7: predicate.transpose_eliminate 1.57% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.89% : 0.000003s : 13: predicate.tuple_list_get_item_const_eliminator 1.33% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.53% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.35% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.49% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.94% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.80% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.66% : 0.000001s : 3: predicate.value_based_eliminate 0.75% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.77% : 0.000001s : 6: predicate.virtual_output_eliminate 0.37% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.58% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000206 5 8.43% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.57% : 0.000188s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022185 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.16% : 0.003141s : 1: add_attr 14.11% : 0.003131s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.21% : 0.000048s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.25% : 0.000056s : 1: auto_monad 0.09% : 0.000019s : 1: auto_monad_reorder 0.02% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.10% : 0.000466s : 1: bootstrap 0.13% : 0.000028s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000015s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000008s : 1: environ_conv 0.08% : 0.000018s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000008s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000005s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.94% : 0.000430s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.29% : 0.000508s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 3.37% : 0.000748s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.57% : 0.000126s : 28: opt.transform.opt_b 0.19% : 0.000043s : 2: opt.transform.opt_trans_graph 0.16% : 0.000035s : 4: opt.transform.symbol_engine_opt 9.13% : 0.002026s : 1: opt_a 0.46% : 0.000103s : 1: opt_after_cconv 2.10% : 0.000466s : 1: opt_after_jit_grad 1.07% : 0.000237s : 1: opt_b 17.92% : 0.003976s : 1: optimize 0.08% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000022s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000030s : 1: pre_auto_parallel 0.10% : 0.000021s : 1: py_interpret_to_execute 0.05% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000018s : 1: remove_dup_value 1.10% : 0.000244s : 1: renormalize.infer 0.95% : 0.000211s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000039s : 1: rewriter_after_opt_a 0.22% : 0.000048s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000076s : 1: symbol_engine_optimizer 0.33% : 0.000072s : 1: tuple_transform 24.39% : 0.005410s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:01.381.868 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:01.382.139 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.015007, [21] [bootstrap]: 0.00046603 [type_inference]: 0.00501438 [event_method]: 1.24e-05 [auto_monad]: 5.625e-05 [graph_reusing]: 5.74999e-06 [inline]: 1.97001e-06 [add_attr]: 0.0031584, [1] [add_attr_with_inline]: 0.00314897, [1] [Cycle 1]: 6.837e-05, [2] [tag_attr]: 1.493e-05 [meta_addattr_fg_expand]: 4.36002e-06 [parallel-infer-symbol]: 2.87002e-06 [pre_auto_parallel]: 2.592e-05 [insert-virtual-dataset]: 2.41998e-06 [parallel-infer-symbol-second]: 7.20029e-07 [dataset_repeat_opt]: 2.01998e-06 [pipeline_split]: 1.86e-06 [optimize]: 0.00512789, [53] [py_interpret_to_execute]: 2.084e-05 [rewriter_before_opt_a]: 5.492e-05 [opt_a]: 0.00271054, [2] [Cycle 1]: 0.00179345, [45] [expand_dump_flag]: 2.98998e-06 [switch_simplify]: 2.638e-05 [loop_unroll]: 1.54e-05 [a_1]: 0.00034611 [with_stream_mark]: 1.822e-05 [recompute_prepare]: 9.66e-06 [updatestate_depend_eliminate]: 4.60999e-06 [updatestate_assign_eliminate]: 4.57e-06 [updatestate_loads_eliminate]: 3.51999e-06 [parameter_eliminate]: 1.89999e-06 [a_2]: 0.00012862 [accelerated_algorithm]: 8.13001e-06 [shard]: 2.88e-06 [meta_shard_fg_expand]: 2.02001e-06 [shard_inline]: 7.53999e-06 [merge_send_recv]: 9.80002e-06 [auto_parallel]: 6.53e-06 [parallel]: 1.783e-05 [flash_sp]: 7.93999e-06 [merge_comm]: 4.78001e-06 [allreduce_fusion]: 4.72e-06 [matmul_add_comm_reduction]: 1.08e-05 [allreduce_slice_to_reducescatter]: 8.39995e-07 [virtual_shard_identity]: 9.89001e-06 [virtual_dataset]: 8.45999e-06 [get_grad_eliminate_]: 7.51999e-06 [virtual_output]: 7.25998e-06 [merge_forward]: 4.52003e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 1.157e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.666e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.312e-05 [set_forward_comm_id_for_comm_node_pass]: 4.56002e-06 [meta_fg_expand]: 3.56999e-06 [flash_sp_send_recv_attached]: 2.31e-06 [receive_attached]: 2.22001e-06 [after_resolve]: 1.068e-05 [a_after_grad]: 1.131e-05 [renormalize]: 0.00051796 [add_forward_monad_depend]: 5.02999e-06 [auto_monad_grad]: 2.01998e-06 [auto_monad_eliminator]: 1.505e-05 [cse]: 3.646e-05 [a_3]: 6.933e-05 [Cycle 2]: 0.00090482, [45] [expand_dump_flag]: 1.04e-06 [switch_simplify]: 8.44998e-06 [loop_unroll]: 7.05002e-06 [a_1]: 0.00016294 [with_stream_mark]: 1.183e-05 [recompute_prepare]: 7.5e-06 [updatestate_depend_eliminate]: 4.13001e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 2.70002e-06 [parameter_eliminate]: 1.03001e-06 [a_2]: 0.0001159 [accelerated_algorithm]: 7.06001e-06 [shard]: 1.20999e-06 [meta_shard_fg_expand]: 1.84e-06 [shard_inline]: 7.34002e-06 [merge_send_recv]: 6.04999e-06 [auto_parallel]: 6.51e-06 [parallel]: 5.06002e-06 [flash_sp]: 3.35e-06 [merge_comm]: 4.1e-06 [allreduce_fusion]: 4.06001e-06 [matmul_add_comm_reduction]: 6.59999e-06 [allreduce_slice_to_reducescatter]: 5.60016e-07 [virtual_shard_identity]: 7.67998e-06 [virtual_dataset]: 6.88998e-06 [get_grad_eliminate_]: 6.52001e-06 [virtual_output]: 6.63e-06 [merge_forward]: 3.56999e-06 [cell_reuse_recompute_pass]: 1.65001e-06 [offload_activation]: 7.35998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.605e-05 [merge_recompute_call_nodes]: 7.59988e-07 [before_grad]: 1.158e-05 [set_forward_comm_id_for_comm_node_pass]: 4.60001e-06 [meta_fg_expand]: 2.89999e-06 [flash_sp_send_recv_attached]: 9.79984e-07 [receive_attached]: 1.08001e-06 [after_resolve]: 9.79999e-06 [a_after_grad]: 1.093e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.35001e-06 [auto_monad_grad]: 8.89995e-07 [auto_monad_eliminator]: 8.87e-06 [cse]: 2.042e-05 [a_3]: 5.729e-05 [py_interpret_to_execute_after_opt_a]: 1.418e-05 [slice_cell_reuse_recomputed_activation]: 5.21002e-06 [rewriter_after_opt_a]: 5.164e-05 [convert_after_rewriter]: 1.075e-05 [order_py_execute_after_rewriter]: 8.92999e-06 [mutable_eliminate]: 0.0005235 [opt_b]: 0.00035008, [1] [Cycle 1]: 0.00034078, [7] [b_1]: 0.00023368 [b_2]: 9.47001e-06 [updatestate_depend_eliminate]: 6.44999e-06 [updatestate_assign_eliminate]: 3.17002e-06 [updatestate_loads_eliminate]: 2.79999e-06 [renormalize]: 4.19997e-07 [cse]: 2.442e-05 [optimize_parallel_all_gather_comm]: 2.309e-05 [overlap_param_gather]: 4.55001e-06 [cconv]: 2.715e-05 [loop_unroll]: 0.00046251 [opt_after_cconv]: 0.00013809, [1] [Cycle 1]: 0.00012917, [7] [c_1]: 3.442e-05 [parameter_eliminate]: 2.44001e-06 [updatestate_depend_eliminate]: 6.63e-06 [updatestate_assign_eliminate]: 3.04999e-06 [updatestate_loads_eliminate]: 2.91e-06 [cse]: 2.329e-05 [renormalize]: 3.59985e-07 [remove_dup_value]: 2.047e-05 [tuple_transform]: 9.403e-05, [1] [Cycle 1]: 8.713e-05, [4] [d_1]: 4.736e-05 [none_parameter_eliminate]: 1.62001e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 8.43001e-06 [partial_unused_args_eliminate]: 4.55001e-06 [add_recomputation]: 6.268e-05 [cse_after_recomputation]: 3.339e-05, [1] [Cycle 1]: 2.65e-05, [1] [cse]: 1.731e-05 [environ_conv]: 9.90002e-06 [swap_dp_allreduce_reducescatter]: 9.23002e-06 [bias_add_comm_swap]: 4.77e-06 [label_micro_interleaved_index]: 6.98998e-06 [label_fine_grained_interleaved_index]: 5.17999e-06 [merge_cast_opt]: 4.1e-06 [slice_recompute_activation]: 4.23999e-06 [micro_interleaved_order_control]: 4.65999e-06 [assign_add_opt]: 3.91001e-06 [ForceFp32Comm]: 3.21001e-06 [remove_cast_before_assign_add]: 3.44001e-06 [full_micro_interleaved_order_control]: 4.75999e-06 [reorder_send_recv_between_fp_bp]: 5.59998e-06 [comm_op_add_attrs]: 3.38e-06 [add_comm_op_reuse_tag]: 3.23e-06 [interleave_split_concat_branches]: 3.46999e-06 [interleave_parallel_branches]: 3.35998e-06 [overlap_opt_shard_in_pipeline]: 3.42002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.3e-06 [control_data_broadcast_order]: 1.743e-05 [grouped_pairwise_exchange_alltoall]: 4.45e-06 [offloading_packed_experts]: 6.69001e-06 [overlap_recompute_and_grad_model_parallel]: 7.60998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.58e-06 [overlap_recompute_allgather_and_fa_grad]: 3.73001e-06 [overlap_recompute_comm]: 4.97999e-06 [overlap_grad_ring_attention]: 7.48999e-06 [overlap_grad_flash_sp]: 2.603e-05 [begin_end_overlap_inline]: 2.99999e-06 [split_matmul_comm_elemetwise]: 4.61002e-06 [split_layernorm_comm]: 4.33001e-06 [handle_group_info]: 3.43e-06 [symbol_engine_optimizer]: 0.00010575, [1] [Cycle 1]: 9.834e-05, [6] [build]: 2.99001e-06 [elim_shapecalc]: 1.202e-05 [elim_not_effective]: 1.549e-05 [opt_reshape]: 8.67998e-06 [fold_const_symbol]: 1.272e-05 [renormalize]: 2.09984e-07 [detach_backward]: 3.36999e-06 [pipeline_parallel_scheduler]: 1.87001e-06 [auto_monad_reorder]: 2.163e-05 [get_jit_bprop_graph]: 1.45999e-06 [rewriter_after_jit_bprop_graph]: 4.80001e-06 [opt_after_jit_grad]: 0.00048508 [validate]: 4.115e-05 Sums bootstrap : 0.000466s : 4.61% type_inference : 0.005014s : 49.60% event_method : 0.000012s : 0.12% auto_monad : 0.000056s : 0.56% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000026s : 0.26% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000021s : 0.21% optimize.rewriter_before_opt_a : 0.000055s : 0.54% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000035s : 0.34% optimize.opt_a.loop_unroll : 0.000022s : 0.22% optimize.opt_a.a_1 : 0.000509s : 5.03% optimize.opt_a.with_stream_mark : 0.000030s : 0.30% optimize.opt_a.recompute_prepare : 0.000017s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.08% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000245s : 2.42% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.15% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.15% optimize.opt_a.merge_send_recv : 0.000016s : 0.16% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.23% optimize.opt_a.flash_sp : 0.000011s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.17% optimize.opt_a.virtual_dataset : 0.000015s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.14% optimize.opt_a.virtual_output : 0.000014s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000020s : 0.20% optimize.opt_a.a_after_grad : 0.000022s : 0.22% optimize.opt_a.renormalize : 0.000518s : 5.12% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.24% optimize.opt_a.cse : 0.000057s : 0.56% optimize.opt_a.a_3 : 0.000127s : 1.25% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000052s : 0.51% optimize.convert_after_rewriter : 0.000011s : 0.11% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000523s : 5.18% optimize.opt_b.b_1 : 0.000234s : 2.31% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.23% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000027s : 0.27% optimize.loop_unroll : 0.000463s : 4.57% optimize.opt_after_cconv.c_1 : 0.000034s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.20% optimize.tuple_transform.d_1 : 0.000047s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000063s : 0.62% optimize.cse_after_recomputation.cse : 0.000017s : 0.17% optimize.environ_conv : 0.000010s : 0.10% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.09% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.06% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000026s : 0.26% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.09% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000022s : 0.21% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000485s : 4.80% validate : 0.000041s : 0.41% Time group info: ------[substitution.] 0.000170 29 11.35% : 0.000019s : 2: substitution.cast_eliminate 1.38% : 0.000002s : 3: substitution.elim_not_effective 1.02% : 0.000002s : 3: substitution.fold_const_symbol 3.71% : 0.000006s : 4: substitution.graph_param_transform 57.45% : 0.000098s : 2: substitution.inline 2.43% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.41% : 0.000006s : 6: substitution.remove_not_recompute_node 1.86% : 0.000003s : 2: substitution.replace_old_param 17.39% : 0.000030s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004969 2 91.05% : 0.004524s : 1: type_inference.infer 8.95% : 0.000445s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000096 2 100.00% : 0.000096s : 2: match.inline ------[predicate.] 0.000175 980 0.89% : 0.000002s : 9: predicate.accumulaten_eliminater 0.84% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.67% : 0.000001s : 8: predicate.addn_check_dump 0.82% : 0.000001s : 9: predicate.addn_zero_filter 0.74% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.24% : 0.000004s : 17: predicate.arithmetic_simplify 0.91% : 0.000002s : 9: predicate.cast_eliminate 0.78% : 0.000001s : 8: predicate.check_bprop_eliminate 0.68% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.74% : 0.000001s : 8: predicate.depend_value_elim 0.86% : 0.000002s : 9: predicate.dict_get_item_const_eliminator 0.92% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.76% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.17% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.58% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.06% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 13: predicate.environ_get_depend_swap 1.85% : 0.000003s : 21: predicate.environ_get_eliminate 1.07% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.88% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.69% : 0.000003s : 11: predicate.float_depend_g_call 0.66% : 0.000001s : 8: predicate.float_environ_get_switch 0.98% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.89% : 0.000002s : 8: predicate.get_grad_eliminate 0.26% : 0.000000s : 4: predicate.graph_param_transform 0.84% : 0.000001s : 8: predicate.incorporate_call 0.66% : 0.000001s : 8: predicate.incorporate_call_switch 6.20% : 0.000011s : 44: predicate.inline 1.03% : 0.000002s : 8: predicate.inline_without_move 0.39% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.98% : 0.000002s : 8: predicate.less_batch_normalization 1.80% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.12% : 0.000004s : 26: predicate.load_eliminater 1.22% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.51% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.88% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.72% : 0.000001s : 8: predicate.merge_addn 0.78% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.68% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.66% : 0.000001s : 9: predicate.minmaximum_grad 1.40% : 0.000002s : 4: predicate.mutable_eliminate 0.53% : 0.000001s : 4: predicate.opt_reshape 0.68% : 0.000001s : 4: predicate.parallel_virtual_node 1.10% : 0.000002s : 11: predicate.partial_defer_inline 1.28% : 0.000002s : 13: predicate.partial_eliminate 0.79% : 0.000001s : 9: predicate.print_const_string_wrapper 0.73% : 0.000001s : 8: predicate.reduce_all_const_elim 1.04% : 0.000002s : 9: predicate.reduce_eliminate 2.11% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 8: predicate.remove_not_recompute_node 1.13% : 0.000002s : 17: predicate.replace_applicator 0.57% : 0.000001s : 8: predicate.replace_old_param 0.33% : 0.000001s : 4: predicate.reset_defer_inline 1.10% : 0.000002s : 9: predicate.reshape_eliminate 0.78% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 4: predicate.row_tensor_eliminate 0.90% : 0.000002s : 8: predicate.same_eliminate 0.50% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.91% : 0.000002s : 8: predicate.shard_identity_eliminate 0.89% : 0.000002s : 8: predicate.special_op_eliminate 0.96% : 0.000002s : 8: predicate.specialize_transform 1.08% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.97% : 0.000002s : 11: predicate.switch_defer_inline 1.68% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.25% : 0.000007s : 39: predicate.switch_simplify 0.82% : 0.000001s : 9: predicate.tile_eliminate 0.78% : 0.000001s : 9: predicate.transpose_eliminate 1.63% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.79% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 2.98% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.44% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.36% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.60% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.16% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.98% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 2.37% : 0.000004s : 4: predicate.value_based_eliminate 0.96% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.80% : 0.000001s : 8: predicate.virtual_output_eliminate 0.33% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.55% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000208 5 8.01% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.99% : 0.000191s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025004 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.67% : 0.003167s : 1: add_attr 12.61% : 0.003153s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.27% : 0.000066s : 1: add_recomputation 0.03% : 0.000007s : 1: assign_add_opt 0.26% : 0.000065s : 1: auto_monad 0.12% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000007s : 1: bias_add_comm_swap 2.03% : 0.000508s : 1: bootstrap 0.12% : 0.000030s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.06% : 0.000014s : 1: convert_after_rewriter 0.15% : 0.000036s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000017s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.09% : 0.000022s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.87% : 0.000468s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.12% : 0.000530s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 3.88% : 0.000969s : 78: opt.transform.opt_a 0.13% : 0.000033s : 1: opt.transform.opt_after_cconv 0.11% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.67% : 0.000167s : 28: opt.transform.opt_b 0.21% : 0.000053s : 2: opt.transform.opt_trans_graph 0.18% : 0.000045s : 4: opt.transform.symbol_engine_opt 10.85% : 0.002714s : 1: opt_a 0.57% : 0.000142s : 1: opt_after_cconv 1.98% : 0.000495s : 1: opt_after_jit_grad 1.42% : 0.000354s : 1: opt_b 21.88% : 0.005470s : 1: optimize 0.11% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.12% : 0.000029s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000033s : 1: pre_auto_parallel 0.10% : 0.000025s : 1: py_interpret_to_execute 0.07% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000024s : 1: remove_dup_value 1.16% : 0.000289s : 1: renormalize.infer 0.88% : 0.000221s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.22% : 0.000055s : 1: rewriter_after_opt_a 0.23% : 0.000059s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.43% : 0.000109s : 1: symbol_engine_optimizer 0.39% : 0.000097s : 1: tuple_transform 20.16% : 0.005040s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:01.577.423 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0133911, [21] [bootstrap]: 0.00042661 [type_inference]: 0.0048869 [event_method]: 1.231e-05 [auto_monad]: 5.494e-05 [graph_reusing]: 5.14e-06 [inline]: 2.22001e-06 [add_attr]: 0.0030511, [1] [add_attr_with_inline]: 0.00304338, [1] [Cycle 1]: 5.052e-05, [2] [tag_attr]: 1.435e-05 [meta_addattr_fg_expand]: 3.95e-06 [parallel-infer-symbol]: 3.18e-06 [pre_auto_parallel]: 2.408e-05 [insert-virtual-dataset]: 2.29999e-06 [parallel-infer-symbol-second]: 6.99976e-07 [dataset_repeat_opt]: 1.99e-06 [pipeline_split]: 1.55999e-06 [optimize]: 0.00424632, [53] [py_interpret_to_execute]: 1.64e-05 [rewriter_before_opt_a]: 4.6e-05 [opt_a]: 0.00223066, [2] [Cycle 1]: 0.0015136, [45] [expand_dump_flag]: 3.16001e-06 [switch_simplify]: 2.71e-05 [loop_unroll]: 1.503e-05 [a_1]: 0.00033324 [with_stream_mark]: 1.409e-05 [recompute_prepare]: 9.25001e-06 [updatestate_depend_eliminate]: 4.47998e-06 [updatestate_assign_eliminate]: 4.11001e-06 [updatestate_loads_eliminate]: 3.65e-06 [parameter_eliminate]: 1.92999e-06 [a_2]: 9.712e-05 [accelerated_algorithm]: 7.87e-06 [shard]: 2.69001e-06 [meta_shard_fg_expand]: 2.41e-06 [shard_inline]: 7.68001e-06 [merge_send_recv]: 9.36998e-06 [auto_parallel]: 7.33e-06 [parallel]: 1.797e-05 [flash_sp]: 7.81001e-06 [merge_comm]: 4.42e-06 [allreduce_fusion]: 4.18999e-06 [matmul_add_comm_reduction]: 1.075e-05 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 8.94e-06 [virtual_dataset]: 7.16999e-06 [get_grad_eliminate_]: 7.26001e-06 [virtual_output]: 6.81999e-06 [merge_forward]: 4.27998e-06 [cell_reuse_recompute_pass]: 1.22e-06 [offload_activation]: 9.94001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.458e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.27e-05 [set_forward_comm_id_for_comm_node_pass]: 4.31002e-06 [meta_fg_expand]: 3.28e-06 [flash_sp_send_recv_attached]: 2.41e-06 [receive_attached]: 2.09e-06 [after_resolve]: 1.094e-05 [a_after_grad]: 1.089e-05 [renormalize]: 0.00045927 [add_forward_monad_depend]: 4.50001e-06 [auto_monad_grad]: 1.91003e-06 [auto_monad_eliminator]: 1.506e-05 [cse]: 3.645e-05 [a_3]: 5.397e-05 [Cycle 2]: 0.00070771, [45] [expand_dump_flag]: 9.5999e-07 [switch_simplify]: 8.70001e-06 [loop_unroll]: 7.08998e-06 [a_1]: 0.0001485 [with_stream_mark]: 1.144e-05 [recompute_prepare]: 7.43999e-06 [updatestate_depend_eliminate]: 4.14002e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 2.72001e-06 [parameter_eliminate]: 1.02e-06 [a_2]: 8.738e-05 [accelerated_algorithm]: 6.93e-06 [shard]: 1.14003e-06 [meta_shard_fg_expand]: 1.63002e-06 [shard_inline]: 7.43e-06 [merge_send_recv]: 5.84e-06 [auto_parallel]: 5.81e-06 [parallel]: 4.37e-06 [flash_sp]: 3.23e-06 [merge_comm]: 4.15e-06 [allreduce_fusion]: 3.86999e-06 [matmul_add_comm_reduction]: 6.66999e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 7.8e-06 [virtual_dataset]: 6.88e-06 [get_grad_eliminate_]: 6.71e-06 [virtual_output]: 6.49999e-06 [merge_forward]: 3.57002e-06 [cell_reuse_recompute_pass]: 1.44998e-06 [offload_activation]: 7.05e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.382e-05 [merge_recompute_call_nodes]: 7.29982e-07 [before_grad]: 1.133e-05 [set_forward_comm_id_for_comm_node_pass]: 4.25e-06 [meta_fg_expand]: 2.64999e-06 [flash_sp_send_recv_attached]: 8.99978e-07 [receive_attached]: 9.70002e-07 [after_resolve]: 9.52001e-06 [a_after_grad]: 9.84001e-06 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.05999e-06 [auto_monad_grad]: 7.80012e-07 [auto_monad_eliminator]: 8.58001e-06 [cse]: 1.779e-05 [a_3]: 4.298e-05 [py_interpret_to_execute_after_opt_a]: 8.64e-06 [slice_cell_reuse_recomputed_activation]: 1.80001e-06 [rewriter_after_opt_a]: 3.965e-05 [convert_after_rewriter]: 7.06001e-06 [order_py_execute_after_rewriter]: 5.74e-06 [mutable_eliminate]: 0.00047055 [opt_b]: 0.00026688, [1] [Cycle 1]: 0.00026088, [7] [b_1]: 0.00017869 [b_2]: 9.36002e-06 [updatestate_depend_eliminate]: 6.10002e-06 [updatestate_assign_eliminate]: 2.98998e-06 [updatestate_loads_eliminate]: 2.67001e-06 [renormalize]: 4.30009e-07 [cse]: 2.305e-05 [optimize_parallel_all_gather_comm]: 1.719e-05 [overlap_param_gather]: 2.14e-06 [cconv]: 2.425e-05 [loop_unroll]: 0.00042452 [opt_after_cconv]: 0.00011028, [1] [Cycle 1]: 0.00010496, [7] [c_1]: 3.278e-05 [parameter_eliminate]: 2.39001e-06 [updatestate_depend_eliminate]: 6.73998e-06 [updatestate_assign_eliminate]: 3.04999e-06 [updatestate_loads_eliminate]: 2.79001e-06 [cse]: 2.256e-05 [renormalize]: 3.4002e-07 [remove_dup_value]: 1.685e-05 [tuple_transform]: 7.911e-05, [1] [Cycle 1]: 7.493e-05, [4] [d_1]: 4.656e-05 [none_parameter_eliminate]: 1.97999e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.15e-06 [partial_unused_args_eliminate]: 1.84998e-06 [add_recomputation]: 5.469e-05 [cse_after_recomputation]: 2.496e-05, [1] [Cycle 1]: 2.087e-05, [1] [cse]: 1.541e-05 [environ_conv]: 5.93002e-06 [swap_dp_allreduce_reducescatter]: 5.87999e-06 [bias_add_comm_swap]: 2.88e-06 [label_micro_interleaved_index]: 4.3e-06 [label_fine_grained_interleaved_index]: 2.73e-06 [merge_cast_opt]: 1.30001e-06 [slice_recompute_activation]: 2.06e-06 [micro_interleaved_order_control]: 2.32999e-06 [assign_add_opt]: 1.16002e-06 [ForceFp32Comm]: 7.79983e-07 [remove_cast_before_assign_add]: 1.00001e-06 [full_micro_interleaved_order_control]: 2.01e-06 [reorder_send_recv_between_fp_bp]: 2.66999e-06 [comm_op_add_attrs]: 9.79984e-07 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.33002e-06 [overlap_opt_shard_in_pipeline]: 1.07e-06 [overlap_opt_shard_grad_in_pipeline]: 1.85001e-06 [control_data_broadcast_order]: 1.428e-05 [grouped_pairwise_exchange_alltoall]: 1.53002e-06 [offloading_packed_experts]: 4.43001e-06 [overlap_recompute_and_grad_model_parallel]: 4.85999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.66002e-06 [overlap_recompute_comm]: 2.24001e-06 [overlap_grad_ring_attention]: 4.70001e-06 [overlap_grad_flash_sp]: 2.038e-05 [begin_end_overlap_inline]: 4.80009e-07 [split_matmul_comm_elemetwise]: 2.22999e-06 [split_layernorm_comm]: 1.50999e-06 [handle_group_info]: 9.00007e-07 [symbol_engine_optimizer]: 8.266e-05, [1] [Cycle 1]: 7.841e-05, [6] [build]: 3.01999e-06 [elim_shapecalc]: 1.058e-05 [elim_not_effective]: 1.541e-05 [opt_reshape]: 8.14002e-06 [fold_const_symbol]: 1.215e-05 [renormalize]: 2.00002e-07 [detach_backward]: 1.64998e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 1.887e-05 [get_jit_bprop_graph]: 1.14e-06 [rewriter_after_jit_bprop_graph]: 4.01001e-06 [opt_after_jit_grad]: 0.00046094 [validate]: 3.992e-05 Sums bootstrap : 0.000427s : 4.54% type_inference : 0.004887s : 51.99% event_method : 0.000012s : 0.13% auto_monad : 0.000055s : 0.58% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000014s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000024s : 0.26% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000016s : 0.17% optimize.rewriter_before_opt_a : 0.000046s : 0.49% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000036s : 0.38% optimize.opt_a.loop_unroll : 0.000022s : 0.24% optimize.opt_a.a_1 : 0.000482s : 5.13% optimize.opt_a.with_stream_mark : 0.000026s : 0.27% optimize.opt_a.recompute_prepare : 0.000017s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000185s : 1.96% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.16% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.16% optimize.opt_a.merge_send_recv : 0.000015s : 0.16% optimize.opt_a.auto_parallel : 0.000013s : 0.14% optimize.opt_a.parallel : 0.000022s : 0.24% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.19% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.18% optimize.opt_a.virtual_dataset : 0.000014s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.15% optimize.opt_a.virtual_output : 0.000013s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.26% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000020s : 0.22% optimize.opt_a.a_after_grad : 0.000021s : 0.22% optimize.opt_a.renormalize : 0.000459s : 4.89% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.25% optimize.opt_a.cse : 0.000054s : 0.58% optimize.opt_a.a_3 : 0.000097s : 1.03% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.42% optimize.convert_after_rewriter : 0.000007s : 0.08% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000471s : 5.01% optimize.opt_b.b_1 : 0.000179s : 1.90% optimize.opt_b.b_2 : 0.000009s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.26% optimize.loop_unroll : 0.000425s : 4.52% optimize.opt_after_cconv.c_1 : 0.000033s : 0.35% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.24% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.18% optimize.tuple_transform.d_1 : 0.000047s : 0.50% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.09% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000055s : 0.58% optimize.cse_after_recomputation.cse : 0.000015s : 0.16% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.02% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000020s : 0.22% optimize.begin_end_overlap_inline : 0.000000s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.16% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.09% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000019s : 0.20% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000461s : 4.90% validate : 0.000040s : 0.42% Time group info: ------[substitution.] 0.000157 29 10.52% : 0.000017s : 2: substitution.cast_eliminate 1.54% : 0.000002s : 3: substitution.elim_not_effective 1.10% : 0.000002s : 3: substitution.fold_const_symbol 3.78% : 0.000006s : 4: substitution.graph_param_transform 57.67% : 0.000091s : 2: substitution.inline 2.53% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.72% : 0.000006s : 6: substitution.remove_not_recompute_node 1.65% : 0.000003s : 2: substitution.replace_old_param 17.48% : 0.000028s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004843 2 90.81% : 0.004398s : 1: type_inference.infer 9.19% : 0.000445s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000089 2 100.00% : 0.000089s : 2: match.inline ------[predicate.] 0.000165 980 0.81% : 0.000001s : 9: predicate.accumulaten_eliminater 0.97% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.68% : 0.000001s : 8: predicate.addn_check_dump 0.93% : 0.000002s : 9: predicate.addn_zero_filter 0.75% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.38% : 0.000004s : 17: predicate.arithmetic_simplify 0.90% : 0.000001s : 9: predicate.cast_eliminate 0.82% : 0.000001s : 8: predicate.check_bprop_eliminate 0.70% : 0.000001s : 8: predicate.compare_switch_simplify 0.22% : 0.000000s : 4: predicate.const_output_eliminate 0.87% : 0.000001s : 8: predicate.depend_value_elim 0.87% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.74% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.20% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.32% : 0.000001s : 4: predicate.elim_not_effective 0.44% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.13% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 13: predicate.environ_get_depend_swap 1.92% : 0.000003s : 21: predicate.environ_get_eliminate 1.06% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.95% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.81% : 0.000003s : 11: predicate.float_depend_g_call 0.73% : 0.000001s : 8: predicate.float_environ_get_switch 1.00% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 4: predicate.fold_const_symbol 0.82% : 0.000001s : 8: predicate.get_grad_eliminate 0.27% : 0.000000s : 4: predicate.graph_param_transform 0.84% : 0.000001s : 8: predicate.incorporate_call 0.70% : 0.000001s : 8: predicate.incorporate_call_switch 6.07% : 0.000010s : 44: predicate.inline 0.95% : 0.000002s : 8: predicate.inline_without_move 0.37% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.01% : 0.000002s : 8: predicate.less_batch_normalization 1.59% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.23% : 0.000004s : 26: predicate.load_eliminater 1.06% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.43% : 0.000002s : 16: predicate.loop_unroll_before_grad 1.83% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.74% : 0.000001s : 8: predicate.merge_addn 0.68% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.71% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.71% : 0.000001s : 9: predicate.minmaximum_grad 1.29% : 0.000002s : 4: predicate.mutable_eliminate 0.47% : 0.000001s : 4: predicate.opt_reshape 0.50% : 0.000001s : 4: predicate.parallel_virtual_node 1.22% : 0.000002s : 11: predicate.partial_defer_inline 1.31% : 0.000002s : 13: predicate.partial_eliminate 0.99% : 0.000002s : 9: predicate.print_const_string_wrapper 0.72% : 0.000001s : 8: predicate.reduce_all_const_elim 0.99% : 0.000002s : 9: predicate.reduce_eliminate 2.25% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000001s : 8: predicate.remove_not_recompute_node 1.18% : 0.000002s : 17: predicate.replace_applicator 0.65% : 0.000001s : 8: predicate.replace_old_param 0.34% : 0.000001s : 4: predicate.reset_defer_inline 0.94% : 0.000002s : 9: predicate.reshape_eliminate 0.79% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.66% : 0.000001s : 4: predicate.row_tensor_eliminate 1.00% : 0.000002s : 8: predicate.same_eliminate 0.56% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.00% : 0.000002s : 8: predicate.shard_identity_eliminate 0.84% : 0.000001s : 8: predicate.special_op_eliminate 1.02% : 0.000002s : 8: predicate.specialize_transform 1.02% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.93% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.46% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.02% : 0.000002s : 11: predicate.switch_defer_inline 1.73% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.20% : 0.000007s : 39: predicate.switch_simplify 0.80% : 0.000001s : 9: predicate.tile_eliminate 0.93% : 0.000002s : 9: predicate.transpose_eliminate 1.78% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.06% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.62% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.63% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.47% : 0.000002s : 17: predicate.tuple_to_list_eliminator_ 2.13% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 3.05% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.68% : 0.000001s : 4: predicate.value_based_eliminate 0.85% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.81% : 0.000001s : 8: predicate.virtual_output_eliminate 0.38% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.50% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000198 5 8.43% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.57% : 0.000181s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022296 192 0.02% : 0.000003s : 1: ForceFp32Comm 13.70% : 0.003056s : 1: add_attr 13.66% : 0.003047s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.26% : 0.000059s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000061s : 1: auto_monad 0.10% : 0.000022s : 1: auto_monad_reorder 0.02% : 0.000003s : 1: begin_end_overlap_inline 0.03% : 0.000006s : 1: bias_add_comm_swap 2.04% : 0.000454s : 1: bootstrap 0.12% : 0.000028s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000017s : 1: control_data_broadcast_order 0.05% : 0.000010s : 1: convert_after_rewriter 0.12% : 0.000028s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.09% : 0.000019s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000008s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.94% : 0.000432s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.15% : 0.000479s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 4.16% : 0.000928s : 78: opt.transform.opt_a 0.14% : 0.000031s : 1: opt.transform.opt_after_cconv 0.12% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.70% : 0.000156s : 28: opt.transform.opt_b 0.23% : 0.000052s : 2: opt.transform.opt_trans_graph 0.19% : 0.000043s : 4: opt.transform.symbol_engine_opt 10.02% : 0.002234s : 1: opt_a 0.51% : 0.000114s : 1: opt_after_cconv 2.11% : 0.000470s : 1: opt_after_jit_grad 1.21% : 0.000270s : 1: opt_b 19.06% : 0.004251s : 1: optimize 0.09% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.11% : 0.000024s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000028s : 1: pre_auto_parallel 0.09% : 0.000020s : 1: py_interpret_to_execute 0.05% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000020s : 1: remove_dup_value 1.14% : 0.000254s : 1: renormalize.infer 0.89% : 0.000198s : 1: renormalize.specialize 0.03% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000044s : 1: rewriter_after_opt_a 0.22% : 0.000050s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000085s : 1: symbol_engine_optimizer 0.37% : 0.000082s : 1: tuple_transform 21.99% : 0.004902s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:01.766.807 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:01.767.078 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0146249, [21] [bootstrap]: 0.00042674 [type_inference]: 0.00496304 [event_method]: 1.258e-05 [auto_monad]: 5.659e-05 [graph_reusing]: 5.07e-06 [inline]: 2.00002e-06 [add_attr]: 0.00302524, [1] [add_attr_with_inline]: 0.00301751, [1] [Cycle 1]: 6.435e-05, [2] [tag_attr]: 1.47e-05 [meta_addattr_fg_expand]: 4.45999e-06 [parallel-infer-symbol]: 2.86999e-06 [pre_auto_parallel]: 2.337e-05 [insert-virtual-dataset]: 2.23998e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.92999e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.00500417, [53] [py_interpret_to_execute]: 1.931e-05 [rewriter_before_opt_a]: 5.059e-05 [opt_a]: 0.00268401, [2] [Cycle 1]: 0.00178182, [45] [expand_dump_flag]: 2.59001e-06 [switch_simplify]: 2.516e-05 [loop_unroll]: 1.53e-05 [a_1]: 0.0003645 [with_stream_mark]: 1.373e-05 [recompute_prepare]: 1.004e-05 [updatestate_depend_eliminate]: 4.33999e-06 [updatestate_assign_eliminate]: 3.91001e-06 [updatestate_loads_eliminate]: 3.22002e-06 [parameter_eliminate]: 2.02001e-06 [a_2]: 0.00012537 [accelerated_algorithm]: 7.78001e-06 [shard]: 2.36e-06 [meta_shard_fg_expand]: 2.31e-06 [shard_inline]: 8.05e-06 [merge_send_recv]: 9.16002e-06 [auto_parallel]: 6.94999e-06 [parallel]: 1.862e-05 [flash_sp]: 8.33999e-06 [merge_comm]: 4.55001e-06 [allreduce_fusion]: 4.2e-06 [matmul_add_comm_reduction]: 9.72999e-06 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 9.27001e-06 [virtual_dataset]: 7.13998e-06 [get_grad_eliminate_]: 7.3e-06 [virtual_output]: 7e-06 [merge_forward]: 4.31002e-06 [cell_reuse_recompute_pass]: 1.59998e-06 [offload_activation]: 9.82001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.688e-05 [merge_recompute_call_nodes]: 1.40999e-06 [before_grad]: 1.223e-05 [set_forward_comm_id_for_comm_node_pass]: 4.51002e-06 [meta_fg_expand]: 3.18e-06 [flash_sp_send_recv_attached]: 2.86999e-06 [receive_attached]: 2.05002e-06 [after_resolve]: 1.035e-05 [a_after_grad]: 1.071e-05 [renormalize]: 0.00051942 [add_forward_monad_depend]: 5.33002e-06 [auto_monad_grad]: 1.82001e-06 [auto_monad_eliminator]: 1.619e-05 [cse]: 3.586e-05 [a_3]: 6.646e-05 [Cycle 2]: 0.00088955, [45] [expand_dump_flag]: 1.04e-06 [switch_simplify]: 8.45001e-06 [loop_unroll]: 7.31999e-06 [a_1]: 0.00015046 [with_stream_mark]: 1.148e-05 [recompute_prepare]: 7.53999e-06 [updatestate_depend_eliminate]: 4.49002e-06 [updatestate_assign_eliminate]: 3.09001e-06 [updatestate_loads_eliminate]: 2.98e-06 [parameter_eliminate]: 1.13001e-06 [a_2]: 0.00011401 [accelerated_algorithm]: 7.48e-06 [shard]: 1.09e-06 [meta_shard_fg_expand]: 1.66e-06 [shard_inline]: 7.28e-06 [merge_send_recv]: 5.46e-06 [auto_parallel]: 6.07999e-06 [parallel]: 4.51002e-06 [flash_sp]: 3.43999e-06 [merge_comm]: 4.47e-06 [allreduce_fusion]: 3.98001e-06 [matmul_add_comm_reduction]: 6.58e-06 [allreduce_slice_to_reducescatter]: 3.89991e-07 [virtual_shard_identity]: 8.74e-06 [virtual_dataset]: 7.23999e-06 [get_grad_eliminate_]: 6.80998e-06 [virtual_output]: 7.09001e-06 [merge_forward]: 3.26001e-06 [cell_reuse_recompute_pass]: 1.45999e-06 [offload_activation]: 7.11999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.613e-05 [merge_recompute_call_nodes]: 7.39994e-07 [before_grad]: 1.224e-05 [set_forward_comm_id_for_comm_node_pass]: 4.70001e-06 [meta_fg_expand]: 2.85002e-06 [flash_sp_send_recv_attached]: 9.00007e-07 [receive_attached]: 1.09e-06 [after_resolve]: 9.44e-06 [a_after_grad]: 1.038e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.29e-06 [auto_monad_grad]: 9.89996e-07 [auto_monad_eliminator]: 8.74e-06 [cse]: 1.875e-05 [a_3]: 5.633e-05 [py_interpret_to_execute_after_opt_a]: 1.199e-05 [slice_cell_reuse_recomputed_activation]: 4.70001e-06 [rewriter_after_opt_a]: 4.325e-05 [convert_after_rewriter]: 1.03e-05 [order_py_execute_after_rewriter]: 8.92999e-06 [mutable_eliminate]: 0.00047599 [opt_b]: 0.00034489, [1] [Cycle 1]: 0.00033638, [7] [b_1]: 0.00023233 [b_2]: 9.75002e-06 [updatestate_depend_eliminate]: 5.95002e-06 [updatestate_assign_eliminate]: 3.21001e-06 [updatestate_loads_eliminate]: 3.01001e-06 [renormalize]: 3.80009e-07 [cse]: 2.291e-05 [optimize_parallel_all_gather_comm]: 2.002e-05 [overlap_param_gather]: 4.68999e-06 [cconv]: 2.535e-05 [loop_unroll]: 0.00046364 [opt_after_cconv]: 0.00013809, [1] [Cycle 1]: 0.00012934, [7] [c_1]: 3.558e-05 [parameter_eliminate]: 2.67001e-06 [updatestate_depend_eliminate]: 6.12001e-06 [updatestate_assign_eliminate]: 3.09001e-06 [updatestate_loads_eliminate]: 3.25998e-06 [cse]: 2.294e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.938e-05 [tuple_transform]: 9.552e-05, [1] [Cycle 1]: 8.877e-05, [4] [d_1]: 4.857e-05 [none_parameter_eliminate]: 1.58002e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 8.45999e-06 [partial_unused_args_eliminate]: 4.90999e-06 [add_recomputation]: 5.577e-05 [cse_after_recomputation]: 3.168e-05, [1] [Cycle 1]: 2.496e-05, [1] [cse]: 1.604e-05 [environ_conv]: 9.07001e-06 [swap_dp_allreduce_reducescatter]: 8.85999e-06 [bias_add_comm_swap]: 4.68001e-06 [label_micro_interleaved_index]: 6.96999e-06 [label_fine_grained_interleaved_index]: 5.29e-06 [merge_cast_opt]: 4.17e-06 [slice_recompute_activation]: 4.58999e-06 [micro_interleaved_order_control]: 4.53999e-06 [assign_add_opt]: 3.97002e-06 [ForceFp32Comm]: 3.09999e-06 [remove_cast_before_assign_add]: 3.68999e-06 [full_micro_interleaved_order_control]: 4.60001e-06 [reorder_send_recv_between_fp_bp]: 5.04e-06 [comm_op_add_attrs]: 3.71999e-06 [add_comm_op_reuse_tag]: 3.24001e-06 [interleave_split_concat_branches]: 3.45e-06 [interleave_parallel_branches]: 3.33998e-06 [overlap_opt_shard_in_pipeline]: 3.4e-06 [overlap_opt_shard_grad_in_pipeline]: 4.08999e-06 [control_data_broadcast_order]: 1.716e-05 [grouped_pairwise_exchange_alltoall]: 3.92998e-06 [offloading_packed_experts]: 7.17002e-06 [overlap_recompute_and_grad_model_parallel]: 7.96001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.78001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.68e-06 [overlap_recompute_comm]: 4.85999e-06 [overlap_grad_ring_attention]: 6.83e-06 [overlap_grad_flash_sp]: 2.265e-05 [begin_end_overlap_inline]: 2.86e-06 [split_matmul_comm_elemetwise]: 4.49002e-06 [split_layernorm_comm]: 3.97e-06 [handle_group_info]: 3.56001e-06 [symbol_engine_optimizer]: 0.00010144, [1] [Cycle 1]: 9.454e-05, [6] [build]: 3.04001e-06 [elim_shapecalc]: 1.096e-05 [elim_not_effective]: 1.469e-05 [opt_reshape]: 8.01001e-06 [fold_const_symbol]: 1.242e-05 [renormalize]: 2.89991e-07 [detach_backward]: 2.94001e-06 [pipeline_parallel_scheduler]: 1.66e-06 [auto_monad_reorder]: 2.129e-05 [get_jit_bprop_graph]: 1.33002e-06 [rewriter_after_jit_bprop_graph]: 4.10998e-06 [opt_after_jit_grad]: 0.00047594 [validate]: 3.812e-05 Sums bootstrap : 0.000427s : 4.31% type_inference : 0.004963s : 50.13% event_method : 0.000013s : 0.13% auto_monad : 0.000057s : 0.57% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000023s : 0.24% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000019s : 0.20% optimize.rewriter_before_opt_a : 0.000051s : 0.51% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000034s : 0.34% optimize.opt_a.loop_unroll : 0.000023s : 0.23% optimize.opt_a.a_1 : 0.000515s : 5.20% optimize.opt_a.with_stream_mark : 0.000025s : 0.25% optimize.opt_a.recompute_prepare : 0.000018s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000239s : 2.42% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.15% optimize.opt_a.merge_send_recv : 0.000015s : 0.15% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.23% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.18% optimize.opt_a.virtual_dataset : 0.000014s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.14% optimize.opt_a.virtual_output : 0.000014s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000020s : 0.20% optimize.opt_a.a_after_grad : 0.000021s : 0.21% optimize.opt_a.renormalize : 0.000519s : 5.25% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.25% optimize.opt_a.cse : 0.000055s : 0.55% optimize.opt_a.a_3 : 0.000123s : 1.24% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000043s : 0.44% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000476s : 4.81% optimize.opt_b.b_1 : 0.000232s : 2.35% optimize.opt_b.b_2 : 0.000010s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000025s : 0.26% optimize.loop_unroll : 0.000464s : 4.68% optimize.opt_after_cconv.c_1 : 0.000036s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.20% optimize.tuple_transform.d_1 : 0.000049s : 0.49% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.09% optimize.partial_unused_args_eliminate : 0.000005s : 0.05% optimize.add_recomputation : 0.000056s : 0.56% optimize.cse_after_recomputation.cse : 0.000016s : 0.16% optimize.environ_conv : 0.000009s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.09% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000023s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.04% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000021s : 0.22% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000476s : 4.81% validate : 0.000038s : 0.39% Time group info: ------[substitution.] 0.000162 29 10.71% : 0.000017s : 2: substitution.cast_eliminate 1.36% : 0.000002s : 3: substitution.elim_not_effective 1.05% : 0.000002s : 3: substitution.fold_const_symbol 3.61% : 0.000006s : 4: substitution.graph_param_transform 57.30% : 0.000093s : 2: substitution.inline 2.37% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.81% : 0.000006s : 6: substitution.remove_not_recompute_node 1.70% : 0.000003s : 2: substitution.replace_old_param 18.10% : 0.000029s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004919 2 90.89% : 0.004471s : 1: type_inference.infer 9.11% : 0.000448s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000091 2 100.00% : 0.000091s : 2: match.inline ------[predicate.] 0.000195 980 0.70% : 0.000001s : 9: predicate.accumulaten_eliminater 0.83% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.59% : 0.000001s : 8: predicate.addn_check_dump 0.69% : 0.000001s : 9: predicate.addn_zero_filter 0.58% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 1.95% : 0.000004s : 17: predicate.arithmetic_simplify 0.83% : 0.000002s : 9: predicate.cast_eliminate 0.71% : 0.000001s : 8: predicate.check_bprop_eliminate 0.62% : 0.000001s : 8: predicate.compare_switch_simplify 0.22% : 0.000000s : 4: predicate.const_output_eliminate 0.66% : 0.000001s : 8: predicate.depend_value_elim 0.70% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.83% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.66% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.05% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.41% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000002s : 13: predicate.environ_add_const_eliminate 0.91% : 0.000002s : 13: predicate.environ_get_add_eliminate 14.08% : 0.000027s : 13: predicate.environ_get_depend_swap 1.51% : 0.000003s : 21: predicate.environ_get_eliminate 0.95% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.82% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.66% : 0.000003s : 11: predicate.float_depend_g_call 0.60% : 0.000001s : 8: predicate.float_environ_get_switch 0.87% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.69% : 0.000001s : 8: predicate.get_grad_eliminate 0.25% : 0.000000s : 4: predicate.graph_param_transform 0.71% : 0.000001s : 8: predicate.incorporate_call 0.58% : 0.000001s : 8: predicate.incorporate_call_switch 5.44% : 0.000011s : 44: predicate.inline 0.89% : 0.000002s : 8: predicate.inline_without_move 0.33% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.90% : 0.000002s : 8: predicate.less_batch_normalization 1.43% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 1.95% : 0.000004s : 26: predicate.load_eliminater 0.93% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.34% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.48% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.62% : 0.000001s : 8: predicate.merge_addn 0.61% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.65% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.60% : 0.000001s : 9: predicate.minmaximum_grad 1.08% : 0.000002s : 4: predicate.mutable_eliminate 0.41% : 0.000001s : 4: predicate.opt_reshape 0.43% : 0.000001s : 4: predicate.parallel_virtual_node 1.25% : 0.000002s : 11: predicate.partial_defer_inline 1.14% : 0.000002s : 13: predicate.partial_eliminate 0.67% : 0.000001s : 9: predicate.print_const_string_wrapper 0.64% : 0.000001s : 8: predicate.reduce_all_const_elim 0.92% : 0.000002s : 9: predicate.reduce_eliminate 1.95% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 8: predicate.remove_not_recompute_node 0.95% : 0.000002s : 17: predicate.replace_applicator 0.58% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 0.80% : 0.000002s : 9: predicate.reshape_eliminate 0.70% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 4: predicate.row_tensor_eliminate 0.80% : 0.000002s : 8: predicate.same_eliminate 0.46% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.98% : 0.000002s : 8: predicate.shard_identity_eliminate 0.92% : 0.000002s : 8: predicate.special_op_eliminate 0.81% : 0.000002s : 8: predicate.specialize_transform 0.94% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.82% : 0.000002s : 11: predicate.switch_defer_inline 1.51% : 0.000003s : 19: predicate.switch_layer_defer_inline 3.54% : 0.000007s : 39: predicate.switch_simplify 0.69% : 0.000001s : 9: predicate.tile_eliminate 0.74% : 0.000001s : 9: predicate.transpose_eliminate 1.34% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.49% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.33% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 2.51% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.40% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.34% : 0.000005s : 25: predicate.tuple_list_set_item_eliminator 1.40% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 1.97% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.70% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.60% : 0.000001s : 4: predicate.value_based_eliminate 0.66% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.95% : 0.000002s : 8: predicate.virtual_output_eliminate 0.34% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.51% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000204 5 7.74% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.26% : 0.000188s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024371 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.45% : 0.003033s : 1: add_attr 12.40% : 0.003021s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.24% : 0.000059s : 1: add_recomputation 0.03% : 0.000007s : 1: assign_add_opt 0.27% : 0.000065s : 1: auto_monad 0.12% : 0.000028s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000007s : 1: bias_add_comm_swap 1.94% : 0.000473s : 1: bootstrap 0.12% : 0.000028s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.14% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000018s : 1: detach_backward 0.05% : 0.000012s : 1: environ_conv 0.09% : 0.000022s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.05% : 0.000011s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.03% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.93% : 0.000470s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.98% : 0.000482s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 3.98% : 0.000969s : 78: opt.transform.opt_a 0.14% : 0.000034s : 1: opt.transform.opt_after_cconv 0.11% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.67% : 0.000162s : 28: opt.transform.opt_b 0.22% : 0.000054s : 2: opt.transform.opt_trans_graph 0.17% : 0.000043s : 4: opt.transform.symbol_engine_opt 11.03% : 0.002687s : 1: opt_a 0.58% : 0.000142s : 1: opt_after_cconv 1.99% : 0.000486s : 1: opt_after_jit_grad 1.43% : 0.000349s : 1: opt_b 21.88% : 0.005333s : 1: optimize 0.10% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000026s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.13% : 0.000030s : 1: pre_auto_parallel 0.09% : 0.000023s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000023s : 1: remove_dup_value 1.25% : 0.000305s : 1: renormalize.infer 0.85% : 0.000207s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000047s : 1: rewriter_after_opt_a 0.22% : 0.000054s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.43% : 0.000104s : 1: symbol_engine_optimizer 0.40% : 0.000098s : 1: tuple_transform 20.46% : 0.004986s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:01.955.801 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0134649, [21] [bootstrap]: 0.00042645 [type_inference]: 0.00490098 [event_method]: 1.252e-05 [auto_monad]: 5.389e-05 [graph_reusing]: 5.77001e-06 [inline]: 2.26e-06 [add_attr]: 0.00305795, [1] [add_attr_with_inline]: 0.00304946, [1] [Cycle 1]: 5.272e-05, [2] [tag_attr]: 1.455e-05 [meta_addattr_fg_expand]: 3.55e-06 [parallel-infer-symbol]: 2.97002e-06 [pre_auto_parallel]: 2.285e-05 [insert-virtual-dataset]: 2.41e-06 [parallel-infer-symbol-second]: 7.80012e-07 [dataset_repeat_opt]: 1.71e-06 [pipeline_split]: 1.55001e-06 [optimize]: 0.00430555, [53] [py_interpret_to_execute]: 1.729e-05 [rewriter_before_opt_a]: 4.795e-05 [opt_a]: 0.00227785, [2] [Cycle 1]: 0.0015407, [45] [expand_dump_flag]: 2.88e-06 [switch_simplify]: 2.63e-05 [loop_unroll]: 1.431e-05 [a_1]: 0.00032906 [with_stream_mark]: 1.429e-05 [recompute_prepare]: 9.22999e-06 [updatestate_depend_eliminate]: 4.82e-06 [updatestate_assign_eliminate]: 4.01001e-06 [updatestate_loads_eliminate]: 3.56001e-06 [parameter_eliminate]: 2.04999e-06 [a_2]: 9.672e-05 [accelerated_algorithm]: 8.08999e-06 [shard]: 2.06e-06 [meta_shard_fg_expand]: 2.01e-06 [shard_inline]: 7.51001e-06 [merge_send_recv]: 8.89e-06 [auto_parallel]: 6.94999e-06 [parallel]: 1.78e-05 [flash_sp]: 7.78999e-06 [merge_comm]: 5.44e-06 [allreduce_fusion]: 4.19002e-06 [matmul_add_comm_reduction]: 1.006e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 9.20999e-06 [virtual_dataset]: 7.46001e-06 [get_grad_eliminate_]: 7.13998e-06 [virtual_output]: 7.68001e-06 [merge_forward]: 4.3e-06 [cell_reuse_recompute_pass]: 1.28002e-06 [offload_activation]: 1.086e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.462e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 1.319e-05 [set_forward_comm_id_for_comm_node_pass]: 4.32998e-06 [meta_fg_expand]: 3.14001e-06 [flash_sp_send_recv_attached]: 2.51998e-06 [receive_attached]: 1.99e-06 [after_resolve]: 1.074e-05 [a_after_grad]: 1.103e-05 [renormalize]: 0.00048973 [add_forward_monad_depend]: 4.63001e-06 [auto_monad_grad]: 1.60001e-06 [auto_monad_eliminator]: 1.573e-05 [cse]: 3.684e-05 [a_3]: 5.314e-05 [Cycle 2]: 0.00072815, [45] [expand_dump_flag]: 9.70002e-07 [switch_simplify]: 8.07e-06 [loop_unroll]: 7.12997e-06 [a_1]: 0.00017215 [with_stream_mark]: 1.128e-05 [recompute_prepare]: 7.45e-06 [updatestate_depend_eliminate]: 4.13001e-06 [updatestate_assign_eliminate]: 3.17002e-06 [updatestate_loads_eliminate]: 2.76e-06 [parameter_eliminate]: 1.00001e-06 [a_2]: 8.643e-05 [accelerated_algorithm]: 7.06001e-06 [shard]: 1.13001e-06 [meta_shard_fg_expand]: 1.59e-06 [shard_inline]: 7.36001e-06 [merge_send_recv]: 6.11998e-06 [auto_parallel]: 5.90002e-06 [parallel]: 4.37e-06 [flash_sp]: 3.33e-06 [merge_comm]: 4.19002e-06 [allreduce_fusion]: 3.9e-06 [matmul_add_comm_reduction]: 6.31998e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 7.66999e-06 [virtual_dataset]: 6.67002e-06 [get_grad_eliminate_]: 6.64001e-06 [virtual_output]: 6.29001e-06 [merge_forward]: 3.26999e-06 [cell_reuse_recompute_pass]: 1.43002e-06 [offload_activation]: 6.88e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.359e-05 [merge_recompute_call_nodes]: 7.10017e-07 [before_grad]: 1.126e-05 [set_forward_comm_id_for_comm_node_pass]: 4.78001e-06 [meta_fg_expand]: 2.70002e-06 [flash_sp_send_recv_attached]: 8.29983e-07 [receive_attached]: 1.03001e-06 [after_resolve]: 9.37001e-06 [a_after_grad]: 9.69e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.25999e-06 [auto_monad_grad]: 8.29983e-07 [auto_monad_eliminator]: 8.18001e-06 [cse]: 1.892e-05 [a_3]: 4.259e-05 [py_interpret_to_execute_after_opt_a]: 8.80001e-06 [slice_cell_reuse_recomputed_activation]: 2.20002e-06 [rewriter_after_opt_a]: 4.02e-05 [convert_after_rewriter]: 7.44002e-06 [order_py_execute_after_rewriter]: 5.92999e-06 [mutable_eliminate]: 0.00046341 [opt_b]: 0.00027281, [1] [Cycle 1]: 0.00026661, [7] [b_1]: 0.00017951 [b_2]: 9.14e-06 [updatestate_depend_eliminate]: 6.48e-06 [updatestate_assign_eliminate]: 3.16999e-06 [updatestate_loads_eliminate]: 2.96001e-06 [renormalize]: 5.40022e-07 [cse]: 2.321e-05 [optimize_parallel_all_gather_comm]: 1.72e-05 [overlap_param_gather]: 1.88002e-06 [cconv]: 2.399e-05 [loop_unroll]: 0.00042634 [opt_after_cconv]: 0.00011127, [1] [Cycle 1]: 0.00010576, [7] [c_1]: 3.314e-05 [parameter_eliminate]: 2.24001e-06 [updatestate_depend_eliminate]: 5.91e-06 [updatestate_assign_eliminate]: 3.35998e-06 [updatestate_loads_eliminate]: 3.38999e-06 [cse]: 2.283e-05 [renormalize]: 4.39992e-07 [remove_dup_value]: 1.66e-05 [tuple_transform]: 7.973e-05, [1] [Cycle 1]: 7.562e-05, [4] [d_1]: 4.759e-05 [none_parameter_eliminate]: 1.64998e-06 [renormalize]: 3.00002e-07 [switch_simplify]: 8.15e-06 [partial_unused_args_eliminate]: 1.77999e-06 [add_recomputation]: 5.557e-05 [cse_after_recomputation]: 2.641e-05, [1] [Cycle 1]: 2.158e-05, [1] [cse]: 1.592e-05 [environ_conv]: 6.38e-06 [swap_dp_allreduce_reducescatter]: 5.81e-06 [bias_add_comm_swap]: 2.59001e-06 [label_micro_interleaved_index]: 4.63999e-06 [label_fine_grained_interleaved_index]: 2.55002e-06 [merge_cast_opt]: 1.57001e-06 [slice_recompute_activation]: 2.36e-06 [micro_interleaved_order_control]: 2.07999e-06 [assign_add_opt]: 1.30001e-06 [ForceFp32Comm]: 1.05001e-06 [remove_cast_before_assign_add]: 8.80013e-07 [full_micro_interleaved_order_control]: 2.01e-06 [reorder_send_recv_between_fp_bp]: 2.91999e-06 [comm_op_add_attrs]: 1.14e-06 [add_comm_op_reuse_tag]: 9.90025e-07 [interleave_split_concat_branches]: 1.17999e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.17999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.25002e-06 [control_data_broadcast_order]: 1.43e-05 [grouped_pairwise_exchange_alltoall]: 1.57999e-06 [offloading_packed_experts]: 4.52e-06 [overlap_recompute_and_grad_model_parallel]: 4.78001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12e-06 [overlap_recompute_allgather_and_fa_grad]: 1.56002e-06 [overlap_recompute_comm]: 2.34999e-06 [overlap_grad_ring_attention]: 4.38001e-06 [overlap_grad_flash_sp]: 1.98e-05 [begin_end_overlap_inline]: 4.7998e-07 [split_matmul_comm_elemetwise]: 2.10002e-06 [split_layernorm_comm]: 1.90001e-06 [handle_group_info]: 9.29984e-07 [symbol_engine_optimizer]: 8.299e-05, [1] [Cycle 1]: 7.869e-05, [6] [build]: 2.99001e-06 [elim_shapecalc]: 1.147e-05 [elim_not_effective]: 1.533e-05 [opt_reshape]: 7.93001e-06 [fold_const_symbol]: 1.242e-05 [renormalize]: 2.00002e-07 [detach_backward]: 1.69998e-06 [pipeline_parallel_scheduler]: 1.82001e-06 [auto_monad_reorder]: 1.989e-05 [get_jit_bprop_graph]: 1.03001e-06 [rewriter_after_jit_bprop_graph]: 3.78001e-06 [opt_after_jit_grad]: 0.00045866 [validate]: 3.976e-05 Sums bootstrap : 0.000426s : 4.51% type_inference : 0.004901s : 51.80% event_method : 0.000013s : 0.13% auto_monad : 0.000054s : 0.57% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000023s : 0.24% insert-virtual-dataset : 0.000002s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000017s : 0.18% optimize.rewriter_before_opt_a : 0.000048s : 0.51% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000034s : 0.36% optimize.opt_a.loop_unroll : 0.000021s : 0.23% optimize.opt_a.a_1 : 0.000501s : 5.30% optimize.opt_a.with_stream_mark : 0.000026s : 0.27% optimize.opt_a.recompute_prepare : 0.000017s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.08% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000183s : 1.94% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.16% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.16% optimize.opt_a.merge_send_recv : 0.000015s : 0.16% optimize.opt_a.auto_parallel : 0.000013s : 0.14% optimize.opt_a.parallel : 0.000022s : 0.23% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000010s : 0.10% optimize.opt_a.allreduce_fusion : 0.000008s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.18% optimize.opt_a.virtual_dataset : 0.000014s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.15% optimize.opt_a.virtual_output : 0.000014s : 0.15% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.26% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.10% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000020s : 0.21% optimize.opt_a.a_after_grad : 0.000021s : 0.22% optimize.opt_a.renormalize : 0.000490s : 5.18% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000002s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.25% optimize.opt_a.cse : 0.000056s : 0.59% optimize.opt_a.a_3 : 0.000096s : 1.01% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.42% optimize.convert_after_rewriter : 0.000007s : 0.08% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000463s : 4.90% optimize.opt_b.b_1 : 0.000180s : 1.90% optimize.opt_b.b_2 : 0.000009s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.25% optimize.loop_unroll : 0.000426s : 4.51% optimize.opt_after_cconv.c_1 : 0.000033s : 0.35% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.04% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.04% optimize.opt_after_cconv.cse : 0.000023s : 0.24% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.18% optimize.tuple_transform.d_1 : 0.000048s : 0.50% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.09% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000056s : 0.59% optimize.cse_after_recomputation.cse : 0.000016s : 0.17% optimize.environ_conv : 0.000006s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.02% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.02% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.05% optimize.overlap_grad_flash_sp : 0.000020s : 0.21% optimize.begin_end_overlap_inline : 0.000000s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.16% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.21% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000459s : 4.85% validate : 0.000040s : 0.42% Time group info: ------[substitution.] 0.000156 29 11.76% : 0.000018s : 2: substitution.cast_eliminate 1.35% : 0.000002s : 3: substitution.elim_not_effective 1.22% : 0.000002s : 3: substitution.fold_const_symbol 3.55% : 0.000006s : 4: substitution.graph_param_transform 55.77% : 0.000087s : 2: substitution.inline 2.75% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.60% : 0.000006s : 6: substitution.remove_not_recompute_node 1.84% : 0.000003s : 2: substitution.replace_old_param 18.16% : 0.000028s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004857 2 91.11% : 0.004425s : 1: type_inference.infer 8.89% : 0.000432s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000085 2 100.00% : 0.000085s : 2: match.inline ------[predicate.] 0.000166 980 0.83% : 0.000001s : 9: predicate.accumulaten_eliminater 0.99% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.67% : 0.000001s : 8: predicate.addn_check_dump 0.84% : 0.000001s : 9: predicate.addn_zero_filter 0.71% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.33% : 0.000004s : 17: predicate.arithmetic_simplify 1.02% : 0.000002s : 9: predicate.cast_eliminate 0.73% : 0.000001s : 8: predicate.check_bprop_eliminate 0.69% : 0.000001s : 8: predicate.compare_switch_simplify 0.26% : 0.000000s : 4: predicate.const_output_eliminate 0.72% : 0.000001s : 8: predicate.depend_value_elim 0.90% : 0.000002s : 9: predicate.dict_get_item_const_eliminator 0.92% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.80% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.16% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 4: predicate.elim_not_effective 0.48% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 13: predicate.environ_get_depend_swap 1.83% : 0.000003s : 21: predicate.environ_get_eliminate 1.05% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.93% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.87% : 0.000003s : 11: predicate.float_depend_g_call 0.68% : 0.000001s : 8: predicate.float_environ_get_switch 0.99% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 4: predicate.fold_const_symbol 0.86% : 0.000001s : 8: predicate.get_grad_eliminate 0.27% : 0.000000s : 4: predicate.graph_param_transform 0.81% : 0.000001s : 8: predicate.incorporate_call 0.66% : 0.000001s : 8: predicate.incorporate_call_switch 6.32% : 0.000010s : 44: predicate.inline 0.98% : 0.000002s : 8: predicate.inline_without_move 0.42% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.11% : 0.000002s : 8: predicate.less_batch_normalization 1.66% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.16% : 0.000004s : 26: predicate.load_eliminater 1.32% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.43% : 0.000002s : 16: predicate.loop_unroll_before_grad 1.73% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.76% : 0.000001s : 8: predicate.merge_addn 0.73% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.72% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.69% : 0.000001s : 9: predicate.minmaximum_grad 1.17% : 0.000002s : 4: predicate.mutable_eliminate 0.43% : 0.000001s : 4: predicate.opt_reshape 0.59% : 0.000001s : 4: predicate.parallel_virtual_node 1.16% : 0.000002s : 11: predicate.partial_defer_inline 1.33% : 0.000002s : 13: predicate.partial_eliminate 0.75% : 0.000001s : 9: predicate.print_const_string_wrapper 0.74% : 0.000001s : 8: predicate.reduce_all_const_elim 1.28% : 0.000002s : 9: predicate.reduce_eliminate 2.08% : 0.000003s : 26: predicate.redundant_stop_gradient_eliminater 0.56% : 0.000001s : 8: predicate.remove_not_recompute_node 1.24% : 0.000002s : 17: predicate.replace_applicator 0.51% : 0.000001s : 8: predicate.replace_old_param 0.34% : 0.000001s : 4: predicate.reset_defer_inline 0.97% : 0.000002s : 9: predicate.reshape_eliminate 0.80% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 4: predicate.row_tensor_eliminate 1.00% : 0.000002s : 8: predicate.same_eliminate 0.54% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.95% : 0.000002s : 8: predicate.shard_identity_eliminate 0.90% : 0.000001s : 8: predicate.special_op_eliminate 1.01% : 0.000002s : 8: predicate.specialize_transform 1.08% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.96% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.46% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.02% : 0.000002s : 11: predicate.switch_defer_inline 1.81% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.17% : 0.000007s : 39: predicate.switch_simplify 0.86% : 0.000001s : 9: predicate.tile_eliminate 0.79% : 0.000001s : 9: predicate.transpose_eliminate 1.61% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.14% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.54% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.65% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.58% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.30% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.96% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.77% : 0.000001s : 4: predicate.value_based_eliminate 0.82% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.83% : 0.000001s : 8: predicate.virtual_output_eliminate 0.35% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.58% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000200 5 8.34% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.66% : 0.000183s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022485 192 0.02% : 0.000004s : 1: ForceFp32Comm 13.62% : 0.003062s : 1: add_attr 13.58% : 0.003053s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.27% : 0.000060s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000059s : 1: auto_monad 0.11% : 0.000024s : 1: auto_monad_reorder 0.02% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.02% : 0.000455s : 1: bootstrap 0.12% : 0.000028s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000011s : 1: convert_after_rewriter 0.13% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.08% : 0.000019s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000005s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.93% : 0.000434s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.10% : 0.000472s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 4.21% : 0.000946s : 78: opt.transform.opt_a 0.14% : 0.000032s : 1: opt.transform.opt_after_cconv 0.12% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.70% : 0.000157s : 28: opt.transform.opt_b 0.24% : 0.000054s : 2: opt.transform.opt_trans_graph 0.19% : 0.000043s : 4: opt.transform.symbol_engine_opt 10.14% : 0.002281s : 1: opt_a 0.51% : 0.000115s : 1: opt_after_cconv 2.08% : 0.000467s : 1: opt_after_jit_grad 1.23% : 0.000276s : 1: opt_b 19.17% : 0.004310s : 1: optimize 0.09% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.12% : 0.000027s : 1: pre_auto_parallel 0.09% : 0.000021s : 1: py_interpret_to_execute 0.06% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000020s : 1: remove_dup_value 1.22% : 0.000274s : 1: renormalize.infer 0.93% : 0.000209s : 1: renormalize.specialize 0.03% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.20% : 0.000044s : 1: rewriter_after_opt_a 0.23% : 0.000052s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000086s : 1: symbol_engine_optimizer 0.37% : 0.000082s : 1: tuple_transform 21.86% : 0.004915s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:02.143.471 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:02.143.724 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0146078, [21] [bootstrap]: 0.00042648 [type_inference]: 0.00492059 [event_method]: 1.27e-05 [auto_monad]: 5.724e-05 [graph_reusing]: 5.69999e-06 [inline]: 2.26e-06 [add_attr]: 0.00306429, [1] [add_attr_with_inline]: 0.00305683, [1] [Cycle 1]: 6.474e-05, [2] [tag_attr]: 1.451e-05 [meta_addattr_fg_expand]: 4.08999e-06 [parallel-infer-symbol]: 3.04001e-06 [pre_auto_parallel]: 2.256e-05 [insert-virtual-dataset]: 2.28002e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 1.74e-06 [pipeline_split]: 1.49e-06 [optimize]: 0.00496515, [53] [py_interpret_to_execute]: 2.002e-05 [rewriter_before_opt_a]: 5.108e-05 [opt_a]: 0.00262859, [2] [Cycle 1]: 0.00174054, [45] [expand_dump_flag]: 3.00998e-06 [switch_simplify]: 2.657e-05 [loop_unroll]: 1.505e-05 [a_1]: 0.00033697 [with_stream_mark]: 1.406e-05 [recompute_prepare]: 9.57999e-06 [updatestate_depend_eliminate]: 4.79e-06 [updatestate_assign_eliminate]: 4e-06 [updatestate_loads_eliminate]: 4.47998e-06 [parameter_eliminate]: 2.44999e-06 [a_2]: 0.00012627 [accelerated_algorithm]: 8.25999e-06 [shard]: 2.27999e-06 [meta_shard_fg_expand]: 2.00002e-06 [shard_inline]: 7.85e-06 [merge_send_recv]: 9.44e-06 [auto_parallel]: 6.78e-06 [parallel]: 1.855e-05 [flash_sp]: 7.94997e-06 [merge_comm]: 4.52e-06 [allreduce_fusion]: 4.30999e-06 [matmul_add_comm_reduction]: 9.97999e-06 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 8.95999e-06 [virtual_dataset]: 7.8e-06 [get_grad_eliminate_]: 7.08e-06 [virtual_output]: 7.51999e-06 [merge_forward]: 4.71997e-06 [cell_reuse_recompute_pass]: 1.22e-06 [offload_activation]: 1.012e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.822e-05 [merge_recompute_call_nodes]: 1.55999e-06 [before_grad]: 1.284e-05 [set_forward_comm_id_for_comm_node_pass]: 4.92e-06 [meta_fg_expand]: 3.63e-06 [flash_sp_send_recv_attached]: 2.59999e-06 [receive_attached]: 2.39001e-06 [after_resolve]: 1.113e-05 [a_after_grad]: 1.158e-05 [renormalize]: 0.00048024 [add_forward_monad_depend]: 4.72e-06 [auto_monad_grad]: 1.84e-06 [auto_monad_eliminator]: 1.518e-05 [cse]: 3.458e-05 [a_3]: 6.794e-05 [Cycle 2]: 0.00087599, [45] [expand_dump_flag]: 1.12999e-06 [switch_simplify]: 8.32e-06 [loop_unroll]: 7.13998e-06 [a_1]: 0.00014905 [with_stream_mark]: 1.499e-05 [recompute_prepare]: 7.58999e-06 [updatestate_depend_eliminate]: 4.15e-06 [updatestate_assign_eliminate]: 3.16999e-06 [updatestate_loads_eliminate]: 2.77002e-06 [parameter_eliminate]: 1.20001e-06 [a_2]: 0.0001134 [accelerated_algorithm]: 7.63999e-06 [shard]: 1.10999e-06 [meta_shard_fg_expand]: 1.55999e-06 [shard_inline]: 7.55998e-06 [merge_send_recv]: 5.59e-06 [auto_parallel]: 6.20002e-06 [parallel]: 4.55999e-06 [flash_sp]: 3.66001e-06 [merge_comm]: 4.2e-06 [allreduce_fusion]: 3.82002e-06 [matmul_add_comm_reduction]: 6.37001e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 8.15999e-06 [virtual_dataset]: 6.84999e-06 [get_grad_eliminate_]: 6.49001e-06 [virtual_output]: 6.66999e-06 [merge_forward]: 3.33e-06 [cell_reuse_recompute_pass]: 1.35001e-06 [offload_activation]: 7.31999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.553e-05 [merge_recompute_call_nodes]: 7.09988e-07 [before_grad]: 1.131e-05 [set_forward_comm_id_for_comm_node_pass]: 4.4e-06 [meta_fg_expand]: 2.82002e-06 [flash_sp_send_recv_attached]: 7.99977e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 9.22001e-06 [a_after_grad]: 1.018e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.35001e-06 [auto_monad_grad]: 8.00006e-07 [auto_monad_eliminator]: 8.23001e-06 [cse]: 1.784e-05 [a_3]: 5.682e-05 [py_interpret_to_execute_after_opt_a]: 1.221e-05 [slice_cell_reuse_recomputed_activation]: 4.31002e-06 [rewriter_after_opt_a]: 4.274e-05 [convert_after_rewriter]: 1e-05 [order_py_execute_after_rewriter]: 8.96002e-06 [mutable_eliminate]: 0.00047843 [opt_b]: 0.00034825, [1] [Cycle 1]: 0.00033844, [7] [b_1]: 0.00022607 [b_2]: 1.082e-05 [updatestate_depend_eliminate]: 7.05e-06 [updatestate_assign_eliminate]: 3.55998e-06 [updatestate_loads_eliminate]: 3.38e-06 [renormalize]: 3.69997e-07 [cse]: 2.393e-05 [optimize_parallel_all_gather_comm]: 2.118e-05 [overlap_param_gather]: 4.85001e-06 [cconv]: 2.708e-05 [loop_unroll]: 0.00044338 [opt_after_cconv]: 0.00014015, [1] [Cycle 1]: 0.00013236, [7] [c_1]: 3.552e-05 [parameter_eliminate]: 2.64999e-06 [updatestate_depend_eliminate]: 6.61e-06 [updatestate_assign_eliminate]: 3.45998e-06 [updatestate_loads_eliminate]: 3.16001e-06 [cse]: 2.239e-05 [renormalize]: 3.9002e-07 [remove_dup_value]: 1.797e-05 [tuple_transform]: 9.646e-05, [1] [Cycle 1]: 8.834e-05, [4] [d_1]: 4.656e-05 [none_parameter_eliminate]: 1.57999e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.97e-06 [partial_unused_args_eliminate]: 4.38001e-06 [add_recomputation]: 5.789e-05 [cse_after_recomputation]: 3.51e-05, [1] [Cycle 1]: 2.717e-05, [1] [cse]: 1.682e-05 [environ_conv]: 9.62999e-06 [swap_dp_allreduce_reducescatter]: 9.67001e-06 [bias_add_comm_swap]: 5.44e-06 [label_micro_interleaved_index]: 7.48e-06 [label_fine_grained_interleaved_index]: 5.60001e-06 [merge_cast_opt]: 4.27003e-06 [slice_recompute_activation]: 4.58999e-06 [micro_interleaved_order_control]: 5.04e-06 [assign_add_opt]: 3.66001e-06 [ForceFp32Comm]: 3.30998e-06 [remove_cast_before_assign_add]: 3.33e-06 [full_micro_interleaved_order_control]: 4.52e-06 [reorder_send_recv_between_fp_bp]: 5.05001e-06 [comm_op_add_attrs]: 3.73999e-06 [add_comm_op_reuse_tag]: 3.49001e-06 [interleave_split_concat_branches]: 3.63e-06 [interleave_parallel_branches]: 3.63e-06 [overlap_opt_shard_in_pipeline]: 4.27998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.53999e-06 [control_data_broadcast_order]: 1.764e-05 [grouped_pairwise_exchange_alltoall]: 3.66999e-06 [offloading_packed_experts]: 6.84001e-06 [overlap_recompute_and_grad_model_parallel]: 7.68999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.61001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.61999e-06 [overlap_recompute_comm]: 5.19e-06 [overlap_grad_ring_attention]: 6.79001e-06 [overlap_grad_flash_sp]: 2.262e-05 [begin_end_overlap_inline]: 2.83e-06 [split_matmul_comm_elemetwise]: 4.65001e-06 [split_layernorm_comm]: 4.32e-06 [handle_group_info]: 3.18e-06 [symbol_engine_optimizer]: 0.00011383, [1] [Cycle 1]: 9.539e-05, [6] [build]: 2.95998e-06 [elim_shapecalc]: 1.093e-05 [elim_not_effective]: 1.485e-05 [opt_reshape]: 8.05999e-06 [fold_const_symbol]: 1.277e-05 [renormalize]: 2.10013e-07 [detach_backward]: 3.45e-06 [pipeline_parallel_scheduler]: 1.59e-06 [auto_monad_reorder]: 2.165e-05 [get_jit_bprop_graph]: 1.22e-06 [rewriter_after_jit_bprop_graph]: 3.95e-06 [opt_after_jit_grad]: 0.00047689 [validate]: 3.83e-05 Sums bootstrap : 0.000426s : 4.36% type_inference : 0.004921s : 50.29% event_method : 0.000013s : 0.13% auto_monad : 0.000057s : 0.58% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000023s : 0.23% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000001s : 0.02% optimize.py_interpret_to_execute : 0.000020s : 0.20% optimize.rewriter_before_opt_a : 0.000051s : 0.52% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000035s : 0.36% optimize.opt_a.loop_unroll : 0.000022s : 0.23% optimize.opt_a.a_1 : 0.000486s : 4.97% optimize.opt_a.with_stream_mark : 0.000029s : 0.30% optimize.opt_a.recompute_prepare : 0.000017s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.07% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000240s : 2.45% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.16% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.16% optimize.opt_a.merge_send_recv : 0.000015s : 0.15% optimize.opt_a.auto_parallel : 0.000013s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.24% optimize.opt_a.flash_sp : 0.000012s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.17% optimize.opt_a.virtual_dataset : 0.000015s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.14% optimize.opt_a.virtual_output : 0.000014s : 0.15% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.34% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.10% optimize.opt_a.meta_fg_expand : 0.000006s : 0.07% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000020s : 0.21% optimize.opt_a.a_after_grad : 0.000022s : 0.22% optimize.opt_a.renormalize : 0.000480s : 4.91% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.24% optimize.opt_a.cse : 0.000052s : 0.54% optimize.opt_a.a_3 : 0.000125s : 1.27% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.04% optimize.rewriter_after_opt_a : 0.000043s : 0.44% optimize.convert_after_rewriter : 0.000010s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.09% optimize.mutable_eliminate : 0.000478s : 4.89% optimize.opt_b.b_1 : 0.000226s : 2.31% optimize.opt_b.b_2 : 0.000011s : 0.11% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.04% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.22% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000027s : 0.28% optimize.loop_unroll : 0.000443s : 4.53% optimize.opt_after_cconv.c_1 : 0.000036s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.04% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.18% optimize.tuple_transform.d_1 : 0.000047s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.09% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000058s : 0.59% optimize.cse_after_recomputation.cse : 0.000017s : 0.17% optimize.environ_conv : 0.000010s : 0.10% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.10% optimize.bias_add_comm_swap : 0.000005s : 0.06% optimize.label_micro_interleaved_index : 0.000007s : 0.08% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.06% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.04% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.05% optimize.control_data_broadcast_order : 0.000018s : 0.18% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.08% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.04% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.07% optimize.overlap_grad_flash_sp : 0.000023s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000022s : 0.22% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000477s : 4.87% validate : 0.000038s : 0.39% Time group info: ------[substitution.] 0.000162 29 11.20% : 0.000018s : 2: substitution.cast_eliminate 1.32% : 0.000002s : 3: substitution.elim_not_effective 1.04% : 0.000002s : 3: substitution.fold_const_symbol 3.67% : 0.000006s : 4: substitution.graph_param_transform 57.26% : 0.000093s : 2: substitution.inline 2.71% : 0.000004s : 6: substitution.j_node_and_user_rematch 4.06% : 0.000007s : 6: substitution.remove_not_recompute_node 1.82% : 0.000003s : 2: substitution.replace_old_param 16.92% : 0.000027s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004876 2 90.85% : 0.004430s : 1: type_inference.infer 9.15% : 0.000446s : 1: type_inference.specialize ------[replace.] 0.000020 2 100.00% : 0.000020s : 2: replace.inline ------[match.] 0.000091 2 100.00% : 0.000091s : 2: match.inline ------[predicate.] 0.000168 980 0.95% : 0.000002s : 9: predicate.accumulaten_eliminater 0.97% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.65% : 0.000001s : 8: predicate.addn_check_dump 0.87% : 0.000001s : 9: predicate.addn_zero_filter 0.72% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.29% : 0.000004s : 17: predicate.arithmetic_simplify 1.05% : 0.000002s : 9: predicate.cast_eliminate 0.81% : 0.000001s : 8: predicate.check_bprop_eliminate 0.73% : 0.000001s : 8: predicate.compare_switch_simplify 0.23% : 0.000000s : 4: predicate.const_output_eliminate 0.94% : 0.000002s : 8: predicate.depend_value_elim 0.77% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 0.92% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.75% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.21% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 4: predicate.elim_not_effective 0.49% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.11% : 0.000002s : 13: predicate.environ_get_depend_swap 1.82% : 0.000003s : 21: predicate.environ_get_eliminate 1.09% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.98% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.87% : 0.000003s : 11: predicate.float_depend_g_call 0.74% : 0.000001s : 8: predicate.float_environ_get_switch 0.99% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 4: predicate.fold_const_symbol 0.83% : 0.000001s : 8: predicate.get_grad_eliminate 0.26% : 0.000000s : 4: predicate.graph_param_transform 0.80% : 0.000001s : 8: predicate.incorporate_call 0.68% : 0.000001s : 8: predicate.incorporate_call_switch 6.16% : 0.000010s : 44: predicate.inline 1.04% : 0.000002s : 8: predicate.inline_without_move 0.38% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.04% : 0.000002s : 8: predicate.less_batch_normalization 1.61% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.34% : 0.000004s : 26: predicate.load_eliminater 1.10% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.48% : 0.000002s : 16: predicate.loop_unroll_before_grad 1.80% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.79% : 0.000001s : 8: predicate.merge_addn 0.73% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.75% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.68% : 0.000001s : 9: predicate.minmaximum_grad 1.26% : 0.000002s : 4: predicate.mutable_eliminate 0.43% : 0.000001s : 4: predicate.opt_reshape 0.48% : 0.000001s : 4: predicate.parallel_virtual_node 1.17% : 0.000002s : 11: predicate.partial_defer_inline 1.35% : 0.000002s : 13: predicate.partial_eliminate 0.80% : 0.000001s : 9: predicate.print_const_string_wrapper 0.79% : 0.000001s : 8: predicate.reduce_all_const_elim 1.05% : 0.000002s : 9: predicate.reduce_eliminate 2.14% : 0.000004s : 26: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 8: predicate.remove_not_recompute_node 1.10% : 0.000002s : 17: predicate.replace_applicator 0.63% : 0.000001s : 8: predicate.replace_old_param 0.33% : 0.000001s : 4: predicate.reset_defer_inline 0.89% : 0.000001s : 9: predicate.reshape_eliminate 0.80% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 4: predicate.row_tensor_eliminate 0.95% : 0.000002s : 8: predicate.same_eliminate 0.53% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.96% : 0.000002s : 8: predicate.shard_identity_eliminate 0.88% : 0.000001s : 8: predicate.special_op_eliminate 0.99% : 0.000002s : 8: predicate.specialize_transform 1.08% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.96% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 4: predicate.switch_call_monad_eliminater 0.98% : 0.000002s : 11: predicate.switch_defer_inline 1.75% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.29% : 0.000007s : 39: predicate.switch_simplify 0.88% : 0.000001s : 9: predicate.tile_eliminate 0.78% : 0.000001s : 9: predicate.transpose_eliminate 1.67% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.89% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.13% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.52% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.63% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.52% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.07% : 0.000003s : 26: predicate.updatestate_pure_node_eliminater 3.09% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.67% : 0.000001s : 4: predicate.value_based_eliminate 0.86% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.83% : 0.000001s : 8: predicate.virtual_output_eliminate 0.39% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.54% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000208 5 7.97% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.03% : 0.000191s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024287 192 0.02% : 0.000006s : 1: ForceFp32Comm 12.65% : 0.003073s : 1: add_attr 12.60% : 0.003060s : 1: add_attr_with_inline 0.03% : 0.000006s : 1: add_comm_op_reuse_tag 0.26% : 0.000062s : 1: add_recomputation 0.03% : 0.000006s : 1: assign_add_opt 0.27% : 0.000066s : 1: auto_monad 0.12% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.04% : 0.000009s : 1: bias_add_comm_swap 1.94% : 0.000471s : 1: bootstrap 0.12% : 0.000030s : 1: cconv 0.03% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.16% : 0.000039s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.08% : 0.000018s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.09% : 0.000023s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.04% : 0.000009s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.85% : 0.000449s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.99% : 0.000485s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.07% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 3.89% : 0.000944s : 78: opt.transform.opt_a 0.14% : 0.000034s : 1: opt.transform.opt_after_cconv 0.11% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.67% : 0.000162s : 28: opt.transform.opt_b 0.22% : 0.000053s : 2: opt.transform.opt_trans_graph 0.18% : 0.000043s : 4: opt.transform.symbol_engine_opt 10.84% : 0.002632s : 1: opt_a 0.59% : 0.000143s : 1: opt_after_cconv 2.01% : 0.000488s : 1: opt_after_jit_grad 1.45% : 0.000352s : 1: opt_b 21.88% : 0.005313s : 1: optimize 0.10% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.05% : 0.000012s : 1: order_py_execute_after_rewriter 0.11% : 0.000026s : 1: overlap_grad_flash_sp 0.03% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.12% : 0.000030s : 1: pre_auto_parallel 0.10% : 0.000023s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.09% : 0.000021s : 1: remove_dup_value 1.11% : 0.000269s : 1: renormalize.infer 0.84% : 0.000204s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000046s : 1: rewriter_after_opt_a 0.22% : 0.000055s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.05% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.48% : 0.000117s : 1: symbol_engine_optimizer 0.41% : 0.000100s : 1: tuple_transform 20.36% : 0.004945s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:02.331.746 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0137402, [21] [bootstrap]: 0.00043761 [type_inference]: 0.00500126 [event_method]: 1.316e-05 [auto_monad]: 5.655e-05 [graph_reusing]: 5.14e-06 [inline]: 2.56998e-06 [add_attr]: 0.00307418, [1] [add_attr_with_inline]: 0.00306592, [1] [Cycle 1]: 5.58e-05, [2] [tag_attr]: 1.507e-05 [meta_addattr_fg_expand]: 3.73999e-06 [parallel-infer-symbol]: 3.25e-06 [pre_auto_parallel]: 2.543e-05 [insert-virtual-dataset]: 2.21998e-06 [parallel-infer-symbol-second]: 8.09989e-07 [dataset_repeat_opt]: 1.92999e-06 [pipeline_split]: 1.57999e-06 [optimize]: 0.00441903, [53] [py_interpret_to_execute]: 1.734e-05 [rewriter_before_opt_a]: 4.657e-05 [opt_a]: 0.00237144, [2] [Cycle 1]: 0.00165098, [45] [expand_dump_flag]: 2.89999e-06 [switch_simplify]: 2.569e-05 [loop_unroll]: 1.548e-05 [a_1]: 0.00033691 [with_stream_mark]: 1.391e-05 [recompute_prepare]: 9.49e-06 [updatestate_depend_eliminate]: 4.70999e-06 [updatestate_assign_eliminate]: 3.84002e-06 [updatestate_loads_eliminate]: 3.51999e-06 [parameter_eliminate]: 1.74e-06 [a_2]: 0.00011448 [accelerated_algorithm]: 8.51002e-06 [shard]: 2.29999e-06 [meta_shard_fg_expand]: 2.02001e-06 [shard_inline]: 7.56001e-06 [merge_send_recv]: 9.72999e-06 [auto_parallel]: 6.86001e-06 [parallel]: 1.809e-05 [flash_sp]: 7.83999e-06 [merge_comm]: 5.35001e-06 [allreduce_fusion]: 4.07e-06 [matmul_add_comm_reduction]: 1.095e-05 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 9.36002e-06 [virtual_dataset]: 7.33999e-06 [get_grad_eliminate_]: 6.86001e-06 [virtual_output]: 7.55e-06 [merge_forward]: 4.57e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 1.103e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.5e-05 [merge_recompute_call_nodes]: 1.74e-06 [before_grad]: 1.256e-05 [set_forward_comm_id_for_comm_node_pass]: 4.35999e-06 [meta_fg_expand]: 3.16999e-06 [flash_sp_send_recv_attached]: 2.58e-06 [receive_attached]: 2.07999e-06 [after_resolve]: 1.043e-05 [a_after_grad]: 1.165e-05 [renormalize]: 0.00056893 [add_forward_monad_depend]: 4.72998e-06 [auto_monad_grad]: 2.36e-06 [auto_monad_eliminator]: 1.565e-05 [cse]: 3.669e-05 [a_3]: 5.599e-05 [Cycle 2]: 0.00071109, [45] [expand_dump_flag]: 1.04e-06 [switch_simplify]: 8.64e-06 [loop_unroll]: 7.51001e-06 [a_1]: 0.00015181 [with_stream_mark]: 1.183e-05 [recompute_prepare]: 7.22002e-06 [updatestate_depend_eliminate]: 4.01001e-06 [updatestate_assign_eliminate]: 2.88998e-06 [updatestate_loads_eliminate]: 2.70002e-06 [parameter_eliminate]: 1.14e-06 [a_2]: 8.79e-05 [accelerated_algorithm]: 7.25e-06 [shard]: 1.07998e-06 [meta_shard_fg_expand]: 1.56998e-06 [shard_inline]: 7.16999e-06 [merge_send_recv]: 5.56e-06 [auto_parallel]: 6.46999e-06 [parallel]: 4.95001e-06 [flash_sp]: 3.38e-06 [merge_comm]: 3.97e-06 [allreduce_fusion]: 3.63999e-06 [matmul_add_comm_reduction]: 6.23e-06 [allreduce_slice_to_reducescatter]: 3.39991e-07 [virtual_shard_identity]: 7.74002e-06 [virtual_dataset]: 6.86001e-06 [get_grad_eliminate_]: 6.59001e-06 [virtual_output]: 6.31e-06 [merge_forward]: 3.03998e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 7.2e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.347e-05 [merge_recompute_call_nodes]: 6.59988e-07 [before_grad]: 1.144e-05 [set_forward_comm_id_for_comm_node_pass]: 4.45e-06 [meta_fg_expand]: 2.78998e-06 [flash_sp_send_recv_attached]: 1.15999e-06 [receive_attached]: 1.06002e-06 [after_resolve]: 9.54999e-06 [a_after_grad]: 1.013e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.29e-06 [auto_monad_grad]: 1.25001e-06 [auto_monad_eliminator]: 8.28999e-06 [cse]: 2.235e-05 [a_3]: 4.377e-05 [py_interpret_to_execute_after_opt_a]: 9.39e-06 [slice_cell_reuse_recomputed_activation]: 1.81e-06 [rewriter_after_opt_a]: 3.98e-05 [convert_after_rewriter]: 6.94001e-06 [order_py_execute_after_rewriter]: 5.74e-06 [mutable_eliminate]: 0.00047903 [opt_b]: 0.00027612, [1] [Cycle 1]: 0.00026943, [7] [b_1]: 0.00018105 [b_2]: 9.27999e-06 [updatestate_depend_eliminate]: 6.91999e-06 [updatestate_assign_eliminate]: 3.23e-06 [updatestate_loads_eliminate]: 2.93e-06 [renormalize]: 4.10015e-07 [cse]: 2.516e-05 [optimize_parallel_all_gather_comm]: 1.757e-05 [overlap_param_gather]: 2.05002e-06 [cconv]: 2.371e-05 [loop_unroll]: 0.00043039 [opt_after_cconv]: 0.0001121, [1] [Cycle 1]: 0.00010651, [7] [c_1]: 3.354e-05 [parameter_eliminate]: 2.98998e-06 [updatestate_depend_eliminate]: 6.07999e-06 [updatestate_assign_eliminate]: 2.99001e-06 [updatestate_loads_eliminate]: 3.21001e-06 [cse]: 2.315e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 1.518e-05 [tuple_transform]: 8.09e-05, [1] [Cycle 1]: 7.615e-05, [4] [d_1]: 4.699e-05 [none_parameter_eliminate]: 2.11e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 8.46002e-06 [partial_unused_args_eliminate]: 2.02001e-06 [add_recomputation]: 5.25e-05 [cse_after_recomputation]: 2.616e-05, [1] [Cycle 1]: 2.146e-05, [1] [cse]: 1.561e-05 [environ_conv]: 6.44001e-06 [swap_dp_allreduce_reducescatter]: 6.10002e-06 [bias_add_comm_swap]: 2.82002e-06 [label_micro_interleaved_index]: 4.22e-06 [label_fine_grained_interleaved_index]: 2.61999e-06 [merge_cast_opt]: 1.27e-06 [slice_recompute_activation]: 1.97999e-06 [micro_interleaved_order_control]: 2.37999e-06 [assign_add_opt]: 1.18001e-06 [ForceFp32Comm]: 7.50006e-07 [remove_cast_before_assign_add]: 9.80013e-07 [full_micro_interleaved_order_control]: 2.09e-06 [reorder_send_recv_between_fp_bp]: 2.73998e-06 [comm_op_add_attrs]: 9.70002e-07 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.16002e-06 [interleave_parallel_branches]: 1.09e-06 [overlap_opt_shard_in_pipeline]: 1.33002e-06 [overlap_opt_shard_grad_in_pipeline]: 1.81998e-06 [control_data_broadcast_order]: 1.484e-05 [grouped_pairwise_exchange_alltoall]: 1.47999e-06 [offloading_packed_experts]: 4.21001e-06 [overlap_recompute_and_grad_model_parallel]: 5.67001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.39e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32999e-06 [overlap_recompute_comm]: 2.76999e-06 [overlap_grad_ring_attention]: 4.62e-06 [overlap_grad_flash_sp]: 2.064e-05 [begin_end_overlap_inline]: 5.8001e-07 [split_matmul_comm_elemetwise]: 2.22999e-06 [split_layernorm_comm]: 1.59e-06 [handle_group_info]: 8.80013e-07 [symbol_engine_optimizer]: 8.369e-05, [1] [Cycle 1]: 7.919e-05, [6] [build]: 3.02002e-06 [elim_shapecalc]: 1.197e-05 [elim_not_effective]: 1.495e-05 [opt_reshape]: 8.04997e-06 [fold_const_symbol]: 1.179e-05 [renormalize]: 2.19996e-07 [detach_backward]: 1.91e-06 [pipeline_parallel_scheduler]: 1.40999e-06 [auto_monad_reorder]: 2.021e-05 [get_jit_bprop_graph]: 1.32e-06 [rewriter_after_jit_bprop_graph]: 3.98999e-06 [opt_after_jit_grad]: 0.00048114 [validate]: 4.06e-05 Sums bootstrap : 0.000438s : 4.50% type_inference : 0.005001s : 51.44% event_method : 0.000013s : 0.14% auto_monad : 0.000057s : 0.58% graph_reusing : 0.000005s : 0.05% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000025s : 0.26% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000017s : 0.18% optimize.rewriter_before_opt_a : 0.000047s : 0.48% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000034s : 0.35% optimize.opt_a.loop_unroll : 0.000023s : 0.24% optimize.opt_a.a_1 : 0.000489s : 5.03% optimize.opt_a.with_stream_mark : 0.000026s : 0.26% optimize.opt_a.recompute_prepare : 0.000017s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000202s : 2.08% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.16% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.15% optimize.opt_a.merge_send_recv : 0.000015s : 0.16% optimize.opt_a.auto_parallel : 0.000013s : 0.14% optimize.opt_a.parallel : 0.000023s : 0.24% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000009s : 0.10% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.18% optimize.opt_a.virtual_dataset : 0.000014s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.14% optimize.opt_a.virtual_output : 0.000014s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000024s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000020s : 0.21% optimize.opt_a.a_after_grad : 0.000022s : 0.22% optimize.opt_a.renormalize : 0.000569s : 5.85% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.25% optimize.opt_a.cse : 0.000059s : 0.61% optimize.opt_a.a_3 : 0.000100s : 1.03% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.41% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000479s : 4.93% optimize.opt_b.b_1 : 0.000181s : 1.86% optimize.opt_b.b_2 : 0.000009s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000025s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.24% optimize.loop_unroll : 0.000430s : 4.43% optimize.opt_after_cconv.c_1 : 0.000034s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.24% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.16% optimize.tuple_transform.d_1 : 0.000047s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.09% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000053s : 0.54% optimize.cse_after_recomputation.cse : 0.000016s : 0.16% optimize.environ_conv : 0.000006s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.03% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000021s : 0.21% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000020s : 0.21% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000481s : 4.95% validate : 0.000041s : 0.42% Time group info: ------[substitution.] 0.000163 29 11.26% : 0.000018s : 2: substitution.cast_eliminate 1.26% : 0.000002s : 3: substitution.elim_not_effective 1.05% : 0.000002s : 3: substitution.fold_const_symbol 3.92% : 0.000006s : 4: substitution.graph_param_transform 57.03% : 0.000093s : 2: substitution.inline 2.68% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.34% : 0.000005s : 6: substitution.remove_not_recompute_node 1.82% : 0.000003s : 2: substitution.replace_old_param 17.65% : 0.000029s : 1: substitution.value_based_eliminate ------[type_inference.] 0.004953 2 90.70% : 0.004493s : 1: type_inference.infer 9.30% : 0.000461s : 1: type_inference.specialize ------[replace.] 0.000021 2 100.00% : 0.000021s : 2: replace.inline ------[match.] 0.000091 2 100.00% : 0.000091s : 2: match.inline ------[predicate.] 0.000167 980 0.93% : 0.000002s : 9: predicate.accumulaten_eliminater 1.11% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.67% : 0.000001s : 8: predicate.addn_check_dump 0.77% : 0.000001s : 9: predicate.addn_zero_filter 0.68% : 0.000001s : 9: predicate.adjust_all_reduce_mul_add 2.26% : 0.000004s : 17: predicate.arithmetic_simplify 0.93% : 0.000002s : 9: predicate.cast_eliminate 0.99% : 0.000002s : 8: predicate.check_bprop_eliminate 0.68% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000000s : 4: predicate.const_output_eliminate 0.76% : 0.000001s : 8: predicate.depend_value_elim 0.85% : 0.000001s : 9: predicate.dict_get_item_const_eliminator 1.02% : 0.000002s : 9: predicate.dict_get_item_eliminator 0.78% : 0.000001s : 9: predicate.dict_set_item_eliminator 1.18% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 4: predicate.elim_not_effective 0.47% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000002s : 13: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 13: predicate.environ_get_add_eliminate 1.08% : 0.000002s : 13: predicate.environ_get_depend_swap 1.79% : 0.000003s : 21: predicate.environ_get_eliminate 1.02% : 0.000002s : 13: predicate.environ_get_set_eliminate 0.91% : 0.000002s : 11: predicate.exchange_switch_depend_value 1.78% : 0.000003s : 11: predicate.float_depend_g_call 0.68% : 0.000001s : 8: predicate.float_environ_get_switch 0.98% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 4: predicate.fold_const_symbol 0.82% : 0.000001s : 8: predicate.get_grad_eliminate 0.30% : 0.000000s : 4: predicate.graph_param_transform 0.83% : 0.000001s : 8: predicate.incorporate_call 0.66% : 0.000001s : 8: predicate.incorporate_call_switch 6.32% : 0.000011s : 44: predicate.inline 1.15% : 0.000002s : 8: predicate.inline_without_move 0.39% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.36% : 0.000002s : 8: predicate.less_batch_normalization 1.65% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.14% : 0.000004s : 26: predicate.load_eliminater 1.11% : 0.000002s : 4: predicate.loop_unroll_after_grad 1.56% : 0.000003s : 16: predicate.loop_unroll_before_grad 1.68% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.72% : 0.000001s : 8: predicate.merge_addn 0.92% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.75% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.74% : 0.000001s : 9: predicate.minmaximum_grad 1.41% : 0.000002s : 4: predicate.mutable_eliminate 0.42% : 0.000001s : 4: predicate.opt_reshape 0.54% : 0.000001s : 4: predicate.parallel_virtual_node 1.21% : 0.000002s : 11: predicate.partial_defer_inline 1.31% : 0.000002s : 13: predicate.partial_eliminate 0.77% : 0.000001s : 9: predicate.print_const_string_wrapper 0.74% : 0.000001s : 8: predicate.reduce_all_const_elim 1.11% : 0.000002s : 9: predicate.reduce_eliminate 2.09% : 0.000003s : 26: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 8: predicate.remove_not_recompute_node 1.22% : 0.000002s : 17: predicate.replace_applicator 0.66% : 0.000001s : 8: predicate.replace_old_param 0.33% : 0.000001s : 4: predicate.reset_defer_inline 0.90% : 0.000002s : 9: predicate.reshape_eliminate 0.75% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.63% : 0.000001s : 4: predicate.row_tensor_eliminate 1.06% : 0.000002s : 8: predicate.same_eliminate 0.55% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.23% : 0.000002s : 8: predicate.shard_identity_eliminate 0.90% : 0.000002s : 8: predicate.special_op_eliminate 0.95% : 0.000002s : 8: predicate.specialize_transform 1.08% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.94% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.04% : 0.000002s : 11: predicate.switch_defer_inline 1.62% : 0.000003s : 19: predicate.switch_layer_defer_inline 4.17% : 0.000007s : 39: predicate.switch_simplify 0.84% : 0.000001s : 9: predicate.tile_eliminate 0.80% : 0.000001s : 9: predicate.transpose_eliminate 1.66% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 2.99% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.48% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.46% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.46% : 0.000002s : 17: predicate.tuple_to_list_eliminator_ 2.20% : 0.000004s : 26: predicate.updatestate_pure_node_eliminater 2.94% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.56% : 0.000001s : 4: predicate.value_based_eliminate 0.80% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.89% : 0.000001s : 8: predicate.virtual_output_eliminate 0.34% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.54% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000208 5 7.81% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 92.19% : 0.000191s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022961 192 0.01% : 0.000003s : 1: ForceFp32Comm 13.41% : 0.003078s : 1: add_attr 13.37% : 0.003069s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.25% : 0.000056s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000061s : 1: auto_monad 0.10% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 2.03% : 0.000466s : 1: bootstrap 0.12% : 0.000027s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.13% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.09% : 0.000020s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000008s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.91% : 0.000438s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.12% : 0.000488s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 4.10% : 0.000940s : 78: opt.transform.opt_a 0.14% : 0.000032s : 1: opt.transform.opt_after_cconv 0.12% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.68% : 0.000157s : 28: opt.transform.opt_b 0.23% : 0.000053s : 2: opt.transform.opt_trans_graph 0.19% : 0.000043s : 4: opt.transform.symbol_engine_opt 10.34% : 0.002374s : 1: opt_a 0.51% : 0.000116s : 1: opt_after_cconv 2.13% : 0.000490s : 1: opt_after_jit_grad 1.22% : 0.000280s : 1: opt_b 19.27% : 0.004423s : 1: optimize 0.09% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000024s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000030s : 1: pre_auto_parallel 0.09% : 0.000021s : 1: py_interpret_to_execute 0.06% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000019s : 1: remove_dup_value 1.47% : 0.000338s : 1: renormalize.infer 0.97% : 0.000223s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000044s : 1: rewriter_after_opt_a 0.22% : 0.000050s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000086s : 1: symbol_engine_optimizer 0.36% : 0.000084s : 1: tuple_transform 21.85% : 0.005017s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:02.521.208 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:02.521.507 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0153381, [21] [bootstrap]: 0.00043803 [type_inference]: 0.0052228 [event_method]: 1.318e-05 [auto_monad]: 5.779e-05 [graph_reusing]: 5.70001e-06 [inline]: 2.69999e-06 [add_attr]: 0.00333278, [1] [add_attr_with_inline]: 0.00332266, [1] [Cycle 1]: 8.061e-05, [2] [tag_attr]: 1.51e-05 [meta_addattr_fg_expand]: 3.98001e-06 [parallel-infer-symbol]: 3.49001e-06 [pre_auto_parallel]: 2.858e-05 [insert-virtual-dataset]: 2.31e-06 [parallel-infer-symbol-second]: 7.60017e-07 [dataset_repeat_opt]: 1.83002e-06 [pipeline_split]: 1.69998e-06 [optimize]: 0.00504365, [53] [py_interpret_to_execute]: 2.327e-05 [rewriter_before_opt_a]: 5.136e-05 [opt_a]: 0.00264283, [2] [Cycle 1]: 0.00176651, [45] [expand_dump_flag]: 3.03998e-06 [switch_simplify]: 2.505e-05 [loop_unroll]: 1.363e-05 [a_1]: 0.00029455 [with_stream_mark]: 1.908e-05 [recompute_prepare]: 7.68999e-06 [updatestate_depend_eliminate]: 4.18001e-06 [updatestate_assign_eliminate]: 3.78001e-06 [updatestate_loads_eliminate]: 3.78001e-06 [parameter_eliminate]: 2.04999e-06 [a_2]: 0.00010776 [accelerated_algorithm]: 7.27002e-06 [shard]: 2.21998e-06 [meta_shard_fg_expand]: 1.67001e-06 [shard_inline]: 6.53e-06 [merge_send_recv]: 8.14002e-06 [auto_parallel]: 7.26999e-06 [parallel]: 1.854e-05 [flash_sp]: 8.60001e-06 [merge_comm]: 4.22003e-06 [allreduce_fusion]: 3.49001e-06 [matmul_add_comm_reduction]: 1.045e-05 [allreduce_slice_to_reducescatter]: 6.40022e-07 [virtual_shard_identity]: 9.94999e-06 [virtual_dataset]: 7.46999e-06 [get_grad_eliminate_]: 5.91e-06 [virtual_output]: 6.00002e-06 [merge_forward]: 3.84002e-06 [cell_reuse_recompute_pass]: 1.35001e-06 [offload_activation]: 1.068e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.743e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 1.117e-05 [set_forward_comm_id_for_comm_node_pass]: 4.02998e-06 [meta_fg_expand]: 2.58998e-06 [flash_sp_send_recv_attached]: 2.71e-06 [receive_attached]: 2.01998e-06 [after_resolve]: 1.044e-05 [a_after_grad]: 9.67001e-06 [renormalize]: 0.00057557 [add_forward_monad_depend]: 6.14999e-06 [auto_monad_grad]: 2.63e-06 [auto_monad_eliminator]: 1.682e-05 [cse]: 3.19e-05 [a_3]: 6.231e-05 [Cycle 2]: 0.00086117, [45] [expand_dump_flag]: 1.72999e-06 [switch_simplify]: 8.05999e-06 [loop_unroll]: 5.90002e-06 [a_1]: 0.00010603 [with_stream_mark]: 1.456e-05 [recompute_prepare]: 6.81001e-06 [updatestate_depend_eliminate]: 4.4e-06 [updatestate_assign_eliminate]: 2.57001e-06 [updatestate_loads_eliminate]: 2.61e-06 [parameter_eliminate]: 1.69e-06 [a_2]: 9.714e-05 [accelerated_algorithm]: 6.19999e-06 [shard]: 2.26e-06 [meta_shard_fg_expand]: 1.64e-06 [shard_inline]: 6.20002e-06 [merge_send_recv]: 7.42002e-06 [auto_parallel]: 6.54001e-06 [parallel]: 6.14001e-06 [flash_sp]: 3.9e-06 [merge_comm]: 3.62002e-06 [allreduce_fusion]: 3.35e-06 [matmul_add_comm_reduction]: 7.95998e-06 [allreduce_slice_to_reducescatter]: 5.50004e-07 [virtual_shard_identity]: 8.42e-06 [virtual_dataset]: 5.89e-06 [get_grad_eliminate_]: 6.36998e-06 [virtual_output]: 5.49998e-06 [merge_forward]: 2.81e-06 [cell_reuse_recompute_pass]: 1.50999e-06 [offload_activation]: 8.09002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.637e-05 [merge_recompute_call_nodes]: 1.12e-06 [before_grad]: 1.02e-05 [set_forward_comm_id_for_comm_node_pass]: 4.26001e-06 [meta_fg_expand]: 3.10998e-06 [flash_sp_send_recv_attached]: 1.02998e-06 [receive_attached]: 1.54e-06 [after_resolve]: 1e-05 [a_after_grad]: 9.38002e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.21e-06 [auto_monad_grad]: 1.37e-06 [auto_monad_eliminator]: 1.097e-05 [cse]: 1.726e-05 [a_3]: 8.446e-05 [py_interpret_to_execute_after_opt_a]: 1.45e-05 [slice_cell_reuse_recomputed_activation]: 4.73001e-06 [rewriter_after_opt_a]: 4.397e-05 [convert_after_rewriter]: 1.077e-05 [order_py_execute_after_rewriter]: 8.25e-06 [mutable_eliminate]: 0.0005606 [opt_b]: 0.00032839, [1] [Cycle 1]: 0.00031817, [7] [b_1]: 0.00020953 [b_2]: 7.61999e-06 [updatestate_depend_eliminate]: 7.46001e-06 [updatestate_assign_eliminate]: 2.61999e-06 [updatestate_loads_eliminate]: 2.34001e-06 [renormalize]: 3.09985e-07 [cse]: 2.398e-05 [optimize_parallel_all_gather_comm]: 2.234e-05 [overlap_param_gather]: 4.80999e-06 [cconv]: 3.254e-05 [loop_unroll]: 0.00046465 [opt_after_cconv]: 0.00012944, [1] [Cycle 1]: 0.00012088, [7] [c_1]: 2.779e-05 [parameter_eliminate]: 3.61999e-06 [updatestate_depend_eliminate]: 5.57001e-06 [updatestate_assign_eliminate]: 2.69001e-06 [updatestate_loads_eliminate]: 2.41e-06 [cse]: 2.004e-05 [renormalize]: 3.39991e-07 [remove_dup_value]: 1.922e-05 [tuple_transform]: 8.642e-05, [1] [Cycle 1]: 7.903e-05, [4] [d_1]: 4.055e-05 [none_parameter_eliminate]: 1.65001e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 6.57002e-06 [partial_unused_args_eliminate]: 4.52e-06 [add_recomputation]: 5.182e-05 [cse_after_recomputation]: 2.866e-05, [1] [Cycle 1]: 2.148e-05, [1] [cse]: 1.19e-05 [environ_conv]: 8.24998e-06 [swap_dp_allreduce_reducescatter]: 8.15999e-06 [bias_add_comm_swap]: 4.99e-06 [label_micro_interleaved_index]: 7.91001e-06 [label_fine_grained_interleaved_index]: 5.19e-06 [merge_cast_opt]: 3.48999e-06 [slice_recompute_activation]: 4.95999e-06 [micro_interleaved_order_control]: 5.15001e-06 [assign_add_opt]: 3.50998e-06 [ForceFp32Comm]: 3.48999e-06 [remove_cast_before_assign_add]: 4.12e-06 [full_micro_interleaved_order_control]: 4.70001e-06 [reorder_send_recv_between_fp_bp]: 5.27001e-06 [comm_op_add_attrs]: 3.81999e-06 [add_comm_op_reuse_tag]: 3.94997e-06 [interleave_split_concat_branches]: 3.73999e-06 [interleave_parallel_branches]: 3.69002e-06 [overlap_opt_shard_in_pipeline]: 3.75e-06 [overlap_opt_shard_grad_in_pipeline]: 4.48001e-06 [control_data_broadcast_order]: 1.668e-05 [grouped_pairwise_exchange_alltoall]: 3.92002e-06 [offloading_packed_experts]: 6.97002e-06 [overlap_recompute_and_grad_model_parallel]: 7.3e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.51999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.9e-06 [overlap_recompute_comm]: 5.22999e-06 [overlap_grad_ring_attention]: 6.46999e-06 [overlap_grad_flash_sp]: 2.474e-05 [begin_end_overlap_inline]: 3.11001e-06 [split_matmul_comm_elemetwise]: 4.74e-06 [split_layernorm_comm]: 4.39998e-06 [handle_group_info]: 3.29001e-06 [symbol_engine_optimizer]: 0.00010501, [1] [Cycle 1]: 9.729e-05, [6] [build]: 3.09999e-06 [elim_shapecalc]: 1.181e-05 [elim_not_effective]: 1.347e-05 [opt_reshape]: 8.14002e-06 [fold_const_symbol]: 1.03e-05 [renormalize]: 3.30008e-07 [detach_backward]: 3.98999e-06 [pipeline_parallel_scheduler]: 2.07999e-06 [auto_monad_reorder]: 2.033e-05 [get_jit_bprop_graph]: 1.66998e-06 [rewriter_after_jit_bprop_graph]: 5.21002e-06 [opt_after_jit_grad]: 0.00052061 [validate]: 4.056e-05 Sums bootstrap : 0.000438s : 4.29% type_inference : 0.005223s : 51.17% event_method : 0.000013s : 0.13% auto_monad : 0.000058s : 0.57% graph_reusing : 0.000006s : 0.06% inline : 0.000003s : 0.03% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000029s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000023s : 0.23% optimize.rewriter_before_opt_a : 0.000051s : 0.50% optimize.opt_a.expand_dump_flag : 0.000005s : 0.05% optimize.opt_a.switch_simplify : 0.000033s : 0.32% optimize.opt_a.loop_unroll : 0.000020s : 0.19% optimize.opt_a.a_1 : 0.000401s : 3.92% optimize.opt_a.with_stream_mark : 0.000034s : 0.33% optimize.opt_a.recompute_prepare : 0.000014s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.04% optimize.opt_a.a_2 : 0.000205s : 2.01% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.13% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000013s : 0.12% optimize.opt_a.merge_send_recv : 0.000016s : 0.15% optimize.opt_a.auto_parallel : 0.000014s : 0.14% optimize.opt_a.parallel : 0.000025s : 0.24% optimize.opt_a.flash_sp : 0.000013s : 0.12% optimize.opt_a.merge_comm : 0.000008s : 0.08% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.18% optimize.opt_a.virtual_dataset : 0.000013s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.12% optimize.opt_a.virtual_output : 0.000012s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000021s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000020s : 0.20% optimize.opt_a.a_after_grad : 0.000019s : 0.19% optimize.opt_a.renormalize : 0.000576s : 5.64% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.27% optimize.opt_a.cse : 0.000049s : 0.48% optimize.opt_a.a_3 : 0.000147s : 1.44% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000044s : 0.43% optimize.convert_after_rewriter : 0.000011s : 0.11% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000561s : 5.49% optimize.opt_b.b_1 : 0.000210s : 2.05% optimize.opt_b.b_2 : 0.000008s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.22% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000033s : 0.32% optimize.loop_unroll : 0.000465s : 4.55% optimize.opt_after_cconv.c_1 : 0.000028s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000020s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.19% optimize.tuple_transform.d_1 : 0.000041s : 0.40% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000052s : 0.51% optimize.cse_after_recomputation.cse : 0.000012s : 0.12% optimize.environ_conv : 0.000008s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.08% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000003s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.05% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.04% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000004s : 0.04% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.04% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000006s : 0.06% optimize.overlap_grad_flash_sp : 0.000025s : 0.24% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.05% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.05% opt_after_jit_grad : 0.000521s : 5.10% validate : 0.000041s : 0.40% Time group info: ------[substitution.] 0.000153 20 1.28% : 0.000002s : 2: substitution.elim_not_effective 0.87% : 0.000001s : 2: substitution.fold_const_symbol 3.73% : 0.000006s : 3: substitution.graph_param_transform 62.02% : 0.000095s : 2: substitution.inline 2.43% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.45% : 0.000005s : 4: substitution.remove_not_recompute_node 2.76% : 0.000004s : 2: substitution.replace_old_param 23.45% : 0.000036s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005167 2 91.28% : 0.004717s : 1: type_inference.infer 8.72% : 0.000450s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000093 2 100.00% : 0.000093s : 2: match.inline ------[predicate.] 0.000138 754 0.77% : 0.000001s : 7: predicate.accumulaten_eliminater 1.13% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.71% : 0.000001s : 6: predicate.addn_check_dump 0.73% : 0.000001s : 7: predicate.addn_zero_filter 0.70% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.26% : 0.000003s : 13: predicate.arithmetic_simplify 0.78% : 0.000001s : 7: predicate.cast_eliminate 0.79% : 0.000001s : 6: predicate.check_bprop_eliminate 0.70% : 0.000001s : 6: predicate.compare_switch_simplify 0.21% : 0.000000s : 3: predicate.const_output_eliminate 0.79% : 0.000001s : 6: predicate.depend_value_elim 0.75% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.86% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.70% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.82% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.33% : 0.000000s : 3: predicate.elim_not_effective 0.61% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.04% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.96% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.98% : 0.000001s : 10: predicate.environ_get_depend_swap 1.76% : 0.000002s : 16: predicate.environ_get_eliminate 0.94% : 0.000001s : 10: predicate.environ_get_set_eliminate 1.00% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.93% : 0.000003s : 9: predicate.float_depend_g_call 0.65% : 0.000001s : 6: predicate.float_environ_get_switch 0.94% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 3: predicate.fold_const_symbol 0.86% : 0.000001s : 6: predicate.get_grad_eliminate 0.26% : 0.000000s : 3: predicate.graph_param_transform 0.73% : 0.000001s : 6: predicate.incorporate_call 0.65% : 0.000001s : 6: predicate.incorporate_call_switch 6.33% : 0.000009s : 34: predicate.inline 1.15% : 0.000002s : 6: predicate.inline_without_move 0.36% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.97% : 0.000001s : 6: predicate.less_batch_normalization 1.51% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 1.95% : 0.000003s : 20: predicate.load_eliminater 1.53% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.55% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.74% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.69% : 0.000001s : 6: predicate.merge_addn 0.73% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.73% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.68% : 0.000001s : 7: predicate.minmaximum_grad 1.55% : 0.000002s : 3: predicate.mutable_eliminate 0.52% : 0.000001s : 3: predicate.opt_reshape 0.78% : 0.000001s : 3: predicate.parallel_virtual_node 1.32% : 0.000002s : 9: predicate.partial_defer_inline 1.21% : 0.000002s : 10: predicate.partial_eliminate 0.69% : 0.000001s : 7: predicate.print_const_string_wrapper 0.73% : 0.000001s : 6: predicate.reduce_all_const_elim 1.05% : 0.000001s : 7: predicate.reduce_eliminate 2.14% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.58% : 0.000001s : 6: predicate.remove_not_recompute_node 1.25% : 0.000002s : 13: predicate.replace_applicator 0.86% : 0.000001s : 6: predicate.replace_old_param 0.36% : 0.000000s : 3: predicate.reset_defer_inline 0.76% : 0.000001s : 7: predicate.reshape_eliminate 1.10% : 0.000002s : 6: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 3: predicate.row_tensor_eliminate 1.18% : 0.000002s : 6: predicate.same_eliminate 0.47% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.45% : 0.000002s : 6: predicate.shard_identity_eliminate 0.86% : 0.000001s : 6: predicate.special_op_eliminate 0.98% : 0.000001s : 6: predicate.specialize_transform 1.33% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 1.11% : 0.000002s : 6: predicate.stack_unstack_eliminate 0.46% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.00% : 0.000001s : 9: predicate.switch_defer_inline 1.69% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.21% : 0.000006s : 32: predicate.switch_simplify 0.68% : 0.000001s : 7: predicate.tile_eliminate 0.79% : 0.000001s : 7: predicate.transpose_eliminate 1.58% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.39% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.30% : 0.000005s : 19: predicate.tuple_list_get_item_eliminator 1.29% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.25% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.42% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.96% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 3.03% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.68% : 0.000001s : 3: predicate.value_based_eliminate 0.77% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.71% : 0.000001s : 6: predicate.virtual_output_eliminate 0.33% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.62% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000231 5 8.19% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.81% : 0.000212s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025284 192 0.03% : 0.000006s : 1: ForceFp32Comm 13.22% : 0.003342s : 1: add_attr 13.16% : 0.003327s : 1: add_attr_with_inline 0.03% : 0.000007s : 1: add_comm_op_reuse_tag 0.22% : 0.000056s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000067s : 1: auto_monad 0.11% : 0.000028s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.91% : 0.000483s : 1: bootstrap 0.14% : 0.000036s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000032s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.09% : 0.000022s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.09% : 0.000023s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.03% : 0.000006s : 1: interleave_parallel_branches 0.03% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.86% : 0.000471s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.25% : 0.000568s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 3.24% : 0.000819s : 78: opt.transform.opt_a 0.10% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.56% : 0.000141s : 28: opt.transform.opt_b 0.18% : 0.000045s : 2: opt.transform.opt_trans_graph 0.16% : 0.000039s : 4: opt.transform.symbol_engine_opt 10.46% : 0.002646s : 1: opt_a 0.53% : 0.000133s : 1: opt_after_cconv 2.10% : 0.000532s : 1: opt_after_jit_grad 1.32% : 0.000333s : 1: opt_b 21.30% : 0.005386s : 1: optimize 0.10% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.11% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.03% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.03% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000036s : 1: pre_auto_parallel 0.11% : 0.000027s : 1: py_interpret_to_execute 0.07% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000007s : 1: remove_cast_before_assign_add 0.09% : 0.000023s : 1: remove_dup_value 1.25% : 0.000315s : 1: renormalize.infer 1.00% : 0.000252s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000048s : 1: rewriter_after_opt_a 0.22% : 0.000055s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.05% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.43% : 0.000108s : 1: symbol_engine_optimizer 0.35% : 0.000089s : 1: tuple_transform 20.80% : 0.005258s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:02.718.935 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0137934, [21] [bootstrap]: 0.00045153 [type_inference]: 0.00511531 [event_method]: 1.329e-05 [auto_monad]: 5.421e-05 [graph_reusing]: 5.29e-06 [inline]: 1.97001e-06 [add_attr]: 0.00333968, [1] [add_attr_with_inline]: 0.0033305, [1] [Cycle 1]: 5.943e-05, [2] [tag_attr]: 1.458e-05 [meta_addattr_fg_expand]: 3.65998e-06 [parallel-infer-symbol]: 3.25e-06 [pre_auto_parallel]: 2.594e-05 [insert-virtual-dataset]: 2.26e-06 [parallel-infer-symbol-second]: 6.59988e-07 [dataset_repeat_opt]: 2.02001e-06 [pipeline_split]: 1.56002e-06 [optimize]: 0.00409004, [53] [py_interpret_to_execute]: 1.748e-05 [rewriter_before_opt_a]: 4.441e-05 [opt_a]: 0.00206406, [2] [Cycle 1]: 0.00144081, [45] [expand_dump_flag]: 3.14999e-06 [switch_simplify]: 2.613e-05 [loop_unroll]: 1.374e-05 [a_1]: 0.00028904 [with_stream_mark]: 1.743e-05 [recompute_prepare]: 7.92e-06 [updatestate_depend_eliminate]: 3.53999e-06 [updatestate_assign_eliminate]: 3.9e-06 [updatestate_loads_eliminate]: 2.99999e-06 [parameter_eliminate]: 1.77001e-06 [a_2]: 8.021e-05 [accelerated_algorithm]: 6.79999e-06 [shard]: 2.12999e-06 [meta_shard_fg_expand]: 1.64998e-06 [shard_inline]: 5.82999e-06 [merge_send_recv]: 8.92e-06 [auto_parallel]: 7.31999e-06 [parallel]: 1.739e-05 [flash_sp]: 7.38e-06 [merge_comm]: 3.75998e-06 [allreduce_fusion]: 3.80998e-06 [matmul_add_comm_reduction]: 9.22999e-06 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 7.65e-06 [virtual_dataset]: 6.49999e-06 [get_grad_eliminate_]: 5.64e-06 [virtual_output]: 6.07001e-06 [merge_forward]: 3.44001e-06 [cell_reuse_recompute_pass]: 1.60999e-06 [offload_activation]: 9.69e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.356e-05 [merge_recompute_call_nodes]: 1.40999e-06 [before_grad]: 1.067e-05 [set_forward_comm_id_for_comm_node_pass]: 3.99002e-06 [meta_fg_expand]: 2.74999e-06 [flash_sp_send_recv_attached]: 2.86e-06 [receive_attached]: 2.36e-06 [after_resolve]: 9.57001e-06 [a_after_grad]: 9.82001e-06 [renormalize]: 0.00047889 [add_forward_monad_depend]: 5.10999e-06 [auto_monad_grad]: 2.68e-06 [auto_monad_eliminator]: 1.473e-05 [cse]: 3.078e-05 [a_3]: 4.574e-05 [Cycle 2]: 0.00061294, [45] [expand_dump_flag]: 9.60019e-07 [switch_simplify]: 7.16999e-06 [loop_unroll]: 6.01e-06 [a_1]: 0.0001059 [with_stream_mark]: 1.131e-05 [recompute_prepare]: 5.96e-06 [updatestate_depend_eliminate]: 2.86999e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 2.68e-06 [parameter_eliminate]: 1.05001e-06 [a_2]: 6.921e-05 [accelerated_algorithm]: 5.97999e-06 [shard]: 1.89999e-06 [meta_shard_fg_expand]: 1.25999e-06 [shard_inline]: 6.09999e-06 [merge_send_recv]: 4.97999e-06 [auto_parallel]: 6.02001e-06 [parallel]: 4.80999e-06 [flash_sp]: 3.78999e-06 [merge_comm]: 3.31001e-06 [allreduce_fusion]: 3.02002e-06 [matmul_add_comm_reduction]: 5.74e-06 [allreduce_slice_to_reducescatter]: 5.19998e-07 [virtual_shard_identity]: 6.81999e-06 [virtual_dataset]: 5.64e-06 [get_grad_eliminate_]: 5.30999e-06 [virtual_output]: 5.39e-06 [merge_forward]: 2.75997e-06 [cell_reuse_recompute_pass]: 1.86e-06 [offload_activation]: 6.31e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.339e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 8.97e-06 [set_forward_comm_id_for_comm_node_pass]: 4.04002e-06 [meta_fg_expand]: 2.12001e-06 [flash_sp_send_recv_attached]: 8.10018e-07 [receive_attached]: 1.38002e-06 [after_resolve]: 8.68001e-06 [a_after_grad]: 8.29002e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.12e-06 [auto_monad_grad]: 1.45001e-06 [auto_monad_eliminator]: 7.28e-06 [cse]: 1.542e-05 [a_3]: 3.492e-05 [py_interpret_to_execute_after_opt_a]: 8.40001e-06 [slice_cell_reuse_recomputed_activation]: 1.81998e-06 [rewriter_after_opt_a]: 3.692e-05 [convert_after_rewriter]: 6.41998e-06 [order_py_execute_after_rewriter]: 5.17e-06 [mutable_eliminate]: 0.00052648 [opt_b]: 0.00025242, [1] [Cycle 1]: 0.00024548, [7] [b_1]: 0.00016196 [b_2]: 7.77998e-06 [updatestate_depend_eliminate]: 6.67002e-06 [updatestate_assign_eliminate]: 3.01999e-06 [updatestate_loads_eliminate]: 2.84999e-06 [renormalize]: 5.50004e-07 [cse]: 2.146e-05 [optimize_parallel_all_gather_comm]: 1.604e-05 [overlap_param_gather]: 2.24001e-06 [cconv]: 2.577e-05 [loop_unroll]: 0.00044035 [opt_after_cconv]: 0.00010197, [1] [Cycle 1]: 9.602e-05, [7] [c_1]: 2.746e-05 [parameter_eliminate]: 3.04999e-06 [updatestate_depend_eliminate]: 5.71998e-06 [updatestate_assign_eliminate]: 2.37001e-06 [updatestate_loads_eliminate]: 2.64999e-06 [cse]: 1.981e-05 [renormalize]: 4.70027e-07 [remove_dup_value]: 1.403e-05 [tuple_transform]: 7.042e-05, [1] [Cycle 1]: 6.636e-05, [4] [d_1]: 3.892e-05 [none_parameter_eliminate]: 1.94999e-06 [renormalize]: 4.49974e-07 [switch_simplify]: 6.92002e-06 [partial_unused_args_eliminate]: 1.92001e-06 [add_recomputation]: 4.532e-05 [cse_after_recomputation]: 2.245e-05, [1] [Cycle 1]: 1.796e-05, [1] [cse]: 1.228e-05 [environ_conv]: 4.65999e-06 [swap_dp_allreduce_reducescatter]: 5.48997e-06 [bias_add_comm_swap]: 2.43998e-06 [label_micro_interleaved_index]: 4.30999e-06 [label_fine_grained_interleaved_index]: 2.82002e-06 [merge_cast_opt]: 1.24003e-06 [slice_recompute_activation]: 2.17001e-06 [micro_interleaved_order_control]: 2.32001e-06 [assign_add_opt]: 1.22999e-06 [ForceFp32Comm]: 1.29e-06 [remove_cast_before_assign_add]: 9.79984e-07 [full_micro_interleaved_order_control]: 2.00002e-06 [reorder_send_recv_between_fp_bp]: 3.01001e-06 [comm_op_add_attrs]: 9.70002e-07 [add_comm_op_reuse_tag]: 9.29984e-07 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.34e-06 [overlap_opt_shard_in_pipeline]: 1.20001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.66e-06 [control_data_broadcast_order]: 1.225e-05 [grouped_pairwise_exchange_alltoall]: 1.76998e-06 [offloading_packed_experts]: 4.1e-06 [overlap_recompute_and_grad_model_parallel]: 4.65001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.33002e-06 [overlap_recompute_comm]: 2.19001e-06 [overlap_grad_ring_attention]: 4.12e-06 [overlap_grad_flash_sp]: 1.845e-05 [begin_end_overlap_inline]: 5.40022e-07 [split_matmul_comm_elemetwise]: 2.05002e-06 [split_layernorm_comm]: 1.77999e-06 [handle_group_info]: 9.30013e-07 [symbol_engine_optimizer]: 7.387e-05, [1] [Cycle 1]: 6.928e-05, [6] [build]: 3.06001e-06 [elim_shapecalc]: 1.03e-05 [elim_not_effective]: 1.196e-05 [opt_reshape]: 6.67002e-06 [fold_const_symbol]: 9.54e-06 [renormalize]: 3.19997e-07 [detach_backward]: 1.80001e-06 [pipeline_parallel_scheduler]: 1.53002e-06 [auto_monad_reorder]: 1.589e-05 [get_jit_bprop_graph]: 1.74998e-06 [rewriter_after_jit_bprop_graph]: 4.02e-06 [opt_after_jit_grad]: 0.00047253 [validate]: 3.692e-05 Sums bootstrap : 0.000452s : 4.76% type_inference : 0.005115s : 53.88% event_method : 0.000013s : 0.14% auto_monad : 0.000054s : 0.57% graph_reusing : 0.000005s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.04% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000026s : 0.27% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000017s : 0.18% optimize.rewriter_before_opt_a : 0.000044s : 0.47% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000033s : 0.35% optimize.opt_a.loop_unroll : 0.000020s : 0.21% optimize.opt_a.a_1 : 0.000395s : 4.16% optimize.opt_a.with_stream_mark : 0.000029s : 0.30% optimize.opt_a.recompute_prepare : 0.000014s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000149s : 1.57% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.13% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000012s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.15% optimize.opt_a.auto_parallel : 0.000013s : 0.14% optimize.opt_a.parallel : 0.000022s : 0.23% optimize.opt_a.flash_sp : 0.000011s : 0.12% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.15% optimize.opt_a.virtual_dataset : 0.000012s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.12% optimize.opt_a.virtual_output : 0.000011s : 0.12% optimize.opt_a.merge_forward : 0.000006s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.04% optimize.opt_a.offload_activation : 0.000016s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.08% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000018s : 0.19% optimize.opt_a.a_after_grad : 0.000018s : 0.19% optimize.opt_a.renormalize : 0.000479s : 5.05% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.23% optimize.opt_a.cse : 0.000046s : 0.49% optimize.opt_a.a_3 : 0.000081s : 0.85% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000037s : 0.39% optimize.convert_after_rewriter : 0.000006s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000526s : 5.55% optimize.opt_b.b_1 : 0.000162s : 1.71% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000021s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000026s : 0.27% optimize.loop_unroll : 0.000440s : 4.64% optimize.opt_after_cconv.c_1 : 0.000027s : 0.29% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000020s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.15% optimize.tuple_transform.d_1 : 0.000039s : 0.41% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000045s : 0.48% optimize.cse_after_recomputation.cse : 0.000012s : 0.13% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.06% optimize.bias_add_comm_swap : 0.000002s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000018s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000016s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000473s : 4.98% validate : 0.000037s : 0.39% Time group info: ------[substitution.] 0.000146 20 1.16% : 0.000002s : 2: substitution.elim_not_effective 0.90% : 0.000001s : 2: substitution.fold_const_symbol 3.87% : 0.000006s : 3: substitution.graph_param_transform 62.01% : 0.000091s : 2: substitution.inline 2.42% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.20% : 0.000005s : 4: substitution.remove_not_recompute_node 2.36% : 0.000003s : 2: substitution.replace_old_param 24.06% : 0.000035s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005064 2 91.51% : 0.004634s : 1: type_inference.infer 8.49% : 0.000430s : 1: type_inference.specialize ------[replace.] 0.000022 2 100.00% : 0.000022s : 2: replace.inline ------[match.] 0.000089 2 100.00% : 0.000089s : 2: match.inline ------[predicate.] 0.000134 754 0.79% : 0.000001s : 7: predicate.accumulaten_eliminater 1.01% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.62% : 0.000001s : 6: predicate.addn_check_dump 0.78% : 0.000001s : 7: predicate.addn_zero_filter 0.67% : 0.000001s : 7: predicate.adjust_all_reduce_mul_add 2.19% : 0.000003s : 13: predicate.arithmetic_simplify 0.83% : 0.000001s : 7: predicate.cast_eliminate 0.74% : 0.000001s : 6: predicate.check_bprop_eliminate 0.67% : 0.000001s : 6: predicate.compare_switch_simplify 0.23% : 0.000000s : 3: predicate.const_output_eliminate 0.70% : 0.000001s : 6: predicate.depend_value_elim 0.89% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 0.94% : 0.000001s : 7: predicate.dict_get_item_eliminator 0.74% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.48% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.36% : 0.000000s : 3: predicate.elim_not_effective 0.65% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.06% : 0.000001s : 10: predicate.environ_add_const_eliminate 0.95% : 0.000001s : 10: predicate.environ_get_add_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_depend_swap 2.01% : 0.000003s : 16: predicate.environ_get_eliminate 0.97% : 0.000001s : 10: predicate.environ_get_set_eliminate 0.95% : 0.000001s : 9: predicate.exchange_switch_depend_value 1.89% : 0.000003s : 9: predicate.float_depend_g_call 0.68% : 0.000001s : 6: predicate.float_environ_get_switch 0.97% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 3: predicate.fold_const_symbol 0.80% : 0.000001s : 6: predicate.get_grad_eliminate 0.26% : 0.000000s : 3: predicate.graph_param_transform 0.78% : 0.000001s : 6: predicate.incorporate_call 0.63% : 0.000001s : 6: predicate.incorporate_call_switch 6.47% : 0.000009s : 34: predicate.inline 1.15% : 0.000002s : 6: predicate.inline_without_move 0.39% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.90% : 0.000001s : 6: predicate.less_batch_normalization 1.75% : 0.000002s : 13: predicate.list_to_tuple_eliminator_ 2.06% : 0.000003s : 20: predicate.load_eliminater 1.50% : 0.000002s : 3: predicate.loop_unroll_after_grad 1.59% : 0.000002s : 14: predicate.loop_unroll_before_grad 1.74% : 0.000002s : 13: predicate.make_slice_get_slice_eliminator 0.72% : 0.000001s : 6: predicate.merge_addn 0.57% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.69% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.66% : 0.000001s : 7: predicate.minmaximum_grad 1.94% : 0.000003s : 3: predicate.mutable_eliminate 0.43% : 0.000001s : 3: predicate.opt_reshape 0.53% : 0.000001s : 3: predicate.parallel_virtual_node 1.18% : 0.000002s : 9: predicate.partial_defer_inline 1.24% : 0.000002s : 10: predicate.partial_eliminate 0.76% : 0.000001s : 7: predicate.print_const_string_wrapper 0.72% : 0.000001s : 6: predicate.reduce_all_const_elim 1.19% : 0.000002s : 7: predicate.reduce_eliminate 2.07% : 0.000003s : 20: predicate.redundant_stop_gradient_eliminater 0.60% : 0.000001s : 6: predicate.remove_not_recompute_node 1.18% : 0.000002s : 13: predicate.replace_applicator 0.67% : 0.000001s : 6: predicate.replace_old_param 0.35% : 0.000000s : 3: predicate.reset_defer_inline 0.80% : 0.000001s : 7: predicate.reshape_eliminate 0.73% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.54% : 0.000001s : 3: predicate.row_tensor_eliminate 1.00% : 0.000001s : 6: predicate.same_eliminate 0.49% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.08% : 0.000001s : 6: predicate.shard_identity_eliminate 0.89% : 0.000001s : 6: predicate.special_op_eliminate 0.97% : 0.000001s : 6: predicate.specialize_transform 1.03% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 1.41% : 0.000002s : 6: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.01% : 0.000001s : 9: predicate.switch_defer_inline 1.76% : 0.000002s : 15: predicate.switch_layer_defer_inline 4.30% : 0.000006s : 32: predicate.switch_simplify 0.76% : 0.000001s : 7: predicate.tile_eliminate 0.76% : 0.000001s : 7: predicate.transpose_eliminate 1.54% : 0.000002s : 13: predicate.tuple_list_convert_item_index_to_positive 1.77% : 0.000002s : 13: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000002s : 13: predicate.tuple_list_get_item_depend_reorder 3.30% : 0.000004s : 19: predicate.tuple_list_get_item_eliminator 1.41% : 0.000002s : 13: predicate.tuple_list_get_set_item_eliminator 2.49% : 0.000003s : 19: predicate.tuple_list_set_item_eliminator 1.54% : 0.000002s : 13: predicate.tuple_to_list_eliminator_ 1.99% : 0.000003s : 20: predicate.updatestate_pure_node_eliminater 2.73% : 0.000004s : 26: predicate.updatestate_useless_node_eliminater 0.77% : 0.000001s : 3: predicate.value_based_eliminate 0.84% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.79% : 0.000001s : 6: predicate.virtual_output_eliminate 0.36% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.58% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000210 5 8.19% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 91.81% : 0.000193s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.022629 192 0.02% : 0.000004s : 1: ForceFp32Comm 14.78% : 0.003345s : 1: add_attr 14.73% : 0.003334s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000049s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000059s : 1: auto_monad 0.09% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 2.13% : 0.000481s : 1: bootstrap 0.13% : 0.000029s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.11% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.09% : 0.000020s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.98% : 0.000449s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.37% : 0.000537s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 3.34% : 0.000756s : 78: opt.transform.opt_a 0.12% : 0.000026s : 1: opt.transform.opt_after_cconv 0.11% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.60% : 0.000136s : 28: opt.transform.opt_b 0.19% : 0.000044s : 2: opt.transform.opt_trans_graph 0.15% : 0.000035s : 4: opt.transform.symbol_engine_opt 9.13% : 0.002067s : 1: opt_a 0.47% : 0.000106s : 1: opt_after_cconv 2.13% : 0.000481s : 1: opt_after_jit_grad 1.13% : 0.000256s : 1: opt_b 18.09% : 0.004095s : 1: optimize 0.09% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000008s : 1: order_py_execute_after_rewriter 0.10% : 0.000022s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000003s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.13% : 0.000030s : 1: pre_auto_parallel 0.09% : 0.000021s : 1: py_interpret_to_execute 0.05% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000018s : 1: remove_dup_value 1.15% : 0.000261s : 1: renormalize.infer 0.93% : 0.000210s : 1: renormalize.specialize 0.03% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000041s : 1: rewriter_after_opt_a 0.21% : 0.000048s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000076s : 1: symbol_engine_optimizer 0.32% : 0.000073s : 1: tuple_transform 22.69% : 0.005134s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:02.922.292 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:02.922.557 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0235044, [21] [bootstrap]: 0.0004349 [type_inference]: 0.0126396 [event_method]: 1.935e-05 [auto_monad]: 8.911e-05 [graph_reusing]: 6.78e-06 [inline]: 2.32999e-06 [add_attr]: 0.00323222, [1] [add_attr_with_inline]: 0.00322344, [1] [Cycle 1]: 8.369e-05, [2] [tag_attr]: 2.025e-05 [meta_addattr_fg_expand]: 5.87999e-06 [parallel-infer-symbol]: 2.91e-06 [pre_auto_parallel]: 3.658e-05 [insert-virtual-dataset]: 2.56e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 2.48002e-06 [pipeline_split]: 1.65001e-06 [optimize]: 0.00576458, [53] [py_interpret_to_execute]: 3.033e-05 [rewriter_before_opt_a]: 8.242e-05 [opt_a]: 0.00330864, [2] [Cycle 1]: 0.00246268, [45] [expand_dump_flag]: 3.48999e-06 [switch_simplify]: 0.00013998 [loop_unroll]: 2.513e-05 [a_1]: 0.0005239 [with_stream_mark]: 1.837e-05 [recompute_prepare]: 1.001e-05 [updatestate_depend_eliminate]: 4.56002e-06 [updatestate_assign_eliminate]: 3.51001e-06 [updatestate_loads_eliminate]: 2.73e-06 [parameter_eliminate]: 2.43e-06 [a_2]: 0.00010823 [accelerated_algorithm]: 6.78998e-06 [shard]: 2.76e-06 [meta_shard_fg_expand]: 1.80001e-06 [shard_inline]: 6.35002e-06 [merge_send_recv]: 8.74e-06 [auto_parallel]: 7.4e-06 [parallel]: 1.871e-05 [flash_sp]: 9.27999e-06 [merge_comm]: 4.55001e-06 [allreduce_fusion]: 3.72998e-06 [matmul_add_comm_reduction]: 1.006e-05 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 9.10001e-06 [virtual_dataset]: 6.94999e-06 [get_grad_eliminate_]: 6.14001e-06 [virtual_output]: 5.78002e-06 [merge_forward]: 4.13999e-06 [cell_reuse_recompute_pass]: 1.16002e-06 [offload_activation]: 1.044e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.538e-05 [merge_recompute_call_nodes]: 1.76e-06 [before_grad]: 1.248e-05 [set_forward_comm_id_for_comm_node_pass]: 4.00998e-06 [meta_fg_expand]: 3.41999e-06 [flash_sp_send_recv_attached]: 2.58998e-06 [receive_attached]: 2.76999e-06 [after_resolve]: 1.077e-05 [a_after_grad]: 9.56e-06 [renormalize]: 0.0008923 [add_forward_monad_depend]: 5.81e-06 [auto_monad_grad]: 2.56e-06 [auto_monad_eliminator]: 1.771e-05 [cse]: 3.133e-05 [a_3]: 6.259e-05 [Cycle 2]: 0.00083177, [45] [expand_dump_flag]: 1.54e-06 [switch_simplify]: 7.63001e-06 [loop_unroll]: 5.89e-06 [a_1]: 0.00010714 [with_stream_mark]: 1.333e-05 [recompute_prepare]: 6.44999e-06 [updatestate_depend_eliminate]: 3.50998e-06 [updatestate_assign_eliminate]: 2.41e-06 [updatestate_loads_eliminate]: 2.76e-06 [parameter_eliminate]: 1.72001e-06 [a_2]: 9.718e-05 [accelerated_algorithm]: 6.22001e-06 [shard]: 1.81e-06 [meta_shard_fg_expand]: 1.94999e-06 [shard_inline]: 6.91999e-06 [merge_send_recv]: 6.90998e-06 [auto_parallel]: 6.18002e-06 [parallel]: 6.78e-06 [flash_sp]: 3.87002e-06 [merge_comm]: 3.75998e-06 [allreduce_fusion]: 3.88999e-06 [matmul_add_comm_reduction]: 7.13998e-06 [allreduce_slice_to_reducescatter]: 6.79982e-07 [virtual_shard_identity]: 8.2e-06 [virtual_dataset]: 5.52001e-06 [get_grad_eliminate_]: 5.78997e-06 [virtual_output]: 5.37999e-06 [merge_forward]: 3.30003e-06 [cell_reuse_recompute_pass]: 1.72999e-06 [offload_activation]: 7.16999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.69e-05 [merge_recompute_call_nodes]: 1.28002e-06 [before_grad]: 9.95002e-06 [set_forward_comm_id_for_comm_node_pass]: 4.73001e-06 [meta_fg_expand]: 2.58e-06 [flash_sp_send_recv_attached]: 1.77001e-06 [receive_attached]: 1.37e-06 [after_resolve]: 9.19e-06 [a_after_grad]: 8.44002e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.76999e-06 [auto_monad_grad]: 1.43002e-06 [auto_monad_eliminator]: 9.05001e-06 [cse]: 1.924e-05 [a_3]: 5.038e-05 [py_interpret_to_execute_after_opt_a]: 1.411e-05 [slice_cell_reuse_recomputed_activation]: 4.68999e-06 [rewriter_after_opt_a]: 4.579e-05 [convert_after_rewriter]: 9.82001e-06 [order_py_execute_after_rewriter]: 8.77e-06 [mutable_eliminate]: 0.00058161 [opt_b]: 0.00030751, [1] [Cycle 1]: 0.00029691, [7] [b_1]: 0.00019361 [b_2]: 9.02e-06 [updatestate_depend_eliminate]: 7.16001e-06 [updatestate_assign_eliminate]: 2.43e-06 [updatestate_loads_eliminate]: 2.43e-06 [renormalize]: 1.00001e-06 [cse]: 2.225e-05 [optimize_parallel_all_gather_comm]: 2.078e-05 [overlap_param_gather]: 4.92e-06 [cconv]: 3.161e-05 [loop_unroll]: 0.00046683 [opt_after_cconv]: 0.00013174, [1] [Cycle 1]: 0.00012324, [7] [c_1]: 2.837e-05 [parameter_eliminate]: 3.14999e-06 [updatestate_depend_eliminate]: 7.45e-06 [updatestate_assign_eliminate]: 2.50002e-06 [updatestate_loads_eliminate]: 2.65997e-06 [cse]: 2.277e-05 [renormalize]: 3.4002e-07 [remove_dup_value]: 1.78e-05 [tuple_transform]: 8.759e-05, [1] [Cycle 1]: 8.078e-05, [4] [d_1]: 4.043e-05 [none_parameter_eliminate]: 1.77999e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 6.93e-06 [partial_unused_args_eliminate]: 4.91002e-06 [add_recomputation]: 5.053e-05 [cse_after_recomputation]: 2.771e-05, [1] [Cycle 1]: 2.091e-05, [1] [cse]: 1.142e-05 [environ_conv]: 8.37998e-06 [swap_dp_allreduce_reducescatter]: 7.49002e-06 [bias_add_comm_swap]: 5.14998e-06 [label_micro_interleaved_index]: 7e-06 [label_fine_grained_interleaved_index]: 5.00001e-06 [merge_cast_opt]: 3.58e-06 [slice_recompute_activation]: 4.47e-06 [micro_interleaved_order_control]: 4.73001e-06 [assign_add_opt]: 3.35e-06 [ForceFp32Comm]: 2.95998e-06 [remove_cast_before_assign_add]: 3.42002e-06 [full_micro_interleaved_order_control]: 4.60001e-06 [reorder_send_recv_between_fp_bp]: 5.42001e-06 [comm_op_add_attrs]: 3.31001e-06 [add_comm_op_reuse_tag]: 3.41999e-06 [interleave_split_concat_branches]: 3.67998e-06 [interleave_parallel_branches]: 3.35e-06 [overlap_opt_shard_in_pipeline]: 3.39001e-06 [overlap_opt_shard_grad_in_pipeline]: 5.13002e-06 [control_data_broadcast_order]: 1.672e-05 [grouped_pairwise_exchange_alltoall]: 4.70999e-06 [offloading_packed_experts]: 6.80998e-06 [overlap_recompute_and_grad_model_parallel]: 8.37998e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.57002e-06 [overlap_recompute_allgather_and_fa_grad]: 4.32e-06 [overlap_recompute_comm]: 5.24003e-06 [overlap_grad_ring_attention]: 6.48e-06 [overlap_grad_flash_sp]: 3.786e-05 [begin_end_overlap_inline]: 3.12002e-06 [split_matmul_comm_elemetwise]: 4.90001e-06 [split_layernorm_comm]: 4.36002e-06 [handle_group_info]: 3.44001e-06 [symbol_engine_optimizer]: 0.00010454, [1] [Cycle 1]: 9.707e-05, [6] [build]: 3.05998e-06 [elim_shapecalc]: 1.264e-05 [elim_not_effective]: 1.317e-05 [opt_reshape]: 6.81001e-06 [fold_const_symbol]: 1.013e-05 [renormalize]: 3.7998e-07 [detach_backward]: 5.02e-06 [pipeline_parallel_scheduler]: 2.29999e-06 [auto_monad_reorder]: 2.2e-05 [get_jit_bprop_graph]: 1.87001e-06 [rewriter_after_jit_bprop_graph]: 5.64e-06 [opt_after_jit_grad]: 0.00053565 [validate]: 4.212e-05 Sums bootstrap : 0.000435s : 2.37% type_inference : 0.012640s : 68.74% event_method : 0.000019s : 0.11% auto_monad : 0.000089s : 0.48% graph_reusing : 0.000007s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.11% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000037s : 0.20% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.16% optimize.rewriter_before_opt_a : 0.000082s : 0.45% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000148s : 0.80% optimize.opt_a.loop_unroll : 0.000031s : 0.17% optimize.opt_a.a_1 : 0.000631s : 3.43% optimize.opt_a.with_stream_mark : 0.000032s : 0.17% optimize.opt_a.recompute_prepare : 0.000016s : 0.09% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000205s : 1.12% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.07% optimize.opt_a.shard : 0.000005s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000013s : 0.07% optimize.opt_a.merge_send_recv : 0.000016s : 0.09% optimize.opt_a.auto_parallel : 0.000014s : 0.07% optimize.opt_a.parallel : 0.000025s : 0.14% optimize.opt_a.flash_sp : 0.000013s : 0.07% optimize.opt_a.merge_comm : 0.000008s : 0.05% optimize.opt_a.allreduce_fusion : 0.000008s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.09% optimize.opt_a.virtual_dataset : 0.000012s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.06% optimize.opt_a.virtual_output : 0.000011s : 0.06% optimize.opt_a.merge_forward : 0.000007s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.10% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.18% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.12% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.05% optimize.opt_a.meta_fg_expand : 0.000006s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000020s : 0.11% optimize.opt_a.a_after_grad : 0.000018s : 0.10% optimize.opt_a.renormalize : 0.000892s : 4.85% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.05% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.15% optimize.opt_a.cse : 0.000051s : 0.28% optimize.opt_a.a_3 : 0.000113s : 0.61% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.03% optimize.rewriter_after_opt_a : 0.000046s : 0.25% optimize.convert_after_rewriter : 0.000010s : 0.05% optimize.order_py_execute_after_rewriter : 0.000009s : 0.05% optimize.mutable_eliminate : 0.000582s : 3.16% optimize.opt_b.b_1 : 0.000194s : 1.05% optimize.opt_b.b_2 : 0.000009s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000022s : 0.12% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.11% optimize.overlap_param_gather : 0.000005s : 0.03% optimize.cconv : 0.000032s : 0.17% optimize.loop_unroll : 0.000467s : 2.54% optimize.opt_after_cconv.c_1 : 0.000028s : 0.15% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000023s : 0.12% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.10% optimize.tuple_transform.d_1 : 0.000040s : 0.22% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.04% optimize.partial_unused_args_eliminate : 0.000005s : 0.03% optimize.add_recomputation : 0.000051s : 0.27% optimize.cse_after_recomputation.cse : 0.000011s : 0.06% optimize.environ_conv : 0.000008s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.04% optimize.bias_add_comm_swap : 0.000005s : 0.03% optimize.label_micro_interleaved_index : 0.000007s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.03% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000004s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.03% optimize.assign_add_opt : 0.000003s : 0.02% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000003s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.03% optimize.comm_op_add_attrs : 0.000003s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.02% optimize.interleave_parallel_branches : 0.000003s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.03% optimize.control_data_broadcast_order : 0.000017s : 0.09% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000005s : 0.03% optimize.overlap_grad_ring_attention : 0.000006s : 0.04% optimize.overlap_grad_flash_sp : 0.000038s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.03% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.02% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.07% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.06% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.12% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.03% opt_after_jit_grad : 0.000536s : 2.91% validate : 0.000042s : 0.23% Time group info: ------[substitution.] 0.000162 25 1.16% : 0.000002s : 2: substitution.elim_not_effective 0.86% : 0.000001s : 2: substitution.fold_const_symbol 3.55% : 0.000006s : 3: substitution.graph_param_transform 79.26% : 0.000129s : 6: substitution.inline 3.29% : 0.000005s : 4: substitution.j_node_and_user_rematch 3.00% : 0.000005s : 4: substitution.remove_not_recompute_node 2.11% : 0.000003s : 2: substitution.replace_old_param 6.77% : 0.000011s : 2: substitution.switch_simplify ------[type_inference.] 0.012579 2 92.16% : 0.011593s : 1: type_inference.infer 7.84% : 0.000987s : 1: type_inference.specialize ------[replace.] 0.000093 8 45.32% : 0.000042s : 6: replace.inline 54.68% : 0.000051s : 2: replace.switch_simplify ------[match.] 0.000135 8 93.00% : 0.000125s : 6: match.inline 7.00% : 0.000009s : 2: match.switch_simplify ------[predicate.] 0.000178 996 1.09% : 0.000002s : 11: predicate.accumulaten_eliminater 0.92% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 6: predicate.addn_check_dump 1.01% : 0.000002s : 11: predicate.addn_zero_filter 0.83% : 0.000001s : 11: predicate.adjust_all_reduce_mul_add 2.53% : 0.000005s : 17: predicate.arithmetic_simplify 1.09% : 0.000002s : 11: predicate.cast_eliminate 0.77% : 0.000001s : 6: predicate.check_bprop_eliminate 0.57% : 0.000001s : 6: predicate.compare_switch_simplify 0.15% : 0.000000s : 3: predicate.const_output_eliminate 0.61% : 0.000001s : 6: predicate.depend_value_elim 0.95% : 0.000002s : 11: predicate.dict_get_item_const_eliminator 1.06% : 0.000002s : 11: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 11: predicate.dict_set_item_eliminator 1.39% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 3: predicate.elim_not_effective 0.44% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.26% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.07% : 0.000002s : 14: predicate.environ_get_depend_swap 1.76% : 0.000003s : 20: predicate.environ_get_eliminate 1.07% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.38% : 0.000002s : 17: predicate.exchange_switch_depend_value 2.46% : 0.000004s : 17: predicate.float_depend_g_call 0.49% : 0.000001s : 6: predicate.float_environ_get_switch 0.72% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 3: predicate.fold_const_symbol 0.59% : 0.000001s : 6: predicate.get_grad_eliminate 0.20% : 0.000000s : 3: predicate.graph_param_transform 0.59% : 0.000001s : 6: predicate.incorporate_call 0.47% : 0.000001s : 6: predicate.incorporate_call_switch 6.50% : 0.000012s : 46: predicate.inline 0.78% : 0.000001s : 6: predicate.inline_without_move 0.31% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.72% : 0.000001s : 6: predicate.less_batch_normalization 1.59% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.21% : 0.000004s : 28: predicate.load_eliminater 1.14% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.40% : 0.000004s : 27: predicate.loop_unroll_before_grad 1.61% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 6: predicate.merge_addn 0.65% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.83% : 0.000001s : 11: predicate.minmaximum_grad 1.69% : 0.000003s : 3: predicate.mutable_eliminate 0.31% : 0.000001s : 3: predicate.opt_reshape 0.35% : 0.000001s : 3: predicate.parallel_virtual_node 2.10% : 0.000004s : 17: predicate.partial_defer_inline 1.35% : 0.000002s : 14: predicate.partial_eliminate 0.92% : 0.000002s : 11: predicate.print_const_string_wrapper 0.55% : 0.000001s : 6: predicate.reduce_all_const_elim 1.29% : 0.000002s : 11: predicate.reduce_eliminate 2.17% : 0.000004s : 28: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 6: predicate.remove_not_recompute_node 1.16% : 0.000002s : 17: predicate.replace_applicator 0.39% : 0.000001s : 6: predicate.replace_old_param 0.26% : 0.000000s : 3: predicate.reset_defer_inline 1.01% : 0.000002s : 11: predicate.reshape_eliminate 0.61% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.31% : 0.000001s : 3: predicate.row_tensor_eliminate 0.93% : 0.000002s : 6: predicate.same_eliminate 0.41% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.77% : 0.000001s : 6: predicate.shard_identity_eliminate 0.67% : 0.000001s : 6: predicate.special_op_eliminate 0.84% : 0.000001s : 6: predicate.specialize_transform 0.95% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.66% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.56% : 0.000003s : 17: predicate.switch_defer_inline 2.11% : 0.000004s : 23: predicate.switch_layer_defer_inline 5.75% : 0.000010s : 57: predicate.switch_simplify 0.92% : 0.000002s : 11: predicate.tile_eliminate 1.01% : 0.000002s : 11: predicate.transpose_eliminate 1.51% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.69% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 2.79% : 0.000005s : 23: predicate.tuple_list_get_item_eliminator 1.41% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.17% : 0.000004s : 23: predicate.tuple_list_set_item_eliminator 1.52% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.19% : 0.000004s : 28: predicate.updatestate_pure_node_eliminater 3.01% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 3: predicate.value_based_eliminate 0.60% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.54% : 0.000001s : 6: predicate.virtual_output_eliminate 0.25% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.48% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000662 10 51.87% : 0.000343s : 2: func_graph_cloner_run.FuncGraphClonerGraph 48.13% : 0.000319s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.034692 192 0.02% : 0.000006s : 1: ForceFp32Comm 9.34% : 0.003242s : 1: add_attr 9.30% : 0.003227s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.16% : 0.000054s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.29% : 0.000099s : 1: auto_monad 0.09% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.38% : 0.000478s : 1: bootstrap 0.10% : 0.000035s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.06% : 0.000020s : 1: control_data_broadcast_order 0.04% : 0.000014s : 1: convert_after_rewriter 0.09% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000028s : 1: detach_backward 0.03% : 0.000011s : 1: environ_conv 0.09% : 0.000031s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000014s : 1: graph_reusing 0.02% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.37% : 0.000474s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 1.70% : 0.000589s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.05% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000019s : 1: opt.transform.mutable_eliminate 3.27% : 0.001135s : 78: opt.transform.opt_a 0.08% : 0.000027s : 1: opt.transform.opt_after_cconv 0.08% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.37% : 0.000127s : 28: opt.transform.opt_b 0.13% : 0.000045s : 2: opt.transform.opt_trans_graph 0.11% : 0.000038s : 4: opt.transform.symbol_engine_opt 9.55% : 0.003312s : 1: opt_a 0.39% : 0.000135s : 1: opt_after_cconv 1.58% : 0.000548s : 1: opt_after_jit_grad 0.90% : 0.000311s : 1: opt_b 17.77% : 0.006166s : 1: optimize 0.07% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000012s : 1: order_py_execute_after_rewriter 0.12% : 0.000042s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000009s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.13% : 0.000044s : 1: pre_auto_parallel 0.10% : 0.000034s : 1: py_interpret_to_execute 0.05% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.06% : 0.000021s : 1: remove_dup_value 1.42% : 0.000494s : 1: renormalize.infer 1.12% : 0.000388s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000050s : 1: rewriter_after_opt_a 0.25% : 0.000087s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.03% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.31% : 0.000108s : 1: symbol_engine_optimizer 0.26% : 0.000091s : 1: tuple_transform 36.53% : 0.012673s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:03.130.826 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0222708, [21] [bootstrap]: 0.00044907 [type_inference]: 0.0126155 [event_method]: 1.909e-05 [auto_monad]: 8.642e-05 [graph_reusing]: 6.24999e-06 [inline]: 2.62001e-06 [add_attr]: 0.00321368, [1] [add_attr_with_inline]: 0.00320392, [1] [Cycle 1]: 6.691e-05, [2] [tag_attr]: 2.045e-05 [meta_addattr_fg_expand]: 5.71998e-06 [parallel-infer-symbol]: 3.36001e-06 [pre_auto_parallel]: 3.461e-05 [insert-virtual-dataset]: 3.21999e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 2.03002e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00510948, [53] [py_interpret_to_execute]: 2.714e-05 [rewriter_before_opt_a]: 7.897e-05 [opt_a]: 0.00295921, [2] [Cycle 1]: 0.00227468, [45] [expand_dump_flag]: 3.33998e-06 [switch_simplify]: 0.00010978 [loop_unroll]: 2.577e-05 [a_1]: 0.00052026 [with_stream_mark]: 1.939e-05 [recompute_prepare]: 1.097e-05 [updatestate_depend_eliminate]: 4.53999e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 3.34001e-06 [parameter_eliminate]: 2.01e-06 [a_2]: 8.201e-05 [accelerated_algorithm]: 9.34e-06 [shard]: 2.39001e-06 [meta_shard_fg_expand]: 1.99e-06 [shard_inline]: 6.36e-06 [merge_send_recv]: 9.10001e-06 [auto_parallel]: 7.87e-06 [parallel]: 2.024e-05 [flash_sp]: 9.32999e-06 [merge_comm]: 4.90999e-06 [allreduce_fusion]: 3.91001e-06 [matmul_add_comm_reduction]: 9.27999e-06 [allreduce_slice_to_reducescatter]: 8.89995e-07 [virtual_shard_identity]: 9.36e-06 [virtual_dataset]: 6.98e-06 [get_grad_eliminate_]: 6.82002e-06 [virtual_output]: 6.45002e-06 [merge_forward]: 3.91999e-06 [cell_reuse_recompute_pass]: 1.20999e-06 [offload_activation]: 1.007e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.496e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.095e-05 [set_forward_comm_id_for_comm_node_pass]: 4.62e-06 [meta_fg_expand]: 3.2e-06 [flash_sp_send_recv_attached]: 2.53e-06 [receive_attached]: 2.43e-06 [after_resolve]: 1.185e-05 [a_after_grad]: 8.57998e-06 [renormalize]: 0.00090379 [add_forward_monad_depend]: 6.23998e-06 [auto_monad_grad]: 2.81999e-06 [auto_monad_eliminator]: 1.713e-05 [cse]: 3.339e-05 [a_3]: 5.033e-05 [Cycle 2]: 0.00067395, [45] [expand_dump_flag]: 1.34e-06 [switch_simplify]: 7.83999e-06 [loop_unroll]: 6.25002e-06 [a_1]: 0.00010746 [with_stream_mark]: 1.361e-05 [recompute_prepare]: 6.87002e-06 [updatestate_depend_eliminate]: 3.7e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 3.06001e-06 [parameter_eliminate]: 1.34e-06 [a_2]: 7.547e-05 [accelerated_algorithm]: 8.08999e-06 [shard]: 1.89e-06 [meta_shard_fg_expand]: 1.53002e-06 [shard_inline]: 6.49999e-06 [merge_send_recv]: 6.88e-06 [auto_parallel]: 6.41e-06 [parallel]: 6.54999e-06 [flash_sp]: 3.99002e-06 [merge_comm]: 3.95998e-06 [allreduce_fusion]: 3.60998e-06 [matmul_add_comm_reduction]: 7.33e-06 [allreduce_slice_to_reducescatter]: 5.79981e-07 [virtual_shard_identity]: 9.66998e-06 [virtual_dataset]: 6.51e-06 [get_grad_eliminate_]: 5.39998e-06 [virtual_output]: 5.29998e-06 [merge_forward]: 3.34001e-06 [cell_reuse_recompute_pass]: 1.30001e-06 [offload_activation]: 7.48e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.459e-05 [merge_recompute_call_nodes]: 1.19e-06 [before_grad]: 1.02e-05 [set_forward_comm_id_for_comm_node_pass]: 4.25e-06 [meta_fg_expand]: 2.24001e-06 [flash_sp_send_recv_attached]: 1.04998e-06 [receive_attached]: 9.80013e-07 [after_resolve]: 1.004e-05 [a_after_grad]: 8.05999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.54999e-06 [auto_monad_grad]: 1.48002e-06 [auto_monad_eliminator]: 9.39e-06 [cse]: 1.854e-05 [a_3]: 3.695e-05 [py_interpret_to_execute_after_opt_a]: 1.006e-05 [slice_cell_reuse_recomputed_activation]: 2.04e-06 [rewriter_after_opt_a]: 3.854e-05 [convert_after_rewriter]: 7.45998e-06 [order_py_execute_after_rewriter]: 6.01e-06 [mutable_eliminate]: 0.0005475 [opt_b]: 0.00021448, [1] [Cycle 1]: 0.00020698, [7] [b_1]: 0.00012253 [b_2]: 8.70999e-06 [updatestate_depend_eliminate]: 6.79999e-06 [updatestate_assign_eliminate]: 2.37999e-06 [updatestate_loads_eliminate]: 2.37001e-06 [renormalize]: 5.69999e-07 [cse]: 2.438e-05 [optimize_parallel_all_gather_comm]: 1.785e-05 [overlap_param_gather]: 2.22001e-06 [cconv]: 2.701e-05 [loop_unroll]: 0.00047557 [opt_after_cconv]: 0.00010821, [1] [Cycle 1]: 0.00010267, [7] [c_1]: 2.864e-05 [parameter_eliminate]: 3.28e-06 [updatestate_depend_eliminate]: 6.69001e-06 [updatestate_assign_eliminate]: 2.49001e-06 [updatestate_loads_eliminate]: 2.40002e-06 [cse]: 2.373e-05 [renormalize]: 7.00005e-07 [remove_dup_value]: 1.485e-05 [tuple_transform]: 7.003e-05, [1] [Cycle 1]: 6.575e-05, [4] [d_1]: 3.896e-05 [none_parameter_eliminate]: 1.70001e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.23e-06 [partial_unused_args_eliminate]: 1.77001e-06 [add_recomputation]: 4.822e-05 [cse_after_recomputation]: 2.207e-05, [1] [Cycle 1]: 1.786e-05, [1] [cse]: 1.161e-05 [environ_conv]: 5.03002e-06 [swap_dp_allreduce_reducescatter]: 4.94e-06 [bias_add_comm_swap]: 3.28e-06 [label_micro_interleaved_index]: 4.26001e-06 [label_fine_grained_interleaved_index]: 3.31999e-06 [merge_cast_opt]: 1.35001e-06 [slice_recompute_activation]: 2.41998e-06 [micro_interleaved_order_control]: 2.07999e-06 [assign_add_opt]: 1.49e-06 [ForceFp32Comm]: 1.04e-06 [remove_cast_before_assign_add]: 1.03001e-06 [full_micro_interleaved_order_control]: 2.02999e-06 [reorder_send_recv_between_fp_bp]: 2.83e-06 [comm_op_add_attrs]: 1.35001e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.43002e-06 [overlap_opt_shard_grad_in_pipeline]: 2.11998e-06 [control_data_broadcast_order]: 1.321e-05 [grouped_pairwise_exchange_alltoall]: 1.78002e-06 [offloading_packed_experts]: 3.46001e-06 [overlap_recompute_and_grad_model_parallel]: 4.69998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.53e-06 [overlap_grad_ring_attention]: 3.73999e-06 [overlap_grad_flash_sp]: 3.061e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.73998e-06 [split_layernorm_comm]: 1.81e-06 [handle_group_info]: 1.29998e-06 [symbol_engine_optimizer]: 8.514e-05, [1] [Cycle 1]: 8.027e-05, [6] [build]: 4.1e-06 [elim_shapecalc]: 1.275e-05 [elim_not_effective]: 1.424e-05 [opt_reshape]: 7.26999e-06 [fold_const_symbol]: 9.99001e-06 [renormalize]: 2.19996e-07 [detach_backward]: 2.83e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 1.944e-05 [get_jit_bprop_graph]: 1.80001e-06 [rewriter_after_jit_bprop_graph]: 4.55999e-06 [opt_after_jit_grad]: 0.00049257 [validate]: 3.936e-05 Sums bootstrap : 0.000449s : 2.49% type_inference : 0.012615s : 70.05% event_method : 0.000019s : 0.11% auto_monad : 0.000086s : 0.48% graph_reusing : 0.000006s : 0.03% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.11% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000035s : 0.19% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000027s : 0.15% optimize.rewriter_before_opt_a : 0.000079s : 0.44% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000118s : 0.65% optimize.opt_a.loop_unroll : 0.000032s : 0.18% optimize.opt_a.a_1 : 0.000628s : 3.49% optimize.opt_a.with_stream_mark : 0.000033s : 0.18% optimize.opt_a.recompute_prepare : 0.000018s : 0.10% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000157s : 0.87% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.10% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000013s : 0.07% optimize.opt_a.merge_send_recv : 0.000016s : 0.09% optimize.opt_a.auto_parallel : 0.000014s : 0.08% optimize.opt_a.parallel : 0.000027s : 0.15% optimize.opt_a.flash_sp : 0.000013s : 0.07% optimize.opt_a.merge_comm : 0.000009s : 0.05% optimize.opt_a.allreduce_fusion : 0.000008s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.11% optimize.opt_a.virtual_dataset : 0.000013s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.07% optimize.opt_a.virtual_output : 0.000012s : 0.07% optimize.opt_a.merge_forward : 0.000007s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000018s : 0.10% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.16% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000021s : 0.12% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.05% optimize.opt_a.meta_fg_expand : 0.000005s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.02% optimize.opt_a.after_resolve : 0.000022s : 0.12% optimize.opt_a.a_after_grad : 0.000017s : 0.09% optimize.opt_a.renormalize : 0.000904s : 5.02% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.05% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.15% optimize.opt_a.cse : 0.000052s : 0.29% optimize.opt_a.a_3 : 0.000087s : 0.48% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000039s : 0.21% optimize.convert_after_rewriter : 0.000007s : 0.04% optimize.order_py_execute_after_rewriter : 0.000006s : 0.03% optimize.mutable_eliminate : 0.000547s : 3.04% optimize.opt_b.b_1 : 0.000123s : 0.68% optimize.opt_b.b_2 : 0.000009s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.14% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.10% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000027s : 0.15% optimize.loop_unroll : 0.000476s : 2.64% optimize.opt_after_cconv.c_1 : 0.000029s : 0.16% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000024s : 0.13% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.08% optimize.tuple_transform.d_1 : 0.000039s : 0.22% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000048s : 0.27% optimize.cse_after_recomputation.cse : 0.000012s : 0.06% optimize.environ_conv : 0.000005s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.03% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000013s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000003s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000031s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.07% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.08% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.06% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.11% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.03% opt_after_jit_grad : 0.000493s : 2.73% validate : 0.000039s : 0.22% Time group info: ------[substitution.] 0.000162 25 1.15% : 0.000002s : 2: substitution.elim_not_effective 0.84% : 0.000001s : 2: substitution.fold_const_symbol 3.52% : 0.000006s : 3: substitution.graph_param_transform 79.47% : 0.000128s : 6: substitution.inline 2.52% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.29% : 0.000005s : 4: substitution.remove_not_recompute_node 2.84% : 0.000005s : 2: substitution.replace_old_param 6.38% : 0.000010s : 2: substitution.switch_simplify ------[type_inference.] 0.012555 2 92.24% : 0.011581s : 1: type_inference.infer 7.76% : 0.000974s : 1: type_inference.specialize ------[replace.] 0.000090 8 47.69% : 0.000043s : 6: replace.inline 52.31% : 0.000047s : 2: replace.switch_simplify ------[match.] 0.000134 8 93.37% : 0.000125s : 6: match.inline 6.63% : 0.000009s : 2: match.switch_simplify ------[predicate.] 0.000177 996 0.91% : 0.000002s : 11: predicate.accumulaten_eliminater 1.06% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 6: predicate.addn_check_dump 1.19% : 0.000002s : 11: predicate.addn_zero_filter 0.84% : 0.000001s : 11: predicate.adjust_all_reduce_mul_add 2.13% : 0.000004s : 17: predicate.arithmetic_simplify 0.96% : 0.000002s : 11: predicate.cast_eliminate 0.56% : 0.000001s : 6: predicate.check_bprop_eliminate 0.53% : 0.000001s : 6: predicate.compare_switch_simplify 0.19% : 0.000000s : 3: predicate.const_output_eliminate 0.62% : 0.000001s : 6: predicate.depend_value_elim 0.96% : 0.000002s : 11: predicate.dict_get_item_const_eliminator 1.08% : 0.000002s : 11: predicate.dict_get_item_eliminator 0.98% : 0.000002s : 11: predicate.dict_set_item_eliminator 0.98% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 3: predicate.elim_not_effective 0.37% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.11% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.08% : 0.000002s : 14: predicate.environ_get_depend_swap 1.81% : 0.000003s : 20: predicate.environ_get_eliminate 1.07% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.37% : 0.000002s : 17: predicate.exchange_switch_depend_value 2.41% : 0.000004s : 17: predicate.float_depend_g_call 0.47% : 0.000001s : 6: predicate.float_environ_get_switch 0.72% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 3: predicate.fold_const_symbol 0.59% : 0.000001s : 6: predicate.get_grad_eliminate 0.21% : 0.000000s : 3: predicate.graph_param_transform 0.60% : 0.000001s : 6: predicate.incorporate_call 0.51% : 0.000001s : 6: predicate.incorporate_call_switch 6.61% : 0.000012s : 46: predicate.inline 0.81% : 0.000001s : 6: predicate.inline_without_move 0.30% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.13% : 0.000002s : 6: predicate.less_batch_normalization 1.47% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.18% : 0.000004s : 28: predicate.load_eliminater 1.26% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.49% : 0.000004s : 27: predicate.loop_unroll_before_grad 1.53% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 6: predicate.merge_addn 0.54% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.76% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.80% : 0.000001s : 11: predicate.minmaximum_grad 1.36% : 0.000002s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.48% : 0.000001s : 3: predicate.parallel_virtual_node 1.91% : 0.000003s : 17: predicate.partial_defer_inline 1.31% : 0.000002s : 14: predicate.partial_eliminate 0.89% : 0.000002s : 11: predicate.print_const_string_wrapper 0.62% : 0.000001s : 6: predicate.reduce_all_const_elim 1.20% : 0.000002s : 11: predicate.reduce_eliminate 2.21% : 0.000004s : 28: predicate.redundant_stop_gradient_eliminater 0.81% : 0.000001s : 6: predicate.remove_not_recompute_node 1.17% : 0.000002s : 17: predicate.replace_applicator 0.53% : 0.000001s : 6: predicate.replace_old_param 0.26% : 0.000000s : 3: predicate.reset_defer_inline 0.93% : 0.000002s : 11: predicate.reshape_eliminate 0.60% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 3: predicate.row_tensor_eliminate 0.82% : 0.000001s : 6: predicate.same_eliminate 0.52% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.22% : 0.000002s : 6: predicate.shard_identity_eliminate 0.65% : 0.000001s : 6: predicate.special_op_eliminate 0.68% : 0.000001s : 6: predicate.specialize_transform 0.88% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.65% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.55% : 0.000003s : 17: predicate.switch_defer_inline 2.16% : 0.000004s : 23: predicate.switch_layer_defer_inline 5.76% : 0.000010s : 57: predicate.switch_simplify 0.93% : 0.000002s : 11: predicate.tile_eliminate 0.94% : 0.000002s : 11: predicate.transpose_eliminate 1.50% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 2.89% : 0.000005s : 23: predicate.tuple_list_get_item_eliminator 1.40% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.54% : 0.000005s : 23: predicate.tuple_list_set_item_eliminator 1.44% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.21% : 0.000004s : 28: predicate.updatestate_pure_node_eliminater 2.79% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 3: predicate.value_based_eliminate 0.62% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 6: predicate.virtual_output_eliminate 0.27% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000669 10 51.35% : 0.000343s : 2: func_graph_cloner_run.FuncGraphClonerGraph 48.65% : 0.000325s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.032744 192 0.01% : 0.000004s : 1: ForceFp32Comm 9.83% : 0.003219s : 1: add_attr 9.80% : 0.003208s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.16% : 0.000053s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.28% : 0.000093s : 1: auto_monad 0.07% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.46% : 0.000480s : 1: bootstrap 0.10% : 0.000031s : 1: cconv 0.01% : 0.000005s : 1: comm_op_add_attrs 0.05% : 0.000017s : 1: control_data_broadcast_order 0.03% : 0.000011s : 1: convert_after_rewriter 0.08% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.08% : 0.000025s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000007s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 1.48% : 0.000485s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.70% : 0.000557s : 1: mutable_eliminate 0.02% : 0.000006s : 1: offloading_packed_experts 0.05% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 3.40% : 0.001114s : 78: opt.transform.opt_a 0.08% : 0.000027s : 1: opt.transform.opt_after_cconv 0.08% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.30% : 0.000098s : 28: opt.transform.opt_b 0.13% : 0.000043s : 2: opt.transform.opt_trans_graph 0.12% : 0.000039s : 4: opt.transform.symbol_engine_opt 9.05% : 0.002962s : 1: opt_a 0.34% : 0.000112s : 1: opt_after_cconv 1.53% : 0.000502s : 1: opt_after_jit_grad 0.67% : 0.000219s : 1: opt_b 15.62% : 0.005115s : 1: optimize 0.07% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.11% : 0.000036s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.12% : 0.000039s : 1: pre_auto_parallel 0.10% : 0.000032s : 1: py_interpret_to_execute 0.04% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000018s : 1: remove_dup_value 1.49% : 0.000489s : 1: renormalize.infer 1.23% : 0.000404s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000044s : 1: rewriter_after_opt_a 0.25% : 0.000083s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.27% : 0.000088s : 1: symbol_engine_optimizer 0.22% : 0.000073s : 1: tuple_transform 38.59% : 0.012634s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:03.336.619 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:03.336.898 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0239605, [21] [bootstrap]: 0.00044274 [type_inference]: 0.0127478 [event_method]: 2.036e-05 [auto_monad]: 9.233e-05 [graph_reusing]: 7.21999e-06 [inline]: 1.92999e-06 [add_attr]: 0.00320655, [1] [add_attr_with_inline]: 0.00319739, [1] [Cycle 1]: 8.13e-05, [2] [tag_attr]: 2.188e-05 [meta_addattr_fg_expand]: 5.99e-06 [parallel-infer-symbol]: 3.18998e-06 [pre_auto_parallel]: 3.433e-05 [insert-virtual-dataset]: 2.24001e-06 [parallel-infer-symbol-second]: 1.01997e-06 [dataset_repeat_opt]: 2.09e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.00616781, [53] [py_interpret_to_execute]: 3.054e-05 [rewriter_before_opt_a]: 8.626e-05 [opt_a]: 0.00357489, [2] [Cycle 1]: 0.00259495, [45] [expand_dump_flag]: 3.70998e-06 [switch_simplify]: 0.00010977 [loop_unroll]: 2.674e-05 [a_1]: 0.0005833 [with_stream_mark]: 1.837e-05 [recompute_prepare]: 1.244e-05 [updatestate_depend_eliminate]: 4.92999e-06 [updatestate_assign_eliminate]: 4.3e-06 [updatestate_loads_eliminate]: 3.38999e-06 [parameter_eliminate]: 2.01998e-06 [a_2]: 0.00012546 [accelerated_algorithm]: 8.53001e-06 [shard]: 3.04999e-06 [meta_shard_fg_expand]: 2.22999e-06 [shard_inline]: 7.63001e-06 [merge_send_recv]: 9.67999e-06 [auto_parallel]: 8.29002e-06 [parallel]: 1.891e-05 [flash_sp]: 9.34998e-06 [merge_comm]: 5.44998e-06 [allreduce_fusion]: 5.35999e-06 [matmul_add_comm_reduction]: 1.118e-05 [allreduce_slice_to_reducescatter]: 9.39996e-07 [virtual_shard_identity]: 1.282e-05 [virtual_dataset]: 9.15001e-06 [get_grad_eliminate_]: 7.28e-06 [virtual_output]: 7.98999e-06 [merge_forward]: 5.27001e-06 [cell_reuse_recompute_pass]: 1.25999e-06 [offload_activation]: 1.047e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.984e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.323e-05 [set_forward_comm_id_for_comm_node_pass]: 5.96e-06 [meta_fg_expand]: 3.9e-06 [flash_sp_send_recv_attached]: 2.42001e-06 [receive_attached]: 2.22999e-06 [after_resolve]: 1.181e-05 [a_after_grad]: 1.222e-05 [renormalize]: 0.00090943 [add_forward_monad_depend]: 6.69001e-06 [auto_monad_grad]: 2.53998e-06 [auto_monad_eliminator]: 1.932e-05 [cse]: 3.925e-05 [a_3]: 7.391e-05 [Cycle 2]: 0.00096477, [45] [expand_dump_flag]: 1.55999e-06 [switch_simplify]: 9.41e-06 [loop_unroll]: 7.77002e-06 [a_1]: 0.00015667 [with_stream_mark]: 1.67e-05 [recompute_prepare]: 9.76e-06 [updatestate_depend_eliminate]: 5.25001e-06 [updatestate_assign_eliminate]: 3.14001e-06 [updatestate_loads_eliminate]: 2.96001e-06 [parameter_eliminate]: 1.32e-06 [a_2]: 0.0001169 [accelerated_algorithm]: 8.38999e-06 [shard]: 1.74e-06 [meta_shard_fg_expand]: 1.86998e-06 [shard_inline]: 8.22003e-06 [merge_send_recv]: 7.03e-06 [auto_parallel]: 7.05e-06 [parallel]: 6.12999e-06 [flash_sp]: 3.67998e-06 [merge_comm]: 4.55999e-06 [allreduce_fusion]: 4.29002e-06 [matmul_add_comm_reduction]: 7.69002e-06 [allreduce_slice_to_reducescatter]: 9.20001e-07 [virtual_shard_identity]: 9.65002e-06 [virtual_dataset]: 7.8e-06 [get_grad_eliminate_]: 7.35e-06 [virtual_output]: 6.74999e-06 [merge_forward]: 4.18999e-06 [cell_reuse_recompute_pass]: 1.59e-06 [offload_activation]: 9.26002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.847e-05 [merge_recompute_call_nodes]: 1.05999e-06 [before_grad]: 1.26e-05 [set_forward_comm_id_for_comm_node_pass]: 6.21e-06 [meta_fg_expand]: 3.18e-06 [flash_sp_send_recv_attached]: 1.21002e-06 [receive_attached]: 1.49e-06 [after_resolve]: 1.116e-05 [a_after_grad]: 1.046e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.53998e-06 [auto_monad_grad]: 1.29e-06 [auto_monad_eliminator]: 1.122e-05 [cse]: 2.416e-05 [a_3]: 5.643e-05 [py_interpret_to_execute_after_opt_a]: 1.668e-05 [slice_cell_reuse_recomputed_activation]: 5.20999e-06 [rewriter_after_opt_a]: 5.011e-05 [convert_after_rewriter]: 1.114e-05 [order_py_execute_after_rewriter]: 9.74e-06 [mutable_eliminate]: 0.00057499 [opt_b]: 0.00031366, [1] [Cycle 1]: 0.00030231, [7] [b_1]: 0.0001894 [b_2]: 9.25999e-06 [updatestate_depend_eliminate]: 8.1e-06 [updatestate_assign_eliminate]: 3.24001e-06 [updatestate_loads_eliminate]: 3.06999e-06 [renormalize]: 6.09987e-07 [cse]: 2.978e-05 [optimize_parallel_all_gather_comm]: 2.183e-05 [overlap_param_gather]: 5.19e-06 [cconv]: 3.321e-05 [loop_unroll]: 0.0004853 [opt_after_cconv]: 0.0001539, [1] [Cycle 1]: 0.00014474, [7] [c_1]: 3.62e-05 [parameter_eliminate]: 4.08001e-06 [updatestate_depend_eliminate]: 8.33001e-06 [updatestate_assign_eliminate]: 3.3e-06 [updatestate_loads_eliminate]: 3.21001e-06 [cse]: 2.995e-05 [renormalize]: 6.89994e-07 [remove_dup_value]: 2.011e-05 [tuple_transform]: 0.00010487, [1] [Cycle 1]: 9.629e-05, [4] [d_1]: 4.923e-05 [none_parameter_eliminate]: 2.27999e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 9.37001e-06 [partial_unused_args_eliminate]: 5.19e-06 [add_recomputation]: 6.099e-05 [cse_after_recomputation]: 3.566e-05, [1] [Cycle 1]: 2.825e-05, [1] [cse]: 1.669e-05 [environ_conv]: 1.027e-05 [swap_dp_allreduce_reducescatter]: 2.543e-05 [bias_add_comm_swap]: 5.29e-06 [label_micro_interleaved_index]: 7.95e-06 [label_fine_grained_interleaved_index]: 4.99e-06 [merge_cast_opt]: 3.45e-06 [slice_recompute_activation]: 5.00999e-06 [micro_interleaved_order_control]: 5.42001e-06 [assign_add_opt]: 3.53e-06 [ForceFp32Comm]: 3.56001e-06 [remove_cast_before_assign_add]: 3.2e-06 [full_micro_interleaved_order_control]: 4.72e-06 [reorder_send_recv_between_fp_bp]: 5.82001e-06 [comm_op_add_attrs]: 3.2e-06 [add_comm_op_reuse_tag]: 3.23e-06 [interleave_split_concat_branches]: 3.36999e-06 [interleave_parallel_branches]: 3.31999e-06 [overlap_opt_shard_in_pipeline]: 3.31999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.28001e-06 [control_data_broadcast_order]: 1.989e-05 [grouped_pairwise_exchange_alltoall]: 4.23999e-06 [offloading_packed_experts]: 7.01001e-06 [overlap_recompute_and_grad_model_parallel]: 7.61001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.93999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.53999e-06 [overlap_recompute_comm]: 4.77e-06 [overlap_grad_ring_attention]: 7.42002e-06 [overlap_grad_flash_sp]: 2.566e-05 [begin_end_overlap_inline]: 3.16001e-06 [split_matmul_comm_elemetwise]: 4.68001e-06 [split_layernorm_comm]: 1.577e-05 [handle_group_info]: 3.58999e-06 [symbol_engine_optimizer]: 0.00011674, [1] [Cycle 1]: 0.00010828, [6] [build]: 4.29002e-06 [elim_shapecalc]: 1.468e-05 [elim_not_effective]: 1.703e-05 [opt_reshape]: 8.57e-06 [fold_const_symbol]: 1.248e-05 [renormalize]: 3.50003e-07 [detach_backward]: 4.64998e-06 [pipeline_parallel_scheduler]: 2.16e-06 [auto_monad_reorder]: 2.577e-05 [get_jit_bprop_graph]: 1.66e-06 [rewriter_after_jit_bprop_graph]: 4.82e-06 [opt_after_jit_grad]: 0.00051658 [validate]: 4.517e-05 Sums bootstrap : 0.000443s : 2.35% type_inference : 0.012748s : 67.62% event_method : 0.000020s : 0.11% auto_monad : 0.000092s : 0.49% graph_reusing : 0.000007s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.12% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000034s : 0.18% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.16% optimize.rewriter_before_opt_a : 0.000086s : 0.46% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000119s : 0.63% optimize.opt_a.loop_unroll : 0.000035s : 0.18% optimize.opt_a.a_1 : 0.000740s : 3.93% optimize.opt_a.with_stream_mark : 0.000035s : 0.19% optimize.opt_a.recompute_prepare : 0.000022s : 0.12% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000242s : 1.29% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.09% optimize.opt_a.shard : 0.000005s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000016s : 0.08% optimize.opt_a.merge_send_recv : 0.000017s : 0.09% optimize.opt_a.auto_parallel : 0.000015s : 0.08% optimize.opt_a.parallel : 0.000025s : 0.13% optimize.opt_a.flash_sp : 0.000013s : 0.07% optimize.opt_a.merge_comm : 0.000010s : 0.05% optimize.opt_a.allreduce_fusion : 0.000010s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.10% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.12% optimize.opt_a.virtual_dataset : 0.000017s : 0.09% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.08% optimize.opt_a.virtual_output : 0.000015s : 0.08% optimize.opt_a.merge_forward : 0.000009s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.10% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.20% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.14% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.06% optimize.opt_a.meta_fg_expand : 0.000007s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000023s : 0.12% optimize.opt_a.a_after_grad : 0.000023s : 0.12% optimize.opt_a.renormalize : 0.000910s : 4.82% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.05% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.16% optimize.opt_a.cse : 0.000063s : 0.34% optimize.opt_a.a_3 : 0.000130s : 0.69% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.03% optimize.rewriter_after_opt_a : 0.000050s : 0.27% optimize.convert_after_rewriter : 0.000011s : 0.06% optimize.order_py_execute_after_rewriter : 0.000010s : 0.05% optimize.mutable_eliminate : 0.000575s : 3.05% optimize.opt_b.b_1 : 0.000189s : 1.00% optimize.opt_b.b_2 : 0.000009s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.12% optimize.overlap_param_gather : 0.000005s : 0.03% optimize.cconv : 0.000033s : 0.18% optimize.loop_unroll : 0.000485s : 2.57% optimize.opt_after_cconv.c_1 : 0.000036s : 0.19% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000030s : 0.16% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.11% optimize.tuple_transform.d_1 : 0.000049s : 0.26% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.05% optimize.partial_unused_args_eliminate : 0.000005s : 0.03% optimize.add_recomputation : 0.000061s : 0.32% optimize.cse_after_recomputation.cse : 0.000017s : 0.09% optimize.environ_conv : 0.000010s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000025s : 0.13% optimize.bias_add_comm_swap : 0.000005s : 0.03% optimize.label_micro_interleaved_index : 0.000008s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.03% optimize.merge_cast_opt : 0.000003s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.03% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000004s : 0.02% optimize.remove_cast_before_assign_add : 0.000003s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.03% optimize.comm_op_add_attrs : 0.000003s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000003s : 0.02% optimize.interleave_parallel_branches : 0.000003s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000020s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000007s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000005s : 0.03% optimize.overlap_grad_ring_attention : 0.000007s : 0.04% optimize.overlap_grad_flash_sp : 0.000026s : 0.14% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000016s : 0.08% optimize.handle_group_info : 0.000004s : 0.02% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.09% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.05% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.07% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000005s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000026s : 0.14% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.03% opt_after_jit_grad : 0.000517s : 2.74% validate : 0.000045s : 0.24% Time group info: ------[substitution.] 0.000196 34 11.80% : 0.000023s : 2: substitution.cast_eliminate 1.14% : 0.000002s : 3: substitution.elim_not_effective 1.01% : 0.000002s : 3: substitution.fold_const_symbol 3.10% : 0.000006s : 4: substitution.graph_param_transform 70.22% : 0.000138s : 6: substitution.inline 2.52% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.12% : 0.000006s : 6: substitution.remove_not_recompute_node 1.82% : 0.000004s : 2: substitution.replace_old_param 5.27% : 0.000010s : 2: substitution.switch_simplify ------[type_inference.] 0.012686 2 92.21% : 0.011698s : 1: type_inference.infer 7.79% : 0.000988s : 1: type_inference.specialize ------[replace.] 0.000091 8 49.34% : 0.000045s : 6: replace.inline 50.66% : 0.000046s : 2: replace.switch_simplify ------[match.] 0.000142 8 93.82% : 0.000133s : 6: match.inline 6.18% : 0.000009s : 2: match.switch_simplify ------[predicate.] 0.000211 1222 0.91% : 0.000002s : 13: predicate.accumulaten_eliminater 0.80% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 8: predicate.addn_check_dump 0.97% : 0.000002s : 13: predicate.addn_zero_filter 0.86% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.25% : 0.000005s : 21: predicate.arithmetic_simplify 1.16% : 0.000002s : 13: predicate.cast_eliminate 0.66% : 0.000001s : 8: predicate.check_bprop_eliminate 0.59% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.62% : 0.000001s : 8: predicate.depend_value_elim 0.93% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.97% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.31% : 0.000001s : 4: predicate.elim_not_effective 0.40% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 17: predicate.environ_get_depend_swap 1.75% : 0.000004s : 25: predicate.environ_get_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.33% : 0.000003s : 19: predicate.exchange_switch_depend_value 2.10% : 0.000004s : 19: predicate.float_depend_g_call 0.58% : 0.000001s : 8: predicate.float_environ_get_switch 0.81% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.68% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.65% : 0.000001s : 8: predicate.incorporate_call 0.53% : 0.000001s : 8: predicate.incorporate_call_switch 6.91% : 0.000015s : 56: predicate.inline 0.88% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.88% : 0.000002s : 8: predicate.less_batch_normalization 1.69% : 0.000004s : 21: predicate.list_to_tuple_eliminator_ 2.26% : 0.000005s : 34: predicate.load_eliminater 1.15% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.23% : 0.000005s : 29: predicate.loop_unroll_before_grad 1.61% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.60% : 0.000001s : 8: predicate.merge_addn 0.59% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 13: predicate.minmaximum_grad 1.20% : 0.000003s : 4: predicate.mutable_eliminate 0.34% : 0.000001s : 4: predicate.opt_reshape 0.37% : 0.000001s : 4: predicate.parallel_virtual_node 1.66% : 0.000004s : 19: predicate.partial_defer_inline 1.36% : 0.000003s : 17: predicate.partial_eliminate 1.05% : 0.000002s : 13: predicate.print_const_string_wrapper 0.56% : 0.000001s : 8: predicate.reduce_all_const_elim 1.17% : 0.000002s : 13: predicate.reduce_eliminate 2.19% : 0.000005s : 34: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 8: predicate.remove_not_recompute_node 1.16% : 0.000002s : 21: predicate.replace_applicator 0.60% : 0.000001s : 8: predicate.replace_old_param 0.26% : 0.000001s : 4: predicate.reset_defer_inline 0.91% : 0.000002s : 13: predicate.reshape_eliminate 0.72% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.72% : 0.000002s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.97% : 0.000002s : 8: predicate.shard_identity_eliminate 0.71% : 0.000001s : 8: predicate.special_op_eliminate 0.83% : 0.000002s : 8: predicate.specialize_transform 0.86% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.39% : 0.000003s : 19: predicate.switch_defer_inline 1.97% : 0.000004s : 27: predicate.switch_layer_defer_inline 5.57% : 0.000012s : 64: predicate.switch_simplify 1.18% : 0.000003s : 13: predicate.tile_eliminate 0.96% : 0.000002s : 13: predicate.transpose_eliminate 1.64% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.97% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.92% : 0.000006s : 29: predicate.tuple_list_get_item_eliminator 1.55% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.41% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.50% : 0.000003s : 21: predicate.tuple_to_list_eliminator_ 2.17% : 0.000005s : 34: predicate.updatestate_pure_node_eliminater 3.00% : 0.000006s : 42: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 4: predicate.value_based_eliminate 0.86% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.64% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000670 10 50.39% : 0.000337s : 2: func_graph_cloner_run.FuncGraphClonerGraph 49.61% : 0.000332s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.035746 192 0.02% : 0.000006s : 1: ForceFp32Comm 9.00% : 0.003216s : 1: add_attr 8.96% : 0.003201s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.19% : 0.000067s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.29% : 0.000102s : 1: auto_monad 0.10% : 0.000034s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.36% : 0.000487s : 1: bootstrap 0.10% : 0.000037s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.06% : 0.000023s : 1: control_data_broadcast_order 0.04% : 0.000015s : 1: convert_after_rewriter 0.11% : 0.000039s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000027s : 1: detach_backward 0.04% : 0.000014s : 1: environ_conv 0.09% : 0.000031s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000014s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000011s : 1: label_micro_interleaved_index 1.38% : 0.000493s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 1.63% : 0.000583s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000019s : 1: opt.transform.mutable_eliminate 3.68% : 0.001314s : 78: opt.transform.opt_a 0.10% : 0.000035s : 1: opt.transform.opt_after_cconv 0.08% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.35% : 0.000126s : 28: opt.transform.opt_b 0.16% : 0.000056s : 2: opt.transform.opt_trans_graph 0.13% : 0.000048s : 4: opt.transform.symbol_engine_opt 10.01% : 0.003579s : 1: opt_a 0.44% : 0.000158s : 1: opt_after_cconv 1.48% : 0.000528s : 1: opt_after_jit_grad 0.89% : 0.000317s : 1: opt_b 18.30% : 0.006543s : 1: optimize 0.07% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.08% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.12% : 0.000042s : 1: pre_auto_parallel 0.10% : 0.000034s : 1: py_interpret_to_execute 0.06% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000024s : 1: remove_dup_value 1.40% : 0.000499s : 1: renormalize.infer 1.12% : 0.000401s : 1: renormalize.specialize 0.02% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000055s : 1: rewriter_after_opt_a 0.25% : 0.000091s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.05% : 0.000019s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.08% : 0.000030s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000120s : 1: symbol_engine_optimizer 0.30% : 0.000108s : 1: tuple_transform 35.75% : 0.012781s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:03.545.643 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0210369, [21] [bootstrap]: 0.00043132 [type_inference]: 0.0119374 [event_method]: 1.809e-05 [auto_monad]: 8.472e-05 [graph_reusing]: 6.58998e-06 [inline]: 2.34001e-06 [add_attr]: 0.00302899, [1] [add_attr_with_inline]: 0.00302064, [1] [Cycle 1]: 5.648e-05, [2] [tag_attr]: 1.975e-05 [meta_addattr_fg_expand]: 6.14001e-06 [parallel-infer-symbol]: 3.47002e-06 [pre_auto_parallel]: 3.161e-05 [insert-virtual-dataset]: 2.29999e-06 [parallel-infer-symbol-second]: 1.15001e-06 [dataset_repeat_opt]: 2.21e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00480517, [53] [py_interpret_to_execute]: 2.486e-05 [rewriter_before_opt_a]: 7.97e-05 [opt_a]: 0.00275947, [2] [Cycle 1]: 0.00204884, [45] [expand_dump_flag]: 3.33998e-06 [switch_simplify]: 0.00010306 [loop_unroll]: 2.644e-05 [a_1]: 0.00054885 [with_stream_mark]: 1.445e-05 [recompute_prepare]: 9.46e-06 [updatestate_depend_eliminate]: 4.82e-06 [updatestate_assign_eliminate]: 3.65998e-06 [updatestate_loads_eliminate]: 3.8e-06 [parameter_eliminate]: 2.41e-06 [a_2]: 9.754e-05 [accelerated_algorithm]: 7.98999e-06 [shard]: 2.19001e-06 [meta_shard_fg_expand]: 2.19999e-06 [shard_inline]: 7.86001e-06 [merge_send_recv]: 9.31998e-06 [auto_parallel]: 6.71e-06 [parallel]: 1.726e-05 [flash_sp]: 8.05999e-06 [merge_comm]: 4.65001e-06 [allreduce_fusion]: 3.93001e-06 [matmul_add_comm_reduction]: 9.78998e-06 [allreduce_slice_to_reducescatter]: 7.10017e-07 [virtual_shard_identity]: 9.29998e-06 [virtual_dataset]: 7.38e-06 [get_grad_eliminate_]: 7.12997e-06 [virtual_output]: 7.03998e-06 [merge_forward]: 4.08999e-06 [cell_reuse_recompute_pass]: 1.37999e-06 [offload_activation]: 9.96998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.438e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 1.229e-05 [set_forward_comm_id_for_comm_node_pass]: 4.20999e-06 [meta_fg_expand]: 3.53999e-06 [flash_sp_send_recv_attached]: 2.90998e-06 [receive_attached]: 2.61e-06 [after_resolve]: 1.102e-05 [a_after_grad]: 1.089e-05 [renormalize]: 0.0006787 [add_forward_monad_depend]: 5.29e-06 [auto_monad_grad]: 1.93002e-06 [auto_monad_eliminator]: 1.663e-05 [cse]: 3.586e-05 [a_3]: 5.577e-05 [Cycle 2]: 0.00070105, [45] [expand_dump_flag]: 1.12e-06 [switch_simplify]: 8.62998e-06 [loop_unroll]: 7.01999e-06 [a_1]: 0.0001514 [with_stream_mark]: 1.207e-05 [recompute_prepare]: 7.06001e-06 [updatestate_depend_eliminate]: 4.33001e-06 [updatestate_assign_eliminate]: 3.02002e-06 [updatestate_loads_eliminate]: 2.62001e-06 [parameter_eliminate]: 1.10999e-06 [a_2]: 8.647e-05 [accelerated_algorithm]: 7.08e-06 [shard]: 1.21002e-06 [meta_shard_fg_expand]: 1.74e-06 [shard_inline]: 7.16999e-06 [merge_send_recv]: 5.77001e-06 [auto_parallel]: 6.35002e-06 [parallel]: 4.52e-06 [flash_sp]: 3.55e-06 [merge_comm]: 3.76999e-06 [allreduce_fusion]: 3.89002e-06 [matmul_add_comm_reduction]: 6.51e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 7.88001e-06 [virtual_dataset]: 6.58998e-06 [get_grad_eliminate_]: 6.62002e-06 [virtual_output]: 6.23e-06 [merge_forward]: 3.17002e-06 [cell_reuse_recompute_pass]: 1.39998e-06 [offload_activation]: 7.17002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.367e-05 [merge_recompute_call_nodes]: 7.2e-07 [before_grad]: 1.105e-05 [set_forward_comm_id_for_comm_node_pass]: 4.50999e-06 [meta_fg_expand]: 2.69001e-06 [flash_sp_send_recv_attached]: 8.00006e-07 [receive_attached]: 9.99979e-07 [after_resolve]: 9.29e-06 [a_after_grad]: 9.75002e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.14998e-06 [auto_monad_grad]: 8.89995e-07 [auto_monad_eliminator]: 8.28999e-06 [cse]: 1.784e-05 [a_3]: 4.298e-05 [py_interpret_to_execute_after_opt_a]: 9.05001e-06 [slice_cell_reuse_recomputed_activation]: 1.91998e-06 [rewriter_after_opt_a]: 3.999e-05 [convert_after_rewriter]: 7.70998e-06 [order_py_execute_after_rewriter]: 5.95002e-06 [mutable_eliminate]: 0.00048633 [opt_b]: 0.0002279, [1] [Cycle 1]: 0.00022233, [7] [b_1]: 0.00014459 [b_2]: 8.70999e-06 [updatestate_depend_eliminate]: 5.65001e-06 [updatestate_assign_eliminate]: 2.96001e-06 [updatestate_loads_eliminate]: 2.84001e-06 [renormalize]: 4.00003e-07 [cse]: 2.228e-05 [optimize_parallel_all_gather_comm]: 1.671e-05 [overlap_param_gather]: 1.86e-06 [cconv]: 2.326e-05 [loop_unroll]: 0.00042414 [opt_after_cconv]: 0.00011244, [1] [Cycle 1]: 0.00010673, [7] [c_1]: 3.529e-05 [parameter_eliminate]: 2.45002e-06 [updatestate_depend_eliminate]: 5.68997e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 2.87002e-06 [cse]: 2.253e-05 [renormalize]: 4.49974e-07 [remove_dup_value]: 1.713e-05 [tuple_transform]: 7.743e-05, [1] [Cycle 1]: 7.304e-05, [4] [d_1]: 4.587e-05 [none_parameter_eliminate]: 1.57001e-06 [renormalize]: 2.60014e-07 [switch_simplify]: 7.56999e-06 [partial_unused_args_eliminate]: 1.69e-06 [add_recomputation]: 5.411e-05 [cse_after_recomputation]: 2.411e-05, [1] [Cycle 1]: 1.98e-05, [1] [cse]: 1.458e-05 [environ_conv]: 5.57001e-06 [swap_dp_allreduce_reducescatter]: 5.67999e-06 [bias_add_comm_swap]: 3.11999e-06 [label_micro_interleaved_index]: 4.19002e-06 [label_fine_grained_interleaved_index]: 2.61999e-06 [merge_cast_opt]: 1.22999e-06 [slice_recompute_activation]: 1.90001e-06 [micro_interleaved_order_control]: 2.53003e-06 [assign_add_opt]: 1.15999e-06 [ForceFp32Comm]: 1.14e-06 [remove_cast_before_assign_add]: 1.27e-06 [full_micro_interleaved_order_control]: 2.24001e-06 [reorder_send_recv_between_fp_bp]: 2.91e-06 [comm_op_add_attrs]: 9.30013e-07 [add_comm_op_reuse_tag]: 9.19972e-07 [interleave_split_concat_branches]: 1.10999e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.14e-06 [overlap_opt_shard_grad_in_pipeline]: 1.66e-06 [control_data_broadcast_order]: 1.433e-05 [grouped_pairwise_exchange_alltoall]: 1.44e-06 [offloading_packed_experts]: 3.83999e-06 [overlap_recompute_and_grad_model_parallel]: 5.17e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30999e-06 [overlap_recompute_comm]: 2.58e-06 [overlap_grad_ring_attention]: 4.25e-06 [overlap_grad_flash_sp]: 1.953e-05 [begin_end_overlap_inline]: 5.29981e-07 [split_matmul_comm_elemetwise]: 1.96e-06 [split_layernorm_comm]: 1.63002e-06 [handle_group_info]: 9.50007e-07 [symbol_engine_optimizer]: 8.259e-05, [1] [Cycle 1]: 7.821e-05, [6] [build]: 3.23e-06 [elim_shapecalc]: 1.109e-05 [elim_not_effective]: 1.517e-05 [opt_reshape]: 8.07998e-06 [fold_const_symbol]: 1.234e-05 [renormalize]: 2.30008e-07 [detach_backward]: 1.72999e-06 [pipeline_parallel_scheduler]: 1.69e-06 [auto_monad_reorder]: 1.943e-05 [get_jit_bprop_graph]: 1.20001e-06 [rewriter_after_jit_bprop_graph]: 3.68e-06 [opt_after_jit_grad]: 0.00046578 [validate]: 4.028e-05 Sums bootstrap : 0.000431s : 2.53% type_inference : 0.011937s : 69.97% event_method : 0.000018s : 0.11% auto_monad : 0.000085s : 0.50% graph_reusing : 0.000007s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.12% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000032s : 0.19% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000025s : 0.15% optimize.rewriter_before_opt_a : 0.000080s : 0.47% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000112s : 0.65% optimize.opt_a.loop_unroll : 0.000033s : 0.20% optimize.opt_a.a_1 : 0.000700s : 4.10% optimize.opt_a.with_stream_mark : 0.000027s : 0.16% optimize.opt_a.recompute_prepare : 0.000017s : 0.10% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.04% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000184s : 1.08% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.09% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000015s : 0.09% optimize.opt_a.merge_send_recv : 0.000015s : 0.09% optimize.opt_a.auto_parallel : 0.000013s : 0.08% optimize.opt_a.parallel : 0.000022s : 0.13% optimize.opt_a.flash_sp : 0.000012s : 0.07% optimize.opt_a.merge_comm : 0.000008s : 0.05% optimize.opt_a.allreduce_fusion : 0.000008s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.10% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.10% optimize.opt_a.virtual_dataset : 0.000014s : 0.08% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.08% optimize.opt_a.virtual_output : 0.000013s : 0.08% optimize.opt_a.merge_forward : 0.000007s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.10% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.16% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000023s : 0.14% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.05% optimize.opt_a.meta_fg_expand : 0.000006s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000020s : 0.12% optimize.opt_a.a_after_grad : 0.000021s : 0.12% optimize.opt_a.renormalize : 0.000679s : 3.98% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.04% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.15% optimize.opt_a.cse : 0.000054s : 0.31% optimize.opt_a.a_3 : 0.000099s : 0.58% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000040s : 0.23% optimize.convert_after_rewriter : 0.000008s : 0.05% optimize.order_py_execute_after_rewriter : 0.000006s : 0.03% optimize.mutable_eliminate : 0.000486s : 2.85% optimize.opt_b.b_1 : 0.000145s : 0.85% optimize.opt_b.b_2 : 0.000009s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.13% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.10% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000023s : 0.14% optimize.loop_unroll : 0.000424s : 2.49% optimize.opt_after_cconv.c_1 : 0.000035s : 0.21% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000023s : 0.13% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.10% optimize.tuple_transform.d_1 : 0.000046s : 0.27% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.04% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000054s : 0.32% optimize.cse_after_recomputation.cse : 0.000015s : 0.09% optimize.environ_conv : 0.000006s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.03% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000020s : 0.11% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.07% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.09% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.05% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.07% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.11% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000466s : 2.73% validate : 0.000040s : 0.24% Time group info: ------[substitution.] 0.000169 34 10.33% : 0.000017s : 2: substitution.cast_eliminate 1.27% : 0.000002s : 3: substitution.elim_not_effective 0.97% : 0.000002s : 3: substitution.fold_const_symbol 3.46% : 0.000006s : 4: substitution.graph_param_transform 71.15% : 0.000121s : 6: substitution.inline 2.28% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.22% : 0.000005s : 6: substitution.remove_not_recompute_node 1.61% : 0.000003s : 2: substitution.replace_old_param 5.71% : 0.000010s : 2: substitution.switch_simplify ------[type_inference.] 0.011888 2 92.46% : 0.010992s : 1: type_inference.infer 7.54% : 0.000897s : 1: type_inference.specialize ------[replace.] 0.000081 8 48.28% : 0.000039s : 6: replace.inline 51.72% : 0.000042s : 2: replace.switch_simplify ------[match.] 0.000125 8 93.45% : 0.000117s : 6: match.inline 6.55% : 0.000008s : 2: match.switch_simplify ------[predicate.] 0.000204 1222 1.00% : 0.000002s : 13: predicate.accumulaten_eliminater 0.85% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 8: predicate.addn_check_dump 0.98% : 0.000002s : 13: predicate.addn_zero_filter 0.86% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.17% : 0.000004s : 21: predicate.arithmetic_simplify 1.16% : 0.000002s : 13: predicate.cast_eliminate 0.65% : 0.000001s : 8: predicate.check_bprop_eliminate 0.54% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.64% : 0.000001s : 8: predicate.depend_value_elim 0.97% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.10% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.88% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.17% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.15% : 0.000002s : 17: predicate.environ_get_depend_swap 1.78% : 0.000004s : 25: predicate.environ_get_eliminate 1.14% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.37% : 0.000003s : 19: predicate.exchange_switch_depend_value 2.34% : 0.000005s : 19: predicate.float_depend_g_call 0.56% : 0.000001s : 8: predicate.float_environ_get_switch 0.80% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.70% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.70% : 0.000001s : 8: predicate.incorporate_call 0.56% : 0.000001s : 8: predicate.incorporate_call_switch 6.61% : 0.000013s : 56: predicate.inline 0.82% : 0.000002s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.87% : 0.000002s : 8: predicate.less_batch_normalization 1.72% : 0.000004s : 21: predicate.list_to_tuple_eliminator_ 2.33% : 0.000005s : 34: predicate.load_eliminater 0.92% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.19% : 0.000004s : 29: predicate.loop_unroll_before_grad 1.67% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.61% : 0.000001s : 8: predicate.merge_addn 0.68% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 13: predicate.minmaximum_grad 1.00% : 0.000002s : 4: predicate.mutable_eliminate 0.38% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 1.69% : 0.000003s : 19: predicate.partial_defer_inline 1.40% : 0.000003s : 17: predicate.partial_eliminate 0.93% : 0.000002s : 13: predicate.print_const_string_wrapper 0.69% : 0.000001s : 8: predicate.reduce_all_const_elim 1.21% : 0.000002s : 13: predicate.reduce_eliminate 2.33% : 0.000005s : 34: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 8: predicate.remove_not_recompute_node 1.21% : 0.000002s : 21: predicate.replace_applicator 0.55% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 1.01% : 0.000002s : 13: predicate.reshape_eliminate 0.77% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 4: predicate.row_tensor_eliminate 0.75% : 0.000002s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.79% : 0.000002s : 8: predicate.shard_identity_eliminate 0.73% : 0.000001s : 8: predicate.special_op_eliminate 0.79% : 0.000002s : 8: predicate.specialize_transform 0.90% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.49% : 0.000003s : 19: predicate.switch_defer_inline 2.04% : 0.000004s : 27: predicate.switch_layer_defer_inline 5.69% : 0.000012s : 64: predicate.switch_simplify 0.93% : 0.000002s : 13: predicate.tile_eliminate 0.95% : 0.000002s : 13: predicate.transpose_eliminate 1.58% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.79% : 0.000006s : 29: predicate.tuple_list_get_item_eliminator 1.52% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.48% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.53% : 0.000003s : 21: predicate.tuple_to_list_eliminator_ 2.24% : 0.000005s : 34: predicate.updatestate_pure_node_eliminater 3.00% : 0.000006s : 42: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 4: predicate.value_based_eliminate 0.69% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 8: predicate.virtual_output_eliminate 0.35% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000603 10 54.07% : 0.000326s : 2: func_graph_cloner_run.FuncGraphClonerGraph 45.93% : 0.000277s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030959 192 0.01% : 0.000004s : 1: ForceFp32Comm 9.80% : 0.003034s : 1: add_attr 9.77% : 0.003024s : 1: add_attr_with_inline 0.01% : 0.000003s : 1: add_comm_op_reuse_tag 0.19% : 0.000058s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.29% : 0.000090s : 1: auto_monad 0.07% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.49% : 0.000461s : 1: bootstrap 0.09% : 0.000027s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.09% : 0.000028s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.08% : 0.000024s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000005s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.40% : 0.000432s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.60% : 0.000495s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000017s : 1: opt.transform.mutable_eliminate 3.99% : 0.001235s : 78: opt.transform.opt_a 0.11% : 0.000034s : 1: opt.transform.opt_after_cconv 0.09% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000123s : 28: opt.transform.opt_b 0.17% : 0.000051s : 2: opt.transform.opt_trans_graph 0.14% : 0.000043s : 4: opt.transform.symbol_engine_opt 8.92% : 0.002763s : 1: opt_a 0.37% : 0.000116s : 1: opt_after_cconv 1.53% : 0.000475s : 1: opt_after_jit_grad 0.75% : 0.000232s : 1: opt_b 15.53% : 0.004809s : 1: optimize 0.07% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.07% : 0.000023s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.12% : 0.000036s : 1: pre_auto_parallel 0.09% : 0.000029s : 1: py_interpret_to_execute 0.04% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000021s : 1: remove_dup_value 1.13% : 0.000350s : 1: renormalize.infer 1.04% : 0.000321s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000044s : 1: rewriter_after_opt_a 0.27% : 0.000084s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000004s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.28% : 0.000086s : 1: symbol_engine_optimizer 0.26% : 0.000081s : 1: tuple_transform 38.61% : 0.011953s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:03.742.934 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:03.743.194 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0231206, [21] [bootstrap]: 0.00043203 [type_inference]: 0.0126262 [event_method]: 1.8e-05 [auto_monad]: 0.00010107 [graph_reusing]: 6.78998e-06 [inline]: 2.39999e-06 [add_attr]: 0.00310342, [1] [add_attr_with_inline]: 0.00309476, [1] [Cycle 1]: 7.278e-05, [2] [tag_attr]: 1.932e-05 [meta_addattr_fg_expand]: 6.34999e-06 [parallel-infer-symbol]: 2.81e-06 [pre_auto_parallel]: 3.286e-05 [insert-virtual-dataset]: 2.52001e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 1.82001e-06 [pipeline_split]: 1.57999e-06 [optimize]: 0.00562528, [53] [py_interpret_to_execute]: 2.956e-05 [rewriter_before_opt_a]: 8.208e-05 [opt_a]: 0.00324349, [2] [Cycle 1]: 0.00234398, [45] [expand_dump_flag]: 3.23e-06 [switch_simplify]: 0.00010544 [loop_unroll]: 2.638e-05 [a_1]: 0.00057632 [with_stream_mark]: 1.605e-05 [recompute_prepare]: 9.56998e-06 [updatestate_depend_eliminate]: 4.46002e-06 [updatestate_assign_eliminate]: 3.91001e-06 [updatestate_loads_eliminate]: 3.54002e-06 [parameter_eliminate]: 1.99999e-06 [a_2]: 0.00012649 [accelerated_algorithm]: 8.3e-06 [shard]: 2.29999e-06 [meta_shard_fg_expand]: 2.16e-06 [shard_inline]: 7.35e-06 [merge_send_recv]: 9.79e-06 [auto_parallel]: 7.13e-06 [parallel]: 1.831e-05 [flash_sp]: 8.07e-06 [merge_comm]: 4.69002e-06 [allreduce_fusion]: 4.47e-06 [matmul_add_comm_reduction]: 9.77999e-06 [allreduce_slice_to_reducescatter]: 5.90022e-07 [virtual_shard_identity]: 9.34e-06 [virtual_dataset]: 7.33e-06 [get_grad_eliminate_]: 6.98998e-06 [virtual_output]: 7.27002e-06 [merge_forward]: 4.15e-06 [cell_reuse_recompute_pass]: 1.12999e-06 [offload_activation]: 1.071e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.657e-05 [merge_recompute_call_nodes]: 1.40001e-06 [before_grad]: 1.368e-05 [set_forward_comm_id_for_comm_node_pass]: 4.52e-06 [meta_fg_expand]: 3.51999e-06 [flash_sp_send_recv_attached]: 2.59999e-06 [receive_attached]: 2.19001e-06 [after_resolve]: 1.022e-05 [a_after_grad]: 1.184e-05 [renormalize]: 0.00075362 [add_forward_monad_depend]: 5.53002e-06 [auto_monad_grad]: 2.34001e-06 [auto_monad_eliminator]: 1.63e-05 [cse]: 3.603e-05 [a_3]: 7.053e-05 [Cycle 2]: 0.00088651, [45] [expand_dump_flag]: 1.11997e-06 [switch_simplify]: 8.57e-06 [loop_unroll]: 7.16999e-06 [a_1]: 0.00015334 [with_stream_mark]: 1.193e-05 [recompute_prepare]: 7.43e-06 [updatestate_depend_eliminate]: 4.60001e-06 [updatestate_assign_eliminate]: 3.19001e-06 [updatestate_loads_eliminate]: 2.98e-06 [parameter_eliminate]: 1.24e-06 [a_2]: 0.00011481 [accelerated_algorithm]: 7.09001e-06 [shard]: 1.16002e-06 [meta_shard_fg_expand]: 1.59e-06 [shard_inline]: 7.3e-06 [merge_send_recv]: 6.29999e-06 [auto_parallel]: 6.11998e-06 [parallel]: 5.15001e-06 [flash_sp]: 3.3e-06 [merge_comm]: 4.17e-06 [allreduce_fusion]: 3.85998e-06 [matmul_add_comm_reduction]: 6.89999e-06 [allreduce_slice_to_reducescatter]: 4.19997e-07 [virtual_shard_identity]: 8e-06 [virtual_dataset]: 7.02002e-06 [get_grad_eliminate_]: 6.56e-06 [virtual_output]: 6.71e-06 [merge_forward]: 3.5e-06 [cell_reuse_recompute_pass]: 1.55999e-06 [offload_activation]: 7.21001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.565e-05 [merge_recompute_call_nodes]: 7.39994e-07 [before_grad]: 1.124e-05 [set_forward_comm_id_for_comm_node_pass]: 4.47e-06 [meta_fg_expand]: 2.88e-06 [flash_sp_send_recv_attached]: 8.39995e-07 [receive_attached]: 1.30001e-06 [after_resolve]: 9.89999e-06 [a_after_grad]: 1.056e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.34998e-06 [auto_monad_grad]: 1.20999e-06 [auto_monad_eliminator]: 8.85999e-06 [cse]: 1.895e-05 [a_3]: 5.653e-05 [py_interpret_to_execute_after_opt_a]: 1.314e-05 [slice_cell_reuse_recomputed_activation]: 4.52e-06 [rewriter_after_opt_a]: 4.409e-05 [convert_after_rewriter]: 1.02e-05 [order_py_execute_after_rewriter]: 8.90001e-06 [mutable_eliminate]: 0.00050045 [opt_b]: 0.00029846, [1] [Cycle 1]: 0.00028949, [7] [b_1]: 0.00018792 [b_2]: 9.16998e-06 [updatestate_depend_eliminate]: 5.84e-06 [updatestate_assign_eliminate]: 2.98e-06 [updatestate_loads_eliminate]: 2.81999e-06 [renormalize]: 7.59988e-07 [cse]: 2.384e-05 [optimize_parallel_all_gather_comm]: 2.039e-05 [overlap_param_gather]: 5.35999e-06 [cconv]: 2.713e-05 [loop_unroll]: 0.00047716 [opt_after_cconv]: 0.00013849, [1] [Cycle 1]: 0.00012964, [7] [c_1]: 3.456e-05 [parameter_eliminate]: 2.73e-06 [updatestate_depend_eliminate]: 5.97001e-06 [updatestate_assign_eliminate]: 3.56999e-06 [updatestate_loads_eliminate]: 3.00998e-06 [cse]: 2.358e-05 [renormalize]: 4.20026e-07 [remove_dup_value]: 2.002e-05 [tuple_transform]: 9.559e-05, [1] [Cycle 1]: 8.846e-05, [4] [d_1]: 4.73e-05 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 8.38999e-06 [partial_unused_args_eliminate]: 4.27998e-06 [add_recomputation]: 5.825e-05 [cse_after_recomputation]: 3.217e-05, [1] [Cycle 1]: 2.536e-05, [1] [cse]: 1.569e-05 [environ_conv]: 9.04e-06 [swap_dp_allreduce_reducescatter]: 8.79e-06 [bias_add_comm_swap]: 5.12e-06 [label_micro_interleaved_index]: 6.67002e-06 [label_fine_grained_interleaved_index]: 5.02999e-06 [merge_cast_opt]: 4.01001e-06 [slice_recompute_activation]: 4.42998e-06 [micro_interleaved_order_control]: 4.63001e-06 [assign_add_opt]: 3.95e-06 [ForceFp32Comm]: 2.99001e-06 [remove_cast_before_assign_add]: 3.21001e-06 [full_micro_interleaved_order_control]: 4.91002e-06 [reorder_send_recv_between_fp_bp]: 5.51998e-06 [comm_op_add_attrs]: 3.31999e-06 [add_comm_op_reuse_tag]: 3.16999e-06 [interleave_split_concat_branches]: 3.38e-06 [interleave_parallel_branches]: 3.35998e-06 [overlap_opt_shard_in_pipeline]: 3.96001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.67e-06 [control_data_broadcast_order]: 1.887e-05 [grouped_pairwise_exchange_alltoall]: 4.08999e-06 [offloading_packed_experts]: 6.76e-06 [overlap_recompute_and_grad_model_parallel]: 7.94002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.57002e-06 [overlap_recompute_allgather_and_fa_grad]: 4.04002e-06 [overlap_recompute_comm]: 5.20001e-06 [overlap_grad_ring_attention]: 6.77002e-06 [overlap_grad_flash_sp]: 2.465e-05 [begin_end_overlap_inline]: 2.89999e-06 [split_matmul_comm_elemetwise]: 4.47e-06 [split_layernorm_comm]: 4.18001e-06 [handle_group_info]: 3.30998e-06 [symbol_engine_optimizer]: 0.0001028, [1] [Cycle 1]: 9.62e-05, [6] [build]: 2.96999e-06 [elim_shapecalc]: 1.088e-05 [elim_not_effective]: 1.536e-05 [opt_reshape]: 8.1e-06 [fold_const_symbol]: 1.263e-05 [renormalize]: 2.30008e-07 [detach_backward]: 3.6e-06 [pipeline_parallel_scheduler]: 1.88997e-06 [auto_monad_reorder]: 2.292e-05 [get_jit_bprop_graph]: 1.42e-06 [rewriter_after_jit_bprop_graph]: 4.76002e-06 [opt_after_jit_grad]: 0.00048542 [validate]: 3.91e-05 Sums bootstrap : 0.000432s : 2.37% type_inference : 0.012626s : 69.19% event_method : 0.000018s : 0.10% auto_monad : 0.000101s : 0.55% graph_reusing : 0.000007s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.11% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000033s : 0.18% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.16% optimize.rewriter_before_opt_a : 0.000082s : 0.45% optimize.opt_a.expand_dump_flag : 0.000004s : 0.02% optimize.opt_a.switch_simplify : 0.000114s : 0.62% optimize.opt_a.loop_unroll : 0.000034s : 0.18% optimize.opt_a.a_1 : 0.000730s : 4.00% optimize.opt_a.with_stream_mark : 0.000028s : 0.15% optimize.opt_a.recompute_prepare : 0.000017s : 0.09% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000241s : 1.32% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.08% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000015s : 0.08% optimize.opt_a.merge_send_recv : 0.000016s : 0.09% optimize.opt_a.auto_parallel : 0.000013s : 0.07% optimize.opt_a.parallel : 0.000023s : 0.13% optimize.opt_a.flash_sp : 0.000011s : 0.06% optimize.opt_a.merge_comm : 0.000009s : 0.05% optimize.opt_a.allreduce_fusion : 0.000008s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.10% optimize.opt_a.virtual_dataset : 0.000014s : 0.08% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.07% optimize.opt_a.virtual_output : 0.000014s : 0.08% optimize.opt_a.merge_forward : 0.000008s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000018s : 0.10% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.18% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.14% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.05% optimize.opt_a.meta_fg_expand : 0.000006s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.02% optimize.opt_a.after_resolve : 0.000020s : 0.11% optimize.opt_a.a_after_grad : 0.000022s : 0.12% optimize.opt_a.renormalize : 0.000754s : 4.13% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.14% optimize.opt_a.cse : 0.000055s : 0.30% optimize.opt_a.a_3 : 0.000127s : 0.70% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.02% optimize.rewriter_after_opt_a : 0.000044s : 0.24% optimize.convert_after_rewriter : 0.000010s : 0.06% optimize.order_py_execute_after_rewriter : 0.000009s : 0.05% optimize.mutable_eliminate : 0.000500s : 2.74% optimize.opt_b.b_1 : 0.000188s : 1.03% optimize.opt_b.b_2 : 0.000009s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.13% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.11% optimize.overlap_param_gather : 0.000005s : 0.03% optimize.cconv : 0.000027s : 0.15% optimize.loop_unroll : 0.000477s : 2.61% optimize.opt_after_cconv.c_1 : 0.000035s : 0.19% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000024s : 0.13% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.11% optimize.tuple_transform.d_1 : 0.000047s : 0.26% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.05% optimize.partial_unused_args_eliminate : 0.000004s : 0.02% optimize.add_recomputation : 0.000058s : 0.32% optimize.cse_after_recomputation.cse : 0.000016s : 0.09% optimize.environ_conv : 0.000009s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.05% optimize.bias_add_comm_swap : 0.000005s : 0.03% optimize.label_micro_interleaved_index : 0.000007s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.03% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000004s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.03% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000003s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.03% optimize.comm_op_add_attrs : 0.000003s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000003s : 0.02% optimize.interleave_parallel_branches : 0.000003s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.03% optimize.control_data_broadcast_order : 0.000019s : 0.10% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000007s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000005s : 0.03% optimize.overlap_grad_ring_attention : 0.000007s : 0.04% optimize.overlap_grad_flash_sp : 0.000025s : 0.14% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.02% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.08% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.07% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000023s : 0.13% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.03% opt_after_jit_grad : 0.000485s : 2.66% validate : 0.000039s : 0.21% Time group info: ------[substitution.] 0.000202 34 10.10% : 0.000020s : 2: substitution.cast_eliminate 1.17% : 0.000002s : 3: substitution.elim_not_effective 0.93% : 0.000002s : 3: substitution.fold_const_symbol 3.05% : 0.000006s : 4: substitution.graph_param_transform 73.42% : 0.000148s : 6: substitution.inline 2.29% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.74% : 0.000006s : 6: substitution.remove_not_recompute_node 1.49% : 0.000003s : 2: substitution.replace_old_param 4.80% : 0.000010s : 2: substitution.switch_simplify ------[type_inference.] 0.012571 2 92.55% : 0.011634s : 1: type_inference.infer 7.45% : 0.000937s : 1: type_inference.specialize ------[replace.] 0.000081 8 46.35% : 0.000038s : 6: replace.inline 53.65% : 0.000043s : 2: replace.switch_simplify ------[match.] 0.000153 8 94.69% : 0.000145s : 6: match.inline 5.31% : 0.000008s : 2: match.switch_simplify ------[predicate.] 0.000208 1222 0.94% : 0.000002s : 13: predicate.accumulaten_eliminater 0.72% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 8: predicate.addn_check_dump 1.11% : 0.000002s : 13: predicate.addn_zero_filter 0.82% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.29% : 0.000005s : 21: predicate.arithmetic_simplify 1.04% : 0.000002s : 13: predicate.cast_eliminate 0.69% : 0.000001s : 8: predicate.check_bprop_eliminate 0.57% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.61% : 0.000001s : 8: predicate.depend_value_elim 0.99% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.15% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.85% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.95% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.45% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.11% : 0.000002s : 17: predicate.environ_get_depend_swap 1.72% : 0.000004s : 25: predicate.environ_get_eliminate 1.13% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 19: predicate.exchange_switch_depend_value 2.19% : 0.000005s : 19: predicate.float_depend_g_call 0.57% : 0.000001s : 8: predicate.float_environ_get_switch 0.90% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.67% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.66% : 0.000001s : 8: predicate.incorporate_call 0.53% : 0.000001s : 8: predicate.incorporate_call_switch 6.76% : 0.000014s : 56: predicate.inline 0.86% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 8: predicate.less_batch_normalization 1.52% : 0.000003s : 21: predicate.list_to_tuple_eliminator_ 2.27% : 0.000005s : 34: predicate.load_eliminater 1.00% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.12% : 0.000004s : 29: predicate.loop_unroll_before_grad 1.72% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.65% : 0.000001s : 8: predicate.merge_addn 0.53% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.68% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 13: predicate.minmaximum_grad 0.92% : 0.000002s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.37% : 0.000001s : 4: predicate.parallel_virtual_node 1.71% : 0.000004s : 19: predicate.partial_defer_inline 1.38% : 0.000003s : 17: predicate.partial_eliminate 0.95% : 0.000002s : 13: predicate.print_const_string_wrapper 0.62% : 0.000001s : 8: predicate.reduce_all_const_elim 1.38% : 0.000003s : 13: predicate.reduce_eliminate 2.28% : 0.000005s : 34: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 8: predicate.remove_not_recompute_node 1.16% : 0.000002s : 21: predicate.replace_applicator 0.48% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 1.03% : 0.000002s : 13: predicate.reshape_eliminate 0.65% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.70% : 0.000001s : 4: predicate.row_tensor_eliminate 0.71% : 0.000001s : 8: predicate.same_eliminate 0.43% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 8: predicate.shard_identity_eliminate 0.78% : 0.000002s : 8: predicate.special_op_eliminate 0.89% : 0.000002s : 8: predicate.specialize_transform 0.83% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.49% : 0.000003s : 19: predicate.switch_defer_inline 2.06% : 0.000004s : 27: predicate.switch_layer_defer_inline 5.43% : 0.000011s : 64: predicate.switch_simplify 0.99% : 0.000002s : 13: predicate.tile_eliminate 0.97% : 0.000002s : 13: predicate.transpose_eliminate 1.59% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.64% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.73% : 0.000006s : 29: predicate.tuple_list_get_item_eliminator 1.62% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.51% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.55% : 0.000003s : 21: predicate.tuple_to_list_eliminator_ 2.28% : 0.000005s : 34: predicate.updatestate_pure_node_eliminater 3.05% : 0.000006s : 42: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 0.64% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.66% : 0.000001s : 8: predicate.virtual_output_eliminate 0.29% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.60% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000624 10 52.33% : 0.000326s : 2: func_graph_cloner_run.FuncGraphClonerGraph 47.67% : 0.000297s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.034053 192 0.02% : 0.000006s : 1: ForceFp32Comm 9.14% : 0.003112s : 1: add_attr 9.10% : 0.003099s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.18% : 0.000062s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.33% : 0.000111s : 1: auto_monad 0.09% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.40% : 0.000475s : 1: bootstrap 0.09% : 0.000030s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.06% : 0.000022s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.10% : 0.000035s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000021s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.08% : 0.000027s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.02% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000014s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.02% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.42% : 0.000483s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 1.49% : 0.000507s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.05% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000015s : 1: opt.transform.mutable_eliminate 3.75% : 0.001276s : 78: opt.transform.opt_a 0.10% : 0.000033s : 1: opt.transform.opt_after_cconv 0.08% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.37% : 0.000126s : 28: opt.transform.opt_b 0.16% : 0.000053s : 2: opt.transform.opt_trans_graph 0.13% : 0.000043s : 4: opt.transform.symbol_engine_opt 9.53% : 0.003247s : 1: opt_a 0.42% : 0.000142s : 1: opt_after_cconv 1.46% : 0.000495s : 1: opt_after_jit_grad 0.89% : 0.000302s : 1: opt_b 17.60% : 0.005994s : 1: optimize 0.07% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000012s : 1: order_py_execute_after_rewriter 0.08% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000009s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.02% : 0.000008s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.12% : 0.000041s : 1: pre_auto_parallel 0.10% : 0.000033s : 1: py_interpret_to_execute 0.05% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000023s : 1: remove_dup_value 1.17% : 0.000398s : 1: renormalize.infer 1.02% : 0.000348s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000048s : 1: rewriter_after_opt_a 0.25% : 0.000086s : 1: rewriter_before_opt_a 0.02% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.03% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.31% : 0.000106s : 1: symbol_engine_optimizer 0.29% : 0.000099s : 1: tuple_transform 37.17% : 0.012656s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:03.940.616 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0218557, [21] [bootstrap]: 0.00045888 [type_inference]: 0.0124074 [event_method]: 1.745e-05 [auto_monad]: 8.681e-05 [graph_reusing]: 6.94999e-06 [inline]: 2.26998e-06 [add_attr]: 0.00306899, [1] [add_attr_with_inline]: 0.00306029, [1] [Cycle 1]: 7.775e-05, [2] [tag_attr]: 3.878e-05 [meta_addattr_fg_expand]: 6.04001e-06 [parallel-infer-symbol]: 2.95002e-06 [pre_auto_parallel]: 3.166e-05 [insert-virtual-dataset]: 2.64999e-06 [parallel-infer-symbol-second]: 6.69999e-07 [dataset_repeat_opt]: 1.72999e-06 [pipeline_split]: 2.16e-06 [optimize]: 0.00504414, [53] [py_interpret_to_execute]: 2.444e-05 [rewriter_before_opt_a]: 8.021e-05 [opt_a]: 0.00286047, [2] [Cycle 1]: 0.00212033, [45] [expand_dump_flag]: 3.27002e-06 [switch_simplify]: 0.00010473 [loop_unroll]: 2.548e-05 [a_1]: 0.00054586 [with_stream_mark]: 1.57e-05 [recompute_prepare]: 1.009e-05 [updatestate_depend_eliminate]: 4.71997e-06 [updatestate_assign_eliminate]: 4.08001e-06 [updatestate_loads_eliminate]: 4.32e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 9.657e-05 [accelerated_algorithm]: 8.33999e-06 [shard]: 2.47001e-06 [meta_shard_fg_expand]: 2.24001e-06 [shard_inline]: 7.45e-06 [merge_send_recv]: 9.26998e-06 [auto_parallel]: 7.21999e-06 [parallel]: 1.715e-05 [flash_sp]: 8.85999e-06 [merge_comm]: 4.80001e-06 [allreduce_fusion]: 4.34002e-06 [matmul_add_comm_reduction]: 1.006e-05 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 8.82e-06 [virtual_dataset]: 8.12e-06 [get_grad_eliminate_]: 6.87002e-06 [virtual_output]: 7.16001e-06 [merge_forward]: 4.42e-06 [cell_reuse_recompute_pass]: 1.15001e-06 [offload_activation]: 9.99001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.556e-05 [merge_recompute_call_nodes]: 1.40999e-06 [before_grad]: 1.243e-05 [set_forward_comm_id_for_comm_node_pass]: 4.30999e-06 [meta_fg_expand]: 3.53e-06 [flash_sp_send_recv_attached]: 2.79999e-06 [receive_attached]: 1.89999e-06 [after_resolve]: 1.12e-05 [a_after_grad]: 1.128e-05 [renormalize]: 0.00073784 [add_forward_monad_depend]: 5.39998e-06 [auto_monad_grad]: 2.12001e-06 [auto_monad_eliminator]: 1.638e-05 [cse]: 3.758e-05 [a_3]: 5.738e-05 [Cycle 2]: 0.00073022, [45] [expand_dump_flag]: 1.32e-06 [switch_simplify]: 8.92e-06 [loop_unroll]: 7.41999e-06 [a_1]: 0.00015361 [with_stream_mark]: 1.298e-05 [recompute_prepare]: 7.71001e-06 [updatestate_depend_eliminate]: 4.07998e-06 [updatestate_assign_eliminate]: 3.09001e-06 [updatestate_loads_eliminate]: 2.86e-06 [parameter_eliminate]: 1.10001e-06 [a_2]: 8.74e-05 [accelerated_algorithm]: 7.55e-06 [shard]: 1.35001e-06 [meta_shard_fg_expand]: 1.87001e-06 [shard_inline]: 7.66999e-06 [merge_send_recv]: 6.49001e-06 [auto_parallel]: 6.55002e-06 [parallel]: 5.25001e-06 [flash_sp]: 3.41999e-06 [merge_comm]: 3.81001e-06 [allreduce_fusion]: 3.7e-06 [matmul_add_comm_reduction]: 8.18001e-06 [allreduce_slice_to_reducescatter]: 3.89991e-07 [virtual_shard_identity]: 7.9e-06 [virtual_dataset]: 6.80002e-06 [get_grad_eliminate_]: 6.57002e-06 [virtual_output]: 6.44001e-06 [merge_forward]: 3.13e-06 [cell_reuse_recompute_pass]: 1.56998e-06 [offload_activation]: 7.46001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.379e-05 [merge_recompute_call_nodes]: 1.05001e-06 [before_grad]: 1.115e-05 [set_forward_comm_id_for_comm_node_pass]: 4.34002e-06 [meta_fg_expand]: 3.32002e-06 [flash_sp_send_recv_attached]: 9.70002e-07 [receive_attached]: 1.02e-06 [after_resolve]: 9.86998e-06 [a_after_grad]: 1.018e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.86998e-06 [auto_monad_grad]: 1.18001e-06 [auto_monad_eliminator]: 8.65001e-06 [cse]: 2.211e-05 [a_3]: 4.422e-05 [py_interpret_to_execute_after_opt_a]: 1.1e-05 [slice_cell_reuse_recomputed_activation]: 1.86003e-06 [rewriter_after_opt_a]: 4.391e-05 [convert_after_rewriter]: 7.45e-06 [order_py_execute_after_rewriter]: 6.09001e-06 [mutable_eliminate]: 0.00051339 [opt_b]: 0.00023879, [1] [Cycle 1]: 0.00023252, [7] [b_1]: 0.00014564 [b_2]: 9.31e-06 [updatestate_depend_eliminate]: 7.46001e-06 [updatestate_assign_eliminate]: 3.21001e-06 [updatestate_loads_eliminate]: 3.01001e-06 [renormalize]: 7.10017e-07 [cse]: 2.753e-05 [optimize_parallel_all_gather_comm]: 4.679e-05 [overlap_param_gather]: 2.21e-06 [cconv]: 2.714e-05 [loop_unroll]: 0.00044027 [opt_after_cconv]: 0.00012186, [1] [Cycle 1]: 0.00011546, [7] [c_1]: 3.397e-05 [parameter_eliminate]: 3.6e-06 [updatestate_depend_eliminate]: 6.84999e-06 [updatestate_assign_eliminate]: 3.21999e-06 [updatestate_loads_eliminate]: 2.84001e-06 [cse]: 2.68e-05 [renormalize]: 7.2e-07 [remove_dup_value]: 1.778e-05 [tuple_transform]: 8.872e-05, [1] [Cycle 1]: 8.35e-05, [4] [d_1]: 5.26e-05 [none_parameter_eliminate]: 1.78997e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.87999e-06 [partial_unused_args_eliminate]: 1.79e-06 [add_recomputation]: 5.854e-05 [cse_after_recomputation]: 2.645e-05, [1] [Cycle 1]: 2.163e-05, [1] [cse]: 1.577e-05 [environ_conv]: 6.16998e-06 [swap_dp_allreduce_reducescatter]: 5.52001e-06 [bias_add_comm_swap]: 2.59001e-06 [label_micro_interleaved_index]: 4.45e-06 [label_fine_grained_interleaved_index]: 2.64999e-06 [merge_cast_opt]: 1.24e-06 [slice_recompute_activation]: 2.29001e-06 [micro_interleaved_order_control]: 2.23998e-06 [assign_add_opt]: 1.13001e-06 [ForceFp32Comm]: 8.00006e-07 [remove_cast_before_assign_add]: 1.00999e-06 [full_micro_interleaved_order_control]: 2.63e-06 [reorder_send_recv_between_fp_bp]: 2.61e-06 [comm_op_add_attrs]: 1.02998e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.08001e-06 [overlap_opt_shard_in_pipeline]: 1.69e-06 [overlap_opt_shard_grad_in_pipeline]: 2.04e-06 [control_data_broadcast_order]: 1.494e-05 [grouped_pairwise_exchange_alltoall]: 1.56998e-06 [offloading_packed_experts]: 3.98001e-06 [overlap_recompute_and_grad_model_parallel]: 4.63999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30999e-06 [overlap_recompute_comm]: 2.29001e-06 [overlap_grad_ring_attention]: 4.2e-06 [overlap_grad_flash_sp]: 2.101e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.28002e-06 [split_layernorm_comm]: 2.02999e-06 [handle_group_info]: 9.80013e-07 [symbol_engine_optimizer]: 8.871e-05, [1] [Cycle 1]: 8.414e-05, [6] [build]: 3.13e-06 [elim_shapecalc]: 1.221e-05 [elim_not_effective]: 1.621e-05 [opt_reshape]: 9.29e-06 [fold_const_symbol]: 1.288e-05 [renormalize]: 2.80008e-07 [detach_backward]: 2.11e-06 [pipeline_parallel_scheduler]: 1.72999e-06 [auto_monad_reorder]: 2.008e-05 [get_jit_bprop_graph]: 1.74e-06 [rewriter_after_jit_bprop_graph]: 4.63999e-06 [opt_after_jit_grad]: 0.00049761 [validate]: 4.29e-05 Sums bootstrap : 0.000459s : 2.58% type_inference : 0.012407s : 69.63% event_method : 0.000017s : 0.10% auto_monad : 0.000087s : 0.49% graph_reusing : 0.000007s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000039s : 0.22% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000032s : 0.18% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000024s : 0.14% optimize.rewriter_before_opt_a : 0.000080s : 0.45% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000114s : 0.64% optimize.opt_a.loop_unroll : 0.000033s : 0.18% optimize.opt_a.a_1 : 0.000699s : 3.93% optimize.opt_a.with_stream_mark : 0.000029s : 0.16% optimize.opt_a.recompute_prepare : 0.000018s : 0.10% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000184s : 1.03% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.09% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000015s : 0.08% optimize.opt_a.merge_send_recv : 0.000016s : 0.09% optimize.opt_a.auto_parallel : 0.000014s : 0.08% optimize.opt_a.parallel : 0.000022s : 0.13% optimize.opt_a.flash_sp : 0.000012s : 0.07% optimize.opt_a.merge_comm : 0.000009s : 0.05% optimize.opt_a.allreduce_fusion : 0.000008s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.10% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.09% optimize.opt_a.virtual_dataset : 0.000015s : 0.08% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.08% optimize.opt_a.virtual_output : 0.000014s : 0.08% optimize.opt_a.merge_forward : 0.000008s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.10% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.16% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000024s : 0.13% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.05% optimize.opt_a.meta_fg_expand : 0.000007s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.02% optimize.opt_a.after_resolve : 0.000021s : 0.12% optimize.opt_a.a_after_grad : 0.000021s : 0.12% optimize.opt_a.renormalize : 0.000738s : 4.14% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.04% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.14% optimize.opt_a.cse : 0.000060s : 0.33% optimize.opt_a.a_3 : 0.000102s : 0.57% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.06% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000044s : 0.25% optimize.convert_after_rewriter : 0.000007s : 0.04% optimize.order_py_execute_after_rewriter : 0.000006s : 0.03% optimize.mutable_eliminate : 0.000513s : 2.88% optimize.opt_b.b_1 : 0.000146s : 0.82% optimize.opt_b.b_2 : 0.000009s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000028s : 0.15% optimize.optimize_parallel_all_gather_comm : 0.000047s : 0.26% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000027s : 0.15% optimize.loop_unroll : 0.000440s : 2.47% optimize.opt_after_cconv.c_1 : 0.000034s : 0.19% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000027s : 0.15% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.10% optimize.tuple_transform.d_1 : 0.000053s : 0.30% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.05% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000059s : 0.33% optimize.cse_after_recomputation.cse : 0.000016s : 0.09% optimize.environ_conv : 0.000006s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.03% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000015s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000021s : 0.12% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.07% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.09% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.05% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.07% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.11% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.03% opt_after_jit_grad : 0.000498s : 2.79% validate : 0.000043s : 0.24% Time group info: ------[substitution.] 0.000174 34 11.01% : 0.000019s : 2: substitution.cast_eliminate 1.30% : 0.000002s : 3: substitution.elim_not_effective 1.18% : 0.000002s : 3: substitution.fold_const_symbol 3.52% : 0.000006s : 4: substitution.graph_param_transform 69.88% : 0.000121s : 6: substitution.inline 2.49% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.42% : 0.000006s : 6: substitution.remove_not_recompute_node 1.64% : 0.000003s : 2: substitution.replace_old_param 5.56% : 0.000010s : 2: substitution.switch_simplify ------[type_inference.] 0.012356 2 92.20% : 0.011393s : 1: type_inference.infer 7.80% : 0.000963s : 1: type_inference.specialize ------[replace.] 0.000082 8 47.38% : 0.000039s : 6: replace.inline 52.62% : 0.000043s : 2: replace.switch_simplify ------[match.] 0.000126 8 93.56% : 0.000118s : 6: match.inline 6.44% : 0.000008s : 2: match.switch_simplify ------[predicate.] 0.000207 1222 1.11% : 0.000002s : 13: predicate.accumulaten_eliminater 1.02% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 8: predicate.addn_check_dump 0.96% : 0.000002s : 13: predicate.addn_zero_filter 0.87% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.27% : 0.000005s : 21: predicate.arithmetic_simplify 1.01% : 0.000002s : 13: predicate.cast_eliminate 0.61% : 0.000001s : 8: predicate.check_bprop_eliminate 0.56% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.66% : 0.000001s : 8: predicate.depend_value_elim 0.95% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.08% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.94% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.02% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 4: predicate.elim_not_effective 0.43% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.13% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.14% : 0.000002s : 17: predicate.environ_get_depend_swap 1.80% : 0.000004s : 25: predicate.environ_get_eliminate 1.11% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.31% : 0.000003s : 19: predicate.exchange_switch_depend_value 2.16% : 0.000004s : 19: predicate.float_depend_g_call 0.57% : 0.000001s : 8: predicate.float_environ_get_switch 0.91% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.72% : 0.000001s : 8: predicate.get_grad_eliminate 0.24% : 0.000000s : 4: predicate.graph_param_transform 0.71% : 0.000001s : 8: predicate.incorporate_call 0.58% : 0.000001s : 8: predicate.incorporate_call_switch 6.43% : 0.000013s : 56: predicate.inline 0.81% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.86% : 0.000002s : 8: predicate.less_batch_normalization 1.62% : 0.000003s : 21: predicate.list_to_tuple_eliminator_ 2.23% : 0.000005s : 34: predicate.load_eliminater 0.90% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.09% : 0.000004s : 29: predicate.loop_unroll_before_grad 1.67% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.60% : 0.000001s : 8: predicate.merge_addn 0.60% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.69% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 13: predicate.minmaximum_grad 1.16% : 0.000002s : 4: predicate.mutable_eliminate 0.43% : 0.000001s : 4: predicate.opt_reshape 0.48% : 0.000001s : 4: predicate.parallel_virtual_node 1.79% : 0.000004s : 19: predicate.partial_defer_inline 1.36% : 0.000003s : 17: predicate.partial_eliminate 1.00% : 0.000002s : 13: predicate.print_const_string_wrapper 0.55% : 0.000001s : 8: predicate.reduce_all_const_elim 1.11% : 0.000002s : 13: predicate.reduce_eliminate 2.23% : 0.000005s : 34: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 8: predicate.remove_not_recompute_node 1.17% : 0.000002s : 21: predicate.replace_applicator 0.50% : 0.000001s : 8: predicate.replace_old_param 0.29% : 0.000001s : 4: predicate.reset_defer_inline 0.97% : 0.000002s : 13: predicate.reshape_eliminate 0.62% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.50% : 0.000001s : 4: predicate.row_tensor_eliminate 0.79% : 0.000002s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.88% : 0.000002s : 8: predicate.shard_identity_eliminate 0.76% : 0.000002s : 8: predicate.special_op_eliminate 0.75% : 0.000002s : 8: predicate.specialize_transform 0.89% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.50% : 0.000003s : 19: predicate.switch_defer_inline 1.98% : 0.000004s : 27: predicate.switch_layer_defer_inline 5.45% : 0.000011s : 64: predicate.switch_simplify 0.92% : 0.000002s : 13: predicate.tile_eliminate 0.97% : 0.000002s : 13: predicate.transpose_eliminate 1.63% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.10% : 0.000006s : 29: predicate.tuple_list_get_item_eliminator 1.61% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.45% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.61% : 0.000003s : 21: predicate.tuple_to_list_eliminator_ 2.19% : 0.000005s : 34: predicate.updatestate_pure_node_eliminater 2.89% : 0.000006s : 42: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 4: predicate.value_based_eliminate 0.70% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.66% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000637 10 51.47% : 0.000328s : 2: func_graph_cloner_run.FuncGraphClonerGraph 48.53% : 0.000309s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.032142 192 0.01% : 0.000004s : 1: ForceFp32Comm 9.56% : 0.003074s : 1: add_attr 9.53% : 0.003064s : 1: add_attr_with_inline 0.01% : 0.000003s : 1: add_comm_op_reuse_tag 0.19% : 0.000062s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.29% : 0.000093s : 1: auto_monad 0.08% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.51% : 0.000486s : 1: bootstrap 0.10% : 0.000031s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000018s : 1: control_data_broadcast_order 0.03% : 0.000010s : 1: convert_after_rewriter 0.09% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000010s : 1: environ_conv 0.08% : 0.000024s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000011s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.05% : 0.000016s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.40% : 0.000449s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.63% : 0.000523s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 3.86% : 0.001242s : 78: opt.transform.opt_a 0.10% : 0.000032s : 1: opt.transform.opt_after_cconv 0.09% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.38% : 0.000124s : 28: opt.transform.opt_b 0.18% : 0.000058s : 2: opt.transform.opt_trans_graph 0.14% : 0.000046s : 4: opt.transform.symbol_engine_opt 8.91% : 0.002864s : 1: opt_a 0.39% : 0.000126s : 1: opt_after_cconv 1.58% : 0.000508s : 1: opt_after_jit_grad 0.75% : 0.000242s : 1: opt_b 15.71% : 0.005049s : 1: optimize 0.16% : 0.000051s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.08% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000006s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.11% : 0.000036s : 1: pre_auto_parallel 0.09% : 0.000028s : 1: py_interpret_to_execute 0.04% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000021s : 1: remove_dup_value 1.20% : 0.000385s : 1: renormalize.infer 1.07% : 0.000345s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000048s : 1: rewriter_after_opt_a 0.26% : 0.000084s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.29% : 0.000092s : 1: symbol_engine_optimizer 0.28% : 0.000091s : 1: tuple_transform 38.66% : 0.012425s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:04.138.605 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:04.138.865 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0235893, [21] [bootstrap]: 0.00045423 [type_inference]: 0.0128556 [event_method]: 1.892e-05 [auto_monad]: 8.886e-05 [graph_reusing]: 6.99001e-06 [inline]: 2.24001e-06 [add_attr]: 0.00334365, [1] [add_attr_with_inline]: 0.00333355, [1] [Cycle 1]: 9.009e-05, [2] [tag_attr]: 2.191e-05 [meta_addattr_fg_expand]: 6.37001e-06 [parallel-infer-symbol]: 3.59002e-06 [pre_auto_parallel]: 3.904e-05 [insert-virtual-dataset]: 2.49001e-06 [parallel-infer-symbol-second]: 8.49977e-07 [dataset_repeat_opt]: 2.54999e-06 [pipeline_split]: 1.69e-06 [optimize]: 0.00561862, [53] [py_interpret_to_execute]: 3.547e-05 [rewriter_before_opt_a]: 9.149e-05 [opt_a]: 0.00336002, [2] [Cycle 1]: 0.00255138, [45] [expand_dump_flag]: 3.7e-06 [switch_simplify]: 0.00012061 [loop_unroll]: 3.016e-05 [a_1]: 0.00060829 [with_stream_mark]: 1.987e-05 [recompute_prepare]: 1.203e-05 [updatestate_depend_eliminate]: 5.14e-06 [updatestate_assign_eliminate]: 4e-06 [updatestate_loads_eliminate]: 3.25002e-06 [parameter_eliminate]: 2.21e-06 [a_2]: 0.00012465 [accelerated_algorithm]: 9.22999e-06 [shard]: 2.20002e-06 [meta_shard_fg_expand]: 2.16e-06 [shard_inline]: 7.21001e-06 [merge_send_recv]: 9.68002e-06 [auto_parallel]: 7.71999e-06 [parallel]: 1.95e-05 [flash_sp]: 9.36e-06 [merge_comm]: 5.12e-06 [allreduce_fusion]: 3.86001e-06 [matmul_add_comm_reduction]: 1.028e-05 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 1.033e-05 [virtual_dataset]: 7.24001e-06 [get_grad_eliminate_]: 6.34999e-06 [virtual_output]: 7.85e-06 [merge_forward]: 4.15999e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 1.066e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.811e-05 [merge_recompute_call_nodes]: 1.74998e-06 [before_grad]: 1.181e-05 [set_forward_comm_id_for_comm_node_pass]: 4.4e-06 [meta_fg_expand]: 3.09999e-06 [flash_sp_send_recv_attached]: 2.81999e-06 [receive_attached]: 2.22001e-06 [after_resolve]: 1.045e-05 [a_after_grad]: 9.72999e-06 [renormalize]: 0.00086016 [add_forward_monad_depend]: 5.94999e-06 [auto_monad_grad]: 2.73e-06 [auto_monad_eliminator]: 1.669e-05 [cse]: 2.958e-05 [a_3]: 6.224e-05 [Cycle 2]: 0.00079417, [45] [expand_dump_flag]: 1.29e-06 [switch_simplify]: 7.55e-06 [loop_unroll]: 6.20002e-06 [a_1]: 0.00010653 [with_stream_mark]: 1.152e-05 [recompute_prepare]: 6.39001e-06 [updatestate_depend_eliminate]: 3.31001e-06 [updatestate_assign_eliminate]: 2.53003e-06 [updatestate_loads_eliminate]: 2.74001e-06 [parameter_eliminate]: 1.50999e-06 [a_2]: 0.00010641 [accelerated_algorithm]: 6.26e-06 [shard]: 1.14998e-06 [meta_shard_fg_expand]: 1.64e-06 [shard_inline]: 6.66e-06 [merge_send_recv]: 5.32001e-06 [auto_parallel]: 6.07001e-06 [parallel]: 6.34001e-06 [flash_sp]: 3.41001e-06 [merge_comm]: 3.36001e-06 [allreduce_fusion]: 3.33e-06 [matmul_add_comm_reduction]: 6.72002e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 6.78998e-06 [virtual_dataset]: 5.73002e-06 [get_grad_eliminate_]: 5.53002e-06 [virtual_output]: 5.35999e-06 [merge_forward]: 2.74001e-06 [cell_reuse_recompute_pass]: 1.46002e-06 [offload_activation]: 7.61001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.452e-05 [merge_recompute_call_nodes]: 8.59989e-07 [before_grad]: 9.74e-06 [set_forward_comm_id_for_comm_node_pass]: 4.07e-06 [meta_fg_expand]: 2.09e-06 [flash_sp_send_recv_attached]: 9.80013e-07 [receive_attached]: 9.80013e-07 [after_resolve]: 9.00999e-06 [a_after_grad]: 8.50999e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.54e-06 [auto_monad_grad]: 9.39996e-07 [auto_monad_eliminator]: 7.00998e-06 [cse]: 1.428e-05 [a_3]: 4.791e-05 [py_interpret_to_execute_after_opt_a]: 1.286e-05 [slice_cell_reuse_recomputed_activation]: 4.85999e-06 [rewriter_after_opt_a]: 4.203e-05 [convert_after_rewriter]: 9.27001e-06 [order_py_execute_after_rewriter]: 8.65999e-06 [mutable_eliminate]: 0.00051367 [opt_b]: 0.00026182, [1] [Cycle 1]: 0.00025322, [7] [b_1]: 0.00016052 [b_2]: 7.75e-06 [updatestate_depend_eliminate]: 5.20999e-06 [updatestate_assign_eliminate]: 2.51998e-06 [updatestate_loads_eliminate]: 2.19999e-06 [renormalize]: 4.60015e-07 [cse]: 1.805e-05 [optimize_parallel_all_gather_comm]: 1.921e-05 [overlap_param_gather]: 5.07999e-06 [cconv]: 2.783e-05 [loop_unroll]: 0.00043341 [opt_after_cconv]: 0.00012051, [1] [Cycle 1]: 0.00011206, [7] [c_1]: 2.775e-05 [parameter_eliminate]: 2.48e-06 [updatestate_depend_eliminate]: 4.86002e-06 [updatestate_assign_eliminate]: 2.49999e-06 [updatestate_loads_eliminate]: 2.12001e-06 [cse]: 1.724e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.732e-05 [tuple_transform]: 8.356e-05, [1] [Cycle 1]: 7.7e-05, [4] [d_1]: 3.816e-05 [none_parameter_eliminate]: 1.77001e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 6.44999e-06 [partial_unused_args_eliminate]: 5.09e-06 [add_recomputation]: 4.772e-05 [cse_after_recomputation]: 2.701e-05, [1] [Cycle 1]: 2.039e-05, [1] [cse]: 1.131e-05 [environ_conv]: 7.5e-06 [swap_dp_allreduce_reducescatter]: 7.82e-06 [bias_add_comm_swap]: 4.88001e-06 [label_micro_interleaved_index]: 7.15e-06 [label_fine_grained_interleaved_index]: 5.91998e-06 [merge_cast_opt]: 3.63e-06 [slice_recompute_activation]: 4.49002e-06 [micro_interleaved_order_control]: 5.27001e-06 [assign_add_opt]: 3.35e-06 [ForceFp32Comm]: 3.01999e-06 [remove_cast_before_assign_add]: 3.76001e-06 [full_micro_interleaved_order_control]: 4.45999e-06 [reorder_send_recv_between_fp_bp]: 5.61e-06 [comm_op_add_attrs]: 3.2e-06 [add_comm_op_reuse_tag]: 3.33998e-06 [interleave_split_concat_branches]: 3.4e-06 [interleave_parallel_branches]: 3.9e-06 [overlap_opt_shard_in_pipeline]: 3.89002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.17998e-06 [control_data_broadcast_order]: 1.495e-05 [grouped_pairwise_exchange_alltoall]: 4e-06 [offloading_packed_experts]: 6.76e-06 [overlap_recompute_and_grad_model_parallel]: 6.84999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.4e-06 [overlap_recompute_allgather_and_fa_grad]: 3.58e-06 [overlap_recompute_comm]: 4.93001e-06 [overlap_grad_ring_attention]: 1.702e-05 [overlap_grad_flash_sp]: 2.188e-05 [begin_end_overlap_inline]: 2.98e-06 [split_matmul_comm_elemetwise]: 4.37e-06 [split_layernorm_comm]: 4.13999e-06 [handle_group_info]: 3.43e-06 [symbol_engine_optimizer]: 9.527e-05, [1] [Cycle 1]: 8.842e-05, [6] [build]: 3.08e-06 [elim_shapecalc]: 1.005e-05 [elim_not_effective]: 1.287e-05 [opt_reshape]: 6.68e-06 [fold_const_symbol]: 9.82999e-06 [renormalize]: 2.69996e-07 [detach_backward]: 3.7e-06 [pipeline_parallel_scheduler]: 1.86e-06 [auto_monad_reorder]: 1.785e-05 [get_jit_bprop_graph]: 1.71e-06 [rewriter_after_jit_bprop_graph]: 4.18999e-06 [opt_after_jit_grad]: 0.00047393 [validate]: 3.751e-05 Sums bootstrap : 0.000454s : 2.46% type_inference : 0.012856s : 69.73% event_method : 0.000019s : 0.10% auto_monad : 0.000089s : 0.48% graph_reusing : 0.000007s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.12% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000039s : 0.21% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000035s : 0.19% optimize.rewriter_before_opt_a : 0.000091s : 0.50% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000128s : 0.70% optimize.opt_a.loop_unroll : 0.000036s : 0.20% optimize.opt_a.a_1 : 0.000715s : 3.88% optimize.opt_a.with_stream_mark : 0.000031s : 0.17% optimize.opt_a.recompute_prepare : 0.000018s : 0.10% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000231s : 1.25% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.08% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000014s : 0.08% optimize.opt_a.merge_send_recv : 0.000015s : 0.08% optimize.opt_a.auto_parallel : 0.000014s : 0.07% optimize.opt_a.parallel : 0.000026s : 0.14% optimize.opt_a.flash_sp : 0.000013s : 0.07% optimize.opt_a.merge_comm : 0.000008s : 0.05% optimize.opt_a.allreduce_fusion : 0.000007s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.09% optimize.opt_a.virtual_dataset : 0.000013s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.06% optimize.opt_a.virtual_output : 0.000013s : 0.07% optimize.opt_a.merge_forward : 0.000007s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.10% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.18% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000022s : 0.12% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.05% optimize.opt_a.meta_fg_expand : 0.000005s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.02% optimize.opt_a.after_resolve : 0.000019s : 0.11% optimize.opt_a.a_after_grad : 0.000018s : 0.10% optimize.opt_a.renormalize : 0.000860s : 4.67% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.13% optimize.opt_a.cse : 0.000044s : 0.24% optimize.opt_a.a_3 : 0.000110s : 0.60% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.03% optimize.rewriter_after_opt_a : 0.000042s : 0.23% optimize.convert_after_rewriter : 0.000009s : 0.05% optimize.order_py_execute_after_rewriter : 0.000009s : 0.05% optimize.mutable_eliminate : 0.000514s : 2.79% optimize.opt_b.b_1 : 0.000161s : 0.87% optimize.opt_b.b_2 : 0.000008s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.10% optimize.overlap_param_gather : 0.000005s : 0.03% optimize.cconv : 0.000028s : 0.15% optimize.loop_unroll : 0.000433s : 2.35% optimize.opt_after_cconv.c_1 : 0.000028s : 0.15% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000017s : 0.09% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.09% optimize.tuple_transform.d_1 : 0.000038s : 0.21% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.03% optimize.partial_unused_args_eliminate : 0.000005s : 0.03% optimize.add_recomputation : 0.000048s : 0.26% optimize.cse_after_recomputation.cse : 0.000011s : 0.06% optimize.environ_conv : 0.000007s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.04% optimize.bias_add_comm_swap : 0.000005s : 0.03% optimize.label_micro_interleaved_index : 0.000007s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.03% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000004s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.03% optimize.assign_add_opt : 0.000003s : 0.02% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000004s : 0.02% optimize.full_micro_interleaved_order_control : 0.000004s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.03% optimize.comm_op_add_attrs : 0.000003s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000003s : 0.02% optimize.interleave_parallel_branches : 0.000004s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000007s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000005s : 0.03% optimize.overlap_grad_ring_attention : 0.000017s : 0.09% optimize.overlap_grad_flash_sp : 0.000022s : 0.12% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.02% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.10% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000474s : 2.57% validate : 0.000038s : 0.20% Time group info: ------[substitution.] 0.000177 25 1.08% : 0.000002s : 2: substitution.elim_not_effective 0.82% : 0.000001s : 2: substitution.fold_const_symbol 2.91% : 0.000005s : 3: substitution.graph_param_transform 81.54% : 0.000144s : 6: substitution.inline 2.51% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.64% : 0.000005s : 4: substitution.remove_not_recompute_node 1.98% : 0.000003s : 2: substitution.replace_old_param 6.53% : 0.000012s : 2: substitution.switch_simplify ------[type_inference.] 0.012796 2 92.47% : 0.011833s : 1: type_inference.infer 7.53% : 0.000963s : 1: type_inference.specialize ------[replace.] 0.000097 8 46.97% : 0.000045s : 6: replace.inline 53.03% : 0.000051s : 2: replace.switch_simplify ------[match.] 0.000150 8 93.41% : 0.000140s : 6: match.inline 6.59% : 0.000010s : 2: match.switch_simplify ------[predicate.] 0.000197 996 0.96% : 0.000002s : 11: predicate.accumulaten_eliminater 0.81% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 6: predicate.addn_check_dump 0.94% : 0.000002s : 11: predicate.addn_zero_filter 0.94% : 0.000002s : 11: predicate.adjust_all_reduce_mul_add 6.67% : 0.000013s : 17: predicate.arithmetic_simplify 0.83% : 0.000002s : 11: predicate.cast_eliminate 0.53% : 0.000001s : 6: predicate.check_bprop_eliminate 0.52% : 0.000001s : 6: predicate.compare_switch_simplify 0.15% : 0.000000s : 3: predicate.const_output_eliminate 0.59% : 0.000001s : 6: predicate.depend_value_elim 0.93% : 0.000002s : 11: predicate.dict_get_item_const_eliminator 1.11% : 0.000002s : 11: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 11: predicate.dict_set_item_eliminator 1.02% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 3: predicate.elim_not_effective 0.36% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 14: predicate.environ_get_depend_swap 1.77% : 0.000003s : 20: predicate.environ_get_eliminate 1.05% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.41% : 0.000003s : 17: predicate.exchange_switch_depend_value 2.42% : 0.000005s : 17: predicate.float_depend_g_call 0.51% : 0.000001s : 6: predicate.float_environ_get_switch 0.69% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 3: predicate.fold_const_symbol 0.60% : 0.000001s : 6: predicate.get_grad_eliminate 0.17% : 0.000000s : 3: predicate.graph_param_transform 0.57% : 0.000001s : 6: predicate.incorporate_call 0.49% : 0.000001s : 6: predicate.incorporate_call_switch 6.73% : 0.000013s : 46: predicate.inline 0.68% : 0.000001s : 6: predicate.inline_without_move 0.29% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.83% : 0.000002s : 6: predicate.less_batch_normalization 1.52% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.19% : 0.000004s : 28: predicate.load_eliminater 0.95% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.54% : 0.000005s : 27: predicate.loop_unroll_before_grad 1.64% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 6: predicate.merge_addn 0.42% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.99% : 0.000002s : 11: predicate.minmaximum_grad 0.89% : 0.000002s : 3: predicate.mutable_eliminate 0.30% : 0.000001s : 3: predicate.opt_reshape 0.29% : 0.000001s : 3: predicate.parallel_virtual_node 1.85% : 0.000004s : 17: predicate.partial_defer_inline 1.22% : 0.000002s : 14: predicate.partial_eliminate 0.89% : 0.000002s : 11: predicate.print_const_string_wrapper 0.50% : 0.000001s : 6: predicate.reduce_all_const_elim 1.33% : 0.000003s : 11: predicate.reduce_eliminate 2.18% : 0.000004s : 28: predicate.redundant_stop_gradient_eliminater 0.70% : 0.000001s : 6: predicate.remove_not_recompute_node 1.24% : 0.000002s : 17: predicate.replace_applicator 0.45% : 0.000001s : 6: predicate.replace_old_param 0.23% : 0.000000s : 3: predicate.reset_defer_inline 0.99% : 0.000002s : 11: predicate.reshape_eliminate 0.54% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 3: predicate.row_tensor_eliminate 0.63% : 0.000001s : 6: predicate.same_eliminate 0.53% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.74% : 0.000001s : 6: predicate.shard_identity_eliminate 0.57% : 0.000001s : 6: predicate.special_op_eliminate 0.86% : 0.000002s : 6: predicate.specialize_transform 0.67% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.63% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.27% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.59% : 0.000003s : 17: predicate.switch_defer_inline 2.14% : 0.000004s : 23: predicate.switch_layer_defer_inline 5.78% : 0.000011s : 57: predicate.switch_simplify 0.99% : 0.000002s : 11: predicate.tile_eliminate 0.95% : 0.000002s : 11: predicate.transpose_eliminate 1.57% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 2.52% : 0.000005s : 23: predicate.tuple_list_get_item_eliminator 1.38% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000004s : 23: predicate.tuple_list_set_item_eliminator 1.42% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.13% : 0.000004s : 28: predicate.updatestate_pure_node_eliminater 2.69% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.26% : 0.000001s : 3: predicate.value_based_eliminate 0.53% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.57% : 0.000001s : 6: predicate.virtual_output_eliminate 0.20% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000652 10 51.29% : 0.000335s : 2: func_graph_cloner_run.FuncGraphClonerGraph 48.71% : 0.000318s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.034763 192 0.02% : 0.000006s : 1: ForceFp32Comm 9.65% : 0.003354s : 1: add_attr 9.60% : 0.003338s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.15% : 0.000051s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.28% : 0.000098s : 1: auto_monad 0.07% : 0.000026s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.44% : 0.000499s : 1: bootstrap 0.09% : 0.000031s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.05% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.09% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000009s : 1: dataset_repeat_opt 0.06% : 0.000020s : 1: detach_backward 0.03% : 0.000010s : 1: environ_conv 0.09% : 0.000030s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000014s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.26% : 0.000439s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 1.49% : 0.000520s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.04% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000014s : 1: opt.transform.mutable_eliminate 3.54% : 0.001232s : 78: opt.transform.opt_a 0.08% : 0.000026s : 1: opt.transform.opt_after_cconv 0.07% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.28% : 0.000096s : 28: opt.transform.opt_b 0.12% : 0.000042s : 2: opt.transform.opt_trans_graph 0.10% : 0.000036s : 4: opt.transform.symbol_engine_opt 9.67% : 0.003363s : 1: opt_a 0.36% : 0.000124s : 1: opt_after_cconv 1.39% : 0.000484s : 1: opt_after_jit_grad 0.76% : 0.000265s : 1: opt_b 17.20% : 0.005979s : 1: optimize 0.06% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000012s : 1: order_py_execute_after_rewriter 0.07% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.06% : 0.000020s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000008s : 1: pipeline_split 0.14% : 0.000048s : 1: pre_auto_parallel 0.11% : 0.000040s : 1: py_interpret_to_execute 0.05% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.06% : 0.000020s : 1: remove_dup_value 1.37% : 0.000476s : 1: renormalize.infer 1.08% : 0.000376s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.13% : 0.000046s : 1: rewriter_after_opt_a 0.28% : 0.000096s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.03% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.28% : 0.000098s : 1: symbol_engine_optimizer 0.25% : 0.000086s : 1: tuple_transform 37.07% : 0.012887s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:04.337.335 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0213689, [21] [bootstrap]: 0.00046052 [type_inference]: 0.0125027 [event_method]: 1.742e-05 [auto_monad]: 8.585e-05 [graph_reusing]: 6.63e-06 [inline]: 2.58e-06 [add_attr]: 0.00307733, [1] [add_attr_with_inline]: 0.00306803, [1] [Cycle 1]: 6.252e-05, [2] [tag_attr]: 1.907e-05 [meta_addattr_fg_expand]: 5.54e-06 [parallel-infer-symbol]: 3.44001e-06 [pre_auto_parallel]: 3.154e-05 [insert-virtual-dataset]: 2.46e-06 [parallel-infer-symbol-second]: 8.30012e-07 [dataset_repeat_opt]: 2.17999e-06 [pipeline_split]: 1.54e-06 [optimize]: 0.00450524, [53] [py_interpret_to_execute]: 2.407e-05 [rewriter_before_opt_a]: 0.00010086 [opt_a]: 0.00254703, [2] [Cycle 1]: 0.00194163, [45] [expand_dump_flag]: 3.2e-06 [switch_simplify]: 0.00010516 [loop_unroll]: 2.489e-05 [a_1]: 0.00049846 [with_stream_mark]: 1.446e-05 [recompute_prepare]: 8.38001e-06 [updatestate_depend_eliminate]: 4.25e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 3.01999e-06 [parameter_eliminate]: 1.69998e-06 [a_2]: 8.006e-05 [accelerated_algorithm]: 6.50002e-06 [shard]: 2.58e-06 [meta_shard_fg_expand]: 2.11e-06 [shard_inline]: 6.68998e-06 [merge_send_recv]: 8.04002e-06 [auto_parallel]: 6.29999e-06 [parallel]: 1.791e-05 [flash_sp]: 8.54002e-06 [merge_comm]: 3.83001e-06 [allreduce_fusion]: 3.45e-06 [matmul_add_comm_reduction]: 9.34998e-06 [allreduce_slice_to_reducescatter]: 8.10018e-07 [virtual_shard_identity]: 8.52e-06 [virtual_dataset]: 6.12999e-06 [get_grad_eliminate_]: 6.46e-06 [virtual_output]: 5.80002e-06 [merge_forward]: 3.92998e-06 [cell_reuse_recompute_pass]: 1.14003e-06 [offload_activation]: 9.15999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.223e-05 [merge_recompute_call_nodes]: 1.67001e-06 [before_grad]: 1.102e-05 [set_forward_comm_id_for_comm_node_pass]: 3.68e-06 [meta_fg_expand]: 3.06999e-06 [flash_sp_send_recv_attached]: 2.56e-06 [receive_attached]: 1.99e-06 [after_resolve]: 9.27001e-06 [a_after_grad]: 8.72e-06 [renormalize]: 0.00068012 [add_forward_monad_depend]: 5.33002e-06 [auto_monad_grad]: 2.22999e-06 [auto_monad_eliminator]: 1.486e-05 [cse]: 2.958e-05 [a_3]: 4.607e-05 [Cycle 2]: 0.00059599, [45] [expand_dump_flag]: 8.70001e-07 [switch_simplify]: 7.46001e-06 [loop_unroll]: 5.99e-06 [a_1]: 0.00010389 [with_stream_mark]: 1.088e-05 [recompute_prepare]: 5.92001e-06 [updatestate_depend_eliminate]: 2.86e-06 [updatestate_assign_eliminate]: 2.26e-06 [updatestate_loads_eliminate]: 2.64999e-06 [parameter_eliminate]: 9.39996e-07 [a_2]: 6.974e-05 [accelerated_algorithm]: 5.76e-06 [shard]: 1.12999e-06 [meta_shard_fg_expand]: 1.27999e-06 [shard_inline]: 6.02999e-06 [merge_send_recv]: 4.90001e-06 [auto_parallel]: 5.40999e-06 [parallel]: 4.87e-06 [flash_sp]: 3.23e-06 [merge_comm]: 3.25e-06 [allreduce_fusion]: 3.13998e-06 [matmul_add_comm_reduction]: 6.12001e-06 [allreduce_slice_to_reducescatter]: 4.30009e-07 [virtual_shard_identity]: 6.21e-06 [virtual_dataset]: 5.36002e-06 [get_grad_eliminate_]: 5.40001e-06 [virtual_output]: 5.09e-06 [merge_forward]: 2.68e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 6.21e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.239e-05 [merge_recompute_call_nodes]: 9.10019e-07 [before_grad]: 8.90001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.73001e-06 [meta_fg_expand]: 1.99999e-06 [flash_sp_send_recv_attached]: 9.20001e-07 [receive_attached]: 1.02e-06 [after_resolve]: 8.85001e-06 [a_after_grad]: 8.15e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.14998e-06 [auto_monad_grad]: 9.00007e-07 [auto_monad_eliminator]: 6.61e-06 [cse]: 1.398e-05 [a_3]: 3.439e-05 [py_interpret_to_execute_after_opt_a]: 8.23001e-06 [slice_cell_reuse_recomputed_activation]: 1.91e-06 [rewriter_after_opt_a]: 3.426e-05 [convert_after_rewriter]: 6.26998e-06 [order_py_execute_after_rewriter]: 5.14e-06 [mutable_eliminate]: 0.00047821 [opt_b]: 0.00019394, [1] [Cycle 1]: 0.00018825, [7] [b_1]: 0.00011741 [b_2]: 7.51001e-06 [updatestate_depend_eliminate]: 5.27001e-06 [updatestate_assign_eliminate]: 2.41998e-06 [updatestate_loads_eliminate]: 2.11e-06 [renormalize]: 3.39991e-07 [cse]: 1.805e-05 [optimize_parallel_all_gather_comm]: 1.537e-05 [overlap_param_gather]: 2.29001e-06 [cconv]: 2.266e-05 [loop_unroll]: 0.00041554 [opt_after_cconv]: 9.5e-05, [1] [Cycle 1]: 8.977e-05, [7] [c_1]: 2.707e-05 [parameter_eliminate]: 2.54001e-06 [updatestate_depend_eliminate]: 4.99e-06 [updatestate_assign_eliminate]: 2.19999e-06 [updatestate_loads_eliminate]: 2.50002e-06 [cse]: 1.737e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.337e-05 [tuple_transform]: 8.236e-05, [1] [Cycle 1]: 7.739e-05, [4] [d_1]: 4.928e-05 [none_parameter_eliminate]: 1.88002e-06 [renormalize]: 2.70025e-07 [switch_simplify]: 6.29999e-06 [partial_unused_args_eliminate]: 1.85001e-06 [add_recomputation]: 4.515e-05 [cse_after_recomputation]: 2.226e-05, [1] [Cycle 1]: 1.8e-05, [1] [cse]: 1.234e-05 [environ_conv]: 4.90999e-06 [swap_dp_allreduce_reducescatter]: 4.59998e-06 [bias_add_comm_swap]: 2.74999e-06 [label_micro_interleaved_index]: 4.12e-06 [label_fine_grained_interleaved_index]: 2.63e-06 [merge_cast_opt]: 1.27999e-06 [slice_recompute_activation]: 1.99e-06 [micro_interleaved_order_control]: 2.37999e-06 [assign_add_opt]: 1.17e-06 [ForceFp32Comm]: 7.89994e-07 [remove_cast_before_assign_add]: 1.22e-06 [full_micro_interleaved_order_control]: 1.92001e-06 [reorder_send_recv_between_fp_bp]: 2.59999e-06 [comm_op_add_attrs]: 9.50007e-07 [add_comm_op_reuse_tag]: 1.05001e-06 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.00999e-06 [overlap_opt_shard_in_pipeline]: 1.13001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.69998e-06 [control_data_broadcast_order]: 1.167e-05 [grouped_pairwise_exchange_alltoall]: 1.55001e-06 [offloading_packed_experts]: 3.35e-06 [overlap_recompute_and_grad_model_parallel]: 5.14e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.63002e-06 [overlap_recompute_comm]: 2.22001e-06 [overlap_grad_ring_attention]: 4.22003e-06 [overlap_grad_flash_sp]: 1.669e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.26998e-06 [split_layernorm_comm]: 1.52001e-06 [handle_group_info]: 1.00001e-06 [symbol_engine_optimizer]: 8.497e-05, [1] [Cycle 1]: 8.104e-05, [6] [build]: 2.43e-06 [elim_shapecalc]: 9.46e-06 [elim_not_effective]: 1.143e-05 [opt_reshape]: 1.864e-05 [fold_const_symbol]: 1.06e-05 [renormalize]: 2.50002e-07 [detach_backward]: 1.71e-06 [pipeline_parallel_scheduler]: 1.84e-06 [auto_monad_reorder]: 1.576e-05 [get_jit_bprop_graph]: 1.20001e-06 [rewriter_after_jit_bprop_graph]: 3.90998e-06 [opt_after_jit_grad]: 0.00045816 [validate]: 3.421e-05 Sums bootstrap : 0.000461s : 2.65% type_inference : 0.012503s : 72.06% event_method : 0.000017s : 0.10% auto_monad : 0.000086s : 0.49% graph_reusing : 0.000007s : 0.04% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.11% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000032s : 0.18% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000024s : 0.14% optimize.rewriter_before_opt_a : 0.000101s : 0.58% optimize.opt_a.expand_dump_flag : 0.000004s : 0.02% optimize.opt_a.switch_simplify : 0.000113s : 0.65% optimize.opt_a.loop_unroll : 0.000031s : 0.18% optimize.opt_a.a_1 : 0.000602s : 3.47% optimize.opt_a.with_stream_mark : 0.000025s : 0.15% optimize.opt_a.recompute_prepare : 0.000014s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000150s : 0.86% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.07% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.02% optimize.opt_a.shard_inline : 0.000013s : 0.07% optimize.opt_a.merge_send_recv : 0.000013s : 0.07% optimize.opt_a.auto_parallel : 0.000012s : 0.07% optimize.opt_a.parallel : 0.000023s : 0.13% optimize.opt_a.flash_sp : 0.000012s : 0.07% optimize.opt_a.merge_comm : 0.000007s : 0.04% optimize.opt_a.allreduce_fusion : 0.000007s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.08% optimize.opt_a.virtual_dataset : 0.000011s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.07% optimize.opt_a.virtual_output : 0.000011s : 0.06% optimize.opt_a.merge_forward : 0.000007s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000015s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.14% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000020s : 0.11% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.04% optimize.opt_a.meta_fg_expand : 0.000005s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.02% optimize.opt_a.after_resolve : 0.000018s : 0.10% optimize.opt_a.a_after_grad : 0.000017s : 0.10% optimize.opt_a.renormalize : 0.000680s : 3.92% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.04% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.12% optimize.opt_a.cse : 0.000044s : 0.25% optimize.opt_a.a_3 : 0.000080s : 0.46% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000034s : 0.20% optimize.convert_after_rewriter : 0.000006s : 0.04% optimize.order_py_execute_after_rewriter : 0.000005s : 0.03% optimize.mutable_eliminate : 0.000478s : 2.76% optimize.opt_b.b_1 : 0.000117s : 0.68% optimize.opt_b.b_2 : 0.000008s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000015s : 0.09% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000023s : 0.13% optimize.loop_unroll : 0.000416s : 2.39% optimize.opt_after_cconv.c_1 : 0.000027s : 0.16% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000017s : 0.10% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.08% optimize.tuple_transform.d_1 : 0.000049s : 0.28% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.04% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000045s : 0.26% optimize.cse_after_recomputation.cse : 0.000012s : 0.07% optimize.environ_conv : 0.000005s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.03% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000012s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000003s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000017s : 0.10% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000002s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000011s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000019s : 0.11% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.06% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000016s : 0.09% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000458s : 2.64% validate : 0.000034s : 0.20% Time group info: ------[substitution.] 0.000147 25 1.17% : 0.000002s : 2: substitution.elim_not_effective 0.87% : 0.000001s : 2: substitution.fold_const_symbol 3.99% : 0.000006s : 3: substitution.graph_param_transform 80.66% : 0.000119s : 6: substitution.inline 2.25% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.99% : 0.000004s : 4: substitution.remove_not_recompute_node 2.02% : 0.000003s : 2: substitution.replace_old_param 6.05% : 0.000009s : 2: substitution.switch_simplify ------[type_inference.] 0.012448 2 92.27% : 0.011486s : 1: type_inference.infer 7.73% : 0.000962s : 1: type_inference.specialize ------[replace.] 0.000083 8 45.82% : 0.000038s : 6: replace.inline 54.18% : 0.000045s : 2: replace.switch_simplify ------[match.] 0.000123 8 93.88% : 0.000115s : 6: match.inline 6.12% : 0.000007s : 2: match.switch_simplify ------[predicate.] 0.000180 996 0.93% : 0.000002s : 11: predicate.accumulaten_eliminater 0.90% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 6: predicate.addn_check_dump 0.99% : 0.000002s : 11: predicate.addn_zero_filter 0.80% : 0.000001s : 11: predicate.adjust_all_reduce_mul_add 2.17% : 0.000004s : 17: predicate.arithmetic_simplify 0.93% : 0.000002s : 11: predicate.cast_eliminate 0.54% : 0.000001s : 6: predicate.check_bprop_eliminate 0.53% : 0.000001s : 6: predicate.compare_switch_simplify 0.16% : 0.000000s : 3: predicate.const_output_eliminate 0.58% : 0.000001s : 6: predicate.depend_value_elim 0.87% : 0.000002s : 11: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 11: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 11: predicate.dict_set_item_eliminator 0.95% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 3: predicate.elim_not_effective 0.33% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.07% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.07% : 0.000002s : 14: predicate.environ_get_depend_swap 1.58% : 0.000003s : 20: predicate.environ_get_eliminate 1.02% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.38% : 0.000002s : 17: predicate.exchange_switch_depend_value 2.48% : 0.000004s : 17: predicate.float_depend_g_call 0.48% : 0.000001s : 6: predicate.float_environ_get_switch 0.67% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 3: predicate.fold_const_symbol 0.63% : 0.000001s : 6: predicate.get_grad_eliminate 0.23% : 0.000000s : 3: predicate.graph_param_transform 0.65% : 0.000001s : 6: predicate.incorporate_call 0.46% : 0.000001s : 6: predicate.incorporate_call_switch 6.12% : 0.000011s : 46: predicate.inline 0.73% : 0.000001s : 6: predicate.inline_without_move 0.30% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.77% : 0.000001s : 6: predicate.less_batch_normalization 1.41% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.14% : 0.000004s : 28: predicate.load_eliminater 0.96% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.31% : 0.000004s : 27: predicate.loop_unroll_before_grad 1.57% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.48% : 0.000001s : 6: predicate.merge_addn 0.47% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.49% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.92% : 0.000002s : 11: predicate.minmaximum_grad 1.00% : 0.000002s : 3: predicate.mutable_eliminate 0.35% : 0.000001s : 3: predicate.opt_reshape 0.34% : 0.000001s : 3: predicate.parallel_virtual_node 1.70% : 0.000003s : 17: predicate.partial_defer_inline 1.28% : 0.000002s : 14: predicate.partial_eliminate 0.91% : 0.000002s : 11: predicate.print_const_string_wrapper 0.54% : 0.000001s : 6: predicate.reduce_all_const_elim 1.21% : 0.000002s : 11: predicate.reduce_eliminate 2.17% : 0.000004s : 28: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 6: predicate.remove_not_recompute_node 1.21% : 0.000002s : 17: predicate.replace_applicator 0.50% : 0.000001s : 6: predicate.replace_old_param 0.24% : 0.000000s : 3: predicate.reset_defer_inline 0.88% : 0.000002s : 11: predicate.reshape_eliminate 0.52% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.29% : 0.000001s : 3: predicate.row_tensor_eliminate 0.67% : 0.000001s : 6: predicate.same_eliminate 0.39% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.73% : 0.000001s : 6: predicate.shard_identity_eliminate 0.61% : 0.000001s : 6: predicate.special_op_eliminate 0.67% : 0.000001s : 6: predicate.specialize_transform 0.79% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.83% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.47% : 0.000003s : 17: predicate.switch_defer_inline 1.96% : 0.000004s : 23: predicate.switch_layer_defer_inline 5.73% : 0.000010s : 57: predicate.switch_simplify 1.01% : 0.000002s : 11: predicate.tile_eliminate 0.93% : 0.000002s : 11: predicate.transpose_eliminate 1.40% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 8.97% : 0.000016s : 23: predicate.tuple_list_get_item_eliminator 1.54% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.15% : 0.000004s : 23: predicate.tuple_list_set_item_eliminator 1.40% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.10% : 0.000004s : 28: predicate.updatestate_pure_node_eliminater 2.67% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 3: predicate.value_based_eliminate 0.56% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.58% : 0.000001s : 6: predicate.virtual_output_eliminate 0.26% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000615 10 52.85% : 0.000325s : 2: func_graph_cloner_run.FuncGraphClonerGraph 47.15% : 0.000290s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030832 192 0.01% : 0.000004s : 1: ForceFp32Comm 10.00% : 0.003083s : 1: add_attr 9.96% : 0.003072s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.16% : 0.000049s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.30% : 0.000091s : 1: auto_monad 0.06% : 0.000019s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.59% : 0.000490s : 1: bootstrap 0.08% : 0.000026s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.05% : 0.000015s : 1: control_data_broadcast_order 0.03% : 0.000009s : 1: convert_after_rewriter 0.08% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.08% : 0.000023s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000011s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.37% : 0.000424s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.58% : 0.000486s : 1: mutable_eliminate 0.02% : 0.000006s : 1: offloading_packed_experts 0.04% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000014s : 1: opt.transform.mutable_eliminate 3.41% : 0.001052s : 78: opt.transform.opt_a 0.08% : 0.000026s : 1: opt.transform.opt_after_cconv 0.07% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.30% : 0.000094s : 28: opt.transform.opt_b 0.17% : 0.000053s : 2: opt.transform.opt_trans_graph 0.15% : 0.000046s : 4: opt.transform.symbol_engine_opt 8.27% : 0.002550s : 1: opt_a 0.32% : 0.000098s : 1: opt_after_cconv 1.51% : 0.000467s : 1: opt_after_jit_grad 0.64% : 0.000197s : 1: opt_b 14.63% : 0.004510s : 1: optimize 0.06% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.06% : 0.000020s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.12% : 0.000036s : 1: pre_auto_parallel 0.09% : 0.000028s : 1: py_interpret_to_execute 0.04% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000017s : 1: remove_dup_value 1.12% : 0.000346s : 1: renormalize.infer 1.06% : 0.000325s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.12% : 0.000038s : 1: rewriter_after_opt_a 0.34% : 0.000106s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.28% : 0.000088s : 1: symbol_engine_optimizer 0.28% : 0.000085s : 1: tuple_transform 40.61% : 0.012521s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:04.536.663 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:04.536.925 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0226274, [21] [bootstrap]: 0.00043129 [type_inference]: 0.0122 [event_method]: 1.946e-05 [auto_monad]: 9.254e-05 [graph_reusing]: 6.74001e-06 [inline]: 2.21e-06 [add_attr]: 0.00329839, [1] [add_attr_with_inline]: 0.00328743, [1] [Cycle 1]: 7.635e-05, [2] [tag_attr]: 2.042e-05 [meta_addattr_fg_expand]: 5.96e-06 [parallel-infer-symbol]: 3.03e-06 [pre_auto_parallel]: 3.494e-05 [insert-virtual-dataset]: 2.37001e-06 [parallel-infer-symbol-second]: 7.00005e-07 [dataset_repeat_opt]: 1.70001e-06 [pipeline_split]: 1.61002e-06 [optimize]: 0.00537494, [53] [py_interpret_to_execute]: 2.894e-05 [rewriter_before_opt_a]: 8.247e-05 [opt_a]: 0.00307624, [2] [Cycle 1]: 0.0022928, [45] [expand_dump_flag]: 3.14999e-06 [switch_simplify]: 0.00010456 [loop_unroll]: 2.464e-05 [a_1]: 0.0005121 [with_stream_mark]: 1.614e-05 [recompute_prepare]: 8.79e-06 [updatestate_depend_eliminate]: 4.07e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 2.73e-06 [parameter_eliminate]: 2.42001e-06 [a_2]: 0.00010727 [accelerated_algorithm]: 7.08998e-06 [shard]: 1.98002e-06 [meta_shard_fg_expand]: 1.95001e-06 [shard_inline]: 6.05002e-06 [merge_send_recv]: 1.035e-05 [auto_parallel]: 6.98e-06 [parallel]: 1.915e-05 [flash_sp]: 8.02998e-06 [merge_comm]: 3.96001e-06 [allreduce_fusion]: 3.37002e-06 [matmul_add_comm_reduction]: 9.92999e-06 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 8.38999e-06 [virtual_dataset]: 6.33998e-06 [get_grad_eliminate_]: 5.80002e-06 [virtual_output]: 5.74999e-06 [merge_forward]: 3.63e-06 [cell_reuse_recompute_pass]: 1.15999e-06 [offload_activation]: 1.087e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.434e-05 [merge_recompute_call_nodes]: 1.83002e-06 [before_grad]: 1.072e-05 [set_forward_comm_id_for_comm_node_pass]: 3.59002e-06 [meta_fg_expand]: 3.16001e-06 [flash_sp_send_recv_attached]: 2.45002e-06 [receive_attached]: 3.04001e-06 [after_resolve]: 8.57e-06 [a_after_grad]: 1.001e-05 [renormalize]: 0.00083076 [add_forward_monad_depend]: 5.41998e-06 [auto_monad_grad]: 2.63998e-06 [auto_monad_eliminator]: 1.495e-05 [cse]: 3.078e-05 [a_3]: 5.979e-05 [Cycle 2]: 0.00077044, [45] [expand_dump_flag]: 1.38002e-06 [switch_simplify]: 7.46999e-06 [loop_unroll]: 6.05002e-06 [a_1]: 0.00010548 [with_stream_mark]: 1.171e-05 [recompute_prepare]: 5.96998e-06 [updatestate_depend_eliminate]: 3.40998e-06 [updatestate_assign_eliminate]: 2.43002e-06 [updatestate_loads_eliminate]: 2.39999e-06 [parameter_eliminate]: 1.19e-06 [a_2]: 9.648e-05 [accelerated_algorithm]: 5.99999e-06 [shard]: 1.50001e-06 [meta_shard_fg_expand]: 1.45001e-06 [shard_inline]: 6.24001e-06 [merge_send_recv]: 4.45999e-06 [auto_parallel]: 5.57999e-06 [parallel]: 5.19e-06 [flash_sp]: 3.6e-06 [merge_comm]: 3.28998e-06 [allreduce_fusion]: 3.48e-06 [matmul_add_comm_reduction]: 5.64e-06 [allreduce_slice_to_reducescatter]: 4.40021e-07 [virtual_shard_identity]: 6.43e-06 [virtual_dataset]: 5.62001e-06 [get_grad_eliminate_]: 5.32001e-06 [virtual_output]: 5.59998e-06 [merge_forward]: 2.79001e-06 [cell_reuse_recompute_pass]: 1.71e-06 [offload_activation]: 6.16998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.452e-05 [merge_recompute_call_nodes]: 8.80013e-07 [before_grad]: 9.54999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.76999e-06 [meta_fg_expand]: 2.11e-06 [flash_sp_send_recv_attached]: 8.59989e-07 [receive_attached]: 1.15999e-06 [after_resolve]: 9.14e-06 [a_after_grad]: 8.69003e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.15999e-06 [auto_monad_grad]: 1.03001e-06 [auto_monad_eliminator]: 6.61e-06 [cse]: 1.399e-05 [a_3]: 4.717e-05 [py_interpret_to_execute_after_opt_a]: 1.205e-05 [slice_cell_reuse_recomputed_activation]: 5.24e-06 [rewriter_after_opt_a]: 3.961e-05 [convert_after_rewriter]: 9.96e-06 [order_py_execute_after_rewriter]: 8.37e-06 [mutable_eliminate]: 0.00052802 [opt_b]: 0.00026405, [1] [Cycle 1]: 0.00025474, [7] [b_1]: 0.00016092 [b_2]: 7.26001e-06 [updatestate_depend_eliminate]: 5.67999e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 2.19001e-06 [renormalize]: 4.50003e-07 [cse]: 1.97e-05 [optimize_parallel_all_gather_comm]: 1.915e-05 [overlap_param_gather]: 5.07999e-06 [cconv]: 2.751e-05 [loop_unroll]: 0.00044549 [opt_after_cconv]: 0.00012445, [1] [Cycle 1]: 0.00011515, [7] [c_1]: 2.739e-05 [parameter_eliminate]: 2.88e-06 [updatestate_depend_eliminate]: 6.48998e-06 [updatestate_assign_eliminate]: 2.78998e-06 [updatestate_loads_eliminate]: 2.48e-06 [cse]: 1.816e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.747e-05 [tuple_transform]: 8.503e-05, [1] [Cycle 1]: 7.839e-05, [4] [d_1]: 3.959e-05 [none_parameter_eliminate]: 1.77999e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 7.32002e-06 [partial_unused_args_eliminate]: 4.27e-06 [add_recomputation]: 4.903e-05 [cse_after_recomputation]: 2.795e-05, [1] [Cycle 1]: 2.167e-05, [1] [cse]: 1.179e-05 [environ_conv]: 7.48999e-06 [swap_dp_allreduce_reducescatter]: 8.32e-06 [bias_add_comm_swap]: 4.75001e-06 [label_micro_interleaved_index]: 6.21e-06 [label_fine_grained_interleaved_index]: 5.02e-06 [merge_cast_opt]: 4e-06 [slice_recompute_activation]: 4.12e-06 [micro_interleaved_order_control]: 4.43001e-06 [assign_add_opt]: 3.90998e-06 [ForceFp32Comm]: 2.96001e-06 [remove_cast_before_assign_add]: 3.29001e-06 [full_micro_interleaved_order_control]: 4.62998e-06 [reorder_send_recv_between_fp_bp]: 5.30001e-06 [comm_op_add_attrs]: 3.38999e-06 [add_comm_op_reuse_tag]: 3.26999e-06 [interleave_split_concat_branches]: 3.38999e-06 [interleave_parallel_branches]: 3.38e-06 [overlap_opt_shard_in_pipeline]: 3.3e-06 [overlap_opt_shard_grad_in_pipeline]: 4.84e-06 [control_data_broadcast_order]: 1.538e-05 [grouped_pairwise_exchange_alltoall]: 3.93999e-06 [offloading_packed_experts]: 5.92001e-06 [overlap_recompute_and_grad_model_parallel]: 7.31999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.38999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.61001e-06 [overlap_recompute_comm]: 5.09e-06 [overlap_grad_ring_attention]: 6.74999e-06 [overlap_grad_flash_sp]: 2.096e-05 [begin_end_overlap_inline]: 3.49001e-06 [split_matmul_comm_elemetwise]: 4.23001e-06 [split_layernorm_comm]: 4.14002e-06 [handle_group_info]: 3.83001e-06 [symbol_engine_optimizer]: 0.00010899, [1] [Cycle 1]: 0.00010229, [6] [build]: 2.91999e-06 [elim_shapecalc]: 9.37999e-06 [elim_not_effective]: 1.291e-05 [opt_reshape]: 6.40002e-06 [fold_const_symbol]: 2.497e-05 [renormalize]: 2.19996e-07 [detach_backward]: 3.79002e-06 [pipeline_parallel_scheduler]: 1.76e-06 [auto_monad_reorder]: 1.996e-05 [get_jit_bprop_graph]: 1.60001e-06 [rewriter_after_jit_bprop_graph]: 4.55999e-06 [opt_after_jit_grad]: 0.00049091 [validate]: 3.712e-05 Sums bootstrap : 0.000431s : 2.45% type_inference : 0.012200s : 69.42% event_method : 0.000019s : 0.11% auto_monad : 0.000093s : 0.53% graph_reusing : 0.000007s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.12% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000035s : 0.20% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.16% optimize.rewriter_before_opt_a : 0.000082s : 0.47% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000112s : 0.64% optimize.opt_a.loop_unroll : 0.000031s : 0.17% optimize.opt_a.a_1 : 0.000618s : 3.51% optimize.opt_a.with_stream_mark : 0.000028s : 0.16% optimize.opt_a.recompute_prepare : 0.000015s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.03% optimize.opt_a.parameter_eliminate : 0.000004s : 0.02% optimize.opt_a.a_2 : 0.000204s : 1.16% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.07% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.02% optimize.opt_a.shard_inline : 0.000012s : 0.07% optimize.opt_a.merge_send_recv : 0.000015s : 0.08% optimize.opt_a.auto_parallel : 0.000013s : 0.07% optimize.opt_a.parallel : 0.000024s : 0.14% optimize.opt_a.flash_sp : 0.000012s : 0.07% optimize.opt_a.merge_comm : 0.000007s : 0.04% optimize.opt_a.allreduce_fusion : 0.000007s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.08% optimize.opt_a.virtual_dataset : 0.000012s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.06% optimize.opt_a.virtual_output : 0.000011s : 0.06% optimize.opt_a.merge_forward : 0.000006s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.10% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.16% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.12% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.04% optimize.opt_a.meta_fg_expand : 0.000005s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000018s : 0.10% optimize.opt_a.a_after_grad : 0.000019s : 0.11% optimize.opt_a.renormalize : 0.000831s : 4.73% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.12% optimize.opt_a.cse : 0.000045s : 0.25% optimize.opt_a.a_3 : 0.000107s : 0.61% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.03% optimize.rewriter_after_opt_a : 0.000040s : 0.23% optimize.convert_after_rewriter : 0.000010s : 0.06% optimize.order_py_execute_after_rewriter : 0.000008s : 0.05% optimize.mutable_eliminate : 0.000528s : 3.00% optimize.opt_b.b_1 : 0.000161s : 0.92% optimize.opt_b.b_2 : 0.000007s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.11% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.11% optimize.overlap_param_gather : 0.000005s : 0.03% optimize.cconv : 0.000028s : 0.16% optimize.loop_unroll : 0.000445s : 2.53% optimize.opt_after_cconv.c_1 : 0.000027s : 0.16% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000018s : 0.10% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.10% optimize.tuple_transform.d_1 : 0.000040s : 0.23% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.04% optimize.partial_unused_args_eliminate : 0.000004s : 0.02% optimize.add_recomputation : 0.000049s : 0.28% optimize.cse_after_recomputation.cse : 0.000012s : 0.07% optimize.environ_conv : 0.000007s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.05% optimize.bias_add_comm_swap : 0.000005s : 0.03% optimize.label_micro_interleaved_index : 0.000006s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.03% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000004s : 0.02% optimize.micro_interleaved_order_control : 0.000004s : 0.03% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000003s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.03% optimize.comm_op_add_attrs : 0.000003s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000003s : 0.02% optimize.interleave_parallel_branches : 0.000003s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.03% optimize.control_data_broadcast_order : 0.000015s : 0.09% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000006s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000005s : 0.03% optimize.overlap_grad_ring_attention : 0.000007s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.12% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000004s : 0.02% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000025s : 0.14% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.11% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.03% opt_after_jit_grad : 0.000491s : 2.79% validate : 0.000037s : 0.21% Time group info: ------[substitution.] 0.000170 25 1.14% : 0.000002s : 2: substitution.elim_not_effective 8.47% : 0.000014s : 2: substitution.fold_const_symbol 3.56% : 0.000006s : 3: substitution.graph_param_transform 74.84% : 0.000127s : 6: substitution.inline 2.18% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.75% : 0.000005s : 4: substitution.remove_not_recompute_node 1.55% : 0.000003s : 2: substitution.replace_old_param 5.51% : 0.000009s : 2: substitution.switch_simplify ------[type_inference.] 0.012140 2 92.03% : 0.011173s : 1: type_inference.infer 7.97% : 0.000967s : 1: type_inference.specialize ------[replace.] 0.000085 8 45.93% : 0.000039s : 6: replace.inline 54.07% : 0.000046s : 2: replace.switch_simplify ------[match.] 0.000132 8 93.99% : 0.000124s : 6: match.inline 6.01% : 0.000008s : 2: match.switch_simplify ------[predicate.] 0.000173 996 0.93% : 0.000002s : 11: predicate.accumulaten_eliminater 0.80% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 6: predicate.addn_check_dump 0.96% : 0.000002s : 11: predicate.addn_zero_filter 0.85% : 0.000001s : 11: predicate.adjust_all_reduce_mul_add 2.24% : 0.000004s : 17: predicate.arithmetic_simplify 1.11% : 0.000002s : 11: predicate.cast_eliminate 0.62% : 0.000001s : 6: predicate.check_bprop_eliminate 0.52% : 0.000001s : 6: predicate.compare_switch_simplify 0.17% : 0.000000s : 3: predicate.const_output_eliminate 0.59% : 0.000001s : 6: predicate.depend_value_elim 1.04% : 0.000002s : 11: predicate.dict_get_item_const_eliminator 1.12% : 0.000002s : 11: predicate.dict_get_item_eliminator 1.08% : 0.000002s : 11: predicate.dict_set_item_eliminator 1.08% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 3: predicate.elim_not_effective 0.42% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.08% : 0.000002s : 14: predicate.environ_get_depend_swap 1.76% : 0.000003s : 20: predicate.environ_get_eliminate 1.14% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.46% : 0.000003s : 17: predicate.exchange_switch_depend_value 2.31% : 0.000004s : 17: predicate.float_depend_g_call 0.51% : 0.000001s : 6: predicate.float_environ_get_switch 0.74% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 3: predicate.fold_const_symbol 0.69% : 0.000001s : 6: predicate.get_grad_eliminate 0.21% : 0.000000s : 3: predicate.graph_param_transform 0.61% : 0.000001s : 6: predicate.incorporate_call 0.51% : 0.000001s : 6: predicate.incorporate_call_switch 6.67% : 0.000012s : 46: predicate.inline 0.81% : 0.000001s : 6: predicate.inline_without_move 0.31% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.96% : 0.000002s : 6: predicate.less_batch_normalization 1.61% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.25% : 0.000004s : 28: predicate.load_eliminater 1.03% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.36% : 0.000004s : 27: predicate.loop_unroll_before_grad 1.57% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 6: predicate.merge_addn 0.54% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.84% : 0.000001s : 11: predicate.minmaximum_grad 1.23% : 0.000002s : 3: predicate.mutable_eliminate 0.38% : 0.000001s : 3: predicate.opt_reshape 0.34% : 0.000001s : 3: predicate.parallel_virtual_node 1.85% : 0.000003s : 17: predicate.partial_defer_inline 1.41% : 0.000002s : 14: predicate.partial_eliminate 0.89% : 0.000002s : 11: predicate.print_const_string_wrapper 0.54% : 0.000001s : 6: predicate.reduce_all_const_elim 1.19% : 0.000002s : 11: predicate.reduce_eliminate 2.28% : 0.000004s : 28: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 6: predicate.remove_not_recompute_node 1.18% : 0.000002s : 17: predicate.replace_applicator 0.52% : 0.000001s : 6: predicate.replace_old_param 0.28% : 0.000000s : 3: predicate.reset_defer_inline 1.23% : 0.000002s : 11: predicate.reshape_eliminate 0.73% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 3: predicate.row_tensor_eliminate 0.89% : 0.000002s : 6: predicate.same_eliminate 0.37% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.80% : 0.000001s : 6: predicate.shard_identity_eliminate 0.89% : 0.000002s : 6: predicate.special_op_eliminate 0.74% : 0.000001s : 6: predicate.specialize_transform 0.88% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.60% : 0.000003s : 17: predicate.switch_defer_inline 2.03% : 0.000004s : 23: predicate.switch_layer_defer_inline 5.87% : 0.000010s : 57: predicate.switch_simplify 0.88% : 0.000002s : 11: predicate.tile_eliminate 0.94% : 0.000002s : 11: predicate.transpose_eliminate 1.56% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.72% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 2.91% : 0.000005s : 23: predicate.tuple_list_get_item_eliminator 1.40% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000004s : 23: predicate.tuple_list_set_item_eliminator 1.50% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.20% : 0.000004s : 28: predicate.updatestate_pure_node_eliminater 3.01% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 3: predicate.value_based_eliminate 0.62% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.71% : 0.000001s : 6: predicate.virtual_output_eliminate 0.31% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000646 10 49.69% : 0.000321s : 2: func_graph_cloner_run.FuncGraphClonerGraph 50.31% : 0.000325s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.033323 192 0.02% : 0.000006s : 1: ForceFp32Comm 9.93% : 0.003309s : 1: add_attr 9.88% : 0.003292s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.16% : 0.000053s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.31% : 0.000102s : 1: auto_monad 0.08% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000007s : 1: bias_add_comm_swap 1.43% : 0.000476s : 1: bootstrap 0.09% : 0.000031s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.05% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.09% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000020s : 1: detach_backward 0.03% : 0.000010s : 1: environ_conv 0.09% : 0.000031s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.36% : 0.000452s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 1.61% : 0.000535s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.04% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000015s : 1: opt.transform.mutable_eliminate 3.22% : 0.001074s : 78: opt.transform.opt_a 0.08% : 0.000026s : 1: opt.transform.opt_after_cconv 0.07% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.29% : 0.000097s : 28: opt.transform.opt_b 0.13% : 0.000045s : 2: opt.transform.opt_trans_graph 0.15% : 0.000050s : 4: opt.transform.symbol_engine_opt 9.24% : 0.003079s : 1: opt_a 0.38% : 0.000128s : 1: opt_after_cconv 1.51% : 0.000502s : 1: opt_after_jit_grad 0.80% : 0.000267s : 1: opt_b 17.18% : 0.005726s : 1: optimize 0.07% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000011s : 1: order_py_execute_after_rewriter 0.07% : 0.000024s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000009s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.13% : 0.000043s : 1: pre_auto_parallel 0.10% : 0.000033s : 1: py_interpret_to_execute 0.05% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.06% : 0.000021s : 1: remove_dup_value 1.32% : 0.000438s : 1: renormalize.infer 1.16% : 0.000385s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.13% : 0.000043s : 1: rewriter_after_opt_a 0.26% : 0.000086s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.03% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000112s : 1: symbol_engine_optimizer 0.26% : 0.000088s : 1: tuple_transform 36.72% : 0.012235s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:04.734.207 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0227291, [21] [bootstrap]: 0.00043579 [type_inference]: 0.0130157 [event_method]: 1.874e-05 [auto_monad]: 8.56e-05 [graph_reusing]: 6.91001e-06 [inline]: 2.61e-06 [add_attr]: 0.00334531, [1] [add_attr_with_inline]: 0.00333503, [1] [Cycle 1]: 7.118e-05, [2] [tag_attr]: 2.161e-05 [meta_addattr_fg_expand]: 6.56e-06 [parallel-infer-symbol]: 5.128e-05 [pre_auto_parallel]: 3.733e-05 [insert-virtual-dataset]: 2.36e-06 [parallel-infer-symbol-second]: 8.09989e-07 [dataset_repeat_opt]: 1.84e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00501218, [53] [py_interpret_to_execute]: 2.678e-05 [rewriter_before_opt_a]: 7.99e-05 [opt_a]: 0.00283402, [2] [Cycle 1]: 0.00220813, [45] [expand_dump_flag]: 3.7e-06 [switch_simplify]: 0.00011401 [loop_unroll]: 2.627e-05 [a_1]: 0.00051591 [with_stream_mark]: 1.7e-05 [recompute_prepare]: 8.69e-06 [updatestate_depend_eliminate]: 4.44998e-06 [updatestate_assign_eliminate]: 3.3e-06 [updatestate_loads_eliminate]: 3.08e-06 [parameter_eliminate]: 2.47001e-06 [a_2]: 8.127e-05 [accelerated_algorithm]: 7.16999e-06 [shard]: 2.16e-06 [meta_shard_fg_expand]: 2.32999e-06 [shard_inline]: 6.22001e-06 [merge_send_recv]: 8.77e-06 [auto_parallel]: 7.03e-06 [parallel]: 1.947e-05 [flash_sp]: 8.33001e-06 [merge_comm]: 5.11002e-06 [allreduce_fusion]: 3.33998e-06 [matmul_add_comm_reduction]: 9.31e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 7.43999e-06 [virtual_dataset]: 6.31998e-06 [get_grad_eliminate_]: 5.91e-06 [virtual_output]: 6.61999e-06 [merge_forward]: 4.13001e-06 [cell_reuse_recompute_pass]: 1.40001e-06 [offload_activation]: 1.039e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.244e-05 [merge_recompute_call_nodes]: 1.49998e-06 [before_grad]: 1.128e-05 [set_forward_comm_id_for_comm_node_pass]: 3.65e-06 [meta_fg_expand]: 3.13e-06 [flash_sp_send_recv_attached]: 2.64001e-06 [receive_attached]: 2.21e-06 [after_resolve]: 1.008e-05 [a_after_grad]: 9.88002e-06 [renormalize]: 0.00089355 [add_forward_monad_depend]: 6.04001e-06 [auto_monad_grad]: 2.36998e-06 [auto_monad_eliminator]: 1.575e-05 [cse]: 3.118e-05 [a_3]: 4.833e-05 [Cycle 2]: 0.00061565, [45] [expand_dump_flag]: 1.64e-06 [switch_simplify]: 7.68999e-06 [loop_unroll]: 6.25002e-06 [a_1]: 0.00010856 [with_stream_mark]: 1.162e-05 [recompute_prepare]: 6.24999e-06 [updatestate_depend_eliminate]: 2.99001e-06 [updatestate_assign_eliminate]: 2.44001e-06 [updatestate_loads_eliminate]: 2.68998e-06 [parameter_eliminate]: 1.02e-06 [a_2]: 7.093e-05 [accelerated_algorithm]: 5.87001e-06 [shard]: 1.15999e-06 [meta_shard_fg_expand]: 1.42e-06 [shard_inline]: 6.17001e-06 [merge_send_recv]: 4.87998e-06 [auto_parallel]: 5.60001e-06 [parallel]: 5.08002e-06 [flash_sp]: 3.45998e-06 [merge_comm]: 3.16001e-06 [allreduce_fusion]: 3.04001e-06 [matmul_add_comm_reduction]: 5.82001e-06 [allreduce_slice_to_reducescatter]: 4.60015e-07 [virtual_shard_identity]: 6.60002e-06 [virtual_dataset]: 5.80002e-06 [get_grad_eliminate_]: 5.54e-06 [virtual_output]: 5.54e-06 [merge_forward]: 3.01999e-06 [cell_reuse_recompute_pass]: 1.70001e-06 [offload_activation]: 6.23e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.389e-05 [merge_recompute_call_nodes]: 9.30013e-07 [before_grad]: 9.73002e-06 [set_forward_comm_id_for_comm_node_pass]: 4.75999e-06 [meta_fg_expand]: 2.20002e-06 [flash_sp_send_recv_attached]: 8.40024e-07 [receive_attached]: 1.04998e-06 [after_resolve]: 9.05999e-06 [a_after_grad]: 8.55001e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.22999e-06 [auto_monad_grad]: 1.13001e-06 [auto_monad_eliminator]: 6.64001e-06 [cse]: 1.486e-05 [a_3]: 3.513e-05 [py_interpret_to_execute_after_opt_a]: 9.14e-06 [slice_cell_reuse_recomputed_activation]: 2.52001e-06 [rewriter_after_opt_a]: 3.768e-05 [convert_after_rewriter]: 6.77002e-06 [order_py_execute_after_rewriter]: 5.41998e-06 [mutable_eliminate]: 0.00060631 [opt_b]: 0.00020298, [1] [Cycle 1]: 0.00019646, [7] [b_1]: 0.00012105 [b_2]: 7.45e-06 [updatestate_depend_eliminate]: 6.06e-06 [updatestate_assign_eliminate]: 2.63e-06 [updatestate_loads_eliminate]: 2.63e-06 [renormalize]: 8.39995e-07 [cse]: 2.07e-05 [optimize_parallel_all_gather_comm]: 4.134e-05 [overlap_param_gather]: 2.36e-06 [cconv]: 2.899e-05 [loop_unroll]: 0.00044914 [opt_after_cconv]: 0.00010275, [1] [Cycle 1]: 9.675e-05, [7] [c_1]: 2.913e-05 [parameter_eliminate]: 3.03e-06 [updatestate_depend_eliminate]: 5.17e-06 [updatestate_assign_eliminate]: 2.76e-06 [updatestate_loads_eliminate]: 2.31998e-06 [cse]: 1.962e-05 [renormalize]: 5.00004e-07 [remove_dup_value]: 1.511e-05 [tuple_transform]: 7.273e-05, [1] [Cycle 1]: 6.836e-05, [4] [d_1]: 3.947e-05 [none_parameter_eliminate]: 1.69998e-06 [renormalize]: 4.69998e-07 [switch_simplify]: 6.83e-06 [partial_unused_args_eliminate]: 1.74e-06 [add_recomputation]: 4.716e-05 [cse_after_recomputation]: 2.167e-05, [1] [Cycle 1]: 1.742e-05, [1] [cse]: 1.18e-05 [environ_conv]: 5.46e-06 [swap_dp_allreduce_reducescatter]: 4.82e-06 [bias_add_comm_swap]: 2.89999e-06 [label_micro_interleaved_index]: 4.58999e-06 [label_fine_grained_interleaved_index]: 2.76e-06 [merge_cast_opt]: 1.32e-06 [slice_recompute_activation]: 2.27999e-06 [micro_interleaved_order_control]: 2.38998e-06 [assign_add_opt]: 1.17999e-06 [ForceFp32Comm]: 8.09989e-07 [remove_cast_before_assign_add]: 1.23002e-06 [full_micro_interleaved_order_control]: 2.43e-06 [reorder_send_recv_between_fp_bp]: 2.86e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 1.28002e-06 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.14e-06 [overlap_opt_shard_in_pipeline]: 1.30001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.03997e-06 [control_data_broadcast_order]: 1.311e-05 [grouped_pairwise_exchange_alltoall]: 1.87999e-06 [offloading_packed_experts]: 3.76999e-06 [overlap_recompute_and_grad_model_parallel]: 4.62e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.37e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32e-06 [overlap_recompute_comm]: 2.69001e-06 [overlap_grad_ring_attention]: 4.4e-06 [overlap_grad_flash_sp]: 2.029e-05 [begin_end_overlap_inline]: 6.10016e-07 [split_matmul_comm_elemetwise]: 2.12001e-06 [split_layernorm_comm]: 2.16e-06 [handle_group_info]: 9.70002e-07 [symbol_engine_optimizer]: 9.354e-05, [1] [Cycle 1]: 8.873e-05, [6] [build]: 2.99999e-06 [elim_shapecalc]: 2.505e-05 [elim_not_effective]: 1.347e-05 [opt_reshape]: 7.13e-06 [fold_const_symbol]: 1.051e-05 [renormalize]: 2.80008e-07 [detach_backward]: 2.03002e-06 [pipeline_parallel_scheduler]: 1.71e-06 [auto_monad_reorder]: 1.59e-05 [get_jit_bprop_graph]: 2.07999e-06 [rewriter_after_jit_bprop_graph]: 4.29002e-06 [opt_after_jit_grad]: 0.00048452 [validate]: 4.32e-05 Sums bootstrap : 0.000436s : 2.37% type_inference : 0.013016s : 70.67% event_method : 0.000019s : 0.10% auto_monad : 0.000086s : 0.46% graph_reusing : 0.000007s : 0.04% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.12% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.04% parallel-infer-symbol : 0.000051s : 0.28% pre_auto_parallel : 0.000037s : 0.20% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000027s : 0.15% optimize.rewriter_before_opt_a : 0.000080s : 0.43% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000122s : 0.66% optimize.opt_a.loop_unroll : 0.000033s : 0.18% optimize.opt_a.a_1 : 0.000624s : 3.39% optimize.opt_a.with_stream_mark : 0.000029s : 0.16% optimize.opt_a.recompute_prepare : 0.000015s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000152s : 0.83% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.07% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000012s : 0.07% optimize.opt_a.merge_send_recv : 0.000014s : 0.07% optimize.opt_a.auto_parallel : 0.000013s : 0.07% optimize.opt_a.parallel : 0.000025s : 0.13% optimize.opt_a.flash_sp : 0.000012s : 0.06% optimize.opt_a.merge_comm : 0.000008s : 0.04% optimize.opt_a.allreduce_fusion : 0.000006s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.08% optimize.opt_a.virtual_dataset : 0.000012s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.06% optimize.opt_a.virtual_output : 0.000012s : 0.07% optimize.opt_a.merge_forward : 0.000007s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.14% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000021s : 0.11% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.05% optimize.opt_a.meta_fg_expand : 0.000005s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.02% optimize.opt_a.after_resolve : 0.000019s : 0.10% optimize.opt_a.a_after_grad : 0.000018s : 0.10% optimize.opt_a.renormalize : 0.000894s : 4.85% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.04% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.12% optimize.opt_a.cse : 0.000046s : 0.25% optimize.opt_a.a_3 : 0.000083s : 0.45% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000038s : 0.20% optimize.convert_after_rewriter : 0.000007s : 0.04% optimize.order_py_execute_after_rewriter : 0.000005s : 0.03% optimize.mutable_eliminate : 0.000606s : 3.29% optimize.opt_b.b_1 : 0.000121s : 0.66% optimize.opt_b.b_2 : 0.000007s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.11% optimize.optimize_parallel_all_gather_comm : 0.000041s : 0.22% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000029s : 0.16% optimize.loop_unroll : 0.000449s : 2.44% optimize.opt_after_cconv.c_1 : 0.000029s : 0.16% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000020s : 0.11% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.08% optimize.tuple_transform.d_1 : 0.000039s : 0.21% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.04% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000047s : 0.26% optimize.cse_after_recomputation.cse : 0.000012s : 0.06% optimize.environ_conv : 0.000005s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.03% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000013s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000020s : 0.11% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000025s : 0.14% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.06% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000016s : 0.09% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000485s : 2.63% validate : 0.000043s : 0.23% Time group info: ------[substitution.] 0.000162 25 1.31% : 0.000002s : 2: substitution.elim_not_effective 0.93% : 0.000002s : 2: substitution.fold_const_symbol 3.81% : 0.000006s : 3: substitution.graph_param_transform 79.68% : 0.000129s : 6: substitution.inline 2.91% : 0.000005s : 4: substitution.j_node_and_user_rematch 3.12% : 0.000005s : 4: substitution.remove_not_recompute_node 2.08% : 0.000003s : 2: substitution.replace_old_param 6.16% : 0.000010s : 2: substitution.switch_simplify ------[type_inference.] 0.012956 2 91.65% : 0.011875s : 1: type_inference.infer 8.35% : 0.001082s : 1: type_inference.specialize ------[replace.] 0.000089 8 43.80% : 0.000039s : 6: replace.inline 56.20% : 0.000050s : 2: replace.switch_simplify ------[match.] 0.000134 8 93.71% : 0.000125s : 6: match.inline 6.29% : 0.000008s : 2: match.switch_simplify ------[predicate.] 0.000175 996 0.93% : 0.000002s : 11: predicate.accumulaten_eliminater 0.84% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 6: predicate.addn_check_dump 1.04% : 0.000002s : 11: predicate.addn_zero_filter 0.83% : 0.000001s : 11: predicate.adjust_all_reduce_mul_add 2.40% : 0.000004s : 17: predicate.arithmetic_simplify 1.00% : 0.000002s : 11: predicate.cast_eliminate 0.56% : 0.000001s : 6: predicate.check_bprop_eliminate 0.52% : 0.000001s : 6: predicate.compare_switch_simplify 0.17% : 0.000000s : 3: predicate.const_output_eliminate 0.60% : 0.000001s : 6: predicate.depend_value_elim 0.95% : 0.000002s : 11: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 11: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 11: predicate.dict_set_item_eliminator 1.37% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 3: predicate.elim_not_effective 0.41% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.07% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.09% : 0.000002s : 14: predicate.environ_get_depend_swap 1.71% : 0.000003s : 20: predicate.environ_get_eliminate 1.20% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.42% : 0.000002s : 17: predicate.exchange_switch_depend_value 2.52% : 0.000004s : 17: predicate.float_depend_g_call 0.46% : 0.000001s : 6: predicate.float_environ_get_switch 0.77% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 3: predicate.fold_const_symbol 0.60% : 0.000001s : 6: predicate.get_grad_eliminate 0.21% : 0.000000s : 3: predicate.graph_param_transform 0.61% : 0.000001s : 6: predicate.incorporate_call 0.50% : 0.000001s : 6: predicate.incorporate_call_switch 6.25% : 0.000011s : 46: predicate.inline 0.85% : 0.000001s : 6: predicate.inline_without_move 0.30% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.85% : 0.000001s : 6: predicate.less_batch_normalization 1.55% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.29% : 0.000004s : 28: predicate.load_eliminater 0.95% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.32% : 0.000004s : 27: predicate.loop_unroll_before_grad 1.57% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 6: predicate.merge_addn 0.55% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.83% : 0.000001s : 11: predicate.minmaximum_grad 1.61% : 0.000003s : 3: predicate.mutable_eliminate 0.35% : 0.000001s : 3: predicate.opt_reshape 0.46% : 0.000001s : 3: predicate.parallel_virtual_node 1.83% : 0.000003s : 17: predicate.partial_defer_inline 1.36% : 0.000002s : 14: predicate.partial_eliminate 0.95% : 0.000002s : 11: predicate.print_const_string_wrapper 0.55% : 0.000001s : 6: predicate.reduce_all_const_elim 1.47% : 0.000003s : 11: predicate.reduce_eliminate 2.29% : 0.000004s : 28: predicate.redundant_stop_gradient_eliminater 0.57% : 0.000001s : 6: predicate.remove_not_recompute_node 1.16% : 0.000002s : 17: predicate.replace_applicator 0.56% : 0.000001s : 6: predicate.replace_old_param 0.26% : 0.000000s : 3: predicate.reset_defer_inline 1.19% : 0.000002s : 11: predicate.reshape_eliminate 0.60% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 3: predicate.row_tensor_eliminate 0.84% : 0.000001s : 6: predicate.same_eliminate 0.42% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.74% : 0.000001s : 6: predicate.shard_identity_eliminate 0.72% : 0.000001s : 6: predicate.special_op_eliminate 0.78% : 0.000001s : 6: predicate.specialize_transform 0.87% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.56% : 0.000003s : 17: predicate.switch_defer_inline 2.07% : 0.000004s : 23: predicate.switch_layer_defer_inline 5.91% : 0.000010s : 57: predicate.switch_simplify 0.98% : 0.000002s : 11: predicate.tile_eliminate 0.92% : 0.000002s : 11: predicate.transpose_eliminate 1.52% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 2.93% : 0.000005s : 23: predicate.tuple_list_get_item_eliminator 1.49% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.29% : 0.000004s : 23: predicate.tuple_list_set_item_eliminator 1.66% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.15% : 0.000004s : 28: predicate.updatestate_pure_node_eliminater 2.94% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 3: predicate.value_based_eliminate 0.63% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 6: predicate.virtual_output_eliminate 0.29% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000696 10 51.11% : 0.000356s : 2: func_graph_cloner_run.FuncGraphClonerGraph 48.89% : 0.000340s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.033201 192 0.01% : 0.000005s : 1: ForceFp32Comm 10.09% : 0.003351s : 1: add_attr 10.06% : 0.003339s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.15% : 0.000051s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.28% : 0.000091s : 1: auto_monad 0.06% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.40% : 0.000464s : 1: bootstrap 0.10% : 0.000032s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.05% : 0.000016s : 1: control_data_broadcast_order 0.03% : 0.000010s : 1: convert_after_rewriter 0.07% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.08% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000003s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000005s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 1.38% : 0.000457s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.85% : 0.000615s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000017s : 1: opt.transform.mutable_eliminate 3.30% : 0.001095s : 78: opt.transform.opt_a 0.08% : 0.000028s : 1: opt.transform.opt_after_cconv 0.08% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.29% : 0.000096s : 28: opt.transform.opt_b 0.13% : 0.000044s : 2: opt.transform.opt_trans_graph 0.11% : 0.000038s : 4: opt.transform.symbol_engine_opt 8.55% : 0.002837s : 1: opt_a 0.32% : 0.000106s : 1: opt_after_cconv 1.49% : 0.000494s : 1: opt_after_jit_grad 0.62% : 0.000206s : 1: opt_b 15.11% : 0.005018s : 1: optimize 0.14% : 0.000045s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.07% : 0.000023s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.17% : 0.000056s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.13% : 0.000042s : 1: pre_auto_parallel 0.09% : 0.000031s : 1: py_interpret_to_execute 0.04% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000019s : 1: remove_dup_value 1.41% : 0.000469s : 1: renormalize.infer 1.25% : 0.000416s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.13% : 0.000042s : 1: rewriter_after_opt_a 0.25% : 0.000084s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.29% : 0.000096s : 1: symbol_engine_optimizer 0.23% : 0.000076s : 1: tuple_transform 39.26% : 0.013035s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:04.938.945 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:04.939.227 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0243516, [21] [bootstrap]: 0.00044469 [type_inference]: 0.0133002 [event_method]: 1.883e-05 [auto_monad]: 8.898e-05 [graph_reusing]: 6.83e-06 [inline]: 2.22001e-06 [add_attr]: 0.00339809, [1] [add_attr_with_inline]: 0.00338749, [1] [Cycle 1]: 8.535e-05, [2] [tag_attr]: 2.26e-05 [meta_addattr_fg_expand]: 5.77001e-06 [parallel-infer-symbol]: 3.2e-06 [pre_auto_parallel]: 3.677e-05 [insert-virtual-dataset]: 2.24001e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.99e-06 [pipeline_split]: 1.64998e-06 [optimize]: 0.00582153, [53] [py_interpret_to_execute]: 3.081e-05 [rewriter_before_opt_a]: 8.582e-05 [opt_a]: 0.00330813, [2] [Cycle 1]: 0.00249955, [45] [expand_dump_flag]: 3.45998e-06 [switch_simplify]: 0.00010726 [loop_unroll]: 2.513e-05 [a_1]: 0.0005196 [with_stream_mark]: 1.899e-05 [recompute_prepare]: 8.87e-06 [updatestate_depend_eliminate]: 4.45e-06 [updatestate_assign_eliminate]: 3.51001e-06 [updatestate_loads_eliminate]: 2.81e-06 [parameter_eliminate]: 2.03997e-06 [a_2]: 0.00011184 [accelerated_algorithm]: 6.61e-06 [shard]: 2.13998e-06 [meta_shard_fg_expand]: 2.99999e-06 [shard_inline]: 7.41999e-06 [merge_send_recv]: 9.05001e-06 [auto_parallel]: 9.30001e-06 [parallel]: 2.056e-05 [flash_sp]: 9.74e-06 [merge_comm]: 4.57e-06 [allreduce_fusion]: 4.05e-06 [matmul_add_comm_reduction]: 9.99001e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 1.023e-05 [virtual_dataset]: 6.86001e-06 [get_grad_eliminate_]: 6.86001e-06 [virtual_output]: 6.57002e-06 [merge_forward]: 4.33999e-06 [cell_reuse_recompute_pass]: 1.35001e-06 [offload_activation]: 1.042e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.582e-05 [merge_recompute_call_nodes]: 1.91e-06 [before_grad]: 1.399e-05 [set_forward_comm_id_for_comm_node_pass]: 3.70998e-06 [meta_fg_expand]: 3.86999e-06 [flash_sp_send_recv_attached]: 2.94999e-06 [receive_attached]: 2.37999e-06 [after_resolve]: 1.153e-05 [a_after_grad]: 9.17001e-06 [renormalize]: 0.00097044 [add_forward_monad_depend]: 6.44001e-06 [auto_monad_grad]: 2.39001e-06 [auto_monad_eliminator]: 1.631e-05 [cse]: 3.045e-05 [a_3]: 6.239e-05 [Cycle 2]: 0.00079259, [45] [expand_dump_flag]: 1.82001e-06 [switch_simplify]: 7.98001e-06 [loop_unroll]: 6.48e-06 [a_1]: 0.00010971 [with_stream_mark]: 1.359e-05 [recompute_prepare]: 6.07999e-06 [updatestate_depend_eliminate]: 3.73001e-06 [updatestate_assign_eliminate]: 2.59001e-06 [updatestate_loads_eliminate]: 2.69999e-06 [parameter_eliminate]: 1.19998e-06 [a_2]: 9.773e-05 [accelerated_algorithm]: 6.14001e-06 [shard]: 1.26002e-06 [meta_shard_fg_expand]: 1.49998e-06 [shard_inline]: 6.58e-06 [merge_send_recv]: 5.04e-06 [auto_parallel]: 6.58998e-06 [parallel]: 5.22999e-06 [flash_sp]: 3.97e-06 [merge_comm]: 3.58999e-06 [allreduce_fusion]: 3.2e-06 [matmul_add_comm_reduction]: 6.07999e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 7.13e-06 [virtual_dataset]: 5.99999e-06 [get_grad_eliminate_]: 5.49e-06 [virtual_output]: 5.67999e-06 [merge_forward]: 3.07002e-06 [cell_reuse_recompute_pass]: 2.14e-06 [offload_activation]: 7.15998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.551e-05 [merge_recompute_call_nodes]: 7.40023e-07 [before_grad]: 9.97999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.59002e-06 [meta_fg_expand]: 2.48002e-06 [flash_sp_send_recv_attached]: 9.89996e-07 [receive_attached]: 1.81998e-06 [after_resolve]: 9.00999e-06 [a_after_grad]: 8.47e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.80001e-06 [auto_monad_grad]: 1.13001e-06 [auto_monad_eliminator]: 6.76e-06 [cse]: 1.4e-05 [a_3]: 4.739e-05 [py_interpret_to_execute_after_opt_a]: 1.317e-05 [slice_cell_reuse_recomputed_activation]: 4.90999e-06 [rewriter_after_opt_a]: 7.701e-05 [convert_after_rewriter]: 1.008e-05 [order_py_execute_after_rewriter]: 8.48999e-06 [mutable_eliminate]: 0.00067245 [opt_b]: 0.00027197, [1] [Cycle 1]: 0.00026093, [7] [b_1]: 0.00016389 [b_2]: 8.00999e-06 [updatestate_depend_eliminate]: 6.68e-06 [updatestate_assign_eliminate]: 2.63e-06 [updatestate_loads_eliminate]: 2.43e-06 [renormalize]: 6.39993e-07 [cse]: 2.072e-05 [optimize_parallel_all_gather_comm]: 1.988e-05 [overlap_param_gather]: 5.25001e-06 [cconv]: 3.214e-05 [loop_unroll]: 0.00046196 [opt_after_cconv]: 0.00012961, [1] [Cycle 1]: 0.00012089, [7] [c_1]: 2.99e-05 [parameter_eliminate]: 3.73999e-06 [updatestate_depend_eliminate]: 5.51e-06 [updatestate_assign_eliminate]: 2.54999e-06 [updatestate_loads_eliminate]: 2.19001e-06 [cse]: 2.09e-05 [renormalize]: 3.39991e-07 [remove_dup_value]: 1.768e-05 [tuple_transform]: 8.78e-05, [1] [Cycle 1]: 7.958e-05, [4] [d_1]: 3.975e-05 [none_parameter_eliminate]: 1.52999e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 7.03e-06 [partial_unused_args_eliminate]: 4.17e-06 [add_recomputation]: 4.858e-05 [cse_after_recomputation]: 2.76e-05, [1] [Cycle 1]: 2.061e-05, [1] [cse]: 1.16e-05 [environ_conv]: 8.14997e-06 [swap_dp_allreduce_reducescatter]: 7.35e-06 [bias_add_comm_swap]: 5.19998e-06 [label_micro_interleaved_index]: 7.53e-06 [label_fine_grained_interleaved_index]: 5.15001e-06 [merge_cast_opt]: 3.78999e-06 [slice_recompute_activation]: 4.75999e-06 [micro_interleaved_order_control]: 4.65999e-06 [assign_add_opt]: 3.53e-06 [ForceFp32Comm]: 3.97e-06 [remove_cast_before_assign_add]: 3.16999e-06 [full_micro_interleaved_order_control]: 4.38999e-06 [reorder_send_recv_between_fp_bp]: 5.98002e-06 [comm_op_add_attrs]: 3.80998e-06 [add_comm_op_reuse_tag]: 3.50998e-06 [interleave_split_concat_branches]: 3.39001e-06 [interleave_parallel_branches]: 3.91001e-06 [overlap_opt_shard_in_pipeline]: 3.5e-06 [overlap_opt_shard_grad_in_pipeline]: 4.27e-06 [control_data_broadcast_order]: 1.569e-05 [grouped_pairwise_exchange_alltoall]: 3.83001e-06 [offloading_packed_experts]: 6.28e-06 [overlap_recompute_and_grad_model_parallel]: 7.43e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.46001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.68e-06 [overlap_recompute_comm]: 4.67998e-06 [overlap_grad_ring_attention]: 7e-06 [overlap_grad_flash_sp]: 2.211e-05 [begin_end_overlap_inline]: 2.84999e-06 [split_matmul_comm_elemetwise]: 4.1e-06 [split_layernorm_comm]: 4.4e-06 [handle_group_info]: 3.36001e-06 [symbol_engine_optimizer]: 9.771e-05, [1] [Cycle 1]: 9.077e-05, [6] [build]: 3.11999e-06 [elim_shapecalc]: 1.004e-05 [elim_not_effective]: 1.353e-05 [opt_reshape]: 7.25998e-06 [fold_const_symbol]: 1.062e-05 [renormalize]: 2.09984e-07 [detach_backward]: 3.51001e-06 [pipeline_parallel_scheduler]: 1.72001e-06 [auto_monad_reorder]: 1.951e-05 [get_jit_bprop_graph]: 1.92001e-06 [rewriter_after_jit_bprop_graph]: 5.01002e-06 [opt_after_jit_grad]: 0.00052772 [validate]: 4.419e-05 Sums bootstrap : 0.000445s : 2.32% type_inference : 0.013300s : 69.47% event_method : 0.000019s : 0.10% auto_monad : 0.000089s : 0.46% graph_reusing : 0.000007s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.12% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000037s : 0.19% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.16% optimize.rewriter_before_opt_a : 0.000086s : 0.45% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000115s : 0.60% optimize.opt_a.loop_unroll : 0.000032s : 0.17% optimize.opt_a.a_1 : 0.000629s : 3.29% optimize.opt_a.with_stream_mark : 0.000033s : 0.17% optimize.opt_a.recompute_prepare : 0.000015s : 0.08% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000210s : 1.09% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.07% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000014s : 0.07% optimize.opt_a.merge_send_recv : 0.000014s : 0.07% optimize.opt_a.auto_parallel : 0.000016s : 0.08% optimize.opt_a.parallel : 0.000026s : 0.13% optimize.opt_a.flash_sp : 0.000014s : 0.07% optimize.opt_a.merge_comm : 0.000008s : 0.04% optimize.opt_a.allreduce_fusion : 0.000007s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.09% optimize.opt_a.virtual_dataset : 0.000013s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.06% optimize.opt_a.virtual_output : 0.000012s : 0.06% optimize.opt_a.merge_forward : 0.000007s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.16% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000024s : 0.13% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.04% optimize.opt_a.meta_fg_expand : 0.000006s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000021s : 0.11% optimize.opt_a.a_after_grad : 0.000018s : 0.09% optimize.opt_a.renormalize : 0.000971s : 5.07% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.04% optimize.opt_a.auto_monad_grad : 0.000004s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.12% optimize.opt_a.cse : 0.000044s : 0.23% optimize.opt_a.a_3 : 0.000110s : 0.57% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.03% optimize.rewriter_after_opt_a : 0.000077s : 0.40% optimize.convert_after_rewriter : 0.000010s : 0.05% optimize.order_py_execute_after_rewriter : 0.000008s : 0.04% optimize.mutable_eliminate : 0.000672s : 3.51% optimize.opt_b.b_1 : 0.000164s : 0.86% optimize.opt_b.b_2 : 0.000008s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.11% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.10% optimize.overlap_param_gather : 0.000005s : 0.03% optimize.cconv : 0.000032s : 0.17% optimize.loop_unroll : 0.000462s : 2.41% optimize.opt_after_cconv.c_1 : 0.000030s : 0.16% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000021s : 0.11% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.09% optimize.tuple_transform.d_1 : 0.000040s : 0.21% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.04% optimize.partial_unused_args_eliminate : 0.000004s : 0.02% optimize.add_recomputation : 0.000049s : 0.25% optimize.cse_after_recomputation.cse : 0.000012s : 0.06% optimize.environ_conv : 0.000008s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.04% optimize.bias_add_comm_swap : 0.000005s : 0.03% optimize.label_micro_interleaved_index : 0.000008s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.03% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000005s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000004s : 0.02% optimize.remove_cast_before_assign_add : 0.000003s : 0.02% optimize.full_micro_interleaved_order_control : 0.000004s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.03% optimize.comm_op_add_attrs : 0.000004s : 0.02% optimize.add_comm_op_reuse_tag : 0.000004s : 0.02% optimize.interleave_split_concat_branches : 0.000003s : 0.02% optimize.interleave_parallel_branches : 0.000004s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000006s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000005s : 0.02% optimize.overlap_grad_ring_attention : 0.000007s : 0.04% optimize.overlap_grad_flash_sp : 0.000022s : 0.12% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.02% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.06% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.10% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.03% opt_after_jit_grad : 0.000528s : 2.76% validate : 0.000044s : 0.23% Time group info: ------[substitution.] 0.000164 25 1.12% : 0.000002s : 2: substitution.elim_not_effective 0.93% : 0.000002s : 2: substitution.fold_const_symbol 3.23% : 0.000005s : 3: substitution.graph_param_transform 80.99% : 0.000133s : 6: substitution.inline 2.52% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.98% : 0.000005s : 4: substitution.remove_not_recompute_node 2.32% : 0.000004s : 2: substitution.replace_old_param 5.91% : 0.000010s : 2: substitution.switch_simplify ------[type_inference.] 0.013241 2 91.78% : 0.012153s : 1: type_inference.infer 8.22% : 0.001089s : 1: type_inference.specialize ------[replace.] 0.000085 8 45.02% : 0.000038s : 6: replace.inline 54.98% : 0.000047s : 2: replace.switch_simplify ------[match.] 0.000137 8 94.10% : 0.000129s : 6: match.inline 5.90% : 0.000008s : 2: match.switch_simplify ------[predicate.] 0.000180 996 0.91% : 0.000002s : 11: predicate.accumulaten_eliminater 1.05% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 6: predicate.addn_check_dump 0.94% : 0.000002s : 11: predicate.addn_zero_filter 0.89% : 0.000002s : 11: predicate.adjust_all_reduce_mul_add 2.36% : 0.000004s : 17: predicate.arithmetic_simplify 0.95% : 0.000002s : 11: predicate.cast_eliminate 0.58% : 0.000001s : 6: predicate.check_bprop_eliminate 0.55% : 0.000001s : 6: predicate.compare_switch_simplify 0.17% : 0.000000s : 3: predicate.const_output_eliminate 0.63% : 0.000001s : 6: predicate.depend_value_elim 0.89% : 0.000002s : 11: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 11: predicate.dict_get_item_eliminator 0.95% : 0.000002s : 11: predicate.dict_set_item_eliminator 1.18% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 3: predicate.elim_not_effective 0.38% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.04% : 0.000002s : 14: predicate.environ_get_depend_swap 1.81% : 0.000003s : 20: predicate.environ_get_eliminate 1.02% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.34% : 0.000002s : 17: predicate.exchange_switch_depend_value 2.41% : 0.000004s : 17: predicate.float_depend_g_call 0.50% : 0.000001s : 6: predicate.float_environ_get_switch 0.79% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 3: predicate.fold_const_symbol 0.68% : 0.000001s : 6: predicate.get_grad_eliminate 0.18% : 0.000000s : 3: predicate.graph_param_transform 0.58% : 0.000001s : 6: predicate.incorporate_call 0.49% : 0.000001s : 6: predicate.incorporate_call_switch 7.07% : 0.000013s : 46: predicate.inline 0.74% : 0.000001s : 6: predicate.inline_without_move 0.33% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.86% : 0.000002s : 6: predicate.less_batch_normalization 1.61% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.23% : 0.000004s : 28: predicate.load_eliminater 0.91% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.42% : 0.000004s : 27: predicate.loop_unroll_before_grad 1.70% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 6: predicate.merge_addn 0.54% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 11: predicate.minmaximum_grad 1.42% : 0.000003s : 3: predicate.mutable_eliminate 0.38% : 0.000001s : 3: predicate.opt_reshape 0.51% : 0.000001s : 3: predicate.parallel_virtual_node 1.83% : 0.000003s : 17: predicate.partial_defer_inline 1.33% : 0.000002s : 14: predicate.partial_eliminate 0.94% : 0.000002s : 11: predicate.print_const_string_wrapper 0.58% : 0.000001s : 6: predicate.reduce_all_const_elim 1.19% : 0.000002s : 11: predicate.reduce_eliminate 2.13% : 0.000004s : 28: predicate.redundant_stop_gradient_eliminater 0.56% : 0.000001s : 6: predicate.remove_not_recompute_node 1.14% : 0.000002s : 17: predicate.replace_applicator 0.56% : 0.000001s : 6: predicate.replace_old_param 0.26% : 0.000000s : 3: predicate.reset_defer_inline 0.98% : 0.000002s : 11: predicate.reshape_eliminate 0.59% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 3: predicate.row_tensor_eliminate 0.83% : 0.000002s : 6: predicate.same_eliminate 0.38% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.97% : 0.000002s : 6: predicate.shard_identity_eliminate 0.82% : 0.000001s : 6: predicate.special_op_eliminate 0.84% : 0.000002s : 6: predicate.specialize_transform 0.87% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.56% : 0.000003s : 17: predicate.switch_defer_inline 2.24% : 0.000004s : 23: predicate.switch_layer_defer_inline 5.61% : 0.000010s : 57: predicate.switch_simplify 1.04% : 0.000002s : 11: predicate.tile_eliminate 0.89% : 0.000002s : 11: predicate.transpose_eliminate 1.48% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.84% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.00% : 0.000005s : 23: predicate.tuple_list_get_item_eliminator 1.45% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.47% : 0.000004s : 23: predicate.tuple_list_set_item_eliminator 1.48% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.10% : 0.000004s : 28: predicate.updatestate_pure_node_eliminater 2.74% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.46% : 0.000001s : 3: predicate.value_based_eliminate 0.73% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 6: predicate.virtual_output_eliminate 0.26% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000691 10 48.91% : 0.000338s : 2: func_graph_cloner_run.FuncGraphClonerGraph 51.09% : 0.000353s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.035760 192 0.02% : 0.000007s : 1: ForceFp32Comm 9.53% : 0.003408s : 1: add_attr 9.48% : 0.003391s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.15% : 0.000052s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000098s : 1: auto_monad 0.07% : 0.000026s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.37% : 0.000490s : 1: bootstrap 0.10% : 0.000035s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.05% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.09% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000020s : 1: detach_backward 0.03% : 0.000011s : 1: environ_conv 0.08% : 0.000029s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.02% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.31% : 0.000468s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 1.90% : 0.000679s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.04% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000016s : 1: opt.transform.mutable_eliminate 3.10% : 0.001107s : 78: opt.transform.opt_a 0.08% : 0.000028s : 1: opt.transform.opt_after_cconv 0.08% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.28% : 0.000099s : 28: opt.transform.opt_b 0.12% : 0.000044s : 2: opt.transform.opt_trans_graph 0.11% : 0.000038s : 4: opt.transform.symbol_engine_opt 9.26% : 0.003312s : 1: opt_a 0.37% : 0.000133s : 1: opt_after_cconv 1.51% : 0.000540s : 1: opt_after_jit_grad 0.77% : 0.000276s : 1: opt_b 17.32% : 0.006194s : 1: optimize 0.06% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000012s : 1: order_py_execute_after_rewriter 0.07% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.02% : 0.000008s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.12% : 0.000045s : 1: pre_auto_parallel 0.10% : 0.000035s : 1: py_interpret_to_execute 0.05% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.06% : 0.000021s : 1: remove_dup_value 1.45% : 0.000519s : 1: renormalize.infer 1.24% : 0.000442s : 1: renormalize.specialize 0.02% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.23% : 0.000081s : 1: rewriter_after_opt_a 0.25% : 0.000090s : 1: rewriter_before_opt_a 0.02% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000008s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.03% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.28% : 0.000101s : 1: symbol_engine_optimizer 0.25% : 0.000091s : 1: tuple_transform 37.29% : 0.013336s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:05.146.090 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0213581, [21] [bootstrap]: 0.00043918 [type_inference]: 0.0124475 [event_method]: 1.756e-05 [auto_monad]: 8.41e-05 [graph_reusing]: 6.49999e-06 [inline]: 2.42001e-06 [add_attr]: 0.00307325, [1] [add_attr_with_inline]: 0.00306477, [1] [Cycle 1]: 5.785e-05, [2] [tag_attr]: 1.945e-05 [meta_addattr_fg_expand]: 5.61e-06 [parallel-infer-symbol]: 3.8e-06 [pre_auto_parallel]: 3.179e-05 [insert-virtual-dataset]: 2.34001e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 2.36e-06 [pipeline_split]: 1.48002e-06 [optimize]: 0.00457404, [53] [py_interpret_to_execute]: 2.329e-05 [rewriter_before_opt_a]: 7.867e-05 [opt_a]: 0.00258742, [2] [Cycle 1]: 0.00197488, [45] [expand_dump_flag]: 3.65e-06 [switch_simplify]: 0.00010481 [loop_unroll]: 2.472e-05 [a_1]: 0.00049796 [with_stream_mark]: 1.399e-05 [recompute_prepare]: 8.80999e-06 [updatestate_depend_eliminate]: 4.05e-06 [updatestate_assign_eliminate]: 3.06999e-06 [updatestate_loads_eliminate]: 3.09001e-06 [parameter_eliminate]: 1.80001e-06 [a_2]: 7.931e-05 [accelerated_algorithm]: 6.75998e-06 [shard]: 1.89999e-06 [meta_shard_fg_expand]: 1.93002e-06 [shard_inline]: 6.73998e-06 [merge_send_recv]: 8.33001e-06 [auto_parallel]: 6.04001e-06 [parallel]: 1.815e-05 [flash_sp]: 8.29998e-06 [merge_comm]: 3.73001e-06 [allreduce_fusion]: 3.68999e-06 [matmul_add_comm_reduction]: 9.17001e-06 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 8.27998e-06 [virtual_dataset]: 6.47001e-06 [get_grad_eliminate_]: 6.36e-06 [virtual_output]: 6.17001e-06 [merge_forward]: 3.76999e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 9.20999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.209e-05 [merge_recompute_call_nodes]: 1.39998e-06 [before_grad]: 1.097e-05 [set_forward_comm_id_for_comm_node_pass]: 3.62998e-06 [meta_fg_expand]: 2.99001e-06 [flash_sp_send_recv_attached]: 3.01999e-06 [receive_attached]: 2.14e-06 [after_resolve]: 9.82999e-06 [a_after_grad]: 8.89e-06 [renormalize]: 0.00071445 [add_forward_monad_depend]: 5.84e-06 [auto_monad_grad]: 2.19999e-06 [auto_monad_eliminator]: 1.484e-05 [cse]: 2.919e-05 [a_3]: 4.595e-05 [Cycle 2]: 0.00060296, [45] [expand_dump_flag]: 1.14998e-06 [switch_simplify]: 7.97e-06 [loop_unroll]: 6.14001e-06 [a_1]: 0.00010694 [with_stream_mark]: 1.147e-05 [recompute_prepare]: 6.12999e-06 [updatestate_depend_eliminate]: 2.99999e-06 [updatestate_assign_eliminate]: 2.34999e-06 [updatestate_loads_eliminate]: 2.71e-06 [parameter_eliminate]: 1.07e-06 [a_2]: 6.999e-05 [accelerated_algorithm]: 5.92999e-06 [shard]: 1.10999e-06 [meta_shard_fg_expand]: 1.54e-06 [shard_inline]: 5.82001e-06 [merge_send_recv]: 4.69002e-06 [auto_parallel]: 5.37001e-06 [parallel]: 4.67e-06 [flash_sp]: 3.07002e-06 [merge_comm]: 3.18998e-06 [allreduce_fusion]: 2.92002e-06 [matmul_add_comm_reduction]: 5.96e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 6.12999e-06 [virtual_dataset]: 5.67999e-06 [get_grad_eliminate_]: 5.44998e-06 [virtual_output]: 5.17e-06 [merge_forward]: 2.69999e-06 [cell_reuse_recompute_pass]: 1.56002e-06 [offload_activation]: 6.11e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.313e-05 [merge_recompute_call_nodes]: 6.80011e-07 [before_grad]: 9.78002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.91999e-06 [meta_fg_expand]: 2.23002e-06 [flash_sp_send_recv_attached]: 8.09989e-07 [receive_attached]: 1.04e-06 [after_resolve]: 8.54002e-06 [a_after_grad]: 7.93999e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.28002e-06 [auto_monad_grad]: 1.06002e-06 [auto_monad_eliminator]: 6.49001e-06 [cse]: 1.375e-05 [a_3]: 3.422e-05 [py_interpret_to_execute_after_opt_a]: 8.12998e-06 [slice_cell_reuse_recomputed_activation]: 1.96e-06 [rewriter_after_opt_a]: 3.409e-05 [convert_after_rewriter]: 6.36e-06 [order_py_execute_after_rewriter]: 5.66998e-06 [mutable_eliminate]: 0.00047453 [opt_b]: 0.00023141, [1] [Cycle 1]: 0.00022553, [7] [b_1]: 0.00015398 [b_2]: 7.31999e-06 [updatestate_depend_eliminate]: 5.24e-06 [updatestate_assign_eliminate]: 2.21998e-06 [updatestate_loads_eliminate]: 2.18998e-06 [renormalize]: 4.49974e-07 [cse]: 1.806e-05 [optimize_parallel_all_gather_comm]: 1.606e-05 [overlap_param_gather]: 1.85001e-06 [cconv]: 2.418e-05 [loop_unroll]: 0.00043348 [opt_after_cconv]: 9.663e-05, [1] [Cycle 1]: 9.103e-05, [7] [c_1]: 2.727e-05 [parameter_eliminate]: 2.44999e-06 [updatestate_depend_eliminate]: 4.92e-06 [updatestate_assign_eliminate]: 2.26e-06 [updatestate_loads_eliminate]: 2.74999e-06 [cse]: 1.73e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.358e-05 [tuple_transform]: 7.035e-05, [1] [Cycle 1]: 6.556e-05, [4] [d_1]: 3.889e-05 [none_parameter_eliminate]: 1.71e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 6.49999e-06 [partial_unused_args_eliminate]: 1.96998e-06 [add_recomputation]: 4.419e-05 [cse_after_recomputation]: 2.051e-05, [1] [Cycle 1]: 1.649e-05, [1] [cse]: 1.114e-05 [environ_conv]: 5.07e-06 [swap_dp_allreduce_reducescatter]: 4.82998e-06 [bias_add_comm_swap]: 2.69001e-06 [label_micro_interleaved_index]: 4.08001e-06 [label_fine_grained_interleaved_index]: 2.79999e-06 [merge_cast_opt]: 1.25999e-06 [slice_recompute_activation]: 2.13998e-06 [micro_interleaved_order_control]: 2.16e-06 [assign_add_opt]: 1.17e-06 [ForceFp32Comm]: 8.10018e-07 [remove_cast_before_assign_add]: 1.05001e-06 [full_micro_interleaved_order_control]: 2.11e-06 [reorder_send_recv_between_fp_bp]: 2.61e-06 [comm_op_add_attrs]: 9.79984e-07 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.10999e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.12999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.70001e-06 [control_data_broadcast_order]: 1.253e-05 [grouped_pairwise_exchange_alltoall]: 1.99e-06 [offloading_packed_experts]: 3.46001e-06 [overlap_recompute_and_grad_model_parallel]: 5.27999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.09998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32999e-06 [overlap_recompute_comm]: 2.39001e-06 [overlap_grad_ring_attention]: 4.02998e-06 [overlap_grad_flash_sp]: 1.887e-05 [begin_end_overlap_inline]: 6.00005e-07 [split_matmul_comm_elemetwise]: 2.14e-06 [split_layernorm_comm]: 1.89e-06 [handle_group_info]: 9.5999e-07 [symbol_engine_optimizer]: 8.917e-05, [1] [Cycle 1]: 8.489e-05, [6] [build]: 2.60002e-06 [elim_shapecalc]: 2.323e-05 [elim_not_effective]: 1.318e-05 [opt_reshape]: 6.79999e-06 [fold_const_symbol]: 1.023e-05 [renormalize]: 3.10014e-07 [detach_backward]: 1.87001e-06 [pipeline_parallel_scheduler]: 1.62001e-06 [auto_monad_reorder]: 1.583e-05 [get_jit_bprop_graph]: 1.25001e-06 [rewriter_after_jit_bprop_graph]: 4.15e-06 [opt_after_jit_grad]: 0.00046095 [validate]: 3.613e-05 Sums bootstrap : 0.000439s : 2.53% type_inference : 0.012447s : 71.77% event_method : 0.000018s : 0.10% auto_monad : 0.000084s : 0.48% graph_reusing : 0.000006s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.11% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000032s : 0.18% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000001s : 0.01% optimize.py_interpret_to_execute : 0.000023s : 0.13% optimize.rewriter_before_opt_a : 0.000079s : 0.45% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000113s : 0.65% optimize.opt_a.loop_unroll : 0.000031s : 0.18% optimize.opt_a.a_1 : 0.000605s : 3.49% optimize.opt_a.with_stream_mark : 0.000025s : 0.15% optimize.opt_a.recompute_prepare : 0.000015s : 0.09% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000149s : 0.86% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.07% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.02% optimize.opt_a.shard_inline : 0.000013s : 0.07% optimize.opt_a.merge_send_recv : 0.000013s : 0.08% optimize.opt_a.auto_parallel : 0.000011s : 0.07% optimize.opt_a.parallel : 0.000023s : 0.13% optimize.opt_a.flash_sp : 0.000011s : 0.07% optimize.opt_a.merge_comm : 0.000007s : 0.04% optimize.opt_a.allreduce_fusion : 0.000007s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.08% optimize.opt_a.virtual_dataset : 0.000012s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.07% optimize.opt_a.virtual_output : 0.000011s : 0.07% optimize.opt_a.merge_forward : 0.000006s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000015s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.15% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000021s : 0.12% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.04% optimize.opt_a.meta_fg_expand : 0.000005s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.02% optimize.opt_a.after_resolve : 0.000018s : 0.11% optimize.opt_a.a_after_grad : 0.000017s : 0.10% optimize.opt_a.renormalize : 0.000715s : 4.12% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.04% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.12% optimize.opt_a.cse : 0.000043s : 0.25% optimize.opt_a.a_3 : 0.000080s : 0.46% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000034s : 0.20% optimize.convert_after_rewriter : 0.000006s : 0.04% optimize.order_py_execute_after_rewriter : 0.000006s : 0.03% optimize.mutable_eliminate : 0.000475s : 2.74% optimize.opt_b.b_1 : 0.000154s : 0.89% optimize.opt_b.b_2 : 0.000007s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.09% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000024s : 0.14% optimize.loop_unroll : 0.000433s : 2.50% optimize.opt_after_cconv.c_1 : 0.000027s : 0.16% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.10% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.08% optimize.tuple_transform.d_1 : 0.000039s : 0.22% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.04% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000044s : 0.25% optimize.cse_after_recomputation.cse : 0.000011s : 0.06% optimize.environ_conv : 0.000005s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.03% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000013s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000003s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000019s : 0.11% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000023s : 0.13% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.08% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.06% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000016s : 0.09% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000461s : 2.66% validate : 0.000036s : 0.21% Time group info: ------[substitution.] 0.000148 25 1.28% : 0.000002s : 2: substitution.elim_not_effective 0.94% : 0.000001s : 2: substitution.fold_const_symbol 4.01% : 0.000006s : 3: substitution.graph_param_transform 79.88% : 0.000118s : 6: substitution.inline 2.42% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.05% : 0.000005s : 4: substitution.remove_not_recompute_node 2.06% : 0.000003s : 2: substitution.replace_old_param 6.36% : 0.000009s : 2: substitution.switch_simplify ------[type_inference.] 0.012395 2 92.42% : 0.011455s : 1: type_inference.infer 7.58% : 0.000940s : 1: type_inference.specialize ------[replace.] 0.000080 8 46.43% : 0.000037s : 6: replace.inline 53.57% : 0.000043s : 2: replace.switch_simplify ------[match.] 0.000122 8 93.59% : 0.000115s : 6: match.inline 6.41% : 0.000008s : 2: match.switch_simplify ------[predicate.] 0.000170 996 0.97% : 0.000002s : 11: predicate.accumulaten_eliminater 0.91% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 6: predicate.addn_check_dump 1.12% : 0.000002s : 11: predicate.addn_zero_filter 0.88% : 0.000001s : 11: predicate.adjust_all_reduce_mul_add 2.28% : 0.000004s : 17: predicate.arithmetic_simplify 1.23% : 0.000002s : 11: predicate.cast_eliminate 0.57% : 0.000001s : 6: predicate.check_bprop_eliminate 0.53% : 0.000001s : 6: predicate.compare_switch_simplify 0.18% : 0.000000s : 3: predicate.const_output_eliminate 0.59% : 0.000001s : 6: predicate.depend_value_elim 0.99% : 0.000002s : 11: predicate.dict_get_item_const_eliminator 1.07% : 0.000002s : 11: predicate.dict_get_item_eliminator 0.88% : 0.000001s : 11: predicate.dict_set_item_eliminator 0.95% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.30% : 0.000001s : 3: predicate.elim_not_effective 0.37% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.12% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.07% : 0.000002s : 14: predicate.environ_get_depend_swap 1.67% : 0.000003s : 20: predicate.environ_get_eliminate 1.08% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.50% : 0.000003s : 17: predicate.exchange_switch_depend_value 2.49% : 0.000004s : 17: predicate.float_depend_g_call 0.50% : 0.000001s : 6: predicate.float_environ_get_switch 0.80% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 3: predicate.fold_const_symbol 0.63% : 0.000001s : 6: predicate.get_grad_eliminate 0.21% : 0.000000s : 3: predicate.graph_param_transform 0.62% : 0.000001s : 6: predicate.incorporate_call 0.51% : 0.000001s : 6: predicate.incorporate_call_switch 6.48% : 0.000011s : 46: predicate.inline 0.82% : 0.000001s : 6: predicate.inline_without_move 0.31% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.81% : 0.000001s : 6: predicate.less_batch_normalization 1.53% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.24% : 0.000004s : 28: predicate.load_eliminater 0.93% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.39% : 0.000004s : 27: predicate.loop_unroll_before_grad 1.64% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 6: predicate.merge_addn 0.52% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.86% : 0.000001s : 11: predicate.minmaximum_grad 0.99% : 0.000002s : 3: predicate.mutable_eliminate 0.35% : 0.000001s : 3: predicate.opt_reshape 0.38% : 0.000001s : 3: predicate.parallel_virtual_node 1.75% : 0.000003s : 17: predicate.partial_defer_inline 1.38% : 0.000002s : 14: predicate.partial_eliminate 0.95% : 0.000002s : 11: predicate.print_const_string_wrapper 0.57% : 0.000001s : 6: predicate.reduce_all_const_elim 1.35% : 0.000002s : 11: predicate.reduce_eliminate 2.25% : 0.000004s : 28: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000001s : 6: predicate.remove_not_recompute_node 1.25% : 0.000002s : 17: predicate.replace_applicator 0.51% : 0.000001s : 6: predicate.replace_old_param 0.29% : 0.000000s : 3: predicate.reset_defer_inline 1.09% : 0.000002s : 11: predicate.reshape_eliminate 0.57% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 3: predicate.row_tensor_eliminate 0.75% : 0.000001s : 6: predicate.same_eliminate 0.39% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.72% : 0.000001s : 6: predicate.shard_identity_eliminate 0.64% : 0.000001s : 6: predicate.special_op_eliminate 0.73% : 0.000001s : 6: predicate.specialize_transform 0.87% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000002s : 6: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.55% : 0.000003s : 17: predicate.switch_defer_inline 2.07% : 0.000004s : 23: predicate.switch_layer_defer_inline 6.21% : 0.000011s : 57: predicate.switch_simplify 1.10% : 0.000002s : 11: predicate.tile_eliminate 1.01% : 0.000002s : 11: predicate.transpose_eliminate 1.55% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.72% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.60% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 2.93% : 0.000005s : 23: predicate.tuple_list_get_item_eliminator 1.48% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000004s : 23: predicate.tuple_list_set_item_eliminator 1.71% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.20% : 0.000004s : 28: predicate.updatestate_pure_node_eliminater 2.96% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 3: predicate.value_based_eliminate 0.67% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 6: predicate.virtual_output_eliminate 0.28% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000613 10 52.86% : 0.000324s : 2: func_graph_cloner_run.FuncGraphClonerGraph 47.14% : 0.000289s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030942 192 0.01% : 0.000004s : 1: ForceFp32Comm 9.95% : 0.003078s : 1: add_attr 9.92% : 0.003068s : 1: add_attr_with_inline 0.01% : 0.000003s : 1: add_comm_op_reuse_tag 0.15% : 0.000048s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.29% : 0.000089s : 1: auto_monad 0.06% : 0.000019s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.52% : 0.000469s : 1: bootstrap 0.09% : 0.000028s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.05% : 0.000016s : 1: control_data_broadcast_order 0.03% : 0.000009s : 1: convert_after_rewriter 0.08% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.07% : 0.000023s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000005s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.43% : 0.000442s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.56% : 0.000483s : 1: mutable_eliminate 0.02% : 0.000006s : 1: offloading_packed_experts 0.05% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000014s : 1: opt.transform.mutable_eliminate 3.42% : 0.001058s : 78: opt.transform.opt_a 0.08% : 0.000026s : 1: opt.transform.opt_after_cconv 0.07% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.42% : 0.000129s : 28: opt.transform.opt_b 0.14% : 0.000043s : 2: opt.transform.opt_trans_graph 0.12% : 0.000036s : 4: opt.transform.symbol_engine_opt 8.37% : 0.002590s : 1: opt_a 0.32% : 0.000100s : 1: opt_after_cconv 1.52% : 0.000470s : 1: opt_after_jit_grad 0.76% : 0.000235s : 1: opt_b 14.80% : 0.004579s : 1: optimize 0.06% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.07% : 0.000022s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.12% : 0.000036s : 1: pre_auto_parallel 0.09% : 0.000028s : 1: py_interpret_to_execute 0.04% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000017s : 1: remove_dup_value 1.22% : 0.000378s : 1: renormalize.infer 1.06% : 0.000329s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.12% : 0.000038s : 1: rewriter_after_opt_a 0.27% : 0.000083s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.30% : 0.000092s : 1: symbol_engine_optimizer 0.24% : 0.000073s : 1: tuple_transform 40.29% : 0.012467s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:05.345.736 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:05.345.994 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0239456, [21] [bootstrap]: 0.00043228 [type_inference]: 0.0130935 [event_method]: 1.816e-05 [auto_monad]: 8.815e-05 [graph_reusing]: 6.63998e-06 [inline]: 2.56e-06 [add_attr]: 0.00325021, [1] [add_attr_with_inline]: 0.0032412, [1] [Cycle 1]: 7.642e-05, [2] [tag_attr]: 2.088e-05 [meta_addattr_fg_expand]: 5.97999e-06 [parallel-infer-symbol]: 3.17002e-06 [pre_auto_parallel]: 3.379e-05 [insert-virtual-dataset]: 2.70002e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 1.99e-06 [pipeline_split]: 1.89e-06 [optimize]: 0.00577671, [53] [py_interpret_to_execute]: 2.941e-05 [rewriter_before_opt_a]: 8.528e-05 [opt_a]: 0.00334701, [2] [Cycle 1]: 0.00244332, [45] [expand_dump_flag]: 3.45e-06 [switch_simplify]: 0.00010441 [loop_unroll]: 2.609e-05 [a_1]: 0.00055158 [with_stream_mark]: 1.609e-05 [recompute_prepare]: 1.036e-05 [updatestate_depend_eliminate]: 4.89998e-06 [updatestate_assign_eliminate]: 5.30999e-06 [updatestate_loads_eliminate]: 3.75e-06 [parameter_eliminate]: 2.04999e-06 [a_2]: 0.00012548 [accelerated_algorithm]: 8.12e-06 [shard]: 1.70001e-06 [meta_shard_fg_expand]: 2.20002e-06 [shard_inline]: 7.63001e-06 [merge_send_recv]: 9.72001e-06 [auto_parallel]: 8.43999e-06 [parallel]: 1.81e-05 [flash_sp]: 8.47998e-06 [merge_comm]: 4.85999e-06 [allreduce_fusion]: 5.01002e-06 [matmul_add_comm_reduction]: 9.97001e-06 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 8.84003e-06 [virtual_dataset]: 7.82e-06 [get_grad_eliminate_]: 7.45e-06 [virtual_output]: 8.02e-06 [merge_forward]: 4.79998e-06 [cell_reuse_recompute_pass]: 1.21002e-06 [offload_activation]: 1.07e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.717e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.269e-05 [set_forward_comm_id_for_comm_node_pass]: 5.20001e-06 [meta_fg_expand]: 3.86999e-06 [flash_sp_send_recv_attached]: 2.47001e-06 [receive_attached]: 2.46e-06 [after_resolve]: 1.187e-05 [a_after_grad]: 1.139e-05 [renormalize]: 0.00083785 [add_forward_monad_depend]: 6.18998e-06 [auto_monad_grad]: 2.11998e-06 [auto_monad_eliminator]: 1.722e-05 [cse]: 5.819e-05 [a_3]: 7.144e-05 [Cycle 2]: 0.00089008, [45] [expand_dump_flag]: 1.30999e-06 [switch_simplify]: 8.72e-06 [loop_unroll]: 7.36999e-06 [a_1]: 0.00015245 [with_stream_mark]: 1.26e-05 [recompute_prepare]: 7.61999e-06 [updatestate_depend_eliminate]: 4.47998e-06 [updatestate_assign_eliminate]: 3.15998e-06 [updatestate_loads_eliminate]: 3.01999e-06 [parameter_eliminate]: 1.45001e-06 [a_2]: 0.00011387 [accelerated_algorithm]: 7.15998e-06 [shard]: 1.25999e-06 [meta_shard_fg_expand]: 1.94e-06 [shard_inline]: 7.26001e-06 [merge_send_recv]: 6.94999e-06 [auto_parallel]: 6.35002e-06 [parallel]: 5.81998e-06 [flash_sp]: 3.51999e-06 [merge_comm]: 4.25e-06 [allreduce_fusion]: 4.05e-06 [matmul_add_comm_reduction]: 7.99997e-06 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 8.10999e-06 [virtual_dataset]: 6.86001e-06 [get_grad_eliminate_]: 6.69001e-06 [virtual_output]: 6.29999e-06 [merge_forward]: 3.45003e-06 [cell_reuse_recompute_pass]: 1.46002e-06 [offload_activation]: 7.90998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.624e-05 [merge_recompute_call_nodes]: 1.07e-06 [before_grad]: 1.11e-05 [set_forward_comm_id_for_comm_node_pass]: 4.58999e-06 [meta_fg_expand]: 2.71e-06 [flash_sp_send_recv_attached]: 8.79983e-07 [receive_attached]: 1.07e-06 [after_resolve]: 9.47001e-06 [a_after_grad]: 1.045e-05 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 1.51002e-06 [auto_monad_grad]: 1.07e-06 [auto_monad_eliminator]: 8.47e-06 [cse]: 1.961e-05 [a_3]: 5.601e-05 [py_interpret_to_execute_after_opt_a]: 1.357e-05 [slice_cell_reuse_recomputed_activation]: 4.35e-06 [rewriter_after_opt_a]: 4.511e-05 [convert_after_rewriter]: 9.98002e-06 [order_py_execute_after_rewriter]: 8.59e-06 [mutable_eliminate]: 0.00053869 [opt_b]: 0.00030477, [1] [Cycle 1]: 0.00029484, [7] [b_1]: 0.00019143 [b_2]: 9.51e-06 [updatestate_depend_eliminate]: 7.01001e-06 [updatestate_assign_eliminate]: 3.40998e-06 [updatestate_loads_eliminate]: 2.91e-06 [renormalize]: 5.29981e-07 [cse]: 2.409e-05 [optimize_parallel_all_gather_comm]: 2.22e-05 [overlap_param_gather]: 4.94e-06 [cconv]: 2.996e-05 [loop_unroll]: 0.00045976 [opt_after_cconv]: 0.00014057, [1] [Cycle 1]: 0.0001324, [7] [c_1]: 3.575e-05 [parameter_eliminate]: 2.90002e-06 [updatestate_depend_eliminate]: 6.72002e-06 [updatestate_assign_eliminate]: 3.14001e-06 [updatestate_loads_eliminate]: 3.10998e-06 [cse]: 2.527e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 2.041e-05 [tuple_transform]: 9.45e-05, [1] [Cycle 1]: 8.746e-05, [4] [d_1]: 4.668e-05 [none_parameter_eliminate]: 1.77999e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 7.89002e-06 [partial_unused_args_eliminate]: 4.65999e-06 [add_recomputation]: 6.131e-05 [cse_after_recomputation]: 3.277e-05, [1] [Cycle 1]: 2.556e-05, [1] [cse]: 1.589e-05 [environ_conv]: 1.02e-05 [swap_dp_allreduce_reducescatter]: 8.57998e-06 [bias_add_comm_swap]: 5.35001e-06 [label_micro_interleaved_index]: 7.28e-06 [label_fine_grained_interleaved_index]: 5.31002e-06 [merge_cast_opt]: 3.43999e-06 [slice_recompute_activation]: 4.43999e-06 [micro_interleaved_order_control]: 4.63999e-06 [assign_add_opt]: 3.73999e-06 [ForceFp32Comm]: 3.82998e-06 [remove_cast_before_assign_add]: 3.20002e-06 [full_micro_interleaved_order_control]: 4.51002e-06 [reorder_send_recv_between_fp_bp]: 5.32001e-06 [comm_op_add_attrs]: 3.74002e-06 [add_comm_op_reuse_tag]: 3.49001e-06 [interleave_split_concat_branches]: 3.39001e-06 [interleave_parallel_branches]: 4.06001e-06 [overlap_opt_shard_in_pipeline]: 3.48999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.00998e-06 [control_data_broadcast_order]: 1.816e-05 [grouped_pairwise_exchange_alltoall]: 4.27e-06 [offloading_packed_experts]: 6.91001e-06 [overlap_recompute_and_grad_model_parallel]: 7.79002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.33e-06 [overlap_recompute_allgather_and_fa_grad]: 3.59002e-06 [overlap_recompute_comm]: 5.08002e-06 [overlap_grad_ring_attention]: 7.51001e-06 [overlap_grad_flash_sp]: 2.317e-05 [begin_end_overlap_inline]: 2.83e-06 [split_matmul_comm_elemetwise]: 4.60999e-06 [split_layernorm_comm]: 3.98001e-06 [handle_group_info]: 1.738e-05 [symbol_engine_optimizer]: 0.00010819, [1] [Cycle 1]: 0.00010098, [6] [build]: 3.45e-06 [elim_shapecalc]: 1.292e-05 [elim_not_effective]: 1.691e-05 [opt_reshape]: 8.3e-06 [fold_const_symbol]: 1.247e-05 [renormalize]: 2.09984e-07 [detach_backward]: 3.91001e-06 [pipeline_parallel_scheduler]: 1.81998e-06 [auto_monad_reorder]: 2.383e-05 [get_jit_bprop_graph]: 1.79998e-06 [rewriter_after_jit_bprop_graph]: 4.43999e-06 [opt_after_jit_grad]: 0.00053621 [validate]: 4.297e-05 Sums bootstrap : 0.000432s : 2.29% type_inference : 0.013093s : 69.22% event_method : 0.000018s : 0.10% auto_monad : 0.000088s : 0.47% graph_reusing : 0.000007s : 0.04% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.11% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000034s : 0.18% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.16% optimize.rewriter_before_opt_a : 0.000085s : 0.45% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000113s : 0.60% optimize.opt_a.loop_unroll : 0.000033s : 0.18% optimize.opt_a.a_1 : 0.000704s : 3.72% optimize.opt_a.with_stream_mark : 0.000029s : 0.15% optimize.opt_a.recompute_prepare : 0.000018s : 0.10% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000239s : 1.27% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.08% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000015s : 0.08% optimize.opt_a.merge_send_recv : 0.000017s : 0.09% optimize.opt_a.auto_parallel : 0.000015s : 0.08% optimize.opt_a.parallel : 0.000024s : 0.13% optimize.opt_a.flash_sp : 0.000012s : 0.06% optimize.opt_a.merge_comm : 0.000009s : 0.05% optimize.opt_a.allreduce_fusion : 0.000009s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.09% optimize.opt_a.virtual_dataset : 0.000015s : 0.08% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.07% optimize.opt_a.virtual_output : 0.000014s : 0.08% optimize.opt_a.merge_forward : 0.000008s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.10% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.18% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000024s : 0.13% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.05% optimize.opt_a.meta_fg_expand : 0.000007s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000021s : 0.11% optimize.opt_a.a_after_grad : 0.000022s : 0.12% optimize.opt_a.renormalize : 0.000838s : 4.43% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.04% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.14% optimize.opt_a.cse : 0.000078s : 0.41% optimize.opt_a.a_3 : 0.000127s : 0.67% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.02% optimize.rewriter_after_opt_a : 0.000045s : 0.24% optimize.convert_after_rewriter : 0.000010s : 0.05% optimize.order_py_execute_after_rewriter : 0.000009s : 0.05% optimize.mutable_eliminate : 0.000539s : 2.85% optimize.opt_b.b_1 : 0.000191s : 1.01% optimize.opt_b.b_2 : 0.000010s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.13% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.12% optimize.overlap_param_gather : 0.000005s : 0.03% optimize.cconv : 0.000030s : 0.16% optimize.loop_unroll : 0.000460s : 2.43% optimize.opt_after_cconv.c_1 : 0.000036s : 0.19% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000025s : 0.13% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.11% optimize.tuple_transform.d_1 : 0.000047s : 0.25% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.04% optimize.partial_unused_args_eliminate : 0.000005s : 0.02% optimize.add_recomputation : 0.000061s : 0.32% optimize.cse_after_recomputation.cse : 0.000016s : 0.08% optimize.environ_conv : 0.000010s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.05% optimize.bias_add_comm_swap : 0.000005s : 0.03% optimize.label_micro_interleaved_index : 0.000007s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.03% optimize.merge_cast_opt : 0.000003s : 0.02% optimize.slice_recompute_activation : 0.000004s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.02% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000004s : 0.02% optimize.remove_cast_before_assign_add : 0.000003s : 0.02% optimize.full_micro_interleaved_order_control : 0.000005s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.03% optimize.comm_op_add_attrs : 0.000004s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000003s : 0.02% optimize.interleave_parallel_branches : 0.000004s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000018s : 0.10% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000007s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000005s : 0.03% optimize.overlap_grad_ring_attention : 0.000008s : 0.04% optimize.overlap_grad_flash_sp : 0.000023s : 0.12% optimize.begin_end_overlap_inline : 0.000003s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000017s : 0.09% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.07% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.09% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.07% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.13% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000536s : 2.83% validate : 0.000043s : 0.23% Time group info: ------[substitution.] 0.000177 34 11.37% : 0.000020s : 2: substitution.cast_eliminate 1.43% : 0.000003s : 3: substitution.elim_not_effective 1.00% : 0.000002s : 3: substitution.fold_const_symbol 3.52% : 0.000006s : 4: substitution.graph_param_transform 69.96% : 0.000124s : 6: substitution.inline 2.25% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.23% : 0.000006s : 6: substitution.remove_not_recompute_node 2.01% : 0.000004s : 2: substitution.replace_old_param 5.23% : 0.000009s : 2: substitution.switch_simplify ------[type_inference.] 0.013035 2 91.95% : 0.011986s : 1: type_inference.infer 8.05% : 0.001049s : 1: type_inference.specialize ------[replace.] 0.000082 8 46.28% : 0.000038s : 6: replace.inline 53.72% : 0.000044s : 2: replace.switch_simplify ------[match.] 0.000129 8 93.85% : 0.000121s : 6: match.inline 6.15% : 0.000008s : 2: match.switch_simplify ------[predicate.] 0.000208 1222 0.92% : 0.000002s : 13: predicate.accumulaten_eliminater 0.85% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.61% : 0.000001s : 8: predicate.addn_check_dump 1.10% : 0.000002s : 13: predicate.addn_zero_filter 0.81% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.46% : 0.000005s : 21: predicate.arithmetic_simplify 1.05% : 0.000002s : 13: predicate.cast_eliminate 0.61% : 0.000001s : 8: predicate.check_bprop_eliminate 0.57% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.64% : 0.000001s : 8: predicate.depend_value_elim 0.96% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.10% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.93% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.59% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.18% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.11% : 0.000002s : 17: predicate.environ_get_depend_swap 1.88% : 0.000004s : 25: predicate.environ_get_eliminate 1.09% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 19: predicate.exchange_switch_depend_value 2.49% : 0.000005s : 19: predicate.float_depend_g_call 0.59% : 0.000001s : 8: predicate.float_environ_get_switch 0.83% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.81% : 0.000002s : 8: predicate.get_grad_eliminate 0.25% : 0.000001s : 4: predicate.graph_param_transform 0.66% : 0.000001s : 8: predicate.incorporate_call 0.54% : 0.000001s : 8: predicate.incorporate_call_switch 6.44% : 0.000013s : 56: predicate.inline 0.85% : 0.000002s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.83% : 0.000002s : 8: predicate.less_batch_normalization 1.55% : 0.000003s : 21: predicate.list_to_tuple_eliminator_ 2.33% : 0.000005s : 34: predicate.load_eliminater 0.96% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.18% : 0.000005s : 29: predicate.loop_unroll_before_grad 1.60% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.62% : 0.000001s : 8: predicate.merge_addn 0.62% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 13: predicate.minmaximum_grad 1.12% : 0.000002s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.37% : 0.000001s : 4: predicate.parallel_virtual_node 1.65% : 0.000003s : 19: predicate.partial_defer_inline 1.37% : 0.000003s : 17: predicate.partial_eliminate 0.92% : 0.000002s : 13: predicate.print_const_string_wrapper 0.73% : 0.000002s : 8: predicate.reduce_all_const_elim 1.19% : 0.000002s : 13: predicate.reduce_eliminate 2.21% : 0.000005s : 34: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 8: predicate.remove_not_recompute_node 1.24% : 0.000003s : 21: predicate.replace_applicator 0.52% : 0.000001s : 8: predicate.replace_old_param 0.35% : 0.000001s : 4: predicate.reset_defer_inline 0.95% : 0.000002s : 13: predicate.reshape_eliminate 0.65% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 4: predicate.row_tensor_eliminate 0.82% : 0.000002s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.73% : 0.000002s : 8: predicate.shard_identity_eliminate 0.69% : 0.000001s : 8: predicate.special_op_eliminate 0.81% : 0.000002s : 8: predicate.specialize_transform 0.85% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.49% : 0.000003s : 19: predicate.switch_defer_inline 2.03% : 0.000004s : 27: predicate.switch_layer_defer_inline 5.42% : 0.000011s : 64: predicate.switch_simplify 0.99% : 0.000002s : 13: predicate.tile_eliminate 0.92% : 0.000002s : 13: predicate.transpose_eliminate 1.63% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.04% : 0.000006s : 29: predicate.tuple_list_get_item_eliminator 1.53% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.50% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.56% : 0.000003s : 21: predicate.tuple_to_list_eliminator_ 2.24% : 0.000005s : 34: predicate.updatestate_pure_node_eliminater 2.93% : 0.000006s : 42: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 0.65% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 8: predicate.virtual_output_eliminate 0.31% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000698 10 52.54% : 0.000367s : 2: func_graph_cloner_run.FuncGraphClonerGraph 47.46% : 0.000331s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.035248 192 0.02% : 0.000007s : 1: ForceFp32Comm 9.25% : 0.003259s : 1: add_attr 9.21% : 0.003245s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.18% : 0.000065s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.28% : 0.000098s : 1: auto_monad 0.09% : 0.000032s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.35% : 0.000476s : 1: bootstrap 0.09% : 0.000033s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.06% : 0.000021s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.10% : 0.000036s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000020s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.08% : 0.000029s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.06% : 0.000021s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.02% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000007s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.32% : 0.000466s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 1.55% : 0.000545s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.05% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000018s : 1: opt.transform.mutable_eliminate 3.55% : 0.001251s : 78: opt.transform.opt_a 0.10% : 0.000034s : 1: opt.transform.opt_after_cconv 0.08% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.36% : 0.000128s : 28: opt.transform.opt_b 0.15% : 0.000052s : 2: opt.transform.opt_trans_graph 0.13% : 0.000046s : 4: opt.transform.symbol_engine_opt 9.51% : 0.003350s : 1: opt_a 0.41% : 0.000144s : 1: opt_after_cconv 1.55% : 0.000547s : 1: opt_after_jit_grad 0.87% : 0.000308s : 1: opt_b 17.45% : 0.006152s : 1: optimize 0.07% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000012s : 1: order_py_execute_after_rewriter 0.07% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000009s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.02% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.12% : 0.000042s : 1: pre_auto_parallel 0.09% : 0.000033s : 1: py_interpret_to_execute 0.05% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000024s : 1: remove_dup_value 1.25% : 0.000440s : 1: renormalize.infer 1.10% : 0.000389s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000049s : 1: rewriter_after_opt_a 0.25% : 0.000089s : 1: rewriter_before_opt_a 0.02% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000006s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.03% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000111s : 1: symbol_engine_optimizer 0.28% : 0.000097s : 1: tuple_transform 37.24% : 0.013127s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:05.547.511 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0215117, [21] [bootstrap]: 0.00042637 [type_inference]: 0.0123842 [event_method]: 1.773e-05 [auto_monad]: 8.299e-05 [graph_reusing]: 6.56e-06 [inline]: 2.42001e-06 [add_attr]: 0.00305646, [1] [add_attr_with_inline]: 0.00304857, [1] [Cycle 1]: 5.885e-05, [2] [tag_attr]: 1.916e-05 [meta_addattr_fg_expand]: 6.01e-06 [parallel-infer-symbol]: 3.3e-06 [pre_auto_parallel]: 3.173e-05 [insert-virtual-dataset]: 2.32001e-06 [parallel-infer-symbol-second]: 7.60017e-07 [dataset_repeat_opt]: 2.09999e-06 [pipeline_split]: 1.47999e-06 [optimize]: 0.00481488, [53] [py_interpret_to_execute]: 2.386e-05 [rewriter_before_opt_a]: 7.694e-05 [opt_a]: 0.0027788, [2] [Cycle 1]: 0.00206442, [45] [expand_dump_flag]: 3.06999e-06 [switch_simplify]: 0.00010284 [loop_unroll]: 2.624e-05 [a_1]: 0.00054143 [with_stream_mark]: 1.445e-05 [recompute_prepare]: 9.19e-06 [updatestate_depend_eliminate]: 4.74e-06 [updatestate_assign_eliminate]: 4.38999e-06 [updatestate_loads_eliminate]: 3.56999e-06 [parameter_eliminate]: 2.06e-06 [a_2]: 9.967e-05 [accelerated_algorithm]: 7.89002e-06 [shard]: 2.09e-06 [meta_shard_fg_expand]: 2.07001e-06 [shard_inline]: 7.35998e-06 [merge_send_recv]: 9.59e-06 [auto_parallel]: 7.28e-06 [parallel]: 1.668e-05 [flash_sp]: 7.76001e-06 [merge_comm]: 4.71002e-06 [allreduce_fusion]: 4.65001e-06 [matmul_add_comm_reduction]: 9.82999e-06 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 8.70001e-06 [virtual_dataset]: 7.56999e-06 [get_grad_eliminate_]: 6.68e-06 [virtual_output]: 7.28e-06 [merge_forward]: 4.33999e-06 [cell_reuse_recompute_pass]: 1.52999e-06 [offload_activation]: 1.03e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.485e-05 [merge_recompute_call_nodes]: 1.40999e-06 [before_grad]: 1.198e-05 [set_forward_comm_id_for_comm_node_pass]: 5.09e-06 [meta_fg_expand]: 3.7e-06 [flash_sp_send_recv_attached]: 2.49001e-06 [receive_attached]: 2.19999e-06 [after_resolve]: 1.135e-05 [a_after_grad]: 1.126e-05 [renormalize]: 0.00069992 [add_forward_monad_depend]: 5.09e-06 [auto_monad_grad]: 2.42001e-06 [auto_monad_eliminator]: 1.805e-05 [cse]: 3.668e-05 [a_3]: 5.534e-05 [Cycle 2]: 0.00070486, [45] [expand_dump_flag]: 1.24998e-06 [switch_simplify]: 8.58001e-06 [loop_unroll]: 7.06999e-06 [a_1]: 0.00014979 [with_stream_mark]: 1.151e-05 [recompute_prepare]: 7.6e-06 [updatestate_depend_eliminate]: 4.37e-06 [updatestate_assign_eliminate]: 2.94001e-06 [updatestate_loads_eliminate]: 2.68003e-06 [parameter_eliminate]: 9.50007e-07 [a_2]: 8.698e-05 [accelerated_algorithm]: 6.83998e-06 [shard]: 1.14e-06 [meta_shard_fg_expand]: 1.82999e-06 [shard_inline]: 7.13e-06 [merge_send_recv]: 5.87999e-06 [auto_parallel]: 6.46e-06 [parallel]: 4.60999e-06 [flash_sp]: 3.40003e-06 [merge_comm]: 4e-06 [allreduce_fusion]: 3.88999e-06 [matmul_add_comm_reduction]: 6.48998e-06 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 7.88001e-06 [virtual_dataset]: 6.74999e-06 [get_grad_eliminate_]: 6.57002e-06 [virtual_output]: 6.33e-06 [merge_forward]: 3.4e-06 [cell_reuse_recompute_pass]: 1.79e-06 [offload_activation]: 7.15e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.361e-05 [merge_recompute_call_nodes]: 8.89995e-07 [before_grad]: 1.113e-05 [set_forward_comm_id_for_comm_node_pass]: 4.59002e-06 [meta_fg_expand]: 2.68e-06 [flash_sp_send_recv_attached]: 7.39994e-07 [receive_attached]: 9.80013e-07 [after_resolve]: 9.27999e-06 [a_after_grad]: 1.025e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.47999e-06 [auto_monad_grad]: 9.5999e-07 [auto_monad_eliminator]: 8.38999e-06 [cse]: 1.851e-05 [a_3]: 4.351e-05 [py_interpret_to_execute_after_opt_a]: 9.42001e-06 [slice_cell_reuse_recomputed_activation]: 1.86998e-06 [rewriter_after_opt_a]: 4.009e-05 [convert_after_rewriter]: 7.71999e-06 [order_py_execute_after_rewriter]: 6.41e-06 [mutable_eliminate]: 0.00047085 [opt_b]: 0.00023345, [1] [Cycle 1]: 0.00022771, [7] [b_1]: 0.00014668 [b_2]: 8.37e-06 [updatestate_depend_eliminate]: 6.12001e-06 [updatestate_assign_eliminate]: 3.03e-06 [updatestate_loads_eliminate]: 3.32002e-06 [renormalize]: 5.8001e-07 [cse]: 2.371e-05 [optimize_parallel_all_gather_comm]: 1.698e-05 [overlap_param_gather]: 2.16998e-06 [cconv]: 2.352e-05 [loop_unroll]: 0.00042282 [opt_after_cconv]: 0.00011164, [1] [Cycle 1]: 0.00010648, [7] [c_1]: 3.364e-05 [parameter_eliminate]: 2.51e-06 [updatestate_depend_eliminate]: 5.82001e-06 [updatestate_assign_eliminate]: 3.52002e-06 [updatestate_loads_eliminate]: 2.73e-06 [cse]: 2.341e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 1.692e-05 [tuple_transform]: 7.726e-05, [1] [Cycle 1]: 7.256e-05, [4] [d_1]: 4.514e-05 [none_parameter_eliminate]: 1.70001e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 7.6e-06 [partial_unused_args_eliminate]: 1.80001e-06 [add_recomputation]: 5.361e-05 [cse_after_recomputation]: 2.444e-05, [1] [Cycle 1]: 2.002e-05, [1] [cse]: 1.465e-05 [environ_conv]: 6.10002e-06 [swap_dp_allreduce_reducescatter]: 6.26e-06 [bias_add_comm_swap]: 2.31998e-06 [label_micro_interleaved_index]: 4.21001e-06 [label_fine_grained_interleaved_index]: 2.47001e-06 [merge_cast_opt]: 1.29e-06 [slice_recompute_activation]: 2.07999e-06 [micro_interleaved_order_control]: 2.41e-06 [assign_add_opt]: 1.20001e-06 [ForceFp32Comm]: 7.80012e-07 [remove_cast_before_assign_add]: 9.89996e-07 [full_micro_interleaved_order_control]: 2.19001e-06 [reorder_send_recv_between_fp_bp]: 2.69999e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 9.20001e-07 [interleave_split_concat_branches]: 1.13001e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.08001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.86e-06 [control_data_broadcast_order]: 1.37e-05 [grouped_pairwise_exchange_alltoall]: 1.82001e-06 [offloading_packed_experts]: 3.79002e-06 [overlap_recompute_and_grad_model_parallel]: 4.68999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.33002e-06 [overlap_recompute_comm]: 2.54999e-06 [overlap_grad_ring_attention]: 4.23999e-06 [overlap_grad_flash_sp]: 1.961e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.06998e-06 [split_layernorm_comm]: 1.82999e-06 [handle_group_info]: 1.07998e-06 [symbol_engine_optimizer]: 8.664e-05, [1] [Cycle 1]: 8.226e-05, [6] [build]: 3.04999e-06 [elim_shapecalc]: 1.183e-05 [elim_not_effective]: 1.667e-05 [opt_reshape]: 8.45001e-06 [fold_const_symbol]: 1.298e-05 [renormalize]: 2.3999e-07 [detach_backward]: 2.04e-06 [pipeline_parallel_scheduler]: 1.38002e-06 [auto_monad_reorder]: 2.008e-05 [get_jit_bprop_graph]: 1.21002e-06 [rewriter_after_jit_bprop_graph]: 4.05e-06 [opt_after_jit_grad]: 0.00046171 [validate]: 4.054e-05 Sums bootstrap : 0.000426s : 2.44% type_inference : 0.012384s : 70.75% event_method : 0.000018s : 0.10% auto_monad : 0.000083s : 0.47% graph_reusing : 0.000007s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.11% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000032s : 0.18% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000001s : 0.01% optimize.py_interpret_to_execute : 0.000024s : 0.14% optimize.rewriter_before_opt_a : 0.000077s : 0.44% optimize.opt_a.expand_dump_flag : 0.000004s : 0.02% optimize.opt_a.switch_simplify : 0.000111s : 0.64% optimize.opt_a.loop_unroll : 0.000033s : 0.19% optimize.opt_a.a_1 : 0.000691s : 3.95% optimize.opt_a.with_stream_mark : 0.000026s : 0.15% optimize.opt_a.recompute_prepare : 0.000017s : 0.10% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000187s : 1.07% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.08% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000014s : 0.08% optimize.opt_a.merge_send_recv : 0.000015s : 0.09% optimize.opt_a.auto_parallel : 0.000014s : 0.08% optimize.opt_a.parallel : 0.000021s : 0.12% optimize.opt_a.flash_sp : 0.000011s : 0.06% optimize.opt_a.merge_comm : 0.000009s : 0.05% optimize.opt_a.allreduce_fusion : 0.000009s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.09% optimize.opt_a.virtual_dataset : 0.000014s : 0.08% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.08% optimize.opt_a.virtual_output : 0.000014s : 0.08% optimize.opt_a.merge_forward : 0.000008s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.10% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.16% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000023s : 0.13% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.02% optimize.opt_a.after_resolve : 0.000021s : 0.12% optimize.opt_a.a_after_grad : 0.000022s : 0.12% optimize.opt_a.renormalize : 0.000700s : 4.00% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.04% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.15% optimize.opt_a.cse : 0.000055s : 0.32% optimize.opt_a.a_3 : 0.000099s : 0.56% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000040s : 0.23% optimize.convert_after_rewriter : 0.000008s : 0.04% optimize.order_py_execute_after_rewriter : 0.000006s : 0.04% optimize.mutable_eliminate : 0.000471s : 2.69% optimize.opt_b.b_1 : 0.000147s : 0.84% optimize.opt_b.b_2 : 0.000008s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.14% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.10% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000024s : 0.13% optimize.loop_unroll : 0.000423s : 2.42% optimize.opt_after_cconv.c_1 : 0.000034s : 0.19% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000023s : 0.13% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.10% optimize.tuple_transform.d_1 : 0.000045s : 0.26% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.04% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000054s : 0.31% optimize.cse_after_recomputation.cse : 0.000015s : 0.08% optimize.environ_conv : 0.000006s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.04% optimize.bias_add_comm_swap : 0.000002s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000020s : 0.11% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.07% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.10% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.05% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.07% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000020s : 0.11% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000462s : 2.64% validate : 0.000041s : 0.23% Time group info: ------[substitution.] 0.000168 34 10.53% : 0.000018s : 2: substitution.cast_eliminate 1.44% : 0.000002s : 3: substitution.elim_not_effective 1.13% : 0.000002s : 3: substitution.fold_const_symbol 3.29% : 0.000006s : 4: substitution.graph_param_transform 70.46% : 0.000118s : 6: substitution.inline 2.29% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.39% : 0.000006s : 6: substitution.remove_not_recompute_node 1.83% : 0.000003s : 2: substitution.replace_old_param 5.64% : 0.000009s : 2: substitution.switch_simplify ------[type_inference.] 0.012331 2 92.53% : 0.011410s : 1: type_inference.infer 7.47% : 0.000921s : 1: type_inference.specialize ------[replace.] 0.000078 8 48.29% : 0.000038s : 6: replace.inline 51.71% : 0.000040s : 2: replace.switch_simplify ------[match.] 0.000123 8 93.60% : 0.000115s : 6: match.inline 6.40% : 0.000008s : 2: match.switch_simplify ------[predicate.] 0.000204 1222 0.99% : 0.000002s : 13: predicate.accumulaten_eliminater 0.75% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 8: predicate.addn_check_dump 1.04% : 0.000002s : 13: predicate.addn_zero_filter 0.86% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.28% : 0.000005s : 21: predicate.arithmetic_simplify 1.03% : 0.000002s : 13: predicate.cast_eliminate 0.66% : 0.000001s : 8: predicate.check_bprop_eliminate 0.59% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.63% : 0.000001s : 8: predicate.depend_value_elim 0.99% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.95% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 4: predicate.elim_not_effective 0.45% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.14% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.15% : 0.000002s : 17: predicate.environ_get_depend_swap 1.74% : 0.000004s : 25: predicate.environ_get_eliminate 1.11% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.39% : 0.000003s : 19: predicate.exchange_switch_depend_value 2.33% : 0.000005s : 19: predicate.float_depend_g_call 0.58% : 0.000001s : 8: predicate.float_environ_get_switch 0.82% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.71% : 0.000001s : 8: predicate.get_grad_eliminate 0.25% : 0.000001s : 4: predicate.graph_param_transform 0.67% : 0.000001s : 8: predicate.incorporate_call 0.54% : 0.000001s : 8: predicate.incorporate_call_switch 6.43% : 0.000013s : 56: predicate.inline 0.86% : 0.000002s : 8: predicate.inline_without_move 0.33% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.88% : 0.000002s : 8: predicate.less_batch_normalization 1.62% : 0.000003s : 21: predicate.list_to_tuple_eliminator_ 2.36% : 0.000005s : 34: predicate.load_eliminater 0.94% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.19% : 0.000004s : 29: predicate.loop_unroll_before_grad 1.80% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.61% : 0.000001s : 8: predicate.merge_addn 0.58% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 13: predicate.minmaximum_grad 0.97% : 0.000002s : 4: predicate.mutable_eliminate 0.34% : 0.000001s : 4: predicate.opt_reshape 0.38% : 0.000001s : 4: predicate.parallel_virtual_node 1.66% : 0.000003s : 19: predicate.partial_defer_inline 1.41% : 0.000003s : 17: predicate.partial_eliminate 0.91% : 0.000002s : 13: predicate.print_const_string_wrapper 0.59% : 0.000001s : 8: predicate.reduce_all_const_elim 1.19% : 0.000002s : 13: predicate.reduce_eliminate 2.27% : 0.000005s : 34: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 8: predicate.remove_not_recompute_node 1.18% : 0.000002s : 21: predicate.replace_applicator 0.52% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 1.04% : 0.000002s : 13: predicate.reshape_eliminate 0.61% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.76% : 0.000002s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.74% : 0.000001s : 8: predicate.shard_identity_eliminate 0.76% : 0.000002s : 8: predicate.special_op_eliminate 0.82% : 0.000002s : 8: predicate.specialize_transform 0.85% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.56% : 0.000003s : 19: predicate.switch_defer_inline 2.05% : 0.000004s : 27: predicate.switch_layer_defer_inline 5.39% : 0.000011s : 64: predicate.switch_simplify 0.90% : 0.000002s : 13: predicate.tile_eliminate 1.01% : 0.000002s : 13: predicate.transpose_eliminate 1.66% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.74% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.97% : 0.000006s : 29: predicate.tuple_list_get_item_eliminator 1.47% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.67% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.56% : 0.000003s : 21: predicate.tuple_to_list_eliminator_ 2.29% : 0.000005s : 34: predicate.updatestate_pure_node_eliminater 3.12% : 0.000006s : 42: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.67% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.69% : 0.000001s : 8: predicate.virtual_output_eliminate 0.34% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000603 10 53.12% : 0.000320s : 2: func_graph_cloner_run.FuncGraphClonerGraph 46.88% : 0.000283s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.031494 192 0.01% : 0.000004s : 1: ForceFp32Comm 9.72% : 0.003061s : 1: add_attr 9.69% : 0.003052s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.18% : 0.000058s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.28% : 0.000089s : 1: auto_monad 0.08% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.44% : 0.000455s : 1: bootstrap 0.09% : 0.000027s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.05% : 0.000017s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.09% : 0.000028s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.08% : 0.000024s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.05% : 0.000016s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.37% : 0.000431s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.52% : 0.000479s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000016s : 1: opt.transform.mutable_eliminate 3.89% : 0.001226s : 78: opt.transform.opt_a 0.10% : 0.000032s : 1: opt.transform.opt_after_cconv 0.09% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.39% : 0.000123s : 28: opt.transform.opt_b 0.16% : 0.000051s : 2: opt.transform.opt_trans_graph 0.15% : 0.000046s : 4: opt.transform.symbol_engine_opt 8.83% : 0.002782s : 1: opt_a 0.37% : 0.000115s : 1: opt_after_cconv 1.50% : 0.000471s : 1: opt_after_jit_grad 0.75% : 0.000237s : 1: opt_b 15.30% : 0.004819s : 1: optimize 0.07% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000010s : 1: order_py_execute_after_rewriter 0.07% : 0.000023s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.11% : 0.000036s : 1: pre_auto_parallel 0.09% : 0.000028s : 1: py_interpret_to_execute 0.04% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000020s : 1: remove_dup_value 1.18% : 0.000371s : 1: renormalize.infer 1.02% : 0.000321s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000044s : 1: rewriter_after_opt_a 0.26% : 0.000081s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.28% : 0.000089s : 1: symbol_engine_optimizer 0.25% : 0.000080s : 1: tuple_transform 39.38% : 0.012401s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:05.748.480 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:05.748.745 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0222241, [21] [bootstrap]: 0.00046 [type_inference]: 0.0119585 [event_method]: 1.962e-05 [auto_monad]: 8.631e-05 [graph_reusing]: 7.11999e-06 [inline]: 2.68e-06 [add_attr]: 0.00304456, [1] [add_attr_with_inline]: 0.00303654, [1] [Cycle 1]: 7.105e-05, [2] [tag_attr]: 1.938e-05 [meta_addattr_fg_expand]: 6.06998e-06 [parallel-infer-symbol]: 3.03e-06 [pre_auto_parallel]: 3.083e-05 [insert-virtual-dataset]: 2.49001e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 2.03997e-06 [pipeline_split]: 1.75001e-06 [optimize]: 0.00544877, [53] [py_interpret_to_execute]: 2.836e-05 [rewriter_before_opt_a]: 8.277e-05 [opt_a]: 0.00314305, [2] [Cycle 1]: 0.00225897, [45] [expand_dump_flag]: 3.26999e-06 [switch_simplify]: 0.00010281 [loop_unroll]: 2.613e-05 [a_1]: 0.00056764 [with_stream_mark]: 1.634e-05 [recompute_prepare]: 9.91998e-06 [updatestate_depend_eliminate]: 4.98001e-06 [updatestate_assign_eliminate]: 4.60999e-06 [updatestate_loads_eliminate]: 3.48999e-06 [parameter_eliminate]: 2.26e-06 [a_2]: 0.00012544 [accelerated_algorithm]: 8.03001e-06 [shard]: 1.86e-06 [meta_shard_fg_expand]: 2.30002e-06 [shard_inline]: 7.34002e-06 [merge_send_recv]: 8.85999e-06 [auto_parallel]: 7.13e-06 [parallel]: 1.712e-05 [flash_sp]: 7.61999e-06 [merge_comm]: 4.42e-06 [allreduce_fusion]: 4.66002e-06 [matmul_add_comm_reduction]: 1.026e-05 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 8.77e-06 [virtual_dataset]: 8.1e-06 [get_grad_eliminate_]: 6.93998e-06 [virtual_output]: 7.34002e-06 [merge_forward]: 4.16001e-06 [cell_reuse_recompute_pass]: 1.25999e-06 [offload_activation]: 1.045e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.712e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.231e-05 [set_forward_comm_id_for_comm_node_pass]: 4.94e-06 [meta_fg_expand]: 3.55998e-06 [flash_sp_send_recv_attached]: 2.58e-06 [receive_attached]: 2.39999e-06 [after_resolve]: 1.128e-05 [a_after_grad]: 1.136e-05 [renormalize]: 0.00069513 [add_forward_monad_depend]: 5.07e-06 [auto_monad_grad]: 2.25002e-06 [auto_monad_eliminator]: 1.688e-05 [cse]: 3.431e-05 [a_3]: 6.941e-05 [Cycle 2]: 0.00087193, [45] [expand_dump_flag]: 1.19e-06 [switch_simplify]: 9.10999e-06 [loop_unroll]: 7.09001e-06 [a_1]: 0.00015052 [with_stream_mark]: 1.234e-05 [recompute_prepare]: 7.58001e-06 [updatestate_depend_eliminate]: 4.33001e-06 [updatestate_assign_eliminate]: 3.18998e-06 [updatestate_loads_eliminate]: 2.77002e-06 [parameter_eliminate]: 1.15001e-06 [a_2]: 0.00011392 [accelerated_algorithm]: 7.43e-06 [shard]: 1.07e-06 [meta_shard_fg_expand]: 1.72999e-06 [shard_inline]: 7.45e-06 [merge_send_recv]: 5.68002e-06 [auto_parallel]: 6.19999e-06 [parallel]: 4.48999e-06 [flash_sp]: 3.28e-06 [merge_comm]: 3.81999e-06 [allreduce_fusion]: 4.15e-06 [matmul_add_comm_reduction]: 6.49001e-06 [allreduce_slice_to_reducescatter]: 4.80009e-07 [virtual_shard_identity]: 7.71999e-06 [virtual_dataset]: 6.94999e-06 [get_grad_eliminate_]: 6.74999e-06 [virtual_output]: 6.26e-06 [merge_forward]: 3.34001e-06 [cell_reuse_recompute_pass]: 1.32999e-06 [offload_activation]: 7.03998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.583e-05 [merge_recompute_call_nodes]: 7.59988e-07 [before_grad]: 1.115e-05 [set_forward_comm_id_for_comm_node_pass]: 4.53999e-06 [meta_fg_expand]: 2.69999e-06 [flash_sp_send_recv_attached]: 9.5999e-07 [receive_attached]: 1.02e-06 [after_resolve]: 9.25999e-06 [a_after_grad]: 9.97999e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.20999e-06 [auto_monad_grad]: 9.90025e-07 [auto_monad_eliminator]: 7.88999e-06 [cse]: 1.957e-05 [a_3]: 5.577e-05 [py_interpret_to_execute_after_opt_a]: 1.143e-05 [slice_cell_reuse_recomputed_activation]: 4.63999e-06 [rewriter_after_opt_a]: 4.29e-05 [convert_after_rewriter]: 1.036e-05 [order_py_execute_after_rewriter]: 8.90999e-06 [mutable_eliminate]: 0.00047637 [opt_b]: 0.00029734, [1] [Cycle 1]: 0.00028899, [7] [b_1]: 0.00018841 [b_2]: 9.27001e-06 [updatestate_depend_eliminate]: 6.02001e-06 [updatestate_assign_eliminate]: 3.20998e-06 [updatestate_loads_eliminate]: 2.91e-06 [renormalize]: 4.00003e-07 [cse]: 2.346e-05 [optimize_parallel_all_gather_comm]: 1.954e-05 [overlap_param_gather]: 5.07999e-06 [cconv]: 2.761e-05 [loop_unroll]: 0.0004423 [opt_after_cconv]: 0.00013492, [1] [Cycle 1]: 0.00012652, [7] [c_1]: 3.469e-05 [parameter_eliminate]: 2.64001e-06 [updatestate_depend_eliminate]: 5.80002e-06 [updatestate_assign_eliminate]: 3.06999e-06 [updatestate_loads_eliminate]: 3.25998e-06 [cse]: 2.213e-05 [renormalize]: 3.4002e-07 [remove_dup_value]: 1.955e-05 [tuple_transform]: 9.446e-05, [1] [Cycle 1]: 8.707e-05, [4] [d_1]: 4.7e-05 [none_parameter_eliminate]: 1.86e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 7.58999e-06 [partial_unused_args_eliminate]: 4.45e-06 [add_recomputation]: 5.633e-05 [cse_after_recomputation]: 3.216e-05, [1] [Cycle 1]: 2.48e-05, [1] [cse]: 1.544e-05 [environ_conv]: 9.27999e-06 [swap_dp_allreduce_reducescatter]: 9.17001e-06 [bias_add_comm_swap]: 5.42999e-06 [label_micro_interleaved_index]: 6.39001e-06 [label_fine_grained_interleaved_index]: 4.99998e-06 [merge_cast_opt]: 4.35e-06 [slice_recompute_activation]: 4.48001e-06 [micro_interleaved_order_control]: 4.84e-06 [assign_add_opt]: 3.6e-06 [ForceFp32Comm]: 2.99001e-06 [remove_cast_before_assign_add]: 3.24001e-06 [full_micro_interleaved_order_control]: 4.22e-06 [reorder_send_recv_between_fp_bp]: 5.26002e-06 [comm_op_add_attrs]: 3.33998e-06 [add_comm_op_reuse_tag]: 3.12002e-06 [interleave_split_concat_branches]: 3.79002e-06 [interleave_parallel_branches]: 3.61999e-06 [overlap_opt_shard_in_pipeline]: 3.28e-06 [overlap_opt_shard_grad_in_pipeline]: 4.35e-06 [control_data_broadcast_order]: 1.762e-05 [grouped_pairwise_exchange_alltoall]: 3.97e-06 [offloading_packed_experts]: 6.60997e-06 [overlap_recompute_and_grad_model_parallel]: 7.95e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.35998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.53e-06 [overlap_recompute_comm]: 5.32001e-06 [overlap_grad_ring_attention]: 6.70998e-06 [overlap_grad_flash_sp]: 2.301e-05 [begin_end_overlap_inline]: 2.71e-06 [split_matmul_comm_elemetwise]: 1.637e-05 [split_layernorm_comm]: 4.17e-06 [handle_group_info]: 3.26999e-06 [symbol_engine_optimizer]: 0.00010326, [1] [Cycle 1]: 9.628e-05, [6] [build]: 3.04999e-06 [elim_shapecalc]: 1.16e-05 [elim_not_effective]: 1.541e-05 [opt_reshape]: 8.1e-06 [fold_const_symbol]: 1.251e-05 [renormalize]: 2.69996e-07 [detach_backward]: 3.58e-06 [pipeline_parallel_scheduler]: 2.09e-06 [auto_monad_reorder]: 2.23e-05 [get_jit_bprop_graph]: 1.43002e-06 [rewriter_after_jit_bprop_graph]: 4.05e-06 [opt_after_jit_grad]: 0.00048084 [validate]: 3.894e-05 Sums bootstrap : 0.000460s : 2.64% type_inference : 0.011959s : 68.54% event_method : 0.000020s : 0.11% auto_monad : 0.000086s : 0.49% graph_reusing : 0.000007s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.11% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000031s : 0.18% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.16% optimize.rewriter_before_opt_a : 0.000083s : 0.47% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000112s : 0.64% optimize.opt_a.loop_unroll : 0.000033s : 0.19% optimize.opt_a.a_1 : 0.000718s : 4.12% optimize.opt_a.with_stream_mark : 0.000029s : 0.16% optimize.opt_a.recompute_prepare : 0.000017s : 0.10% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000239s : 1.37% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.09% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000015s : 0.08% optimize.opt_a.merge_send_recv : 0.000015s : 0.08% optimize.opt_a.auto_parallel : 0.000013s : 0.08% optimize.opt_a.parallel : 0.000022s : 0.12% optimize.opt_a.flash_sp : 0.000011s : 0.06% optimize.opt_a.merge_comm : 0.000008s : 0.05% optimize.opt_a.allreduce_fusion : 0.000009s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.10% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.09% optimize.opt_a.virtual_dataset : 0.000015s : 0.09% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.08% optimize.opt_a.virtual_output : 0.000014s : 0.08% optimize.opt_a.merge_forward : 0.000008s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000017s : 0.10% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.19% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000023s : 0.13% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.05% optimize.opt_a.meta_fg_expand : 0.000006s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.02% optimize.opt_a.after_resolve : 0.000021s : 0.12% optimize.opt_a.a_after_grad : 0.000021s : 0.12% optimize.opt_a.renormalize : 0.000695s : 3.98% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.04% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.14% optimize.opt_a.cse : 0.000054s : 0.31% optimize.opt_a.a_3 : 0.000125s : 0.72% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.03% optimize.rewriter_after_opt_a : 0.000043s : 0.25% optimize.convert_after_rewriter : 0.000010s : 0.06% optimize.order_py_execute_after_rewriter : 0.000009s : 0.05% optimize.mutable_eliminate : 0.000476s : 2.73% optimize.opt_b.b_1 : 0.000188s : 1.08% optimize.opt_b.b_2 : 0.000009s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.13% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.11% optimize.overlap_param_gather : 0.000005s : 0.03% optimize.cconv : 0.000028s : 0.16% optimize.loop_unroll : 0.000442s : 2.53% optimize.opt_after_cconv.c_1 : 0.000035s : 0.20% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000022s : 0.13% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.11% optimize.tuple_transform.d_1 : 0.000047s : 0.27% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.04% optimize.partial_unused_args_eliminate : 0.000004s : 0.03% optimize.add_recomputation : 0.000056s : 0.32% optimize.cse_after_recomputation.cse : 0.000015s : 0.09% optimize.environ_conv : 0.000009s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.05% optimize.bias_add_comm_swap : 0.000005s : 0.03% optimize.label_micro_interleaved_index : 0.000006s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.03% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000004s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.03% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000003s : 0.02% optimize.full_micro_interleaved_order_control : 0.000004s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.03% optimize.comm_op_add_attrs : 0.000003s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.02% optimize.interleave_parallel_branches : 0.000004s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.02% optimize.control_data_broadcast_order : 0.000018s : 0.10% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000007s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000005s : 0.03% optimize.overlap_grad_ring_attention : 0.000007s : 0.04% optimize.overlap_grad_flash_sp : 0.000023s : 0.13% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000016s : 0.09% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.02% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.07% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.09% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.05% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.07% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.13% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000481s : 2.76% validate : 0.000039s : 0.22% Time group info: ------[substitution.] 0.000172 34 10.74% : 0.000018s : 2: substitution.cast_eliminate 1.28% : 0.000002s : 3: substitution.elim_not_effective 0.99% : 0.000002s : 3: substitution.fold_const_symbol 3.96% : 0.000007s : 4: substitution.graph_param_transform 70.07% : 0.000121s : 6: substitution.inline 2.29% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.26% : 0.000006s : 6: substitution.remove_not_recompute_node 1.76% : 0.000003s : 2: substitution.replace_old_param 5.66% : 0.000010s : 2: substitution.switch_simplify ------[type_inference.] 0.011877 2 92.34% : 0.010968s : 1: type_inference.infer 7.66% : 0.000909s : 1: type_inference.specialize ------[replace.] 0.000081 8 47.65% : 0.000038s : 6: replace.inline 52.35% : 0.000042s : 2: replace.switch_simplify ------[match.] 0.000125 8 93.45% : 0.000117s : 6: match.inline 6.55% : 0.000008s : 2: match.switch_simplify ------[predicate.] 0.000205 1222 0.97% : 0.000002s : 13: predicate.accumulaten_eliminater 0.79% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.57% : 0.000001s : 8: predicate.addn_check_dump 1.08% : 0.000002s : 13: predicate.addn_zero_filter 0.88% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.19% : 0.000004s : 21: predicate.arithmetic_simplify 1.13% : 0.000002s : 13: predicate.cast_eliminate 0.63% : 0.000001s : 8: predicate.check_bprop_eliminate 0.58% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.66% : 0.000001s : 8: predicate.depend_value_elim 0.93% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.16% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.97% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.44% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.15% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.17% : 0.000002s : 17: predicate.environ_get_depend_swap 1.82% : 0.000004s : 25: predicate.environ_get_eliminate 1.13% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.38% : 0.000003s : 19: predicate.exchange_switch_depend_value 2.30% : 0.000005s : 19: predicate.float_depend_g_call 0.59% : 0.000001s : 8: predicate.float_environ_get_switch 0.81% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.75% : 0.000002s : 8: predicate.get_grad_eliminate 0.24% : 0.000000s : 4: predicate.graph_param_transform 0.69% : 0.000001s : 8: predicate.incorporate_call 0.56% : 0.000001s : 8: predicate.incorporate_call_switch 6.61% : 0.000014s : 56: predicate.inline 0.90% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 8: predicate.less_batch_normalization 1.64% : 0.000003s : 21: predicate.list_to_tuple_eliminator_ 2.33% : 0.000005s : 34: predicate.load_eliminater 1.03% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.12% : 0.000004s : 29: predicate.loop_unroll_before_grad 1.70% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.61% : 0.000001s : 8: predicate.merge_addn 0.60% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 13: predicate.minmaximum_grad 1.02% : 0.000002s : 4: predicate.mutable_eliminate 0.37% : 0.000001s : 4: predicate.opt_reshape 0.37% : 0.000001s : 4: predicate.parallel_virtual_node 1.74% : 0.000004s : 19: predicate.partial_defer_inline 1.39% : 0.000003s : 17: predicate.partial_eliminate 0.88% : 0.000002s : 13: predicate.print_const_string_wrapper 0.60% : 0.000001s : 8: predicate.reduce_all_const_elim 1.17% : 0.000002s : 13: predicate.reduce_eliminate 2.31% : 0.000005s : 34: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 8: predicate.remove_not_recompute_node 1.16% : 0.000002s : 21: predicate.replace_applicator 0.52% : 0.000001s : 8: predicate.replace_old_param 0.30% : 0.000001s : 4: predicate.reset_defer_inline 1.04% : 0.000002s : 13: predicate.reshape_eliminate 0.69% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.83% : 0.000002s : 8: predicate.same_eliminate 0.46% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 8: predicate.shard_identity_eliminate 0.72% : 0.000001s : 8: predicate.special_op_eliminate 0.87% : 0.000002s : 8: predicate.specialize_transform 0.89% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.44% : 0.000003s : 19: predicate.switch_defer_inline 2.02% : 0.000004s : 27: predicate.switch_layer_defer_inline 5.35% : 0.000011s : 64: predicate.switch_simplify 0.89% : 0.000002s : 13: predicate.tile_eliminate 0.93% : 0.000002s : 13: predicate.transpose_eliminate 1.59% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.76% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.58% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.80% : 0.000006s : 29: predicate.tuple_list_get_item_eliminator 1.57% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.37% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.60% : 0.000003s : 21: predicate.tuple_to_list_eliminator_ 2.31% : 0.000005s : 34: predicate.updatestate_pure_node_eliminater 3.11% : 0.000006s : 42: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 4: predicate.value_based_eliminate 0.64% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.66% : 0.000001s : 8: predicate.virtual_output_eliminate 0.30% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000604 10 53.10% : 0.000321s : 2: func_graph_cloner_run.FuncGraphClonerGraph 46.90% : 0.000283s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.032866 192 0.02% : 0.000006s : 1: ForceFp32Comm 9.29% : 0.003053s : 1: add_attr 9.25% : 0.003040s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.18% : 0.000060s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.29% : 0.000096s : 1: auto_monad 0.09% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.53% : 0.000504s : 1: bootstrap 0.09% : 0.000031s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.06% : 0.000021s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.11% : 0.000035s : 1: cse_after_recomputation 0.02% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000019s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.09% : 0.000030s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000014s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.02% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.36% : 0.000449s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 1.47% : 0.000483s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.07% : 0.000024s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000016s : 1: opt.transform.mutable_eliminate 3.83% : 0.001259s : 78: opt.transform.opt_a 0.10% : 0.000033s : 1: opt.transform.opt_after_cconv 0.08% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.38% : 0.000126s : 28: opt.transform.opt_b 0.16% : 0.000052s : 2: opt.transform.opt_trans_graph 0.13% : 0.000044s : 4: opt.transform.symbol_engine_opt 9.57% : 0.003146s : 1: opt_a 0.42% : 0.000139s : 1: opt_after_cconv 1.49% : 0.000491s : 1: opt_after_jit_grad 0.92% : 0.000301s : 1: opt_b 17.69% : 0.005815s : 1: optimize 0.07% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.08% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000009s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.12% : 0.000038s : 1: pre_auto_parallel 0.10% : 0.000032s : 1: py_interpret_to_execute 0.04% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000023s : 1: remove_dup_value 1.11% : 0.000365s : 1: renormalize.infer 0.98% : 0.000322s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000046s : 1: rewriter_after_opt_a 0.26% : 0.000086s : 1: rewriter_before_opt_a 0.02% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.06% : 0.000019s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000106s : 1: symbol_engine_optimizer 0.30% : 0.000097s : 1: tuple_transform 36.48% : 0.011989s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:05.944.393 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0208854, [21] [bootstrap]: 0.00043017 [type_inference]: 0.0118129 [event_method]: 1.792e-05 [auto_monad]: 8.298e-05 [graph_reusing]: 6.88998e-06 [inline]: 2.26998e-06 [add_attr]: 0.00303209, [1] [add_attr_with_inline]: 0.00302416, [1] [Cycle 1]: 5.987e-05, [2] [tag_attr]: 2.035e-05 [meta_addattr_fg_expand]: 6.16998e-06 [parallel-infer-symbol]: 3.63999e-06 [pre_auto_parallel]: 3.091e-05 [insert-virtual-dataset]: 2.27999e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.94999e-06 [pipeline_split]: 1.89999e-06 [optimize]: 0.00478344, [53] [py_interpret_to_execute]: 2.359e-05 [rewriter_before_opt_a]: 7.729e-05 [opt_a]: 0.00276613, [2] [Cycle 1]: 0.00205759, [45] [expand_dump_flag]: 2.89999e-06 [switch_simplify]: 0.00010284 [loop_unroll]: 2.652e-05 [a_1]: 0.00054934 [with_stream_mark]: 1.48e-05 [recompute_prepare]: 9.86998e-06 [updatestate_depend_eliminate]: 4.48999e-06 [updatestate_assign_eliminate]: 3.7e-06 [updatestate_loads_eliminate]: 3.43e-06 [parameter_eliminate]: 1.82001e-06 [a_2]: 9.66e-05 [accelerated_algorithm]: 7.92998e-06 [shard]: 1.89e-06 [meta_shard_fg_expand]: 2.53e-06 [shard_inline]: 7.33e-06 [merge_send_recv]: 8.96002e-06 [auto_parallel]: 7.18e-06 [parallel]: 1.772e-05 [flash_sp]: 7.61999e-06 [merge_comm]: 4.46002e-06 [allreduce_fusion]: 4.13001e-06 [matmul_add_comm_reduction]: 9.88002e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 8.69998e-06 [virtual_dataset]: 7.11999e-06 [get_grad_eliminate_]: 7.33e-06 [virtual_output]: 6.74999e-06 [merge_forward]: 4.43999e-06 [cell_reuse_recompute_pass]: 1.12999e-06 [offload_activation]: 1.05e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.593e-05 [merge_recompute_call_nodes]: 1.74998e-06 [before_grad]: 1.167e-05 [set_forward_comm_id_for_comm_node_pass]: 4.37998e-06 [meta_fg_expand]: 3.6e-06 [flash_sp_send_recv_attached]: 2.76e-06 [receive_attached]: 2.48e-06 [after_resolve]: 1.095e-05 [a_after_grad]: 1.048e-05 [renormalize]: 0.00069038 [add_forward_monad_depend]: 5.32999e-06 [auto_monad_grad]: 1.86e-06 [auto_monad_eliminator]: 1.668e-05 [cse]: 3.562e-05 [a_3]: 5.485e-05 [Cycle 2]: 0.00069915, [45] [expand_dump_flag]: 1.02998e-06 [switch_simplify]: 8.59e-06 [loop_unroll]: 6.94999e-06 [a_1]: 0.00015007 [with_stream_mark]: 1.209e-05 [recompute_prepare]: 7.39002e-06 [updatestate_depend_eliminate]: 4.15e-06 [updatestate_assign_eliminate]: 2.95002e-06 [updatestate_loads_eliminate]: 2.67001e-06 [parameter_eliminate]: 1.12e-06 [a_2]: 8.642e-05 [accelerated_algorithm]: 6.94001e-06 [shard]: 1.09003e-06 [meta_shard_fg_expand]: 1.66e-06 [shard_inline]: 7.36001e-06 [merge_send_recv]: 5.22999e-06 [auto_parallel]: 5.71e-06 [parallel]: 4.70999e-06 [flash_sp]: 3.22002e-06 [merge_comm]: 4e-06 [allreduce_fusion]: 3.82998e-06 [matmul_add_comm_reduction]: 6.66e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 7.86001e-06 [virtual_dataset]: 6.54001e-06 [get_grad_eliminate_]: 6.41e-06 [virtual_output]: 6.39999e-06 [merge_forward]: 3.29001e-06 [cell_reuse_recompute_pass]: 1.64e-06 [offload_activation]: 7.11001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.35e-05 [merge_recompute_call_nodes]: 7.7e-07 [before_grad]: 1.1e-05 [set_forward_comm_id_for_comm_node_pass]: 4.48001e-06 [meta_fg_expand]: 2.84999e-06 [flash_sp_send_recv_attached]: 8.2e-07 [receive_attached]: 1.18001e-06 [after_resolve]: 9.52001e-06 [a_after_grad]: 9.94001e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.00999e-06 [auto_monad_grad]: 8.79983e-07 [auto_monad_eliminator]: 7.97e-06 [cse]: 1.822e-05 [a_3]: 4.339e-05 [py_interpret_to_execute_after_opt_a]: 8.52e-06 [slice_cell_reuse_recomputed_activation]: 2.36998e-06 [rewriter_after_opt_a]: 3.944e-05 [convert_after_rewriter]: 6.88e-06 [order_py_execute_after_rewriter]: 6.11e-06 [mutable_eliminate]: 0.00046128 [opt_b]: 0.00022976, [1] [Cycle 1]: 0.00022415, [7] [b_1]: 0.00014665 [b_2]: 8.73001e-06 [updatestate_depend_eliminate]: 5.64e-06 [updatestate_assign_eliminate]: 2.92002e-06 [updatestate_loads_eliminate]: 3.13e-06 [renormalize]: 3.59985e-07 [cse]: 2.185e-05 [optimize_parallel_all_gather_comm]: 1.758e-05 [overlap_param_gather]: 2.04999e-06 [cconv]: 2.248e-05 [loop_unroll]: 0.0004247 [opt_after_cconv]: 0.00011169, [1] [Cycle 1]: 0.00010615, [7] [c_1]: 3.481e-05 [parameter_eliminate]: 2.47001e-06 [updatestate_depend_eliminate]: 5.46e-06 [updatestate_assign_eliminate]: 3.16001e-06 [updatestate_loads_eliminate]: 2.82002e-06 [cse]: 2.278e-05 [renormalize]: 3.30008e-07 [remove_dup_value]: 1.57e-05 [tuple_transform]: 7.931e-05, [1] [Cycle 1]: 7.483e-05, [4] [d_1]: 4.604e-05 [none_parameter_eliminate]: 1.77999e-06 [renormalize]: 2.60014e-07 [switch_simplify]: 7.45e-06 [partial_unused_args_eliminate]: 1.94e-06 [add_recomputation]: 5.245e-05 [cse_after_recomputation]: 2.445e-05, [1] [Cycle 1]: 2.023e-05, [1] [cse]: 1.469e-05 [environ_conv]: 5.96998e-06 [swap_dp_allreduce_reducescatter]: 5.49998e-06 [bias_add_comm_swap]: 2.27001e-06 [label_micro_interleaved_index]: 4.07e-06 [label_fine_grained_interleaved_index]: 2.83e-06 [merge_cast_opt]: 1.20999e-06 [slice_recompute_activation]: 2.11998e-06 [micro_interleaved_order_control]: 2.47001e-06 [assign_add_opt]: 1.18001e-06 [ForceFp32Comm]: 9.30013e-07 [remove_cast_before_assign_add]: 1.01002e-06 [full_micro_interleaved_order_control]: 1.93997e-06 [reorder_send_recv_between_fp_bp]: 2.70002e-06 [comm_op_add_attrs]: 9.39996e-07 [add_comm_op_reuse_tag]: 9.30013e-07 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.43002e-06 [overlap_opt_shard_in_pipeline]: 1.17e-06 [overlap_opt_shard_grad_in_pipeline]: 1.99e-06 [control_data_broadcast_order]: 1.416e-05 [grouped_pairwise_exchange_alltoall]: 1.54e-06 [offloading_packed_experts]: 4.00998e-06 [overlap_recompute_and_grad_model_parallel]: 4.87e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.33002e-06 [overlap_recompute_comm]: 2.19001e-06 [overlap_grad_ring_attention]: 4.48999e-06 [overlap_grad_flash_sp]: 2.009e-05 [begin_end_overlap_inline]: 5.60016e-07 [split_matmul_comm_elemetwise]: 2.21e-06 [split_layernorm_comm]: 1.56002e-06 [handle_group_info]: 9.00007e-07 [symbol_engine_optimizer]: 9.606e-05, [1] [Cycle 1]: 9.179e-05, [6] [build]: 2.81999e-06 [elim_shapecalc]: 1.149e-05 [elim_not_effective]: 2.757e-05 [opt_reshape]: 8.55999e-06 [fold_const_symbol]: 1.22e-05 [renormalize]: 2.50002e-07 [detach_backward]: 1.89e-06 [pipeline_parallel_scheduler]: 1.60999e-06 [auto_monad_reorder]: 2.016e-05 [get_jit_bprop_graph]: 1.05999e-06 [rewriter_after_jit_bprop_graph]: 3.85998e-06 [opt_after_jit_grad]: 0.00046157 [validate]: 3.847e-05 Sums bootstrap : 0.000430s : 2.54% type_inference : 0.011813s : 69.83% event_method : 0.000018s : 0.11% auto_monad : 0.000083s : 0.49% graph_reusing : 0.000007s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.12% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000031s : 0.18% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000024s : 0.14% optimize.rewriter_before_opt_a : 0.000077s : 0.46% optimize.opt_a.expand_dump_flag : 0.000004s : 0.02% optimize.opt_a.switch_simplify : 0.000111s : 0.66% optimize.opt_a.loop_unroll : 0.000033s : 0.20% optimize.opt_a.a_1 : 0.000699s : 4.13% optimize.opt_a.with_stream_mark : 0.000027s : 0.16% optimize.opt_a.recompute_prepare : 0.000017s : 0.10% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000183s : 1.08% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.09% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000015s : 0.09% optimize.opt_a.merge_send_recv : 0.000014s : 0.08% optimize.opt_a.auto_parallel : 0.000013s : 0.08% optimize.opt_a.parallel : 0.000022s : 0.13% optimize.opt_a.flash_sp : 0.000011s : 0.06% optimize.opt_a.merge_comm : 0.000008s : 0.05% optimize.opt_a.allreduce_fusion : 0.000008s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.10% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.10% optimize.opt_a.virtual_dataset : 0.000014s : 0.08% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.08% optimize.opt_a.virtual_output : 0.000013s : 0.08% optimize.opt_a.merge_forward : 0.000008s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.10% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.17% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000023s : 0.13% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.05% optimize.opt_a.meta_fg_expand : 0.000006s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000020s : 0.12% optimize.opt_a.a_after_grad : 0.000020s : 0.12% optimize.opt_a.renormalize : 0.000690s : 4.08% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.04% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.15% optimize.opt_a.cse : 0.000054s : 0.32% optimize.opt_a.a_3 : 0.000098s : 0.58% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000039s : 0.23% optimize.convert_after_rewriter : 0.000007s : 0.04% optimize.order_py_execute_after_rewriter : 0.000006s : 0.04% optimize.mutable_eliminate : 0.000461s : 2.73% optimize.opt_b.b_1 : 0.000147s : 0.87% optimize.opt_b.b_2 : 0.000009s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.13% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.10% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000022s : 0.13% optimize.loop_unroll : 0.000425s : 2.51% optimize.opt_after_cconv.c_1 : 0.000035s : 0.21% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000023s : 0.13% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.09% optimize.tuple_transform.d_1 : 0.000046s : 0.27% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.04% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000052s : 0.31% optimize.cse_after_recomputation.cse : 0.000015s : 0.09% optimize.environ_conv : 0.000006s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.03% optimize.bias_add_comm_swap : 0.000002s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.03% optimize.overlap_grad_flash_sp : 0.000020s : 0.12% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.07% optimize.symbol_engine_optimizer.elim_not_effective : 0.000028s : 0.16% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.05% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.07% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.12% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.02% opt_after_jit_grad : 0.000462s : 2.73% validate : 0.000038s : 0.23% Time group info: ------[substitution.] 0.000167 34 10.72% : 0.000018s : 2: substitution.cast_eliminate 1.46% : 0.000002s : 3: substitution.elim_not_effective 1.03% : 0.000002s : 3: substitution.fold_const_symbol 3.78% : 0.000006s : 4: substitution.graph_param_transform 69.38% : 0.000116s : 6: substitution.inline 2.27% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.80% : 0.000006s : 6: substitution.remove_not_recompute_node 2.09% : 0.000003s : 2: substitution.replace_old_param 5.47% : 0.000009s : 2: substitution.switch_simplify ------[type_inference.] 0.011762 2 92.33% : 0.010859s : 1: type_inference.infer 7.67% : 0.000902s : 1: type_inference.specialize ------[replace.] 0.000079 8 47.14% : 0.000037s : 6: replace.inline 52.86% : 0.000042s : 2: replace.switch_simplify ------[match.] 0.000120 8 93.68% : 0.000113s : 6: match.inline 6.32% : 0.000008s : 2: match.switch_simplify ------[predicate.] 0.000203 1222 0.99% : 0.000002s : 13: predicate.accumulaten_eliminater 0.83% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 8: predicate.addn_check_dump 0.90% : 0.000002s : 13: predicate.addn_zero_filter 0.87% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.16% : 0.000004s : 21: predicate.arithmetic_simplify 1.15% : 0.000002s : 13: predicate.cast_eliminate 0.57% : 0.000001s : 8: predicate.check_bprop_eliminate 0.54% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 0.99% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 13: predicate.dict_get_item_eliminator 1.03% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.99% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.43% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.15% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.14% : 0.000002s : 17: predicate.environ_get_depend_swap 1.82% : 0.000004s : 25: predicate.environ_get_eliminate 1.19% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 19: predicate.exchange_switch_depend_value 2.34% : 0.000005s : 19: predicate.float_depend_g_call 0.58% : 0.000001s : 8: predicate.float_environ_get_switch 0.80% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.69% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.69% : 0.000001s : 8: predicate.incorporate_call 0.54% : 0.000001s : 8: predicate.incorporate_call_switch 6.45% : 0.000013s : 56: predicate.inline 0.82% : 0.000002s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.75% : 0.000002s : 8: predicate.less_batch_normalization 1.69% : 0.000003s : 21: predicate.list_to_tuple_eliminator_ 2.38% : 0.000005s : 34: predicate.load_eliminater 0.94% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.35% : 0.000005s : 29: predicate.loop_unroll_before_grad 1.68% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.67% : 0.000001s : 8: predicate.merge_addn 0.59% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.96% : 0.000002s : 13: predicate.minmaximum_grad 0.94% : 0.000002s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.37% : 0.000001s : 4: predicate.parallel_virtual_node 1.67% : 0.000003s : 19: predicate.partial_defer_inline 1.39% : 0.000003s : 17: predicate.partial_eliminate 1.13% : 0.000002s : 13: predicate.print_const_string_wrapper 0.73% : 0.000001s : 8: predicate.reduce_all_const_elim 1.16% : 0.000002s : 13: predicate.reduce_eliminate 2.30% : 0.000005s : 34: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 8: predicate.remove_not_recompute_node 1.19% : 0.000002s : 21: predicate.replace_applicator 0.52% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 0.96% : 0.000002s : 13: predicate.reshape_eliminate 0.65% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 4: predicate.row_tensor_eliminate 0.80% : 0.000002s : 8: predicate.same_eliminate 0.41% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 8: predicate.shard_identity_eliminate 0.86% : 0.000002s : 8: predicate.special_op_eliminate 0.80% : 0.000002s : 8: predicate.specialize_transform 0.84% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.50% : 0.000003s : 19: predicate.switch_defer_inline 1.99% : 0.000004s : 27: predicate.switch_layer_defer_inline 5.53% : 0.000011s : 64: predicate.switch_simplify 0.92% : 0.000002s : 13: predicate.tile_eliminate 0.99% : 0.000002s : 13: predicate.transpose_eliminate 1.66% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.73% : 0.000004s : 21: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.88% : 0.000006s : 29: predicate.tuple_list_get_item_eliminator 1.57% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.44% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.69% : 0.000003s : 21: predicate.tuple_to_list_eliminator_ 2.23% : 0.000005s : 34: predicate.updatestate_pure_node_eliminater 3.00% : 0.000006s : 42: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 0.63% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.65% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000605 10 54.20% : 0.000328s : 2: func_graph_cloner_run.FuncGraphClonerGraph 45.80% : 0.000277s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030825 192 0.01% : 0.000004s : 1: ForceFp32Comm 9.85% : 0.003037s : 1: add_attr 9.82% : 0.003028s : 1: add_attr_with_inline 0.01% : 0.000003s : 1: add_comm_op_reuse_tag 0.18% : 0.000056s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.29% : 0.000088s : 1: auto_monad 0.08% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.49% : 0.000460s : 1: bootstrap 0.08% : 0.000026s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000017s : 1: control_data_broadcast_order 0.03% : 0.000010s : 1: convert_after_rewriter 0.09% : 0.000027s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.08% : 0.000024s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000003s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000005s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.40% : 0.000433s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.52% : 0.000469s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000016s : 1: opt.transform.mutable_eliminate 4.00% : 0.001231s : 78: opt.transform.opt_a 0.11% : 0.000033s : 1: opt.transform.opt_after_cconv 0.09% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.40% : 0.000124s : 28: opt.transform.opt_b 0.17% : 0.000051s : 2: opt.transform.opt_trans_graph 0.18% : 0.000056s : 4: opt.transform.symbol_engine_opt 8.98% : 0.002769s : 1: opt_a 0.37% : 0.000115s : 1: opt_after_cconv 1.53% : 0.000470s : 1: opt_after_jit_grad 0.76% : 0.000233s : 1: opt_b 15.53% : 0.004788s : 1: optimize 0.07% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.08% : 0.000023s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.11% : 0.000035s : 1: pre_auto_parallel 0.09% : 0.000028s : 1: py_interpret_to_execute 0.04% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000019s : 1: remove_dup_value 1.14% : 0.000351s : 1: renormalize.infer 1.08% : 0.000332s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000043s : 1: rewriter_after_opt_a 0.26% : 0.000081s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000099s : 1: symbol_engine_optimizer 0.27% : 0.000082s : 1: tuple_transform 38.37% : 0.011829s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:06.139.984 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:06.140.249 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0233811, [21] [bootstrap]: 0.00049231 [type_inference]: 0.012755 [event_method]: 1.943e-05 [auto_monad]: 8.867e-05 [graph_reusing]: 8.02e-06 [inline]: 2.53e-06 [add_attr]: 0.0032404, [1] [add_attr_with_inline]: 0.00323, [1] [Cycle 1]: 7.72e-05, [2] [tag_attr]: 2.075e-05 [meta_addattr_fg_expand]: 6.58e-06 [parallel-infer-symbol]: 3.91001e-06 [pre_auto_parallel]: 3.464e-05 [insert-virtual-dataset]: 3.16001e-06 [parallel-infer-symbol-second]: 9.10019e-07 [dataset_repeat_opt]: 1.99e-06 [pipeline_split]: 1.54998e-06 [optimize]: 0.00550681, [53] [py_interpret_to_execute]: 3.324e-05 [rewriter_before_opt_a]: 8.466e-05 [opt_a]: 0.00312689, [2] [Cycle 1]: 0.00230562, [45] [expand_dump_flag]: 3.51999e-06 [switch_simplify]: 0.00010117 [loop_unroll]: 2.577e-05 [a_1]: 0.00050775 [with_stream_mark]: 1.949e-05 [recompute_prepare]: 1.046e-05 [updatestate_depend_eliminate]: 4.70999e-06 [updatestate_assign_eliminate]: 3.38999e-06 [updatestate_loads_eliminate]: 2.98998e-06 [parameter_eliminate]: 1.89999e-06 [a_2]: 0.00010981 [accelerated_algorithm]: 7.15e-06 [shard]: 1.94999e-06 [meta_shard_fg_expand]: 1.88997e-06 [shard_inline]: 6.22001e-06 [merge_send_recv]: 8.37e-06 [auto_parallel]: 7.6e-06 [parallel]: 1.875e-05 [flash_sp]: 9.34e-06 [merge_comm]: 4.60999e-06 [allreduce_fusion]: 3.65e-06 [matmul_add_comm_reduction]: 7.8e-06 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 8.2e-06 [virtual_dataset]: 6.31e-06 [get_grad_eliminate_]: 5.78997e-06 [virtual_output]: 6.16e-06 [merge_forward]: 4.47e-06 [cell_reuse_recompute_pass]: 1.66998e-06 [offload_activation]: 9.52999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.64e-05 [merge_recompute_call_nodes]: 1.57001e-06 [before_grad]: 1.167e-05 [set_forward_comm_id_for_comm_node_pass]: 3.88001e-06 [meta_fg_expand]: 3.2e-06 [flash_sp_send_recv_attached]: 4.18001e-06 [receive_attached]: 2.34999e-06 [after_resolve]: 9.91998e-06 [a_after_grad]: 1.043e-05 [renormalize]: 0.00080693 [add_forward_monad_depend]: 6.80998e-06 [auto_monad_grad]: 2.22001e-06 [auto_monad_eliminator]: 1.724e-05 [cse]: 2.96e-05 [a_3]: 6.445e-05 [Cycle 2]: 0.00080595, [45] [expand_dump_flag]: 2.05002e-06 [switch_simplify]: 7.85998e-06 [loop_unroll]: 6.37001e-06 [a_1]: 0.00010738 [with_stream_mark]: 1.378e-05 [recompute_prepare]: 6.06e-06 [updatestate_depend_eliminate]: 3.58e-06 [updatestate_assign_eliminate]: 2.70997e-06 [updatestate_loads_eliminate]: 2.53e-06 [parameter_eliminate]: 1.41998e-06 [a_2]: 9.703e-05 [accelerated_algorithm]: 6.11e-06 [shard]: 1.54e-06 [meta_shard_fg_expand]: 1.69e-06 [shard_inline]: 5.91998e-06 [merge_send_recv]: 5.46002e-06 [auto_parallel]: 6.06e-06 [parallel]: 6.11e-06 [flash_sp]: 3.43e-06 [merge_comm]: 3.20998e-06 [allreduce_fusion]: 3.53999e-06 [matmul_add_comm_reduction]: 6.89999e-06 [allreduce_slice_to_reducescatter]: 5.50004e-07 [virtual_shard_identity]: 6.91001e-06 [virtual_dataset]: 5.50001e-06 [get_grad_eliminate_]: 5.76e-06 [virtual_output]: 5.26998e-06 [merge_forward]: 3.96001e-06 [cell_reuse_recompute_pass]: 1.67999e-06 [offload_activation]: 7.73001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.657e-05 [merge_recompute_call_nodes]: 1.04998e-06 [before_grad]: 9.68002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.7e-06 [meta_fg_expand]: 2.49999e-06 [flash_sp_send_recv_attached]: 8.70001e-07 [receive_attached]: 1.84e-06 [after_resolve]: 9.12999e-06 [a_after_grad]: 8.99e-06 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 1.67001e-06 [auto_monad_grad]: 1.14e-06 [auto_monad_eliminator]: 9.24998e-06 [cse]: 1.66e-05 [a_3]: 4.983e-05 [py_interpret_to_execute_after_opt_a]: 1.547e-05 [slice_cell_reuse_recomputed_activation]: 1.547e-05 [rewriter_after_opt_a]: 4.368e-05 [convert_after_rewriter]: 9.73998e-06 [order_py_execute_after_rewriter]: 8.80999e-06 [mutable_eliminate]: 0.00055898 [opt_b]: 0.00027104, [1] [Cycle 1]: 0.00026109, [7] [b_1]: 0.00016181 [b_2]: 7.36001e-06 [updatestate_depend_eliminate]: 6.48e-06 [updatestate_assign_eliminate]: 2.65997e-06 [updatestate_loads_eliminate]: 2.64999e-06 [renormalize]: 6.10016e-07 [cse]: 2.188e-05 [optimize_parallel_all_gather_comm]: 2.224e-05 [overlap_param_gather]: 4.99998e-06 [cconv]: 2.918e-05 [loop_unroll]: 0.0004653 [opt_after_cconv]: 0.00012791, [1] [Cycle 1]: 0.00011858, [7] [c_1]: 2.823e-05 [parameter_eliminate]: 3.6e-06 [updatestate_depend_eliminate]: 5.51e-06 [updatestate_assign_eliminate]: 2.68e-06 [updatestate_loads_eliminate]: 2.09e-06 [cse]: 1.871e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 1.858e-05 [tuple_transform]: 8.81e-05, [1] [Cycle 1]: 8.083e-05, [4] [d_1]: 4.08e-05 [none_parameter_eliminate]: 1.69e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 6.64999e-06 [partial_unused_args_eliminate]: 4.23001e-06 [add_recomputation]: 4.898e-05 [cse_after_recomputation]: 2.918e-05, [1] [Cycle 1]: 2.238e-05, [1] [cse]: 1.299e-05 [environ_conv]: 7.58001e-06 [swap_dp_allreduce_reducescatter]: 7.88001e-06 [bias_add_comm_swap]: 5.09e-06 [label_micro_interleaved_index]: 6.78e-06 [label_fine_grained_interleaved_index]: 5.35001e-06 [merge_cast_opt]: 3.55e-06 [slice_recompute_activation]: 4.27998e-06 [micro_interleaved_order_control]: 4.70001e-06 [assign_add_opt]: 3.58999e-06 [ForceFp32Comm]: 3.61999e-06 [remove_cast_before_assign_add]: 3.26001e-06 [full_micro_interleaved_order_control]: 4.48999e-06 [reorder_send_recv_between_fp_bp]: 6.22001e-06 [comm_op_add_attrs]: 3.33e-06 [add_comm_op_reuse_tag]: 3.78999e-06 [interleave_split_concat_branches]: 3.41001e-06 [interleave_parallel_branches]: 3.34001e-06 [overlap_opt_shard_in_pipeline]: 3.51001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.62998e-06 [control_data_broadcast_order]: 1.635e-05 [grouped_pairwise_exchange_alltoall]: 4.17e-06 [offloading_packed_experts]: 6.38998e-06 [overlap_recompute_and_grad_model_parallel]: 7.5e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.46001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.68999e-06 [overlap_recompute_comm]: 5.56998e-06 [overlap_grad_ring_attention]: 6.24999e-06 [overlap_grad_flash_sp]: 2.163e-05 [begin_end_overlap_inline]: 2.81e-06 [split_matmul_comm_elemetwise]: 4.09002e-06 [split_layernorm_comm]: 3.90998e-06 [handle_group_info]: 3.09001e-06 [symbol_engine_optimizer]: 0.00011095, [1] [Cycle 1]: 0.00010363, [6] [build]: 2.76999e-06 [elim_shapecalc]: 9.84001e-06 [elim_not_effective]: 1.279e-05 [opt_reshape]: 1.953e-05 [fold_const_symbol]: 1.07e-05 [renormalize]: 1.80007e-07 [detach_backward]: 4.45e-06 [pipeline_parallel_scheduler]: 2.36e-06 [auto_monad_reorder]: 1.994e-05 [get_jit_bprop_graph]: 1.72001e-06 [rewriter_after_jit_bprop_graph]: 4.84e-06 [opt_after_jit_grad]: 0.00053218 [validate]: 4.128e-05 Sums bootstrap : 0.000492s : 2.68% type_inference : 0.012755s : 69.54% event_method : 0.000019s : 0.11% auto_monad : 0.000089s : 0.48% graph_reusing : 0.000008s : 0.04% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.11% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.04% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000035s : 0.19% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.18% optimize.rewriter_before_opt_a : 0.000085s : 0.46% optimize.opt_a.expand_dump_flag : 0.000006s : 0.03% optimize.opt_a.switch_simplify : 0.000109s : 0.59% optimize.opt_a.loop_unroll : 0.000032s : 0.18% optimize.opt_a.a_1 : 0.000615s : 3.35% optimize.opt_a.with_stream_mark : 0.000033s : 0.18% optimize.opt_a.recompute_prepare : 0.000017s : 0.09% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000207s : 1.13% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.07% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000012s : 0.07% optimize.opt_a.merge_send_recv : 0.000014s : 0.08% optimize.opt_a.auto_parallel : 0.000014s : 0.07% optimize.opt_a.parallel : 0.000025s : 0.14% optimize.opt_a.flash_sp : 0.000013s : 0.07% optimize.opt_a.merge_comm : 0.000008s : 0.04% optimize.opt_a.allreduce_fusion : 0.000007s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.08% optimize.opt_a.virtual_dataset : 0.000012s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.06% optimize.opt_a.virtual_output : 0.000011s : 0.06% optimize.opt_a.merge_forward : 0.000008s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.18% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000021s : 0.12% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.04% optimize.opt_a.meta_fg_expand : 0.000006s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.02% optimize.opt_a.after_resolve : 0.000019s : 0.10% optimize.opt_a.a_after_grad : 0.000019s : 0.11% optimize.opt_a.renormalize : 0.000807s : 4.40% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.05% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.14% optimize.opt_a.cse : 0.000046s : 0.25% optimize.opt_a.a_3 : 0.000114s : 0.62% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000015s : 0.08% optimize.rewriter_after_opt_a : 0.000044s : 0.24% optimize.convert_after_rewriter : 0.000010s : 0.05% optimize.order_py_execute_after_rewriter : 0.000009s : 0.05% optimize.mutable_eliminate : 0.000559s : 3.05% optimize.opt_b.b_1 : 0.000162s : 0.88% optimize.opt_b.b_2 : 0.000007s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.12% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.12% optimize.overlap_param_gather : 0.000005s : 0.03% optimize.cconv : 0.000029s : 0.16% optimize.loop_unroll : 0.000465s : 2.54% optimize.opt_after_cconv.c_1 : 0.000028s : 0.15% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000019s : 0.10% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.10% optimize.tuple_transform.d_1 : 0.000041s : 0.22% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.04% optimize.partial_unused_args_eliminate : 0.000004s : 0.02% optimize.add_recomputation : 0.000049s : 0.27% optimize.cse_after_recomputation.cse : 0.000013s : 0.07% optimize.environ_conv : 0.000008s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.04% optimize.bias_add_comm_swap : 0.000005s : 0.03% optimize.label_micro_interleaved_index : 0.000007s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.03% optimize.merge_cast_opt : 0.000004s : 0.02% optimize.slice_recompute_activation : 0.000004s : 0.02% optimize.micro_interleaved_order_control : 0.000005s : 0.03% optimize.assign_add_opt : 0.000004s : 0.02% optimize.ForceFp32Comm : 0.000004s : 0.02% optimize.remove_cast_before_assign_add : 0.000003s : 0.02% optimize.full_micro_interleaved_order_control : 0.000004s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.03% optimize.comm_op_add_attrs : 0.000003s : 0.02% optimize.add_comm_op_reuse_tag : 0.000004s : 0.02% optimize.interleave_split_concat_branches : 0.000003s : 0.02% optimize.interleave_parallel_branches : 0.000003s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.03% optimize.control_data_broadcast_order : 0.000016s : 0.09% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.02% optimize.offloading_packed_experts : 0.000006s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.02% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.02% optimize.overlap_recompute_comm : 0.000006s : 0.03% optimize.overlap_grad_ring_attention : 0.000006s : 0.03% optimize.overlap_grad_flash_sp : 0.000022s : 0.12% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.02% optimize.split_layernorm_comm : 0.000004s : 0.02% optimize.handle_group_info : 0.000003s : 0.02% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000020s : 0.11% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.06% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.11% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.03% opt_after_jit_grad : 0.000532s : 2.90% validate : 0.000041s : 0.23% Time group info: ------[substitution.] 0.000158 25 1.43% : 0.000002s : 2: substitution.elim_not_effective 0.91% : 0.000001s : 2: substitution.fold_const_symbol 3.87% : 0.000006s : 3: substitution.graph_param_transform 78.97% : 0.000125s : 6: substitution.inline 2.58% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.76% : 0.000006s : 4: substitution.remove_not_recompute_node 2.06% : 0.000003s : 2: substitution.replace_old_param 6.42% : 0.000010s : 2: substitution.switch_simplify ------[type_inference.] 0.012695 2 92.32% : 0.011720s : 1: type_inference.infer 7.68% : 0.000975s : 1: type_inference.specialize ------[replace.] 0.000074 8 45.92% : 0.000034s : 6: replace.inline 54.08% : 0.000040s : 2: replace.switch_simplify ------[match.] 0.000130 8 93.37% : 0.000121s : 6: match.inline 6.63% : 0.000009s : 2: match.switch_simplify ------[predicate.] 0.000183 996 0.92% : 0.000002s : 11: predicate.accumulaten_eliminater 1.15% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 6: predicate.addn_check_dump 0.93% : 0.000002s : 11: predicate.addn_zero_filter 0.82% : 0.000002s : 11: predicate.adjust_all_reduce_mul_add 2.27% : 0.000004s : 17: predicate.arithmetic_simplify 0.95% : 0.000002s : 11: predicate.cast_eliminate 0.56% : 0.000001s : 6: predicate.check_bprop_eliminate 0.49% : 0.000001s : 6: predicate.compare_switch_simplify 0.20% : 0.000000s : 3: predicate.const_output_eliminate 0.61% : 0.000001s : 6: predicate.depend_value_elim 0.90% : 0.000002s : 11: predicate.dict_get_item_const_eliminator 1.20% : 0.000002s : 11: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 11: predicate.dict_set_item_eliminator 1.22% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 3: predicate.elim_not_effective 0.64% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.10% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.17% : 0.000002s : 14: predicate.environ_get_depend_swap 1.73% : 0.000003s : 20: predicate.environ_get_eliminate 1.09% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.35% : 0.000002s : 17: predicate.exchange_switch_depend_value 2.43% : 0.000004s : 17: predicate.float_depend_g_call 0.49% : 0.000001s : 6: predicate.float_environ_get_switch 0.73% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 3: predicate.fold_const_symbol 0.63% : 0.000001s : 6: predicate.get_grad_eliminate 0.34% : 0.000001s : 3: predicate.graph_param_transform 0.57% : 0.000001s : 6: predicate.incorporate_call 0.46% : 0.000001s : 6: predicate.incorporate_call_switch 6.45% : 0.000012s : 46: predicate.inline 0.86% : 0.000002s : 6: predicate.inline_without_move 0.29% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.76% : 0.000001s : 6: predicate.less_batch_normalization 1.47% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.05% : 0.000004s : 28: predicate.load_eliminater 1.15% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.37% : 0.000004s : 27: predicate.loop_unroll_before_grad 1.68% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 6: predicate.merge_addn 0.51% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.81% : 0.000001s : 11: predicate.minmaximum_grad 1.63% : 0.000003s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.36% : 0.000001s : 3: predicate.parallel_virtual_node 1.88% : 0.000003s : 17: predicate.partial_defer_inline 1.27% : 0.000002s : 14: predicate.partial_eliminate 1.15% : 0.000002s : 11: predicate.print_const_string_wrapper 0.66% : 0.000001s : 6: predicate.reduce_all_const_elim 1.19% : 0.000002s : 11: predicate.reduce_eliminate 2.36% : 0.000004s : 28: predicate.redundant_stop_gradient_eliminater 0.54% : 0.000001s : 6: predicate.remove_not_recompute_node 1.52% : 0.000003s : 17: predicate.replace_applicator 0.45% : 0.000001s : 6: predicate.replace_old_param 0.38% : 0.000001s : 3: predicate.reset_defer_inline 1.04% : 0.000002s : 11: predicate.reshape_eliminate 0.56% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 3: predicate.row_tensor_eliminate 0.84% : 0.000002s : 6: predicate.same_eliminate 0.37% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.93% : 0.000002s : 6: predicate.shard_identity_eliminate 0.68% : 0.000001s : 6: predicate.special_op_eliminate 0.67% : 0.000001s : 6: predicate.specialize_transform 1.02% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.60% : 0.000003s : 17: predicate.switch_defer_inline 1.94% : 0.000004s : 23: predicate.switch_layer_defer_inline 5.81% : 0.000011s : 57: predicate.switch_simplify 0.93% : 0.000002s : 11: predicate.tile_eliminate 0.89% : 0.000002s : 11: predicate.transpose_eliminate 1.43% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.12% : 0.000006s : 23: predicate.tuple_list_get_item_eliminator 1.56% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000004s : 23: predicate.tuple_list_set_item_eliminator 1.43% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.10% : 0.000004s : 28: predicate.updatestate_pure_node_eliminater 2.91% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 3: predicate.value_based_eliminate 0.58% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.57% : 0.000001s : 6: predicate.virtual_output_eliminate 0.27% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000653 10 51.81% : 0.000338s : 2: func_graph_cloner_run.FuncGraphClonerGraph 48.19% : 0.000315s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.034157 192 0.02% : 0.000007s : 1: ForceFp32Comm 9.52% : 0.003250s : 1: add_attr 9.47% : 0.003234s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.16% : 0.000053s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.29% : 0.000098s : 1: auto_monad 0.08% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000005s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.57% : 0.000538s : 1: bootstrap 0.10% : 0.000033s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.06% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.09% : 0.000032s : 1: cse_after_recomputation 0.02% : 0.000009s : 1: dataset_repeat_opt 0.07% : 0.000023s : 1: detach_backward 0.03% : 0.000010s : 1: environ_conv 0.09% : 0.000031s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000014s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.02% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.38% : 0.000472s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 1.65% : 0.000565s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.04% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000017s : 1: opt.transform.mutable_eliminate 3.17% : 0.001084s : 78: opt.transform.opt_a 0.08% : 0.000027s : 1: opt.transform.opt_after_cconv 0.08% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.28% : 0.000097s : 28: opt.transform.opt_b 0.13% : 0.000045s : 2: opt.transform.opt_trans_graph 0.14% : 0.000049s : 4: opt.transform.symbol_engine_opt 9.17% : 0.003131s : 1: opt_a 0.38% : 0.000131s : 1: opt_after_cconv 1.59% : 0.000544s : 1: opt_after_jit_grad 0.80% : 0.000274s : 1: opt_b 17.19% : 0.005870s : 1: optimize 0.08% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000012s : 1: order_py_execute_after_rewriter 0.07% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000009s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.13% : 0.000043s : 1: pre_auto_parallel 0.11% : 0.000037s : 1: py_interpret_to_execute 0.05% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.06% : 0.000022s : 1: remove_dup_value 1.23% : 0.000420s : 1: renormalize.infer 1.10% : 0.000377s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000047s : 1: rewriter_after_opt_a 0.26% : 0.000089s : 1: rewriter_before_opt_a 0.06% : 0.000019s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000006s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.03% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000114s : 1: symbol_engine_optimizer 0.27% : 0.000091s : 1: tuple_transform 37.44% : 0.012788s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:06.339.252 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0204142, [21] [bootstrap]: 0.00043892 [type_inference]: 0.011804 [event_method]: 1.671e-05 [auto_monad]: 8.111e-05 [graph_reusing]: 6.56e-06 [inline]: 2.04e-06 [add_attr]: 0.00303054, [1] [add_attr_with_inline]: 0.00302226, [1] [Cycle 1]: 5.445e-05, [2] [tag_attr]: 1.85e-05 [meta_addattr_fg_expand]: 5.91e-06 [parallel-infer-symbol]: 3.07002e-06 [pre_auto_parallel]: 2.923e-05 [insert-virtual-dataset]: 2.89999e-06 [parallel-infer-symbol-second]: 6.60017e-07 [dataset_repeat_opt]: 1.81998e-06 [pipeline_split]: 1.96e-06 [optimize]: 0.00434193, [53] [py_interpret_to_execute]: 2.262e-05 [rewriter_before_opt_a]: 7.461e-05 [opt_a]: 0.00245857, [2] [Cycle 1]: 0.00182666, [45] [expand_dump_flag]: 3.49001e-06 [switch_simplify]: 9.772e-05 [loop_unroll]: 2.636e-05 [a_1]: 0.0004919 [with_stream_mark]: 1.438e-05 [recompute_prepare]: 7.97e-06 [updatestate_depend_eliminate]: 3.96001e-06 [updatestate_assign_eliminate]: 3.95e-06 [updatestate_loads_eliminate]: 3.16999e-06 [parameter_eliminate]: 1.72001e-06 [a_2]: 7.839e-05 [accelerated_algorithm]: 6.69001e-06 [shard]: 1.70001e-06 [meta_shard_fg_expand]: 1.87001e-06 [shard_inline]: 6.13002e-06 [merge_send_recv]: 8.02998e-06 [auto_parallel]: 6.53e-06 [parallel]: 1.789e-05 [flash_sp]: 7.11999e-06 [merge_comm]: 4.47e-06 [allreduce_fusion]: 4.06001e-06 [matmul_add_comm_reduction]: 8.94998e-06 [allreduce_slice_to_reducescatter]: 6.49976e-07 [virtual_shard_identity]: 8.00999e-06 [virtual_dataset]: 6.59001e-06 [get_grad_eliminate_]: 5.86e-06 [virtual_output]: 6.38e-06 [merge_forward]: 3.66999e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 9.66998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.254e-05 [merge_recompute_call_nodes]: 1.46002e-06 [before_grad]: 9.95002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.90998e-06 [meta_fg_expand]: 2.89999e-06 [flash_sp_send_recv_attached]: 2.59001e-06 [receive_attached]: 2.31e-06 [after_resolve]: 9.68002e-06 [a_after_grad]: 9.07001e-06 [renormalize]: 0.00058915 [add_forward_monad_depend]: 5.20001e-06 [auto_monad_grad]: 1.60001e-06 [auto_monad_eliminator]: 1.422e-05 [cse]: 2.781e-05 [a_3]: 4.694e-05 [Cycle 2]: 0.00062265, [45] [expand_dump_flag]: 1.05001e-06 [switch_simplify]: 8.08001e-06 [loop_unroll]: 6.13002e-06 [a_1]: 0.00010576 [with_stream_mark]: 2.074e-05 [recompute_prepare]: 6.66999e-06 [updatestate_depend_eliminate]: 3.06999e-06 [updatestate_assign_eliminate]: 2.34999e-06 [updatestate_loads_eliminate]: 2.73e-06 [parameter_eliminate]: 1.19e-06 [a_2]: 6.977e-05 [accelerated_algorithm]: 6.03998e-06 [shard]: 1.12999e-06 [meta_shard_fg_expand]: 1.28002e-06 [shard_inline]: 5.99999e-06 [merge_send_recv]: 5.23002e-06 [auto_parallel]: 5.79e-06 [parallel]: 5.22999e-06 [flash_sp]: 3.26001e-06 [merge_comm]: 3.33e-06 [allreduce_fusion]: 2.94999e-06 [matmul_add_comm_reduction]: 5.37001e-06 [allreduce_slice_to_reducescatter]: 4.10015e-07 [virtual_shard_identity]: 6.48e-06 [virtual_dataset]: 5.49e-06 [get_grad_eliminate_]: 5.16002e-06 [virtual_output]: 5.20001e-06 [merge_forward]: 2.58e-06 [cell_reuse_recompute_pass]: 1.45001e-06 [offload_activation]: 6.07999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.282e-05 [merge_recompute_call_nodes]: 7.2e-07 [before_grad]: 8.84998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.45e-06 [meta_fg_expand]: 2.16e-06 [flash_sp_send_recv_attached]: 8.00006e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 8.92e-06 [a_after_grad]: 8.14002e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.37e-06 [auto_monad_grad]: 8.49977e-07 [auto_monad_eliminator]: 7.06001e-06 [cse]: 1.359e-05 [a_3]: 4.455e-05 [py_interpret_to_execute_after_opt_a]: 8.51997e-06 [slice_cell_reuse_recomputed_activation]: 1.97999e-06 [rewriter_after_opt_a]: 3.346e-05 [convert_after_rewriter]: 6.64999e-06 [order_py_execute_after_rewriter]: 5.25001e-06 [mutable_eliminate]: 0.00045222 [opt_b]: 0.0001945, [1] [Cycle 1]: 0.00018855, [7] [b_1]: 0.00011841 [b_2]: 7.17997e-06 [updatestate_depend_eliminate]: 5.15999e-06 [updatestate_assign_eliminate]: 2.54999e-06 [updatestate_loads_eliminate]: 2.68e-06 [renormalize]: 6.50005e-07 [cse]: 1.736e-05 [optimize_parallel_all_gather_comm]: 1.604e-05 [overlap_param_gather]: 2.00002e-06 [cconv]: 2.396e-05 [loop_unroll]: 0.00041359 [opt_after_cconv]: 9.821e-05, [1] [Cycle 1]: 9.261e-05, [7] [c_1]: 2.741e-05 [parameter_eliminate]: 2.34001e-06 [updatestate_depend_eliminate]: 5.09e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 2.16e-06 [cse]: 1.733e-05 [renormalize]: 5.10016e-07 [remove_dup_value]: 1.424e-05 [tuple_transform]: 6.867e-05, [1] [Cycle 1]: 6.422e-05, [4] [d_1]: 3.809e-05 [none_parameter_eliminate]: 1.63002e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 6.47001e-06 [partial_unused_args_eliminate]: 1.95001e-06 [add_recomputation]: 4.367e-05 [cse_after_recomputation]: 2.063e-05, [1] [Cycle 1]: 1.597e-05, [1] [cse]: 1.062e-05 [environ_conv]: 4.68999e-06 [swap_dp_allreduce_reducescatter]: 4.87e-06 [bias_add_comm_swap]: 2.43e-06 [label_micro_interleaved_index]: 4.43001e-06 [label_fine_grained_interleaved_index]: 2.52001e-06 [merge_cast_opt]: 1.21002e-06 [slice_recompute_activation]: 1.94e-06 [micro_interleaved_order_control]: 2.76e-06 [assign_add_opt]: 1.10999e-06 [ForceFp32Comm]: 1.07e-06 [remove_cast_before_assign_add]: 1.29e-06 [full_micro_interleaved_order_control]: 2.32001e-06 [reorder_send_recv_between_fp_bp]: 2.59999e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.99979e-07 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.00999e-06 [overlap_opt_shard_in_pipeline]: 1.06002e-06 [overlap_opt_shard_grad_in_pipeline]: 2.01998e-06 [control_data_broadcast_order]: 1.224e-05 [grouped_pairwise_exchange_alltoall]: 1.56002e-06 [offloading_packed_experts]: 3.28998e-06 [overlap_recompute_and_grad_model_parallel]: 5.34998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40001e-06 [overlap_recompute_comm]: 2.81e-06 [overlap_grad_ring_attention]: 3.83001e-06 [overlap_grad_flash_sp]: 1.811e-05 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 1.87001e-06 [split_layernorm_comm]: 1.81e-06 [handle_group_info]: 1.02998e-06 [symbol_engine_optimizer]: 7.372e-05, [1] [Cycle 1]: 6.951e-05, [6] [build]: 2.54999e-06 [elim_shapecalc]: 9.49e-06 [elim_not_effective]: 1.245e-05 [opt_reshape]: 6.84001e-06 [fold_const_symbol]: 9.92001e-06 [renormalize]: 2.50002e-07 [detach_backward]: 1.67001e-06 [pipeline_parallel_scheduler]: 1.58002e-06 [auto_monad_reorder]: 1.652e-05 [get_jit_bprop_graph]: 1.39e-06 [rewriter_after_jit_bprop_graph]: 3.45e-06 [opt_after_jit_grad]: 0.00045284 [validate]: 3.429e-05 Sums bootstrap : 0.000439s : 2.67% type_inference : 0.011804s : 71.71% event_method : 0.000017s : 0.10% auto_monad : 0.000081s : 0.49% graph_reusing : 0.000007s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.11% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.04% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000029s : 0.18% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000023s : 0.14% optimize.rewriter_before_opt_a : 0.000075s : 0.45% optimize.opt_a.expand_dump_flag : 0.000005s : 0.03% optimize.opt_a.switch_simplify : 0.000106s : 0.64% optimize.opt_a.loop_unroll : 0.000032s : 0.20% optimize.opt_a.a_1 : 0.000598s : 3.63% optimize.opt_a.with_stream_mark : 0.000035s : 0.21% optimize.opt_a.recompute_prepare : 0.000015s : 0.09% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000148s : 0.90% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.08% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.02% optimize.opt_a.shard_inline : 0.000012s : 0.07% optimize.opt_a.merge_send_recv : 0.000013s : 0.08% optimize.opt_a.auto_parallel : 0.000012s : 0.07% optimize.opt_a.parallel : 0.000023s : 0.14% optimize.opt_a.flash_sp : 0.000010s : 0.06% optimize.opt_a.merge_comm : 0.000008s : 0.05% optimize.opt_a.allreduce_fusion : 0.000007s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.09% optimize.opt_a.virtual_dataset : 0.000012s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.07% optimize.opt_a.virtual_output : 0.000012s : 0.07% optimize.opt_a.merge_forward : 0.000006s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000016s : 0.10% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.15% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000019s : 0.11% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.04% optimize.opt_a.meta_fg_expand : 0.000005s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.02% optimize.opt_a.after_resolve : 0.000019s : 0.11% optimize.opt_a.a_after_grad : 0.000017s : 0.10% optimize.opt_a.renormalize : 0.000589s : 3.58% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.04% optimize.opt_a.auto_monad_grad : 0.000002s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.13% optimize.opt_a.cse : 0.000041s : 0.25% optimize.opt_a.a_3 : 0.000091s : 0.56% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000033s : 0.20% optimize.convert_after_rewriter : 0.000007s : 0.04% optimize.order_py_execute_after_rewriter : 0.000005s : 0.03% optimize.mutable_eliminate : 0.000452s : 2.75% optimize.opt_b.b_1 : 0.000118s : 0.72% optimize.opt_b.b_2 : 0.000007s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000017s : 0.11% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.10% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000024s : 0.15% optimize.loop_unroll : 0.000414s : 2.51% optimize.opt_after_cconv.c_1 : 0.000027s : 0.17% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000017s : 0.11% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000014s : 0.09% optimize.tuple_transform.d_1 : 0.000038s : 0.23% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.04% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000044s : 0.27% optimize.cse_after_recomputation.cse : 0.000011s : 0.06% optimize.environ_conv : 0.000005s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.03% optimize.bias_add_comm_swap : 0.000002s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000012s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000003s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000018s : 0.11% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.08% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.06% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000017s : 0.10% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.02% opt_after_jit_grad : 0.000453s : 2.75% validate : 0.000034s : 0.21% Time group info: ------[substitution.] 0.000140 25 1.24% : 0.000002s : 2: substitution.elim_not_effective 0.99% : 0.000001s : 2: substitution.fold_const_symbol 3.59% : 0.000005s : 3: substitution.graph_param_transform 80.53% : 0.000113s : 6: substitution.inline 2.01% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.17% : 0.000004s : 4: substitution.remove_not_recompute_node 2.05% : 0.000003s : 2: substitution.replace_old_param 6.43% : 0.000009s : 2: substitution.switch_simplify ------[type_inference.] 0.011753 2 92.57% : 0.010879s : 1: type_inference.infer 7.43% : 0.000874s : 1: type_inference.specialize ------[replace.] 0.000077 8 47.46% : 0.000036s : 6: replace.inline 52.54% : 0.000040s : 2: replace.switch_simplify ------[match.] 0.000117 8 93.43% : 0.000109s : 6: match.inline 6.57% : 0.000008s : 2: match.switch_simplify ------[predicate.] 0.000169 996 0.97% : 0.000002s : 11: predicate.accumulaten_eliminater 0.75% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 6: predicate.addn_check_dump 0.90% : 0.000002s : 11: predicate.addn_zero_filter 0.88% : 0.000001s : 11: predicate.adjust_all_reduce_mul_add 2.24% : 0.000004s : 17: predicate.arithmetic_simplify 1.05% : 0.000002s : 11: predicate.cast_eliminate 0.56% : 0.000001s : 6: predicate.check_bprop_eliminate 0.51% : 0.000001s : 6: predicate.compare_switch_simplify 0.17% : 0.000000s : 3: predicate.const_output_eliminate 0.60% : 0.000001s : 6: predicate.depend_value_elim 1.02% : 0.000002s : 11: predicate.dict_get_item_const_eliminator 1.15% : 0.000002s : 11: predicate.dict_get_item_eliminator 1.04% : 0.000002s : 11: predicate.dict_set_item_eliminator 0.86% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 3: predicate.elim_not_effective 0.36% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.10% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.11% : 0.000002s : 14: predicate.environ_get_depend_swap 1.67% : 0.000003s : 20: predicate.environ_get_eliminate 1.12% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.51% : 0.000003s : 17: predicate.exchange_switch_depend_value 2.66% : 0.000004s : 17: predicate.float_depend_g_call 0.51% : 0.000001s : 6: predicate.float_environ_get_switch 0.73% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 3: predicate.fold_const_symbol 0.62% : 0.000001s : 6: predicate.get_grad_eliminate 0.22% : 0.000000s : 3: predicate.graph_param_transform 0.63% : 0.000001s : 6: predicate.incorporate_call 0.51% : 0.000001s : 6: predicate.incorporate_call_switch 6.58% : 0.000011s : 46: predicate.inline 0.79% : 0.000001s : 6: predicate.inline_without_move 0.32% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.79% : 0.000001s : 6: predicate.less_batch_normalization 1.64% : 0.000003s : 17: predicate.list_to_tuple_eliminator_ 2.32% : 0.000004s : 28: predicate.load_eliminater 0.88% : 0.000001s : 3: predicate.loop_unroll_after_grad 2.44% : 0.000004s : 27: predicate.loop_unroll_before_grad 1.82% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 6: predicate.merge_addn 0.50% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 11: predicate.minmaximum_grad 0.99% : 0.000002s : 3: predicate.mutable_eliminate 0.37% : 0.000001s : 3: predicate.opt_reshape 0.28% : 0.000000s : 3: predicate.parallel_virtual_node 1.91% : 0.000003s : 17: predicate.partial_defer_inline 1.40% : 0.000002s : 14: predicate.partial_eliminate 0.92% : 0.000002s : 11: predicate.print_const_string_wrapper 0.53% : 0.000001s : 6: predicate.reduce_all_const_elim 1.18% : 0.000002s : 11: predicate.reduce_eliminate 2.24% : 0.000004s : 28: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 6: predicate.remove_not_recompute_node 1.21% : 0.000002s : 17: predicate.replace_applicator 0.54% : 0.000001s : 6: predicate.replace_old_param 0.31% : 0.000001s : 3: predicate.reset_defer_inline 1.02% : 0.000002s : 11: predicate.reshape_eliminate 0.59% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 3: predicate.row_tensor_eliminate 0.73% : 0.000001s : 6: predicate.same_eliminate 0.40% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.77% : 0.000001s : 6: predicate.shard_identity_eliminate 0.77% : 0.000001s : 6: predicate.special_op_eliminate 0.70% : 0.000001s : 6: predicate.specialize_transform 0.92% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.61% : 0.000003s : 17: predicate.switch_defer_inline 2.21% : 0.000004s : 23: predicate.switch_layer_defer_inline 5.91% : 0.000010s : 57: predicate.switch_simplify 0.97% : 0.000002s : 11: predicate.tile_eliminate 0.90% : 0.000002s : 11: predicate.transpose_eliminate 1.86% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 2.99% : 0.000005s : 23: predicate.tuple_list_get_item_eliminator 1.67% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.54% : 0.000004s : 23: predicate.tuple_list_set_item_eliminator 1.69% : 0.000003s : 17: predicate.tuple_to_list_eliminator_ 2.28% : 0.000004s : 28: predicate.updatestate_pure_node_eliminater 2.86% : 0.000005s : 34: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 3: predicate.value_based_eliminate 0.61% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 6: predicate.virtual_output_eliminate 0.25% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000587 10 54.00% : 0.000317s : 2: func_graph_cloner_run.FuncGraphClonerGraph 46.00% : 0.000270s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.029558 192 0.01% : 0.000004s : 1: ForceFp32Comm 10.27% : 0.003035s : 1: add_attr 10.24% : 0.003026s : 1: add_attr_with_inline 0.01% : 0.000003s : 1: add_comm_op_reuse_tag 0.16% : 0.000047s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.29% : 0.000086s : 1: auto_monad 0.07% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.58% : 0.000466s : 1: bootstrap 0.09% : 0.000027s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.05% : 0.000015s : 1: control_data_broadcast_order 0.03% : 0.000010s : 1: convert_after_rewriter 0.08% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.08% : 0.000023s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.43% : 0.000422s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.56% : 0.000461s : 1: mutable_eliminate 0.02% : 0.000006s : 1: offloading_packed_experts 0.05% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000014s : 1: opt.transform.mutable_eliminate 3.56% : 0.001053s : 78: opt.transform.opt_a 0.09% : 0.000026s : 1: opt.transform.opt_after_cconv 0.08% : 0.000022s : 1: opt.transform.opt_after_jit_grad 0.32% : 0.000094s : 28: opt.transform.opt_b 0.14% : 0.000042s : 2: opt.transform.opt_trans_graph 0.12% : 0.000035s : 4: opt.transform.symbol_engine_opt 8.33% : 0.002461s : 1: opt_a 0.34% : 0.000102s : 1: opt_after_cconv 1.56% : 0.000462s : 1: opt_after_jit_grad 0.67% : 0.000198s : 1: opt_b 14.70% : 0.004346s : 1: optimize 0.07% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.07% : 0.000021s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.02% : 0.000006s : 1: parallel-infer-symbol 0.01% : 0.000003s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.11% : 0.000033s : 1: pre_auto_parallel 0.09% : 0.000027s : 1: py_interpret_to_execute 0.04% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000017s : 1: remove_dup_value 1.00% : 0.000295s : 1: renormalize.infer 0.97% : 0.000287s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.13% : 0.000037s : 1: rewriter_after_opt_a 0.27% : 0.000079s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.26% : 0.000077s : 1: symbol_engine_optimizer 0.24% : 0.000071s : 1: tuple_transform 39.98% : 0.011819s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:06.539.183 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:06.539.445 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0180922, [21] [bootstrap]: 0.00042537 [type_inference]: 0.00806095 [event_method]: 2.076e-05 [auto_monad]: 6.214e-05 [graph_reusing]: 5.57001e-06 [inline]: 1.85001e-06 [add_attr]: 0.00305773, [1] [add_attr_with_inline]: 0.00304961, [1] [Cycle 1]: 7.01e-05, [2] [tag_attr]: 2.016e-05 [meta_addattr_fg_expand]: 7.11999e-06 [parallel-infer-symbol]: 3.13e-06 [pre_auto_parallel]: 3.311e-05 [insert-virtual-dataset]: 2.24001e-06 [parallel-infer-symbol-second]: 7.10017e-07 [dataset_repeat_opt]: 2.11e-06 [pipeline_split]: 1.66998e-06 [optimize]: 0.00532141, [53] [py_interpret_to_execute]: 3.315e-05 [rewriter_before_opt_a]: 9.33e-05 [opt_a]: 0.00309679, [2] [Cycle 1]: 0.00222166, [45] [expand_dump_flag]: 2.98998e-06 [switch_simplify]: 4.539e-05 [loop_unroll]: 3.564e-05 [a_1]: 0.00071925 [with_stream_mark]: 1.433e-05 [recompute_prepare]: 9.12999e-06 [updatestate_depend_eliminate]: 3.87002e-06 [updatestate_assign_eliminate]: 3.75e-06 [updatestate_loads_eliminate]: 2.84999e-06 [parameter_eliminate]: 1.86e-06 [a_2]: 0.00012003 [accelerated_algorithm]: 7.76001e-06 [shard]: 1.94e-06 [meta_shard_fg_expand]: 1.92001e-06 [shard_inline]: 7.55998e-06 [merge_send_recv]: 8.30999e-06 [auto_parallel]: 6.02001e-06 [parallel]: 1.739e-05 [flash_sp]: 6.99001e-06 [merge_comm]: 4.02e-06 [allreduce_fusion]: 3.51001e-06 [matmul_add_comm_reduction]: 9.42001e-06 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 9.03002e-06 [virtual_dataset]: 7.68999e-06 [get_grad_eliminate_]: 6.93e-06 [virtual_output]: 7.08e-06 [merge_forward]: 3.91001e-06 [cell_reuse_recompute_pass]: 1.20999e-06 [offload_activation]: 1.049e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.53e-05 [merge_recompute_call_nodes]: 1.50001e-06 [before_grad]: 1.124e-05 [set_forward_comm_id_for_comm_node_pass]: 3.63999e-06 [meta_fg_expand]: 2.88e-06 [flash_sp_send_recv_attached]: 2.60002e-06 [receive_attached]: 2.02001e-06 [after_resolve]: 1.361e-05 [a_after_grad]: 1.136e-05 [renormalize]: 0.00057948 [add_forward_monad_depend]: 4.97e-06 [auto_monad_grad]: 1.92001e-06 [auto_monad_eliminator]: 1.444e-05 [cse]: 3.246e-05 [a_3]: 6.574e-05 [Cycle 2]: 0.0008622, [45] [expand_dump_flag]: 1.05999e-06 [switch_simplify]: 8.27998e-06 [loop_unroll]: 7.35e-06 [a_1]: 0.00016504 [with_stream_mark]: 1.046e-05 [recompute_prepare]: 7.15003e-06 [updatestate_depend_eliminate]: 3.08e-06 [updatestate_assign_eliminate]: 2.38998e-06 [updatestate_loads_eliminate]: 2.19999e-06 [parameter_eliminate]: 1.20001e-06 [a_2]: 0.00010841 [accelerated_algorithm]: 6.82002e-06 [shard]: 1.12e-06 [meta_shard_fg_expand]: 1.25999e-06 [shard_inline]: 6.71e-06 [merge_send_recv]: 4.89998e-06 [auto_parallel]: 5.29e-06 [parallel]: 4.38999e-06 [flash_sp]: 3.4e-06 [merge_comm]: 3.33e-06 [allreduce_fusion]: 3.46999e-06 [matmul_add_comm_reduction]: 5.20001e-06 [allreduce_slice_to_reducescatter]: 3.7998e-07 [virtual_shard_identity]: 7.8e-06 [virtual_dataset]: 6.74999e-06 [get_grad_eliminate_]: 6.53e-06 [virtual_output]: 6.31e-06 [merge_forward]: 2.59001e-06 [cell_reuse_recompute_pass]: 1.55999e-06 [offload_activation]: 6.09999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.522e-05 [merge_recompute_call_nodes]: 7.00005e-07 [before_grad]: 9.99001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.4e-06 [meta_fg_expand]: 2.21e-06 [flash_sp_send_recv_attached]: 8.29983e-07 [receive_attached]: 9.89996e-07 [after_resolve]: 1.266e-05 [a_after_grad]: 1.046e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.19e-06 [auto_monad_grad]: 8.89995e-07 [auto_monad_eliminator]: 7.09001e-06 [cse]: 1.435e-05 [a_3]: 5.459e-05 [py_interpret_to_execute_after_opt_a]: 1.195e-05 [slice_cell_reuse_recomputed_activation]: 4.37e-06 [rewriter_after_opt_a]: 3.732e-05 [convert_after_rewriter]: 9.35001e-06 [order_py_execute_after_rewriter]: 7.71001e-06 [mutable_eliminate]: 0.00046628 [opt_b]: 0.00029289, [1] [Cycle 1]: 0.00028448, [7] [b_1]: 0.00019002 [b_2]: 8.27e-06 [updatestate_depend_eliminate]: 5.00001e-06 [updatestate_assign_eliminate]: 2.64001e-06 [updatestate_loads_eliminate]: 2.61999e-06 [renormalize]: 3.80009e-07 [cse]: 1.861e-05 [optimize_parallel_all_gather_comm]: 1.877e-05 [overlap_param_gather]: 4.63001e-06 [cconv]: 2.585e-05 [loop_unroll]: 0.00042485 [opt_after_cconv]: 0.00012804, [1] [Cycle 1]: 0.00012002, [7] [c_1]: 3.592e-05 [parameter_eliminate]: 2.31e-06 [updatestate_depend_eliminate]: 5.14e-06 [updatestate_assign_eliminate]: 2.56e-06 [updatestate_loads_eliminate]: 2.25002e-06 [cse]: 1.761e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 1.679e-05 [tuple_transform]: 9.618e-05, [1] [Cycle 1]: 8.916e-05, [4] [d_1]: 4.886e-05 [none_parameter_eliminate]: 1.74e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 7.7e-06 [partial_unused_args_eliminate]: 4.62998e-06 [add_recomputation]: 4.787e-05 [cse_after_recomputation]: 2.806e-05, [1] [Cycle 1]: 2.142e-05, [1] [cse]: 1.211e-05 [environ_conv]: 7.53999e-06 [swap_dp_allreduce_reducescatter]: 8.01001e-06 [bias_add_comm_swap]: 4.87e-06 [label_micro_interleaved_index]: 6.53e-06 [label_fine_grained_interleaved_index]: 5.48002e-06 [merge_cast_opt]: 3.6e-06 [slice_recompute_activation]: 4.18001e-06 [micro_interleaved_order_control]: 4.99998e-06 [assign_add_opt]: 3.33998e-06 [ForceFp32Comm]: 2.98998e-06 [remove_cast_before_assign_add]: 3.18e-06 [full_micro_interleaved_order_control]: 4.25999e-06 [reorder_send_recv_between_fp_bp]: 5.33002e-06 [comm_op_add_attrs]: 3.18998e-06 [add_comm_op_reuse_tag]: 3.13e-06 [interleave_split_concat_branches]: 3.48e-06 [interleave_parallel_branches]: 3.45e-06 [overlap_opt_shard_in_pipeline]: 3.28998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.30999e-06 [control_data_broadcast_order]: 1.455e-05 [grouped_pairwise_exchange_alltoall]: 3.91999e-06 [offloading_packed_experts]: 6.26998e-06 [overlap_recompute_and_grad_model_parallel]: 6.89999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.4e-06 [overlap_recompute_allgather_and_fa_grad]: 4.07e-06 [overlap_recompute_comm]: 5.18002e-06 [overlap_grad_ring_attention]: 6.12001e-06 [overlap_grad_flash_sp]: 1.991e-05 [begin_end_overlap_inline]: 2.79001e-06 [split_matmul_comm_elemetwise]: 4.34002e-06 [split_layernorm_comm]: 3.98999e-06 [handle_group_info]: 3.38e-06 [symbol_engine_optimizer]: 9.567e-05, [1] [Cycle 1]: 8.928e-05, [6] [build]: 2.76999e-06 [elim_shapecalc]: 1.01e-05 [elim_not_effective]: 1.323e-05 [opt_reshape]: 7.63999e-06 [fold_const_symbol]: 1.091e-05 [renormalize]: 2.00002e-07 [detach_backward]: 3.09999e-06 [pipeline_parallel_scheduler]: 1.74998e-06 [auto_monad_reorder]: 1.954e-05 [get_jit_bprop_graph]: 1.15001e-06 [rewriter_after_jit_bprop_graph]: 3.96001e-06 [opt_after_jit_grad]: 0.00049008 [validate]: 3.432e-05 Sums bootstrap : 0.000425s : 3.18% type_inference : 0.008061s : 60.31% event_method : 0.000021s : 0.16% auto_monad : 0.000062s : 0.46% graph_reusing : 0.000006s : 0.04% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000033s : 0.25% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.25% optimize.rewriter_before_opt_a : 0.000093s : 0.70% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000054s : 0.40% optimize.opt_a.loop_unroll : 0.000043s : 0.32% optimize.opt_a.a_1 : 0.000884s : 6.62% optimize.opt_a.with_stream_mark : 0.000025s : 0.19% optimize.opt_a.recompute_prepare : 0.000016s : 0.12% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000228s : 1.71% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.11% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.02% optimize.opt_a.shard_inline : 0.000014s : 0.11% optimize.opt_a.merge_send_recv : 0.000013s : 0.10% optimize.opt_a.auto_parallel : 0.000011s : 0.08% optimize.opt_a.parallel : 0.000022s : 0.16% optimize.opt_a.flash_sp : 0.000010s : 0.08% optimize.opt_a.merge_comm : 0.000007s : 0.05% optimize.opt_a.allreduce_fusion : 0.000007s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.11% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.13% optimize.opt_a.virtual_dataset : 0.000014s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.10% optimize.opt_a.virtual_output : 0.000013s : 0.10% optimize.opt_a.merge_forward : 0.000007s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.12% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.23% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.16% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.05% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.02% optimize.opt_a.after_resolve : 0.000026s : 0.20% optimize.opt_a.a_after_grad : 0.000022s : 0.16% optimize.opt_a.renormalize : 0.000580s : 4.34% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.05% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.16% optimize.opt_a.cse : 0.000047s : 0.35% optimize.opt_a.a_3 : 0.000120s : 0.90% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.03% optimize.rewriter_after_opt_a : 0.000037s : 0.28% optimize.convert_after_rewriter : 0.000009s : 0.07% optimize.order_py_execute_after_rewriter : 0.000008s : 0.06% optimize.mutable_eliminate : 0.000466s : 3.49% optimize.opt_b.b_1 : 0.000190s : 1.42% optimize.opt_b.b_2 : 0.000008s : 0.06% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000019s : 0.14% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.14% optimize.overlap_param_gather : 0.000005s : 0.03% optimize.cconv : 0.000026s : 0.19% optimize.loop_unroll : 0.000425s : 3.18% optimize.opt_after_cconv.c_1 : 0.000036s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.13% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.13% optimize.tuple_transform.d_1 : 0.000049s : 0.37% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.03% optimize.add_recomputation : 0.000048s : 0.36% optimize.cse_after_recomputation.cse : 0.000012s : 0.09% optimize.environ_conv : 0.000008s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.06% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000003s : 0.02% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000003s : 0.02% optimize.full_micro_interleaved_order_control : 0.000004s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.02% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000015s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000006s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000006s : 0.05% optimize.overlap_grad_flash_sp : 0.000020s : 0.15% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.03% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.10% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.06% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.15% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000490s : 3.67% validate : 0.000034s : 0.26% Time group info: ------[substitution.] 0.000187 34 0.92% : 0.000002s : 2: substitution.elim_not_effective 0.63% : 0.000001s : 2: substitution.fold_const_symbol 3.12% : 0.000006s : 5: substitution.graph_param_transform 75.82% : 0.000142s : 4: substitution.inline 1.80% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.36% : 0.000004s : 4: substitution.remove_not_recompute_node 2.95% : 0.000006s : 6: substitution.replace_old_param 9.59% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.80% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.008010 2 90.28% : 0.007232s : 1: type_inference.infer 9.72% : 0.000778s : 1: type_inference.specialize ------[replace.] 0.000069 10 55.20% : 0.000038s : 4: replace.inline 44.80% : 0.000031s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000155 10 90.12% : 0.000139s : 4: match.inline 9.88% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000230 1590 0.90% : 0.000002s : 16: predicate.accumulaten_eliminater 0.77% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 10: predicate.addn_check_dump 1.08% : 0.000002s : 16: predicate.addn_zero_filter 0.83% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 1.97% : 0.000005s : 26: predicate.arithmetic_simplify 0.97% : 0.000002s : 16: predicate.cast_eliminate 0.63% : 0.000001s : 10: predicate.check_bprop_eliminate 0.56% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.60% : 0.000001s : 10: predicate.depend_value_elim 0.95% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 16: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.88% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 5: predicate.elim_not_effective 0.40% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.10% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.24% : 0.000003s : 21: predicate.environ_get_depend_swap 1.68% : 0.000004s : 31: predicate.environ_get_eliminate 1.12% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.55% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.51% : 0.000006s : 26: predicate.float_depend_g_call 0.54% : 0.000001s : 10: predicate.float_environ_get_switch 0.78% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.66% : 0.000002s : 10: predicate.get_grad_eliminate 0.22% : 0.000000s : 5: predicate.graph_param_transform 0.60% : 0.000001s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.13% : 0.000014s : 72: predicate.inline 0.70% : 0.000002s : 10: predicate.inline_without_move 0.37% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.71% : 0.000002s : 10: predicate.less_batch_normalization 1.95% : 0.000004s : 32: predicate.list_to_tuple_eliminator_ 2.66% : 0.000006s : 48: predicate.load_eliminater 0.75% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.41% : 0.000006s : 40: predicate.loop_unroll_before_grad 1.63% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 10: predicate.merge_addn 0.62% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 16: predicate.minmaximum_grad 0.85% : 0.000002s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.39% : 0.000001s : 5: predicate.parallel_virtual_node 1.89% : 0.000004s : 26: predicate.partial_defer_inline 1.84% : 0.000004s : 27: predicate.partial_eliminate 0.92% : 0.000002s : 16: predicate.print_const_string_wrapper 0.55% : 0.000001s : 10: predicate.reduce_all_const_elim 1.13% : 0.000003s : 16: predicate.reduce_eliminate 2.61% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 10: predicate.remove_not_recompute_node 1.53% : 0.000004s : 32: predicate.replace_applicator 0.54% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 0.94% : 0.000002s : 16: predicate.reshape_eliminate 0.62% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 5: predicate.row_tensor_eliminate 0.70% : 0.000002s : 10: predicate.same_eliminate 0.43% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.71% : 0.000002s : 10: predicate.shard_identity_eliminate 0.66% : 0.000002s : 10: predicate.special_op_eliminate 0.65% : 0.000002s : 10: predicate.specialize_transform 0.81% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.70% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.67% : 0.000004s : 26: predicate.switch_defer_inline 2.15% : 0.000005s : 36: predicate.switch_layer_defer_inline 5.23% : 0.000012s : 81: predicate.switch_simplify 0.86% : 0.000002s : 16: predicate.tile_eliminate 0.88% : 0.000002s : 16: predicate.transpose_eliminate 1.48% : 0.000003s : 26: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000003s : 26: predicate.tuple_list_get_item_depend_reorder 3.45% : 0.000008s : 42: predicate.tuple_list_get_item_eliminator 1.50% : 0.000003s : 26: predicate.tuple_list_get_set_item_eliminator 2.14% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.84% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.61% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.28% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.62% : 0.000001s : 5: predicate.value_based_eliminate 0.61% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 10: predicate.virtual_output_eliminate 0.32% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000592 11 54.02% : 0.000320s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.98% : 0.000272s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028599 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.72% : 0.003066s : 1: add_attr 10.68% : 0.003053s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.18% : 0.000052s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000071s : 1: auto_monad 0.09% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000005s : 1: begin_end_overlap_inline 0.03% : 0.000007s : 1: bias_add_comm_swap 1.62% : 0.000464s : 1: bootstrap 0.10% : 0.000029s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.06% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.11% : 0.000031s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000018s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.11% : 0.000031s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.51% : 0.000431s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.65% : 0.000472s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.05% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000014s : 1: opt.transform.mutable_eliminate 4.76% : 0.001362s : 78: opt.transform.opt_a 0.12% : 0.000035s : 1: opt.transform.opt_after_cconv 0.10% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.44% : 0.000126s : 28: opt.transform.opt_b 0.19% : 0.000054s : 2: opt.transform.opt_trans_graph 0.13% : 0.000039s : 4: opt.transform.symbol_engine_opt 10.84% : 0.003100s : 1: opt_a 0.46% : 0.000132s : 1: opt_after_cconv 1.75% : 0.000501s : 1: opt_after_jit_grad 1.04% : 0.000296s : 1: opt_b 19.69% : 0.005632s : 1: optimize 0.08% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.08% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000040s : 1: pre_auto_parallel 0.13% : 0.000037s : 1: py_interpret_to_execute 0.05% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.05% : 0.000299s : 1: renormalize.infer 0.96% : 0.000273s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000041s : 1: rewriter_after_opt_a 0.34% : 0.000097s : 1: rewriter_before_opt_a 0.02% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000099s : 1: symbol_engine_optimizer 0.35% : 0.000099s : 1: tuple_transform 28.29% : 0.008092s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:06.731.273 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0146832, [21] [bootstrap]: 0.00043033 [type_inference]: 0.0057989 [event_method]: 1.938e-05 [auto_monad]: 6.058e-05 [graph_reusing]: 5.65001e-06 [inline]: 2.32999e-06 [add_attr]: 0.00298853, [1] [add_attr_with_inline]: 0.0029801, [1] [Cycle 1]: 5.289e-05, [2] [tag_attr]: 1.92e-05 [meta_addattr_fg_expand]: 6.48e-06 [parallel-infer-symbol]: 3.39001e-06 [pre_auto_parallel]: 3.264e-05 [insert-virtual-dataset]: 2.19999e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 2.19999e-06 [pipeline_split]: 1.64998e-06 [optimize]: 0.00468407, [53] [py_interpret_to_execute]: 2.77e-05 [rewriter_before_opt_a]: 8.632e-05 [opt_a]: 0.00270236, [2] [Cycle 1]: 0.00200976, [45] [expand_dump_flag]: 2.99999e-06 [switch_simplify]: 4.609e-05 [loop_unroll]: 3.494e-05 [a_1]: 0.00071595 [with_stream_mark]: 1.41e-05 [recompute_prepare]: 8.75001e-06 [updatestate_depend_eliminate]: 3.51001e-06 [updatestate_assign_eliminate]: 2.98e-06 [updatestate_loads_eliminate]: 3.09001e-06 [parameter_eliminate]: 2.42001e-06 [a_2]: 9.054e-05 [accelerated_algorithm]: 7.62002e-06 [shard]: 1.94e-06 [meta_shard_fg_expand]: 1.77001e-06 [shard_inline]: 7.21999e-06 [merge_send_recv]: 7.96001e-06 [auto_parallel]: 5.89999e-06 [parallel]: 1.68e-05 [flash_sp]: 7.79002e-06 [merge_comm]: 3.8e-06 [allreduce_fusion]: 3.63e-06 [matmul_add_comm_reduction]: 8.99e-06 [allreduce_slice_to_reducescatter]: 8.29983e-07 [virtual_shard_identity]: 8.36002e-06 [virtual_dataset]: 7.15e-06 [get_grad_eliminate_]: 6.80998e-06 [virtual_output]: 7.09001e-06 [merge_forward]: 3.83999e-06 [cell_reuse_recompute_pass]: 1.22e-06 [offload_activation]: 8.99e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.2e-05 [merge_recompute_call_nodes]: 1.40001e-06 [before_grad]: 1.217e-05 [set_forward_comm_id_for_comm_node_pass]: 3.53999e-06 [meta_fg_expand]: 2.81e-06 [flash_sp_send_recv_attached]: 2.48998e-06 [receive_attached]: 2.29001e-06 [after_resolve]: 1.348e-05 [a_after_grad]: 1.125e-05 [renormalize]: 0.00056427 [add_forward_monad_depend]: 5.04998e-06 [auto_monad_grad]: 2.14999e-06 [auto_monad_eliminator]: 1.497e-05 [cse]: 3.228e-05 [a_3]: 5.078e-05 [Cycle 2]: 0.00068333, [45] [expand_dump_flag]: 9.89996e-07 [switch_simplify]: 8.00999e-06 [loop_unroll]: 7.33999e-06 [a_1]: 0.00015932 [with_stream_mark]: 1.074e-05 [recompute_prepare]: 7.03e-06 [updatestate_depend_eliminate]: 2.91999e-06 [updatestate_assign_eliminate]: 2.41e-06 [updatestate_loads_eliminate]: 2.21e-06 [parameter_eliminate]: 1.07998e-06 [a_2]: 8.064e-05 [accelerated_algorithm]: 6.83e-06 [shard]: 1.09998e-06 [meta_shard_fg_expand]: 1.22e-06 [shard_inline]: 6.94999e-06 [merge_send_recv]: 4.71002e-06 [auto_parallel]: 5.32999e-06 [parallel]: 4.52e-06 [flash_sp]: 3.36001e-06 [merge_comm]: 3.31001e-06 [allreduce_fusion]: 2.99001e-06 [matmul_add_comm_reduction]: 5.05999e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 7.55e-06 [virtual_dataset]: 6.69001e-06 [get_grad_eliminate_]: 6.31998e-06 [virtual_output]: 6.33002e-06 [merge_forward]: 2.71999e-06 [cell_reuse_recompute_pass]: 1.56002e-06 [offload_activation]: 6.38e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.292e-05 [merge_recompute_call_nodes]: 7.09988e-07 [before_grad]: 9.51e-06 [set_forward_comm_id_for_comm_node_pass]: 3.36999e-06 [meta_fg_expand]: 1.99e-06 [flash_sp_send_recv_attached]: 9.00007e-07 [receive_attached]: 1.05999e-06 [after_resolve]: 1.179e-05 [a_after_grad]: 1.08e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.10999e-06 [auto_monad_grad]: 8.70001e-07 [auto_monad_eliminator]: 6.36e-06 [cse]: 1.483e-05 [a_3]: 4.03e-05 [py_interpret_to_execute_after_opt_a]: 9.33002e-06 [slice_cell_reuse_recomputed_activation]: 1.83002e-06 [rewriter_after_opt_a]: 3.263e-05 [convert_after_rewriter]: 7.29001e-06 [order_py_execute_after_rewriter]: 5.17999e-06 [mutable_eliminate]: 0.00044673 [opt_b]: 0.00024432, [1] [Cycle 1]: 0.00023793, [7] [b_1]: 0.00016282 [b_2]: 9.09e-06 [updatestate_depend_eliminate]: 5.06997e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.63e-06 [renormalize]: 6.30011e-07 [cse]: 1.816e-05 [optimize_parallel_all_gather_comm]: 1.556e-05 [overlap_param_gather]: 1.86998e-06 [cconv]: 2.361e-05 [loop_unroll]: 0.00042038 [opt_after_cconv]: 0.00011284, [1] [Cycle 1]: 0.00010699, [7] [c_1]: 3.546e-05 [parameter_eliminate]: 2.37001e-06 [updatestate_depend_eliminate]: 5.04e-06 [updatestate_assign_eliminate]: 2.38002e-06 [updatestate_loads_eliminate]: 2.14e-06 [cse]: 2.465e-05 [renormalize]: 3.9002e-07 [remove_dup_value]: 1.332e-05 [tuple_transform]: 8.224e-05, [1] [Cycle 1]: 7.736e-05, [4] [d_1]: 4.956e-05 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 7.8e-06 [partial_unused_args_eliminate]: 1.75001e-06 [add_recomputation]: 4.387e-05 [cse_after_recomputation]: 2.202e-05, [1] [Cycle 1]: 1.786e-05, [1] [cse]: 1.235e-05 [environ_conv]: 5.51998e-06 [swap_dp_allreduce_reducescatter]: 5.25999e-06 [bias_add_comm_swap]: 2.19999e-06 [label_micro_interleaved_index]: 4.24002e-06 [label_fine_grained_interleaved_index]: 2.49001e-06 [merge_cast_opt]: 1.28002e-06 [slice_recompute_activation]: 2.01e-06 [micro_interleaved_order_control]: 2.07001e-06 [assign_add_opt]: 1.30999e-06 [ForceFp32Comm]: 9.09989e-07 [remove_cast_before_assign_add]: 9.79984e-07 [full_micro_interleaved_order_control]: 2.12999e-06 [reorder_send_recv_between_fp_bp]: 2.53998e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 9.20001e-07 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.14998e-06 [overlap_opt_shard_in_pipeline]: 1.19998e-06 [overlap_opt_shard_grad_in_pipeline]: 2.04999e-06 [control_data_broadcast_order]: 1.226e-05 [grouped_pairwise_exchange_alltoall]: 1.53002e-06 [offloading_packed_experts]: 3.86001e-06 [overlap_recompute_and_grad_model_parallel]: 4.70001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.36002e-06 [overlap_recompute_comm]: 2.48998e-06 [overlap_grad_ring_attention]: 4.32e-06 [overlap_grad_flash_sp]: 1.701e-05 [begin_end_overlap_inline]: 5.59987e-07 [split_matmul_comm_elemetwise]: 2.24001e-06 [split_layernorm_comm]: 1.89e-06 [handle_group_info]: 9.30013e-07 [symbol_engine_optimizer]: 7.63e-05, [1] [Cycle 1]: 7.212e-05, [6] [build]: 2.63998e-06 [elim_shapecalc]: 9.88998e-06 [elim_not_effective]: 1.277e-05 [opt_reshape]: 8.07003e-06 [fold_const_symbol]: 1.037e-05 [renormalize]: 2.50002e-07 [detach_backward]: 1.84998e-06 [pipeline_parallel_scheduler]: 1.53002e-06 [auto_monad_reorder]: 1.74e-05 [get_jit_bprop_graph]: 1.30999e-06 [rewriter_after_jit_bprop_graph]: 3.43e-06 [opt_after_jit_grad]: 0.00045385 [validate]: 3.324e-05 Sums bootstrap : 0.000430s : 3.99% type_inference : 0.005799s : 53.81% event_method : 0.000019s : 0.18% auto_monad : 0.000061s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000028s : 0.26% optimize.rewriter_before_opt_a : 0.000086s : 0.80% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000054s : 0.50% optimize.opt_a.loop_unroll : 0.000042s : 0.39% optimize.opt_a.a_1 : 0.000875s : 8.12% optimize.opt_a.with_stream_mark : 0.000025s : 0.23% optimize.opt_a.recompute_prepare : 0.000016s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000171s : 1.59% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.13% optimize.opt_a.merge_send_recv : 0.000013s : 0.12% optimize.opt_a.auto_parallel : 0.000011s : 0.10% optimize.opt_a.parallel : 0.000021s : 0.20% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.15% optimize.opt_a.virtual_dataset : 0.000014s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.12% optimize.opt_a.virtual_output : 0.000013s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000015s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.23% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.23% optimize.opt_a.a_after_grad : 0.000022s : 0.20% optimize.opt_a.renormalize : 0.000564s : 5.24% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.20% optimize.opt_a.cse : 0.000047s : 0.44% optimize.opt_a.a_3 : 0.000091s : 0.85% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000033s : 0.30% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000447s : 4.15% optimize.opt_b.b_1 : 0.000163s : 1.51% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000018s : 0.17% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.14% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.22% optimize.loop_unroll : 0.000420s : 3.90% optimize.opt_after_cconv.c_1 : 0.000035s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000025s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.12% optimize.tuple_transform.d_1 : 0.000050s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000044s : 0.41% optimize.cse_after_recomputation.cse : 0.000012s : 0.11% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000017s : 0.16% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000017s : 0.16% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.03% opt_after_jit_grad : 0.000454s : 4.21% validate : 0.000033s : 0.31% Time group info: ------[substitution.] 0.000188 34 0.98% : 0.000002s : 2: substitution.elim_not_effective 0.68% : 0.000001s : 2: substitution.fold_const_symbol 3.04% : 0.000006s : 5: substitution.graph_param_transform 75.10% : 0.000142s : 4: substitution.inline 2.00% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.40% : 0.000005s : 4: substitution.remove_not_recompute_node 2.81% : 0.000005s : 6: substitution.replace_old_param 9.82% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 3.16% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005739 2 87.11% : 0.005000s : 1: type_inference.infer 12.89% : 0.000740s : 1: type_inference.specialize ------[replace.] 0.000069 10 54.20% : 0.000037s : 4: replace.inline 45.80% : 0.000032s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000155 10 89.86% : 0.000139s : 4: match.inline 10.14% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000227 1590 0.93% : 0.000002s : 16: predicate.accumulaten_eliminater 0.68% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 10: predicate.addn_check_dump 0.95% : 0.000002s : 16: predicate.addn_zero_filter 0.84% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.02% : 0.000005s : 26: predicate.arithmetic_simplify 0.93% : 0.000002s : 16: predicate.cast_eliminate 0.59% : 0.000001s : 10: predicate.check_bprop_eliminate 0.49% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.62% : 0.000001s : 10: predicate.depend_value_elim 0.98% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.03% : 0.000002s : 16: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 5: predicate.elim_not_effective 0.36% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 21: predicate.environ_get_depend_swap 1.68% : 0.000004s : 31: predicate.environ_get_eliminate 1.13% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.57% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.54% : 0.000006s : 26: predicate.float_depend_g_call 0.51% : 0.000001s : 10: predicate.float_environ_get_switch 0.79% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.64% : 0.000001s : 10: predicate.get_grad_eliminate 0.25% : 0.000001s : 5: predicate.graph_param_transform 0.60% : 0.000001s : 10: predicate.incorporate_call 0.46% : 0.000001s : 10: predicate.incorporate_call_switch 6.29% : 0.000014s : 72: predicate.inline 0.75% : 0.000002s : 10: predicate.inline_without_move 0.37% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.75% : 0.000002s : 10: predicate.less_batch_normalization 1.96% : 0.000004s : 32: predicate.list_to_tuple_eliminator_ 2.69% : 0.000006s : 48: predicate.load_eliminater 0.77% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.48% : 0.000006s : 40: predicate.loop_unroll_before_grad 1.57% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 10: predicate.merge_addn 0.56% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 16: predicate.minmaximum_grad 0.88% : 0.000002s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.43% : 0.000001s : 5: predicate.parallel_virtual_node 1.85% : 0.000004s : 26: predicate.partial_defer_inline 1.89% : 0.000004s : 27: predicate.partial_eliminate 0.92% : 0.000002s : 16: predicate.print_const_string_wrapper 0.56% : 0.000001s : 10: predicate.reduce_all_const_elim 1.16% : 0.000003s : 16: predicate.reduce_eliminate 2.66% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 10: predicate.remove_not_recompute_node 1.54% : 0.000003s : 32: predicate.replace_applicator 0.44% : 0.000001s : 10: predicate.replace_old_param 0.28% : 0.000001s : 5: predicate.reset_defer_inline 0.97% : 0.000002s : 16: predicate.reshape_eliminate 0.72% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 5: predicate.row_tensor_eliminate 0.67% : 0.000002s : 10: predicate.same_eliminate 0.44% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 10: predicate.shard_identity_eliminate 0.66% : 0.000001s : 10: predicate.special_op_eliminate 0.69% : 0.000002s : 10: predicate.specialize_transform 0.75% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.61% : 0.000004s : 26: predicate.switch_defer_inline 2.17% : 0.000005s : 36: predicate.switch_layer_defer_inline 5.21% : 0.000012s : 81: predicate.switch_simplify 0.99% : 0.000002s : 16: predicate.tile_eliminate 0.94% : 0.000002s : 16: predicate.transpose_eliminate 1.48% : 0.000003s : 26: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000003s : 26: predicate.tuple_list_get_item_depend_reorder 3.35% : 0.000008s : 42: predicate.tuple_list_get_item_eliminator 1.53% : 0.000003s : 26: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.83% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.56% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.18% : 0.000007s : 58: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 5: predicate.value_based_eliminate 0.63% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 10: predicate.virtual_output_eliminate 0.31% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000550 11 54.92% : 0.000302s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.08% : 0.000248s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024467 192 0.02% : 0.000004s : 1: ForceFp32Comm 12.23% : 0.002993s : 1: add_attr 12.20% : 0.002984s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.20% : 0.000048s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000066s : 1: auto_monad 0.09% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.87% : 0.000458s : 1: bootstrap 0.11% : 0.000027s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000015s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.10% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000005s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000005s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.75% : 0.000428s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.86% : 0.000454s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 5.49% : 0.001343s : 78: opt.transform.opt_a 0.14% : 0.000034s : 1: opt.transform.opt_after_cconv 0.11% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.57% : 0.000139s : 28: opt.transform.opt_b 0.22% : 0.000055s : 2: opt.transform.opt_trans_graph 0.15% : 0.000038s : 4: opt.transform.symbol_engine_opt 11.06% : 0.002705s : 1: opt_a 0.47% : 0.000116s : 1: opt_after_cconv 1.89% : 0.000462s : 1: opt_after_jit_grad 1.01% : 0.000247s : 1: opt_b 19.16% : 0.004688s : 1: optimize 0.08% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.08% : 0.000020s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000037s : 1: pre_auto_parallel 0.13% : 0.000031s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000017s : 1: remove_dup_value 1.22% : 0.000298s : 1: renormalize.infer 1.06% : 0.000259s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000037s : 1: rewriter_after_opt_a 0.37% : 0.000091s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000079s : 1: symbol_engine_optimizer 0.35% : 0.000085s : 1: tuple_transform 23.75% : 0.005811s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:06.921.993 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:06.922.232 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0167132, [21] [bootstrap]: 0.00043823 [type_inference]: 0.00593156 [event_method]: 2.175e-05 [auto_monad]: 6.411e-05 [graph_reusing]: 5.90002e-06 [inline]: 2.12999e-06 [add_attr]: 0.00302848, [1] [add_attr_with_inline]: 0.00302044, [1] [Cycle 1]: 7.027e-05, [2] [tag_attr]: 2.029e-05 [meta_addattr_fg_expand]: 6.54999e-06 [parallel-infer-symbol]: 3.13998e-06 [pre_auto_parallel]: 3.351e-05 [insert-virtual-dataset]: 2.63e-06 [parallel-infer-symbol-second]: 7.00005e-07 [dataset_repeat_opt]: 2.01e-06 [pipeline_split]: 1.50999e-06 [optimize]: 0.00596585, [53] [py_interpret_to_execute]: 3.312e-05 [rewriter_before_opt_a]: 9.658e-05 [opt_a]: 0.00353939, [2] [Cycle 1]: 0.00253565, [45] [expand_dump_flag]: 2.94001e-06 [switch_simplify]: 4.717e-05 [loop_unroll]: 3.563e-05 [a_1]: 0.00083294 [with_stream_mark]: 1.631e-05 [recompute_prepare]: 1.16e-05 [updatestate_depend_eliminate]: 4.75001e-06 [updatestate_assign_eliminate]: 4.23999e-06 [updatestate_loads_eliminate]: 3.73001e-06 [parameter_eliminate]: 2.16e-06 [a_2]: 0.00013863 [accelerated_algorithm]: 9.10001e-06 [shard]: 2.21e-06 [meta_shard_fg_expand]: 2.11e-06 [shard_inline]: 8.45999e-06 [merge_send_recv]: 9.47001e-06 [auto_parallel]: 7.97003e-06 [parallel]: 1.895e-05 [flash_sp]: 8.62e-06 [merge_comm]: 4.82e-06 [allreduce_fusion]: 4.47998e-06 [matmul_add_comm_reduction]: 1.009e-05 [allreduce_slice_to_reducescatter]: 6.90023e-07 [virtual_shard_identity]: 1.039e-05 [virtual_dataset]: 8.56002e-06 [get_grad_eliminate_]: 8.42998e-06 [virtual_output]: 8.61002e-06 [merge_forward]: 5.56e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 1.055e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.841e-05 [merge_recompute_call_nodes]: 1.81e-06 [before_grad]: 1.446e-05 [set_forward_comm_id_for_comm_node_pass]: 4.58999e-06 [meta_fg_expand]: 3.48e-06 [flash_sp_send_recv_attached]: 2.89001e-06 [receive_attached]: 2.27999e-06 [after_resolve]: 1.536e-05 [a_after_grad]: 1.371e-05 [renormalize]: 0.00069422 [add_forward_monad_depend]: 5.59e-06 [auto_monad_grad]: 2.45002e-06 [auto_monad_eliminator]: 1.676e-05 [cse]: 3.492e-05 [a_3]: 7.65e-05 [Cycle 2]: 0.00099009, [45] [expand_dump_flag]: 1.18001e-06 [switch_simplify]: 1.02e-05 [loop_unroll]: 8.72998e-06 [a_1]: 0.00020417 [with_stream_mark]: 1.294e-05 [recompute_prepare]: 8.92e-06 [updatestate_depend_eliminate]: 4.07003e-06 [updatestate_assign_eliminate]: 3.33998e-06 [updatestate_loads_eliminate]: 3.64002e-06 [parameter_eliminate]: 1.09e-06 [a_2]: 0.00012852 [accelerated_algorithm]: 8.55001e-06 [shard]: 1.57999e-06 [meta_shard_fg_expand]: 2.23002e-06 [shard_inline]: 8.53001e-06 [merge_send_recv]: 6.19001e-06 [auto_parallel]: 6.64001e-06 [parallel]: 5.40001e-06 [flash_sp]: 3.77998e-06 [merge_comm]: 4.17e-06 [allreduce_fusion]: 4.07e-06 [matmul_add_comm_reduction]: 7.63001e-06 [allreduce_slice_to_reducescatter]: 5.60016e-07 [virtual_shard_identity]: 9.19998e-06 [virtual_dataset]: 7.97998e-06 [get_grad_eliminate_]: 8.60001e-06 [virtual_output]: 7.65e-06 [merge_forward]: 4.05e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 8.73001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.669e-05 [merge_recompute_call_nodes]: 8.59989e-07 [before_grad]: 1.282e-05 [set_forward_comm_id_for_comm_node_pass]: 4.57e-06 [meta_fg_expand]: 2.92002e-06 [flash_sp_send_recv_attached]: 1.26002e-06 [receive_attached]: 1.02e-06 [after_resolve]: 1.349e-05 [a_after_grad]: 1.244e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.47999e-06 [auto_monad_grad]: 1.06002e-06 [auto_monad_eliminator]: 9.42999e-06 [cse]: 1.967e-05 [a_3]: 6.423e-05 [py_interpret_to_execute_after_opt_a]: 1.509e-05 [slice_cell_reuse_recomputed_activation]: 4.48001e-06 [rewriter_after_opt_a]: 4.395e-05 [convert_after_rewriter]: 1.085e-05 [order_py_execute_after_rewriter]: 9.10001e-06 [mutable_eliminate]: 0.00051158 [opt_b]: 0.00034027, [1] [Cycle 1]: 0.00033161, [7] [b_1]: 0.00022571 [b_2]: 9.49999e-06 [updatestate_depend_eliminate]: 7.05e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 3.51001e-06 [renormalize]: 5.89993e-07 [cse]: 2.393e-05 [optimize_parallel_all_gather_comm]: 2.207e-05 [overlap_param_gather]: 4.44002e-06 [cconv]: 3.004e-05 [loop_unroll]: 0.00043157 [opt_after_cconv]: 0.00014929, [1] [Cycle 1]: 0.00014072, [7] [c_1]: 4.204e-05 [parameter_eliminate]: 3.45e-06 [updatestate_depend_eliminate]: 6.36998e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 3.46999e-06 [cse]: 2.681e-05 [renormalize]: 3.19997e-07 [remove_dup_value]: 1.899e-05 [tuple_transform]: 0.00010474, [1] [Cycle 1]: 9.818e-05, [4] [d_1]: 5.763e-05 [none_parameter_eliminate]: 1.67999e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.60001e-06 [partial_unused_args_eliminate]: 4.2e-06 [add_recomputation]: 5.774e-05 [cse_after_recomputation]: 3.222e-05, [1] [Cycle 1]: 2.528e-05, [1] [cse]: 1.615e-05 [environ_conv]: 9.17999e-06 [swap_dp_allreduce_reducescatter]: 9.09e-06 [bias_add_comm_swap]: 5.29e-06 [label_micro_interleaved_index]: 6.37001e-06 [label_fine_grained_interleaved_index]: 5.30999e-06 [merge_cast_opt]: 3.64002e-06 [slice_recompute_activation]: 4.55001e-06 [micro_interleaved_order_control]: 4.89e-06 [assign_add_opt]: 3.53e-06 [ForceFp32Comm]: 3.03998e-06 [remove_cast_before_assign_add]: 3.30003e-06 [full_micro_interleaved_order_control]: 4.66002e-06 [reorder_send_recv_between_fp_bp]: 5.35001e-06 [comm_op_add_attrs]: 3.46999e-06 [add_comm_op_reuse_tag]: 3.69002e-06 [interleave_split_concat_branches]: 3.85e-06 [interleave_parallel_branches]: 3.36999e-06 [overlap_opt_shard_in_pipeline]: 3.81001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.01001e-06 [control_data_broadcast_order]: 1.743e-05 [grouped_pairwise_exchange_alltoall]: 4.06001e-06 [offloading_packed_experts]: 7.21001e-06 [overlap_recompute_and_grad_model_parallel]: 7.31001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.36999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.83001e-06 [overlap_recompute_comm]: 5.79e-06 [overlap_grad_ring_attention]: 6.57002e-06 [overlap_grad_flash_sp]: 2.449e-05 [begin_end_overlap_inline]: 2.69001e-06 [split_matmul_comm_elemetwise]: 4.45e-06 [split_layernorm_comm]: 4.15999e-06 [handle_group_info]: 3.18e-06 [symbol_engine_optimizer]: 0.00010855, [1] [Cycle 1]: 0.00010176, [6] [build]: 3.48e-06 [elim_shapecalc]: 1.287e-05 [elim_not_effective]: 1.564e-05 [opt_reshape]: 9.59e-06 [fold_const_symbol]: 1.358e-05 [renormalize]: 3.80009e-07 [detach_backward]: 3.49001e-06 [pipeline_parallel_scheduler]: 1.89999e-06 [auto_monad_reorder]: 2.474e-05 [get_jit_bprop_graph]: 1.47999e-06 [rewriter_after_jit_bprop_graph]: 4.47e-06 [opt_after_jit_grad]: 0.00050225 [validate]: 4.389e-05 Sums bootstrap : 0.000438s : 3.68% type_inference : 0.005932s : 49.85% event_method : 0.000022s : 0.18% auto_monad : 0.000064s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.28% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.28% optimize.rewriter_before_opt_a : 0.000097s : 0.81% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000057s : 0.48% optimize.opt_a.loop_unroll : 0.000044s : 0.37% optimize.opt_a.a_1 : 0.001037s : 8.72% optimize.opt_a.with_stream_mark : 0.000029s : 0.25% optimize.opt_a.recompute_prepare : 0.000021s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000267s : 2.25% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.15% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000017s : 0.14% optimize.opt_a.merge_send_recv : 0.000016s : 0.13% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000024s : 0.20% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.16% optimize.opt_a.virtual_dataset : 0.000017s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.14% optimize.opt_a.virtual_output : 0.000016s : 0.14% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.24% optimize.opt_a.a_after_grad : 0.000026s : 0.22% optimize.opt_a.renormalize : 0.000694s : 5.84% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.22% optimize.opt_a.cse : 0.000055s : 0.46% optimize.opt_a.a_3 : 0.000141s : 1.18% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.04% optimize.rewriter_after_opt_a : 0.000044s : 0.37% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000512s : 4.30% optimize.opt_b.b_1 : 0.000226s : 1.90% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.19% optimize.overlap_param_gather : 0.000004s : 0.04% optimize.cconv : 0.000030s : 0.25% optimize.loop_unroll : 0.000432s : 3.63% optimize.opt_after_cconv.c_1 : 0.000042s : 0.35% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000027s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.16% optimize.tuple_transform.d_1 : 0.000058s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000058s : 0.49% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000006s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000017s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000006s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000024s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000025s : 0.21% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000502s : 4.22% validate : 0.000044s : 0.37% Time group info: ------[substitution.] 0.000219 44 9.82% : 0.000022s : 3: substitution.cast_eliminate 1.21% : 0.000003s : 3: substitution.elim_not_effective 0.78% : 0.000002s : 3: substitution.fold_const_symbol 3.14% : 0.000007s : 6: substitution.graph_param_transform 66.52% : 0.000146s : 4: substitution.inline 2.36% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.77% : 0.000006s : 6: substitution.remove_not_recompute_node 2.63% : 0.000006s : 6: substitution.replace_old_param 8.23% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.53% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005880 2 86.48% : 0.005086s : 1: type_inference.infer 13.52% : 0.000795s : 1: type_inference.specialize ------[replace.] 0.000070 10 53.15% : 0.000037s : 4: replace.inline 46.85% : 0.000033s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000158 10 90.43% : 0.000143s : 4: match.inline 9.57% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000286 1908 0.97% : 0.000003s : 20: predicate.accumulaten_eliminater 0.62% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.55% : 0.000002s : 12: predicate.addn_check_dump 1.09% : 0.000003s : 20: predicate.addn_zero_filter 0.88% : 0.000003s : 20: predicate.adjust_all_reduce_mul_add 1.98% : 0.000006s : 32: predicate.arithmetic_simplify 1.06% : 0.000003s : 20: predicate.cast_eliminate 0.61% : 0.000002s : 12: predicate.check_bprop_eliminate 0.55% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.72% : 0.000002s : 12: predicate.depend_value_elim 1.00% : 0.000003s : 20: predicate.dict_get_item_const_eliminator 1.07% : 0.000003s : 20: predicate.dict_get_item_eliminator 0.91% : 0.000003s : 20: predicate.dict_set_item_eliminator 0.87% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 6: predicate.elim_not_effective 0.34% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000003s : 26: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.19% : 0.000003s : 26: predicate.environ_get_depend_swap 1.78% : 0.000005s : 38: predicate.environ_get_eliminate 1.18% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.47% : 0.000004s : 30: predicate.exchange_switch_depend_value 2.29% : 0.000007s : 30: predicate.float_depend_g_call 0.54% : 0.000002s : 12: predicate.float_environ_get_switch 0.89% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 6: predicate.fold_const_symbol 0.79% : 0.000002s : 12: predicate.get_grad_eliminate 0.21% : 0.000001s : 6: predicate.graph_param_transform 0.60% : 0.000002s : 12: predicate.incorporate_call 0.52% : 0.000001s : 12: predicate.incorporate_call_switch 6.27% : 0.000018s : 86: predicate.inline 0.82% : 0.000002s : 12: predicate.inline_without_move 0.39% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.71% : 0.000002s : 12: predicate.less_batch_normalization 1.91% : 0.000005s : 38: predicate.list_to_tuple_eliminator_ 2.68% : 0.000008s : 58: predicate.load_eliminater 0.66% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.10% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.71% : 0.000005s : 32: predicate.make_slice_get_slice_eliminator 0.56% : 0.000002s : 12: predicate.merge_addn 0.64% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.60% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.88% : 0.000003s : 20: predicate.minmaximum_grad 0.74% : 0.000002s : 6: predicate.mutable_eliminate 0.36% : 0.000001s : 6: predicate.opt_reshape 0.35% : 0.000001s : 6: predicate.parallel_virtual_node 1.70% : 0.000005s : 30: predicate.partial_defer_inline 1.80% : 0.000005s : 32: predicate.partial_eliminate 0.95% : 0.000003s : 20: predicate.print_const_string_wrapper 0.56% : 0.000002s : 12: predicate.reduce_all_const_elim 1.16% : 0.000003s : 20: predicate.reduce_eliminate 2.66% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 12: predicate.remove_not_recompute_node 1.42% : 0.000004s : 38: predicate.replace_applicator 0.50% : 0.000001s : 12: predicate.replace_old_param 0.25% : 0.000001s : 6: predicate.reset_defer_inline 0.95% : 0.000003s : 20: predicate.reshape_eliminate 0.58% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 6: predicate.row_tensor_eliminate 0.81% : 0.000002s : 12: predicate.same_eliminate 0.51% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 12: predicate.shard_identity_eliminate 0.71% : 0.000002s : 12: predicate.special_op_eliminate 0.77% : 0.000002s : 12: predicate.specialize_transform 0.75% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.54% : 0.000004s : 30: predicate.switch_defer_inline 2.07% : 0.000006s : 42: predicate.switch_layer_defer_inline 4.82% : 0.000014s : 90: predicate.switch_simplify 0.92% : 0.000003s : 20: predicate.tile_eliminate 0.99% : 0.000003s : 20: predicate.transpose_eliminate 1.55% : 0.000004s : 32: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000005s : 32: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000004s : 32: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000009s : 50: predicate.tuple_list_get_item_eliminator 1.61% : 0.000005s : 32: predicate.tuple_list_get_set_item_eliminator 2.24% : 0.000006s : 44: predicate.tuple_list_set_item_eliminator 1.81% : 0.000005s : 38: predicate.tuple_to_list_eliminator_ 2.60% : 0.000007s : 58: predicate.updatestate_pure_node_eliminater 3.33% : 0.000010s : 70: predicate.updatestate_useless_node_eliminater 0.50% : 0.000001s : 6: predicate.value_based_eliminate 0.64% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 12: predicate.virtual_output_eliminate 0.28% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000585 11 52.30% : 0.000306s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.70% : 0.000279s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028262 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.75% : 0.003037s : 1: add_attr 10.70% : 0.003024s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.22% : 0.000061s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000074s : 1: auto_monad 0.12% : 0.000033s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.71% : 0.000482s : 1: bootstrap 0.12% : 0.000034s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.08% : 0.000022s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000032s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.55% : 0.000438s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.83% : 0.000518s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 5.71% : 0.001614s : 78: opt.transform.opt_a 0.14% : 0.000041s : 1: opt.transform.opt_after_cconv 0.12% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.56% : 0.000159s : 28: opt.transform.opt_b 0.23% : 0.000064s : 2: opt.transform.opt_trans_graph 0.17% : 0.000048s : 4: opt.transform.symbol_engine_opt 12.53% : 0.003542s : 1: opt_a 0.54% : 0.000153s : 1: opt_after_cconv 1.81% : 0.000512s : 1: opt_after_jit_grad 1.22% : 0.000344s : 1: opt_b 22.48% : 0.006355s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000007s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000009s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000041s : 1: pre_auto_parallel 0.13% : 0.000037s : 1: py_interpret_to_execute 0.07% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.35% : 0.000382s : 1: renormalize.infer 1.08% : 0.000305s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000047s : 1: rewriter_after_opt_a 0.36% : 0.000100s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000112s : 1: symbol_engine_optimizer 0.38% : 0.000108s : 1: tuple_transform 21.11% : 0.005965s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:07.113.151 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0161831, [21] [bootstrap]: 0.00048839 [type_inference]: 0.00636447 [event_method]: 2.205e-05 [auto_monad]: 6.316e-05 [graph_reusing]: 6.04001e-06 [inline]: 2.27001e-06 [add_attr]: 0.00311014, [1] [add_attr_with_inline]: 0.00310077, [1] [Cycle 1]: 6.589e-05, [2] [tag_attr]: 2.127e-05 [meta_addattr_fg_expand]: 6.58998e-06 [parallel-infer-symbol]: 3.36001e-06 [pre_auto_parallel]: 3.338e-05 [insert-virtual-dataset]: 2.59001e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 1.67001e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.0053591, [53] [py_interpret_to_execute]: 2.866e-05 [rewriter_before_opt_a]: 9.404e-05 [opt_a]: 0.00317872, [2] [Cycle 1]: 0.00236328, [45] [expand_dump_flag]: 3.03e-06 [switch_simplify]: 4.842e-05 [loop_unroll]: 3.605e-05 [a_1]: 0.00084551 [with_stream_mark]: 1.427e-05 [recompute_prepare]: 1.125e-05 [updatestate_depend_eliminate]: 4.96002e-06 [updatestate_assign_eliminate]: 4.10998e-06 [updatestate_loads_eliminate]: 3.75e-06 [parameter_eliminate]: 1.79998e-06 [a_2]: 0.00011201 [accelerated_algorithm]: 8.93002e-06 [shard]: 1.88997e-06 [meta_shard_fg_expand]: 1.85001e-06 [shard_inline]: 8.82e-06 [merge_send_recv]: 9.05001e-06 [auto_parallel]: 6.53003e-06 [parallel]: 1.858e-05 [flash_sp]: 7.98999e-06 [merge_comm]: 4.98001e-06 [allreduce_fusion]: 4.56002e-06 [matmul_add_comm_reduction]: 1.076e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 9.81998e-06 [virtual_dataset]: 9.41998e-06 [get_grad_eliminate_]: 8.54e-06 [virtual_output]: 8.32e-06 [merge_forward]: 4.35e-06 [cell_reuse_recompute_pass]: 1.35999e-06 [offload_activation]: 1.107e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.554e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.318e-05 [set_forward_comm_id_for_comm_node_pass]: 4.52e-06 [meta_fg_expand]: 3.73999e-06 [flash_sp_send_recv_attached]: 2.46e-06 [receive_attached]: 1.97001e-06 [after_resolve]: 1.52e-05 [a_after_grad]: 1.338e-05 [renormalize]: 0.00070383 [add_forward_monad_depend]: 5.14e-06 [auto_monad_grad]: 2.27999e-06 [auto_monad_eliminator]: 1.627e-05 [cse]: 3.609e-05 [a_3]: 6.177e-05 [Cycle 2]: 0.0008057, [45] [expand_dump_flag]: 1.10001e-06 [switch_simplify]: 9.75002e-06 [loop_unroll]: 7.98999e-06 [a_1]: 0.00020362 [with_stream_mark]: 1.151e-05 [recompute_prepare]: 8.08999e-06 [updatestate_depend_eliminate]: 3.98999e-06 [updatestate_assign_eliminate]: 3.13998e-06 [updatestate_loads_eliminate]: 3.03e-06 [parameter_eliminate]: 1.07e-06 [a_2]: 0.00010179 [accelerated_algorithm]: 8e-06 [shard]: 1.18001e-06 [meta_shard_fg_expand]: 1.55001e-06 [shard_inline]: 7.87003e-06 [merge_send_recv]: 5.61e-06 [auto_parallel]: 6.19001e-06 [parallel]: 4.72e-06 [flash_sp]: 3.26001e-06 [merge_comm]: 4.18001e-06 [allreduce_fusion]: 4.05998e-06 [matmul_add_comm_reduction]: 6.53e-06 [allreduce_slice_to_reducescatter]: 4.2998e-07 [virtual_shard_identity]: 9.44998e-06 [virtual_dataset]: 8.10999e-06 [get_grad_eliminate_]: 7.77e-06 [virtual_output]: 7.63001e-06 [merge_forward]: 3.23998e-06 [cell_reuse_recompute_pass]: 1.43002e-06 [offload_activation]: 7.28e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.489e-05 [merge_recompute_call_nodes]: 7.99977e-07 [before_grad]: 1.364e-05 [set_forward_comm_id_for_comm_node_pass]: 4.16001e-06 [meta_fg_expand]: 2.99999e-06 [flash_sp_send_recv_attached]: 9.00007e-07 [receive_attached]: 1.25999e-06 [after_resolve]: 1.399e-05 [a_after_grad]: 1.301e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.11002e-06 [auto_monad_grad]: 8.40024e-07 [auto_monad_eliminator]: 8.94e-06 [cse]: 1.81e-05 [a_3]: 5.149e-05 [py_interpret_to_execute_after_opt_a]: 1.158e-05 [slice_cell_reuse_recomputed_activation]: 1.85001e-06 [rewriter_after_opt_a]: 3.962e-05 [convert_after_rewriter]: 8.25999e-06 [order_py_execute_after_rewriter]: 6.13998e-06 [mutable_eliminate]: 0.00050417 [opt_b]: 0.00027541, [1] [Cycle 1]: 0.00026852, [7] [b_1]: 0.00017837 [b_2]: 9.96e-06 [updatestate_depend_eliminate]: 7.71001e-06 [updatestate_assign_eliminate]: 3.26999e-06 [updatestate_loads_eliminate]: 3.31999e-06 [renormalize]: 9.5999e-07 [cse]: 2.56e-05 [optimize_parallel_all_gather_comm]: 1.823e-05 [overlap_param_gather]: 2.35002e-06 [cconv]: 2.768e-05 [loop_unroll]: 0.00043666 [opt_after_cconv]: 0.00012016, [1] [Cycle 1]: 0.00011432, [7] [c_1]: 4.261e-05 [parameter_eliminate]: 3.14999e-06 [updatestate_depend_eliminate]: 6.58e-06 [updatestate_assign_eliminate]: 3.08998e-06 [updatestate_loads_eliminate]: 3.28e-06 [cse]: 2.233e-05 [renormalize]: 7.7e-07 [remove_dup_value]: 1.578e-05 [tuple_transform]: 8.964e-05, [1] [Cycle 1]: 8.544e-05, [4] [d_1]: 5.665e-05 [none_parameter_eliminate]: 1.58002e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 9.11998e-06 [partial_unused_args_eliminate]: 1.71e-06 [add_recomputation]: 5.748e-05 [cse_after_recomputation]: 2.601e-05, [1] [Cycle 1]: 2.115e-05, [1] [cse]: 1.581e-05 [environ_conv]: 6.16998e-06 [swap_dp_allreduce_reducescatter]: 5.87001e-06 [bias_add_comm_swap]: 2.47001e-06 [label_micro_interleaved_index]: 4.36002e-06 [label_fine_grained_interleaved_index]: 2.69001e-06 [merge_cast_opt]: 1.24e-06 [slice_recompute_activation]: 2.11e-06 [micro_interleaved_order_control]: 2.59999e-06 [assign_add_opt]: 1.15999e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 9.80013e-07 [full_micro_interleaved_order_control]: 2.44001e-06 [reorder_send_recv_between_fp_bp]: 2.83e-06 [comm_op_add_attrs]: 9.70002e-07 [add_comm_op_reuse_tag]: 1.16997e-06 [interleave_split_concat_branches]: 1.12999e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.27e-06 [overlap_opt_shard_grad_in_pipeline]: 2.02001e-06 [control_data_broadcast_order]: 1.504e-05 [grouped_pairwise_exchange_alltoall]: 1.92999e-06 [offloading_packed_experts]: 4.95001e-06 [overlap_recompute_and_grad_model_parallel]: 5.52999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.26997e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.17001e-06 [overlap_grad_ring_attention]: 4.42e-06 [overlap_grad_flash_sp]: 2.28e-05 [begin_end_overlap_inline]: 7.80012e-07 [split_matmul_comm_elemetwise]: 2.16e-06 [split_layernorm_comm]: 1.57999e-06 [handle_group_info]: 9.49978e-07 [symbol_engine_optimizer]: 8.865e-05, [1] [Cycle 1]: 8.423e-05, [6] [build]: 3.64002e-06 [elim_shapecalc]: 1.268e-05 [elim_not_effective]: 1.697e-05 [opt_reshape]: 8.90999e-06 [fold_const_symbol]: 1.296e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.07999e-06 [pipeline_parallel_scheduler]: 1.50999e-06 [auto_monad_reorder]: 2.032e-05 [get_jit_bprop_graph]: 1.64e-06 [rewriter_after_jit_bprop_graph]: 3.74002e-06 [opt_after_jit_grad]: 0.00049207 [validate]: 4.498e-05 Sums bootstrap : 0.000488s : 4.03% type_inference : 0.006364s : 52.53% event_method : 0.000022s : 0.18% auto_monad : 0.000063s : 0.52% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.28% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.24% optimize.rewriter_before_opt_a : 0.000094s : 0.78% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000058s : 0.48% optimize.opt_a.loop_unroll : 0.000044s : 0.36% optimize.opt_a.a_1 : 0.001049s : 8.66% optimize.opt_a.with_stream_mark : 0.000026s : 0.21% optimize.opt_a.recompute_prepare : 0.000019s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000214s : 1.76% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000017s : 0.14% optimize.opt_a.merge_send_recv : 0.000015s : 0.12% optimize.opt_a.auto_parallel : 0.000013s : 0.10% optimize.opt_a.parallel : 0.000023s : 0.19% optimize.opt_a.flash_sp : 0.000011s : 0.09% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.16% optimize.opt_a.virtual_dataset : 0.000018s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.13% optimize.opt_a.virtual_output : 0.000016s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.24% optimize.opt_a.a_after_grad : 0.000026s : 0.22% optimize.opt_a.renormalize : 0.000704s : 5.81% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.05% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.21% optimize.opt_a.cse : 0.000054s : 0.45% optimize.opt_a.a_3 : 0.000113s : 0.93% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.33% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000504s : 4.16% optimize.opt_b.b_1 : 0.000178s : 1.47% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000026s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000028s : 0.23% optimize.loop_unroll : 0.000437s : 3.60% optimize.opt_after_cconv.c_1 : 0.000043s : 0.35% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.18% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000016s : 0.13% optimize.tuple_transform.d_1 : 0.000057s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000057s : 0.47% optimize.cse_after_recomputation.cse : 0.000016s : 0.13% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000023s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.17% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000492s : 4.06% validate : 0.000045s : 0.37% Time group info: ------[substitution.] 0.000225 44 9.56% : 0.000022s : 3: substitution.cast_eliminate 0.89% : 0.000002s : 3: substitution.elim_not_effective 0.78% : 0.000002s : 3: substitution.fold_const_symbol 2.92% : 0.000007s : 6: substitution.graph_param_transform 67.76% : 0.000152s : 4: substitution.inline 1.94% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.67% : 0.000006s : 6: substitution.remove_not_recompute_node 2.43% : 0.000005s : 6: substitution.replace_old_param 8.50% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.54% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006297 2 87.26% : 0.005494s : 1: type_inference.infer 12.74% : 0.000802s : 1: type_inference.specialize ------[replace.] 0.000072 10 51.87% : 0.000037s : 4: replace.inline 48.13% : 0.000035s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000166 10 90.10% : 0.000150s : 4: match.inline 9.90% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000284 1908 0.94% : 0.000003s : 20: predicate.accumulaten_eliminater 0.82% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.54% : 0.000002s : 12: predicate.addn_check_dump 0.98% : 0.000003s : 20: predicate.addn_zero_filter 0.88% : 0.000003s : 20: predicate.adjust_all_reduce_mul_add 2.00% : 0.000006s : 32: predicate.arithmetic_simplify 1.01% : 0.000003s : 20: predicate.cast_eliminate 0.58% : 0.000002s : 12: predicate.check_bprop_eliminate 0.54% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.62% : 0.000002s : 12: predicate.depend_value_elim 0.99% : 0.000003s : 20: predicate.dict_get_item_const_eliminator 1.07% : 0.000003s : 20: predicate.dict_get_item_eliminator 0.92% : 0.000003s : 20: predicate.dict_set_item_eliminator 0.85% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.28% : 0.000001s : 6: predicate.elim_not_effective 0.36% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000003s : 26: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 26: predicate.environ_get_depend_swap 1.81% : 0.000005s : 38: predicate.environ_get_eliminate 1.14% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.47% : 0.000004s : 30: predicate.exchange_switch_depend_value 2.32% : 0.000007s : 30: predicate.float_depend_g_call 0.56% : 0.000002s : 12: predicate.float_environ_get_switch 0.83% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 6: predicate.fold_const_symbol 0.62% : 0.000002s : 12: predicate.get_grad_eliminate 0.21% : 0.000001s : 6: predicate.graph_param_transform 0.60% : 0.000002s : 12: predicate.incorporate_call 0.51% : 0.000001s : 12: predicate.incorporate_call_switch 6.16% : 0.000017s : 86: predicate.inline 0.79% : 0.000002s : 12: predicate.inline_without_move 0.35% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.75% : 0.000002s : 12: predicate.less_batch_normalization 1.88% : 0.000005s : 38: predicate.list_to_tuple_eliminator_ 2.66% : 0.000008s : 58: predicate.load_eliminater 0.75% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.15% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.92% : 0.000005s : 32: predicate.make_slice_get_slice_eliminator 0.61% : 0.000002s : 12: predicate.merge_addn 0.57% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.58% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.88% : 0.000003s : 20: predicate.minmaximum_grad 0.84% : 0.000002s : 6: predicate.mutable_eliminate 0.32% : 0.000001s : 6: predicate.opt_reshape 0.41% : 0.000001s : 6: predicate.parallel_virtual_node 1.84% : 0.000005s : 30: predicate.partial_defer_inline 1.83% : 0.000005s : 32: predicate.partial_eliminate 0.95% : 0.000003s : 20: predicate.print_const_string_wrapper 0.57% : 0.000002s : 12: predicate.reduce_all_const_elim 1.13% : 0.000003s : 20: predicate.reduce_eliminate 2.66% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 12: predicate.remove_not_recompute_node 1.43% : 0.000004s : 38: predicate.replace_applicator 0.52% : 0.000001s : 12: predicate.replace_old_param 0.33% : 0.000001s : 6: predicate.reset_defer_inline 0.97% : 0.000003s : 20: predicate.reshape_eliminate 0.56% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 6: predicate.row_tensor_eliminate 0.73% : 0.000002s : 12: predicate.same_eliminate 0.45% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.68% : 0.000002s : 12: predicate.shard_identity_eliminate 0.70% : 0.000002s : 12: predicate.special_op_eliminate 0.74% : 0.000002s : 12: predicate.specialize_transform 0.78% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.70% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.60% : 0.000005s : 30: predicate.switch_defer_inline 2.07% : 0.000006s : 42: predicate.switch_layer_defer_inline 4.83% : 0.000014s : 90: predicate.switch_simplify 0.92% : 0.000003s : 20: predicate.tile_eliminate 0.94% : 0.000003s : 20: predicate.transpose_eliminate 1.60% : 0.000005s : 32: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000005s : 32: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000004s : 32: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000009s : 50: predicate.tuple_list_get_item_eliminator 1.61% : 0.000005s : 32: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000006s : 44: predicate.tuple_list_set_item_eliminator 1.84% : 0.000005s : 38: predicate.tuple_to_list_eliminator_ 2.66% : 0.000008s : 58: predicate.updatestate_pure_node_eliminater 3.30% : 0.000009s : 70: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 6: predicate.value_based_eliminate 0.63% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.58% : 0.000002s : 12: predicate.virtual_output_eliminate 0.33% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000610 11 56.23% : 0.000343s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.77% : 0.000267s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027224 192 0.01% : 0.000003s : 1: ForceFp32Comm 11.44% : 0.003115s : 1: add_attr 11.40% : 0.003105s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000062s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.25% : 0.000069s : 1: auto_monad 0.09% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.91% : 0.000520s : 1: bootstrap 0.11% : 0.000031s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.11% : 0.000029s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000005s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.63% : 0.000445s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.89% : 0.000513s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000020s : 1: opt.transform.mutable_eliminate 5.96% : 0.001622s : 78: opt.transform.opt_a 0.15% : 0.000041s : 1: opt.transform.opt_after_cconv 0.13% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.58% : 0.000158s : 28: opt.transform.opt_b 0.23% : 0.000064s : 2: opt.transform.opt_trans_graph 0.17% : 0.000048s : 4: opt.transform.symbol_engine_opt 11.69% : 0.003182s : 1: opt_a 0.45% : 0.000124s : 1: opt_after_cconv 1.85% : 0.000502s : 1: opt_after_jit_grad 1.02% : 0.000279s : 1: opt_b 19.70% : 0.005364s : 1: optimize 0.08% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000038s : 1: pre_auto_parallel 0.12% : 0.000033s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.35% : 0.000368s : 1: renormalize.infer 1.21% : 0.000328s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000043s : 1: rewriter_after_opt_a 0.36% : 0.000098s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000091s : 1: symbol_engine_optimizer 0.34% : 0.000092s : 1: tuple_transform 23.45% : 0.006383s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:07.315.051 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:07.315.323 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0190339, [21] [bootstrap]: 0.00044809 [type_inference]: 0.00621849 [event_method]: 2.442e-05 [auto_monad]: 7.271e-05 [graph_reusing]: 6.41e-06 [inline]: 2.33002e-06 [add_attr]: 0.00337415, [1] [add_attr_with_inline]: 0.00336338, [1] [Cycle 1]: 9.314e-05, [2] [tag_attr]: 2.464e-05 [meta_addattr_fg_expand]: 6.84001e-06 [parallel-infer-symbol]: 3.21999e-06 [pre_auto_parallel]: 4.072e-05 [insert-virtual-dataset]: 2.30002e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.92999e-06 [pipeline_split]: 1.60001e-06 [optimize]: 0.00751044, [53] [py_interpret_to_execute]: 3.976e-05 [rewriter_before_opt_a]: 0.00010734 [opt_a]: 0.00444075, [2] [Cycle 1]: 0.003179, [45] [expand_dump_flag]: 3.30998e-06 [switch_simplify]: 4.944e-05 [loop_unroll]: 3.774e-05 [a_1]: 0.0009352 [with_stream_mark]: 2.295e-05 [recompute_prepare]: 1.588e-05 [updatestate_depend_eliminate]: 7.05e-06 [updatestate_assign_eliminate]: 4.77998e-06 [updatestate_loads_eliminate]: 4.32003e-06 [parameter_eliminate]: 2.60997e-06 [a_2]: 0.00016399 [accelerated_algorithm]: 1.324e-05 [shard]: 2.32001e-06 [meta_shard_fg_expand]: 2.76999e-06 [shard_inline]: 1.099e-05 [merge_send_recv]: 1.338e-05 [auto_parallel]: 1.13e-05 [parallel]: 2.082e-05 [flash_sp]: 1.038e-05 [merge_comm]: 6.00002e-06 [allreduce_fusion]: 4.97999e-06 [matmul_add_comm_reduction]: 1.259e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 1.653e-05 [virtual_dataset]: 1.049e-05 [get_grad_eliminate_]: 9.77999e-06 [virtual_output]: 1.083e-05 [merge_forward]: 5.79e-06 [cell_reuse_recompute_pass]: 1.53002e-06 [offload_activation]: 1.38e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.753e-05 [merge_recompute_call_nodes]: 1.82001e-06 [before_grad]: 1.944e-05 [set_forward_comm_id_for_comm_node_pass]: 7.37997e-06 [meta_fg_expand]: 4.26001e-06 [flash_sp_send_recv_attached]: 4.60001e-06 [receive_attached]: 2.12999e-06 [after_resolve]: 1.943e-05 [a_after_grad]: 1.686e-05 [renormalize]: 0.0009775 [add_forward_monad_depend]: 8.50999e-06 [auto_monad_grad]: 2.53e-06 [auto_monad_eliminator]: 2.204e-05 [cse]: 7.244e-05 [a_3]: 9.489e-05 [Cycle 2]: 0.00124325, [45] [expand_dump_flag]: 2.24001e-06 [switch_simplify]: 1.229e-05 [loop_unroll]: 1.018e-05 [a_1]: 0.00024727 [with_stream_mark]: 3.97e-05 [recompute_prepare]: 1.453e-05 [updatestate_depend_eliminate]: 6.14999e-06 [updatestate_assign_eliminate]: 4.22e-06 [updatestate_loads_eliminate]: 4.18999e-06 [parameter_eliminate]: 2.29001e-06 [a_2]: 0.0001494 [accelerated_algorithm]: 1.027e-05 [shard]: 2.37001e-06 [meta_shard_fg_expand]: 2.52001e-06 [shard_inline]: 1.019e-05 [merge_send_recv]: 1.146e-05 [auto_parallel]: 1.039e-05 [parallel]: 7.02997e-06 [flash_sp]: 3.98001e-06 [merge_comm]: 5.39998e-06 [allreduce_fusion]: 5.13002e-06 [matmul_add_comm_reduction]: 1.107e-05 [allreduce_slice_to_reducescatter]: 1.01002e-06 [virtual_shard_identity]: 1.309e-05 [virtual_dataset]: 9.71e-06 [get_grad_eliminate_]: 8.93002e-06 [virtual_output]: 8.90001e-06 [merge_forward]: 6.20002e-06 [cell_reuse_recompute_pass]: 2.02999e-06 [offload_activation]: 1.116e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.46e-05 [merge_recompute_call_nodes]: 1.64998e-06 [before_grad]: 1.588e-05 [set_forward_comm_id_for_comm_node_pass]: 6.53e-06 [meta_fg_expand]: 3.91001e-06 [flash_sp_send_recv_attached]: 1.99999e-06 [receive_attached]: 1.59e-06 [after_resolve]: 1.728e-05 [a_after_grad]: 1.581e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 3.99002e-06 [auto_monad_grad]: 2.31e-06 [auto_monad_eliminator]: 1.459e-05 [cse]: 3.433e-05 [a_3]: 7.885e-05 [py_interpret_to_execute_after_opt_a]: 2.194e-05 [slice_cell_reuse_recomputed_activation]: 5.64e-06 [rewriter_after_opt_a]: 6.472e-05 [convert_after_rewriter]: 1.372e-05 [order_py_execute_after_rewriter]: 1.047e-05 [mutable_eliminate]: 0.0007055 [opt_b]: 0.00042551, [1] [Cycle 1]: 0.0004147, [7] [b_1]: 0.00026401 [b_2]: 1.367e-05 [updatestate_depend_eliminate]: 1.21e-05 [updatestate_assign_eliminate]: 5.34998e-06 [updatestate_loads_eliminate]: 4.57998e-06 [renormalize]: 1.10001e-06 [cse]: 4.673e-05 [optimize_parallel_all_gather_comm]: 2.775e-05 [overlap_param_gather]: 4.78001e-06 [cconv]: 4.01e-05 [loop_unroll]: 0.00051746 [opt_after_cconv]: 0.00017796, [1] [Cycle 1]: 0.00016796, [7] [c_1]: 4.988e-05 [parameter_eliminate]: 5.12e-06 [updatestate_depend_eliminate]: 1.014e-05 [updatestate_assign_eliminate]: 3.98001e-06 [updatestate_loads_eliminate]: 3.63e-06 [cse]: 3.503e-05 [renormalize]: 7.2e-07 [remove_dup_value]: 5.468e-05 [tuple_transform]: 0.00013187, [1] [Cycle 1]: 0.00012292, [4] [d_1]: 7.035e-05 [none_parameter_eliminate]: 2.31998e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 1.199e-05 [partial_unused_args_eliminate]: 4.96002e-06 [add_recomputation]: 7.404e-05 [cse_after_recomputation]: 4.028e-05, [1] [Cycle 1]: 3.316e-05, [1] [cse]: 2.225e-05 [environ_conv]: 1.155e-05 [swap_dp_allreduce_reducescatter]: 1.073e-05 [bias_add_comm_swap]: 6.36998e-06 [label_micro_interleaved_index]: 7.75e-06 [label_fine_grained_interleaved_index]: 5.67001e-06 [merge_cast_opt]: 4.27e-06 [slice_recompute_activation]: 4.42998e-06 [micro_interleaved_order_control]: 5.34998e-06 [assign_add_opt]: 3.58e-06 [ForceFp32Comm]: 3.27997e-06 [remove_cast_before_assign_add]: 3.75998e-06 [full_micro_interleaved_order_control]: 4.65999e-06 [reorder_send_recv_between_fp_bp]: 5.97001e-06 [comm_op_add_attrs]: 3.45e-06 [add_comm_op_reuse_tag]: 3.61001e-06 [interleave_split_concat_branches]: 3.67002e-06 [interleave_parallel_branches]: 3.48999e-06 [overlap_opt_shard_in_pipeline]: 3.50998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.26001e-06 [control_data_broadcast_order]: 2.248e-05 [grouped_pairwise_exchange_alltoall]: 4.05998e-06 [offloading_packed_experts]: 8.50999e-06 [overlap_recompute_and_grad_model_parallel]: 8.57e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.28001e-06 [overlap_recompute_allgather_and_fa_grad]: 4.28999e-06 [overlap_recompute_comm]: 4.87e-06 [overlap_grad_ring_attention]: 8.82e-06 [overlap_grad_flash_sp]: 3.386e-05 [begin_end_overlap_inline]: 3.18e-06 [split_matmul_comm_elemetwise]: 4.48999e-06 [split_layernorm_comm]: 4.43999e-06 [handle_group_info]: 3.53e-06 [symbol_engine_optimizer]: 0.00013982, [1] [Cycle 1]: 0.00013133, [6] [build]: 5.60001e-06 [elim_shapecalc]: 1.898e-05 [elim_not_effective]: 2.11e-05 [opt_reshape]: 1.153e-05 [fold_const_symbol]: 1.668e-05 [renormalize]: 6.40022e-07 [detach_backward]: 4.43999e-06 [pipeline_parallel_scheduler]: 2.17999e-06 [auto_monad_reorder]: 2.72e-05 [get_jit_bprop_graph]: 2.06003e-06 [rewriter_after_jit_bprop_graph]: 6.58e-06 [opt_after_jit_grad]: 0.00057434 [validate]: 4.918e-05 Sums bootstrap : 0.000448s : 3.29% type_inference : 0.006218s : 45.63% event_method : 0.000024s : 0.18% auto_monad : 0.000073s : 0.53% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000041s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000040s : 0.29% optimize.rewriter_before_opt_a : 0.000107s : 0.79% optimize.opt_a.expand_dump_flag : 0.000006s : 0.04% optimize.opt_a.switch_simplify : 0.000062s : 0.45% optimize.opt_a.loop_unroll : 0.000048s : 0.35% optimize.opt_a.a_1 : 0.001182s : 8.68% optimize.opt_a.with_stream_mark : 0.000063s : 0.46% optimize.opt_a.recompute_prepare : 0.000030s : 0.22% optimize.opt_a.updatestate_depend_eliminate : 0.000013s : 0.10% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.06% optimize.opt_a.parameter_eliminate : 0.000005s : 0.04% optimize.opt_a.a_2 : 0.000313s : 2.30% optimize.opt_a.accelerated_algorithm : 0.000024s : 0.17% optimize.opt_a.shard : 0.000005s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000021s : 0.16% optimize.opt_a.merge_send_recv : 0.000025s : 0.18% optimize.opt_a.auto_parallel : 0.000022s : 0.16% optimize.opt_a.parallel : 0.000028s : 0.20% optimize.opt_a.flash_sp : 0.000014s : 0.11% optimize.opt_a.merge_comm : 0.000011s : 0.08% optimize.opt_a.allreduce_fusion : 0.000010s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000030s : 0.22% optimize.opt_a.virtual_dataset : 0.000020s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.14% optimize.opt_a.virtual_output : 0.000020s : 0.14% optimize.opt_a.merge_forward : 0.000012s : 0.09% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000025s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000052s : 0.38% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000035s : 0.26% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000014s : 0.10% optimize.opt_a.meta_fg_expand : 0.000008s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.05% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000037s : 0.27% optimize.opt_a.a_after_grad : 0.000033s : 0.24% optimize.opt_a.renormalize : 0.000978s : 7.17% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.09% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000037s : 0.27% optimize.opt_a.cse : 0.000107s : 0.78% optimize.opt_a.a_3 : 0.000174s : 1.27% optimize.py_interpret_to_execute_after_opt_a : 0.000022s : 0.16% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.04% optimize.rewriter_after_opt_a : 0.000065s : 0.47% optimize.convert_after_rewriter : 0.000014s : 0.10% optimize.order_py_execute_after_rewriter : 0.000010s : 0.08% optimize.mutable_eliminate : 0.000705s : 5.18% optimize.opt_b.b_1 : 0.000264s : 1.94% optimize.opt_b.b_2 : 0.000014s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.09% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.04% optimize.opt_b.updatestate_loads_eliminate : 0.000005s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000047s : 0.34% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000040s : 0.29% optimize.loop_unroll : 0.000517s : 3.80% optimize.opt_after_cconv.c_1 : 0.000050s : 0.37% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000010s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000035s : 0.26% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000055s : 0.40% optimize.tuple_transform.d_1 : 0.000070s : 0.52% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000012s : 0.09% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000074s : 0.54% optimize.cse_after_recomputation.cse : 0.000022s : 0.16% optimize.environ_conv : 0.000012s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000011s : 0.08% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.02% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000022s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000009s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000009s : 0.06% optimize.overlap_grad_flash_sp : 0.000034s : 0.25% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.03% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000006s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000019s : 0.14% optimize.symbol_engine_optimizer.elim_not_effective : 0.000021s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000012s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000017s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000027s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000007s : 0.05% opt_after_jit_grad : 0.000574s : 4.21% validate : 0.000049s : 0.36% Time group info: ------[substitution.] 0.000277 54 13.88% : 0.000039s : 6: substitution.cast_eliminate 1.07% : 0.000003s : 4: substitution.elim_not_effective 0.75% : 0.000002s : 4: substitution.fold_const_symbol 2.97% : 0.000008s : 7: substitution.graph_param_transform 64.07% : 0.000178s : 4: substitution.inline 2.16% : 0.000006s : 8: substitution.j_node_and_user_rematch 3.10% : 0.000009s : 8: substitution.remove_not_recompute_node 2.74% : 0.000008s : 6: substitution.replace_old_param 7.16% : 0.000020s : 6: substitution.tuple_list_get_item_eliminator 2.10% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006152 2 85.24% : 0.005244s : 1: type_inference.infer 14.76% : 0.000908s : 1: type_inference.specialize ------[replace.] 0.000082 10 50.31% : 0.000041s : 4: replace.inline 49.69% : 0.000041s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000192 10 91.16% : 0.000175s : 4: match.inline 8.84% : 0.000017s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000335 2134 0.88% : 0.000003s : 22: predicate.accumulaten_eliminater 0.87% : 0.000003s : 7: predicate.ad_related_special_op_eliminate 0.64% : 0.000002s : 14: predicate.addn_check_dump 0.89% : 0.000003s : 22: predicate.addn_zero_filter 0.84% : 0.000003s : 22: predicate.adjust_all_reduce_mul_add 1.84% : 0.000006s : 36: predicate.arithmetic_simplify 1.01% : 0.000003s : 22: predicate.cast_eliminate 0.78% : 0.000003s : 14: predicate.check_bprop_eliminate 0.62% : 0.000002s : 14: predicate.compare_switch_simplify 0.20% : 0.000001s : 7: predicate.const_output_eliminate 0.66% : 0.000002s : 14: predicate.depend_value_elim 0.96% : 0.000003s : 22: predicate.dict_get_item_const_eliminator 1.03% : 0.000003s : 22: predicate.dict_get_item_eliminator 0.88% : 0.000003s : 22: predicate.dict_set_item_eliminator 0.97% : 0.000003s : 14: predicate.dumpgradient_eliminate 0.35% : 0.000001s : 7: predicate.elim_not_effective 0.40% : 0.000001s : 7: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000004s : 29: predicate.environ_add_const_eliminate 1.12% : 0.000004s : 29: predicate.environ_get_add_eliminate 1.21% : 0.000004s : 29: predicate.environ_get_depend_swap 1.76% : 0.000006s : 43: predicate.environ_get_eliminate 1.12% : 0.000004s : 29: predicate.environ_get_set_eliminate 1.38% : 0.000005s : 32: predicate.exchange_switch_depend_value 2.28% : 0.000008s : 32: predicate.float_depend_g_call 0.58% : 0.000002s : 14: predicate.float_environ_get_switch 0.85% : 0.000003s : 21: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 7: predicate.fold_const_symbol 0.66% : 0.000002s : 14: predicate.get_grad_eliminate 0.26% : 0.000001s : 7: predicate.graph_param_transform 0.65% : 0.000002s : 14: predicate.incorporate_call 0.56% : 0.000002s : 14: predicate.incorporate_call_switch 6.44% : 0.000022s : 96: predicate.inline 0.94% : 0.000003s : 14: predicate.inline_without_move 0.41% : 0.000001s : 14: predicate.j_node_and_user_rematch 0.79% : 0.000003s : 14: predicate.less_batch_normalization 1.87% : 0.000006s : 42: predicate.list_to_tuple_eliminator_ 2.57% : 0.000009s : 64: predicate.load_eliminater 0.91% : 0.000003s : 7: predicate.loop_unroll_after_grad 1.94% : 0.000006s : 44: predicate.loop_unroll_before_grad 1.57% : 0.000005s : 36: predicate.make_slice_get_slice_eliminator 0.64% : 0.000002s : 14: predicate.merge_addn 0.59% : 0.000002s : 14: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 14: predicate.mini_step_allgather_replace 0.86% : 0.000003s : 22: predicate.minmaximum_grad 1.00% : 0.000003s : 7: predicate.mutable_eliminate 0.39% : 0.000001s : 7: predicate.opt_reshape 0.38% : 0.000001s : 7: predicate.parallel_virtual_node 1.70% : 0.000006s : 32: predicate.partial_defer_inline 1.69% : 0.000006s : 35: predicate.partial_eliminate 0.88% : 0.000003s : 22: predicate.print_const_string_wrapper 0.71% : 0.000002s : 14: predicate.reduce_all_const_elim 1.13% : 0.000004s : 22: predicate.reduce_eliminate 2.49% : 0.000008s : 64: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 14: predicate.remove_not_recompute_node 1.46% : 0.000005s : 42: predicate.replace_applicator 0.42% : 0.000001s : 14: predicate.replace_old_param 0.35% : 0.000001s : 7: predicate.reset_defer_inline 0.93% : 0.000003s : 22: predicate.reshape_eliminate 0.63% : 0.000002s : 14: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 7: predicate.row_tensor_eliminate 0.79% : 0.000003s : 14: predicate.same_eliminate 0.71% : 0.000002s : 14: predicate.set_cell_output_no_recompute 0.82% : 0.000003s : 14: predicate.shard_identity_eliminate 0.69% : 0.000002s : 14: predicate.special_op_eliminate 0.78% : 0.000003s : 14: predicate.specialize_transform 0.97% : 0.000003s : 14: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000003s : 14: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 7: predicate.switch_call_monad_eliminater 1.39% : 0.000005s : 32: predicate.switch_defer_inline 1.94% : 0.000007s : 46: predicate.switch_layer_defer_inline 4.55% : 0.000015s : 97: predicate.switch_simplify 0.88% : 0.000003s : 22: predicate.tile_eliminate 0.88% : 0.000003s : 22: predicate.transpose_eliminate 1.56% : 0.000005s : 36: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000005s : 36: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000005s : 36: predicate.tuple_list_get_item_depend_reorder 3.15% : 0.000011s : 56: predicate.tuple_list_get_item_eliminator 1.58% : 0.000005s : 36: predicate.tuple_list_get_set_item_eliminator 2.33% : 0.000008s : 50: predicate.tuple_list_set_item_eliminator 1.76% : 0.000006s : 42: predicate.tuple_to_list_eliminator_ 2.49% : 0.000008s : 64: predicate.updatestate_pure_node_eliminater 3.19% : 0.000011s : 78: predicate.updatestate_useless_node_eliminater 0.63% : 0.000002s : 7: predicate.value_based_eliminate 0.61% : 0.000002s : 14: predicate.virtual_dataset_eliminate 0.72% : 0.000002s : 14: predicate.virtual_output_eliminate 0.32% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 7: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000656 11 48.44% : 0.000318s : 5: func_graph_cloner_run.FuncGraphClonerGraph 51.56% : 0.000338s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.033117 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.22% : 0.003384s : 1: add_attr 10.17% : 0.003368s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.24% : 0.000079s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000082s : 1: auto_monad 0.11% : 0.000036s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.48% : 0.000491s : 1: bootstrap 0.13% : 0.000044s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000027s : 1: control_data_broadcast_order 0.05% : 0.000018s : 1: convert_after_rewriter 0.13% : 0.000044s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.08% : 0.000026s : 1: detach_backward 0.04% : 0.000015s : 1: environ_conv 0.11% : 0.000037s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.03% : 0.000011s : 1: label_micro_interleaved_index 1.59% : 0.000525s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000009s : 1: micro_interleaved_order_control 2.16% : 0.000714s : 1: mutable_eliminate 0.04% : 0.000012s : 1: offloading_packed_experts 0.07% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000026s : 1: opt.transform.mutable_eliminate 5.73% : 0.001899s : 78: opt.transform.opt_a 0.15% : 0.000048s : 1: opt.transform.opt_after_cconv 0.13% : 0.000042s : 1: opt.transform.opt_after_jit_grad 0.60% : 0.000200s : 28: opt.transform.opt_b 0.24% : 0.000079s : 2: opt.transform.opt_trans_graph 0.19% : 0.000063s : 4: opt.transform.symbol_engine_opt 13.42% : 0.004444s : 1: opt_a 0.55% : 0.000182s : 1: opt_after_cconv 1.77% : 0.000585s : 1: opt_after_jit_grad 1.30% : 0.000430s : 1: opt_b 23.87% : 0.007904s : 1: optimize 0.10% : 0.000033s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000014s : 1: order_py_execute_after_rewriter 0.11% : 0.000038s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000012s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000009s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000048s : 1: pre_auto_parallel 0.13% : 0.000044s : 1: py_interpret_to_execute 0.08% : 0.000026s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.18% : 0.000059s : 1: remove_dup_value 1.65% : 0.000545s : 1: renormalize.infer 1.27% : 0.000420s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.22% : 0.000071s : 1: rewriter_after_opt_a 0.34% : 0.000113s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000014s : 1: swap_dp_allreduce_reducescatter 0.43% : 0.000143s : 1: symbol_engine_optimizer 0.41% : 0.000135s : 1: tuple_transform 18.91% : 0.006263s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:07.524.204 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.018493, [21] [bootstrap]: 0.00045807 [type_inference]: 0.00658639 [event_method]: 2.389e-05 [auto_monad]: 0.00010319 [graph_reusing]: 7.28e-06 [inline]: 3.18998e-06 [add_attr]: 0.00376814, [1] [add_attr_with_inline]: 0.00375459, [1] [Cycle 1]: 8.971e-05, [2] [tag_attr]: 2.934e-05 [meta_addattr_fg_expand]: 6.62002e-06 [parallel-infer-symbol]: 3.68999e-06 [pre_auto_parallel]: 4.609e-05 [insert-virtual-dataset]: 2.82002e-06 [parallel-infer-symbol-second]: 8.70001e-07 [dataset_repeat_opt]: 2.19001e-06 [pipeline_split]: 1.82999e-06 [optimize]: 0.00670726, [53] [py_interpret_to_execute]: 3.465e-05 [rewriter_before_opt_a]: 0.00013126 [opt_a]: 0.00400797, [2] [Cycle 1]: 0.00298854, [45] [expand_dump_flag]: 3.17002e-06 [switch_simplify]: 5.144e-05 [loop_unroll]: 3.744e-05 [a_1]: 0.00095757 [with_stream_mark]: 2.567e-05 [recompute_prepare]: 1.588e-05 [updatestate_depend_eliminate]: 5.54e-06 [updatestate_assign_eliminate]: 4.84e-06 [updatestate_loads_eliminate]: 4.12e-06 [parameter_eliminate]: 2.37999e-06 [a_2]: 0.00013072 [accelerated_algorithm]: 1.114e-05 [shard]: 2.21e-06 [meta_shard_fg_expand]: 2.56e-06 [shard_inline]: 1.007e-05 [merge_send_recv]: 1.174e-05 [auto_parallel]: 1.053e-05 [parallel]: 2.126e-05 [flash_sp]: 1.15e-05 [merge_comm]: 5.83002e-06 [allreduce_fusion]: 5.10001e-06 [matmul_add_comm_reduction]: 1.242e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 1.306e-05 [virtual_dataset]: 9.97001e-06 [get_grad_eliminate_]: 9.51e-06 [virtual_output]: 9.49999e-06 [merge_forward]: 5.23002e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 1.306e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.226e-05 [merge_recompute_call_nodes]: 1.67001e-06 [before_grad]: 1.674e-05 [set_forward_comm_id_for_comm_node_pass]: 6.17999e-06 [meta_fg_expand]: 4.02e-06 [flash_sp_send_recv_attached]: 2.88998e-06 [receive_attached]: 2.63e-06 [after_resolve]: 1.79e-05 [a_after_grad]: 1.575e-05 [renormalize]: 0.00104223 [add_forward_monad_depend]: 7.43e-06 [auto_monad_grad]: 3.06001e-06 [auto_monad_eliminator]: 2.202e-05 [cse]: 5.512e-05 [a_3]: 7.482e-05 [Cycle 2]: 0.00100668, [45] [expand_dump_flag]: 2.29999e-06 [switch_simplify]: 1.233e-05 [loop_unroll]: 9.77999e-06 [a_1]: 0.00024686 [with_stream_mark]: 1.747e-05 [recompute_prepare]: 1.131e-05 [updatestate_depend_eliminate]: 5.66998e-06 [updatestate_assign_eliminate]: 3.88999e-06 [updatestate_loads_eliminate]: 3.91001e-06 [parameter_eliminate]: 1.88002e-06 [a_2]: 0.00012184 [accelerated_algorithm]: 1.097e-05 [shard]: 1.79e-06 [meta_shard_fg_expand]: 2.49001e-06 [shard_inline]: 9.32001e-06 [merge_send_recv]: 9.72001e-06 [auto_parallel]: 9.07999e-06 [parallel]: 6.49001e-06 [flash_sp]: 3.25998e-06 [merge_comm]: 5.98998e-06 [allreduce_fusion]: 4.95001e-06 [matmul_add_comm_reduction]: 1.009e-05 [allreduce_slice_to_reducescatter]: 5.00004e-07 [virtual_shard_identity]: 1.382e-05 [virtual_dataset]: 9.35001e-06 [get_grad_eliminate_]: 8.75001e-06 [virtual_output]: 8.42998e-06 [merge_forward]: 6.26e-06 [cell_reuse_recompute_pass]: 2.66e-06 [offload_activation]: 1.045e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.985e-05 [merge_recompute_call_nodes]: 1.04e-06 [before_grad]: 1.537e-05 [set_forward_comm_id_for_comm_node_pass]: 5.67999e-06 [meta_fg_expand]: 3.61999e-06 [flash_sp_send_recv_attached]: 1.60999e-06 [receive_attached]: 1.40999e-06 [after_resolve]: 1.613e-05 [a_after_grad]: 1.479e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 2.37001e-06 [auto_monad_grad]: 2.01e-06 [auto_monad_eliminator]: 1.225e-05 [cse]: 3.224e-05 [a_3]: 5.998e-05 [py_interpret_to_execute_after_opt_a]: 1.575e-05 [slice_cell_reuse_recomputed_activation]: 2.04999e-06 [rewriter_after_opt_a]: 5.091e-05 [convert_after_rewriter]: 9.22999e-06 [order_py_execute_after_rewriter]: 6.19999e-06 [mutable_eliminate]: 0.0007112 [opt_b]: 0.00033514, [1] [Cycle 1]: 0.00032722, [7] [b_1]: 0.00021054 [b_2]: 1.306e-05 [updatestate_depend_eliminate]: 8.83001e-06 [updatestate_assign_eliminate]: 4.03001e-06 [updatestate_loads_eliminate]: 3.94002e-06 [renormalize]: 8.80013e-07 [cse]: 4.093e-05 [optimize_parallel_all_gather_comm]: 2.217e-05 [overlap_param_gather]: 2.12001e-06 [cconv]: 3.275e-05 [loop_unroll]: 0.00048574 [opt_after_cconv]: 0.00014725, [1] [Cycle 1]: 0.00014076, [7] [c_1]: 4.89e-05 [parameter_eliminate]: 4.45e-06 [updatestate_depend_eliminate]: 8.22e-06 [updatestate_assign_eliminate]: 4.4e-06 [updatestate_loads_eliminate]: 3.65e-06 [cse]: 3.485e-05 [renormalize]: 7.7e-07 [remove_dup_value]: 4.568e-05 [tuple_transform]: 0.00010698, [1] [Cycle 1]: 0.00010197, [4] [d_1]: 6.93e-05 [none_parameter_eliminate]: 1.89999e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 1.064e-05 [partial_unused_args_eliminate]: 1.63002e-06 [add_recomputation]: 6.875e-05 [cse_after_recomputation]: 2.979e-05, [1] [Cycle 1]: 2.484e-05, [1] [cse]: 1.856e-05 [environ_conv]: 7.85e-06 [swap_dp_allreduce_reducescatter]: 7.05e-06 [bias_add_comm_swap]: 3.33998e-06 [label_micro_interleaved_index]: 5.07e-06 [label_fine_grained_interleaved_index]: 3.01999e-06 [merge_cast_opt]: 1.29e-06 [slice_recompute_activation]: 1.97001e-06 [micro_interleaved_order_control]: 3.08e-06 [assign_add_opt]: 1.45999e-06 [ForceFp32Comm]: 8.50006e-07 [remove_cast_before_assign_add]: 1.24e-06 [full_micro_interleaved_order_control]: 1.94999e-06 [reorder_send_recv_between_fp_bp]: 2.62001e-06 [comm_op_add_attrs]: 1.01002e-06 [add_comm_op_reuse_tag]: 9.30013e-07 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.42e-06 [overlap_opt_shard_in_pipeline]: 1.20999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.86e-06 [control_data_broadcast_order]: 1.767e-05 [grouped_pairwise_exchange_alltoall]: 1.55999e-06 [offloading_packed_experts]: 5.29e-06 [overlap_recompute_and_grad_model_parallel]: 5.67999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30999e-06 [overlap_recompute_comm]: 2.61999e-06 [overlap_grad_ring_attention]: 5.04e-06 [overlap_grad_flash_sp]: 2.655e-05 [begin_end_overlap_inline]: 5.59987e-07 [split_matmul_comm_elemetwise]: 2.22001e-06 [split_layernorm_comm]: 1.54998e-06 [handle_group_info]: 9.89996e-07 [symbol_engine_optimizer]: 0.00010455, [1] [Cycle 1]: 9.986e-05, [6] [build]: 5.07999e-06 [elim_shapecalc]: 1.693e-05 [elim_not_effective]: 1.993e-05 [opt_reshape]: 1.039e-05 [fold_const_symbol]: 1.502e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.27999e-06 [pipeline_parallel_scheduler]: 1.42e-06 [auto_monad_reorder]: 2.417e-05 [get_jit_bprop_graph]: 2.04999e-06 [rewriter_after_jit_bprop_graph]: 5.84e-06 [opt_after_jit_grad]: 0.0005208 [validate]: 4.918e-05 Sums bootstrap : 0.000458s : 3.36% type_inference : 0.006586s : 48.30% event_method : 0.000024s : 0.18% auto_monad : 0.000103s : 0.76% graph_reusing : 0.000007s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000029s : 0.22% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000046s : 0.34% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000035s : 0.25% optimize.rewriter_before_opt_a : 0.000131s : 0.96% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000064s : 0.47% optimize.opt_a.loop_unroll : 0.000047s : 0.35% optimize.opt_a.a_1 : 0.001204s : 8.83% optimize.opt_a.with_stream_mark : 0.000043s : 0.32% optimize.opt_a.recompute_prepare : 0.000027s : 0.20% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000253s : 1.85% optimize.opt_a.accelerated_algorithm : 0.000022s : 0.16% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000019s : 0.14% optimize.opt_a.merge_send_recv : 0.000021s : 0.16% optimize.opt_a.auto_parallel : 0.000020s : 0.14% optimize.opt_a.parallel : 0.000028s : 0.20% optimize.opt_a.flash_sp : 0.000015s : 0.11% optimize.opt_a.merge_comm : 0.000012s : 0.09% optimize.opt_a.allreduce_fusion : 0.000010s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000027s : 0.20% optimize.opt_a.virtual_dataset : 0.000019s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.13% optimize.opt_a.virtual_output : 0.000018s : 0.13% optimize.opt_a.merge_forward : 0.000011s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000024s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000042s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000032s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.09% optimize.opt_a.meta_fg_expand : 0.000008s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000034s : 0.25% optimize.opt_a.a_after_grad : 0.000031s : 0.22% optimize.opt_a.renormalize : 0.001042s : 7.64% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.25% optimize.opt_a.cse : 0.000087s : 0.64% optimize.opt_a.a_3 : 0.000135s : 0.99% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000051s : 0.37% optimize.convert_after_rewriter : 0.000009s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000711s : 5.21% optimize.opt_b.b_1 : 0.000211s : 1.54% optimize.opt_b.b_2 : 0.000013s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000041s : 0.30% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000033s : 0.24% optimize.loop_unroll : 0.000486s : 3.56% optimize.opt_after_cconv.c_1 : 0.000049s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000035s : 0.26% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000046s : 0.33% optimize.tuple_transform.d_1 : 0.000069s : 0.51% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000069s : 0.50% optimize.cse_after_recomputation.cse : 0.000019s : 0.14% optimize.environ_conv : 0.000008s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000018s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000027s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000024s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.04% opt_after_jit_grad : 0.000521s : 3.82% validate : 0.000049s : 0.36% Time group info: ------[substitution.] 0.000292 54 12.22% : 0.000036s : 6: substitution.cast_eliminate 0.87% : 0.000003s : 4: substitution.elim_not_effective 0.67% : 0.000002s : 4: substitution.fold_const_symbol 2.87% : 0.000008s : 7: substitution.graph_param_transform 66.84% : 0.000195s : 4: substitution.inline 2.23% : 0.000007s : 8: substitution.j_node_and_user_rematch 2.45% : 0.000007s : 8: substitution.remove_not_recompute_node 2.67% : 0.000008s : 6: substitution.replace_old_param 7.16% : 0.000021s : 6: substitution.tuple_list_get_item_eliminator 2.01% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006502 2 85.86% : 0.005583s : 1: type_inference.infer 14.14% : 0.000920s : 1: type_inference.specialize ------[replace.] 0.000085 10 51.44% : 0.000044s : 4: replace.inline 48.56% : 0.000041s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000210 10 91.54% : 0.000192s : 4: match.inline 8.46% : 0.000018s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000332 2134 0.87% : 0.000003s : 22: predicate.accumulaten_eliminater 0.78% : 0.000003s : 7: predicate.ad_related_special_op_eliminate 0.55% : 0.000002s : 14: predicate.addn_check_dump 0.88% : 0.000003s : 22: predicate.addn_zero_filter 0.86% : 0.000003s : 22: predicate.adjust_all_reduce_mul_add 1.88% : 0.000006s : 36: predicate.arithmetic_simplify 1.02% : 0.000003s : 22: predicate.cast_eliminate 0.59% : 0.000002s : 14: predicate.check_bprop_eliminate 0.63% : 0.000002s : 14: predicate.compare_switch_simplify 0.19% : 0.000001s : 7: predicate.const_output_eliminate 0.64% : 0.000002s : 14: predicate.depend_value_elim 0.96% : 0.000003s : 22: predicate.dict_get_item_const_eliminator 1.03% : 0.000003s : 22: predicate.dict_get_item_eliminator 0.90% : 0.000003s : 22: predicate.dict_set_item_eliminator 1.00% : 0.000003s : 14: predicate.dumpgradient_eliminate 0.34% : 0.000001s : 7: predicate.elim_not_effective 0.36% : 0.000001s : 7: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000004s : 29: predicate.environ_add_const_eliminate 1.12% : 0.000004s : 29: predicate.environ_get_add_eliminate 1.14% : 0.000004s : 29: predicate.environ_get_depend_swap 1.85% : 0.000006s : 43: predicate.environ_get_eliminate 1.13% : 0.000004s : 29: predicate.environ_get_set_eliminate 1.34% : 0.000004s : 32: predicate.exchange_switch_depend_value 2.44% : 0.000008s : 32: predicate.float_depend_g_call 0.55% : 0.000002s : 14: predicate.float_environ_get_switch 0.86% : 0.000003s : 21: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 7: predicate.fold_const_symbol 0.64% : 0.000002s : 14: predicate.get_grad_eliminate 0.25% : 0.000001s : 7: predicate.graph_param_transform 0.61% : 0.000002s : 14: predicate.incorporate_call 0.55% : 0.000002s : 14: predicate.incorporate_call_switch 6.16% : 0.000020s : 96: predicate.inline 0.86% : 0.000003s : 14: predicate.inline_without_move 0.35% : 0.000001s : 14: predicate.j_node_and_user_rematch 0.96% : 0.000003s : 14: predicate.less_batch_normalization 1.79% : 0.000006s : 42: predicate.list_to_tuple_eliminator_ 2.67% : 0.000009s : 64: predicate.load_eliminater 0.87% : 0.000003s : 7: predicate.loop_unroll_after_grad 1.96% : 0.000007s : 44: predicate.loop_unroll_before_grad 1.66% : 0.000006s : 36: predicate.make_slice_get_slice_eliminator 0.63% : 0.000002s : 14: predicate.merge_addn 0.60% : 0.000002s : 14: predicate.micro_step_allgather_replace 0.72% : 0.000002s : 14: predicate.mini_step_allgather_replace 0.90% : 0.000003s : 22: predicate.minmaximum_grad 0.98% : 0.000003s : 7: predicate.mutable_eliminate 0.36% : 0.000001s : 7: predicate.opt_reshape 0.38% : 0.000001s : 7: predicate.parallel_virtual_node 1.55% : 0.000005s : 32: predicate.partial_defer_inline 1.72% : 0.000006s : 35: predicate.partial_eliminate 0.93% : 0.000003s : 22: predicate.print_const_string_wrapper 0.66% : 0.000002s : 14: predicate.reduce_all_const_elim 1.16% : 0.000004s : 22: predicate.reduce_eliminate 2.62% : 0.000009s : 64: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 14: predicate.remove_not_recompute_node 1.43% : 0.000005s : 42: predicate.replace_applicator 0.51% : 0.000002s : 14: predicate.replace_old_param 0.29% : 0.000001s : 7: predicate.reset_defer_inline 0.97% : 0.000003s : 22: predicate.reshape_eliminate 0.60% : 0.000002s : 14: predicate.row_tensor_add_zeros_like 0.49% : 0.000002s : 7: predicate.row_tensor_eliminate 0.86% : 0.000003s : 14: predicate.same_eliminate 0.63% : 0.000002s : 14: predicate.set_cell_output_no_recompute 0.81% : 0.000003s : 14: predicate.shard_identity_eliminate 0.72% : 0.000002s : 14: predicate.special_op_eliminate 0.74% : 0.000002s : 14: predicate.specialize_transform 0.85% : 0.000003s : 14: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000003s : 14: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 7: predicate.switch_call_monad_eliminater 1.41% : 0.000005s : 32: predicate.switch_defer_inline 2.00% : 0.000007s : 46: predicate.switch_layer_defer_inline 4.67% : 0.000016s : 97: predicate.switch_simplify 0.92% : 0.000003s : 22: predicate.tile_eliminate 0.93% : 0.000003s : 22: predicate.transpose_eliminate 1.50% : 0.000005s : 36: predicate.tuple_list_convert_item_index_to_positive 1.78% : 0.000006s : 36: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000005s : 36: predicate.tuple_list_get_item_depend_reorder 3.12% : 0.000010s : 56: predicate.tuple_list_get_item_eliminator 1.54% : 0.000005s : 36: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000008s : 50: predicate.tuple_list_set_item_eliminator 1.76% : 0.000006s : 42: predicate.tuple_to_list_eliminator_ 2.48% : 0.000008s : 64: predicate.updatestate_pure_node_eliminater 3.21% : 0.000011s : 78: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 7: predicate.value_based_eliminate 0.72% : 0.000002s : 14: predicate.virtual_dataset_eliminate 0.62% : 0.000002s : 14: predicate.virtual_output_eliminate 0.31% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 7: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000672 11 52.36% : 0.000352s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.64% : 0.000320s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.032202 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.72% : 0.003775s : 1: add_attr 11.67% : 0.003759s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000073s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.34% : 0.000109s : 1: auto_monad 0.09% : 0.000029s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.53% : 0.000491s : 1: bootstrap 0.11% : 0.000037s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000022s : 1: control_data_broadcast_order 0.04% : 0.000013s : 1: convert_after_rewriter 0.10% : 0.000033s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000011s : 1: environ_conv 0.09% : 0.000030s : 1: event_method 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000011s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.54% : 0.000496s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 2.24% : 0.000721s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.06% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000023s : 1: opt.transform.mutable_eliminate 5.87% : 0.001890s : 78: opt.transform.opt_a 0.15% : 0.000047s : 1: opt.transform.opt_after_cconv 0.12% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.59% : 0.000190s : 28: opt.transform.opt_b 0.24% : 0.000077s : 2: opt.transform.opt_trans_graph 0.18% : 0.000058s : 4: opt.transform.symbol_engine_opt 12.46% : 0.004012s : 1: opt_a 0.47% : 0.000151s : 1: opt_after_cconv 1.65% : 0.000531s : 1: opt_after_jit_grad 1.05% : 0.000339s : 1: opt_b 20.84% : 0.006712s : 1: optimize 0.08% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000004s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.16% : 0.000051s : 1: pre_auto_parallel 0.12% : 0.000039s : 1: py_interpret_to_execute 0.06% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.16% : 0.000051s : 1: remove_dup_value 1.82% : 0.000588s : 1: renormalize.infer 1.38% : 0.000443s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000056s : 1: rewriter_after_opt_a 0.43% : 0.000138s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000107s : 1: symbol_engine_optimizer 0.34% : 0.000110s : 1: tuple_transform 20.52% : 0.006609s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:07.728.624 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:07.728.901 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0186169, [21] [bootstrap]: 0.00048452 [type_inference]: 0.00635125 [event_method]: 2.248e-05 [auto_monad]: 6.632e-05 [graph_reusing]: 5.80002e-06 [inline]: 2.24001e-06 [add_attr]: 0.00342262, [1] [add_attr_with_inline]: 0.00341186, [1] [Cycle 1]: 9.399e-05, [2] [tag_attr]: 2.402e-05 [meta_addattr_fg_expand]: 6.79999e-06 [parallel-infer-symbol]: 3.25e-06 [pre_auto_parallel]: 3.932e-05 [insert-virtual-dataset]: 2.46998e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 2.78e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.00688737, [53] [py_interpret_to_execute]: 3.671e-05 [rewriter_before_opt_a]: 0.0001077 [opt_a]: 0.00403777, [2] [Cycle 1]: 0.00295489, [45] [expand_dump_flag]: 2.91999e-06 [switch_simplify]: 4.769e-05 [loop_unroll]: 3.577e-05 [a_1]: 0.00092416 [with_stream_mark]: 2.266e-05 [recompute_prepare]: 1.392e-05 [updatestate_depend_eliminate]: 5.28002e-06 [updatestate_assign_eliminate]: 3.96001e-06 [updatestate_loads_eliminate]: 3.63e-06 [parameter_eliminate]: 1.92001e-06 [a_2]: 0.00014062 [accelerated_algorithm]: 1.038e-05 [shard]: 1.86e-06 [meta_shard_fg_expand]: 2.29999e-06 [shard_inline]: 9.41998e-06 [merge_send_recv]: 1.128e-05 [auto_parallel]: 8.28001e-06 [parallel]: 1.956e-05 [flash_sp]: 1e-05 [merge_comm]: 5.47999e-06 [allreduce_fusion]: 4.74e-06 [matmul_add_comm_reduction]: 1.18e-05 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 1.159e-05 [virtual_dataset]: 8.84998e-06 [get_grad_eliminate_]: 8.60999e-06 [virtual_output]: 8.64e-06 [merge_forward]: 5.54e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 3.338e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.177e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.522e-05 [set_forward_comm_id_for_comm_node_pass]: 5.74999e-06 [meta_fg_expand]: 3.62998e-06 [flash_sp_send_recv_attached]: 2.94001e-06 [receive_attached]: 2.27001e-06 [after_resolve]: 1.649e-05 [a_after_grad]: 1.43e-05 [renormalize]: 0.00091473 [add_forward_monad_depend]: 7.41001e-06 [auto_monad_grad]: 2.76999e-06 [auto_monad_eliminator]: 2.062e-05 [cse]: 3.715e-05 [a_3]: 8.1e-05 [Cycle 2]: 0.00106628, [45] [expand_dump_flag]: 1.76e-06 [switch_simplify]: 1.133e-05 [loop_unroll]: 8.55999e-06 [a_1]: 0.00021005 [with_stream_mark]: 1.626e-05 [recompute_prepare]: 1.078e-05 [updatestate_depend_eliminate]: 3.85e-06 [updatestate_assign_eliminate]: 3.49001e-06 [updatestate_loads_eliminate]: 3.33e-06 [parameter_eliminate]: 1.24e-06 [a_2]: 0.0001314 [accelerated_algorithm]: 1.001e-05 [shard]: 1.57001e-06 [meta_shard_fg_expand]: 2.47001e-06 [shard_inline]: 8.82999e-06 [merge_send_recv]: 8.68001e-06 [auto_parallel]: 9.10001e-06 [parallel]: 6.38e-06 [flash_sp]: 3.56001e-06 [merge_comm]: 4.97999e-06 [allreduce_fusion]: 4.63999e-06 [matmul_add_comm_reduction]: 9.13002e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 1.085e-05 [virtual_dataset]: 8.60001e-06 [get_grad_eliminate_]: 8.12e-06 [virtual_output]: 8.32e-06 [merge_forward]: 4.78001e-06 [cell_reuse_recompute_pass]: 1.74e-06 [offload_activation]: 1.012e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.026e-05 [merge_recompute_call_nodes]: 1.51998e-06 [before_grad]: 1.324e-05 [set_forward_comm_id_for_comm_node_pass]: 5.17e-06 [meta_fg_expand]: 2.84001e-06 [flash_sp_send_recv_attached]: 1.39e-06 [receive_attached]: 1.91e-06 [after_resolve]: 1.45e-05 [a_after_grad]: 1.319e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 2.19999e-06 [auto_monad_grad]: 1.59e-06 [auto_monad_eliminator]: 1.15e-05 [cse]: 2.265e-05 [a_3]: 6.57e-05 [py_interpret_to_execute_after_opt_a]: 1.93e-05 [slice_cell_reuse_recomputed_activation]: 5.57001e-06 [rewriter_after_opt_a]: 5.283e-05 [convert_after_rewriter]: 1.135e-05 [order_py_execute_after_rewriter]: 8.70999e-06 [mutable_eliminate]: 0.00069331 [opt_b]: 0.00035695, [1] [Cycle 1]: 0.00034667, [7] [b_1]: 0.0002296 [b_2]: 1.141e-05 [updatestate_depend_eliminate]: 8.87e-06 [updatestate_assign_eliminate]: 3.46001e-06 [updatestate_loads_eliminate]: 3.31999e-06 [renormalize]: 8.30012e-07 [cse]: 3.104e-05 [optimize_parallel_all_gather_comm]: 2.27e-05 [overlap_param_gather]: 4.54998e-06 [cconv]: 4.163e-05 [loop_unroll]: 0.00053415 [opt_after_cconv]: 0.00015794, [1] [Cycle 1]: 0.00014856, [7] [c_1]: 4.427e-05 [parameter_eliminate]: 4.42998e-06 [updatestate_depend_eliminate]: 7.46999e-06 [updatestate_assign_eliminate]: 3.86999e-06 [updatestate_loads_eliminate]: 3.70998e-06 [cse]: 2.645e-05 [renormalize]: 7.2e-07 [remove_dup_value]: 2.029e-05 [tuple_transform]: 0.00011463, [1] [Cycle 1]: 0.00010684, [4] [d_1]: 6.262e-05 [none_parameter_eliminate]: 2.06e-06 [renormalize]: 2.20025e-07 [switch_simplify]: 9.87999e-06 [partial_unused_args_eliminate]: 4.43999e-06 [add_recomputation]: 6.779e-05 [cse_after_recomputation]: 3.763e-05, [1] [Cycle 1]: 3.033e-05, [1] [cse]: 1.779e-05 [environ_conv]: 1.026e-05 [swap_dp_allreduce_reducescatter]: 9.73002e-06 [bias_add_comm_swap]: 6.07001e-06 [label_micro_interleaved_index]: 7.6e-06 [label_fine_grained_interleaved_index]: 5.05001e-06 [merge_cast_opt]: 3.81999e-06 [slice_recompute_activation]: 5.27001e-06 [micro_interleaved_order_control]: 4.72e-06 [assign_add_opt]: 3.59002e-06 [ForceFp32Comm]: 3.72002e-06 [remove_cast_before_assign_add]: 3.76999e-06 [full_micro_interleaved_order_control]: 4.4e-06 [reorder_send_recv_between_fp_bp]: 5.23002e-06 [comm_op_add_attrs]: 3.58e-06 [add_comm_op_reuse_tag]: 3.17002e-06 [interleave_split_concat_branches]: 3.94002e-06 [interleave_parallel_branches]: 3.36999e-06 [overlap_opt_shard_in_pipeline]: 3.73001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.54002e-06 [control_data_broadcast_order]: 2.088e-05 [grouped_pairwise_exchange_alltoall]: 4.1e-06 [offloading_packed_experts]: 8.28999e-06 [overlap_recompute_and_grad_model_parallel]: 8.69e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.08999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.65e-06 [overlap_recompute_comm]: 4.74e-06 [overlap_grad_ring_attention]: 7.23e-06 [overlap_grad_flash_sp]: 2.831e-05 [begin_end_overlap_inline]: 3.43999e-06 [split_matmul_comm_elemetwise]: 5.36002e-06 [split_layernorm_comm]: 4.11001e-06 [handle_group_info]: 3.6e-06 [symbol_engine_optimizer]: 0.00012179, [1] [Cycle 1]: 0.00011457, [6] [build]: 4.37e-06 [elim_shapecalc]: 1.664e-05 [elim_not_effective]: 1.774e-05 [opt_reshape]: 9.84001e-06 [fold_const_symbol]: 1.376e-05 [renormalize]: 6.69999e-07 [detach_backward]: 4.23999e-06 [pipeline_parallel_scheduler]: 1.79e-06 [auto_monad_reorder]: 2.6e-05 [get_jit_bprop_graph]: 2.12001e-06 [rewriter_after_jit_bprop_graph]: 6.26e-06 [opt_after_jit_grad]: 0.00057934 [validate]: 5.117e-05 Sums bootstrap : 0.000485s : 3.65% type_inference : 0.006351s : 47.83% event_method : 0.000022s : 0.17% auto_monad : 0.000066s : 0.50% graph_reusing : 0.000006s : 0.04% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000039s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000003s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000037s : 0.28% optimize.rewriter_before_opt_a : 0.000108s : 0.81% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000059s : 0.44% optimize.opt_a.loop_unroll : 0.000044s : 0.33% optimize.opt_a.a_1 : 0.001134s : 8.54% optimize.opt_a.with_stream_mark : 0.000039s : 0.29% optimize.opt_a.recompute_prepare : 0.000025s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000272s : 2.05% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000018s : 0.14% optimize.opt_a.merge_send_recv : 0.000020s : 0.15% optimize.opt_a.auto_parallel : 0.000017s : 0.13% optimize.opt_a.parallel : 0.000026s : 0.20% optimize.opt_a.flash_sp : 0.000014s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.17% optimize.opt_a.virtual_dataset : 0.000017s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.13% optimize.opt_a.virtual_output : 0.000017s : 0.13% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000043s : 0.33% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000042s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000028s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000031s : 0.23% optimize.opt_a.a_after_grad : 0.000027s : 0.21% optimize.opt_a.renormalize : 0.000915s : 6.89% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.24% optimize.opt_a.cse : 0.000060s : 0.45% optimize.opt_a.a_3 : 0.000147s : 1.10% optimize.py_interpret_to_execute_after_opt_a : 0.000019s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.04% optimize.rewriter_after_opt_a : 0.000053s : 0.40% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000693s : 5.22% optimize.opt_b.b_1 : 0.000230s : 1.73% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000031s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.03% optimize.cconv : 0.000042s : 0.31% optimize.loop_unroll : 0.000534s : 4.02% optimize.opt_after_cconv.c_1 : 0.000044s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000026s : 0.20% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000020s : 0.15% optimize.tuple_transform.d_1 : 0.000063s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.03% optimize.add_recomputation : 0.000068s : 0.51% optimize.cse_after_recomputation.cse : 0.000018s : 0.13% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.07% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.03% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.03% optimize.control_data_broadcast_order : 0.000021s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000028s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.13% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.01% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000026s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000579s : 4.36% validate : 0.000051s : 0.39% Time group info: ------[substitution.] 0.000260 44 10.25% : 0.000027s : 3: substitution.cast_eliminate 0.88% : 0.000002s : 3: substitution.elim_not_effective 0.82% : 0.000002s : 3: substitution.fold_const_symbol 3.16% : 0.000008s : 6: substitution.graph_param_transform 68.24% : 0.000177s : 4: substitution.inline 2.01% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.51% : 0.000007s : 6: substitution.remove_not_recompute_node 2.56% : 0.000007s : 6: substitution.replace_old_param 7.51% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.06% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006288 2 86.70% : 0.005451s : 1: type_inference.infer 13.30% : 0.000837s : 1: type_inference.specialize ------[replace.] 0.000081 10 49.84% : 0.000040s : 4: replace.inline 50.16% : 0.000041s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000190 10 91.48% : 0.000174s : 4: match.inline 8.52% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000303 1954 0.97% : 0.000003s : 21: predicate.accumulaten_eliminater 0.75% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.58% : 0.000002s : 12: predicate.addn_check_dump 0.97% : 0.000003s : 21: predicate.addn_zero_filter 0.86% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.06% : 0.000006s : 33: predicate.arithmetic_simplify 1.17% : 0.000004s : 21: predicate.cast_eliminate 0.56% : 0.000002s : 12: predicate.check_bprop_eliminate 0.52% : 0.000002s : 12: predicate.compare_switch_simplify 0.25% : 0.000001s : 6: predicate.const_output_eliminate 0.57% : 0.000002s : 12: predicate.depend_value_elim 1.14% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.10% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.95% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.83% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 6: predicate.elim_not_effective 0.50% : 0.000002s : 6: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 27: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 27: predicate.environ_get_depend_swap 1.85% : 0.000006s : 39: predicate.environ_get_eliminate 1.12% : 0.000003s : 27: predicate.environ_get_set_eliminate 1.42% : 0.000004s : 31: predicate.exchange_switch_depend_value 2.32% : 0.000007s : 31: predicate.float_depend_g_call 0.58% : 0.000002s : 12: predicate.float_environ_get_switch 0.79% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 6: predicate.fold_const_symbol 0.72% : 0.000002s : 12: predicate.get_grad_eliminate 0.19% : 0.000001s : 6: predicate.graph_param_transform 0.53% : 0.000002s : 12: predicate.incorporate_call 0.48% : 0.000001s : 12: predicate.incorporate_call_switch 6.47% : 0.000020s : 88: predicate.inline 0.77% : 0.000002s : 12: predicate.inline_without_move 0.34% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.86% : 0.000003s : 12: predicate.less_batch_normalization 1.89% : 0.000006s : 39: predicate.list_to_tuple_eliminator_ 2.73% : 0.000008s : 60: predicate.load_eliminater 0.82% : 0.000002s : 6: predicate.loop_unroll_after_grad 1.93% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.76% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.53% : 0.000002s : 12: predicate.merge_addn 0.55% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.56% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.89% : 0.000003s : 21: predicate.minmaximum_grad 0.80% : 0.000002s : 6: predicate.mutable_eliminate 0.33% : 0.000001s : 6: predicate.opt_reshape 0.36% : 0.000001s : 6: predicate.parallel_virtual_node 1.86% : 0.000006s : 31: predicate.partial_defer_inline 1.79% : 0.000005s : 33: predicate.partial_eliminate 0.98% : 0.000003s : 21: predicate.print_const_string_wrapper 0.56% : 0.000002s : 12: predicate.reduce_all_const_elim 1.13% : 0.000003s : 21: predicate.reduce_eliminate 2.65% : 0.000008s : 60: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 12: predicate.remove_not_recompute_node 1.46% : 0.000004s : 39: predicate.replace_applicator 0.47% : 0.000001s : 12: predicate.replace_old_param 0.31% : 0.000001s : 6: predicate.reset_defer_inline 0.94% : 0.000003s : 21: predicate.reshape_eliminate 0.57% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 6: predicate.row_tensor_eliminate 0.64% : 0.000002s : 12: predicate.same_eliminate 0.59% : 0.000002s : 12: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 12: predicate.shard_identity_eliminate 0.80% : 0.000002s : 12: predicate.special_op_eliminate 0.68% : 0.000002s : 12: predicate.specialize_transform 0.73% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.53% : 0.000005s : 31: predicate.switch_defer_inline 2.13% : 0.000006s : 43: predicate.switch_layer_defer_inline 4.59% : 0.000014s : 91: predicate.switch_simplify 0.95% : 0.000003s : 21: predicate.tile_eliminate 0.95% : 0.000003s : 21: predicate.transpose_eliminate 1.48% : 0.000004s : 33: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000004s : 33: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000010s : 51: predicate.tuple_list_get_item_eliminator 1.49% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.36% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.79% : 0.000005s : 39: predicate.tuple_to_list_eliminator_ 2.57% : 0.000008s : 60: predicate.updatestate_pure_node_eliminater 3.24% : 0.000010s : 72: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 6: predicate.value_based_eliminate 0.66% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.72% : 0.000002s : 12: predicate.virtual_output_eliminate 0.30% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000644 11 53.44% : 0.000344s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.56% : 0.000300s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.031829 192 0.02% : 0.000007s : 1: ForceFp32Comm 10.79% : 0.003433s : 1: add_attr 10.73% : 0.003416s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000073s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.24% : 0.000076s : 1: auto_monad 0.11% : 0.000034s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.66% : 0.000530s : 1: bootstrap 0.14% : 0.000045s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000024s : 1: control_data_broadcast_order 0.05% : 0.000015s : 1: convert_after_rewriter 0.13% : 0.000041s : 1: cse_after_recomputation 0.03% : 0.000009s : 1: dataset_repeat_opt 0.08% : 0.000024s : 1: detach_backward 0.04% : 0.000014s : 1: environ_conv 0.10% : 0.000033s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000011s : 1: label_micro_interleaved_index 1.70% : 0.000541s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 2.20% : 0.000701s : 1: mutable_eliminate 0.03% : 0.000011s : 1: offloading_packed_experts 0.06% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000021s : 1: opt.transform.mutable_eliminate 5.46% : 0.001737s : 78: opt.transform.opt_a 0.13% : 0.000043s : 1: opt.transform.opt_after_cconv 0.11% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.52% : 0.000166s : 28: opt.transform.opt_b 0.22% : 0.000070s : 2: opt.transform.opt_trans_graph 0.17% : 0.000053s : 4: opt.transform.symbol_engine_opt 12.70% : 0.004042s : 1: opt_a 0.51% : 0.000161s : 1: opt_after_cconv 1.86% : 0.000592s : 1: opt_after_jit_grad 1.13% : 0.000360s : 1: opt_b 22.85% : 0.007273s : 1: optimize 0.09% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000032s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000047s : 1: pre_auto_parallel 0.13% : 0.000041s : 1: py_interpret_to_execute 0.07% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000024s : 1: remove_dup_value 1.60% : 0.000511s : 1: renormalize.infer 1.24% : 0.000394s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000058s : 1: rewriter_after_opt_a 0.35% : 0.000112s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000009s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000125s : 1: symbol_engine_optimizer 0.37% : 0.000117s : 1: tuple_transform 20.09% : 0.006394s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:07.940.878 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0155853, [21] [bootstrap]: 0.00043547 [type_inference]: 0.00593835 [event_method]: 2.096e-05 [auto_monad]: 6.2e-05 [graph_reusing]: 5.89999e-06 [inline]: 2.39001e-06 [add_attr]: 0.00304841, [1] [add_attr_with_inline]: 0.00303896, [1] [Cycle 1]: 5.94e-05, [2] [tag_attr]: 2.106e-05 [meta_addattr_fg_expand]: 6.53e-06 [parallel-infer-symbol]: 3.33e-06 [pre_auto_parallel]: 3.577e-05 [insert-virtual-dataset]: 2.94001e-06 [parallel-infer-symbol-second]: 1.17999e-06 [dataset_repeat_opt]: 1.79998e-06 [pipeline_split]: 1.59998e-06 [optimize]: 0.00535506, [53] [py_interpret_to_execute]: 2.973e-05 [rewriter_before_opt_a]: 9.319e-05 [opt_a]: 0.00321025, [2] [Cycle 1]: 0.00239419, [45] [expand_dump_flag]: 2.94001e-06 [switch_simplify]: 4.84e-05 [loop_unroll]: 3.578e-05 [a_1]: 0.00086812 [with_stream_mark]: 1.633e-05 [recompute_prepare]: 1.174e-05 [updatestate_depend_eliminate]: 4.82998e-06 [updatestate_assign_eliminate]: 3.91001e-06 [updatestate_loads_eliminate]: 3.55e-06 [parameter_eliminate]: 1.89e-06 [a_2]: 0.00011371 [accelerated_algorithm]: 9.32001e-06 [shard]: 2.01e-06 [meta_shard_fg_expand]: 1.97001e-06 [shard_inline]: 8.95001e-06 [merge_send_recv]: 1.014e-05 [auto_parallel]: 6.96001e-06 [parallel]: 1.788e-05 [flash_sp]: 8.10999e-06 [merge_comm]: 5.59e-06 [allreduce_fusion]: 4.36002e-06 [matmul_add_comm_reduction]: 9.86998e-06 [allreduce_slice_to_reducescatter]: 8.30012e-07 [virtual_shard_identity]: 1.07e-05 [virtual_dataset]: 8.73001e-06 [get_grad_eliminate_]: 8.1e-06 [virtual_output]: 8.53001e-06 [merge_forward]: 4.56002e-06 [cell_reuse_recompute_pass]: 1.10999e-06 [offload_activation]: 1.086e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.724e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.373e-05 [set_forward_comm_id_for_comm_node_pass]: 4.72e-06 [meta_fg_expand]: 3.26999e-06 [flash_sp_send_recv_attached]: 2.63e-06 [receive_attached]: 2.88998e-06 [after_resolve]: 1.524e-05 [a_after_grad]: 1.317e-05 [renormalize]: 0.00070112 [add_forward_monad_depend]: 5.65001e-06 [auto_monad_grad]: 2.31e-06 [auto_monad_eliminator]: 1.694e-05 [cse]: 3.557e-05 [a_3]: 6.216e-05 [Cycle 2]: 0.00080634, [45] [expand_dump_flag]: 1.23002e-06 [switch_simplify]: 1.002e-05 [loop_unroll]: 8.32e-06 [a_1]: 0.00020193 [with_stream_mark]: 1.216e-05 [recompute_prepare]: 8.55001e-06 [updatestate_depend_eliminate]: 4.03001e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 3.09999e-06 [parameter_eliminate]: 1.25999e-06 [a_2]: 0.00010326 [accelerated_algorithm]: 8.44002e-06 [shard]: 1.13001e-06 [meta_shard_fg_expand]: 1.85001e-06 [shard_inline]: 8e-06 [merge_send_recv]: 5.61003e-06 [auto_parallel]: 6.41998e-06 [parallel]: 4.63001e-06 [flash_sp]: 3.28e-06 [merge_comm]: 4.12998e-06 [allreduce_fusion]: 4.1e-06 [matmul_add_comm_reduction]: 6.29001e-06 [allreduce_slice_to_reducescatter]: 4.10015e-07 [virtual_shard_identity]: 9.17001e-06 [virtual_dataset]: 8.07e-06 [get_grad_eliminate_]: 7.86001e-06 [virtual_output]: 7.75998e-06 [merge_forward]: 3.35e-06 [cell_reuse_recompute_pass]: 1.55001e-06 [offload_activation]: 7.33e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.443e-05 [merge_recompute_call_nodes]: 7.80012e-07 [before_grad]: 1.245e-05 [set_forward_comm_id_for_comm_node_pass]: 4.63999e-06 [meta_fg_expand]: 2.86999e-06 [flash_sp_send_recv_attached]: 9.09989e-07 [receive_attached]: 9.30013e-07 [after_resolve]: 1.279e-05 [a_after_grad]: 1.302e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.31002e-06 [auto_monad_grad]: 1.02e-06 [auto_monad_eliminator]: 8.05999e-06 [cse]: 1.802e-05 [a_3]: 5.068e-05 [py_interpret_to_execute_after_opt_a]: 1.09e-05 [slice_cell_reuse_recomputed_activation]: 2.02999e-06 [rewriter_after_opt_a]: 4.079e-05 [convert_after_rewriter]: 7.83001e-06 [order_py_execute_after_rewriter]: 5.72999e-06 [mutable_eliminate]: 0.00051408 [opt_b]: 0.00027046, [1] [Cycle 1]: 0.00026425, [7] [b_1]: 0.00017858 [b_2]: 1.037e-05 [updatestate_depend_eliminate]: 5.77001e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 3.17002e-06 [renormalize]: 3.19997e-07 [cse]: 2.224e-05 [optimize_parallel_all_gather_comm]: 1.773e-05 [overlap_param_gather]: 1.84e-06 [cconv]: 2.363e-05 [loop_unroll]: 0.00041854 [opt_after_cconv]: 0.0001184, [1] [Cycle 1]: 0.00011306, [7] [c_1]: 4.192e-05 [parameter_eliminate]: 2.56998e-06 [updatestate_depend_eliminate]: 5.81e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 3.26999e-06 [cse]: 2.193e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 1.444e-05 [tuple_transform]: 9.017e-05, [1] [Cycle 1]: 8.576e-05, [4] [d_1]: 5.661e-05 [none_parameter_eliminate]: 1.57001e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 8.83001e-06 [partial_unused_args_eliminate]: 1.76e-06 [add_recomputation]: 5.465e-05 [cse_after_recomputation]: 2.603e-05, [1] [Cycle 1]: 2.11e-05, [1] [cse]: 1.542e-05 [environ_conv]: 6.30997e-06 [swap_dp_allreduce_reducescatter]: 6.06e-06 [bias_add_comm_swap]: 2.53998e-06 [label_micro_interleaved_index]: 4.1e-06 [label_fine_grained_interleaved_index]: 2.62001e-06 [merge_cast_opt]: 1.22e-06 [slice_recompute_activation]: 1.94999e-06 [micro_interleaved_order_control]: 2.24001e-06 [assign_add_opt]: 1.22e-06 [ForceFp32Comm]: 7.80012e-07 [remove_cast_before_assign_add]: 1.20001e-06 [full_micro_interleaved_order_control]: 2.31998e-06 [reorder_send_recv_between_fp_bp]: 2.90998e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 1.17e-06 [interleave_split_concat_branches]: 1.29e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.17999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.72001e-06 [control_data_broadcast_order]: 1.435e-05 [grouped_pairwise_exchange_alltoall]: 1.86e-06 [offloading_packed_experts]: 4.69998e-06 [overlap_recompute_and_grad_model_parallel]: 5.27001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.67999e-06 [overlap_recompute_comm]: 2.21e-06 [overlap_grad_ring_attention]: 4.46002e-06 [overlap_grad_flash_sp]: 2.157e-05 [begin_end_overlap_inline]: 7.09988e-07 [split_matmul_comm_elemetwise]: 2.23998e-06 [split_layernorm_comm]: 1.54e-06 [handle_group_info]: 8.80013e-07 [symbol_engine_optimizer]: 8.349e-05, [1] [Cycle 1]: 7.949e-05, [6] [build]: 2.90002e-06 [elim_shapecalc]: 1.164e-05 [elim_not_effective]: 1.583e-05 [opt_reshape]: 8.58001e-06 [fold_const_symbol]: 1.249e-05 [renormalize]: 2.20025e-07 [detach_backward]: 2.09e-06 [pipeline_parallel_scheduler]: 1.41002e-06 [auto_monad_reorder]: 1.926e-05 [get_jit_bprop_graph]: 1.43002e-06 [rewriter_after_jit_bprop_graph]: 3.61001e-06 [opt_after_jit_grad]: 0.00045719 [validate]: 4.226e-05 Sums bootstrap : 0.000435s : 3.76% type_inference : 0.005938s : 51.23% event_method : 0.000021s : 0.18% auto_monad : 0.000062s : 0.53% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000036s : 0.31% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.26% optimize.rewriter_before_opt_a : 0.000093s : 0.80% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000058s : 0.50% optimize.opt_a.loop_unroll : 0.000044s : 0.38% optimize.opt_a.a_1 : 0.001070s : 9.23% optimize.opt_a.with_stream_mark : 0.000028s : 0.25% optimize.opt_a.recompute_prepare : 0.000020s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000217s : 1.87% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000017s : 0.15% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.12% optimize.opt_a.parallel : 0.000023s : 0.19% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.17% optimize.opt_a.virtual_dataset : 0.000017s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.14% optimize.opt_a.virtual_output : 0.000016s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000028s : 0.24% optimize.opt_a.a_after_grad : 0.000026s : 0.23% optimize.opt_a.renormalize : 0.000701s : 6.05% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.22% optimize.opt_a.cse : 0.000054s : 0.46% optimize.opt_a.a_3 : 0.000113s : 0.97% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000041s : 0.35% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000514s : 4.43% optimize.opt_b.b_1 : 0.000179s : 1.54% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.20% optimize.loop_unroll : 0.000419s : 3.61% optimize.opt_after_cconv.c_1 : 0.000042s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.12% optimize.tuple_transform.d_1 : 0.000057s : 0.49% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000055s : 0.47% optimize.cse_after_recomputation.cse : 0.000015s : 0.13% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000014s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000022s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000019s : 0.17% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000457s : 3.94% validate : 0.000042s : 0.36% Time group info: ------[substitution.] 0.000218 44 9.43% : 0.000021s : 3: substitution.cast_eliminate 1.06% : 0.000002s : 3: substitution.elim_not_effective 0.73% : 0.000002s : 3: substitution.fold_const_symbol 3.07% : 0.000007s : 6: substitution.graph_param_transform 67.67% : 0.000148s : 4: substitution.inline 1.86% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.63% : 0.000006s : 6: substitution.remove_not_recompute_node 2.46% : 0.000005s : 6: substitution.replace_old_param 8.73% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.35% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005878 2 87.34% : 0.005134s : 1: type_inference.infer 12.66% : 0.000744s : 1: type_inference.specialize ------[replace.] 0.000072 10 50.93% : 0.000037s : 4: replace.inline 49.07% : 0.000035s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000161 10 89.85% : 0.000145s : 4: match.inline 10.15% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000288 1954 0.99% : 0.000003s : 21: predicate.accumulaten_eliminater 0.68% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.54% : 0.000002s : 12: predicate.addn_check_dump 0.96% : 0.000003s : 21: predicate.addn_zero_filter 0.91% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.01% : 0.000006s : 33: predicate.arithmetic_simplify 1.10% : 0.000003s : 21: predicate.cast_eliminate 0.57% : 0.000002s : 12: predicate.check_bprop_eliminate 0.54% : 0.000002s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.60% : 0.000002s : 12: predicate.depend_value_elim 1.03% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.16% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.94% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.80% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 6: predicate.elim_not_effective 0.34% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 27: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 27: predicate.environ_get_add_eliminate 1.22% : 0.000004s : 27: predicate.environ_get_depend_swap 1.78% : 0.000005s : 39: predicate.environ_get_eliminate 1.21% : 0.000003s : 27: predicate.environ_get_set_eliminate 1.47% : 0.000004s : 31: predicate.exchange_switch_depend_value 2.27% : 0.000007s : 31: predicate.float_depend_g_call 0.54% : 0.000002s : 12: predicate.float_environ_get_switch 0.83% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.63% : 0.000002s : 12: predicate.get_grad_eliminate 0.20% : 0.000001s : 6: predicate.graph_param_transform 0.58% : 0.000002s : 12: predicate.incorporate_call 0.50% : 0.000001s : 12: predicate.incorporate_call_switch 6.29% : 0.000018s : 88: predicate.inline 0.74% : 0.000002s : 12: predicate.inline_without_move 0.35% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.75% : 0.000002s : 12: predicate.less_batch_normalization 1.83% : 0.000005s : 39: predicate.list_to_tuple_eliminator_ 2.68% : 0.000008s : 60: predicate.load_eliminater 0.66% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.09% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.71% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.55% : 0.000002s : 12: predicate.merge_addn 0.56% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.58% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.92% : 0.000003s : 21: predicate.minmaximum_grad 0.80% : 0.000002s : 6: predicate.mutable_eliminate 0.32% : 0.000001s : 6: predicate.opt_reshape 0.33% : 0.000001s : 6: predicate.parallel_virtual_node 1.76% : 0.000005s : 31: predicate.partial_defer_inline 1.83% : 0.000005s : 33: predicate.partial_eliminate 0.97% : 0.000003s : 21: predicate.print_const_string_wrapper 0.58% : 0.000002s : 12: predicate.reduce_all_const_elim 1.20% : 0.000003s : 21: predicate.reduce_eliminate 2.70% : 0.000008s : 60: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 12: predicate.remove_not_recompute_node 1.56% : 0.000004s : 39: predicate.replace_applicator 0.47% : 0.000001s : 12: predicate.replace_old_param 0.26% : 0.000001s : 6: predicate.reset_defer_inline 1.03% : 0.000003s : 21: predicate.reshape_eliminate 0.59% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 6: predicate.row_tensor_eliminate 0.86% : 0.000002s : 12: predicate.same_eliminate 0.42% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 12: predicate.shard_identity_eliminate 0.71% : 0.000002s : 12: predicate.special_op_eliminate 0.69% : 0.000002s : 12: predicate.specialize_transform 0.80% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.55% : 0.000004s : 31: predicate.switch_defer_inline 2.13% : 0.000006s : 43: predicate.switch_layer_defer_inline 4.78% : 0.000014s : 91: predicate.switch_simplify 0.94% : 0.000003s : 21: predicate.tile_eliminate 0.97% : 0.000003s : 21: predicate.transpose_eliminate 1.62% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.71% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.54% : 0.000004s : 33: predicate.tuple_list_get_item_depend_reorder 3.20% : 0.000009s : 51: predicate.tuple_list_get_item_eliminator 1.60% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.83% : 0.000005s : 39: predicate.tuple_to_list_eliminator_ 2.67% : 0.000008s : 60: predicate.updatestate_pure_node_eliminater 3.27% : 0.000009s : 72: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 6: predicate.value_based_eliminate 0.62% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.62% : 0.000002s : 12: predicate.virtual_output_eliminate 0.28% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000557 11 53.05% : 0.000296s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.95% : 0.000262s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026575 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.49% : 0.003054s : 1: add_attr 11.45% : 0.003043s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000059s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.25% : 0.000068s : 1: auto_monad 0.09% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.74% : 0.000464s : 1: bootstrap 0.10% : 0.000027s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.10% : 0.000026s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.61% : 0.000427s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.97% : 0.000522s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 6.19% : 0.001645s : 78: opt.transform.opt_a 0.15% : 0.000041s : 1: opt.transform.opt_after_cconv 0.11% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.59% : 0.000157s : 28: opt.transform.opt_b 0.24% : 0.000063s : 2: opt.transform.opt_trans_graph 0.17% : 0.000045s : 4: opt.transform.symbol_engine_opt 12.09% : 0.003214s : 1: opt_a 0.46% : 0.000122s : 1: opt_after_cconv 1.75% : 0.000466s : 1: opt_after_jit_grad 1.03% : 0.000274s : 1: opt_b 20.17% : 0.005359s : 1: optimize 0.08% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000040s : 1: pre_auto_parallel 0.13% : 0.000034s : 1: py_interpret_to_execute 0.05% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000018s : 1: remove_dup_value 1.40% : 0.000371s : 1: renormalize.infer 1.21% : 0.000322s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000045s : 1: rewriter_after_opt_a 0.37% : 0.000097s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000086s : 1: symbol_engine_optimizer 0.35% : 0.000093s : 1: tuple_transform 22.40% : 0.005953s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:08.128.779 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:08.129.055 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.015788, [21] [bootstrap]: 0.00043306 [type_inference]: 0.00567949 [event_method]: 2.005e-05 [auto_monad]: 6.241e-05 [graph_reusing]: 5.92001e-06 [inline]: 2.11e-06 [add_attr]: 0.0030714, [1] [add_attr_with_inline]: 0.00306287, [1] [Cycle 1]: 6.947e-05, [2] [tag_attr]: 1.925e-05 [meta_addattr_fg_expand]: 6.57002e-06 [parallel-infer-symbol]: 3.04001e-06 [pre_auto_parallel]: 3.224e-05 [insert-virtual-dataset]: 2.09999e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 1.69998e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00537898, [53] [py_interpret_to_execute]: 3.858e-05 [rewriter_before_opt_a]: 9.398e-05 [opt_a]: 0.00307098, [2] [Cycle 1]: 0.00219004, [45] [expand_dump_flag]: 3.2e-06 [switch_simplify]: 4.721e-05 [loop_unroll]: 3.556e-05 [a_1]: 0.0007227 [with_stream_mark]: 1.483e-05 [recompute_prepare]: 9.31e-06 [updatestate_depend_eliminate]: 3.76001e-06 [updatestate_assign_eliminate]: 3.5e-06 [updatestate_loads_eliminate]: 3.17002e-06 [parameter_eliminate]: 1.99e-06 [a_2]: 0.00012161 [accelerated_algorithm]: 7.46001e-06 [shard]: 1.94e-06 [meta_shard_fg_expand]: 1.83002e-06 [shard_inline]: 7.35998e-06 [merge_send_recv]: 8.07998e-06 [auto_parallel]: 5.99e-06 [parallel]: 1.784e-05 [flash_sp]: 7.87e-06 [merge_comm]: 4.22e-06 [allreduce_fusion]: 3.48999e-06 [matmul_add_comm_reduction]: 8.85999e-06 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 8.65001e-06 [virtual_dataset]: 7.46001e-06 [get_grad_eliminate_]: 6.88e-06 [virtual_output]: 7.13998e-06 [merge_forward]: 3.68e-06 [cell_reuse_recompute_pass]: 1.22999e-06 [offload_activation]: 1.042e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.472e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 1.145e-05 [set_forward_comm_id_for_comm_node_pass]: 3.51001e-06 [meta_fg_expand]: 2.86999e-06 [flash_sp_send_recv_attached]: 2.52001e-06 [receive_attached]: 2.46e-06 [after_resolve]: 1.316e-05 [a_after_grad]: 1.131e-05 [renormalize]: 0.00054829 [add_forward_monad_depend]: 5.13002e-06 [auto_monad_grad]: 1.86998e-06 [auto_monad_eliminator]: 1.409e-05 [cse]: 3.046e-05 [a_3]: 6.497e-05 [Cycle 2]: 0.00086769, [45] [expand_dump_flag]: 1.07e-06 [switch_simplify]: 9.05001e-06 [loop_unroll]: 7.25e-06 [a_1]: 0.00015907 [with_stream_mark]: 1.049e-05 [recompute_prepare]: 7.23e-06 [updatestate_depend_eliminate]: 3.03998e-06 [updatestate_assign_eliminate]: 2.97002e-06 [updatestate_loads_eliminate]: 2.21998e-06 [parameter_eliminate]: 1.06997e-06 [a_2]: 0.00011029 [accelerated_algorithm]: 7.09001e-06 [shard]: 1.10999e-06 [meta_shard_fg_expand]: 1.43002e-06 [shard_inline]: 6.87002e-06 [merge_send_recv]: 4.72998e-06 [auto_parallel]: 5.47001e-06 [parallel]: 4.23001e-06 [flash_sp]: 3.50003e-06 [merge_comm]: 3.2e-06 [allreduce_fusion]: 2.88998e-06 [matmul_add_comm_reduction]: 5.75001e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 7.48e-06 [virtual_dataset]: 6.69999e-06 [get_grad_eliminate_]: 7.20998e-06 [virtual_output]: 6.59001e-06 [merge_forward]: 2.69001e-06 [cell_reuse_recompute_pass]: 1.45999e-06 [offload_activation]: 6.48e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.569e-05 [merge_recompute_call_nodes]: 7.29982e-07 [before_grad]: 1.012e-05 [set_forward_comm_id_for_comm_node_pass]: 3.35e-06 [meta_fg_expand]: 2.17001e-06 [flash_sp_send_recv_attached]: 8.80013e-07 [receive_attached]: 1.12e-06 [after_resolve]: 1.218e-05 [a_after_grad]: 1.058e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.31002e-06 [auto_monad_grad]: 8.89995e-07 [auto_monad_eliminator]: 6.94999e-06 [cse]: 1.395e-05 [a_3]: 5.316e-05 [py_interpret_to_execute_after_opt_a]: 1.212e-05 [slice_cell_reuse_recomputed_activation]: 5.20999e-06 [rewriter_after_opt_a]: 4.14e-05 [convert_after_rewriter]: 1.001e-05 [order_py_execute_after_rewriter]: 7.78001e-06 [mutable_eliminate]: 0.0005198 [opt_b]: 0.00029941, [1] [Cycle 1]: 0.00029032, [7] [b_1]: 0.00019604 [b_2]: 8.74998e-06 [updatestate_depend_eliminate]: 4.79e-06 [updatestate_assign_eliminate]: 2.53e-06 [updatestate_loads_eliminate]: 2.54001e-06 [renormalize]: 3.89991e-07 [cse]: 1.894e-05 [optimize_parallel_all_gather_comm]: 1.906e-05 [overlap_param_gather]: 4.67e-06 [cconv]: 2.549e-05 [loop_unroll]: 0.00043007 [opt_after_cconv]: 0.00012929, [1] [Cycle 1]: 0.00012074, [7] [c_1]: 3.625e-05 [parameter_eliminate]: 2.38998e-06 [updatestate_depend_eliminate]: 5.12e-06 [updatestate_assign_eliminate]: 2.43e-06 [updatestate_loads_eliminate]: 2.36e-06 [cse]: 1.746e-05 [renormalize]: 2.50002e-07 [remove_dup_value]: 1.58e-05 [tuple_transform]: 9.603e-05, [1] [Cycle 1]: 8.864e-05, [4] [d_1]: 4.847e-05 [none_parameter_eliminate]: 1.57001e-06 [renormalize]: 1.70025e-07 [switch_simplify]: 7.75e-06 [partial_unused_args_eliminate]: 4.33999e-06 [add_recomputation]: 4.892e-05 [cse_after_recomputation]: 2.856e-05, [1] [Cycle 1]: 2.158e-05, [1] [cse]: 1.225e-05 [environ_conv]: 7.88001e-06 [swap_dp_allreduce_reducescatter]: 7.88001e-06 [bias_add_comm_swap]: 5.12e-06 [label_micro_interleaved_index]: 6.55002e-06 [label_fine_grained_interleaved_index]: 5.04e-06 [merge_cast_opt]: 3.71999e-06 [slice_recompute_activation]: 4.26001e-06 [micro_interleaved_order_control]: 4.59002e-06 [assign_add_opt]: 3.56999e-06 [ForceFp32Comm]: 3.25e-06 [remove_cast_before_assign_add]: 3.38e-06 [full_micro_interleaved_order_control]: 4.62e-06 [reorder_send_recv_between_fp_bp]: 5.42001e-06 [comm_op_add_attrs]: 3.23998e-06 [add_comm_op_reuse_tag]: 3.16999e-06 [interleave_split_concat_branches]: 3.50998e-06 [interleave_parallel_branches]: 3.36001e-06 [overlap_opt_shard_in_pipeline]: 3.35998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.00998e-06 [control_data_broadcast_order]: 1.423e-05 [grouped_pairwise_exchange_alltoall]: 3.86999e-06 [offloading_packed_experts]: 6.07001e-06 [overlap_recompute_and_grad_model_parallel]: 7.42002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.5e-06 [overlap_recompute_allgather_and_fa_grad]: 3.55998e-06 [overlap_recompute_comm]: 4.54002e-06 [overlap_grad_ring_attention]: 6.42001e-06 [overlap_grad_flash_sp]: 2.024e-05 [begin_end_overlap_inline]: 2.83e-06 [split_matmul_comm_elemetwise]: 4.22e-06 [split_layernorm_comm]: 4.22003e-06 [handle_group_info]: 3.27002e-06 [symbol_engine_optimizer]: 9.821e-05, [1] [Cycle 1]: 9.146e-05, [6] [build]: 2.82002e-06 [elim_shapecalc]: 1.059e-05 [elim_not_effective]: 1.386e-05 [opt_reshape]: 8.32e-06 [fold_const_symbol]: 1.088e-05 [renormalize]: 2.10013e-07 [detach_backward]: 2.84999e-06 [pipeline_parallel_scheduler]: 1.67999e-06 [auto_monad_reorder]: 1.945e-05 [get_jit_bprop_graph]: 1.27e-06 [rewriter_after_jit_bprop_graph]: 4.58001e-06 [opt_after_jit_grad]: 0.00047487 [validate]: 3.858e-05 Sums bootstrap : 0.000433s : 3.93% type_inference : 0.005679s : 51.51% event_method : 0.000020s : 0.18% auto_monad : 0.000062s : 0.57% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000032s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000039s : 0.35% optimize.rewriter_before_opt_a : 0.000094s : 0.85% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000056s : 0.51% optimize.opt_a.loop_unroll : 0.000043s : 0.39% optimize.opt_a.a_1 : 0.000882s : 8.00% optimize.opt_a.with_stream_mark : 0.000025s : 0.23% optimize.opt_a.recompute_prepare : 0.000017s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000232s : 2.10% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.13% optimize.opt_a.merge_send_recv : 0.000013s : 0.12% optimize.opt_a.auto_parallel : 0.000011s : 0.10% optimize.opt_a.parallel : 0.000022s : 0.20% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000006s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.15% optimize.opt_a.virtual_dataset : 0.000014s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000014s : 0.12% optimize.opt_a.merge_forward : 0.000006s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.23% optimize.opt_a.a_after_grad : 0.000022s : 0.20% optimize.opt_a.renormalize : 0.000548s : 4.97% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.19% optimize.opt_a.cse : 0.000044s : 0.40% optimize.opt_a.a_3 : 0.000118s : 1.07% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000041s : 0.38% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000520s : 4.71% optimize.opt_b.b_1 : 0.000196s : 1.78% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000019s : 0.17% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000025s : 0.23% optimize.loop_unroll : 0.000430s : 3.90% optimize.opt_after_cconv.c_1 : 0.000036s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.16% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.14% optimize.tuple_transform.d_1 : 0.000048s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000049s : 0.44% optimize.cse_after_recomputation.cse : 0.000012s : 0.11% optimize.environ_conv : 0.000008s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000014s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000006s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000006s : 0.06% optimize.overlap_grad_flash_sp : 0.000020s : 0.18% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000019s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000475s : 4.31% validate : 0.000039s : 0.35% Time group info: ------[substitution.] 0.000186 34 1.11% : 0.000002s : 2: substitution.elim_not_effective 0.73% : 0.000001s : 2: substitution.fold_const_symbol 3.11% : 0.000006s : 5: substitution.graph_param_transform 74.93% : 0.000139s : 4: substitution.inline 2.05% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.48% : 0.000005s : 4: substitution.remove_not_recompute_node 2.77% : 0.000005s : 6: substitution.replace_old_param 9.74% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 3.06% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005632 2 87.09% : 0.004905s : 1: type_inference.infer 12.91% : 0.000727s : 1: type_inference.specialize ------[replace.] 0.000069 10 53.94% : 0.000037s : 4: replace.inline 46.06% : 0.000032s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000152 10 89.85% : 0.000136s : 4: match.inline 10.15% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000233 1590 0.99% : 0.000002s : 16: predicate.accumulaten_eliminater 0.67% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 10: predicate.addn_check_dump 0.96% : 0.000002s : 16: predicate.addn_zero_filter 0.85% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 1.97% : 0.000005s : 26: predicate.arithmetic_simplify 0.99% : 0.000002s : 16: predicate.cast_eliminate 0.58% : 0.000001s : 10: predicate.check_bprop_eliminate 0.54% : 0.000001s : 10: predicate.compare_switch_simplify 0.21% : 0.000000s : 5: predicate.const_output_eliminate 0.63% : 0.000001s : 10: predicate.depend_value_elim 0.96% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.02% : 0.000002s : 16: predicate.dict_get_item_eliminator 0.95% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.99% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 5: predicate.elim_not_effective 0.39% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.10% : 0.000003s : 21: predicate.environ_get_depend_swap 1.65% : 0.000004s : 31: predicate.environ_get_eliminate 1.07% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.51% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.46% : 0.000006s : 26: predicate.float_depend_g_call 0.54% : 0.000001s : 10: predicate.float_environ_get_switch 0.80% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.64% : 0.000001s : 10: predicate.get_grad_eliminate 0.21% : 0.000000s : 5: predicate.graph_param_transform 0.57% : 0.000001s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.30% : 0.000015s : 72: predicate.inline 0.70% : 0.000002s : 10: predicate.inline_without_move 0.36% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.67% : 0.000002s : 10: predicate.less_batch_normalization 1.92% : 0.000004s : 32: predicate.list_to_tuple_eliminator_ 2.72% : 0.000006s : 48: predicate.load_eliminater 0.84% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.46% : 0.000006s : 40: predicate.loop_unroll_before_grad 1.55% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 10: predicate.merge_addn 0.54% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 16: predicate.minmaximum_grad 0.79% : 0.000002s : 5: predicate.mutable_eliminate 0.35% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.88% : 0.000004s : 26: predicate.partial_defer_inline 1.85% : 0.000004s : 27: predicate.partial_eliminate 0.90% : 0.000002s : 16: predicate.print_const_string_wrapper 0.57% : 0.000001s : 10: predicate.reduce_all_const_elim 1.13% : 0.000003s : 16: predicate.reduce_eliminate 2.79% : 0.000007s : 48: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 10: predicate.remove_not_recompute_node 1.60% : 0.000004s : 32: predicate.replace_applicator 0.45% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 0.92% : 0.000002s : 16: predicate.reshape_eliminate 0.61% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 5: predicate.row_tensor_eliminate 0.72% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.70% : 0.000002s : 10: predicate.shard_identity_eliminate 0.68% : 0.000002s : 10: predicate.special_op_eliminate 0.71% : 0.000002s : 10: predicate.specialize_transform 0.77% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.62% : 0.000004s : 26: predicate.switch_defer_inline 2.25% : 0.000005s : 36: predicate.switch_layer_defer_inline 5.47% : 0.000013s : 81: predicate.switch_simplify 0.92% : 0.000002s : 16: predicate.tile_eliminate 0.90% : 0.000002s : 16: predicate.transpose_eliminate 1.45% : 0.000003s : 26: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 26: predicate.tuple_list_get_item_depend_reorder 3.37% : 0.000008s : 42: predicate.tuple_list_get_item_eliminator 1.47% : 0.000003s : 26: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.83% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.55% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.32% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 5: predicate.value_based_eliminate 0.61% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000519 11 53.94% : 0.000280s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.06% : 0.000239s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026333 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.69% : 0.003080s : 1: add_attr 11.65% : 0.003067s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.20% : 0.000053s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000071s : 1: auto_monad 0.10% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000005s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.81% : 0.000478s : 1: bootstrap 0.11% : 0.000029s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.06% : 0.000017s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000032s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000016s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.11% : 0.000030s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000009s : 1: label_micro_interleaved_index 1.66% : 0.000436s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.00% : 0.000526s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.05% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000014s : 1: opt.transform.mutable_eliminate 5.16% : 0.001360s : 78: opt.transform.opt_a 0.13% : 0.000035s : 1: opt.transform.opt_after_cconv 0.10% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.49% : 0.000130s : 28: opt.transform.opt_b 0.21% : 0.000054s : 2: opt.transform.opt_trans_graph 0.15% : 0.000040s : 4: opt.transform.symbol_engine_opt 11.67% : 0.003074s : 1: opt_a 0.50% : 0.000133s : 1: opt_after_cconv 1.84% : 0.000485s : 1: opt_after_jit_grad 1.15% : 0.000303s : 1: opt_b 21.63% : 0.005696s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000040s : 1: pre_auto_parallel 0.16% : 0.000043s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.02% : 0.000270s : 1: renormalize.infer 1.03% : 0.000271s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000045s : 1: rewriter_after_opt_a 0.37% : 0.000098s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000101s : 1: symbol_engine_optimizer 0.38% : 0.000099s : 1: tuple_transform 21.68% : 0.005710s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:08.325.603 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0144479, [21] [bootstrap]: 0.00042694 [type_inference]: 0.00563992 [event_method]: 1.899e-05 [auto_monad]: 6.021e-05 [graph_reusing]: 5.35001e-06 [inline]: 2.06e-06 [add_attr]: 0.00295693, [1] [add_attr_with_inline]: 0.00294883, [1] [Cycle 1]: 5.328e-05, [2] [tag_attr]: 1.897e-05 [meta_addattr_fg_expand]: 5.97001e-06 [parallel-infer-symbol]: 3.11999e-06 [pre_auto_parallel]: 3.206e-05 [insert-virtual-dataset]: 2.21e-06 [parallel-infer-symbol-second]: 7.40023e-07 [dataset_repeat_opt]: 1.92001e-06 [pipeline_split]: 1.67999e-06 [optimize]: 0.00464698, [53] [py_interpret_to_execute]: 2.621e-05 [rewriter_before_opt_a]: 8.659e-05 [opt_a]: 0.00270832, [2] [Cycle 1]: 0.00198268, [45] [expand_dump_flag]: 3.03e-06 [switch_simplify]: 4.59e-05 [loop_unroll]: 3.495e-05 [a_1]: 0.00070359 [with_stream_mark]: 1.407e-05 [recompute_prepare]: 9.23002e-06 [updatestate_depend_eliminate]: 3.6e-06 [updatestate_assign_eliminate]: 3.06999e-06 [updatestate_loads_eliminate]: 2.68e-06 [parameter_eliminate]: 1.99e-06 [a_2]: 8.85e-05 [accelerated_algorithm]: 7.15998e-06 [shard]: 1.76e-06 [meta_shard_fg_expand]: 1.76e-06 [shard_inline]: 7.11999e-06 [merge_send_recv]: 8.52e-06 [auto_parallel]: 5.52001e-06 [parallel]: 1.776e-05 [flash_sp]: 7.00002e-06 [merge_comm]: 3.95e-06 [allreduce_fusion]: 3.4e-06 [matmul_add_comm_reduction]: 9.35001e-06 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 8.37e-06 [virtual_dataset]: 7.33e-06 [get_grad_eliminate_]: 6.94999e-06 [virtual_output]: 6.91001e-06 [merge_forward]: 3.91001e-06 [cell_reuse_recompute_pass]: 1.17e-06 [offload_activation]: 9.46e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.205e-05 [merge_recompute_call_nodes]: 2.05002e-06 [before_grad]: 1.048e-05 [set_forward_comm_id_for_comm_node_pass]: 4.12e-06 [meta_fg_expand]: 2.74999e-06 [flash_sp_send_recv_attached]: 2.42001e-06 [receive_attached]: 2.32999e-06 [after_resolve]: 1.327e-05 [a_after_grad]: 1.111e-05 [renormalize]: 0.00055582 [add_forward_monad_depend]: 4.50999e-06 [auto_monad_grad]: 1.79e-06 [auto_monad_eliminator]: 1.341e-05 [cse]: 3.11e-05 [a_3]: 4.954e-05 [Cycle 2]: 0.00071625, [45] [expand_dump_flag]: 1.12e-06 [switch_simplify]: 8.34002e-06 [loop_unroll]: 6.96001e-06 [a_1]: 0.000157 [with_stream_mark]: 1.048e-05 [recompute_prepare]: 7.18998e-06 [updatestate_depend_eliminate]: 2.88e-06 [updatestate_assign_eliminate]: 2.63e-06 [updatestate_loads_eliminate]: 2.11e-06 [parameter_eliminate]: 1.10999e-06 [a_2]: 8.084e-05 [accelerated_algorithm]: 6.71999e-06 [shard]: 1.14e-06 [meta_shard_fg_expand]: 1.34998e-06 [shard_inline]: 6.73998e-06 [merge_send_recv]: 2.996e-05 [auto_parallel]: 5.72001e-06 [parallel]: 4.60999e-06 [flash_sp]: 3.4e-06 [merge_comm]: 3.43999e-06 [allreduce_fusion]: 3.04001e-06 [matmul_add_comm_reduction]: 5.25001e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 7.45e-06 [virtual_dataset]: 6.63e-06 [get_grad_eliminate_]: 6.94999e-06 [virtual_output]: 6.36e-06 [merge_forward]: 2.61999e-06 [cell_reuse_recompute_pass]: 1.34998e-06 [offload_activation]: 6.26e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.287e-05 [merge_recompute_call_nodes]: 7.2e-07 [before_grad]: 9.48997e-06 [set_forward_comm_id_for_comm_node_pass]: 3.34001e-06 [meta_fg_expand]: 2.36e-06 [flash_sp_send_recv_attached]: 8.50006e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 1.175e-05 [a_after_grad]: 1.014e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.20999e-06 [auto_monad_grad]: 1.02e-06 [auto_monad_eliminator]: 7.03e-06 [cse]: 1.497e-05 [a_3]: 4.031e-05 [py_interpret_to_execute_after_opt_a]: 7.92e-06 [slice_cell_reuse_recomputed_activation]: 2.12001e-06 [rewriter_after_opt_a]: 3.296e-05 [convert_after_rewriter]: 6.84001e-06 [order_py_execute_after_rewriter]: 4.77998e-06 [mutable_eliminate]: 0.00045555 [opt_b]: 0.00022165, [1] [Cycle 1]: 0.00021542, [7] [b_1]: 0.00014296 [b_2]: 9.23002e-06 [updatestate_depend_eliminate]: 4.85999e-06 [updatestate_assign_eliminate]: 2.37999e-06 [updatestate_loads_eliminate]: 2.21e-06 [renormalize]: 3.69997e-07 [cse]: 1.728e-05 [optimize_parallel_all_gather_comm]: 1.58e-05 [overlap_param_gather]: 1.94e-06 [cconv]: 2.318e-05 [loop_unroll]: 0.00041202 [opt_after_cconv]: 0.00010359, [1] [Cycle 1]: 9.828e-05, [7] [c_1]: 3.517e-05 [parameter_eliminate]: 2.73e-06 [updatestate_depend_eliminate]: 4.80999e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.32001e-06 [cse]: 1.691e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.309e-05 [tuple_transform]: 7.979e-05, [1] [Cycle 1]: 7.535e-05, [4] [d_1]: 4.799e-05 [none_parameter_eliminate]: 1.77001e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 7.26999e-06 [partial_unused_args_eliminate]: 1.91e-06 [add_recomputation]: 4.459e-05 [cse_after_recomputation]: 2.083e-05, [1] [Cycle 1]: 1.633e-05, [1] [cse]: 1.099e-05 [environ_conv]: 4.35999e-06 [swap_dp_allreduce_reducescatter]: 4.87e-06 [bias_add_comm_swap]: 2.41e-06 [label_micro_interleaved_index]: 4.27e-06 [label_fine_grained_interleaved_index]: 2.83e-06 [merge_cast_opt]: 1.29e-06 [slice_recompute_activation]: 2.09999e-06 [micro_interleaved_order_control]: 2.51998e-06 [assign_add_opt]: 1.12e-06 [ForceFp32Comm]: 7.50006e-07 [remove_cast_before_assign_add]: 9.80013e-07 [full_micro_interleaved_order_control]: 2.12999e-06 [reorder_send_recv_between_fp_bp]: 2.38998e-06 [comm_op_add_attrs]: 9.5999e-07 [add_comm_op_reuse_tag]: 8.80013e-07 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.00999e-06 [overlap_opt_shard_in_pipeline]: 1.07998e-06 [overlap_opt_shard_grad_in_pipeline]: 1.96e-06 [control_data_broadcast_order]: 1.162e-05 [grouped_pairwise_exchange_alltoall]: 1.51998e-06 [offloading_packed_experts]: 3.60998e-06 [overlap_recompute_and_grad_model_parallel]: 4.11001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.10999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.31002e-06 [overlap_recompute_comm]: 2.34001e-06 [overlap_grad_ring_attention]: 4.26001e-06 [overlap_grad_flash_sp]: 1.696e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.04999e-06 [split_layernorm_comm]: 1.54998e-06 [handle_group_info]: 1.22e-06 [symbol_engine_optimizer]: 7.58e-05, [1] [Cycle 1]: 7.174e-05, [6] [build]: 2.58e-06 [elim_shapecalc]: 9.99999e-06 [elim_not_effective]: 1.256e-05 [opt_reshape]: 8.02e-06 [fold_const_symbol]: 1.048e-05 [renormalize]: 2.50002e-07 [detach_backward]: 1.74e-06 [pipeline_parallel_scheduler]: 1.83997e-06 [auto_monad_reorder]: 1.699e-05 [get_jit_bprop_graph]: 1.25001e-06 [rewriter_after_jit_bprop_graph]: 3.43999e-06 [opt_after_jit_grad]: 0.00044863 [validate]: 3.398e-05 Sums bootstrap : 0.000427s : 4.04% type_inference : 0.005640s : 53.39% event_method : 0.000019s : 0.18% auto_monad : 0.000060s : 0.57% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000032s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000026s : 0.25% optimize.rewriter_before_opt_a : 0.000087s : 0.82% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000054s : 0.51% optimize.opt_a.loop_unroll : 0.000042s : 0.40% optimize.opt_a.a_1 : 0.000861s : 8.15% optimize.opt_a.with_stream_mark : 0.000025s : 0.23% optimize.opt_a.recompute_prepare : 0.000016s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000169s : 1.60% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.13% optimize.opt_a.merge_send_recv : 0.000038s : 0.36% optimize.opt_a.auto_parallel : 0.000011s : 0.11% optimize.opt_a.parallel : 0.000022s : 0.21% optimize.opt_a.flash_sp : 0.000010s : 0.10% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000006s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.15% optimize.opt_a.virtual_dataset : 0.000014s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000013s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000016s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.24% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000020s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.24% optimize.opt_a.a_after_grad : 0.000021s : 0.20% optimize.opt_a.renormalize : 0.000556s : 5.26% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.05% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000020s : 0.19% optimize.opt_a.cse : 0.000046s : 0.44% optimize.opt_a.a_3 : 0.000090s : 0.85% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.07% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000033s : 0.31% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000456s : 4.31% optimize.opt_b.b_1 : 0.000143s : 1.35% optimize.opt_b.b_2 : 0.000009s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000017s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000023s : 0.22% optimize.loop_unroll : 0.000412s : 3.90% optimize.opt_after_cconv.c_1 : 0.000035s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.16% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.12% optimize.tuple_transform.d_1 : 0.000048s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000045s : 0.42% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000004s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000017s : 0.16% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000017s : 0.16% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.03% opt_after_jit_grad : 0.000449s : 4.25% validate : 0.000034s : 0.32% Time group info: ------[substitution.] 0.000182 34 0.93% : 0.000002s : 2: substitution.elim_not_effective 0.72% : 0.000001s : 2: substitution.fold_const_symbol 3.15% : 0.000006s : 5: substitution.graph_param_transform 75.47% : 0.000137s : 4: substitution.inline 1.67% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.32% : 0.000004s : 4: substitution.remove_not_recompute_node 2.63% : 0.000005s : 6: substitution.replace_old_param 10.17% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.94% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005582 2 87.20% : 0.004868s : 1: type_inference.infer 12.80% : 0.000714s : 1: type_inference.specialize ------[replace.] 0.000067 10 54.19% : 0.000036s : 4: replace.inline 45.81% : 0.000031s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000150 10 89.56% : 0.000135s : 4: match.inline 10.44% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000227 1590 0.93% : 0.000002s : 16: predicate.accumulaten_eliminater 0.67% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 10: predicate.addn_check_dump 0.93% : 0.000002s : 16: predicate.addn_zero_filter 0.87% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 1.94% : 0.000004s : 26: predicate.arithmetic_simplify 0.96% : 0.000002s : 16: predicate.cast_eliminate 0.57% : 0.000001s : 10: predicate.check_bprop_eliminate 0.50% : 0.000001s : 10: predicate.compare_switch_simplify 0.21% : 0.000000s : 5: predicate.const_output_eliminate 0.60% : 0.000001s : 10: predicate.depend_value_elim 1.04% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 16: predicate.dict_get_item_eliminator 1.05% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.87% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.34% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.10% : 0.000002s : 21: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 21: predicate.environ_get_depend_swap 1.70% : 0.000004s : 31: predicate.environ_get_eliminate 1.13% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.53% : 0.000003s : 26: predicate.exchange_switch_depend_value 2.51% : 0.000006s : 26: predicate.float_depend_g_call 0.52% : 0.000001s : 10: predicate.float_environ_get_switch 0.77% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.24% : 0.000001s : 5: predicate.fold_const_symbol 0.64% : 0.000001s : 10: predicate.get_grad_eliminate 0.22% : 0.000000s : 5: predicate.graph_param_transform 0.59% : 0.000001s : 10: predicate.incorporate_call 0.49% : 0.000001s : 10: predicate.incorporate_call_switch 6.15% : 0.000014s : 72: predicate.inline 0.74% : 0.000002s : 10: predicate.inline_without_move 0.37% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.70% : 0.000002s : 10: predicate.less_batch_normalization 1.95% : 0.000004s : 32: predicate.list_to_tuple_eliminator_ 2.74% : 0.000006s : 48: predicate.load_eliminater 0.78% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.46% : 0.000006s : 40: predicate.loop_unroll_before_grad 1.64% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 10: predicate.merge_addn 0.56% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 16: predicate.minmaximum_grad 0.90% : 0.000002s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.33% : 0.000001s : 5: predicate.parallel_virtual_node 1.87% : 0.000004s : 26: predicate.partial_defer_inline 1.81% : 0.000004s : 27: predicate.partial_eliminate 0.93% : 0.000002s : 16: predicate.print_const_string_wrapper 0.65% : 0.000001s : 10: predicate.reduce_all_const_elim 1.20% : 0.000003s : 16: predicate.reduce_eliminate 2.64% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 10: predicate.remove_not_recompute_node 1.49% : 0.000003s : 32: predicate.replace_applicator 0.53% : 0.000001s : 10: predicate.replace_old_param 0.26% : 0.000001s : 5: predicate.reset_defer_inline 0.93% : 0.000002s : 16: predicate.reshape_eliminate 0.63% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 5: predicate.row_tensor_eliminate 0.69% : 0.000002s : 10: predicate.same_eliminate 0.47% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.71% : 0.000002s : 10: predicate.shard_identity_eliminate 0.64% : 0.000001s : 10: predicate.special_op_eliminate 0.65% : 0.000001s : 10: predicate.specialize_transform 0.74% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.63% : 0.000004s : 26: predicate.switch_defer_inline 2.16% : 0.000005s : 36: predicate.switch_layer_defer_inline 5.33% : 0.000012s : 81: predicate.switch_simplify 0.90% : 0.000002s : 16: predicate.tile_eliminate 0.95% : 0.000002s : 16: predicate.transpose_eliminate 1.50% : 0.000003s : 26: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 26: predicate.tuple_list_get_item_depend_reorder 3.28% : 0.000007s : 42: predicate.tuple_list_get_item_eliminator 1.48% : 0.000003s : 26: predicate.tuple_list_get_set_item_eliminator 2.24% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.82% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.64% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.24% : 0.000007s : 58: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 5: predicate.value_based_eliminate 0.67% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 10: predicate.virtual_output_eliminate 0.30% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000507 11 55.49% : 0.000282s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.51% : 0.000226s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024116 192 0.01% : 0.000003s : 1: ForceFp32Comm 12.28% : 0.002961s : 1: add_attr 12.24% : 0.002953s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.20% : 0.000049s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000065s : 1: auto_monad 0.09% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.88% : 0.000454s : 1: bootstrap 0.11% : 0.000026s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000015s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.10% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.10% : 0.000024s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.74% : 0.000420s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.92% : 0.000463s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 5.48% : 0.001323s : 78: opt.transform.opt_a 0.14% : 0.000034s : 1: opt.transform.opt_after_cconv 0.11% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.51% : 0.000122s : 28: opt.transform.opt_b 0.22% : 0.000053s : 2: opt.transform.opt_trans_graph 0.16% : 0.000038s : 4: opt.transform.symbol_engine_opt 11.24% : 0.002711s : 1: opt_a 0.44% : 0.000107s : 1: opt_after_cconv 1.90% : 0.000457s : 1: opt_after_jit_grad 0.93% : 0.000225s : 1: opt_b 19.29% : 0.004651s : 1: optimize 0.08% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.08% : 0.000020s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000036s : 1: pre_auto_parallel 0.13% : 0.000030s : 1: py_interpret_to_execute 0.05% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000016s : 1: remove_dup_value 1.24% : 0.000300s : 1: renormalize.infer 1.03% : 0.000249s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000037s : 1: rewriter_after_opt_a 0.38% : 0.000091s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000079s : 1: symbol_engine_optimizer 0.34% : 0.000083s : 1: tuple_transform 23.44% : 0.005654s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:08.508.036 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:08.508.281 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0160768, [21] [bootstrap]: 0.00042906 [type_inference]: 0.0056744 [event_method]: 1.956e-05 [auto_monad]: 6.176e-05 [graph_reusing]: 6.63e-06 [inline]: 1.98002e-06 [add_attr]: 0.00291356, [1] [add_attr_with_inline]: 0.00290589, [1] [Cycle 1]: 6.434e-05, [2] [tag_attr]: 1.92e-05 [meta_addattr_fg_expand]: 6.44999e-06 [parallel-infer-symbol]: 2.63e-06 [pre_auto_parallel]: 3.303e-05 [insert-virtual-dataset]: 2.53e-06 [parallel-infer-symbol-second]: 6.40022e-07 [dataset_repeat_opt]: 2.04999e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.00581719, [53] [py_interpret_to_execute]: 3.218e-05 [rewriter_before_opt_a]: 9.726e-05 [opt_a]: 0.00343028, [2] [Cycle 1]: 0.00244049, [45] [expand_dump_flag]: 3.12002e-06 [switch_simplify]: 4.635e-05 [loop_unroll]: 3.613e-05 [a_1]: 0.00087635 [with_stream_mark]: 1.548e-05 [recompute_prepare]: 1.149e-05 [updatestate_depend_eliminate]: 4.76002e-06 [updatestate_assign_eliminate]: 4.22003e-06 [updatestate_loads_eliminate]: 3.71001e-06 [parameter_eliminate]: 2.27999e-06 [a_2]: 0.00013926 [accelerated_algorithm]: 9.05001e-06 [shard]: 1.72999e-06 [meta_shard_fg_expand]: 2.09999e-06 [shard_inline]: 8.70999e-06 [merge_send_recv]: 9.05001e-06 [auto_parallel]: 6.71e-06 [parallel]: 1.745e-05 [flash_sp]: 8.38001e-06 [merge_comm]: 4.89998e-06 [allreduce_fusion]: 4.72e-06 [matmul_add_comm_reduction]: 9.79e-06 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 9.61e-06 [virtual_dataset]: 8.64998e-06 [get_grad_eliminate_]: 8.37e-06 [virtual_output]: 8.3e-06 [merge_forward]: 4.54998e-06 [cell_reuse_recompute_pass]: 1.77001e-06 [offload_activation]: 1.004e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.76e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.323e-05 [set_forward_comm_id_for_comm_node_pass]: 4.57e-06 [meta_fg_expand]: 3.8e-06 [flash_sp_send_recv_attached]: 2.29999e-06 [receive_attached]: 2.51998e-06 [after_resolve]: 1.444e-05 [a_after_grad]: 1.339e-05 [renormalize]: 0.00058513 [add_forward_monad_depend]: 4.96997e-06 [auto_monad_grad]: 2.35002e-06 [auto_monad_eliminator]: 1.643e-05 [cse]: 3.353e-05 [a_3]: 7.395e-05 [Cycle 2]: 0.00097718, [45] [expand_dump_flag]: 1.15999e-06 [switch_simplify]: 1.005e-05 [loop_unroll]: 8.25e-06 [a_1]: 0.00020318 [with_stream_mark]: 1.181e-05 [recompute_prepare]: 8.52e-06 [updatestate_depend_eliminate]: 3.62002e-06 [updatestate_assign_eliminate]: 3.03e-06 [updatestate_loads_eliminate]: 2.93e-06 [parameter_eliminate]: 1.01002e-06 [a_2]: 0.00012986 [accelerated_algorithm]: 8.3e-06 [shard]: 1.06997e-06 [meta_shard_fg_expand]: 1.62999e-06 [shard_inline]: 8.07e-06 [merge_send_recv]: 5.56998e-06 [auto_parallel]: 9.04003e-06 [parallel]: 4.39002e-06 [flash_sp]: 3.36001e-06 [merge_comm]: 4.46002e-06 [allreduce_fusion]: 3.85998e-06 [matmul_add_comm_reduction]: 6.46999e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 9.01998e-06 [virtual_dataset]: 8.27e-06 [get_grad_eliminate_]: 8.25999e-06 [virtual_output]: 7.68001e-06 [merge_forward]: 3.36001e-06 [cell_reuse_recompute_pass]: 1.59e-06 [offload_activation]: 8.04002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.656e-05 [merge_recompute_call_nodes]: 7.2e-07 [before_grad]: 1.237e-05 [set_forward_comm_id_for_comm_node_pass]: 4.32e-06 [meta_fg_expand]: 2.76999e-06 [flash_sp_send_recv_attached]: 9.20001e-07 [receive_attached]: 9.89996e-07 [after_resolve]: 1.388e-05 [a_after_grad]: 1.293e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.22e-06 [auto_monad_grad]: 8.30012e-07 [auto_monad_eliminator]: 8.37e-06 [cse]: 1.879e-05 [a_3]: 6.385e-05 [py_interpret_to_execute_after_opt_a]: 1.338e-05 [slice_cell_reuse_recomputed_activation]: 4.47e-06 [rewriter_after_opt_a]: 4.17e-05 [convert_after_rewriter]: 1.078e-05 [order_py_execute_after_rewriter]: 9.00999e-06 [mutable_eliminate]: 0.00047076 [opt_b]: 0.00033763, [1] [Cycle 1]: 0.00032837, [7] [b_1]: 0.00022751 [b_2]: 1.015e-05 [updatestate_depend_eliminate]: 6.02999e-06 [updatestate_assign_eliminate]: 3.21999e-06 [updatestate_loads_eliminate]: 3.55e-06 [renormalize]: 4.19997e-07 [cse]: 2.124e-05 [optimize_parallel_all_gather_comm]: 2.056e-05 [overlap_param_gather]: 4.68999e-06 [cconv]: 2.634e-05 [loop_unroll]: 0.00045881 [opt_after_cconv]: 0.00014284, [1] [Cycle 1]: 0.00013412, [7] [c_1]: 4.273e-05 [parameter_eliminate]: 2.49001e-06 [updatestate_depend_eliminate]: 5.86998e-06 [updatestate_assign_eliminate]: 3.34001e-06 [updatestate_loads_eliminate]: 3.07002e-06 [cse]: 2.11e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 1.756e-05 [tuple_transform]: 0.00010677, [1] [Cycle 1]: 9.968e-05, [4] [d_1]: 5.79e-05 [none_parameter_eliminate]: 1.68002e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 8.89e-06 [partial_unused_args_eliminate]: 4.43001e-06 [add_recomputation]: 5.949e-05 [cse_after_recomputation]: 3.213e-05, [1] [Cycle 1]: 2.483e-05, [1] [cse]: 1.584e-05 [environ_conv]: 9.70002e-06 [swap_dp_allreduce_reducescatter]: 8.88002e-06 [bias_add_comm_swap]: 4.88001e-06 [label_micro_interleaved_index]: 7.21001e-06 [label_fine_grained_interleaved_index]: 5.43002e-06 [merge_cast_opt]: 3.73001e-06 [slice_recompute_activation]: 4.33001e-06 [micro_interleaved_order_control]: 4.50001e-06 [assign_add_opt]: 3.61001e-06 [ForceFp32Comm]: 3.15998e-06 [remove_cast_before_assign_add]: 3.23998e-06 [full_micro_interleaved_order_control]: 4.39002e-06 [reorder_send_recv_between_fp_bp]: 5.05001e-06 [comm_op_add_attrs]: 3.43e-06 [add_comm_op_reuse_tag]: 3.38e-06 [interleave_split_concat_branches]: 3.62002e-06 [interleave_parallel_branches]: 3.41001e-06 [overlap_opt_shard_in_pipeline]: 3.85e-06 [overlap_opt_shard_grad_in_pipeline]: 4.27998e-06 [control_data_broadcast_order]: 1.776e-05 [grouped_pairwise_exchange_alltoall]: 4.13999e-06 [offloading_packed_experts]: 7.11999e-06 [overlap_recompute_and_grad_model_parallel]: 7.8e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.45e-06 [overlap_recompute_allgather_and_fa_grad]: 3.81999e-06 [overlap_recompute_comm]: 5.10999e-06 [overlap_grad_ring_attention]: 7.36001e-06 [overlap_grad_flash_sp]: 2.363e-05 [begin_end_overlap_inline]: 2.94999e-06 [split_matmul_comm_elemetwise]: 4.46002e-06 [split_layernorm_comm]: 4.13001e-06 [handle_group_info]: 3.18998e-06 [symbol_engine_optimizer]: 0.0001055, [1] [Cycle 1]: 9.844e-05, [6] [build]: 2.76999e-06 [elim_shapecalc]: 1.138e-05 [elim_not_effective]: 1.624e-05 [opt_reshape]: 9.29e-06 [fold_const_symbol]: 1.339e-05 [renormalize]: 2.09984e-07 [detach_backward]: 3.13e-06 [pipeline_parallel_scheduler]: 1.77999e-06 [auto_monad_reorder]: 2.153e-05 [get_jit_bprop_graph]: 1.18001e-06 [rewriter_after_jit_bprop_graph]: 4.01001e-06 [opt_after_jit_grad]: 0.00047663 [validate]: 3.981e-05 Sums bootstrap : 0.000429s : 3.74% type_inference : 0.005674s : 49.48% event_method : 0.000020s : 0.17% auto_monad : 0.000062s : 0.54% graph_reusing : 0.000007s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000033s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000032s : 0.28% optimize.rewriter_before_opt_a : 0.000097s : 0.85% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000056s : 0.49% optimize.opt_a.loop_unroll : 0.000044s : 0.39% optimize.opt_a.a_1 : 0.001080s : 9.41% optimize.opt_a.with_stream_mark : 0.000027s : 0.24% optimize.opt_a.recompute_prepare : 0.000020s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000269s : 2.35% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.15% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000017s : 0.15% optimize.opt_a.merge_send_recv : 0.000015s : 0.13% optimize.opt_a.auto_parallel : 0.000016s : 0.14% optimize.opt_a.parallel : 0.000022s : 0.19% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.16% optimize.opt_a.virtual_dataset : 0.000017s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.15% optimize.opt_a.virtual_output : 0.000016s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000028s : 0.25% optimize.opt_a.a_after_grad : 0.000026s : 0.23% optimize.opt_a.renormalize : 0.000585s : 5.10% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.05% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.22% optimize.opt_a.cse : 0.000052s : 0.46% optimize.opt_a.a_3 : 0.000138s : 1.20% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.04% optimize.rewriter_after_opt_a : 0.000042s : 0.36% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000471s : 4.11% optimize.opt_b.b_1 : 0.000228s : 1.98% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000021s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000026s : 0.23% optimize.loop_unroll : 0.000459s : 4.00% optimize.opt_after_cconv.c_1 : 0.000043s : 0.37% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000021s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.15% optimize.tuple_transform.d_1 : 0.000058s : 0.50% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000059s : 0.52% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000024s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000022s : 0.19% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000477s : 4.16% validate : 0.000040s : 0.35% Time group info: ------[substitution.] 0.000212 44 9.51% : 0.000020s : 3: substitution.cast_eliminate 1.18% : 0.000002s : 3: substitution.elim_not_effective 0.85% : 0.000002s : 3: substitution.fold_const_symbol 3.22% : 0.000007s : 6: substitution.graph_param_transform 65.89% : 0.000139s : 4: substitution.inline 2.10% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.04% : 0.000006s : 6: substitution.remove_not_recompute_node 2.52% : 0.000005s : 6: substitution.replace_old_param 8.95% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.74% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005627 2 86.72% : 0.004880s : 1: type_inference.infer 13.28% : 0.000747s : 1: type_inference.specialize ------[replace.] 0.000073 10 49.64% : 0.000036s : 4: replace.inline 50.36% : 0.000037s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000153 10 89.45% : 0.000137s : 4: match.inline 10.55% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000293 1954 0.95% : 0.000003s : 21: predicate.accumulaten_eliminater 0.59% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.53% : 0.000002s : 12: predicate.addn_check_dump 0.97% : 0.000003s : 21: predicate.addn_zero_filter 0.91% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 1.89% : 0.000006s : 33: predicate.arithmetic_simplify 1.10% : 0.000003s : 21: predicate.cast_eliminate 0.59% : 0.000002s : 12: predicate.check_bprop_eliminate 0.55% : 0.000002s : 12: predicate.compare_switch_simplify 0.18% : 0.000001s : 6: predicate.const_output_eliminate 0.60% : 0.000002s : 12: predicate.depend_value_elim 1.04% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.06% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.93% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.84% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 6: predicate.elim_not_effective 0.34% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.18% : 0.000003s : 27: predicate.environ_get_add_eliminate 1.20% : 0.000004s : 27: predicate.environ_get_depend_swap 1.77% : 0.000005s : 39: predicate.environ_get_eliminate 1.19% : 0.000003s : 27: predicate.environ_get_set_eliminate 1.51% : 0.000004s : 31: predicate.exchange_switch_depend_value 2.39% : 0.000007s : 31: predicate.float_depend_g_call 0.54% : 0.000002s : 12: predicate.float_environ_get_switch 0.87% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.63% : 0.000002s : 12: predicate.get_grad_eliminate 0.20% : 0.000001s : 6: predicate.graph_param_transform 0.60% : 0.000002s : 12: predicate.incorporate_call 0.52% : 0.000002s : 12: predicate.incorporate_call_switch 6.18% : 0.000018s : 88: predicate.inline 0.77% : 0.000002s : 12: predicate.inline_without_move 0.34% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.73% : 0.000002s : 12: predicate.less_batch_normalization 2.05% : 0.000006s : 39: predicate.list_to_tuple_eliminator_ 2.68% : 0.000008s : 60: predicate.load_eliminater 0.72% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.05% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.62% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.55% : 0.000002s : 12: predicate.merge_addn 0.59% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.90% : 0.000003s : 21: predicate.minmaximum_grad 0.66% : 0.000002s : 6: predicate.mutable_eliminate 0.37% : 0.000001s : 6: predicate.opt_reshape 0.34% : 0.000001s : 6: predicate.parallel_virtual_node 1.72% : 0.000005s : 31: predicate.partial_defer_inline 1.85% : 0.000005s : 33: predicate.partial_eliminate 0.93% : 0.000003s : 21: predicate.print_const_string_wrapper 0.59% : 0.000002s : 12: predicate.reduce_all_const_elim 1.20% : 0.000003s : 21: predicate.reduce_eliminate 2.72% : 0.000008s : 60: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 12: predicate.remove_not_recompute_node 1.49% : 0.000004s : 39: predicate.replace_applicator 0.46% : 0.000001s : 12: predicate.replace_old_param 0.34% : 0.000001s : 6: predicate.reset_defer_inline 1.06% : 0.000003s : 21: predicate.reshape_eliminate 0.59% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 6: predicate.row_tensor_eliminate 0.69% : 0.000002s : 12: predicate.same_eliminate 0.42% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.64% : 0.000002s : 12: predicate.shard_identity_eliminate 0.65% : 0.000002s : 12: predicate.special_op_eliminate 0.72% : 0.000002s : 12: predicate.specialize_transform 0.75% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.56% : 0.000005s : 31: predicate.switch_defer_inline 2.10% : 0.000006s : 43: predicate.switch_layer_defer_inline 4.76% : 0.000014s : 91: predicate.switch_simplify 1.01% : 0.000003s : 21: predicate.tile_eliminate 1.01% : 0.000003s : 21: predicate.transpose_eliminate 1.58% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 3.31% : 0.000010s : 51: predicate.tuple_list_get_item_eliminator 1.70% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.35% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.84% : 0.000005s : 39: predicate.tuple_to_list_eliminator_ 2.73% : 0.000008s : 60: predicate.updatestate_pure_node_eliminater 3.37% : 0.000010s : 72: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 6: predicate.value_based_eliminate 0.61% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.59% : 0.000002s : 12: predicate.virtual_output_eliminate 0.30% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.52% : 0.000002s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000505 11 54.61% : 0.000276s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.39% : 0.000229s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027299 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.70% : 0.002922s : 1: add_attr 10.66% : 0.002909s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000063s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000070s : 1: auto_monad 0.11% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.72% : 0.000470s : 1: bootstrap 0.11% : 0.000029s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000017s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.11% : 0.000029s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.05% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.70% : 0.000465s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.74% : 0.000476s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 6.06% : 0.001653s : 78: opt.transform.opt_a 0.15% : 0.000041s : 1: opt.transform.opt_after_cconv 0.11% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.60% : 0.000163s : 28: opt.transform.opt_b 0.24% : 0.000065s : 2: opt.transform.opt_trans_graph 0.17% : 0.000047s : 4: opt.transform.symbol_engine_opt 12.58% : 0.003433s : 1: opt_a 0.54% : 0.000146s : 1: opt_after_cconv 1.78% : 0.000487s : 1: opt_after_jit_grad 1.25% : 0.000341s : 1: opt_b 22.54% : 0.006154s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000009s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000040s : 1: pre_auto_parallel 0.13% : 0.000036s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000021s : 1: remove_dup_value 1.12% : 0.000305s : 1: renormalize.infer 1.00% : 0.000273s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000045s : 1: rewriter_after_opt_a 0.37% : 0.000101s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000108s : 1: symbol_engine_optimizer 0.40% : 0.000110s : 1: tuple_transform 20.90% : 0.005704s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:08.697.169 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0156082, [21] [bootstrap]: 0.00043381 [type_inference]: 0.00584485 [event_method]: 2.081e-05 [auto_monad]: 6.489e-05 [graph_reusing]: 6.04999e-06 [inline]: 2.12999e-06 [add_attr]: 0.00299538, [1] [add_attr_with_inline]: 0.00298642, [1] [Cycle 1]: 5.959e-05, [2] [tag_attr]: 2.007e-05 [meta_addattr_fg_expand]: 6.48e-06 [parallel-infer-symbol]: 3.01999e-06 [pre_auto_parallel]: 3.432e-05 [insert-virtual-dataset]: 2.71e-06 [parallel-infer-symbol-second]: 7.49977e-07 [dataset_repeat_opt]: 1.91e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00549612, [53] [py_interpret_to_execute]: 2.984e-05 [rewriter_before_opt_a]: 9.36e-05 [opt_a]: 0.00327653, [2] [Cycle 1]: 0.00241833, [45] [expand_dump_flag]: 2.83e-06 [switch_simplify]: 4.915e-05 [loop_unroll]: 4.735e-05 [a_1]: 0.00088383 [with_stream_mark]: 1.774e-05 [recompute_prepare]: 1.346e-05 [updatestate_depend_eliminate]: 5.30999e-06 [updatestate_assign_eliminate]: 4.13999e-06 [updatestate_loads_eliminate]: 4.06001e-06 [parameter_eliminate]: 1.88002e-06 [a_2]: 0.00011062 [accelerated_algorithm]: 9.77999e-06 [shard]: 2.61e-06 [meta_shard_fg_expand]: 2.06998e-06 [shard_inline]: 8.97e-06 [merge_send_recv]: 1.069e-05 [auto_parallel]: 7.14001e-06 [parallel]: 1.799e-05 [flash_sp]: 8.62998e-06 [merge_comm]: 5.44998e-06 [allreduce_fusion]: 4.27e-06 [matmul_add_comm_reduction]: 1.036e-05 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 1.003e-05 [virtual_dataset]: 8.89998e-06 [get_grad_eliminate_]: 8.07e-06 [virtual_output]: 8.35001e-06 [merge_forward]: 4.60999e-06 [cell_reuse_recompute_pass]: 1.15001e-06 [offload_activation]: 1.065e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.726e-05 [merge_recompute_call_nodes]: 1.55999e-06 [before_grad]: 1.398e-05 [set_forward_comm_id_for_comm_node_pass]: 4.45e-06 [meta_fg_expand]: 3.75e-06 [flash_sp_send_recv_attached]: 3.09001e-06 [receive_attached]: 2.16998e-06 [after_resolve]: 1.536e-05 [a_after_grad]: 1.376e-05 [renormalize]: 0.00067654 [add_forward_monad_depend]: 5.94999e-06 [auto_monad_grad]: 2.05002e-06 [auto_monad_eliminator]: 1.796e-05 [cse]: 3.696e-05 [a_3]: 6.298e-05 [Cycle 2]: 0.00084815, [45] [expand_dump_flag]: 1.30999e-06 [switch_simplify]: 1.065e-05 [loop_unroll]: 8.43999e-06 [a_1]: 0.00020276 [with_stream_mark]: 1.402e-05 [recompute_prepare]: 8.87e-06 [updatestate_depend_eliminate]: 4.12e-06 [updatestate_assign_eliminate]: 3.17002e-06 [updatestate_loads_eliminate]: 3.11999e-06 [parameter_eliminate]: 1.24e-06 [a_2]: 0.00010185 [accelerated_algorithm]: 8.59e-06 [shard]: 1.64998e-06 [meta_shard_fg_expand]: 2.24999e-06 [shard_inline]: 8.23001e-06 [merge_send_recv]: 6.43e-06 [auto_parallel]: 6.56e-06 [parallel]: 4.77e-06 [flash_sp]: 3.52002e-06 [merge_comm]: 5.14e-06 [allreduce_fusion]: 4.15e-06 [matmul_add_comm_reduction]: 7.38999e-06 [allreduce_slice_to_reducescatter]: 6.99976e-07 [virtual_shard_identity]: 1.15e-05 [virtual_dataset]: 7.81001e-06 [get_grad_eliminate_]: 7.68999e-06 [virtual_output]: 8.07e-06 [merge_forward]: 3.82998e-06 [cell_reuse_recompute_pass]: 1.60001e-06 [offload_activation]: 8.25e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.628e-05 [merge_recompute_call_nodes]: 1.09e-06 [before_grad]: 1.339e-05 [set_forward_comm_id_for_comm_node_pass]: 4.45e-06 [meta_fg_expand]: 3.04999e-06 [flash_sp_send_recv_attached]: 1.37999e-06 [receive_attached]: 1.44e-06 [after_resolve]: 1.397e-05 [a_after_grad]: 1.387e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.43e-06 [auto_monad_grad]: 1.00001e-06 [auto_monad_eliminator]: 1.059e-05 [cse]: 2.108e-05 [a_3]: 5.184e-05 [py_interpret_to_execute_after_opt_a]: 1.181e-05 [slice_cell_reuse_recomputed_activation]: 2.53e-06 [rewriter_after_opt_a]: 4.308e-05 [convert_after_rewriter]: 7.66999e-06 [order_py_execute_after_rewriter]: 5.92999e-06 [mutable_eliminate]: 0.00049966 [opt_b]: 0.00030086, [1] [Cycle 1]: 0.00029416, [7] [b_1]: 0.00017865 [b_2]: 3.285e-05 [updatestate_depend_eliminate]: 8.30999e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 3.6e-06 [renormalize]: 6.80011e-07 [cse]: 2.712e-05 [optimize_parallel_all_gather_comm]: 2.003e-05 [overlap_param_gather]: 2.06e-06 [cconv]: 2.729e-05 [loop_unroll]: 0.00044614 [opt_after_cconv]: 0.00012514, [1] [Cycle 1]: 0.00011933, [7] [c_1]: 4.229e-05 [parameter_eliminate]: 3.21001e-06 [updatestate_depend_eliminate]: 7.19001e-06 [updatestate_assign_eliminate]: 3.50998e-06 [updatestate_loads_eliminate]: 3.05002e-06 [cse]: 2.471e-05 [renormalize]: 5.19998e-07 [remove_dup_value]: 1.548e-05 [tuple_transform]: 9.142e-05, [1] [Cycle 1]: 8.716e-05, [4] [d_1]: 5.823e-05 [none_parameter_eliminate]: 1.69e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 8.70001e-06 [partial_unused_args_eliminate]: 2.07001e-06 [add_recomputation]: 5.739e-05 [cse_after_recomputation]: 2.568e-05, [1] [Cycle 1]: 2.108e-05, [1] [cse]: 1.563e-05 [environ_conv]: 6.11e-06 [swap_dp_allreduce_reducescatter]: 6.28e-06 [bias_add_comm_swap]: 2.83e-06 [label_micro_interleaved_index]: 4.76002e-06 [label_fine_grained_interleaved_index]: 2.54999e-06 [merge_cast_opt]: 1.20999e-06 [slice_recompute_activation]: 2.06e-06 [micro_interleaved_order_control]: 2.26998e-06 [assign_add_opt]: 1.50001e-06 [ForceFp32Comm]: 9.09989e-07 [remove_cast_before_assign_add]: 1.10999e-06 [full_micro_interleaved_order_control]: 2.11e-06 [reorder_send_recv_between_fp_bp]: 2.92002e-06 [comm_op_add_attrs]: 1.12999e-06 [add_comm_op_reuse_tag]: 1.26002e-06 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.07998e-06 [overlap_opt_shard_grad_in_pipeline]: 2.36998e-06 [control_data_broadcast_order]: 1.503e-05 [grouped_pairwise_exchange_alltoall]: 1.52001e-06 [offloading_packed_experts]: 4.33999e-06 [overlap_recompute_and_grad_model_parallel]: 5.03002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.49001e-06 [overlap_grad_ring_attention]: 4.43001e-06 [overlap_grad_flash_sp]: 2.22e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.12001e-06 [split_layernorm_comm]: 1.67001e-06 [handle_group_info]: 1.17e-06 [symbol_engine_optimizer]: 8.585e-05, [1] [Cycle 1]: 8.134e-05, [6] [build]: 3.68e-06 [elim_shapecalc]: 1.148e-05 [elim_not_effective]: 1.552e-05 [opt_reshape]: 9.17001e-06 [fold_const_symbol]: 1.353e-05 [renormalize]: 1.80007e-07 [detach_backward]: 1.94999e-06 [pipeline_parallel_scheduler]: 1.47001e-06 [auto_monad_reorder]: 2.079e-05 [get_jit_bprop_graph]: 1.85001e-06 [rewriter_after_jit_bprop_graph]: 4.42003e-06 [opt_after_jit_grad]: 0.00047967 [validate]: 4.364e-05 Sums bootstrap : 0.000434s : 3.73% type_inference : 0.005845s : 50.26% event_method : 0.000021s : 0.18% auto_monad : 0.000065s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.30% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.26% optimize.rewriter_before_opt_a : 0.000094s : 0.80% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000060s : 0.51% optimize.opt_a.loop_unroll : 0.000056s : 0.48% optimize.opt_a.a_1 : 0.001087s : 9.34% optimize.opt_a.with_stream_mark : 0.000032s : 0.27% optimize.opt_a.recompute_prepare : 0.000022s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000212s : 1.83% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.16% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000017s : 0.15% optimize.opt_a.merge_send_recv : 0.000017s : 0.15% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000023s : 0.20% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000011s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.19% optimize.opt_a.virtual_dataset : 0.000017s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.14% optimize.opt_a.virtual_output : 0.000016s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.25% optimize.opt_a.a_after_grad : 0.000028s : 0.24% optimize.opt_a.renormalize : 0.000677s : 5.82% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.25% optimize.opt_a.cse : 0.000058s : 0.50% optimize.opt_a.a_3 : 0.000115s : 0.99% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.02% optimize.rewriter_after_opt_a : 0.000043s : 0.37% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000500s : 4.30% optimize.opt_b.b_1 : 0.000179s : 1.54% optimize.opt_b.b_2 : 0.000033s : 0.28% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000027s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000027s : 0.23% optimize.loop_unroll : 0.000446s : 3.84% optimize.opt_after_cconv.c_1 : 0.000042s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000025s : 0.21% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.13% optimize.tuple_transform.d_1 : 0.000058s : 0.50% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000057s : 0.49% optimize.cse_after_recomputation.cse : 0.000016s : 0.13% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000022s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000021s : 0.18% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000480s : 4.12% validate : 0.000044s : 0.38% Time group info: ------[substitution.] 0.000229 44 10.06% : 0.000023s : 3: substitution.cast_eliminate 0.97% : 0.000002s : 3: substitution.elim_not_effective 0.86% : 0.000002s : 3: substitution.fold_const_symbol 3.05% : 0.000007s : 6: substitution.graph_param_transform 66.47% : 0.000152s : 4: substitution.inline 2.29% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.80% : 0.000006s : 6: substitution.remove_not_recompute_node 2.64% : 0.000006s : 6: substitution.replace_old_param 8.53% : 0.000020s : 6: substitution.tuple_list_get_item_eliminator 2.33% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005785 2 86.79% : 0.005021s : 1: type_inference.infer 13.21% : 0.000764s : 1: type_inference.specialize ------[replace.] 0.000078 10 50.57% : 0.000039s : 4: replace.inline 49.43% : 0.000038s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000166 10 89.87% : 0.000149s : 4: match.inline 10.13% : 0.000017s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000294 1954 0.99% : 0.000003s : 21: predicate.accumulaten_eliminater 0.61% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.52% : 0.000002s : 12: predicate.addn_check_dump 0.94% : 0.000003s : 21: predicate.addn_zero_filter 0.88% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 1.99% : 0.000006s : 33: predicate.arithmetic_simplify 1.02% : 0.000003s : 21: predicate.cast_eliminate 0.58% : 0.000002s : 12: predicate.check_bprop_eliminate 0.54% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.56% : 0.000002s : 12: predicate.depend_value_elim 1.02% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.09% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.93% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.85% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 6: predicate.elim_not_effective 0.33% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 27: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 27: predicate.environ_get_depend_swap 1.74% : 0.000005s : 39: predicate.environ_get_eliminate 1.17% : 0.000003s : 27: predicate.environ_get_set_eliminate 1.49% : 0.000004s : 31: predicate.exchange_switch_depend_value 2.44% : 0.000007s : 31: predicate.float_depend_g_call 0.55% : 0.000002s : 12: predicate.float_environ_get_switch 0.76% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.61% : 0.000002s : 12: predicate.get_grad_eliminate 0.20% : 0.000001s : 6: predicate.graph_param_transform 0.57% : 0.000002s : 12: predicate.incorporate_call 0.50% : 0.000001s : 12: predicate.incorporate_call_switch 6.33% : 0.000019s : 88: predicate.inline 0.99% : 0.000003s : 12: predicate.inline_without_move 0.35% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.71% : 0.000002s : 12: predicate.less_batch_normalization 1.87% : 0.000005s : 39: predicate.list_to_tuple_eliminator_ 2.64% : 0.000008s : 60: predicate.load_eliminater 1.00% : 0.000003s : 6: predicate.loop_unroll_after_grad 1.98% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.65% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.51% : 0.000002s : 12: predicate.merge_addn 0.57% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.56% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.90% : 0.000003s : 21: predicate.minmaximum_grad 0.80% : 0.000002s : 6: predicate.mutable_eliminate 0.36% : 0.000001s : 6: predicate.opt_reshape 0.31% : 0.000001s : 6: predicate.parallel_virtual_node 1.66% : 0.000005s : 31: predicate.partial_defer_inline 1.84% : 0.000005s : 33: predicate.partial_eliminate 0.95% : 0.000003s : 21: predicate.print_const_string_wrapper 0.55% : 0.000002s : 12: predicate.reduce_all_const_elim 1.27% : 0.000004s : 21: predicate.reduce_eliminate 2.64% : 0.000008s : 60: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 12: predicate.remove_not_recompute_node 1.41% : 0.000004s : 39: predicate.replace_applicator 0.48% : 0.000001s : 12: predicate.replace_old_param 0.26% : 0.000001s : 6: predicate.reset_defer_inline 1.04% : 0.000003s : 21: predicate.reshape_eliminate 0.59% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 6: predicate.row_tensor_eliminate 0.69% : 0.000002s : 12: predicate.same_eliminate 0.47% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 12: predicate.shard_identity_eliminate 0.65% : 0.000002s : 12: predicate.special_op_eliminate 0.71% : 0.000002s : 12: predicate.specialize_transform 0.73% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.57% : 0.000005s : 31: predicate.switch_defer_inline 2.10% : 0.000006s : 43: predicate.switch_layer_defer_inline 4.93% : 0.000014s : 91: predicate.switch_simplify 0.92% : 0.000003s : 21: predicate.tile_eliminate 0.93% : 0.000003s : 21: predicate.transpose_eliminate 1.54% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.57% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 3.64% : 0.000011s : 51: predicate.tuple_list_get_item_eliminator 1.57% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.91% : 0.000006s : 39: predicate.tuple_to_list_eliminator_ 2.62% : 0.000008s : 60: predicate.updatestate_pure_node_eliminater 3.25% : 0.000010s : 72: predicate.updatestate_useless_node_eliminater 0.49% : 0.000001s : 6: predicate.value_based_eliminate 0.60% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.60% : 0.000002s : 12: predicate.virtual_output_eliminate 0.26% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000547 11 54.58% : 0.000299s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.42% : 0.000249s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026704 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.23% : 0.003000s : 1: add_attr 11.20% : 0.002990s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000062s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000070s : 1: auto_monad 0.09% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.73% : 0.000462s : 1: bootstrap 0.12% : 0.000031s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.10% : 0.000027s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000005s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.70% : 0.000455s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.91% : 0.000509s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 6.29% : 0.001680s : 78: opt.transform.opt_a 0.15% : 0.000041s : 1: opt.transform.opt_after_cconv 0.12% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.59% : 0.000159s : 28: opt.transform.opt_b 0.24% : 0.000064s : 2: opt.transform.opt_trans_graph 0.17% : 0.000046s : 4: opt.transform.symbol_engine_opt 12.28% : 0.003279s : 1: opt_a 0.48% : 0.000129s : 1: opt_after_cconv 1.83% : 0.000489s : 1: opt_after_jit_grad 1.14% : 0.000304s : 1: opt_b 20.60% : 0.005501s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000038s : 1: pre_auto_parallel 0.13% : 0.000034s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.35% : 0.000361s : 1: renormalize.infer 1.15% : 0.000307s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000048s : 1: rewriter_after_opt_a 0.37% : 0.000098s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000089s : 1: symbol_engine_optimizer 0.35% : 0.000094s : 1: tuple_transform 21.94% : 0.005860s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:08.884.699 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:08.884.954 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0165002, [21] [bootstrap]: 0.00042284 [type_inference]: 0.00574141 [event_method]: 2.048e-05 [auto_monad]: 6.348e-05 [graph_reusing]: 5.61e-06 [inline]: 2.07001e-06 [add_attr]: 0.0029537, [1] [add_attr_with_inline]: 0.00294623, [1] [Cycle 1]: 6.659e-05, [2] [tag_attr]: 2.016e-05 [meta_addattr_fg_expand]: 6.64001e-06 [parallel-infer-symbol]: 2.76e-06 [pre_auto_parallel]: 3.486e-05 [insert-virtual-dataset]: 2.51998e-06 [parallel-infer-symbol-second]: 6.99976e-07 [dataset_repeat_opt]: 2.02999e-06 [pipeline_split]: 1.52999e-06 [optimize]: 0.00611858, [53] [py_interpret_to_execute]: 3.273e-05 [rewriter_before_opt_a]: 0.00011416 [opt_a]: 0.00361707, [2] [Cycle 1]: 0.0025438, [45] [expand_dump_flag]: 2.81999e-06 [switch_simplify]: 4.836e-05 [loop_unroll]: 3.722e-05 [a_1]: 0.00086439 [with_stream_mark]: 1.567e-05 [recompute_prepare]: 1.155e-05 [updatestate_depend_eliminate]: 6.07001e-06 [updatestate_assign_eliminate]: 4.72e-06 [updatestate_loads_eliminate]: 4.46002e-06 [parameter_eliminate]: 1.93002e-06 [a_2]: 0.00015532 [accelerated_algorithm]: 1.009e-05 [shard]: 1.76998e-06 [meta_shard_fg_expand]: 2.34001e-06 [shard_inline]: 9.67001e-06 [merge_send_recv]: 1.011e-05 [auto_parallel]: 7.15e-06 [parallel]: 1.644e-05 [flash_sp]: 8.63001e-06 [merge_comm]: 5.93998e-06 [allreduce_fusion]: 5.08002e-06 [matmul_add_comm_reduction]: 1.08e-05 [allreduce_slice_to_reducescatter]: 6.90023e-07 [virtual_shard_identity]: 1.136e-05 [virtual_dataset]: 9.67999e-06 [get_grad_eliminate_]: 9.15001e-06 [virtual_output]: 9.40001e-06 [merge_forward]: 4.99e-06 [cell_reuse_recompute_pass]: 1.25001e-06 [offload_activation]: 1.121e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.99e-05 [merge_recompute_call_nodes]: 1.39998e-06 [before_grad]: 1.576e-05 [set_forward_comm_id_for_comm_node_pass]: 5.49998e-06 [meta_fg_expand]: 4.07003e-06 [flash_sp_send_recv_attached]: 2.60002e-06 [receive_attached]: 2.14999e-06 [after_resolve]: 1.551e-05 [a_after_grad]: 1.526e-05 [renormalize]: 0.00062904 [add_forward_monad_depend]: 5.92999e-06 [auto_monad_grad]: 1.89e-06 [auto_monad_eliminator]: 1.777e-05 [cse]: 4.85e-05 [a_3]: 8.282e-05 [Cycle 2]: 0.00106062, [45] [expand_dump_flag]: 1.25001e-06 [switch_simplify]: 1.09e-05 [loop_unroll]: 9.52999e-06 [a_1]: 0.00023638 [with_stream_mark]: 1.188e-05 [recompute_prepare]: 9.46e-06 [updatestate_depend_eliminate]: 4.45e-06 [updatestate_assign_eliminate]: 3.85e-06 [updatestate_loads_eliminate]: 3.46999e-06 [parameter_eliminate]: 1.00001e-06 [a_2]: 0.00014489 [accelerated_algorithm]: 9.56e-06 [shard]: 1.02e-06 [meta_shard_fg_expand]: 1.86e-06 [shard_inline]: 9.27001e-06 [merge_send_recv]: 6.66e-06 [auto_parallel]: 6.71999e-06 [parallel]: 4.32e-06 [flash_sp]: 3.29001e-06 [merge_comm]: 4.94998e-06 [allreduce_fusion]: 4.27e-06 [matmul_add_comm_reduction]: 7.17002e-06 [allreduce_slice_to_reducescatter]: 4.2998e-07 [virtual_shard_identity]: 1.045e-05 [virtual_dataset]: 9.51998e-06 [get_grad_eliminate_]: 8.75999e-06 [virtual_output]: 8.80001e-06 [merge_forward]: 4.08999e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 8.70999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.941e-05 [merge_recompute_call_nodes]: 7.09988e-07 [before_grad]: 1.456e-05 [set_forward_comm_id_for_comm_node_pass]: 5.07e-06 [meta_fg_expand]: 3.39001e-06 [flash_sp_send_recv_attached]: 9.10019e-07 [receive_attached]: 1.04e-06 [after_resolve]: 1.427e-05 [a_after_grad]: 1.428e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.57001e-06 [auto_monad_grad]: 9.20001e-07 [auto_monad_eliminator]: 9.57001e-06 [cse]: 2.456e-05 [a_3]: 7.289e-05 [py_interpret_to_execute_after_opt_a]: 1.278e-05 [slice_cell_reuse_recomputed_activation]: 5.06002e-06 [rewriter_after_opt_a]: 4.796e-05 [convert_after_rewriter]: 1.203e-05 [order_py_execute_after_rewriter]: 9.71e-06 [mutable_eliminate]: 0.00048827 [opt_b]: 0.00036672, [1] [Cycle 1]: 0.00035779, [7] [b_1]: 0.00024791 [b_2]: 1.141e-05 [updatestate_depend_eliminate]: 6.72002e-06 [updatestate_assign_eliminate]: 3.76999e-06 [updatestate_loads_eliminate]: 4.01001e-06 [renormalize]: 3.89991e-07 [cse]: 2.865e-05 [optimize_parallel_all_gather_comm]: 2.161e-05 [overlap_param_gather]: 4.53999e-06 [cconv]: 2.713e-05 [loop_unroll]: 0.00043451 [opt_after_cconv]: 0.00015769, [1] [Cycle 1]: 0.00014918, [7] [c_1]: 4.779e-05 [parameter_eliminate]: 2.51998e-06 [updatestate_depend_eliminate]: 6.81999e-06 [updatestate_assign_eliminate]: 4.02e-06 [updatestate_loads_eliminate]: 3.62002e-06 [cse]: 2.779e-05 [renormalize]: 7.39994e-07 [remove_dup_value]: 3.827e-05 [tuple_transform]: 0.00011618, [1] [Cycle 1]: 0.00010872, [4] [d_1]: 6.523e-05 [none_parameter_eliminate]: 1.82999e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 1.009e-05 [partial_unused_args_eliminate]: 4.53001e-06 [add_recomputation]: 6.062e-05 [cse_after_recomputation]: 3.374e-05, [1] [Cycle 1]: 2.685e-05, [1] [cse]: 1.767e-05 [environ_conv]: 9.19998e-06 [swap_dp_allreduce_reducescatter]: 9.31e-06 [bias_add_comm_swap]: 5.40999e-06 [label_micro_interleaved_index]: 7.21001e-06 [label_fine_grained_interleaved_index]: 5.02e-06 [merge_cast_opt]: 3.81999e-06 [slice_recompute_activation]: 4.38999e-06 [micro_interleaved_order_control]: 4.63001e-06 [assign_add_opt]: 3.65e-06 [ForceFp32Comm]: 3.39001e-06 [remove_cast_before_assign_add]: 3.39001e-06 [full_micro_interleaved_order_control]: 4.45e-06 [reorder_send_recv_between_fp_bp]: 5.05001e-06 [comm_op_add_attrs]: 3.62002e-06 [add_comm_op_reuse_tag]: 3.38e-06 [interleave_split_concat_branches]: 3.66001e-06 [interleave_parallel_branches]: 3.61001e-06 [overlap_opt_shard_in_pipeline]: 3.51999e-06 [overlap_opt_shard_grad_in_pipeline]: 3.97998e-06 [control_data_broadcast_order]: 1.876e-05 [grouped_pairwise_exchange_alltoall]: 4.00998e-06 [offloading_packed_experts]: 7.31001e-06 [overlap_recompute_and_grad_model_parallel]: 8.50001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.56001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.75e-06 [overlap_recompute_comm]: 4.65999e-06 [overlap_grad_ring_attention]: 7.69002e-06 [overlap_grad_flash_sp]: 2.496e-05 [begin_end_overlap_inline]: 2.96999e-06 [split_matmul_comm_elemetwise]: 4.93001e-06 [split_layernorm_comm]: 4.23001e-06 [handle_group_info]: 3.16001e-06 [symbol_engine_optimizer]: 0.00011275, [1] [Cycle 1]: 0.00010603, [6] [build]: 3.25e-06 [elim_shapecalc]: 1.347e-05 [elim_not_effective]: 1.799e-05 [opt_reshape]: 1.013e-05 [fold_const_symbol]: 1.521e-05 [renormalize]: 1.90019e-07 [detach_backward]: 3.15998e-06 [pipeline_parallel_scheduler]: 1.60999e-06 [auto_monad_reorder]: 2.307e-05 [get_jit_bprop_graph]: 1.32999e-06 [rewriter_after_jit_bprop_graph]: 4.35e-06 [opt_after_jit_grad]: 0.00048044 [validate]: 4.063e-05 Sums bootstrap : 0.000423s : 3.57% type_inference : 0.005741s : 48.52% event_method : 0.000020s : 0.17% auto_monad : 0.000063s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000035s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.28% optimize.rewriter_before_opt_a : 0.000114s : 0.96% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000059s : 0.50% optimize.opt_a.loop_unroll : 0.000047s : 0.40% optimize.opt_a.a_1 : 0.001101s : 9.30% optimize.opt_a.with_stream_mark : 0.000028s : 0.23% optimize.opt_a.recompute_prepare : 0.000021s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000300s : 2.54% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.17% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000019s : 0.16% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000021s : 0.18% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000011s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.18% optimize.opt_a.virtual_dataset : 0.000019s : 0.16% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.15% optimize.opt_a.virtual_output : 0.000018s : 0.15% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000030s : 0.26% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000030s : 0.25% optimize.opt_a.a_after_grad : 0.000030s : 0.25% optimize.opt_a.renormalize : 0.000629s : 5.32% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.23% optimize.opt_a.cse : 0.000073s : 0.62% optimize.opt_a.a_3 : 0.000156s : 1.32% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000048s : 0.41% optimize.convert_after_rewriter : 0.000012s : 0.10% optimize.order_py_execute_after_rewriter : 0.000010s : 0.08% optimize.mutable_eliminate : 0.000488s : 4.13% optimize.opt_b.b_1 : 0.000248s : 2.09% optimize.opt_b.b_2 : 0.000011s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000029s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000027s : 0.23% optimize.loop_unroll : 0.000435s : 3.67% optimize.opt_after_cconv.c_1 : 0.000048s : 0.40% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000028s : 0.23% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000038s : 0.32% optimize.tuple_transform.d_1 : 0.000065s : 0.55% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.09% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000061s : 0.51% optimize.cse_after_recomputation.cse : 0.000018s : 0.15% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000019s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.06% optimize.overlap_grad_flash_sp : 0.000025s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.09% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000023s : 0.19% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000480s : 4.06% validate : 0.000041s : 0.34% Time group info: ------[substitution.] 0.000223 54 11.90% : 0.000026s : 6: substitution.cast_eliminate 1.10% : 0.000002s : 4: substitution.elim_not_effective 0.90% : 0.000002s : 4: substitution.fold_const_symbol 3.32% : 0.000007s : 7: substitution.graph_param_transform 64.08% : 0.000143s : 4: substitution.inline 2.27% : 0.000005s : 8: substitution.j_node_and_user_rematch 3.56% : 0.000008s : 8: substitution.remove_not_recompute_node 2.28% : 0.000005s : 6: substitution.replace_old_param 8.15% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.44% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005693 2 86.34% : 0.004916s : 1: type_inference.infer 13.66% : 0.000777s : 1: type_inference.specialize ------[replace.] 0.000070 10 52.38% : 0.000037s : 4: replace.inline 47.62% : 0.000033s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000156 10 90.18% : 0.000140s : 4: match.inline 9.82% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000319 2134 0.99% : 0.000003s : 22: predicate.accumulaten_eliminater 0.62% : 0.000002s : 7: predicate.ad_related_special_op_eliminate 0.58% : 0.000002s : 14: predicate.addn_check_dump 0.93% : 0.000003s : 22: predicate.addn_zero_filter 0.86% : 0.000003s : 22: predicate.adjust_all_reduce_mul_add 2.02% : 0.000006s : 36: predicate.arithmetic_simplify 1.08% : 0.000003s : 22: predicate.cast_eliminate 0.61% : 0.000002s : 14: predicate.check_bprop_eliminate 0.62% : 0.000002s : 14: predicate.compare_switch_simplify 0.19% : 0.000001s : 7: predicate.const_output_eliminate 0.63% : 0.000002s : 14: predicate.depend_value_elim 0.97% : 0.000003s : 22: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 22: predicate.dict_get_item_eliminator 0.91% : 0.000003s : 22: predicate.dict_set_item_eliminator 0.94% : 0.000003s : 14: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 7: predicate.elim_not_effective 0.37% : 0.000001s : 7: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000004s : 29: predicate.environ_add_const_eliminate 1.16% : 0.000004s : 29: predicate.environ_get_add_eliminate 1.17% : 0.000004s : 29: predicate.environ_get_depend_swap 1.85% : 0.000006s : 43: predicate.environ_get_eliminate 1.17% : 0.000004s : 29: predicate.environ_get_set_eliminate 1.39% : 0.000004s : 32: predicate.exchange_switch_depend_value 2.25% : 0.000007s : 32: predicate.float_depend_g_call 0.60% : 0.000002s : 14: predicate.float_environ_get_switch 0.87% : 0.000003s : 21: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 7: predicate.fold_const_symbol 0.66% : 0.000002s : 14: predicate.get_grad_eliminate 0.21% : 0.000001s : 7: predicate.graph_param_transform 0.63% : 0.000002s : 14: predicate.incorporate_call 0.55% : 0.000002s : 14: predicate.incorporate_call_switch 6.12% : 0.000019s : 96: predicate.inline 0.78% : 0.000002s : 14: predicate.inline_without_move 0.36% : 0.000001s : 14: predicate.j_node_and_user_rematch 0.78% : 0.000002s : 14: predicate.less_batch_normalization 1.93% : 0.000006s : 42: predicate.list_to_tuple_eliminator_ 2.71% : 0.000009s : 64: predicate.load_eliminater 0.64% : 0.000002s : 7: predicate.loop_unroll_after_grad 1.97% : 0.000006s : 44: predicate.loop_unroll_before_grad 1.65% : 0.000005s : 36: predicate.make_slice_get_slice_eliminator 0.63% : 0.000002s : 14: predicate.merge_addn 0.70% : 0.000002s : 14: predicate.micro_step_allgather_replace 0.61% : 0.000002s : 14: predicate.mini_step_allgather_replace 0.90% : 0.000003s : 22: predicate.minmaximum_grad 0.73% : 0.000002s : 7: predicate.mutable_eliminate 0.34% : 0.000001s : 7: predicate.opt_reshape 0.32% : 0.000001s : 7: predicate.parallel_virtual_node 1.72% : 0.000005s : 32: predicate.partial_defer_inline 1.80% : 0.000006s : 35: predicate.partial_eliminate 0.90% : 0.000003s : 22: predicate.print_const_string_wrapper 0.61% : 0.000002s : 14: predicate.reduce_all_const_elim 1.19% : 0.000004s : 22: predicate.reduce_eliminate 2.67% : 0.000009s : 64: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 14: predicate.remove_not_recompute_node 1.41% : 0.000005s : 42: predicate.replace_applicator 0.48% : 0.000002s : 14: predicate.replace_old_param 0.25% : 0.000001s : 7: predicate.reset_defer_inline 0.96% : 0.000003s : 22: predicate.reshape_eliminate 0.65% : 0.000002s : 14: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 7: predicate.row_tensor_eliminate 0.77% : 0.000002s : 14: predicate.same_eliminate 0.45% : 0.000001s : 14: predicate.set_cell_output_no_recompute 0.77% : 0.000002s : 14: predicate.shard_identity_eliminate 0.92% : 0.000003s : 14: predicate.special_op_eliminate 0.79% : 0.000003s : 14: predicate.specialize_transform 0.78% : 0.000002s : 14: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 14: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 7: predicate.switch_call_monad_eliminater 1.53% : 0.000005s : 32: predicate.switch_defer_inline 2.05% : 0.000007s : 46: predicate.switch_layer_defer_inline 4.61% : 0.000015s : 97: predicate.switch_simplify 0.93% : 0.000003s : 22: predicate.tile_eliminate 0.94% : 0.000003s : 22: predicate.transpose_eliminate 1.61% : 0.000005s : 36: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000005s : 36: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000005s : 36: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000010s : 56: predicate.tuple_list_get_item_eliminator 1.54% : 0.000005s : 36: predicate.tuple_list_get_set_item_eliminator 2.33% : 0.000007s : 50: predicate.tuple_list_set_item_eliminator 1.88% : 0.000006s : 42: predicate.tuple_to_list_eliminator_ 2.59% : 0.000008s : 64: predicate.updatestate_pure_node_eliminater 3.36% : 0.000011s : 78: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 7: predicate.value_based_eliminate 0.68% : 0.000002s : 14: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 14: predicate.virtual_output_eliminate 0.32% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 7: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000528 11 53.69% : 0.000283s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.31% : 0.000244s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028261 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.48% : 0.002962s : 1: add_attr 10.44% : 0.002950s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000064s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000072s : 1: auto_monad 0.11% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.64% : 0.000462s : 1: bootstrap 0.11% : 0.000030s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000022s : 1: control_data_broadcast_order 0.05% : 0.000015s : 1: convert_after_rewriter 0.13% : 0.000037s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000017s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000031s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.56% : 0.000441s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.75% : 0.000494s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 6.22% : 0.001758s : 78: opt.transform.opt_a 0.16% : 0.000046s : 1: opt.transform.opt_after_cconv 0.12% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.66% : 0.000187s : 28: opt.transform.opt_b 0.26% : 0.000073s : 2: opt.transform.opt_trans_graph 0.19% : 0.000053s : 4: opt.transform.symbol_engine_opt 12.81% : 0.003620s : 1: opt_a 0.57% : 0.000161s : 1: opt_after_cconv 1.74% : 0.000491s : 1: opt_after_jit_grad 1.31% : 0.000370s : 1: opt_b 22.87% : 0.006465s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.10% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000011s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.03% : 0.000009s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000042s : 1: pre_auto_parallel 0.13% : 0.000037s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.15% : 0.000042s : 1: remove_dup_value 1.17% : 0.000330s : 1: renormalize.infer 1.03% : 0.000292s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000051s : 1: rewriter_after_opt_a 0.42% : 0.000119s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.41% : 0.000116s : 1: symbol_engine_optimizer 0.42% : 0.000119s : 1: tuple_transform 20.42% : 0.005772s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:09.747.51 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0153967, [21] [bootstrap]: 0.00042695 [type_inference]: 0.00574614 [event_method]: 2.054e-05 [auto_monad]: 6.332e-05 [graph_reusing]: 5.75001e-06 [inline]: 2.00002e-06 [add_attr]: 0.00293116, [1] [add_attr_with_inline]: 0.00292399, [1] [Cycle 1]: 5.401e-05, [2] [tag_attr]: 2.065e-05 [meta_addattr_fg_expand]: 6.58003e-06 [parallel-infer-symbol]: 2.78e-06 [pre_auto_parallel]: 3.271e-05 [insert-virtual-dataset]: 2.81999e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 2.16e-06 [pipeline_split]: 1.69998e-06 [optimize]: 0.00548261, [53] [py_interpret_to_execute]: 2.825e-05 [rewriter_before_opt_a]: 9.402e-05 [opt_a]: 0.00328911, [2] [Cycle 1]: 0.00238773, [45] [expand_dump_flag]: 3.08e-06 [switch_simplify]: 4.816e-05 [loop_unroll]: 3.633e-05 [a_1]: 0.00089344 [with_stream_mark]: 1.558e-05 [recompute_prepare]: 1.203e-05 [updatestate_depend_eliminate]: 5.44998e-06 [updatestate_assign_eliminate]: 4.85001e-06 [updatestate_loads_eliminate]: 4.15e-06 [parameter_eliminate]: 1.86e-06 [a_2]: 0.00012752 [accelerated_algorithm]: 1.051e-05 [shard]: 1.69e-06 [meta_shard_fg_expand]: 2.53e-06 [shard_inline]: 9.99999e-06 [merge_send_recv]: 1.095e-05 [auto_parallel]: 7.28e-06 [parallel]: 1.767e-05 [flash_sp]: 7.81001e-06 [merge_comm]: 5.59998e-06 [allreduce_fusion]: 5.19e-06 [matmul_add_comm_reduction]: 1.1e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 1.121e-05 [virtual_dataset]: 9.86e-06 [get_grad_eliminate_]: 8.99e-06 [virtual_output]: 9.15999e-06 [merge_forward]: 4.97e-06 [cell_reuse_recompute_pass]: 1.07998e-06 [offload_activation]: 1.112e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.752e-05 [merge_recompute_call_nodes]: 1.57001e-06 [before_grad]: 1.586e-05 [set_forward_comm_id_for_comm_node_pass]: 4.99e-06 [meta_fg_expand]: 4.38001e-06 [flash_sp_send_recv_attached]: 2.40002e-06 [receive_attached]: 2.09e-06 [after_resolve]: 1.515e-05 [a_after_grad]: 1.484e-05 [renormalize]: 0.0006289 [add_forward_monad_depend]: 5.10001e-06 [auto_monad_grad]: 1.59e-06 [auto_monad_eliminator]: 1.78e-05 [cse]: 4.898e-05 [a_3]: 6.81e-05 [Cycle 2]: 0.00089207, [45] [expand_dump_flag]: 1.10001e-06 [switch_simplify]: 1.092e-05 [loop_unroll]: 9.67001e-06 [a_1]: 0.00023599 [with_stream_mark]: 1.15e-05 [recompute_prepare]: 9.46998e-06 [updatestate_depend_eliminate]: 4.58999e-06 [updatestate_assign_eliminate]: 3.50998e-06 [updatestate_loads_eliminate]: 3.8e-06 [parameter_eliminate]: 1.05001e-06 [a_2]: 0.00011908 [accelerated_algorithm]: 9.56003e-06 [shard]: 1.11002e-06 [meta_shard_fg_expand]: 1.81e-06 [shard_inline]: 9.17001e-06 [merge_send_recv]: 6.58e-06 [auto_parallel]: 6.79001e-06 [parallel]: 4.45e-06 [flash_sp]: 3.35e-06 [merge_comm]: 5.12e-06 [allreduce_fusion]: 4.68999e-06 [matmul_add_comm_reduction]: 7.3e-06 [allreduce_slice_to_reducescatter]: 3.10014e-07 [virtual_shard_identity]: 9.84999e-06 [virtual_dataset]: 9.29998e-06 [get_grad_eliminate_]: 8.96002e-06 [virtual_output]: 8.62e-06 [merge_forward]: 3.95e-06 [cell_reuse_recompute_pass]: 1.35999e-06 [offload_activation]: 7.95998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.792e-05 [merge_recompute_call_nodes]: 7.40023e-07 [before_grad]: 1.395e-05 [set_forward_comm_id_for_comm_node_pass]: 4.87e-06 [meta_fg_expand]: 3.41001e-06 [flash_sp_send_recv_attached]: 8.00006e-07 [receive_attached]: 9.49978e-07 [after_resolve]: 1.453e-05 [a_after_grad]: 1.522e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.18001e-06 [auto_monad_grad]: 9.39996e-07 [auto_monad_eliminator]: 9.73998e-06 [cse]: 2.444e-05 [a_3]: 5.887e-05 [py_interpret_to_execute_after_opt_a]: 9.90002e-06 [slice_cell_reuse_recomputed_activation]: 2.09e-06 [rewriter_after_opt_a]: 4.403e-05 [convert_after_rewriter]: 8.43999e-06 [order_py_execute_after_rewriter]: 6.23e-06 [mutable_eliminate]: 0.00045482 [opt_b]: 0.00030026, [1] [Cycle 1]: 0.00029479, [7] [b_1]: 0.00020528 [b_2]: 1.103e-05 [updatestate_depend_eliminate]: 6.49999e-06 [updatestate_assign_eliminate]: 3.63e-06 [updatestate_loads_eliminate]: 3.89002e-06 [renormalize]: 3.80009e-07 [cse]: 2.858e-05 [optimize_parallel_all_gather_comm]: 1.899e-05 [overlap_param_gather]: 1.86e-06 [cconv]: 2.338e-05 [loop_unroll]: 0.00043195 [opt_after_cconv]: 0.00013252, [1] [Cycle 1]: 0.000127, [7] [c_1]: 4.861e-05 [parameter_eliminate]: 2.39999e-06 [updatestate_depend_eliminate]: 6.49001e-06 [updatestate_assign_eliminate]: 3.78999e-06 [updatestate_loads_eliminate]: 3.63999e-06 [cse]: 2.767e-05 [renormalize]: 4.59986e-07 [remove_dup_value]: 3.446e-05 [tuple_transform]: 0.00010177, [1] [Cycle 1]: 9.712e-05, [4] [d_1]: 6.644e-05 [none_parameter_eliminate]: 1.71e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 1.017e-05 [partial_unused_args_eliminate]: 1.74e-06 [add_recomputation]: 6.031e-05 [cse_after_recomputation]: 2.876e-05, [1] [Cycle 1]: 2.411e-05, [1] [cse]: 1.856e-05 [environ_conv]: 5.61e-06 [swap_dp_allreduce_reducescatter]: 6.94001e-06 [bias_add_comm_swap]: 2.69001e-06 [label_micro_interleaved_index]: 4.35999e-06 [label_fine_grained_interleaved_index]: 2.96001e-06 [merge_cast_opt]: 1.35001e-06 [slice_recompute_activation]: 1.90001e-06 [micro_interleaved_order_control]: 2.07001e-06 [assign_add_opt]: 1.40001e-06 [ForceFp32Comm]: 1.15999e-06 [remove_cast_before_assign_add]: 9.70002e-07 [full_micro_interleaved_order_control]: 1.92999e-06 [reorder_send_recv_between_fp_bp]: 2.42001e-06 [comm_op_add_attrs]: 9.70002e-07 [add_comm_op_reuse_tag]: 9.20001e-07 [interleave_split_concat_branches]: 1.12999e-06 [interleave_parallel_branches]: 1.04998e-06 [overlap_opt_shard_in_pipeline]: 1.17e-06 [overlap_opt_shard_grad_in_pipeline]: 1.81e-06 [control_data_broadcast_order]: 1.616e-05 [grouped_pairwise_exchange_alltoall]: 1.74e-06 [offloading_packed_experts]: 4.47e-06 [overlap_recompute_and_grad_model_parallel]: 5.46e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.50001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.31998e-06 [overlap_recompute_comm]: 2.07999e-06 [overlap_grad_ring_attention]: 4.75999e-06 [overlap_grad_flash_sp]: 2.272e-05 [begin_end_overlap_inline]: 4.50003e-07 [split_matmul_comm_elemetwise]: 1.99999e-06 [split_layernorm_comm]: 1.64998e-06 [handle_group_info]: 8.59989e-07 [symbol_engine_optimizer]: 9.013e-05, [1] [Cycle 1]: 8.597e-05, [6] [build]: 3.01001e-06 [elim_shapecalc]: 1.322e-05 [elim_not_effective]: 1.73e-05 [opt_reshape]: 9.79999e-06 [fold_const_symbol]: 1.501e-05 [renormalize]: 2.00002e-07 [detach_backward]: 1.82999e-06 [pipeline_parallel_scheduler]: 1.81003e-06 [auto_monad_reorder]: 2.035e-05 [get_jit_bprop_graph]: 1.32999e-06 [rewriter_after_jit_bprop_graph]: 3.65e-06 [opt_after_jit_grad]: 0.0004673 [validate]: 4.048e-05 Sums bootstrap : 0.000427s : 3.70% type_inference : 0.005746s : 49.77% event_method : 0.000021s : 0.18% auto_monad : 0.000063s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000033s : 0.28% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.24% optimize.rewriter_before_opt_a : 0.000094s : 0.81% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000059s : 0.51% optimize.opt_a.loop_unroll : 0.000046s : 0.40% optimize.opt_a.a_1 : 0.001129s : 9.78% optimize.opt_a.with_stream_mark : 0.000027s : 0.23% optimize.opt_a.recompute_prepare : 0.000021s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000247s : 2.14% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.17% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000019s : 0.17% optimize.opt_a.merge_send_recv : 0.000018s : 0.15% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000022s : 0.19% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000011s : 0.09% optimize.opt_a.allreduce_fusion : 0.000010s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.18% optimize.opt_a.virtual_dataset : 0.000019s : 0.17% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.16% optimize.opt_a.virtual_output : 0.000018s : 0.15% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000030s : 0.26% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.09% optimize.opt_a.meta_fg_expand : 0.000008s : 0.07% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000030s : 0.26% optimize.opt_a.a_after_grad : 0.000030s : 0.26% optimize.opt_a.renormalize : 0.000629s : 5.45% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.05% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.24% optimize.opt_a.cse : 0.000073s : 0.64% optimize.opt_a.a_3 : 0.000127s : 1.10% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000044s : 0.38% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000455s : 3.94% optimize.opt_b.b_1 : 0.000205s : 1.78% optimize.opt_b.b_2 : 0.000011s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000029s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000023s : 0.20% optimize.loop_unroll : 0.000432s : 3.74% optimize.opt_after_cconv.c_1 : 0.000049s : 0.42% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000028s : 0.24% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000034s : 0.30% optimize.tuple_transform.d_1 : 0.000066s : 0.58% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.09% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000060s : 0.52% optimize.cse_after_recomputation.cse : 0.000019s : 0.16% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000023s : 0.20% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000467s : 4.05% validate : 0.000040s : 0.35% Time group info: ------[substitution.] 0.000249 54 10.38% : 0.000026s : 6: substitution.cast_eliminate 0.95% : 0.000002s : 4: substitution.elim_not_effective 0.81% : 0.000002s : 4: substitution.fold_const_symbol 3.02% : 0.000007s : 7: substitution.graph_param_transform 68.18% : 0.000170s : 4: substitution.inline 1.99% : 0.000005s : 8: substitution.j_node_and_user_rematch 3.32% : 0.000008s : 8: substitution.remove_not_recompute_node 2.05% : 0.000005s : 6: substitution.replace_old_param 6.99% : 0.000017s : 6: substitution.tuple_list_get_item_eliminator 2.31% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005688 2 86.64% : 0.004928s : 1: type_inference.infer 13.36% : 0.000760s : 1: type_inference.specialize ------[replace.] 0.000072 10 51.77% : 0.000037s : 4: replace.inline 48.23% : 0.000035s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000182 10 91.89% : 0.000167s : 4: match.inline 8.11% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000317 2134 1.08% : 0.000003s : 22: predicate.accumulaten_eliminater 0.66% : 0.000002s : 7: predicate.ad_related_special_op_eliminate 0.57% : 0.000002s : 14: predicate.addn_check_dump 0.95% : 0.000003s : 22: predicate.addn_zero_filter 0.88% : 0.000003s : 22: predicate.adjust_all_reduce_mul_add 1.97% : 0.000006s : 36: predicate.arithmetic_simplify 1.11% : 0.000004s : 22: predicate.cast_eliminate 0.62% : 0.000002s : 14: predicate.check_bprop_eliminate 0.61% : 0.000002s : 14: predicate.compare_switch_simplify 0.20% : 0.000001s : 7: predicate.const_output_eliminate 0.63% : 0.000002s : 14: predicate.depend_value_elim 0.96% : 0.000003s : 22: predicate.dict_get_item_const_eliminator 1.17% : 0.000004s : 22: predicate.dict_get_item_eliminator 0.91% : 0.000003s : 22: predicate.dict_set_item_eliminator 0.86% : 0.000003s : 14: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 7: predicate.elim_not_effective 0.38% : 0.000001s : 7: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000004s : 29: predicate.environ_add_const_eliminate 1.20% : 0.000004s : 29: predicate.environ_get_add_eliminate 1.16% : 0.000004s : 29: predicate.environ_get_depend_swap 1.84% : 0.000006s : 43: predicate.environ_get_eliminate 1.19% : 0.000004s : 29: predicate.environ_get_set_eliminate 1.40% : 0.000004s : 32: predicate.exchange_switch_depend_value 2.21% : 0.000007s : 32: predicate.float_depend_g_call 0.58% : 0.000002s : 14: predicate.float_environ_get_switch 0.85% : 0.000003s : 21: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 7: predicate.fold_const_symbol 0.68% : 0.000002s : 14: predicate.get_grad_eliminate 0.21% : 0.000001s : 7: predicate.graph_param_transform 0.62% : 0.000002s : 14: predicate.incorporate_call 0.56% : 0.000002s : 14: predicate.incorporate_call_switch 6.16% : 0.000020s : 96: predicate.inline 0.84% : 0.000003s : 14: predicate.inline_without_move 0.39% : 0.000001s : 14: predicate.j_node_and_user_rematch 0.84% : 0.000003s : 14: predicate.less_batch_normalization 1.86% : 0.000006s : 42: predicate.list_to_tuple_eliminator_ 2.68% : 0.000008s : 64: predicate.load_eliminater 0.65% : 0.000002s : 7: predicate.loop_unroll_after_grad 1.89% : 0.000006s : 44: predicate.loop_unroll_before_grad 1.69% : 0.000005s : 36: predicate.make_slice_get_slice_eliminator 0.61% : 0.000002s : 14: predicate.merge_addn 0.59% : 0.000002s : 14: predicate.micro_step_allgather_replace 0.58% : 0.000002s : 14: predicate.mini_step_allgather_replace 0.89% : 0.000003s : 22: predicate.minmaximum_grad 0.74% : 0.000002s : 7: predicate.mutable_eliminate 0.35% : 0.000001s : 7: predicate.opt_reshape 0.33% : 0.000001s : 7: predicate.parallel_virtual_node 1.70% : 0.000005s : 32: predicate.partial_defer_inline 1.78% : 0.000006s : 35: predicate.partial_eliminate 0.93% : 0.000003s : 22: predicate.print_const_string_wrapper 0.62% : 0.000002s : 14: predicate.reduce_all_const_elim 1.16% : 0.000004s : 22: predicate.reduce_eliminate 2.66% : 0.000008s : 64: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 14: predicate.remove_not_recompute_node 1.42% : 0.000004s : 42: predicate.replace_applicator 0.53% : 0.000002s : 14: predicate.replace_old_param 0.26% : 0.000001s : 7: predicate.reset_defer_inline 0.99% : 0.000003s : 22: predicate.reshape_eliminate 0.62% : 0.000002s : 14: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 7: predicate.row_tensor_eliminate 0.75% : 0.000002s : 14: predicate.same_eliminate 0.44% : 0.000001s : 14: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 14: predicate.shard_identity_eliminate 0.71% : 0.000002s : 14: predicate.special_op_eliminate 0.77% : 0.000002s : 14: predicate.specialize_transform 0.76% : 0.000002s : 14: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 14: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 7: predicate.switch_call_monad_eliminater 1.48% : 0.000005s : 32: predicate.switch_defer_inline 2.06% : 0.000007s : 46: predicate.switch_layer_defer_inline 4.60% : 0.000015s : 97: predicate.switch_simplify 0.88% : 0.000003s : 22: predicate.tile_eliminate 0.98% : 0.000003s : 22: predicate.transpose_eliminate 1.67% : 0.000005s : 36: predicate.tuple_list_convert_item_index_to_positive 1.74% : 0.000005s : 36: predicate.tuple_list_get_item_const_eliminator 1.58% : 0.000005s : 36: predicate.tuple_list_get_item_depend_reorder 3.21% : 0.000010s : 56: predicate.tuple_list_get_item_eliminator 1.59% : 0.000005s : 36: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000007s : 50: predicate.tuple_list_set_item_eliminator 1.92% : 0.000006s : 42: predicate.tuple_to_list_eliminator_ 2.63% : 0.000008s : 64: predicate.updatestate_pure_node_eliminater 3.34% : 0.000011s : 78: predicate.updatestate_useless_node_eliminater 0.49% : 0.000002s : 7: predicate.value_based_eliminate 0.65% : 0.000002s : 14: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 14: predicate.virtual_output_eliminate 0.31% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 7: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000515 11 54.63% : 0.000281s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.37% : 0.000234s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026525 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.07% : 0.002936s : 1: add_attr 11.04% : 0.002927s : 1: add_attr_with_inline 0.01% : 0.000003s : 1: add_comm_op_reuse_tag 0.24% : 0.000064s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000068s : 1: auto_monad 0.09% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.71% : 0.000454s : 1: bootstrap 0.10% : 0.000027s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.12% : 0.000032s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.10% : 0.000026s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.66% : 0.000439s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.74% : 0.000463s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 6.72% : 0.001781s : 78: opt.transform.opt_a 0.18% : 0.000047s : 1: opt.transform.opt_after_cconv 0.13% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.70% : 0.000185s : 28: opt.transform.opt_b 0.28% : 0.000074s : 2: opt.transform.opt_trans_graph 0.20% : 0.000052s : 4: opt.transform.symbol_engine_opt 12.41% : 0.003292s : 1: opt_a 0.51% : 0.000136s : 1: opt_after_cconv 1.80% : 0.000477s : 1: opt_after_jit_grad 1.15% : 0.000304s : 1: opt_b 20.68% : 0.005486s : 1: optimize 0.08% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000006s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000037s : 1: pre_auto_parallel 0.12% : 0.000032s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.15% : 0.000039s : 1: remove_dup_value 1.26% : 0.000333s : 1: renormalize.infer 1.09% : 0.000288s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000048s : 1: rewriter_after_opt_a 0.37% : 0.000098s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000093s : 1: symbol_engine_optimizer 0.39% : 0.000105s : 1: tuple_transform 21.71% : 0.005760s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:09.260.564 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:09.260.808 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0160884, [21] [bootstrap]: 0.00042683 [type_inference]: 0.00572909 [event_method]: 2.016e-05 [auto_monad]: 6.041e-05 [graph_reusing]: 6.31998e-06 [inline]: 1.75001e-06 [add_attr]: 0.0029479, [1] [add_attr_with_inline]: 0.00294015, [1] [Cycle 1]: 6.592e-05, [2] [tag_attr]: 1.933e-05 [meta_addattr_fg_expand]: 6.38998e-06 [parallel-infer-symbol]: 3.01999e-06 [pre_auto_parallel]: 3.249e-05 [insert-virtual-dataset]: 2.28998e-06 [parallel-infer-symbol-second]: 8.09989e-07 [dataset_repeat_opt]: 1.85001e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.00574343, [53] [py_interpret_to_execute]: 2.993e-05 [rewriter_before_opt_a]: 9.533e-05 [opt_a]: 0.00338443, [2] [Cycle 1]: 0.00239894, [45] [expand_dump_flag]: 2.84001e-06 [switch_simplify]: 4.673e-05 [loop_unroll]: 3.52e-05 [a_1]: 0.00084279 [with_stream_mark]: 1.437e-05 [recompute_prepare]: 1.042e-05 [updatestate_depend_eliminate]: 4.63999e-06 [updatestate_assign_eliminate]: 4.22e-06 [updatestate_loads_eliminate]: 3.95998e-06 [parameter_eliminate]: 1.78002e-06 [a_2]: 0.00013796 [accelerated_algorithm]: 9.22001e-06 [shard]: 1.71e-06 [meta_shard_fg_expand]: 2.07999e-06 [shard_inline]: 8.60999e-06 [merge_send_recv]: 9.46998e-06 [auto_parallel]: 7.18e-06 [parallel]: 1.686e-05 [flash_sp]: 7.85998e-06 [merge_comm]: 5.07e-06 [allreduce_fusion]: 4.65001e-06 [matmul_add_comm_reduction]: 1.06e-05 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 1e-05 [virtual_dataset]: 8.47e-06 [get_grad_eliminate_]: 8.28999e-06 [virtual_output]: 8.59998e-06 [merge_forward]: 4.50001e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 1.046e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.761e-05 [merge_recompute_call_nodes]: 1.53002e-06 [before_grad]: 1.309e-05 [set_forward_comm_id_for_comm_node_pass]: 4.62e-06 [meta_fg_expand]: 3.7e-06 [flash_sp_send_recv_attached]: 2.81e-06 [receive_attached]: 2.40002e-06 [after_resolve]: 1.479e-05 [a_after_grad]: 1.366e-05 [renormalize]: 0.00057875 [add_forward_monad_depend]: 5.17e-06 [auto_monad_grad]: 1.74998e-06 [auto_monad_eliminator]: 1.625e-05 [cse]: 3.295e-05 [a_3]: 7.492e-05 [Cycle 2]: 0.00097256, [45] [expand_dump_flag]: 1.10001e-06 [switch_simplify]: 9.69e-06 [loop_unroll]: 8.33999e-06 [a_1]: 0.00020245 [with_stream_mark]: 1.157e-05 [recompute_prepare]: 8.73001e-06 [updatestate_depend_eliminate]: 4.1e-06 [updatestate_assign_eliminate]: 3.26999e-06 [updatestate_loads_eliminate]: 3.18e-06 [parameter_eliminate]: 1.05999e-06 [a_2]: 0.0001279 [accelerated_algorithm]: 8.43001e-06 [shard]: 1.04998e-06 [meta_shard_fg_expand]: 1.76003e-06 [shard_inline]: 1.157e-05 [merge_send_recv]: 5.72001e-06 [auto_parallel]: 6.51e-06 [parallel]: 4.24002e-06 [flash_sp]: 3.16999e-06 [merge_comm]: 4.2e-06 [allreduce_fusion]: 3.73001e-06 [matmul_add_comm_reduction]: 6.11998e-06 [allreduce_slice_to_reducescatter]: 3.60014e-07 [virtual_shard_identity]: 9.12999e-06 [virtual_dataset]: 7.95e-06 [get_grad_eliminate_]: 7.56999e-06 [virtual_output]: 7.57998e-06 [merge_forward]: 3.48e-06 [cell_reuse_recompute_pass]: 1.51998e-06 [offload_activation]: 7.43e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.702e-05 [merge_recompute_call_nodes]: 9.09989e-07 [before_grad]: 1.253e-05 [set_forward_comm_id_for_comm_node_pass]: 4.52e-06 [meta_fg_expand]: 2.96999e-06 [flash_sp_send_recv_attached]: 9.20001e-07 [receive_attached]: 9.50007e-07 [after_resolve]: 1.368e-05 [a_after_grad]: 1.227e-05 [renormalize]: 1.10012e-07 [add_forward_monad_depend]: 1.30999e-06 [auto_monad_grad]: 1.22e-06 [auto_monad_eliminator]: 8.85001e-06 [cse]: 1.765e-05 [a_3]: 6.278e-05 [py_interpret_to_execute_after_opt_a]: 1.246e-05 [slice_cell_reuse_recomputed_activation]: 4.65999e-06 [rewriter_after_opt_a]: 4.315e-05 [convert_after_rewriter]: 1.131e-05 [order_py_execute_after_rewriter]: 9.17999e-06 [mutable_eliminate]: 0.00047273 [opt_b]: 0.00033128, [1] [Cycle 1]: 0.0003221, [7] [b_1]: 0.00022202 [b_2]: 9.99999e-06 [updatestate_depend_eliminate]: 6.00002e-06 [updatestate_assign_eliminate]: 3.26999e-06 [updatestate_loads_eliminate]: 3.41999e-06 [renormalize]: 3.69997e-07 [cse]: 2.146e-05 [optimize_parallel_all_gather_comm]: 1.947e-05 [overlap_param_gather]: 4.55001e-06 [cconv]: 2.548e-05 [loop_unroll]: 0.00044755 [opt_after_cconv]: 0.00014311, [1] [Cycle 1]: 0.00013471, [7] [c_1]: 4.22e-05 [parameter_eliminate]: 2.54999e-06 [updatestate_depend_eliminate]: 5.89999e-06 [updatestate_assign_eliminate]: 3.41999e-06 [updatestate_loads_eliminate]: 3.18e-06 [cse]: 2.232e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 1.848e-05 [tuple_transform]: 0.0001055, [1] [Cycle 1]: 9.853e-05, [4] [d_1]: 5.766e-05 [none_parameter_eliminate]: 1.69998e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 9.09998e-06 [partial_unused_args_eliminate]: 4.35999e-06 [add_recomputation]: 5.685e-05 [cse_after_recomputation]: 3.148e-05, [1] [Cycle 1]: 2.43e-05, [1] [cse]: 1.528e-05 [environ_conv]: 9.17001e-06 [swap_dp_allreduce_reducescatter]: 9.10001e-06 [bias_add_comm_swap]: 4.93001e-06 [label_micro_interleaved_index]: 6.61999e-06 [label_fine_grained_interleaved_index]: 5.40999e-06 [merge_cast_opt]: 3.9e-06 [slice_recompute_activation]: 4.28001e-06 [micro_interleaved_order_control]: 4.65999e-06 [assign_add_opt]: 3.63e-06 [ForceFp32Comm]: 3.06999e-06 [remove_cast_before_assign_add]: 3.41001e-06 [full_micro_interleaved_order_control]: 4.42e-06 [reorder_send_recv_between_fp_bp]: 5.13002e-06 [comm_op_add_attrs]: 3.46001e-06 [add_comm_op_reuse_tag]: 3.3e-06 [interleave_split_concat_branches]: 3.45e-06 [interleave_parallel_branches]: 3.33e-06 [overlap_opt_shard_in_pipeline]: 3.76999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.23001e-06 [control_data_broadcast_order]: 1.698e-05 [grouped_pairwise_exchange_alltoall]: 3.93001e-06 [offloading_packed_experts]: 7.2e-06 [overlap_recompute_and_grad_model_parallel]: 7.38999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.56999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.68e-06 [overlap_recompute_comm]: 4.51002e-06 [overlap_grad_ring_attention]: 7.23e-06 [overlap_grad_flash_sp]: 2.299e-05 [begin_end_overlap_inline]: 3.28998e-06 [split_matmul_comm_elemetwise]: 4.63999e-06 [split_layernorm_comm]: 4.09002e-06 [handle_group_info]: 3.14999e-06 [symbol_engine_optimizer]: 0.00010526, [1] [Cycle 1]: 9.849e-05, [6] [build]: 3.01001e-06 [elim_shapecalc]: 1.184e-05 [elim_not_effective]: 1.615e-05 [opt_reshape]: 9.02999e-06 [fold_const_symbol]: 1.297e-05 [renormalize]: 2.30008e-07 [detach_backward]: 2.76999e-06 [pipeline_parallel_scheduler]: 1.54998e-06 [auto_monad_reorder]: 2.189e-05 [get_jit_bprop_graph]: 1.14e-06 [rewriter_after_jit_bprop_graph]: 4.16001e-06 [opt_after_jit_grad]: 0.00047726 [validate]: 3.861e-05 Sums bootstrap : 0.000427s : 3.73% type_inference : 0.005729s : 50.05% event_method : 0.000020s : 0.18% auto_monad : 0.000060s : 0.53% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000032s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.26% optimize.rewriter_before_opt_a : 0.000095s : 0.83% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000056s : 0.49% optimize.opt_a.loop_unroll : 0.000044s : 0.38% optimize.opt_a.a_1 : 0.001045s : 9.13% optimize.opt_a.with_stream_mark : 0.000026s : 0.23% optimize.opt_a.recompute_prepare : 0.000019s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000266s : 2.32% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.15% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000020s : 0.18% optimize.opt_a.merge_send_recv : 0.000015s : 0.13% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000021s : 0.18% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.17% optimize.opt_a.virtual_dataset : 0.000016s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.14% optimize.opt_a.virtual_output : 0.000016s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000028s : 0.25% optimize.opt_a.a_after_grad : 0.000026s : 0.23% optimize.opt_a.renormalize : 0.000579s : 5.06% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.22% optimize.opt_a.cse : 0.000051s : 0.44% optimize.opt_a.a_3 : 0.000138s : 1.20% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000043s : 0.38% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000473s : 4.13% optimize.opt_b.b_1 : 0.000222s : 1.94% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000021s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000025s : 0.22% optimize.loop_unroll : 0.000448s : 3.91% optimize.opt_after_cconv.c_1 : 0.000042s : 0.37% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.16% optimize.tuple_transform.d_1 : 0.000058s : 0.50% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000057s : 0.50% optimize.cse_after_recomputation.cse : 0.000015s : 0.13% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000023s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.19% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000477s : 4.17% validate : 0.000039s : 0.34% Time group info: ------[substitution.] 0.000210 44 9.79% : 0.000021s : 3: substitution.cast_eliminate 1.11% : 0.000002s : 3: substitution.elim_not_effective 0.75% : 0.000002s : 3: substitution.fold_const_symbol 3.26% : 0.000007s : 6: substitution.graph_param_transform 66.14% : 0.000139s : 4: substitution.inline 1.95% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.95% : 0.000006s : 6: substitution.remove_not_recompute_node 2.58% : 0.000005s : 6: substitution.replace_old_param 8.79% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.68% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005682 2 86.75% : 0.004929s : 1: type_inference.infer 13.25% : 0.000753s : 1: type_inference.specialize ------[replace.] 0.000070 10 49.92% : 0.000035s : 4: replace.inline 50.08% : 0.000035s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000152 10 89.81% : 0.000136s : 4: match.inline 10.19% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000299 1908 0.92% : 0.000003s : 20: predicate.accumulaten_eliminater 0.58% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.53% : 0.000002s : 12: predicate.addn_check_dump 0.89% : 0.000003s : 20: predicate.addn_zero_filter 0.83% : 0.000002s : 20: predicate.adjust_all_reduce_mul_add 1.86% : 0.000006s : 32: predicate.arithmetic_simplify 0.96% : 0.000003s : 20: predicate.cast_eliminate 0.58% : 0.000002s : 12: predicate.check_bprop_eliminate 0.51% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.54% : 0.000002s : 12: predicate.depend_value_elim 0.96% : 0.000003s : 20: predicate.dict_get_item_const_eliminator 1.06% : 0.000003s : 20: predicate.dict_get_item_eliminator 0.88% : 0.000003s : 20: predicate.dict_set_item_eliminator 0.76% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 6: predicate.elim_not_effective 0.34% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000003s : 26: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.10% : 0.000003s : 26: predicate.environ_get_depend_swap 1.73% : 0.000005s : 38: predicate.environ_get_eliminate 1.12% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.37% : 0.000004s : 30: predicate.exchange_switch_depend_value 2.30% : 0.000007s : 30: predicate.float_depend_g_call 0.54% : 0.000002s : 12: predicate.float_environ_get_switch 0.78% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 6: predicate.fold_const_symbol 0.63% : 0.000002s : 12: predicate.get_grad_eliminate 0.20% : 0.000001s : 6: predicate.graph_param_transform 0.57% : 0.000002s : 12: predicate.incorporate_call 0.49% : 0.000001s : 12: predicate.incorporate_call_switch 10.85% : 0.000032s : 86: predicate.inline 0.74% : 0.000002s : 12: predicate.inline_without_move 0.35% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.72% : 0.000002s : 12: predicate.less_batch_normalization 1.80% : 0.000005s : 38: predicate.list_to_tuple_eliminator_ 2.60% : 0.000008s : 58: predicate.load_eliminater 0.81% : 0.000002s : 6: predicate.loop_unroll_after_grad 1.95% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.60% : 0.000005s : 32: predicate.make_slice_get_slice_eliminator 0.56% : 0.000002s : 12: predicate.merge_addn 0.58% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.63% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.85% : 0.000003s : 20: predicate.minmaximum_grad 0.66% : 0.000002s : 6: predicate.mutable_eliminate 0.33% : 0.000001s : 6: predicate.opt_reshape 0.32% : 0.000001s : 6: predicate.parallel_virtual_node 1.77% : 0.000005s : 30: predicate.partial_defer_inline 1.73% : 0.000005s : 32: predicate.partial_eliminate 0.90% : 0.000003s : 20: predicate.print_const_string_wrapper 0.55% : 0.000002s : 12: predicate.reduce_all_const_elim 1.16% : 0.000003s : 20: predicate.reduce_eliminate 2.54% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 12: predicate.remove_not_recompute_node 1.36% : 0.000004s : 38: predicate.replace_applicator 0.40% : 0.000001s : 12: predicate.replace_old_param 0.22% : 0.000001s : 6: predicate.reset_defer_inline 0.95% : 0.000003s : 20: predicate.reshape_eliminate 0.60% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 6: predicate.row_tensor_eliminate 0.74% : 0.000002s : 12: predicate.same_eliminate 0.44% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.68% : 0.000002s : 12: predicate.shard_identity_eliminate 0.61% : 0.000002s : 12: predicate.special_op_eliminate 0.67% : 0.000002s : 12: predicate.specialize_transform 0.78% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.73% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.52% : 0.000005s : 30: predicate.switch_defer_inline 2.00% : 0.000006s : 42: predicate.switch_layer_defer_inline 4.53% : 0.000014s : 90: predicate.switch_simplify 0.86% : 0.000003s : 20: predicate.tile_eliminate 0.89% : 0.000003s : 20: predicate.transpose_eliminate 1.53% : 0.000005s : 32: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000005s : 32: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000004s : 32: predicate.tuple_list_get_item_depend_reorder 3.05% : 0.000009s : 50: predicate.tuple_list_get_item_eliminator 1.57% : 0.000005s : 32: predicate.tuple_list_get_set_item_eliminator 2.15% : 0.000006s : 44: predicate.tuple_list_set_item_eliminator 1.73% : 0.000005s : 38: predicate.tuple_to_list_eliminator_ 2.49% : 0.000007s : 58: predicate.updatestate_pure_node_eliminater 3.21% : 0.000010s : 70: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 6: predicate.value_based_eliminate 0.57% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.60% : 0.000002s : 12: predicate.virtual_output_eliminate 0.30% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000517 11 54.76% : 0.000283s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.24% : 0.000234s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027227 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.86% : 0.002956s : 1: add_attr 10.81% : 0.002944s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000060s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000070s : 1: auto_monad 0.11% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.73% : 0.000470s : 1: bootstrap 0.10% : 0.000029s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000017s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000030s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.05% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.66% : 0.000453s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.76% : 0.000478s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 5.95% : 0.001619s : 78: opt.transform.opt_a 0.15% : 0.000041s : 1: opt.transform.opt_after_cconv 0.11% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.59% : 0.000159s : 28: opt.transform.opt_b 0.24% : 0.000065s : 2: opt.transform.opt_trans_graph 0.17% : 0.000047s : 4: opt.transform.symbol_engine_opt 12.44% : 0.003387s : 1: opt_a 0.54% : 0.000146s : 1: opt_after_cconv 1.79% : 0.000487s : 1: opt_after_jit_grad 1.23% : 0.000334s : 1: opt_b 22.32% : 0.006076s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000040s : 1: pre_auto_parallel 0.12% : 0.000034s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.10% : 0.000299s : 1: renormalize.infer 1.00% : 0.000272s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000047s : 1: rewriter_after_opt_a 0.36% : 0.000099s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000108s : 1: symbol_engine_optimizer 0.40% : 0.000108s : 1: tuple_transform 21.16% : 0.005760s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:09.449.165 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0151653, [21] [bootstrap]: 0.000443 [type_inference]: 0.00571638 [event_method]: 2.03e-05 [auto_monad]: 6.218e-05 [graph_reusing]: 5.47001e-06 [inline]: 2.15002e-06 [add_attr]: 0.00298118, [1] [add_attr_with_inline]: 0.00297286, [1] [Cycle 1]: 5.425e-05, [2] [tag_attr]: 2.017e-05 [meta_addattr_fg_expand]: 6.54999e-06 [parallel-infer-symbol]: 3.01001e-06 [pre_auto_parallel]: 3.297e-05 [insert-virtual-dataset]: 3.16001e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 1.59e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.00519692, [53] [py_interpret_to_execute]: 2.742e-05 [rewriter_before_opt_a]: 9.169e-05 [opt_a]: 0.00310764, [2] [Cycle 1]: 0.00229273, [45] [expand_dump_flag]: 3.3e-06 [switch_simplify]: 4.711e-05 [loop_unroll]: 3.529e-05 [a_1]: 0.00082898 [with_stream_mark]: 1.531e-05 [recompute_prepare]: 9.89999e-06 [updatestate_depend_eliminate]: 4.50999e-06 [updatestate_assign_eliminate]: 4.18001e-06 [updatestate_loads_eliminate]: 3.83999e-06 [parameter_eliminate]: 1.79e-06 [a_2]: 0.00011131 [accelerated_algorithm]: 8.75999e-06 [shard]: 1.78002e-06 [meta_shard_fg_expand]: 2.34999e-06 [shard_inline]: 8.32e-06 [merge_send_recv]: 9.32999e-06 [auto_parallel]: 7.05998e-06 [parallel]: 1.782e-05 [flash_sp]: 7.79002e-06 [merge_comm]: 4.84e-06 [allreduce_fusion]: 4.4e-06 [matmul_add_comm_reduction]: 9.86e-06 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 9.94001e-06 [virtual_dataset]: 8.50001e-06 [get_grad_eliminate_]: 8.35001e-06 [virtual_output]: 8.31002e-06 [merge_forward]: 4.29002e-06 [cell_reuse_recompute_pass]: 1.20999e-06 [offload_activation]: 1.048e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.586e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.316e-05 [set_forward_comm_id_for_comm_node_pass]: 4.32e-06 [meta_fg_expand]: 3.63e-06 [flash_sp_send_recv_attached]: 2.60002e-06 [receive_attached]: 1.95001e-06 [after_resolve]: 1.448e-05 [a_after_grad]: 1.493e-05 [renormalize]: 0.00063942 [add_forward_monad_depend]: 5.59e-06 [auto_monad_grad]: 2.19001e-06 [auto_monad_eliminator]: 1.66e-05 [cse]: 3.32e-05 [a_3]: 6.072e-05 [Cycle 2]: 0.0008055, [45] [expand_dump_flag]: 1.12e-06 [switch_simplify]: 9.78998e-06 [loop_unroll]: 8.39998e-06 [a_1]: 0.0002015 [with_stream_mark]: 1.149e-05 [recompute_prepare]: 8.35001e-06 [updatestate_depend_eliminate]: 3.89002e-06 [updatestate_assign_eliminate]: 3.09001e-06 [updatestate_loads_eliminate]: 3.11001e-06 [parameter_eliminate]: 1.14e-06 [a_2]: 0.00010179 [accelerated_algorithm]: 8.12e-06 [shard]: 1.08001e-06 [meta_shard_fg_expand]: 1.72999e-06 [shard_inline]: 7.88999e-06 [merge_send_recv]: 5.72999e-06 [auto_parallel]: 6.28e-06 [parallel]: 4.63999e-06 [flash_sp]: 3.56999e-06 [merge_comm]: 4.4e-06 [allreduce_fusion]: 4.18999e-06 [matmul_add_comm_reduction]: 6.36998e-06 [allreduce_slice_to_reducescatter]: 3.60014e-07 [virtual_shard_identity]: 8.70001e-06 [virtual_dataset]: 7.9e-06 [get_grad_eliminate_]: 7.75998e-06 [virtual_output]: 7.46001e-06 [merge_forward]: 3.44001e-06 [cell_reuse_recompute_pass]: 1.33002e-06 [offload_activation]: 6.99001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.548e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 1.195e-05 [set_forward_comm_id_for_comm_node_pass]: 4.08999e-06 [meta_fg_expand]: 2.87002e-06 [flash_sp_send_recv_attached]: 8.2e-07 [receive_attached]: 1.00999e-06 [after_resolve]: 1.357e-05 [a_after_grad]: 1.213e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.22e-06 [auto_monad_grad]: 1.02998e-06 [auto_monad_eliminator]: 8.12e-06 [cse]: 1.943e-05 [a_3]: 5.065e-05 [py_interpret_to_execute_after_opt_a]: 1.007e-05 [slice_cell_reuse_recomputed_activation]: 1.87999e-06 [rewriter_after_opt_a]: 3.695e-05 [convert_after_rewriter]: 7.98001e-06 [order_py_execute_after_rewriter]: 5.69999e-06 [mutable_eliminate]: 0.00046689 [opt_b]: 0.00026659, [1] [Cycle 1]: 0.00026068, [7] [b_1]: 0.00017905 [b_2]: 9.97999e-06 [updatestate_depend_eliminate]: 5.87999e-06 [updatestate_assign_eliminate]: 3.86999e-06 [updatestate_loads_eliminate]: 3.14999e-06 [renormalize]: 3.50003e-07 [cse]: 2.255e-05 [optimize_parallel_all_gather_comm]: 1.838e-05 [overlap_param_gather]: 2.00002e-06 [cconv]: 2.393e-05 [loop_unroll]: 0.00041984 [opt_after_cconv]: 0.00011921, [1] [Cycle 1]: 0.00011384, [7] [c_1]: 4.182e-05 [parameter_eliminate]: 2.61999e-06 [updatestate_depend_eliminate]: 5.52001e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 3.13e-06 [cse]: 2.315e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.597e-05 [tuple_transform]: 9.221e-05, [1] [Cycle 1]: 8.721e-05, [4] [d_1]: 5.783e-05 [none_parameter_eliminate]: 1.58002e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 8.76002e-06 [partial_unused_args_eliminate]: 1.66e-06 [add_recomputation]: 5.164e-05 [cse_after_recomputation]: 2.55e-05, [1] [Cycle 1]: 2.107e-05, [1] [cse]: 1.569e-05 [environ_conv]: 6.20002e-06 [swap_dp_allreduce_reducescatter]: 6.09001e-06 [bias_add_comm_swap]: 2.66e-06 [label_micro_interleaved_index]: 3.91999e-06 [label_fine_grained_interleaved_index]: 2.77002e-06 [merge_cast_opt]: 1.30001e-06 [slice_recompute_activation]: 1.92001e-06 [micro_interleaved_order_control]: 2.81e-06 [assign_add_opt]: 1.14e-06 [ForceFp32Comm]: 8.00006e-07 [remove_cast_before_assign_add]: 9.79984e-07 [full_micro_interleaved_order_control]: 2.42001e-06 [reorder_send_recv_between_fp_bp]: 2.80002e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.30013e-07 [interleave_split_concat_branches]: 1.41998e-06 [interleave_parallel_branches]: 1.17e-06 [overlap_opt_shard_in_pipeline]: 1.20001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.04999e-06 [control_data_broadcast_order]: 1.485e-05 [grouped_pairwise_exchange_alltoall]: 1.71e-06 [offloading_packed_experts]: 4.22e-06 [overlap_recompute_and_grad_model_parallel]: 4.74e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34998e-06 [overlap_recompute_comm]: 2.44001e-06 [overlap_grad_ring_attention]: 4.89998e-06 [overlap_grad_flash_sp]: 1.959e-05 [begin_end_overlap_inline]: 5.40022e-07 [split_matmul_comm_elemetwise]: 2.37999e-06 [split_layernorm_comm]: 1.52999e-06 [handle_group_info]: 8.80013e-07 [symbol_engine_optimizer]: 8.437e-05, [1] [Cycle 1]: 8.019e-05, [6] [build]: 3.2e-06 [elim_shapecalc]: 1.204e-05 [elim_not_effective]: 1.554e-05 [opt_reshape]: 8.94003e-06 [fold_const_symbol]: 1.304e-05 [renormalize]: 2.20025e-07 [detach_backward]: 2.36998e-06 [pipeline_parallel_scheduler]: 1.66e-06 [auto_monad_reorder]: 1.912e-05 [get_jit_bprop_graph]: 1.37e-06 [rewriter_after_jit_bprop_graph]: 3.46001e-06 [opt_after_jit_grad]: 0.00048633 [validate]: 3.989e-05 Sums bootstrap : 0.000443s : 3.95% type_inference : 0.005716s : 50.92% event_method : 0.000020s : 0.18% auto_monad : 0.000062s : 0.55% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.29% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000027s : 0.24% optimize.rewriter_before_opt_a : 0.000092s : 0.82% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000057s : 0.51% optimize.opt_a.loop_unroll : 0.000044s : 0.39% optimize.opt_a.a_1 : 0.001030s : 9.18% optimize.opt_a.with_stream_mark : 0.000027s : 0.24% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000213s : 1.90% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000016s : 0.14% optimize.opt_a.merge_send_recv : 0.000015s : 0.13% optimize.opt_a.auto_parallel : 0.000013s : 0.12% optimize.opt_a.parallel : 0.000022s : 0.20% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.17% optimize.opt_a.virtual_dataset : 0.000016s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.14% optimize.opt_a.virtual_output : 0.000016s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.07% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000028s : 0.25% optimize.opt_a.a_after_grad : 0.000027s : 0.24% optimize.opt_a.renormalize : 0.000639s : 5.70% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.22% optimize.opt_a.cse : 0.000053s : 0.47% optimize.opt_a.a_3 : 0.000111s : 0.99% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000037s : 0.33% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000467s : 4.16% optimize.opt_b.b_1 : 0.000179s : 1.59% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.21% optimize.loop_unroll : 0.000420s : 3.74% optimize.opt_after_cconv.c_1 : 0.000042s : 0.37% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.21% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.14% optimize.tuple_transform.d_1 : 0.000058s : 0.52% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000052s : 0.46% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.03% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000020s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.17% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.03% opt_after_jit_grad : 0.000486s : 4.33% validate : 0.000040s : 0.36% Time group info: ------[substitution.] 0.000213 44 8.77% : 0.000019s : 3: substitution.cast_eliminate 1.02% : 0.000002s : 3: substitution.elim_not_effective 0.76% : 0.000002s : 3: substitution.fold_const_symbol 3.19% : 0.000007s : 6: substitution.graph_param_transform 67.48% : 0.000144s : 4: substitution.inline 1.81% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.28% : 0.000007s : 6: substitution.remove_not_recompute_node 2.48% : 0.000005s : 6: substitution.replace_old_param 8.48% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.72% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005660 2 86.97% : 0.004923s : 1: type_inference.infer 13.03% : 0.000737s : 1: type_inference.specialize ------[replace.] 0.000072 10 52.24% : 0.000037s : 4: replace.inline 47.76% : 0.000034s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000156 10 90.20% : 0.000141s : 4: match.inline 9.80% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000283 1908 0.93% : 0.000003s : 20: predicate.accumulaten_eliminater 0.70% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.54% : 0.000002s : 12: predicate.addn_check_dump 0.96% : 0.000003s : 20: predicate.addn_zero_filter 0.88% : 0.000002s : 20: predicate.adjust_all_reduce_mul_add 2.02% : 0.000006s : 32: predicate.arithmetic_simplify 1.09% : 0.000003s : 20: predicate.cast_eliminate 0.59% : 0.000002s : 12: predicate.check_bprop_eliminate 0.56% : 0.000002s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.64% : 0.000002s : 12: predicate.depend_value_elim 1.01% : 0.000003s : 20: predicate.dict_get_item_const_eliminator 1.08% : 0.000003s : 20: predicate.dict_get_item_eliminator 0.91% : 0.000003s : 20: predicate.dict_set_item_eliminator 0.87% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 6: predicate.elim_not_effective 0.35% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 26: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 26: predicate.environ_get_depend_swap 1.83% : 0.000005s : 38: predicate.environ_get_eliminate 1.19% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.48% : 0.000004s : 30: predicate.exchange_switch_depend_value 2.32% : 0.000007s : 30: predicate.float_depend_g_call 0.53% : 0.000002s : 12: predicate.float_environ_get_switch 0.79% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 6: predicate.fold_const_symbol 0.73% : 0.000002s : 12: predicate.get_grad_eliminate 0.20% : 0.000001s : 6: predicate.graph_param_transform 0.60% : 0.000002s : 12: predicate.incorporate_call 0.52% : 0.000001s : 12: predicate.incorporate_call_switch 6.07% : 0.000017s : 86: predicate.inline 0.80% : 0.000002s : 12: predicate.inline_without_move 0.35% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.73% : 0.000002s : 12: predicate.less_batch_normalization 1.80% : 0.000005s : 38: predicate.list_to_tuple_eliminator_ 2.79% : 0.000008s : 58: predicate.load_eliminater 0.73% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.10% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.67% : 0.000005s : 32: predicate.make_slice_get_slice_eliminator 0.56% : 0.000002s : 12: predicate.merge_addn 0.58% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.61% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.89% : 0.000003s : 20: predicate.minmaximum_grad 0.75% : 0.000002s : 6: predicate.mutable_eliminate 0.35% : 0.000001s : 6: predicate.opt_reshape 0.56% : 0.000002s : 6: predicate.parallel_virtual_node 1.74% : 0.000005s : 30: predicate.partial_defer_inline 1.84% : 0.000005s : 32: predicate.partial_eliminate 1.02% : 0.000003s : 20: predicate.print_const_string_wrapper 0.58% : 0.000002s : 12: predicate.reduce_all_const_elim 1.22% : 0.000003s : 20: predicate.reduce_eliminate 2.71% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 12: predicate.remove_not_recompute_node 1.42% : 0.000004s : 38: predicate.replace_applicator 0.51% : 0.000001s : 12: predicate.replace_old_param 0.25% : 0.000001s : 6: predicate.reset_defer_inline 0.99% : 0.000003s : 20: predicate.reshape_eliminate 0.59% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 6: predicate.row_tensor_eliminate 0.68% : 0.000002s : 12: predicate.same_eliminate 0.43% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 12: predicate.shard_identity_eliminate 0.75% : 0.000002s : 12: predicate.special_op_eliminate 0.72% : 0.000002s : 12: predicate.specialize_transform 0.80% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.57% : 0.000004s : 30: predicate.switch_defer_inline 2.11% : 0.000006s : 42: predicate.switch_layer_defer_inline 4.85% : 0.000014s : 90: predicate.switch_simplify 0.92% : 0.000003s : 20: predicate.tile_eliminate 1.14% : 0.000003s : 20: predicate.transpose_eliminate 1.51% : 0.000004s : 32: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000005s : 32: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000004s : 32: predicate.tuple_list_get_item_depend_reorder 3.21% : 0.000009s : 50: predicate.tuple_list_get_item_eliminator 1.55% : 0.000004s : 32: predicate.tuple_list_get_set_item_eliminator 2.19% : 0.000006s : 44: predicate.tuple_list_set_item_eliminator 1.80% : 0.000005s : 38: predicate.tuple_to_list_eliminator_ 2.67% : 0.000008s : 58: predicate.updatestate_pure_node_eliminater 3.24% : 0.000009s : 70: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 6: predicate.value_based_eliminate 0.64% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 12: predicate.virtual_output_eliminate 0.28% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000523 11 54.57% : 0.000286s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.43% : 0.000238s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025827 192 0.01% : 0.000003s : 1: ForceFp32Comm 11.56% : 0.002986s : 1: add_attr 11.52% : 0.002976s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.21% : 0.000055s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.26% : 0.000068s : 1: auto_monad 0.09% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.82% : 0.000471s : 1: bootstrap 0.11% : 0.000027s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000028s : 1: cse_after_recomputation 0.02% : 0.000004s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.10% : 0.000026s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000007s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.66% : 0.000428s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 1.84% : 0.000475s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 6.18% : 0.001597s : 78: opt.transform.opt_a 0.16% : 0.000041s : 1: opt.transform.opt_after_cconv 0.12% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.61% : 0.000158s : 28: opt.transform.opt_b 0.25% : 0.000064s : 2: opt.transform.opt_trans_graph 0.18% : 0.000045s : 4: opt.transform.symbol_engine_opt 12.04% : 0.003111s : 1: opt_a 0.48% : 0.000123s : 1: opt_after_cconv 1.92% : 0.000495s : 1: opt_after_jit_grad 1.05% : 0.000270s : 1: opt_b 20.14% : 0.005201s : 1: optimize 0.08% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000037s : 1: pre_auto_parallel 0.12% : 0.000031s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000019s : 1: remove_dup_value 1.33% : 0.000344s : 1: renormalize.infer 1.12% : 0.000288s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000041s : 1: rewriter_after_opt_a 0.37% : 0.000096s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000087s : 1: symbol_engine_optimizer 0.37% : 0.000095s : 1: tuple_transform 22.19% : 0.005730s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:09.634.760 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:09.635.023 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0174988, [21] [bootstrap]: 0.00044755 [type_inference]: 0.00615766 [event_method]: 2.163e-05 [auto_monad]: 6.344e-05 [graph_reusing]: 5.61e-06 [inline]: 3.06999e-06 [add_attr]: 0.00323601, [1] [add_attr_with_inline]: 0.00322411, [1] [Cycle 1]: 8.717e-05, [2] [tag_attr]: 2.255e-05 [meta_addattr_fg_expand]: 6.31e-06 [parallel-infer-symbol]: 3.51999e-06 [pre_auto_parallel]: 3.889e-05 [insert-virtual-dataset]: 2.31998e-06 [parallel-infer-symbol-second]: 9.09989e-07 [dataset_repeat_opt]: 2.16e-06 [pipeline_split]: 1.86998e-06 [optimize]: 0.00620045, [53] [py_interpret_to_execute]: 5.068e-05 [rewriter_before_opt_a]: 9.911e-05 [opt_a]: 0.00351817, [2] [Cycle 1]: 0.00254121, [45] [expand_dump_flag]: 3.21999e-06 [switch_simplify]: 4.908e-05 [loop_unroll]: 3.62e-05 [a_1]: 0.00075051 [with_stream_mark]: 2.527e-05 [recompute_prepare]: 1.25e-05 [updatestate_depend_eliminate]: 4.99998e-06 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 3.18e-06 [parameter_eliminate]: 2.54001e-06 [a_2]: 0.00012133 [accelerated_algorithm]: 9.29998e-06 [shard]: 2.92002e-06 [meta_shard_fg_expand]: 2.73e-06 [shard_inline]: 7.11001e-06 [merge_send_recv]: 8.60001e-06 [auto_parallel]: 8.23999e-06 [parallel]: 1.99e-05 [flash_sp]: 1.068e-05 [merge_comm]: 4.35e-06 [allreduce_fusion]: 3.5e-06 [matmul_add_comm_reduction]: 1.024e-05 [allreduce_slice_to_reducescatter]: 1.08001e-06 [virtual_shard_identity]: 1.142e-05 [virtual_dataset]: 7.87e-06 [get_grad_eliminate_]: 7.58001e-06 [virtual_output]: 7.65998e-06 [merge_forward]: 4.02e-06 [cell_reuse_recompute_pass]: 2.03002e-06 [offload_activation]: 1.076e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.875e-05 [merge_recompute_call_nodes]: 1.57001e-06 [before_grad]: 1.099e-05 [set_forward_comm_id_for_comm_node_pass]: 4.15e-06 [meta_fg_expand]: 3.14999e-06 [flash_sp_send_recv_attached]: 4.70999e-06 [receive_attached]: 2.36e-06 [after_resolve]: 1.398e-05 [a_after_grad]: 1.145e-05 [renormalize]: 0.00073868 [add_forward_monad_depend]: 7.28e-06 [auto_monad_grad]: 2.88e-06 [auto_monad_eliminator]: 1.837e-05 [cse]: 3.315e-05 [a_3]: 7.056e-05 [Cycle 2]: 0.0009595, [45] [expand_dump_flag]: 2.73e-06 [switch_simplify]: 9.59e-06 [loop_unroll]: 7.19001e-06 [a_1]: 0.00015854 [with_stream_mark]: 1.737e-05 [recompute_prepare]: 7.88999e-06 [updatestate_depend_eliminate]: 3.93001e-06 [updatestate_assign_eliminate]: 3.33e-06 [updatestate_loads_eliminate]: 2.29001e-06 [parameter_eliminate]: 1.69e-06 [a_2]: 0.00011136 [accelerated_algorithm]: 7.52002e-06 [shard]: 1.53002e-06 [meta_shard_fg_expand]: 1.60999e-06 [shard_inline]: 7.83001e-06 [merge_send_recv]: 7.00998e-06 [auto_parallel]: 6.66e-06 [parallel]: 8.44002e-06 [flash_sp]: 3.24001e-06 [merge_comm]: 4.12e-06 [allreduce_fusion]: 3.13e-06 [matmul_add_comm_reduction]: 9.05999e-06 [allreduce_slice_to_reducescatter]: 7.10017e-07 [virtual_shard_identity]: 8.79e-06 [virtual_dataset]: 7.15998e-06 [get_grad_eliminate_]: 6.79001e-06 [virtual_output]: 7.02002e-06 [merge_forward]: 4.47e-06 [cell_reuse_recompute_pass]: 2.25002e-06 [offload_activation]: 1.035e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.157e-05 [merge_recompute_call_nodes]: 1.24e-06 [before_grad]: 1.195e-05 [set_forward_comm_id_for_comm_node_pass]: 4.48001e-06 [meta_fg_expand]: 2.49999e-06 [flash_sp_send_recv_attached]: 1.30001e-06 [receive_attached]: 2.31998e-06 [after_resolve]: 1.408e-05 [a_after_grad]: 1.12e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 3.85e-06 [auto_monad_grad]: 1.82001e-06 [auto_monad_eliminator]: 1.146e-05 [cse]: 2.164e-05 [a_3]: 5.711e-05 [py_interpret_to_execute_after_opt_a]: 1.838e-05 [slice_cell_reuse_recomputed_activation]: 5.49e-06 [rewriter_after_opt_a]: 5.069e-05 [convert_after_rewriter]: 1.018e-05 [order_py_execute_after_rewriter]: 9.10999e-06 [mutable_eliminate]: 0.00065086 [opt_b]: 0.00032544, [1] [Cycle 1]: 0.00031484, [7] [b_1]: 0.00020073 [b_2]: 9.36998e-06 [updatestate_depend_eliminate]: 9.68002e-06 [updatestate_assign_eliminate]: 3.21001e-06 [updatestate_loads_eliminate]: 2.98e-06 [renormalize]: 7.89994e-07 [cse]: 2.829e-05 [optimize_parallel_all_gather_comm]: 2.392e-05 [overlap_param_gather]: 5.46e-06 [cconv]: 3.563e-05 [loop_unroll]: 0.00050286 [opt_after_cconv]: 0.00014794, [1] [Cycle 1]: 0.00013794, [7] [c_1]: 3.843e-05 [parameter_eliminate]: 5.35001e-06 [updatestate_depend_eliminate]: 7.50998e-06 [updatestate_assign_eliminate]: 2.44999e-06 [updatestate_loads_eliminate]: 2.40002e-06 [cse]: 2.246e-05 [renormalize]: 6.00005e-07 [remove_dup_value]: 1.839e-05 [tuple_transform]: 0.00010673, [1] [Cycle 1]: 9.835e-05, [4] [d_1]: 5.324e-05 [none_parameter_eliminate]: 2.39999e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 8.44002e-06 [partial_unused_args_eliminate]: 4.62e-06 [add_recomputation]: 5.734e-05 [cse_after_recomputation]: 3.012e-05, [1] [Cycle 1]: 2.302e-05, [1] [cse]: 1.275e-05 [environ_conv]: 9.46e-06 [swap_dp_allreduce_reducescatter]: 8.43999e-06 [bias_add_comm_swap]: 5.39e-06 [label_micro_interleaved_index]: 7.92e-06 [label_fine_grained_interleaved_index]: 5.77001e-06 [merge_cast_opt]: 3.84002e-06 [slice_recompute_activation]: 4.57998e-06 [micro_interleaved_order_control]: 4.63999e-06 [assign_add_opt]: 3.98001e-06 [ForceFp32Comm]: 3.32002e-06 [remove_cast_before_assign_add]: 3.46999e-06 [full_micro_interleaved_order_control]: 4.84e-06 [reorder_send_recv_between_fp_bp]: 5.07999e-06 [comm_op_add_attrs]: 3.59002e-06 [add_comm_op_reuse_tag]: 3.37002e-06 [interleave_split_concat_branches]: 3.58999e-06 [interleave_parallel_branches]: 3.57002e-06 [overlap_opt_shard_in_pipeline]: 4.15e-06 [overlap_opt_shard_grad_in_pipeline]: 3.97e-06 [control_data_broadcast_order]: 1.868e-05 [grouped_pairwise_exchange_alltoall]: 4.05998e-06 [offloading_packed_experts]: 6.41e-06 [overlap_recompute_and_grad_model_parallel]: 7.44002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.55e-06 [overlap_recompute_allgather_and_fa_grad]: 3.61999e-06 [overlap_recompute_comm]: 4.90001e-06 [overlap_grad_ring_attention]: 6.52001e-06 [overlap_grad_flash_sp]: 2.722e-05 [begin_end_overlap_inline]: 3.02002e-06 [split_matmul_comm_elemetwise]: 4.55999e-06 [split_layernorm_comm]: 3.97998e-06 [handle_group_info]: 3.14001e-06 [symbol_engine_optimizer]: 0.00010857, [1] [Cycle 1]: 0.0001008, [6] [build]: 4.57e-06 [elim_shapecalc]: 1.284e-05 [elim_not_effective]: 1.364e-05 [opt_reshape]: 8.22998e-06 [fold_const_symbol]: 1.093e-05 [renormalize]: 2.30008e-07 [detach_backward]: 4.31002e-06 [pipeline_parallel_scheduler]: 2.10002e-06 [auto_monad_reorder]: 2.778e-05 [get_jit_bprop_graph]: 1.90001e-06 [rewriter_after_jit_bprop_graph]: 6.25002e-06 [opt_after_jit_grad]: 0.00060787 [validate]: 4.69e-05 Sums bootstrap : 0.000448s : 3.62% type_inference : 0.006158s : 49.80% event_method : 0.000022s : 0.17% auto_monad : 0.000063s : 0.51% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000039s : 0.31% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000051s : 0.41% optimize.rewriter_before_opt_a : 0.000099s : 0.80% optimize.opt_a.expand_dump_flag : 0.000006s : 0.05% optimize.opt_a.switch_simplify : 0.000059s : 0.47% optimize.opt_a.loop_unroll : 0.000043s : 0.35% optimize.opt_a.a_1 : 0.000909s : 7.35% optimize.opt_a.with_stream_mark : 0.000043s : 0.34% optimize.opt_a.recompute_prepare : 0.000020s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000233s : 1.88% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.14% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000015s : 0.12% optimize.opt_a.merge_send_recv : 0.000016s : 0.13% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000028s : 0.23% optimize.opt_a.flash_sp : 0.000014s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.16% optimize.opt_a.virtual_dataset : 0.000015s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.12% optimize.opt_a.virtual_output : 0.000015s : 0.12% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000040s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000023s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.05% optimize.opt_a.receive_attached : 0.000005s : 0.04% optimize.opt_a.after_resolve : 0.000028s : 0.23% optimize.opt_a.a_after_grad : 0.000023s : 0.18% optimize.opt_a.renormalize : 0.000739s : 5.98% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.09% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.24% optimize.opt_a.cse : 0.000055s : 0.44% optimize.opt_a.a_3 : 0.000128s : 1.03% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.15% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000051s : 0.41% optimize.convert_after_rewriter : 0.000010s : 0.08% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000651s : 5.26% optimize.opt_b.b_1 : 0.000201s : 1.62% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000028s : 0.23% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000036s : 0.29% optimize.loop_unroll : 0.000503s : 4.07% optimize.opt_after_cconv.c_1 : 0.000038s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000022s : 0.18% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.15% optimize.tuple_transform.d_1 : 0.000053s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000057s : 0.46% optimize.cse_after_recomputation.cse : 0.000013s : 0.10% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000008s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000019s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000006s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000027s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000005s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000028s : 0.22% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000608s : 4.92% validate : 0.000047s : 0.38% Time group info: ------[substitution.] 0.000216 34 0.95% : 0.000002s : 2: substitution.elim_not_effective 0.62% : 0.000001s : 2: substitution.fold_const_symbol 2.96% : 0.000006s : 5: substitution.graph_param_transform 75.87% : 0.000164s : 4: substitution.inline 2.07% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.50% : 0.000005s : 4: substitution.remove_not_recompute_node 3.09% : 0.000007s : 6: substitution.replace_old_param 9.00% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.92% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006097 2 86.33% : 0.005264s : 1: type_inference.infer 13.67% : 0.000833s : 1: type_inference.specialize ------[replace.] 0.000072 10 54.12% : 0.000039s : 4: replace.inline 45.88% : 0.000033s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000177 10 90.59% : 0.000161s : 4: match.inline 9.41% : 0.000017s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000247 1590 0.84% : 0.000002s : 16: predicate.accumulaten_eliminater 0.94% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 10: predicate.addn_check_dump 0.86% : 0.000002s : 16: predicate.addn_zero_filter 0.78% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 1.96% : 0.000005s : 26: predicate.arithmetic_simplify 0.85% : 0.000002s : 16: predicate.cast_eliminate 0.54% : 0.000001s : 10: predicate.check_bprop_eliminate 0.49% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.56% : 0.000001s : 10: predicate.depend_value_elim 0.87% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 0.97% : 0.000002s : 16: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 16: predicate.dict_set_item_eliminator 1.19% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.36% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.01% : 0.000002s : 21: predicate.environ_get_add_eliminate 0.99% : 0.000002s : 21: predicate.environ_get_depend_swap 1.76% : 0.000004s : 31: predicate.environ_get_eliminate 1.02% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.46% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.56% : 0.000006s : 26: predicate.float_depend_g_call 0.48% : 0.000001s : 10: predicate.float_environ_get_switch 0.72% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.61% : 0.000002s : 10: predicate.get_grad_eliminate 0.27% : 0.000001s : 5: predicate.graph_param_transform 0.65% : 0.000002s : 10: predicate.incorporate_call 0.53% : 0.000001s : 10: predicate.incorporate_call_switch 6.59% : 0.000016s : 72: predicate.inline 0.97% : 0.000002s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.72% : 0.000002s : 10: predicate.less_batch_normalization 1.84% : 0.000005s : 32: predicate.list_to_tuple_eliminator_ 2.46% : 0.000006s : 48: predicate.load_eliminater 1.03% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.38% : 0.000006s : 40: predicate.loop_unroll_before_grad 1.53% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 10: predicate.merge_addn 0.51% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.81% : 0.000002s : 16: predicate.minmaximum_grad 1.54% : 0.000004s : 5: predicate.mutable_eliminate 0.32% : 0.000001s : 5: predicate.opt_reshape 0.38% : 0.000001s : 5: predicate.parallel_virtual_node 1.68% : 0.000004s : 26: predicate.partial_defer_inline 1.70% : 0.000004s : 27: predicate.partial_eliminate 0.84% : 0.000002s : 16: predicate.print_const_string_wrapper 0.55% : 0.000001s : 10: predicate.reduce_all_const_elim 1.02% : 0.000003s : 16: predicate.reduce_eliminate 2.49% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.55% : 0.000001s : 10: predicate.remove_not_recompute_node 1.50% : 0.000004s : 32: predicate.replace_applicator 0.49% : 0.000001s : 10: predicate.replace_old_param 0.51% : 0.000001s : 5: predicate.reset_defer_inline 0.86% : 0.000002s : 16: predicate.reshape_eliminate 0.63% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 5: predicate.row_tensor_eliminate 0.78% : 0.000002s : 10: predicate.same_eliminate 0.67% : 0.000002s : 10: predicate.set_cell_output_no_recompute 0.98% : 0.000002s : 10: predicate.shard_identity_eliminate 0.82% : 0.000002s : 10: predicate.special_op_eliminate 0.76% : 0.000002s : 10: predicate.specialize_transform 0.86% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.52% : 0.000004s : 26: predicate.switch_defer_inline 2.13% : 0.000005s : 36: predicate.switch_layer_defer_inline 5.56% : 0.000014s : 81: predicate.switch_simplify 0.83% : 0.000002s : 16: predicate.tile_eliminate 0.86% : 0.000002s : 16: predicate.transpose_eliminate 1.41% : 0.000003s : 26: predicate.tuple_list_convert_item_index_to_positive 1.47% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000003s : 26: predicate.tuple_list_get_item_depend_reorder 3.57% : 0.000009s : 42: predicate.tuple_list_get_item_eliminator 1.49% : 0.000004s : 26: predicate.tuple_list_get_set_item_eliminator 2.05% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.70% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.41% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.09% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 5: predicate.value_based_eliminate 0.70% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.74% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000604 11 53.07% : 0.000321s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.93% : 0.000284s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.029289 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.08% : 0.003245s : 1: add_attr 11.02% : 0.003228s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000062s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.25% : 0.000072s : 1: auto_monad 0.12% : 0.000036s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.68% : 0.000493s : 1: bootstrap 0.13% : 0.000039s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000022s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.11% : 0.000034s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.09% : 0.000026s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000032s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000009s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.74% : 0.000511s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 2.25% : 0.000660s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000023s : 1: opt.transform.mutable_eliminate 4.84% : 0.001419s : 78: opt.transform.opt_a 0.13% : 0.000037s : 1: opt.transform.opt_after_cconv 0.12% : 0.000037s : 1: opt.transform.opt_after_jit_grad 0.46% : 0.000135s : 28: opt.transform.opt_b 0.20% : 0.000059s : 2: opt.transform.opt_trans_graph 0.14% : 0.000042s : 4: opt.transform.symbol_engine_opt 12.03% : 0.003522s : 1: opt_a 0.52% : 0.000152s : 1: opt_after_cconv 2.12% : 0.000620s : 1: opt_after_jit_grad 1.13% : 0.000330s : 1: opt_b 22.38% : 0.006555s : 1: optimize 0.09% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.10% : 0.000030s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000009s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.16% : 0.000047s : 1: pre_auto_parallel 0.19% : 0.000056s : 1: py_interpret_to_execute 0.08% : 0.000023s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.29% : 0.000378s : 1: renormalize.infer 1.19% : 0.000349s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000056s : 1: rewriter_after_opt_a 0.35% : 0.000104s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000112s : 1: symbol_engine_optimizer 0.38% : 0.000110s : 1: tuple_transform 21.16% : 0.006198s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:09.832.317 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0148554, [21] [bootstrap]: 0.00043792 [type_inference]: 0.00580544 [event_method]: 1.982e-05 [auto_monad]: 6.073e-05 [graph_reusing]: 5.71998e-06 [inline]: 2.02001e-06 [add_attr]: 0.00301798, [1] [add_attr_with_inline]: 0.00300849, [1] [Cycle 1]: 5.984e-05, [2] [tag_attr]: 2.037e-05 [meta_addattr_fg_expand]: 6.07999e-06 [parallel-infer-symbol]: 3.28998e-06 [pre_auto_parallel]: 4.017e-05 [insert-virtual-dataset]: 2.26e-06 [parallel-infer-symbol-second]: 8.39995e-07 [dataset_repeat_opt]: 2.07999e-06 [pipeline_split]: 1.71e-06 [optimize]: 0.00479022, [53] [py_interpret_to_execute]: 2.79e-05 [rewriter_before_opt_a]: 8.737e-05 [opt_a]: 0.00278448, [2] [Cycle 1]: 0.00204332, [45] [expand_dump_flag]: 2.94999e-06 [switch_simplify]: 4.739e-05 [loop_unroll]: 3.488e-05 [a_1]: 0.00071314 [with_stream_mark]: 1.7e-05 [recompute_prepare]: 9.96998e-06 [updatestate_depend_eliminate]: 3.98001e-06 [updatestate_assign_eliminate]: 3.53e-06 [updatestate_loads_eliminate]: 2.91999e-06 [parameter_eliminate]: 1.92999e-06 [a_2]: 9.055e-05 [accelerated_algorithm]: 7.43999e-06 [shard]: 1.89e-06 [meta_shard_fg_expand]: 1.75001e-06 [shard_inline]: 7.06999e-06 [merge_send_recv]: 8.01001e-06 [auto_parallel]: 6.23e-06 [parallel]: 1.725e-05 [flash_sp]: 8.17e-06 [merge_comm]: 4.03999e-06 [allreduce_fusion]: 3.27997e-06 [matmul_add_comm_reduction]: 9.61e-06 [allreduce_slice_to_reducescatter]: 8.00006e-07 [virtual_shard_identity]: 8.77e-06 [virtual_dataset]: 7.48e-06 [get_grad_eliminate_]: 6.93998e-06 [virtual_output]: 7.11001e-06 [merge_forward]: 3.93001e-06 [cell_reuse_recompute_pass]: 1.45001e-06 [offload_activation]: 1.025e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.355e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.133e-05 [set_forward_comm_id_for_comm_node_pass]: 3.81999e-06 [meta_fg_expand]: 2.73e-06 [flash_sp_send_recv_attached]: 2.64999e-06 [receive_attached]: 2.00002e-06 [after_resolve]: 1.381e-05 [a_after_grad]: 1.163e-05 [renormalize]: 0.00058307 [add_forward_monad_depend]: 5.53002e-06 [auto_monad_grad]: 2.39999e-06 [auto_monad_eliminator]: 1.421e-05 [cse]: 3.157e-05 [a_3]: 5.209e-05 [Cycle 2]: 0.00073149, [45] [expand_dump_flag]: 1.49e-06 [switch_simplify]: 8.94e-06 [loop_unroll]: 6.93e-06 [a_1]: 0.00015735 [with_stream_mark]: 1.213e-05 [recompute_prepare]: 7.52002e-06 [updatestate_depend_eliminate]: 3.28e-06 [updatestate_assign_eliminate]: 2.87002e-06 [updatestate_loads_eliminate]: 2.11e-06 [parameter_eliminate]: 1.08001e-06 [a_2]: 0.00010446 [accelerated_algorithm]: 7.14001e-06 [shard]: 1.14003e-06 [meta_shard_fg_expand]: 1.62001e-06 [shard_inline]: 6.74999e-06 [merge_send_recv]: 5.20001e-06 [auto_parallel]: 5.77999e-06 [parallel]: 4.86002e-06 [flash_sp]: 3.34001e-06 [merge_comm]: 3.46999e-06 [allreduce_fusion]: 2.82002e-06 [matmul_add_comm_reduction]: 6.09001e-06 [allreduce_slice_to_reducescatter]: 5.00004e-07 [virtual_shard_identity]: 7.8e-06 [virtual_dataset]: 6.59001e-06 [get_grad_eliminate_]: 6.56999e-06 [virtual_output]: 6.46e-06 [merge_forward]: 2.67001e-06 [cell_reuse_recompute_pass]: 1.53002e-06 [offload_activation]: 6.61e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.641e-05 [merge_recompute_call_nodes]: 8.89995e-07 [before_grad]: 9.96e-06 [set_forward_comm_id_for_comm_node_pass]: 3.6e-06 [meta_fg_expand]: 2.17001e-06 [flash_sp_send_recv_attached]: 1.30001e-06 [receive_attached]: 1.17e-06 [after_resolve]: 1.304e-05 [a_after_grad]: 1.047e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.51998e-06 [auto_monad_grad]: 1.13001e-06 [auto_monad_eliminator]: 7.33e-06 [cse]: 1.624e-05 [a_3]: 4.074e-05 [py_interpret_to_execute_after_opt_a]: 1.064e-05 [slice_cell_reuse_recomputed_activation]: 2.26998e-06 [rewriter_after_opt_a]: 3.418e-05 [convert_after_rewriter]: 7.06999e-06 [order_py_execute_after_rewriter]: 5.00001e-06 [mutable_eliminate]: 0.00048735 [opt_b]: 0.00022768, [1] [Cycle 1]: 0.00022091, [7] [b_1]: 0.00014502 [b_2]: 8.68001e-06 [updatestate_depend_eliminate]: 5.51998e-06 [updatestate_assign_eliminate]: 2.39001e-06 [updatestate_loads_eliminate]: 2.69999e-06 [renormalize]: 5.19998e-07 [cse]: 2e-05 [optimize_parallel_all_gather_comm]: 1.596e-05 [overlap_param_gather]: 1.86e-06 [cconv]: 2.554e-05 [loop_unroll]: 0.00041325 [opt_after_cconv]: 0.00010617, [1] [Cycle 1]: 0.00010017, [7] [c_1]: 3.569e-05 [parameter_eliminate]: 3.11001e-06 [updatestate_depend_eliminate]: 5.05001e-06 [updatestate_assign_eliminate]: 2.43e-06 [updatestate_loads_eliminate]: 2.27999e-06 [cse]: 1.759e-05 [renormalize]: 3.19997e-07 [remove_dup_value]: 1.382e-05 [tuple_transform]: 8.193e-05, [1] [Cycle 1]: 7.753e-05, [4] [d_1]: 4.928e-05 [none_parameter_eliminate]: 1.57001e-06 [renormalize]: 2.9002e-07 [switch_simplify]: 7.79002e-06 [partial_unused_args_eliminate]: 1.72001e-06 [add_recomputation]: 4.751e-05 [cse_after_recomputation]: 2.136e-05, [1] [Cycle 1]: 1.69e-05, [1] [cse]: 1.155e-05 [environ_conv]: 5.20999e-06 [swap_dp_allreduce_reducescatter]: 5.54998e-06 [bias_add_comm_swap]: 2.59999e-06 [label_micro_interleaved_index]: 3.85e-06 [label_fine_grained_interleaved_index]: 2.46e-06 [merge_cast_opt]: 1.27e-06 [slice_recompute_activation]: 1.96e-06 [micro_interleaved_order_control]: 2.27001e-06 [assign_add_opt]: 1.12e-06 [ForceFp32Comm]: 8.10018e-07 [remove_cast_before_assign_add]: 1.05001e-06 [full_micro_interleaved_order_control]: 1.88002e-06 [reorder_send_recv_between_fp_bp]: 2.96001e-06 [comm_op_add_attrs]: 9.49978e-07 [add_comm_op_reuse_tag]: 9.29984e-07 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.04998e-06 [overlap_opt_shard_in_pipeline]: 1.53002e-06 [overlap_opt_shard_grad_in_pipeline]: 1.94e-06 [control_data_broadcast_order]: 1.277e-05 [grouped_pairwise_exchange_alltoall]: 1.67999e-06 [offloading_packed_experts]: 3.7e-06 [overlap_recompute_and_grad_model_parallel]: 4.71002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.18001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.39999e-06 [overlap_grad_ring_attention]: 4.55001e-06 [overlap_grad_flash_sp]: 1.937e-05 [begin_end_overlap_inline]: 4.69998e-07 [split_matmul_comm_elemetwise]: 2.09999e-06 [split_layernorm_comm]: 1.77999e-06 [handle_group_info]: 1.22999e-06 [symbol_engine_optimizer]: 7.614e-05, [1] [Cycle 1]: 7.203e-05, [6] [build]: 2.94001e-06 [elim_shapecalc]: 1.023e-05 [elim_not_effective]: 1.297e-05 [opt_reshape]: 7.46999e-06 [fold_const_symbol]: 1.122e-05 [renormalize]: 2.50002e-07 [detach_backward]: 2.10002e-06 [pipeline_parallel_scheduler]: 1.46998e-06 [auto_monad_reorder]: 1.821e-05 [get_jit_bprop_graph]: 1.60999e-06 [rewriter_after_jit_bprop_graph]: 4.04002e-06 [opt_after_jit_grad]: 0.00045674 [validate]: 3.718e-05 Sums bootstrap : 0.000438s : 4.02% type_inference : 0.005805s : 53.27% event_method : 0.000020s : 0.18% auto_monad : 0.000061s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000040s : 0.37% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000028s : 0.26% optimize.rewriter_before_opt_a : 0.000087s : 0.80% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000056s : 0.52% optimize.opt_a.loop_unroll : 0.000042s : 0.38% optimize.opt_a.a_1 : 0.000870s : 7.99% optimize.opt_a.with_stream_mark : 0.000029s : 0.27% optimize.opt_a.recompute_prepare : 0.000017s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000195s : 1.79% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.13% optimize.opt_a.merge_send_recv : 0.000013s : 0.12% optimize.opt_a.auto_parallel : 0.000012s : 0.11% optimize.opt_a.parallel : 0.000022s : 0.20% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000006s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.15% optimize.opt_a.virtual_dataset : 0.000014s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.12% optimize.opt_a.virtual_output : 0.000014s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.25% optimize.opt_a.a_after_grad : 0.000022s : 0.20% optimize.opt_a.renormalize : 0.000583s : 5.35% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.20% optimize.opt_a.cse : 0.000048s : 0.44% optimize.opt_a.a_3 : 0.000093s : 0.85% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000034s : 0.31% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000487s : 4.47% optimize.opt_b.b_1 : 0.000145s : 1.33% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000020s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000026s : 0.23% optimize.loop_unroll : 0.000413s : 3.79% optimize.opt_after_cconv.c_1 : 0.000036s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.16% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.13% optimize.tuple_transform.d_1 : 0.000049s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000048s : 0.44% optimize.cse_after_recomputation.cse : 0.000012s : 0.11% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000019s : 0.18% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000018s : 0.17% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000457s : 4.19% validate : 0.000037s : 0.34% Time group info: ------[substitution.] 0.000192 34 1.05% : 0.000002s : 2: substitution.elim_not_effective 0.86% : 0.000002s : 2: substitution.fold_const_symbol 3.24% : 0.000006s : 5: substitution.graph_param_transform 74.30% : 0.000143s : 4: substitution.inline 2.00% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.71% : 0.000007s : 4: substitution.remove_not_recompute_node 2.65% : 0.000005s : 6: substitution.replace_old_param 9.28% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.92% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005747 2 87.53% : 0.005030s : 1: type_inference.infer 12.47% : 0.000717s : 1: type_inference.specialize ------[replace.] 0.000067 10 53.84% : 0.000036s : 4: replace.inline 46.16% : 0.000031s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000155 10 90.24% : 0.000140s : 4: match.inline 9.76% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000251 1590 0.81% : 0.000002s : 16: predicate.accumulaten_eliminater 0.62% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 10: predicate.addn_check_dump 0.82% : 0.000002s : 16: predicate.addn_zero_filter 0.78% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 1.82% : 0.000005s : 26: predicate.arithmetic_simplify 0.92% : 0.000002s : 16: predicate.cast_eliminate 0.59% : 0.000001s : 10: predicate.check_bprop_eliminate 0.46% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.54% : 0.000001s : 10: predicate.depend_value_elim 0.87% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 0.91% : 0.000002s : 16: predicate.dict_get_item_eliminator 0.82% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.77% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 0.44% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.02% : 0.000003s : 21: predicate.environ_add_const_eliminate 0.98% : 0.000002s : 21: predicate.environ_get_add_eliminate 0.98% : 0.000002s : 21: predicate.environ_get_depend_swap 1.56% : 0.000004s : 31: predicate.environ_get_eliminate 1.00% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.39% : 0.000003s : 26: predicate.exchange_switch_depend_value 2.23% : 0.000006s : 26: predicate.float_depend_g_call 0.48% : 0.000001s : 10: predicate.float_environ_get_switch 0.77% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.62% : 0.000002s : 10: predicate.get_grad_eliminate 0.29% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000001s : 10: predicate.incorporate_call 0.44% : 0.000001s : 10: predicate.incorporate_call_switch 5.60% : 0.000014s : 72: predicate.inline 0.68% : 0.000002s : 10: predicate.inline_without_move 0.31% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.66% : 0.000002s : 10: predicate.less_batch_normalization 1.73% : 0.000004s : 32: predicate.list_to_tuple_eliminator_ 2.40% : 0.000006s : 48: predicate.load_eliminater 0.72% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.14% : 0.000005s : 40: predicate.loop_unroll_before_grad 1.68% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.48% : 0.000001s : 10: predicate.merge_addn 0.51% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.77% : 0.000002s : 16: predicate.minmaximum_grad 0.76% : 0.000002s : 5: predicate.mutable_eliminate 0.31% : 0.000001s : 5: predicate.opt_reshape 0.31% : 0.000001s : 5: predicate.parallel_virtual_node 1.65% : 0.000004s : 26: predicate.partial_defer_inline 1.68% : 0.000004s : 27: predicate.partial_eliminate 0.80% : 0.000002s : 16: predicate.print_const_string_wrapper 0.59% : 0.000001s : 10: predicate.reduce_all_const_elim 1.07% : 0.000003s : 16: predicate.reduce_eliminate 2.42% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.54% : 0.000001s : 10: predicate.remove_not_recompute_node 1.50% : 0.000004s : 32: predicate.replace_applicator 0.47% : 0.000001s : 10: predicate.replace_old_param 0.32% : 0.000001s : 5: predicate.reset_defer_inline 0.86% : 0.000002s : 16: predicate.reshape_eliminate 0.55% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.29% : 0.000001s : 5: predicate.row_tensor_eliminate 0.84% : 0.000002s : 10: predicate.same_eliminate 0.50% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 10: predicate.shard_identity_eliminate 0.65% : 0.000002s : 10: predicate.special_op_eliminate 0.63% : 0.000002s : 10: predicate.specialize_transform 0.71% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.70% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.49% : 0.000004s : 26: predicate.switch_defer_inline 1.96% : 0.000005s : 36: predicate.switch_layer_defer_inline 4.88% : 0.000012s : 81: predicate.switch_simplify 0.80% : 0.000002s : 16: predicate.tile_eliminate 0.82% : 0.000002s : 16: predicate.transpose_eliminate 1.35% : 0.000003s : 26: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.27% : 0.000003s : 26: predicate.tuple_list_get_item_depend_reorder 2.89% : 0.000007s : 42: predicate.tuple_list_get_item_eliminator 1.36% : 0.000003s : 26: predicate.tuple_list_get_set_item_eliminator 10.48% : 0.000026s : 36: predicate.tuple_list_set_item_eliminator 1.65% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.39% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 2.89% : 0.000007s : 58: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.66% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.60% : 0.000002s : 10: predicate.virtual_output_eliminate 0.23% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.33% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000545 11 56.46% : 0.000308s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.54% : 0.000237s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024802 192 0.01% : 0.000003s : 1: ForceFp32Comm 12.19% : 0.003023s : 1: add_attr 12.15% : 0.003012s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.21% : 0.000051s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.27% : 0.000066s : 1: auto_monad 0.09% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.88% : 0.000467s : 1: bootstrap 0.12% : 0.000029s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.10% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.10% : 0.000026s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000005s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.70% : 0.000421s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.00% : 0.000495s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000016s : 1: opt.transform.mutable_eliminate 5.53% : 0.001372s : 78: opt.transform.opt_a 0.14% : 0.000034s : 1: opt.transform.opt_after_cconv 0.11% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.50% : 0.000123s : 28: opt.transform.opt_b 0.22% : 0.000055s : 2: opt.transform.opt_trans_graph 0.15% : 0.000038s : 4: opt.transform.symbol_engine_opt 11.24% : 0.002787s : 1: opt_a 0.44% : 0.000109s : 1: opt_after_cconv 1.88% : 0.000465s : 1: opt_after_jit_grad 0.93% : 0.000231s : 1: opt_b 19.33% : 0.004794s : 1: optimize 0.08% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000023s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.18% : 0.000045s : 1: pre_auto_parallel 0.13% : 0.000032s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000017s : 1: remove_dup_value 1.19% : 0.000294s : 1: renormalize.infer 1.13% : 0.000280s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000038s : 1: rewriter_after_opt_a 0.37% : 0.000091s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000079s : 1: symbol_engine_optimizer 0.34% : 0.000085s : 1: tuple_transform 23.47% : 0.005820s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:10.295.81 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:10.298.59 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0158758, [21] [bootstrap]: 0.00043858 [type_inference]: 0.0057986 [event_method]: 1.987e-05 [auto_monad]: 6.213e-05 [graph_reusing]: 5.53997e-06 [inline]: 2.30002e-06 [add_attr]: 0.00303404, [1] [add_attr_with_inline]: 0.00302541, [1] [Cycle 1]: 7.299e-05, [2] [tag_attr]: 2.026e-05 [meta_addattr_fg_expand]: 6.11998e-06 [parallel-infer-symbol]: 3.40003e-06 [pre_auto_parallel]: 3.546e-05 [insert-virtual-dataset]: 2.22001e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 1.78002e-06 [pipeline_split]: 1.59998e-06 [optimize]: 0.00538471, [53] [py_interpret_to_execute]: 3.709e-05 [rewriter_before_opt_a]: 9.363e-05 [opt_a]: 0.00309779, [2] [Cycle 1]: 0.00221463, [45] [expand_dump_flag]: 2.88998e-06 [switch_simplify]: 4.821e-05 [loop_unroll]: 3.453e-05 [a_1]: 0.00072114 [with_stream_mark]: 1.669e-05 [recompute_prepare]: 9.24998e-06 [updatestate_depend_eliminate]: 3.73001e-06 [updatestate_assign_eliminate]: 3.50998e-06 [updatestate_loads_eliminate]: 2.83998e-06 [parameter_eliminate]: 1.85001e-06 [a_2]: 0.00011769 [accelerated_algorithm]: 7.9e-06 [shard]: 1.71002e-06 [meta_shard_fg_expand]: 1.87001e-06 [shard_inline]: 7.38999e-06 [merge_send_recv]: 8.97999e-06 [auto_parallel]: 6.39999e-06 [parallel]: 1.863e-05 [flash_sp]: 8.52e-06 [merge_comm]: 4.28999e-06 [allreduce_fusion]: 3.58e-06 [matmul_add_comm_reduction]: 9.24e-06 [allreduce_slice_to_reducescatter]: 8.50006e-07 [virtual_shard_identity]: 8.44998e-06 [virtual_dataset]: 7.35e-06 [get_grad_eliminate_]: 7.08998e-06 [virtual_output]: 7e-06 [merge_forward]: 3.61999e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 9.29e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.552e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.093e-05 [set_forward_comm_id_for_comm_node_pass]: 3.73999e-06 [meta_fg_expand]: 2.76e-06 [flash_sp_send_recv_attached]: 2.73e-06 [receive_attached]: 1.96e-06 [after_resolve]: 1.392e-05 [a_after_grad]: 1.141e-05 [renormalize]: 0.00056543 [add_forward_monad_depend]: 5.27001e-06 [auto_monad_grad]: 2.52001e-06 [auto_monad_eliminator]: 1.486e-05 [cse]: 3.096e-05 [a_3]: 6.491e-05 [Cycle 2]: 0.0008706, [45] [expand_dump_flag]: 1.32999e-06 [switch_simplify]: 8.64e-06 [loop_unroll]: 7.25e-06 [a_1]: 0.00015774 [with_stream_mark]: 1.229e-05 [recompute_prepare]: 7.36001e-06 [updatestate_depend_eliminate]: 3.26999e-06 [updatestate_assign_eliminate]: 2.81e-06 [updatestate_loads_eliminate]: 2.24999e-06 [parameter_eliminate]: 9.90025e-07 [a_2]: 0.00010775 [accelerated_algorithm]: 7.04001e-06 [shard]: 1.32999e-06 [meta_shard_fg_expand]: 1.59e-06 [shard_inline]: 6.84999e-06 [merge_send_recv]: 5.01997e-06 [auto_parallel]: 5.87999e-06 [parallel]: 4.67e-06 [flash_sp]: 3.35998e-06 [merge_comm]: 3.78001e-06 [allreduce_fusion]: 3.30998e-06 [matmul_add_comm_reduction]: 5.81e-06 [allreduce_slice_to_reducescatter]: 5.50004e-07 [virtual_shard_identity]: 7.21001e-06 [virtual_dataset]: 6.79999e-06 [get_grad_eliminate_]: 6.85998e-06 [virtual_output]: 6.49001e-06 [merge_forward]: 2.81e-06 [cell_reuse_recompute_pass]: 1.45999e-06 [offload_activation]: 6.36e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.602e-05 [merge_recompute_call_nodes]: 8.50006e-07 [before_grad]: 9.82001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.25002e-06 [meta_fg_expand]: 2.18002e-06 [flash_sp_send_recv_attached]: 7.80012e-07 [receive_attached]: 1.69e-06 [after_resolve]: 1.338e-05 [a_after_grad]: 1.109e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.47999e-06 [auto_monad_grad]: 8.80013e-07 [auto_monad_eliminator]: 7.25998e-06 [cse]: 1.411e-05 [a_3]: 5.257e-05 [py_interpret_to_execute_after_opt_a]: 1.309e-05 [slice_cell_reuse_recomputed_activation]: 5.14e-06 [rewriter_after_opt_a]: 3.758e-05 [convert_after_rewriter]: 9.81e-06 [order_py_execute_after_rewriter]: 8.23001e-06 [mutable_eliminate]: 0.00050983 [opt_b]: 0.0002961, [1] [Cycle 1]: 0.00028655, [7] [b_1]: 0.00019172 [b_2]: 8.75999e-06 [updatestate_depend_eliminate]: 5.44998e-06 [updatestate_assign_eliminate]: 2.64999e-06 [updatestate_loads_eliminate]: 2.89999e-06 [renormalize]: 3.69997e-07 [cse]: 1.869e-05 [optimize_parallel_all_gather_comm]: 1.963e-05 [overlap_param_gather]: 4.95001e-06 [cconv]: 2.768e-05 [loop_unroll]: 0.00042614 [opt_after_cconv]: 0.00012846, [1] [Cycle 1]: 0.00012001, [7] [c_1]: 3.572e-05 [parameter_eliminate]: 2.77002e-06 [updatestate_depend_eliminate]: 4.94003e-06 [updatestate_assign_eliminate]: 2.63e-06 [updatestate_loads_eliminate]: 2.41e-06 [cse]: 1.743e-05 [renormalize]: 3.60014e-07 [remove_dup_value]: 1.572e-05 [tuple_transform]: 9.489e-05, [1] [Cycle 1]: 8.815e-05, [4] [d_1]: 4.916e-05 [none_parameter_eliminate]: 1.65001e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 7.7e-06 [partial_unused_args_eliminate]: 4.90999e-06 [add_recomputation]: 4.952e-05 [cse_after_recomputation]: 2.684e-05, [1] [Cycle 1]: 2.033e-05, [1] [cse]: 1.159e-05 [environ_conv]: 8.92e-06 [swap_dp_allreduce_reducescatter]: 7.71999e-06 [bias_add_comm_swap]: 4.66002e-06 [label_micro_interleaved_index]: 6.51e-06 [label_fine_grained_interleaved_index]: 5.19e-06 [merge_cast_opt]: 3.68999e-06 [slice_recompute_activation]: 4.50999e-06 [micro_interleaved_order_control]: 4.68999e-06 [assign_add_opt]: 3.51001e-06 [ForceFp32Comm]: 3.18e-06 [remove_cast_before_assign_add]: 3.28e-06 [full_micro_interleaved_order_control]: 4.52998e-06 [reorder_send_recv_between_fp_bp]: 5.25001e-06 [comm_op_add_attrs]: 3.7e-06 [add_comm_op_reuse_tag]: 3.48e-06 [interleave_split_concat_branches]: 3.68e-06 [interleave_parallel_branches]: 3.51001e-06 [overlap_opt_shard_in_pipeline]: 3.87002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.41002e-06 [control_data_broadcast_order]: 1.49e-05 [grouped_pairwise_exchange_alltoall]: 4.33999e-06 [offloading_packed_experts]: 6.14001e-06 [overlap_recompute_and_grad_model_parallel]: 7.03e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.46001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.56999e-06 [overlap_recompute_comm]: 4.73001e-06 [overlap_grad_ring_attention]: 6.59999e-06 [overlap_grad_flash_sp]: 2.082e-05 [begin_end_overlap_inline]: 2.92002e-06 [split_matmul_comm_elemetwise]: 4.28999e-06 [split_layernorm_comm]: 4.13001e-06 [handle_group_info]: 3.31001e-06 [symbol_engine_optimizer]: 9.621e-05, [1] [Cycle 1]: 8.95e-05, [6] [build]: 2.98e-06 [elim_shapecalc]: 1.02e-05 [elim_not_effective]: 1.3e-05 [opt_reshape]: 7.78999e-06 [fold_const_symbol]: 1.079e-05 [renormalize]: 2.09984e-07 [detach_backward]: 3.54002e-06 [pipeline_parallel_scheduler]: 1.77001e-06 [auto_monad_reorder]: 2.024e-05 [get_jit_bprop_graph]: 1.02998e-06 [rewriter_after_jit_bprop_graph]: 4.41002e-06 [opt_after_jit_grad]: 0.00047662 [validate]: 3.519e-05 Sums bootstrap : 0.000439s : 3.93% type_inference : 0.005799s : 51.96% event_method : 0.000020s : 0.18% auto_monad : 0.000062s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000035s : 0.32% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000037s : 0.33% optimize.rewriter_before_opt_a : 0.000094s : 0.84% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000057s : 0.51% optimize.opt_a.loop_unroll : 0.000042s : 0.37% optimize.opt_a.a_1 : 0.000879s : 7.88% optimize.opt_a.with_stream_mark : 0.000029s : 0.26% optimize.opt_a.recompute_prepare : 0.000017s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000225s : 2.02% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.13% optimize.opt_a.auto_parallel : 0.000012s : 0.11% optimize.opt_a.parallel : 0.000023s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.14% optimize.opt_a.virtual_dataset : 0.000014s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000013s : 0.12% optimize.opt_a.merge_forward : 0.000006s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000016s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.24% optimize.opt_a.a_after_grad : 0.000022s : 0.20% optimize.opt_a.renormalize : 0.000566s : 5.07% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.20% optimize.opt_a.cse : 0.000045s : 0.40% optimize.opt_a.a_3 : 0.000117s : 1.05% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.05% optimize.rewriter_after_opt_a : 0.000038s : 0.34% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000510s : 4.57% optimize.opt_b.b_1 : 0.000192s : 1.72% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000019s : 0.17% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000028s : 0.25% optimize.loop_unroll : 0.000426s : 3.82% optimize.opt_after_cconv.c_1 : 0.000036s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.16% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.14% optimize.tuple_transform.d_1 : 0.000049s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000050s : 0.44% optimize.cse_after_recomputation.cse : 0.000012s : 0.10% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000015s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000006s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000021s : 0.19% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000477s : 4.27% validate : 0.000035s : 0.32% Time group info: ------[substitution.] 0.000196 34 0.98% : 0.000002s : 2: substitution.elim_not_effective 0.67% : 0.000001s : 2: substitution.fold_const_symbol 3.05% : 0.000006s : 5: substitution.graph_param_transform 76.33% : 0.000149s : 4: substitution.inline 1.66% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.49% : 0.000005s : 4: substitution.remove_not_recompute_node 2.73% : 0.000005s : 6: substitution.replace_old_param 9.24% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.85% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005749 2 87.00% : 0.005002s : 1: type_inference.infer 13.00% : 0.000747s : 1: type_inference.specialize ------[replace.] 0.000069 10 53.91% : 0.000037s : 4: replace.inline 46.09% : 0.000032s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000162 10 90.44% : 0.000147s : 4: match.inline 9.56% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000232 1590 0.91% : 0.000002s : 16: predicate.accumulaten_eliminater 0.63% : 0.000001s : 5: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 10: predicate.addn_check_dump 0.92% : 0.000002s : 16: predicate.addn_zero_filter 0.85% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.15% : 0.000005s : 26: predicate.arithmetic_simplify 0.96% : 0.000002s : 16: predicate.cast_eliminate 0.63% : 0.000001s : 10: predicate.check_bprop_eliminate 0.53% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.68% : 0.000002s : 10: predicate.depend_value_elim 0.95% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.00% : 0.000002s : 16: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.41% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.09% : 0.000003s : 21: predicate.environ_get_depend_swap 1.71% : 0.000004s : 31: predicate.environ_get_eliminate 1.08% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.55% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.45% : 0.000006s : 26: predicate.float_depend_g_call 0.53% : 0.000001s : 10: predicate.float_environ_get_switch 0.77% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.74% : 0.000002s : 10: predicate.get_grad_eliminate 0.29% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000001s : 10: predicate.incorporate_call 0.46% : 0.000001s : 10: predicate.incorporate_call_switch 6.15% : 0.000014s : 72: predicate.inline 0.80% : 0.000002s : 10: predicate.inline_without_move 0.35% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.78% : 0.000002s : 10: predicate.less_batch_normalization 1.92% : 0.000004s : 32: predicate.list_to_tuple_eliminator_ 2.63% : 0.000006s : 48: predicate.load_eliminater 0.79% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.44% : 0.000006s : 40: predicate.loop_unroll_before_grad 1.55% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 10: predicate.merge_addn 0.53% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.69% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 16: predicate.minmaximum_grad 0.92% : 0.000002s : 5: predicate.mutable_eliminate 0.35% : 0.000001s : 5: predicate.opt_reshape 0.39% : 0.000001s : 5: predicate.parallel_virtual_node 1.78% : 0.000004s : 26: predicate.partial_defer_inline 1.80% : 0.000004s : 27: predicate.partial_eliminate 0.94% : 0.000002s : 16: predicate.print_const_string_wrapper 0.54% : 0.000001s : 10: predicate.reduce_all_const_elim 1.12% : 0.000003s : 16: predicate.reduce_eliminate 2.58% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.73% : 0.000002s : 10: predicate.remove_not_recompute_node 1.51% : 0.000003s : 32: predicate.replace_applicator 0.73% : 0.000002s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 0.95% : 0.000002s : 16: predicate.reshape_eliminate 0.59% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 5: predicate.row_tensor_eliminate 0.77% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.70% : 0.000002s : 10: predicate.shard_identity_eliminate 0.68% : 0.000002s : 10: predicate.special_op_eliminate 0.70% : 0.000002s : 10: predicate.specialize_transform 0.77% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.59% : 0.000004s : 26: predicate.switch_defer_inline 2.18% : 0.000005s : 36: predicate.switch_layer_defer_inline 5.24% : 0.000012s : 81: predicate.switch_simplify 0.86% : 0.000002s : 16: predicate.tile_eliminate 0.95% : 0.000002s : 16: predicate.transpose_eliminate 1.49% : 0.000003s : 26: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000003s : 26: predicate.tuple_list_get_item_depend_reorder 3.22% : 0.000007s : 42: predicate.tuple_list_get_item_eliminator 1.47% : 0.000003s : 26: predicate.tuple_list_get_set_item_eliminator 2.08% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.95% : 0.000005s : 32: predicate.tuple_to_list_eliminator_ 2.56% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.19% : 0.000007s : 58: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 5: predicate.value_based_eliminate 0.61% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000564 11 54.38% : 0.000307s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.62% : 0.000257s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026407 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.52% : 0.003043s : 1: add_attr 11.47% : 0.003029s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.20% : 0.000053s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000070s : 1: auto_monad 0.10% : 0.000028s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000007s : 1: bias_add_comm_swap 1.82% : 0.000481s : 1: bootstrap 0.12% : 0.000031s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.11% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000017s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000030s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000009s : 1: label_micro_interleaved_index 1.64% : 0.000432s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.95% : 0.000516s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.05% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 5.14% : 0.001356s : 78: opt.transform.opt_a 0.13% : 0.000034s : 1: opt.transform.opt_after_cconv 0.11% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.49% : 0.000129s : 28: opt.transform.opt_b 0.21% : 0.000055s : 2: opt.transform.opt_trans_graph 0.15% : 0.000039s : 4: opt.transform.symbol_engine_opt 11.74% : 0.003101s : 1: opt_a 0.50% : 0.000132s : 1: opt_after_cconv 1.84% : 0.000486s : 1: opt_after_jit_grad 1.13% : 0.000300s : 1: opt_b 21.57% : 0.005696s : 1: optimize 0.09% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000024s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.16% : 0.000043s : 1: pre_auto_parallel 0.16% : 0.000041s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.07% : 0.000283s : 1: renormalize.infer 1.04% : 0.000275s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000041s : 1: rewriter_after_opt_a 0.37% : 0.000097s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.37% : 0.000099s : 1: symbol_engine_optimizer 0.37% : 0.000098s : 1: tuple_transform 22.09% : 0.005832s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:10.221.842 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.014558, [21] [bootstrap]: 0.00043071 [type_inference]: 0.00564005 [event_method]: 1.995e-05 [auto_monad]: 6.007e-05 [graph_reusing]: 5.29998e-06 [inline]: 2.43e-06 [add_attr]: 0.00298324, [1] [add_attr_with_inline]: 0.0029753, [1] [Cycle 1]: 5.571e-05, [2] [tag_attr]: 2.017e-05 [meta_addattr_fg_expand]: 6.08998e-06 [parallel-infer-symbol]: 3.28e-06 [pre_auto_parallel]: 3.266e-05 [insert-virtual-dataset]: 2.41998e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 2.04e-06 [pipeline_split]: 1.57999e-06 [optimize]: 0.00472925, [53] [py_interpret_to_execute]: 5.942e-05 [rewriter_before_opt_a]: 8.923e-05 [opt_a]: 0.00270095, [2] [Cycle 1]: 0.00199744, [45] [expand_dump_flag]: 3.14999e-06 [switch_simplify]: 4.615e-05 [loop_unroll]: 3.467e-05 [a_1]: 0.00070874 [with_stream_mark]: 1.757e-05 [recompute_prepare]: 9.16002e-06 [updatestate_depend_eliminate]: 3.67002e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 2.89001e-06 [parameter_eliminate]: 2.36e-06 [a_2]: 8.876e-05 [accelerated_algorithm]: 7.66999e-06 [shard]: 2.19999e-06 [meta_shard_fg_expand]: 1.60999e-06 [shard_inline]: 7.16999e-06 [merge_send_recv]: 8.04997e-06 [auto_parallel]: 6.42001e-06 [parallel]: 1.839e-05 [flash_sp]: 7.25e-06 [merge_comm]: 4.22998e-06 [allreduce_fusion]: 3.55e-06 [matmul_add_comm_reduction]: 8.91997e-06 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 8.32e-06 [virtual_dataset]: 7.3e-06 [get_grad_eliminate_]: 6.86001e-06 [virtual_output]: 6.76e-06 [merge_forward]: 3.87002e-06 [cell_reuse_recompute_pass]: 1.13001e-06 [offload_activation]: 9.42001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.247e-05 [merge_recompute_call_nodes]: 1.32999e-06 [before_grad]: 1.044e-05 [set_forward_comm_id_for_comm_node_pass]: 3.58e-06 [meta_fg_expand]: 2.76999e-06 [flash_sp_send_recv_attached]: 2.49001e-06 [receive_attached]: 2.41998e-06 [after_resolve]: 1.301e-05 [a_after_grad]: 1.102e-05 [renormalize]: 0.00055652 [add_forward_monad_depend]: 4.73001e-06 [auto_monad_grad]: 2.17999e-06 [auto_monad_eliminator]: 1.429e-05 [cse]: 3.264e-05 [a_3]: 5.03e-05 [Cycle 2]: 0.00069337, [45] [expand_dump_flag]: 1.26997e-06 [switch_simplify]: 8.60999e-06 [loop_unroll]: 6.93e-06 [a_1]: 0.00015786 [with_stream_mark]: 1.072e-05 [recompute_prepare]: 6.84999e-06 [updatestate_depend_eliminate]: 2.82002e-06 [updatestate_assign_eliminate]: 2.86999e-06 [updatestate_loads_eliminate]: 2.37001e-06 [parameter_eliminate]: 9.50007e-07 [a_2]: 8.225e-05 [accelerated_algorithm]: 6.88998e-06 [shard]: 1.13001e-06 [meta_shard_fg_expand]: 1.41998e-06 [shard_inline]: 6.78e-06 [merge_send_recv]: 4.67e-06 [auto_parallel]: 5.29e-06 [parallel]: 4.13999e-06 [flash_sp]: 3.13e-06 [merge_comm]: 3.28e-06 [allreduce_fusion]: 2.79999e-06 [matmul_add_comm_reduction]: 5.25001e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 7.45998e-06 [virtual_dataset]: 6.51e-06 [get_grad_eliminate_]: 8.62998e-06 [virtual_output]: 6.28e-06 [merge_forward]: 2.65002e-06 [cell_reuse_recompute_pass]: 1.37999e-06 [offload_activation]: 6.38e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.409e-05 [merge_recompute_call_nodes]: 7.50006e-07 [before_grad]: 9.86e-06 [set_forward_comm_id_for_comm_node_pass]: 3.20998e-06 [meta_fg_expand]: 2.46e-06 [flash_sp_send_recv_attached]: 8.89995e-07 [receive_attached]: 1.05999e-06 [after_resolve]: 1.19e-05 [a_after_grad]: 1.031e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.09e-06 [auto_monad_grad]: 1.09003e-06 [auto_monad_eliminator]: 6.60997e-06 [cse]: 1.494e-05 [a_3]: 4.02e-05 [py_interpret_to_execute_after_opt_a]: 8.47e-06 [slice_cell_reuse_recomputed_activation]: 1.77001e-06 [rewriter_after_opt_a]: 3.363e-05 [convert_after_rewriter]: 7e-06 [order_py_execute_after_rewriter]: 5.10001e-06 [mutable_eliminate]: 0.000466 [opt_b]: 0.00022536, [1] [Cycle 1]: 0.00021923, [7] [b_1]: 0.00014634 [b_2]: 8.57e-06 [updatestate_depend_eliminate]: 5.07999e-06 [updatestate_assign_eliminate]: 2.37999e-06 [updatestate_loads_eliminate]: 2.71e-06 [renormalize]: 3.50003e-07 [cse]: 1.856e-05 [optimize_parallel_all_gather_comm]: 1.59e-05 [overlap_param_gather]: 1.82001e-06 [cconv]: 2.314e-05 [loop_unroll]: 0.00043873 [opt_after_cconv]: 0.00010632, [1] [Cycle 1]: 0.00010053, [7] [c_1]: 3.576e-05 [parameter_eliminate]: 2.47001e-06 [updatestate_depend_eliminate]: 4.89e-06 [updatestate_assign_eliminate]: 2.58998e-06 [updatestate_loads_eliminate]: 2.29999e-06 [cse]: 1.793e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 1.262e-05 [tuple_transform]: 8.08e-05, [1] [Cycle 1]: 7.633e-05, [4] [d_1]: 4.883e-05 [none_parameter_eliminate]: 1.49e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 7.57002e-06 [partial_unused_args_eliminate]: 1.90001e-06 [add_recomputation]: 4.533e-05 [cse_after_recomputation]: 2.126e-05, [1] [Cycle 1]: 1.711e-05, [1] [cse]: 1.171e-05 [environ_conv]: 4.49002e-06 [swap_dp_allreduce_reducescatter]: 5.32999e-06 [bias_add_comm_swap]: 2.73e-06 [label_micro_interleaved_index]: 4.03001e-06 [label_fine_grained_interleaved_index]: 2.81999e-06 [merge_cast_opt]: 1.54e-06 [slice_recompute_activation]: 1.90001e-06 [micro_interleaved_order_control]: 2.34001e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 7.80012e-07 [remove_cast_before_assign_add]: 1.03001e-06 [full_micro_interleaved_order_control]: 2.07999e-06 [reorder_send_recv_between_fp_bp]: 2.63e-06 [comm_op_add_attrs]: 9.70002e-07 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.27999e-06 [interleave_parallel_branches]: 1.18001e-06 [overlap_opt_shard_in_pipeline]: 1.02998e-06 [overlap_opt_shard_grad_in_pipeline]: 1.92001e-06 [control_data_broadcast_order]: 1.207e-05 [grouped_pairwise_exchange_alltoall]: 1.99e-06 [offloading_packed_experts]: 3.98001e-06 [overlap_recompute_and_grad_model_parallel]: 4.23001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.38002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32e-06 [overlap_recompute_comm]: 2.14999e-06 [overlap_grad_ring_attention]: 4.03001e-06 [overlap_grad_flash_sp]: 1.788e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.31e-06 [split_layernorm_comm]: 2.09e-06 [handle_group_info]: 9.60019e-07 [symbol_engine_optimizer]: 7.61e-05, [1] [Cycle 1]: 7.182e-05, [6] [build]: 2.70002e-06 [elim_shapecalc]: 9.76e-06 [elim_not_effective]: 1.275e-05 [opt_reshape]: 7.51999e-06 [fold_const_symbol]: 1.077e-05 [renormalize]: 2.29978e-07 [detach_backward]: 1.59998e-06 [pipeline_parallel_scheduler]: 1.45999e-06 [auto_monad_reorder]: 1.74e-05 [get_jit_bprop_graph]: 1.07e-06 [rewriter_after_jit_bprop_graph]: 3.20002e-06 [opt_after_jit_grad]: 0.00044535 [validate]: 3.38e-05 Sums bootstrap : 0.000431s : 4.05% type_inference : 0.005640s : 52.97% event_method : 0.000020s : 0.19% auto_monad : 0.000060s : 0.56% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.31% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000059s : 0.56% optimize.rewriter_before_opt_a : 0.000089s : 0.84% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.51% optimize.opt_a.loop_unroll : 0.000042s : 0.39% optimize.opt_a.a_1 : 0.000867s : 8.14% optimize.opt_a.with_stream_mark : 0.000028s : 0.27% optimize.opt_a.recompute_prepare : 0.000016s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000171s : 1.61% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.13% optimize.opt_a.merge_send_recv : 0.000013s : 0.12% optimize.opt_a.auto_parallel : 0.000012s : 0.11% optimize.opt_a.parallel : 0.000023s : 0.21% optimize.opt_a.flash_sp : 0.000010s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000006s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.15% optimize.opt_a.virtual_dataset : 0.000014s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.15% optimize.opt_a.virtual_output : 0.000013s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000016s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000025s : 0.23% optimize.opt_a.a_after_grad : 0.000021s : 0.20% optimize.opt_a.renormalize : 0.000557s : 5.23% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.05% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.20% optimize.opt_a.cse : 0.000048s : 0.45% optimize.opt_a.a_3 : 0.000091s : 0.85% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000034s : 0.32% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000466s : 4.38% optimize.opt_b.b_1 : 0.000146s : 1.37% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000019s : 0.17% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000023s : 0.22% optimize.loop_unroll : 0.000439s : 4.12% optimize.opt_after_cconv.c_1 : 0.000036s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.17% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.12% optimize.tuple_transform.d_1 : 0.000049s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000045s : 0.43% optimize.cse_after_recomputation.cse : 0.000012s : 0.11% optimize.environ_conv : 0.000004s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000018s : 0.17% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000017s : 0.16% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.03% opt_after_jit_grad : 0.000445s : 4.18% validate : 0.000034s : 0.32% Time group info: ------[substitution.] 0.000186 34 0.91% : 0.000002s : 2: substitution.elim_not_effective 0.68% : 0.000001s : 2: substitution.fold_const_symbol 3.16% : 0.000006s : 5: substitution.graph_param_transform 75.66% : 0.000141s : 4: substitution.inline 1.59% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.43% : 0.000005s : 4: substitution.remove_not_recompute_node 2.82% : 0.000005s : 6: substitution.replace_old_param 9.57% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 3.17% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005581 2 87.34% : 0.004874s : 1: type_inference.infer 12.66% : 0.000706s : 1: type_inference.specialize ------[replace.] 0.000066 10 53.08% : 0.000035s : 4: replace.inline 46.92% : 0.000031s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000153 10 90.16% : 0.000138s : 4: match.inline 9.84% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000227 1590 0.92% : 0.000002s : 16: predicate.accumulaten_eliminater 0.66% : 0.000001s : 5: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 10: predicate.addn_check_dump 0.99% : 0.000002s : 16: predicate.addn_zero_filter 0.86% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 1.86% : 0.000004s : 26: predicate.arithmetic_simplify 0.93% : 0.000002s : 16: predicate.cast_eliminate 0.60% : 0.000001s : 10: predicate.check_bprop_eliminate 0.53% : 0.000001s : 10: predicate.compare_switch_simplify 0.21% : 0.000000s : 5: predicate.const_output_eliminate 0.56% : 0.000001s : 10: predicate.depend_value_elim 0.97% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.07% : 0.000002s : 16: predicate.dict_get_item_eliminator 1.03% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.89% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 5: predicate.elim_not_effective 0.36% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 21: predicate.environ_get_depend_swap 1.71% : 0.000004s : 31: predicate.environ_get_eliminate 1.16% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.59% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.43% : 0.000006s : 26: predicate.float_depend_g_call 0.50% : 0.000001s : 10: predicate.float_environ_get_switch 0.77% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.64% : 0.000001s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.60% : 0.000001s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.21% : 0.000014s : 72: predicate.inline 0.75% : 0.000002s : 10: predicate.inline_without_move 0.37% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.76% : 0.000002s : 10: predicate.less_batch_normalization 1.97% : 0.000004s : 32: predicate.list_to_tuple_eliminator_ 2.70% : 0.000006s : 48: predicate.load_eliminater 0.93% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.40% : 0.000005s : 40: predicate.loop_unroll_before_grad 1.61% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 10: predicate.merge_addn 0.56% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.62% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.92% : 0.000002s : 16: predicate.minmaximum_grad 0.97% : 0.000002s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.91% : 0.000004s : 26: predicate.partial_defer_inline 1.83% : 0.000004s : 27: predicate.partial_eliminate 1.06% : 0.000002s : 16: predicate.print_const_string_wrapper 0.55% : 0.000001s : 10: predicate.reduce_all_const_elim 1.10% : 0.000002s : 16: predicate.reduce_eliminate 2.64% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 10: predicate.remove_not_recompute_node 1.48% : 0.000003s : 32: predicate.replace_applicator 0.44% : 0.000001s : 10: predicate.replace_old_param 0.28% : 0.000001s : 5: predicate.reset_defer_inline 0.94% : 0.000002s : 16: predicate.reshape_eliminate 0.63% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.68% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.70% : 0.000002s : 10: predicate.shard_identity_eliminate 0.70% : 0.000002s : 10: predicate.special_op_eliminate 0.64% : 0.000001s : 10: predicate.specialize_transform 0.76% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.67% : 0.000004s : 26: predicate.switch_defer_inline 2.18% : 0.000005s : 36: predicate.switch_layer_defer_inline 5.21% : 0.000012s : 81: predicate.switch_simplify 0.87% : 0.000002s : 16: predicate.tile_eliminate 0.90% : 0.000002s : 16: predicate.transpose_eliminate 1.46% : 0.000003s : 26: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000003s : 26: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000007s : 42: predicate.tuple_list_get_item_eliminator 1.49% : 0.000003s : 26: predicate.tuple_list_get_set_item_eliminator 2.17% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.84% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.62% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.40% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 5: predicate.value_based_eliminate 0.60% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000520 11 55.12% : 0.000287s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.88% : 0.000233s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024350 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.27% : 0.002988s : 1: add_attr 12.23% : 0.002979s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.20% : 0.000050s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000065s : 1: auto_monad 0.09% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.89% : 0.000460s : 1: bootstrap 0.11% : 0.000026s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000015s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.10% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000007s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.83% : 0.000446s : 1: loop_unroll 0.02% : 0.000005s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.95% : 0.000474s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 5.48% : 0.001334s : 78: opt.transform.opt_a 0.14% : 0.000034s : 1: opt.transform.opt_after_cconv 0.10% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.51% : 0.000125s : 28: opt.transform.opt_b 0.22% : 0.000054s : 2: opt.transform.opt_trans_graph 0.15% : 0.000037s : 4: opt.transform.symbol_engine_opt 11.10% : 0.002704s : 1: opt_a 0.45% : 0.000110s : 1: opt_after_cconv 1.86% : 0.000454s : 1: opt_after_jit_grad 0.94% : 0.000229s : 1: opt_b 19.44% : 0.004733s : 1: optimize 0.08% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000021s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000003s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000037s : 1: pre_auto_parallel 0.26% : 0.000064s : 1: py_interpret_to_execute 0.05% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000016s : 1: remove_dup_value 1.14% : 0.000277s : 1: renormalize.infer 1.12% : 0.000272s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000038s : 1: rewriter_after_opt_a 0.38% : 0.000093s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000079s : 1: symbol_engine_optimizer 0.34% : 0.000084s : 1: tuple_transform 23.22% : 0.005654s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:10.406.584 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:10.406.831 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.01654, [21] [bootstrap]: 0.00043743 [type_inference]: 0.00582561 [event_method]: 2.129e-05 [auto_monad]: 6.4e-05 [graph_reusing]: 6.05002e-06 [inline]: 2.21e-06 [add_attr]: 0.00303355, [1] [add_attr_with_inline]: 0.00302483, [1] [Cycle 1]: 7.312e-05, [2] [tag_attr]: 2.113e-05 [meta_addattr_fg_expand]: 6.49999e-06 [parallel-infer-symbol]: 3.11001e-06 [pre_auto_parallel]: 3.558e-05 [insert-virtual-dataset]: 2.31e-06 [parallel-infer-symbol-second]: 6.50005e-07 [dataset_repeat_opt]: 1.94e-06 [pipeline_split]: 1.47999e-06 [optimize]: 0.0059546, [53] [py_interpret_to_execute]: 3.329e-05 [rewriter_before_opt_a]: 9.644e-05 [opt_a]: 0.00354839, [2] [Cycle 1]: 0.00251153, [45] [expand_dump_flag]: 3.09001e-06 [switch_simplify]: 4.827e-05 [loop_unroll]: 3.573e-05 [a_1]: 0.00083776 [with_stream_mark]: 1.68e-05 [recompute_prepare]: 1.096e-05 [updatestate_depend_eliminate]: 5.72001e-06 [updatestate_assign_eliminate]: 4.25999e-06 [updatestate_loads_eliminate]: 3.91001e-06 [parameter_eliminate]: 2.16e-06 [a_2]: 0.00013878 [accelerated_algorithm]: 8.77e-06 [shard]: 2.19001e-06 [meta_shard_fg_expand]: 2.22001e-06 [shard_inline]: 9.22999e-06 [merge_send_recv]: 1.013e-05 [auto_parallel]: 7.46999e-06 [parallel]: 1.797e-05 [flash_sp]: 8.69e-06 [merge_comm]: 4.90001e-06 [allreduce_fusion]: 4.46002e-06 [matmul_add_comm_reduction]: 1.062e-05 [allreduce_slice_to_reducescatter]: 7.80012e-07 [virtual_shard_identity]: 1.052e-05 [virtual_dataset]: 2.008e-05 [get_grad_eliminate_]: 8.48999e-06 [virtual_output]: 8.32e-06 [merge_forward]: 5.27001e-06 [cell_reuse_recompute_pass]: 1.27e-06 [offload_activation]: 1.07e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.859e-05 [merge_recompute_call_nodes]: 1.37999e-06 [before_grad]: 1.37e-05 [set_forward_comm_id_for_comm_node_pass]: 4.77e-06 [meta_fg_expand]: 3.63999e-06 [flash_sp_send_recv_attached]: 2.49001e-06 [receive_attached]: 1.94999e-06 [after_resolve]: 1.445e-05 [a_after_grad]: 1.393e-05 [renormalize]: 0.00065354 [add_forward_monad_depend]: 5.39998e-06 [auto_monad_grad]: 2.34999e-06 [auto_monad_eliminator]: 1.699e-05 [cse]: 3.615e-05 [a_3]: 7.625e-05 [Cycle 2]: 0.00102251, [45] [expand_dump_flag]: 1.49e-06 [switch_simplify]: 9.82001e-06 [loop_unroll]: 8.36002e-06 [a_1]: 0.00020577 [with_stream_mark]: 1.25e-05 [recompute_prepare]: 8.85001e-06 [updatestate_depend_eliminate]: 4.13999e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 3.23e-06 [parameter_eliminate]: 1.08001e-06 [a_2]: 0.00012985 [accelerated_algorithm]: 8.52998e-06 [shard]: 1.06002e-06 [meta_shard_fg_expand]: 1.76998e-06 [shard_inline]: 8.35999e-06 [merge_send_recv]: 6.43e-06 [auto_parallel]: 6.99001e-06 [parallel]: 4.77e-06 [flash_sp]: 3.36999e-06 [merge_comm]: 4.37e-06 [allreduce_fusion]: 3.66999e-06 [matmul_add_comm_reduction]: 7.08e-06 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 8.84e-06 [virtual_dataset]: 2.944e-05 [get_grad_eliminate_]: 8.87e-06 [virtual_output]: 8.52998e-06 [merge_forward]: 4.05998e-06 [cell_reuse_recompute_pass]: 1.83002e-06 [offload_activation]: 8.26002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.763e-05 [merge_recompute_call_nodes]: 8.80013e-07 [before_grad]: 1.304e-05 [set_forward_comm_id_for_comm_node_pass]: 4.87e-06 [meta_fg_expand]: 2.98e-06 [flash_sp_send_recv_attached]: 1.04e-06 [receive_attached]: 1.05001e-06 [after_resolve]: 1.349e-05 [a_after_grad]: 1.268e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.78997e-06 [auto_monad_grad]: 1.14998e-06 [auto_monad_eliminator]: 8.79998e-06 [cse]: 1.926e-05 [a_3]: 6.402e-05 [py_interpret_to_execute_after_opt_a]: 1.366e-05 [slice_cell_reuse_recomputed_activation]: 5.14e-06 [rewriter_after_opt_a]: 4.384e-05 [convert_after_rewriter]: 1.112e-05 [order_py_execute_after_rewriter]: 8.95999e-06 [mutable_eliminate]: 0.00049928 [opt_b]: 0.00033907, [1] [Cycle 1]: 0.00032983, [7] [b_1]: 0.00022629 [b_2]: 9.94999e-06 [updatestate_depend_eliminate]: 6.93e-06 [updatestate_assign_eliminate]: 3.45e-06 [updatestate_loads_eliminate]: 3.65e-06 [renormalize]: 3.99974e-07 [cse]: 2.312e-05 [optimize_parallel_all_gather_comm]: 2.158e-05 [overlap_param_gather]: 4.77998e-06 [cconv]: 2.752e-05 [loop_unroll]: 0.00044038 [opt_after_cconv]: 0.00014591, [1] [Cycle 1]: 0.00013736, [7] [c_1]: 4.241e-05 [parameter_eliminate]: 2.58e-06 [updatestate_depend_eliminate]: 6.69001e-06 [updatestate_assign_eliminate]: 3.56999e-06 [updatestate_loads_eliminate]: 3.51001e-06 [cse]: 2.234e-05 [renormalize]: 3.39991e-07 [remove_dup_value]: 1.825e-05 [tuple_transform]: 0.00010616, [1] [Cycle 1]: 9.917e-05, [4] [d_1]: 5.825e-05 [none_parameter_eliminate]: 1.68002e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 9.05999e-06 [partial_unused_args_eliminate]: 4.37e-06 [add_recomputation]: 5.787e-05 [cse_after_recomputation]: 3.198e-05, [1] [Cycle 1]: 2.498e-05, [1] [cse]: 1.567e-05 [environ_conv]: 9.36e-06 [swap_dp_allreduce_reducescatter]: 9.61e-06 [bias_add_comm_swap]: 5.42001e-06 [label_micro_interleaved_index]: 6.44999e-06 [label_fine_grained_interleaved_index]: 5.04e-06 [merge_cast_opt]: 3.67998e-06 [slice_recompute_activation]: 4.65999e-06 [micro_interleaved_order_control]: 4.84e-06 [assign_add_opt]: 3.56999e-06 [ForceFp32Comm]: 3.66999e-06 [remove_cast_before_assign_add]: 3.65e-06 [full_micro_interleaved_order_control]: 4.57e-06 [reorder_send_recv_between_fp_bp]: 4.95999e-06 [comm_op_add_attrs]: 3.26999e-06 [add_comm_op_reuse_tag]: 3.35e-06 [interleave_split_concat_branches]: 3.44001e-06 [interleave_parallel_branches]: 3.27002e-06 [overlap_opt_shard_in_pipeline]: 3.38e-06 [overlap_opt_shard_grad_in_pipeline]: 4.25e-06 [control_data_broadcast_order]: 1.822e-05 [grouped_pairwise_exchange_alltoall]: 3.86999e-06 [offloading_packed_experts]: 6.71e-06 [overlap_recompute_and_grad_model_parallel]: 7.81001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.49001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.67002e-06 [overlap_recompute_comm]: 4.67e-06 [overlap_grad_ring_attention]: 7.09001e-06 [overlap_grad_flash_sp]: 2.4e-05 [begin_end_overlap_inline]: 2.89001e-06 [split_matmul_comm_elemetwise]: 4.47e-06 [split_layernorm_comm]: 4.23999e-06 [handle_group_info]: 3.14999e-06 [symbol_engine_optimizer]: 0.00010509, [1] [Cycle 1]: 9.852e-05, [6] [build]: 3.26001e-06 [elim_shapecalc]: 1.186e-05 [elim_not_effective]: 1.585e-05 [opt_reshape]: 8.97e-06 [fold_const_symbol]: 1.322e-05 [renormalize]: 2.69996e-07 [detach_backward]: 3.8e-06 [pipeline_parallel_scheduler]: 2.08002e-06 [auto_monad_reorder]: 2.314e-05 [get_jit_bprop_graph]: 1.00001e-06 [rewriter_after_jit_bprop_graph]: 4.35999e-06 [opt_after_jit_grad]: 0.00048709 [validate]: 4.058e-05 Sums bootstrap : 0.000437s : 3.72% type_inference : 0.005826s : 49.54% event_method : 0.000021s : 0.18% auto_monad : 0.000064s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000036s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000001s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.28% optimize.rewriter_before_opt_a : 0.000096s : 0.82% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000058s : 0.49% optimize.opt_a.loop_unroll : 0.000044s : 0.37% optimize.opt_a.a_1 : 0.001044s : 8.87% optimize.opt_a.with_stream_mark : 0.000029s : 0.25% optimize.opt_a.recompute_prepare : 0.000020s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000269s : 2.28% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000018s : 0.15% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000023s : 0.19% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.16% optimize.opt_a.virtual_dataset : 0.000050s : 0.42% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.15% optimize.opt_a.virtual_output : 0.000017s : 0.14% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000028s : 0.24% optimize.opt_a.a_after_grad : 0.000027s : 0.23% optimize.opt_a.renormalize : 0.000654s : 5.56% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.22% optimize.opt_a.cse : 0.000055s : 0.47% optimize.opt_a.a_3 : 0.000140s : 1.19% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000044s : 0.37% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000499s : 4.25% optimize.opt_b.b_1 : 0.000226s : 1.92% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000028s : 0.23% optimize.loop_unroll : 0.000440s : 3.74% optimize.opt_after_cconv.c_1 : 0.000042s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.16% optimize.tuple_transform.d_1 : 0.000058s : 0.50% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000058s : 0.49% optimize.cse_after_recomputation.cse : 0.000016s : 0.13% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000006s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000024s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.20% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000487s : 4.14% validate : 0.000041s : 0.35% Time group info: ------[substitution.] 0.000226 44 10.26% : 0.000023s : 3: substitution.cast_eliminate 0.93% : 0.000002s : 3: substitution.elim_not_effective 0.74% : 0.000002s : 3: substitution.fold_const_symbol 3.10% : 0.000007s : 6: substitution.graph_param_transform 66.44% : 0.000150s : 4: substitution.inline 1.89% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.06% : 0.000007s : 6: substitution.remove_not_recompute_node 2.54% : 0.000006s : 6: substitution.replace_old_param 8.36% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.67% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005773 2 87.20% : 0.005034s : 1: type_inference.infer 12.80% : 0.000739s : 1: type_inference.specialize ------[replace.] 0.000070 10 52.62% : 0.000037s : 4: replace.inline 47.38% : 0.000033s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000164 10 90.14% : 0.000148s : 4: match.inline 9.86% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000286 1908 0.96% : 0.000003s : 20: predicate.accumulaten_eliminater 0.66% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.54% : 0.000002s : 12: predicate.addn_check_dump 0.87% : 0.000003s : 20: predicate.addn_zero_filter 0.87% : 0.000002s : 20: predicate.adjust_all_reduce_mul_add 1.92% : 0.000006s : 32: predicate.arithmetic_simplify 0.99% : 0.000003s : 20: predicate.cast_eliminate 0.61% : 0.000002s : 12: predicate.check_bprop_eliminate 0.58% : 0.000002s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.74% : 0.000002s : 12: predicate.depend_value_elim 0.99% : 0.000003s : 20: predicate.dict_get_item_const_eliminator 1.21% : 0.000003s : 20: predicate.dict_get_item_eliminator 0.92% : 0.000003s : 20: predicate.dict_set_item_eliminator 0.87% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 6: predicate.elim_not_effective 0.36% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000004s : 26: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.16% : 0.000003s : 26: predicate.environ_get_depend_swap 1.82% : 0.000005s : 38: predicate.environ_get_eliminate 1.18% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.46% : 0.000004s : 30: predicate.exchange_switch_depend_value 2.35% : 0.000007s : 30: predicate.float_depend_g_call 0.56% : 0.000002s : 12: predicate.float_environ_get_switch 0.83% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.68% : 0.000002s : 12: predicate.get_grad_eliminate 0.28% : 0.000001s : 6: predicate.graph_param_transform 0.58% : 0.000002s : 12: predicate.incorporate_call 0.51% : 0.000001s : 12: predicate.incorporate_call_switch 6.25% : 0.000018s : 86: predicate.inline 0.73% : 0.000002s : 12: predicate.inline_without_move 0.36% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.73% : 0.000002s : 12: predicate.less_batch_normalization 1.84% : 0.000005s : 38: predicate.list_to_tuple_eliminator_ 2.71% : 0.000008s : 58: predicate.load_eliminater 0.79% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.11% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.62% : 0.000005s : 32: predicate.make_slice_get_slice_eliminator 0.58% : 0.000002s : 12: predicate.merge_addn 0.59% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.89% : 0.000003s : 20: predicate.minmaximum_grad 0.94% : 0.000003s : 6: predicate.mutable_eliminate 0.37% : 0.000001s : 6: predicate.opt_reshape 0.35% : 0.000001s : 6: predicate.parallel_virtual_node 1.71% : 0.000005s : 30: predicate.partial_defer_inline 1.82% : 0.000005s : 32: predicate.partial_eliminate 0.98% : 0.000003s : 20: predicate.print_const_string_wrapper 0.61% : 0.000002s : 12: predicate.reduce_all_const_elim 1.12% : 0.000003s : 20: predicate.reduce_eliminate 2.62% : 0.000007s : 58: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 12: predicate.remove_not_recompute_node 1.42% : 0.000004s : 38: predicate.replace_applicator 0.41% : 0.000001s : 12: predicate.replace_old_param 0.33% : 0.000001s : 6: predicate.reset_defer_inline 0.96% : 0.000003s : 20: predicate.reshape_eliminate 0.64% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 6: predicate.row_tensor_eliminate 0.85% : 0.000002s : 12: predicate.same_eliminate 0.43% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 12: predicate.shard_identity_eliminate 0.67% : 0.000002s : 12: predicate.special_op_eliminate 0.71% : 0.000002s : 12: predicate.specialize_transform 0.78% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.55% : 0.000004s : 30: predicate.switch_defer_inline 2.14% : 0.000006s : 42: predicate.switch_layer_defer_inline 4.78% : 0.000014s : 90: predicate.switch_simplify 0.95% : 0.000003s : 20: predicate.tile_eliminate 0.92% : 0.000003s : 20: predicate.transpose_eliminate 1.54% : 0.000004s : 32: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000005s : 32: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 32: predicate.tuple_list_get_item_depend_reorder 3.20% : 0.000009s : 50: predicate.tuple_list_get_item_eliminator 1.61% : 0.000005s : 32: predicate.tuple_list_get_set_item_eliminator 2.25% : 0.000006s : 44: predicate.tuple_list_set_item_eliminator 1.86% : 0.000005s : 38: predicate.tuple_to_list_eliminator_ 2.61% : 0.000007s : 58: predicate.updatestate_pure_node_eliminater 3.35% : 0.000010s : 70: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 6: predicate.value_based_eliminate 0.71% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.61% : 0.000002s : 12: predicate.virtual_output_eliminate 0.28% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000547 11 54.70% : 0.000299s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.30% : 0.000248s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028091 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.83% : 0.003043s : 1: add_attr 10.78% : 0.003029s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000062s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000074s : 1: auto_monad 0.11% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.71% : 0.000481s : 1: bootstrap 0.11% : 0.000031s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000018s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000032s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.59% : 0.000446s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.80% : 0.000506s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 5.90% : 0.001658s : 78: opt.transform.opt_a 0.15% : 0.000041s : 1: opt.transform.opt_after_cconv 0.11% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.58% : 0.000162s : 28: opt.transform.opt_b 0.23% : 0.000065s : 2: opt.transform.opt_trans_graph 0.17% : 0.000047s : 4: opt.transform.symbol_engine_opt 12.64% : 0.003552s : 1: opt_a 0.53% : 0.000149s : 1: opt_after_cconv 1.77% : 0.000497s : 1: opt_after_jit_grad 1.22% : 0.000343s : 1: opt_b 22.44% : 0.006304s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000043s : 1: pre_auto_parallel 0.13% : 0.000037s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.08% : 0.000021s : 1: remove_dup_value 1.23% : 0.000346s : 1: renormalize.infer 1.06% : 0.000299s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000047s : 1: rewriter_after_opt_a 0.36% : 0.000100s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000108s : 1: symbol_engine_optimizer 0.39% : 0.000109s : 1: tuple_transform 20.86% : 0.005860s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:10.599.352 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0159771, [21] [bootstrap]: 0.00044293 [type_inference]: 0.00593197 [event_method]: 2.139e-05 [auto_monad]: 6.345e-05 [graph_reusing]: 6.32001e-06 [inline]: 2.46e-06 [add_attr]: 0.00312976, [1] [add_attr_with_inline]: 0.00312106, [1] [Cycle 1]: 6.706e-05, [2] [tag_attr]: 2.155e-05 [meta_addattr_fg_expand]: 6.43e-06 [parallel-infer-symbol]: 3.26001e-06 [pre_auto_parallel]: 3.846e-05 [insert-virtual-dataset]: 2.35002e-06 [parallel-infer-symbol-second]: 8.29983e-07 [dataset_repeat_opt]: 1.97999e-06 [pipeline_split]: 1.81998e-06 [optimize]: 0.00564002, [53] [py_interpret_to_execute]: 3.048e-05 [rewriter_before_opt_a]: 9.477e-05 [opt_a]: 0.00339048, [2] [Cycle 1]: 0.00252762, [45] [expand_dump_flag]: 2.91e-06 [switch_simplify]: 4.88e-05 [loop_unroll]: 3.623e-05 [a_1]: 0.00086543 [with_stream_mark]: 2.161e-05 [recompute_prepare]: 1.385e-05 [updatestate_depend_eliminate]: 5.84e-06 [updatestate_assign_eliminate]: 4.08001e-06 [updatestate_loads_eliminate]: 3.71999e-06 [parameter_eliminate]: 2.16e-06 [a_2]: 0.00011004 [accelerated_algorithm]: 1.05e-05 [shard]: 2.66e-06 [meta_shard_fg_expand]: 2.07001e-06 [shard_inline]: 8.68001e-06 [merge_send_recv]: 1.037e-05 [auto_parallel]: 8.03001e-06 [parallel]: 1.92e-05 [flash_sp]: 9.80002e-06 [merge_comm]: 5.05999e-06 [allreduce_fusion]: 4.48999e-06 [matmul_add_comm_reduction]: 1.08e-05 [allreduce_slice_to_reducescatter]: 7.59988e-07 [virtual_shard_identity]: 1.135e-05 [virtual_dataset]: 9.09e-06 [get_grad_eliminate_]: 8.08001e-06 [virtual_output]: 8.35001e-06 [merge_forward]: 4.80001e-06 [cell_reuse_recompute_pass]: 1.40999e-06 [offload_activation]: 1.165e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.762e-05 [merge_recompute_call_nodes]: 1.87001e-06 [before_grad]: 1.384e-05 [set_forward_comm_id_for_comm_node_pass]: 5.69e-06 [meta_fg_expand]: 3.5e-06 [flash_sp_send_recv_attached]: 2.78998e-06 [receive_attached]: 2.32999e-06 [after_resolve]: 1.503e-05 [a_after_grad]: 1.301e-05 [renormalize]: 0.00078988 [add_forward_monad_depend]: 6.43e-06 [auto_monad_grad]: 2.37999e-06 [auto_monad_eliminator]: 1.768e-05 [cse]: 3.905e-05 [a_3]: 6.364e-05 [Cycle 2]: 0.0008522, [45] [expand_dump_flag]: 2.22999e-06 [switch_simplify]: 1.011e-05 [loop_unroll]: 8.14002e-06 [a_1]: 0.00020603 [with_stream_mark]: 1.462e-05 [recompute_prepare]: 8.74e-06 [updatestate_depend_eliminate]: 4.11001e-06 [updatestate_assign_eliminate]: 3.53e-06 [updatestate_loads_eliminate]: 3.14999e-06 [parameter_eliminate]: 1.40999e-06 [a_2]: 0.00010249 [accelerated_algorithm]: 9.25999e-06 [shard]: 1.81e-06 [meta_shard_fg_expand]: 1.87001e-06 [shard_inline]: 8.25e-06 [merge_send_recv]: 6.37001e-06 [auto_parallel]: 6.66e-06 [parallel]: 5.64e-06 [flash_sp]: 3.38e-06 [merge_comm]: 4.80001e-06 [allreduce_fusion]: 4.15999e-06 [matmul_add_comm_reduction]: 9.91e-06 [allreduce_slice_to_reducescatter]: 5.59987e-07 [virtual_shard_identity]: 9.80002e-06 [virtual_dataset]: 7.80998e-06 [get_grad_eliminate_]: 7.81001e-06 [virtual_output]: 7.56001e-06 [merge_forward]: 4e-06 [cell_reuse_recompute_pass]: 1.39998e-06 [offload_activation]: 8.45999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.569e-05 [merge_recompute_call_nodes]: 1.03001e-06 [before_grad]: 1.275e-05 [set_forward_comm_id_for_comm_node_pass]: 4.52e-06 [meta_fg_expand]: 3.23998e-06 [flash_sp_send_recv_attached]: 1.49e-06 [receive_attached]: 1.37999e-06 [after_resolve]: 1.546e-05 [a_after_grad]: 1.196e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.34001e-06 [auto_monad_grad]: 1.39e-06 [auto_monad_eliminator]: 1.111e-05 [cse]: 2.311e-05 [a_3]: 5.03e-05 [py_interpret_to_execute_after_opt_a]: 1.362e-05 [slice_cell_reuse_recomputed_activation]: 2.26e-06 [rewriter_after_opt_a]: 4.569e-05 [convert_after_rewriter]: 8.37e-06 [order_py_execute_after_rewriter]: 5.94e-06 [mutable_eliminate]: 0.0005468 [opt_b]: 0.0002746, [1] [Cycle 1]: 0.00026805, [7] [b_1]: 0.00017978 [b_2]: 9.96e-06 [updatestate_depend_eliminate]: 7.19001e-06 [updatestate_assign_eliminate]: 3.56999e-06 [updatestate_loads_eliminate]: 3.67998e-06 [renormalize]: 6.59988e-07 [cse]: 2.628e-05 [optimize_parallel_all_gather_comm]: 1.938e-05 [overlap_param_gather]: 1.92999e-06 [cconv]: 2.615e-05 [loop_unroll]: 0.0004332 [opt_after_cconv]: 0.00012175, [1] [Cycle 1]: 0.00011623, [7] [c_1]: 4.174e-05 [parameter_eliminate]: 3.78001e-06 [updatestate_depend_eliminate]: 6.27001e-06 [updatestate_assign_eliminate]: 3.47997e-06 [updatestate_loads_eliminate]: 3.24001e-06 [cse]: 2.314e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 1.494e-05 [tuple_transform]: 9.187e-05, [1] [Cycle 1]: 8.763e-05, [4] [d_1]: 5.84e-05 [none_parameter_eliminate]: 1.68002e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 9.10001e-06 [partial_unused_args_eliminate]: 1.91e-06 [add_recomputation]: 5.672e-05 [cse_after_recomputation]: 2.518e-05, [1] [Cycle 1]: 2.08e-05, [1] [cse]: 1.52e-05 [environ_conv]: 7.23999e-06 [swap_dp_allreduce_reducescatter]: 5.94e-06 [bias_add_comm_swap]: 3.35998e-06 [label_micro_interleaved_index]: 4.13001e-06 [label_fine_grained_interleaved_index]: 2.59001e-06 [merge_cast_opt]: 1.21997e-06 [slice_recompute_activation]: 1.92001e-06 [micro_interleaved_order_control]: 2.31e-06 [assign_add_opt]: 1.29e-06 [ForceFp32Comm]: 1.07998e-06 [remove_cast_before_assign_add]: 9.70002e-07 [full_micro_interleaved_order_control]: 1.91998e-06 [reorder_send_recv_between_fp_bp]: 2.83e-06 [comm_op_add_attrs]: 9.70002e-07 [add_comm_op_reuse_tag]: 9.20001e-07 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.05999e-06 [overlap_opt_shard_in_pipeline]: 1.57999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.69e-06 [control_data_broadcast_order]: 1.575e-05 [grouped_pairwise_exchange_alltoall]: 1.44e-06 [offloading_packed_experts]: 4.30999e-06 [overlap_recompute_and_grad_model_parallel]: 5.40999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.35001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.41e-06 [overlap_grad_ring_attention]: 4.69002e-06 [overlap_grad_flash_sp]: 2.235e-05 [begin_end_overlap_inline]: 6.69999e-07 [split_matmul_comm_elemetwise]: 2.24001e-06 [split_layernorm_comm]: 1.54998e-06 [handle_group_info]: 8.79983e-07 [symbol_engine_optimizer]: 8.435e-05, [1] [Cycle 1]: 7.976e-05, [6] [build]: 3.05002e-06 [elim_shapecalc]: 1.146e-05 [elim_not_effective]: 1.562e-05 [opt_reshape]: 8.75999e-06 [fold_const_symbol]: 1.282e-05 [renormalize]: 4.80009e-07 [detach_backward]: 2.32999e-06 [pipeline_parallel_scheduler]: 1.74e-06 [auto_monad_reorder]: 1.982e-05 [get_jit_bprop_graph]: 1.64998e-06 [rewriter_after_jit_bprop_graph]: 4.43999e-06 [opt_after_jit_grad]: 0.00046254 [validate]: 4.724e-05 Sums bootstrap : 0.000443s : 3.74% type_inference : 0.005932s : 50.13% event_method : 0.000021s : 0.18% auto_monad : 0.000063s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000038s : 0.33% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000030s : 0.26% optimize.rewriter_before_opt_a : 0.000095s : 0.80% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000059s : 0.50% optimize.opt_a.loop_unroll : 0.000044s : 0.37% optimize.opt_a.a_1 : 0.001071s : 9.06% optimize.opt_a.with_stream_mark : 0.000036s : 0.31% optimize.opt_a.recompute_prepare : 0.000023s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000213s : 1.80% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.17% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000017s : 0.14% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.21% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.18% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.18% optimize.opt_a.virtual_dataset : 0.000017s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.13% optimize.opt_a.virtual_output : 0.000016s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000030s : 0.26% optimize.opt_a.a_after_grad : 0.000025s : 0.21% optimize.opt_a.renormalize : 0.000790s : 6.68% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.24% optimize.opt_a.cse : 0.000062s : 0.53% optimize.opt_a.a_3 : 0.000114s : 0.96% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000046s : 0.39% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000547s : 4.62% optimize.opt_b.b_1 : 0.000180s : 1.52% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000026s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000026s : 0.22% optimize.loop_unroll : 0.000433s : 3.66% optimize.opt_after_cconv.c_1 : 0.000042s : 0.35% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.13% optimize.tuple_transform.d_1 : 0.000058s : 0.49% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000057s : 0.48% optimize.cse_after_recomputation.cse : 0.000015s : 0.13% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000022s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.17% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000463s : 3.91% validate : 0.000047s : 0.40% Time group info: ------[substitution.] 0.000235 44 10.79% : 0.000025s : 3: substitution.cast_eliminate 0.89% : 0.000002s : 3: substitution.elim_not_effective 0.78% : 0.000002s : 3: substitution.fold_const_symbol 3.02% : 0.000007s : 6: substitution.graph_param_transform 67.24% : 0.000158s : 4: substitution.inline 2.05% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.49% : 0.000006s : 6: substitution.remove_not_recompute_node 2.54% : 0.000006s : 6: substitution.replace_old_param 7.96% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.24% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005867 2 87.18% : 0.005115s : 1: type_inference.infer 12.82% : 0.000752s : 1: type_inference.specialize ------[replace.] 0.000075 10 51.58% : 0.000039s : 4: replace.inline 48.42% : 0.000036s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000172 10 90.68% : 0.000156s : 4: match.inline 9.32% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000289 1908 0.93% : 0.000003s : 20: predicate.accumulaten_eliminater 0.70% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.54% : 0.000002s : 12: predicate.addn_check_dump 1.00% : 0.000003s : 20: predicate.addn_zero_filter 0.88% : 0.000003s : 20: predicate.adjust_all_reduce_mul_add 1.96% : 0.000006s : 32: predicate.arithmetic_simplify 1.11% : 0.000003s : 20: predicate.cast_eliminate 0.57% : 0.000002s : 12: predicate.check_bprop_eliminate 0.55% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.61% : 0.000002s : 12: predicate.depend_value_elim 1.03% : 0.000003s : 20: predicate.dict_get_item_const_eliminator 1.06% : 0.000003s : 20: predicate.dict_get_item_eliminator 0.90% : 0.000003s : 20: predicate.dict_set_item_eliminator 0.91% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 6: predicate.elim_not_effective 0.41% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 26: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.29% : 0.000004s : 26: predicate.environ_get_depend_swap 1.72% : 0.000005s : 38: predicate.environ_get_eliminate 1.11% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.45% : 0.000004s : 30: predicate.exchange_switch_depend_value 2.28% : 0.000007s : 30: predicate.float_depend_g_call 0.53% : 0.000002s : 12: predicate.float_environ_get_switch 0.79% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.61% : 0.000002s : 12: predicate.get_grad_eliminate 0.21% : 0.000001s : 6: predicate.graph_param_transform 0.58% : 0.000002s : 12: predicate.incorporate_call 0.52% : 0.000002s : 12: predicate.incorporate_call_switch 6.43% : 0.000019s : 86: predicate.inline 0.81% : 0.000002s : 12: predicate.inline_without_move 0.36% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.92% : 0.000003s : 12: predicate.less_batch_normalization 2.01% : 0.000006s : 38: predicate.list_to_tuple_eliminator_ 2.69% : 0.000008s : 58: predicate.load_eliminater 0.68% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.00% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.56% : 0.000005s : 32: predicate.make_slice_get_slice_eliminator 0.53% : 0.000002s : 12: predicate.merge_addn 0.57% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.55% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.88% : 0.000003s : 20: predicate.minmaximum_grad 0.99% : 0.000003s : 6: predicate.mutable_eliminate 0.35% : 0.000001s : 6: predicate.opt_reshape 0.33% : 0.000001s : 6: predicate.parallel_virtual_node 1.82% : 0.000005s : 30: predicate.partial_defer_inline 1.77% : 0.000005s : 32: predicate.partial_eliminate 0.90% : 0.000003s : 20: predicate.print_const_string_wrapper 0.58% : 0.000002s : 12: predicate.reduce_all_const_elim 1.16% : 0.000003s : 20: predicate.reduce_eliminate 2.66% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 12: predicate.remove_not_recompute_node 1.42% : 0.000004s : 38: predicate.replace_applicator 0.46% : 0.000001s : 12: predicate.replace_old_param 0.26% : 0.000001s : 6: predicate.reset_defer_inline 0.95% : 0.000003s : 20: predicate.reshape_eliminate 0.62% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 6: predicate.row_tensor_eliminate 0.78% : 0.000002s : 12: predicate.same_eliminate 0.52% : 0.000002s : 12: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 12: predicate.shard_identity_eliminate 0.65% : 0.000002s : 12: predicate.special_op_eliminate 0.76% : 0.000002s : 12: predicate.specialize_transform 0.83% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.56% : 0.000005s : 30: predicate.switch_defer_inline 2.08% : 0.000006s : 42: predicate.switch_layer_defer_inline 4.72% : 0.000014s : 90: predicate.switch_simplify 0.90% : 0.000003s : 20: predicate.tile_eliminate 1.02% : 0.000003s : 20: predicate.transpose_eliminate 1.51% : 0.000004s : 32: predicate.tuple_list_convert_item_index_to_positive 1.72% : 0.000005s : 32: predicate.tuple_list_get_item_const_eliminator 1.58% : 0.000005s : 32: predicate.tuple_list_get_item_depend_reorder 3.37% : 0.000010s : 50: predicate.tuple_list_get_item_eliminator 1.55% : 0.000004s : 32: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000006s : 44: predicate.tuple_list_set_item_eliminator 1.85% : 0.000005s : 38: predicate.tuple_to_list_eliminator_ 2.61% : 0.000008s : 58: predicate.updatestate_pure_node_eliminater 3.31% : 0.000010s : 70: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 6: predicate.value_based_eliminate 0.74% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.60% : 0.000002s : 12: predicate.virtual_output_eliminate 0.28% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000570 11 52.52% : 0.000299s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.48% : 0.000271s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027425 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.43% : 0.003134s : 1: add_attr 11.39% : 0.003125s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000061s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.25% : 0.000069s : 1: auto_monad 0.09% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.72% : 0.000471s : 1: bootstrap 0.11% : 0.000030s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.10% : 0.000028s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.10% : 0.000028s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.61% : 0.000442s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.03% : 0.000556s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 6.01% : 0.001649s : 78: opt.transform.opt_a 0.15% : 0.000040s : 1: opt.transform.opt_after_cconv 0.11% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.58% : 0.000158s : 28: opt.transform.opt_b 0.24% : 0.000065s : 2: opt.transform.opt_trans_graph 0.16% : 0.000045s : 4: opt.transform.symbol_engine_opt 12.45% : 0.003414s : 1: opt_a 0.46% : 0.000125s : 1: opt_after_cconv 1.72% : 0.000472s : 1: opt_after_jit_grad 1.01% : 0.000278s : 1: opt_b 20.58% : 0.005644s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.16% : 0.000043s : 1: pre_auto_parallel 0.13% : 0.000036s : 1: py_interpret_to_execute 0.07% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.57% : 0.000430s : 1: renormalize.infer 1.28% : 0.000350s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000050s : 1: rewriter_after_opt_a 0.36% : 0.000100s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000087s : 1: symbol_engine_optimizer 0.35% : 0.000095s : 1: tuple_transform 21.69% : 0.005949s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:10.789.947 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:10.790.219 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0173019, [21] [bootstrap]: 0.00045548 [type_inference]: 0.00590611 [event_method]: 2.111e-05 [auto_monad]: 6.816e-05 [graph_reusing]: 6.22001e-06 [inline]: 2.25002e-06 [add_attr]: 0.00305807, [1] [add_attr_with_inline]: 0.00304907, [1] [Cycle 1]: 7.802e-05, [2] [tag_attr]: 2.183e-05 [meta_addattr_fg_expand]: 6.71e-06 [parallel-infer-symbol]: 3.21001e-06 [pre_auto_parallel]: 3.571e-05 [insert-virtual-dataset]: 2.54001e-06 [parallel-infer-symbol-second]: 7.29982e-07 [dataset_repeat_opt]: 2.27001e-06 [pipeline_split]: 1.52001e-06 [optimize]: 0.00656109, [53] [py_interpret_to_execute]: 3.25e-05 [rewriter_before_opt_a]: 0.00010352 [opt_a]: 0.00389705, [2] [Cycle 1]: 0.00278892, [45] [expand_dump_flag]: 3.5e-06 [switch_simplify]: 4.858e-05 [loop_unroll]: 3.738e-05 [a_1]: 0.00090931 [with_stream_mark]: 2.096e-05 [recompute_prepare]: 1.574e-05 [updatestate_depend_eliminate]: 6.46e-06 [updatestate_assign_eliminate]: 5.39e-06 [updatestate_loads_eliminate]: 4.78001e-06 [parameter_eliminate]: 1.94999e-06 [a_2]: 0.00015893 [accelerated_algorithm]: 1.049e-05 [shard]: 2.51e-06 [meta_shard_fg_expand]: 2.31e-06 [shard_inline]: 1.019e-05 [merge_send_recv]: 1.043e-05 [auto_parallel]: 9.20001e-06 [parallel]: 1.79e-05 [flash_sp]: 9.27999e-06 [merge_comm]: 5.67001e-06 [allreduce_fusion]: 5.04e-06 [matmul_add_comm_reduction]: 1.188e-05 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 1.076e-05 [virtual_dataset]: 1.025e-05 [get_grad_eliminate_]: 9.69999e-06 [virtual_output]: 1.015e-05 [merge_forward]: 4.80999e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 1.165e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.12e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 1.58e-05 [set_forward_comm_id_for_comm_node_pass]: 5.27001e-06 [meta_fg_expand]: 4.22e-06 [flash_sp_send_recv_attached]: 2.36e-06 [receive_attached]: 2.32999e-06 [after_resolve]: 1.591e-05 [a_after_grad]: 1.522e-05 [renormalize]: 0.00077083 [add_forward_monad_depend]: 5.87001e-06 [auto_monad_grad]: 2.48e-06 [auto_monad_eliminator]: 1.823e-05 [cse]: 5.098e-05 [a_3]: 8.394e-05 [Cycle 2]: 0.00109405, [45] [expand_dump_flag]: 1.27999e-06 [switch_simplify]: 1.144e-05 [loop_unroll]: 1.033e-05 [a_1]: 0.00024147 [with_stream_mark]: 1.334e-05 [recompute_prepare]: 9.66e-06 [updatestate_depend_eliminate]: 4.95999e-06 [updatestate_assign_eliminate]: 3.73999e-06 [updatestate_loads_eliminate]: 3.68999e-06 [parameter_eliminate]: 1.43002e-06 [a_2]: 0.00014552 [accelerated_algorithm]: 9.56998e-06 [shard]: 1.07e-06 [meta_shard_fg_expand]: 1.82001e-06 [shard_inline]: 9.62999e-06 [merge_send_recv]: 6.71e-06 [auto_parallel]: 7.66999e-06 [parallel]: 4.82998e-06 [flash_sp]: 3.58e-06 [merge_comm]: 5.44e-06 [allreduce_fusion]: 4.60001e-06 [matmul_add_comm_reduction]: 8.35001e-06 [allreduce_slice_to_reducescatter]: 3.20026e-07 [virtual_shard_identity]: 9.71998e-06 [virtual_dataset]: 9.31998e-06 [get_grad_eliminate_]: 8.87e-06 [virtual_output]: 8.97e-06 [merge_forward]: 4.28999e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 8.58001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.822e-05 [merge_recompute_call_nodes]: 6.89994e-07 [before_grad]: 2.807e-05 [set_forward_comm_id_for_comm_node_pass]: 5.59998e-06 [meta_fg_expand]: 3.63e-06 [flash_sp_send_recv_attached]: 8.00006e-07 [receive_attached]: 9.60019e-07 [after_resolve]: 1.494e-05 [a_after_grad]: 1.423e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.50001e-06 [auto_monad_grad]: 9.89996e-07 [auto_monad_eliminator]: 9.79999e-06 [cse]: 2.597e-05 [a_3]: 7.23e-05 [py_interpret_to_execute_after_opt_a]: 1.47e-05 [slice_cell_reuse_recomputed_activation]: 4.81002e-06 [rewriter_after_opt_a]: 4.859e-05 [convert_after_rewriter]: 1.182e-05 [order_py_execute_after_rewriter]: 1.021e-05 [mutable_eliminate]: 0.00051448 [opt_b]: 0.000378, [1] [Cycle 1]: 0.00036819, [7] [b_1]: 0.0002532 [b_2]: 1.076e-05 [updatestate_depend_eliminate]: 7.89997e-06 [updatestate_assign_eliminate]: 4.05e-06 [updatestate_loads_eliminate]: 4.2e-06 [renormalize]: 5.09986e-07 [cse]: 2.981e-05 [optimize_parallel_all_gather_comm]: 2.397e-05 [overlap_param_gather]: 4.92e-06 [cconv]: 2.881e-05 [loop_unroll]: 0.00046537 [opt_after_cconv]: 0.00017786, [1] [Cycle 1]: 0.0001682, [7] [c_1]: 5.539e-05 [parameter_eliminate]: 2.66999e-06 [updatestate_depend_eliminate]: 7.41999e-06 [updatestate_assign_eliminate]: 4.32998e-06 [updatestate_loads_eliminate]: 4.22e-06 [cse]: 3.092e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 4.677e-05 [tuple_transform]: 0.00012878, [1] [Cycle 1]: 0.00012073, [4] [d_1]: 7.381e-05 [none_parameter_eliminate]: 1.76e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 1.158e-05 [partial_unused_args_eliminate]: 4.75999e-06 [add_recomputation]: 6.711e-05 [cse_after_recomputation]: 3.825e-05, [1] [Cycle 1]: 3.058e-05, [1] [cse]: 2.09e-05 [environ_conv]: 1.001e-05 [swap_dp_allreduce_reducescatter]: 1.035e-05 [bias_add_comm_swap]: 5.16002e-06 [label_micro_interleaved_index]: 7.25e-06 [label_fine_grained_interleaved_index]: 5.69999e-06 [merge_cast_opt]: 4.2e-06 [slice_recompute_activation]: 4.95999e-06 [micro_interleaved_order_control]: 5.11997e-06 [assign_add_opt]: 3.86999e-06 [ForceFp32Comm]: 3.48999e-06 [remove_cast_before_assign_add]: 3.48e-06 [full_micro_interleaved_order_control]: 5.15999e-06 [reorder_send_recv_between_fp_bp]: 5.76e-06 [comm_op_add_attrs]: 4.10998e-06 [add_comm_op_reuse_tag]: 3.50003e-06 [interleave_split_concat_branches]: 4.07003e-06 [interleave_parallel_branches]: 4.08999e-06 [overlap_opt_shard_in_pipeline]: 4.03001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.79e-06 [control_data_broadcast_order]: 2.084e-05 [grouped_pairwise_exchange_alltoall]: 4.65999e-06 [offloading_packed_experts]: 9.09e-06 [overlap_recompute_and_grad_model_parallel]: 8.93002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.97e-06 [overlap_recompute_allgather_and_fa_grad]: 4.07e-06 [overlap_recompute_comm]: 5.22999e-06 [overlap_grad_ring_attention]: 8.3e-06 [overlap_grad_flash_sp]: 2.809e-05 [begin_end_overlap_inline]: 3.18e-06 [split_matmul_comm_elemetwise]: 5.22999e-06 [split_layernorm_comm]: 4.48999e-06 [handle_group_info]: 3.35998e-06 [symbol_engine_optimizer]: 0.00012235, [1] [Cycle 1]: 0.00011452, [6] [build]: 3.63999e-06 [elim_shapecalc]: 1.45e-05 [elim_not_effective]: 1.981e-05 [opt_reshape]: 1.116e-05 [fold_const_symbol]: 1.642e-05 [renormalize]: 2.70025e-07 [detach_backward]: 3.83001e-06 [pipeline_parallel_scheduler]: 1.77001e-06 [auto_monad_reorder]: 2.389e-05 [get_jit_bprop_graph]: 1.67999e-06 [rewriter_after_jit_bprop_graph]: 4.46002e-06 [opt_after_jit_grad]: 0.00048522 [validate]: 4.234e-05 Sums bootstrap : 0.000455s : 3.67% type_inference : 0.005906s : 47.55% event_method : 0.000021s : 0.17% auto_monad : 0.000068s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000036s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000032s : 0.26% optimize.rewriter_before_opt_a : 0.000104s : 0.83% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000060s : 0.48% optimize.opt_a.loop_unroll : 0.000048s : 0.38% optimize.opt_a.a_1 : 0.001151s : 9.27% optimize.opt_a.with_stream_mark : 0.000034s : 0.28% optimize.opt_a.recompute_prepare : 0.000025s : 0.20% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000304s : 2.45% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.16% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000020s : 0.16% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000017s : 0.14% optimize.opt_a.parallel : 0.000023s : 0.18% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000011s : 0.09% optimize.opt_a.allreduce_fusion : 0.000010s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.16% optimize.opt_a.virtual_dataset : 0.000020s : 0.16% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.15% optimize.opt_a.virtual_output : 0.000019s : 0.15% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000044s : 0.35% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000008s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000031s : 0.25% optimize.opt_a.a_after_grad : 0.000029s : 0.24% optimize.opt_a.renormalize : 0.000771s : 6.21% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.23% optimize.opt_a.cse : 0.000077s : 0.62% optimize.opt_a.a_3 : 0.000156s : 1.26% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000049s : 0.39% optimize.convert_after_rewriter : 0.000012s : 0.10% optimize.order_py_execute_after_rewriter : 0.000010s : 0.08% optimize.mutable_eliminate : 0.000514s : 4.14% optimize.opt_b.b_1 : 0.000253s : 2.04% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000029s : 0.23% optimize.loop_unroll : 0.000465s : 3.75% optimize.opt_after_cconv.c_1 : 0.000055s : 0.45% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000031s : 0.25% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000047s : 0.38% optimize.tuple_transform.d_1 : 0.000074s : 0.59% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000012s : 0.09% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000067s : 0.54% optimize.cse_after_recomputation.cse : 0.000021s : 0.17% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000021s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.04% optimize.offloading_packed_experts : 0.000009s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.07% optimize.overlap_grad_flash_sp : 0.000028s : 0.23% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.16% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.09% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.19% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000485s : 3.91% validate : 0.000042s : 0.34% Time group info: ------[substitution.] 0.000240 54 11.51% : 0.000028s : 6: substitution.cast_eliminate 1.11% : 0.000003s : 4: substitution.elim_not_effective 0.82% : 0.000002s : 4: substitution.fold_const_symbol 3.70% : 0.000009s : 7: substitution.graph_param_transform 64.57% : 0.000155s : 4: substitution.inline 2.25% : 0.000005s : 8: substitution.j_node_and_user_rematch 3.15% : 0.000008s : 8: substitution.remove_not_recompute_node 2.35% : 0.000006s : 6: substitution.replace_old_param 8.30% : 0.000020s : 6: substitution.tuple_list_get_item_eliminator 2.24% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005853 2 86.96% : 0.005090s : 1: type_inference.infer 13.04% : 0.000763s : 1: type_inference.specialize ------[replace.] 0.000076 10 50.24% : 0.000038s : 4: replace.inline 49.76% : 0.000038s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000170 10 89.92% : 0.000153s : 4: match.inline 10.08% : 0.000017s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000327 2134 0.93% : 0.000003s : 22: predicate.accumulaten_eliminater 0.62% : 0.000002s : 7: predicate.ad_related_special_op_eliminate 0.59% : 0.000002s : 14: predicate.addn_check_dump 1.02% : 0.000003s : 22: predicate.addn_zero_filter 0.84% : 0.000003s : 22: predicate.adjust_all_reduce_mul_add 1.91% : 0.000006s : 36: predicate.arithmetic_simplify 1.09% : 0.000004s : 22: predicate.cast_eliminate 0.63% : 0.000002s : 14: predicate.check_bprop_eliminate 0.64% : 0.000002s : 14: predicate.compare_switch_simplify 0.20% : 0.000001s : 7: predicate.const_output_eliminate 0.66% : 0.000002s : 14: predicate.depend_value_elim 0.99% : 0.000003s : 22: predicate.dict_get_item_const_eliminator 1.06% : 0.000003s : 22: predicate.dict_get_item_eliminator 0.95% : 0.000003s : 22: predicate.dict_set_item_eliminator 0.88% : 0.000003s : 14: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 7: predicate.elim_not_effective 0.42% : 0.000001s : 7: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000004s : 29: predicate.environ_add_const_eliminate 1.19% : 0.000004s : 29: predicate.environ_get_add_eliminate 1.17% : 0.000004s : 29: predicate.environ_get_depend_swap 1.83% : 0.000006s : 43: predicate.environ_get_eliminate 1.18% : 0.000004s : 29: predicate.environ_get_set_eliminate 1.38% : 0.000005s : 32: predicate.exchange_switch_depend_value 2.27% : 0.000007s : 32: predicate.float_depend_g_call 0.58% : 0.000002s : 14: predicate.float_environ_get_switch 0.85% : 0.000003s : 21: predicate.float_tuple_getitem_switch 0.22% : 0.000001s : 7: predicate.fold_const_symbol 0.68% : 0.000002s : 14: predicate.get_grad_eliminate 0.24% : 0.000001s : 7: predicate.graph_param_transform 0.65% : 0.000002s : 14: predicate.incorporate_call 0.55% : 0.000002s : 14: predicate.incorporate_call_switch 6.14% : 0.000020s : 96: predicate.inline 0.85% : 0.000003s : 14: predicate.inline_without_move 0.36% : 0.000001s : 14: predicate.j_node_and_user_rematch 0.78% : 0.000003s : 14: predicate.less_batch_normalization 2.00% : 0.000007s : 42: predicate.list_to_tuple_eliminator_ 2.66% : 0.000009s : 64: predicate.load_eliminater 0.74% : 0.000002s : 7: predicate.loop_unroll_after_grad 1.93% : 0.000006s : 44: predicate.loop_unroll_before_grad 1.65% : 0.000005s : 36: predicate.make_slice_get_slice_eliminator 0.60% : 0.000002s : 14: predicate.merge_addn 0.60% : 0.000002s : 14: predicate.micro_step_allgather_replace 0.60% : 0.000002s : 14: predicate.mini_step_allgather_replace 0.86% : 0.000003s : 22: predicate.minmaximum_grad 0.78% : 0.000003s : 7: predicate.mutable_eliminate 0.39% : 0.000001s : 7: predicate.opt_reshape 0.34% : 0.000001s : 7: predicate.parallel_virtual_node 1.70% : 0.000006s : 32: predicate.partial_defer_inline 1.83% : 0.000006s : 35: predicate.partial_eliminate 0.91% : 0.000003s : 22: predicate.print_const_string_wrapper 0.60% : 0.000002s : 14: predicate.reduce_all_const_elim 1.23% : 0.000004s : 22: predicate.reduce_eliminate 2.68% : 0.000009s : 64: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 14: predicate.remove_not_recompute_node 1.38% : 0.000005s : 42: predicate.replace_applicator 0.48% : 0.000002s : 14: predicate.replace_old_param 0.24% : 0.000001s : 7: predicate.reset_defer_inline 0.93% : 0.000003s : 22: predicate.reshape_eliminate 0.62% : 0.000002s : 14: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 7: predicate.row_tensor_eliminate 0.74% : 0.000002s : 14: predicate.same_eliminate 0.68% : 0.000002s : 14: predicate.set_cell_output_no_recompute 0.67% : 0.000002s : 14: predicate.shard_identity_eliminate 0.71% : 0.000002s : 14: predicate.special_op_eliminate 0.79% : 0.000003s : 14: predicate.specialize_transform 0.83% : 0.000003s : 14: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 14: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 7: predicate.switch_call_monad_eliminater 1.47% : 0.000005s : 32: predicate.switch_defer_inline 2.03% : 0.000007s : 46: predicate.switch_layer_defer_inline 4.59% : 0.000015s : 97: predicate.switch_simplify 0.94% : 0.000003s : 22: predicate.tile_eliminate 0.90% : 0.000003s : 22: predicate.transpose_eliminate 1.68% : 0.000005s : 36: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000005s : 36: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000005s : 36: predicate.tuple_list_get_item_depend_reorder 3.22% : 0.000011s : 56: predicate.tuple_list_get_item_eliminator 1.59% : 0.000005s : 36: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000008s : 50: predicate.tuple_list_set_item_eliminator 1.82% : 0.000006s : 42: predicate.tuple_to_list_eliminator_ 2.68% : 0.000009s : 64: predicate.updatestate_pure_node_eliminater 3.33% : 0.000011s : 78: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 7: predicate.value_based_eliminate 0.64% : 0.000002s : 14: predicate.virtual_dataset_eliminate 0.69% : 0.000002s : 14: predicate.virtual_output_eliminate 0.32% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 7: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000559 11 53.83% : 0.000301s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.17% : 0.000258s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.029844 192 0.02% : 0.000007s : 1: ForceFp32Comm 10.28% : 0.003067s : 1: add_attr 10.23% : 0.003053s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.24% : 0.000071s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.26% : 0.000077s : 1: auto_monad 0.11% : 0.000032s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.68% : 0.000501s : 1: bootstrap 0.11% : 0.000032s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.08% : 0.000024s : 1: control_data_broadcast_order 0.05% : 0.000015s : 1: convert_after_rewriter 0.14% : 0.000042s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000019s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.11% : 0.000031s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000007s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000009s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.58% : 0.000472s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.74% : 0.000521s : 1: mutable_eliminate 0.04% : 0.000012s : 1: offloading_packed_experts 0.06% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 6.14% : 0.001833s : 78: opt.transform.opt_a 0.18% : 0.000054s : 1: opt.transform.opt_after_cconv 0.12% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.64% : 0.000190s : 28: opt.transform.opt_b 0.28% : 0.000083s : 2: opt.transform.opt_trans_graph 0.19% : 0.000058s : 4: opt.transform.symbol_engine_opt 13.07% : 0.003900s : 1: opt_a 0.61% : 0.000182s : 1: opt_after_cconv 1.66% : 0.000495s : 1: opt_after_jit_grad 1.28% : 0.000381s : 1: opt_b 23.23% : 0.006933s : 1: optimize 0.09% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.11% : 0.000032s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000011s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000043s : 1: pre_auto_parallel 0.12% : 0.000037s : 1: py_interpret_to_execute 0.06% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.17% : 0.000051s : 1: remove_dup_value 1.41% : 0.000421s : 1: renormalize.infer 1.14% : 0.000341s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000052s : 1: rewriter_after_opt_a 0.36% : 0.000108s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.05% : 0.000014s : 1: swap_dp_allreduce_reducescatter 0.42% : 0.000125s : 1: symbol_engine_optimizer 0.44% : 0.000132s : 1: tuple_transform 19.91% : 0.005943s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:10.984.340 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0159851, [21] [bootstrap]: 0.00052901 [type_inference]: 0.00584884 [event_method]: 2.164e-05 [auto_monad]: 6.777e-05 [graph_reusing]: 6.06e-06 [inline]: 2.01e-06 [add_attr]: 0.00301041, [1] [add_attr_with_inline]: 0.00300143, [1] [Cycle 1]: 5.924e-05, [2] [tag_attr]: 2.086e-05 [meta_addattr_fg_expand]: 6.49999e-06 [parallel-infer-symbol]: 3.2e-06 [pre_auto_parallel]: 3.48e-05 [insert-virtual-dataset]: 2.56998e-06 [parallel-infer-symbol-second]: 8.70001e-07 [dataset_repeat_opt]: 2.18002e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.00578272, [53] [py_interpret_to_execute]: 3.148e-05 [rewriter_before_opt_a]: 9.621e-05 [opt_a]: 0.00350964, [2] [Cycle 1]: 0.00259741, [45] [expand_dump_flag]: 3.11999e-06 [switch_simplify]: 4.896e-05 [loop_unroll]: 3.644e-05 [a_1]: 0.00088858 [with_stream_mark]: 1.627e-05 [recompute_prepare]: 1.278e-05 [updatestate_depend_eliminate]: 5.42999e-06 [updatestate_assign_eliminate]: 5.13002e-06 [updatestate_loads_eliminate]: 4.32998e-06 [parameter_eliminate]: 1.82001e-06 [a_2]: 0.00012749 [accelerated_algorithm]: 1.013e-05 [shard]: 1.72001e-06 [meta_shard_fg_expand]: 2.59001e-06 [shard_inline]: 9.88002e-06 [merge_send_recv]: 1.027e-05 [auto_parallel]: 7.97e-06 [parallel]: 1.856e-05 [flash_sp]: 7.92e-06 [merge_comm]: 5.81998e-06 [allreduce_fusion]: 4.94003e-06 [matmul_add_comm_reduction]: 1.087e-05 [allreduce_slice_to_reducescatter]: 7.79983e-07 [virtual_shard_identity]: 1.162e-05 [virtual_dataset]: 9.36e-06 [get_grad_eliminate_]: 9.24e-06 [virtual_output]: 9.52999e-06 [merge_forward]: 4.92e-06 [cell_reuse_recompute_pass]: 1.44998e-06 [offload_activation]: 1.186e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.869e-05 [merge_recompute_call_nodes]: 1.37999e-06 [before_grad]: 1.562e-05 [set_forward_comm_id_for_comm_node_pass]: 5.40999e-06 [meta_fg_expand]: 4.08001e-06 [flash_sp_send_recv_attached]: 3.25002e-06 [receive_attached]: 2.27999e-06 [after_resolve]: 1.566e-05 [a_after_grad]: 1.518e-05 [renormalize]: 0.00080921 [add_forward_monad_depend]: 5.00001e-06 [auto_monad_grad]: 2.26e-06 [auto_monad_eliminator]: 1.854e-05 [cse]: 5.112e-05 [a_3]: 8.16e-05 [Cycle 2]: 0.00090206, [45] [expand_dump_flag]: 1.17e-06 [switch_simplify]: 1.146e-05 [loop_unroll]: 1.001e-05 [a_1]: 0.00024089 [with_stream_mark]: 1.397e-05 [recompute_prepare]: 9.72001e-06 [updatestate_depend_eliminate]: 4.68999e-06 [updatestate_assign_eliminate]: 3.81001e-06 [updatestate_loads_eliminate]: 3.61999e-06 [parameter_eliminate]: 1.29998e-06 [a_2]: 0.00011936 [accelerated_algorithm]: 9.04998e-06 [shard]: 1.09e-06 [meta_shard_fg_expand]: 2.02999e-06 [shard_inline]: 8.92e-06 [merge_send_recv]: 6.89999e-06 [auto_parallel]: 7.18e-06 [parallel]: 4.59998e-06 [flash_sp]: 3.28e-06 [merge_comm]: 5.02999e-06 [allreduce_fusion]: 4.71002e-06 [matmul_add_comm_reduction]: 7.9e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 9.91998e-06 [virtual_dataset]: 9.29e-06 [get_grad_eliminate_]: 8.57998e-06 [virtual_output]: 8.63001e-06 [merge_forward]: 4e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 8.19998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.626e-05 [merge_recompute_call_nodes]: 8.70001e-07 [before_grad]: 1.413e-05 [set_forward_comm_id_for_comm_node_pass]: 5.57001e-06 [meta_fg_expand]: 3.36999e-06 [flash_sp_send_recv_attached]: 1.22e-06 [receive_attached]: 1.08001e-06 [after_resolve]: 1.485e-05 [a_after_grad]: 1.454e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.24003e-06 [auto_monad_grad]: 1.19e-06 [auto_monad_eliminator]: 9.92001e-06 [cse]: 2.595e-05 [a_3]: 5.923e-05 [py_interpret_to_execute_after_opt_a]: 1.234e-05 [slice_cell_reuse_recomputed_activation]: 1.76998e-06 [rewriter_after_opt_a]: 4.508e-05 [convert_after_rewriter]: 9.07999e-06 [order_py_execute_after_rewriter]: 6.21e-06 [mutable_eliminate]: 0.00050007 [opt_b]: 0.00030284, [1] [Cycle 1]: 0.00029642, [7] [b_1]: 0.00020544 [b_2]: 1.112e-05 [updatestate_depend_eliminate]: 7.3e-06 [updatestate_assign_eliminate]: 3.66999e-06 [updatestate_loads_eliminate]: 3.97e-06 [renormalize]: 2.69996e-07 [cse]: 2.939e-05 [optimize_parallel_all_gather_comm]: 1.881e-05 [overlap_param_gather]: 1.74e-06 [cconv]: 2.625e-05 [loop_unroll]: 0.00042543 [opt_after_cconv]: 0.00013691, [1] [Cycle 1]: 0.00013112, [7] [c_1]: 4.94e-05 [parameter_eliminate]: 2.43e-06 [updatestate_depend_eliminate]: 7.18e-06 [updatestate_assign_eliminate]: 3.93999e-06 [updatestate_loads_eliminate]: 3.7e-06 [cse]: 2.838e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 3.751e-05 [tuple_transform]: 0.00010396, [1] [Cycle 1]: 9.92e-05, [4] [d_1]: 6.706e-05 [none_parameter_eliminate]: 2.00002e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 1.035e-05 [partial_unused_args_eliminate]: 1.82001e-06 [add_recomputation]: 6.128e-05 [cse_after_recomputation]: 2.946e-05, [1] [Cycle 1]: 2.48e-05, [1] [cse]: 1.88e-05 [environ_conv]: 6.96999e-06 [swap_dp_allreduce_reducescatter]: 6.55002e-06 [bias_add_comm_swap]: 3.19001e-06 [label_micro_interleaved_index]: 3.99002e-06 [label_fine_grained_interleaved_index]: 2.75997e-06 [merge_cast_opt]: 1.21997e-06 [slice_recompute_activation]: 2.33002e-06 [micro_interleaved_order_control]: 2.24999e-06 [assign_add_opt]: 1.22e-06 [ForceFp32Comm]: 7.59988e-07 [remove_cast_before_assign_add]: 9.89996e-07 [full_micro_interleaved_order_control]: 2.17001e-06 [reorder_send_recv_between_fp_bp]: 2.71999e-06 [comm_op_add_attrs]: 9.90025e-07 [add_comm_op_reuse_tag]: 9.29984e-07 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.07e-06 [overlap_opt_shard_grad_in_pipeline]: 2.01e-06 [control_data_broadcast_order]: 1.62e-05 [grouped_pairwise_exchange_alltoall]: 1.72001e-06 [offloading_packed_experts]: 5.02e-06 [overlap_recompute_and_grad_model_parallel]: 5.81e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.25999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.44999e-06 [overlap_grad_ring_attention]: 5.42001e-06 [overlap_grad_flash_sp]: 2.374e-05 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 2.04999e-06 [split_layernorm_comm]: 1.57999e-06 [handle_group_info]: 1.36998e-06 [symbol_engine_optimizer]: 9.276e-05, [1] [Cycle 1]: 8.816e-05, [6] [build]: 3.20998e-06 [elim_shapecalc]: 1.39e-05 [elim_not_effective]: 1.8e-05 [opt_reshape]: 1.049e-05 [fold_const_symbol]: 1.425e-05 [renormalize]: 2.49973e-07 [detach_backward]: 1.74e-06 [pipeline_parallel_scheduler]: 1.50001e-06 [auto_monad_reorder]: 2.139e-05 [get_jit_bprop_graph]: 1.24e-06 [rewriter_after_jit_bprop_graph]: 3.5e-06 [opt_after_jit_grad]: 0.00045881 [validate]: 4.193e-05 Sums bootstrap : 0.000529s : 4.40% type_inference : 0.005849s : 48.62% event_method : 0.000022s : 0.18% auto_monad : 0.000068s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000035s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.26% optimize.rewriter_before_opt_a : 0.000096s : 0.80% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000060s : 0.50% optimize.opt_a.loop_unroll : 0.000046s : 0.39% optimize.opt_a.a_1 : 0.001129s : 9.39% optimize.opt_a.with_stream_mark : 0.000030s : 0.25% optimize.opt_a.recompute_prepare : 0.000022s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000247s : 2.05% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.16% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000019s : 0.16% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000023s : 0.19% optimize.opt_a.flash_sp : 0.000011s : 0.09% optimize.opt_a.merge_comm : 0.000011s : 0.09% optimize.opt_a.allreduce_fusion : 0.000010s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.18% optimize.opt_a.virtual_dataset : 0.000019s : 0.16% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.15% optimize.opt_a.virtual_output : 0.000018s : 0.15% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000030s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000031s : 0.25% optimize.opt_a.a_after_grad : 0.000030s : 0.25% optimize.opt_a.renormalize : 0.000809s : 6.73% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.05% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.24% optimize.opt_a.cse : 0.000077s : 0.64% optimize.opt_a.a_3 : 0.000141s : 1.17% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000045s : 0.37% optimize.convert_after_rewriter : 0.000009s : 0.08% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000500s : 4.16% optimize.opt_b.b_1 : 0.000205s : 1.71% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000029s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000026s : 0.22% optimize.loop_unroll : 0.000425s : 3.54% optimize.opt_after_cconv.c_1 : 0.000049s : 0.41% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000028s : 0.24% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000038s : 0.31% optimize.tuple_transform.d_1 : 0.000067s : 0.56% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.09% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000061s : 0.51% optimize.cse_after_recomputation.cse : 0.000019s : 0.16% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.05% optimize.overlap_grad_flash_sp : 0.000024s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.09% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.03% opt_after_jit_grad : 0.000459s : 3.81% validate : 0.000042s : 0.35% Time group info: ------[substitution.] 0.000244 54 12.18% : 0.000030s : 6: substitution.cast_eliminate 1.05% : 0.000003s : 4: substitution.elim_not_effective 0.79% : 0.000002s : 4: substitution.fold_const_symbol 3.03% : 0.000007s : 7: substitution.graph_param_transform 64.20% : 0.000156s : 4: substitution.inline 2.11% : 0.000005s : 8: substitution.j_node_and_user_rematch 2.88% : 0.000007s : 8: substitution.remove_not_recompute_node 2.20% : 0.000005s : 6: substitution.replace_old_param 8.98% : 0.000022s : 6: substitution.tuple_list_get_item_eliminator 2.58% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005789 2 87.06% : 0.005040s : 1: type_inference.infer 12.94% : 0.000749s : 1: type_inference.specialize ------[replace.] 0.000074 10 52.82% : 0.000039s : 4: replace.inline 47.18% : 0.000035s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000173 10 89.06% : 0.000154s : 4: match.inline 10.94% : 0.000019s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000318 2134 0.95% : 0.000003s : 22: predicate.accumulaten_eliminater 0.74% : 0.000002s : 7: predicate.ad_related_special_op_eliminate 0.59% : 0.000002s : 14: predicate.addn_check_dump 0.95% : 0.000003s : 22: predicate.addn_zero_filter 0.89% : 0.000003s : 22: predicate.adjust_all_reduce_mul_add 1.95% : 0.000006s : 36: predicate.arithmetic_simplify 1.04% : 0.000003s : 22: predicate.cast_eliminate 0.59% : 0.000002s : 14: predicate.check_bprop_eliminate 0.62% : 0.000002s : 14: predicate.compare_switch_simplify 0.19% : 0.000001s : 7: predicate.const_output_eliminate 0.62% : 0.000002s : 14: predicate.depend_value_elim 1.01% : 0.000003s : 22: predicate.dict_get_item_const_eliminator 1.05% : 0.000003s : 22: predicate.dict_get_item_eliminator 0.91% : 0.000003s : 22: predicate.dict_set_item_eliminator 0.84% : 0.000003s : 14: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 7: predicate.elim_not_effective 0.60% : 0.000002s : 7: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000004s : 29: predicate.environ_add_const_eliminate 1.17% : 0.000004s : 29: predicate.environ_get_add_eliminate 1.14% : 0.000004s : 29: predicate.environ_get_depend_swap 1.80% : 0.000006s : 43: predicate.environ_get_eliminate 1.18% : 0.000004s : 29: predicate.environ_get_set_eliminate 1.41% : 0.000004s : 32: predicate.exchange_switch_depend_value 2.20% : 0.000007s : 32: predicate.float_depend_g_call 0.55% : 0.000002s : 14: predicate.float_environ_get_switch 0.93% : 0.000003s : 21: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 7: predicate.fold_const_symbol 0.67% : 0.000002s : 14: predicate.get_grad_eliminate 0.22% : 0.000001s : 7: predicate.graph_param_transform 0.64% : 0.000002s : 14: predicate.incorporate_call 0.56% : 0.000002s : 14: predicate.incorporate_call_switch 6.16% : 0.000020s : 96: predicate.inline 0.81% : 0.000003s : 14: predicate.inline_without_move 0.36% : 0.000001s : 14: predicate.j_node_and_user_rematch 0.75% : 0.000002s : 14: predicate.less_batch_normalization 1.96% : 0.000006s : 42: predicate.list_to_tuple_eliminator_ 2.66% : 0.000008s : 64: predicate.load_eliminater 0.69% : 0.000002s : 7: predicate.loop_unroll_after_grad 1.95% : 0.000006s : 44: predicate.loop_unroll_before_grad 1.63% : 0.000005s : 36: predicate.make_slice_get_slice_eliminator 0.61% : 0.000002s : 14: predicate.merge_addn 0.60% : 0.000002s : 14: predicate.micro_step_allgather_replace 0.69% : 0.000002s : 14: predicate.mini_step_allgather_replace 0.88% : 0.000003s : 22: predicate.minmaximum_grad 0.88% : 0.000003s : 7: predicate.mutable_eliminate 0.42% : 0.000001s : 7: predicate.opt_reshape 0.41% : 0.000001s : 7: predicate.parallel_virtual_node 1.63% : 0.000005s : 32: predicate.partial_defer_inline 1.84% : 0.000006s : 35: predicate.partial_eliminate 0.96% : 0.000003s : 22: predicate.print_const_string_wrapper 0.61% : 0.000002s : 14: predicate.reduce_all_const_elim 1.22% : 0.000004s : 22: predicate.reduce_eliminate 2.64% : 0.000008s : 64: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 14: predicate.remove_not_recompute_node 1.46% : 0.000005s : 42: predicate.replace_applicator 0.43% : 0.000001s : 14: predicate.replace_old_param 0.23% : 0.000001s : 7: predicate.reset_defer_inline 0.97% : 0.000003s : 22: predicate.reshape_eliminate 0.71% : 0.000002s : 14: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 7: predicate.row_tensor_eliminate 0.82% : 0.000003s : 14: predicate.same_eliminate 0.40% : 0.000001s : 14: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 14: predicate.shard_identity_eliminate 0.67% : 0.000002s : 14: predicate.special_op_eliminate 0.76% : 0.000002s : 14: predicate.specialize_transform 0.76% : 0.000002s : 14: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 14: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 7: predicate.switch_call_monad_eliminater 1.47% : 0.000005s : 32: predicate.switch_defer_inline 2.04% : 0.000006s : 46: predicate.switch_layer_defer_inline 4.61% : 0.000015s : 97: predicate.switch_simplify 0.90% : 0.000003s : 22: predicate.tile_eliminate 0.95% : 0.000003s : 22: predicate.transpose_eliminate 1.54% : 0.000005s : 36: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000005s : 36: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000005s : 36: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000010s : 56: predicate.tuple_list_get_item_eliminator 1.56% : 0.000005s : 36: predicate.tuple_list_get_set_item_eliminator 2.34% : 0.000007s : 50: predicate.tuple_list_set_item_eliminator 1.84% : 0.000006s : 42: predicate.tuple_to_list_eliminator_ 2.62% : 0.000008s : 64: predicate.updatestate_pure_node_eliminater 3.35% : 0.000011s : 78: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 7: predicate.value_based_eliminate 0.65% : 0.000002s : 14: predicate.virtual_dataset_eliminate 0.68% : 0.000002s : 14: predicate.virtual_output_eliminate 0.34% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 7: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000547 11 53.44% : 0.000292s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.56% : 0.000255s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027686 192 0.01% : 0.000004s : 1: ForceFp32Comm 10.89% : 0.003014s : 1: add_attr 10.86% : 0.003005s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000065s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.26% : 0.000073s : 1: auto_monad 0.09% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 2.01% : 0.000558s : 1: bootstrap 0.11% : 0.000030s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.12% : 0.000032s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.10% : 0.000027s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.56% : 0.000433s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.84% : 0.000508s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 6.48% : 0.001795s : 78: opt.transform.opt_a 0.17% : 0.000048s : 1: opt.transform.opt_after_cconv 0.12% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.67% : 0.000186s : 28: opt.transform.opt_b 0.27% : 0.000075s : 2: opt.transform.opt_trans_graph 0.19% : 0.000053s : 4: opt.transform.symbol_engine_opt 12.69% : 0.003513s : 1: opt_a 0.51% : 0.000141s : 1: opt_after_cconv 1.69% : 0.000468s : 1: opt_after_jit_grad 1.11% : 0.000306s : 1: opt_b 20.90% : 0.005787s : 1: optimize 0.08% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000039s : 1: pre_auto_parallel 0.13% : 0.000036s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.15% : 0.000042s : 1: remove_dup_value 1.69% : 0.000469s : 1: renormalize.infer 1.20% : 0.000332s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000049s : 1: rewriter_after_opt_a 0.36% : 0.000101s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000095s : 1: symbol_engine_optimizer 0.39% : 0.000107s : 1: tuple_transform 21.18% : 0.005863s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:11.174.956 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:11.175.211 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0165499, [21] [bootstrap]: 0.00045362 [type_inference]: 0.00565921 [event_method]: 2.105e-05 [auto_monad]: 6.156e-05 [graph_reusing]: 6.04999e-06 [inline]: 1.92999e-06 [add_attr]: 0.00300187, [1] [add_attr_with_inline]: 0.00299354, [1] [Cycle 1]: 7.478e-05, [2] [tag_attr]: 2.085e-05 [meta_addattr_fg_expand]: 6.54999e-06 [parallel-infer-symbol]: 3.13998e-06 [pre_auto_parallel]: 3.621e-05 [insert-virtual-dataset]: 2.46998e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 1.70001e-06 [pipeline_split]: 1.49e-06 [optimize]: 0.00611545, [53] [py_interpret_to_execute]: 3.275e-05 [rewriter_before_opt_a]: 9.681e-05 [opt_a]: 0.00366817, [2] [Cycle 1]: 0.00256487, [45] [expand_dump_flag]: 3.31999e-06 [switch_simplify]: 4.838e-05 [loop_unroll]: 3.544e-05 [a_1]: 0.00086665 [with_stream_mark]: 1.687e-05 [recompute_prepare]: 1.224e-05 [updatestate_depend_eliminate]: 5.30001e-06 [updatestate_assign_eliminate]: 4.47e-06 [updatestate_loads_eliminate]: 3.7e-06 [parameter_eliminate]: 2.06e-06 [a_2]: 0.00013934 [accelerated_algorithm]: 8.92e-06 [shard]: 1.79e-06 [meta_shard_fg_expand]: 2.34999e-06 [shard_inline]: 8.94e-06 [merge_send_recv]: 1.038e-05 [auto_parallel]: 7.16999e-06 [parallel]: 1.96e-05 [flash_sp]: 9.25999e-06 [merge_comm]: 5.20999e-06 [allreduce_fusion]: 4.27998e-06 [matmul_add_comm_reduction]: 1.008e-05 [allreduce_slice_to_reducescatter]: 9.70002e-07 [virtual_shard_identity]: 1.079e-05 [virtual_dataset]: 8.68001e-06 [get_grad_eliminate_]: 8.47998e-06 [virtual_output]: 8.58001e-06 [merge_forward]: 5.44e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 1.199e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.795e-05 [merge_recompute_call_nodes]: 1.52999e-06 [before_grad]: 1.465e-05 [set_forward_comm_id_for_comm_node_pass]: 4.53001e-06 [meta_fg_expand]: 3.6e-06 [flash_sp_send_recv_attached]: 2.91e-06 [receive_attached]: 2.09e-06 [after_resolve]: 1.468e-05 [a_after_grad]: 1.35e-05 [renormalize]: 0.00068538 [add_forward_monad_depend]: 5.89999e-06 [auto_monad_grad]: 2.17999e-06 [auto_monad_eliminator]: 1.719e-05 [cse]: 3.499e-05 [a_3]: 7.72e-05 [Cycle 2]: 0.00108946, [45] [expand_dump_flag]: 1.44e-06 [switch_simplify]: 1.018e-05 [loop_unroll]: 8.48999e-06 [a_1]: 0.00021829 [with_stream_mark]: 3.71e-05 [recompute_prepare]: 9.67001e-06 [updatestate_depend_eliminate]: 4.05e-06 [updatestate_assign_eliminate]: 3.16001e-06 [updatestate_loads_eliminate]: 3.57002e-06 [parameter_eliminate]: 1.29e-06 [a_2]: 0.00014903 [accelerated_algorithm]: 9.59e-06 [shard]: 1.32e-06 [meta_shard_fg_expand]: 2.04999e-06 [shard_inline]: 9.29998e-06 [merge_send_recv]: 6.74001e-06 [auto_parallel]: 7.39002e-06 [parallel]: 5.90002e-06 [flash_sp]: 3.89002e-06 [merge_comm]: 4.73001e-06 [allreduce_fusion]: 4.79998e-06 [matmul_add_comm_reduction]: 7.58001e-06 [allreduce_slice_to_reducescatter]: 3.89991e-07 [virtual_shard_identity]: 1.084e-05 [virtual_dataset]: 9.09e-06 [get_grad_eliminate_]: 9.02e-06 [virtual_output]: 8.75001e-06 [merge_forward]: 3.88999e-06 [cell_reuse_recompute_pass]: 1.64e-06 [offload_activation]: 8.85999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.841e-05 [merge_recompute_call_nodes]: 8.70001e-07 [before_grad]: 1.258e-05 [set_forward_comm_id_for_comm_node_pass]: 4.99e-06 [meta_fg_expand]: 3.03e-06 [flash_sp_send_recv_attached]: 9.70002e-07 [receive_attached]: 1.17e-06 [after_resolve]: 1.553e-05 [a_after_grad]: 1.457e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 2.16e-06 [auto_monad_grad]: 1.23002e-06 [auto_monad_eliminator]: 9.94001e-06 [cse]: 1.909e-05 [a_3]: 6.495e-05 [py_interpret_to_execute_after_opt_a]: 1.552e-05 [slice_cell_reuse_recomputed_activation]: 5.00001e-06 [rewriter_after_opt_a]: 4.551e-05 [convert_after_rewriter]: 1.066e-05 [order_py_execute_after_rewriter]: 8.64e-06 [mutable_eliminate]: 0.00053108 [opt_b]: 0.00033788, [1] [Cycle 1]: 0.00032812, [7] [b_1]: 0.00022471 [b_2]: 1.018e-05 [updatestate_depend_eliminate]: 6.70998e-06 [updatestate_assign_eliminate]: 3.39001e-06 [updatestate_loads_eliminate]: 3.72002e-06 [renormalize]: 5.3001e-07 [cse]: 2.226e-05 [optimize_parallel_all_gather_comm]: 2.107e-05 [overlap_param_gather]: 4.43001e-06 [cconv]: 2.873e-05 [loop_unroll]: 0.0004372 [opt_after_cconv]: 0.00014554, [1] [Cycle 1]: 0.00013677, [7] [c_1]: 4.229e-05 [parameter_eliminate]: 2.76999e-06 [updatestate_depend_eliminate]: 6.19999e-06 [updatestate_assign_eliminate]: 3.36001e-06 [updatestate_loads_eliminate]: 3.08998e-06 [cse]: 2.229e-05 [renormalize]: 5.99975e-07 [remove_dup_value]: 1.929e-05 [tuple_transform]: 0.0001059, [1] [Cycle 1]: 9.861e-05, [4] [d_1]: 5.81e-05 [none_parameter_eliminate]: 1.54e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.90001e-06 [partial_unused_args_eliminate]: 4.60999e-06 [add_recomputation]: 5.898e-05 [cse_after_recomputation]: 3.212e-05, [1] [Cycle 1]: 2.491e-05, [1] [cse]: 1.585e-05 [environ_conv]: 9.77001e-06 [swap_dp_allreduce_reducescatter]: 9.06002e-06 [bias_add_comm_swap]: 4.81002e-06 [label_micro_interleaved_index]: 6.84999e-06 [label_fine_grained_interleaved_index]: 5.14e-06 [merge_cast_opt]: 3.84002e-06 [slice_recompute_activation]: 4.64002e-06 [micro_interleaved_order_control]: 4.82e-06 [assign_add_opt]: 3.63999e-06 [ForceFp32Comm]: 3.18998e-06 [remove_cast_before_assign_add]: 3.35e-06 [full_micro_interleaved_order_control]: 4.38001e-06 [reorder_send_recv_between_fp_bp]: 5.39e-06 [comm_op_add_attrs]: 3.73001e-06 [add_comm_op_reuse_tag]: 3.23e-06 [interleave_split_concat_branches]: 3.42002e-06 [interleave_parallel_branches]: 3.38e-06 [overlap_opt_shard_in_pipeline]: 3.51999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.05e-06 [control_data_broadcast_order]: 1.959e-05 [grouped_pairwise_exchange_alltoall]: 4.35e-06 [offloading_packed_experts]: 6.93e-06 [overlap_recompute_and_grad_model_parallel]: 8.03999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.54002e-06 [overlap_recompute_allgather_and_fa_grad]: 3.65e-06 [overlap_recompute_comm]: 4.79998e-06 [overlap_grad_ring_attention]: 7.75998e-06 [overlap_grad_flash_sp]: 2.615e-05 [begin_end_overlap_inline]: 3.58999e-06 [split_matmul_comm_elemetwise]: 4.58001e-06 [split_layernorm_comm]: 4.15e-06 [handle_group_info]: 3.2e-06 [symbol_engine_optimizer]: 0.00010594, [1] [Cycle 1]: 9.924e-05, [6] [build]: 2.98998e-06 [elim_shapecalc]: 1.174e-05 [elim_not_effective]: 1.54e-05 [opt_reshape]: 8.94998e-06 [fold_const_symbol]: 1.319e-05 [renormalize]: 2.30008e-07 [detach_backward]: 4.28001e-06 [pipeline_parallel_scheduler]: 1.62001e-06 [auto_monad_reorder]: 2.248e-05 [get_jit_bprop_graph]: 1.36002e-06 [rewriter_after_jit_bprop_graph]: 5.54e-06 [opt_after_jit_grad]: 0.00051341 [validate]: 4.501e-05 Sums bootstrap : 0.000454s : 3.85% type_inference : 0.005659s : 48.03% event_method : 0.000021s : 0.18% auto_monad : 0.000062s : 0.52% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000036s : 0.31% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000001s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.28% optimize.rewriter_before_opt_a : 0.000097s : 0.82% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000059s : 0.50% optimize.opt_a.loop_unroll : 0.000044s : 0.37% optimize.opt_a.a_1 : 0.001085s : 9.21% optimize.opt_a.with_stream_mark : 0.000054s : 0.46% optimize.opt_a.recompute_prepare : 0.000022s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000288s : 2.45% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.16% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000018s : 0.15% optimize.opt_a.merge_send_recv : 0.000017s : 0.15% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000026s : 0.22% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.18% optimize.opt_a.virtual_dataset : 0.000018s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.15% optimize.opt_a.virtual_output : 0.000017s : 0.15% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000030s : 0.26% optimize.opt_a.a_after_grad : 0.000028s : 0.24% optimize.opt_a.renormalize : 0.000685s : 5.82% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.23% optimize.opt_a.cse : 0.000054s : 0.46% optimize.opt_a.a_3 : 0.000142s : 1.21% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000046s : 0.39% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000531s : 4.51% optimize.opt_b.b_1 : 0.000225s : 1.91% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.18% optimize.overlap_param_gather : 0.000004s : 0.04% optimize.cconv : 0.000029s : 0.24% optimize.loop_unroll : 0.000437s : 3.71% optimize.opt_after_cconv.c_1 : 0.000042s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.19% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000019s : 0.16% optimize.tuple_transform.d_1 : 0.000058s : 0.49% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000059s : 0.50% optimize.cse_after_recomputation.cse : 0.000016s : 0.13% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000020s : 0.17% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.07% optimize.overlap_grad_flash_sp : 0.000026s : 0.22% optimize.begin_end_overlap_inline : 0.000004s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.04% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.19% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000513s : 4.36% validate : 0.000045s : 0.38% Time group info: ------[substitution.] 0.000223 44 9.29% : 0.000021s : 3: substitution.cast_eliminate 0.93% : 0.000002s : 3: substitution.elim_not_effective 0.83% : 0.000002s : 3: substitution.fold_const_symbol 3.16% : 0.000007s : 6: substitution.graph_param_transform 66.64% : 0.000149s : 4: substitution.inline 2.14% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.99% : 0.000007s : 6: substitution.remove_not_recompute_node 2.81% : 0.000006s : 6: substitution.replace_old_param 8.49% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.72% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005610 2 87.13% : 0.004888s : 1: type_inference.infer 12.87% : 0.000722s : 1: type_inference.specialize ------[replace.] 0.000072 10 51.97% : 0.000037s : 4: replace.inline 48.03% : 0.000034s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000162 10 89.99% : 0.000146s : 4: match.inline 10.01% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000300 1954 0.98% : 0.000003s : 21: predicate.accumulaten_eliminater 0.57% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.59% : 0.000002s : 12: predicate.addn_check_dump 1.09% : 0.000003s : 21: predicate.addn_zero_filter 0.89% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.15% : 0.000006s : 33: predicate.arithmetic_simplify 1.06% : 0.000003s : 21: predicate.cast_eliminate 0.60% : 0.000002s : 12: predicate.check_bprop_eliminate 0.58% : 0.000002s : 12: predicate.compare_switch_simplify 0.18% : 0.000001s : 6: predicate.const_output_eliminate 0.62% : 0.000002s : 12: predicate.depend_value_elim 1.02% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.07% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.92% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.84% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 6: predicate.elim_not_effective 0.32% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.18% : 0.000004s : 27: predicate.environ_get_add_eliminate 1.18% : 0.000004s : 27: predicate.environ_get_depend_swap 1.76% : 0.000005s : 39: predicate.environ_get_eliminate 1.17% : 0.000003s : 27: predicate.environ_get_set_eliminate 1.46% : 0.000004s : 31: predicate.exchange_switch_depend_value 2.40% : 0.000007s : 31: predicate.float_depend_g_call 0.61% : 0.000002s : 12: predicate.float_environ_get_switch 0.85% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.65% : 0.000002s : 12: predicate.get_grad_eliminate 0.25% : 0.000001s : 6: predicate.graph_param_transform 0.61% : 0.000002s : 12: predicate.incorporate_call 0.51% : 0.000002s : 12: predicate.incorporate_call_switch 6.37% : 0.000019s : 88: predicate.inline 0.84% : 0.000003s : 12: predicate.inline_without_move 0.34% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.80% : 0.000002s : 12: predicate.less_batch_normalization 1.84% : 0.000006s : 39: predicate.list_to_tuple_eliminator_ 2.69% : 0.000008s : 60: predicate.load_eliminater 0.88% : 0.000003s : 6: predicate.loop_unroll_after_grad 1.94% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.76% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.61% : 0.000002s : 12: predicate.merge_addn 0.55% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.57% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.92% : 0.000003s : 21: predicate.minmaximum_grad 0.86% : 0.000003s : 6: predicate.mutable_eliminate 0.31% : 0.000001s : 6: predicate.opt_reshape 0.33% : 0.000001s : 6: predicate.parallel_virtual_node 1.68% : 0.000005s : 31: predicate.partial_defer_inline 1.80% : 0.000005s : 33: predicate.partial_eliminate 0.97% : 0.000003s : 21: predicate.print_const_string_wrapper 0.60% : 0.000002s : 12: predicate.reduce_all_const_elim 1.20% : 0.000004s : 21: predicate.reduce_eliminate 2.65% : 0.000008s : 60: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 12: predicate.remove_not_recompute_node 1.42% : 0.000004s : 39: predicate.replace_applicator 0.49% : 0.000001s : 12: predicate.replace_old_param 0.24% : 0.000001s : 6: predicate.reset_defer_inline 1.03% : 0.000003s : 21: predicate.reshape_eliminate 0.66% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 6: predicate.row_tensor_eliminate 0.74% : 0.000002s : 12: predicate.same_eliminate 0.44% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 12: predicate.shard_identity_eliminate 0.62% : 0.000002s : 12: predicate.special_op_eliminate 0.73% : 0.000002s : 12: predicate.specialize_transform 0.77% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.55% : 0.000005s : 31: predicate.switch_defer_inline 2.09% : 0.000006s : 43: predicate.switch_layer_defer_inline 4.72% : 0.000014s : 91: predicate.switch_simplify 0.94% : 0.000003s : 21: predicate.tile_eliminate 0.93% : 0.000003s : 21: predicate.transpose_eliminate 1.58% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.54% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 3.17% : 0.000010s : 51: predicate.tuple_list_get_item_eliminator 1.51% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.24% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.82% : 0.000005s : 39: predicate.tuple_to_list_eliminator_ 2.66% : 0.000008s : 60: predicate.updatestate_pure_node_eliminater 3.26% : 0.000010s : 72: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 6: predicate.value_based_eliminate 0.67% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 12: predicate.virtual_output_eliminate 0.26% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000532 11 53.58% : 0.000285s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.42% : 0.000247s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028303 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.64% : 0.003010s : 1: add_attr 10.59% : 0.002997s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000063s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.25% : 0.000072s : 1: auto_monad 0.11% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.75% : 0.000496s : 1: bootstrap 0.11% : 0.000032s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000023s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000020s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.11% : 0.000031s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.57% : 0.000443s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.90% : 0.000537s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 5.98% : 0.001693s : 78: opt.transform.opt_a 0.14% : 0.000041s : 1: opt.transform.opt_after_cconv 0.16% : 0.000045s : 1: opt.transform.opt_after_jit_grad 0.57% : 0.000162s : 28: opt.transform.opt_b 0.23% : 0.000065s : 2: opt.transform.opt_trans_graph 0.16% : 0.000046s : 4: opt.transform.symbol_engine_opt 12.97% : 0.003671s : 1: opt_a 0.53% : 0.000149s : 1: opt_after_cconv 1.85% : 0.000524s : 1: opt_after_jit_grad 1.21% : 0.000342s : 1: opt_b 22.85% : 0.006468s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000011s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000007s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000044s : 1: pre_auto_parallel 0.13% : 0.000036s : 1: py_interpret_to_execute 0.07% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.29% : 0.000365s : 1: renormalize.infer 1.10% : 0.000312s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000049s : 1: rewriter_after_opt_a 0.36% : 0.000101s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000109s : 1: symbol_engine_optimizer 0.38% : 0.000109s : 1: tuple_transform 20.11% : 0.005691s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:11.368.279 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0163541, [21] [bootstrap]: 0.00047936 [type_inference]: 0.0060921 [event_method]: 2.301e-05 [auto_monad]: 6.694e-05 [graph_reusing]: 5.67999e-06 [inline]: 2.13998e-06 [add_attr]: 0.00323563, [1] [add_attr_with_inline]: 0.00322625, [1] [Cycle 1]: 6.241e-05, [2] [tag_attr]: 2.195e-05 [meta_addattr_fg_expand]: 6.73e-06 [parallel-infer-symbol]: 3.16001e-06 [pre_auto_parallel]: 3.719e-05 [insert-virtual-dataset]: 2.41e-06 [parallel-infer-symbol-second]: 8.39995e-07 [dataset_repeat_opt]: 2.06998e-06 [pipeline_split]: 1.95001e-06 [optimize]: 0.00566662, [53] [py_interpret_to_execute]: 2.952e-05 [rewriter_before_opt_a]: 9.981e-05 [opt_a]: 0.00342023, [2] [Cycle 1]: 0.00256598, [45] [expand_dump_flag]: 2.89001e-06 [switch_simplify]: 5.089e-05 [loop_unroll]: 3.731e-05 [a_1]: 0.00090881 [with_stream_mark]: 1.668e-05 [recompute_prepare]: 1.453e-05 [updatestate_depend_eliminate]: 5.02e-06 [updatestate_assign_eliminate]: 4.17e-06 [updatestate_loads_eliminate]: 3.81999e-06 [parameter_eliminate]: 1.67999e-06 [a_2]: 0.00011284 [accelerated_algorithm]: 9.42999e-06 [shard]: 1.71002e-06 [meta_shard_fg_expand]: 2.54999e-06 [shard_inline]: 8.55001e-06 [merge_send_recv]: 1.044e-05 [auto_parallel]: 7e-06 [parallel]: 2.1e-05 [flash_sp]: 9.52999e-06 [merge_comm]: 5.53002e-06 [allreduce_fusion]: 4.52998e-06 [matmul_add_comm_reduction]: 1.147e-05 [allreduce_slice_to_reducescatter]: 8.09989e-07 [virtual_shard_identity]: 1.048e-05 [virtual_dataset]: 8.59e-06 [get_grad_eliminate_]: 8.89998e-06 [virtual_output]: 8.54998e-06 [merge_forward]: 4.80999e-06 [cell_reuse_recompute_pass]: 1.25001e-06 [offload_activation]: 1.08e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.659e-05 [merge_recompute_call_nodes]: 1.40999e-06 [before_grad]: 1.409e-05 [set_forward_comm_id_for_comm_node_pass]: 4.79e-06 [meta_fg_expand]: 4.23999e-06 [flash_sp_send_recv_attached]: 2.89001e-06 [receive_attached]: 2.24001e-06 [after_resolve]: 1.578e-05 [a_after_grad]: 1.324e-05 [renormalize]: 0.00080277 [add_forward_monad_depend]: 6.73998e-06 [auto_monad_grad]: 2.50002e-06 [auto_monad_eliminator]: 1.819e-05 [cse]: 3.834e-05 [a_3]: 6.496e-05 [Cycle 2]: 0.00084337, [45] [expand_dump_flag]: 1.56002e-06 [switch_simplify]: 1.068e-05 [loop_unroll]: 8.48001e-06 [a_1]: 0.00020725 [with_stream_mark]: 1.339e-05 [recompute_prepare]: 8.52e-06 [updatestate_depend_eliminate]: 3.83999e-06 [updatestate_assign_eliminate]: 3.06999e-06 [updatestate_loads_eliminate]: 3.19001e-06 [parameter_eliminate]: 1.07e-06 [a_2]: 0.00010421 [accelerated_algorithm]: 8.34998e-06 [shard]: 1.47999e-06 [meta_shard_fg_expand]: 1.87001e-06 [shard_inline]: 8.25e-06 [merge_send_recv]: 6.60997e-06 [auto_parallel]: 7.28999e-06 [parallel]: 5.03002e-06 [flash_sp]: 3.62002e-06 [merge_comm]: 4.31002e-06 [allreduce_fusion]: 4e-06 [matmul_add_comm_reduction]: 7.08e-06 [allreduce_slice_to_reducescatter]: 5.3001e-07 [virtual_shard_identity]: 9.62001e-06 [virtual_dataset]: 8.37998e-06 [get_grad_eliminate_]: 7.74002e-06 [virtual_output]: 8.34998e-06 [merge_forward]: 3.46001e-06 [cell_reuse_recompute_pass]: 1.70001e-06 [offload_activation]: 7.98001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.529e-05 [merge_recompute_call_nodes]: 1.56998e-06 [before_grad]: 1.282e-05 [set_forward_comm_id_for_comm_node_pass]: 4.47e-06 [meta_fg_expand]: 2.86999e-06 [flash_sp_send_recv_attached]: 8.10018e-07 [receive_attached]: 1.29e-06 [after_resolve]: 1.557e-05 [a_after_grad]: 1.318e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.67999e-06 [auto_monad_grad]: 8.80013e-07 [auto_monad_eliminator]: 9.34e-06 [cse]: 2.249e-05 [a_3]: 5.565e-05 [py_interpret_to_execute_after_opt_a]: 1.293e-05 [slice_cell_reuse_recomputed_activation]: 2.24001e-06 [rewriter_after_opt_a]: 4.505e-05 [convert_after_rewriter]: 8.57998e-06 [order_py_execute_after_rewriter]: 5.79999e-06 [mutable_eliminate]: 0.00053331 [opt_b]: 0.00028023, [1] [Cycle 1]: 0.0002735, [7] [b_1]: 0.00018663 [b_2]: 1.135e-05 [updatestate_depend_eliminate]: 7.25998e-06 [updatestate_assign_eliminate]: 3.09999e-06 [updatestate_loads_eliminate]: 3.48e-06 [renormalize]: 5.69999e-07 [cse]: 2.445e-05 [optimize_parallel_all_gather_comm]: 1.873e-05 [overlap_param_gather]: 1.97001e-06 [cconv]: 2.751e-05 [loop_unroll]: 0.00043386 [opt_after_cconv]: 0.00012308, [1] [Cycle 1]: 0.00011746, [7] [c_1]: 4.233e-05 [parameter_eliminate]: 3.28998e-06 [updatestate_depend_eliminate]: 7.11999e-06 [updatestate_assign_eliminate]: 3.56999e-06 [updatestate_loads_eliminate]: 3.04001e-06 [cse]: 2.285e-05 [renormalize]: 7.30011e-07 [remove_dup_value]: 1.603e-05 [tuple_transform]: 9.354e-05, [1] [Cycle 1]: 8.915e-05, [4] [d_1]: 6.002e-05 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 8.77999e-06 [partial_unused_args_eliminate]: 2.16e-06 [add_recomputation]: 6.159e-05 [cse_after_recomputation]: 2.785e-05, [1] [Cycle 1]: 2.319e-05, [1] [cse]: 1.741e-05 [environ_conv]: 6.82002e-06 [swap_dp_allreduce_reducescatter]: 5.76998e-06 [bias_add_comm_swap]: 2.55002e-06 [label_micro_interleaved_index]: 4.33999e-06 [label_fine_grained_interleaved_index]: 3.16999e-06 [merge_cast_opt]: 1.28002e-06 [slice_recompute_activation]: 2.22999e-06 [micro_interleaved_order_control]: 2.04e-06 [assign_add_opt]: 1.49e-06 [ForceFp32Comm]: 8.10018e-07 [remove_cast_before_assign_add]: 1.12999e-06 [full_micro_interleaved_order_control]: 2.15002e-06 [reorder_send_recv_between_fp_bp]: 2.66e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 1.08001e-06 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.07998e-06 [overlap_opt_shard_in_pipeline]: 1.30999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.03997e-06 [control_data_broadcast_order]: 1.569e-05 [grouped_pairwise_exchange_alltoall]: 1.58002e-06 [offloading_packed_experts]: 5.30001e-06 [overlap_recompute_and_grad_model_parallel]: 5.25999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.52999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.77999e-06 [overlap_recompute_comm]: 3.06999e-06 [overlap_grad_ring_attention]: 4.63001e-06 [overlap_grad_flash_sp]: 2.173e-05 [begin_end_overlap_inline]: 5.8001e-07 [split_matmul_comm_elemetwise]: 2.64001e-06 [split_layernorm_comm]: 1.82001e-06 [handle_group_info]: 9.30013e-07 [symbol_engine_optimizer]: 8.762e-05, [1] [Cycle 1]: 8.336e-05, [6] [build]: 3.3e-06 [elim_shapecalc]: 1.165e-05 [elim_not_effective]: 1.601e-05 [opt_reshape]: 9.31e-06 [fold_const_symbol]: 1.337e-05 [renormalize]: 7.30011e-07 [detach_backward]: 2.21998e-06 [pipeline_parallel_scheduler]: 1.55999e-06 [auto_monad_reorder]: 2.149e-05 [get_jit_bprop_graph]: 1.72999e-06 [rewriter_after_jit_bprop_graph]: 4.50001e-06 [opt_after_jit_grad]: 0.00050658 [validate]: 4.434e-05 Sums bootstrap : 0.000479s : 3.95% type_inference : 0.006092s : 50.17% event_method : 0.000023s : 0.19% auto_monad : 0.000067s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000037s : 0.31% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000030s : 0.24% optimize.rewriter_before_opt_a : 0.000100s : 0.82% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000062s : 0.51% optimize.opt_a.loop_unroll : 0.000046s : 0.38% optimize.opt_a.a_1 : 0.001116s : 9.19% optimize.opt_a.with_stream_mark : 0.000030s : 0.25% optimize.opt_a.recompute_prepare : 0.000023s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000217s : 1.79% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000017s : 0.14% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000026s : 0.21% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.17% optimize.opt_a.virtual_dataset : 0.000017s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.14% optimize.opt_a.virtual_output : 0.000017s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000031s : 0.26% optimize.opt_a.a_after_grad : 0.000026s : 0.22% optimize.opt_a.renormalize : 0.000803s : 6.61% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.23% optimize.opt_a.cse : 0.000061s : 0.50% optimize.opt_a.a_3 : 0.000121s : 0.99% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000045s : 0.37% optimize.convert_after_rewriter : 0.000009s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000533s : 4.39% optimize.opt_b.b_1 : 0.000187s : 1.54% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000028s : 0.23% optimize.loop_unroll : 0.000434s : 3.57% optimize.opt_after_cconv.c_1 : 0.000042s : 0.35% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.19% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000016s : 0.13% optimize.tuple_transform.d_1 : 0.000060s : 0.49% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000062s : 0.51% optimize.cse_after_recomputation.cse : 0.000017s : 0.14% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.03% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000022s : 0.18% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.01% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.18% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000507s : 4.17% validate : 0.000044s : 0.37% Time group info: ------[substitution.] 0.000240 44 10.24% : 0.000025s : 3: substitution.cast_eliminate 0.88% : 0.000002s : 3: substitution.elim_not_effective 0.86% : 0.000002s : 3: substitution.fold_const_symbol 3.17% : 0.000008s : 6: substitution.graph_param_transform 67.10% : 0.000161s : 4: substitution.inline 2.08% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.67% : 0.000006s : 6: substitution.remove_not_recompute_node 2.54% : 0.000006s : 6: substitution.replace_old_param 7.91% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.55% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006025 2 86.95% : 0.005239s : 1: type_inference.infer 13.05% : 0.000786s : 1: type_inference.specialize ------[replace.] 0.000075 10 50.98% : 0.000038s : 4: replace.inline 49.02% : 0.000037s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000174 10 90.66% : 0.000158s : 4: match.inline 9.34% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000303 1954 1.10% : 0.000003s : 21: predicate.accumulaten_eliminater 0.60% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.53% : 0.000002s : 12: predicate.addn_check_dump 1.02% : 0.000003s : 21: predicate.addn_zero_filter 0.89% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.05% : 0.000006s : 33: predicate.arithmetic_simplify 1.07% : 0.000003s : 21: predicate.cast_eliminate 0.63% : 0.000002s : 12: predicate.check_bprop_eliminate 0.49% : 0.000001s : 12: predicate.compare_switch_simplify 0.21% : 0.000001s : 6: predicate.const_output_eliminate 0.62% : 0.000002s : 12: predicate.depend_value_elim 1.06% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.12% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.99% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.79% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 6: predicate.elim_not_effective 0.33% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.15% : 0.000003s : 27: predicate.environ_get_add_eliminate 1.22% : 0.000004s : 27: predicate.environ_get_depend_swap 1.75% : 0.000005s : 39: predicate.environ_get_eliminate 1.20% : 0.000004s : 27: predicate.environ_get_set_eliminate 1.49% : 0.000005s : 31: predicate.exchange_switch_depend_value 2.31% : 0.000007s : 31: predicate.float_depend_g_call 0.52% : 0.000002s : 12: predicate.float_environ_get_switch 0.77% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.67% : 0.000002s : 12: predicate.get_grad_eliminate 0.20% : 0.000001s : 6: predicate.graph_param_transform 0.59% : 0.000002s : 12: predicate.incorporate_call 0.50% : 0.000002s : 12: predicate.incorporate_call_switch 6.26% : 0.000019s : 88: predicate.inline 0.76% : 0.000002s : 12: predicate.inline_without_move 0.32% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.72% : 0.000002s : 12: predicate.less_batch_normalization 1.97% : 0.000006s : 39: predicate.list_to_tuple_eliminator_ 2.67% : 0.000008s : 60: predicate.load_eliminater 0.74% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.04% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.62% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.61% : 0.000002s : 12: predicate.merge_addn 0.57% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.61% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.91% : 0.000003s : 21: predicate.minmaximum_grad 1.09% : 0.000003s : 6: predicate.mutable_eliminate 0.34% : 0.000001s : 6: predicate.opt_reshape 0.33% : 0.000001s : 6: predicate.parallel_virtual_node 1.84% : 0.000006s : 31: predicate.partial_defer_inline 1.80% : 0.000005s : 33: predicate.partial_eliminate 0.95% : 0.000003s : 21: predicate.print_const_string_wrapper 0.58% : 0.000002s : 12: predicate.reduce_all_const_elim 1.26% : 0.000004s : 21: predicate.reduce_eliminate 2.66% : 0.000008s : 60: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 12: predicate.remove_not_recompute_node 1.39% : 0.000004s : 39: predicate.replace_applicator 0.48% : 0.000001s : 12: predicate.replace_old_param 0.29% : 0.000001s : 6: predicate.reset_defer_inline 1.02% : 0.000003s : 21: predicate.reshape_eliminate 0.62% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 6: predicate.row_tensor_eliminate 0.72% : 0.000002s : 12: predicate.same_eliminate 0.55% : 0.000002s : 12: predicate.set_cell_output_no_recompute 0.77% : 0.000002s : 12: predicate.shard_identity_eliminate 0.59% : 0.000002s : 12: predicate.special_op_eliminate 0.65% : 0.000002s : 12: predicate.specialize_transform 0.72% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.58% : 0.000005s : 31: predicate.switch_defer_inline 2.10% : 0.000006s : 43: predicate.switch_layer_defer_inline 4.73% : 0.000014s : 91: predicate.switch_simplify 0.95% : 0.000003s : 21: predicate.tile_eliminate 0.95% : 0.000003s : 21: predicate.transpose_eliminate 1.58% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000010s : 51: predicate.tuple_list_get_item_eliminator 1.56% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.82% : 0.000006s : 39: predicate.tuple_to_list_eliminator_ 2.67% : 0.000008s : 60: predicate.updatestate_pure_node_eliminater 3.25% : 0.000010s : 72: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 6: predicate.value_based_eliminate 0.62% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 12: predicate.virtual_output_eliminate 0.27% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.56% : 0.000002s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000585 11 53.24% : 0.000312s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.76% : 0.000274s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028027 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.56% : 0.003241s : 1: add_attr 11.53% : 0.003230s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000066s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.26% : 0.000074s : 1: auto_monad 0.09% : 0.000026s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.81% : 0.000507s : 1: bootstrap 0.11% : 0.000031s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.11% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.10% : 0.000029s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.58% : 0.000442s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.94% : 0.000543s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000021s : 1: opt.transform.mutable_eliminate 6.11% : 0.001711s : 78: opt.transform.opt_a 0.15% : 0.000041s : 1: opt.transform.opt_after_cconv 0.11% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.59% : 0.000166s : 28: opt.transform.opt_b 0.24% : 0.000067s : 2: opt.transform.opt_trans_graph 0.17% : 0.000046s : 4: opt.transform.symbol_engine_opt 12.21% : 0.003423s : 1: opt_a 0.45% : 0.000127s : 1: opt_after_cconv 1.84% : 0.000516s : 1: opt_after_jit_grad 1.01% : 0.000284s : 1: opt_b 20.24% : 0.005671s : 1: optimize 0.08% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000042s : 1: pre_auto_parallel 0.12% : 0.000034s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.61% : 0.000450s : 1: renormalize.infer 1.23% : 0.000344s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000049s : 1: rewriter_after_opt_a 0.37% : 0.000104s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000090s : 1: symbol_engine_optimizer 0.34% : 0.000096s : 1: tuple_transform 21.80% : 0.006110s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:11.559.733 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:11.559.992 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0159376, [21] [bootstrap]: 0.00043868 [type_inference]: 0.00564524 [event_method]: 1.998e-05 [auto_monad]: 6.266e-05 [graph_reusing]: 5.77999e-06 [inline]: 2.42001e-06 [add_attr]: 0.0030085, [1] [add_attr_with_inline]: 0.00300021, [1] [Cycle 1]: 6.994e-05, [2] [tag_attr]: 1.969e-05 [meta_addattr_fg_expand]: 6.66e-06 [parallel-infer-symbol]: 3.09001e-06 [pre_auto_parallel]: 3.274e-05 [insert-virtual-dataset]: 2.76e-06 [parallel-infer-symbol-second]: 6.50005e-07 [dataset_repeat_opt]: 2.11998e-06 [pipeline_split]: 1.65001e-06 [optimize]: 0.00559663, [53] [py_interpret_to_execute]: 3.287e-05 [rewriter_before_opt_a]: 9.135e-05 [opt_a]: 0.00324335, [2] [Cycle 1]: 0.00236311, [45] [expand_dump_flag]: 3.16999e-06 [switch_simplify]: 4.605e-05 [loop_unroll]: 3.495e-05 [a_1]: 0.00072938 [with_stream_mark]: 1.714e-05 [recompute_prepare]: 1.055e-05 [updatestate_depend_eliminate]: 4.30999e-06 [updatestate_assign_eliminate]: 3.57002e-06 [updatestate_loads_eliminate]: 3.01001e-06 [parameter_eliminate]: 1.96e-06 [a_2]: 0.00011885 [accelerated_algorithm]: 7.92003e-06 [shard]: 2.09999e-06 [meta_shard_fg_expand]: 1.98997e-06 [shard_inline]: 7.51001e-06 [merge_send_recv]: 7.96001e-06 [auto_parallel]: 6.81001e-06 [parallel]: 1.978e-05 [flash_sp]: 8.05e-06 [merge_comm]: 4.13999e-06 [allreduce_fusion]: 3.66999e-06 [matmul_add_comm_reduction]: 9.84001e-06 [allreduce_slice_to_reducescatter]: 5.8001e-07 [virtual_shard_identity]: 9.02e-06 [virtual_dataset]: 7.58001e-06 [get_grad_eliminate_]: 6.86001e-06 [virtual_output]: 7.05e-06 [merge_forward]: 3.86001e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 9.71998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.589e-05 [merge_recompute_call_nodes]: 1.57999e-06 [before_grad]: 1.077e-05 [set_forward_comm_id_for_comm_node_pass]: 3.85998e-06 [meta_fg_expand]: 2.88e-06 [flash_sp_send_recv_attached]: 2.69999e-06 [receive_attached]: 2.91e-06 [after_resolve]: 1.376e-05 [a_after_grad]: 1.126e-05 [renormalize]: 0.00067867 [add_forward_monad_depend]: 5.17999e-06 [auto_monad_grad]: 2.54999e-06 [auto_monad_eliminator]: 1.432e-05 [cse]: 3.051e-05 [a_3]: 6.449e-05 [Cycle 2]: 0.00086699, [45] [expand_dump_flag]: 1.33002e-06 [switch_simplify]: 9.07001e-06 [loop_unroll]: 7.13e-06 [a_1]: 0.00015755 [with_stream_mark]: 1.163e-05 [recompute_prepare]: 7.3e-06 [updatestate_depend_eliminate]: 3.2e-06 [updatestate_assign_eliminate]: 3.06999e-06 [updatestate_loads_eliminate]: 2.49001e-06 [parameter_eliminate]: 1.22e-06 [a_2]: 0.00010762 [accelerated_algorithm]: 6.88e-06 [shard]: 1.22e-06 [meta_shard_fg_expand]: 1.34e-06 [shard_inline]: 6.68e-06 [merge_send_recv]: 4.89e-06 [auto_parallel]: 5.39e-06 [parallel]: 4.58001e-06 [flash_sp]: 3.5e-06 [merge_comm]: 3.40998e-06 [allreduce_fusion]: 3.35003e-06 [matmul_add_comm_reduction]: 5.47001e-06 [allreduce_slice_to_reducescatter]: 5.09986e-07 [virtual_shard_identity]: 7.42002e-06 [virtual_dataset]: 6.85998e-06 [get_grad_eliminate_]: 9.46e-06 [virtual_output]: 6.47001e-06 [merge_forward]: 2.67001e-06 [cell_reuse_recompute_pass]: 1.94999e-06 [offload_activation]: 6.23e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.525e-05 [merge_recompute_call_nodes]: 8.79983e-07 [before_grad]: 9.99999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.45e-06 [meta_fg_expand]: 2.14e-06 [flash_sp_send_recv_attached]: 8.50006e-07 [receive_attached]: 1.02998e-06 [after_resolve]: 1.321e-05 [a_after_grad]: 1.109e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.27999e-06 [auto_monad_grad]: 1.06002e-06 [auto_monad_eliminator]: 7.65e-06 [cse]: 1.471e-05 [a_3]: 5.36e-05 [py_interpret_to_execute_after_opt_a]: 1.437e-05 [slice_cell_reuse_recomputed_activation]: 4.70999e-06 [rewriter_after_opt_a]: 3.847e-05 [convert_after_rewriter]: 1.004e-05 [order_py_execute_after_rewriter]: 7.82e-06 [mutable_eliminate]: 0.00052045 [opt_b]: 0.00029752, [1] [Cycle 1]: 0.00028812, [7] [b_1]: 0.00019169 [b_2]: 8.99e-06 [updatestate_depend_eliminate]: 5.58002e-06 [updatestate_assign_eliminate]: 2.44999e-06 [updatestate_loads_eliminate]: 2.83e-06 [renormalize]: 7.2e-07 [cse]: 1.802e-05 [optimize_parallel_all_gather_comm]: 1.859e-05 [overlap_param_gather]: 4.77e-06 [cconv]: 2.672e-05 [loop_unroll]: 0.00043205 [opt_after_cconv]: 0.00013284, [1] [Cycle 1]: 0.0001244, [7] [c_1]: 3.615e-05 [parameter_eliminate]: 2.49001e-06 [updatestate_depend_eliminate]: 5.14e-06 [updatestate_assign_eliminate]: 2.57001e-06 [updatestate_loads_eliminate]: 2.44001e-06 [cse]: 1.821e-05 [renormalize]: 4.10015e-07 [remove_dup_value]: 1.663e-05 [tuple_transform]: 0.00011694, [1] [Cycle 1]: 0.00010975, [4] [d_1]: 4.928e-05 [none_parameter_eliminate]: 1.72999e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 8.27e-06 [partial_unused_args_eliminate]: 4.74e-06 [add_recomputation]: 5.389e-05 [cse_after_recomputation]: 3.124e-05, [1] [Cycle 1]: 2.403e-05, [1] [cse]: 1.372e-05 [environ_conv]: 8.90999e-06 [swap_dp_allreduce_reducescatter]: 8.80999e-06 [bias_add_comm_swap]: 5.86998e-06 [label_micro_interleaved_index]: 6.89999e-06 [label_fine_grained_interleaved_index]: 5.39e-06 [merge_cast_opt]: 3.91001e-06 [slice_recompute_activation]: 4.67998e-06 [micro_interleaved_order_control]: 4.89e-06 [assign_add_opt]: 3.95e-06 [ForceFp32Comm]: 3.52997e-06 [remove_cast_before_assign_add]: 3.61999e-06 [full_micro_interleaved_order_control]: 4.86002e-06 [reorder_send_recv_between_fp_bp]: 6.00002e-06 [comm_op_add_attrs]: 3.70998e-06 [add_comm_op_reuse_tag]: 3.68e-06 [interleave_split_concat_branches]: 3.91999e-06 [interleave_parallel_branches]: 4.2e-06 [overlap_opt_shard_in_pipeline]: 3.73001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.37e-06 [control_data_broadcast_order]: 1.667e-05 [grouped_pairwise_exchange_alltoall]: 4.52e-06 [offloading_packed_experts]: 6.49001e-06 [overlap_recompute_and_grad_model_parallel]: 7.13e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.48999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.8e-06 [overlap_recompute_comm]: 4.70999e-06 [overlap_grad_ring_attention]: 6.57002e-06 [overlap_grad_flash_sp]: 2.142e-05 [begin_end_overlap_inline]: 2.89001e-06 [split_matmul_comm_elemetwise]: 4.77998e-06 [split_layernorm_comm]: 3.98999e-06 [handle_group_info]: 3.27002e-06 [symbol_engine_optimizer]: 9.771e-05, [1] [Cycle 1]: 9.09e-05, [6] [build]: 2.58e-06 [elim_shapecalc]: 9.99001e-06 [elim_not_effective]: 1.391e-05 [opt_reshape]: 8.18999e-06 [fold_const_symbol]: 1.061e-05 [renormalize]: 2.49973e-07 [detach_backward]: 2.94999e-06 [pipeline_parallel_scheduler]: 1.59998e-06 [auto_monad_reorder]: 1.936e-05 [get_jit_bprop_graph]: 1.45999e-06 [rewriter_after_jit_bprop_graph]: 4.07e-06 [opt_after_jit_grad]: 0.00048384 [validate]: 3.612e-05 Sums bootstrap : 0.000439s : 3.93% type_inference : 0.005645s : 50.55% event_method : 0.000020s : 0.18% auto_monad : 0.000063s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.29% optimize.rewriter_before_opt_a : 0.000091s : 0.82% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.49% optimize.opt_a.loop_unroll : 0.000042s : 0.38% optimize.opt_a.a_1 : 0.000887s : 7.94% optimize.opt_a.with_stream_mark : 0.000029s : 0.26% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000226s : 2.03% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.13% optimize.opt_a.merge_send_recv : 0.000013s : 0.12% optimize.opt_a.auto_parallel : 0.000012s : 0.11% optimize.opt_a.parallel : 0.000024s : 0.22% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.15% optimize.opt_a.virtual_dataset : 0.000014s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.15% optimize.opt_a.virtual_output : 0.000014s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000027s : 0.24% optimize.opt_a.a_after_grad : 0.000022s : 0.20% optimize.opt_a.renormalize : 0.000679s : 6.08% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.20% optimize.opt_a.cse : 0.000045s : 0.40% optimize.opt_a.a_3 : 0.000118s : 1.06% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000038s : 0.34% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000520s : 4.66% optimize.opt_b.b_1 : 0.000192s : 1.72% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000018s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000027s : 0.24% optimize.loop_unroll : 0.000432s : 3.87% optimize.opt_after_cconv.c_1 : 0.000036s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.16% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.15% optimize.tuple_transform.d_1 : 0.000049s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000054s : 0.48% optimize.cse_after_recomputation.cse : 0.000014s : 0.12% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.04% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.04% optimize.interleave_parallel_branches : 0.000004s : 0.04% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.04% optimize.offloading_packed_experts : 0.000006s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000021s : 0.19% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.17% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000484s : 4.33% validate : 0.000036s : 0.32% Time group info: ------[substitution.] 0.000204 34 0.88% : 0.000002s : 2: substitution.elim_not_effective 0.63% : 0.000001s : 2: substitution.fold_const_symbol 2.95% : 0.000006s : 5: substitution.graph_param_transform 77.01% : 0.000157s : 4: substitution.inline 1.67% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.20% : 0.000004s : 4: substitution.remove_not_recompute_node 2.68% : 0.000005s : 6: substitution.replace_old_param 9.12% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.86% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005597 2 87.20% : 0.004880s : 1: type_inference.infer 12.80% : 0.000717s : 1: type_inference.specialize ------[replace.] 0.000068 10 53.73% : 0.000037s : 4: replace.inline 46.27% : 0.000032s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000170 10 90.60% : 0.000154s : 4: match.inline 9.40% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000234 1590 0.88% : 0.000002s : 16: predicate.accumulaten_eliminater 0.73% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 10: predicate.addn_check_dump 0.88% : 0.000002s : 16: predicate.addn_zero_filter 0.84% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 1.93% : 0.000005s : 26: predicate.arithmetic_simplify 1.02% : 0.000002s : 16: predicate.cast_eliminate 0.57% : 0.000001s : 10: predicate.check_bprop_eliminate 0.53% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.63% : 0.000001s : 10: predicate.depend_value_elim 0.98% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.12% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.89% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 5: predicate.elim_not_effective 0.35% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.08% : 0.000003s : 21: predicate.environ_get_depend_swap 1.71% : 0.000004s : 31: predicate.environ_get_eliminate 1.11% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.52% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.49% : 0.000006s : 26: predicate.float_depend_g_call 0.52% : 0.000001s : 10: predicate.float_environ_get_switch 0.76% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.61% : 0.000001s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.58% : 0.000001s : 10: predicate.incorporate_call 0.47% : 0.000001s : 10: predicate.incorporate_call_switch 6.17% : 0.000014s : 72: predicate.inline 0.77% : 0.000002s : 10: predicate.inline_without_move 0.35% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.74% : 0.000002s : 10: predicate.less_batch_normalization 1.98% : 0.000005s : 32: predicate.list_to_tuple_eliminator_ 2.69% : 0.000006s : 48: predicate.load_eliminater 0.82% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.49% : 0.000006s : 40: predicate.loop_unroll_before_grad 1.70% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 10: predicate.merge_addn 0.61% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 16: predicate.minmaximum_grad 0.85% : 0.000002s : 5: predicate.mutable_eliminate 0.38% : 0.000001s : 5: predicate.opt_reshape 0.46% : 0.000001s : 5: predicate.parallel_virtual_node 1.92% : 0.000005s : 26: predicate.partial_defer_inline 1.80% : 0.000004s : 27: predicate.partial_eliminate 0.89% : 0.000002s : 16: predicate.print_const_string_wrapper 0.54% : 0.000001s : 10: predicate.reduce_all_const_elim 1.25% : 0.000003s : 16: predicate.reduce_eliminate 2.67% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 10: predicate.remove_not_recompute_node 1.44% : 0.000003s : 32: predicate.replace_applicator 0.53% : 0.000001s : 10: predicate.replace_old_param 0.26% : 0.000001s : 5: predicate.reset_defer_inline 0.93% : 0.000002s : 16: predicate.reshape_eliminate 0.56% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 5: predicate.row_tensor_eliminate 0.79% : 0.000002s : 10: predicate.same_eliminate 0.52% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 10: predicate.shard_identity_eliminate 0.66% : 0.000002s : 10: predicate.special_op_eliminate 0.69% : 0.000002s : 10: predicate.specialize_transform 0.83% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.63% : 0.000004s : 26: predicate.switch_defer_inline 2.15% : 0.000005s : 36: predicate.switch_layer_defer_inline 5.21% : 0.000012s : 81: predicate.switch_simplify 0.88% : 0.000002s : 16: predicate.tile_eliminate 0.87% : 0.000002s : 16: predicate.transpose_eliminate 1.41% : 0.000003s : 26: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 26: predicate.tuple_list_get_item_depend_reorder 3.45% : 0.000008s : 42: predicate.tuple_list_get_item_eliminator 1.46% : 0.000003s : 26: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.88% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.57% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.16% : 0.000007s : 58: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 5: predicate.value_based_eliminate 0.63% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000531 11 54.18% : 0.000288s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.82% : 0.000243s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026777 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.27% : 0.003017s : 1: add_attr 11.22% : 0.003004s : 1: add_attr_with_inline 0.02% : 0.000007s : 1: add_comm_op_reuse_tag 0.22% : 0.000058s : 1: add_recomputation 0.03% : 0.000007s : 1: assign_add_opt 0.27% : 0.000071s : 1: auto_monad 0.10% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.81% : 0.000484s : 1: bootstrap 0.11% : 0.000030s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.13% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000017s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000030s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.03% : 0.000007s : 1: interleave_parallel_branches 0.03% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.63% : 0.000438s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.97% : 0.000526s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.05% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 5.11% : 0.001369s : 78: opt.transform.opt_a 0.13% : 0.000035s : 1: opt.transform.opt_after_cconv 0.10% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.48% : 0.000128s : 28: opt.transform.opt_b 0.21% : 0.000055s : 2: opt.transform.opt_trans_graph 0.15% : 0.000039s : 4: opt.transform.symbol_engine_opt 12.12% : 0.003247s : 1: opt_a 0.51% : 0.000136s : 1: opt_after_cconv 1.84% : 0.000494s : 1: opt_after_jit_grad 1.12% : 0.000301s : 1: opt_b 22.13% : 0.005925s : 1: optimize 0.08% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000040s : 1: pre_auto_parallel 0.14% : 0.000037s : 1: py_interpret_to_execute 0.07% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.41% : 0.000378s : 1: renormalize.infer 1.09% : 0.000292s : 1: renormalize.specialize 0.03% : 0.000009s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000042s : 1: rewriter_after_opt_a 0.36% : 0.000095s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.05% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000101s : 1: symbol_engine_optimizer 0.45% : 0.000120s : 1: tuple_transform 21.21% : 0.005680s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:11.752.768 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0148786, [21] [bootstrap]: 0.00043214 [type_inference]: 0.00573407 [event_method]: 2.068e-05 [auto_monad]: 5.983e-05 [graph_reusing]: 5.61e-06 [inline]: 2.12999e-06 [add_attr]: 0.00304542, [1] [add_attr_with_inline]: 0.00303736, [1] [Cycle 1]: 5.816e-05, [2] [tag_attr]: 2.005e-05 [meta_addattr_fg_expand]: 6.44001e-06 [parallel-infer-symbol]: 3.28e-06 [pre_auto_parallel]: 3.352e-05 [insert-virtual-dataset]: 2.46e-06 [parallel-infer-symbol-second]: 7.79983e-07 [dataset_repeat_opt]: 1.91e-06 [pipeline_split]: 1.71002e-06 [optimize]: 0.00486817, [53] [py_interpret_to_execute]: 3.407e-05 [rewriter_before_opt_a]: 8.883e-05 [opt_a]: 0.00279107, [2] [Cycle 1]: 0.00207634, [45] [expand_dump_flag]: 2.94999e-06 [switch_simplify]: 4.686e-05 [loop_unroll]: 3.52e-05 [a_1]: 0.00071629 [with_stream_mark]: 1.516e-05 [recompute_prepare]: 9.90002e-06 [updatestate_depend_eliminate]: 3.78001e-06 [updatestate_assign_eliminate]: 3.18998e-06 [updatestate_loads_eliminate]: 2.78998e-06 [parameter_eliminate]: 2.24999e-06 [a_2]: 8.977e-05 [accelerated_algorithm]: 7.73999e-06 [shard]: 1.77001e-06 [meta_shard_fg_expand]: 2.06003e-06 [shard_inline]: 7.33999e-06 [merge_send_recv]: 8.48999e-06 [auto_parallel]: 6.12999e-06 [parallel]: 1.725e-05 [flash_sp]: 7.5e-06 [merge_comm]: 4.25e-06 [allreduce_fusion]: 3.53999e-06 [matmul_add_comm_reduction]: 9.23002e-06 [allreduce_slice_to_reducescatter]: 6.29982e-07 [virtual_shard_identity]: 9.12001e-06 [virtual_dataset]: 7.29001e-06 [get_grad_eliminate_]: 7.33e-06 [virtual_output]: 7.06001e-06 [merge_forward]: 3.6e-06 [cell_reuse_recompute_pass]: 1.19998e-06 [offload_activation]: 1.016e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.277e-05 [merge_recompute_call_nodes]: 1.69e-06 [before_grad]: 1.053e-05 [set_forward_comm_id_for_comm_node_pass]: 3.58e-06 [meta_fg_expand]: 3.08e-06 [flash_sp_send_recv_attached]: 2.34999e-06 [receive_attached]: 2.09999e-06 [after_resolve]: 1.362e-05 [a_after_grad]: 1.094e-05 [renormalize]: 0.00060957 [add_forward_monad_depend]: 5.59e-06 [auto_monad_grad]: 2.12001e-06 [auto_monad_eliminator]: 1.603e-05 [cse]: 3.336e-05 [a_3]: 5.243e-05 [Cycle 2]: 0.00070386, [45] [expand_dump_flag]: 1.26002e-06 [switch_simplify]: 8.86997e-06 [loop_unroll]: 6.83003e-06 [a_1]: 0.00015882 [with_stream_mark]: 1.096e-05 [recompute_prepare]: 7.06001e-06 [updatestate_depend_eliminate]: 2.96999e-06 [updatestate_assign_eliminate]: 2.79999e-06 [updatestate_loads_eliminate]: 2.36e-06 [parameter_eliminate]: 1.18001e-06 [a_2]: 8.233e-05 [accelerated_algorithm]: 6.93e-06 [shard]: 1.52001e-06 [meta_shard_fg_expand]: 1.71e-06 [shard_inline]: 6.64999e-06 [merge_send_recv]: 5.64998e-06 [auto_parallel]: 5.76e-06 [parallel]: 5.10001e-06 [flash_sp]: 3.44001e-06 [merge_comm]: 3.57002e-06 [allreduce_fusion]: 3.31001e-06 [matmul_add_comm_reduction]: 5.94e-06 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 7.98999e-06 [virtual_dataset]: 6.74999e-06 [get_grad_eliminate_]: 6.64001e-06 [virtual_output]: 6.26e-06 [merge_forward]: 2.71e-06 [cell_reuse_recompute_pass]: 1.36002e-06 [offload_activation]: 6.47001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.372e-05 [merge_recompute_call_nodes]: 9.29984e-07 [before_grad]: 9.85002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.28998e-06 [meta_fg_expand]: 1.96e-06 [flash_sp_send_recv_attached]: 8.30012e-07 [receive_attached]: 1.09e-06 [after_resolve]: 1.192e-05 [a_after_grad]: 1.012e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.44998e-06 [auto_monad_grad]: 8.59989e-07 [auto_monad_eliminator]: 7.55998e-06 [cse]: 1.463e-05 [a_3]: 4.165e-05 [py_interpret_to_execute_after_opt_a]: 1.03e-05 [slice_cell_reuse_recomputed_activation]: 1.79998e-06 [rewriter_after_opt_a]: 3.454e-05 [convert_after_rewriter]: 6.61e-06 [order_py_execute_after_rewriter]: 5.39998e-06 [mutable_eliminate]: 0.00054462 [opt_b]: 0.00022791, [1] [Cycle 1]: 0.00022127, [7] [b_1]: 0.0001463 [b_2]: 8.50999e-06 [updatestate_depend_eliminate]: 5.92999e-06 [updatestate_assign_eliminate]: 2.41998e-06 [updatestate_loads_eliminate]: 2.74999e-06 [renormalize]: 4.39992e-07 [cse]: 1.916e-05 [optimize_parallel_all_gather_comm]: 1.562e-05 [overlap_param_gather]: 2.14e-06 [cconv]: 2.492e-05 [loop_unroll]: 0.00041553 [opt_after_cconv]: 0.00010628, [1] [Cycle 1]: 0.00010082, [7] [c_1]: 3.581e-05 [parameter_eliminate]: 3.27002e-06 [updatestate_depend_eliminate]: 5.05001e-06 [updatestate_assign_eliminate]: 2.56e-06 [updatestate_loads_eliminate]: 2.21998e-06 [cse]: 1.71e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.402e-05 [tuple_transform]: 8.325e-05, [1] [Cycle 1]: 7.851e-05, [4] [d_1]: 5.055e-05 [none_parameter_eliminate]: 1.60999e-06 [renormalize]: 2.80008e-07 [switch_simplify]: 7.52002e-06 [partial_unused_args_eliminate]: 1.91e-06 [add_recomputation]: 4.688e-05 [cse_after_recomputation]: 2.115e-05, [1] [Cycle 1]: 1.676e-05, [1] [cse]: 1.114e-05 [environ_conv]: 4.79002e-06 [swap_dp_allreduce_reducescatter]: 5.49e-06 [bias_add_comm_swap]: 2.83e-06 [label_micro_interleaved_index]: 4.25999e-06 [label_fine_grained_interleaved_index]: 2.56e-06 [merge_cast_opt]: 1.24e-06 [slice_recompute_activation]: 1.97001e-06 [micro_interleaved_order_control]: 2.50002e-06 [assign_add_opt]: 1.20999e-06 [ForceFp32Comm]: 7.89994e-07 [remove_cast_before_assign_add]: 1.13001e-06 [full_micro_interleaved_order_control]: 2.39999e-06 [reorder_send_recv_between_fp_bp]: 2.86999e-06 [comm_op_add_attrs]: 1.22e-06 [add_comm_op_reuse_tag]: 1.20999e-06 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.15001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.72001e-06 [control_data_broadcast_order]: 1.219e-05 [grouped_pairwise_exchange_alltoall]: 1.47001e-06 [offloading_packed_experts]: 4.16001e-06 [overlap_recompute_and_grad_model_parallel]: 4.27e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 1.97001e-06 [overlap_grad_ring_attention]: 4.32e-06 [overlap_grad_flash_sp]: 1.908e-05 [begin_end_overlap_inline]: 5.09986e-07 [split_matmul_comm_elemetwise]: 2.16e-06 [split_layernorm_comm]: 1.96e-06 [handle_group_info]: 1.62999e-06 [symbol_engine_optimizer]: 7.793e-05, [1] [Cycle 1]: 7.359e-05, [6] [build]: 3.45e-06 [elim_shapecalc]: 1.057e-05 [elim_not_effective]: 1.279e-05 [opt_reshape]: 7.85e-06 [fold_const_symbol]: 1.089e-05 [renormalize]: 2.00002e-07 [detach_backward]: 2.06998e-06 [pipeline_parallel_scheduler]: 1.67999e-06 [auto_monad_reorder]: 1.711e-05 [get_jit_bprop_graph]: 1.35999e-06 [rewriter_after_jit_bprop_graph]: 3.8e-06 [opt_after_jit_grad]: 0.00045977 [validate]: 3.783e-05 Sums bootstrap : 0.000432s : 3.97% type_inference : 0.005734s : 52.68% event_method : 0.000021s : 0.19% auto_monad : 0.000060s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.31% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000034s : 0.31% optimize.rewriter_before_opt_a : 0.000089s : 0.82% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000056s : 0.51% optimize.opt_a.loop_unroll : 0.000042s : 0.39% optimize.opt_a.a_1 : 0.000875s : 8.04% optimize.opt_a.with_stream_mark : 0.000026s : 0.24% optimize.opt_a.recompute_prepare : 0.000017s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000172s : 1.58% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.13% optimize.opt_a.auto_parallel : 0.000012s : 0.11% optimize.opt_a.parallel : 0.000022s : 0.21% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.16% optimize.opt_a.virtual_dataset : 0.000014s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000013s : 0.12% optimize.opt_a.merge_forward : 0.000006s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.24% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.23% optimize.opt_a.a_after_grad : 0.000021s : 0.19% optimize.opt_a.renormalize : 0.000610s : 5.60% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.22% optimize.opt_a.cse : 0.000048s : 0.44% optimize.opt_a.a_3 : 0.000094s : 0.86% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000035s : 0.32% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000545s : 5.00% optimize.opt_b.b_1 : 0.000146s : 1.34% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000019s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.14% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000025s : 0.23% optimize.loop_unroll : 0.000416s : 3.82% optimize.opt_after_cconv.c_1 : 0.000036s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.16% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.13% optimize.tuple_transform.d_1 : 0.000051s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000047s : 0.43% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000005s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000019s : 0.18% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000002s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000017s : 0.16% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000460s : 4.22% validate : 0.000038s : 0.35% Time group info: ------[substitution.] 0.000188 34 0.91% : 0.000002s : 2: substitution.elim_not_effective 0.74% : 0.000001s : 2: substitution.fold_const_symbol 3.42% : 0.000006s : 5: substitution.graph_param_transform 75.45% : 0.000142s : 4: substitution.inline 1.63% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.52% : 0.000005s : 4: substitution.remove_not_recompute_node 2.89% : 0.000005s : 6: substitution.replace_old_param 9.58% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.87% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005672 2 87.33% : 0.004953s : 1: type_inference.infer 12.67% : 0.000719s : 1: type_inference.specialize ------[replace.] 0.000070 10 54.35% : 0.000038s : 4: replace.inline 45.65% : 0.000032s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000155 10 90.08% : 0.000140s : 4: match.inline 9.92% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000229 1590 0.97% : 0.000002s : 16: predicate.accumulaten_eliminater 0.75% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 10: predicate.addn_check_dump 0.96% : 0.000002s : 16: predicate.addn_zero_filter 0.84% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 1.96% : 0.000004s : 26: predicate.arithmetic_simplify 1.08% : 0.000002s : 16: predicate.cast_eliminate 0.60% : 0.000001s : 10: predicate.check_bprop_eliminate 0.53% : 0.000001s : 10: predicate.compare_switch_simplify 0.21% : 0.000000s : 5: predicate.const_output_eliminate 0.60% : 0.000001s : 10: predicate.depend_value_elim 0.95% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.15% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.88% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 5: predicate.elim_not_effective 0.37% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 21: predicate.environ_get_depend_swap 1.65% : 0.000004s : 31: predicate.environ_get_eliminate 1.13% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.56% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.44% : 0.000006s : 26: predicate.float_depend_g_call 0.50% : 0.000001s : 10: predicate.float_environ_get_switch 0.79% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.62% : 0.000001s : 10: predicate.get_grad_eliminate 0.24% : 0.000001s : 5: predicate.graph_param_transform 0.60% : 0.000001s : 10: predicate.incorporate_call 0.49% : 0.000001s : 10: predicate.incorporate_call_switch 6.30% : 0.000014s : 72: predicate.inline 0.72% : 0.000002s : 10: predicate.inline_without_move 0.35% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.76% : 0.000002s : 10: predicate.less_batch_normalization 2.00% : 0.000005s : 32: predicate.list_to_tuple_eliminator_ 2.63% : 0.000006s : 48: predicate.load_eliminater 0.77% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.40% : 0.000005s : 40: predicate.loop_unroll_before_grad 1.64% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 10: predicate.merge_addn 0.56% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 16: predicate.minmaximum_grad 0.80% : 0.000002s : 5: predicate.mutable_eliminate 0.42% : 0.000001s : 5: predicate.opt_reshape 0.33% : 0.000001s : 5: predicate.parallel_virtual_node 1.85% : 0.000004s : 26: predicate.partial_defer_inline 1.85% : 0.000004s : 27: predicate.partial_eliminate 0.91% : 0.000002s : 16: predicate.print_const_string_wrapper 0.55% : 0.000001s : 10: predicate.reduce_all_const_elim 1.13% : 0.000003s : 16: predicate.reduce_eliminate 2.66% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 10: predicate.remove_not_recompute_node 1.57% : 0.000004s : 32: predicate.replace_applicator 0.56% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 0.97% : 0.000002s : 16: predicate.reshape_eliminate 0.62% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 5: predicate.row_tensor_eliminate 0.67% : 0.000002s : 10: predicate.same_eliminate 0.58% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 10: predicate.shard_identity_eliminate 0.69% : 0.000002s : 10: predicate.special_op_eliminate 0.68% : 0.000002s : 10: predicate.specialize_transform 0.75% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.73% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.71% : 0.000004s : 26: predicate.switch_defer_inline 2.15% : 0.000005s : 36: predicate.switch_layer_defer_inline 5.19% : 0.000012s : 81: predicate.switch_simplify 0.90% : 0.000002s : 16: predicate.tile_eliminate 0.89% : 0.000002s : 16: predicate.transpose_eliminate 1.53% : 0.000004s : 26: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000003s : 26: predicate.tuple_list_get_item_depend_reorder 3.35% : 0.000008s : 42: predicate.tuple_list_get_item_eliminator 1.47% : 0.000003s : 26: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.82% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.59% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.27% : 0.000007s : 58: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 5: predicate.value_based_eliminate 0.62% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000552 11 55.66% : 0.000307s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.34% : 0.000245s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024935 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.23% : 0.003050s : 1: add_attr 12.20% : 0.003041s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.21% : 0.000051s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000065s : 1: auto_monad 0.08% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.85% : 0.000461s : 1: bootstrap 0.11% : 0.000028s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000015s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.10% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.11% : 0.000027s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.70% : 0.000423s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.22% : 0.000553s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 5.40% : 0.001347s : 78: opt.transform.opt_a 0.14% : 0.000034s : 1: opt.transform.opt_after_cconv 0.11% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.50% : 0.000124s : 28: opt.transform.opt_b 0.22% : 0.000056s : 2: opt.transform.opt_trans_graph 0.15% : 0.000039s : 4: opt.transform.symbol_engine_opt 11.21% : 0.002794s : 1: opt_a 0.44% : 0.000110s : 1: opt_after_cconv 1.88% : 0.000468s : 1: opt_after_jit_grad 0.93% : 0.000231s : 1: opt_b 19.54% : 0.004872s : 1: optimize 0.08% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000022s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000038s : 1: pre_auto_parallel 0.16% : 0.000039s : 1: py_interpret_to_execute 0.05% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000018s : 1: remove_dup_value 1.24% : 0.000308s : 1: renormalize.infer 1.18% : 0.000293s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000039s : 1: rewriter_after_opt_a 0.37% : 0.000093s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000081s : 1: symbol_engine_optimizer 0.35% : 0.000086s : 1: tuple_transform 23.06% : 0.005750s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:11.939.955 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:11.940.226 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0162722, [21] [bootstrap]: 0.00043336 [type_inference]: 0.00563974 [event_method]: 2.089e-05 [auto_monad]: 6.227e-05 [graph_reusing]: 5.99999e-06 [inline]: 2.51e-06 [add_attr]: 0.00301633, [1] [add_attr_with_inline]: 0.00300825, [1] [Cycle 1]: 6.845e-05, [2] [tag_attr]: 2.121e-05 [meta_addattr_fg_expand]: 6.69001e-06 [parallel-infer-symbol]: 2.64001e-06 [pre_auto_parallel]: 3.224e-05 [insert-virtual-dataset]: 2.56e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 2.25002e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00594302, [53] [py_interpret_to_execute]: 3.2e-05 [rewriter_before_opt_a]: 9.593e-05 [opt_a]: 0.00352849, [2] [Cycle 1]: 0.00252964, [45] [expand_dump_flag]: 3.09999e-06 [switch_simplify]: 4.744e-05 [loop_unroll]: 3.615e-05 [a_1]: 0.00086938 [with_stream_mark]: 1.617e-05 [recompute_prepare]: 1.282e-05 [updatestate_depend_eliminate]: 5.25999e-06 [updatestate_assign_eliminate]: 4.13999e-06 [updatestate_loads_eliminate]: 3.75998e-06 [parameter_eliminate]: 1.79e-06 [a_2]: 0.00013879 [accelerated_algorithm]: 9.19e-06 [shard]: 2.07001e-06 [meta_shard_fg_expand]: 1.60001e-06 [shard_inline]: 9.15999e-06 [merge_send_recv]: 9.82999e-06 [auto_parallel]: 7.15003e-06 [parallel]: 2.007e-05 [flash_sp]: 8.60999e-06 [merge_comm]: 4.94998e-06 [allreduce_fusion]: 4.48001e-06 [matmul_add_comm_reduction]: 1.072e-05 [allreduce_slice_to_reducescatter]: 8.60018e-07 [virtual_shard_identity]: 1.045e-05 [virtual_dataset]: 8.77e-06 [get_grad_eliminate_]: 7.92e-06 [virtual_output]: 8.41002e-06 [merge_forward]: 4.79998e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 1.062e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.903e-05 [merge_recompute_call_nodes]: 1.40999e-06 [before_grad]: 1.377e-05 [set_forward_comm_id_for_comm_node_pass]: 4.48001e-06 [meta_fg_expand]: 3.41001e-06 [flash_sp_send_recv_attached]: 2.29001e-06 [receive_attached]: 2.04999e-06 [after_resolve]: 1.418e-05 [a_after_grad]: 1.314e-05 [renormalize]: 0.00063407 [add_forward_monad_depend]: 5.40001e-06 [auto_monad_grad]: 2.36e-06 [auto_monad_eliminator]: 1.669e-05 [cse]: 3.441e-05 [a_3]: 7.635e-05 [Cycle 2]: 0.00098536, [45] [expand_dump_flag]: 1.20999e-06 [switch_simplify]: 1.02e-05 [loop_unroll]: 8.27e-06 [a_1]: 0.00020401 [with_stream_mark]: 1.171e-05 [recompute_prepare]: 8.33999e-06 [updatestate_depend_eliminate]: 3.97e-06 [updatestate_assign_eliminate]: 3.11999e-06 [updatestate_loads_eliminate]: 3.11999e-06 [parameter_eliminate]: 1.50001e-06 [a_2]: 0.00013038 [accelerated_algorithm]: 8.60001e-06 [shard]: 1.15999e-06 [meta_shard_fg_expand]: 1.60999e-06 [shard_inline]: 8.04002e-06 [merge_send_recv]: 5.73997e-06 [auto_parallel]: 6.41e-06 [parallel]: 4.61002e-06 [flash_sp]: 3.45e-06 [merge_comm]: 4.22e-06 [allreduce_fusion]: 3.81001e-06 [matmul_add_comm_reduction]: 6.54999e-06 [allreduce_slice_to_reducescatter]: 3.89991e-07 [virtual_shard_identity]: 9.62999e-06 [virtual_dataset]: 8.1e-06 [get_grad_eliminate_]: 7.66999e-06 [virtual_output]: 7.71999e-06 [merge_forward]: 3.61999e-06 [cell_reuse_recompute_pass]: 1.97001e-06 [offload_activation]: 7.08998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.665e-05 [merge_recompute_call_nodes]: 8.79983e-07 [before_grad]: 1.215e-05 [set_forward_comm_id_for_comm_node_pass]: 4.16001e-06 [meta_fg_expand]: 2.69999e-06 [flash_sp_send_recv_attached]: 7.7e-07 [receive_attached]: 1.02e-06 [after_resolve]: 1.322e-05 [a_after_grad]: 1.233e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.64e-06 [auto_monad_grad]: 1.04e-06 [auto_monad_eliminator]: 8.74003e-06 [cse]: 1.838e-05 [a_3]: 6.314e-05 [py_interpret_to_execute_after_opt_a]: 1.442e-05 [slice_cell_reuse_recomputed_activation]: 4.75999e-06 [rewriter_after_opt_a]: 4.536e-05 [convert_after_rewriter]: 1.085e-05 [order_py_execute_after_rewriter]: 8.72e-06 [mutable_eliminate]: 0.00049545 [opt_b]: 0.0003353, [1] [Cycle 1]: 0.00032553, [7] [b_1]: 0.00022452 [b_2]: 1.008e-05 [updatestate_depend_eliminate]: 5.91e-06 [updatestate_assign_eliminate]: 3.8e-06 [updatestate_loads_eliminate]: 3.14999e-06 [renormalize]: 5.29981e-07 [cse]: 2.151e-05 [optimize_parallel_all_gather_comm]: 2.068e-05 [overlap_param_gather]: 4.62998e-06 [cconv]: 2.684e-05 [loop_unroll]: 0.00043161 [opt_after_cconv]: 0.00015066, [1] [Cycle 1]: 0.00014219, [7] [c_1]: 4.163e-05 [parameter_eliminate]: 3.11001e-06 [updatestate_depend_eliminate]: 6.70998e-06 [updatestate_assign_eliminate]: 3.78999e-06 [updatestate_loads_eliminate]: 3.46001e-06 [cse]: 2.331e-05 [renormalize]: 7.59988e-07 [remove_dup_value]: 1.821e-05 [tuple_transform]: 0.00010622, [1] [Cycle 1]: 9.909e-05, [4] [d_1]: 5.744e-05 [none_parameter_eliminate]: 1.87999e-06 [renormalize]: 3.30008e-07 [switch_simplify]: 8.72998e-06 [partial_unused_args_eliminate]: 5.64e-06 [add_recomputation]: 5.914e-05 [cse_after_recomputation]: 3.359e-05, [1] [Cycle 1]: 2.66e-05, [1] [cse]: 1.706e-05 [environ_conv]: 9.17999e-06 [swap_dp_allreduce_reducescatter]: 8.69998e-06 [bias_add_comm_swap]: 5.17999e-06 [label_micro_interleaved_index]: 6.69001e-06 [label_fine_grained_interleaved_index]: 5.64998e-06 [merge_cast_opt]: 3.81999e-06 [slice_recompute_activation]: 4.36002e-06 [micro_interleaved_order_control]: 4.72e-06 [assign_add_opt]: 3.7e-06 [ForceFp32Comm]: 3.21999e-06 [remove_cast_before_assign_add]: 3.45003e-06 [full_micro_interleaved_order_control]: 5.22e-06 [reorder_send_recv_between_fp_bp]: 5.35001e-06 [comm_op_add_attrs]: 3.70998e-06 [add_comm_op_reuse_tag]: 3.39001e-06 [interleave_split_concat_branches]: 3.66001e-06 [interleave_parallel_branches]: 3.66999e-06 [overlap_opt_shard_in_pipeline]: 3.66999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.39998e-06 [control_data_broadcast_order]: 1.822e-05 [grouped_pairwise_exchange_alltoall]: 4.22e-06 [offloading_packed_experts]: 7e-06 [overlap_recompute_and_grad_model_parallel]: 7.98999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.61001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.66999e-06 [overlap_recompute_comm]: 4.94e-06 [overlap_grad_ring_attention]: 7.21001e-06 [overlap_grad_flash_sp]: 2.313e-05 [begin_end_overlap_inline]: 3.26999e-06 [split_matmul_comm_elemetwise]: 4.3e-06 [split_layernorm_comm]: 4.32e-06 [handle_group_info]: 3.25998e-06 [symbol_engine_optimizer]: 0.00010436, [1] [Cycle 1]: 9.786e-05, [6] [build]: 2.73998e-06 [elim_shapecalc]: 1.187e-05 [elim_not_effective]: 1.581e-05 [opt_reshape]: 9.12001e-06 [fold_const_symbol]: 1.282e-05 [renormalize]: 3.69997e-07 [detach_backward]: 3.71001e-06 [pipeline_parallel_scheduler]: 1.62999e-06 [auto_monad_reorder]: 2.195e-05 [get_jit_bprop_graph]: 1.30999e-06 [rewriter_after_jit_bprop_graph]: 4.35999e-06 [opt_after_jit_grad]: 0.00047125 [validate]: 3.92e-05 Sums bootstrap : 0.000433s : 3.77% type_inference : 0.005640s : 49.06% event_method : 0.000021s : 0.18% auto_monad : 0.000062s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000032s : 0.28% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000032s : 0.28% optimize.rewriter_before_opt_a : 0.000096s : 0.83% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000058s : 0.50% optimize.opt_a.loop_unroll : 0.000044s : 0.39% optimize.opt_a.a_1 : 0.001073s : 9.34% optimize.opt_a.with_stream_mark : 0.000028s : 0.24% optimize.opt_a.recompute_prepare : 0.000021s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000269s : 2.34% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000017s : 0.15% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.17% optimize.opt_a.virtual_dataset : 0.000017s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.14% optimize.opt_a.virtual_output : 0.000016s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.24% optimize.opt_a.a_after_grad : 0.000025s : 0.22% optimize.opt_a.renormalize : 0.000634s : 5.52% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.22% optimize.opt_a.cse : 0.000053s : 0.46% optimize.opt_a.a_3 : 0.000139s : 1.21% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000045s : 0.39% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000495s : 4.31% optimize.opt_b.b_1 : 0.000225s : 1.95% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000027s : 0.23% optimize.loop_unroll : 0.000432s : 3.75% optimize.opt_after_cconv.c_1 : 0.000042s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.20% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.16% optimize.tuple_transform.d_1 : 0.000057s : 0.50% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000006s : 0.05% optimize.add_recomputation : 0.000059s : 0.51% optimize.cse_after_recomputation.cse : 0.000017s : 0.15% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.05% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000023s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.19% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000471s : 4.10% validate : 0.000039s : 0.34% Time group info: ------[substitution.] 0.000216 44 9.81% : 0.000021s : 3: substitution.cast_eliminate 0.99% : 0.000002s : 3: substitution.elim_not_effective 0.75% : 0.000002s : 3: substitution.fold_const_symbol 3.23% : 0.000007s : 6: substitution.graph_param_transform 66.69% : 0.000144s : 4: substitution.inline 1.99% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.93% : 0.000006s : 6: substitution.remove_not_recompute_node 2.42% : 0.000005s : 6: substitution.replace_old_param 8.53% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.63% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005592 2 87.07% : 0.004869s : 1: type_inference.infer 12.93% : 0.000723s : 1: type_inference.specialize ------[replace.] 0.000075 10 50.57% : 0.000038s : 4: replace.inline 49.43% : 0.000037s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000157 10 90.03% : 0.000142s : 4: match.inline 9.97% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000294 1954 0.98% : 0.000003s : 21: predicate.accumulaten_eliminater 0.62% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.52% : 0.000002s : 12: predicate.addn_check_dump 0.98% : 0.000003s : 21: predicate.addn_zero_filter 0.93% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.04% : 0.000006s : 33: predicate.arithmetic_simplify 1.05% : 0.000003s : 21: predicate.cast_eliminate 0.60% : 0.000002s : 12: predicate.check_bprop_eliminate 0.52% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.59% : 0.000002s : 12: predicate.depend_value_elim 1.09% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.10% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.95% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.84% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 6: predicate.elim_not_effective 0.39% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.32% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.24% : 0.000004s : 27: predicate.environ_get_add_eliminate 1.19% : 0.000003s : 27: predicate.environ_get_depend_swap 1.83% : 0.000005s : 39: predicate.environ_get_eliminate 1.17% : 0.000003s : 27: predicate.environ_get_set_eliminate 1.48% : 0.000004s : 31: predicate.exchange_switch_depend_value 2.32% : 0.000007s : 31: predicate.float_depend_g_call 0.54% : 0.000002s : 12: predicate.float_environ_get_switch 0.81% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.62% : 0.000002s : 12: predicate.get_grad_eliminate 0.20% : 0.000001s : 6: predicate.graph_param_transform 0.60% : 0.000002s : 12: predicate.incorporate_call 0.51% : 0.000002s : 12: predicate.incorporate_call_switch 6.26% : 0.000018s : 88: predicate.inline 0.79% : 0.000002s : 12: predicate.inline_without_move 0.33% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.68% : 0.000002s : 12: predicate.less_batch_normalization 1.90% : 0.000006s : 39: predicate.list_to_tuple_eliminator_ 2.70% : 0.000008s : 60: predicate.load_eliminater 0.73% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.00% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.60% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.57% : 0.000002s : 12: predicate.merge_addn 0.61% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.58% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.90% : 0.000003s : 21: predicate.minmaximum_grad 0.69% : 0.000002s : 6: predicate.mutable_eliminate 0.35% : 0.000001s : 6: predicate.opt_reshape 0.34% : 0.000001s : 6: predicate.parallel_virtual_node 1.83% : 0.000005s : 31: predicate.partial_defer_inline 1.85% : 0.000005s : 33: predicate.partial_eliminate 0.97% : 0.000003s : 21: predicate.print_const_string_wrapper 0.59% : 0.000002s : 12: predicate.reduce_all_const_elim 1.21% : 0.000004s : 21: predicate.reduce_eliminate 2.75% : 0.000008s : 60: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 12: predicate.remove_not_recompute_node 1.42% : 0.000004s : 39: predicate.replace_applicator 0.49% : 0.000001s : 12: predicate.replace_old_param 0.23% : 0.000001s : 6: predicate.reset_defer_inline 1.00% : 0.000003s : 21: predicate.reshape_eliminate 0.68% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 6: predicate.row_tensor_eliminate 0.68% : 0.000002s : 12: predicate.same_eliminate 0.56% : 0.000002s : 12: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 12: predicate.shard_identity_eliminate 0.74% : 0.000002s : 12: predicate.special_op_eliminate 0.70% : 0.000002s : 12: predicate.specialize_transform 0.72% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.56% : 0.000005s : 31: predicate.switch_defer_inline 2.20% : 0.000006s : 43: predicate.switch_layer_defer_inline 4.68% : 0.000014s : 91: predicate.switch_simplify 0.95% : 0.000003s : 21: predicate.tile_eliminate 0.98% : 0.000003s : 21: predicate.transpose_eliminate 1.61% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 33: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000010s : 51: predicate.tuple_list_get_item_eliminator 1.56% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.29% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.79% : 0.000005s : 39: predicate.tuple_to_list_eliminator_ 2.66% : 0.000008s : 60: predicate.updatestate_pure_node_eliminater 3.36% : 0.000010s : 72: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 6: predicate.value_based_eliminate 0.68% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.61% : 0.000002s : 12: predicate.virtual_output_eliminate 0.25% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000527 11 54.77% : 0.000289s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.23% : 0.000239s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027751 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.90% : 0.003025s : 1: add_attr 10.85% : 0.003012s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000063s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000072s : 1: auto_monad 0.11% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.71% : 0.000475s : 1: bootstrap 0.11% : 0.000030s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000037s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000018s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000031s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.58% : 0.000437s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.81% : 0.000501s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 5.95% : 0.001652s : 78: opt.transform.opt_a 0.15% : 0.000040s : 1: opt.transform.opt_after_cconv 0.11% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.58% : 0.000162s : 28: opt.transform.opt_b 0.23% : 0.000064s : 2: opt.transform.opt_trans_graph 0.17% : 0.000046s : 4: opt.transform.symbol_engine_opt 12.73% : 0.003532s : 1: opt_a 0.56% : 0.000154s : 1: opt_after_cconv 1.73% : 0.000481s : 1: opt_after_jit_grad 1.22% : 0.000339s : 1: opt_b 22.61% : 0.006276s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.09% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000009s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000009s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000008s : 1: pipeline_split 0.14% : 0.000039s : 1: pre_auto_parallel 0.13% : 0.000036s : 1: py_interpret_to_execute 0.06% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000021s : 1: remove_dup_value 1.19% : 0.000331s : 1: renormalize.infer 1.07% : 0.000296s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000049s : 1: rewriter_after_opt_a 0.36% : 0.000100s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000107s : 1: symbol_engine_optimizer 0.39% : 0.000109s : 1: tuple_transform 20.43% : 0.005669s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:12.133.679 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0159328, [21] [bootstrap]: 0.00043201 [type_inference]: 0.00586753 [event_method]: 2.135e-05 [auto_monad]: 6.383e-05 [graph_reusing]: 5.49998e-06 [inline]: 2.35002e-06 [add_attr]: 0.00312685, [1] [add_attr_with_inline]: 0.00311761, [1] [Cycle 1]: 6.404e-05, [2] [tag_attr]: 2.138e-05 [meta_addattr_fg_expand]: 6.09001e-06 [parallel-infer-symbol]: 3.63999e-06 [pre_auto_parallel]: 3.456e-05 [insert-virtual-dataset]: 2.51998e-06 [parallel-infer-symbol-second]: 8.40024e-07 [dataset_repeat_opt]: 1.80001e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.0056229, [53] [py_interpret_to_execute]: 2.893e-05 [rewriter_before_opt_a]: 9.384e-05 [opt_a]: 0.00338327, [2] [Cycle 1]: 0.00252677, [45] [expand_dump_flag]: 2.91e-06 [switch_simplify]: 4.997e-05 [loop_unroll]: 3.632e-05 [a_1]: 0.00088857 [with_stream_mark]: 1.999e-05 [recompute_prepare]: 1.296e-05 [updatestate_depend_eliminate]: 5.07e-06 [updatestate_assign_eliminate]: 4.23999e-06 [updatestate_loads_eliminate]: 4.08001e-06 [parameter_eliminate]: 1.81003e-06 [a_2]: 0.00011071 [accelerated_algorithm]: 9.36e-06 [shard]: 2.14999e-06 [meta_shard_fg_expand]: 2.07999e-06 [shard_inline]: 8.62e-06 [merge_send_recv]: 9.71003e-06 [auto_parallel]: 7.61999e-06 [parallel]: 1.901e-05 [flash_sp]: 9.67999e-06 [merge_comm]: 4.52e-06 [allreduce_fusion]: 4.2e-06 [matmul_add_comm_reduction]: 1.14e-05 [allreduce_slice_to_reducescatter]: 9.20001e-07 [virtual_shard_identity]: 1.147e-05 [virtual_dataset]: 8.87999e-06 [get_grad_eliminate_]: 8.03001e-06 [virtual_output]: 8.18999e-06 [merge_forward]: 4.47003e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 1.094e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.691e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 1.372e-05 [set_forward_comm_id_for_comm_node_pass]: 4.95999e-06 [meta_fg_expand]: 3.70998e-06 [flash_sp_send_recv_attached]: 2.81999e-06 [receive_attached]: 2.51998e-06 [after_resolve]: 1.527e-05 [a_after_grad]: 1.382e-05 [renormalize]: 0.00078803 [add_forward_monad_depend]: 6.42001e-06 [auto_monad_grad]: 2.41e-06 [auto_monad_eliminator]: 1.743e-05 [cse]: 3.612e-05 [a_3]: 6.369e-05 [Cycle 2]: 0.0008463, [45] [expand_dump_flag]: 1.19e-06 [switch_simplify]: 1.001e-05 [loop_unroll]: 8.37e-06 [a_1]: 0.00020296 [with_stream_mark]: 1.3e-05 [recompute_prepare]: 8.52e-06 [updatestate_depend_eliminate]: 3.90998e-06 [updatestate_assign_eliminate]: 3.16999e-06 [updatestate_loads_eliminate]: 3.3e-06 [parameter_eliminate]: 1.82999e-06 [a_2]: 0.00010299 [accelerated_algorithm]: 8.55001e-06 [shard]: 1.25999e-06 [meta_shard_fg_expand]: 1.91e-06 [shard_inline]: 8.11002e-06 [merge_send_recv]: 7.01999e-06 [auto_parallel]: 6.29001e-06 [parallel]: 6.39001e-06 [flash_sp]: 3.85e-06 [merge_comm]: 4.62e-06 [allreduce_fusion]: 5.19e-06 [matmul_add_comm_reduction]: 7.63999e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 9.89999e-06 [virtual_dataset]: 8.07e-06 [get_grad_eliminate_]: 7.63001e-06 [virtual_output]: 7.51001e-06 [merge_forward]: 4.75999e-06 [cell_reuse_recompute_pass]: 1.71998e-06 [offload_activation]: 8.52e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.708e-05 [merge_recompute_call_nodes]: 1.07e-06 [before_grad]: 1.35e-05 [set_forward_comm_id_for_comm_node_pass]: 4.37e-06 [meta_fg_expand]: 2.78e-06 [flash_sp_send_recv_attached]: 8.2e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 1.406e-05 [a_after_grad]: 1.256e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.17001e-06 [auto_monad_grad]: 1.54e-06 [auto_monad_eliminator]: 1.124e-05 [cse]: 2.229e-05 [a_3]: 5.088e-05 [py_interpret_to_execute_after_opt_a]: 1.184e-05 [slice_cell_reuse_recomputed_activation]: 2.21e-06 [rewriter_after_opt_a]: 4.224e-05 [convert_after_rewriter]: 8.54998e-06 [order_py_execute_after_rewriter]: 5.69e-06 [mutable_eliminate]: 0.00050874 [opt_b]: 0.00027757, [1] [Cycle 1]: 0.0002713, [7] [b_1]: 0.00018157 [b_2]: 1.123e-05 [updatestate_depend_eliminate]: 7.01001e-06 [updatestate_assign_eliminate]: 3.81001e-06 [updatestate_loads_eliminate]: 3.25e-06 [renormalize]: 8.89995e-07 [cse]: 2.587e-05 [optimize_parallel_all_gather_comm]: 1.906e-05 [overlap_param_gather]: 2.19999e-06 [cconv]: 2.664e-05 [loop_unroll]: 0.00045224 [opt_after_cconv]: 0.00012941, [1] [Cycle 1]: 0.00012335, [7] [c_1]: 4.348e-05 [parameter_eliminate]: 3.19001e-06 [updatestate_depend_eliminate]: 7.66001e-06 [updatestate_assign_eliminate]: 3.49001e-06 [updatestate_loads_eliminate]: 3.31999e-06 [cse]: 2.597e-05 [renormalize]: 5.19998e-07 [remove_dup_value]: 1.553e-05 [tuple_transform]: 9.598e-05, [1] [Cycle 1]: 9.077e-05, [4] [d_1]: 5.916e-05 [none_parameter_eliminate]: 1.82001e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 9.81e-06 [partial_unused_args_eliminate]: 1.76e-06 [add_recomputation]: 5.873e-05 [cse_after_recomputation]: 2.976e-05, [1] [Cycle 1]: 2.42e-05, [1] [cse]: 1.765e-05 [environ_conv]: 6.73e-06 [swap_dp_allreduce_reducescatter]: 6.76e-06 [bias_add_comm_swap]: 2.78e-06 [label_micro_interleaved_index]: 4.99e-06 [label_fine_grained_interleaved_index]: 2.74001e-06 [merge_cast_opt]: 1.34e-06 [slice_recompute_activation]: 2.12001e-06 [micro_interleaved_order_control]: 2.37999e-06 [assign_add_opt]: 1.12999e-06 [ForceFp32Comm]: 1.09e-06 [remove_cast_before_assign_add]: 1.40999e-06 [full_micro_interleaved_order_control]: 1.97999e-06 [reorder_send_recv_between_fp_bp]: 2.92002e-06 [comm_op_add_attrs]: 9.80013e-07 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.09998e-06 [interleave_parallel_branches]: 1.23002e-06 [overlap_opt_shard_in_pipeline]: 1.43002e-06 [overlap_opt_shard_grad_in_pipeline]: 1.72001e-06 [control_data_broadcast_order]: 1.503e-05 [grouped_pairwise_exchange_alltoall]: 1.54e-06 [offloading_packed_experts]: 4.47e-06 [overlap_recompute_and_grad_model_parallel]: 5.29e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.48e-06 [overlap_grad_ring_attention]: 4.69998e-06 [overlap_grad_flash_sp]: 2.193e-05 [begin_end_overlap_inline]: 6.79982e-07 [split_matmul_comm_elemetwise]: 2.21e-06 [split_layernorm_comm]: 1.60999e-06 [handle_group_info]: 8.89995e-07 [symbol_engine_optimizer]: 9.552e-05, [1] [Cycle 1]: 9.086e-05, [6] [build]: 4.23001e-06 [elim_shapecalc]: 1.491e-05 [elim_not_effective]: 1.691e-05 [opt_reshape]: 9.02e-06 [fold_const_symbol]: 1.338e-05 [renormalize]: 5.50004e-07 [detach_backward]: 2.17001e-06 [pipeline_parallel_scheduler]: 1.43002e-06 [auto_monad_reorder]: 2.225e-05 [get_jit_bprop_graph]: 1.87001e-06 [rewriter_after_jit_bprop_graph]: 4.48999e-06 [opt_after_jit_grad]: 0.00052284 [validate]: 4.203e-05 Sums bootstrap : 0.000432s : 3.66% type_inference : 0.005868s : 49.68% event_method : 0.000021s : 0.18% auto_monad : 0.000064s : 0.54% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000035s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.24% optimize.rewriter_before_opt_a : 0.000094s : 0.79% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000060s : 0.51% optimize.opt_a.loop_unroll : 0.000045s : 0.38% optimize.opt_a.a_1 : 0.001092s : 9.24% optimize.opt_a.with_stream_mark : 0.000033s : 0.28% optimize.opt_a.recompute_prepare : 0.000021s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000214s : 1.81% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000017s : 0.14% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.22% optimize.opt_a.flash_sp : 0.000014s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.18% optimize.opt_a.virtual_dataset : 0.000017s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.13% optimize.opt_a.virtual_output : 0.000016s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.25% optimize.opt_a.a_after_grad : 0.000026s : 0.22% optimize.opt_a.renormalize : 0.000788s : 6.67% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.24% optimize.opt_a.cse : 0.000058s : 0.49% optimize.opt_a.a_3 : 0.000115s : 0.97% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000042s : 0.36% optimize.convert_after_rewriter : 0.000009s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000509s : 4.31% optimize.opt_b.b_1 : 0.000182s : 1.54% optimize.opt_b.b_2 : 0.000011s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000026s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000027s : 0.23% optimize.loop_unroll : 0.000452s : 3.83% optimize.opt_after_cconv.c_1 : 0.000043s : 0.37% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000026s : 0.22% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.13% optimize.tuple_transform.d_1 : 0.000059s : 0.50% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000059s : 0.50% optimize.cse_after_recomputation.cse : 0.000018s : 0.15% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000015s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000022s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.13% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000022s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000523s : 4.43% validate : 0.000042s : 0.36% Time group info: ------[substitution.] 0.000230 44 9.44% : 0.000022s : 3: substitution.cast_eliminate 0.97% : 0.000002s : 3: substitution.elim_not_effective 0.69% : 0.000002s : 3: substitution.fold_const_symbol 3.06% : 0.000007s : 6: substitution.graph_param_transform 67.53% : 0.000156s : 4: substitution.inline 2.00% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.89% : 0.000007s : 6: substitution.remove_not_recompute_node 2.80% : 0.000006s : 6: substitution.replace_old_param 8.05% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.56% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005800 2 86.71% : 0.005029s : 1: type_inference.infer 13.29% : 0.000770s : 1: type_inference.specialize ------[replace.] 0.000076 10 52.20% : 0.000040s : 4: replace.inline 47.80% : 0.000037s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000168 10 90.79% : 0.000153s : 4: match.inline 9.21% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000294 1954 0.96% : 0.000003s : 21: predicate.accumulaten_eliminater 0.70% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.58% : 0.000002s : 12: predicate.addn_check_dump 1.00% : 0.000003s : 21: predicate.addn_zero_filter 0.92% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 1.82% : 0.000005s : 33: predicate.arithmetic_simplify 1.16% : 0.000003s : 21: predicate.cast_eliminate 0.61% : 0.000002s : 12: predicate.check_bprop_eliminate 0.53% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.58% : 0.000002s : 12: predicate.depend_value_elim 1.01% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.13% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.93% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.94% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.33% : 0.000001s : 6: predicate.elim_not_effective 0.31% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.23% : 0.000004s : 27: predicate.environ_get_add_eliminate 1.26% : 0.000004s : 27: predicate.environ_get_depend_swap 1.77% : 0.000005s : 39: predicate.environ_get_eliminate 1.16% : 0.000003s : 27: predicate.environ_get_set_eliminate 1.47% : 0.000004s : 31: predicate.exchange_switch_depend_value 2.41% : 0.000007s : 31: predicate.float_depend_g_call 0.51% : 0.000001s : 12: predicate.float_environ_get_switch 0.78% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.61% : 0.000002s : 12: predicate.get_grad_eliminate 0.20% : 0.000001s : 6: predicate.graph_param_transform 0.57% : 0.000002s : 12: predicate.incorporate_call 0.51% : 0.000001s : 12: predicate.incorporate_call_switch 6.20% : 0.000018s : 88: predicate.inline 0.92% : 0.000003s : 12: predicate.inline_without_move 0.36% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.75% : 0.000002s : 12: predicate.less_batch_normalization 1.85% : 0.000005s : 39: predicate.list_to_tuple_eliminator_ 2.76% : 0.000008s : 60: predicate.load_eliminater 0.90% : 0.000003s : 6: predicate.loop_unroll_after_grad 1.97% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.70% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.54% : 0.000002s : 12: predicate.merge_addn 0.53% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.55% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.90% : 0.000003s : 21: predicate.minmaximum_grad 0.87% : 0.000003s : 6: predicate.mutable_eliminate 0.32% : 0.000001s : 6: predicate.opt_reshape 0.35% : 0.000001s : 6: predicate.parallel_virtual_node 1.88% : 0.000006s : 31: predicate.partial_defer_inline 1.80% : 0.000005s : 33: predicate.partial_eliminate 0.95% : 0.000003s : 21: predicate.print_const_string_wrapper 0.55% : 0.000002s : 12: predicate.reduce_all_const_elim 1.22% : 0.000004s : 21: predicate.reduce_eliminate 2.66% : 0.000008s : 60: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 12: predicate.remove_not_recompute_node 1.51% : 0.000004s : 39: predicate.replace_applicator 0.51% : 0.000002s : 12: predicate.replace_old_param 0.24% : 0.000001s : 6: predicate.reset_defer_inline 1.07% : 0.000003s : 21: predicate.reshape_eliminate 0.53% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 6: predicate.row_tensor_eliminate 0.76% : 0.000002s : 12: predicate.same_eliminate 0.48% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.67% : 0.000002s : 12: predicate.shard_identity_eliminate 0.68% : 0.000002s : 12: predicate.special_op_eliminate 0.68% : 0.000002s : 12: predicate.specialize_transform 0.72% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.70% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.51% : 0.000004s : 31: predicate.switch_defer_inline 2.03% : 0.000006s : 43: predicate.switch_layer_defer_inline 4.72% : 0.000014s : 91: predicate.switch_simplify 0.97% : 0.000003s : 21: predicate.tile_eliminate 0.98% : 0.000003s : 21: predicate.transpose_eliminate 1.53% : 0.000004s : 33: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000004s : 33: predicate.tuple_list_get_item_depend_reorder 3.14% : 0.000009s : 51: predicate.tuple_list_get_item_eliminator 1.64% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.24% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.93% : 0.000006s : 39: predicate.tuple_to_list_eliminator_ 2.65% : 0.000008s : 60: predicate.updatestate_pure_node_eliminater 3.19% : 0.000009s : 72: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 6: predicate.value_based_eliminate 0.70% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.63% : 0.000002s : 12: predicate.virtual_output_eliminate 0.29% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.34% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000568 11 51.87% : 0.000295s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.13% : 0.000274s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027405 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.43% : 0.003131s : 1: add_attr 11.39% : 0.003122s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000064s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.25% : 0.000070s : 1: auto_monad 0.10% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.67% : 0.000459s : 1: bootstrap 0.11% : 0.000031s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.12% : 0.000033s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.10% : 0.000028s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.69% : 0.000462s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.89% : 0.000518s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 6.11% : 0.001674s : 78: opt.transform.opt_a 0.15% : 0.000042s : 1: opt.transform.opt_after_cconv 0.12% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.59% : 0.000161s : 28: opt.transform.opt_b 0.24% : 0.000067s : 2: opt.transform.opt_trans_graph 0.18% : 0.000050s : 4: opt.transform.symbol_engine_opt 12.36% : 0.003387s : 1: opt_a 0.49% : 0.000133s : 1: opt_after_cconv 1.94% : 0.000533s : 1: opt_after_jit_grad 1.03% : 0.000281s : 1: opt_b 20.54% : 0.005629s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000039s : 1: pre_auto_parallel 0.12% : 0.000033s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.63% : 0.000447s : 1: renormalize.infer 1.21% : 0.000333s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000046s : 1: rewriter_after_opt_a 0.36% : 0.000098s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000098s : 1: symbol_engine_optimizer 0.36% : 0.000099s : 1: tuple_transform 21.47% : 0.005883s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:12.323.837 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:12.324.082 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0168291, [21] [bootstrap]: 0.00043503 [type_inference]: 0.00578498 [event_method]: 2.023e-05 [auto_monad]: 6.436e-05 [graph_reusing]: 5.44e-06 [inline]: 2.17001e-06 [add_attr]: 0.00300208, [1] [add_attr_with_inline]: 0.00299409, [1] [Cycle 1]: 7.19e-05, [2] [tag_attr]: 2.087e-05 [meta_addattr_fg_expand]: 6.39999e-06 [parallel-infer-symbol]: 2.89001e-06 [pre_auto_parallel]: 3.455e-05 [insert-virtual-dataset]: 2.19999e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 1.91e-06 [pipeline_split]: 1.59998e-06 [optimize]: 0.00633014, [53] [py_interpret_to_execute]: 3.302e-05 [rewriter_before_opt_a]: 0.00010166 [opt_a]: 0.00376896, [2] [Cycle 1]: 0.0026871, [45] [expand_dump_flag]: 3.13e-06 [switch_simplify]: 4.895e-05 [loop_unroll]: 6.478e-05 [a_1]: 0.00088237 [with_stream_mark]: 1.643e-05 [recompute_prepare]: 1.298e-05 [updatestate_depend_eliminate]: 5.83002e-06 [updatestate_assign_eliminate]: 4.87e-06 [updatestate_loads_eliminate]: 4.41002e-06 [parameter_eliminate]: 2.22001e-06 [a_2]: 0.0001572 [accelerated_algorithm]: 1.061e-05 [shard]: 1.84998e-06 [meta_shard_fg_expand]: 2.59999e-06 [shard_inline]: 9.63002e-06 [merge_send_recv]: 1.14e-05 [auto_parallel]: 9.29e-06 [parallel]: 1.83e-05 [flash_sp]: 8.84e-06 [merge_comm]: 5.84999e-06 [allreduce_fusion]: 5.04e-06 [matmul_add_comm_reduction]: 1.114e-05 [allreduce_slice_to_reducescatter]: 8.59989e-07 [virtual_shard_identity]: 1.113e-05 [virtual_dataset]: 9.57001e-06 [get_grad_eliminate_]: 9.46e-06 [virtual_output]: 9.42999e-06 [merge_forward]: 5.54e-06 [cell_reuse_recompute_pass]: 1.77999e-06 [offload_activation]: 1.179e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.084e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.605e-05 [set_forward_comm_id_for_comm_node_pass]: 5.46e-06 [meta_fg_expand]: 4.2e-06 [flash_sp_send_recv_attached]: 2.41e-06 [receive_attached]: 2.29001e-06 [after_resolve]: 1.601e-05 [a_after_grad]: 1.519e-05 [renormalize]: 0.00069744 [add_forward_monad_depend]: 5.40999e-06 [auto_monad_grad]: 2.12999e-06 [auto_monad_eliminator]: 1.923e-05 [cse]: 4.951e-05 [a_3]: 8.383e-05 [Cycle 2]: 0.0010686, [45] [expand_dump_flag]: 1.12e-06 [switch_simplify]: 1.06e-05 [loop_unroll]: 9.59e-06 [a_1]: 0.00023688 [with_stream_mark]: 1.209e-05 [recompute_prepare]: 9.55001e-06 [updatestate_depend_eliminate]: 4.70999e-06 [updatestate_assign_eliminate]: 3.64002e-06 [updatestate_loads_eliminate]: 3.43e-06 [parameter_eliminate]: 1.06997e-06 [a_2]: 0.00014625 [accelerated_algorithm]: 9.97999e-06 [shard]: 1.30001e-06 [meta_shard_fg_expand]: 2.47001e-06 [shard_inline]: 9.57999e-06 [merge_send_recv]: 7.11001e-06 [auto_parallel]: 7.06999e-06 [parallel]: 4.79e-06 [flash_sp]: 3.78001e-06 [merge_comm]: 4.79e-06 [allreduce_fusion]: 4.45e-06 [matmul_add_comm_reduction]: 7.86001e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 9.94001e-06 [virtual_dataset]: 9.56e-06 [get_grad_eliminate_]: 9.00999e-06 [virtual_output]: 8.84e-06 [merge_forward]: 4.28999e-06 [cell_reuse_recompute_pass]: 1.57001e-06 [offload_activation]: 7.98001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.914e-05 [merge_recompute_call_nodes]: 8.69972e-07 [before_grad]: 1.451e-05 [set_forward_comm_id_for_comm_node_pass]: 5.17e-06 [meta_fg_expand]: 3.43999e-06 [flash_sp_send_recv_attached]: 8.50006e-07 [receive_attached]: 1.17999e-06 [after_resolve]: 1.474e-05 [a_after_grad]: 1.416e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.76998e-06 [auto_monad_grad]: 9.10019e-07 [auto_monad_eliminator]: 1.054e-05 [cse]: 2.482e-05 [a_3]: 7.308e-05 [py_interpret_to_execute_after_opt_a]: 1.66e-05 [slice_cell_reuse_recomputed_activation]: 4.71002e-06 [rewriter_after_opt_a]: 4.897e-05 [convert_after_rewriter]: 1.146e-05 [order_py_execute_after_rewriter]: 9.89999e-06 [mutable_eliminate]: 0.00052618 [opt_b]: 0.00037409, [1] [Cycle 1]: 0.00036402, [7] [b_1]: 0.00024966 [b_2]: 1.108e-05 [updatestate_depend_eliminate]: 7.01001e-06 [updatestate_assign_eliminate]: 3.81001e-06 [updatestate_loads_eliminate]: 4.17e-06 [renormalize]: 6.19999e-07 [cse]: 3.036e-05 [optimize_parallel_all_gather_comm]: 2.384e-05 [overlap_param_gather]: 4.60001e-06 [cconv]: 2.846e-05 [loop_unroll]: 0.00043856 [opt_after_cconv]: 0.00015935, [1] [Cycle 1]: 0.00015102, [7] [c_1]: 4.875e-05 [parameter_eliminate]: 3.23e-06 [updatestate_depend_eliminate]: 6.81001e-06 [updatestate_assign_eliminate]: 4.05e-06 [updatestate_loads_eliminate]: 3.63999e-06 [cse]: 2.899e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 4.07e-05 [tuple_transform]: 0.00011881, [1] [Cycle 1]: 0.00011097, [4] [d_1]: 6.721e-05 [none_parameter_eliminate]: 1.79e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 1.054e-05 [partial_unused_args_eliminate]: 4.47e-06 [add_recomputation]: 6.373e-05 [cse_after_recomputation]: 3.543e-05, [1] [Cycle 1]: 2.832e-05, [1] [cse]: 1.887e-05 [environ_conv]: 9.83002e-06 [swap_dp_allreduce_reducescatter]: 9.83002e-06 [bias_add_comm_swap]: 4.95999e-06 [label_micro_interleaved_index]: 6.68998e-06 [label_fine_grained_interleaved_index]: 4.99e-06 [merge_cast_opt]: 3.95e-06 [slice_recompute_activation]: 4.50999e-06 [micro_interleaved_order_control]: 4.76002e-06 [assign_add_opt]: 3.97002e-06 [ForceFp32Comm]: 3.55e-06 [remove_cast_before_assign_add]: 3.4e-06 [full_micro_interleaved_order_control]: 4.43001e-06 [reorder_send_recv_between_fp_bp]: 5.25999e-06 [comm_op_add_attrs]: 3.55e-06 [add_comm_op_reuse_tag]: 3.18e-06 [interleave_split_concat_branches]: 3.55e-06 [interleave_parallel_branches]: 3.75998e-06 [overlap_opt_shard_in_pipeline]: 3.53e-06 [overlap_opt_shard_grad_in_pipeline]: 4.34002e-06 [control_data_broadcast_order]: 1.955e-05 [grouped_pairwise_exchange_alltoall]: 3.93999e-06 [offloading_packed_experts]: 8.16002e-06 [overlap_recompute_and_grad_model_parallel]: 8.33999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.75e-06 [overlap_recompute_allgather_and_fa_grad]: 3.76001e-06 [overlap_recompute_comm]: 4.78001e-06 [overlap_grad_ring_attention]: 7.91001e-06 [overlap_grad_flash_sp]: 2.61e-05 [begin_end_overlap_inline]: 2.85002e-06 [split_matmul_comm_elemetwise]: 4.89e-06 [split_layernorm_comm]: 4.28001e-06 [handle_group_info]: 3.18e-06 [symbol_engine_optimizer]: 0.00011292, [1] [Cycle 1]: 0.00010599, [6] [build]: 3.31999e-06 [elim_shapecalc]: 1.327e-05 [elim_not_effective]: 1.787e-05 [opt_reshape]: 1.052e-05 [fold_const_symbol]: 1.511e-05 [renormalize]: 2.20025e-07 [detach_backward]: 3.15002e-06 [pipeline_parallel_scheduler]: 1.87999e-06 [auto_monad_reorder]: 2.415e-05 [get_jit_bprop_graph]: 1.54e-06 [rewriter_after_jit_bprop_graph]: 4.18999e-06 [opt_after_jit_grad]: 0.0004925 [validate]: 4.138e-05 Sums bootstrap : 0.000435s : 3.59% type_inference : 0.005785s : 47.80% event_method : 0.000020s : 0.17% auto_monad : 0.000064s : 0.53% graph_reusing : 0.000005s : 0.04% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000035s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.27% optimize.rewriter_before_opt_a : 0.000102s : 0.84% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000060s : 0.49% optimize.opt_a.loop_unroll : 0.000074s : 0.61% optimize.opt_a.a_1 : 0.001119s : 9.25% optimize.opt_a.with_stream_mark : 0.000029s : 0.24% optimize.opt_a.recompute_prepare : 0.000023s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000303s : 2.51% optimize.opt_a.accelerated_algorithm : 0.000021s : 0.17% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000019s : 0.16% optimize.opt_a.merge_send_recv : 0.000019s : 0.15% optimize.opt_a.auto_parallel : 0.000016s : 0.14% optimize.opt_a.parallel : 0.000023s : 0.19% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000011s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.17% optimize.opt_a.virtual_dataset : 0.000019s : 0.16% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.15% optimize.opt_a.virtual_output : 0.000018s : 0.15% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000040s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000031s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000008s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000031s : 0.25% optimize.opt_a.a_after_grad : 0.000029s : 0.24% optimize.opt_a.renormalize : 0.000697s : 5.76% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.25% optimize.opt_a.cse : 0.000074s : 0.61% optimize.opt_a.a_3 : 0.000157s : 1.30% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000049s : 0.40% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000010s : 0.08% optimize.mutable_eliminate : 0.000526s : 4.35% optimize.opt_b.b_1 : 0.000250s : 2.06% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000030s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.20% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000028s : 0.24% optimize.loop_unroll : 0.000439s : 3.62% optimize.opt_after_cconv.c_1 : 0.000049s : 0.40% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000029s : 0.24% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000041s : 0.34% optimize.tuple_transform.d_1 : 0.000067s : 0.56% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.09% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000064s : 0.53% optimize.cse_after_recomputation.cse : 0.000019s : 0.16% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000020s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.07% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.07% optimize.overlap_grad_flash_sp : 0.000026s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.09% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000024s : 0.20% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000492s : 4.07% validate : 0.000041s : 0.34% Time group info: ------[substitution.] 0.000237 54 11.89% : 0.000028s : 6: substitution.cast_eliminate 1.06% : 0.000003s : 4: substitution.elim_not_effective 0.81% : 0.000002s : 4: substitution.fold_const_symbol 3.40% : 0.000008s : 7: substitution.graph_param_transform 64.84% : 0.000154s : 4: substitution.inline 2.26% : 0.000005s : 8: substitution.j_node_and_user_rematch 3.11% : 0.000007s : 8: substitution.remove_not_recompute_node 2.43% : 0.000006s : 6: substitution.replace_old_param 7.96% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.24% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005736 2 86.64% : 0.004970s : 1: type_inference.infer 13.36% : 0.000766s : 1: type_inference.specialize ------[replace.] 0.000073 10 51.91% : 0.000038s : 4: replace.inline 48.09% : 0.000035s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000167 10 90.53% : 0.000151s : 4: match.inline 9.47% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000318 2134 0.95% : 0.000003s : 22: predicate.accumulaten_eliminater 0.77% : 0.000002s : 7: predicate.ad_related_special_op_eliminate 0.60% : 0.000002s : 14: predicate.addn_check_dump 0.97% : 0.000003s : 22: predicate.addn_zero_filter 0.89% : 0.000003s : 22: predicate.adjust_all_reduce_mul_add 1.90% : 0.000006s : 36: predicate.arithmetic_simplify 1.07% : 0.000003s : 22: predicate.cast_eliminate 0.71% : 0.000002s : 14: predicate.check_bprop_eliminate 0.59% : 0.000002s : 14: predicate.compare_switch_simplify 0.20% : 0.000001s : 7: predicate.const_output_eliminate 0.60% : 0.000002s : 14: predicate.depend_value_elim 1.01% : 0.000003s : 22: predicate.dict_get_item_const_eliminator 1.02% : 0.000003s : 22: predicate.dict_get_item_eliminator 0.92% : 0.000003s : 22: predicate.dict_set_item_eliminator 0.87% : 0.000003s : 14: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 7: predicate.elim_not_effective 0.43% : 0.000001s : 7: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000004s : 29: predicate.environ_add_const_eliminate 1.18% : 0.000004s : 29: predicate.environ_get_add_eliminate 1.16% : 0.000004s : 29: predicate.environ_get_depend_swap 1.82% : 0.000006s : 43: predicate.environ_get_eliminate 1.19% : 0.000004s : 29: predicate.environ_get_set_eliminate 1.41% : 0.000004s : 32: predicate.exchange_switch_depend_value 2.18% : 0.000007s : 32: predicate.float_depend_g_call 0.62% : 0.000002s : 14: predicate.float_environ_get_switch 0.87% : 0.000003s : 21: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 7: predicate.fold_const_symbol 0.66% : 0.000002s : 14: predicate.get_grad_eliminate 0.21% : 0.000001s : 7: predicate.graph_param_transform 0.66% : 0.000002s : 14: predicate.incorporate_call 0.55% : 0.000002s : 14: predicate.incorporate_call_switch 6.14% : 0.000020s : 96: predicate.inline 0.85% : 0.000003s : 14: predicate.inline_without_move 0.36% : 0.000001s : 14: predicate.j_node_and_user_rematch 0.76% : 0.000002s : 14: predicate.less_batch_normalization 1.90% : 0.000006s : 42: predicate.list_to_tuple_eliminator_ 2.72% : 0.000009s : 64: predicate.load_eliminater 0.70% : 0.000002s : 7: predicate.loop_unroll_after_grad 1.93% : 0.000006s : 44: predicate.loop_unroll_before_grad 1.76% : 0.000006s : 36: predicate.make_slice_get_slice_eliminator 0.64% : 0.000002s : 14: predicate.merge_addn 0.59% : 0.000002s : 14: predicate.micro_step_allgather_replace 0.60% : 0.000002s : 14: predicate.mini_step_allgather_replace 0.88% : 0.000003s : 22: predicate.minmaximum_grad 0.79% : 0.000003s : 7: predicate.mutable_eliminate 0.37% : 0.000001s : 7: predicate.opt_reshape 0.36% : 0.000001s : 7: predicate.parallel_virtual_node 1.63% : 0.000005s : 32: predicate.partial_defer_inline 1.79% : 0.000006s : 35: predicate.partial_eliminate 0.91% : 0.000003s : 22: predicate.print_const_string_wrapper 0.61% : 0.000002s : 14: predicate.reduce_all_const_elim 1.19% : 0.000004s : 22: predicate.reduce_eliminate 2.68% : 0.000009s : 64: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 14: predicate.remove_not_recompute_node 1.47% : 0.000005s : 42: predicate.replace_applicator 0.47% : 0.000002s : 14: predicate.replace_old_param 0.24% : 0.000001s : 7: predicate.reset_defer_inline 0.97% : 0.000003s : 22: predicate.reshape_eliminate 0.63% : 0.000002s : 14: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 7: predicate.row_tensor_eliminate 0.74% : 0.000002s : 14: predicate.same_eliminate 0.51% : 0.000002s : 14: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 14: predicate.shard_identity_eliminate 0.69% : 0.000002s : 14: predicate.special_op_eliminate 0.76% : 0.000002s : 14: predicate.specialize_transform 0.82% : 0.000003s : 14: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 14: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 7: predicate.switch_call_monad_eliminater 1.47% : 0.000005s : 32: predicate.switch_defer_inline 2.09% : 0.000007s : 46: predicate.switch_layer_defer_inline 4.66% : 0.000015s : 97: predicate.switch_simplify 0.96% : 0.000003s : 22: predicate.tile_eliminate 0.92% : 0.000003s : 22: predicate.transpose_eliminate 1.58% : 0.000005s : 36: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000005s : 36: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000005s : 36: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000010s : 56: predicate.tuple_list_get_item_eliminator 1.61% : 0.000005s : 36: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000007s : 50: predicate.tuple_list_set_item_eliminator 1.80% : 0.000006s : 42: predicate.tuple_to_list_eliminator_ 2.59% : 0.000008s : 64: predicate.updatestate_pure_node_eliminater 3.45% : 0.000011s : 78: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 7: predicate.value_based_eliminate 0.67% : 0.000002s : 14: predicate.virtual_dataset_eliminate 0.66% : 0.000002s : 14: predicate.virtual_output_eliminate 0.30% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 7: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000524 11 53.70% : 0.000281s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.30% : 0.000243s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028974 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.39% : 0.003010s : 1: add_attr 10.35% : 0.002998s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000068s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.25% : 0.000073s : 1: auto_monad 0.11% : 0.000032s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.65% : 0.000479s : 1: bootstrap 0.11% : 0.000032s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000023s : 1: control_data_broadcast_order 0.05% : 0.000015s : 1: convert_after_rewriter 0.13% : 0.000039s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000018s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.10% : 0.000029s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.53% : 0.000444s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.84% : 0.000532s : 1: mutable_eliminate 0.04% : 0.000011s : 1: offloading_packed_experts 0.06% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 6.25% : 0.001810s : 78: opt.transform.opt_a 0.16% : 0.000047s : 1: opt.transform.opt_after_cconv 0.12% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.65% : 0.000187s : 28: opt.transform.opt_b 0.26% : 0.000075s : 2: opt.transform.opt_trans_graph 0.18% : 0.000053s : 4: opt.transform.symbol_engine_opt 13.02% : 0.003772s : 1: opt_a 0.56% : 0.000163s : 1: opt_after_cconv 1.74% : 0.000503s : 1: opt_after_jit_grad 1.30% : 0.000378s : 1: opt_b 23.02% : 0.006671s : 1: optimize 0.09% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.10% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000011s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000009s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000041s : 1: pre_auto_parallel 0.13% : 0.000037s : 1: py_interpret_to_execute 0.07% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.15% : 0.000044s : 1: remove_dup_value 1.29% : 0.000375s : 1: renormalize.infer 1.09% : 0.000315s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000053s : 1: rewriter_after_opt_a 0.36% : 0.000105s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000116s : 1: symbol_engine_optimizer 0.42% : 0.000122s : 1: tuple_transform 20.08% : 0.005817s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:12.520.256 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.016265, [21] [bootstrap]: 0.00048386 [type_inference]: 0.00584946 [event_method]: 2.082e-05 [auto_monad]: 6.827e-05 [graph_reusing]: 5.40999e-06 [inline]: 2.24001e-06 [add_attr]: 0.00316723, [1] [add_attr_with_inline]: 0.00315736, [1] [Cycle 1]: 6.845e-05, [2] [tag_attr]: 2.145e-05 [meta_addattr_fg_expand]: 6.61e-06 [parallel-infer-symbol]: 3.83001e-06 [pre_auto_parallel]: 3.844e-05 [insert-virtual-dataset]: 2.36e-06 [parallel-infer-symbol-second]: 8.10018e-07 [dataset_repeat_opt]: 1.87001e-06 [pipeline_split]: 1.66998e-06 [optimize]: 0.00591537, [53] [py_interpret_to_execute]: 3.06e-05 [rewriter_before_opt_a]: 9.776e-05 [opt_a]: 0.00357678, [2] [Cycle 1]: 0.00263093, [45] [expand_dump_flag]: 2.87002e-06 [switch_simplify]: 5.08e-05 [loop_unroll]: 3.7e-05 [a_1]: 0.00089059 [with_stream_mark]: 1.943e-05 [recompute_prepare]: 1.177e-05 [updatestate_depend_eliminate]: 5.32001e-06 [updatestate_assign_eliminate]: 4.54002e-06 [updatestate_loads_eliminate]: 4.10998e-06 [parameter_eliminate]: 1.79998e-06 [a_2]: 0.00012833 [accelerated_algorithm]: 1.074e-05 [shard]: 1.67999e-06 [meta_shard_fg_expand]: 2.22999e-06 [shard_inline]: 9.82999e-06 [merge_send_recv]: 1.006e-05 [auto_parallel]: 8.31002e-06 [parallel]: 1.965e-05 [flash_sp]: 9.84001e-06 [merge_comm]: 6.07999e-06 [allreduce_fusion]: 4.78001e-06 [matmul_add_comm_reduction]: 1.129e-05 [allreduce_slice_to_reducescatter]: 8.99978e-07 [virtual_shard_identity]: 1.113e-05 [virtual_dataset]: 1.013e-05 [get_grad_eliminate_]: 9.40001e-06 [virtual_output]: 9.71e-06 [merge_forward]: 5.61e-06 [cell_reuse_recompute_pass]: 1.22e-06 [offload_activation]: 1.173e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.772e-05 [merge_recompute_call_nodes]: 1.53002e-06 [before_grad]: 1.605e-05 [set_forward_comm_id_for_comm_node_pass]: 5.71e-06 [meta_fg_expand]: 3.95e-06 [flash_sp_send_recv_attached]: 3.05998e-06 [receive_attached]: 2.35002e-06 [after_resolve]: 1.648e-05 [a_after_grad]: 1.519e-05 [renormalize]: 0.00083945 [add_forward_monad_depend]: 6.36998e-06 [auto_monad_grad]: 2.29001e-06 [auto_monad_eliminator]: 1.916e-05 [cse]: 5.079e-05 [a_3]: 7.117e-05 [Cycle 2]: 0.00093509, [45] [expand_dump_flag]: 1.35001e-06 [switch_simplify]: 1.115e-05 [loop_unroll]: 9.74999e-06 [a_1]: 0.00023802 [with_stream_mark]: 1.497e-05 [recompute_prepare]: 9.94001e-06 [updatestate_depend_eliminate]: 5.05001e-06 [updatestate_assign_eliminate]: 3.6e-06 [updatestate_loads_eliminate]: 3.78999e-06 [parameter_eliminate]: 1.41002e-06 [a_2]: 0.00012129 [accelerated_algorithm]: 9.96e-06 [shard]: 1.66e-06 [meta_shard_fg_expand]: 2.11e-06 [shard_inline]: 9.46e-06 [merge_send_recv]: 7.41001e-06 [auto_parallel]: 7.18e-06 [parallel]: 6.33e-06 [flash_sp]: 3.86999e-06 [merge_comm]: 5.33002e-06 [allreduce_fusion]: 4.94998e-06 [matmul_add_comm_reduction]: 8.54e-06 [allreduce_slice_to_reducescatter]: 5.69999e-07 [virtual_shard_identity]: 1.031e-05 [virtual_dataset]: 9.06002e-06 [get_grad_eliminate_]: 9.47999e-06 [virtual_output]: 1.014e-05 [merge_forward]: 5.52999e-06 [cell_reuse_recompute_pass]: 2.11e-06 [offload_activation]: 1.035e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.033e-05 [merge_recompute_call_nodes]: 1.02e-06 [before_grad]: 1.556e-05 [set_forward_comm_id_for_comm_node_pass]: 5.13002e-06 [meta_fg_expand]: 3.6e-06 [flash_sp_send_recv_attached]: 1.10999e-06 [receive_attached]: 1.54e-06 [after_resolve]: 1.468e-05 [a_after_grad]: 1.409e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.66998e-06 [auto_monad_grad]: 1.60999e-06 [auto_monad_eliminator]: 1.052e-05 [cse]: 2.642e-05 [a_3]: 5.989e-05 [py_interpret_to_execute_after_opt_a]: 1.476e-05 [slice_cell_reuse_recomputed_activation]: 1.81e-06 [rewriter_after_opt_a]: 4.701e-05 [convert_after_rewriter]: 8.45001e-06 [order_py_execute_after_rewriter]: 6.42001e-06 [mutable_eliminate]: 0.0005533 [opt_b]: 0.00030814, [1] [Cycle 1]: 0.00030131, [7] [b_1]: 0.0002065 [b_2]: 1.148e-05 [updatestate_depend_eliminate]: 7.53e-06 [updatestate_assign_eliminate]: 3.60998e-06 [updatestate_loads_eliminate]: 3.91999e-06 [renormalize]: 6.80011e-07 [cse]: 3.176e-05 [optimize_parallel_all_gather_comm]: 1.913e-05 [overlap_param_gather]: 1.84e-06 [cconv]: 2.793e-05 [loop_unroll]: 0.00042891 [opt_after_cconv]: 0.00013342, [1] [Cycle 1]: 0.0001281, [7] [c_1]: 4.838e-05 [parameter_eliminate]: 2.88e-06 [updatestate_depend_eliminate]: 7.01999e-06 [updatestate_assign_eliminate]: 3.93999e-06 [updatestate_loads_eliminate]: 3.61999e-06 [cse]: 2.779e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 4.308e-05 [tuple_transform]: 0.00010245, [1] [Cycle 1]: 9.751e-05, [4] [d_1]: 6.629e-05 [none_parameter_eliminate]: 1.72999e-06 [renormalize]: 3.09985e-07 [switch_simplify]: 1.013e-05 [partial_unused_args_eliminate]: 1.89e-06 [add_recomputation]: 6.423e-05 [cse_after_recomputation]: 2.773e-05, [1] [Cycle 1]: 2.324e-05, [1] [cse]: 1.783e-05 [environ_conv]: 6.23e-06 [swap_dp_allreduce_reducescatter]: 7.01999e-06 [bias_add_comm_swap]: 2.71e-06 [label_micro_interleaved_index]: 4.32e-06 [label_fine_grained_interleaved_index]: 2.68e-06 [merge_cast_opt]: 1.24003e-06 [slice_recompute_activation]: 2.11e-06 [micro_interleaved_order_control]: 2.39001e-06 [assign_add_opt]: 1.35999e-06 [ForceFp32Comm]: 8.10018e-07 [remove_cast_before_assign_add]: 1.17e-06 [full_micro_interleaved_order_control]: 2.08002e-06 [reorder_send_recv_between_fp_bp]: 2.65002e-06 [comm_op_add_attrs]: 9.60019e-07 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.05999e-06 [overlap_opt_shard_in_pipeline]: 1.26002e-06 [overlap_opt_shard_grad_in_pipeline]: 1.80001e-06 [control_data_broadcast_order]: 1.677e-05 [grouped_pairwise_exchange_alltoall]: 1.69998e-06 [offloading_packed_experts]: 4.90999e-06 [overlap_recompute_and_grad_model_parallel]: 5.49e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.33002e-06 [overlap_recompute_comm]: 2.49999e-06 [overlap_grad_ring_attention]: 5.08002e-06 [overlap_grad_flash_sp]: 2.414e-05 [begin_end_overlap_inline]: 4.60015e-07 [split_matmul_comm_elemetwise]: 2.44001e-06 [split_layernorm_comm]: 1.52999e-06 [handle_group_info]: 8.79983e-07 [symbol_engine_optimizer]: 9.117e-05, [1] [Cycle 1]: 8.676e-05, [6] [build]: 3.46999e-06 [elim_shapecalc]: 1.309e-05 [elim_not_effective]: 1.845e-05 [opt_reshape]: 9.91998e-06 [fold_const_symbol]: 1.476e-05 [renormalize]: 2.70025e-07 [detach_backward]: 2.14e-06 [pipeline_parallel_scheduler]: 1.71e-06 [auto_monad_reorder]: 2.112e-05 [get_jit_bprop_graph]: 1.58002e-06 [rewriter_after_jit_bprop_graph]: 3.53999e-06 [opt_after_jit_grad]: 0.00047151 [validate]: 5.729e-05 Sums bootstrap : 0.000484s : 3.98% type_inference : 0.005849s : 48.16% event_method : 0.000021s : 0.17% auto_monad : 0.000068s : 0.56% graph_reusing : 0.000005s : 0.04% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000038s : 0.32% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.25% optimize.rewriter_before_opt_a : 0.000098s : 0.80% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000062s : 0.51% optimize.opt_a.loop_unroll : 0.000047s : 0.38% optimize.opt_a.a_1 : 0.001129s : 9.29% optimize.opt_a.with_stream_mark : 0.000034s : 0.28% optimize.opt_a.recompute_prepare : 0.000022s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000250s : 2.06% optimize.opt_a.accelerated_algorithm : 0.000021s : 0.17% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000019s : 0.16% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000026s : 0.21% optimize.opt_a.flash_sp : 0.000014s : 0.11% optimize.opt_a.merge_comm : 0.000011s : 0.09% optimize.opt_a.allreduce_fusion : 0.000010s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.18% optimize.opt_a.virtual_dataset : 0.000019s : 0.16% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.16% optimize.opt_a.virtual_output : 0.000020s : 0.16% optimize.opt_a.merge_forward : 0.000011s : 0.09% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000022s : 0.18% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000032s : 0.26% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000008s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000031s : 0.26% optimize.opt_a.a_after_grad : 0.000029s : 0.24% optimize.opt_a.renormalize : 0.000840s : 6.91% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.24% optimize.opt_a.cse : 0.000077s : 0.64% optimize.opt_a.a_3 : 0.000131s : 1.08% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000047s : 0.39% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000553s : 4.56% optimize.opt_b.b_1 : 0.000207s : 1.70% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000032s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000028s : 0.23% optimize.loop_unroll : 0.000429s : 3.53% optimize.opt_after_cconv.c_1 : 0.000048s : 0.40% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000028s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000043s : 0.35% optimize.tuple_transform.d_1 : 0.000066s : 0.55% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000064s : 0.53% optimize.cse_after_recomputation.cse : 0.000018s : 0.15% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000017s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000024s : 0.20% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.17% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000472s : 3.88% validate : 0.000057s : 0.47% Time group info: ------[substitution.] 0.000243 54 12.13% : 0.000029s : 6: substitution.cast_eliminate 0.97% : 0.000002s : 4: substitution.elim_not_effective 0.91% : 0.000002s : 4: substitution.fold_const_symbol 3.03% : 0.000007s : 7: substitution.graph_param_transform 65.29% : 0.000159s : 4: substitution.inline 2.25% : 0.000005s : 8: substitution.j_node_and_user_rematch 3.27% : 0.000008s : 8: substitution.remove_not_recompute_node 2.18% : 0.000005s : 6: substitution.replace_old_param 7.67% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.28% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005785 2 86.74% : 0.005018s : 1: type_inference.infer 13.26% : 0.000767s : 1: type_inference.specialize ------[replace.] 0.000074 10 51.84% : 0.000038s : 4: replace.inline 48.16% : 0.000036s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000172 10 90.72% : 0.000156s : 4: match.inline 9.28% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000321 2134 0.90% : 0.000003s : 22: predicate.accumulaten_eliminater 0.68% : 0.000002s : 7: predicate.ad_related_special_op_eliminate 0.59% : 0.000002s : 14: predicate.addn_check_dump 0.92% : 0.000003s : 22: predicate.addn_zero_filter 0.87% : 0.000003s : 22: predicate.adjust_all_reduce_mul_add 1.97% : 0.000006s : 36: predicate.arithmetic_simplify 1.07% : 0.000003s : 22: predicate.cast_eliminate 0.61% : 0.000002s : 14: predicate.check_bprop_eliminate 0.62% : 0.000002s : 14: predicate.compare_switch_simplify 0.21% : 0.000001s : 7: predicate.const_output_eliminate 0.61% : 0.000002s : 14: predicate.depend_value_elim 1.00% : 0.000003s : 22: predicate.dict_get_item_const_eliminator 1.09% : 0.000003s : 22: predicate.dict_get_item_eliminator 0.92% : 0.000003s : 22: predicate.dict_set_item_eliminator 0.92% : 0.000003s : 14: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 7: predicate.elim_not_effective 0.34% : 0.000001s : 7: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000004s : 29: predicate.environ_add_const_eliminate 1.16% : 0.000004s : 29: predicate.environ_get_add_eliminate 1.16% : 0.000004s : 29: predicate.environ_get_depend_swap 1.75% : 0.000006s : 43: predicate.environ_get_eliminate 1.15% : 0.000004s : 29: predicate.environ_get_set_eliminate 1.37% : 0.000004s : 32: predicate.exchange_switch_depend_value 2.24% : 0.000007s : 32: predicate.float_depend_g_call 0.56% : 0.000002s : 14: predicate.float_environ_get_switch 0.83% : 0.000003s : 21: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 7: predicate.fold_const_symbol 0.64% : 0.000002s : 14: predicate.get_grad_eliminate 0.26% : 0.000001s : 7: predicate.graph_param_transform 0.63% : 0.000002s : 14: predicate.incorporate_call 0.54% : 0.000002s : 14: predicate.incorporate_call_switch 6.40% : 0.000021s : 96: predicate.inline 0.89% : 0.000003s : 14: predicate.inline_without_move 0.35% : 0.000001s : 14: predicate.j_node_and_user_rematch 0.76% : 0.000002s : 14: predicate.less_batch_normalization 1.96% : 0.000006s : 42: predicate.list_to_tuple_eliminator_ 2.63% : 0.000008s : 64: predicate.load_eliminater 0.74% : 0.000002s : 7: predicate.loop_unroll_after_grad 1.93% : 0.000006s : 44: predicate.loop_unroll_before_grad 1.65% : 0.000005s : 36: predicate.make_slice_get_slice_eliminator 0.60% : 0.000002s : 14: predicate.merge_addn 0.65% : 0.000002s : 14: predicate.micro_step_allgather_replace 0.58% : 0.000002s : 14: predicate.mini_step_allgather_replace 0.89% : 0.000003s : 22: predicate.minmaximum_grad 0.97% : 0.000003s : 7: predicate.mutable_eliminate 0.34% : 0.000001s : 7: predicate.opt_reshape 0.35% : 0.000001s : 7: predicate.parallel_virtual_node 1.64% : 0.000005s : 32: predicate.partial_defer_inline 1.77% : 0.000006s : 35: predicate.partial_eliminate 0.91% : 0.000003s : 22: predicate.print_const_string_wrapper 0.64% : 0.000002s : 14: predicate.reduce_all_const_elim 1.28% : 0.000004s : 22: predicate.reduce_eliminate 2.63% : 0.000008s : 64: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 14: predicate.remove_not_recompute_node 1.39% : 0.000004s : 42: predicate.replace_applicator 0.44% : 0.000001s : 14: predicate.replace_old_param 0.25% : 0.000001s : 7: predicate.reset_defer_inline 0.95% : 0.000003s : 22: predicate.reshape_eliminate 0.63% : 0.000002s : 14: predicate.row_tensor_add_zeros_like 0.47% : 0.000002s : 7: predicate.row_tensor_eliminate 0.89% : 0.000003s : 14: predicate.same_eliminate 0.43% : 0.000001s : 14: predicate.set_cell_output_no_recompute 0.78% : 0.000003s : 14: predicate.shard_identity_eliminate 0.70% : 0.000002s : 14: predicate.special_op_eliminate 0.74% : 0.000002s : 14: predicate.specialize_transform 0.82% : 0.000003s : 14: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 14: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 7: predicate.switch_call_monad_eliminater 1.46% : 0.000005s : 32: predicate.switch_defer_inline 2.02% : 0.000006s : 46: predicate.switch_layer_defer_inline 4.66% : 0.000015s : 97: predicate.switch_simplify 0.88% : 0.000003s : 22: predicate.tile_eliminate 0.92% : 0.000003s : 22: predicate.transpose_eliminate 1.58% : 0.000005s : 36: predicate.tuple_list_convert_item_index_to_positive 1.74% : 0.000006s : 36: predicate.tuple_list_get_item_const_eliminator 1.54% : 0.000005s : 36: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000010s : 56: predicate.tuple_list_get_item_eliminator 1.59% : 0.000005s : 36: predicate.tuple_list_get_set_item_eliminator 2.34% : 0.000008s : 50: predicate.tuple_list_set_item_eliminator 1.79% : 0.000006s : 42: predicate.tuple_to_list_eliminator_ 2.61% : 0.000008s : 64: predicate.updatestate_pure_node_eliminater 3.28% : 0.000011s : 78: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 7: predicate.value_based_eliminate 0.66% : 0.000002s : 14: predicate.virtual_dataset_eliminate 0.72% : 0.000002s : 14: predicate.virtual_output_eliminate 0.32% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 7: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000559 11 52.64% : 0.000294s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.36% : 0.000265s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028268 192 0.01% : 0.000003s : 1: ForceFp32Comm 11.22% : 0.003172s : 1: add_attr 11.18% : 0.003162s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000068s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.26% : 0.000073s : 1: auto_monad 0.09% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.80% : 0.000509s : 1: bootstrap 0.11% : 0.000032s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.11% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.10% : 0.000027s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000005s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.54% : 0.000436s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.99% : 0.000562s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.06% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000020s : 1: opt.transform.mutable_eliminate 6.36% : 0.001797s : 78: opt.transform.opt_a 0.17% : 0.000047s : 1: opt.transform.opt_after_cconv 0.13% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.66% : 0.000187s : 28: opt.transform.opt_b 0.26% : 0.000074s : 2: opt.transform.opt_trans_graph 0.19% : 0.000053s : 4: opt.transform.symbol_engine_opt 12.66% : 0.003580s : 1: opt_a 0.49% : 0.000137s : 1: opt_after_cconv 1.70% : 0.000480s : 1: opt_after_jit_grad 1.10% : 0.000312s : 1: opt_b 20.94% : 0.005920s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000010s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000043s : 1: pre_auto_parallel 0.12% : 0.000035s : 1: py_interpret_to_execute 0.06% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.17% : 0.000047s : 1: remove_dup_value 1.61% : 0.000456s : 1: renormalize.infer 1.32% : 0.000374s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000051s : 1: rewriter_after_opt_a 0.36% : 0.000102s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000094s : 1: symbol_engine_optimizer 0.37% : 0.000105s : 1: tuple_transform 20.75% : 0.005866s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:12.711.578 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:12.711.822 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0169096, [21] [bootstrap]: 0.00044693 [type_inference]: 0.0058561 [event_method]: 2.06e-05 [auto_monad]: 6.575e-05 [graph_reusing]: 6.48e-06 [inline]: 2.12999e-06 [add_attr]: 0.00309523, [1] [add_attr_with_inline]: 0.00308647, [1] [Cycle 1]: 7.287e-05, [2] [tag_attr]: 2.06e-05 [meta_addattr_fg_expand]: 6.59001e-06 [parallel-infer-symbol]: 3.00002e-06 [pre_auto_parallel]: 3.605e-05 [insert-virtual-dataset]: 2.55002e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.97999e-06 [pipeline_split]: 1.55001e-06 [optimize]: 0.00617981, [53] [py_interpret_to_execute]: 3.322e-05 [rewriter_before_opt_a]: 9.84e-05 [opt_a]: 0.00367046, [2] [Cycle 1]: 0.00265037, [45] [expand_dump_flag]: 3.01001e-06 [switch_simplify]: 4.752e-05 [loop_unroll]: 3.579e-05 [a_1]: 0.00088509 [with_stream_mark]: 1.78e-05 [recompute_prepare]: 1.183e-05 [updatestate_depend_eliminate]: 5.66e-06 [updatestate_assign_eliminate]: 4.62e-06 [updatestate_loads_eliminate]: 4.08999e-06 [parameter_eliminate]: 2.24999e-06 [a_2]: 0.0001385 [accelerated_algorithm]: 9.00999e-06 [shard]: 2.44001e-06 [meta_shard_fg_expand]: 2.04e-06 [shard_inline]: 9.31e-06 [merge_send_recv]: 9.32999e-06 [auto_parallel]: 7.95e-06 [parallel]: 1.988e-05 [flash_sp]: 8.46002e-06 [merge_comm]: 5.23002e-06 [allreduce_fusion]: 4.85999e-06 [matmul_add_comm_reduction]: 1.028e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 1.056e-05 [virtual_dataset]: 9.06998e-06 [get_grad_eliminate_]: 8.17e-06 [virtual_output]: 8.28999e-06 [merge_forward]: 5.08002e-06 [cell_reuse_recompute_pass]: 1.22999e-06 [offload_activation]: 1.151e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.783e-05 [merge_recompute_call_nodes]: 1.66e-06 [before_grad]: 1.381e-05 [set_forward_comm_id_for_comm_node_pass]: 5.09e-06 [meta_fg_expand]: 4.11001e-06 [flash_sp_send_recv_attached]: 2.63e-06 [receive_attached]: 2.29001e-06 [after_resolve]: 1.555e-05 [a_after_grad]: 1.329e-05 [renormalize]: 0.0007435 [add_forward_monad_depend]: 5.87999e-06 [auto_monad_grad]: 2.41e-06 [auto_monad_eliminator]: 1.693e-05 [cse]: 3.603e-05 [a_3]: 7.809e-05 [Cycle 2]: 0.00100616, [45] [expand_dump_flag]: 1.69e-06 [switch_simplify]: 1.022e-05 [loop_unroll]: 8.68001e-06 [a_1]: 0.00020675 [with_stream_mark]: 1.358e-05 [recompute_prepare]: 9.00999e-06 [updatestate_depend_eliminate]: 4.25999e-06 [updatestate_assign_eliminate]: 3.16999e-06 [updatestate_loads_eliminate]: 3.06001e-06 [parameter_eliminate]: 1.21002e-06 [a_2]: 0.00013031 [accelerated_algorithm]: 8.40999e-06 [shard]: 1.79998e-06 [meta_shard_fg_expand]: 2.47001e-06 [shard_inline]: 1.151e-05 [merge_send_recv]: 6.31998e-06 [auto_parallel]: 7e-06 [parallel]: 4.95999e-06 [flash_sp]: 3.79002e-06 [merge_comm]: 4.38001e-06 [allreduce_fusion]: 4.53001e-06 [matmul_add_comm_reduction]: 7.08e-06 [allreduce_slice_to_reducescatter]: 8.39995e-07 [virtual_shard_identity]: 9.02999e-06 [virtual_dataset]: 8.02998e-06 [get_grad_eliminate_]: 7.75e-06 [virtual_output]: 7.88001e-06 [merge_forward]: 3.95e-06 [cell_reuse_recompute_pass]: 1.67001e-06 [offload_activation]: 8.2e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.769e-05 [merge_recompute_call_nodes]: 1.22999e-06 [before_grad]: 1.361e-05 [set_forward_comm_id_for_comm_node_pass]: 4.74e-06 [meta_fg_expand]: 3.23998e-06 [flash_sp_send_recv_attached]: 8.39995e-07 [receive_attached]: 1.24e-06 [after_resolve]: 1.39e-05 [a_after_grad]: 1.315e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.99e-06 [auto_monad_grad]: 1.37e-06 [auto_monad_eliminator]: 8.68001e-06 [cse]: 2.014e-05 [a_3]: 6.45e-05 [py_interpret_to_execute_after_opt_a]: 1.541e-05 [slice_cell_reuse_recomputed_activation]: 4.67e-06 [rewriter_after_opt_a]: 4.665e-05 [convert_after_rewriter]: 1.138e-05 [order_py_execute_after_rewriter]: 9.19e-06 [mutable_eliminate]: 0.00057541 [opt_b]: 0.00034014, [1] [Cycle 1]: 0.00033039, [7] [b_1]: 0.00022459 [b_2]: 1.037e-05 [updatestate_depend_eliminate]: 7.11999e-06 [updatestate_assign_eliminate]: 3.76001e-06 [updatestate_loads_eliminate]: 3.65998e-06 [renormalize]: 5.8001e-07 [cse]: 2.379e-05 [optimize_parallel_all_gather_comm]: 2.166e-05 [overlap_param_gather]: 4.45999e-06 [cconv]: 2.861e-05 [loop_unroll]: 0.00044184 [opt_after_cconv]: 0.00014744, [1] [Cycle 1]: 0.00013926, [7] [c_1]: 4.332e-05 [parameter_eliminate]: 3.72002e-06 [updatestate_depend_eliminate]: 6.19999e-06 [updatestate_assign_eliminate]: 3.5e-06 [updatestate_loads_eliminate]: 3.26001e-06 [cse]: 2.333e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.852e-05 [tuple_transform]: 0.00010719, [1] [Cycle 1]: 9.952e-05, [4] [d_1]: 5.87e-05 [none_parameter_eliminate]: 1.62999e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 9.27001e-06 [partial_unused_args_eliminate]: 4.52998e-06 [add_recomputation]: 6.008e-05 [cse_after_recomputation]: 3.165e-05, [1] [Cycle 1]: 2.504e-05, [1] [cse]: 1.583e-05 [environ_conv]: 1.027e-05 [swap_dp_allreduce_reducescatter]: 8.77e-06 [bias_add_comm_swap]: 4.81002e-06 [label_micro_interleaved_index]: 7.14001e-06 [label_fine_grained_interleaved_index]: 5.50001e-06 [merge_cast_opt]: 3.54002e-06 [slice_recompute_activation]: 4.38001e-06 [micro_interleaved_order_control]: 4.4e-06 [assign_add_opt]: 3.66999e-06 [ForceFp32Comm]: 3.26999e-06 [remove_cast_before_assign_add]: 3.51001e-06 [full_micro_interleaved_order_control]: 4.72998e-06 [reorder_send_recv_between_fp_bp]: 5.40999e-06 [comm_op_add_attrs]: 3.63999e-06 [add_comm_op_reuse_tag]: 3.43e-06 [interleave_split_concat_branches]: 3.53999e-06 [interleave_parallel_branches]: 3.58e-06 [overlap_opt_shard_in_pipeline]: 3.66001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.3e-06 [control_data_broadcast_order]: 1.841e-05 [grouped_pairwise_exchange_alltoall]: 4.4e-06 [offloading_packed_experts]: 7.35998e-06 [overlap_recompute_and_grad_model_parallel]: 8.13001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.51001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.73999e-06 [overlap_recompute_comm]: 4.42e-06 [overlap_grad_ring_attention]: 6.99001e-06 [overlap_grad_flash_sp]: 2.657e-05 [begin_end_overlap_inline]: 2.94999e-06 [split_matmul_comm_elemetwise]: 4.72998e-06 [split_layernorm_comm]: 4.46002e-06 [handle_group_info]: 3.5e-06 [symbol_engine_optimizer]: 0.00010918, [1] [Cycle 1]: 0.00010224, [6] [build]: 3.64002e-06 [elim_shapecalc]: 1.263e-05 [elim_not_effective]: 1.657e-05 [opt_reshape]: 9.20999e-06 [fold_const_symbol]: 1.31e-05 [renormalize]: 1.8999e-07 [detach_backward]: 4.08001e-06 [pipeline_parallel_scheduler]: 1.82999e-06 [auto_monad_reorder]: 2.292e-05 [get_jit_bprop_graph]: 1.68002e-06 [rewriter_after_jit_bprop_graph]: 4.76997e-06 [opt_after_jit_grad]: 0.00050882 [validate]: 4.498e-05 Sums bootstrap : 0.000447s : 3.71% type_inference : 0.005856s : 48.60% event_method : 0.000021s : 0.17% auto_monad : 0.000066s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000036s : 0.30% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.28% optimize.rewriter_before_opt_a : 0.000098s : 0.82% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000058s : 0.48% optimize.opt_a.loop_unroll : 0.000044s : 0.37% optimize.opt_a.a_1 : 0.001092s : 9.06% optimize.opt_a.with_stream_mark : 0.000031s : 0.26% optimize.opt_a.recompute_prepare : 0.000021s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000269s : 2.23% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.14% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000021s : 0.17% optimize.opt_a.merge_send_recv : 0.000016s : 0.13% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.16% optimize.opt_a.virtual_dataset : 0.000017s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.13% optimize.opt_a.virtual_output : 0.000016s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.24% optimize.opt_a.a_after_grad : 0.000026s : 0.22% optimize.opt_a.renormalize : 0.000744s : 6.17% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.21% optimize.opt_a.cse : 0.000056s : 0.47% optimize.opt_a.a_3 : 0.000143s : 1.18% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000047s : 0.39% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000575s : 4.78% optimize.opt_b.b_1 : 0.000225s : 1.86% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.18% optimize.overlap_param_gather : 0.000004s : 0.04% optimize.cconv : 0.000029s : 0.24% optimize.loop_unroll : 0.000442s : 3.67% optimize.opt_after_cconv.c_1 : 0.000043s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.15% optimize.tuple_transform.d_1 : 0.000059s : 0.49% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000060s : 0.50% optimize.cse_after_recomputation.cse : 0.000016s : 0.13% optimize.environ_conv : 0.000010s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000006s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000004s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000004s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000027s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.19% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000509s : 4.22% validate : 0.000045s : 0.37% Time group info: ------[substitution.] 0.000234 44 9.92% : 0.000023s : 3: substitution.cast_eliminate 1.00% : 0.000002s : 3: substitution.elim_not_effective 0.71% : 0.000002s : 3: substitution.fold_const_symbol 3.10% : 0.000007s : 6: substitution.graph_param_transform 66.97% : 0.000157s : 4: substitution.inline 2.16% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.77% : 0.000006s : 6: substitution.remove_not_recompute_node 2.52% : 0.000006s : 6: substitution.replace_old_param 8.26% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.57% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005805 2 86.82% : 0.005040s : 1: type_inference.infer 13.18% : 0.000765s : 1: type_inference.specialize ------[replace.] 0.000071 10 53.86% : 0.000038s : 4: replace.inline 46.14% : 0.000033s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000170 10 90.30% : 0.000154s : 4: match.inline 9.70% : 0.000017s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000293 1908 0.94% : 0.000003s : 20: predicate.accumulaten_eliminater 0.66% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.54% : 0.000002s : 12: predicate.addn_check_dump 0.92% : 0.000003s : 20: predicate.addn_zero_filter 0.90% : 0.000003s : 20: predicate.adjust_all_reduce_mul_add 1.85% : 0.000005s : 32: predicate.arithmetic_simplify 1.09% : 0.000003s : 20: predicate.cast_eliminate 0.60% : 0.000002s : 12: predicate.check_bprop_eliminate 0.55% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.61% : 0.000002s : 12: predicate.depend_value_elim 1.00% : 0.000003s : 20: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 20: predicate.dict_get_item_eliminator 0.93% : 0.000003s : 20: predicate.dict_set_item_eliminator 1.07% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 6: predicate.elim_not_effective 0.37% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.27% : 0.000004s : 26: predicate.environ_add_const_eliminate 1.18% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.21% : 0.000004s : 26: predicate.environ_get_depend_swap 1.78% : 0.000005s : 38: predicate.environ_get_eliminate 1.21% : 0.000004s : 26: predicate.environ_get_set_eliminate 1.46% : 0.000004s : 30: predicate.exchange_switch_depend_value 2.24% : 0.000007s : 30: predicate.float_depend_g_call 0.55% : 0.000002s : 12: predicate.float_environ_get_switch 0.82% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.21% : 0.000001s : 6: predicate.fold_const_symbol 0.64% : 0.000002s : 12: predicate.get_grad_eliminate 0.21% : 0.000001s : 6: predicate.graph_param_transform 0.61% : 0.000002s : 12: predicate.incorporate_call 0.52% : 0.000002s : 12: predicate.incorporate_call_switch 6.30% : 0.000018s : 86: predicate.inline 0.84% : 0.000002s : 12: predicate.inline_without_move 0.41% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.71% : 0.000002s : 12: predicate.less_batch_normalization 1.90% : 0.000006s : 38: predicate.list_to_tuple_eliminator_ 2.70% : 0.000008s : 58: predicate.load_eliminater 0.78% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.07% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.64% : 0.000005s : 32: predicate.make_slice_get_slice_eliminator 0.57% : 0.000002s : 12: predicate.merge_addn 0.68% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.58% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.85% : 0.000003s : 20: predicate.minmaximum_grad 1.01% : 0.000003s : 6: predicate.mutable_eliminate 0.34% : 0.000001s : 6: predicate.opt_reshape 0.36% : 0.000001s : 6: predicate.parallel_virtual_node 1.85% : 0.000005s : 30: predicate.partial_defer_inline 1.86% : 0.000005s : 32: predicate.partial_eliminate 0.92% : 0.000003s : 20: predicate.print_const_string_wrapper 0.57% : 0.000002s : 12: predicate.reduce_all_const_elim 1.19% : 0.000003s : 20: predicate.reduce_eliminate 2.60% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 12: predicate.remove_not_recompute_node 1.43% : 0.000004s : 38: predicate.replace_applicator 0.49% : 0.000001s : 12: predicate.replace_old_param 0.30% : 0.000001s : 6: predicate.reset_defer_inline 1.02% : 0.000003s : 20: predicate.reshape_eliminate 0.64% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 6: predicate.row_tensor_eliminate 0.80% : 0.000002s : 12: predicate.same_eliminate 0.53% : 0.000002s : 12: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 12: predicate.shard_identity_eliminate 0.64% : 0.000002s : 12: predicate.special_op_eliminate 0.71% : 0.000002s : 12: predicate.specialize_transform 0.80% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.70% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.65% : 0.000005s : 30: predicate.switch_defer_inline 2.08% : 0.000006s : 42: predicate.switch_layer_defer_inline 4.72% : 0.000014s : 90: predicate.switch_simplify 0.90% : 0.000003s : 20: predicate.tile_eliminate 0.89% : 0.000003s : 20: predicate.transpose_eliminate 1.56% : 0.000005s : 32: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000005s : 32: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000004s : 32: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000009s : 50: predicate.tuple_list_get_item_eliminator 1.49% : 0.000004s : 32: predicate.tuple_list_get_set_item_eliminator 2.15% : 0.000006s : 44: predicate.tuple_list_set_item_eliminator 2.00% : 0.000006s : 38: predicate.tuple_to_list_eliminator_ 2.60% : 0.000008s : 58: predicate.updatestate_pure_node_eliminater 3.26% : 0.000010s : 70: predicate.updatestate_useless_node_eliminater 0.50% : 0.000001s : 6: predicate.value_based_eliminate 0.66% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.62% : 0.000002s : 12: predicate.virtual_output_eliminate 0.30% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000550 11 53.64% : 0.000295s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.36% : 0.000255s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028856 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.76% : 0.003104s : 1: add_attr 10.71% : 0.003091s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000064s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000076s : 1: auto_monad 0.10% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.70% : 0.000490s : 1: bootstrap 0.11% : 0.000032s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000019s : 1: detach_backward 0.05% : 0.000013s : 1: environ_conv 0.11% : 0.000031s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.55% : 0.000447s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 2.02% : 0.000582s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 5.81% : 0.001678s : 78: opt.transform.opt_a 0.14% : 0.000042s : 1: opt.transform.opt_after_cconv 0.12% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.56% : 0.000163s : 28: opt.transform.opt_b 0.23% : 0.000066s : 2: opt.transform.opt_trans_graph 0.16% : 0.000047s : 4: opt.transform.symbol_engine_opt 12.73% : 0.003674s : 1: opt_a 0.52% : 0.000151s : 1: opt_after_cconv 1.80% : 0.000519s : 1: opt_after_jit_grad 1.19% : 0.000344s : 1: opt_b 22.67% : 0.006541s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000030s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000007s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000007s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000044s : 1: pre_auto_parallel 0.13% : 0.000037s : 1: py_interpret_to_execute 0.06% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.41% : 0.000406s : 1: renormalize.infer 1.14% : 0.000329s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000050s : 1: rewriter_after_opt_a 0.35% : 0.000102s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000112s : 1: symbol_engine_optimizer 0.38% : 0.000110s : 1: tuple_transform 20.42% : 0.005891s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:12.907.796 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0152624, [21] [bootstrap]: 0.00044605 [type_inference]: 0.00572005 [event_method]: 1.948e-05 [auto_monad]: 6.482e-05 [graph_reusing]: 5.74999e-06 [inline]: 2.65002e-06 [add_attr]: 0.00302905, [1] [add_attr_with_inline]: 0.00302048, [1] [Cycle 1]: 5.665e-05, [2] [tag_attr]: 2.036e-05 [meta_addattr_fg_expand]: 6.48e-06 [parallel-infer-symbol]: 3.36001e-06 [pre_auto_parallel]: 3.343e-05 [insert-virtual-dataset]: 2.24999e-06 [parallel-infer-symbol-second]: 7.79983e-07 [dataset_repeat_opt]: 2.04999e-06 [pipeline_split]: 1.57999e-06 [optimize]: 0.00526049, [53] [py_interpret_to_execute]: 2.824e-05 [rewriter_before_opt_a]: 9.276e-05 [opt_a]: 0.00313936, [2] [Cycle 1]: 0.00228915, [45] [expand_dump_flag]: 2.96999e-06 [switch_simplify]: 4.862e-05 [loop_unroll]: 3.584e-05 [a_1]: 0.00083065 [with_stream_mark]: 1.537e-05 [recompute_prepare]: 1.028e-05 [updatestate_depend_eliminate]: 5.26998e-06 [updatestate_assign_eliminate]: 4.25e-06 [updatestate_loads_eliminate]: 3.58e-06 [parameter_eliminate]: 2.32001e-06 [a_2]: 0.00010831 [accelerated_algorithm]: 8.99e-06 [shard]: 2.32001e-06 [meta_shard_fg_expand]: 1.96e-06 [shard_inline]: 8.82e-06 [merge_send_recv]: 9.10001e-06 [auto_parallel]: 6.68e-06 [parallel]: 1.884e-05 [flash_sp]: 8.17e-06 [merge_comm]: 5.19e-06 [allreduce_fusion]: 4.34002e-06 [matmul_add_comm_reduction]: 1.056e-05 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 1.031e-05 [virtual_dataset]: 8.97e-06 [get_grad_eliminate_]: 8.28001e-06 [virtual_output]: 8.41002e-06 [merge_forward]: 4.62998e-06 [cell_reuse_recompute_pass]: 1.12e-06 [offload_activation]: 1.101e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.606e-05 [merge_recompute_call_nodes]: 2.02999e-06 [before_grad]: 1.332e-05 [set_forward_comm_id_for_comm_node_pass]: 4.41002e-06 [meta_fg_expand]: 3.36001e-06 [flash_sp_send_recv_attached]: 2.52001e-06 [receive_attached]: 2.34999e-06 [after_resolve]: 2.284e-05 [a_after_grad]: 1.338e-05 [renormalize]: 0.00064172 [add_forward_monad_depend]: 5.44e-06 [auto_monad_grad]: 2.26e-06 [auto_monad_eliminator]: 1.634e-05 [cse]: 3.544e-05 [a_3]: 6.066e-05 [Cycle 2]: 0.00084029, [45] [expand_dump_flag]: 1.27999e-06 [switch_simplify]: 9.69e-06 [loop_unroll]: 8.52e-06 [a_1]: 0.00022472 [with_stream_mark]: 1.341e-05 [recompute_prepare]: 8.67e-06 [updatestate_depend_eliminate]: 3.98999e-06 [updatestate_assign_eliminate]: 3.09999e-06 [updatestate_loads_eliminate]: 3.22002e-06 [parameter_eliminate]: 1.14998e-06 [a_2]: 0.00010186 [accelerated_algorithm]: 8.22e-06 [shard]: 1.45001e-06 [meta_shard_fg_expand]: 1.92001e-06 [shard_inline]: 8.13001e-06 [merge_send_recv]: 6.30002e-06 [auto_parallel]: 6.63e-06 [parallel]: 4.39002e-06 [flash_sp]: 3.44001e-06 [merge_comm]: 4.28001e-06 [allreduce_fusion]: 7.21001e-06 [matmul_add_comm_reduction]: 7.07002e-06 [allreduce_slice_to_reducescatter]: 5.59987e-07 [virtual_shard_identity]: 8.89e-06 [virtual_dataset]: 7.88999e-06 [get_grad_eliminate_]: 7.77998e-06 [virtual_output]: 7.53e-06 [merge_forward]: 3.37002e-06 [cell_reuse_recompute_pass]: 1.31998e-06 [offload_activation]: 7.53999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.482e-05 [merge_recompute_call_nodes]: 9.00007e-07 [before_grad]: 1.256e-05 [set_forward_comm_id_for_comm_node_pass]: 4.27e-06 [meta_fg_expand]: 3.03e-06 [flash_sp_send_recv_attached]: 9.20001e-07 [receive_attached]: 1.08001e-06 [after_resolve]: 1.473e-05 [a_after_grad]: 1.237e-05 [renormalize]: 1.09983e-07 [add_forward_monad_depend]: 1.64e-06 [auto_monad_grad]: 1.22e-06 [auto_monad_eliminator]: 9.74e-06 [cse]: 1.888e-05 [a_3]: 4.925e-05 [py_interpret_to_execute_after_opt_a]: 1.114e-05 [slice_cell_reuse_recomputed_activation]: 1.98002e-06 [rewriter_after_opt_a]: 4.011e-05 [convert_after_rewriter]: 7.55e-06 [order_py_execute_after_rewriter]: 5.96e-06 [mutable_eliminate]: 0.00049198 [opt_b]: 0.00026688, [1] [Cycle 1]: 0.0002605, [7] [b_1]: 0.00017876 [b_2]: 1.003e-05 [updatestate_depend_eliminate]: 6.27001e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 3.23998e-06 [renormalize]: 5.50004e-07 [cse]: 2.287e-05 [optimize_parallel_all_gather_comm]: 1.74e-05 [overlap_param_gather]: 1.94999e-06 [cconv]: 2.513e-05 [loop_unroll]: 0.00042177 [opt_after_cconv]: 0.00011799, [1] [Cycle 1]: 0.00011254, [7] [c_1]: 4.16e-05 [parameter_eliminate]: 2.78e-06 [updatestate_depend_eliminate]: 5.71998e-06 [updatestate_assign_eliminate]: 3.23e-06 [updatestate_loads_eliminate]: 3.27002e-06 [cse]: 2.22e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.496e-05 [tuple_transform]: 9.02e-05, [1] [Cycle 1]: 8.587e-05, [4] [d_1]: 5.712e-05 [none_parameter_eliminate]: 1.62001e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 8.75001e-06 [partial_unused_args_eliminate]: 1.71e-06 [add_recomputation]: 5.447e-05 [cse_after_recomputation]: 2.628e-05, [1] [Cycle 1]: 2.164e-05, [1] [cse]: 1.621e-05 [environ_conv]: 6.41e-06 [swap_dp_allreduce_reducescatter]: 6.22001e-06 [bias_add_comm_swap]: 2.61999e-06 [label_micro_interleaved_index]: 4.25999e-06 [label_fine_grained_interleaved_index]: 2.90998e-06 [merge_cast_opt]: 1.27e-06 [slice_recompute_activation]: 1.87001e-06 [micro_interleaved_order_control]: 2.29001e-06 [assign_add_opt]: 1.14e-06 [ForceFp32Comm]: 8.00006e-07 [remove_cast_before_assign_add]: 1.00001e-06 [full_micro_interleaved_order_control]: 2.04e-06 [reorder_send_recv_between_fp_bp]: 2.60002e-06 [comm_op_add_attrs]: 1.38002e-06 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.42e-06 [interleave_parallel_branches]: 1.00001e-06 [overlap_opt_shard_in_pipeline]: 1.19e-06 [overlap_opt_shard_grad_in_pipeline]: 1.76e-06 [control_data_broadcast_order]: 1.411e-05 [grouped_pairwise_exchange_alltoall]: 1.52999e-06 [offloading_packed_experts]: 4.2e-06 [overlap_recompute_and_grad_model_parallel]: 5.22e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.45001e-06 [overlap_recompute_comm]: 2.40002e-06 [overlap_grad_ring_attention]: 4.4e-06 [overlap_grad_flash_sp]: 2.133e-05 [begin_end_overlap_inline]: 5.90022e-07 [split_matmul_comm_elemetwise]: 2.09e-06 [split_layernorm_comm]: 1.75001e-06 [handle_group_info]: 9.20001e-07 [symbol_engine_optimizer]: 8.53e-05, [1] [Cycle 1]: 8.039e-05, [6] [build]: 2.98998e-06 [elim_shapecalc]: 1.151e-05 [elim_not_effective]: 1.586e-05 [opt_reshape]: 9.26998e-06 [fold_const_symbol]: 1.271e-05 [renormalize]: 2.59985e-07 [detach_backward]: 2.15002e-06 [pipeline_parallel_scheduler]: 1.52999e-06 [auto_monad_reorder]: 1.909e-05 [get_jit_bprop_graph]: 1.74e-06 [rewriter_after_jit_bprop_graph]: 3.38999e-06 [opt_after_jit_grad]: 0.00046002 [validate]: 4.146e-05 Sums bootstrap : 0.000446s : 3.95% type_inference : 0.005720s : 50.63% event_method : 0.000019s : 0.17% auto_monad : 0.000065s : 0.57% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.25% optimize.rewriter_before_opt_a : 0.000093s : 0.82% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000058s : 0.52% optimize.opt_a.loop_unroll : 0.000044s : 0.39% optimize.opt_a.a_1 : 0.001055s : 9.34% optimize.opt_a.with_stream_mark : 0.000029s : 0.25% optimize.opt_a.recompute_prepare : 0.000019s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000210s : 1.86% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.15% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000017s : 0.15% optimize.opt_a.merge_send_recv : 0.000015s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.12% optimize.opt_a.parallel : 0.000023s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000012s : 0.10% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.17% optimize.opt_a.virtual_dataset : 0.000017s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.14% optimize.opt_a.virtual_output : 0.000016s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000026s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000038s : 0.33% optimize.opt_a.a_after_grad : 0.000026s : 0.23% optimize.opt_a.renormalize : 0.000642s : 5.68% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.23% optimize.opt_a.cse : 0.000054s : 0.48% optimize.opt_a.a_3 : 0.000110s : 0.97% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000040s : 0.36% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000492s : 4.35% optimize.opt_b.b_1 : 0.000179s : 1.58% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000023s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000025s : 0.22% optimize.loop_unroll : 0.000422s : 3.73% optimize.opt_after_cconv.c_1 : 0.000042s : 0.37% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.13% optimize.tuple_transform.d_1 : 0.000057s : 0.51% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000054s : 0.48% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000003s : 0.03% opt_after_jit_grad : 0.000460s : 4.07% validate : 0.000041s : 0.37% Time group info: ------[substitution.] 0.000218 44 9.75% : 0.000021s : 3: substitution.cast_eliminate 1.07% : 0.000002s : 3: substitution.elim_not_effective 0.74% : 0.000002s : 3: substitution.fold_const_symbol 3.16% : 0.000007s : 6: substitution.graph_param_transform 67.18% : 0.000146s : 4: substitution.inline 1.94% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.66% : 0.000006s : 6: substitution.remove_not_recompute_node 2.86% : 0.000006s : 6: substitution.replace_old_param 8.28% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.37% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005661 2 87.11% : 0.004931s : 1: type_inference.infer 12.89% : 0.000730s : 1: type_inference.specialize ------[replace.] 0.000070 10 52.09% : 0.000037s : 4: replace.inline 47.91% : 0.000034s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000159 10 90.40% : 0.000144s : 4: match.inline 9.60% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000285 1908 0.99% : 0.000003s : 20: predicate.accumulaten_eliminater 0.72% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 12: predicate.addn_check_dump 1.01% : 0.000003s : 20: predicate.addn_zero_filter 0.90% : 0.000003s : 20: predicate.adjust_all_reduce_mul_add 2.00% : 0.000006s : 32: predicate.arithmetic_simplify 1.05% : 0.000003s : 20: predicate.cast_eliminate 0.64% : 0.000002s : 12: predicate.check_bprop_eliminate 0.53% : 0.000001s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.61% : 0.000002s : 12: predicate.depend_value_elim 1.01% : 0.000003s : 20: predicate.dict_get_item_const_eliminator 1.05% : 0.000003s : 20: predicate.dict_get_item_eliminator 0.92% : 0.000003s : 20: predicate.dict_set_item_eliminator 0.83% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 6: predicate.elim_not_effective 0.34% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 26: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 26: predicate.environ_get_depend_swap 1.80% : 0.000005s : 38: predicate.environ_get_eliminate 1.15% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.48% : 0.000004s : 30: predicate.exchange_switch_depend_value 2.19% : 0.000006s : 30: predicate.float_depend_g_call 0.54% : 0.000002s : 12: predicate.float_environ_get_switch 0.85% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 6: predicate.fold_const_symbol 0.68% : 0.000002s : 12: predicate.get_grad_eliminate 0.20% : 0.000001s : 6: predicate.graph_param_transform 0.61% : 0.000002s : 12: predicate.incorporate_call 0.54% : 0.000002s : 12: predicate.incorporate_call_switch 6.08% : 0.000017s : 86: predicate.inline 0.75% : 0.000002s : 12: predicate.inline_without_move 0.36% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.72% : 0.000002s : 12: predicate.less_batch_normalization 1.88% : 0.000005s : 38: predicate.list_to_tuple_eliminator_ 2.80% : 0.000008s : 58: predicate.load_eliminater 0.81% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.19% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.63% : 0.000005s : 32: predicate.make_slice_get_slice_eliminator 0.54% : 0.000002s : 12: predicate.merge_addn 0.61% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.57% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.89% : 0.000003s : 20: predicate.minmaximum_grad 0.79% : 0.000002s : 6: predicate.mutable_eliminate 0.33% : 0.000001s : 6: predicate.opt_reshape 0.38% : 0.000001s : 6: predicate.parallel_virtual_node 1.76% : 0.000005s : 30: predicate.partial_defer_inline 1.83% : 0.000005s : 32: predicate.partial_eliminate 0.95% : 0.000003s : 20: predicate.print_const_string_wrapper 0.58% : 0.000002s : 12: predicate.reduce_all_const_elim 1.28% : 0.000004s : 20: predicate.reduce_eliminate 2.71% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 12: predicate.remove_not_recompute_node 1.45% : 0.000004s : 38: predicate.replace_applicator 0.44% : 0.000001s : 12: predicate.replace_old_param 0.26% : 0.000001s : 6: predicate.reset_defer_inline 0.99% : 0.000003s : 20: predicate.reshape_eliminate 0.58% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 6: predicate.row_tensor_eliminate 0.70% : 0.000002s : 12: predicate.same_eliminate 0.50% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 12: predicate.shard_identity_eliminate 0.74% : 0.000002s : 12: predicate.special_op_eliminate 0.69% : 0.000002s : 12: predicate.specialize_transform 0.72% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.70% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.58% : 0.000004s : 30: predicate.switch_defer_inline 2.21% : 0.000006s : 42: predicate.switch_layer_defer_inline 4.89% : 0.000014s : 90: predicate.switch_simplify 0.94% : 0.000003s : 20: predicate.tile_eliminate 0.97% : 0.000003s : 20: predicate.transpose_eliminate 1.64% : 0.000005s : 32: predicate.tuple_list_convert_item_index_to_positive 1.71% : 0.000005s : 32: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000004s : 32: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000009s : 50: predicate.tuple_list_get_item_eliminator 1.56% : 0.000004s : 32: predicate.tuple_list_get_set_item_eliminator 2.24% : 0.000006s : 44: predicate.tuple_list_set_item_eliminator 1.81% : 0.000005s : 38: predicate.tuple_to_list_eliminator_ 2.63% : 0.000007s : 58: predicate.updatestate_pure_node_eliminater 3.31% : 0.000009s : 70: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 6: predicate.value_based_eliminate 0.63% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.59% : 0.000002s : 12: predicate.virtual_output_eliminate 0.31% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000525 11 54.65% : 0.000287s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.35% : 0.000238s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026071 192 0.01% : 0.000003s : 1: ForceFp32Comm 11.64% : 0.003034s : 1: add_attr 11.60% : 0.003024s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000059s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.27% : 0.000071s : 1: auto_monad 0.09% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.81% : 0.000473s : 1: bootstrap 0.11% : 0.000028s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000017s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.10% : 0.000026s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.65% : 0.000429s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.92% : 0.000500s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000017s : 1: opt.transform.mutable_eliminate 6.25% : 0.001629s : 78: opt.transform.opt_a 0.15% : 0.000040s : 1: opt.transform.opt_after_cconv 0.12% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.60% : 0.000158s : 28: opt.transform.opt_b 0.24% : 0.000063s : 2: opt.transform.opt_trans_graph 0.17% : 0.000045s : 4: opt.transform.symbol_engine_opt 12.05% : 0.003142s : 1: opt_a 0.47% : 0.000122s : 1: opt_after_cconv 1.80% : 0.000469s : 1: opt_after_jit_grad 1.04% : 0.000270s : 1: opt_b 20.19% : 0.005265s : 1: optimize 0.08% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000038s : 1: pre_auto_parallel 0.12% : 0.000033s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.30% : 0.000338s : 1: renormalize.infer 1.14% : 0.000296s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000044s : 1: rewriter_after_opt_a 0.37% : 0.000097s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000088s : 1: symbol_engine_optimizer 0.36% : 0.000093s : 1: tuple_transform 22.00% : 0.005735s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:13.978.21 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:13.980.65 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0156987, [21] [bootstrap]: 0.00044673 [type_inference]: 0.0057253 [event_method]: 2.081e-05 [auto_monad]: 6.154e-05 [graph_reusing]: 5.87999e-06 [inline]: 2.14999e-06 [add_attr]: 0.00299806, [1] [add_attr_with_inline]: 0.00298995, [1] [Cycle 1]: 6.853e-05, [2] [tag_attr]: 1.917e-05 [meta_addattr_fg_expand]: 6.36e-06 [parallel-infer-symbol]: 2.92002e-06 [pre_auto_parallel]: 3.197e-05 [insert-virtual-dataset]: 2.22999e-06 [parallel-infer-symbol-second]: 8.59989e-07 [dataset_repeat_opt]: 1.93002e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.00531436, [53] [py_interpret_to_execute]: 3.132e-05 [rewriter_before_opt_a]: 0.00011535 [opt_a]: 0.00303455, [2] [Cycle 1]: 0.00216041, [45] [expand_dump_flag]: 3.26001e-06 [switch_simplify]: 4.671e-05 [loop_unroll]: 3.472e-05 [a_1]: 0.00071535 [with_stream_mark]: 1.518e-05 [recompute_prepare]: 9.59e-06 [updatestate_depend_eliminate]: 4.06001e-06 [updatestate_assign_eliminate]: 3.37997e-06 [updatestate_loads_eliminate]: 3.02002e-06 [parameter_eliminate]: 1.94999e-06 [a_2]: 0.00011962 [accelerated_algorithm]: 7.47002e-06 [shard]: 2.02999e-06 [meta_shard_fg_expand]: 1.82001e-06 [shard_inline]: 7.00998e-06 [merge_send_recv]: 7.75e-06 [auto_parallel]: 5.96e-06 [parallel]: 1.904e-05 [flash_sp]: 8.47e-06 [merge_comm]: 4.12998e-06 [allreduce_fusion]: 3.35e-06 [matmul_add_comm_reduction]: 9.51e-06 [allreduce_slice_to_reducescatter]: 9.70002e-07 [virtual_shard_identity]: 8.69e-06 [virtual_dataset]: 7.7e-06 [get_grad_eliminate_]: 7e-06 [virtual_output]: 7.49002e-06 [merge_forward]: 3.71001e-06 [cell_reuse_recompute_pass]: 1.25001e-06 [offload_activation]: 9.39e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.504e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.082e-05 [set_forward_comm_id_for_comm_node_pass]: 3.56001e-06 [meta_fg_expand]: 2.81e-06 [flash_sp_send_recv_attached]: 2.53e-06 [receive_attached]: 1.99999e-06 [after_resolve]: 1.297e-05 [a_after_grad]: 1.113e-05 [renormalize]: 0.00053038 [add_forward_monad_depend]: 5.07e-06 [auto_monad_grad]: 2.04999e-06 [auto_monad_eliminator]: 1.351e-05 [cse]: 3.051e-05 [a_3]: 6.421e-05 [Cycle 2]: 0.00086163, [45] [expand_dump_flag]: 1.17e-06 [switch_simplify]: 8.60001e-06 [loop_unroll]: 7.12997e-06 [a_1]: 0.00015715 [with_stream_mark]: 1.061e-05 [recompute_prepare]: 7.06999e-06 [updatestate_depend_eliminate]: 3.05002e-06 [updatestate_assign_eliminate]: 2.80002e-06 [updatestate_loads_eliminate]: 2.21e-06 [parameter_eliminate]: 9.89996e-07 [a_2]: 0.00010725 [accelerated_algorithm]: 6.79001e-06 [shard]: 1.10001e-06 [meta_shard_fg_expand]: 1.32999e-06 [shard_inline]: 6.63e-06 [merge_send_recv]: 4.53999e-06 [auto_parallel]: 5.49998e-06 [parallel]: 4.27e-06 [flash_sp]: 3.22997e-06 [merge_comm]: 3.43e-06 [allreduce_fusion]: 2.91999e-06 [matmul_add_comm_reduction]: 5.37001e-06 [allreduce_slice_to_reducescatter]: 3.60014e-07 [virtual_shard_identity]: 7.41999e-06 [virtual_dataset]: 6.91999e-06 [get_grad_eliminate_]: 6.39001e-06 [virtual_output]: 6.51999e-06 [merge_forward]: 2.98998e-06 [cell_reuse_recompute_pass]: 1.54e-06 [offload_activation]: 6.73e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.523e-05 [merge_recompute_call_nodes]: 8.80013e-07 [before_grad]: 1.062e-05 [set_forward_comm_id_for_comm_node_pass]: 3.40998e-06 [meta_fg_expand]: 2.16998e-06 [flash_sp_send_recv_attached]: 1.09e-06 [receive_attached]: 1.02e-06 [after_resolve]: 1.285e-05 [a_after_grad]: 1.058e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.34e-06 [auto_monad_grad]: 9.20001e-07 [auto_monad_eliminator]: 6.53998e-06 [cse]: 1.37e-05 [a_3]: 5.298e-05 [py_interpret_to_execute_after_opt_a]: 1.158e-05 [slice_cell_reuse_recomputed_activation]: 4.67998e-06 [rewriter_after_opt_a]: 3.841e-05 [convert_after_rewriter]: 9.68997e-06 [order_py_execute_after_rewriter]: 8.03001e-06 [mutable_eliminate]: 0.00047033 [opt_b]: 0.00030877, [1] [Cycle 1]: 0.00029845, [7] [b_1]: 0.00018798 [b_2]: 9.02e-06 [updatestate_depend_eliminate]: 5.00999e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 2.38998e-06 [renormalize]: 4.30009e-07 [cse]: 1.791e-05 [optimize_parallel_all_gather_comm]: 1.979e-05 [overlap_param_gather]: 4.82e-06 [cconv]: 2.668e-05 [loop_unroll]: 0.00042834 [opt_after_cconv]: 0.00012992, [1] [Cycle 1]: 0.00012144, [7] [c_1]: 3.603e-05 [parameter_eliminate]: 2.49999e-06 [updatestate_depend_eliminate]: 4.98001e-06 [updatestate_assign_eliminate]: 2.44999e-06 [updatestate_loads_eliminate]: 2.35002e-06 [cse]: 1.783e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 1.616e-05 [tuple_transform]: 9.507e-05, [1] [Cycle 1]: 8.792e-05, [4] [d_1]: 4.9e-05 [none_parameter_eliminate]: 1.79998e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 7.66999e-06 [partial_unused_args_eliminate]: 4.42998e-06 [add_recomputation]: 4.88e-05 [cse_after_recomputation]: 2.715e-05, [1] [Cycle 1]: 2.054e-05, [1] [cse]: 1.156e-05 [environ_conv]: 7.60003e-06 [swap_dp_allreduce_reducescatter]: 8.15e-06 [bias_add_comm_swap]: 5.29e-06 [label_micro_interleaved_index]: 6.75002e-06 [label_fine_grained_interleaved_index]: 5.04998e-06 [merge_cast_opt]: 4.07e-06 [slice_recompute_activation]: 4.42e-06 [micro_interleaved_order_control]: 4.95999e-06 [assign_add_opt]: 3.52002e-06 [ForceFp32Comm]: 3.14999e-06 [remove_cast_before_assign_add]: 3.26001e-06 [full_micro_interleaved_order_control]: 4.51002e-06 [reorder_send_recv_between_fp_bp]: 5.20999e-06 [comm_op_add_attrs]: 3.46001e-06 [add_comm_op_reuse_tag]: 3.23998e-06 [interleave_split_concat_branches]: 3.68e-06 [interleave_parallel_branches]: 3.58999e-06 [overlap_opt_shard_in_pipeline]: 3.71001e-06 [overlap_opt_shard_grad_in_pipeline]: 4.2e-06 [control_data_broadcast_order]: 1.502e-05 [grouped_pairwise_exchange_alltoall]: 3.91999e-06 [offloading_packed_experts]: 6.23998e-06 [overlap_recompute_and_grad_model_parallel]: 7.55e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.58e-06 [overlap_recompute_allgather_and_fa_grad]: 3.61999e-06 [overlap_recompute_comm]: 4.94e-06 [overlap_grad_ring_attention]: 6.64999e-06 [overlap_grad_flash_sp]: 2.139e-05 [begin_end_overlap_inline]: 2.83e-06 [split_matmul_comm_elemetwise]: 4.67998e-06 [split_layernorm_comm]: 3.9e-06 [handle_group_info]: 3.56999e-06 [symbol_engine_optimizer]: 9.59e-05, [1] [Cycle 1]: 8.942e-05, [6] [build]: 2.99001e-06 [elim_shapecalc]: 9.94001e-06 [elim_not_effective]: 1.312e-05 [opt_reshape]: 7.80998e-06 [fold_const_symbol]: 1.094e-05 [renormalize]: 2.10013e-07 [detach_backward]: 3.2e-06 [pipeline_parallel_scheduler]: 1.64998e-06 [auto_monad_reorder]: 1.988e-05 [get_jit_bprop_graph]: 1.31998e-06 [rewriter_after_jit_bprop_graph]: 3.79002e-06 [opt_after_jit_grad]: 0.00047683 [validate]: 3.501e-05 Sums bootstrap : 0.000447s : 4.06% type_inference : 0.005725s : 52.02% event_method : 0.000021s : 0.19% auto_monad : 0.000062s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000032s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.28% optimize.rewriter_before_opt_a : 0.000115s : 1.05% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.50% optimize.opt_a.loop_unroll : 0.000042s : 0.38% optimize.opt_a.a_1 : 0.000872s : 7.93% optimize.opt_a.with_stream_mark : 0.000026s : 0.23% optimize.opt_a.recompute_prepare : 0.000017s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000227s : 2.06% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.12% optimize.opt_a.merge_send_recv : 0.000012s : 0.11% optimize.opt_a.auto_parallel : 0.000011s : 0.10% optimize.opt_a.parallel : 0.000023s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000006s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.15% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.12% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.23% optimize.opt_a.a_after_grad : 0.000022s : 0.20% optimize.opt_a.renormalize : 0.000530s : 4.82% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000020s : 0.18% optimize.opt_a.cse : 0.000044s : 0.40% optimize.opt_a.a_3 : 0.000117s : 1.06% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000038s : 0.35% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000470s : 4.27% optimize.opt_b.b_1 : 0.000188s : 1.71% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000027s : 0.24% optimize.loop_unroll : 0.000428s : 3.89% optimize.opt_after_cconv.c_1 : 0.000036s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.16% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.15% optimize.tuple_transform.d_1 : 0.000049s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000049s : 0.44% optimize.cse_after_recomputation.cse : 0.000012s : 0.11% optimize.environ_conv : 0.000008s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.05% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000015s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000006s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000021s : 0.19% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000477s : 4.33% validate : 0.000035s : 0.32% Time group info: ------[substitution.] 0.000187 34 0.94% : 0.000002s : 2: substitution.elim_not_effective 0.75% : 0.000001s : 2: substitution.fold_const_symbol 3.25% : 0.000006s : 5: substitution.graph_param_transform 75.07% : 0.000141s : 4: substitution.inline 1.83% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.66% : 0.000005s : 4: substitution.remove_not_recompute_node 2.81% : 0.000005s : 6: substitution.replace_old_param 9.70% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.97% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005676 2 87.03% : 0.004940s : 1: type_inference.infer 12.97% : 0.000736s : 1: type_inference.specialize ------[replace.] 0.000069 10 55.39% : 0.000038s : 4: replace.inline 44.61% : 0.000031s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000154 10 89.88% : 0.000138s : 4: match.inline 10.12% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000228 1590 0.95% : 0.000002s : 16: predicate.accumulaten_eliminater 0.65% : 0.000001s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 10: predicate.addn_check_dump 0.92% : 0.000002s : 16: predicate.addn_zero_filter 0.85% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 1.97% : 0.000004s : 26: predicate.arithmetic_simplify 0.97% : 0.000002s : 16: predicate.cast_eliminate 0.59% : 0.000001s : 10: predicate.check_bprop_eliminate 0.53% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.60% : 0.000001s : 10: predicate.depend_value_elim 0.97% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.20% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.97% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.86% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 5: predicate.elim_not_effective 0.35% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 21: predicate.environ_get_depend_swap 1.78% : 0.000004s : 31: predicate.environ_get_eliminate 1.09% : 0.000002s : 21: predicate.environ_get_set_eliminate 1.55% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.49% : 0.000006s : 26: predicate.float_depend_g_call 0.53% : 0.000001s : 10: predicate.float_environ_get_switch 0.83% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 5: predicate.fold_const_symbol 0.63% : 0.000001s : 10: predicate.get_grad_eliminate 0.23% : 0.000001s : 5: predicate.graph_param_transform 0.58% : 0.000001s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.12% : 0.000014s : 72: predicate.inline 0.72% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.70% : 0.000002s : 10: predicate.less_batch_normalization 1.93% : 0.000004s : 32: predicate.list_to_tuple_eliminator_ 2.86% : 0.000007s : 48: predicate.load_eliminater 0.81% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.44% : 0.000006s : 40: predicate.loop_unroll_before_grad 1.63% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 10: predicate.merge_addn 0.54% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 16: predicate.minmaximum_grad 0.80% : 0.000002s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.34% : 0.000001s : 5: predicate.parallel_virtual_node 1.92% : 0.000004s : 26: predicate.partial_defer_inline 1.86% : 0.000004s : 27: predicate.partial_eliminate 0.93% : 0.000002s : 16: predicate.print_const_string_wrapper 0.59% : 0.000001s : 10: predicate.reduce_all_const_elim 1.18% : 0.000003s : 16: predicate.reduce_eliminate 2.67% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 10: predicate.remove_not_recompute_node 1.49% : 0.000003s : 32: predicate.replace_applicator 0.51% : 0.000001s : 10: predicate.replace_old_param 0.28% : 0.000001s : 5: predicate.reset_defer_inline 0.91% : 0.000002s : 16: predicate.reshape_eliminate 0.59% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.79% : 0.000002s : 10: predicate.same_eliminate 0.44% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.82% : 0.000002s : 10: predicate.shard_identity_eliminate 0.60% : 0.000001s : 10: predicate.special_op_eliminate 0.66% : 0.000001s : 10: predicate.specialize_transform 0.74% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.67% : 0.000004s : 26: predicate.switch_defer_inline 2.13% : 0.000005s : 36: predicate.switch_layer_defer_inline 5.16% : 0.000012s : 81: predicate.switch_simplify 0.90% : 0.000002s : 16: predicate.tile_eliminate 1.01% : 0.000002s : 16: predicate.transpose_eliminate 1.47% : 0.000003s : 26: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000003s : 26: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 26: predicate.tuple_list_get_item_depend_reorder 3.39% : 0.000008s : 42: predicate.tuple_list_get_item_eliminator 1.52% : 0.000003s : 26: predicate.tuple_list_get_set_item_eliminator 2.11% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.95% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.59% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.26% : 0.000007s : 58: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 5: predicate.value_based_eliminate 0.71% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000518 11 56.01% : 0.000290s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.99% : 0.000228s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026073 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.53% : 0.003006s : 1: add_attr 11.48% : 0.002994s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.20% : 0.000053s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000070s : 1: auto_monad 0.11% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000005s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.88% : 0.000491s : 1: bootstrap 0.11% : 0.000030s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.12% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000016s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.12% : 0.000030s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.05% : 0.000012s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.67% : 0.000434s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.83% : 0.000476s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.05% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.05% : 0.000014s : 1: opt.transform.mutable_eliminate 5.17% : 0.001347s : 78: opt.transform.opt_a 0.13% : 0.000035s : 1: opt.transform.opt_after_cconv 0.10% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.48% : 0.000126s : 28: opt.transform.opt_b 0.21% : 0.000054s : 2: opt.transform.opt_trans_graph 0.15% : 0.000039s : 4: opt.transform.symbol_engine_opt 11.65% : 0.003038s : 1: opt_a 0.51% : 0.000133s : 1: opt_after_cconv 1.87% : 0.000487s : 1: opt_after_jit_grad 1.20% : 0.000313s : 1: opt_b 21.56% : 0.005620s : 1: optimize 0.09% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.03% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.04% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000039s : 1: pre_auto_parallel 0.14% : 0.000036s : 1: py_interpret_to_execute 0.06% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.01% : 0.000263s : 1: renormalize.infer 1.00% : 0.000260s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000042s : 1: rewriter_after_opt_a 0.46% : 0.000119s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000099s : 1: symbol_engine_optimizer 0.38% : 0.000098s : 1: tuple_transform 22.08% : 0.005757s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:13.289.517 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0145396, [21] [bootstrap]: 0.0004321 [type_inference]: 0.00568335 [event_method]: 1.9e-05 [auto_monad]: 5.926e-05 [graph_reusing]: 5.67999e-06 [inline]: 1.76998e-06 [add_attr]: 0.00301274, [1] [add_attr_with_inline]: 0.00300418, [1] [Cycle 1]: 5.483e-05, [2] [tag_attr]: 1.937e-05 [meta_addattr_fg_expand]: 6.31e-06 [parallel-infer-symbol]: 2.83e-06 [pre_auto_parallel]: 3.214e-05 [insert-virtual-dataset]: 2.27999e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 1.73002e-06 [pipeline_split]: 1.76e-06 [optimize]: 0.00463968, [53] [py_interpret_to_execute]: 3.194e-05 [rewriter_before_opt_a]: 8.741e-05 [opt_a]: 0.00267275, [2] [Cycle 1]: 0.00197094, [45] [expand_dump_flag]: 3.31001e-06 [switch_simplify]: 4.685e-05 [loop_unroll]: 3.46e-05 [a_1]: 0.00071252 [with_stream_mark]: 1.47e-05 [recompute_prepare]: 9.12999e-06 [updatestate_depend_eliminate]: 3.70998e-06 [updatestate_assign_eliminate]: 3.45998e-06 [updatestate_loads_eliminate]: 2.76999e-06 [parameter_eliminate]: 1.96e-06 [a_2]: 9.02e-05 [accelerated_algorithm]: 7.65e-06 [shard]: 1.87999e-06 [meta_shard_fg_expand]: 1.86e-06 [shard_inline]: 7.41999e-06 [merge_send_recv]: 8.21002e-06 [auto_parallel]: 6.07999e-06 [parallel]: 1.789e-05 [flash_sp]: 7.26001e-06 [merge_comm]: 3.98001e-06 [allreduce_fusion]: 3.96001e-06 [matmul_add_comm_reduction]: 9.09998e-06 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 8.21002e-06 [virtual_dataset]: 7.3e-06 [get_grad_eliminate_]: 7.03998e-06 [virtual_output]: 6.94999e-06 [merge_forward]: 3.6e-06 [cell_reuse_recompute_pass]: 1.15999e-06 [offload_activation]: 9.20001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.318e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.105e-05 [set_forward_comm_id_for_comm_node_pass]: 3.49001e-06 [meta_fg_expand]: 2.76e-06 [flash_sp_send_recv_attached]: 2.63998e-06 [receive_attached]: 2.06003e-06 [after_resolve]: 1.346e-05 [a_after_grad]: 1.083e-05 [renormalize]: 0.00052817 [add_forward_monad_depend]: 5.25999e-06 [auto_monad_grad]: 1.77001e-06 [auto_monad_eliminator]: 1.47e-05 [cse]: 3.127e-05 [a_3]: 4.994e-05 [Cycle 2]: 0.00069265, [45] [expand_dump_flag]: 9.50007e-07 [switch_simplify]: 9.00999e-06 [loop_unroll]: 6.88998e-06 [a_1]: 0.00015954 [with_stream_mark]: 1.045e-05 [recompute_prepare]: 7.04001e-06 [updatestate_depend_eliminate]: 2.94001e-06 [updatestate_assign_eliminate]: 2.75997e-06 [updatestate_loads_eliminate]: 2.22001e-06 [parameter_eliminate]: 1.07998e-06 [a_2]: 8.192e-05 [accelerated_algorithm]: 6.73e-06 [shard]: 1.28002e-06 [meta_shard_fg_expand]: 1.29e-06 [shard_inline]: 6.71999e-06 [merge_send_recv]: 4.48999e-06 [auto_parallel]: 5.26002e-06 [parallel]: 4.25e-06 [flash_sp]: 3.09001e-06 [merge_comm]: 3.21999e-06 [allreduce_fusion]: 3.11999e-06 [matmul_add_comm_reduction]: 5.27001e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 7.15998e-06 [virtual_dataset]: 6.46999e-06 [get_grad_eliminate_]: 6.75002e-06 [virtual_output]: 6.20002e-06 [merge_forward]: 2.81e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 6.43e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.289e-05 [merge_recompute_call_nodes]: 7.49977e-07 [before_grad]: 9.76e-06 [set_forward_comm_id_for_comm_node_pass]: 3.26001e-06 [meta_fg_expand]: 2.03997e-06 [flash_sp_send_recv_attached]: 9.20001e-07 [receive_attached]: 9.5999e-07 [after_resolve]: 1.232e-05 [a_after_grad]: 1.219e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.37999e-06 [auto_monad_grad]: 9.30013e-07 [auto_monad_eliminator]: 7.16999e-06 [cse]: 1.374e-05 [a_3]: 4.216e-05 [py_interpret_to_execute_after_opt_a]: 8.43999e-06 [slice_cell_reuse_recomputed_activation]: 2.04e-06 [rewriter_after_opt_a]: 3.243e-05 [convert_after_rewriter]: 6.85998e-06 [order_py_execute_after_rewriter]: 5.11002e-06 [mutable_eliminate]: 0.00045321 [opt_b]: 0.00022408, [1] [Cycle 1]: 0.00021847, [7] [b_1]: 0.00014564 [b_2]: 8.57e-06 [updatestate_depend_eliminate]: 5.20001e-06 [updatestate_assign_eliminate]: 2.53e-06 [updatestate_loads_eliminate]: 2.67001e-06 [renormalize]: 3.80009e-07 [cse]: 1.799e-05 [optimize_parallel_all_gather_comm]: 1.657e-05 [overlap_param_gather]: 1.87999e-06 [cconv]: 2.199e-05 [loop_unroll]: 0.00042534 [opt_after_cconv]: 0.00010548, [1] [Cycle 1]: 9.978e-05, [7] [c_1]: 3.583e-05 [parameter_eliminate]: 2.66999e-06 [updatestate_depend_eliminate]: 4.98001e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.27001e-06 [cse]: 1.759e-05 [renormalize]: 4.50003e-07 [remove_dup_value]: 1.361e-05 [tuple_transform]: 8.107e-05, [1] [Cycle 1]: 7.685e-05, [4] [d_1]: 4.893e-05 [none_parameter_eliminate]: 1.70001e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 7.41999e-06 [partial_unused_args_eliminate]: 1.84998e-06 [add_recomputation]: 4.477e-05 [cse_after_recomputation]: 2.231e-05, [1] [Cycle 1]: 1.793e-05, [1] [cse]: 1.197e-05 [environ_conv]: 5.14e-06 [swap_dp_allreduce_reducescatter]: 5.09e-06 [bias_add_comm_swap]: 2.86999e-06 [label_micro_interleaved_index]: 4.50001e-06 [label_fine_grained_interleaved_index]: 2.99001e-06 [merge_cast_opt]: 1.27999e-06 [slice_recompute_activation]: 2.24001e-06 [micro_interleaved_order_control]: 2.44001e-06 [assign_add_opt]: 1.15999e-06 [ForceFp32Comm]: 1.05999e-06 [remove_cast_before_assign_add]: 9.79984e-07 [full_micro_interleaved_order_control]: 1.99999e-06 [reorder_send_recv_between_fp_bp]: 2.81e-06 [comm_op_add_attrs]: 9.90025e-07 [add_comm_op_reuse_tag]: 1.05999e-06 [interleave_split_concat_branches]: 1.25999e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.16002e-06 [overlap_opt_shard_grad_in_pipeline]: 2.07999e-06 [control_data_broadcast_order]: 1.176e-05 [grouped_pairwise_exchange_alltoall]: 1.79998e-06 [offloading_packed_experts]: 3.31001e-06 [overlap_recompute_and_grad_model_parallel]: 4.25e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.18001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.36002e-06 [overlap_recompute_comm]: 2.29999e-06 [overlap_grad_ring_attention]: 4.13001e-06 [overlap_grad_flash_sp]: 1.679e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 2.06998e-06 [split_layernorm_comm]: 1.93997e-06 [handle_group_info]: 1.10001e-06 [symbol_engine_optimizer]: 7.593e-05, [1] [Cycle 1]: 7.173e-05, [6] [build]: 2.89999e-06 [elim_shapecalc]: 1.012e-05 [elim_not_effective]: 1.337e-05 [opt_reshape]: 7.33999e-06 [fold_const_symbol]: 1.075e-05 [renormalize]: 2.3999e-07 [detach_backward]: 1.86e-06 [pipeline_parallel_scheduler]: 1.46998e-06 [auto_monad_reorder]: 1.796e-05 [get_jit_bprop_graph]: 1.47999e-06 [rewriter_after_jit_bprop_graph]: 3.33998e-06 [opt_after_jit_grad]: 0.00044474 [validate]: 3.459e-05 Sums bootstrap : 0.000432s : 4.07% type_inference : 0.005683s : 53.57% event_method : 0.000019s : 0.18% auto_monad : 0.000059s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000032s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000032s : 0.30% optimize.rewriter_before_opt_a : 0.000087s : 0.82% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000056s : 0.53% optimize.opt_a.loop_unroll : 0.000041s : 0.39% optimize.opt_a.a_1 : 0.000872s : 8.22% optimize.opt_a.with_stream_mark : 0.000025s : 0.24% optimize.opt_a.recompute_prepare : 0.000016s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000172s : 1.62% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.13% optimize.opt_a.merge_send_recv : 0.000013s : 0.12% optimize.opt_a.auto_parallel : 0.000011s : 0.11% optimize.opt_a.parallel : 0.000022s : 0.21% optimize.opt_a.flash_sp : 0.000010s : 0.10% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.14% optimize.opt_a.virtual_dataset : 0.000014s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.13% optimize.opt_a.virtual_output : 0.000013s : 0.12% optimize.opt_a.merge_forward : 0.000006s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.02% optimize.opt_a.offload_activation : 0.000016s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.24% optimize.opt_a.a_after_grad : 0.000023s : 0.22% optimize.opt_a.renormalize : 0.000528s : 4.98% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.21% optimize.opt_a.cse : 0.000045s : 0.42% optimize.opt_a.a_3 : 0.000092s : 0.87% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000032s : 0.31% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000453s : 4.27% optimize.opt_b.b_1 : 0.000146s : 1.37% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.17% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000022s : 0.21% optimize.loop_unroll : 0.000425s : 4.01% optimize.opt_after_cconv.c_1 : 0.000036s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.17% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.13% optimize.tuple_transform.d_1 : 0.000049s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000045s : 0.42% optimize.cse_after_recomputation.cse : 0.000012s : 0.11% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000003s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000017s : 0.16% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000018s : 0.17% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.03% opt_after_jit_grad : 0.000445s : 4.19% validate : 0.000035s : 0.33% Time group info: ------[substitution.] 0.000182 34 0.96% : 0.000002s : 2: substitution.elim_not_effective 0.71% : 0.000001s : 2: substitution.fold_const_symbol 3.43% : 0.000006s : 5: substitution.graph_param_transform 74.62% : 0.000136s : 4: substitution.inline 1.87% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.67% : 0.000005s : 4: substitution.remove_not_recompute_node 2.87% : 0.000005s : 6: substitution.replace_old_param 10.07% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.81% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005627 2 87.23% : 0.004908s : 1: type_inference.infer 12.77% : 0.000719s : 1: type_inference.specialize ------[replace.] 0.000069 10 54.61% : 0.000037s : 4: replace.inline 45.39% : 0.000031s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000149 10 89.51% : 0.000133s : 4: match.inline 10.49% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000229 1590 0.98% : 0.000002s : 16: predicate.accumulaten_eliminater 0.63% : 0.000001s : 5: predicate.ad_related_special_op_eliminate 0.59% : 0.000001s : 10: predicate.addn_check_dump 0.93% : 0.000002s : 16: predicate.addn_zero_filter 0.84% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 1.95% : 0.000004s : 26: predicate.arithmetic_simplify 0.95% : 0.000002s : 16: predicate.cast_eliminate 0.60% : 0.000001s : 10: predicate.check_bprop_eliminate 0.50% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.57% : 0.000001s : 10: predicate.depend_value_elim 0.98% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.05% : 0.000002s : 16: predicate.dict_get_item_eliminator 1.02% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 5: predicate.elim_not_effective 0.35% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 21: predicate.environ_get_depend_swap 1.74% : 0.000004s : 31: predicate.environ_get_eliminate 1.09% : 0.000002s : 21: predicate.environ_get_set_eliminate 1.56% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.45% : 0.000006s : 26: predicate.float_depend_g_call 0.49% : 0.000001s : 10: predicate.float_environ_get_switch 0.77% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 5: predicate.fold_const_symbol 0.70% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000000s : 5: predicate.graph_param_transform 0.55% : 0.000001s : 10: predicate.incorporate_call 0.47% : 0.000001s : 10: predicate.incorporate_call_switch 6.19% : 0.000014s : 72: predicate.inline 0.71% : 0.000002s : 10: predicate.inline_without_move 0.37% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.72% : 0.000002s : 10: predicate.less_batch_normalization 1.96% : 0.000004s : 32: predicate.list_to_tuple_eliminator_ 2.80% : 0.000006s : 48: predicate.load_eliminater 0.80% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.39% : 0.000005s : 40: predicate.loop_unroll_before_grad 1.60% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 10: predicate.merge_addn 0.57% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 16: predicate.minmaximum_grad 0.80% : 0.000002s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.35% : 0.000001s : 5: predicate.parallel_virtual_node 1.85% : 0.000004s : 26: predicate.partial_defer_inline 1.82% : 0.000004s : 27: predicate.partial_eliminate 0.88% : 0.000002s : 16: predicate.print_const_string_wrapper 0.56% : 0.000001s : 10: predicate.reduce_all_const_elim 1.09% : 0.000002s : 16: predicate.reduce_eliminate 2.59% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 10: predicate.remove_not_recompute_node 1.51% : 0.000003s : 32: predicate.replace_applicator 0.50% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 0.94% : 0.000002s : 16: predicate.reshape_eliminate 0.57% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.81% : 0.000002s : 10: predicate.same_eliminate 0.46% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.71% : 0.000002s : 10: predicate.shard_identity_eliminate 0.63% : 0.000001s : 10: predicate.special_op_eliminate 0.73% : 0.000002s : 10: predicate.specialize_transform 0.75% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.63% : 0.000004s : 26: predicate.switch_defer_inline 2.21% : 0.000005s : 36: predicate.switch_layer_defer_inline 5.24% : 0.000012s : 81: predicate.switch_simplify 0.90% : 0.000002s : 16: predicate.tile_eliminate 0.92% : 0.000002s : 16: predicate.transpose_eliminate 1.54% : 0.000004s : 26: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 26: predicate.tuple_list_get_item_depend_reorder 3.44% : 0.000008s : 42: predicate.tuple_list_get_item_eliminator 1.53% : 0.000003s : 26: predicate.tuple_list_get_set_item_eliminator 2.33% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.83% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.65% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.22% : 0.000007s : 58: predicate.updatestate_useless_node_eliminater 0.65% : 0.000001s : 5: predicate.value_based_eliminate 0.61% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000517 11 55.25% : 0.000286s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.75% : 0.000232s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024254 192 0.02% : 0.000004s : 1: ForceFp32Comm 12.44% : 0.003017s : 1: add_attr 12.40% : 0.003008s : 1: add_attr_with_inline 0.02% : 0.000004s : 1: add_comm_op_reuse_tag 0.20% : 0.000049s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000064s : 1: auto_monad 0.09% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.89% : 0.000459s : 1: bootstrap 0.11% : 0.000026s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000015s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.10% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.79% : 0.000433s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.90% : 0.000461s : 1: mutable_eliminate 0.03% : 0.000006s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 5.54% : 0.001344s : 78: opt.transform.opt_a 0.14% : 0.000034s : 1: opt.transform.opt_after_cconv 0.11% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.51% : 0.000124s : 28: opt.transform.opt_b 0.22% : 0.000054s : 2: opt.transform.opt_trans_graph 0.16% : 0.000038s : 4: opt.transform.symbol_engine_opt 11.03% : 0.002676s : 1: opt_a 0.45% : 0.000109s : 1: opt_after_cconv 1.87% : 0.000453s : 1: opt_after_jit_grad 0.94% : 0.000227s : 1: opt_b 19.15% : 0.004644s : 1: optimize 0.08% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.08% : 0.000020s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000036s : 1: pre_auto_parallel 0.15% : 0.000036s : 1: py_interpret_to_execute 0.05% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000017s : 1: remove_dup_value 1.08% : 0.000263s : 1: renormalize.infer 1.06% : 0.000258s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000037s : 1: rewriter_after_opt_a 0.38% : 0.000091s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000079s : 1: symbol_engine_optimizer 0.35% : 0.000084s : 1: tuple_transform 23.50% : 0.005699s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:13.475.195 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:13.475.441 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0157047, [21] [bootstrap]: 0.0004387 [type_inference]: 0.00567935 [event_method]: 1.969e-05 [auto_monad]: 6.07e-05 [graph_reusing]: 5.82001e-06 [inline]: 2.03002e-06 [add_attr]: 0.00299092, [1] [add_attr_with_inline]: 0.00298312, [1] [Cycle 1]: 6.931e-05, [2] [tag_attr]: 1.949e-05 [meta_addattr_fg_expand]: 6.05002e-06 [parallel-infer-symbol]: 2.93e-06 [pre_auto_parallel]: 3.271e-05 [insert-virtual-dataset]: 2.25002e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.97999e-06 [pipeline_split]: 1.72999e-06 [optimize]: 0.00538309, [53] [py_interpret_to_execute]: 3.984e-05 [rewriter_before_opt_a]: 9.365e-05 [opt_a]: 0.00307883, [2] [Cycle 1]: 0.00221022, [45] [expand_dump_flag]: 3.01001e-06 [switch_simplify]: 5.787e-05 [loop_unroll]: 3.572e-05 [a_1]: 0.00071493 [with_stream_mark]: 1.747e-05 [recompute_prepare]: 8.83001e-06 [updatestate_depend_eliminate]: 4.1e-06 [updatestate_assign_eliminate]: 3.36001e-06 [updatestate_loads_eliminate]: 2.78e-06 [parameter_eliminate]: 2.04e-06 [a_2]: 0.0001186 [accelerated_algorithm]: 7.82e-06 [shard]: 1.74998e-06 [meta_shard_fg_expand]: 2.04e-06 [shard_inline]: 7.2e-06 [merge_send_recv]: 8.81997e-06 [auto_parallel]: 6.05002e-06 [parallel]: 1.768e-05 [flash_sp]: 8.75999e-06 [merge_comm]: 4.32e-06 [allreduce_fusion]: 3.54002e-06 [matmul_add_comm_reduction]: 1.035e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 8.89003e-06 [virtual_dataset]: 7.90998e-06 [get_grad_eliminate_]: 7.24001e-06 [virtual_output]: 7.06999e-06 [merge_forward]: 3.81001e-06 [cell_reuse_recompute_pass]: 1.09e-06 [offload_activation]: 9.82001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.501e-05 [merge_recompute_call_nodes]: 1.69998e-06 [before_grad]: 1.084e-05 [set_forward_comm_id_for_comm_node_pass]: 3.66999e-06 [meta_fg_expand]: 2.89001e-06 [flash_sp_send_recv_attached]: 2.66999e-06 [receive_attached]: 2.44001e-06 [after_resolve]: 1.345e-05 [a_after_grad]: 1.145e-05 [renormalize]: 0.00055442 [add_forward_monad_depend]: 4.82e-06 [auto_monad_grad]: 2.21998e-06 [auto_monad_eliminator]: 1.477e-05 [cse]: 3.195e-05 [a_3]: 6.584e-05 [Cycle 2]: 0.00085444, [45] [expand_dump_flag]: 1.67001e-06 [switch_simplify]: 8.44002e-06 [loop_unroll]: 7.23e-06 [a_1]: 0.00015706 [with_stream_mark]: 1.165e-05 [recompute_prepare]: 7.15e-06 [updatestate_depend_eliminate]: 3.27997e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.17999e-06 [parameter_eliminate]: 1.04e-06 [a_2]: 0.00010814 [accelerated_algorithm]: 6.95002e-06 [shard]: 1.34998e-06 [meta_shard_fg_expand]: 1.30001e-06 [shard_inline]: 6.71e-06 [merge_send_recv]: 4.94e-06 [auto_parallel]: 5.10999e-06 [parallel]: 5.14e-06 [flash_sp]: 3.27002e-06 [merge_comm]: 3.30003e-06 [allreduce_fusion]: 3.07002e-06 [matmul_add_comm_reduction]: 5.89e-06 [allreduce_slice_to_reducescatter]: 3.30008e-07 [virtual_shard_identity]: 7.42002e-06 [virtual_dataset]: 6.82002e-06 [get_grad_eliminate_]: 6.37001e-06 [virtual_output]: 6.58e-06 [merge_forward]: 2.61e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 6.19999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.54e-05 [merge_recompute_call_nodes]: 1.24e-06 [before_grad]: 1.009e-05 [set_forward_comm_id_for_comm_node_pass]: 3.44001e-06 [meta_fg_expand]: 2.22001e-06 [flash_sp_send_recv_attached]: 8.50006e-07 [receive_attached]: 1.05999e-06 [after_resolve]: 1.215e-05 [a_after_grad]: 1.121e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.12999e-06 [auto_monad_grad]: 9.70002e-07 [auto_monad_eliminator]: 7.05998e-06 [cse]: 1.389e-05 [a_3]: 5.315e-05 [py_interpret_to_execute_after_opt_a]: 1.292e-05 [slice_cell_reuse_recomputed_activation]: 4.80001e-06 [rewriter_after_opt_a]: 3.998e-05 [convert_after_rewriter]: 9.71998e-06 [order_py_execute_after_rewriter]: 7.87e-06 [mutable_eliminate]: 0.00049961 [opt_b]: 0.00029516, [1] [Cycle 1]: 0.0002855, [7] [b_1]: 0.00019064 [b_2]: 8.99e-06 [updatestate_depend_eliminate]: 5.29e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.78998e-06 [renormalize]: 5.69999e-07 [cse]: 1.76e-05 [optimize_parallel_all_gather_comm]: 1.911e-05 [overlap_param_gather]: 4.87e-06 [cconv]: 4.081e-05 [loop_unroll]: 0.0004285 [opt_after_cconv]: 0.0001296, [1] [Cycle 1]: 0.00012115, [7] [c_1]: 3.57e-05 [parameter_eliminate]: 2.44001e-06 [updatestate_depend_eliminate]: 5.35001e-06 [updatestate_assign_eliminate]: 2.55002e-06 [updatestate_loads_eliminate]: 2.32999e-06 [cse]: 1.71e-05 [renormalize]: 6.39993e-07 [remove_dup_value]: 1.637e-05 [tuple_transform]: 9.589e-05, [1] [Cycle 1]: 8.881e-05, [4] [d_1]: 4.864e-05 [none_parameter_eliminate]: 1.49e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 7.75e-06 [partial_unused_args_eliminate]: 4.85999e-06 [add_recomputation]: 4.869e-05 [cse_after_recomputation]: 2.791e-05, [1] [Cycle 1]: 2.093e-05, [1] [cse]: 1.187e-05 [environ_conv]: 8.16002e-06 [swap_dp_allreduce_reducescatter]: 8.33001e-06 [bias_add_comm_swap]: 4.63999e-06 [label_micro_interleaved_index]: 7.08998e-06 [label_fine_grained_interleaved_index]: 5.39998e-06 [merge_cast_opt]: 3.65e-06 [slice_recompute_activation]: 4.3e-06 [micro_interleaved_order_control]: 4.77998e-06 [assign_add_opt]: 3.51999e-06 [ForceFp32Comm]: 3.13e-06 [remove_cast_before_assign_add]: 3.3e-06 [full_micro_interleaved_order_control]: 4.4e-06 [reorder_send_recv_between_fp_bp]: 5.25001e-06 [comm_op_add_attrs]: 3.27002e-06 [add_comm_op_reuse_tag]: 3.48e-06 [interleave_split_concat_branches]: 3.44001e-06 [interleave_parallel_branches]: 3.36999e-06 [overlap_opt_shard_in_pipeline]: 3.38e-06 [overlap_opt_shard_grad_in_pipeline]: 4.33001e-06 [control_data_broadcast_order]: 1.539e-05 [grouped_pairwise_exchange_alltoall]: 4.14002e-06 [offloading_packed_experts]: 6.04999e-06 [overlap_recompute_and_grad_model_parallel]: 7.21999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.80998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.73001e-06 [overlap_recompute_comm]: 5.32999e-06 [overlap_grad_ring_attention]: 6.46999e-06 [overlap_grad_flash_sp]: 2.138e-05 [begin_end_overlap_inline]: 3.18e-06 [split_matmul_comm_elemetwise]: 4.35e-06 [split_layernorm_comm]: 4.08999e-06 [handle_group_info]: 3.29001e-06 [symbol_engine_optimizer]: 9.73e-05, [1] [Cycle 1]: 9.042e-05, [6] [build]: 2.59999e-06 [elim_shapecalc]: 9.52001e-06 [elim_not_effective]: 1.321e-05 [opt_reshape]: 8.60001e-06 [fold_const_symbol]: 1.136e-05 [renormalize]: 2.19996e-07 [detach_backward]: 3.61001e-06 [pipeline_parallel_scheduler]: 1.87001e-06 [auto_monad_reorder]: 2.028e-05 [get_jit_bprop_graph]: 1.52999e-06 [rewriter_after_jit_bprop_graph]: 4.50001e-06 [opt_after_jit_grad]: 0.00047057 [validate]: 3.489e-05 Sums bootstrap : 0.000439s : 3.98% type_inference : 0.005679s : 51.49% event_method : 0.000020s : 0.18% auto_monad : 0.000061s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000040s : 0.36% optimize.rewriter_before_opt_a : 0.000094s : 0.85% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000066s : 0.60% optimize.opt_a.loop_unroll : 0.000043s : 0.39% optimize.opt_a.a_1 : 0.000872s : 7.91% optimize.opt_a.with_stream_mark : 0.000029s : 0.26% optimize.opt_a.recompute_prepare : 0.000016s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000227s : 2.06% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.12% optimize.opt_a.auto_parallel : 0.000011s : 0.10% optimize.opt_a.parallel : 0.000023s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.15% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.12% optimize.opt_a.virtual_output : 0.000014s : 0.12% optimize.opt_a.merge_forward : 0.000006s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.02% optimize.opt_a.offload_activation : 0.000016s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.03% optimize.opt_a.before_grad : 0.000021s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.23% optimize.opt_a.a_after_grad : 0.000023s : 0.21% optimize.opt_a.renormalize : 0.000554s : 5.03% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.05% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.20% optimize.opt_a.cse : 0.000046s : 0.42% optimize.opt_a.a_3 : 0.000119s : 1.08% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000040s : 0.36% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000500s : 4.53% optimize.opt_b.b_1 : 0.000191s : 1.73% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000018s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000041s : 0.37% optimize.loop_unroll : 0.000429s : 3.89% optimize.opt_after_cconv.c_1 : 0.000036s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.16% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000016s : 0.15% optimize.tuple_transform.d_1 : 0.000049s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000049s : 0.44% optimize.cse_after_recomputation.cse : 0.000012s : 0.11% optimize.environ_conv : 0.000008s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000015s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000006s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000006s : 0.06% optimize.overlap_grad_flash_sp : 0.000021s : 0.19% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.18% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000471s : 4.27% validate : 0.000035s : 0.32% Time group info: ------[substitution.] 0.000191 34 0.98% : 0.000002s : 2: substitution.elim_not_effective 0.86% : 0.000002s : 2: substitution.fold_const_symbol 3.19% : 0.000006s : 5: substitution.graph_param_transform 75.67% : 0.000144s : 4: substitution.inline 1.93% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.39% : 0.000005s : 4: substitution.remove_not_recompute_node 2.80% : 0.000005s : 6: substitution.replace_old_param 9.37% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.81% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005630 2 86.92% : 0.004893s : 1: type_inference.infer 13.08% : 0.000737s : 1: type_inference.specialize ------[replace.] 0.000068 10 53.15% : 0.000036s : 4: replace.inline 46.85% : 0.000032s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000157 10 90.30% : 0.000142s : 4: match.inline 9.70% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000232 1590 0.89% : 0.000002s : 16: predicate.accumulaten_eliminater 0.65% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 10: predicate.addn_check_dump 1.02% : 0.000002s : 16: predicate.addn_zero_filter 0.82% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.01% : 0.000005s : 26: predicate.arithmetic_simplify 0.85% : 0.000002s : 16: predicate.cast_eliminate 0.58% : 0.000001s : 10: predicate.check_bprop_eliminate 0.52% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.55% : 0.000001s : 10: predicate.depend_value_elim 0.98% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.11% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.34% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.10% : 0.000003s : 21: predicate.environ_get_depend_swap 1.71% : 0.000004s : 31: predicate.environ_get_eliminate 1.09% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.54% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.52% : 0.000006s : 26: predicate.float_depend_g_call 0.54% : 0.000001s : 10: predicate.float_environ_get_switch 0.79% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.65% : 0.000001s : 10: predicate.get_grad_eliminate 0.23% : 0.000001s : 5: predicate.graph_param_transform 0.62% : 0.000001s : 10: predicate.incorporate_call 0.50% : 0.000001s : 10: predicate.incorporate_call_switch 6.34% : 0.000015s : 72: predicate.inline 0.81% : 0.000002s : 10: predicate.inline_without_move 0.34% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.76% : 0.000002s : 10: predicate.less_batch_normalization 1.97% : 0.000005s : 32: predicate.list_to_tuple_eliminator_ 2.66% : 0.000006s : 48: predicate.load_eliminater 0.75% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.55% : 0.000006s : 40: predicate.loop_unroll_before_grad 1.56% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 10: predicate.merge_addn 0.57% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.88% : 0.000002s : 16: predicate.minmaximum_grad 0.83% : 0.000002s : 5: predicate.mutable_eliminate 0.48% : 0.000001s : 5: predicate.opt_reshape 0.35% : 0.000001s : 5: predicate.parallel_virtual_node 2.00% : 0.000005s : 26: predicate.partial_defer_inline 1.86% : 0.000004s : 27: predicate.partial_eliminate 0.92% : 0.000002s : 16: predicate.print_const_string_wrapper 0.60% : 0.000001s : 10: predicate.reduce_all_const_elim 1.17% : 0.000003s : 16: predicate.reduce_eliminate 2.66% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 10: predicate.remove_not_recompute_node 1.52% : 0.000004s : 32: predicate.replace_applicator 0.50% : 0.000001s : 10: predicate.replace_old_param 0.28% : 0.000001s : 5: predicate.reset_defer_inline 0.90% : 0.000002s : 16: predicate.reshape_eliminate 0.58% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.31% : 0.000001s : 5: predicate.row_tensor_eliminate 0.69% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.88% : 0.000002s : 10: predicate.shard_identity_eliminate 0.65% : 0.000001s : 10: predicate.special_op_eliminate 0.80% : 0.000002s : 10: predicate.specialize_transform 0.76% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.63% : 0.000004s : 26: predicate.switch_defer_inline 2.25% : 0.000005s : 36: predicate.switch_layer_defer_inline 5.25% : 0.000012s : 81: predicate.switch_simplify 0.92% : 0.000002s : 16: predicate.tile_eliminate 0.92% : 0.000002s : 16: predicate.transpose_eliminate 1.58% : 0.000004s : 26: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000003s : 26: predicate.tuple_list_get_item_depend_reorder 3.21% : 0.000007s : 42: predicate.tuple_list_get_item_eliminator 1.41% : 0.000003s : 26: predicate.tuple_list_get_set_item_eliminator 2.06% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.84% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.61% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.28% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 5: predicate.value_based_eliminate 0.57% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000548 11 56.58% : 0.000310s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.42% : 0.000238s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026175 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.46% : 0.002999s : 1: add_attr 11.41% : 0.002987s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.20% : 0.000053s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000069s : 1: auto_monad 0.11% : 0.000028s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000007s : 1: bias_add_comm_swap 1.84% : 0.000482s : 1: bootstrap 0.17% : 0.000044s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.12% : 0.000031s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000019s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.11% : 0.000029s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.66% : 0.000434s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.93% : 0.000506s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.05% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 5.20% : 0.001361s : 78: opt.transform.opt_a 0.13% : 0.000034s : 1: opt.transform.opt_after_cconv 0.10% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.49% : 0.000127s : 28: opt.transform.opt_b 0.21% : 0.000054s : 2: opt.transform.opt_trans_graph 0.15% : 0.000039s : 4: opt.transform.symbol_engine_opt 11.77% : 0.003082s : 1: opt_a 0.51% : 0.000133s : 1: opt_after_cconv 1.84% : 0.000481s : 1: opt_after_jit_grad 1.14% : 0.000299s : 1: opt_b 21.75% : 0.005693s : 1: optimize 0.09% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000009s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000040s : 1: pre_auto_parallel 0.17% : 0.000044s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.09% : 0.000286s : 1: renormalize.infer 0.99% : 0.000260s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000044s : 1: rewriter_after_opt_a 0.37% : 0.000097s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000100s : 1: symbol_engine_optimizer 0.38% : 0.000099s : 1: tuple_transform 21.83% : 0.005713s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:13.668.976 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0146282, [21] [bootstrap]: 0.00043772 [type_inference]: 0.00559542 [event_method]: 1.963e-05 [auto_monad]: 6.107e-05 [graph_reusing]: 6.07001e-06 [inline]: 2.42001e-06 [add_attr]: 0.00301921, [1] [add_attr_with_inline]: 0.0030113, [1] [Cycle 1]: 5.605e-05, [2] [tag_attr]: 2.098e-05 [meta_addattr_fg_expand]: 6.17001e-06 [parallel-infer-symbol]: 3.08998e-06 [pre_auto_parallel]: 3.284e-05 [insert-virtual-dataset]: 2.26998e-06 [parallel-infer-symbol-second]: 9.39996e-07 [dataset_repeat_opt]: 2.03002e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00477119, [53] [py_interpret_to_execute]: 2.751e-05 [rewriter_before_opt_a]: 8.823e-05 [opt_a]: 0.00276146, [2] [Cycle 1]: 0.0020562, [45] [expand_dump_flag]: 2.96999e-06 [switch_simplify]: 4.707e-05 [loop_unroll]: 3.461e-05 [a_1]: 0.0007148 [with_stream_mark]: 1.453e-05 [recompute_prepare]: 8.76002e-06 [updatestate_depend_eliminate]: 3.63999e-06 [updatestate_assign_eliminate]: 3.16001e-06 [updatestate_loads_eliminate]: 2.93998e-06 [parameter_eliminate]: 1.79998e-06 [a_2]: 9.055e-05 [accelerated_algorithm]: 8.17e-06 [shard]: 2.33002e-06 [meta_shard_fg_expand]: 1.67999e-06 [shard_inline]: 7.51999e-06 [merge_send_recv]: 8.62e-06 [auto_parallel]: 5.86003e-06 [parallel]: 1.759e-05 [flash_sp]: 7.38e-06 [merge_comm]: 4.02998e-06 [allreduce_fusion]: 3.49001e-06 [matmul_add_comm_reduction]: 9.17999e-06 [allreduce_slice_to_reducescatter]: 6.90023e-07 [virtual_shard_identity]: 8.37998e-06 [virtual_dataset]: 7.1e-06 [get_grad_eliminate_]: 6.73e-06 [virtual_output]: 6.81001e-06 [merge_forward]: 3.70998e-06 [cell_reuse_recompute_pass]: 1.12e-06 [offload_activation]: 9.52001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.315e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.11e-05 [set_forward_comm_id_for_comm_node_pass]: 3.47997e-06 [meta_fg_expand]: 2.88e-06 [flash_sp_send_recv_attached]: 3.06001e-06 [receive_attached]: 2.33002e-06 [after_resolve]: 1.423e-05 [a_after_grad]: 1.179e-05 [renormalize]: 0.00060612 [add_forward_monad_depend]: 5.25999e-06 [auto_monad_grad]: 1.99e-06 [auto_monad_eliminator]: 1.363e-05 [cse]: 3.182e-05 [a_3]: 5.082e-05 [Cycle 2]: 0.00069563, [45] [expand_dump_flag]: 1.17e-06 [switch_simplify]: 8.46002e-06 [loop_unroll]: 7.11001e-06 [a_1]: 0.00015905 [with_stream_mark]: 1.153e-05 [recompute_prepare]: 7.36999e-06 [updatestate_depend_eliminate]: 2.84999e-06 [updatestate_assign_eliminate]: 2.98e-06 [updatestate_loads_eliminate]: 2.21e-06 [parameter_eliminate]: 1.00001e-06 [a_2]: 8.113e-05 [accelerated_algorithm]: 7.06999e-06 [shard]: 1.10001e-06 [meta_shard_fg_expand]: 1.50001e-06 [shard_inline]: 6.73e-06 [merge_send_recv]: 5.10001e-06 [auto_parallel]: 5.30001e-06 [parallel]: 4.61002e-06 [flash_sp]: 3.61999e-06 [merge_comm]: 3.28e-06 [allreduce_fusion]: 3.08e-06 [matmul_add_comm_reduction]: 5.26002e-06 [allreduce_slice_to_reducescatter]: 3.70026e-07 [virtual_shard_identity]: 7.46001e-06 [virtual_dataset]: 6.39999e-06 [get_grad_eliminate_]: 8.85001e-06 [virtual_output]: 6.19999e-06 [merge_forward]: 2.64999e-06 [cell_reuse_recompute_pass]: 1.33002e-06 [offload_activation]: 6.31998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.3e-05 [merge_recompute_call_nodes]: 9.40025e-07 [before_grad]: 9.49e-06 [set_forward_comm_id_for_comm_node_pass]: 3.19001e-06 [meta_fg_expand]: 2.36e-06 [flash_sp_send_recv_attached]: 8.70001e-07 [receive_attached]: 1.05999e-06 [after_resolve]: 1.321e-05 [a_after_grad]: 1.088e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.35001e-06 [auto_monad_grad]: 7.80012e-07 [auto_monad_eliminator]: 6.48998e-06 [cse]: 1.468e-05 [a_3]: 4.013e-05 [py_interpret_to_execute_after_opt_a]: 9.72999e-06 [slice_cell_reuse_recomputed_activation]: 2.11e-06 [rewriter_after_opt_a]: 3.301e-05 [convert_after_rewriter]: 6.51999e-06 [order_py_execute_after_rewriter]: 5.01997e-06 [mutable_eliminate]: 0.00047096 [opt_b]: 0.00022487, [1] [Cycle 1]: 0.00021852, [7] [b_1]: 0.00014528 [b_2]: 8.42e-06 [updatestate_depend_eliminate]: 5.46e-06 [updatestate_assign_eliminate]: 2.44001e-06 [updatestate_loads_eliminate]: 2.73e-06 [renormalize]: 4.69998e-07 [cse]: 1.81e-05 [optimize_parallel_all_gather_comm]: 1.547e-05 [overlap_param_gather]: 1.89999e-06 [cconv]: 2.367e-05 [loop_unroll]: 0.00043894 [opt_after_cconv]: 0.00010599, [1] [Cycle 1]: 0.00010045, [7] [c_1]: 3.596e-05 [parameter_eliminate]: 2.68e-06 [updatestate_depend_eliminate]: 4.96002e-06 [updatestate_assign_eliminate]: 2.57001e-06 [updatestate_loads_eliminate]: 2.37001e-06 [cse]: 1.793e-05 [renormalize]: 2.69996e-07 [remove_dup_value]: 1.314e-05 [tuple_transform]: 8.336e-05, [1] [Cycle 1]: 7.877e-05, [4] [d_1]: 4.954e-05 [none_parameter_eliminate]: 1.77001e-06 [renormalize]: 7.2e-07 [switch_simplify]: 7.85e-06 [partial_unused_args_eliminate]: 1.72999e-06 [add_recomputation]: 4.765e-05 [cse_after_recomputation]: 2.196e-05, [1] [Cycle 1]: 1.75e-05, [1] [cse]: 1.168e-05 [environ_conv]: 5.02e-06 [swap_dp_allreduce_reducescatter]: 5.47999e-06 [bias_add_comm_swap]: 2.89001e-06 [label_micro_interleaved_index]: 4.47e-06 [label_fine_grained_interleaved_index]: 2.74999e-06 [merge_cast_opt]: 1.44998e-06 [slice_recompute_activation]: 2.13002e-06 [micro_interleaved_order_control]: 2.49001e-06 [assign_add_opt]: 1.15999e-06 [ForceFp32Comm]: 8.70001e-07 [remove_cast_before_assign_add]: 1.01002e-06 [full_micro_interleaved_order_control]: 1.97999e-06 [reorder_send_recv_between_fp_bp]: 2.78998e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.08001e-06 [overlap_opt_shard_in_pipeline]: 1.14e-06 [overlap_opt_shard_grad_in_pipeline]: 1.77999e-06 [control_data_broadcast_order]: 1.354e-05 [grouped_pairwise_exchange_alltoall]: 1.74998e-06 [offloading_packed_experts]: 3.73999e-06 [overlap_recompute_and_grad_model_parallel]: 4.32998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.18001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32999e-06 [overlap_recompute_comm]: 2.31e-06 [overlap_grad_ring_attention]: 3.9e-06 [overlap_grad_flash_sp]: 1.826e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 1.97999e-06 [split_layernorm_comm]: 1.76e-06 [handle_group_info]: 8.89995e-07 [symbol_engine_optimizer]: 7.806e-05, [1] [Cycle 1]: 7.396e-05, [6] [build]: 2.65002e-06 [elim_shapecalc]: 1.08e-05 [elim_not_effective]: 1.323e-05 [opt_reshape]: 8.03001e-06 [fold_const_symbol]: 1.085e-05 [renormalize]: 1.69995e-07 [detach_backward]: 1.94999e-06 [pipeline_parallel_scheduler]: 1.84e-06 [auto_monad_reorder]: 1.896e-05 [get_jit_bprop_graph]: 1.42999e-06 [rewriter_after_jit_bprop_graph]: 3.90998e-06 [opt_after_jit_grad]: 0.00046616 [validate]: 3.765e-05 Sums bootstrap : 0.000438s : 4.10% type_inference : 0.005595s : 52.39% event_method : 0.000020s : 0.18% auto_monad : 0.000061s : 0.57% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.31% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000028s : 0.26% optimize.rewriter_before_opt_a : 0.000088s : 0.83% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000056s : 0.52% optimize.opt_a.loop_unroll : 0.000042s : 0.39% optimize.opt_a.a_1 : 0.000874s : 8.18% optimize.opt_a.with_stream_mark : 0.000026s : 0.24% optimize.opt_a.recompute_prepare : 0.000016s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000172s : 1.61% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.13% optimize.opt_a.auto_parallel : 0.000011s : 0.10% optimize.opt_a.parallel : 0.000022s : 0.21% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.15% optimize.opt_a.virtual_dataset : 0.000013s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.15% optimize.opt_a.virtual_output : 0.000013s : 0.12% optimize.opt_a.merge_forward : 0.000006s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.02% optimize.opt_a.offload_activation : 0.000016s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.24% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.26% optimize.opt_a.a_after_grad : 0.000023s : 0.21% optimize.opt_a.renormalize : 0.000606s : 5.68% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000020s : 0.19% optimize.opt_a.cse : 0.000047s : 0.44% optimize.opt_a.a_3 : 0.000091s : 0.85% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000033s : 0.31% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000471s : 4.41% optimize.opt_b.b_1 : 0.000145s : 1.36% optimize.opt_b.b_2 : 0.000008s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.17% optimize.optimize_parallel_all_gather_comm : 0.000015s : 0.14% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.22% optimize.loop_unroll : 0.000439s : 4.11% optimize.opt_after_cconv.c_1 : 0.000036s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.17% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.12% optimize.tuple_transform.d_1 : 0.000050s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000001s : 0.01% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000048s : 0.45% optimize.cse_after_recomputation.cse : 0.000012s : 0.11% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000018s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000019s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000466s : 4.36% validate : 0.000038s : 0.35% Time group info: ------[substitution.] 0.000188 34 0.98% : 0.000002s : 2: substitution.elim_not_effective 0.71% : 0.000001s : 2: substitution.fold_const_symbol 3.20% : 0.000006s : 5: substitution.graph_param_transform 75.59% : 0.000142s : 4: substitution.inline 1.75% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.39% : 0.000004s : 4: substitution.remove_not_recompute_node 2.91% : 0.000005s : 6: substitution.replace_old_param 9.67% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.80% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005540 2 87.07% : 0.004823s : 1: type_inference.infer 12.93% : 0.000717s : 1: type_inference.specialize ------[replace.] 0.000070 10 54.84% : 0.000038s : 4: replace.inline 45.16% : 0.000032s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000155 10 89.96% : 0.000139s : 4: match.inline 10.04% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000230 1590 0.93% : 0.000002s : 16: predicate.accumulaten_eliminater 0.67% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 10: predicate.addn_check_dump 0.91% : 0.000002s : 16: predicate.addn_zero_filter 0.84% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 1.90% : 0.000004s : 26: predicate.arithmetic_simplify 0.88% : 0.000002s : 16: predicate.cast_eliminate 0.59% : 0.000001s : 10: predicate.check_bprop_eliminate 0.51% : 0.000001s : 10: predicate.compare_switch_simplify 0.21% : 0.000000s : 5: predicate.const_output_eliminate 0.57% : 0.000001s : 10: predicate.depend_value_elim 1.03% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.06% : 0.000002s : 16: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.88% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.37% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.17% : 0.000003s : 21: predicate.environ_get_depend_swap 1.69% : 0.000004s : 31: predicate.environ_get_eliminate 1.08% : 0.000002s : 21: predicate.environ_get_set_eliminate 1.55% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.52% : 0.000006s : 26: predicate.float_depend_g_call 0.56% : 0.000001s : 10: predicate.float_environ_get_switch 0.77% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.63% : 0.000001s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.58% : 0.000001s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.13% : 0.000014s : 72: predicate.inline 0.76% : 0.000002s : 10: predicate.inline_without_move 0.38% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.95% : 0.000002s : 10: predicate.less_batch_normalization 1.95% : 0.000004s : 32: predicate.list_to_tuple_eliminator_ 2.82% : 0.000006s : 48: predicate.load_eliminater 0.82% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.42% : 0.000006s : 40: predicate.loop_unroll_before_grad 1.63% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 10: predicate.merge_addn 0.51% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 16: predicate.minmaximum_grad 0.87% : 0.000002s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.47% : 0.000001s : 5: predicate.parallel_virtual_node 1.95% : 0.000004s : 26: predicate.partial_defer_inline 1.85% : 0.000004s : 27: predicate.partial_eliminate 0.98% : 0.000002s : 16: predicate.print_const_string_wrapper 0.57% : 0.000001s : 10: predicate.reduce_all_const_elim 1.09% : 0.000002s : 16: predicate.reduce_eliminate 2.67% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 10: predicate.remove_not_recompute_node 1.48% : 0.000003s : 32: predicate.replace_applicator 0.53% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 1.05% : 0.000002s : 16: predicate.reshape_eliminate 0.61% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.31% : 0.000001s : 5: predicate.row_tensor_eliminate 0.82% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.69% : 0.000002s : 10: predicate.shard_identity_eliminate 0.68% : 0.000002s : 10: predicate.special_op_eliminate 0.67% : 0.000002s : 10: predicate.specialize_transform 0.77% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.60% : 0.000004s : 26: predicate.switch_defer_inline 2.19% : 0.000005s : 36: predicate.switch_layer_defer_inline 5.16% : 0.000012s : 81: predicate.switch_simplify 0.93% : 0.000002s : 16: predicate.tile_eliminate 0.93% : 0.000002s : 16: predicate.transpose_eliminate 1.49% : 0.000003s : 26: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 26: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000007s : 42: predicate.tuple_list_get_item_eliminator 1.45% : 0.000003s : 26: predicate.tuple_list_get_set_item_eliminator 2.09% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.86% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.64% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.33% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 5: predicate.value_based_eliminate 0.65% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.64% : 0.000001s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000518 11 54.42% : 0.000282s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.58% : 0.000236s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024559 192 0.02% : 0.000004s : 1: ForceFp32Comm 12.31% : 0.003024s : 1: add_attr 12.28% : 0.003015s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.21% : 0.000052s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000066s : 1: auto_monad 0.09% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.90% : 0.000466s : 1: bootstrap 0.11% : 0.000027s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000017s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.10% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000005s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.82% : 0.000447s : 1: loop_unroll 0.02% : 0.000005s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.95% : 0.000480s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 5.49% : 0.001348s : 78: opt.transform.opt_a 0.14% : 0.000035s : 1: opt.transform.opt_after_cconv 0.11% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.50% : 0.000123s : 28: opt.transform.opt_b 0.22% : 0.000055s : 2: opt.transform.opt_trans_graph 0.16% : 0.000038s : 4: opt.transform.symbol_engine_opt 11.26% : 0.002764s : 1: opt_a 0.45% : 0.000109s : 1: opt_after_cconv 1.93% : 0.000475s : 1: opt_after_jit_grad 0.93% : 0.000228s : 1: opt_b 19.44% : 0.004775s : 1: optimize 0.08% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000021s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000037s : 1: pre_auto_parallel 0.13% : 0.000032s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000017s : 1: remove_dup_value 1.32% : 0.000325s : 1: renormalize.infer 1.12% : 0.000274s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000037s : 1: rewriter_after_opt_a 0.38% : 0.000092s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000081s : 1: symbol_engine_optimizer 0.35% : 0.000086s : 1: tuple_transform 22.84% : 0.005609s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:13.855.412 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:13.855.682 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0164516, [21] [bootstrap]: 0.00042988 [type_inference]: 0.00565804 [event_method]: 2.076e-05 [auto_monad]: 6.362e-05 [graph_reusing]: 6.33e-06 [inline]: 1.89e-06 [add_attr]: 0.00311316, [1] [add_attr_with_inline]: 0.00310502, [1] [Cycle 1]: 7.65e-05, [2] [tag_attr]: 2.108e-05 [meta_addattr_fg_expand]: 7.05e-06 [parallel-infer-symbol]: 3.26001e-06 [pre_auto_parallel]: 3.59e-05 [insert-virtual-dataset]: 2.32999e-06 [parallel-infer-symbol-second]: 7.80012e-07 [dataset_repeat_opt]: 2.20002e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.00598666, [53] [py_interpret_to_execute]: 3.425e-05 [rewriter_before_opt_a]: 9.91e-05 [opt_a]: 0.00358232, [2] [Cycle 1]: 0.00257395, [45] [expand_dump_flag]: 2.78998e-06 [switch_simplify]: 4.73e-05 [loop_unroll]: 3.522e-05 [a_1]: 0.00084447 [with_stream_mark]: 1.773e-05 [recompute_prepare]: 1.273e-05 [updatestate_depend_eliminate]: 4.80001e-06 [updatestate_assign_eliminate]: 4.34002e-06 [updatestate_loads_eliminate]: 3.61999e-06 [parameter_eliminate]: 2.19999e-06 [a_2]: 0.00013947 [accelerated_algorithm]: 9.57999e-06 [shard]: 1.99e-06 [meta_shard_fg_expand]: 2.34001e-06 [shard_inline]: 9.22999e-06 [merge_send_recv]: 9.57999e-06 [auto_parallel]: 7.42002e-06 [parallel]: 1.924e-05 [flash_sp]: 9.12999e-06 [merge_comm]: 5.17999e-06 [allreduce_fusion]: 4.42e-06 [matmul_add_comm_reduction]: 1.088e-05 [allreduce_slice_to_reducescatter]: 9.10019e-07 [virtual_shard_identity]: 1.051e-05 [virtual_dataset]: 9.20001e-06 [get_grad_eliminate_]: 8.84998e-06 [virtual_output]: 8.97e-06 [merge_forward]: 4.81002e-06 [cell_reuse_recompute_pass]: 1.22e-06 [offload_activation]: 1.095e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.889e-05 [merge_recompute_call_nodes]: 1.60001e-06 [before_grad]: 1.389e-05 [set_forward_comm_id_for_comm_node_pass]: 5.23002e-06 [meta_fg_expand]: 3.31001e-06 [flash_sp_send_recv_attached]: 2.46e-06 [receive_attached]: 2.29001e-06 [after_resolve]: 1.553e-05 [a_after_grad]: 2.285e-05 [renormalize]: 0.00069226 [add_forward_monad_depend]: 5.89e-06 [auto_monad_grad]: 2.54999e-06 [auto_monad_eliminator]: 1.642e-05 [cse]: 3.604e-05 [a_3]: 7.601e-05 [Cycle 2]: 0.00099445, [45] [expand_dump_flag]: 1.45999e-06 [switch_simplify]: 1.013e-05 [loop_unroll]: 8.24002e-06 [a_1]: 0.00020447 [with_stream_mark]: 1.349e-05 [recompute_prepare]: 8.70999e-06 [updatestate_depend_eliminate]: 4.03001e-06 [updatestate_assign_eliminate]: 3.16999e-06 [updatestate_loads_eliminate]: 3.27002e-06 [parameter_eliminate]: 1.15999e-06 [a_2]: 0.00012928 [accelerated_algorithm]: 8.33999e-06 [shard]: 1.47999e-06 [meta_shard_fg_expand]: 1.81e-06 [shard_inline]: 1.139e-05 [merge_send_recv]: 6.42001e-06 [auto_parallel]: 7.01001e-06 [parallel]: 4.55999e-06 [flash_sp]: 3.24001e-06 [merge_comm]: 4.22003e-06 [allreduce_fusion]: 3.97e-06 [matmul_add_comm_reduction]: 7.98001e-06 [allreduce_slice_to_reducescatter]: 5.59987e-07 [virtual_shard_identity]: 9.78002e-06 [virtual_dataset]: 8.30999e-06 [get_grad_eliminate_]: 7.66999e-06 [virtual_output]: 8.1e-06 [merge_forward]: 5.49e-06 [cell_reuse_recompute_pass]: 1.51002e-06 [offload_activation]: 8.50001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.711e-05 [merge_recompute_call_nodes]: 8.00006e-07 [before_grad]: 1.348e-05 [set_forward_comm_id_for_comm_node_pass]: 4.50001e-06 [meta_fg_expand]: 3.05002e-06 [flash_sp_send_recv_attached]: 1.02e-06 [receive_attached]: 1.02e-06 [after_resolve]: 1.416e-05 [a_after_grad]: 1.336e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.34998e-06 [auto_monad_grad]: 1.04e-06 [auto_monad_eliminator]: 9.20999e-06 [cse]: 1.831e-05 [a_3]: 6.409e-05 [py_interpret_to_execute_after_opt_a]: 1.389e-05 [slice_cell_reuse_recomputed_activation]: 5.58002e-06 [rewriter_after_opt_a]: 4.409e-05 [convert_after_rewriter]: 1.109e-05 [order_py_execute_after_rewriter]: 8.92999e-06 [mutable_eliminate]: 0.00049811 [opt_b]: 0.00033659, [1] [Cycle 1]: 0.00032749, [7] [b_1]: 0.00022521 [b_2]: 1.129e-05 [updatestate_depend_eliminate]: 6.22001e-06 [updatestate_assign_eliminate]: 3.45998e-06 [updatestate_loads_eliminate]: 3.6e-06 [renormalize]: 4.30009e-07 [cse]: 2.216e-05 [optimize_parallel_all_gather_comm]: 2.014e-05 [overlap_param_gather]: 4.66002e-06 [cconv]: 2.678e-05 [loop_unroll]: 0.00042797 [opt_after_cconv]: 0.00014327, [1] [Cycle 1]: 0.00013501, [7] [c_1]: 4.241e-05 [parameter_eliminate]: 2.62001e-06 [updatestate_depend_eliminate]: 6.21e-06 [updatestate_assign_eliminate]: 3.33998e-06 [updatestate_loads_eliminate]: 3.07002e-06 [cse]: 2.208e-05 [renormalize]: 4.7998e-07 [remove_dup_value]: 1.778e-05 [tuple_transform]: 0.00010442, [1] [Cycle 1]: 9.769e-05, [4] [d_1]: 5.754e-05 [none_parameter_eliminate]: 1.66e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 8.89e-06 [partial_unused_args_eliminate]: 4.28001e-06 [add_recomputation]: 5.883e-05 [cse_after_recomputation]: 3.247e-05, [1] [Cycle 1]: 2.5e-05, [1] [cse]: 1.605e-05 [environ_conv]: 9.61e-06 [swap_dp_allreduce_reducescatter]: 8.79e-06 [bias_add_comm_swap]: 5.10999e-06 [label_micro_interleaved_index]: 6.75998e-06 [label_fine_grained_interleaved_index]: 5.30999e-06 [merge_cast_opt]: 3.93001e-06 [slice_recompute_activation]: 4.27998e-06 [micro_interleaved_order_control]: 4.73001e-06 [assign_add_opt]: 3.47002e-06 [ForceFp32Comm]: 3.09001e-06 [remove_cast_before_assign_add]: 3.76999e-06 [full_micro_interleaved_order_control]: 4.67e-06 [reorder_send_recv_between_fp_bp]: 5.29e-06 [comm_op_add_attrs]: 3.40998e-06 [add_comm_op_reuse_tag]: 3.25e-06 [interleave_split_concat_branches]: 3.47002e-06 [interleave_parallel_branches]: 3.43e-06 [overlap_opt_shard_in_pipeline]: 3.40998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.38999e-06 [control_data_broadcast_order]: 1.677e-05 [grouped_pairwise_exchange_alltoall]: 3.91999e-06 [offloading_packed_experts]: 6.79999e-06 [overlap_recompute_and_grad_model_parallel]: 7.78001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.57997e-06 [overlap_recompute_allgather_and_fa_grad]: 3.7e-06 [overlap_recompute_comm]: 4.2e-06 [overlap_grad_ring_attention]: 1.921e-05 [overlap_grad_flash_sp]: 2.531e-05 [begin_end_overlap_inline]: 3.26001e-06 [split_matmul_comm_elemetwise]: 4.74e-06 [split_layernorm_comm]: 4.27e-06 [handle_group_info]: 3.30003e-06 [symbol_engine_optimizer]: 0.0001068, [1] [Cycle 1]: 9.989e-05, [6] [build]: 3.2e-06 [elim_shapecalc]: 1.226e-05 [elim_not_effective]: 1.629e-05 [opt_reshape]: 9.27999e-06 [fold_const_symbol]: 1.293e-05 [renormalize]: 1.8999e-07 [detach_backward]: 3.04001e-06 [pipeline_parallel_scheduler]: 1.52999e-06 [auto_monad_reorder]: 2.194e-05 [get_jit_bprop_graph]: 1.60999e-06 [rewriter_after_jit_bprop_graph]: 4.24002e-06 [opt_after_jit_grad]: 0.00048779 [validate]: 4.152e-05 Sums bootstrap : 0.000430s : 3.70% type_inference : 0.005658s : 48.71% event_method : 0.000021s : 0.18% auto_monad : 0.000064s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000036s : 0.31% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000034s : 0.29% optimize.rewriter_before_opt_a : 0.000099s : 0.85% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000057s : 0.49% optimize.opt_a.loop_unroll : 0.000043s : 0.37% optimize.opt_a.a_1 : 0.001049s : 9.03% optimize.opt_a.with_stream_mark : 0.000031s : 0.27% optimize.opt_a.recompute_prepare : 0.000021s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000269s : 2.31% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000021s : 0.18% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000024s : 0.20% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.17% optimize.opt_a.virtual_dataset : 0.000018s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.14% optimize.opt_a.virtual_output : 0.000017s : 0.15% optimize.opt_a.merge_forward : 0.000010s : 0.09% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000030s : 0.26% optimize.opt_a.a_after_grad : 0.000036s : 0.31% optimize.opt_a.renormalize : 0.000692s : 5.96% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.22% optimize.opt_a.cse : 0.000054s : 0.47% optimize.opt_a.a_3 : 0.000140s : 1.21% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.05% optimize.rewriter_after_opt_a : 0.000044s : 0.38% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000498s : 4.29% optimize.opt_b.b_1 : 0.000225s : 1.94% optimize.opt_b.b_2 : 0.000011s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000027s : 0.23% optimize.loop_unroll : 0.000428s : 3.68% optimize.opt_after_cconv.c_1 : 0.000042s : 0.37% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.15% optimize.tuple_transform.d_1 : 0.000058s : 0.50% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000059s : 0.51% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000003s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000004s : 0.04% optimize.overlap_grad_ring_attention : 0.000019s : 0.17% optimize.overlap_grad_flash_sp : 0.000025s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.19% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000488s : 4.20% validate : 0.000042s : 0.36% Time group info: ------[substitution.] 0.000225 44 10.28% : 0.000023s : 3: substitution.cast_eliminate 0.96% : 0.000002s : 3: substitution.elim_not_effective 0.73% : 0.000002s : 3: substitution.fold_const_symbol 2.97% : 0.000007s : 6: substitution.graph_param_transform 66.86% : 0.000151s : 4: substitution.inline 2.26% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.69% : 0.000006s : 6: substitution.remove_not_recompute_node 2.56% : 0.000006s : 6: substitution.replace_old_param 8.37% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.32% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005610 2 87.04% : 0.004883s : 1: type_inference.infer 12.96% : 0.000727s : 1: type_inference.specialize ------[replace.] 0.000074 10 52.95% : 0.000039s : 4: replace.inline 47.05% : 0.000035s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000164 10 90.20% : 0.000148s : 4: match.inline 9.80% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000287 1908 0.94% : 0.000003s : 20: predicate.accumulaten_eliminater 0.68% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.54% : 0.000002s : 12: predicate.addn_check_dump 1.04% : 0.000003s : 20: predicate.addn_zero_filter 0.88% : 0.000003s : 20: predicate.adjust_all_reduce_mul_add 1.95% : 0.000006s : 32: predicate.arithmetic_simplify 1.06% : 0.000003s : 20: predicate.cast_eliminate 0.59% : 0.000002s : 12: predicate.check_bprop_eliminate 0.55% : 0.000002s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.59% : 0.000002s : 12: predicate.depend_value_elim 1.10% : 0.000003s : 20: predicate.dict_get_item_const_eliminator 1.07% : 0.000003s : 20: predicate.dict_get_item_eliminator 0.93% : 0.000003s : 20: predicate.dict_set_item_eliminator 0.90% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 6: predicate.elim_not_effective 0.38% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.26% : 0.000004s : 26: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.17% : 0.000003s : 26: predicate.environ_get_depend_swap 1.76% : 0.000005s : 38: predicate.environ_get_eliminate 1.17% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.46% : 0.000004s : 30: predicate.exchange_switch_depend_value 2.27% : 0.000007s : 30: predicate.float_depend_g_call 0.55% : 0.000002s : 12: predicate.float_environ_get_switch 0.80% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 6: predicate.fold_const_symbol 0.72% : 0.000002s : 12: predicate.get_grad_eliminate 0.22% : 0.000001s : 6: predicate.graph_param_transform 0.62% : 0.000002s : 12: predicate.incorporate_call 0.53% : 0.000002s : 12: predicate.incorporate_call_switch 6.20% : 0.000018s : 86: predicate.inline 0.88% : 0.000003s : 12: predicate.inline_without_move 0.35% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.82% : 0.000002s : 12: predicate.less_batch_normalization 1.90% : 0.000005s : 38: predicate.list_to_tuple_eliminator_ 2.67% : 0.000008s : 58: predicate.load_eliminater 0.73% : 0.000002s : 6: predicate.loop_unroll_after_grad 1.98% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.62% : 0.000005s : 32: predicate.make_slice_get_slice_eliminator 0.57% : 0.000002s : 12: predicate.merge_addn 0.57% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.89% : 0.000003s : 20: predicate.minmaximum_grad 0.77% : 0.000002s : 6: predicate.mutable_eliminate 0.35% : 0.000001s : 6: predicate.opt_reshape 0.39% : 0.000001s : 6: predicate.parallel_virtual_node 1.77% : 0.000005s : 30: predicate.partial_defer_inline 1.80% : 0.000005s : 32: predicate.partial_eliminate 0.92% : 0.000003s : 20: predicate.print_const_string_wrapper 0.63% : 0.000002s : 12: predicate.reduce_all_const_elim 1.14% : 0.000003s : 20: predicate.reduce_eliminate 2.65% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 12: predicate.remove_not_recompute_node 1.44% : 0.000004s : 38: predicate.replace_applicator 0.50% : 0.000001s : 12: predicate.replace_old_param 0.24% : 0.000001s : 6: predicate.reset_defer_inline 0.96% : 0.000003s : 20: predicate.reshape_eliminate 0.63% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 6: predicate.row_tensor_eliminate 0.69% : 0.000002s : 12: predicate.same_eliminate 0.53% : 0.000002s : 12: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 12: predicate.shard_identity_eliminate 0.75% : 0.000002s : 12: predicate.special_op_eliminate 0.74% : 0.000002s : 12: predicate.specialize_transform 0.75% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.52% : 0.000004s : 30: predicate.switch_defer_inline 2.05% : 0.000006s : 42: predicate.switch_layer_defer_inline 4.65% : 0.000013s : 90: predicate.switch_simplify 0.92% : 0.000003s : 20: predicate.tile_eliminate 0.98% : 0.000003s : 20: predicate.transpose_eliminate 1.55% : 0.000004s : 32: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000005s : 32: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 32: predicate.tuple_list_get_item_depend_reorder 3.30% : 0.000009s : 50: predicate.tuple_list_get_item_eliminator 1.62% : 0.000005s : 32: predicate.tuple_list_get_set_item_eliminator 2.20% : 0.000006s : 44: predicate.tuple_list_set_item_eliminator 1.81% : 0.000005s : 38: predicate.tuple_to_list_eliminator_ 2.61% : 0.000007s : 58: predicate.updatestate_pure_node_eliminater 3.36% : 0.000010s : 70: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 6: predicate.value_based_eliminate 0.79% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 12: predicate.virtual_output_eliminate 0.33% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000527 11 53.58% : 0.000283s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.42% : 0.000245s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028135 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.10% : 0.003122s : 1: add_attr 11.05% : 0.003109s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000063s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000073s : 1: auto_monad 0.10% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.68% : 0.000473s : 1: bootstrap 0.11% : 0.000030s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000036s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000017s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.11% : 0.000030s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000007s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.54% : 0.000434s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.79% : 0.000504s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 5.84% : 0.001644s : 78: opt.transform.opt_a 0.14% : 0.000041s : 1: opt.transform.opt_after_cconv 0.11% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.58% : 0.000163s : 28: opt.transform.opt_b 0.23% : 0.000064s : 2: opt.transform.opt_trans_graph 0.17% : 0.000047s : 4: opt.transform.symbol_engine_opt 12.74% : 0.003585s : 1: opt_a 0.52% : 0.000147s : 1: opt_after_cconv 1.77% : 0.000498s : 1: opt_after_jit_grad 1.21% : 0.000340s : 1: opt_b 22.45% : 0.006318s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.08% : 0.000023s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000007s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000044s : 1: pre_auto_parallel 0.14% : 0.000038s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.07% : 0.000021s : 1: remove_dup_value 1.32% : 0.000370s : 1: renormalize.infer 1.11% : 0.000313s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000047s : 1: rewriter_after_opt_a 0.37% : 0.000103s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000110s : 1: symbol_engine_optimizer 0.38% : 0.000107s : 1: tuple_transform 20.22% : 0.005690s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:14.488.86 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0152007, [21] [bootstrap]: 0.0004321 [type_inference]: 0.00562877 [event_method]: 1.968e-05 [auto_monad]: 6.308e-05 [graph_reusing]: 5.49998e-06 [inline]: 2.07001e-06 [add_attr]: 0.00303213, [1] [add_attr_with_inline]: 0.00302381, [1] [Cycle 1]: 5.956e-05, [2] [tag_attr]: 2.141e-05 [meta_addattr_fg_expand]: 6.40002e-06 [parallel-infer-symbol]: 3.08e-06 [pre_auto_parallel]: 3.468e-05 [insert-virtual-dataset]: 2.29999e-06 [parallel-infer-symbol-second]: 8.70001e-07 [dataset_repeat_opt]: 2.29001e-06 [pipeline_split]: 1.84e-06 [optimize]: 0.00528389, [53] [py_interpret_to_execute]: 2.958e-05 [rewriter_before_opt_a]: 9.484e-05 [opt_a]: 0.00318816, [2] [Cycle 1]: 0.00237723, [45] [expand_dump_flag]: 2.78998e-06 [switch_simplify]: 4.735e-05 [loop_unroll]: 3.503e-05 [a_1]: 0.00083809 [with_stream_mark]: 1.59e-05 [recompute_prepare]: 1.057e-05 [updatestate_depend_eliminate]: 4.94e-06 [updatestate_assign_eliminate]: 4.37e-06 [updatestate_loads_eliminate]: 3.71999e-06 [parameter_eliminate]: 2.06e-06 [a_2]: 0.00010919 [accelerated_algorithm]: 8.62998e-06 [shard]: 1.64e-06 [meta_shard_fg_expand]: 1.97001e-06 [shard_inline]: 8.33999e-06 [merge_send_recv]: 9.63002e-06 [auto_parallel]: 6.76e-06 [parallel]: 1.805e-05 [flash_sp]: 8.17998e-06 [merge_comm]: 5.29998e-06 [allreduce_fusion]: 4.70999e-06 [matmul_add_comm_reduction]: 1.028e-05 [allreduce_slice_to_reducescatter]: 9.20001e-07 [virtual_shard_identity]: 9.53002e-06 [virtual_dataset]: 8.79e-06 [get_grad_eliminate_]: 7.95998e-06 [virtual_output]: 8.08001e-06 [merge_forward]: 4.48001e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 1.089e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.584e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.396e-05 [set_forward_comm_id_for_comm_node_pass]: 4.4e-06 [meta_fg_expand]: 3.63e-06 [flash_sp_send_recv_attached]: 2.43e-06 [receive_attached]: 2.02999e-06 [after_resolve]: 1.495e-05 [a_after_grad]: 1.31e-05 [renormalize]: 0.00073471 [add_forward_monad_depend]: 6.07001e-06 [auto_monad_grad]: 2.02001e-06 [auto_monad_eliminator]: 1.613e-05 [cse]: 3.645e-05 [a_3]: 6.074e-05 [Cycle 2]: 0.00080123, [45] [expand_dump_flag]: 1.04003e-06 [switch_simplify]: 9.52001e-06 [loop_unroll]: 8.07998e-06 [a_1]: 0.00020134 [with_stream_mark]: 1.098e-05 [recompute_prepare]: 8.13999e-06 [updatestate_depend_eliminate]: 3.76001e-06 [updatestate_assign_eliminate]: 3.04999e-06 [updatestate_loads_eliminate]: 3.21999e-06 [parameter_eliminate]: 9.70002e-07 [a_2]: 0.00010124 [accelerated_algorithm]: 7.96001e-06 [shard]: 1.15999e-06 [meta_shard_fg_expand]: 1.57001e-06 [shard_inline]: 7.8e-06 [merge_send_recv]: 6.08002e-06 [auto_parallel]: 6.14001e-06 [parallel]: 4.33999e-06 [flash_sp]: 3.69002e-06 [merge_comm]: 4.4e-06 [allreduce_fusion]: 3.97e-06 [matmul_add_comm_reduction]: 6.41e-06 [allreduce_slice_to_reducescatter]: 3.4002e-07 [virtual_shard_identity]: 1.197e-05 [virtual_dataset]: 8.05e-06 [get_grad_eliminate_]: 7.6e-06 [virtual_output]: 7.66999e-06 [merge_forward]: 3.73001e-06 [cell_reuse_recompute_pass]: 1.47999e-06 [offload_activation]: 7.28999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.454e-05 [merge_recompute_call_nodes]: 7.7e-07 [before_grad]: 1.226e-05 [set_forward_comm_id_for_comm_node_pass]: 3.97e-06 [meta_fg_expand]: 2.89001e-06 [flash_sp_send_recv_attached]: 9.70002e-07 [receive_attached]: 1.01002e-06 [after_resolve]: 1.432e-05 [a_after_grad]: 1.306e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.39998e-06 [auto_monad_grad]: 8.89995e-07 [auto_monad_eliminator]: 8.25e-06 [cse]: 1.854e-05 [a_3]: 4.923e-05 [py_interpret_to_execute_after_opt_a]: 9.39e-06 [slice_cell_reuse_recomputed_activation]: 2.13998e-06 [rewriter_after_opt_a]: 3.846e-05 [convert_after_rewriter]: 7.90998e-06 [order_py_execute_after_rewriter]: 6.07999e-06 [mutable_eliminate]: 0.0004778 [opt_b]: 0.0002639, [1] [Cycle 1]: 0.00025803, [7] [b_1]: 0.00017708 [b_2]: 9.86e-06 [updatestate_depend_eliminate]: 5.99e-06 [updatestate_assign_eliminate]: 3.16001e-06 [updatestate_loads_eliminate]: 3.43999e-06 [renormalize]: 4.30009e-07 [cse]: 2.241e-05 [optimize_parallel_all_gather_comm]: 1.746e-05 [overlap_param_gather]: 1.82001e-06 [cconv]: 2.394e-05 [loop_unroll]: 0.00042018 [opt_after_cconv]: 0.00011997, [1] [Cycle 1]: 0.00011442, [7] [c_1]: 4.259e-05 [parameter_eliminate]: 2.79001e-06 [updatestate_depend_eliminate]: 5.77999e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 2.84001e-06 [cse]: 2.265e-05 [renormalize]: 5.19998e-07 [remove_dup_value]: 1.572e-05 [tuple_transform]: 9.102e-05, [1] [Cycle 1]: 8.633e-05, [4] [d_1]: 5.762e-05 [none_parameter_eliminate]: 1.66e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 8.57e-06 [partial_unused_args_eliminate]: 1.76003e-06 [add_recomputation]: 5.257e-05 [cse_after_recomputation]: 2.532e-05, [1] [Cycle 1]: 2.089e-05, [1] [cse]: 1.541e-05 [environ_conv]: 6.36998e-06 [swap_dp_allreduce_reducescatter]: 6.09001e-06 [bias_add_comm_swap]: 2.71999e-06 [label_micro_interleaved_index]: 4.15e-06 [label_fine_grained_interleaved_index]: 2.63998e-06 [merge_cast_opt]: 1.23002e-06 [slice_recompute_activation]: 2.21e-06 [micro_interleaved_order_control]: 2.60002e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 1.00001e-06 [full_micro_interleaved_order_control]: 2.31998e-06 [reorder_send_recv_between_fp_bp]: 2.57001e-06 [comm_op_add_attrs]: 9.70002e-07 [add_comm_op_reuse_tag]: 9.20001e-07 [interleave_split_concat_branches]: 1.14998e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.05001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.30002e-06 [control_data_broadcast_order]: 1.502e-05 [grouped_pairwise_exchange_alltoall]: 1.45001e-06 [offloading_packed_experts]: 4.03999e-06 [overlap_recompute_and_grad_model_parallel]: 5.40999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32999e-06 [overlap_recompute_comm]: 2.11e-06 [overlap_grad_ring_attention]: 4.50001e-06 [overlap_grad_flash_sp]: 2.013e-05 [begin_end_overlap_inline]: 6.89994e-07 [split_matmul_comm_elemetwise]: 2.24001e-06 [split_layernorm_comm]: 1.59e-06 [handle_group_info]: 9.10019e-07 [symbol_engine_optimizer]: 8.334e-05, [1] [Cycle 1]: 7.916e-05, [6] [build]: 2.84999e-06 [elim_shapecalc]: 1.128e-05 [elim_not_effective]: 1.599e-05 [opt_reshape]: 8.76002e-06 [fold_const_symbol]: 1.251e-05 [renormalize]: 2.3999e-07 [detach_backward]: 2.12001e-06 [pipeline_parallel_scheduler]: 1.47001e-06 [auto_monad_reorder]: 1.993e-05 [get_jit_bprop_graph]: 1.37e-06 [rewriter_after_jit_bprop_graph]: 3.58999e-06 [opt_after_jit_grad]: 0.00048252 [validate]: 3.958e-05 Sums bootstrap : 0.000432s : 3.84% type_inference : 0.005629s : 50.04% event_method : 0.000020s : 0.17% auto_monad : 0.000063s : 0.56% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000035s : 0.31% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000030s : 0.26% optimize.rewriter_before_opt_a : 0.000095s : 0.84% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000057s : 0.51% optimize.opt_a.loop_unroll : 0.000043s : 0.38% optimize.opt_a.a_1 : 0.001039s : 9.24% optimize.opt_a.with_stream_mark : 0.000027s : 0.24% optimize.opt_a.recompute_prepare : 0.000019s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000210s : 1.87% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.15% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.14% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000013s : 0.11% optimize.opt_a.parallel : 0.000022s : 0.20% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.19% optimize.opt_a.virtual_dataset : 0.000017s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.14% optimize.opt_a.virtual_output : 0.000016s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.07% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.26% optimize.opt_a.a_after_grad : 0.000026s : 0.23% optimize.opt_a.renormalize : 0.000735s : 6.53% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.22% optimize.opt_a.cse : 0.000055s : 0.49% optimize.opt_a.a_3 : 0.000110s : 0.98% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000038s : 0.34% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000478s : 4.25% optimize.opt_b.b_1 : 0.000177s : 1.57% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.21% optimize.loop_unroll : 0.000420s : 3.74% optimize.opt_after_cconv.c_1 : 0.000043s : 0.38% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.20% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.14% optimize.tuple_transform.d_1 : 0.000058s : 0.51% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000053s : 0.47% optimize.cse_after_recomputation.cse : 0.000015s : 0.14% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000020s : 0.18% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000020s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000483s : 4.29% validate : 0.000040s : 0.35% Time group info: ------[substitution.] 0.000222 44 9.30% : 0.000021s : 3: substitution.cast_eliminate 1.01% : 0.000002s : 3: substitution.elim_not_effective 0.71% : 0.000002s : 3: substitution.fold_const_symbol 3.16% : 0.000007s : 6: substitution.graph_param_transform 67.25% : 0.000149s : 4: substitution.inline 2.08% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.68% : 0.000006s : 6: substitution.remove_not_recompute_node 2.58% : 0.000006s : 6: substitution.replace_old_param 8.53% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.69% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005572 2 87.03% : 0.004849s : 1: type_inference.infer 12.97% : 0.000723s : 1: type_inference.specialize ------[replace.] 0.000072 10 53.62% : 0.000039s : 4: replace.inline 46.38% : 0.000034s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000163 10 90.02% : 0.000147s : 4: match.inline 9.98% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000283 1908 0.94% : 0.000003s : 20: predicate.accumulaten_eliminater 0.69% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 12: predicate.addn_check_dump 1.00% : 0.000003s : 20: predicate.addn_zero_filter 0.89% : 0.000003s : 20: predicate.adjust_all_reduce_mul_add 2.03% : 0.000006s : 32: predicate.arithmetic_simplify 1.07% : 0.000003s : 20: predicate.cast_eliminate 0.60% : 0.000002s : 12: predicate.check_bprop_eliminate 0.53% : 0.000001s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.61% : 0.000002s : 12: predicate.depend_value_elim 1.02% : 0.000003s : 20: predicate.dict_get_item_const_eliminator 1.06% : 0.000003s : 20: predicate.dict_get_item_eliminator 0.92% : 0.000003s : 20: predicate.dict_set_item_eliminator 0.85% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 6: predicate.elim_not_effective 0.38% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 26: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.20% : 0.000003s : 26: predicate.environ_get_depend_swap 1.87% : 0.000005s : 38: predicate.environ_get_eliminate 1.14% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.47% : 0.000004s : 30: predicate.exchange_switch_depend_value 2.31% : 0.000007s : 30: predicate.float_depend_g_call 0.53% : 0.000001s : 12: predicate.float_environ_get_switch 0.78% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 6: predicate.fold_const_symbol 0.66% : 0.000002s : 12: predicate.get_grad_eliminate 0.21% : 0.000001s : 6: predicate.graph_param_transform 0.60% : 0.000002s : 12: predicate.incorporate_call 0.52% : 0.000001s : 12: predicate.incorporate_call_switch 6.10% : 0.000017s : 86: predicate.inline 0.75% : 0.000002s : 12: predicate.inline_without_move 0.36% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.72% : 0.000002s : 12: predicate.less_batch_normalization 1.91% : 0.000005s : 38: predicate.list_to_tuple_eliminator_ 2.75% : 0.000008s : 58: predicate.load_eliminater 0.76% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.03% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.68% : 0.000005s : 32: predicate.make_slice_get_slice_eliminator 0.56% : 0.000002s : 12: predicate.merge_addn 0.59% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.56% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.95% : 0.000003s : 20: predicate.minmaximum_grad 0.83% : 0.000002s : 6: predicate.mutable_eliminate 0.34% : 0.000001s : 6: predicate.opt_reshape 0.42% : 0.000001s : 6: predicate.parallel_virtual_node 1.80% : 0.000005s : 30: predicate.partial_defer_inline 1.81% : 0.000005s : 32: predicate.partial_eliminate 0.92% : 0.000003s : 20: predicate.print_const_string_wrapper 0.61% : 0.000002s : 12: predicate.reduce_all_const_elim 1.21% : 0.000003s : 20: predicate.reduce_eliminate 2.69% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 12: predicate.remove_not_recompute_node 1.44% : 0.000004s : 38: predicate.replace_applicator 0.51% : 0.000001s : 12: predicate.replace_old_param 0.25% : 0.000001s : 6: predicate.reset_defer_inline 0.96% : 0.000003s : 20: predicate.reshape_eliminate 0.62% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 6: predicate.row_tensor_eliminate 0.75% : 0.000002s : 12: predicate.same_eliminate 0.43% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 12: predicate.shard_identity_eliminate 0.73% : 0.000002s : 12: predicate.special_op_eliminate 0.67% : 0.000002s : 12: predicate.specialize_transform 0.77% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.53% : 0.000004s : 30: predicate.switch_defer_inline 2.12% : 0.000006s : 42: predicate.switch_layer_defer_inline 4.80% : 0.000014s : 90: predicate.switch_simplify 1.01% : 0.000003s : 20: predicate.tile_eliminate 0.95% : 0.000003s : 20: predicate.transpose_eliminate 1.54% : 0.000004s : 32: predicate.tuple_list_convert_item_index_to_positive 1.77% : 0.000005s : 32: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000004s : 32: predicate.tuple_list_get_item_depend_reorder 3.19% : 0.000009s : 50: predicate.tuple_list_get_item_eliminator 1.59% : 0.000004s : 32: predicate.tuple_list_get_set_item_eliminator 2.25% : 0.000006s : 44: predicate.tuple_list_set_item_eliminator 1.81% : 0.000005s : 38: predicate.tuple_to_list_eliminator_ 2.61% : 0.000007s : 58: predicate.updatestate_pure_node_eliminater 3.33% : 0.000009s : 70: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 6: predicate.value_based_eliminate 0.78% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.62% : 0.000002s : 12: predicate.virtual_output_eliminate 0.31% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000526 11 54.17% : 0.000285s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.83% : 0.000241s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026107 192 0.01% : 0.000003s : 1: ForceFp32Comm 11.63% : 0.003037s : 1: add_attr 11.60% : 0.003028s : 1: add_attr_with_inline 0.01% : 0.000003s : 1: add_comm_op_reuse_tag 0.22% : 0.000057s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.27% : 0.000069s : 1: auto_monad 0.09% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.76% : 0.000459s : 1: bootstrap 0.10% : 0.000027s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000028s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.10% : 0.000026s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.64% : 0.000428s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.86% : 0.000486s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 6.15% : 0.001605s : 78: opt.transform.opt_a 0.16% : 0.000041s : 1: opt.transform.opt_after_cconv 0.12% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.60% : 0.000156s : 28: opt.transform.opt_b 0.25% : 0.000064s : 2: opt.transform.opt_trans_graph 0.17% : 0.000045s : 4: opt.transform.symbol_engine_opt 12.22% : 0.003191s : 1: opt_a 0.47% : 0.000124s : 1: opt_after_cconv 1.88% : 0.000491s : 1: opt_after_jit_grad 1.02% : 0.000267s : 1: opt_b 20.26% : 0.005288s : 1: optimize 0.08% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000023s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000039s : 1: pre_auto_parallel 0.13% : 0.000034s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.66% : 0.000433s : 1: renormalize.infer 1.13% : 0.000294s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000042s : 1: rewriter_after_opt_a 0.38% : 0.000099s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000086s : 1: symbol_engine_optimizer 0.36% : 0.000094s : 1: tuple_transform 21.61% : 0.005642s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:14.235.074 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:14.235.317 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0167362, [21] [bootstrap]: 0.00042769 [type_inference]: 0.00582645 [event_method]: 2.046e-05 [auto_monad]: 6.442e-05 [graph_reusing]: 5.61003e-06 [inline]: 2.02999e-06 [add_attr]: 0.00298919, [1] [add_attr_with_inline]: 0.00298096, [1] [Cycle 1]: 6.999e-05, [2] [tag_attr]: 2.16e-05 [meta_addattr_fg_expand]: 6.87002e-06 [parallel-infer-symbol]: 2.96999e-06 [pre_auto_parallel]: 5.505e-05 [insert-virtual-dataset]: 2.41e-06 [parallel-infer-symbol-second]: 8.39995e-07 [dataset_repeat_opt]: 2.09e-06 [pipeline_split]: 1.74998e-06 [optimize]: 0.00620328, [53] [py_interpret_to_execute]: 3.199e-05 [rewriter_before_opt_a]: 9.987e-05 [opt_a]: 0.00370103, [2] [Cycle 1]: 0.00260815, [45] [expand_dump_flag]: 2.79999e-06 [switch_simplify]: 4.935e-05 [loop_unroll]: 3.905e-05 [a_1]: 0.00089807 [with_stream_mark]: 1.708e-05 [recompute_prepare]: 1.193e-05 [updatestate_depend_eliminate]: 5.64e-06 [updatestate_assign_eliminate]: 4.53001e-06 [updatestate_loads_eliminate]: 4.22e-06 [parameter_eliminate]: 1.97001e-06 [a_2]: 0.00015738 [accelerated_algorithm]: 1.016e-05 [shard]: 1.69e-06 [meta_shard_fg_expand]: 2.24001e-06 [shard_inline]: 9.76e-06 [merge_send_recv]: 1.012e-05 [auto_parallel]: 7.85998e-06 [parallel]: 1.866e-05 [flash_sp]: 8.35001e-06 [merge_comm]: 5.56e-06 [allreduce_fusion]: 5.65001e-06 [matmul_add_comm_reduction]: 1.135e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 1.167e-05 [virtual_dataset]: 1.018e-05 [get_grad_eliminate_]: 9.25001e-06 [virtual_output]: 9.92001e-06 [merge_forward]: 5.06002e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 1.195e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.062e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.572e-05 [set_forward_comm_id_for_comm_node_pass]: 5.46002e-06 [meta_fg_expand]: 3.98999e-06 [flash_sp_send_recv_attached]: 2.54001e-06 [receive_attached]: 2.27999e-06 [after_resolve]: 1.601e-05 [a_after_grad]: 1.539e-05 [renormalize]: 0.00063924 [add_forward_monad_depend]: 4.88001e-06 [auto_monad_grad]: 1.99999e-06 [auto_monad_eliminator]: 1.86e-05 [cse]: 4.931e-05 [a_3]: 8.439e-05 [Cycle 2]: 0.00108057, [45] [expand_dump_flag]: 1.22e-06 [switch_simplify]: 1.077e-05 [loop_unroll]: 1.034e-05 [a_1]: 0.00024668 [with_stream_mark]: 1.26e-05 [recompute_prepare]: 9.94999e-06 [updatestate_depend_eliminate]: 4.62e-06 [updatestate_assign_eliminate]: 3.73999e-06 [updatestate_loads_eliminate]: 3.81999e-06 [parameter_eliminate]: 9.80013e-07 [a_2]: 0.00014652 [accelerated_algorithm]: 9.31e-06 [shard]: 1.06997e-06 [meta_shard_fg_expand]: 1.84e-06 [shard_inline]: 9.42001e-06 [merge_send_recv]: 6.56999e-06 [auto_parallel]: 7.11001e-06 [parallel]: 4.52998e-06 [flash_sp]: 3.56999e-06 [merge_comm]: 5.18002e-06 [allreduce_fusion]: 4.52998e-06 [matmul_add_comm_reduction]: 7.71001e-06 [allreduce_slice_to_reducescatter]: 4.39992e-07 [virtual_shard_identity]: 1.009e-05 [virtual_dataset]: 9.95002e-06 [get_grad_eliminate_]: 9.20999e-06 [virtual_output]: 8.89e-06 [merge_forward]: 4.25e-06 [cell_reuse_recompute_pass]: 1.51002e-06 [offload_activation]: 8.64e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.891e-05 [merge_recompute_call_nodes]: 9.39996e-07 [before_grad]: 1.472e-05 [set_forward_comm_id_for_comm_node_pass]: 5.05999e-06 [meta_fg_expand]: 3.38e-06 [flash_sp_send_recv_attached]: 9.39996e-07 [receive_attached]: 9.89996e-07 [after_resolve]: 1.456e-05 [a_after_grad]: 1.521e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.22999e-06 [auto_monad_grad]: 9.5999e-07 [auto_monad_eliminator]: 9.91e-06 [cse]: 2.39e-05 [a_3]: 7.379e-05 [py_interpret_to_execute_after_opt_a]: 1.332e-05 [slice_cell_reuse_recomputed_activation]: 5.07e-06 [rewriter_after_opt_a]: 6.558e-05 [convert_after_rewriter]: 1.245e-05 [order_py_execute_after_rewriter]: 9.91998e-06 [mutable_eliminate]: 0.00048025 [opt_b]: 0.00037191, [1] [Cycle 1]: 0.00036278, [7] [b_1]: 0.00024988 [b_2]: 1.157e-05 [updatestate_depend_eliminate]: 6.48e-06 [updatestate_assign_eliminate]: 3.81999e-06 [updatestate_loads_eliminate]: 4.15999e-06 [renormalize]: 5.59987e-07 [cse]: 2.866e-05 [optimize_parallel_all_gather_comm]: 2.173e-05 [overlap_param_gather]: 4.70999e-06 [cconv]: 2.673e-05 [loop_unroll]: 0.00043198 [opt_after_cconv]: 0.00015742, [1] [Cycle 1]: 0.00014864, [7] [c_1]: 4.823e-05 [parameter_eliminate]: 2.74999e-06 [updatestate_depend_eliminate]: 6.79001e-06 [updatestate_assign_eliminate]: 4.1e-06 [updatestate_loads_eliminate]: 3.56999e-06 [cse]: 2.776e-05 [renormalize]: 3.39991e-07 [remove_dup_value]: 3.907e-05 [tuple_transform]: 0.00011974, [1] [Cycle 1]: 0.00011224, [4] [d_1]: 6.776e-05 [none_parameter_eliminate]: 1.71e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 1.042e-05 [partial_unused_args_eliminate]: 4.38999e-06 [add_recomputation]: 6.08e-05 [cse_after_recomputation]: 3.416e-05, [1] [Cycle 1]: 2.699e-05, [1] [cse]: 1.785e-05 [environ_conv]: 9.55001e-06 [swap_dp_allreduce_reducescatter]: 9.70002e-06 [bias_add_comm_swap]: 5.59e-06 [label_micro_interleaved_index]: 6.76e-06 [label_fine_grained_interleaved_index]: 4.87998e-06 [merge_cast_opt]: 3.88001e-06 [slice_recompute_activation]: 5.09998e-06 [micro_interleaved_order_control]: 4.68999e-06 [assign_add_opt]: 3.5e-06 [ForceFp32Comm]: 3.14001e-06 [remove_cast_before_assign_add]: 3.35e-06 [full_micro_interleaved_order_control]: 4.57998e-06 [reorder_send_recv_between_fp_bp]: 5.17e-06 [comm_op_add_attrs]: 3.81999e-06 [add_comm_op_reuse_tag]: 3.18e-06 [interleave_split_concat_branches]: 3.49001e-06 [interleave_parallel_branches]: 3.45e-06 [overlap_opt_shard_in_pipeline]: 3.63e-06 [overlap_opt_shard_grad_in_pipeline]: 4.18999e-06 [control_data_broadcast_order]: 1.895e-05 [grouped_pairwise_exchange_alltoall]: 3.76999e-06 [offloading_packed_experts]: 7.71999e-06 [overlap_recompute_and_grad_model_parallel]: 8.35999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.5e-06 [overlap_recompute_allgather_and_fa_grad]: 3.7e-06 [overlap_recompute_comm]: 4.57e-06 [overlap_grad_ring_attention]: 7.43999e-06 [overlap_grad_flash_sp]: 2.53e-05 [begin_end_overlap_inline]: 2.94999e-06 [split_matmul_comm_elemetwise]: 4.72e-06 [split_layernorm_comm]: 4.35999e-06 [handle_group_info]: 3.43e-06 [symbol_engine_optimizer]: 0.00011209, [1] [Cycle 1]: 0.00010507, [6] [build]: 3.48e-06 [elim_shapecalc]: 1.321e-05 [elim_not_effective]: 1.799e-05 [opt_reshape]: 1.006e-05 [fold_const_symbol]: 1.492e-05 [renormalize]: 2.89991e-07 [detach_backward]: 3.19001e-06 [pipeline_parallel_scheduler]: 1.72001e-06 [auto_monad_reorder]: 2.327e-05 [get_jit_bprop_graph]: 1.12999e-06 [rewriter_after_jit_bprop_graph]: 4.33001e-06 [opt_after_jit_grad]: 0.00048729 [validate]: 4.022e-05 Sums bootstrap : 0.000428s : 3.55% type_inference : 0.005826s : 48.43% event_method : 0.000020s : 0.17% auto_monad : 0.000064s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000055s : 0.46% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000032s : 0.27% optimize.rewriter_before_opt_a : 0.000100s : 0.83% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000060s : 0.50% optimize.opt_a.loop_unroll : 0.000049s : 0.41% optimize.opt_a.a_1 : 0.001145s : 9.51% optimize.opt_a.with_stream_mark : 0.000030s : 0.25% optimize.opt_a.recompute_prepare : 0.000022s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000304s : 2.53% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.16% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000019s : 0.16% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000023s : 0.19% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000011s : 0.09% optimize.opt_a.allreduce_fusion : 0.000010s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.18% optimize.opt_a.virtual_dataset : 0.000020s : 0.17% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.15% optimize.opt_a.virtual_output : 0.000019s : 0.16% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000021s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000040s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000030s : 0.25% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000031s : 0.25% optimize.opt_a.a_after_grad : 0.000031s : 0.25% optimize.opt_a.renormalize : 0.000639s : 5.31% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.05% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.24% optimize.opt_a.cse : 0.000073s : 0.61% optimize.opt_a.a_3 : 0.000158s : 1.31% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000066s : 0.55% optimize.convert_after_rewriter : 0.000012s : 0.10% optimize.order_py_execute_after_rewriter : 0.000010s : 0.08% optimize.mutable_eliminate : 0.000480s : 3.99% optimize.opt_b.b_1 : 0.000250s : 2.08% optimize.opt_b.b_2 : 0.000012s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000029s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000027s : 0.22% optimize.loop_unroll : 0.000432s : 3.59% optimize.opt_after_cconv.c_1 : 0.000048s : 0.40% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000028s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000039s : 0.32% optimize.tuple_transform.d_1 : 0.000068s : 0.56% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.09% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000061s : 0.51% optimize.cse_after_recomputation.cse : 0.000018s : 0.15% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.08% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000003s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000019s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000008s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000025s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000023s : 0.19% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000487s : 4.05% validate : 0.000040s : 0.33% Time group info: ------[substitution.] 0.000226 54 12.02% : 0.000027s : 6: substitution.cast_eliminate 1.07% : 0.000002s : 4: substitution.elim_not_effective 0.90% : 0.000002s : 4: substitution.fold_const_symbol 3.45% : 0.000008s : 7: substitution.graph_param_transform 64.03% : 0.000145s : 4: substitution.inline 2.09% : 0.000005s : 8: substitution.j_node_and_user_rematch 3.28% : 0.000007s : 8: substitution.remove_not_recompute_node 2.55% : 0.000006s : 6: substitution.replace_old_param 8.29% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.31% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005778 2 87.07% : 0.005030s : 1: type_inference.infer 12.93% : 0.000747s : 1: type_inference.specialize ------[replace.] 0.000071 10 52.87% : 0.000038s : 4: replace.inline 47.13% : 0.000034s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000158 10 89.96% : 0.000142s : 4: match.inline 10.04% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000324 2134 0.97% : 0.000003s : 22: predicate.accumulaten_eliminater 0.66% : 0.000002s : 7: predicate.ad_related_special_op_eliminate 0.57% : 0.000002s : 14: predicate.addn_check_dump 1.03% : 0.000003s : 22: predicate.addn_zero_filter 0.89% : 0.000003s : 22: predicate.adjust_all_reduce_mul_add 2.01% : 0.000006s : 36: predicate.arithmetic_simplify 1.11% : 0.000004s : 22: predicate.cast_eliminate 0.61% : 0.000002s : 14: predicate.check_bprop_eliminate 0.61% : 0.000002s : 14: predicate.compare_switch_simplify 0.21% : 0.000001s : 7: predicate.const_output_eliminate 0.68% : 0.000002s : 14: predicate.depend_value_elim 1.02% : 0.000003s : 22: predicate.dict_get_item_const_eliminator 1.11% : 0.000004s : 22: predicate.dict_get_item_eliminator 0.94% : 0.000003s : 22: predicate.dict_set_item_eliminator 0.81% : 0.000003s : 14: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 7: predicate.elim_not_effective 0.36% : 0.000001s : 7: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000004s : 29: predicate.environ_add_const_eliminate 1.17% : 0.000004s : 29: predicate.environ_get_add_eliminate 1.15% : 0.000004s : 29: predicate.environ_get_depend_swap 1.82% : 0.000006s : 43: predicate.environ_get_eliminate 1.22% : 0.000004s : 29: predicate.environ_get_set_eliminate 1.38% : 0.000004s : 32: predicate.exchange_switch_depend_value 2.18% : 0.000007s : 32: predicate.float_depend_g_call 0.58% : 0.000002s : 14: predicate.float_environ_get_switch 0.85% : 0.000003s : 21: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 7: predicate.fold_const_symbol 0.67% : 0.000002s : 14: predicate.get_grad_eliminate 0.23% : 0.000001s : 7: predicate.graph_param_transform 0.65% : 0.000002s : 14: predicate.incorporate_call 0.55% : 0.000002s : 14: predicate.incorporate_call_switch 6.15% : 0.000020s : 96: predicate.inline 0.88% : 0.000003s : 14: predicate.inline_without_move 0.38% : 0.000001s : 14: predicate.j_node_and_user_rematch 0.74% : 0.000002s : 14: predicate.less_batch_normalization 1.95% : 0.000006s : 42: predicate.list_to_tuple_eliminator_ 2.71% : 0.000009s : 64: predicate.load_eliminater 0.71% : 0.000002s : 7: predicate.loop_unroll_after_grad 2.04% : 0.000007s : 44: predicate.loop_unroll_before_grad 1.64% : 0.000005s : 36: predicate.make_slice_get_slice_eliminator 0.63% : 0.000002s : 14: predicate.merge_addn 0.58% : 0.000002s : 14: predicate.micro_step_allgather_replace 0.60% : 0.000002s : 14: predicate.mini_step_allgather_replace 0.87% : 0.000003s : 22: predicate.minmaximum_grad 0.72% : 0.000002s : 7: predicate.mutable_eliminate 0.35% : 0.000001s : 7: predicate.opt_reshape 0.34% : 0.000001s : 7: predicate.parallel_virtual_node 1.67% : 0.000005s : 32: predicate.partial_defer_inline 1.79% : 0.000006s : 35: predicate.partial_eliminate 0.93% : 0.000003s : 22: predicate.print_const_string_wrapper 0.59% : 0.000002s : 14: predicate.reduce_all_const_elim 1.14% : 0.000004s : 22: predicate.reduce_eliminate 2.66% : 0.000009s : 64: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 14: predicate.remove_not_recompute_node 1.37% : 0.000004s : 42: predicate.replace_applicator 0.52% : 0.000002s : 14: predicate.replace_old_param 0.23% : 0.000001s : 7: predicate.reset_defer_inline 0.97% : 0.000003s : 22: predicate.reshape_eliminate 0.62% : 0.000002s : 14: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 7: predicate.row_tensor_eliminate 0.73% : 0.000002s : 14: predicate.same_eliminate 0.45% : 0.000001s : 14: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 14: predicate.shard_identity_eliminate 0.66% : 0.000002s : 14: predicate.special_op_eliminate 0.76% : 0.000002s : 14: predicate.specialize_transform 0.81% : 0.000003s : 14: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 14: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 7: predicate.switch_call_monad_eliminater 1.46% : 0.000005s : 32: predicate.switch_defer_inline 2.05% : 0.000007s : 46: predicate.switch_layer_defer_inline 4.75% : 0.000015s : 97: predicate.switch_simplify 0.98% : 0.000003s : 22: predicate.tile_eliminate 0.96% : 0.000003s : 22: predicate.transpose_eliminate 1.52% : 0.000005s : 36: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000005s : 36: predicate.tuple_list_get_item_const_eliminator 1.57% : 0.000005s : 36: predicate.tuple_list_get_item_depend_reorder 3.31% : 0.000011s : 56: predicate.tuple_list_get_item_eliminator 1.58% : 0.000005s : 36: predicate.tuple_list_get_set_item_eliminator 2.35% : 0.000008s : 50: predicate.tuple_list_set_item_eliminator 1.87% : 0.000006s : 42: predicate.tuple_to_list_eliminator_ 2.61% : 0.000008s : 64: predicate.updatestate_pure_node_eliminater 3.29% : 0.000011s : 78: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 7: predicate.value_based_eliminate 0.65% : 0.000002s : 14: predicate.virtual_dataset_eliminate 0.74% : 0.000002s : 14: predicate.virtual_output_eliminate 0.30% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 7: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000520 11 54.38% : 0.000283s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.62% : 0.000237s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028684 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.45% : 0.002997s : 1: add_attr 10.40% : 0.002984s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000064s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000073s : 1: auto_monad 0.11% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.64% : 0.000472s : 1: bootstrap 0.10% : 0.000030s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.08% : 0.000022s : 1: control_data_broadcast_order 0.05% : 0.000016s : 1: convert_after_rewriter 0.13% : 0.000037s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000018s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.10% : 0.000030s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.53% : 0.000438s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.69% : 0.000486s : 1: mutable_eliminate 0.04% : 0.000011s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 6.32% : 0.001812s : 78: opt.transform.opt_a 0.16% : 0.000047s : 1: opt.transform.opt_after_cconv 0.12% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.66% : 0.000188s : 28: opt.transform.opt_b 0.26% : 0.000076s : 2: opt.transform.opt_trans_graph 0.18% : 0.000053s : 4: opt.transform.symbol_engine_opt 12.91% : 0.003704s : 1: opt_a 0.56% : 0.000161s : 1: opt_after_cconv 1.73% : 0.000497s : 1: opt_after_jit_grad 1.31% : 0.000375s : 1: opt_b 22.80% : 0.006540s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.10% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.03% : 0.000009s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.22% : 0.000064s : 1: pre_auto_parallel 0.13% : 0.000036s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.15% : 0.000043s : 1: remove_dup_value 1.18% : 0.000338s : 1: renormalize.infer 1.02% : 0.000294s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.24% : 0.000069s : 1: rewriter_after_opt_a 0.36% : 0.000104s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000115s : 1: symbol_engine_optimizer 0.43% : 0.000123s : 1: tuple_transform 20.42% : 0.005857s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:14.427.503 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0156911, [21] [bootstrap]: 0.00044469 [type_inference]: 0.00573801 [event_method]: 2.034e-05 [auto_monad]: 6.73e-05 [graph_reusing]: 5.54e-06 [inline]: 2.57001e-06 [add_attr]: 0.0030097, [1] [add_attr_with_inline]: 0.00300177, [1] [Cycle 1]: 5.909e-05, [2] [tag_attr]: 2.156e-05 [meta_addattr_fg_expand]: 6.89999e-06 [parallel-infer-symbol]: 3.39001e-06 [pre_auto_parallel]: 3.436e-05 [insert-virtual-dataset]: 2.48e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 1.79998e-06 [pipeline_split]: 1.57999e-06 [optimize]: 0.00567866, [53] [py_interpret_to_execute]: 2.74e-05 [rewriter_before_opt_a]: 9.694e-05 [opt_a]: 0.00343812, [2] [Cycle 1]: 0.00250191, [45] [expand_dump_flag]: 3.09001e-06 [switch_simplify]: 4.998e-05 [loop_unroll]: 3.614e-05 [a_1]: 0.00089124 [with_stream_mark]: 1.587e-05 [recompute_prepare]: 1.159e-05 [updatestate_depend_eliminate]: 4.99e-06 [updatestate_assign_eliminate]: 4.63001e-06 [updatestate_loads_eliminate]: 3.97e-06 [parameter_eliminate]: 1.89e-06 [a_2]: 0.00013137 [accelerated_algorithm]: 1.049e-05 [shard]: 2.13002e-06 [meta_shard_fg_expand]: 2.15002e-06 [shard_inline]: 9.89001e-06 [merge_send_recv]: 1.025e-05 [auto_parallel]: 7.23e-06 [parallel]: 1.791e-05 [flash_sp]: 7.87e-06 [merge_comm]: 5.47001e-06 [allreduce_fusion]: 5.20001e-06 [matmul_add_comm_reduction]: 1.184e-05 [allreduce_slice_to_reducescatter]: 5.90022e-07 [virtual_shard_identity]: 1.107e-05 [virtual_dataset]: 9.49e-06 [get_grad_eliminate_]: 9.15999e-06 [virtual_output]: 9.32999e-06 [merge_forward]: 4.89e-06 [cell_reuse_recompute_pass]: 1.31002e-06 [offload_activation]: 1.142e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.777e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.583e-05 [set_forward_comm_id_for_comm_node_pass]: 5.22e-06 [meta_fg_expand]: 3.98999e-06 [flash_sp_send_recv_attached]: 2.54001e-06 [receive_attached]: 2.01e-06 [after_resolve]: 1.58e-05 [a_after_grad]: 1.532e-05 [renormalize]: 0.00073266 [add_forward_monad_depend]: 4.95999e-06 [auto_monad_grad]: 1.99e-06 [auto_monad_eliminator]: 1.831e-05 [cse]: 4.988e-05 [a_3]: 6.88e-05 [Cycle 2]: 0.00092633, [45] [expand_dump_flag]: 1.07998e-06 [switch_simplify]: 1.09e-05 [loop_unroll]: 9.99001e-06 [a_1]: 0.00025947 [with_stream_mark]: 1.335e-05 [recompute_prepare]: 9.71998e-06 [updatestate_depend_eliminate]: 4.75999e-06 [updatestate_assign_eliminate]: 3.69002e-06 [updatestate_loads_eliminate]: 3.63e-06 [parameter_eliminate]: 9.89996e-07 [a_2]: 0.00011893 [accelerated_algorithm]: 9.27001e-06 [shard]: 1.55999e-06 [meta_shard_fg_expand]: 1.72999e-06 [shard_inline]: 9.20001e-06 [merge_send_recv]: 6.78e-06 [auto_parallel]: 6.83998e-06 [parallel]: 4.72e-06 [flash_sp]: 3.46999e-06 [merge_comm]: 4.92e-06 [allreduce_fusion]: 4.90999e-06 [matmul_add_comm_reduction]: 7.11001e-06 [allreduce_slice_to_reducescatter]: 3.30008e-07 [virtual_shard_identity]: 9.67001e-06 [virtual_dataset]: 9.07001e-06 [get_grad_eliminate_]: 8.92e-06 [virtual_output]: 1.093e-05 [merge_forward]: 3.93001e-06 [cell_reuse_recompute_pass]: 1.40999e-06 [offload_activation]: 8.95001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.825e-05 [merge_recompute_call_nodes]: 9.80013e-07 [before_grad]: 1.464e-05 [set_forward_comm_id_for_comm_node_pass]: 5.29e-06 [meta_fg_expand]: 3.46001e-06 [flash_sp_send_recv_attached]: 8.70001e-07 [receive_attached]: 1.02e-06 [after_resolve]: 1.433e-05 [a_after_grad]: 1.439e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.32999e-06 [auto_monad_grad]: 1.42999e-06 [auto_monad_eliminator]: 1.073e-05 [cse]: 2.675e-05 [a_3]: 5.934e-05 [py_interpret_to_execute_after_opt_a]: 1.129e-05 [slice_cell_reuse_recomputed_activation]: 2.24001e-06 [rewriter_after_opt_a]: 4.494e-05 [convert_after_rewriter]: 8.76997e-06 [order_py_execute_after_rewriter]: 6.47001e-06 [mutable_eliminate]: 0.00048112 [opt_b]: 0.00030803, [1] [Cycle 1]: 0.00030191, [7] [b_1]: 0.00020838 [b_2]: 1.196e-05 [updatestate_depend_eliminate]: 7.08e-06 [updatestate_assign_eliminate]: 3.89002e-06 [updatestate_loads_eliminate]: 4e-06 [renormalize]: 5.10016e-07 [cse]: 2.991e-05 [optimize_parallel_all_gather_comm]: 2.1e-05 [overlap_param_gather]: 1.94e-06 [cconv]: 2.489e-05 [loop_unroll]: 0.00042292 [opt_after_cconv]: 0.00013263, [1] [Cycle 1]: 0.00012719, [7] [c_1]: 4.757e-05 [parameter_eliminate]: 2.79999e-06 [updatestate_depend_eliminate]: 6.91001e-06 [updatestate_assign_eliminate]: 3.81999e-06 [updatestate_loads_eliminate]: 3.55e-06 [cse]: 2.751e-05 [renormalize]: 3.70026e-07 [remove_dup_value]: 3.673e-05 [tuple_transform]: 0.00010284, [1] [Cycle 1]: 9.841e-05, [4] [d_1]: 6.683e-05 [none_parameter_eliminate]: 1.71e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 1.05e-05 [partial_unused_args_eliminate]: 1.86e-06 [add_recomputation]: 6.08e-05 [cse_after_recomputation]: 2.815e-05, [1] [Cycle 1]: 2.373e-05, [1] [cse]: 1.823e-05 [environ_conv]: 5.91e-06 [swap_dp_allreduce_reducescatter]: 6.89999e-06 [bias_add_comm_swap]: 2.88e-06 [label_micro_interleaved_index]: 4.13999e-06 [label_fine_grained_interleaved_index]: 2.68e-06 [merge_cast_opt]: 1.25999e-06 [slice_recompute_activation]: 2.24999e-06 [micro_interleaved_order_control]: 2.71999e-06 [assign_add_opt]: 1.34e-06 [ForceFp32Comm]: 9.89996e-07 [remove_cast_before_assign_add]: 1.19e-06 [full_micro_interleaved_order_control]: 2.20002e-06 [reorder_send_recv_between_fp_bp]: 2.54001e-06 [comm_op_add_attrs]: 1.30999e-06 [add_comm_op_reuse_tag]: 1.18001e-06 [interleave_split_concat_branches]: 1.42e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.05001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.27001e-06 [control_data_broadcast_order]: 1.74e-05 [grouped_pairwise_exchange_alltoall]: 1.39e-06 [offloading_packed_experts]: 4.25999e-06 [overlap_recompute_and_grad_model_parallel]: 5.66e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.67001e-06 [overlap_grad_ring_attention]: 4.82e-06 [overlap_grad_flash_sp]: 2.307e-05 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 1.91e-06 [split_layernorm_comm]: 1.62999e-06 [handle_group_info]: 1.14e-06 [symbol_engine_optimizer]: 9.045e-05, [1] [Cycle 1]: 8.607e-05, [6] [build]: 2.94001e-06 [elim_shapecalc]: 1.335e-05 [elim_not_effective]: 1.733e-05 [opt_reshape]: 9.82999e-06 [fold_const_symbol]: 1.495e-05 [renormalize]: 1.80007e-07 [detach_backward]: 2.27999e-06 [pipeline_parallel_scheduler]: 1.45999e-06 [auto_monad_reorder]: 2.176e-05 [get_jit_bprop_graph]: 1.32999e-06 [rewriter_after_jit_bprop_graph]: 3.71001e-06 [opt_after_jit_grad]: 0.00046824 [validate]: 4.279e-05 Sums bootstrap : 0.000445s : 3.78% type_inference : 0.005738s : 48.83% event_method : 0.000020s : 0.17% auto_monad : 0.000067s : 0.57% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000027s : 0.23% optimize.rewriter_before_opt_a : 0.000097s : 0.82% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000061s : 0.52% optimize.opt_a.loop_unroll : 0.000046s : 0.39% optimize.opt_a.a_1 : 0.001151s : 9.79% optimize.opt_a.with_stream_mark : 0.000029s : 0.25% optimize.opt_a.recompute_prepare : 0.000021s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000250s : 2.13% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.17% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000019s : 0.16% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000023s : 0.19% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.09% optimize.opt_a.allreduce_fusion : 0.000010s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.18% optimize.opt_a.virtual_dataset : 0.000019s : 0.16% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.15% optimize.opt_a.virtual_output : 0.000020s : 0.17% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000030s : 0.26% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000030s : 0.26% optimize.opt_a.a_after_grad : 0.000030s : 0.25% optimize.opt_a.renormalize : 0.000733s : 6.24% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.05% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.25% optimize.opt_a.cse : 0.000077s : 0.65% optimize.opt_a.a_3 : 0.000128s : 1.09% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000045s : 0.38% optimize.convert_after_rewriter : 0.000009s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.06% optimize.mutable_eliminate : 0.000481s : 4.09% optimize.opt_b.b_1 : 0.000208s : 1.77% optimize.opt_b.b_2 : 0.000012s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.18% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000025s : 0.21% optimize.loop_unroll : 0.000423s : 3.60% optimize.opt_after_cconv.c_1 : 0.000048s : 0.40% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000028s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000037s : 0.31% optimize.tuple_transform.d_1 : 0.000067s : 0.57% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.09% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000061s : 0.52% optimize.cse_after_recomputation.cse : 0.000018s : 0.16% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000017s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000023s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000022s : 0.19% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000468s : 3.98% validate : 0.000043s : 0.36% Time group info: ------[substitution.] 0.000242 54 11.46% : 0.000028s : 6: substitution.cast_eliminate 1.03% : 0.000002s : 4: substitution.elim_not_effective 0.83% : 0.000002s : 4: substitution.fold_const_symbol 3.15% : 0.000008s : 7: substitution.graph_param_transform 66.01% : 0.000159s : 4: substitution.inline 2.17% : 0.000005s : 8: substitution.j_node_and_user_rematch 3.17% : 0.000008s : 8: substitution.remove_not_recompute_node 2.17% : 0.000005s : 6: substitution.replace_old_param 7.65% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.38% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005679 2 86.75% : 0.004927s : 1: type_inference.infer 13.25% : 0.000752s : 1: type_inference.specialize ------[replace.] 0.000071 10 52.17% : 0.000037s : 4: replace.inline 47.83% : 0.000034s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000173 10 90.91% : 0.000157s : 4: match.inline 9.09% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000317 2134 0.92% : 0.000003s : 22: predicate.accumulaten_eliminater 0.55% : 0.000002s : 7: predicate.ad_related_special_op_eliminate 0.61% : 0.000002s : 14: predicate.addn_check_dump 0.96% : 0.000003s : 22: predicate.addn_zero_filter 0.88% : 0.000003s : 22: predicate.adjust_all_reduce_mul_add 1.96% : 0.000006s : 36: predicate.arithmetic_simplify 1.11% : 0.000004s : 22: predicate.cast_eliminate 0.61% : 0.000002s : 14: predicate.check_bprop_eliminate 0.60% : 0.000002s : 14: predicate.compare_switch_simplify 0.21% : 0.000001s : 7: predicate.const_output_eliminate 0.62% : 0.000002s : 14: predicate.depend_value_elim 1.00% : 0.000003s : 22: predicate.dict_get_item_const_eliminator 1.05% : 0.000003s : 22: predicate.dict_get_item_eliminator 0.91% : 0.000003s : 22: predicate.dict_set_item_eliminator 0.87% : 0.000003s : 14: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 7: predicate.elim_not_effective 0.40% : 0.000001s : 7: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000004s : 29: predicate.environ_add_const_eliminate 1.20% : 0.000004s : 29: predicate.environ_get_add_eliminate 1.20% : 0.000004s : 29: predicate.environ_get_depend_swap 1.84% : 0.000006s : 43: predicate.environ_get_eliminate 1.16% : 0.000004s : 29: predicate.environ_get_set_eliminate 1.44% : 0.000005s : 32: predicate.exchange_switch_depend_value 2.25% : 0.000007s : 32: predicate.float_depend_g_call 0.57% : 0.000002s : 14: predicate.float_environ_get_switch 0.85% : 0.000003s : 21: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 7: predicate.fold_const_symbol 0.68% : 0.000002s : 14: predicate.get_grad_eliminate 0.21% : 0.000001s : 7: predicate.graph_param_transform 0.66% : 0.000002s : 14: predicate.incorporate_call 0.56% : 0.000002s : 14: predicate.incorporate_call_switch 6.18% : 0.000020s : 96: predicate.inline 0.85% : 0.000003s : 14: predicate.inline_without_move 0.35% : 0.000001s : 14: predicate.j_node_and_user_rematch 0.81% : 0.000003s : 14: predicate.less_batch_normalization 1.85% : 0.000006s : 42: predicate.list_to_tuple_eliminator_ 2.66% : 0.000008s : 64: predicate.load_eliminater 0.73% : 0.000002s : 7: predicate.loop_unroll_after_grad 1.90% : 0.000006s : 44: predicate.loop_unroll_before_grad 1.64% : 0.000005s : 36: predicate.make_slice_get_slice_eliminator 0.61% : 0.000002s : 14: predicate.merge_addn 0.61% : 0.000002s : 14: predicate.micro_step_allgather_replace 0.66% : 0.000002s : 14: predicate.mini_step_allgather_replace 0.91% : 0.000003s : 22: predicate.minmaximum_grad 0.80% : 0.000003s : 7: predicate.mutable_eliminate 0.35% : 0.000001s : 7: predicate.opt_reshape 0.38% : 0.000001s : 7: predicate.parallel_virtual_node 1.65% : 0.000005s : 32: predicate.partial_defer_inline 1.85% : 0.000006s : 35: predicate.partial_eliminate 0.91% : 0.000003s : 22: predicate.print_const_string_wrapper 0.62% : 0.000002s : 14: predicate.reduce_all_const_elim 1.25% : 0.000004s : 22: predicate.reduce_eliminate 2.72% : 0.000009s : 64: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 14: predicate.remove_not_recompute_node 1.42% : 0.000005s : 42: predicate.replace_applicator 0.51% : 0.000002s : 14: predicate.replace_old_param 0.27% : 0.000001s : 7: predicate.reset_defer_inline 0.98% : 0.000003s : 22: predicate.reshape_eliminate 0.60% : 0.000002s : 14: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 7: predicate.row_tensor_eliminate 0.72% : 0.000002s : 14: predicate.same_eliminate 0.45% : 0.000001s : 14: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 14: predicate.shard_identity_eliminate 0.65% : 0.000002s : 14: predicate.special_op_eliminate 0.78% : 0.000002s : 14: predicate.specialize_transform 0.79% : 0.000003s : 14: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 14: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 7: predicate.switch_call_monad_eliminater 1.48% : 0.000005s : 32: predicate.switch_defer_inline 2.04% : 0.000006s : 46: predicate.switch_layer_defer_inline 4.70% : 0.000015s : 97: predicate.switch_simplify 1.02% : 0.000003s : 22: predicate.tile_eliminate 1.06% : 0.000003s : 22: predicate.transpose_eliminate 1.58% : 0.000005s : 36: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000005s : 36: predicate.tuple_list_get_item_const_eliminator 1.57% : 0.000005s : 36: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000010s : 56: predicate.tuple_list_get_item_eliminator 1.58% : 0.000005s : 36: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000007s : 50: predicate.tuple_list_set_item_eliminator 1.80% : 0.000006s : 42: predicate.tuple_to_list_eliminator_ 2.63% : 0.000008s : 64: predicate.updatestate_pure_node_eliminater 3.32% : 0.000011s : 78: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 7: predicate.value_based_eliminate 0.66% : 0.000002s : 14: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 14: predicate.virtual_output_eliminate 0.32% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 7: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000530 11 53.67% : 0.000284s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.33% : 0.000245s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027227 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.07% : 0.003014s : 1: add_attr 11.04% : 0.003006s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000065s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.27% : 0.000072s : 1: auto_monad 0.09% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.73% : 0.000470s : 1: bootstrap 0.10% : 0.000028s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.08% : 0.000021s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.11% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.09% : 0.000026s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.58% : 0.000431s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 1.80% : 0.000490s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 6.65% : 0.001810s : 78: opt.transform.opt_a 0.17% : 0.000046s : 1: opt.transform.opt_after_cconv 0.13% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.69% : 0.000189s : 28: opt.transform.opt_b 0.27% : 0.000074s : 2: opt.transform.opt_trans_graph 0.19% : 0.000052s : 4: opt.transform.symbol_engine_opt 12.64% : 0.003441s : 1: opt_a 0.50% : 0.000136s : 1: opt_after_cconv 1.75% : 0.000477s : 1: opt_after_jit_grad 1.14% : 0.000312s : 1: opt_b 20.87% : 0.005683s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000010s : 1: order_py_execute_after_rewriter 0.10% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000038s : 1: pre_auto_parallel 0.12% : 0.000032s : 1: py_interpret_to_execute 0.05% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.15% : 0.000041s : 1: remove_dup_value 1.55% : 0.000421s : 1: renormalize.infer 1.12% : 0.000304s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000049s : 1: rewriter_after_opt_a 0.37% : 0.000101s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000093s : 1: symbol_engine_optimizer 0.39% : 0.000106s : 1: tuple_transform 21.13% : 0.005752s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:14.614.844 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:14.615.100 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0161757, [21] [bootstrap]: 0.00044287 [type_inference]: 0.00567945 [event_method]: 1.98e-05 [auto_monad]: 6.153e-05 [graph_reusing]: 6.11998e-06 [inline]: 1.81003e-06 [add_attr]: 0.00298151, [1] [add_attr_with_inline]: 0.00297355, [1] [Cycle 1]: 6.537e-05, [2] [tag_attr]: 2.082e-05 [meta_addattr_fg_expand]: 6.64001e-06 [parallel-infer-symbol]: 2.92002e-06 [pre_auto_parallel]: 3.256e-05 [insert-virtual-dataset]: 2.17999e-06 [parallel-infer-symbol-second]: 6.50005e-07 [dataset_repeat_opt]: 1.71e-06 [pipeline_split]: 1.53002e-06 [optimize]: 0.00583649, [53] [py_interpret_to_execute]: 3.136e-05 [rewriter_before_opt_a]: 9.558e-05 [opt_a]: 0.00344921, [2] [Cycle 1]: 0.00245475, [45] [expand_dump_flag]: 3.68e-06 [switch_simplify]: 4.807e-05 [loop_unroll]: 3.588e-05 [a_1]: 0.00086655 [with_stream_mark]: 1.455e-05 [recompute_prepare]: 1.039e-05 [updatestate_depend_eliminate]: 4.58999e-06 [updatestate_assign_eliminate]: 3.9e-06 [updatestate_loads_eliminate]: 4e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 0.00013746 [accelerated_algorithm]: 9.09e-06 [shard]: 1.73002e-06 [meta_shard_fg_expand]: 2.08998e-06 [shard_inline]: 8.42e-06 [merge_send_recv]: 9.94999e-06 [auto_parallel]: 7.18e-06 [parallel]: 1.75e-05 [flash_sp]: 7.48e-06 [merge_comm]: 5.00999e-06 [allreduce_fusion]: 4.55999e-06 [matmul_add_comm_reduction]: 1.068e-05 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 9.97001e-06 [virtual_dataset]: 8.64003e-06 [get_grad_eliminate_]: 8.29998e-06 [virtual_output]: 8.30999e-06 [merge_forward]: 4.50999e-06 [cell_reuse_recompute_pass]: 1.17999e-06 [offload_activation]: 1.118e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.673e-05 [merge_recompute_call_nodes]: 1.83997e-06 [before_grad]: 1.326e-05 [set_forward_comm_id_for_comm_node_pass]: 4.37998e-06 [meta_fg_expand]: 3.44001e-06 [flash_sp_send_recv_attached]: 2.45002e-06 [receive_attached]: 2.79001e-06 [after_resolve]: 1.436e-05 [a_after_grad]: 1.357e-05 [renormalize]: 0.00060289 [add_forward_monad_depend]: 5.69999e-06 [auto_monad_grad]: 1.81e-06 [auto_monad_eliminator]: 1.633e-05 [cse]: 3.428e-05 [a_3]: 7.625e-05 [Cycle 2]: 0.00098212, [45] [expand_dump_flag]: 1.08001e-06 [switch_simplify]: 9.69999e-06 [loop_unroll]: 8.30999e-06 [a_1]: 0.00020257 [with_stream_mark]: 1.139e-05 [recompute_prepare]: 8.32e-06 [updatestate_depend_eliminate]: 3.88001e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 3.02002e-06 [parameter_eliminate]: 1.10999e-06 [a_2]: 0.00013305 [accelerated_algorithm]: 8.62e-06 [shard]: 1.07e-06 [meta_shard_fg_expand]: 1.49998e-06 [shard_inline]: 8.30999e-06 [merge_send_recv]: 5.59e-06 [auto_parallel]: 6.79001e-06 [parallel]: 4.57e-06 [flash_sp]: 2.98e-06 [merge_comm]: 4.32e-06 [allreduce_fusion]: 4.41002e-06 [matmul_add_comm_reduction]: 6.59999e-06 [allreduce_slice_to_reducescatter]: 3.39991e-07 [virtual_shard_identity]: 9.07001e-06 [virtual_dataset]: 8.27e-06 [get_grad_eliminate_]: 9.86e-06 [virtual_output]: 7.82e-06 [merge_forward]: 3.38e-06 [cell_reuse_recompute_pass]: 1.35001e-06 [offload_activation]: 7.53e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.737e-05 [merge_recompute_call_nodes]: 7.59988e-07 [before_grad]: 1.253e-05 [set_forward_comm_id_for_comm_node_pass]: 4.4e-06 [meta_fg_expand]: 3.19001e-06 [flash_sp_send_recv_attached]: 8.49977e-07 [receive_attached]: 9.5999e-07 [after_resolve]: 1.371e-05 [a_after_grad]: 1.266e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.35999e-06 [auto_monad_grad]: 1.05999e-06 [auto_monad_eliminator]: 8.52e-06 [cse]: 1.813e-05 [a_3]: 6.508e-05 [py_interpret_to_execute_after_opt_a]: 1.289e-05 [slice_cell_reuse_recomputed_activation]: 4.68001e-06 [rewriter_after_opt_a]: 4.412e-05 [convert_after_rewriter]: 1.093e-05 [order_py_execute_after_rewriter]: 8.85001e-06 [mutable_eliminate]: 0.00050873 [opt_b]: 0.00033439, [1] [Cycle 1]: 0.00032565, [7] [b_1]: 0.00022445 [b_2]: 1.017e-05 [updatestate_depend_eliminate]: 6.00002e-06 [updatestate_assign_eliminate]: 3.16999e-06 [updatestate_loads_eliminate]: 3.6e-06 [renormalize]: 4.89992e-07 [cse]: 2.152e-05 [optimize_parallel_all_gather_comm]: 2.022e-05 [overlap_param_gather]: 4.68001e-06 [cconv]: 2.625e-05 [loop_unroll]: 0.00043255 [opt_after_cconv]: 0.00014367, [1] [Cycle 1]: 0.00013537, [7] [c_1]: 4.238e-05 [parameter_eliminate]: 2.48e-06 [updatestate_depend_eliminate]: 6.07999e-06 [updatestate_assign_eliminate]: 3.46999e-06 [updatestate_loads_eliminate]: 3.33e-06 [cse]: 2.186e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.783e-05 [tuple_transform]: 0.00010625, [1] [Cycle 1]: 9.938e-05, [4] [d_1]: 5.805e-05 [none_parameter_eliminate]: 1.57001e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 9.24e-06 [partial_unused_args_eliminate]: 4.32e-06 [add_recomputation]: 5.711e-05 [cse_after_recomputation]: 3.124e-05, [1] [Cycle 1]: 2.427e-05, [1] [cse]: 1.514e-05 [environ_conv]: 9.37999e-06 [swap_dp_allreduce_reducescatter]: 8.60001e-06 [bias_add_comm_swap]: 4.82e-06 [label_micro_interleaved_index]: 6.71e-06 [label_fine_grained_interleaved_index]: 5.07e-06 [merge_cast_opt]: 3.64002e-06 [slice_recompute_activation]: 4.22e-06 [micro_interleaved_order_control]: 5.00001e-06 [assign_add_opt]: 3.59002e-06 [ForceFp32Comm]: 3.20002e-06 [remove_cast_before_assign_add]: 3.29001e-06 [full_micro_interleaved_order_control]: 4.60999e-06 [reorder_send_recv_between_fp_bp]: 5.24e-06 [comm_op_add_attrs]: 3.35e-06 [add_comm_op_reuse_tag]: 3.23998e-06 [interleave_split_concat_branches]: 3.42002e-06 [interleave_parallel_branches]: 3.5e-06 [overlap_opt_shard_in_pipeline]: 3.43999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.68999e-06 [control_data_broadcast_order]: 1.747e-05 [grouped_pairwise_exchange_alltoall]: 4.06001e-06 [offloading_packed_experts]: 7.06999e-06 [overlap_recompute_and_grad_model_parallel]: 7.97e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.53999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.63e-06 [overlap_recompute_comm]: 4.60001e-06 [overlap_grad_ring_attention]: 7.3e-06 [overlap_grad_flash_sp]: 2.375e-05 [begin_end_overlap_inline]: 3.09001e-06 [split_matmul_comm_elemetwise]: 4.55999e-06 [split_layernorm_comm]: 3.92998e-06 [handle_group_info]: 3.20002e-06 [symbol_engine_optimizer]: 0.00010448, [1] [Cycle 1]: 9.792e-05, [6] [build]: 2.89999e-06 [elim_shapecalc]: 1.135e-05 [elim_not_effective]: 1.59e-05 [opt_reshape]: 8.79e-06 [fold_const_symbol]: 1.318e-05 [renormalize]: 2.20025e-07 [detach_backward]: 3.28e-06 [pipeline_parallel_scheduler]: 1.66e-06 [auto_monad_reorder]: 2.174e-05 [get_jit_bprop_graph]: 1.17e-06 [rewriter_after_jit_bprop_graph]: 3.91001e-06 [opt_after_jit_grad]: 0.00047652 [validate]: 3.967e-05 Sums bootstrap : 0.000443s : 3.85% type_inference : 0.005679s : 49.37% event_method : 0.000020s : 0.17% auto_monad : 0.000062s : 0.53% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.27% optimize.rewriter_before_opt_a : 0.000096s : 0.83% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000058s : 0.50% optimize.opt_a.loop_unroll : 0.000044s : 0.38% optimize.opt_a.a_1 : 0.001069s : 9.29% optimize.opt_a.with_stream_mark : 0.000026s : 0.23% optimize.opt_a.recompute_prepare : 0.000019s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000271s : 2.35% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.15% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000017s : 0.15% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000022s : 0.19% optimize.opt_a.flash_sp : 0.000010s : 0.09% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.17% optimize.opt_a.virtual_dataset : 0.000017s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.16% optimize.opt_a.virtual_output : 0.000016s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000028s : 0.24% optimize.opt_a.a_after_grad : 0.000026s : 0.23% optimize.opt_a.renormalize : 0.000603s : 5.24% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.22% optimize.opt_a.cse : 0.000052s : 0.46% optimize.opt_a.a_3 : 0.000141s : 1.23% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000044s : 0.38% optimize.convert_after_rewriter : 0.000011s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000509s : 4.42% optimize.opt_b.b_1 : 0.000224s : 1.95% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000026s : 0.23% optimize.loop_unroll : 0.000433s : 3.76% optimize.opt_after_cconv.c_1 : 0.000042s : 0.37% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.15% optimize.tuple_transform.d_1 : 0.000058s : 0.50% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000057s : 0.50% optimize.cse_after_recomputation.cse : 0.000015s : 0.13% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000024s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.19% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000477s : 4.14% validate : 0.000040s : 0.34% Time group info: ------[substitution.] 0.000212 44 9.07% : 0.000019s : 3: substitution.cast_eliminate 1.02% : 0.000002s : 3: substitution.elim_not_effective 0.77% : 0.000002s : 3: substitution.fold_const_symbol 3.54% : 0.000008s : 6: substitution.graph_param_transform 66.99% : 0.000142s : 4: substitution.inline 1.97% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.83% : 0.000006s : 6: substitution.remove_not_recompute_node 2.61% : 0.000006s : 6: substitution.replace_old_param 8.65% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.54% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005632 2 87.13% : 0.004907s : 1: type_inference.infer 12.87% : 0.000725s : 1: type_inference.specialize ------[replace.] 0.000071 10 51.11% : 0.000036s : 4: replace.inline 48.89% : 0.000035s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000155 10 89.96% : 0.000139s : 4: match.inline 10.04% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000293 1954 0.98% : 0.000003s : 21: predicate.accumulaten_eliminater 0.58% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.53% : 0.000002s : 12: predicate.addn_check_dump 0.96% : 0.000003s : 21: predicate.addn_zero_filter 0.89% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.04% : 0.000006s : 33: predicate.arithmetic_simplify 1.04% : 0.000003s : 21: predicate.cast_eliminate 0.62% : 0.000002s : 12: predicate.check_bprop_eliminate 0.58% : 0.000002s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.61% : 0.000002s : 12: predicate.depend_value_elim 1.03% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.10% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.93% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.83% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 6: predicate.elim_not_effective 0.32% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 27: predicate.environ_add_const_eliminate 1.21% : 0.000004s : 27: predicate.environ_get_add_eliminate 1.21% : 0.000004s : 27: predicate.environ_get_depend_swap 1.90% : 0.000006s : 39: predicate.environ_get_eliminate 1.20% : 0.000004s : 27: predicate.environ_get_set_eliminate 1.49% : 0.000004s : 31: predicate.exchange_switch_depend_value 2.37% : 0.000007s : 31: predicate.float_depend_g_call 0.54% : 0.000002s : 12: predicate.float_environ_get_switch 0.79% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.64% : 0.000002s : 12: predicate.get_grad_eliminate 0.19% : 0.000001s : 6: predicate.graph_param_transform 0.60% : 0.000002s : 12: predicate.incorporate_call 0.49% : 0.000001s : 12: predicate.incorporate_call_switch 6.18% : 0.000018s : 88: predicate.inline 0.82% : 0.000002s : 12: predicate.inline_without_move 0.36% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.70% : 0.000002s : 12: predicate.less_batch_normalization 1.95% : 0.000006s : 39: predicate.list_to_tuple_eliminator_ 2.77% : 0.000008s : 60: predicate.load_eliminater 0.64% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.04% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.59% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.55% : 0.000002s : 12: predicate.merge_addn 0.70% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.58% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.94% : 0.000003s : 21: predicate.minmaximum_grad 0.67% : 0.000002s : 6: predicate.mutable_eliminate 0.30% : 0.000001s : 6: predicate.opt_reshape 0.34% : 0.000001s : 6: predicate.parallel_virtual_node 1.73% : 0.000005s : 31: predicate.partial_defer_inline 1.82% : 0.000005s : 33: predicate.partial_eliminate 1.04% : 0.000003s : 21: predicate.print_const_string_wrapper 0.53% : 0.000002s : 12: predicate.reduce_all_const_elim 1.23% : 0.000004s : 21: predicate.reduce_eliminate 2.67% : 0.000008s : 60: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 12: predicate.remove_not_recompute_node 1.49% : 0.000004s : 39: predicate.replace_applicator 0.48% : 0.000001s : 12: predicate.replace_old_param 0.24% : 0.000001s : 6: predicate.reset_defer_inline 1.05% : 0.000003s : 21: predicate.reshape_eliminate 0.64% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 6: predicate.row_tensor_eliminate 0.67% : 0.000002s : 12: predicate.same_eliminate 0.44% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 12: predicate.shard_identity_eliminate 0.66% : 0.000002s : 12: predicate.special_op_eliminate 0.71% : 0.000002s : 12: predicate.specialize_transform 0.72% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.56% : 0.000005s : 31: predicate.switch_defer_inline 2.17% : 0.000006s : 43: predicate.switch_layer_defer_inline 4.80% : 0.000014s : 91: predicate.switch_simplify 0.99% : 0.000003s : 21: predicate.tile_eliminate 1.06% : 0.000003s : 21: predicate.transpose_eliminate 1.59% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.72% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000004s : 33: predicate.tuple_list_get_item_depend_reorder 3.28% : 0.000010s : 51: predicate.tuple_list_get_item_eliminator 1.61% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.29% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.83% : 0.000005s : 39: predicate.tuple_to_list_eliminator_ 2.66% : 0.000008s : 60: predicate.updatestate_pure_node_eliminater 3.37% : 0.000010s : 72: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 6: predicate.value_based_eliminate 0.62% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.55% : 0.000002s : 12: predicate.virtual_output_eliminate 0.30% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000518 11 54.84% : 0.000284s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.16% : 0.000234s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027496 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.87% : 0.002990s : 1: add_attr 10.83% : 0.002977s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000061s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000072s : 1: auto_monad 0.11% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000007s : 1: bias_add_comm_swap 1.77% : 0.000488s : 1: bootstrap 0.11% : 0.000029s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000034s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000017s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000030s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000007s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.59% : 0.000438s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.87% : 0.000515s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 6.00% : 0.001649s : 78: opt.transform.opt_a 0.15% : 0.000041s : 1: opt.transform.opt_after_cconv 0.11% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.59% : 0.000161s : 28: opt.transform.opt_b 0.24% : 0.000065s : 2: opt.transform.opt_trans_graph 0.17% : 0.000046s : 4: opt.transform.symbol_engine_opt 12.56% : 0.003452s : 1: opt_a 0.53% : 0.000147s : 1: opt_after_cconv 1.77% : 0.000486s : 1: opt_after_jit_grad 1.23% : 0.000338s : 1: opt_b 22.41% : 0.006162s : 1: optimize 0.09% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000040s : 1: pre_auto_parallel 0.13% : 0.000035s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000021s : 1: remove_dup_value 1.15% : 0.000317s : 1: renormalize.infer 1.01% : 0.000278s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000048s : 1: rewriter_after_opt_a 0.36% : 0.000099s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000107s : 1: symbol_engine_optimizer 0.40% : 0.000109s : 1: tuple_transform 20.77% : 0.005710s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:14.806.751 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0161541, [21] [bootstrap]: 0.00048022 [type_inference]: 0.00605284 [event_method]: 1.987e-05 [auto_monad]: 6.38e-05 [graph_reusing]: 5.75001e-06 [inline]: 2.25002e-06 [add_attr]: 0.00315846, [1] [add_attr_with_inline]: 0.00315063, [1] [Cycle 1]: 5.843e-05, [2] [tag_attr]: 2.062e-05 [meta_addattr_fg_expand]: 6.49001e-06 [parallel-infer-symbol]: 3.35e-06 [pre_auto_parallel]: 3.523e-05 [insert-virtual-dataset]: 2.94999e-06 [parallel-infer-symbol-second]: 9.49978e-07 [dataset_repeat_opt]: 2.13002e-06 [pipeline_split]: 1.61002e-06 [optimize]: 0.00559765, [53] [py_interpret_to_execute]: 3.07e-05 [rewriter_before_opt_a]: 9.526e-05 [opt_a]: 0.0033326, [2] [Cycle 1]: 0.00246159, [45] [expand_dump_flag]: 3.09001e-06 [switch_simplify]: 4.839e-05 [loop_unroll]: 3.546e-05 [a_1]: 0.00088744 [with_stream_mark]: 1.891e-05 [recompute_prepare]: 1.366e-05 [updatestate_depend_eliminate]: 4.90001e-06 [updatestate_assign_eliminate]: 4.22e-06 [updatestate_loads_eliminate]: 3.74002e-06 [parameter_eliminate]: 1.72001e-06 [a_2]: 0.00011338 [accelerated_algorithm]: 9.56e-06 [shard]: 2.33998e-06 [meta_shard_fg_expand]: 2.26998e-06 [shard_inline]: 8.62e-06 [merge_send_recv]: 1.004e-05 [auto_parallel]: 7.2e-06 [parallel]: 1.934e-05 [flash_sp]: 9.03002e-06 [merge_comm]: 4.99e-06 [allreduce_fusion]: 4.65999e-06 [matmul_add_comm_reduction]: 1.034e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 1.058e-05 [virtual_dataset]: 8.43999e-06 [get_grad_eliminate_]: 8.38001e-06 [virtual_output]: 8.41002e-06 [merge_forward]: 4.73001e-06 [cell_reuse_recompute_pass]: 1.27e-06 [offload_activation]: 1.054e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.647e-05 [merge_recompute_call_nodes]: 1.50001e-06 [before_grad]: 1.643e-05 [set_forward_comm_id_for_comm_node_pass]: 4.94e-06 [meta_fg_expand]: 3.48e-06 [flash_sp_send_recv_attached]: 2.68e-06 [receive_attached]: 2.18002e-06 [after_resolve]: 1.553e-05 [a_after_grad]: 1.408e-05 [renormalize]: 0.00072259 [add_forward_monad_depend]: 6.92002e-06 [auto_monad_grad]: 2.31998e-06 [auto_monad_eliminator]: 1.786e-05 [cse]: 3.612e-05 [a_3]: 6.61e-05 [Cycle 2]: 0.00086079, [45] [expand_dump_flag]: 1.42999e-06 [switch_simplify]: 1.011e-05 [loop_unroll]: 8.18001e-06 [a_1]: 0.00020288 [with_stream_mark]: 1.402e-05 [recompute_prepare]: 9.51e-06 [updatestate_depend_eliminate]: 3.93999e-06 [updatestate_assign_eliminate]: 3.37002e-06 [updatestate_loads_eliminate]: 3.31999e-06 [parameter_eliminate]: 1.59998e-06 [a_2]: 0.00011375 [accelerated_algorithm]: 9.07999e-06 [shard]: 1.30999e-06 [meta_shard_fg_expand]: 2.17999e-06 [shard_inline]: 8.22e-06 [merge_send_recv]: 7.11001e-06 [auto_parallel]: 7.5e-06 [parallel]: 6.11998e-06 [flash_sp]: 3.46999e-06 [merge_comm]: 4.92e-06 [allreduce_fusion]: 3.73999e-06 [matmul_add_comm_reduction]: 8.32998e-06 [allreduce_slice_to_reducescatter]: 3.30008e-07 [virtual_shard_identity]: 1.063e-05 [virtual_dataset]: 7.88999e-06 [get_grad_eliminate_]: 8.16002e-06 [virtual_output]: 7.62998e-06 [merge_forward]: 3.83999e-06 [cell_reuse_recompute_pass]: 1.62001e-06 [offload_activation]: 9.04e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.576e-05 [merge_recompute_call_nodes]: 1.10999e-06 [before_grad]: 1.477e-05 [set_forward_comm_id_for_comm_node_pass]: 4.63999e-06 [meta_fg_expand]: 3.06001e-06 [flash_sp_send_recv_attached]: 8.00006e-07 [receive_attached]: 1.50001e-06 [after_resolve]: 1.515e-05 [a_after_grad]: 1.227e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.58e-06 [auto_monad_grad]: 1.52999e-06 [auto_monad_eliminator]: 1.179e-05 [cse]: 2.311e-05 [a_3]: 5.018e-05 [py_interpret_to_execute_after_opt_a]: 1.252e-05 [slice_cell_reuse_recomputed_activation]: 2.03997e-06 [rewriter_after_opt_a]: 4.248e-05 [convert_after_rewriter]: 8.24998e-06 [order_py_execute_after_rewriter]: 5.74e-06 [mutable_eliminate]: 0.00054141 [opt_b]: 0.00027692, [1] [Cycle 1]: 0.00027062, [7] [b_1]: 0.00018082 [b_2]: 1.077e-05 [updatestate_depend_eliminate]: 7.65e-06 [updatestate_assign_eliminate]: 3.22997e-06 [updatestate_loads_eliminate]: 3.94002e-06 [renormalize]: 7.79983e-07 [cse]: 2.698e-05 [optimize_parallel_all_gather_comm]: 1.997e-05 [overlap_param_gather]: 2.07001e-06 [cconv]: 2.733e-05 [loop_unroll]: 0.00045522 [opt_after_cconv]: 0.00012477, [1] [Cycle 1]: 0.00011861, [7] [c_1]: 4.187e-05 [parameter_eliminate]: 3.33e-06 [updatestate_depend_eliminate]: 7.20998e-06 [updatestate_assign_eliminate]: 3.29001e-06 [updatestate_loads_eliminate]: 3.32002e-06 [cse]: 2.395e-05 [renormalize]: 9.70002e-07 [remove_dup_value]: 1.531e-05 [tuple_transform]: 9.503e-05, [1] [Cycle 1]: 8.966e-05, [4] [d_1]: 5.944e-05 [none_parameter_eliminate]: 2.03997e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 8.96998e-06 [partial_unused_args_eliminate]: 2.07001e-06 [add_recomputation]: 6.199e-05 [cse_after_recomputation]: 2.686e-05, [1] [Cycle 1]: 2.224e-05, [1] [cse]: 1.619e-05 [environ_conv]: 6.87002e-06 [swap_dp_allreduce_reducescatter]: 6.21e-06 [bias_add_comm_swap]: 2.78998e-06 [label_micro_interleaved_index]: 5.37001e-06 [label_fine_grained_interleaved_index]: 2.94001e-06 [merge_cast_opt]: 1.27999e-06 [slice_recompute_activation]: 1.88997e-06 [micro_interleaved_order_control]: 2.40002e-06 [assign_add_opt]: 1.20001e-06 [ForceFp32Comm]: 7.59988e-07 [remove_cast_before_assign_add]: 1.32e-06 [full_micro_interleaved_order_control]: 2.40002e-06 [reorder_send_recv_between_fp_bp]: 2.68e-06 [comm_op_add_attrs]: 1.32e-06 [add_comm_op_reuse_tag]: 9.60019e-07 [interleave_split_concat_branches]: 1.48002e-06 [interleave_parallel_branches]: 1.02998e-06 [overlap_opt_shard_in_pipeline]: 1.15001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.10002e-06 [control_data_broadcast_order]: 1.472e-05 [grouped_pairwise_exchange_alltoall]: 1.73002e-06 [offloading_packed_experts]: 4.57e-06 [overlap_recompute_and_grad_model_parallel]: 5.53002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.18001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32e-06 [overlap_recompute_comm]: 2.24999e-06 [overlap_grad_ring_attention]: 4.37998e-06 [overlap_grad_flash_sp]: 2.207e-05 [begin_end_overlap_inline]: 6.19999e-07 [split_matmul_comm_elemetwise]: 2.27999e-06 [split_layernorm_comm]: 1.54998e-06 [handle_group_info]: 8.99978e-07 [symbol_engine_optimizer]: 8.995e-05, [1] [Cycle 1]: 8.53e-05, [6] [build]: 3.08e-06 [elim_shapecalc]: 1.249e-05 [elim_not_effective]: 1.716e-05 [opt_reshape]: 8.77999e-06 [fold_const_symbol]: 1.336e-05 [renormalize]: 4.39992e-07 [detach_backward]: 2.26e-06 [pipeline_parallel_scheduler]: 1.43002e-06 [auto_monad_reorder]: 2.01e-05 [get_jit_bprop_graph]: 1.96e-06 [rewriter_after_jit_bprop_graph]: 4.12e-06 [opt_after_jit_grad]: 0.00050593 [validate]: 4.525e-05 Sums bootstrap : 0.000480s : 4.00% type_inference : 0.006053s : 50.36% event_method : 0.000020s : 0.17% auto_monad : 0.000064s : 0.53% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000035s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.26% optimize.rewriter_before_opt_a : 0.000095s : 0.79% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000059s : 0.49% optimize.opt_a.loop_unroll : 0.000044s : 0.36% optimize.opt_a.a_1 : 0.001090s : 9.07% optimize.opt_a.with_stream_mark : 0.000033s : 0.27% optimize.opt_a.recompute_prepare : 0.000023s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000227s : 1.89% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.16% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000017s : 0.14% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.21% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.18% optimize.opt_a.virtual_dataset : 0.000016s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.14% optimize.opt_a.virtual_output : 0.000016s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000031s : 0.26% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000031s : 0.26% optimize.opt_a.a_after_grad : 0.000026s : 0.22% optimize.opt_a.renormalize : 0.000723s : 6.01% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.25% optimize.opt_a.cse : 0.000059s : 0.49% optimize.opt_a.a_3 : 0.000116s : 0.97% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000042s : 0.35% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000541s : 4.50% optimize.opt_b.b_1 : 0.000181s : 1.50% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000027s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000027s : 0.23% optimize.loop_unroll : 0.000455s : 3.79% optimize.opt_after_cconv.c_1 : 0.000042s : 0.35% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.20% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000015s : 0.13% optimize.tuple_transform.d_1 : 0.000059s : 0.49% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000062s : 0.52% optimize.cse_after_recomputation.cse : 0.000016s : 0.13% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000022s : 0.18% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000020s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000506s : 4.21% validate : 0.000045s : 0.38% Time group info: ------[substitution.] 0.000232 44 9.30% : 0.000022s : 3: substitution.cast_eliminate 1.02% : 0.000002s : 3: substitution.elim_not_effective 0.73% : 0.000002s : 3: substitution.fold_const_symbol 3.15% : 0.000007s : 6: substitution.graph_param_transform 67.91% : 0.000157s : 4: substitution.inline 2.07% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.39% : 0.000006s : 6: substitution.remove_not_recompute_node 2.79% : 0.000006s : 6: substitution.replace_old_param 8.32% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.32% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005994 2 87.39% : 0.005239s : 1: type_inference.infer 12.61% : 0.000756s : 1: type_inference.specialize ------[replace.] 0.000076 10 51.36% : 0.000039s : 4: replace.inline 48.64% : 0.000037s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000171 10 90.74% : 0.000155s : 4: match.inline 9.26% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000305 1954 0.96% : 0.000003s : 21: predicate.accumulaten_eliminater 0.80% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.50% : 0.000002s : 12: predicate.addn_check_dump 0.92% : 0.000003s : 21: predicate.addn_zero_filter 0.89% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 1.93% : 0.000006s : 33: predicate.arithmetic_simplify 1.02% : 0.000003s : 21: predicate.cast_eliminate 0.65% : 0.000002s : 12: predicate.check_bprop_eliminate 0.51% : 0.000002s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.59% : 0.000002s : 12: predicate.depend_value_elim 0.98% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.09% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.89% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.79% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 6: predicate.elim_not_effective 0.42% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.15% : 0.000003s : 27: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 27: predicate.environ_get_depend_swap 1.71% : 0.000005s : 39: predicate.environ_get_eliminate 1.11% : 0.000003s : 27: predicate.environ_get_set_eliminate 1.41% : 0.000004s : 31: predicate.exchange_switch_depend_value 2.29% : 0.000007s : 31: predicate.float_depend_g_call 0.49% : 0.000001s : 12: predicate.float_environ_get_switch 0.74% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.58% : 0.000002s : 12: predicate.get_grad_eliminate 0.20% : 0.000001s : 6: predicate.graph_param_transform 0.58% : 0.000002s : 12: predicate.incorporate_call 0.48% : 0.000001s : 12: predicate.incorporate_call_switch 9.25% : 0.000028s : 88: predicate.inline 0.80% : 0.000002s : 12: predicate.inline_without_move 0.33% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.72% : 0.000002s : 12: predicate.less_batch_normalization 1.81% : 0.000006s : 39: predicate.list_to_tuple_eliminator_ 2.62% : 0.000008s : 60: predicate.load_eliminater 0.78% : 0.000002s : 6: predicate.loop_unroll_after_grad 1.90% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.53% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.53% : 0.000002s : 12: predicate.merge_addn 0.52% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.55% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.99% : 0.000003s : 21: predicate.minmaximum_grad 0.90% : 0.000003s : 6: predicate.mutable_eliminate 0.34% : 0.000001s : 6: predicate.opt_reshape 0.34% : 0.000001s : 6: predicate.parallel_virtual_node 1.75% : 0.000005s : 31: predicate.partial_defer_inline 1.77% : 0.000005s : 33: predicate.partial_eliminate 0.93% : 0.000003s : 21: predicate.print_const_string_wrapper 0.51% : 0.000002s : 12: predicate.reduce_all_const_elim 1.15% : 0.000004s : 21: predicate.reduce_eliminate 2.53% : 0.000008s : 60: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 12: predicate.remove_not_recompute_node 1.51% : 0.000005s : 39: predicate.replace_applicator 0.46% : 0.000001s : 12: predicate.replace_old_param 0.30% : 0.000001s : 6: predicate.reset_defer_inline 0.92% : 0.000003s : 21: predicate.reshape_eliminate 0.54% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 6: predicate.row_tensor_eliminate 0.65% : 0.000002s : 12: predicate.same_eliminate 0.59% : 0.000002s : 12: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 12: predicate.shard_identity_eliminate 0.59% : 0.000002s : 12: predicate.special_op_eliminate 0.70% : 0.000002s : 12: predicate.specialize_transform 0.81% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.60% : 0.000005s : 31: predicate.switch_defer_inline 2.13% : 0.000006s : 43: predicate.switch_layer_defer_inline 4.51% : 0.000014s : 91: predicate.switch_simplify 0.92% : 0.000003s : 21: predicate.tile_eliminate 0.93% : 0.000003s : 21: predicate.transpose_eliminate 1.48% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 3.13% : 0.000010s : 51: predicate.tuple_list_get_item_eliminator 1.58% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.77% : 0.000005s : 39: predicate.tuple_to_list_eliminator_ 2.55% : 0.000008s : 60: predicate.updatestate_pure_node_eliminater 3.16% : 0.000010s : 72: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 6: predicate.value_based_eliminate 0.59% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.59% : 0.000002s : 12: predicate.virtual_output_eliminate 0.30% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000612 11 57.00% : 0.000349s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.00% : 0.000263s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027576 192 0.01% : 0.000003s : 1: ForceFp32Comm 11.47% : 0.003163s : 1: add_attr 11.44% : 0.003154s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000066s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.26% : 0.000072s : 1: auto_monad 0.09% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.83% : 0.000505s : 1: bootstrap 0.11% : 0.000031s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.11% : 0.000030s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.09% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.68% : 0.000464s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.00% : 0.000550s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000020s : 1: opt.transform.mutable_eliminate 6.13% : 0.001690s : 78: opt.transform.opt_a 0.15% : 0.000041s : 1: opt.transform.opt_after_cconv 0.12% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.58% : 0.000159s : 28: opt.transform.opt_b 0.24% : 0.000066s : 2: opt.transform.opt_trans_graph 0.17% : 0.000048s : 4: opt.transform.symbol_engine_opt 12.10% : 0.003336s : 1: opt_a 0.47% : 0.000128s : 1: opt_after_cconv 1.87% : 0.000516s : 1: opt_after_jit_grad 1.02% : 0.000280s : 1: opt_b 20.32% : 0.005602s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000005s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000039s : 1: pre_auto_parallel 0.13% : 0.000035s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.42% : 0.000391s : 1: renormalize.infer 1.17% : 0.000324s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000047s : 1: rewriter_after_opt_a 0.36% : 0.000100s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000093s : 1: symbol_engine_optimizer 0.36% : 0.000098s : 1: tuple_transform 22.00% : 0.006068s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:15.547. [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:15.795. [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0159599, [21] [bootstrap]: 0.00043446 [type_inference]: 0.00562218 [event_method]: 1.973e-05 [auto_monad]: 6.116e-05 [graph_reusing]: 5.97999e-06 [inline]: 2.12999e-06 [add_attr]: 0.00304191, [1] [add_attr_with_inline]: 0.00303368, [1] [Cycle 1]: 7.38e-05, [2] [tag_attr]: 2.068e-05 [meta_addattr_fg_expand]: 6.37001e-06 [parallel-infer-symbol]: 3.38e-06 [pre_auto_parallel]: 3.835e-05 [insert-virtual-dataset]: 2.54001e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.06e-06 [pipeline_split]: 1.60001e-06 [optimize]: 0.00558868, [53] [py_interpret_to_execute]: 3.725e-05 [rewriter_before_opt_a]: 9.562e-05 [opt_a]: 0.00322716, [2] [Cycle 1]: 0.0022904, [45] [expand_dump_flag]: 3.25998e-06 [switch_simplify]: 4.669e-05 [loop_unroll]: 3.484e-05 [a_1]: 0.00072897 [with_stream_mark]: 1.749e-05 [recompute_prepare]: 1.161e-05 [updatestate_depend_eliminate]: 3.98001e-06 [updatestate_assign_eliminate]: 3.43e-06 [updatestate_loads_eliminate]: 3.02002e-06 [parameter_eliminate]: 2.15002e-06 [a_2]: 0.00012479 [accelerated_algorithm]: 7.83001e-06 [shard]: 2.48998e-06 [meta_shard_fg_expand]: 2.09999e-06 [shard_inline]: 7.11999e-06 [merge_send_recv]: 9.36002e-06 [auto_parallel]: 6.93e-06 [parallel]: 1.865e-05 [flash_sp]: 8.82999e-06 [merge_comm]: 4.16001e-06 [allreduce_fusion]: 3.58e-06 [matmul_add_comm_reduction]: 1.041e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 9.35001e-06 [virtual_dataset]: 7.67002e-06 [get_grad_eliminate_]: 7.11001e-06 [virtual_output]: 7.05e-06 [merge_forward]: 3.75e-06 [cell_reuse_recompute_pass]: 1.25999e-06 [offload_activation]: 9.29998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.936e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 1.246e-05 [set_forward_comm_id_for_comm_node_pass]: 4.04002e-06 [meta_fg_expand]: 2.89001e-06 [flash_sp_send_recv_attached]: 2.42001e-06 [receive_attached]: 2.39999e-06 [after_resolve]: 1.406e-05 [a_after_grad]: 1.109e-05 [renormalize]: 0.00058777 [add_forward_monad_depend]: 6.64999e-06 [auto_monad_grad]: 2.72001e-06 [auto_monad_eliminator]: 1.57e-05 [cse]: 3.198e-05 [a_3]: 6.735e-05 [Cycle 2]: 0.00092371, [45] [expand_dump_flag]: 1.10999e-06 [switch_simplify]: 8.35001e-06 [loop_unroll]: 6.95998e-06 [a_1]: 0.00015795 [with_stream_mark]: 1.365e-05 [recompute_prepare]: 7.59002e-06 [updatestate_depend_eliminate]: 3.13e-06 [updatestate_assign_eliminate]: 3.12002e-06 [updatestate_loads_eliminate]: 2.52001e-06 [parameter_eliminate]: 1.74e-06 [a_2]: 0.00010856 [accelerated_algorithm]: 7.75998e-06 [shard]: 1.35001e-06 [meta_shard_fg_expand]: 1.76998e-06 [shard_inline]: 7.16001e-06 [merge_send_recv]: 7.16999e-06 [auto_parallel]: 5.81e-06 [parallel]: 5.24998e-06 [flash_sp]: 3.28998e-06 [merge_comm]: 3.41001e-06 [allreduce_fusion]: 3.09999e-06 [matmul_add_comm_reduction]: 7e-06 [allreduce_slice_to_reducescatter]: 3.39991e-07 [virtual_shard_identity]: 8.69003e-06 [virtual_dataset]: 7.11999e-06 [get_grad_eliminate_]: 6.50002e-06 [virtual_output]: 6.17001e-06 [merge_forward]: 2.93998e-06 [cell_reuse_recompute_pass]: 1.74998e-06 [offload_activation]: 7.41001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.734e-05 [merge_recompute_call_nodes]: 1.03001e-06 [before_grad]: 1.069e-05 [set_forward_comm_id_for_comm_node_pass]: 4.18999e-06 [meta_fg_expand]: 2.19999e-06 [flash_sp_send_recv_attached]: 1.06997e-06 [receive_attached]: 1.04003e-06 [after_resolve]: 1.369e-05 [a_after_grad]: 1.063e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 3.14999e-06 [auto_monad_grad]: 2.17001e-06 [auto_monad_eliminator]: 9.37001e-06 [cse]: 1.846e-05 [a_3]: 5.312e-05 [py_interpret_to_execute_after_opt_a]: 1.353e-05 [slice_cell_reuse_recomputed_activation]: 4.99e-06 [rewriter_after_opt_a]: 4.089e-05 [convert_after_rewriter]: 9.92999e-06 [order_py_execute_after_rewriter]: 8.19002e-06 [mutable_eliminate]: 0.00050466 [opt_b]: 0.00030101, [1] [Cycle 1]: 0.0002915, [7] [b_1]: 0.00019118 [b_2]: 9.42999e-06 [updatestate_depend_eliminate]: 6.02001e-06 [updatestate_assign_eliminate]: 2.84001e-06 [updatestate_loads_eliminate]: 2.90002e-06 [renormalize]: 8.79983e-07 [cse]: 2.06e-05 [optimize_parallel_all_gather_comm]: 2.093e-05 [overlap_param_gather]: 4.94e-06 [cconv]: 2.977e-05 [loop_unroll]: 0.00045701 [opt_after_cconv]: 0.00013548, [1] [Cycle 1]: 0.00012663, [7] [c_1]: 3.581e-05 [parameter_eliminate]: 3.31001e-06 [updatestate_depend_eliminate]: 6.01e-06 [updatestate_assign_eliminate]: 2.47001e-06 [updatestate_loads_eliminate]: 2.32999e-06 [cse]: 1.986e-05 [renormalize]: 8.30012e-07 [remove_dup_value]: 1.638e-05 [tuple_transform]: 9.997e-05, [1] [Cycle 1]: 9.284e-05, [4] [d_1]: 5.152e-05 [none_parameter_eliminate]: 1.91003e-06 [renormalize]: 1.39989e-07 [switch_simplify]: 7.78001e-06 [partial_unused_args_eliminate]: 4.58999e-06 [add_recomputation]: 5.186e-05 [cse_after_recomputation]: 2.97e-05, [1] [Cycle 1]: 2.208e-05, [1] [cse]: 1.233e-05 [environ_conv]: 8.05999e-06 [swap_dp_allreduce_reducescatter]: 8.17e-06 [bias_add_comm_swap]: 5.34e-06 [label_micro_interleaved_index]: 7.55e-06 [label_fine_grained_interleaved_index]: 5.32001e-06 [merge_cast_opt]: 3.76999e-06 [slice_recompute_activation]: 4.63999e-06 [micro_interleaved_order_control]: 4.93001e-06 [assign_add_opt]: 3.48999e-06 [ForceFp32Comm]: 3.16999e-06 [remove_cast_before_assign_add]: 3.63999e-06 [full_micro_interleaved_order_control]: 4.90999e-06 [reorder_send_recv_between_fp_bp]: 5.56e-06 [comm_op_add_attrs]: 3.95998e-06 [add_comm_op_reuse_tag]: 3.23e-06 [interleave_split_concat_branches]: 3.57002e-06 [interleave_parallel_branches]: 3.4e-06 [overlap_opt_shard_in_pipeline]: 3.94002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.23001e-06 [control_data_broadcast_order]: 1.586e-05 [grouped_pairwise_exchange_alltoall]: 4.01001e-06 [offloading_packed_experts]: 6.66999e-06 [overlap_recompute_and_grad_model_parallel]: 7.48999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.50998e-06 [overlap_recompute_allgather_and_fa_grad]: 3.83001e-06 [overlap_recompute_comm]: 4.60001e-06 [overlap_grad_ring_attention]: 6.30002e-06 [overlap_grad_flash_sp]: 2.195e-05 [begin_end_overlap_inline]: 2.84999e-06 [split_matmul_comm_elemetwise]: 4.85001e-06 [split_layernorm_comm]: 3.93001e-06 [handle_group_info]: 3.41999e-06 [symbol_engine_optimizer]: 0.00010127, [1] [Cycle 1]: 9.433e-05, [6] [build]: 2.98e-06 [elim_shapecalc]: 1.076e-05 [elim_not_effective]: 1.434e-05 [opt_reshape]: 7.95e-06 [fold_const_symbol]: 1.049e-05 [renormalize]: 4.20026e-07 [detach_backward]: 3.48999e-06 [pipeline_parallel_scheduler]: 1.87999e-06 [auto_monad_reorder]: 2.147e-05 [get_jit_bprop_graph]: 1.59e-06 [rewriter_after_jit_bprop_graph]: 3.90998e-06 [opt_after_jit_grad]: 0.00050947 [validate]: 3.686e-05 Sums bootstrap : 0.000434s : 3.90% type_inference : 0.005622s : 50.41% event_method : 0.000020s : 0.18% auto_monad : 0.000061s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000038s : 0.34% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000037s : 0.33% optimize.rewriter_before_opt_a : 0.000096s : 0.86% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.49% optimize.opt_a.loop_unroll : 0.000042s : 0.37% optimize.opt_a.a_1 : 0.000887s : 7.95% optimize.opt_a.with_stream_mark : 0.000031s : 0.28% optimize.opt_a.recompute_prepare : 0.000019s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000233s : 2.09% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.14% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.13% optimize.opt_a.merge_send_recv : 0.000017s : 0.15% optimize.opt_a.auto_parallel : 0.000013s : 0.11% optimize.opt_a.parallel : 0.000024s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.16% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.12% optimize.opt_a.virtual_output : 0.000013s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000017s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000023s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000028s : 0.25% optimize.opt_a.a_after_grad : 0.000022s : 0.19% optimize.opt_a.renormalize : 0.000588s : 5.27% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.09% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.22% optimize.opt_a.cse : 0.000050s : 0.45% optimize.opt_a.a_3 : 0.000120s : 1.08% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000041s : 0.37% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000505s : 4.52% optimize.opt_b.b_1 : 0.000191s : 1.71% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000021s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000030s : 0.27% optimize.loop_unroll : 0.000457s : 4.10% optimize.opt_after_cconv.c_1 : 0.000036s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000020s : 0.18% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000016s : 0.15% optimize.tuple_transform.d_1 : 0.000052s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000052s : 0.46% optimize.cse_after_recomputation.cse : 0.000012s : 0.11% optimize.environ_conv : 0.000008s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000003s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000006s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.04% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.04% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000016s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000006s : 0.06% optimize.overlap_grad_flash_sp : 0.000022s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000021s : 0.19% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000509s : 4.57% validate : 0.000037s : 0.33% Time group info: ------[substitution.] 0.000204 34 0.88% : 0.000002s : 2: substitution.elim_not_effective 0.73% : 0.000001s : 2: substitution.fold_const_symbol 3.08% : 0.000006s : 5: substitution.graph_param_transform 74.55% : 0.000152s : 4: substitution.inline 2.35% : 0.000005s : 4: substitution.j_node_and_user_rematch 3.54% : 0.000007s : 4: substitution.remove_not_recompute_node 2.76% : 0.000006s : 6: substitution.replace_old_param 9.06% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 3.05% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005574 2 87.11% : 0.004856s : 1: type_inference.infer 12.89% : 0.000718s : 1: type_inference.specialize ------[replace.] 0.000071 10 54.70% : 0.000039s : 4: replace.inline 45.30% : 0.000032s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000165 10 90.39% : 0.000150s : 4: match.inline 9.61% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000232 1590 0.92% : 0.000002s : 16: predicate.accumulaten_eliminater 0.71% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 10: predicate.addn_check_dump 0.95% : 0.000002s : 16: predicate.addn_zero_filter 0.85% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 1.85% : 0.000004s : 26: predicate.arithmetic_simplify 0.89% : 0.000002s : 16: predicate.cast_eliminate 0.59% : 0.000001s : 10: predicate.check_bprop_eliminate 0.52% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.63% : 0.000001s : 10: predicate.depend_value_elim 0.96% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 16: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.28% : 0.000001s : 5: predicate.elim_not_effective 0.35% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.36% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 21: predicate.environ_get_depend_swap 1.88% : 0.000004s : 31: predicate.environ_get_eliminate 1.14% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.53% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.40% : 0.000006s : 26: predicate.float_depend_g_call 0.55% : 0.000001s : 10: predicate.float_environ_get_switch 0.85% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.58% : 0.000001s : 10: predicate.get_grad_eliminate 0.23% : 0.000001s : 5: predicate.graph_param_transform 0.63% : 0.000001s : 10: predicate.incorporate_call 0.49% : 0.000001s : 10: predicate.incorporate_call_switch 6.04% : 0.000014s : 72: predicate.inline 0.68% : 0.000002s : 10: predicate.inline_without_move 0.35% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.74% : 0.000002s : 10: predicate.less_batch_normalization 1.93% : 0.000004s : 32: predicate.list_to_tuple_eliminator_ 2.70% : 0.000006s : 48: predicate.load_eliminater 0.90% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.40% : 0.000006s : 40: predicate.loop_unroll_before_grad 1.54% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 10: predicate.merge_addn 0.57% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 16: predicate.minmaximum_grad 0.95% : 0.000002s : 5: predicate.mutable_eliminate 0.32% : 0.000001s : 5: predicate.opt_reshape 0.36% : 0.000001s : 5: predicate.parallel_virtual_node 1.93% : 0.000004s : 26: predicate.partial_defer_inline 1.79% : 0.000004s : 27: predicate.partial_eliminate 0.92% : 0.000002s : 16: predicate.print_const_string_wrapper 0.57% : 0.000001s : 10: predicate.reduce_all_const_elim 1.14% : 0.000003s : 16: predicate.reduce_eliminate 2.60% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.55% : 0.000001s : 10: predicate.remove_not_recompute_node 1.62% : 0.000004s : 32: predicate.replace_applicator 0.71% : 0.000002s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 0.94% : 0.000002s : 16: predicate.reshape_eliminate 0.65% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 5: predicate.row_tensor_eliminate 0.76% : 0.000002s : 10: predicate.same_eliminate 0.48% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.71% : 0.000002s : 10: predicate.shard_identity_eliminate 0.64% : 0.000001s : 10: predicate.special_op_eliminate 0.71% : 0.000002s : 10: predicate.specialize_transform 0.73% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.64% : 0.000004s : 26: predicate.switch_defer_inline 2.16% : 0.000005s : 36: predicate.switch_layer_defer_inline 5.17% : 0.000012s : 81: predicate.switch_simplify 0.92% : 0.000002s : 16: predicate.tile_eliminate 0.90% : 0.000002s : 16: predicate.transpose_eliminate 1.52% : 0.000004s : 26: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000003s : 26: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000007s : 42: predicate.tuple_list_get_item_eliminator 1.44% : 0.000003s : 26: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.91% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.60% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.31% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.50% : 0.000001s : 5: predicate.value_based_eliminate 0.71% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.57% : 0.000001s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000529 11 54.07% : 0.000286s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.93% : 0.000243s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026756 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.40% : 0.003050s : 1: add_attr 11.35% : 0.003037s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000056s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000070s : 1: auto_monad 0.11% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.79% : 0.000478s : 1: bootstrap 0.12% : 0.000033s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.12% : 0.000033s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.08% : 0.000020s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.11% : 0.000030s : 1: event_method 0.03% : 0.000008s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000010s : 1: label_micro_interleaved_index 1.73% : 0.000463s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.91% : 0.000512s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 5.17% : 0.001384s : 78: opt.transform.opt_a 0.13% : 0.000035s : 1: opt.transform.opt_after_cconv 0.11% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.48% : 0.000128s : 28: opt.transform.opt_b 0.21% : 0.000057s : 2: opt.transform.opt_trans_graph 0.15% : 0.000039s : 4: opt.transform.symbol_engine_opt 12.08% : 0.003231s : 1: opt_a 0.52% : 0.000139s : 1: opt_after_cconv 1.94% : 0.000520s : 1: opt_after_jit_grad 1.14% : 0.000304s : 1: opt_b 22.07% : 0.005906s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.17% : 0.000047s : 1: pre_auto_parallel 0.15% : 0.000041s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.09% : 0.000292s : 1: renormalize.infer 1.07% : 0.000287s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000046s : 1: rewriter_after_opt_a 0.37% : 0.000099s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000104s : 1: symbol_engine_optimizer 0.38% : 0.000103s : 1: tuple_transform 21.14% : 0.005655s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:15.195.414 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0150494, [21] [bootstrap]: 0.00045538 [type_inference]: 0.00597698 [event_method]: 2.088e-05 [auto_monad]: 6.287e-05 [graph_reusing]: 5.91e-06 [inline]: 2.44999e-06 [add_attr]: 0.00308086, [1] [add_attr_with_inline]: 0.00307274, [1] [Cycle 1]: 6.138e-05, [2] [tag_attr]: 2.053e-05 [meta_addattr_fg_expand]: 6.19001e-06 [parallel-infer-symbol]: 3.09999e-06 [pre_auto_parallel]: 3.366e-05 [insert-virtual-dataset]: 2.40002e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 2.26e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.00474529, [53] [py_interpret_to_execute]: 2.765e-05 [rewriter_before_opt_a]: 8.883e-05 [opt_a]: 0.00276376, [2] [Cycle 1]: 0.0020604, [45] [expand_dump_flag]: 3.03998e-06 [switch_simplify]: 4.751e-05 [loop_unroll]: 3.501e-05 [a_1]: 0.00071789 [with_stream_mark]: 1.606e-05 [recompute_prepare]: 9.35001e-06 [updatestate_depend_eliminate]: 3.69002e-06 [updatestate_assign_eliminate]: 3.33e-06 [updatestate_loads_eliminate]: 2.81999e-06 [parameter_eliminate]: 1.74e-06 [a_2]: 9.006e-05 [accelerated_algorithm]: 7.83999e-06 [shard]: 1.81003e-06 [meta_shard_fg_expand]: 1.72001e-06 [shard_inline]: 7.1e-06 [merge_send_recv]: 7.68001e-06 [auto_parallel]: 6.21e-06 [parallel]: 1.807e-05 [flash_sp]: 8.18001e-06 [merge_comm]: 4.08999e-06 [allreduce_fusion]: 3.7e-06 [matmul_add_comm_reduction]: 9.28002e-06 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 8.59998e-06 [virtual_dataset]: 7.53e-06 [get_grad_eliminate_]: 6.99001e-06 [virtual_output]: 7.1e-06 [merge_forward]: 3.53e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 9.32999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.342e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.109e-05 [set_forward_comm_id_for_comm_node_pass]: 3.5e-06 [meta_fg_expand]: 2.74999e-06 [flash_sp_send_recv_attached]: 2.24999e-06 [receive_attached]: 2.04999e-06 [after_resolve]: 1.31e-05 [a_after_grad]: 1.591e-05 [renormalize]: 0.00059682 [add_forward_monad_depend]: 5.12e-06 [auto_monad_grad]: 2.22999e-06 [auto_monad_eliminator]: 1.432e-05 [cse]: 3.258e-05 [a_3]: 5.211e-05 [Cycle 2]: 0.00069353, [45] [expand_dump_flag]: 1.12999e-06 [switch_simplify]: 9.04e-06 [loop_unroll]: 6.91001e-06 [a_1]: 0.00015736 [with_stream_mark]: 1.165e-05 [recompute_prepare]: 7.08e-06 [updatestate_depend_eliminate]: 3.01999e-06 [updatestate_assign_eliminate]: 2.74001e-06 [updatestate_loads_eliminate]: 2.14999e-06 [parameter_eliminate]: 9.50007e-07 [a_2]: 8.056e-05 [accelerated_algorithm]: 7.05e-06 [shard]: 1.46002e-06 [meta_shard_fg_expand]: 1.27e-06 [shard_inline]: 6.77002e-06 [merge_send_recv]: 4.67e-06 [auto_parallel]: 5.08002e-06 [parallel]: 5.39e-06 [flash_sp]: 3.40998e-06 [merge_comm]: 3.21001e-06 [allreduce_fusion]: 2.99999e-06 [matmul_add_comm_reduction]: 5.79e-06 [allreduce_slice_to_reducescatter]: 3.19997e-07 [virtual_shard_identity]: 7.6e-06 [virtual_dataset]: 6.86999e-06 [get_grad_eliminate_]: 9.83998e-06 [virtual_output]: 6.52001e-06 [merge_forward]: 2.79001e-06 [cell_reuse_recompute_pass]: 1.52999e-06 [offload_activation]: 6.36998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.276e-05 [merge_recompute_call_nodes]: 7.30011e-07 [before_grad]: 9.36998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.21999e-06 [meta_fg_expand]: 2.04999e-06 [flash_sp_send_recv_attached]: 9.50007e-07 [receive_attached]: 1.09003e-06 [after_resolve]: 1.306e-05 [a_after_grad]: 1.058e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.30999e-06 [auto_monad_grad]: 8.59989e-07 [auto_monad_eliminator]: 6.68998e-06 [cse]: 1.437e-05 [a_3]: 4.008e-05 [py_interpret_to_execute_after_opt_a]: 9.89001e-06 [slice_cell_reuse_recomputed_activation]: 2.22999e-06 [rewriter_after_opt_a]: 3.382e-05 [convert_after_rewriter]: 7.35e-06 [order_py_execute_after_rewriter]: 4.93001e-06 [mutable_eliminate]: 0.00047831 [opt_b]: 0.00022456, [1] [Cycle 1]: 0.00021859, [7] [b_1]: 0.00014588 [b_2]: 8.54002e-06 [updatestate_depend_eliminate]: 5.19998e-06 [updatestate_assign_eliminate]: 2.39999e-06 [updatestate_loads_eliminate]: 2.68998e-06 [renormalize]: 3.20026e-07 [cse]: 1.865e-05 [optimize_parallel_all_gather_comm]: 1.579e-05 [overlap_param_gather]: 1.99e-06 [cconv]: 2.484e-05 [loop_unroll]: 0.00041476 [opt_after_cconv]: 0.00010352, [1] [Cycle 1]: 9.811e-05, [7] [c_1]: 3.489e-05 [parameter_eliminate]: 2.39999e-06 [updatestate_depend_eliminate]: 4.53001e-06 [updatestate_assign_eliminate]: 2.58e-06 [updatestate_loads_eliminate]: 2.34999e-06 [cse]: 1.722e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.366e-05 [tuple_transform]: 8.007e-05, [1] [Cycle 1]: 7.561e-05, [4] [d_1]: 4.835e-05 [none_parameter_eliminate]: 1.86e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 7.35e-06 [partial_unused_args_eliminate]: 1.82001e-06 [add_recomputation]: 4.495e-05 [cse_after_recomputation]: 2.123e-05, [1] [Cycle 1]: 1.694e-05, [1] [cse]: 1.14e-05 [environ_conv]: 4.64002e-06 [swap_dp_allreduce_reducescatter]: 5.05001e-06 [bias_add_comm_swap]: 2.26998e-06 [label_micro_interleaved_index]: 4e-06 [label_fine_grained_interleaved_index]: 2.81e-06 [merge_cast_opt]: 1.38002e-06 [slice_recompute_activation]: 2.26998e-06 [micro_interleaved_order_control]: 2.21e-06 [assign_add_opt]: 1.12e-06 [ForceFp32Comm]: 9.39996e-07 [remove_cast_before_assign_add]: 9.60019e-07 [full_micro_interleaved_order_control]: 1.99999e-06 [reorder_send_recv_between_fp_bp]: 2.53e-06 [comm_op_add_attrs]: 9.70002e-07 [add_comm_op_reuse_tag]: 9.29984e-07 [interleave_split_concat_branches]: 1.10999e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.15999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.81003e-06 [control_data_broadcast_order]: 1.226e-05 [grouped_pairwise_exchange_alltoall]: 1.60001e-06 [offloading_packed_experts]: 3.77998e-06 [overlap_recompute_and_grad_model_parallel]: 4.89e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.43002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.57001e-06 [overlap_recompute_comm]: 2.23002e-06 [overlap_grad_ring_attention]: 4.22e-06 [overlap_grad_flash_sp]: 1.745e-05 [begin_end_overlap_inline]: 5.60016e-07 [split_matmul_comm_elemetwise]: 2.17001e-06 [split_layernorm_comm]: 1.72999e-06 [handle_group_info]: 1.22e-06 [symbol_engine_optimizer]: 7.577e-05, [1] [Cycle 1]: 7.172e-05, [6] [build]: 2.81999e-06 [elim_shapecalc]: 9.62999e-06 [elim_not_effective]: 1.408e-05 [opt_reshape]: 7.4e-06 [fold_const_symbol]: 1.058e-05 [renormalize]: 2.50002e-07 [detach_backward]: 1.82999e-06 [pipeline_parallel_scheduler]: 1.85001e-06 [auto_monad_reorder]: 1.735e-05 [get_jit_bprop_graph]: 1.49998e-06 [rewriter_after_jit_bprop_graph]: 3.49001e-06 [opt_after_jit_grad]: 0.00044824 [validate]: 3.538e-05 Sums bootstrap : 0.000455s : 4.12% type_inference : 0.005977s : 54.14% event_method : 0.000021s : 0.19% auto_monad : 0.000063s : 0.57% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.25% optimize.rewriter_before_opt_a : 0.000089s : 0.80% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000057s : 0.51% optimize.opt_a.loop_unroll : 0.000042s : 0.38% optimize.opt_a.a_1 : 0.000875s : 7.93% optimize.opt_a.with_stream_mark : 0.000028s : 0.25% optimize.opt_a.recompute_prepare : 0.000016s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000171s : 1.55% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.13% optimize.opt_a.merge_send_recv : 0.000012s : 0.11% optimize.opt_a.auto_parallel : 0.000011s : 0.10% optimize.opt_a.parallel : 0.000023s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.15% optimize.opt_a.virtual_dataset : 0.000014s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.15% optimize.opt_a.virtual_output : 0.000014s : 0.12% optimize.opt_a.merge_forward : 0.000006s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.24% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.24% optimize.opt_a.a_after_grad : 0.000026s : 0.24% optimize.opt_a.renormalize : 0.000597s : 5.41% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.19% optimize.opt_a.cse : 0.000047s : 0.43% optimize.opt_a.a_3 : 0.000092s : 0.84% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000034s : 0.31% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.04% optimize.mutable_eliminate : 0.000478s : 4.33% optimize.opt_b.b_1 : 0.000146s : 1.32% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000019s : 0.17% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.14% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000025s : 0.22% optimize.loop_unroll : 0.000415s : 3.76% optimize.opt_after_cconv.c_1 : 0.000035s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000017s : 0.16% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.12% optimize.tuple_transform.d_1 : 0.000048s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000045s : 0.41% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000005s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000017s : 0.16% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000017s : 0.16% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.03% opt_after_jit_grad : 0.000448s : 4.06% validate : 0.000035s : 0.32% Time group info: ------[substitution.] 0.000194 34 1.27% : 0.000002s : 2: substitution.elim_not_effective 0.67% : 0.000001s : 2: substitution.fold_const_symbol 3.19% : 0.000006s : 5: substitution.graph_param_transform 76.35% : 0.000148s : 4: substitution.inline 1.72% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.22% : 0.000004s : 4: substitution.remove_not_recompute_node 2.72% : 0.000005s : 6: substitution.replace_old_param 9.04% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.82% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005913 2 87.37% : 0.005166s : 1: type_inference.infer 12.63% : 0.000747s : 1: type_inference.specialize ------[replace.] 0.000068 10 55.38% : 0.000038s : 4: replace.inline 44.62% : 0.000030s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000161 10 90.68% : 0.000146s : 4: match.inline 9.32% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000229 1590 0.91% : 0.000002s : 16: predicate.accumulaten_eliminater 0.69% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 10: predicate.addn_check_dump 0.97% : 0.000002s : 16: predicate.addn_zero_filter 0.86% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.03% : 0.000005s : 26: predicate.arithmetic_simplify 0.95% : 0.000002s : 16: predicate.cast_eliminate 0.73% : 0.000002s : 10: predicate.check_bprop_eliminate 0.50% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.59% : 0.000001s : 10: predicate.depend_value_elim 0.95% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.06% : 0.000002s : 16: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.31% : 0.000001s : 5: predicate.elim_not_effective 0.34% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 21: predicate.environ_get_depend_swap 1.68% : 0.000004s : 31: predicate.environ_get_eliminate 1.15% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.52% : 0.000003s : 26: predicate.exchange_switch_depend_value 2.47% : 0.000006s : 26: predicate.float_depend_g_call 0.52% : 0.000001s : 10: predicate.float_environ_get_switch 0.93% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.71% : 0.000002s : 10: predicate.get_grad_eliminate 0.24% : 0.000001s : 5: predicate.graph_param_transform 0.58% : 0.000001s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.21% : 0.000014s : 72: predicate.inline 0.73% : 0.000002s : 10: predicate.inline_without_move 0.36% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 10: predicate.less_batch_normalization 1.90% : 0.000004s : 32: predicate.list_to_tuple_eliminator_ 2.66% : 0.000006s : 48: predicate.load_eliminater 0.75% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.38% : 0.000005s : 40: predicate.loop_unroll_before_grad 1.57% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 10: predicate.merge_addn 0.55% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 16: predicate.minmaximum_grad 0.79% : 0.000002s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.35% : 0.000001s : 5: predicate.parallel_virtual_node 1.90% : 0.000004s : 26: predicate.partial_defer_inline 1.84% : 0.000004s : 27: predicate.partial_eliminate 0.90% : 0.000002s : 16: predicate.print_const_string_wrapper 0.56% : 0.000001s : 10: predicate.reduce_all_const_elim 1.15% : 0.000003s : 16: predicate.reduce_eliminate 2.60% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 10: predicate.remove_not_recompute_node 1.49% : 0.000003s : 32: predicate.replace_applicator 0.46% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 1.02% : 0.000002s : 16: predicate.reshape_eliminate 0.60% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 5: predicate.row_tensor_eliminate 0.70% : 0.000002s : 10: predicate.same_eliminate 0.47% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 10: predicate.shard_identity_eliminate 0.62% : 0.000001s : 10: predicate.special_op_eliminate 0.66% : 0.000002s : 10: predicate.specialize_transform 0.73% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.64% : 0.000004s : 26: predicate.switch_defer_inline 2.14% : 0.000005s : 36: predicate.switch_layer_defer_inline 5.32% : 0.000012s : 81: predicate.switch_simplify 0.90% : 0.000002s : 16: predicate.tile_eliminate 0.97% : 0.000002s : 16: predicate.transpose_eliminate 1.51% : 0.000003s : 26: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 26: predicate.tuple_list_get_item_depend_reorder 3.29% : 0.000008s : 42: predicate.tuple_list_get_item_eliminator 1.48% : 0.000003s : 26: predicate.tuple_list_get_set_item_eliminator 2.24% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.81% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.59% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.27% : 0.000007s : 58: predicate.updatestate_useless_node_eliminater 0.58% : 0.000001s : 5: predicate.value_based_eliminate 0.69% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 10: predicate.virtual_output_eliminate 0.31% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000560 11 54.77% : 0.000306s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.23% : 0.000253s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025010 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.34% : 0.003086s : 1: add_attr 12.30% : 0.003076s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.20% : 0.000049s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.27% : 0.000068s : 1: auto_monad 0.08% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.94% : 0.000485s : 1: bootstrap 0.11% : 0.000028s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000015s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.10% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.11% : 0.000027s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.69% : 0.000422s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.94% : 0.000486s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.05% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 5.42% : 0.001354s : 78: opt.transform.opt_a 0.13% : 0.000034s : 1: opt.transform.opt_after_cconv 0.11% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.49% : 0.000123s : 28: opt.transform.opt_b 0.21% : 0.000054s : 2: opt.transform.opt_trans_graph 0.15% : 0.000038s : 4: opt.transform.symbol_engine_opt 11.06% : 0.002767s : 1: opt_a 0.43% : 0.000107s : 1: opt_after_cconv 1.83% : 0.000456s : 1: opt_after_jit_grad 0.91% : 0.000228s : 1: opt_b 18.99% : 0.004749s : 1: optimize 0.08% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.08% : 0.000021s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000038s : 1: pre_auto_parallel 0.13% : 0.000032s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000017s : 1: remove_dup_value 1.24% : 0.000311s : 1: renormalize.infer 1.11% : 0.000277s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000038s : 1: rewriter_after_opt_a 0.37% : 0.000093s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.31% : 0.000079s : 1: symbol_engine_optimizer 0.33% : 0.000083s : 1: tuple_transform 23.96% : 0.005993s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:15.381.875 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:15.382.130 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.016813, [21] [bootstrap]: 0.00043496 [type_inference]: 0.00583326 [event_method]: 2.204e-05 [auto_monad]: 6.482e-05 [graph_reusing]: 6.07999e-06 [inline]: 2.37999e-06 [add_attr]: 0.00308181, [1] [add_attr_with_inline]: 0.00307298, [1] [Cycle 1]: 7.499e-05, [2] [tag_attr]: 2.084e-05 [meta_addattr_fg_expand]: 6.71e-06 [parallel-infer-symbol]: 2.79999e-06 [pre_auto_parallel]: 3.555e-05 [insert-virtual-dataset]: 2.27999e-06 [parallel-infer-symbol-second]: 9.09989e-07 [dataset_repeat_opt]: 2.10002e-06 [pipeline_split]: 2.06e-06 [optimize]: 0.00617283, [53] [py_interpret_to_execute]: 4.586e-05 [rewriter_before_opt_a]: 9.781e-05 [opt_a]: 0.00369314, [2] [Cycle 1]: 0.00267725, [45] [expand_dump_flag]: 2.92002e-06 [switch_simplify]: 4.757e-05 [loop_unroll]: 3.601e-05 [a_1]: 0.00091118 [with_stream_mark]: 1.804e-05 [recompute_prepare]: 1.158e-05 [updatestate_depend_eliminate]: 4.82e-06 [updatestate_assign_eliminate]: 4.60001e-06 [updatestate_loads_eliminate]: 3.65998e-06 [parameter_eliminate]: 2.20002e-06 [a_2]: 0.00014152 [accelerated_algorithm]: 9.51e-06 [shard]: 2.63e-06 [meta_shard_fg_expand]: 2.22001e-06 [shard_inline]: 8.70999e-06 [merge_send_recv]: 9.92001e-06 [auto_parallel]: 7.46999e-06 [parallel]: 1.986e-05 [flash_sp]: 8.67e-06 [merge_comm]: 5.14e-06 [allreduce_fusion]: 4.29997e-06 [matmul_add_comm_reduction]: 1.094e-05 [allreduce_slice_to_reducescatter]: 8.90024e-07 [virtual_shard_identity]: 1.049e-05 [virtual_dataset]: 8.89e-06 [get_grad_eliminate_]: 8.62998e-06 [virtual_output]: 8.67e-06 [merge_forward]: 5.40999e-06 [cell_reuse_recompute_pass]: 1.38002e-06 [offload_activation]: 1.18e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.075e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.454e-05 [set_forward_comm_id_for_comm_node_pass]: 5.00001e-06 [meta_fg_expand]: 3.75e-06 [flash_sp_send_recv_attached]: 2.58e-06 [receive_attached]: 1.99999e-06 [after_resolve]: 1.548e-05 [a_after_grad]: 1.362e-05 [renormalize]: 0.00073235 [add_forward_monad_depend]: 6.28e-06 [auto_monad_grad]: 2.41e-06 [auto_monad_eliminator]: 1.776e-05 [cse]: 3.544e-05 [a_3]: 7.76e-05 [Cycle 2]: 0.00100167, [45] [expand_dump_flag]: 1.13001e-06 [switch_simplify]: 9.88002e-06 [loop_unroll]: 8.18999e-06 [a_1]: 0.00020279 [with_stream_mark]: 1.228e-05 [recompute_prepare]: 8.48999e-06 [updatestate_depend_eliminate]: 4.07e-06 [updatestate_assign_eliminate]: 3.49001e-06 [updatestate_loads_eliminate]: 3.11001e-06 [parameter_eliminate]: 1.35999e-06 [a_2]: 0.00013034 [accelerated_algorithm]: 8.20999e-06 [shard]: 1.35001e-06 [meta_shard_fg_expand]: 2.16e-06 [shard_inline]: 8.28001e-06 [merge_send_recv]: 6.86999e-06 [auto_parallel]: 6.19999e-06 [parallel]: 5.59e-06 [flash_sp]: 3.43999e-06 [merge_comm]: 4.55001e-06 [allreduce_fusion]: 4.47e-06 [matmul_add_comm_reduction]: 7.83001e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 9.57999e-06 [virtual_dataset]: 8.22e-06 [get_grad_eliminate_]: 8e-06 [virtual_output]: 9.72999e-06 [merge_forward]: 3.59002e-06 [cell_reuse_recompute_pass]: 1.44998e-06 [offload_activation]: 8.84e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.742e-05 [merge_recompute_call_nodes]: 1.27e-06 [before_grad]: 1.355e-05 [set_forward_comm_id_for_comm_node_pass]: 4.89e-06 [meta_fg_expand]: 2.90002e-06 [flash_sp_send_recv_attached]: 8.89995e-07 [receive_attached]: 9.80013e-07 [after_resolve]: 1.383e-05 [a_after_grad]: 1.281e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.66e-06 [auto_monad_grad]: 1.01002e-06 [auto_monad_eliminator]: 9.31e-06 [cse]: 2.057e-05 [a_3]: 6.426e-05 [py_interpret_to_execute_after_opt_a]: 1.482e-05 [slice_cell_reuse_recomputed_activation]: 4.60999e-06 [rewriter_after_opt_a]: 4.471e-05 [convert_after_rewriter]: 1.121e-05 [order_py_execute_after_rewriter]: 8.89e-06 [mutable_eliminate]: 0.00052361 [opt_b]: 0.00037103, [1] [Cycle 1]: 0.00036108, [7] [b_1]: 0.00025297 [b_2]: 1.021e-05 [updatestate_depend_eliminate]: 7.66001e-06 [updatestate_assign_eliminate]: 3.35003e-06 [updatestate_loads_eliminate]: 3.56999e-06 [renormalize]: 6.50005e-07 [cse]: 2.475e-05 [optimize_parallel_all_gather_comm]: 2.191e-05 [overlap_param_gather]: 4.70999e-06 [cconv]: 2.925e-05 [loop_unroll]: 0.00043807 [opt_after_cconv]: 0.00014578, [1] [Cycle 1]: 0.00013706, [7] [c_1]: 4.178e-05 [parameter_eliminate]: 3.58999e-06 [updatestate_depend_eliminate]: 6.56999e-06 [updatestate_assign_eliminate]: 3.37997e-06 [updatestate_loads_eliminate]: 3.16999e-06 [cse]: 2.27e-05 [renormalize]: 6.29982e-07 [remove_dup_value]: 1.921e-05 [tuple_transform]: 0.00010612, [1] [Cycle 1]: 9.923e-05, [4] [d_1]: 5.804e-05 [none_parameter_eliminate]: 1.62001e-06 [renormalize]: 2.60014e-07 [switch_simplify]: 9.15999e-06 [partial_unused_args_eliminate]: 4.37e-06 [add_recomputation]: 5.663e-05 [cse_after_recomputation]: 3.174e-05, [1] [Cycle 1]: 2.441e-05, [1] [cse]: 1.535e-05 [environ_conv]: 9.46e-06 [swap_dp_allreduce_reducescatter]: 9.07001e-06 [bias_add_comm_swap]: 5.64e-06 [label_micro_interleaved_index]: 6.58e-06 [label_fine_grained_interleaved_index]: 5.20001e-06 [merge_cast_opt]: 3.83001e-06 [slice_recompute_activation]: 4.42e-06 [micro_interleaved_order_control]: 4.75001e-06 [assign_add_opt]: 3.73999e-06 [ForceFp32Comm]: 3.40003e-06 [remove_cast_before_assign_add]: 3.47002e-06 [full_micro_interleaved_order_control]: 4.4e-06 [reorder_send_recv_between_fp_bp]: 5.04e-06 [comm_op_add_attrs]: 3.54002e-06 [add_comm_op_reuse_tag]: 3.75998e-06 [interleave_split_concat_branches]: 3.51001e-06 [interleave_parallel_branches]: 3.30998e-06 [overlap_opt_shard_in_pipeline]: 3.90998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.12998e-06 [control_data_broadcast_order]: 1.728e-05 [grouped_pairwise_exchange_alltoall]: 4.25999e-06 [offloading_packed_experts]: 7.1e-06 [overlap_recompute_and_grad_model_parallel]: 7.51999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.56001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.8e-06 [overlap_recompute_comm]: 5.02e-06 [overlap_grad_ring_attention]: 6.86001e-06 [overlap_grad_flash_sp]: 2.427e-05 [begin_end_overlap_inline]: 2.94001e-06 [split_matmul_comm_elemetwise]: 4.45e-06 [split_layernorm_comm]: 3.97e-06 [handle_group_info]: 3.18998e-06 [symbol_engine_optimizer]: 0.00010566, [1] [Cycle 1]: 9.879e-05, [6] [build]: 2.81999e-06 [elim_shapecalc]: 1.196e-05 [elim_not_effective]: 1.618e-05 [opt_reshape]: 9.20999e-06 [fold_const_symbol]: 1.308e-05 [renormalize]: 1.69995e-07 [detach_backward]: 3.25e-06 [pipeline_parallel_scheduler]: 1.68002e-06 [auto_monad_reorder]: 2.268e-05 [get_jit_bprop_graph]: 1.55999e-06 [rewriter_after_jit_bprop_graph]: 4.22e-06 [opt_after_jit_grad]: 0.000482 [validate]: 4.167e-05 Sums bootstrap : 0.000435s : 3.63% type_inference : 0.005833s : 48.74% event_method : 0.000022s : 0.18% auto_monad : 0.000065s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000036s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000046s : 0.38% optimize.rewriter_before_opt_a : 0.000098s : 0.82% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000057s : 0.48% optimize.opt_a.loop_unroll : 0.000044s : 0.37% optimize.opt_a.a_1 : 0.001114s : 9.31% optimize.opt_a.with_stream_mark : 0.000030s : 0.25% optimize.opt_a.recompute_prepare : 0.000020s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000272s : 2.27% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.15% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000017s : 0.14% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.11% optimize.opt_a.parallel : 0.000025s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.17% optimize.opt_a.virtual_dataset : 0.000017s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.14% optimize.opt_a.virtual_output : 0.000018s : 0.15% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000021s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000028s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.02% optimize.opt_a.after_resolve : 0.000029s : 0.24% optimize.opt_a.a_after_grad : 0.000026s : 0.22% optimize.opt_a.renormalize : 0.000732s : 6.12% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.23% optimize.opt_a.cse : 0.000056s : 0.47% optimize.opt_a.a_3 : 0.000142s : 1.19% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000045s : 0.37% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000524s : 4.38% optimize.opt_b.b_1 : 0.000253s : 2.11% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000025s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000029s : 0.24% optimize.loop_unroll : 0.000438s : 3.66% optimize.opt_after_cconv.c_1 : 0.000042s : 0.35% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.19% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000019s : 0.16% optimize.tuple_transform.d_1 : 0.000058s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000057s : 0.47% optimize.cse_after_recomputation.cse : 0.000015s : 0.13% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.05% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000004s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000017s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000024s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000023s : 0.19% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000482s : 4.03% validate : 0.000042s : 0.35% Time group info: ------[substitution.] 0.000227 44 9.40% : 0.000021s : 3: substitution.cast_eliminate 1.07% : 0.000002s : 3: substitution.elim_not_effective 0.71% : 0.000002s : 3: substitution.fold_const_symbol 3.17% : 0.000007s : 6: substitution.graph_param_transform 66.99% : 0.000152s : 4: substitution.inline 2.15% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.41% : 0.000008s : 6: substitution.remove_not_recompute_node 2.45% : 0.000006s : 6: substitution.replace_old_param 8.24% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.41% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005780 2 87.25% : 0.005043s : 1: type_inference.infer 12.75% : 0.000737s : 1: type_inference.specialize ------[replace.] 0.000074 10 50.35% : 0.000038s : 4: replace.inline 49.65% : 0.000037s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000165 10 90.33% : 0.000149s : 4: match.inline 9.67% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000347 1954 0.83% : 0.000003s : 21: predicate.accumulaten_eliminater 0.53% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.51% : 0.000002s : 12: predicate.addn_check_dump 0.86% : 0.000003s : 21: predicate.addn_zero_filter 0.78% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 1.79% : 0.000006s : 33: predicate.arithmetic_simplify 0.89% : 0.000003s : 21: predicate.cast_eliminate 0.52% : 0.000002s : 12: predicate.check_bprop_eliminate 0.48% : 0.000002s : 12: predicate.compare_switch_simplify 0.17% : 0.000001s : 6: predicate.const_output_eliminate 0.51% : 0.000002s : 12: predicate.depend_value_elim 0.87% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 0.91% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.80% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.70% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.18% : 0.000001s : 6: predicate.elim_not_effective 0.31% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.04% : 0.000004s : 27: predicate.environ_get_add_eliminate 1.03% : 0.000004s : 27: predicate.environ_get_depend_swap 8.82% : 0.000031s : 39: predicate.environ_get_eliminate 1.02% : 0.000004s : 27: predicate.environ_get_set_eliminate 1.25% : 0.000004s : 31: predicate.exchange_switch_depend_value 2.05% : 0.000007s : 31: predicate.float_depend_g_call 0.49% : 0.000002s : 12: predicate.float_environ_get_switch 0.73% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 6: predicate.fold_const_symbol 0.56% : 0.000002s : 12: predicate.get_grad_eliminate 0.18% : 0.000001s : 6: predicate.graph_param_transform 0.50% : 0.000002s : 12: predicate.incorporate_call 0.43% : 0.000001s : 12: predicate.incorporate_call_switch 5.26% : 0.000018s : 88: predicate.inline 0.66% : 0.000002s : 12: predicate.inline_without_move 0.29% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.62% : 0.000002s : 12: predicate.less_batch_normalization 1.64% : 0.000006s : 39: predicate.list_to_tuple_eliminator_ 2.37% : 0.000008s : 60: predicate.load_eliminater 0.58% : 0.000002s : 6: predicate.loop_unroll_after_grad 1.73% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.49% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.49% : 0.000002s : 12: predicate.merge_addn 0.51% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.56% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.77% : 0.000003s : 21: predicate.minmaximum_grad 0.82% : 0.000003s : 6: predicate.mutable_eliminate 0.31% : 0.000001s : 6: predicate.opt_reshape 0.44% : 0.000002s : 6: predicate.parallel_virtual_node 1.62% : 0.000006s : 31: predicate.partial_defer_inline 1.57% : 0.000005s : 33: predicate.partial_eliminate 6.96% : 0.000024s : 21: predicate.print_const_string_wrapper 0.53% : 0.000002s : 12: predicate.reduce_all_const_elim 1.00% : 0.000003s : 21: predicate.reduce_eliminate 2.33% : 0.000008s : 60: predicate.redundant_stop_gradient_eliminater 0.31% : 0.000001s : 12: predicate.remove_not_recompute_node 1.30% : 0.000005s : 39: predicate.replace_applicator 0.40% : 0.000001s : 12: predicate.replace_old_param 0.26% : 0.000001s : 6: predicate.reset_defer_inline 0.90% : 0.000003s : 21: predicate.reshape_eliminate 0.60% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.29% : 0.000001s : 6: predicate.row_tensor_eliminate 0.65% : 0.000002s : 12: predicate.same_eliminate 0.34% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.65% : 0.000002s : 12: predicate.shard_identity_eliminate 0.57% : 0.000002s : 12: predicate.special_op_eliminate 0.59% : 0.000002s : 12: predicate.specialize_transform 0.62% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.64% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.36% : 0.000005s : 31: predicate.switch_defer_inline 1.83% : 0.000006s : 43: predicate.switch_layer_defer_inline 4.13% : 0.000014s : 91: predicate.switch_simplify 0.78% : 0.000003s : 21: predicate.tile_eliminate 0.83% : 0.000003s : 21: predicate.transpose_eliminate 1.32% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.40% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.34% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 2.96% : 0.000010s : 51: predicate.tuple_list_get_item_eliminator 1.36% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 1.88% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.54% : 0.000005s : 39: predicate.tuple_to_list_eliminator_ 2.33% : 0.000008s : 60: predicate.updatestate_pure_node_eliminater 2.86% : 0.000010s : 72: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 6: predicate.value_based_eliminate 0.53% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.53% : 0.000002s : 12: predicate.virtual_output_eliminate 0.24% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000558 11 53.49% : 0.000298s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.51% : 0.000259s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028783 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.74% : 0.003091s : 1: add_attr 10.69% : 0.003077s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000060s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000074s : 1: auto_monad 0.11% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.66% : 0.000478s : 1: bootstrap 0.11% : 0.000032s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000018s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000033s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.54% : 0.000444s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.84% : 0.000530s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000019s : 1: opt.transform.mutable_eliminate 5.93% : 0.001706s : 78: opt.transform.opt_a 0.14% : 0.000040s : 1: opt.transform.opt_after_cconv 0.11% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.66% : 0.000189s : 28: opt.transform.opt_b 0.23% : 0.000065s : 2: opt.transform.opt_trans_graph 0.16% : 0.000047s : 4: opt.transform.symbol_engine_opt 12.84% : 0.003696s : 1: opt_a 0.52% : 0.000149s : 1: opt_after_cconv 1.71% : 0.000492s : 1: opt_after_jit_grad 1.30% : 0.000375s : 1: opt_b 22.64% : 0.006515s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000043s : 1: pre_auto_parallel 0.17% : 0.000050s : 1: py_interpret_to_execute 0.06% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.38% : 0.000398s : 1: renormalize.infer 1.13% : 0.000326s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000048s : 1: rewriter_after_opt_a 0.35% : 0.000102s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000108s : 1: symbol_engine_optimizer 0.38% : 0.000109s : 1: tuple_transform 20.43% : 0.005880s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:15.578.678 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0157961, [21] [bootstrap]: 0.00043132 [type_inference]: 0.0057165 [event_method]: 2.04e-05 [auto_monad]: 6.335e-05 [graph_reusing]: 5.74e-06 [inline]: 2.18998e-06 [add_attr]: 0.00310932, [1] [add_attr_with_inline]: 0.0031007, [1] [Cycle 1]: 6.45e-05, [2] [tag_attr]: 2.17e-05 [meta_addattr_fg_expand]: 6.70998e-06 [parallel-infer-symbol]: 3.06001e-06 [pre_auto_parallel]: 3.664e-05 [insert-virtual-dataset]: 2.40002e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 2.14999e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.00569335, [53] [py_interpret_to_execute]: 2.926e-05 [rewriter_before_opt_a]: 9.346e-05 [opt_a]: 0.0034341, [2] [Cycle 1]: 0.00257215, [45] [expand_dump_flag]: 2.88e-06 [switch_simplify]: 4.891e-05 [loop_unroll]: 3.533e-05 [a_1]: 0.00089165 [with_stream_mark]: 1.813e-05 [recompute_prepare]: 5.425e-05 [updatestate_depend_eliminate]: 6.30002e-06 [updatestate_assign_eliminate]: 4.1e-06 [updatestate_loads_eliminate]: 3.58e-06 [parameter_eliminate]: 2.54999e-06 [a_2]: 0.00011236 [accelerated_algorithm]: 9.78998e-06 [shard]: 2.66e-06 [meta_shard_fg_expand]: 2.27001e-06 [shard_inline]: 8.68001e-06 [merge_send_recv]: 1.048e-05 [auto_parallel]: 7.03e-06 [parallel]: 1.934e-05 [flash_sp]: 9.37001e-06 [merge_comm]: 4.95001e-06 [allreduce_fusion]: 4.33999e-06 [matmul_add_comm_reduction]: 1.191e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 1.132e-05 [virtual_dataset]: 8.60999e-06 [get_grad_eliminate_]: 8.40999e-06 [virtual_output]: 8.30999e-06 [merge_forward]: 4.89e-06 [cell_reuse_recompute_pass]: 1.12e-06 [offload_activation]: 1.107e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.819e-05 [merge_recompute_call_nodes]: 1.56998e-06 [before_grad]: 1.419e-05 [set_forward_comm_id_for_comm_node_pass]: 4.85001e-06 [meta_fg_expand]: 4.18001e-06 [flash_sp_send_recv_attached]: 2.53998e-06 [receive_attached]: 2.24001e-06 [after_resolve]: 1.596e-05 [a_after_grad]: 1.328e-05 [renormalize]: 0.00077555 [add_forward_monad_depend]: 6.28002e-06 [auto_monad_grad]: 2.81999e-06 [auto_monad_eliminator]: 1.872e-05 [cse]: 3.77e-05 [a_3]: 6.407e-05 [Cycle 2]: 0.00085181, [45] [expand_dump_flag]: 1.37e-06 [switch_simplify]: 1.101e-05 [loop_unroll]: 8.62e-06 [a_1]: 0.00020488 [with_stream_mark]: 1.419e-05 [recompute_prepare]: 9.29998e-06 [updatestate_depend_eliminate]: 4.13999e-06 [updatestate_assign_eliminate]: 3.63999e-06 [updatestate_loads_eliminate]: 3.25e-06 [parameter_eliminate]: 1.50001e-06 [a_2]: 0.00010415 [accelerated_algorithm]: 8.72e-06 [shard]: 1.62999e-06 [meta_shard_fg_expand]: 2.11e-06 [shard_inline]: 8.18999e-06 [merge_send_recv]: 6.84999e-06 [auto_parallel]: 6.82002e-06 [parallel]: 5.49e-06 [flash_sp]: 3.41001e-06 [merge_comm]: 4.3e-06 [allreduce_fusion]: 3.93999e-06 [matmul_add_comm_reduction]: 8.3e-06 [allreduce_slice_to_reducescatter]: 5.59987e-07 [virtual_shard_identity]: 9.97999e-06 [virtual_dataset]: 8.67998e-06 [get_grad_eliminate_]: 7.68999e-06 [virtual_output]: 7.71999e-06 [merge_forward]: 4.27e-06 [cell_reuse_recompute_pass]: 1.99999e-06 [offload_activation]: 8.80999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.72e-05 [merge_recompute_call_nodes]: 1.24e-06 [before_grad]: 1.33e-05 [set_forward_comm_id_for_comm_node_pass]: 5.00999e-06 [meta_fg_expand]: 2.88998e-06 [flash_sp_send_recv_attached]: 1.12999e-06 [receive_attached]: 1.17e-06 [after_resolve]: 1.469e-05 [a_after_grad]: 1.267e-05 [renormalize]: 1.10012e-07 [add_forward_monad_depend]: 1.97999e-06 [auto_monad_grad]: 1.44e-06 [auto_monad_eliminator]: 1.014e-05 [cse]: 2.222e-05 [a_3]: 5.152e-05 [py_interpret_to_execute_after_opt_a]: 1.257e-05 [slice_cell_reuse_recomputed_activation]: 1.92999e-06 [rewriter_after_opt_a]: 4.457e-05 [convert_after_rewriter]: 7.63999e-06 [order_py_execute_after_rewriter]: 5.69999e-06 [mutable_eliminate]: 0.00053739 [opt_b]: 0.00027676, [1] [Cycle 1]: 0.00027075, [7] [b_1]: 0.00018117 [b_2]: 1.078e-05 [updatestate_depend_eliminate]: 7.55998e-06 [updatestate_assign_eliminate]: 3.35998e-06 [updatestate_loads_eliminate]: 3.73001e-06 [renormalize]: 7.50006e-07 [cse]: 2.599e-05 [optimize_parallel_all_gather_comm]: 1.844e-05 [overlap_param_gather]: 2.02001e-06 [cconv]: 2.921e-05 [loop_unroll]: 0.0004353 [opt_after_cconv]: 0.00012373, [1] [Cycle 1]: 0.00011758, [7] [c_1]: 4.242e-05 [parameter_eliminate]: 3.73999e-06 [updatestate_depend_eliminate]: 6.93e-06 [updatestate_assign_eliminate]: 3.21001e-06 [updatestate_loads_eliminate]: 3.09001e-06 [cse]: 2.364e-05 [renormalize]: 3.09985e-07 [remove_dup_value]: 1.592e-05 [tuple_transform]: 0.00012376, [1] [Cycle 1]: 0.00011934, [4] [d_1]: 8.664e-05 [none_parameter_eliminate]: 1.79e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 9.79999e-06 [partial_unused_args_eliminate]: 1.92999e-06 [add_recomputation]: 5.926e-05 [cse_after_recomputation]: 2.696e-05, [1] [Cycle 1]: 2.212e-05, [1] [cse]: 1.667e-05 [environ_conv]: 6.83e-06 [swap_dp_allreduce_reducescatter]: 6.04999e-06 [bias_add_comm_swap]: 2.74001e-06 [label_micro_interleaved_index]: 5.07999e-06 [label_fine_grained_interleaved_index]: 2.68e-06 [merge_cast_opt]: 1.25999e-06 [slice_recompute_activation]: 1.90001e-06 [micro_interleaved_order_control]: 2.35002e-06 [assign_add_opt]: 1.15999e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 9.70002e-07 [full_micro_interleaved_order_control]: 2.25002e-06 [reorder_send_recv_between_fp_bp]: 2.84001e-06 [comm_op_add_attrs]: 1.39e-06 [add_comm_op_reuse_tag]: 9.20001e-07 [interleave_split_concat_branches]: 1.36998e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.14e-06 [overlap_opt_shard_grad_in_pipeline]: 2.22001e-06 [control_data_broadcast_order]: 1.554e-05 [grouped_pairwise_exchange_alltoall]: 1.86998e-06 [offloading_packed_experts]: 4.76002e-06 [overlap_recompute_and_grad_model_parallel]: 5.52001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.47999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.49998e-06 [overlap_recompute_comm]: 2.42001e-06 [overlap_grad_ring_attention]: 4.38999e-06 [overlap_grad_flash_sp]: 2.268e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.06e-06 [split_layernorm_comm]: 1.82999e-06 [handle_group_info]: 9.00007e-07 [symbol_engine_optimizer]: 8.696e-05, [1] [Cycle 1]: 8.264e-05, [6] [build]: 3.09999e-06 [elim_shapecalc]: 1.199e-05 [elim_not_effective]: 1.679e-05 [opt_reshape]: 9.52999e-06 [fold_const_symbol]: 1.266e-05 [renormalize]: 2.20025e-07 [detach_backward]: 2.49001e-06 [pipeline_parallel_scheduler]: 1.62999e-06 [auto_monad_reorder]: 2.036e-05 [get_jit_bprop_graph]: 2.02001e-06 [rewriter_after_jit_bprop_graph]: 4.14002e-06 [opt_after_jit_grad]: 0.00048433 [validate]: 4.473e-05 Sums bootstrap : 0.000431s : 3.68% type_inference : 0.005716s : 48.84% event_method : 0.000020s : 0.17% auto_monad : 0.000063s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000037s : 0.31% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.25% optimize.rewriter_before_opt_a : 0.000093s : 0.80% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000060s : 0.51% optimize.opt_a.loop_unroll : 0.000044s : 0.38% optimize.opt_a.a_1 : 0.001097s : 9.37% optimize.opt_a.with_stream_mark : 0.000032s : 0.28% optimize.opt_a.recompute_prepare : 0.000064s : 0.54% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.09% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000217s : 1.85% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.16% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000017s : 0.14% optimize.opt_a.merge_send_recv : 0.000017s : 0.15% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.21% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.17% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.18% optimize.opt_a.virtual_dataset : 0.000017s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.14% optimize.opt_a.virtual_output : 0.000016s : 0.14% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000020s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000031s : 0.26% optimize.opt_a.a_after_grad : 0.000026s : 0.22% optimize.opt_a.renormalize : 0.000776s : 6.63% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.25% optimize.opt_a.cse : 0.000060s : 0.51% optimize.opt_a.a_3 : 0.000116s : 0.99% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000045s : 0.38% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000537s : 4.59% optimize.opt_b.b_1 : 0.000181s : 1.55% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000026s : 0.22% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000029s : 0.25% optimize.loop_unroll : 0.000435s : 3.72% optimize.opt_after_cconv.c_1 : 0.000042s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.14% optimize.tuple_transform.d_1 : 0.000087s : 0.74% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000059s : 0.51% optimize.cse_after_recomputation.cse : 0.000017s : 0.14% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000023s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000484s : 4.14% validate : 0.000045s : 0.38% Time group info: ------[substitution.] 0.000237 44 10.03% : 0.000024s : 3: substitution.cast_eliminate 0.93% : 0.000002s : 3: substitution.elim_not_effective 0.68% : 0.000002s : 3: substitution.fold_const_symbol 3.22% : 0.000008s : 6: substitution.graph_param_transform 66.48% : 0.000157s : 4: substitution.inline 2.26% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.89% : 0.000007s : 6: substitution.remove_not_recompute_node 2.65% : 0.000006s : 6: substitution.replace_old_param 8.33% : 0.000020s : 6: substitution.tuple_list_get_item_eliminator 2.52% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005656 2 87.12% : 0.004928s : 1: type_inference.infer 12.88% : 0.000728s : 1: type_inference.specialize ------[replace.] 0.000076 10 50.48% : 0.000038s : 4: replace.inline 49.52% : 0.000037s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000171 10 90.19% : 0.000155s : 4: match.inline 9.81% : 0.000017s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000319 1954 0.91% : 0.000003s : 21: predicate.accumulaten_eliminater 0.61% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.49% : 0.000002s : 12: predicate.addn_check_dump 0.89% : 0.000003s : 21: predicate.addn_zero_filter 0.83% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 1.74% : 0.000006s : 33: predicate.arithmetic_simplify 0.98% : 0.000003s : 21: predicate.cast_eliminate 0.56% : 0.000002s : 12: predicate.check_bprop_eliminate 0.50% : 0.000002s : 12: predicate.compare_switch_simplify 0.18% : 0.000001s : 6: predicate.const_output_eliminate 0.55% : 0.000002s : 12: predicate.depend_value_elim 0.97% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 0.96% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.87% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.79% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 6: predicate.elim_not_effective 0.32% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.07% : 0.000003s : 27: predicate.environ_get_add_eliminate 1.10% : 0.000004s : 27: predicate.environ_get_depend_swap 1.63% : 0.000005s : 39: predicate.environ_get_eliminate 1.06% : 0.000003s : 27: predicate.environ_get_set_eliminate 1.37% : 0.000004s : 31: predicate.exchange_switch_depend_value 2.14% : 0.000007s : 31: predicate.float_depend_g_call 0.50% : 0.000002s : 12: predicate.float_environ_get_switch 0.74% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 6: predicate.fold_const_symbol 0.59% : 0.000002s : 12: predicate.get_grad_eliminate 0.18% : 0.000001s : 6: predicate.graph_param_transform 0.51% : 0.000002s : 12: predicate.incorporate_call 0.45% : 0.000001s : 12: predicate.incorporate_call_switch 5.67% : 0.000018s : 88: predicate.inline 0.73% : 0.000002s : 12: predicate.inline_without_move 0.33% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.65% : 0.000002s : 12: predicate.less_batch_normalization 1.72% : 0.000005s : 39: predicate.list_to_tuple_eliminator_ 2.53% : 0.000008s : 60: predicate.load_eliminater 0.60% : 0.000002s : 6: predicate.loop_unroll_after_grad 1.84% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.55% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.50% : 0.000002s : 12: predicate.merge_addn 0.53% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.54% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.83% : 0.000003s : 21: predicate.minmaximum_grad 0.78% : 0.000002s : 6: predicate.mutable_eliminate 0.30% : 0.000001s : 6: predicate.opt_reshape 0.37% : 0.000001s : 6: predicate.parallel_virtual_node 1.66% : 0.000005s : 31: predicate.partial_defer_inline 1.65% : 0.000005s : 33: predicate.partial_eliminate 0.87% : 0.000003s : 21: predicate.print_const_string_wrapper 0.52% : 0.000002s : 12: predicate.reduce_all_const_elim 1.08% : 0.000003s : 21: predicate.reduce_eliminate 2.49% : 0.000008s : 60: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 12: predicate.remove_not_recompute_node 1.40% : 0.000004s : 39: predicate.replace_applicator 0.45% : 0.000001s : 12: predicate.replace_old_param 0.21% : 0.000001s : 6: predicate.reset_defer_inline 0.92% : 0.000003s : 21: predicate.reshape_eliminate 0.54% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.31% : 0.000001s : 6: predicate.row_tensor_eliminate 0.63% : 0.000002s : 12: predicate.same_eliminate 0.44% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.65% : 0.000002s : 12: predicate.shard_identity_eliminate 0.62% : 0.000002s : 12: predicate.special_op_eliminate 0.65% : 0.000002s : 12: predicate.specialize_transform 0.66% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.65% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.42% : 0.000005s : 31: predicate.switch_defer_inline 1.96% : 0.000006s : 43: predicate.switch_layer_defer_inline 4.50% : 0.000014s : 91: predicate.switch_simplify 0.86% : 0.000003s : 21: predicate.tile_eliminate 1.01% : 0.000003s : 21: predicate.transpose_eliminate 1.55% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 3.02% : 0.000010s : 51: predicate.tuple_list_get_item_eliminator 9.67% : 0.000031s : 33: predicate.tuple_list_get_set_item_eliminator 2.09% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.67% : 0.000005s : 39: predicate.tuple_to_list_eliminator_ 2.47% : 0.000008s : 60: predicate.updatestate_pure_node_eliminater 3.00% : 0.000010s : 72: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 6: predicate.value_based_eliminate 0.59% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.57% : 0.000002s : 12: predicate.virtual_output_eliminate 0.26% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.34% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000560 11 54.25% : 0.000304s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.75% : 0.000256s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027379 192 0.01% : 0.000003s : 1: ForceFp32Comm 11.37% : 0.003114s : 1: add_attr 11.34% : 0.003105s : 1: add_attr_with_inline 0.01% : 0.000003s : 1: add_comm_op_reuse_tag 0.23% : 0.000063s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.26% : 0.000070s : 1: auto_monad 0.09% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.67% : 0.000458s : 1: bootstrap 0.12% : 0.000033s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000030s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.10% : 0.000027s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.62% : 0.000443s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.99% : 0.000546s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.06% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000021s : 1: opt.transform.mutable_eliminate 6.30% : 0.001726s : 78: opt.transform.opt_a 0.15% : 0.000041s : 1: opt.transform.opt_after_cconv 0.11% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.59% : 0.000160s : 28: opt.transform.opt_b 0.34% : 0.000094s : 2: opt.transform.opt_trans_graph 0.17% : 0.000047s : 4: opt.transform.symbol_engine_opt 12.55% : 0.003437s : 1: opt_a 0.46% : 0.000127s : 1: opt_after_cconv 1.80% : 0.000494s : 1: opt_after_jit_grad 1.02% : 0.000280s : 1: opt_b 20.81% : 0.005698s : 1: optimize 0.08% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000041s : 1: pre_auto_parallel 0.12% : 0.000033s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.57% : 0.000429s : 1: renormalize.infer 1.23% : 0.000337s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000049s : 1: rewriter_after_opt_a 0.36% : 0.000098s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000090s : 1: symbol_engine_optimizer 0.46% : 0.000127s : 1: tuple_transform 20.94% : 0.005732s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:15.767.882 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:15.768.137 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0175941, [21] [bootstrap]: 0.00045617 [type_inference]: 0.00583947 [event_method]: 2.26e-05 [auto_monad]: 6.623e-05 [graph_reusing]: 5.78002e-06 [inline]: 1.92001e-06 [add_attr]: 0.00314243, [1] [add_attr_with_inline]: 0.00313382, [1] [Cycle 1]: 7.449e-05, [2] [tag_attr]: 2.23e-05 [meta_addattr_fg_expand]: 6.66999e-06 [parallel-infer-symbol]: 3.21001e-06 [pre_auto_parallel]: 3.587e-05 [insert-virtual-dataset]: 2.21e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 1.83002e-06 [pipeline_split]: 1.76e-06 [optimize]: 0.00675054, [53] [py_interpret_to_execute]: 3.455e-05 [rewriter_before_opt_a]: 0.00010431 [opt_a]: 0.00399935, [2] [Cycle 1]: 0.00288242, [45] [expand_dump_flag]: 3.23e-06 [switch_simplify]: 4.731e-05 [loop_unroll]: 3.664e-05 [a_1]: 0.00088403 [with_stream_mark]: 1.627e-05 [recompute_prepare]: 1.231e-05 [updatestate_depend_eliminate]: 5.71e-06 [updatestate_assign_eliminate]: 4.82e-06 [updatestate_loads_eliminate]: 4.38999e-06 [parameter_eliminate]: 1.97001e-06 [a_2]: 0.00015541 [accelerated_algorithm]: 1.047e-05 [shard]: 2.03002e-06 [meta_shard_fg_expand]: 2.51e-06 [shard_inline]: 9.59e-06 [merge_send_recv]: 1.011e-05 [auto_parallel]: 7.77e-06 [parallel]: 1.882e-05 [flash_sp]: 8.52998e-06 [merge_comm]: 5.57999e-06 [allreduce_fusion]: 5.34998e-06 [matmul_add_comm_reduction]: 1.158e-05 [allreduce_slice_to_reducescatter]: 9.80013e-07 [virtual_shard_identity]: 1.075e-05 [virtual_dataset]: 9.92001e-06 [get_grad_eliminate_]: 9.44e-06 [virtual_output]: 9.62999e-06 [merge_forward]: 5.20001e-06 [cell_reuse_recompute_pass]: 1.65001e-06 [offload_activation]: 1.253e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.161e-05 [merge_recompute_call_nodes]: 1.55999e-06 [before_grad]: 1.636e-05 [set_forward_comm_id_for_comm_node_pass]: 5.47001e-06 [meta_fg_expand]: 4.18999e-06 [flash_sp_send_recv_attached]: 2.36e-06 [receive_attached]: 2.05002e-06 [after_resolve]: 1.606e-05 [a_after_grad]: 1.552e-05 [renormalize]: 0.00091511 [add_forward_monad_depend]: 5.39e-06 [auto_monad_grad]: 2.04999e-06 [auto_monad_eliminator]: 1.955e-05 [cse]: 5.481e-05 [a_3]: 8.854e-05 [Cycle 2]: 0.00110296, [45] [expand_dump_flag]: 1.59998e-06 [switch_simplify]: 1.132e-05 [loop_unroll]: 1.025e-05 [a_1]: 0.00024184 [with_stream_mark]: 1.521e-05 [recompute_prepare]: 1.02e-05 [updatestate_depend_eliminate]: 4.55999e-06 [updatestate_assign_eliminate]: 3.8e-06 [updatestate_loads_eliminate]: 3.61999e-06 [parameter_eliminate]: 1.15999e-06 [a_2]: 0.00014802 [accelerated_algorithm]: 1.048e-05 [shard]: 1.21997e-06 [meta_shard_fg_expand]: 2.01e-06 [shard_inline]: 9.77001e-06 [merge_send_recv]: 7.41999e-06 [auto_parallel]: 8.11002e-06 [parallel]: 5.66998e-06 [flash_sp]: 3.56999e-06 [merge_comm]: 5.06002e-06 [allreduce_fusion]: 5.77001e-06 [matmul_add_comm_reduction]: 8.99e-06 [allreduce_slice_to_reducescatter]: 5.69999e-07 [virtual_shard_identity]: 1.053e-05 [virtual_dataset]: 9.59e-06 [get_grad_eliminate_]: 9.21002e-06 [virtual_output]: 9.01002e-06 [merge_forward]: 4.96002e-06 [cell_reuse_recompute_pass]: 1.68002e-06 [offload_activation]: 9.51e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.972e-05 [merge_recompute_call_nodes]: 8.89995e-07 [before_grad]: 1.452e-05 [set_forward_comm_id_for_comm_node_pass]: 5.62001e-06 [meta_fg_expand]: 4.05998e-06 [flash_sp_send_recv_attached]: 8.89995e-07 [receive_attached]: 1.24998e-06 [after_resolve]: 1.564e-05 [a_after_grad]: 1.432e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.34e-06 [auto_monad_grad]: 1.10001e-06 [auto_monad_eliminator]: 1.155e-05 [cse]: 2.729e-05 [a_3]: 7.348e-05 [py_interpret_to_execute_after_opt_a]: 1.588e-05 [slice_cell_reuse_recomputed_activation]: 5.33002e-06 [rewriter_after_opt_a]: 4.983e-05 [convert_after_rewriter]: 1.178e-05 [order_py_execute_after_rewriter]: 9.32999e-06 [mutable_eliminate]: 0.00060724 [opt_b]: 0.00037968, [1] [Cycle 1]: 0.00037, [7] [b_1]: 0.00025431 [b_2]: 1.152e-05 [updatestate_depend_eliminate]: 7.06001e-06 [updatestate_assign_eliminate]: 4.03001e-06 [updatestate_loads_eliminate]: 4.23999e-06 [renormalize]: 4.69998e-07 [cse]: 3.094e-05 [optimize_parallel_all_gather_comm]: 2.333e-05 [overlap_param_gather]: 4.55001e-06 [cconv]: 2.982e-05 [loop_unroll]: 0.00047859 [opt_after_cconv]: 0.00016321, [1] [Cycle 1]: 0.00015443, [7] [c_1]: 4.961e-05 [parameter_eliminate]: 3.04001e-06 [updatestate_depend_eliminate]: 6.84999e-06 [updatestate_assign_eliminate]: 3.88001e-06 [updatestate_loads_eliminate]: 4.00998e-06 [cse]: 2.93e-05 [renormalize]: 4.90021e-07 [remove_dup_value]: 4.355e-05 [tuple_transform]: 0.00012058, [1] [Cycle 1]: 0.00011301, [4] [d_1]: 6.875e-05 [none_parameter_eliminate]: 1.85001e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 1.058e-05 [partial_unused_args_eliminate]: 4.70999e-06 [add_recomputation]: 6.549e-05 [cse_after_recomputation]: 6.558e-05, [1] [Cycle 1]: 5.805e-05, [1] [cse]: 1.902e-05 [environ_conv]: 1.09e-05 [swap_dp_allreduce_reducescatter]: 9.51003e-06 [bias_add_comm_swap]: 5.87001e-06 [label_micro_interleaved_index]: 7.75e-06 [label_fine_grained_interleaved_index]: 5.27999e-06 [merge_cast_opt]: 4.22998e-06 [slice_recompute_activation]: 4.53999e-06 [micro_interleaved_order_control]: 5.15001e-06 [assign_add_opt]: 3.73999e-06 [ForceFp32Comm]: 3.20002e-06 [remove_cast_before_assign_add]: 3.48e-06 [full_micro_interleaved_order_control]: 4.80999e-06 [reorder_send_recv_between_fp_bp]: 4.85001e-06 [comm_op_add_attrs]: 3.83001e-06 [add_comm_op_reuse_tag]: 3.25e-06 [interleave_split_concat_branches]: 3.50998e-06 [interleave_parallel_branches]: 3.56001e-06 [overlap_opt_shard_in_pipeline]: 3.63999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.47e-06 [control_data_broadcast_order]: 1.98e-05 [grouped_pairwise_exchange_alltoall]: 3.88999e-06 [offloading_packed_experts]: 7.45e-06 [overlap_recompute_and_grad_model_parallel]: 7.96001e-06 [overlap_grad_matmul_and_grad_allreduce]: 4.40999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.7e-06 [overlap_recompute_comm]: 5.39998e-06 [overlap_grad_ring_attention]: 7.77e-06 [overlap_grad_flash_sp]: 2.731e-05 [begin_end_overlap_inline]: 2.91e-06 [split_matmul_comm_elemetwise]: 4.53999e-06 [split_layernorm_comm]: 4.43001e-06 [handle_group_info]: 3.32002e-06 [symbol_engine_optimizer]: 0.00011854, [1] [Cycle 1]: 0.00011143, [6] [build]: 3.58999e-06 [elim_shapecalc]: 1.53e-05 [elim_not_effective]: 1.874e-05 [opt_reshape]: 1.086e-05 [fold_const_symbol]: 1.593e-05 [renormalize]: 2.30008e-07 [detach_backward]: 3.81999e-06 [pipeline_parallel_scheduler]: 1.66e-06 [auto_monad_reorder]: 2.469e-05 [get_jit_bprop_graph]: 1.64e-06 [rewriter_after_jit_bprop_graph]: 5.80002e-06 [opt_after_jit_grad]: 0.00058277 [validate]: 4.663e-05 Sums bootstrap : 0.000456s : 3.60% type_inference : 0.005839s : 46.14% event_method : 0.000023s : 0.18% auto_monad : 0.000066s : 0.52% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000036s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000035s : 0.27% optimize.rewriter_before_opt_a : 0.000104s : 0.82% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000059s : 0.46% optimize.opt_a.loop_unroll : 0.000047s : 0.37% optimize.opt_a.a_1 : 0.001126s : 8.90% optimize.opt_a.with_stream_mark : 0.000031s : 0.25% optimize.opt_a.recompute_prepare : 0.000023s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000303s : 2.40% optimize.opt_a.accelerated_algorithm : 0.000021s : 0.17% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000019s : 0.15% optimize.opt_a.merge_send_recv : 0.000018s : 0.14% optimize.opt_a.auto_parallel : 0.000016s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.19% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000011s : 0.08% optimize.opt_a.allreduce_fusion : 0.000011s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.17% optimize.opt_a.virtual_dataset : 0.000020s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.15% optimize.opt_a.virtual_output : 0.000019s : 0.15% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000022s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000041s : 0.33% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000031s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000008s : 0.07% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000032s : 0.25% optimize.opt_a.a_after_grad : 0.000030s : 0.24% optimize.opt_a.renormalize : 0.000915s : 7.23% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.05% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.25% optimize.opt_a.cse : 0.000082s : 0.65% optimize.opt_a.a_3 : 0.000162s : 1.28% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000050s : 0.39% optimize.convert_after_rewriter : 0.000012s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000607s : 4.80% optimize.opt_b.b_1 : 0.000254s : 2.01% optimize.opt_b.b_2 : 0.000012s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000031s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000030s : 0.24% optimize.loop_unroll : 0.000479s : 3.78% optimize.opt_after_cconv.c_1 : 0.000050s : 0.39% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000029s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000044s : 0.34% optimize.tuple_transform.d_1 : 0.000069s : 0.54% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000065s : 0.52% optimize.cse_after_recomputation.cse : 0.000019s : 0.15% optimize.environ_conv : 0.000011s : 0.09% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.08% optimize.bias_add_comm_swap : 0.000006s : 0.05% optimize.label_micro_interleaved_index : 0.000008s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000020s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.06% optimize.overlap_grad_flash_sp : 0.000027s : 0.22% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.09% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000025s : 0.20% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000583s : 4.60% validate : 0.000047s : 0.37% Time group info: ------[substitution.] 0.000243 54 12.41% : 0.000030s : 6: substitution.cast_eliminate 1.11% : 0.000003s : 4: substitution.elim_not_effective 1.02% : 0.000002s : 4: substitution.fold_const_symbol 3.31% : 0.000008s : 7: substitution.graph_param_transform 63.83% : 0.000155s : 4: substitution.inline 2.32% : 0.000006s : 8: substitution.j_node_and_user_rematch 3.38% : 0.000008s : 8: substitution.remove_not_recompute_node 2.50% : 0.000006s : 6: substitution.replace_old_param 7.71% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.42% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005785 2 86.60% : 0.005010s : 1: type_inference.infer 13.40% : 0.000775s : 1: type_inference.specialize ------[replace.] 0.000073 10 52.19% : 0.000038s : 4: replace.inline 47.81% : 0.000035s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000168 10 90.50% : 0.000152s : 4: match.inline 9.50% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000326 2134 0.91% : 0.000003s : 22: predicate.accumulaten_eliminater 0.83% : 0.000003s : 7: predicate.ad_related_special_op_eliminate 0.58% : 0.000002s : 14: predicate.addn_check_dump 0.86% : 0.000003s : 22: predicate.addn_zero_filter 0.87% : 0.000003s : 22: predicate.adjust_all_reduce_mul_add 1.93% : 0.000006s : 36: predicate.arithmetic_simplify 1.10% : 0.000004s : 22: predicate.cast_eliminate 0.79% : 0.000003s : 14: predicate.check_bprop_eliminate 0.60% : 0.000002s : 14: predicate.compare_switch_simplify 0.19% : 0.000001s : 7: predicate.const_output_eliminate 0.62% : 0.000002s : 14: predicate.depend_value_elim 0.96% : 0.000003s : 22: predicate.dict_get_item_const_eliminator 1.11% : 0.000004s : 22: predicate.dict_get_item_eliminator 0.90% : 0.000003s : 22: predicate.dict_set_item_eliminator 0.96% : 0.000003s : 14: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 7: predicate.elim_not_effective 0.48% : 0.000002s : 7: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000004s : 29: predicate.environ_add_const_eliminate 1.13% : 0.000004s : 29: predicate.environ_get_add_eliminate 1.20% : 0.000004s : 29: predicate.environ_get_depend_swap 1.83% : 0.000006s : 43: predicate.environ_get_eliminate 1.13% : 0.000004s : 29: predicate.environ_get_set_eliminate 1.39% : 0.000005s : 32: predicate.exchange_switch_depend_value 2.19% : 0.000007s : 32: predicate.float_depend_g_call 0.59% : 0.000002s : 14: predicate.float_environ_get_switch 0.91% : 0.000003s : 21: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 7: predicate.fold_const_symbol 0.70% : 0.000002s : 14: predicate.get_grad_eliminate 0.22% : 0.000001s : 7: predicate.graph_param_transform 0.60% : 0.000002s : 14: predicate.incorporate_call 0.53% : 0.000002s : 14: predicate.incorporate_call_switch 6.10% : 0.000020s : 96: predicate.inline 0.78% : 0.000003s : 14: predicate.inline_without_move 0.36% : 0.000001s : 14: predicate.j_node_and_user_rematch 0.91% : 0.000003s : 14: predicate.less_batch_normalization 1.92% : 0.000006s : 42: predicate.list_to_tuple_eliminator_ 2.69% : 0.000009s : 64: predicate.load_eliminater 0.81% : 0.000003s : 7: predicate.loop_unroll_after_grad 1.93% : 0.000006s : 44: predicate.loop_unroll_before_grad 1.79% : 0.000006s : 36: predicate.make_slice_get_slice_eliminator 0.60% : 0.000002s : 14: predicate.merge_addn 0.66% : 0.000002s : 14: predicate.micro_step_allgather_replace 0.63% : 0.000002s : 14: predicate.mini_step_allgather_replace 0.85% : 0.000003s : 22: predicate.minmaximum_grad 0.86% : 0.000003s : 7: predicate.mutable_eliminate 0.40% : 0.000001s : 7: predicate.opt_reshape 0.31% : 0.000001s : 7: predicate.parallel_virtual_node 1.67% : 0.000005s : 32: predicate.partial_defer_inline 1.76% : 0.000006s : 35: predicate.partial_eliminate 0.91% : 0.000003s : 22: predicate.print_const_string_wrapper 0.63% : 0.000002s : 14: predicate.reduce_all_const_elim 1.13% : 0.000004s : 22: predicate.reduce_eliminate 2.64% : 0.000009s : 64: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 14: predicate.remove_not_recompute_node 1.46% : 0.000005s : 42: predicate.replace_applicator 0.49% : 0.000002s : 14: predicate.replace_old_param 0.25% : 0.000001s : 7: predicate.reset_defer_inline 0.96% : 0.000003s : 22: predicate.reshape_eliminate 0.61% : 0.000002s : 14: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 7: predicate.row_tensor_eliminate 0.75% : 0.000002s : 14: predicate.same_eliminate 0.43% : 0.000001s : 14: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 14: predicate.shard_identity_eliminate 0.75% : 0.000002s : 14: predicate.special_op_eliminate 0.76% : 0.000002s : 14: predicate.specialize_transform 0.79% : 0.000003s : 14: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 14: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 7: predicate.switch_call_monad_eliminater 1.46% : 0.000005s : 32: predicate.switch_defer_inline 2.06% : 0.000007s : 46: predicate.switch_layer_defer_inline 4.57% : 0.000015s : 97: predicate.switch_simplify 0.89% : 0.000003s : 22: predicate.tile_eliminate 0.93% : 0.000003s : 22: predicate.transpose_eliminate 1.60% : 0.000005s : 36: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000005s : 36: predicate.tuple_list_get_item_const_eliminator 1.59% : 0.000005s : 36: predicate.tuple_list_get_item_depend_reorder 3.10% : 0.000010s : 56: predicate.tuple_list_get_item_eliminator 1.53% : 0.000005s : 36: predicate.tuple_list_get_set_item_eliminator 2.36% : 0.000008s : 50: predicate.tuple_list_set_item_eliminator 1.77% : 0.000006s : 42: predicate.tuple_to_list_eliminator_ 2.61% : 0.000008s : 64: predicate.updatestate_pure_node_eliminater 3.36% : 0.000011s : 78: predicate.updatestate_useless_node_eliminater 0.49% : 0.000002s : 7: predicate.value_based_eliminate 0.70% : 0.000002s : 14: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 14: predicate.virtual_output_eliminate 0.31% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 7: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000554 11 52.37% : 0.000290s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.63% : 0.000264s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030506 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.33% : 0.003151s : 1: add_attr 10.28% : 0.003137s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.23% : 0.000069s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000075s : 1: auto_monad 0.11% : 0.000032s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000009s : 1: bias_add_comm_swap 1.63% : 0.000499s : 1: bootstrap 0.11% : 0.000033s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000023s : 1: control_data_broadcast_order 0.05% : 0.000015s : 1: convert_after_rewriter 0.23% : 0.000070s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000021s : 1: detach_backward 0.05% : 0.000014s : 1: environ_conv 0.11% : 0.000033s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000011s : 1: label_micro_interleaved_index 1.59% : 0.000485s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 2.01% : 0.000614s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000019s : 1: opt.transform.mutable_eliminate 5.88% : 0.001795s : 78: opt.transform.opt_a 0.16% : 0.000048s : 1: opt.transform.opt_after_cconv 0.13% : 0.000039s : 1: opt.transform.opt_after_jit_grad 0.63% : 0.000191s : 28: opt.transform.opt_b 0.25% : 0.000077s : 2: opt.transform.opt_trans_graph 0.19% : 0.000057s : 4: opt.transform.symbol_engine_opt 13.12% : 0.004003s : 1: opt_a 0.55% : 0.000167s : 1: opt_after_cconv 1.95% : 0.000594s : 1: opt_after_jit_grad 1.26% : 0.000384s : 1: opt_b 23.29% : 0.007104s : 1: optimize 0.09% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000030s : 1: overlap_grad_flash_sp 0.03% : 0.000008s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000011s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000043s : 1: pre_auto_parallel 0.13% : 0.000038s : 1: py_interpret_to_execute 0.06% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.15% : 0.000047s : 1: remove_dup_value 1.79% : 0.000546s : 1: renormalize.infer 1.18% : 0.000361s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000054s : 1: rewriter_after_opt_a 0.35% : 0.000108s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000121s : 1: symbol_engine_optimizer 0.41% : 0.000124s : 1: tuple_transform 19.26% : 0.005877s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:15.971.661 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0162973, [21] [bootstrap]: 0.00047435 [type_inference]: 0.00626507 [event_method]: 2.095e-05 [auto_monad]: 6.726e-05 [graph_reusing]: 5.48002e-06 [inline]: 2.37999e-06 [add_attr]: 0.00307037, [1] [add_attr_with_inline]: 0.00306211, [1] [Cycle 1]: 5.856e-05, [2] [tag_attr]: 2.112e-05 [meta_addattr_fg_expand]: 6.58e-06 [parallel-infer-symbol]: 2.99999e-06 [pre_auto_parallel]: 3.414e-05 [insert-virtual-dataset]: 2.46998e-06 [parallel-infer-symbol-second]: 1.00001e-06 [dataset_repeat_opt]: 2.02999e-06 [pipeline_split]: 1.65001e-06 [optimize]: 0.00563157, [53] [py_interpret_to_execute]: 2.776e-05 [rewriter_before_opt_a]: 9.618e-05 [opt_a]: 0.00338268, [2] [Cycle 1]: 0.00247214, [45] [expand_dump_flag]: 3.08e-06 [switch_simplify]: 4.867e-05 [loop_unroll]: 3.623e-05 [a_1]: 0.0008756 [with_stream_mark]: 1.583e-05 [recompute_prepare]: 1.137e-05 [updatestate_depend_eliminate]: 5.69999e-06 [updatestate_assign_eliminate]: 4.67e-06 [updatestate_loads_eliminate]: 4.25e-06 [parameter_eliminate]: 1.82999e-06 [a_2]: 0.00012823 [accelerated_algorithm]: 1.034e-05 [shard]: 1.63997e-06 [meta_shard_fg_expand]: 2.12999e-06 [shard_inline]: 9.59e-06 [merge_send_recv]: 1.009e-05 [auto_parallel]: 7.49002e-06 [parallel]: 1.773e-05 [flash_sp]: 8.25999e-06 [merge_comm]: 5.76e-06 [allreduce_fusion]: 4.93001e-06 [matmul_add_comm_reduction]: 1.09e-05 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 1.086e-05 [virtual_dataset]: 9.71003e-06 [get_grad_eliminate_]: 9.26002e-06 [virtual_output]: 9.49e-06 [merge_forward]: 5.42001e-06 [cell_reuse_recompute_pass]: 1.62001e-06 [offload_activation]: 1.163e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.799e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.52e-05 [set_forward_comm_id_for_comm_node_pass]: 5.44998e-06 [meta_fg_expand]: 4.2e-06 [flash_sp_send_recv_attached]: 2.38002e-06 [receive_attached]: 2.685e-05 [after_resolve]: 1.62e-05 [a_after_grad]: 1.502e-05 [renormalize]: 0.00070246 [add_forward_monad_depend]: 5.61e-06 [auto_monad_grad]: 2.11e-06 [auto_monad_eliminator]: 1.818e-05 [cse]: 4.522e-05 [a_3]: 6.872e-05 [Cycle 2]: 0.00090096, [45] [expand_dump_flag]: 1.06997e-06 [switch_simplify]: 1.087e-05 [loop_unroll]: 9.60001e-06 [a_1]: 0.00023729 [with_stream_mark]: 1.25e-05 [recompute_prepare]: 9.49e-06 [updatestate_depend_eliminate]: 4.33999e-06 [updatestate_assign_eliminate]: 3.68e-06 [updatestate_loads_eliminate]: 3.66001e-06 [parameter_eliminate]: 1.07e-06 [a_2]: 0.00011776 [accelerated_algorithm]: 9.16002e-06 [shard]: 1.19e-06 [meta_shard_fg_expand]: 2.04999e-06 [shard_inline]: 8.94998e-06 [merge_send_recv]: 6.46999e-06 [auto_parallel]: 6.68e-06 [parallel]: 4.68999e-06 [flash_sp]: 3.53e-06 [merge_comm]: 4.95001e-06 [allreduce_fusion]: 4.44002e-06 [matmul_add_comm_reduction]: 7.38e-06 [allreduce_slice_to_reducescatter]: 4.19997e-07 [virtual_shard_identity]: 1.032e-05 [virtual_dataset]: 9.02e-06 [get_grad_eliminate_]: 8.40001e-06 [virtual_output]: 8.65001e-06 [merge_forward]: 6.61e-06 [cell_reuse_recompute_pass]: 1.45999e-06 [offload_activation]: 7.87e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.749e-05 [merge_recompute_call_nodes]: 7.89994e-07 [before_grad]: 1.482e-05 [set_forward_comm_id_for_comm_node_pass]: 5.34e-06 [meta_fg_expand]: 3.33e-06 [flash_sp_send_recv_attached]: 1.07998e-06 [receive_attached]: 1.00999e-06 [after_resolve]: 1.427e-05 [a_after_grad]: 1.431e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.42e-06 [auto_monad_grad]: 9.50007e-07 [auto_monad_eliminator]: 9.96998e-06 [cse]: 2.416e-05 [a_3]: 5.955e-05 [py_interpret_to_execute_after_opt_a]: 1.091e-05 [slice_cell_reuse_recomputed_activation]: 1.92001e-06 [rewriter_after_opt_a]: 4.376e-05 [convert_after_rewriter]: 8.44998e-06 [order_py_execute_after_rewriter]: 6.48998e-06 [mutable_eliminate]: 0.00047463 [opt_b]: 0.00030635, [1] [Cycle 1]: 0.0003002, [7] [b_1]: 0.00020664 [b_2]: 1.171e-05 [updatestate_depend_eliminate]: 6.83e-06 [updatestate_assign_eliminate]: 3.90998e-06 [updatestate_loads_eliminate]: 4.52e-06 [renormalize]: 3.9002e-07 [cse]: 2.961e-05 [optimize_parallel_all_gather_comm]: 2.075e-05 [overlap_param_gather]: 1.97999e-06 [cconv]: 2.499e-05 [loop_unroll]: 0.00043946 [opt_after_cconv]: 0.00013396, [1] [Cycle 1]: 0.00012839, [7] [c_1]: 4.835e-05 [parameter_eliminate]: 2.36e-06 [updatestate_depend_eliminate]: 7.01001e-06 [updatestate_assign_eliminate]: 3.87002e-06 [updatestate_loads_eliminate]: 3.6e-06 [cse]: 2.845e-05 [renormalize]: 3.4002e-07 [remove_dup_value]: 3.648e-05 [tuple_transform]: 0.00010257, [1] [Cycle 1]: 9.779e-05, [4] [d_1]: 6.702e-05 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 1.021e-05 [partial_unused_args_eliminate]: 1.74998e-06 [add_recomputation]: 6.135e-05 [cse_after_recomputation]: 2.783e-05, [1] [Cycle 1]: 2.324e-05, [1] [cse]: 1.774e-05 [environ_conv]: 5.87999e-06 [swap_dp_allreduce_reducescatter]: 6.81999e-06 [bias_add_comm_swap]: 2.71e-06 [label_micro_interleaved_index]: 4.12e-06 [label_fine_grained_interleaved_index]: 2.83998e-06 [merge_cast_opt]: 1.33002e-06 [slice_recompute_activation]: 2.27999e-06 [micro_interleaved_order_control]: 2.38002e-06 [assign_add_opt]: 1.27999e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 9.89996e-07 [full_micro_interleaved_order_control]: 2.27001e-06 [reorder_send_recv_between_fp_bp]: 2.89999e-06 [comm_op_add_attrs]: 9.90025e-07 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.51002e-06 [overlap_opt_shard_grad_in_pipeline]: 2.09e-06 [control_data_broadcast_order]: 1.592e-05 [grouped_pairwise_exchange_alltoall]: 1.56998e-06 [offloading_packed_experts]: 4.79002e-06 [overlap_recompute_and_grad_model_parallel]: 5.72999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.11002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.43e-06 [overlap_grad_ring_attention]: 4.77e-06 [overlap_grad_flash_sp]: 2.314e-05 [begin_end_overlap_inline]: 5.09986e-07 [split_matmul_comm_elemetwise]: 1.92999e-06 [split_layernorm_comm]: 1.88997e-06 [handle_group_info]: 8.89995e-07 [symbol_engine_optimizer]: 9.221e-05, [1] [Cycle 1]: 8.772e-05, [6] [build]: 2.99001e-06 [elim_shapecalc]: 1.304e-05 [elim_not_effective]: 1.895e-05 [opt_reshape]: 1.022e-05 [fold_const_symbol]: 1.49e-05 [renormalize]: 2.60014e-07 [detach_backward]: 1.74e-06 [pipeline_parallel_scheduler]: 1.47001e-06 [auto_monad_reorder]: 3.745e-05 [get_jit_bprop_graph]: 1.31002e-06 [rewriter_after_jit_bprop_graph]: 3.58e-06 [opt_after_jit_grad]: 0.00048052 [validate]: 4.195e-05 Sums bootstrap : 0.000474s : 3.86% type_inference : 0.006265s : 51.02% event_method : 0.000021s : 0.17% auto_monad : 0.000067s : 0.55% graph_reusing : 0.000005s : 0.04% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000034s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.23% optimize.rewriter_before_opt_a : 0.000096s : 0.78% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000060s : 0.48% optimize.opt_a.loop_unroll : 0.000046s : 0.37% optimize.opt_a.a_1 : 0.001113s : 9.06% optimize.opt_a.with_stream_mark : 0.000028s : 0.23% optimize.opt_a.recompute_prepare : 0.000021s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000246s : 2.00% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.16% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000019s : 0.15% optimize.opt_a.merge_send_recv : 0.000017s : 0.13% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000022s : 0.18% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000011s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.17% optimize.opt_a.virtual_dataset : 0.000019s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.14% optimize.opt_a.virtual_output : 0.000018s : 0.15% optimize.opt_a.merge_forward : 0.000012s : 0.10% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000030s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000008s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000028s : 0.23% optimize.opt_a.after_resolve : 0.000030s : 0.25% optimize.opt_a.a_after_grad : 0.000029s : 0.24% optimize.opt_a.renormalize : 0.000703s : 5.72% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.23% optimize.opt_a.cse : 0.000069s : 0.56% optimize.opt_a.a_3 : 0.000128s : 1.04% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000044s : 0.36% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000475s : 3.87% optimize.opt_b.b_1 : 0.000207s : 1.68% optimize.opt_b.b_2 : 0.000012s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000030s : 0.24% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000025s : 0.20% optimize.loop_unroll : 0.000439s : 3.58% optimize.opt_after_cconv.c_1 : 0.000048s : 0.39% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000028s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000036s : 0.30% optimize.tuple_transform.d_1 : 0.000067s : 0.55% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000061s : 0.50% optimize.cse_after_recomputation.cse : 0.000018s : 0.14% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000016s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000023s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000037s : 0.30% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000481s : 3.91% validate : 0.000042s : 0.34% Time group info: ------[substitution.] 0.000233 54 11.82% : 0.000028s : 6: substitution.cast_eliminate 1.24% : 0.000003s : 4: substitution.elim_not_effective 0.87% : 0.000002s : 4: substitution.fold_const_symbol 3.34% : 0.000008s : 7: substitution.graph_param_transform 64.61% : 0.000150s : 4: substitution.inline 2.19% : 0.000005s : 8: substitution.j_node_and_user_rematch 3.31% : 0.000008s : 8: substitution.remove_not_recompute_node 2.29% : 0.000005s : 6: substitution.replace_old_param 7.82% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.52% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006201 2 87.60% : 0.005432s : 1: type_inference.infer 12.40% : 0.000769s : 1: type_inference.specialize ------[replace.] 0.000070 10 52.67% : 0.000037s : 4: replace.inline 47.33% : 0.000033s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000164 10 90.44% : 0.000148s : 4: match.inline 9.56% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000319 2134 0.92% : 0.000003s : 22: predicate.accumulaten_eliminater 0.67% : 0.000002s : 7: predicate.ad_related_special_op_eliminate 0.63% : 0.000002s : 14: predicate.addn_check_dump 0.94% : 0.000003s : 22: predicate.addn_zero_filter 0.87% : 0.000003s : 22: predicate.adjust_all_reduce_mul_add 2.06% : 0.000007s : 36: predicate.arithmetic_simplify 1.09% : 0.000003s : 22: predicate.cast_eliminate 0.63% : 0.000002s : 14: predicate.check_bprop_eliminate 0.61% : 0.000002s : 14: predicate.compare_switch_simplify 0.20% : 0.000001s : 7: predicate.const_output_eliminate 0.63% : 0.000002s : 14: predicate.depend_value_elim 0.99% : 0.000003s : 22: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 22: predicate.dict_get_item_eliminator 0.89% : 0.000003s : 22: predicate.dict_set_item_eliminator 0.89% : 0.000003s : 14: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 7: predicate.elim_not_effective 0.39% : 0.000001s : 7: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000004s : 29: predicate.environ_add_const_eliminate 1.16% : 0.000004s : 29: predicate.environ_get_add_eliminate 1.16% : 0.000004s : 29: predicate.environ_get_depend_swap 1.78% : 0.000006s : 43: predicate.environ_get_eliminate 1.18% : 0.000004s : 29: predicate.environ_get_set_eliminate 1.39% : 0.000004s : 32: predicate.exchange_switch_depend_value 2.10% : 0.000007s : 32: predicate.float_depend_g_call 0.56% : 0.000002s : 14: predicate.float_environ_get_switch 0.89% : 0.000003s : 21: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 7: predicate.fold_const_symbol 0.66% : 0.000002s : 14: predicate.get_grad_eliminate 0.21% : 0.000001s : 7: predicate.graph_param_transform 0.63% : 0.000002s : 14: predicate.incorporate_call 0.54% : 0.000002s : 14: predicate.incorporate_call_switch 6.32% : 0.000020s : 96: predicate.inline 0.81% : 0.000003s : 14: predicate.inline_without_move 0.36% : 0.000001s : 14: predicate.j_node_and_user_rematch 0.86% : 0.000003s : 14: predicate.less_batch_normalization 1.95% : 0.000006s : 42: predicate.list_to_tuple_eliminator_ 2.69% : 0.000009s : 64: predicate.load_eliminater 0.81% : 0.000003s : 7: predicate.loop_unroll_after_grad 1.99% : 0.000006s : 44: predicate.loop_unroll_before_grad 1.61% : 0.000005s : 36: predicate.make_slice_get_slice_eliminator 0.62% : 0.000002s : 14: predicate.merge_addn 0.59% : 0.000002s : 14: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 14: predicate.mini_step_allgather_replace 0.87% : 0.000003s : 22: predicate.minmaximum_grad 0.79% : 0.000003s : 7: predicate.mutable_eliminate 0.34% : 0.000001s : 7: predicate.opt_reshape 0.40% : 0.000001s : 7: predicate.parallel_virtual_node 1.62% : 0.000005s : 32: predicate.partial_defer_inline 1.81% : 0.000006s : 35: predicate.partial_eliminate 0.95% : 0.000003s : 22: predicate.print_const_string_wrapper 0.59% : 0.000002s : 14: predicate.reduce_all_const_elim 1.21% : 0.000004s : 22: predicate.reduce_eliminate 2.67% : 0.000009s : 64: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 14: predicate.remove_not_recompute_node 1.37% : 0.000004s : 42: predicate.replace_applicator 0.44% : 0.000001s : 14: predicate.replace_old_param 0.25% : 0.000001s : 7: predicate.reset_defer_inline 0.94% : 0.000003s : 22: predicate.reshape_eliminate 0.61% : 0.000002s : 14: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 7: predicate.row_tensor_eliminate 0.77% : 0.000002s : 14: predicate.same_eliminate 0.41% : 0.000001s : 14: predicate.set_cell_output_no_recompute 0.77% : 0.000002s : 14: predicate.shard_identity_eliminate 0.74% : 0.000002s : 14: predicate.special_op_eliminate 0.74% : 0.000002s : 14: predicate.specialize_transform 0.77% : 0.000002s : 14: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000003s : 14: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 7: predicate.switch_call_monad_eliminater 1.54% : 0.000005s : 32: predicate.switch_defer_inline 2.08% : 0.000007s : 46: predicate.switch_layer_defer_inline 4.66% : 0.000015s : 97: predicate.switch_simplify 0.96% : 0.000003s : 22: predicate.tile_eliminate 0.98% : 0.000003s : 22: predicate.transpose_eliminate 1.60% : 0.000005s : 36: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000005s : 36: predicate.tuple_list_get_item_const_eliminator 1.67% : 0.000005s : 36: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000010s : 56: predicate.tuple_list_get_item_eliminator 1.56% : 0.000005s : 36: predicate.tuple_list_get_set_item_eliminator 2.35% : 0.000008s : 50: predicate.tuple_list_set_item_eliminator 1.81% : 0.000006s : 42: predicate.tuple_to_list_eliminator_ 2.62% : 0.000008s : 64: predicate.updatestate_pure_node_eliminater 3.39% : 0.000011s : 78: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 7: predicate.value_based_eliminate 0.67% : 0.000002s : 14: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 14: predicate.virtual_output_eliminate 0.31% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.33% : 0.000001s : 7: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000554 11 55.25% : 0.000306s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.75% : 0.000248s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027766 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.07% : 0.003075s : 1: add_attr 11.04% : 0.003066s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000065s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.26% : 0.000072s : 1: auto_monad 0.15% : 0.000042s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.82% : 0.000504s : 1: bootstrap 0.10% : 0.000029s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.11% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.10% : 0.000027s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.61% : 0.000448s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.74% : 0.000483s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.06% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 6.35% : 0.001763s : 78: opt.transform.opt_a 0.17% : 0.000047s : 1: opt.transform.opt_after_cconv 0.13% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.67% : 0.000187s : 28: opt.transform.opt_b 0.27% : 0.000075s : 2: opt.transform.opt_trans_graph 0.19% : 0.000053s : 4: opt.transform.symbol_engine_opt 12.19% : 0.003385s : 1: opt_a 0.50% : 0.000138s : 1: opt_after_cconv 1.76% : 0.000489s : 1: opt_after_jit_grad 1.12% : 0.000310s : 1: opt_b 20.30% : 0.005635s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000038s : 1: pre_auto_parallel 0.11% : 0.000032s : 1: py_interpret_to_execute 0.05% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.15% : 0.000041s : 1: remove_dup_value 1.37% : 0.000380s : 1: renormalize.infer 1.13% : 0.000313s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000048s : 1: rewriter_after_opt_a 0.36% : 0.000100s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000095s : 1: symbol_engine_optimizer 0.38% : 0.000105s : 1: tuple_transform 22.62% : 0.006282s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:16.167.933 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:16.168.176 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0167461, [21] [bootstrap]: 0.00048914 [type_inference]: 0.005878 [event_method]: 2.059e-05 [auto_monad]: 6.639e-05 [graph_reusing]: 6.01998e-06 [inline]: 2.34999e-06 [add_attr]: 0.00309337, [1] [add_attr_with_inline]: 0.00308488, [1] [Cycle 1]: 7.423e-05, [2] [tag_attr]: 2.105e-05 [meta_addattr_fg_expand]: 6.60002e-06 [parallel-infer-symbol]: 3.35003e-06 [pre_auto_parallel]: 3.377e-05 [insert-virtual-dataset]: 2.23998e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 2.09999e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.00595035, [53] [py_interpret_to_execute]: 3.378e-05 [rewriter_before_opt_a]: 9.693e-05 [opt_a]: 0.00353985, [2] [Cycle 1]: 0.00254468, [45] [expand_dump_flag]: 3.01001e-06 [switch_simplify]: 4.749e-05 [loop_unroll]: 3.597e-05 [a_1]: 0.00083692 [with_stream_mark]: 1.503e-05 [recompute_prepare]: 1.059e-05 [updatestate_depend_eliminate]: 5.00999e-06 [updatestate_assign_eliminate]: 4.08001e-06 [updatestate_loads_eliminate]: 3.86001e-06 [parameter_eliminate]: 1.92001e-06 [a_2]: 0.00013786 [accelerated_algorithm]: 9.25001e-06 [shard]: 1.77999e-06 [meta_shard_fg_expand]: 2.07999e-06 [shard_inline]: 9.46e-06 [merge_send_recv]: 9.41e-06 [auto_parallel]: 7.18e-06 [parallel]: 1.947e-05 [flash_sp]: 8.35999e-06 [merge_comm]: 5.86998e-06 [allreduce_fusion]: 4.36002e-06 [matmul_add_comm_reduction]: 1.033e-05 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 1.073e-05 [virtual_dataset]: 8.77e-06 [get_grad_eliminate_]: 8.38001e-06 [virtual_output]: 8.74998e-06 [merge_forward]: 4.84003e-06 [cell_reuse_recompute_pass]: 1.17999e-06 [offload_activation]: 1.116e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.799e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.393e-05 [set_forward_comm_id_for_comm_node_pass]: 4.84e-06 [meta_fg_expand]: 3.25e-06 [flash_sp_send_recv_attached]: 2.27999e-06 [receive_attached]: 1.97999e-06 [after_resolve]: 1.476e-05 [a_after_grad]: 1.323e-05 [renormalize]: 0.00071296 [add_forward_monad_depend]: 5.79e-06 [auto_monad_grad]: 1.92999e-06 [auto_monad_eliminator]: 1.74e-05 [cse]: 3.599e-05 [a_3]: 7.611e-05 [Cycle 2]: 0.00098217, [45] [expand_dump_flag]: 1.09998e-06 [switch_simplify]: 9.59e-06 [loop_unroll]: 8.08001e-06 [a_1]: 0.00020228 [with_stream_mark]: 1.251e-05 [recompute_prepare]: 8.38999e-06 [updatestate_depend_eliminate]: 3.98001e-06 [updatestate_assign_eliminate]: 3.06001e-06 [updatestate_loads_eliminate]: 3.11001e-06 [parameter_eliminate]: 1.14e-06 [a_2]: 0.00012826 [accelerated_algorithm]: 8.02e-06 [shard]: 1.55999e-06 [meta_shard_fg_expand]: 1.81e-06 [shard_inline]: 1.162e-05 [merge_send_recv]: 6.13998e-06 [auto_parallel]: 6.37001e-06 [parallel]: 5.27999e-06 [flash_sp]: 3.45e-06 [merge_comm]: 4.02e-06 [allreduce_fusion]: 3.93001e-06 [matmul_add_comm_reduction]: 7.05e-06 [allreduce_slice_to_reducescatter]: 4.89992e-07 [virtual_shard_identity]: 9.00001e-06 [virtual_dataset]: 8.05e-06 [get_grad_eliminate_]: 7.65e-06 [virtual_output]: 7.61001e-06 [merge_forward]: 3.64002e-06 [cell_reuse_recompute_pass]: 1.92001e-06 [offload_activation]: 8.15e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.738e-05 [merge_recompute_call_nodes]: 9.00007e-07 [before_grad]: 1.307e-05 [set_forward_comm_id_for_comm_node_pass]: 4.48999e-06 [meta_fg_expand]: 2.81999e-06 [flash_sp_send_recv_attached]: 8.50006e-07 [receive_attached]: 1.00999e-06 [after_resolve]: 1.355e-05 [a_after_grad]: 1.259e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.40999e-06 [auto_monad_grad]: 1.17999e-06 [auto_monad_eliminator]: 8.62e-06 [cse]: 1.969e-05 [a_3]: 6.374e-05 [py_interpret_to_execute_after_opt_a]: 1.364e-05 [slice_cell_reuse_recomputed_activation]: 4.67e-06 [rewriter_after_opt_a]: 4.36e-05 [convert_after_rewriter]: 1.115e-05 [order_py_execute_after_rewriter]: 8.97999e-06 [mutable_eliminate]: 0.00050055 [opt_b]: 0.00033581, [1] [Cycle 1]: 0.00032696, [7] [b_1]: 0.000222 [b_2]: 1.054e-05 [updatestate_depend_eliminate]: 6.53e-06 [updatestate_assign_eliminate]: 3.20998e-06 [updatestate_loads_eliminate]: 3.53e-06 [renormalize]: 7.2e-07 [cse]: 2.368e-05 [optimize_parallel_all_gather_comm]: 2.14e-05 [overlap_param_gather]: 4.40999e-06 [cconv]: 2.887e-05 [loop_unroll]: 0.00044287 [opt_after_cconv]: 0.00014545, [1] [Cycle 1]: 0.00013708, [7] [c_1]: 4.241e-05 [parameter_eliminate]: 3.28e-06 [updatestate_depend_eliminate]: 6.54001e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 3.14001e-06 [cse]: 2.249e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.907e-05 [tuple_transform]: 0.00010499, [1] [Cycle 1]: 9.806e-05, [4] [d_1]: 5.764e-05 [none_parameter_eliminate]: 1.52999e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 9.29998e-06 [partial_unused_args_eliminate]: 4.43001e-06 [add_recomputation]: 5.733e-05 [cse_after_recomputation]: 3.129e-05, [1] [Cycle 1]: 2.458e-05, [1] [cse]: 1.566e-05 [environ_conv]: 9.66e-06 [swap_dp_allreduce_reducescatter]: 8.53001e-06 [bias_add_comm_swap]: 5.10999e-06 [label_micro_interleaved_index]: 7.06001e-06 [label_fine_grained_interleaved_index]: 5.19998e-06 [merge_cast_opt]: 4.03001e-06 [slice_recompute_activation]: 4.92999e-06 [micro_interleaved_order_control]: 4.67e-06 [assign_add_opt]: 3.56001e-06 [ForceFp32Comm]: 3.14999e-06 [remove_cast_before_assign_add]: 3.35e-06 [full_micro_interleaved_order_control]: 4.57e-06 [reorder_send_recv_between_fp_bp]: 5.00999e-06 [comm_op_add_attrs]: 3.58e-06 [add_comm_op_reuse_tag]: 3.25002e-06 [interleave_split_concat_branches]: 3.43e-06 [interleave_parallel_branches]: 3.47997e-06 [overlap_opt_shard_in_pipeline]: 3.6e-06 [overlap_opt_shard_grad_in_pipeline]: 5.05999e-06 [control_data_broadcast_order]: 1.733e-05 [grouped_pairwise_exchange_alltoall]: 4.68999e-06 [offloading_packed_experts]: 6.68e-06 [overlap_recompute_and_grad_model_parallel]: 7.68001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.48e-06 [overlap_recompute_allgather_and_fa_grad]: 3.96001e-06 [overlap_recompute_comm]: 5.00001e-06 [overlap_grad_ring_attention]: 7.17002e-06 [overlap_grad_flash_sp]: 2.376e-05 [begin_end_overlap_inline]: 3.11001e-06 [split_matmul_comm_elemetwise]: 4.68001e-06 [split_layernorm_comm]: 4.27998e-06 [handle_group_info]: 3.22002e-06 [symbol_engine_optimizer]: 0.00010627, [1] [Cycle 1]: 9.964e-05, [6] [build]: 3.48999e-06 [elim_shapecalc]: 1.202e-05 [elim_not_effective]: 1.642e-05 [opt_reshape]: 9.31e-06 [fold_const_symbol]: 1.282e-05 [renormalize]: 2.10013e-07 [detach_backward]: 3.71999e-06 [pipeline_parallel_scheduler]: 1.77999e-06 [auto_monad_reorder]: 2.239e-05 [get_jit_bprop_graph]: 1.37e-06 [rewriter_after_jit_bprop_graph]: 4.03001e-06 [opt_after_jit_grad]: 0.00049662 [validate]: 4.013e-05 Sums bootstrap : 0.000489s : 4.11% type_inference : 0.005878s : 49.44% event_method : 0.000021s : 0.17% auto_monad : 0.000066s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000034s : 0.28% optimize.rewriter_before_opt_a : 0.000097s : 0.82% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000057s : 0.48% optimize.opt_a.loop_unroll : 0.000044s : 0.37% optimize.opt_a.a_1 : 0.001039s : 8.74% optimize.opt_a.with_stream_mark : 0.000028s : 0.23% optimize.opt_a.recompute_prepare : 0.000019s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000266s : 2.24% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000021s : 0.18% optimize.opt_a.merge_send_recv : 0.000016s : 0.13% optimize.opt_a.auto_parallel : 0.000014s : 0.11% optimize.opt_a.parallel : 0.000025s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.17% optimize.opt_a.virtual_dataset : 0.000017s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.13% optimize.opt_a.virtual_output : 0.000016s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000019s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000028s : 0.24% optimize.opt_a.a_after_grad : 0.000026s : 0.22% optimize.opt_a.renormalize : 0.000713s : 6.00% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.22% optimize.opt_a.cse : 0.000056s : 0.47% optimize.opt_a.a_3 : 0.000140s : 1.18% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000044s : 0.37% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000501s : 4.21% optimize.opt_b.b_1 : 0.000222s : 1.87% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000024s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.18% optimize.overlap_param_gather : 0.000004s : 0.04% optimize.cconv : 0.000029s : 0.24% optimize.loop_unroll : 0.000443s : 3.73% optimize.opt_after_cconv.c_1 : 0.000042s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.16% optimize.tuple_transform.d_1 : 0.000058s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000057s : 0.48% optimize.cse_after_recomputation.cse : 0.000016s : 0.13% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000024s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.19% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000497s : 4.18% validate : 0.000040s : 0.34% Time group info: ------[substitution.] 0.000218 44 9.59% : 0.000021s : 3: substitution.cast_eliminate 1.06% : 0.000002s : 3: substitution.elim_not_effective 0.77% : 0.000002s : 3: substitution.fold_const_symbol 3.06% : 0.000007s : 6: substitution.graph_param_transform 67.52% : 0.000147s : 4: substitution.inline 2.04% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.99% : 0.000007s : 6: substitution.remove_not_recompute_node 2.51% : 0.000005s : 6: substitution.replace_old_param 8.13% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.32% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005824 2 87.36% : 0.005088s : 1: type_inference.infer 12.64% : 0.000736s : 1: type_inference.specialize ------[replace.] 0.000072 10 53.27% : 0.000039s : 4: replace.inline 46.73% : 0.000034s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000160 10 90.63% : 0.000145s : 4: match.inline 9.37% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000286 1908 1.01% : 0.000003s : 20: predicate.accumulaten_eliminater 0.59% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.56% : 0.000002s : 12: predicate.addn_check_dump 0.97% : 0.000003s : 20: predicate.addn_zero_filter 0.90% : 0.000003s : 20: predicate.adjust_all_reduce_mul_add 1.98% : 0.000006s : 32: predicate.arithmetic_simplify 1.04% : 0.000003s : 20: predicate.cast_eliminate 0.59% : 0.000002s : 12: predicate.check_bprop_eliminate 0.54% : 0.000002s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.64% : 0.000002s : 12: predicate.depend_value_elim 0.98% : 0.000003s : 20: predicate.dict_get_item_const_eliminator 1.07% : 0.000003s : 20: predicate.dict_get_item_eliminator 0.89% : 0.000003s : 20: predicate.dict_set_item_eliminator 0.90% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 6: predicate.elim_not_effective 0.37% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 26: predicate.environ_add_const_eliminate 1.18% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 26: predicate.environ_get_depend_swap 1.85% : 0.000005s : 38: predicate.environ_get_eliminate 1.18% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.43% : 0.000004s : 30: predicate.exchange_switch_depend_value 2.41% : 0.000007s : 30: predicate.float_depend_g_call 0.56% : 0.000002s : 12: predicate.float_environ_get_switch 0.81% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 6: predicate.fold_const_symbol 0.65% : 0.000002s : 12: predicate.get_grad_eliminate 0.20% : 0.000001s : 6: predicate.graph_param_transform 0.63% : 0.000002s : 12: predicate.incorporate_call 0.51% : 0.000001s : 12: predicate.incorporate_call_switch 6.22% : 0.000018s : 86: predicate.inline 0.75% : 0.000002s : 12: predicate.inline_without_move 0.35% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.72% : 0.000002s : 12: predicate.less_batch_normalization 1.87% : 0.000005s : 38: predicate.list_to_tuple_eliminator_ 2.67% : 0.000008s : 58: predicate.load_eliminater 0.72% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.01% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.58% : 0.000005s : 32: predicate.make_slice_get_slice_eliminator 0.57% : 0.000002s : 12: predicate.merge_addn 0.59% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.63% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 20: predicate.minmaximum_grad 0.76% : 0.000002s : 6: predicate.mutable_eliminate 0.35% : 0.000001s : 6: predicate.opt_reshape 0.38% : 0.000001s : 6: predicate.parallel_virtual_node 1.71% : 0.000005s : 30: predicate.partial_defer_inline 1.82% : 0.000005s : 32: predicate.partial_eliminate 0.96% : 0.000003s : 20: predicate.print_const_string_wrapper 0.58% : 0.000002s : 12: predicate.reduce_all_const_elim 1.23% : 0.000004s : 20: predicate.reduce_eliminate 2.70% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 12: predicate.remove_not_recompute_node 1.61% : 0.000005s : 38: predicate.replace_applicator 0.46% : 0.000001s : 12: predicate.replace_old_param 0.31% : 0.000001s : 6: predicate.reset_defer_inline 0.94% : 0.000003s : 20: predicate.reshape_eliminate 0.63% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 6: predicate.row_tensor_eliminate 0.72% : 0.000002s : 12: predicate.same_eliminate 0.44% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.95% : 0.000003s : 12: predicate.shard_identity_eliminate 0.67% : 0.000002s : 12: predicate.special_op_eliminate 0.73% : 0.000002s : 12: predicate.specialize_transform 0.79% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.51% : 0.000004s : 30: predicate.switch_defer_inline 2.15% : 0.000006s : 42: predicate.switch_layer_defer_inline 4.78% : 0.000014s : 90: predicate.switch_simplify 0.92% : 0.000003s : 20: predicate.tile_eliminate 0.97% : 0.000003s : 20: predicate.transpose_eliminate 1.55% : 0.000004s : 32: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000005s : 32: predicate.tuple_list_get_item_const_eliminator 1.58% : 0.000005s : 32: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000009s : 50: predicate.tuple_list_get_item_eliminator 1.55% : 0.000004s : 32: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000006s : 44: predicate.tuple_list_set_item_eliminator 1.86% : 0.000005s : 38: predicate.tuple_to_list_eliminator_ 2.60% : 0.000007s : 58: predicate.updatestate_pure_node_eliminater 3.33% : 0.000010s : 70: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 6: predicate.value_based_eliminate 0.68% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.69% : 0.000002s : 12: predicate.virtual_output_eliminate 0.29% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000538 11 54.14% : 0.000291s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.86% : 0.000247s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028372 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.93% : 0.003102s : 1: add_attr 10.89% : 0.003089s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000061s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000076s : 1: auto_monad 0.11% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.88% : 0.000534s : 1: bootstrap 0.11% : 0.000032s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000034s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.07% : 0.000019s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.11% : 0.000030s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.58% : 0.000449s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.79% : 0.000507s : 1: mutable_eliminate 0.03% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 5.71% : 0.001621s : 78: opt.transform.opt_a 0.14% : 0.000041s : 1: opt.transform.opt_after_cconv 0.11% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.57% : 0.000161s : 28: opt.transform.opt_b 0.23% : 0.000065s : 2: opt.transform.opt_trans_graph 0.17% : 0.000047s : 4: opt.transform.symbol_engine_opt 12.49% : 0.003543s : 1: opt_a 0.53% : 0.000149s : 1: opt_after_cconv 1.79% : 0.000507s : 1: opt_after_jit_grad 1.20% : 0.000340s : 1: opt_b 22.33% : 0.006335s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.09% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000007s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000041s : 1: pre_auto_parallel 0.13% : 0.000038s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.28% : 0.000364s : 1: renormalize.infer 1.20% : 0.000341s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000047s : 1: rewriter_after_opt_a 0.35% : 0.000101s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000109s : 1: symbol_engine_optimizer 0.38% : 0.000108s : 1: tuple_transform 20.84% : 0.005913s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:16.360.596 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0151642, [21] [bootstrap]: 0.00043953 [type_inference]: 0.00577015 [event_method]: 1.972e-05 [auto_monad]: 5.998e-05 [graph_reusing]: 5.46002e-06 [inline]: 1.75001e-06 [add_attr]: 0.0030076, [1] [add_attr_with_inline]: 0.00299931, [1] [Cycle 1]: 5.631e-05, [2] [tag_attr]: 1.946e-05 [meta_addattr_fg_expand]: 6.75998e-06 [parallel-infer-symbol]: 2.91e-06 [pre_auto_parallel]: 3.368e-05 [insert-virtual-dataset]: 2.42001e-06 [parallel-infer-symbol-second]: 8.50006e-07 [dataset_repeat_opt]: 2.29999e-06 [pipeline_split]: 1.74e-06 [optimize]: 0.00514985, [53] [py_interpret_to_execute]: 2.658e-05 [rewriter_before_opt_a]: 9.161e-05 [opt_a]: 0.0030561, [2] [Cycle 1]: 0.002246, [45] [expand_dump_flag]: 3.35e-06 [switch_simplify]: 4.762e-05 [loop_unroll]: 3.52e-05 [a_1]: 0.00083162 [with_stream_mark]: 1.54e-05 [recompute_prepare]: 1.087e-05 [updatestate_depend_eliminate]: 4.82e-06 [updatestate_assign_eliminate]: 3.87002e-06 [updatestate_loads_eliminate]: 3.53999e-06 [parameter_eliminate]: 1.83002e-06 [a_2]: 0.00011069 [accelerated_algorithm]: 9.41998e-06 [shard]: 1.96998e-06 [meta_shard_fg_expand]: 2.11e-06 [shard_inline]: 8.46002e-06 [merge_send_recv]: 8.94e-06 [auto_parallel]: 6.86999e-06 [parallel]: 1.86e-05 [flash_sp]: 7.41001e-06 [merge_comm]: 4.85001e-06 [allreduce_fusion]: 4.52e-06 [matmul_add_comm_reduction]: 1.069e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 9.98002e-06 [virtual_dataset]: 8.57e-06 [get_grad_eliminate_]: 8.05e-06 [virtual_output]: 8.49002e-06 [merge_forward]: 4.75001e-06 [cell_reuse_recompute_pass]: 1.25001e-06 [offload_activation]: 1.019e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.546e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.338e-05 [set_forward_comm_id_for_comm_node_pass]: 4.76002e-06 [meta_fg_expand]: 3.48e-06 [flash_sp_send_recv_attached]: 2.74001e-06 [receive_attached]: 2.46998e-06 [after_resolve]: 1.531e-05 [a_after_grad]: 1.318e-05 [renormalize]: 0.00061193 [add_forward_monad_depend]: 4.99e-06 [auto_monad_grad]: 2.02999e-06 [auto_monad_eliminator]: 1.636e-05 [cse]: 3.499e-05 [a_3]: 6.131e-05 [Cycle 2]: 0.00080084, [45] [expand_dump_flag]: 1.10001e-06 [switch_simplify]: 9.90002e-06 [loop_unroll]: 8.47998e-06 [a_1]: 0.00020212 [with_stream_mark]: 1.134e-05 [recompute_prepare]: 8.46002e-06 [updatestate_depend_eliminate]: 4.25e-06 [updatestate_assign_eliminate]: 2.98e-06 [updatestate_loads_eliminate]: 2.99001e-06 [parameter_eliminate]: 1.29e-06 [a_2]: 0.00010204 [accelerated_algorithm]: 7.96001e-06 [shard]: 1.09e-06 [meta_shard_fg_expand]: 1.67001e-06 [shard_inline]: 7.97998e-06 [merge_send_recv]: 5.57001e-06 [auto_parallel]: 6.00002e-06 [parallel]: 4.42e-06 [flash_sp]: 3.21001e-06 [merge_comm]: 4.30999e-06 [allreduce_fusion]: 3.95e-06 [matmul_add_comm_reduction]: 6.05002e-06 [allreduce_slice_to_reducescatter]: 3.7998e-07 [virtual_shard_identity]: 9.77999e-06 [virtual_dataset]: 7.92e-06 [get_grad_eliminate_]: 7.52998e-06 [virtual_output]: 7.65e-06 [merge_forward]: 3.87998e-06 [cell_reuse_recompute_pass]: 1.52001e-06 [offload_activation]: 7.21999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.434e-05 [merge_recompute_call_nodes]: 7.50006e-07 [before_grad]: 1.23e-05 [set_forward_comm_id_for_comm_node_pass]: 3.97e-06 [meta_fg_expand]: 2.78e-06 [flash_sp_send_recv_attached]: 9.00007e-07 [receive_attached]: 1.00999e-06 [after_resolve]: 1.394e-05 [a_after_grad]: 1.244e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.22999e-06 [auto_monad_grad]: 9.89996e-07 [auto_monad_eliminator]: 8.3e-06 [cse]: 1.748e-05 [a_3]: 4.994e-05 [py_interpret_to_execute_after_opt_a]: 9.44998e-06 [slice_cell_reuse_recomputed_activation]: 1.93997e-06 [rewriter_after_opt_a]: 3.868e-05 [convert_after_rewriter]: 7.55998e-06 [order_py_execute_after_rewriter]: 5.97001e-06 [mutable_eliminate]: 0.00047841 [opt_b]: 0.00026529, [1] [Cycle 1]: 0.00025943, [7] [b_1]: 0.00017866 [b_2]: 9.82999e-06 [updatestate_depend_eliminate]: 5.81e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 3.32002e-06 [renormalize]: 3.39991e-07 [cse]: 2.22e-05 [optimize_parallel_all_gather_comm]: 1.734e-05 [overlap_param_gather]: 1.84998e-06 [cconv]: 2.216e-05 [loop_unroll]: 0.00042099 [opt_after_cconv]: 0.0001188, [1] [Cycle 1]: 0.00011321, [7] [c_1]: 4.206e-05 [parameter_eliminate]: 2.74999e-06 [updatestate_depend_eliminate]: 5.84999e-06 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 3.01999e-06 [cse]: 2.177e-05 [renormalize]: 5.89993e-07 [remove_dup_value]: 1.448e-05 [tuple_transform]: 9.094e-05, [1] [Cycle 1]: 8.68e-05, [4] [d_1]: 5.767e-05 [none_parameter_eliminate]: 1.70001e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 9.05001e-06 [partial_unused_args_eliminate]: 2.10002e-06 [add_recomputation]: 5.492e-05 [cse_after_recomputation]: 2.561e-05, [1] [Cycle 1]: 2.1e-05, [1] [cse]: 1.543e-05 [environ_conv]: 6.38e-06 [swap_dp_allreduce_reducescatter]: 6.23e-06 [bias_add_comm_swap]: 2.12001e-06 [label_micro_interleaved_index]: 4.35999e-06 [label_fine_grained_interleaved_index]: 2.96001e-06 [merge_cast_opt]: 1.50001e-06 [slice_recompute_activation]: 1.91e-06 [micro_interleaved_order_control]: 2.52001e-06 [assign_add_opt]: 1.54e-06 [ForceFp32Comm]: 8.39995e-07 [remove_cast_before_assign_add]: 1.30001e-06 [full_micro_interleaved_order_control]: 2.04e-06 [reorder_send_recv_between_fp_bp]: 2.80002e-06 [comm_op_add_attrs]: 1.09e-06 [add_comm_op_reuse_tag]: 9.29984e-07 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.09e-06 [overlap_opt_shard_in_pipeline]: 1.09e-06 [overlap_opt_shard_grad_in_pipeline]: 1.79998e-06 [control_data_broadcast_order]: 1.513e-05 [grouped_pairwise_exchange_alltoall]: 1.59998e-06 [offloading_packed_experts]: 4.17e-06 [overlap_recompute_and_grad_model_parallel]: 5.30001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.54998e-06 [overlap_recompute_comm]: 1.99999e-06 [overlap_grad_ring_attention]: 4.48001e-06 [overlap_grad_flash_sp]: 2.117e-05 [begin_end_overlap_inline]: 8.49977e-07 [split_matmul_comm_elemetwise]: 2.25002e-06 [split_layernorm_comm]: 1.49998e-06 [handle_group_info]: 9.00007e-07 [symbol_engine_optimizer]: 8.29e-05, [1] [Cycle 1]: 7.822e-05, [6] [build]: 2.76999e-06 [elim_shapecalc]: 1.101e-05 [elim_not_effective]: 1.54e-05 [opt_reshape]: 8.62e-06 [fold_const_symbol]: 1.27e-05 [renormalize]: 2.30008e-07 [detach_backward]: 1.74998e-06 [pipeline_parallel_scheduler]: 1.47999e-06 [auto_monad_reorder]: 1.98e-05 [get_jit_bprop_graph]: 1.16002e-06 [rewriter_after_jit_bprop_graph]: 4.08999e-06 [opt_after_jit_grad]: 0.00045619 [validate]: 3.959e-05 Sums bootstrap : 0.000440s : 3.91% type_inference : 0.005770s : 51.38% event_method : 0.000020s : 0.18% auto_monad : 0.000060s : 0.53% graph_reusing : 0.000005s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000027s : 0.24% optimize.rewriter_before_opt_a : 0.000092s : 0.82% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000058s : 0.51% optimize.opt_a.loop_unroll : 0.000044s : 0.39% optimize.opt_a.a_1 : 0.001034s : 9.21% optimize.opt_a.with_stream_mark : 0.000027s : 0.24% optimize.opt_a.recompute_prepare : 0.000019s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000213s : 1.89% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.15% optimize.opt_a.merge_send_recv : 0.000015s : 0.13% optimize.opt_a.auto_parallel : 0.000013s : 0.11% optimize.opt_a.parallel : 0.000023s : 0.20% optimize.opt_a.flash_sp : 0.000011s : 0.09% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000008s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.18% optimize.opt_a.virtual_dataset : 0.000016s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.14% optimize.opt_a.virtual_output : 0.000016s : 0.14% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.26% optimize.opt_a.a_after_grad : 0.000026s : 0.23% optimize.opt_a.renormalize : 0.000612s : 5.45% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.22% optimize.opt_a.cse : 0.000052s : 0.47% optimize.opt_a.a_3 : 0.000111s : 0.99% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000039s : 0.34% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000478s : 4.26% optimize.opt_b.b_1 : 0.000179s : 1.59% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000022s : 0.20% optimize.loop_unroll : 0.000421s : 3.75% optimize.opt_after_cconv.c_1 : 0.000042s : 0.37% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.19% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000014s : 0.13% optimize.tuple_transform.d_1 : 0.000058s : 0.51% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000055s : 0.49% optimize.cse_after_recomputation.cse : 0.000015s : 0.14% optimize.environ_conv : 0.000006s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.06% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.03% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000001s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000020s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000456s : 4.06% validate : 0.000040s : 0.35% Time group info: ------[substitution.] 0.000216 44 9.42% : 0.000020s : 3: substitution.cast_eliminate 1.03% : 0.000002s : 3: substitution.elim_not_effective 0.83% : 0.000002s : 3: substitution.fold_const_symbol 3.20% : 0.000007s : 6: substitution.graph_param_transform 67.48% : 0.000146s : 4: substitution.inline 1.96% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.72% : 0.000006s : 6: substitution.remove_not_recompute_node 2.63% : 0.000006s : 6: substitution.replace_old_param 8.22% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.48% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005712 2 87.36% : 0.004990s : 1: type_inference.infer 12.64% : 0.000722s : 1: type_inference.specialize ------[replace.] 0.000070 10 52.69% : 0.000037s : 4: replace.inline 47.31% : 0.000033s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000158 10 90.57% : 0.000143s : 4: match.inline 9.43% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000282 1908 0.96% : 0.000003s : 20: predicate.accumulaten_eliminater 0.60% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.54% : 0.000002s : 12: predicate.addn_check_dump 0.93% : 0.000003s : 20: predicate.addn_zero_filter 0.90% : 0.000003s : 20: predicate.adjust_all_reduce_mul_add 2.10% : 0.000006s : 32: predicate.arithmetic_simplify 1.03% : 0.000003s : 20: predicate.cast_eliminate 0.59% : 0.000002s : 12: predicate.check_bprop_eliminate 0.56% : 0.000002s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.56% : 0.000002s : 12: predicate.depend_value_elim 1.02% : 0.000003s : 20: predicate.dict_get_item_const_eliminator 1.08% : 0.000003s : 20: predicate.dict_get_item_eliminator 0.96% : 0.000003s : 20: predicate.dict_set_item_eliminator 0.89% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 6: predicate.elim_not_effective 0.34% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000003s : 26: predicate.environ_add_const_eliminate 1.15% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.22% : 0.000003s : 26: predicate.environ_get_depend_swap 1.82% : 0.000005s : 38: predicate.environ_get_eliminate 1.16% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.51% : 0.000004s : 30: predicate.exchange_switch_depend_value 2.34% : 0.000007s : 30: predicate.float_depend_g_call 0.54% : 0.000002s : 12: predicate.float_environ_get_switch 0.81% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 6: predicate.fold_const_symbol 0.63% : 0.000002s : 12: predicate.get_grad_eliminate 0.21% : 0.000001s : 6: predicate.graph_param_transform 0.60% : 0.000002s : 12: predicate.incorporate_call 0.52% : 0.000001s : 12: predicate.incorporate_call_switch 6.10% : 0.000017s : 86: predicate.inline 0.78% : 0.000002s : 12: predicate.inline_without_move 0.36% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.70% : 0.000002s : 12: predicate.less_batch_normalization 1.90% : 0.000005s : 38: predicate.list_to_tuple_eliminator_ 2.73% : 0.000008s : 58: predicate.load_eliminater 0.67% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.06% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.66% : 0.000005s : 32: predicate.make_slice_get_slice_eliminator 0.57% : 0.000002s : 12: predicate.merge_addn 0.59% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.58% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.89% : 0.000003s : 20: predicate.minmaximum_grad 0.74% : 0.000002s : 6: predicate.mutable_eliminate 0.31% : 0.000001s : 6: predicate.opt_reshape 0.34% : 0.000001s : 6: predicate.parallel_virtual_node 1.69% : 0.000005s : 30: predicate.partial_defer_inline 1.84% : 0.000005s : 32: predicate.partial_eliminate 0.94% : 0.000003s : 20: predicate.print_const_string_wrapper 0.57% : 0.000002s : 12: predicate.reduce_all_const_elim 1.37% : 0.000004s : 20: predicate.reduce_eliminate 2.70% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 12: predicate.remove_not_recompute_node 1.45% : 0.000004s : 38: predicate.replace_applicator 0.50% : 0.000001s : 12: predicate.replace_old_param 0.26% : 0.000001s : 6: predicate.reset_defer_inline 0.99% : 0.000003s : 20: predicate.reshape_eliminate 0.61% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 6: predicate.row_tensor_eliminate 0.75% : 0.000002s : 12: predicate.same_eliminate 0.44% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.69% : 0.000002s : 12: predicate.shard_identity_eliminate 0.72% : 0.000002s : 12: predicate.special_op_eliminate 0.69% : 0.000002s : 12: predicate.specialize_transform 0.76% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.57% : 0.000004s : 30: predicate.switch_defer_inline 2.14% : 0.000006s : 42: predicate.switch_layer_defer_inline 4.88% : 0.000014s : 90: predicate.switch_simplify 0.94% : 0.000003s : 20: predicate.tile_eliminate 0.97% : 0.000003s : 20: predicate.transpose_eliminate 1.62% : 0.000005s : 32: predicate.tuple_list_convert_item_index_to_positive 1.74% : 0.000005s : 32: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000004s : 32: predicate.tuple_list_get_item_depend_reorder 3.31% : 0.000009s : 50: predicate.tuple_list_get_item_eliminator 1.62% : 0.000005s : 32: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000006s : 44: predicate.tuple_list_set_item_eliminator 1.92% : 0.000005s : 38: predicate.tuple_to_list_eliminator_ 2.69% : 0.000008s : 58: predicate.updatestate_pure_node_eliminater 3.32% : 0.000009s : 70: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 6: predicate.value_based_eliminate 0.66% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.61% : 0.000002s : 12: predicate.virtual_output_eliminate 0.29% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000520 11 54.07% : 0.000281s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.93% : 0.000239s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.025783 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.68% : 0.003012s : 1: add_attr 11.65% : 0.003003s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000059s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000066s : 1: auto_monad 0.09% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.81% : 0.000466s : 1: bootstrap 0.10% : 0.000026s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.66% : 0.000429s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.89% : 0.000486s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 6.22% : 0.001603s : 78: opt.transform.opt_a 0.16% : 0.000041s : 1: opt.transform.opt_after_cconv 0.12% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.61% : 0.000157s : 28: opt.transform.opt_b 0.25% : 0.000065s : 2: opt.transform.opt_trans_graph 0.17% : 0.000044s : 4: opt.transform.symbol_engine_opt 11.86% : 0.003059s : 1: opt_a 0.47% : 0.000122s : 1: opt_after_cconv 1.80% : 0.000465s : 1: opt_after_jit_grad 1.04% : 0.000269s : 1: opt_b 19.99% : 0.005154s : 1: optimize 0.08% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000009s : 1: order_py_execute_after_rewriter 0.10% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000038s : 1: pre_auto_parallel 0.12% : 0.000031s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000018s : 1: remove_dup_value 1.24% : 0.000319s : 1: renormalize.infer 1.11% : 0.000286s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000043s : 1: rewriter_after_opt_a 0.37% : 0.000096s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000086s : 1: symbol_engine_optimizer 0.36% : 0.000094s : 1: tuple_transform 22.44% : 0.005785s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:16.548.664 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:16.548.910 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0161438, [21] [bootstrap]: 0.00044074 [type_inference]: 0.00571885 [event_method]: 2.04e-05 [auto_monad]: 5.86e-05 [graph_reusing]: 5.60001e-06 [inline]: 2.12999e-06 [add_attr]: 0.0031319, [1] [add_attr_with_inline]: 0.0031227, [1] [Cycle 1]: 7.54e-05, [2] [tag_attr]: 2.14e-05 [meta_addattr_fg_expand]: 6.23e-06 [parallel-infer-symbol]: 3.18e-06 [pre_auto_parallel]: 3.462e-05 [insert-virtual-dataset]: 2.60997e-06 [parallel-infer-symbol-second]: 6.69999e-07 [dataset_repeat_opt]: 1.84e-06 [pipeline_split]: 1.92999e-06 [optimize]: 0.00563081, [53] [py_interpret_to_execute]: 3.972e-05 [rewriter_before_opt_a]: 9.631e-05 [opt_a]: 0.00324155, [2] [Cycle 1]: 0.0023575, [45] [expand_dump_flag]: 2.92002e-06 [switch_simplify]: 4.817e-05 [loop_unroll]: 3.496e-05 [a_1]: 0.00076479 [with_stream_mark]: 1.67e-05 [recompute_prepare]: 1.004e-05 [updatestate_depend_eliminate]: 4.08999e-06 [updatestate_assign_eliminate]: 3.71999e-06 [updatestate_loads_eliminate]: 2.69001e-06 [parameter_eliminate]: 1.87999e-06 [a_2]: 0.0001189 [accelerated_algorithm]: 7.86001e-06 [shard]: 1.96e-06 [meta_shard_fg_expand]: 2.02001e-06 [shard_inline]: 7.5e-06 [merge_send_recv]: 8.94998e-06 [auto_parallel]: 7.03e-06 [parallel]: 1.904e-05 [flash_sp]: 8.08999e-06 [merge_comm]: 4.67e-06 [allreduce_fusion]: 3.51001e-06 [matmul_add_comm_reduction]: 1.019e-05 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 9.33002e-06 [virtual_dataset]: 7.47998e-06 [get_grad_eliminate_]: 7.10998e-06 [virtual_output]: 7.53e-06 [merge_forward]: 3.66001e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 9.87999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.465e-05 [merge_recompute_call_nodes]: 1.81998e-06 [before_grad]: 1.169e-05 [set_forward_comm_id_for_comm_node_pass]: 3.6e-06 [meta_fg_expand]: 3.04999e-06 [flash_sp_send_recv_attached]: 2.86e-06 [receive_attached]: 2.46e-06 [after_resolve]: 1.386e-05 [a_after_grad]: 1.135e-05 [renormalize]: 0.00065024 [add_forward_monad_depend]: 5.87001e-06 [auto_monad_grad]: 2.59999e-06 [auto_monad_eliminator]: 1.514e-05 [cse]: 3.213e-05 [a_3]: 6.502e-05 [Cycle 2]: 0.00087096, [45] [expand_dump_flag]: 9.99979e-07 [switch_simplify]: 8.78001e-06 [loop_unroll]: 7.53e-06 [a_1]: 0.00015808 [with_stream_mark]: 1.154e-05 [recompute_prepare]: 7.13e-06 [updatestate_depend_eliminate]: 3.13e-06 [updatestate_assign_eliminate]: 2.88998e-06 [updatestate_loads_eliminate]: 2.43e-06 [parameter_eliminate]: 1.09e-06 [a_2]: 0.00010781 [accelerated_algorithm]: 6.87002e-06 [shard]: 1.62999e-06 [meta_shard_fg_expand]: 1.47001e-06 [shard_inline]: 6.78998e-06 [merge_send_recv]: 5.12e-06 [auto_parallel]: 5.66e-06 [parallel]: 4.47e-06 [flash_sp]: 3.31001e-06 [merge_comm]: 3.32997e-06 [allreduce_fusion]: 3.19001e-06 [matmul_add_comm_reduction]: 5.81e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 7.23999e-06 [virtual_dataset]: 6.59999e-06 [get_grad_eliminate_]: 6.32001e-06 [virtual_output]: 6.38003e-06 [merge_forward]: 2.71e-06 [cell_reuse_recompute_pass]: 1.71e-06 [offload_activation]: 6.44001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.601e-05 [merge_recompute_call_nodes]: 8.60018e-07 [before_grad]: 1.036e-05 [set_forward_comm_id_for_comm_node_pass]: 3.64002e-06 [meta_fg_expand]: 2.29001e-06 [flash_sp_send_recv_attached]: 1.44e-06 [receive_attached]: 1.07e-06 [after_resolve]: 1.253e-05 [a_after_grad]: 1.095e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.24e-06 [auto_monad_grad]: 9.5999e-07 [auto_monad_eliminator]: 7.55998e-06 [cse]: 1.506e-05 [a_3]: 5.359e-05 [py_interpret_to_execute_after_opt_a]: 1.33e-05 [slice_cell_reuse_recomputed_activation]: 4.57e-06 [rewriter_after_opt_a]: 3.964e-05 [convert_after_rewriter]: 9.82001e-06 [order_py_execute_after_rewriter]: 7.7e-06 [mutable_eliminate]: 0.00053154 [opt_b]: 0.00029688, [1] [Cycle 1]: 0.00028795, [7] [b_1]: 0.00019063 [b_2]: 9.25999e-06 [updatestate_depend_eliminate]: 5.46e-06 [updatestate_assign_eliminate]: 2.83998e-06 [updatestate_loads_eliminate]: 2.81999e-06 [renormalize]: 4.89992e-07 [cse]: 2.018e-05 [optimize_parallel_all_gather_comm]: 1.975e-05 [overlap_param_gather]: 4.43999e-06 [cconv]: 2.814e-05 [loop_unroll]: 0.00048494 [opt_after_cconv]: 0.00013223, [1] [Cycle 1]: 0.00012355, [7] [c_1]: 3.566e-05 [parameter_eliminate]: 3.28e-06 [updatestate_depend_eliminate]: 5.50001e-06 [updatestate_assign_eliminate]: 2.56998e-06 [updatestate_loads_eliminate]: 2.59001e-06 [cse]: 1.826e-05 [renormalize]: 1.04e-06 [remove_dup_value]: 1.742e-05 [tuple_transform]: 9.644e-05, [1] [Cycle 1]: 8.953e-05, [4] [d_1]: 4.962e-05 [none_parameter_eliminate]: 1.87999e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 7.69002e-06 [partial_unused_args_eliminate]: 4.62998e-06 [add_recomputation]: 4.792e-05 [cse_after_recomputation]: 2.71e-05, [1] [Cycle 1]: 2.034e-05, [1] [cse]: 1.145e-05 [environ_conv]: 8.79998e-06 [swap_dp_allreduce_reducescatter]: 7.97998e-06 [bias_add_comm_swap]: 5.30999e-06 [label_micro_interleaved_index]: 6.58e-06 [label_fine_grained_interleaved_index]: 5.25999e-06 [merge_cast_opt]: 4e-06 [slice_recompute_activation]: 4.36002e-06 [micro_interleaved_order_control]: 5.00999e-06 [assign_add_opt]: 3.51001e-06 [ForceFp32Comm]: 3.31001e-06 [remove_cast_before_assign_add]: 3.25998e-06 [full_micro_interleaved_order_control]: 4.68001e-06 [reorder_send_recv_between_fp_bp]: 5.34998e-06 [comm_op_add_attrs]: 3.66999e-06 [add_comm_op_reuse_tag]: 3.21001e-06 [interleave_split_concat_branches]: 3.45998e-06 [interleave_parallel_branches]: 3.39001e-06 [overlap_opt_shard_in_pipeline]: 3.58e-06 [overlap_opt_shard_grad_in_pipeline]: 4.32e-06 [control_data_broadcast_order]: 1.558e-05 [grouped_pairwise_exchange_alltoall]: 4.06001e-06 [offloading_packed_experts]: 6.71999e-06 [overlap_recompute_and_grad_model_parallel]: 6.92002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.71999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.66999e-06 [overlap_recompute_comm]: 5.44e-06 [overlap_grad_ring_attention]: 6.48e-06 [overlap_grad_flash_sp]: 2.154e-05 [begin_end_overlap_inline]: 3.21999e-06 [split_matmul_comm_elemetwise]: 4.63999e-06 [split_layernorm_comm]: 4.22e-06 [handle_group_info]: 3.29001e-06 [symbol_engine_optimizer]: 9.758e-05, [1] [Cycle 1]: 9.084e-05, [6] [build]: 3.11999e-06 [elim_shapecalc]: 1.042e-05 [elim_not_effective]: 1.363e-05 [opt_reshape]: 7.68999e-06 [fold_const_symbol]: 1.092e-05 [renormalize]: 2.30008e-07 [detach_backward]: 3.11999e-06 [pipeline_parallel_scheduler]: 1.56998e-06 [auto_monad_reorder]: 1.988e-05 [get_jit_bprop_graph]: 1.41002e-06 [rewriter_after_jit_bprop_graph]: 4.03999e-06 [opt_after_jit_grad]: 0.00048576 [validate]: 3.681e-05 Sums bootstrap : 0.000441s : 3.89% type_inference : 0.005719s : 50.50% event_method : 0.000020s : 0.18% auto_monad : 0.000059s : 0.52% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000035s : 0.31% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000040s : 0.35% optimize.rewriter_before_opt_a : 0.000096s : 0.85% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000057s : 0.50% optimize.opt_a.loop_unroll : 0.000042s : 0.38% optimize.opt_a.a_1 : 0.000923s : 8.15% optimize.opt_a.with_stream_mark : 0.000028s : 0.25% optimize.opt_a.recompute_prepare : 0.000017s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000227s : 2.00% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.12% optimize.opt_a.auto_parallel : 0.000013s : 0.11% optimize.opt_a.parallel : 0.000024s : 0.21% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.15% optimize.opt_a.virtual_dataset : 0.000014s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.12% optimize.opt_a.virtual_output : 0.000014s : 0.12% optimize.opt_a.merge_forward : 0.000006s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.23% optimize.opt_a.a_after_grad : 0.000022s : 0.20% optimize.opt_a.renormalize : 0.000650s : 5.74% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.20% optimize.opt_a.cse : 0.000047s : 0.42% optimize.opt_a.a_3 : 0.000119s : 1.05% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000040s : 0.35% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000532s : 4.69% optimize.opt_b.b_1 : 0.000191s : 1.68% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.17% optimize.overlap_param_gather : 0.000004s : 0.04% optimize.cconv : 0.000028s : 0.25% optimize.loop_unroll : 0.000485s : 4.28% optimize.opt_after_cconv.c_1 : 0.000036s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.16% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000017s : 0.15% optimize.tuple_transform.d_1 : 0.000050s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000048s : 0.42% optimize.cse_after_recomputation.cse : 0.000011s : 0.10% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.05% optimize.merge_cast_opt : 0.000004s : 0.04% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000016s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000006s : 0.06% optimize.overlap_grad_flash_sp : 0.000022s : 0.19% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000486s : 4.29% validate : 0.000037s : 0.33% Time group info: ------[substitution.] 0.000235 34 0.82% : 0.000002s : 2: substitution.elim_not_effective 0.61% : 0.000001s : 2: substitution.fold_const_symbol 2.72% : 0.000006s : 5: substitution.graph_param_transform 78.99% : 0.000185s : 4: substitution.inline 1.67% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.03% : 0.000005s : 4: substitution.remove_not_recompute_node 2.47% : 0.000006s : 6: substitution.replace_old_param 8.27% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.42% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005668 2 87.10% : 0.004937s : 1: type_inference.infer 12.90% : 0.000731s : 1: type_inference.specialize ------[replace.] 0.000073 10 55.71% : 0.000040s : 4: replace.inline 44.29% : 0.000032s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000199 10 91.64% : 0.000183s : 4: match.inline 8.36% : 0.000017s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000234 1590 0.95% : 0.000002s : 16: predicate.accumulaten_eliminater 0.82% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 10: predicate.addn_check_dump 0.86% : 0.000002s : 16: predicate.addn_zero_filter 0.88% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 1.99% : 0.000005s : 26: predicate.arithmetic_simplify 0.94% : 0.000002s : 16: predicate.cast_eliminate 0.63% : 0.000001s : 10: predicate.check_bprop_eliminate 0.51% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.64% : 0.000002s : 10: predicate.depend_value_elim 0.94% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.07% : 0.000002s : 16: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.92% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 5: predicate.elim_not_effective 0.38% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 21: predicate.environ_get_depend_swap 1.80% : 0.000004s : 31: predicate.environ_get_eliminate 1.12% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.53% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.44% : 0.000006s : 26: predicate.float_depend_g_call 0.53% : 0.000001s : 10: predicate.float_environ_get_switch 0.76% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.78% : 0.000002s : 10: predicate.get_grad_eliminate 0.26% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000001s : 10: predicate.incorporate_call 0.47% : 0.000001s : 10: predicate.incorporate_call_switch 6.08% : 0.000014s : 72: predicate.inline 0.77% : 0.000002s : 10: predicate.inline_without_move 0.35% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.82% : 0.000002s : 10: predicate.less_batch_normalization 1.87% : 0.000004s : 32: predicate.list_to_tuple_eliminator_ 2.57% : 0.000006s : 48: predicate.load_eliminater 0.79% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.43% : 0.000006s : 40: predicate.loop_unroll_before_grad 1.66% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 10: predicate.merge_addn 0.57% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 16: predicate.minmaximum_grad 1.01% : 0.000002s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.89% : 0.000004s : 26: predicate.partial_defer_inline 1.76% : 0.000004s : 27: predicate.partial_eliminate 0.96% : 0.000002s : 16: predicate.print_const_string_wrapper 0.57% : 0.000001s : 10: predicate.reduce_all_const_elim 1.12% : 0.000003s : 16: predicate.reduce_eliminate 2.57% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 10: predicate.remove_not_recompute_node 1.53% : 0.000004s : 32: predicate.replace_applicator 0.44% : 0.000001s : 10: predicate.replace_old_param 0.26% : 0.000001s : 5: predicate.reset_defer_inline 1.03% : 0.000002s : 16: predicate.reshape_eliminate 0.56% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 5: predicate.row_tensor_eliminate 0.87% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.77% : 0.000002s : 10: predicate.shard_identity_eliminate 0.64% : 0.000001s : 10: predicate.special_op_eliminate 0.73% : 0.000002s : 10: predicate.specialize_transform 0.72% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.73% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.63% : 0.000004s : 26: predicate.switch_defer_inline 2.20% : 0.000005s : 36: predicate.switch_layer_defer_inline 5.16% : 0.000012s : 81: predicate.switch_simplify 0.86% : 0.000002s : 16: predicate.tile_eliminate 1.01% : 0.000002s : 16: predicate.transpose_eliminate 1.53% : 0.000004s : 26: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000003s : 26: predicate.tuple_list_get_item_depend_reorder 3.35% : 0.000008s : 42: predicate.tuple_list_get_item_eliminator 1.52% : 0.000004s : 26: predicate.tuple_list_get_set_item_eliminator 2.11% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.83% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.57% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.23% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 5: predicate.value_based_eliminate 0.63% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000537 11 53.76% : 0.000289s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.24% : 0.000248s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027147 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.57% : 0.003142s : 1: add_attr 11.52% : 0.003126s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.19% : 0.000052s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000067s : 1: auto_monad 0.10% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.76% : 0.000479s : 1: bootstrap 0.12% : 0.000031s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.11% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000018s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000030s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.81% : 0.000491s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000008s : 1: micro_interleaved_order_control 1.98% : 0.000538s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.05% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 5.17% : 0.001403s : 78: opt.transform.opt_a 0.13% : 0.000034s : 1: opt.transform.opt_after_cconv 0.10% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.47% : 0.000128s : 28: opt.transform.opt_b 0.20% : 0.000055s : 2: opt.transform.opt_trans_graph 0.14% : 0.000039s : 4: opt.transform.symbol_engine_opt 11.95% : 0.003245s : 1: opt_a 0.50% : 0.000136s : 1: opt_after_cconv 1.83% : 0.000496s : 1: opt_after_jit_grad 1.11% : 0.000300s : 1: opt_b 21.87% : 0.005937s : 1: optimize 0.09% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000025s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000042s : 1: pre_auto_parallel 0.16% : 0.000044s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000021s : 1: remove_dup_value 1.23% : 0.000335s : 1: renormalize.infer 1.13% : 0.000308s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000043s : 1: rewriter_after_opt_a 0.37% : 0.000100s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.37% : 0.000100s : 1: symbol_engine_optimizer 0.37% : 0.000099s : 1: tuple_transform 21.19% : 0.005752s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:16.742.110 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0148347, [21] [bootstrap]: 0.00043747 [type_inference]: 0.00566227 [event_method]: 1.987e-05 [auto_monad]: 6.104e-05 [graph_reusing]: 6.04999e-06 [inline]: 2.25002e-06 [add_attr]: 0.00308245, [1] [add_attr_with_inline]: 0.00307383, [1] [Cycle 1]: 5.968e-05, [2] [tag_attr]: 2.02e-05 [meta_addattr_fg_expand]: 6.66999e-06 [parallel-infer-symbol]: 2.99001e-06 [pre_auto_parallel]: 3.313e-05 [insert-virtual-dataset]: 2.66e-06 [parallel-infer-symbol-second]: 8.10018e-07 [dataset_repeat_opt]: 2.27999e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.00485119, [53] [py_interpret_to_execute]: 2.734e-05 [rewriter_before_opt_a]: 8.759e-05 [opt_a]: 0.00282121, [2] [Cycle 1]: 0.00210213, [45] [expand_dump_flag]: 3.08e-06 [switch_simplify]: 4.621e-05 [loop_unroll]: 3.431e-05 [a_1]: 0.00071524 [with_stream_mark]: 1.534e-05 [recompute_prepare]: 9.07999e-06 [updatestate_depend_eliminate]: 4.32e-06 [updatestate_assign_eliminate]: 3.13998e-06 [updatestate_loads_eliminate]: 2.71e-06 [parameter_eliminate]: 1.94e-06 [a_2]: 8.971e-05 [accelerated_algorithm]: 7.67998e-06 [shard]: 1.84998e-06 [meta_shard_fg_expand]: 1.81e-06 [shard_inline]: 7.38999e-06 [merge_send_recv]: 8.47e-06 [auto_parallel]: 5.99e-06 [parallel]: 1.762e-05 [flash_sp]: 7.95e-06 [merge_comm]: 4.09997e-06 [allreduce_fusion]: 3.34001e-06 [matmul_add_comm_reduction]: 9.37001e-06 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 9.07999e-06 [virtual_dataset]: 7.2e-06 [get_grad_eliminate_]: 6.86001e-06 [virtual_output]: 7.07002e-06 [merge_forward]: 3.66999e-06 [cell_reuse_recompute_pass]: 1.02998e-06 [offload_activation]: 9.47001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.299e-05 [merge_recompute_call_nodes]: 1.41998e-06 [before_grad]: 1.076e-05 [set_forward_comm_id_for_comm_node_pass]: 3.58e-06 [meta_fg_expand]: 2.82002e-06 [flash_sp_send_recv_attached]: 2.51e-06 [receive_attached]: 2.32999e-06 [after_resolve]: 1.344e-05 [a_after_grad]: 1.166e-05 [renormalize]: 0.00063889 [add_forward_monad_depend]: 5.49e-06 [auto_monad_grad]: 2.34999e-06 [auto_monad_eliminator]: 1.516e-05 [cse]: 3.378e-05 [a_3]: 5.43e-05 [Cycle 2]: 0.00070959, [45] [expand_dump_flag]: 1.20001e-06 [switch_simplify]: 8.81002e-06 [loop_unroll]: 6.91001e-06 [a_1]: 0.00015772 [with_stream_mark]: 1.237e-05 [recompute_prepare]: 7.33e-06 [updatestate_depend_eliminate]: 3.38999e-06 [updatestate_assign_eliminate]: 2.98e-06 [updatestate_loads_eliminate]: 2.48002e-06 [parameter_eliminate]: 1.12999e-06 [a_2]: 8.116e-05 [accelerated_algorithm]: 7.00998e-06 [shard]: 1.51998e-06 [meta_shard_fg_expand]: 1.45001e-06 [shard_inline]: 6.64999e-06 [merge_send_recv]: 5.66998e-06 [auto_parallel]: 5.52001e-06 [parallel]: 5.35001e-06 [flash_sp]: 3.76999e-06 [merge_comm]: 3.54002e-06 [allreduce_fusion]: 3.31001e-06 [matmul_add_comm_reduction]: 6.36e-06 [allreduce_slice_to_reducescatter]: 4.09986e-07 [virtual_shard_identity]: 7.5e-06 [virtual_dataset]: 6.78e-06 [get_grad_eliminate_]: 6.21998e-06 [virtual_output]: 6.45002e-06 [merge_forward]: 3.09999e-06 [cell_reuse_recompute_pass]: 1.51998e-06 [offload_activation]: 6.62002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.597e-05 [merge_recompute_call_nodes]: 9.00007e-07 [before_grad]: 1.03e-05 [set_forward_comm_id_for_comm_node_pass]: 3.35e-06 [meta_fg_expand]: 2.64001e-06 [flash_sp_send_recv_attached]: 8.89995e-07 [receive_attached]: 1.18001e-06 [after_resolve]: 1.375e-05 [a_after_grad]: 1.074e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.59e-06 [auto_monad_grad]: 1.11997e-06 [auto_monad_eliminator]: 7.14001e-06 [cse]: 1.516e-05 [a_3]: 4.03e-05 [py_interpret_to_execute_after_opt_a]: 1.046e-05 [slice_cell_reuse_recomputed_activation]: 2.13002e-06 [rewriter_after_opt_a]: 3.482e-05 [convert_after_rewriter]: 7.26999e-06 [order_py_execute_after_rewriter]: 5.03002e-06 [mutable_eliminate]: 0.00050354 [opt_b]: 0.00022815, [1] [Cycle 1]: 0.00022183, [7] [b_1]: 0.00014641 [b_2]: 8.70999e-06 [updatestate_depend_eliminate]: 5.46e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 2.66e-06 [renormalize]: 4.09986e-07 [cse]: 1.994e-05 [optimize_parallel_all_gather_comm]: 1.57e-05 [overlap_param_gather]: 2.21998e-06 [cconv]: 2.422e-05 [loop_unroll]: 0.00042523 [opt_after_cconv]: 0.0001073, [1] [Cycle 1]: 0.00010189, [7] [c_1]: 3.631e-05 [parameter_eliminate]: 2.80002e-06 [updatestate_depend_eliminate]: 5.24e-06 [updatestate_assign_eliminate]: 2.53e-06 [updatestate_loads_eliminate]: 2.37999e-06 [cse]: 1.842e-05 [renormalize]: 3.59985e-07 [remove_dup_value]: 1.349e-05 [tuple_transform]: 8.155e-05, [1] [Cycle 1]: 7.705e-05, [4] [d_1]: 4.979e-05 [none_parameter_eliminate]: 1.60001e-06 [renormalize]: 2.09984e-07 [switch_simplify]: 7.66999e-06 [partial_unused_args_eliminate]: 2.24999e-06 [add_recomputation]: 4.505e-05 [cse_after_recomputation]: 2.132e-05, [1] [Cycle 1]: 1.694e-05, [1] [cse]: 1.156e-05 [environ_conv]: 5.02e-06 [swap_dp_allreduce_reducescatter]: 4.84e-06 [bias_add_comm_swap]: 2.34999e-06 [label_micro_interleaved_index]: 3.68e-06 [label_fine_grained_interleaved_index]: 2.43e-06 [merge_cast_opt]: 1.39e-06 [slice_recompute_activation]: 1.94999e-06 [micro_interleaved_order_control]: 2.29001e-06 [assign_add_opt]: 1.54e-06 [ForceFp32Comm]: 9.89996e-07 [remove_cast_before_assign_add]: 9.80013e-07 [full_micro_interleaved_order_control]: 1.88002e-06 [reorder_send_recv_between_fp_bp]: 2.64999e-06 [comm_op_add_attrs]: 9.50007e-07 [add_comm_op_reuse_tag]: 9.20001e-07 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.34998e-06 [overlap_opt_shard_grad_in_pipeline]: 2.01003e-06 [control_data_broadcast_order]: 1.16e-05 [grouped_pairwise_exchange_alltoall]: 1.52001e-06 [offloading_packed_experts]: 3.68e-06 [overlap_recompute_and_grad_model_parallel]: 4.75001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.22999e-06 [overlap_grad_ring_attention]: 4.05e-06 [overlap_grad_flash_sp]: 1.817e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.31e-06 [split_layernorm_comm]: 1.82001e-06 [handle_group_info]: 1.15999e-06 [symbol_engine_optimizer]: 7.657e-05, [1] [Cycle 1]: 7.197e-05, [6] [build]: 2.72001e-06 [elim_shapecalc]: 9.97001e-06 [elim_not_effective]: 1.303e-05 [opt_reshape]: 7.63999e-06 [fold_const_symbol]: 1.052e-05 [renormalize]: 2.60014e-07 [detach_backward]: 2.01998e-06 [pipeline_parallel_scheduler]: 1.49e-06 [auto_monad_reorder]: 1.888e-05 [get_jit_bprop_graph]: 1.22e-06 [rewriter_after_jit_bprop_graph]: 3.78001e-06 [opt_after_jit_grad]: 0.00046713 [validate]: 3.559e-05 Sums bootstrap : 0.000437s : 4.05% type_inference : 0.005662s : 52.36% event_method : 0.000020s : 0.18% auto_monad : 0.000061s : 0.56% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.31% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000027s : 0.25% optimize.rewriter_before_opt_a : 0.000088s : 0.81% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.51% optimize.opt_a.loop_unroll : 0.000041s : 0.38% optimize.opt_a.a_1 : 0.000873s : 8.07% optimize.opt_a.with_stream_mark : 0.000028s : 0.26% optimize.opt_a.recompute_prepare : 0.000016s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000171s : 1.58% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.14% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.13% optimize.opt_a.merge_send_recv : 0.000014s : 0.13% optimize.opt_a.auto_parallel : 0.000012s : 0.11% optimize.opt_a.parallel : 0.000023s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.11% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.15% optimize.opt_a.virtual_dataset : 0.000014s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.12% optimize.opt_a.virtual_output : 0.000014s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000016s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.25% optimize.opt_a.a_after_grad : 0.000022s : 0.21% optimize.opt_a.renormalize : 0.000639s : 5.91% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.07% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.21% optimize.opt_a.cse : 0.000049s : 0.45% optimize.opt_a.a_3 : 0.000095s : 0.87% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000035s : 0.32% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000504s : 4.66% optimize.opt_b.b_1 : 0.000146s : 1.35% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.22% optimize.loop_unroll : 0.000425s : 3.93% optimize.opt_after_cconv.c_1 : 0.000036s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.17% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.12% optimize.tuple_transform.d_1 : 0.000050s : 0.46% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000045s : 0.42% optimize.cse_after_recomputation.cse : 0.000012s : 0.11% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.04% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000018s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000019s : 0.17% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000467s : 4.32% validate : 0.000036s : 0.33% Time group info: ------[substitution.] 0.000194 34 0.94% : 0.000002s : 2: substitution.elim_not_effective 0.66% : 0.000001s : 2: substitution.fold_const_symbol 3.23% : 0.000006s : 5: substitution.graph_param_transform 74.94% : 0.000145s : 4: substitution.inline 1.83% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.54% : 0.000007s : 4: substitution.remove_not_recompute_node 2.86% : 0.000006s : 6: substitution.replace_old_param 9.26% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.72% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005604 2 87.14% : 0.004883s : 1: type_inference.infer 12.86% : 0.000721s : 1: type_inference.specialize ------[replace.] 0.000068 10 56.18% : 0.000038s : 4: replace.inline 43.82% : 0.000030s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000158 10 90.23% : 0.000143s : 4: match.inline 9.77% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000231 1590 0.90% : 0.000002s : 16: predicate.accumulaten_eliminater 0.70% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 10: predicate.addn_check_dump 0.94% : 0.000002s : 16: predicate.addn_zero_filter 0.83% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.02% : 0.000005s : 26: predicate.arithmetic_simplify 0.97% : 0.000002s : 16: predicate.cast_eliminate 0.59% : 0.000001s : 10: predicate.check_bprop_eliminate 0.52% : 0.000001s : 10: predicate.compare_switch_simplify 0.21% : 0.000000s : 5: predicate.const_output_eliminate 0.57% : 0.000001s : 10: predicate.depend_value_elim 0.98% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.09% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.82% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 5: predicate.elim_not_effective 0.32% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 21: predicate.environ_get_depend_swap 1.70% : 0.000004s : 31: predicate.environ_get_eliminate 1.10% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.54% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.54% : 0.000006s : 26: predicate.float_depend_g_call 0.50% : 0.000001s : 10: predicate.float_environ_get_switch 0.74% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.60% : 0.000001s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.57% : 0.000001s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.26% : 0.000014s : 72: predicate.inline 0.78% : 0.000002s : 10: predicate.inline_without_move 0.35% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.78% : 0.000002s : 10: predicate.less_batch_normalization 1.95% : 0.000005s : 32: predicate.list_to_tuple_eliminator_ 2.73% : 0.000006s : 48: predicate.load_eliminater 0.77% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.38% : 0.000005s : 40: predicate.loop_unroll_before_grad 1.60% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 10: predicate.merge_addn 0.59% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 16: predicate.minmaximum_grad 0.87% : 0.000002s : 5: predicate.mutable_eliminate 0.32% : 0.000001s : 5: predicate.opt_reshape 0.34% : 0.000001s : 5: predicate.parallel_virtual_node 1.86% : 0.000004s : 26: predicate.partial_defer_inline 1.80% : 0.000004s : 27: predicate.partial_eliminate 1.00% : 0.000002s : 16: predicate.print_const_string_wrapper 0.55% : 0.000001s : 10: predicate.reduce_all_const_elim 1.19% : 0.000003s : 16: predicate.reduce_eliminate 2.69% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 10: predicate.remove_not_recompute_node 1.58% : 0.000004s : 32: predicate.replace_applicator 0.44% : 0.000001s : 10: predicate.replace_old_param 0.29% : 0.000001s : 5: predicate.reset_defer_inline 0.93% : 0.000002s : 16: predicate.reshape_eliminate 0.55% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 5: predicate.row_tensor_eliminate 0.91% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 10: predicate.shard_identity_eliminate 0.65% : 0.000002s : 10: predicate.special_op_eliminate 0.65% : 0.000001s : 10: predicate.specialize_transform 0.81% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.62% : 0.000004s : 26: predicate.switch_defer_inline 2.10% : 0.000005s : 36: predicate.switch_layer_defer_inline 5.23% : 0.000012s : 81: predicate.switch_simplify 0.95% : 0.000002s : 16: predicate.tile_eliminate 0.90% : 0.000002s : 16: predicate.transpose_eliminate 1.46% : 0.000003s : 26: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 26: predicate.tuple_list_get_item_depend_reorder 3.21% : 0.000007s : 42: predicate.tuple_list_get_item_eliminator 1.41% : 0.000003s : 26: predicate.tuple_list_get_set_item_eliminator 2.36% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.98% : 0.000005s : 32: predicate.tuple_to_list_eliminator_ 2.66% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.30% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 5: predicate.value_based_eliminate 0.64% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000538 11 54.14% : 0.000291s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.86% : 0.000247s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024942 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.38% : 0.003087s : 1: add_attr 12.34% : 0.003078s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.20% : 0.000049s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.26% : 0.000066s : 1: auto_monad 0.09% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.85% : 0.000461s : 1: bootstrap 0.11% : 0.000027s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000015s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.10% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000006s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000005s : 1: label_fine_grained_interleaved_index 0.03% : 0.000006s : 1: label_micro_interleaved_index 1.74% : 0.000433s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.05% : 0.000512s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 5.40% : 0.001348s : 78: opt.transform.opt_a 0.14% : 0.000035s : 1: opt.transform.opt_after_cconv 0.11% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.50% : 0.000124s : 28: opt.transform.opt_b 0.22% : 0.000055s : 2: opt.transform.opt_trans_graph 0.15% : 0.000038s : 4: opt.transform.symbol_engine_opt 11.32% : 0.002824s : 1: opt_a 0.44% : 0.000111s : 1: opt_after_cconv 1.91% : 0.000476s : 1: opt_after_jit_grad 0.93% : 0.000232s : 1: opt_b 19.47% : 0.004855s : 1: optimize 0.08% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000021s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000006s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000037s : 1: pre_auto_parallel 0.13% : 0.000031s : 1: py_interpret_to_execute 0.06% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000017s : 1: remove_dup_value 1.33% : 0.000332s : 1: renormalize.infer 1.20% : 0.000299s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000039s : 1: rewriter_after_opt_a 0.37% : 0.000092s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000079s : 1: symbol_engine_optimizer 0.34% : 0.000084s : 1: tuple_transform 22.76% : 0.005678s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:16.929.839 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:16.930.087 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0170971, [21] [bootstrap]: 0.00043981 [type_inference]: 0.00609526 [event_method]: 1.94e-05 [auto_monad]: 6.179e-05 [graph_reusing]: 5.94e-06 [inline]: 2.45002e-06 [add_attr]: 0.00315268, [1] [add_attr_with_inline]: 0.00314351, [1] [Cycle 1]: 7.139e-05, [2] [tag_attr]: 2.034e-05 [meta_addattr_fg_expand]: 6.12999e-06 [parallel-infer-symbol]: 3.33998e-06 [pre_auto_parallel]: 3.339e-05 [insert-virtual-dataset]: 2.26e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 1.92001e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00595711, [53] [py_interpret_to_execute]: 3.719e-05 [rewriter_before_opt_a]: 9.43e-05 [opt_a]: 0.00334746, [2] [Cycle 1]: 0.0024314, [45] [expand_dump_flag]: 3.70998e-06 [switch_simplify]: 4.633e-05 [loop_unroll]: 3.469e-05 [a_1]: 0.00075081 [with_stream_mark]: 1.559e-05 [recompute_prepare]: 9.70002e-06 [updatestate_depend_eliminate]: 4.11001e-06 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 2.98e-06 [parameter_eliminate]: 2.24999e-06 [a_2]: 0.0001214 [accelerated_algorithm]: 7.41001e-06 [shard]: 2.16003e-06 [meta_shard_fg_expand]: 1.89999e-06 [shard_inline]: 7.19001e-06 [merge_send_recv]: 8.15e-06 [auto_parallel]: 6.59001e-06 [parallel]: 1.83e-05 [flash_sp]: 8.39002e-06 [merge_comm]: 4.19002e-06 [allreduce_fusion]: 3.45e-06 [matmul_add_comm_reduction]: 9.35001e-06 [allreduce_slice_to_reducescatter]: 9.39996e-07 [virtual_shard_identity]: 8.89e-06 [virtual_dataset]: 7.66001e-06 [get_grad_eliminate_]: 7.08e-06 [virtual_output]: 7.21001e-06 [merge_forward]: 3.98999e-06 [cell_reuse_recompute_pass]: 1.40001e-06 [offload_activation]: 9.76998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.494e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.143e-05 [set_forward_comm_id_for_comm_node_pass]: 3.61999e-06 [meta_fg_expand]: 3.49001e-06 [flash_sp_send_recv_attached]: 2.41e-06 [receive_attached]: 2.24999e-06 [after_resolve]: 1.393e-05 [a_after_grad]: 1.181e-05 [renormalize]: 0.00074159 [add_forward_monad_depend]: 5.39e-06 [auto_monad_grad]: 2.24001e-06 [auto_monad_eliminator]: 1.561e-05 [cse]: 3.349e-05 [a_3]: 6.978e-05 [Cycle 2]: 0.00090197, [45] [expand_dump_flag]: 1.33002e-06 [switch_simplify]: 8.84998e-06 [loop_unroll]: 7.3e-06 [a_1]: 0.00016271 [with_stream_mark]: 1.267e-05 [recompute_prepare]: 6.93e-06 [updatestate_depend_eliminate]: 3.13e-06 [updatestate_assign_eliminate]: 2.96001e-06 [updatestate_loads_eliminate]: 2.54001e-06 [parameter_eliminate]: 1.16002e-06 [a_2]: 0.00010941 [accelerated_algorithm]: 7.24001e-06 [shard]: 1.71e-06 [meta_shard_fg_expand]: 1.45999e-06 [shard_inline]: 7.40998e-06 [merge_send_recv]: 6.23e-06 [auto_parallel]: 6.84999e-06 [parallel]: 5.32999e-06 [flash_sp]: 3.8e-06 [merge_comm]: 3.46999e-06 [allreduce_fusion]: 3.43e-06 [matmul_add_comm_reduction]: 7.49002e-06 [allreduce_slice_to_reducescatter]: 4.39992e-07 [virtual_shard_identity]: 7.76001e-06 [virtual_dataset]: 6.85002e-06 [get_grad_eliminate_]: 6.52001e-06 [virtual_output]: 6.57002e-06 [merge_forward]: 2.98e-06 [cell_reuse_recompute_pass]: 1.63002e-06 [offload_activation]: 8.06001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.767e-05 [merge_recompute_call_nodes]: 1.12e-06 [before_grad]: 1.155e-05 [set_forward_comm_id_for_comm_node_pass]: 3.60998e-06 [meta_fg_expand]: 2.69999e-06 [flash_sp_send_recv_attached]: 1.00999e-06 [receive_attached]: 1.34e-06 [after_resolve]: 1.338e-05 [a_after_grad]: 1.065e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.41002e-06 [auto_monad_grad]: 1.10999e-06 [auto_monad_eliminator]: 8.32e-06 [cse]: 1.58e-05 [a_3]: 5.437e-05 [py_interpret_to_execute_after_opt_a]: 1.688e-05 [slice_cell_reuse_recomputed_activation]: 5.02e-06 [rewriter_after_opt_a]: 3.93e-05 [convert_after_rewriter]: 9.97001e-06 [order_py_execute_after_rewriter]: 8.13999e-06 [mutable_eliminate]: 0.0006537 [opt_b]: 0.00034717, [1] [Cycle 1]: 0.00033717, [7] [b_1]: 0.00023406 [b_2]: 9.09e-06 [updatestate_depend_eliminate]: 6.64001e-06 [updatestate_assign_eliminate]: 2.81999e-06 [updatestate_loads_eliminate]: 3.19001e-06 [renormalize]: 7.50006e-07 [cse]: 2.288e-05 [optimize_parallel_all_gather_comm]: 2.062e-05 [overlap_param_gather]: 4.53001e-06 [cconv]: 2.969e-05 [loop_unroll]: 0.00050219 [opt_after_cconv]: 0.00013709, [1] [Cycle 1]: 0.00012854, [7] [c_1]: 3.682e-05 [parameter_eliminate]: 3.30998e-06 [updatestate_depend_eliminate]: 6.16e-06 [updatestate_assign_eliminate]: 2.54999e-06 [updatestate_loads_eliminate]: 2.58e-06 [cse]: 2.055e-05 [renormalize]: 6.10016e-07 [remove_dup_value]: 1.815e-05 [tuple_transform]: 9.79e-05, [1] [Cycle 1]: 9.015e-05, [4] [d_1]: 5.063e-05 [none_parameter_eliminate]: 1.72999e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 7.91001e-06 [partial_unused_args_eliminate]: 4.50001e-06 [add_recomputation]: 5.52e-05 [cse_after_recomputation]: 2.924e-05, [1] [Cycle 1]: 2.168e-05, [1] [cse]: 1.22e-05 [environ_conv]: 8.57998e-06 [swap_dp_allreduce_reducescatter]: 8.08001e-06 [bias_add_comm_swap]: 5.05999e-06 [label_micro_interleaved_index]: 8.10999e-06 [label_fine_grained_interleaved_index]: 5.16998e-06 [merge_cast_opt]: 3.98001e-06 [slice_recompute_activation]: 4.75001e-06 [micro_interleaved_order_control]: 4.55001e-06 [assign_add_opt]: 3.63e-06 [ForceFp32Comm]: 3.54002e-06 [remove_cast_before_assign_add]: 3.38e-06 [full_micro_interleaved_order_control]: 4.35e-06 [reorder_send_recv_between_fp_bp]: 5.37001e-06 [comm_op_add_attrs]: 3.61001e-06 [add_comm_op_reuse_tag]: 3.22002e-06 [interleave_split_concat_branches]: 3.76001e-06 [interleave_parallel_branches]: 3.45e-06 [overlap_opt_shard_in_pipeline]: 3.85e-06 [overlap_opt_shard_grad_in_pipeline]: 5.30001e-06 [control_data_broadcast_order]: 1.615e-05 [grouped_pairwise_exchange_alltoall]: 4.28001e-06 [offloading_packed_experts]: 7.10002e-06 [overlap_recompute_and_grad_model_parallel]: 6.99001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.63999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.9e-06 [overlap_recompute_comm]: 5.10999e-06 [overlap_grad_ring_attention]: 6.69001e-06 [overlap_grad_flash_sp]: 2.26e-05 [begin_end_overlap_inline]: 3.28e-06 [split_matmul_comm_elemetwise]: 4.89e-06 [split_layernorm_comm]: 4.22e-06 [handle_group_info]: 3.34001e-06 [symbol_engine_optimizer]: 0.00010253, [1] [Cycle 1]: 9.546e-05, [6] [build]: 2.93e-06 [elim_shapecalc]: 1.155e-05 [elim_not_effective]: 1.418e-05 [opt_reshape]: 8.42e-06 [fold_const_symbol]: 1.09e-05 [renormalize]: 1.90019e-07 [detach_backward]: 3.44001e-06 [pipeline_parallel_scheduler]: 1.74e-06 [auto_monad_reorder]: 2.175e-05 [get_jit_bprop_graph]: 1.66e-06 [rewriter_after_jit_bprop_graph]: 5.67001e-06 [opt_after_jit_grad]: 0.00063999 [validate]: 4.294e-05 Sums bootstrap : 0.000440s : 3.61% type_inference : 0.006095s : 50.05% event_method : 0.000019s : 0.16% auto_monad : 0.000062s : 0.51% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.27% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000037s : 0.31% optimize.rewriter_before_opt_a : 0.000094s : 0.77% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.45% optimize.opt_a.loop_unroll : 0.000042s : 0.34% optimize.opt_a.a_1 : 0.000914s : 7.50% optimize.opt_a.with_stream_mark : 0.000028s : 0.23% optimize.opt_a.recompute_prepare : 0.000017s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000231s : 1.90% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.12% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.12% optimize.opt_a.merge_send_recv : 0.000014s : 0.12% optimize.opt_a.auto_parallel : 0.000013s : 0.11% optimize.opt_a.parallel : 0.000024s : 0.19% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.06% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.14% optimize.opt_a.virtual_dataset : 0.000015s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.11% optimize.opt_a.virtual_output : 0.000014s : 0.11% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000023s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.22% optimize.opt_a.a_after_grad : 0.000022s : 0.18% optimize.opt_a.renormalize : 0.000742s : 6.09% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.20% optimize.opt_a.cse : 0.000049s : 0.40% optimize.opt_a.a_3 : 0.000124s : 1.02% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000039s : 0.32% optimize.convert_after_rewriter : 0.000010s : 0.08% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000654s : 5.37% optimize.opt_b.b_1 : 0.000234s : 1.92% optimize.opt_b.b_2 : 0.000009s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000030s : 0.24% optimize.loop_unroll : 0.000502s : 4.12% optimize.opt_after_cconv.c_1 : 0.000037s : 0.30% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000021s : 0.17% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000018s : 0.15% optimize.tuple_transform.d_1 : 0.000051s : 0.42% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.06% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000055s : 0.45% optimize.cse_after_recomputation.cse : 0.000012s : 0.10% optimize.environ_conv : 0.000009s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000008s : 0.07% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.04% optimize.control_data_broadcast_order : 0.000016s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.05% optimize.overlap_grad_flash_sp : 0.000023s : 0.19% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000022s : 0.18% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.05% opt_after_jit_grad : 0.000640s : 5.26% validate : 0.000043s : 0.35% Time group info: ------[substitution.] 0.000224 34 0.86% : 0.000002s : 2: substitution.elim_not_effective 0.66% : 0.000001s : 2: substitution.fold_const_symbol 2.68% : 0.000006s : 5: substitution.graph_param_transform 77.72% : 0.000174s : 4: substitution.inline 2.06% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.42% : 0.000005s : 4: substitution.remove_not_recompute_node 2.87% : 0.000006s : 6: substitution.replace_old_param 8.33% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.42% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006041 2 86.85% : 0.005246s : 1: type_inference.infer 13.15% : 0.000794s : 1: type_inference.specialize ------[replace.] 0.000071 10 55.50% : 0.000039s : 4: replace.inline 44.50% : 0.000032s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000188 10 91.47% : 0.000172s : 4: match.inline 8.53% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000276 1590 0.80% : 0.000002s : 16: predicate.accumulaten_eliminater 0.90% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.55% : 0.000002s : 10: predicate.addn_check_dump 0.80% : 0.000002s : 16: predicate.addn_zero_filter 0.74% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 1.75% : 0.000005s : 26: predicate.arithmetic_simplify 0.98% : 0.000003s : 16: predicate.cast_eliminate 0.69% : 0.000002s : 10: predicate.check_bprop_eliminate 0.45% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000001s : 5: predicate.const_output_eliminate 0.54% : 0.000002s : 10: predicate.depend_value_elim 0.83% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 0.90% : 0.000002s : 16: predicate.dict_get_item_eliminator 0.79% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.88% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 5: predicate.elim_not_effective 0.40% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000003s : 21: predicate.environ_add_const_eliminate 0.94% : 0.000003s : 21: predicate.environ_get_add_eliminate 0.94% : 0.000003s : 21: predicate.environ_get_depend_swap 1.51% : 0.000004s : 31: predicate.environ_get_eliminate 0.93% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.29% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.26% : 0.000006s : 26: predicate.float_depend_g_call 0.47% : 0.000001s : 10: predicate.float_environ_get_switch 0.66% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.57% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.51% : 0.000001s : 10: predicate.incorporate_call 0.40% : 0.000001s : 10: predicate.incorporate_call_switch 17.51% : 0.000048s : 72: predicate.inline 0.62% : 0.000002s : 10: predicate.inline_without_move 0.36% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.65% : 0.000002s : 10: predicate.less_batch_normalization 1.58% : 0.000004s : 32: predicate.list_to_tuple_eliminator_ 2.22% : 0.000006s : 48: predicate.load_eliminater 0.82% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.13% : 0.000006s : 40: predicate.loop_unroll_before_grad 1.28% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 10: predicate.merge_addn 0.48% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.72% : 0.000002s : 16: predicate.minmaximum_grad 0.95% : 0.000003s : 5: predicate.mutable_eliminate 0.32% : 0.000001s : 5: predicate.opt_reshape 0.32% : 0.000001s : 5: predicate.parallel_virtual_node 1.53% : 0.000004s : 26: predicate.partial_defer_inline 1.56% : 0.000004s : 27: predicate.partial_eliminate 0.89% : 0.000002s : 16: predicate.print_const_string_wrapper 0.50% : 0.000001s : 10: predicate.reduce_all_const_elim 1.00% : 0.000003s : 16: predicate.reduce_eliminate 2.19% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 10: predicate.remove_not_recompute_node 1.25% : 0.000003s : 32: predicate.replace_applicator 0.43% : 0.000001s : 10: predicate.replace_old_param 0.22% : 0.000001s : 5: predicate.reset_defer_inline 0.83% : 0.000002s : 16: predicate.reshape_eliminate 0.53% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 5: predicate.row_tensor_eliminate 0.62% : 0.000002s : 10: predicate.same_eliminate 0.37% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.65% : 0.000002s : 10: predicate.shard_identity_eliminate 0.62% : 0.000002s : 10: predicate.special_op_eliminate 0.58% : 0.000002s : 10: predicate.specialize_transform 0.74% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.39% : 0.000004s : 26: predicate.switch_defer_inline 2.05% : 0.000006s : 36: predicate.switch_layer_defer_inline 4.45% : 0.000012s : 81: predicate.switch_simplify 0.80% : 0.000002s : 16: predicate.tile_eliminate 0.78% : 0.000002s : 16: predicate.transpose_eliminate 1.29% : 0.000004s : 26: predicate.tuple_list_convert_item_index_to_positive 1.31% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.33% : 0.000004s : 26: predicate.tuple_list_get_item_depend_reorder 2.86% : 0.000008s : 42: predicate.tuple_list_get_item_eliminator 1.32% : 0.000004s : 26: predicate.tuple_list_get_set_item_eliminator 2.02% : 0.000006s : 36: predicate.tuple_list_set_item_eliminator 1.60% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.14% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.01% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 5: predicate.value_based_eliminate 0.51% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.55% : 0.000002s : 10: predicate.virtual_output_eliminate 0.22% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000568 11 53.28% : 0.000303s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.72% : 0.000265s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028580 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.06% : 0.003161s : 1: add_attr 11.01% : 0.003147s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000059s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.25% : 0.000070s : 1: auto_monad 0.10% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.68% : 0.000480s : 1: bootstrap 0.11% : 0.000033s : 1: cconv 0.02% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.11% : 0.000032s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000020s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.10% : 0.000029s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.04% : 0.000011s : 1: label_micro_interleaved_index 1.78% : 0.000508s : 1: loop_unroll 0.03% : 0.000008s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 2.31% : 0.000660s : 1: mutable_eliminate 0.04% : 0.000011s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 4.91% : 0.001402s : 78: opt.transform.opt_a 0.12% : 0.000035s : 1: opt.transform.opt_after_cconv 0.11% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.59% : 0.000168s : 28: opt.transform.opt_b 0.20% : 0.000056s : 2: opt.transform.opt_trans_graph 0.14% : 0.000041s : 4: opt.transform.symbol_engine_opt 11.73% : 0.003351s : 1: opt_a 0.49% : 0.000141s : 1: opt_after_cconv 2.28% : 0.000652s : 1: opt_after_jit_grad 1.23% : 0.000351s : 1: opt_b 22.10% : 0.006316s : 1: optimize 0.08% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.09% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.14% : 0.000041s : 1: pre_auto_parallel 0.14% : 0.000041s : 1: py_interpret_to_execute 0.07% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000021s : 1: remove_dup_value 1.40% : 0.000399s : 1: renormalize.infer 1.17% : 0.000334s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000043s : 1: rewriter_after_opt_a 0.34% : 0.000098s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.37% : 0.000106s : 1: symbol_engine_optimizer 0.35% : 0.000101s : 1: tuple_transform 21.46% : 0.006134s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:17.127.950 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0156434, [21] [bootstrap]: 0.0004376 [type_inference]: 0.00597676 [event_method]: 2.098e-05 [auto_monad]: 8.474e-05 [graph_reusing]: 6.12999e-06 [inline]: 2.39001e-06 [add_attr]: 0.00314788, [1] [add_attr_with_inline]: 0.00313787, [1] [Cycle 1]: 6.478e-05, [2] [tag_attr]: 2.057e-05 [meta_addattr_fg_expand]: 6.72002e-06 [parallel-infer-symbol]: 2.98e-06 [pre_auto_parallel]: 3.477e-05 [insert-virtual-dataset]: 2.89999e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 2.03997e-06 [pipeline_split]: 1.60001e-06 [optimize]: 0.00520645, [53] [py_interpret_to_execute]: 3.056e-05 [rewriter_before_opt_a]: 9.145e-05 [opt_a]: 0.00298861, [2] [Cycle 1]: 0.00225168, [45] [expand_dump_flag]: 3.33e-06 [switch_simplify]: 4.852e-05 [loop_unroll]: 3.53e-05 [a_1]: 0.00076122 [with_stream_mark]: 1.679e-05 [recompute_prepare]: 1.069e-05 [updatestate_depend_eliminate]: 4.28001e-06 [updatestate_assign_eliminate]: 3.15002e-06 [updatestate_loads_eliminate]: 3.23e-06 [parameter_eliminate]: 1.90001e-06 [a_2]: 9.305e-05 [accelerated_algorithm]: 8.01001e-06 [shard]: 1.97001e-06 [meta_shard_fg_expand]: 1.87001e-06 [shard_inline]: 7.93001e-06 [merge_send_recv]: 8.70001e-06 [auto_parallel]: 7.92e-06 [parallel]: 2.015e-05 [flash_sp]: 7.67002e-06 [merge_comm]: 4.32e-06 [allreduce_fusion]: 3.28e-06 [matmul_add_comm_reduction]: 1.002e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 9.08002e-06 [virtual_dataset]: 7.38e-06 [get_grad_eliminate_]: 6.93e-06 [virtual_output]: 6.91001e-06 [merge_forward]: 3.68e-06 [cell_reuse_recompute_pass]: 1.20999e-06 [offload_activation]: 9.62999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.394e-05 [merge_recompute_call_nodes]: 1.72999e-06 [before_grad]: 1.097e-05 [set_forward_comm_id_for_comm_node_pass]: 3.51001e-06 [meta_fg_expand]: 3.25e-06 [flash_sp_send_recv_attached]: 2.77002e-06 [receive_attached]: 3.2e-06 [after_resolve]: 1.434e-05 [a_after_grad]: 1.103e-05 [renormalize]: 0.00070956 [add_forward_monad_depend]: 6.09001e-06 [auto_monad_grad]: 2.27999e-06 [auto_monad_eliminator]: 1.681e-05 [cse]: 3.244e-05 [a_3]: 5.443e-05 [Cycle 2]: 0.00072719, [45] [expand_dump_flag]: 1.12e-06 [switch_simplify]: 9.08002e-06 [loop_unroll]: 6.86001e-06 [a_1]: 0.00016171 [with_stream_mark]: 1.271e-05 [recompute_prepare]: 7.71999e-06 [updatestate_depend_eliminate]: 3.06999e-06 [updatestate_assign_eliminate]: 2.84999e-06 [updatestate_loads_eliminate]: 2.37999e-06 [parameter_eliminate]: 1.23002e-06 [a_2]: 8.313e-05 [accelerated_algorithm]: 7.11999e-06 [shard]: 1.66e-06 [meta_shard_fg_expand]: 1.48002e-06 [shard_inline]: 7.5e-06 [merge_send_recv]: 6.21e-06 [auto_parallel]: 6.18002e-06 [parallel]: 5.21002e-06 [flash_sp]: 3.66001e-06 [merge_comm]: 3.18998e-06 [allreduce_fusion]: 3.4e-06 [matmul_add_comm_reduction]: 6.23e-06 [allreduce_slice_to_reducescatter]: 5.39992e-07 [virtual_shard_identity]: 7.13e-06 [virtual_dataset]: 6.58998e-06 [get_grad_eliminate_]: 6.73998e-06 [virtual_output]: 6.54999e-06 [merge_forward]: 3.04999e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 6.94001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.419e-05 [merge_recompute_call_nodes]: 1.08001e-06 [before_grad]: 1.014e-05 [set_forward_comm_id_for_comm_node_pass]: 3.45e-06 [meta_fg_expand]: 2.29001e-06 [flash_sp_send_recv_attached]: 1.17e-06 [receive_attached]: 9.70002e-07 [after_resolve]: 1.408e-05 [a_after_grad]: 1.082e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.98002e-06 [auto_monad_grad]: 1.39e-06 [auto_monad_eliminator]: 9.22001e-06 [cse]: 1.739e-05 [a_3]: 4.14e-05 [py_interpret_to_execute_after_opt_a]: 1.206e-05 [slice_cell_reuse_recomputed_activation]: 2.16e-06 [rewriter_after_opt_a]: 3.795e-05 [convert_after_rewriter]: 7.08e-06 [order_py_execute_after_rewriter]: 5.34998e-06 [mutable_eliminate]: 0.00054533 [opt_b]: 0.00024537, [1] [Cycle 1]: 0.00023718, [7] [b_1]: 0.00015174 [b_2]: 9.59999e-06 [updatestate_depend_eliminate]: 7.3e-06 [updatestate_assign_eliminate]: 2.37999e-06 [updatestate_loads_eliminate]: 2.91999e-06 [renormalize]: 7.09988e-07 [cse]: 2.398e-05 [optimize_parallel_all_gather_comm]: 1.708e-05 [overlap_param_gather]: 2.21e-06 [cconv]: 2.563e-05 [loop_unroll]: 0.00045977 [opt_after_cconv]: 0.00011673, [1] [Cycle 1]: 0.00011042, [7] [c_1]: 3.695e-05 [parameter_eliminate]: 3.48999e-06 [updatestate_depend_eliminate]: 7.1e-06 [updatestate_assign_eliminate]: 2.51998e-06 [updatestate_loads_eliminate]: 2.36998e-06 [cse]: 2.249e-05 [renormalize]: 6.89994e-07 [remove_dup_value]: 1.453e-05 [tuple_transform]: 8.623e-05, [1] [Cycle 1]: 8.196e-05, [4] [d_1]: 5.161e-05 [none_parameter_eliminate]: 1.96003e-06 [renormalize]: 3.69997e-07 [switch_simplify]: 7.61999e-06 [partial_unused_args_eliminate]: 1.87001e-06 [add_recomputation]: 4.935e-05 [cse_after_recomputation]: 2.342e-05, [1] [Cycle 1]: 1.914e-05, [1] [cse]: 1.314e-05 [environ_conv]: 5.43002e-06 [swap_dp_allreduce_reducescatter]: 5.42001e-06 [bias_add_comm_swap]: 2.72001e-06 [label_micro_interleaved_index]: 4.30999e-06 [label_fine_grained_interleaved_index]: 2.74999e-06 [merge_cast_opt]: 1.24e-06 [slice_recompute_activation]: 2.16998e-06 [micro_interleaved_order_control]: 2.10002e-06 [assign_add_opt]: 1.39998e-06 [ForceFp32Comm]: 7.60017e-07 [remove_cast_before_assign_add]: 9.99979e-07 [full_micro_interleaved_order_control]: 2.22001e-06 [reorder_send_recv_between_fp_bp]: 2.66999e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.90025e-07 [interleave_split_concat_branches]: 1.13001e-06 [interleave_parallel_branches]: 1.15999e-06 [overlap_opt_shard_in_pipeline]: 1.17e-06 [overlap_opt_shard_grad_in_pipeline]: 2.02001e-06 [control_data_broadcast_order]: 1.355e-05 [grouped_pairwise_exchange_alltoall]: 1.59e-06 [offloading_packed_experts]: 3.9e-06 [overlap_recompute_and_grad_model_parallel]: 4.65001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.44998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.26e-06 [overlap_grad_ring_attention]: 4.17e-06 [overlap_grad_flash_sp]: 1.964e-05 [begin_end_overlap_inline]: 5.60016e-07 [split_matmul_comm_elemetwise]: 2.04e-06 [split_layernorm_comm]: 1.59e-06 [handle_group_info]: 9.89996e-07 [symbol_engine_optimizer]: 8.154e-05, [1] [Cycle 1]: 7.687e-05, [6] [build]: 3.16001e-06 [elim_shapecalc]: 1.144e-05 [elim_not_effective]: 1.401e-05 [opt_reshape]: 7.68999e-06 [fold_const_symbol]: 1.085e-05 [renormalize]: 1.8999e-07 [detach_backward]: 2.60002e-06 [pipeline_parallel_scheduler]: 1.59e-06 [auto_monad_reorder]: 1.814e-05 [get_jit_bprop_graph]: 2.04999e-06 [rewriter_after_jit_bprop_graph]: 4.12e-06 [opt_after_jit_grad]: 0.00049538 [validate]: 4.255e-05 Sums bootstrap : 0.000438s : 3.81% type_inference : 0.005977s : 52.09% event_method : 0.000021s : 0.18% auto_monad : 0.000085s : 0.74% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000035s : 0.30% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000031s : 0.27% optimize.rewriter_before_opt_a : 0.000091s : 0.80% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000058s : 0.50% optimize.opt_a.loop_unroll : 0.000042s : 0.37% optimize.opt_a.a_1 : 0.000923s : 8.04% optimize.opt_a.with_stream_mark : 0.000029s : 0.26% optimize.opt_a.recompute_prepare : 0.000018s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000176s : 1.54% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000015s : 0.13% optimize.opt_a.merge_send_recv : 0.000015s : 0.13% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.22% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.14% optimize.opt_a.virtual_dataset : 0.000014s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.12% optimize.opt_a.virtual_output : 0.000013s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000017s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.25% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.04% optimize.opt_a.after_resolve : 0.000028s : 0.25% optimize.opt_a.a_after_grad : 0.000022s : 0.19% optimize.opt_a.renormalize : 0.000710s : 6.19% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.07% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.23% optimize.opt_a.cse : 0.000050s : 0.43% optimize.opt_a.a_3 : 0.000096s : 0.84% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000038s : 0.33% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000545s : 4.75% optimize.opt_b.b_1 : 0.000152s : 1.32% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000024s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000026s : 0.22% optimize.loop_unroll : 0.000460s : 4.01% optimize.opt_after_cconv.c_1 : 0.000037s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000022s : 0.20% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000015s : 0.13% optimize.tuple_transform.d_1 : 0.000052s : 0.45% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000049s : 0.43% optimize.cse_after_recomputation.cse : 0.000013s : 0.11% optimize.environ_conv : 0.000005s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000020s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.16% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000495s : 4.32% validate : 0.000043s : 0.37% Time group info: ------[substitution.] 0.000199 34 0.87% : 0.000002s : 2: substitution.elim_not_effective 0.86% : 0.000002s : 2: substitution.fold_const_symbol 3.13% : 0.000006s : 5: substitution.graph_param_transform 75.72% : 0.000150s : 4: substitution.inline 2.14% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.17% : 0.000004s : 4: substitution.remove_not_recompute_node 2.91% : 0.000006s : 6: substitution.replace_old_param 9.36% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.85% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005913 2 87.12% : 0.005151s : 1: type_inference.infer 12.88% : 0.000762s : 1: type_inference.specialize ------[replace.] 0.000072 10 53.75% : 0.000039s : 4: replace.inline 46.25% : 0.000033s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000164 10 90.28% : 0.000148s : 4: match.inline 9.72% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000236 1590 1.03% : 0.000002s : 16: predicate.accumulaten_eliminater 0.73% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 10: predicate.addn_check_dump 1.03% : 0.000002s : 16: predicate.addn_zero_filter 0.80% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.06% : 0.000005s : 26: predicate.arithmetic_simplify 1.16% : 0.000003s : 16: predicate.cast_eliminate 0.56% : 0.000001s : 10: predicate.check_bprop_eliminate 0.51% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.56% : 0.000001s : 10: predicate.depend_value_elim 0.94% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.03% : 0.000002s : 16: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.87% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.33% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.30% : 0.000003s : 21: predicate.environ_get_depend_swap 1.65% : 0.000004s : 31: predicate.environ_get_eliminate 1.17% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.57% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.44% : 0.000006s : 26: predicate.float_depend_g_call 0.47% : 0.000001s : 10: predicate.float_environ_get_switch 0.79% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.62% : 0.000001s : 10: predicate.get_grad_eliminate 0.23% : 0.000001s : 5: predicate.graph_param_transform 0.57% : 0.000001s : 10: predicate.incorporate_call 0.47% : 0.000001s : 10: predicate.incorporate_call_switch 6.40% : 0.000015s : 72: predicate.inline 0.81% : 0.000002s : 10: predicate.inline_without_move 0.34% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.71% : 0.000002s : 10: predicate.less_batch_normalization 1.87% : 0.000004s : 32: predicate.list_to_tuple_eliminator_ 2.56% : 0.000006s : 48: predicate.load_eliminater 0.99% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.32% : 0.000005s : 40: predicate.loop_unroll_before_grad 1.64% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 10: predicate.merge_addn 0.56% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 16: predicate.minmaximum_grad 1.06% : 0.000002s : 5: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.33% : 0.000001s : 5: predicate.parallel_virtual_node 1.82% : 0.000004s : 26: predicate.partial_defer_inline 1.80% : 0.000004s : 27: predicate.partial_eliminate 0.86% : 0.000002s : 16: predicate.print_const_string_wrapper 0.55% : 0.000001s : 10: predicate.reduce_all_const_elim 1.22% : 0.000003s : 16: predicate.reduce_eliminate 2.56% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 10: predicate.remove_not_recompute_node 1.47% : 0.000003s : 32: predicate.replace_applicator 0.63% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 0.91% : 0.000002s : 16: predicate.reshape_eliminate 0.55% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 5: predicate.row_tensor_eliminate 0.78% : 0.000002s : 10: predicate.same_eliminate 0.54% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.73% : 0.000002s : 10: predicate.shard_identity_eliminate 0.82% : 0.000002s : 10: predicate.special_op_eliminate 0.68% : 0.000002s : 10: predicate.specialize_transform 0.73% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.64% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.59% : 0.000004s : 26: predicate.switch_defer_inline 2.07% : 0.000005s : 36: predicate.switch_layer_defer_inline 5.45% : 0.000013s : 81: predicate.switch_simplify 0.88% : 0.000002s : 16: predicate.tile_eliminate 0.92% : 0.000002s : 16: predicate.transpose_eliminate 1.44% : 0.000003s : 26: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000003s : 26: predicate.tuple_list_get_item_depend_reorder 3.35% : 0.000008s : 42: predicate.tuple_list_get_item_eliminator 1.43% : 0.000003s : 26: predicate.tuple_list_get_set_item_eliminator 2.25% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.84% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.58% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.18% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.50% : 0.000001s : 5: predicate.value_based_eliminate 0.54% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.57% : 0.000001s : 10: predicate.virtual_output_eliminate 0.30% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000599 11 55.51% : 0.000333s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.49% : 0.000266s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026273 192 0.01% : 0.000003s : 1: ForceFp32Comm 12.00% : 0.003153s : 1: add_attr 11.96% : 0.003143s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.20% : 0.000054s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.35% : 0.000091s : 1: auto_monad 0.09% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.76% : 0.000462s : 1: bootstrap 0.11% : 0.000030s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000017s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.10% : 0.000026s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000009s : 1: environ_conv 0.10% : 0.000027s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.79% : 0.000469s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.11% : 0.000555s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 5.36% : 0.001409s : 78: opt.transform.opt_a 0.14% : 0.000036s : 1: opt.transform.opt_after_cconv 0.11% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.49% : 0.000129s : 28: opt.transform.opt_b 0.22% : 0.000057s : 2: opt.transform.opt_trans_graph 0.15% : 0.000040s : 4: opt.transform.symbol_engine_opt 11.39% : 0.002992s : 1: opt_a 0.46% : 0.000121s : 1: opt_after_cconv 1.92% : 0.000505s : 1: opt_after_jit_grad 0.95% : 0.000249s : 1: opt_b 19.84% : 0.005211s : 1: optimize 0.08% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000024s : 1: overlap_grad_flash_sp 0.02% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.15% : 0.000039s : 1: pre_auto_parallel 0.13% : 0.000035s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.41% : 0.000371s : 1: renormalize.infer 1.26% : 0.000330s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000043s : 1: rewriter_after_opt_a 0.37% : 0.000096s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000084s : 1: symbol_engine_optimizer 0.34% : 0.000089s : 1: tuple_transform 22.82% : 0.005994s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:17.323.341 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:17.323.596 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0180579, [21] [bootstrap]: 0.00044282 [type_inference]: 0.00592536 [event_method]: 2.212e-05 [auto_monad]: 6.716e-05 [graph_reusing]: 6.48e-06 [inline]: 2.31e-06 [add_attr]: 0.00361234, [1] [add_attr_with_inline]: 0.00360031, [1] [Cycle 1]: 9.789e-05, [2] [tag_attr]: 2.599e-05 [meta_addattr_fg_expand]: 7.14001e-06 [parallel-infer-symbol]: 3.85998e-06 [pre_auto_parallel]: 4.305e-05 [insert-virtual-dataset]: 2.32999e-06 [parallel-infer-symbol-second]: 8.09989e-07 [dataset_repeat_opt]: 1.96e-06 [pipeline_split]: 1.82999e-06 [optimize]: 0.00670376, [53] [py_interpret_to_execute]: 3.781e-05 [rewriter_before_opt_a]: 0.00010636 [opt_a]: 0.0040211, [2] [Cycle 1]: 0.00294043, [45] [expand_dump_flag]: 3.08e-06 [switch_simplify]: 4.813e-05 [loop_unroll]: 3.549e-05 [a_1]: 0.00087232 [with_stream_mark]: 1.709e-05 [recompute_prepare]: 1.019e-05 [updatestate_depend_eliminate]: 1.554e-05 [updatestate_assign_eliminate]: 4.35e-06 [updatestate_loads_eliminate]: 4.05e-06 [parameter_eliminate]: 2.01e-06 [a_2]: 0.00013934 [accelerated_algorithm]: 9.56003e-06 [shard]: 1.92999e-06 [meta_shard_fg_expand]: 2.17001e-06 [shard_inline]: 8.62e-06 [merge_send_recv]: 9.41e-06 [auto_parallel]: 8.03999e-06 [parallel]: 1.82e-05 [flash_sp]: 9.41e-06 [merge_comm]: 5.37999e-06 [allreduce_fusion]: 4.68999e-06 [matmul_add_comm_reduction]: 1.131e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 1.039e-05 [virtual_dataset]: 9.50001e-06 [get_grad_eliminate_]: 8.41002e-06 [virtual_output]: 8.42e-06 [merge_forward]: 5.33002e-06 [cell_reuse_recompute_pass]: 1.17e-06 [offload_activation]: 1.248e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.863e-05 [merge_recompute_call_nodes]: 1.42999e-06 [before_grad]: 1.356e-05 [set_forward_comm_id_for_comm_node_pass]: 4.99e-06 [meta_fg_expand]: 4.13999e-06 [flash_sp_send_recv_attached]: 2.48002e-06 [receive_attached]: 2.72001e-06 [after_resolve]: 1.607e-05 [a_after_grad]: 1.368e-05 [renormalize]: 0.00100102 [add_forward_monad_depend]: 6.99001e-06 [auto_monad_grad]: 2.88e-06 [auto_monad_eliminator]: 2.153e-05 [cse]: 3.8e-05 [a_3]: 8.011e-05 [Cycle 2]: 0.00106482, [45] [expand_dump_flag]: 1.32999e-06 [switch_simplify]: 1.049e-05 [loop_unroll]: 8.33999e-06 [a_1]: 0.00021167 [with_stream_mark]: 1.437e-05 [recompute_prepare]: 8.80001e-06 [updatestate_depend_eliminate]: 4.53999e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 3.6e-06 [parameter_eliminate]: 1.40001e-06 [a_2]: 0.00013019 [accelerated_algorithm]: 9.52001e-06 [shard]: 1.84e-06 [meta_shard_fg_expand]: 2.09e-06 [shard_inline]: 1.064e-05 [merge_send_recv]: 1.186e-05 [auto_parallel]: 7.42002e-06 [parallel]: 6.64999e-06 [flash_sp]: 3.6e-06 [merge_comm]: 5.47001e-06 [allreduce_fusion]: 4.42998e-06 [matmul_add_comm_reduction]: 8.13001e-06 [allreduce_slice_to_reducescatter]: 9.20001e-07 [virtual_shard_identity]: 1.196e-05 [virtual_dataset]: 9.02e-06 [get_grad_eliminate_]: 7.78001e-06 [virtual_output]: 7.72998e-06 [merge_forward]: 5.02e-06 [cell_reuse_recompute_pass]: 2.13002e-06 [offload_activation]: 9.42001e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.007e-05 [merge_recompute_call_nodes]: 1.25999e-06 [before_grad]: 1.359e-05 [set_forward_comm_id_for_comm_node_pass]: 5.10999e-06 [meta_fg_expand]: 3.27002e-06 [flash_sp_send_recv_attached]: 1.09998e-06 [receive_attached]: 1.13001e-06 [after_resolve]: 1.478e-05 [a_after_grad]: 1.305e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.81e-06 [auto_monad_grad]: 1.57001e-06 [auto_monad_eliminator]: 1.216e-05 [cse]: 2.363e-05 [a_3]: 6.525e-05 [py_interpret_to_execute_after_opt_a]: 1.712e-05 [slice_cell_reuse_recomputed_activation]: 5.44998e-06 [rewriter_after_opt_a]: 5.115e-05 [convert_after_rewriter]: 1.1e-05 [order_py_execute_after_rewriter]: 9.19e-06 [mutable_eliminate]: 0.0006665 [opt_b]: 0.00034853, [1] [Cycle 1]: 0.0003381, [7] [b_1]: 0.00022853 [b_2]: 1.029e-05 [updatestate_depend_eliminate]: 7.98999e-06 [updatestate_assign_eliminate]: 3.37002e-06 [updatestate_loads_eliminate]: 3.95998e-06 [renormalize]: 3.19997e-07 [cse]: 2.621e-05 [optimize_parallel_all_gather_comm]: 2.359e-05 [overlap_param_gather]: 4.44002e-06 [cconv]: 3.196e-05 [loop_unroll]: 0.00048495 [opt_after_cconv]: 0.00015028, [1] [Cycle 1]: 0.00014139, [7] [c_1]: 4.289e-05 [parameter_eliminate]: 3.95e-06 [updatestate_depend_eliminate]: 7.25e-06 [updatestate_assign_eliminate]: 3.42002e-06 [updatestate_loads_eliminate]: 2.98e-06 [cse]: 2.368e-05 [renormalize]: 4.40021e-07 [remove_dup_value]: 1.884e-05 [tuple_transform]: 0.0001076, [1] [Cycle 1]: 0.00010013, [4] [d_1]: 5.891e-05 [none_parameter_eliminate]: 1.93002e-06 [renormalize]: 1.79978e-07 [switch_simplify]: 8.95999e-06 [partial_unused_args_eliminate]: 4.63999e-06 [add_recomputation]: 6.069e-05 [cse_after_recomputation]: 3.23e-05, [1] [Cycle 1]: 2.535e-05, [1] [cse]: 1.593e-05 [environ_conv]: 9.85002e-06 [swap_dp_allreduce_reducescatter]: 8.63001e-06 [bias_add_comm_swap]: 5.17e-06 [label_micro_interleaved_index]: 7.23999e-06 [label_fine_grained_interleaved_index]: 5.22e-06 [merge_cast_opt]: 3.66001e-06 [slice_recompute_activation]: 4.70001e-06 [micro_interleaved_order_control]: 4.74002e-06 [assign_add_opt]: 3.87002e-06 [ForceFp32Comm]: 3.3e-06 [remove_cast_before_assign_add]: 3.74002e-06 [full_micro_interleaved_order_control]: 4.50999e-06 [reorder_send_recv_between_fp_bp]: 5.12e-06 [comm_op_add_attrs]: 3.30003e-06 [add_comm_op_reuse_tag]: 3.20002e-06 [interleave_split_concat_branches]: 3.48e-06 [interleave_parallel_branches]: 3.69002e-06 [overlap_opt_shard_in_pipeline]: 3.6e-06 [overlap_opt_shard_grad_in_pipeline]: 4.30999e-06 [control_data_broadcast_order]: 1.839e-05 [grouped_pairwise_exchange_alltoall]: 4.10998e-06 [offloading_packed_experts]: 7.15998e-06 [overlap_recompute_and_grad_model_parallel]: 7.48e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.56001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.66999e-06 [overlap_recompute_comm]: 4.97e-06 [overlap_grad_ring_attention]: 7.04001e-06 [overlap_grad_flash_sp]: 2.565e-05 [begin_end_overlap_inline]: 3.05002e-06 [split_matmul_comm_elemetwise]: 4.68999e-06 [split_layernorm_comm]: 3.91999e-06 [handle_group_info]: 3.55e-06 [symbol_engine_optimizer]: 0.00010744, [1] [Cycle 1]: 0.00010003, [6] [build]: 3.57997e-06 [elim_shapecalc]: 1.165e-05 [elim_not_effective]: 1.677e-05 [opt_reshape]: 9.16998e-06 [fold_const_symbol]: 1.409e-05 [renormalize]: 1.69995e-07 [detach_backward]: 3.58e-06 [pipeline_parallel_scheduler]: 1.76e-06 [auto_monad_reorder]: 2.38e-05 [get_jit_bprop_graph]: 2.32001e-06 [rewriter_after_jit_bprop_graph]: 5.17e-06 [opt_after_jit_grad]: 0.0005331 [validate]: 4.792e-05 Sums bootstrap : 0.000443s : 3.51% type_inference : 0.005925s : 46.93% event_method : 0.000022s : 0.18% auto_monad : 0.000067s : 0.53% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.21% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000043s : 0.34% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000038s : 0.30% optimize.rewriter_before_opt_a : 0.000106s : 0.84% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000059s : 0.46% optimize.opt_a.loop_unroll : 0.000044s : 0.35% optimize.opt_a.a_1 : 0.001084s : 8.58% optimize.opt_a.with_stream_mark : 0.000031s : 0.25% optimize.opt_a.recompute_prepare : 0.000019s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000020s : 0.16% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000270s : 2.13% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.15% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000019s : 0.15% optimize.opt_a.merge_send_recv : 0.000021s : 0.17% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000025s : 0.20% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000011s : 0.09% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.18% optimize.opt_a.virtual_dataset : 0.000019s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.13% optimize.opt_a.virtual_output : 0.000016s : 0.13% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000022s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000031s : 0.24% optimize.opt_a.a_after_grad : 0.000027s : 0.21% optimize.opt_a.renormalize : 0.001001s : 7.93% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.08% optimize.opt_a.auto_monad_grad : 0.000004s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.27% optimize.opt_a.cse : 0.000062s : 0.49% optimize.opt_a.a_3 : 0.000145s : 1.15% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.14% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000051s : 0.41% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000667s : 5.28% optimize.opt_b.b_1 : 0.000229s : 1.81% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000026s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.19% optimize.overlap_param_gather : 0.000004s : 0.04% optimize.cconv : 0.000032s : 0.25% optimize.loop_unroll : 0.000485s : 3.84% optimize.opt_after_cconv.c_1 : 0.000043s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.cse : 0.000024s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.15% optimize.tuple_transform.d_1 : 0.000059s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000061s : 0.48% optimize.cse_after_recomputation.cse : 0.000016s : 0.13% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000018s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000026s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.19% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000533s : 4.22% validate : 0.000048s : 0.38% Time group info: ------[substitution.] 0.000250 44 9.64% : 0.000024s : 3: substitution.cast_eliminate 0.90% : 0.000002s : 3: substitution.elim_not_effective 0.83% : 0.000002s : 3: substitution.fold_const_symbol 2.93% : 0.000007s : 6: substitution.graph_param_transform 68.50% : 0.000171s : 4: substitution.inline 2.05% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.71% : 0.000007s : 6: substitution.remove_not_recompute_node 2.91% : 0.000007s : 6: substitution.replace_old_param 7.42% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.11% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005867 2 86.10% : 0.005052s : 1: type_inference.infer 13.90% : 0.000816s : 1: type_inference.specialize ------[replace.] 0.000075 10 54.01% : 0.000040s : 4: replace.inline 45.99% : 0.000034s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000185 10 91.43% : 0.000169s : 4: match.inline 8.57% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000297 1908 1.00% : 0.000003s : 20: predicate.accumulaten_eliminater 0.59% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.54% : 0.000002s : 12: predicate.addn_check_dump 0.96% : 0.000003s : 20: predicate.addn_zero_filter 0.83% : 0.000002s : 20: predicate.adjust_all_reduce_mul_add 2.04% : 0.000006s : 32: predicate.arithmetic_simplify 1.10% : 0.000003s : 20: predicate.cast_eliminate 0.63% : 0.000002s : 12: predicate.check_bprop_eliminate 0.57% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.60% : 0.000002s : 12: predicate.depend_value_elim 0.98% : 0.000003s : 20: predicate.dict_get_item_const_eliminator 1.06% : 0.000003s : 20: predicate.dict_get_item_eliminator 0.89% : 0.000003s : 20: predicate.dict_set_item_eliminator 0.92% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 6: predicate.elim_not_effective 0.32% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.39% : 0.000004s : 26: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.21% : 0.000004s : 26: predicate.environ_get_depend_swap 1.89% : 0.000006s : 38: predicate.environ_get_eliminate 1.18% : 0.000004s : 26: predicate.environ_get_set_eliminate 1.41% : 0.000004s : 30: predicate.exchange_switch_depend_value 2.27% : 0.000007s : 30: predicate.float_depend_g_call 0.63% : 0.000002s : 12: predicate.float_environ_get_switch 0.85% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.61% : 0.000002s : 12: predicate.get_grad_eliminate 0.19% : 0.000001s : 6: predicate.graph_param_transform 0.61% : 0.000002s : 12: predicate.incorporate_call 0.50% : 0.000002s : 12: predicate.incorporate_call_switch 6.09% : 0.000018s : 86: predicate.inline 0.73% : 0.000002s : 12: predicate.inline_without_move 0.33% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.76% : 0.000002s : 12: predicate.less_batch_normalization 1.93% : 0.000006s : 38: predicate.list_to_tuple_eliminator_ 2.74% : 0.000008s : 58: predicate.load_eliminater 0.79% : 0.000002s : 6: predicate.loop_unroll_after_grad 1.97% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.71% : 0.000005s : 32: predicate.make_slice_get_slice_eliminator 0.63% : 0.000002s : 12: predicate.merge_addn 0.57% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.55% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.89% : 0.000003s : 20: predicate.minmaximum_grad 0.89% : 0.000003s : 6: predicate.mutable_eliminate 0.31% : 0.000001s : 6: predicate.opt_reshape 0.37% : 0.000001s : 6: predicate.parallel_virtual_node 1.66% : 0.000005s : 30: predicate.partial_defer_inline 1.74% : 0.000005s : 32: predicate.partial_eliminate 1.05% : 0.000003s : 20: predicate.print_const_string_wrapper 0.57% : 0.000002s : 12: predicate.reduce_all_const_elim 1.27% : 0.000004s : 20: predicate.reduce_eliminate 2.79% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 12: predicate.remove_not_recompute_node 1.40% : 0.000004s : 38: predicate.replace_applicator 0.48% : 0.000001s : 12: predicate.replace_old_param 0.24% : 0.000001s : 6: predicate.reset_defer_inline 1.07% : 0.000003s : 20: predicate.reshape_eliminate 0.62% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 6: predicate.row_tensor_eliminate 0.72% : 0.000002s : 12: predicate.same_eliminate 0.44% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.84% : 0.000003s : 12: predicate.shard_identity_eliminate 0.65% : 0.000002s : 12: predicate.special_op_eliminate 0.68% : 0.000002s : 12: predicate.specialize_transform 0.82% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000003s : 12: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.55% : 0.000005s : 30: predicate.switch_defer_inline 2.07% : 0.000006s : 42: predicate.switch_layer_defer_inline 4.64% : 0.000014s : 90: predicate.switch_simplify 0.97% : 0.000003s : 20: predicate.tile_eliminate 1.00% : 0.000003s : 20: predicate.transpose_eliminate 1.52% : 0.000005s : 32: predicate.tuple_list_convert_item_index_to_positive 1.71% : 0.000005s : 32: predicate.tuple_list_get_item_const_eliminator 1.54% : 0.000005s : 32: predicate.tuple_list_get_item_depend_reorder 3.25% : 0.000010s : 50: predicate.tuple_list_get_item_eliminator 1.58% : 0.000005s : 32: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000007s : 44: predicate.tuple_list_set_item_eliminator 1.78% : 0.000005s : 38: predicate.tuple_to_list_eliminator_ 2.55% : 0.000008s : 58: predicate.updatestate_pure_node_eliminater 3.23% : 0.000010s : 70: predicate.updatestate_useless_node_eliminater 0.46% : 0.000001s : 6: predicate.value_based_eliminate 0.68% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 12: predicate.virtual_output_eliminate 0.28% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000586 11 50.70% : 0.000297s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.30% : 0.000289s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.031296 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.58% : 0.003624s : 1: add_attr 11.52% : 0.003605s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000064s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.25% : 0.000078s : 1: auto_monad 0.10% : 0.000032s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.56% : 0.000487s : 1: bootstrap 0.11% : 0.000035s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000022s : 1: control_data_broadcast_order 0.05% : 0.000015s : 1: convert_after_rewriter 0.11% : 0.000035s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000020s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.10% : 0.000031s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000009s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.57% : 0.000491s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 2.15% : 0.000673s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000020s : 1: opt.transform.mutable_eliminate 5.35% : 0.001675s : 78: opt.transform.opt_a 0.13% : 0.000042s : 1: opt.transform.opt_after_cconv 0.11% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.53% : 0.000164s : 28: opt.transform.opt_b 0.21% : 0.000066s : 2: opt.transform.opt_trans_graph 0.15% : 0.000048s : 4: opt.transform.symbol_engine_opt 12.86% : 0.004024s : 1: opt_a 0.49% : 0.000154s : 1: opt_after_cconv 1.74% : 0.000545s : 1: opt_after_jit_grad 1.13% : 0.000352s : 1: opt_b 22.53% : 0.007052s : 1: optimize 0.09% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.09% : 0.000029s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000007s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.16% : 0.000050s : 1: pre_auto_parallel 0.13% : 0.000042s : 1: py_interpret_to_execute 0.07% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.07% : 0.000022s : 1: remove_dup_value 1.88% : 0.000587s : 1: renormalize.infer 1.29% : 0.000404s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000055s : 1: rewriter_after_opt_a 0.35% : 0.000110s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.35% : 0.000111s : 1: symbol_engine_optimizer 0.35% : 0.000110s : 1: tuple_transform 19.07% : 0.005968s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:17.522.544 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0158014, [21] [bootstrap]: 0.00043976 [type_inference]: 0.00593405 [event_method]: 2.168e-05 [auto_monad]: 6.718e-05 [graph_reusing]: 6.28e-06 [inline]: 2.42001e-06 [add_attr]: 0.00317444, [1] [add_attr_with_inline]: 0.00316476, [1] [Cycle 1]: 6.48e-05, [2] [tag_attr]: 2.168e-05 [meta_addattr_fg_expand]: 6.78998e-06 [parallel-infer-symbol]: 3.4e-06 [pre_auto_parallel]: 3.703e-05 [insert-virtual-dataset]: 2.63e-06 [parallel-infer-symbol-second]: 1.02998e-06 [dataset_repeat_opt]: 2.39999e-06 [pipeline_split]: 1.70001e-06 [optimize]: 0.00543739, [53] [py_interpret_to_execute]: 3.028e-05 [rewriter_before_opt_a]: 9.293e-05 [opt_a]: 0.00331183, [2] [Cycle 1]: 0.00247198, [45] [expand_dump_flag]: 3.5e-06 [switch_simplify]: 4.818e-05 [loop_unroll]: 3.671e-05 [a_1]: 0.0008708 [with_stream_mark]: 2.03e-05 [recompute_prepare]: 1.343e-05 [updatestate_depend_eliminate]: 5.72001e-06 [updatestate_assign_eliminate]: 4.01001e-06 [updatestate_loads_eliminate]: 3.75e-06 [parameter_eliminate]: 1.84998e-06 [a_2]: 0.00011204 [accelerated_algorithm]: 9.27001e-06 [shard]: 2.54001e-06 [meta_shard_fg_expand]: 1.99999e-06 [shard_inline]: 8.35001e-06 [merge_send_recv]: 1.01e-05 [auto_parallel]: 7.97e-06 [parallel]: 2.08e-05 [flash_sp]: 9.68002e-06 [merge_comm]: 5.70001e-06 [allreduce_fusion]: 4.70999e-06 [matmul_add_comm_reduction]: 1.067e-05 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 1.206e-05 [virtual_dataset]: 8.78001e-06 [get_grad_eliminate_]: 8.22e-06 [virtual_output]: 8.86997e-06 [merge_forward]: 5.09e-06 [cell_reuse_recompute_pass]: 1.14998e-06 [offload_activation]: 1.118e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.945e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.461e-05 [set_forward_comm_id_for_comm_node_pass]: 6.05002e-06 [meta_fg_expand]: 3.52002e-06 [flash_sp_send_recv_attached]: 2.78003e-06 [receive_attached]: 2.44999e-06 [after_resolve]: 1.58e-05 [a_after_grad]: 1.329e-05 [renormalize]: 0.00073535 [add_forward_monad_depend]: 5.76e-06 [auto_monad_grad]: 2.32999e-06 [auto_monad_eliminator]: 1.618e-05 [cse]: 3.679e-05 [a_3]: 6.148e-05 [Cycle 2]: 0.00082827, [45] [expand_dump_flag]: 1.54e-06 [switch_simplify]: 9.77001e-06 [loop_unroll]: 8.35001e-06 [a_1]: 0.00020326 [with_stream_mark]: 1.258e-05 [recompute_prepare]: 8.60999e-06 [updatestate_depend_eliminate]: 4.18999e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 3.11001e-06 [parameter_eliminate]: 9.5999e-07 [a_2]: 0.00011311 [accelerated_algorithm]: 8.30999e-06 [shard]: 1.61002e-06 [meta_shard_fg_expand]: 1.81003e-06 [shard_inline]: 7.84002e-06 [merge_send_recv]: 6.36998e-06 [auto_parallel]: 7.11001e-06 [parallel]: 8.52e-06 [flash_sp]: 3.48e-06 [merge_comm]: 4.07e-06 [allreduce_fusion]: 4.02998e-06 [matmul_add_comm_reduction]: 7.75e-06 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 8.92e-06 [virtual_dataset]: 7.95e-06 [get_grad_eliminate_]: 7.66999e-06 [virtual_output]: 7.98001e-06 [merge_forward]: 3.85e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 7.99997e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.694e-05 [merge_recompute_call_nodes]: 7.29982e-07 [before_grad]: 1.285e-05 [set_forward_comm_id_for_comm_node_pass]: 4.62e-06 [meta_fg_expand]: 2.91999e-06 [flash_sp_send_recv_attached]: 1.32999e-06 [receive_attached]: 9.99979e-07 [after_resolve]: 1.318e-05 [a_after_grad]: 1.285e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.45001e-06 [auto_monad_grad]: 1.39e-06 [auto_monad_eliminator]: 8.60999e-06 [cse]: 1.82e-05 [a_3]: 4.997e-05 [py_interpret_to_execute_after_opt_a]: 1.115e-05 [slice_cell_reuse_recomputed_activation]: 2.38998e-06 [rewriter_after_opt_a]: 4.136e-05 [convert_after_rewriter]: 7.77e-06 [order_py_execute_after_rewriter]: 6.14999e-06 [mutable_eliminate]: 0.00050042 [opt_b]: 0.00026573, [1] [Cycle 1]: 0.00025968, [7] [b_1]: 0.00017793 [b_2]: 1.053e-05 [updatestate_depend_eliminate]: 5.79e-06 [updatestate_assign_eliminate]: 3.22002e-06 [updatestate_loads_eliminate]: 3.58e-06 [renormalize]: 5.00004e-07 [cse]: 2.272e-05 [optimize_parallel_all_gather_comm]: 1.706e-05 [overlap_param_gather]: 2.53e-06 [cconv]: 2.45e-05 [loop_unroll]: 0.0004142 [opt_after_cconv]: 0.00011974, [1] [Cycle 1]: 0.00011442, [7] [c_1]: 4.244e-05 [parameter_eliminate]: 2.32999e-06 [updatestate_depend_eliminate]: 5.96998e-06 [updatestate_assign_eliminate]: 3.35e-06 [updatestate_loads_eliminate]: 3.03e-06 [cse]: 2.267e-05 [renormalize]: 3.19997e-07 [remove_dup_value]: 1.483e-05 [tuple_transform]: 9.13e-05, [1] [Cycle 1]: 8.699e-05, [4] [d_1]: 5.745e-05 [none_parameter_eliminate]: 1.97999e-06 [renormalize]: 2.9002e-07 [switch_simplify]: 8.79998e-06 [partial_unused_args_eliminate]: 1.71e-06 [add_recomputation]: 5.35e-05 [cse_after_recomputation]: 2.511e-05, [1] [Cycle 1]: 2.074e-05, [1] [cse]: 1.544e-05 [environ_conv]: 6.39001e-06 [swap_dp_allreduce_reducescatter]: 5.79e-06 [bias_add_comm_swap]: 2.37999e-06 [label_micro_interleaved_index]: 4.11001e-06 [label_fine_grained_interleaved_index]: 2.55002e-06 [merge_cast_opt]: 1.22e-06 [slice_recompute_activation]: 2.17001e-06 [micro_interleaved_order_control]: 2.39001e-06 [assign_add_opt]: 1.18001e-06 [ForceFp32Comm]: 8.70001e-07 [remove_cast_before_assign_add]: 1.49e-06 [full_micro_interleaved_order_control]: 2.04999e-06 [reorder_send_recv_between_fp_bp]: 2.63e-06 [comm_op_add_attrs]: 1.01997e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.48002e-06 [overlap_opt_shard_grad_in_pipeline]: 1.92999e-06 [control_data_broadcast_order]: 1.441e-05 [grouped_pairwise_exchange_alltoall]: 1.52999e-06 [offloading_packed_experts]: 4.76002e-06 [overlap_recompute_and_grad_model_parallel]: 5.40999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.24e-06 [overlap_recompute_allgather_and_fa_grad]: 1.59e-06 [overlap_recompute_comm]: 2.21e-06 [overlap_grad_ring_attention]: 4.62e-06 [overlap_grad_flash_sp]: 2.085e-05 [begin_end_overlap_inline]: 6.00005e-07 [split_matmul_comm_elemetwise]: 2.03002e-06 [split_layernorm_comm]: 1.59e-06 [handle_group_info]: 8.90024e-07 [symbol_engine_optimizer]: 8.369e-05, [1] [Cycle 1]: 7.955e-05, [6] [build]: 3.18998e-06 [elim_shapecalc]: 1.165e-05 [elim_not_effective]: 1.494e-05 [opt_reshape]: 9.43002e-06 [fold_const_symbol]: 1.288e-05 [renormalize]: 2.00002e-07 [detach_backward]: 1.78002e-06 [pipeline_parallel_scheduler]: 1.35001e-06 [auto_monad_reorder]: 1.991e-05 [get_jit_bprop_graph]: 1.60999e-06 [rewriter_after_jit_bprop_graph]: 3.56999e-06 [opt_after_jit_grad]: 0.00045302 [validate]: 4.006e-05 Sums bootstrap : 0.000440s : 3.77% type_inference : 0.005934s : 50.88% event_method : 0.000022s : 0.19% auto_monad : 0.000067s : 0.58% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000037s : 0.32% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.26% optimize.rewriter_before_opt_a : 0.000093s : 0.80% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000058s : 0.50% optimize.opt_a.loop_unroll : 0.000045s : 0.39% optimize.opt_a.a_1 : 0.001074s : 9.21% optimize.opt_a.with_stream_mark : 0.000033s : 0.28% optimize.opt_a.recompute_prepare : 0.000022s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000225s : 1.93% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.15% optimize.opt_a.shard : 0.000004s : 0.04% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000016s : 0.14% optimize.opt_a.merge_send_recv : 0.000016s : 0.14% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000029s : 0.25% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.18% optimize.opt_a.virtual_dataset : 0.000017s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.14% optimize.opt_a.virtual_output : 0.000017s : 0.14% optimize.opt_a.merge_forward : 0.000009s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.02% optimize.opt_a.offload_activation : 0.000019s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000006s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.25% optimize.opt_a.a_after_grad : 0.000026s : 0.22% optimize.opt_a.renormalize : 0.000735s : 6.31% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.21% optimize.opt_a.cse : 0.000055s : 0.47% optimize.opt_a.a_3 : 0.000111s : 0.96% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000041s : 0.35% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000500s : 4.29% optimize.opt_b.b_1 : 0.000178s : 1.53% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000023s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.15% optimize.overlap_param_gather : 0.000003s : 0.02% optimize.cconv : 0.000024s : 0.21% optimize.loop_unroll : 0.000414s : 3.55% optimize.opt_after_cconv.c_1 : 0.000042s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.13% optimize.tuple_transform.d_1 : 0.000057s : 0.49% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000053s : 0.46% optimize.cse_after_recomputation.cse : 0.000015s : 0.13% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000014s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000021s : 0.18% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000020s : 0.17% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000453s : 3.88% validate : 0.000040s : 0.34% Time group info: ------[substitution.] 0.000241 44 10.88% : 0.000026s : 3: substitution.cast_eliminate 0.87% : 0.000002s : 3: substitution.elim_not_effective 0.77% : 0.000002s : 3: substitution.fold_const_symbol 3.01% : 0.000007s : 6: substitution.graph_param_transform 66.42% : 0.000160s : 4: substitution.inline 2.06% : 0.000005s : 6: substitution.j_node_and_user_rematch 3.40% : 0.000008s : 6: substitution.remove_not_recompute_node 2.22% : 0.000005s : 6: substitution.replace_old_param 8.06% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.30% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005866 2 86.54% : 0.005077s : 1: type_inference.infer 13.46% : 0.000789s : 1: type_inference.specialize ------[replace.] 0.000078 10 51.20% : 0.000040s : 4: replace.inline 48.80% : 0.000038s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000174 10 90.34% : 0.000157s : 4: match.inline 9.66% : 0.000017s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000285 1908 0.96% : 0.000003s : 20: predicate.accumulaten_eliminater 0.62% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 12: predicate.addn_check_dump 0.98% : 0.000003s : 20: predicate.addn_zero_filter 0.87% : 0.000002s : 20: predicate.adjust_all_reduce_mul_add 1.88% : 0.000005s : 32: predicate.arithmetic_simplify 1.08% : 0.000003s : 20: predicate.cast_eliminate 0.59% : 0.000002s : 12: predicate.check_bprop_eliminate 0.54% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.68% : 0.000002s : 12: predicate.depend_value_elim 1.09% : 0.000003s : 20: predicate.dict_get_item_const_eliminator 1.02% : 0.000003s : 20: predicate.dict_get_item_eliminator 0.95% : 0.000003s : 20: predicate.dict_set_item_eliminator 0.82% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 6: predicate.elim_not_effective 0.37% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 26: predicate.environ_add_const_eliminate 1.19% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.19% : 0.000003s : 26: predicate.environ_get_depend_swap 1.73% : 0.000005s : 38: predicate.environ_get_eliminate 1.22% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.47% : 0.000004s : 30: predicate.exchange_switch_depend_value 2.36% : 0.000007s : 30: predicate.float_depend_g_call 0.53% : 0.000002s : 12: predicate.float_environ_get_switch 0.82% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.64% : 0.000002s : 12: predicate.get_grad_eliminate 0.22% : 0.000001s : 6: predicate.graph_param_transform 0.59% : 0.000002s : 12: predicate.incorporate_call 0.49% : 0.000001s : 12: predicate.incorporate_call_switch 6.27% : 0.000018s : 86: predicate.inline 0.76% : 0.000002s : 12: predicate.inline_without_move 0.35% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.78% : 0.000002s : 12: predicate.less_batch_normalization 1.89% : 0.000005s : 38: predicate.list_to_tuple_eliminator_ 2.81% : 0.000008s : 58: predicate.load_eliminater 0.73% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.09% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.70% : 0.000005s : 32: predicate.make_slice_get_slice_eliminator 0.56% : 0.000002s : 12: predicate.merge_addn 0.57% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.60% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.90% : 0.000003s : 20: predicate.minmaximum_grad 0.79% : 0.000002s : 6: predicate.mutable_eliminate 0.47% : 0.000001s : 6: predicate.opt_reshape 0.34% : 0.000001s : 6: predicate.parallel_virtual_node 1.68% : 0.000005s : 30: predicate.partial_defer_inline 1.79% : 0.000005s : 32: predicate.partial_eliminate 1.05% : 0.000003s : 20: predicate.print_const_string_wrapper 0.58% : 0.000002s : 12: predicate.reduce_all_const_elim 1.19% : 0.000003s : 20: predicate.reduce_eliminate 2.67% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 12: predicate.remove_not_recompute_node 1.41% : 0.000004s : 38: predicate.replace_applicator 0.51% : 0.000001s : 12: predicate.replace_old_param 0.25% : 0.000001s : 6: predicate.reset_defer_inline 0.98% : 0.000003s : 20: predicate.reshape_eliminate 0.61% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 6: predicate.row_tensor_eliminate 0.65% : 0.000002s : 12: predicate.same_eliminate 0.54% : 0.000002s : 12: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 12: predicate.shard_identity_eliminate 0.65% : 0.000002s : 12: predicate.special_op_eliminate 0.68% : 0.000002s : 12: predicate.specialize_transform 0.74% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.56% : 0.000004s : 30: predicate.switch_defer_inline 2.06% : 0.000006s : 42: predicate.switch_layer_defer_inline 4.80% : 0.000014s : 90: predicate.switch_simplify 0.97% : 0.000003s : 20: predicate.tile_eliminate 0.92% : 0.000003s : 20: predicate.transpose_eliminate 1.55% : 0.000004s : 32: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000005s : 32: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000004s : 32: predicate.tuple_list_get_item_depend_reorder 3.69% : 0.000011s : 50: predicate.tuple_list_get_item_eliminator 1.49% : 0.000004s : 32: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000006s : 44: predicate.tuple_list_set_item_eliminator 1.79% : 0.000005s : 38: predicate.tuple_to_list_eliminator_ 2.62% : 0.000007s : 58: predicate.updatestate_pure_node_eliminater 3.38% : 0.000010s : 70: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 6: predicate.value_based_eliminate 0.60% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.69% : 0.000002s : 12: predicate.virtual_output_eliminate 0.30% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000580 11 52.02% : 0.000302s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.98% : 0.000278s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027055 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.75% : 0.003180s : 1: add_attr 11.71% : 0.003169s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.21% : 0.000057s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.27% : 0.000074s : 1: auto_monad 0.09% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.72% : 0.000467s : 1: bootstrap 0.10% : 0.000028s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.10% : 0.000028s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.11% : 0.000029s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000005s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.56% : 0.000422s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.88% : 0.000508s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 6.15% : 0.001663s : 78: opt.transform.opt_a 0.15% : 0.000041s : 1: opt.transform.opt_after_cconv 0.11% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.58% : 0.000157s : 28: opt.transform.opt_b 0.24% : 0.000064s : 2: opt.transform.opt_trans_graph 0.17% : 0.000045s : 4: opt.transform.symbol_engine_opt 12.25% : 0.003315s : 1: opt_a 0.46% : 0.000123s : 1: opt_after_cconv 1.71% : 0.000462s : 1: opt_after_jit_grad 0.99% : 0.000269s : 1: opt_b 20.11% : 0.005442s : 1: optimize 0.08% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000024s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000041s : 1: pre_auto_parallel 0.13% : 0.000035s : 1: py_interpret_to_execute 0.05% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000018s : 1: remove_dup_value 1.47% : 0.000397s : 1: renormalize.infer 1.22% : 0.000330s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000045s : 1: rewriter_after_opt_a 0.36% : 0.000098s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000086s : 1: symbol_engine_optimizer 0.35% : 0.000094s : 1: tuple_transform 22.00% : 0.005952s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:17.713.886 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:17.714.134 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0167871, [21] [bootstrap]: 0.00045076 [type_inference]: 0.00580166 [event_method]: 2.063e-05 [auto_monad]: 6.671e-05 [graph_reusing]: 6.17001e-06 [inline]: 1.96998e-06 [add_attr]: 0.0030433, [1] [add_attr_with_inline]: 0.00303502, [1] [Cycle 1]: 7.066e-05, [2] [tag_attr]: 2.057e-05 [meta_addattr_fg_expand]: 6.95998e-06 [parallel-infer-symbol]: 3.35e-06 [pre_auto_parallel]: 3.476e-05 [insert-virtual-dataset]: 2.35002e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.83002e-06 [pipeline_split]: 1.60001e-06 [optimize]: 0.00623422, [53] [py_interpret_to_execute]: 3.292e-05 [rewriter_before_opt_a]: 0.00010138 [opt_a]: 0.00372076, [2] [Cycle 1]: 0.0026423, [45] [expand_dump_flag]: 3.21001e-06 [switch_simplify]: 4.791e-05 [loop_unroll]: 3.693e-05 [a_1]: 0.00088699 [with_stream_mark]: 1.707e-05 [recompute_prepare]: 1.174e-05 [updatestate_depend_eliminate]: 5.42001e-06 [updatestate_assign_eliminate]: 5.05999e-06 [updatestate_loads_eliminate]: 4.32e-06 [parameter_eliminate]: 1.79998e-06 [a_2]: 0.00015648 [accelerated_algorithm]: 1.041e-05 [shard]: 1.97999e-06 [meta_shard_fg_expand]: 2.08002e-06 [shard_inline]: 9.77001e-06 [merge_send_recv]: 1.015e-05 [auto_parallel]: 7.89002e-06 [parallel]: 1.785e-05 [flash_sp]: 8.30999e-06 [merge_comm]: 6.29999e-06 [allreduce_fusion]: 5.08002e-06 [matmul_add_comm_reduction]: 1.146e-05 [allreduce_slice_to_reducescatter]: 9.50007e-07 [virtual_shard_identity]: 1.134e-05 [virtual_dataset]: 1.015e-05 [get_grad_eliminate_]: 9.29e-06 [virtual_output]: 9.55001e-06 [merge_forward]: 5.32999e-06 [cell_reuse_recompute_pass]: 1.10001e-06 [offload_activation]: 1.165e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.992e-05 [merge_recompute_call_nodes]: 1.49998e-06 [before_grad]: 1.639e-05 [set_forward_comm_id_for_comm_node_pass]: 5.54e-06 [meta_fg_expand]: 3.95998e-06 [flash_sp_send_recv_attached]: 2.42001e-06 [receive_attached]: 2.33998e-06 [after_resolve]: 1.64e-05 [a_after_grad]: 1.505e-05 [renormalize]: 0.0006916 [add_forward_monad_depend]: 5.64998e-06 [auto_monad_grad]: 2.09e-06 [auto_monad_eliminator]: 1.872e-05 [cse]: 4.939e-05 [a_3]: 8.361e-05 [Cycle 2]: 0.00106535, [45] [expand_dump_flag]: 1.19003e-06 [switch_simplify]: 1.083e-05 [loop_unroll]: 9.81e-06 [a_1]: 0.00023843 [with_stream_mark]: 1.346e-05 [recompute_prepare]: 9.31e-06 [updatestate_depend_eliminate]: 4.43999e-06 [updatestate_assign_eliminate]: 3.68e-06 [updatestate_loads_eliminate]: 3.56999e-06 [parameter_eliminate]: 1.20999e-06 [a_2]: 0.00014513 [accelerated_algorithm]: 9.11998e-06 [shard]: 1.49e-06 [meta_shard_fg_expand]: 1.94999e-06 [shard_inline]: 9.07001e-06 [merge_send_recv]: 6.94001e-06 [auto_parallel]: 6.93e-06 [parallel]: 4.25e-06 [flash_sp]: 3.46001e-06 [merge_comm]: 4.85001e-06 [allreduce_fusion]: 4.62e-06 [matmul_add_comm_reduction]: 7.97e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 9.97999e-06 [virtual_dataset]: 9.34e-06 [get_grad_eliminate_]: 9.52001e-06 [virtual_output]: 8.92999e-06 [merge_forward]: 4.27998e-06 [cell_reuse_recompute_pass]: 1.46002e-06 [offload_activation]: 8.18999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.863e-05 [merge_recompute_call_nodes]: 1.13001e-06 [before_grad]: 1.449e-05 [set_forward_comm_id_for_comm_node_pass]: 5.09e-06 [meta_fg_expand]: 3.55e-06 [flash_sp_send_recv_attached]: 8.30012e-07 [receive_attached]: 1.02e-06 [after_resolve]: 1.444e-05 [a_after_grad]: 1.403e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.35999e-06 [auto_monad_grad]: 9.49978e-07 [auto_monad_eliminator]: 9.71998e-06 [cse]: 2.427e-05 [a_3]: 7.366e-05 [py_interpret_to_execute_after_opt_a]: 1.482e-05 [slice_cell_reuse_recomputed_activation]: 4.72e-06 [rewriter_after_opt_a]: 4.773e-05 [convert_after_rewriter]: 1.153e-05 [order_py_execute_after_rewriter]: 9.04003e-06 [mutable_eliminate]: 0.00048656 [opt_b]: 0.00037391, [1] [Cycle 1]: 0.00036404, [7] [b_1]: 0.00025109 [b_2]: 1.122e-05 [updatestate_depend_eliminate]: 6.69999e-06 [updatestate_assign_eliminate]: 3.76999e-06 [updatestate_loads_eliminate]: 4.2e-06 [renormalize]: 4.2998e-07 [cse]: 3.002e-05 [optimize_parallel_all_gather_comm]: 2.331e-05 [overlap_param_gather]: 4.92e-06 [cconv]: 2.567e-05 [loop_unroll]: 0.00043294 [opt_after_cconv]: 0.00015995, [1] [Cycle 1]: 0.00015135, [7] [c_1]: 5.003e-05 [parameter_eliminate]: 2.28002e-06 [updatestate_depend_eliminate]: 6.77002e-06 [updatestate_assign_eliminate]: 3.89002e-06 [updatestate_loads_eliminate]: 3.56999e-06 [cse]: 2.786e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 3.967e-05 [tuple_transform]: 0.00011693, [1] [Cycle 1]: 0.00010971, [4] [d_1]: 6.668e-05 [none_parameter_eliminate]: 1.77999e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 1.008e-05 [partial_unused_args_eliminate]: 4.23001e-06 [add_recomputation]: 6.115e-05 [cse_after_recomputation]: 3.468e-05, [1] [Cycle 1]: 2.761e-05, [1] [cse]: 1.841e-05 [environ_conv]: 9.22999e-06 [swap_dp_allreduce_reducescatter]: 9.20001e-06 [bias_add_comm_swap]: 5.05001e-06 [label_micro_interleaved_index]: 6.75998e-06 [label_fine_grained_interleaved_index]: 5.02999e-06 [merge_cast_opt]: 3.57997e-06 [slice_recompute_activation]: 4.52998e-06 [micro_interleaved_order_control]: 4.88001e-06 [assign_add_opt]: 3.45e-06 [ForceFp32Comm]: 3.16999e-06 [remove_cast_before_assign_add]: 3.21999e-06 [full_micro_interleaved_order_control]: 4.50999e-06 [reorder_send_recv_between_fp_bp]: 5.10999e-06 [comm_op_add_attrs]: 3.5e-06 [add_comm_op_reuse_tag]: 3.46999e-06 [interleave_split_concat_branches]: 3.58e-06 [interleave_parallel_branches]: 3.55e-06 [overlap_opt_shard_in_pipeline]: 3.37002e-06 [overlap_opt_shard_grad_in_pipeline]: 4.03999e-06 [control_data_broadcast_order]: 1.872e-05 [grouped_pairwise_exchange_alltoall]: 3.72002e-06 [offloading_packed_experts]: 7.46999e-06 [overlap_recompute_and_grad_model_parallel]: 7.63001e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.5e-06 [overlap_recompute_allgather_and_fa_grad]: 3.9e-06 [overlap_recompute_comm]: 4.52e-06 [overlap_grad_ring_attention]: 7.66001e-06 [overlap_grad_flash_sp]: 4.45e-05 [begin_end_overlap_inline]: 3.01999e-06 [split_matmul_comm_elemetwise]: 4.89e-06 [split_layernorm_comm]: 4.25e-06 [handle_group_info]: 3.25e-06 [symbol_engine_optimizer]: 0.0001128, [1] [Cycle 1]: 0.00010568, [6] [build]: 3.10002e-06 [elim_shapecalc]: 1.353e-05 [elim_not_effective]: 1.828e-05 [opt_reshape]: 1.011e-05 [fold_const_symbol]: 1.52e-05 [renormalize]: 2.80008e-07 [detach_backward]: 3.25998e-06 [pipeline_parallel_scheduler]: 1.94e-06 [auto_monad_reorder]: 2.315e-05 [get_jit_bprop_graph]: 1.20999e-06 [rewriter_after_jit_bprop_graph]: 4.19997e-06 [opt_after_jit_grad]: 0.00048359 [validate]: 3.98e-05 Sums bootstrap : 0.000451s : 3.74% type_inference : 0.005802s : 48.17% event_method : 0.000021s : 0.17% auto_monad : 0.000067s : 0.55% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000035s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000033s : 0.27% optimize.rewriter_before_opt_a : 0.000101s : 0.84% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000059s : 0.49% optimize.opt_a.loop_unroll : 0.000047s : 0.39% optimize.opt_a.a_1 : 0.001125s : 9.34% optimize.opt_a.with_stream_mark : 0.000031s : 0.25% optimize.opt_a.recompute_prepare : 0.000021s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000302s : 2.50% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.16% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000019s : 0.16% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000022s : 0.18% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000011s : 0.09% optimize.opt_a.allreduce_fusion : 0.000010s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.18% optimize.opt_a.virtual_dataset : 0.000019s : 0.16% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.16% optimize.opt_a.virtual_output : 0.000018s : 0.15% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.32% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000031s : 0.26% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.09% optimize.opt_a.meta_fg_expand : 0.000008s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000031s : 0.26% optimize.opt_a.a_after_grad : 0.000029s : 0.24% optimize.opt_a.renormalize : 0.000692s : 5.74% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.24% optimize.opt_a.cse : 0.000074s : 0.61% optimize.opt_a.a_3 : 0.000157s : 1.31% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000048s : 0.40% optimize.convert_after_rewriter : 0.000012s : 0.10% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000487s : 4.04% optimize.opt_b.b_1 : 0.000251s : 2.08% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000030s : 0.25% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.19% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000026s : 0.21% optimize.loop_unroll : 0.000433s : 3.59% optimize.opt_after_cconv.c_1 : 0.000050s : 0.42% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000028s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000040s : 0.33% optimize.tuple_transform.d_1 : 0.000067s : 0.55% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000061s : 0.51% optimize.cse_after_recomputation.cse : 0.000018s : 0.15% optimize.environ_conv : 0.000009s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000003s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000019s : 0.16% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000008s : 0.06% optimize.overlap_grad_flash_sp : 0.000045s : 0.37% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000023s : 0.19% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000484s : 4.02% validate : 0.000040s : 0.33% Time group info: ------[substitution.] 0.000238 54 11.43% : 0.000027s : 6: substitution.cast_eliminate 1.03% : 0.000002s : 4: substitution.elim_not_effective 0.95% : 0.000002s : 4: substitution.fold_const_symbol 3.28% : 0.000008s : 7: substitution.graph_param_transform 64.90% : 0.000154s : 4: substitution.inline 2.46% : 0.000006s : 8: substitution.j_node_and_user_rematch 3.11% : 0.000007s : 8: substitution.remove_not_recompute_node 2.35% : 0.000006s : 6: substitution.replace_old_param 8.08% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.40% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005752 2 86.99% : 0.005003s : 1: type_inference.infer 13.01% : 0.000748s : 1: type_inference.specialize ------[replace.] 0.000072 10 53.42% : 0.000038s : 4: replace.inline 46.58% : 0.000034s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000168 10 90.29% : 0.000152s : 4: match.inline 9.71% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000321 2134 0.92% : 0.000003s : 22: predicate.accumulaten_eliminater 0.65% : 0.000002s : 7: predicate.ad_related_special_op_eliminate 0.59% : 0.000002s : 14: predicate.addn_check_dump 0.93% : 0.000003s : 22: predicate.addn_zero_filter 0.91% : 0.000003s : 22: predicate.adjust_all_reduce_mul_add 2.12% : 0.000007s : 36: predicate.arithmetic_simplify 1.04% : 0.000003s : 22: predicate.cast_eliminate 0.64% : 0.000002s : 14: predicate.check_bprop_eliminate 0.60% : 0.000002s : 14: predicate.compare_switch_simplify 0.20% : 0.000001s : 7: predicate.const_output_eliminate 0.63% : 0.000002s : 14: predicate.depend_value_elim 1.07% : 0.000003s : 22: predicate.dict_get_item_const_eliminator 1.09% : 0.000004s : 22: predicate.dict_get_item_eliminator 0.94% : 0.000003s : 22: predicate.dict_set_item_eliminator 0.87% : 0.000003s : 14: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 7: predicate.elim_not_effective 0.40% : 0.000001s : 7: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000004s : 29: predicate.environ_add_const_eliminate 1.16% : 0.000004s : 29: predicate.environ_get_add_eliminate 1.20% : 0.000004s : 29: predicate.environ_get_depend_swap 1.81% : 0.000006s : 43: predicate.environ_get_eliminate 1.18% : 0.000004s : 29: predicate.environ_get_set_eliminate 1.37% : 0.000004s : 32: predicate.exchange_switch_depend_value 2.23% : 0.000007s : 32: predicate.float_depend_g_call 0.58% : 0.000002s : 14: predicate.float_environ_get_switch 0.86% : 0.000003s : 21: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 7: predicate.fold_const_symbol 0.67% : 0.000002s : 14: predicate.get_grad_eliminate 0.21% : 0.000001s : 7: predicate.graph_param_transform 0.65% : 0.000002s : 14: predicate.incorporate_call 0.55% : 0.000002s : 14: predicate.incorporate_call_switch 6.13% : 0.000020s : 96: predicate.inline 0.81% : 0.000003s : 14: predicate.inline_without_move 0.33% : 0.000001s : 14: predicate.j_node_and_user_rematch 0.76% : 0.000002s : 14: predicate.less_batch_normalization 1.94% : 0.000006s : 42: predicate.list_to_tuple_eliminator_ 2.61% : 0.000008s : 64: predicate.load_eliminater 0.69% : 0.000002s : 7: predicate.loop_unroll_after_grad 1.97% : 0.000006s : 44: predicate.loop_unroll_before_grad 1.69% : 0.000005s : 36: predicate.make_slice_get_slice_eliminator 0.65% : 0.000002s : 14: predicate.merge_addn 0.59% : 0.000002s : 14: predicate.micro_step_allgather_replace 0.63% : 0.000002s : 14: predicate.mini_step_allgather_replace 0.91% : 0.000003s : 22: predicate.minmaximum_grad 0.77% : 0.000002s : 7: predicate.mutable_eliminate 0.33% : 0.000001s : 7: predicate.opt_reshape 0.37% : 0.000001s : 7: predicate.parallel_virtual_node 1.68% : 0.000005s : 32: predicate.partial_defer_inline 1.78% : 0.000006s : 35: predicate.partial_eliminate 0.93% : 0.000003s : 22: predicate.print_const_string_wrapper 0.65% : 0.000002s : 14: predicate.reduce_all_const_elim 1.22% : 0.000004s : 22: predicate.reduce_eliminate 2.74% : 0.000009s : 64: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 14: predicate.remove_not_recompute_node 1.40% : 0.000004s : 42: predicate.replace_applicator 0.42% : 0.000001s : 14: predicate.replace_old_param 0.24% : 0.000001s : 7: predicate.reset_defer_inline 0.98% : 0.000003s : 22: predicate.reshape_eliminate 0.64% : 0.000002s : 14: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 7: predicate.row_tensor_eliminate 0.77% : 0.000002s : 14: predicate.same_eliminate 0.45% : 0.000001s : 14: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 14: predicate.shard_identity_eliminate 0.78% : 0.000003s : 14: predicate.special_op_eliminate 0.77% : 0.000002s : 14: predicate.specialize_transform 0.80% : 0.000003s : 14: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 14: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 7: predicate.switch_call_monad_eliminater 1.47% : 0.000005s : 32: predicate.switch_defer_inline 2.07% : 0.000007s : 46: predicate.switch_layer_defer_inline 4.64% : 0.000015s : 97: predicate.switch_simplify 0.91% : 0.000003s : 22: predicate.tile_eliminate 0.91% : 0.000003s : 22: predicate.transpose_eliminate 1.58% : 0.000005s : 36: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000005s : 36: predicate.tuple_list_get_item_const_eliminator 1.57% : 0.000005s : 36: predicate.tuple_list_get_item_depend_reorder 3.19% : 0.000010s : 56: predicate.tuple_list_get_item_eliminator 1.63% : 0.000005s : 36: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000007s : 50: predicate.tuple_list_set_item_eliminator 1.90% : 0.000006s : 42: predicate.tuple_to_list_eliminator_ 2.62% : 0.000008s : 64: predicate.updatestate_pure_node_eliminater 3.36% : 0.000011s : 78: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 7: predicate.value_based_eliminate 0.64% : 0.000002s : 14: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 14: predicate.virtual_output_eliminate 0.30% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 7: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000533 11 53.60% : 0.000286s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.40% : 0.000247s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028850 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.58% : 0.003052s : 1: add_attr 10.53% : 0.003039s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000064s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000076s : 1: auto_monad 0.10% : 0.000030s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.71% : 0.000495s : 1: bootstrap 0.10% : 0.000029s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000022s : 1: control_data_broadcast_order 0.05% : 0.000015s : 1: convert_after_rewriter 0.13% : 0.000038s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000017s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.10% : 0.000030s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000007s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.52% : 0.000439s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.71% : 0.000493s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 6.19% : 0.001785s : 78: opt.transform.opt_a 0.17% : 0.000049s : 1: opt.transform.opt_after_cconv 0.12% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.66% : 0.000190s : 28: opt.transform.opt_b 0.26% : 0.000074s : 2: opt.transform.opt_trans_graph 0.19% : 0.000053s : 4: opt.transform.symbol_engine_opt 12.91% : 0.003724s : 1: opt_a 0.57% : 0.000164s : 1: opt_after_cconv 1.71% : 0.000494s : 1: opt_after_jit_grad 1.31% : 0.000377s : 1: opt_b 22.74% : 0.006560s : 1: optimize 0.09% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.17% : 0.000048s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000011s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000042s : 1: pre_auto_parallel 0.13% : 0.000037s : 1: py_interpret_to_execute 0.06% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.15% : 0.000043s : 1: remove_dup_value 1.28% : 0.000368s : 1: renormalize.infer 1.09% : 0.000315s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000051s : 1: rewriter_after_opt_a 0.36% : 0.000105s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.40% : 0.000116s : 1: symbol_engine_optimizer 0.42% : 0.000120s : 1: tuple_transform 20.23% : 0.005835s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:17.908.976 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0157453, [21] [bootstrap]: 0.00043656 [type_inference]: 0.00575671 [event_method]: 2.118e-05 [auto_monad]: 6.745e-05 [graph_reusing]: 5.92999e-06 [inline]: 2.12999e-06 [add_attr]: 0.00299072, [1] [add_attr_with_inline]: 0.00298252, [1] [Cycle 1]: 5.625e-05, [2] [tag_attr]: 2.069e-05 [meta_addattr_fg_expand]: 6.29999e-06 [parallel-infer-symbol]: 2.96001e-06 [pre_auto_parallel]: 3.362e-05 [insert-virtual-dataset]: 2.71999e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 1.72999e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.00573756, [53] [py_interpret_to_execute]: 2.813e-05 [rewriter_before_opt_a]: 9.42e-05 [opt_a]: 0.00346157, [2] [Cycle 1]: 0.00254353, [45] [expand_dump_flag]: 2.74999e-06 [switch_simplify]: 4.826e-05 [loop_unroll]: 3.667e-05 [a_1]: 0.00090894 [with_stream_mark]: 1.598e-05 [recompute_prepare]: 1.178e-05 [updatestate_depend_eliminate]: 5.15001e-06 [updatestate_assign_eliminate]: 4.70999e-06 [updatestate_loads_eliminate]: 4.28001e-06 [parameter_eliminate]: 1.81e-06 [a_2]: 0.00013386 [accelerated_algorithm]: 1.134e-05 [shard]: 1.99e-06 [meta_shard_fg_expand]: 2.20002e-06 [shard_inline]: 1.017e-05 [merge_send_recv]: 1.023e-05 [auto_parallel]: 7.75998e-06 [parallel]: 1.914e-05 [flash_sp]: 8.45001e-06 [merge_comm]: 5.67001e-06 [allreduce_fusion]: 4.97e-06 [matmul_add_comm_reduction]: 1.064e-05 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 1.066e-05 [virtual_dataset]: 9.66e-06 [get_grad_eliminate_]: 9.36e-06 [virtual_output]: 9.29e-06 [merge_forward]: 5.20999e-06 [cell_reuse_recompute_pass]: 1.13001e-06 [offload_activation]: 1.141e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.769e-05 [merge_recompute_call_nodes]: 1.76e-06 [before_grad]: 1.602e-05 [set_forward_comm_id_for_comm_node_pass]: 5.37999e-06 [meta_fg_expand]: 4.44002e-06 [flash_sp_send_recv_attached]: 2.55002e-06 [receive_attached]: 2.64001e-06 [after_resolve]: 1.543e-05 [a_after_grad]: 1.522e-05 [renormalize]: 0.00074629 [add_forward_monad_depend]: 5.97001e-06 [auto_monad_grad]: 1.92999e-06 [auto_monad_eliminator]: 1.782e-05 [cse]: 4.918e-05 [a_3]: 6.966e-05 [Cycle 2]: 0.00090742, [45] [expand_dump_flag]: 1.15001e-06 [switch_simplify]: 1.097e-05 [loop_unroll]: 9.62001e-06 [a_1]: 0.00023962 [with_stream_mark]: 1.337e-05 [recompute_prepare]: 9.65002e-06 [updatestate_depend_eliminate]: 4.75999e-06 [updatestate_assign_eliminate]: 3.48e-06 [updatestate_loads_eliminate]: 3.61001e-06 [parameter_eliminate]: 1.07e-06 [a_2]: 0.00011742 [accelerated_algorithm]: 9.36e-06 [shard]: 1.44998e-06 [meta_shard_fg_expand]: 1.93002e-06 [shard_inline]: 9.17001e-06 [merge_send_recv]: 6.83998e-06 [auto_parallel]: 7.26001e-06 [parallel]: 5.12999e-06 [flash_sp]: 3.30998e-06 [merge_comm]: 4.82e-06 [allreduce_fusion]: 5.23002e-06 [matmul_add_comm_reduction]: 7.68999e-06 [allreduce_slice_to_reducescatter]: 4.39992e-07 [virtual_shard_identity]: 1.005e-05 [virtual_dataset]: 8.95999e-06 [get_grad_eliminate_]: 9.42999e-06 [virtual_output]: 9.30001e-06 [merge_forward]: 4.33001e-06 [cell_reuse_recompute_pass]: 1.41998e-06 [offload_activation]: 8.57998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.721e-05 [merge_recompute_call_nodes]: 1.11002e-06 [before_grad]: 1.463e-05 [set_forward_comm_id_for_comm_node_pass]: 4.80999e-06 [meta_fg_expand]: 3.63999e-06 [flash_sp_send_recv_attached]: 8.09989e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 1.42e-05 [a_after_grad]: 1.405e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.47001e-06 [auto_monad_grad]: 1.02998e-06 [auto_monad_eliminator]: 9.99999e-06 [cse]: 2.523e-05 [a_3]: 6.023e-05 [py_interpret_to_execute_after_opt_a]: 1.265e-05 [slice_cell_reuse_recomputed_activation]: 2.48e-06 [rewriter_after_opt_a]: 4.519e-05 [convert_after_rewriter]: 8.70999e-06 [order_py_execute_after_rewriter]: 6.59999e-06 [mutable_eliminate]: 0.00048687 [opt_b]: 0.00033055, [1] [Cycle 1]: 0.00032474, [7] [b_1]: 0.00020657 [b_2]: 1.251e-05 [updatestate_depend_eliminate]: 7.21001e-06 [updatestate_assign_eliminate]: 3.46999e-06 [updatestate_loads_eliminate]: 4.33999e-06 [renormalize]: 4.39992e-07 [cse]: 3.08e-05 [optimize_parallel_all_gather_comm]: 2.038e-05 [overlap_param_gather]: 2.21998e-06 [cconv]: 2.631e-05 [loop_unroll]: 0.00042938 [opt_after_cconv]: 0.00013418, [1] [Cycle 1]: 0.00012878, [7] [c_1]: 4.818e-05 [parameter_eliminate]: 3.07002e-06 [updatestate_depend_eliminate]: 6.35002e-06 [updatestate_assign_eliminate]: 3.98001e-06 [updatestate_loads_eliminate]: 3.41001e-06 [cse]: 2.817e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 3.877e-05 [tuple_transform]: 0.00010175, [1] [Cycle 1]: 9.706e-05, [4] [d_1]: 6.665e-05 [none_parameter_eliminate]: 1.81e-06 [renormalize]: 3.30008e-07 [switch_simplify]: 9.82001e-06 [partial_unused_args_eliminate]: 1.91e-06 [add_recomputation]: 5.975e-05 [cse_after_recomputation]: 2.791e-05, [1] [Cycle 1]: 2.338e-05, [1] [cse]: 1.807e-05 [environ_conv]: 6.36e-06 [swap_dp_allreduce_reducescatter]: 7.1e-06 [bias_add_comm_swap]: 2.52001e-06 [label_micro_interleaved_index]: 4.03999e-06 [label_fine_grained_interleaved_index]: 2.56e-06 [merge_cast_opt]: 1.30001e-06 [slice_recompute_activation]: 2.21e-06 [micro_interleaved_order_control]: 2.59999e-06 [assign_add_opt]: 1.21997e-06 [ForceFp32Comm]: 8.09989e-07 [remove_cast_before_assign_add]: 9.89996e-07 [full_micro_interleaved_order_control]: 2.19999e-06 [reorder_send_recv_between_fp_bp]: 2.89999e-06 [comm_op_add_attrs]: 9.60019e-07 [add_comm_op_reuse_tag]: 1.10001e-06 [interleave_split_concat_branches]: 1.49e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 1.17e-06 [overlap_opt_shard_grad_in_pipeline]: 2.14999e-06 [control_data_broadcast_order]: 1.656e-05 [grouped_pairwise_exchange_alltoall]: 1.45001e-06 [offloading_packed_experts]: 4.82e-06 [overlap_recompute_and_grad_model_parallel]: 5.56998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30999e-06 [overlap_recompute_comm]: 2.19999e-06 [overlap_grad_ring_attention]: 5.14998e-06 [overlap_grad_flash_sp]: 2.356e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.23998e-06 [split_layernorm_comm]: 1.77999e-06 [handle_group_info]: 9.10019e-07 [symbol_engine_optimizer]: 9.026e-05, [1] [Cycle 1]: 8.543e-05, [6] [build]: 3.29001e-06 [elim_shapecalc]: 1.277e-05 [elim_not_effective]: 1.731e-05 [opt_reshape]: 9.79e-06 [fold_const_symbol]: 1.474e-05 [renormalize]: 2.30008e-07 [detach_backward]: 2.17999e-06 [pipeline_parallel_scheduler]: 1.74e-06 [auto_monad_reorder]: 2.123e-05 [get_jit_bprop_graph]: 1.20999e-06 [rewriter_after_jit_bprop_graph]: 4.23001e-06 [opt_after_jit_grad]: 0.00047098 [validate]: 4.198e-05 Sums bootstrap : 0.000437s : 3.70% type_inference : 0.005757s : 48.83% event_method : 0.000021s : 0.18% auto_monad : 0.000067s : 0.57% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.24% optimize.rewriter_before_opt_a : 0.000094s : 0.80% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000059s : 0.50% optimize.opt_a.loop_unroll : 0.000046s : 0.39% optimize.opt_a.a_1 : 0.001149s : 9.74% optimize.opt_a.with_stream_mark : 0.000029s : 0.25% optimize.opt_a.recompute_prepare : 0.000021s : 0.18% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.07% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000251s : 2.13% optimize.opt_a.accelerated_algorithm : 0.000021s : 0.18% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000019s : 0.16% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.09% optimize.opt_a.allreduce_fusion : 0.000010s : 0.09% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.18% optimize.opt_a.virtual_dataset : 0.000019s : 0.16% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.16% optimize.opt_a.virtual_output : 0.000019s : 0.16% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000031s : 0.26% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.09% optimize.opt_a.meta_fg_expand : 0.000008s : 0.07% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000030s : 0.25% optimize.opt_a.a_after_grad : 0.000029s : 0.25% optimize.opt_a.renormalize : 0.000746s : 6.33% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.24% optimize.opt_a.cse : 0.000074s : 0.63% optimize.opt_a.a_3 : 0.000130s : 1.10% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.11% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000045s : 0.38% optimize.convert_after_rewriter : 0.000009s : 0.07% optimize.order_py_execute_after_rewriter : 0.000007s : 0.06% optimize.mutable_eliminate : 0.000487s : 4.13% optimize.opt_b.b_1 : 0.000207s : 1.75% optimize.opt_b.b_2 : 0.000013s : 0.11% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.04% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000031s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.17% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000026s : 0.22% optimize.loop_unroll : 0.000429s : 3.64% optimize.opt_after_cconv.c_1 : 0.000048s : 0.41% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000028s : 0.24% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000039s : 0.33% optimize.tuple_transform.d_1 : 0.000067s : 0.57% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000060s : 0.51% optimize.cse_after_recomputation.cse : 0.000018s : 0.15% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.06% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000017s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.05% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000024s : 0.20% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.15% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.13% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000471s : 4.00% validate : 0.000042s : 0.36% Time group info: ------[substitution.] 0.000262 54 10.83% : 0.000028s : 6: substitution.cast_eliminate 0.93% : 0.000002s : 4: substitution.elim_not_effective 0.76% : 0.000002s : 4: substitution.fold_const_symbol 3.15% : 0.000008s : 7: substitution.graph_param_transform 68.22% : 0.000178s : 4: substitution.inline 2.08% : 0.000005s : 8: substitution.j_node_and_user_rematch 2.76% : 0.000007s : 8: substitution.remove_not_recompute_node 2.02% : 0.000005s : 6: substitution.replace_old_param 7.08% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.17% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005698 2 86.63% : 0.004937s : 1: type_inference.infer 13.37% : 0.000762s : 1: type_inference.specialize ------[replace.] 0.000071 10 52.44% : 0.000037s : 4: replace.inline 47.56% : 0.000034s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000192 10 91.84% : 0.000176s : 4: match.inline 8.16% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000320 2134 0.92% : 0.000003s : 22: predicate.accumulaten_eliminater 0.58% : 0.000002s : 7: predicate.ad_related_special_op_eliminate 0.58% : 0.000002s : 14: predicate.addn_check_dump 0.92% : 0.000003s : 22: predicate.addn_zero_filter 0.87% : 0.000003s : 22: predicate.adjust_all_reduce_mul_add 1.99% : 0.000006s : 36: predicate.arithmetic_simplify 1.08% : 0.000003s : 22: predicate.cast_eliminate 0.56% : 0.000002s : 14: predicate.check_bprop_eliminate 0.61% : 0.000002s : 14: predicate.compare_switch_simplify 0.21% : 0.000001s : 7: predicate.const_output_eliminate 0.61% : 0.000002s : 14: predicate.depend_value_elim 0.98% : 0.000003s : 22: predicate.dict_get_item_const_eliminator 1.08% : 0.000003s : 22: predicate.dict_get_item_eliminator 0.91% : 0.000003s : 22: predicate.dict_set_item_eliminator 0.85% : 0.000003s : 14: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 7: predicate.elim_not_effective 0.36% : 0.000001s : 7: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000004s : 29: predicate.environ_add_const_eliminate 1.18% : 0.000004s : 29: predicate.environ_get_add_eliminate 1.17% : 0.000004s : 29: predicate.environ_get_depend_swap 1.81% : 0.000006s : 43: predicate.environ_get_eliminate 1.17% : 0.000004s : 29: predicate.environ_get_set_eliminate 1.41% : 0.000005s : 32: predicate.exchange_switch_depend_value 2.23% : 0.000007s : 32: predicate.float_depend_g_call 0.57% : 0.000002s : 14: predicate.float_environ_get_switch 0.85% : 0.000003s : 21: predicate.float_tuple_getitem_switch 0.21% : 0.000001s : 7: predicate.fold_const_symbol 0.68% : 0.000002s : 14: predicate.get_grad_eliminate 0.22% : 0.000001s : 7: predicate.graph_param_transform 0.62% : 0.000002s : 14: predicate.incorporate_call 0.56% : 0.000002s : 14: predicate.incorporate_call_switch 6.33% : 0.000020s : 96: predicate.inline 0.86% : 0.000003s : 14: predicate.inline_without_move 0.35% : 0.000001s : 14: predicate.j_node_and_user_rematch 0.86% : 0.000003s : 14: predicate.less_batch_normalization 1.90% : 0.000006s : 42: predicate.list_to_tuple_eliminator_ 2.68% : 0.000009s : 64: predicate.load_eliminater 0.66% : 0.000002s : 7: predicate.loop_unroll_after_grad 1.92% : 0.000006s : 44: predicate.loop_unroll_before_grad 1.60% : 0.000005s : 36: predicate.make_slice_get_slice_eliminator 0.63% : 0.000002s : 14: predicate.merge_addn 0.57% : 0.000002s : 14: predicate.micro_step_allgather_replace 0.62% : 0.000002s : 14: predicate.mini_step_allgather_replace 0.94% : 0.000003s : 22: predicate.minmaximum_grad 0.69% : 0.000002s : 7: predicate.mutable_eliminate 0.36% : 0.000001s : 7: predicate.opt_reshape 0.45% : 0.000001s : 7: predicate.parallel_virtual_node 1.62% : 0.000005s : 32: predicate.partial_defer_inline 1.87% : 0.000006s : 35: predicate.partial_eliminate 0.94% : 0.000003s : 22: predicate.print_const_string_wrapper 0.59% : 0.000002s : 14: predicate.reduce_all_const_elim 1.19% : 0.000004s : 22: predicate.reduce_eliminate 2.67% : 0.000009s : 64: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 14: predicate.remove_not_recompute_node 1.42% : 0.000005s : 42: predicate.replace_applicator 0.50% : 0.000002s : 14: predicate.replace_old_param 0.25% : 0.000001s : 7: predicate.reset_defer_inline 0.98% : 0.000003s : 22: predicate.reshape_eliminate 0.65% : 0.000002s : 14: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 7: predicate.row_tensor_eliminate 0.75% : 0.000002s : 14: predicate.same_eliminate 0.43% : 0.000001s : 14: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 14: predicate.shard_identity_eliminate 0.67% : 0.000002s : 14: predicate.special_op_eliminate 0.88% : 0.000003s : 14: predicate.specialize_transform 0.82% : 0.000003s : 14: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 14: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 7: predicate.switch_call_monad_eliminater 1.46% : 0.000005s : 32: predicate.switch_defer_inline 1.99% : 0.000006s : 46: predicate.switch_layer_defer_inline 4.61% : 0.000015s : 97: predicate.switch_simplify 0.95% : 0.000003s : 22: predicate.tile_eliminate 0.98% : 0.000003s : 22: predicate.transpose_eliminate 1.54% : 0.000005s : 36: predicate.tuple_list_convert_item_index_to_positive 1.74% : 0.000006s : 36: predicate.tuple_list_get_item_const_eliminator 1.60% : 0.000005s : 36: predicate.tuple_list_get_item_depend_reorder 3.29% : 0.000011s : 56: predicate.tuple_list_get_item_eliminator 1.56% : 0.000005s : 36: predicate.tuple_list_get_set_item_eliminator 2.41% : 0.000008s : 50: predicate.tuple_list_set_item_eliminator 1.84% : 0.000006s : 42: predicate.tuple_to_list_eliminator_ 2.60% : 0.000008s : 64: predicate.updatestate_pure_node_eliminater 3.40% : 0.000011s : 78: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 7: predicate.value_based_eliminate 0.65% : 0.000002s : 14: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 14: predicate.virtual_output_eliminate 0.33% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 7: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000546 11 51.96% : 0.000284s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.04% : 0.000262s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027323 192 0.01% : 0.000004s : 1: ForceFp32Comm 10.96% : 0.002995s : 1: add_attr 10.93% : 0.002986s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000063s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.27% : 0.000073s : 1: auto_monad 0.09% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.70% : 0.000463s : 1: bootstrap 0.11% : 0.000030s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.11% : 0.000031s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.10% : 0.000027s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000005s : 1: interleave_split_concat_branches 0.02% : 0.000005s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.60% : 0.000437s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.81% : 0.000495s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 6.59% : 0.001802s : 78: opt.transform.opt_a 0.17% : 0.000047s : 1: opt.transform.opt_after_cconv 0.13% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.69% : 0.000187s : 28: opt.transform.opt_b 0.27% : 0.000074s : 2: opt.transform.opt_trans_graph 0.19% : 0.000051s : 4: opt.transform.symbol_engine_opt 12.68% : 0.003464s : 1: opt_a 0.50% : 0.000138s : 1: opt_after_cconv 1.76% : 0.000480s : 1: opt_after_jit_grad 1.22% : 0.000334s : 1: opt_b 21.01% : 0.005742s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000010s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.14% : 0.000038s : 1: pre_auto_parallel 0.12% : 0.000032s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.16% : 0.000043s : 1: remove_dup_value 1.54% : 0.000421s : 1: renormalize.infer 1.16% : 0.000317s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000049s : 1: rewriter_after_opt_a 0.36% : 0.000098s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.34% : 0.000093s : 1: symbol_engine_optimizer 0.38% : 0.000105s : 1: tuple_transform 21.12% : 0.005771s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:18.973.60 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:18.976.58 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0168701, [21] [bootstrap]: 0.00045485 [type_inference]: 0.00589801 [event_method]: 2.056e-05 [auto_monad]: 6.393e-05 [graph_reusing]: 5.94e-06 [inline]: 2.14e-06 [add_attr]: 0.00309959, [1] [add_attr_with_inline]: 0.00309095, [1] [Cycle 1]: 7.302e-05, [2] [tag_attr]: 2.078e-05 [meta_addattr_fg_expand]: 6.59001e-06 [parallel-infer-symbol]: 3.58e-06 [pre_auto_parallel]: 3.528e-05 [insert-virtual-dataset]: 2.54001e-06 [parallel-infer-symbol-second]: 8.10018e-07 [dataset_repeat_opt]: 1.92001e-06 [pipeline_split]: 1.61998e-06 [optimize]: 0.00607177, [53] [py_interpret_to_execute]: 3.404e-05 [rewriter_before_opt_a]: 9.688e-05 [opt_a]: 0.00364979, [2] [Cycle 1]: 0.00260924, [45] [expand_dump_flag]: 3.38999e-06 [switch_simplify]: 4.668e-05 [loop_unroll]: 3.591e-05 [a_1]: 0.00088659 [with_stream_mark]: 1.666e-05 [recompute_prepare]: 1.23e-05 [updatestate_depend_eliminate]: 5.03002e-06 [updatestate_assign_eliminate]: 4.2e-06 [updatestate_loads_eliminate]: 3.63e-06 [parameter_eliminate]: 1.78002e-06 [a_2]: 0.00014036 [accelerated_algorithm]: 9.24e-06 [shard]: 1.66e-06 [meta_shard_fg_expand]: 2.01e-06 [shard_inline]: 8.38999e-06 [merge_send_recv]: 9.96e-06 [auto_parallel]: 7.61999e-06 [parallel]: 1.806e-05 [flash_sp]: 8.65001e-06 [merge_comm]: 5.93002e-06 [allreduce_fusion]: 4.50001e-06 [matmul_add_comm_reduction]: 1.079e-05 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 1.038e-05 [virtual_dataset]: 9.19e-06 [get_grad_eliminate_]: 8.42998e-06 [virtual_output]: 8.3e-06 [merge_forward]: 4.55001e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 1.147e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.906e-05 [merge_recompute_call_nodes]: 1.67999e-06 [before_grad]: 1.381e-05 [set_forward_comm_id_for_comm_node_pass]: 5.09e-06 [meta_fg_expand]: 3.46001e-06 [flash_sp_send_recv_attached]: 2.51e-06 [receive_attached]: 2.14e-06 [after_resolve]: 1.471e-05 [a_after_grad]: 1.355e-05 [renormalize]: 0.00071233 [add_forward_monad_depend]: 5.71e-06 [auto_monad_grad]: 2.27999e-06 [auto_monad_eliminator]: 1.688e-05 [cse]: 3.462e-05 [a_3]: 7.661e-05 [Cycle 2]: 0.00102754, [45] [expand_dump_flag]: 1.07998e-06 [switch_simplify]: 1.159e-05 [loop_unroll]: 9.57999e-06 [a_1]: 0.00020438 [with_stream_mark]: 1.209e-05 [recompute_prepare]: 8.52998e-06 [updatestate_depend_eliminate]: 3.91999e-06 [updatestate_assign_eliminate]: 3.17002e-06 [updatestate_loads_eliminate]: 3.26001e-06 [parameter_eliminate]: 1.46002e-06 [a_2]: 0.0001341 [accelerated_algorithm]: 8.54e-06 [shard]: 1.19e-06 [meta_shard_fg_expand]: 1.86003e-06 [shard_inline]: 1.198e-05 [merge_send_recv]: 7.06001e-06 [auto_parallel]: 7.55998e-06 [parallel]: 6.24999e-06 [flash_sp]: 3.41001e-06 [merge_comm]: 4.28999e-06 [allreduce_fusion]: 3.98001e-06 [matmul_add_comm_reduction]: 7.31001e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 9.57001e-06 [virtual_dataset]: 8.29002e-06 [get_grad_eliminate_]: 7.77002e-06 [virtual_output]: 7.84997e-06 [merge_forward]: 3.83001e-06 [cell_reuse_recompute_pass]: 1.48002e-06 [offload_activation]: 8.03001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.755e-05 [merge_recompute_call_nodes]: 7.29982e-07 [before_grad]: 1.287e-05 [set_forward_comm_id_for_comm_node_pass]: 4.39002e-06 [meta_fg_expand]: 2.77002e-06 [flash_sp_send_recv_attached]: 8.39995e-07 [receive_attached]: 1.05001e-06 [after_resolve]: 1.397e-05 [a_after_grad]: 1.255e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.69e-06 [auto_monad_grad]: 1.12e-06 [auto_monad_eliminator]: 9.03002e-06 [cse]: 1.857e-05 [a_3]: 6.391e-05 [py_interpret_to_execute_after_opt_a]: 1.401e-05 [slice_cell_reuse_recomputed_activation]: 4.60999e-06 [rewriter_after_opt_a]: 4.614e-05 [convert_after_rewriter]: 1.076e-05 [order_py_execute_after_rewriter]: 8.80999e-06 [mutable_eliminate]: 0.00051293 [opt_b]: 0.00033776, [1] [Cycle 1]: 0.00032858, [7] [b_1]: 0.00022402 [b_2]: 1.037e-05 [updatestate_depend_eliminate]: 7.47002e-06 [updatestate_assign_eliminate]: 3.83001e-06 [updatestate_loads_eliminate]: 3.06999e-06 [renormalize]: 5.59987e-07 [cse]: 2.283e-05 [optimize_parallel_all_gather_comm]: 2.187e-05 [overlap_param_gather]: 4.38001e-06 [cconv]: 2.901e-05 [loop_unroll]: 0.00043656 [opt_after_cconv]: 0.00014665, [1] [Cycle 1]: 0.0001382, [7] [c_1]: 4.277e-05 [parameter_eliminate]: 2.99999e-06 [updatestate_depend_eliminate]: 6.97002e-06 [updatestate_assign_eliminate]: 3.50998e-06 [updatestate_loads_eliminate]: 3.06999e-06 [cse]: 2.259e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 1.82e-05 [tuple_transform]: 0.00010706, [1] [Cycle 1]: 9.986e-05, [4] [d_1]: 5.854e-05 [none_parameter_eliminate]: 1.52999e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 9.09998e-06 [partial_unused_args_eliminate]: 4.40999e-06 [add_recomputation]: 5.926e-05 [cse_after_recomputation]: 3.069e-05, [1] [Cycle 1]: 2.402e-05, [1] [cse]: 1.507e-05 [environ_conv]: 9.64e-06 [swap_dp_allreduce_reducescatter]: 9.27999e-06 [bias_add_comm_swap]: 5.17e-06 [label_micro_interleaved_index]: 6.64999e-06 [label_fine_grained_interleaved_index]: 5.07e-06 [merge_cast_opt]: 3.68999e-06 [slice_recompute_activation]: 4.57e-06 [micro_interleaved_order_control]: 4.79002e-06 [assign_add_opt]: 3.55998e-06 [ForceFp32Comm]: 3.18e-06 [remove_cast_before_assign_add]: 3.28e-06 [full_micro_interleaved_order_control]: 4.33001e-06 [reorder_send_recv_between_fp_bp]: 5.28002e-06 [comm_op_add_attrs]: 3.3e-06 [add_comm_op_reuse_tag]: 3.13998e-06 [interleave_split_concat_branches]: 3.53e-06 [interleave_parallel_branches]: 3.41999e-06 [overlap_opt_shard_in_pipeline]: 3.58999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.44002e-06 [control_data_broadcast_order]: 1.811e-05 [grouped_pairwise_exchange_alltoall]: 4.48999e-06 [offloading_packed_experts]: 6.91001e-06 [overlap_recompute_and_grad_model_parallel]: 7.78999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.6e-06 [overlap_recompute_allgather_and_fa_grad]: 3.93001e-06 [overlap_recompute_comm]: 5.04998e-06 [overlap_grad_ring_attention]: 7.20998e-06 [overlap_grad_flash_sp]: 2.436e-05 [begin_end_overlap_inline]: 3.09001e-06 [split_matmul_comm_elemetwise]: 4.4e-06 [split_layernorm_comm]: 4.18001e-06 [handle_group_info]: 3.20998e-06 [symbol_engine_optimizer]: 0.00010585, [1] [Cycle 1]: 9.927e-05, [6] [build]: 3.58e-06 [elim_shapecalc]: 1.139e-05 [elim_not_effective]: 1.591e-05 [opt_reshape]: 9.04e-06 [fold_const_symbol]: 1.319e-05 [renormalize]: 2.79979e-07 [detach_backward]: 3.93001e-06 [pipeline_parallel_scheduler]: 1.60999e-06 [auto_monad_reorder]: 2.411e-05 [get_jit_bprop_graph]: 1.86998e-06 [rewriter_after_jit_bprop_graph]: 4.44002e-06 [opt_after_jit_grad]: 0.00050475 [validate]: 6.768e-05 Sums bootstrap : 0.000455s : 3.79% type_inference : 0.005898s : 49.17% event_method : 0.000021s : 0.17% auto_monad : 0.000064s : 0.53% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000035s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000034s : 0.28% optimize.rewriter_before_opt_a : 0.000097s : 0.81% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000058s : 0.49% optimize.opt_a.loop_unroll : 0.000045s : 0.38% optimize.opt_a.a_1 : 0.001091s : 9.10% optimize.opt_a.with_stream_mark : 0.000029s : 0.24% optimize.opt_a.recompute_prepare : 0.000021s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000274s : 2.29% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.15% optimize.opt_a.shard : 0.000003s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000020s : 0.17% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000015s : 0.13% optimize.opt_a.parallel : 0.000024s : 0.20% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.09% optimize.opt_a.allreduce_fusion : 0.000008s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.17% optimize.opt_a.virtual_dataset : 0.000017s : 0.15% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.14% optimize.opt_a.virtual_output : 0.000016s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000027s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.24% optimize.opt_a.a_after_grad : 0.000026s : 0.22% optimize.opt_a.renormalize : 0.000712s : 5.94% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.22% optimize.opt_a.cse : 0.000053s : 0.44% optimize.opt_a.a_3 : 0.000141s : 1.17% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000046s : 0.38% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000513s : 4.28% optimize.opt_b.b_1 : 0.000224s : 1.87% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000023s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.18% optimize.overlap_param_gather : 0.000004s : 0.04% optimize.cconv : 0.000029s : 0.24% optimize.loop_unroll : 0.000437s : 3.64% optimize.opt_after_cconv.c_1 : 0.000043s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.15% optimize.tuple_transform.d_1 : 0.000059s : 0.49% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000059s : 0.49% optimize.cse_after_recomputation.cse : 0.000015s : 0.13% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000018s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.04% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000024s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.20% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000505s : 4.21% validate : 0.000068s : 0.56% Time group info: ------[substitution.] 0.000225 44 10.15% : 0.000023s : 3: substitution.cast_eliminate 1.04% : 0.000002s : 3: substitution.elim_not_effective 0.74% : 0.000002s : 3: substitution.fold_const_symbol 3.21% : 0.000007s : 6: substitution.graph_param_transform 66.39% : 0.000149s : 4: substitution.inline 2.00% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.73% : 0.000006s : 6: substitution.remove_not_recompute_node 2.55% : 0.000006s : 6: substitution.replace_old_param 8.72% : 0.000020s : 6: substitution.tuple_list_get_item_eliminator 2.47% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005847 2 87.36% : 0.005108s : 1: type_inference.infer 12.64% : 0.000739s : 1: type_inference.specialize ------[replace.] 0.000075 10 51.64% : 0.000039s : 4: replace.inline 48.36% : 0.000036s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000164 10 89.69% : 0.000147s : 4: match.inline 10.31% : 0.000017s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000294 1954 0.96% : 0.000003s : 21: predicate.accumulaten_eliminater 0.63% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.56% : 0.000002s : 12: predicate.addn_check_dump 0.92% : 0.000003s : 21: predicate.addn_zero_filter 0.92% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 1.96% : 0.000006s : 33: predicate.arithmetic_simplify 1.05% : 0.000003s : 21: predicate.cast_eliminate 0.63% : 0.000002s : 12: predicate.check_bprop_eliminate 0.55% : 0.000002s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.59% : 0.000002s : 12: predicate.depend_value_elim 1.03% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.09% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.99% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.87% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 6: predicate.elim_not_effective 0.38% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.20% : 0.000004s : 27: predicate.environ_get_add_eliminate 1.21% : 0.000004s : 27: predicate.environ_get_depend_swap 1.79% : 0.000005s : 39: predicate.environ_get_eliminate 1.19% : 0.000003s : 27: predicate.environ_get_set_eliminate 1.47% : 0.000004s : 31: predicate.exchange_switch_depend_value 2.32% : 0.000007s : 31: predicate.float_depend_g_call 0.56% : 0.000002s : 12: predicate.float_environ_get_switch 0.83% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.68% : 0.000002s : 12: predicate.get_grad_eliminate 0.19% : 0.000001s : 6: predicate.graph_param_transform 0.61% : 0.000002s : 12: predicate.incorporate_call 0.51% : 0.000001s : 12: predicate.incorporate_call_switch 6.19% : 0.000018s : 88: predicate.inline 0.76% : 0.000002s : 12: predicate.inline_without_move 0.43% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.71% : 0.000002s : 12: predicate.less_batch_normalization 1.87% : 0.000006s : 39: predicate.list_to_tuple_eliminator_ 2.77% : 0.000008s : 60: predicate.load_eliminater 0.76% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.04% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.64% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.62% : 0.000002s : 12: predicate.merge_addn 0.59% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.89% : 0.000003s : 21: predicate.minmaximum_grad 0.91% : 0.000003s : 6: predicate.mutable_eliminate 0.32% : 0.000001s : 6: predicate.opt_reshape 0.37% : 0.000001s : 6: predicate.parallel_virtual_node 1.73% : 0.000005s : 31: predicate.partial_defer_inline 1.81% : 0.000005s : 33: predicate.partial_eliminate 0.97% : 0.000003s : 21: predicate.print_const_string_wrapper 0.56% : 0.000002s : 12: predicate.reduce_all_const_elim 1.22% : 0.000004s : 21: predicate.reduce_eliminate 2.69% : 0.000008s : 60: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 12: predicate.remove_not_recompute_node 1.45% : 0.000004s : 39: predicate.replace_applicator 0.48% : 0.000001s : 12: predicate.replace_old_param 0.24% : 0.000001s : 6: predicate.reset_defer_inline 0.94% : 0.000003s : 21: predicate.reshape_eliminate 0.64% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.31% : 0.000001s : 6: predicate.row_tensor_eliminate 0.78% : 0.000002s : 12: predicate.same_eliminate 0.42% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.73% : 0.000002s : 12: predicate.shard_identity_eliminate 0.61% : 0.000002s : 12: predicate.special_op_eliminate 0.74% : 0.000002s : 12: predicate.specialize_transform 0.79% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.56% : 0.000005s : 31: predicate.switch_defer_inline 2.12% : 0.000006s : 43: predicate.switch_layer_defer_inline 4.84% : 0.000014s : 91: predicate.switch_simplify 0.94% : 0.000003s : 21: predicate.tile_eliminate 0.96% : 0.000003s : 21: predicate.transpose_eliminate 1.56% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.54% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 3.11% : 0.000009s : 51: predicate.tuple_list_get_item_eliminator 1.60% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.35% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.84% : 0.000005s : 39: predicate.tuple_to_list_eliminator_ 2.69% : 0.000008s : 60: predicate.updatestate_pure_node_eliminater 3.33% : 0.000010s : 72: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 6: predicate.value_based_eliminate 0.63% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 12: predicate.virtual_output_eliminate 0.27% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000572 11 56.09% : 0.000321s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.91% : 0.000251s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028659 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.84% : 0.003108s : 1: add_attr 10.80% : 0.003095s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000063s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.26% : 0.000074s : 1: auto_monad 0.11% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.74% : 0.000499s : 1: bootstrap 0.11% : 0.000032s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000034s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.07% : 0.000019s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.10% : 0.000030s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.54% : 0.000442s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.81% : 0.000519s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 5.87% : 0.001682s : 78: opt.transform.opt_a 0.15% : 0.000042s : 1: opt.transform.opt_after_cconv 0.11% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.56% : 0.000162s : 28: opt.transform.opt_b 0.23% : 0.000065s : 2: opt.transform.opt_trans_graph 0.16% : 0.000046s : 4: opt.transform.symbol_engine_opt 12.75% : 0.003653s : 1: opt_a 0.52% : 0.000150s : 1: opt_after_cconv 1.80% : 0.000515s : 1: opt_after_jit_grad 1.19% : 0.000341s : 1: opt_b 22.43% : 0.006429s : 1: optimize 0.09% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.10% : 0.000027s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000007s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.15% : 0.000043s : 1: pre_auto_parallel 0.13% : 0.000038s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000021s : 1: remove_dup_value 1.38% : 0.000396s : 1: renormalize.infer 1.07% : 0.000308s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000050s : 1: rewriter_after_opt_a 0.35% : 0.000101s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000109s : 1: symbol_engine_optimizer 0.38% : 0.000110s : 1: tuple_transform 20.71% : 0.005934s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:18.291.954 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0155692, [21] [bootstrap]: 0.00043636 [type_inference]: 0.00596874 [event_method]: 2.076e-05 [auto_monad]: 6.578e-05 [graph_reusing]: 5.92999e-06 [inline]: 2.12999e-06 [add_attr]: 0.00304452, [1] [add_attr_with_inline]: 0.00303661, [1] [Cycle 1]: 5.544e-05, [2] [tag_attr]: 1.988e-05 [meta_addattr_fg_expand]: 6.80998e-06 [parallel-infer-symbol]: 3.4e-06 [pre_auto_parallel]: 3.402e-05 [insert-virtual-dataset]: 2.99001e-06 [parallel-infer-symbol-second]: 8.70001e-07 [dataset_repeat_opt]: 2.24999e-06 [pipeline_split]: 1.69e-06 [optimize]: 0.00530618, [53] [py_interpret_to_execute]: 2.763e-05 [rewriter_before_opt_a]: 9.177e-05 [opt_a]: 0.00320279, [2] [Cycle 1]: 0.00234721, [45] [expand_dump_flag]: 3.33e-06 [switch_simplify]: 4.759e-05 [loop_unroll]: 3.506e-05 [a_1]: 0.00086389 [with_stream_mark]: 1.413e-05 [recompute_prepare]: 1.08e-05 [updatestate_depend_eliminate]: 5.29e-06 [updatestate_assign_eliminate]: 4.25e-06 [updatestate_loads_eliminate]: 3.61999e-06 [parameter_eliminate]: 2.21998e-06 [a_2]: 0.00011126 [accelerated_algorithm]: 8.83001e-06 [shard]: 2.24999e-06 [meta_shard_fg_expand]: 2.01e-06 [shard_inline]: 8.83001e-06 [merge_send_recv]: 8.99e-06 [auto_parallel]: 7.54002e-06 [parallel]: 1.817e-05 [flash_sp]: 8.37e-06 [merge_comm]: 4.99e-06 [allreduce_fusion]: 4.31002e-06 [matmul_add_comm_reduction]: 9.89001e-06 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 1.026e-05 [virtual_dataset]: 8.53001e-06 [get_grad_eliminate_]: 8.38001e-06 [virtual_output]: 8.74e-06 [merge_forward]: 4.28999e-06 [cell_reuse_recompute_pass]: 1.19e-06 [offload_activation]: 9.99999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.553e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.311e-05 [set_forward_comm_id_for_comm_node_pass]: 4.67e-06 [meta_fg_expand]: 3.76999e-06 [flash_sp_send_recv_attached]: 3.14999e-06 [receive_attached]: 2.27001e-06 [after_resolve]: 1.563e-05 [a_after_grad]: 1.329e-05 [renormalize]: 0.00067088 [add_forward_monad_depend]: 5.11002e-06 [auto_monad_grad]: 1.71e-06 [auto_monad_eliminator]: 1.523e-05 [cse]: 3.607e-05 [a_3]: 6.332e-05 [Cycle 2]: 0.00084591, [45] [expand_dump_flag]: 1.22e-06 [switch_simplify]: 9.93002e-06 [loop_unroll]: 8.15e-06 [a_1]: 0.0002026 [with_stream_mark]: 1.171e-05 [recompute_prepare]: 8.54002e-06 [updatestate_depend_eliminate]: 4.00998e-06 [updatestate_assign_eliminate]: 2.99001e-06 [updatestate_loads_eliminate]: 2.91e-06 [parameter_eliminate]: 1.21002e-06 [a_2]: 0.00013236 [accelerated_algorithm]: 8.71997e-06 [shard]: 1.51998e-06 [meta_shard_fg_expand]: 1.62001e-06 [shard_inline]: 8.10999e-06 [merge_send_recv]: 6.09001e-06 [auto_parallel]: 6.69999e-06 [parallel]: 5.34e-06 [flash_sp]: 3.75e-06 [merge_comm]: 3.99002e-06 [allreduce_fusion]: 4.25e-06 [matmul_add_comm_reduction]: 6.93e-06 [allreduce_slice_to_reducescatter]: 5.60016e-07 [virtual_shard_identity]: 8.69e-06 [virtual_dataset]: 8.20999e-06 [get_grad_eliminate_]: 8.70999e-06 [virtual_output]: 7.96001e-06 [merge_forward]: 3.95998e-06 [cell_reuse_recompute_pass]: 1.49998e-06 [offload_activation]: 7.68999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.538e-05 [merge_recompute_call_nodes]: 9.09989e-07 [before_grad]: 1.225e-05 [set_forward_comm_id_for_comm_node_pass]: 4.07998e-06 [meta_fg_expand]: 2.95002e-06 [flash_sp_send_recv_attached]: 8.30012e-07 [receive_attached]: 1.00999e-06 [after_resolve]: 1.321e-05 [a_after_grad]: 1.254e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.34998e-06 [auto_monad_grad]: 1.05001e-06 [auto_monad_eliminator]: 8.67998e-06 [cse]: 1.903e-05 [a_3]: 5.099e-05 [py_interpret_to_execute_after_opt_a]: 9.76e-06 [slice_cell_reuse_recomputed_activation]: 1.89e-06 [rewriter_after_opt_a]: 3.934e-05 [convert_after_rewriter]: 7.94002e-06 [order_py_execute_after_rewriter]: 5.83997e-06 [mutable_eliminate]: 0.00047695 [opt_b]: 0.00026929, [1] [Cycle 1]: 0.00026329, [7] [b_1]: 0.00018011 [b_2]: 1.009e-05 [updatestate_depend_eliminate]: 6.24001e-06 [updatestate_assign_eliminate]: 3.21001e-06 [updatestate_loads_eliminate]: 3.43e-06 [renormalize]: 6.50005e-07 [cse]: 2.296e-05 [optimize_parallel_all_gather_comm]: 1.676e-05 [overlap_param_gather]: 2.01e-06 [cconv]: 2.401e-05 [loop_unroll]: 0.00042185 [opt_after_cconv]: 0.00012065, [1] [Cycle 1]: 0.00011521, [7] [c_1]: 4.264e-05 [parameter_eliminate]: 2.60997e-06 [updatestate_depend_eliminate]: 5.81e-06 [updatestate_assign_eliminate]: 3.36001e-06 [updatestate_loads_eliminate]: 3.09999e-06 [cse]: 2.287e-05 [renormalize]: 2.50002e-07 [remove_dup_value]: 1.542e-05 [tuple_transform]: 9.148e-05, [1] [Cycle 1]: 8.727e-05, [4] [d_1]: 5.77e-05 [none_parameter_eliminate]: 1.89e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 8.79003e-06 [partial_unused_args_eliminate]: 1.95001e-06 [add_recomputation]: 5.401e-05 [cse_after_recomputation]: 2.569e-05, [1] [Cycle 1]: 2.118e-05, [1] [cse]: 1.583e-05 [environ_conv]: 6.21998e-06 [swap_dp_allreduce_reducescatter]: 6.28e-06 [bias_add_comm_swap]: 2.61e-06 [label_micro_interleaved_index]: 3.77002e-06 [label_fine_grained_interleaved_index]: 2.59001e-06 [merge_cast_opt]: 1.22999e-06 [slice_recompute_activation]: 2.43e-06 [micro_interleaved_order_control]: 2.29001e-06 [assign_add_opt]: 1.12e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 1.10999e-06 [full_micro_interleaved_order_control]: 2.60002e-06 [reorder_send_recv_between_fp_bp]: 2.81999e-06 [comm_op_add_attrs]: 1.13001e-06 [add_comm_op_reuse_tag]: 1.05001e-06 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.04003e-06 [overlap_opt_shard_in_pipeline]: 1.15001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.04999e-06 [control_data_broadcast_order]: 1.464e-05 [grouped_pairwise_exchange_alltoall]: 1.82999e-06 [offloading_packed_experts]: 4.08999e-06 [overlap_recompute_and_grad_model_parallel]: 4.90001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.13001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34998e-06 [overlap_recompute_comm]: 2.20002e-06 [overlap_grad_ring_attention]: 4.36002e-06 [overlap_grad_flash_sp]: 1.983e-05 [begin_end_overlap_inline]: 4.60015e-07 [split_matmul_comm_elemetwise]: 2.27999e-06 [split_layernorm_comm]: 1.97999e-06 [handle_group_info]: 1.22e-06 [symbol_engine_optimizer]: 8.294e-05, [1] [Cycle 1]: 7.888e-05, [6] [build]: 3.43e-06 [elim_shapecalc]: 1.155e-05 [elim_not_effective]: 1.512e-05 [opt_reshape]: 8.75999e-06 [fold_const_symbol]: 1.237e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.11e-06 [pipeline_parallel_scheduler]: 2.11998e-06 [auto_monad_reorder]: 1.956e-05 [get_jit_bprop_graph]: 1.18001e-06 [rewriter_after_jit_bprop_graph]: 4.28001e-06 [opt_after_jit_grad]: 0.00046005 [validate]: 3.994e-05 Sums bootstrap : 0.000436s : 3.77% type_inference : 0.005969s : 51.53% event_method : 0.000021s : 0.18% auto_monad : 0.000066s : 0.57% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.17% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.29% insert-virtual-dataset : 0.000003s : 0.03% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000028s : 0.24% optimize.rewriter_before_opt_a : 0.000092s : 0.79% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000058s : 0.50% optimize.opt_a.loop_unroll : 0.000043s : 0.37% optimize.opt_a.a_1 : 0.001066s : 9.21% optimize.opt_a.with_stream_mark : 0.000026s : 0.22% optimize.opt_a.recompute_prepare : 0.000019s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000244s : 2.10% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.15% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000017s : 0.15% optimize.opt_a.merge_send_recv : 0.000015s : 0.13% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000024s : 0.20% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.16% optimize.opt_a.virtual_dataset : 0.000017s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.15% optimize.opt_a.virtual_output : 0.000017s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000018s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.25% optimize.opt_a.a_after_grad : 0.000026s : 0.22% optimize.opt_a.renormalize : 0.000671s : 5.79% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.21% optimize.opt_a.cse : 0.000055s : 0.48% optimize.opt_a.a_3 : 0.000114s : 0.99% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000039s : 0.34% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000477s : 4.12% optimize.opt_b.b_1 : 0.000180s : 1.56% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000023s : 0.20% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.14% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.21% optimize.loop_unroll : 0.000422s : 3.64% optimize.opt_after_cconv.c_1 : 0.000043s : 0.37% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000023s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.13% optimize.tuple_transform.d_1 : 0.000058s : 0.50% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000054s : 0.47% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000006s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000003s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000020s : 0.17% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.17% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000460s : 3.97% validate : 0.000040s : 0.34% Time group info: ------[substitution.] 0.000215 44 10.11% : 0.000022s : 3: substitution.cast_eliminate 0.99% : 0.000002s : 3: substitution.elim_not_effective 0.74% : 0.000002s : 3: substitution.fold_const_symbol 3.20% : 0.000007s : 6: substitution.graph_param_transform 66.36% : 0.000143s : 4: substitution.inline 1.99% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.75% : 0.000006s : 6: substitution.remove_not_recompute_node 2.70% : 0.000006s : 6: substitution.replace_old_param 8.69% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.47% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005909 2 87.00% : 0.005141s : 1: type_inference.infer 13.00% : 0.000768s : 1: type_inference.specialize ------[replace.] 0.000071 10 52.29% : 0.000037s : 4: replace.inline 47.71% : 0.000034s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000156 10 89.79% : 0.000140s : 4: match.inline 10.21% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000292 1954 0.96% : 0.000003s : 21: predicate.accumulaten_eliminater 0.64% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.52% : 0.000002s : 12: predicate.addn_check_dump 0.96% : 0.000003s : 21: predicate.addn_zero_filter 0.92% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.05% : 0.000006s : 33: predicate.arithmetic_simplify 1.17% : 0.000003s : 21: predicate.cast_eliminate 0.57% : 0.000002s : 12: predicate.check_bprop_eliminate 0.51% : 0.000001s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.60% : 0.000002s : 12: predicate.depend_value_elim 1.02% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.06% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.95% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.83% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 6: predicate.elim_not_effective 0.37% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000003s : 27: predicate.environ_add_const_eliminate 1.19% : 0.000003s : 27: predicate.environ_get_add_eliminate 1.18% : 0.000003s : 27: predicate.environ_get_depend_swap 1.77% : 0.000005s : 39: predicate.environ_get_eliminate 1.18% : 0.000003s : 27: predicate.environ_get_set_eliminate 1.48% : 0.000004s : 31: predicate.exchange_switch_depend_value 2.33% : 0.000007s : 31: predicate.float_depend_g_call 0.56% : 0.000002s : 12: predicate.float_environ_get_switch 0.78% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.62% : 0.000002s : 12: predicate.get_grad_eliminate 0.21% : 0.000001s : 6: predicate.graph_param_transform 0.60% : 0.000002s : 12: predicate.incorporate_call 0.49% : 0.000001s : 12: predicate.incorporate_call_switch 6.01% : 0.000018s : 88: predicate.inline 0.83% : 0.000002s : 12: predicate.inline_without_move 0.33% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.74% : 0.000002s : 12: predicate.less_batch_normalization 1.88% : 0.000006s : 39: predicate.list_to_tuple_eliminator_ 2.71% : 0.000008s : 60: predicate.load_eliminater 0.81% : 0.000002s : 6: predicate.loop_unroll_after_grad 1.98% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.67% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.60% : 0.000002s : 12: predicate.merge_addn 0.56% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.56% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.93% : 0.000003s : 21: predicate.minmaximum_grad 0.72% : 0.000002s : 6: predicate.mutable_eliminate 0.43% : 0.000001s : 6: predicate.opt_reshape 0.32% : 0.000001s : 6: predicate.parallel_virtual_node 1.84% : 0.000005s : 31: predicate.partial_defer_inline 1.84% : 0.000005s : 33: predicate.partial_eliminate 0.98% : 0.000003s : 21: predicate.print_const_string_wrapper 0.60% : 0.000002s : 12: predicate.reduce_all_const_elim 1.22% : 0.000004s : 21: predicate.reduce_eliminate 2.67% : 0.000008s : 60: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 12: predicate.remove_not_recompute_node 1.46% : 0.000004s : 39: predicate.replace_applicator 0.53% : 0.000002s : 12: predicate.replace_old_param 0.24% : 0.000001s : 6: predicate.reset_defer_inline 1.06% : 0.000003s : 21: predicate.reshape_eliminate 0.61% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 6: predicate.row_tensor_eliminate 0.69% : 0.000002s : 12: predicate.same_eliminate 0.44% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.68% : 0.000002s : 12: predicate.shard_identity_eliminate 0.65% : 0.000002s : 12: predicate.special_op_eliminate 0.65% : 0.000002s : 12: predicate.specialize_transform 0.79% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.70% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.58% : 0.000005s : 31: predicate.switch_defer_inline 2.14% : 0.000006s : 43: predicate.switch_layer_defer_inline 4.74% : 0.000014s : 91: predicate.switch_simplify 0.99% : 0.000003s : 21: predicate.tile_eliminate 1.02% : 0.000003s : 21: predicate.transpose_eliminate 1.62% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.75% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 3.29% : 0.000010s : 51: predicate.tuple_list_get_item_eliminator 1.65% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.80% : 0.000005s : 39: predicate.tuple_to_list_eliminator_ 2.68% : 0.000008s : 60: predicate.updatestate_pure_node_eliminater 3.34% : 0.000010s : 72: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 6: predicate.value_based_eliminate 0.60% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.73% : 0.000002s : 12: predicate.virtual_output_eliminate 0.30% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000553 11 55.95% : 0.000310s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.05% : 0.000244s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026481 192 0.01% : 0.000003s : 1: ForceFp32Comm 11.51% : 0.003049s : 1: add_attr 11.48% : 0.003040s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000058s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.27% : 0.000072s : 1: auto_monad 0.09% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.75% : 0.000464s : 1: bootstrap 0.10% : 0.000028s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000009s : 1: environ_conv 0.10% : 0.000027s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000005s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.62% : 0.000430s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.83% : 0.000485s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.07% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 6.19% : 0.001640s : 78: opt.transform.opt_a 0.16% : 0.000041s : 1: opt.transform.opt_after_cconv 0.11% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.60% : 0.000160s : 28: opt.transform.opt_b 0.24% : 0.000064s : 2: opt.transform.opt_trans_graph 0.17% : 0.000044s : 4: opt.transform.symbol_engine_opt 12.11% : 0.003206s : 1: opt_a 0.47% : 0.000124s : 1: opt_after_cconv 1.77% : 0.000468s : 1: opt_after_jit_grad 1.03% : 0.000273s : 1: opt_b 20.05% : 0.005310s : 1: optimize 0.08% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000023s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000038s : 1: pre_auto_parallel 0.12% : 0.000032s : 1: py_interpret_to_execute 0.05% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.44% : 0.000380s : 1: renormalize.infer 1.07% : 0.000283s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000043s : 1: rewriter_after_opt_a 0.36% : 0.000096s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000086s : 1: symbol_engine_optimizer 0.36% : 0.000094s : 1: tuple_transform 22.60% : 0.005985s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:18.481.586 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:18.481.842 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0159795, [21] [bootstrap]: 0.00043662 [type_inference]: 0.00587161 [event_method]: 2.036e-05 [auto_monad]: 6.354e-05 [graph_reusing]: 6.44999e-06 [inline]: 2.22999e-06 [add_attr]: 0.00304209, [1] [add_attr_with_inline]: 0.00303421, [1] [Cycle 1]: 7.217e-05, [2] [tag_attr]: 2.068e-05 [meta_addattr_fg_expand]: 6.62002e-06 [parallel-infer-symbol]: 3.03e-06 [pre_auto_parallel]: 3.392e-05 [insert-virtual-dataset]: 2.31e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 2.01e-06 [pipeline_split]: 1.66998e-06 [optimize]: 0.00541275, [53] [py_interpret_to_execute]: 3.517e-05 [rewriter_before_opt_a]: 9.269e-05 [opt_a]: 0.00309662, [2] [Cycle 1]: 0.00220784, [45] [expand_dump_flag]: 2.86e-06 [switch_simplify]: 4.596e-05 [loop_unroll]: 3.423e-05 [a_1]: 0.00072041 [with_stream_mark]: 1.552e-05 [recompute_prepare]: 1.013e-05 [updatestate_depend_eliminate]: 3.81999e-06 [updatestate_assign_eliminate]: 3.44001e-06 [updatestate_loads_eliminate]: 3.22002e-06 [parameter_eliminate]: 1.96e-06 [a_2]: 0.00012101 [accelerated_algorithm]: 7.75e-06 [shard]: 1.92001e-06 [meta_shard_fg_expand]: 1.80001e-06 [shard_inline]: 6.96999e-06 [merge_send_recv]: 8.50001e-06 [auto_parallel]: 6.31998e-06 [parallel]: 1.821e-05 [flash_sp]: 8.12e-06 [merge_comm]: 4.30999e-06 [allreduce_fusion]: 3.63e-06 [matmul_add_comm_reduction]: 9.52001e-06 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 9.13002e-06 [virtual_dataset]: 7.45e-06 [get_grad_eliminate_]: 6.96999e-06 [virtual_output]: 7.13e-06 [merge_forward]: 3.81999e-06 [cell_reuse_recompute_pass]: 1.14e-06 [offload_activation]: 9.27999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.564e-05 [merge_recompute_call_nodes]: 1.40001e-06 [before_grad]: 1.145e-05 [set_forward_comm_id_for_comm_node_pass]: 4.24002e-06 [meta_fg_expand]: 2.79001e-06 [flash_sp_send_recv_attached]: 2.64999e-06 [receive_attached]: 2.34001e-06 [after_resolve]: 1.364e-05 [a_after_grad]: 1.154e-05 [renormalize]: 0.00055448 [add_forward_monad_depend]: 5.61003e-06 [auto_monad_grad]: 1.97999e-06 [auto_monad_eliminator]: 1.426e-05 [cse]: 3.227e-05 [a_3]: 6.595e-05 [Cycle 2]: 0.00087534, [45] [expand_dump_flag]: 1.12999e-06 [switch_simplify]: 9.34e-06 [loop_unroll]: 7.13e-06 [a_1]: 0.00015733 [with_stream_mark]: 1.216e-05 [recompute_prepare]: 6.87002e-06 [updatestate_depend_eliminate]: 3.09999e-06 [updatestate_assign_eliminate]: 2.89999e-06 [updatestate_loads_eliminate]: 2.34001e-06 [parameter_eliminate]: 9.70002e-07 [a_2]: 0.00010808 [accelerated_algorithm]: 7.04001e-06 [shard]: 1.46002e-06 [meta_shard_fg_expand]: 1.50999e-06 [shard_inline]: 6.82002e-06 [merge_send_recv]: 4.99998e-06 [auto_parallel]: 5.76e-06 [parallel]: 4.70999e-06 [flash_sp]: 3.27002e-06 [merge_comm]: 3.23998e-06 [allreduce_fusion]: 3.57002e-06 [matmul_add_comm_reduction]: 5.72001e-06 [allreduce_slice_to_reducescatter]: 5.40022e-07 [virtual_shard_identity]: 7.87e-06 [virtual_dataset]: 7.19001e-06 [get_grad_eliminate_]: 6.64999e-06 [virtual_output]: 6.76999e-06 [merge_forward]: 2.83998e-06 [cell_reuse_recompute_pass]: 1.44998e-06 [offload_activation]: 6.97002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.536e-05 [merge_recompute_call_nodes]: 1.07e-06 [before_grad]: 1.163e-05 [set_forward_comm_id_for_comm_node_pass]: 3.54002e-06 [meta_fg_expand]: 2.25002e-06 [flash_sp_send_recv_attached]: 8.60018e-07 [receive_attached]: 1.05999e-06 [after_resolve]: 1.248e-05 [a_after_grad]: 1.068e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.30001e-06 [auto_monad_grad]: 9.79984e-07 [auto_monad_eliminator]: 7.37002e-06 [cse]: 1.545e-05 [a_3]: 5.374e-05 [py_interpret_to_execute_after_opt_a]: 1.383e-05 [slice_cell_reuse_recomputed_activation]: 4.89998e-06 [rewriter_after_opt_a]: 3.699e-05 [convert_after_rewriter]: 1.027e-05 [order_py_execute_after_rewriter]: 8.48999e-06 [mutable_eliminate]: 0.00051959 [opt_b]: 0.00029769, [1] [Cycle 1]: 0.0002885, [7] [b_1]: 0.00019197 [b_2]: 8.54e-06 [updatestate_depend_eliminate]: 5.99e-06 [updatestate_assign_eliminate]: 2.53e-06 [updatestate_loads_eliminate]: 2.66e-06 [renormalize]: 3.80009e-07 [cse]: 1.972e-05 [optimize_parallel_all_gather_comm]: 2.008e-05 [overlap_param_gather]: 5.08002e-06 [cconv]: 2.859e-05 [loop_unroll]: 0.00043264 [opt_after_cconv]: 0.00013228, [1] [Cycle 1]: 0.00012401, [7] [c_1]: 3.633e-05 [parameter_eliminate]: 2.63e-06 [updatestate_depend_eliminate]: 5.24998e-06 [updatestate_assign_eliminate]: 2.54999e-06 [updatestate_loads_eliminate]: 2.31998e-06 [cse]: 1.9e-05 [renormalize]: 3.4002e-07 [remove_dup_value]: 1.657e-05 [tuple_transform]: 9.486e-05, [1] [Cycle 1]: 8.816e-05, [4] [d_1]: 4.906e-05 [none_parameter_eliminate]: 1.60999e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 7.5e-06 [partial_unused_args_eliminate]: 4.79e-06 [add_recomputation]: 4.895e-05 [cse_after_recomputation]: 2.72e-05, [1] [Cycle 1]: 2.047e-05, [1] [cse]: 1.169e-05 [environ_conv]: 8.01001e-06 [swap_dp_allreduce_reducescatter]: 7.65e-06 [bias_add_comm_swap]: 4.97999e-06 [label_micro_interleaved_index]: 6.49999e-06 [label_fine_grained_interleaved_index]: 4.99e-06 [merge_cast_opt]: 3.83999e-06 [slice_recompute_activation]: 4.80999e-06 [micro_interleaved_order_control]: 4.65999e-06 [assign_add_opt]: 3.76001e-06 [ForceFp32Comm]: 3.26001e-06 [remove_cast_before_assign_add]: 3.23998e-06 [full_micro_interleaved_order_control]: 4.69002e-06 [reorder_send_recv_between_fp_bp]: 5.30999e-06 [comm_op_add_attrs]: 3.35003e-06 [add_comm_op_reuse_tag]: 3.28e-06 [interleave_split_concat_branches]: 3.4e-06 [interleave_parallel_branches]: 3.46001e-06 [overlap_opt_shard_in_pipeline]: 3.53999e-06 [overlap_opt_shard_grad_in_pipeline]: 4.18001e-06 [control_data_broadcast_order]: 1.491e-05 [grouped_pairwise_exchange_alltoall]: 4.56002e-06 [offloading_packed_experts]: 6.23e-06 [overlap_recompute_and_grad_model_parallel]: 7.12002e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.73001e-06 [overlap_recompute_allgather_and_fa_grad]: 3.85e-06 [overlap_recompute_comm]: 5.22999e-06 [overlap_grad_ring_attention]: 6.74999e-06 [overlap_grad_flash_sp]: 2.244e-05 [begin_end_overlap_inline]: 3.01001e-06 [split_matmul_comm_elemetwise]: 4.30999e-06 [split_layernorm_comm]: 3.86999e-06 [handle_group_info]: 3.67998e-06 [symbol_engine_optimizer]: 9.745e-05, [1] [Cycle 1]: 9.103e-05, [6] [build]: 2.97002e-06 [elim_shapecalc]: 9.99001e-06 [elim_not_effective]: 1.323e-05 [opt_reshape]: 7.67002e-06 [fold_const_symbol]: 1.101e-05 [renormalize]: 2.00002e-07 [detach_backward]: 3.08e-06 [pipeline_parallel_scheduler]: 1.83002e-06 [auto_monad_reorder]: 1.991e-05 [get_jit_bprop_graph]: 1.56002e-06 [rewriter_after_jit_bprop_graph]: 4.45999e-06 [opt_after_jit_grad]: 0.00047809 [validate]: 3.56e-05 Sums bootstrap : 0.000437s : 3.88% type_inference : 0.005872s : 52.19% event_method : 0.000020s : 0.18% auto_monad : 0.000064s : 0.56% graph_reusing : 0.000006s : 0.06% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000035s : 0.31% optimize.rewriter_before_opt_a : 0.000093s : 0.82% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000055s : 0.49% optimize.opt_a.loop_unroll : 0.000041s : 0.37% optimize.opt_a.a_1 : 0.000878s : 7.80% optimize.opt_a.with_stream_mark : 0.000028s : 0.25% optimize.opt_a.recompute_prepare : 0.000017s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000229s : 2.04% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.12% optimize.opt_a.merge_send_recv : 0.000013s : 0.12% optimize.opt_a.auto_parallel : 0.000012s : 0.11% optimize.opt_a.parallel : 0.000023s : 0.20% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.15% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.12% optimize.opt_a.virtual_output : 0.000014s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000016s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.28% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000023s : 0.21% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.07% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.23% optimize.opt_a.a_after_grad : 0.000022s : 0.20% optimize.opt_a.renormalize : 0.000555s : 4.93% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.19% optimize.opt_a.cse : 0.000048s : 0.42% optimize.opt_a.a_3 : 0.000120s : 1.06% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000037s : 0.33% optimize.convert_after_rewriter : 0.000010s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.08% optimize.mutable_eliminate : 0.000520s : 4.62% optimize.opt_b.b_1 : 0.000192s : 1.71% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.18% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.05% optimize.cconv : 0.000029s : 0.25% optimize.loop_unroll : 0.000433s : 3.85% optimize.opt_after_cconv.c_1 : 0.000036s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.17% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.15% optimize.tuple_transform.d_1 : 0.000049s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000049s : 0.44% optimize.cse_after_recomputation.cse : 0.000012s : 0.10% optimize.environ_conv : 0.000008s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000006s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000015s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000005s : 0.04% optimize.offloading_packed_experts : 0.000006s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.05% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000022s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000020s : 0.18% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000478s : 4.25% validate : 0.000036s : 0.32% Time group info: ------[substitution.] 0.000194 34 0.94% : 0.000002s : 2: substitution.elim_not_effective 0.70% : 0.000001s : 2: substitution.fold_const_symbol 3.09% : 0.000006s : 5: substitution.graph_param_transform 75.13% : 0.000145s : 4: substitution.inline 2.03% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.41% : 0.000005s : 4: substitution.remove_not_recompute_node 2.99% : 0.000006s : 6: substitution.replace_old_param 9.75% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.97% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005818 2 87.51% : 0.005092s : 1: type_inference.infer 12.49% : 0.000727s : 1: type_inference.specialize ------[replace.] 0.000070 10 56.03% : 0.000039s : 4: replace.inline 43.97% : 0.000031s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000159 10 89.77% : 0.000143s : 4: match.inline 10.23% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000232 1590 0.89% : 0.000002s : 16: predicate.accumulaten_eliminater 0.78% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 10: predicate.addn_check_dump 0.89% : 0.000002s : 16: predicate.addn_zero_filter 0.84% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.01% : 0.000005s : 26: predicate.arithmetic_simplify 0.90% : 0.000002s : 16: predicate.cast_eliminate 0.58% : 0.000001s : 10: predicate.check_bprop_eliminate 0.56% : 0.000001s : 10: predicate.compare_switch_simplify 0.22% : 0.000001s : 5: predicate.const_output_eliminate 0.61% : 0.000001s : 10: predicate.depend_value_elim 1.00% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.06% : 0.000002s : 16: predicate.dict_get_item_eliminator 1.01% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.89% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.35% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.10% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 21: predicate.environ_get_depend_swap 1.67% : 0.000004s : 31: predicate.environ_get_eliminate 1.08% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.54% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.52% : 0.000006s : 26: predicate.float_depend_g_call 0.50% : 0.000001s : 10: predicate.float_environ_get_switch 0.78% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.64% : 0.000001s : 10: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.56% : 0.000001s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.30% : 0.000015s : 72: predicate.inline 0.90% : 0.000002s : 10: predicate.inline_without_move 0.34% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.71% : 0.000002s : 10: predicate.less_batch_normalization 1.93% : 0.000004s : 32: predicate.list_to_tuple_eliminator_ 2.61% : 0.000006s : 48: predicate.load_eliminater 0.85% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.47% : 0.000006s : 40: predicate.loop_unroll_before_grad 1.57% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.58% : 0.000001s : 10: predicate.merge_addn 0.62% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 16: predicate.minmaximum_grad 0.96% : 0.000002s : 5: predicate.mutable_eliminate 0.32% : 0.000001s : 5: predicate.opt_reshape 0.34% : 0.000001s : 5: predicate.parallel_virtual_node 1.76% : 0.000004s : 26: predicate.partial_defer_inline 1.81% : 0.000004s : 27: predicate.partial_eliminate 0.91% : 0.000002s : 16: predicate.print_const_string_wrapper 0.60% : 0.000001s : 10: predicate.reduce_all_const_elim 1.10% : 0.000003s : 16: predicate.reduce_eliminate 2.66% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 10: predicate.remove_not_recompute_node 1.53% : 0.000004s : 32: predicate.replace_applicator 0.44% : 0.000001s : 10: predicate.replace_old_param 0.27% : 0.000001s : 5: predicate.reset_defer_inline 0.93% : 0.000002s : 16: predicate.reshape_eliminate 0.59% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 5: predicate.row_tensor_eliminate 0.78% : 0.000002s : 10: predicate.same_eliminate 0.48% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 10: predicate.shard_identity_eliminate 0.67% : 0.000002s : 10: predicate.special_op_eliminate 0.69% : 0.000002s : 10: predicate.specialize_transform 0.75% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.58% : 0.000004s : 26: predicate.switch_defer_inline 2.19% : 0.000005s : 36: predicate.switch_layer_defer_inline 5.28% : 0.000012s : 81: predicate.switch_simplify 0.88% : 0.000002s : 16: predicate.tile_eliminate 0.91% : 0.000002s : 16: predicate.transpose_eliminate 1.57% : 0.000004s : 26: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000003s : 26: predicate.tuple_list_get_item_depend_reorder 3.37% : 0.000008s : 42: predicate.tuple_list_get_item_eliminator 1.44% : 0.000003s : 26: predicate.tuple_list_get_set_item_eliminator 2.27% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.83% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.55% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.24% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.48% : 0.000001s : 5: predicate.value_based_eliminate 0.60% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000553 11 56.00% : 0.000309s : 5: func_graph_cloner_run.FuncGraphClonerGraph 44.00% : 0.000243s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026539 192 0.02% : 0.000006s : 1: ForceFp32Comm 11.50% : 0.003051s : 1: add_attr 11.45% : 0.003038s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.20% : 0.000053s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.27% : 0.000072s : 1: auto_monad 0.10% : 0.000027s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.80% : 0.000479s : 1: bootstrap 0.12% : 0.000032s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000018s : 1: control_data_broadcast_order 0.05% : 0.000013s : 1: convert_after_rewriter 0.11% : 0.000030s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000016s : 1: detach_backward 0.04% : 0.000011s : 1: environ_conv 0.11% : 0.000030s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.05% : 0.000013s : 1: graph_reusing 0.03% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.06% : 0.000017s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.65% : 0.000439s : 1: loop_unroll 0.03% : 0.000007s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.98% : 0.000526s : 1: mutable_eliminate 0.04% : 0.000009s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 5.13% : 0.001360s : 78: opt.transform.opt_a 0.13% : 0.000035s : 1: opt.transform.opt_after_cconv 0.10% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.48% : 0.000127s : 28: opt.transform.opt_b 0.21% : 0.000055s : 2: opt.transform.opt_trans_graph 0.15% : 0.000039s : 4: opt.transform.symbol_engine_opt 11.68% : 0.003100s : 1: opt_a 0.51% : 0.000136s : 1: opt_after_cconv 1.84% : 0.000488s : 1: opt_after_jit_grad 1.14% : 0.000301s : 1: opt_b 21.51% : 0.005710s : 1: optimize 0.09% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000009s : 1: parallel-infer-symbol 0.03% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000041s : 1: pre_auto_parallel 0.15% : 0.000039s : 1: py_interpret_to_execute 0.07% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000020s : 1: remove_dup_value 1.04% : 0.000276s : 1: renormalize.infer 1.02% : 0.000271s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000041s : 1: rewriter_after_opt_a 0.37% : 0.000097s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000100s : 1: symbol_engine_optimizer 0.37% : 0.000098s : 1: tuple_transform 22.25% : 0.005906s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:18.673.228 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0145216, [21] [bootstrap]: 0.00043155 [type_inference]: 0.0056449 [event_method]: 1.863e-05 [auto_monad]: 5.999e-05 [graph_reusing]: 5.66e-06 [inline]: 1.62001e-06 [add_attr]: 0.0029597, [1] [add_attr_with_inline]: 0.00295136, [1] [Cycle 1]: 5.447e-05, [2] [tag_attr]: 1.887e-05 [meta_addattr_fg_expand]: 5.96e-06 [parallel-infer-symbol]: 3.21999e-06 [pre_auto_parallel]: 3.3e-05 [insert-virtual-dataset]: 2.56e-06 [parallel-infer-symbol-second]: 8.90024e-07 [dataset_repeat_opt]: 1.98997e-06 [pipeline_split]: 1.72999e-06 [optimize]: 0.00471235, [53] [py_interpret_to_execute]: 2.745e-05 [rewriter_before_opt_a]: 0.00011118 [opt_a]: 0.0027142, [2] [Cycle 1]: 0.00201828, [45] [expand_dump_flag]: 3.06999e-06 [switch_simplify]: 4.599e-05 [loop_unroll]: 3.407e-05 [a_1]: 0.00070587 [with_stream_mark]: 1.415e-05 [recompute_prepare]: 9.53002e-06 [updatestate_depend_eliminate]: 3.68e-06 [updatestate_assign_eliminate]: 3.21001e-06 [updatestate_loads_eliminate]: 2.85002e-06 [parameter_eliminate]: 1.79e-06 [a_2]: 8.846e-05 [accelerated_algorithm]: 7.30998e-06 [shard]: 1.99999e-06 [meta_shard_fg_expand]: 2.14999e-06 [shard_inline]: 7.16001e-06 [merge_send_recv]: 7.67002e-06 [auto_parallel]: 5.86e-06 [parallel]: 1.834e-05 [flash_sp]: 7.34002e-06 [merge_comm]: 4.02e-06 [allreduce_fusion]: 3.51999e-06 [matmul_add_comm_reduction]: 9.35001e-06 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 8.52e-06 [virtual_dataset]: 7.31999e-06 [get_grad_eliminate_]: 7.00002e-06 [virtual_output]: 7e-06 [merge_forward]: 3.81999e-06 [cell_reuse_recompute_pass]: 1.10999e-06 [offload_activation]: 9.77999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.244e-05 [merge_recompute_call_nodes]: 1.40001e-06 [before_grad]: 1.059e-05 [set_forward_comm_id_for_comm_node_pass]: 3.53e-06 [meta_fg_expand]: 2.66e-06 [flash_sp_send_recv_attached]: 2.88998e-06 [receive_attached]: 2.45002e-06 [after_resolve]: 1.314e-05 [a_after_grad]: 1.144e-05 [renormalize]: 0.00058662 [add_forward_monad_depend]: 5.09e-06 [auto_monad_grad]: 1.89e-06 [auto_monad_eliminator]: 1.42e-05 [cse]: 3.185e-05 [a_3]: 5.009e-05 [Cycle 2]: 0.00068613, [45] [expand_dump_flag]: 1.17e-06 [switch_simplify]: 8.34998e-06 [loop_unroll]: 6.89999e-06 [a_1]: 0.00015736 [with_stream_mark]: 1.096e-05 [recompute_prepare]: 7.09001e-06 [updatestate_depend_eliminate]: 2.91e-06 [updatestate_assign_eliminate]: 2.71e-06 [updatestate_loads_eliminate]: 2.21e-06 [parameter_eliminate]: 9.60019e-07 [a_2]: 8.046e-05 [accelerated_algorithm]: 6.73998e-06 [shard]: 1.62999e-06 [meta_shard_fg_expand]: 1.25999e-06 [shard_inline]: 6.74999e-06 [merge_send_recv]: 4.47e-06 [auto_parallel]: 5.26002e-06 [parallel]: 4.11001e-06 [flash_sp]: 3.19001e-06 [merge_comm]: 3.09001e-06 [allreduce_fusion]: 3.19001e-06 [matmul_add_comm_reduction]: 5.05001e-06 [allreduce_slice_to_reducescatter]: 4.10015e-07 [virtual_shard_identity]: 7.18998e-06 [virtual_dataset]: 6.61e-06 [get_grad_eliminate_]: 9.20999e-06 [virtual_output]: 6.34999e-06 [merge_forward]: 2.72001e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 6.05002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.279e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 9.32001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.21999e-06 [meta_fg_expand]: 2.14e-06 [flash_sp_send_recv_attached]: 8.60018e-07 [receive_attached]: 1.07e-06 [after_resolve]: 1.344e-05 [a_after_grad]: 1.104e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.27e-06 [auto_monad_grad]: 9.5999e-07 [auto_monad_eliminator]: 6.73e-06 [cse]: 1.395e-05 [a_3]: 3.937e-05 [py_interpret_to_execute_after_opt_a]: 8.28001e-06 [slice_cell_reuse_recomputed_activation]: 1.76998e-06 [rewriter_after_opt_a]: 3.317e-05 [convert_after_rewriter]: 6.93998e-06 [order_py_execute_after_rewriter]: 4.91002e-06 [mutable_eliminate]: 0.00045872 [opt_b]: 0.00022203, [1] [Cycle 1]: 0.00021645, [7] [b_1]: 0.00014421 [b_2]: 8.85999e-06 [updatestate_depend_eliminate]: 5.27999e-06 [updatestate_assign_eliminate]: 2.43e-06 [updatestate_loads_eliminate]: 2.64001e-06 [renormalize]: 4.10015e-07 [cse]: 1.796e-05 [optimize_parallel_all_gather_comm]: 1.549e-05 [overlap_param_gather]: 1.85001e-06 [cconv]: 2.169e-05 [loop_unroll]: 0.00043686 [opt_after_cconv]: 0.00010445, [1] [Cycle 1]: 9.89e-05, [7] [c_1]: 3.53e-05 [parameter_eliminate]: 2.29999e-06 [updatestate_depend_eliminate]: 4.82e-06 [updatestate_assign_eliminate]: 2.29999e-06 [updatestate_loads_eliminate]: 2.25002e-06 [cse]: 1.811e-05 [renormalize]: 3.30008e-07 [remove_dup_value]: 1.317e-05 [tuple_transform]: 8.167e-05, [1] [Cycle 1]: 7.726e-05, [4] [d_1]: 4.977e-05 [none_parameter_eliminate]: 1.39998e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 7.41999e-06 [partial_unused_args_eliminate]: 2.28002e-06 [add_recomputation]: 4.531e-05 [cse_after_recomputation]: 2.161e-05, [1] [Cycle 1]: 1.714e-05, [1] [cse]: 1.146e-05 [environ_conv]: 4.66002e-06 [swap_dp_allreduce_reducescatter]: 5.10001e-06 [bias_add_comm_swap]: 2.39001e-06 [label_micro_interleaved_index]: 3.76001e-06 [label_fine_grained_interleaved_index]: 2.61e-06 [merge_cast_opt]: 1.30001e-06 [slice_recompute_activation]: 2.04e-06 [micro_interleaved_order_control]: 2.55997e-06 [assign_add_opt]: 1.17999e-06 [ForceFp32Comm]: 7.80012e-07 [remove_cast_before_assign_add]: 9.79984e-07 [full_micro_interleaved_order_control]: 1.96e-06 [reorder_send_recv_between_fp_bp]: 2.69001e-06 [comm_op_add_attrs]: 9.39996e-07 [add_comm_op_reuse_tag]: 9.5999e-07 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.17999e-06 [overlap_opt_shard_in_pipeline]: 1.14e-06 [overlap_opt_shard_grad_in_pipeline]: 2.02001e-06 [control_data_broadcast_order]: 1.204e-05 [grouped_pairwise_exchange_alltoall]: 1.51002e-06 [offloading_packed_experts]: 3.43e-06 [overlap_recompute_and_grad_model_parallel]: 4.54998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.29e-06 [overlap_recompute_comm]: 1.95001e-06 [overlap_grad_ring_attention]: 4.13001e-06 [overlap_grad_flash_sp]: 1.745e-05 [begin_end_overlap_inline]: 5.60016e-07 [split_matmul_comm_elemetwise]: 1.96998e-06 [split_layernorm_comm]: 1.52999e-06 [handle_group_info]: 9.90025e-07 [symbol_engine_optimizer]: 7.542e-05, [1] [Cycle 1]: 7.142e-05, [6] [build]: 2.75002e-06 [elim_shapecalc]: 1.031e-05 [elim_not_effective]: 1.325e-05 [opt_reshape]: 7.33e-06 [fold_const_symbol]: 1.045e-05 [renormalize]: 2.50002e-07 [detach_backward]: 1.87001e-06 [pipeline_parallel_scheduler]: 1.81e-06 [auto_monad_reorder]: 1.752e-05 [get_jit_bprop_graph]: 1.18001e-06 [rewriter_after_jit_bprop_graph]: 3.54002e-06 [opt_after_jit_grad]: 0.00044479 [validate]: 3.388e-05 Sums bootstrap : 0.000432s : 4.05% type_inference : 0.005645s : 53.04% event_method : 0.000019s : 0.18% auto_monad : 0.000060s : 0.56% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000033s : 0.31% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000027s : 0.26% optimize.rewriter_before_opt_a : 0.000111s : 1.04% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000054s : 0.51% optimize.opt_a.loop_unroll : 0.000041s : 0.38% optimize.opt_a.a_1 : 0.000863s : 8.11% optimize.opt_a.with_stream_mark : 0.000025s : 0.24% optimize.opt_a.recompute_prepare : 0.000017s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000169s : 1.59% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.13% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.13% optimize.opt_a.merge_send_recv : 0.000012s : 0.11% optimize.opt_a.auto_parallel : 0.000011s : 0.10% optimize.opt_a.parallel : 0.000022s : 0.21% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000007s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.15% optimize.opt_a.virtual_dataset : 0.000014s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.15% optimize.opt_a.virtual_output : 0.000013s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.02% optimize.opt_a.offload_activation : 0.000016s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.24% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000020s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000027s : 0.25% optimize.opt_a.a_after_grad : 0.000022s : 0.21% optimize.opt_a.renormalize : 0.000587s : 5.51% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.20% optimize.opt_a.cse : 0.000046s : 0.43% optimize.opt_a.a_3 : 0.000089s : 0.84% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.08% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000033s : 0.31% optimize.convert_after_rewriter : 0.000007s : 0.07% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000459s : 4.31% optimize.opt_b.b_1 : 0.000144s : 1.35% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.17% optimize.optimize_parallel_all_gather_comm : 0.000015s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000022s : 0.20% optimize.loop_unroll : 0.000437s : 4.10% optimize.opt_after_cconv.c_1 : 0.000035s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.17% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.12% optimize.tuple_transform.d_1 : 0.000050s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000045s : 0.43% optimize.cse_after_recomputation.cse : 0.000011s : 0.11% optimize.environ_conv : 0.000005s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.03% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000012s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000003s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000017s : 0.16% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000018s : 0.16% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000445s : 4.18% validate : 0.000034s : 0.32% Time group info: ------[substitution.] 0.000183 34 1.14% : 0.000002s : 2: substitution.elim_not_effective 0.89% : 0.000002s : 2: substitution.fold_const_symbol 3.50% : 0.000006s : 5: substitution.graph_param_transform 75.07% : 0.000137s : 4: substitution.inline 1.60% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.44% : 0.000004s : 4: substitution.remove_not_recompute_node 2.85% : 0.000005s : 6: substitution.replace_old_param 9.69% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.82% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005589 2 87.30% : 0.004879s : 1: type_inference.infer 12.70% : 0.000710s : 1: type_inference.specialize ------[replace.] 0.000066 10 53.90% : 0.000036s : 4: replace.inline 46.10% : 0.000031s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000150 10 89.85% : 0.000135s : 4: match.inline 10.15% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000228 1590 0.93% : 0.000002s : 16: predicate.accumulaten_eliminater 0.70% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 10: predicate.addn_check_dump 1.01% : 0.000002s : 16: predicate.addn_zero_filter 0.85% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 1.96% : 0.000004s : 26: predicate.arithmetic_simplify 1.04% : 0.000002s : 16: predicate.cast_eliminate 0.56% : 0.000001s : 10: predicate.check_bprop_eliminate 0.50% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.71% : 0.000002s : 10: predicate.depend_value_elim 0.98% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.02% : 0.000002s : 16: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.86% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 5: predicate.elim_not_effective 0.37% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.09% : 0.000002s : 21: predicate.environ_get_depend_swap 1.70% : 0.000004s : 31: predicate.environ_get_eliminate 1.25% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.53% : 0.000003s : 26: predicate.exchange_switch_depend_value 2.47% : 0.000006s : 26: predicate.float_depend_g_call 0.52% : 0.000001s : 10: predicate.float_environ_get_switch 0.75% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.68% : 0.000002s : 10: predicate.get_grad_eliminate 0.24% : 0.000001s : 5: predicate.graph_param_transform 0.60% : 0.000001s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 6.35% : 0.000014s : 72: predicate.inline 0.72% : 0.000002s : 10: predicate.inline_without_move 0.35% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.66% : 0.000001s : 10: predicate.less_batch_normalization 1.98% : 0.000005s : 32: predicate.list_to_tuple_eliminator_ 2.59% : 0.000006s : 48: predicate.load_eliminater 0.76% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.47% : 0.000006s : 40: predicate.loop_unroll_before_grad 1.64% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 10: predicate.merge_addn 0.53% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 16: predicate.minmaximum_grad 0.79% : 0.000002s : 5: predicate.mutable_eliminate 0.39% : 0.000001s : 5: predicate.opt_reshape 0.30% : 0.000001s : 5: predicate.parallel_virtual_node 1.98% : 0.000005s : 26: predicate.partial_defer_inline 1.82% : 0.000004s : 27: predicate.partial_eliminate 0.93% : 0.000002s : 16: predicate.print_const_string_wrapper 0.56% : 0.000001s : 10: predicate.reduce_all_const_elim 1.10% : 0.000003s : 16: predicate.reduce_eliminate 2.63% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 10: predicate.remove_not_recompute_node 1.54% : 0.000004s : 32: predicate.replace_applicator 0.53% : 0.000001s : 10: predicate.replace_old_param 0.29% : 0.000001s : 5: predicate.reset_defer_inline 0.93% : 0.000002s : 16: predicate.reshape_eliminate 0.56% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 5: predicate.row_tensor_eliminate 0.72% : 0.000002s : 10: predicate.same_eliminate 0.47% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.73% : 0.000002s : 10: predicate.shard_identity_eliminate 0.62% : 0.000001s : 10: predicate.special_op_eliminate 0.69% : 0.000002s : 10: predicate.specialize_transform 0.82% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.73% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.64% : 0.000004s : 26: predicate.switch_defer_inline 2.21% : 0.000005s : 36: predicate.switch_layer_defer_inline 5.33% : 0.000012s : 81: predicate.switch_simplify 0.93% : 0.000002s : 16: predicate.tile_eliminate 0.94% : 0.000002s : 16: predicate.transpose_eliminate 1.56% : 0.000004s : 26: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000003s : 26: predicate.tuple_list_get_item_depend_reorder 3.40% : 0.000008s : 42: predicate.tuple_list_get_item_eliminator 1.45% : 0.000003s : 26: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.79% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.58% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.22% : 0.000007s : 58: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 5: predicate.value_based_eliminate 0.61% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.71% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000511 11 55.00% : 0.000281s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.00% : 0.000230s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.024299 192 0.01% : 0.000003s : 1: ForceFp32Comm 12.20% : 0.002965s : 1: add_attr 12.16% : 0.002955s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.20% : 0.000049s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.27% : 0.000065s : 1: auto_monad 0.09% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.88% : 0.000458s : 1: bootstrap 0.10% : 0.000025s : 1: cconv 0.02% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000015s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.10% : 0.000025s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.10% : 0.000025s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.03% : 0.000006s : 1: insert-virtual-dataset 0.02% : 0.000004s : 1: interleave_parallel_branches 0.02% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000005s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.83% : 0.000445s : 1: loop_unroll 0.02% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.92% : 0.000467s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000014s : 1: opt.transform.mutable_eliminate 5.47% : 0.001330s : 78: opt.transform.opt_a 0.14% : 0.000034s : 1: opt.transform.opt_after_cconv 0.11% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.51% : 0.000123s : 28: opt.transform.opt_b 0.23% : 0.000055s : 2: opt.transform.opt_trans_graph 0.16% : 0.000038s : 4: opt.transform.symbol_engine_opt 11.18% : 0.002717s : 1: opt_a 0.44% : 0.000108s : 1: opt_after_cconv 1.86% : 0.000453s : 1: opt_after_jit_grad 0.93% : 0.000225s : 1: opt_b 19.41% : 0.004716s : 1: optimize 0.08% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000021s : 1: overlap_grad_flash_sp 0.02% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.02% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.15% : 0.000037s : 1: pre_auto_parallel 0.13% : 0.000031s : 1: py_interpret_to_execute 0.05% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000017s : 1: remove_dup_value 1.35% : 0.000327s : 1: renormalize.infer 1.04% : 0.000253s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.15% : 0.000037s : 1: rewriter_after_opt_a 0.48% : 0.000116s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000078s : 1: symbol_engine_optimizer 0.35% : 0.000084s : 1: tuple_transform 23.29% : 0.005660s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:18.858.509 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:18.858.750 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.016486, [21] [bootstrap]: 0.00043619 [type_inference]: 0.00582539 [event_method]: 1.978e-05 [auto_monad]: 6.139e-05 [graph_reusing]: 6.19999e-06 [inline]: 2.34001e-06 [add_attr]: 0.00296719, [1] [add_attr_with_inline]: 0.00295888, [1] [Cycle 1]: 6.95e-05, [2] [tag_attr]: 2.074e-05 [meta_addattr_fg_expand]: 6.58e-06 [parallel-infer-symbol]: 3.13e-06 [pre_auto_parallel]: 3.439e-05 [insert-virtual-dataset]: 2.37001e-06 [parallel-infer-symbol-second]: 9.60019e-07 [dataset_repeat_opt]: 2.12001e-06 [pipeline_split]: 1.55999e-06 [optimize]: 0.00600406, [53] [py_interpret_to_execute]: 5.948e-05 [rewriter_before_opt_a]: 9.813e-05 [opt_a]: 0.00354412, [2] [Cycle 1]: 0.00254054, [45] [expand_dump_flag]: 2.93e-06 [switch_simplify]: 4.626e-05 [loop_unroll]: 3.526e-05 [a_1]: 0.0008576 [with_stream_mark]: 1.565e-05 [recompute_prepare]: 1.022e-05 [updatestate_depend_eliminate]: 4.67e-06 [updatestate_assign_eliminate]: 4.42e-06 [updatestate_loads_eliminate]: 3.62002e-06 [parameter_eliminate]: 2.17999e-06 [a_2]: 0.00014878 [accelerated_algorithm]: 9.22999e-06 [shard]: 1.81e-06 [meta_shard_fg_expand]: 2.32999e-06 [shard_inline]: 8.47e-06 [merge_send_recv]: 9.33002e-06 [auto_parallel]: 7.68001e-06 [parallel]: 1.741e-05 [flash_sp]: 8.42e-06 [merge_comm]: 4.79e-06 [allreduce_fusion]: 4.58999e-06 [matmul_add_comm_reduction]: 9.87999e-06 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 1.032e-05 [virtual_dataset]: 8.78001e-06 [get_grad_eliminate_]: 8.40999e-06 [virtual_output]: 8.14997e-06 [merge_forward]: 4.50001e-06 [cell_reuse_recompute_pass]: 1.33002e-06 [offload_activation]: 1.887e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.768e-05 [merge_recompute_call_nodes]: 1.39998e-06 [before_grad]: 1.292e-05 [set_forward_comm_id_for_comm_node_pass]: 4.56002e-06 [meta_fg_expand]: 3.70998e-06 [flash_sp_send_recv_attached]: 2.56e-06 [receive_attached]: 1.91e-06 [after_resolve]: 1.457e-05 [a_after_grad]: 1.355e-05 [renormalize]: 0.0006782 [add_forward_monad_depend]: 5.20001e-06 [auto_monad_grad]: 2.07999e-06 [auto_monad_eliminator]: 1.654e-05 [cse]: 3.432e-05 [a_3]: 7.483e-05 [Cycle 2]: 0.00099033, [45] [expand_dump_flag]: 1.10001e-06 [switch_simplify]: 9.94001e-06 [loop_unroll]: 8.28999e-06 [a_1]: 0.00020157 [with_stream_mark]: 1.233e-05 [recompute_prepare]: 8.48001e-06 [updatestate_depend_eliminate]: 3.85e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 3.10998e-06 [parameter_eliminate]: 1.13001e-06 [a_2]: 0.00012832 [accelerated_algorithm]: 8.32e-06 [shard]: 1.27999e-06 [meta_shard_fg_expand]: 1.82001e-06 [shard_inline]: 8.17e-06 [merge_send_recv]: 5.77001e-06 [auto_parallel]: 6.19999e-06 [parallel]: 4.61002e-06 [flash_sp]: 3.53e-06 [merge_comm]: 4.07e-06 [allreduce_fusion]: 4.15e-06 [matmul_add_comm_reduction]: 6.88e-06 [allreduce_slice_to_reducescatter]: 3.60014e-07 [virtual_shard_identity]: 9.39e-06 [virtual_dataset]: 8.01001e-06 [get_grad_eliminate_]: 7.75e-06 [virtual_output]: 7.71999e-06 [merge_forward]: 4.05e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 7.08e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.649e-05 [merge_recompute_call_nodes]: 7.39994e-07 [before_grad]: 1.256e-05 [set_forward_comm_id_for_comm_node_pass]: 4.32998e-06 [meta_fg_expand]: 3.08e-06 [flash_sp_send_recv_attached]: 9.20001e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 1.453e-05 [a_after_grad]: 1.312e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.45001e-06 [auto_monad_grad]: 1.03001e-06 [auto_monad_eliminator]: 8.61002e-06 [cse]: 2.025e-05 [a_3]: 6.958e-05 [py_interpret_to_execute_after_opt_a]: 1.386e-05 [slice_cell_reuse_recomputed_activation]: 4.74002e-06 [rewriter_after_opt_a]: 4.599e-05 [convert_after_rewriter]: 1.104e-05 [order_py_execute_after_rewriter]: 8.87999e-06 [mutable_eliminate]: 0.00051852 [opt_b]: 0.00033622, [1] [Cycle 1]: 0.00032701, [7] [b_1]: 0.00022278 [b_2]: 1.04e-05 [updatestate_depend_eliminate]: 6.12999e-06 [updatestate_assign_eliminate]: 3.19001e-06 [updatestate_loads_eliminate]: 3.75e-06 [renormalize]: 4.19997e-07 [cse]: 2.284e-05 [optimize_parallel_all_gather_comm]: 2.108e-05 [overlap_param_gather]: 4.50999e-06 [cconv]: 2.779e-05 [loop_unroll]: 0.0004496 [opt_after_cconv]: 0.00014349, [1] [Cycle 1]: 0.00013491, [7] [c_1]: 4.21e-05 [parameter_eliminate]: 2.74999e-06 [updatestate_depend_eliminate]: 6.08002e-06 [updatestate_assign_eliminate]: 3.23998e-06 [updatestate_loads_eliminate]: 3.13e-06 [cse]: 2.24e-05 [renormalize]: 3.30008e-07 [remove_dup_value]: 1.879e-05 [tuple_transform]: 0.00010488, [1] [Cycle 1]: 9.741e-05, [4] [d_1]: 5.703e-05 [none_parameter_eliminate]: 1.57001e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 9.12001e-06 [partial_unused_args_eliminate]: 4.57e-06 [add_recomputation]: 5.781e-05 [cse_after_recomputation]: 3.205e-05, [1] [Cycle 1]: 2.495e-05, [1] [cse]: 1.587e-05 [environ_conv]: 9.59e-06 [swap_dp_allreduce_reducescatter]: 9.24e-06 [bias_add_comm_swap]: 5.09e-06 [label_micro_interleaved_index]: 6.53998e-06 [label_fine_grained_interleaved_index]: 4.88001e-06 [merge_cast_opt]: 3.61001e-06 [slice_recompute_activation]: 4.45e-06 [micro_interleaved_order_control]: 4.57e-06 [assign_add_opt]: 3.73001e-06 [ForceFp32Comm]: 3.16999e-06 [remove_cast_before_assign_add]: 3.45e-06 [full_micro_interleaved_order_control]: 4.26001e-06 [reorder_send_recv_between_fp_bp]: 5.24e-06 [comm_op_add_attrs]: 3.45e-06 [add_comm_op_reuse_tag]: 3.16001e-06 [interleave_split_concat_branches]: 3.49001e-06 [interleave_parallel_branches]: 3.38999e-06 [overlap_opt_shard_in_pipeline]: 3.45e-06 [overlap_opt_shard_grad_in_pipeline]: 4.31002e-06 [control_data_broadcast_order]: 1.742e-05 [grouped_pairwise_exchange_alltoall]: 4.07e-06 [offloading_packed_experts]: 6.91999e-06 [overlap_recompute_and_grad_model_parallel]: 7.3e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.53999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.75e-06 [overlap_recompute_comm]: 5.08002e-06 [overlap_grad_ring_attention]: 7.08998e-06 [overlap_grad_flash_sp]: 2.319e-05 [begin_end_overlap_inline]: 3.07002e-06 [split_matmul_comm_elemetwise]: 4.49002e-06 [split_layernorm_comm]: 4.64002e-06 [handle_group_info]: 3.18998e-06 [symbol_engine_optimizer]: 0.00010488, [1] [Cycle 1]: 9.81e-05, [6] [build]: 3.54002e-06 [elim_shapecalc]: 1.169e-05 [elim_not_effective]: 1.547e-05 [opt_reshape]: 9.21998e-06 [fold_const_symbol]: 1.301e-05 [renormalize]: 1.99972e-07 [detach_backward]: 3.23e-06 [pipeline_parallel_scheduler]: 2.11e-06 [auto_monad_reorder]: 2.173e-05 [get_jit_bprop_graph]: 1.40999e-06 [rewriter_after_jit_bprop_graph]: 4.50999e-06 [opt_after_jit_grad]: 0.00048456 [validate]: 3.987e-05 Sums bootstrap : 0.000436s : 3.69% type_inference : 0.005825s : 49.35% event_method : 0.000020s : 0.17% auto_monad : 0.000061s : 0.52% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000034s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000059s : 0.50% optimize.rewriter_before_opt_a : 0.000098s : 0.83% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000056s : 0.48% optimize.opt_a.loop_unroll : 0.000044s : 0.37% optimize.opt_a.a_1 : 0.001059s : 8.97% optimize.opt_a.with_stream_mark : 0.000028s : 0.24% optimize.opt_a.recompute_prepare : 0.000019s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000277s : 2.35% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.15% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000017s : 0.14% optimize.opt_a.merge_send_recv : 0.000015s : 0.13% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000022s : 0.19% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000009s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.17% optimize.opt_a.virtual_dataset : 0.000017s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.14% optimize.opt_a.virtual_output : 0.000016s : 0.13% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000026s : 0.22% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000025s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.08% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.02% optimize.opt_a.after_resolve : 0.000029s : 0.25% optimize.opt_a.a_after_grad : 0.000027s : 0.23% optimize.opt_a.renormalize : 0.000678s : 5.75% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.21% optimize.opt_a.cse : 0.000055s : 0.46% optimize.opt_a.a_3 : 0.000144s : 1.22% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000046s : 0.39% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.08% optimize.mutable_eliminate : 0.000519s : 4.39% optimize.opt_b.b_1 : 0.000223s : 1.89% optimize.opt_b.b_2 : 0.000010s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.18% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000028s : 0.24% optimize.loop_unroll : 0.000450s : 3.81% optimize.opt_after_cconv.c_1 : 0.000042s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000022s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.16% optimize.tuple_transform.d_1 : 0.000057s : 0.48% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000058s : 0.49% optimize.cse_after_recomputation.cse : 0.000016s : 0.13% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.08% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000004s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000003s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000017s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000023s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.04% optimize.split_layernorm_comm : 0.000005s : 0.04% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.02% auto_monad_reorder : 0.000022s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000485s : 4.10% validate : 0.000040s : 0.34% Time group info: ------[substitution.] 0.000212 44 9.52% : 0.000020s : 3: substitution.cast_eliminate 1.00% : 0.000002s : 3: substitution.elim_not_effective 0.81% : 0.000002s : 3: substitution.fold_const_symbol 3.22% : 0.000007s : 6: substitution.graph_param_transform 67.09% : 0.000142s : 4: substitution.inline 1.96% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.76% : 0.000006s : 6: substitution.remove_not_recompute_node 2.62% : 0.000006s : 6: substitution.replace_old_param 8.46% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.56% : 0.000005s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005776 2 86.80% : 0.005014s : 1: type_inference.infer 13.20% : 0.000762s : 1: type_inference.specialize ------[replace.] 0.000072 10 52.87% : 0.000038s : 4: replace.inline 47.13% : 0.000034s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000155 10 90.22% : 0.000139s : 4: match.inline 9.78% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000292 1954 0.93% : 0.000003s : 21: predicate.accumulaten_eliminater 0.55% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.60% : 0.000002s : 12: predicate.addn_check_dump 0.95% : 0.000003s : 21: predicate.addn_zero_filter 0.88% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 1.96% : 0.000006s : 33: predicate.arithmetic_simplify 1.08% : 0.000003s : 21: predicate.cast_eliminate 0.63% : 0.000002s : 12: predicate.check_bprop_eliminate 0.52% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.59% : 0.000002s : 12: predicate.depend_value_elim 1.01% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.98% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.84% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 6: predicate.elim_not_effective 0.37% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.27% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.22% : 0.000004s : 27: predicate.environ_get_add_eliminate 1.25% : 0.000004s : 27: predicate.environ_get_depend_swap 1.85% : 0.000005s : 39: predicate.environ_get_eliminate 1.23% : 0.000004s : 27: predicate.environ_get_set_eliminate 1.48% : 0.000004s : 31: predicate.exchange_switch_depend_value 2.29% : 0.000007s : 31: predicate.float_depend_g_call 0.53% : 0.000002s : 12: predicate.float_environ_get_switch 0.80% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 6: predicate.fold_const_symbol 0.63% : 0.000002s : 12: predicate.get_grad_eliminate 0.22% : 0.000001s : 6: predicate.graph_param_transform 0.56% : 0.000002s : 12: predicate.incorporate_call 0.50% : 0.000001s : 12: predicate.incorporate_call_switch 6.31% : 0.000018s : 88: predicate.inline 0.92% : 0.000003s : 12: predicate.inline_without_move 0.35% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.66% : 0.000002s : 12: predicate.less_batch_normalization 1.88% : 0.000006s : 39: predicate.list_to_tuple_eliminator_ 2.73% : 0.000008s : 60: predicate.load_eliminater 0.79% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.01% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.62% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.56% : 0.000002s : 12: predicate.merge_addn 0.60% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.63% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.91% : 0.000003s : 21: predicate.minmaximum_grad 0.77% : 0.000002s : 6: predicate.mutable_eliminate 0.39% : 0.000001s : 6: predicate.opt_reshape 0.35% : 0.000001s : 6: predicate.parallel_virtual_node 1.72% : 0.000005s : 31: predicate.partial_defer_inline 1.81% : 0.000005s : 33: predicate.partial_eliminate 1.07% : 0.000003s : 21: predicate.print_const_string_wrapper 0.59% : 0.000002s : 12: predicate.reduce_all_const_elim 1.26% : 0.000004s : 21: predicate.reduce_eliminate 2.68% : 0.000008s : 60: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 12: predicate.remove_not_recompute_node 1.47% : 0.000004s : 39: predicate.replace_applicator 0.43% : 0.000001s : 12: predicate.replace_old_param 0.23% : 0.000001s : 6: predicate.reset_defer_inline 1.03% : 0.000003s : 21: predicate.reshape_eliminate 0.65% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 6: predicate.row_tensor_eliminate 0.78% : 0.000002s : 12: predicate.same_eliminate 0.43% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.69% : 0.000002s : 12: predicate.shard_identity_eliminate 0.72% : 0.000002s : 12: predicate.special_op_eliminate 0.72% : 0.000002s : 12: predicate.specialize_transform 0.74% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.57% : 0.000005s : 31: predicate.switch_defer_inline 2.16% : 0.000006s : 43: predicate.switch_layer_defer_inline 4.78% : 0.000014s : 91: predicate.switch_simplify 0.92% : 0.000003s : 21: predicate.tile_eliminate 0.96% : 0.000003s : 21: predicate.transpose_eliminate 1.58% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.54% : 0.000004s : 33: predicate.tuple_list_get_item_depend_reorder 3.13% : 0.000009s : 51: predicate.tuple_list_get_item_eliminator 1.59% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.80% : 0.000005s : 39: predicate.tuple_to_list_eliminator_ 2.66% : 0.000008s : 60: predicate.updatestate_pure_node_eliminater 3.36% : 0.000010s : 72: predicate.updatestate_useless_node_eliminater 0.49% : 0.000001s : 6: predicate.value_based_eliminate 0.59% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.57% : 0.000002s : 12: predicate.virtual_output_eliminate 0.27% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000561 11 51.54% : 0.000289s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.46% : 0.000272s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028029 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.62% : 0.002976s : 1: add_attr 10.57% : 0.002963s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.22% : 0.000062s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.26% : 0.000072s : 1: auto_monad 0.10% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.70% : 0.000477s : 1: bootstrap 0.11% : 0.000031s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000020s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.13% : 0.000035s : 1: cse_after_recomputation 0.03% : 0.000008s : 1: dataset_repeat_opt 0.06% : 0.000017s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.10% : 0.000029s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000013s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.04% : 0.000010s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.62% : 0.000455s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.03% : 0.000007s : 1: micro_interleaved_order_control 1.87% : 0.000525s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000017s : 1: opt.transform.mutable_eliminate 5.87% : 0.001645s : 78: opt.transform.opt_a 0.15% : 0.000041s : 1: opt.transform.opt_after_cconv 0.11% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.57% : 0.000161s : 28: opt.transform.opt_b 0.23% : 0.000064s : 2: opt.transform.opt_trans_graph 0.16% : 0.000046s : 4: opt.transform.symbol_engine_opt 12.66% : 0.003547s : 1: opt_a 0.52% : 0.000147s : 1: opt_after_cconv 1.77% : 0.000495s : 1: opt_after_jit_grad 1.21% : 0.000340s : 1: opt_b 22.59% : 0.006333s : 1: optimize 0.09% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.09% : 0.000026s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000010s : 1: overlap_grad_ring_attention 0.03% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000008s : 1: partial_unused_args_eliminate 0.04% : 0.000010s : 1: pipeline_parallel_scheduler 0.03% : 0.000007s : 1: pipeline_split 0.15% : 0.000042s : 1: pre_auto_parallel 0.23% : 0.000064s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.08% : 0.000022s : 1: remove_dup_value 1.41% : 0.000396s : 1: renormalize.infer 0.98% : 0.000275s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000050s : 1: rewriter_after_opt_a 0.36% : 0.000102s : 1: rewriter_before_opt_a 0.03% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000007s : 1: slice_recompute_activation 0.03% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000007s : 1: split_matmul_comm_elemetwise 0.04% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.38% : 0.000108s : 1: symbol_engine_optimizer 0.38% : 0.000108s : 1: tuple_transform 20.90% : 0.005859s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:19.507.17 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0161298, [21] [bootstrap]: 0.00046768 [type_inference]: 0.00603094 [event_method]: 2.101e-05 [auto_monad]: 6.34e-05 [graph_reusing]: 5.66e-06 [inline]: 2.69001e-06 [add_attr]: 0.0031477, [1] [add_attr_with_inline]: 0.00313829, [1] [Cycle 1]: 6.133e-05, [2] [tag_attr]: 2.17e-05 [meta_addattr_fg_expand]: 6.30002e-06 [parallel-infer-symbol]: 2.86e-06 [pre_auto_parallel]: 3.528e-05 [insert-virtual-dataset]: 2.73998e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 1.95001e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00560835, [53] [py_interpret_to_execute]: 2.914e-05 [rewriter_before_opt_a]: 9.717e-05 [opt_a]: 0.00334129, [2] [Cycle 1]: 0.00248936, [45] [expand_dump_flag]: 2.93e-06 [switch_simplify]: 4.813e-05 [loop_unroll]: 3.552e-05 [a_1]: 0.00090912 [with_stream_mark]: 1.626e-05 [recompute_prepare]: 1.101e-05 [updatestate_depend_eliminate]: 5.34e-06 [updatestate_assign_eliminate]: 4.13001e-06 [updatestate_loads_eliminate]: 3.67002e-06 [parameter_eliminate]: 1.84998e-06 [a_2]: 0.00011185 [accelerated_algorithm]: 9.23002e-06 [shard]: 2.27999e-06 [meta_shard_fg_expand]: 1.96e-06 [shard_inline]: 8.44998e-06 [merge_send_recv]: 9.99999e-06 [auto_parallel]: 7.58001e-06 [parallel]: 2.024e-05 [flash_sp]: 8.95001e-06 [merge_comm]: 5.10001e-06 [allreduce_fusion]: 4.67e-06 [matmul_add_comm_reduction]: 1.157e-05 [allreduce_slice_to_reducescatter]: 9.70002e-07 [virtual_shard_identity]: 1.044e-05 [virtual_dataset]: 8.94e-06 [get_grad_eliminate_]: 8.27e-06 [virtual_output]: 8.58001e-06 [merge_forward]: 4.88001e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 1.124e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.774e-05 [merge_recompute_call_nodes]: 1.44998e-06 [before_grad]: 1.334e-05 [set_forward_comm_id_for_comm_node_pass]: 5.09998e-06 [meta_fg_expand]: 3.32997e-06 [flash_sp_send_recv_attached]: 2.79001e-06 [receive_attached]: 2.86e-06 [after_resolve]: 1.497e-05 [a_after_grad]: 1.357e-05 [renormalize]: 0.00074689 [add_forward_monad_depend]: 5.10001e-06 [auto_monad_grad]: 2.56e-06 [auto_monad_eliminator]: 1.748e-05 [cse]: 3.645e-05 [a_3]: 6.319e-05 [Cycle 2]: 0.00084046, [45] [expand_dump_flag]: 1.28002e-06 [switch_simplify]: 1.018e-05 [loop_unroll]: 8.47e-06 [a_1]: 0.0002064 [with_stream_mark]: 1.334e-05 [recompute_prepare]: 8.85999e-06 [updatestate_depend_eliminate]: 4.08999e-06 [updatestate_assign_eliminate]: 3.03e-06 [updatestate_loads_eliminate]: 2.95998e-06 [parameter_eliminate]: 1.18001e-06 [a_2]: 0.00010404 [accelerated_algorithm]: 8.03001e-06 [shard]: 1.63997e-06 [meta_shard_fg_expand]: 2.06e-06 [shard_inline]: 8.08001e-06 [merge_send_recv]: 6.51e-06 [auto_parallel]: 6.61e-06 [parallel]: 5.91e-06 [flash_sp]: 3.75e-06 [merge_comm]: 4.48001e-06 [allreduce_fusion]: 4.13001e-06 [matmul_add_comm_reduction]: 6.92002e-06 [allreduce_slice_to_reducescatter]: 3.29979e-07 [virtual_shard_identity]: 9.96e-06 [virtual_dataset]: 7.98999e-06 [get_grad_eliminate_]: 7.66999e-06 [virtual_output]: 7.95e-06 [merge_forward]: 3.98999e-06 [cell_reuse_recompute_pass]: 1.42999e-06 [offload_activation]: 8.33001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.499e-05 [merge_recompute_call_nodes]: 1.10999e-06 [before_grad]: 1.267e-05 [set_forward_comm_id_for_comm_node_pass]: 4.45999e-06 [meta_fg_expand]: 2.57001e-06 [flash_sp_send_recv_attached]: 1.29e-06 [receive_attached]: 1.23002e-06 [after_resolve]: 1.511e-05 [a_after_grad]: 1.248e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 2.36998e-06 [auto_monad_grad]: 1.36002e-06 [auto_monad_eliminator]: 1.002e-05 [cse]: 2.098e-05 [a_3]: 5.14e-05 [py_interpret_to_execute_after_opt_a]: 1.128e-05 [slice_cell_reuse_recomputed_activation]: 2.11e-06 [rewriter_after_opt_a]: 4.334e-05 [convert_after_rewriter]: 7.77002e-06 [order_py_execute_after_rewriter]: 5.83002e-06 [mutable_eliminate]: 0.00054619 [opt_b]: 0.00029517, [1] [Cycle 1]: 0.00028886, [7] [b_1]: 0.00020057 [b_2]: 1.146e-05 [updatestate_depend_eliminate]: 7.3e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 3.75e-06 [renormalize]: 5.00004e-07 [cse]: 2.524e-05 [optimize_parallel_all_gather_comm]: 1.847e-05 [overlap_param_gather]: 1.87001e-06 [cconv]: 2.659e-05 [loop_unroll]: 0.00045045 [opt_after_cconv]: 0.00012298, [1] [Cycle 1]: 0.00011723, [7] [c_1]: 4.267e-05 [parameter_eliminate]: 3.28e-06 [updatestate_depend_eliminate]: 5.94999e-06 [updatestate_assign_eliminate]: 3.38999e-06 [updatestate_loads_eliminate]: 3.05002e-06 [cse]: 2.388e-05 [renormalize]: 2.80008e-07 [remove_dup_value]: 1.574e-05 [tuple_transform]: 9.318e-05, [1] [Cycle 1]: 8.87e-05, [4] [d_1]: 5.916e-05 [none_parameter_eliminate]: 1.62999e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 8.83001e-06 [partial_unused_args_eliminate]: 1.79e-06 [add_recomputation]: 5.543e-05 [cse_after_recomputation]: 2.589e-05, [1] [Cycle 1]: 2.144e-05, [1] [cse]: 1.581e-05 [environ_conv]: 6.98e-06 [swap_dp_allreduce_reducescatter]: 5.82001e-06 [bias_add_comm_swap]: 2.86e-06 [label_micro_interleaved_index]: 4.45e-06 [label_fine_grained_interleaved_index]: 2.64999e-06 [merge_cast_opt]: 1.21002e-06 [slice_recompute_activation]: 2.35002e-06 [micro_interleaved_order_control]: 2.34999e-06 [assign_add_opt]: 1.40001e-06 [ForceFp32Comm]: 9.49978e-07 [remove_cast_before_assign_add]: 1.09998e-06 [full_micro_interleaved_order_control]: 2.29001e-06 [reorder_send_recv_between_fp_bp]: 2.94999e-06 [comm_op_add_attrs]: 1.20999e-06 [add_comm_op_reuse_tag]: 9.19972e-07 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.17e-06 [overlap_opt_shard_grad_in_pipeline]: 1.80001e-06 [control_data_broadcast_order]: 1.586e-05 [grouped_pairwise_exchange_alltoall]: 1.52999e-06 [offloading_packed_experts]: 4.3e-06 [overlap_recompute_and_grad_model_parallel]: 5.30001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.39998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.41002e-06 [overlap_recompute_comm]: 2.76e-06 [overlap_grad_ring_attention]: 4.69002e-06 [overlap_grad_flash_sp]: 2.161e-05 [begin_end_overlap_inline]: 8.70001e-07 [split_matmul_comm_elemetwise]: 2.24001e-06 [split_layernorm_comm]: 1.96e-06 [handle_group_info]: 9.30013e-07 [symbol_engine_optimizer]: 8.875e-05, [1] [Cycle 1]: 8.42e-05, [6] [build]: 3.21999e-06 [elim_shapecalc]: 1.23e-05 [elim_not_effective]: 1.7e-05 [opt_reshape]: 8.72e-06 [fold_const_symbol]: 1.307e-05 [renormalize]: 2.10013e-07 [detach_backward]: 1.87001e-06 [pipeline_parallel_scheduler]: 1.64998e-06 [auto_monad_reorder]: 2.1e-05 [get_jit_bprop_graph]: 1.85001e-06 [rewriter_after_jit_bprop_graph]: 4.13999e-06 [opt_after_jit_grad]: 0.00050665 [validate]: 4.585e-05 Sums bootstrap : 0.000468s : 3.89% type_inference : 0.006031s : 50.22% event_method : 0.000021s : 0.17% auto_monad : 0.000063s : 0.53% graph_reusing : 0.000006s : 0.05% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000003s : 0.02% pre_auto_parallel : 0.000035s : 0.29% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000029s : 0.24% optimize.rewriter_before_opt_a : 0.000097s : 0.81% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000058s : 0.49% optimize.opt_a.loop_unroll : 0.000044s : 0.37% optimize.opt_a.a_1 : 0.001116s : 9.29% optimize.opt_a.with_stream_mark : 0.000030s : 0.25% optimize.opt_a.recompute_prepare : 0.000020s : 0.17% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000216s : 1.80% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.14% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000017s : 0.14% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000014s : 0.12% optimize.opt_a.parallel : 0.000026s : 0.22% optimize.opt_a.flash_sp : 0.000013s : 0.11% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.15% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.17% optimize.opt_a.virtual_dataset : 0.000017s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.13% optimize.opt_a.virtual_output : 0.000017s : 0.14% optimize.opt_a.merge_forward : 0.000009s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000020s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.08% optimize.opt_a.meta_fg_expand : 0.000006s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000030s : 0.25% optimize.opt_a.a_after_grad : 0.000026s : 0.22% optimize.opt_a.renormalize : 0.000747s : 6.22% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.23% optimize.opt_a.cse : 0.000057s : 0.48% optimize.opt_a.a_3 : 0.000115s : 0.95% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000043s : 0.36% optimize.convert_after_rewriter : 0.000008s : 0.06% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000546s : 4.55% optimize.opt_b.b_1 : 0.000201s : 1.67% optimize.opt_b.b_2 : 0.000011s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000025s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.15% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000027s : 0.22% optimize.loop_unroll : 0.000450s : 3.75% optimize.opt_after_cconv.c_1 : 0.000043s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.13% optimize.tuple_transform.d_1 : 0.000059s : 0.49% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000055s : 0.46% optimize.cse_after_recomputation.cse : 0.000016s : 0.13% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000004s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.13% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.02% optimize.overlap_grad_ring_attention : 0.000005s : 0.04% optimize.overlap_grad_flash_sp : 0.000022s : 0.18% optimize.begin_end_overlap_inline : 0.000001s : 0.01% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.17% get_jit_bprop_graph : 0.000002s : 0.02% rewriter_after_jit_bprop_graph : 0.000004s : 0.03% opt_after_jit_grad : 0.000507s : 4.22% validate : 0.000046s : 0.38% Time group info: ------[substitution.] 0.000259 44 8.57% : 0.000022s : 3: substitution.cast_eliminate 1.00% : 0.000003s : 3: substitution.elim_not_effective 0.78% : 0.000002s : 3: substitution.fold_const_symbol 2.82% : 0.000007s : 6: substitution.graph_param_transform 70.74% : 0.000183s : 4: substitution.inline 1.72% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.63% : 0.000007s : 6: substitution.remove_not_recompute_node 2.34% : 0.000006s : 6: substitution.replace_old_param 7.14% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.26% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005967 2 86.71% : 0.005174s : 1: type_inference.infer 13.29% : 0.000793s : 1: type_inference.specialize ------[replace.] 0.000074 10 51.68% : 0.000038s : 4: replace.inline 48.32% : 0.000036s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000196 10 91.97% : 0.000180s : 4: match.inline 8.03% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000299 1954 0.97% : 0.000003s : 21: predicate.accumulaten_eliminater 0.76% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.55% : 0.000002s : 12: predicate.addn_check_dump 0.93% : 0.000003s : 21: predicate.addn_zero_filter 0.90% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 1.96% : 0.000006s : 33: predicate.arithmetic_simplify 1.03% : 0.000003s : 21: predicate.cast_eliminate 0.60% : 0.000002s : 12: predicate.check_bprop_eliminate 0.52% : 0.000002s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.60% : 0.000002s : 12: predicate.depend_value_elim 0.99% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.08% : 0.000003s : 21: predicate.dict_get_item_eliminator 1.00% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.87% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 6: predicate.elim_not_effective 0.36% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000004s : 27: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 27: predicate.environ_get_add_eliminate 1.20% : 0.000004s : 27: predicate.environ_get_depend_swap 1.92% : 0.000006s : 39: predicate.environ_get_eliminate 1.21% : 0.000004s : 27: predicate.environ_get_set_eliminate 1.50% : 0.000004s : 31: predicate.exchange_switch_depend_value 2.35% : 0.000007s : 31: predicate.float_depend_g_call 0.56% : 0.000002s : 12: predicate.float_environ_get_switch 0.77% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 6: predicate.fold_const_symbol 0.63% : 0.000002s : 12: predicate.get_grad_eliminate 0.19% : 0.000001s : 6: predicate.graph_param_transform 0.58% : 0.000002s : 12: predicate.incorporate_call 0.52% : 0.000002s : 12: predicate.incorporate_call_switch 6.19% : 0.000019s : 88: predicate.inline 0.77% : 0.000002s : 12: predicate.inline_without_move 0.38% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.72% : 0.000002s : 12: predicate.less_batch_normalization 1.85% : 0.000006s : 39: predicate.list_to_tuple_eliminator_ 2.70% : 0.000008s : 60: predicate.load_eliminater 0.70% : 0.000002s : 6: predicate.loop_unroll_after_grad 1.98% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.69% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.53% : 0.000002s : 12: predicate.merge_addn 0.59% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.60% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.90% : 0.000003s : 21: predicate.minmaximum_grad 0.90% : 0.000003s : 6: predicate.mutable_eliminate 0.30% : 0.000001s : 6: predicate.opt_reshape 0.49% : 0.000001s : 6: predicate.parallel_virtual_node 1.81% : 0.000005s : 31: predicate.partial_defer_inline 1.78% : 0.000005s : 33: predicate.partial_eliminate 1.07% : 0.000003s : 21: predicate.print_const_string_wrapper 0.55% : 0.000002s : 12: predicate.reduce_all_const_elim 1.15% : 0.000003s : 21: predicate.reduce_eliminate 2.70% : 0.000008s : 60: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 12: predicate.remove_not_recompute_node 1.43% : 0.000004s : 39: predicate.replace_applicator 0.42% : 0.000001s : 12: predicate.replace_old_param 0.25% : 0.000001s : 6: predicate.reset_defer_inline 1.08% : 0.000003s : 21: predicate.reshape_eliminate 0.62% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 6: predicate.row_tensor_eliminate 0.66% : 0.000002s : 12: predicate.same_eliminate 0.42% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.70% : 0.000002s : 12: predicate.shard_identity_eliminate 0.61% : 0.000002s : 12: predicate.special_op_eliminate 0.68% : 0.000002s : 12: predicate.specialize_transform 0.78% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000003s : 12: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.55% : 0.000005s : 31: predicate.switch_defer_inline 2.12% : 0.000006s : 43: predicate.switch_layer_defer_inline 4.67% : 0.000014s : 91: predicate.switch_simplify 0.93% : 0.000003s : 21: predicate.tile_eliminate 1.00% : 0.000003s : 21: predicate.transpose_eliminate 1.61% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.64% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000010s : 51: predicate.tuple_list_get_item_eliminator 1.66% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.34% : 0.000007s : 45: predicate.tuple_list_set_item_eliminator 1.78% : 0.000005s : 39: predicate.tuple_to_list_eliminator_ 2.58% : 0.000008s : 60: predicate.updatestate_pure_node_eliminater 3.18% : 0.000010s : 72: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 6: predicate.value_based_eliminate 0.66% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.68% : 0.000002s : 12: predicate.virtual_output_eliminate 0.32% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000569 11 54.19% : 0.000308s : 5: func_graph_cloner_run.FuncGraphClonerGraph 45.81% : 0.000261s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027595 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.42% : 0.003152s : 1: add_attr 11.39% : 0.003142s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000059s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.25% : 0.000070s : 1: auto_monad 0.09% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.80% : 0.000498s : 1: bootstrap 0.11% : 0.000030s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.04% : 0.000012s : 1: convert_after_rewriter 0.10% : 0.000029s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.10% : 0.000028s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.66% : 0.000459s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.01% : 0.000556s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000018s : 1: opt.transform.mutable_eliminate 6.14% : 0.001693s : 78: opt.transform.opt_a 0.15% : 0.000041s : 1: opt.transform.opt_after_cconv 0.12% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.65% : 0.000180s : 28: opt.transform.opt_b 0.24% : 0.000066s : 2: opt.transform.opt_trans_graph 0.17% : 0.000047s : 4: opt.transform.symbol_engine_opt 12.12% : 0.003344s : 1: opt_a 0.46% : 0.000126s : 1: opt_after_cconv 1.87% : 0.000517s : 1: opt_after_jit_grad 1.08% : 0.000299s : 1: opt_b 20.34% : 0.005613s : 1: optimize 0.08% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.02% : 0.000006s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.14% : 0.000039s : 1: pre_auto_parallel 0.12% : 0.000033s : 1: py_interpret_to_execute 0.05% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.46% : 0.000402s : 1: renormalize.infer 1.22% : 0.000337s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.17% : 0.000048s : 1: rewriter_after_opt_a 0.37% : 0.000102s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.33% : 0.000092s : 1: symbol_engine_optimizer 0.35% : 0.000096s : 1: tuple_transform 21.92% : 0.006049s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:19.241.004 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:19.241.268 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0182516, [21] [bootstrap]: 0.00044451 [type_inference]: 0.00614541 [event_method]: 2.225e-05 [auto_monad]: 6.95e-05 [graph_reusing]: 5.72001e-06 [inline]: 2.36e-06 [add_attr]: 0.00339167, [1] [add_attr_with_inline]: 0.00338191, [1] [Cycle 1]: 7.923e-05, [2] [tag_attr]: 2.309e-05 [meta_addattr_fg_expand]: 7.18e-06 [parallel-infer-symbol]: 3.32002e-06 [pre_auto_parallel]: 3.822e-05 [insert-virtual-dataset]: 2.34001e-06 [parallel-infer-symbol-second]: 6.90023e-07 [dataset_repeat_opt]: 1.76e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.00687917, [53] [py_interpret_to_execute]: 3.573e-05 [rewriter_before_opt_a]: 0.00010869 [opt_a]: 0.00404689, [2] [Cycle 1]: 0.00292003, [45] [expand_dump_flag]: 3.01999e-06 [switch_simplify]: 4.844e-05 [loop_unroll]: 3.668e-05 [a_1]: 0.00091962 [with_stream_mark]: 1.869e-05 [recompute_prepare]: 1.471e-05 [updatestate_depend_eliminate]: 6.09999e-06 [updatestate_assign_eliminate]: 4.82e-06 [updatestate_loads_eliminate]: 4.17e-06 [parameter_eliminate]: 2.09999e-06 [a_2]: 0.00015801 [accelerated_algorithm]: 1.035e-05 [shard]: 2.27001e-06 [meta_shard_fg_expand]: 2.32999e-06 [shard_inline]: 9.82001e-06 [merge_send_recv]: 1.135e-05 [auto_parallel]: 8.3e-06 [parallel]: 1.884e-05 [flash_sp]: 8.77e-06 [merge_comm]: 6.07999e-06 [allreduce_fusion]: 4.92e-06 [matmul_add_comm_reduction]: 1.18e-05 [allreduce_slice_to_reducescatter]: 7.30011e-07 [virtual_shard_identity]: 1.227e-05 [virtual_dataset]: 1.091e-05 [get_grad_eliminate_]: 9.97999e-06 [virtual_output]: 9.77001e-06 [merge_forward]: 5.48997e-06 [cell_reuse_recompute_pass]: 1.24998e-06 [offload_activation]: 1.27e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.142e-05 [merge_recompute_call_nodes]: 1.44998e-06 [before_grad]: 1.668e-05 [set_forward_comm_id_for_comm_node_pass]: 5.67999e-06 [meta_fg_expand]: 5.79999e-06 [flash_sp_send_recv_attached]: 2.47001e-06 [receive_attached]: 2.31e-06 [after_resolve]: 1.608e-05 [a_after_grad]: 1.571e-05 [renormalize]: 0.00087411 [add_forward_monad_depend]: 6.16998e-06 [auto_monad_grad]: 2.67001e-06 [auto_monad_eliminator]: 1.912e-05 [cse]: 6.898e-05 [a_3]: 9.021e-05 [Cycle 2]: 0.00111176, [45] [expand_dump_flag]: 1.20001e-06 [switch_simplify]: 1.13e-05 [loop_unroll]: 9.88998e-06 [a_1]: 0.00024568 [with_stream_mark]: 1.571e-05 [recompute_prepare]: 9.79999e-06 [updatestate_depend_eliminate]: 4.71002e-06 [updatestate_assign_eliminate]: 3.79002e-06 [updatestate_loads_eliminate]: 3.75e-06 [parameter_eliminate]: 1.12999e-06 [a_2]: 0.00014669 [accelerated_algorithm]: 9.59999e-06 [shard]: 1.65001e-06 [meta_shard_fg_expand]: 2.51998e-06 [shard_inline]: 9.37001e-06 [merge_send_recv]: 8e-06 [auto_parallel]: 8.26002e-06 [parallel]: 6.91001e-06 [flash_sp]: 3.89002e-06 [merge_comm]: 4.77e-06 [allreduce_fusion]: 5.02e-06 [matmul_add_comm_reduction]: 8.99e-06 [allreduce_slice_to_reducescatter]: 8.09989e-07 [virtual_shard_identity]: 1.009e-05 [virtual_dataset]: 9.77001e-06 [get_grad_eliminate_]: 1.005e-05 [virtual_output]: 9.29e-06 [merge_forward]: 4.99e-06 [cell_reuse_recompute_pass]: 1.70001e-06 [offload_activation]: 9.51e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.946e-05 [merge_recompute_call_nodes]: 1.24e-06 [before_grad]: 1.505e-05 [set_forward_comm_id_for_comm_node_pass]: 5.30999e-06 [meta_fg_expand]: 3.81001e-06 [flash_sp_send_recv_attached]: 1.10001e-06 [receive_attached]: 1.61998e-06 [after_resolve]: 1.471e-05 [a_after_grad]: 1.473e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.74e-06 [auto_monad_grad]: 1.30999e-06 [auto_monad_eliminator]: 1.05e-05 [cse]: 2.772e-05 [a_3]: 7.495e-05 [py_interpret_to_execute_after_opt_a]: 1.704e-05 [slice_cell_reuse_recomputed_activation]: 4.67e-06 [rewriter_after_opt_a]: 5.517e-05 [convert_after_rewriter]: 1.244e-05 [order_py_execute_after_rewriter]: 9.52001e-06 [mutable_eliminate]: 0.00067775 [opt_b]: 0.00038995, [1] [Cycle 1]: 0.00038003, [7] [b_1]: 0.00026007 [b_2]: 1.169e-05 [updatestate_depend_eliminate]: 7.56999e-06 [updatestate_assign_eliminate]: 3.82998e-06 [updatestate_loads_eliminate]: 4.77998e-06 [renormalize]: 6.69999e-07 [cse]: 3.34e-05 [optimize_parallel_all_gather_comm]: 2.273e-05 [overlap_param_gather]: 4.82e-06 [cconv]: 3.19e-05 [loop_unroll]: 0.00049578 [opt_after_cconv]: 0.00016509, [1] [Cycle 1]: 0.00015553, [7] [c_1]: 5.056e-05 [parameter_eliminate]: 3.08e-06 [updatestate_depend_eliminate]: 7.16999e-06 [updatestate_assign_eliminate]: 3.89002e-06 [updatestate_loads_eliminate]: 3.68e-06 [cse]: 3.035e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 4.582e-05 [tuple_transform]: 0.00012072, [1] [Cycle 1]: 0.0001132, [4] [d_1]: 6.893e-05 [none_parameter_eliminate]: 2.30002e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 1.064e-05 [partial_unused_args_eliminate]: 4.57e-06 [add_recomputation]: 6.372e-05 [cse_after_recomputation]: 3.617e-05, [1] [Cycle 1]: 2.911e-05, [1] [cse]: 1.916e-05 [environ_conv]: 9.69e-06 [swap_dp_allreduce_reducescatter]: 9.62999e-06 [bias_add_comm_swap]: 4.89e-06 [label_micro_interleaved_index]: 7.22002e-06 [label_fine_grained_interleaved_index]: 5.27001e-06 [merge_cast_opt]: 4.07998e-06 [slice_recompute_activation]: 4.94003e-06 [micro_interleaved_order_control]: 4.95999e-06 [assign_add_opt]: 3.86999e-06 [ForceFp32Comm]: 3.27002e-06 [remove_cast_before_assign_add]: 3.43999e-06 [full_micro_interleaved_order_control]: 4.58999e-06 [reorder_send_recv_between_fp_bp]: 5.05001e-06 [comm_op_add_attrs]: 3.57002e-06 [add_comm_op_reuse_tag]: 3.2e-06 [interleave_split_concat_branches]: 3.36999e-06 [interleave_parallel_branches]: 3.46001e-06 [overlap_opt_shard_in_pipeline]: 3.43e-06 [overlap_opt_shard_grad_in_pipeline]: 4.4e-06 [control_data_broadcast_order]: 1.969e-05 [grouped_pairwise_exchange_alltoall]: 4.22e-06 [offloading_packed_experts]: 7.46999e-06 [overlap_recompute_and_grad_model_parallel]: 8.3e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.63999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.66999e-06 [overlap_recompute_comm]: 5.00999e-06 [overlap_grad_ring_attention]: 7.36999e-06 [overlap_grad_flash_sp]: 2.77e-05 [begin_end_overlap_inline]: 2.89999e-06 [split_matmul_comm_elemetwise]: 4.86002e-06 [split_layernorm_comm]: 4.22998e-06 [handle_group_info]: 4.03001e-06 [symbol_engine_optimizer]: 0.0001206, [1] [Cycle 1]: 0.00011319, [6] [build]: 3.71999e-06 [elim_shapecalc]: 1.595e-05 [elim_not_effective]: 1.879e-05 [opt_reshape]: 1.015e-05 [fold_const_symbol]: 1.542e-05 [renormalize]: 1.80007e-07 [detach_backward]: 3.71001e-06 [pipeline_parallel_scheduler]: 1.87001e-06 [auto_monad_reorder]: 2.415e-05 [get_jit_bprop_graph]: 1.79e-06 [rewriter_after_jit_bprop_graph]: 5.22e-06 [opt_after_jit_grad]: 0.00055411 [validate]: 4.545e-05 Sums bootstrap : 0.000445s : 3.40% type_inference : 0.006145s : 47.01% event_method : 0.000022s : 0.17% auto_monad : 0.000070s : 0.53% graph_reusing : 0.000006s : 0.04% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000038s : 0.29% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000036s : 0.27% optimize.rewriter_before_opt_a : 0.000109s : 0.83% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000060s : 0.46% optimize.opt_a.loop_unroll : 0.000047s : 0.36% optimize.opt_a.a_1 : 0.001165s : 8.91% optimize.opt_a.with_stream_mark : 0.000034s : 0.26% optimize.opt_a.recompute_prepare : 0.000025s : 0.19% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000305s : 2.33% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.15% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000019s : 0.15% optimize.opt_a.merge_send_recv : 0.000019s : 0.15% optimize.opt_a.auto_parallel : 0.000017s : 0.13% optimize.opt_a.parallel : 0.000026s : 0.20% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000011s : 0.08% optimize.opt_a.allreduce_fusion : 0.000010s : 0.08% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000022s : 0.17% optimize.opt_a.virtual_dataset : 0.000021s : 0.16% optimize.opt_a.get_grad_eliminate_ : 0.000020s : 0.15% optimize.opt_a.virtual_output : 0.000019s : 0.15% optimize.opt_a.merge_forward : 0.000010s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000022s : 0.17% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000041s : 0.31% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000032s : 0.24% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.08% optimize.opt_a.meta_fg_expand : 0.000010s : 0.07% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000031s : 0.24% optimize.opt_a.a_after_grad : 0.000030s : 0.23% optimize.opt_a.renormalize : 0.000874s : 6.69% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.23% optimize.opt_a.cse : 0.000097s : 0.74% optimize.opt_a.a_3 : 0.000165s : 1.26% optimize.py_interpret_to_execute_after_opt_a : 0.000017s : 0.13% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000055s : 0.42% optimize.convert_after_rewriter : 0.000012s : 0.10% optimize.order_py_execute_after_rewriter : 0.000010s : 0.07% optimize.mutable_eliminate : 0.000678s : 5.18% optimize.opt_b.b_1 : 0.000260s : 1.99% optimize.opt_b.b_2 : 0.000012s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000033s : 0.26% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000032s : 0.24% optimize.loop_unroll : 0.000496s : 3.79% optimize.opt_after_cconv.c_1 : 0.000051s : 0.39% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000030s : 0.23% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000046s : 0.35% optimize.tuple_transform.d_1 : 0.000069s : 0.53% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.08% optimize.partial_unused_args_eliminate : 0.000005s : 0.03% optimize.add_recomputation : 0.000064s : 0.49% optimize.cse_after_recomputation.cse : 0.000019s : 0.15% optimize.environ_conv : 0.000010s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.02% optimize.interleave_split_concat_branches : 0.000003s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000003s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000020s : 0.15% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000028s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000004s : 0.03% optimize.symbol_engine_optimizer.build : 0.000004s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.14% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.12% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.18% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000554s : 4.24% validate : 0.000045s : 0.35% Time group info: ------[substitution.] 0.000255 54 12.16% : 0.000031s : 6: substitution.cast_eliminate 0.99% : 0.000003s : 4: substitution.elim_not_effective 0.77% : 0.000002s : 4: substitution.fold_const_symbol 3.26% : 0.000008s : 7: substitution.graph_param_transform 65.32% : 0.000166s : 4: substitution.inline 2.28% : 0.000006s : 8: substitution.j_node_and_user_rematch 2.90% : 0.000007s : 8: substitution.remove_not_recompute_node 2.45% : 0.000006s : 6: substitution.replace_old_param 7.59% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.28% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006090 2 86.67% : 0.005279s : 1: type_inference.infer 13.33% : 0.000812s : 1: type_inference.specialize ------[replace.] 0.000075 10 51.33% : 0.000038s : 4: replace.inline 48.67% : 0.000036s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000180 10 90.82% : 0.000164s : 4: match.inline 9.18% : 0.000017s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000331 2134 0.92% : 0.000003s : 22: predicate.accumulaten_eliminater 0.66% : 0.000002s : 7: predicate.ad_related_special_op_eliminate 0.56% : 0.000002s : 14: predicate.addn_check_dump 0.86% : 0.000003s : 22: predicate.addn_zero_filter 0.87% : 0.000003s : 22: predicate.adjust_all_reduce_mul_add 2.08% : 0.000007s : 36: predicate.arithmetic_simplify 1.04% : 0.000003s : 22: predicate.cast_eliminate 0.72% : 0.000002s : 14: predicate.check_bprop_eliminate 0.59% : 0.000002s : 14: predicate.compare_switch_simplify 0.20% : 0.000001s : 7: predicate.const_output_eliminate 0.66% : 0.000002s : 14: predicate.depend_value_elim 1.02% : 0.000003s : 22: predicate.dict_get_item_const_eliminator 1.12% : 0.000004s : 22: predicate.dict_get_item_eliminator 0.90% : 0.000003s : 22: predicate.dict_set_item_eliminator 0.92% : 0.000003s : 14: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 7: predicate.elim_not_effective 0.49% : 0.000002s : 7: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000004s : 29: predicate.environ_add_const_eliminate 1.18% : 0.000004s : 29: predicate.environ_get_add_eliminate 1.19% : 0.000004s : 29: predicate.environ_get_depend_swap 1.82% : 0.000006s : 43: predicate.environ_get_eliminate 1.14% : 0.000004s : 29: predicate.environ_get_set_eliminate 1.36% : 0.000004s : 32: predicate.exchange_switch_depend_value 2.13% : 0.000007s : 32: predicate.float_depend_g_call 0.64% : 0.000002s : 14: predicate.float_environ_get_switch 0.84% : 0.000003s : 21: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 7: predicate.fold_const_symbol 0.73% : 0.000002s : 14: predicate.get_grad_eliminate 0.20% : 0.000001s : 7: predicate.graph_param_transform 0.64% : 0.000002s : 14: predicate.incorporate_call 0.55% : 0.000002s : 14: predicate.incorporate_call_switch 6.26% : 0.000021s : 96: predicate.inline 0.80% : 0.000003s : 14: predicate.inline_without_move 0.35% : 0.000001s : 14: predicate.j_node_and_user_rematch 0.82% : 0.000003s : 14: predicate.less_batch_normalization 2.00% : 0.000007s : 42: predicate.list_to_tuple_eliminator_ 2.59% : 0.000009s : 64: predicate.load_eliminater 0.70% : 0.000002s : 7: predicate.loop_unroll_after_grad 1.87% : 0.000006s : 44: predicate.loop_unroll_before_grad 1.77% : 0.000006s : 36: predicate.make_slice_get_slice_eliminator 0.63% : 0.000002s : 14: predicate.merge_addn 0.63% : 0.000002s : 14: predicate.micro_step_allgather_replace 0.65% : 0.000002s : 14: predicate.mini_step_allgather_replace 0.88% : 0.000003s : 22: predicate.minmaximum_grad 0.99% : 0.000003s : 7: predicate.mutable_eliminate 0.37% : 0.000001s : 7: predicate.opt_reshape 0.40% : 0.000001s : 7: predicate.parallel_virtual_node 1.65% : 0.000005s : 32: predicate.partial_defer_inline 1.73% : 0.000006s : 35: predicate.partial_eliminate 0.95% : 0.000003s : 22: predicate.print_const_string_wrapper 0.60% : 0.000002s : 14: predicate.reduce_all_const_elim 1.15% : 0.000004s : 22: predicate.reduce_eliminate 2.58% : 0.000009s : 64: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 14: predicate.remove_not_recompute_node 1.36% : 0.000005s : 42: predicate.replace_applicator 0.40% : 0.000001s : 14: predicate.replace_old_param 0.25% : 0.000001s : 7: predicate.reset_defer_inline 0.94% : 0.000003s : 22: predicate.reshape_eliminate 0.63% : 0.000002s : 14: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 7: predicate.row_tensor_eliminate 0.72% : 0.000002s : 14: predicate.same_eliminate 0.44% : 0.000001s : 14: predicate.set_cell_output_no_recompute 0.70% : 0.000002s : 14: predicate.shard_identity_eliminate 0.69% : 0.000002s : 14: predicate.special_op_eliminate 0.79% : 0.000003s : 14: predicate.specialize_transform 0.81% : 0.000003s : 14: predicate.split_environ_get_set_with_tuple_value 0.86% : 0.000003s : 14: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 7: predicate.switch_call_monad_eliminater 1.41% : 0.000005s : 32: predicate.switch_defer_inline 2.00% : 0.000007s : 46: predicate.switch_layer_defer_inline 4.58% : 0.000015s : 97: predicate.switch_simplify 0.96% : 0.000003s : 22: predicate.tile_eliminate 0.94% : 0.000003s : 22: predicate.transpose_eliminate 1.71% : 0.000006s : 36: predicate.tuple_list_convert_item_index_to_positive 1.77% : 0.000006s : 36: predicate.tuple_list_get_item_const_eliminator 1.58% : 0.000005s : 36: predicate.tuple_list_get_item_depend_reorder 3.20% : 0.000011s : 56: predicate.tuple_list_get_item_eliminator 1.65% : 0.000005s : 36: predicate.tuple_list_get_set_item_eliminator 2.43% : 0.000008s : 50: predicate.tuple_list_set_item_eliminator 1.81% : 0.000006s : 42: predicate.tuple_to_list_eliminator_ 2.49% : 0.000008s : 64: predicate.updatestate_pure_node_eliminater 3.29% : 0.000011s : 78: predicate.updatestate_useless_node_eliminater 0.46% : 0.000002s : 7: predicate.value_based_eliminate 0.71% : 0.000002s : 14: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 14: predicate.virtual_output_eliminate 0.31% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 7: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000589 11 51.51% : 0.000304s : 5: func_graph_cloner_run.FuncGraphClonerGraph 48.49% : 0.000286s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.031551 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.78% : 0.003402s : 1: add_attr 10.73% : 0.003386s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.21% : 0.000067s : 1: add_recomputation 0.02% : 0.000007s : 1: assign_add_opt 0.25% : 0.000079s : 1: auto_monad 0.10% : 0.000031s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000008s : 1: bias_add_comm_swap 1.54% : 0.000486s : 1: bootstrap 0.11% : 0.000035s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000022s : 1: control_data_broadcast_order 0.05% : 0.000016s : 1: convert_after_rewriter 0.12% : 0.000039s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000019s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.10% : 0.000032s : 1: event_method 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000007s : 1: handle_group_info 0.02% : 0.000008s : 1: inline 0.03% : 0.000008s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000007s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.59% : 0.000502s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.02% : 0.000008s : 1: micro_interleaved_order_control 2.17% : 0.000685s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000020s : 1: opt.transform.mutable_eliminate 5.84% : 0.001841s : 78: opt.transform.opt_a 0.16% : 0.000049s : 1: opt.transform.opt_after_cconv 0.12% : 0.000037s : 1: opt.transform.opt_after_jit_grad 0.62% : 0.000196s : 28: opt.transform.opt_b 0.25% : 0.000077s : 2: opt.transform.opt_trans_graph 0.18% : 0.000056s : 4: opt.transform.symbol_engine_opt 12.84% : 0.004050s : 1: opt_a 0.54% : 0.000169s : 1: opt_after_cconv 1.79% : 0.000565s : 1: opt_after_jit_grad 1.25% : 0.000394s : 1: opt_b 22.94% : 0.007239s : 1: optimize 0.08% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000013s : 1: order_py_execute_after_rewriter 0.10% : 0.000031s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000008s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000045s : 1: pre_auto_parallel 0.13% : 0.000040s : 1: py_interpret_to_execute 0.06% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.16% : 0.000050s : 1: remove_dup_value 1.55% : 0.000489s : 1: renormalize.infer 1.19% : 0.000376s : 1: renormalize.specialize 0.02% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.000059s : 1: rewriter_after_opt_a 0.36% : 0.000112s : 1: rewriter_before_opt_a 0.02% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.39% : 0.000124s : 1: symbol_engine_optimizer 0.39% : 0.000124s : 1: tuple_transform 19.61% : 0.006187s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:19.445.382 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0194447, [21] [bootstrap]: 0.00049998 [type_inference]: 0.00699104 [event_method]: 2.453e-05 [auto_monad]: 7.212e-05 [graph_reusing]: 5.94999e-06 [inline]: 2.73998e-06 [add_attr]: 0.00384933, [1] [add_attr_with_inline]: 0.0038373, [1] [Cycle 1]: 8.719e-05, [2] [tag_attr]: 2.898e-05 [meta_addattr_fg_expand]: 6.98e-06 [parallel-infer-symbol]: 4.1e-06 [pre_auto_parallel]: 4.53e-05 [insert-virtual-dataset]: 2.95998e-06 [parallel-infer-symbol-second]: 9.09989e-07 [dataset_repeat_opt]: 2.46e-06 [pipeline_split]: 2.01e-06 [optimize]: 0.00707576, [53] [py_interpret_to_execute]: 3.654e-05 [rewriter_before_opt_a]: 0.00011165 [opt_a]: 0.00412875, [2] [Cycle 1]: 0.00307492, [45] [expand_dump_flag]: 2.96001e-06 [switch_simplify]: 5.121e-05 [loop_unroll]: 3.741e-05 [a_1]: 0.00100157 [with_stream_mark]: 2.138e-05 [recompute_prepare]: 1.704e-05 [updatestate_depend_eliminate]: 6.01e-06 [updatestate_assign_eliminate]: 4.91002e-06 [updatestate_loads_eliminate]: 4.60001e-06 [parameter_eliminate]: 2.51e-06 [a_2]: 0.0001327 [accelerated_algorithm]: 1.289e-05 [shard]: 2.29001e-06 [meta_shard_fg_expand]: 2.98998e-06 [shard_inline]: 1.119e-05 [merge_send_recv]: 1.153e-05 [auto_parallel]: 1.003e-05 [parallel]: 2.215e-05 [flash_sp]: 1.123e-05 [merge_comm]: 6.06e-06 [allreduce_fusion]: 5.53997e-06 [matmul_add_comm_reduction]: 1.332e-05 [allreduce_slice_to_reducescatter]: 9.39996e-07 [virtual_shard_identity]: 1.335e-05 [virtual_dataset]: 1.023e-05 [get_grad_eliminate_]: 1.021e-05 [virtual_output]: 1.046e-05 [merge_forward]: 6.34999e-06 [cell_reuse_recompute_pass]: 1.30001e-06 [offload_activation]: 1.49e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.152e-05 [merge_recompute_call_nodes]: 1.79e-06 [before_grad]: 1.753e-05 [set_forward_comm_id_for_comm_node_pass]: 6.77002e-06 [meta_fg_expand]: 4.32e-06 [flash_sp_send_recv_attached]: 3.35e-06 [receive_attached]: 2.44001e-06 [after_resolve]: 1.968e-05 [a_after_grad]: 1.63e-05 [renormalize]: 0.00105198 [add_forward_monad_depend]: 7.96001e-06 [auto_monad_grad]: 3.31999e-06 [auto_monad_eliminator]: 2.253e-05 [cse]: 5.69e-05 [a_3]: 8.583e-05 [Cycle 2]: 0.00104147, [45] [expand_dump_flag]: 2.17999e-06 [switch_simplify]: 1.239e-05 [loop_unroll]: 1.014e-05 [a_1]: 0.00025622 [with_stream_mark]: 2.284e-05 [recompute_prepare]: 1.235e-05 [updatestate_depend_eliminate]: 5.20999e-06 [updatestate_assign_eliminate]: 4.02998e-06 [updatestate_loads_eliminate]: 4.93001e-06 [parameter_eliminate]: 2.01e-06 [a_2]: 0.00012502 [accelerated_algorithm]: 1.106e-05 [shard]: 2.09e-06 [meta_shard_fg_expand]: 2.51e-06 [shard_inline]: 9.79e-06 [merge_send_recv]: 1.101e-05 [auto_parallel]: 9.15001e-06 [parallel]: 7.4e-06 [flash_sp]: 3.8e-06 [merge_comm]: 5.54e-06 [allreduce_fusion]: 4.99e-06 [matmul_add_comm_reduction]: 1.045e-05 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 1.301e-05 [virtual_dataset]: 9.44998e-06 [get_grad_eliminate_]: 8.76002e-06 [virtual_output]: 1.041e-05 [merge_forward]: 5.98002e-06 [cell_reuse_recompute_pass]: 2.99999e-06 [offload_activation]: 1.243e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.21e-05 [merge_recompute_call_nodes]: 1.17e-06 [before_grad]: 1.58e-05 [set_forward_comm_id_for_comm_node_pass]: 6.53e-06 [meta_fg_expand]: 3.83001e-06 [flash_sp_send_recv_attached]: 1.24998e-06 [receive_attached]: 1.69e-06 [after_resolve]: 1.771e-05 [a_after_grad]: 1.507e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 2.76999e-06 [auto_monad_grad]: 2.10002e-06 [auto_monad_eliminator]: 1.412e-05 [cse]: 3.393e-05 [a_3]: 6.334e-05 [py_interpret_to_execute_after_opt_a]: 1.8e-05 [slice_cell_reuse_recomputed_activation]: 2.34001e-06 [rewriter_after_opt_a]: 5.684e-05 [convert_after_rewriter]: 1.076e-05 [order_py_execute_after_rewriter]: 7.08e-06 [mutable_eliminate]: 0.00082331 [opt_b]: 0.00034647, [1] [Cycle 1]: 0.00033818, [7] [b_1]: 0.00021756 [b_2]: 1.313e-05 [updatestate_depend_eliminate]: 1.015e-05 [updatestate_assign_eliminate]: 4.42e-06 [updatestate_loads_eliminate]: 5.09e-06 [renormalize]: 8.39995e-07 [cse]: 4.512e-05 [optimize_parallel_all_gather_comm]: 2.256e-05 [overlap_param_gather]: 2.29001e-06 [cconv]: 3.666e-05 [loop_unroll]: 0.00055563 [opt_after_cconv]: 0.00015465, [1] [Cycle 1]: 0.00014785, [7] [c_1]: 4.997e-05 [parameter_eliminate]: 5.20999e-06 [updatestate_depend_eliminate]: 9.74e-06 [updatestate_assign_eliminate]: 4.25e-06 [updatestate_loads_eliminate]: 3.98999e-06 [cse]: 3.69e-05 [renormalize]: 8.39995e-07 [remove_dup_value]: 5.303e-05 [tuple_transform]: 0.00011382, [1] [Cycle 1]: 0.00010851, [4] [d_1]: 7.262e-05 [none_parameter_eliminate]: 2.26e-06 [renormalize]: 3.19997e-07 [switch_simplify]: 1.173e-05 [partial_unused_args_eliminate]: 2.49001e-06 [add_recomputation]: 7.677e-05 [cse_after_recomputation]: 3.323e-05, [1] [Cycle 1]: 2.799e-05, [1] [cse]: 2.097e-05 [environ_conv]: 6.84001e-06 [swap_dp_allreduce_reducescatter]: 7.48999e-06 [bias_add_comm_swap]: 2.81999e-06 [label_micro_interleaved_index]: 6.04001e-06 [label_fine_grained_interleaved_index]: 2.67001e-06 [merge_cast_opt]: 1.45001e-06 [slice_recompute_activation]: 2.36e-06 [micro_interleaved_order_control]: 2.46e-06 [assign_add_opt]: 1.42e-06 [ForceFp32Comm]: 1.17999e-06 [remove_cast_before_assign_add]: 1.07e-06 [full_micro_interleaved_order_control]: 2.32999e-06 [reorder_send_recv_between_fp_bp]: 2.78998e-06 [comm_op_add_attrs]: 1.17e-06 [add_comm_op_reuse_tag]: 1.06002e-06 [interleave_split_concat_branches]: 1.39e-06 [interleave_parallel_branches]: 1.11002e-06 [overlap_opt_shard_in_pipeline]: 1.67999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.91e-06 [control_data_broadcast_order]: 2.015e-05 [grouped_pairwise_exchange_alltoall]: 1.72999e-06 [offloading_packed_experts]: 5.13002e-06 [overlap_recompute_and_grad_model_parallel]: 5.82999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.50999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.24001e-06 [overlap_grad_ring_attention]: 5.66e-06 [overlap_grad_flash_sp]: 2.774e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.41e-06 [split_layernorm_comm]: 1.92999e-06 [handle_group_info]: 9.29984e-07 [symbol_engine_optimizer]: 0.00010634, [1] [Cycle 1]: 0.00010019, [6] [build]: 4.50999e-06 [elim_shapecalc]: 1.755e-05 [elim_not_effective]: 1.922e-05 [opt_reshape]: 1.111e-05 [fold_const_symbol]: 1.606e-05 [renormalize]: 2.29978e-07 [detach_backward]: 2.51e-06 [pipeline_parallel_scheduler]: 1.91e-06 [auto_monad_reorder]: 2.408e-05 [get_jit_bprop_graph]: 2.04e-06 [rewriter_after_jit_bprop_graph]: 5.96e-06 [opt_after_jit_grad]: 0.00060007 [validate]: 5.35e-05 Sums bootstrap : 0.000500s : 3.45% type_inference : 0.006991s : 48.25% event_method : 0.000025s : 0.17% auto_monad : 0.000072s : 0.50% graph_reusing : 0.000006s : 0.04% inline : 0.000003s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000029s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000045s : 0.31% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000037s : 0.25% optimize.rewriter_before_opt_a : 0.000112s : 0.77% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000064s : 0.44% optimize.opt_a.loop_unroll : 0.000048s : 0.33% optimize.opt_a.a_1 : 0.001258s : 8.68% optimize.opt_a.with_stream_mark : 0.000044s : 0.31% optimize.opt_a.recompute_prepare : 0.000029s : 0.20% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000010s : 0.07% optimize.opt_a.parameter_eliminate : 0.000005s : 0.03% optimize.opt_a.a_2 : 0.000258s : 1.78% optimize.opt_a.accelerated_algorithm : 0.000024s : 0.17% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.04% optimize.opt_a.shard_inline : 0.000021s : 0.14% optimize.opt_a.merge_send_recv : 0.000023s : 0.16% optimize.opt_a.auto_parallel : 0.000019s : 0.13% optimize.opt_a.parallel : 0.000030s : 0.20% optimize.opt_a.flash_sp : 0.000015s : 0.10% optimize.opt_a.merge_comm : 0.000012s : 0.08% optimize.opt_a.allreduce_fusion : 0.000011s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000026s : 0.18% optimize.opt_a.virtual_dataset : 0.000020s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.13% optimize.opt_a.virtual_output : 0.000021s : 0.14% optimize.opt_a.merge_forward : 0.000012s : 0.09% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.03% optimize.opt_a.offload_activation : 0.000027s : 0.19% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000044s : 0.30% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000033s : 0.23% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000013s : 0.09% optimize.opt_a.meta_fg_expand : 0.000008s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000037s : 0.26% optimize.opt_a.a_after_grad : 0.000031s : 0.22% optimize.opt_a.renormalize : 0.001052s : 7.26% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.07% optimize.opt_a.auto_monad_grad : 0.000005s : 0.04% optimize.opt_a.auto_monad_eliminator : 0.000037s : 0.25% optimize.opt_a.cse : 0.000091s : 0.63% optimize.opt_a.a_3 : 0.000149s : 1.03% optimize.py_interpret_to_execute_after_opt_a : 0.000018s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000057s : 0.39% optimize.convert_after_rewriter : 0.000011s : 0.07% optimize.order_py_execute_after_rewriter : 0.000007s : 0.05% optimize.mutable_eliminate : 0.000823s : 5.68% optimize.opt_b.b_1 : 0.000218s : 1.50% optimize.opt_b.b_2 : 0.000013s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000005s : 0.04% optimize.opt_b.renormalize : 0.000001s : 0.01% optimize.opt_b.cse : 0.000045s : 0.31% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000037s : 0.25% optimize.loop_unroll : 0.000556s : 3.84% optimize.opt_after_cconv.c_1 : 0.000050s : 0.34% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000010s : 0.07% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_after_cconv.cse : 0.000037s : 0.25% optimize.opt_after_cconv.renormalize : 0.000001s : 0.01% optimize.remove_dup_value : 0.000053s : 0.37% optimize.tuple_transform.d_1 : 0.000073s : 0.50% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000012s : 0.08% optimize.partial_unused_args_eliminate : 0.000002s : 0.02% optimize.add_recomputation : 0.000077s : 0.53% optimize.cse_after_recomputation.cse : 0.000021s : 0.14% optimize.environ_conv : 0.000007s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.02% optimize.label_micro_interleaved_index : 0.000006s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000020s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000006s : 0.04% optimize.overlap_grad_flash_sp : 0.000028s : 0.19% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000005s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000018s : 0.12% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000024s : 0.17% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.04% opt_after_jit_grad : 0.000600s : 4.14% validate : 0.000053s : 0.37% Time group info: ------[substitution.] 0.000297 54 12.75% : 0.000038s : 6: substitution.cast_eliminate 0.89% : 0.000003s : 4: substitution.elim_not_effective 0.70% : 0.000002s : 4: substitution.fold_const_symbol 2.89% : 0.000009s : 7: substitution.graph_param_transform 66.32% : 0.000197s : 4: substitution.inline 2.07% : 0.000006s : 8: substitution.j_node_and_user_rematch 2.88% : 0.000009s : 8: substitution.remove_not_recompute_node 2.60% : 0.000008s : 6: substitution.replace_old_param 7.02% : 0.000021s : 6: substitution.tuple_list_get_item_eliminator 1.89% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006908 2 86.87% : 0.006001s : 1: type_inference.infer 13.13% : 0.000907s : 1: type_inference.specialize ------[replace.] 0.000123 10 60.76% : 0.000075s : 4: replace.inline 39.24% : 0.000048s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000211 10 91.71% : 0.000194s : 4: match.inline 8.29% : 0.000018s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000343 2134 0.93% : 0.000003s : 22: predicate.accumulaten_eliminater 0.90% : 0.000003s : 7: predicate.ad_related_special_op_eliminate 0.57% : 0.000002s : 14: predicate.addn_check_dump 0.93% : 0.000003s : 22: predicate.addn_zero_filter 0.89% : 0.000003s : 22: predicate.adjust_all_reduce_mul_add 1.99% : 0.000007s : 36: predicate.arithmetic_simplify 0.97% : 0.000003s : 22: predicate.cast_eliminate 0.62% : 0.000002s : 14: predicate.check_bprop_eliminate 0.55% : 0.000002s : 14: predicate.compare_switch_simplify 0.18% : 0.000001s : 7: predicate.const_output_eliminate 0.61% : 0.000002s : 14: predicate.depend_value_elim 0.92% : 0.000003s : 22: predicate.dict_get_item_const_eliminator 1.06% : 0.000004s : 22: predicate.dict_get_item_eliminator 0.90% : 0.000003s : 22: predicate.dict_set_item_eliminator 0.91% : 0.000003s : 14: predicate.dumpgradient_eliminate 0.32% : 0.000001s : 7: predicate.elim_not_effective 0.48% : 0.000002s : 7: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000004s : 29: predicate.environ_add_const_eliminate 1.19% : 0.000004s : 29: predicate.environ_get_add_eliminate 1.11% : 0.000004s : 29: predicate.environ_get_depend_swap 1.76% : 0.000006s : 43: predicate.environ_get_eliminate 1.20% : 0.000004s : 29: predicate.environ_get_set_eliminate 1.38% : 0.000005s : 32: predicate.exchange_switch_depend_value 2.11% : 0.000007s : 32: predicate.float_depend_g_call 0.58% : 0.000002s : 14: predicate.float_environ_get_switch 0.90% : 0.000003s : 21: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 7: predicate.fold_const_symbol 0.69% : 0.000002s : 14: predicate.get_grad_eliminate 0.32% : 0.000001s : 7: predicate.graph_param_transform 0.62% : 0.000002s : 14: predicate.incorporate_call 0.55% : 0.000002s : 14: predicate.incorporate_call_switch 6.20% : 0.000021s : 96: predicate.inline 0.97% : 0.000003s : 14: predicate.inline_without_move 0.33% : 0.000001s : 14: predicate.j_node_and_user_rematch 0.96% : 0.000003s : 14: predicate.less_batch_normalization 1.98% : 0.000007s : 42: predicate.list_to_tuple_eliminator_ 2.63% : 0.000009s : 64: predicate.load_eliminater 0.84% : 0.000003s : 7: predicate.loop_unroll_after_grad 1.84% : 0.000006s : 44: predicate.loop_unroll_before_grad 1.62% : 0.000006s : 36: predicate.make_slice_get_slice_eliminator 0.70% : 0.000002s : 14: predicate.merge_addn 0.61% : 0.000002s : 14: predicate.micro_step_allgather_replace 0.67% : 0.000002s : 14: predicate.mini_step_allgather_replace 0.86% : 0.000003s : 22: predicate.minmaximum_grad 1.08% : 0.000004s : 7: predicate.mutable_eliminate 0.41% : 0.000001s : 7: predicate.opt_reshape 0.35% : 0.000001s : 7: predicate.parallel_virtual_node 1.62% : 0.000006s : 32: predicate.partial_defer_inline 1.67% : 0.000006s : 35: predicate.partial_eliminate 0.95% : 0.000003s : 22: predicate.print_const_string_wrapper 0.62% : 0.000002s : 14: predicate.reduce_all_const_elim 1.26% : 0.000004s : 22: predicate.reduce_eliminate 2.50% : 0.000009s : 64: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 14: predicate.remove_not_recompute_node 1.43% : 0.000005s : 42: predicate.replace_applicator 0.51% : 0.000002s : 14: predicate.replace_old_param 0.41% : 0.000001s : 7: predicate.reset_defer_inline 0.99% : 0.000003s : 22: predicate.reshape_eliminate 0.62% : 0.000002s : 14: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 7: predicate.row_tensor_eliminate 0.84% : 0.000003s : 14: predicate.same_eliminate 0.61% : 0.000002s : 14: predicate.set_cell_output_no_recompute 0.86% : 0.000003s : 14: predicate.shard_identity_eliminate 0.68% : 0.000002s : 14: predicate.special_op_eliminate 0.73% : 0.000002s : 14: predicate.specialize_transform 0.81% : 0.000003s : 14: predicate.split_environ_get_set_with_tuple_value 0.86% : 0.000003s : 14: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 7: predicate.switch_call_monad_eliminater 1.43% : 0.000005s : 32: predicate.switch_defer_inline 2.04% : 0.000007s : 46: predicate.switch_layer_defer_inline 4.52% : 0.000016s : 97: predicate.switch_simplify 0.88% : 0.000003s : 22: predicate.tile_eliminate 0.95% : 0.000003s : 22: predicate.transpose_eliminate 1.48% : 0.000005s : 36: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000006s : 36: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000005s : 36: predicate.tuple_list_get_item_depend_reorder 3.19% : 0.000011s : 56: predicate.tuple_list_get_item_eliminator 1.61% : 0.000006s : 36: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000008s : 50: predicate.tuple_list_set_item_eliminator 1.85% : 0.000006s : 42: predicate.tuple_to_list_eliminator_ 2.51% : 0.000009s : 64: predicate.updatestate_pure_node_eliminater 3.11% : 0.000011s : 78: predicate.updatestate_useless_node_eliminater 0.54% : 0.000002s : 7: predicate.value_based_eliminate 0.64% : 0.000002s : 14: predicate.virtual_dataset_eliminate 0.65% : 0.000002s : 14: predicate.virtual_output_eliminate 0.29% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 7: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000696 11 52.59% : 0.000366s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.41% : 0.000330s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.033709 192 0.01% : 0.000004s : 1: ForceFp32Comm 11.44% : 0.003856s : 1: add_attr 11.40% : 0.003842s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.24% : 0.000082s : 1: add_recomputation 0.01% : 0.000005s : 1: assign_add_opt 0.23% : 0.000078s : 1: auto_monad 0.08% : 0.000028s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.58% : 0.000532s : 1: bootstrap 0.12% : 0.000041s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.07% : 0.000024s : 1: control_data_broadcast_order 0.04% : 0.000015s : 1: convert_after_rewriter 0.11% : 0.000037s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.03% : 0.000010s : 1: environ_conv 0.10% : 0.000033s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000005s : 1: handle_group_info 0.02% : 0.000006s : 1: inline 0.02% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000005s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000009s : 1: label_micro_interleaved_index 1.68% : 0.000567s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000006s : 1: micro_interleaved_order_control 2.48% : 0.000836s : 1: mutable_eliminate 0.03% : 0.000008s : 1: offloading_packed_experts 0.07% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.08% : 0.000028s : 1: opt.transform.mutable_eliminate 5.84% : 0.001969s : 78: opt.transform.opt_a 0.14% : 0.000049s : 1: opt.transform.opt_after_cconv 0.13% : 0.000043s : 1: opt.transform.opt_after_jit_grad 0.58% : 0.000195s : 28: opt.transform.opt_b 0.24% : 0.000081s : 2: opt.transform.opt_trans_graph 0.17% : 0.000059s : 4: opt.transform.symbol_engine_opt 12.26% : 0.004133s : 1: opt_a 0.47% : 0.000158s : 1: opt_after_cconv 1.82% : 0.000612s : 1: opt_after_jit_grad 1.04% : 0.000350s : 1: opt_b 21.01% : 0.007082s : 1: optimize 0.08% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000010s : 1: order_py_execute_after_rewriter 0.10% : 0.000032s : 1: overlap_grad_flash_sp 0.01% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.02% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000006s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.15% : 0.000050s : 1: pre_auto_parallel 0.12% : 0.000041s : 1: py_interpret_to_execute 0.06% : 0.000022s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.17% : 0.000058s : 1: remove_dup_value 1.77% : 0.000595s : 1: renormalize.infer 1.32% : 0.000445s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000062s : 1: rewriter_after_opt_a 0.35% : 0.000117s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000109s : 1: symbol_engine_optimizer 0.35% : 0.000117s : 1: tuple_transform 20.82% : 0.007017s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:19.656.998 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:19.657.278 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0178832, [21] [bootstrap]: 0.00045253 [type_inference]: 0.00626248 [event_method]: 2.066e-05 [auto_monad]: 6.306e-05 [graph_reusing]: 5.65001e-06 [inline]: 2.32999e-06 [add_attr]: 0.0033036, [1] [add_attr_with_inline]: 0.00329449, [1] [Cycle 1]: 7.49e-05, [2] [tag_attr]: 2.249e-05 [meta_addattr_fg_expand]: 6.74001e-06 [parallel-infer-symbol]: 3.60998e-06 [pre_auto_parallel]: 3.612e-05 [insert-virtual-dataset]: 2.37999e-06 [parallel-infer-symbol-second]: 7.79983e-07 [dataset_repeat_opt]: 1.79998e-06 [pipeline_split]: 1.60001e-06 [optimize]: 0.00647921, [53] [py_interpret_to_execute]: 3.41e-05 [rewriter_before_opt_a]: 0.00010412 [opt_a]: 0.00385269, [2] [Cycle 1]: 0.00279427, [45] [expand_dump_flag]: 2.99999e-06 [switch_simplify]: 4.798e-05 [loop_unroll]: 3.592e-05 [a_1]: 0.00086818 [with_stream_mark]: 1.92e-05 [recompute_prepare]: 1.204e-05 [updatestate_depend_eliminate]: 5.71e-06 [updatestate_assign_eliminate]: 4.08999e-06 [updatestate_loads_eliminate]: 4.28999e-06 [parameter_eliminate]: 2.29999e-06 [a_2]: 0.0001394 [accelerated_algorithm]: 9.76e-06 [shard]: 2.41e-06 [meta_shard_fg_expand]: 2.17999e-06 [shard_inline]: 9.11002e-06 [merge_send_recv]: 9.54e-06 [auto_parallel]: 8.22e-06 [parallel]: 1.906e-05 [flash_sp]: 9.32999e-06 [merge_comm]: 5.22999e-06 [allreduce_fusion]: 4.38001e-06 [matmul_add_comm_reduction]: 1.143e-05 [allreduce_slice_to_reducescatter]: 9.5999e-07 [virtual_shard_identity]: 1.108e-05 [virtual_dataset]: 9.47001e-06 [get_grad_eliminate_]: 8.90999e-06 [virtual_output]: 8.86002e-06 [merge_forward]: 4.60999e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 1.146e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.898e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 1.452e-05 [set_forward_comm_id_for_comm_node_pass]: 4.67e-06 [meta_fg_expand]: 3.68e-06 [flash_sp_send_recv_attached]: 2.42001e-06 [receive_attached]: 2.44999e-06 [after_resolve]: 1.494e-05 [a_after_grad]: 1.404e-05 [renormalize]: 0.00089015 [add_forward_monad_depend]: 5.94e-06 [auto_monad_grad]: 2.20002e-06 [auto_monad_eliminator]: 1.78e-05 [cse]: 3.595e-05 [a_3]: 8.018e-05 [Cycle 2]: 0.00104417, [45] [expand_dump_flag]: 1.22e-06 [switch_simplify]: 9.83002e-06 [loop_unroll]: 8.57e-06 [a_1]: 0.00021404 [with_stream_mark]: 1.278e-05 [recompute_prepare]: 8.52e-06 [updatestate_depend_eliminate]: 4.02e-06 [updatestate_assign_eliminate]: 3.31001e-06 [updatestate_loads_eliminate]: 3.18e-06 [parameter_eliminate]: 1.22e-06 [a_2]: 0.00014195 [accelerated_algorithm]: 8.72998e-06 [shard]: 2.00002e-06 [meta_shard_fg_expand]: 2.20002e-06 [shard_inline]: 8.57e-06 [merge_send_recv]: 7.93001e-06 [auto_parallel]: 7.1e-06 [parallel]: 6.52001e-06 [flash_sp]: 3.81999e-06 [merge_comm]: 4.55001e-06 [allreduce_fusion]: 4.42e-06 [matmul_add_comm_reduction]: 9.39e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 9.92001e-06 [virtual_dataset]: 8.28001e-06 [get_grad_eliminate_]: 8.08001e-06 [virtual_output]: 7.80998e-06 [merge_forward]: 3.72002e-06 [cell_reuse_recompute_pass]: 2.09e-06 [offload_activation]: 9.51e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.824e-05 [merge_recompute_call_nodes]: 1.00001e-06 [before_grad]: 1.304e-05 [set_forward_comm_id_for_comm_node_pass]: 4.55001e-06 [meta_fg_expand]: 3.21999e-06 [flash_sp_send_recv_attached]: 1.12999e-06 [receive_attached]: 1.41002e-06 [after_resolve]: 1.453e-05 [a_after_grad]: 1.311e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.72999e-06 [auto_monad_grad]: 1.22e-06 [auto_monad_eliminator]: 9.51e-06 [cse]: 2.09e-05 [a_3]: 6.427e-05 [py_interpret_to_execute_after_opt_a]: 1.598e-05 [slice_cell_reuse_recomputed_activation]: 4.76002e-06 [rewriter_after_opt_a]: 4.631e-05 [convert_after_rewriter]: 1.103e-05 [order_py_execute_after_rewriter]: 8.71002e-06 [mutable_eliminate]: 0.00064183 [opt_b]: 0.00034348, [1] [Cycle 1]: 0.00033382, [7] [b_1]: 0.00022703 [b_2]: 1.049e-05 [updatestate_depend_eliminate]: 6.52001e-06 [updatestate_assign_eliminate]: 3.29001e-06 [updatestate_loads_eliminate]: 3.68e-06 [renormalize]: 6.00005e-07 [cse]: 2.48e-05 [optimize_parallel_all_gather_comm]: 2.133e-05 [overlap_param_gather]: 4.95001e-06 [cconv]: 3.055e-05 [loop_unroll]: 0.00048891 [opt_after_cconv]: 0.00014768, [1] [Cycle 1]: 0.00013885, [7] [c_1]: 4.246e-05 [parameter_eliminate]: 2.73998e-06 [updatestate_depend_eliminate]: 6.56e-06 [updatestate_assign_eliminate]: 3.44001e-06 [updatestate_loads_eliminate]: 3.23e-06 [cse]: 2.355e-05 [renormalize]: 3.10014e-07 [remove_dup_value]: 1.925e-05 [tuple_transform]: 0.00010789, [1] [Cycle 1]: 0.00010082, [4] [d_1]: 6.006e-05 [none_parameter_eliminate]: 1.59e-06 [renormalize]: 2.80008e-07 [switch_simplify]: 8.97e-06 [partial_unused_args_eliminate]: 4.60001e-06 [add_recomputation]: 5.762e-05 [cse_after_recomputation]: 3.211e-05, [1] [Cycle 1]: 2.509e-05, [1] [cse]: 1.6e-05 [environ_conv]: 1.04e-05 [swap_dp_allreduce_reducescatter]: 8.35999e-06 [bias_add_comm_swap]: 4.62e-06 [label_micro_interleaved_index]: 7.21999e-06 [label_fine_grained_interleaved_index]: 5.02e-06 [merge_cast_opt]: 3.83001e-06 [slice_recompute_activation]: 4.38001e-06 [micro_interleaved_order_control]: 4.80999e-06 [assign_add_opt]: 3.56001e-06 [ForceFp32Comm]: 3.25998e-06 [remove_cast_before_assign_add]: 3.53999e-06 [full_micro_interleaved_order_control]: 4.87998e-06 [reorder_send_recv_between_fp_bp]: 5.24e-06 [comm_op_add_attrs]: 3.53999e-06 [add_comm_op_reuse_tag]: 3.27997e-06 [interleave_split_concat_branches]: 3.56001e-06 [interleave_parallel_branches]: 3.38e-06 [overlap_opt_shard_in_pipeline]: 3.75e-06 [overlap_opt_shard_grad_in_pipeline]: 4.22998e-06 [control_data_broadcast_order]: 1.811e-05 [grouped_pairwise_exchange_alltoall]: 4.01001e-06 [offloading_packed_experts]: 7e-06 [overlap_recompute_and_grad_model_parallel]: 7.61999e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.5e-06 [overlap_recompute_allgather_and_fa_grad]: 3.7e-06 [overlap_recompute_comm]: 4.99998e-06 [overlap_grad_ring_attention]: 7.11999e-06 [overlap_grad_flash_sp]: 2.498e-05 [begin_end_overlap_inline]: 2.93e-06 [split_matmul_comm_elemetwise]: 4.63999e-06 [split_layernorm_comm]: 3.9e-06 [handle_group_info]: 3.23998e-06 [symbol_engine_optimizer]: 0.00010607, [1] [Cycle 1]: 9.947e-05, [6] [build]: 3.11001e-06 [elim_shapecalc]: 1.215e-05 [elim_not_effective]: 1.632e-05 [opt_reshape]: 9.17999e-06 [fold_const_symbol]: 1.314e-05 [renormalize]: 2.50002e-07 [detach_backward]: 3.66999e-06 [pipeline_parallel_scheduler]: 1.90001e-06 [auto_monad_reorder]: 2.52e-05 [get_jit_bprop_graph]: 1.62999e-06 [rewriter_after_jit_bprop_graph]: 4.90999e-06 [opt_after_jit_grad]: 0.00055885 [validate]: 4.423e-05 Sums bootstrap : 0.000453s : 3.54% type_inference : 0.006262s : 48.95% event_method : 0.000021s : 0.16% auto_monad : 0.000063s : 0.49% graph_reusing : 0.000006s : 0.04% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000036s : 0.28% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000034s : 0.27% optimize.rewriter_before_opt_a : 0.000104s : 0.81% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000058s : 0.45% optimize.opt_a.loop_unroll : 0.000044s : 0.35% optimize.opt_a.a_1 : 0.001082s : 8.46% optimize.opt_a.with_stream_mark : 0.000032s : 0.25% optimize.opt_a.recompute_prepare : 0.000021s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000004s : 0.03% optimize.opt_a.a_2 : 0.000281s : 2.20% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.14% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000018s : 0.14% optimize.opt_a.merge_send_recv : 0.000017s : 0.14% optimize.opt_a.auto_parallel : 0.000015s : 0.12% optimize.opt_a.parallel : 0.000026s : 0.20% optimize.opt_a.flash_sp : 0.000013s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.16% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.16% optimize.opt_a.virtual_dataset : 0.000018s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.13% optimize.opt_a.virtual_output : 0.000017s : 0.13% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000021s : 0.16% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.29% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000028s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000007s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000004s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.23% optimize.opt_a.a_after_grad : 0.000027s : 0.21% optimize.opt_a.renormalize : 0.000890s : 6.96% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.21% optimize.opt_a.cse : 0.000057s : 0.44% optimize.opt_a.a_3 : 0.000144s : 1.13% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000046s : 0.36% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000009s : 0.07% optimize.mutable_eliminate : 0.000642s : 5.02% optimize.opt_b.b_1 : 0.000227s : 1.77% optimize.opt_b.b_2 : 0.000010s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000025s : 0.19% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000031s : 0.24% optimize.loop_unroll : 0.000489s : 3.82% optimize.opt_after_cconv.c_1 : 0.000042s : 0.33% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.18% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.15% optimize.tuple_transform.d_1 : 0.000060s : 0.47% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000005s : 0.04% optimize.add_recomputation : 0.000058s : 0.45% optimize.cse_after_recomputation.cse : 0.000016s : 0.13% optimize.environ_conv : 0.000010s : 0.08% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.04% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000004s : 0.03% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000004s : 0.03% optimize.ForceFp32Comm : 0.000003s : 0.03% optimize.remove_cast_before_assign_add : 0.000004s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.04% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000003s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.03% optimize.control_data_broadcast_order : 0.000018s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000007s : 0.05% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.06% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000005s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000025s : 0.20% optimize.begin_end_overlap_inline : 0.000003s : 0.02% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.10% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000025s : 0.20% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000559s : 4.37% validate : 0.000044s : 0.35% Time group info: ------[substitution.] 0.000240 44 10.00% : 0.000024s : 3: substitution.cast_eliminate 1.00% : 0.000002s : 3: substitution.elim_not_effective 0.68% : 0.000002s : 3: substitution.fold_const_symbol 3.08% : 0.000007s : 6: substitution.graph_param_transform 67.32% : 0.000161s : 4: substitution.inline 2.21% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.69% : 0.000006s : 6: substitution.remove_not_recompute_node 2.63% : 0.000006s : 6: substitution.replace_old_param 8.02% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator 2.37% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.006208 2 87.52% : 0.005434s : 1: type_inference.infer 12.48% : 0.000775s : 1: type_inference.specialize ------[replace.] 0.000081 10 48.41% : 0.000039s : 4: replace.inline 51.59% : 0.000042s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000175 10 90.61% : 0.000159s : 4: match.inline 9.39% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000296 1908 0.95% : 0.000003s : 20: predicate.accumulaten_eliminater 0.74% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.52% : 0.000002s : 12: predicate.addn_check_dump 0.95% : 0.000003s : 20: predicate.addn_zero_filter 0.88% : 0.000003s : 20: predicate.adjust_all_reduce_mul_add 2.08% : 0.000006s : 32: predicate.arithmetic_simplify 1.10% : 0.000003s : 20: predicate.cast_eliminate 0.64% : 0.000002s : 12: predicate.check_bprop_eliminate 0.59% : 0.000002s : 12: predicate.compare_switch_simplify 0.21% : 0.000001s : 6: predicate.const_output_eliminate 0.68% : 0.000002s : 12: predicate.depend_value_elim 0.96% : 0.000003s : 20: predicate.dict_get_item_const_eliminator 1.13% : 0.000003s : 20: predicate.dict_get_item_eliminator 0.90% : 0.000003s : 20: predicate.dict_set_item_eliminator 0.92% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 6: predicate.elim_not_effective 0.37% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000004s : 26: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 26: predicate.environ_get_depend_swap 1.78% : 0.000005s : 38: predicate.environ_get_eliminate 1.13% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.44% : 0.000004s : 30: predicate.exchange_switch_depend_value 2.23% : 0.000007s : 30: predicate.float_depend_g_call 0.59% : 0.000002s : 12: predicate.float_environ_get_switch 0.81% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 6: predicate.fold_const_symbol 0.67% : 0.000002s : 12: predicate.get_grad_eliminate 0.22% : 0.000001s : 6: predicate.graph_param_transform 0.59% : 0.000002s : 12: predicate.incorporate_call 0.48% : 0.000001s : 12: predicate.incorporate_call_switch 6.10% : 0.000018s : 86: predicate.inline 0.72% : 0.000002s : 12: predicate.inline_without_move 0.34% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.75% : 0.000002s : 12: predicate.less_batch_normalization 1.89% : 0.000006s : 38: predicate.list_to_tuple_eliminator_ 2.63% : 0.000008s : 58: predicate.load_eliminater 0.74% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.01% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.60% : 0.000005s : 32: predicate.make_slice_get_slice_eliminator 0.57% : 0.000002s : 12: predicate.merge_addn 0.60% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.58% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.90% : 0.000003s : 20: predicate.minmaximum_grad 0.80% : 0.000002s : 6: predicate.mutable_eliminate 0.33% : 0.000001s : 6: predicate.opt_reshape 0.36% : 0.000001s : 6: predicate.parallel_virtual_node 1.76% : 0.000005s : 30: predicate.partial_defer_inline 1.77% : 0.000005s : 32: predicate.partial_eliminate 1.01% : 0.000003s : 20: predicate.print_const_string_wrapper 0.58% : 0.000002s : 12: predicate.reduce_all_const_elim 1.21% : 0.000004s : 20: predicate.reduce_eliminate 2.64% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 12: predicate.remove_not_recompute_node 1.36% : 0.000004s : 38: predicate.replace_applicator 0.50% : 0.000001s : 12: predicate.replace_old_param 0.25% : 0.000001s : 6: predicate.reset_defer_inline 1.08% : 0.000003s : 20: predicate.reshape_eliminate 0.65% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 6: predicate.row_tensor_eliminate 0.75% : 0.000002s : 12: predicate.same_eliminate 0.38% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.68% : 0.000002s : 12: predicate.shard_identity_eliminate 0.69% : 0.000002s : 12: predicate.special_op_eliminate 0.65% : 0.000002s : 12: predicate.specialize_transform 0.93% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.54% : 0.000005s : 30: predicate.switch_defer_inline 2.11% : 0.000006s : 42: predicate.switch_layer_defer_inline 4.73% : 0.000014s : 90: predicate.switch_simplify 1.12% : 0.000003s : 20: predicate.tile_eliminate 0.95% : 0.000003s : 20: predicate.transpose_eliminate 1.62% : 0.000005s : 32: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000005s : 32: predicate.tuple_list_get_item_const_eliminator 1.57% : 0.000005s : 32: predicate.tuple_list_get_item_depend_reorder 3.35% : 0.000010s : 50: predicate.tuple_list_get_item_eliminator 1.52% : 0.000005s : 32: predicate.tuple_list_get_set_item_eliminator 2.43% : 0.000007s : 44: predicate.tuple_list_set_item_eliminator 1.80% : 0.000005s : 38: predicate.tuple_to_list_eliminator_ 2.57% : 0.000008s : 58: predicate.updatestate_pure_node_eliminater 3.38% : 0.000010s : 70: predicate.updatestate_useless_node_eliminater 0.49% : 0.000001s : 6: predicate.value_based_eliminate 0.70% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 12: predicate.virtual_output_eliminate 0.30% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000585 11 52.33% : 0.000306s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.67% : 0.000279s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.030488 192 0.02% : 0.000006s : 1: ForceFp32Comm 10.87% : 0.003313s : 1: add_attr 10.82% : 0.003298s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.20% : 0.000061s : 1: add_recomputation 0.02% : 0.000006s : 1: assign_add_opt 0.24% : 0.000073s : 1: auto_monad 0.11% : 0.000033s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.02% : 0.000007s : 1: bias_add_comm_swap 1.62% : 0.000495s : 1: bootstrap 0.11% : 0.000034s : 1: cconv 0.02% : 0.000006s : 1: comm_op_add_attrs 0.07% : 0.000021s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.12% : 0.000035s : 1: cse_after_recomputation 0.02% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000019s : 1: detach_backward 0.04% : 0.000013s : 1: environ_conv 0.10% : 0.000031s : 1: event_method 0.02% : 0.000008s : 1: full_micro_interleaved_order_control 0.02% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.62% : 0.000495s : 1: loop_unroll 0.02% : 0.000006s : 1: merge_cast_opt 0.02% : 0.000007s : 1: micro_interleaved_order_control 2.12% : 0.000648s : 1: mutable_eliminate 0.03% : 0.000010s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000018s : 1: opt.transform.mutable_eliminate 5.52% : 0.001683s : 78: opt.transform.opt_a 0.13% : 0.000041s : 1: opt.transform.opt_after_cconv 0.11% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.53% : 0.000163s : 28: opt.transform.opt_b 0.22% : 0.000067s : 2: opt.transform.opt_trans_graph 0.15% : 0.000047s : 4: opt.transform.symbol_engine_opt 12.65% : 0.003857s : 1: opt_a 0.50% : 0.000151s : 1: opt_after_cconv 1.87% : 0.000570s : 1: opt_after_jit_grad 1.14% : 0.000347s : 1: opt_b 22.42% : 0.006837s : 1: optimize 0.08% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000012s : 1: order_py_execute_after_rewriter 0.09% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000008s : 1: overlap_recompute_comm 0.03% : 0.000011s : 1: parallel-infer-symbol 0.02% : 0.000006s : 1: parallel-infer-symbol-second 0.02% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000010s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.14% : 0.000044s : 1: pre_auto_parallel 0.12% : 0.000038s : 1: py_interpret_to_execute 0.06% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000007s : 1: remove_cast_before_assign_add 0.07% : 0.000022s : 1: remove_dup_value 1.68% : 0.000512s : 1: renormalize.infer 1.21% : 0.000368s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000050s : 1: rewriter_after_opt_a 0.35% : 0.000108s : 1: rewriter_before_opt_a 0.02% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000007s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.02% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.36% : 0.000109s : 1: symbol_engine_optimizer 0.36% : 0.000111s : 1: tuple_transform 20.68% : 0.006305s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:19.860.919 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0160599, [21] [bootstrap]: 0.00043878 [type_inference]: 0.00597571 [event_method]: 2.06e-05 [auto_monad]: 6.133e-05 [graph_reusing]: 5.62001e-06 [inline]: 1.82999e-06 [add_attr]: 0.00323935, [1] [add_attr_with_inline]: 0.00323053, [1] [Cycle 1]: 5.954e-05, [2] [tag_attr]: 2.167e-05 [meta_addattr_fg_expand]: 6.36998e-06 [parallel-infer-symbol]: 3.9e-06 [pre_auto_parallel]: 3.525e-05 [insert-virtual-dataset]: 2.41e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 2.29999e-06 [pipeline_split]: 2.02999e-06 [optimize]: 0.00554492, [53] [py_interpret_to_execute]: 2.879e-05 [rewriter_before_opt_a]: 9.707e-05 [opt_a]: 0.00325687, [2] [Cycle 1]: 0.00239216, [45] [expand_dump_flag]: 2.96001e-06 [switch_simplify]: 4.836e-05 [loop_unroll]: 3.555e-05 [a_1]: 0.0008381 [with_stream_mark]: 1.545e-05 [recompute_prepare]: 1.052e-05 [updatestate_depend_eliminate]: 4.97999e-06 [updatestate_assign_eliminate]: 4.07e-06 [updatestate_loads_eliminate]: 3.6e-06 [parameter_eliminate]: 2.09e-06 [a_2]: 0.00011034 [accelerated_algorithm]: 9.14e-06 [shard]: 2.33002e-06 [meta_shard_fg_expand]: 2.41998e-06 [shard_inline]: 8.60999e-06 [merge_send_recv]: 9.61998e-06 [auto_parallel]: 7.2e-06 [parallel]: 1.853e-05 [flash_sp]: 8.04002e-06 [merge_comm]: 5.37001e-06 [allreduce_fusion]: 4.42e-06 [matmul_add_comm_reduction]: 9.86e-06 [allreduce_slice_to_reducescatter]: 9.5999e-07 [virtual_shard_identity]: 9.64e-06 [virtual_dataset]: 8.52e-06 [get_grad_eliminate_]: 8.23001e-06 [virtual_output]: 8.59e-06 [merge_forward]: 4.50001e-06 [cell_reuse_recompute_pass]: 1.46998e-06 [offload_activation]: 1.095e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.517e-05 [merge_recompute_call_nodes]: 1.37e-06 [before_grad]: 1.385e-05 [set_forward_comm_id_for_comm_node_pass]: 4.53999e-06 [meta_fg_expand]: 3.92002e-06 [flash_sp_send_recv_attached]: 2.51998e-06 [receive_attached]: 2.17999e-06 [after_resolve]: 1.496e-05 [a_after_grad]: 1.323e-05 [renormalize]: 0.00073815 [add_forward_monad_depend]: 5.49e-06 [auto_monad_grad]: 2.19001e-06 [auto_monad_eliminator]: 1.737e-05 [cse]: 3.687e-05 [a_3]: 6.258e-05 [Cycle 2]: 0.00085431, [45] [expand_dump_flag]: 9.90025e-07 [switch_simplify]: 9.60001e-06 [loop_unroll]: 8.48999e-06 [a_1]: 0.00020224 [with_stream_mark]: 1.191e-05 [recompute_prepare]: 8.82999e-06 [updatestate_depend_eliminate]: 3.91999e-06 [updatestate_assign_eliminate]: 3.21001e-06 [updatestate_loads_eliminate]: 3.13e-06 [parameter_eliminate]: 1.29998e-06 [a_2]: 0.000132 [accelerated_algorithm]: 8.63001e-06 [shard]: 1.27e-06 [meta_shard_fg_expand]: 1.84998e-06 [shard_inline]: 1.167e-05 [merge_send_recv]: 5.98002e-06 [auto_parallel]: 6.39999e-06 [parallel]: 4.74e-06 [flash_sp]: 3.7e-06 [merge_comm]: 4.68001e-06 [allreduce_fusion]: 4.12998e-06 [matmul_add_comm_reduction]: 7.21001e-06 [allreduce_slice_to_reducescatter]: 3.89991e-07 [virtual_shard_identity]: 9.11998e-06 [virtual_dataset]: 7.98001e-06 [get_grad_eliminate_]: 7.70998e-06 [virtual_output]: 7.88001e-06 [merge_forward]: 3.31001e-06 [cell_reuse_recompute_pass]: 1.89999e-06 [offload_activation]: 7.33999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.578e-05 [merge_recompute_call_nodes]: 8.70001e-07 [before_grad]: 1.249e-05 [set_forward_comm_id_for_comm_node_pass]: 4.14002e-06 [meta_fg_expand]: 2.82002e-06 [flash_sp_send_recv_attached]: 9.20001e-07 [receive_attached]: 1.21002e-06 [after_resolve]: 1.392e-05 [a_after_grad]: 1.225e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.67001e-06 [auto_monad_grad]: 9.50007e-07 [auto_monad_eliminator]: 1.033e-05 [cse]: 1.964e-05 [a_3]: 5.046e-05 [py_interpret_to_execute_after_opt_a]: 1.149e-05 [slice_cell_reuse_recomputed_activation]: 1.92999e-06 [rewriter_after_opt_a]: 4.076e-05 [convert_after_rewriter]: 7.92998e-06 [order_py_execute_after_rewriter]: 5.81e-06 [mutable_eliminate]: 0.00059068 [opt_b]: 0.00027271, [1] [Cycle 1]: 0.00026609, [7] [b_1]: 0.00017983 [b_2]: 1.078e-05 [updatestate_depend_eliminate]: 6.44001e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 3.91999e-06 [renormalize]: 4.00003e-07 [cse]: 2.542e-05 [optimize_parallel_all_gather_comm]: 1.856e-05 [overlap_param_gather]: 2.07001e-06 [cconv]: 2.558e-05 [loop_unroll]: 0.00046159 [opt_after_cconv]: 0.00012206, [1] [Cycle 1]: 0.00011623, [7] [c_1]: 4.259e-05 [parameter_eliminate]: 2.99001e-06 [updatestate_depend_eliminate]: 6.18998e-06 [updatestate_assign_eliminate]: 3.17002e-06 [updatestate_loads_eliminate]: 3.20998e-06 [cse]: 2.351e-05 [renormalize]: 3.7998e-07 [remove_dup_value]: 1.59e-05 [tuple_transform]: 9.152e-05, [1] [Cycle 1]: 8.722e-05, [4] [d_1]: 5.785e-05 [none_parameter_eliminate]: 1.74e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.71002e-06 [partial_unused_args_eliminate]: 1.73002e-06 [add_recomputation]: 5.542e-05 [cse_after_recomputation]: 2.663e-05, [1] [Cycle 1]: 2.182e-05, [1] [cse]: 1.647e-05 [environ_conv]: 6.69999e-06 [swap_dp_allreduce_reducescatter]: 5.96998e-06 [bias_add_comm_swap]: 2.43e-06 [label_micro_interleaved_index]: 4.63001e-06 [label_fine_grained_interleaved_index]: 2.94001e-06 [merge_cast_opt]: 1.20999e-06 [slice_recompute_activation]: 2.13998e-06 [micro_interleaved_order_control]: 2.55002e-06 [assign_add_opt]: 1.37e-06 [ForceFp32Comm]: 7.50006e-07 [remove_cast_before_assign_add]: 1.14e-06 [full_micro_interleaved_order_control]: 2.01e-06 [reorder_send_recv_between_fp_bp]: 2.64999e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.12999e-06 [interleave_parallel_branches]: 1.02998e-06 [overlap_opt_shard_in_pipeline]: 1.22999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.35002e-06 [control_data_broadcast_order]: 1.46e-05 [grouped_pairwise_exchange_alltoall]: 1.58002e-06 [offloading_packed_experts]: 4.44002e-06 [overlap_recompute_and_grad_model_parallel]: 5.00999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.62001e-06 [overlap_recompute_comm]: 2.39999e-06 [overlap_grad_ring_attention]: 4.47e-06 [overlap_grad_flash_sp]: 2.193e-05 [begin_end_overlap_inline]: 4.69998e-07 [split_matmul_comm_elemetwise]: 2.11e-06 [split_layernorm_comm]: 2.16e-06 [handle_group_info]: 8.79983e-07 [symbol_engine_optimizer]: 8.479e-05, [1] [Cycle 1]: 8.027e-05, [6] [build]: 3.48999e-06 [elim_shapecalc]: 1.127e-05 [elim_not_effective]: 1.529e-05 [opt_reshape]: 9.17999e-06 [fold_const_symbol]: 1.305e-05 [renormalize]: 2.10013e-07 [detach_backward]: 1.84998e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 2.042e-05 [get_jit_bprop_graph]: 1.27e-06 [rewriter_after_jit_bprop_graph]: 4.28001e-06 [opt_after_jit_grad]: 0.00050352 [validate]: 4.267e-05 Sums bootstrap : 0.000439s : 3.70% type_inference : 0.005976s : 50.37% event_method : 0.000021s : 0.17% auto_monad : 0.000061s : 0.52% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.05% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000035s : 0.30% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.02% pipeline_split : 0.000002s : 0.02% optimize.py_interpret_to_execute : 0.000029s : 0.24% optimize.rewriter_before_opt_a : 0.000097s : 0.82% optimize.opt_a.expand_dump_flag : 0.000004s : 0.03% optimize.opt_a.switch_simplify : 0.000058s : 0.49% optimize.opt_a.loop_unroll : 0.000044s : 0.37% optimize.opt_a.a_1 : 0.001040s : 8.77% optimize.opt_a.with_stream_mark : 0.000027s : 0.23% optimize.opt_a.recompute_prepare : 0.000019s : 0.16% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.06% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000242s : 2.04% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.15% optimize.opt_a.shard : 0.000004s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.04% optimize.opt_a.shard_inline : 0.000020s : 0.17% optimize.opt_a.merge_send_recv : 0.000016s : 0.13% optimize.opt_a.auto_parallel : 0.000014s : 0.11% optimize.opt_a.parallel : 0.000023s : 0.20% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000010s : 0.08% optimize.opt_a.allreduce_fusion : 0.000009s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.14% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.16% optimize.opt_a.virtual_dataset : 0.000017s : 0.14% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.13% optimize.opt_a.virtual_output : 0.000016s : 0.14% optimize.opt_a.merge_forward : 0.000008s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000018s : 0.15% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.26% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000026s : 0.22% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.07% optimize.opt_a.meta_fg_expand : 0.000007s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.24% optimize.opt_a.a_after_grad : 0.000025s : 0.21% optimize.opt_a.renormalize : 0.000738s : 6.22% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.23% optimize.opt_a.cse : 0.000057s : 0.48% optimize.opt_a.a_3 : 0.000113s : 0.95% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.10% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000041s : 0.34% optimize.convert_after_rewriter : 0.000008s : 0.07% optimize.order_py_execute_after_rewriter : 0.000006s : 0.05% optimize.mutable_eliminate : 0.000591s : 4.98% optimize.opt_b.b_1 : 0.000180s : 1.52% optimize.opt_b.b_2 : 0.000011s : 0.09% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.03% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000025s : 0.21% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.16% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000026s : 0.22% optimize.loop_unroll : 0.000462s : 3.89% optimize.opt_after_cconv.c_1 : 0.000043s : 0.36% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.03% optimize.opt_after_cconv.cse : 0.000024s : 0.20% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.13% optimize.tuple_transform.d_1 : 0.000058s : 0.49% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000055s : 0.47% optimize.cse_after_recomputation.cse : 0.000016s : 0.14% optimize.environ_conv : 0.000007s : 0.06% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.05% optimize.bias_add_comm_swap : 0.000002s : 0.02% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000015s : 0.12% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.04% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.04% optimize.overlap_grad_flash_sp : 0.000022s : 0.18% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.02% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.09% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.13% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.08% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.11% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000020s : 0.17% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000004s : 0.04% opt_after_jit_grad : 0.000504s : 4.24% validate : 0.000043s : 0.36% Time group info: ------[substitution.] 0.000222 44 9.19% : 0.000020s : 3: substitution.cast_eliminate 1.00% : 0.000002s : 3: substitution.elim_not_effective 0.73% : 0.000002s : 3: substitution.fold_const_symbol 3.21% : 0.000007s : 6: substitution.graph_param_transform 67.74% : 0.000150s : 4: substitution.inline 1.94% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.94% : 0.000007s : 6: substitution.remove_not_recompute_node 2.63% : 0.000006s : 6: substitution.replace_old_param 8.02% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.58% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005915 2 87.15% : 0.005154s : 1: type_inference.infer 12.85% : 0.000760s : 1: type_inference.specialize ------[replace.] 0.000072 10 52.44% : 0.000038s : 4: replace.inline 47.56% : 0.000034s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000163 10 90.71% : 0.000148s : 4: match.inline 9.29% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000289 1908 0.95% : 0.000003s : 20: predicate.accumulaten_eliminater 0.70% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.57% : 0.000002s : 12: predicate.addn_check_dump 0.94% : 0.000003s : 20: predicate.addn_zero_filter 0.86% : 0.000002s : 20: predicate.adjust_all_reduce_mul_add 2.09% : 0.000006s : 32: predicate.arithmetic_simplify 1.01% : 0.000003s : 20: predicate.cast_eliminate 0.62% : 0.000002s : 12: predicate.check_bprop_eliminate 0.58% : 0.000002s : 12: predicate.compare_switch_simplify 0.20% : 0.000001s : 6: predicate.const_output_eliminate 0.64% : 0.000002s : 12: predicate.depend_value_elim 1.07% : 0.000003s : 20: predicate.dict_get_item_const_eliminator 1.10% : 0.000003s : 20: predicate.dict_get_item_eliminator 0.96% : 0.000003s : 20: predicate.dict_set_item_eliminator 0.85% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.18% : 0.000001s : 6: predicate.elim_not_effective 0.37% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000004s : 26: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.26% : 0.000004s : 26: predicate.environ_get_depend_swap 1.77% : 0.000005s : 38: predicate.environ_get_eliminate 1.12% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.46% : 0.000004s : 30: predicate.exchange_switch_depend_value 2.20% : 0.000006s : 30: predicate.float_depend_g_call 0.54% : 0.000002s : 12: predicate.float_environ_get_switch 0.85% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 6: predicate.fold_const_symbol 0.64% : 0.000002s : 12: predicate.get_grad_eliminate 0.20% : 0.000001s : 6: predicate.graph_param_transform 0.59% : 0.000002s : 12: predicate.incorporate_call 0.51% : 0.000001s : 12: predicate.incorporate_call_switch 5.93% : 0.000017s : 86: predicate.inline 0.82% : 0.000002s : 12: predicate.inline_without_move 0.36% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.74% : 0.000002s : 12: predicate.less_batch_normalization 1.91% : 0.000006s : 38: predicate.list_to_tuple_eliminator_ 2.73% : 0.000008s : 58: predicate.load_eliminater 0.85% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.06% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.75% : 0.000005s : 32: predicate.make_slice_get_slice_eliminator 0.56% : 0.000002s : 12: predicate.merge_addn 0.62% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.88% : 0.000003s : 20: predicate.minmaximum_grad 0.90% : 0.000003s : 6: predicate.mutable_eliminate 0.38% : 0.000001s : 6: predicate.opt_reshape 0.36% : 0.000001s : 6: predicate.parallel_virtual_node 1.77% : 0.000005s : 30: predicate.partial_defer_inline 1.79% : 0.000005s : 32: predicate.partial_eliminate 0.93% : 0.000003s : 20: predicate.print_const_string_wrapper 0.60% : 0.000002s : 12: predicate.reduce_all_const_elim 1.12% : 0.000003s : 20: predicate.reduce_eliminate 2.65% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 12: predicate.remove_not_recompute_node 1.45% : 0.000004s : 38: predicate.replace_applicator 0.47% : 0.000001s : 12: predicate.replace_old_param 0.25% : 0.000001s : 6: predicate.reset_defer_inline 1.04% : 0.000003s : 20: predicate.reshape_eliminate 0.63% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 6: predicate.row_tensor_eliminate 0.72% : 0.000002s : 12: predicate.same_eliminate 0.44% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.77% : 0.000002s : 12: predicate.shard_identity_eliminate 0.69% : 0.000002s : 12: predicate.special_op_eliminate 0.67% : 0.000002s : 12: predicate.specialize_transform 0.79% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.51% : 0.000004s : 30: predicate.switch_defer_inline 2.12% : 0.000006s : 42: predicate.switch_layer_defer_inline 4.84% : 0.000014s : 90: predicate.switch_simplify 0.97% : 0.000003s : 20: predicate.tile_eliminate 0.97% : 0.000003s : 20: predicate.transpose_eliminate 1.65% : 0.000005s : 32: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000005s : 32: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000005s : 32: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000009s : 50: predicate.tuple_list_get_item_eliminator 1.63% : 0.000005s : 32: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000006s : 44: predicate.tuple_list_set_item_eliminator 1.76% : 0.000005s : 38: predicate.tuple_to_list_eliminator_ 2.57% : 0.000007s : 58: predicate.updatestate_pure_node_eliminater 3.29% : 0.000010s : 70: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 6: predicate.value_based_eliminate 0.67% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.61% : 0.000002s : 12: predicate.virtual_output_eliminate 0.30% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000590 11 56.50% : 0.000333s : 5: func_graph_cloner_run.FuncGraphClonerGraph 43.50% : 0.000257s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.027475 192 0.01% : 0.000003s : 1: ForceFp32Comm 11.81% : 0.003244s : 1: add_attr 11.77% : 0.003234s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.22% : 0.000059s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.25% : 0.000067s : 1: auto_monad 0.09% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000005s : 1: bias_add_comm_swap 1.70% : 0.000467s : 1: bootstrap 0.11% : 0.000029s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000018s : 1: control_data_broadcast_order 0.04% : 0.000011s : 1: convert_after_rewriter 0.11% : 0.000030s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.04% : 0.000010s : 1: environ_conv 0.10% : 0.000027s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.03% : 0.000007s : 1: label_micro_interleaved_index 1.71% : 0.000470s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.18% : 0.000600s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.07% : 0.000019s : 1: opt.transform.mutable_eliminate 5.98% : 0.001644s : 78: opt.transform.opt_a 0.15% : 0.000041s : 1: opt.transform.opt_after_cconv 0.12% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.58% : 0.000159s : 28: opt.transform.opt_b 0.23% : 0.000064s : 2: opt.transform.opt_trans_graph 0.17% : 0.000045s : 4: opt.transform.symbol_engine_opt 11.87% : 0.003260s : 1: opt_a 0.46% : 0.000126s : 1: opt_after_cconv 1.87% : 0.000513s : 1: opt_after_jit_grad 1.01% : 0.000276s : 1: opt_b 20.20% : 0.005549s : 1: optimize 0.08% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000009s : 1: order_py_execute_after_rewriter 0.09% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000005s : 1: pipeline_split 0.14% : 0.000039s : 1: pre_auto_parallel 0.12% : 0.000033s : 1: py_interpret_to_execute 0.06% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000019s : 1: remove_dup_value 1.44% : 0.000396s : 1: renormalize.infer 1.22% : 0.000334s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.03% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000045s : 1: rewriter_after_opt_a 0.37% : 0.000101s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.03% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.32% : 0.000087s : 1: symbol_engine_optimizer 0.34% : 0.000094s : 1: tuple_transform 21.81% : 0.005993s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:20.543.45 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:20.546.08 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. TotalTime = 0.0168075, [21] [bootstrap]: 0.00045218 [type_inference]: 0.00596099 [event_method]: 1.982e-05 [auto_monad]: 6.191e-05 [graph_reusing]: 6.07001e-06 [inline]: 2.30002e-06 [add_attr]: 0.00326573, [1] [add_attr_with_inline]: 0.00325646, [1] [Cycle 1]: 7.289e-05, [2] [tag_attr]: 2.149e-05 [meta_addattr_fg_expand]: 6.86999e-06 [parallel-infer-symbol]: 3.74002e-06 [pre_auto_parallel]: 3.603e-05 [insert-virtual-dataset]: 2.27999e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 1.69e-06 [pipeline_split]: 1.54e-06 [optimize]: 0.0058089, [53] [py_interpret_to_execute]: 3.966e-05 [rewriter_before_opt_a]: 9.898e-05 [opt_a]: 0.0032998, [2] [Cycle 1]: 0.00239696, [45] [expand_dump_flag]: 3.01001e-06 [switch_simplify]: 4.721e-05 [loop_unroll]: 3.513e-05 [a_1]: 0.00074461 [with_stream_mark]: 1.74e-05 [recompute_prepare]: 9.52001e-06 [updatestate_depend_eliminate]: 3.69002e-06 [updatestate_assign_eliminate]: 3.48e-06 [updatestate_loads_eliminate]: 3.23e-06 [parameter_eliminate]: 2.26e-06 [a_2]: 0.00012154 [accelerated_algorithm]: 8.05e-06 [shard]: 1.90001e-06 [meta_shard_fg_expand]: 2.24999e-06 [shard_inline]: 7.25e-06 [merge_send_recv]: 8.15999e-06 [auto_parallel]: 6.39001e-06 [parallel]: 1.949e-05 [flash_sp]: 8.80001e-06 [merge_comm]: 4.08001e-06 [allreduce_fusion]: 3.78999e-06 [matmul_add_comm_reduction]: 9.56e-06 [allreduce_slice_to_reducescatter]: 6.40022e-07 [virtual_shard_identity]: 9.43002e-06 [virtual_dataset]: 8e-06 [get_grad_eliminate_]: 7.99002e-06 [virtual_output]: 7.18e-06 [merge_forward]: 3.83999e-06 [cell_reuse_recompute_pass]: 1.14e-06 [offload_activation]: 9.77001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.57e-05 [merge_recompute_call_nodes]: 1.77001e-06 [before_grad]: 1.156e-05 [set_forward_comm_id_for_comm_node_pass]: 3.81999e-06 [meta_fg_expand]: 2.94999e-06 [flash_sp_send_recv_attached]: 3.05002e-06 [receive_attached]: 2.09e-06 [after_resolve]: 1.504e-05 [a_after_grad]: 1.235e-05 [renormalize]: 0.0006697 [add_forward_monad_depend]: 5.71e-06 [auto_monad_grad]: 2.27001e-06 [auto_monad_eliminator]: 1.568e-05 [cse]: 3.33e-05 [a_3]: 6.845e-05 [Cycle 2]: 0.00088902, [45] [expand_dump_flag]: 1.25001e-06 [switch_simplify]: 8.80001e-06 [loop_unroll]: 7.09001e-06 [a_1]: 0.00015972 [with_stream_mark]: 1.31e-05 [recompute_prepare]: 7.51999e-06 [updatestate_depend_eliminate]: 3.6e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 2.68e-06 [parameter_eliminate]: 1.11002e-06 [a_2]: 0.00010929 [accelerated_algorithm]: 7.24001e-06 [shard]: 1.47999e-06 [meta_shard_fg_expand]: 1.28002e-06 [shard_inline]: 6.91001e-06 [merge_send_recv]: 5.37999e-06 [auto_parallel]: 5.97999e-06 [parallel]: 4.99e-06 [flash_sp]: 3.38999e-06 [merge_comm]: 3.61999e-06 [allreduce_fusion]: 3.24001e-06 [matmul_add_comm_reduction]: 5.66e-06 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 8.03999e-06 [virtual_dataset]: 6.93e-06 [get_grad_eliminate_]: 6.55002e-06 [virtual_output]: 6.56999e-06 [merge_forward]: 2.80997e-06 [cell_reuse_recompute_pass]: 1.64e-06 [offload_activation]: 6.49999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.656e-05 [merge_recompute_call_nodes]: 8.89995e-07 [before_grad]: 1.07e-05 [set_forward_comm_id_for_comm_node_pass]: 3.70998e-06 [meta_fg_expand]: 2.23998e-06 [flash_sp_send_recv_attached]: 1.19e-06 [receive_attached]: 1.14e-06 [after_resolve]: 1.35e-05 [a_after_grad]: 1.047e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.48002e-06 [auto_monad_grad]: 1.33002e-06 [auto_monad_eliminator]: 7.23e-06 [cse]: 1.577e-05 [a_3]: 5.454e-05 [py_interpret_to_execute_after_opt_a]: 1.39e-05 [slice_cell_reuse_recomputed_activation]: 4.53999e-06 [rewriter_after_opt_a]: 4.04e-05 [convert_after_rewriter]: 1.052e-05 [order_py_execute_after_rewriter]: 7.93001e-06 [mutable_eliminate]: 0.00061179 [opt_b]: 0.00030119, [1] [Cycle 1]: 0.00029197, [7] [b_1]: 0.0001935 [b_2]: 9.25999e-06 [updatestate_depend_eliminate]: 5.79999e-06 [updatestate_assign_eliminate]: 2.69999e-06 [updatestate_loads_eliminate]: 2.87002e-06 [renormalize]: 5.29981e-07 [cse]: 1.997e-05 [optimize_parallel_all_gather_comm]: 1.993e-05 [overlap_param_gather]: 4.74002e-06 [cconv]: 2.939e-05 [loop_unroll]: 0.00047125 [opt_after_cconv]: 0.00013286, [1] [Cycle 1]: 0.00012428, [7] [c_1]: 3.647e-05 [parameter_eliminate]: 2.91999e-06 [updatestate_depend_eliminate]: 5.22999e-06 [updatestate_assign_eliminate]: 2.59999e-06 [updatestate_loads_eliminate]: 2.31e-06 [cse]: 1.892e-05 [renormalize]: 5.00004e-07 [remove_dup_value]: 1.694e-05 [tuple_transform]: 9.863e-05, [1] [Cycle 1]: 9.109e-05, [4] [d_1]: 5.011e-05 [none_parameter_eliminate]: 1.74998e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.2e-06 [partial_unused_args_eliminate]: 4.47998e-06 [add_recomputation]: 5.165e-05 [cse_after_recomputation]: 2.829e-05, [1] [Cycle 1]: 2.147e-05, [1] [cse]: 1.251e-05 [environ_conv]: 8.70001e-06 [swap_dp_allreduce_reducescatter]: 7.99997e-06 [bias_add_comm_swap]: 5.29e-06 [label_micro_interleaved_index]: 6.94001e-06 [label_fine_grained_interleaved_index]: 5.15999e-06 [merge_cast_opt]: 3.73999e-06 [slice_recompute_activation]: 4.89e-06 [micro_interleaved_order_control]: 5.25999e-06 [assign_add_opt]: 5.46002e-06 [ForceFp32Comm]: 3.78999e-06 [remove_cast_before_assign_add]: 3.45998e-06 [full_micro_interleaved_order_control]: 4.77e-06 [reorder_send_recv_between_fp_bp]: 5.40999e-06 [comm_op_add_attrs]: 3.73999e-06 [add_comm_op_reuse_tag]: 3.4e-06 [interleave_split_concat_branches]: 3.58999e-06 [interleave_parallel_branches]: 3.65998e-06 [overlap_opt_shard_in_pipeline]: 3.75998e-06 [overlap_opt_shard_grad_in_pipeline]: 4.37e-06 [control_data_broadcast_order]: 1.592e-05 [grouped_pairwise_exchange_alltoall]: 3.98999e-06 [offloading_packed_experts]: 6.48e-06 [overlap_recompute_and_grad_model_parallel]: 8.47e-06 [overlap_grad_matmul_and_grad_allreduce]: 3.86999e-06 [overlap_recompute_allgather_and_fa_grad]: 3.97e-06 [overlap_recompute_comm]: 4.47e-06 [overlap_grad_ring_attention]: 6.53e-06 [overlap_grad_flash_sp]: 2.42e-05 [begin_end_overlap_inline]: 3.16999e-06 [split_matmul_comm_elemetwise]: 4.60001e-06 [split_layernorm_comm]: 3.98001e-06 [handle_group_info]: 3.45e-06 [symbol_engine_optimizer]: 0.00010188, [1] [Cycle 1]: 9.483e-05, [6] [build]: 3.37002e-06 [elim_shapecalc]: 1.142e-05 [elim_not_effective]: 1.435e-05 [opt_reshape]: 7.99002e-06 [fold_const_symbol]: 1.115e-05 [renormalize]: 2.10013e-07 [detach_backward]: 3.56999e-06 [pipeline_parallel_scheduler]: 1.59998e-06 [auto_monad_reorder]: 2.13e-05 [get_jit_bprop_graph]: 1.49e-06 [rewriter_after_jit_bprop_graph]: 4.92999e-06 [opt_after_jit_grad]: 0.00053302 [validate]: 4.011e-05 Sums bootstrap : 0.000452s : 3.85% type_inference : 0.005961s : 50.73% event_method : 0.000020s : 0.17% auto_monad : 0.000062s : 0.53% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.18% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.06% parallel-infer-symbol : 0.000004s : 0.03% pre_auto_parallel : 0.000036s : 0.31% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000040s : 0.34% optimize.rewriter_before_opt_a : 0.000099s : 0.84% optimize.opt_a.expand_dump_flag : 0.000004s : 0.04% optimize.opt_a.switch_simplify : 0.000056s : 0.48% optimize.opt_a.loop_unroll : 0.000042s : 0.36% optimize.opt_a.a_1 : 0.000904s : 7.70% optimize.opt_a.with_stream_mark : 0.000030s : 0.26% optimize.opt_a.recompute_prepare : 0.000017s : 0.15% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.03% optimize.opt_a.a_2 : 0.000231s : 1.96% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.12% optimize.opt_a.merge_send_recv : 0.000014s : 0.12% optimize.opt_a.auto_parallel : 0.000012s : 0.11% optimize.opt_a.parallel : 0.000024s : 0.21% optimize.opt_a.flash_sp : 0.000012s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000007s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.15% optimize.opt_a.virtual_dataset : 0.000015s : 0.13% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.12% optimize.opt_a.virtual_output : 0.000014s : 0.12% optimize.opt_a.merge_forward : 0.000007s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.02% optimize.opt_a.offload_activation : 0.000016s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.27% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.04% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.24% optimize.opt_a.a_after_grad : 0.000023s : 0.19% optimize.opt_a.renormalize : 0.000670s : 5.70% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000004s : 0.03% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.19% optimize.opt_a.cse : 0.000049s : 0.42% optimize.opt_a.a_3 : 0.000123s : 1.05% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.12% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.04% optimize.rewriter_after_opt_a : 0.000040s : 0.34% optimize.convert_after_rewriter : 0.000011s : 0.09% optimize.order_py_execute_after_rewriter : 0.000008s : 0.07% optimize.mutable_eliminate : 0.000612s : 5.21% optimize.opt_b.b_1 : 0.000193s : 1.65% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000020s : 0.17% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.17% optimize.overlap_param_gather : 0.000005s : 0.04% optimize.cconv : 0.000029s : 0.25% optimize.loop_unroll : 0.000471s : 4.01% optimize.opt_after_cconv.c_1 : 0.000036s : 0.31% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000019s : 0.16% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.14% optimize.tuple_transform.d_1 : 0.000050s : 0.43% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000004s : 0.04% optimize.add_recomputation : 0.000052s : 0.44% optimize.cse_after_recomputation.cse : 0.000013s : 0.11% optimize.environ_conv : 0.000009s : 0.07% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.07% optimize.bias_add_comm_swap : 0.000005s : 0.05% optimize.label_micro_interleaved_index : 0.000007s : 0.06% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.04% optimize.merge_cast_opt : 0.000004s : 0.03% optimize.slice_recompute_activation : 0.000005s : 0.04% optimize.micro_interleaved_order_control : 0.000005s : 0.04% optimize.assign_add_opt : 0.000005s : 0.05% optimize.ForceFp32Comm : 0.000004s : 0.03% optimize.remove_cast_before_assign_add : 0.000003s : 0.03% optimize.full_micro_interleaved_order_control : 0.000005s : 0.04% optimize.reorder_send_recv_between_fp_bp : 0.000005s : 0.05% optimize.comm_op_add_attrs : 0.000004s : 0.03% optimize.add_comm_op_reuse_tag : 0.000003s : 0.03% optimize.interleave_split_concat_branches : 0.000004s : 0.03% optimize.interleave_parallel_branches : 0.000004s : 0.03% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.04% optimize.control_data_broadcast_order : 0.000016s : 0.14% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.03% optimize.offloading_packed_experts : 0.000006s : 0.06% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.07% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.03% optimize.overlap_recompute_allgather_and_fa_grad : 0.000004s : 0.03% optimize.overlap_recompute_comm : 0.000004s : 0.04% optimize.overlap_grad_ring_attention : 0.000007s : 0.06% optimize.overlap_grad_flash_sp : 0.000024s : 0.21% optimize.begin_end_overlap_inline : 0.000003s : 0.03% optimize.split_matmul_comm_elemetwise : 0.000005s : 0.04% optimize.split_layernorm_comm : 0.000004s : 0.03% optimize.handle_group_info : 0.000003s : 0.03% optimize.symbol_engine_optimizer.build : 0.000003s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.10% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.12% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.03% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000021s : 0.18% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000005s : 0.04% opt_after_jit_grad : 0.000533s : 4.54% validate : 0.000040s : 0.34% Time group info: ------[substitution.] 0.000208 34 1.04% : 0.000002s : 2: substitution.elim_not_effective 0.70% : 0.000001s : 2: substitution.fold_const_symbol 2.94% : 0.000006s : 5: substitution.graph_param_transform 76.60% : 0.000160s : 4: substitution.inline 1.85% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.59% : 0.000005s : 4: substitution.remove_not_recompute_node 2.73% : 0.000006s : 6: substitution.replace_old_param 8.83% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.74% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005908 2 87.45% : 0.005166s : 1: type_inference.infer 12.55% : 0.000742s : 1: type_inference.specialize ------[replace.] 0.000071 10 54.44% : 0.000039s : 4: replace.inline 45.56% : 0.000033s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000172 10 90.79% : 0.000156s : 4: match.inline 9.21% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000240 1590 0.90% : 0.000002s : 16: predicate.accumulaten_eliminater 0.62% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 10: predicate.addn_check_dump 0.92% : 0.000002s : 16: predicate.addn_zero_filter 0.82% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 1.94% : 0.000005s : 26: predicate.arithmetic_simplify 0.97% : 0.000002s : 16: predicate.cast_eliminate 0.65% : 0.000002s : 10: predicate.check_bprop_eliminate 0.52% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.59% : 0.000001s : 10: predicate.depend_value_elim 0.95% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.09% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.92% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.89% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 5: predicate.elim_not_effective 0.36% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.33% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.06% : 0.000003s : 21: predicate.environ_get_depend_swap 1.73% : 0.000004s : 31: predicate.environ_get_eliminate 1.07% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.56% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.61% : 0.000006s : 26: predicate.float_depend_g_call 0.52% : 0.000001s : 10: predicate.float_environ_get_switch 0.77% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.62% : 0.000001s : 10: predicate.get_grad_eliminate 0.20% : 0.000000s : 5: predicate.graph_param_transform 0.61% : 0.000001s : 10: predicate.incorporate_call 0.47% : 0.000001s : 10: predicate.incorporate_call_switch 6.05% : 0.000015s : 72: predicate.inline 0.87% : 0.000002s : 10: predicate.inline_without_move 0.33% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.76% : 0.000002s : 10: predicate.less_batch_normalization 1.86% : 0.000004s : 32: predicate.list_to_tuple_eliminator_ 2.55% : 0.000006s : 48: predicate.load_eliminater 0.80% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.42% : 0.000006s : 40: predicate.loop_unroll_before_grad 1.74% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 10: predicate.merge_addn 0.57% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 16: predicate.minmaximum_grad 1.05% : 0.000003s : 5: predicate.mutable_eliminate 0.37% : 0.000001s : 5: predicate.opt_reshape 0.37% : 0.000001s : 5: predicate.parallel_virtual_node 1.87% : 0.000004s : 26: predicate.partial_defer_inline 1.81% : 0.000004s : 27: predicate.partial_eliminate 0.85% : 0.000002s : 16: predicate.print_const_string_wrapper 0.61% : 0.000001s : 10: predicate.reduce_all_const_elim 1.22% : 0.000003s : 16: predicate.reduce_eliminate 2.54% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 10: predicate.remove_not_recompute_node 1.42% : 0.000003s : 32: predicate.replace_applicator 0.54% : 0.000001s : 10: predicate.replace_old_param 0.33% : 0.000001s : 5: predicate.reset_defer_inline 0.95% : 0.000002s : 16: predicate.reshape_eliminate 0.60% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 5: predicate.row_tensor_eliminate 0.80% : 0.000002s : 10: predicate.same_eliminate 0.45% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 10: predicate.shard_identity_eliminate 0.71% : 0.000002s : 10: predicate.special_op_eliminate 0.78% : 0.000002s : 10: predicate.specialize_transform 0.81% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.70% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.56% : 0.000004s : 26: predicate.switch_defer_inline 2.11% : 0.000005s : 36: predicate.switch_layer_defer_inline 4.97% : 0.000012s : 81: predicate.switch_simplify 0.90% : 0.000002s : 16: predicate.tile_eliminate 0.96% : 0.000002s : 16: predicate.transpose_eliminate 1.52% : 0.000004s : 26: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.58% : 0.000004s : 26: predicate.tuple_list_get_item_depend_reorder 3.52% : 0.000008s : 42: predicate.tuple_list_get_item_eliminator 1.49% : 0.000004s : 26: predicate.tuple_list_get_set_item_eliminator 2.03% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.74% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.51% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 3.26% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.61% : 0.000001s : 5: predicate.value_based_eliminate 0.62% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 10: predicate.virtual_output_eliminate 0.25% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.54% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000556 11 53.79% : 0.000299s : 5: func_graph_cloner_run.FuncGraphClonerGraph 46.21% : 0.000257s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.028131 192 0.02% : 0.000007s : 1: ForceFp32Comm 11.64% : 0.003275s : 1: add_attr 11.59% : 0.003260s : 1: add_attr_with_inline 0.02% : 0.000006s : 1: add_comm_op_reuse_tag 0.20% : 0.000055s : 1: add_recomputation 0.03% : 0.000008s : 1: assign_add_opt 0.25% : 0.000071s : 1: auto_monad 0.10% : 0.000029s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: begin_end_overlap_inline 0.03% : 0.000008s : 1: bias_add_comm_swap 1.76% : 0.000496s : 1: bootstrap 0.12% : 0.000033s : 1: cconv 0.03% : 0.000007s : 1: comm_op_add_attrs 0.07% : 0.000019s : 1: control_data_broadcast_order 0.05% : 0.000014s : 1: convert_after_rewriter 0.11% : 0.000031s : 1: cse_after_recomputation 0.03% : 0.000007s : 1: dataset_repeat_opt 0.06% : 0.000018s : 1: detach_backward 0.04% : 0.000012s : 1: environ_conv 0.11% : 0.000030s : 1: event_method 0.03% : 0.000007s : 1: full_micro_interleaved_order_control 0.03% : 0.000008s : 1: get_jit_bprop_graph 0.04% : 0.000012s : 1: graph_reusing 0.02% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.02% : 0.000006s : 1: handle_group_info 0.03% : 0.000008s : 1: inline 0.03% : 0.000009s : 1: insert-virtual-dataset 0.02% : 0.000006s : 1: interleave_parallel_branches 0.02% : 0.000006s : 1: interleave_split_concat_branches 0.03% : 0.000008s : 1: label_fine_grained_interleaved_index 0.03% : 0.000010s : 1: label_micro_interleaved_index 1.70% : 0.000477s : 1: loop_unroll 0.02% : 0.000007s : 1: merge_cast_opt 0.09% : 0.000027s : 1: micro_interleaved_order_control 2.20% : 0.000618s : 1: mutable_eliminate 0.04% : 0.000010s : 1: offloading_packed_experts 0.05% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000016s : 1: opt.transform.mutable_eliminate 4.95% : 0.001394s : 78: opt.transform.opt_a 0.12% : 0.000035s : 1: opt.transform.opt_after_cconv 0.10% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.46% : 0.000130s : 28: opt.transform.opt_b 0.20% : 0.000056s : 2: opt.transform.opt_trans_graph 0.15% : 0.000041s : 4: opt.transform.symbol_engine_opt 11.74% : 0.003303s : 1: opt_a 0.48% : 0.000136s : 1: opt_after_cconv 1.93% : 0.000544s : 1: opt_after_jit_grad 1.08% : 0.000305s : 1: opt_b 21.83% : 0.006140s : 1: optimize 0.08% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000011s : 1: order_py_execute_after_rewriter 0.10% : 0.000028s : 1: overlap_grad_flash_sp 0.02% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000010s : 1: overlap_grad_ring_attention 0.02% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.03% : 0.000008s : 1: overlap_param_gather 0.02% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.04% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.03% : 0.000007s : 1: overlap_recompute_comm 0.04% : 0.000010s : 1: parallel-infer-symbol 0.02% : 0.000007s : 1: parallel-infer-symbol-second 0.03% : 0.000007s : 1: partial_unused_args_eliminate 0.03% : 0.000009s : 1: pipeline_parallel_scheduler 0.02% : 0.000007s : 1: pipeline_split 0.16% : 0.000044s : 1: pre_auto_parallel 0.16% : 0.000044s : 1: py_interpret_to_execute 0.06% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000006s : 1: remove_cast_before_assign_add 0.07% : 0.000020s : 1: remove_dup_value 1.21% : 0.000341s : 1: renormalize.infer 1.14% : 0.000320s : 1: renormalize.specialize 0.03% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.04% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.16% : 0.000044s : 1: rewriter_after_opt_a 0.36% : 0.000103s : 1: rewriter_before_opt_a 0.03% : 0.000007s : 1: slice_cell_reuse_recomputed_activation 0.03% : 0.000008s : 1: slice_recompute_activation 0.02% : 0.000007s : 1: split_layernorm_comm 0.03% : 0.000008s : 1: split_matmul_comm_elemetwise 0.04% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.37% : 0.000105s : 1: symbol_engine_optimizer 0.36% : 0.000101s : 1: tuple_transform 21.33% : 0.006001s : 1: type_inference [WARNING] ME(160782:281473027792688,MainProcess):2026-01-29-17:47:20.254.026 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 0.0155358, [21] [bootstrap]: 0.00044754 [type_inference]: 0.00592558 [event_method]: 2.11e-05 [auto_monad]: 6.143e-05 [graph_reusing]: 5.91998e-06 [inline]: 2.18002e-06 [add_attr]: 0.00320599, [1] [add_attr_with_inline]: 0.00319699, [1] [Cycle 1]: 5.989e-05, [2] [tag_attr]: 2.238e-05 [meta_addattr_fg_expand]: 6.32001e-06 [parallel-infer-symbol]: 3.16001e-06 [pre_auto_parallel]: 3.739e-05 [insert-virtual-dataset]: 2.34001e-06 [parallel-infer-symbol-second]: 6.99976e-07 [dataset_repeat_opt]: 2.50997e-06 [pipeline_split]: 1.69998e-06 [optimize]: 0.00511237, [53] [py_interpret_to_execute]: 3.05e-05 [rewriter_before_opt_a]: 9.065e-05 [opt_a]: 0.00294292, [2] [Cycle 1]: 0.00222827, [45] [expand_dump_flag]: 3.51001e-06 [switch_simplify]: 8.537e-05 [loop_unroll]: 3.55e-05 [a_1]: 0.00072873 [with_stream_mark]: 1.494e-05 [recompute_prepare]: 9.22999e-06 [updatestate_depend_eliminate]: 4.35e-06 [updatestate_assign_eliminate]: 3.26001e-06 [updatestate_loads_eliminate]: 3.07002e-06 [parameter_eliminate]: 1.74e-06 [a_2]: 9.002e-05 [accelerated_algorithm]: 8.55001e-06 [shard]: 2.07999e-06 [meta_shard_fg_expand]: 1.83002e-06 [shard_inline]: 7.35e-06 [merge_send_recv]: 8.17e-06 [auto_parallel]: 6.69001e-06 [parallel]: 1.773e-05 [flash_sp]: 7.97e-06 [merge_comm]: 4.15999e-06 [allreduce_fusion]: 3.26001e-06 [matmul_add_comm_reduction]: 9.05001e-06 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 8.75999e-06 [virtual_dataset]: 7.28e-06 [get_grad_eliminate_]: 6.94001e-06 [virtual_output]: 7.11999e-06 [merge_forward]: 3.58e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 9.50001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.264e-05 [merge_recompute_call_nodes]: 1.37e-06 [before_grad]: 1.101e-05 [set_forward_comm_id_for_comm_node_pass]: 3.46999e-06 [meta_fg_expand]: 3.08998e-06 [flash_sp_send_recv_attached]: 2.63e-06 [receive_attached]: 2.21e-06 [after_resolve]: 1.307e-05 [a_after_grad]: 1.138e-05 [renormalize]: 0.00071753 [add_forward_monad_depend]: 5.64e-06 [auto_monad_grad]: 2.02999e-06 [auto_monad_eliminator]: 1.507e-05 [cse]: 3.282e-05 [a_3]: 5.39e-05 [Cycle 2]: 0.00070485, [45] [expand_dump_flag]: 1.34e-06 [switch_simplify]: 9.19e-06 [loop_unroll]: 7.26001e-06 [a_1]: 0.00015973 [with_stream_mark]: 1.151e-05 [recompute_prepare]: 7.23e-06 [updatestate_depend_eliminate]: 2.97002e-06 [updatestate_assign_eliminate]: 2.71e-06 [updatestate_loads_eliminate]: 2.29001e-06 [parameter_eliminate]: 1.02e-06 [a_2]: 8.099e-05 [accelerated_algorithm]: 6.71e-06 [shard]: 1.09e-06 [meta_shard_fg_expand]: 1.25999e-06 [shard_inline]: 6.74999e-06 [merge_send_recv]: 6.01e-06 [auto_parallel]: 5.79e-06 [parallel]: 4.80001e-06 [flash_sp]: 3.38999e-06 [merge_comm]: 3.46999e-06 [allreduce_fusion]: 2.99001e-06 [matmul_add_comm_reduction]: 5.09e-06 [allreduce_slice_to_reducescatter]: 3.89991e-07 [virtual_shard_identity]: 7.51999e-06 [virtual_dataset]: 6.83e-06 [get_grad_eliminate_]: 6.93e-06 [virtual_output]: 6.46999e-06 [merge_forward]: 2.64999e-06 [cell_reuse_recompute_pass]: 1.60001e-06 [offload_activation]: 6.44001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.284e-05 [merge_recompute_call_nodes]: 7.39994e-07 [before_grad]: 1.017e-05 [set_forward_comm_id_for_comm_node_pass]: 3.45e-06 [meta_fg_expand]: 2.24999e-06 [flash_sp_send_recv_attached]: 9.39996e-07 [receive_attached]: 1.01002e-06 [after_resolve]: 1.248e-05 [a_after_grad]: 1.05e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.19e-06 [auto_monad_grad]: 7.30011e-07 [auto_monad_eliminator]: 6.93e-06 [cse]: 1.566e-05 [a_3]: 4.105e-05 [py_interpret_to_execute_after_opt_a]: 1.036e-05 [slice_cell_reuse_recomputed_activation]: 1.82001e-06 [rewriter_after_opt_a]: 3.375e-05 [convert_after_rewriter]: 6.84999e-06 [order_py_execute_after_rewriter]: 5.19e-06 [mutable_eliminate]: 0.00055825 [opt_b]: 0.00022978, [1] [Cycle 1]: 0.00022336, [7] [b_1]: 0.00014915 [b_2]: 9.35001e-06 [updatestate_depend_eliminate]: 5.14e-06 [updatestate_assign_eliminate]: 2.37999e-06 [updatestate_loads_eliminate]: 2.66999e-06 [renormalize]: 4.19997e-07 [cse]: 1.876e-05 [optimize_parallel_all_gather_comm]: 1.604e-05 [overlap_param_gather]: 1.99e-06 [cconv]: 2.43e-05 [loop_unroll]: 0.00049111 [opt_after_cconv]: 0.00010776, [1] [Cycle 1]: 0.00010193, [7] [c_1]: 3.598e-05 [parameter_eliminate]: 2.79999e-06 [updatestate_depend_eliminate]: 5.25999e-06 [updatestate_assign_eliminate]: 2.66e-06 [updatestate_loads_eliminate]: 2.31e-06 [cse]: 1.843e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.408e-05 [tuple_transform]: 8.234e-05, [1] [Cycle 1]: 7.785e-05, [4] [d_1]: 4.97e-05 [none_parameter_eliminate]: 1.96e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 7.88001e-06 [partial_unused_args_eliminate]: 1.64998e-06 [add_recomputation]: 4.992e-05 [cse_after_recomputation]: 2.335e-05, [1] [Cycle 1]: 1.835e-05, [1] [cse]: 1.213e-05 [environ_conv]: 4.99e-06 [swap_dp_allreduce_reducescatter]: 5.34e-06 [bias_add_comm_swap]: 2.91e-06 [label_micro_interleaved_index]: 4.75001e-06 [label_fine_grained_interleaved_index]: 2.58998e-06 [merge_cast_opt]: 1.30999e-06 [slice_recompute_activation]: 2.01e-06 [micro_interleaved_order_control]: 2.31e-06 [assign_add_opt]: 1.14e-06 [ForceFp32Comm]: 8.00006e-07 [remove_cast_before_assign_add]: 1.12e-06 [full_micro_interleaved_order_control]: 2.13998e-06 [reorder_send_recv_between_fp_bp]: 2.54001e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.18001e-06 [interleave_parallel_branches]: 1.09e-06 [overlap_opt_shard_in_pipeline]: 1.29e-06 [overlap_opt_shard_grad_in_pipeline]: 1.89e-06 [control_data_broadcast_order]: 1.296e-05 [grouped_pairwise_exchange_alltoall]: 1.74998e-06 [offloading_packed_experts]: 3.91999e-06 [overlap_recompute_and_grad_model_parallel]: 4.41002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.36002e-06 [overlap_recompute_comm]: 2.27001e-06 [overlap_grad_ring_attention]: 3.73001e-06 [overlap_grad_flash_sp]: 1.898e-05 [begin_end_overlap_inline]: 5.10016e-07 [split_matmul_comm_elemetwise]: 2.38002e-06 [split_layernorm_comm]: 1.59e-06 [handle_group_info]: 9.00007e-07 [symbol_engine_optimizer]: 7.549e-05, [1] [Cycle 1]: 7.123e-05, [6] [build]: 2.68e-06 [elim_shapecalc]: 9.65002e-06 [elim_not_effective]: 1.248e-05 [opt_reshape]: 7.56001e-06 [fold_const_symbol]: 1.067e-05 [renormalize]: 2.29978e-07 [detach_backward]: 1.94999e-06 [pipeline_parallel_scheduler]: 1.54998e-06 [auto_monad_reorder]: 1.78e-05 [get_jit_bprop_graph]: 1.39e-06 [rewriter_after_jit_bprop_graph]: 3.42002e-06 [opt_after_jit_grad]: 0.00049198 [validate]: 3.702e-05 Sums bootstrap : 0.000448s : 3.93% type_inference : 0.005926s : 52.06% event_method : 0.000021s : 0.19% auto_monad : 0.000061s : 0.54% graph_reusing : 0.000006s : 0.05% inline : 0.000002s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.20% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.06% parallel-infer-symbol : 0.000003s : 0.03% pre_auto_parallel : 0.000037s : 0.33% insert-virtual-dataset : 0.000002s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.01% dataset_repeat_opt : 0.000003s : 0.02% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000030s : 0.27% optimize.rewriter_before_opt_a : 0.000091s : 0.80% optimize.opt_a.expand_dump_flag : 0.000005s : 0.04% optimize.opt_a.switch_simplify : 0.000095s : 0.83% optimize.opt_a.loop_unroll : 0.000043s : 0.38% optimize.opt_a.a_1 : 0.000888s : 7.81% optimize.opt_a.with_stream_mark : 0.000026s : 0.23% optimize.opt_a.recompute_prepare : 0.000016s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.05% optimize.opt_a.parameter_eliminate : 0.000003s : 0.02% optimize.opt_a.a_2 : 0.000171s : 1.50% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.13% optimize.opt_a.shard : 0.000003s : 0.03% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.03% optimize.opt_a.shard_inline : 0.000014s : 0.12% optimize.opt_a.merge_send_recv : 0.000014s : 0.12% optimize.opt_a.auto_parallel : 0.000012s : 0.11% optimize.opt_a.parallel : 0.000023s : 0.20% optimize.opt_a.flash_sp : 0.000011s : 0.10% optimize.opt_a.merge_comm : 0.000008s : 0.07% optimize.opt_a.allreduce_fusion : 0.000006s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.12% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.14% optimize.opt_a.virtual_dataset : 0.000014s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.12% optimize.opt_a.virtual_output : 0.000014s : 0.12% optimize.opt_a.merge_forward : 0.000006s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.03% optimize.opt_a.offload_activation : 0.000016s : 0.14% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.22% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.02% optimize.opt_a.before_grad : 0.000021s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.06% optimize.opt_a.meta_fg_expand : 0.000005s : 0.05% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.03% optimize.opt_a.receive_attached : 0.000003s : 0.03% optimize.opt_a.after_resolve : 0.000026s : 0.22% optimize.opt_a.a_after_grad : 0.000022s : 0.19% optimize.opt_a.renormalize : 0.000718s : 6.30% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.06% optimize.opt_a.auto_monad_grad : 0.000003s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.19% optimize.opt_a.cse : 0.000048s : 0.43% optimize.opt_a.a_3 : 0.000095s : 0.83% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.09% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.02% optimize.rewriter_after_opt_a : 0.000034s : 0.30% optimize.convert_after_rewriter : 0.000007s : 0.06% optimize.order_py_execute_after_rewriter : 0.000005s : 0.05% optimize.mutable_eliminate : 0.000558s : 4.90% optimize.opt_b.b_1 : 0.000149s : 1.31% optimize.opt_b.b_2 : 0.000009s : 0.08% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.02% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000019s : 0.16% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.14% optimize.overlap_param_gather : 0.000002s : 0.02% optimize.cconv : 0.000024s : 0.21% optimize.loop_unroll : 0.000491s : 4.31% optimize.opt_after_cconv.c_1 : 0.000036s : 0.32% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.02% optimize.opt_after_cconv.cse : 0.000018s : 0.16% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.12% optimize.tuple_transform.d_1 : 0.000050s : 0.44% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.02% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.07% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000050s : 0.44% optimize.cse_after_recomputation.cse : 0.000012s : 0.11% optimize.environ_conv : 0.000005s : 0.04% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.05% optimize.bias_add_comm_swap : 0.000003s : 0.03% optimize.label_micro_interleaved_index : 0.000005s : 0.04% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.02% optimize.micro_interleaved_order_control : 0.000002s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.02% optimize.control_data_broadcast_order : 0.000013s : 0.11% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.02% optimize.offloading_packed_experts : 0.000004s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.04% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.02% optimize.overlap_grad_ring_attention : 0.000004s : 0.03% optimize.overlap_grad_flash_sp : 0.000019s : 0.17% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.08% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.11% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.07% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.09% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.02% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.16% get_jit_bprop_graph : 0.000001s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.03% opt_after_jit_grad : 0.000492s : 4.32% validate : 0.000037s : 0.33% Time group info: ------[substitution.] 0.000194 34 0.91% : 0.000002s : 2: substitution.elim_not_effective 0.83% : 0.000002s : 2: substitution.fold_const_symbol 2.98% : 0.000006s : 5: substitution.graph_param_transform 76.41% : 0.000148s : 4: substitution.inline 1.64% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.31% : 0.000004s : 4: substitution.remove_not_recompute_node 2.60% : 0.000005s : 6: substitution.replace_old_param 9.45% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator 2.87% : 0.000006s : 1: substitution.value_based_eliminate ------[type_inference.] 0.005863 2 86.63% : 0.005079s : 1: type_inference.infer 13.37% : 0.000784s : 1: type_inference.specialize ------[replace.] 0.000072 10 54.18% : 0.000039s : 4: replace.inline 45.82% : 0.000033s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000161 10 90.25% : 0.000145s : 4: match.inline 9.75% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000272 1590 0.90% : 0.000002s : 16: predicate.accumulaten_eliminater 0.62% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.42% : 0.000001s : 10: predicate.addn_check_dump 0.94% : 0.000003s : 16: predicate.addn_zero_filter 0.71% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 1.79% : 0.000005s : 26: predicate.arithmetic_simplify 0.86% : 0.000002s : 16: predicate.cast_eliminate 0.58% : 0.000002s : 10: predicate.check_bprop_eliminate 0.43% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.47% : 0.000001s : 10: predicate.depend_value_elim 0.81% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 0.93% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.79% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.75% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 5: predicate.elim_not_effective 0.33% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000003s : 21: predicate.environ_add_const_eliminate 0.94% : 0.000003s : 21: predicate.environ_get_add_eliminate 0.94% : 0.000003s : 21: predicate.environ_get_depend_swap 1.43% : 0.000004s : 31: predicate.environ_get_eliminate 0.93% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.32% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.08% : 0.000006s : 26: predicate.float_depend_g_call 0.48% : 0.000001s : 10: predicate.float_environ_get_switch 0.70% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 5: predicate.fold_const_symbol 0.52% : 0.000001s : 10: predicate.get_grad_eliminate 0.19% : 0.000001s : 5: predicate.graph_param_transform 0.47% : 0.000001s : 10: predicate.incorporate_call 0.40% : 0.000001s : 10: predicate.incorporate_call_switch 5.04% : 0.000014s : 72: predicate.inline 0.62% : 0.000002s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.63% : 0.000002s : 10: predicate.less_batch_normalization 1.64% : 0.000004s : 32: predicate.list_to_tuple_eliminator_ 2.33% : 0.000006s : 48: predicate.load_eliminater 0.77% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.16% : 0.000006s : 40: predicate.loop_unroll_before_grad 1.54% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.48% : 0.000001s : 10: predicate.merge_addn 0.48% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.64% : 0.000002s : 10: predicate.mini_step_allgather_replace 0.73% : 0.000002s : 16: predicate.minmaximum_grad 0.86% : 0.000002s : 5: predicate.mutable_eliminate 0.29% : 0.000001s : 5: predicate.opt_reshape 0.33% : 0.000001s : 5: predicate.parallel_virtual_node 1.55% : 0.000004s : 26: predicate.partial_defer_inline 1.52% : 0.000004s : 27: predicate.partial_eliminate 0.76% : 0.000002s : 16: predicate.print_const_string_wrapper 0.49% : 0.000001s : 10: predicate.reduce_all_const_elim 1.01% : 0.000003s : 16: predicate.reduce_eliminate 2.30% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 10: predicate.remove_not_recompute_node 1.31% : 0.000004s : 32: predicate.replace_applicator 0.37% : 0.000001s : 10: predicate.replace_old_param 0.24% : 0.000001s : 5: predicate.reset_defer_inline 0.81% : 0.000002s : 16: predicate.reshape_eliminate 0.46% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 5: predicate.row_tensor_eliminate 0.62% : 0.000002s : 10: predicate.same_eliminate 0.40% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.69% : 0.000002s : 10: predicate.shard_identity_eliminate 0.73% : 0.000002s : 10: predicate.special_op_eliminate 0.58% : 0.000002s : 10: predicate.specialize_transform 0.82% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.64% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.27% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.40% : 0.000004s : 26: predicate.switch_defer_inline 1.87% : 0.000005s : 36: predicate.switch_layer_defer_inline 18.26% : 0.000050s : 81: predicate.switch_simplify 0.77% : 0.000002s : 16: predicate.tile_eliminate 0.84% : 0.000002s : 16: predicate.transpose_eliminate 1.28% : 0.000003s : 26: predicate.tuple_list_convert_item_index_to_positive 1.37% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.29% : 0.000004s : 26: predicate.tuple_list_get_item_depend_reorder 2.68% : 0.000007s : 42: predicate.tuple_list_get_item_eliminator 1.32% : 0.000004s : 26: predicate.tuple_list_get_set_item_eliminator 1.83% : 0.000005s : 36: predicate.tuple_list_set_item_eliminator 1.55% : 0.000004s : 32: predicate.tuple_to_list_eliminator_ 2.20% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 2.81% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.53% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.56% : 0.000002s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000585 11 52.13% : 0.000305s : 5: func_graph_cloner_run.FuncGraphClonerGraph 47.87% : 0.000280s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.026160 192 0.01% : 0.000004s : 1: ForceFp32Comm 12.27% : 0.003211s : 1: add_attr 12.24% : 0.003201s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.21% : 0.000054s : 1: add_recomputation 0.02% : 0.000004s : 1: assign_add_opt 0.25% : 0.000066s : 1: auto_monad 0.08% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.83% : 0.000478s : 1: bootstrap 0.11% : 0.000028s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.06% : 0.000016s : 1: control_data_broadcast_order 0.04% : 0.000010s : 1: convert_after_rewriter 0.10% : 0.000026s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000005s : 1: detach_backward 0.03% : 0.000008s : 1: environ_conv 0.11% : 0.000028s : 1: event_method 0.02% : 0.000005s : 1: full_micro_interleaved_order_control 0.02% : 0.000004s : 1: get_jit_bprop_graph 0.04% : 0.000010s : 1: graph_reusing 0.02% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000005s : 1: label_fine_grained_interleaved_index 0.03% : 0.000008s : 1: label_micro_interleaved_index 1.91% : 0.000500s : 1: loop_unroll 0.02% : 0.000005s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 2.17% : 0.000567s : 1: mutable_eliminate 0.03% : 0.000007s : 1: offloading_packed_experts 0.06% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.06% : 0.000015s : 1: opt.transform.mutable_eliminate 5.36% : 0.001402s : 78: opt.transform.opt_a 0.13% : 0.000035s : 1: opt.transform.opt_after_cconv 0.10% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.48% : 0.000127s : 28: opt.transform.opt_b 0.21% : 0.000056s : 2: opt.transform.opt_trans_graph 0.14% : 0.000037s : 4: opt.transform.symbol_engine_opt 11.26% : 0.002946s : 1: opt_a 0.42% : 0.000111s : 1: opt_after_cconv 1.92% : 0.000501s : 1: opt_after_jit_grad 0.89% : 0.000233s : 1: opt_b 19.56% : 0.005117s : 1: optimize 0.08% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000008s : 1: order_py_execute_after_rewriter 0.09% : 0.000022s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.03% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.03% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000005s : 1: overlap_recompute_comm 0.03% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000004s : 1: pipeline_parallel_scheduler 0.02% : 0.000004s : 1: pipeline_split 0.16% : 0.000042s : 1: pre_auto_parallel 0.13% : 0.000035s : 1: py_interpret_to_execute 0.05% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000018s : 1: remove_dup_value 1.54% : 0.000402s : 1: renormalize.infer 1.18% : 0.000308s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000038s : 1: rewriter_after_opt_a 0.36% : 0.000095s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000004s : 1: split_layernorm_comm 0.02% : 0.000006s : 1: split_matmul_comm_elemetwise 0.03% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.30% : 0.000078s : 1: symbol_engine_optimizer 0.33% : 0.000085s : 1: tuple_transform 22.72% : 0.005942s : 1: type_inference . [hook] pytest_runtest_teardown:test_mix_precision tests/st/graph_kernel/test_mix_precision.py::test_mix_precision,max_mem:4.0M =============================== warnings summary =============================== ../../../../../../../../usr/local/Ascend/cann-8.5.0/python/site-packages/tbe/dsl/classifier/transdata/transdata_classifier.py:222 /usr/local/Ascend/cann-8.5.0/python/site-packages/tbe/dsl/classifier/transdata/transdata_classifier.py:222: DeprecationWarning: invalid escape sequence \B """ ../../../../../../../../usr/local/Ascend/cann-8.5.0/python/site-packages/tbe/dsl/unify_schedule/vector/transdata/common/graph/transdata_graph_info.py:143 /usr/local/Ascend/cann-8.5.0/python/site-packages/tbe/dsl/unify_schedule/vector/transdata/common/graph/transdata_graph_info.py:143: DeprecationWarning: invalid escape sequence \c """ ../../../../../../../../usr/local/Ascend/cann-8.5.0/python/site-packages/tbe/dsl/unify_schedule/vector/transdata/common/graph/transdata_graph_info.py:170 /usr/local/Ascend/cann-8.5.0/python/site-packages/tbe/dsl/unify_schedule/vector/transdata/common/graph/transdata_graph_info.py:170: DeprecationWarning: invalid escape sequence \c """ ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. return self._float_to_str(self.smallest_subnormal) ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. return self._float_to_str(self.smallest_subnormal) ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/batchnorm_fold2.py:57 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/batchnorm_fold2.py:57: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("batchnorm_fold2") ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/batchnorm_fold2_grad.py:56 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/batchnorm_fold2_grad.py:56: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("batchnorm_fold2_grad") ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/batchnorm_fold2_grad_reduce.py:48 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/batchnorm_fold2_grad_reduce.py:48: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("batchnorm_fold2_grad_reduce") ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/correction_mul.py:51 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/correction_mul.py:51: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("correction_mul") ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/correction_mul_grad.py:51 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/correction_mul_grad.py:51: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("correction_mul_grad") ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/correction_mul_grad.py:143 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/correction_mul_grad.py:143: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("correction_mul_grad_reduce") ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perlayer.py:50 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perlayer.py:50: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("fake_learned_scale_quant_perlayer") ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perlayer_grad.py:92 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perlayer_grad.py:92: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("fake_learned_scale_quant_perlayer_grad_d") ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perlayer_grad_reduce.py:49 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perlayer_grad_reduce.py:49: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("fake_learned_scale_quant_perlayer_grad_d_reduce") ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perchannel.py:50 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perchannel.py:50: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("fake_learned_scale_quant_perchannel") ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perchannel_grad.py:91 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perchannel_grad.py:91: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("fake_learned_scale_quant_perchannel_grad_d") ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perchannel_grad_reduce.py:48 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perchannel_grad_reduce.py:48: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("fake_learned_scale_quant_perchannel_grad_d_reduce") ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_quant_perchannel.py:52 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_quant_perchannel.py:52: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("fake_quant_perchannel") ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_quant_perchannel_grad.py:81 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_quant_perchannel_grad.py:81: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("fake_quant_perchannel_grad") ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_quant_perlayer.py:54 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_quant_perlayer.py:54: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("fake_quant_per_layer") ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_quant_perlayer_grad.py:81 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_quant_perlayer_grad.py:81: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("fake_quant_per_layer_grad") ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/minmax_update_perchannel.py:50 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/minmax_update_perchannel.py:50: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("minmax_update_perchannel") ../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/minmax_update_perlayer.py:50 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/minmax_update_perlayer.py:50: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("minmax_update_perlayer") -- Docs: https://docs.pytest.org/en/stable/warnings.html ================== 1 passed, 25 warnings in 633.93s (0:10:33) ==================